Пример #1
0
        /// <summary>
        /// Lazily yields one entry per bigram reported by the bigram frequency table built from the
        /// Treebank-3 corpus. Each entry pairs the bigram text (two words joined by a single space)
        /// with a <c>LikelihoodRatio.LikelihoodRatioData</c> constructed from that bigram's frequency,
        /// its two component words, and the unigram frequency table.
        /// </summary>
        /// <remarks>
        /// The corpus is read from <c>Data\Treebank-3</c> under the current user's "My Documents" folder.
        /// Because this is an iterator method, nothing is read until the result is first enumerated.
        /// </remarks>
        /// <returns>A sequence of (bigram text, likelihood-ratio data) pairs.</returns>
        public static IEnumerable<KeyValuePair<string, LikelihoodRatio.LikelihoodRatioData<string, string>>> collocations_from_likelihood_ratio_Treebank3()
        {
            var documentsDir = Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments);
            var corpus = new Treebank3CorpusReader(Path.Combine(documentsDir, @"Data\Treebank-3"));

            // Bigram table: drop ", " tokens, punctuation and whitespace-only tokens before pairing words.
            var bigramCounts = corpus.words()
                .Where(word => word != ", ")
                .Where(word => !TextTools.is_puctuation(word))
                .Where(word => TextTools.not_whitespace.IsMatch(word))
                .NGram(2)
                .Select(pair => pair.Aggregate((left, right) => left + " " + right))
                .Freqs();

            // Unigram table: same filter as above except that punctuation tokens are kept.
            var unigramCounts = corpus.words()
                .Where(word => word != ", ")
                .Where(word => TextTools.not_whitespace.IsMatch(word))
                .Freqs();

            // NOTE(review): this is Count() of the bigram table; whether that is the number of distinct
            // bigrams or the total number of observations depends on what Freqs() returns - confirm.
            var total = bigramCounts.Count();

            foreach (var entry in bigramCounts.Generate())
            {
                // The key was built as "first second", so splitting on whitespace recovers the two words.
                var parts = entry.Key.Split();
                var ratio = new LikelihoodRatio.LikelihoodRatioData<string, string>(
                    total, entry.Key, entry.Value, parts, unigramCounts);

                yield return new KeyValuePair<string, LikelihoodRatio.LikelihoodRatioData<string, string>>(entry.Key, ratio);
            }
            // TODO: chi-square comparison to gen collocations
        }
Пример #2
0
 /// <summary>
 /// Reads the Treebank-3 corpus from <c>Data\Treebank-3</c> under the current user's "My Documents"
 /// folder and writes period-related statistics to the console.
 /// </summary>
 /// <remarks>
 /// Three groups are built:
 /// words whose last character is '.';
 /// word pairs whose second word is exactly ".", concatenated without a separator;
 /// and pairs formed from the remaining non-punctuation words whose second word is not ".".
 /// The distinct members of the first two groups are printed one per line, followed by three counts
 /// (the counts include duplicates, unlike the printed list).
 /// </remarks>
 private static void read_treebank3_period_bigrams()
 {
     var path = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments), @"Data\Treebank-3");
     var treebank = new Treebank3CorpusReader(path);

     // LINQ queries are deferred: every enumeration would walk treebank.words() again.
     // Materialize the two groups that are both printed and counted so each is computed exactly once.
     // Assumes words() yields the same sequence on every call (the original totals relied on this too).
     var wordsEndingInPeriod = treebank.words()
         .Where(x => x != "" && x[x.Length - 1] == '.')
         .ToList();

     var pairsEndingInPeriod = treebank.words()
         .NGram(2)
         .Where(a => a.Last() == ".")
         .Select(a => a.Aggregate((x, y) => String.Concat(x, y)))
         .ToList();

     // Only the size of this group is needed, so count it in a single pass instead of storing it.
     var noPeriodCount = treebank.words()
         .Where(x => x != "" && x[x.Length - 1] != '.')
         .Where(x => !TextTools.is_puctuation(x))
         .NGram(2)
         .Where(a => a.Last() != ".")
         .Select(a => a.Aggregate((x, y) => String.Concat(x, y)))
         .Count();

     // Union removes duplicates, so each distinct string is printed once.
     foreach (var content in wordsEndingInPeriod.Union(pairsEndingInPeriod))
     {
         Console.WriteLine(content);
     }

     // Equivalent to counting the concatenation of both groups, without re-enumerating anything.
     var periodCount = wordsEndingInPeriod.Count + pairsEndingInPeriod.Count;

     Console.WriteLine(@"ending period: {0}", periodCount);
     Console.WriteLine(@" no ending period: {0}", noPeriodCount);
     Console.WriteLine(@" total: {0}", noPeriodCount + periodCount);
 }
Пример #3
0
        /// <summary>
        /// Builds unigram and bigram frequency tables from the Treebank-3 corpus, passes them to
        /// <c>LikelihoodRatio.log_likelihood_ratio</c>, and prints the first 2000 results in ascending
        /// order of their value as "key: value" lines on the console.
        /// </summary>
        /// <remarks>
        /// The corpus is read from <c>Data\Treebank-3</c> under the current user's "My Documents" folder.
        /// </remarks>
        private static void test_log_likelihood_collocation()
        {
            var documentsDir = Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments);
            var corpus = new Treebank3CorpusReader(Path.Combine(documentsDir, @"Data\Treebank-3"));

            // Bigram table: drop ", " tokens, punctuation and whitespace-only tokens before pairing words.
            var bigramCounts = corpus.words()
                .Where(word => word != ", ")
                .Where(word => !TextTools.is_puctuation(word))
                .Where(word => TextTools.not_whitespace.IsMatch(word))
                .NGram(2)
                .Select(pair => pair.Aggregate((left, right) => left + " " + right))
                .Freqs();

            // Unigram table: same filter as above except that punctuation tokens are kept.
            var unigramCounts = corpus.words()
                .Where(word => word != ", ")
                .Where(word => TextTools.not_whitespace.IsMatch(word))
                .Freqs();

            // Ascending sort: the 2000 entries with the smallest values are the ones printed.
            var ranked = LikelihoodRatio.log_likelihood_ratio(unigramCounts, bigramCounts)
                .OrderBy(item => item.Value)
                .Take(2000);

            foreach (var scored in ranked)
            {
                var line = String.Format(@"{0}: {1}", scored.Key, scored.Value);
                Console.WriteLine(line);
            }
        }
Пример #4
0
 /// <summary>
 /// Writes every item returned by the Treebank-3 corpus reader's <c>words()</c> to the console,
 /// one per line. The corpus is read from <c>Data\Treebank-3</c> under the current user's
 /// "My Documents" folder.
 /// </summary>
 private static void read_treebank3()
 {
     var documentsDir = Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments);
     var corpusDir = Path.Combine(documentsDir, @"Data\Treebank-3");
     var reader = new Treebank3CorpusReader(corpusDir);

     foreach (var word in reader.words())
     {
         Console.WriteLine(word);
     }
 }