/// <summary>
/// Yields one likelihood-ratio record per bigram found in the Treebank-3 corpus.
/// </summary>
/// <remarks>
/// The corpus is loaded from "Data\Treebank-3" under the current user's My Documents folder.
/// This is an iterator method, so no work is done until the result is enumerated.
/// </remarks>
/// <returns>
/// Key/value pairs where the key is the bigram text (the NGram(2) elements joined by a
/// single space) and the value is the LikelihoodRatioData constructed for that bigram.
/// </returns>
public static IEnumerable<KeyValuePair<string, LikelihoodRatio.LikelihoodRatioData<string, string>>> collocations_from_likelihood_ratio_Treebank3()
{
    var corpusRoot = Path.Combine(
        Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments),
        @"Data\Treebank-3");
    var corpus = new Treebank3CorpusReader(corpusRoot);

    // Bigram counts: drop ", " tokens, punctuation and anything not matching
    // TextTools.not_whitespace before pairing adjacent words.
    var pairCounts = corpus.words()
        .Where(word => word != ", "
                       && !TextTools.is_puctuation(word)
                       && TextTools.not_whitespace.IsMatch(word))
        .NGram(2)
        .Select(gram => gram.Aggregate((left, right) => string.Concat(left, " ", right)))
        .Freqs();

    // Unigram counts.
    // NOTE(review): unlike the bigram filter, this one does not call is_puctuation -
    // confirm that the asymmetry is intended.
    var wordCounts = corpus.words()
        .Where(word => word != ", " && TextTools.not_whitespace.IsMatch(word))
        .Freqs();

    // NOTE(review): the sample size is bigram_freqs.Count(); confirm this is what
    // LikelihoodRatioData expects (as opposed to total bigram occurrences).
    var sampleSize = pairCounts.Count();

    foreach (var pair in pairCounts.Generate())
    {
        var bigramText = pair.Key;
        var ratio = new LikelihoodRatio.LikelihoodRatioData<string, string>(
            sampleSize,
            bigramText,
            pair.Value,
            bigramText.Split(),
            wordCounts);

        yield return new KeyValuePair<string, LikelihoodRatio.LikelihoodRatioData<string, string>>(bigramText, ratio);
    }

    // TODO: chi-square comparison to gen collocations
}
/// <summary>
/// Prints every distinct Treebank-3 token that ends in a period, plus every adjacent
/// token pair whose second element is ".", then prints summary counts.
/// </summary>
/// <remarks>
/// The corpus is loaded from "Data\Treebank-3" under the current user's My Documents folder.
/// The printed list is de-duplicated (Union); the counts are not (they include duplicates).
/// </remarks>
private static void read_treebank3_period_bigrams()
{
    var path = Path.Combine(
        Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments),
        @"Data\Treebank-3");
    var treebank = new Treebank3CorpusReader(path);

    // The queries below are deferred; each enumeration re-reads the corpus.
    // Materialize the (small) period-ending subsets once so that printing and
    // counting do not trigger repeated passes over the whole corpus.

    // Non-empty tokens whose final character is '.'.
    var t1 = treebank.words()
        .Where(x => (x != "") && x[x.Length - 1] == '.')
        .ToList();

    // Adjacent token pairs whose last element is exactly ".", concatenated without a separator.
    var t2 = treebank.words()
        .NGram(2)
        .Where(a => a.Last() == ".")
        .Select(a => a.Aggregate((x, y) => String.Concat(x, y)))
        .ToList();

    // Pairs built from non-punctuation tokens that do not end in '.'; only the count is needed.
    var no_period_count = treebank.words()
        .Where(x => (x != "") && x[x.Length - 1] != '.')
        .Where(x => !TextTools.is_puctuation(x))
        .NGram(2)
        .Where(a => a.Last() != ".")
        .Select(a => a.Aggregate((x, y) => String.Concat(x, y)))
        .Count();

    foreach (var content in t1.Union(t2))
    {
        Console.WriteLine(content);
    }

    // Same value as t1.Concat(t2).Count(): duplicates are counted.
    var ending_period_count = t1.Count + t2.Count;

    Console.WriteLine(@"ending period: {0}", ending_period_count);
    Console.WriteLine(@" no ending period: {0}", no_period_count);
    Console.WriteLine(@" total: {0}", no_period_count + ending_period_count);
}
/// <summary>
/// Computes log-likelihood ratios for Treebank-3 bigrams and writes the first 2000
/// entries, ordered by ascending value, to the console as "bigram: value".
/// </summary>
/// <remarks>
/// The corpus is loaded from "Data\Treebank-3" under the current user's My Documents folder.
/// </remarks>
private static void test_log_likelihood_collocation()
{
    var corpusRoot = Path.Combine(
        Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments),
        @"Data\Treebank-3");
    var corpus = new Treebank3CorpusReader(corpusRoot);

    // Bigram counts: drop ", " tokens, punctuation and anything not matching
    // TextTools.not_whitespace, then join each adjacent pair with a single space.
    var pairCounts = corpus.words()
        .Where(word => word != ", "
                       && !TextTools.is_puctuation(word)
                       && TextTools.not_whitespace.IsMatch(word))
        .NGram(2)
        .Select(gram => gram.Aggregate((left, right) => string.Concat(left, " ", right)))
        .Freqs();

    // Unigram counts.
    // NOTE(review): unlike the bigram filter, this one does not call is_puctuation -
    // confirm that the asymmetry is intended.
    var wordCounts = corpus.words()
        .Where(word => word != ", " && TextTools.not_whitespace.IsMatch(word))
        .Freqs();

    var ranked = LikelihoodRatio.log_likelihood_ratio(wordCounts, pairCounts)
        .OrderBy(entry => entry.Value)
        .Take(2000);

    foreach (var entry in ranked)
    {
        var line = String.Format(@"{0}: {1}", entry.Key, entry.Value);
        Console.WriteLine(line);
    }
}
/// <summary>
/// Writes every item returned by the Treebank-3 reader's words() to the console, one per line.
/// </summary>
/// <remarks>
/// The corpus is loaded from "Data\Treebank-3" under the current user's My Documents folder.
/// </remarks>
private static void read_treebank3()
{
    var documents = Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments);
    var corpus = new Treebank3CorpusReader(Path.Combine(documents, @"Data\Treebank-3"));

    foreach (var word in corpus.words())
    {
        Console.WriteLine(word);
    }
}