/// <summary>
/// Walks the 2-grams of the words in the Austen corpus, joins the first and last word of each
/// into a single term, records every term that begins with "Mr" in a
/// <c>Frequencies&lt;string&gt;</c>, and prints each generated key/value pair as
/// <c>key =&gt; value</c>.
/// </summary>
/// <remarks>
/// The corpus is read from <c>Data\Austen</c> under the current user's Documents folder.
/// The prefix test is case-sensitive and therefore also accepts terms starting with
/// "Mr." or "Mrs".
/// </remarks>
private static void get_instances_of_Mr_from_austen()
{
    var path = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments), @"Data\Austen");
    var austen = new TextCorpusReader(path);
    var freq = new Frequencies<string>();

    foreach (var w in austen.words().NGram(2))
    {
        // Append is a project helper; presumably plain string concatenation - TODO confirm.
        var term = w.First().Trim().Append(w.Last()).Trim();

        // Ordinal prefix check; unlike Substring it needs no separate length guard
        // for terms shorter than two characters.
        if (term.StartsWith("Mr", StringComparison.Ordinal))
        {
            freq.Add(term);
        }
    }

    foreach (var t in freq.Generate())
    {
        Console.WriteLine("{0} => {1}", t.Key, t.Value);
    }
}
/// <summary>
/// Passes the filtered corpus words to <c>Punkt.process</c> and writes one line per
/// type-classifier result to <c>statistics.log</c>, ordered by ascending <c>scaled_llr</c>.
/// </summary>
/// <remarks>
/// NOTE(review): despite the method name, the corpus read here is <c>Data\Austen</c>
/// (through <c>TextCorpusReader</c>) under the current user's Documents folder, not Treebank-3.
/// The log file name is relative, so it is created in the process's current directory.
/// </remarks>
private static void get_punkt_statistics_from_treebank3()
{
    var documents = Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments);
    var corpus = new TextCorpusReader(Path.Combine(documents, @"Data\Austen"));

    // Drop only words longer than seven characters whose first seven characters are "Speaker";
    // a word that is exactly "Speaker" is kept.
    var words = corpus.words().Where(word => !(word.Length > 7 && word.Substring(0, 7) == "Speaker"));
    var stats = Punkt.process(words);

    using (var log = new StreamWriter("statistics.log"))
    {
        var ranked = stats.type_classifier_results.OrderBy(result => result.statistics.scaled_llr);

        foreach (var result in ranked)
        {
            var s = result.statistics;

            // Columns: scaled_llr, c_w0, c_w1, c_w01, length_penalty,
            // with_final_period_penalty, w0, classification.
            log.WriteLine(
                @"{0}, {1}, {2}, {3} {4} {5} {6} {7}",
                s.scaled_llr,
                s.c_w0,
                s.c_w1,
                s.c_w01,
                s.length_penalty,
                s.with_final_period_penalty,
                s.w0,
                result.classification);
        }
    }
}
/// <summary>
/// Prints to the console every item that <c>TextCorpusReader.read_raw</c> yields for the file
/// <c>1789-Washington.txt</c> in the <c>Data\inaugural</c> folder under the current user's
/// Documents folder.
/// </summary>
private static void read_raw_text_from_washingtons_first_inagural_address()
{
    var corpusDirectory = Path.Combine(
        Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments),
        @"Data\inaugural");
    var reader = new TextCorpusReader(corpusDirectory);
    var addressFile = Path.Combine(corpusDirectory, "1789-Washington.txt");

    foreach (var text in reader.read_raw(addressFile))
    {
        Console.WriteLine(text);
    }
}
/// <summary>
/// Builds a space-separated phrase from every 3-gram of the inaugural corpus words
/// (words equal to <c>", "</c> are excluded first), records each phrase in a
/// <c>Frequencies&lt;string&gt;</c>, and prints the ten entries with the largest values
/// as <c>key: value</c>.
/// </summary>
/// <remarks>
/// The corpus is read from <c>Data\inaugural</c> under the current user's Documents folder.
/// </remarks>
private static void read_trigram_frequencies_from_inaugural_addresses()
{
    var corpusDirectory = Path.Combine(
        Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments),
        @"Data\inaugural");
    var reader = new TextCorpusReader(corpusDirectory);
    var counts = new Frequencies<string>();

    var trigrams = reader.words().Where(word => word != ", ").NGram(3);
    foreach (var trigram in trigrams)
    {
        // An empty n-gram collapses to the empty string instead of making Aggregate throw.
        var phrase = trigram.DefaultIfEmpty("").Aggregate((joined, next) => joined + " " + next);
        counts.Add(phrase);
    }

    // Ascending sort followed by Reverse puts the largest values first.
    var topTen = counts.Generate().OrderBy(pair => pair.Value).Reverse().Take(10);
    foreach (var entry in topTen)
    {
        var line = String.Format(@"{0}: {1}", entry.Key, entry.Value);
        Console.WriteLine(line);
    }
}
/// <summary>
/// Adds every 3-gram of the inaugural corpus words (words equal to <c>", "</c> are excluded
/// first) to a <c>ConditionalFrequencies</c>, then prints each generated entry whose value
/// reports a <c>Count()</c> greater than 1: the entry key on its own line, followed by one
/// tab-indented <c>key -&gt; value</c> line per pair generated from that value.
/// </summary>
/// <remarks>
/// The corpus is read from <c>Data\inaugural</c> under the current user's Documents folder.
/// </remarks>
private static void inaugural_ngram_conditional_frequencies()
{
    var corpusDirectory = Path.Combine(
        Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments),
        @"Data\inaugural");
    var reader = new TextCorpusReader(corpusDirectory);
    var conditional = new ConditionalFrequencies();

    foreach (var trigram in reader.words().Where(word => word != ", ").NGram(3))
    {
        conditional.Add(trigram);
    }

    foreach (var condition in conditional.Generate())
    {
        // Entries whose value has a count of 1 or less are not printed.
        if (condition.Value.Count() <= 1)
        {
            continue;
        }

        Console.WriteLine(condition.Key);
        foreach (var outcome in condition.Value.Generate())
        {
            Console.WriteLine("\t{0} -> {1}", outcome.Key, outcome.Value);
        }
    }
}