/// <summary>
/// Lazily yields one <see cref="Type_Classification"/> per period-final bigram,
/// carrying the statistics computed for the type classification step.
/// </summary>
/// <param name="period_final_bigram_frequencies">
/// Frequencies of period-final bigrams. Each key is treated as a token followed by
/// one trailing character (the period), which is stripped to obtain the token.
/// NOTE(review): an empty key makes Substring throw — confirm callers never add one.
/// </param>
/// <param name="unigram_freqs">Frequencies looked up for each token once its trailing period is removed.</param>
/// <param name="N">Corpus size forwarded as the first argument of LikelihoodRatio.logLambda.</param>
/// <param name="cw1">Count of periods; stored as c_w1 and forwarded to the likelihood ratio.</param>
/// <param name="p">Probability forwarded unchanged to LikelihoodRatio.logLambda.</param>
/// <param name="p0">Probability forwarded unchanged to LikelihoodRatio.logLambda.</param>
/// <param name="p1">Probability forwarded unchanged to LikelihoodRatio.logLambda.</param>
/// <returns>
/// A deferred sequence of classifications. Each item is tagged "&lt;S&gt;" when the scaled
/// log likelihood is below 0.3; otherwise "&lt;E&gt;" when the token has no characters
/// other than internal periods, and "&lt;A&gt;" in every remaining case.
/// </returns>
public static IEnumerable<Type_Classification> enumerate_type_based_classification(
      Frequencies<string> period_final_bigram_frequencies
    , Frequencies<string> unigram_freqs
    , double N
    , double cw1
    , double p
    , double p0
    , double p1
    )
{
    foreach (var bigram in period_final_bigram_frequencies.Generate())
    {
        var s = new Type_Classifier_Statistics();
        s.c_w1 = cw1;
        s.p = p;
        s.p0 = p0;
        s.p1 = p1;
        s.w1 = ".";
        s.w01 = bigram.Key;

        // split off the ending period
        s.w0 = bigram.Key.Substring(0, bigram.Key.Length - 1);
        s.c_w01 = bigram.Value;

        // find the unigram frequency of this term.
        s.c_w0 = unigram_freqs.Get(s.w0);

        s.llr = LikelihoodRatio.logLambda(N, s.c_w01, s.c_w0, s.c_w1, s.p, s.p0, s.p1);

        // Count the internal periods once; every penalty below and the final
        // classification are derived from this value.
        var internal_periods = count_internal_periods(s.w0);
        var nonperiod_length = s.w0.Length - internal_periods;

        // Length penalty: shrinks exponentially with the number of non-period characters.
        s.length_penalty = 1/Math.Exp(nonperiod_length);

        // Internal periods penalty: grows with the number of periods inside the token.
        s.internal_periods_penalty = internal_periods + 1;

        // Penalty for occurrences without the final period. A zero length is
        // replaced by 1 so the base of the power is never zero.
        var power_base = (nonperiod_length == 0) ? 1 : nonperiod_length;
        s.with_final_period_penalty = 1/Math.Pow(power_base, (s.c_w0 - s.c_w01));

        // scaled log likelihood
        s.scaled_llr = s.llr * s.length_penalty * s.internal_periods_penalty * s.with_final_period_penalty;

        // calculate the final classifier for each sentence final bigram
        var c = new Type_Classification();
        c.statistics = s;

        // annotate with the classification
        if (s.scaled_llr < 0.3)
        {
            c.classification = "<S>";
        }
        else if (nonperiod_length == 0)
        {
            // the token consists of periods only
            c.classification = "<E>";
        }
        else
        {
            c.classification = "<A>";
        }

        yield return c;
    }
}
/// <summary>
/// Writes every term of the whitespace-split Emma sample to the console as
/// "term: count", ordered from the lowest count to the highest.
/// </summary>
private static void freqs_from_emma_sample()
{
    var tokens = TextExamples.emma().Split();
    var counts = new Frequencies<string>(tokens);

    // Ascending by count, so the most frequent terms are printed last.
    var ascending = counts.Generate().OrderBy(pair => pair.Value);

    foreach (var entry in ascending)
    {
        var line = String.Format(@"{0}: {1}", entry.Key, entry.Value);
        Console.WriteLine(line);
    }
}
/// <summary>
/// Counts the 3-grams of the Emma sample and writes the ten with the highest
/// counts to the console, followed by the values of freq.Count() and
/// freq.Terms().Count().
/// </summary>
private static void frequencies_of_ngrams_in_emma_sample()
{
    var sample = TextExamples.emma();
    var counts = new Frequencies<string>();

    // The capturing group keeps the separators in the split result, so the
    // ", " separators and whitespace-only pieces are filtered out afterwards.
    var trigrams = Regex.Split(sample, @"(\W+)")
        .Where(piece => piece != ", ")
        .Where(piece => TextTools.not_whitespace.IsMatch(piece))
        .NGram(3);

    foreach (var trigram in trigrams)
    {
        var joined = trigram.Aggregate((left, right) => left + " " + right);
        counts.Add(joined);
    }

    // Ascending sort followed by Reverse puts the highest counts first.
    var top_ten = counts.Generate().OrderBy(pair => pair.Value).Reverse().Take(10);

    foreach (var entry in top_ten)
    {
        Console.WriteLine(String.Format(@"{0}: {1}", entry.Key, entry.Value));
    }

    Console.WriteLine(counts.Count());
    Console.WriteLine(counts.Terms().Count());
}
/// <summary>
/// Step 1: Type based classification of period final bigrams. In this step,
/// we want to guess at a classification of period final bigrams whether they are
/// ellipses, abbreviations, or sentence endings. This procedure calculates the
/// parameters that remain constant over all the enumerations and then delegates
/// the per-bigram work to enumerate_type_based_classification.
/// </summary>
/// <param name="period_final_bigram_frequencies">Frequencies of the period-final bigrams to classify.</param>
/// <param name="nonperiod_final_bigram_frequencies">Not used by this method; kept so existing callers keep compiling.</param>
/// <param name="unigram_freqs">Unigram frequencies; their values are summed to obtain the corpus size.</param>
/// <param name="period_freq">Number of periods in the corpus.</param>
/// <returns>A deferred sequence with one type classification per period-final bigram.</returns>
public static IEnumerable<Type_Classification> type_based_classification_stage(
      Frequencies<string> period_final_bigram_frequencies
    , Frequencies<string> nonperiod_final_bigram_frequencies
    , Frequencies<string> unigram_freqs
    , int period_freq)
{
    // count of periods
    double cw1 = Convert.ToDouble(period_freq);

    // number of tokens in the corpus: all unigram occurrences plus the periods
    double N = unigram_freqs.Generate().Select(a => a.Value).Sum() + cw1;

    // Do Collocation Bond
    // The paper identifies special values for probabilities on the null hypothesis
    // and the alternative (p. 5).
    double p = cw1 / N;
    double p0 = 0.99;
    double p1 = 1.0 - p0;

    // return the results for each period final bigram
    return enumerate_type_based_classification(
        period_final_bigram_frequencies, unigram_freqs, N, cw1, p, p0, p1);
}
/// <summary>
/// Reads the corpus under "Data\Austen" in the user's documents folder, counts
/// every 2-gram whose combined text starts with "Mr", and writes each distinct
/// term with its count to the console as "term => count".
/// </summary>
private static void get_instances_of_Mr_from_austen()
{
    var path = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments), @"Data\Austen");
    var austen = new TextCorpusReader(path);
    var freq = new Frequencies<string>();

    foreach (var w in austen.words().NGram(2))
    {
        // Combine the two words of the bigram into a single term.
        // NOTE(review): Append is presumably a project string extension that
        // concatenates — confirm against its definition.
        var term = w.First().Trim().Append(w.Last()).Trim();

        // Ordinal prefix test; equivalent to the Length/Substring comparison it replaces.
        if (term.StartsWith("Mr", StringComparison.Ordinal))
        {
            freq.Add(term);
        }
    }

    foreach (var t in freq.Generate())
    {
        Console.WriteLine("{0} => {1}", t.Key, t.Value);
    }
}
/// <summary>
/// Reads the corpus under "Data\inaugural" in the user's documents folder,
/// counts its 3-grams (ignoring ", " tokens), and writes the ten with the
/// highest counts to the console as "trigram: count".
/// </summary>
private static void read_trigram_frequencies_from_inaugural_addresses()
{
    var documents = Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments);
    var path = Path.Combine(documents, @"Data\inaugural");
    var inaugural = new TextCorpusReader(path);
    var counts = new Frequencies<string>();

    var trigrams = inaugural.words().Where(word => word != ", ").NGram(3);

    foreach (var trigram in trigrams)
    {
        // DefaultIfEmpty guarantees at least one element, so Aggregate cannot
        // throw on an empty n-gram.
        var joined = trigram.DefaultIfEmpty("").Aggregate((left, right) => left + " " + right);
        counts.Add(joined);
    }

    // Ascending sort followed by Reverse puts the highest counts first.
    var top_ten = counts.Generate().OrderBy(pair => pair.Value).Reverse().Take(10);

    foreach (var entry in top_ten)
    {
        Console.WriteLine(String.Format(@"{0}: {1}", entry.Key, entry.Value));
    }
}