示例#1
0
        /// <summary>
        /// Lazily walks the period-final bigrams and yields one
        /// <see cref="Type_Classification"/> per bigram, carrying the statistics
        /// that were computed during the type classification step.
        /// </summary>
        /// <param name="period_final_bigram_frequencies">
        /// Bigrams to classify.  The last character of every key is stripped off
        /// to obtain the token in front of the final period.
        /// </param>
        /// <param name="unigram_freqs">Source of the frequency of the stripped token.</param>
        /// <param name="N">First argument handed to <c>LikelihoodRatio.logLambda</c>.</param>
        /// <param name="cw1">Stored as <c>c_w1</c> (the count associated with ".").</param>
        /// <param name="p">Probability forwarded unchanged to <c>LikelihoodRatio.logLambda</c>.</param>
        /// <param name="p0">Probability forwarded unchanged to <c>LikelihoodRatio.logLambda</c>.</param>
        /// <param name="p1">Probability forwarded unchanged to <c>LikelihoodRatio.logLambda</c>.</param>
        /// <returns>
        /// A deferred sequence.  Each item is tagged "&lt;S&gt;" when the scaled log
        /// likelihood ratio is below 0.3; otherwise "&lt;E&gt;" when the token has no
        /// characters besides its internal periods, and "&lt;A&gt;" in all other cases.
        /// </returns>
        public static IEnumerable<Type_Classification> enumerate_type_based_classification(
                  Frequencies<string> period_final_bigram_frequencies
                  , Frequencies<string> unigram_freqs
                  , double N
                  , double cw1
                  , double p
                  , double p0
                  , double p1
                  )
        {
            foreach (var bigram in period_final_bigram_frequencies.Generate())
            {
                var s = new Type_Classifier_Statistics();

                s.c_w1 = cw1;
                s.p = p;
                s.p0 = p0;
                s.p1 = p1;
                s.w1 = ".";
                s.w01 = bigram.Key;

                // split off the ending period
                s.w0 = bigram.Key.Substring(0, bigram.Key.Length - 1);
                s.c_w01 = bigram.Value;
                // find the unigram frequency of this term.
                s.c_w0 = unigram_freqs.Get(s.w0);

                s.llr = LikelihoodRatio.logLambda(N, s.c_w01, s.c_w0, s.c_w1, s.p, s.p0, s.p1);

                // Both values feed several penalties and the final decision,
                // so they are computed once per bigram and reused.
                var internal_periods = count_internal_periods(s.w0);
                var non_period_length = s.w0.Length - internal_periods;

                // Calculate the length penalty
                s.length_penalty = 1/Math.Exp(non_period_length);
                // Calculate the internal periods penalty
                s.internal_periods_penalty = internal_periods + 1;
                // Calculate the penalty for occurrences without the final period.
                // A zero base is replaced by 1 so that the power evaluates to 1
                // instead of the division blowing up on 0.
                var penalty_base = (non_period_length == 0) ? 1 : non_period_length;
                s.with_final_period_penalty = 1/Math.Pow(penalty_base, (s.c_w0 - s.c_w01));
                // scaled log likelihood
                s.scaled_llr = s.llr * s.length_penalty * s.internal_periods_penalty * s.with_final_period_penalty;

                // calculate the final classifier for each sentence final bigram
                var c = new Type_Classification();
                c.statistics = s;
                // annotate with the classification
                if (s.scaled_llr < 0.3)
                {
                    c.classification = "<S>";
                }
                else if (non_period_length == 0)
                {
                    // nothing but periods in front of the final period
                    c.classification = "<E>";
                }
                else
                {
                    c.classification = "<A>";
                }
                yield return c;
            }
        }
示例#2
0
 /// <summary>
 /// Builds a <c>Frequencies</c> table from the whitespace-separated tokens of
 /// the Emma sample text and writes every entry to the console as
 /// "key: value", ordered by ascending value.
 /// </summary>
 private static void freqs_from_emma_sample()
 {
     var tokens = TextExamples.emma().Split();
     var counts = new Frequencies<string>(tokens);
     var ascending = counts.Generate().OrderBy(kv => kv.Value);

     foreach (var entry in ascending)
     {
         var line = String.Format(@"{0}: {1}", entry.Key, entry.Value);
         Console.WriteLine(line);
     }
 }
示例#3
0
 /// <summary>
 /// Counts the 3-grams of the Emma sample text and prints the ten entries with
 /// the highest value, followed by <c>Count()</c> of the table and
 /// <c>Count()</c> of its terms.
 /// </summary>
 private static void frequencies_of_ngrams_in_emma_sample()
 {
     var sample = TextExamples.emma();
     var counts = new Frequencies<string>();

     // The capturing group makes Regex.Split return the separators as well;
     // the filters drop ", " and anything not_whitespace does not match.
     var trigrams = Regex.Split(sample, @"(\W+)")
         .Where(piece => piece != ", ")
         .Where(piece => TextTools.not_whitespace.IsMatch(piece))
         .NGram(3);

     foreach (var trigram in trigrams)
     {
         var phrase = trigram.Aggregate((left, right) => left + " " + right);
         counts.Add(phrase);
     }

     // Ascending sort reversed, then cut to the first ten entries.
     var topTen = counts.Generate().OrderBy(kv => kv.Value).Reverse().Take(10);
     foreach (var entry in topTen)
     {
         var line = String.Format(@"{0}: {1}", entry.Key, entry.Value);
         Console.WriteLine(line);
     }

     Console.WriteLine(counts.Count());
     Console.WriteLine(counts.Terms().Count());
 }
示例#4
0
        /// <summary>
        /// Step 1: Type based classification of period final bigrams.  In this step,
        /// we want to guess at a classification of period final bigrams whether they are
        /// ellipses, abbreviations, or sentence endings.  This procedure calculates the
        /// parameters that remain constant over all the enumerations and then delegates
        /// the per-bigram work to <c>enumerate_type_based_classification</c>.
        /// </summary>
        /// <param name="period_final_bigram_frequencies">Bigrams to classify; passed through unchanged.</param>
        /// <param name="nonperiod_final_bigram_frequencies">
        /// Not read by this method; it remains in the signature so existing callers keep compiling.
        /// </param>
        /// <param name="unigram_freqs">
        /// Unigram frequencies; their values are summed for the corpus size and the
        /// table is passed through for per-token lookups.
        /// </param>
        /// <param name="period_freq">Count of periods.</param>
        /// <returns>a list of type classifications</returns>
        public static IEnumerable<Type_Classification> type_based_classification_stage(
                    Frequencies<string> period_final_bigram_frequencies
                  , Frequencies<string> nonperiod_final_bigram_frequencies
                  , Frequencies<string> unigram_freqs
                  , int period_freq)
        {
            // count of periods
            double cw1 = Convert.ToDouble(period_freq);
            // number of tokens in the corpus: every unigram occurrence plus the periods
            double N = unigram_freqs.Generate().Select(a => a.Value).Sum() + cw1;

            // Do Collocation Bond
            // The paper identifies special values for probabilities on the null hypothesis
            // and the alternative (p. 5).
            double p = cw1/N;
            double p0 = 0.99;
            double p1 = 1.0 - p0;

            // return the results for each period final bigram
            return enumerate_type_based_classification(period_final_bigram_frequencies
                    , unigram_freqs, N, cw1, p, p0, p1);
        }
示例#5
0
 /// <summary>
 /// Reads the Austen corpus from "Data\Austen" under the user's documents
 /// folder, combines the first and last word of every 2-gram into one term,
 /// and prints each term starting with "Mr" together with its value as
 /// "term => value".
 /// </summary>
 private static void get_instances_of_Mr_from_austen()
 {
     var path = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments), @"Data\Austen");
     var austen = new TextCorpusReader(path);
     var freq = new Frequencies<string>();
     foreach (var w in austen.words().NGram(2))
     {
         // Append is not a framework string method; presumably a project
         // extension that concatenates the two words -- confirm at its definition.
         var term = w.First().Trim().Append(w.Last()).Trim();
         // Ordinal prefix test, the same comparison as Substring(0, 2) == "Mr".
         // A stricter filter would additionally reject terms whose third
         // character is "." or "s".
         if (term.StartsWith("Mr", StringComparison.Ordinal))
         {
             freq.Add(term);
         }
     }
     foreach (var t in freq.Generate())
     {
         Console.WriteLine("{0} => {1}", t.Key, t.Value);
     }
 }
示例#6
0
 /// <summary>
 /// Reads the corpus stored in "Data\inaugural" under the user's documents
 /// folder, counts its 3-grams (after dropping ", " words) and prints the ten
 /// entries with the highest value as "key: value".
 /// </summary>
 private static void read_trigram_frequencies_from_inaugural_addresses()
 {
     var documents = Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments);
     var corpus = new TextCorpusReader(Path.Combine(documents, @"Data\inaugural"));
     var counts = new Frequencies<string>();

     var trigrams = corpus.words().Where(word => word != ", ").NGram(3);
     foreach (var trigram in trigrams)
     {
         // DefaultIfEmpty keeps Aggregate from throwing on an empty n-gram.
         var phrase = trigram.DefaultIfEmpty("").Aggregate((left, right) => left + " " + right);
         counts.Add(phrase);
     }

     // Ascending sort reversed, then cut to the first ten entries.
     var topTen = counts.Generate().OrderBy(kv => kv.Value).Reverse().Take(10);
     foreach (var entry in topTen)
     {
         var line = String.Format(@"{0}: {1}", entry.Key, entry.Value);
         Console.WriteLine(line);
     }
 }