예제 #1
0
        /// <summary>
        /// Lazily yields one <see cref="Type_Classification"/> for every entry produced by
        /// <paramref name="period_final_bigram_frequencies"/>, together with the statistics
        /// that were computed to reach that classification.
        /// </summary>
        /// <remarks>
        /// For each bigram the last character of its key is stripped to obtain <c>w0</c>, the
        /// log-likelihood ratio is obtained from <c>LikelihoodRatio.logLambda</c>, and that ratio
        /// is multiplied by three factors (length, internal periods, occurrences without the
        /// final period) to give the scaled ratio. The classification is <c>&lt;S&gt;</c> when
        /// the scaled ratio is below 0.3; otherwise <c>&lt;E&gt;</c> when the length of <c>w0</c>
        /// minus its internal-period count is zero, and <c>&lt;A&gt;</c> in every other case.
        /// Because this is an iterator, nothing is computed until the result is enumerated.
        /// </remarks>
        /// <param name="period_final_bigram_frequencies">Frequencies whose keys are expected to end
        /// with a single trailing period; the key supplies <c>w01</c> and the value <c>c_w01</c>.</param>
        /// <param name="unigram_freqs">Frequencies queried with <c>w0</c> to obtain <c>c_w0</c>.</param>
        /// <param name="N">Passed unchanged as the first argument of <c>LikelihoodRatio.logLambda</c>
        /// (presumably the total token count; confirm against that method).</param>
        /// <param name="cw1">Stored as <c>c_w1</c>, the count associated with <c>w1</c> (the period token).</param>
        /// <param name="p">Forwarded to <c>LikelihoodRatio.logLambda</c>.</param>
        /// <param name="p0">Forwarded to <c>LikelihoodRatio.logLambda</c>.</param>
        /// <param name="p1">Forwarded to <c>LikelihoodRatio.logLambda</c>.</param>
        /// <returns>A deferred sequence with one classification per input bigram, in the order
        /// the bigrams are generated.</returns>
        public static IEnumerable<Type_Classification> enumerate_type_based_classification(
                  Frequencies<string> period_final_bigram_frequencies
                  , Frequencies<string> unigram_freqs
                  , double N
                  , double cw1
                  , double p
                  , double p0
                  , double p1
                  )
        {
            // Scaled ratios below this value are labelled "<S>"; everything else is "<E>" or "<A>".
            const double scaled_llr_threshold = 0.3;

            foreach (var bigram in period_final_bigram_frequencies.Generate())
            {
                var s = new Type_Classifier_Statistics();

                s.c_w1 = cw1;
                s.p = p;
                s.p0 = p0;
                s.p1 = p1;
                s.w1 = ".";
                s.w01 = bigram.Key;

                // Split off the ending period (the last character of the key).
                // NOTE(review): an empty key would make Substring throw; callers are
                // assumed to supply only non-empty, period-final keys.
                s.w0 = bigram.Key.Substring(0, bigram.Key.Length - 1);
                s.c_w01 = bigram.Value;
                // Find the unigram frequency of the term without its final period.
                s.c_w0 = unigram_freqs.Get(s.w0);

                s.llr = LikelihoodRatio.logLambda(N, s.c_w01, s.c_w0, s.c_w1, s.p, s.p0, s.p1);

                // Both quantities are reused by every penalty below and by the final
                // classification, so compute them once per bigram.
                var internal_periods = count_internal_periods(s.w0);
                var non_period_length = s.w0.Length - internal_periods;

                // Length penalty: shrinks exponentially with the non-period length.
                s.length_penalty = 1 / Math.Exp(non_period_length);
                // Internal periods penalty: grows with the number of internal periods.
                s.internal_periods_penalty = internal_periods + 1;
                // Penalty for occurrences without the final period; a zero length is
                // replaced by 1 so the base of the power is never zero.
                var len = (non_period_length == 0) ? 1 : non_period_length;
                s.with_final_period_penalty = 1 / Math.Pow(len, (s.c_w0 - s.c_w01));

                // Scaled log likelihood.
                s.scaled_llr = s.llr * s.length_penalty * s.internal_periods_penalty * s.with_final_period_penalty;

                // Build the final classification for this period-final bigram.
                var c = new Type_Classification();
                c.statistics = s;

                if (s.scaled_llr < scaled_llr_threshold)
                {
                    c.classification = "<S>";
                }
                else if (non_period_length == 0)
                {
                    // Nothing remains in w0 once internal periods are discounted.
                    c.classification = "<E>";
                }
                else
                {
                    c.classification = "<A>";
                }

                yield return c;
            }
        }
예제 #2
0
 /// <summary>
 /// Builds a <c>Frequencies&lt;string&gt;</c> from the text returned by
 /// <c>TextExamples.emma()</c> and checks the total count, the count of the
 /// token "and", and the number of distinct terms.
 /// </summary>
 public void test_create_frequency_object_from_text()
 {
     var text = TextExamples.emma();
     var freq = new Frequencies<string>();

     // The pattern has a capturing group, so Regex.Split also returns the
     // non-word delimiter runs as tokens; the expected numbers below rely on that.
     foreach (var token in Regex.Split(text, @"(\W+)"))
     {
         freq.Add(token);
     }

     // Expected value first, actual second, so a failure message reports
     // the two values the right way round.
     Assert.AreEqual(2227, freq.Count());
     Assert.AreEqual(47.0, freq.Get("and"));
     Assert.AreEqual(479, freq.Terms().Count());
 }
예제 #3
0
 /// <summary>
 /// Adding a single term to an empty <c>Frequencies&lt;string&gt;</c> must make
 /// <c>Get</c> report a value equal to 1 for that term.
 /// </summary>
 public void test_add_to_frequencies()
 {
     // Arrange
     Frequencies<string> counts = new Frequencies<string>();

     // Act
     counts.Add("a");
     var observed = counts.Get("a");

     // Assert
     Assert.AreEqual(1, observed);
 }