/// <summary>
        /// Learn Collocation pairs and triples according to patterns in Treebank-3 tagged files
        /// </summary>
        /// <remarks>
        /// the patterns are from:
        /// Justeson, John S., and Slava M. Katz. 1995. Technical terminology: some linguistic properties and 
        ///  an algorithm for identification in text. Natural Language Engineering 1:9-27.
        /// </remarks>
        /// <returns>IEnumerable of frequencies (KeyValuePairs of the collocated terms
        /// and the number of occurrences of that string) for each collocation found in the corpus,
        /// sorted in ascending order of frequency.
        /// </returns>
        public static IEnumerable<KeyValuePair<string, double>> collocated_terms_in_Treebank3()
        {
            // regex_filter_pattern turns this tag pattern into the pattern handed to Regex.Match below.
            var tag_pattern = RegexTools.regex_filter_pattern("<J\\S+|N\\S+><J\\S+|N\\S+|IN\\S|TO\\S>*<N\\S+>");

            // The corpus is read from "Data\Treebank-3" below the user's documents folder.
            var documents_dir = Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments);
            var corpus = new Treebank3CorpusReader(Path.Combine(documents_dir, @"Data\Treebank-3"));

            // Only the first pattern match of each tagged sentence is kept
            // (an empty string when the sentence has no match).
            var matched_spans = corpus.read_tagged_sents()
                .Select(sentence => Regex.Match(sentence, tag_pattern).Value);

            // Join the terms of each span with single spaces; a span without terms becomes "".
            var joined_terms = matched_spans
                .Select(span => TextTools.get_term_from_string(span)
                    .DefaultIfEmpty("")
                    .Aggregate((left, right) => left + " " + right));

            // Tally the joined terms via Freqs() and emit the pairs in ascending order of their value.
            var ascending_by_value = joined_terms.Freqs().Generate().OrderBy(pair => pair.Value);
            foreach (var term_frequency in ascending_by_value)
            {
                yield return term_frequency;
            }
        }
        /// <summary>
        /// Pairs every bigram of the Treebank-3 corpus with a LikelihoodRatioData object built from
        /// the bigram frequency, the two words of the bigram and the unigram frequencies.
        /// </summary>
        /// <returns>One KeyValuePair per bigram: the bigram text (two words joined by a space)
        /// and its LikelihoodRatioData.</returns>
        public static IEnumerable<KeyValuePair<string, LikelihoodRatio.LikelihoodRatioData<string, string>>> collocations_from_likelihood_ratio_Treebank3()
        {
            // The corpus is read from "Data\Treebank-3" below the user's documents folder.
            var corpus_dir = Path.Combine(
                Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments), @"Data\Treebank-3");
            var corpus = new Treebank3CorpusReader(corpus_dir);

            // Bigrams are formed from words that are not ", ", are not punctuation
            // and match TextTools.not_whitespace.
            var bigram_counts = corpus.words()
                .Where(word => word != ", ")
                .Where(word => !TextTools.is_puctuation(word))
                .Where(word => TextTools.not_whitespace.IsMatch(word))
                .NGram(2)
                .Select(gram => gram.Aggregate((first, second) => first + " " + second))
                .Freqs();

            // Unigrams use the same filter except that punctuation is not removed.
            var unigram_counts = corpus.words()
                .Where(word => word != ", ")
                .Where(word => TextTools.not_whitespace.IsMatch(word))
                .Freqs();

            // Count() of the bigram frequency table is what gets passed as the first constructor argument.
            var bigram_total = bigram_counts.Count();

            foreach (var bigram in bigram_counts.Generate())
            {
                // The key is "word1 word2"; Split() separates it on whitespace.
                var bigram_words = bigram.Key.Split();
                var ratio_data = new LikelihoodRatio.LikelihoodRatioData<string, string>(
                    bigram_total, bigram.Key, bigram.Value, bigram_words, unigram_counts);

                yield return new KeyValuePair<string, LikelihoodRatio.LikelihoodRatioData<string, string>>(
                    bigram.Key, ratio_data);
            }
            // TODO: chi-square comparison to generate collocations
        }
Beispiel #3
0
        /// <summary>
        /// Prints "bigram: value" for the first 2000 results of LikelihoodRatio.log_likelihood_ratio
        /// over the Treebank-3 corpus, taken in ascending order of value.
        /// </summary>
        private static void test_log_likelihood_collocation()
        {
            // The corpus is read from "Data\Treebank-3" below the user's documents folder.
            var corpus_dir = Path.Combine(
                Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments), @"Data\Treebank-3");
            var corpus = new Treebank3CorpusReader(corpus_dir);

            // Bigrams are formed from words that are not ", ", are not punctuation
            // and match TextTools.not_whitespace.
            var bigram_counts = corpus.words()
                .Where(word => word != ", ")
                .Where(word => !TextTools.is_puctuation(word))
                .Where(word => TextTools.not_whitespace.IsMatch(word))
                .NGram(2)
                .Select(gram => gram.Aggregate((first, second) => first + " " + second))
                .Freqs();

            // Unigrams use the same filter except that punctuation is not removed.
            var unigram_counts = corpus.words()
                .Where(word => word != ", ")
                .Where(word => TextTools.not_whitespace.IsMatch(word))
                .Freqs();

            var lowest_2000 = LikelihoodRatio.log_likelihood_ratio(unigram_counts, bigram_counts)
                .OrderBy(pair => pair.Value)
                .Take(2000);

            foreach (var scored_bigram in lowest_2000)
            {
                var line = String.Format(@"{0}: {1}", scored_bigram.Key, scored_bigram.Value);
                Console.WriteLine(line);
            }
        }
Beispiel #4
0
 /// <summary>
 /// Prints the distinct Treebank-3 words ending in a period together with the concatenated
 /// word pairs whose second element is ".", followed by three counts: entries with an ending
 /// period, word pairs without one, and the sum of both.
 /// </summary>
 private static void read_treebank3_period_bigrams()
 {
     // The corpus is read from "Data\Treebank-3" below the user's documents folder.
     var documents_dir = Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments);
     var corpus = new Treebank3CorpusReader(Path.Combine(documents_dir, @"Data\Treebank-3"));

     // Non-empty words whose last character is a period.
     var period_words = corpus.words()
         .Where(word => word != "")
         .Where(word => word[word.Length - 1] == '.');

     // Word pairs whose second element is exactly ".", glued together without a separator.
     var period_pairs = corpus.words()
         .NGram(2)
         .Where(gram => gram.Last() == ".")
         .Select(gram => gram.Aggregate((left, right) => left + right));

     // Pairs built only from non-empty, non-punctuation words that do not end in a period.
     var pairs_without_period = corpus.words()
         .Where(word => word != "")
         .Where(word => word[word.Length - 1] != '.')
         .Where(word => !TextTools.is_puctuation(word))
         .NGram(2)
         .Where(gram => gram.Last() != ".")
         .Select(gram => gram.Aggregate((left, right) => left + right));

     // Union prints each distinct entry once; the counts below use Concat and keep duplicates.
     foreach (var entry in period_words.Union(period_pairs))
     {
         Console.WriteLine(entry);
     }

     Console.WriteLine(@"ending period: {0}", period_words.Concat(period_pairs).Count());
     Console.WriteLine(@" no ending period: {0}", pairs_without_period.Count());
     Console.WriteLine(@" total: {0}", pairs_without_period.Concat(period_words.Concat(period_pairs)).Count());
 }
Beispiel #5
0
 /// <summary>
 /// Writes every word returned by the Treebank-3 corpus reader to the console, one per line.
 /// </summary>
 private static void read_treebank3()
 {
     // The corpus is read from "Data\Treebank-3" below the user's documents folder.
     var documents_dir = Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments);
     var corpus = new Treebank3CorpusReader(Path.Combine(documents_dir, @"Data\Treebank-3"));

     foreach (var word in corpus.words())
     {
         Console.WriteLine(word);
     }
 }
Beispiel #6
0
        /// <summary>
        /// Prints the generated filter pattern, then up to 1000 "term : value" lines for the
        /// pattern matches found in the Treebank-3 tagged sentences, highest value first.
        /// </summary>
        private static void most_common_collocated_NPs_in_Treebank3()
        {
            // regex_filter_pattern turns this tag pattern into the pattern handed to Regex.Match below.
            var tag_pattern = RegexTools.regex_filter_pattern("<J\\S+|N\\S+><J\\S+|N\\S+|IN\\S|TO\\S>*<N\\S+>");
            Console.WriteLine(tag_pattern);

            // The corpus is read from "Data\Treebank-3" below the user's documents folder.
            var documents_dir = Environment.GetFolderPath(Environment.SpecialFolder.MyDocuments);
            var corpus = new Treebank3CorpusReader(Path.Combine(documents_dir, @"Data\Treebank-3"));

            // Only the first pattern match of each tagged sentence is kept
            // (an empty string when the sentence has no match).
            var matched_spans = corpus.read_tagged_sents()
                .Select(sentence => Regex.Match(sentence, tag_pattern).Value);

            // Join the terms of each span with single spaces; a span without terms becomes "".
            var joined_terms = matched_spans
                .Select(span => TextTools.get_term_from_string(span)
                    .DefaultIfEmpty("")
                    .Aggregate((left, right) => left + " " + right));

            // Ascending sort followed by Reverse puts the largest values first.
            var top_terms = joined_terms.Freqs().Generate()
                .OrderBy(pair => pair.Value)
                .Reverse()
                .Take(1000);

            foreach (var term_frequency in top_terms)
            {
                Console.WriteLine("{0} : {1}", term_frequency.Key, term_frequency.Value);
            }
        }