예제 #1
0
        /// <summary>
        /// Counts how often each token occurs across all of the supplied articles and
        /// returns only the tokens that occur more than twice.
        /// </summary>
        /// <param name="articles">Article texts to tokenize; must not be <c>null</c>.</param>
        /// <returns>
        /// A new dictionary mapping each token seen at least three times (summed over
        /// every article) to its total number of occurrences.
        /// </returns>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="articles"/> is <c>null</c>.
        /// </exception>
        public static Dictionary<string, int> CountTokens(IEnumerable<string> articles)
        {
            // Tokens seen fewer times than this are dropped from the result.
            const int MinimumOccurrences = 3;

            // Fail fast with a descriptive exception instead of a NullReferenceException
            // raised from inside the loop below.
            if (articles == null)
            {
                throw new System.ArgumentNullException("articles");
            }

            Tokenizer tokenizer = new SimpleTokenizer();

            // Default string comparer: tokens are counted case-sensitively, exactly as
            // the tokenizer emits them.
            var counts = new Dictionary<string, int>();
            foreach (var article in articles)
            {
                foreach (var token in tokenizer.tokenize(article))
                {
                    // TryGetValue leaves the count at 0 for a token not seen before.
                    int seenSoFar;
                    counts.TryGetValue(token, out seenSoFar);
                    counts[token] = seenSoFar + 1;
                }
            }

            return counts.Where(p => p.Value >= MinimumOccurrences)
                         .ToDictionary(p => p.Key, p => p.Value);
        }