コード例 #1
0
ファイル: English.cs プロジェクト: tigella/CrisisTracker
        /// <summary>
        /// Builds an <see cref="EnglishWord"/> via its parameterless constructor and
        /// then calls <c>Create</c> on it with the supplied text.
        /// </summary>
        /// <param name="text">Text handed to <c>Create</c>.</param>
        /// <returns>The instance after <c>Create</c> has run.</returns>
        internal static EnglishWord CreateForTest(string text)
        {
            var instance = new EnglishWord();
            instance.Create(text);

            return instance;
        }
コード例 #2
0
ファイル: Tokenizer.cs プロジェクト: nikdon/SimilarityMeasure
        /// <summary>
        /// Tokenizes and stems a list of documents and returns the vocabulary of stems
        /// whose stored count reaches <paramref name="vocabularyThreshold"/>.
        /// </summary>
        /// <param name="docs">Raw documents to process.</param>
        /// <param name="stemmedDocs">
        /// Receives one list per input document (in input order) holding that
        /// document's non-empty stems, stop words excluded.
        /// </param>
        /// <param name="vocabularyThreshold">Minimum stored count a stem needs to be returned.</param>
        /// <returns>The stems whose stored count is at least the threshold.</returns>
        public List<string> GetVocabulary(List<string> docs, out List<List<string>> stemmedDocs, int vocabularyThreshold)
        {
            List<string> vocabulary = new List<string>();
            Dictionary<string, int> wordCountList = new Dictionary<string, int>();
            stemmedDocs = new List<List<string>>();

            foreach (var doc in docs)
            {
                List<string> stemmedDoc = new List<string>();

                foreach (string part in Tokenize(doc))
                {
                    // Strip non-alphanumeric characters
                    string stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");

                    // The token is ASCII-only at this point, so lower-case it with the
                    // invariant culture; a culture-sensitive ToLower() would turn "I" into
                    // a dotless i under Turkish cultures and miss stop words such as "in".
                    if (StopWords.stopWordsList.Contains(stripped.ToLowerInvariant()))
                    {
                        continue;
                    }

                    try
                    {
                        string stem = new EnglishWord(stripped).Stem;

                        if (stem.Length > 0)
                        {
                            // Build the word count list with a single dictionary lookup.
                            // NOTE(review): the first sighting is stored as 0, so the stored
                            // value is (occurrences - 1). Kept as-is because existing
                            // threshold values depend on it - confirm whether intended.
                            int count;
                            wordCountList[stem] = wordCountList.TryGetValue(stem, out count) ? count + 1 : 0;

                            stemmedDoc.Add(stem);
                        }
                    }
                    catch (Exception e)
                    {
                        // Best effort: a token that fails to stem is reported and skipped.
                        Console.WriteLine("Tokenizer exception source: {0}", e.Message);
                    }
                }

                stemmedDocs.Add(stemmedDoc);
            }

            // Get the top words
            foreach (var entry in wordCountList)
            {
                if (entry.Value >= vocabularyThreshold)
                {
                    vocabulary.Add(entry.Key);
                }
            }

            return vocabulary;
        }
コード例 #3
0
        /// <summary>
        /// Returns the <c>Stem</c> of an <see cref="EnglishWord"/> constructed from the given word.
        /// </summary>
        /// <param name="Word">Word passed to the <see cref="EnglishWord"/> constructor.</param>
        /// <returns>The value of the constructed word's <c>Stem</c> property.</returns>
        public string Stem(string Word)
        {
            return new EnglishWord(Word).Stem;
        }
コード例 #4
0
ファイル: English.cs プロジェクト: tigella/CrisisTracker
        /// <summary>
        /// Creates a word through <c>CreateForTest</c> and fills in its <c>_r1</c> and
        /// <c>_r2</c> fields using <c>CalculateR</c>.
        /// </summary>
        /// <param name="text">Text forwarded to <c>CreateForTest</c>.</param>
        /// <returns>The word with both fields assigned.</returns>
        internal static EnglishWord CreateWithR1R2(string text)
        {
            EnglishWord word = CreateForTest(text);

            // _r1 has to be assigned first: _r2 is calculated starting from _r1.Start.
            word._r1 = CalculateR(word.Stem, 0);
            word._r2 = CalculateR(word.Stem, word._r1.Start);

            return word;
        }
コード例 #5
0
 /// <summary>
 /// Builds an <see cref="EnglishWord"/> via its parameterless constructor and
 /// then calls <c>Create</c> on it with the supplied text.
 /// </summary>
 /// <param name="text">Text handed to <c>Create</c>.</param>
 /// <returns>The instance after <c>Create</c> has run.</returns>
 internal static EnglishWord CreateForTest(string text)
 {
     var instance = new EnglishWord();

     instance.Create(text);

     return instance;
 }
コード例 #6
0
 /// <summary>
 /// Trims and lower-cases a term, then stems it with the stemmer selected by
 /// <c>_language</c>.
 /// </summary>
 /// <param name="term">Term to stem; may be null.</param>
 /// <returns>
 /// The stem produced for the configured language, the normalized term itself when
 /// the language has no case in the switch, or null when the term is null or is
 /// empty after trimming.
 /// </returns>
 private string StemTerm(string term)
 {
     // Guard before dereferencing: a null term gets the same result as an empty one
     // instead of a NullReferenceException from Trim().
     if (term == null) return null;

     // Invariant lower-casing keeps the stemmer input independent of the current
     // thread culture (e.g. "I" must not become a dotless i under Turkish cultures).
     string massagedTerm = term.Trim().ToLowerInvariant();
     if (massagedTerm.Length == 0) return null;

     switch (_language) {
         case ConfigHandler.Language.English:
             return new EnglishWord(massagedTerm).Stem;
         case ConfigHandler.Language.German:
             return new GermanStemmer().Stem(massagedTerm);
         case ConfigHandler.Language.Russian:
             return new RussianStemmer().Stem(massagedTerm);
         case ConfigHandler.Language.French:
             return new FrenchStemmer().Stem(massagedTerm);
         default:
             // No stemmer for this language: return the normalized term unchanged.
             return massagedTerm;
     }
 }