// Test-only factory: builds an EnglishWord through its parameterless
// constructor and then initializes it from the given text via Create.
internal static EnglishWord CreateForTest(string text)
{
    var instance = new EnglishWord();
    instance.Create(text);
    return instance;
}
/// <summary>
/// Parses and tokenizes a list of documents, returning a vocabulary of words.
/// </summary>
/// <param name="docs">Documents to process; each one is split into tokens by <c>Tokenize</c>.</param>
/// <param name="stemmedDocs">
/// Receives one list per input document, in input order, holding the non-empty
/// stems of that document's tokens that are not in the stop-word list.
/// </param>
/// <param name="vocabularyThreshold">
/// Minimum number of occurrences (summed over all documents) a stem needs in
/// order to be included in the returned vocabulary.
/// </param>
/// <returns>The stems whose total occurrence count is at least <paramref name="vocabularyThreshold"/>.</returns>
public List<string> GetVocabulary(List<string> docs, out List<List<string>> stemmedDocs, int vocabularyThreshold)
{
    Dictionary<string, int> wordCountList = new Dictionary<string, int>();
    stemmedDocs = new List<List<string>>();

    foreach (var doc in docs)
    {
        List<string> stemmedDoc = new List<string>();

        foreach (string part in Tokenize(doc))
        {
            // Strip non-alphanumeric characters
            string stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");

            // The token is pure ASCII at this point, so lower-case it with the
            // invariant culture: a culture-sensitive ToLower() maps "I" to a
            // dotless i under Turkish cultures and would miss the stop word.
            if (StopWords.stopWordsList.Contains(stripped.ToLowerInvariant()))
            {
                continue;
            }

            try
            {
                string stem = new EnglishWord(stripped).Stem;

                if (stem.Length > 0)
                {
                    // Build the word count list. TryGetValue leaves count at 0
                    // for an unseen stem, so the first occurrence is stored as 1
                    // (it used to be stored as 0, under-counting every stem).
                    int count;
                    wordCountList.TryGetValue(stem, out count);
                    wordCountList[stem] = count + 1;

                    stemmedDoc.Add(stem);
                }
            }
            catch (Exception e)
            {
                // Best effort: a token that fails to stem is reported and skipped.
                Console.WriteLine("Tokenizer exception source: {0}", e.Message);
            }
        }

        stemmedDocs.Add(stemmedDoc);
    }

    // Keep only the stems that occur often enough.
    return wordCountList
        .Where(w => w.Value >= vocabularyThreshold)
        .Select(w => w.Key)
        .ToList();
}
// Returns the stem of the given word, as computed by EnglishWord.
public string Stem(string Word)
{
    return new EnglishWord(Word).Stem;
}
// Test-only factory: creates a word via CreateForTest and then fills in its
// _r1 and _r2 fields. _r1 is computed from offset 0; _r2 is computed starting
// at the Start of the freshly assigned _r1, so the order of the two
// assignments matters.
internal static EnglishWord CreateWithR1R2(string text)
{
    EnglishWord word = CreateForTest(text);

    word._r1 = CalculateR(word.Stem, 0);
    word._r2 = CalculateR(word.Stem, word._r1.Start);

    return word;
}
// Test-only factory: builds an EnglishWord through its parameterless
// constructor and then initializes it from the given text via Create.
internal static EnglishWord CreateForTest(string text)
{
    var created = new EnglishWord();
    created.Create(text);
    return created;
}
/// <summary>
/// Normalizes a term (trim, then lower-case) and stems it with the stemmer
/// that matches the configured <c>_language</c>.
/// </summary>
/// <param name="term">The raw term; may be null.</param>
/// <returns>
/// The stem of the normalized term; <c>null</c> when <paramref name="term"/> is
/// null, empty or only whitespace; or the normalized term itself when
/// <c>_language</c> has no dedicated case below.
/// </returns>
private string StemTerm(string term)
{
    // A null term is treated like an empty one instead of failing on Trim().
    if (term == null)
    {
        return null;
    }

    string massagedTerm = term.Trim().ToLower();
    if (String.IsNullOrEmpty(massagedTerm))
    {
        return null;
    }

    switch (_language)
    {
        case ConfigHandler.Language.English:
            return new EnglishWord(massagedTerm).Stem;

        case ConfigHandler.Language.German:
            return new GermanStemmer().Stem(massagedTerm);

        case ConfigHandler.Language.Russian:
            return new RussianStemmer().Stem(massagedTerm);

        case ConfigHandler.Language.French:
            return new FrenchStemmer().Stem(massagedTerm);

        default:
            // No stemmer for this language: hand back the normalized term.
            return massagedTerm;
    }
}