/// <summary>
/// Builds an <see cref="EnglishWord"/> through its parameterless constructor
/// and then initializes it by calling <c>Create</c> with the supplied text.
/// </summary>
/// <param name="text">The text passed to <c>Create</c>.</param>
/// <returns>The newly constructed and initialized word instance.</returns>
internal static EnglishWord CreateForTest(string text)
{
    var instance = new EnglishWord();
    instance.Create(text);
    return instance;
}
/// <summary>
/// Stems every document and returns the stems whose total number of
/// occurrences across all documents is at least <paramref name="vocabularyThreshold"/>.
/// </summary>
/// <param name="docs">The raw documents to process.</param>
/// <param name="stemmedDocs">
/// Receives one list per input document (in input order) holding the non-empty
/// stems of that document's words, with stop words removed.
/// </param>
/// <param name="vocabularyThreshold">
/// Minimum number of occurrences (summed over all documents) a stem needs
/// in order to be included in the returned vocabulary.
/// </param>
/// <returns>The stems that meet the occurrence threshold.</returns>
private static List<string> GetVocabulary(string[] docs, out List<List<string>> stemmedDocs, int vocabularyThreshold)
{
    var stemCounts = new Dictionary<string, int>();
    stemmedDocs = new List<List<string>>();

    var docIndex = 0;
    foreach (var doc in docs)
    {
        // Emit a progress line every 100 documents.
        docIndex++;
        if (docIndex % 100 == 0)
        {
            Console.WriteLine("Processing " + docIndex + "/" + docs.Length);
        }

        var stemmedDoc = new List<string>();
        foreach (var part in WordsProcessing(doc))
        {
            // Keep only ASCII letters and digits before the stop-word check and stemming.
            var stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");

            // Invariant lowering so the stop-word match does not depend on the
            // current culture (e.g. 'I' lowers to a dotless i under tr-TR).
            if (StopWords.StopWordsList.Contains(stripped.ToLowerInvariant()))
            {
                continue;
            }

            try
            {
                var stem = new EnglishWord(stripped).Stem;
                if (stem.Length > 0)
                {
                    // TryGetValue leaves 'seen' at 0 for a new stem, so the first
                    // occurrence is counted as 1 (previously it was stored as 0,
                    // making every count one too low against the threshold).
                    int seen;
                    stemCounts.TryGetValue(stem, out seen);
                    stemCounts[stem] = seen + 1;

                    stemmedDoc.Add(stem);
                }
            }
            catch (Exception e)
            {
                // Log the failure for diagnostics, then let it propagate unchanged.
                Console.WriteLine(e);
                throw;
            }
        }

        stemmedDocs.Add(stemmedDoc);
    }

    return stemCounts
        .Where(entry => entry.Value >= vocabularyThreshold)
        .Select(entry => entry.Key)
        .ToList();
}