/// <summary>
/// Converts each document into the distinct vocabulary indices of the stemmed words it contains.
/// </summary>
/// <param name="docs">Documents to index. A null entry produces an empty index array at the same position.</param>
/// <param name="vocabulary">Vocabulary list; a word's index is its first position in this list.</param>
/// <returns>
/// One array per document, in the same order and with the same length as <paramref name="docs"/>.
/// Each array holds the distinct vocabulary indices found in that document, in order of first appearance.
/// </returns>
public static int[][] GetWordIndexStemmedDocs(string[] docs, List<string> vocabulary)
{
    // Map every vocabulary word to its first position once, so each token costs a hash
    // lookup instead of two linear scans (Contains + IndexOf) over the whole vocabulary.
    Dictionary<string, int> vocabularyIndex = new Dictionary<string, int>();
    for (int i = 0; i < vocabulary.Count; i++)
    {
        string word = vocabulary[i];
        if (word != null && !vocabularyIndex.ContainsKey(word))
        {
            vocabularyIndex.Add(word, i);
        }
    }

    int[][] wordIndex = new int[docs.Length][];

    // Index by position so that a null document leaves an empty slot instead of
    // shifting every later document's indices into the wrong slot.
    for (int docIndex = 0; docIndex < docs.Length; docIndex++)
    {
        string doc = docs[docIndex];
        if (doc == null)
        {
            wordIndex[docIndex] = new int[0];
            continue;
        }

        List<int> wordIndexDoc = new List<int>();
        foreach (string part in Tokenize(doc.ToLower()))
        {
            // Strip non-alphanumeric characters.
            string stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");

            try
            {
                string stem = new EnglishWord(stripped).Stem;

                int index;
                if (stem != null && vocabularyIndex.TryGetValue(stem, out index))
                {
                    wordIndexDoc.Add(index);
                }
            }
            catch
            {
                // Best effort: a token for which EnglishWord throws is skipped
                // rather than failing the whole document.
            }
        }

        // Distinct keeps the first occurrence of each index, preserving appearance order.
        wordIndex[docIndex] = wordIndexDoc.Distinct().ToArray();
    }

    return wordIndex;
}
/// <summary>
/// Tokenizes a list of documents, drops stop words, and returns the vocabulary of words
/// whose recorded count reaches the given threshold.
/// </summary>
/// <param name="docs">Documents to process. A null entry is treated as an empty document.</param>
/// <param name="stemmedDocs">
/// Receives one word list per document, in the same order as <paramref name="docs"/>,
/// containing every non-stop-word kept from that document (duplicates included).
/// </param>
/// <param name="vocabularyThreshold">
/// Minimum recorded count a word needs to enter the vocabulary. The recorded count starts
/// at 0 on the first occurrence, so a word must occur at least
/// <paramref name="vocabularyThreshold"/> + 1 times to qualify.
/// </param>
/// <returns>Vocabulary (list of strings)</returns>
private static List<string> GetVocabulary(string[] docs, out List<List<string>> stemmedDocs, int vocabularyThreshold)
{
    Dictionary<string, int> wordCountList = new Dictionary<string, int>();
    stemmedDocs = new List<List<string>>();

    // A set gives constant-time stop-word checks instead of scanning an array per token.
    HashSet<string> stopWords = new HashSet<string>(File.ReadAllLines(Path.Combine("Data", "stopwords.txt")));

    int docIndex = 0;
    foreach (var doc in docs)
    {
        List<string> stemmedDoc = new List<string>();

        docIndex++;
        if (docIndex % 10000 == 0)
        {
            Console.WriteLine("Processing " + docIndex + "/" + docs.Length);
        }

        // A null document contributes an empty list so stemmedDocs stays aligned with docs.
        if (doc == null)
        {
            stemmedDocs.Add(stemmedDoc);
            continue;
        }

        foreach (string part in Tokenize(doc.ToLower()))
        {
            // Strip non-alphanumeric characters. The document was lower-cased above and only
            // ASCII letters and digits survive this step, so the token is already lower-case.
            string stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");
            if (stopWords.Contains(stripped))
            {
                continue;
            }

            try
            {
                // NOTE(review): this records EnglishWord.Original, while GetWordIndexStemmedDocs
                // looks up EnglishWord.Stem in the vocabulary - confirm which one is intended.
                string word = new EnglishWord(stripped).Original;
                if (word.Length > 0)
                {
                    // Build the word count list.
                    // NOTE(review): the first occurrence is stored as 0, so the recorded count is
                    // (occurrences - 1). Kept as-is because the threshold is compared against it.
                    int count;
                    if (wordCountList.TryGetValue(word, out count))
                    {
                        wordCountList[word] = count + 1;
                    }
                    else
                    {
                        wordCountList.Add(word, 0);
                    }

                    stemmedDoc.Add(word);
                }
            }
            catch
            {
                // Best effort: a token for which EnglishWord throws is skipped
                // rather than failing the whole run.
            }
        }

        stemmedDocs.Add(stemmedDoc);
    }

    // Get the top words.
    return wordCountList
        .Where(w => w.Value >= vocabularyThreshold)
        .Select(w => w.Key)
        .ToList();
}