/// <summary>
/// Reads <paramref name="textStream"/> to its end, feeds every byte as one character into a
/// <c>WordStream</c>, drops the words listed in <c>WordsBlacklist.Instance</c> and stems the rest.
/// </summary>
/// <param name="textStream">Stream to consume. Each byte is cast directly to a <c>char</c>.</param>
/// <returns>
/// A fully materialized sequence of stemmed terms. Because the blacklist is applied with
/// <c>Enumerable.Except</c> (a set difference), each distinct word is stemmed only once.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="textStream"/> is <c>null</c>.</exception>
public IEnumerable<string> GetTextTerms(Stream textStream)
{
    if (textStream == null)
    {
        throw new ArgumentNullException(nameof(textStream));
    }

    var words = new List<string>();
    var wordStream = new WordStream();
    wordStream.WordGenerated += (source, generatedWord) => { words.Add(generatedWord); };

    var buffer = new byte[BufferSizeInKB];
    int readBytes;

    // Stream.Read may return fewer bytes than requested without being at the end of the
    // stream; only a return value of 0 signals the end, so loop until then.
    while ((readBytes = textStream.Read(buffer, 0, buffer.Length)) > 0)
    {
        // Only the first readBytes entries are fresh; the rest of the buffer still holds
        // data from an earlier read (or zeros) and must not be fed to the word stream.
        for (var index = 0; index < readBytes; index++)
        {
            wordStream.AddCharacter((char)buffer[index]);
        }
    }

    // NOTE(review): the previous implementation implicitly pushed the zero-filled tail of the
    // buffer after the last real byte. A single NUL is kept so a word still pending inside
    // WordStream sees the same terminator - confirm WordStream emits on non-letter characters,
    // or replace this with an explicit flush if WordStream offers one.
    wordStream.AddCharacter('\0');

    var filteredWords = words.Except(WordsBlacklist.Instance);
    return filteredWords.Select(stemmer.StemString).ToList();
}
/// <summary>
/// Walks every word produced by a <c>WordStream</c> over <paramref name="words"/> and, per word,
/// passes the word itself and the trailing two-, three- and four-word windows to the
/// corresponding <c>Handle*</c> methods, then saves the context.
/// </summary>
/// <param name="words">Stream handed to the <c>WordStream</c> constructor.</param>
public void Learn(Stream words)
{
    const int HistoryLength = 4;

    var source = new WordStream(words);

    // Sliding window over the most recent words, pre-filled with empty strings so the
    // first few words still yield full-size windows.
    var history = new LinkedList<string>(Enumerable.Repeat(String.Empty, HistoryLength));

    foreach (var word in source)
    {
        // NOTE(review): nothing in this method makes the transaction ambient or hands it to
        // `context` - verify that SaveChanges actually takes part in it.
        using (var transaction = new CommittableTransaction())
        {
            // Slide the window: the oldest word leaves, the current one enters.
            history.RemoveFirst();
            history.AddLast(word);

            HandleWord(word);

            var lastTwo = history.Skip(2).Take(2).ToList();
            HandleTwoGram(lastTwo);

            var lastThree = history.Skip(1).Take(3).ToList();
            var gram = HandleThreeGram(lastThree);

            var lastFour = history.ToList();
            HandleFourGram(lastFour, gram);

            context.SaveChanges();
            transaction.Commit();
        }
    }
}