/// <summary>
/// Feeds the same text into <paramref name="wordCollection"/> a fixed number of times.
/// </summary>
/// <param name="numberOfCopies">How many times <c>AddWords</c> is invoked; zero or negative values result in no calls.</param>
/// <param name="wordCollection">Collection that receives the text.</param>
/// <param name="inputCharacter">Text passed to each <c>AddWords</c> call.</param>
private static void AddWords(int numberOfCopies, WordCollection wordCollection, string inputCharacter)
{
    var remaining = numberOfCopies;
    while (remaining > 0)
    {
        // The second argument is fixed at 4 for every copy.
        wordCollection.AddWords(inputCharacter, 4);
        remaining--;
    }
}
/// <summary>
/// Builds a <see cref="WordCollection"/> by passing every line of the file at <c>_file</c>
/// to <c>WordCollection.AddWords</c>.
/// </summary>
/// <param name="maxCodeLength">Value forwarded unchanged to <c>AddWords</c> for each line read.</param>
/// <returns>The collection after all lines have been added.</returns>
/// <exception cref="InvalidOperationException">The file at <c>_file</c> does not exist.</exception>
public WordCollection GetWordDictionaryFromFile(int maxCodeLength)
{
    var wordCollection = new WordCollection();

    if (!File.Exists(_file))
    {
        throw new InvalidOperationException($"Unable to find {_file}");
    }

    // FileMode.Open (rather than OpenOrCreate): this method only reads, so it must never
    // create an empty file if the path vanishes between the existence check and the open.
    using (var fileStream = new FileStream(_file, FileMode.Open, FileAccess.Read))
    using (var reader = new StreamReader(fileStream, Encoding.UTF8))
    {
        // ReadLine returns null once the stream is exhausted, so no separate
        // end-of-stream check is needed.
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            wordCollection.AddWords(line, maxCodeLength);
        }
    }

    return wordCollection;
}
/// <summary>
/// Verifies that <c>Word2Vec.SetSentence</c> reports 16 words and a sentence length of 16
/// for the sample input, filling sentence slots up to index 15 and leaving index 16 empty.
/// </summary>
public void CanGetSentence()
{
    long wordCount = 0;
    const string input = "This is a string. The String to test, the string to prevail.\r\nWhat is the string?";
    var wordCollection = new WordCollection();
    wordCollection.AddWords(input, 11);
    wordCollection.InitWordPositions();

    // The buffer (51 slots) is larger than the input, so nothing should be cut off.
    const int maxSentenceLength = 50;
    var sentence = new long?[maxSentenceLength + 1];
    ulong nextRandom = 1;
    const float thresholdForOccurrenceOfWords = 0;
    long sentenceLength = 0;
    string[] lastLine = null;

    // Dispose the reader (and its underlying stream) even if SetSentence throws.
    using (var reader = new StreamReader(new MemoryStream(Encoding.ASCII.GetBytes(input))))
    {
        wordCount = NLP.Word2Vec.Word2Vec.SetSentence(reader, wordCount, sentence, ref nextRandom, ref sentenceLength, ref lastLine, wordCollection, thresholdForOccurrenceOfWords);
    }

    Assert.Equal(16, wordCount);
    Assert.Equal(16, sentenceLength);
    Assert.NotNull(sentence[15]);
    Assert.Null(sentence[16]);
}
/// <summary>
/// Checks that the TF-IDF of <paramref name="word"/> in document A, computed against the
/// two-document set {A, B}, matches the expected value to 5 decimal places.
/// </summary>
/// <param name="documentA">Text of the document whose TF-IDF value is checked.</param>
/// <param name="documentB">Text of the second document in the set.</param>
/// <param name="word">Word whose TF-IDF is calculated.</param>
/// <param name="expectedTfidfOfWordInA">Expected TF-IDF of <paramref name="word"/> in document A.</param>
public void CorrectlyCalculateTFIDF(string documentA, string documentB, string word, double expectedTfidfOfWordInA)
{
    // Arrange: one initialised collection per document, A first and then B.
    var first = new WordCollection();
    first.AddWords(documentA, 10);
    first.InitWordPositions();

    var second = new WordCollection();
    second.AddWords(documentB, 10);
    second.InitWordPositions();

    var corpus = new List<WordCollection>();
    corpus.Add(first);
    corpus.Add(second);

    // Act + Assert: compare with a precision of 5 decimal places.
    Assert.Equal(expectedTfidfOfWordInA, first.CalculateTFIDF(word, corpus), 5);
}
/// <summary>
/// Verifies that <c>Word2VecTrainer.SetSentence</c> fills a 12-slot sentence buffer completely:
/// the reported sentence length is 12 and the last slot (index 11) is populated.
/// </summary>
public void NotSufferFromOffByOne()
{
    const string input = "This is a string. The String to test, the strings to prevail.\r\nWhat is the string?";
    var wordCollection = new WordCollection();
    wordCollection.AddWords(input, 11);
    wordCollection.InitWordPositions();

    // The input has 16 whitespace-separated tokens, more than the 12 slots in the buffer.
    var sentence = new int?[12];
    var nextRandom = new Random();
    const double thresholdForOccurrenceOfWords = 0;
    var sentenceLength = 0;
    string[] lastLine = null;

    // A using block disposes the reader even if SetSentence throws; a trailing
    // Dispose() call would be skipped in that case.
    using (var reader = new StreamReader(new MemoryStream(Encoding.ASCII.GetBytes(input))))
    {
        Word2VecTrainer.SetSentence(wordCollection, reader, sentence, nextRandom, ref sentenceLength, ref lastLine, thresholdForOccurrenceOfWords);
    }

    Assert.Equal(12, sentenceLength);
    Assert.NotNull(sentence[11]);
}