/// <summary>
/// Fills <c>_table</c> with word indices so that each word occupies a share of the
/// table proportional to its occurrence count raised to the power 0.75, normalised
/// by <c>GetTrainWordsPow</c>.
/// </summary>
/// <remarks>
/// Returns without touching <c>_table</c> when the word collection reports zero
/// unique words.
/// </remarks>
private void InitUnigramTable()
{
    var uniqueWordCount = _wordCollection.GetNumberOfUniqueWords();
    if (uniqueWordCount == 0)
    {
        return;
    }

    const double power = 0.75;
    _table = new int[TableSize];
    var trainWordsPow = _wordCollection.GetTrainWordsPow(power);
    var keys = _wordCollection.GetWords().ToArray();
    var lastIndex = uniqueWordCount - 1;

    // d1 is the cumulative share of the table claimed by words 0..i.
    var i = 0;
    var d1 = Math.Pow(_wordCollection.GetOccurrenceOfWord(keys[0]), power) / trainWordsPow;

    for (var a = 0; a < TableSize; a++)
    {
        _table[a] = i;

        // Advance to the next word once the current position passes the cumulative
        // share. The bound is checked BEFORE indexing keys: rounding can leave d1
        // slightly below 1.0 near the end of the table, and stepping past the last
        // word would read keys[uniqueWordCount] and throw. Remaining slots simply
        // stay assigned to the last word.
        if (i < lastIndex && a / (double)TableSize > d1)
        {
            i++;
            d1 += Math.Pow(_wordCollection.GetOccurrenceOfWord(keys[i]), power) / trainWordsPow;
        }
    }
}
/// <summary>
/// Appends the indices of <paramref name="words"/> to <paramref name="sentence"/>,
/// optionally dropping frequent words at random (subsampling).
/// </summary>
/// <param name="reader">Reader whose <c>EndOfStream</c> state is checked after all words are consumed.</param>
/// <param name="wordCount">Incremented once for every word found in <paramref name="wordCollection"/>, including words later dropped by subsampling.</param>
/// <param name="sentence">Buffer that receives word indices starting at position <paramref name="sentenceLength"/>.</param>
/// <param name="nextRandom">State of the linear congruential generator; advanced once per subsampling decision.</param>
/// <param name="sentenceLength">Number of filled slots in <paramref name="sentence"/>; incremented for every stored word.</param>
/// <param name="words">Words to process, in order.</param>
/// <param name="wordCollection">Lookup for word indices and occurrence counts.</param>
/// <param name="thresholdForOccurrenceOfWords">Subsampling threshold; subsampling is skipped when this is not greater than zero.</param>
/// <returns>
/// <c>true</c> when the sentence buffer is full or the reader is at end of stream;
/// otherwise <c>false</c>.
/// </returns>
private static bool HandleWords(StreamReader reader, ref long wordCount, long?[] sentence, ref ulong nextRandom, ref long sentenceLength, IEnumerable<string> words, WordCollection wordCollection, float thresholdForOccurrenceOfWords)
{
    var totalNumberOfWords = wordCollection.GetTotalNumberOfWords();

    foreach (var word in words)
    {
        var wordIndex = wordCollection[word];
        if (!wordIndex.HasValue)
        {
            // The collection has no index for this word: skip it without counting.
            continue;
        }

        wordCount++;

        // Subsampling of frequent words.
        if (thresholdForOccurrenceOfWords > 0)
        {
            var occurrences = wordCollection.GetOccurrenceOfWord(word);
            var scaledThreshold = thresholdForOccurrenceOfWords * totalNumberOfWords;
            var keepProbability = ((float)Math.Sqrt(occurrences / scaledThreshold) + 1) * scaledThreshold / occurrences;

            nextRandom = LinearCongruentialGenerator(nextRandom);

            // Low 16 bits of the generator state, scaled to [0, 1).
            if (keepProbability < (nextRandom & 0xFFFF) / (float)65536)
            {
                continue;
            }
        }

        sentence[sentenceLength] = wordIndex.Value;
        sentenceLength++;

        // Stop as soon as the buffer is full. The comparison must be ">=": with ">"
        // the next stored word would write to sentence[sentence.Length] and throw
        // before the check could ever succeed.
        if (sentenceLength >= sentence.Length)
        {
            return true;
        }
    }

    return reader.EndOfStream;
}