Пример #1
0
        /// <summary>
        /// Builds <c>_table</c>: an array of <c>TableSize</c> word indices in which each word
        /// occupies a share of slots proportional to its occurrence count raised to the
        /// power 0.75, relative to the total returned by <c>GetTrainWordsPow</c>.
        /// </summary>
        /// <remarks>
        /// Returns without touching <c>_table</c> when the word collection reports zero
        /// unique words. Word indices follow the order of <c>GetWords()</c>.
        /// </remarks>
        private void InitUnigramTable()
        {
            var uniqueWordCount = _wordCollection.GetNumberOfUniqueWords();
            if (uniqueWordCount == 0)
            {
                return;
            }

            const double power = 0.75;

            _table = new int[TableSize];
            var trainWordsPow = _wordCollection.GetTrainWordsPow(power);

            var keys = _wordCollection.GetWords().ToArray();

            // Highest word index that may be stored in the table and used to index keys.
            var lastIndex = Math.Min(uniqueWordCount, keys.Length) - 1;

            var wordIndex = 0;

            // Cumulative share of the table covered by words 0..wordIndex.
            var cumulativeShare = Math.Pow(_wordCollection.GetOccurrenceOfWord(keys.First()), power) / trainWordsPow;

            for (var slot = 0; slot < TableSize; slot++)
            {
                _table[slot] = wordIndex;

                // Advance to the next word once this slot lies past the current word's share.
                // The bound check comes first so keys is never indexed past its end: rounding
                // can leave the cumulative share slightly below 1, in which case the remaining
                // slots simply stay assigned to the last word.
                if (wordIndex < lastIndex && slot / (double)TableSize > cumulativeShare)
                {
                    wordIndex++;
                    cumulativeShare += Math.Pow(_wordCollection.GetOccurrenceOfWord(keys[wordIndex]), power) / trainWordsPow;
                }
            }
        }
Пример #2
0
        /// <summary>
        /// Appends the indices of the given words to <paramref name="sentence"/>, optionally
        /// dropping frequent words at random (subsampling).
        /// </summary>
        /// <param name="reader">Input reader; only its <c>EndOfStream</c> state is checked here.</param>
        /// <param name="wordCount">Incremented once for every word that has an index in
        /// <paramref name="wordCollection"/>, including words later dropped by subsampling.</param>
        /// <param name="sentence">Buffer that receives the word indices.</param>
        /// <param name="nextRandom">State of the linear congruential generator; advanced once per
        /// subsampling decision.</param>
        /// <param name="sentenceLength">Number of slots of <paramref name="sentence"/> already filled;
        /// incremented for every index stored.</param>
        /// <param name="words">Words to process, in order.</param>
        /// <param name="wordCollection">Vocabulary providing word indices and occurrence counts.</param>
        /// <param name="thresholdForOccurrenceOfWords">Subsampling threshold; subsampling is skipped
        /// when this is not greater than zero.</param>
        /// <returns>
        /// <c>true</c> when the sentence buffer has been filled or the reader is at end of stream;
        /// otherwise <c>false</c>.
        /// </returns>
        private static bool HandleWords(StreamReader reader, ref long wordCount, long?[] sentence, ref ulong nextRandom,
                                        ref long sentenceLength, IEnumerable <string> words, WordCollection wordCollection, float thresholdForOccurrenceOfWords)
        {
            var totalNumberOfWords = wordCollection.GetTotalNumberOfWords();

            // Loop-invariant factor of the subsampling formula.
            var scaledThreshold = thresholdForOccurrenceOfWords * totalNumberOfWords;

            foreach (var word in words)
            {
                var wordIndex = wordCollection[word];
                if (!wordIndex.HasValue)
                {
                    // Word has no index in the collection: skip it without counting.
                    continue;
                }
                wordCount++;

                // Subsampling of frequent words
                if (thresholdForOccurrenceOfWords > 0)
                {
                    var occurrence = wordCollection.GetOccurrenceOfWord(word);
                    var keepProbability = ((float)Math.Sqrt(occurrence / scaledThreshold) + 1) * scaledThreshold / occurrence;
                    nextRandom = LinearCongruentialGenerator(nextRandom);

                    // Compare against a draw in [0, 1) taken from the low 16 bits of the generator state.
                    if (keepProbability < (nextRandom & 0xFFFF) / (float)65536)
                    {
                        continue;
                    }
                }

                sentence[sentenceLength] = wordIndex.Value;
                sentenceLength++;

                // Stop as soon as the buffer is full. Using '>' here would let the next word
                // write at index sentence.Length and throw IndexOutOfRangeException.
                if (sentenceLength >= sentence.Length)
                {
                    return true;
                }
            }

            return reader.EndOfStream;
        }