예제 #1
0
        /// <summary>
        /// Allocates <c>_table</c> with <c>TableSize</c> slots and fills it with word indices.
        /// </summary>
        /// <remarks>
        /// Slot <c>a</c> receives the current word index; the index advances to the next word
        /// once the fraction <c>a / TableSize</c> exceeds the running sum of
        /// <c>pow(occurrence, 0.75) / trainWordsPow</c> over the words consumed so far.
        /// Returns without touching <c>_table</c> when the collection reports zero unique words.
        /// NOTE(review): assumes <c>GetWords()</c> yields exactly
        /// <c>GetNumberOfUniqueWords()</c> entries -- confirm against WordCollection.
        /// </remarks>
        private void InitUnigramTable()
        {
            var wordCount = _wordCollection.GetNumberOfUniqueWords();
            if (wordCount == 0)
            {
                return;
            }

            const double power = 0.75;

            _table = new int[TableSize];
            var trainWordsPow = _wordCollection.GetTrainWordsPow(power);

            var keys      = _wordCollection.GetWords().ToArray();
            var lastIndex = wordCount - 1;

            var i  = 0;
            var d1 = Math.Pow(_wordCollection.GetOccuranceOfWord(keys[0]), power) / trainWordsPow;

            for (int a = 0; a < TableSize; a++)
            {
                _table[a] = i;

                // Only advance while another word exists. Floating-point rounding can leave
                // the cumulative share slightly below the last slot fractions; advancing
                // unconditionally would then index keys one past its end.
                if (i < lastIndex && a / (double)TableSize > d1)
                {
                    i++;
                    d1 += Math.Pow(_wordCollection.GetOccuranceOfWord(keys[i]), power) / trainWordsPow;
                }
            }
        }
예제 #2
0
 /// <summary>
 /// Writes every word of <paramref name="wordCollection"/> to a UTF-8 text file,
 /// one line per word in the form <c>word&lt;TAB&gt;occurrence</c>.
 /// </summary>
 /// <param name="vocabFileName">Path of the file to write; any existing content is replaced.</param>
 /// <param name="wordCollection">Source of the words and their occurrence values.</param>
 public static void SaveWordDictionary(string vocabFileName, WordCollection wordCollection)
 {
     // FileMode.Create truncates an existing file. OpenOrCreate would keep the old bytes,
     // leaving stale trailing lines whenever the new dictionary is shorter than the old one.
     using (var stream = new FileStream(vocabFileName, FileMode.Create))
     using (var streamWriter = new StreamWriter(stream, Encoding.UTF8))
     {
         foreach (var word in wordCollection.GetWords())
         {
             streamWriter.WriteLine($"{word}\t{wordCollection.GetOccuranceOfWord(word)}");
         }
     }
 }
예제 #3
0
        /// <summary>
        /// Builds a binary Huffman tree from the word occurrence counts and passes each
        /// word's branch bits and node path to the collection.
        /// </summary>
        /// <remarks>
        /// Leaves are consumed from the last word towards the first, so the merge only
        /// produces a Huffman tree when <c>GetWords()</c> is ordered by descending
        /// occurrence -- presumably guaranteed by the caller; confirm.
        /// The bits and path are recorded from the leaf upwards; how <c>SetPoint</c>,
        /// <c>SetCode</c> and <c>SetPoint2</c> store them is defined by WordCollection.
        /// </remarks>
        /// <param name="_wordCollection">Collection supplying the words and their counts and receiving the codes.</param>
        /// <param name="maxCodeLength">
        /// Capacity of the per-word bit and path buffers. A word whose path to the root is
        /// longer than this makes the buffer indexing throw <see cref="IndexOutOfRangeException"/>.
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="_wordCollection"/> is null.</exception>
        public static void Create(WordCollection _wordCollection, int maxCodeLength)
        {
            if (_wordCollection == null)
            {
                throw new ArgumentNullException(nameof(_wordCollection));
            }

            var vocabSize  = _wordCollection.GetNumberOfUniqueWords();
            var code       = new char[maxCodeLength];
            var point      = new long[maxCodeLength];
            var count      = new long[vocabSize * 2 + 1];
            var binary     = new long[vocabSize * 2 + 1];
            var parentNode = new int[vocabSize * 2 + 1];
            var keys       = _wordCollection.GetWords().ToArray();

            // Leaves occupy [0, vocabSize): one slot per word, holding its occurrence count.
            for (var a = 0; a < vocabSize; a++)
            {
                count[a] = _wordCollection.GetOccuranceOfWord(keys[a]);
            }

            // Internal nodes occupy [vocabSize, 2 * vocabSize). They start at a huge sentinel
            // so a node that has not been created yet never wins a comparison.
            for (var a = vocabSize; a < vocabSize * 2; a++)
            {
                count[a] = (long)1e15;
            }

            long pos1 = vocabSize - 1; // next unused leaf, moving towards index 0
            long pos2 = vocabSize;     // next unused internal node, moving upwards

            // Each iteration merges the two smallest remaining nodes into a new internal node.
            for (var a = 0; a < vocabSize - 1; a++)
            {
                var min1I = TakeSmallestNode(count, ref pos1, ref pos2);
                var min2I = TakeSmallestNode(count, ref pos1, ref pos2);

                count[vocabSize + a] = count[min1I] + count[min2I];
                parentNode[min1I]    = vocabSize + a;
                parentNode[min2I]    = vocabSize + a;
                binary[min2I]        = 1; // second pick takes branch 1; first keeps the default 0
            }

            // The last internal node created above is the root.
            var rootIndex = vocabSize * 2 - 2;

            for (long a = 0; a < vocabSize; a++)
            {
                // Walk from the leaf up to the root, recording the branch bit and node index.
                var  b = a;
                long i = 0;
                while (true)
                {
                    code[i]  = (char)binary[b];
                    point[i] = b;
                    i++;
                    b = parentNode[b];
                    if (b == rootIndex)
                    {
                        break;
                    }
                }

                _wordCollection.SetPoint(keys, a);
                for (b = 0; b < i; b++)
                {
                    _wordCollection.SetCode(keys, a, i, b, code);
                    _wordCollection.SetPoint2(keys, a, i, b, point);
                }
            }

            // No forced GC.Collect() here: the temporary arrays become unreachable on return
            // and the runtime reclaims them on its own schedule.
        }

        /// <summary>
        /// Selects whichever of the next unused leaf (<paramref name="pos1"/>) and the next
        /// unused internal node (<paramref name="pos2"/>) has the smaller count, advances
        /// that cursor, and returns the selected index. Ties go to the internal node.
        /// </summary>
        private static long TakeSmallestNode(long[] count, ref long pos1, ref long pos2)
        {
            long selected;
            if (pos1 >= 0 && count[pos1] < count[pos2])
            {
                selected = pos1;
                pos1--;
            }
            else
            {
                selected = pos2;
                pos2++;
            }
            return selected;
        }