/// <summary>
/// Fills <c>_table</c> with word indices so that each word occupies a share of the
/// table equal to <c>occurrence^0.75 / trainWordsPow</c>.
/// Does nothing (and leaves <c>_table</c> untouched) when the collection has no words.
/// </summary>
/// <remarks>
/// Presumably <c>GetTrainWordsPow(power)</c> returns the sum of <c>occurrence^power</c>
/// over all words, which would make the shares add up to 1 — confirm against
/// <c>WordCollection</c>.
/// </remarks>
private void InitUnigramTable()
{
    var uniqueWords = _wordCollection.GetNumberOfUniqueWords();
    if (uniqueWords == 0)
    {
        return;
    }

    const double power = 0.75;
    _table = new int[TableSize];
    var trainWordsPow = _wordCollection.GetTrainWordsPow(power);
    var keys = _wordCollection.GetWords().ToArray();
    var lastIndex = uniqueWords - 1;

    // i is the word currently being written; d1 is the cumulative share of the
    // table covered by words 0..i.
    var i = 0;
    var d1 = Math.Pow(_wordCollection.GetOccuranceOfWord(keys[0]), power) / trainWordsPow;
    for (var a = 0; a < TableSize; a++)
    {
        _table[a] = i;

        // Advance to the next word once the current position passes the cumulative
        // share. The bound is checked BEFORE indexing keys: rounding can leave d1
        // slightly below 1.0 after the last word, and stepping past the end used to
        // read keys[uniqueWords] and throw. The last word simply fills the remainder.
        if (i < lastIndex && a / (double)TableSize > d1)
        {
            i++;
            d1 += Math.Pow(_wordCollection.GetOccuranceOfWord(keys[i]), power) / trainWordsPow;
        }
    }
}
/// <summary>
/// Writes every word of <paramref name="wordCollection"/> to
/// <paramref name="vocabFileName"/> as UTF-8 text, one
/// <c>word&lt;TAB&gt;occurrence</c> pair per line.
/// </summary>
/// <param name="vocabFileName">Path of the file to write; an existing file is replaced.</param>
/// <param name="wordCollection">Source of the words and their occurrence counts.</param>
public static void SaveWordDictionary(string vocabFileName, WordCollection wordCollection)
{
    // FileMode.Create truncates an existing file. OpenOrCreate would overwrite from the
    // start but keep any old bytes beyond the new content, leaving stale entries at the
    // end of the dictionary when the new vocabulary is shorter than the previous one.
    using (var stream = new FileStream(vocabFileName, FileMode.Create))
    using (var streamWriter = new StreamWriter(stream, Encoding.UTF8))
    {
        foreach (var word in wordCollection.GetWords())
        {
            streamWriter.WriteLine($"{word}\t{wordCollection.GetOccuranceOfWord(word)}");
        }
    }
}
/// <summary>
/// Builds a binary Huffman tree from the word occurrence counts and stores each word's
/// code bits and tree path in the collection via <c>SetPoint</c>, <c>SetCode</c> and
/// <c>SetPoint2</c>. Words with higher counts end up closer to the root and therefore
/// get shorter codes.
/// </summary>
/// <param name="_wordCollection">Vocabulary to read counts from and write codes into.</param>
/// <param name="maxCodeLength">
/// Size of the scratch buffers holding one word's code and path. A word whose path to
/// the root is longer than this overruns the buffers and throws
/// <see cref="IndexOutOfRangeException"/>.
/// </param>
/// <remarks>
/// NOTE(review): the two-cursor merge below only yields a correct Huffman tree if
/// <c>GetWords()</c> returns words ordered by descending occurrence — confirm against
/// <c>WordCollection</c>.
/// </remarks>
public static void Create(WordCollection _wordCollection, int maxCodeLength)
{
    var vocabSize = _wordCollection.GetNumberOfUniqueWords();

    // Scratch buffers reused for every word: code bits and node indices, leaf first.
    var code = new char[maxCodeLength];
    var point = new long[maxCodeLength];

    // Node arrays: indices [0, vocabSize) are leaves (words), indices from vocabSize
    // upwards are internal nodes created during merging.
    var count = new long[vocabSize * 2 + 1];
    var binary = new long[vocabSize * 2 + 1];
    var parentNode = new int[vocabSize * 2 + 1];
    var keys = _wordCollection.GetWords().ToArray();

    for (var a = 0; a < vocabSize; a++)
    {
        count[a] = _wordCollection.GetOccuranceOfWord(keys[a]);
    }

    // Internal nodes start with a huge sentinel count so that a not-yet-created node
    // is never selected as a minimum.
    for (var a = vocabSize; a < vocabSize * 2; a++)
    {
        count[a] = (long)1e15;
    }

    // pos1 walks the leaves from the end towards index 0; pos2 walks the internal
    // nodes upwards in creation order.
    long pos1 = vocabSize - 1;
    long pos2 = vocabSize;
    for (var a = 0; a < vocabSize - 1; a++)
    {
        var min1I = TakeSmallerNode(count, ref pos1, ref pos2);
        var min2I = TakeSmallerNode(count, ref pos1, ref pos2);

        var parent = vocabSize + a;
        count[parent] = count[min1I] + count[min2I];
        parentNode[min1I] = parent;
        parentNode[min2I] = parent;
        binary[min2I] = 1; // second pick is the "1" branch; the first keeps the default 0
    }

    // The last internal node created is the root of the tree.
    var root = vocabSize * 2 - 2;
    for (long a = 0; a < vocabSize; a++)
    {
        // Climb from the leaf to the root, recording the branch bit and node index
        // at every step.
        var b = a;
        long i = 0;
        while (true)
        {
            code[i] = (char)binary[b];
            point[i] = b;
            i++;
            b = parentNode[b];
            if (b == root)
            {
                break;
            }
        }

        _wordCollection.SetPoint(keys, a);
        for (b = 0; b < i; b++)
        {
            _wordCollection.SetCode(keys, a, i, b, code);
            _wordCollection.SetPoint2(keys, a, i, b, point);
        }
    }

    // No explicit GC.Collect() here: the temporary arrays become unreachable on return
    // and the runtime reclaims them on its own schedule; forcing a full blocking
    // collection only adds a pause.
}

/// <summary>
/// Returns the index of the smaller of the two candidate nodes (the leaf at
/// <paramref name="pos1"/> or the internal node at <paramref name="pos2"/>) and
/// advances the cursor that was consumed. Ties and exhausted leaves go to the
/// internal-node cursor.
/// </summary>
private static long TakeSmallerNode(long[] count, ref long pos1, ref long pos2)
{
    if (pos1 >= 0 && count[pos1] < count[pos2])
    {
        return pos1--;
    }

    return pos2++;
}