/// <summary>
/// Reads words from a vocabulary file and rebuilds the vocabulary table from them.
/// </summary>
/// <remarks>
/// Each non-empty line must contain a word and its count separated by a tab.
/// The vocabulary file is expected to contain no duplicate words, so no
/// duplicate handling is performed beyond reporting a failed insertion.
/// A non-empty line without a tab raises <see cref="IndexOutOfRangeException"/>,
/// and a count that is not a valid integer raises <see cref="FormatException"/>.
/// </remarks>
/// <param name="filename">Path of the vocabulary file to read.</param>
void ReadVocab(string filename)
{
    vocab.Clear();

    // The using block guarantees the file handle is released even when a
    // malformed line makes the parsing below throw.
    using (StreamReader file = new StreamReader(filename))
    {
        string line;
        while ((line = file.ReadLine()) != null)
        {
            // Tolerate empty lines (e.g. a trailing blank line) instead of
            // failing on the missing count field.
            if (line.Length == 0)
            {
                continue;
            }

            string[] fields = line.Split('\t');

            vocab_word word1 = new vocab_word();
            word1.word = fields[0];
            word1.cn = Convert.ToInt64(fields[1]);

            if (!vocab.TryAdd(fields[0], word1))
            {
                Console.WriteLine("添加单词失败");
            }
        }
    }
}
/// <summary>
/// Records one occurrence of a word in <c>word_information</c>.
/// </summary>
/// <remarks>
/// Words longer than <c>MAX_STRING</c> characters are truncated to that length
/// before the lookup. A word seen for the first time is inserted with a count
/// of 1; a word that is already present has its count incremented by 1.
/// </remarks>
/// <param name="word">The word to add or whose count should be increased.</param>
void AddWordToVocab(string word)
{
    if (word.Length > MAX_STRING)
    {
        word = word.Substring(0, MAX_STRING);
    }

    // A single TryGetValue replaces the ContainsKey + indexer double lookup.
    vocab_word existing;
    if (word_information.TryGetValue(word, out existing))
    {
        existing.cn += 1;
        return;
    }

    vocab_word one_vocab_word = new vocab_word();
    one_vocab_word.cn = 1;
    one_vocab_word.word = word;

    if (!word_information.TryAdd(word, one_vocab_word))
    {
        Console.WriteLine("插入单词失败");
    }
}
/// <summary>
/// Computes the difference between the counts of two vocabulary entries.
/// </summary>
/// <param name="a">First vocabulary entry.</param>
/// <param name="b">Second vocabulary entry.</param>
/// <returns>
/// <c>b.cn - a.cn</c>: positive when <paramref name="b"/> has the larger count,
/// negative when <paramref name="a"/> does, and zero when the counts are equal.
/// </returns>
Int64 VocabCompare(vocab_word a, vocab_word b)
{
    Int64 countDifference = b.cn - a.cn;
    return countDifference;
}