/// <summary>
/// Loads a lexicon from a previously analysed and saved lexicon file.
/// </summary>
/// <param name="existLexiconFile">
/// Path of the saved lexicon file. Each usable line holds exactly two
/// whitespace-separated fields: the word and its integer weight.
/// </param>
/// <param name="encode">Encoding scheme applied to the vocabulary once it is loaded.</param>
/// <returns>The lexicon populated from the file.</returns>
/// <exception cref="System.FormatException">The weight field of a two-field line is not an integer.</exception>
/// <exception cref="System.OverflowException">The weight field does not fit in an <see cref="int"/>.</exception>
public static Lexicon FromExistLexiconFile(string existLexiconFile, EncodeScheme encode = EncodeScheme.Halfman)
{
    Lexicon lexicon = new Lexicon();

    // One or more whitespace characters separate the word from its weight,
    // so "word<TAB>5" and "word   5" are both accepted.
    Regex separator = new Regex("\\s+");

    using (StreamReader sr = new StreamReader(existLexiconFile))
    {
        string line;
        // Read until the real end of the stream: a blank line in the middle
        // of the file must not cut the load short.
        while ((line = sr.ReadLine()) != null)
        {
            // Trimming prevents leading/trailing whitespace from producing
            // empty fields in the split below.
            string entry = line.Trim();
            if (entry.Length == 0)
            {
                continue;
            }

            string[] vals = separator.Split(entry);
            if (vals.Length != 2)
            {
                // Not a "word weight" pair; ignore the line.
                continue;
            }

            // Register the word first, then store its weight at the returned index.
            var index = lexicon.AddVocabulary(vals[0]);
            lexicon._voca_array[index].Weight = int.Parse(vals[1]);
        }
        // NOTE(review): the vocabulary is not re-sorted here; presumably the
        // saved file is already in the desired order — confirm.
    }

    if (encode == EncodeScheme.Halfman)
    {
        // Apply the "Halfman" (presumably Huffman) encoding.
        lexicon.UpdateHalfmanCode();
    }
    else if (encode == EncodeScheme.Onehot)
    {
        // Apply the one-hot encoding.
        lexicon.UpdateOnehotCode();
    }

    return lexicon;
}
/// <summary>
/// Builds a lexicon by analysing the words and sentences of a raw text file.
/// </summary>
/// <param name="vocabularyFile">Path of the raw text file to analyse.</param>
/// <param name="encode">Encoding scheme applied to the vocabulary once it is built.</param>
/// <returns>The lexicon built from the file.</returns>
public static Lexicon FromVocabularyFile(string vocabularyFile, EncodeScheme encode = EncodeScheme.Halfman)
{
    Lexicon lexicon = new Lexicon();

    // Read the text and build the vocabulary.
    using (StreamReader sr = new StreamReader(vocabularyFile))
    {
        string raw;
        while ((raw = sr.ReadLine()) != null)
        {
            string line = raw.ClearPunctuation();
            // IsNullOrWhiteSpace already covers the null and empty cases.
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            // Segment the line into words.
            string[] words = lexicon.Sgement(line);
            foreach (string word in words)
            {
                if (string.IsNullOrWhiteSpace(word))
                {
                    continue;
                }

                lexicon._train_word_count++;

                int found = lexicon.SearchVocabulary(word);
                if (found == -1)
                {
                    // Capture the index before touching _voca_array. In C# the
                    // array expression of `arr[f()]` is evaluated before f(),
                    // so indexing with the call inline would write to a stale
                    // array if AddVocabulary ever replaces the backing array.
                    var added = lexicon.AddVocabulary(word);
                    lexicon._voca_array[added].Weight = 1;
                }
                else
                {
                    lexicon._voca_array[found].Weight++;
                }

                // Shrink the vocabulary once it exceeds 70% of the hash size.
                if (lexicon.VocaSize > _voca_hash_size * 0.7)
                {
                    lexicon.ReduceVocabulary();
                }
            }
        }

        // Per the original note: drop words below the minimum frequency, then sort.
        lexicon.SortVocabulary();
    }

    if (encode == EncodeScheme.Halfman)
    {
        // Apply the "Halfman" (presumably Huffman) encoding.
        lexicon.UpdateHalfmanCode();
    }
    else if (encode == EncodeScheme.Onehot)
    {
        // Apply the one-hot encoding.
        lexicon.UpdateOnehotCode();
    }

    return lexicon;
}