/// <summary> /// 导入外部词库,词频按照重合词频比例平均值 /// </summary> /// <param name="ImportDicFile">外部词库文件名</param> /// <param name="ImportEncoding">外部词库文件编码</param> /// <param name="SourceDicFile">源dct文件名</param> /// <param name="DestDicFile">目标dct文件名</param> /// <param name="DicFormat">外部词库类型</param> /// <param name="OddLines">导入的库中无效且不在源库中的数据</param> /// <param name="ImportFrqRate">设置固定的导入文件频度比例(除以此数字后入库,小于等于0则按照AvgFrqRate入库)</param> /// <param name="AvgFrqRate">导入文件的平均频度比例</param> /// <returns>导入的条数</returns> public static int ImportDictionary(string ImportDicFile, Encoding ImportEncoding, string SourceDicFile, string DestDicFile, DictionaryFormat DicFormat, out string[] OddLines, out double AvgFrqRate, double ImportFrqRate = 0) { //初始化 double MaxFrqRate, MinFrqRate; WordDictionary.DicWordInfo[] NewWords; WordDictionary.DicWordInfo[] ExistWords; FindDifferent(ImportDicFile, ImportEncoding, DicFormat, SourceDicFile, out OddLines, out NewWords, out ExistWords, out MaxFrqRate, out MinFrqRate, out AvgFrqRate); //加载词库 WordDictionary dict = new WordDictionary(); if (!dict.Load(SourceDicFile)) throw new Exception("load source dic file fail"); //加入新词 foreach (WordDictionary.DicWordInfo Word in NewWords) { int Frq = Convert.ToInt32(ImportFrqRate <= 0 ? Word.Frequence / AvgFrqRate : Word.Frequence / ImportFrqRate); dict.AddWord(Word.Word, Word.Pos, Frq); } //保存 dict.Save(DestDicFile); dict.ReleaseDict(); return NewWords.Length; }
public void ReleaseUnknowWord() { m_dict.ReleaseDict(); m_roleTag.ReleaseSpan(); }
/// <summary> /// 找到导入库和现有库的不同 /// </summary> /// <param name="NewDicFile">导入库文件</param> /// <param name="Encoding">导入库文件编码</param> /// <param name="DicFormat">导入库文件格式</param> /// <param name="SourceDictFileName">原库文件</param> /// <param name="OddLines">输出没有词性标注且现有库中也没有的词行</param> /// <param name="NewWords">输出新词或现有词的新词性</param> /// <param name="ExistWords">输出重复词,且词性也相同</param> /// <param name="MaxFrqRate">重复词的最大词频比例</param> /// <param name="MinFrqRate">重复词的最小词频比例</param> /// <param name="AvgFrqRate">重复词的平均词频比例</param> public static void FindDifferent(string NewDicFile, Encoding Encoding, DictionaryFormat DicFormat, string SourceDictFileName, out string[] OddLines, out WordDictionary.DicWordInfo[] NewWords, out WordDictionary.DicWordInfo[] ExistWords, out double MaxFrqRate, out double MinFrqRate, out double AvgFrqRate) { WordDictionary SourceDict = new WordDictionary(); if (!SourceDict.Load(SourceDictFileName)) throw new Exception("load source dic file fail"); FindDifferent(NewDicFile, Encoding, DicFormat, SourceDict, out OddLines, out NewWords, out ExistWords, out MaxFrqRate, out MinFrqRate, out AvgFrqRate); SourceDict.ReleaseDict(); }