Example #1
0
        /// <summary>
        /// Imports an external dictionary: the words reported as new by <c>FindDifferent</c> are added
        /// to the source dictionary and the result is saved to the destination file. Each imported
        /// frequency is divided by a scaling ratio (by default the average frequency ratio of the
        /// overlapping words) before it is stored.
        /// </summary>
        /// <param name="ImportDicFile">File name of the external dictionary.</param>
        /// <param name="ImportEncoding">Text encoding of the external dictionary file.</param>
        /// <param name="SourceDicFile">File name of the source dct file.</param>
        /// <param name="DestDicFile">File name of the destination dct file.</param>
        /// <param name="DicFormat">Format of the external dictionary.</param>
        /// <param name="OddLines">Entries of the imported dictionary that are invalid and not present in the source dictionary.</param>
        /// <param name="AvgFrqRate">Average frequency ratio of the import file, as reported by <c>FindDifferent</c>.</param>
        /// <param name="ImportFrqRate">Fixed frequency ratio for the import file: frequencies are divided by this number before being stored. When less than or equal to 0, <paramref name="AvgFrqRate"/> is used instead.</param>
        /// <returns>The number of imported entries.</returns>
        /// <exception cref="Exception">The source dictionary file could not be loaded.</exception>
        /// <exception cref="OverflowException">There are words to import but the effective divisor is zero or NaN, or a scaled frequency does not fit in an <see cref="int"/>.</exception>
        public static int ImportDictionary(string ImportDicFile, Encoding ImportEncoding, string SourceDicFile, string DestDicFile, DictionaryFormat DicFormat, out string[] OddLines, out double AvgFrqRate, double ImportFrqRate = 0)
        {
            // Compare the import file with the source dictionary. Only NewWords, OddLines and
            // AvgFrqRate are used here; the other locals exist to satisfy the out-parameter list.
            double MaxFrqRate, MinFrqRate;
            WordDictionary.DicWordInfo[] NewWords;
            WordDictionary.DicWordInfo[] ExistWords;
            FindDifferent(ImportDicFile, ImportEncoding, DicFormat, SourceDicFile, out OddLines, out NewWords, out ExistWords, out MaxFrqRate, out MinFrqRate, out AvgFrqRate);

            // The divisor does not depend on the word, so pick it once instead of per iteration.
            double FrqDivisor = ImportFrqRate <= 0 ? AvgFrqRate : ImportFrqRate;

            // Load the dictionary that the new words are merged into.
            WordDictionary dict = new WordDictionary();
            if (!dict.Load(SourceDicFile))
                throw new Exception("load source dic file fail");

            try
            {
                // A zero or NaN divisor turns every quotient into Infinity/NaN, which
                // Convert.ToInt32 rejects with an OverflowException on the first word.
                // Fail with the same exception type but an actionable message.
                // NOTE(review): whether FindDifferent can report 0/NaN (e.g. no overlapping words) is not visible here - confirm.
                if (NewWords.Length > 0 && (FrqDivisor == 0 || double.IsNaN(FrqDivisor)))
                    throw new OverflowException("frequency divisor is zero or NaN; pass a positive ImportFrqRate");

                // Add the new words with their scaled frequency.
                foreach (WordDictionary.DicWordInfo Word in NewWords)
                {
                    int Frq = Convert.ToInt32(Word.Frequence / FrqDivisor);
                    dict.AddWord(Word.Word, Word.Pos, Frq);
                }

                // Save the merged dictionary.
                dict.Save(DestDicFile);
            }
            finally
            {
                // Release the loaded dictionary even when adding or saving fails.
                dict.ReleaseDict();
            }

            return NewWords.Length;
        }
Example #2
0
 /// <summary>
 /// Calls <c>ReleaseDict</c> on <c>m_dict</c> and then <c>ReleaseSpan</c> on <c>m_roleTag</c>.
 /// NOTE(review): presumably this frees the data held for unknown-word handling; the fields and
 /// the two release methods are declared outside this excerpt - confirm.
 /// </summary>
 public void ReleaseUnknowWord()
 {
     m_dict.ReleaseDict();
     m_roleTag.ReleaseSpan();
 }
Example #3
0
 /// <summary>
 /// Finds the differences between an imported dictionary and the existing dictionary.
 /// Loads the existing dictionary from file, delegates the comparison to the overload that
 /// takes a loaded <c>WordDictionary</c>, and releases the dictionary afterwards.
 /// </summary>
 /// <param name="NewDicFile">File of the imported dictionary.</param>
 /// <param name="Encoding">Text encoding of the imported dictionary file.</param>
 /// <param name="DicFormat">Format of the imported dictionary file.</param>
 /// <param name="SourceDictFileName">File of the existing (source) dictionary.</param>
 /// <param name="OddLines">Outputs the lines that carry no part-of-speech tag and whose word is not in the existing dictionary either.</param>
 /// <param name="NewWords">Outputs new words, or new parts of speech for existing words.</param>
 /// <param name="ExistWords">Outputs duplicate words whose part of speech is also the same.</param>
 /// <param name="MaxFrqRate">Maximum frequency ratio among the duplicate words.</param>
 /// <param name="MinFrqRate">Minimum frequency ratio among the duplicate words.</param>
 /// <param name="AvgFrqRate">Average frequency ratio among the duplicate words.</param>
 /// <exception cref="Exception">The source dictionary file could not be loaded.</exception>
 public static void FindDifferent(string NewDicFile, Encoding Encoding, DictionaryFormat DicFormat, string SourceDictFileName,
     out string[] OddLines, out WordDictionary.DicWordInfo[] NewWords, out WordDictionary.DicWordInfo[] ExistWords,
     out double MaxFrqRate, out double MinFrqRate, out double AvgFrqRate)
 {
     WordDictionary SourceDict = new WordDictionary();
     if (!SourceDict.Load(SourceDictFileName))
         throw new Exception("load source dic file fail");

     try
     {
         FindDifferent(NewDicFile, Encoding, DicFormat, SourceDict, out OddLines, out NewWords, out ExistWords, out MaxFrqRate, out MinFrqRate, out AvgFrqRate);
     }
     finally
     {
         // Release the loaded dictionary even when the comparison throws.
         SourceDict.ReleaseDict();
     }
 }