/// <summary>
/// Compares a dictionary file being imported against an existing dictionary and
/// classifies every entry of the imported file as odd, new, or already existing.
/// </summary>
/// <remarks>
/// Each line of the file is parsed according to <paramref name="DicFormat"/>:
/// <c>SogouW2006</c> lines are split on tab/space into word, frequency and
/// part-of-speech list; every other format is parsed as <c>word,frequency,poses</c>.
/// Words are looked up in the existing dictionary by their lower-cased form.
/// A Sogou line that holds only a word gets the frequency -1.
/// </remarks>
/// <param name="NewDicFile">Path of the dictionary file to import.</param>
/// <param name="Encoding">Text encoding used to read <paramref name="NewDicFile"/>.</param>
/// <param name="DicFormat">Format of the imported file; selects the line parser and the part-of-speech mapping.</param>
/// <param name="SourceDict">The existing dictionary to compare against.</param>
/// <param name="OddLines">Receives the raw lines that carry no part of speech and whose word is not in the existing dictionary.</param>
/// <param name="NewWords">Receives entries that are new words, or new parts of speech of existing words.</param>
/// <param name="ExistWords">Receives entries whose word and part of speech are both already present.</param>
/// <param name="MaxFrqRate">Largest imported/existing frequency ratio among the existing entries; stays <c>double.MinValue</c> when there are none.</param>
/// <param name="MinFrqRate">Smallest imported/existing frequency ratio among the existing entries; stays <c>double.MaxValue</c> when there are none.</param>
/// <param name="AvgFrqRate">Average imported/existing frequency ratio among the existing entries; 0 when there are none.</param>
public static void FindDifferent(string NewDicFile, Encoding Encoding, DictionaryFormat DicFormat, WordDictionary SourceDict,
    out string[] OddLines, out WordDictionary.DicWordInfo[] NewWords, out WordDictionary.DicWordInfo[] ExistWords,
    out double MaxFrqRate, out double MinFrqRate, out double AvgFrqRate)
{
    // Initialise the running statistics.
    MaxFrqRate = double.MinValue;
    MinFrqRate = double.MaxValue;
    decimal sumFrqRate = 0;

    // Mapping from the imported format's part-of-speech tags to the tags used here.
    Dictionary<string, string> posTrans = getPosTransformMap(DicFormat);

    // Snapshot of the existing dictionary, looked up below by lower-cased word.
    Dictionary<string, WordDictionary.DicWordInfo> oldWords = SourceDict.ToWordDictionary();

    // Result buffers, pre-sized from the existing dictionary's size.
    List<string> odds = new List<string>(oldWords.Count / 2);
    List<WordDictionary.DicWordInfo> exists = new List<WordDictionary.DicWordInfo>(oldWords.Count / 2);
    List<WordDictionary.DicWordInfo> news = new List<WordDictionary.DicWordInfo>(oldWords.Count / 2);

    // Walk the imported file; for words already present, collect the frequency
    // ratio so the caller can estimate a frequency conversion factor.
    foreach (string line in File.ReadAllLines(NewDicFile, Encoding))
    {
        string word;
        int frq;
        string poses;

        switch (DicFormat)
        {
            case DictionaryFormat.SogouW2006:
                string[] fields = line.Split('\t', ' ');
                word = fields[0];
                frq = fields.Length == 1 ? -1 : int.Parse(fields[1]);
                // The part-of-speech list is the THIRD field. The guard must be
                // "< 3": with "< 2" a two-field line (word + frequency) indexed
                // fields[2] and threw IndexOutOfRangeException.
                poses = fields.Length < 3 ? null : fields[2];
                break;

            case DictionaryFormat.ExcelCSV:
            default:
                int firstComma = line.IndexOf(',');
                int secondComma = line.IndexOf(',', firstComma + 1);
                word = line.Substring(0, firstComma);
                frq = int.Parse(line.Substring(firstComma + 1, secondComma - firstComma - 1));
                // Everything after the second comma is the (possibly quoted) POS list.
                poses = line.Substring(secondComma + 1).Trim('"').Trim();
                break;
        }

        string key = word.ToLower();

        // No part of speech: report the line as odd unless the word is already known.
        if (string.IsNullOrEmpty(poses))
        {
            if (!oldWords.ContainsKey(key))
            {
                odds.Add(line);
            }
            continue;
        }

        // Single lookup of the existing entry, reused for every POS on this line.
        WordDictionary.DicWordInfo existing;
        bool wordKnown = oldWords.TryGetValue(key, out existing);

        foreach (string inputPos in poses.TrimEnd(',').Split(','))
        {
            if (string.IsNullOrEmpty(inputPos))
            {
                continue;
            }

            // Translate the tag; keep the original (lower-cased) tag when unmapped.
            string lowerPos = inputPos.ToLower();
            string pos;
            if (!posTrans.TryGetValue(lowerPos, out pos))
            {
                pos = lowerPos;
            }

            if (wordKnown && existing.Pos.Contains(pos))
            {
                // Word and part of speech already exist: record the frequency ratio.
                // When the existing frequency is 0 the imported frequency itself is
                // used, which avoids a division by zero.
                int sourceFrq = existing.Frequence;
                double frqRate = sourceFrq == 0 ? frq : (double)frq / sourceFrq;

                if (frqRate > MaxFrqRate)
                {
                    MaxFrqRate = frqRate;
                }
                if (frqRate < MinFrqRate)
                {
                    MinFrqRate = frqRate;
                }
                sumFrqRate += (decimal)frqRate;

                exists.Add(new WordDictionary.DicWordInfo(word, pos, frq));
            }
            else
            {
                // New word, or a new part of speech for an existing word.
                news.Add(new WordDictionary.DicWordInfo(word, pos, frq));
            }
        }
    }

    // Average frequency conversion factor over all existing entries.
    AvgFrqRate = exists.Count > 0 ? Convert.ToDouble(sumFrqRate / exists.Count) : 0;

    OddLines = odds.ToArray();
    NewWords = news.ToArray();
    ExistWords = exists.ToArray();
}