示例#1
0
        /// <summary>
        /// Compares an imported dictionary file against an existing dictionary and sorts every
        /// imported entry into one of three groups: lines without a part-of-speech tag whose word is
        /// unknown, new words (or new parts of speech for known words), and exact duplicates.
        /// For the duplicates it also reports the ratio between imported and existing frequencies.
        /// </summary>
        /// <param name="NewDicFile">Path of the dictionary file to import.</param>
        /// <param name="Encoding">Text encoding of the import file.</param>
        /// <param name="DicFormat">Format of the import file.</param>
        /// <param name="SourceDict">The existing dictionary to compare against.</param>
        /// <param name="OddLines">Receives the raw lines that carry no part-of-speech tag and whose word is not in the existing dictionary.</param>
        /// <param name="NewWords">Receives new words, or new parts of speech for words that already exist.</param>
        /// <param name="ExistWords">Receives entries whose word and part of speech both already exist.</param>
        /// <param name="MaxFrqRate">Largest imported/existing frequency ratio among the duplicates; stays <c>double.MinValue</c> when there are none.</param>
        /// <param name="MinFrqRate">Smallest imported/existing frequency ratio among the duplicates; stays <c>double.MaxValue</c> when there are none.</param>
        /// <param name="AvgFrqRate">Average imported/existing frequency ratio among the duplicates; 0 when there are none.</param>
        /// <exception cref="FormatException">A frequency column is present but is not a valid integer.</exception>
        public static void FindDifferent(string NewDicFile, Encoding Encoding, DictionaryFormat DicFormat, WordDictionary SourceDict,
            out string[] OddLines, out WordDictionary.DicWordInfo[] NewWords, out WordDictionary.DicWordInfo[] ExistWords,
            out double MaxFrqRate, out double MinFrqRate, out double AvgFrqRate)
        {
            // Sentinels: they are only replaced once at least one duplicate entry is seen.
            MaxFrqRate = double.MinValue;
            MinFrqRate = double.MaxValue;
            decimal SumFrqRate = 0;

            // Mapping from the import format's part-of-speech tags to the tags used by this dictionary.
            Dictionary<string, string> PosTrans = getPosTransformMap(DicFormat);

            // Existing dictionary; lookups below use the lower-cased word as the key.
            Dictionary<string, WordDictionary.DicWordInfo> OldWords = SourceDict.ToWordDictionary();

            // In-memory result buffers.
            List<string> Odds = new List<string>(OldWords.Count / 2);
            List<WordDictionary.DicWordInfo> Exists = new List<WordDictionary.DicWordInfo>(OldWords.Count / 2);
            List<WordDictionary.DicWordInfo> News = new List<WordDictionary.DicWordInfo>(OldWords.Count / 2);

            // Read the import file and, for words that already exist, collect frequency ratios
            // so the caller can estimate a frequency conversion factor.
            foreach (string Line in File.ReadAllLines(NewDicFile, Encoding))
            {
                string Word;
                int Frq;
                string Poses;

                switch (DicFormat)
                {
                    case DictionaryFormat.SogouW2006:
                        // Columns separated by tab or space: word [frequency [pos,pos,...]]
                        string[] s = Line.Split('\t', ' ');
                        Word = s[0];
                        Frq = s.Length < 2 ? -1 : int.Parse(s[1]);
                        // The POS column is the third one; a two-column line simply has no POS.
                        Poses = s.Length < 3 ? null : s[2];
                        break;

                    case DictionaryFormat.ExcelCSV:
                    default:
                        // word,frequency,"pos,pos,..."
                        int p1 = Line.IndexOf(',');
                        if (p1 < 0)
                        {
                            // No separator at all: word only, same treatment as a one-column Sogou line.
                            Word = Line;
                            Frq = -1;
                            Poses = null;
                            break;
                        }

                        Word = Line.Substring(0, p1);
                        int p2 = Line.IndexOf(',', p1 + 1);
                        if (p2 < 0)
                        {
                            // Word and frequency but no POS column.
                            Frq = int.Parse(Line.Substring(p1 + 1));
                            Poses = null;
                            break;
                        }

                        Frq = int.Parse(Line.Substring(p1 + 1, p2 - p1 - 1));
                        Poses = Line.Substring(p2 + 1).Trim('"').Trim();
                        break;
                }

                // Look the word up once per line instead of once per check.
                WordDictionary.DicWordInfo OldInfo;
                bool Known = OldWords.TryGetValue(Word.ToLower(), out OldInfo);

                if (string.IsNullOrEmpty(Poses))
                {
                    // Untagged line: only interesting when the word is unknown.
                    if (!Known) Odds.Add(Line);
                    continue;
                }

                foreach (string InputPos in Poses.TrimEnd(',').Split(','))
                {
                    if (string.IsNullOrEmpty(InputPos)) continue;

                    // Keep the original (lower-cased) tag when the mapping table has no entry for it.
                    string PosKey = InputPos.ToLower();
                    string Pos;
                    if (!PosTrans.TryGetValue(PosKey, out Pos)) Pos = PosKey;

                    if (Known && OldInfo.Pos.Contains(Pos))
                    {
                        // Duplicate word + POS: record the imported/existing frequency ratio.
                        // An existing frequency of 0 is treated as 1 to avoid dividing by zero.
                        int SourceFrq = OldInfo.Frequence;
                        double FrqR = SourceFrq == 0 ? Frq : (double)Frq / SourceFrq;
                        if (FrqR > MaxFrqRate) MaxFrqRate = FrqR;
                        if (FrqR < MinFrqRate) MinFrqRate = FrqR;
                        SumFrqRate += (decimal)FrqR;
                        Exists.Add(new WordDictionary.DicWordInfo(Word, Pos, Frq));
                    }
                    else
                    {
                        // New word, or a new part of speech for a known word.
                        News.Add(new WordDictionary.DicWordInfo(Word, Pos, Frq));
                    }
                }
            }

            // Average frequency conversion factor over all duplicates.
            AvgFrqRate = Exists.Count > 0 ? Convert.ToDouble(SumFrqRate / Exists.Count) : 0;

            OddLines = Odds.ToArray();
            NewWords = News.ToArray();
            ExistWords = Exists.ToArray();
        }