Example #1
0
        /// <summary>
        /// Builds a text report of the keyword dictionary, ordered by keyword weight.
        /// </summary>
        /// <param name="objMemoryItemColl">The keyword dictionary to report on.</param>
        /// <param name="nKeyWordTopCount">Maximum number of entries listed in the report.</param>
        /// <param name="bOrderbyDesc">true to list the highest weights first; false to list the lowest first.</param>
        /// <param name="bIsOnlyWord">
        /// true to list only keys that are longer than one character, contain no punctuation or
        /// whitespace, and are not made up of digits only; false to list every entry.
        /// </param>
        /// <returns>The formatted report: a summary line, a header row, the match count and one row per listed entry.</returns>
        public static string ShowKeyWordWeightColl(MemoryItemColl <string> objMemoryItemColl, int nKeyWordTopCount, bool bOrderbyDesc = true, bool bIsOnlyWord = true)
        {
            const string rowFormat = " 【{0}】 | {1} | {2} | {3} | {4}";

            // Mean of ValidDegree / ValidCount over all entries.
            // An empty dictionary is reported as 0% instead of dividing by a zero count.
            bool   bIsEmpty          = objMemoryItemColl.Count == 0;
            double dTotalVaildDegree = bIsEmpty ? 0 : objMemoryItemColl.Sum(x => x.ValidDegree / x.ValidCount) / objMemoryItemColl.Count;

            StringBuilder sb = new StringBuilder();

            sb.AppendLine(String.Format("词库成熟度:{0}% ;", (bIsEmpty || dTotalVaildDegree > 1) ? 0 : Math.Round((1 - dTotalVaildDegree) * 100, 2)));
            sb.AppendLine("----------------------------------------");
            sb.AppendLine(String.Format(rowFormat, "主词", "遗忘词频", "累计词频", "词权值", "成熟度(%)"));

            // Weight used both for ordering and for the "词权值" column:
            // ValidCount * (ln(MinuteOffsetSize) - ln(ValidCount)), or 0 when ValidCount is not positive.
            double dLogOffsetSize = Math.Log(objMemoryItemColl.MinuteOffsetSize);
            Func <MemoryItemMDL <string>, double> calcWeight =
                x => x.ValidCount <= 0 ? 0 : x.ValidCount * (dLogOffsetSize - Math.Log(x.ValidCount));

            // Filter once and materialize, so the regex checks and the sort are not repeated
            // for the count line and again for the listing.
            var candidates = objMemoryItemColl
                             .Where(x => !bIsOnlyWord ||
                                    (x.Key.Length > 1 && !Regex.IsMatch(x.Key, @"[\p{P}\s]") && !Regex.IsMatch(x.Key, @"^\d+$")))
                             .ToArray();

            IOrderedEnumerable <MemoryItemMDL <string> > ordered = bOrderbyDesc
                                                                   ? candidates.OrderByDescending(calcWeight)
                                                                   : candidates.OrderBy(calcWeight);

            sb.AppendLine(String.Format(" =========== 共{0} 个 ============= ", candidates.Length));

            // One row per entry, limited to the requested number of entries.
            foreach (var x in ordered.Take(nKeyWordTopCount))
            {
                sb.AppendLine(String.Format(rowFormat,
                                            x.Key,
                                            Math.Round(DictionaryDAL.CalcRemeberValue <string>(x.Key, objMemoryItemColl), 2),
                                            x.TotalCount,
                                            Math.Round(calcWeight(x), 4),
                                            // Maturity is 0 when ValidCount <= 1 or the ratio exceeds 1.
                                            x.ValidCount <= 1 ? 0 : (x.ValidDegree / x.ValidCount > 1 ? 0 : Math.Round((1 - x.ValidDegree / x.ValidCount) * 100, 2))));
            }
            return(sb.ToString());
        }
Example #2
0
        /// <summary>
        /// Click handler that deserializes CharBond.coll and KeyWord.coll from the "dict"
        /// folder under the workspace path into objCharBondColl and objKeyWordColl,
        /// reporting each step through AppendText.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void btnLoadDictionary_Click(object sender, EventArgs e)
        {
            // Create the "dict" folder under the workspace when it is not there yet.
            string dictDir       = Path.GetFullPath(String.Format(@"{0}\dict", CachePathDAL.GetWorkSpacePath()));
            bool   dictDirExists = Directory.Exists(dictDir);

            if (!dictDirExists)
            {
                Directory.CreateDirectory(dictDir);
            }

            // Character-bond collection.
            string charBondPath   = Path.GetFullPath(String.Format(@"{0}\dict\{1}", CachePathDAL.GetWorkSpacePath(), "CharBond.coll"));
            string charBondNotice = String.Format("请稍候,正在加载文件:{0}", charBondPath);

            AppendText(charBondNotice);
            objCharBondColl = SerialLib.DeserializeBinary <MemoryBondColl <string> >(charBondPath);

            // Keyword collection.
            string keyWordPath   = Path.GetFullPath(String.Format(@"{0}\dict\{1}", CachePathDAL.GetWorkSpacePath(), "KeyWord.coll"));
            string keyWordNotice = String.Format("请稍候,正在加载文件:{0}", keyWordPath);

            AppendText(keyWordNotice);
            objKeyWordColl = SerialLib.DeserializeBinary <MemoryItemColl <string> >(keyWordPath);

            AppendText("字典集加载完毕!");
        }
Example #3
0
        /// <summary>
        /// Scans a line of text character by character, optionally updating the adjacent-character
        /// dictionary and extracting candidate words into the keyword dictionary.
        /// </summary>
        /// <remarks>
        /// A candidate word is a maximal run of characters in which every adjacent pair is reported
        /// as valid by <c>DictionaryDAL.IsBondValid</c>. The run still buffered when the text ends is
        /// committed as well, so the last word of the line is not lost.
        /// </remarks>
        /// <param name="text">The line of text to scan; null or empty text is ignored.</param>
        /// <param name="objCharBondColl">Adjacent-character dictionary.</param>
        /// <param name="objKeyWordColl">Keyword dictionary that receives the candidate words.</param>
        /// <param name="bUpdateCharBondColl">Whether each adjacent pair is recorded in <paramref name="objCharBondColl"/>.</param>
        /// <param name="bUpdateKeyWordColl">Whether candidate words are recorded in <paramref name="objKeyWordColl"/>.</param>
        public static void UpdateKeyWordColl(string text, MemoryBondColl <string> objCharBondColl, MemoryItemColl <string> objKeyWordColl, bool bUpdateCharBondColl = true, bool bUpdateKeyWordColl = true)
        {
            if (String.IsNullOrEmpty(text))
            {
                return;
            }

            StringBuilder buffer  = new StringBuilder(); // holds the current run of related characters
            string        keyHead = text[0].ToString();  // keyHead / keyTail are the two adjacent characters

            buffer.Append(keyHead);
            for (int k = 1; k < text.Length; k++)
            {
                // The current character is the tail of the adjacent pair.
                string keyTail = text[k].ToString();

                if (bUpdateCharBondColl)
                {
                    // Record the pair before its validity is checked (same order as before).
                    DictionaryDAL.UpdateMemoryBondColl <string>(keyHead, keyTail, objCharBondColl);
                }
                if (bUpdateKeyWordColl)
                {
                    if (!DictionaryDAL.IsBondValid <string>(keyHead, keyTail, objCharBondColl))
                    {
                        // The pair is unrelated: the buffered run is a finished candidate word.
                        DictionaryDAL.UpdateMemoryItemColl <string>(buffer.ToString(), objKeyWordColl);
                        buffer.Clear();
                    }
                    // The tail either extends the current run or starts the next one.
                    buffer.Append(keyTail);
                }

                // The current character becomes the head of the next pair.
                keyHead = keyTail;
            }

            // Commit the run still held in the buffer; without this the trailing candidate
            // of every line (and any one-character text) would never reach the dictionary.
            if (bUpdateKeyWordColl && buffer.Length > 0)
            {
                DictionaryDAL.UpdateMemoryItemColl <string>(buffer.ToString(), objKeyWordColl);
            }
        }
Example #4
0
        /// <summary>
        /// Segments text into words and, optionally, updates the dictionaries from the same text first.
        /// </summary>
        /// <remarks>
        /// For every end position the method considers the candidate words ending there: the single
        /// character is always a candidate, longer substrings only when the keyword dictionary contains
        /// them. A known word scores ln(CalcRemeberValue) - ln(MinuteOffsetSize), an unknown single
        /// character scores 0, and the score of the best segmentation ending just before the candidate
        /// is added. The highest-scoring segmentation of the whole text is returned.
        /// </remarks>
        /// <param name="text">Text to segment.</param>
        /// <param name="objCharBondColl">Adjacent-character collection (used to build the keyword dictionary).</param>
        /// <param name="objKeyWordColl">Keyword dictionary.</param>
        /// <param name="maxWordLen">
        /// Maximum word length in characters (suggested: 4 for fine granularity, 7 for coarse).
        /// Zero or a negative value means no limit.
        /// </param>
        /// <param name="bUpdateCharBondColl">Whether the adjacent-character collection is updated from <paramref name="text"/>.</param>
        /// <param name="bUpdateKeyWordColl">Whether the keyword dictionary is updated from <paramref name="text"/>.</param>
        /// <returns>The words of the chosen segmentation in text order; an empty list for null or empty text.</returns>
        public static List <string> Segment(string text, MemoryBondColl <string> objCharBondColl, MemoryItemColl <string> objKeyWordColl, int maxWordLen = 7, bool bUpdateCharBondColl = true, bool bUpdateKeyWordColl = true)
        {
            if (String.IsNullOrEmpty(text))
            {
                return(new List <string>());
            }
            // A non-positive limit means "unlimited". Previously a negative value skipped every
            // candidate and produced an empty result for non-empty text.
            if (maxWordLen <= 0)
            {
                maxWordLen = text.Length;
            }

            // Trick carried over from the original author: under the forgetting formula the dictionary's
            // total stays fairly stable and comparable to MinuteOffsetSize, so MinuteOffsetSize stands in
            // for the decayed total frequency of all words. This avoids recomputing that total while
            // processing streamed data.
            double dLogTotalCount = Math.Log(objKeyWordColl.MinuteOffsetSize, Math.E);

            if (bUpdateCharBondColl || bUpdateKeyWordColl)
            {
                WordDictBLL.UpdateKeyWordColl(text, objCharBondColl, objKeyWordColl, bUpdateCharBondColl, bUpdateKeyWordColl);
            }

            // Best segmentation and its score for every prefix, keyed by the prefix's last index.
            Dictionary <int, List <string> > objKeyWordBufferDict = new Dictionary <int, List <string> >();
            Dictionary <int, double>         objKeyWordValueDict  = new Dictionary <int, double>();

            for (int k = 0; k < text.Length; k++)
            {
                List <string> objKeyWordList = new List <string>();
                // 0 doubles as "nothing chosen yet"; this sentinel is part of the scoring and is kept as is.
                double dKeyWordValue = 0;

                // Candidates end at k and are 1..maxWordLen characters long. The previous bound
                // (len <= Math.Min(k, maxWordLen), word length len + 1) let through words of
                // maxWordLen + 1 characters.
                int nLongest = Math.Min(k + 1, maxWordLen);
                for (int wordLen = 1; wordLen <= nLongest; wordLen++)
                {
                    int    startpos = k - wordLen + 1;
                    string keyword  = text.Substring(startpos, wordLen);
                    bool   bIsKnown = objKeyWordColl.Contains(keyword);

                    // Multi-character candidates must be dictionary words.
                    if (wordLen > 1 && !bIsKnown)
                    {
                        continue;
                    }

                    double dTempValue = 0;
                    if (bIsKnown)
                    {
                        dTempValue = -(dLogTotalCount - Math.Log(DictionaryDAL.CalcRemeberValue <string>(keyword, objKeyWordColl), Math.E));
                    }

                    // Add the score of the best segmentation that ends right before this candidate.
                    double dPrevValue;
                    bool   bHasPrev = objKeyWordValueDict.TryGetValue(startpos - 1, out dPrevValue);
                    if (bHasPrev)
                    {
                        dTempValue += dPrevValue;
                    }

                    if (dKeyWordValue == 0 || dTempValue > dKeyWordValue)
                    {
                        dKeyWordValue  = dTempValue;
                        objKeyWordList = bHasPrev
                                         ? new List <string>(objKeyWordBufferDict[startpos - 1])
                                         : new List <string>();
                        objKeyWordList.Add(keyword);
                    }
                }
                objKeyWordBufferDict.Add(k, objKeyWordList);
                objKeyWordValueDict.Add(k, dKeyWordValue);
            }

            return(objKeyWordBufferDict[text.Length - 1]);
        }