Ejemplo n.º 1
0
        /// <summary>
        /// 按权重排序输出词库
        /// </summary>
        /// <param name="objMemoryItemColl">词库</param>
        /// <param name="nKeyWordTopCount">输出词的数量</param>
        /// <param name="bOrderbyDesc">是否倒序</param>
        /// <param name="bIsOnlyWord">是否仅输出词</param>
        /// <returns>输出的结果</returns>
        public static string ShowKeyWordWeightColl(MemoryItemColl <string> objMemoryItemColl, int nKeyWordTopCount, bool bOrderbyDesc = true, bool bIsOnlyWord = true)
        {
            double        dTotalVaildDegree = objMemoryItemColl.Sum(x => x.ValidDegree / x.ValidCount) / objMemoryItemColl.Count;
            StringBuilder sb = new StringBuilder();

            sb.AppendLine(String.Format("词库成熟度:{0}% ;", dTotalVaildDegree > 1?0:Math.Round((1 - dTotalVaildDegree) * 100, 2)));
            sb.AppendLine("----------------------------------------");
            sb.AppendLine(String.Format(" 【{0}】 | {1} | {2} | {3} | {4}", "主词", "遗忘词频", "累计词频", "词权值", "成熟度(%)"));

            IOrderedEnumerable <MemoryItemMDL <string> > tbuffer = null;

            if (bOrderbyDesc)
            {
                tbuffer = from x in objMemoryItemColl
                                                                                                                                           //如果只显示词,则要求:长度大于1、不包含符号、不是纯数字
                          where !bIsOnlyWord || x.Key.Length > 1 && !Regex.IsMatch(x.Key, @"[\p{P}\s]") && !Regex.IsMatch(x.Key, @"^\d+$") //&& !Regex.IsMatch(x.Key, @"^[a-zA-Z\p{P}\d\s=]+$")
                                                                                                                                           //按权重排序
                          orderby x.ValidCount <= 0 ? 0 : (x.ValidCount) * (Math.Log(objMemoryItemColl.MinuteOffsetSize) - Math.Log(x.ValidCount)) descending
                          select x;
            }
            else
            {
                tbuffer = from x in objMemoryItemColl
                                                                                                                                           //如果只显示词,则要求:长度大于1、不包含符号、不是纯数字
                          where !bIsOnlyWord || x.Key.Length > 1 && !Regex.IsMatch(x.Key, @"[\p{P}\s]") && !Regex.IsMatch(x.Key, @"^\d+$") //&& !Regex.IsMatch(x.Key, @"^[a-zA-Z\p{P}\d\s=]+$")
                                                                                                                                           //按权重排序
                          orderby x.ValidCount <= 0 ? 0 : (x.ValidCount) * (Math.Log(objMemoryItemColl.MinuteOffsetSize) - Math.Log(x.ValidCount)) ascending
                          select x;
            }
            var buffer = (tbuffer).Take(nKeyWordTopCount);

            sb.AppendLine(String.Format(" =========== 共{0} 个 ============= ", tbuffer.Count()));
            //逐词输出,每个词一行
            foreach (var x in buffer)
            {
                sb.AppendLine(String.Format(" 【{0}】 | {1} | {2} | {3} | {4}",
                                            x.Key,
                                            Math.Round(DictionaryDAL.CalcRemeberValue <string>(x.Key, objMemoryItemColl), 2),
                                            x.TotalCount,
                                            Math.Round((x.ValidCount <= 0 ? 0 : (x.ValidCount) * (Math.Log(objMemoryItemColl.MinuteOffsetSize) - Math.Log(x.ValidCount))), 4),
                                            x.ValidCount <= 1?0: x.ValidDegree / x.ValidCount > 1 ? 0 : Math.Round((1 - x.ValidDegree / x.ValidCount) * 100, 2)));
            }
            return(sb.ToString());
        }
Ejemplo n.º 2
0
        /// <summary>
        /// 分词(同时自动维护词典)
        /// </summary>
        /// <param name="text">待分词文本</param>
        /// <param name="objCharBondColl">邻键集合(用于生成词库)</param>
        /// <param name="objKeyWordColl">词库</param>
        /// <param name="maxWordLen">最大词长(建议:细粒度为4、粗粒度为7)</param>
        /// <param name="bUpdateCharBondColl">是否同时更新邻键集合</param>
        /// <param name="bUpdateKeyWordColl">是否同时更新词库</param>
        /// <returns>返回分词结果</returns>
        public static List <string> Segment(string text, MemoryBondColl <string> objCharBondColl, MemoryItemColl <string> objKeyWordColl, int maxWordLen = 7, bool bUpdateCharBondColl = true, bool bUpdateKeyWordColl = true)
        {
            if (String.IsNullOrEmpty(text))
            {
                return(new List <string>());
            }
            if (maxWordLen == 0)
            {
                maxWordLen = text.Length;
            }

            //此处使用了个技巧:偶尔发现,词库在遗忘公式作用下,其总量也为相对稳定的固定值,且与MinuteOffsetSize相当。
            //故此处以此替换所有词的遗忘后的总词频,这样可以在处理流式数据时,避免动态计算词库总词频(因其计算量较大)。
            double dLogTotalCount = Math.Log(objKeyWordColl.MinuteOffsetSize, Math.E);

            if (bUpdateCharBondColl || bUpdateKeyWordColl)
            {
                WordDictBLL.UpdateKeyWordColl(text, objCharBondColl, objKeyWordColl, bUpdateCharBondColl, bUpdateKeyWordColl);
            }

            Dictionary <int, List <string> > objKeyWordBufferDict = new Dictionary <int, List <string> >();
            Dictionary <int, double>         objKeyWordValueDict  = new Dictionary <int, double>();

            for (int k = 0; k < text.Length; k++)
            {
                List <string> objKeyWordList = new List <string>();
                double        dKeyWordValue  = 0;

                for (int len = 0; len <= Math.Min(k, maxWordLen); len++)
                {
                    int    startpos = k - len;
                    string keyword  = text.Substring(startpos, len + 1);
                    if (len > 0 && !objKeyWordColl.Contains(keyword))
                    {
                        continue;
                    }
                    double dTempValue = 0;
                    if (objKeyWordColl.Contains(keyword))
                    {
                        dTempValue = -(dLogTotalCount - Math.Log(DictionaryDAL.CalcRemeberValue <string>(keyword, objKeyWordColl), Math.E));
                    }
                    if (objKeyWordValueDict.ContainsKey(startpos - 1))
                    {
                        dTempValue += objKeyWordValueDict[startpos - 1];
                        if (dKeyWordValue == 0 || dTempValue > dKeyWordValue)
                        {
                            dKeyWordValue  = dTempValue;
                            objKeyWordList = new List <string>(objKeyWordBufferDict[startpos - 1]);
                            objKeyWordList.Add(keyword);
                        }
                    }
                    else
                    {
                        if (dKeyWordValue == 0 || dTempValue > dKeyWordValue)
                        {
                            dKeyWordValue  = dTempValue;
                            objKeyWordList = new List <string>();
                            objKeyWordList.Add(keyword);
                        }
                    }
                }
                objKeyWordBufferDict.Add(k, objKeyWordList);
                objKeyWordValueDict.Add(k, dKeyWordValue);
            }

            return(objKeyWordBufferDict[text.Length - 1]);
        }