Example #1
0
        /// <summary>
        /// Builds a text report of the link items that are shared by all of the given keywords.
        /// </summary>
        /// <remarks>
        /// Blank keywords and keywords that are not present in <paramref name="objKeyBondColl"/> are skipped.
        /// The first usable keyword seeds the candidate set with its link items; every later keyword keeps
        /// only the candidates whose key is also contained in that keyword's link collection. Once the
        /// candidate set becomes empty it stays empty (an earlier version re-seeded it from the next keyword,
        /// which silently turned the intersection into an unrelated set).
        /// </remarks>
        /// <param name="objKeyBondColl">Bond collection the keywords are looked up in.</param>
        /// <param name="objKeyWordList">Keywords whose link collections are intersected.</param>
        /// <param name="nLinkTopCount">Forwarded unchanged to <c>KeyItemHelper.ShowKeyItemColl</c>.</param>
        /// <param name="bIsOrderbyDesc">Forwarded unchanged to <c>KeyItemHelper.ShowKeyItemColl</c>.</param>
        /// <param name="splitChar">Forwarded unchanged to <c>KeyItemHelper.ShowKeyItemColl</c>.</param>
        /// <param name="spaceChar">Forwarded unchanged to <c>KeyItemHelper.ShowKeyItemColl</c>.</param>
        /// <returns>
        /// A header line, a separator line, an empty line, a line with the accepted keywords joined by "、"
        /// inside 【】, followed by the text produced by <c>KeyItemHelper.ShowKeyItemColl</c> for the shared items.
        /// </returns>
        public static string ShowKeyBondCollEx(KeyBondColl <string, string> objKeyBondColl, List <string> objKeyWordList, int nLinkTopCount, bool bIsOrderbyDesc = true, string splitChar = "\t", string spaceChar = "\r")
        {
            StringBuilder sb = new StringBuilder();

            sb.AppendLine(String.Format("[{0}]{1}|{2}|{3}|{4}", "词项", "遗忘词频", "总词频", "词权重", "成熟度"));
            sb.AppendLine("=============================================");

            StringBuilder sbkey = new StringBuilder();
            HashSet <KeyItemMDL <string> > objBufferSet = new HashSet <KeyItemMDL <string> >();

            // Tracks whether the candidate set has been seeded. An empty set cannot be used as the
            // signal, because an intersection may legitimately become empty.
            bool bSeeded = false;

            foreach (string keyword in objKeyWordList)
            {
                if (String.IsNullOrWhiteSpace(keyword))
                {
                    continue;
                }
                if (!objKeyBondColl.Contains(keyword))
                {
                    continue;
                }
                if (sbkey.Length > 0)
                {
                    sbkey.Append("、");
                }
                sbkey.Append(keyword);

                KeyBondMDL <string, string> bond = objKeyBondColl[keyword];
                if (!bSeeded)
                {
                    // First accepted keyword: start from all of its link items.
                    objBufferSet.UnionWith(bond.LinkColl);
                    bSeeded = true;
                }
                else
                {
                    // Later keywords: keep only candidates this keyword is linked to as well.
                    HashSet <KeyItemMDL <string> > buffer = new HashSet <KeyItemMDL <string> >();
                    foreach (KeyItemMDL <string> mdl in objBufferSet)
                    {
                        if (bond.LinkColl.Contains(mdl.Key))
                        {
                            buffer.Add(mdl);
                        }
                    }
                    objBufferSet = buffer;
                }
            }

            // Copy the surviving items into a KeyItemColl, adding each key at most once.
            KeyItemColl <string> objBufferColl = new KeyItemColl <string>();
            foreach (KeyItemMDL <string> mdl in objBufferSet)
            {
                if (!objBufferColl.Contains(mdl.Key))
                {
                    objBufferColl.Add(mdl);
                }
            }

            sb.AppendLine();
            sb.AppendLine(String.Format("【{0}】", sbkey));
            sb.Append(KeyItemHelper.ShowKeyItemColl(objBufferColl, nLinkTopCount, false, bIsOrderbyDesc, false, splitChar, spaceChar));

            return(sb.ToString());
        }
Example #2
0
 /// <summary>
 /// Returns the valid count of <paramref name="tail"/> within the link collection of <paramref name="head"/>.
 /// </summary>
 /// <typeparam name="T">Type of the head key.</typeparam>
 /// <typeparam name="L">Type of the tail key.</typeparam>
 /// <param name="head">Key whose link collection is inspected.</param>
 /// <param name="tail">Linked key to evaluate.</param>
 /// <param name="objMemoryBondColl">Bond collection to look both keys up in.</param>
 /// <returns>
 /// 0 when <paramref name="head"/> is not in the collection or has no link to <paramref name="tail"/>;
 /// otherwise the value of <c>KeyItemHelper.CalcValidCount</c> for the tail in the head's link collection.
 /// </returns>
 public static double CalcTailValidCount <T, L>(T head, L tail, KeyBondColl <T, L> objMemoryBondColl)
 {
     // The second operand is only evaluated when the head exists.
     if (objMemoryBondColl.Contains(head) && objMemoryBondColl[head].LinkColl.Contains(tail))
     {
         return(KeyItemHelper.CalcValidCount(tail, objMemoryBondColl[head].LinkColl));
     }
     return(0);
 }
Example #3
0
        /// <summary>
        /// Computes an association coefficient for a (head, tail) pair using a mutual-information style
        /// ratio: P(tail | head) / P(tail).
        /// </summary>
        /// <typeparam name="T">Key type shared by head and tail.</typeparam>
        /// <param name="keyHead">Leading key of the pair.</param>
        /// <param name="keyTail">Trailing key of the pair.</param>
        /// <param name="objKeyBondColl">Bond collection holding per-key items and each key's link collection.</param>
        /// <returns>
        /// 0 when either key is missing, when the head has no link to the tail, or when any of the counts
        /// involved is below the threshold of its collection; otherwise the ratio described above.
        /// </returns>
        public static double CalcBondRelateValueByPMI <T>(T keyHead, T keyTail, KeyBondColl <T, T> objKeyBondColl)
        {
            bool bBothKnown = objKeyBondColl.Contains(keyHead) && objKeyBondColl.Contains(keyTail);
            if (!bBothKnown)
            {
                return(0);
            }
            if (!objKeyBondColl[keyHead].LinkColl.Contains(keyTail))
            {
                return(0);
            }

            // Stand-alone counts of each key, each scaled by its remember value.
            double dHeadCount = objKeyBondColl[keyHead].KeyItem.ValidCount * CalcRemeberValue <T, T>(keyHead, objKeyBondColl);
            double dTailCount = objKeyBondColl[keyTail].KeyItem.ValidCount * CalcRemeberValue <T, T>(keyTail, objKeyBondColl);

            if (dHeadCount < objKeyBondColl.Parameter.Threshold || dTailCount < objKeyBondColl.Parameter.Threshold)
            {
                return(0);
            }
            double dAllCount = objKeyBondColl.Parameter.TotalValidCount;

            // Co-occurrence count of the tail inside the head's link collection.
            KeyItemColl <T> objHeadLinks = objKeyBondColl[keyHead].LinkColl;
            KeyItemMDL <T>  objTailLink  = objHeadLinks[keyTail];
            double          dPairCount   = objTailLink.ValidCount * KeyItemHelper.CalcRemeberValue(objTailLink, objHeadLinks);
            double          dLinkTotal   = objHeadLinks.Parameter.TotalValidCount;

            if (dLinkTotal < objHeadLinks.Parameter.Threshold || dPairCount < objHeadLinks.Parameter.Threshold)
            {
                return(0);
            }

            // P(AB) = P(B|A) * P(A), hence P(AB) / (P(A) * P(B)) = P(B|A) / P(B).
            double dTailGivenHead = dPairCount / dLinkTotal;
            double dTailPrior     = dTailCount / dAllCount;
            return(dTailGivenHead / dTailPrior);
        }
Example #4
0
        /// <summary>
        /// Computes the P(tail | head) / P(tail) association ratio with 1 added to every count, so that
        /// missing keys and missing links still produce a finite, non-zero estimate.
        /// </summary>
        /// <typeparam name="T">Key type shared by head and tail.</typeparam>
        /// <param name="keyHead">Leading key of the pair.</param>
        /// <param name="keyTail">Trailing key of the pair.</param>
        /// <param name="objKeyBondColl">Bond collection holding per-key items and each key's link collection.</param>
        /// <returns>The smoothed ratio (pair count / head total) / (tail count / overall total).</returns>
        public static double CalcBondRelateValueWithLaplace <T>(T keyHead, T keyTail, KeyBondColl <T, T> objKeyBondColl)
        {
            bool bHasHead = objKeyBondColl.Contains(keyHead);

            // NOTE(review): this value is not used in the result below; it is kept so the sequence of
            // calls is unchanged. Confirm CalcRemeberValue has no side effects, then remove it.
            double dHeadCount = bHasHead ? 1 + objKeyBondColl[keyHead].KeyItem.ValidCount * CalcRemeberValue <T, T>(keyHead, objKeyBondColl) : 1;

            // Stand-alone count of the tail, plus one.
            double dTailCount = 1;
            if (objKeyBondColl.Contains(keyTail))
            {
                dTailCount = 1 + objKeyBondColl[keyTail].KeyItem.ValidCount * CalcRemeberValue <T, T>(keyTail, objKeyBondColl);
            }

            double dAllCount = 1 + 1.0 / (1 - MemoryDAL.CalcRemeberValue(1, objKeyBondColl.Parameter));

            // Co-occurrence count; an empty collection / empty item stands in when the head or link is unknown.
            KeyItemColl <T> objHeadLinks;
            if (bHasHead)
            {
                objHeadLinks = objKeyBondColl[keyHead].LinkColl;
            }
            else
            {
                objHeadLinks = new KeyItemColl <T>();
            }

            KeyItemMDL <T> objTailLink;
            if (objHeadLinks.Contains(keyTail))
            {
                objTailLink = objHeadLinks[keyTail];
            }
            else
            {
                objTailLink = new KeyItemMDL <T>();
            }

            double dPairCount = 1 + objTailLink.ValidCount * KeyItemHelper.CalcRemeberValue(objTailLink, objHeadLinks);

            double dHeadTotal = 1;
            if (bHasHead)
            {
                dHeadTotal = 1 + objKeyBondColl[keyHead].KeyItem.TotalCount;
            }

            // P(AB) = P(B|A) * P(A), hence P(AB) / (P(A) * P(B)) = P(B|A) / P(B).
            double dTailGivenHead = dPairCount / dHeadTotal;
            double dTailPrior     = dTailCount / dAllCount;
            return(dTailGivenHead / dTailPrior);
        }
Example #5
0
        /// <summary>
        /// Computes an association value for a (head, tail) pair by comparing an entropy term of the
        /// pair against an average-entropy estimate for the head's link collection.
        /// </summary>
        /// <typeparam name="T">Key type shared by head and tail.</typeparam>
        /// <param name="keyHead">Leading key of the pair.</param>
        /// <param name="keyTail">Trailing key of the pair.</param>
        /// <param name="objKeyBondColl">Bond collection holding per-key items and each key's link collection.</param>
        /// <returns>
        /// 0 when either key is missing, when the head has no link to the tail, or when any of the counts
        /// involved is below the threshold of its collection; otherwise the pair's entropy term minus
        /// the average-entropy estimate.
        /// </returns>
        public static double CalcBondRelateValueByAverageEntropy <T>(T keyHead, T keyTail, KeyBondColl <T, T> objKeyBondColl)
        {
            const double EulerConstant = 0.5772156649;

            bool bBothKnown = objKeyBondColl.Contains(keyHead) && objKeyBondColl.Contains(keyTail);
            if (!bBothKnown)
            {
                return(0);
            }
            if (!objKeyBondColl[keyHead].LinkColl.Contains(keyTail))
            {
                return(0);
            }

            // Stand-alone counts of each key, each scaled by its remember value.
            double dHeadCount = objKeyBondColl[keyHead].KeyItem.ValidCount * CalcRemeberValue <T, T>(keyHead, objKeyBondColl);
            double dTailCount = objKeyBondColl[keyTail].KeyItem.ValidCount * CalcRemeberValue <T, T>(keyTail, objKeyBondColl);

            if (dHeadCount < objKeyBondColl.Parameter.Threshold || dTailCount < objKeyBondColl.Parameter.Threshold)
            {
                return(0);
            }

            // Co-occurrence count of the tail inside the head's link collection.
            KeyItemColl <T> objHeadLinks = objKeyBondColl[keyHead].LinkColl;
            KeyItemMDL <T>  objTailLink  = objHeadLinks[keyTail];
            double          dPairCount   = objTailLink.ValidCount * KeyItemHelper.CalcRemeberValue(objTailLink, objHeadLinks);
            double          dLinkTotal   = objHeadLinks.Parameter.TotalValidCount;

            if (dLinkTotal < objHeadLinks.Parameter.Threshold || dPairCount < objHeadLinks.Parameter.Threshold)
            {
                return(0);
            }

            // Baseline: ln(number of links of the head) + Euler constant - 1.
            double dLinkCount    = objKeyBondColl[keyHead].LinkColl.Count;
            double dBaseline     = Math.Log(dLinkCount) + EulerConstant - 1;

            // Pair term: (n * c / N) * (ln N - ln c), with n = link count, c = pair count, N = link total.
            double dScaledShare  = dLinkCount * dPairCount / dLinkTotal;
            double dLogGap       = Math.Log(dLinkTotal) - Math.Log(dPairCount);
            double dPairEntropy  = dScaledShare * dLogGap;

            return(dPairEntropy - dBaseline);
        }
Example #6
0
        /// <summary>
        /// Builds a text report with one section per keyword that exists in the bond collection.
        /// </summary>
        /// <param name="objKeyBondColl">Bond collection the keywords are looked up in.</param>
        /// <param name="objKeyWordList">Keywords to report on; keywords not in the collection are skipped.</param>
        /// <param name="nLinkTopCount">Forwarded unchanged to <c>KeyItemHelper.ShowKeyItemColl</c>.</param>
        /// <param name="bIsOrderbyDesc">Forwarded unchanged to <c>KeyItemHelper.ShowKeyItemColl</c>.</param>
        /// <param name="splitChar">Forwarded unchanged to <c>KeyItemHelper.ShowKeyItemColl</c>.</param>
        /// <param name="spaceChar">Forwarded unchanged to <c>KeyItemHelper.ShowKeyItemColl</c>.</param>
        /// <returns>
        /// A header line and a separator line, then for every known keyword: an empty line, a
        /// "【key】count|total" line (valid count times remember value rounded to 4 digits, total count
        /// rounded to an integer), and the text produced by <c>KeyItemHelper.ShowKeyItemColl</c> for its links.
        /// </returns>
        public static string ShowKeyBondColl(KeyBondColl <string, string> objKeyBondColl, List <string> objKeyWordList, int nLinkTopCount, bool bIsOrderbyDesc = true, string splitChar = "\t", string spaceChar = "\r")
        {
            StringBuilder report = new StringBuilder();

            report.AppendFormat("[{0}]{1}|{2}|{3}|{4}", "词项", "遗忘词频", "总词频", "词权重", "成熟度").AppendLine();
            report.AppendLine("=============================================");

            foreach (string word in objKeyWordList)
            {
                if (objKeyBondColl.Contains(word))
                {
                    KeyBondMDL <string, string> entry = objKeyBondColl[word];

                    report.AppendLine();
                    report.AppendFormat(
                        "【{0}】{1}|{2}",
                        entry.KeyItem.Key,
                        Math.Round(entry.KeyItem.ValidCount * KeyBondHelper.CalcRemeberValue <string, string>(entry.KeyItem.Key, objKeyBondColl), 4),
                        Math.Round(entry.KeyItem.TotalCount)).AppendLine();
                    report.Append(KeyItemHelper.ShowKeyItemColl(entry.LinkColl, nLinkTopCount, false, bIsOrderbyDesc, false, splitChar, spaceChar));
                }
            }

            return(report.ToString());
        }
Example #7
0
        /// <summary>
        /// Segments text into words (and, optionally, maintains the lexicon at the same time).
        /// </summary>
        /// <remarks>
        /// For every end position k the method tries all candidate words ending at k (up to
        /// <paramref name="maxWordLen"/> characters), scores each candidate as
        /// ln(valid count) - ln(total valid count + 1) when it is in the lexicon (0 otherwise), adds the
        /// best score stored for the position just before the candidate, and keeps the best-scoring
        /// word sequence for k. The sequence stored for the last character is returned.
        /// NOTE(review): <paramref name="objCharBondColl"/> and <paramref name="nRadiusSize"/> are not
        /// referenced by this method, and the lexicon is updated when either update flag is true —
        /// confirm whether that is intended.
        /// </remarks>
        /// <param name="text">Text to segment.</param>
        /// <param name="objCharBondColl">Adjacent-key collection (used for building the lexicon).</param>
        /// <param name="objKeyWordColl">Lexicon.</param>
        /// <param name="maxWordLen">Maximum word length (suggested: 4 for fine granularity, 7 for coarse); values &lt;= 0 mean the whole text length.</param>
        /// <param name="bUpdateCharBondColl">Whether to update the adjacent-key collection as well.</param>
        /// <param name="bUpdateKeyWordColl">Whether to update the lexicon as well.</param>
        /// <param name="nRadiusSize">Effective key radius.</param>
        /// <returns>The segmentation result; an empty list when <paramref name="text"/> is null or empty.</returns>
        public static List <string> Segment(string text, KeyBondColl <string, string> objCharBondColl, KeyItemColl <string> objKeyWordColl, int maxWordLen = 7, bool bUpdateCharBondColl = true, bool bUpdateKeyWordColl = true, int nRadiusSize = 7)
        {
            if (String.IsNullOrEmpty(text))
            {
                return(new List <string>());
            }
            if (maxWordLen <= 0)
            {
                maxWordLen = text.Length;
            }

            // Log of the total word frequency (plus one).
            double dLogTotalCount = Math.Log(objKeyWordColl.Parameter.TotalValidCount + 1);

            // Best word sequence / best score for the prefix ending at each position.
            Dictionary <int, List <string> > objKeyWordBufferDict = new Dictionary <int, List <string> >();
            Dictionary <int, double>         objKeyWordValueDict  = new Dictionary <int, double>();

            for (int k = 0; k < text.Length; k++)
            {
                List <string> objKeyWordList = new List <string>();
                double        dKeyWordValue  = 0;

                for (int len = 0; len < maxWordLen; len++)
                {
                    int startpos = k - len;
                    if (startpos < 0)
                    {
                        break;
                    }
                    string keyword = text.Substring(startpos, len + 1);

                    // Look the candidate up once; the valid count is computed once and reused below.
                    bool   bKnown      = objKeyWordColl.Contains(keyword);
                    double dValidCount = bKnown ? KeyItemHelper.CalcValidCount(keyword, objKeyWordColl) : 0;

                    // Multi-character candidates must be in the lexicon and reach the threshold;
                    // a single character is always accepted as a candidate.
                    if (len > 0)
                    {
                        if (!bKnown)
                        {
                            continue;
                        }
                        if (dValidCount < objKeyWordColl.Parameter.Threshold)
                        {
                            continue;
                        }
                    }

                    double dTempValue = 0;
                    if (bKnown)
                    {
                        dTempValue = -(dLogTotalCount - Math.Log(dValidCount));
                    }

                    // Extend the best sequence that ends right before this candidate, if there is one.
                    double dPrevValue;
                    bool   bHasPrev = objKeyWordValueDict.TryGetValue(startpos - 1, out dPrevValue);
                    if (bHasPrev)
                    {
                        dTempValue += dPrevValue;
                    }

                    // NOTE(review): 0 doubles as "no candidate chosen yet", so a chosen candidate whose
                    // score is exactly 0 is replaced by the next candidate regardless of its score.
                    // Kept as-is because results depend on it — confirm before changing.
                    if (dKeyWordValue == 0 || dTempValue > dKeyWordValue)
                    {
                        dKeyWordValue  = dTempValue;
                        objKeyWordList = bHasPrev ? new List <string>(objKeyWordBufferDict[startpos - 1]) : new List <string>();
                        objKeyWordList.Add(keyword);
                    }
                }
                objKeyWordBufferDict.Add(k, objKeyWordList);
                objKeyWordValueDict.Add(k, dKeyWordValue);

                // Drop entries that are too far back to be reached by any later candidate.
                if (k > maxWordLen)
                {
                    objKeyWordBufferDict.Remove(k - maxWordLen - 1);
                    objKeyWordValueDict.Remove(k - maxWordLen - 1);
                }
            }

            if (bUpdateCharBondColl || bUpdateKeyWordColl)
            {
                KeyWordBLL.UpdateKeyWordColl(text, objKeyWordColl, maxWordLen);
            }

            return(objKeyWordBufferDict[text.Length - 1]);
        }
Example #8
0
        /// <summary>
        /// Finds the "key wings" (patterns built from a word and some of its related neighbours) that
        /// occur in a word list and are known to the wing collection, then greedily selects the
        /// highest-weighted wings until every word position that can be covered is covered.
        /// </summary>
        /// <param name="objKeyList">Word list to analyse; positions in the result index into this list.</param>
        /// <param name="objKeyWordColl">Lexicon used to weight the individual words.</param>
        /// <param name="objKeyCloudColl">Bond collection used to decide whether two words are related and how strongly.</param>
        /// <param name="objKeyWingColl">Bond collection of known wings: a wing counts as known for a word when that word's link collection contains the wing string.</param>
        /// <param name="splitChar">Separator inserted into a wing string wherever positions are skipped.</param>
        /// <param name="nRadiusSize">How many preceding positions are examined for each word.</param>
        /// <returns>Selected wing strings mapped to the positions at which they matched a known wing.</returns>
        public static Dictionary <string, HashSet <int> > GetKeyWingMatchedList(List <string> objKeyList, KeyItemColl <string> objKeyWordColl, KeyBondColl <string, string> objKeyCloudColl, KeyBondColl <string, string> objKeyWingColl, string splitChar = "\t", int nRadiusSize = 7)
        {
            // Positions that have not been covered by a selected wing yet.
            HashSet <int> objKeyPosSet = new HashSet <int>();
            // position -> (related position -> link weight); filled symmetrically.
            Dictionary <int, Dictionary <int, double> > objPosWeightDict = new Dictionary <int, Dictionary <int, double> >();

            #region 获得每个索引位对应的关联位置的关联系数
            // Step 1: for every position, collect the related positions within the radius and their weights.
            for (int k = 0; k < objKeyList.Count; k++)
            {
                objKeyPosSet.Add(k);

                objPosWeightDict.Add(k, new Dictionary <int, double>());
                string keyTail = objKeyList[k];
                // Look back over up to nRadiusSize preceding positions.
                for (int t = 1; t <= nRadiusSize; t++)
                {
                    int nPos = k - t;
                    if (nPos < 0)
                    {
                        break;
                    }
                    string keyHead = objKeyList[nPos];

                    if (objPosWeightDict.ContainsKey(nPos))
                    {
                        //double dRelateValue= KeyBondHelper.CalcBondRelateValue(keyHead,keyTail,objKeyCloudColl);
                        // if (KeyBondHelper. GetRandomNumber(0,dRelateValue) > KeyBondHelper.GetRandomNumber(0,Math.E))
                        // {
                        //     objPosWeightDict[nPos].Add(k,dRelateValue);
                        //     objPosWeightDict[k].Add(nPos,dRelateValue);
                        // }
                        if (KeyBondHelper.IsBondValid(keyHead, keyTail, objKeyCloudColl))
                        {
                            // Record the same weight in both directions.
                            double dLinkValidCount = KeyBondHelper.CalcTailValidCount(keyHead, keyTail, objKeyCloudColl);
                            objPosWeightDict[nPos].Add(k, dLinkValidCount);
                            objPosWeightDict[k].Add(nPos, dLinkValidCount);
                        }
                    }
                }
            }
            #endregion

            // wing string -> positions that took part in building it.
            Dictionary <string, HashSet <int> > objKeyPosDict = new Dictionary <string, HashSet <int> >();
            #region 将位置转换为词翼
            // Step 2: turn position sets into wing strings.
            foreach (KeyValuePair <int, Dictionary <int, double> > pair in objPosWeightDict)
            {
                // Starts with the position itself; related positions are added strongest first.
                SortedSet <int> objPosWingSet = new SortedSet <int>();
                objPosWingSet.Add(pair.Key);
                IOrderedEnumerable <KeyValuePair <int, double> > buffer = pair.Value.OrderByDescending(x => x.Value);

                foreach (KeyValuePair <int, double> kvp in buffer)
                {
                    // Build the wing from the set as it is BEFORE kvp.Key is added (see end of loop body),
                    // so the set that includes the last related position is never emitted, and a position
                    // without related positions emits nothing.
                    StringBuilder sb       = new StringBuilder();
                    int           nLastPos = -1;
                    foreach (int pos in objPosWingSet)
                    {
                        // A gap between consecutive positions (or before the first one) is marked by splitChar.
                        if (pos - nLastPos > 1)
                        {
                            sb.Append(splitChar);
                        }
                        sb.Append(objKeyList[pos]);
                        nLastPos = pos;
                    }
                    // Words remaining after the last position are marked by a trailing splitChar.
                    if (nLastPos + 1 < objKeyList.Count)
                    {
                        sb.Append(splitChar);
                    }
                    string keywing = sb.ToString();
                    if (!objKeyPosDict.ContainsKey(keywing))
                    {
                        objKeyPosDict.Add(keywing, new HashSet <int>());
                    }
                    objKeyPosDict[keywing].UnionWith(objPosWingSet);

                    objPosWingSet.Add(kvp.Key);
                }
            }
            #endregion


            // wing string -> accumulated weight of its matched words / positions of its matched words.
            Dictionary <string, double>         objKeyWeightDict = new Dictionary <string, double>();
            Dictionary <string, HashSet <int> > objKeyPosExDict  = new Dictionary <string, HashSet <int> >();

            double dLogTotalCount = Math.Log(1.0 / (1.0 - MemoryDAL.CalcRemeberValue(1, objKeyWordColl.Parameter)));

            #region 获得库中存在的词翼,同时累计匹配词的权重
            // Step 3: keep the wings that exist in the wing collection and accumulate the weight of the matching words.
            foreach (KeyValuePair <string, HashSet <int> > pair  in objKeyPosDict)
            {
                string keywing = pair.Key;
                foreach (int pos in pair.Value)
                {
                    string keyword            = objKeyList[pos];
                    // Add-one count: words missing from the lexicon count as 1.
                    double dKeyWordValidCount = objKeyWordColl.Contains(keyword) ?  KeyItemHelper.CalcValidCount(keyword, objKeyWordColl) + 1 : 1;
                    // Lower count -> larger weight.
                    double dKeyWordWeight     = dLogTotalCount - Math.Log(dKeyWordValidCount);

                    if (objKeyWingColl.Contains(keyword))
                    {
                        if (objKeyWingColl[keyword].LinkColl.Contains(keywing))
                        {
                            if (!objKeyWeightDict.ContainsKey(keywing))
                            {
                                objKeyWeightDict.Add(keywing, dKeyWordWeight);
                            }
                            else
                            {
                                objKeyWeightDict[keywing] += dKeyWordWeight;
                            }

                            if (!objKeyPosExDict.ContainsKey(keywing))
                            {
                                objKeyPosExDict.Add(keywing, new HashSet <int>());
                            }
                            objKeyPosExDict[keywing].Add(pos);
                        }
                    }
                }
            }
            #endregion


            Dictionary <string, HashSet <int> > dict = new Dictionary <string, HashSet <int> >();
            #region 获得最佳匹配词翼
            // Step 4: pick wings greedily, heaviest first.
            foreach (KeyValuePair <string, double> pair in objKeyWeightDict.OrderByDescending(x => x.Value))
            {
                // Stop once every position is covered.
                if (objKeyPosSet.Count <= 0)
                {
                    break;
                }
                // Skip wings that would not cover any still-uncovered position.
                if (!objKeyPosSet.Any(x => objKeyPosExDict[pair.Key].Contains(x)))
                {
                    continue;
                }
                if (!dict.ContainsKey(pair.Key))
                {
                    dict.Add(pair.Key, objKeyPosExDict[pair.Key]);
                }
                // Mark the wing's positions as covered.
                objKeyPosSet.RemoveWhere(x => objKeyPosExDict[pair.Key].Contains(x));
            }
            #endregion
            return(dict);
        }