コード例 #1
0
        //public static void UpdateKeyWingColl(List<string> objKeyList,KeyBondColl<string,string> objKeyCloudColl,KeyBondColl<string,string> objKeyWingColl,int nRadiusSize = 7,string splitChar = "\t")
        //{
        //    Dictionary<int,PosWingColl> dict= GetPosWingDict(objKeyList,objKeyCloudColl,nRadiusSize);
        //    UpdateKeyWingColl(objKeyList,dict,objKeyWingColl,splitChar);
        //}
        //public static void UpdateKeyWingColl(List<string> objKeyList,Dictionary<int,PosWingColl> objPosWingDict,KeyBondColl<string,string> objKeyWingColl, string splitChar="\t")
        //{
        //    HashSet<int> objPosSet = new HashSet<int>();
        //    for (int k = 0; k < objKeyList.Count; k++)
        //    {
        //        objPosSet.Add(k);
        //    }
        //    PosWingColl objPosWingColl = new KeyWing.PosWingColl();
        //    foreach (KeyValuePair<int,PosWingColl> pair in objPosWingDict)
        //    {
        //        foreach (PosWingMDL mdl in pair.Value)
        //        {
        //            if (objPosWingColl.Contains(mdl.KeyIndex)) continue;
        //            objPosWingColl.Add(mdl);
        //        }
        //    }
        //    PosWingColl objResultColl = new PosWingColl();

        //    var objOrderedColl = objPosWingColl.OrderByDescending(x => x.Weight);
        //    foreach (PosWingMDL mdl in objOrderedColl)
        //    {
        //        if (objPosSet.Count <= 0) break;
        //        if (mdl.PosSet.All(x => !objPosSet.Contains(x))) continue; //没有包含新词,则跳过
        //        if (!objResultColl.Contains(mdl.KeyIndex))
        //        {
        //            objResultColl.Add(mdl);
        //        }
        //        objPosSet.RemoveWhere(x => mdl.PosSet.Contains(x));

        //    }


        //    foreach (PosWingMDL mdl in objResultColl)
        //    {
        //        SortedSet<int> set = new SortedSet<int>(mdl.PosSet);
        //        StringBuilder sb = new StringBuilder();
        //        int prev = -1;
        //        foreach (int k in set)
        //        {
        //            if (prev >= 0) sb.Append(splitChar);
        //            sb.Append(String.Format("{0}",objKeyList[k]));
        //            prev = k;
        //        }
        //        string keywing = sb.ToString();
        //        foreach (int k in mdl.PosSet)
        //        {
        //            string keyword = objKeyList[k];
        //            KeyBondDAL.UpdateKeyBondColl<string,string>(keyword,keywing,objKeyWingColl,new OffsetWeightMDL(1,1),new OffsetWeightMDL(1,1));
        //        }
        //    }
        //}
        //public static Dictionary<int,PosWingColl> GetPosWingDict(List<string> objKeyList,KeyBondColl<string,string> objKeyCloudColl,int nRadiusSize = 7 )
        //{
        //    double dTotalCount = 1.0 / ( 1.0 - MemoryDAL.CalcRemeberValue(1,objKeyCloudColl.Parameter) );
        //    double dLogTotalCount = Math.Log(dTotalCount);

        //    Dictionary<int,PosWingColl> dict = new Dictionary<int, PosWingColl>();
        //    for (int k = 0; k < objKeyList.Count; k++)
        //    {

        //        PosWingColl objPosWingColl = new PosWingColl();
        //        HashSet<int> objLinkSet = new HashSet<int>();

        //        string keyword = objKeyList[k];
        //        double weight = 0;
        //        if (objKeyCloudColl.Contains(keyword))
        //        {
        //            KeyItemMDL<string> objKeyItem = objKeyCloudColl[keyword].KeyItem;
        //            if (objKeyItem.ValidCount > objKeyCloudColl.Parameter.Threshold)
        //            {
        //                weight = dLogTotalCount - Math.Log(objKeyItem.ValidCount * KeyBondHelper.CalcRemeberValue(keyword,objKeyCloudColl));
        //            }
        //        }

        //        //将自身添加进集合
        //        {
        //            objLinkSet.Add(k);
        //            PosWingMDL mdl = new PosWingMDL();
        //            mdl.Weight = weight;
        //            mdl.PosSet.Add(k);
        //            mdl.KeyIndex = objPosWingColl.GetKeyIndex(mdl.PosSet);
        //            if (!objPosWingColl.Contains(mdl.KeyIndex))
        //            {
        //                objPosWingColl.Add(mdl);
        //            }
        //        }
        //        string keyTail = keyword;
        //        for (int t = 1; t <= nRadiusSize; t++)
        //        {
        //            int nPos = k - t;
        //            if (nPos <= 0) break;
        //            string keyHead = objKeyList[nPos];

        //            if (dict.ContainsKey(nPos))
        //            {
        //                if (KeyBondHelper.IsBondValid(keyHead,keyTail,objKeyCloudColl))
        //                {
        //                    objLinkSet.Add(nPos);
        //                    foreach (PosWingMDL headMDL in dict[nPos])
        //                    {
        //                        PosWingMDL mdl = new PosWingMDL();
        //                        mdl.Weight = headMDL.Weight + weight;
        //                        mdl.PosSet.UnionWith(headMDL.PosSet);
        //                        mdl.PosSet.Add(k);
        //                        mdl.KeyIndex = objPosWingColl.GetKeyIndex(mdl.PosSet);
        //                        if (!objPosWingColl.Contains(mdl.KeyIndex))
        //                        {
        //                            objPosWingColl.Add(mdl);
        //                        }
        //                    }
        //                }

        //            }
        //        }

        //        PosWingColl objResultColl = new PosWingColl();

        //        var objOrderedColl= objPosWingColl.OrderByDescending(x => x.Weight);
        //        foreach (PosWingMDL mdl in objOrderedColl)
        //        {
        //            if (objLinkSet.Count <= 0) break;
        //            if (mdl.PosSet.All(x => !objLinkSet.Contains(x))) continue; //没有包含新词,则跳过
        //            if (!objResultColl.Contains(mdl.KeyIndex))
        //            {
        //                objResultColl.Add(mdl);
        //            }
        //            objLinkSet.RemoveWhere(x => mdl.PosSet.Contains(x));

        //        }

        //        dict.Add(k,objResultColl);
        //    }

        //    return dict;
        //}

        #endregion
        #region  局部词翼

        /// <summary>
        /// Builds, for every position in <paramref name="objKeyList"/>, the sorted set of positions
        /// that are bonded to it inside a backward-looking window.
        /// </summary>
        /// <remarks>
        /// Each position <c>k</c> is paired with up to <paramref name="nRadiusSize"/> preceding
        /// positions, down to and including position 0. A pair is recorded in both directions when
        /// <c>KeyBondHelper.IsBondValid</c> accepts it. Every position's set also contains the
        /// position itself.
        /// </remarks>
        /// <typeparam name="T">Type of the keys held in the list and in the bond collection.</typeparam>
        /// <param name="objKeyList">Ordered keys; list indices are the positions used in the result.</param>
        /// <param name="objKeyCloudColl">Bond collection handed to <c>KeyBondHelper.IsBondValid</c>.</param>
        /// <param name="nRadiusSize">Maximum backward distance examined per position; values below 1 yield no links.</param>
        /// <returns>
        /// A dictionary keyed by position whose value is the sorted set made of that position and
        /// every position linked to it.
        /// </returns>
        public static Dictionary<int, SortedSet<int>> GetPosWingDict<T>(List<T> objKeyList, KeyBondColl<T, T> objKeyCloudColl, int nRadiusSize = 7)
        {
            Dictionary<int, SortedSet<int>> dict = new Dictionary<int, SortedSet<int>>();

            for (int k = 0; k < objKeyList.Count; k++)
            {
                // Every position is a member of its own set.
                SortedSet<int> tailSet = new SortedSet<int> { k };
                dict.Add(k, tailSet);

                T keyTail = objKeyList[k];
                for (int t = 1; t <= nRadiusSize; t++)
                {
                    int nPos = k - t;

                    // Stop only once the window runs off the front of the list. Index 0 is a valid
                    // head position; the previous "<= 0" guard skipped it, so the first key could
                    // never be linked to anything.
                    if (nPos < 0)
                    {
                        break;
                    }

                    // Every index below k was registered by an earlier iteration, so dict[nPos] exists.
                    if (KeyBondHelper.IsBondValid(objKeyList[nPos], keyTail, objKeyCloudColl))
                    {
                        dict[nPos].Add(k);
                        tailSet.Add(nPos);
                    }
                }
            }

            return dict;
        }
コード例 #2
0
        /// <summary>
        /// Selects the key-wings that best cover the positions of <paramref name="objKeyList"/>.
        /// </summary>
        /// <remarks>
        /// The method works in four stages:
        /// 1. link each position with up to <paramref name="nRadiusSize"/> preceding positions accepted
        ///    by <c>KeyBondHelper.IsBondValid</c>, weighting each link with <c>KeyBondHelper.CalcTailValidCount</c>;
        /// 2. turn each position's links into candidate key-wing strings;
        /// 3. keep the candidates that appear in <paramref name="objKeyWingColl"/> as a link of one of their
        ///    own keywords, summing a per-keyword weight;
        /// 4. walk the candidates by descending weight and keep each one that still covers at least one
        ///    position not covered by an earlier pick.
        /// </remarks>
        /// <param name="objKeyList">Ordered keywords; list indices are the positions used throughout.</param>
        /// <param name="objKeyWordColl">Keyword collection used to derive the per-keyword weight.</param>
        /// <param name="objKeyCloudColl">Bond collection used to validate and weight position links.</param>
        /// <param name="objKeyWingColl">Bond collection in which candidate key-wings are looked up.</param>
        /// <param name="splitChar">Marker inserted in the key-wing text wherever positions are skipped.</param>
        /// <param name="nRadiusSize">Maximum backward distance examined when linking positions.</param>
        /// <returns>The selected key-wing strings, each mapped to the positions that matched it.</returns>
        public static Dictionary<string, HashSet<int>> GetKeyWingMatchedList(List<string> objKeyList, KeyItemColl<string> objKeyWordColl, KeyBondColl<string, string> objKeyCloudColl, KeyBondColl<string, string> objKeyWingColl, string splitChar = "\t", int nRadiusSize = 7)
        {
            HashSet<int> uncoveredPositions = new HashSet<int>();
            Dictionary<int, Dictionary<int, double>> linkWeightsByPos = new Dictionary<int, Dictionary<int, double>>();

            // Stage 1: for every position, collect the linked positions together with the link weight.
            for (int tailPos = 0; tailPos < objKeyList.Count; tailPos++)
            {
                uncoveredPositions.Add(tailPos);

                Dictionary<int, double> tailLinks = new Dictionary<int, double>();
                linkWeightsByPos.Add(tailPos, tailLinks);

                string tailKey = objKeyList[tailPos];
                for (int offset = 1; offset <= nRadiusSize && tailPos - offset >= 0; offset++)
                {
                    int    headPos = tailPos - offset;
                    string headKey = objKeyList[headPos];

                    Dictionary<int, double> headLinks;
                    if (!linkWeightsByPos.TryGetValue(headPos, out headLinks))
                    {
                        continue;
                    }
                    if (!KeyBondHelper.IsBondValid(headKey, tailKey, objKeyCloudColl))
                    {
                        continue;
                    }

                    // The same weight is stored for both directions of the link.
                    double linkValidCount = KeyBondHelper.CalcTailValidCount(headKey, tailKey, objKeyCloudColl);
                    headLinks.Add(tailPos, linkValidCount);
                    tailLinks.Add(headPos, linkValidCount);
                }
            }

            // Stage 2: convert the linked positions into candidate key-wing strings.
            Dictionary<string, HashSet<int>> candidatePositions = new Dictionary<string, HashSet<int>>();
            foreach (KeyValuePair<int, Dictionary<int, double>> entry in linkWeightsByPos)
            {
                SortedSet<int> wingPositions = new SortedSet<int> { entry.Key };

                foreach (KeyValuePair<int, double> link in entry.Value.OrderByDescending(x => x.Value))
                {
                    // The text is built from the positions gathered so far, i.e. before this
                    // neighbour is added; the set that includes the last neighbour is never emitted.
                    StringBuilder wingText    = new StringBuilder();
                    int           previousPos = -1;
                    foreach (int pos in wingPositions)
                    {
                        // A gap between consecutive positions (or before the first one) is marked with splitChar.
                        if (pos - previousPos > 1)
                        {
                            wingText.Append(splitChar);
                        }
                        wingText.Append(objKeyList[pos]);
                        previousPos = pos;
                    }

                    // Mark the gap between the last position and the end of the list.
                    if (previousPos + 1 < objKeyList.Count)
                    {
                        wingText.Append(splitChar);
                    }

                    string keywing = wingText.ToString();

                    HashSet<int> positions;
                    if (!candidatePositions.TryGetValue(keywing, out positions))
                    {
                        positions = new HashSet<int>();
                        candidatePositions.Add(keywing, positions);
                    }
                    positions.UnionWith(wingPositions);

                    wingPositions.Add(link.Key);
                }
            }

            Dictionary<string, double>        wingWeights      = new Dictionary<string, double>();
            Dictionary<string, HashSet<int>> matchedPositions = new Dictionary<string, HashSet<int>>();

            double dLogTotalCount = Math.Log(1.0 / (1.0 - MemoryDAL.CalcRemeberValue(1, objKeyWordColl.Parameter)));

            // Stage 3: keep the candidates known to objKeyWingColl and accumulate the weight of the matching keywords.
            foreach (KeyValuePair<string, HashSet<int>> candidate in candidatePositions)
            {
                string keywing = candidate.Key;
                foreach (int pos in candidate.Value)
                {
                    string keyword = objKeyList[pos];

                    // Keywords missing from objKeyWordColl count as 1; known ones as their valid count plus 1.
                    double validCount = 1;
                    if (objKeyWordColl.Contains(keyword))
                    {
                        validCount = KeyItemHelper.CalcValidCount(keyword, objKeyWordColl) + 1;
                    }
                    double keywordWeight = dLogTotalCount - Math.Log(validCount);

                    if (!objKeyWingColl.Contains(keyword) || !objKeyWingColl[keyword].LinkColl.Contains(keywing))
                    {
                        continue;
                    }

                    double accumulated;
                    if (wingWeights.TryGetValue(keywing, out accumulated))
                    {
                        wingWeights[keywing] = accumulated + keywordWeight;
                    }
                    else
                    {
                        wingWeights.Add(keywing, keywordWeight);
                    }

                    HashSet<int> hits;
                    if (!matchedPositions.TryGetValue(keywing, out hits))
                    {
                        hits = new HashSet<int>();
                        matchedPositions.Add(keywing, hits);
                    }
                    hits.Add(pos);
                }
            }

            // Stage 4: pick key-wings by descending weight until every position is covered.
            Dictionary<string, HashSet<int>> dict = new Dictionary<string, HashSet<int>>();
            foreach (KeyValuePair<string, double> ranked in wingWeights.OrderByDescending(x => x.Value))
            {
                if (uncoveredPositions.Count <= 0)
                {
                    break;
                }

                HashSet<int> wingHits = matchedPositions[ranked.Key];

                // Skip a key-wing that covers nothing new.
                if (!uncoveredPositions.Overlaps(wingHits))
                {
                    continue;
                }

                if (!dict.ContainsKey(ranked.Key))
                {
                    dict.Add(ranked.Key, wingHits);
                }
                uncoveredPositions.ExceptWith(wingHits);
            }

            return dict;
        }