/// <summary>
/// Registers one occurrence of <paramref name="head"/> together with <paramref name="tail"/>
/// in <paramref name="objKeyBondColl"/> and returns the bond entry stored for <paramref name="head"/>.
/// </summary>
/// <typeparam name="T">Type of the head key.</typeparam>
/// <typeparam name="L">Type of the linked (tail) key.</typeparam>
/// <param name="head">Key whose bond entry is created (when missing) and then updated.</param>
/// <param name="tail">Key forwarded to <c>KeyItemDAL.UpdateKeyItemColl</c> for the head's link collection.</param>
/// <param name="objKeyBondColl">Bond collection to update; its <c>Parameter.TotalValidCount</c> and <c>Parameter.TotalOffset</c> are modified.</param>
/// <param name="objHeadWeightMDL">Supplies the weight added to the head counters and the offset added to the collection's <c>TotalOffset</c>.</param>
/// <param name="objTailWeightMDL">
/// Weight/offset forwarded for the tail. When its <c>Offset</c> is negative, the link collection's
/// <c>TotalOffset</c> is set to the bond collection's and <c>Offset</c> is overwritten with 0.
/// </param>
/// <returns>The entry of <paramref name="objKeyBondColl"/> indexed by <paramref name="head"/>.</returns>
public static KeyBondMDL<T, L> UpdateKeyBondColl<T, L>(T head, L tail, KeyBondColl<T, L> objKeyBondColl, OffsetWeightMDL objHeadWeightMDL, OffsetWeightMDL objTailWeightMDL)
{
    if (!objKeyBondColl.Contains(head))
    {
        // First time this head is seen: create its entry, stamp it with the current
        // offset and copy the container settings onto its link collection.
        KeyBondMDL<T, L> created = new KeyBondMDL<T, L>();
        created.KeyItem.Key = head;
        created.KeyItem.UpdateOffset = objKeyBondColl.Parameter.TotalOffset;
        created.LinkColl.Parameter.ContainerSize = objKeyBondColl.Parameter.ContainerSize;
        created.LinkColl.Parameter.Threshold = objKeyBondColl.Parameter.Threshold;
        objKeyBondColl.Add(created);
    }

    KeyBondMDL<T, L> entry = objKeyBondColl[head];
    KeyItemMDL<T> headItem = entry.KeyItem;

    // ValidCount is scaled by the remember value for the offset elapsed since the item's
    // last update; TotalCount is scaled by the remember value for an offset of 1.
    double dValidFactor = MemoryDAL.CalcRemeberValue(objKeyBondColl.Parameter.TotalOffset - headItem.UpdateOffset, objKeyBondColl.Parameter);
    double dTotalFactor = MemoryDAL.CalcRemeberValue(1, objKeyBondColl.Parameter);
    headItem.ValidCount = objHeadWeightMDL.Weight + headItem.ValidCount * dValidFactor;
    headItem.TotalCount = objHeadWeightMDL.Weight + headItem.TotalCount * dTotalFactor;
    headItem.UpdateOffset = objKeyBondColl.Parameter.TotalOffset;

    KeyItemColl<L> links = entry.LinkColl;
    bool bInheritMainOffset = objTailWeightMDL.Offset < 0;
    if (bInheritMainOffset)
    {
        // Inherit the main counter: the link collection adopts the bond collection's offset.
        links.Parameter.TotalOffset = objKeyBondColl.Parameter.TotalOffset;
        objTailWeightMDL.Offset = 0;
    }
    KeyItemDAL.UpdateKeyItemColl(tail, links, objTailWeightMDL);

    // Collection-wide totals are updated last, after the per-item counters above.
    double dCollectionFactor = MemoryDAL.CalcRemeberValue(objHeadWeightMDL.Offset, objKeyBondColl.Parameter);
    objKeyBondColl.Parameter.TotalValidCount = objHeadWeightMDL.Weight + objKeyBondColl.Parameter.TotalValidCount * dCollectionFactor;
    objKeyBondColl.Parameter.TotalOffset += objHeadWeightMDL.Offset;

    return objKeyBondColl[head];
}
/// <summary>
/// Feeds every character n-gram of <paramref name="line"/> (at most <paramref name="nMaxWordSize"/>
/// characters long) into <paramref name="objKeyWordColl"/>.
/// </summary>
/// <remarks>
/// Characters matching <c>\p{C}</c> are removed first. The remaining text is split on single
/// whitespace / punctuation characters; because the split pattern uses a capturing group, the
/// separator characters are returned as one-character parts and are counted too. N-grams never
/// span two parts. For each end position the n-grams are submitted from shortest to longest,
/// each with a fresh <c>OffsetWeightMDL(1, 1)</c>.
/// </remarks>
/// <param name="line">Text to scan.</param>
/// <param name="objKeyWordColl">Keyword collection updated through <c>KeyItemDAL.UpdateKeyItemColl</c>.</param>
/// <param name="nMaxWordSize">Maximum n-gram length; values of 0 or less produce no updates.</param>
public static void UpdateKeyWordCollByNGram(string line, KeyItemColl<string> objKeyWordColl, int nMaxWordSize = 7)
{
    string cleaned = Regex.Replace(line, @"\p{C}", "");

    foreach (string segment in Regex.Split(cleaned, @"([\s\p{P}\p{C}])"))
    {
        if (String.IsNullOrEmpty(segment))
        {
            continue;
        }

        for (int nEnd = 0; nEnd < segment.Length; nEnd++)
        {
            // An n-gram ending at nEnd cannot be longer than nEnd + 1 characters.
            int nLongest = Math.Min(nMaxWordSize, nEnd + 1);
            for (int nLength = 1; nLength <= nLongest; nLength++)
            {
                string gram = segment.Substring(nEnd - nLength + 1, nLength);
                WSR_Forget_Core.KeyItem.KeyItemDAL.UpdateKeyItemColl(gram, objKeyWordColl, new OffsetWeightMDL(1, 1));
            }
        }
    }
}
/// <summary>
/// Renders, as text, the link items shared by every usable keyword in <paramref name="objKeyWordList"/>.
/// </summary>
/// <remarks>
/// Keywords that are blank or have no entry in <paramref name="objKeyBondColl"/> are skipped.
/// The first usable keyword seeds the result with its link items; every further keyword keeps
/// only the items whose key also exists in its own link collection. The item objects shown are
/// therefore those of the first usable keyword. Once the intersection is empty it stays empty
/// (an explicit flag, not the set size, marks the first keyword).
/// </remarks>
/// <param name="objKeyBondColl">Bond collection to read.</param>
/// <param name="objKeyWordList">Keywords whose link collections are intersected.</param>
/// <param name="nLinkTopCount">Forwarded to <c>KeyItemHelper.ShowKeyItemColl</c>.</param>
/// <param name="bIsOrderbyDesc">Forwarded to <c>KeyItemHelper.ShowKeyItemColl</c>.</param>
/// <param name="splitChar">Forwarded to <c>KeyItemHelper.ShowKeyItemColl</c>.</param>
/// <param name="spaceChar">Forwarded to <c>KeyItemHelper.ShowKeyItemColl</c>.</param>
/// <returns>A header, the list of keywords used, and the rendered shared link items.</returns>
public static string ShowKeyBondCollEx(KeyBondColl<string, string> objKeyBondColl, List<string> objKeyWordList, int nLinkTopCount, bool bIsOrderbyDesc = true, string splitChar = "\t", string spaceChar = "\r")
{
    StringBuilder sb = new StringBuilder();
    sb.AppendLine(String.Format("[{0}]{1}|{2}|{3}|{4}", "词项", "遗忘词频", "总词频", "词权重", "成熟度"));
    sb.AppendLine("=============================================");

    StringBuilder sbkey = new StringBuilder();
    HashSet<KeyItemMDL<string>> objBufferSet = new HashSet<KeyItemMDL<string>>();
    // Tracks whether the set has been seeded. Testing "Count <= 0" instead would re-seed the
    // set after the intersection legitimately became empty.
    bool bIsSeeded = false;

    foreach (string keyword in objKeyWordList)
    {
        if (String.IsNullOrWhiteSpace(keyword))
        {
            continue;
        }
        if (!objKeyBondColl.Contains(keyword))
        {
            continue;
        }

        if (sbkey.Length > 0)
        {
            sbkey.Append("、");
        }
        sbkey.Append(keyword);

        KeyItemColl<string> objLinkColl = objKeyBondColl[keyword].LinkColl;
        if (!bIsSeeded)
        {
            objBufferSet.UnionWith(objLinkColl);
            bIsSeeded = true;
        }
        else
        {
            // Keep only the items this keyword links to as well.
            objBufferSet.RemoveWhere(mdl => !objLinkColl.Contains(mdl.Key));
        }
    }

    KeyItemColl<string> objBufferColl = new KeyItemColl<string>();
    foreach (KeyItemMDL<string> mdl in objBufferSet)
    {
        if (!objBufferColl.Contains(mdl.Key))
        {
            objBufferColl.Add(mdl);
        }
    }

    sb.AppendLine();
    sb.AppendLine(String.Format("【{0}】", sbkey));
    sb.Append(KeyItemHelper.ShowKeyItemColl(objBufferColl, nLinkTopCount, false, bIsOrderbyDesc, false, splitChar, spaceChar));
    return sb.ToString();
}
/// <summary>
/// Returns the remember value of the item stored under <paramref name="key"/>.
/// </summary>
/// <typeparam name="T">Key type of the collection.</typeparam>
/// <param name="key">Key to look up.</param>
/// <param name="objMemoryItemColl">Collection holding the item and the parameters used for the calculation.</param>
/// <returns>
/// 0 when the collection has no item for <paramref name="key"/>; otherwise the result of
/// <c>MemoryDAL.CalcRemeberValue</c> for the difference between the collection's
/// <c>TotalOffset</c> and the item's <c>UpdateOffset</c>.
/// </returns>
public static double CalcRemeberValue<T>(T key, KeyItemColl<T> objMemoryItemColl)
{
    if (!objMemoryItemColl.Contains(key))
    {
        return 0;
    }

    KeyItemMDL<T> item = objMemoryItemColl[key];
    return MemoryDAL.CalcRemeberValue(objMemoryItemColl.Parameter.TotalOffset - item.UpdateOffset, objMemoryItemColl.Parameter);
}
/// <summary>
/// Computes the association coefficient between two keys using mutual information.
/// </summary>
/// <typeparam name="T">Key type of the bond collection.</typeparam>
/// <param name="keyHead">Leading key.</param>
/// <param name="keyTail">Following key.</param>
/// <param name="objKeyBondColl">Bond collection holding the counts for both keys.</param>
/// <returns>
/// <c>P(tail | head) / P(tail)</c> built from counts scaled by their remember values, or 0 when
/// either key has no bond entry, the head has no link to the tail, or one of the counts involved
/// is below the threshold of its collection.
/// </returns>
public static double CalcBondRelateValueByPMI<T>(T keyHead, T keyTail, KeyBondColl<T, T> objKeyBondColl)
{
    bool bBothKnown = objKeyBondColl.Contains(keyHead) && objKeyBondColl.Contains(keyTail);
    if (!bBothKnown)
    {
        return 0;
    }

    KeyItemColl<T> objLinkColl = objKeyBondColl[keyHead].LinkColl;
    if (!objLinkColl.Contains(keyTail))
    {
        return 0;
    }

    // Counts of each key on its own, scaled by the key's remember value.
    double dHeadValidCount = objKeyBondColl[keyHead].KeyItem.ValidCount * CalcRemeberValue<T, T>(keyHead, objKeyBondColl);
    double dTailValidCount = objKeyBondColl[keyTail].KeyItem.ValidCount * CalcRemeberValue<T, T>(keyTail, objKeyBondColl);
    if (dHeadValidCount < objKeyBondColl.Parameter.Threshold || dTailValidCount < objKeyBondColl.Parameter.Threshold)
    {
        return 0;
    }
    double dTotalValidCount = objKeyBondColl.Parameter.TotalValidCount;

    // Co-occurrence count of the tail within the head's link collection.
    KeyItemMDL<T> objLinkMDL = objLinkColl[keyTail];
    double dShareValidCount = objLinkMDL.ValidCount * KeyItemHelper.CalcRemeberValue(objLinkMDL, objLinkColl);
    double dShareTotalCount = objLinkColl.Parameter.TotalValidCount;
    if (dShareTotalCount < objLinkColl.Parameter.Threshold || dShareValidCount < objLinkColl.Parameter.Threshold)
    {
        return 0;
    }

    // P(AB) = P(B|A) * P(A)
    // result = P(AB) / (P(A) * P(B)) = P(B|A) / P(B)
    return (dShareValidCount / dShareTotalCount) / (dTailValidCount / dTotalValidCount);
}
/// <summary>
/// Updates only the link (tail) side of an existing bond entry.
/// </summary>
/// <typeparam name="T">Type of the head key.</typeparam>
/// <typeparam name="L">Type of the linked (tail) key.</typeparam>
/// <param name="head">Key whose link collection is updated.</param>
/// <param name="tail">Key forwarded to <c>KeyItemDAL.UpdateKeyItemColl</c>.</param>
/// <param name="objKeyBondColl">Bond collection containing the head entry.</param>
/// <param name="objTailWeightMDL">
/// Weight/offset forwarded for the tail. When its <c>Offset</c> is negative, the link collection's
/// <c>TotalOffset</c> is set to the bond collection's and <c>Offset</c> is overwritten with 0.
/// </param>
/// <returns>
/// <c>null</c> when <paramref name="head"/> has no entry; otherwise whatever
/// <c>KeyItemDAL.UpdateKeyItemColl</c> returns.
/// </returns>
public static KeyItemMDL<L> UpdateTailBondColl<T, L>(T head, L tail, KeyBondColl<T, L> objKeyBondColl, OffsetWeightMDL objTailWeightMDL)
{
    if (objKeyBondColl.Contains(head))
    {
        KeyItemColl<L> links = objKeyBondColl[head].LinkColl;

        bool bInheritMainOffset = objTailWeightMDL.Offset < 0;
        if (bInheritMainOffset)
        {
            // Inherit the main counter: the link collection adopts the bond collection's offset.
            links.Parameter.TotalOffset = objKeyBondColl.Parameter.TotalOffset;
            objTailWeightMDL.Offset = 0;
        }

        return KeyItemDAL.UpdateKeyItemColl(tail, links, objTailWeightMDL);
    }

    return null;
}
/// <summary>
/// Computes the association coefficient between two keys with every count incremented by 1,
/// so that missing keys or links still produce a value.
/// </summary>
/// <typeparam name="T">Key type of the bond collection.</typeparam>
/// <param name="keyHead">Leading key.</param>
/// <param name="keyTail">Following key.</param>
/// <param name="objKeyBondColl">Bond collection holding the counts.</param>
/// <returns>
/// <c>(share / shareTotal) / (tail / total)</c>, where each term is 1 plus the stored count
/// (scaled by its remember value where applicable) or just 1 when the key or link is absent,
/// and <c>total</c> is <c>1 + 1 / (1 - MemoryDAL.CalcRemeberValue(1, Parameter))</c>.
/// </returns>
public static double CalcBondRelateValueWithLaplace<T>(T keyHead, T keyTail, KeyBondColl<T, T> objKeyBondColl)
{
    // Looked up once; the head's own valid count is not part of the formula and is not computed.
    bool bHasHead = objKeyBondColl.Contains(keyHead);

    // Count of the tail on its own, plus the total used as denominator for P(tail).
    double dTailValidCount = objKeyBondColl.Contains(keyTail)
        ? 1 + objKeyBondColl[keyTail].KeyItem.ValidCount * CalcRemeberValue<T, T>(keyTail, objKeyBondColl)
        : 1;
    double dTotalValidCount = 1 + 1.0 / (1 - MemoryDAL.CalcRemeberValue(1, objKeyBondColl.Parameter));

    // Co-occurrence count of the tail after the head; empty placeholders stand in for a
    // missing head entry or a missing link.
    KeyItemColl<T> objLinkColl = bHasHead ? objKeyBondColl[keyHead].LinkColl : new KeyItemColl<T>();
    KeyItemMDL<T> mdl = objLinkColl.Contains(keyTail) ? objLinkColl[keyTail] : new KeyItemMDL<T>();
    double dShareValidCount = 1 + mdl.ValidCount * KeyItemHelper.CalcRemeberValue(mdl, objLinkColl);
    double dShareTotalCount = bHasHead ? 1 + objKeyBondColl[keyHead].KeyItem.TotalCount : 1;

    // P(AB) = P(B|A) * P(A)
    // result = P(AB) / (P(A) * P(B)) = P(B|A) / P(B)
    return (dShareValidCount / dShareTotalCount) / (dTailValidCount / dTotalValidCount);
}
/// <summary>
/// Given that the keys in <paramref name="objKeyList"/> occurred, accumulates for every linked
/// item the logarithm of the probability that it does not occur even once, together with the
/// keys and list positions that link to it.
/// </summary>
/// <remarks>
/// Keys without a bond entry, or whose count (<c>TotalCount</c> scaled by the key's remember
/// value) is below the collection threshold, are skipped. A link probability of 1 or more
/// contributes 0 to the accumulated logarithm.
/// </remarks>
/// <typeparam name="T">Type of the keys.</typeparam>
/// <typeparam name="L">Type of the linked items.</typeparam>
/// <param name="objKeyList">Keys that occurred, in order.</param>
/// <param name="objMemoryBondColl">Bond collection supplying counts and link collections.</param>
/// <param name="objLinkProbDict">Updated in place. Note: stores the logarithm of the probability that the linked item never occurs.</param>
/// <param name="objLinkKeyDict">Updated in place: for each linked item, the keys that link to it.</param>
/// <param name="objLinkPosDict">Updated in place: for each linked item, the indices in <paramref name="objKeyList"/> of the keys that link to it.</param>
public static void UpdateKeyLinkDict<T, L>(List<T> objKeyList, KeyBondColl<T, L> objMemoryBondColl, Dictionary<L, double> objLinkProbDict, Dictionary<L, List<T>> objLinkKeyDict, Dictionary<L, List<int>> objLinkPosDict)
{
    for (int nPos = 0; nPos < objKeyList.Count; nPos++)
    {
        T key = objKeyList[nPos];
        if (!objMemoryBondColl.Contains(key))
        {
            continue;
        }

        KeyBondMDL<T, L> bond = objMemoryBondColl[key];
        KeyItemMDL<T> keyItem = bond.KeyItem;
        KeyItemColl<L> links = bond.LinkColl;

        double dKeyValidCount = keyItem.TotalCount * CalcRemeberValue<T, L>(keyItem.Key, objMemoryBondColl);
        if (dKeyValidCount < objMemoryBondColl.Parameter.Threshold)
        {
            continue;
        }

        foreach (KeyItemMDL<L> link in links)
        {
            double dLinkValidCount = link.ValidCount * CalcRemeberValue<L>(link, links);
            double dLinkProb = dLinkValidCount / dKeyValidCount;
            // Logarithm of the probability that the link does not occur.
            double dLogMiss = dLinkProb >= 1 ? 0 : Math.Log(1 - dLinkProb);

            double dAccumulated;
            objLinkProbDict.TryGetValue(link.Key, out dAccumulated); // stays 0 when absent
            objLinkProbDict[link.Key] = dAccumulated + dLogMiss;

            List<T> sourceKeys;
            if (!objLinkKeyDict.TryGetValue(link.Key, out sourceKeys))
            {
                sourceKeys = new List<T>();
                objLinkKeyDict.Add(link.Key, sourceKeys);
            }
            sourceKeys.Add(keyItem.Key);

            List<int> sourcePositions;
            if (!objLinkPosDict.TryGetValue(link.Key, out sourcePositions))
            {
                sourcePositions = new List<int>();
                objLinkPosDict.Add(link.Key, sourcePositions);
            }
            sourcePositions.Add(nPos);
        }
    }
}
/// <summary>
/// Computes the association between two keys using average information entropy.
/// </summary>
/// <typeparam name="T">Key type of the bond collection.</typeparam>
/// <param name="keyHead">Leading key.</param>
/// <param name="keyTail">Following key.</param>
/// <param name="objKeyBondColl">Bond collection holding the counts for both keys.</param>
/// <returns>
/// The entropy term of the head-to-tail link minus <c>ln(n) + 0.5772156649 - 1</c>, where
/// <c>n</c> is the number of items in the head's link collection; or 0 when either key has no
/// bond entry, the head has no link to the tail, or one of the counts involved is below the
/// threshold of its collection.
/// </returns>
public static double CalcBondRelateValueByAverageEntropy<T>(T keyHead, T keyTail, KeyBondColl<T, T> objKeyBondColl)
{
    bool bBothKnown = objKeyBondColl.Contains(keyHead) && objKeyBondColl.Contains(keyTail);
    if (!bBothKnown)
    {
        return 0;
    }

    KeyItemColl<T> objLinkColl = objKeyBondColl[keyHead].LinkColl;
    if (!objLinkColl.Contains(keyTail))
    {
        return 0;
    }

    // Counts of each key on its own, scaled by the key's remember value.
    double dHeadValidCount = objKeyBondColl[keyHead].KeyItem.ValidCount * CalcRemeberValue<T, T>(keyHead, objKeyBondColl);
    double dTailValidCount = objKeyBondColl[keyTail].KeyItem.ValidCount * CalcRemeberValue<T, T>(keyTail, objKeyBondColl);
    if (dHeadValidCount < objKeyBondColl.Parameter.Threshold || dTailValidCount < objKeyBondColl.Parameter.Threshold)
    {
        return 0;
    }

    // Co-occurrence count of the tail within the head's link collection.
    KeyItemMDL<T> objLinkMDL = objLinkColl[keyTail];
    double dShareValidCount = objLinkMDL.ValidCount * KeyItemHelper.CalcRemeberValue(objLinkMDL, objLinkColl);
    double dShareTotalCount = objLinkColl.Parameter.TotalValidCount;
    if (dShareTotalCount < objLinkColl.Parameter.Threshold || dShareValidCount < objLinkColl.Parameter.Threshold)
    {
        return 0;
    }

    const double dEuler = 0.5772156649;
    double dKeywordCount = objLinkColl.Count;
    double dAverageEntropy = Math.Log(dKeywordCount) + dEuler - 1;
    double dKeywordEntropy = (dKeywordCount * dShareValidCount / dShareTotalCount) * (Math.Log(dShareTotalCount) - Math.Log(dShareValidCount));
    return dKeywordEntropy - dAverageEntropy;
}
/// <summary>
/// Segments text into words (and maintains the dictionary at the same time).
/// </summary>
/// <remarks>
/// For every end position the best-scoring chain of words ending there is kept. A word's score
/// is <c>ln(validCount) - ln(TotalValidCount + 1)</c> when it is in the keyword collection and 0
/// otherwise; multi-character candidates must be in the collection with a valid count at or
/// above the collection threshold, single characters are always candidates. A running best of
/// exactly 0 is treated as "not set yet", so the next candidate replaces it.
/// </remarks>
/// <param name="text">Text to segment.</param>
/// <param name="objCharBondColl">Adjacent-key collection (used for generating the word library). Not read by this implementation.</param>
/// <param name="objKeyWordColl">Word library.</param>
/// <param name="maxWordLen">Maximum word length (suggested: 4 for fine granularity, 7 for coarse). Values of 0 or less mean "the length of the text".</param>
/// <param name="bUpdateCharBondColl">Whether to also update the adjacent-key collection. The word library is updated when this flag or <paramref name="bUpdateKeyWordColl"/> is set.</param>
/// <param name="bUpdateKeyWordColl">Whether to also update the word library.</param>
/// <param name="nRadiusSize">Effective key radius. Not read by this implementation.</param>
/// <returns>The segmentation result; an empty list for null or empty text.</returns>
public static List<string> Segment(string text, KeyBondColl<string, string> objCharBondColl, KeyItemColl<string> objKeyWordColl, int maxWordLen = 7, bool bUpdateCharBondColl = true, bool bUpdateKeyWordColl = true, int nRadiusSize = 7)
{
    if (String.IsNullOrEmpty(text))
    {
        return new List<string>();
    }
    if (maxWordLen <= 0)
    {
        maxWordLen = text.Length;
    }

    // Logarithm of the total word frequency.
    double dLogTotalCount = Math.Log(objKeyWordColl.Parameter.TotalValidCount + 1);

    // Best word chain / best score per end position. Only the most recent
    // maxWordLen + 1 positions are kept.
    Dictionary<int, List<string>> objKeyWordBufferDict = new Dictionary<int, List<string>>();
    Dictionary<int, double> objKeyWordValueDict = new Dictionary<int, double>();

    for (int k = 0; k < text.Length; k++)
    {
        List<string> objKeyWordList = new List<string>();
        double dKeyWordValue = 0;

        for (int len = 0; len < maxWordLen; len++)
        {
            int startpos = k - len;
            if (startpos < 0)
            {
                break;
            }
            string keyword = text.Substring(startpos, len + 1);

            // One lookup and one valid-count evaluation per candidate.
            bool bIsKnownWord = objKeyWordColl.Contains(keyword);
            double dValidCount = bIsKnownWord ? KeyItemHelper.CalcValidCount(keyword, objKeyWordColl) : 0;

            // Multi-character candidates must be known and at or above the threshold.
            if (len > 0 && (!bIsKnownWord || dValidCount < objKeyWordColl.Parameter.Threshold))
            {
                continue;
            }

            double dTempValue = bIsKnownWord ? -(dLogTotalCount - Math.Log(dValidCount)) : 0;

            // Chain onto the best path ending just before this word, if there is one.
            double dPrevValue;
            bool bHasPrev = objKeyWordValueDict.TryGetValue(startpos - 1, out dPrevValue);
            if (bHasPrev)
            {
                dTempValue += dPrevValue;
            }

            if (dKeyWordValue == 0 || dTempValue > dKeyWordValue)
            {
                dKeyWordValue = dTempValue;
                objKeyWordList = bHasPrev
                    ? new List<string>(objKeyWordBufferDict[startpos - 1])
                    : new List<string>();
                objKeyWordList.Add(keyword);
            }
        }

        objKeyWordBufferDict.Add(k, objKeyWordList);
        objKeyWordValueDict.Add(k, dKeyWordValue);
        if (k > maxWordLen)
        {
            // No later candidate can start at or before this position any more.
            objKeyWordBufferDict.Remove(k - maxWordLen - 1);
            objKeyWordValueDict.Remove(k - maxWordLen - 1);
        }
    }

    if (bUpdateCharBondColl || bUpdateKeyWordColl)
    {
        KeyWordBLL.UpdateKeyWordColl(text, objKeyWordColl, maxWordLen);
    }

    return objKeyWordBufferDict[text.Length - 1];
}
/// <summary>
/// Segments text into words (supports real-time new-word discovery).
/// </summary>
/// <remarks>
/// With <paramref name="bIsNewWordMining"/> off this simply calls <c>Segment</c> with updates
/// disabled. Otherwise a first pass assigns every candidate substring a log-probability: for a
/// single character <c>ln(count + 1) - ln(TotalValidCount + 1)</c>; for longer candidates the
/// smaller of that value and the value chained from the candidate's suffix (the candidate
/// without its first character). Candidates longer than one character stop at the first
/// whitespace, punctuation or <c>\p{C}</c> character. A second pass keeps, for every end
/// position, the best-scoring chain of candidates; a running best of exactly 0 is treated as
/// "not set yet".
/// NOTE(review): the cut-off in the second pass also applies to single characters, so a
/// position whose character scores below it ends up with an empty chain and the words before
/// it are not carried forward — confirm this is intended.
/// </remarks>
/// <param name="text">Text to segment.</param>
/// <param name="objKeyWordColl">Word library supplying the counts.</param>
/// <param name="maxWordLen">Maximum word length. Values of 0 or less mean "the length of the text".</param>
/// <param name="bIsNewWordMining">Whether to use the new-word-discovery scoring.</param>
/// <returns>The segmentation result; an empty list for null or empty text.</returns>
public static List<string> SegmentEx(string text, KeyItemColl<string> objKeyWordColl, int maxWordLen = 7, bool bIsNewWordMining = true)
{
    if (!bIsNewWordMining)
    {
        return Segment(text, null, objKeyWordColl, maxWordLen, false, false, maxWordLen);
    }
    if (String.IsNullOrEmpty(text))
    {
        return new List<string>();
    }
    if (maxWordLen <= 0)
    {
        maxWordLen = text.Length;
    }

    // Logarithm of the total word frequency.
    double dLogTotalCount = Math.Log(objKeyWordColl.Parameter.TotalValidCount + 1);

    Dictionary<int, List<string>> objKeyWordBufferDict = new Dictionary<int, List<string>>();
    Dictionary<int, double> objKeyWordValueDict = new Dictionary<int, double>();
    Dictionary<string, double> objKeyWordProbDict = new Dictionary<string, double>();

    // Pass 1: log-probability of every candidate substring, shortest first for each end position.
    for (int nEnd = 0; nEnd < text.Length; nEnd++)
    {
        for (int nExtra = 0; nExtra < maxWordLen && nExtra <= nEnd; nExtra++)
        {
            int nStart = nEnd - nExtra;
            string keyword = text.Substring(nStart, nExtra + 1);

            if (nExtra > 0 && Regex.IsMatch(keyword, @"[\s\p{P}\p{C}])".Substring(0, 15)))
            {
                break;
            }
            if (objKeyWordProbDict.ContainsKey(keyword))
            {
                continue;
            }

            double dKeywordValidCount = WSR_Forget_Core.KeyItem.KeyItemHelper.CalcValidCount(keyword, objKeyWordColl) + 1;
            double dLogKeywordCount = Math.Log(dKeywordValidCount);
            double dLogKeywordProbValue = dLogKeywordCount - dLogTotalCount;

            if (nExtra == 0)
            {
                // Single character.
                objKeyWordProbDict.Add(keyword, dLogKeywordProbValue);
                continue;
            }

            // Log(P(prevword) * P(keyword | prevword)), where prevword is the candidate
            // without its first character; it was scored in the previous iteration.
            string prevword = text.Substring(nStart + 1, nExtra);
            double dPrevValidCount = WSR_Forget_Core.KeyItem.KeyItemHelper.CalcValidCount(prevword, objKeyWordColl) + 1;
            double dLogProbValue = objKeyWordProbDict[prevword] + dLogKeywordCount - Math.Log(dPrevValidCount);
            objKeyWordProbDict.Add(keyword, Math.Min(dLogProbValue, dLogKeywordProbValue));
        }
    }

    // Pass 2: best chain of candidates per end position.
    double dCutOff = 1.0 - dLogTotalCount;
    for (int k = 0; k < text.Length; k++)
    {
        List<string> objKeyWordList = new List<string>();
        double dKeyWordValue = 0;

        for (int len = 0; len < maxWordLen; len++)
        {
            int startpos = k - len;
            if (startpos < 0)
            {
                break;
            }
            string keyword = text.Substring(startpos, len + 1);

            double dTempValue;
            if (!objKeyWordProbDict.TryGetValue(keyword, out dTempValue))
            {
                break;
            }
            if (dTempValue < dCutOff)
            {
                break;
            }

            double dPrevValue;
            bool bHasPrev = objKeyWordValueDict.TryGetValue(startpos - 1, out dPrevValue);
            if (bHasPrev)
            {
                dTempValue += dPrevValue;
            }

            if (dKeyWordValue == 0 || dTempValue > dKeyWordValue)
            {
                dKeyWordValue = dTempValue;
                objKeyWordList = bHasPrev
                    ? new List<string>(objKeyWordBufferDict[startpos - 1])
                    : new List<string>();
                objKeyWordList.Add(keyword);
            }
        }

        objKeyWordBufferDict.Add(k, objKeyWordList);
        objKeyWordValueDict.Add(k, dKeyWordValue);
        if (k > maxWordLen)
        {
            objKeyWordBufferDict.Remove(k - maxWordLen - 1);
            objKeyWordValueDict.Remove(k - maxWordLen - 1);
        }
    }

    return objKeyWordBufferDict[text.Length - 1];
}
/// <summary>
/// Returns the remember value of an item relative to the collection it belongs to.
/// </summary>
/// <typeparam name="T">Key type of the collection.</typeparam>
/// <param name="mdl">Item whose <c>UpdateOffset</c> is read.</param>
/// <param name="objMemoryBondColl">Collection supplying <c>Parameter.TotalOffset</c> and the parameters for the calculation.</param>
/// <returns>
/// The result of <c>MemoryDAL.CalcRemeberValue</c> for the difference between the collection's
/// <c>TotalOffset</c> and the item's <c>UpdateOffset</c>.
/// </returns>
public static double CalcRemeberValue<T>(KeyItemMDL<T> mdl, KeyItemColl<T> objMemoryBondColl)
{
    var elapsedOffset = objMemoryBondColl.Parameter.TotalOffset - mdl.UpdateOffset;
    return MemoryDAL.CalcRemeberValue(elapsedOffset, objMemoryBondColl.Parameter);
}
/// <summary>
/// Finds the "key wings" (patterns built from a key and its linked neighbours) of
/// <paramref name="objKeyList"/> that exist in <paramref name="objKeyWingColl"/> and selects,
/// by descending accumulated weight, those that cover positions not covered yet.
/// </summary>
/// <remarks>
/// Steps: (1) for every position, collect the positions up to <paramref name="nRadiusSize"/>
/// earlier for which <c>KeyBondHelper.IsBondValid</c> holds, weighted by
/// <c>KeyBondHelper.CalcTailValidCount</c> and stored in both directions; (2) per position, grow
/// a sorted set of positions by adding neighbours in descending weight order and record the
/// wing string of the set before each addition; (3) keep the wings found in the link collection
/// of one of their keys in <paramref name="objKeyWingColl"/>, summing
/// <c>ln(1 / (1 - MemoryDAL.CalcRemeberValue(1, Parameter))) - ln(validCount + 1)</c> of those
/// keys; (4) walk the wings by descending weight and keep each one that still overlaps the
/// uncovered positions.
/// A wing string lists the keys in position order, with <paramref name="splitChar"/> inserted
/// wherever positions are not adjacent, before the first key unless it is at index 0, and after
/// the last key unless it is the final element of the list.
/// NOTE(review): because a wing is recorded before the neighbour is added, the set containing
/// every neighbour is never recorded and a position without neighbours records nothing —
/// confirm this is intended.
/// </remarks>
/// <param name="objKeyList">Sequence of keys to match.</param>
/// <param name="objKeyWordColl">Word library supplying valid counts and the remember parameters.</param>
/// <param name="objKeyCloudColl">Bond collection used to decide which positions are linked.</param>
/// <param name="objKeyWingColl">Bond collection whose link collections hold the known wings.</param>
/// <param name="splitChar">Separator marking gaps inside a wing string.</param>
/// <param name="nRadiusSize">Maximum distance, in positions, between linked keys.</param>
/// <returns>The selected wings, each mapped to the positions at which one of its keys matched.</returns>
public static Dictionary<string, HashSet<int>> GetKeyWingMatchedList(List<string> objKeyList, KeyItemColl<string> objKeyWordColl, KeyBondColl<string, string> objKeyCloudColl, KeyBondColl<string, string> objKeyWingColl, string splitChar = "\t", int nRadiusSize = 7)
{
    HashSet<int> uncovered = new HashSet<int>();
    Dictionary<int, Dictionary<int, double>> linkWeights = new Dictionary<int, Dictionary<int, double>>();

    // Step 1: link weight between every position and the positions within the radius before it.
    for (int nTailPos = 0; nTailPos < objKeyList.Count; nTailPos++)
    {
        uncovered.Add(nTailPos);
        Dictionary<int, double> tailWeights = new Dictionary<int, double>();
        linkWeights.Add(nTailPos, tailWeights);
        string keyTail = objKeyList[nTailPos];

        for (int nDistance = 1; nDistance <= nRadiusSize; nDistance++)
        {
            int nHeadPos = nTailPos - nDistance;
            if (nHeadPos < 0)
            {
                break;
            }
            string keyHead = objKeyList[nHeadPos];

            Dictionary<int, double> headWeights;
            if (!linkWeights.TryGetValue(nHeadPos, out headWeights))
            {
                continue;
            }
            if (!KeyBondHelper.IsBondValid(keyHead, keyTail, objKeyCloudColl))
            {
                continue;
            }

            double dLinkValidCount = KeyBondHelper.CalcTailValidCount(keyHead, keyTail, objKeyCloudColl);
            headWeights.Add(nTailPos, dLinkValidCount);
            tailWeights.Add(nHeadPos, dLinkValidCount);
        }
    }

    // Step 2: turn positions into wing strings.
    Dictionary<string, HashSet<int>> candidates = new Dictionary<string, HashSet<int>>();
    foreach (KeyValuePair<int, Dictionary<int, double>> center in linkWeights)
    {
        SortedSet<int> wingPositions = new SortedSet<int>();
        wingPositions.Add(center.Key);

        foreach (KeyValuePair<int, double> neighbour in center.Value.OrderByDescending(x => x.Value))
        {
            StringBuilder sb = new StringBuilder();
            int nLastPos = -1;
            foreach (int pos in wingPositions)
            {
                if (pos - nLastPos > 1)
                {
                    sb.Append(splitChar);
                }
                sb.Append(objKeyList[pos]);
                nLastPos = pos;
            }
            if (nLastPos + 1 < objKeyList.Count)
            {
                sb.Append(splitChar);
            }
            string keywing = sb.ToString();

            HashSet<int> wingMembers;
            if (!candidates.TryGetValue(keywing, out wingMembers))
            {
                wingMembers = new HashSet<int>();
                candidates.Add(keywing, wingMembers);
            }
            wingMembers.UnionWith(wingPositions);

            // The neighbour joins the set only after the current wing has been recorded.
            wingPositions.Add(neighbour.Key);
        }
    }

    Dictionary<string, double> wingWeights = new Dictionary<string, double>();
    Dictionary<string, HashSet<int>> matchedPositions = new Dictionary<string, HashSet<int>>();
    double dLogTotalCount = Math.Log(1.0 / (1.0 - MemoryDAL.CalcRemeberValue(1, objKeyWordColl.Parameter)));

    // Step 3: keep the wings present in the library and accumulate the weight of the matching keys.
    foreach (KeyValuePair<string, HashSet<int>> candidate in candidates)
    {
        string keywing = candidate.Key;
        foreach (int pos in candidate.Value)
        {
            string keyword = objKeyList[pos];

            double dKeyWordValidCount = 1;
            if (objKeyWordColl.Contains(keyword))
            {
                dKeyWordValidCount = KeyItemHelper.CalcValidCount(keyword, objKeyWordColl) + 1;
            }
            double dKeyWordWeight = dLogTotalCount - Math.Log(dKeyWordValidCount);

            if (!objKeyWingColl.Contains(keyword))
            {
                continue;
            }
            if (!objKeyWingColl[keyword].LinkColl.Contains(keywing))
            {
                continue;
            }

            double dCurrentWeight;
            if (wingWeights.TryGetValue(keywing, out dCurrentWeight))
            {
                wingWeights[keywing] = dCurrentWeight + dKeyWordWeight;
            }
            else
            {
                wingWeights.Add(keywing, dKeyWordWeight);
            }

            HashSet<int> hits;
            if (!matchedPositions.TryGetValue(keywing, out hits))
            {
                hits = new HashSet<int>();
                matchedPositions.Add(keywing, hits);
            }
            hits.Add(pos);
        }
    }

    // Step 4: pick the best wings until every position is covered.
    Dictionary<string, HashSet<int>> dict = new Dictionary<string, HashSet<int>>();
    foreach (KeyValuePair<string, double> ranked in wingWeights.OrderByDescending(x => x.Value))
    {
        if (uncovered.Count <= 0)
        {
            break;
        }

        HashSet<int> covered = matchedPositions[ranked.Key];
        if (!uncovered.Overlaps(covered))
        {
            continue;
        }
        if (!dict.ContainsKey(ranked.Key))
        {
            dict.Add(ranked.Key, covered);
        }
        uncovered.ExceptWith(covered);
    }

    return dict;
}
/// <summary>
/// Updates the keyword collection from a line of text.
/// </summary>
/// <remarks>Forwards all arguments unchanged to <c>KeyWordBLL.UpdateKeyWordCollByNGram</c>.</remarks>
/// <param name="line">Text to process.</param>
/// <param name="objKeyWordColl">Keyword collection to update.</param>
/// <param name="nMaxWordSize">Maximum word size passed on to the n-gram update.</param>
public static void UpdateKeyWordColl(string line, KeyItemColl <string> objKeyWordColl, int nMaxWordSize = 7) { KeyWordBLL.UpdateKeyWordCollByNGram(line, objKeyWordColl, nMaxWordSize); }