/// <summary> /// 使用互信息计算关联系数 /// </summary> /// <typeparam name="T"></typeparam> /// <param name="keyHead"></param> /// <param name="keyTail"></param> /// <param name="objKeyBondColl"></param> /// <returns></returns> public static double CalcBondRelateValueByPMI <T>(T keyHead, T keyTail, KeyBondColl <T, T> objKeyBondColl) { ////分别获得相邻单项的频次 //double dHeadValidCount = objKeyBondColl.Contains(keyHead) ? 1 + objKeyBondColl[keyHead].KeyItem.ValidCount * CalcRemeberValue<T,T>(keyHead,objKeyBondColl) : 1; //double dTailValidCount = objKeyBondColl.Contains(keyTail) ? 1 + objKeyBondColl[keyTail].KeyItem.ValidCount * CalcRemeberValue<T,T>(keyTail,objKeyBondColl) : 1; //double dTotalValidCount = 1 + 1.0 / ( 1 - MemoryDAL.CalcRemeberValue(1,objKeyBondColl.Parameter) ); ////获得相邻项共现的频次 //KeyItemColl<T> objLinkColl = objKeyBondColl.Contains(keyHead) ? objKeyBondColl[keyHead].LinkColl : new KeyItemColl<T>(); //KeyItemMDL<T> mdl = objLinkColl.Contains(keyTail) ? objLinkColl[keyTail] : new KeyItemMDL<T>(); //double dShareValidCount = 1 + mdl.ValidCount * KeyItemHelper.CalcRemeberValue(mdl,objLinkColl); //double dShareTotalCount = objKeyBondColl.Contains(keyHead) ? 1 + objKeyBondColl[keyHead].KeyItem.TotalCount : 1; //if (!objKeyBondColl.Contains(keyHead) || !objKeyBondColl.Contains(keyTail)) return 0; //if (!objKeyBondColl[keyHead].LinkColl.Contains(keyTail)) return 0; if (!objKeyBondColl.Contains(keyHead) || !objKeyBondColl.Contains(keyTail)) { return(0); } if (!objKeyBondColl[keyHead].LinkColl.Contains(keyTail)) { return(0); } //分别获得相邻单项的频次 double dHeadValidCount = objKeyBondColl[keyHead].KeyItem.ValidCount * CalcRemeberValue <T, T>(keyHead, objKeyBondColl); double dTailValidCount = objKeyBondColl[keyTail].KeyItem.ValidCount * CalcRemeberValue <T, T>(keyTail, objKeyBondColl); if (dHeadValidCount < objKeyBondColl.Parameter.Threshold || dTailValidCount < objKeyBondColl.Parameter.Threshold) { return(0); } double dTotalValidCount = objKeyBondColl.Parameter.TotalValidCount;// 1.0 / (1 - MemoryDAL.CalcRemeberValue(1, objKeyBondColl.Parameter)); //获得相邻项共现的频次 KeyItemColl <T> objLinkColl = objKeyBondColl[keyHead].LinkColl; KeyItemMDL <T> mdl = objLinkColl[keyTail]; double dShareValidCount = mdl.ValidCount * KeyItemHelper.CalcRemeberValue(mdl, objLinkColl); double dShareTotalCount = objLinkColl.Parameter.TotalValidCount; if (dShareTotalCount < objLinkColl.Parameter.Threshold || dShareValidCount < objLinkColl.Parameter.Threshold) { return(0); } //P(AB)=P(B|A)*P(A) //result=P(AB)/(P(A)*P(B))=P(B|A)/P(B) return((dShareValidCount / dShareTotalCount) / (dTailValidCount / dTotalValidCount)); }
public static KeyBondMDL <T, L> UpdateKeyBondColl <T, L>(T head, L tail, KeyBondColl <T, L> objKeyBondColl, OffsetWeightMDL objHeadWeightMDL, OffsetWeightMDL objTailWeightMDL) { if (!objKeyBondColl.Contains(head)) { KeyBondMDL <T, L> bond = new KeyBondMDL <T, L>(); bond.KeyItem.Key = head; bond.KeyItem.UpdateOffset = objKeyBondColl.Parameter.TotalOffset; bond.LinkColl.Parameter.ContainerSize = objKeyBondColl.Parameter.ContainerSize; bond.LinkColl.Parameter.Threshold = objKeyBondColl.Parameter.Threshold; objKeyBondColl.Add(bond); } KeyItemMDL <T> mdl = objKeyBondColl[head].KeyItem; mdl.ValidCount = objHeadWeightMDL.Weight + mdl.ValidCount * MemoryDAL.CalcRemeberValue(objKeyBondColl.Parameter.TotalOffset - mdl.UpdateOffset, objKeyBondColl.Parameter); mdl.TotalCount = objHeadWeightMDL.Weight + mdl.TotalCount * MemoryDAL.CalcRemeberValue(1, objKeyBondColl.Parameter); mdl.UpdateOffset = objKeyBondColl.Parameter.TotalOffset; KeyItemColl <L> objLinkColl = objKeyBondColl[head].LinkColl; if (objTailWeightMDL.Offset < 0) { //继承主计数 objLinkColl.Parameter.TotalOffset = objKeyBondColl.Parameter.TotalOffset; objTailWeightMDL.Offset = 0; } KeyItemDAL.UpdateKeyItemColl(tail, objLinkColl, objTailWeightMDL); objKeyBondColl.Parameter.TotalValidCount = objHeadWeightMDL.Weight + objKeyBondColl.Parameter.TotalValidCount * MemoryDAL.CalcRemeberValue(objHeadWeightMDL.Offset, objKeyBondColl.Parameter); objKeyBondColl.Parameter.TotalOffset += objHeadWeightMDL.Offset; return(objKeyBondColl[head]); }
public static string ShowKeyBondCollEx(KeyBondColl <string, string> objKeyBondColl, List <string> objKeyWordList, int nLinkTopCount, bool bIsOrderbyDesc = true, string splitChar = "\t", string spaceChar = "\r") { StringBuilder sb = new StringBuilder(); sb.AppendLine(String.Format("[{0}]{1}|{2}|{3}|{4}", "词项", "遗忘词频", "总词频", "词权重", "成熟度")); sb.AppendLine("============================================="); StringBuilder sbkey = new StringBuilder(); HashSet <KeyItemMDL <string> > objBufferSet = new HashSet <KeyItemMDL <string> >(); foreach (string keyword in objKeyWordList) { if (String.IsNullOrWhiteSpace(keyword)) { continue; } if (!objKeyBondColl.Contains(keyword)) { continue; } if (sbkey.Length > 0) { sbkey.Append("、"); } sbkey.Append(keyword); KeyBondMDL <string, string> bond = objKeyBondColl[keyword]; if (objBufferSet.Count <= 0) { objBufferSet.UnionWith(bond.LinkColl); } else { HashSet <KeyItemMDL <string> > buffer = new HashSet <KeyItemMDL <string> >(); foreach (KeyItemMDL <string> mdl in objBufferSet) { if (bond.LinkColl.Contains(mdl.Key)) { buffer.Add(mdl); } } objBufferSet = new HashSet <KeyItemMDL <string> >(buffer); } } KeyItemColl <string> objBufferColl = new KeyItemColl <string>(); foreach (KeyItemMDL <string> mdl in objBufferSet) { if (!objBufferColl.Contains(mdl.Key)) { objBufferColl.Add(mdl); } } sb.AppendLine(); sb.AppendLine(String.Format("【{0}】", sbkey)); sb.Append(KeyItemHelper.ShowKeyItemColl(objBufferColl, nLinkTopCount, false, bIsOrderbyDesc, false, splitChar, spaceChar)); return(sb.ToString()); }
public static double CalcHeadValidCount <T, L>(T head, KeyBondColl <T, L> objMemoryBondColl) { if (!objMemoryBondColl.Contains(head)) { return(0); } return(objMemoryBondColl[head].KeyItem.ValidCount * CalcRemeberValue(head, objMemoryBondColl)); }
public static double CalcRemeberValue <T, L>(T key, KeyBondColl <T, L> objMemoryBondColl) { if (objMemoryBondColl.Contains(key)) { return(MemoryDAL.CalcRemeberValue(objMemoryBondColl.Parameter.TotalOffset - objMemoryBondColl[key].KeyItem.UpdateOffset, objMemoryBondColl.Parameter)); } return(0); }
public static double CalcBondRelateValueWithLaplace <T>(T keyHead, T keyTail, KeyBondColl <T, T> objKeyBondColl) { //分别获得相邻单项的频次 double dHeadValidCount = objKeyBondColl.Contains(keyHead) ? 1 + objKeyBondColl[keyHead].KeyItem.ValidCount * CalcRemeberValue <T, T>(keyHead, objKeyBondColl) : 1; double dTailValidCount = objKeyBondColl.Contains(keyTail) ? 1 + objKeyBondColl[keyTail].KeyItem.ValidCount * CalcRemeberValue <T, T>(keyTail, objKeyBondColl) : 1; double dTotalValidCount = 1 + 1.0 / (1 - MemoryDAL.CalcRemeberValue(1, objKeyBondColl.Parameter)); //获得相邻项共现的频次 KeyItemColl <T> objLinkColl = objKeyBondColl.Contains(keyHead) ? objKeyBondColl[keyHead].LinkColl : new KeyItemColl <T>(); KeyItemMDL <T> mdl = objLinkColl.Contains(keyTail) ? objLinkColl[keyTail] : new KeyItemMDL <T>(); double dShareValidCount = 1 + mdl.ValidCount * KeyItemHelper.CalcRemeberValue(mdl, objLinkColl); double dShareTotalCount = objKeyBondColl.Contains(keyHead) ? 1 + objKeyBondColl[keyHead].KeyItem.TotalCount : 1; //P(AB)=P(B|A)*P(A) //result=P(AB)/(P(A)*P(B))=P(B|A)/P(B) return((dShareValidCount / dShareTotalCount) / (dTailValidCount / dTotalValidCount)); }
/// <summary> /// 使用平均信息熵计算关联 /// </summary> /// <typeparam name="T"></typeparam> /// <param name="keyHead"></param> /// <param name="keyTail"></param> /// <param name="objKeyBondColl"></param> /// <returns></returns> public static double CalcBondRelateValueByAverageEntropy <T>(T keyHead, T keyTail, KeyBondColl <T, T> objKeyBondColl) { if (!objKeyBondColl.Contains(keyHead) || !objKeyBondColl.Contains(keyTail)) { return(0); } if (!objKeyBondColl[keyHead].LinkColl.Contains(keyTail)) { return(0); } //分别获得相邻单项的频次 double dHeadValidCount = objKeyBondColl[keyHead].KeyItem.ValidCount * CalcRemeberValue <T, T>(keyHead, objKeyBondColl); double dTailValidCount = objKeyBondColl[keyTail].KeyItem.ValidCount * CalcRemeberValue <T, T>(keyTail, objKeyBondColl); if (dHeadValidCount < objKeyBondColl.Parameter.Threshold || dTailValidCount < objKeyBondColl.Parameter.Threshold) { return(0); } //获得相邻项共现的频次 KeyItemColl <T> objLinkColl = objKeyBondColl[keyHead].LinkColl; KeyItemMDL <T> mdl = objLinkColl[keyTail]; double dShareValidCount = mdl.ValidCount * KeyItemHelper.CalcRemeberValue(mdl, objLinkColl); double dShareTotalCount = objLinkColl.Parameter.TotalValidCount; if (dShareTotalCount < objLinkColl.Parameter.Threshold || dShareValidCount < objLinkColl.Parameter.Threshold) { return(0); } double dEuler = 0.5772156649; double dKeywordCount = objKeyBondColl[keyHead].LinkColl.Count; double dAverageEntropy = Math.Log(dKeywordCount) + dEuler - 1; double dKeywordEntropy = (dKeywordCount * dShareValidCount / dShareTotalCount) * (Math.Log(dShareTotalCount) - Math.Log(dShareValidCount)); return(dKeywordEntropy - dAverageEntropy); }
public static double CalcTailValidCount <T, L>(T head, L tail, KeyBondColl <T, L> objMemoryBondColl) { if (!objMemoryBondColl.Contains(head)) { return(0); } if (!objMemoryBondColl[head].LinkColl.Contains(tail)) { return(0); } return(KeyItemHelper.CalcValidCount(tail, objMemoryBondColl[head].LinkColl)); }
public static KeyItemMDL <L> UpdateTailBondColl <T, L>(T head, L tail, KeyBondColl <T, L> objKeyBondColl, OffsetWeightMDL objTailWeightMDL) { if (!objKeyBondColl.Contains(head)) { return(null); } KeyItemColl <L> objLinkColl = objKeyBondColl[head].LinkColl; if (objTailWeightMDL.Offset < 0) { //继承主计数 objLinkColl.Parameter.TotalOffset = objKeyBondColl.Parameter.TotalOffset; objTailWeightMDL.Offset = 0; } return(KeyItemDAL.UpdateKeyItemColl(tail, objLinkColl, objTailWeightMDL)); }
/// <summary> /// 当关键项列表发生时,获取关联项一次都不发生时的概率对数,以及包含的关键项 /// </summary> /// <typeparam name="T"></typeparam> /// <typeparam name="L"></typeparam> /// <param name="objKeyList"></param> /// <param name="objMemoryBondColl"></param> /// <param name="objLinkProbDict">注:此处存放的是相关项一次都不发生的概率的对数</param> /// <param name="objLinkKeyDict"></param> public static void UpdateKeyLinkDict <T, L>(List <T> objKeyList, KeyBondColl <T, L> objMemoryBondColl, Dictionary <L, double> objLinkProbDict, Dictionary <L, List <T> > objLinkKeyDict, Dictionary <L, List <int> > objLinkPosDict) { for (int k = 0; k < objKeyList.Count; k++) { T key = objKeyList[k]; if (!objMemoryBondColl.Contains(key)) { continue; } KeyItemMDL <T> objKeyMDL = objMemoryBondColl[key].KeyItem; KeyItemColl <L> objLinkColl = objMemoryBondColl[key].LinkColl; double dKeyValidCount = /* objKeyMDL.ValidCount * */ objKeyMDL.TotalCount * CalcRemeberValue <T, L>(objKeyMDL.Key, objMemoryBondColl); if (dKeyValidCount < objMemoryBondColl.Parameter.Threshold) { continue; } foreach (KeyItemMDL <L> link in objLinkColl) { double dLinkValidCount = link.ValidCount * CalcRemeberValue <L>(link, objLinkColl); if (!objLinkProbDict.ContainsKey(link.Key)) { objLinkProbDict.Add(link.Key, 0); } double dLinkProb = dLinkValidCount / dKeyValidCount; objLinkProbDict[link.Key] += dLinkProb >= 1 ? 0 : Math.Log(1 - dLinkProb); //不发生的概率 if (!objLinkKeyDict.ContainsKey(link.Key)) { objLinkKeyDict.Add(link.Key, new List <T>()); } objLinkKeyDict[link.Key].Add(objKeyMDL.Key); if (!objLinkPosDict.ContainsKey(link.Key)) { objLinkPosDict.Add(link.Key, new List <int>()); } objLinkPosDict[link.Key].Add(k); } } }
public static string ShowKeyBondColl(KeyBondColl <string, string> objKeyBondColl, List <string> objKeyWordList, int nLinkTopCount, bool bIsOrderbyDesc = true, string splitChar = "\t", string spaceChar = "\r") { StringBuilder sb = new StringBuilder(); sb.AppendLine(String.Format("[{0}]{1}|{2}|{3}|{4}", "词项", "遗忘词频", "总词频", "词权重", "成熟度")); sb.AppendLine("============================================="); foreach (string keyword in objKeyWordList) { if (!objKeyBondColl.Contains(keyword)) { continue; } KeyBondMDL <string, string> bond = objKeyBondColl[keyword]; sb.AppendLine(); sb.AppendLine(String.Format("【{0}】{1}|{2}", bond.KeyItem.Key, Math.Round(bond.KeyItem.ValidCount * KeyBondHelper.CalcRemeberValue <string, string>(bond.KeyItem.Key, objKeyBondColl), 4), Math.Round(bond.KeyItem.TotalCount))); sb.Append(KeyItemHelper.ShowKeyItemColl(bond.LinkColl, nLinkTopCount, false, bIsOrderbyDesc, false, splitChar, spaceChar)); } return(sb.ToString()); }
public static KeyBondColl <T, L> ClearKeyBondColl <T, L>(KeyBondColl <T, L> objKeyBondColl, bool bIsParentTotalOffset = true) { double dTotalVaildCount = objKeyBondColl.Parameter.TotalValidCount + 1;// 1.0 / ( 1.0 - MemoryDAL.CalcRemeberValue(1,objKeyBondColl.Parameter) ); KeyBondColl <T, L> objResultColl = new KeyBondColl <T, L>(); objResultColl.Parameter = objKeyBondColl.Parameter; objResultColl.Parameter.Entropy = 0; double dClearValidCount = 0; foreach (KeyBondMDL <T, L> bond in objKeyBondColl) { KeyItemMDL <T> mdl = bond.KeyItem; double dRemeberValue = mdl.ValidCount * MemoryDAL.CalcRemeberValue(objKeyBondColl.Parameter.TotalOffset - mdl.UpdateOffset, objKeyBondColl.Parameter); if (dRemeberValue > objKeyBondColl.Parameter.Threshold) { mdl.ValidCount = dRemeberValue; mdl.UpdateOffset = 0; if (!objResultColl.Contains(mdl.Key)) { dClearValidCount += mdl.ValidCount; double dProb = mdl.ValidCount / dTotalVaildCount; objKeyBondColl.Parameter.Entropy += -dProb *Math.Log(dProb); if (bIsParentTotalOffset) { bond.LinkColl.Parameter.TotalOffset = objKeyBondColl.Parameter.TotalOffset; } bond.LinkColl = KeyItemDAL.ClearKeyItemColl <L>(bond.LinkColl); objResultColl.Add(bond); } } } objResultColl.Parameter.TotalOffset = 0; objResultColl.Parameter.TotalValidCount = dClearValidCount; return(objResultColl); }