/// <summary>
/// Builds a text report of the word dictionary, ordered by word weight.
/// </summary>
/// <remarks>
/// The report starts with an overall maturity line, followed by a column header,
/// the number of entries that passed the filter, and one line per entry (at most
/// <paramref name="nKeyWordTopCount"/> lines). Each entry line shows the key, the
/// value returned by <c>DictionaryDAL.CalcRemeberValue</c>, <c>TotalCount</c>, the
/// weight, and a per-entry maturity percentage.
/// The weight of an entry is <c>ValidCount * (ln(MinuteOffsetSize) - ln(ValidCount))</c>,
/// or 0 when <c>ValidCount</c> is not positive.
/// </remarks>
/// <param name="objMemoryItemColl">The word dictionary to report on.</param>
/// <param name="nKeyWordTopCount">Maximum number of entries to list.</param>
/// <param name="bOrderbyDesc">True to list the heaviest entries first; false for lightest first.</param>
/// <param name="bIsOnlyWord">
/// True to list only "real" words: keys longer than one character that contain no
/// punctuation or whitespace and are not made up solely of digits.
/// </param>
/// <returns>The formatted report.</returns>
public static string ShowKeyWordWeightColl(MemoryItemColl<string> objMemoryItemColl, int nKeyWordTopCount, bool bOrderbyDesc = true, bool bIsOnlyWord = true)
{
    // Average of ValidDegree / ValidCount over all entries.
    // NOTE(review): an empty collection, or an entry whose ValidCount is 0, presumably
    // yields NaN/Infinity here (assuming floating-point fields) - confirm whether a guard is wanted.
    double dTotalVaildDegree = objMemoryItemColl.Sum(x => x.ValidDegree / x.ValidCount) / objMemoryItemColl.Count;

    // ln(MinuteOffsetSize) does not depend on the entry, so compute it once.
    double dLogOffsetSize = Math.Log(objMemoryItemColl.MinuteOffsetSize);

    // Single definition of the weight formula, shared by the sort key and the printed column.
    Func<MemoryItemMDL<string>, double> calcWeight =
        x => x.ValidCount <= 0 ? 0 : x.ValidCount * (dLogOffsetSize - Math.Log(x.ValidCount));

    // Optional "words only" filter: length > 1, no punctuation/whitespace, not purely numeric.
    IEnumerable<MemoryItemMDL<string>> candidates = objMemoryItemColl.Where(
        x => !bIsOnlyWord
             || (x.Key.Length > 1
                 && !Regex.IsMatch(x.Key, @"[\p{P}\s]")
                 && !Regex.IsMatch(x.Key, @"^\d+$")));

    // Materialise once: the result is both counted and listed below, and a deferred
    // query would re-run the filter and the sort for each of those uses.
    List<MemoryItemMDL<string>> ordered =
        (bOrderbyDesc ? candidates.OrderByDescending(calcWeight) : candidates.OrderBy(calcWeight)).ToList();

    StringBuilder sb = new StringBuilder();
    sb.AppendLine(String.Format("词库成熟度:{0}% ;", dTotalVaildDegree > 1 ? 0 : Math.Round((1 - dTotalVaildDegree) * 100, 2)));
    sb.AppendLine("----------------------------------------");
    sb.AppendLine(String.Format(" 【{0}】 | {1} | {2} | {3} | {4}", "主词", "遗忘词频", "累计词频", "词权值", "成熟度(%)"));
    sb.AppendLine(String.Format(" =========== 共{0} 个 ============= ", ordered.Count));

    // One line per entry, limited to the requested number of entries.
    foreach (var x in ordered.Take(nKeyWordTopCount))
    {
        sb.AppendLine(String.Format(
            " 【{0}】 | {1} | {2} | {3} | {4}",
            x.Key,
            Math.Round(DictionaryDAL.CalcRemeberValue<string>(x.Key, objMemoryItemColl), 2),
            x.TotalCount,
            Math.Round(calcWeight(x), 4),
            // Per-entry maturity: reported as 0 when ValidCount <= 1 or when the ratio exceeds 1.
            x.ValidCount <= 1 ? 0 : x.ValidDegree / x.ValidCount > 1 ? 0 : Math.Round((1 - x.ValidDegree / x.ValidCount) * 100, 2)));
    }

    return sb.ToString();
}
/// <summary>
/// Click handler that loads the character-bond collection and the keyword collection
/// from the "dict" folder under the workspace path, creating that folder when it is missing.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void btnLoadDictionary_Click(object sender, EventArgs e)
{
    // Ensure the dictionary folder exists before touching any file in it.
    string dictFolder = Path.GetFullPath(String.Format(@"{0}\dict", CachePathDAL.GetWorkSpacePath()));
    if (!Directory.Exists(dictFolder))
    {
        Directory.CreateDirectory(dictFolder);
    }

    // Adjacent-character (bond) collection.
    string charBondFile = Path.GetFullPath(
        String.Format(@"{0}\dict\{1}", CachePathDAL.GetWorkSpacePath(), "CharBond.coll"));
    AppendText(String.Format("请稍候,正在加载文件:{0}", charBondFile));
    objCharBondColl = SerialLib.DeserializeBinary<MemoryBondColl<string>>(charBondFile);

    // Keyword (word dictionary) collection.
    string keyWordFile = Path.GetFullPath(
        String.Format(@"{0}\dict\{1}", CachePathDAL.GetWorkSpacePath(), "KeyWord.coll"));
    AppendText(String.Format("请稍候,正在加载文件:{0}", keyWordFile));
    objKeyWordColl = SerialLib.DeserializeBinary<MemoryItemColl<string>>(keyWordFile);

    AppendText("字典集加载完毕!");
}
/// <summary>
/// Extracts candidate words from a piece of text and feeds them into the word dictionary,
/// optionally updating the adjacent-character (bond) dictionary along the way.
/// </summary>
/// <remarks>
/// The text is walked character by character. Each adjacent pair is checked with
/// <c>DictionaryDAL.IsBondValid</c>; while consecutive pairs are valid the characters are
/// accumulated, and an invalid pair ends the current run, which is then recorded as a
/// candidate word. The run still pending when the text ends is recorded as well
/// (previously it was dropped, so the last candidate of every text was never learned).
/// </remarks>
/// <param name="text">The text line to process; null or empty text is ignored.</param>
/// <param name="objCharBondColl">Adjacent-character dictionary.</param>
/// <param name="objKeyWordColl">Word dictionary that receives the candidate words.</param>
/// <param name="bUpdateCharBondColl">Whether to update the adjacent-character dictionary.</param>
/// <param name="bUpdateKeyWordColl">Whether to update the word dictionary.</param>
public static void UpdateKeyWordColl(string text, MemoryBondColl<string> objCharBondColl, MemoryItemColl<string> objKeyWordColl, bool bUpdateCharBondColl = true, bool bUpdateKeyWordColl = true)
{
    if (String.IsNullOrEmpty(text))
    {
        return;
    }

    // Holds the current run of characters judged to belong together.
    StringBuilder buffer = new StringBuilder();

    // keyHead / keyTail are the two characters of the adjacent pair being examined.
    string keyHead = text[0].ToString();
    buffer.Append(keyHead);

    for (int k = 1; k < text.Length; k++)
    {
        string keyTail = text[k].ToString();

        if (bUpdateCharBondColl)
        {
            // Record this adjacent pair in the bond dictionary.
            DictionaryDAL.UpdateMemoryBondColl<string>(keyHead, keyTail, objCharBondColl);
        }

        if (bUpdateKeyWordColl)
        {
            if (!DictionaryDAL.IsBondValid<string>(keyHead, keyTail, objCharBondColl))
            {
                // The pair is unrelated: the buffered run is a candidate word.
                DictionaryDAL.UpdateMemoryItemColl<string>(buffer.ToString(), objKeyWordColl);
                buffer.Clear();
            }

            // Either continues the current run or starts the next one.
            buffer.Append(keyTail);
        }

        // The current character becomes the head of the next pair.
        keyHead = keyTail;
    }

    // Flush the trailing run; without this the final candidate of the text is lost.
    if (bUpdateKeyWordColl && buffer.Length > 0)
    {
        DictionaryDAL.UpdateMemoryItemColl<string>(buffer.ToString(), objKeyWordColl);
    }
}
/// <summary>
/// Splits text into words using the word dictionary, optionally updating the
/// dictionaries from the same text first.
/// </summary>
/// <remarks>
/// For every end position the method keeps the best-scoring segmentation of the text up
/// to that position. A candidate word ending there is either a single character (always
/// allowed) or a longer substring present in <paramref name="objKeyWordColl"/>. A known
/// word scores <c>ln(CalcRemeberValue(word)) - ln(MinuteOffsetSize)</c>; an unknown
/// single character scores 0. The score of a segmentation is the sum of its word scores.
/// </remarks>
/// <param name="text">Text to segment. Null or empty text yields an empty list.</param>
/// <param name="objCharBondColl">Adjacent-character collection (used when updating the dictionaries).</param>
/// <param name="objKeyWordColl">Word dictionary.</param>
/// <param name="maxWordLen">
/// Maximum word length in characters (suggested: 4 for fine-grained, 7 for coarse-grained
/// segmentation). 0 means "no limit" (the text length is used).
/// </param>
/// <param name="bUpdateCharBondColl">Whether to update the adjacent-character collection from <paramref name="text"/>.</param>
/// <param name="bUpdateKeyWordColl">Whether to update the word dictionary from <paramref name="text"/>.</param>
/// <returns>The words of the best segmentation, in text order.</returns>
public static List<string> Segment(string text, MemoryBondColl<string> objCharBondColl, MemoryItemColl<string> objKeyWordColl, int maxWordLen = 7, bool bUpdateCharBondColl = true, bool bUpdateKeyWordColl = true)
{
    if (String.IsNullOrEmpty(text))
    {
        return new List<string>();
    }

    if (maxWordLen == 0)
    {
        maxWordLen = text.Length;
    }

    // Original author's note: under the forgetting formula the total size of the dictionary
    // was observed to stay roughly constant and comparable to MinuteOffsetSize, so that value
    // stands in for the total decayed frequency of all words. This avoids recomputing the
    // total (which is expensive) when processing streaming data.
    double dLogTotalCount = Math.Log(objKeyWordColl.MinuteOffsetSize, Math.E);

    if (bUpdateCharBondColl || bUpdateKeyWordColl)
    {
        WordDictBLL.UpdateKeyWordColl(text, objCharBondColl, objKeyWordColl, bUpdateCharBondColl, bUpdateKeyWordColl);
    }

    // bestPaths[k] / bestScores[k]: best segmentation of text[0..k] and its score.
    List<string>[] bestPaths = new List<string>[text.Length];
    double[] bestScores = new double[text.Length];

    for (int k = 0; k < text.Length; k++)
    {
        List<string> objKeyWordList = new List<string>();
        double dKeyWordValue = 0;

        // Candidate words ending at k are 1..maxWordLen characters long, bounded by the
        // start of the text. (The previous bound let words of maxWordLen + 1 through.)
        int nLongest = Math.Min(k + 1, maxWordLen);
        for (int wordLen = 1; wordLen <= nLongest; wordLen++)
        {
            int startpos = k - wordLen + 1;
            string keyword = text.Substring(startpos, wordLen);

            // Multi-character candidates must be known words; single characters always qualify.
            bool bKnown = objKeyWordColl.Contains(keyword);
            if (wordLen > 1 && !bKnown)
            {
                continue;
            }

            double dTempValue = 0;
            if (bKnown)
            {
                dTempValue = -(dLogTotalCount - Math.Log(DictionaryDAL.CalcRemeberValue<string>(keyword, objKeyWordColl), Math.E));
            }

            // Positions before startpos are already solved; add the best score of that prefix.
            bool bHasPrefix = startpos > 0;
            if (bHasPrefix)
            {
                dTempValue += bestScores[startpos - 1];
            }

            // NOTE(review): 0 doubles as the "nothing chosen yet" marker, so a chosen candidate
            // whose score is exactly 0 is replaced by the next candidate regardless of its score.
            // Kept as-is because segmentation results depend on it.
            if (dKeyWordValue == 0 || dTempValue > dKeyWordValue)
            {
                dKeyWordValue = dTempValue;
                objKeyWordList = bHasPrefix
                    ? new List<string>(bestPaths[startpos - 1])
                    : new List<string>();
                objKeyWordList.Add(keyword);
            }
        }

        bestPaths[k] = objKeyWordList;
        bestScores[k] = dKeyWordValue;
    }

    return bestPaths[text.Length - 1];
}