/// <summary>
/// Segments <paramref name="text"/> into words using the word dictionary, and
/// optionally feeds the text back into the dictionary afterwards.
/// </summary>
/// <remarks>
/// For every end position <c>k</c> the method keeps the best segmentation of
/// <c>text[0..k]</c> together with its score. A candidate word ending at <c>k</c>
/// scores <c>log(validCount) - log(TotalValidCount + 1)</c> when it is in the
/// dictionary and <c>0</c> otherwise; the score of the segmentation ending just
/// before the candidate (when still buffered) is added to it.
/// Multi-character candidates must be in the dictionary and have a valid count
/// of at least <c>objKeyWordColl.Parameter.Threshold</c>; single characters are
/// always accepted as candidates.
/// </remarks>
/// <param name="text">Text to segment. Null or empty yields an empty list.</param>
/// <param name="objCharBondColl">Adjacent-key (character bond) collection, described by the original author as used for building the lexicon. It is not read by this method.</param>
/// <param name="objKeyWordColl">Word dictionary that is queried for candidates and, when an update flag is set, updated from <paramref name="text"/>.</param>
/// <param name="maxWordLen">Maximum word length (suggested: 4 for fine granularity, 7 for coarse). A value of zero or less means "use the length of the text".</param>
/// <param name="bUpdateCharBondColl">Whether to also update the adjacent-key collection. In this method, either update flag being true triggers the dictionary update.</param>
/// <param name="bUpdateKeyWordColl">Whether to also update the word dictionary. In this method, either update flag being true triggers the dictionary update.</param>
/// <param name="nRadiusSize">Effective key radius. It is not read by this method.</param>
/// <returns>The list of words of the best segmentation found for the whole text.</returns>
public static List<string> Segment(string text, KeyBondColl<string, string> objCharBondColl, KeyItemColl<string> objKeyWordColl, int maxWordLen = 7, bool bUpdateCharBondColl = true, bool bUpdateKeyWordColl = true, int nRadiusSize = 7)
{
    if (String.IsNullOrEmpty(text))
    {
        return new List<string>();
    }

    if (maxWordLen <= 0)
    {
        maxWordLen = text.Length;
    }

    // Log of the total frequency mass; the +1 keeps the argument positive for an empty dictionary.
    double dLogTotalCount = Math.Log(objKeyWordColl.Parameter.TotalValidCount + 1);

    // Best segmentation / best score of the prefix ending at each (recent) position.
    Dictionary<int, List<string>> objKeyWordBufferDict = new Dictionary<int, List<string>>();
    Dictionary<int, double> objKeyWordValueDict = new Dictionary<int, double>();

    for (int k = 0; k < text.Length; k++)
    {
        List<string> objKeyWordList = new List<string>();
        double dKeyWordValue = 0;

        // Try every candidate word that ends at k, shortest first.
        for (int len = 0; len < maxWordLen; len++)
        {
            int startpos = k - len;
            if (startpos < 0)
            {
                break;
            }

            string keyword = text.Substring(startpos, len + 1);
            bool bKnown = objKeyWordColl.Contains(keyword);

            // Multi-character candidates must exist in the dictionary.
            if (len > 0 && !bKnown)
            {
                continue;
            }

            double dTempValue = 0;
            if (bKnown)
            {
                double dValidCount = KeyItemHelper.CalcValidCount(keyword, objKeyWordColl);

                // Multi-character candidates below the configured threshold are ignored.
                // (A fixed cutoff of Math.E was tried by the original author; the
                // threshold form was noted as testing best.)
                if (len > 0 && dValidCount < objKeyWordColl.Parameter.Threshold)
                {
                    continue;
                }

                dTempValue = -(dLogTotalCount - Math.Log(dValidCount));
            }

            // Chain onto the best segmentation that ends right before this candidate.
            double dPrevValue;
            bool bHasPrev = objKeyWordValueDict.TryGetValue(startpos - 1, out dPrevValue);
            if (bHasPrev)
            {
                dTempValue += dPrevValue;
            }

            // NOTE(review): 0 doubles as the "no candidate chosen yet" sentinel, so a
            // current best whose score is exactly 0 is replaced by any later candidate.
            // Kept as-is because changing it would alter segmentation results.
            if (dKeyWordValue == 0 || dTempValue > dKeyWordValue)
            {
                dKeyWordValue = dTempValue;
                objKeyWordList = bHasPrev
                    ? new List<string>(objKeyWordBufferDict[startpos - 1])
                    : new List<string>();
                objKeyWordList.Add(keyword);
            }
        }

        objKeyWordBufferDict.Add(k, objKeyWordList);
        objKeyWordValueDict.Add(k, dKeyWordValue);

        // Entries further back than the longest possible word can no longer be
        // referenced as predecessors, so drop them to bound the buffers.
        if (k > maxWordLen)
        {
            objKeyWordBufferDict.Remove(k - maxWordLen - 1);
            objKeyWordValueDict.Remove(k - maxWordLen - 1);
        }
    }

    // NOTE(review): either flag triggers the dictionary update and the bond
    // collection itself is never touched here - confirm this is intended.
    if (bUpdateCharBondColl || bUpdateKeyWordColl)
    {
        KeyWordBLL.UpdateKeyWordColl(text, objKeyWordColl, maxWordLen);
    }

    return objKeyWordBufferDict[text.Length - 1];
}
/// <summary>
/// Updates the word dictionary from one line of text by forwarding all
/// arguments to <c>KeyWordBLL.UpdateKeyWordCollByNGram</c>.
/// </summary>
/// <param name="line">Text passed through as the first argument.</param>
/// <param name="objKeyWordColl">Word dictionary passed through as the second argument.</param>
/// <param name="nMaxWordSize">Passed through as the third argument; presumably the maximum n-gram length - confirm against <c>UpdateKeyWordCollByNGram</c>.</param>
public static void UpdateKeyWordColl(
    string line,
    KeyItemColl<string> objKeyWordColl,
    int nMaxWordSize = 7)
{
    KeyWordBLL.UpdateKeyWordCollByNGram(
        line,
        objKeyWordColl,
        nMaxWordSize);
}