/// <summary>
/// Adjacent-character statistics.
/// </summary>
/// <param name="text">Line of text to scan; a null or empty string is a no-op.</param>
/// <param name="objCharBondColl">Collection that receives the adjacent-character results.</param>
/// <remarks>
/// Walks the text and passes every pair of neighbouring characters (as one-character
/// strings, in text order) to <c>DictionaryDAL.UpdateMemoryBondColl</c>.
/// A single-character text produces no pair, so nothing is recorded.
/// </remarks>
public static void UpdateCharBondColl(string text, MemoryBondColl<string> objCharBondColl)
{
    if (String.IsNullOrEmpty(text))
    {
        return;
    }

    // Head of the next pair; stays null until the first character has been seen.
    string previous = null;
    foreach (char ch in text)
    {
        string current = ch.ToString();
        if (previous != null)
        {
            // Record the (previous, current) pair in the adjacency collection.
            DictionaryDAL.UpdateMemoryBondColl<string>(previous, current, objCharBondColl);
        }

        // The tail of this pair becomes the head of the next one.
        previous = current;
    }
}
/// <summary>
/// Generates candidate words from a line of text.
/// </summary>
/// <param name="text">Line of text to scan; a null or empty string is a no-op.</param>
/// <param name="objCharBondColl">Adjacent-character collection.</param>
/// <param name="objKeyWordColl">Word collection that receives the candidate words.</param>
/// <param name="bUpdateCharBondColl">Whether to record each adjacent pair in the adjacent-character collection.</param>
/// <param name="bUpdateKeyWordColl">Whether to add candidate words to the word collection.</param>
/// <remarks>
/// Characters are appended to a buffer for as long as <c>DictionaryDAL.IsBondValid</c>
/// accepts the pair formed with the preceding character. When a pair is rejected, the
/// buffered run is passed to <c>DictionaryDAL.UpdateMemoryItemColl</c> as a candidate
/// word and a new run starts at the current character. The run still buffered when
/// the text ends is passed on as well, so the last candidate of a line is not lost.
/// </remarks>
public static void UpdateKeyWordColl(string text, MemoryBondColl<string> objCharBondColl, MemoryItemColl<string> objKeyWordColl, bool bUpdateCharBondColl = true, bool bUpdateKeyWordColl = true)
{
    if (String.IsNullOrEmpty(text))
    {
        return;
    }

    // Holds the current run of consecutive characters.
    StringBuilder buffer = new StringBuilder();

    // keyHead / keyTail hold the two neighbouring characters being examined.
    string keyHead = text[0].ToString();
    buffer.Append(keyHead);

    for (int k = 1; k < text.Length; k++)
    {
        // Take the next character as the tail of the current pair.
        string keyTail = text[k].ToString();

        if (bUpdateCharBondColl)
        {
            // The pair is recorded before the validity check below is made.
            DictionaryDAL.UpdateMemoryBondColl<string>(keyHead, keyTail, objCharBondColl);
        }

        if (bUpdateKeyWordColl)
        {
            if (!DictionaryDAL.IsBondValid<string>(keyHead, keyTail, objCharBondColl))
            {
                // The pair is not bonded: the buffered run is a candidate word.
                DictionaryDAL.UpdateMemoryItemColl<string>(buffer.ToString(), objKeyWordColl);
                buffer.Clear();
            }

            // Either extends the current run or starts the next one.
            buffer.Append(keyTail);
        }

        // The current tail becomes the head of the next pair.
        keyHead = keyTail;
    }

    if (bUpdateKeyWordColl)
    {
        // Flush the trailing run; previously it was silently dropped when the text ended.
        DictionaryDAL.UpdateMemoryItemColl<string>(buffer.ToString(), objKeyWordColl);
    }
}
/// <summary>
/// Click handler that loads the two dictionary collections from the "dict" folder
/// under the workspace path.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event arguments (not used).</param>
/// <remarks>
/// Creates the "dict" folder when it does not exist, then deserializes
/// "CharBond.coll" into <c>objCharBondColl</c> and "KeyWord.coll" into
/// <c>objKeyWordColl</c>, reporting progress through <c>AppendText</c>.
/// NOTE(review): how <c>SerialLib.DeserializeBinary</c> behaves when a file is missing
/// is not visible here - confirm before relying on this handler on a fresh workspace.
/// </remarks>
private void btnLoadDictionary_Click(object sender, EventArgs e)
{
    // Shared templates for the file paths and the progress message.
    const string dictFilePathFormat = @"{0}\dict\{1}";
    const string loadingMessageFormat = "请稍候,正在加载文件:{0}";

    // Make sure the dictionary folder exists.
    string dictFolder = Path.GetFullPath(String.Format(@"{0}\dict", CachePathDAL.GetWorkSpacePath()));
    bool dictFolderExists = Directory.Exists(dictFolder);
    if (!dictFolderExists)
    {
        Directory.CreateDirectory(dictFolder);
    }

    // Adjacent-character collection.
    string charBondPath = Path.GetFullPath(
        String.Format(dictFilePathFormat, CachePathDAL.GetWorkSpacePath(), "CharBond.coll"));
    AppendText(String.Format(loadingMessageFormat, charBondPath));
    objCharBondColl = SerialLib.DeserializeBinary<MemoryBondColl<string>>(charBondPath);

    // Word collection.
    string keyWordPath = Path.GetFullPath(
        String.Format(dictFilePathFormat, CachePathDAL.GetWorkSpacePath(), "KeyWord.coll"));
    AppendText(String.Format(loadingMessageFormat, keyWordPath));
    objKeyWordColl = SerialLib.DeserializeBinary<MemoryItemColl<string>>(keyWordPath);

    AppendText("字典集加载完毕!");
}
/// <summary>
/// Segments text into words (optionally maintaining the dictionaries at the same time).
/// </summary>
/// <param name="text">Text to segment.</param>
/// <param name="objCharBondColl">Adjacent-character collection (used to build the word collection).</param>
/// <param name="objKeyWordColl">Word collection.</param>
/// <param name="maxWordLen">
/// Maximum word length in characters (suggested: 4 for fine granularity, 7 for coarse).
/// Zero or a negative value means no limit.
/// </param>
/// <param name="bUpdateCharBondColl">Whether to also update the adjacent-character collection.</param>
/// <param name="bUpdateKeyWordColl">Whether to also update the word collection.</param>
/// <returns>
/// The list of segments chosen for the whole text, in text order; an empty list when
/// <paramref name="text"/> is null or empty.
/// </returns>
/// <remarks>
/// For every end position the method tries each candidate word ending there and keeps
/// the best-scoring segmentation of the text up to that position. A single character is
/// always a candidate; a longer candidate is only considered when
/// <paramref name="objKeyWordColl"/> contains it. A candidate found in the collection
/// contributes log(remember value) - log(MinuteOffsetSize); one that is not contributes 0.
/// </remarks>
public static List<string> Segment(string text, MemoryBondColl<string> objCharBondColl, MemoryItemColl<string> objKeyWordColl, int maxWordLen = 7, bool bUpdateCharBondColl = true, bool bUpdateKeyWordColl = true)
{
    if (String.IsNullOrEmpty(text))
    {
        return new List<string>();
    }

    // Zero (and, defensively, any negative value) means "no limit".
    if (maxWordLen <= 0)
    {
        maxWordLen = text.Length;
    }

    // Author's note: a trick is used here. It was observed that, under the forgetting
    // formula, the dictionary total stays at a relatively stable value comparable to
    // MinuteOffsetSize. That value therefore stands in for the decayed total frequency
    // of all words, which avoids recomputing the (expensive) total on streaming data.
    double dLogTotalCount = Math.Log(objKeyWordColl.MinuteOffsetSize, Math.E);

    if (bUpdateCharBondColl || bUpdateKeyWordColl)
    {
        WordDictBLL.UpdateKeyWordColl(text, objCharBondColl, objKeyWordColl, bUpdateCharBondColl, bUpdateKeyWordColl);
    }

    // bestWords[k] / bestScores[k]: chosen segmentation of text[0..k] and its score.
    List<string>[] bestWords = new List<string>[text.Length];
    double[] bestScores = new double[text.Length];

    for (int k = 0; k < text.Length; k++)
    {
        List<string> objKeyWordList = new List<string>();
        double dKeyWordValue = 0;

        // A candidate spans text[k - len .. k], i.e. len + 1 characters, so the largest
        // offset allowed is maxWordLen - 1 (the original bound admitted one extra character).
        int maxOffset = Math.Min(k, maxWordLen - 1);
        for (int len = 0; len <= maxOffset; len++)
        {
            int startpos = k - len;
            string keyword = text.Substring(startpos, len + 1);
            bool isKnownWord = objKeyWordColl.Contains(keyword);

            // Multi-character candidates must be in the word collection.
            if (len > 0 && !isKnownWord)
            {
                continue;
            }

            double dTempValue = 0;
            if (isKnownWord)
            {
                dTempValue = -(dLogTotalCount - Math.Log(DictionaryDAL.CalcRemeberValue<string>(keyword, objKeyWordColl), Math.E));
            }

            // Add the score of the segmentation that ends just before this candidate.
            bool hasPrefix = startpos > 0;
            if (hasPrefix)
            {
                dTempValue += bestScores[startpos - 1];
            }

            // A running best of exactly 0 is treated as "nothing chosen yet", so a
            // candidate that scored 0 is replaced by the next candidate even when that
            // one scores lower. Kept as in the original because it affects which
            // segmentation wins. NOTE(review): confirm this is intended.
            if (dKeyWordValue == 0 || dTempValue > dKeyWordValue)
            {
                dKeyWordValue = dTempValue;
                objKeyWordList = hasPrefix
                    ? new List<string>(bestWords[startpos - 1])
                    : new List<string>();
                objKeyWordList.Add(keyword);
            }
        }

        bestWords[k] = objKeyWordList;
        bestScores[k] = dKeyWordValue;
    }

    return bestWords[text.Length - 1];
}