private void btnShowWordCloud_Click(object sender, EventArgs e) { if (radDictionary.Checked) { string result = WordDictBLL.ShowKeyWordWeightColl(objKeyWordColl, Convert.ToInt32(numericUpDown1.Value), chkOrderBy.Checked, chkIsOnlyWord.Checked); this.richTextBox1.Text = result; } if (radSegment.Checked) { using (StringReader sr = new StringReader(this.tbCoreWordList.Text)) { StringBuilder sb = new StringBuilder(); Dictionary <string, double> dictValue = new Dictionary <string, double>(); string line = null; while ((line = sr.ReadLine()) != null) { List <string> objKeyWordList = SegmentBLL.Segment(line, objCharBondColl, objKeyWordColl, 7, false, false); foreach (string keyword in objKeyWordList) { if (!dictValue.ContainsKey(keyword)) { dictValue.Add(keyword, !objKeyWordColl.Contains(keyword)?0: -Math.Log(objKeyWordColl[keyword].ValidCount / objKeyWordColl.MinuteOffsetSize)); } else { dictValue[keyword] += !objKeyWordColl.Contains(keyword) ? 0 : -Math.Log(objKeyWordColl[keyword].ValidCount / objKeyWordColl.MinuteOffsetSize); } } sb.AppendLine(SegmentBLL.ShowSegment(objKeyWordList)); } sb.AppendLine(); var buffer = from x in dictValue orderby x.Value descending select x; foreach (var x in buffer) { sb.AppendLine(String.Format("【{0}】{1}", x.Key, Math.Round(x.Value, 4))); } this.richTextBox1.Text = sb.ToString(); } } }
/// <summary> /// 分词(同时自动维护词典) /// </summary> /// <param name="text">待分词文本</param> /// <param name="objCharBondColl">邻键集合(用于生成词库)</param> /// <param name="objKeyWordColl">词库</param> /// <param name="maxWordLen">最大词长(建议:细粒度为4、粗粒度为7)</param> /// <param name="bUpdateCharBondColl">是否同时更新邻键集合</param> /// <param name="bUpdateKeyWordColl">是否同时更新词库</param> /// <returns>返回分词结果</returns> public static List <string> Segment(string text, MemoryBondColl <string> objCharBondColl, MemoryItemColl <string> objKeyWordColl, int maxWordLen = 7, bool bUpdateCharBondColl = true, bool bUpdateKeyWordColl = true) { if (String.IsNullOrEmpty(text)) { return(new List <string>()); } if (maxWordLen == 0) { maxWordLen = text.Length; } //此处使用了个技巧:偶尔发现,词库在遗忘公式作用下,其总量也为相对稳定的固定值,且与MinuteOffsetSize相当。 //故此处以此替换所有词的遗忘后的总词频,这样可以在处理流式数据时,避免动态计算词库总词频(因其计算量较大)。 double dLogTotalCount = Math.Log(objKeyWordColl.MinuteOffsetSize, Math.E); if (bUpdateCharBondColl || bUpdateKeyWordColl) { WordDictBLL.UpdateKeyWordColl(text, objCharBondColl, objKeyWordColl, bUpdateCharBondColl, bUpdateKeyWordColl); } Dictionary <int, List <string> > objKeyWordBufferDict = new Dictionary <int, List <string> >(); Dictionary <int, double> objKeyWordValueDict = new Dictionary <int, double>(); for (int k = 0; k < text.Length; k++) { List <string> objKeyWordList = new List <string>(); double dKeyWordValue = 0; for (int len = 0; len <= Math.Min(k, maxWordLen); len++) { int startpos = k - len; string keyword = text.Substring(startpos, len + 1); if (len > 0 && !objKeyWordColl.Contains(keyword)) { continue; } double dTempValue = 0; if (objKeyWordColl.Contains(keyword)) { dTempValue = -(dLogTotalCount - Math.Log(DictionaryDAL.CalcRemeberValue <string>(keyword, objKeyWordColl), Math.E)); } if (objKeyWordValueDict.ContainsKey(startpos - 1)) { dTempValue += objKeyWordValueDict[startpos - 1]; if (dKeyWordValue == 0 || dTempValue > dKeyWordValue) { dKeyWordValue = dTempValue; objKeyWordList = new List <string>(objKeyWordBufferDict[startpos - 1]); objKeyWordList.Add(keyword); } } else { if (dKeyWordValue == 0 || dTempValue > dKeyWordValue) { dKeyWordValue = dTempValue; objKeyWordList = new List <string>(); objKeyWordList.Add(keyword); } } } objKeyWordBufferDict.Add(k, objKeyWordList); objKeyWordValueDict.Add(k, dKeyWordValue); } return(objKeyWordBufferDict[text.Length - 1]); }