/// <summary>
/// Records one occurrence of a word in a sentence.
/// If dc.wordinfo already holds an entry whose word equals <paramref name="word"/>.Word,
/// that entry's counter is incremented and the sentence is appended to its list;
/// otherwise a new entry with a count of 1 is appended to dc.wordinfo.
/// </summary>
/// <param name="word">The segmented word; its Word is the lookup key and its Flag becomes the new entry's wordType.</param>
/// <param name="sentence">The sentence in which the word occurred.</param>
public void addWordRecord(Pair word, Sentence sentence)
{
    // Locate the first existing record for this word, if there is one.
    WordInfo existing = null;
    foreach (WordInfo candidate in dc.wordinfo)
    {
        if (candidate.word == word.Word)
        {
            existing = candidate;
            break;
        }
    }

    if (existing != null)
    {
        existing.sum++;
        existing.appearSentences.Add(sentence);
        return;
    }

    // First time this word is seen: start a fresh record.
    WordInfo created = new WordInfo
    {
        word = word.Word,
        sum = 1,
        wordType = word.Flag,
        appearSentences = new List<Sentence> { sentence }
    };
    dc.wordinfo.Add(created);
}
/// <summary>
/// Compares word frequencies so that sorting yields descending order of sum.
/// </summary>
/// <param name="obj">The object to compare with; it is cast to WordInfo.</param>
/// <returns>
/// -1 when this instance's sum is greater than the other's, 1 when it is smaller,
/// and 0 when both are equal.
/// </returns>
/// <exception cref="Exception">
/// Thrown when the comparison fails (for example, <paramref name="obj"/> cannot be cast to WordInfo).
/// The failure that triggered it is available through InnerException.
/// </exception>
public int CompareTo(object obj)
{
    try
    {
        WordInfo other = (WordInfo)obj;

        // Descending order: the larger sum sorts first.
        if (this.sum > other.sum)
        {
            return -1;
        }

        if (this.sum < other.sum)
        {
            return 1;
        }

        return 0;
    }
    catch (Exception ex)
    {
        // Wrap the caught exception itself. Passing ex.InnerException (normally null for a
        // failed cast or a null dereference) would discard the real cause and its stack trace.
        throw new Exception("比较异常", ex);
    }
}
/// <summary>
/// Corrects the word-segmentation results in two passes over every sentence of every
/// file in dc.fileinfo.
/// </summary>
/// <remarks>
/// Pass 1 collects words longer than one character for which isInList(collected, word)
/// returns true, then hands each collected word to resetCuts_Link. According to the
/// original author's note, this joins words that were wrongly split apart.
/// Pass 2 counts how often each word longer than one character with the flag "nr" occurs,
/// then hands every word seen at least 10 times to resetCuts_Cut. According to the
/// original author's note, frequent person names are used as the reference to split
/// words that contain a name but were not segmented correctly.
/// Null sentences are skipped in both passes.
/// </remarks>
public void workResetWordCut()
{
    // Minimum number of occurrences before an "nr" word is passed to resetCuts_Cut.
    const int MinOccurrences = 10;

    // ---- Pass 1: gather the link candidates first, then apply resetCuts_Link. ----
    // Gathering and applying stay separate so resetCuts_Link never runs while the
    // sentence/word collections are being enumerated.
    List<Pair> linkCandidates = new List<Pair>();
    foreach (var f in dc.fileinfo)
    {
        foreach (var s in f.sentences)
        {
            if (s == null)
            {
                continue;
            }

            foreach (var w in s.words)
            {
                // NOTE(review): a word is added only when isInList reports true for the list
                // being built. If isInList means "already contained", this list can never
                // grow and a negation may be missing - confirm against isInList's definition.
                if (w.Word.Length > 1 && isInList(linkCandidates, w))
                {
                    linkCandidates.Add(w);
                }
            }
        }
    }

    foreach (var w in linkCandidates)
    {
        resetCuts_Link(w);
    }

    // ---- Pass 2: count "nr" words, then apply resetCuts_Cut to the frequent ones. ----
    // The list keeps first-seen order for the resetCuts_Cut calls; the dictionary replaces
    // a linear scan of that list for every word.
    List<WordInfo> nameCounts = new List<WordInfo>();
    Dictionary<string, WordInfo> nameIndex = new Dictionary<string, WordInfo>();
    foreach (var f in dc.fileinfo)
    {
        foreach (var s in f.sentences)
        {
            if (s == null)
            {
                continue;
            }

            foreach (var w in s.words)
            {
                if (w.Word.Length <= 1 || w.Flag != "nr")
                {
                    continue;
                }

                WordInfo known;
                if (nameIndex.TryGetValue(w.Word, out known))
                {
                    known.sum++;
                    continue;
                }

                WordInfo created = new WordInfo();
                created.word = w.Word;
                created.sum = 1;
                created.wordType = w.Flag;
                nameCounts.Add(created);
                nameIndex.Add(w.Word, created);
            }
        }
    }

    foreach (var wi in nameCounts)
    {
        if (wi.sum >= MinOccurrences)
        {
            Pair p = new Pair(wi.word, wi.wordType);
            resetCuts_Cut(p);
        }
    }
}