예제 #1
0
        /// <summary>
        /// Records one occurrence of a word.
        /// </summary>
        /// <remarks>
        /// When <c>dc.wordinfo</c> already contains an entry whose <c>word</c> equals
        /// <c>word.Word</c>, that entry's <c>sum</c> is incremented and the sentence is
        /// appended to its <c>appearSentences</c>. Otherwise a new entry with a count of 1
        /// is appended to <c>dc.wordinfo</c>.
        /// </remarks>
        /// <param name="word">The word whose occurrence is recorded; its <c>Word</c> and <c>Flag</c> are read.</param>
        /// <param name="sentence">The sentence in which the word occurred.</param>
        public void addWordRecord(Pair word, Sentence sentence)
        {
            // Look for an entry that already tracks this word.
            WordInfo existing = null;
            foreach (WordInfo candidate in dc.wordinfo)
            {
                if (candidate.word == word.Word)
                {
                    existing = candidate;
                    break;
                }
            }

            if (existing != null)
            {
                existing.sum++;
                existing.appearSentences.Add(sentence);
                return;
            }

            // First occurrence: start a fresh entry that already holds this sentence.
            var entry = new WordInfo
            {
                word = word.Word,
                sum = 1,
                wordType = word.Flag,
                appearSentences = new List<Sentence> { sentence },
            };
            dc.wordinfo.Add(entry);
        }
예제 #2
0
        /// <summary>
        /// Compares word frequencies so that sorting yields descending order of <c>sum</c>.
        /// </summary>
        /// <param name="obj">The object to compare against; must be a <c>WordInfo</c> or null.</param>
        /// <returns>
        /// -1 when this entry's <c>sum</c> is greater than the other's, 1 when it is smaller,
        /// and 0 when both are equal. Returns 1 when <paramref name="obj"/> is null, because
        /// the <c>IComparable</c> contract defines every instance as greater than null.
        /// </returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="obj"/> is neither null nor a <c>WordInfo</c>.
        /// </exception>
        public int CompareTo(object obj)
        {
            if (obj == null)
            {
                return 1;
            }

            // A type check replaces the former cast-inside-try/catch, which rethrew a bare
            // Exception and dropped the real cause by passing ex.InnerException (usually null).
            WordInfo other = obj as WordInfo;
            if (other == null)
            {
                // ArgumentException derives from Exception, so existing catch blocks still match.
                throw new ArgumentException("比较异常");
            }

            // Sign is inverted on purpose: the larger sum sorts first.
            if (this.sum > other.sum)
            {
                return -1;
            }
            if (this.sum < other.sum)
            {
                return 1;
            }
            return 0;
        }
예제 #3
0
        /// <summary>
        /// Corrects the word-segmentation results held in <c>dc.fileinfo</c>.
        /// </summary>
        /// <remarks>
        /// Runs two passes over every non-null sentence of every file:
        /// 1. Link pass: collects words longer than one character that satisfy
        ///    <c>isInList</c> and passes each to <c>resetCuts_Link</c>, to join words that
        ///    were wrongly split.
        /// 2. Cut pass: counts words longer than one character that are flagged "nr"
        ///    (person names), and passes every name seen at least 10 times to
        ///    <c>resetCuts_Cut</c>, to split words that wrongly contain such a name.
        /// </remarks>
        public void workResetWordCut()
        {
            // Names must occur at least this often to be used as a splitting reference.
            const int minNameOccurrences = 10;

            // Pass 1: judging from the full-text segmentation, join wrongly split words.
            List<Pair> linkCandidates = new List<Pair>();
            foreach (var f in dc.fileinfo)
            {
                foreach (var s in f.sentences)
                {
                    if (s == null)
                    {
                        continue;
                    }
                    foreach (var w in s.words)
                    {
                        // NOTE(review): linkCandidates starts empty and only grows when
                        // isInList(...) returns true. If isInList reports membership, nothing
                        // is ever added and resetCuts_Link is never called; "!isInList" may
                        // have been intended. Condition kept as-is - confirm against isInList.
                        if (w.Word.Length > 1 && isInList(linkCandidates, w))
                        {
                            linkCandidates.Add(w);
                        }
                    }
                }
            }
            foreach (var w in linkCandidates)
            {
                resetCuts_Link(w);
            }

            // Pass 2: using frequently occurring person names as the reference, split
            // words that contain a person name but were not segmented correctly.
            // The list keeps first-seen order for the resetCuts_Cut calls below; the
            // dictionary only replaces the per-word linear scan of that list.
            List<WordInfo> nameCounts = new List<WordInfo>();
            Dictionary<string, WordInfo> nameIndex = new Dictionary<string, WordInfo>();
            foreach (var f in dc.fileinfo)
            {
                foreach (var s in f.sentences)
                {
                    if (s == null)
                    {
                        continue;
                    }
                    foreach (var w in s.words)
                    {
                        if (w.Word.Length <= 1 || w.Flag != "nr")
                        {
                            continue;
                        }

                        WordInfo known;
                        if (nameIndex.TryGetValue(w.Word, out known))
                        {
                            known.sum++;
                            continue;
                        }

                        WordInfo added = new WordInfo();
                        added.word     = w.Word;
                        added.sum      = 1;
                        added.wordType = w.Flag;
                        nameCounts.Add(added);
                        nameIndex.Add(w.Word, added);
                    }
                }
            }
            foreach (var wi in nameCounts)
            {
                if (wi.sum >= minNameOccurrences)
                {
                    Pair p = new Pair(wi.word, wi.wordType);
                    resetCuts_Cut(p);
                }
            }
        }