예제 #1
0
        /// <summary>
        /// Loads a dictionary from a text file.
        /// </summary>
        /// <remarks>
        /// The file is read as UTF-8 and split into entries on "\r\n". Each entry is
        /// split on '|' and must yield exactly two fields, <c>word|pos</c>, where
        /// <c>pos</c> parses as an integer. Entries that do not meet these
        /// requirements are skipped silently.
        /// </remarks>
        /// <param name="fileName">Path of the text dictionary file.</param>
        /// <returns>A <c>T_DictFile</c> holding one <c>T_DictStruct</c> per accepted entry.</returns>
        static public T_DictFile LoadFromTextDict(String fileName)
        {
            T_DictFile dictFile = new T_DictFile();

            String dictStr = CFile.ReadFileToString(fileName, "utf-8");

            String[] words = CRegex.Split(dictStr, "\r\n");

            // Chinese numeral characters; allocated once instead of once per entry.
            char[] numeralChars = { '一', '二', '三', '四', '五', '六', '七', '八', '九', '十' };

            foreach (String word in words)
            {
                String[] wp = CRegex.Split(word, @"\|");

                if (wp == null || wp.Length != 2)
                {
                    continue;
                }

                // TryParse rejects malformed numbers without using exceptions as
                // control flow (and without a catch-all that hides unrelated errors).
                int pos;

                if (!int.TryParse(wp[1], out pos))
                {
                    continue;
                }

                T_DictStruct dict = new T_DictStruct();
                dict.Word = wp[0];
                dict.Pos  = pos;

                // Any word containing a Chinese numeral additionally gets the POS_A_M flag.
                if (dict.Word.IndexOfAny(numeralChars) >= 0)
                {
                    dict.Pos |= (int)T_POS.POS_A_M;
                }

                // The word "字典" has its part of speech overridden with POS_D_N.
                if (dict.Word == "字典")
                {
                    dict.Pos = (int)T_POS.POS_D_N;
                }

                dictFile.Dicts.Add(dict);
            }

            return(dictFile);
        }
예제 #2
0
        /// <summary>
        /// Segments a string into words without filtering out stop words.
        /// </summary>
        /// <param name="str">Text to segment.</param>
        /// <returns>
        /// The list returned by <c>RecoverUnknowWord</c> for the rule-processed
        /// pre-segmentation of <paramref name="str"/>.
        /// </returns>
        private List <String> SegmentNoStopWord(String str)
        {
            List <String> candidates = PreSegment(str);
            List <String> merged     = new List <String>();

            int cursor = 0;

            while (cursor < candidates.Count)
            {
                // Becomes true as soon as one rule reports a positive resume index.
                bool consumed = false;

                foreach (IRule rule in _Rules)
                {
                    // The MatchName rule is skipped while _MatchName is false.
                    if (!_MatchName && rule is MatchName)
                    {
                        continue;
                    }

                    int resumeAt = rule.ProcRule(candidates, cursor, merged);

                    if (resumeAt > 0)
                    {
                        cursor   = resumeAt;
                        consumed = true;
                        break;
                    }
                }

                // No rule handled this position: copy the token through unchanged.
                if (!consumed)
                {
                    merged.Add(candidates[cursor]);
                    cursor++;
                }
            }

            List <String> result = RecoverUnknowWord(merged);

            if (!AutoStudy)
            {
                return result;
            }

            // With AutoStudy on, bump the frequency of every output word that has
            // a tag registered in _ExtractWords.
            foreach (String token in result)
            {
                T_DictStruct entry = (T_DictStruct)_ExtractWords.GetTag(token);

                if (entry == null)
                {
                    continue;
                }

                entry.Frequency++;
            }

            return result;
        }
예제 #3
0
        /// <summary>
        /// Sums the dictionary frequencies of the words selected by <paramref name="list"/>.
        /// </summary>
        /// <param name="words">Word infos whose <c>Tag</c> holds a <c>T_DictStruct</c>.</param>
        /// <param name="list">Indices into <paramref name="words"/>.</param>
        /// <returns>The total of <c>Frequency</c> over the selected words.</returns>
        protected virtual double GetFreqWeight(List <T_WordInfo> words, List <int> list)
        {
            double total = 0;

            foreach (int wordIndex in list)
            {
                T_DictStruct entry = (T_DictStruct)words[wordIndex].Tag;
                total += entry.Frequency;
            }

            return total;
        }
예제 #4
0
        /// <summary>
        /// Removes a word from both the lookup table and the dictionary list.
        /// Does nothing when the trimmed word is not found by <c>GetWord</c>.
        /// </summary>
        /// <param name="word">Word to delete; surrounding whitespace is trimmed first.</param>
        public virtual void DeleteWord(String word)
        {
            T_DictStruct entry = GetWord(word.Trim());

            if (entry != null)
            {
                _DictTbl.Remove(entry.Word);
                _Dict.Dicts.Remove(entry);
            }
        }
예제 #5
0
        /// <summary>
        /// Overwrites the frequency and part of speech of an existing word.
        /// Does nothing when the trimmed word is not found by <c>GetWord</c>.
        /// </summary>
        /// <param name="word">Word to update; surrounding whitespace is trimmed first.</param>
        /// <param name="frequency">New frequency value.</param>
        /// <param name="pos">New part-of-speech value.</param>
        public virtual void UpdateWord(String word, double frequency, int pos)
        {
            T_DictStruct entry = GetWord(word.Trim());

            if (entry != null)
            {
                entry.Frequency = frequency;
                entry.Pos       = pos;
            }
        }
예제 #6
0
        /// <summary>
        /// Saves the name statistics, the main dictionary and the unknown-words
        /// dictionary to files under <c>_DictPath</c>.
        /// </summary>
        public virtual void SaveDict()
        {
            String nameFile = _DictPath + "Name.dct";
            _MatchNameRule.SaveNameTraffic(nameFile);

            // Before writing, copy the frequency from each word's tag in
            // _ExtractWords (when one exists) back onto the dictionary entry.
            foreach (T_DictStruct entry in _Dict.Dicts)
            {
                T_DictStruct tagged = (T_DictStruct)_ExtractWords.GetTag(entry.Word);

                if (tagged == null)
                {
                    continue;
                }

                entry.Frequency = tagged.Frequency;
            }

            String dictFile = _DictPath + "Dict.dct";
            Dict.SaveToBinFileEx(dictFile, _Dict);

            String unknownWordsFile = _DictPath + "UnknownWords.dct";
            Dict.SaveToBinFileEx(unknownWordsFile, _UnknownWordsDict);
        }
예제 #7
0
        /// <summary>
        /// Records one occurrence of an unknown (out-of-dictionary) word and, when
        /// enabled, promotes it to the main dictionary once it is frequent enough.
        /// </summary>
        /// <param name="word">The unknown word; only lengths 2 and 3 are tracked.</param>
        /// <param name="Pos">Part of speech to OR into the word's recorded value.</param>
        private void TrafficUnknownWord(String word, T_POS Pos)
        {
            if (word.Length < 2 || word.Length > 3)
            {
                return;
            }

            T_DictStruct candidate = _UnknownWordsDictMgr.GetWord(word);

            // First sighting: start tracking with a frequency of 1.
            if (candidate == null)
            {
                _UnknownWordsDictMgr.InsertWord(word, 1, (int)Pos);
                return;
            }

            // A tracked word whose Pos equals 0 is a blocked unknown word and is
            // never counted or promoted.
            if (candidate.Pos == 0)
            {
                return;
            }

            candidate.Pos |= (int)Pos;
            candidate.Frequency++;

            bool promote = candidate.Frequency > UnknownWordsThreshold && AutoInsertUnknownWords;

            if (!promote)
            {
                return;
            }

            T_DictStruct existing = _DictMgr.GetWord(word);

            if (existing != null)
            {
                // Already in the main dictionary: merge flags and add the count.
                existing.Pos       |= candidate.Pos;
                existing.Frequency += candidate.Frequency;
            }
            else
            {
                _DictMgr.InsertWord(word, candidate.Frequency, candidate.Pos);

                // NOTE(review): the unknown-word object itself is registered as
                // the DFA tag and has its Frequency reset below - confirm that
                // sharing this instance is intended.
                _ExtractWords.InsertWordToDfa(word, candidate);
                _POS.AddWordPos(word, candidate.Pos);
            }

            // Restart the counter after a promotion or merge.
            candidate.Frequency = 0;
        }
예제 #8
0
        /// <summary>
        /// Loads a dictionary from a binary file that carries a version header.
        /// </summary>
        /// <remarks>
        /// The first 32 bytes are decoded as UTF-8 and matched against
        /// "KTDictSeg Dict V(.+)". When nothing matches, the file is treated as the
        /// pre-1.3 format and loading is delegated to <c>LoadFromBinFile</c>.
        /// Otherwise the rest of the file is a sequence of records: a 4-byte record
        /// length followed by that many bytes holding the UTF-8 word, a 4-byte
        /// <c>Pos</c> and an 8-byte <c>Frequency</c>.
        /// Side effect: the file's attributes are set to <c>FileAttributes.Normal</c>
        /// before it is opened.
        /// </remarks>
        /// <param name="fileName">Path of the binary dictionary file.</param>
        /// <returns>The loaded dictionary.</returns>
        /// <exception cref="InvalidDataException">
        /// A record length is too small to hold the <c>Pos</c> and <c>Frequency</c> fields.
        /// </exception>
        /// <exception cref="EndOfStreamException">
        /// The file ends in the middle of a record.
        /// </exception>
        static public T_DictFile LoadFromBinFileEx(string fileName)
        {
            // Bytes that follow the word text in every record: Pos + Frequency.
            const int trailerSize = sizeof(int) + sizeof(double);

            T_DictFile dictFile = new T_DictFile();

            dictFile.Dicts = new List <T_DictStruct>();

            File.SetAttributes(fileName, FileAttributes.Normal);

            bool legacyFormat = false;

            // 'using' guarantees the stream is closed even when parsing throws.
            using (FileStream fs = new FileStream(fileName, FileMode.Open))
            {
                // A file shorter than the header leaves the remaining bytes zeroed,
                // which simply fails the version match below.
                byte[] version = new byte[32];
                FillFromStream(fs, version);
                String ver = Encoding.UTF8.GetString(version, 0, version.Length);

                String verNumStr = CRegex.GetMatch(ver, "KTDictSeg Dict V(.+)", true);

                if (String.IsNullOrEmpty(verNumStr))
                {
                    // Versions before 1.3 have no header.
                    legacyFormat = true;
                }
                else
                {
                    byte[] lengthBuf = new byte[sizeof(int)];

                    while (fs.Position < fs.Length)
                    {
                        if (FillFromStream(fs, lengthBuf) != lengthBuf.Length)
                        {
                            throw new EndOfStreamException("Dictionary file is truncated: " + fileName);
                        }

                        int length = BitConverter.ToInt32(lengthBuf, 0);

                        if (length < trailerSize)
                        {
                            throw new InvalidDataException("Invalid record length in dictionary file: " + fileName);
                        }

                        // Checked before allocating so a corrupt length cannot
                        // request an arbitrarily large buffer.
                        if (length > fs.Length - fs.Position)
                        {
                            throw new EndOfStreamException("Dictionary file is truncated: " + fileName);
                        }

                        byte[] buf = new byte[length];

                        if (FillFromStream(fs, buf) != buf.Length)
                        {
                            throw new EndOfStreamException("Dictionary file is truncated: " + fileName);
                        }

                        T_DictStruct dict = new T_DictStruct();

                        dict.Word      = Encoding.UTF8.GetString(buf, 0, length - trailerSize);
                        dict.Pos       = BitConverter.ToInt32(buf, length - trailerSize);
                        dict.Frequency = BitConverter.ToDouble(buf, length - sizeof(double));
                        dictFile.Dicts.Add(dict);
                    }
                }
            }

            // Delegate only after the stream above has been closed, so the legacy
            // loader can open the file itself.
            if (legacyFormat)
            {
                return(LoadFromBinFile(fileName));
            }

            return(dictFile);
        }

        /// <summary>
        /// Reads from <paramref name="stream"/> until <paramref name="buffer"/> is
        /// full or the end of the stream is reached; a single <c>Stream.Read</c>
        /// call may return fewer bytes than requested.
        /// </summary>
        /// <returns>The number of bytes actually placed in <paramref name="buffer"/>.</returns>
        private static int FillFromStream(Stream stream, byte[] buffer)
        {
            int total = 0;

            while (total < buffer.Length)
            {
                int read = stream.Read(buffer, total, buffer.Length - total);

                if (read <= 0)
                {
                    break;
                }

                total += read;
            }

            return total;
        }
예제 #9
0
        /// <summary>
        /// Adds a new word to the dictionary list and the lookup table.
        /// Does nothing when <c>GetWord</c> already finds the trimmed word.
        /// </summary>
        /// <param name="word">Word to insert; surrounding whitespace is trimmed first.</param>
        /// <param name="frequency">Initial frequency of the word.</param>
        /// <param name="pos">Part-of-speech value of the word.</param>
        public virtual void InsertWord(String word, double frequency, int pos)
        {
            String key = word.Trim();

            if (GetWord(key) != null)
            {
                return;
            }

            T_DictStruct entry = new T_DictStruct();

            entry.Word      = key;
            entry.Frequency = frequency;
            entry.Pos       = pos;

            _Dict.Dicts.Add(entry);
            _DictTbl[key] = entry;
        }
예제 #10
0
        /// <summary>
        /// Pre-segmentation: splits the input with the PATTERNS regular expressions,
        /// merges numbers / special English terms / e-mail addresses, and runs
        /// dictionary matching on segments that start with a CJK ideograph.
        /// </summary>
        /// <param name="str">The sentence to segment.</param>
        /// <returns>
        /// The pre-segmented words; an empty list when
        /// <c>CRegex.GetSingleMatchStrings</c> returns false.
        /// </returns>
        private List <String> PreSegment(String str)
        {
            ArrayList initSeg = new ArrayList();


            if (!CRegex.GetSingleMatchStrings(str, PATTERNS, true, ref initSeg))
            {
                return(new List <String>());
            }

            List <String> retWords = new List <String>();

            int i = 0;

            _ExtractWords.MatchDirection = MatchDirection;

            while (i < initSeg.Count)
            {
                String word = (String)initSeg[i];
                // An empty segment is replaced by a single space so word[0] below is always valid.
                if (word == "")
                {
                    word = " ";
                }

                // Merging needs a following segment, so it is only attempted before the last one.
                if (i < initSeg.Count - 1)
                {
                    bool mergeOk = false;
                    // Segment starts and ends with a digit.
                    // NOTE(review): both operands of each `||` are identical as written;
                    // possibly one side was meant to test full-width digits - confirm.
                    if (((word[0] >= '0' && word[0] <= '9') || (word[0] >= '0' && word[0] <= '9')) &&
                        ((word[word.Length - 1] >= '0' && word[word.Length - 1] <= '9') ||
                         (word[word.Length - 1] >= '0' && word[word.Length - 1] <= '9'))
                        )
                    {
                        // Merge into a floating-point number; MergeFloat receives i by ref
                        // (presumably to advance it past the merged segments - verify).
                        word    = MergeFloat(initSeg, i, ref i);
                        mergeOk = true;
                    }
                    else if ((word[0] >= 'a' && word[0] <= 'z') ||
                             (word[0] >= 'A' && word[0] <= 'Z')
                             )
                    {
                        // Try to merge into a special English technical term.
                        String specialEnglish = MergeEnglishSpecialWord(_ExtractWords, initSeg, i, ref i);

                        if (specialEnglish != null)
                        {
                            // No i++ on this path: i is passed by ref to the helper
                            // (presumably already advanced there - verify).
                            InsertWordToArray(specialEnglish, retWords);
                            continue;
                        }

                        // Merge into an e-mail address when the next segment starts with '@'.
                        if ((String)initSeg[i + 1] != "")
                        {
                            if (((String)initSeg[i + 1])[0] == '@')
                            {
                                word    = MergeEmail(initSeg, i, ref i);
                                mergeOk = true;
                            }
                        }
                    }

                    if (mergeOk)
                    {
                        InsertWordToArray(word, retWords);
                        continue;
                    }
                }


                // 0x4e00-0x9fa5 is the CJK ideograph range tested here.
                if (word[0] < 0x4e00 || word[0] > 0x9fa5)
                {
                    // English text or a symbol: add it as is.
                    InsertWordToArray(word, retWords);
                }
                else
                {
                    List <T_WordInfo> words = _ExtractWords.ExtractFullTextMaxMatch(word);
                    int  lastPos            = 0;     // index just past the last matched word
                    bool lstIsName          = false; // whether the previous word was a person name

                    foreach (T_WordInfo wordInfo in words)
                    {
                        if (lastPos < wordInfo.Position)
                        {
                            /*
                             *                          String unMatchWord = word.Substring(lastPos, wordInfo.Position - lastPos);
                             *
                             *                          InsertWordToArray(unMatchWord, retWords);
                             */
                            // Unmatched text between two matches: add its characters one by one.
                            for (int j = lastPos; j < wordInfo.Position; j++)
                            {
                                InsertWordToArray(word[j].ToString(), retWords);
                            }
                        }


                        lastPos = wordInfo.Position + wordInfo.Word.Length;

                        // Collect statistics on words that follow a Chinese person name (suffixes).
                        // NOTE(review): this branch calls AddBefore, the same method used for
                        // prefixes below - confirm that is intended.
                        if (AutoStudy && lstIsName)
                        {
                            T_DictStruct wordDict = (T_DictStruct)wordInfo.Tag;
                            if ((wordDict.Pos & (int)T_POS.POS_A_NR) == 0)
                            {
                                _MatchNameRule.AddBefore(wordInfo.Word);
                            }

                            lstIsName = false;
                        }

                        // Collect statistics on words that precede a Chinese person name (prefixes),
                        // e.g. "president", "chairman".
                        if ((((T_DictStruct)wordInfo.Tag).Pos & (int)T_POS.POS_A_NR) != 0)
                        {
                            if (wordInfo.Word.Length > 1 && wordInfo.Word.Length <= 4 && retWords.Count > 0 && AutoStudy && !lstIsName)
                            {
                                // NOTE(review): wordDict is assigned but never used in this branch.
                                T_DictStruct wordDict = (T_DictStruct)wordInfo.Tag;
                                _MatchNameRule.AddBefore(retWords[retWords.Count - 1]);
                            }

                            lstIsName = true;
                        }


                        InsertWordToArray(wordInfo.Word, retWords);
                    }

                    if (lastPos < word.Length)
                    {
                        // Unmatched text at the tail: add its characters one by one.
                        for (int j = lastPos; j < word.Length; j++)
                        {
                            InsertWordToArray(word[j].ToString(), retWords);
                        }

                        //InsertWordToArray(word.Substring(lastPos, word.Length - lastPos), retWords);
                    }
                }

                i++;
            }

            return(retWords);
        }