Ejemplo n.º 1
0
        /// <summary>
        /// Loads a dictionary from a text file.
        /// </summary>
        /// <remarks>
        /// The file content is obtained from <c>CFile.ReadFileToString</c> with the encoding name "utf-8"
        /// and split into lines with the pattern "\r\n". Each line is split on '|' and must yield exactly
        /// two fields: the word and an integer part-of-speech value. Lines that do not have exactly two
        /// fields, or whose second field is not a valid integer, are skipped.
        /// </remarks>
        /// <param name="fileName">Path of the text dictionary file.</param>
        /// <returns>A <c>T_DictFile</c> whose <c>Dicts</c> list holds one entry per accepted line.</returns>
        static public T_DictFile LoadFromTextDict(String fileName)
        {
            T_DictFile dictFile = new T_DictFile();

            String dictStr = CFile.ReadFileToString(fileName, "utf-8");

            String[] words = CRegex.Split(dictStr, "\r\n");

            // Chinese numeral characters one to ten; a word containing any of them gets POS_A_M added.
            char[] numeralChars = { '一', '二', '三', '四', '五', '六', '七', '八', '九', '十' };

            foreach (String word in words)
            {
                String[] wp = CRegex.Split(word, @"\|");

                // Expect exactly "word|pos".
                if (wp == null || wp.Length != 2)
                {
                    continue;
                }

                // TryParse accepts the same inputs as int.Parse but does not throw on malformed lines.
                int pos;

                if (!int.TryParse(wp[1], out pos))
                {
                    continue;
                }

                T_DictStruct dict = new T_DictStruct();
                dict.Word = wp[0];
                dict.Pos  = pos;

                if (dict.Word.IndexOfAny(numeralChars) >= 0)
                {
                    dict.Pos |= (int)T_POS.POS_A_M;
                }

                // Special case: this one word always has its part of speech replaced by POS_D_N.
                if (dict.Word == "字典")
                {
                    dict.Pos = (int)T_POS.POS_D_N;
                }

                dictFile.Dicts.Add(dict);
            }

            return(dictFile);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Loads a dictionary from a binary file that starts with a "KTDictSeg Dict V..." version header.
        /// Files without that header are handed to <c>LoadFromBinFile</c> instead.
        /// </summary>
        /// <remarks>
        /// Layout read by this method: a 32-byte UTF-8 header, followed by records of the form
        /// [int32 length][UTF-8 word bytes][int32 pos][double frequency], where <c>length</c> counts
        /// every byte after the length prefix. The file's attributes are set to
        /// <see cref="FileAttributes.Normal"/> before it is opened.
        /// </remarks>
        /// <param name="fileName">Path of the binary dictionary file.</param>
        /// <returns>The loaded dictionary.</returns>
        /// <exception cref="EndOfStreamException">The file ends in the middle of a record.</exception>
        /// <exception cref="InvalidDataException">
        /// A record length is smaller than the fixed pos/frequency trailer or larger than the rest of the file.
        /// </exception>
        static public T_DictFile LoadFromBinFileEx(string fileName)
        {
            // Every record ends with an int32 pos followed by a double frequency.
            const int TrailerSize = sizeof(int) + sizeof(double);

            T_DictFile dictFile = new T_DictFile();

            dictFile.Dicts = new List <T_DictStruct>();

            File.SetAttributes(fileName, FileAttributes.Normal);

            bool legacyFormat;

            // 'using' guarantees the stream is released even when a read or decode step throws.
            using (FileStream fs = new FileStream(fileName, FileMode.Open))
            {
                // A file shorter than the header leaves the tail of the buffer zeroed; the whole
                // buffer is still decoded so header detection behaves as it did before.
                byte[] version = new byte[32];
                FillBuffer(fs, version);
                String ver = Encoding.UTF8.GetString(version, 0, version.Length);

                String verNumStr = CRegex.GetMatch(ver, "KTDictSeg Dict V(.+)", true);

                // No version header: file written by a version before 1.3.
                legacyFormat = String.IsNullOrEmpty(verNumStr);

                if (!legacyFormat)
                {
                    byte[] lengthBuf = new byte[sizeof(int)];

                    while (fs.Position < fs.Length)
                    {
                        if (FillBuffer(fs, lengthBuf) != lengthBuf.Length)
                        {
                            throw new EndOfStreamException("Dictionary file ends inside a record length prefix: " + fileName);
                        }

                        int length = BitConverter.ToInt32(lengthBuf, 0);

                        // Reject lengths that cannot hold the trailer or that run past the end of the
                        // file, before allocating a buffer of that size.
                        if (length < TrailerSize || length > fs.Length - fs.Position)
                        {
                            throw new InvalidDataException("Invalid record length " + length + " in dictionary file: " + fileName);
                        }

                        byte[] buf = new byte[length];

                        if (FillBuffer(fs, buf) != buf.Length)
                        {
                            throw new EndOfStreamException("Dictionary file ends inside a record: " + fileName);
                        }

                        T_DictStruct dict = new T_DictStruct();

                        dict.Word      = Encoding.UTF8.GetString(buf, 0, length - TrailerSize);
                        dict.Pos       = BitConverter.ToInt32(buf, length - TrailerSize);
                        dict.Frequency = BitConverter.ToDouble(buf, length - sizeof(double));
                        dictFile.Dicts.Add(dict);
                    }
                }
            }

            // The stream is closed at this point, so the legacy loader can reopen the file.
            if (legacyFormat)
            {
                return(LoadFromBinFile(fileName));
            }

            return(dictFile);
        }

        /// <summary>
        /// Reads from <paramref name="stream"/> until <paramref name="buffer"/> is full or the stream ends.
        /// A single <c>Stream.Read</c> call is allowed to return fewer bytes than requested.
        /// </summary>
        /// <param name="stream">Stream to read from.</param>
        /// <param name="buffer">Buffer to fill from index 0.</param>
        /// <returns>The number of bytes actually stored in <paramref name="buffer"/>.</returns>
        static private int FillBuffer(Stream stream, byte[] buffer)
        {
            int total = 0;

            while (total < buffer.Length)
            {
                int read = stream.Read(buffer, total, buffer.Length - total);

                if (read <= 0)
                {
                    break;
                }

                total += read;
            }

            return(total);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Pre-segments a sentence into a list of words.
        /// </summary>
        /// <remarks>
        /// The input is first split by <c>CRegex.GetSingleMatchStrings</c> using <c>PATTERNS</c>. Each piece is
        /// then handled in one of three ways: merged with following pieces (number, English special word or
        /// e-mail address), appended unchanged when its first character lies outside U+4E00..U+9FA5, or passed
        /// to <c>_ExtractWords.ExtractFullTextMaxMatch</c>, with characters not covered by any match appended
        /// one at a time. When <c>AutoStudy</c> is set, words next to entries flagged <c>POS_A_NR</c> are
        /// reported to <c>_MatchNameRule.AddBefore</c>.
        /// </remarks>
        /// <param name="str">The sentence to segment.</param>
        /// <returns>The pre-segmented words; an empty list when the initial split reports failure.</returns>
        private List <String> PreSegment(String str)
        {
            ArrayList initSeg = new ArrayList();


            // Initial split; a false return means there is nothing to segment.
            if (!CRegex.GetSingleMatchStrings(str, PATTERNS, true, ref initSeg))
            {
                return(new List <String>());
            }

            List <String> retWords = new List <String>();

            int i = 0;

            _ExtractWords.MatchDirection = MatchDirection;

            while (i < initSeg.Count)
            {
                String word = (String)initSeg[i];
                // Replace an empty piece with a single space so word[0] can be indexed below.
                if (word == "")
                {
                    word = " ";
                }

                // Merging needs at least one following piece.
                if (i < initSeg.Count - 1)
                {
                    bool mergeOk = false;
                    // Piece starts and ends with a digit.
                    // NOTE(review): both sides of each || below are identical as written; the second
                    // clause may have been meant to test a different digit range (e.g. full-width digits) - confirm.
                    if (((word[0] >= '0' && word[0] <= '9') || (word[0] >= '0' && word[0] <= '9')) &&
                        ((word[word.Length - 1] >= '0' && word[word.Length - 1] <= '9') ||
                         (word[word.Length - 1] >= '0' && word[word.Length - 1] <= '9'))
                        )
                    {
                        // Merge into a floating-point number; i is passed by ref and may be changed by MergeFloat.
                        word    = MergeFloat(initSeg, i, ref i);
                        mergeOk = true;
                    }
                    else if ((word[0] >= 'a' && word[0] <= 'z') ||
                             (word[0] >= 'A' && word[0] <= 'Z')
                             )
                    {
                        // Try to merge into an English technical term.
                        String specialEnglish = MergeEnglishSpecialWord(_ExtractWords, initSeg, i, ref i);

                        if (specialEnglish != null)
                        {
                            InsertWordToArray(specialEnglish, retWords);
                            // Skips the i++ at the bottom of the loop; presumably the helper already
                            // advanced i through the ref argument - TODO confirm.
                            continue;
                        }

                        // Merge an e-mail address when the next piece starts with '@'.
                        if ((String)initSeg[i + 1] != "")
                        {
                            if (((String)initSeg[i + 1])[0] == '@')
                            {
                                word    = MergeEmail(initSeg, i, ref i);
                                mergeOk = true;
                            }
                        }
                    }

                    if (mergeOk)
                    {
                        InsertWordToArray(word, retWords);
                        // As above, this bypasses the trailing i++; i was handed to the merge helper by ref.
                        continue;
                    }
                }


                if (word[0] < 0x4e00 || word[0] > 0x9fa5)
                {
                    // English text or a symbol: add it directly.
                    InsertWordToArray(word, retWords);
                }
                else
                {
                    List <T_WordInfo> words = _ExtractWords.ExtractFullTextMaxMatch(word);
                    int  lastPos            = 0;     // index in word just past the last matched word
                    bool lstIsName          = false; // the previous word was a person name

                    foreach (T_WordInfo wordInfo in words)
                    {
                        if (lastPos < wordInfo.Position)
                        {
                            /*
                             *                          String unMatchWord = word.Substring(lastPos, wordInfo.Position - lastPos);
                             *
                             *                          InsertWordToArray(unMatchWord, retWords);
                             */
                            // Unmatched text between two matches: add its characters one by one.
                            for (int j = lastPos; j < wordInfo.Position; j++)
                            {
                                InsertWordToArray(word[j].ToString(), retWords);
                            }
                        }


                        lastPos = wordInfo.Position + wordInfo.Word.Length;

                        // Collect statistics on words that follow a Chinese person name (suffixes).
                        // NOTE(review): this branch calls AddBefore, the same method as the prefix
                        // branch below - confirm that is intended.
                        if (AutoStudy && lstIsName)
                        {
                            T_DictStruct wordDict = (T_DictStruct)wordInfo.Tag;
                            if ((wordDict.Pos & (int)T_POS.POS_A_NR) == 0)
                            {
                                _MatchNameRule.AddBefore(wordInfo.Word);
                            }

                            lstIsName = false;
                        }

                        // Collect statistics on words that precede a Chinese person name (prefixes),
                        // e.g. "president", "chairman".
                        if ((((T_DictStruct)wordInfo.Tag).Pos & (int)T_POS.POS_A_NR) != 0)
                        {
                            if (wordInfo.Word.Length > 1 && wordInfo.Word.Length <= 4 && retWords.Count > 0 && AutoStudy && !lstIsName)
                            {
                                // NOTE(review): wordDict is assigned but never read in this branch.
                                T_DictStruct wordDict = (T_DictStruct)wordInfo.Tag;
                                _MatchNameRule.AddBefore(retWords[retWords.Count - 1]);
                            }

                            lstIsName = true;
                        }


                        InsertWordToArray(wordInfo.Word, retWords);
                    }

                    if (lastPos < word.Length)
                    {
                        // Unmatched text at the tail: add its characters one by one.
                        for (int j = lastPos; j < word.Length; j++)
                        {
                            InsertWordToArray(word[j].ToString(), retWords);
                        }

                        //InsertWordToArray(word.Substring(lastPos, word.Length - lastPos), retWords);
                    }
                }

                i++;
            }

            return(retWords);
        }