示例#1
0
        /// <summary>
        /// Loads a word / part-of-speech dictionary from a text file.
        /// </summary>
        /// <remarks>
        /// The file content is obtained through <c>CFile.ReadFileToString</c> with the
        /// <c>"utf-8"</c> encoding name and split into lines. Each line is expected to look like
        /// <c>word|pos</c>, where <c>pos</c> is an integer. Lines that do not split into exactly
        /// two fields, or whose second field is not a valid integer, are silently skipped.
        /// </remarks>
        /// <param name="fileName">Path of the dictionary text file to read.</param>
        /// <returns>
        /// A new <see cref="WordPosCollection"/> whose <c>WordPosList</c> holds one entry per valid line.
        /// </returns>
        public WordPosCollection LoadFromTextDict(String fileName)
        {
            WordPosCollection dictFile = new WordPosCollection();

            String dictStr = CFile.ReadFileToString(fileName, "utf-8");

            // Accept both CRLF and bare LF line endings. Splitting on CRLF only would turn an
            // LF-terminated file into a single "line" that fails the two-field check below.
            String[] lines = CRegex.Split(dictStr, @"\r?\n");

            // Characters whose presence in a word causes the POS_A_M flag to be added.
            char[] numeralChars = { '一', '二', '三', '四', '五', '六', '七', '八', '九', '十' };

            foreach (String line in lines)
            {
                String[] fields = CRegex.Split(line, @"\|");

                if (fields == null || fields.Length != 2)
                {
                    continue;
                }

                // TryParse rejects the same inputs int.Parse would throw on (null, malformed,
                // out of range) without using an exception as control flow.
                int pos;
                if (!int.TryParse(fields[1], out pos))
                {
                    continue;
                }

                WordPos dict = new WordPos(fields[0], pos);

                if (dict.Word.IndexOfAny(numeralChars) >= 0)
                {
                    // NOTE(review): POS_A_M is presumably the numeral part of speech — confirm against PosEnum.
                    dict.Pos |= (int)PosEnum.POS_A_M;
                }

                // Special case: this exact word has its Pos overwritten, not OR-ed.
                if (dict.Word == "字典")
                {
                    dict.Pos = (int)PosEnum.POS_D_N;
                }

                dictFile.WordPosList.Add(dict);
            }

            return dictFile;
        }
示例#2
0
        /// <summary>
        /// Pre-segments a sentence into a list of words.
        /// </summary>
        /// <remarks>
        /// The input is first cut into chunks by <c>CRegex.GetSingleMatchStrings</c> using
        /// <c>PATTERNS</c>. Each chunk is then handled in one of three ways:
        /// a chunk that starts and ends with an ASCII digit is passed to <c>MergeFloat</c>;
        /// a chunk that starts with an ASCII letter and is followed by a chunk starting with
        /// <c>'@'</c> is passed to <c>MergeEmail</c>; any other chunk is either emitted as-is
        /// (first character outside U+4E00..U+9FA5) or split using the dictionary matches from
        /// <c>WordSegService.ExtractInfo.ExtractFullTextMaxMatch</c>.
        /// As a side effect, <c>WordSegService.ExtractInfo.MatchDirection</c> is set from
        /// <c>MatchDirection</c> before processing.
        /// </remarks>
        /// <param name="str">The sentence to segment.</param>
        /// <returns>
        /// The pre-segmented words, or an empty list when <c>CRegex.GetSingleMatchStrings</c>
        /// returns <c>false</c>.
        /// </returns>
        private List<string> PreSegment(String str)
        {
            // Bounds of the character range that is routed to dictionary matching.
            const char CjkFirst = '\u4e00';
            const char CjkLast  = '\u9fa5';

            List<string> initSeg = new List<string>();

            // Return early if the input contains no digit string, date, English letters or
            // Chinese characters; otherwise it has been split into multiple chunks.
            if (!CRegex.GetSingleMatchStrings(str, PATTERNS, true, ref initSeg))
            {
                return new List<string>();
            }

            List<string> retWords = new List<string>();

            WordSegService.ExtractInfo.MatchDirection = MatchDirection;

            int i = 0;
            while (i < initSeg.Count)
            {
                String word = initSeg[i];
                if (word == "")
                {
                    // Substitute a single space so the first/last character lookups below are safe.
                    word = " ";
                }

                char first = word[0];

                // Merging needs a following chunk, so it is only attempted before the last one.
                if (i < initSeg.Count - 1)
                {
                    char last    = word[word.Length - 1];
                    bool mergeOk = false;

                    // NOTE(review): the original repeated each ASCII digit range test twice
                    // (a || a). The duplicate was possibly meant to cover full-width digits
                    // (U+FF10..U+FF19) — confirm before extending; behavior is unchanged here.
                    if (first >= '0' && first <= '9' && last >= '0' && last <= '9')
                    {
                        // Chunk starts and ends with a digit.
                        word    = MergeFloat(initSeg, i, ref i);
                        mergeOk = true;
                    }
                    else if ((first >= 'a' && first <= 'z') || (first >= 'A' && first <= 'Z'))
                    {
                        String next = initSeg[i + 1];
                        if (next != "" && next[0] == '@')
                        {
                            word    = MergeEmail(initSeg, i, ref i);
                            mergeOk = true;
                        }
                    }

                    if (mergeOk)
                    {
                        InsertWordToArray(word, retWords);

                        // i is not incremented on this path; the merge helpers receive it by ref
                        // and presumably advance it past the merged chunks — verify in MergeFloat/MergeEmail.
                        continue;
                    }
                }

                if (first < CjkFirst || first > CjkLast)
                {
                    InsertWordToArray(word, retWords);
                }
                else
                {
                    List<WordInfo> matches = WordSegService.ExtractInfo.ExtractFullTextMaxMatch(word);
                    int            lastPos = 0;

                    foreach (WordInfo wordInfo in matches)
                    {
                        // Emit every unmatched character before this match as a one-character word.
                        for (int j = lastPos; j < wordInfo.Position; j++)
                        {
                            InsertWordToArray(word[j].ToString(), retWords);
                        }

                        lastPos = wordInfo.Position + wordInfo.Word.Length;
                        InsertWordToArray(wordInfo.Word, retWords);
                    }

                    // Emit whatever remains after the last match as a single word.
                    if (lastPos < word.Length)
                    {
                        InsertWordToArray(word.Substring(lastPos), retWords);
                    }
                }

                i++;
            }

            return retWords;
        }