Пример #1
0
        /// <summary>
        /// Returns the next word from the word queue, scanning more input to
        /// refill the queue when it is empty.
        /// </summary>
        /// <returns>
        /// The word taken from the head of <c>bufWord</c>, or <c>null</c> when
        /// the queue is still empty after scanning (for example when
        /// <c>ReadNext</c> reports -1 before any word was produced).
        /// </returns>
        public Word Next()
        {
            // Serve a previously queued word first, if there is one.
            Word result = bufWord.Count > 0 ? bufWord.Dequeue() : null;

            if (result == null)
            {
                bufSentence.Length = 0;

                // Keep reading single characters until one of them starts a
                // token (or a sentence); ignored characters leave the flag unset.
                bool tokenStarted = false;
                while (!tokenStarted)
                {
                    int ch = ReadNext();
                    if (ch == -1)
                    {
                        break;
                    }

                    tokenStarted = true;
                    UnicodeCategory category = char.GetUnicodeCategory((char)ch);

                    // Dispatch on the Unicode category of the character just read.
                    switch (category)
                    {
                    case UnicodeCategory.UppercaseLetter:
                    case UnicodeCategory.LowercaseLetter:
                    case UnicodeCategory.TitlecaseLetter:
                    case UnicodeCategory.ModifierLetter:
                    {
                        /*
                         * Letter ranges noted by the original author:
                         * 1. 0x410-0x44f -> Russian (Cyrillic) letters
                         * 2. 0x391-0x3a9 -> Greek uppercase
                         * 3. 0x3b1-0x3c9 -> Greek lowercase
                         */
                        ch = ToAscii(ch);
                        NationLetter nation = GetNation(ch);
                        if (nation == NationLetter.UNKNOW)
                        {
                            // Unrecognised alphabet: skip the character and keep scanning.
                            tokenStarted = false;
                            break;
                        }

                        string letterKind = Word.TYPE_LETTER;
                        bufSentence.Append((char)ch);

                        if (nation == NationLetter.EN)
                        {
                            // Digits may follow the letters, e.g. VH049PA.
                            ReadCharByAsciiOrDigit asciiReader = new ReadCharByAsciiOrDigit();
                            ReadChars(bufSentence, asciiReader);
                            if (asciiReader.HasDigit)
                            {
                                letterKind = Word.TYPE_LETTER_OR_DIGIT;
                            }
                        }
                        else if (nation == NationLetter.RA)
                        {
                            ReadChars(bufSentence, new ReadCharByRussia());
                        }
                        else if (nation == NationLetter.GE)
                        {
                            ReadChars(bufSentence, new ReadCharByGreece());
                        }

                        bufWord.Enqueue(CreateWord(bufSentence, letterKind));
                        bufSentence.Length = 0;
                        break;
                    }

                    case UnicodeCategory.OtherLetter:
                    {
                        /*
                         * Ranges noted by the original author:
                         * 1. 0x3041-0x30f6 -> Japanese hiragana / katakana
                         * 2. 0x3105-0x3129 -> Bopomofo (Zhuyin) symbols
                         */
                        bufSentence.Append((char)ch);
                        ReadChars(bufSentence, new ReadCharByType(UnicodeCategory.OtherLetter));

                        // The collected run becomes the sentence that is segmented below.
                        currentSentence    = CreateSentence(bufSentence);
                        bufSentence.Length = 0;
                        break;
                    }

                    case UnicodeCategory.DecimalDigitNumber:
                    {
                        bufSentence.Append((char)ToAscii(ch));
                        // Read the digits that follow.
                        ReadChars(bufSentence, new ReadCharDigit());
                        string numberKind = Word.TYPE_DIGIT;

                        int following = ReadNext();
                        if (following > -1)
                        {
                            if (!seg.IsUnit(following))
                            {
                                // Letters and digits may follow the number.
                                PushBack(following);
                                if (ReadChars(bufSentence, new ReadCharByAsciiOrDigit()) > 0)
                                {
                                    numberKind = Word.TYPE_DIGIT_OR_LETTER;
                                }
                            }
                            else
                            {
                                // A unit (e.g. a time unit): queue the number on its own,
                                // then start a new word with the unit character.
                                // NOTE(review): the "- 1" presumably compensates for the
                                // unit character already having been read - confirm.
                                bufWord.Enqueue(CreateWord(bufSentence, StartIdx(bufSentence) - 1, Word.TYPE_DIGIT));
                                bufSentence.Length = 0;
                                bufSentence.Append((char)following);
                                numberKind = Word.TYPE_WORD;
                            }
                        }

                        bufWord.Enqueue(CreateWord(bufSentence, numberKind));
                        bufSentence.Length = 0;
                        break;
                    }

                    case UnicodeCategory.LetterNumber:
                    {
                        // Letter-numbers such as Roman numerals are split one word per character.
                        bufSentence.Append((char)ch);
                        ReadChars(bufSentence, new ReadCharByType(UnicodeCategory.LetterNumber));

                        int firstIndex = StartIdx(bufSentence);
                        for (int offset = 0; offset < bufSentence.Length; offset++)
                        {
                            bufWord.Enqueue(new Word(new char[] { bufSentence[offset] }, firstIndex + offset, Word.TYPE_LETTER_NUMBER));
                        }
                        bufSentence.Length = 0;
                        break;
                    }

                    case UnicodeCategory.OtherNumber:
                    {
                        // Enclosed / parenthesised numbers are kept together as one word.
                        bufSentence.Append((char)ch);
                        ReadChars(bufSentence, new ReadCharByType(UnicodeCategory.OtherNumber));
                        bufWord.Enqueue(CreateWord(bufSentence, Word.TYPE_OTHER_NUMBER));
                        bufSentence.Length = 0;
                        break;
                    }

                    default:
                        // Anything else is treated as an invalid character and skipped.
                        tokenStarted = false;
                        break;
                    }
                }

                // Word segmentation of the pending sentence: queue the words of
                // every chunk until the sentence reports that it is finished.
                if (currentSentence != null)
                {
                    while (true)
                    {
                        Chunk segment = seg.Segment(currentSentence);
                        for (int w = 0; w < segment.Count; w++)
                        {
                            bufWord.Enqueue(segment.Words[w]);
                        }

                        if (currentSentence.IsFinish)
                        {
                            break;
                        }
                    }
                    currentSentence = null;
                }

                if (bufWord.Count > 0)
                {
                    result = bufWord.Dequeue();
                }
            }

            return result;
        }