/// <summary>
/// Produces the next word. A word already waiting in <c>bufWord</c> is returned first;
/// otherwise input characters are read via <c>ReadNext()</c>, grouped by Unicode
/// category into words (or into a sentence that is then segmented), and the first
/// resulting word is returned.
/// </summary>
/// <returns>
/// The next <see cref="Word"/>, or <c>null</c> when no word is buffered and none could
/// be produced (for example because <c>ReadNext()</c> returned -1).
/// </returns>
public Word Next()
{
    // Serve a previously buffered word before touching the input.
    if (bufWord.Count > 0)
    {
        Word buffered = bufWord.Dequeue();
        if (buffered != null)
        {
            return buffered;
        }
    }

    bufSentence.Length = 0;

    bool keepReading = true;
    while (keepReading)
    {
        int ch = ReadNext();
        if (ch == -1)
        {
            break;
        }

        // One run of same-kind characters is normally enough for this call;
        // branches that discard the character switch the flag back on.
        keepReading = false;

        switch (char.GetUnicodeCategory((char)ch))
        {
            case UnicodeCategory.UppercaseLetter:
            case UnicodeCategory.LowercaseLetter:
            case UnicodeCategory.TitlecaseLetter:
            case UnicodeCategory.ModifierLetter:
            {
                // Ranges noted by the original author:
                //   0x410-0x44f  Russian letters
                //   0x391-0x3a9  Greek uppercase
                //   0x3b1-0x3c9  Greek lowercase
                ch = ToAscii(ch);
                NationLetter nation = GetNation(ch);
                if (nation == NationLetter.UNKNOW)
                {
                    // Unrecognised alphabet: drop the character and read on.
                    keepReading = true;
                }
                else
                {
                    string letterType = Word.TYPE_LETTER;
                    bufSentence.Append((char)ch);

                    if (nation == NationLetter.EN)
                    {
                        // Letters may be followed by digits, e.g. VH049PA.
                        ReadCharByAsciiOrDigit asciiOrDigit = new ReadCharByAsciiOrDigit();
                        ReadChars(bufSentence, asciiOrDigit);
                        if (asciiOrDigit.HasDigit)
                        {
                            letterType = Word.TYPE_LETTER_OR_DIGIT;
                        }
                    }
                    else if (nation == NationLetter.RA)
                    {
                        ReadChars(bufSentence, new ReadCharByRussia());
                    }
                    else if (nation == NationLetter.GE)
                    {
                        ReadChars(bufSentence, new ReadCharByGreece());
                    }

                    bufWord.Enqueue(CreateWord(bufSentence, letterType));
                    bufSentence.Length = 0;
                }
                break;
            }

            case UnicodeCategory.OtherLetter:
            {
                // Ranges noted by the original author:
                //   0x3041-0x30f6  Japanese hiragana / katakana
                //   0x3105-0x3129  Bopomofo (Zhuyin) symbols
                // The whole run becomes a sentence that is segmented after the loop.
                bufSentence.Append((char)ch);
                ReadChars(bufSentence, new ReadCharByType(UnicodeCategory.OtherLetter));
                currentSentence = CreateSentence(bufSentence);
                bufSentence.Length = 0;
                break;
            }

            case UnicodeCategory.DecimalDigitNumber:
            {
                bufSentence.Append((char)ToAscii(ch));
                // Collect the digits that follow.
                ReadChars(bufSentence, new ReadCharDigit());
                string digitType = Word.TYPE_DIGIT;

                int following = ReadNext();
                if (following >= 0)
                {
                    if (seg.IsUnit(following))
                    {
                        // A unit (e.g. a time unit) follows: emit the digits on their own,
                        // then start a new word with the unit character.
                        // NOTE(review): the "- 1" presumably compensates for the unit
                        // character having already been consumed - confirm against StartIdx.
                        bufWord.Enqueue(CreateWord(bufSentence, StartIdx(bufSentence) - 1, Word.TYPE_DIGIT));
                        bufSentence.Length = 0;
                        bufSentence.Append((char)following);
                        digitType = Word.TYPE_WORD;
                    }
                    else
                    {
                        // Letters and digits may follow; hand the character back first.
                        PushBack(following);
                        if (ReadChars(bufSentence, new ReadCharByAsciiOrDigit()) > 0)
                        {
                            digitType = Word.TYPE_DIGIT_OR_LETTER;
                        }
                    }
                }

                bufWord.Enqueue(CreateWord(bufSentence, digitType));
                bufSentence.Length = 0;
                break;
            }

            case UnicodeCategory.LetterNumber:
            {
                // Letter-like numerals such as Roman numerals: every character is its own word.
                bufSentence.Append((char)ch);
                ReadChars(bufSentence, new ReadCharByType(UnicodeCategory.LetterNumber));
                int firstIdx = StartIdx(bufSentence);
                for (int k = 0; k < bufSentence.Length; k++)
                {
                    char[] single = { bufSentence[k] };
                    bufWord.Enqueue(new Word(single, firstIdx + k, Word.TYPE_LETTER_NUMBER));
                }
                bufSentence.Length = 0;
                break;
            }

            case UnicodeCategory.OtherNumber:
            {
                // Enclosed / parenthesised numerals: a consecutive run stays one word.
                bufSentence.Append((char)ch);
                ReadChars(bufSentence, new ReadCharByType(UnicodeCategory.OtherNumber));
                bufWord.Enqueue(CreateWord(bufSentence, Word.TYPE_OTHER_NUMBER));
                bufSentence.Length = 0;
                break;
            }

            default:
                // Any other category is treated as an invalid character and skipped.
                keepReading = true;
                break;
        }
    }

    // Word segmentation of the pending sentence, if one was created above.
    if (currentSentence != null)
    {
        do
        {
            Chunk segmented = seg.Segment(currentSentence);
            for (int k = 0; k < segmented.Count; k++)
            {
                bufWord.Enqueue(segmented.Words[k]);
            }
        }
        while (!currentSentence.IsFinish);

        currentSentence = null;
    }

    return bufWord.Count > 0 ? bufWord.Dequeue() : null;
}
/// <summary>
/// Returns the next word: first from the <c>bufWord</c> queue, otherwise by reading
/// characters with <c>ReadNext()</c>, classifying them by Unicode category and
/// enqueuing the words that result.
/// </summary>
/// <returns>
/// The next <see cref="Word"/>, or <c>null</c> if the queue is still empty after
/// reading (for example when <c>ReadNext()</c> returned -1).
/// </returns>
public Word Next()
{
    Word result = bufWord.Count > 0 ? bufWord.Dequeue() : null;
    if (result != null)
    {
        return result;
    }

    bufSentence.Length = 0;

    int code = -1;
    bool skipping = true; // true while the character just read produced nothing
    while (skipping && (code = ReadNext()) != -1)
    {
        skipping = false;
        UnicodeCategory category = char.GetUnicodeCategory((char)code);

        bool isCasedOrModifierLetter =
            category == UnicodeCategory.UppercaseLetter
            || category == UnicodeCategory.LowercaseLetter
            || category == UnicodeCategory.TitlecaseLetter
            || category == UnicodeCategory.ModifierLetter;

        if (isCasedOrModifierLetter)
        {
            // Ranges noted by the original author:
            //   0x410-0x44f  Russian letters
            //   0x391-0x3a9  Greek uppercase
            //   0x3b1-0x3c9  Greek lowercase
            code = ToAscii(code);
            NationLetter nation = GetNation(code);
            if (nation == NationLetter.UNKNOW)
            {
                // Unknown alphabet: ignore this character and try the next one.
                skipping = true;
                continue;
            }

            string kind = Word.TYPE_LETTER;
            bufSentence.Append((char)code);
            switch (nation)
            {
                case NationLetter.EN:
                {
                    // Digits may trail the letters, e.g. VH049PA.
                    ReadCharByAsciiOrDigit reader = new ReadCharByAsciiOrDigit();
                    ReadChars(bufSentence, reader);
                    if (reader.HasDigit)
                    {
                        kind = Word.TYPE_LETTER_OR_DIGIT;
                    }
                    break;
                }

                case NationLetter.RA:
                    ReadChars(bufSentence, new ReadCharByRussia());
                    break;

                case NationLetter.GE:
                    ReadChars(bufSentence, new ReadCharByGreece());
                    break;
            }

            bufWord.Enqueue(CreateWord(bufSentence, kind));
            bufSentence.Length = 0;
        }
        else if (category == UnicodeCategory.OtherLetter)
        {
            // Ranges noted by the original author:
            //   0x3041-0x30f6  Japanese hiragana / katakana
            //   0x3105-0x3129  Bopomofo (Zhuyin) symbols
            // The run is turned into a sentence and segmented once the loop ends.
            bufSentence.Append((char)code);
            ReadChars(bufSentence, new ReadCharByType(UnicodeCategory.OtherLetter));
            currentSentence = CreateSentence(bufSentence);
            bufSentence.Length = 0;
        }
        else if (category == UnicodeCategory.DecimalDigitNumber)
        {
            bufSentence.Append((char)ToAscii(code));
            // Read the remaining digits of the run.
            ReadChars(bufSentence, new ReadCharDigit());
            string kind = Word.TYPE_DIGIT;

            int lookahead = ReadNext();
            if (lookahead > -1)
            {
                if (!seg.IsUnit(lookahead))
                {
                    // Possibly followed by letters and digits: un-read the character
                    // and let the ASCII/digit reader extend the current word.
                    PushBack(lookahead);
                    int extra = ReadChars(bufSentence, new ReadCharByAsciiOrDigit());
                    if (extra > 0)
                    {
                        kind = Word.TYPE_DIGIT_OR_LETTER;
                    }
                }
                else
                {
                    // A unit character (e.g. for time): the number is emitted as its own
                    // word and the unit starts a new one.
                    // NOTE(review): "- 1" presumably accounts for the unit character
                    // already having been read - verify against StartIdx.
                    bufWord.Enqueue(CreateWord(bufSentence, StartIdx(bufSentence) - 1, Word.TYPE_DIGIT));
                    bufSentence.Length = 0;
                    bufSentence.Append((char)lookahead);
                    kind = Word.TYPE_WORD;
                }
            }

            bufWord.Enqueue(CreateWord(bufSentence, kind));
            bufSentence.Length = 0;
        }
        else if (category == UnicodeCategory.LetterNumber)
        {
            // Letter-like numerals (e.g. Roman numerals) are split one character per word.
            bufSentence.Append((char)code);
            ReadChars(bufSentence, new ReadCharByType(UnicodeCategory.LetterNumber));
            int position = StartIdx(bufSentence);
            int index = 0;
            while (index < bufSentence.Length)
            {
                char[] single = new char[] { bufSentence[index] };
                bufWord.Enqueue(new Word(single, position, Word.TYPE_LETTER_NUMBER));
                position++;
                index++;
            }
            bufSentence.Length = 0;
        }
        else if (category == UnicodeCategory.OtherNumber)
        {
            // Enclosed / parenthesised numerals are kept together as a single word.
            bufSentence.Append((char)code);
            ReadChars(bufSentence, new ReadCharByType(UnicodeCategory.OtherNumber));
            bufWord.Enqueue(CreateWord(bufSentence, Word.TYPE_OTHER_NUMBER));
            bufSentence.Length = 0;
        }
        else
        {
            // Everything else counts as an invalid character; keep reading.
            skipping = true;
        }
    }

    // Segment the pending sentence (if any) and queue every word of every chunk.
    if (currentSentence != null)
    {
        do
        {
            Chunk piece = seg.Segment(currentSentence);
            for (int n = 0; n < piece.Count; n++)
            {
                bufWord.Enqueue(piece.Words[n]);
            }
        }
        while (!currentSentence.IsFinish);

        currentSentence = null;
    }

    if (bufWord.Count == 0)
    {
        return null;
    }

    return bufWord.Dequeue();
}