/// <summary>
/// Points this instance at a new input and discards the state left over from the previous one.
/// </summary>
/// <param name="input">The text reader to consume next; it is wrapped in a new <c>PushbackReader</c>.</param>
public void reset(TextReader input)
{
    reader = new PushbackReader(input);

    // Forget the pending sentence and every word that was queued but not yet returned.
    currentSentence = null;
    bufWord = new Queue<Word>();
    bufSentence.Length = 0;

    // Back to the initial read index (presumably "nothing read yet" - confirm against ReadNext).
    readedIdx = -1;
}
/// <summary>
/// Segments <paramref name="sen"/> with the base implementation and then re-splits every
/// resulting word of three or more characters into two-character and single-character pieces.
/// </summary>
/// <param name="sen">The sentence to segment.</param>
/// <returns>
/// The chunk produced by the base implementation with its word list replaced by the
/// finer-grained pieces, or null when the base implementation returned null.
/// </returns>
public override Chunk Segment(Sentence sen)
{
    Chunk chunk = base.Segment(sen);
    if (chunk == null)
    {
        return null;
    }

    List<Word> pieces = new List<Word>();
    for (int w = 0; w < chunk.Count; w++)
    {
        Word current = chunk.Words[w];

        // Words shorter than three characters are kept unchanged.
        if (current.Length < 3)
        {
            pieces.Add(current);
            continue;
        }

        char[] text = current.Sen;
        int pos = current.WordOffset;
        int limit = current.WordOffset + current.Length;

        // Position of the sentence within the file.
        int senStart = current.StartOffset - pos;

        // End position of the most recently emitted piece; -1 until something has been emitted.
        int covered = -1;

        while (pos < limit - 1)
        {
            // A result above -1 is treated as "the two characters starting at pos are a word"
            // (presumably a dictionary lookup - confirm against Search).
            if (Search(text, pos, 1) > -1)
            {
                pieces.Add(new Word(text, senStart, pos, 2));
                covered = pos + 2;
            }
            else if (pos >= covered)
            {
                // Not covered by the previous piece: emit it as a single character.
                pieces.Add(new Word(text, senStart, pos, 1));
                covered = pos + 1;
            }

            pos++;
        }

        // pos now sits on the last character of the word; emit it when no piece reached it.
        if (covered > -1 && covered < limit)
        {
            pieces.Add(new Word(text, senStart, pos, 1));
        }
    }

    chunk.Words = pieces.ToArray();
    chunk.Count = pieces.Count;
    return chunk;
}
/// <summary>
/// Takes up to three words from the current position of <paramref name="sen"/>, each one being
/// the match reported by the dictionary at that position, and advances the sentence offset
/// past every word taken.
/// </summary>
/// <param name="sen">The sentence to segment; its <c>Offset</c> is moved forward.</param>
/// <returns>A new chunk holding the words that were taken (possibly none).</returns>
public override Chunk Segment(Sentence sen)
{
    Chunk chunk = new Chunk();
    char[] chs = sen.Text;

    int slot = 0;
    while (slot < 3 && !sen.IsFinish)
    {
        int start = sen.Offset;

        // With the key tree available, the max match can start from the head.
        // maxMatch's result is one less than the word length used below.
        int wordLength = dic.maxMatch(chs, start) + 1;

        chunk.Words[slot] = new Word(chs, sen.StartOffset, start, wordLength);
        sen.Offset = start + wordLength;
        slot++;
    }

    return chunk;
}
/// <summary>
/// Performs word segmentation on the sentence <paramref name="sen"/>.
/// </summary>
/// <param name="sen">The sentence to segment.</param>
/// <returns>
/// The chunk produced for the sentence.
/// NOTE(review): at least one override in this code base can return null - callers should confirm
/// whether they need to handle that.
/// </returns>
public abstract Chunk Segment(Sentence sen);
/// <summary>
/// Returns the next word of the input, or null when no further word can be produced.
/// </summary>
/// <remarks>
/// A word already waiting in the queue is returned first. Otherwise characters are read until one
/// of them starts a token; that token is queued (a run of <see cref="UnicodeCategory.OtherLetter"/>
/// characters is handed to the segmenter and all resulting words are queued), and the head of
/// the queue is returned.
/// </remarks>
/// <returns>The next word, or null if the queue is still empty after reading.</returns>
public Word Next()
{
    // Serve a word left over from an earlier call when there is one.
    if (bufWord.Count > 0)
    {
        Word buffered = bufWord.Dequeue();
        if (buffered != null)
        {
            return buffered;
        }
    }

    bufSentence.Length = 0;

    // Stays true for as long as the characters read so far produced nothing.
    bool keepReading = true;
    while (keepReading)
    {
        int data = ReadNext();
        if (data == -1)
        {
            break;
        }

        keepReading = false;
        switch (char.GetUnicodeCategory((char)data))
        {
            case UnicodeCategory.UppercaseLetter:
            case UnicodeCategory.LowercaseLetter:
            case UnicodeCategory.TitlecaseLetter:
            case UnicodeCategory.ModifierLetter:
            {
                /*
                 * 1. 0x410-0x44f -> А-я   Russian
                 * 2. 0x391-0x3a9 -> Α-Ω   Greek upper case
                 * 3. 0x3b1-0x3c9 -> α-ω   Greek lower case
                 */
                data = ToAscii(data);
                NationLetter nation = GetNation(data);
                if (nation == NationLetter.UNKNOW)
                {
                    // Unrecognised alphabet: skip this character and read on.
                    keepReading = true;
                    break;
                }

                string letterType = Word.TYPE_LETTER;
                bufSentence.Append((char)data);

                if (nation == NationLetter.EN)
                {
                    // Letters may be followed by digits, e.g. VH049PA.
                    ReadCharByAsciiOrDigit asciiOrDigit = new ReadCharByAsciiOrDigit();
                    ReadChars(bufSentence, asciiOrDigit);
                    if (asciiOrDigit.HasDigit)
                    {
                        letterType = Word.TYPE_LETTER_OR_DIGIT;
                    }
                }
                else if (nation == NationLetter.RA)
                {
                    ReadChars(bufSentence, new ReadCharByRussia());
                }
                else if (nation == NationLetter.GE)
                {
                    ReadChars(bufSentence, new ReadCharByGreece());
                }

                bufWord.Enqueue(CreateWord(bufSentence, letterType));
                bufSentence.Length = 0;
                break;
            }

            case UnicodeCategory.OtherLetter:
            {
                /*
                 * 1. 0x3041-0x30f6 -> ぁ-ヶ   Japanese hiragana / katakana
                 * 2. 0x3105-0x3129 -> ㄅ-ㄩ   Bopomofo (phonetic symbols)
                 */
                bufSentence.Append((char)data);
                ReadChars(bufSentence, new ReadCharByType(UnicodeCategory.OtherLetter));

                // The whole run becomes the sentence that is segmented after the loop.
                currentSentence = CreateSentence(bufSentence);
                bufSentence.Length = 0;
                break;
            }

            case UnicodeCategory.DecimalDigitNumber:
            {
                bufSentence.Append((char)ToAscii(data));

                // Collect the digits that follow.
                ReadChars(bufSentence, new ReadCharDigit());
                string digitType = Word.TYPE_DIGIT;

                int following = ReadNext();
                if (following > -1)
                {
                    if (seg.IsUnit(following))
                    {
                        // A unit (e.g. of time): queue the number by itself, then start a
                        // separate word consisting of the unit character.
                        bufWord.Enqueue(CreateWord(bufSentence, StartIdx(bufSentence) - 1, Word.TYPE_DIGIT));
                        bufSentence.Length = 0;
                        bufSentence.Append((char)following);
                        digitType = Word.TYPE_WORD;
                    }
                    else
                    {
                        // Letters and digits may follow; put the character back and let the
                        // reader decide.
                        PushBack(following);
                        if (ReadChars(bufSentence, new ReadCharByAsciiOrDigit()) > 0)
                        {
                            digitType = Word.TYPE_DIGIT_OR_LETTER;
                        }
                    }
                }

                bufWord.Enqueue(CreateWord(bufSentence, digitType));
                bufSentence.Length = 0;
                break;
            }

            case UnicodeCategory.LetterNumber:
            {
                // ⅠⅡⅢ: every character becomes a word of its own.
                bufSentence.Append((char)data);
                ReadChars(bufSentence, new ReadCharByType(UnicodeCategory.LetterNumber));

                int firstIdx = StartIdx(bufSentence);
                for (int i = 0; i < bufSentence.Length; i++)
                {
                    char[] single = new char[] { bufSentence[i] };
                    bufWord.Enqueue(new Word(single, firstIdx + i, Word.TYPE_LETTER_NUMBER));
                }

                bufSentence.Length = 0;
                break;
            }

            case UnicodeCategory.OtherNumber:
            {
                // ①⑩㈠㈩⒈⒑⒒⒛⑴⑽⑾⒇: a run is kept together as one word.
                bufSentence.Append((char)data);
                ReadChars(bufSentence, new ReadCharByType(UnicodeCategory.OtherNumber));
                bufWord.Enqueue(CreateWord(bufSentence, Word.TYPE_OTHER_NUMBER));
                bufSentence.Length = 0;
                break;
            }

            default:
                // Anything else is treated as an invalid character and skipped.
                keepReading = true;
                break;
        }
    }

    // Chinese word segmentation: segment the pending sentence until it is used up.
    if (currentSentence != null)
    {
        do
        {
            Chunk chunk = seg.Segment(currentSentence);
            for (int i = 0; i < chunk.Count; i++)
            {
                bufWord.Enqueue(chunk.Words[i]);
            }
        }
        while (!currentSentence.IsFinish);

        currentSentence = null;
    }

    return bufWord.Count > 0 ? bufWord.Dequeue() : null;
}
/// <summary>
/// Picks the best chunk of up to three words starting at the current offset of
/// <paramref name="sen"/> and advances the sentence past the longest chunk found.
/// </summary>
/// <remarks>
/// Every candidate length of the first and second word is combined with the longest third word.
/// Combinations whose total length is at least the best seen so far are handed to the max-match
/// rule; its survivors are then narrowed by each of the other rules until at most one remains.
/// </remarks>
/// <param name="sen">The sentence to segment; its offset is moved forward.</param>
/// <returns>The first surviving chunk, or null when the sentence is already finished or no chunk survives.</returns>
public override Chunk Segment(Sentence sen)
{
    char[] chs = sen.Text;
    mmr.Reset();

    if (sen.IsFinish)
    {
        return null;
    }

    // Tail length (word length minus one) of each of the three words.
    int[] tailLen = new int[3];

    // Tail lengths allowed for the first and the second word.
    List<int>[] tailLens = new List<int>[] { new List<int>(), new List<int>() };

    CharNode[] cns = new CharNode[3];

    // Start position of each word within the sentence text.
    int[] offsets = new int[3];

    if (showChunk)
    {
        Console.WriteLine();
    }

    int best = 0;
    offsets[0] = sen.Offset;

    // Walk over the distinct candidate lengths only, rather than counting down from the
    // maximum to zero, which saves some redundant lookups.
    MaxMatch(cns, 0, chs, offsets[0], tailLens, 0);
    for (int first = tailLens[0].Count - 1; first >= 0; first--)
    {
        tailLen[0] = tailLens[0][first];

        // Start position of the second word.
        offsets[1] = offsets[0] + 1 + tailLen[0];
        MaxMatch(cns, 1, chs, offsets[1], tailLens, 1);

        for (int second = tailLens[1].Count - 1; second >= 0; second--)
        {
            tailLen[1] = tailLens[1][second];
            offsets[2] = offsets[1] + 1 + tailLen[1];

            // Only the longest match is needed for the third word.
            tailLen[2] = MaxMatch(cns, 2, chs, offsets[2]);

            int total = (tailLen[0] + 1) + (tailLen[1] + 1) + (tailLen[2] + 1);

            bool isCandidate = total >= best;
            if (isCandidate)
            {
                best = total;
            }

            // In showChunk mode every combination is materialised, registered and printed,
            // not only the candidates.
            if (isCandidate || showChunk)
            {
                Chunk ck = CreateChunk(sen, chs, tailLen, offsets, cns);
                mmr.AddChunk(ck);
                if (showChunk)
                {
                    Console.WriteLine(ck);
                }
            }
        }
    }

    // That many characters have now been handled.
    sen.AddOffset(best);

    List<Chunk> survivors = mmr.RemainChunks();
    foreach (Rule rule in otherRules)
    {
        if (showChunk)
        {
            Console.WriteLine("---------filter before {0} -----------", rule);
            PrintChunk(survivors);
        }

        // Nothing left to disambiguate.
        if (survivors.Count <= 1)
        {
            break;
        }

        rule.Reset();
        rule.AddChunks(survivors);
        survivors = rule.RemainChunks();
    }

    if (showChunk)
    {
        Console.WriteLine("------------remainChunks--------");
        PrintChunk(survivors);
    }

    return survivors.Count > 0 ? survivors[0] : null;
}
/// <summary>
/// Builds a chunk of up to three words from the given start offsets and tail lengths.
/// </summary>
/// <param name="sen">The sentence the words belong to; supplies the start offset passed to each word.</param>
/// <param name="chs">The character buffer the words point into.</param>
/// <param name="tailLen">Per word: its length minus one.</param>
/// <param name="offsets">Per word: its start position within <paramref name="chs"/>.</param>
/// <param name="cns">Per word: the character node used for the frequency of single-character words; entries may be null.</param>
/// <returns>A new chunk; slots whose offset lies at or beyond the end of <paramref name="chs"/> are left unset.</returns>
Chunk CreateChunk(Sentence sen, char[] chs, int[] tailLen, int[] offsets, CharNode[] cns)
{
    Chunk ck = new Chunk();
    for (int slot = 0; slot < 3; slot++)
    {
        if (offsets[slot] >= chs.Length)
        {
            continue;
        }

        Word created = new Word(chs, sen.StartOffset, offsets[slot], tailLen[slot] + 1);
        ck.Words[slot] = created;

        if (tailLen[slot] != 0)
        {
            continue;
        }

        // Single-character word: its degree of freedom is derived from the character frequency.
        CharNode node = cns[slot];
        if (node != null)
        {
            created.Degree = node.Freq;
        }
    }

    return ck;
}