/// <summary>
/// Points this instance at a new input and discards all buffered state.
/// </summary>
/// <param name="input">The text source; it is wrapped in a new <c>PushbackReader</c>.</param>
public void reset(TextReader input)
 {
     // Wrap the new source first; everything below only clears leftovers
     // from whatever input was being read before.
     reader = new PushbackReader(input);

     bufWord = new Queue<Word>();   // pending words
     currentSentence = null;        // sentence waiting to be segmented
     bufSentence.Length = 0;        // character accumulation buffer
     readedIdx = -1;                // read-position counter back to its initial value
 }
 /// <summary>
 /// Runs the base segmentation and then re-splits every resulting word of
 /// three or more characters into shorter pieces.
 /// </summary>
 /// <remarks>
 /// For each long word, every position is probed with <c>Search(chs, pos, 1)</c>;
 /// a result greater than -1 emits a two-character word at that position
 /// (presumably a dictionary hit - confirm against <c>Search</c>). A position
 /// not covered by the previous emitted piece is emitted as a single character.
 /// Words shorter than three characters are passed through unchanged.
 /// </remarks>
 /// <param name="sen">The sentence to segment.</param>
 /// <returns>
 /// The chunk returned by the base implementation with its <c>Words</c> and
 /// <c>Count</c> replaced by the re-split list, or null when the base returns null.
 /// </returns>
 public override Chunk Segment(Sentence sen)
 {
     Chunk chunk = base.Segment(sen);
     if (chunk == null)
     {
         return chunk;
     }

     List<Word> pieces = new List<Word>();
     for (int wordIdx = 0; wordIdx < chunk.Count; wordIdx++)
     {
         Word current = chunk.Words[wordIdx];
         if (current.Length < 3)
         {
             // Short words are kept as they are.
             pieces.Add(current);
             continue;
         }

         char[] text = current.Sen;
         int pos = current.WordOffset;
         int limit = pos + current.Length;
         // Start offset minus the word's offset inside the sentence; the original
         // note describes this as the sentence's position in the file.
         int sentenceStart = current.StartOffset - pos;
         // Exclusive end of the most recently emitted piece.
         int covered = -1;

         while (pos < limit - 1)
         {
             if (Search(text, pos, 1) > -1)
             {
                 pieces.Add(new Word(text, sentenceStart, pos, 2));
                 covered = pos + 2;
             }
             else if (pos >= covered)
             {
                 // Nothing emitted so far reaches this position: emit a single character.
                 pieces.Add(new Word(text, sentenceStart, pos, 1));
                 covered = pos + 1;
             }
             pos++;
         }

         // pos now equals limit - 1; emit the last character if no piece reached it.
         if (covered > -1 && covered < limit)
         {
             pieces.Add(new Word(text, sentenceStart, pos, 1));
         }
     }

     chunk.Words = pieces.ToArray();
     chunk.Count = pieces.Count;
     return chunk;
 }
        /// <summary>
        /// Takes up to three consecutive words from the sentence, each one sized by
        /// <c>dic.maxMatch</c> at the sentence's current offset.
        /// </summary>
        /// <param name="sen">The sentence to consume; its <c>Offset</c> is advanced past every word taken.</param>
        /// <returns>A new chunk whose first slots hold the words taken (fewer than three if the sentence finishes early).</returns>
        public override Chunk Segment(Sentence sen)
        {
            Chunk result = new Chunk();
            char[] text = sen.Text;

            int slot = 0;
            while (slot < 3 && !sen.IsFinish)
            {
                int start = sen.Offset;

                // maxMatch's result is used as "length - 1": the word built below is
                // one character longer than the returned value.
                int wordLength = dic.maxMatch(text, start) + 1;

                result.Words[slot] = new Word(text, sen.StartOffset, start, wordLength);
                sen.Offset = start + wordLength;
                slot++;
            }
            return result;
        }
Beispiel #4
0
 /// <summary>
 /// Performs word segmentation on the sentence <paramref name="sen"/>.
 /// </summary>
 /// <param name="sen">The sentence to segment.</param>
 /// <returns>
 /// A <see cref="Chunk"/> holding the words produced for the sentence.
 /// NOTE(review): some implementations appear to return null when there is
 /// nothing left to segment - confirm before dereferencing the result.
 /// </returns>
 public abstract Chunk Segment(Sentence sen);
        /// <summary>
        /// Returns the next word from the input.
        /// </summary>
        /// <remarks>
        /// Words already waiting in <c>bufWord</c> are served first. When the queue is
        /// empty, characters are read with <c>ReadNext()</c> and classified by their
        /// <see cref="UnicodeCategory"/>; one run of same-kind characters is turned
        /// into one or more queued words (or, for <c>OtherLetter</c>, into a sentence
        /// that is then handed to <c>seg.Segment</c>). Characters of any other
        /// category are skipped.
        /// </remarks>
        /// <returns>
        /// The next word, or null when the queue is empty and nothing more could be
        /// produced (for example <c>ReadNext()</c> returned -1).
        /// </returns>
        public Word Next()
        {
            // Serve from the word queue first.
            Word word = null;
            if (bufWord.Count > 0)
                word = bufWord.Dequeue();

            if (word == null)
            {
                bufSentence.Length = 0;
                int data = -1;
                // 'read' stays true only while the characters seen so far were skipped;
                // any branch that produces output leaves it false, ending the loop.
                bool read = true;
                while (read && (data = ReadNext()) != -1)
                {
                    read = false;
                    UnicodeCategory type = char.GetUnicodeCategory((char)data);
                    #region 条件检测
                    switch (type)
                    {
                        case UnicodeCategory.UppercaseLetter:
                        case UnicodeCategory.LowercaseLetter:
                        case UnicodeCategory.TitlecaseLetter:
                        case UnicodeCategory.ModifierLetter:
                            #region exec digit or letter
                            /*
                             * Alphabets distinguished here (per the original notes):
                             * 1. 0x410-0x44f -> А-я   Russian
                             * 2. 0x391-0x3a9 -> Α-Ω   Greek upper case
                             * 3. 0x3b1-0x3c9 -> α-ω   Greek lower case
                             */
                            data = ToAscii(data);
                            NationLetter nl = GetNation(data);
                            if (nl == NationLetter.UNKNOW)
                            {
                                // Letter of an unrecognised alphabet: skip it and keep reading.
                                read = true;
                                break;
                            }
                            // NOTE: this local is declared in this switch section but is also
                            // assigned and used by the DecimalDigitNumber section below.
                            string wordType = Word.TYPE_LETTER;
                            bufSentence.Append((char)data);

                            switch (nl)
                            {
                                case NationLetter.EN:
                                    // Digits following letters, e.g. VH049PA
                                    ReadCharByAsciiOrDigit rcad = new ReadCharByAsciiOrDigit();
                                    ReadChars(bufSentence, rcad);
                                    if (rcad.HasDigit)
                                        wordType = Word.TYPE_LETTER_OR_DIGIT;
                                    break;
                                case NationLetter.RA:
                                    ReadChars(bufSentence, new ReadCharByRussia());
                                    break;
                                case NationLetter.GE:
                                    ReadChars(bufSentence, new ReadCharByGreece());
                                    break;
                            }
                            bufWord.Enqueue(CreateWord(bufSentence, wordType));
                            bufSentence.Length = 0;
                            #endregion
                            break;

                        case UnicodeCategory.OtherLetter:
                            /*
                             * Besides Chinese characters this category also covers
                             * (per the original notes):
                             * 1. 0x3041-0x30f6 -> ぁ-ヶ   Japanese hiragana / katakana
                             * 2. 0x3105-0x3129 -> ㄅ-ㄩ   Bopomofo (Zhuyin) symbols
                             */
                            bufSentence.Append((char)data);
                            ReadChars(bufSentence, new ReadCharByType(UnicodeCategory.OtherLetter));

                            // The run is not queued directly; it becomes the sentence that is
                            // segmented after the read loop.
                            currentSentence = CreateSentence(bufSentence);
                            bufSentence.Length = 0;
                            break;

                        case UnicodeCategory.DecimalDigitNumber:
                            #region decimalDigitNumber

                            bufSentence.Append((char)ToAscii(data));
                            // Read the digits that follow.
                            ReadChars(bufSentence, new ReadCharDigit());
                            wordType = Word.TYPE_DIGIT;
                            // Look at the character right after the digit run.
                            int d = ReadNext();
                            if (d > -1)
                            {
                                if (seg.IsUnit(d))
                                {
                                    // A unit character (e.g. a time unit): queue the digits as
                                    // their own word, then start a new word holding the unit.
                                    // The start index is reduced by one, presumably because the
                                    // unit character has already been consumed - TODO confirm
                                    // against StartIdx.
                                    bufWord.Enqueue(CreateWord(bufSentence, StartIdx(bufSentence) - 1, Word.TYPE_DIGIT));
                                    bufSentence.Length = 0;
                                    bufSentence.Append((char)d);
                                    wordType = Word.TYPE_WORD;
                                }
                                else
                                {
                                    // Letters and digits may follow: put the character back and
                                    // let the ASCII-or-digit reader extend the current word.
                                    PushBack(d);
                                    if (ReadChars(bufSentence, new ReadCharByAsciiOrDigit()) > 0)
                                    {
                                        wordType = Word.TYPE_DIGIT_OR_LETTER;
                                    }
                                }
                            }

                            bufWord.Enqueue(CreateWord(bufSentence, wordType));
                            bufSentence.Length = 0;
                            #endregion
                            break;

                        case UnicodeCategory.LetterNumber:
                            // Characters such as ⅠⅡⅢ: each one is queued as its own word.
                            bufSentence.Append((char)data);
                            ReadChars(bufSentence, new ReadCharByType(UnicodeCategory.LetterNumber));
                            int startIdx = StartIdx(bufSentence);
                            for (int i = 0; i < bufSentence.Length; i++)
                            {
                                bufWord.Enqueue(new Word(new char[] { bufSentence[i] }, startIdx++, Word.TYPE_LETTER_NUMBER));
                            }
                            bufSentence.Length = 0;
                            break;
                        case UnicodeCategory.OtherNumber:
                            // Characters such as ①⑩㈠㈩⒈⒑⒒⒛⑴⑽⑾⒇: the whole run is one word.
                            bufSentence.Append((char)data);
                            ReadChars(bufSentence, new ReadCharByType(UnicodeCategory.OtherNumber));
                            bufWord.Enqueue(CreateWord(bufSentence, Word.TYPE_OTHER_NUMBER));
                            bufSentence.Length = 0;
                            break;
                        default:
                            // Anything else is treated as an invalid character: skip it and keep reading.
                            read = true;
                            break;
                    }
                    #endregion
                }
                // Chinese word segmentation: segment the sentence captured in the
                // OtherLetter branch, queueing every word of every chunk until the
                // sentence reports it is finished.
                if (currentSentence != null)
                {
                    Chunk chunk = null;
                    do
                    {
                        // NOTE(review): the result is dereferenced without a null check;
                        // confirm that seg.Segment cannot return null for an unfinished sentence.
                        chunk = seg.Segment(currentSentence);
                        for (int i = 0; i < chunk.Count; i++)
                        {
                            bufWord.Enqueue(chunk.Words[i]);
                        }
                    } while (!currentSentence.IsFinish);
                    currentSentence = null;
                }
                if (bufWord.Count > 0)
                    word = bufWord.Dequeue();
            }
            return word;
        }
        /// <summary>
        /// Enumerates candidate three-word chunks starting at the sentence's current
        /// offset, keeps those accepted by <c>mmr</c>, narrows them with
        /// <c>otherRules</c>, and returns the first survivor.
        /// </summary>
        /// <remarks>
        /// Word lengths are handled as "tail lengths" (length minus one): a word that
        /// starts at <c>p</c> with tail <c>t</c> is followed by the next word at
        /// <c>p + 1 + t</c>. The sentence offset is advanced by the largest total
        /// chunk length seen. When <c>showChunk</c> is set, every candidate and each
        /// filtering stage is written to the console.
        /// </remarks>
        /// <param name="sen">The sentence to segment; its offset is advanced.</param>
        /// <returns>The selected chunk, or null if the sentence is already finished or no chunk remains.</returns>
        public override Chunk Segment(Sentence sen)
        {
            char[] text = sen.Text;

            // Tail length chosen for each of the three words.
            int[] tails = new int[3];

            // Tail lengths allowed for the first and the second word.
            List<int>[] allowedTails = new List<int>[2];
            for (int slot = 0; slot < 2; slot++)
            {
                allowedTails[slot] = new List<int>();
            }

            CharNode[] nodes = new CharNode[3];

            // Start position of each word within the sentence.
            int[] starts = new int[3];

            mmr.Reset();
            if (sen.IsFinish)
            {
                return null;
            }

            if (showChunk)
            {
                Console.WriteLine();
            }

            int longest = 0;
            starts[0] = sen.Offset;

            // Walk only the tail lengths that were actually collected instead of
            // counting down from the maximum to zero; this avoids redundant lookups.
            MaxMatch(nodes, 0, text, starts[0], allowedTails, 0);
            for (int first = allowedTails[0].Count - 1; first >= 0; first--)
            {
                tails[0] = allowedTails[0][first];

                // Start of the second word.
                starts[1] = starts[0] + 1 + tails[0];
                MaxMatch(nodes, 1, text, starts[1], allowedTails, 1);
                for (int second = allowedTails[1].Count - 1; second >= 0; second--)
                {
                    tails[1] = allowedTails[1][second];
                    starts[2] = starts[1] + 1 + tails[1];

                    // The third word only needs its longest match.
                    tails[2] = MaxMatch(nodes, 2, text, starts[2]);

                    // Total characters covered: three words of (tail + 1) each.
                    int total = tails[0] + tails[1] + tails[2] + 3;

                    bool atLeastAsLong = total >= longest;
                    if (atLeastAsLong)
                    {
                        longest = total;
                    }

                    // In showChunk mode every candidate is built, added and printed,
                    // not only those at least as long as the current best.
                    if (atLeastAsLong || showChunk)
                    {
                        Chunk candidate = CreateChunk(sen, text, tails, starts, nodes);
                        mmr.AddChunk(candidate);
                        if (showChunk)
                        {
                            Console.WriteLine(candidate);
                        }
                    }
                }
            }

            // 'longest' characters have now been consumed.
            sen.AddOffset(longest);

            List<Chunk> survivors = mmr.RemainChunks();
            foreach (Rule rule in otherRules)
            {
                if (showChunk)
                {
                    Console.WriteLine("---------filter before {0} -----------", rule);
                    PrintChunk(survivors);
                }

                // Nothing left to disambiguate.
                if (survivors.Count <= 1)
                {
                    break;
                }

                rule.Reset();
                rule.AddChunks(survivors);
                survivors = rule.RemainChunks();
            }

            if (showChunk)
            {
                Console.WriteLine("------------remainChunks--------");
                PrintChunk(survivors);
            }

            return survivors.Count > 0 ? survivors[0] : null;
        }
 /// <summary>
 /// Builds a chunk of up to three words from parallel offset / tail-length arrays.
 /// </summary>
 /// <param name="sen">Sentence supplying the <c>StartOffset</c> passed to each word.</param>
 /// <param name="chs">Character buffer the words point into.</param>
 /// <param name="tailLen">Per-word tail length; each word's length is the tail length plus one.</param>
 /// <param name="offsets">Per-word start position within <paramref name="chs"/>.</param>
 /// <param name="cns">Per-word character node; may hold null entries.</param>
 /// <returns>A new chunk; slots whose offset lies at or beyond the end of <paramref name="chs"/> are left unset.</returns>
 Chunk CreateChunk(Sentence sen, char[] chs, int[] tailLen, int[] offsets, CharNode[] cns)
 {
     Chunk result = new Chunk();
     for (int slot = 0; slot < 3; slot++)
     {
         int start = offsets[slot];
         if (start >= chs.Length)
         {
             // This word would begin past the end of the text.
             continue;
         }

         int tail = tailLen[slot];
         Word created = new Word(chs, sen.StartOffset, start, tail + 1);
         result.Words[slot] = created;

         // Single-character words take their degree from the character
         // node's frequency, when a node is available.
         if (tail == 0)
         {
             CharNode node = cns[slot];
             if (node != null)
             {
                 created.Degree = node.Freq;
             }
         }
     }
     return result;
 }