Example #1
internal IKTokenizer(TextReader in_Renamed, bool mircoSupported)
    : base(in_Renamed)
{
    dict                = Dictionary.load();                     // load the segmentation dictionary
    lastHitState        = 0;
    segmentBuff         = new char[2048];                        // fixed 2048-char read buffer
    this.mircoSupported = mircoSupported;
    tokens              = new ExtendOrderedSet<TokenDelegate>(); // ordered set of produced tokens
    numberSet           = new ExtendOrderedSet<TokenDelegate>(); // ordered set of number tokens
    initContext();                                               // reset the segmentation context/state
}
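
The constructor only wires up state; the 2048-char segmentBuff it allocates is refilled from the underlying TextReader between segmentation passes, with any unprocessed tail carried over (see the Array.Copy at the end of Example #2). The following is a minimal, self-contained sketch of that refill pattern under those assumptions; SegmentBufferSketch and FillSegmentBuffer are hypothetical names, not part of the IKTokenizer port.

using System;
using System.IO;

static class SegmentBufferSketch
{
    const int BuffSize = 2048; // same size the constructor allocates

    // Hypothetical helper: append fresh characters after any carried-over tail
    // and return the total number of valid characters now in the buffer.
    static int FillSegmentBuffer(TextReader reader, char[] segmentBuff, int carriedOver)
    {
        int read = reader.Read(segmentBuff, carriedOver, BuffSize - carriedOver);
        return carriedOver + Math.Max(read, 0); // Read returns 0 at end of input
    }

    static void Main()
    {
        var segmentBuff = new char[BuffSize];
        using var reader = new StringReader("IKAnalyzer sketch input text");
        int segLength = FillSegmentBuffer(reader, segmentBuff, 0);
        Console.WriteLine($"filled {segLength} chars");
    }
}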
Example #2
        // Scan the first segLength characters of segmentBuff and produce tokens.
        private void getTokens(int segLength)
        {
            initContext();
            char current = '\x0000';

            for (int ci = 0; ci < segLength; ci++)
            {
                nextContextStatus = 0;
                nextNumberStatus  = 0;
                segmentBuff[ci]   = toDBC(segmentBuff[ci]); // convert full-width (SBC) characters to half-width (DBC)
                current           = segmentBuff[ci];
                inputStatus       = dict.identify(current);
                if (contextStatus == 0)
                {
                    procInitState(ci);
                }
                else
                {
                    if ((contextStatus & 1) > 0)
                    {
                        ci = procLetterState(ci);
                    }
                    if ((contextStatus & 0x10) > 0)
                    {
                        ci = procNumberState(ci);
                    }
                    if ((contextStatus & 0x100) > 0)
                    {
                        ci = procCJKState(ci);
                    }
                }
                contextStatus = nextContextStatus;
                numberStatus  = nextNumberStatus;
                // If the buffer was read full (2048 chars), no token is in progress,
                // and only a short unprocessed tail (< 50 chars) remains, stop here and
                // carry that tail over to the next read instead of segmenting it now.
                if (nextContextStatus == 0 && segLength == 2048 && segLength - ci > 1 && segLength - ci < 50)
                {
                    bAr_offset   = ci + 1;
                    breakAndRead = true;
                    break;
                }
            }

            if (!breakAndRead)
            {
                // The whole buffer was consumed: flush any pending number token and
                // any remaining unsegmented text at the end of the buffer.
                if (numberBeginIndex >= 0)
                {
                    pushNumber(numberBeginIndex, segLength - 1);
                }
                if (lastMatchEnd != segLength - 1)
                {
                    if (unmatchFlag)
                    {
                        procSplitSeg(unmatchBegin, segLength - 1);
                    }
                    else if (beginIndex < segLength)
                    {
                        procSplitSeg(beginIndex, segLength - 1);
                    }
                }
            }
            // Emit the collected number tokens, then reset the set.
            foreach (TokenDelegate numberToken in numberSet)
            {
                pushTerm(numberToken);
            }
            numberSet.Clear();
            if (!mircoSupported)
            {
                // Rebuild the token set as MTokenDelegate entries: lower-case each
                // term and skip words the dictionary flags as useless.
                ExtendOrderedSet<TokenDelegate> tmpTokens = new ExtendOrderedSet<TokenDelegate>();
                foreach (TokenDelegate td in tokens)
                {
                    MTokenDelegate mtd = new MTokenDelegate(td.Offset, td.Begin, td.End);
                    string         w   = new string(segmentBuff, td.Begin, (td.End - td.Begin) + 1).ToLower();
                    if (!dict.isUselessWord(w))
                    {
                        mtd.Term = w;
                        tmpTokens.Add(mtd);
                    }
                }

                tokens.Clear();
                tokens = tmpTokens;
            }
            if (breakAndRead)
            {
                // Move the unprocessed tail to the front of the buffer so the next
                // read from the input can append after it.
                Array.Copy(segmentBuff, bAr_offset, segmentBuff, 0, segLength - bAr_offset);
            }
        }
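
toDBC itself is not shown in these examples; the translated comment in getTokens indicates it converts full-width (SBC) characters to their half-width (DBC) equivalents before classification. Below is a minimal sketch of that standard Unicode mapping (full-width forms U+FF01..U+FF5E shift down by 0xFEE0, and the ideographic space U+3000 becomes an ASCII space); ToDbcSketch and ToHalfWidth are hypothetical names and are not claimed to match the port's toDBC exactly.

using System;

static class ToDbcSketch
{
    // Sketch only: a typical full-width -> half-width conversion.
    static char ToHalfWidth(char c)
    {
        if (c == '\u3000')                  // ideographic (full-width) space -> ASCII space
        {
            return ' ';
        }
        if (c >= '\uFF01' && c <= '\uFF5E') // full-width ASCII variants U+FF01..U+FF5E
        {
            return (char)(c - 0xFEE0);      // e.g. 'Ａ' (U+FF21) -> 'A' (U+0041)
        }
        return c;                           // other characters pass through unchanged
    }

    static void Main()
    {
        Console.WriteLine(ToHalfWidth('Ａ')); // prints: A
        Console.WriteLine(ToHalfWidth('，')); // prints: ,
    }
}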