예제 #1
0
        /// <summary>
        /// Queues the buffer span [begin, end] (both inclusive) as a token.
        /// </summary>
        /// <param name="begin">Index of the first char of the span in segmentBuff.</param>
        /// <param name="end">Index of the last char of the span in segmentBuff.</param>
        /// <param name="directOutput">
        /// When true the span is used as given; when false a leading noise
        /// char and a trailing number-sign/connector char may be trimmed first.
        /// </param>
        private void pushTerm(int begin, int end, bool directOutput)
        {
            // An inverted span carries nothing to emit.
            if (begin > end)
            {
                return;
            }

            if (!directOutput)
            {
                Hit lookup = dict.search(segmentBuff, begin, end);

                // Only a multi-char span that is not itself a dictionary match
                // may lose its first char, and only if that char is a noise word.
                if (!lookup.isMatch() && begin != end)
                {
                    if (dict.isNoiseWord(Convert.ToString(segmentBuff[begin])))
                    {
                        begin += 1;
                    }
                }

                // Independently, drop a trailing number sign or connector.
                char tail = segmentBuff[end];
                if (dict.isNbSign(tail) || dict.isConnector(tail))
                {
                    end -= 1;
                }
            }

            TokenDelegate candidate = new TokenDelegate(offset, begin, end);

            if (!mircoSupported)
            {
                tokens.Add(candidate);
                return;
            }

            // With mircoSupported set, the lower-cased text becomes the term and
            // spans the dictionary reports as useless are discarded.
            String text = new String(segmentBuff, begin, end - begin + 1).ToLower();
            if (dict.isUselessWord(text))
            {
                return;
            }

            candidate.Term = text;
            tokens.Add(candidate);
        }
예제 #2
0
        /// <summary>
        /// Orders token delegates by ascending begin index; for equal begin
        /// indexes the one with the larger end index sorts first.
        /// </summary>
        /// <param name="o">The object to compare with; expected to be a TokenDelegate.</param>
        /// <returns>
        /// -1 if this instance sorts before <paramref name="o"/>, 0 if both cover
        /// the same span, 1 if this instance sorts after it or if
        /// <paramref name="o"/> is null.
        /// </returns>
        /// <exception cref="System.InvalidCastException">
        /// <paramref name="o"/> is not null and is not a TokenDelegate.
        /// </exception>
        public virtual int CompareTo(System.Object o)
        {
            // Per the IComparable contract any instance compares greater than
            // null; previously a null argument caused a NullReferenceException.
            if (o == null)
            {
                return(1);
            }

            TokenDelegate ntd = (TokenDelegate)o;

            if (begin != ntd.begin)
            {
                return(begin < ntd.begin ? -1 : 1);
            }

            if (end == ntd.end)
            {
                return(0);
            }

            // Same begin: the longer span (larger end) comes first.
            return(end > ntd.end ? -1 : 1);
        }
예제 #3
0
 /// <summary>
 /// Two token delegates are equal when they cover the same begin/end span.
 /// Null and objects of any other type are never equal to this instance.
 /// </summary>
 /// <param name="o">The object to compare with.</param>
 /// <returns>true when <paramref name="o"/> is a TokenDelegate with identical Begin and End.</returns>
 public override bool Equals(System.Object o)
 {
     TokenDelegate other = o as TokenDelegate;

     // Reference check only, so no user-defined operator can interfere.
     if ((object)other == null)
     {
         return false;
     }

     return begin == other.Begin && end == other.End;
 }
예제 #4
0
 /// <summary>
 /// Records the span [begin, end] as a number token, unless its last char is
 /// a number sign or connector, in which case the span without that last
 /// char is handed to pushTerm instead.
 /// </summary>
 /// <param name="begin">Index of the first char of the span in segmentBuff.</param>
 /// <param name="end">Index of the last char of the span in segmentBuff.</param>
 private void pushNumber(int begin, int end)
 {
     bool endsWithSeparator = dict.isNbSign(segmentBuff[end]) || dict.isConnector(segmentBuff[end]);

     if (!endsWithSeparator)
     {
         numberSet.Add(new TokenDelegate(offset, begin, end));
         return;
     }

     // Trailing sign/connector: emit everything before it as an ordinary term.
     pushTerm(begin, end - 1);
 }
예제 #5
0
 /// <summary>
 /// Queues an already-built token delegate. With mircoSupported set, the
 /// delegate's text is read from segmentBuff, stored as its Term, and the
 /// delegate is dropped if the dictionary reports that text as useless.
 /// </summary>
 /// <param name="td">The delegate to queue; its Begin/End index into segmentBuff.</param>
 private void pushTerm(TokenDelegate td)
 {
     if (!mircoSupported)
     {
         tokens.Add(td);
         return;
     }

     // Note: unlike the (begin, end, directOutput) overload, the text is not lower-cased here.
     String text = new String(segmentBuff, td.Begin, td.End - td.Begin + 1);
     if (dict.isUselessWord(text))
     {
         return;
     }

     td.Term = text;
     tokens.Add(td);
 }
예제 #6
0
        /// <summary>
        /// Returns the next token. Queued tokens are served first; when the queue
        /// is empty one more segment is read from input and tokenized.
        /// </summary>
        /// <returns>
        /// The next token, or null when nothing could be read or the newly read
        /// segment produced no tokens.
        /// </returns>
        public override Token Next()
        {
            // Serve from the queue while it still holds tokens.
            if (tokens.Count != 0)
            {
                TokenDelegate queued = tokens.GetFirst();
                tokens.Remove(queued);
                return queued.Token;
            }

            int segLength;
            if (breakAndRead)
            {
                // The previous pass stopped early: the first (2048 - bAr_offset)
                // chars of the buffer are carried over, so read only into the rest.
                int carried = 2048 - bAr_offset;
                segLength = input.Read(segmentBuff, carried, bAr_offset) + carried;
            }
            else
            {
                segLength = input.Read(segmentBuff, 0, segmentBuff.Length);
            }

            if (segLength <= 0)
            {
                return null;
            }

            getTokens(segLength);
            offset += bAr_offset;

            if (tokens.Count == 0)
            {
                return null;
            }

            TokenDelegate head = tokens.GetFirst();
            tokens.Remove(head);
            return head.Token;
        }
예제 #7
0
        /// <summary>
        /// Scans the first <paramref name="segLength"/> chars of segmentBuff and
        /// fills <c>tokens</c> with the token delegates found.
        /// </summary>
        /// <remarks>
        /// The scan may stop early and set <c>breakAndRead</c>/<c>bAr_offset</c>;
        /// in that case the unscanned tail is moved to the front of segmentBuff
        /// before returning.
        /// </remarks>
        /// <param name="segLength">Number of leading chars of segmentBuff to scan.</param>
        private void getTokens(int segLength)
        {
            initContext();
            char current = '\x0000';

            for (int ci = 0; ci < segLength; ci++)
            {
                // Each char starts with cleared "next" states; the handlers below set them.
                nextContextStatus = 0;
                nextNumberStatus  = 0;
                segmentBuff[ci]   = toDBC(segmentBuff[ci]);//full-width to half-width conversion
                current           = segmentBuff[ci];
                inputStatus       = dict.identify(current);
                if (contextStatus == 0)
                {
                    procInitState(ci);
                }
                else
                {
                    // contextStatus is a bit set; every handler whose bit is set runs,
                    // and each may move the scan index.
                    // NOTE(review): bits 1 / 0x10 / 0x100 presumably mean letter / number / CJK,
                    // judging by the handler names - confirm.
                    if ((contextStatus & 1) > 0)
                    {
                        ci = procLetterState(ci);
                    }
                    if ((contextStatus & 0x10) > 0)
                    {
                        ci = procNumberState(ci);
                    }
                    if ((contextStatus & 0x100) > 0)
                    {
                        ci = procCJKState(ci);
                    }
                }
                contextStatus = nextContextStatus;
                numberStatus  = nextNumberStatus;
                // Keep scanning unless all of these hold: no state carries over
                // (nextContextStatus == 0), between 2 and 49 chars remain, and the
                // segment is exactly 2048 chars long.
                // NOTE(review): 2048 is presumably the full buffer size - confirm.
                if (nextContextStatus != 0 || segLength - ci <= 1 || segLength - ci >= 50 || segLength != 2048)
                {
                    continue;
                }
                // Stop here and remember where the unscanned tail starts.
                bAr_offset   = ci + 1;
                breakAndRead = true;
                break;
            }

            // Only when the whole segment was scanned: flush whatever is still pending.
            if (!breakAndRead)
            {
                if (numberBeginIndex >= 0)
                {
                    pushNumber(numberBeginIndex, segLength - 1);
                }
                if (lastMatchEnd != segLength - 1)
                {
                    if (unmatchFlag)
                    {
                        procSplitSeg(unmatchBegin, segLength - 1);
                    }
                    else if (beginIndex < segLength)
                    {
                        procSplitSeg(beginIndex, segLength - 1);
                    }
                }
            }
            // Hand every collected number delegate to pushTerm; the call sits in the
            // for-iterator slot, so it runs once after each successful MoveNext().
            for (IEnumerator it = numberSet.GetEnumerator(); it.MoveNext(); pushTerm((TokenDelegate)it.Current))
            {
                ;
            }
            numberSet.Clear();
            if (!mircoSupported)
            {
                // Rebuild the token set as MTokenDelegate instances carrying the
                // lower-cased text, dropping spans the dictionary reports as useless.
                ExtendOrderedSet <TokenDelegate> tmpTokens = new ExtendOrderedSet <TokenDelegate>();
                for (System.Collections.IEnumerator it = tokens.GetEnumerator(); it.MoveNext();)
                {
                    TokenDelegate  td  = (TokenDelegate)it.Current;
                    MTokenDelegate mtd = new MTokenDelegate(td.Offset, td.Begin, td.End);
                    String         w   = (new String(segmentBuff, td.Begin, (td.End - td.Begin) + 1)).ToLower();
                    if (!dict.isUselessWord(w))
                    {
                        mtd.Term = w;
                        tmpTokens.Add(mtd);
                    }
                }

                tokens.Clear();
                tokens = tmpTokens;
            }
            if (breakAndRead)
            {
                // Move the unscanned tail (from bAr_offset on) to the front of the buffer.
                Array.Copy(segmentBuff, bAr_offset, segmentBuff, 0, segLength - bAr_offset);
            }
        }
예제 #8
0
        /// <summary>
        /// Looks up segmentBuff[begin..end] in the dictionary and, depending on the
        /// kind of hit, pushes tokens and updates the matching state
        /// (unmatchFlag, unmatchBegin, LastMatchEnd, beginIndex, lastHitState).
        /// </summary>
        /// <param name="begin">Index of the first char of the candidate span.</param>
        /// <param name="end">Index of the last char of the candidate span.</param>
        /// <returns>
        /// 0 for a match-and-continue hit, a prefix hit, the early suffix return,
        /// or when no hit kind applies; otherwise <c>beginIndex - end</c>, computed
        /// after beginIndex has been incremented.
        /// NOTE(review): the caller presumably uses this as an index adjustment - confirm.
        /// </returns>
        private int procCJK(int begin, int end)
        {
            Hit hit = dict.search(segmentBuff, begin, end);

            // Case 1 (lastHitState = 1): hit.MatchAndContinue.
            // NOTE(review): presumably a complete word that can still be extended - confirm against Hit.
            if (hit.MatchAndContinue)
            {
                if (unmatchFlag)
                {
                    // Resolve the pending unmatched run that starts at unmatchBegin.
                    if (hit.WordType.Suffix)
                    {
                        // Suffix word: emit the pending run together with the suffix.
                        pushTerm(unmatchBegin, end);
                    }
                    else if (unmatchBegin == begin - 1 && !dict.isNoiseWord(Convert.ToString(segmentBuff[unmatchBegin])))
                    {
                        // Pending run is a single non-noise char: emit it alone and
                        // as a two-char span with the char at begin, both untrimmed.
                        pushTerm(unmatchBegin, begin - 1, true);
                        pushTerm(unmatchBegin, begin, true);
                    }
                    else
                    {
                        cjkCut(unmatchBegin, begin - 1);
                        pushTerm(unmatchBegin, begin - 1);
                    }
                    LastMatchEnd = end;
                    pushTerm(begin, end);
                    unmatchFlag = false;
                }
                else if (hit.WordType.NormWord)
                {
                    LastMatchEnd = end;
                    pushTerm(begin, end);
                }
                if (!(numberSet.Count == 0))
                {
                    // Flush collected numbers; pushTerm runs in the for-iterator slot.
                    for (IEnumerator it = numberSet.GetEnumerator(); it.MoveNext(); pushTerm((TokenDelegate)it.Current))
                    {
                        ;
                    }
                    if (hit.WordType.Count)
                    {
                        // Also emit the span from the last number's begin through end.
                        // numberSet is deliberately not cleared on this path.
                        TokenDelegate number = numberSet.GetLast();
                        pushTerm(number.Begin, end, true);
                        LastMatchEnd = end;
                    }
                    else
                    {
                        numberSet.Clear();
                    }
                }
                lastHitState = 1;
                return(0);
            }
            // Case 2 (lastHitState = 2): plain match.
            if (hit.isMatch())
            {
                if (unmatchFlag)
                {
                    // Same pending-run resolution as in case 1.
                    if (hit.WordType.Suffix)
                    {
                        pushTerm(unmatchBegin, end);
                    }
                    else if (unmatchBegin == begin - 1 && !dict.isNoiseWord(Convert.ToString(segmentBuff[unmatchBegin])))
                    {
                        pushTerm(unmatchBegin, begin - 1, true);
                        pushTerm(unmatchBegin, begin, true);
                    }
                    else
                    {
                        cjkCut(unmatchBegin, begin - 1);
                        pushTerm(unmatchBegin, begin - 1);
                    }
                    LastMatchEnd = end;
                    pushTerm(begin, end);
                    unmatchFlag = false;
                }
                else if (hit.WordType.NormWord)
                {
                    LastMatchEnd = end;
                    pushTerm(begin, end);
                }
                else
                {
                    // Matched, but not a normal word and nothing pending:
                    // start a new unmatched run at begin.
                    unmatchFlag  = true;
                    unmatchBegin = begin;
                }
                if (!(numberSet.Count == 0))
                {
                    for (IEnumerator it = numberSet.GetEnumerator(); it.MoveNext(); pushTerm((TokenDelegate)it.Current))
                    {
                        ;
                    }
                    if (hit.WordType.Count)
                    {
                        TokenDelegate number = numberSet.GetLast();
                        pushTerm(number.Begin, end, true);
                        LastMatchEnd = end;
                        unmatchFlag  = false;
                    }
                    // Unlike case 1, numberSet is always cleared here.
                    numberSet.Clear();
                }
                beginIndex++;
                lastHitState = 2;
                return(beginIndex - end);
            }
            // Case 3 (lastHitState = 3): prefix match - only the state is recorded.
            if (hit.isPrefixMatch())
            {
                lastHitState = 3;
                return(0);
            }
            // Case 4 (lastHitState = 4): no match.
            if (hit.isUnmatch())
            {
                if (lastHitState == 3 && unmatchFlag)
                {
                    // A prefix match just failed while a run was pending: emit that run.
                    // unmatchFlag is left set here.
                    if (unmatchBegin == begin - 1 && !dict.isNoiseWord(Convert.ToString(segmentBuff[unmatchBegin])))
                    {
                        pushTerm(unmatchBegin, begin - 1, true);
                        pushTerm(unmatchBegin, begin, true);
                    }
                    else
                    {
                        cjkCut(unmatchBegin, begin - 1);
                        pushTerm(unmatchBegin, begin - 1);
                    }
                }
                if (begin > lastMatchEnd)
                {
                    if (!unmatchFlag)
                    {
                        unmatchFlag  = true;
                        unmatchBegin = begin;
                    }
                    // Look up the single char at end on its own.
                    Hit h = dict.search(segmentBuff, end, end);
                    if (dict.isNoiseWord(Convert.ToString(segmentBuff[end])) && end - begin == 1)
                    {
                        // Two-char span whose second char is a noise word:
                        // close the run at begin.
                        cjkCut(unmatchBegin, begin);
                        pushTerm(unmatchBegin, begin);
                        unmatchFlag  = false;
                        LastMatchEnd = begin;
                    }
                    else if (h.isMatch() && h.WordType.Suffix)
                    {
                        if (unmatchBegin < end)
                        {
                            // Char at end is a suffix word: emit the run without and
                            // with it, then restart matching at end.
                            pushTerm(unmatchBegin, end - 1);
                            pushTerm(unmatchBegin, end);
                            cjkCut(unmatchBegin, end);
                            unmatchFlag  = false;
                            LastMatchEnd = end;
                            beginIndex   = end;
                            return(0);
                        }
                    }
                    else
                    {
                        // Otherwise test the single char at begin for being a suffix word.
                        h = dict.search(segmentBuff, begin, begin);
                        if (h.isMatch() && h.WordType.Suffix && unmatchBegin != begin)
                        {
                            pushTerm(unmatchBegin, begin);
                            unmatchFlag  = false;
                            LastMatchEnd = begin;
                        }
                    }
                }
                beginIndex++;
                lastHitState = 4;
                return(beginIndex - end);
            }
            else
            {
                // None of the four hit kinds applied.
                return(0);
            }
        }