コード例 #1
0
            /// <summary>
            /// Moves <c>wordStart</c>/<c>wordEnd</c> to the next run of letters or digits
            /// inside the current sentence and publishes that run as the current term.
            /// </summary>
            /// <returns>
            /// <c>true</c> when a word was found and written to the attributes;
            /// <c>false</c> when the scan stopped exactly at <c>sentenceEnd</c>.
            /// </returns>
            protected internal override bool incrementWord()
            {
                // Resume right after the previous word and skip every character that is
                // neither a letter nor a digit.
                for (wordStart = wordEnd; wordStart < sentenceEnd; wordStart++)
                {
                    if (char.IsLetterOrDigit(buffer[wordStart]))
                    {
                        break;
                    }
                }

                if (wordStart == sentenceEnd)
                {
                    return false;
                }

                // Extend the word for as long as the following characters qualify.
                for (wordEnd = wordStart + 1; wordEnd < sentenceEnd; wordEnd++)
                {
                    if (!char.IsLetterOrDigit(buffer[wordEnd]))
                    {
                        break;
                    }
                }

                clearAttributes();
                termAtt.copyBuffer(buffer, wordStart, wordEnd - wordStart);
                offsetAtt.setOffset(correctOffset(offset + wordStart), correctOffset(offset + wordEnd));

                // Add the accumulated posBoost to this token's increment, then reset it.
                posIncAtt.PositionIncrement += posBoost;
                posBoost = 0;
                return true;
            }
コード例 #2
0
        /// <summary>
        /// Advances <c>input</c> by one token and replaces its term text with the
        /// stemmer's output, unless the token is flagged as a keyword.
        /// </summary>
        /// <returns>
        /// <c>true</c> when a token is available; <c>false</c> once <c>input</c> is exhausted.
        /// </returns>
        public override bool incrementToken()
        {
            if (!input.incrementToken())
            {
                return false;
            }

            // Keyword tokens are passed through without stemming.
            if (keywordAttr.Keyword)
            {
                return true;
            }

            char[] original = termAtt.buffer();
            stemmer.setCurrent(original, termAtt.length());
            stemmer.stem();

            char[] stemmed       = stemmer.CurrentBuffer;
            int    stemmedLength = stemmer.CurrentBufferLength;

            if (stemmed == original)
            {
                // The stemmer handed back the term's own array: only the length changes.
                termAtt.Length = stemmedLength;
            }
            else
            {
                // The stemmer handed back a different array: copy its content over.
                termAtt.copyBuffer(stemmed, 0, stemmedLength);
            }

            return true;
        }
コード例 #3
0
ファイル: TrimFilter.cs プロジェクト: leotohill/lucene.net
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
        /// <summary>
        /// Pulls the next token from <c>input</c> and strips leading and trailing
        /// whitespace from its term text, optionally shrinking the offsets to match.
        /// </summary>
        /// <returns>
        /// <c>false</c> when <c>input</c> is exhausted; <c>true</c> otherwise, including
        /// for terms that are empty or consist only of whitespace.
        /// </returns>
        public override bool incrementToken()
        {
            if (!input.incrementToken())
            {
                return false;
            }

            char[] chars = termAtt.buffer();
            int    total = termAtt.length();

            // TODO: is returning true right for an empty term? An all-whitespace term
            // such as "  " also returns true, so the empty term is treated the same way.
            if (total == 0)
            {
                return true;
            }

            // Skip leading whitespace.
            int start = 0;
            while (start < total && char.IsWhiteSpace(chars[start]))
            {
                start++;
            }

            // Skip trailing whitespace, counting how many characters were dropped.
            int end         = total;
            int trimmedTail = 0;
            while (end >= start && char.IsWhiteSpace(chars[end - 1]))
            {
                end--;
                trimmedTail++;
            }

            // Nothing was trimmed on either side: leave the token untouched.
            if (start <= 0 && end >= total)
            {
                return true;
            }

            bool hasText = start < end;
            if (hasText)
            {
                termAtt.copyBuffer(chars, start, end - start);
            }
            else
            {
                termAtt.setEmpty();
            }

            // Offsets are only adjusted when they span exactly the original term length.
            if (updateOffsets && total == offsetAtt.endOffset() - offsetAtt.startOffset())
            {
                int newStart = offsetAtt.startOffset() + start;
                int newEnd   = offsetAtt.endOffset() - (hasText ? trimmedTail : 0);
                offsetAtt.setOffset(newStart, newEnd);
            }

            return true;
        }
コード例 #4
0
 /// <summary>
 /// Emits the pending sentence as a single term. The <c>hasSentence</c> flag is
 /// cleared on emission, so each sentence is produced at most once.
 /// </summary>
 /// <returns><c>true</c> when a sentence was emitted; <c>false</c> otherwise.</returns>
 protected internal override bool incrementWord()
 {
     if (!hasSentence)
     {
         return false;
     }

     hasSentence = false;
     clearAttributes();

     int sentenceLength = sentenceEnd - sentenceStart;
     termAtt.copyBuffer(buffer, sentenceStart, sentenceLength);
     offsetAtt.setOffset(correctOffset(offset + sentenceStart), correctOffset(offset + sentenceEnd));
     return true;
 }
コード例 #5
0
ファイル: ChineseTokenizer.cs プロジェクト: vicancy/lucenenet
 /// <summary>
 /// Publishes the first <c>length</c> characters of <c>buffer</c> as the current
 /// term and sets its offsets from <c>start</c>.
 /// </summary>
 /// <returns><c>true</c> when a term was published; <c>false</c> when <c>length</c> is not positive.</returns>
 private bool flush()
 {
     if (length <= 0)
     {
         return false;
     }

     termAtt.copyBuffer(buffer, 0, length);
     offsetAtt.setOffset(correctOffset(start), correctOffset(start + length));
     return true;
 }
コード例 #6
0
 /// <summary>
 /// Copies the characters of <c>zzBuffer</c> between <c>zzStartRead</c> (inclusive)
 /// and <c>zzMarkedPos</c> (exclusive) into the given term attribute.
 /// </summary>
 /// <param name="t">Attribute that receives the text.</param>
 public void getText(CharTermAttribute t)
 {
     int matchLength = zzMarkedPos - zzStartRead;
     t.copyBuffer(zzBuffer, zzStartRead, matchLength);
 }
コード例 #7
0
        /// <summary>
        /// Advances to the next n-gram of the current input term, pulling a new term
        /// from <c>input</c> whenever the current one has been fully enumerated.
        /// </summary>
        /// <returns>
        /// <c>true</c> when an n-gram was written to the attributes; <c>false</c> once
        /// <c>input</c> has no more tokens.
        /// </returns>
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public final boolean incrementToken() throws java.io.IOException
        public override bool incrementToken()
        {
            while (true)
            {
                // No term in progress: fetch the next input token and snapshot its state.
                if (curTermBuffer == null)
                {
                    if (!input.incrementToken())
                    {
                        return(false);
                    }
                    else
                    {
                        curTermBuffer     = termAtt.buffer().clone();
                        curTermLength     = termAtt.length();
                        curCodePointCount = charUtils.codePointCount(termAtt);
                        curGramSize       = minGram;
                        curPos            = 0;
                        curPosInc         = posIncAtt.PositionIncrement;
                        curPosLen         = posLenAtt.PositionLength;
                        tokStart          = offsetAtt.startOffset();
                        tokEnd            = offsetAtt.endOffset();
                        // if length by start + end offsets doesn't match the term text then assume
                        // this is a synonym and don't adjust the offsets.
                        hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
                    }
                }
                if (version.onOrAfter(Version.LUCENE_44))
                {
                    // Position-major enumeration measured in code points: all gram sizes
                    // for one start position are emitted before moving to the next position.
                    if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount)
                    {
                        ++curPos;
                        curGramSize = minGram;
                    }
                    if ((curPos + curGramSize) <= curCodePointCount)
                    {
                        clearAttributes();
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int start = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
                        int start = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                        int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                        termAtt.copyBuffer(curTermBuffer, start, end - start);
                        // Only the first gram of a term carries the saved position
                        // increment; the following grams get 0.
                        posIncAtt.PositionIncrement = curPosInc;
                        curPosInc = 0;
                        posLenAtt.PositionLength = curPosLen;
                        // Every gram reports the offsets of the whole original token.
                        offsetAtt.setOffset(tokStart, tokEnd);
                        curGramSize++;
                        return(true);
                    }
                }
                else
                {
                    // Pre-4.4 behaviour: size-major enumeration measured in chars; all
                    // positions for one gram size are emitted before the size grows.
                    while (curGramSize <= maxGram)
                    {
                        // This inner loop returns on its first iteration, so it acts as an "if".
                        while (curPos + curGramSize <= curTermLength)   // while there is input
                        {
                            clearAttributes();
                            termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
                            if (hasIllegalOffsets)
                            {
                                offsetAtt.setOffset(tokStart, tokEnd);
                            }
                            else
                            {
                                offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
                            }
                            curPos++;
                            return(true);
                        }
                        curGramSize++;   // increase n-gram size
                        curPos = 0;
                    }
                }
                // The current term is exhausted: drop it so the next pass fetches a new one.
                curTermBuffer = null;
            }
        }
コード例 #8
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
        /// <summary>
        /// Emits the next token. A term whose first character is in the Thai Unicode block
        /// is split into the segments reported by <c>breaker</c>; any other term (and any
        /// empty term) is passed through unchanged.
        /// </summary>
        /// <remarks>
        /// The local holding the first break position is named <c>firstEnd</c>: C# reports
        /// error CS0136 when a method-level local reuses the name of a local declared in a
        /// nested block of the same method, as the previous <c>end</c> did.
        /// </remarks>
        /// <returns>
        /// <c>true</c> when a token was produced; <c>false</c> when <c>input</c> is
        /// exhausted or <c>breaker</c> reports no boundary for a freshly read term.
        /// </returns>
        public override bool incrementToken()
        {
            // Continue handing out segments of the term captured on an earlier call.
            if (hasMoreTokensInClone)
            {
                int start = breaker.current();
                int end   = breaker.next();
                if (end != BreakIterator.DONE)
                {
                    clonedToken.copyTo(this);
                    termAtt.copyBuffer(clonedTermAtt.buffer(), start, end - start);
                    if (hasIllegalOffsets)
                    {
                        offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
                    }
                    else
                    {
                        offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
                    }
                    if (handlePosIncr)
                    {
                        posAtt.PositionIncrement = 1;
                    }
                    return(true);
                }
                // No further segment in the captured term: fall through to read new input.
                hasMoreTokensInClone = false;
            }

            if (!input.incrementToken())
            {
                return(false);
            }

            // Empty terms and terms that do not start with a Thai character pass through as-is.
            if (termAtt.length() == 0 || char.UnicodeBlock.of(termAtt.charAt(0)) != char.UnicodeBlock.THAI)
            {
                return(true);
            }

            hasMoreTokensInClone = true;

            // if length by start + end offsets doesn't match the term text then assume
            // this is a synonym and don't adjust the offsets.
            hasIllegalOffsets = offsetAtt.endOffset() - offsetAtt.startOffset() != termAtt.length();

            // we lazy init the cloned token, as in ctor not all attributes may be added
            if (clonedToken == null)
            {
                clonedToken     = cloneAttributes();
                clonedTermAtt   = clonedToken.getAttribute(typeof(CharTermAttribute));
                clonedOffsetAtt = clonedToken.getAttribute(typeof(OffsetAttribute));
            }
            else
            {
                this.copyTo(clonedToken);
            }

            // reinit CharacterIterator
            charIterator.setText(clonedTermAtt.buffer(), 0, clonedTermAtt.length());
            breaker.Text = charIterator;
            int firstEnd = breaker.next();

            if (firstEnd != BreakIterator.DONE)
            {
                // The first segment starts at index 0, so truncating the term is enough.
                termAtt.Length = firstEnd;
                if (hasIllegalOffsets)
                {
                    offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
                }
                else
                {
                    offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + firstEnd);
                }
                // position increment keeps as it is for first token
                return(true);
            }
            return(false);
        }
コード例 #9
0
	  /// <summary>
	  /// Copies the characters of <c>zzBuffer</c> between <c>zzStartRead</c> (inclusive)
	  /// and <c>zzMarkedPos</c> (exclusive) into the given term attribute.
	  /// </summary>
	  /// <param name="t">Attribute that receives the text.</param>
	  public void getText(CharTermAttribute t)
	  {
		int matchLength = zzMarkedPos - zzStartRead;
		t.copyBuffer(zzBuffer, zzStartRead, matchLength);
	  }
コード例 #10
0
ファイル: SynonymFilter.cs プロジェクト: freemsly/lucenenet
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
        /// <summary>
        /// Produces the next token of the synonym-expanded stream.
        /// </summary>
        /// <remarks>
        /// Each pass of the outer loop works in three stages:
        /// 1. replay buffered inputs and their pending synonym outputs while
        ///    <c>inputSkipCount</c> is non-zero;
        /// 2. once <c>finished</c> is set and the read and write cursors meet, emit any
        ///    outputs still pending at that slot, then report end of stream;
        /// 3. otherwise call <c>parse()</c> to look for new matches and start over.
        /// </remarks>
        /// <returns><c>true</c> when a token was emitted; <c>false</c> at end of stream.</returns>
        public override bool incrementToken()
        {
            //System.out.println("\nS: incrToken inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);

            while (true)
            {
                // First play back any buffered future inputs/outputs
                // w/o running parsing again:
                while (inputSkipCount != 0)
                {
                    // At each position, we first output the original
                    // token

                    // TODO: maybe just a PendingState class, holding
                    // both input & outputs?
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final PendingInput input = futureInputs[nextRead];
                    // Note: this local hides the "input" member for the rest of the loop body.
                    PendingInput input = futureInputs[nextRead];
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final PendingOutputs outputs = futureOutputs[nextRead];
                    PendingOutputs outputs = futureOutputs[nextRead];

                    //System.out.println("  cycle nextRead=" + nextRead + " nextWrite=" + nextWrite + " inputSkipCount="+ inputSkipCount + " input.keepOrig=" + input.keepOrig + " input.consumed=" + input.consumed + " input.state=" + input.state);

                    // Emit the original token when it has not been emitted yet and it is
                    // either kept alongside its synonyms or was not part of any match.
                    if (!input.consumed && (input.keepOrig || !input.matched))
                    {
                        if (input.state != null)
                        {
                            // Return a previously saved token (because we
                            // had to lookahead):
                            restoreState(input.state);
                        }
                        else
                        {
                            // Pass-through case: return token we just pulled
                            // but didn't capture:
                            Debug.Assert(inputSkipCount == 1, "inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead);
                        }
                        input.reset();
                        if (outputs.count > 0)
                        {
                            // Outputs are still pending at this slot: keep the slot and
                            // zero their position increment.
                            outputs.posIncr = 0;
                        }
                        else
                        {
                            nextRead = rollIncr(nextRead);
                            inputSkipCount--;
                        }
                        //System.out.println("  return token=" + termAtt.toString());
                        return(true);
                    }
                    else if (outputs.upto < outputs.count)
                    {
                        // Still have pending outputs to replay at this
                        // position
                        input.reset();
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int posIncr = outputs.posIncr;
                        // Read posIncr before pullNext() is called.
                        int posIncr = outputs.posIncr;
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.apache.lucene.util.CharsRef output = outputs.pullNext();
                        CharsRef output = outputs.pullNext();
                        clearAttributes();
                        termAtt.copyBuffer(output.chars, output.offset, output.length);
                        typeAtt.Type = TYPE_SYNONYM;
                        // -1 means no end offset was recorded for this output; fall back
                        // to the end offset of the buffered input.
                        int endOffset = outputs.LastEndOffset;
                        if (endOffset == -1)
                        {
                            endOffset = input.endOffset;
                        }
                        offsetAtt.setOffset(input.startOffset, endOffset);
                        posIncrAtt.PositionIncrement = posIncr;
                        posLenAtt.PositionLength     = outputs.LastPosLength;
                        // NOTE(review): presumably pullNext() resets count to 0 after the
                        // last output has been pulled - confirm in PendingOutputs.
                        if (outputs.count == 0)
                        {
                            // Done with the buffered input and all outputs at
                            // this position
                            nextRead = rollIncr(nextRead);
                            inputSkipCount--;
                        }
                        //System.out.println("  return token=" + termAtt.toString());
                        return(true);
                    }
                    else
                    {
                        // Done with the buffered input and all outputs at
                        // this position
                        input.reset();
                        nextRead = rollIncr(nextRead);
                        inputSkipCount--;
                    }
                }

                if (finished && nextRead == nextWrite)
                {
                    // End case: if any output syns went beyond end of
                    // input stream, enumerate them now:
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final PendingOutputs outputs = futureOutputs[nextRead];
                    PendingOutputs outputs = futureOutputs[nextRead];
                    if (outputs.upto < outputs.count)
                    {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int posIncr = outputs.posIncr;
                        int posIncr = outputs.posIncr;
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.apache.lucene.util.CharsRef output = outputs.pullNext();
                        CharsRef output = outputs.pullNext();
                        futureInputs[nextRead].reset();
                        if (outputs.count == 0)
                        {
                            // Slot drained: advance both cursors together.
                            nextWrite = nextRead = rollIncr(nextRead);
                        }
                        clearAttributes();
                        // Keep offset from last input token:
                        offsetAtt.setOffset(lastStartOffset, lastEndOffset);
                        termAtt.copyBuffer(output.chars, output.offset, output.length);
                        typeAtt.Type = TYPE_SYNONYM;
                        //System.out.println("  set posIncr=" + outputs.posIncr + " outputs=" + outputs);
                        posIncrAtt.PositionIncrement = posIncr;
                        //System.out.println("  return token=" + termAtt.toString());
                        return(true);
                    }
                    else
                    {
                        return(false);
                    }
                }

                // Find new synonym matches:
                parse();
            }
        }
コード例 #11
0
ファイル: SlowSynonymFilter.cs プロジェクト: zfxsss/lucenenet
        /*
         * Need to worry about multiple scenarios:
         *  - need to go for the longest match
         *    a b => foo      #shouldn't match if "a b" is followed by "c d"
         *    a b c d => bar
         *  - need to backtrack - retry matches for tokens already read
         *     a b c d => foo
         *       b c => bar
         *     If the input stream is "a b c x", one will consume "a b c d"
         *     trying to match the first rule... all but "a" should be
         *     pushed back so a match may be made on "b c".
         *  - don't try and match generated tokens (thus need separate queue)
         *    matching is not recursive.
         *  - handle optional generation of original tokens in all these cases,
         *    merging token streams to preserve token positions.
         *  - preserve original positionIncrement of first matched token
         */
        /// <summary>
        /// Emits the next token: first any queued replacement tokens, otherwise the next
        /// input token, expanded into a merged list of synonym and (optionally) original
        /// tokens when it starts a match in <c>map</c>.
        /// </summary>
        /// <returns><c>true</c> when a token was emitted; <c>false</c> when <c>nextTok()</c> yields no token.</returns>
        public override bool IncrementToken()
        {
            while (true)
            {
                // if there are any generated tokens, return them... don't try any
                // matches against them, as we specifically don't want recursion.
                if (replacement != null && replacement.MoveNext())
                {
                    copy(this, replacement.Current);
                    return(true);
                }

                // common case fast-path of first token not matching anything
                AttributeSource firstTok = nextTok();
                if (firstTok == null)
                {
                    return(false);
                }
                var            termAtt = firstTok.AddAttribute <ICharTermAttribute>();
                SlowSynonymMap result  = map.submap != null?map.submap.Get(termAtt.Buffer(), 0, termAtt.Length) : null;

                if (result == null)
                {
                    copy(this, firstTok);
                    return(true);
                }

                // fast-path failed, clone ourselves if needed
                if (firstTok == this)
                {
                    firstTok = CloneAttributes();
                }
                // OK, we matched a token, so find the longest match.

                // The element type is spelled out: C# has no equivalent of Java's "<>"
                // diamond. The elements are used as AttributeSource further down.
                matched = new LinkedList <AttributeSource>();

                result = match(result);

                if (result == null)
                {
                    // no match, simply return the first token read.
                    copy(this, firstTok);
                    return(true);
                }

                // reuse, or create new one each time?
                List <AttributeSource> generated = new List <AttributeSource>(result.synonyms.Length + matched.Count + 1);

                //
                // there was a match... let's generate the new tokens, merging
                // in the matched tokens (position increments need adjusting)
                //
                AttributeSource lastTok     = matched.Count == 0 ? firstTok : matched.Last.Value;
                bool            includeOrig = result.IncludeOrig;

                AttributeSource            origTok        = includeOrig ? firstTok : null;
                PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(typeof(PositionIncrementAttribute));
                int origPos = firstPosIncAtt.PositionIncrement; // position of origTok in the original stream
                int repPos  = 0;                                // curr position in replacement token stream
                int pos     = 0;                                // current position in merged token stream

                for (int i = 0; i < result.synonyms.Length; i++)
                {
                    Token                      repTok       = result.synonyms[i];
                    AttributeSource            newTok       = firstTok.cloneAttributes();
                    CharTermAttribute          newTermAtt   = newTok.addAttribute(typeof(CharTermAttribute));
                    OffsetAttribute            newOffsetAtt = newTok.addAttribute(typeof(OffsetAttribute));
                    PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(typeof(PositionIncrementAttribute));

                    OffsetAttribute lastOffsetAtt = lastTok.addAttribute(typeof(OffsetAttribute));

                    // The generated token spans from the first token's start offset to
                    // the end offset of the last matched token.
                    newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
                    newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
                    repPos += repTok.PositionIncrement;
                    if (i == 0)     // make position of first token equal to original
                    {
                        repPos = origPos;
                    }

                    // if necessary, insert original tokens and adjust position increment
                    while (origTok != null && origPos <= repPos)
                    {
                        PositionIncrementAttribute origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                        origPosInc.PositionIncrement = origPos - pos;
                        generated.Add(origTok);
                        pos    += origPosInc.PositionIncrement;
                        // NOTE(review): this expects RemoveFirst() to return the removed
                        // element; System.Collections.Generic.LinkedList<T>.RemoveFirst()
                        // returns void - confirm which LinkedList type "matched" uses.
                        origTok = matched.Count == 0 ? null : matched.RemoveFirst();
                        if (origTok != null)
                        {
                            origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                            origPos   += origPosInc.PositionIncrement;
                        }
                    }

                    newPosIncAtt.PositionIncrement = repPos - pos;
                    generated.Add(newTok);
                    pos += newPosIncAtt.PositionIncrement;
                }

                // finish up any leftover original tokens
                while (origTok != null)
                {
                    PositionIncrementAttribute origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                    origPosInc.PositionIncrement = origPos - pos;
                    generated.Add(origTok);
                    pos    += origPosInc.PositionIncrement;
                    // NOTE(review): same RemoveFirst() return-value assumption as above.
                    origTok = matched.Count == 0 ? null : matched.RemoveFirst();
                    if (origTok != null)
                    {
                        origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                        origPos   += origPosInc.PositionIncrement;
                    }
                }

                // what if we replaced a longer sequence with a shorter one?
                // a/0 b/5 =>  foo/0
                // should I re-create the gap on the next buffered token?

                replacement = generated.GetEnumerator();
                // Now return to the top of the loop to read and return the first
                // generated token.. The reason this is done is that we may have generated
                // nothing at all, and may need to continue with more matching logic.
            }
        }
コード例 #12
0
ファイル: CJKTokenizer.cs プロジェクト: zfxsss/lucenenet
        //~ Methods ----------------------------------------------------------------

        /// <summary>
        /// Reads characters from <c>input</c> and emits the next token: a lower-cased run
        /// of Basic Latin / full-width letters, digits and '_', '+', '#', or a group of up
        /// to two consecutive letters from any other Unicode block.
        /// See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
        /// for details on Unicode blocks.
        /// </summary>
        /// <returns> false for end of stream, true otherwise
        /// </returns>
        /// <remarks>
        /// The Java original declared <c>throws java.io.IOException</c> for read errors
        /// on the underlying input.
        /// </remarks>
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
        public override bool incrementToken()
        {
            clearAttributes();

            while (true)             // loop until we find a non-empty token
            {
                // how many character(s) have been stored in buffer
                int length = 0;

                // the position used to create the token
                int start = offset;

                while (true)           // loop until we've found a full token
                {
                    // current character
                    char c;

                    // Unicode block of the current character
                    char.UnicodeBlock ub;

                    offset++;

                    // Refill the I/O buffer once every buffered character has been consumed.
                    if (bufferIndex >= dataLen)
                    {
                        dataLen     = input.read(ioBuffer);
                        bufferIndex = 0;
                    }

                    // dataLen == -1 is treated as end of input.
                    if (dataLen == -1)
                    {
                        if (length > 0)
                        {
                            if (preIsTokened == true)
                            {
                                // Discard the buffered character(s) and clear the flag
                                // instead of emitting them.
                                length       = 0;
                                preIsTokened = false;
                            }
                            else
                            {
                                offset--;
                            }

                            break;
                        }
                        else
                        {
                            offset--;
                            return(false);
                        }
                    }
                    else
                    {
                        //get current character
                        c = ioBuffer[bufferIndex++];

                        //get the UnicodeBlock of the current character
                        ub = char.UnicodeBlock.of(c);
                    }

                    //if the current character is ASCII or Extend ASCII
                    if ((ub == char.UnicodeBlock.BASIC_LATIN) || (ub == char.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS))
                    {
                        if (ub == char.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)
                        {
                            int i = (int)c;
                            // 65281..65374 is U+FF01..U+FF5E; subtracting 65248 (0xFEE0)
                            // maps it onto U+0021..U+007E.
                            if (i >= 65281 && i <= 65374)
                            {
                                // convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
                                i = i - 65248;
                                c = (char)i;
                            }
                        }

                        // if the current character is a letter, a digit, or "_" "+" "#"
                        if (char.IsLetterOrDigit(c) || ((c == '_') || (c == '+') || (c == '#')))
                        {
                            if (length == 0)
                            {
                                // "javaC1C2C3C4linux" <br>
                                //      ^--: the current character begin to token the ASCII
                                // letter
                                start = offset - 1;
                            }
                            else if (tokenType == DOUBLE_TOKEN_TYPE)
                            {
                                // "javaC1C2C3C4linux" <br>
                                //              ^--: the previous non-ASCII
                                // : the current character
                                // Push the current character back so the next pass re-reads it.
                                offset--;
                                bufferIndex--;

                                if (preIsTokened == true)
                                {
                                    // there is only one non-ASCII has been stored
                                    length       = 0;
                                    preIsTokened = false;
                                    break;
                                }
                                else
                                {
                                    break;
                                }
                            }

                            // store the LowerCase(c) in the buffer
                            buffer[length++] = char.ToLower(c);
                            tokenType        = SINGLE_TOKEN_TYPE;

                            // stop here once the buffer holds MAX_WORD_LEN characters
                            if (length == MAX_WORD_LEN)
                            {
                                break;
                            }
                        }
                        else if (length > 0)
                        {
                            // A separator character ends whatever is buffered.
                            if (preIsTokened == true)
                            {
                                length       = 0;
                                preIsTokened = false;
                            }
                            else
                            {
                                break;
                            }
                        }
                    }
                    else
                    {
                        // non-ASCII letter, e.g."C1C2C3C4"
                        if (char.IsLetter(c))
                        {
                            if (length == 0)
                            {
                                start            = offset - 1;
                                buffer[length++] = c;
                                tokenType        = DOUBLE_TOKEN_TYPE;
                            }
                            else
                            {
                                if (tokenType == SINGLE_TOKEN_TYPE)
                                {
                                    // Push the current character back; it starts the next token.
                                    offset--;
                                    bufferIndex--;

                                    //return the previous ASCII characters
                                    break;
                                }
                                else
                                {
                                    buffer[length++] = c;
                                    tokenType        = DOUBLE_TOKEN_TYPE;

                                    if (length == 2)
                                    {
                                        // Two characters collected: push the second one back
                                        // so it is read again on the next pass, and set the flag.
                                        offset--;
                                        bufferIndex--;
                                        preIsTokened = true;

                                        break;
                                    }
                                }
                            }
                        }
                        else if (length > 0)
                        {
                            if (preIsTokened == true)
                            {
                                // empty the buffer
                                length       = 0;
                                preIsTokened = false;
                            }
                            else
                            {
                                break;
                            }
                        }
                    }
                }

                if (length > 0)
                {
                    termAtt.copyBuffer(buffer, 0, length);
                    offsetAtt.setOffset(correctOffset(start), correctOffset(start + length));
                    typeAtt.Type = TOKEN_TYPE_NAMES[tokenType];
                    return(true);
                }
                else if (dataLen == -1)
                {
                    offset--;
                    return(false);
                }

                // Cycle back and try for the next token (don't
                // return an empty string)
            }
        }
コード例 #13
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
        /// <summary>
        /// Produces the next token. If a matcher is active and <c>nextCapture()</c> succeeds,
        /// the saved state is restored and the captured range of <c>spare.chars</c> is emitted
        /// with a position increment of 0. Otherwise a new token is pulled from the input,
        /// copied into <c>spare</c>, and every matcher is reset against it.
        /// </summary>
        /// <returns>
        /// <c>false</c> only when no capture is pending and <c>input.incrementToken()</c>
        /// returns <c>false</c>; <c>true</c> otherwise.
        /// </returns>
        public override bool incrementToken()
        {
            bool hasPendingCapture = currentMatcher != -1 && nextCapture();
            if (hasPendingCapture)
            {
                Debug.Assert(state != null);
                clearAttributes();
                restoreState(state);

                // Read the matcher/group indices once; they are only advanced at the end.
                int matcherIndex = currentMatcher;
                int groupIndex   = currentGroup[matcherIndex];
                int captureStart = matchers[matcherIndex].start(groupIndex);
                int captureEnd   = matchers[matcherIndex].end(groupIndex);

                // Emitted at the same position as the token whose state was restored.
                posAttr.PositionIncrement = 0;
                charTermAttr.copyBuffer(spare.chars, captureStart, captureEnd - captureStart);
                currentGroup[matcherIndex]++;
                return true;
            }

            if (!input.incrementToken())
            {
                return false;
            }

            // Keep a private copy of the term text and remember the attribute state
            // so later calls can restore it.
            spare.copyChars(charTermAttr.buffer(), 0, charTermAttr.length());
            state = captureState();

            // Point every matcher at the new text; -1 marks "no group consumed yet".
            for (int m = 0; m < matchers.Length; m++)
            {
                matchers[m].reset(spare);
                currentGroup[m] = -1;
            }

            if (preserveOriginal)
            {
                // Emit the token unchanged now; the next call begins with matcher 0.
                currentMatcher = 0;
                return true;
            }

            if (!nextCapture())
            {
                return true;
            }

            int firstMatcher = currentMatcher;
            int firstGroup   = currentGroup[firstMatcher];
            int firstStart   = matchers[firstMatcher].start(firstGroup);
            int firstEnd     = matchers[firstMatcher].end(firstGroup);

            if (firstStart != 0)
            {
                charTermAttr.copyBuffer(spare.chars, firstStart, firstEnd - firstStart);
            }
            else
            {
                // The capture begins at offset 0, so truncating the term avoids a copy.
                charTermAttr.Length = firstEnd;
            }
            currentGroup[firstMatcher]++;
            return true;
        }
コード例 #14
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public final boolean incrementToken() throws java.io.IOException
        /// <summary>
        /// Produces the next token. On the first call (<c>delimitersCount == -1</c>) the whole
        /// input is read, each <c>delimiter</c> is replaced by <c>replacement</c>, and the
        /// position after every delimiter is recorded. Each call then emits the text from the
        /// next recorded start position up to <c>endPosition</c>.
        /// </summary>
        /// <returns>
        /// <c>true</c> if a token was emitted; <c>false</c> once <c>skipped</c> has reached
        /// <c>delimitersCount - skip - 1</c>.
        /// </returns>
        public override bool incrementToken()
        {
            clearAttributes();

            if (delimitersCount != -1)
            {
                // Input was already consumed by an earlier call.
                posAtt.PositionIncrement = 0;
            }
            else
            {
                int charsRead = 0;
                delimiterPositions.Add(0);

                // Consume the reader until it reports a negative value.
                for (int ch = input.read(); ch >= 0; ch = input.read())
                {
                    charsRead++;
                    if (ch != delimiter)
                    {
                        resultToken.Append((char)ch);
                    }
                    else
                    {
                        delimiterPositions.Add(charsRead);
                        resultToken.Append(replacement);
                    }
                }

                delimitersCount = delimiterPositions.Count;

                // Record the end of input as a boundary when the last recorded position lies before it.
                if (delimiterPositions[delimitersCount - 1] < charsRead)
                {
                    delimiterPositions.Add(charsRead);
                    delimitersCount += 1;
                }

                // Grow the char buffer only when the accumulated text does not fit.
                if (resultTokenBuffer.Length < resultToken.Length)
                {
                    resultTokenBuffer = new char[resultToken.Length];
                }
                resultToken.getChars(0, resultToken.Length, resultTokenBuffer, 0);
                resultToken.Length = 0;

                int endIndex = delimitersCount - 1 - skip;
                if (endIndex >= 0)
                {
                    // A negative index is fine: the check below then returns false
                    // before endPosition is ever used.
                    endPosition = delimiterPositions[endIndex];
                }

                finalOffset = correctOffset(charsRead);
                posAtt.PositionIncrement = 1;
            }

            if (skipped >= delimitersCount - skip - 1)
            {
                return false;
            }

            int tokenStart = delimiterPositions[skipped];
            termAtt.copyBuffer(resultTokenBuffer, tokenStart, endPosition - tokenStart);
            offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(endPosition));
            skipped++;
            return true;
        }