Example 1
        // we only check a few core attributes here.
        // TODO: test other things
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void assertEquals(String s, org.apache.lucene.analysis.TokenStream left, org.apache.lucene.analysis.TokenStream right) throws Exception
        public virtual void assertEquals(string s, TokenStream left, TokenStream right)
        {
            left.reset();
            right.reset();
            CharTermAttribute          leftTerm    = left.addAttribute(typeof(CharTermAttribute));
            CharTermAttribute          rightTerm   = right.addAttribute(typeof(CharTermAttribute));
            OffsetAttribute            leftOffset  = left.addAttribute(typeof(OffsetAttribute));
            OffsetAttribute            rightOffset = right.addAttribute(typeof(OffsetAttribute));
            PositionIncrementAttribute leftPos     = left.addAttribute(typeof(PositionIncrementAttribute));
            PositionIncrementAttribute rightPos    = right.addAttribute(typeof(PositionIncrementAttribute));

            while (left.incrementToken())
            {
                assertTrue("wrong number of tokens for input: " + s, right.incrementToken());
                assertEquals("wrong term text for input: " + s, leftTerm.ToString(), rightTerm.ToString());
                assertEquals("wrong position for input: " + s, leftPos.PositionIncrement, rightPos.PositionIncrement);
                assertEquals("wrong start offset for input: " + s, leftOffset.startOffset(), rightOffset.startOffset());
                assertEquals("wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
            }
            assertFalse("wrong number of tokens for input: " + s, right.incrementToken());
            left.end();
            right.end();
            assertEquals("wrong final offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
            left.close();
            right.close();
        }
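The helper above walks two TokenStreams in lock-step and compares term text, position increments and offsets. A minimal Lucene-free sketch of the same pattern, assuming plain (term, start, end, posInc) records instead of Lucene.NET attributes (Tok and AssertSameTokens are illustrative names, not library APIs):

    using System;
    using System.Collections.Generic;

    static class TokenCompareSketch
    {
        public sealed class Tok
        {
            public string Term; public int Start; public int End; public int PosInc;
        }

        // Walks both sequences in lock-step and throws on the first mismatch,
        // mirroring the assertions in the helper above.
        public static void AssertSameTokens(string input, IEnumerator<Tok> left, IEnumerator<Tok> right)
        {
            while (left.MoveNext())
            {
                if (!right.MoveNext()) throw new Exception("wrong number of tokens for input: " + input);
                if (left.Current.Term != right.Current.Term) throw new Exception("wrong term text for input: " + input);
                if (left.Current.PosInc != right.Current.PosInc) throw new Exception("wrong position for input: " + input);
                if (left.Current.Start != right.Current.Start) throw new Exception("wrong start offset for input: " + input);
                if (left.Current.End != right.Current.End) throw new Exception("wrong end offset for input: " + input);
            }
            if (right.MoveNext()) throw new Exception("wrong number of tokens for input: " + input);
        }
    }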
Example 2
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
        public override bool incrementToken()
        {
            if (!input.incrementToken())
            {
                return(false);
            }

            char[] termBuffer = termAtt.buffer();
            int    len        = termAtt.length();

            //TODO: Is this the right behavior or should we return false? Currently "  " returns true,
            //so I think this should also return true
            if (len == 0)
            {
                return(true);
            }
            int start  = 0;
            int end    = 0;
            int endOff = 0;

            // eat the first characters
            for (start = 0; start < len && char.IsWhiteSpace(termBuffer[start]); start++)
            {
            }
            // eat the end characters
            for (end = len; end >= start && char.IsWhiteSpace(termBuffer[end - 1]); end--)
            {
                endOff++;
            }
            if (start > 0 || end < len)
            {
                if (start < end)
                {
                    termAtt.copyBuffer(termBuffer, start, (end - start));
                }
                else
                {
                    termAtt.setEmpty();
                }
                if (updateOffsets && len == offsetAtt.endOffset() - offsetAtt.startOffset())
                {
                    int newStart = offsetAtt.startOffset() + start;
                    int newEnd   = offsetAtt.endOffset() - (start < end ? endOff:0);
                    offsetAtt.setOffset(newStart, newEnd);
                }
            }

            return(true);
        }
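The filter above trims leading and trailing whitespace in place and, when the offsets span exactly the term text, shifts them to match. The same logic as a standalone sketch over a plain char buffer (TrimSketch is an illustrative name, not part of Lucene.NET):

    static class TrimSketch
    {
        // Returns the trimmed term plus adjusted offsets. Offsets are only
        // adjusted when they span exactly the original term text, mirroring
        // the updateOffsets check in the filter above.
        public static (string Term, int StartOffset, int EndOffset) Trim(char[] buffer, int len, int startOffset, int endOffset)
        {
            int start = 0;
            while (start < len && char.IsWhiteSpace(buffer[start])) start++;

            int end = len;
            while (end > start && char.IsWhiteSpace(buffer[end - 1])) end--;

            string trimmed = start < end ? new string(buffer, start, end - start) : string.Empty;
            if (endOffset - startOffset == len)
            {
                startOffset += start;
                endOffset   -= (start < end ? len - end : 0);
            }
            return (trimmed, startOffset, endOffset);
        }
    }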
Example 3
        /// <inheritdoc/>
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
        public override bool incrementToken()
        {
            while (!exhausted && input.incrementToken())
            {
                char[] term       = termAttribute.buffer();
                int    termLength = termAttribute.length();
                lastEndOffset = offsetAttribute.endOffset();

                if (termLength > 0 && term[termLength - 1] == '-')
                {
                    // a hyphenated word
                    // capture the state of the first token only
                    if (savedState == null)
                    {
                        savedState = captureState();
                    }
                    hyphenated.Append(term, 0, termLength - 1);
                }
                else if (savedState == null)
                {
                    // not part of a hyphenated word.
                    return(true);
                }
                else
                {
                    // the final portion of a hyphenated word
                    hyphenated.Append(term, 0, termLength);
                    unhyphenate();
                    return(true);
                }
            }

            exhausted = true;

            if (savedState != null)
            {
                // the final term ends with a hyphen
                // add back the hyphen, for backwards compatibility.
                hyphenated.Append('-');
                unhyphenate();
                return(true);
            }

            return(false);
        }
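Stripped of the TokenStream state handling, the joining step above can be sketched over a plain token sequence: fragments ending in '-' are buffered and glued to the token that follows, and a trailing hyphen on the final token is kept for backwards compatibility (JoinHyphens is an illustrative helper, not a Lucene.NET API):

    using System.Collections.Generic;
    using System.Text;

    static class HyphenJoinSketch
    {
        public static IEnumerable<string> JoinHyphens(IEnumerable<string> tokens)
        {
            var hyphenated = new StringBuilder();
            foreach (string term in tokens)
            {
                if (term.Length > 0 && term[term.Length - 1] == '-')
                {
                    // a hyphenated fragment: buffer everything but the trailing '-'
                    hyphenated.Append(term, 0, term.Length - 1);
                }
                else if (hyphenated.Length == 0)
                {
                    yield return term;                      // not part of a hyphenated word
                }
                else
                {
                    hyphenated.Append(term);                // final portion of a hyphenated word
                    yield return hyphenated.ToString();
                    hyphenated.Clear();
                }
            }
            if (hyphenated.Length > 0)
            {
                yield return hyphenated.ToString() + "-";   // final term ended with a hyphen
            }
        }
    }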
Example 4
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testSupplementaryCharacters() throws java.io.IOException
        public virtual void testSupplementaryCharacters()
        {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final String s = org.apache.lucene.util.TestUtil.randomUnicodeString(random(), 10);
            string s = TestUtil.randomUnicodeString(random(), 10);
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int codePointCount = s.codePointCount(0, s.length());
            int codePointCount = s.codePointCount(0, s.Length);
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int minGram = org.apache.lucene.util.TestUtil.nextInt(random(), 1, 3);
            int minGram = TestUtil.Next(random(), 1, 3);
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int maxGram = org.apache.lucene.util.TestUtil.nextInt(random(), minGram, 10);
            int         maxGram = TestUtil.Next(random(), minGram, 10);
            TokenStream tk      = new KeywordTokenizer(new StringReader(s));

            tk = new NGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.apache.lucene.analysis.tokenattributes.CharTermAttribute termAtt = tk.addAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
            CharTermAttribute termAtt = tk.addAttribute(typeof(CharTermAttribute));
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.apache.lucene.analysis.tokenattributes.OffsetAttribute offsetAtt = tk.addAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class);
            OffsetAttribute offsetAtt = tk.addAttribute(typeof(OffsetAttribute));

            tk.reset();
            for (int start = 0; start < codePointCount; ++start)
            {
                for (int end = start + minGram; end <= Math.Min(codePointCount, start + maxGram); ++end)
                {
                    assertTrue(tk.incrementToken());
                    assertEquals(0, offsetAtt.startOffset());
                    assertEquals(s.Length, offsetAtt.endOffset());
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int startIndex = Character.offsetByCodePoints(s, 0, start);
                    int startIndex = char.offsetByCodePoints(s, 0, start);
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int endIndex = Character.offsetByCodePoints(s, 0, end);
                    int endIndex = char.offsetByCodePoints(s, 0, end);
                    assertEquals(s.Substring(startIndex, endIndex - startIndex), termAtt.ToString());
                }
            }
            assertFalse(tk.incrementToken());
        }
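The test above indexes by code points (codePointCount and Character.offsetByCodePoints in the original Java), for which the converter left calls that do not exist on C#'s char type. A small sketch of equivalent helpers in plain .NET, assuming surrogate pairs are the only multi-unit case (CodePointSketch is an illustrative name):

    static class CodePointSketch
    {
        // Number of Unicode code points in s (a surrogate pair counts once).
        public static int CodePointCount(string s)
        {
            int count = 0;
            for (int i = 0; i < s.Length; i++)
            {
                count++;
                if (char.IsHighSurrogate(s[i]) && i + 1 < s.Length && char.IsLowSurrogate(s[i + 1]))
                {
                    i++;    // skip the low surrogate of the pair
                }
            }
            return count;
        }

        // UTF-16 index that lies n code points past startIndex, mirroring
        // Java's Character.offsetByCodePoints as used in the test above.
        public static int OffsetByCodePoints(string s, int startIndex, int n)
        {
            int i = startIndex;
            for (int seen = 0; seen < n; seen++)
            {
                i += (char.IsHighSurrogate(s[i]) && i + 1 < s.Length && char.IsLowSurrogate(s[i + 1])) ? 2 : 1;
            }
            return i;
        }
    }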
Example 5
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testFilterTokens() throws Exception
        public virtual void testFilterTokens()
        {
            SnowballFilter             filter     = new SnowballFilter(new TestTokenStream(this), "English");
            CharTermAttribute          termAtt    = filter.getAttribute(typeof(CharTermAttribute));
            OffsetAttribute            offsetAtt  = filter.getAttribute(typeof(OffsetAttribute));
            TypeAttribute              typeAtt    = filter.getAttribute(typeof(TypeAttribute));
            PayloadAttribute           payloadAtt = filter.getAttribute(typeof(PayloadAttribute));
            PositionIncrementAttribute posIncAtt  = filter.getAttribute(typeof(PositionIncrementAttribute));
            FlagsAttribute             flagsAtt   = filter.getAttribute(typeof(FlagsAttribute));

            filter.incrementToken();

            assertEquals("accent", termAtt.ToString());
            assertEquals(2, offsetAtt.startOffset());
            assertEquals(7, offsetAtt.endOffset());
            assertEquals("wrd", typeAtt.type());
            assertEquals(3, posIncAtt.PositionIncrement);
            assertEquals(77, flagsAtt.Flags);
            assertEquals(new BytesRef(new sbyte[] { 0, 1, 2, 3 }), payloadAtt.Payload);
        }
Example 6
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testOtherLetterOffset() throws java.io.IOException
        public virtual void testOtherLetterOffset()
        {
            string           s         = "a天b";
            ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s));

            int             correctStartOffset = 0;
            int             correctEndOffset   = 1;
            OffsetAttribute offsetAtt          = tokenizer.getAttribute(typeof(OffsetAttribute));

            tokenizer.reset();
            while (tokenizer.incrementToken())
            {
                assertEquals(correctStartOffset, offsetAtt.startOffset());
                assertEquals(correctEndOffset, offsetAtt.endOffset());
                correctStartOffset++;
                correctEndOffset++;
            }
            tokenizer.end();
            tokenizer.close();
        }
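For this particular input the test expects one token per character, with start and end offsets advancing by one each time. A trivial sketch of that expectation, ignoring the real tokenizer's handling of longer Latin runs (PerCharTokens is an illustrative name):

    using System.Collections.Generic;

    static class PerCharTokens
    {
        public static IEnumerable<(string Term, int Start, int End)> Tokenize(string s)
        {
            for (int i = 0; i < s.Length; i++)
            {
                yield return (s[i].ToString(), i, i + 1);   // offsets grow by one per token
            }
        }
    }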
Example 7
        /// <summary>
        /// Returns <c>true</c> for the next token in the stream, or <c>false</c> at end of stream. </summary>
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public final boolean incrementToken() throws java.io.IOException
        public override bool incrementToken()
        {
            while (true)
            {
                if (curTermBuffer == null)
                {
                    if (!input.incrementToken())
                    {
                        return(false);
                    }
                    else
                    {
                        curTermBuffer     = termAtt.buffer().clone();
                        curTermLength     = termAtt.length();
                        curCodePointCount = charUtils.codePointCount(termAtt);
                        curGramSize       = minGram;
                        curPos            = 0;
                        curPosInc         = posIncAtt.PositionIncrement;
                        curPosLen         = posLenAtt.PositionLength;
                        tokStart          = offsetAtt.startOffset();
                        tokEnd            = offsetAtt.endOffset();
                        // if length by start + end offsets doesn't match the term text then assume
                        // this is a synonym and don't adjust the offsets.
                        hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
                    }
                }
                if (version.onOrAfter(Version.LUCENE_44))
                {
                    if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount)
                    {
                        ++curPos;
                        curGramSize = minGram;
                    }
                    if ((curPos + curGramSize) <= curCodePointCount)
                    {
                        clearAttributes();
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int start = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
                        int start = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                        int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                        termAtt.copyBuffer(curTermBuffer, start, end - start);
                        posIncAtt.PositionIncrement = curPosInc;
                        curPosInc = 0;
                        posLenAtt.PositionLength = curPosLen;
                        offsetAtt.setOffset(tokStart, tokEnd);
                        curGramSize++;
                        return(true);
                    }
                }
                else
                {
                    while (curGramSize <= maxGram)
                    {
                        while (curPos + curGramSize <= curTermLength)   // while there is input
                        {
                            clearAttributes();
                            termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
                            if (hasIllegalOffsets)
                            {
                                offsetAtt.setOffset(tokStart, tokEnd);
                            }
                            else
                            {
                                offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
                            }
                            curPos++;
                            return(true);
                        }
                        curGramSize++;   // increase n-gram size
                        curPos = 0;
                    }
                }
                curTermBuffer = null;
            }
        }
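In the LUCENE_44+ branch above, gram positions and sizes are measured in code points rather than UTF-16 code units, so supplementary characters are never split. A Lucene-free sketch of that enumeration order, assuming surrogate pairs are the only multi-unit case (NGramSketch is an illustrative name, not the real filter):

    using System.Collections.Generic;

    static class NGramSketch
    {
        public static IEnumerable<string> NGramsByCodePoint(string term, int minGram, int maxGram)
        {
            // UTF-16 start index of each code point, plus the end-of-string index
            var starts = new List<int>();
            for (int i = 0; i < term.Length; i += char.IsSurrogatePair(term, i) ? 2 : 1)
            {
                starts.Add(i);
            }
            starts.Add(term.Length);
            int codePoints = starts.Count - 1;

            for (int pos = 0; pos + minGram <= codePoints; pos++)
            {
                for (int size = minGram; size <= maxGram && pos + size <= codePoints; size++)
                {
                    yield return term.Substring(starts[pos], starts[pos + size] - starts[pos]);
                }
            }
        }
    }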
Example 8
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
        public override bool incrementToken()
        {
            if (hasMoreTokensInClone)
            {
                int start = breaker.current();
                int end   = breaker.next();
                if (end != BreakIterator.DONE)
                {
                    clonedToken.copyTo(this);
                    termAtt.copyBuffer(clonedTermAtt.buffer(), start, end - start);
                    if (hasIllegalOffsets)
                    {
                        offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
                    }
                    else
                    {
                        offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
                    }
                    if (handlePosIncr)
                    {
                        posAtt.PositionIncrement = 1;
                    }
                    return(true);
                }
                hasMoreTokensInClone = false;
            }

            if (!input.incrementToken())
            {
                return(false);
            }

            if (termAtt.length() == 0 || char.UnicodeBlock.of(termAtt.charAt(0)) != char.UnicodeBlock.THAI)
            {
                return(true);
            }

            hasMoreTokensInClone = true;

            // if length by start + end offsets doesn't match the term text then assume
            // this is a synonym and don't adjust the offsets.
            hasIllegalOffsets = offsetAtt.endOffset() - offsetAtt.startOffset() != termAtt.length();

            // we lazy init the cloned token, as in ctor not all attributes may be added
            if (clonedToken == null)
            {
                clonedToken     = cloneAttributes();
                clonedTermAtt   = clonedToken.getAttribute(typeof(CharTermAttribute));
                clonedOffsetAtt = clonedToken.getAttribute(typeof(OffsetAttribute));
            }
            else
            {
                this.copyTo(clonedToken);
            }

            // reinit CharacterIterator
            charIterator.setText(clonedTermAtt.buffer(), 0, clonedTermAtt.length());
            breaker.Text = charIterator;
            int end = breaker.next();

            if (end != BreakIterator.DONE)
            {
                termAtt.Length = end;
                if (hasIllegalOffsets)
                {
                    offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.endOffset());
                }
                else
                {
                    offsetAtt.setOffset(clonedOffsetAtt.startOffset(), clonedOffsetAtt.startOffset() + end);
                }
                // position increment keeps as it is for first token
                return(true);
            }
            return(false);
        }
Example 9
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: private void parse() throws java.io.IOException
        private void parse()
        {
            //System.out.println("\nS: parse");

            Debug.Assert(inputSkipCount == 0);

            int curNextRead = nextRead;

            // Holds the longest match we've seen so far:
            BytesRef matchOutput      = null;
            int      matchInputLength = 0;
            int      matchEndOffset   = -1;

            BytesRef pendingOutput = fst.outputs.NoOutput;

            fst.getFirstArc(scratchArc);

            Debug.Assert(scratchArc.output == fst.outputs.NoOutput);

            int tokenCount = 0;

            while (true)
            {
                // Pull next token's chars:
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final char[] buffer;
                char[] buffer;
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int bufferLen;
                int bufferLen;
                //System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);

                int inputEndOffset = 0;

                if (curNextRead == nextWrite)
                {
                    // We used up our lookahead buffer of input tokens
                    // -- pull next real input token:

                    if (finished)
                    {
                        break;
                    }
                    else
                    {
                        //System.out.println("  input.incrToken");
                        Debug.Assert(futureInputs[nextWrite].consumed);
                        // Not correct: a syn match whose output is longer
                        // than its input can set future inputs keepOrig
                        // to true:
                        //assert !futureInputs[nextWrite].keepOrig;
                        if (input.incrementToken())
                        {
                            buffer    = termAtt.buffer();
                            bufferLen = termAtt.length();
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final PendingInput input = futureInputs[nextWrite];
                            PendingInput input = futureInputs[nextWrite];
                            lastStartOffset = input.startOffset = offsetAtt.startOffset();
                            lastEndOffset   = input.endOffset = offsetAtt.endOffset();
                            inputEndOffset  = input.endOffset;
                            //System.out.println("  new token=" + new String(buffer, 0, bufferLen));
                            if (nextRead != nextWrite)
                            {
                                capture();
                            }
                            else
                            {
                                input.consumed = false;
                            }
                        }
                        else
                        {
                            // No more input tokens
                            //System.out.println("      set end");
                            finished = true;
                            break;
                        }
                    }
                }
                else
                {
                    // Still in our lookahead
                    buffer         = futureInputs[curNextRead].term.chars;
                    bufferLen      = futureInputs[curNextRead].term.length;
                    inputEndOffset = futureInputs[curNextRead].endOffset;
                    //System.out.println("  old token=" + new String(buffer, 0, bufferLen));
                }

                tokenCount++;

                // Run each char in this token through the FST:
                int bufUpto = 0;
                while (bufUpto < bufferLen)
                {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
                    int codePoint = char.codePointAt(buffer, bufUpto, bufferLen);
                    if (fst.findTargetArc(ignoreCase ? char.ToLower(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null)
                    {
                        //System.out.println("    stop");
                        goto byTokenBreak;
                    }

                    // Accum the output
                    pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
                    //System.out.println("    char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
                    bufUpto += char.charCount(codePoint);
                }

                // OK, entire token matched; now see if this is a final
                // state:
                if (scratchArc.Final)
                {
                    matchOutput      = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
                    matchInputLength = tokenCount;
                    matchEndOffset   = inputEndOffset;
                    //System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
                }

                // See if the FST wants to continue matching (ie, needs to
                // see the next input token):
                if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null)
                {
                    // No further rules can match here; we're done
                    // searching for matching rules starting at the
                    // current input position.
                    break;
                }
                else
                {
                    // More matching is possible -- accum the output (if
                    // any) of the WORD_SEP arc:
                    pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
                    if (nextRead == nextWrite)
                    {
                        capture();
                    }
                }

                curNextRead = rollIncr(curNextRead);
                byTokenContinue: ;
            }
            byTokenBreak:

            if (nextRead == nextWrite && !finished)
            {
                //System.out.println("  skip write slot=" + nextWrite);
                nextWrite = rollIncr(nextWrite);
            }

            if (matchOutput != null)
            {
                //System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
                inputSkipCount = matchInputLength;
                addOutput(matchOutput, matchInputLength, matchEndOffset);
            }
            else if (nextRead != nextWrite)
            {
                // Even though we had no match here, we set to 1
                // because we need to skip current input token before
                // trying to match again:
                inputSkipCount = 1;
            }
            else
            {
                Debug.Assert(finished);
            }

            //System.out.println("  parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
        }
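The parse() method above walks input tokens through an FST, accumulating output and remembering the longest match that ended in a final state. A much-simplified sketch of that idea, using a token-keyed trie instead of a character-level FST (TrieNode, SynonymParseSketch and LongestMatch are illustrative names, not the filter's real data structures):

    using System.Collections.Generic;

    sealed class TrieNode
    {
        public readonly Dictionary<string, TrieNode> Children = new Dictionary<string, TrieNode>();
        public string Output;   // non-null when this node is a "final state"
    }

    static class SynonymParseSketch
    {
        // Returns the output of the longest rule matching a prefix of tokens
        // starting at position start, plus how many input tokens it consumed
        // (0 when nothing matched).
        public static (string Output, int Consumed) LongestMatch(TrieNode root, IList<string> tokens, int start)
        {
            string bestOutput = null;
            int bestLen = 0;
            TrieNode node = root;
            for (int i = start; i < tokens.Count; i++)
            {
                if (!node.Children.TryGetValue(tokens[i], out node))
                {
                    break;                       // no further rule can match here
                }
                if (node.Output != null)
                {
                    bestOutput = node.Output;    // longest match seen so far
                    bestLen = i - start + 1;
                }
            }
            return (bestOutput, bestLen);
        }
    }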
Example 10
        /*
         * Need to worry about multiple scenarios:
         *  - need to go for the longest match
         *    a b => foo      #shouldn't match if "a b" is followed by "c d"
         *    a b c d => bar
         *  - need to backtrack - retry matches for tokens already read
         *     a b c d => foo
         *       b c => bar
         *     If the input stream is "a b c x", one will consume "a b c d"
         *     trying to match the first rule... all but "a" should be
         *     pushed back so a match may be made on "b c".
         *  - don't try and match generated tokens (thus need separate queue)
         *    matching is not recursive.
         *  - handle optional generation of original tokens in all these cases,
         *    merging token streams to preserve token positions.
         *  - preserve original positionIncrement of first matched token
         */
        public override bool IncrementToken()
        {
            while (true)
            {
                // if there are any generated tokens, return them... don't try any
                // matches against them, as we specifically don't want recursion.
                if (replacement != null && replacement.MoveNext())
                {
                    copy(this, replacement.Current);
                    return(true);
                }

                // common case fast-path of first token not matching anything
                AttributeSource firstTok = nextTok();
                if (firstTok == null)
                {
                    return(false);
                }
                var            termAtt = firstTok.AddAttribute<ICharTermAttribute>();
                SlowSynonymMap result  = map.submap != null ? map.submap.Get(termAtt.Buffer(), 0, termAtt.Length) : null;

                if (result == null)
                {
                    copy(this, firstTok);
                    return(true);
                }

                // fast-path failed, clone ourselves if needed
                if (firstTok == this)
                {
                    firstTok = CloneAttributes();
                }
                // OK, we matched a token, so find the longest match.

                matched = new LinkedList<AttributeSource>();

                result = match(result);

                if (result == null)
                {
                    // no match, simply return the first token read.
                    copy(this, firstTok);
                    return(true);
                }

                // reuse, or create new one each time?
                List<AttributeSource> generated = new List<AttributeSource>(result.synonyms.Length + matched.Count + 1);

                //
                // there was a match... let's generate the new tokens, merging
                // in the matched tokens (position increments need adjusting)
                //
                AttributeSource lastTok     = matched.Count == 0 ? firstTok : matched.Last.Value;
                bool            includeOrig = result.IncludeOrig;

                AttributeSource            origTok        = includeOrig ? firstTok : null;
                PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(typeof(PositionIncrementAttribute));
                int origPos = firstPosIncAtt.PositionIncrement; // position of origTok in the original stream
                int repPos  = 0;                                // curr position in replacement token stream
                int pos     = 0;                                // current position in merged token stream

                for (int i = 0; i < result.synonyms.Length; i++)
                {
                    Token                      repTok       = result.synonyms[i];
                    AttributeSource            newTok       = firstTok.cloneAttributes();
                    CharTermAttribute          newTermAtt   = newTok.addAttribute(typeof(CharTermAttribute));
                    OffsetAttribute            newOffsetAtt = newTok.addAttribute(typeof(OffsetAttribute));
                    PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(typeof(PositionIncrementAttribute));

                    OffsetAttribute lastOffsetAtt = lastTok.addAttribute(typeof(OffsetAttribute));

                    newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
                    newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
                    repPos += repTok.PositionIncrement;
                    if (i == 0)     // make position of first token equal to original
                    {
                        repPos = origPos;
                    }

                    // if necessary, insert original tokens and adjust position increment
                    while (origTok != null && origPos <= repPos)
                    {
                        PositionIncrementAttribute origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                        origPosInc.PositionIncrement = origPos - pos;
                        generated.Add(origTok);
                        pos    += origPosInc.PositionIncrement;
                        origTok = matched.Count == 0 ? null : matched.RemoveFirst();
                        if (origTok != null)
                        {
                            origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                            origPos   += origPosInc.PositionIncrement;
                        }
                    }

                    newPosIncAtt.PositionIncrement = repPos - pos;
                    generated.Add(newTok);
                    pos += newPosIncAtt.PositionIncrement;
                }

                // finish up any leftover original tokens
                while (origTok != null)
                {
                    PositionIncrementAttribute origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                    origPosInc.PositionIncrement = origPos - pos;
                    generated.Add(origTok);
                    pos    += origPosInc.PositionIncrement;
                    origTok = matched.Count == 0 ? null : matched.RemoveFirst();
                    if (origTok != null)
                    {
                        origPosInc = origTok.addAttribute(typeof(PositionIncrementAttribute));
                        origPos   += origPosInc.PositionIncrement;
                    }
                }

                // what if we replaced a longer sequence with a shorter one?
                // a/0 b/5 =>  foo/0
                // should I re-create the gap on the next buffered token?

                replacement = generated.GetEnumerator();
                // Now return to the top of the loop to read and return the first
                // generated token.. The reason this is done is that we may have generated
                // nothing at all, and may need to continue with more matching logic.
            }
        }
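A driver sketch for the strategy described in the comment block at the top of this example: at each position take the longest matching rule, emit its replacement, and continue after the consumed tokens; with no match, pass the original token through. It reuses the illustrative TrieNode/LongestMatch sketch shown after Example 9 and deliberately ignores includeOrig and position-increment merging:

    using System.Collections.Generic;

    static class SynonymDriverSketch
    {
        public static IEnumerable<string> ApplySynonyms(TrieNode rules, IList<string> tokens)
        {
            int i = 0;
            while (i < tokens.Count)
            {
                var (output, consumed) = SynonymParseSketch.LongestMatch(rules, tokens, i);
                if (consumed == 0)
                {
                    yield return tokens[i];          // no rule starts here
                    i++;
                }
                else
                {
                    foreach (string replacement in output.Split(' '))
                    {
                        yield return replacement;    // emit the rule's output tokens
                    }
                    i += consumed;                   // skip the matched input tokens
                }
            }
        }
    }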
Example 11
        // LUCENE-3642: normalize BMP->SMP and check that offsets are correct
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testCrossPlaneNormalization2() throws java.io.IOException
        public virtual void testCrossPlaneNormalization2()
        {
            Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper2(this);
            int      num      = 1000 * RANDOM_MULTIPLIER;

            for (int i = 0; i < num; i++)
            {
                string      s  = TestUtil.randomUnicodeString(random());
                TokenStream ts = analyzer.tokenStream("foo", s);
                try
                {
                    ts.reset();
                    OffsetAttribute offsetAtt = ts.addAttribute(typeof(OffsetAttribute));
                    while (ts.incrementToken())
                    {
                        string highlightedText = StringHelperClass.SubstringSpecial(s, offsetAtt.startOffset(), offsetAtt.endOffset());
                        for (int j = 0, cp = 0; j < highlightedText.Length; j += char.charCount(cp))
                        {
                            cp = char.ConvertToUtf32(highlightedText, j);
                            assertTrue("non-letter:" + cp.ToString("x"), char.IsLetter(cp));
                        }
                    }
                    ts.end();
                }
                finally
                {
                    IOUtils.closeWhileHandlingException(ts);
                }
            }
            // just for fun
            checkRandomData(random(), analyzer, num);
        }
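The inner loop above walks the highlighted text by code point; on .NET Core 3.0 or later the same check can be sketched with System.Text.Rune, which enumerates code points directly, so supplementary-plane characters need no manual surrogate handling (LetterCheckSketch is an illustrative name):

    using System;
    using System.Text;

    static class LetterCheckSketch
    {
        public static void AssertAllLetters(string highlightedText)
        {
            foreach (Rune r in highlightedText.EnumerateRunes())
            {
                if (!Rune.IsLetter(r))
                {
                    throw new Exception("non-letter:" + r.Value.ToString("x"));
                }
            }
        }
    }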