Example #1
        private void Parse()
        {
            //System.out.println("\nS: parse");

            Debug.Assert(inputSkipCount == 0);

            int curNextRead = nextRead;

            // Holds the longest match we've seen so far:
            BytesRef matchOutput      = null;
            int      matchInputLength = 0;
            int      matchEndOffset   = -1;

            BytesRef pendingOutput = fst.Outputs.NoOutput;

            fst.GetFirstArc(scratchArc);

            Debug.Assert(scratchArc.Output == fst.Outputs.NoOutput);

            int tokenCount = 0;

            while (true)
            {
                // Pull next token's chars:
                char[] buffer;
                int    bufferLen;
                //System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);

                int inputEndOffset = 0;

                if (curNextRead == nextWrite)
                {
                    // We used up our lookahead buffer of input tokens
                    // -- pull next real input token:

                    if (finished)
                    {
                        break;
                    }
                    else
                    {
                        //System.out.println("  input.incrToken");
                        Debug.Assert(futureInputs[nextWrite].consumed);
                        // Not correct: a syn match whose output is longer
                        // than its input can set future inputs keepOrig
                        // to true:
                        //assert !futureInputs[nextWrite].keepOrig;
                        if (input.IncrementToken())
                        {
                            buffer    = termAtt.Buffer();
                            bufferLen = termAtt.Length;
                            PendingInput pendingInput = futureInputs[nextWrite];
                            lastStartOffset = pendingInput.startOffset = offsetAtt.StartOffset();
                            lastEndOffset   = pendingInput.endOffset = offsetAtt.EndOffset();
                            inputEndOffset  = pendingInput.endOffset;
                            //System.out.println("  new token=" + new String(buffer, 0, bufferLen));
                            if (nextRead != nextWrite)
                            {
                                Capture();
                            }
                            else
                            {
                                pendingInput.consumed = false;
                            }
                        }
                        else
                        {
                            // No more input tokens
                            //System.out.println("      set end");
                            finished = true;
                            break;
                        }
                    }
                }
                else
                {
                    // Still in our lookahead
                    buffer         = futureInputs[curNextRead].term.Chars;
                    bufferLen      = futureInputs[curNextRead].term.Length;
                    inputEndOffset = futureInputs[curNextRead].endOffset;
                    //System.out.println("  old token=" + new String(buffer, 0, bufferLen));
                }

                tokenCount++;

                // Run each char in this token through the FST:
                int bufUpto = 0;
                while (bufUpto < bufferLen)
                {
                    int codePoint = Character.CodePointAt(buffer, bufUpto, bufferLen);
                    if (fst.FindTargetArc(ignoreCase ? Character.ToLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null)
                    {
                        //System.out.println("    stop");
                        goto byTokenBreak;
                    }

                    // Accum the output
                    pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output);
                    //System.out.println("    char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
                    bufUpto += Character.CharCount(codePoint);
                }

                // OK, entire token matched; now see if this is a final
                // state:
                if (scratchArc.Final)
                {
                    matchOutput      = fst.Outputs.Add(pendingOutput, scratchArc.NextFinalOutput);
                    matchInputLength = tokenCount;
                    matchEndOffset   = inputEndOffset;
                    //System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
                }

                // See if the FST wants to continue matching (ie, needs to
                // see the next input token):
                if (fst.FindTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null)
                {
                    // No further rules can match here; we're done
                    // searching for matching rules starting at the
                    // current input position.
                    break;
                }
                else
                {
                    // More matching is possible -- accum the output (if
                    // any) of the WORD_SEP arc:
                    pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output);
                    if (nextRead == nextWrite)
                    {
                        Capture();
                    }
                }

                curNextRead = RollIncr(curNextRead);
            }
byTokenBreak:

            if (nextRead == nextWrite && !finished)
            {
                //System.out.println("  skip write slot=" + nextWrite);
                nextWrite = RollIncr(nextWrite);
            }

            if (matchOutput != null)
            {
                //System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
                inputSkipCount = matchInputLength;
                AddOutput(matchOutput, matchInputLength, matchEndOffset);
            }
            else if (nextRead != nextWrite)
            {
                // Even though we had no match here, we set to 1
                // because we need to skip current input token before
                // trying to match again:
                inputSkipCount = 1;
            }
            else
            {
                Debug.Assert(finished);
            }

            //System.out.println("  parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
        }
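The Parse() above walks the synonym FST token by token, remembering the longest final state it has reached so far. A minimal standalone sketch of that greedy longest-match idea, assuming a plain phrase set instead of Lucene's FST (LongestMatch and the phrase set are hypothetical names, not part of Lucene.NET):

        // Requires System.Collections.Generic. Returns how many tokens the longest
        // matching phrase starting at 'start' consumes, or 0 if nothing matches.
        public static int LongestMatch(string[] tokens, int start, HashSet<string> phrases)
        {
            int best = 0;
            string current = null;
            for (int i = start; i < tokens.Length; i++)
            {
                current = current == null ? tokens[i] : current + " " + tokens[i];
                if (phrases.Contains(current))
                {
                    best = i - start + 1; // remember the longest match seen so far, like matchInputLength above
                }
            }
            return best;
        }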
Example #2
        public override void ProcessFields(IndexableField[] fields, int count)
        {
            FieldState.Reset();

            bool doInvert = Consumer.Start(fields, count);

            for (int i = 0; i < count; i++)
            {
                IndexableField     field     = fields[i];
                IndexableFieldType fieldType = field.FieldType();

                // TODO FI: this should be "genericized" to querying
                // consumer if it wants to see this particular field
                // tokenized.
                if (fieldType.Indexed && doInvert)
                {
                    bool analyzed = fieldType.Tokenized && DocState.Analyzer != null;

                    // if the field omits norms, the boost cannot be indexed.
                    if (fieldType.OmitNorms && field.GetBoost() != 1.0f)
                    {
                        throw new System.NotSupportedException("You cannot set an index-time boost: norms are omitted for field '" + field.Name() + "'");
                    }

                    // only bother checking offsets if something will consume them.
                    // TODO: after we fix analyzers, also check if termVectorOffsets will be indexed.
                    bool checkOffsets    = fieldType.IndexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
                    int  lastStartOffset = 0;

                    if (i > 0)
                    {
                        FieldState.Position_Renamed += analyzed ? DocState.Analyzer.GetPositionIncrementGap(fieldInfo.Name) : 0;
                    }

                    /*
                     * To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream
                     * when we fail. We expect some caller to eventually deal with the real exception, so we don't want any 'catch' clauses,
                     * but rather a finally that takes note of the problem.
                     */

                    bool succeededInProcessingField = false;

                    TokenStream stream = field.GetTokenStream(DocState.Analyzer);
                    // reset the TokenStream to the first token
                    stream.Reset();

                    try
                    {
                        bool hasMoreTokens = stream.IncrementToken();

                        FieldState.AttributeSource_Renamed = stream;

                        IOffsetAttribute            offsetAttribute  = FieldState.AttributeSource_Renamed.AddAttribute <IOffsetAttribute>();
                        IPositionIncrementAttribute posIncrAttribute = FieldState.AttributeSource_Renamed.AddAttribute <IPositionIncrementAttribute>();

                        if (hasMoreTokens)
                        {
                            Consumer.Start(field);

                            do
                            {
                                // If we hit an exception in stream.next below
                                // (which is fairly common, eg if analyzer
                                // chokes on a given document), then it's
                                // non-aborting and (above) this one document
                                // will be marked as deleted, but still
                                // consume a docID

                                int posIncr = posIncrAttribute.PositionIncrement;
                                if (posIncr < 0)
                                {
                                    throw new System.ArgumentException("position increment must be >=0 (got " + posIncr + ") for field '" + field.Name() + "'");
                                }
                                if (FieldState.Position_Renamed == 0 && posIncr == 0)
                                {
                                    throw new System.ArgumentException("first position increment must be > 0 (got 0) for field '" + field.Name() + "'");
                                }
                                int position = FieldState.Position_Renamed + posIncr;
                                if (position > 0)
                                {
                                    // NOTE: confusing: this "mirrors" the
                                    // position++ we do below
                                    position--;
                                }
                                else if (position < 0)
                                {
                                    throw new System.ArgumentException("position overflow for field '" + field.Name() + "'");
                                }

                                // position is legal, we can safely place it in fieldState now.
                                // not sure if anything will use fieldState after non-aborting exc...
                                FieldState.Position_Renamed = position;

                                if (posIncr == 0)
                                {
                                    FieldState.NumOverlap_Renamed++;
                                }

                                if (checkOffsets)
                                {
                                    int startOffset = FieldState.Offset_Renamed + offsetAttribute.StartOffset();
                                    int endOffset   = FieldState.Offset_Renamed + offsetAttribute.EndOffset();
                                    if (startOffset < 0 || endOffset < startOffset)
                                    {
                                        throw new System.ArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, " + "startOffset=" + startOffset + ",endOffset=" + endOffset + " for field '" + field.Name() + "'");
                                    }
                                    if (startOffset < lastStartOffset)
                                    {
                                        throw new System.ArgumentException("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " for field '" + field.Name() + "'");
                                    }
                                    lastStartOffset = startOffset;
                                }

                                bool success = false;
                                try
                                {
                                    // If we hit an exception in here, we abort
                                    // all buffered documents since the last
                                    // flush, on the likelihood that the
                                    // internal state of the consumer is now
                                    // corrupt and should not be flushed to a
                                    // new segment:
                                    Consumer.Add();
                                    success = true;
                                }
                                finally
                                {
                                    if (!success)
                                    {
                                        DocState.DocWriter.SetAborting();
                                    }
                                }
                                FieldState.Length_Renamed++;
                                FieldState.Position_Renamed++;
                            } while (stream.IncrementToken());
                        }
                        // trigger streams to perform end-of-stream operations
                        stream.End();
                        // TODO: maybe add some safety? then again, its already checked
                        // when we come back around to the field...
                        FieldState.Position_Renamed += posIncrAttribute.PositionIncrement;
                        FieldState.Offset_Renamed   += offsetAttribute.EndOffset();

                        if (DocState.MaxTermPrefix != null)
                        {
                            string msg = "Document contains at least one immense term in field=\"" + fieldInfo.Name + "\" (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped.  Please correct the analyzer to not produce such terms.  The prefix of the first immense term is: '" + DocState.MaxTermPrefix + "...'";
                            if (DocState.InfoStream.IsEnabled("IW"))
                            {
                                DocState.InfoStream.Message("IW", "ERROR: " + msg);
                            }
                            DocState.MaxTermPrefix = null;
                            throw new System.ArgumentException(msg);
                        }

                        /* if success was false above there is an exception coming through and we won't get here.*/
                        succeededInProcessingField = true;
                    }
                    finally
                    {
                        if (!succeededInProcessingField)
                        {
                            IOUtils.CloseWhileHandlingException(stream);
                        }
                        else
                        {
                            stream.Dispose();
                        }
                        if (!succeededInProcessingField && DocState.InfoStream.IsEnabled("DW"))
                        {
                            DocState.InfoStream.Message("DW", "An exception was thrown while processing field " + fieldInfo.Name);
                        }
                    }

                    FieldState.Offset_Renamed += analyzed ? DocState.Analyzer.GetOffsetGap(fieldInfo.Name) : 0;
                    FieldState.Boost_Renamed  *= field.GetBoost();
                }

                // LUCENE-2387: don't hang onto the field, so GC can
                // reclaim
                fields[i] = null;
            }

            Consumer.Finish();
            EndConsumer.Finish();
        }
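ProcessFields() relies on the standard TokenStream consumption contract: Reset, then IncrementToken in a loop, then End, then Dispose. A minimal sketch of that contract in isolation; the analyzer call is a placeholder, since the exact Analyzer method name varies between Lucene.NET versions:

            TokenStream stream = analyzer.GetTokenStream("body", new StringReader(text)); // assumed analyzer API
            var termAtt   = stream.AddAttribute<ICharTermAttribute>();
            var posIncAtt = stream.AddAttribute<IPositionIncrementAttribute>();
            try
            {
                stream.Reset();                  // must be called before the first IncrementToken()
                while (stream.IncrementToken())
                {
                    // consume one token: its term text plus its position increment
                    Console.WriteLine(termAtt.ToString() + " +" + posIncAtt.PositionIncrement);
                }
                stream.End();                    // lets the stream report final offset / position increment
            }
            finally
            {
                stream.Dispose();
            }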
Example #3
        // offsetsAreCorrect also validates:
        //   - graph offsets are correct (all tokens leaving from
        //     pos X have the same startOffset; all tokens
        //     arriving to pos Y have the same endOffset)
        //   - offsets only move forwards (startOffset >=
        //     lastStartOffset)
        public static void AssertTokenStreamContents(TokenStream ts, string[] output, int[] startOffsets, int[] endOffsets, string[] types, int[] posIncrements, int[] posLengths, int? finalOffset, int? finalPosInc, bool[] keywordAtts, bool offsetsAreCorrect)
        {
            Assert.IsNotNull(output);
            var checkClearAtt = ts.AddAttribute <ICheckClearAttributesAttribute>();

            ICharTermAttribute termAtt = null;

            if (output.Length > 0)
            {
                Assert.IsTrue(ts.HasAttribute <ICharTermAttribute>(), "has no CharTermAttribute");
                termAtt = ts.GetAttribute <ICharTermAttribute>();
            }

            IOffsetAttribute offsetAtt = null;

            if (startOffsets != null || endOffsets != null || finalOffset != null)
            {
                Assert.IsTrue(ts.HasAttribute <IOffsetAttribute>(), "has no OffsetAttribute");
                offsetAtt = ts.GetAttribute <IOffsetAttribute>();
            }

            ITypeAttribute typeAtt = null;

            if (types != null)
            {
                Assert.IsTrue(ts.HasAttribute <ITypeAttribute>(), "has no TypeAttribute");
                typeAtt = ts.GetAttribute <ITypeAttribute>();
            }

            IPositionIncrementAttribute posIncrAtt = null;

            if (posIncrements != null || finalPosInc != null)
            {
                Assert.IsTrue(ts.HasAttribute <IPositionIncrementAttribute>(), "has no PositionIncrementAttribute");
                posIncrAtt = ts.GetAttribute <IPositionIncrementAttribute>();
            }

            IPositionLengthAttribute posLengthAtt = null;

            if (posLengths != null)
            {
                Assert.IsTrue(ts.HasAttribute <IPositionLengthAttribute>(), "has no PositionLengthAttribute");
                posLengthAtt = ts.GetAttribute <IPositionLengthAttribute>();
            }

            IKeywordAttribute keywordAtt = null;

            if (keywordAtts != null)
            {
                Assert.IsTrue(ts.HasAttribute <IKeywordAttribute>(), "has no KeywordAttribute");
                keywordAtt = ts.GetAttribute <IKeywordAttribute>();
            }

            // Maps position to the start/end offset:
            IDictionary <int?, int?> posToStartOffset = new Dictionary <int?, int?>();
            IDictionary <int?, int?> posToEndOffset   = new Dictionary <int?, int?>();

            ts.Reset();
            int pos             = -1;
            int lastStartOffset = 0;

            for (int i = 0; i < output.Length; i++)
            {
                // extra safety to enforce, that the state is not preserved and also assign bogus values
                ts.ClearAttributes();
                termAtt.SetEmpty().Append("bogusTerm");
                if (offsetAtt != null)
                {
                    offsetAtt.SetOffset(14584724, 24683243);
                }
                if (typeAtt != null)
                {
                    typeAtt.Type = "bogusType";
                }
                if (posIncrAtt != null)
                {
                    posIncrAtt.PositionIncrement = 45987657;
                }
                if (posLengthAtt != null)
                {
                    posLengthAtt.PositionLength = 45987653;
                }
                if (keywordAtt != null)
                {
                    keywordAtt.Keyword = (i & 1) == 0;
                }

                bool reset = checkClearAtt.AndResetClearCalled; // reset it, because we called clearAttribute() before
                Assert.IsTrue(ts.IncrementToken(), "token " + i + " does not exist");
                Assert.IsTrue(reset, "ClearAttributes() was not called correctly in TokenStream chain");

                Assert.AreEqual(output[i], termAtt.ToString(), "term " + i + ", output[i] = " + output[i] + ", termAtt = " + termAtt.ToString());
                if (startOffsets != null)
                {
                    Assert.AreEqual(startOffsets[i], offsetAtt.StartOffset(), "startOffset " + i);
                }
                if (endOffsets != null)
                {
                    Assert.AreEqual(endOffsets[i], offsetAtt.EndOffset(), "endOffset " + i);
                }
                if (types != null)
                {
                    Assert.AreEqual(types[i], typeAtt.Type, "type " + i);
                }
                if (posIncrements != null)
                {
                    Assert.AreEqual(posIncrements[i], posIncrAtt.PositionIncrement, "posIncrement " + i);
                }
                if (posLengths != null)
                {
                    Assert.AreEqual(posLengths[i], posLengthAtt.PositionLength, "posLength " + i);
                }
                if (keywordAtts != null)
                {
                    Assert.AreEqual(keywordAtts[i], keywordAtt.Keyword, "keywordAtt " + i);
                }

                // we can enforce some basic things about a few attributes even if the caller doesn't check:
                if (offsetAtt != null)
                {
                    int startOffset = offsetAtt.StartOffset();
                    int endOffset   = offsetAtt.EndOffset();
                    if (finalOffset != null)
                    {
                        Assert.IsTrue(startOffset <= (int)finalOffset, "startOffset must be <= finalOffset");
                        Assert.IsTrue(endOffset <= (int)finalOffset, "endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset=" + (int)finalOffset);
                    }

                    if (offsetsAreCorrect)
                    {
                        Assert.IsTrue(offsetAtt.StartOffset() >= lastStartOffset, "offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset);
                        lastStartOffset = offsetAtt.StartOffset();
                    }

                    if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null)
                    {
                        // Validate offset consistency in the graph, ie
                        // all tokens leaving from a certain pos have the
                        // same startOffset, and all tokens arriving to a
                        // certain pos have the same endOffset:
                        int posInc = posIncrAtt.PositionIncrement;
                        pos += posInc;

                        int posLength = posLengthAtt.PositionLength;

                        if (!posToStartOffset.ContainsKey(pos))
                        {
                            // First time we've seen a token leaving from this position:
                            posToStartOffset[pos] = startOffset;
                            //System.out.println("  + s " + pos + " -> " + startOffset);
                        }
                        else
                        {
                            // We've seen a token leaving from this position
                            // before; verify the startOffset is the same:
                            //System.out.println("  + vs " + pos + " -> " + startOffset);
                            Assert.AreEqual((int)posToStartOffset[pos], startOffset, "pos=" + pos + " posLen=" + posLength + " token=" + termAtt);
                        }

                        int endPos = pos + posLength;

                        if (!posToEndOffset.ContainsKey(endPos))
                        {
                            // First time we've seen a token arriving to this position:
                            posToEndOffset[endPos] = endOffset;
                            //System.out.println("  + e " + endPos + " -> " + endOffset);
                        }
                        else
                        {
                            // We've seen a token arriving to this position
                            // before; verify the endOffset is the same:
                            //System.out.println("  + ve " + endPos + " -> " + endOffset);
                            Assert.AreEqual((int)posToEndOffset[endPos], endOffset, "pos=" + pos + " posLen=" + posLength + " token=" + termAtt);
                        }
                    }
                }
                if (posIncrAtt != null)
                {
                    if (i == 0)
                    {
                        Assert.IsTrue(posIncrAtt.PositionIncrement >= 1, "first posIncrement must be >= 1");
                    }
                    else
                    {
                        Assert.IsTrue(posIncrAtt.PositionIncrement >= 0, "posIncrement must be >= 0");
                    }
                }
                if (posLengthAtt != null)
                {
                    Assert.IsTrue(posLengthAtt.PositionLength >= 1, "posLength must be >= 1");
                }
            }

            if (ts.IncrementToken())
            {
                Assert.Fail("TokenStream has more tokens than expected (expected count=" + output.Length + "); extra token=" + termAtt);
            }

            // repeat our extra safety checks for End()
            ts.ClearAttributes();
            if (termAtt != null)
            {
                termAtt.SetEmpty().Append("bogusTerm");
            }
            if (offsetAtt != null)
            {
                offsetAtt.SetOffset(14584724, 24683243);
            }
            if (typeAtt != null)
            {
                typeAtt.Type = "bogusType";
            }
            if (posIncrAtt != null)
            {
                posIncrAtt.PositionIncrement = 45987657;
            }
            if (posLengthAtt != null)
            {
                posLengthAtt.PositionLength = 45987653;
            }

            var reset_ = checkClearAtt.AndResetClearCalled; // reset it, because we called clearAttribute() before

            ts.End();
            Assert.IsTrue(checkClearAtt.AndResetClearCalled, "super.End()/ClearAttributes() was not called correctly in End()");

            if (finalOffset != null)
            {
                Assert.AreEqual((int)finalOffset, offsetAtt.EndOffset(), "finalOffset");
            }
            if (offsetAtt != null)
            {
                Assert.IsTrue(offsetAtt.EndOffset() >= 0, "finalOffset must be >= 0");
            }
            if (finalPosInc != null)
            {
                Assert.AreEqual((int)finalPosInc, posIncrAtt.PositionIncrement, "finalPosInc");
            }

            ts.Dispose();
        }
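A hypothetical call against the overload above, checking a stream ts expected to produce the two tokens "quick" and "brown" (the expected values are illustrative; a null array means that attribute is not checked):

            AssertTokenStreamContents(ts,
                output:            new[] { "quick", "brown" },
                startOffsets:      new[] { 0, 6 },
                endOffsets:        new[] { 5, 11 },
                types:             null,              // skip type checks
                posIncrements:     new[] { 1, 1 },
                posLengths:        null,               // skip position-length checks
                finalOffset:       11,
                finalPosInc:       null,
                keywordAtts:       null,
                offsetsAreCorrect: true);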
Example #4
        /*
         * Need to worry about multiple scenarios:
         *  - need to go for the longest match
         *    a b => foo      #shouldn't match if "a b" is followed by "c d"
         *    a b c d => bar
         *  - need to backtrack - retry matches for tokens already read
         *     a b c d => foo
         *       b c => bar
         *     If the input stream is "a b c x", one will consume "a b c d"
         *     trying to match the first rule... all but "a" should be
         *     pushed back so a match may be made on "b c".
         *  - don't try and match generated tokens (thus need separate queue)
         *    matching is not recursive.
         *  - handle optional generation of original tokens in all these cases,
         *    merging token streams to preserve token positions.
         *  - preserve original positionIncrement of first matched token
         */
        public override bool IncrementToken()
        {
            while (true)
            {
                // if there are any generated tokens, return them... don't try any
                // matches against them, as we specifically don't want recursion.
                if (replacement != null && replacement.MoveNext())
                {
                    Copy(this, replacement.Current);
                    return(true);
                }

                // common case fast-path of first token not matching anything
                AttributeSource firstTok = NextTok();
                if (firstTok == null)
                {
                    return(false);
                }
                var            termAtt = firstTok.AddAttribute <ICharTermAttribute>();
                SlowSynonymMap result  = map.submap != null ? map.submap.Get(termAtt.Buffer(), 0, termAtt.Length) : null;

                if (result == null)
                {
                    Copy(this, firstTok);
                    return(true);
                }

                // fast-path failed, clone ourselves if needed
                if (firstTok == this)
                {
                    firstTok = CloneAttributes();
                }
                // OK, we matched a token, so find the longest match.

                matched = new LinkedList <AttributeSource>();

                result = Match(result);

                if (result == null)
                {
                    // no match, simply return the first token read.
                    Copy(this, firstTok);
                    return(true);
                }

                // reuse, or create new one each time?
                List <AttributeSource> generated = new List <AttributeSource>(result.synonyms.Length + matched.Count + 1);

                //
                // there was a match... let's generate the new tokens, merging
                // in the matched tokens (position increments need adjusting)
                //
                AttributeSource lastTok     = matched.Count == 0 ? firstTok : matched.Last.Value;
                bool            includeOrig = result.IncludeOrig;

                AttributeSource             origTok        = includeOrig ? firstTok : null;
                IPositionIncrementAttribute firstPosIncAtt = firstTok.AddAttribute <IPositionIncrementAttribute>();
                int origPos = firstPosIncAtt.PositionIncrement; // position of origTok in the original stream
                int repPos  = 0;                                // curr position in replacement token stream
                int pos     = 0;                                // current position in merged token stream

                for (int i = 0; i < result.synonyms.Length; i++)
                {
                    Token                       repTok       = result.synonyms[i];
                    AttributeSource             newTok       = firstTok.CloneAttributes();
                    ICharTermAttribute          newTermAtt   = newTok.AddAttribute <ICharTermAttribute>();
                    IOffsetAttribute            newOffsetAtt = newTok.AddAttribute <IOffsetAttribute>();
                    IPositionIncrementAttribute newPosIncAtt = newTok.AddAttribute <IPositionIncrementAttribute>();

                    IOffsetAttribute lastOffsetAtt = lastTok.AddAttribute <IOffsetAttribute>();

                    newOffsetAtt.SetOffset(newOffsetAtt.StartOffset(), lastOffsetAtt.EndOffset());
                    newTermAtt.CopyBuffer(repTok.Buffer(), 0, repTok.Length);
                    repPos += repTok.PositionIncrement;
                    if (i == 0) // make position of first token equal to original
                    {
                        repPos = origPos;
                    }

                    // if necessary, insert original tokens and adjust position increment
                    while (origTok != null && origPos <= repPos)
                    {
                        IPositionIncrementAttribute origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>();
                        origPosInc.PositionIncrement = origPos - pos;
                        generated.Add(origTok);
                        pos += origPosInc.PositionIncrement;
                        //origTok = matched.Count == 0 ? null : matched.RemoveFirst();
                        if (matched.Count == 0)
                        {
                            origTok = null;
                        }
                        else
                        {
                            origTok = matched.First.Value;
                            matched.RemoveFirst();
                        }
                        if (origTok != null)
                        {
                            origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>();
                            origPos   += origPosInc.PositionIncrement;
                        }
                    }

                    newPosIncAtt.PositionIncrement = repPos - pos;
                    generated.Add(newTok);
                    pos += newPosIncAtt.PositionIncrement;
                }

                // finish up any leftover original tokens
                while (origTok != null)
                {
                    IPositionIncrementAttribute origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>();
                    origPosInc.PositionIncrement = origPos - pos;
                    generated.Add(origTok);
                    pos += origPosInc.PositionIncrement;
                    //origTok = matched.Count == 0 ? null : matched.RemoveFirst();
                    if (matched.Count == 0)
                    {
                        origTok = null;
                    }
                    else
                    {
                        origTok = matched.First.Value;
                        matched.RemoveFirst();
                    }
                    if (origTok != null)
                    {
                        origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>();
                        origPos   += origPosInc.PositionIncrement;
                    }
                }

                // what if we replaced a longer sequence with a shorter one?
                // a/0 b/5 =>  foo/0
                // should I re-create the gap on the next buffered token?

                replacement = generated.GetEnumerator();
                // Now return to the top of the loop to read and return the first
                // generated token.. The reason this is done is that we may have generated
                // nothing at all, and may need to continue with more matching logic.
            }
        }
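A toy sketch of the longest-match-with-backtracking scenario from the header comment above (rules "a b c d" and "b c", input "a b c x"); the phrase set stands in for the real SlowSynonymMap and all names here are illustrative only:

            // Requires System.Collections.Generic. Rules and token values are illustrative.
            var rules   = new HashSet<string> { "a b c d", "b c" };
            string[] input = { "a", "b", "c", "x" };
            var pending = new Queue<string>(input);
            var output  = new List<string>();
            while (pending.Count > 0)
            {
                string[] buffered = pending.ToArray();
                int matchedLen = 0;
                string phrase = null;
                for (int i = 0; i < buffered.Length; i++)
                {
                    phrase = phrase == null ? buffered[i] : phrase + " " + buffered[i];
                    if (rules.Contains(phrase))
                    {
                        matchedLen = i + 1; // longest match so far
                    }
                }
                if (matchedLen > 0)
                {
                    for (int i = 0; i < matchedLen; i++)
                    {
                        pending.Dequeue(); // consume the matched tokens
                    }
                    output.Add("<" + string.Join(" ", buffered, 0, matchedLen) + ">");
                }
                else
                {
                    output.Add(pending.Dequeue()); // no match: emit the first token, retry from the next
                }
            }
            // output is: a, <b c>, x  -- "a b c d" is attempted but only "a" is consumed before retrying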
Example #5
        /// <summary>
        /// Returns the next token in the stream, or null at EOS.
        /// </summary>
        public override bool IncrementToken()
        {
            while (true)
            {
                if (curTermBuffer == null)
                {
                    if (!input.IncrementToken())
                    {
                        return(false);
                    }
                    else
                    {
                        curTermBuffer     = (char[])termAtt.Buffer().Clone();
                        curTermLength     = termAtt.Length;
                        curCodePointCount = charUtils.CodePointCount(termAtt.ToString());
                        curGramSize       = minGram;
                        curPos            = 0;
                        curPosInc         = posIncAtt.PositionIncrement;
                        curPosLen         = posLenAtt.PositionLength;
                        tokStart          = offsetAtt.StartOffset();
                        tokEnd            = offsetAtt.EndOffset();
                        // if length by start + end offsets doesn't match the term text then assume
                        // this is a synonym and don't adjust the offsets.
                        hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
                    }
                }
#pragma warning disable 612, 618
                if (version.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
                {
                    if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount)
                    {
                        ++curPos;
                        curGramSize = minGram;
                    }
                    if ((curPos + curGramSize) <= curCodePointCount)
                    {
                        ClearAttributes();
                        int start = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
                        int end   = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                        termAtt.CopyBuffer(curTermBuffer, start, end - start);
                        posIncAtt.PositionIncrement = curPosInc;
                        curPosInc = 0;
                        posLenAtt.PositionLength = curPosLen;
                        offsetAtt.SetOffset(tokStart, tokEnd);
                        curGramSize++;
                        return(true);
                    }
                }
                else
                {
                    while (curGramSize <= maxGram)
                    {
                        while (curPos + curGramSize <= curTermLength) // while there is input
                        {
                            ClearAttributes();
                            termAtt.CopyBuffer(curTermBuffer, curPos, curGramSize);
                            if (hasIllegalOffsets)
                            {
                                offsetAtt.SetOffset(tokStart, tokEnd);
                            }
                            else
                            {
                                offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize);
                            }
                            curPos++;
                            return(true);
                        }
                        curGramSize++; // increase n-gram size
                        curPos = 0;
                    }
                }
                curTermBuffer = null;
            }
        }
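In the LUCENE_44-and-later branch above, grams are emitted grouped by start position: all sizes at position 0, then all sizes at position 1, and so on. A standalone sketch of that emission order, operating on chars only and ignoring the code-point handling of the real filter:

            // Requires System.Collections.Generic.
            static IEnumerable<string> NGrams(string term, int minGram, int maxGram)
            {
                for (int pos = 0; pos + minGram <= term.Length; pos++)
                {
                    for (int size = minGram; size <= maxGram && pos + size <= term.Length; size++)
                    {
                        yield return term.Substring(pos, size);
                    }
                }
            }
            // NGrams("abcd", 1, 2) yields: a, ab, b, bc, c, cd, d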
Example #6
        public static void VerifyEquals(Fields d1, Fields d2)
        {
            if (d1 == null)
            {
                Assert.IsTrue(d2 == null || d2.Size == 0);
                return;
            }
            Assert.IsTrue(d2 != null);

            IEnumerator <string> fieldsEnum2 = d2.GetEnumerator();

            foreach (string field1 in d1)
            {
                fieldsEnum2.MoveNext();
                string field2 = fieldsEnum2.Current;
                Assert.AreEqual(field1, field2);

                Terms terms1 = d1.Terms(field1);
                Assert.IsNotNull(terms1);
                TermsEnum termsEnum1 = terms1.Iterator(null);

                Terms terms2 = d2.Terms(field2);
                Assert.IsNotNull(terms2);
                TermsEnum termsEnum2 = terms2.Iterator(null);

                DocsAndPositionsEnum dpEnum1 = null;
                DocsAndPositionsEnum dpEnum2 = null;
                DocsEnum             dEnum1  = null;
                DocsEnum             dEnum2  = null;

                BytesRef term1;
                while ((term1 = termsEnum1.Next()) != null)
                {
                    BytesRef term2 = termsEnum2.Next();
                    Assert.AreEqual(term1, term2);
                    Assert.AreEqual(termsEnum1.TotalTermFreq(), termsEnum2.TotalTermFreq());

                    dpEnum1 = termsEnum1.DocsAndPositions(null, dpEnum1);
                    dpEnum2 = termsEnum2.DocsAndPositions(null, dpEnum2);
                    if (dpEnum1 != null)
                    {
                        Assert.IsNotNull(dpEnum2);
                        int docID1 = dpEnum1.NextDoc();
                        dpEnum2.NextDoc();
                        // docIDs are not supposed to be equal
                        //int docID2 = dpEnum2.NextDoc();
                        //Assert.AreEqual(docID1, docID2);
                        Assert.IsTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);

                        int freq1 = dpEnum1.Freq();
                        int freq2 = dpEnum2.Freq();
                        Assert.AreEqual(freq1, freq2);
                        IOffsetAttribute offsetAtt1 = dpEnum1.Attributes().HasAttribute <IOffsetAttribute>() ? dpEnum1.Attributes().GetAttribute <IOffsetAttribute>() : null;
                        IOffsetAttribute offsetAtt2 = dpEnum2.Attributes().HasAttribute <IOffsetAttribute>() ? dpEnum2.Attributes().GetAttribute <IOffsetAttribute>() : null;

                        if (offsetAtt1 != null)
                        {
                            Assert.IsNotNull(offsetAtt2);
                        }
                        else
                        {
                            Assert.IsNull(offsetAtt2);
                        }

                        for (int posUpto = 0; posUpto < freq1; posUpto++)
                        {
                            int pos1 = dpEnum1.NextPosition();
                            int pos2 = dpEnum2.NextPosition();
                            Assert.AreEqual(pos1, pos2);
                            if (offsetAtt1 != null)
                            {
                                Assert.AreEqual(offsetAtt1.StartOffset(), offsetAtt2.StartOffset());
                                Assert.AreEqual(offsetAtt1.EndOffset(), offsetAtt2.EndOffset());
                            }
                        }
                        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum1.NextDoc());
                        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum2.NextDoc());
                    }
                    else
                    {
                        dEnum1 = TestUtil.Docs(Random(), termsEnum1, null, dEnum1, DocsEnum.FLAG_FREQS);
                        dEnum2 = TestUtil.Docs(Random(), termsEnum2, null, dEnum2, DocsEnum.FLAG_FREQS);
                        Assert.IsNotNull(dEnum1);
                        Assert.IsNotNull(dEnum2);
                        int docID1 = dEnum1.NextDoc();
                        dEnum2.NextDoc();
                        // docIDs are not supposed to be equal
                        //int docID2 = dEnum2.NextDoc();
                        //Assert.AreEqual(docID1, docID2);
                        Assert.IsTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);
                        int freq1 = dEnum1.Freq();
                        int freq2 = dEnum2.Freq();
                        Assert.AreEqual(freq1, freq2);
                        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dEnum1.NextDoc());
                        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dEnum2.NextDoc());
                    }
                }

                Assert.IsNull(termsEnum2.Next());
            }
            Assert.IsFalse(fieldsEnum2.MoveNext());
        }
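A hypothetical call site for VerifyEquals, comparing the term vectors of two corresponding documents; the reader variables, doc ids, and GetTermVectors naming are assumptions, not taken from the snippet above:

            Fields vectors1 = reader1.GetTermVectors(docId1); // assumed IndexReader API
            Fields vectors2 = reader2.GetTermVectors(docId2);
            VerifyEquals(vectors1, vectors2);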
Example #7
        /// <summary>
        /// Iterates over the given token stream and adds the resulting terms to the index;
        /// Equivalent to adding a tokenized, indexed, termVectorStored, unstored,
        /// Lucene <see cref="Documents.Field"/>.
        /// Finally closes the token stream. Note that untokenized keywords can be added with this method via
        /// <see cref="KeywordTokenStream{T}(ICollection{T})"/>, the Lucene <c>KeywordTokenizer</c> or similar utilities.
        ///
        /// </summary>
        /// <param name="fieldName"> a name to be associated with the text </param>
        /// <param name="stream"> the token stream to retrieve tokens from. </param>
        /// <param name="boost"> the boost factor for hits for this field </param>
        /// <param name="positionIncrementGap"> the position increment gap if fields with the same name are added more than once </param>
        /// <param name="offsetGap"> the offset gap if fields with the same name are added more than once </param>
        /// <seealso cref="Documents.Field.Boost(float)"/>
        public virtual void AddField(string fieldName, TokenStream stream, float boost, int positionIncrementGap, int offsetGap)
        {
            try
            {
                if (fieldName == null)
                {
                    throw new System.ArgumentException("fieldName must not be null");
                }
                if (stream == null)
                {
                    throw new System.ArgumentException("token stream must not be null");
                }
                if (boost <= 0.0f)
                {
                    throw new System.ArgumentException("boost factor must be greater than 0.0");
                }
                int                 numTokens        = 0;
                int                 numOverlapTokens = 0;
                int                 pos = -1;
                BytesRefHash        terms;
                SliceByteStartArray sliceArray;
                Info                info             = null;
                long                sumTotalTermFreq = 0;
                int                 offset           = 0;
                if (fields.TryGetValue(fieldName, out info))
                {
                    numTokens        = info.numTokens;
                    numOverlapTokens = info.numOverlapTokens;
                    pos              = info.lastPosition + positionIncrementGap;
                    offset           = info.lastOffset + offsetGap;
                    terms            = info.terms;
                    boost           *= info.boost;
                    sliceArray       = info.sliceArray;
                    sumTotalTermFreq = info.sumTotalTermFreq;
                }
                else
                {
                    sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY);
                    terms      = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray);
                }

                if (!fieldInfos.ContainsKey(fieldName))
                {
                    fieldInfos[fieldName] = new FieldInfo(fieldName, true, fieldInfos.Count, false, false, false, this.storeOffsets ? FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, null, null, null);
                }
                ITermToBytesRefAttribute    termAtt          = stream.GetAttribute <ITermToBytesRefAttribute>();
                IPositionIncrementAttribute posIncrAttribute = stream.AddAttribute <IPositionIncrementAttribute>();
                IOffsetAttribute            offsetAtt        = stream.AddAttribute <IOffsetAttribute>();
                BytesRef @ref = termAtt.BytesRef;
                stream.Reset();

                while (stream.IncrementToken())
                {
                    termAtt.FillBytesRef();
                    //        if (DEBUG) System.err.println("token='" + term + "'");
                    numTokens++;
                    int posIncr = posIncrAttribute.PositionIncrement;
                    if (posIncr == 0)
                    {
                        numOverlapTokens++;
                    }
                    pos += posIncr;
                    int ord = terms.Add(@ref);
                    if (ord < 0)
                    {
                        ord = (-ord) - 1;
                        postingsWriter.Reset(sliceArray.end[ord]);
                    }
                    else
                    {
                        sliceArray.start[ord] = postingsWriter.StartNewSlice();
                    }
                    sliceArray.freq[ord]++;
                    sumTotalTermFreq++;
                    if (!storeOffsets)
                    {
                        postingsWriter.WriteInt(pos);
                    }
                    else
                    {
                        postingsWriter.WriteInt(pos);
                        postingsWriter.WriteInt(offsetAtt.StartOffset() + offset);
                        postingsWriter.WriteInt(offsetAtt.EndOffset() + offset);
                    }
                    sliceArray.end[ord] = postingsWriter.CurrentOffset;
                }
                stream.End();

                // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
                if (numTokens > 0)
                {
                    fields[fieldName] = new Info(terms, sliceArray, numTokens, numOverlapTokens, boost, pos, offsetAtt.EndOffset() + offset, sumTotalTermFreq);
                    sortedFields      = null; // invalidate sorted view, if any
                }
            } // can never happen
            catch (Exception /*e*/)
            {
                throw;
            }
            finally
            {
                try
                {
                    if (stream != null)
                    {
                        stream.Dispose();
                    }
                }
                catch (IOException /*e2*/)
                {
                    throw;
                }
            }
        }
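Hypothetical usage of the AddField overload above; the MemoryIndex construction is standard, but the analyzer call is a placeholder whose exact naming varies between Lucene.NET versions:

            var memoryIndex = new MemoryIndex();
            TokenStream ts = analyzer.GetTokenStream("content", new StringReader("some text to index")); // assumed analyzer API
            memoryIndex.AddField("content", ts, boost: 1.0f, positionIncrementGap: 0, offsetGap: 1);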
Example #8
 public override bool IncrementToken()
 {
     while (true)
     {
         if (curTermBuffer == null)
         {
             if (!input.IncrementToken())
             {
                 return(false);
             }
             else
             {
                 curTermBuffer     = (char[])termAtt.Buffer().Clone();
                 curTermLength     = termAtt.Length;
                 curCodePointCount = charUtils.CodePointCount(termAtt.ToString());
                 curGramSize       = minGram;
                 tokStart          = offsetAtt.StartOffset();
                 tokEnd            = offsetAtt.EndOffset();
                 if (version.OnOrAfter(LuceneVersion.LUCENE_44))
                 {
                     // Never update offsets
                     updateOffsets = false;
                 }
                 else
                 {
                     // if length by start + end offsets doesn't match the term text then assume
                     // this is a synonym and don't adjust the offsets.
                     updateOffsets = (tokStart + curTermLength) == tokEnd;
                 }
                 savePosIncr += posIncrAtt.PositionIncrement;
                 savePosLen   = posLenAtt.PositionLength;
             }
         }
         if (curGramSize <= maxGram)               // if we have hit the end of our n-gram size range, quit
         {
             if (curGramSize <= curCodePointCount) // if the remaining input is too short, we can't generate any n-grams
             {
                 // grab gramSize chars from front or back
                 int start = side == Side.FRONT ? 0 : charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize);
                 int end   = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                 ClearAttributes();
                 if (updateOffsets)
                 {
                     offsetAtt.SetOffset(tokStart + start, tokStart + end);
                 }
                 else
                 {
                     offsetAtt.SetOffset(tokStart, tokEnd);
                 }
                 // first ngram gets increment, others don't
                 if (curGramSize == minGram)
                 {
                     posIncrAtt.PositionIncrement = savePosIncr;
                     savePosIncr = 0;
                 }
                 else
                 {
                     posIncrAtt.PositionIncrement = 0;
                 }
                 posLenAtt.PositionLength = savePosLen;
                 termAtt.CopyBuffer(curTermBuffer, start, end - start);
                 curGramSize++;
                 return(true);
             }
         }
         curTermBuffer = null;
     }
 }
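The loop above emits edge n-grams of increasing size for each input token, anchored at the front (or back) of the term. A standalone sketch of the front-side case, chars only, ignoring the code-point handling of the real filter:

     // Requires System.Collections.Generic.
     static IEnumerable<string> FrontEdgeNGrams(string term, int minGram, int maxGram)
     {
         for (int size = minGram; size <= maxGram && size <= term.Length; size++)
         {
             yield return term.Substring(0, size);
         }
     }
     // FrontEdgeNGrams("lucene", 2, 4) yields: lu, luc, luce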
Example #9
        /*
         * much of this complexity revolves around handling the special case of a
         * "lone cjk character" where cjktokenizer would output a unigram. this
         * is also the only time we ever have to captureState.
         */
        public override bool IncrementToken()
        {
            while (true)
            {
                if (HasBufferedBigram)
                {
                    // case 1: we have multiple remaining codepoints buffered,
                    // so we can emit a bigram here.

                    if (outputUnigrams)
                    {
                        // when also outputting unigrams, we output the unigram first,
                        // then rewind back to revisit the bigram.
                        // so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
                        // the logic in hasBufferedUnigram ensures we output the C,
                        // even though it did actually have adjacent CJK characters.

                        if (ngramState)
                        {
                            FlushBigram();
                        }
                        else
                        {
                            FlushUnigram();
                            index--;
                        }
                        ngramState = !ngramState;
                    }
                    else
                    {
                        FlushBigram();
                    }
                    return(true);
                }
                else if (DoNext())
                {
                    // case 2: look at the token type. should we form any n-grams?

                    string type = typeAtt.Type;
                    if (type == doHan || type == doHiragana || type == doKatakana || type == doHangul)
                    {
                        // acceptable CJK type: we form n-grams from these.
                        // as long as the offsets are aligned, we just add these to our current buffer.
                        // otherwise, we clear the buffer and start over.

                        if (offsetAtt.StartOffset() != lastEndOffset) // unaligned, clear queue
                        {
                            if (HasBufferedUnigram)
                            {
                                // we have a buffered unigram, and we peeked ahead to see if we could form
                                // a bigram, but we can't, because the offsets are unaligned. capture the state
                                // of this peeked data to be revisited next time thru the loop, and dump our unigram.

                                loneState = CaptureState();
                                FlushUnigram();
                                return(true);
                            }
                            index     = 0;
                            bufferLen = 0;
                        }
                        Refill();
                    }
                    else
                    {
                        // not a CJK type: we just return these as-is.

                        if (HasBufferedUnigram)
                        {
                            // we have a buffered unigram, and we peeked ahead to see if we could form
                            // a bigram, but we can't, because its not a CJK type. capture the state
                            // of this peeked data to be revisited next time thru the loop, and dump our unigram.

                            loneState = CaptureState();
                            FlushUnigram();
                            return(true);
                        }
                        return(true);
                    }
                }
                else
                {
                    // case 3: we have only zero or 1 codepoints buffered,
                    // so not enough to form a bigram. But, we also have no
                    // more input. So if we have a buffered codepoint, emit
                    // a unigram, otherwise, its end of stream.

                    if (HasBufferedUnigram)
                    {
                        FlushUnigram(); // flush our remaining unigram
                        return(true);
                    }
                    return(false);
                }
            }
        }
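For context, a call site for the bigram filter above might look like the following. This is only a minimal sketch assuming Lucene.NET 4.8-style APIs; the namespace casing (Lucene.Net.Analysis.Cjk, Lucene.Net.Analysis.TokenAttributes), the LuceneVersion value, and the single-argument CJKBigramFilter constructor are assumptions rather than part of the example above.

// Minimal usage sketch (assumed Lucene.NET 4.8 namespaces; casing may differ per beta).
using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Cjk;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;

public static class CjkBigramDemo
{
    public static void Main()
    {
        // Tokenize some CJK text, then join adjacent CJK tokens into bigrams.
        TokenStream ts = new StandardTokenizer(LuceneVersion.LUCENE_48, new StringReader("多くの学生"));
        // An overload also accepts script flags and an outputUnigrams switch,
        // which is what triggers the unigram + (rewind) bigram logic shown above.
        ts = new CJKBigramFilter(ts);

        ICharTermAttribute term = ts.AddAttribute<ICharTermAttribute>();
        ts.Reset();
        while (ts.IncrementToken())
        {
            Console.WriteLine(term.ToString());
        }
        ts.End();
        ts.Dispose();
    }
}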
Example No. 10
        // we only check a few core attributes here.
        // TODO: test other things
        public virtual void assertEquals(string s, TokenStream left, TokenStream right)
        {
            left.Reset();
            right.Reset();
            ICharTermAttribute          leftTerm    = left.AddAttribute <ICharTermAttribute>();
            ICharTermAttribute          rightTerm   = right.AddAttribute <ICharTermAttribute>();
            IOffsetAttribute            leftOffset  = left.AddAttribute <IOffsetAttribute>();
            IOffsetAttribute            rightOffset = right.AddAttribute <IOffsetAttribute>();
            IPositionIncrementAttribute leftPos     = left.AddAttribute <IPositionIncrementAttribute>();
            IPositionIncrementAttribute rightPos    = right.AddAttribute <IPositionIncrementAttribute>();

            while (left.IncrementToken())
            {
                assertTrue("wrong number of tokens for input: " + s, right.IncrementToken());
                assertEquals("wrong term text for input: " + s, leftTerm.ToString(), rightTerm.ToString());
                assertEquals("wrong position for input: " + s, leftPos.PositionIncrement, rightPos.PositionIncrement);
                assertEquals("wrong start offset for input: " + s, leftOffset.StartOffset(), rightOffset.StartOffset());
                assertEquals("wrong end offset for input: " + s, leftOffset.EndOffset(), rightOffset.EndOffset());
            }
            assertFalse("wrong number of tokens for input: " + s, right.IncrementToken());
            left.End();
            right.End();
            assertEquals("wrong final offset for input: " + s, leftOffset.EndOffset(), rightOffset.EndOffset());
            left.Dispose();
            right.Dispose();
        }
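A call site for this comparison helper might look like the sketch below; MakeBaselineStream and MakeCandidateStream are hypothetical factory methods standing in for whichever two analysis chains are being compared, and are not part of the original example.

// Hypothetical call site; the two factory methods are placeholders, not real APIs.
string input = "ภาษาไทยง่ายนิดเดียว";
TokenStream left = MakeBaselineStream(input);   // e.g. the reference chain
TokenStream right = MakeCandidateStream(input); // e.g. the chain under test
assertEquals(input, left, right); // fails on the first term/offset/posInc mismatch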
Example No. 11
        public virtual void ToDot()
        {
            @in.Reset();
            WriteHeader();

            // TODO: is there some way to tell dot that it should
            // make the "main path" a straight line and have the
            // non-sausage arcs not affect node placement...

            int pos        = -1;
            int lastEndPos = -1;

            while (@in.IncrementToken())
            {
                bool isFirst = pos == -1;
                int  posInc  = PosIncAtt.PositionIncrement;
                if (isFirst && posInc == 0)
                {
                    // TODO: hmm are TS's still allowed to do this...?
                    Console.Error.WriteLine("WARNING: first posInc was 0; correcting to 1");
                    posInc = 1;
                }

                if (posInc > 0)
                {
                    // New node:
                    pos += posInc;
                    WriteNode(pos, Convert.ToString(pos));
                }

                if (posInc > 1)
                {
                    // Gap!
                    WriteArc(lastEndPos, pos, null, "dotted");
                }

                if (isFirst)
                {
                    WriteNode(-1, null);
                    WriteArc(-1, pos, null, null);
                }

                string arcLabel = TermAtt.ToString();
                if (OffsetAtt != null)
                {
                    int startOffset = OffsetAtt.StartOffset();
                    int endOffset   = OffsetAtt.EndOffset();
                    //System.out.println("start=" + startOffset + " end=" + endOffset + " len=" + inputText.length());
                    if (InputText != null)
                    {
                        arcLabel += " / " + InputText.Substring(startOffset, endOffset - startOffset);
                    }
                    else
                    {
                        arcLabel += " / " + startOffset + "-" + endOffset;
                    }
                }

                WriteArc(pos, pos + PosLengthAtt.PositionLength, arcLabel, null);
                lastEndPos = pos + PosLengthAtt.PositionLength;
            }

            @in.End();

            if (lastEndPos != -1)
            {
                // TODO: should we output any final text (from end
                // offsets) on this arc...?
                WriteNode(-2, null);
                WriteArc(lastEndPos, -2, null, null);
            }

            WriteTrailer();
        }
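A plausible way to drive this visualizer is sketched below. TokenStreamToDot ships with Lucene's test framework; the (inputText, tokenStream, writer) constructor order is assumed from the Java original, so verify it against your Lucene.NET version.

// Sketch only: constructor argument order is assumed from the Java TokenStreamToDot.
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Util;

public static class ToDotDemo
{
    public static void Main()
    {
        string text = "fast wi fi network is down";
        TokenStream ts = new StandardTokenizer(LuceneVersion.LUCENE_48, new StringReader(text));
        using (TextWriter w = new StreamWriter("tokens.dot"))
        {
            new TokenStreamToDot(text, ts, w).ToDot();
        }
        // Render the graph with Graphviz: dot -Tpng tokens.dot -o tokens.png
    }
}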
Example No. 12
        public override bool IncrementToken()
        {
            if (hasMoreTokensInClone)
            {
                int start = breaker.Current();
                int end   = breaker.Next();
                if (end != BreakIterator.DONE)
                {
                    clonedToken.CopyTo(this);
                    termAtt.CopyBuffer(clonedTermAtt.Buffer(), start, end - start);
                    if (hasIllegalOffsets)
                    {
                        offsetAtt.SetOffset(clonedOffsetAtt.StartOffset(), clonedOffsetAtt.EndOffset());
                    }
                    else
                    {
                        offsetAtt.SetOffset(clonedOffsetAtt.StartOffset() + start, clonedOffsetAtt.StartOffset() + end);
                    }
                    if (handlePosIncr)
                    {
                        posAtt.PositionIncrement = 1;
                    }
                    return(true);
                }
                hasMoreTokensInClone = false;
            }

            if (!input.IncrementToken())
            {
                return(false);
            }

            if (termAtt.Length == 0 || !Regex.IsMatch(termAtt.ToString().Substring(0, 1), @"\p{IsThai}"))
            {
                return(true);
            }

            hasMoreTokensInClone = true;

            // if the length implied by the start/end offsets doesn't match the term text's
            // length, assume this is a synonym and don't adjust the offsets.
            hasIllegalOffsets = offsetAtt.EndOffset() - offsetAtt.StartOffset() != termAtt.Length;

            // we lazily init the cloned token, since at ctor time not all attributes may have been added yet
            if (clonedToken == null)
            {
                clonedToken     = CloneAttributes();
                clonedTermAtt   = clonedToken.GetAttribute <ICharTermAttribute>();
                clonedOffsetAtt = clonedToken.GetAttribute <IOffsetAttribute>();
            }
            else
            {
                this.CopyTo(clonedToken);
            }

            // reinit CharacterIterator
            charIterator.SetText(clonedTermAtt.Buffer(), 0, clonedTermAtt.Length);
            breaker.SetText(new string(charIterator.Text, charIterator.Start, charIterator.Length));
            int end2 = breaker.Next();

            if (end2 != BreakIterator.DONE)
            {
                termAtt.Length = end2;
                if (hasIllegalOffsets)
                {
                    offsetAtt.SetOffset(clonedOffsetAtt.StartOffset(), clonedOffsetAtt.EndOffset());
                }
                else
                {
                    offsetAtt.SetOffset(clonedOffsetAtt.StartOffset(), clonedOffsetAtt.StartOffset() + end2);
                }
                // the position increment is kept as-is for the first token
                return(true);
            }
            return(false);
        }
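Finally, a minimal sketch of how the Thai filter above (which matches Lucene.NET's ThaiWordFilter) might be wired up, again assuming Lucene.NET 4.8-style APIs; the namespace casing (Lucene.Net.Analysis.Th) and the LuceneVersion value are assumptions, and newer code would typically prefer ThaiTokenizer/ThaiAnalyzer over this deprecated filter.

// Minimal usage sketch (assumed Lucene.NET 4.8 namespaces; casing may differ per beta).
using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.Th;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;

public static class ThaiWordFilterDemo
{
    public static void Main()
    {
        TokenStream ts = new StandardTokenizer(LuceneVersion.LUCENE_48, new StringReader("การที่ได้ต้องแสดงว่างานดี"));
        // Re-segments runs of Thai characters using a word BreakIterator, as shown above.
        ts = new ThaiWordFilter(LuceneVersion.LUCENE_48, ts);

        ICharTermAttribute term = ts.AddAttribute<ICharTermAttribute>();
        ts.Reset();
        while (ts.IncrementToken())
        {
            Console.WriteLine(term.ToString());
        }
        ts.End();
        ts.Dispose();
    }
}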