Example no. 1
        // we only check a few core attributes here.
        // TODO: test other things
        public virtual void assertEquals(string s, TokenStream left, TokenStream right)
        {
            left.Reset();
            right.Reset();
            ICharTermAttribute          leftTerm    = left.AddAttribute <ICharTermAttribute>();
            ICharTermAttribute          rightTerm   = right.AddAttribute <ICharTermAttribute>();
            IOffsetAttribute            leftOffset  = left.AddAttribute <IOffsetAttribute>();
            IOffsetAttribute            rightOffset = right.AddAttribute <IOffsetAttribute>();
            IPositionIncrementAttribute leftPos     = left.AddAttribute <IPositionIncrementAttribute>();
            IPositionIncrementAttribute rightPos    = right.AddAttribute <IPositionIncrementAttribute>();

            while (left.IncrementToken())
            {
                assertTrue("wrong number of tokens for input: " + s, right.IncrementToken());
                assertEquals("wrong term text for input: " + s, leftTerm.ToString(), rightTerm.ToString());
                assertEquals("wrong position for input: " + s, leftPos.PositionIncrement, rightPos.PositionIncrement);
                assertEquals("wrong start offset for input: " + s, leftOffset.StartOffset(), rightOffset.StartOffset());
                assertEquals("wrong end offset for input: " + s, leftOffset.EndOffset(), rightOffset.EndOffset());
            }
            assertFalse("wrong number of tokens for input: " + s, right.IncrementToken());
            left.End();
            right.End();
            assertEquals("wrong final offset for input: " + s, leftOffset.EndOffset(), rightOffset.EndOffset());
            left.Dispose();
            right.Dispose();
        }
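A minimal usage sketch (not part of the original example; analyzerA and analyzerB are hypothetical Analyzer instances, and the usual Lucene.Net using directives are assumed):

        // Compare two analyzers on the same input: the helper above asserts that term
        // text, position increments and offsets match token-for-token, then calls
        // End() and Dispose() on both streams.
        string input = "the quick brown fox";
        TokenStream left  = analyzerA.TokenStream("field", new StringReader(input));
        TokenStream right = analyzerB.TokenStream("field", new StringReader(input));
        assertEquals(input, left, right);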
Example no. 2
        public override bool IncrementToken()
        {
            if (endState != null)
            {
                return(false);
            }

            if (!input.IncrementToken())
            {
                return(false);
            }

            int skippedPositions = 0;

            while (true)
            {
                if (stopWords.Contains(termAtt.Buffer(), 0, termAtt.Length))
                {
                    int posInc    = posIncAtt.PositionIncrement;
                    int endOffset = offsetAtt.EndOffset();
                    // This token may be a stopword, but we can't tell until we know whether it's the last token:
                    State sav = CaptureState();
                    if (input.IncrementToken())
                    {
                        // It was a stopword; skip it
                        skippedPositions += posInc;
                    }
                    else
                    {
                        ClearAttributes();
                        input.End();
                        endState = CaptureState();
                        int finalEndOffset = offsetAtt.EndOffset();
                        Debug.Assert(finalEndOffset >= endOffset);
                        if (finalEndOffset > endOffset)
                        {
                            // OK there was a token separator after the
                            // stopword, so it was a stopword
                            return(false);
                        }
                        else
                        {
                            // No token separator after final token that
                            // looked like a stop-word; don't filter it:
                            RestoreState(sav);
                            posIncAtt.PositionIncrement = skippedPositions + posIncAtt.PositionIncrement;
                            keywordAtt.Keyword          = true;
                            return(true);
                        }
                    }
                }
                else
                {
                    // Not a stopword; return the current token:
                    posIncAtt.PositionIncrement = skippedPositions + posIncAtt.PositionIncrement;
                    return(true);
                }
            }
        }
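A hedged behavior sketch for the filter above, assuming its stop set contains "the" (illustrative traces, not from the original example):

        // "the ghost"  -> another token follows "the", so it was a real stopword and is
        //                 skipped; its position increment is carried onto "ghost".
        // "the"        -> no separator after the final token (input.End() reports the same
        //                 end offset), so "the" is restored, marked via keywordAtt, and returned.
        // "the "       -> the final offset moves past the token's end offset, so the
        //                 trailing "the" is treated as a stopword and filtered out.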
Example no. 3
            protected internal override object Highlight(string text, IEnumerable <string> matchedTokens, string prefixToken)
            {
                TokenStream ts = queryAnalyzer.TokenStream("text", new StringReader(text));

                try
                {
                    ICharTermAttribute termAtt   = ts.AddAttribute <ICharTermAttribute>();
                    IOffsetAttribute   offsetAtt = ts.AddAttribute <IOffsetAttribute>();
                    ts.Reset();
                    List <LookupHighlightFragment> fragments = new List <LookupHighlightFragment>();
                    int upto = 0;
                    while (ts.IncrementToken())
                    {
                        string token       = termAtt.ToString();
                        int    startOffset = offsetAtt.StartOffset();
                        int    endOffset   = offsetAtt.EndOffset();
                        if (upto < startOffset)
                        {
                            fragments.Add(new LookupHighlightFragment(text.Substring(upto, startOffset - upto), false));
                            upto = startOffset;
                        }
                        else if (upto > startOffset)
                        {
                            continue;
                        }

                        if (matchedTokens.Contains(token))
                        {
                            // Token matches.
                            fragments.Add(new LookupHighlightFragment(text.Substring(startOffset, endOffset - startOffset), true));
                            upto = endOffset;
                        }
                        else if (prefixToken != null && token.StartsWith(prefixToken, StringComparison.InvariantCulture))
                        {
                            fragments.Add(new LookupHighlightFragment(text.Substring(startOffset, prefixToken.Length), true));
                            if (prefixToken.Length < token.Length)
                            {
                                fragments.Add(new LookupHighlightFragment(text.Substring(startOffset + prefixToken.Length, (startOffset + token.Length) - (startOffset + prefixToken.Length)), false));
                            }
                            upto = endOffset;
                        }
                    }
                    ts.End();
                    int endOffset2 = offsetAtt.EndOffset();
                    if (upto < endOffset2)
                    {
                        fragments.Add(new LookupHighlightFragment(text.Substring(upto), false));
                    }

                    return(fragments);
                }
                finally
                {
                    IOUtils.CloseWhileHandlingException(ts);
                }
            }
Example no. 4
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return(false);
            }

            char[] termBuffer = termAtt.Buffer();
            int    len        = termAtt.Length;

            //TODO: Is this the right behavior or should we return false?  Currently, "  " (all whitespace) returns true, so I think this should
            //also return true
            if (len == 0)
            {
                return(true);
            }
            int start  = 0;
            int end    = 0;
            int endOff = 0;

            // eat the first characters
            for (start = 0; start < len && char.IsWhiteSpace(termBuffer[start]); start++)
            {
            }
            // eat the end characters
            for (end = len; end >= start && char.IsWhiteSpace(termBuffer[end - 1]); end--)
            {
                endOff++;
            }
            if (start > 0 || end < len)
            {
                if (start < end)
                {
                    termAtt.CopyBuffer(termBuffer, start, (end - start));
                }
                else
                {
                    termAtt.SetEmpty();
                }
                if (updateOffsets && len == offsetAtt.EndOffset() - offsetAtt.StartOffset())
                {
                    int newStart = offsetAtt.StartOffset() + start;
                    int newEnd   = offsetAtt.EndOffset() - (start < end ? endOff : 0);
                    offsetAtt.SetOffset(newStart, newEnd);
                }
            }

            return(true);
        }
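An illustrative trace of the trimming above (hypothetical values, not taken from the original example):

        // Incoming term "  ab " (len = 5) with offsets [5, 10]:
        //   start skips the two leading spaces        -> start = 2
        //   end backs over the single trailing space  -> end = 4, endOff = 1
        //   the term becomes "ab"; because len equals the offset span (10 - 5 == 5),
        //   updateOffsets (when enabled) rewrites the offsets to [5 + 2, 10 - 1] = [7, 9].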
Example no. 5
        public virtual void TestSupplementaryCharacters()
        {
            string      s = TestUtil.RandomUnicodeString(Random(), 10);
            int         codePointCount = Character.CodePointCount(s, 0, s.Length);
            int         minGram        = TestUtil.NextInt(Random(), 1, 3);
            int         maxGram        = TestUtil.NextInt(Random(), minGram, 10);
            TokenStream tk             = new KeywordTokenizer(new StringReader(s));

            tk = new NGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
            ICharTermAttribute termAtt   = tk.AddAttribute <ICharTermAttribute>();
            IOffsetAttribute   offsetAtt = tk.AddAttribute <IOffsetAttribute>();

            tk.Reset();
            for (int start = 0; start < codePointCount; ++start)
            {
                for (int end = start + minGram; end <= Math.Min(codePointCount, start + maxGram); ++end)
                {
                    assertTrue(tk.IncrementToken());
                    assertEquals(0, offsetAtt.StartOffset());
                    assertEquals(s.Length, offsetAtt.EndOffset());
                    int startIndex = Character.OffsetByCodePoints(s, 0, start);
                    int endIndex   = Character.OffsetByCodePoints(s, 0, end);
                    assertEquals(s.Substring(startIndex, endIndex - startIndex), termAtt.ToString());
                }
            }
            assertFalse(tk.IncrementToken());
        }
Example no. 6
 public override bool IncrementToken()
 {
     if (inPhrase)
     {
         inPhrase = false;
         termAtt.SetEmpty().Append("phrase2");
         offsetAtt.SetOffset(savedStart, savedEnd);
         return(true);
     }
     else
     {
         while (input.IncrementToken())
         {
             if (termAtt.ToString().Equals("phrase"))
             {
                 inPhrase   = true;
                 savedStart = offsetAtt.StartOffset();
                 savedEnd   = offsetAtt.EndOffset();
                 termAtt.SetEmpty().Append("phrase1");
                 offsetAtt.SetOffset(savedStart, savedEnd);
                 return(true);
             }
             else if (!termAtt.ToString().Equals("stop"))
             {
                 return(true);
             }
         }
     }
     return(false);
 }
Example no. 7
        internal static void TestNGrams(int minGram, int maxGram, string s, string nonTokenChars, bool edgesOnly)
        {
            // convert the string to code points
            int[] codePoints = toCodePoints(s);
            int[] offsets    = new int[codePoints.Length + 1];
            for (int i = 0; i < codePoints.Length; ++i)
            {
                offsets[i + 1] = offsets[i] + Character.CharCount(codePoints[i]);
            }
            TokenStream                 grams     = new NGramTokenizerAnonymousInnerClassHelper(TEST_VERSION_CURRENT, new StringReader(s), minGram, maxGram, edgesOnly, nonTokenChars);
            ICharTermAttribute          termAtt   = grams.AddAttribute <ICharTermAttribute>();
            IPositionIncrementAttribute posIncAtt = grams.AddAttribute <IPositionIncrementAttribute>();
            IPositionLengthAttribute    posLenAtt = grams.AddAttribute <IPositionLengthAttribute>();
            IOffsetAttribute            offsetAtt = grams.AddAttribute <IOffsetAttribute>();

            grams.Reset();
            for (int start = 0; start < codePoints.Length; ++start)
            {
                for (int end = start + minGram; end <= start + maxGram && end <= codePoints.Length; ++end)
                {
                    if (edgesOnly && start > 0 && isTokenChar(nonTokenChars, codePoints[start - 1]))
                    {
                        // not on an edge
                        goto nextGramContinue;
                    }
                    for (int j = start; j < end; ++j)
                    {
                        if (!isTokenChar(nonTokenChars, codePoints[j]))
                        {
                            goto nextGramContinue;
                        }
                    }
                    assertTrue(grams.IncrementToken());
                    assertArrayEquals(Arrays.CopyOfRange(codePoints, start, end), toCodePoints(termAtt.ToString()));
                    assertEquals(1, posIncAtt.PositionIncrement);
                    assertEquals(1, posLenAtt.PositionLength);
                    assertEquals(offsets[start], offsetAtt.StartOffset());
                    assertEquals(offsets[end], offsetAtt.EndOffset());
                    nextGramContinue: ;
                }
                //nextGramBreak:;
            }
            assertFalse(grams.IncrementToken());
            grams.End();
            assertEquals(s.Length, offsetAtt.StartOffset());
            assertEquals(s.Length, offsetAtt.EndOffset());
        }
Example no. 8
        internal void WriteOffsets(int termID, int offsetAccum)
        {
            Debug.Assert(HasOffsets);
            int startOffset = offsetAccum + OffsetAttribute.StartOffset();
            int endOffset   = offsetAccum + OffsetAttribute.EndOffset();
            FreqProxPostingsArray postings = (FreqProxPostingsArray)TermsHashPerField.PostingsArray;

            Debug.Assert(startOffset - postings.LastOffsets[termID] >= 0);
            TermsHashPerField.WriteVInt(1, startOffset - postings.LastOffsets[termID]);
            TermsHashPerField.WriteVInt(1, endOffset - startOffset);

            postings.LastOffsets[termID] = startOffset;
        }
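A small worked trace of the delta encoding above (illustrative numbers, with offsetAccum assumed to be 0):

        // Suppose the same term occurs at offsets [2, 5] and later at [10, 13]:
        //   1st call: writes start delta 2 - 0  = 2 and length 5 - 2   = 3; LastOffsets[termID] becomes 2
        //   2nd call: writes start delta 10 - 2 = 8 and length 13 - 10 = 3; LastOffsets[termID] becomes 10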
Example no. 9
 private Token GetNextPrefixInputToken(Token token)
 {
     if (!prefix.IncrementToken())
     {
         return(null);
     }
     token.CopyBuffer(p_termAtt.Buffer(), 0, p_termAtt.Length);
     token.PositionIncrement = p_posIncrAtt.PositionIncrement;
     token.Flags             = p_flagsAtt.Flags;
     token.SetOffset(p_offsetAtt.StartOffset(), p_offsetAtt.EndOffset());
     token.Type    = p_typeAtt.Type;
     token.Payload = p_payloadAtt.Payload;
     return(token);
 }
Example no. 10
        /// <summary>
        /// refills buffers with new data from the current token.
        /// </summary>
        private void Refill()
        {
            // compact buffers to keep them smallish if they become large
            // just a safety check, but technically we only need the last codepoint
            if (bufferLen > 64)
            {
                int last = bufferLen - 1;
                buffer[0]      = buffer[last];
                startOffset[0] = startOffset[last];
                endOffset[0]   = endOffset[last];
                bufferLen      = 1;
                index         -= last;
            }

            char[] termBuffer = termAtt.Buffer();
            int    len        = termAtt.Length;
            int    start      = offsetAtt.StartOffset();
            int    end        = offsetAtt.EndOffset();

            int newSize = bufferLen + len;

            buffer        = ArrayUtil.Grow(buffer, newSize);
            startOffset   = ArrayUtil.Grow(startOffset, newSize);
            endOffset     = ArrayUtil.Grow(endOffset, newSize);
            lastEndOffset = end;

            if (end - start != len)
            {
                // crazy offsets (modified by synonym or charfilter): just preserve
                for (int i = 0, cp = 0; i < len; i += Character.CharCount(cp))
                {
                    cp = buffer[bufferLen] = Character.CodePointAt(termBuffer, i, len);
                    startOffset[bufferLen] = start;
                    endOffset[bufferLen]   = end;
                    bufferLen++;
                }
            }
            else
            {
                // normal offsets
                for (int i = 0, cp = 0, cpLen = 0; i < len; i += cpLen)
                {
                    cp    = buffer[bufferLen] = Character.CodePointAt(termBuffer, i, len);
                    cpLen = Character.CharCount(cp);
                    startOffset[bufferLen] = start;
                    start = endOffset[bufferLen] = start + cpLen;
                    bufferLen++;
                }
            }
        }
Example no. 11
 private Token GetNextSuffixInputToken(Token token)
 {
     if (!suffix.IncrementToken())
     {
         return(null);
     }
     token.CopyBuffer(termAtt.Buffer(), 0, termAtt.Length);
     token.PositionIncrement = posIncrAtt.PositionIncrement;
     token.Flags             = flagsAtt.Flags;
     token.SetOffset(offsetAtt.StartOffset(), offsetAtt.EndOffset());
     token.Type    = typeAtt.Type;
     token.Payload = payloadAtt.Payload;
     return(token);
 }
Example no. 12
        public virtual void Test()
        {
            string test = "The quick red fox jumped over the lazy brown dogs";

            TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false));
            int count = 0;
            IPayloadAttribute payloadAtt = nptf.GetAttribute <IPayloadAttribute>();
            IOffsetAttribute  offsetAtt  = nptf.GetAttribute <IOffsetAttribute>();

            nptf.Reset();
            while (nptf.IncrementToken())
            {
                BytesRef pay = payloadAtt.Payload;
                assertTrue("pay is null and it shouldn't be", pay != null);
                byte[] data  = pay.Bytes;
                int    start = PayloadHelper.DecodeInt(data, 0);
                assertTrue(start + " does not equal: " + offsetAtt.StartOffset(), start == offsetAtt.StartOffset());
                int end = PayloadHelper.DecodeInt(data, 4);
                assertTrue(end + " does not equal: " + offsetAtt.EndOffset(), end == offsetAtt.EndOffset());
                count++;
            }
            assertTrue(count + " does not equal: " + 10, count == 10);
        }
Example no. 13
 public override sealed bool IncrementToken()
 {
     if (input.IncrementToken())
     {
         byte[] data = new byte[8];
         PayloadHelper.EncodeInt(offsetAtt.StartOffset(), data, 0);
         PayloadHelper.EncodeInt(offsetAtt.EndOffset(), data, 4);
         BytesRef payload = new BytesRef(data);
         payAtt.Payload = payload;
         return(true);
     }
     else
     {
         return(false);
     }
 }
Example no. 14
        /// <summary>
        /// {@inheritDoc}
        /// </summary>
        public override bool IncrementToken()
        {
            while (!exhausted && input.IncrementToken())
            {
                char[] term       = termAttribute.Buffer();
                int    termLength = termAttribute.Length;
                lastEndOffset = offsetAttribute.EndOffset();

                if (termLength > 0 && term[termLength - 1] == '-')
                {
                    // a hyphenated word
                    // capture the state of the first token only
                    if (savedState == null)
                    {
                        savedState = CaptureState();
                    }
                    hyphenated.Append(term, 0, termLength - 1);
                }
                else if (savedState == null)
                {
                    // not part of a hyphenated word.
                    return(true);
                }
                else
                {
                    // the final portion of a hyphenated word
                    hyphenated.Append(term, 0, termLength);
                    Unhyphenate();
                    return(true);
                }
            }

            exhausted = true;

            if (savedState != null)
            {
                // the final term ends with a hyphen
                // add back the hyphen, for backwards compatibility.
                hyphenated.Append('-');
                Unhyphenate();
                return(true);
            }

            return(false);
        }
Example no. 15
        public virtual void TestOffsets()
        {
            TokenStream stream = (new KeywordAnalyzer()).TokenStream("field", new StringReader("abcd"));

            try
            {
                IOffsetAttribute offsetAtt = stream.AddAttribute <IOffsetAttribute>();
                stream.Reset();
                assertTrue(stream.IncrementToken());
                assertEquals(0, offsetAtt.StartOffset());
                assertEquals(4, offsetAtt.EndOffset());
                assertFalse(stream.IncrementToken());
                stream.End();
            }
            finally
            {
                IOUtils.CloseWhileHandlingException(stream);
            }
        }
Example no. 16
        public virtual void TestOtherLetterOffset()
        {
            string           s         = "a天b";
            ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s));

            int correctStartOffset     = 0;
            int correctEndOffset       = 1;
            IOffsetAttribute offsetAtt = tokenizer.GetAttribute <IOffsetAttribute>();

            tokenizer.Reset();
            while (tokenizer.IncrementToken())
            {
                assertEquals(correctStartOffset, offsetAtt.StartOffset());
                assertEquals(correctEndOffset, offsetAtt.EndOffset());
                correctStartOffset++;
                correctEndOffset++;
            }
            tokenizer.End();
            tokenizer.Dispose();
        }
Example no. 17
        public virtual void TestFilterTokens()
        {
            SnowballFilter              filter     = new SnowballFilter(new TestTokenStream(this), "English");
            ICharTermAttribute          termAtt    = filter.GetAttribute <ICharTermAttribute>();
            IOffsetAttribute            offsetAtt  = filter.GetAttribute <IOffsetAttribute>();
            ITypeAttribute              typeAtt    = filter.GetAttribute <ITypeAttribute>();
            IPayloadAttribute           payloadAtt = filter.GetAttribute <IPayloadAttribute>();
            IPositionIncrementAttribute posIncAtt  = filter.GetAttribute <IPositionIncrementAttribute>();
            IFlagsAttribute             flagsAtt   = filter.GetAttribute <IFlagsAttribute>();

            filter.IncrementToken();

            assertEquals("accent", termAtt.ToString());
            assertEquals(2, offsetAtt.StartOffset());
            assertEquals(7, offsetAtt.EndOffset());
            assertEquals("wrd", typeAtt.Type);
            assertEquals(3, posIncAtt.PositionIncrement);
            assertEquals(77, flagsAtt.Flags);
            assertEquals(new BytesRef(new byte[] { 0, 1, 2, 3 }), payloadAtt.Payload);
        }
Example no. 18
 public override sealed bool IncrementToken()
 {
     if (multiToken > 0)
     {
         termAtt.SetEmpty().Append("multi" + (multiToken + 1));
         offsetAtt.SetOffset(prevStartOffset, prevEndOffset);
         typeAtt.Type = prevType;
         posIncrAtt.PositionIncrement = 0;
         multiToken--;
         return(true);
     }
     else
     {
         bool next = input.IncrementToken();
         if (!next)
         {
             return(false);
         }
         prevType        = typeAtt.Type;
         prevStartOffset = offsetAtt.StartOffset();
         prevEndOffset   = offsetAtt.EndOffset();
         string text = termAtt.ToString();
         if (text.Equals("triplemulti"))
         {
             multiToken = 2;
             return(true);
         }
         else if (text.Equals("multi"))
         {
             multiToken = 1;
             return(true);
         }
         else
         {
             return(true);
         }
     }
 }
Example no. 19
        /// <summary>
        /// Constructs a compound token.
        /// </summary>
        private void GramToken()
        {
            buffer.Append(termAttribute.Buffer(), 0, termAttribute.Length);
            int endOffset = offsetAttribute.EndOffset();

            ClearAttributes();

            var length   = buffer.Length;
            var termText = termAttribute.Buffer();

            if (length > termText.Length)
            {
                termText = termAttribute.ResizeBuffer(length);
            }

            buffer.GetChars(0, length, termText, 0);
            termAttribute.Length = length;
            posIncAttribute.PositionIncrement = 0;
            posLenAttribute.PositionLength    = 2; // bigram
            offsetAttribute.SetOffset(lastStartOffset, endOffset);
            typeAttribute.Type = GRAM_TYPE;
            buffer.Length      = 0;
        }
Example no. 20
        public override void ProcessFields(IndexableField[] fields, int count)
        {
            FieldState.Reset();

            bool doInvert = Consumer.Start(fields, count);

            for (int i = 0; i < count; i++)
            {
                IndexableField     field     = fields[i];
                IndexableFieldType fieldType = field.FieldType();

                // TODO FI: this should be "genericized" to querying
                // consumer if it wants to see this particular field
                // tokenized.
                if (fieldType.Indexed && doInvert)
                {
                    bool analyzed = fieldType.Tokenized && DocState.Analyzer != null;

                    // if the field omits norms, the boost cannot be indexed.
                    if (fieldType.OmitNorms && field.GetBoost() != 1.0f)
                    {
                        throw new System.NotSupportedException("You cannot set an index-time boost: norms are omitted for field '" + field.Name() + "'");
                    }

                    // only bother checking offsets if something will consume them.
                    // TODO: after we fix analyzers, also check if termVectorOffsets will be indexed.
                    bool checkOffsets    = fieldType.IndexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
                    int  lastStartOffset = 0;

                    if (i > 0)
                    {
                        FieldState.Position_Renamed += analyzed ? DocState.Analyzer.GetPositionIncrementGap(fieldInfo.Name) : 0;
                    }

                    /*
                     * To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream
                     * when we fail. We expect some caller to eventually deal with the real exception, so we don't want any 'catch' clauses,
                     * but rather a finally that takes note of the problem.
                     */

                    bool succeededInProcessingField = false;

                    TokenStream stream = field.GetTokenStream(DocState.Analyzer);
                    // reset the TokenStream to the first token
                    stream.Reset();

                    try
                    {
                        bool hasMoreTokens = stream.IncrementToken();

                        FieldState.AttributeSource_Renamed = stream;

                        IOffsetAttribute            offsetAttribute  = FieldState.AttributeSource_Renamed.AddAttribute <IOffsetAttribute>();
                        IPositionIncrementAttribute posIncrAttribute = FieldState.AttributeSource_Renamed.AddAttribute <IPositionIncrementAttribute>();

                        if (hasMoreTokens)
                        {
                            Consumer.Start(field);

                            do
                            {
                                // If we hit an exception in stream.next below
                                // (which is fairly common, eg if analyzer
                                // chokes on a given document), then it's
                                // non-aborting and (above) this one document
                                // will be marked as deleted, but still
                                // consume a docID

                                int posIncr = posIncrAttribute.PositionIncrement;
                                if (posIncr < 0)
                                {
                                    throw new System.ArgumentException("position increment must be >=0 (got " + posIncr + ") for field '" + field.Name() + "'");
                                }
                                if (FieldState.Position_Renamed == 0 && posIncr == 0)
                                {
                                    throw new System.ArgumentException("first position increment must be > 0 (got 0) for field '" + field.Name() + "'");
                                }
                                int position = FieldState.Position_Renamed + posIncr;
                                if (position > 0)
                                {
                                    // NOTE: confusing: this "mirrors" the
                                    // position++ we do below
                                    position--;
                                }
                                else if (position < 0)
                                {
                                    throw new System.ArgumentException("position overflow for field '" + field.Name() + "'");
                                }

                                // position is legal, we can safely place it in fieldState now.
                                // not sure if anything will use fieldState after non-aborting exc...
                                FieldState.Position_Renamed = position;

                                if (posIncr == 0)
                                {
                                    FieldState.NumOverlap_Renamed++;
                                }

                                if (checkOffsets)
                                {
                                    int startOffset = FieldState.Offset_Renamed + offsetAttribute.StartOffset();
                                    int endOffset   = FieldState.Offset_Renamed + offsetAttribute.EndOffset();
                                    if (startOffset < 0 || endOffset < startOffset)
                                    {
                                        throw new System.ArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, " + "startOffset=" + startOffset + ",endOffset=" + endOffset + " for field '" + field.Name() + "'");
                                    }
                                    if (startOffset < lastStartOffset)
                                    {
                                        throw new System.ArgumentException("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " for field '" + field.Name() + "'");
                                    }
                                    lastStartOffset = startOffset;
                                }

                                bool success = false;
                                try
                                {
                                    // If we hit an exception in here, we abort
                                    // all buffered documents since the last
                                    // flush, on the likelihood that the
                                    // internal state of the consumer is now
                                    // corrupt and should not be flushed to a
                                    // new segment:
                                    Consumer.Add();
                                    success = true;
                                }
                                finally
                                {
                                    if (!success)
                                    {
                                        DocState.DocWriter.SetAborting();
                                    }
                                }
                                FieldState.Length_Renamed++;
                                FieldState.Position_Renamed++;
                            } while (stream.IncrementToken());
                        }
                        // trigger streams to perform end-of-stream operations
                        stream.End();
                        // TODO: maybe add some safety? then again, its already checked
                        // when we come back around to the field...
                        FieldState.Position_Renamed += posIncrAttribute.PositionIncrement;
                        FieldState.Offset_Renamed   += offsetAttribute.EndOffset();

                        if (DocState.MaxTermPrefix != null)
                        {
                            string msg = "Document contains at least one immense term in field=\"" + fieldInfo.Name + "\" (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped.  Please correct the analyzer to not produce such terms.  The prefix of the first immense term is: '" + DocState.MaxTermPrefix + "...'";
                            if (DocState.InfoStream.IsEnabled("IW"))
                            {
                                DocState.InfoStream.Message("IW", "ERROR: " + msg);
                            }
                            DocState.MaxTermPrefix = null;
                            throw new System.ArgumentException(msg);
                        }

                        /* if success was false above there is an exception coming through and we won't get here.*/
                        succeededInProcessingField = true;
                    }
                    finally
                    {
                        if (!succeededInProcessingField)
                        {
                            IOUtils.CloseWhileHandlingException(stream);
                        }
                        else
                        {
                            stream.Dispose();
                        }
                        if (!succeededInProcessingField && DocState.InfoStream.IsEnabled("DW"))
                        {
                            DocState.InfoStream.Message("DW", "An exception was thrown while processing field " + fieldInfo.Name);
                        }
                    }

                    FieldState.Offset_Renamed += analyzed ? DocState.Analyzer.GetOffsetGap(fieldInfo.Name) : 0;
                    FieldState.Boost_Renamed  *= field.GetBoost();
                }

                // LUCENE-2387: don't hang onto the field, so GC can
                // reclaim
                fields[i] = null;
            }

            Consumer.Finish();
            EndConsumer.Finish();
        }
Example no. 21
        public virtual void ToDot()
        {
            @in.Reset();
            WriteHeader();

            // TODO: is there some way to tell dot that it should
            // make the "main path" a straight line and have the
            // non-sausage arcs not affect node placement...

            int pos        = -1;
            int lastEndPos = -1;

            while (@in.IncrementToken())
            {
                bool isFirst = pos == -1;
                int  posInc  = PosIncAtt.PositionIncrement;
                if (isFirst && posInc == 0)
                {
                    // TODO: hmm are TS's still allowed to do this...?
                    Console.Error.WriteLine("WARNING: first posInc was 0; correcting to 1");
                    posInc = 1;
                }

                if (posInc > 0)
                {
                    // New node:
                    pos += posInc;
                    WriteNode(pos, Convert.ToString(pos));
                }

                if (posInc > 1)
                {
                    // Gap!
                    WriteArc(lastEndPos, pos, null, "dotted");
                }

                if (isFirst)
                {
                    WriteNode(-1, null);
                    WriteArc(-1, pos, null, null);
                }

                string arcLabel = TermAtt.ToString();
                if (OffsetAtt != null)
                {
                    int startOffset = OffsetAtt.StartOffset();
                    int endOffset   = OffsetAtt.EndOffset();
                    //System.out.println("start=" + startOffset + " end=" + endOffset + " len=" + inputText.length());
                    if (InputText != null)
                    {
                        arcLabel += " / " + InputText.Substring(startOffset, endOffset - startOffset);
                    }
                    else
                    {
                        arcLabel += " / " + startOffset + "-" + endOffset;
                    }
                }

                WriteArc(pos, pos + PosLengthAtt.PositionLength, arcLabel, null);
                lastEndPos = pos + PosLengthAtt.PositionLength;
            }

            @in.End();

            if (lastEndPos != -1)
            {
                // TODO: should we output any final text (from end
                // offsets) on this arc...?
                WriteNode(-2, null);
                WriteArc(lastEndPos, -2, null, null);
            }

            WriteTrailer();
        }
Example no. 22
        // For the output string: separate positions with a space,
        // and separate multiple tokens at each position with a
        // /.  If a token should have end offset != the input
        // token's end offset then add :X to it:
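        // (Illustrative example, not from the original tests: an expected string such as
        //  "a b/c:7_2 d" means position 0 holds "a"; position 1 holds "b" and "c", where
        //  "c" has end offset 7 and spans 2 positions; position 2 holds "d".)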

        // TODO: we should probably refactor this guy to use/take analyzer,
        // the tests are a little messy
        private void Verify(string input, string output)
        {
            if (VERBOSE)
            {
                Console.WriteLine("TEST: verify input=" + input + " expectedOutput=" + output);
            }

            tokensIn.Reader = new StringReader(input);
            tokensOut.Reset();
            string[] expected     = output.Split(new string[] { " " }, StringSplitOptions.RemoveEmptyEntries);
            int      expectedUpto = 0;

            while (tokensOut.IncrementToken())
            {
                if (VERBOSE)
                {
                    Console.WriteLine("  incr token=" + termAtt.ToString() + " posIncr=" + posIncrAtt.PositionIncrement + " startOff=" + offsetAtt.StartOffset() + " endOff=" + offsetAtt.EndOffset());
                }

                assertTrue(expectedUpto < expected.Length);
                int startOffset = offsetAtt.StartOffset();
                int endOffset   = offsetAtt.EndOffset();

                string[] expectedAtPos = expected[expectedUpto++].Split(new string[] { "/" }, StringSplitOptions.RemoveEmptyEntries);
                for (int atPos = 0; atPos < expectedAtPos.Length; atPos++)
                {
                    if (atPos > 0)
                    {
                        assertTrue(tokensOut.IncrementToken());
                        if (VERBOSE)
                        {
                            Console.WriteLine("  incr token=" + termAtt.ToString() + " posIncr=" + posIncrAtt.PositionIncrement + " startOff=" + offsetAtt.StartOffset() + " endOff=" + offsetAtt.EndOffset());
                        }
                    }
                    int    colonIndex    = expectedAtPos[atPos].IndexOf(':');
                    int    underbarIndex = expectedAtPos[atPos].IndexOf('_');
                    string expectedToken;
                    int    expectedEndOffset;
                    int    expectedPosLen;
                    if (colonIndex != -1)
                    {
                        expectedToken = expectedAtPos[atPos].Substring(0, colonIndex);
                        if (underbarIndex != -1)
                        {
                            expectedEndOffset = int.Parse(expectedAtPos[atPos].Substring(1 + colonIndex, underbarIndex - (1 + colonIndex)));
                            expectedPosLen    = int.Parse(expectedAtPos[atPos].Substring(1 + underbarIndex));
                        }
                        else
                        {
                            expectedEndOffset = int.Parse(expectedAtPos[atPos].Substring(1 + colonIndex));
                            expectedPosLen    = 1;
                        }
                    }
                    else
                    {
                        expectedToken     = expectedAtPos[atPos];
                        expectedEndOffset = endOffset;
                        expectedPosLen    = 1;
                    }
                    assertEquals(expectedToken, termAtt.ToString());
                    assertEquals(atPos == 0 ? 1 : 0, posIncrAtt.PositionIncrement);
                    // start/end offset of all tokens at same pos should
                    // be the same:
                    assertEquals(startOffset, offsetAtt.StartOffset());
                    assertEquals(expectedEndOffset, offsetAtt.EndOffset());
                    assertEquals(expectedPosLen, posLenAtt.PositionLength);
                }
            }
            tokensOut.End();
            tokensOut.Dispose();
            if (VERBOSE)
            {
                Console.WriteLine("  incr: END");
            }
            assertEquals(expectedUpto, expected.Length);
        }
Example no. 23
 /// <summary>
 /// Returns the next token in the stream, or null at EOS.
 /// </summary>
 public override bool IncrementToken()
 {
     while (true)
     {
         if (curTermBuffer == null)
         {
             if (!input.IncrementToken())
             {
                 return(false);
             }
             else
             {
                 curTermBuffer     = (char[])termAtt.Buffer().Clone();
                 curTermLength     = termAtt.Length;
                 curCodePointCount = charUtils.CodePointCount(termAtt.ToString());
                 curGramSize       = minGram;
                 curPos            = 0;
                 curPosInc         = posIncAtt.PositionIncrement;
                 curPosLen         = posLenAtt.PositionLength;
                 tokStart          = offsetAtt.StartOffset();
                 tokEnd            = offsetAtt.EndOffset();
                 // if length by start + end offsets doesn't match the term text then assume
                 // this is a synonym and don't adjust the offsets.
                 hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
             }
         }
         if (version.OnOrAfter(LuceneVersion.LUCENE_44))
         {
             if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount)
             {
                 ++curPos;
                 curGramSize = minGram;
             }
             if ((curPos + curGramSize) <= curCodePointCount)
             {
                 ClearAttributes();
                 int start = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
                 int end   = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                 termAtt.CopyBuffer(curTermBuffer, start, end - start);
                 posIncAtt.PositionIncrement = curPosInc;
                 curPosInc = 0;
                 posLenAtt.PositionLength = curPosLen;
                 offsetAtt.SetOffset(tokStart, tokEnd);
                 curGramSize++;
                 return(true);
             }
         }
         else
         {
             while (curGramSize <= maxGram)
             {
                 while (curPos + curGramSize <= curTermLength) // while there is input
                 {
                     ClearAttributes();
                     termAtt.CopyBuffer(curTermBuffer, curPos, curGramSize);
                     if (hasIllegalOffsets)
                     {
                         offsetAtt.SetOffset(tokStart, tokEnd);
                     }
                     else
                     {
                         offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize);
                     }
                     curPos++;
                     return(true);
                 }
                 curGramSize++; // increase n-gram size
                 curPos = 0;
             }
         }
         curTermBuffer = null;
     }
 }
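An illustrative run of the LUCENE_44 branch above (assumed parameters, not from the original example):

 // With minGram = 1, maxGram = 2 and input term "abcd", the filter emits the grams
 // "a", "ab", "b", "bc", "c", "cd", "d", each keeping the original token's
 // start/end offsets (tokStart, tokEnd).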
Example no. 24
        /// <summary>
        /// Retrieve suggestions.
        /// </summary>
        public virtual List <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, int num)
        {
            if (contexts != null)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }

            TokenStream ts = queryAnalyzer.TokenStream("", key.ToString());

            try
            {
                ITermToBytesRefAttribute    termBytesAtt = ts.AddAttribute <ITermToBytesRefAttribute>();
                IOffsetAttribute            offsetAtt    = ts.AddAttribute <IOffsetAttribute>();
                IPositionLengthAttribute    posLenAtt    = ts.AddAttribute <IPositionLengthAttribute>();
                IPositionIncrementAttribute posIncAtt    = ts.AddAttribute <IPositionIncrementAttribute>();
                ts.Reset();

                var lastTokens = new BytesRef[grams];
                //System.out.println("lookup: key='" + key + "'");

                // Run full analysis, but save only the
                // last 1gram, last 2gram, etc.:
                BytesRef tokenBytes   = termBytesAtt.BytesRef;
                int      maxEndOffset = -1;
                bool     sawRealToken = false;
                while (ts.IncrementToken())
                {
                    termBytesAtt.FillBytesRef();
                    sawRealToken |= tokenBytes.Length > 0;
                    // TODO: this is somewhat iffy; today, ShingleFilter
                    // sets posLen to the gram count; maybe we should make
                    // a separate dedicated att for this?
                    int gramCount = posLenAtt.PositionLength;

                    Debug.Assert(gramCount <= grams);

                    // Safety: make sure the recalculated count "agrees":
                    if (CountGrams(tokenBytes) != gramCount)
                    {
                        throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes));
                    }
                    maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset());
                    lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
                }
                ts.End();

                if (!sawRealToken)
                {
                    throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
                }

                // Carefully fill last tokens with _ tokens;
                // ShingleFilter apparently won't emit "only hole"
                // tokens:
                int endPosInc = posIncAtt.PositionIncrement;

                // Note this will also be true if input is the empty
                // string (in which case we saw no tokens and
                // maxEndOffset is still -1), which in fact works out OK
                // because we fill the unigram with an empty BytesRef
                // below:
                bool lastTokenEnded = offsetAtt.EndOffset() > maxEndOffset || endPosInc > 0;
                //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.endOffset());

                if (lastTokenEnded)
                {
                    //System.out.println("  lastTokenEnded");
                    // If user hit space after the last token, then
                    // "upgrade" all tokens.  This way "foo " will suggest
                    // all bigrams starting w/ foo, and not any unigrams
                    // starting with "foo":
                    for (int i = grams - 1; i > 0; i--)
                    {
                        BytesRef token = lastTokens[i - 1];
                        if (token == null)
                        {
                            continue;
                        }
                        token.Grow(token.Length + 1);
                        token.Bytes[token.Length] = separator;
                        token.Length++;
                        lastTokens[i] = token;
                    }
                    lastTokens[0] = new BytesRef();
                }

                var arc = new FST.Arc <long?>();

                var bytesReader = fst.BytesReader;

                // Try highest order models first, and if they return
                // results, return that; else, fallback:
                double backoff = 1.0;

                List <LookupResult> results = new List <LookupResult>(num);

                // We only add a given suffix once, from the highest
                // order model that saw it; for subsequent lower order
                // models we skip it:
                var seen = new HashSet <BytesRef>();

                for (int gram = grams - 1; gram >= 0; gram--)
                {
                    BytesRef token = lastTokens[gram];
                    // Don't make unigram predictions from empty string:
                    if (token == null || (token.Length == 0 && key.Length > 0))
                    {
                        // Input didn't have enough tokens:
                        //System.out.println("  gram=" + gram + ": skip: not enough input");
                        continue;
                    }

                    if (endPosInc > 0 && gram <= endPosInc)
                    {
                        // Skip hole-only predictions; in theory we
                        // shouldn't have to do this, but we'd need to fix
                        // ShingleFilter to produce only-hole tokens:
                        //System.out.println("  break: only holes now");
                        break;
                    }

                    //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

                    // TODO: we could add fuzziness here
                    // match the prefix portion exactly
                    //Pair<Long,BytesRef> prefixOutput = null;
                    long? prefixOutput = null;
                    prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
                    //System.out.println("  prefixOutput=" + prefixOutput);

                    if (prefixOutput == null)
                    {
                        // This model never saw this prefix, e.g. the
                        // trigram model never saw context "purple mushroom"
                        backoff *= ALPHA;
                        continue;
                    }

                    // TODO: we could do this division at build time, and
                    // bake it into the FST?

                    // Denominator for computing scores from current
                    // model's predictions:
                    long contextCount = totTokens;

                    BytesRef lastTokenFragment = null;

                    for (int i = token.Length - 1; i >= 0; i--)
                    {
                        if (token.Bytes[token.Offset + i] == separator)
                        {
                            BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                            long?    output  = Lucene.Net.Util.Fst.Util.Get(fst, Lucene.Net.Util.Fst.Util.ToIntsRef(context, new IntsRef()));
                            Debug.Assert(output != null);
                            contextCount      = DecodeWeight(output);
                            lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                            break;
                        }
                    }

                    BytesRef finalLastToken;

                    if (lastTokenFragment == null)
                    {
                        finalLastToken = BytesRef.DeepCopyOf(token);
                    }
                    else
                    {
                        finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
                    }
                    Debug.Assert(finalLastToken.Offset == 0);

                    CharsRef spare = new CharsRef();

                    // complete top-N
                    Util.Fst.Util.TopResults <long?> completions = null;
                    try
                    {
                        // Because we store multiple models in one FST
                        // (1gram, 2gram, 3gram), we must restrict the
                        // search so that it only considers the current
                        // model.  For highest order model, this is not
                        // necessary since all completions in the FST
                        // must be from this model, but for lower order
                        // models we have to filter out the higher order
                        // ones:

                        // Must do num+seen.size() for queue depth because we may
                        // reject up to seen.size() paths in acceptResult():
                        Util.Fst.Util.TopNSearcher <long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparator, seen, finalLastToken);

                        // since this search is initialized with a single start node
                        // it is okay to start with an empty input path here
                        searcher.AddStartPaths(arc, prefixOutput, true, new IntsRef());

                        completions = searcher.Search();
                        Debug.Assert(completions.IsComplete);
                    }
                    catch (IOException bogus)
                    {
                        throw new Exception(bogus.Message, bogus);
                    }

                    int prefixLength = token.Length;

                    BytesRef suffix = new BytesRef(8);
                    //System.out.println("    " + completions.length + " completions");

                    foreach (Util.Fst.Util.Result <long?> completion in completions)
                    {
                        token.Length = prefixLength;
                        // append suffix
                        Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                        token.Append(suffix);

                        //System.out.println("    completion " + token.utf8ToString());

                        // Skip this path if a higher-order model already
                        // saw/predicted its last token:
                        BytesRef lastToken = token;
                        for (int i = token.Length - 1; i >= 0; i--)
                        {
                            if (token.Bytes[token.Offset + i] == separator)
                            {
                                Debug.Assert(token.Length - i - 1 > 0);
                                lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                                break;
                            }
                        }
                        if (seen.Contains(lastToken))
                        {
                            //System.out.println("      skip dup " + lastToken.utf8ToString());
                            goto nextCompletionContinue;
                        }
                        seen.Add(BytesRef.DeepCopyOf(lastToken));
                        spare.Grow(token.Length);
                        UnicodeUtil.UTF8toUTF16(token, spare);
                        LookupResult result = new LookupResult(spare.ToString(),
                                                               // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                                                               // return numbers that are greater than long.MaxValue, which results in a negative long number.
                                                               (long)(long.MaxValue * (decimal)backoff * ((decimal)DecodeWeight(completion.Output)) / contextCount));
                        results.Add(result);
                        Debug.Assert(results.Count == seen.Count);
                        //System.out.println("  add result=" + result);
                        nextCompletionContinue: ;
                    }
                    backoff *= ALPHA;
                }

                results.Sort(new ComparatorAnonymousInnerClassHelper(this));

                if (results.Count > num)
                {
                    results.SubList(num, results.Count).Clear();
                }

                return(results);
            }
            finally
            {
                IOUtils.CloseWhileHandlingException(ts);
            }
        }
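The decimal cast in the LookupResult score above is deliberate: evaluated in double, long.MaxValue * backoff * weight / contextCount can overflow and wrap into a negative long. A minimal sketch of that arithmetic in isolation (the helper name and parameters are illustrative, not part of the original):

        // Illustrative helper (not part of the snippet above): the same backoff-weighted
        // score, isolated. Evaluating in decimal keeps the intermediate product from
        // wrapping into a negative long the way a double-based calculation sometimes can.
        private static long BackoffScore(decimal backoff, decimal decodedWeight, long contextCount)
        {
            return (long)(long.MaxValue * backoff * decodedWeight / contextCount);
        }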
Example No. 25
 public override bool IncrementToken()
 {
     while (true)
     {
         if (curTermBuffer == null)
         {
             if (!input.IncrementToken())
             {
                 return(false);
             }
             else
             {
                 curTermBuffer     = (char[])termAtt.Buffer().Clone();
                 curTermLength     = termAtt.Length;
                 curCodePointCount = charUtils.CodePointCount(termAtt.ToString());
                 curGramSize       = minGram;
                 tokStart          = offsetAtt.StartOffset();
                 tokEnd            = offsetAtt.EndOffset();
                 if (version.OnOrAfter(LuceneVersion.LUCENE_44))
                 {
                     // Never update offsets
                     updateOffsets = false;
                 }
                 else
                 {
                     // if length by start + end offsets doesn't match the term text then assume
                     // this is a synonym and don't adjust the offsets.
                     updateOffsets = (tokStart + curTermLength) == tokEnd;
                 }
                 savePosIncr += posIncrAtt.PositionIncrement;
                 savePosLen   = posLenAtt.PositionLength;
             }
         }
         if (curGramSize <= maxGram)               // still within the configured n-gram size range for this term?
         {
             if (curGramSize <= curCodePointCount) // is the remaining term long enough to produce a gram of this size?
             {
                 // grab gramSize chars from front or back
                 int start = side == Side.FRONT ? 0 : charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize);
                 int end   = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                 ClearAttributes();
                 if (updateOffsets)
                 {
                     offsetAtt.SetOffset(tokStart + start, tokStart + end);
                 }
                 else
                 {
                     offsetAtt.SetOffset(tokStart, tokEnd);
                 }
                 // first ngram gets increment, others don't
                 if (curGramSize == minGram)
                 {
                     posIncrAtt.PositionIncrement = savePosIncr;
                     savePosIncr = 0;
                 }
                 else
                 {
                     posIncrAtt.PositionIncrement = 0;
                 }
                 posLenAtt.PositionLength = savePosLen;
                 termAtt.CopyBuffer(curTermBuffer, start, end - start);
                 curGramSize++;
                 return(true);
             }
         }
         curTermBuffer = null;
     }
 }
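For intuition about the loop above, here is a stand-alone sketch (not Lucene code) of what Side.FRONT produces for a single term; the minGram/maxGram values in the comment are illustrative, and unlike the filter this version counts chars rather than code points:

 // Requires System.Collections.Generic. Emits the leading substrings of increasing
 // length, in the same order the filter produces for Side.FRONT, e.g.
 // ("sample", minGram: 1, maxGram: 3) => "s", "sa", "sam".
 static IEnumerable<string> FrontEdgeNGrams(string term, int minGram, int maxGram)
 {
     for (int size = minGram; size <= maxGram && size <= term.Length; size++)
     {
         yield return term.Substring(0, size);
     }
 }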
Example No. 26
        private void Parse()
        {
            //System.out.println("\nS: parse");

            Debug.Assert(inputSkipCount == 0);

            int curNextRead = nextRead;

            // Holds the longest match we've seen so far:
            BytesRef matchOutput      = null;
            int      matchInputLength = 0;
            int      matchEndOffset   = -1;

            BytesRef pendingOutput = fst.Outputs.NoOutput;

            fst.GetFirstArc(scratchArc);

            Debug.Assert(scratchArc.Output == fst.Outputs.NoOutput);

            int tokenCount = 0;

            while (true)
            {
                // Pull next token's chars:
                char[] buffer;
                int    bufferLen;
                //System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);

                int inputEndOffset = 0;

                if (curNextRead == nextWrite)
                {
                    // We used up our lookahead buffer of input tokens
                    // -- pull next real input token:

                    if (finished)
                    {
                        break;
                    }
                    else
                    {
                        //System.out.println("  input.incrToken");
                        Debug.Assert(futureInputs[nextWrite].consumed);
                        // Not correct: a syn match whose output is longer
                        // than its input can set future inputs keepOrig
                        // to true:
                        //assert !futureInputs[nextWrite].keepOrig;
                        if (input.IncrementToken())
                        {
                            buffer    = termAtt.Buffer();
                            bufferLen = termAtt.Length;
                            PendingInput pendingInput = futureInputs[nextWrite];
                            lastStartOffset = pendingInput.startOffset = offsetAtt.StartOffset();
                            lastEndOffset   = pendingInput.endOffset = offsetAtt.EndOffset();
                            inputEndOffset  = pendingInput.endOffset;
                            //System.out.println("  new token=" + new String(buffer, 0, bufferLen));
                            if (nextRead != nextWrite)
                            {
                                Capture();
                            }
                            else
                            {
                                pendingInput.consumed = false;
                            }
                        }
                        else
                        {
                            // No more input tokens
                            //System.out.println("      set end");
                            finished = true;
                            break;
                        }
                    }
                }
                else
                {
                    // Still in our lookahead
                    buffer         = futureInputs[curNextRead].term.Chars;
                    bufferLen      = futureInputs[curNextRead].term.Length;
                    inputEndOffset = futureInputs[curNextRead].endOffset;
                    //System.out.println("  old token=" + new String(buffer, 0, bufferLen));
                }

                tokenCount++;

                // Run each char in this token through the FST:
                int bufUpto = 0;
                while (bufUpto < bufferLen)
                {
                    int codePoint = Character.CodePointAt(buffer, bufUpto, bufferLen);
                    if (fst.FindTargetArc(ignoreCase ? Character.ToLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null)
                    {
                        //System.out.println("    stop");
                        goto byTokenBreak;
                    }

                    // Accum the output
                    pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output);
                    //System.out.println("    char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
                    bufUpto += Character.CharCount(codePoint);
                }

                // OK, entire token matched; now see if this is a final
                // state:
                if (scratchArc.Final)
                {
                    matchOutput      = fst.Outputs.Add(pendingOutput, scratchArc.NextFinalOutput);
                    matchInputLength = tokenCount;
                    matchEndOffset   = inputEndOffset;
                    //System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
                }

                // See if the FST wants to continue matching (ie, needs to
                // see the next input token):
                if (fst.FindTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null)
                {
                    // No further rules can match here; we're done
                    // searching for matching rules starting at the
                    // current input position.
                    break;
                }
                else
                {
                    // More matching is possible -- accum the output (if
                    // any) of the WORD_SEP arc:
                    pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output);
                    if (nextRead == nextWrite)
                    {
                        Capture();
                    }
                }

                curNextRead = RollIncr(curNextRead);
            }
byTokenBreak:

            if (nextRead == nextWrite && !finished)
            {
                //System.out.println("  skip write slot=" + nextWrite);
                nextWrite = RollIncr(nextWrite);
            }

            if (matchOutput != null)
            {
                //System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
                inputSkipCount = matchInputLength;
                AddOutput(matchOutput, matchInputLength, matchEndOffset);
            }
            else if (nextRead != nextWrite)
            {
                // Even though we had no match here, we set to 1
                // because we need to skip current input token before
                // trying to match again:
                inputSkipCount = 1;
            }
            else
            {
                Debug.Assert(finished);
            }

            //System.out.println("  parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
        }
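Parse() walks futureInputs as a circular lookahead buffer, advancing nextRead and nextWrite with a wrap-around increment. One plausible shape for RollIncr, hedged as an assumption since the helper itself is not shown in this snippet:

        // Plausible sketch of the wrap-around increment used for nextRead/nextWrite above;
        // an assumption for illustration, the real helper may differ in detail.
        private int RollIncr(int count)
        {
            count++;
            if (count == futureInputs.Length) // futureInputs is the circular lookahead buffer
            {
                count = 0;
            }
            return count;
        }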
Example No. 27
        public static void VerifyEquals(Fields d1, Fields d2)
        {
            if (d1 == null)
            {
                Assert.IsTrue(d2 == null || d2.Size == 0);
                return;
            }
            Assert.IsTrue(d2 != null);

            IEnumerator <string> fieldsEnum2 = d2.GetEnumerator();

            foreach (string field1 in d1)
            {
                fieldsEnum2.MoveNext();
                string field2 = fieldsEnum2.Current;
                Assert.AreEqual(field1, field2);

                Terms terms1 = d1.Terms(field1);
                Assert.IsNotNull(terms1);
                TermsEnum termsEnum1 = terms1.Iterator(null);

                Terms terms2 = d2.Terms(field2);
                Assert.IsNotNull(terms2);
                TermsEnum termsEnum2 = terms2.Iterator(null);

                DocsAndPositionsEnum dpEnum1 = null;
                DocsAndPositionsEnum dpEnum2 = null;
                DocsEnum             dEnum1  = null;
                DocsEnum             dEnum2  = null;

                BytesRef term1;
                while ((term1 = termsEnum1.Next()) != null)
                {
                    BytesRef term2 = termsEnum2.Next();
                    Assert.AreEqual(term1, term2);
                    Assert.AreEqual(termsEnum1.TotalTermFreq(), termsEnum2.TotalTermFreq());

                    dpEnum1 = termsEnum1.DocsAndPositions(null, dpEnum1);
                    dpEnum2 = termsEnum2.DocsAndPositions(null, dpEnum2);
                    if (dpEnum1 != null)
                    {
                        Assert.IsNotNull(dpEnum2);
                        int docID1 = dpEnum1.NextDoc();
                        dpEnum2.NextDoc();
                        // docIDs are not supposed to be equal
                        //int docID2 = dpEnum2.NextDoc();
                        //Assert.AreEqual(docID1, docID2);
                        Assert.IsTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);

                        int freq1 = dpEnum1.Freq();
                        int freq2 = dpEnum2.Freq();
                        Assert.AreEqual(freq1, freq2);
                        IOffsetAttribute offsetAtt1 = dpEnum1.Attributes().HasAttribute <IOffsetAttribute>() ? dpEnum1.Attributes().GetAttribute <IOffsetAttribute>() : null;
                        IOffsetAttribute offsetAtt2 = dpEnum2.Attributes().HasAttribute <IOffsetAttribute>() ? dpEnum2.Attributes().GetAttribute <IOffsetAttribute>() : null;

                        if (offsetAtt1 != null)
                        {
                            Assert.IsNotNull(offsetAtt2);
                        }
                        else
                        {
                            Assert.IsNull(offsetAtt2);
                        }

                        for (int posUpto = 0; posUpto < freq1; posUpto++)
                        {
                            int pos1 = dpEnum1.NextPosition();
                            int pos2 = dpEnum2.NextPosition();
                            Assert.AreEqual(pos1, pos2);
                            if (offsetAtt1 != null)
                            {
                                Assert.AreEqual(offsetAtt1.StartOffset(), offsetAtt2.StartOffset());
                                Assert.AreEqual(offsetAtt1.EndOffset(), offsetAtt2.EndOffset());
                            }
                        }
                        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum1.NextDoc());
                        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum2.NextDoc());
                    }
                    else
                    {
                        dEnum1 = TestUtil.Docs(Random(), termsEnum1, null, dEnum1, DocsEnum.FLAG_FREQS);
                        dEnum2 = TestUtil.Docs(Random(), termsEnum2, null, dEnum2, DocsEnum.FLAG_FREQS);
                        Assert.IsNotNull(dEnum1);
                        Assert.IsNotNull(dEnum2);
                        int docID1 = dEnum1.NextDoc();
                        dEnum2.NextDoc();
                        // docIDs are not supposed to be equal
                        //int docID2 = dEnum2.NextDoc();
                        //Assert.AreEqual(docID1, docID2);
                        Assert.IsTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);
                        int freq1 = dEnum1.Freq();
                        int freq2 = dEnum2.Freq();
                        Assert.AreEqual(freq1, freq2);
                        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dEnum1.NextDoc());
                        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dEnum2.NextDoc());
                    }
                }

                Assert.IsNull(termsEnum2.Next());
            }
            Assert.IsFalse(fieldsEnum2.MoveNext());
        }
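VerifyEquals drives the second Fields enumerator by hand so field names are compared in lock-step. A minimal sketch of the same pattern on ordinary sequences (names are illustrative only):

        // Walk two sequences in lock-step and require them to match element-for-element
        // and to end together, as VerifyEquals does for the field names above.
        public static void AssertSameSequence(IEnumerable<string> expected, IEnumerable<string> actual)
        {
            using (IEnumerator<string> e2 = actual.GetEnumerator())
            {
                foreach (string item in expected)
                {
                    Assert.IsTrue(e2.MoveNext());      // second sequence must not end early
                    Assert.AreEqual(item, e2.Current); // elements must match pairwise
                }
                Assert.IsFalse(e2.MoveNext());         // and must not have extras left over
            }
        }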
Example No. 28
        /// <summary>
        /// Iterates over the given token stream and adds the resulting terms to the index;
        /// Equivalent to adding a tokenized, indexed, termVectorStored, unstored,
        /// Lucene <see cref="Documents.Field"/>.
        /// Finally closes the token stream. Note that untokenized keywords can be added with this method via
        /// <see cref="KeywordTokenStream{T}(ICollection{T})"/>, the Lucene <c>KeywordTokenizer</c> or similar utilities.
        ///
        /// </summary>
        /// <param name="fieldName"> a name to be associated with the text </param>
        /// <param name="stream"> the token stream to retrieve tokens from. </param>
        /// <param name="boost"> the boost factor for hits for this field </param>
        /// <param name="positionIncrementGap"> the position increment gap if fields with the same name are added more than once </param>
        /// <param name="offsetGap"> the offset gap if fields with the same name are added more than once </param>
        /// <seealso cref="Documents.Field.Boost(float)"/>
        public virtual void AddField(string fieldName, TokenStream stream, float boost, int positionIncrementGap, int offsetGap)
        {
            try
            {
                if (fieldName == null)
                {
                    throw new System.ArgumentException("fieldName must not be null");
                }
                if (stream == null)
                {
                    throw new System.ArgumentException("token stream must not be null");
                }
                if (boost <= 0.0f)
                {
                    throw new System.ArgumentException("boost factor must be greater than 0.0");
                }
                int                 numTokens        = 0;
                int                 numOverlapTokens = 0;
                int                 pos = -1;
                BytesRefHash        terms;
                SliceByteStartArray sliceArray;
                Info                info             = null;
                long                sumTotalTermFreq = 0;
                int                 offset           = 0;
                if (fields.TryGetValue(fieldName, out info))
                {
                    numTokens        = info.numTokens;
                    numOverlapTokens = info.numOverlapTokens;
                    pos              = info.lastPosition + positionIncrementGap;
                    offset           = info.lastOffset + offsetGap;
                    terms            = info.terms;
                    boost           *= info.boost;
                    sliceArray       = info.sliceArray;
                    sumTotalTermFreq = info.sumTotalTermFreq;
                }
                else
                {
                    sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY);
                    terms      = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray);
                }

                if (!fieldInfos.ContainsKey(fieldName))
                {
                    fieldInfos[fieldName] = new FieldInfo(fieldName, true, fieldInfos.Count, false, false, false, this.storeOffsets ? FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, null, null, null);
                }
                ITermToBytesRefAttribute    termAtt          = stream.GetAttribute <ITermToBytesRefAttribute>();
                IPositionIncrementAttribute posIncrAttribute = stream.AddAttribute <IPositionIncrementAttribute>();
                IOffsetAttribute            offsetAtt        = stream.AddAttribute <IOffsetAttribute>();
                BytesRef @ref = termAtt.BytesRef;
                stream.Reset();

                while (stream.IncrementToken())
                {
                    termAtt.FillBytesRef();
                    //        if (DEBUG) System.err.println("token='" + term + "'");
                    numTokens++;
                    int posIncr = posIncrAttribute.PositionIncrement;
                    if (posIncr == 0)
                    {
                        numOverlapTokens++;
                    }
                    pos += posIncr;
                    int ord = terms.Add(@ref);
                    if (ord < 0)
                    {
                        ord = (-ord) - 1;
                        postingsWriter.Reset(sliceArray.end[ord]);
                    }
                    else
                    {
                        sliceArray.start[ord] = postingsWriter.StartNewSlice();
                    }
                    sliceArray.freq[ord]++;
                    sumTotalTermFreq++;
                    if (!storeOffsets)
                    {
                        postingsWriter.WriteInt(pos);
                    }
                    else
                    {
                        postingsWriter.WriteInt(pos);
                        postingsWriter.WriteInt(offsetAtt.StartOffset() + offset);
                        postingsWriter.WriteInt(offsetAtt.EndOffset() + offset);
                    }
                    sliceArray.end[ord] = postingsWriter.CurrentOffset;
                }
                stream.End();

                // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
                if (numTokens > 0)
                {
                    fields[fieldName] = new Info(terms, sliceArray, numTokens, numOverlapTokens, boost, pos, offsetAtt.EndOffset() + offset, sumTotalTermFreq);
                    sortedFields      = null; // invalidate sorted view, if any
                }
            }
            catch (Exception /*e*/) // can never happen
            {
                throw;
            }
            finally
            {
                try
                {
                    if (stream != null)
                    {
                        stream.Dispose();
                    }
                }
                catch (IOException /*e2*/)
                {
                    throw;
                }
            }
        }
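The ord < 0 branch in AddField depends on the hash's return convention: Add() yields a non-negative ordinal for a first-seen term and a negative value, -(existingOrd + 1), when the term is already present. Isolated, with illustrative variable names:

        int ord = terms.Add(termBytes);
        bool seenBefore = ord < 0;    // negative return => term already in the hash
        if (seenBefore)
        {
            ord = (-ord) - 1;         // recover the existing ordinal, as the snippet above does
        }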
Example No. 29
        /*
         * Need to worry about multiple scenarios:
         *  - need to go for the longest match
         *    a b => foo      #shouldn't match if "a b" is followed by "c d"
         *    a b c d => bar
         *  - need to backtrack - retry matches for tokens already read
         *     a b c d => foo
         *       b c => bar
         *     If the input stream is "a b c x", one will consume "a b c d"
         *     trying to match the first rule... all but "a" should be
         *     pushed back so a match may be made on "b c".
         *  - don't try and match generated tokens (thus need separate queue)
         *    matching is not recursive.
         *  - handle optional generation of original tokens in all these cases,
         *    merging token streams to preserve token positions.
         *  - preserve original positionIncrement of first matched token
         */
        public override bool IncrementToken()
        {
            while (true)
            {
                // if there are any generated tokens, return them... don't try any
                // matches against them, as we specifically don't want recursion.
                if (replacement != null && replacement.MoveNext())
                {
                    Copy(this, replacement.Current);
                    return(true);
                }

                // common case fast-path of first token not matching anything
                AttributeSource firstTok = NextTok();
                if (firstTok == null)
                {
                    return(false);
                }
                var            termAtt = firstTok.AddAttribute <ICharTermAttribute>();
                SlowSynonymMap result  = map.submap != null ? map.submap.Get(termAtt.Buffer(), 0, termAtt.Length) : null;

                if (result == null)
                {
                    Copy(this, firstTok);
                    return(true);
                }

                // fast-path failed, clone ourselves if needed
                if (firstTok == this)
                {
                    firstTok = CloneAttributes();
                }
                // OK, we matched a token, so find the longest match.

                matched = new LinkedList <AttributeSource>();

                result = Match(result);

                if (result == null)
                {
                    // no match, simply return the first token read.
                    Copy(this, firstTok);
                    return(true);
                }

                // reuse, or create new one each time?
                List <AttributeSource> generated = new List <AttributeSource>(result.synonyms.Length + matched.Count + 1);

                //
                // there was a match... let's generate the new tokens, merging
                // in the matched tokens (position increments need adjusting)
                //
                AttributeSource lastTok     = matched.Count == 0 ? firstTok : matched.Last.Value;
                bool            includeOrig = result.IncludeOrig;

                AttributeSource             origTok        = includeOrig ? firstTok : null;
                IPositionIncrementAttribute firstPosIncAtt = firstTok.AddAttribute <IPositionIncrementAttribute>();
                int origPos = firstPosIncAtt.PositionIncrement; // position of origTok in the original stream
                int repPos  = 0;                                // curr position in replacement token stream
                int pos     = 0;                                // current position in merged token stream

                for (int i = 0; i < result.synonyms.Length; i++)
                {
                    Token                       repTok       = result.synonyms[i];
                    AttributeSource             newTok       = firstTok.CloneAttributes();
                    ICharTermAttribute          newTermAtt   = newTok.AddAttribute <ICharTermAttribute>();
                    IOffsetAttribute            newOffsetAtt = newTok.AddAttribute <IOffsetAttribute>();
                    IPositionIncrementAttribute newPosIncAtt = newTok.AddAttribute <IPositionIncrementAttribute>();

                    IOffsetAttribute lastOffsetAtt = lastTok.AddAttribute <IOffsetAttribute>();

                    newOffsetAtt.SetOffset(newOffsetAtt.StartOffset(), lastOffsetAtt.EndOffset());
                    newTermAtt.CopyBuffer(repTok.Buffer(), 0, repTok.Length);
                    repPos += repTok.PositionIncrement;
                    if (i == 0) // make position of first token equal to original
                    {
                        repPos = origPos;
                    }

                    // if necessary, insert original tokens and adjust position increment
                    while (origTok != null && origPos <= repPos)
                    {
                        IPositionIncrementAttribute origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>();
                        origPosInc.PositionIncrement = origPos - pos;
                        generated.Add(origTok);
                        pos += origPosInc.PositionIncrement;
                        //origTok = matched.Count == 0 ? null : matched.RemoveFirst();
                        if (matched.Count == 0)
                        {
                            origTok = null;
                        }
                        else
                        {
                            origTok = matched.First.Value;
                            matched.RemoveFirst();
                        }
                        if (origTok != null)
                        {
                            origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>();
                            origPos   += origPosInc.PositionIncrement;
                        }
                    }

                    newPosIncAtt.PositionIncrement = repPos - pos;
                    generated.Add(newTok);
                    pos += newPosIncAtt.PositionIncrement;
                }

                // finish up any leftover original tokens
                while (origTok != null)
                {
                    IPositionIncrementAttribute origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>();
                    origPosInc.PositionIncrement = origPos - pos;
                    generated.Add(origTok);
                    pos += origPosInc.PositionIncrement;
                    //origTok = matched.Count == 0 ? null : matched.RemoveFirst();
                    if (matched.Count == 0)
                    {
                        origTok = null;
                    }
                    else
                    {
                        origTok = matched.First.Value;
                        matched.RemoveFirst();
                    }
                    if (origTok != null)
                    {
                        origPosInc = origTok.AddAttribute <IPositionIncrementAttribute>();
                        origPos   += origPosInc.PositionIncrement;
                    }
                }

                // what if we replaced a longer sequence with a shorter one?
                // a/0 b/5 =>  foo/0
                // should I re-create the gap on the next buffered token?

                replacement = generated.GetEnumerator();
                // Now return to the top of the loop to read and return the first
                // generated token.. The reason this is done is that we may have generated
                // nothing at all, and may need to continue with more matching logic.
            }
        }
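Copy(this, tok) above replaces the filter's current attribute values with those of a buffered or generated token. A plausible sketch, assuming AttributeSource.CopyTo is available (the helper itself is not shown in this snippet):

            // One plausible implementation of the Copy helper used above (an assumption;
            // the real method may differ): overwrite the target's attribute state with
            // the source's so downstream consumers see the buffered/generated token.
            private void Copy(AttributeSource target, AttributeSource source)
            {
                source.CopyTo(target);
            }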
Example No. 30
        // offsetsAreCorrect also validates:
        //   - graph offsets are correct (all tokens leaving from
        //     pos X have the same startOffset; all tokens
        //     arriving to pos Y have the same endOffset)
        //   - offsets only move forwards (startOffset >=
        //     lastStartOffset)
        public static void AssertTokenStreamContents(TokenStream ts, string[] output, int[] startOffsets, int[] endOffsets, string[] types, int[] posIncrements, int[] posLengths, int? finalOffset, int? finalPosInc, bool[] keywordAtts, bool offsetsAreCorrect)
        {
            Assert.IsNotNull(output);
            var checkClearAtt = ts.AddAttribute <ICheckClearAttributesAttribute>();

            ICharTermAttribute termAtt = null;

            if (output.Length > 0)
            {
                Assert.IsTrue(ts.HasAttribute <ICharTermAttribute>(), "has no CharTermAttribute");
                termAtt = ts.GetAttribute <ICharTermAttribute>();
            }

            IOffsetAttribute offsetAtt = null;

            if (startOffsets != null || endOffsets != null || finalOffset != null)
            {
                Assert.IsTrue(ts.HasAttribute <IOffsetAttribute>(), "has no OffsetAttribute");
                offsetAtt = ts.GetAttribute <IOffsetAttribute>();
            }

            ITypeAttribute typeAtt = null;

            if (types != null)
            {
                Assert.IsTrue(ts.HasAttribute <ITypeAttribute>(), "has no TypeAttribute");
                typeAtt = ts.GetAttribute <ITypeAttribute>();
            }

            IPositionIncrementAttribute posIncrAtt = null;

            if (posIncrements != null || finalPosInc != null)
            {
                Assert.IsTrue(ts.HasAttribute <IPositionIncrementAttribute>(), "has no PositionIncrementAttribute");
                posIncrAtt = ts.GetAttribute <IPositionIncrementAttribute>();
            }

            IPositionLengthAttribute posLengthAtt = null;

            if (posLengths != null)
            {
                Assert.IsTrue(ts.HasAttribute <IPositionLengthAttribute>(), "has no PositionLengthAttribute");
                posLengthAtt = ts.GetAttribute <IPositionLengthAttribute>();
            }

            IKeywordAttribute keywordAtt = null;

            if (keywordAtts != null)
            {
                Assert.IsTrue(ts.HasAttribute <IKeywordAttribute>(), "has no KeywordAttribute");
                keywordAtt = ts.GetAttribute <IKeywordAttribute>();
            }

            // Maps position to the start/end offset:
            IDictionary <int?, int?> posToStartOffset = new Dictionary <int?, int?>();
            IDictionary <int?, int?> posToEndOffset   = new Dictionary <int?, int?>();

            ts.Reset();
            int pos             = -1;
            int lastStartOffset = 0;

            for (int i = 0; i < output.Length; i++)
            {
                // extra safety to enforce, that the state is not preserved and also assign bogus values
                ts.ClearAttributes();
                termAtt.SetEmpty().Append("bogusTerm");
                if (offsetAtt != null)
                {
                    offsetAtt.SetOffset(14584724, 24683243);
                }
                if (typeAtt != null)
                {
                    typeAtt.Type = "bogusType";
                }
                if (posIncrAtt != null)
                {
                    posIncrAtt.PositionIncrement = 45987657;
                }
                if (posLengthAtt != null)
                {
                    posLengthAtt.PositionLength = 45987653;
                }
                if (keywordAtt != null)
                {
                    keywordAtt.Keyword = (i & 1) == 0;
                }

                bool reset = checkClearAtt.AndResetClearCalled; // reset it, because we called clearAttribute() before
                Assert.IsTrue(ts.IncrementToken(), "token " + i + " does not exist");
                Assert.IsTrue(reset, "ClearAttributes() was not called correctly in TokenStream chain");

                Assert.AreEqual(output[i], termAtt.ToString(), "term " + i + ", output[i] = " + output[i] + ", termAtt = " + termAtt.ToString());
                if (startOffsets != null)
                {
                    Assert.AreEqual(startOffsets[i], offsetAtt.StartOffset(), "startOffset " + i);
                }
                if (endOffsets != null)
                {
                    Assert.AreEqual(endOffsets[i], offsetAtt.EndOffset(), "endOffset " + i);
                }
                if (types != null)
                {
                    Assert.AreEqual(types[i], typeAtt.Type, "type " + i);
                }
                if (posIncrements != null)
                {
                    Assert.AreEqual(posIncrements[i], posIncrAtt.PositionIncrement, "posIncrement " + i);
                }
                if (posLengths != null)
                {
                    Assert.AreEqual(posLengths[i], posLengthAtt.PositionLength, "posLength " + i);
                }
                if (keywordAtts != null)
                {
                    Assert.AreEqual(keywordAtts[i], keywordAtt.Keyword, "keywordAtt " + i);
                }

                // we can enforce some basic things about a few attributes even if the caller doesn't check:
                if (offsetAtt != null)
                {
                    int startOffset = offsetAtt.StartOffset();
                    int endOffset   = offsetAtt.EndOffset();
                    if (finalOffset != null)
                    {
                        Assert.IsTrue(startOffset <= (int)finalOffset, "startOffset must be <= finalOffset");
                        Assert.IsTrue(endOffset <= (int)finalOffset, "endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset=" + (int)finalOffset);
                    }

                    if (offsetsAreCorrect)
                    {
                        Assert.IsTrue(offsetAtt.StartOffset() >= lastStartOffset, "offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset);
                        lastStartOffset = offsetAtt.StartOffset();
                    }

                    if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null)
                    {
                        // Validate offset consistency in the graph, ie
                        // all tokens leaving from a certain pos have the
                        // same startOffset, and all tokens arriving to a
                        // certain pos have the same endOffset:
                        int posInc = posIncrAtt.PositionIncrement;
                        pos += posInc;

                        int posLength = posLengthAtt.PositionLength;

                        if (!posToStartOffset.ContainsKey(pos))
                        {
                            // First time we've seen a token leaving from this position:
                            posToStartOffset[pos] = startOffset;
                            //System.out.println("  + s " + pos + " -> " + startOffset);
                        }
                        else
                        {
                            // We've seen a token leaving from this position
                            // before; verify the startOffset is the same:
                            //System.out.println("  + vs " + pos + " -> " + startOffset);
                            Assert.AreEqual((int)posToStartOffset[pos], startOffset, "pos=" + pos + " posLen=" + posLength + " token=" + termAtt);
                        }

                        int endPos = pos + posLength;

                        if (!posToEndOffset.ContainsKey(endPos))
                        {
                            // First time we've seen a token arriving to this position:
                            posToEndOffset[endPos] = endOffset;
                            //System.out.println("  + e " + endPos + " -> " + endOffset);
                        }
                        else
                        {
                            // We've seen a token arriving to this position
                            // before; verify the endOffset is the same:
                            //System.out.println("  + ve " + endPos + " -> " + endOffset);
                            Assert.AreEqual((int)posToEndOffset[endPos], endOffset, "pos=" + pos + " posLen=" + posLength + " token=" + termAtt);
                        }
                    }
                }
                if (posIncrAtt != null)
                {
                    if (i == 0)
                    {
                        Assert.IsTrue(posIncrAtt.PositionIncrement >= 1, "first posIncrement must be >= 1");
                    }
                    else
                    {
                        Assert.IsTrue(posIncrAtt.PositionIncrement >= 0, "posIncrement must be >= 0");
                    }
                }
                if (posLengthAtt != null)
                {
                    Assert.IsTrue(posLengthAtt.PositionLength >= 1, "posLength must be >= 1");
                }
            }

            if (ts.IncrementToken())
            {
                Assert.Fail("TokenStream has more tokens than expected (expected count=" + output.Length + "); extra token=" + termAtt);
            }

            // repeat our extra safety checks for End()
            ts.ClearAttributes();
            if (termAtt != null)
            {
                termAtt.SetEmpty().Append("bogusTerm");
            }
            if (offsetAtt != null)
            {
                offsetAtt.SetOffset(14584724, 24683243);
            }
            if (typeAtt != null)
            {
                typeAtt.Type = "bogusType";
            }
            if (posIncrAtt != null)
            {
                posIncrAtt.PositionIncrement = 45987657;
            }
            if (posLengthAtt != null)
            {
                posLengthAtt.PositionLength = 45987653;
            }

            var reset_ = checkClearAtt.AndResetClearCalled; // reset it, because we called clearAttribute() before

            ts.End();
            Assert.IsTrue(checkClearAtt.AndResetClearCalled, "super.End()/ClearAttributes() was not called correctly in End()");

            if (finalOffset != null)
            {
                Assert.AreEqual((int)finalOffset, offsetAtt.EndOffset(), "finalOffset");
            }
            if (offsetAtt != null)
            {
                Assert.IsTrue(offsetAtt.EndOffset() >= 0, "finalOffset must be >= 0");
            }
            if (finalPosInc != null)
            {
                Assert.AreEqual((int)finalPosInc, posIncrAtt.PositionIncrement, "finalPosInc");
            }

            ts.Dispose();
        }
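A hypothetical invocation of the overload above, for a whitespace-style stream over the text "fast car"; the ts variable and all expected values are illustrative, and null arguments skip the corresponding checks:

        AssertTokenStreamContents(ts,
                                  new[] { "fast", "car" }, // output terms
                                  new[] { 0, 5 },          // startOffsets
                                  new[] { 4, 8 },          // endOffsets
                                  null,                    // types: not checked
                                  new[] { 1, 1 },          // posIncrements
                                  null,                    // posLengths: not checked
                                  8,                       // finalOffset expected after End()
                                  null,                    // finalPosInc: not checked
                                  null,                    // keywordAtts: not checked
                                  true);                   // offsetsAreCorrect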