예제 #1
0
        /// <summary>
        /// Checks the offsets a <c>WhitespaceTokenizer</c> reports when it reads through an
        /// <c>HTMLStripCharFilter</c>. The expected start offsets (0, 20, 33, 39) are the
        /// positions of the four words inside the raw input literal, markup included.
        /// </summary>
        public void TestOffsetsWithTokenizer()
        {
            const string input = @"test1 <a href=""foo"">testlink</a> test2 test3";

            Tokenizer       t   = new WhitespaceTokenizer(new HTMLStripCharFilter(CharReader.Get(new StringReader(input))));
            OffsetAttribute att = ((OffsetAttribute)t.GetAttribute(typeof(OffsetAttribute)));

            // Each IncrementToken() result is asserted so that a stream which ends early
            // fails here instead of surfacing as a misleading offset mismatch below.

            // "test1"
            Assert.IsTrue(t.IncrementToken());
            Assert.AreEqual(0, att.StartOffset());
            Assert.AreEqual(5, att.EndOffset() - att.StartOffset());

            // "testlink" (starts after the 14-character opening anchor tag)
            Assert.IsTrue(t.IncrementToken());
            Assert.AreEqual(20, att.StartOffset());
            Assert.AreEqual(8, att.EndOffset() - att.StartOffset());

            // "test2"
            Assert.IsTrue(t.IncrementToken());
            Assert.AreEqual(33, att.StartOffset());
            Assert.AreEqual(5, att.EndOffset() - att.StartOffset());

            // "test3"
            Assert.IsTrue(t.IncrementToken());
            Assert.AreEqual(39, att.StartOffset());
            Assert.AreEqual(5, att.EndOffset() - att.StartOffset());
        }
예제 #2
0
 /// <summary>
 /// Emits the next n-gram of the current input term, pulling a new term from
 /// <c>input</c> whenever the previous one has been fully expanded.
 /// Grams are produced size by size (from <c>minGram</c> to <c>maxGram</c>), and
 /// for each size from left to right across the term.
 /// </summary>
 /// <returns>
 /// <c>true</c> when a gram was written to the attributes; <c>false</c> once
 /// <c>input</c> has no more tokens.
 /// </returns>
 public override bool IncrementToken()
 {
     for (;;)
     {
         if (curTermBuffer == null)
         {
             // Nothing is being expanded right now: fetch the next term.
             if (!input.IncrementToken())
             {
                 return false;
             }

             // Snapshot the term, because the attributes are overwritten for every gram.
             curTermBuffer = (char[])termAtt.TermBuffer().Clone();
             curTermLength = termAtt.TermLength();
             curGramSize   = minGram;
             curPos        = 0;
             tokStart      = offsetAtt.StartOffset();
         }

         while (curGramSize <= maxGram)
         {
             if (curPos + curGramSize <= curTermLength)
             {
                 // A gram of the current size still fits at the current position.
                 ClearAttributes();
                 termAtt.SetTermBuffer(curTermBuffer, curPos, curGramSize);
                 int gramStart = tokStart + curPos;
                 offsetAtt.SetOffset(gramStart, gramStart + curGramSize);
                 curPos++;
                 return true;
             }

             // This size is exhausted for the term: move to the next size and restart at the left edge.
             curGramSize++;
             curPos = 0;
         }

         // All sizes are done; forget the term so the next loop iteration fetches a new one.
         curTermBuffer = null;
     }
 }
예제 #3
0
 /// <summary>
 /// Stores the current term, followed by <c>SEPARATOR</c>, in <c>buffer</c> so it can
 /// form the left part of a gram, and records the term's start offset and whether
 /// it was a common word.
 /// </summary>
 private void SaveTermBuffer()
 {
     // Empty the scratch buffer before writing the new left-hand side.
     buffer.Length = 0;

     var termChars  = termAttribute.Buffer();
     var termLength = termAttribute.Length;
     buffer.Append(termChars, 0, termLength);
     buffer.Append(SEPARATOR);

     lastStartOffset = offsetAttribute.StartOffset();
     lastWasCommon   = Common;
 }
예제 #4
0
 /// <summary>
 /// Emits the next edge n-gram of the current input term, fetching a new term from
 /// <c>input</c> when the previous one is exhausted. For each term the gram size
 /// grows from <c>minGram</c> up to <c>maxGram</c> (or the term's code point count,
 /// whichever is smaller); grams are taken from the front or the back of the term
 /// depending on <c>side</c>.
 /// </summary>
 /// <returns>
 /// <c>true</c> when a gram was written to the attributes; <c>false</c> once
 /// <c>input</c> has no more tokens.
 /// </returns>
 public override bool IncrementToken()
 {
     for (;;)
     {
         if (curTermBuffer == null)
         {
             if (!input.IncrementToken())
             {
                 return false;
             }

             curTermBuffer     = termAtt.Buffer().Clone();
             curTermLength     = termAtt.Length();
             curCodePointCount = charUtils.codePointCount(termAtt);
             curGramSize       = minGram;
             tokStart          = offsetAtt.StartOffset();
             tokEnd            = offsetAtt.EndOffset();

             // From LUCENE_44 on, offsets are never updated. Before that they are only
             // adjusted when (start offset + term length) matches the end offset;
             // otherwise the token is assumed to be a synonym and its offsets are kept.
             updateOffsets = !version.OnOrAfter(Version.LUCENE_44)
                             && (tokStart + curTermLength) == tokEnd;

             // NOTE(review): the increment is accumulated rather than assigned, presumably
             // so that increments of terms too short to yield any gram carry over - confirm.
             savePosIncr += posIncrAtt.PositionIncrement;
             savePosLen   = posLenAtt.PositionLength;
         }

         // A gram is only produced while the size is within range and the term is long enough.
         if (curGramSize <= maxGram && curGramSize <= curCodePointCount)
         {
             // Take curGramSize code points from the front or from the back.
             int start;
             if (side == Side.FRONT)
             {
                 start = 0;
             }
             else
             {
                 start = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize);
             }
             int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);

             ClearAttributes();

             int gramStartOffset = updateOffsets ? tokStart + start : tokStart;
             int gramEndOffset   = updateOffsets ? tokStart + end : tokEnd;
             offsetAtt.SetOffset(gramStartOffset, gramEndOffset);

             // Only the first gram of a term carries the position increment.
             bool isFirstGram = curGramSize == minGram;
             posIncrAtt.PositionIncrement = isFirstGram ? savePosIncr : 0;
             if (isFirstGram)
             {
                 savePosIncr = 0;
             }

             posLenAtt.PositionLength = savePosLen;
             termAtt.CopyBuffer(curTermBuffer, start, end - start);
             curGramSize++;
             return true;
         }

         // No further gram for this term; drop it so the next iteration fetches a new one.
         curTermBuffer = null;
     }
 }
예제 #5
0
        /// <summary>
        /// Resets and consumes <paramref name="ts"/>, asserting that it yields exactly the
        /// terms in <paramref name="output"/> and, for every optional array that is not
        /// <c>null</c>, the matching per-token value. Before each token the attributes are
        /// filled with bogus values and the stream is required to have called
        /// <c>ClearAttributes()</c>, so state leaking between tokens is detected.
        /// The stream is ended and closed afterwards.
        /// </summary>
        /// <param name="ts">Token stream under test.</param>
        /// <param name="output">Expected terms, in order; must not be <c>null</c>.</param>
        /// <param name="startOffsets">Expected start offset per token, or <c>null</c> to skip the check.</param>
        /// <param name="endOffsets">Expected end offset per token, or <c>null</c> to skip the check.</param>
        /// <param name="types">Expected type per token, or <c>null</c> to skip the check.</param>
        /// <param name="posIncrements">Expected position increment per token, or <c>null</c> to skip the check.</param>
        /// <param name="finalOffset">Expected end offset after <c>End()</c>, or <c>null</c> to skip the check.</param>
        public static void AssertTokenStreamContents(TokenStream ts, System.String[] output, int[] startOffsets, int[] endOffsets, System.String[] types, int[] posIncrements, int?finalOffset)
        {
            Assert.IsNotNull(output);
            CheckClearAttributesAttribute clearTracker = (CheckClearAttributesAttribute)ts.AddAttribute(typeof(CheckClearAttributesAttribute));

            Assert.IsTrue(ts.HasAttribute(typeof(TermAttribute)), "has no TermAttribute");
            TermAttribute term = (TermAttribute)ts.GetAttribute(typeof(TermAttribute));

            bool verifyStartOffsets  = startOffsets != null;
            bool verifyEndOffsets    = endOffsets != null;
            bool verifyTypes         = types != null;
            bool verifyPosIncrements = posIncrements != null;

            // Each optional attribute is only required (and fetched) when something will be checked against it.
            OffsetAttribute offsets = null;
            if (verifyStartOffsets || verifyEndOffsets || finalOffset.HasValue)
            {
                Assert.IsTrue(ts.HasAttribute(typeof(OffsetAttribute)), "has no OffsetAttribute");
                offsets = (OffsetAttribute)ts.GetAttribute(typeof(OffsetAttribute));
            }

            TypeAttribute tokenType = null;
            if (verifyTypes)
            {
                Assert.IsTrue(ts.HasAttribute(typeof(TypeAttribute)), "has no TypeAttribute");
                tokenType = (TypeAttribute)ts.GetAttribute(typeof(TypeAttribute));
            }

            PositionIncrementAttribute posIncr = null;
            if (verifyPosIncrements)
            {
                Assert.IsTrue(ts.HasAttribute(typeof(PositionIncrementAttribute)), "has no PositionIncrementAttribute");
                posIncr = (PositionIncrementAttribute)ts.GetAttribute(typeof(PositionIncrementAttribute));
            }

            ts.Reset();
            for (int tokenIndex = 0; tokenIndex < output.Length; tokenIndex++)
            {
                // Poison every attribute so a stream that fails to overwrite them is caught.
                ts.ClearAttributes();
                term.SetTermBuffer("bogusTerm");
                if (offsets != null)
                {
                    offsets.SetOffset(14584724, 24683243);
                }
                if (tokenType != null)
                {
                    tokenType.SetType("bogusType");
                }
                if (posIncr != null)
                {
                    posIncr.SetPositionIncrement(45987657);
                }

                // Discard the flag raised by our own ClearAttributes() call above.
                clearTracker.GetAndResetClearCalled();
                Assert.IsTrue(ts.IncrementToken(), "token " + tokenIndex + " does not exist");
                Assert.IsTrue(clearTracker.GetAndResetClearCalled(), "clearAttributes() was not called correctly in TokenStream chain");

                Assert.AreEqual(output[tokenIndex], term.Term(), "term " + tokenIndex);
                if (verifyStartOffsets)
                {
                    Assert.AreEqual(startOffsets[tokenIndex], offsets.StartOffset(), "startOffset " + tokenIndex);
                }
                if (verifyEndOffsets)
                {
                    Assert.AreEqual(endOffsets[tokenIndex], offsets.EndOffset(), "endOffset " + tokenIndex);
                }
                if (verifyTypes)
                {
                    Assert.AreEqual(types[tokenIndex], tokenType.Type(), "type " + tokenIndex);
                }
                if (verifyPosIncrements)
                {
                    Assert.AreEqual(posIncrements[tokenIndex], posIncr.GetPositionIncrement(), "posIncrement " + tokenIndex);
                }
            }

            // The stream must be exhausted once all expected tokens have been seen.
            Assert.IsFalse(ts.IncrementToken(), "end of stream");
            ts.End();
            if (finalOffset != null)
            {
                Assert.AreEqual(finalOffset, offsets.EndOffset(), "finalOffset ");
            }
            ts.Close();
        }
예제 #6
0
        /// <summary>
        /// Advances the wrapped stream by one token and validates it against the tokens
        /// seen so far: start offsets must not go backwards, and all tokens that leave
        /// from (or arrive at) the same position must share the same start (or end) offset.
        /// Offset checks are only performed when <c>OffsetsAreCorrect</c> is set and the
        /// relevant attributes are present.
        /// </summary>
        /// <returns>
        /// <c>false</c> when the wrapped stream is exhausted; otherwise <c>true</c>.
        /// </returns>
        /// <exception cref="Exception">
        /// Thrown when the position is still -1 after applying the position increment,
        /// when a start offset goes backwards, or when a start/end offset is inconsistent
        /// with an earlier token at the same position.
        /// </exception>
        public override bool IncrementToken()
        {
            if (!Input.IncrementToken())
            {
                return false;
            }

            if (PosIncAtt != null)
            {
                Pos += PosIncAtt.PositionIncrement;
                // NOTE(review): presumably Pos starts at -1, so it can only still be -1
                // here when the first token has a position increment of 0 - confirm.
                if (Pos == -1)
                {
                    throw new Exception("first posInc must be > 0");
                }
            }

            int startOffset = 0;
            int endOffset   = 0;

            if (OffsetAtt != null)
            {
                startOffset = OffsetAtt.StartOffset();
                endOffset   = OffsetAtt.EndOffset();

                if (OffsetsAreCorrect && startOffset < LastStartOffset)
                {
                    throw new Exception(Name + ": offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + LastStartOffset);
                }
                LastStartOffset = startOffset;
            }

            // Without a position-length attribute every token spans exactly one position.
            int posLen = PosLenAtt == null ? 1 : PosLenAtt.PositionLength;

            if (OffsetAtt != null && PosIncAtt != null && OffsetsAreCorrect)
            {
                // Single lookup: either record the start offset for this position or
                // verify it matches the one recorded by an earlier token.
                int oldStartOffset;
                if (!PosToStartOffset.TryGetValue(Pos, out oldStartOffset))
                {
                    // First time we've seen a token leaving from this position.
                    PosToStartOffset[Pos] = startOffset;
                }
                else if (oldStartOffset != startOffset)
                {
                    throw new Exception(Name + ": inconsistent startOffset at pos=" + Pos + ": " + oldStartOffset + " vs " + startOffset + "; token=" + TermAtt);
                }

                int endPos = Pos + posLen;

                int oldEndOffset;
                if (!PosToEndOffset.TryGetValue(endPos, out oldEndOffset))
                {
                    // First time we've seen a token arriving at this position.
                    PosToEndOffset[endPos] = endOffset;
                }
                else if (oldEndOffset != endOffset)
                {
                    throw new Exception(Name + ": inconsistent endOffset at pos=" + endPos + ": " + oldEndOffset + " vs " + endOffset + "; token=" + TermAtt);
                }
            }

            return true;
        }
예제 #7
0
 /// <summary>
 /// Emits the next n-gram of the current input term, fetching a new term from
 /// <c>input</c> when the previous one is exhausted.
 /// From <c>Version.LUCENE_44</c> on, grams are produced position by position (all
 /// sizes from <c>minGram</c> to <c>maxGram</c> at one start position before moving to
 /// the next), counted in code points, and every gram keeps the offsets of the whole
 /// token. For earlier versions, grams are produced size by size across the term,
 /// counted in chars, and offsets are narrowed to the gram unless the token's
 /// offsets do not match its length.
 /// </summary>
 /// <returns>
 /// <c>true</c> when a gram was written to the attributes; <c>false</c> once
 /// <c>input</c> has no more tokens.
 /// </returns>
 public override bool IncrementToken()
 {
     for (;;)
     {
         if (curTermBuffer == null)
         {
             if (!input.IncrementToken())
             {
                 return false;
             }

             curTermBuffer     = termAtt.Buffer().Clone();
             curTermLength     = termAtt.Length;
             curCodePointCount = charUtils.CodePointCount(termAtt);
             curGramSize       = minGram;
             curPos            = 0;
             curPosInc         = posIncAtt.PositionIncrement;
             curPosLen         = posLenAtt.PositionLength;
             tokStart          = offsetAtt.StartOffset();
             tokEnd            = offsetAtt.EndOffset();
             // When (start offset + term length) does not match the end offset, the token
             // is assumed to be a synonym and its offsets are not adjusted.
             hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
         }

         bool gramsByPosition = version.OnOrAfter(Version.LUCENE_44);
         if (gramsByPosition)
         {
             // Sizes exhausted at this position (or the gram no longer fits):
             // advance one code point and start again from the smallest size.
             if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount)
             {
                 ++curPos;
                 curGramSize = minGram;
             }

             if ((curPos + curGramSize) <= curCodePointCount)
             {
                 ClearAttributes();
                 int start = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
                 int end   = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                 termAtt.CopyBuffer(curTermBuffer, start, end - start);
                 // Only the first gram of a term carries the position increment.
                 posIncAtt.PositionIncrement = curPosInc;
                 curPosInc = 0;
                 posLenAtt.PositionLength = curPosLen;
                 offsetAtt.SetOffset(tokStart, tokEnd);
                 curGramSize++;
                 return true;
             }
         }
         else
         {
             while (curGramSize <= maxGram)
             {
                 if (curPos + curGramSize <= curTermLength)
                 {
                     // A gram of the current size still fits at the current position.
                     ClearAttributes();
                     termAtt.CopyBuffer(curTermBuffer, curPos, curGramSize);

                     int gramStartOffset = hasIllegalOffsets ? tokStart : tokStart + curPos;
                     int gramEndOffset   = hasIllegalOffsets ? tokEnd : tokStart + curPos + curGramSize;
                     offsetAtt.SetOffset(gramStartOffset, gramEndOffset);

                     curPos++;
                     return true;
                 }

                 // This size is exhausted: move to the next size and restart at the left edge.
                 curGramSize++;
                 curPos = 0;
             }
         }

         // No further gram for this term; drop it so the next iteration fetches a new one.
         curTermBuffer = null;
     }
 }