Example #1
        public void TestOffsetsWithTokenizer()
        {
            const string input = @"test1 <a href=""foo"">testlink</a> test2 test3";

            Tokenizer t = new WhitespaceTokenizer(new HTMLStripCharFilter(CharReader.Get(new StringReader(input))));

            OffsetAttribute att = (OffsetAttribute)t.GetAttribute(typeof(OffsetAttribute));

            t.IncrementToken();
            Assert.AreEqual(0, att.StartOffset());
            Assert.AreEqual(5, att.EndOffset() - att.StartOffset());

            t.IncrementToken();
            Assert.AreEqual(20, att.StartOffset());
            Assert.AreEqual(8, att.EndOffset() - att.StartOffset());

            t.IncrementToken();
            Assert.AreEqual(33, att.StartOffset());
            Assert.AreEqual(5, att.EndOffset() - att.StartOffset());

            t.IncrementToken();
            Assert.AreEqual(39, att.StartOffset());
            Assert.AreEqual(5, att.EndOffset() - att.StartOffset());
        }
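The pattern above generalizes to any TokenStream: acquire each attribute once up front, then re-read it after every IncrementToken() call, because the stream rewrites the shared attribute instances in place. A minimal dump loop in the same 2.9-era attribute style (a sketch; the namespaces and exact overloads are assumed from the snippets on this page):

        public static void DumpTokens(string html)
        {
            // Same chain as the test above: strip HTML, then split on whitespace.
            TokenStream ts = new WhitespaceTokenizer(
                new HTMLStripCharFilter(CharReader.Get(new StringReader(html))));
            TermAttribute   term = (TermAttribute)ts.GetAttribute(typeof(TermAttribute));
            OffsetAttribute off  = (OffsetAttribute)ts.GetAttribute(typeof(OffsetAttribute));
            while (ts.IncrementToken())
            {
                // Offsets are corrected back into the raw HTML by the char filter.
                Console.WriteLine("{0} [{1},{2})", term.Term(), off.StartOffset(), off.EndOffset());
            }
            ts.End();
            ts.Close();
        }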
Example #2
        public override bool IncrementToken()
        {
            if (endState != null)
            {
                return(false);
            }

            if (!Input.IncrementToken())
            {
                return(false);
            }

            int skippedPositions = 0;

            while (true)
            {
                if (stopWords.Contains(termAtt.Buffer(), 0, termAtt.Length))
                {
                    int posInc    = posIncAtt.PositionIncrement;
                    int endOffset = offsetAtt.EndOffset();
                    // This may be a stop word, but only if it's not the last token; capture state so we can restore it:
                    State sav = CaptureState();
                    if (Input.IncrementToken())
                    {
                        // It was a stopword; skip it
                        skippedPositions += posInc;
                    }
                    else
                    {
                        ClearAttributes();
                        Input.End();
                        endState = CaptureState();
                        int finalEndOffset = offsetAtt.EndOffset();
                        Debug.Assert(finalEndOffset >= endOffset);
                        if (finalEndOffset > endOffset)
                        {
                            // OK there was a token separator after the
                            // stopword, so it was a stopword
                            return(false);
                        }
                        else
                        {
                            // No token separator after final token that
                            // looked like a stop-word; don't filter it:
                            RestoreState(sav);
                            posIncAtt.PositionIncrement = skippedPositions + posIncAtt.PositionIncrement;
                            keywordAtt.Keyword          = true;
                            return(true);
                        }
                    }
                }
                else
                {
                    // Not a stopword; return the current token:
                    posIncAtt.PositionIncrement = skippedPositions + posIncAtt.PositionIncrement;
                    return(true);
                }
            }
        }
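This is the shape of a suggester-oriented stop filter (modeled on Lucene's SuggestStopFilter): a trailing token that merely looks like a stop word is kept, and marked as a keyword, because the user may still be typing it; only a stop word followed by a token separator is dropped. A hedged wiring sketch; the filter name, the stop-set helper, and the tokenizer overload are assumptions that vary by version:

            CharArraySet stopWords = StopFilter.MakeStopSet(Version.LUCENE_48, "to"); // overload varies by version
            TokenStream ts = new WhitespaceTokenizer(new StringReader("going to"));   // 4.x ctors also take a Version
            ts = new SuggestStopFilter(ts, stopWords);
            // "going to " (trailing separator) drops the final "to";
            // "going to" keeps it, since the user may be typing "tokyo".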
Example #3
        /// <summary>
        /// Constructs a compound token.
        /// </summary>
        private void GramToken()
        {
            buffer.Append(termAttribute.Buffer(), 0, termAttribute.Length);
            int endOffset = offsetAttribute.EndOffset();

            ClearAttributes();

            int length = buffer.Length;

            char[] termText = termAttribute.Buffer();
            if (length > termText.Length)
            {
                termText = termAttribute.ResizeBuffer(length);
            }

            buffer.GetChars(0, length, termText, 0);
            termAttribute.Length = length;
            posIncAttribute.PositionIncrement = 0;
            posLenAttribute.PositionLength    = 2;  // bigram
            offsetAttribute.SetOffset(lastStartOffset, endOffset);
            typeAttribute.Type = GRAM_TYPE;
            buffer.Length      = 0;
        }
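The attribute combination set here (PositionIncrement = 0, PositionLength = 2) stacks the compound token on the previous unigram's start position while letting it span two positions in the token graph. A consumer walks that graph the same way Example 6 below does; a sketch, assuming posIncAtt and posLenAtt were obtained from the stream as in the other examples:

            int pos = -1;
            while (ts.IncrementToken())
            {
                pos += posIncAtt.PositionIncrement;           // 0 => same start node as the previous token
                int endPos = pos + posLenAtt.PositionLength;  // the compound's arc spans two positions
                // graph arc: pos -> endPos, labeled with the term text
            }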
Example #4
 public override bool IncrementToken()
 {
     while (true)
     {
         if (curTermBuffer == null)
         {
             if (!input.IncrementToken())
             {
                 return(false);
             }
             else
             {
                 curTermBuffer     = (char[])termAtt.Buffer().Clone();
                 curTermLength     = termAtt.Length;
                 curCodePointCount = charUtils.CodePointCount(termAtt);
                 curGramSize       = minGram;
                 tokStart          = offsetAtt.StartOffset();
                 tokEnd            = offsetAtt.EndOffset();
                 if (version.OnOrAfter(Version.LUCENE_44))
                 {
                     // Never update offsets
                     updateOffsets = false;
                 }
                 else
                 {
                     // if length by start + end offsets doesn't match the term text then assume
                     // this is a synonym and don't adjust the offsets.
                     updateOffsets = (tokStart + curTermLength) == tokEnd;
                 }
                 savePosIncr += posIncrAtt.PositionIncrement;
                 savePosLen   = posLenAtt.PositionLength;
             }
         }
         if (curGramSize <= maxGram)               // still inside the n-gram size range?
         {
             if (curGramSize <= curCodePointCount) // enough input left to produce this gram size?
             {
                 // grab gramSize chars from front or back
                 int start = side == Side.FRONT ? 0 : charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize);
                 int end   = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                 ClearAttributes();
                 if (updateOffsets)
                 {
                     offsetAtt.SetOffset(tokStart + start, tokStart + end);
                 }
                 else
                 {
                     offsetAtt.SetOffset(tokStart, tokEnd);
                 }
                 // first ngram gets increment, others don't
                 if (curGramSize == minGram)
                 {
                     posIncrAtt.PositionIncrement = savePosIncr;
                     savePosIncr = 0;
                 }
                 else
                 {
                     posIncrAtt.PositionIncrement = 0;
                 }
                 posLenAtt.PositionLength = savePosLen;
                 termAtt.CopyBuffer(curTermBuffer, start, end - start);
                 curGramSize++;
                 return(true);
             }
         }
         curTermBuffer = null;
     }
 }
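Concretely, with Side.FRONT, minGram = 1 and maxGram = 3, the loop above turns the single token "ramen" into "r", "ra", "ram": the first gram carries the saved position increment and the rest get 0. A hedged usage sketch; the (version, input, minGram, maxGram) constructor shape is modeled on Lucene 4.x and is an assumption here:

     TokenStream ts = new KeywordTokenizer(new StringReader("ramen"));
     ts = new EdgeNGramTokenFilter(Version.LUCENE_44, ts, 1, 3); // assumed ctor shape
     // emits "r" (posInc 1), then "ra" and "ram" (posInc 0);
     // under LUCENE_44 all three keep the original offsets [0,5)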
Example #5
        public static void AssertTokenStreamContents(TokenStream ts, System.String[] output, int[] startOffsets, int[] endOffsets, System.String[] types, int[] posIncrements, int? finalOffset)
        {
            Assert.IsNotNull(output);
            CheckClearAttributesAttribute checkClearAtt = (CheckClearAttributesAttribute)ts.AddAttribute(typeof(CheckClearAttributesAttribute));

            Assert.IsTrue(ts.HasAttribute(typeof(TermAttribute)), "has no TermAttribute");
            TermAttribute termAtt = (TermAttribute)ts.GetAttribute(typeof(TermAttribute));

            OffsetAttribute offsetAtt = null;

            if (startOffsets != null || endOffsets != null || finalOffset != null)
            {
                Assert.IsTrue(ts.HasAttribute(typeof(OffsetAttribute)), "has no OffsetAttribute");
                offsetAtt = (OffsetAttribute)ts.GetAttribute(typeof(OffsetAttribute));
            }

            TypeAttribute typeAtt = null;

            if (types != null)
            {
                Assert.IsTrue(ts.HasAttribute(typeof(TypeAttribute)), "has no TypeAttribute");
                typeAtt = (TypeAttribute)ts.GetAttribute(typeof(TypeAttribute));
            }

            PositionIncrementAttribute posIncrAtt = null;

            if (posIncrements != null)
            {
                Assert.IsTrue(ts.HasAttribute(typeof(PositionIncrementAttribute)), "has no PositionIncrementAttribute");
                posIncrAtt = (PositionIncrementAttribute)ts.GetAttribute(typeof(PositionIncrementAttribute));
            }

            ts.Reset();
            for (int i = 0; i < output.Length; i++)
            {
                // extra safety: enforce that state is not preserved by assigning bogus values first
                ts.ClearAttributes();
                termAtt.SetTermBuffer("bogusTerm");
                if (offsetAtt != null)
                {
                    offsetAtt.SetOffset(14584724, 24683243);
                }
                if (typeAtt != null)
                {
                    typeAtt.SetType("bogusType");
                }
                if (posIncrAtt != null)
                {
                    posIncrAtt.SetPositionIncrement(45987657);
                }

                checkClearAtt.GetAndResetClearCalled(); // reset it, because we called ClearAttributes() above
                Assert.IsTrue(ts.IncrementToken(), "token " + i + " does not exist");
                Assert.IsTrue(checkClearAtt.GetAndResetClearCalled(), "clearAttributes() was not called correctly in TokenStream chain");

                Assert.AreEqual(output[i], termAtt.Term(), "term " + i);
                if (startOffsets != null)
                {
                    Assert.AreEqual(startOffsets[i], offsetAtt.StartOffset(), "startOffset " + i);
                }
                if (endOffsets != null)
                {
                    Assert.AreEqual(endOffsets[i], offsetAtt.EndOffset(), "endOffset " + i);
                }
                if (types != null)
                {
                    Assert.AreEqual(types[i], typeAtt.Type(), "type " + i);
                }
                if (posIncrements != null)
                {
                    Assert.AreEqual(posIncrements[i], posIncrAtt.GetPositionIncrement(), "posIncrement " + i);
                }
            }
            Assert.IsFalse(ts.IncrementToken(), "end of stream");
            ts.End();
            if (finalOffset.HasValue)
            {
                Assert.AreEqual(finalOffset.Value, offsetAtt.EndOffset(), "finalOffset");
            }
            ts.Close();
        }
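Null arrays skip the corresponding check, so the helper can verify as much or as little as a test needs. For instance, the assertions of Example 1 collapse into one call (a sketch; the end offsets and the final offset of 44, the length of the raw input, are derived from the values asserted there):

            AssertTokenStreamContents(t,
                new System.String[] { "test1", "testlink", "test2", "test3" },
                new int[] { 0, 20, 33, 39 },   // startOffsets
                new int[] { 5, 28, 38, 44 },   // endOffsets
                null,                          // types: not checked
                null,                          // posIncrements: not checked
                44);                           // finalOffset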
Example #6
        public override bool IncrementToken()
        {
            if (!Input.IncrementToken())
            {
                return(false);
            }

            int startOffset = 0;
            int endOffset   = 0;
            int posLen      = 0;

            if (PosIncAtt != null)
            {
                Pos += PosIncAtt.PositionIncrement;
                if (Pos == -1)
                {
                    throw new Exception("first posInc must be > 0");
                }
            }

            // System.out.println("  got token=" + termAtt + " pos=" + pos);

            if (OffsetAtt != null)
            {
                startOffset = OffsetAtt.StartOffset();
                endOffset   = OffsetAtt.EndOffset();

                if (OffsetsAreCorrect && OffsetAtt.StartOffset() < LastStartOffset)
                {
                    throw new Exception(Name + ": offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + LastStartOffset);
                }
                LastStartOffset = OffsetAtt.StartOffset();
            }

            posLen = PosLenAtt == null ? 1 : PosLenAtt.PositionLength;

            if (OffsetAtt != null && PosIncAtt != null && OffsetsAreCorrect)
            {
                if (!PosToStartOffset.ContainsKey(Pos))
                {
                    // First time we've seen a token leaving from this position:
                    PosToStartOffset[Pos] = startOffset;
                    //System.out.println("  + s " + pos + " -> " + startOffset);
                }
                else
                {
                    // We've seen a token leaving from this position
                    // before; verify the startOffset is the same:
                    //System.out.println("  + vs " + pos + " -> " + startOffset);
                    int oldStartOffset = PosToStartOffset[Pos];
                    if (oldStartOffset != startOffset)
                    {
                        throw new Exception(Name + ": inconsistent startOffset at pos=" + Pos + ": " + oldStartOffset + " vs " + startOffset + "; token=" + TermAtt);
                    }
                }

                int endPos = Pos + posLen;

                if (!PosToEndOffset.ContainsKey(endPos))
                {
                    // First time we've seen a token arriving to this position:
                    PosToEndOffset[endPos] = endOffset;
                    //System.out.println("  + e " + endPos + " -> " + endOffset);
                }
                else
                {
                    // We've seen a token arriving to this position
                    // before; verify the endOffset is the same:
                    //System.out.println("  + ve " + endPos + " -> " + endOffset);
                    int oldEndOffset = PosToEndOffset[endPos];
                    if (oldEndOffset != endOffset)
                    {
                        throw new Exception(Name + ": inconsistent endOffset at pos=" + endPos + ": " + oldEndOffset + " vs " + endOffset + "; token=" + TermAtt);
                    }
                }
            }

            return(true);
        }
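This is the shape of a validating pass-through filter (modeled on the Lucene test framework's ValidatingTokenFilter): it checks that offsets never go backwards and that all tokens leaving or arriving at a graph position agree on that position's offsets. It is typically interleaved between the stages of a chain so the exception names the offending stage. A hedged sketch; the (input, name, offsetsAreCorrect) constructor shape is an assumption:

            TokenStream ts = new WhitespaceTokenizer(new StringReader(text));
            ts = new ValidatingTokenFilter(ts, "after tokenizer", true);
            ts = new SynonymFilter(ts, synonyms, true);
            ts = new ValidatingTokenFilter(ts, "after synonyms", true); // fails here if SynonymFilter breaks the graph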
Example #7
        /// <summary>
        /// Retrieve suggestions.
        /// </summary>
        public virtual IList <LookupResult> Lookup(string key, HashSet <BytesRef> contexts, int num)
        {
            if (contexts != null)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }

            TokenStream ts = queryAnalyzer.TokenStream("", key.ToString());

            try
            {
                TermToBytesRefAttribute    termBytesAtt = ts.AddAttribute <TermToBytesRefAttribute>();
                OffsetAttribute            offsetAtt    = ts.AddAttribute <OffsetAttribute>();
                PositionLengthAttribute    posLenAtt    = ts.AddAttribute <PositionLengthAttribute>();
                PositionIncrementAttribute posIncAtt    = ts.AddAttribute <PositionIncrementAttribute>();
                ts.Reset();

                var lastTokens = new BytesRef[grams];
                //System.out.println("lookup: key='" + key + "'");

                // Run full analysis, but save only the
                // last 1gram, last 2gram, etc.:
                BytesRef tokenBytes   = termBytesAtt.BytesRef;
                int      maxEndOffset = -1;
                bool     sawRealToken = false;
                while (ts.IncrementToken())
                {
                    termBytesAtt.FillBytesRef();
                    sawRealToken |= tokenBytes.Length > 0;
                    // TODO: this is somewhat iffy; today, ShingleFilter
                    // sets posLen to the gram count; maybe we should make
                    // a separate dedicated att for this?
                    int gramCount = posLenAtt.PositionLength;

                    Debug.Assert(gramCount <= grams);

                    // Safety: make sure the recalculated count "agrees":
                    if (CountGrams(tokenBytes) != gramCount)
                    {
                        throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + countGrams(tokenBytes));
                    }
                    maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset());
                    lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
                }
                ts.End();

                if (!sawRealToken)
                {
                    throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
                }

                // Carefully fill last tokens with _ tokens;
                // ShingleFilter apparently won't emit "only hole"
                // tokens:
                int endPosInc = posIncAtt.PositionIncrement;

                // Note this will also be true if input is the empty
                // string (in which case we saw no tokens and
                // maxEndOffset is still -1), which in fact works out OK
                // because we fill the unigram with an empty BytesRef
                // below:
                bool lastTokenEnded = offsetAtt.EndOffset() > maxEndOffset || endPosInc > 0;
                //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.endOffset());

                if (lastTokenEnded)
                {
                    //System.out.println("  lastTokenEnded");
                    // If user hit space after the last token, then
                    // "upgrade" all tokens.  This way "foo " will suggest
                    // all bigrams starting w/ foo, and not any unigrams
                    // starting with "foo":
                    for (int i = grams - 1; i > 0; i--)
                    {
                        BytesRef token = lastTokens[i - 1];
                        if (token == null)
                        {
                            continue;
                        }
                        token.Grow(token.Length + 1);
                        token.Bytes[token.Length] = separator;
                        token.Length++;
                        lastTokens[i] = token;
                    }
                    lastTokens[0] = new BytesRef();
                }

                var arc = new FST.Arc <long?>();

                var bytesReader = fst.BytesReader;

                // Try highest order models first, and if they return
                // results, return that; else, fallback:
                double backoff = 1.0;

                IList <LookupResult> results = new List <LookupResult>(num);

                // We only add a given suffix once, from the highest
                // order model that saw it; for subsequent lower order
                // models we skip it:
                var seen = new HashSet <BytesRef>();

                for (int gram = grams - 1; gram >= 0; gram--)
                {
                    BytesRef token = lastTokens[gram];
                    // Don't make unigram predictions from empty string:
                    if (token == null || (token.Length == 0 && key.Length > 0))
                    {
                        // Input didn't have enough tokens:
                        //System.out.println("  gram=" + gram + ": skip: not enough input");
                        continue;
                    }

                    if (endPosInc > 0 && gram <= endPosInc)
                    {
                        // Skip hole-only predictions; in theory we
                        // shouldn't have to do this, but we'd need to fix
                        // ShingleFilter to produce only-hole tokens:
                        //System.out.println("  break: only holes now");
                        break;
                    }

                    //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

                    // TODO: we could add fuzziness here
                    // match the prefix portion exactly
                    //Pair<Long,BytesRef> prefixOutput = null;
                    long? prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
                    //System.out.println("  prefixOutput=" + prefixOutput);

                    if (prefixOutput == null)
                    {
                        // This model never saw this prefix, e.g. the
                        // trigram model never saw context "purple mushroom"
                        backoff *= ALPHA;
                        continue;
                    }

                    // TODO: we could do this division at build time, and
                    // bake it into the FST?

                    // Denominator for computing scores from current
                    // model's predictions:
                    long contextCount = totTokens;

                    BytesRef lastTokenFragment = null;

                    for (int i = token.Length - 1; i >= 0; i--)
                    {
                        if (token.Bytes[token.Offset + i] == separator)
                        {
                            BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                            long?    output  = Util.Get(fst, Lucene.Net.Util.Fst.Util.ToIntsRef(context, new IntsRef()));
                            Debug.Assert(output != null);
                            contextCount      = DecodeWeight(output);
                            lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                            break;
                        }
                    }

                    BytesRef finalLastToken;

                    if (lastTokenFragment == null)
                    {
                        finalLastToken = BytesRef.DeepCopyOf(token);
                    }
                    else
                    {
                        finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
                    }
                    Debug.Assert(finalLastToken.Offset == 0);

                    CharsRef spare = new CharsRef();

                    // complete top-N
                    Util.Fst.Util.TopResults <long?> completions = null;
                    try
                    {
                        // Because we store multiple models in one FST
                        // (1gram, 2gram, 3gram), we must restrict the
                        // search so that it only considers the current
                        // model.  For highest order model, this is not
                        // necessary since all completions in the FST
                        // must be from this model, but for lower order
                        // models we have to filter out the higher order
                        // ones:

                        // Must do num+seen.size() for queue depth because we may
                        // reject up to seen.size() paths in acceptResult():
                        Util.Fst.Util.TopNSearcher <long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparator, seen, finalLastToken);

                        // since this search is initialized with a single start node
                        // it is okay to start with an empty input path here
                        searcher.AddStartPaths(arc, prefixOutput, true, new IntsRef());

                        completions = searcher.Search();
                        Debug.Assert(completions.IsComplete);
                    }
                    catch (IOException bogus)
                    {
                        throw new Exception(bogus.ToString(), bogus);
                    }

                    int prefixLength = token.Length;

                    BytesRef suffix = new BytesRef(8);
                    //System.out.println("    " + completions.length + " completions");

                    foreach (Util.Fst.Util.Result <long?> completion in completions)
                    {
                        token.Length = prefixLength;
                        // append suffix
                        Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                        token.Append(suffix);

                        //System.out.println("    completion " + token.utf8ToString());

                        // Skip this path if a higher-order model already
                        // saw/predicted its last token:
                        BytesRef lastToken = token;
                        for (int i = token.Length - 1; i >= 0; i--)
                        {
                            if (token.Bytes[token.Offset + i] == separator)
                            {
                                Debug.Assert(token.Length - i - 1 > 0);
                                lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                                break;
                            }
                        }
                        if (seen.Contains(lastToken))
                        {
                            //System.out.println("      skip dup " + lastToken.utf8ToString());
                            goto nextCompletionContinue;
                        }
                        seen.Add(BytesRef.DeepCopyOf(lastToken));
                        spare.Grow(token.Length);
                        UnicodeUtil.UTF8toUTF16(token, spare);
                        LookupResult result = new LookupResult(spare.ToString(), (long)(long.MaxValue * backoff * ((double)DecodeWeight(completion.Output)) / contextCount));
                        results.Add(result);
                        Debug.Assert(results.Count == seen.Count);
                        //System.out.println("  add result=" + result);
                        nextCompletionContinue:;
                    }
                    backoff *= ALPHA;
                }

                results.Sort(new ComparatorAnonymousInnerClassHelper(this));

                if (results.Count > num)
                {
                    results.SubList(num, results.Count).Clear();
                }

                return(results);
            }
            finally
            {
                IOUtils.CloseWhileHandlingException(ts);
            }
        }
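This Lookup runs the full analysis chain once, keeps only the last 1-gram through n-gram, and then walks from the highest-order model down, multiplying in the ALPHA backoff each time a model has never seen the context. End to end it is used roughly like this (a hedged sketch modeled on the Lucene 4.x suggest API; the constructor, the Build signature, and the LookupResult property names are assumptions):

            var suggester = new FreeTextSuggester(new StandardAnalyzer(Version.LUCENE_48)); // assumed ctor
            suggester.Build(corpusIterator); // corpusIterator: an input iterator over the corpus (hypothetical name)
            foreach (LookupResult r in suggester.Lookup("purple mush", null /*contexts must be null*/, 5))
            {
                Console.WriteLine("{0} (score {1})", r.Key, r.Value); // property names assumed
            }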
Example #8
 /// <summary>
 /// Advances to the next token in the stream; returns false at end of stream.
 /// </summary>
 public override bool IncrementToken()
 {
     while (true)
     {
         if (curTermBuffer == null)
         {
             if (!input.IncrementToken())
             {
                 return(false);
             }
             else
             {
                 curTermBuffer     = (char[])termAtt.Buffer().Clone();
                 curTermLength     = termAtt.Length;
                 curCodePointCount = charUtils.CodePointCount(termAtt);
                 curGramSize       = minGram;
                 curPos            = 0;
                 curPosInc         = posIncAtt.PositionIncrement;
                 curPosLen         = posLenAtt.PositionLength;
                 tokStart          = offsetAtt.StartOffset();
                 tokEnd            = offsetAtt.EndOffset();
                 // if length by start + end offsets doesn't match the term text then assume
                 // this is a synonym and don't adjust the offsets.
                 hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
             }
         }
         if (version.OnOrAfter(Version.LUCENE_44))
         {
             if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount)
             {
                 ++curPos;
                 curGramSize = minGram;
             }
             if ((curPos + curGramSize) <= curCodePointCount)
             {
                 ClearAttributes();
                 int start = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
                 int end   = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
                 termAtt.CopyBuffer(curTermBuffer, start, end - start);
                 posIncAtt.PositionIncrement = curPosInc;
                 curPosInc = 0;
                 posLenAtt.PositionLength = curPosLen;
                 offsetAtt.SetOffset(tokStart, tokEnd);
                 curGramSize++;
                 return(true);
             }
         }
         else
         {
             while (curGramSize <= maxGram)
             {
                 while (curPos + curGramSize <= curTermLength) // while there is input
                 {
                     ClearAttributes();
                     termAtt.CopyBuffer(curTermBuffer, curPos, curGramSize);
                     if (hasIllegalOffsets)
                     {
                         offsetAtt.SetOffset(tokStart, tokEnd);
                     }
                     else
                     {
                         offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize);
                     }
                     curPos++;
                     return(true);
                 }
                 curGramSize++; // increase n-gram size
                 curPos = 0;
             }
         }
         curTermBuffer = null;
     }
 }
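Under LUCENE_44 semantics the outer position advances one code point at a time and every gram size is emitted at each position, so minGram = 1 and maxGram = 2 over the token "abc" yield "a", "ab", "b", "bc", "c", all keeping the original offsets [0,3). A hedged sketch; as in Example 4, the (version, input, minGram, maxGram) constructor shape is an assumption:

     TokenStream ts = new KeywordTokenizer(new StringReader("abc"));
     ts = new NGramTokenFilter(Version.LUCENE_44, ts, 1, 2); // assumed ctor shape
     // emits: "a", "ab", "b", "bc", "c"; offsets stay [0,3) under LUCENE_44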