public static void AssertTokenStreamContents(TokenStream ts, String[] output, Int32[] startOffsets, Int32[] endOffsets, String[] types, Int32[] posIncrements, Int32? finalOffset) {
            Assert.IsNotNull(output);
            var checkClearAtt = (CheckClearAttributesAttribute) ts.AddAttribute(typeof (CheckClearAttributesAttribute));

            Assert.IsTrue(ts.HasAttribute(typeof (TermAttribute)), "has no TermAttribute");
            var termAtt = (TermAttribute) ts.GetAttribute(typeof (TermAttribute));

            OffsetAttribute offsetAtt = null;
            if (startOffsets != null || endOffsets != null || finalOffset != null) {
                Assert.IsTrue(ts.HasAttribute(typeof (OffsetAttribute)), "has no OffsetAttribute");
                offsetAtt = (OffsetAttribute) ts.GetAttribute(typeof (OffsetAttribute));
            }

            TypeAttribute typeAtt = null;
            if (types != null) {
                Assert.IsTrue(ts.HasAttribute(typeof (TypeAttribute)), "has no TypeAttribute");
                typeAtt = (TypeAttribute) ts.GetAttribute(typeof (TypeAttribute));
            }

            PositionIncrementAttribute posIncrAtt = null;
            if (posIncrements != null) {
                Assert.IsTrue(ts.HasAttribute(typeof (PositionIncrementAttribute)), "has no PositionIncrementAttribute");
                posIncrAtt = (PositionIncrementAttribute) ts.GetAttribute(typeof (PositionIncrementAttribute));
            }

            ts.Reset();
            for (Int32 i = 0; i < output.Length; i++) {
                // extra safety to enforce that state is not preserved, and also to assign bogus values
                ts.ClearAttributes();
                termAtt.SetTermBuffer("bogusTerm");
                if (offsetAtt != null) offsetAtt.SetOffset(14584724, 24683243);
                if (typeAtt != null) typeAtt.SetType("bogusType");
                if (posIncrAtt != null) posIncrAtt.SetPositionIncrement(45987657);

                checkClearAtt.GetAndResetClearCalled(); // reset it, because we called ClearAttributes() above
                Assert.IsTrue(ts.IncrementToken(), "token " + i + " does not exist");
                Assert.IsTrue(checkClearAtt.GetAndResetClearCalled(), "clearAttributes() was not called correctly in TokenStream chain");

                Assert.AreEqual(output[i], termAtt.Term(), "term " + i);
                if (startOffsets != null)
                    Assert.AreEqual(startOffsets[i], offsetAtt.StartOffset(), "startOffset " + i);
                if (endOffsets != null)
                    Assert.AreEqual(endOffsets[i], offsetAtt.EndOffset(), "endOffset " + i);
                if (types != null)
                    Assert.AreEqual(types[i], typeAtt.Type(), "type " + i);
                if (posIncrements != null)
                    Assert.AreEqual(posIncrements[i], posIncrAtt.GetPositionIncrement(), "posIncrement " + i);
            }
            Assert.IsFalse(ts.IncrementToken(), "end of stream");
            ts.End();
            if (finalOffset.HasValue)
                Assert.AreEqual(finalOffset.Value, offsetAtt.EndOffset(), "finalOffset");
            ts.Close();
        }
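Added usage sketch (not part of the original snippet): one way the helper above might be invoked, assuming the same-era WhitespaceTokenizer(TextReader) API; the tokenizer choice and the literal expectations are illustrative only.
        public static void ExampleUsage_AssertTokenStreamContents()
        {
            // "hello world" split on whitespace: hello @ [0,5), world @ [6,11), final offset 11
            TokenStream ts = new WhitespaceTokenizer(new System.IO.StringReader("hello world"));
            AssertTokenStreamContents(ts,
                new String[] { "hello", "world" },  // expected terms
                new Int32[] { 0, 6 },               // expected start offsets
                new Int32[] { 5, 11 },              // expected end offsets
                null,                               // skip type assertions
                new Int32[] { 1, 1 },               // expected position increments
                11);                                // expected final offset after End()
        }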
Example #2
        private void assertBaseForms(String input, params String[] baseForms)
        {
            TokenStream ts = analyzer.GetTokenStream("ignored", input);

            try
            {
                IBaseFormAttribute baseFormAtt = ts.AddAttribute <IBaseFormAttribute>();
                ts.Reset();
                foreach (String baseForm in baseForms)
                {
                    assertTrue(ts.IncrementToken());
                    assertEquals(baseForm, baseFormAtt.GetBaseForm());
                }
                assertFalse(ts.IncrementToken());
                ts.End();
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }
        }
Example #3
        private void assertPronunciations(String input, params String[] pronunciations)
        {
            TokenStream ts = analyzer.GetTokenStream("ignored", input);

            try
            {
                IReadingAttribute readingAtt = ts.AddAttribute <IReadingAttribute>();
                ts.Reset();
                foreach (String pronunciation in pronunciations)
                {
                    assertTrue(ts.IncrementToken());
                    assertEquals(pronunciation, readingAtt.GetPronunciation());
                }
                assertFalse(ts.IncrementToken());
                ts.End();
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }
        }
Example #4
        private void assertPartsOfSpeech(String input, params String[] partsOfSpeech)
        {
            TokenStream ts = analyzer.GetTokenStream("ignored", input);

            try
            {
                IPartOfSpeechAttribute partOfSpeechAtt = ts.AddAttribute <IPartOfSpeechAttribute>();
                ts.Reset();
                foreach (String partOfSpeech in partsOfSpeech)
                {
                    assertTrue(ts.IncrementToken());
                    assertEquals(partOfSpeech, partOfSpeechAtt.GetPartOfSpeech());
                }
                assertFalse(ts.IncrementToken());
                ts.End();
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }
        }
Example #5
        private void assertInflectionTypes(String input, params String[] inflectionTypes)
        {
            TokenStream ts = analyzer.GetTokenStream("ignored", input);

            try
            {
                IInflectionAttribute inflectionAtt = ts.AddAttribute <IInflectionAttribute>();
                ts.Reset();
                foreach (String inflectionType in inflectionTypes)
                {
                    assertTrue(ts.IncrementToken());
                    assertEquals(inflectionType, inflectionAtt.GetInflectionType());
                }
                assertFalse(ts.IncrementToken());
                ts.End();
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }
        }
Example #6
        private Query ExpandPrefixQuery(string field, string termStr)
        {
            if (string.IsNullOrWhiteSpace(field))
            {
                throw new ArgumentException("field must have a value.");
            }

            var terms = new List <string>();

            using (TokenStream source = base.Analyzer.TokenStream(field, new StringReader(termStr)))
            {
                var term = source.AddAttribute <ITermAttribute>();

                while (source.IncrementToken())
                {
                    terms.Add(term.Term);
                }
            }

            if (terms.Count != 1)
            {
                /* this means that the analyzer used either added or consumed
                 * (common for a stemmer) tokens, and we can't build a PrefixQuery */
                throw new ParseException(string.Format("Cannot build PrefixQuery with analyzer {0} with {1}.",
                                                       Analyzer.GetType(),
                                                       terms.Count > 1 ? "token(s) added" : "token consumed"));
            }

            var prefixQuery = new PrefixQuery(new Term(field, terms[0]));

            // If the user passed a map of boosts, apply this field's boost
            if (boosts != null)
            {
                prefixQuery.Boost = boosts[field];
            }

            var r = MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE.Rewrite(_indexReader, prefixQuery);

            return(r);
        }
Example #7
        public virtual Query GetQuery(XmlElement e)
        {
            string fieldName = DOMUtils.GetAttributeWithInheritanceOrFail(e, "fieldName");
            string text      = DOMUtils.GetNonBlankTextOrFail(e);

            BooleanQuery bq = new BooleanQuery(DOMUtils.GetAttribute(e, "disableCoord", false));

            bq.MinimumNumberShouldMatch = DOMUtils.GetAttribute(e, "minimumNumberShouldMatch", 0);
            TokenStream ts = null;

            try
            {
                ts = analyzer.GetTokenStream(fieldName, text);
                ITermToBytesRefAttribute termAtt = ts.AddAttribute <ITermToBytesRefAttribute>();
                Term     term  = null;
                BytesRef bytes = termAtt.BytesRef;
                ts.Reset();
                while (ts.IncrementToken())
                {
                    termAtt.FillBytesRef();
                    term = new Term(fieldName, BytesRef.DeepCopyOf(bytes));
                    bq.Add(new BooleanClause(new TermQuery(term), Occur.SHOULD));
                }
                ts.End();
            }
            catch (Exception ioe) when(ioe.IsIOException())
            {
                throw RuntimeException.Create("Error constructing terms from index:" + ioe, ioe);
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }

            bq.Boost = DOMUtils.GetAttribute(e, "boost", 1.0f);
            return(bq);
        }
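Added usage sketch (illustrative; this builder is conventionally registered under the "TermsQuery" tag in the XML query parser, and the builder's analyzer is assumed to be configured already):
        public Query ExampleUsage_TermsQuery()
        {
            var doc = new System.Xml.XmlDocument();
            doc.LoadXml("<TermsQuery fieldName=\"contents\" boost=\"2.0\">fast searchers</TermsQuery>");
            return GetQuery(doc.DocumentElement); // yields one SHOULD-clause TermQuery per analyzed token
        }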
Example #8
        public override SpanQuery GetSpanQuery(XmlElement e)
        {
            string fieldName = DOMUtils.GetAttributeWithInheritanceOrFail(e, "fieldName");
            string value     = DOMUtils.GetNonBlankTextOrFail(e);

            List <SpanQuery> clausesList = new List <SpanQuery>();

            TokenStream ts = null;

            try
            {
                ts = analyzer.GetTokenStream(fieldName, value);
                ITermToBytesRefAttribute termAtt = ts.AddAttribute <ITermToBytesRefAttribute>();
                BytesRef bytes = termAtt.BytesRef;
                ts.Reset();
                while (ts.IncrementToken())
                {
                    termAtt.FillBytesRef();
                    SpanTermQuery stq = new SpanTermQuery(new Term(fieldName, BytesRef.DeepCopyOf(bytes)));
                    clausesList.Add(stq);
                }
                ts.End();
                SpanOrQuery soq = new SpanOrQuery(clausesList.ToArray(/*new SpanQuery[clausesList.size()]*/));
                soq.Boost = DOMUtils.GetAttribute(e, "boost", 1.0f);
                return(soq);
            }
            catch (IOException ioe)
            {
                throw new ParserException("IOException parsing value:" + value, ioe);
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }
        }
Example #9
 public override void Run()
 {
     try
     {
         foreach (var mapping in this.map)
         {
             string      term           = mapping.Key;
             BytesRef    expected       = mapping.Value;
             IOException priorException = null;
             TokenStream ts             = this.analyzer.GetTokenStream("fake", new StringReader(term));
             try
             {
                 ITermToBytesRefAttribute termAtt = ts.AddAttribute <ITermToBytesRefAttribute>();
                 BytesRef bytes = termAtt.BytesRef;
                 ts.Reset();
                 Assert.IsTrue(ts.IncrementToken());
                 termAtt.FillBytesRef();
                 Assert.AreEqual(expected, bytes);
                 Assert.IsFalse(ts.IncrementToken());
                 ts.End();
             }
             catch (IOException e)
             {
                 priorException = e;
             }
             finally
             {
                 IOUtils.DisposeWhileHandlingException(priorException, ts);
             }
         }
     }
     catch (IOException e)
     {
         throw new Exception(e.ToString(), e);
     }
 }
Example #10
        /// <summary>
        /// TODO: rewrite tests not to use string comparison.
        /// </summary>
        private static string tsToString(TokenStream @in)
        {
            StringBuilder      @out    = new StringBuilder();
            ICharTermAttribute termAtt = @in.AddAttribute <ICharTermAttribute>();

            // extra safety to enforce that state is not preserved, and also
            // assign bogus values
            @in.ClearAttributes();
            termAtt.SetEmpty().Append("bogusTerm");
            @in.Reset();
            while (@in.IncrementToken())
            {
                if (@out.Length > 0)
                {
                    @out.Append(' ');
                }
                @out.Append(termAtt.ToString());
                @in.ClearAttributes();
                termAtt.SetEmpty().Append("bogusTerm");
            }

            @in.Dispose();
            return(@out.ToString());
        }
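Added usage sketch (illustrative; 'analyzer' stands for any test analyzer, e.g. the MockAnalyzer used elsewhere in these tests):
        private static void ExampleUsage_tsToString(Analyzer analyzer)
        {
            string joined = tsToString(analyzer.GetTokenStream("field", new System.IO.StringReader("foo bar baz")));
            assertEquals("foo bar baz", joined); // holds for a plain whitespace-splitting analyzer
        }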
Example #11
        private static IList <string> SplitByTokenizer(string source, TokenizerFactory tokFactory)
        {
            StringReader   reader  = new StringReader(source);
            TokenStream    ts      = LoadTokenizer(tokFactory, reader);
            IList <string> tokList = new List <string>();

            try
            {
                ICharTermAttribute termAtt = ts.AddAttribute <ICharTermAttribute>();
                ts.Reset();
                while (ts.IncrementToken())
                {
                    if (termAtt.Length > 0)
                    {
                        tokList.Add(termAtt.ToString());
                    }
                }
            }
            finally
            {
                reader.Dispose();
            }
            return(tokList);
        }
Example #12
        /// <summary> Simple similarity query generators.
        /// Takes every unique word and forms a boolean query where all words are optional.
        /// After you get this you'll use it to query your <see cref="IndexSearcher"/> for similar docs.
        /// The only caveat is that the first hit returned <b>should be</b> your source document - you'll
        /// then need to ignore that.
        ///
        /// <p/>
        ///
        /// So, if you have a code fragment like this:
        /// <br/>
        /// <code>
        /// Query q = FormSimilarQuery("I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
        /// </code>
        ///
        /// <p/>
        ///
        ///  The query returned, in string form, will be <c>'(i use lucene to search fast searchers are good)'</c>.
        ///
        /// <p/>
        /// The philosophy behind this method is "two documents are similar if they share lots of words".
        /// Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a higher similarity score if they share more uncommon words.
        ///
        /// <p/>
        /// This method is fail-safe in that if a long 'body' is passed in and
        /// <see cref="BooleanQuery.Add"/> (used internally)
        /// throws
        /// <see cref="BooleanQuery.TooManyClauses"/>, the
        /// query as it is will be returned.
        /// </summary>
        /// <param name="body">the body of the document you want to find similar documents to
        /// </param>
        /// <param name="a">the analyzer to use to parse the body
        /// </param>
        /// <param name="field">the field you want to search on, probably something like "contents" or "body"
        /// </param>
        /// <param name="stop">optional set of stop words to ignore
        /// </param>
        /// <returns> a query with all unique words in 'body'
        /// </returns>
        /// <throws>  IOException this can't happen... </throws>
        public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, ISet <string> stop)
        {
            TokenStream    ts      = a.TokenStream(field, new System.IO.StringReader(body));
            ITermAttribute termAtt = ts.AddAttribute <ITermAttribute>();

            BooleanQuery  tmp     = new BooleanQuery();
            ISet <string> already = Lucene.Net.Support.Compatibility.SetFactory.CreateHashSet <string>(); // ignore dups

            while (ts.IncrementToken())
            {
                String word = termAtt.Term;
                // ignore opt stop words
                if (stop != null && stop.Contains(word))
                {
                    continue;
                }
                // ignore dups
                if (already.Contains(word))
                {
                    continue;
                }
                already.Add(word);
                // add to query
                TermQuery tq = new TermQuery(new Term(field, word));
                try
                {
                    tmp.Add(tq, Occur.SHOULD);
                }
                catch (BooleanQuery.TooManyClauses)
                {
                    // fail-safe, just return what we have, not the end of the world
                    break;
                }
            }
            return(tmp);
        }
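Added usage sketch (illustrative; 'searcher' is a hypothetical Searcher over the same index, and the StandardAnalyzer version argument depends on the release in use):
        public static TopDocs ExampleUsage_FormSimilarQuery(Searcher searcher)
        {
            Query q = FormSimilarQuery("I use Lucene to search fast. Fast searchers are good",
                                       new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30), "contents", null);
            return searcher.Search(q, 10); // the first hit should be the source document - ignore it
        }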
Example #13
        public void TestSurrogates2()
        {
            int numIterations = AtLeast(1000);

            for (int i = 0; i < numIterations; i++)
            {
                String      s  = TestUtil.RandomUnicodeString(Random(), 100);
                TokenStream ts = analyzer.GetTokenStream("foo", s);
                try
                {
                    ICharTermAttribute termAtt = ts.AddAttribute <ICharTermAttribute>();
                    ts.Reset();
                    while (ts.IncrementToken())
                    {
                        assertTrue(UnicodeUtil.ValidUTF16String(termAtt));
                    }
                    ts.End();
                }
                finally
                {
                    IOUtils.DisposeWhileHandlingException(ts);
                }
            }
        }
Example #14
        public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix)
            : base(suffix)
        {
            this.suffix     = suffix;
            this.prefix     = prefix;
            prefixExhausted = false;

            termAtt    = AddAttribute <ICharTermAttribute>();
            posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
            payloadAtt = AddAttribute <IPayloadAttribute>();
            offsetAtt  = AddAttribute <IOffsetAttribute>();
            typeAtt    = AddAttribute <ITypeAttribute>();
            flagsAtt   = AddAttribute <IFlagsAttribute>();

            p_termAtt    = prefix.AddAttribute <ICharTermAttribute>();
            p_posIncrAtt = prefix.AddAttribute <IPositionIncrementAttribute>();
            p_payloadAtt = prefix.AddAttribute <IPayloadAttribute>();
            p_offsetAtt  = prefix.AddAttribute <IOffsetAttribute>();
            p_typeAtt    = prefix.AddAttribute <ITypeAttribute>();
            p_flagsAtt   = prefix.AddAttribute <IFlagsAttribute>();
        }
Example #15
        public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix)
            : base(suffix)
        {
            this.suffix = suffix;
            this.prefix = prefix;
            prefixExhausted = false;

            termAtt = AddAttribute<ICharTermAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            payloadAtt = AddAttribute<IPayloadAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
            flagsAtt = AddAttribute<IFlagsAttribute>();

            p_termAtt = prefix.AddAttribute<ICharTermAttribute>();
            p_posIncrAtt = prefix.AddAttribute<IPositionIncrementAttribute>();
            p_payloadAtt = prefix.AddAttribute<IPayloadAttribute>();
            p_offsetAtt = prefix.AddAttribute<IOffsetAttribute>();
            p_typeAtt = prefix.AddAttribute<ITypeAttribute>();
            p_flagsAtt = prefix.AddAttribute<IFlagsAttribute>();
        }
Example #16
        public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) : base(suffix)
        {
            Suffix = suffix;
            Prefix = prefix;
            _prefixExhausted = false;

            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt = AddAttribute<ITermAttribute>();
            _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            _payloadAtt = AddAttribute<IPayloadAttribute>();
            _offsetAtt = AddAttribute<IOffsetAttribute>();
            _typeAtt = AddAttribute<ITypeAttribute>();
            _flagsAtt = AddAttribute<IFlagsAttribute>();
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            _pTermAtt = prefix.AddAttribute<ITermAttribute>();
            _pPosIncrAtt = prefix.AddAttribute<IPositionIncrementAttribute>();
            _pPayloadAtt = prefix.AddAttribute<IPayloadAttribute>();
            _pOffsetAtt = prefix.AddAttribute<IOffsetAttribute>();
            _pTypeAtt = prefix.AddAttribute<ITypeAttribute>();
            _pFlagsAtt = prefix.AddAttribute<IFlagsAttribute>();
        }
Example #17
        public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) : base(suffix)
        {
            Suffix           = suffix;
            Prefix           = prefix;
            _prefixExhausted = false;

            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt    = AddAttribute <ITermAttribute>();
            _posIncrAtt = AddAttribute <IPositionIncrementAttribute>();
            _payloadAtt = AddAttribute <IPayloadAttribute>();
            _offsetAtt  = AddAttribute <IOffsetAttribute>();
            _typeAtt    = AddAttribute <ITypeAttribute>();
            _flagsAtt   = AddAttribute <IFlagsAttribute>();
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            _pTermAtt    = prefix.AddAttribute <ITermAttribute>();
            _pPosIncrAtt = prefix.AddAttribute <IPositionIncrementAttribute>();
            _pPayloadAtt = prefix.AddAttribute <IPayloadAttribute>();
            _pOffsetAtt  = prefix.AddAttribute <IOffsetAttribute>();
            _pTypeAtt    = prefix.AddAttribute <ITypeAttribute>();
            _pFlagsAtt   = prefix.AddAttribute <IFlagsAttribute>();
        }
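Added usage sketch (illustrative; assumes the SingleTokenTokenStream from the same miscellaneous analysis package):
        public static TokenStream ExampleUsage_PrefixAwareTokenFilter(TokenStream input)
        {
            // Emit a single '^' boundary-marker token first, then every token of 'input' with shifted offsets:
            return new PrefixAwareTokenFilter(new SingleTokenTokenStream(new Token("^", 0, 0)), input);
        }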
Example #18
        /// <summary>
        /// Retrieve suggestions, specifying whether all terms
        ///  must match (<paramref name="allTermsRequired"/>) and whether the hits
        ///  should be highlighted (<paramref name="doHighlight"/>).
        /// </summary>
        public virtual IList <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, int num, bool allTermsRequired, bool doHighlight)
        {
            if (m_searcherMgr == null)
            {
                throw new InvalidOperationException("suggester was not built");
            }

            Occur occur;

            if (allTermsRequired)
            {
                occur = Occur.MUST;
            }
            else
            {
                occur = Occur.SHOULD;
            }

            TokenStream  ts = null;
            BooleanQuery query;
            var          matchedTokens = new HashSet <string>();
            string       prefixToken   = null;

            try
            {
                ts = m_queryAnalyzer.GetTokenStream("", new StringReader(key));

                //long t0 = System.currentTimeMillis();
                ts.Reset();
                var    termAtt   = ts.AddAttribute <ICharTermAttribute>();
                var    offsetAtt = ts.AddAttribute <IOffsetAttribute>();
                string lastToken = null;
                query = new BooleanQuery();
                int maxEndOffset = -1;
                matchedTokens = new HashSet <string>();
                while (ts.IncrementToken())
                {
                    if (lastToken != null)
                    {
                        matchedTokens.Add(lastToken);
                        query.Add(new TermQuery(new Term(TEXT_FIELD_NAME, lastToken)), occur);
                    }
                    lastToken = termAtt.ToString();
                    if (lastToken != null)
                    {
                        maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset);
                    }
                }
                ts.End();

                if (lastToken != null)
                {
                    Query lastQuery;
                    if (maxEndOffset == offsetAtt.EndOffset)
                    {
                        // Use PrefixQuery (or the ngram equivalent) when
                        // there were no trailing discarded chars in the
                        // string (e.g. whitespace), so that if the query does
                        // not end with a space we show prefix matches for
                        // that token:
                        lastQuery   = GetLastTokenQuery(lastToken);
                        prefixToken = lastToken;
                    }
                    else
                    {
                        // Use TermQuery for an exact match if there were
                        // trailing discarded chars (e.g. whitespace), so
                        // that if query ends with a space we only show
                        // exact matches for that term:
                        matchedTokens.Add(lastToken);
                        lastQuery = new TermQuery(new Term(TEXT_FIELD_NAME, lastToken));
                    }
                    if (lastQuery != null)
                    {
                        query.Add(lastQuery, occur);
                    }
                }

                if (contexts != null)
                {
                    BooleanQuery sub = new BooleanQuery();
                    query.Add(sub, Occur.MUST);
                    foreach (BytesRef context in contexts)
                    {
                        // NOTE: we "should" wrap this in
                        // ConstantScoreQuery, or maybe send this as a
                        // Filter instead to search, but since all of
                        // these are MUST'd, the change to the score won't
                        // affect the overall ranking.  Since we indexed
                        // as DOCS_ONLY, the perf should be the same
                        // either way (no freq int[] blocks to decode):

                        // TODO: if we had a BinaryTermField we could fix
                        // this "must be valid ut8f" limitation:
                        sub.Add(new TermQuery(new Term(CONTEXTS_FIELD_NAME, context.Utf8ToString())), Occur.SHOULD);
                    }
                }
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }

            // TODO: we could allow blended sort here, combining
            // weight w/ score.  Now we ignore score and sort only
            // by weight:

            Query finalQuery = FinishQuery(query, allTermsRequired);

            //System.out.println("finalQuery=" + query);

            // Sort by weight, descending:
            TopFieldCollector c = TopFieldCollector.Create(SORT, num, true, false, false, false);

            // We sorted postings by weight during indexing, so we
            // only retrieve the first num hits now:
            ICollector           c2       = new EarlyTerminatingSortingCollector(c, SORT, num);
            IndexSearcher        searcher = m_searcherMgr.Acquire();
            IList <LookupResult> results  = null;

            try
            {
                //System.out.println("got searcher=" + searcher);
                searcher.Search(finalQuery, c2);

                TopFieldDocs hits = (TopFieldDocs)c.GetTopDocs();

                // Slower way if postings are not pre-sorted by weight:
                // hits = searcher.search(query, null, num, SORT);
                results = CreateResults(searcher, hits, num, key, doHighlight, matchedTokens, prefixToken);
            }
            finally
            {
                m_searcherMgr.Release(searcher);
            }

            //System.out.println((System.currentTimeMillis() - t0) + " msec for infix suggest");
            //System.out.println(results);

            return(results);
        }
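Added usage sketch (illustrative; 'suggester' is an already-built instance of the enclosing class - AnalyzingInfixSuggester, judging by the fields used - and contexts may be null):
        public static void ExampleUsage_DoLookup(AnalyzingInfixSuggester suggester)
        {
            IList<LookupResult> results = suggester.DoLookup("luc", null, 5, allTermsRequired: true, doHighlight: true);
            foreach (LookupResult r in results)
            {
                Console.WriteLine(r.Key + " weight=" + r.Value);
            }
        }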
Example #19
        /*
         * Iterates over the given token stream and adds the resulting terms to the index;
         * Equivalent to adding a tokenized, indexed, termVectorStored, unstored,
         * Lucene {@link org.apache.lucene.document.Field}.
         * Finally closes the token stream. Note that untokenized keywords can be added with this method via
         * {@link #CreateKeywordTokenStream(Collection)}, the Lucene contrib <c>KeywordTokenizer</c> or similar utilities.
         *
         * @param fieldName
         *            a name to be associated with the text
         * @param stream
         *            the token stream to retrieve tokens from.
         * @param boost
         *            the boost factor for hits for this field
         * @see org.apache.lucene.document.Field#setBoost(float)
         */
        public void AddField(String fieldName, TokenStream stream, float boost)
        {
            try
            {
                if (fieldName == null)
                {
                    throw new ArgumentException("fieldName must not be null");
                }
                if (stream == null)
                {
                    throw new ArgumentException("token stream must not be null");
                }
                if (boost <= 0.0f)
                {
                    throw new ArgumentException("boost factor must be greater than 0.0");
                }
                if (fields[fieldName] != null)
                {
                    throw new ArgumentException("field must not be added more than once");
                }

                var terms            = new HashMap <String, ArrayIntList>();
                int numTokens        = 0;
                int numOverlapTokens = 0;
                int pos = -1;

                var termAtt          = stream.AddAttribute <ITermAttribute>();
                var posIncrAttribute = stream.AddAttribute <IPositionIncrementAttribute>();
                var offsetAtt        = stream.AddAttribute <IOffsetAttribute>();

                stream.Reset();
                while (stream.IncrementToken())
                {
                    String term = termAtt.Term;
                    if (term.Length == 0)
                    {
                        continue;                   // nothing to do
                    }
                    //        if (DEBUG) System.Diagnostics.Debug.WriteLine("token='" + term + "'");
                    numTokens++;
                    int posIncr = posIncrAttribute.PositionIncrement;
                    if (posIncr == 0)
                    {
                        numOverlapTokens++;
                    }
                    pos += posIncr;

                    ArrayIntList positions = terms[term];
                    if (positions == null)
                    {
                        // term not seen before
                        positions   = new ArrayIntList(stride);
                        terms[term] = positions;
                    }
                    if (stride == 1)
                    {
                        positions.Add(pos);
                    }
                    else
                    {
                        positions.Add(pos, offsetAtt.StartOffset, offsetAtt.EndOffset);
                    }
                }
                stream.End();

                // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
                if (numTokens > 0)
                {
                    boost             = boost * docBoost; // see DocumentWriter.addDocument(...)
                    fields[fieldName] = new Info(terms, numTokens, numOverlapTokens, boost);
                    sortedFields      = null;             // invalidate sorted view, if any
                }
            }
            catch (IOException e)
            {
                // can never happen
                throw new SystemException(string.Empty, e);
            }
            finally
            {
                try
                {
                    if (stream != null)
                    {
                        stream.Close();
                    }
                }
                catch (IOException e2)
                {
                    throw new SystemException(string.Empty, e2);
                }
            }
        }
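Added usage sketch (illustrative; this overload appears to belong to the contrib MemoryIndex class, and 'analyzer' is any Analyzer):
        public static float ExampleUsage_AddField(Analyzer analyzer)
        {
            var index = new MemoryIndex();
            index.AddField("content", analyzer.TokenStream("content", new System.IO.StringReader("some sample text")), 1.0f);
            return index.Search(new TermQuery(new Term("content", "sample"))); // > 0 when the query matches
        }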
Example #20
        public virtual void TestRandomPhrases()
        {
            Directory dir      = NewDirectory();
            Analyzer  analyzer = new MockAnalyzer(Random);

            RandomIndexWriter       w    = new RandomIndexWriter(Random, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer).SetMergePolicy(NewLogMergePolicy()));
            IList <IList <string> > docs = new List <IList <string> >();

            Documents.Document d = new Documents.Document();
            Field f = NewTextField("f", "", Field.Store.NO);

            d.Add(f);

            Random r = Random;

            int NUM_DOCS = AtLeast(10);

            for (int i = 0; i < NUM_DOCS; i++)
            {
                // must be > 4096 so it spans multiple chunks
                int termCount = TestUtil.NextInt32(Random, 4097, 8200);

                IList <string> doc = new List <string>();

                StringBuilder sb = new StringBuilder();
                while (doc.Count < termCount)
                {
                    if (r.Next(5) == 1 || docs.Count == 0)
                    {
                        // make new non-empty-string term
                        string term;
                        while (true)
                        {
                            term = TestUtil.RandomUnicodeString(r);
                            if (term.Length > 0)
                            {
                                break;
                            }
                        }
                        Exception   priorException = null; // LUCENENET: No need to cast to IOException
                        TokenStream ts             = analyzer.GetTokenStream("ignore", new StringReader(term));
                        try
                        {
                            ICharTermAttribute termAttr = ts.AddAttribute <ICharTermAttribute>();
                            ts.Reset();
                            while (ts.IncrementToken())
                            {
                                string text = termAttr.ToString();
                                doc.Add(text);
                                sb.Append(text).Append(' ');
                            }
                            ts.End();
                        }
                        catch (Exception e) when(e.IsIOException())
                        {
                            priorException = e;
                        }
                        finally
                        {
                            IOUtils.DisposeWhileHandlingException(priorException, ts);
                        }
                    }
                    else
                    {
                        // pick existing sub-phrase
                        IList <string> lastDoc = docs[r.Next(docs.Count)];
                        int            len     = TestUtil.NextInt32(r, 1, 10);
                        int            start   = r.Next(lastDoc.Count - len);
                        for (int k = start; k < start + len; k++)
                        {
                            string t = lastDoc[k];
                            doc.Add(t);
                            sb.Append(t).Append(' ');
                        }
                    }
                }
                docs.Add(doc);
                f.SetStringValue(sb.ToString());
                w.AddDocument(d);
            }

            IndexReader   reader = w.GetReader();
            IndexSearcher s      = NewSearcher(reader);

            w.Dispose();

            // now search
            int num = AtLeast(10);

            for (int i = 0; i < num; i++)
            {
                int            docID = r.Next(docs.Count);
                IList <string> doc   = docs[docID];

                int           numTerm = TestUtil.NextInt32(r, 2, 20);
                int           start   = r.Next(doc.Count - numTerm);
                PhraseQuery   pq      = new PhraseQuery();
                StringBuilder sb      = new StringBuilder();
                for (int t = start; t < start + numTerm; t++)
                {
                    pq.Add(new Term("f", doc[t]));
                    sb.Append(doc[t]).Append(' ');
                }

                TopDocs hits  = s.Search(pq, NUM_DOCS);
                bool    found = false;
                for (int j = 0; j < hits.ScoreDocs.Length; j++)
                {
                    if (hits.ScoreDocs[j].Doc == docID)
                    {
                        found = true;
                        break;
                    }
                }

                Assert.IsTrue(found, "phrase '" + sb + "' not found; start=" + start);
            }

            reader.Dispose();
            dir.Dispose();
        }
Example #21
        /// <summary>
        /// Not an explicit test, just useful to print out some info on performance
        /// </summary>
        public virtual void Performance()
        {
            int[] tokCount  = new int[] { 100, 500, 1000, 2000, 5000, 10000 };
            int[] modCounts = new int[] { 1, 2, 5, 10, 20, 50, 100, 200, 500 };
            for (int k = 0; k < tokCount.Length; k++)
            {
                StringBuilder buffer = new StringBuilder();
                Console.WriteLine("-----Tokens: " + tokCount[k] + "-----");
                for (int i = 0; i < tokCount[k]; i++)
                {
                    //buffer.Append(English.intToEnglish(i).toUpperCase(Locale.ROOT)).Append(' ');
                    buffer.Append(i.ToString(CultureInfo.InvariantCulture)).Append(' ');
                }
                //make sure we produce the same tokens
                TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))));
                TokenStream        sink      = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, 100));
                teeStream.ConsumeAllTokens();
                TokenStream        stream  = new ModuloTokenFilter(this, new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))), 100);
                ICharTermAttribute tfTok   = stream.AddAttribute <ICharTermAttribute>();
                ICharTermAttribute sinkTok = sink.AddAttribute <ICharTermAttribute>();
                for (int i = 0; stream.IncrementToken(); i++)
                {
                    assertTrue(sink.IncrementToken());
                    assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.Equals(sinkTok) == true);
                }

                //simulate two fields, each being analyzed once, for 20 documents
                for (int j = 0; j < modCounts.Length; j++)
                {
                    int tfPos = 0;
                    //long start = DateTimeHelperClass.CurrentUnixTimeMillis();
                    long start = Environment.TickCount;
                    for (int i = 0; i < 20; i++)
                    {
                        stream = new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString())));
                        IPositionIncrementAttribute posIncrAtt = stream.GetAttribute <IPositionIncrementAttribute>();
                        while (stream.IncrementToken())
                        {
                            tfPos += posIncrAtt.PositionIncrement;
                        }
                        stream     = new ModuloTokenFilter(this, new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))), modCounts[j]);
                        posIncrAtt = stream.GetAttribute <IPositionIncrementAttribute>();
                        while (stream.IncrementToken())
                        {
                            tfPos += posIncrAtt.PositionIncrement;
                        }
                    }
                    //long finish = DateTimeHelperClass.CurrentUnixTimeMillis();
                    long finish = Environment.TickCount;
                    Console.WriteLine("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");
                    int sinkPos = 0;
                    //simulate one field with one sink
                    //start = DateTimeHelperClass.CurrentUnixTimeMillis();
                    start = Environment.TickCount;
                    for (int i = 0; i < 20; i++)
                    {
                        teeStream = new TeeSinkTokenFilter(new StandardFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer.ToString()))));
                        sink      = teeStream.NewSinkTokenStream(new ModuloSinkFilter(this, modCounts[j]));
                        IPositionIncrementAttribute posIncrAtt = teeStream.GetAttribute <IPositionIncrementAttribute>();
                        while (teeStream.IncrementToken())
                        {
                            sinkPos += posIncrAtt.PositionIncrement;
                        }
                        //System.out.println("Modulo--------");
                        posIncrAtt = sink.GetAttribute <IPositionIncrementAttribute>();
                        while (sink.IncrementToken())
                        {
                            sinkPos += posIncrAtt.PositionIncrement;
                        }
                    }
                    //finish = DateTimeHelperClass.CurrentUnixTimeMillis();
                    finish = Environment.TickCount;
                    Console.WriteLine("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
                    assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos);
                }
                Console.WriteLine("- End Tokens: " + tokCount[k] + "-----");
            }
        }
Example #22
        /// <summary>
        /// Creates a shingle filter with ad hoc parameter settings.
        /// </summary>
        /// <param name="input">stream from which to construct the matrix</param>
        /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
        /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
        /// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
        /// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contains permutation of the first of the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
        /// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
        public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Char? spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
        {
            _input = input;
            MinimumShingleSize = minimumShingleSize;
            MaximumShingleSize = maximumShingleSize;
            SpacerCharacter = spacerCharacter;
            IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
            _settingsCodec = settingsCodec;

            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt = AddAttribute<ITermAttribute>();
            _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            _payloadAtt = AddAttribute<IPayloadAttribute>();
            _offsetAtt = AddAttribute<IOffsetAttribute>();
            _typeAtt = AddAttribute<ITypeAttribute>();
            _flagsAtt = AddAttribute<IFlagsAttribute>();
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            _inTermAtt = input.AddAttribute<ITermAttribute>();
            _inPosIncrAtt = input.AddAttribute<IPositionIncrementAttribute>();
            _inPayloadAtt = input.AddAttribute<IPayloadAttribute>();
            _inOffsetAtt = input.AddAttribute<IOffsetAttribute>();
            _inTypeAtt = input.AddAttribute<ITypeAttribute>();
            _inFlagsAtt = input.AddAttribute<IFlagsAttribute>();
        }
Example #23
        // we only check a few core attributes here.
        // TODO: test other things
        public virtual void assertEquals(string s, TokenStream left, TokenStream right)
        {
            left.Reset();
            right.Reset();
            ICharTermAttribute leftTerm = left.AddAttribute<ICharTermAttribute>();
            ICharTermAttribute rightTerm = right.AddAttribute<ICharTermAttribute>();
            IOffsetAttribute leftOffset = left.AddAttribute<IOffsetAttribute>();
            IOffsetAttribute rightOffset = right.AddAttribute<IOffsetAttribute>();
            IPositionIncrementAttribute leftPos = left.AddAttribute<IPositionIncrementAttribute>();
            IPositionIncrementAttribute rightPos = right.AddAttribute<IPositionIncrementAttribute>();

            while (left.IncrementToken())
            {
                assertTrue("wrong number of tokens for input: " + s, right.IncrementToken());
                assertEquals("wrong term text for input: " + s, leftTerm.ToString(), rightTerm.ToString());
                assertEquals("wrong position for input: " + s, leftPos.PositionIncrement, rightPos.PositionIncrement);
                assertEquals("wrong start offset for input: " + s, leftOffset.StartOffset(), rightOffset.StartOffset());
                assertEquals("wrong end offset for input: " + s, leftOffset.EndOffset(), rightOffset.EndOffset());
            }
            assertFalse("wrong number of tokens for input: " + s, right.IncrementToken());
            left.End();
            right.End();
            assertEquals("wrong final offset for input: " + s, leftOffset.EndOffset(), rightOffset.EndOffset());
            left.Dispose();
            right.Dispose();
        }
Example #24
        /* (non-Javadoc)
         * @see org.apache.lucene.search.highlight.Scorer#init(org.apache.lucene.analysis.TokenStream)
         */

        public TokenStream Init(TokenStream tokenStream)
        {
            termAtt = tokenStream.AddAttribute <ITermAttribute>();
            return(null);
        }
Example #25
        /// <summary>
        /// (non-Javadoc)
        /// @see org.apache.lucene.xmlparser.QueryObjectBuilder#process(org.w3c.dom.Element)
        /// </summary>
        public virtual Query GetQuery(XmlElement e)
        {
            string fieldsList = e.GetAttribute("fieldNames"); //a comma-delimited list of fields

            string[] fields = defaultFieldNames;
            if ((fieldsList != null) && (fieldsList.Trim().Length > 0))
            {
                fields = fieldsList.Trim().Split(',').TrimEnd();
                //trim the fieldnames
                for (int i = 0; i < fields.Length; i++)
                {
                    fields[i] = fields[i].Trim();
                }
            }

            //Parse any "stopWords" attribute
            //TODO MoreLikeThis needs to ideally have per-field stopWords lists - until then
            //I use all analyzers/fields to generate multi-field compatible stop list
            string        stopWords    = e.GetAttribute("stopWords");
            ISet <string> stopWordsSet = null;

            if ((stopWords != null) && (fields != null))
            {
                stopWordsSet = new JCG.HashSet <string>();
                foreach (string field in fields)
                {
                    TokenStream ts = null;
                    try
                    {
                        ts = analyzer.GetTokenStream(field, stopWords);
                        ICharTermAttribute termAtt = ts.AddAttribute <ICharTermAttribute>();
                        ts.Reset();
                        while (ts.IncrementToken())
                        {
                            stopWordsSet.Add(termAtt.ToString());
                        }
                        ts.End();
                    }
                    catch (IOException ioe)
                    {
                        throw new ParserException("IoException parsing stop words list in "
                                                  + GetType().Name + ":" + ioe.Message);
                    }
                    finally
                    {
                        IOUtils.DisposeWhileHandlingException(ts);
                    }
                }
            }

            MoreLikeThisQuery mlt = new MoreLikeThisQuery(DOMUtils.GetText(e), fields, analyzer, fields[0]);

            mlt.MaxQueryTerms       = DOMUtils.GetAttribute(e, "maxQueryTerms", DEFAULT_MAX_QUERY_TERMS);
            mlt.MinTermFrequency    = DOMUtils.GetAttribute(e, "minTermFrequency", DEFAULT_MIN_TERM_FREQUENCY);
            mlt.PercentTermsToMatch = DOMUtils.GetAttribute(e, "percentTermsToMatch", DEFAULT_PERCENT_TERMS_TO_MATCH) / 100;
            mlt.StopWords           = stopWordsSet;
            int minDocFreq = DOMUtils.GetAttribute(e, "minDocFreq", -1);

            if (minDocFreq >= 0)
            {
                mlt.MinDocFreq = minDocFreq;
            }

            mlt.Boost = DOMUtils.GetAttribute(e, "boost", 1.0f);

            return(mlt);
        }
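Added usage sketch (illustrative): an XML element this builder can consume. The tag name depends on how the builder is registered with the XML query parser (the Java counterpart registers it as "LikeThisQuery"); the attribute names match those read above.
        // <LikeThisQuery fieldNames="title,body" maxQueryTerms="25" minTermFrequency="2"
        //                percentTermsToMatch="30" stopWords="a an the" boost="1.0">
        //   text to find similar documents to
        // </LikeThisQuery>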
Example #26
        private void AddTerms(IndexReader reader, FieldVals f)
        {
            if (f.queryString == null)
            {
                return;
            }
            TokenStream   ts      = analyzer.TokenStream(f.fieldName, new System.IO.StringReader(f.queryString));
            TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));

            int       corpusNumDocs            = reader.NumDocs();
            Term      internSavingTemplateTerm = new Term(f.fieldName); //optimization to avoid constructing new Term() objects
            Hashtable processedTerms           = new Hashtable();

            while (ts.IncrementToken())
            {
                String term = termAtt.Term();
                if (!processedTerms.Contains(term))
                {
                    processedTerms.Add(term, term);
                    ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
                    float          minScore  = 0;
                    Term           startTerm = internSavingTemplateTerm.CreateTerm(term);
                    FuzzyTermEnum  fe        = new FuzzyTermEnum(reader, startTerm, f.minSimilarity, f.prefixLength);
                    TermEnum       origEnum  = reader.Terms(startTerm);
                    int            df        = 0;
                    if (startTerm.Equals(origEnum.Term()))
                    {
                        df = origEnum.DocFreq(); //store the df so all variants use same idf
                    }
                    int numVariants          = 0;
                    int totalVariantDocFreqs = 0;
                    do
                    {
                        Term possibleMatch = fe.Term();
                        if (possibleMatch != null)
                        {
                            numVariants++;
                            totalVariantDocFreqs += fe.DocFreq();
                            float score = fe.Difference();
                            if (variantsQ.Size() < MAX_VARIANTS_PER_TERM || score > minScore)
                            {
                                ScoreTerm st = new ScoreTerm(possibleMatch, score, startTerm);
                                variantsQ.Insert(st);
                                minScore = ((ScoreTerm)variantsQ.Top()).score; // maintain minScore
                            }
                        }
                    } while (fe.Next());
                    if (numVariants > 0)
                    {
                        int avgDf = totalVariantDocFreqs / numVariants;
                        if (df == 0)    //no direct match we can use as df for all variants
                        {
                            df = avgDf; //use avg df of all variants
                        }

                        // take the top variants (scored by edit distance) and reset the score
                        // to include an IDF factor then add to the global queue for ranking
                        // overall top query terms
                        int size = variantsQ.Size();
                        for (int i = 0; i < size; i++)
                        {
                            ScoreTerm st = (ScoreTerm)variantsQ.Pop();
                            st.score = (st.score * st.score) * sim.Idf(df, corpusNumDocs);
                            q.Insert(st);
                        }
                    }
                }
            }
        }
Example #27
        /// <summary>
        /// Creates a shingle filter based on a user defined matrix.
        /// 
        /// The filter /will/ delete columns from the input matrix! You will not be able to reset the filter if you used this constructor.
        /// todo: don't touch the matrix! use a bool, set the input stream to null or something, and keep track of where in the matrix we are at.
        /// 
        /// </summary>
        /// <param name="matrix">the input based for creating shingles. Does not need to contain any information until ShingleMatrixFilter.IncrementToken() is called the first time.</param>
        /// <param name="minimumShingleSize">minimum number of tokens in any shingle.</param>
        /// <param name="maximumShingleSize">maximum number of tokens in any shingle.</param>
        /// <param name="spacerCharacter">character to use between texts of the token parts in a shingle. null for none.</param>
        /// <param name="ignoringSinglePrefixOrSuffixShingle">if true, shingles that only contains permutation of the first of the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.</param>
        /// <param name="settingsCodec">codec used to read input token weight and matrix positioning.</param>
        public ShingleMatrixFilter(Matrix.Matrix matrix, int minimumShingleSize, int maximumShingleSize, Char spacerCharacter, bool ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec)
        {
            Matrix = matrix;
            MinimumShingleSize = minimumShingleSize;
            MaximumShingleSize = maximumShingleSize;
            SpacerCharacter = spacerCharacter;
            IsIgnoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
            _settingsCodec = settingsCodec;

            // ReSharper disable DoNotCallOverridableMethodsInConstructor
            _termAtt = AddAttribute<ITermAttribute>();
            _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            _payloadAtt = AddAttribute<IPayloadAttribute>();
            _offsetAtt = AddAttribute<IOffsetAttribute>();
            _typeAtt = AddAttribute<ITypeAttribute>();
            _flagsAtt = AddAttribute<IFlagsAttribute>();
            // ReSharper restore DoNotCallOverridableMethodsInConstructor

            // set the input to be an empty token stream, we already have the data.
            _input = new EmptyTokenStream();

            _inTermAtt = _input.AddAttribute<ITermAttribute>();
            _inPosIncrAtt = _input.AddAttribute<IPositionIncrementAttribute>();
            _inPayloadAtt = _input.AddAttribute<IPayloadAttribute>();
            _inOffsetAtt = _input.AddAttribute<IOffsetAttribute>();
            _inTypeAtt = _input.AddAttribute<ITypeAttribute>();
            _inFlagsAtt = _input.AddAttribute<IFlagsAttribute>();
        }
Example #28
 public CyrilicToLatinFilter(TokenStream input) : base(input)
 {
     termAttribute = (CharTermAttribute)input.AddAttribute <ICharTermAttribute>();
 }
Example #29
 /// <summary>
 /// <seealso cref="IFragmenter.Start(string, TokenStream)"/>
 /// </summary>
 public virtual void Start(string originalText, TokenStream stream)
 {
     offsetAtt       = stream.AddAttribute <IOffsetAttribute>();
     currentNumFrags = 1;
 }
Example #30
        /// <summary>
        /// Iterates over the given token stream and adds the resulting terms to the index;
        /// Equivalent to adding a tokenized, indexed, termVectorStored, unstored,
        /// Lucene <see cref="Documents.Field"/>.
        /// Finally closes the token stream. Note that untokenized keywords can be added with this method via
        /// <see cref="T:KeywordTokenStream{T}(ICollection{T})"/>, the Lucene <c>KeywordTokenizer</c> or similar utilities.
        ///
        /// </summary>
        /// <param name="fieldName"> a name to be associated with the text </param>
        /// <param name="stream"> the token stream to retrieve tokens from. </param>
        /// <param name="boost"> the boost factor for hits for this field </param>
        /// <param name="positionIncrementGap"> the position increment gap if fields with the same name are added more than once </param>
        /// <param name="offsetGap"> the offset gap if fields with the same name are added more than once </param>
        /// <seealso cref="Documents.Field.Boost"/>
        public virtual void AddField(string fieldName, TokenStream stream, float boost, int positionIncrementGap, int offsetGap)
        {
            try
            {
                if (fieldName == null)
                {
                    throw new ArgumentException("fieldName must not be null");
                }
                if (stream == null)
                {
                    throw new ArgumentException("token stream must not be null");
                }
                if (boost <= 0.0f)
                {
                    throw new ArgumentException("boost factor must be greater than 0.0");
                }
                int                 numTokens        = 0;
                int                 numOverlapTokens = 0;
                int                 pos = -1;
                BytesRefHash        terms;
                SliceByteStartArray sliceArray;
                long                sumTotalTermFreq = 0;
                int                 offset           = 0;
                if (fields.TryGetValue(fieldName, out Info info))
                {
                    numTokens        = info.numTokens;
                    numOverlapTokens = info.numOverlapTokens;
                    pos              = info.lastPosition + positionIncrementGap;
                    offset           = info.lastOffset + offsetGap;
                    terms            = info.terms;
                    boost           *= info.boost;
                    sliceArray       = info.sliceArray;
                    sumTotalTermFreq = info.sumTotalTermFreq;
                }
                else
                {
                    sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY);
                    terms      = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray);
                }

                if (!fieldInfos.ContainsKey(fieldName))
                {
                    fieldInfos[fieldName] = new FieldInfo(fieldName,
                                                          true,
                                                          fieldInfos.Count,
                                                          false,
                                                          false,
                                                          false,
                                                          this.storeOffsets ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
                                                          DocValuesType.NONE,
                                                          DocValuesType.NONE,
                                                          null);
                }
                ITermToBytesRefAttribute    termAtt          = stream.GetAttribute <ITermToBytesRefAttribute>();
                IPositionIncrementAttribute posIncrAttribute = stream.AddAttribute <IPositionIncrementAttribute>();
                IOffsetAttribute            offsetAtt        = stream.AddAttribute <IOffsetAttribute>();
                BytesRef @ref = termAtt.BytesRef;
                stream.Reset();

                while (stream.IncrementToken())
                {
                    termAtt.FillBytesRef();
                    //        if (DEBUG) System.err.println("token='" + term + "'");
                    numTokens++;
                    int posIncr = posIncrAttribute.PositionIncrement;
                    if (posIncr == 0)
                    {
                        numOverlapTokens++;
                    }
                    pos += posIncr;
                    int ord = terms.Add(@ref);
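                    // negative ord: the term already exists in the hash, so decode its
                    // slot and append this posting to the term's existing slice;
                    // otherwise start a new postings slice for the new term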
                    if (ord < 0)
                    {
                        ord = (-ord) - 1;
                        postingsWriter.Reset(sliceArray.end[ord]);
                    }
                    else
                    {
                        sliceArray.start[ord] = postingsWriter.StartNewSlice();
                    }
                    sliceArray.freq[ord]++;
                    sumTotalTermFreq++;
                    if (!storeOffsets)
                    {
                        postingsWriter.WriteInt32(pos);
                    }
                    else
                    {
                        postingsWriter.WriteInt32(pos);
                        postingsWriter.WriteInt32(offsetAtt.StartOffset + offset);
                        postingsWriter.WriteInt32(offsetAtt.EndOffset + offset);
                    }
                    sliceArray.end[ord] = postingsWriter.CurrentOffset;
                }
                stream.End();

                // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
                if (numTokens > 0)
                {
                    fields[fieldName] = new Info(terms, sliceArray, numTokens, numOverlapTokens, boost, pos, offsetAtt.EndOffset + offset, sumTotalTermFreq);
                    sortedFields      = null; // invalidate sorted view, if any
                }
            }
            catch (Exception e)
            {
                // can never happen
                throw new Exception(e.ToString(), e);
            }
            finally
            {
                try
                {
                    if (stream != null)
                    {
                        stream.Dispose();
                    }
                }
                catch (IOException e2)
                {
                    throw new Exception(e2.ToString(), e2);
                }
            }
        }
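
A hedged usage sketch for the overload above, assuming the Lucene.Net 4.8 MemoryIndex API; the field name, text, and query are illustrative:

        var analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);
        var index    = new MemoryIndex();

        // 1.0f / 0 / 1 mirror the defaults of the simpler AddField overloads
        index.AddField("content", analyzer.GetTokenStream("content", "quick brown fox"), 1.0f, 0, 1);

        var parser = new QueryParser(LuceneVersion.LUCENE_48, "content", analyzer);
        float score = index.Search(parser.Parse("fox")); // > 0 when the in-memory document matches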
Example #31
        /// <summary>
        /// Low level api to get the most relevant (formatted) sections of the document.
        /// This method has been made public to allow visibility of score information held in TextFragment objects.
        /// Thanks to Jason Calabrese for help in redefining the interface.
        /// </summary>
        public TextFragment[] GetBestTextFragments(
            TokenStream tokenStream,
            String text,
            bool mergeContiguousFragments,
            int maxNumFragments)
        {
            var docFrags = new List <TextFragment>();
            var newText  = new StringBuilder();

            var termAtt   = tokenStream.AddAttribute <ITermAttribute>();
            var offsetAtt = tokenStream.AddAttribute <IOffsetAttribute>();

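            // return value intentionally unused: this only guarantees the stream
            // exposes a position increment attribute before Reset() is called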
            tokenStream.AddAttribute <IPositionIncrementAttribute>();
            tokenStream.Reset();

            var currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
            var newStream   = _fragmentScorer.Init(tokenStream);

            if (newStream != null)
            {
                tokenStream = newStream;
            }
            _fragmentScorer.StartFragment(currentFrag);
            docFrags.Add(currentFrag);

            var fragQueue = new FragmentQueue(maxNumFragments);

            try
            {
                String tokenText;
                int    startOffset;
                int    endOffset;
                int    lastEndOffset = 0;
                _textFragmenter.Start(text, tokenStream);

                var tokenGroup = new TokenGroup(tokenStream);

                for (bool next = tokenStream.IncrementToken();
                     next && (offsetAtt.StartOffset < _maxDocCharsToAnalyze);
                     next = tokenStream.IncrementToken())
                {
                    if ((offsetAtt.EndOffset > text.Length)
                        ||
                        (offsetAtt.StartOffset > text.Length)
                        )
                    {
                        throw new InvalidTokenOffsetsException("Token " + termAtt.Term
                                                               + " exceeds length of provided text sized " + text.Length);
                    }
                    if ((tokenGroup.NumTokens > 0) && (tokenGroup.IsDistinct()))
                    {
                        //the current token is distinct from previous tokens -
                        // markup the cached token group info
                        startOffset = tokenGroup.MatchStartOffset;
                        endOffset   = tokenGroup.MatchEndOffset;
                        tokenText   = text.Substring(startOffset, endOffset - startOffset);
                        String markedUpText = _formatter.HighlightTerm(_encoder.EncodeText(tokenText), tokenGroup);
                        //store any whitespace etc from between this and last group
                        if (startOffset > lastEndOffset)
                        {
                            newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset, startOffset - lastEndOffset)));
                        }
                        newText.Append(markedUpText);
                        lastEndOffset = Math.Max(endOffset, lastEndOffset);
                        tokenGroup.Clear();

                        //check if current token marks the start of a new fragment
                        if (_textFragmenter.IsNewFragment())
                        {
                            currentFrag.Score = _fragmentScorer.FragmentScore;
                            //record stats for a new fragment
                            currentFrag.TextEndPos = newText.Length;
                            currentFrag            = new TextFragment(newText, newText.Length, docFrags.Count);
                            _fragmentScorer.StartFragment(currentFrag);
                            docFrags.Add(currentFrag);
                        }
                    }

                    tokenGroup.AddToken(_fragmentScorer.GetTokenScore());

                    //                if(lastEndOffset>maxDocBytesToAnalyze)
                    //                {
                    //                    break;
                    //                }
                }
                currentFrag.Score = _fragmentScorer.FragmentScore;

                if (tokenGroup.NumTokens > 0)
                {
                    //flush the accumulated text (same code as in above loop)
                    startOffset = tokenGroup.MatchStartOffset;
                    endOffset   = tokenGroup.MatchEndOffset;
                    tokenText   = text.Substring(startOffset, endOffset - startOffset);
                    var markedUpText = _formatter.HighlightTerm(_encoder.EncodeText(tokenText), tokenGroup);
                    //store any whitespace etc from between this and last group
                    if (startOffset > lastEndOffset)
                    {
                        newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset, startOffset - lastEndOffset)));
                    }
                    newText.Append(markedUpText);
                    lastEndOffset = Math.Max(lastEndOffset, endOffset);
                }

                //Test what remains of the original text beyond the point where we stopped analyzing
                if (
                    //                    if there is text beyond the last token considered..
                    (lastEndOffset < text.Length)
                    &&
                    //                    and that text is not too large...
                    (text.Length <= _maxDocCharsToAnalyze)
                    )
                {
                    //append it to the last fragment
                    newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset)));
                }

                currentFrag.TextEndPos = newText.Length;

                //sort the most relevant sections of the text
                foreach (var f in docFrags)
                {
                    currentFrag = f;

                    //If you are running with a version of Lucene before 11th Sept 03
                    // you do not have PriorityQueue.insert() - so uncomment the code below

                    /*
                     *                  if (currentFrag.getScore() >= minScore)
                     *                  {
                     *                      fragQueue.put(currentFrag);
                     *                      if (fragQueue.size() > maxNumFragments)
                     *                      { // if hit queue overfull
                     *                          fragQueue.pop(); // remove lowest in hit queue
                     *                          minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
                     *                      }
                     *
                     *
                     *                  }
                     */
                    //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
                    //fix to PriorityQueue. The correct method to use here is the new "insert" method
                    // USE ABOVE CODE IF THIS DOES NOT COMPILE!
                    fragQueue.InsertWithOverflow(currentFrag);
                }

                //return the most relevant fragments
                var frag = new TextFragment[fragQueue.Size()];
                for (int i = frag.Length - 1; i >= 0; i--)
                {
                    frag[i] = fragQueue.Pop();
                }

                //merge any contiguous fragments to improve readability
                if (mergeContiguousFragments)
                {
                    MergeContiguousFragments(frag);
                    frag = frag.Where(t => (t != null) && (t.Score > 0)).ToArray();
                }

                return(frag);
            }
            finally
            {
                if (tokenStream != null)
                {
                    try
                    {
                        tokenStream.Close();
                    }
                    catch (Exception)
                    {
                    }
                }
            }
        }
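
A hedged usage sketch: the higher-level GetBestFragments overloads funnel into GetBestTextFragments above. query, analyzer, and text are assumed to exist already:

        var highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query));
        TokenStream tokenStream = analyzer.GetTokenStream("content", new System.IO.StringReader(text));

        TextFragment[] frags = highlighter.GetBestTextFragments(tokenStream, text, true, 3);
        foreach (TextFragment frag in frags)
        {
            if (frag != null && frag.Score > 0)
            {
                Console.WriteLine(frag.ToString()); // the marked-up fragment text
            }
        }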
Example #32
 public TokenGroup(TokenStream tokenStream)
 {
     offsetAtt = tokenStream.AddAttribute <IOffsetAttribute>();
     termAtt   = tokenStream.AddAttribute <ICharTermAttribute>();
 }
Example #33
        /// <summary>
        /// Retrieve suggestions.
        /// </summary>
        public virtual IList <LookupResult> DoLookup(string key, IEnumerable <BytesRef> contexts, int num)
        {
            if (contexts != null)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }

            TokenStream ts = queryAnalyzer.GetTokenStream("", key.ToString());

            try
            {
                ITermToBytesRefAttribute    termBytesAtt = ts.AddAttribute <ITermToBytesRefAttribute>();
                IOffsetAttribute            offsetAtt    = ts.AddAttribute <IOffsetAttribute>();
                IPositionLengthAttribute    posLenAtt    = ts.AddAttribute <IPositionLengthAttribute>();
                IPositionIncrementAttribute posIncAtt    = ts.AddAttribute <IPositionIncrementAttribute>();
                ts.Reset();

                var lastTokens = new BytesRef[grams];
                //System.out.println("lookup: key='" + key + "'");

                // Run full analysis, but save only the
                // last 1gram, last 2gram, etc.:
                BytesRef tokenBytes   = termBytesAtt.BytesRef;
                int      maxEndOffset = -1;
                bool     sawRealToken = false;
                while (ts.IncrementToken())
                {
                    termBytesAtt.FillBytesRef();
                    sawRealToken |= tokenBytes.Length > 0;
                    // TODO: this is somewhat iffy; today, ShingleFilter
                    // sets posLen to the gram count; maybe we should make
                    // a separate dedicated att for this?
                    int gramCount = posLenAtt.PositionLength;

                    Debug.Assert(gramCount <= grams);

                    // Safety: make sure the recalculated count "agrees":
                    if (CountGrams(tokenBytes) != gramCount)
                    {
                        throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + CountGrams(tokenBytes));
                    }
                    maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset);
                    lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
                }
                ts.End();

                if (!sawRealToken)
                {
                    throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
                }

                // Carefully fill last tokens with _ tokens;
                // ShingleFilter apparently won't emit "only hole"
                // tokens:
                int endPosInc = posIncAtt.PositionIncrement;

                // Note this will also be true if input is the empty
                // string (in which case we saw no tokens and
                // maxEndOffset is still -1), which in fact works out OK
                // because we fill the unigram with an empty BytesRef
                // below:
                bool lastTokenEnded = offsetAtt.EndOffset > maxEndOffset || endPosInc > 0;
                //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.EndOffset);

                if (lastTokenEnded)
                {
                    //System.out.println("  lastTokenEnded");
                    // If user hit space after the last token, then
                    // "upgrade" all tokens.  This way "foo " will suggest
                    // all bigrams starting w/ foo, and not any unigrams
                    // starting with "foo":
                    for (int i = grams - 1; i > 0; i--)
                    {
                        BytesRef token = lastTokens[i - 1];
                        if (token == null)
                        {
                            continue;
                        }
                        token.Grow(token.Length + 1);
                        token.Bytes[token.Length] = separator;
                        token.Length++;
                        lastTokens[i] = token;
                    }
                    lastTokens[0] = new BytesRef();
                }

                var arc = new FST.Arc <long?>();

                var bytesReader = fst.GetBytesReader();

                // Try highest order models first, and if they return
                // results, return that; else, fallback:
                double backoff = 1.0;

                List <LookupResult> results = new List <LookupResult>(num);

                // We only add a given suffix once, from the highest
                // order model that saw it; for subsequent lower order
                // models we skip it:
                var seen = new HashSet <BytesRef>();

                for (int gram = grams - 1; gram >= 0; gram--)
                {
                    BytesRef token = lastTokens[gram];
                    // Don't make unigram predictions from empty string:
                    if (token == null || (token.Length == 0 && key.Length > 0))
                    {
                        // Input didn't have enough tokens:
                        //System.out.println("  gram=" + gram + ": skip: not enough input");
                        continue;
                    }

                    if (endPosInc > 0 && gram <= endPosInc)
                    {
                        // Skip hole-only predictions; in theory we
                        // shouldn't have to do this, but we'd need to fix
                        // ShingleFilter to produce only-hole tokens:
                        //System.out.println("  break: only holes now");
                        break;
                    }

                    //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

                    // TODO: we could add fuzziness here
                    // match the prefix portion exactly
                    //Pair<Long,BytesRef> prefixOutput = null;
                    long? prefixOutput = null;
                    try
                    {
                        prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
                    }
                    catch (IOException bogus)
                    {
                        throw new Exception(bogus.ToString(), bogus);
                    }
                    //System.out.println("  prefixOutput=" + prefixOutput);

                    if (prefixOutput == null)
                    {
                        // This model never saw this prefix, e.g. the
                        // trigram model never saw context "purple mushroom"
                        backoff *= ALPHA;
                        continue;
                    }

                    // TODO: we could do this division at build time, and
                    // bake it into the FST?

                    // Denominator for computing scores from current
                    // model's predictions:
                    long contextCount = totTokens;

                    BytesRef lastTokenFragment = null;

                    for (int i = token.Length - 1; i >= 0; i--)
                    {
                        if (token.Bytes[token.Offset + i] == separator)
                        {
                            BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
                            long?    output  = Lucene.Net.Util.Fst.Util.Get(fst, Lucene.Net.Util.Fst.Util.ToInt32sRef(context, new Int32sRef()));
                            Debug.Assert(output != null);
                            contextCount      = DecodeWeight(output);
                            lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                            break;
                        }
                    }

                    BytesRef finalLastToken;

                    if (lastTokenFragment == null)
                    {
                        finalLastToken = BytesRef.DeepCopyOf(token);
                    }
                    else
                    {
                        finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
                    }
                    Debug.Assert(finalLastToken.Offset == 0);

                    CharsRef spare = new CharsRef();

                    // complete top-N
                    Util.Fst.Util.TopResults <long?> completions = null;
                    try
                    {
                        // Because we store multiple models in one FST
                        // (1gram, 2gram, 3gram), we must restrict the
                        // search so that it only considers the current
                        // model.  For highest order model, this is not
                        // necessary since all completions in the FST
                        // must be from this model, but for lower order
                        // models we have to filter out the higher order
                        // ones:

                        // Must do num+seen.size() for queue depth because we may
                        // reject up to seen.size() paths in acceptResult():
                        Util.Fst.Util.TopNSearcher <long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparer, seen, finalLastToken);

                        // since this search is initialized with a single start node
                        // it is okay to start with an empty input path here
                        searcher.AddStartPaths(arc, prefixOutput, true, new Int32sRef());

                        completions = searcher.Search();
                        Debug.Assert(completions.IsComplete);
                    }
                    catch (IOException bogus)
                    {
                        throw new Exception(bogus.ToString(), bogus);
                    }

                    int prefixLength = token.Length;

                    BytesRef suffix = new BytesRef(8);
                    //System.out.println("    " + completions.length + " completions");

                    foreach (Util.Fst.Util.Result <long?> completion in completions)
                    {
                        token.Length = prefixLength;
                        // append suffix
                        Util.Fst.Util.ToBytesRef(completion.Input, suffix);
                        token.Append(suffix);

                        //System.out.println("    completion " + token.utf8ToString());

                        // Skip this path if a higher-order model already
                        // saw/predicted its last token:
                        BytesRef lastToken = token;
                        for (int i = token.Length - 1; i >= 0; i--)
                        {
                            if (token.Bytes[token.Offset + i] == separator)
                            {
                                Debug.Assert(token.Length - i - 1 > 0);
                                lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
                                break;
                            }
                        }
                        if (seen.Contains(lastToken))
                        {
                            //System.out.println("      skip dup " + lastToken.utf8ToString());
                            goto nextCompletionContinue;
                        }
                        seen.Add(BytesRef.DeepCopyOf(lastToken));
                        spare.Grow(token.Length);
                        UnicodeUtil.UTF8toUTF16(token, spare);
                        LookupResult result = new LookupResult(spare.ToString(),
                                                               // LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
                                                               // return numbers that are greater than long.MaxValue, which results in a negative long number.
                                                               (long)(long.MaxValue * (decimal)backoff * ((decimal)DecodeWeight(completion.Output)) / contextCount));
                        results.Add(result);
                        Debug.Assert(results.Count == seen.Count);
                        //System.out.println("  add result=" + result);
                        nextCompletionContinue: ;
                    }
                    backoff *= ALPHA;
                }

                results.Sort(new ComparerAnonymousInnerClassHelper(this));

                if (results.Count > num)
                {
                    results.SubList(num, results.Count).Clear();
                }

                return(results);
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }
        }
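
A hedged usage sketch in the FreeTextSuggester style: build a 3-gram language model from an index field, then ask for completions. reader (an open IndexReader) and the field name are assumptions:

        var analyzer  = new StandardAnalyzer(LuceneVersion.LUCENE_48);
        var suggester = new FreeTextSuggester(analyzer, analyzer, 3);
        suggester.Build(new LuceneDictionary(reader, "body"));

        // contexts must be null, since this suggester rejects them (see above)
        foreach (Lookup.LookupResult result in suggester.DoLookup("quick bro", null, 5))
        {
            Console.WriteLine("{0} -> {1}", result.Key, result.Value); // suggestion and its score
        }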
Example #34
 /// <summary>
 /// <seealso cref="IScorer.Init(TokenStream)"/>
 /// </summary>
 public virtual TokenStream Init(TokenStream tokenStream)
 {
     termAtt = tokenStream.AddAttribute <ICharTermAttribute>();
     return(null);
 }
Example #35
        public static void AssertTokenStreamContents(TokenStream ts, System.String[] output, int[] startOffsets, int[] endOffsets, System.String[] types, int[] posIncrements, int? finalOffset)
        {
            Assert.IsNotNull(output);
            ICheckClearAttributesAttribute checkClearAtt = ts.AddAttribute <ICheckClearAttributesAttribute>();

            Assert.IsTrue(ts.HasAttribute <ITermAttribute>(), "has no TermAttribute");
            ITermAttribute termAtt = ts.GetAttribute <ITermAttribute>();

            IOffsetAttribute offsetAtt = null;

            if (startOffsets != null || endOffsets != null || finalOffset != null)
            {
                Assert.IsTrue(ts.HasAttribute <IOffsetAttribute>(), "has no OffsetAttribute");
                offsetAtt = ts.GetAttribute <IOffsetAttribute>();
            }

            ITypeAttribute typeAtt = null;

            if (types != null)
            {
                Assert.IsTrue(ts.HasAttribute <ITypeAttribute>(), "has no TypeAttribute");
                typeAtt = ts.GetAttribute <ITypeAttribute>();
            }

            IPositionIncrementAttribute posIncrAtt = null;

            if (posIncrements != null)
            {
                Assert.IsTrue(ts.HasAttribute <IPositionIncrementAttribute>(), "has no PositionIncrementAttribute");
                posIncrAtt = ts.GetAttribute <IPositionIncrementAttribute>();
            }

            ts.Reset();
            for (int i = 0; i < output.Length; i++)
            {
                // extra safety to enforce, that the state is not preserved and also assign bogus values
                ts.ClearAttributes();
                termAtt.SetTermBuffer("bogusTerm");
                if (offsetAtt != null)
                {
                    offsetAtt.SetOffset(14584724, 24683243);
                }
                if (typeAtt != null)
                {
                    typeAtt.Type = "bogusType";
                }
                if (posIncrAtt != null)
                {
                    posIncrAtt.PositionIncrement = 45987657;
                }

                checkClearAtt.GetAndResetClearCalled(); // reset it, because we called clearAttribute() before
                Assert.IsTrue(ts.IncrementToken(), "token " + i + " does not exist");
                Assert.IsTrue(checkClearAtt.GetAndResetClearCalled(), "clearAttributes() was not called correctly in TokenStream chain");

                Assert.AreEqual(output[i], termAtt.Term, "term " + i);
                if (startOffsets != null)
                {
                    Assert.AreEqual(startOffsets[i], offsetAtt.StartOffset, "startOffset " + i);
                }
                if (endOffsets != null)
                {
                    Assert.AreEqual(endOffsets[i], offsetAtt.EndOffset, "endOffset " + i);
                }
                if (types != null)
                {
                    Assert.AreEqual(types[i], typeAtt.Type, "type " + i);
                }
                if (posIncrements != null)
                {
                    Assert.AreEqual(posIncrements[i], posIncrAtt.PositionIncrement, "posIncrement " + i);
                }
            }
            Assert.IsFalse(ts.IncrementToken(), "end of stream");
            ts.End();
            if (finalOffset.HasValue)
            {
                Assert.AreEqual(finalOffset.Value, offsetAtt.EndOffset, "finalOffset ");
            }
            ts.Close();
        }
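
A hedged usage sketch: asserting a whitespace tokenizer's terms, offsets, position increments, and final offset for the input "foo bar" (the expected values below are computed from that string):

        TokenStream ts = new WhitespaceTokenizer(new System.IO.StringReader("foo bar"));
        AssertTokenStreamContents(ts,
            new System.String[] { "foo", "bar" },  // expected terms
            new int[] { 0, 4 },                    // start offsets
            new int[] { 3, 7 },                    // end offsets
            null,                                  // token types not checked
            new int[] { 1, 1 },                    // position increments
            7);                                    // final offset == input length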
Example #36
        private void AddTerms(IndexReader reader, FieldVals f)
        {
            if (f.queryString is null)
            {
                return;
            }
            Terms terms = MultiFields.GetTerms(reader, f.fieldName);

            if (terms is null)
            {
                return;
            }
            TokenStream ts = analyzer.GetTokenStream(f.fieldName, f.queryString);

            try
            {
                ICharTermAttribute termAtt = ts.AddAttribute <ICharTermAttribute>();

                int           corpusNumDocs  = reader.NumDocs;
                ISet <string> processedTerms = new JCG.HashSet <string>();
                ts.Reset();
                while (ts.IncrementToken())
                {
                    string term = termAtt.ToString();
                    if (!processedTerms.Contains(term))
                    {
                        processedTerms.Add(term);
                        ScoreTermQueue  variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
                        float           minScore  = 0;
                        Term            startTerm = new Term(f.fieldName, term);
                        AttributeSource atts      = new AttributeSource();
                        IMaxNonCompetitiveBoostAttribute maxBoostAtt =
                            atts.AddAttribute <IMaxNonCompetitiveBoostAttribute>();
#pragma warning disable 612, 618
                        SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength);
#pragma warning restore 612, 618
                        //store the df so all variants use same idf
                        int             df                   = reader.DocFreq(startTerm);
                        int             numVariants          = 0;
                        int             totalVariantDocFreqs = 0;
                        BytesRef        possibleMatch;
                        IBoostAttribute boostAtt =
                            fe.Attributes.AddAttribute <IBoostAttribute>();
                        while (fe.MoveNext())
                        {
                            possibleMatch = fe.Term;
                            numVariants++;
                            totalVariantDocFreqs += fe.DocFreq;
                            float score = boostAtt.Boost;
                            if (variantsQ.Count < MAX_VARIANTS_PER_TERM || score > minScore)
                            {
                                ScoreTerm st = new ScoreTerm(new Term(startTerm.Field, BytesRef.DeepCopyOf(possibleMatch)), score, startTerm);
                                variantsQ.InsertWithOverflow(st);
                                minScore = variantsQ.Top.Score; // maintain minScore
                            }
                            maxBoostAtt.MaxNonCompetitiveBoost = variantsQ.Count >= MAX_VARIANTS_PER_TERM ? minScore : float.NegativeInfinity;
                        }

                        if (numVariants > 0)
                        {
                            int avgDf = totalVariantDocFreqs / numVariants;
                            if (df == 0)    //no direct match we can use as df for all variants
                            {
                                df = avgDf; //use avg df of all variants
                            }

                            // take the top variants (scored by edit distance) and reset the score
                            // to include an IDF factor then add to the global queue for ranking
                            // overall top query terms
                            int size = variantsQ.Count;
                            for (int i = 0; i < size; i++)
                            {
                                ScoreTerm st = variantsQ.Pop();
                                st.Score = (st.Score * st.Score) * sim.Idf(df, corpusNumDocs);
                                q.InsertWithOverflow(st);
                            }
                        }
                    }
                }
                ts.End();
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }
        }
        /// <summary>
        /// TODO: rewrite tests not to use string comparison.
        /// </summary>
        private static string tsToString(TokenStream @in)
        {
            StringBuilder @out = new StringBuilder();
            ICharTermAttribute termAtt = @in.AddAttribute<ICharTermAttribute>();
            // extra safety to enforce, that the state is not preserved and also
            // assign bogus values
            @in.ClearAttributes();
            termAtt.SetEmpty().Append("bogusTerm");
            @in.Reset();
            while (@in.IncrementToken())
            {
                if (@out.Length > 0)
                {
                    @out.Append(' ');
                }
                @out.Append(termAtt.ToString());
                @in.ClearAttributes();
                termAtt.SetEmpty().Append("bogusTerm");
            }

            @in.Dispose();
            return @out.ToString();
        }
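
A hedged usage sketch of how tests typically use tsToString, comparing two analysis chains by their flattened output (the very practice the TODO above wants to retire); analyzerA and analyzerB are assumed:

        string expected = tsToString(analyzerA.GetTokenStream("field", "some input text"));
        string actual   = tsToString(analyzerB.GetTokenStream("field", "some input text"));
        Assert.AreEqual(expected, actual);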