ExtractTerms() public method

Expert: adds all terms occurring in this query to the terms set. Only works if this query is in its rewritten form (see Rewrite).
Throws NotSupportedException if this query is not yet rewritten.
public ExtractTerms ( ISet<Term> terms ) : void
terms ISet<Term> The set to which the query's terms are added.
return void
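
A minimal usage sketch (an illustration only, not library documentation): it assumes the Lucene.Net 3.x-style API used in the examples below and an already-open IndexReader named reader. Rewrite first, then extract:

    // Sketch: WildcardQuery is a MultiTermQuery and does not override
    // ExtractTerms, so calling it before Rewrite throws NotSupportedException.
    Query query = new WildcardQuery(new Term("name", "smi*"));
    Query rewritten = query.Rewrite(reader);   // expands into primitive term queries
    ISet<Term> terms = Lucene.Net.Support.Compatibility.SetFactory.CreateHashSet<Term>();
    rewritten.ExtractTerms(terms);             // safe now: every term is concrete
    foreach (Term t in terms)
    {
        Console.WriteLine(t);                  // prints field:text pairs, e.g. name:smith
    }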
Example #1
        /// <summary> Create weight in multiple index scenario.
        ///
        /// Distributed query processing is done in the following steps:
        /// 1. rewrite query
        /// 2. extract necessary terms
        /// 3. collect dfs for these terms from the Searchables
        /// 4. create query weight using aggregate dfs.
        /// 5. distribute that weight to Searchables
        /// 6. merge results
        ///
        /// Steps 1-4 are done here, 5+6 in the search() methods
        ///
        /// </summary>
        /// <returns> rewritten queries
        /// </returns>
        public /*protected internal*/ override Weight CreateWeight(Query original, IState state)
        {
            // step 1
            Query rewrittenQuery = Rewrite(original, state);

            // step 2
            ISet<Term> terms = Lucene.Net.Support.Compatibility.SetFactory.CreateHashSet<Term>();

            rewrittenQuery.ExtractTerms(terms);

            // step 3
            Term[] allTermsArray = terms.ToArray();
            int[]  aggregatedDfs = new int[terms.Count];
            for (int i = 0; i < searchables.Length; i++)
            {
                int[] dfs = searchables[i].DocFreqs(allTermsArray, state);
                for (int j = 0; j < aggregatedDfs.Length; j++)
                {
                    aggregatedDfs[j] += dfs[j];
                }
            }

            var dfMap = new Dictionary<Term, int>();

            for (int i = 0; i < allTermsArray.Length; i++)
            {
                dfMap[allTermsArray[i]] = aggregatedDfs[i];
            }

            // step 4
            int            numDocs  = MaxDoc;
            CachedDfSource cacheSim = new CachedDfSource(dfMap, numDocs, Similarity);

            return rewrittenQuery.Weight(cacheSim, state);
        }
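
The doc comment above covers steps 1-4 only; steps 5 and 6 happen in the search() methods, which this page does not show. A rough sketch of their shape (a hypothetical SearchAllShards helper, not the actual MultiSearcher source; the searchables and starts fields, holding the sub-searchers and their document-id offsets, and the IState-taking Search overload are assumptions):

        public TopDocs SearchAllShards(Weight weight, Filter filter, int nDocs, IState state)
        {
            var merged = new List<ScoreDoc>();
            int totalHits = 0;

            for (int i = 0; i < searchables.Length; i++)
            {
                // step 5: every index scores against the same aggregate-df weight
                TopDocs docs = searchables[i].Search(weight, filter, nDocs, state);
                totalHits += docs.TotalHits;
                foreach (ScoreDoc sd in docs.ScoreDocs)
                {
                    sd.Doc += starts[i]; // shift into the global doc-id space
                    merged.Add(sd);
                }
            }

            // step 6: keep the best nDocs across all sub-indexes
            merged.Sort((a, b) => b.Score.CompareTo(a.Score));
            ScoreDoc[] top = merged.Take(nDocs).ToArray(); // needs System.Linq
            float maxScore = top.Length == 0 ? float.NegativeInfinity : top[0].Score;

            return new TopDocs(totalHits, top, maxScore);
        }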
 public override void ExtractTerms(ISet<Term> terms)
 {
     // TODO: OK to not add any terms when wrapped a filter
     // and used with MultiSearcher, but may not be OK for
     // highlighting.
     // If a query was wrapped, we delegate to query.
     if (m_query != null)
     {
         m_query.ExtractTerms(terms);
     }
 }
        public void TestNoMatchFirstWordBug()
        {
            FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);

            flt.AddTerms("fernando smith", "name", 0.3f, 1);
            Query     q          = flt.Rewrite(searcher.GetIndexReader());
            Hashtable queryTerms = new Hashtable();

            q.ExtractTerms(queryTerms);
            Assert.IsTrue(queryTerms.Contains(new Term("name", "smith")), "Should have variant smith");
            TopDocs topDocs = searcher.Search(flt, 1);

            ScoreDoc[] sd = topDocs.scoreDocs;
            Assert.IsTrue((sd != null) && (sd.Length > 0), "score docs must match 1 doc");
            Document doc = searcher.Doc(sd[0].doc);

            Assert.AreEqual("2", doc.Get("id"), "Should match most similar when using 2 words");
        }
        public void TestMultiWord()
        {
            FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);

            flt.AddTerms("jonathin smoth", "name", 0.3f, 1);
            Query       q          = flt.Rewrite(searcher.IndexReader);
            ISet<Term> queryTerms  = Support.Compatibility.SetFactory.CreateHashSet<Term>();

            q.ExtractTerms(queryTerms);
            Assert.IsTrue(queryTerms.Contains(new Term("name", "jonathan")), "Should have variant jonathan");
            Assert.IsTrue(queryTerms.Contains(new Term("name", "smith")), "Should have variant smith");
            TopDocs topDocs = searcher.Search(flt, 1);

            ScoreDoc[] sd = topDocs.ScoreDocs;
            Assert.IsTrue((sd != null) && (sd.Length > 0), "score docs must match 1 doc");
            Document doc = searcher.Doc(sd[0].Doc);

            Assert.AreEqual("2", doc.Get("id"), "Should match most similar when using 2 words");
        }
Example #5
        /// <summary> Create weight in multiple index scenario.
        ///
        /// Distributed query processing is done in the following steps:
        /// 1. rewrite query
        /// 2. extract necessary terms
        /// 3. collect dfs for these terms from the Searchables
        /// 4. create query weight using aggregate dfs.
        /// 5. distribute that weight to Searchables
        /// 6. merge results
        ///
        /// Steps 1-4 are done here, 5+6 in the search() methods
        ///
        /// </summary>
        /// <returns> rewritten queries
        /// </returns>
        public /*protected internal*/ override Weight CreateWeight(Query original)
        {
            // step 1
            Query rewrittenQuery = Rewrite(original);

            // step 2
            Support.Set<Lucene.Net.Index.Term> terms = new Support.Set<Term>();
            rewrittenQuery.ExtractTerms(terms);

            // step 3
            Term[] allTermsArray = new Term[terms.Count];
            int    index         = 0;

            foreach (Term t in terms)
            {
                allTermsArray[index++] = t;
            }

            int[] aggregatedDfs = new int[terms.Count];
            for (int i = 0; i < searchables.Length; i++)
            {
                int[] dfs = searchables[i].DocFreqs(allTermsArray);
                for (int j = 0; j < aggregatedDfs.Length; j++)
                {
                    aggregatedDfs[j] += dfs[j];
                }
            }

            IDictionary<Term, int> dfMap = new Support.Dictionary<Term, int>();

            for (int i = 0; i < allTermsArray.Length; i++)
            {
                dfMap[allTermsArray[i]] = aggregatedDfs[i];
            }

            // step 4
            int            numDocs  = MaxDoc();
            CachedDfSource cacheSim = new CachedDfSource(dfMap, numDocs, GetSimilarity());

            return rewrittenQuery.Weight(cacheSim);
        }
        /// <summary> Create weight in multiple index scenario.
        ///
        /// Distributed query processing is done in the following steps:
        /// 1. rewrite query
        /// 2. extract necessary terms
        /// 3. collect dfs for these terms from the Searchables
        /// 4. create query weight using aggregate dfs.
        /// 5. distribute that weight to Searchables
        /// 6. merge results
        ///
        /// Steps 1-4 are done here, 5+6 in the search() methods
        ///
        /// </summary>
        /// <returns> rewritten queries
        /// </returns>
        protected internal override Weight CreateWeight(Query original)
        {
            // step 1
            Query rewrittenQuery = Rewrite(original);

            // step 2
            System.Collections.Hashtable terms = new System.Collections.Hashtable();
            rewrittenQuery.ExtractTerms(terms);

            // step 3
            Term[] allTermsArray = new Term[terms.Count];
            int    index         = 0;

            System.Collections.IEnumerator e = terms.Keys.GetEnumerator();
            while (e.MoveNext())
            {
                allTermsArray[index++] = e.Current as Term;
            }
            int[] aggregatedDfs = new int[terms.Count];
            for (int i = 0; i < searchables.Length; i++)
            {
                int[] dfs = searchables[i].DocFreqs(allTermsArray);
                for (int j = 0; j < aggregatedDfs.Length; j++)
                {
                    aggregatedDfs[j] += dfs[j];
                }
            }

            System.Collections.Hashtable dfMap = new System.Collections.Hashtable();
            for (int i = 0; i < allTermsArray.Length; i++)
            {
                dfMap[allTermsArray[i]] = (System.Int32)aggregatedDfs[i];
            }

            // step 4
            int            numDocs  = MaxDoc();
            CachedDfSource cacheSim = new CachedDfSource(dfMap, numDocs);

            return rewrittenQuery.Weight(cacheSim);
        }
Example #7
                public override Query Rewrite(Query original)
                {
                    Query       rewritten = base.Rewrite(original);
                    ISet<Term>  terms     = new JCG.HashSet<Term>();

                    rewritten.ExtractTerms(terms);

                    // Make a single request to remote nodes for term
                    // stats:
                    for (int nodeID = 0; nodeID < nodeVersions.Length; nodeID++)
                    {
                        if (nodeID == MyNodeID)
                        {
                            continue;
                        }

                        ISet<Term> missing = new JCG.HashSet<Term>();
                        foreach (Term term in terms)
                        {
                            TermAndShardVersion key = new TermAndShardVersion(nodeID, nodeVersions[nodeID], term);
                            if (!outerInstance.termStatsCache.ContainsKey(key))
                            {
                                missing.Add(term);
                            }
                        }
                        if (missing.Count != 0)
                        {
                            foreach (KeyValuePair<Term, TermStatistics> ent in outerInstance.outerInstance.GetNodeTermStats(missing, nodeID, nodeVersions[nodeID]))
                            {
                                TermAndShardVersion key = new TermAndShardVersion(nodeID, nodeVersions[nodeID], ent.Key);
                                outerInstance.termStatsCache[key] = ent.Value;
                            }
                        }
                    }

                    return rewritten;
                }
Example #8
        public void flatten(Query sourceQuery, Dictionary<Query, Query> flatQueries)
        {
            if (sourceQuery is BooleanQuery)
            {
                BooleanQuery bq = (BooleanQuery)sourceQuery;
                foreach (BooleanClause clause in bq.GetClauses())
                {
                    if (!clause.IsProhibited)
                        flatten(clause.Query, flatQueries);
                }
            }
            else if (sourceQuery is PrefixQuery)
            {
                if (!flatQueries.ContainsKey(sourceQuery))
                    flatQueries.Add(sourceQuery, sourceQuery);
            }
            else if (sourceQuery is DisjunctionMaxQuery)
            {
                DisjunctionMaxQuery dmq = (DisjunctionMaxQuery)sourceQuery;
                foreach (Query query in dmq)
                {
                    flatten(query, flatQueries);
                }
            }
            else if (sourceQuery is TermQuery)
            {
                if (!flatQueries.ContainsKey(sourceQuery))
                    flatQueries.Add(sourceQuery, sourceQuery);
            }
            else if (sourceQuery is PhraseQuery)
            {
                if (!flatQueries.ContainsKey(sourceQuery))
                {
                    PhraseQuery pq = (PhraseQuery)sourceQuery;
                    if (pq.GetTerms().Length > 1)
                        flatQueries.Add(pq, pq);
                    else if (pq.GetTerms().Length == 1)
                    {
                        Query q = new TermQuery(pq.GetTerms()[0]);
                        flatQueries.Add(q, q);
                    }
                }
            }
            else
            {
                // Fallback to using extracted terms
                ISet<Term> terms = SetFactory.CreateHashSet<Term>();
                try
                {
                    sourceQuery.ExtractTerms(terms);
                }
                catch (NotSupportedException)
                { // thrown by default impl
                    return; // ignore error and discard query
                }

                foreach (var term in terms)
                {
                    flatten(new TermQuery(term), flatQueries);
                }
            }
        }
Example #9
 public override void ExtractTerms(ISet<Term> terms)
 {
     @in.ExtractTerms(terms);
 }
 // inherit javadoc
 public override void ExtractTerms(System.Collections.Generic.ISet<Term> terms)
 {
     Query.ExtractTerms(terms);
 }
Example #11
        private void DoSearch(Query query, IEnumerable<SortField> sortField, int maxResults)
        {
            //This try/catch is here because analyzers strip out stop words and can leave the query
            //with null values. This simply tries to extract terms; if that fails with a null
            //reference then it's an invalid null query. NotSupportedException occurs when the query is
            //valid but the type of query can't extract terms.
            //This IS a work-around; theoretically Lucene itself should check for null query parameters
            //before throwing exceptions.
            try
            {
                var set = new Hashtable();
                query.ExtractTerms(set);
            }
            catch (NullReferenceException)
            {
                //this means that an analyzer has stripped out stop words and now there are
                //no words left to search on
                TotalItemCount = 0;
                return;
            }
            catch (NotSupportedException)
            {
                //swallow this exception, we should continue if this occurs.
            }

            maxResults = maxResults > 1 ? maxResults : LuceneSearcher.MaxDoc();

            if (sortField.Count() == 0)
            {
                var topDocs = LuceneSearcher.Search(query, null, maxResults, new Sort());
                _collector = new AllHitsCollector(topDocs.scoreDocs);
                topDocs = null;
            }
            else
            {
                var topDocs = LuceneSearcher.Search(query, null, maxResults, new Sort(sortField.ToArray()));
                _collector = new AllHitsCollector(topDocs.scoreDocs);
                topDocs = null;
            }
            TotalItemCount = _collector.Count;
        }
Example #12
 // inherit javadoc
 public override void ExtractTerms(ISet<Term> terms)
 {
     Query.ExtractTerms(terms);
 }