Ejemplo n.º 1
0
        /// <summary>
        /// Runs <paramref name="query"/> against the searcher, sending hits to
        /// <paramref name="hitCollector"/> while gathering the data needed for
        /// drill-down and drill-sideways facet counts.
        /// </summary>
        /// <param name="query">Drill-down query; its <c>Dims</c> and <c>BooleanQuery</c> are read.</param>
        /// <param name="hitCollector">Collector that receives the hits.</param>
        /// <returns>
        /// A <see cref="DrillSidewaysResult"/> wrapping the facets produced by
        /// <c>BuildFacetsResult</c>; the second constructor argument is always <c>null</c> here.
        /// </returns>
        public virtual DrillSidewaysResult Search(DrillDownQuery query, ICollector hitCollector)
        {
            IDictionary<string, int?> dims = query.Dims;
            int dimCount = dims.Count;

            var drillDownCollector = new FacetsCollector();

            if (dimCount == 0)
            {
                // No drill-down dimensions means there is nothing to compute
                // sideways: one ordinary search feeds both collectors.
                m_searcher.Search(query, MultiCollector.Wrap(hitCollector, drillDownCollector));
                return new DrillSidewaysResult(BuildFacetsResult(drillDownCollector, null, null), null);
            }

            BooleanClause[] clauses = query.BooleanQuery.GetClauses();

            Query baseQuery;
            int firstDimClause;

            if (clauses.Length != dimCount)
            {
                // The first clause is the base query, followed by one clause
                // per drill-down dimension.
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(clauses.Length == 1 + dimCount);
                }
                baseQuery = clauses[0].Query;
                firstDimClause = 1;
            }
            else
            {
                // Pure browse: every clause is a drill-down clause.
                // TODO: we could optimize this case by making a custom scorer instead.
                baseQuery = new MatchAllDocsQuery();
                firstDimClause = 0;
            }

            // One sideways collector per drill-down dimension.
            var sidewaysCollectors = new FacetsCollector[dimCount];
            for (int dim = 0; dim < sidewaysCollectors.Length; dim++)
            {
                sidewaysCollectors[dim] = new FacetsCollector();
            }

            // Copy the drill-down clauses (everything from firstDimClause on) into their own array.
            var dimQueries = new Query[clauses.Length - firstDimClause];
            for (int slot = 0; slot < dimQueries.Length; slot++)
            {
                dimQueries[slot] = clauses[firstDimClause + slot].Query;
            }

            var sidewaysQuery = new DrillSidewaysQuery(baseQuery, drillDownCollector, sidewaysCollectors, dimQueries, ScoreSubDocsAtOnce);
            m_searcher.Search(sidewaysQuery, hitCollector);

            string[] dimNames = dims.Keys.ToArray();
            return new DrillSidewaysResult(BuildFacetsResult(drillDownCollector, sidewaysCollectors, dimNames), null);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Runs a top-N search for <paramref name="q"/> (wrapped in a
        /// <see cref="FilteredQuery"/> when <paramref name="filter"/> is non-null) and feeds
        /// every hit to <paramref name="fc"/> as well as to the top-docs collector.
        /// </summary>
        /// <param name="searcher">Searcher to run against.</param>
        /// <param name="after">Optional paging anchor; must be a <see cref="FieldDoc"/> when <paramref name="sort"/> is given.</param>
        /// <param name="q">Query to run.</param>
        /// <param name="filter">Optional filter applied to <paramref name="q"/>.</param>
        /// <param name="n">Requested number of hits; capped at the reader's <c>MaxDoc</c> (or 1 when that is 0).</param>
        /// <param name="sort">Optional sort; when <c>null</c> a <see cref="TopScoreDocCollector"/> is used.</param>
        /// <param name="doDocScores">Forwarded to <see cref="TopFieldCollector"/> when sorting.</param>
        /// <param name="doMaxScore">Forwarded to <see cref="TopFieldCollector"/> when sorting.</param>
        /// <param name="fc">Additional collector wrapped together with the top-docs collector.</param>
        /// <returns>The top documents gathered by the top-docs collector.</returns>
        /// <exception cref="System.ArgumentException">
        /// <paramref name="after"/> points at or past the document limit, or is not a
        /// <see cref="FieldDoc"/> although <paramref name="sort"/> was supplied.
        /// </exception>
        private static TopDocs DoSearch(IndexSearcher searcher, ScoreDoc after, Query q, Filter filter, int n, Sort sort, bool doDocScores, bool doMaxScore, Collector fc)
        {
            Query effectiveQuery = filter != null ? new FilteredQuery(q, filter) : q;

            // An empty reader still gets a limit of 1.
            int maxDoc = searcher.IndexReader.MaxDoc;
            int limit = maxDoc == 0 ? 1 : maxDoc;
            int topN = Math.Min(n, limit);

            if (after != null && after.Doc >= limit)
            {
                throw new System.ArgumentException("after.doc exceeds the number of documents in the reader: after.doc=" + after.Doc + " limit=" + limit);
            }

            if (sort == null)
            {
                // TODO: can we pass the right boolean for in-order instead of
                // hardwired to false...?  we'd need access to the protected
                // IS.search methods taking Weight... could use reflection...
                var scoreCollector = TopScoreDocCollector.Create(topN, after, false);
                searcher.Search(effectiveQuery, MultiCollector.Wrap(scoreCollector, fc));
                return scoreCollector.TopDocs();
            }

            if (after != null && !(after is FieldDoc))
            {
                // TODO: if we fix type safety of TopFieldDocs we can remove this
                throw new System.ArgumentException("after must be a FieldDoc; got " + after);
            }

            const bool populateFields = true;
            var fieldCollector = TopFieldCollector.Create(sort, topN, (FieldDoc)after, populateFields, doDocScores, doMaxScore, false);
            searcher.Search(effectiveQuery, MultiCollector.Wrap(fieldCollector, fc));
            return fieldCollector.TopDocs();
        }
        /// <summary>
        /// Indexes documents tagged "even"/"odd" with an "iMod10" facet, then checks that
        /// <see cref="RandomSamplingFacetsCollector"/> copes with an empty result, equals exact
        /// counting when nothing is sampled out, and yields plausible amortized counts when
        /// sampling roughly 10% of the documents.
        /// </summary>
        public virtual void TestRandomSampling()
        {
            Directory indexDir = NewDirectory();
            Directory taxonomyDir = NewDirectory();

            var taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDir);
            var indexWriter = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
                this,
#endif
                Random, indexDir);

            var facetsConfig = new FacetsConfig();

            int numDocs = AtLeast(10000);

            for (int docId = 0; docId < numDocs; docId++)
            {
                string parity = (docId % 2 == 0) ? "even" : "odd";
                string bucket = Convert.ToString(docId % 10, CultureInfo.InvariantCulture);

                var document = new Document();
                document.Add(new StringField("EvenOdd", parity, Store.NO));
                document.Add(new FacetField("iMod10", bucket));
                indexWriter.AddDocument(facetsConfig.Build(taxonomyWriter, document));
            }
            Random rng = Random;

            // Open near-real-time readers before the writers are disposed.
            IndexSearcher searcher = NewSearcher(indexWriter.GetReader());
            var taxonomyReader = new DirectoryTaxonomyReader(taxonomyWriter);

            IOUtils.Dispose(indexWriter, taxonomyWriter);

            // --- Empty result set ---
            var emptySampler = new RandomSamplingFacetsCollector(numDocs / 10, rng.NextInt64());

            // A query that matches nothing must not trigger a division by zero ...
            searcher.Search(new TermQuery(new Term("EvenOdd", "NeverMatches")), emptySampler);

            // ... and must still hand back a non-null list of matching docs ...
            Assert.IsNotNull(emptySampler.GetMatchingDocs());

            // ... none of which contains any hit.
            foreach (MatchingDocs matching in emptySampler.GetMatchingDocs())
            {
                Assert.AreEqual(0, matching.TotalHits);
            }

            // --- Real results ---

            // Select half of the documents.
            var evenQuery = new TermQuery(new Term("EvenOdd", "even"));

            // Only even values of (i % 10) are hits, so there are 5 facet values
            // (0, 2, 4, 6 and 8). Sampling could in theory miss one of them, but the
            // chance is 0.8 (chance not to take a value) ^ 2000 * 5 (any can be
            // missing) ~ 10^-193, so that is probably not going to happen.
            int maxNumChildren = 5;

            var unsampled = new RandomSamplingFacetsCollector(numDocs, rng.NextInt64());      // no sampling
            var sampled   = new RandomSamplingFacetsCollector(numDocs / 10, rng.NextInt64()); // 10 % of total docs, 20% of the hits

            var exactCollector = new FacetsCollector();

            searcher.Search(evenQuery, MultiCollector.Wrap(exactCollector, unsampled, sampled));

            var sampledCounts   = new FastTaxonomyFacetCounts(taxonomyReader, facetsConfig, sampled);
            var unsampledCounts = new FastTaxonomyFacetCounts(taxonomyReader, facetsConfig, unsampled);
            var exactCounts     = new FastTaxonomyFacetCounts(taxonomyReader, facetsConfig, exactCollector);

            FacetResult sampledResult   = sampled.AmortizeFacetCounts(sampledCounts.GetTopChildren(10, "iMod10"), facetsConfig, searcher);
            FacetResult unsampledResult = unsampledCounts.GetTopChildren(10, "iMod10");
            FacetResult exactResult     = exactCounts.GetTopChildren(10, "iMod10");

            Assert.AreEqual(unsampledResult, exactResult);

            // Five children are expected; fewer is (very unlikely but) possible, see above.
            Assert.IsTrue(sampledResult.ChildCount <= maxNumChildren);
            // At least one child must be present.
            Assert.IsTrue(sampledResult.ChildCount >= 1);

            // Sampling is random, so the outcome varies per run; use simple
            // statistics to decide whether the sampled result is acceptable.
            int total = 0;

            foreach (LabelAndValue entry in sampledResult.LabelValues)
            {
                total += (int)entry.Value;
            }
            float mu = total / (float)maxNumChildren;

            float variance = 0;

            foreach (LabelAndValue entry in sampledResult.LabelValues)
            {
                variance += (float)Math.Pow((mu - (int)entry.Value), 2);
            }
            variance /= maxNumChildren;
            float sigma = (float)Math.Sqrt(variance);

            // Half of the documents are queried and spread over 5 categories, so the
            // expected number of docs per category is the total divided by 5*2.
            float targetMu = numDocs / (5.0f * 2.0f);

            // The standard deviation must stay small and the mean must lie within
            // three standard deviations of the expected mean.
            Assert.IsTrue(sigma < 200);
            Assert.IsTrue(targetMu - 3 * sigma < mu && mu < targetMu + 3 * sigma);

            IOUtils.Dispose(searcher.IndexReader, taxonomyReader, indexDir, taxonomyDir);
        }
Ejemplo n.º 4
0
        // LUCENENET specific: one of two methods that replace GroupByFieldOrFunction. It is used to
        //          support SearchByFunction in a way that eliminates casting for the caller.
        //          This method is essentially a function-specific version of GroupByFieldOrFunction.
        //
        // Performs a two-pass grouping search keyed on groupFunction: the first pass finds the top
        // groups (optionally also collecting all groups and all group heads, and optionally caching
        // the hits), the second pass collects the documents inside those groups. As side effects the
        // matchingGroups and matchingGroupHeads fields are assigned after the first pass.
        // Throws (via IllegalStateException.Create) when groupFunction is null.
        protected virtual ITopGroups<TMutableValue> GroupByFunction<TMutableValue>(IndexSearcher searcher, Filter filter, Query query, int groupOffset, int groupLimit)
            where TMutableValue : MutableValue
        {
            int topN = groupOffset + groupLimit;

            if (groupFunction == null)
            {
                throw IllegalStateException.Create("groupFunction must be set via the constructor by specifying a ValueSource.");
            }

            // ---- First pass: build the collectors that were requested ----
            var firstPassCollector = new FunctionFirstPassGroupingCollector<TMutableValue>(groupFunction, valueSourceContext, groupSort, topN);

            FunctionAllGroupsCollector<TMutableValue> allGroupsCollector = allGroups
                ? new FunctionAllGroupsCollector<TMutableValue>(groupFunction, valueSourceContext)
                : null;

            AbstractAllGroupHeadsCollector groupHeadsCollector = allGroupHeads
                ? new FunctionAllGroupHeadsCollector(groupFunction, valueSourceContext, sortWithinGroup)
                : null;

            ICollector firstRound;
            if (!(allGroupHeads || allGroups))
            {
                firstRound = firstPassCollector;
            }
            else
            {
                // Fan out to every requested collector in a single search.
                var collectors = new List<ICollector> { firstPassCollector };
                if (allGroups)
                {
                    collectors.Add(allGroupsCollector);
                }
                if (allGroupHeads)
                {
                    collectors.Add(groupHeadsCollector);
                }
                firstRound = MultiCollector.Wrap(collectors.ToArray(/* new Collector[collectors.size()] */));
            }

            // ---- Run the first pass, optionally through a caching collector ----
            CachingCollector cachedCollector = null;
            bool cachingRequested = maxCacheRAMMB != null || maxDocsToCache != null;

            if (!cachingRequested)
            {
                searcher.Search(query, filter, firstRound);
            }
            else
            {
                // The RAM-based limit takes precedence over the document-count limit.
                cachedCollector = maxCacheRAMMB != null
                    ? CachingCollector.Create(firstRound, cacheScores, maxCacheRAMMB.Value)
                    : CachingCollector.Create(firstRound, cacheScores, maxDocsToCache.Value);
                searcher.Search(query, filter, cachedCollector);
            }

            // ---- Publish the by-products of the first pass ----
            matchingGroups = allGroups
                ? (ICollection)allGroupsCollector.Groups
                : (ICollection)Collections.EmptyList<TMutableValue>();

            if (!allGroupHeads)
            {
                matchingGroupHeads = new Bits.MatchNoBits(searcher.IndexReader.MaxDoc);
            }
            else
            {
                matchingGroupHeads = groupHeadsCollector.RetrieveGroupHeads(searcher.IndexReader.MaxDoc);
            }

            IEnumerable<ISearchGroup<TMutableValue>> topSearchGroups = firstPassCollector.GetTopGroups(groupOffset, fillSortFields);

            if (topSearchGroups == null)
            {
                // No groups were found: return an empty result.
                // LUCENENET specific - optimized empty array creation
                return new TopGroups<TMutableValue>(Arrays.Empty<SortField>(), Arrays.Empty<SortField>(), 0, 0, Arrays.Empty<GroupDocs<TMutableValue>>(), float.NaN);
            }

            // ---- Second pass: collect documents inside the top groups ----
            int topNInsideGroup = groupDocsOffset + groupDocsLimit;

            IAbstractSecondPassGroupingCollector<TMutableValue> secondPassCollector =
                new FunctionSecondPassGroupingCollector<TMutableValue>(
                    topSearchGroups as IEnumerable<ISearchGroup<TMutableValue>>,
                    groupSort, sortWithinGroup, topNInsideGroup, includeScores, includeMaxScore, fillSortFields, groupFunction, valueSourceContext)
                as IAbstractSecondPassGroupingCollector<TMutableValue>;

            // Replay the cached hits when the cache holds them; otherwise search again.
            if (cachedCollector == null || !cachedCollector.IsCached)
            {
                searcher.Search(query, filter, secondPassCollector);
            }
            else
            {
                cachedCollector.Replay(secondPassCollector);
            }

            if (!allGroups)
            {
                return secondPassCollector.GetTopGroups(groupDocsOffset);
            }

            // When all groups were collected, build the result with the matching-group count as well.
            return new TopGroups<TMutableValue>(secondPassCollector.GetTopGroups(groupDocsOffset), matchingGroups.Count);
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Performs a two-pass grouping search. Function-based collectors are used when
        /// <c>groupFunction</c> is set, term-based collectors (on <c>groupField</c>) otherwise.
        /// The first pass finds the top groups, optionally also collecting all groups and all
        /// group heads and optionally caching the hits; the second pass collects the documents
        /// inside those groups. The <c>matchingGroups</c> and <c>matchingGroupHeads</c> fields
        /// are assigned after the first pass.
        /// </summary>
        /// <param name="searcher">Searcher to run both passes against.</param>
        /// <param name="filter">Filter passed to every search call.</param>
        /// <param name="query">Query passed to every search call.</param>
        /// <param name="groupOffset">Offset of the first group to return.</param>
        /// <param name="groupLimit">Added to <paramref name="groupOffset"/> to size the first pass.</param>
        /// <returns>The top groups, or an empty <c>TopGroups</c> when the first pass yields no groups.</returns>
        protected virtual ITopGroups<TGroupValue> GroupByFieldOrFunction<TGroupValue>(IndexSearcher searcher, Filter filter, Query query, int groupOffset, int groupLimit)
        {
            int topN = groupOffset + groupLimit;

            IAbstractFirstPassGroupingCollector<TGroupValue> firstPassCollector;
            IAbstractAllGroupsCollector<TGroupValue> allGroupsCollector;
            AbstractAllGroupHeadsCollector groupHeadsCollector;

            // ---- First pass: pick term-based or function-based collectors ----
            if (groupFunction == null)
            {
                firstPassCollector = (IAbstractFirstPassGroupingCollector<TGroupValue>)new TermFirstPassGroupingCollector(groupField, groupSort, topN);
                allGroupsCollector = allGroups
                    ? (IAbstractAllGroupsCollector<TGroupValue>)new TermAllGroupsCollector(groupField, initialSize)
                    : null;
                groupHeadsCollector = allGroupHeads
                    ? TermAllGroupHeadsCollector.Create(groupField, sortWithinGroup, initialSize)
                    : null;
            }
            else
            {
                firstPassCollector = (IAbstractFirstPassGroupingCollector<TGroupValue>)new FunctionFirstPassGroupingCollector(groupFunction, valueSourceContext, groupSort, topN);
                allGroupsCollector = allGroups
                    ? (IAbstractAllGroupsCollector<TGroupValue>)new FunctionAllGroupsCollector(groupFunction, valueSourceContext)
                    : null;
                groupHeadsCollector = allGroupHeads
                    ? new FunctionAllGroupHeadsCollector(groupFunction, valueSourceContext, sortWithinGroup)
                    : null;
            }

            // LUCENENET TODO: Make the Collector abstract class into an interface
            // so the (Collector) casts below can be removed
            Collector firstRound;

            if (!(allGroupHeads || allGroups))
            {
                firstRound = (Collector)firstPassCollector;
            }
            else
            {
                // Fan out to every requested collector in a single search.
                var collectors = new List<Collector> { (Collector)firstPassCollector };
                if (allGroups)
                {
                    collectors.Add((Collector)allGroupsCollector);
                }
                if (allGroupHeads)
                {
                    collectors.Add(groupHeadsCollector);
                }
                firstRound = MultiCollector.Wrap(collectors.ToArray(/* new Collector[collectors.size()] */));
            }

            // ---- Run the first pass, optionally through a caching collector ----
            CachingCollector cachedCollector = null;
            bool cachingRequested = maxCacheRAMMB != null || maxDocsToCache != null;

            if (!cachingRequested)
            {
                searcher.Search(query, filter, firstRound);
            }
            else
            {
                // The RAM-based limit takes precedence over the document-count limit.
                cachedCollector = maxCacheRAMMB != null
                    ? CachingCollector.Create(firstRound, cacheScores, maxCacheRAMMB.Value)
                    : CachingCollector.Create(firstRound, cacheScores, maxDocsToCache.Value);
                searcher.Search(query, filter, cachedCollector);
            }

            // ---- Publish the by-products of the first pass ----
            if (!allGroups)
            {
                matchingGroups = new List<TGroupValue>();
            }
            else
            {
                matchingGroups = (IList)allGroupsCollector.Groups;
            }

            if (!allGroupHeads)
            {
                matchingGroupHeads = new Bits_MatchNoBits(searcher.IndexReader.MaxDoc);
            }
            else
            {
                matchingGroupHeads = groupHeadsCollector.RetrieveGroupHeads(searcher.IndexReader.MaxDoc);
            }

            IEnumerable<ISearchGroup<TGroupValue>> topSearchGroups = firstPassCollector.GetTopGroups(groupOffset, fillSortFields);

            if (topSearchGroups == null)
            {
                // No groups were found: return an empty result.
                return new TopGroups<TGroupValue>(new SortField[0], new SortField[0], 0, 0, new GroupDocs<TGroupValue>[0], float.NaN);
            }

            // ---- Second pass: collect documents inside the top groups ----
            int topNInsideGroup = groupDocsOffset + groupDocsLimit;
            IAbstractSecondPassGroupingCollector<TGroupValue> secondPassCollector;

            if (groupFunction == null)
            {
                secondPassCollector = new TermSecondPassGroupingCollector(
                        groupField, topSearchGroups as IEnumerable<ISearchGroup<BytesRef>>,
                        groupSort, sortWithinGroup, topNInsideGroup, includeScores, includeMaxScore, fillSortFields)
                    as IAbstractSecondPassGroupingCollector<TGroupValue>;
            }
            else
            {
                secondPassCollector = new FunctionSecondPassGroupingCollector(
                        topSearchGroups as IEnumerable<ISearchGroup<MutableValue>>,
                        groupSort, sortWithinGroup, topNInsideGroup, includeScores, includeMaxScore, fillSortFields, groupFunction, valueSourceContext)
                    as IAbstractSecondPassGroupingCollector<TGroupValue>;
            }

            // LUCENENET TODO: Create an ICollector interface that we can inherit our Collector interfaces from
            // so the "as Collector" casts are not necessary. Consider eliminating the Collector abstract class.
            if (cachedCollector == null || !cachedCollector.Cached)
            {
                // Nothing usable in the cache: run the search a second time.
                searcher.Search(query, filter, secondPassCollector as Collector);
            }
            else
            {
                // Replay the cached first-pass hits into the second-pass collector.
                cachedCollector.Replay(secondPassCollector as Collector);
            }

            if (!allGroups)
            {
                return secondPassCollector.GetTopGroups(groupDocsOffset);
            }

            // When all groups were collected, build the result with the matching-group count as well.
            return new TopGroups<TGroupValue>(secondPassCollector.GetTopGroups(groupDocsOffset), matchingGroups.Count);
        }