/// <summary>
/// Search, collecting hits with a <see cref="ICollector"/>, and
/// computing drill down and sideways counts.
/// </summary>
public virtual DrillSidewaysResult Search(DrillDownQuery query, ICollector hitCollector)
{
    IDictionary<string, int?> drillDownDims = query.Dims;

    FacetsCollector drillDownCollector = new FacetsCollector();

    if (drillDownDims.Count == 0)
    {
        // There are no drill-down dims, so there is no
        // drill-sideways to compute:
        m_searcher.Search(query, MultiCollector.Wrap(hitCollector, drillDownCollector));
        return new DrillSidewaysResult(BuildFacetsResult(drillDownCollector, null, null), null);
    }

    BooleanQuery ddq = query.BooleanQuery;
    BooleanClause[] clauses = ddq.GetClauses();

    Query baseQuery;
    int startClause;
    if (clauses.Length == drillDownDims.Count)
    {
        // TODO: we could optimize this pure-browse case by
        // making a custom scorer instead:
        baseQuery = new MatchAllDocsQuery();
        startClause = 0;
    }
    else
    {
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(clauses.Length == 1 + drillDownDims.Count);
        }
        baseQuery = clauses[0].Query;
        startClause = 1;
    }

    FacetsCollector[] drillSidewaysCollectors = new FacetsCollector[drillDownDims.Count];
    for (int i = 0; i < drillSidewaysCollectors.Length; i++)
    {
        drillSidewaysCollectors[i] = new FacetsCollector();
    }

    Query[] drillDownQueries = new Query[clauses.Length - startClause];
    for (int i = startClause; i < clauses.Length; i++)
    {
        drillDownQueries[i - startClause] = clauses[i].Query;
    }

    DrillSidewaysQuery dsq = new DrillSidewaysQuery(baseQuery, drillDownCollector, drillSidewaysCollectors, drillDownQueries, ScoreSubDocsAtOnce);
    m_searcher.Search(dsq, hitCollector);

    return new DrillSidewaysResult(BuildFacetsResult(drillDownCollector, drillSidewaysCollectors, drillDownDims.Keys.ToArray()), null);
}
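// LUCENENET illustrative sketch (not part of the original sources): a minimal
// end-to-end use of the Search overload above. It assumes an already-open
// IndexSearcher, TaxonomyReader and FacetsConfig; the "Color"/"red" dimension
// and label are hypothetical.
private static DrillSidewaysResult ExampleDrillSidewaysSearch(IndexSearcher searcher, TaxonomyReader taxoReader, FacetsConfig config)
{
    DrillSideways ds = new DrillSideways(searcher, config, taxoReader);

    // A DrillDownQuery with no base query defaults to MatchAllDocsQuery,
    // which exercises the pure-browse branch above.
    DrillDownQuery query = new DrillDownQuery(config);
    query.Add("Color", "red"); // one drill-down dimension => one sideways collector

    // Hits land in the supplied collector; facet counts come back on the result.
    var hits = TopScoreDocCollector.Create(10, true);
    DrillSidewaysResult result = ds.Search(query, hits);
    // hits.TopDocs() now holds the drill-down hits; the result carries the
    // Facets built by BuildFacetsResult (drill-down plus sideways counts).
    return result;
}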
private static TopDocs DoSearch(IndexSearcher searcher, ScoreDoc after, Query q, Filter filter, int n, Sort sort, bool doDocScores, bool doMaxScore, ICollector fc)
{
    if (filter != null)
    {
        q = new FilteredQuery(q, filter);
    }

    int limit = searcher.IndexReader.MaxDoc;
    if (limit == 0)
    {
        limit = 1;
    }
    n = Math.Min(n, limit);

    if (after != null && after.Doc >= limit)
    {
        throw new System.ArgumentException("after.doc exceeds the number of documents in the reader: after.doc=" + after.Doc + " limit=" + limit);
    }

    if (sort != null)
    {
        if (after != null && !(after is FieldDoc))
        {
            // TODO: if we fix type safety of TopFieldDocs we can
            // remove this
            throw new System.ArgumentException("after must be a FieldDoc; got " + after);
        }
        const bool fillFields = true;
        var hitsCollector = TopFieldCollector.Create(sort, n, (FieldDoc)after, fillFields, doDocScores, doMaxScore, false);
        searcher.Search(q, MultiCollector.Wrap(hitsCollector, fc));
        return hitsCollector.TopDocs();
    }
    else
    {
        // TODO: can we pass the right boolean for
        // in-order instead of hardwired to false...? we'd
        // need access to the protected IS.search methods
        // taking Weight... could use reflection...
        var hitsCollector = TopScoreDocCollector.Create(n, after, false);
        searcher.Search(q, MultiCollector.Wrap(hitsCollector, fc));
        return hitsCollector.TopDocs();
    }
}
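// LUCENENET illustrative sketch (not part of the original sources): the
// "search after" paging pattern the helper above implements. The page size is
// hypothetical; identifiers are placeholders.
private static TopDocs ExampleNextPage(IndexSearcher searcher, Query q, TopDocs previousPage)
{
    // The last hit of the previous page becomes the anchor, exactly like the
    // 'after' parameter of DoSearch:
    ScoreDoc after = previousPage.ScoreDocs.Length > 0
        ? previousPage.ScoreDocs[previousPage.ScoreDocs.Length - 1]
        : null;

    // Mirrors the unsorted branch above (in-order hardwired to false):
    var hitsCollector = TopScoreDocCollector.Create(10, after, false);
    searcher.Search(q, hitsCollector);
    return hitsCollector.TopDocs();
}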
public virtual void TestRandomSampling()
{
    Directory dir = NewDirectory();
    Directory taxoDir = NewDirectory();

    DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
    RandomIndexWriter writer = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir);

    FacetsConfig config = new FacetsConfig();

    int numDocs = AtLeast(10000);
    for (int i = 0; i < numDocs; i++)
    {
        Document doc = new Document();
        doc.Add(new StringField("EvenOdd", (i % 2 == 0) ? "even" : "odd", Store.NO));
        doc.Add(new FacetField("iMod10", Convert.ToString(i % 10, CultureInfo.InvariantCulture)));
        writer.AddDocument(config.Build(taxoWriter, doc));
    }
    Random random = Random;

    // NRT open
    IndexSearcher searcher = NewSearcher(writer.GetReader());
    var taxoReader = new DirectoryTaxonomyReader(taxoWriter);
    IOUtils.Dispose(writer, taxoWriter);

    // Test empty results
    RandomSamplingFacetsCollector collectRandomZeroResults = new RandomSamplingFacetsCollector(numDocs / 10, random.NextInt64());

    // There should be no divisions by zero
    searcher.Search(new TermQuery(new Term("EvenOdd", "NeverMatches")), collectRandomZeroResults);

    // There should be no divisions by zero and no null result
    Assert.IsNotNull(collectRandomZeroResults.GetMatchingDocs());

    // There should be no results at all
    foreach (MatchingDocs doc in collectRandomZeroResults.GetMatchingDocs())
    {
        Assert.AreEqual(0, doc.TotalHits);
    }

    // Now start searching and retrieve results.

    // Use a query to select half of the documents.
    TermQuery query = new TermQuery(new Term("EvenOdd", "even"));

    // there will be 5 facet values (0, 2, 4, 6 and 8), as only the even (i %
    // 10) are hits.
    // there is a REAL small chance that one of the 5 values will be missed when
    // sampling. but is that 0.8 (chance not to take a value) ^ 2000 * 5 (any
    // can be missing) ~ 10^-193, so that is probably not going to happen.
    int maxNumChildren = 5;

    RandomSamplingFacetsCollector random100Percent = new RandomSamplingFacetsCollector(numDocs, random.NextInt64()); // no sampling
    RandomSamplingFacetsCollector random10Percent = new RandomSamplingFacetsCollector(numDocs / 10, random.NextInt64()); // 10% of total docs, 20% of the hits

    FacetsCollector fc = new FacetsCollector();

    searcher.Search(query, MultiCollector.Wrap(fc, random100Percent, random10Percent));

    FastTaxonomyFacetCounts random10FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random10Percent);
    FastTaxonomyFacetCounts random100FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random100Percent);
    FastTaxonomyFacetCounts exactFacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, fc);

    FacetResult random10Result = random10Percent.AmortizeFacetCounts(random10FacetCounts.GetTopChildren(10, "iMod10"), config, searcher);
    FacetResult random100Result = random100FacetCounts.GetTopChildren(10, "iMod10");
    FacetResult exactResult = exactFacetCounts.GetTopChildren(10, "iMod10");

    Assert.AreEqual(random100Result, exactResult);

    // we should have five children, but there is a small chance we have less
    // (see above).
    Assert.IsTrue(random10Result.ChildCount <= maxNumChildren);
    // there should be at least one child.
    Assert.IsTrue(random10Result.ChildCount >= 1);

    // now calculate some statistics to determine if the sampled result is 'ok'.
    // because random sampling is used, the results will vary each time.
    int sum = 0;
    foreach (LabelAndValue lav in random10Result.LabelValues)
    {
        sum += (int)lav.Value;
    }
    float mu = (float)sum / (float)maxNumChildren;

    float variance = 0;
    foreach (LabelAndValue lav in random10Result.LabelValues)
    {
        variance += (float)Math.Pow((mu - (int)lav.Value), 2);
    }
    variance = variance / maxNumChildren;
    float sigma = (float)Math.Sqrt(variance);

    // we query only half the documents and have 5 categories. The average
    // number of docs in a category will thus be the total divided by 5*2
    float targetMu = numDocs / (5.0f * 2.0f);

    // the average should be in the range and the standard deviation should not
    // be too great
    Assert.IsTrue(sigma < 200);
    Assert.IsTrue(targetMu - 3 * sigma < mu && mu < targetMu + 3 * sigma);

    IOUtils.Dispose(searcher.IndexReader, taxoReader, dir, taxoDir);
}
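// LUCENENET illustrative sketch (not part of the original sources): the
// application-side pattern the test above exercises. The sample size, seed,
// top-N and dimension name are hypothetical.
private static FacetResult ExampleSampledFacetCounts(IndexSearcher searcher, TaxonomyReader taxoReader, FacetsConfig config, Query query)
{
    // Count facets over a random sample of ~1000 hits instead of all hits:
    RandomSamplingFacetsCollector sampling = new RandomSamplingFacetsCollector(1000, 42);
    searcher.Search(query, sampling);

    FastTaxonomyFacetCounts sampledCounts = new FastTaxonomyFacetCounts(taxoReader, config, sampling);

    // Scale the sampled counts back up to estimates for the full result set,
    // as the test does for random10Result above:
    return sampling.AmortizeFacetCounts(sampledCounts.GetTopChildren(10, "iMod10"), config, searcher);
}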
// LUCENENET specific. One of two methods that replace GroupByFieldOrFunction. Used to support
// SearchByFunction in a way that eliminates casting for the caller.
// This method is essentially a Function-specific version of GroupByFieldOrFunction.
protected virtual ITopGroups<TMutableValue> GroupByFunction<TMutableValue>(IndexSearcher searcher, Filter filter, Query query, int groupOffset, int groupLimit)
    where TMutableValue : MutableValue
{
    int topN = groupOffset + groupLimit;
    FunctionFirstPassGroupingCollector<TMutableValue> firstPassCollector;
    FunctionAllGroupsCollector<TMutableValue> allGroupsCollector;
    AbstractAllGroupHeadsCollector allGroupHeadsCollector;

    if (groupFunction == null)
    {
        throw IllegalStateException.Create("groupFunction must be set via the constructor by specifying a ValueSource.");
    }

    firstPassCollector = new FunctionFirstPassGroupingCollector<TMutableValue>(groupFunction, valueSourceContext, groupSort, topN);
    if (allGroups)
    {
        allGroupsCollector = new FunctionAllGroupsCollector<TMutableValue>(groupFunction, valueSourceContext);
    }
    else
    {
        allGroupsCollector = null;
    }
    if (allGroupHeads)
    {
        allGroupHeadsCollector = new FunctionAllGroupHeadsCollector(groupFunction, valueSourceContext, sortWithinGroup);
    }
    else
    {
        allGroupHeadsCollector = null;
    }

    ICollector firstRound;
    if (allGroupHeads || allGroups)
    {
        List<ICollector> collectors = new List<ICollector>();
        collectors.Add(firstPassCollector);
        if (allGroups)
        {
            collectors.Add(allGroupsCollector);
        }
        if (allGroupHeads)
        {
            collectors.Add(allGroupHeadsCollector);
        }
        firstRound = MultiCollector.Wrap(collectors.ToArray(/* new Collector[collectors.size()] */));
    }
    else
    {
        firstRound = firstPassCollector;
    }

    CachingCollector cachedCollector = null;
    if (maxCacheRAMMB != null || maxDocsToCache != null)
    {
        if (maxCacheRAMMB != null)
        {
            cachedCollector = CachingCollector.Create(firstRound, cacheScores, maxCacheRAMMB.Value);
        }
        else
        {
            cachedCollector = CachingCollector.Create(firstRound, cacheScores, maxDocsToCache.Value);
        }
        searcher.Search(query, filter, cachedCollector);
    }
    else
    {
        searcher.Search(query, filter, firstRound);
    }

    if (allGroups)
    {
        matchingGroups = (ICollection)allGroupsCollector.Groups;
    }
    else
    {
        matchingGroups = (ICollection)Collections.EmptyList<TMutableValue>();
    }
    if (allGroupHeads)
    {
        matchingGroupHeads = allGroupHeadsCollector.RetrieveGroupHeads(searcher.IndexReader.MaxDoc);
    }
    else
    {
        matchingGroupHeads = new Bits.MatchNoBits(searcher.IndexReader.MaxDoc);
    }

    IEnumerable<ISearchGroup<TMutableValue>> topSearchGroups = firstPassCollector.GetTopGroups(groupOffset, fillSortFields);
    if (topSearchGroups == null)
    {
        // LUCENENET specific - optimized empty array creation
        return new TopGroups<TMutableValue>(Arrays.Empty<SortField>(), Arrays.Empty<SortField>(), 0, 0, Arrays.Empty<GroupDocs<TMutableValue>>(), float.NaN);
    }

    int topNInsideGroup = groupDocsOffset + groupDocsLimit;
    IAbstractSecondPassGroupingCollector<TMutableValue> secondPassCollector;

    secondPassCollector = new FunctionSecondPassGroupingCollector<TMutableValue>(topSearchGroups as IEnumerable<ISearchGroup<TMutableValue>>, groupSort, sortWithinGroup, topNInsideGroup, includeScores, includeMaxScore, fillSortFields, groupFunction, valueSourceContext)
        as IAbstractSecondPassGroupingCollector<TMutableValue>;

    if (cachedCollector != null && cachedCollector.IsCached)
    {
        cachedCollector.Replay(secondPassCollector);
    }
    else
    {
        searcher.Search(query, filter, secondPassCollector);
    }

    if (allGroups)
    {
        return new TopGroups<TMutableValue>(secondPassCollector.GetTopGroups(groupDocsOffset), matchingGroups.Count);
    }
    else
    {
        return secondPassCollector.GetTopGroups(groupDocsOffset);
    }
}
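// LUCENENET illustrative sketch (not part of the original sources): reaching
// the method above through the public GroupingSearch surface. The ValueSource,
// field name and limits are hypothetical, and SearchByFunction<TMutableValue>
// (referenced in the comment above) is assumed to mirror the Search overloads.
private static ITopGroups<MutableValueStr> ExampleGroupByFunction(IndexSearcher searcher, Query query)
{
    // Group by the per-document bytes of the hypothetical "author" field:
    GroupingSearch groupingSearch = new GroupingSearch(new BytesRefFieldSource("author"), new Hashtable());
    groupingSearch.SetAllGroups(true); // also populate matchingGroups above

    // First page: group offset 0, at most 10 groups. The generic parameter
    // removes the caller-side casting mentioned in the comment above.
    return groupingSearch.SearchByFunction<MutableValueStr>(searcher, query, 0, 10);
}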
protected virtual ITopGroups<TGroupValue> GroupByFieldOrFunction<TGroupValue>(IndexSearcher searcher, Filter filter, Query query, int groupOffset, int groupLimit)
{
    int topN = groupOffset + groupLimit;
    IAbstractFirstPassGroupingCollector<TGroupValue> firstPassCollector;
    IAbstractAllGroupsCollector<TGroupValue> allGroupsCollector;
    AbstractAllGroupHeadsCollector allGroupHeadsCollector;

    if (groupFunction != null)
    {
        firstPassCollector = (IAbstractFirstPassGroupingCollector<TGroupValue>)new FunctionFirstPassGroupingCollector(groupFunction, valueSourceContext, groupSort, topN);
        if (allGroups)
        {
            allGroupsCollector = (IAbstractAllGroupsCollector<TGroupValue>)new FunctionAllGroupsCollector(groupFunction, valueSourceContext);
        }
        else
        {
            allGroupsCollector = null;
        }
        if (allGroupHeads)
        {
            allGroupHeadsCollector = new FunctionAllGroupHeadsCollector(groupFunction, valueSourceContext, sortWithinGroup);
        }
        else
        {
            allGroupHeadsCollector = null;
        }
    }
    else
    {
        firstPassCollector = (IAbstractFirstPassGroupingCollector<TGroupValue>)new TermFirstPassGroupingCollector(groupField, groupSort, topN);
        if (allGroups)
        {
            allGroupsCollector = (IAbstractAllGroupsCollector<TGroupValue>)new TermAllGroupsCollector(groupField, initialSize);
        }
        else
        {
            allGroupsCollector = null;
        }
        if (allGroupHeads)
        {
            allGroupHeadsCollector = TermAllGroupHeadsCollector.Create(groupField, sortWithinGroup, initialSize);
        }
        else
        {
            allGroupHeadsCollector = null;
        }
    }

    Collector firstRound;
    if (allGroupHeads || allGroups)
    {
        List<Collector> collectors = new List<Collector>();
        // LUCENENET TODO: Make the Collector abstract class into an interface
        // so we can remove the casting here
        collectors.Add((Collector)firstPassCollector);
        if (allGroups)
        {
            // LUCENENET TODO: Make the Collector abstract class into an interface
            // so we can remove the casting here
            collectors.Add((Collector)allGroupsCollector);
        }
        if (allGroupHeads)
        {
            collectors.Add(allGroupHeadsCollector);
        }
        firstRound = MultiCollector.Wrap(collectors.ToArray(/* new Collector[collectors.size()] */));
    }
    else
    {
        // LUCENENET TODO: Make the Collector abstract class into an interface
        // so we can remove the casting here
        firstRound = (Collector)firstPassCollector;
    }

    CachingCollector cachedCollector = null;
    if (maxCacheRAMMB != null || maxDocsToCache != null)
    {
        if (maxCacheRAMMB != null)
        {
            cachedCollector = CachingCollector.Create(firstRound, cacheScores, maxCacheRAMMB.Value);
        }
        else
        {
            cachedCollector = CachingCollector.Create(firstRound, cacheScores, maxDocsToCache.Value);
        }
        searcher.Search(query, filter, cachedCollector);
    }
    else
    {
        searcher.Search(query, filter, firstRound);
    }

    if (allGroups)
    {
        matchingGroups = (IList)allGroupsCollector.Groups;
    }
    else
    {
        matchingGroups = new List<TGroupValue>();
    }
    if (allGroupHeads)
    {
        matchingGroupHeads = allGroupHeadsCollector.RetrieveGroupHeads(searcher.IndexReader.MaxDoc);
    }
    else
    {
        matchingGroupHeads = new Bits_MatchNoBits(searcher.IndexReader.MaxDoc);
    }

    IEnumerable<ISearchGroup<TGroupValue>> topSearchGroups = firstPassCollector.GetTopGroups(groupOffset, fillSortFields);
    if (topSearchGroups == null)
    {
        return new TopGroups<TGroupValue>(new SortField[0], new SortField[0], 0, 0, new GroupDocs<TGroupValue>[0], float.NaN);
    }

    int topNInsideGroup = groupDocsOffset + groupDocsLimit;
    IAbstractSecondPassGroupingCollector<TGroupValue> secondPassCollector;
    if (groupFunction != null)
    {
        secondPassCollector = new FunctionSecondPassGroupingCollector(topSearchGroups as IEnumerable<ISearchGroup<MutableValue>>, groupSort, sortWithinGroup, topNInsideGroup, includeScores, includeMaxScore, fillSortFields, groupFunction, valueSourceContext)
            as IAbstractSecondPassGroupingCollector<TGroupValue>;
    }
    else
    {
        secondPassCollector = new TermSecondPassGroupingCollector(groupField, topSearchGroups as IEnumerable<ISearchGroup<BytesRef>>, groupSort, sortWithinGroup, topNInsideGroup, includeScores, includeMaxScore, fillSortFields)
            as IAbstractSecondPassGroupingCollector<TGroupValue>;
    }

    if (cachedCollector != null && cachedCollector.Cached)
    {
        // LUCENENET TODO: Create an ICollector interface that we can inherit our Collector interfaces from
        // so this cast is not necessary. Consider eliminating the Collector abstract class.
        cachedCollector.Replay(secondPassCollector as Collector);
    }
    else
    {
        // LUCENENET TODO: Create an ICollector interface that we can inherit our Collector interfaces from
        // so this cast is not necessary. Consider eliminating the Collector abstract class.
        searcher.Search(query, filter, secondPassCollector as Collector);
    }

    if (allGroups)
    {
        return new TopGroups<TGroupValue>(secondPassCollector.GetTopGroups(groupDocsOffset), matchingGroups.Count);
    }
    else
    {
        return secondPassCollector.GetTopGroups(groupDocsOffset);
    }
}
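// LUCENENET illustrative sketch (not part of the original sources): the
// term-based path of the method above via GroupingSearch. The field name,
// sort and limits are hypothetical, and the generic Search overload is
// assumed to dispatch into GroupByFieldOrFunction.
private static ITopGroups<BytesRef> ExampleGroupByField(IndexSearcher searcher, Query query)
{
    GroupingSearch groupingSearch = new GroupingSearch("author"); // groupField = "author"
    groupingSearch.SetGroupSort(Sort.RELEVANCE); // order of the groups themselves
    groupingSearch.SetGroupDocsLimit(3);         // top docs kept per group

    // First page: group offset 0, at most 10 groups; term grouping yields
    // BytesRef group values (TGroupValue = BytesRef above).
    return groupingSearch.Search<BytesRef>(searcher, query, 0, 10);
}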