/// <summary>User runs a query and counts facets.</summary>
private IList<FacetResult> FacetsWithSearch()
{
    // Both readers are disposed automatically when the method exits.
    using DirectoryReader indexReader = DirectoryReader.Open(indexDir);
    using TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);

    IndexSearcher searcher = new IndexSearcher(indexReader);
    FacetsCollector facetsCollector = new FacetsCollector();

    // MatchAllDocsQuery is for "browsing" (counts facets
    // for all non-deleted docs in the index); normally
    // you'd use a "normal" query:
    FacetsCollector.Search(searcher, new MatchAllDocsQuery(), 10, facetsCollector);

    Facets facets = new FastTaxonomyFacetCounts(taxoReader, config, facetsCollector);

    // Count both "Publish Date" and "Author" dimensions
    return new List<FacetResult>
    {
        facets.GetTopChildren(10, "Author"),
        facets.GetTopChildren(10, "Publish Date")
    };
}
/// <summary>User runs a query and counts facets only without collecting the matching documents.</summary>
private IList<FacetResult> FacetsOnly()
{
    using (DirectoryReader indexReader = DirectoryReader.Open(indexDir))
    using (TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir))
    {
        IndexSearcher searcher = new IndexSearcher(indexReader);
        FacetsCollector collector = new FacetsCollector();

        // MatchAllDocsQuery is for "browsing" (counts facets
        // for all non-deleted docs in the index); normally
        // you'd use a "normal" query:
        searcher.Search(new MatchAllDocsQuery(), null /*Filter */, collector);

        Facets facets = new FastTaxonomyFacetCounts(taxoReader, config, collector);

        // Count both "Publish Date" and "Author" dimensions
        IList<FacetResult> results = new List<FacetResult>();
        results.Add(facets.GetTopChildren(10, "Author"));
        results.Add(facets.GetTopChildren(10, "Publish Date"));
        return results;
    } // Disposes indexReader and taxoReader
}
/// <summary>
/// One verification pass over a replicated index: on the first call opens the
/// search and taxonomy readers; on subsequent calls asserts that the search
/// index advanced to a newer generation, refreshes both readers, checks index
/// integrity, and verifies faceted search against the commit's version id.
/// Always returns null (the result value is unused by the caller).
/// </summary>
public bool? Call()
{
    if (indexReader == null)
    {
        // First invocation: open fresh readers and remember the starting generation.
        indexReader = DirectoryReader.Open(indexDir);
        lastIndexGeneration = indexReader.IndexCommit.Generation;
        taxoReader = new DirectoryTaxonomyReader(taxoDir);
    }
    else
    {
        // verify search index
        // OpenIfChanged returns null when nothing changed; that would be a test failure here.
        DirectoryReader newReader = DirectoryReader.OpenIfChanged(indexReader);
        assertNotNull("should not have reached here if no changes were made to the index", newReader);
        long newGeneration = newReader.IndexCommit.Generation;
        assertTrue("expected newer generation; current=" + lastIndexGeneration + " new=" + newGeneration, newGeneration > lastIndexGeneration);
        // Dispose the stale reader only after the new one is known-good.
        indexReader.Dispose();
        indexReader = newReader;
        lastIndexGeneration = newGeneration;
        TestUtil.CheckIndex(indexDir);

        // verify taxonomy index
        // Unlike the search index, the taxonomy may legitimately be unchanged (null result).
        DirectoryTaxonomyReader newTaxoReader = TaxonomyReader.OpenIfChanged(taxoReader);
        if (newTaxoReader != null)
        {
            taxoReader.Dispose();
            taxoReader = newTaxoReader;
        }
        TestUtil.CheckIndex(taxoDir);

        // verify faceted search
        // The commit user data stores a hex version id under VERSION_ID; each version
        // is expected to have been indexed exactly once under dimension "A".
        int id = int.Parse(indexReader.IndexCommit.UserData[VERSION_ID], NumberStyles.HexNumber);
        IndexSearcher searcher = new IndexSearcher(indexReader);
        FacetsCollector fc = new FacetsCollector();
        searcher.Search(new MatchAllDocsQuery(), fc);
        Facets facets = new FastTaxonomyFacetCounts(taxoReader, config, fc);
        assertEquals(1, (int)facets.GetSpecificValue("A", id.ToString("X")));

        // Drilling down on the same category must hit exactly one document.
        DrillDownQuery drillDown = new DrillDownQuery(config);
        drillDown.Add("A", id.ToString("X"));
        TopDocs docs = searcher.Search(drillDown, 10);
        assertEquals(1, docs.TotalHits);
    }
    return null;
}
/// <summary>
/// Randomly selects one of the equivalent taxonomy facet counting
/// implementations, so tests exercise all code paths over time.
/// </summary>
public virtual Facets GetTaxonomyFacetCounts(TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector c, string indexFieldName)
{
    // Fast path: count directly from the doc values field.
    if (Random.NextBoolean())
    {
        return new FastTaxonomyFacetCounts(indexFieldName, taxoReader, config, c);
    }

    // Slow path: go through an OrdinalsReader, optionally wrapped in a cache.
    OrdinalsReader ordinals = new DocValuesOrdinalsReader(indexFieldName);
    if (Random.NextBoolean())
    {
        ordinals = new CachedOrdinalsReader(ordinals);
    }
    return new TaxonomyFacetCounts(ordinals, taxoReader, config, c);
}
/// <summary>
/// User drills down on 'Publish Date/2010', and we
/// return facets for 'Author'
/// </summary>
private FacetResult DrillDown()
{
    using (DirectoryReader indexReader = DirectoryReader.Open(indexDir))
    using (TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir))
    {
        IndexSearcher searcher = new IndexSearcher(indexReader);

        // Passing no baseQuery means we drill down on all
        // documents ("browse only"):
        DrillDownQuery query = new DrillDownQuery(config);

        // Now user drills down on Publish Date/2010:
        query.Add("Publish Date", "2010");

        FacetsCollector collector = new FacetsCollector();
        FacetsCollector.Search(searcher, query, 10, collector);

        // Retrieve the top-10 "Author" facets for the drill-down matches.
        Facets facets = new FastTaxonomyFacetCounts(taxoReader, config, collector);
        return facets.GetTopChildren(10, "Author");
    }
}
/// <summary>
/// Subclass can override to customize per-dim Facets
/// impl.
/// </summary>
protected virtual Facets BuildFacetsResult(FacetsCollector drillDowns, FacetsCollector[] drillSideways, string[] drillSidewaysDims)
{
    // A taxonomy-backed index counts via FastTaxonomyFacetCounts; otherwise
    // fall back to SortedSetDocValues-based counting.
    Facets CountFacets(FacetsCollector collector) =>
        m_taxoReader != null
            ? new FastTaxonomyFacetCounts(m_taxoReader, m_config, collector)
            : (Facets)new SortedSetDocValuesFacetCounts(m_state, collector);

    Facets drillDownFacets = CountFacets(drillDowns);

    // One Facets instance per drill-sideways dimension, keyed by dim name.
    var drillSidewaysFacets = new Dictionary<string, Facets>();
    if (drillSideways != null)
    {
        for (int i = 0; i < drillSideways.Length; i++)
        {
            drillSidewaysFacets[drillSidewaysDims[i]] = CountFacets(drillSideways[i]);
        }
    }

    return drillSidewaysFacets.Count == 0
        ? drillDownFacets
        : new MultiFacets(drillSidewaysFacets, drillDownFacets);
}
/// <summary>
/// Verifies RandomSamplingFacetsCollector: an empty query must yield zero
/// hits without errors, a 100%-sample must match exact facet counts, and a
/// 10%-sample (after amortization) must be statistically close to the truth.
/// </summary>
public virtual void TestRandomSampling()
{
    Directory dir = NewDirectory();
    Directory taxoDir = NewDirectory();

    DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
    RandomIndexWriter writer = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir);

    FacetsConfig config = new FacetsConfig();

    // Index at least 10k docs; each doc carries an even/odd marker field and
    // an "iMod10" facet with one of ten values (0..9).
    int numDocs = AtLeast(10000);
    for (int i = 0; i < numDocs; i++)
    {
        Document doc = new Document();
        doc.Add(new StringField("EvenOdd", (i % 2 == 0) ? "even" : "odd", Store.NO));
        doc.Add(new FacetField("iMod10", Convert.ToString(i % 10, CultureInfo.InvariantCulture)));
        writer.AddDocument(config.Build(taxoWriter, doc));
    }
    Random random = Random;

    // NRT open
    IndexSearcher searcher = NewSearcher(writer.GetReader());
    var taxoReader = new DirectoryTaxonomyReader(taxoWriter);
    IOUtils.Dispose(writer, taxoWriter);

    // Test empty results
    RandomSamplingFacetsCollector collectRandomZeroResults = new RandomSamplingFacetsCollector(numDocs / 10, random.NextInt64());

    // There should be no divisions by zero
    searcher.Search(new TermQuery(new Term("EvenOdd", "NeverMatches")), collectRandomZeroResults);

    // There should be no divisions by zero and no null result
    Assert.IsNotNull(collectRandomZeroResults.GetMatchingDocs());

    // There should be no results at all
    foreach (MatchingDocs doc in collectRandomZeroResults.GetMatchingDocs())
    {
        Assert.AreEqual(0, doc.TotalHits);
    }

    // Now start searching and retrieve results.

    // Use a query to select half of the documents.
    TermQuery query = new TermQuery(new Term("EvenOdd", "even"));

    // there will be 5 facet values (0, 2, 4, 6 and 8), as only the even
    // (i % 10) are hits.
    // there is a REAL small chance that one of the 5 values will be missed when
    // sampling. but is that 0.8 (chance not to take a value) ^ 2000 * 5 (any
    // can be missing) ~ 10^-193, so that is probably not going to happen.
    int maxNumChildren = 5;

    RandomSamplingFacetsCollector random100Percent = new RandomSamplingFacetsCollector(numDocs, random.NextInt64()); // no sampling
    RandomSamplingFacetsCollector random10Percent = new RandomSamplingFacetsCollector(numDocs / 10, random.NextInt64()); // 10 % of total docs, 20% of the hits

    FacetsCollector fc = new FacetsCollector();

    // One search pass feeds the exact collector and both samplers.
    searcher.Search(query, MultiCollector.Wrap(fc, random100Percent, random10Percent));

    FastTaxonomyFacetCounts random10FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random10Percent);
    FastTaxonomyFacetCounts random100FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random100Percent);
    FastTaxonomyFacetCounts exactFacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, fc);

    FacetResult random10Result = random10Percent.AmortizeFacetCounts(random10FacetCounts.GetTopChildren(10, "iMod10"), config, searcher);
    FacetResult random100Result = random100FacetCounts.GetTopChildren(10, "iMod10");
    FacetResult exactResult = exactFacetCounts.GetTopChildren(10, "iMod10");

    // A 100% sample is no sample at all: counts must match exactly.
    Assert.AreEqual(random100Result, exactResult);

    // we should have five children, but there is a small chance we have less.
    // (see above).
    Assert.IsTrue(random10Result.ChildCount <= maxNumChildren);
    // there should be one child at least.
    Assert.IsTrue(random10Result.ChildCount >= 1);

    // now calculate some statistics to determine if the sampled result is 'ok'.
    // because random sampling is used, the results will vary each time.
    int sum = 0;
    foreach (LabelAndValue lav in random10Result.LabelValues)
    {
        sum += (int)lav.Value;
    }
    float mu = (float)sum / (float)maxNumChildren;

    float variance = 0;
    foreach (LabelAndValue lav in random10Result.LabelValues)
    {
        variance += (float)Math.Pow((mu - (int)lav.Value), 2);
    }
    variance = variance / maxNumChildren;
    float sigma = (float)Math.Sqrt(variance);

    // we query only half the documents and have 5 categories. The average
    // number of docs in a category will thus be the total divided by 5*2
    float targetMu = numDocs / (5.0f * 2.0f);

    // the average should be in the range and the standard deviation should not
    // be too great
    Assert.IsTrue(sigma < 200);
    Assert.IsTrue(targetMu - 3 * sigma < mu && mu < targetMu + 3 * sigma);

    IOUtils.Dispose(searcher.IndexReader, taxoReader, dir, taxoDir);
}
/// <summary>
/// Searches the Lucene index for Documents that match the specified search criteria.
/// </summary>
/// <param name="criteria">The search criteria; unset fields are filled with defaults.</param>
/// <returns>The search result containing matching document IDs and facet categories.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="criteria"/> is null.</exception>
public SearchResult<Guid> Search(SearchCriteria criteria)
{
    if (criteria == null)
        throw new ArgumentNullException(nameof(criteria));

    // Normalize the criteria: fill in defaults for anything unset.
    criteria.Query = string.IsNullOrWhiteSpace(criteria.Query) ? ALL_DOCS_QUERY : criteria.Query;
    criteria.TopN = criteria.TopN > 0 ? criteria.TopN : SearchCriteria.DEFAULT_TOP_N;
    criteria.ItemsPerPage = criteria.ItemsPerPage > 0 ? criteria.ItemsPerPage : SearchCriteria.DEFAULT_ITEMS_PER_PAGE;
    criteria.PageNumber = criteria.PageNumber > 0 ? criteria.PageNumber : 1;
    criteria.Validate();

    var result = new SearchResult<Guid>(criteria);
    var queryParser = new LuceneQueryParser(Schema.StandardField.FULL_TEXT, _compositeAnalyzer, Schema);
    var query = queryParser.Parse(criteria.Query);

    // FIX: pattern match instead of 'as' cast followed by a null check.
    if (_searcherTaxonomyManager.Acquire() is SearcherTaxonomyManagerSearcherAndTaxonomy instance)
    {
        var searcher = instance.Searcher;
        var taxonomyReader = instance.TaxonomyReader;
        try
        {
            var sort = GetSortCriteria(criteria.SortByField);

            // FIX: materialize once. The original enumerated this sequence
            // multiple times (Count() and then foreach), and Count() == 0
            // walks the entire sequence where a list's Count is O(1).
            var selectedFacets = criteria.SelectCategories.ToFacetFields().ToList();

            TopDocs topDocs;
            IEnumerable<Category> categories;

            if (selectedFacets.Count == 0)
            {
                // We are not going to do a drill-down on specific facets.
                // Instead we will just take the top N facets from the matching Documents.
                var facetsCollector = new FacetsCollector();

                // Get the matching Documents
                topDocs = FacetsCollector.Search(searcher, query, criteria.TopN, sort, facetsCollector);

                // Get the Facet counts from the matching Documents
                var facetCounts = new FastTaxonomyFacetCounts(taxonomyReader, _facetBuilder.FacetsConfig, facetsCollector);
                categories = facetCounts.GetCategories(criteria.TopNCategories);
            }
            else
            {
                // Perform a drill-sideways query
                var drillDownQuery = new DrillDownQuery(_facetBuilder.FacetsConfig, query);
                foreach (var facetField in selectedFacets)
                    drillDownQuery.Add(facetField.Dim, facetField.Path);

                var drillSideways = new DrillSideways(searcher, _facetBuilder.FacetsConfig, taxonomyReader);
                var drillSidewaysResult = drillSideways.Search(drillDownQuery, null, null, criteria.TopN, sort, false, false);

                // Get the matching documents
                topDocs = drillSidewaysResult.Hits;

                // Get the Facet counts from the matching Documents
                categories = drillSidewaysResult.Facets.GetCategories(criteria.TopNCategories, selectedFacets);
            }

            // TODO: Don't pass TopDocs; pass an IEnumerable<Guid>
            result.PopulateWith(topDocs, categories, id => searcher.Doc(id));
        }
        finally
        {
            // Always return the searcher/taxonomy pair to the manager, even
            // when the search throws. (FIX: dropped the no-op null
            // assignments to locals that the original had here.)
            _searcherTaxonomyManager.Release(instance);
        }
    }

    return result;
}
/// <summary>
/// Executes a search under a reader lock: builds a phrase query from the
/// term queries (or matches all docs), collects and inflates the hits, and
/// optionally computes facet counts from a taxonomy index on disk.
/// </summary>
/// <param name="queryDefinition">The query: term queries, result limit, and requested facets.</param>
/// <param name="cancellationToken">Token observed while acquiring the reader lock.</param>
/// <returns>The search result with inflated hits, total hit count, and facet groups.</returns>
public async Task<SearchResult<T>> SearchAsync(SearchQuery queryDefinition, CancellationToken cancellationToken = default)
{
    // Reader side of the writer lock: searches may run concurrently with
    // each other but not with writes.
    using (await writerLock.ReaderLockAsync(cancellationToken))
    {
        var result = new SearchResult<T>();
        List<T> hits = new List<T>();
        using (var writer = getWriter())
        {
            // Default to matching everything when no term queries were given.
            Query query = new MatchAllDocsQuery();

            // Term queries
            if (queryDefinition.TermQueries.Any())
            {
                var phraseQuery = new MultiPhraseQuery();
                foreach (var termQuery in queryDefinition.TermQueries)
                {
                    // Each value is split on spaces into individual terms.
                    // NOTE(review): ToLower() uses the current culture — this
                    // presumably mirrors how the analyzer lowercased terms at
                    // index time; confirm, otherwise ToLowerInvariant() would
                    // be safer (e.g. Turkish 'I').
                    phraseQuery.Add(
                        termQuery.value
                            .Split(" ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries)
                            .Select(phrase => new Term(termQuery.field, phrase.ToLower()))
                            .ToArray()
                    );
                }
                query = phraseQuery;
            }

            // NRT reader from the writer so uncommitted changes are searchable.
            var reader = writer.DocsWriter.GetReader(applyAllDeletes: true);
            var searcher = new IndexSearcher(reader);
            var luceneResult = searcher.Search(query, queryDefinition.Limit);
            foreach (var doc in luceneResult.ScoreDocs)
            {
                var foundDoc = searcher.Doc(doc.Doc);
                // inflateDocument maps the stored Lucene document back to T.
                hits.Add(await inflateDocument(foundDoc));
            }
            result.TotalHits = luceneResult.TotalHits;
            result.Hits = hits;

            // Facets
            if (queryDefinition.Facets.Any())
            {
                FacetsConfig facetsConfig = new FacetsConfig();
                FacetsCollector fc = new FacetsCollector();
                // Re-runs the query to collect facet ordinals for the matches.
                FacetsCollector.Search(searcher, query, queryDefinition.FacetMax, fc);
                // NOTE(review): the taxonomy reader is opened from disk on
                // every faceted search; caching/reusing it would avoid the
                // per-query open cost — verify lifetime constraints first.
                using (var taxonomyReader = new DirectoryTaxonomyReader(FSDirectory.Open(Path.Combine(options.IndexPath, indexType, "taxonomy"))))
                {
                    var facets = new FastTaxonomyFacetCounts(taxonomyReader, facetsConfig, fc);
                    foreach (var facet in queryDefinition.Facets)
                    {
                        var facetGroup = new FacetGroup { Field = facet };
                        facetGroup.Facets = facets.GetTopChildren(queryDefinition.FacetMax, facet).LabelValues
                            .Select(x => new Facet { Key = x.Label, Count = (long)x.Value })
                            .ToArray();
                        result.FacetGroups.Add(facetGroup);
                    }
                }
            }
        }
        return (result);
    }
}
/// <summary>
/// Gets the taxonomy facet counts.
/// </summary>
/// <param name="taxoReader">The taxonomy reader.</param>
/// <param name="config">The facet configuration.</param>
/// <param name="collector">The result collector.</param>
/// <param name="indexFieldName">The index field name.</param>
/// <returns>The facets.</returns>
private Facets GetTaxonomyFacetCounts(TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector collector, string indexFieldName)
{
    // Count facets directly from the doc values of the given index field.
    return new FastTaxonomyFacetCounts(indexFieldName, taxoReader, config, collector);
}
/// <summary>
/// Verifies RandomSamplingFacetsCollector: an empty query must yield zero
/// hits without errors, a 100%-sample must match exact facet counts, and a
/// 10%-sample (after amortization) must be statistically close to the truth.
/// </summary>
public virtual void TestRandomSampling()
{
    Directory dir = NewDirectory();
    Directory taxoDir = NewDirectory();

    DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
    RandomIndexWriter writer = new RandomIndexWriter(Random(), dir, Similarity, TimeZone);

    FacetsConfig config = new FacetsConfig();

    // Index at least 10k docs; each doc carries an even/odd marker field and
    // an "iMod10" facet with one of ten values (0..9).
    int numDocs = AtLeast(10000);
    for (int i = 0; i < numDocs; i++)
    {
        Document doc = new Document();
        doc.Add(new StringField("EvenOdd", (i % 2 == 0) ? "even" : "odd", Store.NO));
        doc.Add(new FacetField("iMod10", Convert.ToString(i % 10)));
        writer.AddDocument(config.Build(taxoWriter, doc));
    }
    Random random = Random();

    // NRT open
    IndexSearcher searcher = NewSearcher(writer.Reader);
    var taxoReader = new DirectoryTaxonomyReader(taxoWriter);
    IOUtils.Close(writer, taxoWriter);

    // Test empty results
    RandomSamplingFacetsCollector collectRandomZeroResults = new RandomSamplingFacetsCollector(numDocs / 10, random.NextLong());

    // There should be no divisions by zero
    searcher.Search(new TermQuery(new Term("EvenOdd", "NeverMatches")), collectRandomZeroResults);

    // There should be no divisions by zero and no null result
    Assert.NotNull(collectRandomZeroResults.GetMatchingDocs());

    // There should be no results at all
    foreach (MatchingDocs doc in collectRandomZeroResults.GetMatchingDocs())
    {
        Assert.AreEqual(0, doc.TotalHits);
    }

    // Now start searching and retrieve results.

    // Use a query to select half of the documents.
    TermQuery query = new TermQuery(new Term("EvenOdd", "even"));

    // there will be 5 facet values (0, 2, 4, 6 and 8), as only the even
    // (i % 10) are hits.
    // there is a REAL small chance that one of the 5 values will be missed when
    // sampling. but is that 0.8 (chance not to take a value) ^ 2000 * 5 (any
    // can be missing) ~ 10^-193, so that is probably not going to happen.
    int maxNumChildren = 5;

    RandomSamplingFacetsCollector random100Percent = new RandomSamplingFacetsCollector(numDocs, random.NextLong()); // no sampling
    RandomSamplingFacetsCollector random10Percent = new RandomSamplingFacetsCollector(numDocs / 10, random.NextLong()); // 10 % of total docs, 20% of the hits

    FacetsCollector fc = new FacetsCollector();

    // One search pass feeds the exact collector and both samplers.
    searcher.Search(query, MultiCollector.Wrap(fc, random100Percent, random10Percent));

    FastTaxonomyFacetCounts random10FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random10Percent);
    FastTaxonomyFacetCounts random100FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random100Percent);
    FastTaxonomyFacetCounts exactFacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, fc);

    FacetResult random10Result = random10Percent.AmortizeFacetCounts(random10FacetCounts.GetTopChildren(10, "iMod10"), config, searcher);
    FacetResult random100Result = random100FacetCounts.GetTopChildren(10, "iMod10");
    FacetResult exactResult = exactFacetCounts.GetTopChildren(10, "iMod10");

    // A 100% sample is no sample at all: counts must match exactly.
    Assert.AreEqual(random100Result, exactResult);

    // we should have five children, but there is a small chance we have less.
    // (see above).
    Assert.True(random10Result.ChildCount <= maxNumChildren);
    // there should be one child at least.
    Assert.True(random10Result.ChildCount >= 1);

    // now calculate some statistics to determine if the sampled result is 'ok'.
    // because random sampling is used, the results will vary each time.
    int sum = 0;
    foreach (LabelAndValue lav in random10Result.LabelValues)
    {
        sum += (int)lav.Value;
    }
    float mu = (float)sum / (float)maxNumChildren;

    float variance = 0;
    foreach (LabelAndValue lav in random10Result.LabelValues)
    {
        variance += (float)Math.Pow((mu - (int)lav.Value), 2);
    }
    variance = variance / maxNumChildren;
    float sigma = (float)Math.Sqrt(variance);

    // we query only half the documents and have 5 categories. The average
    // number of docs in a category will thus be the total divided by 5*2
    float targetMu = numDocs / (5.0f * 2.0f);

    // the average should be in the range and the standard deviation should not
    // be too great
    Assert.True(sigma < 200);
    Assert.True(targetMu - 3 * sigma < mu && mu < targetMu + 3 * sigma);

    IOUtils.Close(searcher.IndexReader, taxoReader, dir, taxoDir);
}