/// <summary>Runs a match-all search and returns facet counts for the "Author" and "Publish Date" dimensions.</summary>
        /// <returns>
        /// Two entries, in this order: the top 10 children of "Author",
        /// then the top 10 children of "Publish Date".
        /// </returns>
        private IList<FacetResult> FacetsWithSearch()
        {
            using (DirectoryReader reader = DirectoryReader.Open(indexDir))
            using (TaxonomyReader taxonomy = new DirectoryTaxonomyReader(taxoDir))
            {
                FacetsCollector collector = new FacetsCollector();

                // MatchAllDocsQuery means "browse": facets are counted for every
                // non-deleted document. A real application would normally pass
                // the user's own query here instead.
                FacetsCollector.Search(new IndexSearcher(reader), new MatchAllDocsQuery(), 10, collector);

                // Turn the collected hits into per-dimension counts.
                Facets counts = new FastTaxonomyFacetCounts(taxonomy, config, collector);

                return new List<FacetResult>
                {
                    counts.GetTopChildren(10, "Author"),
                    counts.GetTopChildren(10, "Publish Date")
                };
            } // reader and taxonomy are disposed here
        }
Beispiel #2
0
        /// <summary>
        /// Counts facets for a match-all query by feeding the search straight into a
        /// <see cref="FacetsCollector"/>, without collecting the matching documents.
        /// </summary>
        /// <returns>
        /// Two entries, in this order: the top 10 children of "Author",
        /// then the top 10 children of "Publish Date".
        /// </returns>
        private IList<FacetResult> FacetsOnly()
        {
            using DirectoryReader reader = DirectoryReader.Open(indexDir);
            using TaxonomyReader taxonomy = new DirectoryTaxonomyReader(taxoDir);

            FacetsCollector collector = new FacetsCollector();

            // MatchAllDocsQuery means "browse": facets are counted for every
            // non-deleted document. A real application would normally pass
            // the user's own query here instead. No filter is applied.
            new IndexSearcher(reader).Search(new MatchAllDocsQuery(), null /* filter */, collector);

            Facets counts = new FastTaxonomyFacetCounts(taxonomy, config, collector);

            // Gather the counts for both dimensions.
            IList<FacetResult> topChildren = new List<FacetResult>();
            topChildren.Add(counts.GetTopChildren(10, "Author"));
            topChildren.Add(counts.GetTopChildren(10, "Publish Date"));

            return topChildren;
        }
Beispiel #3
0
        /// <summary>
        /// Simulates a user drilling down into 'Publish Date/2010' and
        /// reports the 'Author' facet counts for that drill-down.
        /// </summary>
        /// <returns>The top 10 children of the "Author" dimension.</returns>
        private FacetResult DrillDown()
        {
            using DirectoryReader reader = DirectoryReader.Open(indexDir);
            using TaxonomyReader taxonomy = new DirectoryTaxonomyReader(taxoDir);

            // No base query is supplied, so the drill-down is applied
            // to all documents ("browse only").
            DrillDownQuery drillDown = new DrillDownQuery(config);

            // Restrict to Publish Date/2010.
            drillDown.Add("Publish Date", "2010");

            FacetsCollector collector = new FacetsCollector();
            FacetsCollector.Search(new IndexSearcher(reader), drillDown, 10, collector);

            // Compute the counts and pick out the requested dimension.
            Facets counts = new FastTaxonomyFacetCounts(taxonomy, config, collector);
            return counts.GetTopChildren(10, "Author");
        }
        public virtual void TestRandomSampling()
        {
            // Checks RandomSamplingFacetsCollector in three situations:
            //   1. a query without hits (must yield non-null, empty matching docs),
            //   2. a sample size equal to the number of docs (must equal exact counts),
            //   3. a sample of a tenth of the docs (amortized counts checked statistically).
            Directory indexDirectory    = NewDirectory();
            Directory taxonomyDirectory = NewDirectory();

            DirectoryTaxonomyWriter taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDirectory);
            RandomIndexWriter       indexWriter    = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
                this,
#endif
                Random, indexDirectory);

            FacetsConfig facetsConfig = new FacetsConfig();

            // Every document gets an "EvenOdd" term (used for querying) and an
            // "iMod10" facet holding the last decimal digit of its ordinal.
            int docCount = AtLeast(10000);

            for (int ordinal = 0; ordinal < docCount; ++ordinal)
            {
                string parity    = (ordinal % 2 == 0) ? "even" : "odd";
                string lastDigit = Convert.ToString(ordinal % 10, CultureInfo.InvariantCulture);

                Document document = new Document();
                document.Add(new StringField("EvenOdd", parity, Store.NO));
                document.Add(new FacetField("iMod10", lastDigit));
                indexWriter.AddDocument(facetsConfig.Build(taxonomyWriter, document));
            }
            Random rng = Random;

            // NRT open: take the readers from the writers, then release the writers.
            IndexSearcher           searcher       = NewSearcher(indexWriter.GetReader());
            DirectoryTaxonomyReader taxonomyReader = new DirectoryTaxonomyReader(taxonomyWriter);

            IOUtils.Dispose(indexWriter, taxonomyWriter);

            // --- Case 1: a term that matches nothing -------------------------------
            RandomSamplingFacetsCollector emptySampler = new RandomSamplingFacetsCollector(docCount / 10, rng.NextInt64());

            // Searching must not divide by zero ...
            searcher.Search(new TermQuery(new Term("EvenOdd", "NeverMatches")), emptySampler);

            // ... and neither must fetching the matching docs, which may not be null.
            Assert.IsNotNull(emptySampler.GetMatchingDocs());

            // No entry may report any hits.
            foreach (MatchingDocs matching in emptySampler.GetMatchingDocs())
            {
                Assert.AreEqual(0, matching.TotalHits);
            }

            // --- Cases 2 and 3: query that selects half of the documents ----------
            TermQuery evenDocsQuery = new TermQuery(new Term("EvenOdd", "even"));

            // Only even ordinals are hits, so only the facet values 0, 2, 4, 6 and 8
            // can show up. Sampling could in theory miss one of them, but the chance
            // is 0.8 (value not taken) ^ 2000 * 5 (any of them missing) ~ 10^-193,
            // so that is not expected to happen in practice.
            int expectedChildren = 5;

            RandomSamplingFacetsCollector fullSampler  = new RandomSamplingFacetsCollector(docCount, rng.NextInt64());      // no sampling
            RandomSamplingFacetsCollector tenthSampler = new RandomSamplingFacetsCollector(docCount / 10, rng.NextInt64()); // 10 % of total docs, 20% of the hits

            FacetsCollector exactCollector = new FacetsCollector();

            // One search pass feeds all three collectors.
            searcher.Search(evenDocsQuery, MultiCollector.Wrap(exactCollector, fullSampler, tenthSampler));

            FastTaxonomyFacetCounts tenthCounts = new FastTaxonomyFacetCounts(taxonomyReader, facetsConfig, tenthSampler);
            FastTaxonomyFacetCounts fullCounts  = new FastTaxonomyFacetCounts(taxonomyReader, facetsConfig, fullSampler);
            FastTaxonomyFacetCounts exactCounts = new FastTaxonomyFacetCounts(taxonomyReader, facetsConfig, exactCollector);

            FacetResult tenthResult = tenthSampler.AmortizeFacetCounts(tenthCounts.GetTopChildren(10, "iMod10"), facetsConfig, searcher);
            FacetResult fullResult  = fullCounts.GetTopChildren(10, "iMod10");
            FacetResult exactResult = exactCounts.GetTopChildren(10, "iMod10");

            // A sample as large as the index must reproduce the exact counts.
            Assert.AreEqual(fullResult, exactResult);

            // Five children are expected; fewer is possible (see above),
            // but there must be at least one.
            Assert.IsTrue(tenthResult.ChildCount <= expectedChildren);
            Assert.IsTrue(tenthResult.ChildCount >= 1);

            // Sampling is random, so the amortized counts differ per run. Judge
            // them by their mean and standard deviation instead of exact values.
            int amortizedTotal = 0;

            foreach (LabelAndValue child in tenthResult.LabelValues)
            {
                amortizedTotal += (int)child.Value;
            }
            float mean = amortizedTotal / (float)expectedChildren;

            float spread = 0;

            foreach (LabelAndValue child in tenthResult.LabelValues)
            {
                spread += (float)Math.Pow((mean - (int)child.Value), 2);
            }
            spread /= expectedChildren;
            float sigma = (float)Math.Sqrt(spread);

            // Half of the documents match and they fall into 5 categories, so the
            // average category size should be the total divided by 5*2.
            float expectedMean = docCount / (5.0f * 2.0f);

            // The deviation must stay small and the mean must lie within
            // three sigma of the expected mean.
            Assert.IsTrue(sigma < 200);
            float lowerBound = expectedMean - 3 * sigma;
            float upperBound = expectedMean + 3 * sigma;
            Assert.IsTrue(lowerBound < mean && mean < upperBound);

            IOUtils.Dispose(searcher.IndexReader, taxonomyReader, indexDirectory, taxonomyDirectory);
        }
Beispiel #5
0
        /// <summary>
        /// Searches the index with the criteria in <paramref name="queryDefinition"/> and returns
        /// the inflated hits, the total hit count and, if any facets were requested, their counts.
        /// </summary>
        /// <param name="queryDefinition">
        /// Supplies the term queries (when there are none, all documents match), the maximum
        /// number of hits (<c>Limit</c>), the facet names and the per-facet maximum (<c>FacetMax</c>).
        /// </param>
        /// <param name="cancellationToken">Passed to the reader-lock acquisition.</param>
        /// <returns>
        /// A <see cref="SearchResult{T}"/> whose <c>Hits</c>, <c>TotalHits</c> and
        /// <c>FacetGroups</c> have been filled in.
        /// </returns>
        public async Task<SearchResult<T>> SearchAsync(SearchQuery queryDefinition, CancellationToken cancellationToken = default)
        {
            using (await writerLock.ReaderLockAsync(cancellationToken))
            {
                var     result = new SearchResult<T>();
                List<T> hits   = new List<T>();

                using (var writer = getWriter())
                {
                    Query query = new MatchAllDocsQuery();

                    // Term queries: the words of each value are handed to the
                    // phrase query as one array of terms for that query's field.
                    if (queryDefinition.TermQueries.Any())
                    {
                        var phraseQuery = new MultiPhraseQuery();
                        foreach (var termQuery in queryDefinition.TermQueries)
                        {
                            phraseQuery.Add(
                                termQuery.value
                                .Split(" ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries)
                                // Lower-case invariantly: index terms are not user-facing text, and
                                // ToLower() would change them with the thread culture (e.g. Turkish 'I').
                                .Select(phrase => new Term(termQuery.field, phrase.ToLowerInvariant()))
                                .ToArray()
                                );
                        }
                        query = phraseQuery;
                    }

                    // The reader obtained from the writer must be released by the caller;
                    // keep it open until both the hit loading and the facet counting are done.
                    using (var reader = writer.DocsWriter.GetReader(applyAllDeletes: true))
                    {
                        var searcher     = new IndexSearcher(reader);
                        var luceneResult = searcher.Search(query, queryDefinition.Limit);

                        foreach (var doc in luceneResult.ScoreDocs)
                        {
                            var foundDoc = searcher.Doc(doc.Doc);
                            hits.Add(await inflateDocument(foundDoc));
                        }

                        result.TotalHits = luceneResult.TotalHits;
                        result.Hits      = hits;

                        // Facets
                        if (queryDefinition.Facets.Any())
                        {
                            FacetsConfig    facetsConfig = new FacetsConfig();
                            FacetsCollector fc           = new FacetsCollector();
                            FacetsCollector.Search(searcher, query, queryDefinition.FacetMax, fc);

                            // The taxonomy directory is opened here, so it is disposed here as well;
                            // it is released after the taxonomy reader that reads from it.
                            using (var taxonomyDirectory = FSDirectory.Open(Path.Combine(options.IndexPath, indexType, "taxonomy")))
                            using (var taxonomyReader = new DirectoryTaxonomyReader(taxonomyDirectory))
                            {
                                var facets = new FastTaxonomyFacetCounts(taxonomyReader, facetsConfig, fc);
                                foreach (var facet in queryDefinition.Facets)
                                {
                                    var facetGroup = new FacetGroup
                                    {
                                        Field = facet
                                    };
                                    facetGroup.Facets =
                                        facets.GetTopChildren(queryDefinition.FacetMax, facet).LabelValues
                                        .Select(x => new Facet { Key = x.Label, Count = (long)x.Value })
                                        .ToArray();
                                    result.FacetGroups.Add(facetGroup);
                                }
                            }
                        }
                    }
                }

                return result;
            }
        }
        public virtual void TestRandomSampling()
        {
            // Indexes documents tagged "even"/"odd" with an "iMod10" facet, then
            // runs RandomSamplingFacetsCollector with no hits, with a sample size
            // equal to the number of docs, and with a tenth of that size.
            Directory indexDir = NewDirectory();
            Directory taxonomyDir = NewDirectory();

            var taxonomyWriter = new DirectoryTaxonomyWriter(taxonomyDir);
            var indexWriter = new RandomIndexWriter(Random(), indexDir, Similarity, TimeZone);

            var facetsConfig = new FacetsConfig();

            int totalDocs = AtLeast(10000);
            for (int n = 0; n < totalDocs; n++)
            {
                var entry = new Document();
                entry.Add(new StringField("EvenOdd", (n % 2 == 0) ? "even" : "odd", Store.NO));
                entry.Add(new FacetField("iMod10", Convert.ToString(n % 10)));
                indexWriter.AddDocument(facetsConfig.Build(taxonomyWriter, entry));
            }
            Random seedSource = Random();

            // NRT open
            IndexSearcher searcher = NewSearcher(indexWriter.Reader);
            var taxonomyReader = new DirectoryTaxonomyReader(taxonomyWriter);
            IOUtils.Close(indexWriter, taxonomyWriter);

            // Empty results: neither the search nor the retrieval of the matching
            // docs may divide by zero, and the matching docs may not be null.
            var zeroHitSampler = new RandomSamplingFacetsCollector(totalDocs / 10, seedSource.NextLong());
            searcher.Search(new TermQuery(new Term("EvenOdd", "NeverMatches")), zeroHitSampler);
            Assert.NotNull(zeroHitSampler.GetMatchingDocs());

            // Nothing matched, so every entry must report zero hits.
            foreach (MatchingDocs entry in zeroHitSampler.GetMatchingDocs())
            {
                Assert.AreEqual(0, entry.TotalHits);
            }

            // From here on, search for real: this query selects half of the documents.
            var evenQuery = new TermQuery(new Term("EvenOdd", "even"));

            // Because only even documents are hits, exactly 5 facet values
            // (0, 2, 4, 6 and 8) can occur. Sampling might miss one of them, but
            // that chance is 0.8 (value not taken) ^ 2000 * 5 (any may be missing)
            // ~ 10^-193, i.e. it is not going to happen in practice.
            int facetValueCount = 5;

            var sampleAll = new RandomSamplingFacetsCollector(totalDocs, seedSource.NextLong()); // no sampling
            var sampleTenth = new RandomSamplingFacetsCollector(totalDocs / 10, seedSource.NextLong()); // 10 % of total docs, 20% of the hits

            var exact = new FacetsCollector();

            searcher.Search(evenQuery, MultiCollector.Wrap(exact, sampleAll, sampleTenth));

            var countsTenth = new FastTaxonomyFacetCounts(taxonomyReader, facetsConfig, sampleTenth);
            var countsAll = new FastTaxonomyFacetCounts(taxonomyReader, facetsConfig, sampleAll);
            var countsExact = new FastTaxonomyFacetCounts(taxonomyReader, facetsConfig, exact);

            FacetResult resultTenth = sampleTenth.AmortizeFacetCounts(countsTenth.GetTopChildren(10, "iMod10"), facetsConfig, searcher);
            FacetResult resultAll = countsAll.GetTopChildren(10, "iMod10");
            FacetResult resultExact = countsExact.GetTopChildren(10, "iMod10");

            // Without actual sampling the result must be identical to the exact one.
            Assert.AreEqual(resultAll, resultExact);

            // Normally five children; possibly fewer (see above), never none.
            Assert.True(resultTenth.ChildCount <= facetValueCount);
            Assert.True(resultTenth.ChildCount >= 1);

            // The sampled numbers vary from run to run, so validate them through
            // simple statistics rather than exact values.
            int total = 0;
            foreach (LabelAndValue labelled in resultTenth.LabelValues)
            {
                total += (int)labelled.Value;
            }
            float average = total / (float)facetValueCount;

            float sumOfSquares = 0;
            foreach (LabelAndValue labelled in resultTenth.LabelValues)
            {
                sumOfSquares += (float)Math.Pow((average - (int)labelled.Value), 2);
            }
            sumOfSquares /= facetValueCount;
            float sigma = (float)Math.Sqrt(sumOfSquares);

            // Half of the documents are queried and spread over 5 categories, so
            // a category holds on average the total divided by 5*2 documents.
            float expectedAverage = totalDocs / (5.0f * 2.0f);

            // The standard deviation must not be too large, and the average has
            // to fall within three sigma of the expected average.
            Assert.True(sigma < 200);
            float low = expectedAverage - 3 * sigma;
            float high = expectedAverage + 3 * sigma;
            Assert.True(low < average && average < high);

            IOUtils.Close(searcher.IndexReader, taxonomyReader, indexDir, taxonomyDir);
        }