Counts or aggregates for a single dimension.
Пример #1
0
        public virtual void TestAddSameDocTwice()
        {
            // LUCENE-5367: this was a problem with the previous code, making sure it
            // works with the new code.
            Directory               indexDir = NewDirectory(), taxoDir = NewDirectory();
            IndexWriter             indexWriter  = new IndexWriter(indexDir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)));
            DirectoryTaxonomyWriter taxoWriter   = new DirectoryTaxonomyWriter(taxoDir);
            FacetsConfig            facetsConfig = new FacetsConfig();
            Document doc = new Document();

            doc.Add(new FacetField("a", "b"));
            doc = facetsConfig.Build(taxoWriter, doc);
            // these two addDocument() used to fail
            indexWriter.AddDocument(doc);
            indexWriter.AddDocument(doc);
            IOUtils.Dispose(indexWriter, taxoWriter);

            DirectoryReader         indexReader = DirectoryReader.Open(indexDir);
            DirectoryTaxonomyReader taxoReader  = new DirectoryTaxonomyReader(taxoDir);
            IndexSearcher           searcher    = NewSearcher(indexReader);
            FacetsCollector         fc          = new FacetsCollector();

            searcher.Search(new MatchAllDocsQuery(), fc);

            Facets      facets = GetTaxonomyFacetCounts(taxoReader, facetsConfig, fc);
            FacetResult res    = facets.GetTopChildren(10, "a");

            Assert.AreEqual(1, res.LabelValues.Length);
            Assert.AreEqual(2, res.LabelValues[0].Value);
            IOUtils.Dispose(indexReader, taxoReader);

            IOUtils.Dispose(indexDir, taxoDir);
        }
Пример #2
0
        public override bool Equals(object other)
        {
            if ((other is FacetResult) == false)
            {
                return(false);
            }
            FacetResult other2 = (FacetResult)other;

            return(Value.Equals(other2.Value) && ChildCount == other2.ChildCount && Arrays.Equals(LabelValues, other2.LabelValues));
        }
Пример #3
0
 protected internal virtual void AssertFloatValuesEquals(FacetResult a, FacetResult b)
 {
     Assert.AreEqual(a.Dim, b.Dim);
     Assert.True(Arrays.Equals(a.Path, b.Path));
     Assert.AreEqual(a.ChildCount, b.ChildCount);
     Assert.AreEqual((float)a.Value, (float)b.Value, (float)a.Value / 1e5);
     Assert.AreEqual(a.LabelValues.Length, b.LabelValues.Length);
     for (int i = 0; i < a.LabelValues.Length; i++)
     {
         Assert.AreEqual(a.LabelValues[i].Label, b.LabelValues[i].Label);
         Assert.AreEqual((float)a.LabelValues[i].Value, (float)b.LabelValues[i].Value, (float)a.LabelValues[i].Value / 1e5);
     }
 }
Пример #4
0
        /// <summary>
        /// Note: if you use a counting <see cref="Facets"/> implementation, you can amortize the
        /// sampled counts by calling this method. Uses the <see cref="FacetsConfig"/> and
        /// the <see cref="IndexSearcher"/> to determine the upper bound for each facet value.
        /// </summary>
        public virtual FacetResult AmortizeFacetCounts(FacetResult res, FacetsConfig config, IndexSearcher searcher)
        {
            if (res == null || totalHits <= sampleSize)
            {
                return(res);
            }

            LabelAndValue[] fixedLabelValues = new LabelAndValue[res.LabelValues.Length];
            IndexReader     reader           = searcher.IndexReader;
            DimConfig       dimConfig        = config.GetDimConfig(res.Dim);

            // +2 to prepend dimension, append child label
            string[] childPath = new string[res.Path.Length + 2];
            childPath[0] = res.Dim;

            Array.Copy(res.Path, 0, childPath, 1, res.Path.Length); // reuse

            for (int i = 0; i < res.LabelValues.Length; i++)
            {
                childPath[res.Path.Length + 1] = res.LabelValues[i].Label;
                string fullPath       = FacetsConfig.PathToString(childPath, childPath.Length);
                int    max            = reader.DocFreq(new Term(dimConfig.IndexFieldName, fullPath));
                int    correctedCount = (int)((double)res.LabelValues[i].Value / samplingRate);
                correctedCount      = Math.Min(max, correctedCount);
                fixedLabelValues[i] = new LabelAndValue(res.LabelValues[i].Label, correctedCount);
            }

            // cap the total count on the total number of non-deleted documents in the reader
            int correctedTotalCount = (int)res.Value;

            if (correctedTotalCount > 0)
            {
                correctedTotalCount = Math.Min(reader.NumDocs, (int)((double)res.Value / samplingRate));
            }

            return(new FacetResult(res.Dim, res.Path, correctedTotalCount, fixedLabelValues, res.ChildCount));
        }
        public virtual void TestRandomSampling()
        {
            Directory dir     = NewDirectory();
            Directory taxoDir = NewDirectory();

            DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
            RandomIndexWriter       writer     = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
                this,
#endif
                Random, dir);

            FacetsConfig config = new FacetsConfig();

            int numDocs = AtLeast(10000);

            for (int i = 0; i < numDocs; i++)
            {
                Document doc = new Document();
                doc.Add(new StringField("EvenOdd", (i % 2 == 0) ? "even" : "odd", Store.NO));
                doc.Add(new FacetField("iMod10", Convert.ToString(i % 10, CultureInfo.InvariantCulture)));
                writer.AddDocument(config.Build(taxoWriter, doc));
            }
            Random random = Random;

            // NRT open
            IndexSearcher searcher   = NewSearcher(writer.GetReader());
            var           taxoReader = new DirectoryTaxonomyReader(taxoWriter);

            IOUtils.Dispose(writer, taxoWriter);

            // Test empty results
            RandomSamplingFacetsCollector collectRandomZeroResults = new RandomSamplingFacetsCollector(numDocs / 10, random.NextInt64());

            // There should be no divisions by zero
            searcher.Search(new TermQuery(new Term("EvenOdd", "NeverMatches")), collectRandomZeroResults);

            // There should be no divisions by zero and no null result
            Assert.IsNotNull(collectRandomZeroResults.GetMatchingDocs());

            // There should be no results at all
            foreach (MatchingDocs doc in collectRandomZeroResults.GetMatchingDocs())
            {
                Assert.AreEqual(0, doc.TotalHits);
            }

            // Now start searching and retrieve results.

            // Use a query to select half of the documents.
            TermQuery query = new TermQuery(new Term("EvenOdd", "even"));

            // there will be 5 facet values (0, 2, 4, 6 and 8), as only the even (i %
            // 10) are hits.
            // there is a REAL small chance that one of the 5 values will be missed when
            // sampling.
            // but is that 0.8 (chance not to take a value) ^ 2000 * 5 (any can be
            // missing) ~ 10^-193
            // so that is probably not going to happen.
            int maxNumChildren = 5;

            RandomSamplingFacetsCollector random100Percent = new RandomSamplingFacetsCollector(numDocs, random.NextInt64());      // no sampling
            RandomSamplingFacetsCollector random10Percent  = new RandomSamplingFacetsCollector(numDocs / 10, random.NextInt64()); // 10 % of total docs, 20% of the hits

            FacetsCollector fc = new FacetsCollector();

            searcher.Search(query, MultiCollector.Wrap(fc, random100Percent, random10Percent));

            FastTaxonomyFacetCounts random10FacetCounts  = new FastTaxonomyFacetCounts(taxoReader, config, random10Percent);
            FastTaxonomyFacetCounts random100FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random100Percent);
            FastTaxonomyFacetCounts exactFacetCounts     = new FastTaxonomyFacetCounts(taxoReader, config, fc);

            FacetResult random10Result  = random10Percent.AmortizeFacetCounts(random10FacetCounts.GetTopChildren(10, "iMod10"), config, searcher);
            FacetResult random100Result = random100FacetCounts.GetTopChildren(10, "iMod10");
            FacetResult exactResult     = exactFacetCounts.GetTopChildren(10, "iMod10");

            Assert.AreEqual(random100Result, exactResult);

            // we should have five children, but there is a small chance we have less.
            // (see above).
            Assert.IsTrue(random10Result.ChildCount <= maxNumChildren);
            // there should be one child at least.
            Assert.IsTrue(random10Result.ChildCount >= 1);

            // now calculate some statistics to determine if the sampled result is 'ok'.
            // because random sampling is used, the results will vary each time.
            int sum = 0;

            foreach (LabelAndValue lav in random10Result.LabelValues)
            {
                sum += (int)lav.Value;
            }
            float mu = (float)sum / (float)maxNumChildren;

            float variance = 0;

            foreach (LabelAndValue lav in random10Result.LabelValues)
            {
                variance += (float)Math.Pow((mu - (int)lav.Value), 2);
            }
            variance = variance / maxNumChildren;
            float sigma = (float)Math.Sqrt(variance);

            // we query only half the documents and have 5 categories. The average
            // number of docs in a category will thus be the total divided by 5*2
            float targetMu = numDocs / (5.0f * 2.0f);

            // the average should be in the range and the standard deviation should not
            // be too great
            Assert.IsTrue(sigma < 200);
            Assert.IsTrue(targetMu - 3 * sigma < mu && mu < targetMu + 3 * sigma);

            IOUtils.Dispose(searcher.IndexReader, taxoReader, dir, taxoDir);
        }
        /// <summary>
        /// Note: if you use a counting <seealso cref="Facets"/> implementation, you can amortize the
        /// sampled counts by calling this method. Uses the <seealso cref="FacetsConfig"/> and
        /// the <seealso cref="IndexSearcher"/> to determine the upper bound for each facet value.
        /// </summary>
        public virtual FacetResult AmortizeFacetCounts(FacetResult res, FacetsConfig config, IndexSearcher searcher)
        {
            if (res == null || totalHits <= sampleSize)
            {
                return res;
            }

            LabelAndValue[] fixedLabelValues = new LabelAndValue[res.LabelValues.Length];
            IndexReader reader = searcher.IndexReader;
            DimConfig dimConfig = config.GetDimConfig(res.Dim);

            // +2 to prepend dimension, append child label
            string[] childPath = new string[res.Path.Length + 2];
            childPath[0] = res.Dim;

            Array.Copy(res.Path, 0, childPath, 1, res.Path.Length); // reuse

            for (int i = 0; i < res.LabelValues.Length; i++)
            {
                childPath[res.Path.Length + 1] = res.LabelValues[i].label;
                string fullPath = FacetsConfig.PathToString(childPath, childPath.Length);
                int max = reader.DocFreq(new Term(dimConfig.IndexFieldName, fullPath));
                int correctedCount = (int)((double)res.LabelValues[i].value / samplingRate);
                correctedCount = Math.Min(max, correctedCount);
                fixedLabelValues[i] = new LabelAndValue(res.LabelValues[i].label, correctedCount);
            }

            // cap the total count on the total number of non-deleted documents in the reader
            int correctedTotalCount = (int)res.Value;
            if (correctedTotalCount > 0)
            {
                correctedTotalCount = Math.Min(reader.NumDocs, (int)((double)res.Value / samplingRate));
            }

            return new FacetResult(res.Dim, res.Path, correctedTotalCount, fixedLabelValues, res.ChildCount);
        }