public virtual void TestAddSameDocTwice()
{
    // Regression check for LUCENE-5367: adding the very same built document
    // instance to the index twice must succeed and count the facet twice.
    Directory indexDir = NewDirectory();
    Directory taxoDir = NewDirectory();

    IndexWriter indexWriter = new IndexWriter(
        indexDir,
        NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)));
    DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
    FacetsConfig facetsConfig = new FacetsConfig();

    // Build one document carrying the facet path a/b.
    Document rawDocument = new Document();
    rawDocument.Add(new FacetField("a", "b"));
    Document builtDocument = facetsConfig.Build(taxoWriter, rawDocument);

    // Historically the second AddDocument call on the same instance failed.
    indexWriter.AddDocument(builtDocument);
    indexWriter.AddDocument(builtDocument);
    IOUtils.Dispose(indexWriter, taxoWriter);

    // Re-open both sides for reading and collect facets over all documents.
    DirectoryReader indexReader = DirectoryReader.Open(indexDir);
    DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
    IndexSearcher searcher = NewSearcher(indexReader);

    FacetsCollector collector = new FacetsCollector();
    searcher.Search(new MatchAllDocsQuery(), collector);

    Facets facets = GetTaxonomyFacetCounts(taxoReader, facetsConfig, collector);
    FacetResult topChildren = facets.GetTopChildren(10, "a");

    // Exactly one child label is expected, counted once per added document.
    Assert.AreEqual(1, topChildren.LabelValues.Length);
    Assert.AreEqual(2, topChildren.LabelValues[0].Value);

    IOUtils.Dispose(indexReader, taxoReader);
    IOUtils.Dispose(indexDir, taxoDir);
}
public override bool Equals(object other)
{
    // Null and instances of unrelated types never compare equal.
    if (!(other is FacetResult))
    {
        return false;
    }

    FacetResult that = (FacetResult)other;

    // Compare in the same order as before: value, child count, then labels.
    if (!Value.Equals(that.Value))
    {
        return false;
    }

    if (ChildCount != that.ChildCount)
    {
        return false;
    }

    return Arrays.Equals(LabelValues, that.LabelValues);
}
protected internal virtual void AssertFloatValuesEquals(FacetResult a, FacetResult b)
{
    // Asserts that two facet results agree on dimension, path, child count and
    // labels, and that their values match within a relative tolerance of
    // 1/1e5 of the value taken from 'a'.
    Assert.AreEqual(a.Dim, b.Dim);
    Assert.True(Arrays.Equals(a.Path, b.Path));
    Assert.AreEqual(a.ChildCount, b.ChildCount);

    // The tolerance is a double because the divisor 1e5 is a double literal.
    double totalTolerance = (float)a.Value / 1e5;
    Assert.AreEqual((float)a.Value, (float)b.Value, totalTolerance);

    var expectedLabels = a.LabelValues;
    var actualLabels = b.LabelValues;
    Assert.AreEqual(expectedLabels.Length, actualLabels.Length);

    for (int idx = 0; idx < expectedLabels.Length; idx++)
    {
        var expectedEntry = expectedLabels[idx];
        var actualEntry = actualLabels[idx];

        Assert.AreEqual(expectedEntry.Label, actualEntry.Label);

        double entryTolerance = (float)expectedEntry.Value / 1e5;
        Assert.AreEqual((float)expectedEntry.Value, (float)actualEntry.Value, entryTolerance);
    }
}
/// <summary>
/// Scales the sampled counts of a counting <see cref="Facets"/> result back up by
/// dividing each count by the sampling rate. Each scaled child count is capped at
/// the document frequency of the child's facet term, which is looked up through the
/// <see cref="FacetsConfig"/> and the reader of the <see cref="IndexSearcher"/>;
/// the scaled total is capped at the reader's <c>NumDocs</c>.
/// </summary>
/// <param name="res">The sampled facet result; may be <c>null</c>.</param>
/// <param name="config">Supplies the index field name for the result's dimension.</param>
/// <param name="searcher">Its reader provides the document-frequency upper bounds.</param>
/// <returns>
/// <paramref name="res"/> unchanged when it is <c>null</c> or when <c>totalHits</c> does
/// not exceed <c>sampleSize</c>; otherwise a new <see cref="FacetResult"/> with the
/// corrected counts.
/// </returns>
public virtual FacetResult AmortizeFacetCounts(FacetResult res, FacetsConfig config, IndexSearcher searcher)
{
    if (res == null || totalHits <= sampleSize)
    {
        return res;
    }

    LabelAndValue[] sampledEntries = res.LabelValues;
    LabelAndValue[] fixedLabelValues = new LabelAndValue[sampledEntries.Length];
    IndexReader reader = searcher.IndexReader;
    DimConfig dimConfig = config.GetDimConfig(res.Dim);

    // Layout of the reusable path buffer: [dimension, ...path, child label].
    string[] childPath = new string[res.Path.Length + 2];
    childPath[0] = res.Dim;
    Array.Copy(res.Path, 0, childPath, 1, res.Path.Length);
    int childSlot = res.Path.Length + 1;

    for (int idx = 0; idx < sampledEntries.Length; idx++)
    {
        LabelAndValue sampled = sampledEntries[idx];

        // Only the last slot changes between iterations.
        childPath[childSlot] = sampled.Label;
        string termText = FacetsConfig.PathToString(childPath, childPath.Length);

        int upperBound = reader.DocFreq(new Term(dimConfig.IndexFieldName, termText));
        int scaledCount = (int)((double)sampled.Value / samplingRate);

        fixedLabelValues[idx] = new LabelAndValue(sampled.Label, Math.Min(upperBound, scaledCount));
    }

    // A non-positive total is passed through; a positive one is scaled and
    // capped at the reader's NumDocs.
    int truncatedTotal = (int)res.Value;
    int correctedTotalCount = truncatedTotal > 0
        ? Math.Min(reader.NumDocs, (int)((double)res.Value / samplingRate))
        : truncatedTotal;

    return new FacetResult(res.Dim, res.Path, correctedTotalCount, fixedLabelValues, res.ChildCount);
}
public virtual void TestRandomSampling()
{
    // Indexes documents tagged even/odd with an "iMod10" facet, then checks that
    // a 10% sampling collector, once amortized, yields statistically plausible
    // counts, while a 100% "sampling" collector matches the exact counts.
    Directory dir = NewDirectory();
    Directory taxoDir = NewDirectory();

    DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
    RandomIndexWriter writer = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir);

    FacetsConfig config = new FacetsConfig();

    int numDocs = AtLeast(10000);
    for (int docIndex = 0; docIndex < numDocs; docIndex++)
    {
        string parity = (docIndex % 2 == 0) ? "even" : "odd";
        string modLabel = Convert.ToString(docIndex % 10, CultureInfo.InvariantCulture);

        Document document = new Document();
        document.Add(new StringField("EvenOdd", parity, Store.NO));
        document.Add(new FacetField("iMod10", modLabel));
        writer.AddDocument(config.Build(taxoWriter, document));
    }

    Random random = Random;

    // Open near-real-time readers before the writers are disposed.
    IndexSearcher searcher = NewSearcher(writer.GetReader());
    var taxoReader = new DirectoryTaxonomyReader(taxoWriter);

    IOUtils.Dispose(writer, taxoWriter);

    // --- Empty result set ---
    RandomSamplingFacetsCollector collectRandomZeroResults =
        new RandomSamplingFacetsCollector(numDocs / 10, random.NextInt64());

    // Searching for a term that matches nothing must not divide by zero.
    searcher.Search(new TermQuery(new Term("EvenOdd", "NeverMatches")), collectRandomZeroResults);

    // The matching docs must be non-null (and again, no division by zero).
    Assert.IsNotNull(collectRandomZeroResults.GetMatchingDocs());

    // Every segment entry must report zero hits.
    foreach (MatchingDocs emptyMatch in collectRandomZeroResults.GetMatchingDocs())
    {
        Assert.AreEqual(0, emptyMatch.TotalHits);
    }

    // --- Sampled search over half of the documents ---
    TermQuery query = new TermQuery(new Term("EvenOdd", "even"));

    // Only even documents match, so only the facet values 0, 2, 4, 6 and 8 can
    // occur. Sampling could in theory miss one of these five values entirely,
    // but the odds are roughly 0.8 (chance of skipping a value) ^ 2000 * 5
    // (any of them may be the missing one) ~ 10^-193, i.e. negligible.
    int maxNumChildren = 5;

    // "Samples" every document, i.e. no sampling at all.
    RandomSamplingFacetsCollector random100Percent =
        new RandomSamplingFacetsCollector(numDocs, random.NextInt64());

    // Samples 10% of all documents, which is 20% of the hits.
    RandomSamplingFacetsCollector random10Percent =
        new RandomSamplingFacetsCollector(numDocs / 10, random.NextInt64());

    FacetsCollector fc = new FacetsCollector();

    searcher.Search(query, MultiCollector.Wrap(fc, random100Percent, random10Percent));

    FastTaxonomyFacetCounts random10FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random10Percent);
    FastTaxonomyFacetCounts random100FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random100Percent);
    FastTaxonomyFacetCounts exactFacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, fc);

    FacetResult sampledTopChildren = random10FacetCounts.GetTopChildren(10, "iMod10");
    FacetResult random10Result = random10Percent.AmortizeFacetCounts(sampledTopChildren, config, searcher);
    FacetResult random100Result = random100FacetCounts.GetTopChildren(10, "iMod10");
    FacetResult exactResult = exactFacetCounts.GetTopChildren(10, "iMod10");

    Assert.AreEqual(random100Result, exactResult);

    // Five children are expected; fewer is possible with the tiny probability
    // discussed above, but never more than five and never fewer than one.
    Assert.IsTrue(random10Result.ChildCount <= maxNumChildren);
    Assert.IsTrue(random10Result.ChildCount >= 1);

    // --- Statistical sanity check ---
    // Sampling is random, so the outcome differs from run to run; verify the
    // mean and the spread of the amortized counts instead of exact values.
    LabelAndValue[] sampledLabels = random10Result.LabelValues;

    int sum = 0;
    for (int k = 0; k < sampledLabels.Length; k++)
    {
        sum += (int)sampledLabels[k].Value;
    }

    float mu = (float)sum / (float)maxNumChildren;

    float variance = 0;
    for (int k = 0; k < sampledLabels.Length; k++)
    {
        variance += (float)Math.Pow((mu - (int)sampledLabels[k].Value), 2);
    }

    variance = variance / maxNumChildren;
    float sigma = (float)Math.Sqrt(variance);

    // Half of the documents match and they spread over 5 categories, so the
    // expected number of documents per category is the total divided by 5*2.
    float targetMu = numDocs / (5.0f * 2.0f);

    // The spread must stay small and the mean must lie within three sigma of
    // the expected value.
    Assert.IsTrue(sigma < 200);
    Assert.IsTrue(targetMu - 3 * sigma < mu && mu < targetMu + 3 * sigma);

    IOUtils.Dispose(searcher.IndexReader, taxoReader, dir, taxoDir);
}
/// <summary>
/// Corrects the counts of a sampled result produced by a counting <see cref="Facets"/>
/// implementation: every count is divided by the sampling rate. The
/// <see cref="FacetsConfig"/> and the reader of the <see cref="IndexSearcher"/> are used
/// to look up the document frequency of each child's facet term, which serves as the
/// upper bound for that child's corrected count.
/// </summary>
/// <param name="res">The sampled facet result; may be <c>null</c>.</param>
/// <param name="config">Supplies the index field name for the result's dimension.</param>
/// <param name="searcher">Its reader provides the per-term upper bounds and <c>NumDocs</c>.</param>
/// <returns>
/// <paramref name="res"/> itself when it is <c>null</c> or when <c>totalHits</c> is not
/// greater than <c>sampleSize</c>; otherwise a new <see cref="FacetResult"/> holding the
/// corrected counts.
/// </returns>
public virtual FacetResult AmortizeFacetCounts(FacetResult res, FacetsConfig config, IndexSearcher searcher)
{
    bool nothingToCorrect = res == null || totalHits <= sampleSize;
    if (nothingToCorrect)
    {
        return res;
    }

    int labelCount = res.LabelValues.Length;
    LabelAndValue[] fixedLabelValues = new LabelAndValue[labelCount];
    IndexReader reader = searcher.IndexReader;
    DimConfig dimConfig = config.GetDimConfig(res.Dim);

    // The buffer holds the dimension first, then the result's path, and
    // finally one slot that is overwritten with each child label in turn.
    int parentLength = res.Path.Length;
    string[] childPath = new string[parentLength + 2];
    childPath[0] = res.Dim;
    Array.Copy(res.Path, 0, childPath, 1, parentLength);

    int position = 0;
    while (position < labelCount)
    {
        LabelAndValue current = res.LabelValues[position];
        childPath[parentLength + 1] = current.label;

        string fullPath = FacetsConfig.PathToString(childPath, childPath.Length);
        Term facetTerm = new Term(dimConfig.IndexFieldName, fullPath);

        // The corrected count can never exceed the term's document frequency.
        int ceiling = reader.DocFreq(facetTerm);
        int estimate = (int)((double)current.value / samplingRate);
        int bounded = Math.Min(ceiling, estimate);

        fixedLabelValues[position] = new LabelAndValue(current.label, bounded);
        position++;
    }

    // Only a positive total is rescaled; it is limited to the reader's NumDocs.
    int correctedTotalCount = (int)res.Value;
    if (correctedTotalCount > 0)
    {
        int rescaledTotal = (int)((double)res.Value / samplingRate);
        correctedTotalCount = Math.Min(reader.NumDocs, rescaledTotal);
    }

    return new FacetResult(res.Dim, res.Path, correctedTotalCount, fixedLabelValues, res.ChildCount);
}