/// <summary>
/// LUCENE-5367 regression test: adding the same already-built facet document
/// twice used to fail with the previous code; verifies the new code accepts
/// it and counts the duplicated label twice.
/// </summary>
public virtual void TestAddSameDocTwice()
{
    // LUCENE-5367: this was a problem with the previous code, making sure it
    // works with the new code.
    Directory indexDir = NewDirectory(), taxoDir = NewDirectory();
    IndexWriter indexWriter = new IndexWriter(indexDir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())));
    DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
    FacetsConfig facetsConfig = new FacetsConfig();
    Document doc = new Document();
    doc.Add(new FacetField("a", "b"));
    // Build() translates the FacetField into indexable fields; the built
    // document is then reused for both AddDocument calls below.
    doc = facetsConfig.Build(taxoWriter, doc);
    // these two addDocument() used to fail
    indexWriter.AddDocument(doc);
    indexWriter.AddDocument(doc);
    IOUtils.Close(indexWriter, taxoWriter);

    DirectoryReader indexReader = DirectoryReader.Open(indexDir);
    DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
    IndexSearcher searcher = NewSearcher(indexReader);
    FacetsCollector fc = new FacetsCollector();
    searcher.Search(new MatchAllDocsQuery(), fc);

    Facets facets = GetTaxonomyFacetCounts(taxoReader, facetsConfig, fc);
    FacetResult res = facets.GetTopChildren(10, "a");
    // One child label ("b"), counted once per added document => value 2.
    Assert.AreEqual(1, res.LabelValues.Length);
    Assert.AreEqual(2, res.LabelValues[0].value);

    IOUtils.Close(indexReader, taxoReader);
    IOUtils.Close(indexDir, taxoDir);
}
/// <summary>
/// One-time teardown: closes the shared reader, taxonomy reader and
/// directories opened by the class setup, then clears every static
/// reference so the objects can be collected.
/// </summary>
public static void AfterClassDrillDownQueryTest()
{
    IOUtils.Close(reader, taxo, dir, taxoDir);

    // Release the static references (assignment order is irrelevant;
    // everything has already been closed above).
    config = null;
    taxoDir = null;
    dir = null;
    taxo = null;
    reader = null;
}
/// <summary>
/// Used by <see cref="DrillSideways"/>: rebuilds <paramref name="other"/>
/// with <paramref name="filter"/> applied to its base query (clause 0).
/// The remaining clauses are the per-dimension drill-down clauses and are
/// copied over unchanged.
/// </summary>
internal DrillDownQuery(FacetsConfig config, Filter filter, DrillDownQuery other)
{
    this.config = config;
    query = new BooleanQuery(true); // disable coord

    BooleanClause[] otherClauses = other.query.Clauses;
    // When the clause count equals the dimension count there is no base
    // clause present, so there is nothing to wrap in a FilteredQuery.
    if (otherClauses.Length == other.drillDownDims.Count)
    {
        throw new System.ArgumentException("cannot apply filter unless baseQuery isn't null; pass ConstantScoreQuery instead");
    }
    Debug.Assert(otherClauses.Length == 1 + other.drillDownDims.Count, otherClauses.Length + " vs " + (1 + other.drillDownDims.Count));

    drillDownDims.AddAll(other.drillDownDims);

    // Clause 0 is the base query: apply the filter to it.
    query.Add(new FilteredQuery(otherClauses[0].Query, filter), Occur.MUST);
    // Copy the drill-down clauses verbatim.
    for (int idx = 1; idx < otherClauses.Length; idx++)
    {
        query.Add(otherClauses[idx].Query, Occur.MUST);
    }
}
/// <summary>
/// Exercises RandomSamplingFacetsCollector: an empty result set must cause
/// no division by zero; a "sample" covering all docs must reproduce the
/// exact facet counts; and a 10% sample, after AmortizeFacetCounts, must
/// stay within loose statistical bounds of the expected per-category mean.
/// </summary>
public virtual void TestRandomSampling()
{
    Directory dir = NewDirectory();
    Directory taxoDir = NewDirectory();

    DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
    RandomIndexWriter writer = new RandomIndexWriter(Util.LuceneTestCase.Random, dir, Similarity, TimeZone);

    FacetsConfig config = new FacetsConfig();

    int numDocs = AtLeast(10000);
    for (int i = 0; i < numDocs; i++)
    {
        Document doc = new Document();
        // Half the docs are "even", half "odd"; facet dimension is i mod 10.
        doc.Add(new StringField("EvenOdd", (i % 2 == 0) ? "even" : "odd", Store.NO));
        doc.Add(new FacetField("iMod10", Convert.ToString(i % 10)));
        writer.AddDocument(config.Build(taxoWriter, doc));
    }
    Random random = Random;

    // NRT open
    IndexSearcher searcher = NewSearcher(writer.GetReader());
    var taxoReader = new DirectoryTaxonomyReader(taxoWriter);
    IOUtils.Dispose(writer, taxoWriter);

    // Test empty results
    RandomSamplingFacetsCollector collectRandomZeroResults = new RandomSamplingFacetsCollector(numDocs / 10, random.NextInt64());

    // There should be no divisions by zero
    searcher.Search(new TermQuery(new Term("EvenOdd", "NeverMatches")), collectRandomZeroResults);

    // There should be no divisions by zero and no null result
    Assert.NotNull(collectRandomZeroResults.GetMatchingDocs());

    // There should be no results at all
    foreach (MatchingDocs doc in collectRandomZeroResults.GetMatchingDocs())
    {
        Assert.AreEqual(0, doc.TotalHits);
    }

    // Now start searching and retrieve results.

    // Use a query to select half of the documents.
    TermQuery query = new TermQuery(new Term("EvenOdd", "even"));

    // there will be 5 facet values (0, 2, 4, 6 and 8), as only the even
    // (i % 10) are hits. there is a REAL small chance that one of the 5
    // values will be missed when sampling, but that is 0.8 (chance not to
    // take a value) ^ 2000 * 5 (any can be missing) ~ 10^-193, so that is
    // probably not going to happen.
    int maxNumChildren = 5;

    RandomSamplingFacetsCollector random100Percent = new RandomSamplingFacetsCollector(numDocs, random.NextInt64()); // no sampling
    RandomSamplingFacetsCollector random10Percent = new RandomSamplingFacetsCollector(numDocs / 10, random.NextInt64()); // 10 % of total docs, 20% of the hits
    FacetsCollector fc = new FacetsCollector();

    searcher.Search(query, MultiCollector.Wrap(fc, random100Percent, random10Percent));

    FastTaxonomyFacetCounts random10FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random10Percent);
    FastTaxonomyFacetCounts random100FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random100Percent);
    FastTaxonomyFacetCounts exactFacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, fc);

    FacetResult random10Result = random10Percent.AmortizeFacetCounts(random10FacetCounts.GetTopChildren(10, "iMod10"), config, searcher);
    FacetResult random100Result = random100FacetCounts.GetTopChildren(10, "iMod10");
    FacetResult exactResult = exactFacetCounts.GetTopChildren(10, "iMod10");

    // A sample covering every document must reproduce the exact counts.
    Assert.AreEqual(random100Result, exactResult);

    // we should have five children, but there is a small chance we have less.
    // (see above).
    Assert.True(random10Result.ChildCount <= maxNumChildren);
    // there should be one child at least.
    Assert.True(random10Result.ChildCount >= 1);

    // now calculate some statistics to determine if the sampled result is 'ok'.
    // because random sampling is used, the results will vary each time.
    int sum = 0;
    foreach (LabelAndValue lav in random10Result.LabelValues)
    {
        sum += (int)lav.Value;
    }
    float mu = (float)sum / (float)maxNumChildren;

    float variance = 0;
    foreach (LabelAndValue lav in random10Result.LabelValues)
    {
        variance += (float)Math.Pow((mu - (int)lav.Value), 2);
    }
    variance = variance / maxNumChildren;
    float sigma = (float)Math.Sqrt(variance);

    // we query only half the documents and have 5 categories. The average
    // number of docs in a category will thus be the total divided by 5*2
    float targetMu = numDocs / (5.0f * 2.0f);

    // the average should be in the range and the standard deviation should
    // not be too great
    Assert.True(sigma < 200);
    Assert.True(targetMu - 3 * sigma < mu && mu < targetMu + 3 * sigma);

    IOUtils.Dispose(searcher.IndexReader, taxoReader, dir, taxoDir);
}
/// <summary>
/// Used by <see cref="Clone"/>: duplicates the wrapped boolean query and
/// copies the map from dimension name to its clause index.
/// </summary>
internal DrillDownQuery(FacetsConfig config, BooleanQuery query, IDictionary<string, int> drillDownDims)
{
    this.config = config;
    this.query = (BooleanQuery)query.Clone();
    this.drillDownDims.PutAll(drillDownDims);
}
/// <summary>
/// Populates the taxonomy and the main index with one document per entry
/// in CATEGORIES; every document also carries the same stored
/// "content:alpha" text field, so a single term query matches them all.
/// </summary>
private static void seedIndex(TaxonomyWriter tw, RandomIndexWriter iw, FacetsConfig config)
{
    foreach (FacetField categoryField in CATEGORIES)
    {
        var document = new Document();
        document.Add(categoryField);
        document.Add(new TextField("content", "alpha", Field.Store.YES));
        iw.AddDocument(config.Build(tw, document));
    }
}
/// <summary>
/// Create <see cref="FastTaxonomyFacetCounts"/>, using the
/// specified <paramref name="indexFieldName"/> for ordinals. Use
/// this if you had set <see cref="FacetsConfig.SetIndexFieldName"/>
/// to change the index field name for certain dimensions.
/// </summary>
public FastTaxonomyFacetCounts(string indexFieldName, TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc)
    : base(indexFieldName, taxoReader, config)
{
    // Aggregate counts over all matching documents gathered by the collector.
    Count(fc.GetMatchingDocs);
}
/// <summary>
/// Creates a drill-down term: the dimension and path components are
/// flattened into a single indexed string via
/// <see cref="FacetsConfig.PathToString"/>.
/// </summary>
public static Term Term(string field, string dim, params string[] path)
{
    string indexedForm = FacetsConfig.PathToString(dim, path);
    return new Term(field, indexedForm);
}
/// <summary>
/// Create <see cref="TaxonomyFacetCounts"/>, which also
/// counts all facet labels. Use this for a non-default
/// <see cref="OrdinalsReader"/>; otherwise use
/// <see cref="FastTaxonomyFacetCounts"/>.
/// </summary>
public TaxonomyFacetCounts(OrdinalsReader ordinalsReader, TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc)
    : base(ordinalsReader.IndexFieldName, taxoReader, config)
{
    this.ordinalsReader = ordinalsReader;
    // Aggregate counts over all matching documents gathered by the collector.
    Count(fc.GetMatchingDocs);
}
/// <summary>
/// Aggregates float facet values from the provided
/// <see cref="ValueSource"/>, pulling ordinals using
/// <see cref="DocValuesOrdinalsReader"/> against the default indexed
/// facet field <see cref="FacetsConfig.DEFAULT_INDEX_FIELD_NAME"/>.
/// </summary>
public TaxonomyFacetSumValueSource(TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc, ValueSource valueSource)
    : this(new DocValuesOrdinalsReader(FacetsConfig.DEFAULT_INDEX_FIELD_NAME), taxoReader, config, fc, valueSource)
{
}
/// <summary>
/// Used by <see cref="Clone"/>: duplicates the wrapped boolean query and
/// copies the map from dimension name to its clause index.
/// </summary>
internal DrillDownQuery(FacetsConfig config, BooleanQuery query, IDictionary<string, int?> drillDownDims)
{
    this.config = config;
    this.query = (BooleanQuery)query.Clone();
    this.drillDownDims.AddAll(drillDownDims);
}
/// <summary>
/// Create a new <see cref="DrillSideways"/> instance.
/// Taxonomy-only convenience overload: delegates to the four-argument
/// constructor with a null sorted-set reader state.
/// </summary>
public DrillSideways(IndexSearcher searcher, FacetsConfig config, TaxonomyReader taxoReader)
    : this(searcher, config, taxoReader, null)
{
}
/// <summary>
/// Creates a new <see cref="DrillDownQuery"/> without a base query,
/// to perform a pure browsing query (equivalent to using
/// <see cref="MatchAllDocsQuery"/> as base). Delegates with a null base query.
/// </summary>
public DrillDownQuery(FacetsConfig config)
    : this(config, null)
{
}
/// <summary>
/// Creates a new <see cref="DrillDownQuery"/> over the given base query. Can be
/// <c>null</c>, in which case the result <see cref="Query"/> from
/// <see cref="Rewrite(IndexReader)"/> will be a pure browsing query, filtering on
/// the added categories only.
/// </summary>
public DrillDownQuery(FacetsConfig config, Query baseQuery)
{
    this.config = config;
    // Coord is disabled: drill-down clauses act as filters, not scoring signals.
    query = new BooleanQuery(true);
    if (baseQuery != null)
    {
        query.Add(baseQuery, Occur.MUST);
    }
}
/// <summary>
/// Create a new <see cref="DrillSideways"/> instance, where some
/// dimensions were indexed with <see cref="SortedSetDocValuesFacetField"/>
/// and others were indexed with <see cref="FacetField"/>.
/// </summary>
public DrillSideways(IndexSearcher searcher, FacetsConfig config, TaxonomyReader taxoReader, SortedSetDocValuesReaderState state)
{
    // Plain field captures; assignment order is irrelevant.
    this.state = state;
    this.taxoReader = taxoReader;
    this.config = config;
    this.searcher = searcher;
}
/// <summary>
/// Create a new <see cref="DrillSideways"/> instance, assuming the categories were
/// indexed with <see cref="SortedSetDocValuesFacetField"/>.
/// Delegates with a null taxonomy reader.
/// </summary>
public DrillSideways(IndexSearcher searcher, FacetsConfig config, SortedSetDocValuesReaderState state)
    : this(searcher, config, null, state)
{
}
/// <summary>
/// Create a new <see cref="DrillSideways"/> instance.
/// Taxonomy-only convenience overload: delegates to the four-argument
/// constructor with a null sorted-set reader state.
/// </summary>
public DrillSideways(IndexSearcher searcher, FacetsConfig config, TaxonomyReader taxoReader)
    : this(searcher, config, taxoReader, null)
{
}
/// <summary>
/// Create <see cref="TaxonomyFacetSumFloatAssociations"/> against
/// the default index field
/// (<see cref="FacetsConfig.DEFAULT_INDEX_FIELD_NAME"/>).
/// </summary>
public TaxonomyFacetSumFloatAssociations(TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc)
    : this(FacetsConfig.DEFAULT_INDEX_FIELD_NAME, taxoReader, config, fc)
{
}
/// <summary>
/// One-time setup: builds a 100-document index with randomized per-dimension
/// facet configuration for dimensions "a" and "b", plus "content" text
/// fields, and opens the shared reader and taxonomy used by this class.
/// </summary>
public override void BeforeClass() // LUCENENET specific - renamed from BeforeClassDrillDownQueryTest() to ensure calling order
{
    base.BeforeClass();

    dir = NewDirectory();
    Random r = Random;
    RandomIndexWriter writer = new RandomIndexWriter(r, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(r, MockTokenizer.KEYWORD, false)));

    taxoDir = NewDirectory();
    ITaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
    config = new FacetsConfig();

    // Randomize the per-dim config:
    config.SetHierarchical("a", Random.NextBoolean());
    config.SetMultiValued("a", Random.NextBoolean());
    if (Random.NextBoolean())
    {
        config.SetIndexFieldName("a", "$a");
    }
    config.SetRequireDimCount("a", true);

    config.SetHierarchical("b", Random.NextBoolean());
    config.SetMultiValued("b", Random.NextBoolean());
    if (Random.NextBoolean())
    {
        config.SetIndexFieldName("b", "$b");
    }
    config.SetRequireDimCount("b", true);

    for (int i = 0; i < 100; i++)
    {
        Document doc = new Document();
        if (i % 2 == 0) // 50
        {
            doc.Add(new TextField("content", "foo", Field.Store.NO));
        }
        if (i % 3 == 0) // 33
        {
            doc.Add(new TextField("content", "bar", Field.Store.NO));
        }
        if (i % 4 == 0) // 25
        {
            // Dimension "a" gets a random one of two values.
            if (r.NextBoolean())
            {
                doc.Add(new FacetField("a", "1"));
            }
            else
            {
                doc.Add(new FacetField("a", "2"));
            }
        }
        if (i % 5 == 0) // 20
        {
            doc.Add(new FacetField("b", "1"));
        }
        writer.AddDocument(config.Build(taxoWriter, doc));
    }

    // Taxonomy must be committed before opening its reader below.
    taxoWriter.Dispose();
    reader = writer.GetReader();
    writer.Dispose();
    taxo = new DirectoryTaxonomyReader(taxoDir);
}
/// <summary>
/// Create <see cref="TaxonomyFacetSumFloatAssociations"/> against
/// the specified index field.
/// </summary>
public TaxonomyFacetSumFloatAssociations(string indexFieldName, TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc)
    : base(indexFieldName, taxoReader, config)
{
    // Sum the float associations over all matching documents.
    SumValues(fc.GetMatchingDocs);
}
/// <summary>
/// Note: if you use a counting <see cref="Facets"/> implementation, you can amortize the
/// sampled counts by calling this method. Uses the <see cref="FacetsConfig"/> and
/// the <see cref="IndexSearcher"/> to determine the upper bound for each facet value.
/// </summary>
public virtual FacetResult AmortizeFacetCounts(FacetResult res, FacetsConfig config, IndexSearcher searcher)
{
    // Nothing to correct when there is no result, or when every hit was
    // collected (no sampling actually took place).
    if (res == null || totalHits <= sampleSize)
    {
        return res;
    }

    LabelAndValue[] fixedLabelValues = new LabelAndValue[res.LabelValues.Length];
    IndexReader reader = searcher.IndexReader;
    DimConfig dimConfig = config.GetDimConfig(res.Dim);

    // +2 to prepend dimension, append child label
    string[] childPath = new string[res.Path.Length + 2];
    childPath[0] = res.Dim;

    Array.Copy(res.Path, 0, childPath, 1, res.Path.Length); // reuse

    for (int i = 0; i < res.LabelValues.Length; i++)
    {
        // Only the last component changes per label; the prefix is reused.
        childPath[res.Path.Length + 1] = res.LabelValues[i].label;
        string fullPath = FacetsConfig.PathToString(childPath, childPath.Length);
        // Upper bound: the corrected count can never exceed the number of
        // documents that actually carry this path term.
        int max = reader.DocFreq(new Term(dimConfig.IndexFieldName, fullPath));
        // Scale the sampled count back up by the sampling rate, then clamp.
        int correctedCount = (int)((double)res.LabelValues[i].value / samplingRate);
        correctedCount = Math.Min(max, correctedCount);
        fixedLabelValues[i] = new LabelAndValue(res.LabelValues[i].label, correctedCount);
    }

    // cap the total count on the total number of non-deleted documents in the reader
    int correctedTotalCount = (int)res.Value;
    if (correctedTotalCount > 0)
    {
        correctedTotalCount = Math.Min(reader.NumDocs, (int)((double)res.Value / samplingRate));
    }

    return new FacetResult(res.Dim, res.Path, correctedTotalCount, fixedLabelValues, res.ChildCount);
}
/// <summary>
/// Convenience overload: counts facets against the default index field
/// (<see cref="FacetsConfig.DEFAULT_INDEX_FIELD_NAME"/>).
/// </summary>
public virtual Facets GetTaxonomyFacetCounts(TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector c)
{
    return GetTaxonomyFacetCounts(taxoReader, config, c, FacetsConfig.DEFAULT_INDEX_FIELD_NAME);
}
/// <summary>
/// Aggregates float facet values from the provided
/// <see cref="ValueSource"/>, and pulls ordinals from the
/// provided <see cref="OrdinalsReader"/>.
/// </summary>
public TaxonomyFacetSumValueSource(OrdinalsReader ordinalsReader, TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc, ValueSource valueSource)
    : base(ordinalsReader.IndexFieldName, taxoReader, config)
{
    // NOTE(review): field assignment is kept before SumValues — the
    // aggregation presumably reads this.ordinalsReader; confirm before
    // reordering these statements.
    this.ordinalsReader = ordinalsReader;
    SumValues(fc.GetMatchingDocs, fc.KeepScores, valueSource);
}
/// <summary>
/// Create a new <see cref="DrillSideways"/> instance, assuming the categories were
/// indexed with <see cref="SortedSetDocValuesFacetField"/>.
/// Delegates with a null taxonomy reader.
/// </summary>
public DrillSideways(IndexSearcher searcher, FacetsConfig config, SortedSetDocValuesReaderState state)
    : this(searcher, config, null, state)
{
}
/// <summary>
/// Used by <see cref="DrillSideways"/>: rebuilds a drill-down query from an
/// optional base query plus an explicit list of already-built drill-down
/// clauses, all combined with MUST.
/// </summary>
internal DrillDownQuery(FacetsConfig config, Query baseQuery, IList<Query> clauses, IDictionary<string, int?> drillDownDims)
{
    this.config = config;
    query = new BooleanQuery(true); // disable coord
    if (baseQuery != null)
    {
        query.Add(baseQuery, Occur.MUST);
    }
    // Append every drill-down clause in order.
    for (int i = 0; i < clauses.Count; i++)
    {
        query.Add(clauses[i], Occur.MUST);
    }
    this.drillDownDims.AddAll(drillDownDims);
}
/// <summary>
/// Exercises RandomSamplingFacetsCollector: an empty result set must cause
/// no division by zero; a "sample" covering all docs must reproduce the
/// exact facet counts; and a 10% sample, after AmortizeFacetCounts, must
/// stay within loose statistical bounds of the expected per-category mean.
/// </summary>
public virtual void TestRandomSampling()
{
    Directory dir = NewDirectory();
    Directory taxoDir = NewDirectory();

    DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
    RandomIndexWriter writer = new RandomIndexWriter(Random(), dir);

    FacetsConfig config = new FacetsConfig();

    int numDocs = AtLeast(10000);
    for (int i = 0; i < numDocs; i++)
    {
        Document doc = new Document();
        // Half the docs are "even", half "odd"; facet dimension is i mod 10.
        doc.Add(new StringField("EvenOdd", (i % 2 == 0) ? "even" : "odd", Store.NO));
        doc.Add(new FacetField("iMod10", Convert.ToString(i % 10)));
        writer.AddDocument(config.Build(taxoWriter, doc));
    }
    Random random = Random();

    // NRT open
    IndexSearcher searcher = NewSearcher(writer.Reader);
    var taxoReader = new DirectoryTaxonomyReader(taxoWriter);
    IOUtils.Close(writer, taxoWriter);

    // Test empty results
    RandomSamplingFacetsCollector collectRandomZeroResults = new RandomSamplingFacetsCollector(numDocs / 10, random.NextLong());

    // There should be no divisions by zero
    searcher.Search(new TermQuery(new Term("EvenOdd", "NeverMatches")), collectRandomZeroResults);

    // There should be no divisions by zero and no null result
    Assert.NotNull(collectRandomZeroResults.GetMatchingDocs);

    // There should be no results at all
    foreach (MatchingDocs doc in collectRandomZeroResults.GetMatchingDocs)
    {
        Assert.AreEqual(0, doc.totalHits);
    }

    // Now start searching and retrieve results.

    // Use a query to select half of the documents.
    TermQuery query = new TermQuery(new Term("EvenOdd", "even"));

    // there will be 5 facet values (0, 2, 4, 6 and 8), as only the even
    // (i % 10) are hits. there is a REAL small chance that one of the 5
    // values will be missed when sampling, but that is 0.8 (chance not to
    // take a value) ^ 2000 * 5 (any can be missing) ~ 10^-193, so that is
    // probably not going to happen.
    int maxNumChildren = 5;

    RandomSamplingFacetsCollector random100Percent = new RandomSamplingFacetsCollector(numDocs, random.NextLong()); // no sampling
    RandomSamplingFacetsCollector random10Percent = new RandomSamplingFacetsCollector(numDocs / 10, random.NextLong()); // 10 % of total docs, 20% of the hits
    FacetsCollector fc = new FacetsCollector();

    searcher.Search(query, MultiCollector.Wrap(fc, random100Percent, random10Percent));

    FastTaxonomyFacetCounts random10FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random10Percent);
    FastTaxonomyFacetCounts random100FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random100Percent);
    FastTaxonomyFacetCounts exactFacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, fc);

    FacetResult random10Result = random10Percent.AmortizeFacetCounts(random10FacetCounts.GetTopChildren(10, "iMod10"), config, searcher);
    FacetResult random100Result = random100FacetCounts.GetTopChildren(10, "iMod10");
    FacetResult exactResult = exactFacetCounts.GetTopChildren(10, "iMod10");

    // A sample covering every document must reproduce the exact counts.
    Assert.AreEqual(random100Result, exactResult);

    // we should have five children, but there is a small chance we have less.
    // (see above).
    Assert.True(random10Result.ChildCount <= maxNumChildren);
    // there should be one child at least.
    Assert.True(random10Result.ChildCount >= 1);

    // now calculate some statistics to determine if the sampled result is 'ok'.
    // because random sampling is used, the results will vary each time.
    int sum = 0;
    foreach (LabelAndValue lav in random10Result.LabelValues)
    {
        sum += (int)lav.value;
    }
    float mu = (float)sum / (float)maxNumChildren;

    float variance = 0;
    foreach (LabelAndValue lav in random10Result.LabelValues)
    {
        variance += (float)Math.Pow((mu - (int)lav.value), 2);
    }
    variance = variance / maxNumChildren;
    float sigma = (float)Math.Sqrt(variance);

    // we query only half the documents and have 5 categories. The average
    // number of docs in a category will thus be the total divided by 5*2
    float targetMu = numDocs / (5.0f * 2.0f);

    // the average should be in the range and the standard deviation should
    // not be too great
    Assert.True(sigma < 200);
    Assert.True(targetMu - 3 * sigma < mu && mu < targetMu + 3 * sigma);

    IOUtils.Close(searcher.IndexReader, taxoReader, dir, taxoDir);
}
/// <summary>
/// Creates a new <see cref="DrillDownQuery"/> without a base query,
/// to perform a pure browsing query (equivalent to using
/// <see cref="MatchAllDocsQuery"/> as base). Delegates with a null base query.
/// </summary>
public DrillDownQuery(FacetsConfig config)
    : this(config, null)
{
}
/// <summary>
/// One-time setup: builds a 100-document index with randomized per-dimension
/// facet configuration for dimensions "a" and "b", plus "content" text
/// fields, and opens the shared reader and taxonomy used by this class.
/// </summary>
public static void BeforeClassDrillDownQueryTest()
{
    dir = NewDirectory();
    Random r = Random();
    RandomIndexWriter writer = new RandomIndexWriter(r, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(r, MockTokenizer.KEYWORD, false)));

    taxoDir = NewDirectory();
    TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
    config = new FacetsConfig();

    // Randomize the per-dim config:
    config.SetHierarchical("a", Random().NextBoolean());
    config.SetMultiValued("a", Random().NextBoolean());
    if (Random().NextBoolean())
    {
        config.SetIndexFieldName("a", "$a");
    }
    config.SetRequireDimCount("a", true);

    config.SetHierarchical("b", Random().NextBoolean());
    config.SetMultiValued("b", Random().NextBoolean());
    if (Random().NextBoolean())
    {
        config.SetIndexFieldName("b", "$b");
    }
    config.SetRequireDimCount("b", true);

    for (int i = 0; i < 100; i++)
    {
        Document doc = new Document();
        if (i % 2 == 0) // 50
        {
            doc.Add(new TextField("content", "foo", Field.Store.NO));
        }
        if (i % 3 == 0) // 33
        {
            doc.Add(new TextField("content", "bar", Field.Store.NO));
        }
        if (i % 4 == 0) // 25
        {
            // Dimension "a" gets a random one of two values.
            if (r.NextBoolean())
            {
                doc.Add(new FacetField("a", "1"));
            }
            else
            {
                doc.Add(new FacetField("a", "2"));
            }
        }
        if (i % 5 == 0) // 20
        {
            doc.Add(new FacetField("b", "1"));
        }
        writer.AddDocument(config.Build(taxoWriter, doc));
    }

    // Taxonomy must be committed before opening its reader below.
    taxoWriter.Dispose();
    reader = writer.Reader;
    writer.Dispose();
    taxo = new DirectoryTaxonomyReader(taxoDir);
}
/// <summary>
/// Create <see cref="FastTaxonomyFacetCounts"/>, which also
/// counts all facet labels, against the default index field
/// (<see cref="FacetsConfig.DEFAULT_INDEX_FIELD_NAME"/>).
/// </summary>
public FastTaxonomyFacetCounts(TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc)
    : this(FacetsConfig.DEFAULT_INDEX_FIELD_NAME, taxoReader, config, fc)
{
}