예제 #1
0
        public virtual void TestAddSameDocTwice()
        {
            // LUCENE-5367: this was a problem with the previous code, making sure it
            // works with the new code.
            Directory indexDir = NewDirectory(), taxoDir = NewDirectory();
            IndexWriter indexWriter = new IndexWriter(indexDir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())));
            DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
            FacetsConfig facetsConfig = new FacetsConfig();
            Document doc = new Document();
            doc.Add(new FacetField("a", "b"));
            doc = facetsConfig.Build(taxoWriter, doc);
            // these two addDocument() used to fail
            indexWriter.AddDocument(doc);
            indexWriter.AddDocument(doc);
            IOUtils.Close(indexWriter, taxoWriter);

            DirectoryReader indexReader = DirectoryReader.Open(indexDir);
            DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
            IndexSearcher searcher = NewSearcher(indexReader);
            FacetsCollector fc = new FacetsCollector();
            searcher.Search(new MatchAllDocsQuery(), fc);

            Facets facets = GetTaxonomyFacetCounts(taxoReader, facetsConfig, fc);
            FacetResult res = facets.GetTopChildren(10, "a");
            Assert.AreEqual(1, res.LabelValues.Length);
            Assert.AreEqual(2, res.LabelValues[0].value);
            IOUtils.Close(indexReader, taxoReader);

            IOUtils.Close(indexDir, taxoDir);
        }
예제 #2
0
        public static void BeforeClassDrillDownQueryTest()
        {
            dir = NewDirectory();
            Random r = Random();
            RandomIndexWriter writer = new RandomIndexWriter(r, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(r, MockTokenizer.KEYWORD, false)));

            taxoDir = NewDirectory();
            TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
            config = new FacetsConfig();

            // Randomize the per-dim config:
            config.SetHierarchical("a", Random().NextBoolean());
            config.SetMultiValued("a", Random().NextBoolean());
            if (Random().NextBoolean())
            {
                config.SetIndexFieldName("a", "$a");
            }
            config.SetRequireDimCount("a", true);

            config.SetHierarchical("b", Random().NextBoolean());
            config.SetMultiValued("b", Random().NextBoolean());
            if (Random().NextBoolean())
            {
                config.SetIndexFieldName("b", "$b");
            }
            config.SetRequireDimCount("b", true);

            for (int i = 0; i < 100; i++)
            {
                Document doc = new Document();
                if (i % 2 == 0) // 50
                {
                    doc.Add(new TextField("content", "foo", Field.Store.NO));
                }
                if (i % 3 == 0) // 33
                {
                    doc.Add(new TextField("content", "bar", Field.Store.NO));
                }
                if (i % 4 == 0) // 25
                {
                    if (r.NextBoolean())
                    {
                        doc.Add(new FacetField("a", "1"));
                    }
                    else
                    {
                        doc.Add(new FacetField("a", "2"));
                    }
                }
                if (i % 5 == 0) // 20
                {
                    doc.Add(new FacetField("b", "1"));
                }
                writer.AddDocument(config.Build(taxoWriter, doc));
            }

            taxoWriter.Dispose();
            reader = writer.Reader;
            writer.Dispose();

            taxo = new DirectoryTaxonomyReader(taxoDir);
        }
        public virtual void TestRandomSampling()
        {
            Directory dir     = NewDirectory();
            Directory taxoDir = NewDirectory();

            DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
            RandomIndexWriter       writer     = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
                this,
#endif
                Random, dir);

            FacetsConfig config = new FacetsConfig();

            int numDocs = AtLeast(10000);

            for (int i = 0; i < numDocs; i++)
            {
                Document doc = new Document();
                doc.Add(new StringField("EvenOdd", (i % 2 == 0) ? "even" : "odd", Store.NO));
                doc.Add(new FacetField("iMod10", Convert.ToString(i % 10, CultureInfo.InvariantCulture)));
                writer.AddDocument(config.Build(taxoWriter, doc));
            }
            Random random = Random;

            // NRT open
            IndexSearcher searcher   = NewSearcher(writer.GetReader());
            var           taxoReader = new DirectoryTaxonomyReader(taxoWriter);

            IOUtils.Dispose(writer, taxoWriter);

            // Test empty results
            RandomSamplingFacetsCollector collectRandomZeroResults = new RandomSamplingFacetsCollector(numDocs / 10, random.NextInt64());

            // There should be no divisions by zero
            searcher.Search(new TermQuery(new Term("EvenOdd", "NeverMatches")), collectRandomZeroResults);

            // There should be no divisions by zero and no null result
            Assert.IsNotNull(collectRandomZeroResults.GetMatchingDocs());

            // There should be no results at all
            foreach (MatchingDocs doc in collectRandomZeroResults.GetMatchingDocs())
            {
                Assert.AreEqual(0, doc.TotalHits);
            }

            // Now start searching and retrieve results.

            // Use a query to select half of the documents.
            TermQuery query = new TermQuery(new Term("EvenOdd", "even"));

            // there will be 5 facet values (0, 2, 4, 6 and 8), as only the even (i %
            // 10) are hits.
            // there is a REAL small chance that one of the 5 values will be missed when
            // sampling.
            // but is that 0.8 (chance not to take a value) ^ 2000 * 5 (any can be
            // missing) ~ 10^-193
            // so that is probably not going to happen.
            int maxNumChildren = 5;

            RandomSamplingFacetsCollector random100Percent = new RandomSamplingFacetsCollector(numDocs, random.NextInt64());      // no sampling
            RandomSamplingFacetsCollector random10Percent  = new RandomSamplingFacetsCollector(numDocs / 10, random.NextInt64()); // 10 % of total docs, 20% of the hits

            FacetsCollector fc = new FacetsCollector();

            searcher.Search(query, MultiCollector.Wrap(fc, random100Percent, random10Percent));

            FastTaxonomyFacetCounts random10FacetCounts  = new FastTaxonomyFacetCounts(taxoReader, config, random10Percent);
            FastTaxonomyFacetCounts random100FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random100Percent);
            FastTaxonomyFacetCounts exactFacetCounts     = new FastTaxonomyFacetCounts(taxoReader, config, fc);

            FacetResult random10Result  = random10Percent.AmortizeFacetCounts(random10FacetCounts.GetTopChildren(10, "iMod10"), config, searcher);
            FacetResult random100Result = random100FacetCounts.GetTopChildren(10, "iMod10");
            FacetResult exactResult     = exactFacetCounts.GetTopChildren(10, "iMod10");

            Assert.AreEqual(random100Result, exactResult);

            // we should have five children, but there is a small chance we have less.
            // (see above).
            Assert.IsTrue(random10Result.ChildCount <= maxNumChildren);
            // there should be one child at least.
            Assert.IsTrue(random10Result.ChildCount >= 1);

            // now calculate some statistics to determine if the sampled result is 'ok'.
            // because random sampling is used, the results will vary each time.
            int sum = 0;

            foreach (LabelAndValue lav in random10Result.LabelValues)
            {
                sum += (int)lav.Value;
            }
            float mu = (float)sum / (float)maxNumChildren;

            float variance = 0;

            foreach (LabelAndValue lav in random10Result.LabelValues)
            {
                variance += (float)Math.Pow((mu - (int)lav.Value), 2);
            }
            variance = variance / maxNumChildren;
            float sigma = (float)Math.Sqrt(variance);

            // we query only half the documents and have 5 categories. The average
            // number of docs in a category will thus be the total divided by 5*2
            float targetMu = numDocs / (5.0f * 2.0f);

            // the average should be in the range and the standard deviation should not
            // be too great
            Assert.IsTrue(sigma < 200);
            Assert.IsTrue(targetMu - 3 * sigma < mu && mu < targetMu + 3 * sigma);

            IOUtils.Dispose(searcher.IndexReader, taxoReader, dir, taxoDir);
        }
        public virtual void TestRandomSampling()
        {
            Directory dir = NewDirectory();
            Directory taxoDir = NewDirectory();

            DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
            RandomIndexWriter writer = new RandomIndexWriter(Random(), dir);

            FacetsConfig config = new FacetsConfig();

            int numDocs = AtLeast(10000);
            for (int i = 0; i < numDocs; i++)
            {
                Document doc = new Document();
                doc.Add(new StringField("EvenOdd", (i % 2 == 0) ? "even" : "odd", Store.NO));
                doc.Add(new FacetField("iMod10", Convert.ToString(i % 10)));
                writer.AddDocument(config.Build(taxoWriter, doc));
            }
            Random random = Random();

            // NRT open
            IndexSearcher searcher = NewSearcher(writer.Reader);
            var taxoReader = new DirectoryTaxonomyReader(taxoWriter);
            IOUtils.Close(writer, taxoWriter);

            // Test empty results
            RandomSamplingFacetsCollector collectRandomZeroResults = new RandomSamplingFacetsCollector(numDocs / 10, random.NextLong());

            // There should be no divisions by zero
            searcher.Search(new TermQuery(new Term("EvenOdd", "NeverMatches")), collectRandomZeroResults);

            // There should be no divisions by zero and no null result
            Assert.NotNull(collectRandomZeroResults.GetMatchingDocs);

            // There should be no results at all
            foreach (MatchingDocs doc in collectRandomZeroResults.GetMatchingDocs)
            {
                Assert.AreEqual(0, doc.totalHits);
            }

            // Now start searching and retrieve results.

            // Use a query to select half of the documents.
            TermQuery query = new TermQuery(new Term("EvenOdd", "even"));

            // there will be 5 facet values (0, 2, 4, 6 and 8), as only the even (i %
            // 10) are hits.
            // there is a REAL small chance that one of the 5 values will be missed when
            // sampling.
            // but is that 0.8 (chance not to take a value) ^ 2000 * 5 (any can be
            // missing) ~ 10^-193
            // so that is probably not going to happen.
            int maxNumChildren = 5;

            RandomSamplingFacetsCollector random100Percent = new RandomSamplingFacetsCollector(numDocs, random.NextLong()); // no sampling
            RandomSamplingFacetsCollector random10Percent = new RandomSamplingFacetsCollector(numDocs / 10, random.NextLong()); // 10 % of total docs, 20% of the hits

            FacetsCollector fc = new FacetsCollector();

            searcher.Search(query, MultiCollector.Wrap(fc, random100Percent, random10Percent));

            FastTaxonomyFacetCounts random10FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random10Percent);
            FastTaxonomyFacetCounts random100FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random100Percent);
            FastTaxonomyFacetCounts exactFacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, fc);

            FacetResult random10Result = random10Percent.AmortizeFacetCounts(random10FacetCounts.GetTopChildren(10, "iMod10"), config, searcher);
            FacetResult random100Result = random100FacetCounts.GetTopChildren(10, "iMod10");
            FacetResult exactResult = exactFacetCounts.GetTopChildren(10, "iMod10");

            Assert.AreEqual(random100Result, exactResult);

            // we should have five children, but there is a small chance we have less.
            // (see above).
            Assert.True(random10Result.ChildCount <= maxNumChildren);
            // there should be one child at least.
            Assert.True(random10Result.ChildCount >= 1);

            // now calculate some statistics to determine if the sampled result is 'ok'.
            // because random sampling is used, the results will vary each time.
            int sum = 0;
            foreach (LabelAndValue lav in random10Result.LabelValues)
            {
                sum += (int)lav.value;
            }
            float mu = (float)sum / (float)maxNumChildren;

            float variance = 0;
            foreach (LabelAndValue lav in random10Result.LabelValues)
            {
                variance += (float)Math.Pow((mu - (int)lav.value), 2);
            }
            variance = variance / maxNumChildren;
            float sigma = (float)Math.Sqrt(variance);

            // we query only half the documents and have 5 categories. The average
            // number of docs in a category will thus be the total divided by 5*2
            float targetMu = numDocs / (5.0f * 2.0f);

            // the average should be in the range and the standard deviation should not
            // be too great
            Assert.True(sigma < 200);
            Assert.True(targetMu - 3 * sigma < mu && mu < targetMu + 3 * sigma);

            IOUtils.Close(searcher.IndexReader, taxoReader, dir, taxoDir);
        }
예제 #5
0
        public override void BeforeClass() // LUCENENET specific - renamed from BeforeClassDrillDownQueryTest() to ensure calling order
        {
            base.BeforeClass();

            dir = NewDirectory();
            Random            r      = Random;
            RandomIndexWriter writer = new RandomIndexWriter(r, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(r, MockTokenizer.KEYWORD, false)));

            taxoDir = NewDirectory();
            ITaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);

            config = new FacetsConfig();

            // Randomize the per-dim config:
            config.SetHierarchical("a", Random.NextBoolean());
            config.SetMultiValued("a", Random.NextBoolean());
            if (Random.NextBoolean())
            {
                config.SetIndexFieldName("a", "$a");
            }
            config.SetRequireDimCount("a", true);

            config.SetHierarchical("b", Random.NextBoolean());
            config.SetMultiValued("b", Random.NextBoolean());
            if (Random.NextBoolean())
            {
                config.SetIndexFieldName("b", "$b");
            }
            config.SetRequireDimCount("b", true);

            for (int i = 0; i < 100; i++)
            {
                Document doc = new Document();
                if (i % 2 == 0) // 50
                {
                    doc.Add(new TextField("content", "foo", Field.Store.NO));
                }
                if (i % 3 == 0) // 33
                {
                    doc.Add(new TextField("content", "bar", Field.Store.NO));
                }
                if (i % 4 == 0) // 25
                {
                    if (r.NextBoolean())
                    {
                        doc.Add(new FacetField("a", "1"));
                    }
                    else
                    {
                        doc.Add(new FacetField("a", "2"));
                    }
                }
                if (i % 5 == 0) // 20
                {
                    doc.Add(new FacetField("b", "1"));
                }
                writer.AddDocument(config.Build(taxoWriter, doc));
            }

            taxoWriter.Dispose();
            reader = writer.GetReader();
            writer.Dispose();

            taxo = new DirectoryTaxonomyReader(taxoDir);
        }
 private static void seedIndex(TaxonomyWriter tw, RandomIndexWriter iw, FacetsConfig config)
 {
     foreach (FacetField ff in CATEGORIES)
     {
         Document doc = new Document();
         doc.Add(ff);
         doc.Add(new TextField("content", "alpha", Field.Store.YES));
         iw.AddDocument(config.Build(tw, doc));
     }
 }