Example #1
        public void TestDocsAndPositionsEnumStart()
        {
            Analyzer    analyzer = new MockAnalyzer(Random);
            int         numIters = AtLeast(3);
            MemoryIndex memory   = new MemoryIndex(true, Random.nextInt(50) * 1024 * 1024);

            for (int i = 0; i < numIters; i++)
            { // check reuse
                memory.AddField("foo", "bar", analyzer);
                AtomicReader reader = (AtomicReader)memory.CreateSearcher().IndexReader;
                assertEquals(1, reader.GetTerms("foo").SumTotalTermFreq);
                DocsAndPositionsEnum disi = reader.GetTermPositionsEnum(new Term("foo", "bar"));
                int docid = disi.DocID;
                assertEquals(-1, docid);
                assertTrue(disi.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                assertEquals(0, disi.NextPosition());
                assertEquals(0, disi.StartOffset);
                assertEquals(3, disi.EndOffset);

                // now reuse and check again
                TermsEnum te = reader.GetTerms("foo").GetIterator(null);
                assertTrue(te.SeekExact(new BytesRef("bar")));
                disi  = te.DocsAndPositions(null, disi);
                docid = disi.DocID;
                assertEquals(-1, docid);
                assertTrue(disi.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                reader.Dispose();
                memory.Reset();
            }
        }
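Both loops above exercise the same postings contract: a freshly obtained enum sits before the first document (DocID == -1), NextDoc() must be called before anything is read, and NextPosition() may be called at most Freq times per document. A minimal consumption sketch of that contract, reusing the reader and term from Example #1 (the API is the Lucene.NET 4.8 DocsAndPositionsEnum shown above):

        // Hedged sketch: drain one term's postings (docs, positions, offsets).
        DocsAndPositionsEnum dpe = reader.GetTermPositionsEnum(new Term("foo", "bar"));
        while (dpe.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
        {
            int freq = dpe.Freq;                // how many positions this doc holds
            for (int i = 0; i < freq; i++)
            {
                int pos   = dpe.NextPosition(); // positions come back in order
                int start = dpe.StartOffset;    // -1 when offsets were not indexed
                int end   = dpe.EndOffset;
            }
        }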
Example #2
        public virtual void TestDocsAndPositionsEnum()
        {
            TermsEnum termsEnum = reader.GetTerms(DOC_POSITIONS_FIELD).GetIterator(null);

            assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.SeekCeil(new BytesRef(DOC_POSITIONS_TERM)));
            DocsAndPositionsEnum sortedPositions = termsEnum.DocsAndPositions(null, null);
            int doc;

            // test nextDoc()
            while ((doc = sortedPositions.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
            {
                int freq = sortedPositions.Freq;
                assertEquals("incorrect freq for doc=" + doc, sortedValues[doc] / 10 + 1, freq);
                for (int i = 0; i < freq; i++)
                {
                    assertEquals("incorrect position for doc=" + doc, i, sortedPositions.NextPosition());
                    if (!DoesntSupportOffsets.Contains(TestUtil.GetPostingsFormat(DOC_POSITIONS_FIELD)))
                    {
                        assertEquals("incorrect startOffset for doc=" + doc, i, sortedPositions.StartOffset);
                        assertEquals("incorrect endOffset for doc=" + doc, i, sortedPositions.EndOffset);
                    }
                    assertEquals("incorrect payload for doc=" + doc, freq - i, int.Parse(sortedPositions.GetPayload().Utf8ToString(), CultureInfo.InvariantCulture));
                }
            }

            // test advance()
            DocsAndPositionsEnum reuse = sortedPositions;

            sortedPositions = termsEnum.DocsAndPositions(null, reuse);
            if (sortedPositions is SortingAtomicReader.SortingDocsAndPositionsEnum)
            {
                assertTrue(((SortingAtomicReader.SortingDocsAndPositionsEnum)sortedPositions).Reused(reuse)); // make sure reuse worked
            }
            doc = 0;
            while ((doc = sortedPositions.Advance(doc + TestUtil.NextInt32(Random, 1, 5))) != DocIdSetIterator.NO_MORE_DOCS)
            {
                int freq = sortedPositions.Freq;
                assertEquals("incorrect freq for doc=" + doc, sortedValues[doc] / 10 + 1, freq);
                for (int i = 0; i < freq; i++)
                {
                    assertEquals("incorrect position for doc=" + doc, i, sortedPositions.NextPosition());
                    if (!DoesntSupportOffsets.Contains(TestUtil.GetPostingsFormat(DOC_POSITIONS_FIELD)))
                    {
                        assertEquals("incorrect startOffset for doc=" + doc, i, sortedPositions.StartOffset);
                        assertEquals("incorrect endOffset for doc=" + doc, i, sortedPositions.EndOffset);
                    }
                    assertEquals("incorrect payload for doc=" + doc, freq - i, int.Parse(sortedPositions.GetPayload().Utf8ToString(), CultureInfo.InvariantCulture));
                }
            }
        }
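The second loop swaps NextDoc() for Advance(target), which moves to the first document whose id is >= target; adding a random 1-5 step therefore visits a sparse subset of the postings while re-checking the same per-position assertions. A hedged sketch of the skipping idiom, with postings and step standing in for sortedPositions and the random increment:

        // Hedged sketch: skip through a postings list instead of visiting every doc.
        int doc = 0;
        while ((doc = postings.Advance(doc + step)) != DocIdSetIterator.NO_MORE_DOCS)
        {
            // doc is now the first docid >= (previous doc + step)
        }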
Example #3
        /// <summary>
        /// Takes the categories from the given taxonomy directory, and adds the
        /// missing ones to this taxonomy. Additionally, it fills the given
        /// <see cref="IOrdinalMap"/> with a mapping from the original ordinal to the new
        /// ordinal.
        /// </summary>
        public virtual void AddTaxonomy(Directory taxoDir, IOrdinalMap map)
        {
            EnsureOpen();
            DirectoryReader r = DirectoryReader.Open(taxoDir);

            try
            {
                int         size       = r.NumDocs;
                IOrdinalMap ordinalMap = map;
                ordinalMap.SetSize(size);
                int       @base = 0;
                TermsEnum te    = null;
                DocsEnum  docs  = null;
                foreach (AtomicReaderContext ctx in r.Leaves)
                {
                    AtomicReader ar    = ctx.AtomicReader;
                    Terms        terms = ar.GetTerms(Consts.FULL);
                    te = terms.GetIterator(te);
                    while (te.Next() != null)
                    {
                        FacetLabel cp      = new FacetLabel(FacetsConfig.StringToPath(te.Term.Utf8ToString()));
                        int        ordinal = AddCategory(cp);
                        docs = te.Docs(null, docs, DocsFlags.NONE);
                        ordinalMap.AddMapping(docs.NextDoc() + @base, ordinal);
                    }
                    @base += ar.MaxDoc; // no deletions, so we're ok
                }
                ordinalMap.AddDone();
            }
            finally
            {
                r.Dispose();
            }
        }
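A hedged usage sketch for AddTaxonomy, assuming DirectoryTaxonomyWriter.MemoryOrdinalMap (the in-memory IOrdinalMap that ships with the facet module) and a hypothetical source path:

        // Merge categories from another taxonomy index into this writer,
        // collecting an old-ordinal -> new-ordinal mapping along the way.
        Directory otherTaxoDir = FSDirectory.Open("/path/to/other-taxo"); // hypothetical path
        var map = new DirectoryTaxonomyWriter.MemoryOrdinalMap();
        taxoWriter.AddTaxonomy(otherTaxoDir, map);
        int[] oldToNew = map.GetMap(); // index = original ordinal, value = new ordinal
        otherTaxoDir.Dispose();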
Example #4
        public virtual void TestReuseDocsEnumNoReuse()
        {
            Directory         dir    = NewDirectory();
            Codec             cp     = TestUtil.AlwaysPostingsFormat(new Lucene40RWPostingsFormat());
            RandomIndexWriter writer = new RandomIndexWriter(Random(), dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetCodec(cp));
            int numdocs = AtLeast(20);

            CreateRandomIndex(numdocs, writer, Random());
            writer.Commit();

            DirectoryReader open = DirectoryReader.Open(dir);

            foreach (AtomicReaderContext ctx in open.Leaves)
            {
                AtomicReader indexReader = (AtomicReader)ctx.Reader;
                Terms        terms       = indexReader.GetTerms("body");
                TermsEnum    iterator    = terms.GetIterator(null);
                IdentityHashMap <DocsEnum, bool?> enums = new IdentityHashMap <DocsEnum, bool?>();
                MatchNoBits bits = new MatchNoBits(indexReader.MaxDoc);
                while ((iterator.Next()) != null)
                {
                    DocsEnum docs = iterator.Docs(Random().NextBoolean() ? bits : new MatchNoBits(indexReader.MaxDoc), null, Random().NextBoolean() ? DocsFlags.FREQS : DocsFlags.NONE);
                    enums[docs] = true;
                }

                Assert.AreEqual(terms.Count, enums.Count);
            }
            IOUtils.Close(writer, open, dir);
        }
Example #5
        public virtual void TestNestedPulsing()
        {
            // we always run this test with pulsing codec.
            Codec cp = TestUtil.AlwaysPostingsFormat(new NestedPulsingPostingsFormat());
            BaseDirectoryWrapper dir = NewDirectory();
            RandomIndexWriter    iw  = new RandomIndexWriter(Random, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetCodec(cp));
            Document             doc = new Document();

            doc.Add(new TextField("foo", "a b b c c c d e f g g g h i i j j k l l m m m", Field.Store.NO));
            // note: the reuse is imperfect, here we would have 4 enums (lost reuse when we get an enum for 'm')
            // this is because we only track the 'last' enum we reused (not all).
            // but this seems 'good enough' for now.
            iw.AddDocument(doc);
            DirectoryReader ir = iw.GetReader();

            iw.Dispose();

            AtomicReader segment = GetOnlySegmentReader(ir);
            DocsEnum     reuse   = null;
            IDictionary <DocsEnum, bool?> allEnums = new JCG.Dictionary <DocsEnum, bool?>(IdentityEqualityComparer <DocsEnum> .Default);
            TermsEnum te = segment.GetTerms("foo").GetEnumerator();

            while (te.MoveNext())
            {
                reuse           = te.Docs(null, reuse, DocsFlags.NONE);
                allEnums[reuse] = true;
            }

            assertEquals(4, allEnums.Count);

            allEnums.Clear();
            DocsAndPositionsEnum posReuse = null;

            te = segment.GetTerms("foo").GetEnumerator();
            while (te.MoveNext())
            {
                posReuse           = te.DocsAndPositions(null, posReuse);
                allEnums[posReuse] = true;
            }

            assertEquals(4, allEnums.Count);

            ir.Dispose();
            dir.Dispose();
        }
Example #6
        public virtual void TestSophisticatedReuse()
        {
            // we always run this test with pulsing codec.
            Codec             cp  = TestUtil.AlwaysPostingsFormat(new Pulsing41PostingsFormat(1));
            Directory         dir = NewDirectory();
            RandomIndexWriter iw  = new RandomIndexWriter(Random, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetCodec(cp));
            Document          doc = new Document();

            doc.Add(new TextField("foo", "a b b c c c d e f g g h i i j j k", Field.Store.NO));
            iw.AddDocument(doc);
            DirectoryReader ir = iw.GetReader();

            iw.Dispose();

            AtomicReader segment = GetOnlySegmentReader(ir);
            DocsEnum     reuse   = null;
            IDictionary <DocsEnum, bool?> allEnums = new JCG.Dictionary <DocsEnum, bool?>(IdentityEqualityComparer <DocsEnum> .Default);
            TermsEnum te = segment.GetTerms("foo").GetEnumerator();

            while (te.MoveNext())
            {
                reuse           = te.Docs(null, reuse, DocsFlags.NONE);
                allEnums[reuse] = true;
            }

            assertEquals(2, allEnums.Count);

            allEnums.Clear();
            DocsAndPositionsEnum posReuse = null;

            te = segment.GetTerms("foo").GetEnumerator();
            while (te.MoveNext())
            {
                posReuse           = te.DocsAndPositions(null, posReuse);
                allEnums[posReuse] = true;
            }

            assertEquals(2, allEnums.Count);

            ir.Dispose();
            dir.Dispose();
        }
Example #7
        protected int GetDictValueCount(AtomicReader reader, string field)
        {
            int   ret   = 0;
            Terms terms = reader.GetTerms(field);

            if (terms == null)
            {
                return(ret);
            }
            return((int)terms.Count);
        }
Example #8
        public void TestNonExistingsField()
        {
            MemoryIndex  mindex       = new MemoryIndex(Random.nextBoolean(), Random.nextInt(50) * 1024 * 1024);
            MockAnalyzer mockAnalyzer = new MockAnalyzer(Random);

            mindex.AddField("field", "the quick brown fox", mockAnalyzer);
            AtomicReader reader = (AtomicReader)mindex.CreateSearcher().IndexReader;

            assertNull(reader.GetNumericDocValues("not-in-index"));
            assertNull(reader.GetNormValues("not-in-index"));
            assertNull(reader.GetTermDocsEnum(new Term("not-in-index", "foo")));
            assertNull(reader.GetTermPositionsEnum(new Term("not-in-index", "foo")));
            assertNull(reader.GetTerms("not-in-index"));
        }
Example #9
            public BaseTermsEnumTraverser(AbstractPrefixTreeFilter outerInstance, AtomicReaderContext context, IBits acceptDocs)
            {
                this.m_outerInstance = outerInstance;

                this.m_context = context;
                AtomicReader reader = context.AtomicReader;

                this.m_acceptDocs = acceptDocs;
                m_maxDoc          = reader.MaxDoc;
                Terms terms = reader.GetTerms(outerInstance.m_fieldName);

                if (terms != null)
                {
                    m_termsEnum = terms.GetIterator(null);
                }
            }
Example #10
            protected BaseTermsEnumTraverser(AbstractPrefixTreeFilter outerInstance, AtomicReaderContext context, IBits acceptDocs) // LUCENENET: CA1012: Abstract types should not have constructors (marked protected)
            {
                this.m_outerInstance = outerInstance;

                this.m_context = context;
                AtomicReader reader = context.AtomicReader;

                this.m_acceptDocs = acceptDocs;
                m_maxDoc          = reader.MaxDoc;
                Terms terms = reader.GetTerms(outerInstance.m_fieldName);

                if (terms != null)
                {
                    m_termsEnum = terms.GetEnumerator();
                }
            }
Example #11
        protected BaseTermsEnumTraverser(AbstractPrefixTreeFilter filter, AtomicReaderContext context, IBits?acceptDocs)  // LUCENENET: CA1012: Abstract types should not have constructors (marked protected)
        {
            // LUCENENET specific - added guard clauses
            this.m_filter  = filter ?? throw new ArgumentNullException(nameof(filter));
            this.m_context = context ?? throw new ArgumentNullException(nameof(context));
            AtomicReader reader = context.AtomicReader;

            this.m_acceptDocs = acceptDocs;
            m_maxDoc          = reader.MaxDoc;
            Terms?terms = reader.GetTerms(filter.m_fieldName);

            if (terms != null)
            {
                m_termsEnum = terms.GetEnumerator();
            }
        }
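Examples #9 through #11 are three revisions of the same constructor; the visible API shift is from the older pull-style TermsEnum (GetIterator(null), then Next() returning a BytesRef or null) to the newer enumerator style (GetEnumerator(), then MoveNext() and Term). A hedged side-by-side sketch of the two loops:

        // Older style (Examples #9, #12, #14, #18 ...):
        TermsEnum te = terms.GetIterator(null);
        BytesRef text;
        while ((text = te.Next()) != null)
        {
            // use text
        }

        // Newer style (Examples #10, #11, #16, #20 ...):
        TermsEnum te2 = terms.GetEnumerator();
        while (te2.MoveNext())
        {
            BytesRef text2 = te2.Term;
            // use text2
        }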
Example #12
        public virtual ShapeFieldCache <T> GetCache(AtomicReader reader)
        {
            lock (locker)
            {
                ShapeFieldCache <T> idx;
                if (sidx.TryGetValue(reader, out idx) && idx != null)
                {
                    return(idx);
                }

                /*long startTime = Runtime.CurrentTimeMillis();
                 *              log.Fine("Building Cache [" + reader.MaxDoc() + "]");*/
                idx = new ShapeFieldCache <T>(reader.MaxDoc, m_defaultSize);
                int       count = 0;
                DocsEnum  docs  = null;
                Terms     terms = reader.GetTerms(m_shapeField);
                TermsEnum te    = null;
                if (terms != null)
                {
                    te = terms.GetIterator(te);
                    BytesRef term = te.Next();
                    while (term != null)
                    {
                        T shape = ReadShape(term);
                        if (shape != null)
                        {
                            docs = te.Docs(null, docs, DocsFlags.NONE);
                            int docid = docs.NextDoc();
                            while (docid != DocIdSetIterator.NO_MORE_DOCS)
                            {
                                idx.Add(docid, shape);
                                docid = docs.NextDoc();
                                count++;
                            }
                        }
                        term = te.Next();
                    }
                }
                sidx[reader] = idx;

                /*long elapsed = Runtime.CurrentTimeMillis() - startTime;
                 * log.Fine("Cached: [" + count + " in " + elapsed + "ms] " + idx);*/
                return(idx);
            }
        }
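GetCache relies on the same enum-reuse idiom as Examples #3, #5, #6, and #16: the previous DocsEnum is passed back into Docs(), and the codec may recycle it instead of allocating a new one. A minimal hedged sketch of the pattern:

        // Hedged sketch: reuse one DocsEnum across all terms of a field.
        DocsEnum docs = null;                           // reuse candidate, starts empty
        TermsEnum te = terms.GetIterator(null);
        while (te.Next() != null)
        {
            docs = te.Docs(null, docs, DocsFlags.NONE); // may return the same instance
            for (int d = docs.NextDoc(); d != DocIdSetIterator.NO_MORE_DOCS; d = docs.NextDoc())
            {
                // process doc d for the current term
            }
        }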
Example #13
        public virtual DocsEnum RandomDocsEnum(string field, BytesRef term, IList <AtomicReaderContext> readers, IBits bits)
        {
            if (Random().Next(10) == 0)
            {
                return(null);
            }
            AtomicReader indexReader = (AtomicReader)readers[Random().Next(readers.Count)].Reader;
            Terms        terms       = indexReader.GetTerms(field);

            if (terms == null)
            {
                return(null);
            }
            TermsEnum iterator = terms.GetIterator(null);

            if (iterator.SeekExact(term))
            {
                return(iterator.Docs(bits, null, Random().NextBoolean() ? DocsFlags.FREQS : DocsFlags.NONE));
            }
            return(null);
        }
Example #14
        protected int GetNegativeValueCount(AtomicReader reader, string field)
        {
            int   ret   = 0;
            Terms terms = reader.GetTerms(field);

            if (terms == null)
            {
                return(ret);
            }
            TermsEnum termsEnum = terms.GetIterator(null);
            BytesRef  text;

            while ((text = termsEnum.Next()) != null)
            {
                if (!text.Utf8ToString().StartsWith("-"))
                {
                    break;
                }
                ret++;
            }
            return(ret);
        }
Example #15
        public void TestSameFieldAddedMultipleTimes()
        {
            MemoryIndex  mindex       = new MemoryIndex(Random.nextBoolean(), Random.nextInt(50) * 1024 * 1024);
            MockAnalyzer mockAnalyzer = new MockAnalyzer(Random);

            mindex.AddField("field", "the quick brown fox", mockAnalyzer);
            mindex.AddField("field", "jumps over the", mockAnalyzer);
            AtomicReader reader = (AtomicReader)mindex.CreateSearcher().IndexReader;

            assertEquals(7, reader.GetTerms("field").SumTotalTermFreq);
            PhraseQuery query = new PhraseQuery();

            query.Add(new Term("field", "fox"));
            query.Add(new Term("field", "jumps"));
            assertTrue(mindex.Search(query) > 0.1);
            mindex.Reset();
            mockAnalyzer.SetPositionIncrementGap(1 + Random.nextInt(10));
            mindex.AddField("field", "the quick brown fox", mockAnalyzer);
            mindex.AddField("field", "jumps over the", mockAnalyzer);
            assertEquals(0, mindex.Search(query), 0.00001f);
            query.Slop = (10);
            assertTrue("posGap" + mockAnalyzer.GetPositionIncrementGap("field"), mindex.Search(query) > 0.0001);
        }
Example #16
        public void TestDocsEnumStart()
        {
            Analyzer    analyzer = new MockAnalyzer(Random);
            MemoryIndex memory   = new MemoryIndex(Random.nextBoolean(), Random.nextInt(50) * 1024 * 1024);

            memory.AddField("foo", "bar", analyzer);
            AtomicReader reader = (AtomicReader)memory.CreateSearcher().IndexReader;
            DocsEnum     disi   = TestUtil.Docs(Random, reader, "foo", new BytesRef("bar"), null, null, DocsFlags.NONE);
            int          docid  = disi.DocID;

            assertEquals(-1, docid);
            assertTrue(disi.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);

            // now reuse and check again
            TermsEnum te = reader.GetTerms("foo").GetEnumerator();

            assertTrue(te.SeekExact(new BytesRef("bar")));
            disi  = te.Docs(null, disi, DocsFlags.NONE);
            docid = disi.DocID;
            assertEquals(-1, docid);
            assertTrue(disi.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            reader.Dispose();
        }
Example #17
        private IDictionary <int, object> HighlightField(string field, string[] contents, BreakIterator bi, BytesRef[] terms, int[] docids, IList <AtomicReaderContext> leaves, int maxPassages, Query query)
        {
            IDictionary <int, object> highlights = new Dictionary <int, object>();

            PassageFormatter fieldFormatter = GetFormatter(field);

            if (fieldFormatter is null)
            {
                // LUCENENET: Changed from NullPointerException to InvalidOperationException (which isn't caught anywhere outside of tests)
                throw IllegalStateException.Create("PassageFormatter cannot be null");
            }

            // check if we should do any multiterm processing
            Analyzer analyzer = GetIndexAnalyzer(field);

            CharacterRunAutomaton[] automata = Arrays.Empty <CharacterRunAutomaton>();
            if (analyzer != null)
            {
                automata = MultiTermHighlighting.ExtractAutomata(query, field);
            }

            // resize 'terms', where the last term is the multiterm matcher
            if (automata.Length > 0)
            {
                BytesRef[] newTerms = new BytesRef[terms.Length + 1];
                System.Array.Copy(terms, 0, newTerms, 0, terms.Length);
                terms = newTerms;
            }

            // we are processing in increasing docid order, so we only need to reinitialize stuff on segment changes
            // otherwise, we will just advance() existing enums to the new document in the same segment.
            DocsAndPositionsEnum[] postings = null;
            TermsEnum termsEnum             = null;
            int       lastLeaf = -1;

            for (int i = 0; i < docids.Length; i++)
            {
                string content = contents[i];
                if (content.Length == 0)
                {
                    continue; // nothing to do
                }
                bi.SetText(content);
                int doc  = docids[i];
                int leaf = ReaderUtil.SubIndex(doc, leaves);
                AtomicReaderContext subContext = leaves[leaf];
                AtomicReader        r          = subContext.AtomicReader;

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(leaf >= lastLeaf);                           // increasing order
                }
                // if the segment has changed, we must initialize new enums.
                if (leaf != lastLeaf)
                {
                    Terms t = r.GetTerms(field);
                    if (t != null)
                    {
                        termsEnum = t.GetEnumerator();
                        postings  = new DocsAndPositionsEnum[terms.Length];
                    }
                }
                if (termsEnum is null)
                {
                    continue; // no terms for this field, nothing to do
                }

                // if there are multi-term matches, we have to initialize the "fake" enum for each document
                if (automata.Length > 0)
                {
                    DocsAndPositionsEnum dp = MultiTermHighlighting.GetDocsEnum(analyzer.GetTokenStream(field, content), automata);
                    dp.Advance(doc - subContext.DocBase);
                    postings[terms.Length - 1] = dp; // last term is the multiterm matcher
                }

                Passage[] passages = HighlightDoc(field, terms, content.Length, bi, doc - subContext.DocBase, termsEnum, postings, maxPassages);

                if (passages.Length == 0)
                {
                    // no passages were returned, so ask for a default summary
                    passages = GetEmptyHighlight(field, bi, maxPassages);
                }

                if (passages.Length > 0)
                {
                    highlights[doc] = fieldFormatter.Format(passages, content);
                }

                lastLeaf = leaf;
            }

            return(highlights);
        }
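The per-segment bookkeeping above hinges on mapping a top-level docid to a leaf reader: ReaderUtil.SubIndex locates the leaf whose DocBase range contains the doc, and subtracting DocBase yields the segment-local id. A hedged sketch of just that mapping:

        // Hedged sketch: top-level docid -> (leaf index, segment-local docid).
        int leaf = ReaderUtil.SubIndex(doc, leaves);   // which segment holds this docid
        AtomicReaderContext ctx = leaves[leaf];
        int localDoc = doc - ctx.DocBase;              // id within that segment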
Example #18
        public void TestRandomIndex()
        {
            Directory dir = NewDirectory();
            MockAnalyzer analyzer = new MockAnalyzer(Random);
            analyzer.MaxTokenLength = TestUtil.NextInt32(Random, 1, IndexWriter.MAX_TERM_LENGTH);
            RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
                this,
#endif
                Random, dir, analyzer);
            CreateRandomIndex(AtLeast(50), w, Random.NextInt64());
            DirectoryReader reader = w.GetReader();
            AtomicReader wrapper = SlowCompositeReaderWrapper.Wrap(reader);
            string field = @"body";
            Terms terms = wrapper.GetTerms(field);
            var lowFreqQueue = new AnonymousPriorityQueue(this, 5);
            Util.PriorityQueue<TermAndFreq> highFreqQueue = new AnonymousPriorityQueue1(this, 5);
            try
            {
                TermsEnum iterator = terms.GetIterator(null);
                while (iterator.Next() != null)
                {
                    if (highFreqQueue.Count < 5)
                    {
                        highFreqQueue.Add(new TermAndFreq(BytesRef.DeepCopyOf(iterator.Term), iterator.DocFreq));
                        lowFreqQueue.Add(new TermAndFreq(BytesRef.DeepCopyOf(iterator.Term), iterator.DocFreq));
                    }
                    else
                    {
                        if (highFreqQueue.Top.freq < iterator.DocFreq)
                        {
                            highFreqQueue.Top.freq = iterator.DocFreq;
                            highFreqQueue.Top.term = BytesRef.DeepCopyOf(iterator.Term);
                            highFreqQueue.UpdateTop();
                        }

                        if (lowFreqQueue.Top.freq > iterator.DocFreq)
                        {
                            lowFreqQueue.Top.freq = iterator.DocFreq;
                            lowFreqQueue.Top.term = BytesRef.DeepCopyOf(iterator.Term);
                            lowFreqQueue.UpdateTop();
                        }
                    }
                }

                int lowFreq = lowFreqQueue.Top.freq;
                int highFreq = highFreqQueue.Top.freq;
                AssumeTrue(@"unlucky index", highFreq - 1 > lowFreq);
                List<TermAndFreq> highTerms = QueueToList(highFreqQueue);
                List<TermAndFreq> lowTerms = QueueToList(lowFreqQueue);
                IndexSearcher searcher = NewSearcher(reader);
                Occur lowFreqOccur = RandomOccur(Random);
                BooleanQuery verifyQuery = new BooleanQuery();
                CommonTermsQuery cq = new CommonTermsQuery(RandomOccur(Random), lowFreqOccur, highFreq - 1, Random.NextBoolean());
                foreach (TermAndFreq termAndFreq in lowTerms)
                {
                    cq.Add(new Term(field, termAndFreq.term));
                    verifyQuery.Add(new BooleanClause(new TermQuery(new Term(field, termAndFreq.term)), lowFreqOccur));
                }

                foreach (TermAndFreq termAndFreq in highTerms)
                {
                    cq.Add(new Term(field, termAndFreq.term));
                }

                TopDocs cqSearch = searcher.Search(cq, reader.MaxDoc);
                TopDocs verifySearch = searcher.Search(verifyQuery, reader.MaxDoc);
                assertEquals(verifySearch.TotalHits, cqSearch.TotalHits);
                var hits = new JCG.HashSet<int>();
                foreach (ScoreDoc doc in verifySearch.ScoreDocs)
                {
                    hits.Add(doc.Doc);
                }

                foreach (ScoreDoc doc in cqSearch.ScoreDocs)
                {
                    assertTrue(hits.Remove(doc.Doc));
                }

                assertTrue(hits.Count == 0);
                w.ForceMerge(1);
                DirectoryReader reader2 = w.GetReader();
                QueryUtils.Check(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
                    this,
#endif
                    Random, cq, NewSearcher(reader2));
                reader2.Dispose();
            }
            finally
            {
                reader.Dispose();
                wrapper.Dispose();
                w.Dispose();
                dir.Dispose();
            }
        }
Example #19
        private void Verify(Directory dir)
        {
            DirectoryReader ir = DirectoryReader.Open(dir);

            foreach (AtomicReaderContext leaf in ir.Leaves)
            {
                AtomicReader leafReader = (AtomicReader)leaf.Reader;
                AssertTerms(leafReader.GetTerms("field1docs"), leafReader.GetTerms("field2freqs"), true);
                AssertTerms(leafReader.GetTerms("field3positions"), leafReader.GetTerms("field4offsets"), true);
                AssertTerms(leafReader.GetTerms("field4offsets"), leafReader.GetTerms("field5payloadsFixed"), true);
                AssertTerms(leafReader.GetTerms("field5payloadsFixed"), leafReader.GetTerms("field6payloadsVariable"), true);
                AssertTerms(leafReader.GetTerms("field6payloadsVariable"), leafReader.GetTerms("field7payloadsFixedOffsets"), true);
                AssertTerms(leafReader.GetTerms("field7payloadsFixedOffsets"), leafReader.GetTerms("field8payloadsVariableOffsets"), true);
            }
            ir.Dispose();
        }
Example #20
        public void TestRandomIndex()
        {
            Directory    dir      = NewDirectory();
            MockAnalyzer analyzer = new MockAnalyzer(Random);

            analyzer.MaxTokenLength = TestUtil.NextInt32(Random, 1, IndexWriter.MAX_TERM_LENGTH);
            RandomIndexWriter w = new RandomIndexWriter(Random, dir, analyzer);

            CreateRandomIndex(AtLeast(50), w, Random.NextInt64());
            DirectoryReader reader        = w.GetReader();
            AtomicReader    wrapper       = SlowCompositeReaderWrapper.Wrap(reader);
            string          field         = @"body";
            Terms           terms         = wrapper.GetTerms(field);
            var             lowFreqQueue  = new PriorityQueueAnonymousClass(5);
            var             highFreqQueue = new PriorityQueueAnonymousClass1(5);

            try
            {
                TermsEnum iterator = terms.GetEnumerator();
                while (iterator.MoveNext())
                {
                    if (highFreqQueue.Count < 5)
                    {
                        highFreqQueue.Add(new TermAndFreq(
                                              BytesRef.DeepCopyOf(iterator.Term), iterator.DocFreq));
                        lowFreqQueue.Add(new TermAndFreq(
                                             BytesRef.DeepCopyOf(iterator.Term), iterator.DocFreq));
                    }
                    else
                    {
                        if (highFreqQueue.Top.freq < iterator.DocFreq)
                        {
                            highFreqQueue.Top.freq = iterator.DocFreq;
                            highFreqQueue.Top.term = BytesRef.DeepCopyOf(iterator.Term);
                            highFreqQueue.UpdateTop();
                        }

                        if (lowFreqQueue.Top.freq > iterator.DocFreq)
                        {
                            lowFreqQueue.Top.freq = iterator.DocFreq;
                            lowFreqQueue.Top.term = BytesRef.DeepCopyOf(iterator.Term);
                            lowFreqQueue.UpdateTop();
                        }
                    }
                }

                int lowFreq  = lowFreqQueue.Top.freq;
                int highFreq = highFreqQueue.Top.freq;
                AssumeTrue(@"unlucky index", highFreq - 1 > lowFreq);
                IList <TermAndFreq> highTerms = QueueToList(highFreqQueue);
                IList <TermAndFreq> lowTerms  = QueueToList(lowFreqQueue);

                IndexSearcher    searcher     = NewSearcher(reader);
                Occur            lowFreqOccur = RandomOccur(Random);
                BooleanQuery     verifyQuery  = new BooleanQuery();
                CommonTermsQuery cq           = new CommonTermsQuery(RandomOccur(Random),
                                                                     lowFreqOccur, highFreq - 1, Random.NextBoolean());
                foreach (TermAndFreq termAndFreq in lowTerms)
                {
                    cq.Add(new Term(field, termAndFreq.term));
                    verifyQuery.Add(new BooleanClause(new TermQuery(new Term(field,
                                                                             termAndFreq.term)), lowFreqOccur));
                }
                foreach (TermAndFreq termAndFreq in highTerms)
                {
                    cq.Add(new Term(field, termAndFreq.term));
                }

                TopDocs cqSearch = searcher.Search(cq, reader.MaxDoc);

                TopDocs verifySearch = searcher.Search(verifyQuery, reader.MaxDoc);
                assertEquals(verifySearch.TotalHits, cqSearch.TotalHits);
                var hits = new JCG.HashSet <int>();
                foreach (ScoreDoc doc in verifySearch.ScoreDocs)
                {
                    hits.Add(doc.Doc);
                }

                foreach (ScoreDoc doc in cqSearch.ScoreDocs)
                {
                    assertTrue(hits.Remove(doc.Doc));
                }

                assertTrue(hits.Count == 0);

                /*
                 *  need to force merge here since QueryUtils adds checks based
                 *  on leave readers which have different statistics than the top
                 *  level reader if we have more than one segment. This could
                 *  result in a different query / results.
                 */
                w.ForceMerge(1);
                DirectoryReader reader2 = w.GetReader();
                QueryUtils.Check(Random, cq, NewSearcher(reader2));
                reader2.Dispose();
            }
            finally
            {
                reader.Dispose();
                wrapper.Dispose();
                w.Dispose();
                dir.Dispose();
            }
        }
Example #21
        private void DuellReaders(CompositeReader other, AtomicReader memIndexReader)
        {
            AtomicReader competitor = SlowCompositeReaderWrapper.Wrap(other);
            Fields       memFields  = memIndexReader.Fields;

            foreach (string field in competitor.Fields)
            {
                Terms memTerms = memFields.GetTerms(field);
                Terms iwTerms  = memIndexReader.GetTerms(field);
                if (iwTerms is null)
                {
                    assertNull(memTerms);
                }
                else
                {
                    NumericDocValues normValues    = competitor.GetNormValues(field);
                    NumericDocValues memNormValues = memIndexReader.GetNormValues(field);
                    if (normValues != null)
                    {
                        // mem idx always computes norms on the fly
                        assertNotNull(memNormValues);
                        assertEquals(normValues.Get(0), memNormValues.Get(0));
                    }

                    assertNotNull(memTerms);
                    assertEquals(iwTerms.DocCount, memTerms.DocCount);
                    assertEquals(iwTerms.SumDocFreq, memTerms.SumDocFreq);
                    assertEquals(iwTerms.SumTotalTermFreq, memTerms.SumTotalTermFreq);
                    TermsEnum iwTermsIter  = iwTerms.GetEnumerator();
                    TermsEnum memTermsIter = memTerms.GetEnumerator();
                    if (iwTerms.HasPositions)
                    {
                        bool offsets = iwTerms.HasOffsets && memTerms.HasOffsets;

                        while (iwTermsIter.MoveNext())
                        {
                            assertTrue(memTermsIter.MoveNext());
                            assertEquals(iwTermsIter.Term, memTermsIter.Term);
                            DocsAndPositionsEnum iwDocsAndPos  = iwTermsIter.DocsAndPositions(null, null);
                            DocsAndPositionsEnum memDocsAndPos = memTermsIter.DocsAndPositions(null, null);
                            while (iwDocsAndPos.NextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS)
                            {
                                assertEquals(iwDocsAndPos.DocID, memDocsAndPos.NextDoc());
                                assertEquals(iwDocsAndPos.Freq, memDocsAndPos.Freq);
                                for (int i = 0; i < iwDocsAndPos.Freq; i++)
                                {
                                    assertEquals("term: " + iwTermsIter.Term.Utf8ToString(), iwDocsAndPos.NextPosition(), memDocsAndPos.NextPosition());
                                    if (offsets)
                                    {
                                        assertEquals(iwDocsAndPos.StartOffset, memDocsAndPos.StartOffset);
                                        assertEquals(iwDocsAndPos.EndOffset, memDocsAndPos.EndOffset);
                                    }
                                }
                            }
                        }
                    }
                    else
                    {
                        while (iwTermsIter.MoveNext())
                        {
                            assertEquals(iwTermsIter.Term, memTermsIter.Term);
                            DocsEnum iwDocsAndPos  = iwTermsIter.Docs(null, null);
                            DocsEnum memDocsAndPos = memTermsIter.Docs(null, null);
                            while (iwDocsAndPos.NextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS)
                            {
                                assertEquals(iwDocsAndPos.DocID, memDocsAndPos.NextDoc());
                                assertEquals(iwDocsAndPos.Freq, memDocsAndPos.Freq);
                            }
                        }
                    }
                }
            }
        }
Example #22
        private IndexIterationContext CreateContext(int nDocs, RandomIndexWriter fromWriter, RandomIndexWriter toWriter,
                                                    bool multipleValuesPerDocument, bool scoreDocsInOrder)
        {
            IndexIterationContext context = new IndexIterationContext();
            int numRandomValues           = nDocs / 2;

            context.RandomUniqueValues = new string[numRandomValues];
            ISet <string> trackSet = new JCG.HashSet <string>();

            context.RandomFrom = new bool[numRandomValues];
            for (int i = 0; i < numRandomValues; i++)
            {
                string uniqueRandomValue;
                do
                {
                    uniqueRandomValue = TestUtil.RandomRealisticUnicodeString(Random);
                    //        uniqueRandomValue = TestUtil.randomSimpleString(random);
                } while ("".Equals(uniqueRandomValue, StringComparison.Ordinal) || trackSet.Contains(uniqueRandomValue));
                // Values must be unique, and empty strings aren't allowed.
                trackSet.Add(uniqueRandomValue);
                context.RandomFrom[i]         = Random.NextBoolean();
                context.RandomUniqueValues[i] = uniqueRandomValue;
            }

            RandomDoc[] docs = new RandomDoc[nDocs];
            for (int i = 0; i < nDocs; i++)
            {
                string   id       = Convert.ToString(i, CultureInfo.InvariantCulture);
                int      randomI  = Random.Next(context.RandomUniqueValues.Length);
                string   value    = context.RandomUniqueValues[randomI];
                Document document = new Document();
                document.Add(NewTextField(Random, "id", id, Field.Store.NO));
                document.Add(NewTextField(Random, "value", value, Field.Store.NO));

                bool from = context.RandomFrom[randomI];
                int  numberOfLinkValues = multipleValuesPerDocument ? 2 + Random.Next(10) : 1;
                docs[i] = new RandomDoc(id, numberOfLinkValues, value, from);
                for (int j = 0; j < numberOfLinkValues; j++)
                {
                    string linkValue = context.RandomUniqueValues[Random.Next(context.RandomUniqueValues.Length)];
                    docs[i].linkValues.Add(linkValue);
                    if (from)
                    {
                        if (!context.FromDocuments.TryGetValue(linkValue, out IList <RandomDoc> fromDocs))
                        {
                            context.FromDocuments[linkValue] = fromDocs = new JCG.List <RandomDoc>();
                        }
                        if (!context.RandomValueFromDocs.TryGetValue(value, out IList <RandomDoc> randomValueFromDocs))
                        {
                            context.RandomValueFromDocs[value] = randomValueFromDocs = new JCG.List <RandomDoc>();
                        }

                        fromDocs.Add(docs[i]);
                        randomValueFromDocs.Add(docs[i]);
                        document.Add(NewTextField(Random, "from", linkValue, Field.Store.NO));
                    }
                    else
                    {
                        if (!context.ToDocuments.TryGetValue(linkValue, out IList <RandomDoc> toDocuments))
                        {
                            context.ToDocuments[linkValue] = toDocuments = new JCG.List <RandomDoc>();
                        }
                        if (!context.RandomValueToDocs.TryGetValue(value, out IList <RandomDoc> randomValueToDocs))
                        {
                            context.RandomValueToDocs[value] = randomValueToDocs = new JCG.List <RandomDoc>();
                        }

                        toDocuments.Add(docs[i]);
                        randomValueToDocs.Add(docs[i]);
                        document.Add(NewTextField(Random, "to", linkValue, Field.Store.NO));
                    }
                }

                RandomIndexWriter w;
                if (from)
                {
                    w = fromWriter;
                }
                else
                {
                    w = toWriter;
                }

                w.AddDocument(document);
                if (Random.Next(10) == 4)
                {
                    w.Commit();
                }
                if (Verbose)
                {
                    Console.WriteLine("Added document[" + docs[i].id + "]: " + document);
                }
            }

            // Pre-compute all possible hits for all unique random values. On top of this also compute all possible score for
            // any ScoreMode.
            IndexSearcher fromSearcher = NewSearcher(fromWriter.GetReader());
            IndexSearcher toSearcher   = NewSearcher(toWriter.GetReader());

            for (int i = 0; i < context.RandomUniqueValues.Length; i++)
            {
                string uniqueRandomValue = context.RandomUniqueValues[i];
                string fromField;
                string toField;
                IDictionary <string, IDictionary <int, JoinScore> > queryVals;
                if (context.RandomFrom[i])
                {
                    fromField = "from";
                    toField   = "to";
                    queryVals = context.FromHitsToJoinScore;
                }
                else
                {
                    fromField = "to";
                    toField   = "from";
                    queryVals = context.ToHitsToJoinScore;
                }
                IDictionary <BytesRef, JoinScore> joinValueToJoinScores = new Dictionary <BytesRef, JoinScore>();
                if (multipleValuesPerDocument)
                {
                    fromSearcher.Search(new TermQuery(new Term("value", uniqueRandomValue)),
                                        new CollectorAnonymousClass3(fromField, joinValueToJoinScores));
                }
                else
                {
                    fromSearcher.Search(new TermQuery(new Term("value", uniqueRandomValue)),
                                        new CollectorAnonymousClass4(fromField, joinValueToJoinScores));
                }

                IDictionary <int, JoinScore> docToJoinScore = new Dictionary <int, JoinScore>();
                if (multipleValuesPerDocument)
                {
                    if (scoreDocsInOrder)
                    {
                        AtomicReader slowCompositeReader = SlowCompositeReaderWrapper.Wrap(toSearcher.IndexReader);
                        Terms        terms = slowCompositeReader.GetTerms(toField);
                        if (terms != null)
                        {
                            DocsEnum  docsEnum  = null;
                            TermsEnum termsEnum = null;
                            JCG.SortedSet <BytesRef> joinValues =
                                new JCG.SortedSet <BytesRef>(BytesRef.UTF8SortedAsUnicodeComparer);
                            joinValues.UnionWith(joinValueToJoinScores.Keys);
                            foreach (BytesRef joinValue in joinValues)
                            {
                                termsEnum = terms.GetEnumerator(termsEnum);
                                if (termsEnum.SeekExact(joinValue))
                                {
                                    docsEnum = termsEnum.Docs(slowCompositeReader.LiveDocs, docsEnum, DocsFlags.NONE);
                                    JoinScore joinScore = joinValueToJoinScores[joinValue];

                                    for (int doc = docsEnum.NextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docsEnum.NextDoc())
                                    {
                                        // First encountered join value determines the score.
                                        // Something to keep in mind for many-to-many relations.
                                        if (!docToJoinScore.ContainsKey(doc))
                                        {
                                            docToJoinScore[doc] = joinScore;
                                        }
                                    }
                                }
                            }
                        }
                    }
                    else
                    {
                        toSearcher.Search(new MatchAllDocsQuery(),
                                          new CollectorAnonymousClass5(toField, joinValueToJoinScores,
                                                                       docToJoinScore));
                    }
                }
                else
                {
                    toSearcher.Search(new MatchAllDocsQuery(),
                                      new CollectorAnonymousClass6(toField, joinValueToJoinScores,
                                                                   docToJoinScore));
                }
                queryVals[uniqueRandomValue] = docToJoinScore;
            }

            fromSearcher.IndexReader.Dispose();
            toSearcher.IndexReader.Dispose();

            return(context);
        }
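The scoreDocsInOrder branch walks join values in BytesRef order and probes each one with SeekExact before pulling live-doc-filtered postings; reusing both enums across probes keeps the loop allocation-free. A minimal hedged sketch of that probe:

        // Hedged sketch: look up one term and iterate its live documents.
        termsEnum = terms.GetEnumerator(termsEnum);    // reuse across seeks
        if (termsEnum.SeekExact(joinValue))            // true only if the term exists
        {
            docsEnum = termsEnum.Docs(reader.LiveDocs, docsEnum, DocsFlags.NONE);
            for (int d = docsEnum.NextDoc(); d != DocIdSetIterator.NO_MORE_DOCS; d = docsEnum.NextDoc())
            {
                // the first join value seen for a doc wins (see the comment above)
            }
        }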
Example #23
            public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs)
            {
                Debug.Assert(outerInstance.termArrays.Count > 0);
                AtomicReader reader   = (context.AtomicReader);
                IBits        liveDocs = acceptDocs;

                PhraseQuery.PostingsAndFreq[] postingsFreqs = new PhraseQuery.PostingsAndFreq[outerInstance.termArrays.Count];

                Terms fieldTerms = reader.GetTerms(outerInstance.field);

                if (fieldTerms == null)
                {
                    return(null);
                }

                // Reuse single TermsEnum below:
                TermsEnum termsEnum = fieldTerms.GetIterator(null);

                for (int pos = 0; pos < postingsFreqs.Length; pos++)
                {
                    Term[] terms = outerInstance.termArrays[pos];

                    DocsAndPositionsEnum postingsEnum;
                    int docFreq;

                    if (terms.Length > 1)
                    {
                        postingsEnum = new UnionDocsAndPositionsEnum(liveDocs, context, terms, termContexts, termsEnum);

                        // coarse -- this overcounts since a given doc can
                        // have more than one term:
                        docFreq = 0;
                        for (int termIdx = 0; termIdx < terms.Length; termIdx++)
                        {
                            Term      term      = terms[termIdx];
                            TermState termState = termContexts[term].Get(context.Ord);
                            if (termState == null)
                            {
                                // Term not in reader
                                continue;
                            }
                            termsEnum.SeekExact(term.Bytes, termState);
                            docFreq += termsEnum.DocFreq;
                        }

                        if (docFreq == 0)
                        {
                            // None of the terms are in this reader
                            return(null);
                        }
                    }
                    else
                    {
                        Term      term      = terms[0];
                        TermState termState = termContexts[term].Get(context.Ord);
                        if (termState == null)
                        {
                            // Term not in reader
                            return(null);
                        }
                        termsEnum.SeekExact(term.Bytes, termState);
                        postingsEnum = termsEnum.DocsAndPositions(liveDocs, null, DocsAndPositionsFlags.NONE);

                        if (postingsEnum == null)
                        {
                            // term does exist, but has no positions
                            Debug.Assert(termsEnum.Docs(liveDocs, null, DocsFlags.NONE) != null, "termstate found but no term exists in reader");
                            throw new InvalidOperationException("field \"" + term.Field + "\" was indexed without position data; cannot run PhraseQuery (term=" + term.Text() + ")");
                        }

                        docFreq = termsEnum.DocFreq;
                    }

                    postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, (int)outerInstance.positions[pos], terms);
                }

                // sort by increasing docFreq order
                if (outerInstance.slop == 0)
                {
                    ArrayUtil.TimSort(postingsFreqs);
                }

                if (outerInstance.slop == 0)
                {
                    ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.GetSimScorer(stats, context));
                    if (s.noDocs)
                    {
                        return(null);
                    }
                    else
                    {
                        return(s);
                    }
                }
                else
                {
                    return(new SloppyPhraseScorer(this, postingsFreqs, outerInstance.slop, similarity.GetSimScorer(stats, context)));
                }
            }
Example #24
        public virtual void Load(string fieldName, AtomicReader reader, TermListFactory listFactory)
        {
#if FEATURE_STRING_INTERN
            string field = string.Intern(fieldName);
#else
            string field = fieldName;
#endif
            int maxDoc = reader.MaxDoc;

            int dictValueCount      = GetDictValueCount(reader, fieldName);
            BigSegmentedArray order = NewInstance(dictValueCount, maxDoc);

            this.m_orderArray = order;

            List <int> minIDList = new List <int>();
            List <int> maxIDList = new List <int>();
            List <int> freqList  = new List <int>();

            int            length             = maxDoc + 1;
            ITermValueList list               = listFactory == null ? (ITermValueList) new TermStringList() : listFactory.CreateTermList();
            int            negativeValueCount = GetNegativeValueCount(reader, field);

            int t = 1; // valid term id starts from 1

            list.Add(null);
            minIDList.Add(-1);
            maxIDList.Add(-1);
            freqList.Add(0);
            int   totalFreq = 0;
            Terms terms     = reader.GetTerms(field);
            if (terms != null)
            {
                TermsEnum termsEnum = terms.GetIterator(null);
                BytesRef  text;
                while ((text = termsEnum.Next()) != null)
                {
                    // store term text
                    // we expect that there is at most one term per document
                    if (t >= length)
                    {
                        throw new RuntimeException("there are more terms than "
                                                   + "documents in field \"" + field + "\", but it's impossible to sort on "
                                                   + "tokenized fields");
                    }
                    string strText = text.Utf8ToString();
                    list.Add(strText);
                    Term     term     = new Term(field, strText);
                    DocsEnum docsEnum = reader.GetTermDocsEnum(term);
                    // freqList.add(termEnum.docFreq()); // doesn't take into account
                    // deldocs
                    int minID = -1;
                    int maxID = -1;
                    int docID = -1;
                    int df    = 0;
                    int valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
                    while ((docID = docsEnum.NextDoc()) != DocsEnum.NO_MORE_DOCS)
                    {
                        df++;
                        order.Add(docID, valId);
                        minID = docID;
                        while (docsEnum.NextDoc() != DocsEnum.NO_MORE_DOCS)
                        {
                            docID = docsEnum.DocID;
                            df++;
                            order.Add(docID, valId);
                        }
                        maxID = docID;
                    }
                    freqList.Add(df);
                    totalFreq += df;
                    minIDList.Add(minID);
                    maxIDList.Add(maxID);
                    t++;
                }
            }

            list.Seal();
            this.m_valArray = list;
            this.m_freqs    = freqList.ToArray();
            this.m_minIDs   = minIDList.ToArray();
            this.m_maxIDs   = maxIDList.ToArray();

            int doc = 0;
            while (doc < maxDoc && order.Get(doc) != 0)
            {
                ++doc;
            }
            if (doc < maxDoc)
            {
                this.m_minIDs[0] = doc;
                // Try to get the max
                doc = maxDoc - 1;
                while (doc >= 0 && order.Get(doc) != 0)
                {
                    --doc;
                }
                this.m_maxIDs[0] = doc;
            }
            this.m_freqs[0] = reader.NumDocs - totalFreq;
        }
Example #25
        /// <summary>
        /// loads multi-value facet data. This method uses a workarea to prepare loading.
        /// </summary>
        /// <param name="fieldName"></param>
        /// <param name="reader"></param>
        /// <param name="listFactory"></param>
        /// <param name="workArea"></param>
        public virtual void Load(string fieldName, AtomicReader reader, TermListFactory listFactory, BoboSegmentReader.WorkArea workArea)
        {
#if FEATURE_STRING_INTERN
            string field = string.Intern(fieldName);
#else
            string field = fieldName;
#endif
            int maxdoc = reader.MaxDoc;
            BigNestedInt32Array.BufferedLoader loader = GetBufferedLoader(maxdoc, workArea);

            ITermValueList list               = (listFactory == null ? (ITermValueList) new TermStringList() : listFactory.CreateTermList());
            List <int>     minIDList          = new List <int>();
            List <int>     maxIDList          = new List <int>();
            List <int>     freqList           = new List <int>();
            OpenBitSet     bitset             = new OpenBitSet(maxdoc + 1);
            int            negativeValueCount = GetNegativeValueCount(reader, field);
            int            t = 1; // valid term id starts from 1
            list.Add(null);
            minIDList.Add(-1);
            maxIDList.Add(-1);
            freqList.Add(0);

            m_overflow = false;
            Terms terms = reader.GetTerms(field);
            if (terms != null)
            {
                TermsEnum termsEnum = terms.GetIterator(null);
                BytesRef  text;
                while ((text = termsEnum.Next()) != null)
                {
                    string strText = text.Utf8ToString();
                    list.Add(strText);

                    Term     term     = new Term(field, strText);
                    DocsEnum docsEnum = reader.GetTermDocsEnum(term);
                    int      df       = 0;
                    int      minID    = -1;
                    int      maxID    = -1;
                    int      docID    = -1;
                    int      valId    = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
                    while ((docID = docsEnum.NextDoc()) != DocsEnum.NO_MORE_DOCS)
                    {
                        df++;
                        if (!loader.Add(docID, valId))
                        {
                            LogOverflow(fieldName);
                        }
                        minID = docID;
                        bitset.FastSet(docID);
                        while (docsEnum.NextDoc() != DocsEnum.NO_MORE_DOCS)
                        {
                            docID = docsEnum.DocID;
                            df++;
                            if (!loader.Add(docID, valId))
                            {
                                LogOverflow(fieldName);
                            }
                            bitset.FastSet(docID);
                        }
                        maxID = docID;
                    }
                    freqList.Add(df);
                    minIDList.Add(minID);
                    maxIDList.Add(maxID);
                    t++;
                }
            }

            list.Seal();

            try
            {
                m_nestedArray.Load(maxdoc + 1, loader);
            }
            catch (Exception e)
            {
                throw new RuntimeException("failed to load due to " + e.ToString(), e);
            }

            this.m_valArray = list;
            this.m_freqs    = freqList.ToArray();
            this.m_minIDs   = minIDList.ToArray();
            this.m_maxIDs   = maxIDList.ToArray();

            int doc = 0;
            while (doc < maxdoc && !m_nestedArray.Contains(doc, 0, true))
            {
                ++doc;
            }
            if (doc < maxdoc)
            {
                this.m_minIDs[0] = doc;
                doc = maxdoc - 1;
                while (doc >= 0 && !m_nestedArray.Contains(doc, 0, true))
                {
                    --doc;
                }
                this.m_maxIDs[0] = doc;
            }
            this.m_freqs[0] = maxdoc - (int)bitset.Cardinality();
        }
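Note how slot 0 of m_freqs/m_minIDs/m_maxIDs serves as the "no value" bucket: every document that matched at least one term was marked in bitset, so the unmarked documents are exactly the ones with no value, and m_freqs[0] falls out as maxdoc minus the bitset cardinality. A minimal sketch of that invariant, assuming Lucene.Net.Util.OpenBitSet as used above:

using Lucene.Net.Util;

class MissingCountSketch
{
    // Documents never set in the bitset have no value for the field,
    // so the missing count is total docs minus marked docs.
    public static int CountMissing(OpenBitSet matched, int maxDoc)
    {
        return maxDoc - (int)matched.Cardinality();
    }
}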
Example #26
            public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs)
            {
                Debug.Assert(outerInstance.terms.Count > 0);
                AtomicReader reader   = context.AtomicReader;
                IBits        liveDocs = acceptDocs;

                PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[outerInstance.terms.Count];

                Terms fieldTerms = reader.GetTerms(outerInstance.field);

                if (fieldTerms == null)
                {
                    return null;
                }

                // Reuse single TermsEnum below:
                TermsEnum te = fieldTerms.GetIterator(null);

                for (int i = 0; i < outerInstance.terms.Count; i++)
                {
                    Term      t     = outerInstance.terms[i];
                    TermState state = states[i].Get(context.Ord);
                    if (state == null) // term doesn't exist in this segment
                    {
                        Debug.Assert(TermNotInReader(reader, t), "no termstate found but term exists in reader");
                        return null;
                    }
                    te.SeekExact(t.Bytes, state);
                    DocsAndPositionsEnum postingsEnum = te.DocsAndPositions(liveDocs, null, DocsAndPositionsFlags.NONE);

                    // PhraseQuery on a field that did not index
                    // positions.
                    if (postingsEnum == null)
                    {
                        Debug.Assert(te.SeekExact(t.Bytes), "termstate found but no term exists in reader");
                        // term does exist, but has no positions
                        throw new InvalidOperationException("field \"" + t.Field + "\" was indexed without position data; cannot run PhraseQuery (term=" + t.Text() + ")");
                    }
                    postingsFreqs[i] = new PostingsAndFreq(postingsEnum, te.DocFreq, (int)outerInstance.positions[i], t);
                }

                // sort by increasing docFreq order
                if (outerInstance.slop == 0)
                {
                    ArrayUtil.TimSort(postingsFreqs);
                }

                if (outerInstance.slop == 0) // optimize exact case
                {
                    ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.GetSimScorer(stats, context));
                    if (s.noDocs)
                    {
                        return null;
                    }
                    else
                    {
                        return s;
                    }
                }
                else
                {
                    return new SloppyPhraseScorer(this, postingsFreqs, outerInstance.slop, similarity.GetSimScorer(stats, context));
                }
            }
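The interesting move in this scorer factory is the TermState reuse: states[i].Get(context.Ord) returns a per-segment TermState cached when the query weight was built, so te.SeekExact(t.Bytes, state) repositions the enum without another term-dictionary lookup. A minimal sketch of that idiom (a hypothetical helper, not the PhraseQuery code itself):

using Lucene.Net.Index;
using Lucene.Net.Util;

class TermStateReuseSketch
{
    // Look the term up once and remember where it lives in this segment.
    public static TermState CacheState(Terms terms, BytesRef term)
    {
        TermsEnum te = terms.GetIterator(null);
        return te.SeekExact(term) ? te.GetTermState() : null;
    }

    // Later, reposition a fresh enum directly from the cached state;
    // no binary search over the term dictionary is needed.
    public static DocsAndPositionsEnum Positions(Terms terms, BytesRef term, TermState cached)
    {
        TermsEnum te = terms.GetIterator(null);
        te.SeekExact(term, cached);
        return te.DocsAndPositions(null, null); // null if the field has no positions
    }
}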
Example #27
        public override void Load(string fieldName, AtomicReader reader, TermListFactory listFactory, BoboSegmentReader.WorkArea workArea)
        {
#if FEATURE_STRING_INTERN
            string field = string.Intern(fieldName);
#else
            string field = fieldName;
#endif
            int maxdoc = reader.MaxDoc;
            BigNestedInt32Array.BufferedLoader loader       = GetBufferedLoader(maxdoc, workArea);
            BigNestedInt32Array.BufferedLoader weightLoader = GetBufferedLoader(maxdoc, null);

            var        list               = (listFactory == null ? new TermStringList() : listFactory.CreateTermList());
            List<int>  minIDList          = new List<int>();
            List<int>  maxIDList          = new List<int>();
            List<int>  freqList           = new List<int>();
            OpenBitSet bitset             = new OpenBitSet(maxdoc + 1);
            int        negativeValueCount = GetNegativeValueCount(reader, field);
            int        t = 1; // valid term id starts from 1
            list.Add(null);
            minIDList.Add(-1);
            maxIDList.Add(-1);
            freqList.Add(0);

            m_overflow = false;

            string pre = null;

            int df    = 0;
            int minID = -1;
            int maxID = -1;
            int docID = -1;
            int valId = 0;

            Terms terms = reader.GetTerms(field);
            if (terms != null)
            {
                TermsEnum termsEnum = terms.GetIterator(null);
                BytesRef  text;
                while ((text = termsEnum.Next()) != null)
                {
                    string   strText = text.Utf8ToString();
                    string   val     = null;
                    int      weight  = 0;
                    string[] split   = strText.Split(new char[] { '\0' }, StringSplitOptions.RemoveEmptyEntries);
                    if (split.Length > 1)
                    {
                        val    = split[0];
                        weight = int.Parse(split[split.Length - 1], CultureInfo.InvariantCulture);
                    }
                    else
                    {
                        continue;
                    }

                    if (pre == null || !val.Equals(pre))
                    {
                        if (pre != null)
                        {
                            freqList.Add(df);
                            minIDList.Add(minID);
                            maxIDList.Add(maxID);
                        }
                        list.Add(val);
                        df    = 0;
                        minID = -1;
                        maxID = -1;
                        valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
                        t++;
                    }

                    Term     term     = new Term(field, strText);
                    DocsEnum docsEnum = reader.GetTermDocsEnum(term);
                    if (docsEnum != null)
                    {
                        while ((docID = docsEnum.NextDoc()) != DocsEnum.NO_MORE_DOCS)
                        {
                            df++;

                            if (!loader.Add(docID, valId))
                            {
                                LogOverflow(fieldName);
                            }
                            else
                            {
                                weightLoader.Add(docID, weight);
                            }

                            // minID starts at -1, so guard the first assignment;
                            // otherwise docID < minID can never be true
                            if (minID == -1 || docID < minID)
                            {
                                minID = docID;
                            }
                            bitset.FastSet(docID);
                            while (docsEnum.NextDoc() != DocsEnum.NO_MORE_DOCS)
                            {
                                docID = docsEnum.DocID;
                                df++;
                                if (!loader.Add(docID, valId))
                                {
                                    LogOverflow(fieldName);
                                }
                                else
                                {
                                    weightLoader.Add(docID, weight);
                                }
                                bitset.FastSet(docID);
                            }
                            if (docID > maxID)
                            {
                                maxID = docID;
                            }
                        }
                    }
                    pre = val;
                }
                if (pre != null)
                {
                    freqList.Add(df);
                    minIDList.Add(minID);
                    maxIDList.Add(maxID);
                }
            }

            list.Seal();

            try
            {
                m_nestedArray.Load(maxdoc + 1, loader);
                m_weightArray.Load(maxdoc + 1, weightLoader);
            }
            catch (Exception e)
            {
                throw new RuntimeException("failed to load due to " + e.ToString(), e);
            }

            this.m_valArray = list;
            this.m_freqs    = freqList.ToArray();
            this.m_minIDs   = minIDList.ToArray();
            this.m_maxIDs   = maxIDList.ToArray();

            int doc = 0;
            while (doc < maxdoc && !m_nestedArray.Contains(doc, 0, true))
            {
                ++doc;
            }
            if (doc < maxdoc)
            {
                this.m_minIDs[0] = doc;
                doc = maxdoc - 1;
                while (doc >= 0 && !m_nestedArray.Contains(doc, 0, true))
                {
                    --doc;
                }
                this.m_maxIDs[0] = doc;
            }
            this.m_freqs[0] = maxdoc - (int)bitset.Cardinality();
        }
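The Split call above implies a term layout of "<value>\0<weight>": value and weight are packed into a single indexed term, so all terms for the same value sort adjacently and the loader can detect value boundaries by comparing the decoded value with the previous one (pre). A minimal sketch of that packing, with hypothetical Encode/Decode helpers that mirror what the loader expects:

using System;
using System.Globalization;

class WeightedTermSketch
{
    // Pack a value and its weight into one term, separated by '\0'.
    public static string Encode(string val, int weight)
    {
        return val + "\0" + weight.ToString(CultureInfo.InvariantCulture);
    }

    // Split a packed term back into its value and weight parts.
    public static void Decode(string term, out string val, out int weight)
    {
        string[] split = term.Split(new[] { '\0' }, StringSplitOptions.RemoveEmptyEntries);
        val = split[0];
        weight = int.Parse(split[split.Length - 1], CultureInfo.InvariantCulture);
    }
}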