Ejemplo n.º 1
0
        public virtual void TestDocsAndPositionsEnum()
        {
            TermsEnum termsEnum = reader.Terms(DOC_POSITIONS_FIELD).Iterator(null);

            assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.SeekCeil(new BytesRef(DOC_POSITIONS_TERM)));
            DocsAndPositionsEnum sortedPositions = termsEnum.DocsAndPositions(null, null);
            int doc;

            // test nextDoc()
            while ((doc = sortedPositions.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
            {
                int freq = sortedPositions.Freq();
                assertEquals("incorrect freq for doc=" + doc, sortedValues[doc] / 10 + 1, freq);
                for (int i = 0; i < freq; i++)
                {
                    assertEquals("incorrect position for doc=" + doc, i, sortedPositions.NextPosition());
                    if (!DoesntSupportOffsets.contains(TestUtil.GetPostingsFormat(DOC_POSITIONS_FIELD)))
                    {
                        assertEquals("incorrect startOffset for doc=" + doc, i, sortedPositions.StartOffset());
                        assertEquals("incorrect endOffset for doc=" + doc, i, sortedPositions.EndOffset());
                    }
                    assertEquals("incorrect payload for doc=" + doc, freq - i, int.Parse(sortedPositions.Payload.Utf8ToString(), CultureInfo.InvariantCulture));
                }
            }

            // test advance()
            DocsAndPositionsEnum reuse = sortedPositions;

            sortedPositions = termsEnum.DocsAndPositions(null, reuse);
            if (sortedPositions is SortingAtomicReader.SortingDocsAndPositionsEnum)
            {
                assertTrue(((SortingAtomicReader.SortingDocsAndPositionsEnum)sortedPositions).Reused(reuse)); // make sure reuse worked
            }
            doc = 0;
            while ((doc = sortedPositions.Advance(doc + TestUtil.NextInt(Random(), 1, 5))) != DocIdSetIterator.NO_MORE_DOCS)
            {
                int freq = sortedPositions.Freq();
                assertEquals("incorrect freq for doc=" + doc, sortedValues[doc] / 10 + 1, freq);
                for (int i = 0; i < freq; i++)
                {
                    assertEquals("incorrect position for doc=" + doc, i, sortedPositions.NextPosition());
                    if (!DoesntSupportOffsets.contains(TestUtil.GetPostingsFormat(DOC_POSITIONS_FIELD)))
                    {
                        assertEquals("incorrect startOffset for doc=" + doc, i, sortedPositions.StartOffset());
                        assertEquals("incorrect endOffset for doc=" + doc, i, sortedPositions.EndOffset());
                    }
                    assertEquals("incorrect payload for doc=" + doc, freq - i, int.Parse(sortedPositions.Payload.Utf8ToString(), CultureInfo.InvariantCulture));
                }
            }
        }
Ejemplo n.º 2
0
        public void RestDocsAndPositionsEnumStart()
        {
            Analyzer    analyzer = new MockAnalyzer(Random());
            int         numIters = AtLeast(3);
            MemoryIndex memory   = new MemoryIndex(true, Random().nextInt(50) * 1024 * 1024);

            for (int i = 0; i < numIters; i++)
            { // check reuse
                memory.AddField("foo", "bar", analyzer);
                AtomicReader reader = (AtomicReader)memory.CreateSearcher().IndexReader;
                assertEquals(1, reader.Terms("foo").SumTotalTermFreq);
                DocsAndPositionsEnum disi = reader.TermPositionsEnum(new Term("foo", "bar"));
                int docid = disi.DocID();
                assertEquals(-1, docid);
                assertTrue(disi.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                assertEquals(0, disi.NextPosition());
                assertEquals(0, disi.StartOffset());
                assertEquals(3, disi.EndOffset());

                // now reuse and check again
                TermsEnum te = reader.Terms("foo").Iterator(null);
                assertTrue(te.SeekExact(new BytesRef("bar")));
                disi  = te.DocsAndPositions(null, disi);
                docid = disi.DocID();
                assertEquals(-1, docid);
                assertTrue(disi.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                reader.Dispose();
                memory.Reset();
            }
        }
Ejemplo n.º 3
0
            internal virtual void AddPositions(DocsAndPositionsEnum @in, IndexOutput @out)
            {
                int freq = @in.Freq();

                @out.WriteVInt(freq);
                int previousPosition  = 0;
                int previousEndOffset = 0;

                for (int i = 0; i < freq; i++)
                {
                    int      pos     = @in.NextPosition();
                    BytesRef payload = @in.Payload;
                    // The low-order bit of token is set only if there is a payload, the
                    // previous bits are the delta-encoded position.
                    int token = (pos - previousPosition) << 1 | (payload == null ? 0 : 1);
                    @out.WriteVInt(token);
                    previousPosition = pos;
                    if (storeOffsets) // don't encode offsets if they are not stored
                    {
                        int startOffset = @in.StartOffset();
                        int endOffset   = @in.EndOffset();
                        @out.WriteVInt(startOffset - previousEndOffset);
                        @out.WriteVInt(endOffset - startOffset);
                        previousEndOffset = endOffset;
                    }
                    if (payload != null)
                    {
                        @out.WriteVInt(payload.Length);
                        @out.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                    }
                }
            }
Ejemplo n.º 4
0
        public virtual void TestEndOffsetPositionWithTeeSinkTokenFilter()
        {
            Store.Directory    dir         = NewDirectory();
            Analyzer           analyzer    = new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false);
            IndexWriter        w           = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
            Document           doc         = new Document();
            TokenStream        tokenStream = analyzer.TokenStream("field", "abcd   ");
            TeeSinkTokenFilter tee         = new TeeSinkTokenFilter(tokenStream);
            TokenStream        sink        = tee.NewSinkTokenStream();
            FieldType          ft          = new FieldType(TextField.TYPE_NOT_STORED);

            ft.StoreTermVectors         = true;
            ft.StoreTermVectorOffsets   = true;
            ft.StoreTermVectorPositions = true;
            Field f1 = new Field("field", tee, ft);
            Field f2 = new Field("field", sink, ft);

            doc.Add(f1);
            doc.Add(f2);
            w.AddDocument(doc);
            w.Dispose();

            IndexReader r      = DirectoryReader.Open(dir);
            Terms       vector = r.GetTermVectors(0).Terms("field");

            assertEquals(1, vector.Size());
            TermsEnum termsEnum = vector.Iterator(null);

            termsEnum.Next();
            assertEquals(2, termsEnum.TotalTermFreq());
            DocsAndPositionsEnum positions = termsEnum.DocsAndPositions(null, null);

            assertTrue(positions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            assertEquals(2, positions.Freq());
            positions.NextPosition();
            assertEquals(0, positions.StartOffset());
            assertEquals(4, positions.EndOffset());
            positions.NextPosition();
            assertEquals(8, positions.StartOffset());
            assertEquals(12, positions.EndOffset());
            assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.NextDoc());
            r.Dispose();
            dir.Dispose();
        }
Ejemplo n.º 5
0
        public virtual void TestChangeGaps()
        {
            // LUCENE-5324: check that it is possible to change the wrapper's gaps
            int      positionGap = Random().Next(1000);
            int      offsetGap   = Random().Next(1000);
            Analyzer @delegate   = new MockAnalyzer(Random());
            Analyzer a           = new AnalyzerWrapperAnonymousInnerClassHelper2(this, @delegate.Strategy, positionGap, offsetGap, @delegate);

            RandomIndexWriter writer = new RandomIndexWriter(Random(), NewDirectory());
            Document          doc    = new Document();
            FieldType         ft     = new FieldType();

            ft.Indexed                  = true;
            ft.IndexOptions             = FieldInfo.IndexOptions.DOCS_ONLY;
            ft.Tokenized                = true;
            ft.StoreTermVectors         = true;
            ft.StoreTermVectorPositions = true;
            ft.StoreTermVectorOffsets   = true;
            doc.Add(new Field("f", "a", ft));
            doc.Add(new Field("f", "a", ft));
            writer.AddDocument(doc, a);
            AtomicReader reader = GetOnlySegmentReader(writer.Reader);
            Fields       fields = reader.GetTermVectors(0);
            Terms        terms  = fields.Terms("f");
            TermsEnum    te     = terms.Iterator(null);

            Assert.AreEqual(new BytesRef("a"), te.Next());
            DocsAndPositionsEnum dpe = te.DocsAndPositions(null, null);

            Assert.AreEqual(0, dpe.NextDoc());
            Assert.AreEqual(2, dpe.Freq());
            Assert.AreEqual(0, dpe.NextPosition());
            Assert.AreEqual(0, dpe.StartOffset());
            int endOffset = dpe.EndOffset();

            Assert.AreEqual(1 + positionGap, dpe.NextPosition());
            Assert.AreEqual(1 + endOffset + offsetGap, dpe.EndOffset());
            Assert.AreEqual(null, te.Next());
            reader.Dispose();
            writer.Dispose();
            writer.w.Directory.Dispose();
        }
Ejemplo n.º 6
0
        protected void CompareTermVectors(Terms terms, Terms memTerms, string field_name)
        {
            TermsEnum termEnum    = terms.Iterator(null);
            TermsEnum memTermEnum = memTerms.Iterator(null);

            while (termEnum.Next() != null)
            {
                assertNotNull(memTermEnum.Next());

                assertEquals(termEnum.TotalTermFreq(), memTermEnum.TotalTermFreq());

                DocsAndPositionsEnum docsPosEnum    = termEnum.DocsAndPositions(null, null, 0);
                DocsAndPositionsEnum memDocsPosEnum = memTermEnum.DocsAndPositions(null, null, 0);
                String currentTerm = termEnum.Term().Utf8ToString();


                assertEquals("Token mismatch for field: " + field_name, currentTerm, memTermEnum.Term().Utf8ToString());

                docsPosEnum.NextDoc();
                memDocsPosEnum.NextDoc();

                int freq = docsPosEnum.Freq();
                assertEquals(freq, memDocsPosEnum.Freq());
                for (int i = 0; i < freq; i++)
                {
                    string failDesc = " (field:" + field_name + " term:" + currentTerm + ")";
                    int    memPos   = memDocsPosEnum.NextPosition();
                    int    pos      = docsPosEnum.NextPosition();
                    assertEquals("Position test failed" + failDesc, memPos, pos);
                    assertEquals("Start offset test failed" + failDesc, memDocsPosEnum.StartOffset(), docsPosEnum.StartOffset());
                    assertEquals("End offset test failed" + failDesc, memDocsPosEnum.EndOffset(), docsPosEnum.EndOffset());
                    assertEquals("Missing payload test failed" + failDesc, docsPosEnum.Payload, null);
                }
            }
            assertNull("Still some tokens not processed", memTermEnum.Next());
        }
Ejemplo n.º 7
0
        public virtual void TestMixedVectrosVectors()
        {
            RandomIndexWriter writer = new RandomIndexWriter(Random(), Directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.SIMPLE, true)).SetOpenMode(OpenMode.CREATE));
            Document          doc    = new Document();

            FieldType ft2 = new FieldType(TextField.TYPE_STORED);

            ft2.StoreTermVectors = true;

            FieldType ft3 = new FieldType(TextField.TYPE_STORED);

            ft3.StoreTermVectors         = true;
            ft3.StoreTermVectorPositions = true;

            FieldType ft4 = new FieldType(TextField.TYPE_STORED);

            ft4.StoreTermVectors       = true;
            ft4.StoreTermVectorOffsets = true;

            FieldType ft5 = new FieldType(TextField.TYPE_STORED);

            ft5.StoreTermVectors         = true;
            ft5.StoreTermVectorOffsets   = true;
            ft5.StoreTermVectorPositions = true;

            doc.Add(NewTextField("field", "one", Field.Store.YES));
            doc.Add(NewField("field", "one", ft2));
            doc.Add(NewField("field", "one", ft3));
            doc.Add(NewField("field", "one", ft4));
            doc.Add(NewField("field", "one", ft5));
            writer.AddDocument(doc);
            IndexReader reader = writer.Reader;

            writer.Dispose();

            IndexSearcher searcher = NewSearcher(reader);

            Query query = new TermQuery(new Term("field", "one"));

            ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs;
            Assert.AreEqual(1, hits.Length);

            Fields vectors = searcher.IndexReader.GetTermVectors(hits[0].Doc);

            Assert.IsNotNull(vectors);
            Assert.AreEqual(1, vectors.Size);
            Terms vector = vectors.Terms("field");

            Assert.IsNotNull(vector);
            Assert.AreEqual(1, vector.Size());
            TermsEnum termsEnum = vector.Iterator(null);

            Assert.IsNotNull(termsEnum.Next());
            Assert.AreEqual("one", termsEnum.Term().Utf8ToString());
            Assert.AreEqual(5, termsEnum.TotalTermFreq());
            DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null);

            Assert.IsNotNull(dpEnum);
            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            Assert.AreEqual(5, dpEnum.Freq());
            for (int i = 0; i < 5; i++)
            {
                Assert.AreEqual(i, dpEnum.NextPosition());
            }

            dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
            Assert.IsNotNull(dpEnum);
            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            Assert.AreEqual(5, dpEnum.Freq());
            for (int i = 0; i < 5; i++)
            {
                dpEnum.NextPosition();
                Assert.AreEqual(4 * i, dpEnum.StartOffset());
                Assert.AreEqual(4 * i + 3, dpEnum.EndOffset());
            }
            reader.Dispose();
        }
Ejemplo n.º 8
0
        private void DuellReaders(CompositeReader other, AtomicReader memIndexReader)
        {
            AtomicReader competitor = SlowCompositeReaderWrapper.Wrap(other);
            Fields       memFields  = memIndexReader.Fields;

            foreach (string field in competitor.Fields)
            {
                Terms memTerms = memFields.Terms(field);
                Terms iwTerms  = memIndexReader.Terms(field);
                if (iwTerms == null)
                {
                    assertNull(memTerms);
                }
                else
                {
                    NumericDocValues normValues    = competitor.GetNormValues(field);
                    NumericDocValues memNormValues = memIndexReader.GetNormValues(field);
                    if (normValues != null)
                    {
                        // mem idx always computes norms on the fly
                        assertNotNull(memNormValues);
                        assertEquals(normValues.Get(0), memNormValues.Get(0));
                    }

                    assertNotNull(memTerms);
                    assertEquals(iwTerms.DocCount, memTerms.DocCount);
                    assertEquals(iwTerms.SumDocFreq, memTerms.SumDocFreq);
                    assertEquals(iwTerms.SumTotalTermFreq, memTerms.SumTotalTermFreq);
                    TermsEnum iwTermsIter  = iwTerms.Iterator(null);
                    TermsEnum memTermsIter = memTerms.Iterator(null);
                    if (iwTerms.HasPositions())
                    {
                        bool offsets = iwTerms.HasOffsets() && memTerms.HasOffsets();

                        while (iwTermsIter.Next() != null)
                        {
                            assertNotNull(memTermsIter.Next());
                            assertEquals(iwTermsIter.Term(), memTermsIter.Term());
                            DocsAndPositionsEnum iwDocsAndPos  = iwTermsIter.DocsAndPositions(null, null);
                            DocsAndPositionsEnum memDocsAndPos = memTermsIter.DocsAndPositions(null, null);
                            while (iwDocsAndPos.NextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS)
                            {
                                assertEquals(iwDocsAndPos.DocID(), memDocsAndPos.NextDoc());
                                assertEquals(iwDocsAndPos.Freq(), memDocsAndPos.Freq());
                                for (int i = 0; i < iwDocsAndPos.Freq(); i++)
                                {
                                    assertEquals("term: " + iwTermsIter.Term().Utf8ToString(), iwDocsAndPos.NextPosition(), memDocsAndPos.NextPosition());
                                    if (offsets)
                                    {
                                        assertEquals(iwDocsAndPos.StartOffset(), memDocsAndPos.StartOffset());
                                        assertEquals(iwDocsAndPos.EndOffset(), memDocsAndPos.EndOffset());
                                    }
                                }
                            }
                        }
                    }
                    else
                    {
                        while (iwTermsIter.Next() != null)
                        {
                            assertEquals(iwTermsIter.Term(), memTermsIter.Term());
                            DocsEnum iwDocsAndPos  = iwTermsIter.Docs(null, null);
                            DocsEnum memDocsAndPos = memTermsIter.Docs(null, null);
                            while (iwDocsAndPos.NextDoc() != DocsAndPositionsEnum.NO_MORE_DOCS)
                            {
                                assertEquals(iwDocsAndPos.DocID(), memDocsAndPos.NextDoc());
                                assertEquals(iwDocsAndPos.Freq(), memDocsAndPos.Freq());
                            }
                        }
                    }
                }
            }
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Safe (but, slowish) default method to write every
        ///  vector field in the document.
        /// </summary>
        protected internal void AddAllDocVectors(Fields vectors, MergeState mergeState)
        {
            if (vectors == null)
            {
                StartDocument(0);
                FinishDocument();
                return;
            }

            int numFields = vectors.Size;

            if (numFields == -1)
            {
                // count manually! TODO: Maybe enforce that Fields.size() returns something valid?
                numFields = 0;
                //for (IEnumerator<string> it = vectors.Iterator(); it.hasNext();)
                foreach (string it in vectors)
                {
                    numFields++;
                }
            }
            StartDocument(numFields);

            string lastFieldName = null;

            TermsEnum            termsEnum            = null;
            DocsAndPositionsEnum docsAndPositionsEnum = null;

            int fieldCount = 0;

            foreach (string fieldName in vectors)
            {
                fieldCount++;
                FieldInfo fieldInfo = mergeState.FieldInfos.FieldInfo(fieldName);

                Debug.Assert(lastFieldName == null || fieldName.CompareTo(lastFieldName) > 0, "lastFieldName=" + lastFieldName + " fieldName=" + fieldName);
                lastFieldName = fieldName;

                Terms terms = vectors.Terms(fieldName);
                if (terms == null)
                {
                    // FieldsEnum shouldn't lie...
                    continue;
                }

                bool hasPositions = terms.HasPositions();
                bool hasOffsets   = terms.HasOffsets();
                bool hasPayloads  = terms.HasPayloads();
                Debug.Assert(!hasPayloads || hasPositions);

                int numTerms = (int)terms.Size();
                if (numTerms == -1)
                {
                    // count manually. It is stupid, but needed, as Terms.size() is not a mandatory statistics function
                    numTerms  = 0;
                    termsEnum = terms.Iterator(termsEnum);
                    while (termsEnum.Next() != null)
                    {
                        numTerms++;
                    }
                }

                StartField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads);
                termsEnum = terms.Iterator(termsEnum);

                int termCount = 0;
                while (termsEnum.Next() != null)
                {
                    termCount++;

                    int freq = (int)termsEnum.TotalTermFreq();

                    StartTerm(termsEnum.Term(), freq);

                    if (hasPositions || hasOffsets)
                    {
                        docsAndPositionsEnum = termsEnum.DocsAndPositions(null, docsAndPositionsEnum);
                        Debug.Assert(docsAndPositionsEnum != null);

                        int docID = docsAndPositionsEnum.NextDoc();
                        Debug.Assert(docID != DocIdSetIterator.NO_MORE_DOCS);
                        Debug.Assert(docsAndPositionsEnum.Freq() == freq);

                        for (int posUpto = 0; posUpto < freq; posUpto++)
                        {
                            int pos         = docsAndPositionsEnum.NextPosition();
                            int startOffset = docsAndPositionsEnum.StartOffset();
                            int endOffset   = docsAndPositionsEnum.EndOffset();

                            BytesRef payload = docsAndPositionsEnum.Payload;

                            Debug.Assert(!hasPositions || pos >= 0);
                            AddPosition(pos, startOffset, endOffset, payload);
                        }
                    }
                    FinishTerm();
                }
                Debug.Assert(termCount == numTerms);
                FinishField();
            }
            Debug.Assert(fieldCount == numFields);
            FinishDocument();
        }
 internal virtual void AddPositions(DocsAndPositionsEnum @in, IndexOutput @out)
 {
     int freq = @in.Freq();
     @out.WriteVInt(freq);
     int previousPosition = 0;
     int previousEndOffset = 0;
     for (int i = 0; i < freq; i++)
     {
         int pos = @in.NextPosition();
         BytesRef payload = @in.Payload;
         // The low-order bit of token is set only if there is a payload, the
         // previous bits are the delta-encoded position. 
         int token = (pos - previousPosition) << 1 | (payload == null ? 0 : 1);
         @out.WriteVInt(token);
         previousPosition = pos;
         if (storeOffsets) // don't encode offsets if they are not stored
         {
             int startOffset = @in.StartOffset();
             int endOffset = @in.EndOffset();
             @out.WriteVInt(startOffset - previousEndOffset);
             @out.WriteVInt(endOffset - startOffset);
             previousEndOffset = endOffset;
         }
         if (payload != null)
         {
             @out.WriteVInt(payload.Length);
             @out.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
         }
     }
 }