public virtual void TestDocsAndPositionsEnum()
{
    // Position the terms enum on DOC_POSITIONS_TERM and obtain a fresh
    // positions enum over the sorted reader.
    TermsEnum termsEnum = reader.Terms(DOC_POSITIONS_FIELD).Iterator(null);
    assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.SeekCeil(new BytesRef(DOC_POSITIONS_TERM)));
    DocsAndPositionsEnum positions = termsEnum.DocsAndPositions(null, null);

    // First pass: walk every document via NextDoc() and verify freq,
    // positions, offsets (when the postings format supports them) and payloads.
    int docId;
    while ((docId = positions.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
    {
        int termFreq = positions.Freq();
        assertEquals("incorrect freq for doc=" + docId, sortedValues[docId] / 10 + 1, termFreq);
        for (int posIndex = 0; posIndex < termFreq; posIndex++)
        {
            assertEquals("incorrect position for doc=" + docId, posIndex, positions.NextPosition());
            if (!DoesntSupportOffsets.contains(TestUtil.GetPostingsFormat(DOC_POSITIONS_FIELD)))
            {
                assertEquals("incorrect startOffset for doc=" + docId, posIndex, positions.StartOffset());
                assertEquals("incorrect endOffset for doc=" + docId, posIndex, positions.EndOffset());
            }
            // The payload was written as the decimal string (freq - position).
            assertEquals("incorrect payload for doc=" + docId, termFreq - posIndex, int.Parse(positions.Payload.Utf8ToString(), CultureInfo.InvariantCulture));
        }
    }

    // Second pass: re-acquire the enum, asking it to reuse the previous one,
    // and repeat the checks while skipping forward with Advance() by small
    // random deltas.
    DocsAndPositionsEnum toReuse = positions;
    positions = termsEnum.DocsAndPositions(null, toReuse);
    if (positions is SortingAtomicReader.SortingDocsAndPositionsEnum)
    {
        assertTrue(((SortingAtomicReader.SortingDocsAndPositionsEnum)positions).Reused(toReuse)); // make sure reuse worked
    }
    docId = 0;
    while ((docId = positions.Advance(docId + TestUtil.NextInt(Random(), 1, 5))) != DocIdSetIterator.NO_MORE_DOCS)
    {
        int termFreq = positions.Freq();
        assertEquals("incorrect freq for doc=" + docId, sortedValues[docId] / 10 + 1, termFreq);
        for (int posIndex = 0; posIndex < termFreq; posIndex++)
        {
            assertEquals("incorrect position for doc=" + docId, posIndex, positions.NextPosition());
            if (!DoesntSupportOffsets.contains(TestUtil.GetPostingsFormat(DOC_POSITIONS_FIELD)))
            {
                assertEquals("incorrect startOffset for doc=" + docId, posIndex, positions.StartOffset());
                assertEquals("incorrect endOffset for doc=" + docId, posIndex, positions.EndOffset());
            }
            assertEquals("incorrect payload for doc=" + docId, termFreq - posIndex, int.Parse(positions.Payload.Utf8ToString(), CultureInfo.InvariantCulture));
        }
    }
}
// NOTE(review): the name "Rest..." looks like a typo for "TestDocsAndPositionsEnumStart" —
// confirm the test runner still discovers this method before renaming.
public void RestDocsAndPositionsEnumStart()
{
    Analyzer analyzer = new MockAnalyzer(Random());
    int iterations = AtLeast(3);
    MemoryIndex memory = new MemoryIndex(true, Random().nextInt(50) * 1024 * 1024);
    for (int iter = 0; iter < iterations; iter++)
    {
        // check reuse
        memory.AddField("foo", "bar", analyzer);
        AtomicReader reader = (AtomicReader)memory.CreateSearcher().IndexReader;
        assertEquals(1, reader.Terms("foo").SumTotalTermFreq);
        DocsAndPositionsEnum disi = reader.TermPositionsEnum(new Term("foo", "bar"));

        // Before the first NextDoc() the enum must report an unpositioned docid of -1.
        int docid = disi.DocID();
        assertEquals(-1, docid);
        assertTrue(disi.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
        assertEquals(0, disi.NextPosition());
        assertEquals(0, disi.StartOffset());
        assertEquals(3, disi.EndOffset());

        // now reuse and check again
        TermsEnum te = reader.Terms("foo").Iterator(null);
        assertTrue(te.SeekExact(new BytesRef("bar")));
        disi = te.DocsAndPositions(null, disi);
        docid = disi.DocID();
        assertEquals(-1, docid);
        assertTrue(disi.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
        reader.Dispose();
        memory.Reset();
    }
}
/// <summary>
/// Writes the positions (and, if stored, offsets and payloads) of the
/// current document of <paramref name="in"/> to <paramref name="out"/>.
/// Positions are delta-encoded; the low-order bit of each encoded token
/// records whether a payload follows.
/// </summary>
internal virtual void AddPositions(DocsAndPositionsEnum @in, IndexOutput @out)
{
    int freq = @in.Freq();
    @out.WriteVInt(freq);

    int lastPosition = 0;
    int lastEndOffset = 0;
    for (int i = 0; i < freq; i++)
    {
        int position = @in.NextPosition();
        BytesRef payload = @in.Payload;
        // The low-order bit of token is set only if there is a payload, the
        // previous bits are the delta-encoded position.
        int token = (position - lastPosition) << 1 | (payload == null ? 0 : 1);
        @out.WriteVInt(token);
        lastPosition = position;

        if (storeOffsets) // don't encode offsets if they are not stored
        {
            int startOffset = @in.StartOffset();
            int endOffset = @in.EndOffset();
            @out.WriteVInt(startOffset - lastEndOffset);
            @out.WriteVInt(endOffset - startOffset);
            lastEndOffset = endOffset;
        }

        if (payload != null)
        {
            @out.WriteVInt(payload.Length);
            @out.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
        }
    }
}
public virtual void TestEndOffsetPositionWithTeeSinkTokenFilter()
{
    // Index one document whose "field" is fed twice — once through the tee
    // and once through its sink — so each token is indexed two times.
    Store.Directory dir = NewDirectory();
    Analyzer analyzer = new MockAnalyzer(Random(), MockTokenizer.WHITESPACE, false);
    IndexWriter writer = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
    Document doc = new Document();
    TokenStream tokenStream = analyzer.TokenStream("field", "abcd   ");
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenStream);
    TokenStream sink = tee.NewSinkTokenStream();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.StoreTermVectors = true;
    ft.StoreTermVectorOffsets = true;
    ft.StoreTermVectorPositions = true;
    Field teeField = new Field("field", tee, ft);
    Field sinkField = new Field("field", sink, ft);
    doc.Add(teeField);
    doc.Add(sinkField);
    writer.AddDocument(doc);
    writer.Dispose();

    IndexReader reader = DirectoryReader.Open(dir);
    Terms vector = reader.GetTermVectors(0).Terms("field");
    assertEquals(1, vector.Size());
    TermsEnum termsEnum = vector.Iterator(null);
    termsEnum.Next();
    assertEquals(2, termsEnum.TotalTermFreq());
    DocsAndPositionsEnum positions = termsEnum.DocsAndPositions(null, null);
    assertTrue(positions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(2, positions.Freq());
    // First occurrence has offsets 0..4; the sink-fed copy starts at 8.
    positions.NextPosition();
    assertEquals(0, positions.StartOffset());
    assertEquals(4, positions.EndOffset());
    positions.NextPosition();
    assertEquals(8, positions.StartOffset());
    assertEquals(12, positions.EndOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.NextDoc());
    reader.Dispose();
    dir.Dispose();
}
public virtual void TestChangeGaps()
{
    // LUCENE-5324: check that it is possible to change the wrapper's gaps
    int positionGap = Random().Next(1000);
    int offsetGap = Random().Next(1000);
    Analyzer @delegate = new MockAnalyzer(Random());
    Analyzer a = new AnalyzerWrapperAnonymousInnerClassHelper2(this, @delegate.Strategy, positionGap, offsetGap, @delegate);

    RandomIndexWriter writer = new RandomIndexWriter(Random(), NewDirectory());
    Document doc = new Document();
    FieldType ft = new FieldType();
    ft.Indexed = true;
    ft.IndexOptions = FieldInfo.IndexOptions.DOCS_ONLY;
    ft.Tokenized = true;
    ft.StoreTermVectors = true;
    ft.StoreTermVectorPositions = true;
    ft.StoreTermVectorOffsets = true;
    // Two values for the same field, so the wrapper's gaps apply between them.
    doc.Add(new Field("f", "a", ft));
    doc.Add(new Field("f", "a", ft));
    writer.AddDocument(doc, a);

    AtomicReader reader = GetOnlySegmentReader(writer.Reader);
    Fields fields = reader.GetTermVectors(0);
    Terms terms = fields.Terms("f");
    TermsEnum te = terms.Iterator(null);
    Assert.AreEqual(new BytesRef("a"), te.Next());
    DocsAndPositionsEnum dpe = te.DocsAndPositions(null, null);
    Assert.AreEqual(0, dpe.NextDoc());
    Assert.AreEqual(2, dpe.Freq());
    Assert.AreEqual(0, dpe.NextPosition());
    Assert.AreEqual(0, dpe.StartOffset());
    int endOffset = dpe.EndOffset();
    // The second value must be shifted by the configured gaps.
    Assert.AreEqual(1 + positionGap, dpe.NextPosition());
    Assert.AreEqual(1 + endOffset + offsetGap, dpe.EndOffset());
    Assert.AreEqual(null, te.Next());
    reader.Dispose();
    writer.Dispose();
    writer.w.Directory.Dispose();
}
/// <summary>
/// Walks <paramref name="terms"/> and <paramref name="memTerms"/> in lockstep
/// and asserts term-by-term, position-by-position equality of the two vectors
/// for the field <paramref name="field_name"/>.
/// </summary>
protected void CompareTermVectors(Terms terms, Terms memTerms, string field_name)
{
    TermsEnum termEnum = terms.Iterator(null);
    TermsEnum memTermEnum = memTerms.Iterator(null);

    while (termEnum.Next() != null)
    {
        assertNotNull(memTermEnum.Next());
        assertEquals(termEnum.TotalTermFreq(), memTermEnum.TotalTermFreq());

        DocsAndPositionsEnum docsPosEnum = termEnum.DocsAndPositions(null, null, 0);
        DocsAndPositionsEnum memDocsPosEnum = memTermEnum.DocsAndPositions(null, null, 0);
        string currentTerm = termEnum.Term().Utf8ToString();

        assertEquals("Token mismatch for field: " + field_name, currentTerm, memTermEnum.Term().Utf8ToString());

        docsPosEnum.NextDoc();
        memDocsPosEnum.NextDoc();

        int freq = docsPosEnum.Freq();
        assertEquals(freq, memDocsPosEnum.Freq());
        for (int posIndex = 0; posIndex < freq; posIndex++)
        {
            string failDesc = " (field:" + field_name + " term:" + currentTerm + ")";
            int memPos = memDocsPosEnum.NextPosition();
            int pos = docsPosEnum.NextPosition();
            assertEquals("Position test failed" + failDesc, memPos, pos);
            assertEquals("Start offset test failed" + failDesc, memDocsPosEnum.StartOffset(), docsPosEnum.StartOffset());
            assertEquals("End offset test failed" + failDesc, memDocsPosEnum.EndOffset(), docsPosEnum.EndOffset());
            // No payloads are expected in these vectors.
            assertEquals("Missing payload test failed" + failDesc, docsPosEnum.Payload, null);
        }
    }
    assertNull("Still some tokens not processed", memTermEnum.Next());
}
// NOTE(review): "Vectros" in the name looks like a typo for "Vectors" —
// kept as-is to avoid breaking any reference to the test by name.
public virtual void TestMixedVectrosVectors()
{
    RandomIndexWriter writer = new RandomIndexWriter(Random(), Directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.SIMPLE, true)).SetOpenMode(OpenMode.CREATE));
    Document doc = new Document();

    // Five copies of the same field value with progressively richer
    // term-vector options; the index should merge them into one vector.
    FieldType ft2 = new FieldType(TextField.TYPE_STORED);
    ft2.StoreTermVectors = true;

    FieldType ft3 = new FieldType(TextField.TYPE_STORED);
    ft3.StoreTermVectors = true;
    ft3.StoreTermVectorPositions = true;

    FieldType ft4 = new FieldType(TextField.TYPE_STORED);
    ft4.StoreTermVectors = true;
    ft4.StoreTermVectorOffsets = true;

    FieldType ft5 = new FieldType(TextField.TYPE_STORED);
    ft5.StoreTermVectors = true;
    ft5.StoreTermVectorOffsets = true;
    ft5.StoreTermVectorPositions = true;

    doc.Add(NewTextField("field", "one", Field.Store.YES));
    doc.Add(NewField("field", "one", ft2));
    doc.Add(NewField("field", "one", ft3));
    doc.Add(NewField("field", "one", ft4));
    doc.Add(NewField("field", "one", ft5));
    writer.AddDocument(doc);
    IndexReader reader = writer.Reader;
    writer.Dispose();

    IndexSearcher searcher = NewSearcher(reader);
    Query query = new TermQuery(new Term("field", "one"));
    ScoreDoc[] hits = searcher.Search(query, null, 1000).ScoreDocs;
    Assert.AreEqual(1, hits.Length);

    Fields vectors = searcher.IndexReader.GetTermVectors(hits[0].Doc);
    Assert.IsNotNull(vectors);
    Assert.AreEqual(1, vectors.Size);
    Terms vector = vectors.Terms("field");
    Assert.IsNotNull(vector);
    Assert.AreEqual(1, vector.Size());
    TermsEnum termsEnum = vector.Iterator(null);
    Assert.IsNotNull(termsEnum.Next());
    Assert.AreEqual("one", termsEnum.Term().Utf8ToString());

    // "one" appears once per added field instance.
    Assert.AreEqual(5, termsEnum.TotalTermFreq());
    DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null);
    Assert.IsNotNull(dpEnum);
    Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(5, dpEnum.Freq());
    for (int i = 0; i < 5; i++)
    {
        Assert.AreEqual(i, dpEnum.NextPosition());
    }

    // Re-acquire with reuse and verify the offsets as well.
    dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
    Assert.IsNotNull(dpEnum);
    Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(5, dpEnum.Freq());
    for (int i = 0; i < 5; i++)
    {
        dpEnum.NextPosition();
        Assert.AreEqual(4 * i, dpEnum.StartOffset());
        Assert.AreEqual(4 * i + 3, dpEnum.EndOffset());
    }
    reader.Dispose();
}
/// <summary>
/// Compares <paramref name="memIndexReader"/> against
/// <paramref name="other"/> field by field, term by term and posting by
/// posting, asserting that the two indexes hold identical content.
/// </summary>
private void DuellReaders(CompositeReader other, AtomicReader memIndexReader)
{
    AtomicReader competitor = SlowCompositeReaderWrapper.Wrap(other);
    Fields memFields = memIndexReader.Fields;
    foreach (string field in competitor.Fields)
    {
        Terms memTerms = memFields.Terms(field);
        Terms iwTerms = memIndexReader.Terms(field);
        if (iwTerms == null)
        {
            assertNull(memTerms);
        }
        else
        {
            NumericDocValues normValues = competitor.GetNormValues(field);
            NumericDocValues memNormValues = memIndexReader.GetNormValues(field);
            if (normValues != null)
            {
                // mem idx always computes norms on the fly
                assertNotNull(memNormValues);
                assertEquals(normValues.Get(0), memNormValues.Get(0));
            }

            assertNotNull(memTerms);
            assertEquals(iwTerms.DocCount, memTerms.DocCount);
            assertEquals(iwTerms.SumDocFreq, memTerms.SumDocFreq);
            assertEquals(iwTerms.SumTotalTermFreq, memTerms.SumTotalTermFreq);
            TermsEnum iwTermsIter = iwTerms.Iterator(null);
            TermsEnum memTermsIter = memTerms.Iterator(null);
            if (iwTerms.HasPositions())
            {
                bool offsets = iwTerms.HasOffsets() && memTerms.HasOffsets();

                while (iwTermsIter.Next() != null)
                {
                    assertNotNull(memTermsIter.Next());
                    assertEquals(iwTermsIter.Term(), memTermsIter.Term());
                    DocsAndPositionsEnum iwDocsAndPos = iwTermsIter.DocsAndPositions(null, null);
                    DocsAndPositionsEnum memDocsAndPos = memTermsIter.DocsAndPositions(null, null);
                    while (iwDocsAndPos.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                    {
                        assertEquals(iwDocsAndPos.DocID(), memDocsAndPos.NextDoc());
                        assertEquals(iwDocsAndPos.Freq(), memDocsAndPos.Freq());
                        for (int i = 0; i < iwDocsAndPos.Freq(); i++)
                        {
                            assertEquals("term: " + iwTermsIter.Term().Utf8ToString(), iwDocsAndPos.NextPosition(), memDocsAndPos.NextPosition());
                            if (offsets)
                            {
                                assertEquals(iwDocsAndPos.StartOffset(), memDocsAndPos.StartOffset());
                                assertEquals(iwDocsAndPos.EndOffset(), memDocsAndPos.EndOffset());
                            }
                        }
                    }
                }
            }
            else
            {
                while (iwTermsIter.Next() != null)
                {
                    // BUGFIX: advance the MemoryIndex terms enum in lockstep.
                    // Previously memTermsIter was never positioned in this
                    // branch, so Term() was compared on an unpositioned enum.
                    assertNotNull(memTermsIter.Next());
                    assertEquals(iwTermsIter.Term(), memTermsIter.Term());
                    DocsEnum iwDocsAndPos = iwTermsIter.Docs(null, null);
                    DocsEnum memDocsAndPos = memTermsIter.Docs(null, null);
                    while (iwDocsAndPos.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                    {
                        assertEquals(iwDocsAndPos.DocID(), memDocsAndPos.NextDoc());
                        assertEquals(iwDocsAndPos.Freq(), memDocsAndPos.Freq());
                    }
                }
            }
        }
    }
}
/// <summary>
/// Safe (but, slowish) default method to write every
/// vector field in the document.
/// </summary>
/// <param name="vectors">The per-document vector fields, or null when the document has none.</param>
/// <param name="mergeState">Supplies the <c>FieldInfos</c> used to resolve field metadata.</param>
protected internal void AddAllDocVectors(Fields vectors, MergeState mergeState)
{
    if (vectors == null)
    {
        // No vectors for this document: emit an empty document record.
        StartDocument(0);
        FinishDocument();
        return;
    }

    int numFields = vectors.Size;
    if (numFields == -1)
    {
        // count manually! TODO: Maybe enforce that Fields.size() returns something valid?
        numFields = 0;
        foreach (string it in vectors)
        {
            numFields++;
        }
    }
    StartDocument(numFields);

    string lastFieldName = null;

    // Both enums are reused across fields/terms to avoid reallocation.
    TermsEnum termsEnum = null;
    DocsAndPositionsEnum docsAndPositionsEnum = null;

    int fieldCount = 0;
    foreach (string fieldName in vectors)
    {
        fieldCount++;
        FieldInfo fieldInfo = mergeState.FieldInfos.FieldInfo(fieldName);

        // Fields must arrive in strictly increasing name order.
        Debug.Assert(lastFieldName == null || fieldName.CompareTo(lastFieldName) > 0, "lastFieldName=" + lastFieldName + " fieldName=" + fieldName);
        lastFieldName = fieldName;

        Terms terms = vectors.Terms(fieldName);
        if (terms == null)
        {
            // FieldsEnum shouldn't lie...
            continue;
        }

        bool hasPositions = terms.HasPositions();
        bool hasOffsets = terms.HasOffsets();
        bool hasPayloads = terms.HasPayloads();
        Debug.Assert(!hasPayloads || hasPositions);

        int numTerms = (int)terms.Size();
        if (numTerms == -1)
        {
            // Count manually. It is stupid, but needed, as Terms.size() is
            // not a mandatory statistics function.
            numTerms = 0;
            termsEnum = terms.Iterator(termsEnum);
            while (termsEnum.Next() != null)
            {
                numTerms++;
            }
        }

        StartField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads);
        termsEnum = terms.Iterator(termsEnum);

        int termCount = 0;
        while (termsEnum.Next() != null)
        {
            termCount++;

            int freq = (int)termsEnum.TotalTermFreq();

            StartTerm(termsEnum.Term(), freq);

            if (hasPositions || hasOffsets)
            {
                docsAndPositionsEnum = termsEnum.DocsAndPositions(null, docsAndPositionsEnum);
                Debug.Assert(docsAndPositionsEnum != null);

                // A term vector holds exactly one document per term.
                int docID = docsAndPositionsEnum.NextDoc();
                Debug.Assert(docID != DocIdSetIterator.NO_MORE_DOCS);
                Debug.Assert(docsAndPositionsEnum.Freq() == freq);

                for (int posUpto = 0; posUpto < freq; posUpto++)
                {
                    int pos = docsAndPositionsEnum.NextPosition();
                    int startOffset = docsAndPositionsEnum.StartOffset();
                    int endOffset = docsAndPositionsEnum.EndOffset();

                    BytesRef payload = docsAndPositionsEnum.Payload;

                    Debug.Assert(!hasPositions || pos >= 0);
                    AddPosition(pos, startOffset, endOffset, payload);
                }
            }
            FinishTerm();
        }
        Debug.Assert(termCount == numTerms);
        FinishField();
    }
    Debug.Assert(fieldCount == numFields);
    FinishDocument();
}