public virtual void TestDocsEnum()
{
    TermVectorsReader reader = Codec.Default.TermVectorsFormat.VectorsReader(dir, seg.Info, fieldInfos, NewIOContext(Random));
    for (int j = 0; j < 5; j++)
    {
        Terms vector = reader.Get(j).GetTerms(testFields[0]);
        Assert.IsNotNull(vector);
        Assert.AreEqual(testTerms.Length, vector.Count);
        TermsEnum termsEnum = vector.GetEnumerator();
        DocsEnum docsEnum = null;
        for (int i = 0; i < testTerms.Length; i++)
        {
            Assert.IsTrue(termsEnum.MoveNext());
            BytesRef text = termsEnum.Term;
            string term = text.Utf8ToString();
            //System.out.println("Term: " + term);
            Assert.AreEqual(testTerms[i], term);

            docsEnum = TestUtil.Docs(Random, termsEnum, null, docsEnum, DocsFlags.NONE);
            Assert.IsNotNull(docsEnum);
            int doc = docsEnum.DocID;
            Assert.AreEqual(-1, doc);
            Assert.IsTrue(docsEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, docsEnum.NextDoc());
        }
        Assert.IsFalse(termsEnum.MoveNext());
    }
    reader.Dispose();
}
/// <summary>
/// Returns <see cref="DocsAndPositionsEnum"/> for the specified
/// term. This will return <c>null</c> if the
/// field or term does not exist or positions weren't indexed.
/// </summary>
/// <seealso cref="TermsEnum.DocsAndPositions(IBits, DocsAndPositionsEnum)"/>
public DocsAndPositionsEnum GetTermPositionsEnum(Term term) // LUCENENET specific: Renamed from TermPositionsEnum()
{
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(term.Field != null);
        Debugging.Assert(term.Bytes != null);
    }
    Fields fields = Fields;
    if (fields != null)
    {
        Terms terms = fields.GetTerms(term.Field);
        if (terms != null)
        {
            TermsEnum termsEnum = terms.GetEnumerator();
            if (termsEnum.SeekExact(term.Bytes))
            {
                return termsEnum.DocsAndPositions(LiveDocs, null);
            }
        }
    }
    return null;
}
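// A minimal usage sketch for GetTermPositionsEnum above (the reader parameter and the
// "body"/"lucene" field and term are assumptions, not from the original source). The
// null check is required because the method returns null when the field or term is
// missing or positions were not indexed.
private static void WalkPositions(AtomicReader reader)
{
    DocsAndPositionsEnum postings = reader.GetTermPositionsEnum(new Term("body", "lucene"));
    if (postings != null)
    {
        while (postings.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
        {
            for (int i = 0; i < postings.Freq; i++)
            {
                Console.WriteLine("doc=" + postings.DocID + " pos=" + postings.NextPosition());
            }
        }
    }
}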
public virtual void TestTerms()
{
    Fields fields = MultiFields.GetFields(reader);
    foreach (string field in fields)
    {
        Terms terms = fields.GetTerms(field);
        Assert.IsNotNull(terms);
        TermsEnum termsEnum = terms.GetEnumerator();
        while (termsEnum.MoveNext())
        {
            BytesRef term = termsEnum.Term;
            Assert.IsTrue(term != null);
            string fieldValue = (string)DocHelper.NameValues[field];
            Assert.IsTrue(fieldValue.IndexOf(term.Utf8ToString(), StringComparison.Ordinal) != -1);
        }
    }

    DocsEnum termDocs = TestUtil.Docs(Random, reader, DocHelper.TEXT_FIELD_1_KEY, new BytesRef("field"), MultiFields.GetLiveDocs(reader), null, 0);
    Assert.IsTrue(termDocs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);

    termDocs = TestUtil.Docs(Random, reader, DocHelper.NO_NORMS_KEY, new BytesRef(DocHelper.NO_NORMS_TEXT), MultiFields.GetLiveDocs(reader), null, 0);
    Assert.IsTrue(termDocs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);

    DocsAndPositionsEnum positions = MultiFields.GetTermPositionsEnum(reader, MultiFields.GetLiveDocs(reader), DocHelper.TEXT_FIELD_1_KEY, new BytesRef("field"));
    // NOTE: prior rev of this test was failing to first
    // call next here:
    Assert.IsTrue(positions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.IsTrue(positions.DocID == 0);
    Assert.IsTrue(positions.NextPosition() >= 0);
}
public virtual void TestZeroTerms()
{
    var d = NewDirectory();
    RandomIndexWriter w = new RandomIndexWriter(Random, d);
    Document doc = new Document();
    doc.Add(NewTextField("field", "one two three", Field.Store.NO));
    doc = new Document();
    doc.Add(NewTextField("field2", "one two three", Field.Store.NO));
    w.AddDocument(doc);
    w.Commit();
    w.DeleteDocuments(new Term("field", "one"));
    w.ForceMerge(1);
    IndexReader r = w.GetReader();
    w.Dispose();
    Assert.AreEqual(1, r.NumDocs);
    Assert.AreEqual(1, r.MaxDoc);
    Terms terms = MultiFields.GetTerms(r, "field");
    if (terms != null)
    {
        Assert.IsFalse(terms.GetEnumerator().MoveNext());
    }
    r.Dispose();
    d.Dispose();
}
/// <summary>
/// Returns a <see cref="TermsEnum"/> that implements <see cref="TermsEnum.Ord"/>. If the
/// provided <paramref name="reader"/> supports <see cref="TermsEnum.Ord"/>, we just return its
/// <see cref="TermsEnum"/>; if it does not, we build a "private" terms
/// index internally (WARNING: consumes RAM) and use that
/// index to implement <see cref="TermsEnum.Ord"/>. This also enables <see cref="TermsEnum.Ord"/> on top
/// of a composite reader. The returned <see cref="TermsEnum"/> is
/// unpositioned. This returns <c>null</c> if there are no terms.
///
/// <para/><b>NOTE</b>: you must pass the same reader that was
/// used when creating this class.
/// </summary>
public virtual TermsEnum GetOrdTermsEnum(AtomicReader reader)
{
    if (m_indexedTermsArray == null)
    {
        //System.out.println("GET normal enum");
        Fields fields = reader.Fields;
        if (fields == null)
        {
            return null;
        }
        Terms terms = fields.GetTerms(m_field);
        if (terms == null)
        {
            return null;
        }
        return terms.GetEnumerator();
    }
    else
    {
        //System.out.println("GET wrapped enum ordBase=" + ordBase);
        return new OrdWrappedTermsEnum(this, reader);
    }
}
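// A small consumption sketch for GetOrdTermsEnum (the reader and the "category" field
// name are assumptions). The returned enum supports TermsEnum.Ord even when the
// underlying codec does not, at the RAM cost noted in the summary above.
private static void PrintOrds(AtomicReader reader)
{
    DocTermOrds dto = new DocTermOrds(reader, reader.LiveDocs, "category");
    TermsEnum te = dto.GetOrdTermsEnum(reader);
    if (te != null)
    {
        while (te.MoveNext())
        {
            Console.WriteLine("ord=" + te.Ord + " term=" + te.Term.Utf8ToString());
        }
    }
}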
/// <summary>
/// Creates a <see cref="TermContext"/> from a top-level <see cref="IndexReaderContext"/> and the
/// given <see cref="Term"/>. This method will look up the given term in all context's leaf readers
/// and register each of the readers containing the term in the returned <see cref="TermContext"/>
/// using the leaf reader's ordinal.
/// <para/>
/// Note: the given context must be a top-level context.
/// </summary>
public static TermContext Build(IndexReaderContext context, Term term)
{
    if (Debugging.AssertsEnabled) Debugging.Assert(context != null && context.IsTopLevel);
    string field = term.Field;
    BytesRef bytes = term.Bytes;
    TermContext perReaderTermState = new TermContext(context);
    //if (DEBUG) System.out.println("prts.build term=" + term);
    foreach (AtomicReaderContext ctx in context.Leaves)
    {
        //if (DEBUG) System.out.println(" r=" + leaves[i].reader);
        Fields fields = ctx.AtomicReader.Fields;
        if (fields != null)
        {
            Terms terms = fields.GetTerms(field);
            if (terms != null)
            {
                TermsEnum termsEnum = terms.GetEnumerator();
                if (termsEnum.SeekExact(bytes))
                {
                    TermState termState = termsEnum.GetTermState();
                    //if (DEBUG) System.out.println(" found");
                    perReaderTermState.Register(termState, ctx.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
                }
            }
        }
    }
    return perReaderTermState;
}
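// A brief sketch (the indexReader variable and the term values are assumptions) of the
// intended call pattern for Build: construct the TermContext once from the top-level
// context, then reuse the cached per-leaf statistics, for example via the
// TermQuery(Term, TermContext) constructor.
IndexReaderContext topContext = indexReader.Context;
Term term = new Term("id", "42");
TermContext states = TermContext.Build(topContext, term);
Console.WriteLine("docFreq=" + states.DocFreq + " totalTermFreq=" + states.TotalTermFreq);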
private void VerifyCount(IndexReader ir)
{
    Fields fields = MultiFields.GetFields(ir);
    if (fields == null)
    {
        return;
    }
    foreach (string field in fields)
    {
        Terms terms = fields.GetTerms(field);
        if (terms == null)
        {
            continue;
        }
        int docCount = terms.DocCount;
        FixedBitSet visited = new FixedBitSet(ir.MaxDoc);
        TermsEnum te = terms.GetEnumerator();
        while (te.MoveNext())
        {
            DocsEnum de = TestUtil.Docs(Random, te, null, null, DocsFlags.NONE);
            while (de.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
            {
                visited.Set(de.DocID);
            }
        }
        Assert.AreEqual(visited.Cardinality, docCount);
    }
}
/// <summary>
/// Returns the total number of occurrences of <paramref name="term"/> across all
/// documents (the sum of the freq for each document that contains this term).
/// This method returns 0 if the term or
/// field does not exist. This method does not take into
/// account deleted documents that have not yet been merged
/// away.
/// </summary>
public override sealed long TotalTermFreq(Term term)
{
    Fields fields = Fields;
    if (fields is null)
    {
        return 0;
    }
    Terms terms = fields.GetTerms(term.Field);
    if (terms is null)
    {
        return 0;
    }
    TermsEnum termsEnum = terms.GetEnumerator();
    if (termsEnum.SeekExact(term.Bytes))
    {
        return termsEnum.TotalTermFreq;
    }
    return 0;
}
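// Sketch contrasting the two term-level statistics (the indexReader variable and the
// field/term values are assumptions): DocFreq counts documents that contain the term at
// least once, while TotalTermFreq sums the per-document frequencies, so
// TotalTermFreq >= DocFreq whenever the term exists and frequencies were indexed.
Term term = new Term("body", "search");
int docFreq = indexReader.DocFreq(term);              // number of documents containing "search"
long totalTermFreq = indexReader.TotalTermFreq(term); // total occurrences of "search"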
private void AssertSumDocFreq(IndexReader ir)
{
    // compute sumDocFreq across all fields
    Fields fields = MultiFields.GetFields(ir);
    foreach (string f in fields)
    {
        Terms terms = fields.GetTerms(f);
        long sumDocFreq = terms.SumDocFreq;
        if (sumDocFreq == -1)
        {
            if (Verbose)
            {
                Console.WriteLine("skipping field: " + f + ", codec does not support sumDocFreq");
            }
            continue;
        }

        long computedSumDocFreq = 0;
        TermsEnum termsEnum = terms.GetEnumerator();
        while (termsEnum.MoveNext())
        {
            computedSumDocFreq += termsEnum.DocFreq;
        }
        Assert.AreEqual(computedSumDocFreq, sumDocFreq);
    }
}
public virtual void TestMixupMultiValued()
{
    Directory dir = NewDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir);
    Document doc = new Document();
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.StoreTermVectors = true;
    customType.StoreTermVectorPositions = true;
    customType.StoreTermVectorPayloads = true;
    customType.StoreTermVectorOffsets = Random.NextBoolean();
    Field field = new Field("field", "", customType);
    TokenStream ts = new MockTokenizer(new StringReader("here we go"), MockTokenizer.WHITESPACE, true);
    Assert.IsFalse(ts.HasAttribute<IPayloadAttribute>());
    field.SetTokenStream(ts);
    doc.Add(field);

    Field field2 = new Field("field", "", customType);
    Token withPayload = new Token("withPayload", 0, 11);
    withPayload.Payload = new BytesRef("test");
    ts = new CannedTokenStream(withPayload);
    Assert.IsTrue(ts.HasAttribute<IPayloadAttribute>());
    field2.SetTokenStream(ts);
    doc.Add(field2);

    Field field3 = new Field("field", "", customType);
    ts = new MockTokenizer(new StringReader("nopayload"), MockTokenizer.WHITESPACE, true);
    Assert.IsFalse(ts.HasAttribute<IPayloadAttribute>());
    field3.SetTokenStream(ts);
    doc.Add(field3);

    writer.AddDocument(doc);
    DirectoryReader reader = writer.GetReader();
    Terms terms = reader.GetTermVector(0, "field");
    if (Debugging.AssertsEnabled) Debugging.Assert(terms != null);
    TermsEnum termsEnum = terms.GetEnumerator();
    Assert.IsTrue(termsEnum.SeekExact(new BytesRef("withPayload")));
    DocsAndPositionsEnum de = termsEnum.DocsAndPositions(null, null);
    Assert.AreEqual(0, de.NextDoc());
    Assert.AreEqual(3, de.NextPosition());
    Assert.AreEqual(new BytesRef("test"), de.GetPayload());
    writer.Dispose();
    reader.Dispose();
    dir.Dispose();
}
public virtual void TestMixupDocs()
{
    Directory dir = NewDirectory();
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
    iwc.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(Random, dir, iwc);
    Document doc = new Document();
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.StoreTermVectors = true;
    customType.StoreTermVectorPositions = true;
    customType.StoreTermVectorPayloads = true;
    customType.StoreTermVectorOffsets = Random.NextBoolean();
    Field field = new Field("field", "", customType);
    TokenStream ts = new MockTokenizer(new StringReader("here we go"), MockTokenizer.WHITESPACE, true);
    Assert.IsFalse(ts.HasAttribute<IPayloadAttribute>());
    field.SetTokenStream(ts);
    doc.Add(field);
    writer.AddDocument(doc);

    Token withPayload = new Token("withPayload", 0, 11);
    withPayload.Payload = new BytesRef("test");
    ts = new CannedTokenStream(withPayload);
    Assert.IsTrue(ts.HasAttribute<IPayloadAttribute>());
    field.SetTokenStream(ts);
    writer.AddDocument(doc);

    ts = new MockTokenizer(new StringReader("another"), MockTokenizer.WHITESPACE, true);
    Assert.IsFalse(ts.HasAttribute<IPayloadAttribute>());
    field.SetTokenStream(ts);
    writer.AddDocument(doc);

    DirectoryReader reader = writer.GetReader();
    Terms terms = reader.GetTermVector(1, "field");
    if (Debugging.AssertsEnabled) Debugging.Assert(terms != null);
    TermsEnum termsEnum = terms.GetEnumerator();
    Assert.IsTrue(termsEnum.SeekExact(new BytesRef("withPayload")));
    DocsAndPositionsEnum de = termsEnum.DocsAndPositions(null, null);
    Assert.AreEqual(0, de.NextDoc());
    Assert.AreEqual(0, de.NextPosition());
    Assert.AreEqual(new BytesRef("test"), de.GetPayload());
    writer.Dispose();
    reader.Dispose();
    dir.Dispose();
}
public virtual int[] ToDocsArray(Term term, IBits bits, IndexReader reader)
{
    Fields fields = MultiFields.GetFields(reader);
    Terms cterms = fields.GetTerms(term.Field);
    TermsEnum ctermsEnum = cterms.GetEnumerator();
    if (ctermsEnum.SeekExact(new BytesRef(term.Text)))
    {
        DocsEnum docsEnum = TestUtil.Docs(Random, ctermsEnum, bits, null, DocsFlags.NONE);
        return ToArray(docsEnum);
    }
    return null;
}
public virtual DocsAndPositionsEnum GetDocsAndPositions(AtomicReader reader, BytesRef bytes, IBits liveDocs)
{
    Terms terms = reader.GetTerms(fieldName);
    if (terms != null)
    {
        TermsEnum te = terms.GetEnumerator();
        if (te.SeekExact(bytes))
        {
            return te.DocsAndPositions(liveDocs, null);
        }
    }
    return null;
}
private void CheckTerms(Terms terms, IBits liveDocs, params string[] termsList)
{
    Assert.IsNotNull(terms);
    TermsEnum te = terms.GetEnumerator();
    foreach (string t in termsList)
    {
        Assert.IsTrue(te.MoveNext());
        BytesRef b = te.Term;
        Assert.AreEqual(t, b.Utf8ToString());
        DocsEnum td = TestUtil.Docs(Random, te, liveDocs, null, DocsFlags.NONE);
        Assert.IsTrue(td.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
        Assert.AreEqual(0, td.DocID);
        Assert.AreEqual(td.NextDoc(), DocIdSetIterator.NO_MORE_DOCS);
    }
    Assert.IsFalse(te.MoveNext());
}
public virtual void TestOffsetReader()
{
    TermVectorsReader reader = Codec.Default.TermVectorsFormat.VectorsReader(dir, seg.Info, fieldInfos, NewIOContext(Random));
    Terms vector = reader.Get(0).GetTerms(testFields[0]);
    Assert.IsNotNull(vector);
    TermsEnum termsEnum = vector.GetEnumerator();
    Assert.IsNotNull(termsEnum);
    Assert.AreEqual(testTerms.Length, vector.Count);
    DocsAndPositionsEnum dpEnum = null;
    for (int i = 0; i < testTerms.Length; i++)
    {
        Assert.IsTrue(termsEnum.MoveNext());
        BytesRef text = termsEnum.Term;
        string term = text.Utf8ToString();
        Assert.AreEqual(testTerms[i], term);

        dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
        Assert.IsNotNull(dpEnum);
        Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
        Assert.AreEqual(dpEnum.Freq, positions[i].Length);
        for (int j = 0; j < positions[i].Length; j++)
        {
            Assert.AreEqual(positions[i][j], dpEnum.NextPosition());
        }
        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());

        dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
        Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
        Assert.IsNotNull(dpEnum);
        Assert.AreEqual(dpEnum.Freq, positions[i].Length);
        for (int j = 0; j < positions[i].Length; j++)
        {
            Assert.AreEqual(positions[i][j], dpEnum.NextPosition());
            Assert.AreEqual(j * 10, dpEnum.StartOffset);
            Assert.AreEqual(j * 10 + testTerms[i].Length, dpEnum.EndOffset);
        }
        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
    }
    reader.Dispose();
}
/// <summary>
/// Returns <see cref="DocsAndPositionsEnum"/> for the specified
/// field and term, with control over whether offsets and payloads are
/// required. Some codecs may be able to optimize
/// their implementation when offsets and/or payloads are not
/// required. This will return <c>null</c> if the field or term does not
/// exist or positions were not indexed. See
/// <see cref="TermsEnum.DocsAndPositions(IBits, DocsAndPositionsEnum, DocsAndPositionsFlags)"/>.
/// </summary>
public static DocsAndPositionsEnum GetTermPositionsEnum(IndexReader r, IBits liveDocs, string field, BytesRef term, DocsAndPositionsFlags flags)
{
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(field != null);
        Debugging.Assert(term != null);
    }
    Terms terms = GetTerms(r, field);
    if (terms != null)
    {
        TermsEnum termsEnum = terms.GetEnumerator();
        if (termsEnum.SeekExact(term))
        {
            return termsEnum.DocsAndPositions(liveDocs, null, flags);
        }
    }
    return null;
}
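// A hedged usage sketch for the overload above (the indexReader variable and the
// field/term values are assumptions): request only the postings features you need.
// Passing DocsAndPositionsFlags.OFFSETS asks the codec for offsets, which some codecs
// can otherwise skip computing.
DocsAndPositionsEnum dpe = MultiFields.GetTermPositionsEnum(
    indexReader, MultiFields.GetLiveDocs(indexReader), "body", new BytesRef("lucene"),
    DocsAndPositionsFlags.OFFSETS);
if (dpe != null && dpe.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
{
    dpe.NextPosition();
    Console.WriteLine("startOffset=" + dpe.StartOffset + " endOffset=" + dpe.EndOffset);
}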
public virtual void TestTermVectors()
{
    Terms result = reader.GetTermVectors(0).GetTerms(DocHelper.TEXT_FIELD_2_KEY);
    Assert.IsNotNull(result);
    Assert.AreEqual(3, result.Count);
    TermsEnum termsEnum = result.GetEnumerator();
    while (termsEnum.MoveNext())
    {
        string term = termsEnum.Term.Utf8ToString();
        int freq = (int)termsEnum.TotalTermFreq;
        Assert.IsTrue(DocHelper.FIELD_2_TEXT.IndexOf(term, StringComparison.Ordinal) != -1);
        Assert.IsTrue(freq > 0);
    }

    Fields results = reader.GetTermVectors(0);
    Assert.IsTrue(results != null);
    Assert.AreEqual(3, results.Count, "We do not have 3 term freq vectors");
}
public virtual void TestReader()
{
    TermVectorsReader reader = Codec.Default.TermVectorsFormat.VectorsReader(dir, seg.Info, fieldInfos, NewIOContext(Random));
    for (int j = 0; j < 5; j++)
    {
        Terms vector = reader.Get(j).GetTerms(testFields[0]);
        Assert.IsNotNull(vector);
        Assert.AreEqual(testTerms.Length, vector.Count);
        TermsEnum termsEnum = vector.GetEnumerator();
        for (int i = 0; i < testTerms.Length; i++)
        {
            Assert.IsTrue(termsEnum.MoveNext());
            BytesRef text = termsEnum.Term;
            string term = text.Utf8ToString();
            //System.out.println("Term: " + term);
            Assert.AreEqual(testTerms[i], term);
        }
        Assert.IsFalse(termsEnum.MoveNext());
    }
    reader.Dispose();
}
private void PrintSegment(StreamWriter @out, SegmentCommitInfo si)
{
    SegmentReader reader = new SegmentReader(si, DirectoryReader.DEFAULT_TERMS_INDEX_DIVISOR, NewIOContext(Random));

    for (int i = 0; i < reader.NumDocs; i++)
    {
        @out.WriteLine(reader.Document(i));
    }

    Fields fields = reader.Fields;
    foreach (string field in fields)
    {
        Terms terms = fields.GetTerms(field);
        Assert.IsNotNull(terms);
        TermsEnum tis = terms.GetEnumerator();
        while (tis.MoveNext())
        {
            @out.Write(" term=" + field + ":" + tis.Term);
            @out.WriteLine(" DF=" + tis.DocFreq);

            DocsAndPositionsEnum positions = tis.DocsAndPositions(reader.LiveDocs, null);
            while (positions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
            {
                @out.Write(" doc=" + positions.DocID);
                @out.Write(" TF=" + positions.Freq);
                @out.Write(" pos=");
                @out.Write(positions.NextPosition());
                for (int j = 1; j < positions.Freq; j++)
                {
                    @out.Write("," + positions.NextPosition());
                }
                @out.WriteLine("");
            }
        }
    }
    reader.Dispose();
}
public override TermsEnum GetEnumerator() => m_input.GetEnumerator();
public virtual void Test()
{
    IList<string> postingsList = new List<string>();
    int numTerms = AtLeast(300);
    int maxTermsPerDoc = TestUtil.NextInt32(Random, 10, 20);

    bool isSimpleText = "SimpleText".Equals(TestUtil.GetPostingsFormat("field"), StringComparison.Ordinal);

    IndexWriterConfig iwc = NewIndexWriterConfig(Random, TEST_VERSION_CURRENT, new MockAnalyzer(Random));

    if ((isSimpleText || iwc.MergePolicy is MockRandomMergePolicy) && (TestNightly || RandomMultiplier > 1))
    {
        // Otherwise test can take way too long (> 2 hours)
        //numTerms /= 2;

        // LUCENENET specific - To keep this under the 1 hour free limit
        // of Azure DevOps, this was reduced from /2 to /6.
        numTerms /= 6;
    }

    if (Verbose)
    {
        Console.WriteLine("maxTermsPerDoc=" + maxTermsPerDoc);
        Console.WriteLine("numTerms=" + numTerms);
    }

    for (int i = 0; i < numTerms; i++)
    {
        string term = Convert.ToString(i, CultureInfo.InvariantCulture);
        for (int j = 0; j < i; j++)
        {
            postingsList.Add(term);
        }
    }
    postingsList.Shuffle(Random);

    ConcurrentQueue<string> postings = new ConcurrentQueue<string>(postingsList);

    Directory dir = NewFSDirectory(CreateTempDir("bagofpostings"));
    RandomIndexWriter iw = new RandomIndexWriter(Random, dir, iwc);

    int threadCount = TestUtil.NextInt32(Random, 1, 5);
    if (Verbose)
    {
        Console.WriteLine("config: " + iw.IndexWriter.Config);
        Console.WriteLine("threadCount=" + threadCount);
    }

    ThreadJob[] threads = new ThreadJob[threadCount];
    CountdownEvent startingGun = new CountdownEvent(1);

    for (int threadID = 0; threadID < threadCount; threadID++)
    {
        threads[threadID] = new ThreadAnonymousInnerClassHelper(this, maxTermsPerDoc, postings, iw, startingGun);
        threads[threadID].Start();
    }
    startingGun.Signal();
    foreach (ThreadJob t in threads)
    {
        t.Join();
    }

    iw.ForceMerge(1);
    DirectoryReader ir = iw.GetReader();
    Assert.AreEqual(1, ir.Leaves.Count);
    AtomicReader air = (AtomicReader)ir.Leaves[0].Reader;
    Terms terms = air.GetTerms("field");

    // numTerms-1 because there cannot be a term 0 with 0 postings:
#pragma warning disable 612, 618
    Assert.AreEqual(numTerms - 1, air.Fields.UniqueTermCount);
    if (iwc.Codec is Lucene3xCodec == false)
#pragma warning restore 612, 618
    {
        Assert.AreEqual(numTerms - 1, terms.Count);
    }

    TermsEnum termsEnum = terms.GetEnumerator();
    while (termsEnum.MoveNext())
    {
        int value = Convert.ToInt32(termsEnum.Term.Utf8ToString(), CultureInfo.InvariantCulture);
        Assert.AreEqual(value, termsEnum.DocFreq);
        // don't really need to check more than this, as CheckIndex
        // will verify that docFreq == actual number of documents seen
        // from a docsAndPositionsEnum.
    }
    ir.Dispose();
    iw.Dispose();
    dir.Dispose();
}
public virtual void Test()
{
    IList<string> postingsList = new JCG.List<string>();
    int numTerms = AtLeast(300);
    int maxTermsPerDoc = TestUtil.NextInt32(Random, 10, 20);

    bool isSimpleText = "SimpleText".Equals(TestUtil.GetPostingsFormat("field"), StringComparison.Ordinal);

    IndexWriterConfig iwc = NewIndexWriterConfig(Random, TEST_VERSION_CURRENT, new MockAnalyzer(Random));

    if ((isSimpleText || iwc.MergePolicy is MockRandomMergePolicy) && (TestNightly || RandomMultiplier > 1))
    {
        // Otherwise test can take way too long (> 2 hours)
        //numTerms /= 2;

        // LUCENENET specific - To keep this under the 1 hour free limit
        // of Azure DevOps, this was reduced from /2 to /6.
        numTerms /= 6;
    }

    if (Verbose)
    {
        Console.WriteLine("maxTermsPerDoc=" + maxTermsPerDoc);
        Console.WriteLine("numTerms=" + numTerms);
    }

    for (int i = 0; i < numTerms; i++)
    {
        string term = Convert.ToString(i, CultureInfo.InvariantCulture);
        for (int j = 0; j < i; j++)
        {
            postingsList.Add(term);
        }
    }
    postingsList.Shuffle(Random);

    ConcurrentQueue<string> postings = new ConcurrentQueue<string>(postingsList);

    Directory dir = NewFSDirectory(CreateTempDir(GetFullMethodName()));
    RandomIndexWriter iw = new RandomIndexWriter(Random, dir, iwc);

    int threadCount = TestUtil.NextInt32(Random, 1, 5);
    if (Verbose)
    {
        Console.WriteLine("config: " + iw.IndexWriter.Config);
        Console.WriteLine("threadCount=" + threadCount);
    }

    Field prototype = NewTextField("field", "", Field.Store.NO);
    FieldType fieldType = new FieldType(prototype.FieldType);
    if (Random.NextBoolean())
    {
        fieldType.OmitNorms = true;
    }
    int options = Random.Next(3);
    if (options == 0)
    {
        fieldType.IndexOptions = IndexOptions.DOCS_AND_FREQS; // we don't actually need positions
        fieldType.StoreTermVectors = true; // but enforce term vectors when we do this so we check SOMETHING
    }
    else if (options == 1 && !DoesntSupportOffsets.Contains(TestUtil.GetPostingsFormat("field")))
    {
        fieldType.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
    }
    // else just positions

    ThreadJob[] threads = new ThreadJob[threadCount];
    CountdownEvent startingGun = new CountdownEvent(1);

    for (int threadID = 0; threadID < threadCount; threadID++)
    {
        Random threadRandom = new Random(Random.Next());
        Document document = new Document();
        Field field = new Field("field", "", fieldType);
        document.Add(field);
        threads[threadID] = new ThreadAnonymousClass(this, numTerms, maxTermsPerDoc, postings, iw, startingGun, threadRandom, document, field);
        threads[threadID].Start();
    }
    startingGun.Signal();
    foreach (ThreadJob t in threads)
    {
        t.Join();
    }

    iw.ForceMerge(1);
    DirectoryReader ir = iw.GetReader();
    Assert.AreEqual(1, ir.Leaves.Count);
    AtomicReader air = (AtomicReader)ir.Leaves[0].Reader;
    Terms terms = air.GetTerms("field");

    // numTerms-1 because there cannot be a term 0 with 0 postings:
    Assert.AreEqual(numTerms - 1, terms.Count);
    TermsEnum termsEnum = terms.GetEnumerator();
    while (termsEnum.MoveNext())
    {
        int value = Convert.ToInt32(termsEnum.Term.Utf8ToString(), CultureInfo.InvariantCulture);
        Assert.AreEqual(value, termsEnum.TotalTermFreq);
        // don't really need to check more than this, as CheckIndex
        // will verify that totalTermFreq == total number of positions seen
        // from a docsAndPositionsEnum.
    }
    ir.Dispose();
    iw.Dispose();
    dir.Dispose();
}
public static void VerifyEquals(Fields d1, Fields d2)
{
    if (d1 == null)
    {
        Assert.IsTrue(d2 == null || d2.Count == 0);
        return;
    }
    Assert.IsTrue(d2 != null);

    IEnumerator<string> fieldsEnum2 = d2.GetEnumerator();

    foreach (string field1 in d1)
    {
        fieldsEnum2.MoveNext();
        string field2 = fieldsEnum2.Current;
        Assert.AreEqual(field1, field2);

        Terms terms1 = d1.GetTerms(field1);
        Assert.IsNotNull(terms1);
        TermsEnum termsEnum1 = terms1.GetEnumerator();

        Terms terms2 = d2.GetTerms(field2);
        Assert.IsNotNull(terms2);
        TermsEnum termsEnum2 = terms2.GetEnumerator();

        DocsAndPositionsEnum dpEnum1 = null;
        DocsAndPositionsEnum dpEnum2 = null;
        DocsEnum dEnum1 = null;
        DocsEnum dEnum2 = null;

        BytesRef term1;
        while (termsEnum1.MoveNext())
        {
            term1 = termsEnum1.Term;
            termsEnum2.MoveNext();
            BytesRef term2 = termsEnum2.Term;
            Assert.AreEqual(term1, term2);
            Assert.AreEqual(termsEnum1.TotalTermFreq, termsEnum2.TotalTermFreq);

            dpEnum1 = termsEnum1.DocsAndPositions(null, dpEnum1);
            dpEnum2 = termsEnum2.DocsAndPositions(null, dpEnum2);
            if (dpEnum1 != null)
            {
                Assert.IsNotNull(dpEnum2);
                int docID1 = dpEnum1.NextDoc();
                dpEnum2.NextDoc();
                // docIDs are not supposed to be equal
                //int docID2 = dpEnum2.NextDoc();
                //Assert.AreEqual(docID1, docID2);
                Assert.IsTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);

                int freq1 = dpEnum1.Freq;
                int freq2 = dpEnum2.Freq;
                Assert.AreEqual(freq1, freq2);
                IOffsetAttribute offsetAtt1 = dpEnum1.Attributes.HasAttribute<IOffsetAttribute>() ? dpEnum1.Attributes.GetAttribute<IOffsetAttribute>() : null;
                IOffsetAttribute offsetAtt2 = dpEnum2.Attributes.HasAttribute<IOffsetAttribute>() ? dpEnum2.Attributes.GetAttribute<IOffsetAttribute>() : null;

                if (offsetAtt1 != null)
                {
                    Assert.IsNotNull(offsetAtt2);
                }
                else
                {
                    Assert.IsNull(offsetAtt2);
                }

                for (int posUpto = 0; posUpto < freq1; posUpto++)
                {
                    int pos1 = dpEnum1.NextPosition();
                    int pos2 = dpEnum2.NextPosition();
                    Assert.AreEqual(pos1, pos2);
                    if (offsetAtt1 != null)
                    {
                        Assert.AreEqual(offsetAtt1.StartOffset, offsetAtt2.StartOffset);
                        Assert.AreEqual(offsetAtt1.EndOffset, offsetAtt2.EndOffset);
                    }
                }
                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum1.NextDoc());
                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum2.NextDoc());
            }
            else
            {
                dEnum1 = TestUtil.Docs(Random, termsEnum1, null, dEnum1, DocsFlags.FREQS);
                dEnum2 = TestUtil.Docs(Random, termsEnum2, null, dEnum2, DocsFlags.FREQS);
                Assert.IsNotNull(dEnum1);
                Assert.IsNotNull(dEnum2);
                int docID1 = dEnum1.NextDoc();
                dEnum2.NextDoc();
                // docIDs are not supposed to be equal
                //int docID2 = dEnum2.NextDoc();
                //Assert.AreEqual(docID1, docID2);
                Assert.IsTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);
                int freq1 = dEnum1.Freq;
                int freq2 = dEnum2.Freq;
                Assert.AreEqual(freq1, freq2);
                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dEnum1.NextDoc());
                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dEnum2.NextDoc());
            }
        }
        Assert.IsFalse(termsEnum2.MoveNext());
    }
    Assert.IsFalse(fieldsEnum2.MoveNext());
}
public virtual void VerifyEquals(DirectoryReader r1, DirectoryReader r2, string idField)
{
    if (Verbose)
    {
        Console.WriteLine("\nr1 docs:");
        PrintDocs(r1);
        Console.WriteLine("\nr2 docs:");
        PrintDocs(r2);
    }
    if (r1.NumDocs != r2.NumDocs)
    {
        if (Debugging.AssertsEnabled) Debugging.Assert(false, "r1.NumDocs={0} vs r2.NumDocs={1}", r1.NumDocs, r2.NumDocs);
    }
    bool hasDeletes = !(r1.MaxDoc == r2.MaxDoc && r1.NumDocs == r1.MaxDoc);

    int[] r2r1 = new int[r2.MaxDoc]; // r2 id to r1 id mapping

    // create mapping from id2 space to id1 based on idField
    Fields f1 = MultiFields.GetFields(r1);
    if (f1 == null)
    {
        // make sure r2 is empty
        Assert.IsNull(MultiFields.GetFields(r2));
        return;
    }
    Terms terms1 = f1.GetTerms(idField);
    if (terms1 == null)
    {
        Assert.IsTrue(MultiFields.GetFields(r2) == null || MultiFields.GetFields(r2).GetTerms(idField) == null);
        return;
    }
    TermsEnum termsEnum = terms1.GetEnumerator();

    IBits liveDocs1 = MultiFields.GetLiveDocs(r1);
    IBits liveDocs2 = MultiFields.GetLiveDocs(r2);

    Fields fields = MultiFields.GetFields(r2);
    if (fields == null)
    {
        // make sure r1 is in fact empty (eg has only all
        // deleted docs):
        IBits liveDocs = MultiFields.GetLiveDocs(r1);
        DocsEnum docs = null;
        while (termsEnum.MoveNext())
        {
            docs = TestUtil.Docs(Random, termsEnum, liveDocs, docs, DocsFlags.NONE);
            while (docs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
            {
                Assert.Fail("r1 is not empty but r2 is");
            }
        }
        return;
    }
    Terms terms2 = fields.GetTerms(idField);
    TermsEnum termsEnum2 = terms2.GetEnumerator();

    DocsEnum termDocs1 = null;
    DocsEnum termDocs2 = null;

    while (termsEnum.MoveNext())
    {
        BytesRef term = termsEnum.Term;
        //System.out.println("TEST: match id term=" + term);

        termDocs1 = TestUtil.Docs(Random, termsEnum, liveDocs1, termDocs1, DocsFlags.NONE);
        if (termsEnum2.SeekExact(term))
        {
            termDocs2 = TestUtil.Docs(Random, termsEnum2, liveDocs2, termDocs2, DocsFlags.NONE);
        }
        else
        {
            termDocs2 = null;
        }

        if (termDocs1.NextDoc() == DocIdSetIterator.NO_MORE_DOCS)
        {
            // this doc is deleted and wasn't replaced
            Assert.IsTrue(termDocs2 == null || termDocs2.NextDoc() == DocIdSetIterator.NO_MORE_DOCS);
            continue;
        }

        int id1 = termDocs1.DocID;
        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, termDocs1.NextDoc());

        Assert.IsTrue(termDocs2.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
        int id2 = termDocs2.DocID;
        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, termDocs2.NextDoc());

        r2r1[id2] = id1;

        // verify stored fields are equivalent
        try
        {
            VerifyEquals(r1.Document(id1), r2.Document(id2));
        }
        catch (Exception /*t*/)
        {
            Console.WriteLine("FAILED id=" + term + " id1=" + id1 + " id2=" + id2 + " term=" + term);
            Console.WriteLine(" d1=" + r1.Document(id1));
            Console.WriteLine(" d2=" + r2.Document(id2));
            throw; // LUCENENET: CA2200: Rethrow to preserve stack details (https://docs.microsoft.com/en-us/visualstudio/code-quality/ca2200-rethrow-to-preserve-stack-details)
        }

        try
        {
            // verify term vectors are equivalent
            VerifyEquals(r1.GetTermVectors(id1), r2.GetTermVectors(id2));
        }
        catch (Exception /*e*/)
        {
            Console.WriteLine("FAILED id=" + term + " id1=" + id1 + " id2=" + id2);
            Fields tv1 = r1.GetTermVectors(id1);
            Console.WriteLine(" d1=" + tv1);
            if (tv1 != null)
            {
                DocsAndPositionsEnum dpEnum = null;
                DocsEnum dEnum = null;
                foreach (string field in tv1)
                {
                    Console.WriteLine(" " + field + ":");
                    Terms terms3 = tv1.GetTerms(field);
                    Assert.IsNotNull(terms3);
                    TermsEnum termsEnum3 = terms3.GetEnumerator();
                    while (termsEnum3.MoveNext())
                    {
                        Console.WriteLine(" " + termsEnum3.Term.Utf8ToString() + ": freq=" + termsEnum3.TotalTermFreq);
                        dpEnum = termsEnum3.DocsAndPositions(null, dpEnum);
                        if (dpEnum != null)
                        {
                            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                            int freq = dpEnum.Freq;
                            Console.WriteLine(" doc=" + dpEnum.DocID + " freq=" + freq);
                            for (int posUpto = 0; posUpto < freq; posUpto++)
                            {
                                Console.WriteLine(" pos=" + dpEnum.NextPosition());
                            }
                        }
                        else
                        {
                            dEnum = TestUtil.Docs(Random, termsEnum3, null, dEnum, DocsFlags.FREQS);
                            Assert.IsNotNull(dEnum);
                            Assert.IsTrue(dEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                            int freq = dEnum.Freq;
                            Console.WriteLine(" doc=" + dEnum.DocID + " freq=" + freq);
                        }
                    }
                }
            }

            Fields tv2 = r2.GetTermVectors(id2);
            Console.WriteLine(" d2=" + tv2);
            if (tv2 != null)
            {
                DocsAndPositionsEnum dpEnum = null;
                DocsEnum dEnum = null;
                foreach (string field in tv2)
                {
                    Console.WriteLine(" " + field + ":");
                    Terms terms3 = tv2.GetTerms(field);
                    Assert.IsNotNull(terms3);
                    TermsEnum termsEnum3 = terms3.GetEnumerator();
                    while (termsEnum3.MoveNext())
                    {
                        Console.WriteLine(" " + termsEnum3.Term.Utf8ToString() + ": freq=" + termsEnum3.TotalTermFreq);
                        dpEnum = termsEnum3.DocsAndPositions(null, dpEnum);
                        if (dpEnum != null)
                        {
                            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                            int freq = dpEnum.Freq;
                            Console.WriteLine(" doc=" + dpEnum.DocID + " freq=" + freq);
                            for (int posUpto = 0; posUpto < freq; posUpto++)
                            {
                                Console.WriteLine(" pos=" + dpEnum.NextPosition());
                            }
                        }
                        else
                        {
                            dEnum = TestUtil.Docs(Random, termsEnum3, null, dEnum, DocsFlags.FREQS);
                            Assert.IsNotNull(dEnum);
                            Assert.IsTrue(dEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                            int freq = dEnum.Freq;
                            Console.WriteLine(" doc=" + dEnum.DocID + " freq=" + freq);
                        }
                    }
                }
            }

            throw; // LUCENENET: CA2200: Rethrow to preserve stack details (https://docs.microsoft.com/en-us/visualstudio/code-quality/ca2200-rethrow-to-preserve-stack-details)
        }
    }

    //System.out.println("TEST: done match id");

    // Verify postings
    //System.out.println("TEST: create te1");
    Fields fields1 = MultiFields.GetFields(r1);
    IEnumerator<string> fields1Enum = fields1.GetEnumerator();
    Fields fields2 = MultiFields.GetFields(r2);
    IEnumerator<string> fields2Enum = fields2.GetEnumerator();

    string field1 = null, field2 = null;
    TermsEnum termsEnum1 = null;
    termsEnum2 = null;
    DocsEnum docs1 = null, docs2 = null;

    // pack both doc and freq into single element for easy sorting
    long[] info1 = new long[r1.NumDocs];
    long[] info2 = new long[r2.NumDocs];

    for (; ;)
    {
        BytesRef term1 = null, term2 = null;

        // iterate until we get some docs
        int len1;
        for (; ;)
        {
            len1 = 0;
            if (termsEnum1 == null)
            {
                if (!fields1Enum.MoveNext())
                {
                    break;
                }
                field1 = fields1Enum.Current;
                Terms terms = fields1.GetTerms(field1);
                if (terms == null)
                {
                    continue;
                }
                termsEnum1 = terms.GetEnumerator();
            }
            if (!termsEnum1.MoveNext())
            {
                // no more terms in this field
                term1 = null;
                termsEnum1 = null;
                continue;
            }
            term1 = termsEnum1.Term;

            //System.out.println("TEST: term1=" + term1);
            docs1 = TestUtil.Docs(Random, termsEnum1, liveDocs1, docs1, DocsFlags.FREQS);
            while (docs1.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
            {
                int d = docs1.DocID;
                int f = docs1.Freq;
                info1[len1] = (((long)d) << 32) | (uint)f;
                len1++;
            }
            if (len1 > 0)
            {
                break;
            }
        }

        // iterate until we get some docs
        int len2;
        for (; ;)
        {
            len2 = 0;
            if (termsEnum2 == null)
            {
                if (!fields2Enum.MoveNext())
                {
                    break;
                }
                field2 = fields2Enum.Current;
                Terms terms = fields2.GetTerms(field2);
                if (terms == null)
                {
                    continue;
                }
                termsEnum2 = terms.GetEnumerator();
            }
            if (!termsEnum2.MoveNext())
            {
                // no more terms in this field
                term2 = null;
                termsEnum2 = null;
                continue;
            }
            term2 = termsEnum2.Term;

            //System.out.println("TEST: term2=" + term2);
            docs2 = TestUtil.Docs(Random, termsEnum2, liveDocs2, docs2, DocsFlags.FREQS);
            while (docs2.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
            {
                int d = r2r1[docs2.DocID];
                int f = docs2.Freq;
                info2[len2] = (((long)d) << 32) | (uint)f;
                len2++;
            }
            if (len2 > 0)
            {
                break;
            }
        }

        Assert.AreEqual(len1, len2);
        if (len1 == 0) // no more terms
        {
            break;
        }

        Assert.AreEqual(field1, field2);
        Assert.IsTrue(term1.BytesEquals(term2));

        if (!hasDeletes)
        {
            Assert.AreEqual(termsEnum1.DocFreq, termsEnum2.DocFreq);
        }

        Assert.AreEqual(term1, term2, "len1=" + len1 + " len2=" + len2 + " deletes?=" + hasDeletes);

        // sort info2 to get it into ascending docid
        Array.Sort(info2, 0, len2);

        // now compare
        for (int i = 0; i < len1; i++)
        {
            Assert.AreEqual(info1[i], info2[i],
                "i=" + i + " len=" + len1
                + " d1=" + ((long)((ulong)info1[i] >> 32)) + " f1=" + (info1[i] & int.MaxValue)
                + " d2=" + ((long)((ulong)info2[i] >> 32)) + " f2=" + (info2[i] & int.MaxValue)
                + " field=" + field1 + " term=" + term1.Utf8ToString());
        }
    }
}
public virtual void TestDoubleOffsetCounting()
{
    Directory dir = NewDirectory();
    IndexWriter w = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)));
    Document doc = new Document();
    FieldType customType = new FieldType(StringField.TYPE_NOT_STORED);
    customType.StoreTermVectors = true;
    customType.StoreTermVectorPositions = true;
    customType.StoreTermVectorOffsets = true;
    Field f = NewField("field", "abcd", customType);
    doc.Add(f);
    doc.Add(f);
    Field f2 = NewField("field", "", customType);
    doc.Add(f2);
    doc.Add(f);
    w.AddDocument(doc);
    w.Dispose();

    IndexReader r = DirectoryReader.Open(dir);
    Terms vector = r.GetTermVectors(0).GetTerms("field");
    Assert.IsNotNull(vector);
    TermsEnum termsEnum = vector.GetEnumerator();
    Assert.IsTrue(termsEnum.MoveNext());
    Assert.AreEqual("", termsEnum.Term.Utf8ToString());

    // Token "" occurred once
    Assert.AreEqual(1, termsEnum.TotalTermFreq);

    DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null);
    Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    dpEnum.NextPosition();
    Assert.AreEqual(8, dpEnum.StartOffset);
    Assert.AreEqual(8, dpEnum.EndOffset);
    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());

    // Token "abcd" occurred three times
    Assert.IsTrue(termsEnum.MoveNext());
    Assert.AreEqual(new BytesRef("abcd"), termsEnum.Term);
    dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
    Assert.AreEqual(3, termsEnum.TotalTermFreq);

    Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    dpEnum.NextPosition();
    Assert.AreEqual(0, dpEnum.StartOffset);
    Assert.AreEqual(4, dpEnum.EndOffset);

    dpEnum.NextPosition();
    Assert.AreEqual(4, dpEnum.StartOffset);
    Assert.AreEqual(8, dpEnum.EndOffset);

    dpEnum.NextPosition();
    Assert.AreEqual(8, dpEnum.StartOffset);
    Assert.AreEqual(12, dpEnum.EndOffset);

    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
    Assert.IsFalse(termsEnum.MoveNext());
    r.Dispose();
    dir.Dispose();
}
// DocValues updates
private void ApplyDocValuesUpdates<T1>(IEnumerable<T1> updates, ReadersAndUpdates rld, SegmentReader reader, DocValuesFieldUpdates.Container dvUpdatesContainer)
    where T1 : DocValuesUpdate
{
    lock (this)
    {
        Fields fields = reader.Fields;
        if (fields == null)
        {
            // this reader has no postings
            return;
        }

        // TODO: we can process the updates per DV field, from last to first so that
        // if multiple terms affect same document for the same field, we add an update
        // only once (that of the last term). To do that, we can keep a bitset which
        // marks which documents have already been updated. So e.g. if term T1
        // updates doc 7, and then we process term T2 and it updates doc 7 as well,
        // we don't apply the update since we know T1 came last and therefore wins
        // the update.
        // We can also use that bitset as 'liveDocs' to pass to TermEnum.docs(), so
        // that these documents aren't even returned.

        string currentField = null;
        TermsEnum termsEnum = null;
        DocsEnum docs = null;

        //System.out.println(Thread.currentThread().getName() + " numericDVUpdate reader=" + reader);
        foreach (DocValuesUpdate update in updates)
        {
            Term term = update.term;
            int limit = update.docIDUpto;

            // TODO: we traverse the terms in update order (not term order) so that we
            // apply the updates in the correct order, i.e. if two terms update the
            // same document, the last one that came in wins, irrespective of the
            // terms lexical order.
            // we can apply the updates in terms order if we keep an updatesGen (and
            // increment it with every update) and attach it to each NumericUpdate. Note
            // that we cannot rely only on docIDUpto because an app may send two updates
            // which will get same docIDUpto, yet will still need to respect the order
            // those updates arrived.

            if (!string.Equals(term.Field, currentField, StringComparison.Ordinal))
            {
                // if we change the code to process updates in terms order, enable this assert
                // assert currentField == null || currentField.CompareToOrdinal(term.Field) < 0;
                currentField = term.Field;
                Terms terms = fields.GetTerms(currentField);
                if (terms != null)
                {
                    termsEnum = terms.GetEnumerator(termsEnum);
                }
                else
                {
                    termsEnum = null;
                    continue; // no terms in that field
                }
            }

            if (termsEnum == null)
            {
                continue;
            }
            // System.out.println("  term=" + term);

            if (termsEnum.SeekExact(term.Bytes))
            {
                // we don't need term frequencies for this
                DocsEnum docsEnum = termsEnum.Docs(rld.LiveDocs, docs, DocsFlags.NONE);
                //System.out.println("BDS: got docsEnum=" + docsEnum);

                DocValuesFieldUpdates dvUpdates = dvUpdatesContainer.GetUpdates(update.field, update.type);
                if (dvUpdates == null)
                {
                    dvUpdates = dvUpdatesContainer.NewUpdates(update.field, update.type, reader.MaxDoc);
                }
                int doc;
                while ((doc = docsEnum.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
                {
                    //System.out.println(Thread.currentThread().getName() + " numericDVUpdate term=" + term + " doc=" + docID);
                    if (doc >= limit)
                    {
                        break; // no more docs that can be updated for this term
                    }
                    dvUpdates.Add(doc, update.value);
                }
            }
        }
    }
}
public virtual void TestPositionReader()
{
    TermVectorsReader reader = Codec.Default.TermVectorsFormat.VectorsReader(dir, seg.Info, fieldInfos, NewIOContext(Random));
    //BytesRef[] terms; // LUCENENET NOTE: Not used in Lucene

    Terms vector = reader.Get(0).GetTerms(testFields[0]);
    Assert.IsNotNull(vector);
    Assert.AreEqual(testTerms.Length, vector.Count);
    TermsEnum termsEnum = vector.GetEnumerator();
    DocsAndPositionsEnum dpEnum = null;
    for (int i = 0; i < testTerms.Length; i++)
    {
        Assert.IsTrue(termsEnum.MoveNext());
        BytesRef text = termsEnum.Term;
        string term = text.Utf8ToString();
        //System.out.println("Term: " + term);
        Assert.AreEqual(testTerms[i], term);

        dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
        Assert.IsNotNull(dpEnum);
        int doc = dpEnum.DocID;
        Assert.AreEqual(-1, doc);
        Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
        Assert.AreEqual(dpEnum.Freq, positions[i].Length);
        for (int j = 0; j < positions[i].Length; j++)
        {
            Assert.AreEqual(positions[i][j], dpEnum.NextPosition());
        }
        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());

        dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
        doc = dpEnum.DocID;
        Assert.AreEqual(-1, doc);
        Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
        Assert.IsNotNull(dpEnum);
        Assert.AreEqual(dpEnum.Freq, positions[i].Length);
        for (int j = 0; j < positions[i].Length; j++)
        {
            Assert.AreEqual(positions[i][j], dpEnum.NextPosition());
            Assert.AreEqual(j * 10, dpEnum.StartOffset);
            Assert.AreEqual(j * 10 + testTerms[i].Length, dpEnum.EndOffset);
        }
        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
    }

    Terms freqVector = reader.Get(0).GetTerms(testFields[1]); // no pos, no offset
    Assert.IsNotNull(freqVector);
    Assert.AreEqual(testTerms.Length, freqVector.Count);
    termsEnum = freqVector.GetEnumerator();
    Assert.IsNotNull(termsEnum);
    for (int i = 0; i < testTerms.Length; i++)
    {
        Assert.IsTrue(termsEnum.MoveNext());
        BytesRef text = termsEnum.Term;
        string term = text.Utf8ToString();
        //System.out.println("Term: " + term);
        Assert.AreEqual(testTerms[i], term);
        Assert.IsNotNull(termsEnum.Docs(null, null));
        Assert.IsNull(termsEnum.DocsAndPositions(null, null)); // no pos
    }
    reader.Dispose();
}
/// <summary>
/// Call this only once (if you subclass!) </summary>
protected virtual void Uninvert(AtomicReader reader, IBits liveDocs, BytesRef termPrefix)
{
    FieldInfo info = reader.FieldInfos.FieldInfo(m_field);
    if (info != null && info.HasDocValues)
    {
        throw IllegalStateException.Create("Type mismatch: " + m_field + " was indexed as " + info.DocValuesType);
    }
    //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
    long startTime = Environment.TickCount;
    m_prefix = termPrefix == null ? null : BytesRef.DeepCopyOf(termPrefix);

    int maxDoc = reader.MaxDoc;
    int[] index = new int[maxDoc]; // immediate term numbers, or the index into the byte[] representing the last number
    int[] lastTerm = new int[maxDoc]; // last term we saw for this document
    var bytes = new sbyte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

    Fields fields = reader.Fields;
    if (fields == null)
    {
        // No terms
        return;
    }
    Terms terms = fields.GetTerms(m_field);
    if (terms == null)
    {
        // No terms
        return;
    }

    TermsEnum te = terms.GetEnumerator();
    BytesRef seekStart = termPrefix ?? new BytesRef();
    //System.out.println("seekStart=" + seekStart.utf8ToString());
    if (te.SeekCeil(seekStart) == TermsEnum.SeekStatus.END)
    {
        // No terms match
        return;
    }

    // If we need our "term index wrapper", these will be
    // init'd below:
    IList<BytesRef> indexedTerms = null;
    PagedBytes indexedTermsBytes = null;

    bool testedOrd = false;

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    var tempArr = new sbyte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs
    //
    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values. this requires going over the field first to find the most
    // frequent terms ahead of time.

    int termNum = 0;
    m_docsEnum = null;

    // Loop begins with te positioned to first term (we call
    // seek above):
    for (; ;)
    {
        BytesRef t = te.Term;
        if (t == null || (termPrefix != null && !StringHelper.StartsWith(t, termPrefix)))
        {
            break;
        }
        //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);

        if (!testedOrd)
        {
            try
            {
                m_ordBase = (int)te.Ord;
                //System.out.println("got ordBase=" + ordBase);
            }
            catch (Exception uoe) when (uoe.IsUnsupportedOperationException())
            {
                // Reader cannot provide ord support, so we wrap
                // our own support by creating our own terms index:
                indexedTerms = new List<BytesRef>();
                indexedTermsBytes = new PagedBytes(15);
                //System.out.println("NO ORDS");
            }
            testedOrd = true;
        }

        VisitTerm(te, termNum);

        if (indexedTerms != null && (termNum & indexIntervalMask) == 0)
        {
            // Index this term
            m_sizeOfIndexedStrings += t.Length;
            BytesRef indexedTerm = new BytesRef();
            indexedTermsBytes.Copy(t, indexedTerm);
            // TODO: really should 1) strip off useless suffix,
            // and 2) use FST not array/PagedBytes
            indexedTerms.Add(indexedTerm);
        }

        int df = te.DocFreq;
        if (df <= m_maxTermDocFreq)
        {
            m_docsEnum = te.Docs(liveDocs, m_docsEnum, DocsFlags.NONE);

            // dF, but takes deletions into account
            int actualDF = 0;

            for (; ;)
            {
                int doc = m_docsEnum.NextDoc();
                if (doc == DocIdSetIterator.NO_MORE_DOCS)
                {
                    break;
                }
                //System.out.println(" chunk=" + chunk + " docs");

                actualDF++;
                m_termInstances++;

                //System.out.println(" docID=" + doc);
                // add TNUM_OFFSET to the term number to make room for special reserved values:
                // 0 (end term) and 1 (index into byte array follows)
                int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
                lastTerm[doc] = termNum;
                int val = index[doc];

                if ((val & 0xff) == 1)
                {
                    // index into byte array (actually the end of
                    // the doc-specific byte[] when building)
                    int pos = val.TripleShift(8);
                    int ilen = VInt32Size(delta);
                    var arr = bytes[doc];
                    int newend = pos + ilen;
                    if (newend > arr.Length)
                    {
                        // We avoid a doubling strategy to lower memory usage.
                        // this faceting method isn't for docs with many terms.
                        // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
                        // TODO: figure out what array lengths we can round up to w/o actually using more memory
                        // (how much space does a byte[] take up? Is data preceded by a 32 bit length only?
                        // It should be safe to round up to the nearest 32 bits in any case.
                        int newLen = (newend + 3) & unchecked((int)0xfffffffc); // 4 byte alignment
                        var newarr = new sbyte[newLen];
                        Array.Copy(arr, 0, newarr, 0, pos);
                        arr = newarr;
                        bytes[doc] = newarr;
                    }
                    pos = WriteInt32(delta, arr, pos);
                    index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
                }
                else
                {
                    // OK, this int has data in it... find the end (a zero starting byte - not
                    // part of another number, hence not following a byte with the high bit set).
                    int ipos;
                    if (val == 0)
                    {
                        ipos = 0;
                    }
                    else if ((val & 0x0000ff80) == 0)
                    {
                        ipos = 1;
                    }
                    else if ((val & 0x00ff8000) == 0)
                    {
                        ipos = 2;
                    }
                    else if ((val & 0xff800000) == 0)
                    {
                        ipos = 3;
                    }
                    else
                    {
                        ipos = 4;
                    }

                    //System.out.println(" ipos=" + ipos);

                    int endPos = WriteInt32(delta, tempArr, ipos);
                    //System.out.println(" endpos=" + endPos);
                    if (endPos <= 4)
                    {
                        //System.out.println(" fits!");
                        // value will fit in the integer... move bytes back
                        for (int j = ipos; j < endPos; j++)
                        {
                            val |= (tempArr[j] & 0xff) << (j << 3);
                        }
                        index[doc] = val;
                    }
                    else
                    {
                        // value won't fit... move integer into byte[]
                        for (int j = 0; j < ipos; j++)
                        {
                            tempArr[j] = (sbyte)val;
                            val = val.TripleShift(8);
                        }
                        // point at the end index in the byte[]
                        index[doc] = (endPos << 8) | 1;
                        bytes[doc] = tempArr;
                        tempArr = new sbyte[12];
                    }
                }
            }
            SetActualDocFreq(termNum, actualDF);
        }

        termNum++;
        if (!te.MoveNext())
        {
            break;
        }
    }

    m_numTermsInField = termNum;

    long midPoint = Environment.TickCount;

    if (m_termInstances == 0)
    {
        // we didn't invert anything
        // lower memory consumption.
        m_tnums = null;
    }
    else
    {
        this.m_index = index;

        //
        // transform intermediate form into the final form, building a single byte[]
        // at a time, and releasing the intermediate byte[]s as we go to avoid
        // increasing the memory footprint.
        //

        for (int pass = 0; pass < 256; pass++)
        {
            var target = m_tnums[pass];
            var pos = 0; // end in target;
            if (target != null)
            {
                pos = target.Length;
            }
            else
            {
                target = new sbyte[4096];
            }

            // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
            // where pp is the pass (which array we are building), and xx is all values.
            // each pass shares the same byte[] for termNumber lists.
            for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24))
            {
                int lim = Math.Min(docbase + (1 << 16), maxDoc);
                for (int doc = docbase; doc < lim; doc++)
                {
                    //System.out.println(" pass=" + pass + " process docID=" + doc);
                    int val = index[doc];
                    if ((val & 0xff) == 1)
                    {
                        int len = val.TripleShift(8);
                        //System.out.println(" ptr pos=" + pos);
                        index[doc] = (pos << 8) | 1; // change index to point to start of array
                        if ((pos & 0xff000000) != 0)
                        {
                            // we only have 24 bits for the array index
                            throw IllegalStateException.Create("Too many values for UnInvertedField faceting on field " + m_field);
                        }
                        var arr = bytes[doc];
                        /*
                         * for(byte b : arr) {
                         *   //System.out.println(" b=" + Integer.toHexString((int) b));
                         * }
                         */
                        bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
                        if (target.Length <= pos + len)
                        {
                            int newlen = target.Length;

                            //* we don't have to worry about the array getting too large
                            // since the "pos" param will overflow first (only 24 bits available)
                            // if ((newlen<<1) <= 0) {
                            //   // overflow...
                            //   newlen = Integer.MAX_VALUE;
                            //   if (newlen <= pos + len) {
                            //     throw new SolrException(400,"Too many terms to uninvert field!");
                            //   }
                            // } else {
                            //   while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                            // }
                            //

                            while (newlen <= pos + len) // doubling strategy
                            {
                                newlen <<= 1;
                            }
                            var newtarget = new sbyte[newlen];
                            Array.Copy(target, 0, newtarget, 0, pos);
                            target = newtarget;
                        }
                        Array.Copy(arr, 0, target, pos, len);
                        pos += len + 1; // skip single byte at end and leave it 0 for terminator
                    }
                }
            }

            // shrink array
            if (pos < target.Length)
            {
                var newtarget = new sbyte[pos];
                Array.Copy(target, 0, newtarget, 0, pos);
                target = newtarget;
            }

            m_tnums[pass] = target;

            if ((pass << 16) > maxDoc)
            {
                break;
            }
        }
    }
    if (indexedTerms != null)
    {
        m_indexedTermsArray = new BytesRef[indexedTerms.Count];
        indexedTerms.CopyTo(m_indexedTermsArray, 0);
    }

    long endTime = Environment.TickCount;

    m_total_time = (int)(endTime - startTime);
    m_phase1_time = (int)(midPoint - startTime);
}
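// Illustrative sketch (a standalone helper, not part of DocTermOrds): the per-document
// term lists built above store term-number deltas as variable-width ints, with
// TNUM_OFFSET reserving the small values 0 and 1. For term numbers {5, 12, 40} and
// TNUM_OFFSET = 2 the encoded deltas are 7, 9 and 30. This sketch uses the common
// LSB-first vInt layout for illustration; DocTermOrds' own WriteInt32 uses a different
// (MSB-first) byte order, so treat this as an assumption, not the real encoding.
private static int EncodeDeltas(int[] termNums, int tnumOffset, byte[] arr)
{
    int pos = 0;
    int last = 0;
    foreach (int termNum in termNums)
    {
        int value = termNum - last + tnumOffset; // delta, shifted past reserved values
        last = termNum;
        while ((value & ~0x7F) != 0)
        {
            arr[pos++] = (byte)((value & 0x7F) | 0x80); // low 7 bits, continuation bit set
            value = (int)((uint)value >> 7);
        }
        arr[pos++] = (byte)value; // final byte, high bit clear
    }
    return pos; // end offset, analogous to the "pos" bookkeeping in Uninvert
}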
public virtual void TestArbitraryFields()
{
    Directory dir = NewDirectory();
    RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir);

    int NUM_DOCS = AtLeast(27);
    if (Verbose)
    {
        Console.WriteLine("TEST: " + NUM_DOCS + " docs");
    }
    int[] fieldsPerDoc = new int[NUM_DOCS];
    int baseCount = 0;

    for (int docCount = 0; docCount < NUM_DOCS; docCount++)
    {
        int fieldCount = TestUtil.NextInt32(Random, 1, 17);
        fieldsPerDoc[docCount] = fieldCount - 1;

        int finalDocCount = docCount;
        if (Verbose)
        {
            Console.WriteLine("TEST: " + fieldCount + " fields in doc " + docCount);
        }

        int finalBaseCount = baseCount;
        baseCount += fieldCount - 1;

        w.AddDocument(new IterableAnonymousInnerClassHelper(this, fieldCount, finalDocCount, finalBaseCount));
    }

    IndexReader r = w.GetReader();
    w.Dispose();

    IndexSearcher s = NewSearcher(r);
    int counter = 0;
    for (int id = 0; id < NUM_DOCS; id++)
    {
        if (Verbose)
        {
            Console.WriteLine("TEST: verify doc id=" + id + " (" + fieldsPerDoc[id] + " fields) counter=" + counter);
        }
        TopDocs hits = s.Search(new TermQuery(new Term("id", "" + id)), 1);
        Assert.AreEqual(1, hits.TotalHits);
        int docID = hits.ScoreDocs[0].Doc;
        Document doc = s.Doc(docID);
        int endCounter = counter + fieldsPerDoc[id];
        while (counter < endCounter)
        {
            string name = "f" + counter;
            int fieldID = counter % 10;

            bool stored = (counter & 1) == 0 || fieldID == 3;
            bool binary = fieldID == 3;
            bool indexed = fieldID != 3;

            string stringValue;
            if (fieldID != 3 && fieldID != 9)
            {
                stringValue = "text " + counter;
            }
            else
            {
                stringValue = null;
            }

            // stored:
            if (stored)
            {
                IIndexableField f = doc.GetField(name);
                Assert.IsNotNull(f, "doc " + id + " doesn't have field f" + counter);
                if (binary)
                {
                    Assert.IsNotNull(f, "doc " + id + " doesn't have field f" + counter);
                    BytesRef b = f.GetBinaryValue();
                    Assert.IsNotNull(b);
                    Assert.AreEqual(10, b.Length);
                    for (int idx = 0; idx < 10; idx++)
                    {
                        Assert.AreEqual((byte)(idx + counter), b.Bytes[b.Offset + idx]);
                    }
                }
                else
                {
                    if (Debugging.AssertsEnabled) Debugging.Assert(stringValue != null);
                    Assert.AreEqual(stringValue, f.GetStringValue());
                }
            }

            if (indexed)
            {
                bool tv = counter % 2 == 1 && fieldID != 9;
                if (tv)
                {
                    Terms tfv = r.GetTermVectors(docID).GetTerms(name);
                    Assert.IsNotNull(tfv);
                    TermsEnum termsEnum = tfv.GetEnumerator();
                    Assert.IsTrue(termsEnum.MoveNext());
                    Assert.AreEqual(new BytesRef("" + counter), termsEnum.Term);
                    Assert.AreEqual(1, termsEnum.TotalTermFreq);
                    DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null);
                    Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                    Assert.AreEqual(1, dpEnum.Freq);
                    Assert.AreEqual(1, dpEnum.NextPosition());

                    Assert.IsTrue(termsEnum.MoveNext());
                    Assert.AreEqual(new BytesRef("text"), termsEnum.Term);
                    Assert.AreEqual(1, termsEnum.TotalTermFreq);
                    dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
                    Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                    Assert.AreEqual(1, dpEnum.Freq);
                    Assert.AreEqual(0, dpEnum.NextPosition());

                    Assert.IsFalse(termsEnum.MoveNext());

                    // TODO: offsets
                }
                else
                {
                    Fields vectors = r.GetTermVectors(docID);
                    Assert.IsTrue(vectors == null || vectors.GetTerms(name) == null);
                }

                BooleanQuery bq = new BooleanQuery();
                bq.Add(new TermQuery(new Term("id", "" + id)), Occur.MUST);
                bq.Add(new TermQuery(new Term(name, "text")), Occur.MUST);
                TopDocs hits2 = s.Search(bq, 1);
                Assert.AreEqual(1, hits2.TotalHits);
                Assert.AreEqual(docID, hits2.ScoreDocs[0].Doc);

                bq = new BooleanQuery();
                bq.Add(new TermQuery(new Term("id", "" + id)), Occur.MUST);
                bq.Add(new TermQuery(new Term(name, "" + counter)), Occur.MUST);
                TopDocs hits3 = s.Search(bq, 1);
                Assert.AreEqual(1, hits3.TotalHits);
                Assert.AreEqual(docID, hits3.ScoreDocs[0].Doc);
            }

            counter++;
        }
    }

    r.Dispose();
    dir.Dispose();
}
private void Verify(AtomicReader r, int[][] idToOrds, BytesRef[] termsArray, BytesRef prefixRef)
{
    DocTermOrds dto = new DocTermOrds(r, r.LiveDocs, "field", prefixRef, int.MaxValue, TestUtil.NextInt32(Random, 2, 10));

    FieldCache.Int32s docIDToID = FieldCache.DEFAULT.GetInt32s(r, "id", false);
    /*
     * for(int docID=0;docID<subR.MaxDoc;docID++) {
     *   System.out.println(" docID=" + docID + " id=" + docIDToID[docID]);
     * }
     */

    if (Verbose)
    {
        Console.WriteLine("TEST: verify prefix=" + (prefixRef == null ? "null" : prefixRef.Utf8ToString()));
        Console.WriteLine("TEST: all TERMS:");
        TermsEnum allTE = MultiFields.GetTerms(r, "field").GetEnumerator();
        int ord = 0;
        while (allTE.MoveNext())
        {
            Console.WriteLine(" ord=" + (ord++) + " term=" + allTE.Term.Utf8ToString());
        }
    }

    //final TermsEnum te = subR.Fields.Terms("field").iterator();
    TermsEnum te = dto.GetOrdTermsEnum(r);
    if (dto.NumTerms == 0)
    {
        if (prefixRef == null)
        {
            Assert.IsNull(MultiFields.GetTerms(r, "field"));
        }
        else
        {
            Terms terms = MultiFields.GetTerms(r, "field");
            if (terms != null)
            {
                TermsEnum termsEnum = terms.GetEnumerator();
                TermsEnum.SeekStatus result = termsEnum.SeekCeil(prefixRef);
                if (result != TermsEnum.SeekStatus.END)
                {
                    Assert.IsFalse(StringHelper.StartsWith(termsEnum.Term, prefixRef), "term=" + termsEnum.Term.Utf8ToString() + " matches prefix=" + prefixRef.Utf8ToString());
                }
                else
                {
                    // ok
                }
            }
            else
            {
                // ok
            }
        }
        return;
    }

    if (Verbose)
    {
        Console.WriteLine("TEST: TERMS:");
        te.SeekExact(0);
        while (true)
        {
            Console.WriteLine(" ord=" + te.Ord + " term=" + te.Term.Utf8ToString());
            if (!te.MoveNext())
            {
                break;
            }
        }
    }

    SortedSetDocValues iter = dto.GetIterator(r);
    for (int docID = 0; docID < r.MaxDoc; docID++)
    {
        if (Verbose)
        {
            Console.WriteLine("TEST: docID=" + docID + " of " + r.MaxDoc + " (id=" + docIDToID.Get(docID) + ")");
        }
        iter.SetDocument(docID);
        int[] answers = idToOrds[docIDToID.Get(docID)];
        int upto = 0;
        long ord;
        while ((ord = iter.NextOrd()) != SortedSetDocValues.NO_MORE_ORDS)
        {
            te.SeekExact(ord);
            BytesRef expected = termsArray[answers[upto++]];
            if (Verbose)
            {
                Console.WriteLine(" exp=" + expected.Utf8ToString() + " actual=" + te.Term.Utf8ToString());
            }
            Assert.AreEqual(expected, te.Term, "expected=" + expected.Utf8ToString() + " actual=" + te.Term.Utf8ToString() + " ord=" + ord);
        }
        Assert.AreEqual(answers.Length, upto);
    }
}