/// <summary>
/// Builds an iterator over the terms of the outer dictionary's field and
/// computes the minimum document count from the dictionary's threshold.
/// </summary>
/// <param name="outerInstance">Dictionary supplying the reader, field name and threshold.</param>
internal HighFrequencyIterator(HighFrequencyDictionary outerInstance)
{
    this.outerInstance = outerInstance;

    // When the field has no terms, the enumerator is left null.
    Terms fieldTerms = MultiFields.GetTerms(outerInstance.reader, outerInstance.field);
    termsEnum = fieldTerms != null ? fieldTerms.Iterator(null) : null;

    // Threshold scaled by the reader's document count, truncated to an int.
    minNumDocs = (int)(outerInstance.thresh * (float)outerInstance.reader.NumDocs);
}
/// <summary>
/// Asserts that two <see cref="Fields"/> instances contain the same fields, the same
/// terms per field, and matching frequencies, positions and offsets for the first
/// document of every term. Document IDs themselves are not compared.
/// </summary>
/// <param name="d1">Reference fields; when <c>null</c>, <paramref name="d2"/> must be <c>null</c> or empty.</param>
/// <param name="d2">Fields that are checked against <paramref name="d1"/>.</param>
public static void VerifyEquals(Fields d1, Fields d2)
{
    if (d1 == null)
    {
        Assert.IsTrue(d2 == null || d2.Count == 0);
        return;
    }
    Assert.IsTrue(d2 != null);

    // Dispose the enumerator even when an assertion below throws.
    using (IEnumerator<string> fieldsEnum2 = d2.GetEnumerator())
    {
        foreach (string field1 in d1)
        {
            // Fail explicitly when d2 runs out of fields instead of reading a stale Current.
            Assert.IsTrue(fieldsEnum2.MoveNext(), "d2 has fewer fields than d1");
            string field2 = fieldsEnum2.Current;
            Assert.AreEqual(field1, field2);

            Terms terms1 = d1.GetTerms(field1);
            Assert.IsNotNull(terms1);
            TermsEnum termsEnum1 = terms1.GetEnumerator();

            Terms terms2 = d2.GetTerms(field2);
            Assert.IsNotNull(terms2);
            TermsEnum termsEnum2 = terms2.GetEnumerator();

            // Postings enums are handed back in as "reuse" candidates for every term of the field.
            DocsAndPositionsEnum dpEnum1 = null;
            DocsAndPositionsEnum dpEnum2 = null;
            DocsEnum dEnum1 = null;
            DocsEnum dEnum2 = null;

            while (termsEnum1.MoveNext())
            {
                BytesRef term1 = termsEnum1.Term;

                // Fail explicitly when d2 runs out of terms instead of reading a stale Term.
                Assert.IsTrue(termsEnum2.MoveNext(), "d2 has fewer terms than d1 for field " + field1);
                BytesRef term2 = termsEnum2.Term;

                Assert.AreEqual(term1, term2);
                Assert.AreEqual(termsEnum1.TotalTermFreq, termsEnum2.TotalTermFreq);

                dpEnum1 = termsEnum1.DocsAndPositions(null, dpEnum1);
                dpEnum2 = termsEnum2.DocsAndPositions(null, dpEnum2);
                if (dpEnum1 != null)
                {
                    Assert.IsNotNull(dpEnum2);

                    // docIDs are not supposed to be equal, so only the first side's ID is inspected.
                    int docID1 = dpEnum1.NextDoc();
                    dpEnum2.NextDoc();
                    Assert.IsTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);

                    int freq1 = dpEnum1.Freq;
                    int freq2 = dpEnum2.Freq;
                    Assert.AreEqual(freq1, freq2);

                    IOffsetAttribute offsetAtt1 = dpEnum1.Attributes.HasAttribute<IOffsetAttribute>()
                        ? dpEnum1.Attributes.GetAttribute<IOffsetAttribute>()
                        : null;
                    IOffsetAttribute offsetAtt2 = dpEnum2.Attributes.HasAttribute<IOffsetAttribute>()
                        ? dpEnum2.Attributes.GetAttribute<IOffsetAttribute>()
                        : null;

                    // Either both sides expose offsets or neither does.
                    if (offsetAtt1 != null)
                    {
                        Assert.IsNotNull(offsetAtt2);
                    }
                    else
                    {
                        Assert.IsNull(offsetAtt2);
                    }

                    for (int posUpto = 0; posUpto < freq1; posUpto++)
                    {
                        int pos1 = dpEnum1.NextPosition();
                        int pos2 = dpEnum2.NextPosition();
                        Assert.AreEqual(pos1, pos2);
                        if (offsetAtt1 != null)
                        {
                            Assert.AreEqual(offsetAtt1.StartOffset, offsetAtt2.StartOffset);
                            Assert.AreEqual(offsetAtt1.EndOffset, offsetAtt2.EndOffset);
                        }
                    }

                    // Each term is expected to occur in exactly one document on both sides.
                    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum1.NextDoc());
                    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum2.NextDoc());
                }
                else
                {
                    // No positions available: fall back to docs + freqs.
                    dEnum1 = TestUtil.Docs(Random, termsEnum1, null, dEnum1, DocsFlags.FREQS);
                    dEnum2 = TestUtil.Docs(Random, termsEnum2, null, dEnum2, DocsFlags.FREQS);
                    Assert.IsNotNull(dEnum1);
                    Assert.IsNotNull(dEnum2);

                    // docIDs are not supposed to be equal, so only the first side's ID is inspected.
                    int docID1 = dEnum1.NextDoc();
                    dEnum2.NextDoc();
                    Assert.IsTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);

                    int freq1 = dEnum1.Freq;
                    int freq2 = dEnum2.Freq;
                    Assert.AreEqual(freq1, freq2);

                    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dEnum1.NextDoc());
                    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dEnum2.NextDoc());
                }
            }

            // d2 must not have extra terms for this field.
            Assert.IsFalse(termsEnum2.MoveNext());
        }

        // d2 must not have extra fields.
        Assert.IsFalse(fieldsEnum2.MoveNext());
    }
}
/// <summary>
/// Walks both terms enums in lock step and asserts that they return the same terms
/// and term statistics. When <paramref name="deep"/> is false only this shallow
/// comparison is made; when it is true the docs and docs-and-positions enums of
/// every term are compared as well, with and without a random live-docs filter.
/// </summary>
/// <param name="leftTermsEnum">Enumerator that drives the iteration.</param>
/// <param name="rightTermsEnum">Enumerator expected to match <paramref name="leftTermsEnum"/>.</param>
/// <param name="deep">Whether to descend into the postings enums of each term.</param>
public virtual void AssertTermsEnum(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum, bool deep)
{
    Bits randomBits = new RandomBits(MAXDOC, Random().NextDouble(), Random());

    // Every check below is run first without live docs, then with the random filter.
    Bits[] liveDocsChoices = new Bits[] { null, randomBits };

    // Explicit flag sets exercised after the default-flag pass, in the original order:
    // payloads only, offsets only, positions only.
    int[] positionFlagSets = new int[]
    {
        DocsAndPositionsEnum.FLAG_PAYLOADS,
        DocsAndPositionsEnum.FLAG_OFFSETS,
        DocsEnum.FLAG_NONE
    };

    // The enums are passed back in as reuse candidates on every call.
    DocsAndPositionsEnum leftPositions = null;
    DocsAndPositionsEnum rightPositions = null;
    DocsEnum leftDocs = null;
    DocsEnum rightDocs = null;

    for (BytesRef term = leftTermsEnum.Next(); term != null; term = leftTermsEnum.Next())
    {
        Assert.AreEqual(term, rightTermsEnum.Next());
        AssertTermStats(leftTermsEnum, rightTermsEnum);

        if (!deep)
        {
            continue;
        }

        // with payloads + off (two-argument overload)
        foreach (Bits liveDocs in liveDocsChoices)
        {
            leftPositions = leftTermsEnum.DocsAndPositions(liveDocs, leftPositions);
            rightPositions = rightTermsEnum.DocsAndPositions(liveDocs, rightPositions);
            AssertDocsAndPositionsEnum(leftPositions, rightPositions);
        }
        foreach (Bits liveDocs in liveDocsChoices)
        {
            int docFreq = leftTermsEnum.DocFreq();
            leftPositions = leftTermsEnum.DocsAndPositions(liveDocs, leftPositions);
            rightPositions = rightTermsEnum.DocsAndPositions(liveDocs, rightPositions);
            AssertPositionsSkipping(docFreq, leftPositions, rightPositions);
        }

        // with payloads only, with offsets only, with positions only
        foreach (int flags in positionFlagSets)
        {
            foreach (Bits liveDocs in liveDocsChoices)
            {
                leftPositions = leftTermsEnum.DocsAndPositions(liveDocs, leftPositions, flags);
                rightPositions = rightTermsEnum.DocsAndPositions(liveDocs, rightPositions, flags);
                AssertDocsAndPositionsEnum(leftPositions, rightPositions);
            }
            foreach (Bits liveDocs in liveDocsChoices)
            {
                int docFreq = leftTermsEnum.DocFreq();
                leftPositions = leftTermsEnum.DocsAndPositions(liveDocs, leftPositions, flags);
                rightPositions = rightTermsEnum.DocsAndPositions(liveDocs, rightPositions, flags);
                AssertPositionsSkipping(docFreq, leftPositions, rightPositions);
            }
        }

        // docs enum, with freqs:
        foreach (Bits liveDocs in liveDocsChoices)
        {
            leftDocs = leftTermsEnum.Docs(liveDocs, leftDocs);
            rightDocs = rightTermsEnum.Docs(liveDocs, rightDocs);
            AssertDocsEnum(leftDocs, rightDocs);
        }

        // docs enum, w/o freqs:
        foreach (Bits liveDocs in liveDocsChoices)
        {
            leftDocs = leftTermsEnum.Docs(liveDocs, leftDocs, DocsEnum.FLAG_NONE);
            rightDocs = rightTermsEnum.Docs(liveDocs, rightDocs, DocsEnum.FLAG_NONE);
            AssertDocsEnum(leftDocs, rightDocs);
        }

        // docs skipping, with freqs:
        foreach (Bits liveDocs in liveDocsChoices)
        {
            int docFreq = leftTermsEnum.DocFreq();
            leftDocs = leftTermsEnum.Docs(liveDocs, leftDocs);
            rightDocs = rightTermsEnum.Docs(liveDocs, rightDocs);
            AssertDocsSkipping(docFreq, leftDocs, rightDocs);
        }

        // docs skipping, w/o freqs:
        foreach (Bits liveDocs in liveDocsChoices)
        {
            int docFreq = leftTermsEnum.DocFreq();
            leftDocs = leftTermsEnum.Docs(liveDocs, leftDocs, DocsEnum.FLAG_NONE);
            rightDocs = rightTermsEnum.Docs(liveDocs, rightDocs, DocsEnum.FLAG_NONE);
            AssertDocsSkipping(docFreq, leftDocs, rightDocs);
        }
    }

    // The right side must be exhausted at the same time as the left side.
    Assert.IsNull(rightTermsEnum.Next());
}
/// <summary>
/// Checks the postings returned by <paramref name="termsEnum"/> for one term against the
/// expected seed postings: doc IDs, and (depending on the index options and random
/// choices) freqs, positions, payloads and offsets. Iteration randomly mixes
/// <c>NextDoc</c> and <c>Advance</c> when the SKIPPING option is present.
/// </summary>
/// <param name="threadState">Holds the enums that may be passed back in for reuse.</param>
/// <param name="field">Name of the field the term belongs to.</param>
/// <param name="term">Term the enum is expected to be positioned on.</param>
/// <param name="termsEnum">Enum under test.</param>
/// <param name="maxTestOptions">Maximum options (docs/freqs/positions/offsets) to test.</param>
/// <param name="maxIndexOptions">Index options forwarded to <c>GetSeedPostings</c>.</param>
/// <param name="options">Set of test options (live docs, reuse, skipping, partial consume, payloads).</param>
/// <param name="alwaysTestMax">When true, every allowed check is performed and random shortcuts are disabled.</param>
private void VerifyEnum(ThreadState threadState, string field, BytesRef term, TermsEnum termsEnum, FieldInfo.IndexOptions maxTestOptions, FieldInfo.IndexOptions maxIndexOptions, ISet<Option> options, bool alwaysTestMax)
{
    if (VERBOSE)
    {
        Console.WriteLine(" verifyEnum: options=" + options + " maxTestOptions=" + maxTestOptions);
    }

    // Make sure TermsEnum really is positioned on the
    // expected term:
    Assert.AreEqual(term, termsEnum.Term());

    // 50% of the time pass liveDocs:
    bool useLiveDocs = options.Contains(Option.LIVE_DOCS) && Random().NextBoolean();
    Bits liveDocs;
    if (useLiveDocs)
    {
        liveDocs = GlobalLiveDocs;
        if (VERBOSE)
        {
            Console.WriteLine(" use liveDocs");
        }
    }
    else
    {
        liveDocs = null;
        if (VERBOSE)
        {
            Console.WriteLine(" no liveDocs");
        }
    }

    FieldInfo fieldInfo = CurrentFieldInfos.FieldInfo(field);

    // NOTE: can be empty list if we are using liveDocs:
    SeedPostings expected = GetSeedPostings(term.Utf8ToString(), Fields[field][term], useLiveDocs, maxIndexOptions);
    Assert.AreEqual(expected.DocFreq, termsEnum.DocFreq());

    // "allowX" means both the field's index options and maxTestOptions permit X;
    // "doCheckX" additionally applies a random choice unless alwaysTestMax is set.
    // NOTE(review): if Random() returns a System.Random, Next(3) is in [0, 2], so
    // "Next(3) <= 2" is always true -- confirm whether that is intended.
    bool allowFreqs = fieldInfo.FieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS && maxTestOptions.CompareTo(FieldInfo.IndexOptions.DOCS_AND_FREQS) >= 0;
    bool doCheckFreqs = allowFreqs && (alwaysTestMax || Random().Next(3) <= 2);

    bool allowPositions = fieldInfo.FieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS && maxTestOptions.CompareTo(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    bool doCheckPositions = allowPositions && (alwaysTestMax || Random().Next(3) <= 2);

    bool allowOffsets = fieldInfo.FieldIndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS && maxTestOptions.CompareTo(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    bool doCheckOffsets = allowOffsets && (alwaysTestMax || Random().Next(3) <= 2);

    bool doCheckPayloads = options.Contains(Option.PAYLOADS) && allowPositions && fieldInfo.HasPayloads() && (alwaysTestMax || Random().Next(3) <= 2);

    // prevDocsEnum is the reuse candidate handed to the terms enum (null means "no reuse").
    DocsEnum prevDocsEnum = null;

    DocsEnum docsEnum;
    DocsAndPositionsEnum docsAndPositionsEnum;

    if (!doCheckPositions)
    {
        if (allowPositions && Random().Next(10) == 7)
        {
            // 10% of the time, even though we will not check positions, pull a DocsAndPositions enum

            if (options.Contains(Option.REUSE_ENUMS) && Random().Next(10) < 9)
            {
                prevDocsEnum = threadState.ReuseDocsAndPositionsEnum;
            }

            int flags = 0;
            if (alwaysTestMax || Random().NextBoolean())
            {
                flags |= DocsAndPositionsEnum.FLAG_OFFSETS;
            }
            if (alwaysTestMax || Random().NextBoolean())
            {
                flags |= DocsAndPositionsEnum.FLAG_PAYLOADS;
            }

            if (VERBOSE)
            {
                Console.WriteLine(" get DocsAndPositionsEnum (but we won't check positions) flags=" + flags);
            }

            threadState.ReuseDocsAndPositionsEnum = termsEnum.DocsAndPositions(liveDocs, (DocsAndPositionsEnum)prevDocsEnum, flags);
            docsEnum = threadState.ReuseDocsAndPositionsEnum;
            docsAndPositionsEnum = threadState.ReuseDocsAndPositionsEnum;
        }
        else
        {
            if (VERBOSE)
            {
                Console.WriteLine(" get DocsEnum");
            }
            if (options.Contains(Option.REUSE_ENUMS) && Random().Next(10) < 9)
            {
                prevDocsEnum = threadState.ReuseDocsEnum;
            }
            threadState.ReuseDocsEnum = termsEnum.Docs(liveDocs, prevDocsEnum, doCheckFreqs ? DocsEnum.FLAG_FREQS : DocsEnum.FLAG_NONE);
            docsEnum = threadState.ReuseDocsEnum;
            docsAndPositionsEnum = null;
        }
    }
    else
    {
        if (options.Contains(Option.REUSE_ENUMS) && Random().Next(10) < 9)
        {
            prevDocsEnum = threadState.ReuseDocsAndPositionsEnum;
        }

        // Offsets/payloads are requested when they will be checked, and otherwise at random.
        int flags = 0;
        if (alwaysTestMax || doCheckOffsets || Random().Next(3) == 1)
        {
            flags |= DocsAndPositionsEnum.FLAG_OFFSETS;
        }
        if (alwaysTestMax || doCheckPayloads || Random().Next(3) == 1)
        {
            flags |= DocsAndPositionsEnum.FLAG_PAYLOADS;
        }

        if (VERBOSE)
        {
            Console.WriteLine(" get DocsAndPositionsEnum flags=" + flags);
        }

        threadState.ReuseDocsAndPositionsEnum = termsEnum.DocsAndPositions(liveDocs, (DocsAndPositionsEnum)prevDocsEnum, flags);
        docsEnum = threadState.ReuseDocsAndPositionsEnum;
        docsAndPositionsEnum = threadState.ReuseDocsAndPositionsEnum;
    }

    Assert.IsNotNull(docsEnum, "null DocsEnum");
    int initialDocID = docsEnum.DocID();
    Assert.AreEqual(-1, initialDocID, "inital docID should be -1" + docsEnum);

    if (VERBOSE)
    {
        if (prevDocsEnum == null)
        {
            Console.WriteLine(" got enum=" + docsEnum);
        }
        else if (prevDocsEnum == docsEnum)
        {
            Console.WriteLine(" got reuse enum=" + docsEnum);
        }
        else
        {
            Console.WriteLine(" got enum=" + docsEnum + " (reuse of " + prevDocsEnum + " failed)");
        }
    }

    // 10% of the time don't consume all docs:
    int stopAt;
    if (!alwaysTestMax && options.Contains(Option.PARTIAL_DOC_CONSUME) && expected.DocFreq > 1 && Random().Next(10) == 7)
    {
        stopAt = Random().Next(expected.DocFreq - 1);
        if (VERBOSE)
        {
            Console.WriteLine(" will not consume all docs (" + stopAt + " vs " + expected.DocFreq + ")");
        }
    }
    else
    {
        stopAt = expected.DocFreq;
        if (VERBOSE)
        {
            Console.WriteLine(" consume all docs");
        }
    }

    double skipChance = alwaysTestMax ? 0.5 : Random().NextDouble();
    int numSkips = expected.DocFreq < 3 ? 1 : TestUtil.NextInt(Random(), 1, Math.Min(20, expected.DocFreq / 3));
    // skipInc bounds how many known docs one skip may jump; skipDocInc bounds a random doc ID jump.
    int skipInc = expected.DocFreq / numSkips;
    int skipDocInc = MaxDoc / numSkips;

    // Sometimes do 100% skipping:
    bool doAllSkipping = options.Contains(Option.SKIPPING) && Random().Next(7) == 1;

    double freqAskChance = alwaysTestMax ? 1.0 : Random().NextDouble();
    double payloadCheckChance = alwaysTestMax ? 1.0 : Random().NextDouble();
    double offsetCheckChance = alwaysTestMax ? 1.0 : Random().NextDouble();

    if (VERBOSE)
    {
        if (options.Contains(Option.SKIPPING))
        {
            Console.WriteLine(" skipChance=" + skipChance + " numSkips=" + numSkips);
        }
        else
        {
            Console.WriteLine(" no skipping");
        }
        if (doCheckFreqs)
        {
            Console.WriteLine(" freqAskChance=" + freqAskChance);
        }
        if (doCheckPayloads)
        {
            Console.WriteLine(" payloadCheckChance=" + payloadCheckChance);
        }
        if (doCheckOffsets)
        {
            Console.WriteLine(" offsetCheckChance=" + offsetCheckChance);
        }
    }

    // Main loop: "expected" is advanced first, then the enum under test is asked to
    // make the same move and the resulting doc IDs are compared.
    while (expected.Upto <= stopAt)
    {
        if (expected.Upto == stopAt)
        {
            if (stopAt == expected.DocFreq)
            {
                Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsEnum.NextDoc(), "DocsEnum should have ended but didn't");

                // Common bug is to forget to set this.Doc=NO_MORE_DOCS in the enum!:
                Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsEnum.DocID(), "DocsEnum should have ended but didn't");
            }
            break;
        }

        if (options.Contains(Option.SKIPPING) && (doAllSkipping || Random().NextDouble() <= skipChance))
        {
            // targetDocID stays -1 when the target is taken from "expected" (a doc known to exist).
            int targetDocID = -1;
            if (expected.Upto < stopAt && Random().NextBoolean())
            {
                // Pick target we know exists:
                int skipCount = TestUtil.NextInt(Random(), 1, skipInc);
                for (int skip = 0; skip < skipCount; skip++)
                {
                    if (expected.NextDoc() == DocsEnum.NO_MORE_DOCS)
                    {
                        break;
                    }
                }
            }
            else
            {
                // Pick random target (might not exist):
                int skipDocIDs = TestUtil.NextInt(Random(), 1, skipDocInc);
                if (skipDocIDs > 0)
                {
                    targetDocID = expected.DocID() + skipDocIDs;
                    expected.Advance(targetDocID);
                }
            }

            if (expected.Upto >= stopAt)
            {
                int target = Random().NextBoolean() ? MaxDoc : DocsEnum.NO_MORE_DOCS;
                if (VERBOSE)
                {
                    Console.WriteLine(" now advance to end (target=" + target + ")");
                }
                Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsEnum.Advance(target), "DocsEnum should have ended but didn't");
                break;
            }
            else
            {
                if (VERBOSE)
                {
                    if (targetDocID != -1)
                    {
                        Console.WriteLine(" now advance to random target=" + targetDocID + " (" + expected.Upto + " of " + stopAt + ") current=" + docsEnum.DocID());
                    }
                    else
                    {
                        Console.WriteLine(" now advance to known-exists target=" + expected.DocID() + " (" + expected.Upto + " of " + stopAt + ") current=" + docsEnum.DocID());
                    }
                }
                int docID = docsEnum.Advance(targetDocID != -1 ? targetDocID : expected.DocID());
                Assert.AreEqual(expected.DocID(), docID, "docID is wrong");
            }
        }
        else
        {
            expected.NextDoc();
            if (VERBOSE)
            {
                Console.WriteLine(" now nextDoc to " + expected.DocID() + " (" + expected.Upto + " of " + stopAt + ")");
            }
            int docID = docsEnum.NextDoc();
            Assert.AreEqual(expected.DocID(), docID, "docID is wrong");
            if (docID == DocsEnum.NO_MORE_DOCS)
            {
                break;
            }
        }

        if (doCheckFreqs && Random().NextDouble() <= freqAskChance)
        {
            if (VERBOSE)
            {
                Console.WriteLine(" now freq()=" + expected.Freq());
            }
            int freq = docsEnum.Freq();
            Assert.AreEqual(expected.Freq(), freq, "freq is wrong");
        }

        if (doCheckPositions)
        {
            int freq = docsEnum.Freq();
            // Occasionally consume only a prefix of the positions for this doc.
            int numPosToConsume;
            if (!alwaysTestMax && options.Contains(Option.PARTIAL_POS_CONSUME) && Random().Next(5) == 1)
            {
                numPosToConsume = Random().Next(freq);
            }
            else
            {
                numPosToConsume = freq;
            }

            for (int i = 0; i < numPosToConsume; i++)
            {
                int pos = expected.NextPosition();
                if (VERBOSE)
                {
                    Console.WriteLine(" now nextPosition to " + pos);
                }
                Assert.AreEqual(pos, docsAndPositionsEnum.NextPosition(), "position is wrong");

                if (doCheckPayloads)
                {
                    BytesRef expectedPayload = expected.Payload;
                    if (Random().NextDouble() <= payloadCheckChance)
                    {
                        if (VERBOSE)
                        {
                            Console.WriteLine(" now check expectedPayload length=" + (expectedPayload == null ? 0 : expectedPayload.Length));
                        }
                        if (expectedPayload == null || expectedPayload.Length == 0)
                        {
                            Assert.IsNull(docsAndPositionsEnum.Payload, "should not have payload");
                        }
                        else
                        {
                            BytesRef payload = docsAndPositionsEnum.Payload;
                            Assert.IsNotNull(payload, "should have payload but doesn't");

                            Assert.AreEqual(expectedPayload.Length, payload.Length, "payload length is wrong");
                            for (int byteUpto = 0; byteUpto < expectedPayload.Length; byteUpto++)
                            {
                                Assert.AreEqual(expectedPayload.Bytes[expectedPayload.Offset + byteUpto], payload.Bytes[payload.Offset + byteUpto], "payload bytes are wrong");
                            }

                            // make a deep copy
                            payload = BytesRef.DeepCopyOf(payload);
                            Assert.AreEqual(payload, docsAndPositionsEnum.Payload, "2nd call to getPayload returns something different!");
                        }
                    }
                    else
                    {
                        if (VERBOSE)
                        {
                            Console.WriteLine(" skip check payload length=" + (expectedPayload == null ? 0 : expectedPayload.Length));
                        }
                    }
                }

                if (doCheckOffsets)
                {
                    if (Random().NextDouble() <= offsetCheckChance)
                    {
                        if (VERBOSE)
                        {
                            Console.WriteLine(" now check offsets: startOff=" + expected.StartOffset() + " endOffset=" + expected.EndOffset());
                        }
                        Assert.AreEqual(expected.StartOffset(), docsAndPositionsEnum.StartOffset(), "startOffset is wrong");
                        Assert.AreEqual(expected.EndOffset(), docsAndPositionsEnum.EndOffset(), "endOffset is wrong");
                    }
                    else
                    {
                        if (VERBOSE)
                        {
                            Console.WriteLine(" skip check offsets");
                        }
                    }
                }
                else if (fieldInfo.FieldIndexOptions < FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
                {
                    // The field was not indexed with offsets, so the enum must report -1 for both.
                    if (VERBOSE)
                    {
                        Console.WriteLine(" now check offsets are -1");
                    }
                    Assert.AreEqual(-1, docsAndPositionsEnum.StartOffset(), "startOffset isn't -1");
                    Assert.AreEqual(-1, docsAndPositionsEnum.EndOffset(), "endOffset isn't -1");
                }
            }
        }
    }
}
/// <summary>
/// Creates an <see cref="AssertingTermsEnum"/> around the given enumerator.
/// </summary>
/// <param name="in">Enumerator forwarded unchanged to the base-class constructor.</param>
public AssertingTermsEnum(TermsEnum @in)
    : base(@in)
{
}
internal long Ord_Renamed; // force "real" seek

/// <summary>
/// Creates the ord-wrapping terms enum for the outer <see cref="DocTermOrds"/> instance,
/// obtaining the underlying terms iterator for the outer instance's field from
/// <paramref name="reader"/>.
/// </summary>
/// <param name="outerInstance">Owning instance; its indexed terms array must already be set.</param>
/// <param name="reader">Reader whose fields supply the terms of the outer instance's field.</param>
public OrdWrappedTermsEnum(DocTermOrds outerInstance, AtomicReader reader)
{
    this.OuterInstance = outerInstance;

    // One-time instance-field initialization, guarded by a flag.
    if (!InstanceFieldsInitialized)
    {
        InitializeInstanceFields();
        InstanceFieldsInitialized = true;
    }

    Debug.Assert(outerInstance.IndexedTermsArray != null);

    var fieldTerms = reader.Fields.Terms(outerInstance.Field);
    TermsEnum = fieldTerms.Iterator(null);
}
/// <summary>
/// Returns a terms enum positioned at the start of this term vector's data.
/// </summary>
/// <param name="reuse">
/// Enumerator to reset and hand back when it is a <c>TVTermsEnum</c>; any other value
/// (including <c>null</c>) causes a new instance to be created.
/// </param>
/// <returns>The reset <c>TVTermsEnum</c>.</returns>
public override TermsEnum Iterator(TermsEnum reuse)
{
    // "as" yields null both for a null argument and for an enumerator of another type.
    TVTermsEnum termsEnum = reuse as TVTermsEnum;
    if (termsEnum == null)
    {
        termsEnum = new TVTermsEnum();
    }

    termsEnum.Reset(
        NumTerms,
        Flags,
        PrefixLengths,
        SuffixLengths,
        TermFreqs,
        PositionIndex,
        Positions,
        StartOffsets,
        Lengths,
        PayloadIndex,
        PayloadBytes,
        new ByteArrayDataInput((byte[])(Array)TermBytes.Bytes, TermBytes.Offset, TermBytes.Length));

    return termsEnum;
}
/// <summary>
/// Creates a segment result and, when there is at least one term position to merge,
/// positions <paramref name="tenum"/> on the first facet ordinal and records its term.
/// </summary>
/// <param name="counts">Per-ordinal counts; <c>counts[0]</c> is passed to the base class separately from the total.</param>
/// <param name="total">Total count; <c>counts[0]</c> is subtracted before it is passed to the base class.</param>
/// <param name="tenum">Terms enum used to resolve ordinals; must be non-null whenever a seek is performed.</param>
/// <param name="startFacetOrd">First facet ordinal, or -1 (in which case merging starts at position 1 and the seek uses ordinal 0).</param>
/// <param name="endFacetOrd">Last facet ordinal; <c>endFacetOrd + 1</c> is passed to the base class.</param>
/// <exception cref="System.IO.IOException"></exception>
internal SegmentResult(int[] counts, int total, TermsEnum tenum, int startFacetOrd, int endFacetOrd)
    : base(counts, total - counts[0], counts[0], endFacetOrd + 1)
{
    this.tenum = tenum;
    this.mergePos = startFacetOrd == -1 ? 1 : startFacetOrd + 1;
    if (mergePos < maxTermPos)
    {
        // The original text read "tenum != null.SeekExact(...)", which does not compile;
        // it is restored here as a null assertion followed by the seek.
        System.Diagnostics.Debug.Assert(tenum != null);
        tenum.SeekExact(startFacetOrd == -1 ? 0 : startFacetOrd);
        mergeTerm = tenum.Term();
    }
}
/// <summary>
/// Runs <c>NUM_TEST_ITER</c> iterations; each picks a random field and exercises its terms
/// enum: sequential enumeration, seek by term, seek by ord (when supported), seeks to
/// non-existent terms, backwards seeks, and randomized docs/positions iteration.
/// Iterations are skipped entirely when the segment's codec is <c>Lucene3xCodec</c>.
/// </summary>
public virtual void _run()
{
    for (int iter = 0; iter < NUM_TEST_ITER; iter++)
    {
        FieldData field = Fields[Random().Next(Fields.Length)];
        TermsEnum termsEnum = TermsDict.GetTerms(field.FieldInfo.Name).GetIterator(null);
#pragma warning disable 612, 618
        if (Si.Codec is Lucene3xCodec)
#pragma warning restore 612, 618
        {
            // code below expects unicode sort order
            continue;
        }

        int upto = 0;
        // Test straight enum of the terms:
        while (true)
        {
            BytesRef term = termsEnum.Next();
            if (term == null)
            {
                break;
            }
            BytesRef expected = new BytesRef(field.Terms[upto++].Text2);
            Assert.IsTrue(expected.BytesEquals(term), "expected=" + expected + " vs actual " + term);
        }
        Assert.AreEqual(upto, field.Terms.Length);

        // Test random seek:
        TermData term2 = field.Terms[Random().Next(field.Terms.Length)];
        TermsEnum.SeekStatus status = termsEnum.SeekCeil(new BytesRef(term2.Text2));
        Assert.AreEqual(status, TermsEnum.SeekStatus.FOUND);
        Assert.AreEqual(term2.Docs.Length, termsEnum.DocFreq);
        if (field.OmitTF)
        {
            this.VerifyDocs(term2.Docs, term2.Positions, TestUtil.Docs(Random(), termsEnum, null, null, DocsFlags.NONE), false);
        }
        else
        {
            this.VerifyDocs(term2.Docs, term2.Positions, termsEnum.DocsAndPositions(null, null), true);
        }

        // Test random seek by ord:
        int idx = Random().Next(field.Terms.Length);
        term2 = field.Terms[idx];
        bool success = false;
        try
        {
            termsEnum.SeekExact(idx);
            success = true;
        }
#pragma warning disable 168
        catch (System.NotSupportedException uoe)
#pragma warning restore 168
        {
            // ok -- skip it
        }
        if (success)
        {
            // NOTE(review): "status" still holds the result of the SeekCeil above, not of
            // the ord seek -- this assertion re-checks the earlier value.
            Assert.AreEqual(status, TermsEnum.SeekStatus.FOUND);
            Assert.IsTrue(termsEnum.Term.BytesEquals(new BytesRef(term2.Text2)));
            Assert.AreEqual(term2.Docs.Length, termsEnum.DocFreq);
            if (field.OmitTF)
            {
                this.VerifyDocs(term2.Docs, term2.Positions, TestUtil.Docs(Random(), termsEnum, null, null, DocsFlags.NONE), false);
            }
            else
            {
                this.VerifyDocs(term2.Docs, term2.Positions, termsEnum.DocsAndPositions(null, null), true);
            }
        }

        // Test seek to non-existent terms:
        if (VERBOSE)
        {
            Console.WriteLine("TEST: seek non-exist terms");
        }
        for (int i = 0; i < 100; i++)
        {
            string text2 = TestUtil.RandomUnicodeString(Random()) + ".";
            status = termsEnum.SeekCeil(new BytesRef(text2));
            Assert.IsTrue(status == TermsEnum.SeekStatus.NOT_FOUND || status == TermsEnum.SeekStatus.END);
        }

        // Seek to each term, backwards:
        if (VERBOSE)
        {
            Console.WriteLine("TEST: seek terms backwards");
        }
        for (int i = field.Terms.Length - 1; i >= 0; i--)
        {
            Assert.AreEqual(TermsEnum.SeekStatus.FOUND, termsEnum.SeekCeil(new BytesRef(field.Terms[i].Text2)), Thread.CurrentThread.Name + ": field=" + field.FieldInfo.Name + " term=" + field.Terms[i].Text2);
            Assert.AreEqual(field.Terms[i].Docs.Length, termsEnum.DocFreq);
        }

        // Seek to each term by ord, backwards
        for (int i = field.Terms.Length - 1; i >= 0; i--)
        {
            try
            {
                termsEnum.SeekExact(i);
                Assert.AreEqual(field.Terms[i].Docs.Length, termsEnum.DocFreq);
                Assert.IsTrue(termsEnum.Term.BytesEquals(new BytesRef(field.Terms[i].Text2)));
            }
#pragma warning disable 168
            catch (System.NotSupportedException uoe)
#pragma warning restore 168
            {
                // Seeking by ord is optional; an enum that does not support it is skipped.
            }
        }

        // Seek to non-existent empty-string term
        status = termsEnum.SeekCeil(new BytesRef(""));
        Assert.IsNotNull(status);
        //Assert.AreEqual(TermsEnum.SeekStatus.NOT_FOUND, status);

        // Make sure we're now pointing to first term
        Assert.IsTrue(termsEnum.Term.BytesEquals(new BytesRef(field.Terms[0].Text2)));

        // Test docs enum
        termsEnum.SeekCeil(new BytesRef(""));
        upto = 0;
        do
        {
            term2 = field.Terms[upto];
            // Only about one term in three gets its postings walked.
            if (Random().Next(3) == 1)
            {
                DocsEnum docs;
                DocsEnum docsAndFreqs;
                DocsAndPositionsEnum postings;
                if (!field.OmitTF)
                {
                    postings = termsEnum.DocsAndPositions(null, null);
                    if (postings != null)
                    {
                        docs = docsAndFreqs = postings;
                    }
                    else
                    {
                        docs = docsAndFreqs = TestUtil.Docs(Random(), termsEnum, null, null, DocsFlags.FREQS);
                    }
                }
                else
                {
                    postings = null;
                    docsAndFreqs = null;
                    docs = TestUtil.Docs(Random(), termsEnum, null, null, DocsFlags.NONE);
                }
                Assert.IsNotNull(docs);
                // upto2 is the index into term2.Docs of the doc the enum is currently on (-1 = before first).
                int upto2 = -1;
                bool ended = false;
                while (upto2 < term2.Docs.Length - 1)
                {
                    // Maybe skip:
                    int left = term2.Docs.Length - upto2;
                    int doc;
                    if (Random().Next(3) == 1 && left >= 1)
                    {
                        int inc = 1 + Random().Next(left - 1);
                        upto2 += inc;
                        if (Random().Next(2) == 1)
                        {
                            // Advance exactly onto a doc that is known to exist.
                            doc = docs.Advance(term2.Docs[upto2]);
                            Assert.AreEqual(term2.Docs[upto2], doc);
                        }
                        else
                        {
                            // Advance to one past a known doc.
                            doc = docs.Advance(1 + term2.Docs[upto2]);
                            if (doc == DocIdSetIterator.NO_MORE_DOCS)
                            {
                                // skipped past last doc
                                Debug.Assert(upto2 == term2.Docs.Length - 1);
                                ended = true;
                                break;
                            }
                            else
                            {
                                // skipped to next doc
                                Debug.Assert(upto2 < term2.Docs.Length - 1);
                                if (doc >= term2.Docs[1 + upto2])
                                {
                                    upto2++;
                                }
                            }
                        }
                    }
                    else
                    {
                        doc = docs.NextDoc();
                        Assert.IsTrue(doc != -1);
                        upto2++;
                    }
                    Assert.AreEqual(term2.Docs[upto2], doc);
                    if (!field.OmitTF)
                    {
                        Assert.AreEqual(term2.Positions[upto2].Length, postings.Freq);
                        if (Random().Next(2) == 1)
                        {
                            this.VerifyPositions(term2.Positions[upto2], postings);
                        }
                    }
                }

                if (!ended)
                {
                    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, docs.NextDoc());
                }
            }
            upto++;
        } while (termsEnum.Next() != null);

        Assert.AreEqual(upto, field.Terms.Length);
    }
}
/// <summary>
/// Writes a single field with <c>NUM_TERMS</c> terms (term <c>i</c> occurs only in document
/// <c>i</c>), reads it back through the default codec's postings format and verifies
/// term order, the single posting per term (twice, to stress enum reuse), and that
/// every term can be found again with <c>SeekCeil</c>.
/// </summary>
public virtual void TestFixedPostings()
{
    const int NUM_TERMS = 100;
    TermData[] terms = new TermData[NUM_TERMS];
    for (int i = 0; i < NUM_TERMS; i++)
    {
        int[] docs = new int[] { i };
        string text = Convert.ToString(i);
        terms[i] = new TermData(this, text, docs, null);
    }

    FieldInfos.Builder builder = new FieldInfos.Builder();

    FieldData field = new FieldData(this, "field", builder, terms, true, false);
    FieldData[] fields = new FieldData[] { field };
    FieldInfos fieldInfos = builder.Finish();

    // LUCENENET specific - BUG: we must wrap this in a using block in case anything in the below loop throws
    using (Directory dir = NewDirectory())
    {
        this.Write(fieldInfos, dir, fields, true);
        Codec codec = Codec.Default;
        SegmentInfo si = new SegmentInfo(dir, Constants.LUCENE_MAIN_VERSION, SEGMENT, 10000, false, codec, null);

        // LUCENENET specific - BUG: we must wrap this in a using block in case anything in the below loop throws
        using (FieldsProducer reader = codec.PostingsFormat.FieldsProducer(new SegmentReadState(dir, si, fieldInfos, NewIOContext(Random()), DirectoryReader.DEFAULT_TERMS_INDEX_DIVISOR)))
        // The field-name enumerator is disposable too; release it on every path.
        using (IEnumerator<string> fieldsEnum = reader.GetEnumerator())
        {
            // Fail explicitly when no field was written instead of reading an undefined Current.
            Assert.IsTrue(fieldsEnum.MoveNext(), "expected exactly one field but found none");
            string fieldName = fieldsEnum.Current;
            Assert.IsNotNull(fieldName);
            Terms terms2 = reader.GetTerms(fieldName);
            Assert.IsNotNull(terms2);

            TermsEnum termsEnum = terms2.GetIterator(null);

            DocsEnum docsEnum = null;
            for (int i = 0; i < NUM_TERMS; i++)
            {
                BytesRef term = termsEnum.Next();
                Assert.IsNotNull(term);
                Assert.AreEqual(terms[i].Text2, term.Utf8ToString());

                // do this twice to stress test the codec's reuse, ie,
                // make sure it properly fully resets (rewinds) its
                // internal state:
                for (int iter = 0; iter < 2; iter++)
                {
                    docsEnum = TestUtil.Docs(Random(), termsEnum, null, docsEnum, DocsFlags.NONE);
                    Assert.AreEqual(terms[i].Docs[0], docsEnum.NextDoc());
                    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, docsEnum.NextDoc());
                }
            }
            Assert.IsNull(termsEnum.Next());

            for (int i = 0; i < NUM_TERMS; i++)
            {
                // Expected value first so that a failure message reports the right sides.
                Assert.AreEqual(TermsEnum.SeekStatus.FOUND, termsEnum.SeekCeil(new BytesRef(terms[i].Text2)));
            }

            // Only one field was written.
            Assert.IsFalse(fieldsEnum.MoveNext());
        }
    }
}
/// <summary>
/// Builds a shuffled "bag" in which term <c>i</c> occurs <c>i</c> times, lets several
/// threads index it concurrently, force-merges to one segment and verifies that each
/// term's document frequency equals its numeric value.
/// </summary>
public virtual void Test()
{
    IList<string> postingsList = new List<string>();
    int numTerms = AtLeast(300);
    int maxTermsPerDoc = TestUtil.NextInt(Random(), 10, 20);

    bool isSimpleText = "SimpleText".Equals(TestUtil.GetPostingsFormat("field"));

    IndexWriterConfig iwc = NewIndexWriterConfig(Random(), TEST_VERSION_CURRENT, new MockAnalyzer(Random()));

    if ((isSimpleText || iwc.MergePolicy is MockRandomMergePolicy) && (TEST_NIGHTLY || RANDOM_MULTIPLIER > 1))
    {
        // Otherwise test can take way too long (> 2 hours)
        numTerms /= 2;
    }

    if (VERBOSE)
    {
        Console.WriteLine("maxTermsPerDoc=" + maxTermsPerDoc);
        Console.WriteLine("numTerms=" + numTerms);
    }

    // Term i is added i times, so term 0 never appears.
    for (int i = 0; i < numTerms; i++)
    {
        string term = Convert.ToString(i);
        for (int j = 0; j < i; j++)
        {
            postingsList.Add(term);
        }
    }
    postingsList = CollectionsHelper.Shuffle(postingsList);

    ConcurrentQueue<string> postings = new ConcurrentQueue<string>(postingsList);

    // The directory, writer and reader are wrapped in using blocks so they are released
    // even when an assertion fails; disposal order (reader, writer, directory) is the
    // same as the explicit Dispose calls this replaces.
    using (Directory dir = NewFSDirectory(CreateTempDir("bagofpostings")))
    using (RandomIndexWriter iw = new RandomIndexWriter(Random(), dir, iwc))
    {
        int threadCount = TestUtil.NextInt(Random(), 1, 5);
        if (VERBOSE)
        {
            Console.WriteLine("config: " + iw.w.Config);
            Console.WriteLine("threadCount=" + threadCount);
        }

        ThreadClass[] threads = new ThreadClass[threadCount];
        CountdownEvent startingGun = new CountdownEvent(1);

        for (int threadID = 0; threadID < threadCount; threadID++)
        {
            threads[threadID] = new ThreadAnonymousInnerClassHelper(this, maxTermsPerDoc, postings, iw, startingGun);
            threads[threadID].Start();
        }

        // Release all indexing threads at once, then wait for them to finish.
        startingGun.Signal();
        foreach (ThreadClass t in threads)
        {
            t.Join();
        }

        iw.ForceMerge(1);
        using (DirectoryReader ir = iw.Reader)
        {
            Assert.AreEqual(1, ir.Leaves.Count);
            AtomicReader air = (AtomicReader)ir.Leaves[0].Reader;
            Terms terms = air.Terms("field");
            // Fail with a clear message rather than a NullReferenceException below.
            Assert.IsNotNull(terms, "field has no terms");

            // numTerms-1 because there cannot be a term 0 with 0 postings:
            Assert.AreEqual(numTerms - 1, air.Fields.UniqueTermCount);
            if (iwc.Codec is Lucene3xCodec == false)
            {
                Assert.AreEqual(numTerms - 1, terms.Size());
            }

            TermsEnum termsEnum = terms.Iterator(null);
            BytesRef term_;
            while ((term_ = termsEnum.Next()) != null)
            {
                int value = Convert.ToInt32(term_.Utf8ToString());
                Assert.AreEqual(value, termsEnum.DocFreq());
                // don't really need to check more than this, as CheckIndex
                // will verify that docFreq == actual number of documents seen
                // from a docsAndPositionsEnum.
            }
        }
    }
}
/// <summary>
/// Creates a <see cref="TestTermsEnum"/> around the given enumerator.
/// </summary>
/// <param name="in">Enumerator forwarded unchanged to the base-class constructor.</param>
public TestTermsEnum(TermsEnum @in)
    : base(@in)
{
}
/// <summary>
/// Obtains the base implementation's iterator and wraps it in a <see cref="TestTermsEnum"/>.
/// </summary>
/// <param name="reuse">Reuse candidate passed through to the base implementation.</param>
/// <returns>A new <see cref="TestTermsEnum"/> wrapping the base iterator.</returns>
public override TermsEnum GetIterator(TermsEnum reuse)
{
    TermsEnum inner = base.GetIterator(reuse);
    return new TestTermsEnum(inner);
}
/// <summary>
/// Search-thread body: until <c>stopTimeMS</c> is reached, repeatedly acquires the current
/// searcher, checks segment diagnostics/warming, and runs term queries against roughly
/// 30 terms of the "body" field, accumulating hit counts into <c>totHits</c>.
/// </summary>
/// <remarks>
/// Any exception is logged, recorded in <c>outerInstance.failed</c> and rethrown wrapped in
/// an <see cref="Exception"/>.
/// </remarks>
public override void Run()
{
    if (VERBOSE)
    {
        Console.WriteLine(Thread.CurrentThread.Name + ": launch search thread");
    }
    while (Environment.TickCount < stopTimeMS)
    {
        try
        {
            IndexSearcher s = outerInstance.CurrentSearcher;
            try
            {
                // Verify 1) IW is correctly setting
                // diagnostics, and 2) segment warming for
                // merged segments is actually happening:
                foreach (AtomicReaderContext sub in s.IndexReader.Leaves)
                {
                    SegmentReader segReader = (SegmentReader)sub.Reader;
                    IDictionary<string, string> diagnostics = segReader.SegmentInfo.Info.Diagnostics;
                    assertNotNull(diagnostics);
                    string source;
                    diagnostics.TryGetValue("source", out source);
                    assertNotNull(source);
                    if (source.Equals("merge", StringComparison.Ordinal))
                    {
                        assertTrue("sub reader " + sub + " wasn't warmed: warmed=" + outerInstance.warmed + " diagnostics=" + diagnostics + " si=" + segReader.SegmentInfo, !outerInstance.assertMergedSegmentsWarmed || outerInstance.warmed.ContainsKey(segReader.core));
                    }
                }
                if (s.IndexReader.NumDocs > 0)
                {
                    outerInstance.SmokeTestSearcher(s);
                    Fields fields = MultiFields.GetFields(s.IndexReader);
                    if (fields == null)
                    {
                        continue;
                    }
                    Terms terms = fields.GetTerms("body");
                    if (terms == null)
                    {
                        continue;
                    }
                    TermsEnum termsEnum = terms.GetIterator(null);
                    int seenTermCount = 0;
                    int shift;
                    int trigger;

                    // Read the shared counter exactly once: another search thread may Set() it
                    // to a value below 30 between the comparison and the division, which would
                    // yield trigger == 0 and a DivideByZeroException in the modulo below.
                    int knownTermCount = totTermCount.Get();
                    if (knownTermCount < 30)
                    {
                        shift = 0;
                        trigger = 1;
                    }
                    else
                    {
                        trigger = knownTermCount / 30;
                        shift = Random().Next(trigger);
                    }

                    while (Environment.TickCount < stopTimeMS)
                    {
                        BytesRef term = termsEnum.Next();
                        if (term == null)
                        {
                            // Exhausted the enum: publish how many terms this searcher had.
                            totTermCount.Set(seenTermCount);
                            break;
                        }
                        seenTermCount++;
                        // search 30 terms
                        if ((seenTermCount + shift) % trigger == 0)
                        {
                            totHits.AddAndGet(outerInstance.RunQuery(s, new TermQuery(new Term("body", term))));
                        }
                    }
                }
            }
            finally
            {
                // Always hand the searcher back, including on the early 'continue' paths.
                outerInstance.ReleaseSearcher(s);
            }
        }
        catch (Exception t)
        {
            Console.WriteLine(Thread.CurrentThread.Name + ": hit exc");
            outerInstance.failed.Set(true);
            Console.WriteLine(t.ToString());
            throw new Exception(t.ToString(), t);
        }
    }
}
/// <summary>
/// Indexes documents supplied through <c>IterableAnonymousInnerClassHelper</c> and then verifies,
/// field by field, the stored values, term vectors and searchability expected for each
/// field name <c>"f" + counter</c>.
/// </summary>
public virtual void TestArbitraryFields()
{
    Directory dir = NewDirectory();
    RandomIndexWriter w = new RandomIndexWriter(Random(), dir, Similarity, TimeZone);

    int numDocs = AtLeast(27);
    if (VERBOSE)
    {
        Console.WriteLine("TEST: " + numDocs + " docs");
    }

    int[] fieldsPerDoc = new int[numDocs];
    int baseCount = 0;

    for (int docCount = 0; docCount < numDocs; docCount++)
    {
        int fieldCount = TestUtil.NextInt(Random(), 1, 17);
        fieldsPerDoc[docCount] = fieldCount - 1;

        if (VERBOSE)
        {
            Console.WriteLine("TEST: " + fieldCount + " fields in doc " + docCount);
        }

        // The helper receives the running field counter as it was before this document.
        int finalDocCount = docCount;
        int finalBaseCount = baseCount;
        baseCount += fieldCount - 1;

        w.AddDocument(new IterableAnonymousInnerClassHelper(this, fieldCount, finalDocCount, finalBaseCount));
    }

    IndexReader r = w.Reader;
    w.Dispose();

    IndexSearcher s = NewSearcher(r);
    int counter = 0;
    for (int id = 0; id < numDocs; id++)
    {
        if (VERBOSE)
        {
            Console.WriteLine("TEST: verify doc id=" + id + " (" + fieldsPerDoc[id] + " fields) counter=" + counter);
        }

        TopDocs hits = s.Search(new TermQuery(new Term("id", "" + id)), 1);
        Assert.AreEqual(1, hits.TotalHits);
        int docID = hits.ScoreDocs[0].Doc;
        Document doc = s.Doc(docID);

        for (int endCounter = counter + fieldsPerDoc[id]; counter < endCounter; counter++)
        {
            string name = "f" + counter;
            int fieldID = counter % 10;

            bool binary = fieldID == 3;
            bool indexed = !binary;
            bool stored = (counter & 1) == 0 || binary;

            string stringValue = (fieldID != 3 && fieldID != 9) ? "text " + counter : null;

            // stored:
            if (stored)
            {
                IIndexableField f = doc.GetField(name);
                Assert.IsNotNull(f, "doc " + id + " doesn't have field f" + counter);
                if (binary)
                {
                    Assert.IsNotNull(f, "doc " + id + " doesn't have field f" + counter);
                    BytesRef b = f.GetBinaryValue();
                    Assert.IsNotNull(b);
                    Assert.AreEqual(10, b.Length);
                    for (int idx = 0; idx < 10; idx++)
                    {
                        Assert.AreEqual((byte)(idx + counter), b.Bytes[b.Offset + idx]);
                    }
                }
                else
                {
                    Debug.Assert(stringValue != null);
                    Assert.AreEqual(stringValue, f.GetStringValue());
                }
            }

            if (!indexed)
            {
                continue;
            }

            bool tv = counter % 2 == 1 && fieldID != 9;
            if (tv)
            {
                Terms tfv = r.GetTermVectors(docID).GetTerms(name);
                Assert.IsNotNull(tfv);
                TermsEnum termsEnum = tfv.GetIterator(null);

                Assert.AreEqual(new BytesRef("" + counter), termsEnum.Next());
                Assert.AreEqual(1, termsEnum.TotalTermFreq);
                DocsAndPositionsEnum dpEnum = termsEnum.DocsAndPositions(null, null);
                Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                Assert.AreEqual(1, dpEnum.Freq);
                Assert.AreEqual(1, dpEnum.NextPosition());

                Assert.AreEqual(new BytesRef("text"), termsEnum.Next());
                Assert.AreEqual(1, termsEnum.TotalTermFreq);
                dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
                Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                Assert.AreEqual(1, dpEnum.Freq);
                Assert.AreEqual(0, dpEnum.NextPosition());

                Assert.IsNull(termsEnum.Next());

                // TODO: offsets
            }
            else
            {
                Fields vectors = r.GetTermVectors(docID);
                Assert.IsTrue(vectors == null || vectors.GetTerms(name) == null);
            }

            // The document must be findable through both tokens of the field.
            BooleanQuery byText = new BooleanQuery();
            byText.Add(new TermQuery(new Term("id", "" + id)), Occur.MUST);
            byText.Add(new TermQuery(new Term(name, "text")), Occur.MUST);
            TopDocs hits2 = s.Search(byText, 1);
            Assert.AreEqual(1, hits2.TotalHits);
            Assert.AreEqual(docID, hits2.ScoreDocs[0].Doc);

            BooleanQuery byCounter = new BooleanQuery();
            byCounter.Add(new TermQuery(new Term("id", "" + id)), Occur.MUST);
            byCounter.Add(new TermQuery(new Term(name, "" + counter)), Occur.MUST);
            TopDocs hits3 = s.Search(byCounter, 1);
            Assert.AreEqual(1, hits3.TotalHits);
            Assert.AreEqual(docID, hits3.ScoreDocs[0].Doc);
        }
    }

    r.Dispose();
    dir.Dispose();
}
/// <summary>
/// Returns the enum produced by <c>DocTermOrds.TermsEnum()</c>; <paramref name="reuse"/> is ignored.
/// </summary>
public override TermsEnum Iterator(TermsEnum reuse) => DocTermOrds.TermsEnum();
/// <summary>
/// Creates a filtered <see cref="TermsEnum"/> on a terms enum.
/// Delegates to the two-argument constructor, passing <c>true</c> as its second argument.
/// </summary>
/// <param name="tenum"> the terms enumeration to filter. </param>
public FilteredTermsEnum(TermsEnum tenum) : this(tenum, true) { }
/// <summary>
/// Merges <c>Reader1</c> and <c>Reader2</c> into <c>MergedDir</c> with a <c>SegmentMerger</c>, reopens the
/// merged segment and checks document count, stored fields, postings, term-vector field
/// count, term-vector contents and norms.
/// </summary>
public virtual void TestMerge()
{
    Codec codec = Codec.Default;

    var mergeTargetInfo = new SegmentInfo(MergedDir, Constants.LUCENE_MAIN_VERSION, MergedSegment, -1, false, codec, null);
    var sources = new List<AtomicReader> { Reader1, Reader2 };

    SegmentMerger merger = new SegmentMerger(sources, mergeTargetInfo, InfoStream.Default, MergedDir, IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL, CheckAbort.NONE, new FieldInfos.FieldNumbers(), NewIOContext(Random), true);
    MergeState mergeState = merger.Merge();
    int docsMerged = mergeState.SegmentInfo.DocCount;
    Assert.IsTrue(docsMerged == 2);

    //Should be able to open a new SegmentReader against the new directory
    var reopenedInfo = new SegmentInfo(MergedDir, Constants.LUCENE_MAIN_VERSION, MergedSegment, docsMerged, false, codec, null);
    var commitInfo = new SegmentCommitInfo(reopenedInfo, 0, -1L, -1L);
    SegmentReader mergedReader = new SegmentReader(commitInfo, DirectoryReader.DEFAULT_TERMS_INDEX_DIVISOR, NewIOContext(Random));
    Assert.IsTrue(mergedReader != null);
    Assert.IsTrue(mergedReader.NumDocs == 2);

    Document newDoc1 = mergedReader.Document(0);
    Assert.IsTrue(newDoc1 != null);
    //There are 2 unstored fields on the document
    Assert.IsTrue(DocHelper.NumFields(newDoc1) == DocHelper.NumFields(Doc1) - DocHelper.Unstored.Count);

    Document newDoc2 = mergedReader.Document(1);
    Assert.IsTrue(newDoc2 != null);
    Assert.IsTrue(DocHelper.NumFields(newDoc2) == DocHelper.NumFields(Doc2) - DocHelper.Unstored.Count);

    DocsEnum termDocs = TestUtil.Docs(Random, mergedReader, DocHelper.TEXT_FIELD_2_KEY, new BytesRef("field"), MultiFields.GetLiveDocs(mergedReader), null, 0);
    Assert.IsTrue(termDocs != null);
    Assert.IsTrue(termDocs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);

    // Count the fields that carry term vectors.
    int tvCount = 0;
    foreach (FieldInfo fieldInfo in mergedReader.FieldInfos)
    {
        tvCount += fieldInfo.HasVectors ? 1 : 0;
    }
    Assert.AreEqual(3, tvCount, "We do not have 3 fields that were indexed with term vector");

    Terms vector = mergedReader.GetTermVectors(0).GetTerms(DocHelper.TEXT_FIELD_2_KEY);
    Assert.IsNotNull(vector);
    Assert.AreEqual(3, vector.Count);

    TermsEnum termsEnum = vector.GetIterator(null);
    for (int i = 0; termsEnum.Next() != null; i++)
    {
        string term = termsEnum.Term.Utf8ToString();
        int freq = (int)termsEnum.TotalTermFreq;
        Assert.IsTrue(DocHelper.FIELD_2_TEXT.IndexOf(term, StringComparison.Ordinal) != -1);
        Assert.IsTrue(DocHelper.FIELD_2_FREQS[i] == freq);
    }

    TestSegmentReader.CheckNorms(mergedReader);
    mergedReader.Dispose();
}
/// <summary>
/// Creates a segment result. The base constructor receives the counts, the total minus the
/// missing-bucket count, the missing-bucket count itself, and an end position that is
/// <paramref name="missingCountIndex"/> when <paramref name="endFacetOrd"/> equals
/// <c>missingCountIndex + 1</c> and <paramref name="endFacetOrd"/> otherwise.
/// When <paramref name="tenum"/> is non-null it is positioned on
/// <paramref name="startFacetOrd"/> and its term is recorded as the merge term.
/// </summary>
/// <exception cref="System.IO.IOException"></exception>
internal SegmentResult(int[] counts, int total, int missingCountIndex, TermsEnum tenum, int startFacetOrd, int endFacetOrd)
    : base(counts,
           total - counts[missingCountIndex],
           counts[missingCountIndex],
           endFacetOrd == missingCountIndex + 1 ? missingCountIndex : endFacetOrd)
{
    this.tenum = tenum;
    mergePos = startFacetOrd;

    if (tenum == null)
    {
        return;
    }

    tenum.SeekExact(mergePos);
    mergeTerm = tenum.Term();
}
/// <summary>
/// For three iterations: indexes documents whose "field" value is "a" (roughly one in four)
/// or "b", force-merges, maps ids back to doc ids and then repeatedly seeks to each term and
/// hands the resulting <c>DocsEnum</c> to <c>TestOne</c> together with the expected doc ids.
/// </summary>
public virtual void TestStressAdvance_Mem()
{
    for (int iter = 0; iter < 3; iter++)
    {
        if (VERBOSE)
        {
            Console.WriteLine("\nTEST: iter=" + iter);
        }

        Directory dir = NewDirectory();
        RandomIndexWriter w = new RandomIndexWriter(Random(), dir, Similarity, TimeZone);
        HashSet<int> aDocs = new HashSet<int>();

        // One document instance is reused; only the two field values change per add.
        Documents.Document doc = new Documents.Document();
        Field f = NewStringField("field", "", Field.Store.NO);
        doc.Add(f);
        Field idField = NewStringField("id", "", Field.Store.YES);
        doc.Add(idField);

        int num = AtLeast(4097);
        if (VERBOSE)
        {
            Console.WriteLine("\nTEST: numDocs=" + num);
        }

        for (int id = 0; id < num; id++)
        {
            bool isA = Random().Next(4) == 3;
            f.SetStringValue(isA ? "a" : "b");
            if (isA)
            {
                aDocs.Add(id);
            }
            idField.SetStringValue("" + id);
            w.AddDocument(doc);
            if (VERBOSE)
            {
                Console.WriteLine("\nTEST: doc upto " + id);
            }
        }

        w.ForceMerge(1);

        IList<int> aDocIDs = new List<int>();
        IList<int> bDocIDs = new List<int>();

        DirectoryReader r = w.Reader;
        int[] idToDocID = new int[r.MaxDoc];
        for (int docID = 0; docID < idToDocID.Length; docID++)
        {
            int id = Convert.ToInt32(r.Document(docID).Get("id"));
            IList<int> bucket = aDocs.Contains(id) ? aDocIDs : bDocIDs;
            bucket.Add(docID);
        }

        TermsEnum te = GetOnlySegmentReader(r).Fields.GetTerms("field").GetIterator(null);

        DocsEnum de = null;
        for (int iter2 = 0; iter2 < 10; iter2++)
        {
            if (VERBOSE)
            {
                Console.WriteLine("\nTEST: iter=" + iter + " iter2=" + iter2);
            }

            Assert.AreEqual(TermsEnum.SeekStatus.FOUND, te.SeekCeil(new BytesRef("a")));
            de = TestUtil.Docs(Random(), te, null, de, DocsFlags.NONE);
            TestOne(de, aDocIDs);

            Assert.AreEqual(TermsEnum.SeekStatus.FOUND, te.SeekCeil(new BytesRef("b")));
            de = TestUtil.Docs(Random(), te, null, de, DocsFlags.NONE);
            TestOne(de, bDocIDs);
        }

        w.Dispose();
        r.Dispose();
        dir.Dispose();
    }
}
// sugar: wraps the string in a BytesRef and forwards to TermsEnum.SeekExact
private bool SeekExact(TermsEnum te, string term)
{
    BytesRef target = new BytesRef(term);
    return te.SeekExact(target);
}
/// <summary>
/// Reads the term vectors of document 0 through the default codec's vectors reader and checks,
/// for the first test field, terms, frequencies, positions and offsets; for the second test
/// field it checks that terms are present, a docs enum is available and a
/// docs-and-positions enum is not.
/// </summary>
public virtual void TestPositionReader()
{
    TermVectorsReader reader = Codec.Default.TermVectorsFormat.VectorsReader(Dir, Seg.Info, FieldInfos, NewIOContext(Random));
    //BytesRef[] terms; // LUCENENET NOTE: Not used in Lucene
    Terms vector = reader.Get(0).GetTerms(TestFields[0]);
    Assert.IsNotNull(vector);
    Assert.AreEqual(TestTerms.Length, vector.Count);
    TermsEnum termsEnum = vector.GetIterator(null);
    DocsAndPositionsEnum dpEnum = null;
    for (int i = 0; i < TestTerms.Length; i++)
    {
        BytesRef text = termsEnum.Next();
        Assert.IsNotNull(text);
        string term = text.Utf8ToString();
        Assert.AreEqual(TestTerms[i], term);

        // First pass over this term: positions only.
        dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
        Assert.IsNotNull(dpEnum);
        int doc = dpEnum.DocID;
        Assert.AreEqual(-1, doc);
        Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
        Assert.AreEqual(Positions[i].Length, dpEnum.Freq);
        for (int j = 0; j < Positions[i].Length; j++)
        {
            Assert.AreEqual(Positions[i][j], dpEnum.NextPosition());
        }
        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());

        // Second pass over the same term: positions and offsets.
        dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
        // Check for null before the enum is dereferenced, otherwise the assertion can never fire.
        Assert.IsNotNull(dpEnum);
        doc = dpEnum.DocID;
        Assert.AreEqual(-1, doc);
        Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
        Assert.AreEqual(Positions[i].Length, dpEnum.Freq);
        for (int j = 0; j < Positions[i].Length; j++)
        {
            Assert.AreEqual(Positions[i][j], dpEnum.NextPosition());
            Assert.AreEqual(j * 10, dpEnum.StartOffset);
            Assert.AreEqual(j * 10 + TestTerms[i].Length, dpEnum.EndOffset);
        }
        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
    }

    Terms freqVector = reader.Get(0).GetTerms(TestFields[1]); //no pos, no offset
    Assert.IsNotNull(freqVector);
    Assert.AreEqual(TestTerms.Length, freqVector.Count);
    termsEnum = freqVector.GetIterator(null);
    Assert.IsNotNull(termsEnum);
    for (int i = 0; i < TestTerms.Length; i++)
    {
        BytesRef text = termsEnum.Next();
        Assert.IsNotNull(text);
        string term = text.Utf8ToString();
        Assert.AreEqual(TestTerms[i], term);
        Assert.IsNotNull(termsEnum.Docs(null, null));
        Assert.IsNull(termsEnum.DocsAndPositions(null, null)); // no pos
    }
    reader.Dispose();
}
/// <summary>
/// Stores the owning instance and the reader, then initialises <c>Te</c> from <c>TermsEnum()</c>.
/// </summary>
internal Iterator(DocTermOrds outerInstance, AtomicReader reader)
{
    OuterInstance = outerInstance;
    Reader = reader;
    // Must run after Reader is assigned, as in the original statement order.
    Te = TermsEnum();
}
/// <summary>
/// Captures the enclosing value source, the binary doc values and the terms enum, and
/// allocates the scratch <c>BytesRef</c> held in <c>@ref</c>.
/// </summary>
public IntDocValuesAnonymousInnerClassHelper(JoinDocFreqValueSource outerInstance, JoinDocFreqValueSource @this, BinaryDocValues terms, TermsEnum termsEnum)
    : base(@this)
{
    this.@ref = new BytesRef();
    this.termsEnum = termsEnum;
    this.terms = terms;
    this.outerInstance = outerInstance;
}
/// <summary>
/// Indexes random token streams over the terms "a".."d" with random position increments and
/// offsets, remembering every token per term and per document, and then verifies freqs,
/// positions and offsets read back from each segment against what was recorded.
/// </summary>
public virtual void TestRandom()
{
    // token -> docID -> tokens
    IDictionary<string, IDictionary<int?, IList<Token>>> actualTokens = new Dictionary<string, IDictionary<int?, IList<Token>>>();

    Directory dir = NewDirectory();
    RandomIndexWriter w = new RandomIndexWriter(Random, dir, iwc);

    int numDocs = AtLeast(20);

    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);

    // TODO: randomize what IndexOptions we use; also test
    // changing this up in one IW buffered segment...:
    ft.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
    if (Random.NextBoolean())
    {
        ft.StoreTermVectors = true;
        ft.StoreTermVectorOffsets = Random.NextBoolean();
        ft.StoreTermVectorPositions = Random.NextBoolean();
    }

    for (int docCount = 0; docCount < numDocs; docCount++)
    {
        Document doc = new Document();
        doc.Add(new Int32Field("id", docCount, Field.Store.NO));
        IList<Token> tokens = new List<Token>();
        int numTokens = AtLeast(100);

        int pos = -1;
        int offset = 0;
        for (int tokenCount = 0; tokenCount < numTokens; tokenCount++)
        {
            // Each later letter is half as likely as the previous one.
            string text = Random.NextBoolean() ? "a"
                : Random.NextBoolean() ? "b"
                : Random.NextBoolean() ? "c"
                : "d";

            int posIncr = Random.NextBoolean() ? 1 : Random.Next(5);
            if (tokenCount == 0 && posIncr == 0)
            {
                posIncr = 1;
            }
            int offIncr = Random.NextBoolean() ? 0 : Random.Next(5);
            int tokenOffset = Random.Next(5);

            int startOffset = offset + offIncr;
            Token token = MakeToken(text, posIncr, startOffset, startOffset + tokenOffset);

            if (!actualTokens.TryGetValue(text, out IDictionary<int?, IList<Token>> postingsByDoc))
            {
                postingsByDoc = new Dictionary<int?, IList<Token>>();
                actualTokens[text] = postingsByDoc;
            }
            if (!postingsByDoc.TryGetValue(docCount, out IList<Token> postings))
            {
                postings = new List<Token>();
                postingsByDoc[docCount] = postings;
            }
            postings.Add(token);
            tokens.Add(token);

            pos += posIncr;
            // stuff abs position into type:
            token.Type = "" + pos;
            offset += offIncr + tokenOffset;
        }
        doc.Add(new Field("content", new CannedTokenStream(tokens.ToArray()), ft));
        w.AddDocument(doc);
    }

    DirectoryReader r = w.GetReader();
    w.Dispose();

    string[] terms = new string[] { "a", "b", "c", "d" };
    foreach (AtomicReaderContext ctx in r.Leaves)
    {
        // TODO: improve this
        AtomicReader sub = (AtomicReader)ctx.Reader;

        TermsEnum termsEnum = sub.Fields.GetTerms("content").GetIterator(null);
        DocsEnum docs = null;
        DocsAndPositionsEnum docsAndPositions = null;
        DocsAndPositionsEnum docsAndPositionsAndOffsets = null;
        FieldCache.Int32s docIDToID = FieldCache.DEFAULT.GetInt32s(sub, "id", false);

        foreach (string term in terms)
        {
            if (!termsEnum.SeekExact(new BytesRef(term)))
            {
                continue;
            }

            // doc/freq
            docs = termsEnum.Docs(null, docs);
            Assert.IsNotNull(docs);
            int doc;
            while ((doc = docs.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
            {
                IList<Token> expected = actualTokens[term][docIDToID.Get(doc)];
                Assert.IsNotNull(expected);
                Assert.AreEqual(expected.Count, docs.Freq);
            }

            // doc/freq/pos -- explicitly exclude offsets here
            docsAndPositions = termsEnum.DocsAndPositions(null, docsAndPositions, DocsAndPositionsFlags.PAYLOADS);
            Assert.IsNotNull(docsAndPositions);
            while ((doc = docsAndPositions.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
            {
                IList<Token> expected = actualTokens[term][docIDToID.Get(doc)];
                Assert.IsNotNull(expected);
                Assert.AreEqual(expected.Count, docsAndPositions.Freq);
                foreach (Token token in expected)
                {
                    int pos = Convert.ToInt32(token.Type);
                    Assert.AreEqual(pos, docsAndPositions.NextPosition());
                }
            }

            // doc/freq/pos/offs
            docsAndPositionsAndOffsets = termsEnum.DocsAndPositions(null, docsAndPositions);
            Assert.IsNotNull(docsAndPositionsAndOffsets);
            while ((doc = docsAndPositionsAndOffsets.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
            {
                IList<Token> expected = actualTokens[term][docIDToID.Get(doc)];
                Assert.IsNotNull(expected);
                Assert.AreEqual(expected.Count, docsAndPositionsAndOffsets.Freq);
                foreach (Token token in expected)
                {
                    int pos = Convert.ToInt32(token.Type);
                    Assert.AreEqual(pos, docsAndPositionsAndOffsets.NextPosition());
                    Assert.AreEqual(token.StartOffset, docsAndPositionsAndOffsets.StartOffset);
                    Assert.AreEqual(token.EndOffset, docsAndPositionsAndOffsets.EndOffset);
                }
            }
        }
        // TODO: test advance:
    }
    r.Dispose();
    dir.Dispose();
}
/// <summary>
/// Forwards to the wrapped instance's <c>Iterator</c>, passing <paramref name="reuse"/> through.
/// </summary>
public override TermsEnum Iterator(TermsEnum reuse) => @in.Iterator(reuse);
// LUCENENET specific - Refactored to require both overloads, so we don't have a strange null parameter unless needed
public abstract TermsEnum GetEnumerator();

/// <summary>
/// Returns an iterator that will step through all
/// terms. This method will not return <c>null</c>.
/// </summary>
/// <remarks>
/// The default implementation ignores <paramref name="reuse"/> and calls the
/// parameterless <see cref="GetEnumerator()"/>.
/// </remarks>
/// <param name="reuse">If you have a previous <see cref="TermsEnum"/>,
/// for example from a different field, you can pass it for possible
/// reuse if the implementation can do so.</param>
// LUCENENET specific - Refactored to require both overloads, so we don't have a strange null parameter unless needed
public virtual TermsEnum GetEnumerator(TermsEnum reuse)
{
    return GetEnumerator();
}
/// <summary>
/// Walks both terms enums in lock step and asserts that they yield the same terms and term
/// statistics. When <paramref name="deep"/> is <c>true</c> the docs and docs-and-positions
/// enums of every term are compared too, with and without a random live-docs filter;
/// when it is <c>false</c> only the shallow term-level checks run.
/// </summary>
public void AssertTermsEnumEquals(string info, IndexReader leftReader, TermsEnum leftTermsEnum, TermsEnum rightTermsEnum, bool deep)
{
    Bits randomBits = new RandomBits(leftReader.MaxDoc, Random().NextDouble(), Random());

    DocsAndPositionsEnum leftPositions = null;
    DocsAndPositionsEnum rightPositions = null;
    DocsEnum leftDocs = null;
    DocsEnum rightDocs = null;

    for (BytesRef term = leftTermsEnum.Next(); term != null; term = leftTermsEnum.Next())
    {
        Assert.AreEqual(term, rightTermsEnum.Next(), info);
        AssertTermStatsEquals(info, leftTermsEnum, rightTermsEnum);

        if (!deep)
        {
            continue;
        }

        // positions, unfiltered then filtered
        AssertDocsAndPositionsEnumEquals(info,
            leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions),
            rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions));
        AssertDocsAndPositionsEnumEquals(info,
            leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions),
            rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions));

        // positions with skipping, unfiltered then filtered
        AssertPositionsSkippingEquals(info, leftReader, leftTermsEnum.DocFreq(),
            leftPositions = leftTermsEnum.DocsAndPositions(null, leftPositions),
            rightPositions = rightTermsEnum.DocsAndPositions(null, rightPositions));
        AssertPositionsSkippingEquals(info, leftReader, leftTermsEnum.DocFreq(),
            leftPositions = leftTermsEnum.DocsAndPositions(randomBits, leftPositions),
            rightPositions = rightTermsEnum.DocsAndPositions(randomBits, rightPositions));

        // with freqs:
        AssertDocsEnumEquals(info,
            leftDocs = leftTermsEnum.Docs(null, leftDocs),
            rightDocs = rightTermsEnum.Docs(null, rightDocs),
            true);
        AssertDocsEnumEquals(info,
            leftDocs = leftTermsEnum.Docs(randomBits, leftDocs),
            rightDocs = rightTermsEnum.Docs(randomBits, rightDocs),
            true);

        // w/o freqs:
        AssertDocsEnumEquals(info,
            leftDocs = leftTermsEnum.Docs(null, leftDocs, DocsEnum.FLAG_NONE),
            rightDocs = rightTermsEnum.Docs(null, rightDocs, DocsEnum.FLAG_NONE),
            false);
        AssertDocsEnumEquals(info,
            leftDocs = leftTermsEnum.Docs(randomBits, leftDocs, DocsEnum.FLAG_NONE),
            rightDocs = rightTermsEnum.Docs(randomBits, rightDocs, DocsEnum.FLAG_NONE),
            false);

        // with freqs:
        AssertDocsSkippingEquals(info, leftReader, leftTermsEnum.DocFreq(),
            leftDocs = leftTermsEnum.Docs(null, leftDocs),
            rightDocs = rightTermsEnum.Docs(null, rightDocs),
            true);
        AssertDocsSkippingEquals(info, leftReader, leftTermsEnum.DocFreq(),
            leftDocs = leftTermsEnum.Docs(randomBits, leftDocs),
            rightDocs = rightTermsEnum.Docs(randomBits, rightDocs),
            true);

        // w/o freqs:
        AssertDocsSkippingEquals(info, leftReader, leftTermsEnum.DocFreq(),
            leftDocs = leftTermsEnum.Docs(null, leftDocs, DocsEnum.FLAG_NONE),
            rightDocs = rightTermsEnum.Docs(null, rightDocs, DocsEnum.FLAG_NONE),
            false);
        AssertDocsSkippingEquals(info, leftReader, leftTermsEnum.DocFreq(),
            leftDocs = leftTermsEnum.Docs(randomBits, leftDocs, DocsEnum.FLAG_NONE),
            rightDocs = rightTermsEnum.Docs(randomBits, rightDocs, DocsEnum.FLAG_NONE),
            false);
    }

    // The right-hand enum must be exhausted at the same time as the left-hand one.
    Assert.IsNull(rightTermsEnum.Next(), info);
}
/// <summary>
/// Forwards to <c>GetEnumerator(reuse)</c> and returns its result.
/// </summary>
public virtual TermsEnum GetIterator(TermsEnum reuse)
{
    return GetEnumerator(reuse);
}
/// <summary>
/// Extension hook: subclasses can override this. The implementation here is empty and does nothing.
/// </summary>
/// <param name="te"> the terms enum passed by the caller. </param>
/// <param name="termNum"> the term number passed by the caller. </param>
protected internal virtual void VisitTerm(TermsEnum te, int termNum) { }
/// <summary>
/// Returns an iterator that will step through all
/// terms. This method will not return <c>null</c>. If you have
/// a previous <see cref="TermsEnum"/>, for example from a different
/// field, you can pass it for possible reuse if the
/// implementation can do so.
/// </summary>
/// <param name="reuse"> an optional previous enum offered for reuse. </param>
public abstract TermsEnum Iterator(TermsEnum reuse);
/// <summary>
/// Call this only once (if you subclass!).
/// Un-inverts <c>Field</c>: enumerates its terms (optionally restricted to
/// <paramref name="termPrefix"/>), records for every document the delta-encoded list of term
/// numbers, and finally packs those lists into the shared <c>Tnums</c> arrays.
/// </summary>
/// <remarks>
/// <c>Phase1_time</c> receives the elapsed milliseconds up to the end of term enumeration and
/// <c>Total_time</c> the elapsed milliseconds for the whole call. Neither is assigned when the
/// method returns early because the reader has no fields, no terms for the field, or no term
/// at or after the seek start.
/// </remarks>
/// <param name="reader"> the reader whose field is un-inverted. </param>
/// <param name="liveDocs"> passed to the docs enum so deleted documents are skipped. </param>
/// <param name="termPrefix"> if non-null, only terms starting with this prefix are visited. </param>
/// <exception cref="InvalidOperationException"> if the field has doc values, or if the packed
/// term lists no longer fit into the 24-bit array index. </exception>
protected internal virtual void Uninvert(AtomicReader reader, Bits liveDocs, BytesRef termPrefix)
{
    FieldInfo info = reader.FieldInfos.FieldInfo(Field);
    if (info != null && info.HasDocValues())
    {
        throw new InvalidOperationException("Type mismatch: " + Field + " was indexed as " + info.DocValuesType);
    }

    // Measure elapsed time with a monotonic stopwatch. DateTime.Now.Millisecond is only the
    // 0-999 millisecond component of the wall clock and cannot be used to compute durations.
    var stopwatch = System.Diagnostics.Stopwatch.StartNew();

    Prefix = termPrefix == null ? null : BytesRef.DeepCopyOf(termPrefix);

    int maxDoc = reader.MaxDoc;
    int[] index = new int[maxDoc]; // immediate term numbers, or the index into the byte[] representing the last number
    int[] lastTerm = new int[maxDoc]; // last term we saw for this document
    var bytes = new sbyte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

    Fields fields = reader.Fields;
    if (fields == null)
    {
        // No terms
        return;
    }
    Terms terms = fields.Terms(Field);
    if (terms == null)
    {
        // No terms
        return;
    }

    TermsEnum te = terms.Iterator(null);
    BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
    if (te.SeekCeil(seekStart) == TermsEnum.SeekStatus.END)
    {
        // No terms match
        return;
    }

    // If we need our "term index wrapper", these will be
    // init'd below:
    IList<BytesRef> indexedTerms = null;
    PagedBytes indexedTermsBytes = null;

    bool testedOrd = false;

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    var tempArr = new sbyte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in it's byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs

    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values. this requires going over the field first to find the most
    // frequent terms ahead of time.

    int termNum = 0;
    DocsEnum = null;

    // Loop begins with te positioned to first term (we call
    // seek above):
    for (; ; )
    {
        BytesRef t = te.Term();
        if (t == null || (termPrefix != null && !StringHelper.StartsWith(t, termPrefix)))
        {
            break;
        }

        if (!testedOrd)
        {
            try
            {
                OrdBase = (int)te.Ord();
            }
            catch (System.NotSupportedException)
            {
                // Reader cannot provide ord support, so we wrap
                // our own support by creating our own terms index:
                indexedTerms = new List<BytesRef>();
                indexedTermsBytes = new PagedBytes(15);
            }
            testedOrd = true;
        }

        VisitTerm(te, termNum);

        if (indexedTerms != null && (termNum & IndexIntervalMask) == 0)
        {
            // Index this term
            SizeOfIndexedStrings += t.Length;
            BytesRef indexedTerm = new BytesRef();
            indexedTermsBytes.Copy(t, indexedTerm);
            // TODO: really should 1) strip off useless suffix,
            // and 2) use FST not array/PagedBytes
            indexedTerms.Add(indexedTerm);
        }

        int df = te.DocFreq();
        if (df <= MaxTermDocFreq)
        {
            DocsEnum = te.Docs(liveDocs, DocsEnum, DocsEnum.FLAG_NONE);

            // dF, but takes deletions into account
            int actualDF = 0;

            for (; ; )
            {
                int doc = DocsEnum.NextDoc();
                if (doc == DocIdSetIterator.NO_MORE_DOCS)
                {
                    break;
                }
                actualDF++;
                TermInstances++;

                // add TNUM_OFFSET to the term number to make room for special reserved values:
                // 0 (end term) and 1 (index into byte array follows)
                int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
                lastTerm[doc] = termNum;
                int val = index[doc];

                if ((val & 0xff) == 1)
                {
                    // index into byte array (actually the end of
                    // the doc-specific byte[] when building)
                    int pos = (int)((uint)val >> 8);
                    int ilen = VIntSize(delta);
                    var arr = bytes[doc];
                    int newend = pos + ilen;
                    if (newend > arr.Length)
                    {
                        // We avoid a doubling strategy to lower memory usage.
                        // this faceting method isn't for docs with many terms.
                        // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
                        // TODO: figure out what array lengths we can round up to w/o actually using more memory
                        // (how much space does a byte[] take up? Is data preceded by a 32 bit length only?
                        // It should be safe to round up to the nearest 32 bits in any case.
                        int newLen = (newend + 3) & unchecked((int)0xfffffffc); // 4 byte alignment
                        var newarr = new sbyte[newLen];
                        Array.Copy(arr, 0, newarr, 0, pos);
                        arr = newarr;
                        bytes[doc] = newarr;
                    }
                    pos = WriteInt(delta, arr, pos);
                    index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
                }
                else
                {
                    // OK, this int has data in it... find the end (a zero starting byte - not
                    // part of another number, hence not following a byte with the high bit set).
                    int ipos;
                    if (val == 0)
                    {
                        ipos = 0;
                    }
                    else if ((val & 0x0000ff80) == 0)
                    {
                        ipos = 1;
                    }
                    else if ((val & 0x00ff8000) == 0)
                    {
                        ipos = 2;
                    }
                    else if ((val & 0xff800000) == 0)
                    {
                        ipos = 3;
                    }
                    else
                    {
                        ipos = 4;
                    }

                    int endPos = WriteInt(delta, tempArr, ipos);
                    if (endPos <= 4)
                    {
                        // value will fit in the integer... move bytes back
                        for (int j = ipos; j < endPos; j++)
                        {
                            val |= (tempArr[j] & 0xff) << (j << 3);
                        }
                        index[doc] = val;
                    }
                    else
                    {
                        // value won't fit... move integer into byte[]
                        for (int j = 0; j < ipos; j++)
                        {
                            tempArr[j] = (sbyte)val;
                            val = (int)((uint)val >> 8);
                        }
                        // point at the end index in the byte[]
                        index[doc] = (endPos << 8) | 1;
                        bytes[doc] = tempArr;
                        tempArr = new sbyte[12];
                    }
                }
            }
            SetActualDocFreq(termNum, actualDF);
        }

        termNum++;
        if (te.Next() == null)
        {
            break;
        }
    }

    NumTermsInField = termNum;

    long midPoint = stopwatch.ElapsedMilliseconds;

    if (TermInstances == 0)
    {
        // we didn't invert anything
        // lower memory consumption.
        Tnums = null;
    }
    else
    {
        this.Index = index;

        //
        // transform intermediate form into the final form, building a single byte[]
        // at a time, and releasing the intermediate byte[]s as we go to avoid
        // increasing the memory footprint.
        //
        for (int pass = 0; pass < 256; pass++)
        {
            var target = Tnums[pass];
            var pos = 0; // end in target;
            if (target != null)
            {
                pos = target.Length;
            }
            else
            {
                target = new sbyte[4096];
            }

            // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
            // where pp is the pass (which array we are building), and xx is all values.
            // each pass shares the same byte[] for termNumber lists.
            for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24))
            {
                int lim = Math.Min(docbase + (1 << 16), maxDoc);
                for (int doc = docbase; doc < lim; doc++)
                {
                    int val = index[doc];
                    if ((val & 0xff) == 1)
                    {
                        int len = (int)((uint)val >> 8);
                        index[doc] = (pos << 8) | 1; // change index to point to start of array
                        if ((pos & 0xff000000) != 0)
                        {
                            // we only have 24 bits for the array index
                            throw new InvalidOperationException("Too many values for UnInvertedField faceting on field " + Field);
                        }
                        var arr = bytes[doc];
                        bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
                        if (target.Length <= pos + len)
                        {
                            int newlen = target.Length;
                            // we don't have to worry about the array getting too large
                            // since the "pos" param will overflow first (only 24 bits available)
                            while (newlen <= pos + len) // doubling strategy
                            {
                                newlen <<= 1;
                            }
                            var newtarget = new sbyte[newlen];
                            Array.Copy(target, 0, newtarget, 0, pos);
                            target = newtarget;
                        }
                        Array.Copy(arr, 0, target, pos, len);
                        pos += len + 1; // skip single byte at end and leave it 0 for terminator
                    }
                }
            }

            // shrink array
            if (pos < target.Length)
            {
                var newtarget = new sbyte[pos];
                Array.Copy(target, 0, newtarget, 0, pos);
                target = newtarget;
            }

            Tnums[pass] = target;

            if ((pass << 16) > maxDoc)
            {
                break;
            }
        }
    }

    if (indexedTerms != null)
    {
        IndexedTermsArray = indexedTerms.ToArray();
    }

    long endTime = stopwatch.ElapsedMilliseconds;

    Total_time = (int)endTime;
    Phase1_time = (int)midPoint;
}
/// <summary>
/// Creates a filtered <see cref="TermsEnum"/> on a terms enum.
/// </summary>
/// <param name="tenum"> the terms enumeration to filter; asserted to be non-null in debug builds. </param>
/// <param name="startWithSeek"> stored in <c>DoSeek</c>. </param>
public FilteredTermsEnum(TermsEnum tenum, bool startWithSeek)
{
    Debug.Assert(tenum != null);
    DoSeek = startWithSeek;
    Tenum = tenum;
}
/// <summary>
/// Returns the term (<see cref="BytesRef"/>) corresponding to
/// the provided ordinal.
/// </summary>
/// <param name="termsEnum"> the enum that is repositioned by this call. </param>
/// <param name="ord"> the ordinal to seek to. </param>
/// <returns> the enum's current term after seeking to <paramref name="ord"/>. </returns>
public virtual BytesRef LookupTerm(TermsEnum termsEnum, int ord)
{
    termsEnum.SeekExact(ord);
    BytesRef current = termsEnum.Term();
    return current;
}
/// <summary>
/// Switches this collector to the segment described by <paramref name="context"/>.
/// Stores the previous segment's result (when counts exist), reloads the group
/// and facet field caches, re-registers the grouped facet hits collected so far
/// using this segment's ordinals, and finally computes the
/// <c>startFacetOrd</c>/<c>endFacetOrd</c> range.
/// </summary>
/// <exception cref="System.IO.IOException"></exception>
public override void SetNextReader(AtomicReaderContext context)
{
    // Counts from a previous segment are turned into a segment result before being replaced.
    if (segmentFacetCounts != null)
    {
        segmentResults.AddItem((TermGroupFacetCollector.MV.SegmentResult)CreateSegmentResult());
    }

    groupFieldTermsIndex = FieldCache.DEFAULT.GetTermsIndex((AtomicReader)context.Reader(), groupField);
    facetFieldDocTermOrds = FieldCache.DEFAULT.GetDocTermOrds((AtomicReader)context.Reader(), facetField);
    facetFieldNumTerms = (int)facetFieldDocTermOrds.GetValueCount();
    facetOrdTermsEnum = facetFieldNumTerms == 0 ? null : facetFieldDocTermOrds.TermsEnum();

    // [facetFieldNumTerms() + 1] for all possible facet values and docs not containing facet field
    int slotsPerGroup = facetFieldNumTerms + 1;
    segmentFacetCounts = new int[slotsPerGroup];
    segmentTotalCount = 0;
    segmentGroupedFacetHits.Clear();

    foreach (GroupedFacetHit hit in groupedFacetHits)
    {
        int groupOrd;
        if (hit.groupValue == null)
        {
            groupOrd = -1;
        }
        else
        {
            groupOrd = groupFieldTermsIndex.LookupTerm(hit.groupValue);
            if (groupOrd < 0)
            {
                // The group value does not occur in this segment.
                continue;
            }
        }

        int facetOrd;
        if (hit.facetValue == null)
        {
            // Hits without a facet value use the extra slot after the last term.
            facetOrd = facetFieldNumTerms;
        }
        else if (facetOrdTermsEnum != null && facetOrdTermsEnum.SeekExact(hit.facetValue))
        {
            facetOrd = (int)facetOrdTermsEnum.Ord();
        }
        else
        {
            // The facet value does not occur in this segment.
            continue;
        }

        // (facetFieldDocTermOrds.numTerms() + 1) for all possible facet values and docs not containing facet field
        segmentGroupedFacetHits.Put(groupOrd * slotsPerGroup + facetOrd);
    }

    if (facetPrefix == null)
    {
        // Don't include null...
        startFacetOrd = 0;
        endFacetOrd = facetFieldNumTerms + 1;
        return;
    }

    TermsEnum.SeekStatus seekStatus = facetOrdTermsEnum != null
        ? facetOrdTermsEnum.SeekCeil(facetPrefix)
        : TermsEnum.SeekStatus.END;
    if (seekStatus == TermsEnum.SeekStatus.END)
    {
        // Nothing at or after the prefix: empty ordinal range.
        startFacetOrd = 0;
        endFacetOrd = 0;
        return;
    }
    startFacetOrd = (int)facetOrdTermsEnum.Ord();

    // Seek to prefix + BIG_TERM to find where the ordinal range ends.
    BytesRef facetEndPrefix = BytesRef.DeepCopyOf(facetPrefix);
    facetEndPrefix.Append(UnicodeUtil.BIG_TERM);
    seekStatus = facetOrdTermsEnum.SeekCeil(facetEndPrefix);
    endFacetOrd = seekStatus != TermsEnum.SeekStatus.END
        ? (int)facetOrdTermsEnum.Ord()
        : facetFieldNumTerms;
}
/// <summary>
/// Creates the helper and remembers the <see cref="TermsEnum"/> passed in.
/// </summary>
/// <param name="outerInstance"> the terms enumeration stored in <c>OuterInstance</c>. </param>
public TermStateAnonymousInnerClassHelper(TermsEnum outerInstance)
{
    OuterInstance = outerInstance;
}
/// <summary>
/// Builds a <c>DocTermOrds</c> over "field" of <paramref name="r"/>, restricted to
/// <paramref name="prefixRef"/>, and checks that the ordinals it reports for every
/// document resolve to the expected terms.
/// </summary>
/// <param name="r"> the reader to uninvert. </param>
/// <param name="idToOrds"> for each value of the "id" field, the indexes into <paramref name="termsArray"/> expected for that document, in iteration order. </param>
/// <param name="termsArray"> the terms that <paramref name="idToOrds"/> indexes into. </param>
/// <param name="prefixRef"> prefix passed to <c>DocTermOrds</c>; may be null. </param>
private void Verify(AtomicReader r, int[][] idToOrds, BytesRef[] termsArray, BytesRef prefixRef)
{
    DocTermOrds dto = new DocTermOrds(r, r.LiveDocs, "field", prefixRef, int.MaxValue, TestUtil.NextInt32(Random, 2, 10));
    // Maps a docID of this reader to the value of its "id" field.
    FieldCache.Int32s docIDToID = FieldCache.DEFAULT.GetInt32s(r, "id", false);
    /*
     * for(int docID=0;docID<subR.MaxDoc;docID++) {
     * System.out.println(" docID=" + docID + " id=" + docIDToID[docID]);
     * }
     */
    if (Verbose)
    {
        Console.WriteLine("TEST: verify prefix=" + (prefixRef == null ? "null" : prefixRef.Utf8ToString()));
        Console.WriteLine("TEST: all TERMS:");
        TermsEnum allTE = MultiFields.GetTerms(r, "field").GetIterator(null);
        int ord = 0;
        while (allTE.Next() != null)
        {
            Console.WriteLine(" ord=" + (ord++) + " term=" + allTE.Term.Utf8ToString());
        }
    }
    //final TermsEnum te = subR.Fields.Terms("field").iterator();
    TermsEnum te = dto.GetOrdTermsEnum(r);
    if (dto.NumTerms == 0)
    {
        // Nothing was uninverted: confirm the index really has no term for this prefix.
        if (prefixRef == null)
        {
            Assert.IsNull(MultiFields.GetTerms(r, "field"));
        }
        else
        {
            Terms terms = MultiFields.GetTerms(r, "field");
            if (terms != null)
            {
                TermsEnum termsEnum = terms.GetIterator(null);
                TermsEnum.SeekStatus result = termsEnum.SeekCeil(prefixRef);
                if (result != TermsEnum.SeekStatus.END)
                {
                    // The first term at or after the prefix must not start with it.
                    Assert.IsFalse(StringHelper.StartsWith(termsEnum.Term, prefixRef), "term=" + termsEnum.Term.Utf8ToString() + " matches prefix=" + prefixRef.Utf8ToString());
                }
                else
                {
                    // ok
                }
            }
            else
            {
                // ok
            }
        }
        return;
    }
    if (Verbose)
    {
        Console.WriteLine("TEST: TERMS:");
        te.SeekExact(0);
        while (true)
        {
            Console.WriteLine(" ord=" + te.Ord + " term=" + te.Term.Utf8ToString());
            if (te.Next() == null)
            {
                break;
            }
        }
    }
    SortedSetDocValues iter = dto.GetIterator(r);
    for (int docID = 0; docID < r.MaxDoc; docID++)
    {
        if (Verbose)
        {
            Console.WriteLine("TEST: docID=" + docID + " of " + r.MaxDoc + " (id=" + docIDToID.Get(docID) + ")");
        }
        iter.SetDocument(docID);
        // Expected term indexes for this document, looked up through its "id" value.
        int[] answers = idToOrds[docIDToID.Get(docID)];
        int upto = 0;
        long ord;
        while ((ord = iter.NextOrd()) != SortedSetDocValues.NO_MORE_ORDS)
        {
            // Resolve the ordinal to a term and compare it with the next expected term.
            te.SeekExact(ord);
            BytesRef expected = termsArray[answers[upto++]];
            if (Verbose)
            {
                Console.WriteLine(" exp=" + expected.Utf8ToString() + " actual=" + te.Term.Utf8ToString());
            }
            Assert.AreEqual(expected, te.Term, "expected=" + expected.Utf8ToString() + " actual=" + te.Term.Utf8ToString() + " ord=" + ord);
        }
        // Every expected term must have been consumed, no more and no fewer.
        Assert.AreEqual(answers.Length, upto);
    }
}
// sugar: advances the enum and returns the new term as a string, or null when Next() returned null
private string Next(TermsEnum te)
{
    BytesRef current = te.Next();
    return current == null ? null : current.Utf8ToString();
}
/// <summary>
/// Indexes three single-term documents ("hello", "world", "beer"), force-merges to one
/// segment and exercises Next(), SeekCeil(), SeekExact(term) and SeekExact(ord) on the
/// terms enum obtained from the field cache's doc-term-ords for "field".
/// </summary>
public virtual void TestSortedTermsEnum()
{
    // Terms in the order the enum is expected to report them; the array index is the expected ordinal.
    string[] sortedTerms = { "beer", "hello", "world" };

    Directory dir = NewDirectory();
    Analyzer mockAnalyzer = new MockAnalyzer(Random);
    IndexWriterConfig config = NewIndexWriterConfig(TEST_VERSION_CURRENT, mockAnalyzer);
    config.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(Random, dir, config);

    foreach (string value in new[] { "hello", "world", "beer" })
    {
        Document single = new Document();
        single.Add(new StringField("field", value, Field.Store.NO));
        writer.AddDocument(single);
    }
    writer.ForceMerge(1);

    DirectoryReader reader = writer.GetReader();
    writer.Dispose();

    AtomicReader segment = GetOnlySegmentReader(reader);
    SortedSetDocValues docValues = FieldCache.DEFAULT.GetDocTermOrds(segment, "field");
    Assert.AreEqual(3, docValues.ValueCount);

    TermsEnum te = docValues.GetTermsEnum();

    // next()
    for (int i = 0; i < sortedTerms.Length; i++)
    {
        Assert.AreEqual(sortedTerms[i], te.Next().Utf8ToString());
        Assert.AreEqual(i, te.Ord);
    }

    // seekCeil()
    Assert.AreEqual(SeekStatus.NOT_FOUND, te.SeekCeil(new BytesRef("ha!")));
    Assert.AreEqual("hello", te.Term.Utf8ToString());
    Assert.AreEqual(1, te.Ord);
    Assert.AreEqual(SeekStatus.FOUND, te.SeekCeil(new BytesRef("beer")));
    Assert.AreEqual("beer", te.Term.Utf8ToString());
    Assert.AreEqual(0, te.Ord);
    Assert.AreEqual(SeekStatus.END, te.SeekCeil(new BytesRef("zzz")));

    // seekExact()
    for (int i = 0; i < sortedTerms.Length; i++)
    {
        Assert.IsTrue(te.SeekExact(new BytesRef(sortedTerms[i])));
        Assert.AreEqual(sortedTerms[i], te.Term.Utf8ToString());
        Assert.AreEqual(i, te.Ord);
    }
    Assert.IsFalse(te.SeekExact(new BytesRef("bogus")));

    // seek(ord)
    for (int i = 0; i < sortedTerms.Length; i++)
    {
        te.SeekExact(i);
        Assert.AreEqual(sortedTerms[i], te.Term.Utf8ToString());
        Assert.AreEqual(i, te.Ord);
    }

    reader.Dispose();
    dir.Dispose();
}
/// <summary>
/// Builds a bag of postings in which term <c>i</c> (the decimal string of <c>i</c>) occurs
/// <c>i</c> times, lets several threads index documents drawn from that shared bag, then
/// force-merges and checks that every term's total term frequency equals its numeric value.
/// </summary>
public virtual void Test()
{
    IList<string> postingsList = new List<string>();
    int numTerms = AtLeast(300);
    int maxTermsPerDoc = TestUtil.NextInt(Random(), 10, 20);

    bool isSimpleText = "SimpleText".Equals(TestUtil.GetPostingsFormat("field"), StringComparison.Ordinal);

    IndexWriterConfig iwc = NewIndexWriterConfig(Random(), TEST_VERSION_CURRENT, new MockAnalyzer(Random()));

    if ((isSimpleText || iwc.MergePolicy is MockRandomMergePolicy) && (TEST_NIGHTLY || RANDOM_MULTIPLIER > 1))
    {
        // Otherwise test can take way too long (> 2 hours)
        numTerms /= 2;
    }

    if (VERBOSE)
    {
        Console.WriteLine("maxTermsPerDoc=" + maxTermsPerDoc);
        Console.WriteLine("numTerms=" + numTerms);
    }

    for (int i = 0; i < numTerms; i++)
    {
        // Terms are parsed back with the invariant culture below, so format them the same way.
        string term = Convert.ToString(i, System.Globalization.CultureInfo.InvariantCulture);
        for (int j = 0; j < i; j++)
        {
            postingsList.Add(term);
        }
    }
    Collections.Shuffle(postingsList);

    ConcurrentQueue<string> postings = new ConcurrentQueue<string>(postingsList);

    Directory dir = NewFSDirectory(CreateTempDir(GetFullMethodName()));
    RandomIndexWriter iw = new RandomIndexWriter(Random(), dir, iwc);

    int threadCount = TestUtil.NextInt(Random(), 1, 5);
    if (VERBOSE)
    {
        Console.WriteLine("config: " + iw.w.Config);
        Console.WriteLine("threadCount=" + threadCount);
    }

    Field prototype = NewTextField("field", "", Field.Store.NO);
    FieldType fieldType = new FieldType((FieldType)prototype.FieldType);
    if (Random().NextBoolean())
    {
        fieldType.OmitNorms = true;
    }
    int options = Random().Next(3);
    if (options == 0)
    {
        fieldType.IndexOptions = IndexOptions.DOCS_AND_FREQS; // we dont actually need positions
        fieldType.StoreTermVectors = true; // but enforce term vectors when we do this so we check SOMETHING
    }
    else if (options == 1 && !DoesntSupportOffsets.Contains(TestUtil.GetPostingsFormat("field")))
    {
        fieldType.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
    }
    // else just positions

    ThreadClass[] threads = new ThreadClass[threadCount];
    // CountdownEvent is IDisposable; it is released once every thread has been joined.
    using (CountdownEvent startingGun = new CountdownEvent(1))
    {
        for (int threadID = 0; threadID < threadCount; threadID++)
        {
            Random threadRandom = new Random(Random().Next());
            Document document = new Document();
            Field field = new Field("field", "", fieldType);
            document.Add(field);
            threads[threadID] = new ThreadAnonymousInnerClassHelper(this, numTerms, maxTermsPerDoc, postings, iw, startingGun, threadRandom, document, field);
            threads[threadID].Start();
        }
        startingGun.Signal();
        foreach (ThreadClass t in threads)
        {
            t.Join();
        }
    }

    iw.ForceMerge(1);
    DirectoryReader ir = iw.Reader;
    Assert.AreEqual(1, ir.Leaves.Count);
    AtomicReader air = (AtomicReader)ir.Leaves[0].Reader;
    Terms terms = air.GetTerms("field");
    // numTerms-1 because there cannot be a term 0 with 0 postings:
    Assert.AreEqual(numTerms - 1, terms.Count);
    TermsEnum termsEnum = terms.GetIterator(null);
    BytesRef termBR;
    while ((termBR = termsEnum.Next()) != null)
    {
        int value = Convert.ToInt32(termBR.Utf8ToString(), System.Globalization.CultureInfo.InvariantCulture);
        Assert.AreEqual(value, termsEnum.TotalTermFreq);
        // don't really need to check more than this, as CheckIndex
        // will verify that totalTermFreq == total number of positions seen
        // from a docsAndPositionsEnum.
    }
    ir.Dispose();
    iw.Dispose();
    dir.Dispose();
}
/// <summary>
/// Seeks <paramref name="termsEnum"/> to the provided ordinal and returns
/// the term (<seealso cref="BytesRef"/>) it then reports.
/// </summary>
/// <param name="termsEnum"> the enumeration to seek. </param>
/// <param name="ord"> the ordinal to seek to. </param>
/// <returns> the term reported by <paramref name="termsEnum"/> after the seek. </returns>
public virtual BytesRef LookupTerm(TermsEnum termsEnum, int ord)
{
    termsEnum.SeekExact(ord);
    BytesRef found = termsEnum.Term();
    return found;
}
/// <summary>
/// Forwards the request to the wrapped <c>m_input</c> and returns its iterator unchanged.
/// </summary>
/// <param name="reuse"> passed through to <c>m_input.GetIterator</c>. </param>
public override TermsEnum GetIterator(TermsEnum reuse)
{
    TermsEnum iterator = m_input.GetIterator(reuse);
    return iterator;
}
/// <summary>
/// Captures the segment context and accepted docs, records the reader's <c>MaxDoc</c>,
/// and opens a terms enum over the enclosing filter's field when that field has terms.
/// </summary>
/// <param name="_enclosing"> the filter whose <c>fieldName</c> is read. </param>
/// <param name="context"> the segment context; its <c>AtomicReader</c> supplies MaxDoc and the terms. </param>
/// <param name="acceptDocs"> stored as-is in <c>acceptDocs</c>. </param>
/// <exception cref="System.IO.IOException"></exception>
public BaseTermsEnumTraverser(AbstractPrefixTreeFilter _enclosing, AtomicReaderContext context, IBits acceptDocs)
{
    this._enclosing = _enclosing;
    this.context = context;
    this.acceptDocs = acceptDocs;

    AtomicReader reader = context.AtomicReader;
    maxDoc = reader.MaxDoc;

    Terms fieldTerms = reader.Terms(_enclosing.fieldName);
    if (fieldTerms == null)
    {
        // termsEnum is not assigned in this case: remember to check for null in getDocIdSet
        return;
    }
    termsEnum = fieldTerms.Iterator(null);
}
/// <summary>
/// Creates a new <see cref="FilterTermsEnum"/> around the given enumeration.
/// </summary>
/// <param name="input"> the underlying <see cref="TermsEnum"/> instance, stored in <c>m_input</c>. </param>
public FilterTermsEnum(TermsEnum input)
{
    m_input = input;
}
/// <summary>
/// Returns the base iterator wrapped in an <c>AssertingTermsEnum</c>. When
/// <paramref name="reuse"/> is itself an <c>AssertingTermsEnum</c>, it is unwrapped
/// before being handed to the base implementation.
/// </summary>
public override TermsEnum Iterator(TermsEnum reuse)
{
    // TODO: should we give this thing a random to be super-evil,
    // and randomly *not* unwrap?
    AssertingTermsEnum asserting = reuse as AssertingTermsEnum;
    TermsEnum unwrapped = asserting != null ? asserting.TermsEnumIn_Nunit() : reuse;

    TermsEnum inner = base.Iterator(unwrapped);
    Debug.Assert(inner != null);
    return new AssertingTermsEnum(inner);
}
/// <summary>
/// Creates a filtered <see cref="TermsEnum"/> on a terms enum by delegating to the
/// two-argument constructor with <c>startWithSeek</c> set to <c>true</c>.
/// </summary>
/// <param name="tenum"> the terms enumeration to filter. </param>
public FilteredTermsEnum(TermsEnum tenum)
    : this(tenum, startWithSeek: true)
{
}
/// <summary>
/// Initializes the enum over <paramref name="tenum"/> with a hash of terms and the
/// order in which to visit them. The last term is read from
/// <c>ords[terms.Size() - 1]</c> and the first seek term from <c>ords[_upto]</c>.
/// </summary>
/// <param name="tenum"> the terms enumeration passed to the base constructor. </param>
/// <param name="terms"> the hash the terms are read from. </param>
/// <param name="ords"> ids into <paramref name="terms"/>; indexed with <c>_lastElement</c> and <c>_upto</c>. </param>
internal SeekingTermSetTermsEnum(TermsEnum tenum, BytesRefHash terms, int[] ords)
    : base(tenum)
{
    _comparator = BytesRef.UTF8SortedAsUnicodeComparer;
    Terms = terms;
    Ords = ords;

    int lastIndex = terms.Size() - 1;
    _lastElement = lastIndex;
    _lastTerm = terms.Get(ords[lastIndex], new BytesRef());

    // _spare is handed to Get() as the second argument for the seek term.
    _seekTerm = terms.Get(ords[_upto], _spare);
}
/// <summary>
/// Creates a filtered <see cref="TermsEnum"/> on top of an existing terms enum.
/// </summary>
/// <param name="tenum"> the terms enumeration to filter; a debug assertion checks that it is not null. </param>
/// <param name="startWithSeek"> value stored in <c>doSeek</c>. </param>
public FilteredTermsEnum(TermsEnum tenum, bool startWithSeek)
{
    Debug.Assert(tenum != null);
    doSeek = startWithSeek;
    this.tenum = tenum;
}
/// <summary>
/// Captures the segment context and accepted docs, records the reader's <c>MaxDoc</c>,
/// and opens a terms enum over the outer filter's field when that field has terms.
/// </summary>
/// <param name="outerInstance"> the filter whose <c>fieldName</c> is read. </param>
/// <param name="context"> the segment context; its <c>AtomicReader</c> supplies MaxDoc and the terms. </param>
/// <param name="acceptDocs"> stored as-is in <c>acceptDocs</c>. </param>
public BaseTermsEnumTraverser(AbstractPrefixTreeFilter outerInstance, AtomicReaderContext context, Bits acceptDocs)
{
    this.outerInstance = outerInstance;
    this.context = context;
    this.acceptDocs = acceptDocs;

    AtomicReader reader = context.AtomicReader;
    maxDoc = reader.MaxDoc;

    Terms fieldTerms = reader.Terms(outerInstance.fieldName);
    if (fieldTerms == null)
    {
        // No terms for the field: termsEnum is not assigned here.
        return;
    }
    termsEnum = fieldTerms.Iterator(null);
}
/// <summary>
/// Builds a bag of postings in which term <c>i</c> (the invariant-culture decimal string of
/// <c>i</c>) occurs <c>i</c> times, lets several threads index documents drawn from that
/// shared bag, then force-merges and checks that every term's document frequency equals
/// its numeric value.
/// </summary>
public virtual void Test()
{
    IList<string> postingsList = new List<string>();
    int numTerms = AtLeast(300);
    int maxTermsPerDoc = TestUtil.NextInt32(Random, 10, 20);

    bool isSimpleText = "SimpleText".Equals(TestUtil.GetPostingsFormat("field"), StringComparison.Ordinal);

    IndexWriterConfig iwc = NewIndexWriterConfig(Random, TEST_VERSION_CURRENT, new MockAnalyzer(Random));

    if ((isSimpleText || iwc.MergePolicy is MockRandomMergePolicy) && (TestNightly || RandomMultiplier > 1))
    {
        // Otherwise test can take way too long (> 2 hours)
        //numTerms /= 2;
        // LUCENENET specific - To keep this under the 1 hour free limit
        // of Azure DevOps, this was reduced from /2 to /6.
        numTerms /= 6;
    }

    if (Verbose)
    {
        Console.WriteLine("maxTermsPerDoc=" + maxTermsPerDoc);
        Console.WriteLine("numTerms=" + numTerms);
    }

    for (int i = 0; i < numTerms; i++)
    {
        string term = Convert.ToString(i, CultureInfo.InvariantCulture);
        for (int j = 0; j < i; j++)
        {
            postingsList.Add(term);
        }
    }
    postingsList.Shuffle(Random);

    ConcurrentQueue<string> postings = new ConcurrentQueue<string>(postingsList);

    Directory dir = NewFSDirectory(CreateTempDir("bagofpostings"));
    RandomIndexWriter iw = new RandomIndexWriter(Random, dir, iwc);

    int threadCount = TestUtil.NextInt32(Random, 1, 5);
    if (Verbose)
    {
        Console.WriteLine("config: " + iw.IndexWriter.Config);
        Console.WriteLine("threadCount=" + threadCount);
    }

    ThreadJob[] threads = new ThreadJob[threadCount];
    // CountdownEvent is IDisposable; it is released once every thread has been joined.
    using (CountdownEvent startingGun = new CountdownEvent(1))
    {
        for (int threadID = 0; threadID < threadCount; threadID++)
        {
            threads[threadID] = new ThreadAnonymousClass(this, maxTermsPerDoc, postings, iw, startingGun);
            threads[threadID].Start();
        }
        startingGun.Signal();
        foreach (ThreadJob t in threads)
        {
            t.Join();
        }
    }

    iw.ForceMerge(1);
    DirectoryReader ir = iw.GetReader();
    Assert.AreEqual(1, ir.Leaves.Count);
    AtomicReader air = (AtomicReader)ir.Leaves[0].Reader;
    Terms terms = air.GetTerms("field");
    // numTerms-1 because there cannot be a term 0 with 0 postings:
#pragma warning disable 612, 618
    Assert.AreEqual(numTerms - 1, air.Fields.UniqueTermCount);
    if (iwc.Codec is Lucene3xCodec == false)
#pragma warning restore 612, 618
    {
        Assert.AreEqual(numTerms - 1, terms.Count);
    }
    TermsEnum termsEnum = terms.GetEnumerator();
    while (termsEnum.MoveNext())
    {
        int value = Convert.ToInt32(termsEnum.Term.Utf8ToString(), CultureInfo.InvariantCulture);
        Assert.AreEqual(value, termsEnum.DocFreq);
        // don't really need to check more than this, as CheckIndex
        // will verify that docFreq == actual number of documents seen
        // from a docsAndPositionsEnum.
    }
    ir.Dispose();
    iw.Dispose();
    dir.Dispose();
}
/// <summary>
/// Creates a new FilterTermsEnum that keeps a reference to the given enumeration.
/// </summary>
/// <param name="in"> the underlying TermsEnum instance; stored as-is, no null check is performed here. </param>
public FilterTermsEnum(TermsEnum @in) { this.@in = @in; }
/// <summary>
/// Indexes a set of distinct random terms, then repeatedly builds an automaton from a
/// random selection of accept terms and checks that <c>Terms.Intersect</c> on field "f"
/// returns exactly the indexed terms that are also accept terms, starting after a randomly
/// chosen start term (or from the beginning when it is null).
/// </summary>
public virtual void TestIntersectRandom()
{
    Directory dir = NewDirectory();
    RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir);
    int numTerms = AtLeast(300);
    //final int numTerms = 50;
    ISet <string> terms = new JCG.HashSet <string>();
    ICollection <string> pendingTerms = new List <string>();
    IDictionary <BytesRef, int?> termToID = new Dictionary <BytesRef, int?>();
    int id = 0;
    // Collect numTerms distinct random strings; roughly every 20th new term triggers AddDoc
    // with the terms pending so far.
    while (terms.Count != numTerms)
    {
        string s = RandomString;
        if (!terms.Contains(s))
        {
            terms.Add(s);
            pendingTerms.Add(s);
            if (Random.Next(20) == 7)
            {
                AddDoc(w, pendingTerms, termToID, id++);
            }
        }
    }
    AddDoc(w, pendingTerms, termToID, id++);
    BytesRef[] termsArray = new BytesRef[terms.Count];
    ISet <BytesRef> termsSet = new JCG.HashSet <BytesRef>();
    {
        // Extra block so that 'upto' and 's' do not clash with the locals declared later.
        int upto = 0;
        foreach (string s in terms)
        {
            BytesRef b = new BytesRef(s);
            termsArray[upto++] = b;
            termsSet.Add(b);
        }
        Array.Sort(termsArray);
    }
    if (VERBOSE)
    {
        Console.WriteLine("\nTEST: indexed terms (unicode order):");
        foreach (BytesRef t in termsArray)
        {
            Console.WriteLine(" " + t.Utf8ToString() + " -> id:" + termToID[t]);
        }
    }
    IndexReader r = w.GetReader();
    w.Dispose();
    // NOTE: intentional insanity!!
    FieldCache.Int32s docIDToID = FieldCache.DEFAULT.GetInt32s(SlowCompositeReaderWrapper.Wrap(r), "id", false);
    for (int iter = 0; iter < 10 * RANDOM_MULTIPLIER; iter++)
    {
        // TODO: can we also test infinite As here...?
        // From the random terms, pick some ratio and compile an
        // automaton:
        ISet <string> acceptTerms = new JCG.HashSet <string>();
        JCG.SortedSet <BytesRef> sortedAcceptTerms = new JCG.SortedSet <BytesRef>();
        double keepPct = Random.NextDouble();
        Automaton a;
        if (iter == 0)
        {
            // The first iteration always uses the empty automaton (no accept terms).
            if (VERBOSE)
            {
                Console.WriteLine("\nTEST: empty automaton");
            }
            a = BasicAutomata.MakeEmpty();
        }
        else
        {
            if (VERBOSE)
            {
                Console.WriteLine("\nTEST: keepPct=" + keepPct);
            }
            // Each indexed term is either kept or replaced by a fresh random string.
            foreach (string s in terms)
            {
                string s2;
                if (Random.NextDouble() <= keepPct)
                {
                    s2 = s;
                }
                else
                {
                    s2 = RandomString;
                }
                acceptTerms.Add(s2);
                sortedAcceptTerms.Add(new BytesRef(s2));
            }
            a = BasicAutomata.MakeStringUnion(sortedAcceptTerms);
        }
        if (Random.NextBoolean())
        {
            if (VERBOSE)
            {
                Console.WriteLine("TEST: reduce the automaton");
            }
            a.Reduce();
        }
        CompiledAutomaton c = new CompiledAutomaton(a, true, false);
        BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.Count];
        ISet <BytesRef> acceptTermsSet = new JCG.HashSet <BytesRef>();
        int upto = 0;
        foreach (string s in acceptTerms)
        {
            BytesRef b = new BytesRef(s);
            acceptTermsArray[upto++] = b;
            acceptTermsSet.Add(b);
            // Sanity check: the compiled automaton must accept every accept term.
            Assert.IsTrue(Accepts(c, b));
        }
        Array.Sort(acceptTermsArray);
        if (VERBOSE)
        {
            Console.WriteLine("\nTEST: accept terms (unicode order):");
            foreach (BytesRef t in acceptTermsArray)
            {
                Console.WriteLine(" " + t.Utf8ToString() + (termsSet.Contains(t) ? " (exists)" : ""));
            }
            Console.WriteLine(a.ToDot());
        }
        for (int iter2 = 0; iter2 < 100; iter2++)
        {
            // startTerm is null when there are no accept terms or on a coin flip;
            // otherwise it is a randomly chosen accept term.
            BytesRef startTerm = acceptTermsArray.Length == 0 || Random.NextBoolean() ? null : acceptTermsArray[Random.Next(acceptTermsArray.Length)];
            if (VERBOSE)
            {
                Console.WriteLine("\nTEST: iter2=" + iter2 + " startTerm=" + (startTerm == null ? "<null>" : startTerm.Utf8ToString()));
                if (startTerm != null)
                {
                    // Trace the automaton states visited while consuming startTerm byte by byte.
                    int state = c.RunAutomaton.InitialState;
                    for (int idx = 0; idx < startTerm.Length; idx++)
                    {
                        int label = startTerm.Bytes[startTerm.Offset + idx] & 0xff;
                        Console.WriteLine(" state=" + state + " label=" + label);
                        state = c.RunAutomaton.Step(state, label);
                        Assert.IsTrue(state != -1);
                    }
                    Console.WriteLine(" state=" + state);
                }
            }
            TermsEnum te = MultiFields.GetTerms(r, "f").Intersect(c, startTerm);
            // loc is the index in termsArray of the next term the intersected enum should return.
            int loc;
            if (startTerm == null)
            {
                loc = 0;
            }
            else
            {
                loc = Array.BinarySearch(termsArray, BytesRef.DeepCopyOf(startTerm));
                if (loc < 0)
                {
                    // Not indexed: convert the negative result into the insertion point.
                    loc = -(loc + 1);
                }
                else
                {
                    // startTerm exists in index
                    loc++;
                }
            }
            // Skip indexed terms that are not accept terms.
            while (loc < termsArray.Length && !acceptTermsSet.Contains(termsArray[loc]))
            {
                loc++;
            }
            DocsEnum docsEnum = null;
            while (loc < termsArray.Length)
            {
                BytesRef expected = termsArray[loc];
                BytesRef actual = te.Next();
                if (VERBOSE)
                {
                    Console.WriteLine("TEST: next() expected=" + expected.Utf8ToString() + " actual=" + (actual == null ? "null" : actual.Utf8ToString()));
                }
                Assert.AreEqual(expected, actual);
                Assert.AreEqual(1, te.DocFreq);
                docsEnum = TestUtil.Docs(Random, te, null, docsEnum, DocsFlags.NONE);
                int docID = docsEnum.NextDoc();
                Assert.IsTrue(docID != DocIdSetIterator.NO_MORE_DOCS);
                // The document found for the term must carry the id recorded in termToID.
                Assert.AreEqual(docIDToID.Get(docID), (int)termToID[expected]);
                // Advance to the next indexed term that is also an accept term.
                do
                {
                    loc++;
                } while (loc < termsArray.Length && !acceptTermsSet.Contains(termsArray[loc]));
            }
            // Once all expected terms are consumed the enum must be exhausted.
            Assert.IsNull(te.Next());
        }
    }
    r.Dispose();
    dir.Dispose();
}
/// <summary>
/// Checks term-level statistics of two enums: document frequencies must match, and
/// total term frequencies are compared only when neither side reports -1.
/// </summary>
public virtual void AssertTermStats(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum)
{
    Assert.AreEqual(leftTermsEnum.DocFreq(), rightTermsEnum.DocFreq());

    if (leftTermsEnum.TotalTermFreq() == -1 || rightTermsEnum.TotalTermFreq() == -1)
    {
        // At least one side reports -1, so total term frequency is not compared.
        return;
    }
    Assert.AreEqual(leftTermsEnum.TotalTermFreq(), rightTermsEnum.TotalTermFreq());
}
/// <summary>
/// Indexes documents from <c>LineFileDocs</c>, snapshots all terms of the "body" field, then
/// performs a random mix of Next(), SeekCeil() and SeekExact() calls on one terms enum and
/// checks each result against the snapshot. <c>upto</c> tracks the expected position in the
/// snapshot; -1 means the enum is not positioned on a term.
/// </summary>
public virtual void Test()
{
    Random random = new Random(Random.Next());
    LineFileDocs docs = new LineFileDocs(random, DefaultCodecSupportsDocValues);
    Directory d = NewDirectory();
    MockAnalyzer analyzer = new MockAnalyzer(LuceneTestCase.Random);
    analyzer.MaxTokenLength = TestUtil.NextInt32(LuceneTestCase.Random, 1, IndexWriter.MAX_TERM_LENGTH);
    RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        LuceneTestCase.Random, d, analyzer);
    int numDocs = AtLeast(10);
    for (int docCount = 0; docCount < numDocs; docCount++)
    {
        w.AddDocument(docs.NextDoc());
    }
    IndexReader r = w.GetReader();
    w.Dispose();
    // Snapshot of every "body" term, in enumeration order; deep copies because the
    // enum's BytesRef is copied rather than retained.
    List <BytesRef> terms = new List <BytesRef>();
    TermsEnum termsEnum = MultiFields.GetTerms(r, "body").GetIterator(null);
    BytesRef term;
    while ((term = termsEnum.Next()) != null)
    {
        terms.Add(BytesRef.DeepCopyOf(term));
    }
    if (VERBOSE)
    {
        Console.WriteLine("TEST: " + terms.Count + " terms");
    }
    int upto = -1;
    int iters = AtLeast(200);
    for (int iter = 0; iter < iters; iter++)
    {
        bool isEnd;
        // Next() is only exercised while the enum is positioned (upto != -1).
        if (upto != -1 && LuceneTestCase.Random.NextBoolean())
        {
            // next
            if (VERBOSE)
            {
                Console.WriteLine("TEST: iter next");
            }
            isEnd = termsEnum.Next() == null;
            upto++;
            if (isEnd)
            {
                if (VERBOSE)
                {
                    Console.WriteLine(" end");
                }
                // The enum may only end right after the last snapshot term.
                Assert.AreEqual(upto, terms.Count);
                upto = -1;
            }
            else
            {
                if (VERBOSE)
                {
                    Console.WriteLine(" got term=" + termsEnum.Term.Utf8ToString() + " expected=" + terms[upto].Utf8ToString());
                }
                Assert.IsTrue(upto < terms.Count);
                Assert.AreEqual(terms[upto], termsEnum.Term);
            }
        }
        else
        {
            BytesRef target;
            string exists;
            if (LuceneTestCase.Random.NextBoolean())
            {
                // likely fake term
                if (LuceneTestCase.Random.NextBoolean())
                {
                    target = new BytesRef(TestUtil.RandomSimpleString(LuceneTestCase.Random));
                }
                else
                {
                    target = new BytesRef(TestUtil.RandomRealisticUnicodeString(LuceneTestCase.Random));
                }
                exists = "likely not";
            }
            else
            {
                // real term
                target = terms[LuceneTestCase.Random.Next(terms.Count)];
                exists = "yes";
            }
            // Negative result means the target is not in the snapshot; see the branches below.
            upto = terms.BinarySearch(target);
            if (LuceneTestCase.Random.NextBoolean())
            {
                if (VERBOSE)
                {
                    Console.WriteLine("TEST: iter seekCeil target=" + target.Utf8ToString() + " exists=" + exists);
                }
                // seekCeil
                TermsEnum.SeekStatus status = termsEnum.SeekCeil(target);
                if (VERBOSE)
                {
                    Console.WriteLine(" got " + status);
                }
                if (upto < 0)
                {
                    // Convert the negative search result into the insertion point.
                    upto = -(upto + 1);
                    if (upto >= terms.Count)
                    {
                        Assert.AreEqual(TermsEnum.SeekStatus.END, status);
                        upto = -1;
                    }
                    else
                    {
                        Assert.AreEqual(TermsEnum.SeekStatus.NOT_FOUND, status);
                        Assert.AreEqual(terms[upto], termsEnum.Term);
                    }
                }
                else
                {
                    Assert.AreEqual(TermsEnum.SeekStatus.FOUND, status);
                    Assert.AreEqual(terms[upto], termsEnum.Term);
                }
            }
            else
            {
                if (VERBOSE)
                {
                    Console.WriteLine("TEST: iter seekExact target=" + target.Utf8ToString() + " exists=" + exists);
                }
                // seekExact
                bool result = termsEnum.SeekExact(target);
                if (VERBOSE)
                {
                    Console.WriteLine(" got " + result);
                }
                if (upto < 0)
                {
                    // Target not in the snapshot: the seek must fail and the position is reset.
                    Assert.IsFalse(result);
                    upto = -1;
                }
                else
                {
                    Assert.IsTrue(result);
                    Assert.AreEqual(target, termsEnum.Term);
                }
            }
        }
    }
    r.Dispose();
    d.Dispose();
    docs.Dispose();
}
/// <summary>
/// Checks term-level statistics of two enums: document frequencies must match, and
/// total term frequencies are compared only when neither side reports -1.
/// <paramref name="info"/> is passed as the message of each assertion.
/// </summary>
public void AssertTermStatsEquals(string info, TermsEnum leftTermsEnum, TermsEnum rightTermsEnum)
{
    Assert.AreEqual(leftTermsEnum.DocFreq(), rightTermsEnum.DocFreq(), info);

    if (leftTermsEnum.TotalTermFreq() == -1 || rightTermsEnum.TotalTermFreq() == -1)
    {
        // At least one side reports -1, so total term frequency is not compared.
        return;
    }
    Assert.AreEqual(leftTermsEnum.TotalTermFreq(), rightTermsEnum.TotalTermFreq(), info);
}
/// <summary>
/// Asserts that two readers expose equivalent content, ignoring differences in
/// document numbering.
/// <para/>
/// The check runs in two phases:
/// <list type="number">
///   <item><description>
///     Every term of <paramref name="idField"/> in <paramref name="r1"/> is looked up in
///     <paramref name="r2"/>. Each id term must match at most one live document per reader;
///     the matched pair is recorded in an r2-docID to r1-docID map, and the pair's stored
///     fields and term vectors are compared.
///   </description></item>
///   <item><description>
///     All fields and terms of both readers are walked in lock-step. For each term the
///     live (docID, freq) postings are collected, r2's docIDs are translated into r1's
///     numbering, and the two lists are compared.
///   </description></item>
/// </list>
/// </summary>
/// <param name="r1">First reader; its docIDs are the reference numbering.</param>
/// <param name="r2">Second reader; its docIDs are mapped onto those of <paramref name="r1"/>.</param>
/// <param name="idField">Name of the field whose terms identify documents across both readers.</param>
public virtual void VerifyEquals(DirectoryReader r1, DirectoryReader r2, string idField)
{
    if (Verbose)
    {
        Console.WriteLine("\nr1 docs:");
        PrintDocs(r1);
        Console.WriteLine("\nr2 docs:");
        PrintDocs(r2);
    }
    if (r1.NumDocs != r2.NumDocs)
    {
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(false, () => "r1.NumDocs=" + r1.NumDocs + " vs r2.NumDocs=" + r2.NumDocs);
        }
    }

    // DocFreq is only compared later when neither reader carries deleted documents.
    bool hasDeletes = !(r1.MaxDoc == r2.MaxDoc && r1.NumDocs == r1.MaxDoc);

    int[] r2r1 = new int[r2.MaxDoc]; // r2 id to r1 id mapping

    // create mapping from id2 space to id1 space based on idField
    Fields f1 = MultiFields.GetFields(r1);
    if (f1 == null)
    {
        // make sure r2 is empty
        Assert.IsNull(MultiFields.GetFields(r2));
        return;
    }
    Terms terms1 = f1.GetTerms(idField);
    if (terms1 == null)
    {
        Assert.IsTrue(MultiFields.GetFields(r2) == null || MultiFields.GetFields(r2).GetTerms(idField) == null);
        return;
    }
    TermsEnum termsEnum = terms1.GetEnumerator();

    IBits liveDocs1 = MultiFields.GetLiveDocs(r1);
    IBits liveDocs2 = MultiFields.GetLiveDocs(r2);

    Fields fields = MultiFields.GetFields(r2);
    if (fields == null)
    {
        // make sure r1 is in fact empty (eg has only all
        // deleted docs):
        DocsEnum docs = null;
        while (termsEnum.MoveNext())
        {
            docs = TestUtil.Docs(Random, termsEnum, liveDocs1, docs, DocsFlags.NONE);
            while (docs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
            {
                Assert.Fail("r1 is not empty but r2 is");
            }
        }
        return;
    }
    Terms terms2 = fields.GetTerms(idField);
    // r1 has the id field, so r2 must too; fail with an assertion rather than a null dereference.
    Assert.IsNotNull(terms2);
    TermsEnum termsEnum2 = terms2.GetEnumerator();

    DocsEnum termDocs1 = null;
    DocsEnum termDocs2 = null;

    while (termsEnum.MoveNext())
    {
        BytesRef term = termsEnum.Term;
        termDocs1 = TestUtil.Docs(Random, termsEnum, liveDocs1, termDocs1, DocsFlags.NONE);
        if (termsEnum2.SeekExact(term))
        {
            termDocs2 = TestUtil.Docs(Random, termsEnum2, liveDocs2, termDocs2, DocsFlags.NONE);
        }
        else
        {
            termDocs2 = null;
        }

        if (termDocs1.NextDoc() == DocIdSetIterator.NO_MORE_DOCS)
        {
            // this doc is deleted and wasn't replaced
            Assert.IsTrue(termDocs2 == null || termDocs2.NextDoc() == DocIdSetIterator.NO_MORE_DOCS);
            continue;
        }

        // Exactly one live document per id term is expected in r1 ...
        int id1 = termDocs1.DocID;
        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, termDocs1.NextDoc());

        // ... and in r2. A term missing from r2 leaves termDocs2 null, so assert that first.
        Assert.IsNotNull(termDocs2);
        Assert.IsTrue(termDocs2.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
        int id2 = termDocs2.DocID;
        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, termDocs2.NextDoc());

        r2r1[id2] = id1;

        // verify stored fields are equivalent
        try
        {
            VerifyEquals(r1.Document(id1), r2.Document(id2));
        }
        catch (Exception /*t*/)
        {
            Console.WriteLine("FAILED id=" + term + " id1=" + id1 + " id2=" + id2 + " term=" + term);
            Console.WriteLine(" d1=" + r1.Document(id1));
            Console.WriteLine(" d2=" + r2.Document(id2));
            throw; // LUCENENET: CA2200: Rethrow to preserve stack details (https://docs.microsoft.com/en-us/visualstudio/code-quality/ca2200-rethrow-to-preserve-stack-details)
        }

        try
        {
            // verify term vectors are equivalent
            VerifyEquals(r1.GetTermVectors(id1), r2.GetTermVectors(id2));
        }
        catch (Exception /*e*/)
        {
            Console.WriteLine("FAILED id=" + term + " id1=" + id1 + " id2=" + id2);

            Fields tv1 = r1.GetTermVectors(id1);
            Console.WriteLine(" d1=" + tv1);
            if (tv1 != null)
            {
                PrintTermVectors(tv1);
            }

            Fields tv2 = r2.GetTermVectors(id2);
            Console.WriteLine(" d2=" + tv2);
            if (tv2 != null)
            {
                PrintTermVectors(tv2);
            }

            throw; // LUCENENET: CA2200: Rethrow to preserve stack details (https://docs.microsoft.com/en-us/visualstudio/code-quality/ca2200-rethrow-to-preserve-stack-details)
        }
    }

    // Verify postings
    Fields fields1 = MultiFields.GetFields(r1);
    IEnumerator<string> fields1Enum = fields1.GetEnumerator();
    Fields fields2 = MultiFields.GetFields(r2);
    IEnumerator<string> fields2Enum = fields2.GetEnumerator();

    string field1 = null, field2 = null;
    TermsEnum termsEnum1 = null;
    termsEnum2 = null;
    DocsEnum docs1 = null, docs2 = null;

    // pack both doc and freq into single element for easy sorting
    // (docID in the upper 32 bits, freq in the lower 32 bits)
    long[] info1 = new long[r1.NumDocs];
    long[] info2 = new long[r2.NumDocs];

    for (; ; )
    {
        BytesRef term1 = null, term2 = null;

        // iterate until we get some docs
        int len1;
        for (; ; )
        {
            len1 = 0;
            if (termsEnum1 == null)
            {
                if (!fields1Enum.MoveNext())
                {
                    break;
                }
                field1 = fields1Enum.Current;
                Terms terms = fields1.GetTerms(field1);
                if (terms == null)
                {
                    continue;
                }
                termsEnum1 = terms.GetEnumerator();
            }
            if (!termsEnum1.MoveNext())
            {
                term1 = null;
                // no more terms in this field
                termsEnum1 = null;
                continue;
            }
            term1 = termsEnum1.Term;

            docs1 = TestUtil.Docs(Random, termsEnum1, liveDocs1, docs1, DocsFlags.FREQS);
            while (docs1.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
            {
                int d = docs1.DocID;
                int f = docs1.Freq;
                info1[len1] = (((long)d) << 32) | (uint)f;
                len1++;
            }
            if (len1 > 0)
            {
                break;
            }
        }

        // iterate until we get some docs
        int len2;
        for (; ; )
        {
            len2 = 0;
            if (termsEnum2 == null)
            {
                if (!fields2Enum.MoveNext())
                {
                    break;
                }
                field2 = fields2Enum.Current;
                Terms terms = fields2.GetTerms(field2);
                if (terms == null)
                {
                    continue;
                }
                termsEnum2 = terms.GetEnumerator();
            }
            if (!termsEnum2.MoveNext())
            {
                term2 = null;
                // no more terms in this field
                termsEnum2 = null;
                continue;
            }
            term2 = termsEnum2.Term;

            docs2 = TestUtil.Docs(Random, termsEnum2, liveDocs2, docs2, DocsFlags.FREQS);
            while (docs2.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
            {
                // Translate r2's docID into r1's numbering so both lists are comparable.
                int d = r2r1[docs2.DocID];
                int f = docs2.Freq;
                info2[len2] = (((long)d) << 32) | (uint)f;
                len2++;
            }
            if (len2 > 0)
            {
                break;
            }
        }

        Assert.AreEqual(len1, len2);
        if (len1 == 0) // no more terms
        {
            break;
        }

        Assert.AreEqual(field1, field2);
        Assert.IsTrue(term1.BytesEquals(term2));

        if (!hasDeletes)
        {
            Assert.AreEqual(termsEnum1.DocFreq, termsEnum2.DocFreq);
        }

        Assert.AreEqual(term1, term2, "len1=" + len1 + " len2=" + len2 + " deletes?=" + hasDeletes);

        // sort info2 to get it into ascending docid
        Array.Sort(info2, 0, len2);

        // now compare
        for (int i = 0; i < len1; i++)
        {
            Assert.AreEqual(info1[i], info2[i], "i=" + i + " len=" + len1 + " d1=" + ((long)((ulong)info1[i] >> 32)) + " f1=" + (info1[i] & int.MaxValue) + " d2=" + ((long)((ulong)info2[i] >> 32)) + " f2=" + (info2[i] & int.MaxValue) + " field=" + field1 + " term=" + term1.Utf8ToString());
        }
    }
}

/// <summary>
/// Writes every field, term and posting of one document's term vectors to the console.
/// Used only to produce diagnostics after a term-vector comparison has failed.
/// </summary>
/// <param name="termVectors">Term vectors to dump; must not be null.</param>
private void PrintTermVectors(Fields termVectors)
{
    DocsAndPositionsEnum dpEnum = null;
    DocsEnum dEnum = null;
    foreach (string field in termVectors)
    {
        Console.WriteLine(" " + field + ":");
        Terms terms3 = termVectors.GetTerms(field);
        Assert.IsNotNull(terms3);
        TermsEnum termsEnum3 = terms3.GetEnumerator();
        while (termsEnum3.MoveNext())
        {
            Console.WriteLine(" " + termsEnum3.Term.Utf8ToString() + ": freq=" + termsEnum3.TotalTermFreq);
            dpEnum = termsEnum3.DocsAndPositions(null, dpEnum);
            if (dpEnum != null)
            {
                Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                int freq = dpEnum.Freq;
                Console.WriteLine(" doc=" + dpEnum.DocID + " freq=" + freq);
                for (int posUpto = 0; posUpto < freq; posUpto++)
                {
                    Console.WriteLine(" pos=" + dpEnum.NextPosition());
                }
            }
            else
            {
                // No positions available for this term: fall back to a docs+freqs enum.
                dEnum = TestUtil.Docs(Random, termsEnum3, null, dEnum, DocsFlags.FREQS);
                Assert.IsNotNull(dEnum);
                Assert.IsTrue(dEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                int freq = dEnum.Freq;
                Console.WriteLine(" doc=" + dEnum.DocID + " freq=" + freq);
            }
        }
    }
}