/// <summary>
/// Exhausts the given <see cref="DocsEnum"/> and returns the number of
/// documents it produced.
/// </summary>
private int CountDocs(DocsEnum docs)
{
    int total = 0;
    for (int doc = docs.NextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docs.NextDoc())
    {
        total++;
    }
    return total;
}
/// <summary>
/// Drains <paramref name="docsEnum"/> and returns all of its document IDs
/// as an array, in iteration order.
/// </summary>
public static int[] ToArray(DocsEnum docsEnum)
{
    IList<int?> collected = new List<int?>();
    while (docsEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
    {
        collected.Add(docsEnum.DocID());
    }
    return ArrayUtil.ToIntArray(collected);
}
/// <summary>
/// Steps through <paramref name="docs"/> with a random mix of <c>NextDoc()</c>
/// and <c>Advance()</c> calls, asserting each returned doc ID matches the
/// corresponding entry in <paramref name="expected"/>, and that the enum ends
/// with <c>NO_MORE_DOCS</c> exactly when the expected list is exhausted.
/// </summary>
private void TestOne(DocsEnum docs, IList<int> expected)
{
    if (VERBOSE)
    {
        Console.WriteLine("test");
    }
    // upto is our index into expected; -1 means "not yet positioned".
    int upto = -1;
    while (upto < expected.Count)
    {
        if (VERBOSE)
        {
            Console.WriteLine(" cycle upto=" + upto + " of " + expected.Count);
        }
        int docID;
        // Force NextDoc() on the last expected doc so we always verify the
        // final NO_MORE_DOCS transition; otherwise pick randomly.
        if (Random().Next(4) == 1 || upto == expected.Count - 1)
        {
            // test nextDoc()
            if (VERBOSE)
            {
                Console.WriteLine(" do nextDoc");
            }
            upto++;
            docID = docs.NextDoc();
        }
        else
        {
            // test advance()
            // jump forward by a random (>=1) number of expected docs and
            // advance directly to that target
            int inc = TestUtil.NextInt(Random(), 1, expected.Count - 1 - upto);
            if (VERBOSE)
            {
                Console.WriteLine(" do advance inc=" + inc);
            }
            upto += inc;
            docID = docs.Advance(expected[upto]);
        }
        if (upto == expected.Count)
        {
            // we walked off the end: the enum must be exhausted
            if (VERBOSE)
            {
                Console.WriteLine(" expect docID=" + DocIdSetIterator.NO_MORE_DOCS + " actual=" + docID);
            }
            Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, docID);
        }
        else
        {
            if (VERBOSE)
            {
                Console.WriteLine(" expect docID=" + expected[upto] + " actual=" + docID);
            }
            Assert.IsTrue(docID != DocIdSetIterator.NO_MORE_DOCS);
            Assert.AreEqual((int)expected[upto], docID);
        }
    }
}
/// <summary>
/// Collects every document ID produced by <paramref name="docsEnum"/> into
/// an <c>int[]</c>, preserving iteration order.
/// </summary>
public static int[] ToArray(DocsEnum docsEnum)
{
    IList<int?> result = new List<int?>();
    while (docsEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
    {
        result.Add(docsEnum.DocID);
    }
    return ArrayUtil.ToInt32Array(result);
}
/// <summary>
/// Collects every document ID produced by <paramref name="docsEnum"/> into
/// an <c>int[]</c>, preserving iteration order.
/// </summary>
public static int[] ToArray(DocsEnum docsEnum)
{
    JCG.List<int> collected = new JCG.List<int>();
    while (docsEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
    {
        collected.Add(docsEnum.DocID);
    }
    // LUCENENET: ArrayUtil.ToIntArray() call unnecessary because we aren't dealing with reference types
    return collected.ToArray();
}
/// <summary>
/// Advances to the first document whose global ID is >= <paramref name="target"/>,
/// walking the concatenated sub-enums in order and mapping each sub-enum's
/// local doc IDs into the global space via the slice's base offset.
/// </summary>
public override int Advance(int target)
{
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(target > doc);
    }
    while (true)
    {
        if (current != null)
        {
            // local doc ID within the current sub-enum; intentionally shadows
            // the 'doc' field (which holds the global doc ID)
            int doc;
            if (target < currentBase)
            {
                // target was in the previous slice but there was no matching doc after it
                doc = current.NextDoc();
            }
            else
            {
                // translate the global target into the sub-enum's local space
                doc = current.Advance(target - currentBase);
            }
            if (doc == NO_MORE_DOCS)
            {
                // current sub-enum exhausted; loop around to the next one
                current = null;
            }
            else
            {
                // found a hit: publish and return it in the global doc-ID space
                return(this.doc = doc + currentBase);
            }
        }
        else if (upto == numSubs - 1)
        {
            // no sub-enums left
            return(this.doc = NO_MORE_DOCS);
        }
        else
        {
            // move on to the next sub-enum/slice and retry
            upto++;
            current = subs[upto].DocsEnum;
            currentBase = subs[upto].Slice.Start;
        }
    }
}
/// <summary>
/// Indexes the same single-term document into two separate segments (via an
/// intervening commit) and verifies that the multi-segment term docs enum
/// visits both documents in order before signalling exhaustion.
/// </summary>
public virtual void TestTermDocsEnum()
{
    Directory directory = NewDirectory();
    IndexWriter writer = new IndexWriter(directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)));
    Documents.Document document = new Documents.Document();
    document.Add(NewStringField("f", "j", Field.Store.NO));
    writer.AddDocument(document);
    writer.Commit();
    writer.AddDocument(document);
    IndexReader reader = writer.GetReader();
    writer.Dispose();

    DocsEnum docsEnum = MultiFields.GetTermDocsEnum(reader, null, "f", new BytesRef("j"));
    Assert.AreEqual(0, docsEnum.NextDoc());
    Assert.AreEqual(1, docsEnum.NextDoc());
    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, docsEnum.NextDoc());

    reader.Dispose();
    directory.Dispose();
}
/// <summary>
/// Advances to the first document whose global ID is >= <paramref name="target"/>,
/// walking the concatenated sub-enums in order and mapping each sub-enum's
/// local doc IDs into the global space via the slice's base offset.
/// </summary>
public override int Advance(int target)
{
    Debug.Assert(target > Doc);
    while (true)
    {
        if (Current != null)
        {
            // local doc ID within the current sub-enum (Doc holds the global ID)
            int doc;
            if (target < CurrentBase)
            {
                // target was in the previous slice but there was no matching doc after it
                doc = Current.NextDoc();
            }
            else
            {
                // translate the global target into the sub-enum's local space
                doc = Current.Advance(target - CurrentBase);
            }
            if (doc == NO_MORE_DOCS)
            {
                // current sub-enum exhausted; loop around to the next one
                Current = null;
            }
            else
            {
                // found a hit: publish and return it in the global doc-ID space
                return(this.Doc = doc + CurrentBase);
            }
        }
        else if (Upto == NumSubs_Renamed - 1)
        {
            // no sub-enums left
            return(this.Doc = NO_MORE_DOCS);
        }
        else
        {
            // move on to the next sub-enum/slice and retry
            Upto++;
            Current = Subs_Renamed[Upto].DocsEnum;
            CurrentBase = Subs_Renamed[Upto].Slice.Start;
        }
    }
}
/// <summary>
/// Micro-benchmark: builds an index of <paramref name="ndocs"/> docs for term
/// "foo:val", then iterates its postings <paramref name="iter"/> times,
/// summing the doc IDs (so the work cannot be optimized away). Returns the sum.
/// </summary>
/// <remarks>
/// NOTE(review): Environment.TickCount has coarse (~10-16ms) resolution and
/// wraps after ~24.9 days, so these timings are approximate.
/// NOTE(review): neither <c>reader</c> nor <c>dir</c> is disposed here —
/// presumably cleaned up by test-framework teardown; confirm.
/// </remarks>
public virtual int DoTest(int iter, int ndocs, int maxTF, float percentDocs)
{
    Directory dir = NewDirectory();
    long start = Environment.TickCount;
    AddDocs(Random(), dir, ndocs, "foo", "val", maxTF, percentDocs);
    long end = Environment.TickCount;
    if (VERBOSE)
    {
        Console.WriteLine("milliseconds for creation of " + ndocs + " docs = " + (end - start));
    }
    IndexReader reader = DirectoryReader.Open(dir);
    TermsEnum tenum = MultiFields.GetTerms(reader, "foo").GetIterator(null);
    start = Environment.TickCount;
    int ret = 0;
    DocsEnum tdocs = null;
    // fixed-seed Random derived from the test's randomness, reused every iteration
    Random random = new Random(Random().Next());
    for (int i = 0; i < iter; i++)
    {
        tenum.SeekCeil(new BytesRef("val"));
        // re-use the same DocsEnum instance across iterations where the codec allows
        tdocs = TestUtil.Docs(random, tenum, MultiFields.GetLiveDocs(reader), tdocs, DocsFlags.NONE);
        while (tdocs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
        {
            ret += tdocs.DocID;
        }
    }
    end = Environment.TickCount;
    if (VERBOSE)
    {
        Console.WriteLine("milliseconds for " + iter + " TermDocs iteration: " + (end - start));
    }
    return(ret);
}
/// <summary>
/// Micro-benchmark: builds an index of <paramref name="ndocs"/> docs for term
/// "foo:val", then iterates its postings <paramref name="iter"/> times,
/// summing the doc IDs (so the work cannot be optimized away). Returns the sum.
/// </summary>
/// <remarks>
/// NOTE(review): neither <c>reader</c> nor <c>dir</c> is disposed here —
/// presumably cleaned up by test-framework teardown; confirm.
/// </remarks>
public virtual int DoTest(int iter, int ndocs, int maxTF, float percentDocs)
{
    Directory dir = NewDirectory();
    long start = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results
    AddDocs(LuceneTestCase.Random, dir, ndocs, "foo", "val", maxTF, percentDocs);
    long end = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results
    if (Verbose)
    {
        Console.WriteLine("milliseconds for creation of " + ndocs + " docs = " + (end - start));
    }
    IndexReader reader = DirectoryReader.Open(dir);
    TermsEnum tenum = MultiFields.GetTerms(reader, "foo").GetEnumerator();
    start = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results
    int ret = 0;
    DocsEnum tdocs = null;
    // fixed-seed Random derived from the test's randomness, reused every iteration
    Random random = new Random(Random.Next());
    for (int i = 0; i < iter; i++)
    {
        tenum.SeekCeil(new BytesRef("val"));
        // re-use the same DocsEnum instance across iterations where the codec allows
        tdocs = TestUtil.Docs(random, tenum, MultiFields.GetLiveDocs(reader), tdocs, DocsFlags.NONE);
        while (tdocs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
        {
            ret += tdocs.DocID;
        }
    }
    end = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond; // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results
    if (Verbose)
    {
        Console.WriteLine("milliseconds for " + iter + " TermDocs iteration: " + (end - start));
    }
    return(ret);
}
/// <summary>
/// Verifies that two <see cref="DocsEnum"/> instances obtained for the same
/// term iterate independently: positioning one must not affect the other.
/// </summary>
public virtual void TestSeparateEnums()
{
    Directory directory = NewDirectory();
    IndexWriter writer = new IndexWriter(directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)));
    Documents.Document document = new Documents.Document();
    document.Add(NewStringField("f", "j", Field.Store.NO));
    writer.AddDocument(document);
    writer.Commit();
    writer.AddDocument(document);
    IndexReader reader = writer.GetReader();
    writer.Dispose();

    DocsEnum first = TestUtil.Docs(Random, reader, "f", new BytesRef("j"), null, null, DocsFlags.NONE);
    DocsEnum second = TestUtil.Docs(Random, reader, "f", new BytesRef("j"), null, null, DocsFlags.NONE);
    // both enums independently start at doc 0
    Assert.AreEqual(0, first.NextDoc());
    Assert.AreEqual(0, second.NextDoc());

    reader.Dispose();
    directory.Dispose();
}
/// <summary>
/// After adding the document, we should be able to read it back in: opens a
/// SegmentReader with the given index divisor, seeks to the "field" term, and
/// checks the single posting (doc 0, freq 3).
/// </summary>
public virtual void TestTermDocs(int indexDivisor)
{
    SegmentReader reader = new SegmentReader(Info, indexDivisor, NewIOContext(Random()));
    Assert.IsTrue(reader != null);
    Assert.AreEqual(indexDivisor, reader.TermInfosIndexDivisor);

    TermsEnum termsEnum = reader.Fields.Terms(DocHelper.TEXT_FIELD_2_KEY).Iterator(null);
    termsEnum.SeekCeil(new BytesRef("field"));
    DocsEnum docs = TestUtil.Docs(Random(), termsEnum, reader.LiveDocs, null, DocsEnum.FLAG_FREQS);
    if (docs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
    {
        Assert.IsTrue(docs.DocID() == 0);
        Assert.IsTrue(docs.Freq() == 3);
    }
    reader.Dispose();
}
/// <summary>
/// Indexes 100 identical documents with DOCS_AND_FREQS only and verifies that
/// no positions enum is exposed while term frequencies are still correct.
/// </summary>
public virtual void TestBasic()
{
    Directory directory = NewDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, directory);
    Document document = new Document();
    FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
    fieldType.IndexOptions = IndexOptions.DOCS_AND_FREQS;
    Field field = NewField("foo", "this is a test test", fieldType);
    document.Add(field);
    for (int i = 0; i < 100; i++)
    {
        writer.AddDocument(document);
    }
    IndexReader reader = writer.GetReader();
    writer.Dispose();

    // positions were not indexed, so no positions enum may be returned
    Assert.IsNull(MultiFields.GetTermPositionsEnum(reader, null, "foo", new BytesRef("test")));

    DocsEnum docsEnum = TestUtil.Docs(Random, reader, "foo", new BytesRef("test"), null, null, DocsFlags.FREQS);
    while (docsEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
    {
        // "test" appears twice in every document
        Assert.AreEqual(2, docsEnum.Freq);
    }

    reader.Dispose();
    directory.Dispose();
}
/// <summary>
/// Exercises DocsEnum.Advance() skipping on a single merged segment with three
/// term ranges: "aaa" (10 docs, below the skip interval), "bbb" (16 docs,
/// exactly one skip interval) and "ccc" (50 docs, several skip intervals).
/// Each range is checked both after priming with NextDoc() ("with next") and
/// via Advance() alone ("without next").
/// </summary>
public virtual void TestSkipTo(int indexDivisor)
{
    Directory dir = NewDirectory();
    IndexWriter writer = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetMergePolicy(NewLogMergePolicy()));
    // docs 0-9: term "aaa", freq 4 per doc
    Term ta = new Term("content", "aaa");
    for (int i = 0; i < 10; i++)
    {
        AddDoc(writer, "aaa aaa aaa aaa");
    }
    // docs 10-25: term "bbb", freq 4 per doc
    Term tb = new Term("content", "bbb");
    for (int i = 0; i < 16; i++)
    {
        AddDoc(writer, "bbb bbb bbb bbb");
    }
    // docs 26-75: term "ccc", freq 4 per doc
    Term tc = new Term("content", "ccc");
    for (int i = 0; i < 50; i++)
    {
        AddDoc(writer, "ccc ccc ccc ccc");
    }
    // assure that we deal with a single segment
    writer.ForceMerge(1);
    writer.Dispose();
    IndexReader reader = DirectoryReader.Open(dir, indexDivisor);
    DocsEnum tdocs = TestUtil.Docs(Random(), reader, ta.Field(), new BytesRef(ta.Text()), MultiFields.GetLiveDocs(reader), null, DocsEnum.FLAG_FREQS);
    // without optimization (assumption skipInterval == 16)
    // with next
    Assert.IsTrue(tdocs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(0, tdocs.DocID());
    Assert.AreEqual(4, tdocs.Freq());
    Assert.IsTrue(tdocs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(1, tdocs.DocID());
    Assert.AreEqual(4, tdocs.Freq());
    Assert.IsTrue(tdocs.Advance(2) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(2, tdocs.DocID());
    Assert.IsTrue(tdocs.Advance(4) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(4, tdocs.DocID());
    Assert.IsTrue(tdocs.Advance(9) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(9, tdocs.DocID());
    // advancing past the last "aaa" doc must exhaust the enum
    Assert.IsFalse(tdocs.Advance(10) != DocIdSetIterator.NO_MORE_DOCS);
    // without next
    tdocs = TestUtil.Docs(Random(), reader, ta.Field(), new BytesRef(ta.Text()), MultiFields.GetLiveDocs(reader), null, 0);
    Assert.IsTrue(tdocs.Advance(0) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(0, tdocs.DocID());
    Assert.IsTrue(tdocs.Advance(4) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(4, tdocs.DocID());
    Assert.IsTrue(tdocs.Advance(9) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(9, tdocs.DocID());
    Assert.IsFalse(tdocs.Advance(10) != DocIdSetIterator.NO_MORE_DOCS);
    // exactly skipInterval documents and therefore with optimization
    // with next
    tdocs = TestUtil.Docs(Random(), reader, tb.Field(), new BytesRef(tb.Text()), MultiFields.GetLiveDocs(reader), null, DocsEnum.FLAG_FREQS);
    Assert.IsTrue(tdocs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(10, tdocs.DocID());
    Assert.AreEqual(4, tdocs.Freq());
    Assert.IsTrue(tdocs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(11, tdocs.DocID());
    Assert.AreEqual(4, tdocs.Freq());
    Assert.IsTrue(tdocs.Advance(12) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(12, tdocs.DocID());
    Assert.IsTrue(tdocs.Advance(15) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(15, tdocs.DocID());
    Assert.IsTrue(tdocs.Advance(24) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(24, tdocs.DocID());
    Assert.IsTrue(tdocs.Advance(25) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(25, tdocs.DocID());
    Assert.IsFalse(tdocs.Advance(26) != DocIdSetIterator.NO_MORE_DOCS);
    // without next
    // NOTE(review): FLAG_FREQS is requested here although Freq() is never read
    // in this section; the other "without next" sections pass 0 — presumably
    // this could be 0 as well. Confirm against upstream before changing.
    tdocs = TestUtil.Docs(Random(), reader, tb.Field(), new BytesRef(tb.Text()), MultiFields.GetLiveDocs(reader), null, DocsEnum.FLAG_FREQS);
    // advancing to a target before the term's first doc lands on that first doc
    Assert.IsTrue(tdocs.Advance(5) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(10, tdocs.DocID());
    Assert.IsTrue(tdocs.Advance(15) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(15, tdocs.DocID());
    Assert.IsTrue(tdocs.Advance(24) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(24, tdocs.DocID());
    Assert.IsTrue(tdocs.Advance(25) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(25, tdocs.DocID());
    Assert.IsFalse(tdocs.Advance(26) != DocIdSetIterator.NO_MORE_DOCS);
    // much more than skipInterval documents and therefore with optimization
    // with next
    tdocs = TestUtil.Docs(Random(), reader, tc.Field(), new BytesRef(tc.Text()), MultiFields.GetLiveDocs(reader), null, DocsEnum.FLAG_FREQS);
    Assert.IsTrue(tdocs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(26, tdocs.DocID());
    Assert.AreEqual(4, tdocs.Freq());
    Assert.IsTrue(tdocs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(27, tdocs.DocID());
    Assert.AreEqual(4, tdocs.Freq());
    Assert.IsTrue(tdocs.Advance(28) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(28, tdocs.DocID());
    Assert.IsTrue(tdocs.Advance(40) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(40, tdocs.DocID());
    Assert.IsTrue(tdocs.Advance(57) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(57, tdocs.DocID());
    Assert.IsTrue(tdocs.Advance(74) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(74, tdocs.DocID());
    Assert.IsTrue(tdocs.Advance(75) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(75, tdocs.DocID());
    Assert.IsFalse(tdocs.Advance(76) != DocIdSetIterator.NO_MORE_DOCS);
    //without next
    tdocs = TestUtil.Docs(Random(), reader, tc.Field(), new BytesRef(tc.Text()), MultiFields.GetLiveDocs(reader), null, 0);
    Assert.IsTrue(tdocs.Advance(5) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(26, tdocs.DocID());
    Assert.IsTrue(tdocs.Advance(40) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(40, tdocs.DocID());
    Assert.IsTrue(tdocs.Advance(57) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(57, tdocs.DocID());
    Assert.IsTrue(tdocs.Advance(74) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(74, tdocs.DocID());
    Assert.IsTrue(tdocs.Advance(75) != DocIdSetIterator.NO_MORE_DOCS);
    Assert.AreEqual(75, tdocs.DocID());
    Assert.IsFalse(tdocs.Advance(76) != DocIdSetIterator.NO_MORE_DOCS);
    reader.Dispose();
    dir.Dispose();
}
/// <summary>
/// Checks that two <see cref="DocsEnum"/>s agree document-by-document (and,
/// when <paramref name="hasFreqs"/> is true, frequency-by-frequency),
/// iterating sequentially until both are exhausted at the same point.
/// </summary>
public void AssertDocsEnumEquals(string info, DocsEnum leftDocs, DocsEnum rightDocs, bool hasFreqs)
{
    if (leftDocs == null)
    {
        Assert.IsNull(rightDocs);
        return;
    }
    // both enums must start unpositioned
    Assert.AreEqual(-1, leftDocs.DocID(), info);
    Assert.AreEqual(-1, rightDocs.DocID(), info);
    for (int doc = leftDocs.NextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = leftDocs.NextDoc())
    {
        Assert.AreEqual(doc, rightDocs.NextDoc(), info);
        if (hasFreqs)
        {
            Assert.AreEqual(leftDocs.Freq(), rightDocs.Freq(), info);
        }
    }
    // the right enum must be exhausted at exactly the same point as the left
    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, rightDocs.NextDoc(), info);
}
/// <summary>
/// Randomized round-trip test: indexes a random set of terms (optionally with
/// duplicates) across random commits and deletions, then verifies for a sample
/// of terms that the postings enum returns exactly the non-deleted doc IDs
/// recorded at index time, in order, followed by NO_MORE_DOCS.
/// </summary>
public virtual void TestRandom()
{
    int num = AtLeast(2);
    for (int iter = 0; iter < num; iter++)
    {
        if (VERBOSE)
        {
            Console.WriteLine("TEST: iter=" + iter);
        }
        Directory dir = NewDirectory();
        IndexWriter w = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetMergePolicy(NoMergePolicy.COMPOUND_FILES));
        // we can do this because we use NoMergePolicy (and dont merge to "nothing")
        w.KeepFullyDeletedSegments = true;
        // term -> list of doc IDs that contain it (expected postings)
        IDictionary<BytesRef, IList<int?>> docs = new Dictionary<BytesRef, IList<int?>>();
        ISet<int?> deleted = new JCG.HashSet<int?>();
        IList<BytesRef> terms = new List<BytesRef>();
        int numDocs = TestUtil.NextInt32(Random, 1, 100 * RANDOM_MULTIPLIER);
        // a single reusable Document/Field pair; values are swapped per doc
        Documents.Document doc = new Documents.Document();
        Field f = NewStringField("field", "", Field.Store.NO);
        doc.Add(f);
        Field id = NewStringField("id", "", Field.Store.NO);
        doc.Add(id);
        bool onlyUniqueTerms = Random.NextBoolean();
        if (VERBOSE)
        {
            Console.WriteLine("TEST: onlyUniqueTerms=" + onlyUniqueTerms + " numDocs=" + numDocs);
        }
        ISet<BytesRef> uniqueTerms = new JCG.HashSet<BytesRef>();
        for (int i = 0; i < numDocs; i++)
        {
            if (!onlyUniqueTerms && Random.NextBoolean() && terms.Count > 0)
            {
                // re-use existing term
                BytesRef term = terms[Random.Next(terms.Count)];
                docs[term].Add(i);
                f.SetStringValue(term.Utf8ToString());
            }
            else
            {
                // create a fresh random term and record this doc under it
                string s = TestUtil.RandomUnicodeString(Random, 10);
                BytesRef term = new BytesRef(s);
                if (!docs.TryGetValue(term, out IList<int?> docsTerm))
                {
                    docs[term] = docsTerm = new List<int?>();
                }
                docsTerm.Add(i);
                terms.Add(term);
                uniqueTerms.Add(term);
                f.SetStringValue(s);
            }
            id.SetStringValue("" + i);
            w.AddDocument(doc);
            // random commits create multiple segments
            if (Random.Next(4) == 1)
            {
                w.Commit();
            }
            // occasionally delete a previously-added doc and remember it
            if (i > 0 && Random.Next(20) == 1)
            {
                int delID = Random.Next(i);
                deleted.Add(delID);
                w.DeleteDocuments(new Term("id", "" + delID));
                if (VERBOSE)
                {
                    Console.WriteLine("TEST: delete " + delID);
                }
            }
        }
        if (VERBOSE)
        {
            List<BytesRef> termsList = new List<BytesRef>(uniqueTerms);
#pragma warning disable 612, 618
            termsList.Sort(BytesRef.UTF8SortedAsUTF16Comparer);
#pragma warning restore 612, 618
            Console.WriteLine("TEST: terms in UTF16 order:");
            foreach (BytesRef b in termsList)
            {
                Console.WriteLine(" " + UnicodeUtil.ToHexString(b.Utf8ToString()) + " " + b);
                foreach (int docID in docs[b])
                {
                    if (deleted.Contains(docID))
                    {
                        Console.WriteLine(" " + docID + " (deleted)");
                    }
                    else
                    {
                        Console.WriteLine(" " + docID);
                    }
                }
            }
        }
        IndexReader reader = w.GetReader();
        w.Dispose();
        if (VERBOSE)
        {
            Console.WriteLine("TEST: reader=" + reader);
        }
        // sanity: every deleted doc must show as not-live
        IBits liveDocs = MultiFields.GetLiveDocs(reader);
        foreach (int delDoc in deleted)
        {
            Assert.IsFalse(liveDocs.Get(delDoc));
        }
        // verify postings for 100 randomly-sampled terms
        for (int i = 0; i < 100; i++)
        {
            BytesRef term = terms[Random.Next(terms.Count)];
            if (VERBOSE)
            {
                Console.WriteLine("TEST: seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()) + " " + term);
            }
            DocsEnum docsEnum = TestUtil.Docs(Random, reader, "field", term, liveDocs, null, DocsFlags.NONE);
            Assert.IsNotNull(docsEnum);
            foreach (int docID in docs[term])
            {
                if (!deleted.Contains(docID))
                {
                    Assert.AreEqual(docID, docsEnum.NextDoc());
                }
            }
            Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, docsEnum.NextDoc());
        }
        reader.Dispose();
        dir.Dispose();
    }
}
/// <summary>
/// Randomized test of <c>Terms.Intersect(CompiledAutomaton, startTerm)</c>:
/// indexes a random term set (each term in exactly one doc), builds random
/// accept-automatons over those terms, and checks that the intersecting
/// TermsEnum returns exactly the indexed terms accepted by the automaton that
/// sort after the random start term, each mapped to the correct doc.
/// </summary>
public virtual void TestIntersectRandom()
{
    Directory dir = NewDirectory();
    RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        Random, dir);
    int numTerms = AtLeast(300);
    //final int numTerms = 50;
    HashSet<string> terms = new HashSet<string>();
    ICollection<string> pendingTerms = new List<string>();
    // each indexed term maps to the single doc ID that contains it
    IDictionary<BytesRef, int?> termToID = new Dictionary<BytesRef, int?>();
    int id = 0;
    while (terms.Count != numTerms)
    {
        string s = RandomString;
        if (!terms.Contains(s))
        {
            terms.Add(s);
            pendingTerms.Add(s);
            // occasionally flush the pending terms into a doc
            if (Random.Next(20) == 7)
            {
                AddDoc(w, pendingTerms, termToID, id++);
            }
        }
    }
    // flush whatever terms remain pending
    AddDoc(w, pendingTerms, termToID, id++);
    BytesRef[] termsArray = new BytesRef[terms.Count];
    HashSet<BytesRef> termsSet = new HashSet<BytesRef>();
    {
        int upto = 0;
        foreach (string s in terms)
        {
            BytesRef b = new BytesRef(s);
            termsArray[upto++] = b;
            termsSet.Add(b);
        }
        Array.Sort(termsArray);
    }
    if (VERBOSE)
    {
        Console.WriteLine("\nTEST: indexed terms (unicode order):");
        foreach (BytesRef t in termsArray)
        {
            Console.WriteLine(" " + t.Utf8ToString() + " -> id:" + termToID[t]);
        }
    }
    IndexReader r = w.GetReader();
    w.Dispose();
    // NOTE: intentional insanity!!
    FieldCache.Int32s docIDToID = FieldCache.DEFAULT.GetInt32s(SlowCompositeReaderWrapper.Wrap(r), "id", false);
    for (int iter = 0; iter < 10 * RANDOM_MULTIPLIER; iter++)
    {
        // TODO: can we also test infinite As here...?
        // From the random terms, pick some ratio and compile an
        // automaton:
        HashSet<string> acceptTerms = new HashSet<string>();
        SortedSet<BytesRef> sortedAcceptTerms = new SortedSet<BytesRef>();
        double keepPct = Random.NextDouble();
        Automaton a;
        if (iter == 0)
        {
            // first iteration uses the empty automaton (accepts nothing)
            if (VERBOSE)
            {
                Console.WriteLine("\nTEST: empty automaton");
            }
            a = BasicAutomata.MakeEmpty();
        }
        else
        {
            if (VERBOSE)
            {
                Console.WriteLine("\nTEST: keepPct=" + keepPct);
            }
            // keep ~keepPct of the indexed terms; replace the rest with fresh
            // random strings (which may or may not exist in the index)
            foreach (string s in terms)
            {
                string s2;
                if (Random.NextDouble() <= keepPct)
                {
                    s2 = s;
                }
                else
                {
                    s2 = RandomString;
                }
                acceptTerms.Add(s2);
                sortedAcceptTerms.Add(new BytesRef(s2));
            }
            a = BasicAutomata.MakeStringUnion(sortedAcceptTerms);
        }
        if (Random.NextBoolean())
        {
            if (VERBOSE)
            {
                Console.WriteLine("TEST: reduce the automaton");
            }
            a.Reduce();
        }
        CompiledAutomaton c = new CompiledAutomaton(a, true, false);
        BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.Count];
        HashSet<BytesRef> acceptTermsSet = new HashSet<BytesRef>();
        int upto = 0;
        foreach (string s in acceptTerms)
        {
            BytesRef b = new BytesRef(s);
            acceptTermsArray[upto++] = b;
            acceptTermsSet.Add(b);
            // sanity: the compiled automaton must accept its own source terms
            Assert.IsTrue(Accepts(c, b));
        }
        Array.Sort(acceptTermsArray);
        if (VERBOSE)
        {
            Console.WriteLine("\nTEST: accept terms (unicode order):");
            foreach (BytesRef t in acceptTermsArray)
            {
                Console.WriteLine(" " + t.Utf8ToString() + (termsSet.Contains(t) ? " (exists)" : ""));
            }
            Console.WriteLine(a.ToDot());
        }
        for (int iter2 = 0; iter2 < 100; iter2++)
        {
            // pick a random start term (or null = start from the beginning)
            BytesRef startTerm = acceptTermsArray.Length == 0 || Random.NextBoolean() ? null : acceptTermsArray[Random.Next(acceptTermsArray.Length)];
            if (VERBOSE)
            {
                Console.WriteLine("\nTEST: iter2=" + iter2 + " startTerm=" + (startTerm == null ? "<null>" : startTerm.Utf8ToString()));
                if (startTerm != null)
                {
                    // trace the automaton's path over the start term's bytes
                    int state = c.RunAutomaton.InitialState;
                    for (int idx = 0; idx < startTerm.Length; idx++)
                    {
                        int label = startTerm.Bytes[startTerm.Offset + idx] & 0xff;
                        Console.WriteLine(" state=" + state + " label=" + label);
                        state = c.RunAutomaton.Step(state, label);
                        Assert.IsTrue(state != -1);
                    }
                    Console.WriteLine(" state=" + state);
                }
            }
            TermsEnum te = MultiFields.GetTerms(r, "f").Intersect(c, startTerm);
            // loc: index of the first expected term in termsArray
            int loc;
            if (startTerm == null)
            {
                loc = 0;
            }
            else
            {
                loc = Array.BinarySearch(termsArray, BytesRef.DeepCopyOf(startTerm));
                if (loc < 0)
                {
                    // not indexed: start from the insertion point
                    loc = -(loc + 1);
                }
                else
                {
                    // startTerm exists in index
                    loc++;
                }
            }
            // skip indexed terms not accepted by the automaton
            while (loc < termsArray.Length && !acceptTermsSet.Contains(termsArray[loc]))
            {
                loc++;
            }
            DocsEnum docsEnum = null;
            while (loc < termsArray.Length)
            {
                BytesRef expected = termsArray[loc];
                BytesRef actual = te.Next();
                if (VERBOSE)
                {
                    Console.WriteLine("TEST: next() expected=" + expected.Utf8ToString() + " actual=" + (actual == null ? "null" : actual.Utf8ToString()));
                }
                Assert.AreEqual(expected, actual);
                // each term was indexed into exactly one doc
                Assert.AreEqual(1, te.DocFreq);
                docsEnum = TestUtil.Docs(Random, te, null, docsEnum, DocsFlags.NONE);
                int docID = docsEnum.NextDoc();
                Assert.IsTrue(docID != DocIdSetIterator.NO_MORE_DOCS);
                Assert.AreEqual(docIDToID.Get(docID), (int)termToID[expected]);
                // advance to the next accepted indexed term
                do
                {
                    loc++;
                } while (loc < termsArray.Length && !acceptTermsSet.Contains(termsArray[loc]));
            }
            // the intersecting enum must be exhausted at the same point
            Assert.IsNull(te.Next());
        }
    }
    r.Dispose();
    dir.Dispose();
}
// Delete by Term
/// <summary>
/// Applies the given (sorted) term deletes against one segment, marking each
/// matching live doc as deleted via <paramref name="rld"/>. Returns the number
/// of docs actually deleted. Synchronized on <c>this</c>.
/// </summary>
private long ApplyTermDeletes(IEnumerable<Term> termsIter, ReadersAndUpdates rld, SegmentReader reader)
{
    lock (this)
    {
        long delCount = 0;
        Fields fields = reader.Fields;
        if (fields == null)
        {
            // this reader has no postings
            return(0);
        }
        TermsEnum termsEnum = null;
        string currentField = null;
        DocsEnum docs = null;
        Debug.Assert(CheckDeleteTerm(null));
        // becomes true once the first doc is deleted (lazily makes live docs writable)
        bool any = false;
        //System.out.println(Thread.currentThread().getName() + " del terms reader=" + reader);
        foreach (Term term in termsIter)
        {
            // Since we visit terms sorted, we gain performance
            // by re-using the same TermsEnum and seeking only
            // forwards
            if (!term.Field().Equals(currentField))
            {
                // fields must also arrive in ascending order
                Debug.Assert(currentField == null || currentField.CompareTo(term.Field()) < 0);
                currentField = term.Field();
                Terms terms = fields.Terms(currentField);
                if (terms != null)
                {
                    termsEnum = terms.Iterator(termsEnum);
                }
                else
                {
                    // field has no postings in this segment
                    termsEnum = null;
                }
            }
            if (termsEnum == null)
            {
                continue;
            }
            Debug.Assert(CheckDeleteTerm(term));
            // System.out.println("  term=" + term);
            if (termsEnum.SeekExact(term.Bytes()))
            {
                // we don't need term frequencies for this
                DocsEnum docsEnum = termsEnum.Docs(rld.LiveDocs, docs, DocsEnum.FLAG_NONE);
                //System.out.println("BDS: got docsEnum=" + docsEnum);
                if (docsEnum != null)
                {
                    while (true)
                    {
                        int docID = docsEnum.NextDoc();
                        //System.out.println(Thread.currentThread().getName() + " del term=" + term + " doc=" + docID);
                        if (docID == DocIdSetIterator.NO_MORE_DOCS)
                        {
                            break;
                        }
                        if (!any)
                        {
                            rld.InitWritableLiveDocs();
                            any = true;
                        }
                        // NOTE: there is no limit check on the docID
                        // when deleting by Term (unlike by Query)
                        // because on flush we apply all Term deletes to
                        // each segment. So all Term deleting here is
                        // against prior segments:
                        if (rld.Delete(docID))
                        {
                            delCount++;
                        }
                    }
                }
            }
        }
        return(delCount);
    }
}
/// <summary>
/// Sets a bit in <paramref name="bitSet"/> for every document of the current
/// term of <c>termsEnum</c> (honoring <c>acceptDocs</c>; freqs not requested).
/// </summary>
protected virtual void CollectDocs(FixedBitSet bitSet)
{
    //WARN: keep this specialization in sync
    Debug.Assert(termsEnum != null);
    docsEnum = termsEnum.Docs(acceptDocs, docsEnum, DocsEnum.FLAG_NONE);
    for (int doc = docsEnum.NextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = docsEnum.NextDoc())
    {
        bitSet.Set(doc);
    }
}
/// <summary>
/// Stress test (LUCENE-1130): starts several indexer threads, disposes the
/// writer (without waiting for merges) once at least one thread has indexed a
/// document, and verifies that no indexer thread hangs and that the resulting
/// index is readable and non-empty.
/// </summary>
public virtual void TestCloseWithThreads([ValueSource(typeof(ConcurrentMergeSchedulerFactories), "Values")] Func<IConcurrentMergeScheduler> newScheduler)
{
    int NUM_THREADS = 3;
    int numIterations = TEST_NIGHTLY ? 7 : 3;
    for (int iter = 0; iter < numIterations; iter++)
    {
        if (VERBOSE)
        {
            Console.WriteLine("\nTEST: iter=" + iter);
        }
        Directory dir = NewDirectory();
        var config = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random))
            .SetMaxBufferedDocs(10)
            .SetMergeScheduler(newScheduler())
            .SetMergePolicy(NewLogMergePolicy(4));
        IndexWriter writer = new IndexWriter(dir, config);
        // suppress merge exceptions so a racing close doesn't fail the test
        var scheduler = config.mergeScheduler as IConcurrentMergeScheduler;
        if (scheduler != null)
        {
            scheduler.SetSuppressExceptions();
        }
        IndexerThread[] threads = new IndexerThread[NUM_THREADS];
        for (int i = 0; i < NUM_THREADS; i++)
        {
            threads[i] = new IndexerThread(writer, false, NewField)
            // LUCENENET NOTE - ConcurrentMergeScheduler
            // used to take too long for this test to index a single document
            // so, increased the time from 200 to 300 ms.
            // But it has now been restored to 200 ms like Lucene.
            { TimeToRunInMilliseconds = 200 };
        }
        for (int i = 0; i < NUM_THREADS; i++)
        {
            threads[i].Start();
        }
        bool done = false;
        while (!done)
        {
            Thread.Sleep(100);
            for (int i = 0; i < NUM_THREADS; i++) // only stop when at least one thread has added a doc
            {
                if (threads[i].AddCount > 0)
                {
                    done = true;
                    break;
                }
                else if (!threads[i].IsAlive)
                {
                    Assert.Fail("thread failed before indexing a single document");
                }
            }
        }
        if (VERBOSE)
        {
            Console.WriteLine("\nTEST: now close");
        }
        // false = do not wait for merges to finish
        writer.Dispose(false);
        // Make sure threads that are adding docs are not hung:
        for (int i = 0; i < NUM_THREADS; i++)
        {
            // Without fix for LUCENE-1130: one of the
            // threads will hang
            threads[i].Join();
            if (threads[i].IsAlive)
            {
                Assert.Fail("thread seems to be hung");
            }
        }
        // Quick test to make sure index is not corrupt:
        IndexReader reader = DirectoryReader.Open(dir);
        DocsEnum tdocs = TestUtil.Docs(Random, reader, "field", new BytesRef("aaa"), MultiFields.GetLiveDocs(reader), null, 0);
        int count = 0;
        while (tdocs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
        {
            count++;
        }
        Assert.IsTrue(count > 0);
        reader.Dispose();
        dir.Dispose();
    }
}
/// <summary>
/// Verifies a single-document term vector against the <see cref="RandomTokenStream"/>
/// it was built from: term/doc statistics, stored flags (positions, offsets,
/// payloads), per-term postings with and without live-doc filtering, and — when
/// positions or offsets are stored — every position/offset/payload. The
/// Terms/Docs/DocsAndPositions enums are randomly re-used via the cached
/// <c>this.termsEnum</c>/<c>this.docsEnum</c>/<c>this.docsAndPositionsEnum</c> values.
/// </summary>
protected internal virtual void AssertEquals(RandomTokenStream tk, FieldType ft, Terms terms)
{
    // the vector describes exactly one document
    Assert.AreEqual(1, terms.DocCount);
    int termCount = (new HashSet<string>(Arrays.AsList(tk.Terms))).Count;
    Assert.AreEqual(termCount, terms.Size());
    Assert.AreEqual(termCount, terms.SumDocFreq);
    Assert.AreEqual(ft.StoreTermVectorPositions, terms.HasPositions());
    Assert.AreEqual(ft.StoreTermVectorOffsets, terms.HasOffsets());
    Assert.AreEqual(ft.StoreTermVectorPayloads && tk.HasPayloads(), terms.HasPayloads());
    HashSet<BytesRef> uniqueTerms = new HashSet<BytesRef>();
    foreach (string term in tk.Freqs.Keys)
    {
        uniqueTerms.Add(new BytesRef(term));
    }
    BytesRef[] sortedTerms = uniqueTerms.ToArray(/*new BytesRef[0]*/);
    // sort in the vector's own term order so we can iterate in lock-step
    Array.Sort(sortedTerms, terms.Comparator);
    TermsEnum termsEnum = terms.Iterator(Random().NextBoolean() ? null : this.termsEnum.Value);
    this.termsEnum.Value = termsEnum;
    for (int i = 0; i < sortedTerms.Length; ++i)
    {
        BytesRef nextTerm = termsEnum.Next();
        Assert.AreEqual(sortedTerms[i], nextTerm);
        Assert.AreEqual(sortedTerms[i], termsEnum.Term());
        Assert.AreEqual(1, termsEnum.DocFreq());
        // with an all-clear live-docs bitset the enum must be empty
        FixedBitSet bits = new FixedBitSet(1);
        DocsEnum docsEnum = termsEnum.Docs(bits, Random().NextBoolean() ? null : this.docsEnum.Value);
        Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsEnum.NextDoc());
        // with doc 0 live (or no filter at all) we must see doc 0 with the recorded freq
        bits.Set(0);
        docsEnum = termsEnum.Docs(Random().NextBoolean() ? bits : null, Random().NextBoolean() ? null : docsEnum);
        Assert.IsNotNull(docsEnum);
        Assert.AreEqual(0, docsEnum.NextDoc());
        Assert.AreEqual(0, docsEnum.DocID());
        Assert.AreEqual(tk.Freqs[termsEnum.Term().Utf8ToString()], (int?)docsEnum.Freq());
        Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsEnum.NextDoc());
        this.docsEnum.Value = docsEnum;
        // same pair of checks for the positions enum: filtered-out, then live
        bits.Clear(0);
        DocsAndPositionsEnum docsAndPositionsEnum = termsEnum.DocsAndPositions(bits, Random().NextBoolean() ? null : this.docsAndPositionsEnum.Value);
        // a positions enum exists iff offsets or positions were stored
        Assert.AreEqual(ft.StoreTermVectorOffsets || ft.StoreTermVectorPositions, docsAndPositionsEnum != null);
        if (docsAndPositionsEnum != null)
        {
            Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.NextDoc());
        }
        bits.Set(0);
        docsAndPositionsEnum = termsEnum.DocsAndPositions(Random().NextBoolean() ? bits : null, Random().NextBoolean() ? null : docsAndPositionsEnum);
        Assert.AreEqual(ft.StoreTermVectorOffsets || ft.StoreTermVectorPositions, docsAndPositionsEnum != null);
        if (terms.HasPositions() || terms.HasOffsets())
        {
            Assert.AreEqual(0, docsAndPositionsEnum.NextDoc());
            int freq = docsAndPositionsEnum.Freq();
            Assert.AreEqual(tk.Freqs[termsEnum.Term().Utf8ToString()], (int?)freq);
            if (docsAndPositionsEnum != null)
            {
                for (int k = 0; k < freq; ++k)
                {
                    int position = docsAndPositionsEnum.NextPosition();
                    // candidate token indexes that could explain this position/offset
                    ISet<int?> indexes;
                    if (terms.HasPositions())
                    {
                        indexes = tk.PositionToTerms[position];
                        Assert.IsNotNull(indexes);
                    }
                    else
                    {
                        indexes = tk.StartOffsetToTerms[docsAndPositionsEnum.StartOffset()];
                        Assert.IsNotNull(indexes);
                    }
                    if (terms.HasPositions())
                    {
                        // some candidate token must match both term bytes and position
                        bool foundPosition = false;
                        foreach (int index in indexes)
                        {
                            if (tk.TermBytes[index].Equals(termsEnum.Term()) && tk.Positions[index] == position)
                            {
                                foundPosition = true;
                                break;
                            }
                        }
                        Assert.IsTrue(foundPosition);
                    }
                    if (terms.HasOffsets())
                    {
                        // some candidate token must match term bytes and both offsets
                        bool foundOffset = false;
                        foreach (int index in indexes)
                        {
                            if (tk.TermBytes[index].Equals(termsEnum.Term()) && tk.StartOffsets[index] == docsAndPositionsEnum.StartOffset() && tk.EndOffsets[index] == docsAndPositionsEnum.EndOffset())
                            {
                                foundOffset = true;
                                break;
                            }
                        }
                        Assert.IsTrue(foundOffset);
                    }
                    if (terms.HasPayloads())
                    {
                        // some candidate token must match term bytes and payload
                        bool foundPayload = false;
                        foreach (int index in indexes)
                        {
                            if (tk.TermBytes[index].Equals(termsEnum.Term()) && Equals(tk.Payloads[index], docsAndPositionsEnum.Payload))
                            {
                                foundPayload = true;
                                break;
                            }
                        }
                        Assert.IsTrue(foundPayload);
                    }
                }
                // reading past the last position must throw
                try
                {
                    docsAndPositionsEnum.NextPosition();
                    Assert.Fail();
                }
                catch (Exception e)
                {
                    // ok
                }
            }
            Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.NextDoc());
        }
        this.docsAndPositionsEnum.Value = docsAndPositionsEnum;
    }
    // enum exhausted exactly when all expected terms were seen
    Assert.IsNull(termsEnum.Next());
    // random seeks back into the term dictionary must all succeed
    for (int i = 0; i < 5; ++i)
    {
        if (Random().NextBoolean())
        {
            Assert.IsTrue(termsEnum.SeekExact(RandomInts.RandomFrom(Random(), tk.TermBytes)));
        }
        else
        {
            Assert.AreEqual(SeekStatus.FOUND, termsEnum.SeekCeil(RandomInts.RandomFrom(Random(), tk.TermBytes)));
        }
    }
}
/// <summary>
/// Indexes numbered documents with offsets enabled, then verifies that the
/// start/end offsets reported by <c>DocsAndPositionsEnum</c> always point at the
/// matched term inside the stored source text — both when iterating with
/// <c>NextDoc()</c> and when skipping with <c>Advance()</c>.
/// </summary>
/// <param name="withPayloads">when true, indexes with MockPayloadAnalyzer and also asserts every payload starts with "pos:"</param>
public virtual void DoTestNumbers(bool withPayloads)
{
    Directory dir = NewDirectory();
    Analyzer analyzer = withPayloads ? (Analyzer)new MockPayloadAnalyzer() : new MockAnalyzer(Random);
    iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
    iwc.SetMergePolicy(NewLogMergePolicy()); // will rely on docids a bit for skipping
    RandomIndexWriter w = new RandomIndexWriter(Random, dir, iwc);
    FieldType ft = new FieldType(TextField.TYPE_STORED);
    ft.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
    if (Random.NextBoolean())
    {
        // randomly also store term vectors so that code path gets exercised too
        ft.StoreTermVectors = true;
        ft.StoreTermVectorOffsets = Random.NextBoolean();
        ft.StoreTermVectorPositions = Random.NextBoolean();
    }
    int numDocs = AtLeast(500);
    for (int i = 0; i < numDocs; i++)
    {
        // "numbers" holds the spelled-out number; "id" lets us address each doc later
        Document doc = new Document();
        doc.Add(new Field("numbers", English.Int32ToEnglish(i), ft));
        doc.Add(new Field("oddeven", (i % 2) == 0 ? "even" : "odd", ft));
        doc.Add(new StringField("id", "" + i, Field.Store.NO));
        w.AddDocument(doc);
    }
    IndexReader reader = w.GetReader();
    w.Dispose();
    string[] terms = new string[] { "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "hundred" };
    foreach (string term in terms)
    {
        DocsAndPositionsEnum dp = MultiFields.GetTermPositionsEnum(reader, null, "numbers", new BytesRef(term));
        int doc;
        while ((doc = dp.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
        {
            string storedNumbers = reader.Document(doc).Get("numbers");
            int freq = dp.Freq;
            for (int i = 0; i < freq; i++)
            {
                dp.NextPosition();
                int start = dp.StartOffset;
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(start >= 0);
                }
                int end = dp.EndOffset;
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(end >= 0 && end >= start);
                }
                // check that the offsets correspond to the term in the src text
                Assert.IsTrue(storedNumbers.Substring(start, end - start).Equals(term, StringComparison.Ordinal));
                if (withPayloads)
                {
                    // check that we have a payload and it starts with "pos"
                    Assert.IsNotNull(dp.GetPayload());
                    BytesRef payload = dp.GetPayload();
                    Assert.IsTrue(payload.Utf8ToString().StartsWith("pos:", StringComparison.Ordinal));
                }
                // note: withPayloads=false doesnt necessarily mean we dont have them from MockAnalyzer!
            }
        }
    }
    // check we can skip correctly
    // the Advance target is asserted to be an exact hit: Advance(num) must land
    // on doc num itself, since the spelled-out form of every number in
    // [100, 999] contains the token "hundred"
    int numSkippingTests = AtLeast(50);
    for (int j = 0; j < numSkippingTests; j++)
    {
        int num = TestUtil.NextInt32(Random, 100, Math.Min(numDocs - 1, 999));
        DocsAndPositionsEnum dp = MultiFields.GetTermPositionsEnum(reader, null, "numbers", new BytesRef("hundred"));
        int doc = dp.Advance(num);
        Assert.AreEqual(num, doc);
        int freq = dp.Freq;
        for (int i = 0; i < freq; i++)
        {
            string storedNumbers = reader.Document(doc).Get("numbers");
            dp.NextPosition();
            int start = dp.StartOffset;
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(start >= 0);
            }
            int end = dp.EndOffset;
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(end >= 0 && end >= start);
            }
            // check that the offsets correspond to the term in the src text
            Assert.IsTrue(storedNumbers.Substring(start, end - start).Equals("hundred", StringComparison.Ordinal));
            if (withPayloads)
            {
                // check that we have a payload and it starts with "pos"
                Assert.IsNotNull(dp.GetPayload());
                BytesRef payload = dp.GetPayload();
                Assert.IsTrue(payload.Utf8ToString().StartsWith("pos:", StringComparison.Ordinal));
            }
            // note: withPayloads=false doesnt necessarily mean we dont have them from MockAnalyzer!
        }
    }
    // check that other fields (without offsets) work correctly
    for (int i = 0; i < numDocs; i++)
    {
        DocsEnum dp = MultiFields.GetTermDocsEnum(reader, null, "id", new BytesRef("" + i), 0);
        Assert.AreEqual(i, dp.NextDoc());
        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dp.NextDoc());
    }
    reader.Dispose();
    dir.Dispose();
}
/// <summary>
/// Asserts that two readers contain equivalent indexes: same doc count, same
/// stored fields, same term vectors, and same postings (doc+freq pairs) per
/// term — even when the two indexes assigned different internal doc ids.
/// A doc-id mapping from r2 to r1 is first built via the unique
/// <paramref name="idField"/>, then used to align postings for comparison.
/// Throws/fails via NUnit asserts on any mismatch.
/// </summary>
public virtual void VerifyEquals(DirectoryReader r1, DirectoryReader r2, string idField)
{
    if (Verbose)
    {
        Console.WriteLine("\nr1 docs:");
        PrintDocs(r1);
        Console.WriteLine("\nr2 docs:");
        PrintDocs(r2);
    }
    if (r1.NumDocs != r2.NumDocs)
    {
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(false, () => "r1.NumDocs=" + r1.NumDocs + " vs r2.NumDocs=" + r2.NumDocs);
        }
    }
    // with deletions, doc ids can differ between the readers, so DocFreq checks are relaxed below
    bool hasDeletes = !(r1.MaxDoc == r2.MaxDoc && r1.NumDocs == r1.MaxDoc);
    int[] r2r1 = new int[r2.MaxDoc]; // r2 id to r1 id mapping
    // create mapping from id2 space to id2 based on idField
    Fields f1 = MultiFields.GetFields(r1);
    if (f1 == null)
    {
        // make sure r2 is empty
        Assert.IsNull(MultiFields.GetFields(r2));
        return;
    }
    Terms terms1 = f1.GetTerms(idField);
    if (terms1 == null)
    {
        Assert.IsTrue(MultiFields.GetFields(r2) == null || MultiFields.GetFields(r2).GetTerms(idField) == null);
        return;
    }
    TermsEnum termsEnum = terms1.GetIterator(null);
    IBits liveDocs1 = MultiFields.GetLiveDocs(r1);
    IBits liveDocs2 = MultiFields.GetLiveDocs(r2);
    Fields fields = MultiFields.GetFields(r2);
    if (fields == null)
    {
        // make sure r1 is in fact empty (eg has only all
        // deleted docs):
        IBits liveDocs = MultiFields.GetLiveDocs(r1);
        DocsEnum docs = null;
        while (termsEnum.Next() != null)
        {
            docs = TestUtil.Docs(Random, termsEnum, liveDocs, docs, DocsFlags.NONE);
            while (docs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
            {
                Assert.Fail("r1 is not empty but r2 is");
            }
        }
        return;
    }
    Terms terms2 = fields.GetTerms(idField);
    TermsEnum termsEnum2 = terms2.GetIterator(null);
    DocsEnum termDocs1 = null;
    DocsEnum termDocs2 = null;
    // phase 1: walk every id term; each must match exactly one live doc on each
    // side (or be deleted on both). Record the r2->r1 doc id mapping and verify
    // stored fields + term vectors doc-by-doc.
    while (true)
    {
        BytesRef term = termsEnum.Next();
        //System.out.println("TEST: match id term=" + term);
        if (term == null)
        {
            break;
        }
        termDocs1 = TestUtil.Docs(Random, termsEnum, liveDocs1, termDocs1, DocsFlags.NONE);
        if (termsEnum2.SeekExact(term))
        {
            termDocs2 = TestUtil.Docs(Random, termsEnum2, liveDocs2, termDocs2, DocsFlags.NONE);
        }
        else
        {
            termDocs2 = null;
        }
        if (termDocs1.NextDoc() == DocIdSetIterator.NO_MORE_DOCS)
        {
            // this doc is deleted and wasn't replaced
            Assert.IsTrue(termDocs2 == null || termDocs2.NextDoc() == DocIdSetIterator.NO_MORE_DOCS);
            continue;
        }
        int id1 = termDocs1.DocID;
        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, termDocs1.NextDoc());
        Assert.IsTrue(termDocs2.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
        int id2 = termDocs2.DocID;
        Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, termDocs2.NextDoc());
        r2r1[id2] = id1;
        // verify stored fields are equivalent
        try
        {
            VerifyEquals(r1.Document(id1), r2.Document(id2));
        }
        catch (Exception /*t*/)
        {
            Console.WriteLine("FAILED id=" + term + " id1=" + id1 + " id2=" + id2 + " term=" + term);
            Console.WriteLine(" d1=" + r1.Document(id1));
            Console.WriteLine(" d2=" + r2.Document(id2));
            throw; // LUCENENET: CA2200: Rethrow to preserve stack details (https://docs.microsoft.com/en-us/visualstudio/code-quality/ca2200-rethrow-to-preserve-stack-details)
        }
        try
        {
            // verify term vectors are equivalent
            VerifyEquals(r1.GetTermVectors(id1), r2.GetTermVectors(id2));
        }
        catch (Exception /*e*/)
        {
            // on mismatch, dump both docs' term vectors before rethrowing, to aid debugging
            Console.WriteLine("FAILED id=" + term + " id1=" + id1 + " id2=" + id2);
            Fields tv1 = r1.GetTermVectors(id1);
            Console.WriteLine(" d1=" + tv1);
            if (tv1 != null)
            {
                DocsAndPositionsEnum dpEnum = null;
                DocsEnum dEnum = null;
                foreach (string field in tv1)
                {
                    Console.WriteLine(" " + field + ":");
                    Terms terms3 = tv1.GetTerms(field);
                    Assert.IsNotNull(terms3);
                    TermsEnum termsEnum3 = terms3.GetIterator(null);
                    BytesRef term2;
                    while ((term2 = termsEnum3.Next()) != null)
                    {
                        Console.WriteLine(" " + term2.Utf8ToString() + ": freq=" + termsEnum3.TotalTermFreq);
                        dpEnum = termsEnum3.DocsAndPositions(null, dpEnum);
                        if (dpEnum != null)
                        {
                            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                            int freq = dpEnum.Freq;
                            Console.WriteLine(" doc=" + dpEnum.DocID + " freq=" + freq);
                            for (int posUpto = 0; posUpto < freq; posUpto++)
                            {
                                Console.WriteLine(" pos=" + dpEnum.NextPosition());
                            }
                        }
                        else
                        {
                            dEnum = TestUtil.Docs(Random, termsEnum3, null, dEnum, DocsFlags.FREQS);
                            Assert.IsNotNull(dEnum);
                            Assert.IsTrue(dEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                            int freq = dEnum.Freq;
                            Console.WriteLine(" doc=" + dEnum.DocID + " freq=" + freq);
                        }
                    }
                }
            }
            Fields tv2 = r2.GetTermVectors(id2);
            Console.WriteLine(" d2=" + tv2);
            if (tv2 != null)
            {
                DocsAndPositionsEnum dpEnum = null;
                DocsEnum dEnum = null;
                foreach (string field in tv2)
                {
                    Console.WriteLine(" " + field + ":");
                    Terms terms3 = tv2.GetTerms(field);
                    Assert.IsNotNull(terms3);
                    TermsEnum termsEnum3 = terms3.GetIterator(null);
                    BytesRef term2;
                    while ((term2 = termsEnum3.Next()) != null)
                    {
                        Console.WriteLine(" " + term2.Utf8ToString() + ": freq=" + termsEnum3.TotalTermFreq);
                        dpEnum = termsEnum3.DocsAndPositions(null, dpEnum);
                        if (dpEnum != null)
                        {
                            Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                            int freq = dpEnum.Freq;
                            Console.WriteLine(" doc=" + dpEnum.DocID + " freq=" + freq);
                            for (int posUpto = 0; posUpto < freq; posUpto++)
                            {
                                Console.WriteLine(" pos=" + dpEnum.NextPosition());
                            }
                        }
                        else
                        {
                            dEnum = TestUtil.Docs(Random, termsEnum3, null, dEnum, DocsFlags.FREQS);
                            Assert.IsNotNull(dEnum);
                            Assert.IsTrue(dEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
                            int freq = dEnum.Freq;
                            Console.WriteLine(" doc=" + dEnum.DocID + " freq=" + freq);
                        }
                    }
                }
            }
            throw; // LUCENENET: CA2200: Rethrow to preserve stack details (https://docs.microsoft.com/en-us/visualstudio/code-quality/ca2200-rethrow-to-preserve-stack-details)
        }
    }
    //System.out.println("TEST: done match id");
    // Verify postings
    //System.out.println("TEST: create te1");
    // phase 2: walk both readers' full term dictionaries in parallel; for each
    // term collect its (doc,freq) postings packed into a long (doc in the high
    // 32 bits), remap r2's docs into r1 space via r2r1, sort, and compare.
    Fields fields1 = MultiFields.GetFields(r1);
    IEnumerator<string> fields1Enum = fields1.GetEnumerator();
    Fields fields2 = MultiFields.GetFields(r2);
    IEnumerator<string> fields2Enum = fields2.GetEnumerator();
    string field1 = null, field2 = null;
    TermsEnum termsEnum1 = null;
    termsEnum2 = null;
    DocsEnum docs1 = null, docs2 = null;
    // pack both doc and freq into single element for easy sorting
    long[] info1 = new long[r1.NumDocs];
    long[] info2 = new long[r2.NumDocs];
    for (; ;)
    {
        BytesRef term1 = null, term2 = null;
        // iterate until we get some docs
        int len1;
        for (; ;)
        {
            len1 = 0;
            if (termsEnum1 == null)
            {
                if (!fields1Enum.MoveNext())
                {
                    break;
                }
                field1 = fields1Enum.Current;
                Terms terms = fields1.GetTerms(field1);
                if (terms == null)
                {
                    continue;
                }
                termsEnum1 = terms.GetIterator(null);
            }
            term1 = termsEnum1.Next();
            if (term1 == null)
            {
                // no more terms in this field
                termsEnum1 = null;
                continue;
            }
            //System.out.println("TEST: term1=" + term1);
            docs1 = TestUtil.Docs(Random, termsEnum1, liveDocs1, docs1, DocsFlags.FREQS);
            while (docs1.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
            {
                int d = docs1.DocID;
                int f = docs1.Freq;
                info1[len1] = (((long)d) << 32) | (uint)f;
                len1++;
            }
            if (len1 > 0)
            {
                break;
            }
        }
        // iterate until we get some docs
        int len2;
        for (; ;)
        {
            len2 = 0;
            if (termsEnum2 == null)
            {
                if (!fields2Enum.MoveNext())
                {
                    break;
                }
                field2 = fields2Enum.Current;
                Terms terms = fields2.GetTerms(field2);
                if (terms == null)
                {
                    continue;
                }
                termsEnum2 = terms.GetIterator(null);
            }
            term2 = termsEnum2.Next();
            if (term2 == null)
            {
                // no more terms in this field
                termsEnum2 = null;
                continue;
            }
            //System.out.println("TEST: term1=" + term1);
            docs2 = TestUtil.Docs(Random, termsEnum2, liveDocs2, docs2, DocsFlags.FREQS);
            while (docs2.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
            {
                // remap into r1's doc id space so the packed values are comparable
                int d = r2r1[docs2.DocID];
                int f = docs2.Freq;
                info2[len2] = (((long)d) << 32) | (uint)f;
                len2++;
            }
            if (len2 > 0)
            {
                break;
            }
        }
        Assert.AreEqual(len1, len2);
        if (len1 == 0) // no more terms
        {
            break;
        }
        Assert.AreEqual(field1, field2);
        Assert.IsTrue(term1.BytesEquals(term2));
        if (!hasDeletes)
        {
            Assert.AreEqual(termsEnum1.DocFreq, termsEnum2.DocFreq);
        }
        Assert.AreEqual(term1, term2, "len1=" + len1 + " len2=" + len2 + " deletes?=" + hasDeletes);
        // sort info2 to get it into ascending docid
        Array.Sort(info2, 0, len2);
        // now compare
        for (int i = 0; i < len1; i++)
        {
            Assert.AreEqual(info1[i], info2[i], "i=" + i + " len=" + len1 + " d1=" + ((long)((ulong)info1[i] >> 32)) + " f1=" + (info1[i] & int.MaxValue) + " d2=" + ((long)((ulong)info2[i] >> 32)) + " f2=" + (info2[i] & int.MaxValue) + " field=" + field1 + " term=" + term1.Utf8ToString());
        }
    }
}
/// <summary>
/// Call this only once (if you subclass!)
/// <para/>
/// Un-inverts the postings of <c>Field</c> (optionally restricted to terms with
/// <paramref name="termPrefix"/>): for every live document it builds a
/// delta-encoded vInt list of the term numbers occurring in that document.
/// Small lists are packed directly into <c>Index[doc]</c>; larger lists are
/// built per-doc in <c>bytes[doc]</c> and finally concatenated into the shared
/// <c>Tnums</c> arrays (one per 256-doc "pass" bucket). Terms whose docFreq
/// exceeds <c>MaxTermDocFreq</c> are skipped. Also builds a fallback term index
/// (<c>IndexedTermsArray</c>) when the reader cannot supply ords.
/// </summary>
/// <param name="reader">atomic reader to un-invert; its Fields/Terms are enumerated</param>
/// <param name="liveDocs">deleted docs are skipped when counting/recording postings</param>
/// <param name="termPrefix">if non-null, only terms starting with this prefix are processed</param>
/// <exception cref="InvalidOperationException">if the field has doc values (type mismatch), or if more than 2^24 bytes of term data accumulate for one pass</exception>
protected internal virtual void Uninvert(AtomicReader reader, Bits liveDocs, BytesRef termPrefix)
{
    FieldInfo info = reader.FieldInfos.FieldInfo(Field);
    if (info != null && info.HasDocValues())
    {
        throw new InvalidOperationException("Type mismatch: " + Field + " was indexed as " + info.DocValuesType);
    }
    // LUCENENET fix: was DateTime.Now.Millisecond, which returns only the
    // millisecond *component* (0-999) of the current time, not a timestamp, so
    // Total_time/Phase1_time were meaningless. Environment.TickCount is a
    // monotonic millisecond counter (matches the modernized Uninvert overload).
    long startTime = Environment.TickCount;
    Prefix = termPrefix == null ? null : BytesRef.DeepCopyOf(termPrefix);
    int maxDoc = reader.MaxDoc;
    int[] index = new int[maxDoc]; // immediate term numbers, or the index into the byte[] representing the last number
    int[] lastTerm = new int[maxDoc]; // last term we saw for this document
    sbyte[][] bytes = new sbyte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)
    Fields fields = reader.Fields;
    if (fields == null)
    {
        // No terms
        return;
    }
    Terms terms = fields.Terms(Field);
    if (terms == null)
    {
        // No terms
        return;
    }
    TermsEnum te = terms.Iterator(null);
    BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
    if (te.SeekCeil(seekStart) == TermsEnum.SeekStatus.END)
    {
        // No terms match
        return;
    }
    // If we need our "term index wrapper", these will be
    // init'd below:
    IList<BytesRef> indexedTerms = null;
    PagedBytes indexedTermsBytes = null;
    bool testedOrd = false;
    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    sbyte[] tempArr = new sbyte[12];
    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in it's byte array (for faster
    // appending and faster creation of the final form).
    //
    int termNum = 0;
    DocsEnum = null;
    // Loop begins with te positioned to first term (we call
    // seek above):
    for (; ;)
    {
        BytesRef t = te.Term();
        if (t == null || (termPrefix != null && !StringHelper.StartsWith(t, termPrefix)))
        {
            break;
        }
        if (!testedOrd)
        {
            try
            {
                OrdBase = (int)te.Ord();
            }
            catch (System.NotSupportedException) // LUCENENET: removed unused exception variable (CS0168)
            {
                // Reader cannot provide ord support, so we wrap
                // our own support by creating our own terms index:
                indexedTerms = new List<BytesRef>();
                indexedTermsBytes = new PagedBytes(15);
            }
            testedOrd = true;
        }
        VisitTerm(te, termNum);
        if (indexedTerms != null && (termNum & IndexIntervalMask) == 0)
        {
            // Index this term
            SizeOfIndexedStrings += t.Length;
            BytesRef indexedTerm = new BytesRef();
            indexedTermsBytes.Copy(t, indexedTerm);
            // TODO: really should 1) strip off useless suffix,
            // and 2) use FST not array/PagedBytes
            indexedTerms.Add(indexedTerm);
        }
        int df = te.DocFreq();
        if (df <= MaxTermDocFreq)
        {
            DocsEnum = te.Docs(liveDocs, DocsEnum, DocsEnum.FLAG_NONE);
            // dF, but takes deletions into account
            int actualDF = 0;
            for (; ;)
            {
                int doc = DocsEnum.NextDoc();
                if (doc == DocIdSetIterator.NO_MORE_DOCS)
                {
                    break;
                }
                actualDF++;
                TermInstances++;
                // add TNUM_OFFSET to the term number to make room for special reserved values:
                // 0 (end term) and 1 (index into byte array follows)
                int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
                lastTerm[doc] = termNum;
                int val = index[doc];
                if ((val & 0xff) == 1)
                {
                    // index into byte array (actually the end of
                    // the doc-specific byte[] when building)
                    int pos = (int)((uint)val >> 8);
                    int ilen = VIntSize(delta);
                    sbyte[] arr = bytes[doc];
                    int newend = pos + ilen;
                    if (newend > arr.Length)
                    {
                        // We avoid a doubling strategy to lower memory usage.
                        // this faceting method isn't for docs with many terms.
                        int newLen = (newend + 3) & unchecked((int)0xfffffffc); // 4 byte alignment
                        sbyte[] newarr = new sbyte[newLen];
                        Array.Copy(arr, 0, newarr, 0, pos);
                        arr = newarr;
                        bytes[doc] = newarr;
                    }
                    pos = WriteInt(delta, arr, pos);
                    index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
                }
                else
                {
                    // OK, this int has data in it... find the end (a zero starting byte - not
                    // part of another number, hence not following a byte with the high bit set).
                    int ipos;
                    if (val == 0)
                    {
                        ipos = 0;
                    }
                    else if ((val & 0x0000ff80) == 0)
                    {
                        ipos = 1;
                    }
                    else if ((val & 0x00ff8000) == 0)
                    {
                        ipos = 2;
                    }
                    else if ((val & 0xff800000) == 0)
                    {
                        ipos = 3;
                    }
                    else
                    {
                        ipos = 4;
                    }
                    int endPos = WriteInt(delta, tempArr, ipos);
                    if (endPos <= 4)
                    {
                        // value will fit in the integer... move bytes back
                        for (int j = ipos; j < endPos; j++)
                        {
                            val |= (tempArr[j] & 0xff) << (j << 3);
                        }
                        index[doc] = val;
                    }
                    else
                    {
                        // value won't fit... move integer into byte[]
                        for (int j = 0; j < ipos; j++)
                        {
                            tempArr[j] = (sbyte)val;
                            val = (int)((uint)val >> 8);
                        }
                        // point at the end index in the byte[]
                        index[doc] = (endPos << 8) | 1;
                        bytes[doc] = tempArr;
                        tempArr = new sbyte[12];
                    }
                }
            }
            SetActualDocFreq(termNum, actualDF);
        }
        termNum++;
        if (te.Next() == null)
        {
            break;
        }
    }
    NumTermsInField = termNum;
    long midPoint = Environment.TickCount; // LUCENENET fix: was DateTime.Now.Millisecond (see above)
    if (TermInstances == 0)
    {
        // we didn't invert anything
        // lower memory consumption.
        Tnums = null;
    }
    else
    {
        this.Index = index;
        //
        // transform intermediate form into the final form, building a single byte[]
        // at a time, and releasing the intermediate byte[]s as we go to avoid
        // increasing the memory footprint.
        //
        for (int pass = 0; pass < 256; pass++)
        {
            sbyte[] target = Tnums[pass];
            int pos = 0; // end in target;
            if (target != null)
            {
                pos = target.Length;
            }
            else
            {
                target = new sbyte[4096];
            }
            // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
            // where pp is the pass (which array we are building), and xx is all values.
            // each pass shares the same byte[] for termNumber lists.
            for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24))
            {
                int lim = Math.Min(docbase + (1 << 16), maxDoc);
                for (int doc = docbase; doc < lim; doc++)
                {
                    int val = index[doc];
                    if ((val & 0xff) == 1)
                    {
                        int len = (int)((uint)val >> 8);
                        index[doc] = (pos << 8) | 1; // change index to point to start of array
                        if ((pos & 0xff000000) != 0)
                        {
                            // we only have 24 bits for the array index
                            throw new InvalidOperationException("Too many values for UnInvertedField faceting on field " + Field);
                        }
                        sbyte[] arr = bytes[doc];
                        bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
                        if (target.Length <= pos + len)
                        {
                            // we don't have to worry about the array getting too large
                            // since the "pos" param will overflow first (only 24 bits available)
                            int newlen = target.Length;
                            while (newlen <= pos + len) // doubling strategy
                            {
                                newlen <<= 1;
                            }
                            sbyte[] newtarget = new sbyte[newlen];
                            Array.Copy(target, 0, newtarget, 0, pos);
                            target = newtarget;
                        }
                        Array.Copy(arr, 0, target, pos, len);
                        pos += len + 1; // skip single byte at end and leave it 0 for terminator
                    }
                }
            }
            // shrink array
            if (pos < target.Length)
            {
                sbyte[] newtarget = new sbyte[pos];
                Array.Copy(target, 0, newtarget, 0, pos);
                target = newtarget;
            }
            Tnums[pass] = target;
            if ((pass << 16) > maxDoc)
            {
                break;
            }
        }
    }
    if (indexedTerms != null)
    {
        IndexedTermsArray = indexedTerms.ToArray();
    }
    long endTime = Environment.TickCount; // LUCENENET fix: was DateTime.Now.Millisecond (see above)
    Total_time = (int)(endTime - startTime);
    Phase1_time = (int)(midPoint - startTime);
}
public virtual void TestCloseWithThreads() { int NUM_THREADS = 3; int numIterations = TEST_NIGHTLY ? 7 : 3; for (int iter = 0; iter < numIterations; iter++) { if (VERBOSE) { Console.WriteLine("\nTEST: iter=" + iter); } Directory dir = NewDirectory(); IndexWriter writer = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetMaxBufferedDocs(10).SetMergeScheduler(new ConcurrentMergeScheduler()).SetMergePolicy(NewLogMergePolicy(4))); ((ConcurrentMergeScheduler)writer.Config.MergeScheduler).SetSuppressExceptions(); IndexerThread[] threads = new IndexerThread[NUM_THREADS]; for (int i = 0; i < NUM_THREADS; i++) { threads[i] = new IndexerThread(this, writer, false); } for (int i = 0; i < NUM_THREADS; i++) { threads[i].Start(); } bool done = false; while (!done) { Thread.Sleep(100); for (int i = 0; i < NUM_THREADS; i++) // only stop when at least one thread has added a doc { if (threads[i].AddCount > 0) { done = true; break; } else if (!threads[i].IsAlive) { Assert.Fail("thread failed before indexing a single document"); } } } if (VERBOSE) { Console.WriteLine("\nTEST: now close"); } writer.Dispose(false); // Make sure threads that are adding docs are not hung: for (int i = 0; i < NUM_THREADS; i++) { // Without fix for LUCENE-1130: one of the // threads will hang threads[i].Join(); if (threads[i].IsAlive) { Assert.Fail("thread seems to be hung"); } } // Quick test to make sure index is not corrupt: IndexReader reader = DirectoryReader.Open(dir); DocsEnum tdocs = TestUtil.Docs(Random(), reader, "field", new BytesRef("aaa"), MultiFields.GetLiveDocs(reader), null, 0); int count = 0; while (tdocs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS) { count++; } Assert.IsTrue(count > 0); reader.Dispose(); dir.Dispose(); } }
// DocValues updates
/// <summary>
/// Resolves each buffered doc-values update against the given segment reader:
/// for every update's term, walks its postings (restricted to live docs) and
/// records the new value for each matching doc whose id is below the update's
/// <c>DocIDUpto</c> cutoff, accumulating into <paramref name="dvUpdatesContainer"/>.
/// Synchronized on <c>this</c>; updates are applied in arrival order so a later
/// update to the same doc wins.
/// </summary>
private void ApplyDocValuesUpdates<T1>(IEnumerable<T1> updates, ReadersAndUpdates rld, SegmentReader reader, DocValuesFieldUpdates.Container dvUpdatesContainer) where T1 : DocValuesUpdate
{
    lock (this)
    {
        Fields fields = reader.Fields;
        if (fields == null)
        {
            // this reader has no postings
            return;
        }
        // TODO: we can process the updates per DV field, from last to first so that
        // if multiple terms affect same document for the same field, we add an update
        // only once (that of the last term). To do that, we can keep a bitset which
        // marks which documents have already been updated. So e.g. if term T1
        // updates doc 7, and then we process term T2 and it updates doc 7 as well,
        // we don't apply the update since we know T1 came last and therefore wins
        // the update.
        // We can also use that bitset as 'liveDocs' to pass to TermEnum.docs(), so
        // that these documents aren't even returned.
        string currentField = null;
        // termsEnum/docs are reused across updates for the same field to avoid re-allocation
        TermsEnum termsEnum = null;
        DocsEnum docs = null;
        foreach (DocValuesUpdate update in updates)
        {
            Term term = update.Term;
            int limit = update.DocIDUpto;
            // TODO: we traverse the terms in update order (not term order) so that we
            // apply the updates in the correct order, i.e. if two terms udpate the
            // same document, the last one that came in wins, irrespective of the
            // terms lexical order.
            // we can apply the updates in terms order if we keep an updatesGen (and
            // increment it with every update) and attach it to each NumericUpdate. Note
            // that we cannot rely only on docIDUpto because an app may send two updates
            // which will get same docIDUpto, yet will still need to respect the order
            // those updates arrived.
            if (!term.Field().Equals(currentField))
            {
                // if we change the code to process updates in terms order, enable this assert
                // assert currentField == null || currentField.compareTo(term.field()) < 0;
                currentField = term.Field();
                Terms terms = fields.Terms(currentField);
                if (terms != null)
                {
                    termsEnum = terms.Iterator(termsEnum);
                }
                else
                {
                    termsEnum = null;
                    continue; // no terms in that field
                }
            }
            if (termsEnum == null)
            {
                // field had no terms; skip updates against it
                continue;
            }
            if (termsEnum.SeekExact(term.Bytes()))
            {
                // we don't need term frequencies for this
                DocsEnum docsEnum = termsEnum.Docs(rld.LiveDocs, docs, DocsEnum.FLAG_NONE);
                DocValuesFieldUpdates dvUpdates = dvUpdatesContainer.GetUpdates(update.Field, update.Type);
                if (dvUpdates == null)
                {
                    dvUpdates = dvUpdatesContainer.NewUpdates(update.Field, update.Type, reader.MaxDoc);
                }
                int doc;
                while ((doc = docsEnum.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
                {
                    if (doc >= limit)
                    {
                        break; // no more docs that can be updated for this term
                    }
                    dvUpdates.Add(doc, update.Value);
                }
            }
        }
    }
}
/// <summary> /// checks advancing docs /// </summary> public void AssertDocsSkippingEquals(string info, IndexReader leftReader, int docFreq, DocsEnum leftDocs, DocsEnum rightDocs, bool hasFreqs) { if (leftDocs == null) { Assert.IsNull(rightDocs); return; } int docid = -1; int averageGap = leftReader.MaxDoc / (1 + docFreq); int skipInterval = 16; while (true) { if (Random().NextBoolean()) { // nextDoc() docid = leftDocs.NextDoc(); Assert.AreEqual(docid, rightDocs.NextDoc(), info); } else { // advance() int skip = docid + (int)Math.Ceiling(Math.Abs(skipInterval + Random().NextDouble() * averageGap)); docid = leftDocs.Advance(skip); Assert.AreEqual(docid, rightDocs.Advance(skip), info); } if (docid == DocIdSetIterator.NO_MORE_DOCS) { return; } if (hasFreqs) { Assert.AreEqual(leftDocs.Freq(), rightDocs.Freq(), info); } } }
public override int NextDoc() { return(m_input.NextDoc()); }
/// <summary>
/// Round-trips 100 single-doc terms through the default codec: writes them with
/// this fixture's Write(), reopens via the codec's FieldsProducer, then checks
/// term-by-term that each term enumerates exactly its one doc (twice, to stress
/// enum reuse/rewind) and that every term can subsequently be found by SeekCeil.
/// </summary>
public virtual void TestFixedPostings()
{
    const int NUM_TERMS = 100;
    TermData[] terms = new TermData[NUM_TERMS];
    for (int i = 0; i < NUM_TERMS; i++)
    {
        // term "i" occurs in exactly one doc: doc i (no positions)
        int[] docs = new int[] { i };
        string text = Convert.ToString(i);
        terms[i] = new TermData(this, text, docs, null);
    }
    FieldInfos.Builder builder = new FieldInfos.Builder();
    FieldData field = new FieldData(this, "field", builder, terms, true, false);
    FieldData[] fields = new FieldData[] { field };
    FieldInfos fieldInfos = builder.Finish();
    // LUCENENET specific - BUG: we must wrap this in a using block in case anything in the below loop throws
    using (Directory dir = NewDirectory())
    {
        this.Write(fieldInfos, dir, fields, true);
        Codec codec = Codec.Default;
        SegmentInfo si = new SegmentInfo(dir, Constants.LUCENE_MAIN_VERSION, SEGMENT, 10000, false, codec, null);
        // LUCENENET specific - BUG: we must wrap this in a using block in case anything in the below loop throws
        using (FieldsProducer reader = codec.PostingsFormat.FieldsProducer(new SegmentReadState(dir, si, fieldInfos, NewIOContext(Random), DirectoryReader.DEFAULT_TERMS_INDEX_DIVISOR)))
        {
            IEnumerator<string> fieldsEnum = reader.GetEnumerator();
            fieldsEnum.MoveNext();
            string fieldName = fieldsEnum.Current;
            Assert.IsNotNull(fieldName);
            Terms terms2 = reader.GetTerms(fieldName);
            Assert.IsNotNull(terms2);
            TermsEnum termsEnum = terms2.GetIterator(null);
            DocsEnum docsEnum = null;
            for (int i = 0; i < NUM_TERMS; i++)
            {
                BytesRef term = termsEnum.Next();
                Assert.IsNotNull(term);
                Assert.AreEqual(terms[i].text2, term.Utf8ToString());
                // do this twice to stress test the codec's reuse, ie,
                // make sure it properly fully resets (rewinds) its
                // internal state:
                for (int iter = 0; iter < 2; iter++)
                {
                    docsEnum = TestUtil.Docs(Random, termsEnum, null, docsEnum, DocsFlags.NONE);
                    Assert.AreEqual(terms[i].docs[0], docsEnum.NextDoc());
                    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, docsEnum.NextDoc());
                }
            }
            // dictionary must be exhausted after NUM_TERMS terms
            Assert.IsNull(termsEnum.Next());
            for (int i = 0; i < NUM_TERMS; i++)
            {
                Assert.AreEqual(termsEnum.SeekCeil(new BytesRef(terms[i].text2)), TermsEnum.SeekStatus.FOUND);
            }
            // only one field was written, so only one field may be produced
            Assert.IsFalse(fieldsEnum.MoveNext());
        }
    }
}
private void TestOne(DocsEnum docs, IList<int> expected) { if (VERBOSE) { Console.WriteLine("test"); } int upto = -1; while (upto < expected.Count) { if (VERBOSE) { Console.WriteLine(" cycle upto=" + upto + " of " + expected.Count); } int docID; if (Random().Next(4) == 1 || upto == expected.Count - 1) { // test nextDoc() if (VERBOSE) { Console.WriteLine(" do nextDoc"); } upto++; docID = docs.NextDoc(); } else { // test advance() int inc = TestUtil.NextInt(Random(), 1, expected.Count - 1 - upto); if (VERBOSE) { Console.WriteLine(" do advance inc=" + inc); } upto += inc; docID = docs.Advance(expected[upto]); } if (upto == expected.Count) { if (VERBOSE) { Console.WriteLine(" expect docID=" + DocIdSetIterator.NO_MORE_DOCS + " actual=" + docID); } Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, docID); } else { if (VERBOSE) { Console.WriteLine(" expect docID=" + expected[upto] + " actual=" + docID); } Assert.IsTrue(docID != DocIdSetIterator.NO_MORE_DOCS); Assert.AreEqual((int)expected[upto], docID); } } }
/// <summary>
/// Call this only once (if you subclass!)
/// <para/>
/// Builds the uninverted (doc -> term ordinals) structure for <c>m_field</c>:
/// iterates every term (optionally restricted to <paramref name="termPrefix"/>),
/// and for each matching doc appends the delta-encoded term number either
/// directly into the packed <c>index[doc]</c> int or into a per-doc byte array,
/// then compacts the per-doc arrays into the shared <c>m_tnums</c> pages.
/// </summary>
/// <param name="reader">source of terms/postings for the field</param>
/// <param name="liveDocs">deleted-doc filter passed through to the postings enum (may be null)</param>
/// <param name="termPrefix">if non-null, only terms starting with this prefix are uninverted</param>
/// <exception cref="InvalidOperationException">if the field has doc values, or the 24-bit array index overflows</exception>
protected virtual void Uninvert(AtomicReader reader, IBits liveDocs, BytesRef termPrefix)
{
    FieldInfo info = reader.FieldInfos.FieldInfo(m_field);
    if (info != null && info.HasDocValues)
    {
        throw new InvalidOperationException("Type mismatch: " + m_field + " was indexed as " + info.DocValuesType);
    }
    //System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
    // NOTE(review): Environment.TickCount is a 32-bit wrapping counter; timings
    // here are best-effort diagnostics only.
    long startTime = Environment.TickCount;
    m_prefix = termPrefix == null ? null : BytesRef.DeepCopyOf(termPrefix);

    int maxDoc = reader.MaxDoc;
    int[] index = new int[maxDoc]; // immediate term numbers, or the index into the byte[] representing the last number
    int[] lastTerm = new int[maxDoc]; // last term we saw for this document
    var bytes = new sbyte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

    Fields fields = reader.Fields;
    if (fields == null)
    {
        // No terms
        return;
    }
    Terms terms = fields.GetTerms(m_field);
    if (terms == null)
    {
        // No terms
        return;
    }

    TermsEnum te = terms.GetIterator(null);
    BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
    //System.out.println("seekStart=" + seekStart.utf8ToString());
    if (te.SeekCeil(seekStart) == TermsEnum.SeekStatus.END)
    {
        // No terms match
        return;
    }

    // If we need our "term index wrapper", these will be
    // init'd below:
    IList<BytesRef> indexedTerms = null;
    PagedBytes indexedTermsBytes = null;

    bool testedOrd = false;

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    var tempArr = new sbyte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in it's byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs

    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values.  this requires going over the field first to find the most
    // frequent terms ahead of time.

    int termNum = 0;
    m_docsEnum = null;

    // Loop begins with te positioned to first term (we call
    // seek above):
    for (; ;)
    {
        BytesRef t = te.Term;
        if (t == null || (termPrefix != null && !StringHelper.StartsWith(t, termPrefix)))
        {
            break;
        }
        //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);

        if (!testedOrd)
        {
            try
            {
                m_ordBase = (int)te.Ord;
                //System.out.println("got ordBase=" + ordBase);
            }
#pragma warning disable 168
            catch (NotSupportedException uoe)
#pragma warning restore 168
            {
                // Reader cannot provide ord support, so we wrap
                // our own support by creating our own terms index:
                indexedTerms = new List<BytesRef>();
                indexedTermsBytes = new PagedBytes(15);
                //System.out.println("NO ORDS");
            }
            testedOrd = true;
        }

        VisitTerm(te, termNum);

        if (indexedTerms != null && (termNum & indexIntervalMask) == 0)
        {
            // Index this term
            m_sizeOfIndexedStrings += t.Length;
            BytesRef indexedTerm = new BytesRef();
            indexedTermsBytes.Copy(t, indexedTerm);
            // TODO: really should 1) strip off useless suffix,
            // and 2) use FST not array/PagedBytes
            indexedTerms.Add(indexedTerm);
        }

        int df = te.DocFreq;
        if (df <= m_maxTermDocFreq)
        {
            m_docsEnum = te.Docs(liveDocs, m_docsEnum, DocsFlags.NONE);

            // dF, but takes deletions into account
            int actualDF = 0;

            for (; ;)
            {
                int doc = m_docsEnum.NextDoc();
                if (doc == DocIdSetIterator.NO_MORE_DOCS)
                {
                    break;
                }
                //System.out.println(" chunk=" + chunk + " docs");
                actualDF++;
                m_termInstances++;

                //System.out.println(" docID=" + doc);
                // add TNUM_OFFSET to the term number to make room for special reserved values:
                // 0 (end term) and 1 (index into byte array follows)
                int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
                lastTerm[doc] = termNum;
                int val = index[doc];

                if ((val & 0xff) == 1)
                {
                    // index into byte array (actually the end of
                    // the doc-specific byte[] when building)
                    int pos = (int)((uint)val >> 8);
                    int ilen = VInt32Size(delta);
                    var arr = bytes[doc];
                    int newend = pos + ilen;
                    if (newend > arr.Length)
                    {
                        // We avoid a doubling strategy to lower memory usage.
                        // this faceting method isn't for docs with many terms.
                        // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
                        // TODO: figure out what array lengths we can round up to w/o actually using more memory
                        // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
                        // It should be safe to round up to the nearest 32 bits in any case.
                        int newLen = (newend + 3) & unchecked((int)0xfffffffc); // 4 byte alignment
                        var newarr = new sbyte[newLen];
                        Array.Copy(arr, 0, newarr, 0, pos);
                        arr = newarr;
                        bytes[doc] = newarr;
                    }
                    pos = WriteInt32(delta, arr, pos);
                    index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
                }
                else
                {
                    // OK, this int has data in it... find the end (a zero starting byte - not
                    // part of another number, hence not following a byte with the high bit set).
                    int ipos;
                    if (val == 0)
                    {
                        ipos = 0;
                    }
                    else if ((val & 0x0000ff80) == 0)
                    {
                        ipos = 1;
                    }
                    else if ((val & 0x00ff8000) == 0)
                    {
                        ipos = 2;
                    }
                    else if ((val & 0xff800000) == 0)
                    {
                        ipos = 3;
                    }
                    else
                    {
                        ipos = 4;
                    }

                    //System.out.println(" ipos=" + ipos);
                    int endPos = WriteInt32(delta, tempArr, ipos);
                    //System.out.println(" endpos=" + endPos);
                    if (endPos <= 4)
                    {
                        //System.out.println(" fits!");
                        // value will fit in the integer... move bytes back
                        for (int j = ipos; j < endPos; j++)
                        {
                            val |= (tempArr[j] & 0xff) << (j << 3);
                        }
                        index[doc] = val;
                    }
                    else
                    {
                        // value won't fit... move integer into byte[]
                        for (int j = 0; j < ipos; j++)
                        {
                            tempArr[j] = (sbyte)val;
                            val = (int)((uint)val >> 8);
                        }
                        // point at the end index in the byte[]
                        index[doc] = (endPos << 8) | 1;
                        bytes[doc] = tempArr;
                        tempArr = new sbyte[12];
                    }
                }
            }
            SetActualDocFreq(termNum, actualDF);
        }

        termNum++;
        if (te.Next() == null)
        {
            break;
        }
    }

    m_numTermsInField = termNum;

    long midPoint = Environment.TickCount;

    if (m_termInstances == 0)
    {
        // we didn't invert anything
        // lower memory consumption.
        m_tnums = null;
    }
    else
    {
        this.m_index = index;

        //
        // transform intermediate form into the final form, building a single byte[]
        // at a time, and releasing the intermediate byte[]s as we go to avoid
        // increasing the memory footprint.
        //
        for (int pass = 0; pass < 256; pass++)
        {
            var target = m_tnums[pass];
            var pos = 0; // end in target;
            if (target != null)
            {
                pos = target.Length;
            }
            else
            {
                target = new sbyte[4096];
            }

            // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
            // where pp is the pass (which array we are building), and xx is all values.
            // each pass shares the same byte[] for termNumber lists.
            for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24))
            {
                int lim = Math.Min(docbase + (1 << 16), maxDoc);
                for (int doc = docbase; doc < lim; doc++)
                {
                    //System.out.println("  pass=" + pass + " process docID=" + doc);
                    int val = index[doc];
                    if ((val & 0xff) == 1)
                    {
                        int len = (int)((uint)val >> 8);
                        //System.out.println("    ptr pos=" + pos);
                        index[doc] = (pos << 8) | 1; // change index to point to start of array
                        if ((pos & 0xff000000) != 0)
                        {
                            // we only have 24 bits for the array index
                            throw new InvalidOperationException("Too many values for UnInvertedField faceting on field " + m_field);
                        }
                        var arr = bytes[doc];
                        /*
                         * for(byte b : arr) {
                         *   //System.out.println("      b=" + Integer.toHexString((int) b));
                         * }
                         */
                        bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
                        if (target.Length <= pos + len)
                        {
                            int newlen = target.Length;
                            //* we don't have to worry about the array getting too large
                            // since the "pos" param will overflow first (only 24 bits available)
                            // if ((newlen<<1) <= 0) {
                            //  // overflow...
                            //  newlen = Integer.MAX_VALUE;
                            //  if (newlen <= pos + len) {
                            //    throw new SolrException(400,"Too many terms to uninvert field!");
                            //  }
                            // } else {
                            //  while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                            // }
                            //
                            while (newlen <= pos + len) // doubling strategy
                            {
                                newlen <<= 1;
                            }
                            var newtarget = new sbyte[newlen];
                            Array.Copy(target, 0, newtarget, 0, pos);
                            target = newtarget;
                        }
                        Array.Copy(arr, 0, target, pos, len);
                        pos += len + 1; // skip single byte at end and leave it 0 for terminator
                    }
                }
            }

            // shrink array
            if (pos < target.Length)
            {
                var newtarget = new sbyte[pos];
                Array.Copy(target, 0, newtarget, 0, pos);
                target = newtarget;
            }

            m_tnums[pass] = target;

            if ((pass << 16) > maxDoc)
            {
                break;
            }
        }
    }
    if (indexedTerms != null)
    {
        m_indexedTermsArray = new BytesRef[indexedTerms.Count];
        indexedTerms.CopyTo(m_indexedTermsArray, 0);
    }

    long endTime = Environment.TickCount;

    m_total_time = (int)(endTime - startTime);
    m_phase1_time = (int)(midPoint - startTime);
}
/// <summary> /// Default merge impl: append documents, mapping around /// deletes /// </summary> public virtual TermStats Merge(MergeState mergeState, FieldInfo.IndexOptions? indexOptions, DocsEnum postings, FixedBitSet visitedDocs) { int df = 0; long totTF = 0; if (indexOptions == FieldInfo.IndexOptions.DOCS_ONLY) { while (true) { int doc = postings.NextDoc(); if (doc == DocIdSetIterator.NO_MORE_DOCS) { break; } visitedDocs.Set(doc); this.StartDoc(doc, -1); this.FinishDoc(); df++; } totTF = -1; } else if (indexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS) { while (true) { int doc = postings.NextDoc(); if (doc == DocIdSetIterator.NO_MORE_DOCS) { break; } visitedDocs.Set(doc); int freq = postings.Freq(); this.StartDoc(doc, freq); this.FinishDoc(); df++; totTF += freq; } } else if (indexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { var postingsEnum = (DocsAndPositionsEnum)postings; while (true) { int doc = postingsEnum.NextDoc(); if (doc == DocIdSetIterator.NO_MORE_DOCS) { break; } visitedDocs.Set(doc); int freq = postingsEnum.Freq(); this.StartDoc(doc, freq); totTF += freq; for (int i = 0; i < freq; i++) { int position = postingsEnum.NextPosition(); BytesRef payload = postingsEnum.Payload; this.AddPosition(position, payload, -1, -1); } this.FinishDoc(); df++; } } else { Debug.Assert(indexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); var postingsEnum = (DocsAndPositionsEnum)postings; while (true) { int doc = postingsEnum.NextDoc(); if (doc == DocIdSetIterator.NO_MORE_DOCS) { break; } visitedDocs.Set(doc); int freq = postingsEnum.Freq(); this.StartDoc(doc, freq); totTF += freq; for (int i = 0; i < freq; i++) { int position = postingsEnum.NextPosition(); BytesRef payload = postingsEnum.Payload; this.AddPosition(position, payload, postingsEnum.StartOffset(), postingsEnum.EndOffset()); } this.FinishDoc(); df++; } } return new TermStats(df, indexOptions == FieldInfo.IndexOptions.DOCS_ONLY ? -1 : totTF); }
/// <summary> /// checks docs + freqs, sequentially /// </summary> public virtual void AssertDocsEnum(DocsEnum leftDocs, DocsEnum rightDocs) { if (leftDocs == null) { Assert.IsNull(rightDocs); return; } Assert.AreEqual(-1, leftDocs.DocID()); Assert.AreEqual(-1, rightDocs.DocID()); int docid; while ((docid = leftDocs.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { Assert.AreEqual(docid, rightDocs.NextDoc()); // we don't assert freqs, they are allowed to be different } Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, rightDocs.NextDoc()); }
/// <summary>
/// Merges Reader1 and Reader2 into a new segment via SegmentMerger, then
/// opens the merged segment and verifies doc count, stored fields, postings
/// for TEXT_FIELD_2, term-vector field count, and per-term vector freqs.
/// </summary>
public virtual void TestMerge()
{
    Codec codec = Codec.Default;
    // docCount of -1: unknown until the merge completes
    SegmentInfo si = new SegmentInfo(MergedDir, Constants.LUCENE_MAIN_VERSION, MergedSegment, -1, false, codec, null);

    SegmentMerger merger = new SegmentMerger(Arrays.AsList<AtomicReader>(Reader1, Reader2), si, InfoStream.Default, MergedDir, IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL, CheckAbort.NONE, new FieldInfos.FieldNumbers(), NewIOContext(Random()), true);
    MergeState mergeState = merger.Merge();
    int docsMerged = mergeState.SegmentInfo.DocCount;
    Assert.IsTrue(docsMerged == 2);
    //Should be able to open a new SegmentReader against the new directory
    SegmentReader mergedReader = new SegmentReader(new SegmentCommitInfo(new SegmentInfo(MergedDir, Constants.LUCENE_MAIN_VERSION, MergedSegment, docsMerged, false, codec, null), 0, -1L, -1L), DirectoryReader.DEFAULT_TERMS_INDEX_DIVISOR, NewIOContext(Random()));
    Assert.IsTrue(mergedReader != null);
    Assert.IsTrue(mergedReader.NumDocs == 2);
    Document newDoc1 = mergedReader.Document(0);
    Assert.IsTrue(newDoc1 != null);
    //There are 2 unstored fields on the document
    Assert.IsTrue(DocHelper.NumFields(newDoc1) == DocHelper.NumFields(Doc1) - DocHelper.Unstored.Count);
    Document newDoc2 = mergedReader.Document(1);
    Assert.IsTrue(newDoc2 != null);
    Assert.IsTrue(DocHelper.NumFields(newDoc2) == DocHelper.NumFields(Doc2) - DocHelper.Unstored.Count);

    // the term "field" must still be findable in the merged segment
    DocsEnum termDocs = TestUtil.Docs(Random(), mergedReader, DocHelper.TEXT_FIELD_2_KEY, new BytesRef("field"), MultiFields.GetLiveDocs(mergedReader), null, 0);

    Assert.IsTrue(termDocs != null);
    Assert.IsTrue(termDocs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);

    // count fields carrying term vectors in the merged segment
    int tvCount = 0;
    foreach (FieldInfo fieldInfo in mergedReader.FieldInfos)
    {
        if (fieldInfo.HasVectors)
        {
            tvCount++;
        }
    }

    //System.out.println("stored size: " + stored.Size());
    Assert.AreEqual(3, tvCount, "We do not have 3 fields that were indexed with term vector");

    Terms vector = mergedReader.GetTermVectors(0).GetTerms(DocHelper.TEXT_FIELD_2_KEY);
    Assert.IsNotNull(vector);
    Assert.AreEqual(3, vector.Count);
    TermsEnum termsEnum = vector.GetIterator(null);
    int i = 0;
    while (termsEnum.Next() != null)
    {
        string term = termsEnum.Term.Utf8ToString();
        int freq = (int)termsEnum.TotalTermFreq;
        //System.out.println("Term: " + term + " Freq: " + freq);
        Assert.IsTrue(DocHelper.FIELD_2_TEXT.IndexOf(term) != -1);
        Assert.IsTrue(DocHelper.FIELD_2_FREQS[i] == freq);

        i++;
    }

    TestSegmentReader.CheckNorms(mergedReader);
    mergedReader.Dispose();
}
private int CountDocs(DocsEnum docs) { int count = 0; while ((docs.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { count++; } return count; }
/// <summary> /// checks advancing docs /// </summary> public virtual void AssertDocsSkipping(int docFreq, DocsEnum leftDocs, DocsEnum rightDocs) { if (leftDocs == null) { Assert.IsNull(rightDocs); return; } int docid = -1; int averageGap = MAXDOC / (1 + docFreq); int skipInterval = 16; while (true) { if (Random().NextBoolean()) { // nextDoc() docid = leftDocs.NextDoc(); Assert.AreEqual(docid, rightDocs.NextDoc()); } else { // advance() int skip = docid + (int)Math.Ceiling(Math.Abs(skipInterval + Random().NextDouble() * averageGap)); docid = leftDocs.Advance(skip); Assert.AreEqual(docid, rightDocs.Advance(skip)); } if (docid == DocIdSetIterator.NO_MORE_DOCS) { return; } // we don't assert freqs, they are allowed to be different } }
/// <summary>
/// Indexes random docs of tokens "a".."d" with random position/offset
/// increments (absolute position stashed in each token's Type), then verifies
/// freqs, positions, and offsets through Docs, DocsAndPositions (payloads
/// only), and DocsAndPositions with offsets, against the recorded tokens.
/// </summary>
public virtual void TestRandom()
{
    // token -> docID -> tokens
    IDictionary<string, IDictionary<int?, IList<Token>>> actualTokens = new Dictionary<string, IDictionary<int?, IList<Token>>>();

    Directory dir = NewDirectory();
    RandomIndexWriter w = new RandomIndexWriter(Random, dir, iwc);

    int numDocs = AtLeast(20);
    //final int numDocs = AtLeast(5);

    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);

    // TODO: randomize what IndexOptions we use; also test
    // changing this up in one IW buffered segment...:
    ft.IndexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
    if (Random.NextBoolean())
    {
        ft.StoreTermVectors = true;
        ft.StoreTermVectorOffsets = Random.NextBoolean();
        ft.StoreTermVectorPositions = Random.NextBoolean();
    }

    for (int docCount = 0; docCount < numDocs; docCount++)
    {
        Document doc = new Document();
        doc.Add(new Int32Field("id", docCount, Field.Store.NO));
        IList<Token> tokens = new List<Token>();
        int numTokens = AtLeast(100);
        //final int numTokens = AtLeast(20);
        int pos = -1;
        int offset = 0;
        //System.out.println("doc id=" + docCount);
        for (int tokenCount = 0; tokenCount < numTokens; tokenCount++)
        {
            // pick one of four terms, biased toward "a"
            string text;
            if (Random.NextBoolean())
            {
                text = "a";
            }
            else if (Random.NextBoolean())
            {
                text = "b";
            }
            else if (Random.NextBoolean())
            {
                text = "c";
            }
            else
            {
                text = "d";
            }

            int posIncr = Random.NextBoolean() ? 1 : Random.Next(5);
            // first token of a doc must not have position increment 0
            if (tokenCount == 0 && posIncr == 0)
            {
                posIncr = 1;
            }
            int offIncr = Random.NextBoolean() ? 0 : Random.Next(5);
            int tokenOffset = Random.Next(5);

            Token token = MakeToken(text, posIncr, offset + offIncr, offset + offIncr + tokenOffset);
            if (!actualTokens.TryGetValue(text, out IDictionary<int?, IList<Token>> postingsByDoc))
            {
                actualTokens[text] = postingsByDoc = new Dictionary<int?, IList<Token>>();
            }
            if (!postingsByDoc.TryGetValue(docCount, out IList<Token> postings))
            {
                postingsByDoc[docCount] = postings = new List<Token>();
            }
            postings.Add(token);
            tokens.Add(token);
            pos += posIncr;
            // stuff abs position into type:
            token.Type = "" + pos;
            offset += offIncr + tokenOffset;
            //System.out.println("  " + token + " posIncr=" + token.getPositionIncrement() + " pos=" + pos + " off=" + token.StartOffset + "/" + token.EndOffset + " (freq=" + postingsByDoc.Get(docCount).Size() + ")");
        }
        doc.Add(new Field("content", new CannedTokenStream(tokens.ToArray()), ft));
        w.AddDocument(doc);
    }
    DirectoryReader r = w.GetReader();
    w.Dispose();

    string[] terms = new string[] { "a", "b", "c", "d" };
    foreach (AtomicReaderContext ctx in r.Leaves)
    {
        // TODO: improve this
        AtomicReader sub = (AtomicReader)ctx.Reader;
        //System.out.println("\nsub=" + sub);
        TermsEnum termsEnum = sub.Fields.GetTerms("content").GetEnumerator();
        DocsEnum docs = null;
        DocsAndPositionsEnum docsAndPositions = null;
        DocsAndPositionsEnum docsAndPositionsAndOffsets = null;
        FieldCache.Int32s docIDToID = FieldCache.DEFAULT.GetInt32s(sub, "id", false);
        foreach (string term in terms)
        {
            //System.out.println("  term=" + term);
            if (termsEnum.SeekExact(new BytesRef(term)))
            {
                // 1) docs + freqs only
                docs = termsEnum.Docs(null, docs);
                Assert.IsNotNull(docs);
                int doc;
                //System.out.println("    doc/freq");
                while ((doc = docs.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
                {
                    IList<Token> expected = actualTokens[term][docIDToID.Get(doc)];
                    //System.out.println("      doc=" + docIDToID.Get(doc) + " docID=" + doc + " " + expected.Size() + " freq");
                    Assert.IsNotNull(expected);
                    Assert.AreEqual(expected.Count, docs.Freq);
                }

                // 2) positions (explicitly exclude offsets here)
                docsAndPositions = termsEnum.DocsAndPositions(null, docsAndPositions, DocsAndPositionsFlags.PAYLOADS);
                Assert.IsNotNull(docsAndPositions);
                //System.out.println("    doc/freq/pos");
                while ((doc = docsAndPositions.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
                {
                    IList<Token> expected = actualTokens[term][docIDToID.Get(doc)];
                    //System.out.println("      doc=" + docIDToID.Get(doc) + " " + expected.Size() + " freq");
                    Assert.IsNotNull(expected);
                    Assert.AreEqual(expected.Count, docsAndPositions.Freq);
                    foreach (Token token in expected)
                    {
                        int pos = Convert.ToInt32(token.Type); // abs position was stashed in Type above
                        //System.out.println("        pos=" + pos);
                        Assert.AreEqual(pos, docsAndPositions.NextPosition());
                    }
                }

                // 3) positions + offsets
                docsAndPositionsAndOffsets = termsEnum.DocsAndPositions(null, docsAndPositions);
                Assert.IsNotNull(docsAndPositionsAndOffsets);
                //System.out.println("    doc/freq/pos/offs");
                while ((doc = docsAndPositionsAndOffsets.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
                {
                    IList<Token> expected = actualTokens[term][docIDToID.Get(doc)];
                    //System.out.println("      doc=" + docIDToID.Get(doc) + " " + expected.Size() + " freq");
                    Assert.IsNotNull(expected);
                    Assert.AreEqual(expected.Count, docsAndPositionsAndOffsets.Freq);
                    foreach (Token token in expected)
                    {
                        int pos = Convert.ToInt32(token.Type);
                        //System.out.println("        pos=" + pos);
                        Assert.AreEqual(pos, docsAndPositionsAndOffsets.NextPosition());
                        Assert.AreEqual(token.StartOffset, docsAndPositionsAndOffsets.StartOffset);
                        Assert.AreEqual(token.EndOffset, docsAndPositionsAndOffsets.EndOffset);
                    }
                }
            }
        }
        // TODO: test advance:
    }
    r.Dispose();
    dir.Dispose();
}
/// <summary>
/// Indexes docs of 199 random ints each while recording the per-doc frequency
/// of one randomly chosen term, then repeatedly walks the postings per leaf
/// (mixing NextDoc and Advance) and verifies doc IDs and freqs against the
/// recorded counts.
/// </summary>
public virtual void TestRandomDocs()
{
    Directory dir = NewDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(Random, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetMergePolicy(NewLogMergePolicy()));
    int numDocs = AtLeast(49);
    int max = 15678;
    int term = Random.Next(max); // the single term whose postings we will verify
    int[] freqInDoc = new int[numDocs]; // expected freq of 'term' in each doc
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.OmitNorms = true;
    for (int i = 0; i < numDocs; i++)
    {
        Document doc = new Document();
        StringBuilder builder = new StringBuilder();
        for (int j = 0; j < 199; j++)
        {
            int nextInt = Random.Next(max);
            builder.Append(nextInt).Append(' ');
            if (nextInt == term)
            {
                freqInDoc[i]++;
            }
        }
        doc.Add(NewField(FieldName, builder.ToString(), customType));
        writer.AddDocument(doc);
    }

    IndexReader reader = writer.GetReader();
    writer.Dispose();

    int num = AtLeast(13);
    for (int i = 0; i < num; i++)
    {
        BytesRef bytes = new BytesRef("" + term);
        IndexReaderContext topReaderContext = reader.Context;
        foreach (AtomicReaderContext context in topReaderContext.Leaves)
        {
            int maxDoc = context.AtomicReader.MaxDoc;
            DocsEnum docsEnum = TestUtil.Docs(Random, context.Reader, FieldName, bytes, null, null, DocsFlags.FREQS);
            // if no doc in this leaf contains the term, the enum must be null
            if (FindNext(freqInDoc, context.DocBase, context.DocBase + maxDoc) == int.MaxValue)
            {
                Assert.IsNull(docsEnum);
                continue;
            }
            Assert.IsNotNull(docsEnum);
            docsEnum.NextDoc();
            for (int j = 0; j < maxDoc; j++)
            {
                if (freqInDoc[context.DocBase + j] != 0)
                {
                    Assert.AreEqual(j, docsEnum.DocID);
                    Assert.AreEqual(docsEnum.Freq, freqInDoc[context.DocBase + j]);
                    // on even outer iterations, occasionally test Advance() instead of NextDoc()
                    if (i % 2 == 0 && Random.Next(10) == 0)
                    {
                        int next = FindNext(freqInDoc, context.DocBase + j + 1, context.DocBase + maxDoc) - context.DocBase;
                        int advancedTo = docsEnum.Advance(next);
                        if (next >= maxDoc)
                        {
                            Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, advancedTo);
                        }
                        else
                        {
                            Assert.IsTrue(next >= advancedTo, "advanced to: " + advancedTo + " but should be <= " + next);
                        }
                    }
                    else
                    {
                        docsEnum.NextDoc();
                    }
                }
            }
            Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, docsEnum.DocID, "DocBase: " + context.DocBase + " maxDoc: " + maxDoc + " " + docsEnum.GetType());
        }
    }

    reader.Dispose();
    dir.Dispose();
}
/// <summary>
/// Asserts that two Fields instances are equivalent: same fields in the same
/// order, same terms with same total term freqs, and matching postings
/// (freqs, positions, and offsets where available). Doc IDs themselves are
/// NOT compared — the two sides may number docs differently.
/// </summary>
public static void VerifyEquals(Fields d1, Fields d2)
{
    if (d1 == null)
    {
        Assert.IsTrue(d2 == null || d2.Count == 0);
        return;
    }
    Assert.IsTrue(d2 != null);

    IEnumerator<string> fieldsEnum2 = d2.GetEnumerator();

    foreach (string field1 in d1)
    {
        fieldsEnum2.MoveNext();
        string field2 = fieldsEnum2.Current;
        Assert.AreEqual(field1, field2);

        Terms terms1 = d1.GetTerms(field1);
        Assert.IsNotNull(terms1);
        TermsEnum termsEnum1 = terms1.GetIterator(null);

        Terms terms2 = d2.GetTerms(field2);
        Assert.IsNotNull(terms2);
        TermsEnum termsEnum2 = terms2.GetIterator(null);

        DocsAndPositionsEnum dpEnum1 = null;
        DocsAndPositionsEnum dpEnum2 = null;
        DocsEnum dEnum1 = null;
        DocsEnum dEnum2 = null;

        BytesRef term1;
        while ((term1 = termsEnum1.Next()) != null)
        {
            BytesRef term2 = termsEnum2.Next();
            Assert.AreEqual(term1, term2);
            Assert.AreEqual(termsEnum1.TotalTermFreq, termsEnum2.TotalTermFreq);

            dpEnum1 = termsEnum1.DocsAndPositions(null, dpEnum1);
            dpEnum2 = termsEnum2.DocsAndPositions(null, dpEnum2);
            if (dpEnum1 != null)
            {
                // positions are available on both sides: compare freqs,
                // positions, and (when present) offsets per position
                Assert.IsNotNull(dpEnum2);
                int docID1 = dpEnum1.NextDoc();
                dpEnum2.NextDoc();
                // docIDs are not supposed to be equal
                //int docID2 = dpEnum2.NextDoc();
                //Assert.AreEqual(docID1, docID2);
                Assert.IsTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);

                int freq1 = dpEnum1.Freq;
                int freq2 = dpEnum2.Freq;
                Assert.AreEqual(freq1, freq2);
                IOffsetAttribute offsetAtt1 = dpEnum1.Attributes.HasAttribute<IOffsetAttribute>() ? dpEnum1.Attributes.GetAttribute<IOffsetAttribute>() : null;
                IOffsetAttribute offsetAtt2 = dpEnum2.Attributes.HasAttribute<IOffsetAttribute>() ? dpEnum2.Attributes.GetAttribute<IOffsetAttribute>() : null;

                // either both enums expose offsets, or neither does
                if (offsetAtt1 != null)
                {
                    Assert.IsNotNull(offsetAtt2);
                }
                else
                {
                    Assert.IsNull(offsetAtt2);
                }

                for (int posUpto = 0; posUpto < freq1; posUpto++)
                {
                    int pos1 = dpEnum1.NextPosition();
                    int pos2 = dpEnum2.NextPosition();
                    Assert.AreEqual(pos1, pos2);
                    if (offsetAtt1 != null)
                    {
                        Assert.AreEqual(offsetAtt1.StartOffset, offsetAtt2.StartOffset);
                        Assert.AreEqual(offsetAtt1.EndOffset, offsetAtt2.EndOffset);
                    }
                }
                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum1.NextDoc());
                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum2.NextDoc());
            }
            else
            {
                // no positions: fall back to docs + freqs comparison
                dEnum1 = TestUtil.Docs(Random, termsEnum1, null, dEnum1, DocsFlags.FREQS);
                dEnum2 = TestUtil.Docs(Random, termsEnum2, null, dEnum2, DocsFlags.FREQS);
                Assert.IsNotNull(dEnum1);
                Assert.IsNotNull(dEnum2);
                int docID1 = dEnum1.NextDoc();
                dEnum2.NextDoc();
                // docIDs are not supposed to be equal
                //int docID2 = dEnum2.NextDoc();
                //Assert.AreEqual(docID1, docID2);
                Assert.IsTrue(docID1 != DocIdSetIterator.NO_MORE_DOCS);
                int freq1 = dEnum1.Freq;
                int freq2 = dEnum2.Freq;
                Assert.AreEqual(freq1, freq2);
                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dEnum1.NextDoc());
                Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dEnum2.NextDoc());
            }
        }

        Assert.IsNull(termsEnum2.Next());
    }
    Assert.IsFalse(fieldsEnum2.MoveNext());
}
/// <summary>
/// Verifies a single-doc term vector against the RandomTokenStream that
/// produced it: term count/stats, each term's freq, and (when stored)
/// positions, offsets, and payloads. Randomly reuses cached enums
/// (this.termsEnum / this.docsEnum / this.docsAndPositionsEnum) to stress
/// enum-reuse paths, and finishes with random SeekExact/SeekCeil probes.
/// </summary>
protected virtual void AssertEquals(RandomTokenStream tk, FieldType ft, Terms terms)
{
    Assert.AreEqual(1, terms.DocCount);
    int termCount = new JCG.HashSet<string>(tk.terms).Count;
    Assert.AreEqual((long)termCount, terms.Count); // LUCENENET specific - cast required because types don't match (xUnit checks this)
    Assert.AreEqual((long)termCount, terms.SumDocFreq); // LUCENENET specific - cast required because types don't match (xUnit checks this)
    Assert.AreEqual(ft.StoreTermVectorPositions, terms.HasPositions);
    Assert.AreEqual(ft.StoreTermVectorOffsets, terms.HasOffsets);
    Assert.AreEqual(ft.StoreTermVectorPayloads && tk.HasPayloads(), terms.HasPayloads);
    ISet<BytesRef> uniqueTerms = new JCG.HashSet<BytesRef>();
    foreach (string term in tk.freqs.Keys)
    {
        uniqueTerms.Add(new BytesRef(term));
    }
    BytesRef[] sortedTerms = uniqueTerms.ToArray(/*new BytesRef[0]*/);
    Array.Sort(sortedTerms, terms.Comparer);
    // randomly reuse the cached enum to exercise reset/rewind paths
    TermsEnum termsEnum = terms.GetEnumerator(Random.NextBoolean() ? null : this.termsEnum.Value);
    this.termsEnum.Value = termsEnum;
    for (int i = 0; i < sortedTerms.Length; ++i)
    {
        Assert.IsTrue(termsEnum.MoveNext());
        Assert.AreEqual(sortedTerms[i], termsEnum.Term);
        Assert.AreEqual(1, termsEnum.DocFreq);

        // with doc 0 filtered out by live-docs bits, the enum must be empty
        FixedBitSet bits = new FixedBitSet(1);
        DocsEnum docsEnum = termsEnum.Docs(bits, Random.NextBoolean() ? null : this.docsEnum.Value);
        Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsEnum.NextDoc());
        bits.Set(0);

        // with doc 0 live (or no filter), the enum must yield exactly doc 0
        docsEnum = termsEnum.Docs(Random.NextBoolean() ? bits : null, Random.NextBoolean() ? null : docsEnum);
        Assert.IsNotNull(docsEnum);
        Assert.AreEqual(0, docsEnum.NextDoc());
        Assert.AreEqual(0, docsEnum.DocID);
        Assert.AreEqual(tk.freqs[termsEnum.Term.Utf8ToString()], docsEnum.Freq);
        Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsEnum.NextDoc());
        this.docsEnum.Value = docsEnum;

        bits.Clear(0);
        DocsAndPositionsEnum docsAndPositionsEnum = termsEnum.DocsAndPositions(bits, Random.NextBoolean() ? null : this.docsAndPositionsEnum.Value);
        // positions enum exists iff offsets or positions were stored
        Assert.AreEqual(ft.StoreTermVectorOffsets || ft.StoreTermVectorPositions, docsAndPositionsEnum != null);
        if (docsAndPositionsEnum != null)
        {
            Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.NextDoc());
        }
        bits.Set(0);

        docsAndPositionsEnum = termsEnum.DocsAndPositions(Random.NextBoolean() ? bits : null, Random.NextBoolean() ? null : docsAndPositionsEnum);
        Assert.AreEqual(ft.StoreTermVectorOffsets || ft.StoreTermVectorPositions, docsAndPositionsEnum != null);
        if (terms.HasPositions || terms.HasOffsets)
        {
            Assert.AreEqual(0, docsAndPositionsEnum.NextDoc());
            int freq = docsAndPositionsEnum.Freq;
            Assert.AreEqual(tk.freqs[termsEnum.Term.Utf8ToString()], freq);
            if (docsAndPositionsEnum != null)
            {
                for (int k = 0; k < freq; ++k)
                {
                    int position = docsAndPositionsEnum.NextPosition();
                    // resolve which source token(s) this position/offset can belong to
                    ISet<int> indexes;
                    if (terms.HasPositions)
                    {
                        indexes = tk.positionToTerms[position];
                        Assert.IsNotNull(indexes);
                    }
                    else
                    {
                        indexes = tk.startOffsetToTerms[docsAndPositionsEnum.StartOffset];
                        Assert.IsNotNull(indexes);
                    }
                    if (terms.HasPositions)
                    {
                        bool foundPosition = false;
                        foreach (int index in indexes)
                        {
                            if (tk.termBytes[index].Equals(termsEnum.Term) && tk.positions[index] == position)
                            {
                                foundPosition = true;
                                break;
                            }
                        }
                        Assert.IsTrue(foundPosition);
                    }
                    if (terms.HasOffsets)
                    {
                        bool foundOffset = false;
                        foreach (int index in indexes)
                        {
                            if (tk.termBytes[index].Equals(termsEnum.Term) && tk.startOffsets[index] == docsAndPositionsEnum.StartOffset && tk.endOffsets[index] == docsAndPositionsEnum.EndOffset)
                            {
                                foundOffset = true;
                                break;
                            }
                        }
                        Assert.IsTrue(foundOffset);
                    }
                    if (terms.HasPayloads)
                    {
                        bool foundPayload = false;
                        foreach (int index in indexes)
                        {
                            if (tk.termBytes[index].Equals(termsEnum.Term) && Equals(tk.payloads[index], docsAndPositionsEnum.GetPayload()))
                            {
                                foundPayload = true;
                                break;
                            }
                        }
                        Assert.IsTrue(foundPayload);
                    }
                }
                // reading past freq positions must fail
                try
                {
                    docsAndPositionsEnum.NextPosition();
                    Assert.Fail();
                }
                catch (Exception e) when (e.IsException())
                {
                    // ok
                }
                catch (Exception e) when (e.IsAssertionError())
                {
                    // ok
                }
            }
            Assert.AreEqual(DocsEnum.NO_MORE_DOCS, docsAndPositionsEnum.NextDoc());
        }
        this.docsAndPositionsEnum.Value = docsAndPositionsEnum;
    }
    Assert.IsFalse(termsEnum.MoveNext());
    // random seek probes: every indexed term must be found
    for (int i = 0; i < 5; ++i)
    {
        if (Random.NextBoolean())
        {
            Assert.IsTrue(termsEnum.SeekExact(RandomPicks.RandomFrom(Random, tk.termBytes)));
        }
        else
        {
            Assert.AreEqual(SeekStatus.FOUND, termsEnum.SeekCeil(RandomPicks.RandomFrom(Random, tk.termBytes)));
        }
    }
}