/// <summary>
/// For each iteration, builds an automaton from a random regexp and then, visiting the
/// known terms in shuffled order, seeks a newly created enum to every term the automaton
/// accepts, choosing randomly between SeekExact and SeekCeil.
/// </summary>
public virtual void TestSeeking()
{
    for (int round = 0; round < numIterations; round++)
    {
        string regexp = AutomatonTestUtil.RandomRegexp(Random);
        Automaton acceptor = new RegExp(regexp, RegExpSyntax.NONE).ToAutomaton();
        TermsEnum termsEnum = MultiFields.GetTerms(reader, "field").GetIterator(null);

        IList<BytesRef> shuffled = new List<BytesRef>(terms);
        shuffled.Shuffle(Random);

        foreach (BytesRef candidate in shuffled)
        {
            if (!BasicOperations.Run(acceptor, candidate.Utf8ToString()))
            {
                // Rejected by the automaton: nothing to check for this term.
                continue;
            }

            if (Random.NextBoolean())
            {
                // Exact seek must land on the term.
                Assert.IsTrue(termsEnum.SeekExact(candidate));
            }
            else
            {
                // Ceiling seek must report FOUND and be positioned on the term.
                Assert.AreEqual(SeekStatus.FOUND, termsEnum.SeekCeil(candidate));
                Assert.AreEqual(candidate, termsEnum.Term);
            }
        }
    }
}
/// <summary>
/// Walks the known terms in order and, for each one, randomly reaches it via Next(),
/// SeekCeil() or SeekExact(), asserting that the enum ends up on that term.
/// </summary>
public virtual void TestSeekingAndNexting()
{
    for (int round = 0; round < numIterations; round++)
    {
        TermsEnum termsEnum = MultiFields.GetTerms(reader, "field").GetIterator(null);

        foreach (BytesRef expected in terms)
        {
            switch (Random.Next(3))
            {
                case 0:
                    Assert.AreEqual(expected, termsEnum.Next());
                    break;

                case 1:
                    Assert.AreEqual(SeekStatus.FOUND, termsEnum.SeekCeil(expected));
                    Assert.AreEqual(expected, termsEnum.Term);
                    break;

                default:
                    Assert.IsTrue(termsEnum.SeekExact(expected));
                    break;
            }
        }
    }
}
/// <summary>
/// Opens a reader on <c>dir</c> and checks the first two terms of the "content" field:
/// 'aaa' with a doc frequency of 200 followed by 'bbb' with a doc frequency of 100.
/// The pair is checked twice: once stepping from the start of the enumeration and once
/// after a ceiling seek to 'aaa'.
/// </summary>
private void VerifyDocFreq()
{
    // The using block releases the reader even when one of the assertions below throws.
    using (IndexReader reader = DirectoryReader.Open(dir))
    {
        TermsEnum termEnum = MultiFields.GetTerms(reader, "content").GetEnumerator();

        // Step to the first term; fail with an assertion (not a null dereference)
        // if the enumeration is unexpectedly empty.
        Assert.IsTrue(termEnum.MoveNext());
        Assert.AreEqual("aaa", termEnum.Term.Utf8ToString());
        Assert.AreEqual(200, termEnum.DocFreq);

        // Step to the second term.
        Assert.IsTrue(termEnum.MoveNext());
        Assert.AreEqual("bbb", termEnum.Term.Utf8ToString());
        Assert.AreEqual(100, termEnum.DocFreq);

        // Reposition at the first term >= 'aaa', which should be 'aaa' itself.
        termEnum.SeekCeil(new BytesRef("aaa"));
        Assert.AreEqual("aaa", termEnum.Term.Utf8ToString());
        Assert.AreEqual(200, termEnum.DocFreq);

        // Stepping after the seek must reach 'bbb' again.
        Assert.IsTrue(termEnum.MoveNext());
        Assert.AreEqual("bbb", termEnum.Term.Utf8ToString());
        Assert.AreEqual(100, termEnum.DocFreq);
    }
}
/// <summary>
/// Enumerates every term of field "f" in <paramref name="r"/>, asserting that terms arrive
/// in strictly increasing order and that each one is a member of <paramref name="allTerms"/>.
/// Afterwards every term that was seen is looked up again with SeekCeil.
/// </summary>
/// <param name="r">Reader whose "f" field is inspected.</param>
/// <param name="allTerms">Every term string that was added to the index.</param>
/// <param name="isTop">When true, the seen terms must equal <paramref name="allTerms"/> exactly.</param>
private void CheckTermsOrder(IndexReader r, ISet<string> allTerms, bool isTop)
{
    TermsEnum terms = MultiFields.GetFields(r).GetTerms("f").GetEnumerator();

    BytesRef last = new BytesRef();
    ISet<string> seenTerms = new JCG.HashSet<string>();

    while (terms.MoveNext())
    {
        BytesRef term = terms.Term;

        // Strict ordering: the previous term must compare below the current one.
        Assert.IsTrue(last.CompareTo(term) < 0);
        last.CopyBytes(term);

        string s = term.Utf8ToString();
        Assert.IsTrue(allTerms.Contains(s), "term " + TermDesc(s) + " was not added to index (count=" + allTerms.Count + ")");
        seenTerms.Add(s);
    }

    if (isTop)
    {
        Assert.IsTrue(allTerms.SetEquals(seenTerms));
    }

    // Test seeking. foreach is used instead of a hand-driven IEnumerator so the
    // enumerator is disposed when the loop ends or an assertion throws.
    foreach (string seen in seenTerms)
    {
        BytesRef tr = new BytesRef(seen);
        Assert.AreEqual(TermsEnum.SeekStatus.FOUND, terms.SeekCeil(tr), "seek failed for term=" + TermDesc(tr.Utf8ToString()));
    }
}
/// <summary>
/// Advances the wrapped enum until <c>Accept</c> approves a term. When <c>doSeek</c> is set,
/// the wrapped enum is repositioned with a ceiling seek to the term supplied by
/// <c>NextSeekTerm</c>; otherwise it is simply stepped forward.
/// </summary>
/// <returns>
/// The accepted term, or <c>null</c> when there is no seek target, the wrapped enum is
/// exhausted, or <c>Accept</c> signals END.
/// </returns>
public override BytesRef Next()
{
    while (true)
    {
        if (!doSeek)
        {
            // Plain step forward.
            actualTerm = tenum.Next();
            if (actualTerm == null)
            {
                // Wrapped enum exhausted.
                return null;
            }
        }
        else
        {
            doSeek = false;
            BytesRef seekTarget = NextSeekTerm(actualTerm);

            // Seeks must always move forward relative to the current term.
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(actualTerm == null || seekTarget == null || Comparer.Compare(seekTarget, actualTerm) > 0, () => "curTerm=" + actualTerm + " seekTerm=" + seekTarget);
            }

            if (seekTarget == null || tenum.SeekCeil(seekTarget) == SeekStatus.END)
            {
                // Nothing left to seek to, or the seek ran off the end.
                return null;
            }

            actualTerm = tenum.Term;
        }

        FilteredTermsEnum.AcceptStatus verdict = Accept(actualTerm);

        if (verdict == FilteredTermsEnum.AcceptStatus.YES_AND_SEEK)
        {
            // Accepted, but the next call has to seek first.
            doSeek = true;
            return actualTerm;
        }

        if (verdict == FilteredTermsEnum.AcceptStatus.YES)
        {
            return actualTerm;
        }

        if (verdict == FilteredTermsEnum.AcceptStatus.END)
        {
            // The filter asked for the enumeration to stop.
            return null;
        }

        if (verdict == FilteredTermsEnum.AcceptStatus.NO_AND_SEEK)
        {
            // Rejected; seek on the next pass of the loop.
            doSeek = true;
        }

        // Any other status: term rejected, keep looping.
    }
}
/// <summary>
/// Indexes three small documents, copies them into a second directory through a
/// <c>TestReader</c> wrapper, and then asserts on the copy that every term of the
/// "default" field contains the letter 'e' and that every document listed for the term
/// "one" has an odd doc ID.
/// </summary>
public virtual void TestFilterIndexReader()
{
    Directory directory = NewDirectory();
    IndexWriter writer = new IndexWriter(directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)));

    foreach (string text in new string[] { "one two", "one three", "two four" })
    {
        Document doc = new Document();
        doc.Add(NewTextField("default", text, Field.Store.YES));
        writer.AddDocument(doc);
    }
    writer.Dispose();

    Directory target = NewDirectory();

    // The postings are deliberately altered, so the term-vector cross check could fail.
    ((BaseDirectoryWrapper)target).CrossCheckTermVectorsOnDispose = false;

    writer = new IndexWriter(target, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)));
    IndexReader reader = new TestReader(DirectoryReader.Open(directory));
    writer.AddIndexes(reader);
    writer.Dispose();
    reader.Dispose();

    reader = DirectoryReader.Open(target);

    TermsEnum terms = MultiFields.GetTerms(reader, "default").GetEnumerator();
    while (terms.MoveNext())
    {
        string termText = terms.Term.Utf8ToString();
        Assert.IsTrue(termText.IndexOf('e') != -1);
    }

    Assert.AreEqual(TermsEnum.SeekStatus.FOUND, terms.SeekCeil(new BytesRef("one")));

    DocsAndPositionsEnum positions = terms.DocsAndPositions(MultiFields.GetLiveDocs(reader), null);
    while (positions.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
    {
        Assert.IsTrue((positions.DocID % 2) == 1);
    }

    reader.Dispose();
    directory.Dispose();
    target.Dispose();
}
/// <summary>
/// For every old index, checks that the "content" field exposes exactly one term ("aaa")
/// and that Next() keeps returning null after the enum has been positioned by SeekCeil,
/// whether the seek reported FOUND, NOT_FOUND or END.
/// </summary>
public virtual void TestNextIntoWrongField()
{
    foreach (string indexName in OldNames)
    {
        Directory dir = OldIndexDirs[indexName];
        IndexReader r = DirectoryReader.Open(dir);
        TermsEnum terms = MultiFields.GetFields(r).Terms("content").Iterator(null);

        // The content field only has the term aaa.
        BytesRef first = terms.Next();
        Assert.IsNotNull(first);
        Assert.AreEqual("aaa", first.Utf8ToString());
        Assert.IsNull(terms.Next());

        BytesRef aaaTerm = new BytesRef("aaa");
        DocsEnum docs;

        // Exact hit.
        Assert.AreEqual(TermsEnum.SeekStatus.FOUND, terms.SeekCeil(aaaTerm));
        docs = TestUtil.Docs(Random(), terms, null, null, DocsEnum.FLAG_NONE);
        Assert.AreEqual(35, CountDocs(docs));
        Assert.IsNull(terms.Next());

        // Seeking past the only term hits the end of the field.
        Assert.AreEqual(TermsEnum.SeekStatus.END, terms.SeekCeil(new BytesRef("bbb")));
        Assert.IsNull(terms.Next());

        // Seeking before the only term lands on aaa.
        Assert.AreEqual(TermsEnum.SeekStatus.NOT_FOUND, terms.SeekCeil(new BytesRef("a")));
        Assert.IsTrue(terms.Term().BytesEquals(aaaTerm));
        docs = TestUtil.Docs(Random(), terms, null, null, DocsEnum.FLAG_NONE);
        Assert.AreEqual(35, CountDocs(docs));
        Assert.IsNull(terms.Next());

        // Exact hit once more.
        Assert.AreEqual(TermsEnum.SeekStatus.FOUND, terms.SeekCeil(aaaTerm));
        docs = TestUtil.Docs(Random(), terms, null, null, DocsEnum.FLAG_NONE);
        Assert.AreEqual(35, CountDocs(docs));
        Assert.IsNull(terms.Next());

        r.Dispose();
    }
}
/// <summary>
/// Splits <c>input</c> into three directories (passing <c>true</c> as the final Split
/// argument) and checks, for each part, that its document count is within one of
/// NUM_DOCS / 3 and that its first document carries the expected running "id".
/// The last part is additionally checked not to contain the id NUM_DOCS - 1.
/// </summary>
public void TestSplitSeq()
{
    MultiPassIndexSplitter splitter = new MultiPassIndexSplitter();
    Directory[] dirs = new Directory[] { NewDirectory(), NewDirectory(), NewDirectory() };
    try
    {
        splitter.Split(TEST_VERSION_CURRENT, input, dirs, true);

        Document doc;
        int start;
        IndexReader ir;

        using (ir = DirectoryReader.Open(dirs[0]))
        {
            assertTrue(ir.NumDocs - NUM_DOCS / 3 <= 1);
            doc = ir.Document(0);
            assertEquals("0", doc.Get("id"));
            start = ir.NumDocs;
        }

        using (ir = DirectoryReader.Open(dirs[1]))
        {
            assertTrue(ir.NumDocs - NUM_DOCS / 3 <= 1);
            doc = ir.Document(0);
            assertEquals(start + "", doc.Get("id"));
            start += ir.NumDocs;
        }

        using (ir = DirectoryReader.Open(dirs[2]))
        {
            assertTrue(ir.NumDocs - NUM_DOCS / 3 <= 1);
            doc = ir.Document(0);
            assertEquals(start + "", doc.Get("id"));

            // make sure the deleted doc is not here
            TermsEnum te = MultiFields.GetTerms(ir, "id").GetIterator(null);
            Term t = new Term("id", (NUM_DOCS - 1) + "");
            assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.SeekCeil(new BytesRef(t.Text())));

            // Compare by value. The previous reference comparison (assertNotSame) on two
            // freshly built strings could never fail, so it verified nothing.
            assertTrue(!string.Equals(t.Text(), te.Term.Utf8ToString()));
        }
    }
    finally
    {
        foreach (Directory d in dirs)
        {
            d.Dispose();
        }
    }
}
/// <summary>
/// Looks up <paramref name="key"/> with a ceiling seek on the underlying terms enum.
/// </summary>
/// <returns>
/// The enum's ordinal when the seek reports FOUND; otherwise <c>-ord - 1</c>, where
/// <c>ord</c> is the enum's ordinal after the seek.
/// </returns>
public override long LookupTerm(BytesRef key)
{
    try
    {
        bool exactMatch = te.SeekCeil(key) == SeekStatus.FOUND;
        return exactMatch ? te.Ord : -te.Ord - 1;
    }
    catch (Exception e) when (e.IsIOException())
    {
        // I/O failures are rethrown wrapped by RuntimeException.Create.
        throw RuntimeException.Create(e);
    }
}
/// <summary>
/// Looks up <paramref name="key"/> with a ceiling seek on the underlying terms enum.
/// </summary>
/// <returns>
/// The enum's ordinal when the seek reports FOUND; otherwise <c>-ord - 1</c>, where
/// <c>ord</c> is the enum's ordinal after the seek.
/// </returns>
public override long LookupTerm(BytesRef key)
{
    try
    {
        if (Te.SeekCeil(key) != SeekStatus.FOUND)
        {
            // Not an exact match: encode the ceiling position as a negative value.
            return -Te.Ord() - 1;
        }

        return Te.Ord();
    }
    catch (System.IO.IOException e)
    {
        throw new Exception(e.Message, e);
    }
}
/// <summary>
/// Looks up <paramref name="key"/> with a ceiling seek on the underlying terms enum.
/// </summary>
/// <returns>
/// The enum's ordinal when the seek reports FOUND; otherwise <c>-ord - 1</c>, where
/// <c>ord</c> is the enum's ordinal after the seek.
/// </returns>
public override long LookupTerm(BytesRef key)
{
    try
    {
        SeekStatus outcome = te.SeekCeil(key);
        if (outcome == SeekStatus.FOUND)
        {
            return te.Ord;
        }

        // Not an exact match: encode the ceiling position as a negative value.
        return -te.Ord - 1;
    }
    catch (IOException e)
    {
        throw new Exception(e.ToString(), e);
    }
}
/// <summary>
/// Builds an index via <c>AddDocs</c>, then repeatedly seeks to the term "val" of field
/// "foo" and walks its postings, summing the doc IDs encountered. Elapsed times for index
/// creation and for the iteration phase are printed when VERBOSE is set.
/// </summary>
/// <param name="iter">Number of seek-and-iterate passes.</param>
/// <param name="ndocs">Forwarded to <c>AddDocs</c>; also used in the VERBOSE message.</param>
/// <param name="maxTF">Forwarded to <c>AddDocs</c>.</param>
/// <param name="percentDocs">Forwarded to <c>AddDocs</c>.</param>
/// <returns>The sum of all doc IDs visited across every pass.</returns>
public virtual int DoTest(int iter, int ndocs, int maxTF, float percentDocs)
{
    // Both the directory and the reader were previously left open; the using blocks
    // release them (reader first, then directory) on every exit path.
    using (Directory dir = NewDirectory())
    {
        long start = Environment.TickCount;
        AddDocs(Random(), dir, ndocs, "foo", "val", maxTF, percentDocs);
        long end = Environment.TickCount;
        if (VERBOSE)
        {
            Console.WriteLine("milliseconds for creation of " + ndocs + " docs = " + (end - start));
        }

        using (IndexReader reader = DirectoryReader.Open(dir))
        {
            TermsEnum tenum = MultiFields.GetTerms(reader, "foo").GetIterator(null);

            start = Environment.TickCount;

            int ret = 0;
            DocsEnum tdocs = null;
            Random random = new Random(Random().Next());
            for (int i = 0; i < iter; i++)
            {
                tenum.SeekCeil(new BytesRef("val"));
                // The previous DocsEnum is handed back in as the reuse argument.
                tdocs = TestUtil.Docs(random, tenum, MultiFields.GetLiveDocs(reader), tdocs, DocsFlags.NONE);
                while (tdocs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                {
                    ret += tdocs.DocID;
                }
            }

            end = Environment.TickCount;
            if (VERBOSE)
            {
                Console.WriteLine("milliseconds for " + iter + " TermDocs iteration: " + (end - start));
            }

            return ret;
        }
    }
}
/// <summary>
/// Builds an index via <c>AddDocs</c>, then repeatedly seeks to the term "val" of field
/// "foo" and walks its postings, summing the doc IDs encountered. Elapsed times for index
/// creation and for the iteration phase are printed when Verbose is set.
/// </summary>
/// <param name="iter">Number of seek-and-iterate passes.</param>
/// <param name="ndocs">Forwarded to <c>AddDocs</c>; also used in the Verbose message.</param>
/// <param name="maxTF">Forwarded to <c>AddDocs</c>.</param>
/// <param name="percentDocs">Forwarded to <c>AddDocs</c>.</param>
/// <returns>The sum of all doc IDs visited across every pass.</returns>
public virtual int DoTest(int iter, int ndocs, int maxTF, float percentDocs)
{
    // Both the directory and the reader were previously left open; the using blocks
    // release them (reader first, then directory) on every exit path.
    using (Directory dir = NewDirectory())
    {
        // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results
        long start = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond;
        AddDocs(LuceneTestCase.Random, dir, ndocs, "foo", "val", maxTF, percentDocs);
        long end = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond;
        if (Verbose)
        {
            Console.WriteLine("milliseconds for creation of " + ndocs + " docs = " + (end - start));
        }

        using (IndexReader reader = DirectoryReader.Open(dir))
        {
            TermsEnum tenum = MultiFields.GetTerms(reader, "foo").GetEnumerator();

            start = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond;

            int ret = 0;
            DocsEnum tdocs = null;
            Random random = new Random(Random.Next());
            for (int i = 0; i < iter; i++)
            {
                tenum.SeekCeil(new BytesRef("val"));
                // The previous DocsEnum is handed back in as the reuse argument.
                tdocs = TestUtil.Docs(random, tenum, MultiFields.GetLiveDocs(reader), tdocs, DocsFlags.NONE);
                while (tdocs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
                {
                    ret += tdocs.DocID;
                }
            }

            end = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond;
            if (Verbose)
            {
                Console.WriteLine("milliseconds for " + iter + " TermDocs iteration: " + (end - start));
            }

            return ret;
        }
    }
}
/// <summary>
/// Picks terms at random from <paramref name="terms"/> (ten picks per term in the list)
/// and checks each one twice: a TermQuery search must report at least one hit, and a
/// ceiling seek on the "field" terms enum must report FOUND. Problems are logged as they
/// occur and the test fails at the end if any were seen.
/// </summary>
private void TestSavedTerms(IndexReader r, IList<BytesRef> terms)
{
    Console.WriteLine("TEST: run " + terms.Count + " terms on reader=" + r);

    IndexSearcher searcher = NewSearcher(r);
    terms = CollectionsHelper.Shuffle(terms);
    TermsEnum termsEnum = MultiFields.GetTerms(r, "field").Iterator(null);

    bool sawFailure = false;
    int probeCount = 10 * terms.Count;

    for (int probe = 0; probe < probeCount; probe++)
    {
        BytesRef term = terms[Random().Next(terms.Count)];
        Console.WriteLine("TEST: search " + term);

        long t0 = Environment.TickCount;
        int count = searcher.Search(new TermQuery(new Term("field", term)), 1).TotalHits;
        if (count <= 0)
        {
            Console.WriteLine(" FAILED: count=" + count);
            sawFailure = true;
        }
        long t1 = Environment.TickCount;
        Console.WriteLine(" took " + (t1 - t0) + " millis");

        TermsEnum.SeekStatus result = termsEnum.SeekCeil(term);
        if (result == TermsEnum.SeekStatus.FOUND)
        {
            continue;
        }

        if (result == TermsEnum.SeekStatus.END)
        {
            Console.WriteLine(" FAILED: got END");
        }
        else
        {
            Console.WriteLine(" FAILED: wrong term: got " + termsEnum.Term());
        }
        sawFailure = true;
    }

    Assert.IsFalse(sawFailure);
}
/// <summary>
/// Shuffles <paramref name="terms"/> in place, then picks terms at random (ten picks per
/// term in the list) and checks each one twice: a TermQuery search must report at least
/// one hit, and a ceiling seek on the "field" terms enum must report FOUND. Problems are
/// logged as they occur and the test fails at the end if any were seen.
/// </summary>
private void TestSavedTerms(IndexReader r, IList<BytesRef> terms)
{
    Console.WriteLine("TEST: run " + terms.Count + " terms on reader=" + r);

    IndexSearcher searcher = NewSearcher(r);
    terms.Shuffle(Random);
    TermsEnum termsEnum = MultiFields.GetTerms(r, "field").GetEnumerator();

    bool sawFailure = false;
    int probeCount = 10 * terms.Count;

    for (int probe = 0; probe < probeCount; probe++)
    {
        BytesRef term = terms[Random.Next(terms.Count)];
        Console.WriteLine("TEST: search " + term);

        // LUCENENET: Use NanoTime() rather than CurrentTimeMilliseconds() for more accurate/reliable results
        long t0 = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond;
        int count = searcher.Search(new TermQuery(new Term("field", term)), 1).TotalHits;
        if (count <= 0)
        {
            Console.WriteLine(" FAILED: count=" + count);
            sawFailure = true;
        }
        long t1 = J2N.Time.NanoTime() / J2N.Time.MillisecondsPerNanosecond;
        Console.WriteLine(" took " + (t1 - t0) + " millis");

        TermsEnum.SeekStatus result = termsEnum.SeekCeil(term);
        if (result == TermsEnum.SeekStatus.FOUND)
        {
            continue;
        }

        if (result == TermsEnum.SeekStatus.END)
        {
            Console.WriteLine(" FAILED: got END");
        }
        else
        {
            Console.WriteLine(" FAILED: wrong term: got " + termsEnum.Term);
        }
        sawFailure = true;
    }

    Assert.IsFalse(sawFailure);
}
/// <summary>
/// Opens a SegmentReader with the given index divisor, seeks the TEXT_FIELD_2_KEY terms
/// enum to "field" and, if the term has a document, asserts it is doc 0 with frequency 3.
/// </summary>
/// <param name="indexDivisor">Divisor passed to the reader and checked against its TermInfosIndexDivisor.</param>
public virtual void TestTermDocs(int indexDivisor)
{
    // After adding the document, we should be able to read it back in.
    SegmentReader reader = new SegmentReader(Info, indexDivisor, NewIOContext(Random()));
    Assert.IsTrue(reader != null);
    Assert.AreEqual(indexDivisor, reader.TermInfosIndexDivisor);

    TermsEnum terms = reader.Fields.Terms(DocHelper.TEXT_FIELD_2_KEY).Iterator(null);
    terms.SeekCeil(new BytesRef("field"));

    DocsEnum termDocs = TestUtil.Docs(Random(), terms, reader.LiveDocs, null, DocsEnum.FLAG_FREQS);
    bool hasDoc = termDocs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS;
    if (hasDoc)
    {
        Assert.IsTrue(termDocs.DocID() == 0);
        Assert.IsTrue(termDocs.Freq() == 3);
    }

    reader.Dispose();
}
/// <summary>
/// Indexes the same four-field document 177 times, then checks (once on the freshly
/// written index and once after ForceMerge(1)) that a ceiling seek to "abc" in "field3",
/// whose only indexed text is "aaa", reports END.
/// </summary>
public virtual void TestNonFlex()
{
    Directory d = NewDirectory();

    const int docCount = 177;

    IndexWriter w = new IndexWriter(d, (new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random))).SetMaxBufferedDocs(7).SetMergePolicy(NewLogMergePolicy()));

    for (int pass = 0; pass < 2; pass++)
    {
        bool firstPass = pass == 0;
        if (!firstPass)
        {
            w.ForceMerge(1);
        }
        else
        {
            Documents.Document doc = new Documents.Document();
            doc.Add(NewTextField("field1", "this is field1", Field.Store.NO));
            doc.Add(NewTextField("field2", "this is field2", Field.Store.NO));
            doc.Add(NewTextField("field3", "aaa", Field.Store.NO));
            doc.Add(NewTextField("field4", "bbb", Field.Store.NO));

            int added = 0;
            while (added < docCount)
            {
                w.AddDocument(doc);
                added++;
            }
        }

        IndexReader r = w.GetReader();
        TermsEnum terms = MultiFields.GetTerms(r, "field3").GetEnumerator();
        Assert.AreEqual(TermsEnum.SeekStatus.END, terms.SeekCeil(new BytesRef("abc")));
        r.Dispose();
    }

    w.Dispose();
    d.Dispose();
}
/// <summary>
/// Randomized seek test against field FIELD of <paramref name="r"/>. Each iteration picks
/// a missing term, a term with a previously captured TermState, or a valid term, seeks to
/// it (via TermState, SeekExact or SeekCeil), verifies the outcome against a binary search
/// over the sorted valid terms, and then steps forward a random number of times,
/// occasionally capturing a TermState for later iterations.
/// </summary>
/// <param name="r">Reader to seek in.</param>
/// <param name="validTermStrings">All terms expected to exist in the field.</param>
private void TestRandomSeeks(IndexReader r, params string[] validTermStrings)
{
    BytesRef[] sortedTerms = new BytesRef[validTermStrings.Length];
    for (int i = 0; i < validTermStrings.Length; i++)
    {
        sortedTerms[i] = new BytesRef(validTermStrings[i]);
    }
    Array.Sort(sortedTerms);

    if (VERBOSE)
    {
        Console.WriteLine("TEST: " + sortedTerms.Length + " terms:");
        foreach (BytesRef valid in sortedTerms)
        {
            Console.WriteLine(" " + valid.Utf8ToString() + " " + valid);
        }
    }

    TermsEnum termsEnum = MultiFields.GetTerms(r, FIELD).GetIterator(null);

    // Binary-search result for a target that sorts after every valid term.
    int pastEndLoc = -sortedTerms.Length - 1;

    IList<TermAndState> savedStates = new List<TermAndState>();

    for (int iter = 0; iter < 100 * RANDOM_MULTIPLIER; iter++)
    {
        BytesRef target;
        int loc;
        TermState savedState;

        if (Random.Next(6) == 4)
        {
            // Pick a term that is not in the index.
            target = GetNonExistTerm(sortedTerms);
            savedState = null;
            if (VERBOSE)
            {
                Console.WriteLine("\nTEST: invalid term=" + target.Utf8ToString());
            }
            loc = Array.BinarySearch(sortedTerms, target);
        }
        else if (savedStates.Count != 0 && Random.Next(4) == 1)
        {
            // Reuse a term whose TermState was captured earlier.
            TermAndState captured = savedStates[Random.Next(savedStates.Count)];
            target = captured.Term;
            loc = Array.BinarySearch(sortedTerms, target);
            Assert.IsTrue(loc >= 0);
            savedState = captured.State;
            if (VERBOSE)
            {
                Console.WriteLine("\nTEST: valid termState term=" + target.Utf8ToString());
            }
        }
        else
        {
            // Pick a valid term.
            loc = Random.Next(sortedTerms.Length);
            target = BytesRef.DeepCopyOf(sortedTerms[loc]);
            savedState = null;
            if (VERBOSE)
            {
                Console.WriteLine("\nTEST: valid term=" + target.Utf8ToString());
            }
        }

        // Drawn unconditionally, even when the TermState path is taken.
        bool doSeekExact = Random.NextBoolean();

        if (savedState != null)
        {
            if (VERBOSE)
            {
                Console.WriteLine(" seekExact termState");
            }
            termsEnum.SeekExact(target, savedState);
        }
        else if (doSeekExact)
        {
            if (VERBOSE)
            {
                Console.WriteLine(" seekExact");
            }
            Assert.AreEqual(loc >= 0, termsEnum.SeekExact(target));
        }
        else
        {
            if (VERBOSE)
            {
                Console.WriteLine(" seekCeil");
            }

            TermsEnum.SeekStatus status = termsEnum.SeekCeil(target);
            if (VERBOSE)
            {
                Console.WriteLine(" got " + status);
            }

            if (loc >= 0)
            {
                Assert.AreEqual(TermsEnum.SeekStatus.FOUND, status);
            }
            else if (loc == pastEndLoc)
            {
                Assert.AreEqual(TermsEnum.SeekStatus.END, status);
            }
            else
            {
                Debug.Assert(loc >= -sortedTerms.Length);
                Assert.AreEqual(TermsEnum.SeekStatus.NOT_FOUND, status);
            }
        }

        if (loc < 0)
        {
            // A failed SeekExact leaves the enum unpositioned, and a seek past the end
            // has no current term either; nothing more to verify in both cases.
            if (doSeekExact || loc == pastEndLoc)
            {
                continue;
            }

            // Convert the insertion point into the index of the ceiling term.
            loc = -loc - 1;
            Assert.AreEqual(sortedTerms[loc], termsEnum.Term);
        }
        else
        {
            Assert.AreEqual(target, termsEnum.Term);
        }

        // Step forward a random number of times after the seek.
        int stepCount = Random.Next(sortedTerms.Length);
        for (int step = 0; step < stepCount; step++)
        {
            if (VERBOSE)
            {
                Console.WriteLine("\nTEST: next loc=" + loc + " of " + sortedTerms.Length);
            }

            BytesRef stepped = termsEnum.Next();
            loc++;

            if (loc == sortedTerms.Length)
            {
                Assert.IsNull(stepped);
                break;
            }

            Assert.AreEqual(sortedTerms[loc], stepped);
            if (Random.Next(40) == 17 && savedStates.Count < 100)
            {
                savedStates.Add(new TermAndState(sortedTerms[loc], termsEnum.GetTermState()));
            }
        }
    }
}
/// <summary>
/// Indexes documents from LineFileDocs, collects every term of the "body" field, and then
/// randomly interleaves Next(), SeekCeil() and SeekExact() on a single terms enum. After
/// each operation the enum's position is compared with the position predicted by a binary
/// search over the collected term list.
/// </summary>
public virtual void Test()
{
    Random random = new Random(Random.Next());
    LineFileDocs docs = new LineFileDocs(random, DefaultCodecSupportsDocValues);
    Directory d = NewDirectory();
    MockAnalyzer analyzer = new MockAnalyzer(LuceneTestCase.Random);
    analyzer.MaxTokenLength = TestUtil.NextInt32(LuceneTestCase.Random, 1, IndexWriter.MAX_TERM_LENGTH);

    RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
        this,
#endif
        LuceneTestCase.Random, d, analyzer);

    int numDocs = AtLeast(10);
    for (int added = 0; added < numDocs; added++)
    {
        w.AddDocument(docs.NextDoc());
    }

    IndexReader r = w.GetReader();
    w.Dispose();

    // Snapshot all terms; deep copies are taken of every term the enum returns.
    List<BytesRef> terms = new List<BytesRef>();
    TermsEnum termsEnum = MultiFields.GetTerms(r, "body").GetIterator(null);
    for (BytesRef current = termsEnum.Next(); current != null; current = termsEnum.Next())
    {
        terms.Add(BytesRef.DeepCopyOf(current));
    }

    if (VERBOSE)
    {
        Console.WriteLine("TEST: " + terms.Count + " terms");
    }

    // Index into 'terms' of the enum's current position, or -1 when unpositioned.
    int upto = -1;
    int iters = AtLeast(200);

    for (int iter = 0; iter < iters; iter++)
    {
        if (upto != -1 && LuceneTestCase.Random.NextBoolean())
        {
            // Step forward.
            if (VERBOSE)
            {
                Console.WriteLine("TEST: iter next");
            }

            bool isEnd = termsEnum.Next() == null;
            upto++;

            if (isEnd)
            {
                if (VERBOSE)
                {
                    Console.WriteLine(" end");
                }
                Assert.AreEqual(upto, terms.Count);
                upto = -1;
            }
            else
            {
                if (VERBOSE)
                {
                    Console.WriteLine(" got term=" + termsEnum.Term.Utf8ToString() + " expected=" + terms[upto].Utf8ToString());
                }
                Assert.IsTrue(upto < terms.Count);
                Assert.AreEqual(terms[upto], termsEnum.Term);
            }

            continue;
        }

        BytesRef target;
        string exists;

        if (LuceneTestCase.Random.NextBoolean())
        {
            // Likely a term that is not in the index.
            if (LuceneTestCase.Random.NextBoolean())
            {
                target = new BytesRef(TestUtil.RandomSimpleString(LuceneTestCase.Random));
            }
            else
            {
                target = new BytesRef(TestUtil.RandomRealisticUnicodeString(LuceneTestCase.Random));
            }
            exists = "likely not";
        }
        else
        {
            // A term known to be in the index.
            target = terms[LuceneTestCase.Random.Next(terms.Count)];
            exists = "yes";
        }

        upto = terms.BinarySearch(target);

        if (LuceneTestCase.Random.NextBoolean())
        {
            if (VERBOSE)
            {
                Console.WriteLine("TEST: iter seekCeil target=" + target.Utf8ToString() + " exists=" + exists);
            }

            TermsEnum.SeekStatus status = termsEnum.SeekCeil(target);
            if (VERBOSE)
            {
                Console.WriteLine(" got " + status);
            }

            if (upto >= 0)
            {
                Assert.AreEqual(TermsEnum.SeekStatus.FOUND, status);
                Assert.AreEqual(terms[upto], termsEnum.Term);
            }
            else
            {
                // Turn the negative search result into the insertion point.
                upto = -(upto + 1);
                if (upto >= terms.Count)
                {
                    Assert.AreEqual(TermsEnum.SeekStatus.END, status);
                    upto = -1;
                }
                else
                {
                    Assert.AreEqual(TermsEnum.SeekStatus.NOT_FOUND, status);
                    Assert.AreEqual(terms[upto], termsEnum.Term);
                }
            }
        }
        else
        {
            if (VERBOSE)
            {
                Console.WriteLine("TEST: iter seekExact target=" + target.Utf8ToString() + " exists=" + exists);
            }

            bool result = termsEnum.SeekExact(target);
            if (VERBOSE)
            {
                Console.WriteLine(" got " + result);
            }

            if (upto >= 0)
            {
                Assert.IsTrue(result);
                Assert.AreEqual(target, termsEnum.Term);
            }
            else
            {
                Assert.IsFalse(result);
                upto = -1;
            }
        }
    }

    r.Dispose();
    d.Dispose();
    docs.Dispose();
}
/// <summary>
/// Positions this enum on the smallest term that is greater than or equal to
/// <paramref name="target"/>. The indexed-terms array is binary searched to find the
/// block to start from, the wrapped enum is moved there when necessary, and the enum is
/// then stepped forward with Next() until the target is reached or passed.
/// </summary>
/// <returns>FOUND on an exact match, NOT_FOUND when positioned on a later term, END when no such term exists.</returns>
public override SeekStatus SeekCeil(BytesRef target)
{
    // Already positioned on the target.
    if (term != null && term.Equals(target))
    {
        return SeekStatus.FOUND;
    }

    int blockIdx = Array.BinarySearch(outerInstance.m_indexedTermsArray, target);

    if (blockIdx >= 0)
    {
        // The target is itself one of the indexed terms.
        TermsEnum.SeekStatus exactStatus = termsEnum.SeekCeil(target);
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(exactStatus == TermsEnum.SeekStatus.FOUND);
        }
        ord = blockIdx << outerInstance.indexIntervalBits;
        SetTerm();
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(term != null);
        }
        return SeekStatus.FOUND;
    }

    // Not an indexed term: convert the search result into the insertion point.
    blockIdx = -blockIdx - 1;

    if (blockIdx == 0)
    {
        // The target sorts before the first indexed term.
        TermsEnum.SeekStatus firstStatus = termsEnum.SeekCeil(target);
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(firstStatus == TermsEnum.SeekStatus.NOT_FOUND);
        }
        ord = 0;
        SetTerm();
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(term != null);
        }
        return SeekStatus.NOT_FOUND;
    }

    // Back up to the start of the block containing the target.
    blockIdx--;

    // No seek is needed when we are already in that block and not past the target.
    bool alreadyInBlock = (ord >> outerInstance.indexIntervalBits) == blockIdx && term != null && term.CompareTo(target) <= 0;
    if (!alreadyInBlock)
    {
        TermsEnum.SeekStatus blockStatus = termsEnum.SeekCeil(outerInstance.m_indexedTermsArray[blockIdx]);
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(blockStatus == TermsEnum.SeekStatus.FOUND);
        }
        ord = blockIdx << outerInstance.indexIntervalBits;
        SetTerm();
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(term != null); // should be non-null since it's in the index
        }
    }

    // Scan forward within the block.
    while (term != null && term.CompareTo(target) < 0)
    {
        Next();
    }

    if (term == null)
    {
        return SeekStatus.END;
    }

    return term.CompareTo(target) == 0 ? SeekStatus.FOUND : SeekStatus.NOT_FOUND;
}
/// <summary>
/// Checks that <paramref name="terms"/> matches the token stream <paramref name="tk"/>
/// under the field type <paramref name="ft"/>: statistics and feature flags first, then,
/// term by term in comparator order, doc frequency, DocsEnum behaviour with and without
/// live-docs bits, and (when positions or offsets are present) every position, offset and
/// payload. Finishes with five random seeks to terms of the stream.
/// </summary>
/// <param name="tk">Token stream that produced the expected terms, frequencies, positions, offsets and payloads.</param>
/// <param name="ft">Field type whose StoreTermVector* flags are compared with the Terms flags.</param>
/// <param name="terms">Terms instance under test.</param>
protected internal virtual void AssertEquals(RandomTokenStream tk, FieldType ft, Terms terms)
{
    Assert.AreEqual(1, terms.DocCount);

    int termCount = (new HashSet<string>(Arrays.AsList(tk.Terms))).Count;
    Assert.AreEqual(termCount, terms.Size());
    Assert.AreEqual(termCount, terms.SumDocFreq);

    Assert.AreEqual(ft.StoreTermVectorPositions, terms.HasPositions());
    Assert.AreEqual(ft.StoreTermVectorOffsets, terms.HasOffsets());
    Assert.AreEqual(ft.StoreTermVectorPayloads && tk.HasPayloads(), terms.HasPayloads());

    HashSet<BytesRef> uniqueTerms = new HashSet<BytesRef>();
    foreach (string text in tk.Freqs.Keys)
    {
        uniqueTerms.Add(new BytesRef(text));
    }

    BytesRef[] sortedTerms = uniqueTerms.ToArray();
    Array.Sort(sortedTerms, terms.Comparator);

    // Randomly hand the cached enum back in as the reuse argument.
    TermsEnum te = terms.Iterator(Random().NextBoolean() ? null : this.termsEnum.Value);
    this.termsEnum.Value = te;

    for (int i = 0; i < sortedTerms.Length; ++i)
    {
        BytesRef nextTerm = te.Next();
        Assert.AreEqual(sortedTerms[i], nextTerm);
        Assert.AreEqual(sortedTerms[i], te.Term());
        Assert.AreEqual(1, te.DocFreq());

        // With a cleared bit set passed as live docs, no document may be returned.
        FixedBitSet bits = new FixedBitSet(1);
        DocsEnum de = te.Docs(bits, Random().NextBoolean() ? null : this.docsEnum.Value);
        Assert.AreEqual(DocsEnum.NO_MORE_DOCS, de.NextDoc());

        // With bit 0 set (or no bits at all), doc 0 must be returned exactly once.
        bits.Set(0);
        de = te.Docs(Random().NextBoolean() ? bits : null, Random().NextBoolean() ? null : de);
        Assert.IsNotNull(de);
        Assert.AreEqual(0, de.NextDoc());
        Assert.AreEqual(0, de.DocID());
        Assert.AreEqual(tk.Freqs[te.Term().Utf8ToString()], (int?)de.Freq());
        Assert.AreEqual(DocsEnum.NO_MORE_DOCS, de.NextDoc());
        this.docsEnum.Value = de;

        // Same two-step check for the positions enum.
        bits.Clear(0);
        DocsAndPositionsEnum dpe = te.DocsAndPositions(bits, Random().NextBoolean() ? null : this.docsAndPositionsEnum.Value);
        Assert.AreEqual(ft.StoreTermVectorOffsets || ft.StoreTermVectorPositions, dpe != null);
        if (dpe != null)
        {
            Assert.AreEqual(DocsEnum.NO_MORE_DOCS, dpe.NextDoc());
        }

        bits.Set(0);
        dpe = te.DocsAndPositions(Random().NextBoolean() ? bits : null, Random().NextBoolean() ? null : dpe);
        Assert.AreEqual(ft.StoreTermVectorOffsets || ft.StoreTermVectorPositions, dpe != null);

        if (terms.HasPositions() || terms.HasOffsets())
        {
            Assert.AreEqual(0, dpe.NextDoc());
            int freq = dpe.Freq();
            Assert.AreEqual(tk.Freqs[te.Term().Utf8ToString()], (int?)freq);

            if (dpe != null)
            {
                for (int k = 0; k < freq; ++k)
                {
                    int position = dpe.NextPosition();

                    // Candidate token indexes, looked up by position when available,
                    // otherwise by start offset.
                    ISet<int?> indexes;
                    if (terms.HasPositions())
                    {
                        indexes = tk.PositionToTerms[position];
                        Assert.IsNotNull(indexes);
                    }
                    else
                    {
                        indexes = tk.StartOffsetToTerms[dpe.StartOffset()];
                        Assert.IsNotNull(indexes);
                    }

                    if (terms.HasPositions())
                    {
                        bool positionMatched = false;
                        foreach (int index in indexes)
                        {
                            if (tk.TermBytes[index].Equals(te.Term()) && tk.Positions[index] == position)
                            {
                                positionMatched = true;
                                break;
                            }
                        }
                        Assert.IsTrue(positionMatched);
                    }

                    if (terms.HasOffsets())
                    {
                        bool offsetMatched = false;
                        foreach (int index in indexes)
                        {
                            if (tk.TermBytes[index].Equals(te.Term()) && tk.StartOffsets[index] == dpe.StartOffset() && tk.EndOffsets[index] == dpe.EndOffset())
                            {
                                offsetMatched = true;
                                break;
                            }
                        }
                        Assert.IsTrue(offsetMatched);
                    }

                    if (terms.HasPayloads())
                    {
                        bool payloadMatched = false;
                        foreach (int index in indexes)
                        {
                            if (tk.TermBytes[index].Equals(te.Term()) && Equals(tk.Payloads[index], dpe.Payload))
                            {
                                payloadMatched = true;
                                break;
                            }
                        }
                        Assert.IsTrue(payloadMatched);
                    }
                }

                // One NextPosition() call beyond freq. NOTE(review): this handler catches
                // every Exception, presumably including the one raised by Assert.Fail(),
                // so the check cannot fail as written — confirm whether that is intended.
                try
                {
                    dpe.NextPosition();
                    Assert.Fail();
                }
                catch (Exception)
                {
                    // ok
                }
            }

            Assert.AreEqual(DocsEnum.NO_MORE_DOCS, dpe.NextDoc());
        }

        this.docsAndPositionsEnum.Value = dpe;
    }

    Assert.IsNull(te.Next());

    // A few random seeks to terms that came from the token stream.
    for (int attempt = 0; attempt < 5; ++attempt)
    {
        if (Random().NextBoolean())
        {
            Assert.IsTrue(te.SeekExact(RandomInts.RandomFrom(Random(), tk.TermBytes)));
        }
        else
        {
            Assert.AreEqual(SeekStatus.FOUND, te.SeekCeil(RandomInts.RandomFrom(Random(), tk.TermBytes)));
        }
    }
}
/// <summary>
/// Positions this enum on the smallest term that is greater than or equal to
/// <paramref name="target"/>. The indexed-terms array is binary searched to find the
/// block to start from, the wrapped enum is moved there when necessary, and the enum is
/// then stepped forward with Next() until the target is reached or passed.
/// </summary>
/// <returns>FOUND on an exact match, NOT_FOUND when positioned on a later term, END when no such term exists.</returns>
public override SeekStatus SeekCeil(BytesRef target)
{
    // already here
    if (Term_Renamed != null && Term_Renamed.Equals(target))
    {
        return SeekStatus.FOUND;
    }

    // Search the indexed-terms array in place. The earlier ToList().BinarySearch(...)
    // form copied the entire array into a new list on every seek before searching it.
    int startIdx = System.Array.BinarySearch(OuterInstance.IndexedTermsArray, target);

    if (startIdx >= 0)
    {
        // we hit the term exactly... lucky us!
        TermsEnum.SeekStatus seekStatus = TermsEnum.SeekCeil(target);
        Debug.Assert(seekStatus == TermsEnum.SeekStatus.FOUND);
        Ord_Renamed = startIdx << OuterInstance.IndexIntervalBits;
        SetTerm();
        Debug.Assert(Term_Renamed != null);
        return SeekStatus.FOUND;
    }

    // we didn't hit the term exactly
    startIdx = -startIdx - 1;

    if (startIdx == 0)
    {
        // our target occurs *before* the first term
        TermsEnum.SeekStatus seekStatus = TermsEnum.SeekCeil(target);
        Debug.Assert(seekStatus == TermsEnum.SeekStatus.NOT_FOUND);
        Ord_Renamed = 0;
        SetTerm();
        Debug.Assert(Term_Renamed != null);
        return SeekStatus.NOT_FOUND;
    }

    // back up to the start of the block
    startIdx--;

    if ((Ord_Renamed >> OuterInstance.IndexIntervalBits) == startIdx && Term_Renamed != null && Term_Renamed.CompareTo(target) <= 0)
    {
        // we are already in the right block and the current term is before the term we want,
        // so we don't need to seek.
    }
    else
    {
        // seek to the right block
        TermsEnum.SeekStatus seekStatus = TermsEnum.SeekCeil(OuterInstance.IndexedTermsArray[startIdx]);
        Debug.Assert(seekStatus == TermsEnum.SeekStatus.FOUND);
        Ord_Renamed = startIdx << OuterInstance.IndexIntervalBits;
        SetTerm();
        Debug.Assert(Term_Renamed != null); // should be non-null since it's in the index
    }

    // Scan forward within the block until the target is reached or passed.
    while (Term_Renamed != null && Term_Renamed.CompareTo(target) < 0)
    {
        Next();
    }

    if (Term_Renamed == null)
    {
        return SeekStatus.END;
    }
    else if (Term_Renamed.CompareTo(target) == 0)
    {
        return SeekStatus.FOUND;
    }
    else
    {
        return SeekStatus.NOT_FOUND;
    }
}
/// <summary>
/// Forwards the ceiling seek for <paramref name="text"/> to the wrapped enum
/// (<c>m_input</c>) and returns its status unchanged.
/// </summary>
public override SeekStatus SeekCeil(BytesRef text) => m_input.SeekCeil(text);
/// <summary>
/// Builds a DocTermOrds over "field" of <paramref name="r"/> (restricted to
/// <paramref name="prefixRef"/>) and checks, document by document, that the ords it
/// reports resolve to the terms recorded in <paramref name="idToOrds"/>.
/// </summary>
/// <param name="r">Reader to verify.</param>
/// <param name="idToOrds">For each value of the "id" field, indexes into <paramref name="termsArray"/> of the expected terms, in order.</param>
/// <param name="termsArray">Terms that the entries of <paramref name="idToOrds"/> refer to.</param>
/// <param name="prefixRef">Prefix passed to DocTermOrds; may be null.</param>
private void Verify(AtomicReader r, int[][] idToOrds, BytesRef[] termsArray, BytesRef prefixRef)
{
    DocTermOrds dto = new DocTermOrds(r, r.LiveDocs, "field", prefixRef, int.MaxValue, TestUtil.NextInt32(Random, 2, 10));

    FieldCache.Int32s docIDToID = FieldCache.DEFAULT.GetInt32s(r, "id", false);

    if (VERBOSE)
    {
        Console.WriteLine("TEST: verify prefix=" + (prefixRef == null ? "null" : prefixRef.Utf8ToString()));
        Console.WriteLine("TEST: all TERMS:");
        TermsEnum allTE = MultiFields.GetTerms(r, "field").GetIterator(null);
        int printedOrd = 0;
        while (allTE.Next() != null)
        {
            Console.WriteLine(" ord=" + (printedOrd++) + " term=" + allTE.Term.Utf8ToString());
        }
    }

    TermsEnum te = dto.GetOrdTermsEnum(r);

    if (dto.NumTerms == 0)
    {
        if (prefixRef == null)
        {
            // Without a prefix, zero terms means the field must not exist at all.
            Assert.IsNull(MultiFields.GetTerms(r, "field"));
            return;
        }

        Terms terms = MultiFields.GetTerms(r, "field");
        if (terms == null)
        {
            return;
        }

        // With a prefix, the first term at or after the prefix must not start with it.
        TermsEnum termsEnum = terms.GetIterator(null);
        TermsEnum.SeekStatus result = termsEnum.SeekCeil(prefixRef);
        if (result != TermsEnum.SeekStatus.END)
        {
            Assert.IsFalse(StringHelper.StartsWith(termsEnum.Term, prefixRef), "term=" + termsEnum.Term.Utf8ToString() + " matches prefix=" + prefixRef.Utf8ToString());
        }
        return;
    }

    if (VERBOSE)
    {
        Console.WriteLine("TEST: TERMS:");
        te.SeekExact(0);
        do
        {
            Console.WriteLine(" ord=" + te.Ord + " term=" + te.Term.Utf8ToString());
        }
        while (te.Next() != null);
    }

    SortedSetDocValues iter = dto.GetIterator(r);
    for (int docID = 0; docID < r.MaxDoc; docID++)
    {
        if (VERBOSE)
        {
            Console.WriteLine("TEST: docID=" + docID + " of " + r.MaxDoc + " (id=" + docIDToID.Get(docID) + ")");
        }

        iter.SetDocument(docID);
        int[] answers = idToOrds[docIDToID.Get(docID)];
        int upto = 0;

        for (long ord = iter.NextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = iter.NextOrd())
        {
            te.SeekExact(ord);
            BytesRef expected = termsArray[answers[upto++]];
            if (VERBOSE)
            {
                Console.WriteLine(" exp=" + expected.Utf8ToString() + " actual=" + te.Term.Utf8ToString());
            }
            Assert.AreEqual(expected, te.Term, "expected=" + expected.Utf8ToString() + " actual=" + te.Term.Utf8ToString() + " ord=" + ord);
        }

        // Every expected ord for this document must have been consumed.
        Assert.AreEqual(answers.Length, upto);
    }
}
/// <summary>
/// Indexes three single-term documents ("hello", "world", "beer"), merges them into one
/// segment and exercises the <see cref="TermsEnum"/> obtained from the doc-term-ords view of
/// "field": sequential Next(), SeekCeil, SeekExact by term and SeekExact by ord.
/// </summary>
public virtual void TestSortedTermsEnum()
{
    Directory directory = NewDirectory();
    Analyzer analyzer = new MockAnalyzer(Random);
    IndexWriterConfig iwconfig = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
    iwconfig.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter iwriter = new RandomIndexWriter(Random, directory, iwconfig);

    // One document per value, added in this (unsorted) order.
    foreach (string value in new string[] { "hello", "world", "beer" })
    {
        Document doc = new Document();
        doc.Add(new StringField("field", value, Field.Store.NO));
        iwriter.AddDocument(doc);
    }
    iwriter.ForceMerge(1);

    DirectoryReader ireader = iwriter.GetReader();
    iwriter.Dispose();

    AtomicReader ar = GetOnlySegmentReader(ireader);
    SortedSetDocValues dv = FieldCache.DEFAULT.GetDocTermOrds(ar, "field");
    Assert.AreEqual(3, dv.ValueCount);

    TermsEnum termsEnum = dv.GetTermsEnum();

    // The terms in the order the enum is expected to return them; the array index is the ord.
    string[] sorted = new string[] { "beer", "hello", "world" };

    // next()
    for (int ord = 0; ord < sorted.Length; ord++)
    {
        Assert.AreEqual(sorted[ord], termsEnum.Next().Utf8ToString());
        Assert.AreEqual(ord, termsEnum.Ord);
    }

    // seekCeil()
    Assert.AreEqual(SeekStatus.NOT_FOUND, termsEnum.SeekCeil(new BytesRef("ha!")));
    Assert.AreEqual("hello", termsEnum.Term.Utf8ToString());
    Assert.AreEqual(1, termsEnum.Ord);
    Assert.AreEqual(SeekStatus.FOUND, termsEnum.SeekCeil(new BytesRef("beer")));
    Assert.AreEqual("beer", termsEnum.Term.Utf8ToString());
    Assert.AreEqual(0, termsEnum.Ord);
    Assert.AreEqual(SeekStatus.END, termsEnum.SeekCeil(new BytesRef("zzz")));

    // seekExact()
    for (int ord = 0; ord < sorted.Length; ord++)
    {
        Assert.IsTrue(termsEnum.SeekExact(new BytesRef(sorted[ord])));
        Assert.AreEqual(sorted[ord], termsEnum.Term.Utf8ToString());
        Assert.AreEqual(ord, termsEnum.Ord);
    }
    Assert.IsFalse(termsEnum.SeekExact(new BytesRef("bogus")));

    // seek(ord)
    for (int ord = 0; ord < sorted.Length; ord++)
    {
        termsEnum.SeekExact(ord);
        Assert.AreEqual(sorted[ord], termsEnum.Term.Utf8ToString());
        Assert.AreEqual(ord, termsEnum.Ord);
    }

    ireader.Dispose();
    directory.Dispose();
}
/// <summary>
/// Writes a single field with <c>NUM_TERMS</c> terms (term <c>i</c> has the text of <c>i</c> and
/// occurs only in document <c>i</c>), reads it back through the default codec's postings
/// format, and checks term enumeration, docs-enum reuse and <c>SeekCeil</c> for every term.
/// </summary>
public virtual void TestFixedPostings()
{
    const int NUM_TERMS = 100;
    TermData[] terms = new TermData[NUM_TERMS];
    for (int i = 0; i < NUM_TERMS; i++)
    {
        int[] docs = new int[] { i };
        string text = Convert.ToString(i);
        terms[i] = new TermData(this, text, docs, null);
    }

    FieldInfos.Builder builder = new FieldInfos.Builder();

    FieldData field = new FieldData(this, "field", builder, terms, true, false);
    FieldData[] fields = new FieldData[] { field };
    FieldInfos fieldInfos = builder.Finish();

    // LUCENENET specific - BUG: we must wrap this in a using block in case anything in the below loop throws
    using (Directory dir = NewDirectory())
    {
        this.Write(fieldInfos, dir, fields, true);
        Codec codec = Codec.Default;
        SegmentInfo si = new SegmentInfo(dir, Constants.LUCENE_MAIN_VERSION, SEGMENT, 10000, false, codec, null);

        // LUCENENET specific - BUG: we must wrap this in a using block in case anything in the below loop throws
        using (FieldsProducer reader = codec.PostingsFormat.FieldsProducer(new SegmentReadState(dir, si, fieldInfos, NewIOContext(Random), DirectoryReader.DEFAULT_TERMS_INDEX_DIVISOR)))
        // IEnumerator<T> is IDisposable, so release it deterministically as well.
        using (IEnumerator<string> fieldsEnum = reader.GetEnumerator())
        {
            // Exactly one field was written: the enumerator must yield it...
            Assert.IsTrue(fieldsEnum.MoveNext());
            string fieldName = fieldsEnum.Current;
            Assert.IsNotNull(fieldName);
            Terms terms2 = reader.GetTerms(fieldName);
            Assert.IsNotNull(terms2);

            TermsEnum termsEnum = terms2.GetIterator(null);

            DocsEnum docsEnum = null;
            for (int i = 0; i < NUM_TERMS; i++)
            {
                BytesRef term = termsEnum.Next();
                Assert.IsNotNull(term);
                Assert.AreEqual(terms[i].text2, term.Utf8ToString());

                // do this twice to stress test the codec's reuse, ie,
                // make sure it properly fully resets (rewinds) its
                // internal state:
                for (int iter = 0; iter < 2; iter++)
                {
                    docsEnum = TestUtil.Docs(Random, termsEnum, null, docsEnum, DocsFlags.NONE);
                    Assert.AreEqual(terms[i].docs[0], docsEnum.NextDoc());
                    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, docsEnum.NextDoc());
                }
            }
            Assert.IsNull(termsEnum.Next());

            for (int i = 0; i < NUM_TERMS; i++)
            {
                // Expected value goes first so that a failure message reads correctly.
                Assert.AreEqual(TermsEnum.SeekStatus.FOUND, termsEnum.SeekCeil(new BytesRef(terms[i].text2)));
            }

            // ...and nothing after it.
            Assert.IsFalse(fieldsEnum.MoveNext());
        }
    }
}
/// <summary>
/// Drives two <see cref="TermsEnum"/> instances through the same sequence of operations and
/// asserts that <paramref name="actual"/> mirrors <paramref name="expected"/> after each one:
/// sequential Next(), sequential and random SeekExact by ord, sequential and random
/// SeekExact by term, and sequential and random SeekCeil.
/// </summary>
/// <param name="numOrds">Number of ords to iterate over; ords <c>0..numOrds-1</c> are sought.</param>
/// <param name="expected">Reference enum.</param>
/// <param name="actual">Enum under test.</param>
private void AssertEquals(long numOrds, TermsEnum expected, TermsEnum actual)
{
    // sequential next() through all terms
    for (BytesRef current = expected.Next(); current != null; current = expected.Next())
    {
        Assert.AreEqual(current, actual.Next());
        AssertSameOrdAndTerm(expected, actual);
    }
    Assert.IsNull(actual.Next());

    // sequential seekExact(ord) through all terms
    for (long ord = 0; ord < numOrds; ord++)
    {
        expected.SeekExact(ord);
        actual.SeekExact(ord);
        AssertSameOrdAndTerm(expected, actual);
    }

    // sequential seekExact(BytesRef) through all terms
    for (long ord = 0; ord < numOrds; ord++)
    {
        expected.SeekExact(ord);
        Assert.IsTrue(actual.SeekExact(expected.Term()));
        AssertSameOrdAndTerm(expected, actual);
    }

    // sequential seekCeil(BytesRef) through all terms
    for (long ord = 0; ord < numOrds; ord++)
    {
        expected.SeekExact(ord);
        Assert.AreEqual(SeekStatus.FOUND, actual.SeekCeil(expected.Term()));
        AssertSameOrdAndTerm(expected, actual);
    }

    // random seekExact(ord)
    for (long round = 0; round < numOrds; round++)
    {
        long pick = TestUtil.NextLong(Random(), 0, numOrds - 1);
        expected.SeekExact(pick);
        actual.SeekExact(pick);
        AssertSameOrdAndTerm(expected, actual);
    }

    // random seekExact(BytesRef)
    for (long round = 0; round < numOrds; round++)
    {
        long pick = TestUtil.NextLong(Random(), 0, numOrds - 1);
        expected.SeekExact(pick);
        actual.SeekExact(expected.Term());
        AssertSameOrdAndTerm(expected, actual);
    }

    // random seekCeil(BytesRef)
    for (long round = 0; round < numOrds; round++)
    {
        BytesRef target = new BytesRef(TestUtil.RandomUnicodeString(Random()));
        SeekStatus referenceStatus = expected.SeekCeil(target);
        Assert.AreEqual(referenceStatus, actual.SeekCeil(target));
        if (referenceStatus == SeekStatus.END)
        {
            // Both enums ran off the end; there is no position to compare.
            continue;
        }
        AssertSameOrdAndTerm(expected, actual);
    }
}

/// <summary>
/// Asserts that both enums currently report the same ord and the same term.
/// </summary>
private static void AssertSameOrdAndTerm(TermsEnum expected, TermsEnum actual)
{
    Assert.AreEqual(expected.Ord(), actual.Ord());
    Assert.AreEqual(expected.Term(), actual.Term());
}
/// <summary>
/// Runs <c>NUM_TEST_ITER</c> iterations against a randomly chosen field. Each iteration checks
/// plain term enumeration, seeking by term and by ord (forwards, backwards, and to
/// non-existent terms), and finally walks the postings of random terms with a mix of
/// <c>NextDoc</c> and <c>Advance</c> calls.
/// </summary>
/// <remarks>
/// Seeking by ord is optional: a <see cref="NotSupportedException"/> from
/// <c>SeekExact(long)</c> is treated as "not supported" and that check is skipped.
/// </remarks>
public virtual void _run()
{
    for (int iter = 0; iter < NUM_TEST_ITER; iter++)
    {
        FieldData field = fields[Random.Next(fields.Length)];
        TermsEnum termsEnum = termsDict.GetTerms(field.fieldInfo.Name).GetIterator(null);
#pragma warning disable 612, 618
        if (si.Codec is Lucene3xCodec)
#pragma warning restore 612, 618
        {
            // code below expects unicode sort order
            continue;
        }

        int upto = 0;
        // Test straight enum of the terms:
        while (true)
        {
            BytesRef term = termsEnum.Next();
            if (term == null)
            {
                break;
            }
            BytesRef expected = new BytesRef(field.terms[upto++].text2);
            Assert.IsTrue(expected.BytesEquals(term), "expected=" + expected + " vs actual " + term);
        }
        Assert.AreEqual(field.terms.Length, upto);

        // Test random seek:
        TermData term2 = field.terms[Random.Next(field.terms.Length)];
        TermsEnum.SeekStatus status = termsEnum.SeekCeil(new BytesRef(term2.text2));
        Assert.AreEqual(TermsEnum.SeekStatus.FOUND, status);
        Assert.AreEqual(term2.docs.Length, termsEnum.DocFreq);
        if (field.omitTF)
        {
            this.VerifyDocs(term2.docs, term2.positions, TestUtil.Docs(Random, termsEnum, null, null, DocsFlags.NONE), false);
        }
        else
        {
            this.VerifyDocs(term2.docs, term2.positions, termsEnum.DocsAndPositions(null, null), true);
        }

        // Test random seek by ord:
        int idx = Random.Next(field.terms.Length);
        term2 = field.terms[idx];
        bool success = false;
        try
        {
            termsEnum.SeekExact(idx);
            success = true;
        }
        catch (NotSupportedException)
        {
            // ok -- skip it
        }
        if (success)
        {
            // status still holds the result of the SeekCeil above.
            Assert.AreEqual(TermsEnum.SeekStatus.FOUND, status);
            Assert.IsTrue(termsEnum.Term.BytesEquals(new BytesRef(term2.text2)));
            Assert.AreEqual(term2.docs.Length, termsEnum.DocFreq);
            if (field.omitTF)
            {
                this.VerifyDocs(term2.docs, term2.positions, TestUtil.Docs(Random, termsEnum, null, null, DocsFlags.NONE), false);
            }
            else
            {
                this.VerifyDocs(term2.docs, term2.positions, termsEnum.DocsAndPositions(null, null), true);
            }
        }

        // Test seek to non-existent terms:
        if (Verbose)
        {
            Console.WriteLine("TEST: seek non-exist terms");
        }
        for (int i = 0; i < 100; i++)
        {
            string text2 = TestUtil.RandomUnicodeString(Random) + ".";
            status = termsEnum.SeekCeil(new BytesRef(text2));
            Assert.IsTrue(status == TermsEnum.SeekStatus.NOT_FOUND || status == TermsEnum.SeekStatus.END);
        }

        // Seek to each term, backwards:
        if (Verbose)
        {
            Console.WriteLine("TEST: seek terms backwards");
        }
        for (int i = field.terms.Length - 1; i >= 0; i--)
        {
            Assert.AreEqual(TermsEnum.SeekStatus.FOUND, termsEnum.SeekCeil(new BytesRef(field.terms[i].text2)), Thread.CurrentThread.Name + ": field=" + field.fieldInfo.Name + " term=" + field.terms[i].text2);
            Assert.AreEqual(field.terms[i].docs.Length, termsEnum.DocFreq);
        }

        // Seek to each term by ord, backwards
        for (int i = field.terms.Length - 1; i >= 0; i--)
        {
            try
            {
                termsEnum.SeekExact(i);
                Assert.AreEqual(field.terms[i].docs.Length, termsEnum.DocFreq);
                Assert.IsTrue(termsEnum.Term.BytesEquals(new BytesRef(field.terms[i].text2)));
            }
            catch (NotSupportedException)
            {
                // seek by ord is not supported by this enum -- nothing to check
            }
        }

        // Seek to non-existent empty-string term
        status = termsEnum.SeekCeil(new BytesRef(""));
        // NOTE(review): status is an enum value, so this check looks like it can never fail;
        // the stricter comparison against NOT_FOUND was already disabled in the original.
        Assert.IsNotNull(status);

        // Make sure we're now pointing to first term
        Assert.IsTrue(termsEnum.Term.BytesEquals(new BytesRef(field.terms[0].text2)));

        // Test docs enum
        termsEnum.SeekCeil(new BytesRef(""));
        upto = 0;
        do
        {
            term2 = field.terms[upto];
            // Only about one term in three gets its postings walked.
            if (Random.Next(3) == 1)
            {
                DocsEnum docs;
                DocsAndPositionsEnum postings;
                if (!field.omitTF)
                {
                    postings = termsEnum.DocsAndPositions(null, null);
                    if (postings != null)
                    {
                        docs = postings;
                    }
                    else
                    {
                        docs = TestUtil.Docs(Random, termsEnum, null, null, DocsFlags.FREQS);
                    }
                }
                else
                {
                    postings = null;
                    docs = TestUtil.Docs(Random, termsEnum, null, null, DocsFlags.NONE);
                }
                Assert.IsNotNull(docs);

                // upto2 is the index into term2.docs of the document docs is positioned on.
                int upto2 = -1;
                bool ended = false;
                while (upto2 < term2.docs.Length - 1)
                {
                    // Maybe skip:
                    int left = term2.docs.Length - upto2;
                    int doc;
                    if (Random.Next(3) == 1 && left >= 1)
                    {
                        int inc = 1 + Random.Next(left - 1);
                        upto2 += inc;
                        if (Random.Next(2) == 1)
                        {
                            // Advance exactly onto an existing doc.
                            doc = docs.Advance(term2.docs[upto2]);
                            Assert.AreEqual(term2.docs[upto2], doc);
                        }
                        else
                        {
                            // Advance to just past an existing doc.
                            doc = docs.Advance(1 + term2.docs[upto2]);
                            if (doc == DocIdSetIterator.NO_MORE_DOCS)
                            {
                                // skipped past last doc
                                if (Debugging.AssertsEnabled)
                                {
                                    Debugging.Assert(upto2 == term2.docs.Length - 1);
                                }
                                ended = true;
                                break;
                            }
                            else
                            {
                                // skipped to next doc
                                if (Debugging.AssertsEnabled)
                                {
                                    Debugging.Assert(upto2 < term2.docs.Length - 1);
                                }
                                if (doc >= term2.docs[1 + upto2])
                                {
                                    upto2++;
                                }
                            }
                        }
                    }
                    else
                    {
                        doc = docs.NextDoc();
                        Assert.IsTrue(doc != -1);
                        upto2++;
                    }
                    Assert.AreEqual(term2.docs[upto2], doc);
                    if (!field.omitTF)
                    {
                        Assert.AreEqual(term2.positions[upto2].Length, postings.Freq);
                        if (Random.Next(2) == 1)
                        {
                            this.VerifyPositions(term2.positions[upto2], postings);
                        }
                    }
                }

                if (!ended)
                {
                    Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, docs.NextDoc());
                }
            }
            upto++;
        }
        while (termsEnum.Next() != null);

        Assert.AreEqual(field.terms.Length, upto);
    }
}
/// <summary>
/// Un-inverts <c>m_field</c>: walks the field's terms (optionally only those starting with
/// <paramref name="termPrefix"/>) and records, per document, the delta-encoded list of term
/// numbers that occur in it.
/// <para/>
/// Call this only once (if you subclass!)
/// </summary>
/// <param name="reader">Reader supplying the field infos, terms and <c>MaxDoc</c>.</param>
/// <param name="liveDocs">Passed to the docs enum so that the per-term document count takes
/// deletions into account.</param>
/// <param name="termPrefix">If non-<c>null</c>, enumeration starts at this prefix and stops at
/// the first term that does not start with it; a deep copy is stored in <c>m_prefix</c>.</param>
/// <exception cref="InvalidOperationException">The field has doc values, or a per-pass byte
/// array grew beyond what the 24-bit array index can address.</exception>
protected virtual void Uninvert(AtomicReader reader, IBits liveDocs, BytesRef termPrefix)
{
    FieldInfo info = reader.FieldInfos.FieldInfo(m_field);
    if (info != null && info.HasDocValues)
    {
        throw new InvalidOperationException("Type mismatch: " + m_field + " was indexed as " + info.DocValuesType);
    }

    long startTime = Environment.TickCount;
    m_prefix = termPrefix == null ? null : BytesRef.DeepCopyOf(termPrefix);

    int maxDoc = reader.MaxDoc;

    Fields fields = reader.Fields;
    if (fields == null)
    {
        // No terms
        return;
    }
    Terms terms = fields.GetTerms(m_field);
    if (terms == null)
    {
        // No terms
        return;
    }

    TermsEnum te = terms.GetIterator(null);
    BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
    if (te.SeekCeil(seekStart) == TermsEnum.SeekStatus.END)
    {
        // No terms match
        return;
    }

    // The per-document working arrays are only allocated once we know there is at least
    // one term to visit, so the early returns above cost nothing proportional to maxDoc.
    int[] index = new int[maxDoc]; // immediate term numbers, or the index into the byte[] representing the last number
    int[] lastTerm = new int[maxDoc]; // last term we saw for this document
    var bytes = new sbyte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)

    // If we need our "term index wrapper", these will be
    // init'd below:
    IList<BytesRef> indexedTerms = null;
    PagedBytes indexedTermsBytes = null;

    bool testedOrd = false;

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    var tempArr = new sbyte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in it's byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs

    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values.  this requires going over the field first to find the most
    // frequent terms ahead of time.

    int termNum = 0;
    m_docsEnum = null;

    // Loop begins with te positioned to first term (we call
    // seek above):
    for (; ; )
    {
        BytesRef t = te.Term;
        if (t == null || (termPrefix != null && !StringHelper.StartsWith(t, termPrefix)))
        {
            break;
        }

        if (!testedOrd)
        {
            try
            {
                m_ordBase = (int)te.Ord;
            }
            catch (NotSupportedException)
            {
                // Reader cannot provide ord support, so we wrap
                // our own support by creating our own terms index:
                indexedTerms = new List<BytesRef>();
                indexedTermsBytes = new PagedBytes(15);
            }
            testedOrd = true;
        }

        VisitTerm(te, termNum);

        if (indexedTerms != null && (termNum & indexIntervalMask) == 0)
        {
            // Index this term
            m_sizeOfIndexedStrings += t.Length;
            BytesRef indexedTerm = new BytesRef();
            indexedTermsBytes.Copy(t, indexedTerm);
            // TODO: really should 1) strip off useless suffix,
            // and 2) use FST not array/PagedBytes
            indexedTerms.Add(indexedTerm);
        }

        int df = te.DocFreq;
        if (df <= m_maxTermDocFreq)
        {
            m_docsEnum = te.Docs(liveDocs, m_docsEnum, DocsFlags.NONE);

            // dF, but takes deletions into account
            int actualDF = 0;

            for (; ; )
            {
                int doc = m_docsEnum.NextDoc();
                if (doc == DocIdSetIterator.NO_MORE_DOCS)
                {
                    break;
                }
                actualDF++;
                m_termInstances++;

                // add TNUM_OFFSET to the term number to make room for special reserved values:
                // 0 (end term) and 1 (index into byte array follows)
                int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
                lastTerm[doc] = termNum;
                int val = index[doc];

                if ((val & 0xff) == 1)
                {
                    // index into byte array (actually the end of
                    // the doc-specific byte[] when building)
                    int pos = (int)((uint)val >> 8);
                    int ilen = VInt32Size(delta);
                    var arr = bytes[doc];
                    int newend = pos + ilen;
                    if (newend > arr.Length)
                    {
                        // We avoid a doubling strategy to lower memory usage.
                        // this faceting method isn't for docs with many terms.
                        // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
                        // TODO: figure out what array lengths we can round up to w/o actually using more memory
                        // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
                        // It should be safe to round up to the nearest 32 bits in any case.
                        int newLen = (newend + 3) & unchecked((int)0xfffffffc); // 4 byte alignment
                        var newarr = new sbyte[newLen];
                        Array.Copy(arr, 0, newarr, 0, pos);
                        arr = newarr;
                        bytes[doc] = newarr;
                    }
                    pos = WriteInt32(delta, arr, pos);
                    index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
                }
                else
                {
                    // OK, this int has data in it... find the end (a zero starting byte - not
                    // part of another number, hence not following a byte with the high bit set).
                    int ipos;
                    if (val == 0)
                    {
                        ipos = 0;
                    }
                    else if ((val & 0x0000ff80) == 0)
                    {
                        ipos = 1;
                    }
                    else if ((val & 0x00ff8000) == 0)
                    {
                        ipos = 2;
                    }
                    else if ((val & 0xff800000) == 0)
                    {
                        ipos = 3;
                    }
                    else
                    {
                        ipos = 4;
                    }

                    int endPos = WriteInt32(delta, tempArr, ipos);
                    if (endPos <= 4)
                    {
                        // value will fit in the integer... move bytes back
                        for (int j = ipos; j < endPos; j++)
                        {
                            val |= (tempArr[j] & 0xff) << (j << 3);
                        }
                        index[doc] = val;
                    }
                    else
                    {
                        // value won't fit... move integer into byte[]
                        for (int j = 0; j < ipos; j++)
                        {
                            tempArr[j] = (sbyte)val;
                            val = (int)((uint)val >> 8);
                        }
                        // point at the end index in the byte[]
                        index[doc] = (endPos << 8) | 1;
                        // tempArr becomes this document's buffer; allocate a fresh scratch array.
                        bytes[doc] = tempArr;
                        tempArr = new sbyte[12];
                    }
                }
            }
            SetActualDocFreq(termNum, actualDF);
        }

        termNum++;
        if (te.Next() == null)
        {
            break;
        }
    }

    m_numTermsInField = termNum;

    long midPoint = Environment.TickCount;

    if (m_termInstances == 0)
    {
        // we didn't invert anything
        // lower memory consumption.
        m_tnums = null;
    }
    else
    {
        this.m_index = index;

        //
        // transform intermediate form into the final form, building a single byte[]
        // at a time, and releasing the intermediate byte[]s as we go to avoid
        // increasing the memory footprint.
        //
        for (int pass = 0; pass < 256; pass++)
        {
            var target = m_tnums[pass];
            var pos = 0; // end in target;
            if (target != null)
            {
                pos = target.Length;
            }
            else
            {
                target = new sbyte[4096];
            }

            // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
            // where pp is the pass (which array we are building), and xx is all values.
            // each pass shares the same byte[] for termNumber lists.
            for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24))
            {
                int lim = Math.Min(docbase + (1 << 16), maxDoc);
                for (int doc = docbase; doc < lim; doc++)
                {
                    int val = index[doc];
                    if ((val & 0xff) == 1)
                    {
                        int len = (int)((uint)val >> 8);
                        index[doc] = (pos << 8) | 1; // change index to point to start of array
                        if ((pos & 0xff000000) != 0)
                        {
                            // we only have 24 bits for the array index
                            throw new InvalidOperationException("Too many values for UnInvertedField faceting on field " + m_field);
                        }
                        var arr = bytes[doc];
                        bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
                        if (target.Length <= pos + len)
                        {
                            int newlen = target.Length;
                            // we don't have to worry about the array getting too large
                            // since the "pos" param will overflow first (only 24 bits available)
                            while (newlen <= pos + len) // doubling strategy
                            {
                                newlen <<= 1;
                            }
                            var newtarget = new sbyte[newlen];
                            Array.Copy(target, 0, newtarget, 0, pos);
                            target = newtarget;
                        }
                        Array.Copy(arr, 0, target, pos, len);
                        pos += len + 1; // skip single byte at end and leave it 0 for terminator
                    }
                }
            }

            // shrink array
            if (pos < target.Length)
            {
                var newtarget = new sbyte[pos];
                Array.Copy(target, 0, newtarget, 0, pos);
                target = newtarget;
            }

            m_tnums[pass] = target;

            if ((pass << 16) > maxDoc)
            {
                break;
            }
        }
    }

    if (indexedTerms != null)
    {
        m_indexedTermsArray = new BytesRef[indexedTerms.Count];
        indexedTerms.CopyTo(m_indexedTermsArray, 0);
    }

    long endTime = Environment.TickCount;

    m_total_time = (int)(endTime - startTime);
    m_phase1_time = (int)(midPoint - startTime);
}
/// <summary>
/// Checks the term vector <paramref name="terms"/> of a single document against the token
/// stream <paramref name="tk"/> that produced it: statistics and flags, the sorted term
/// sequence, each term's docs enum, and - where positions or offsets are stored - every
/// position, offset and payload. Finishes with a few random seeks to known terms.
/// </summary>
/// <param name="tk">Token stream holding the expected terms, frequencies, positions, offsets and payloads.</param>
/// <param name="ft">Field type whose term-vector options determine what must be present.</param>
/// <param name="terms">Term vector to verify.</param>
protected virtual void AssertEquals(RandomTokenStream tk, FieldType ft, Terms terms)
{
    Assert.AreEqual(1, terms.DocCount);
    int distinctTerms = new JCG.HashSet<string>(tk.terms).Count;
    Assert.AreEqual((long)distinctTerms, terms.Count); // LUCENENET specific - cast required because types don't match (xUnit checks this)
    Assert.AreEqual((long)distinctTerms, terms.SumDocFreq); // LUCENENET specific - cast required because types don't match (xUnit checks this)
    Assert.AreEqual(ft.StoreTermVectorPositions, terms.HasPositions);
    Assert.AreEqual(ft.StoreTermVectorOffsets, terms.HasOffsets);
    Assert.AreEqual(ft.StoreTermVectorPayloads && tk.HasPayloads(), terms.HasPayloads);

    ISet<BytesRef> uniqueTerms = new JCG.HashSet<BytesRef>();
    foreach (string text in tk.freqs.Keys)
    {
        uniqueTerms.Add(new BytesRef(text));
    }
    BytesRef[] sortedTerms = uniqueTerms.ToArray();
    Array.Sort(sortedTerms, terms.Comparer);

    // Randomly either reuse the enum cached from a previous call or ask for a fresh one.
    TermsEnum te;
    if (Random.NextBoolean())
    {
        te = terms.GetEnumerator(null);
    }
    else
    {
        te = terms.GetEnumerator(this.termsEnum.Value);
    }
    this.termsEnum.Value = te;

    for (int i = 0; i < sortedTerms.Length; ++i)
    {
        Assert.IsTrue(te.MoveNext());
        Assert.AreEqual(sortedTerms[i], te.Term);
        Assert.AreEqual(1, te.DocFreq);

        // With bit 0 cleared the single document is filtered out.
        FixedBitSet bits = new FixedBitSet(1);
        DocsEnum de = te.Docs(bits, Random.NextBoolean() ? null : this.docsEnum.Value);
        Assert.AreEqual(DocsEnum.NO_MORE_DOCS, de.NextDoc());

        // With bit 0 set (or no filter at all) the document is visible.
        bits.Set(0);
        de = te.Docs(Random.NextBoolean() ? bits : null, Random.NextBoolean() ? null : de);
        Assert.IsNotNull(de);
        Assert.AreEqual(0, de.NextDoc());
        Assert.AreEqual(0, de.DocID);
        Assert.AreEqual(tk.freqs[te.Term.Utf8ToString()], de.Freq);
        Assert.AreEqual(DocsEnum.NO_MORE_DOCS, de.NextDoc());
        this.docsEnum.Value = de;

        bits.Clear(0);
        DocsAndPositionsEnum dpe = te.DocsAndPositions(bits, Random.NextBoolean() ? null : this.docsAndPositionsEnum.Value);
        Assert.AreEqual(ft.StoreTermVectorOffsets || ft.StoreTermVectorPositions, dpe != null);
        if (dpe != null)
        {
            Assert.AreEqual(DocsEnum.NO_MORE_DOCS, dpe.NextDoc());
        }

        bits.Set(0);
        dpe = te.DocsAndPositions(Random.NextBoolean() ? bits : null, Random.NextBoolean() ? null : dpe);
        Assert.AreEqual(ft.StoreTermVectorOffsets || ft.StoreTermVectorPositions, dpe != null);

        if (terms.HasPositions || terms.HasOffsets)
        {
            // dpe is dereferenced right away, so no further null check is needed below.
            Assert.AreEqual(0, dpe.NextDoc());
            int freq = dpe.Freq;
            Assert.AreEqual(tk.freqs[te.Term.Utf8ToString()], freq);

            for (int k = 0; k < freq; ++k)
            {
                int position = dpe.NextPosition();

                // Candidate token indexes are looked up by position when positions are
                // stored, otherwise by start offset.
                ISet<int> candidates = terms.HasPositions
                    ? tk.positionToTerms[position]
                    : tk.startOffsetToTerms[dpe.StartOffset];
                Assert.IsNotNull(candidates);

                if (terms.HasPositions)
                {
                    bool positionMatched = false;
                    foreach (int candidate in candidates)
                    {
                        if (tk.termBytes[candidate].Equals(te.Term) && tk.positions[candidate] == position)
                        {
                            positionMatched = true;
                            break;
                        }
                    }
                    Assert.IsTrue(positionMatched);
                }

                if (terms.HasOffsets)
                {
                    bool offsetMatched = false;
                    foreach (int candidate in candidates)
                    {
                        if (tk.termBytes[candidate].Equals(te.Term) && tk.startOffsets[candidate] == dpe.StartOffset && tk.endOffsets[candidate] == dpe.EndOffset)
                        {
                            offsetMatched = true;
                            break;
                        }
                    }
                    Assert.IsTrue(offsetMatched);
                }

                if (terms.HasPayloads)
                {
                    bool payloadMatched = false;
                    foreach (int candidate in candidates)
                    {
                        if (tk.termBytes[candidate].Equals(te.Term) && Equals(tk.payloads[candidate], dpe.GetPayload()))
                        {
                            payloadMatched = true;
                            break;
                        }
                    }
                    Assert.IsTrue(payloadMatched);
                }
            }

            // Reading one position past freq is expected to throw.
            // NOTE(review): if Assert.Fail() raises something that IsAssertionError() matches,
            // the second filter swallows it and this check cannot fail - confirm.
            try
            {
                dpe.NextPosition();
                Assert.Fail();
            }
            catch (Exception e) when (e.IsException())
            {
                // ok
            }
            catch (Exception e) when (e.IsAssertionError())
            {
                // ok
            }

            Assert.AreEqual(DocsEnum.NO_MORE_DOCS, dpe.NextDoc());
        }
        this.docsAndPositionsEnum.Value = dpe;
    }
    Assert.IsFalse(te.MoveNext());

    // A handful of random seeks to terms that are known to exist.
    for (int round = 0; round < 5; ++round)
    {
        bool exact = Random.NextBoolean();
        var target = RandomPicks.RandomFrom(Random, tk.termBytes);
        if (exact)
        {
            Assert.IsTrue(te.SeekExact(target));
        }
        else
        {
            Assert.AreEqual(SeekStatus.FOUND, te.SeekCeil(target));
        }
    }
}
/// <summary>
/// Three times over: indexes at least 4097 documents whose "field" is "a" (roughly one in
/// four) or "b", merges to a single segment, and then repeatedly seeks to each term and
/// hands its docs enum to <c>TestOne</c> together with the doc IDs expected for that term.
/// </summary>
public virtual void TestStressAdvance_Mem()
{
    for (int iter = 0; iter < 3; iter++)
    {
        if (VERBOSE)
        {
            Console.WriteLine("\nTEST: iter=" + iter);
        }

        Directory dir = NewDirectory();
        RandomIndexWriter writer = new RandomIndexWriter(Random(), dir);

        // ids of the documents that were given the value "a"
        HashSet<int> idsWithA = new HashSet<int>();

        // A single document instance is reused; only the field values change per add.
        Documents.Document doc = new Documents.Document();
        Field valueField = NewStringField("field", "", Field.Store.NO);
        doc.Add(valueField);
        Field idField = NewStringField("id", "", Field.Store.YES);
        doc.Add(idField);

        int num = AtLeast(4097);
        if (VERBOSE)
        {
            Console.WriteLine("\nTEST: numDocs=" + num);
        }

        for (int id = 0; id < num; id++)
        {
            bool useA = Random().Next(4) == 3;
            valueField.StringValue = useA ? "a" : "b";
            if (useA)
            {
                idsWithA.Add(id);
            }
            idField.StringValue = "" + id;
            writer.AddDocument(doc);
            if (VERBOSE)
            {
                Console.WriteLine("\nTEST: doc upto " + id);
            }
        }

        writer.ForceMerge(1);

        IList<int> aDocIDs = new List<int>();
        IList<int> bDocIDs = new List<int>();

        DirectoryReader reader = writer.Reader;

        // Map each stored id back to the docID it ended up with after the merge.
        int maxDoc = reader.MaxDoc;
        for (int docID = 0; docID < maxDoc; docID++)
        {
            int id = Convert.ToInt32(reader.Document(docID).Get("id"));
            (idsWithA.Contains(id) ? aDocIDs : bDocIDs).Add(docID);
        }

        TermsEnum te = GetOnlySegmentReader(reader).Fields.Terms("field").Iterator(null);

        DocsEnum de = null;
        for (int iter2 = 0; iter2 < 10; iter2++)
        {
            if (VERBOSE)
            {
                Console.WriteLine("\nTEST: iter=" + iter + " iter2=" + iter2);
            }

            Assert.AreEqual(TermsEnum.SeekStatus.FOUND, te.SeekCeil(new BytesRef("a")));
            de = TestUtil.Docs(Random(), te, null, de, DocsEnum.FLAG_NONE);
            TestOne(de, aDocIDs);

            Assert.AreEqual(TermsEnum.SeekStatus.FOUND, te.SeekCeil(new BytesRef("b")));
            de = TestUtil.Docs(Random(), te, null, de, DocsEnum.FLAG_NONE);
            TestOne(de, bDocIDs);
        }

        writer.Dispose();
        reader.Dispose();
        dir.Dispose();
    }
}