// following code is almost an exact dup of code from TestDuelingCodecs: sorry! public virtual void AssertTerms(Terms leftTerms, Terms rightTerms, bool deep) { if (leftTerms == null || rightTerms == null) { Assert.IsNull(leftTerms); Assert.IsNull(rightTerms); return; } AssertTermsStatistics(leftTerms, rightTerms); // NOTE: we don't assert hasOffsets/hasPositions/hasPayloads because they are allowed to be different TermsEnum leftTermsEnum = leftTerms.Iterator(null); TermsEnum rightTermsEnum = rightTerms.Iterator(null); AssertTermsEnum(leftTermsEnum, rightTermsEnum, true); AssertTermsSeeking(leftTerms, rightTerms); if (deep) { int numIntersections = AtLeast(3); for (int i = 0; i < numIntersections; i++) { string re = AutomatonTestUtil.RandomRegexp(Random()); CompiledAutomaton automaton = new CompiledAutomaton((new RegExp(re, RegExp.NONE)).ToAutomaton()); if (automaton.Type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { // TODO: test start term too TermsEnum leftIntersection = leftTerms.Intersect(automaton, null); TermsEnum rightIntersection = rightTerms.Intersect(automaton, null); AssertTermsEnum(leftIntersection, rightIntersection, Rarely()); } } } }
/// <summary> /// Terms api equivalency /// </summary> public void AssertTermsEquals(string info, IndexReader leftReader, Terms leftTerms, Terms rightTerms, bool deep) { if (leftTerms == null || rightTerms == null) { Assert.IsNull(leftTerms, info); Assert.IsNull(rightTerms, info); return; } AssertTermsStatisticsEquals(info, leftTerms, rightTerms); Assert.AreEqual(leftTerms.HasOffsets(), rightTerms.HasOffsets()); Assert.AreEqual(leftTerms.HasPositions(), rightTerms.HasPositions()); Assert.AreEqual(leftTerms.HasPayloads(), rightTerms.HasPayloads()); TermsEnum leftTermsEnum = leftTerms.Iterator(null); TermsEnum rightTermsEnum = rightTerms.Iterator(null); AssertTermsEnumEquals(info, leftReader, leftTermsEnum, rightTermsEnum, true); AssertTermsSeekingEquals(info, leftTerms, rightTerms); if (deep) { int numIntersections = AtLeast(3); for (int i = 0; i < numIntersections; i++) { string re = AutomatonTestUtil.RandomRegexp(Random()); CompiledAutomaton automaton = new CompiledAutomaton((new RegExp(re, RegExp.NONE)).ToAutomaton()); if (automaton.Type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { // TODO: test start term too TermsEnum leftIntersection = leftTerms.Intersect(automaton, null); TermsEnum rightIntersection = rightTerms.Intersect(automaton, null); AssertTermsEnumEquals(info, leftReader, leftIntersection, rightIntersection, Rarely()); } } } }
public virtual void TestIntersectStartTerm() { Directory dir = NewDirectory(); IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())); iwc.SetMergePolicy(new LogDocMergePolicy()); RandomIndexWriter w = new RandomIndexWriter(Random(), dir, iwc); Document doc = new Document(); doc.Add(NewStringField("field", "abc", Field.Store.NO)); w.AddDocument(doc); doc = new Document(); doc.Add(NewStringField("field", "abd", Field.Store.NO)); w.AddDocument(doc); doc = new Document(); doc.Add(NewStringField("field", "acd", Field.Store.NO)); w.AddDocument(doc); doc = new Document(); doc.Add(NewStringField("field", "bcd", Field.Store.NO)); w.AddDocument(doc); w.ForceMerge(1); DirectoryReader r = w.Reader; w.Dispose(); AtomicReader sub = GetOnlySegmentReader(r); Terms terms = sub.Fields.Terms("field"); Automaton automaton = (new RegExp(".*d", RegExp.NONE)).ToAutomaton(); CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false); TermsEnum te; // should seek to startTerm te = terms.Intersect(ca, new BytesRef("aad")); Assert.AreEqual("abd", te.Next().Utf8ToString()); Assert.AreEqual(1, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); Assert.AreEqual("acd", te.Next().Utf8ToString()); Assert.AreEqual(2, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); Assert.AreEqual("bcd", te.Next().Utf8ToString()); Assert.AreEqual(3, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); Assert.IsNull(te.Next()); // should fail to find ceil label on second arc, rewind te = terms.Intersect(ca, new BytesRef("add")); Assert.AreEqual("bcd", te.Next().Utf8ToString()); Assert.AreEqual(3, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); Assert.IsNull(te.Next()); // should reach end te = terms.Intersect(ca, new BytesRef("bcd")); Assert.IsNull(te.Next()); te = terms.Intersect(ca, new BytesRef("ddd")); Assert.IsNull(te.Next()); r.Dispose(); dir.Dispose(); }
private bool Accepts(CompiledAutomaton c, BytesRef b) { int state = c.RunAutomaton.InitialState; for (int idx = 0; idx < b.Length; idx++) { Assert.IsTrue(state != -1); state = c.RunAutomaton.Step(state, b.Bytes[b.Offset + idx] & 0xff); } return c.RunAutomaton.IsAccept(state); }
public virtual void TestIntersectRandom() { Directory dir = NewDirectory(); RandomIndexWriter w = new RandomIndexWriter(Random(), dir); int numTerms = AtLeast(300); //final int numTerms = 50; HashSet<string> terms = new HashSet<string>(); ICollection<string> pendingTerms = new List<string>(); IDictionary<BytesRef, int?> termToID = new Dictionary<BytesRef, int?>(); int id = 0; while (terms.Count != numTerms) { string s = RandomString; if (!terms.Contains(s)) { terms.Add(s); pendingTerms.Add(s); if (Random().Next(20) == 7) { AddDoc(w, pendingTerms, termToID, id++); } } } AddDoc(w, pendingTerms, termToID, id++); BytesRef[] termsArray = new BytesRef[terms.Count]; HashSet<BytesRef> termsSet = new HashSet<BytesRef>(); { int upto = 0; foreach (string s in terms) { BytesRef b = new BytesRef(s); termsArray[upto++] = b; termsSet.Add(b); } Array.Sort(termsArray); } if (VERBOSE) { Console.WriteLine("\nTEST: indexed terms (unicode order):"); foreach (BytesRef t in termsArray) { Console.WriteLine(" " + t.Utf8ToString() + " -> id:" + termToID[t]); } } IndexReader r = w.Reader; w.Dispose(); // NOTE: intentional insanity!! FieldCache.Ints docIDToID = FieldCache.DEFAULT.GetInts(SlowCompositeReaderWrapper.Wrap(r), "id", false); for (int iter = 0; iter < 10 * RANDOM_MULTIPLIER; iter++) { // TODO: can we also test infinite As here...? // From the random terms, pick some ratio and compile an // automaton: HashSet<string> acceptTerms = new HashSet<string>(); SortedSet<BytesRef> sortedAcceptTerms = new SortedSet<BytesRef>(); double keepPct = Random().NextDouble(); Automaton a; if (iter == 0) { if (VERBOSE) { Console.WriteLine("\nTEST: empty automaton"); } a = BasicAutomata.MakeEmpty(); } else { if (VERBOSE) { Console.WriteLine("\nTEST: keepPct=" + keepPct); } foreach (string s in terms) { string s2; if (Random().NextDouble() <= keepPct) { s2 = s; } else { s2 = RandomString; } acceptTerms.Add(s2); sortedAcceptTerms.Add(new BytesRef(s2)); } a = BasicAutomata.MakeStringUnion(sortedAcceptTerms); } if (Random().NextBoolean()) { if (VERBOSE) { Console.WriteLine("TEST: reduce the automaton"); } a.Reduce(); } CompiledAutomaton c = new CompiledAutomaton(a, true, false); BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.Count]; HashSet<BytesRef> acceptTermsSet = new HashSet<BytesRef>(); int upto = 0; foreach (string s in acceptTerms) { BytesRef b = new BytesRef(s); acceptTermsArray[upto++] = b; acceptTermsSet.Add(b); Assert.IsTrue(Accepts(c, b)); } Array.Sort(acceptTermsArray); if (VERBOSE) { Console.WriteLine("\nTEST: accept terms (unicode order):"); foreach (BytesRef t in acceptTermsArray) { Console.WriteLine(" " + t.Utf8ToString() + (termsSet.Contains(t) ? " (exists)" : "")); } Console.WriteLine(a.ToDot()); } for (int iter2 = 0; iter2 < 100; iter2++) { BytesRef startTerm = acceptTermsArray.Length == 0 || Random().NextBoolean() ? null : acceptTermsArray[Random().Next(acceptTermsArray.Length)]; if (VERBOSE) { Console.WriteLine("\nTEST: iter2=" + iter2 + " startTerm=" + (startTerm == null ? "<null>" : startTerm.Utf8ToString())); if (startTerm != null) { int state = c.RunAutomaton.InitialState; for (int idx = 0; idx < startTerm.Length; idx++) { int label = startTerm.Bytes[startTerm.Offset + idx] & 0xff; Console.WriteLine(" state=" + state + " label=" + label); state = c.RunAutomaton.Step(state, label); Assert.IsTrue(state != -1); } Console.WriteLine(" state=" + state); } } TermsEnum te = MultiFields.GetTerms(r, "f").Intersect(c, startTerm); int loc; if (startTerm == null) { loc = 0; } else { loc = Array.BinarySearch(termsArray, BytesRef.DeepCopyOf(startTerm)); if (loc < 0) { loc = -(loc + 1); } else { // startTerm exists in index loc++; } } while (loc < termsArray.Length && !acceptTermsSet.Contains(termsArray[loc])) { loc++; } DocsEnum docsEnum = null; while (loc < termsArray.Length) { BytesRef expected = termsArray[loc]; BytesRef actual = te.Next(); if (VERBOSE) { Console.WriteLine("TEST: next() expected=" + expected.Utf8ToString() + " actual=" + (actual == null ? "null" : actual.Utf8ToString())); } Assert.AreEqual(expected, actual); Assert.AreEqual(1, te.DocFreq()); docsEnum = TestUtil.Docs(Random(), te, null, docsEnum, DocsEnum.FLAG_NONE); int docID = docsEnum.NextDoc(); Assert.IsTrue(docID != DocIdSetIterator.NO_MORE_DOCS); Assert.AreEqual(docIDToID.Get(docID), (int)termToID[expected]); do { loc++; } while (loc < termsArray.Length && !acceptTermsSet.Contains(termsArray[loc])); } Assert.IsNull(te.Next()); } } r.Dispose(); dir.Dispose(); }
public virtual void TestIntersectEmptyString() { Directory dir = NewDirectory(); IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())); iwc.SetMergePolicy(new LogDocMergePolicy()); RandomIndexWriter w = new RandomIndexWriter(Random(), dir, iwc); Document doc = new Document(); doc.Add(NewStringField("field", "", Field.Store.NO)); doc.Add(NewStringField("field", "abc", Field.Store.NO)); w.AddDocument(doc); doc = new Document(); // add empty string to both documents, so that singletonDocID == -1. // For a FST-based term dict, we'll expect to see the first arc is // flaged with HAS_FINAL_OUTPUT doc.Add(NewStringField("field", "abc", Field.Store.NO)); doc.Add(NewStringField("field", "", Field.Store.NO)); w.AddDocument(doc); w.ForceMerge(1); DirectoryReader r = w.Reader; w.Dispose(); AtomicReader sub = GetOnlySegmentReader(r); Terms terms = sub.Fields.Terms("field"); Automaton automaton = (new RegExp(".*", RegExp.NONE)).ToAutomaton(); // accept ALL CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false); TermsEnum te = terms.Intersect(ca, null); DocsEnum de; Assert.AreEqual("", te.Next().Utf8ToString()); de = te.Docs(null, null, DocsEnum.FLAG_NONE); Assert.AreEqual(0, de.NextDoc()); Assert.AreEqual(1, de.NextDoc()); Assert.AreEqual("abc", te.Next().Utf8ToString()); de = te.Docs(null, null, DocsEnum.FLAG_NONE); Assert.AreEqual(0, de.NextDoc()); Assert.AreEqual(1, de.NextDoc()); Assert.IsNull(te.Next()); // pass empty string te = terms.Intersect(ca, new BytesRef("")); Assert.AreEqual("abc", te.Next().Utf8ToString()); de = te.Docs(null, null, DocsEnum.FLAG_NONE); Assert.AreEqual(0, de.NextDoc()); Assert.AreEqual(1, de.NextDoc()); Assert.IsNull(te.Next()); r.Dispose(); dir.Dispose(); }
public override TermsEnum Intersect(CompiledAutomaton automaton, BytesRef bytes) { TermsEnum termsEnum = @in.Intersect(automaton, bytes); Debug.Assert(termsEnum != null); Debug.Assert(bytes == null || bytes.Valid); return new AssertingTermsEnum(termsEnum); }
/// <summary> /// Construct an enumerator based upon an automaton, enumerating the specified /// field, working on a supplied TermsEnum /// <p> /// @lucene.experimental /// <p> </summary> /// <param name="compiled"> CompiledAutomaton </param> public AutomatonTermsEnum(TermsEnum tenum, CompiledAutomaton compiled) : base(tenum) { this.Finite = compiled.Finite; this.RunAutomaton = compiled.RunAutomaton; Debug.Assert(this.RunAutomaton != null); this.CommonSuffixRef = compiled.CommonSuffixRef; this.AllTransitions = compiled.SortedTransitions; // used for path tracking, where each bit is a numbered state. Visited = new long[RunAutomaton.Size]; TermComp = Comparator; }
public override TermsEnum Intersect(CompiledAutomaton compiled, BytesRef startTerm) { IList<MultiTermsEnum.TermsEnumIndex> termsEnums = new List<MultiTermsEnum.TermsEnumIndex>(); for (int i = 0; i < Subs.Length; i++) { TermsEnum termsEnum = Subs[i].Intersect(compiled, startTerm); if (termsEnum != null) { termsEnums.Add(new MultiTermsEnum.TermsEnumIndex(termsEnum, i)); } } if (termsEnums.Count > 0) { return (new MultiTermsEnum(SubSlices)).Reset(termsEnums.ToArray(/*MultiTermsEnum.TermsEnumIndex.EMPTY_ARRAY*/)); } else { return TermsEnum.EMPTY; } }