/// <summary>
/// Constructs an <c>AutomatonQuery</c> that matches every term of
/// <paramref name="term"/>'s field accepted by <paramref name="automaton"/>.
/// </summary>
/// <param name="term"> Supplies the target field; the term's text is ignored. </param>
/// <param name="automaton"> Automaton to run; terms it accepts are considered a match. </param>
public AutomatonQuery(Term term, Automaton automaton)
    : base(term.Field)
{
    this.Term = term;
    // Compile once up front so the runtime form is shared by all rewrites.
    this.Compiled = new CompiledAutomaton(automaton);
    this.Automaton_Renamed = automaton;
}
// Initializes the FST/automaton intersection enum: caches the field's FST, its
// bytes reader and outputs, and the compiled ByteRunAutomaton, then seeds a
// reusable 16-deep frame stack with a virtual (sentinel) frame plus the first
// real frame. When startTerm is supplied, seeks to its ceiling and sets
// 'pending' only if the positioned term is valid, accepted, and not startTerm
// itself — the enum must return terms strictly greater than startTerm.
// NOTE(review): initialization order (virtual frame before first frame, level
// increment in between) appears load-bearing; do not reorder.
internal IntersectTermsEnum(TermsReader outerInstance, CompiledAutomaton compiled, BytesRef startTerm) : base(outerInstance) { //if (TEST) System.out.println("Enum init, startTerm=" + startTerm); this.fst = outerInstance.index; this.fstReader = fst.GetBytesReader(); this.fstOutputs = outerInstance.index.Outputs; this.fsa = compiled.RunAutomaton; this.level = -1; this.stack = new Frame[16]; for (int i = 0; i < stack.Length; i++) { this.stack[i] = new Frame(); } Frame frame; /*frame = */ LoadVirtualFrame(NewFrame()); // LUCENENET: IDE0059: Remove unnecessary value assignment this.level++; frame = LoadFirstFrame(NewFrame()); PushFrame(frame); this.decoded = false; this.pending = false; if (startTerm == null) { pending = IsAccept(TopFrame()); } else { DoSeekCeil(startTerm); pending = !startTerm.Equals(term) && IsValid(TopFrame()) && IsAccept(TopFrame()); } }
// Verifies Terms.Intersect honors a non-null startTerm against ".*d":
//  - startTerm "aad" (not indexed): enum begins at the first accepted term after it ("abd");
//  - startTerm "add": no ceil label exists on the second arc, so the enum must rewind and resume at "bcd";
//  - startTerm equal to the last accepted term ("bcd") or beyond ("ddd"): enum is exhausted immediately.
// NOTE(review): uses the older test API surface (Random(), w.Reader, Fields.Terms, DocsEnum.FLAG_NONE).
public virtual void TestIntersectStartTerm() { Directory dir = NewDirectory(); IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())); iwc.SetMergePolicy(new LogDocMergePolicy()); RandomIndexWriter w = new RandomIndexWriter(Random(), dir, iwc); Document doc = new Document(); doc.Add(NewStringField("field", "abc", Field.Store.NO)); w.AddDocument(doc); doc = new Document(); doc.Add(NewStringField("field", "abd", Field.Store.NO)); w.AddDocument(doc); doc = new Document(); doc.Add(NewStringField("field", "acd", Field.Store.NO)); w.AddDocument(doc); doc = new Document(); doc.Add(NewStringField("field", "bcd", Field.Store.NO)); w.AddDocument(doc); w.ForceMerge(1); DirectoryReader r = w.Reader; w.Dispose(); AtomicReader sub = GetOnlySegmentReader(r); Terms terms = sub.Fields.Terms("field"); Automaton automaton = (new RegExp(".*d", RegExp.NONE)).ToAutomaton(); CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false); TermsEnum te; // should seek to startTerm te = terms.Intersect(ca, new BytesRef("aad")); Assert.AreEqual("abd", te.Next().Utf8ToString()); Assert.AreEqual(1, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); Assert.AreEqual("acd", te.Next().Utf8ToString()); Assert.AreEqual(2, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); Assert.AreEqual("bcd", te.Next().Utf8ToString()); Assert.AreEqual(3, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); Assert.IsNull(te.Next()); // should fail to find ceil label on second arc, rewind te = terms.Intersect(ca, new BytesRef("add")); Assert.AreEqual("bcd", te.Next().Utf8ToString()); Assert.AreEqual(3, te.Docs(null, null, DocsEnum.FLAG_NONE).NextDoc()); Assert.IsNull(te.Next()); // should reach end te = terms.Intersect(ca, new BytesRef("bcd")); Assert.IsNull(te.Next()); te = terms.Intersect(ca, new BytesRef("ddd")); Assert.IsNull(te.Next()); r.Dispose(); dir.Dispose(); }
/// <summary>
/// Creates an <c>AutomatonQuery</c> over the field of <paramref name="term"/>,
/// matching exactly the terms accepted by <paramref name="automaton"/>.
/// </summary>
/// <param name="term"> <see cref="Term"/> providing the field; its text plays no role. </param>
/// <param name="automaton"> <see cref="Automaton"/> to run; accepted terms count as matches. </param>
public AutomatonQuery(Term term, Automaton automaton)
    : base(term.Field)
{
    this.m_term = term;
    // Eagerly compile the automaton so term intersection can reuse it.
    this.m_compiled = new CompiledAutomaton(automaton);
    this.m_automaton = automaton;
}
// Verifies Intersect over a field containing the empty term "": with a null
// startTerm the empty string must come back first (both docs), and with
// startTerm = "" it must be skipped, since the enum only returns terms
// strictly greater than startTerm. Both docs carry "" so singletonDocID == -1,
// exercising the FST first-arc HAS_FINAL_OUTPUT path.
public virtual void TestIntersectEmptyString() { Directory dir = NewDirectory(); IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)); iwc.SetMergePolicy(new LogDocMergePolicy()); RandomIndexWriter w = new RandomIndexWriter(Random, dir, iwc); Document doc = new Document(); doc.Add(NewStringField("field", "", Field.Store.NO)); doc.Add(NewStringField("field", "abc", Field.Store.NO)); w.AddDocument(doc); doc = new Document(); // add empty string to both documents, so that singletonDocID == -1. // For a FST-based term dict, we'll expect to see the first arc is // flaged with HAS_FINAL_OUTPUT doc.Add(NewStringField("field", "abc", Field.Store.NO)); doc.Add(NewStringField("field", "", Field.Store.NO)); w.AddDocument(doc); w.ForceMerge(1); DirectoryReader r = w.GetReader(); w.Dispose(); AtomicReader sub = GetOnlySegmentReader(r); Terms terms = sub.Fields.GetTerms("field"); Automaton automaton = (new RegExp(".*", RegExpSyntax.NONE)).ToAutomaton(); // accept ALL CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false); TermsEnum te = terms.Intersect(ca, null); DocsEnum de; Assert.AreEqual("", te.Next().Utf8ToString()); de = te.Docs(null, null, DocsFlags.NONE); Assert.AreEqual(0, de.NextDoc()); Assert.AreEqual(1, de.NextDoc()); Assert.AreEqual("abc", te.Next().Utf8ToString()); de = te.Docs(null, null, DocsFlags.NONE); Assert.AreEqual(0, de.NextDoc()); Assert.AreEqual(1, de.NextDoc()); Assert.IsNull(te.Next()); // pass empty string te = terms.Intersect(ca, new BytesRef("")); Assert.AreEqual("abc", te.Next().Utf8ToString()); de = te.Docs(null, null, DocsFlags.NONE); Assert.AreEqual(0, de.NextDoc()); Assert.AreEqual(1, de.NextDoc()); Assert.IsNull(te.Next()); r.Dispose(); dir.Dispose(); }
/// <summary>
/// Delegates the intersection to the wrapped <c>Terms</c> (<c>m_input</c>), checks the
/// delegate's contract (non-null result, valid start term), and wraps the result
/// in an asserting enum so its own contract is verified as it is consumed.
/// </summary>
public override TermsEnum Intersect(CompiledAutomaton automaton, BytesRef bytes)
{
    var inner = m_input.Intersect(automaton, bytes);
    Debug.Assert(inner != null);
    Debug.Assert(bytes == null || bytes.IsValid());
    return new AssertingAtomicReader.AssertingTermsEnum(inner);
}
/// <summary>
/// Forwards the intersection to the underlying <c>Terms</c> (<c>@in</c>), verifying
/// via debug asserts that the delegate returned an enum and that any supplied
/// start term is valid, then wraps the enum in an asserting decorator.
/// </summary>
public override TermsEnum Intersect(CompiledAutomaton automaton, BytesRef bytes)
{
    TermsEnum wrapped = @in.Intersect(automaton, bytes);
    Debug.Assert(wrapped != null);
    Debug.Assert(bytes == null || bytes.Valid);
    return new AssertingTermsEnum(wrapped);
}
// Basic Intersect sanity check with a match-all automaton (".*"): a null
// startTerm walks every indexed term in order; a non-indexed startTerm ("abc")
// and an indexed startTerm ("aaa") both resume at the next greater term ("bbb").
public virtual void TestIntersectBasic() { Directory dir = NewDirectory(); IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)); iwc.SetMergePolicy(new LogDocMergePolicy()); RandomIndexWriter w = new RandomIndexWriter(Random, dir, iwc); Document doc = new Document(); doc.Add(NewTextField("field", "aaa", Field.Store.NO)); w.AddDocument(doc); doc = new Document(); doc.Add(NewStringField("field", "bbb", Field.Store.NO)); w.AddDocument(doc); doc = new Document(); doc.Add(NewTextField("field", "ccc", Field.Store.NO)); w.AddDocument(doc); w.ForceMerge(1); DirectoryReader r = w.GetReader(); w.Dispose(); AtomicReader sub = GetOnlySegmentReader(r); Terms terms = sub.Fields.GetTerms("field"); Automaton automaton = (new RegExp(".*", RegExpSyntax.NONE)).ToAutomaton(); CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false); TermsEnum te = terms.Intersect(ca, null); Assert.AreEqual("aaa", te.Next().Utf8ToString()); Assert.AreEqual(0, te.Docs(null, null, DocsFlags.NONE).NextDoc()); Assert.AreEqual("bbb", te.Next().Utf8ToString()); Assert.AreEqual(1, te.Docs(null, null, DocsFlags.NONE).NextDoc()); Assert.AreEqual("ccc", te.Next().Utf8ToString()); Assert.AreEqual(2, te.Docs(null, null, DocsFlags.NONE).NextDoc()); Assert.IsNull(te.Next()); te = terms.Intersect(ca, new BytesRef("abc")); Assert.AreEqual("bbb", te.Next().Utf8ToString()); Assert.AreEqual(1, te.Docs(null, null, DocsFlags.NONE).NextDoc()); Assert.AreEqual("ccc", te.Next().Utf8ToString()); Assert.AreEqual(2, te.Docs(null, null, DocsFlags.NONE).NextDoc()); Assert.IsNull(te.Next()); te = terms.Intersect(ca, new BytesRef("aaa")); Assert.AreEqual("bbb", te.Next().Utf8ToString()); Assert.AreEqual(1, te.Docs(null, null, DocsFlags.NONE).NextDoc()); Assert.AreEqual("ccc", te.Next().Utf8ToString()); Assert.AreEqual(2, te.Docs(null, null, DocsFlags.NONE).NextDoc()); Assert.IsNull(te.Next()); r.Dispose(); dir.Dispose(); }
/// <summary>
/// Runs the compiled automaton over the bytes of <paramref name="b"/> and reports
/// whether it ends in an accept state. Asserts the automaton never dead-ends
/// (state -1) before consuming each byte.
/// </summary>
private bool Accepts(CompiledAutomaton c, BytesRef b)
{
    int state = c.RunAutomaton.InitialState;
    int end = b.Offset + b.Length;
    for (int pos = b.Offset; pos < end; pos++)
    {
        Assert.IsTrue(state != -1);
        state = c.RunAutomaton.Step(state, b.Bytes[pos] & 0xff);
    }
    return c.RunAutomaton.IsAccept(state);
}
// Caches the compiled automaton's components (finiteness flag, byte-run
// automaton, common suffix, sorted transitions) and allocates the 'visited'
// array with one slot per automaton state — presumably used as per-state
// markers during term path walking (TODO confirm against usage sites).
/// <summary> /// Construct an enumerator based upon an automaton, enumerating the specified /// field, working on a supplied <see cref="TermsEnum"/> /// <para/> /// @lucene.experimental /// </summary> /// <param name="tenum"> TermsEnum </param> /// <param name="compiled"> CompiledAutomaton </param> public AutomatonTermsEnum(TermsEnum tenum, CompiledAutomaton compiled) : base(tenum) { this.finite = compiled.Finite; this.runAutomaton = compiled.RunAutomaton; Debug.Assert(this.runAutomaton != null); this.commonSuffixRef = compiled.CommonSuffixRef; this.allTransitions = compiled.SortedTransitions; // used for path tracking, where each bit is a numbered state. visited = new long[runAutomaton.Count]; termComp = Comparer; }
/// <summary>
/// Return an automata-based enum for matching up to <paramref name="editDistance"/> from
/// <paramref name="lastTerm"/>, if possible; <c>null</c> when no automaton exists for
/// that distance.
/// </summary>
protected virtual TermsEnum GetAutomatonEnum(int editDistance, BytesRef lastTerm)
{
    IList<CompiledAutomaton> runAutomata = InitAutomata(editDistance);
    if (editDistance >= runAutomata.Count)
    {
        // No automaton was built for this edit distance.
        return null;
    }
    CompiledAutomaton compiled = runAutomata[editDistance];
    // Resume from the floored form of lastTerm (or from the start when null).
    BytesRef startTerm = lastTerm == null ? null : compiled.Floor(lastTerm, new BytesRef());
    return new AutomatonFuzzyTermsEnum(
        this,
        m_terms.Intersect(compiled, startTerm),
        runAutomata.SubList(0, editDistance + 1).ToArray());
}
/// <summary>
/// Returns a <see cref="TermsEnum"/> that iterates over all terms that
/// are accepted by the provided
/// <see cref="CompiledAutomaton"/>. If the <paramref name="startTerm"/> is
/// provided then the returned enum will only accept terms
/// > <paramref name="startTerm"/>, but you still must call
/// <see cref="TermsEnum.Next()"/> first to get to the first term. Note that the
/// provided <paramref name="startTerm"/> must be accepted by
/// the automaton.
///
/// <para><b>NOTE</b>: the returned <see cref="TermsEnum"/> cannot
/// seek.</para>
/// </summary>
/// <exception cref="System.ArgumentException"> if the automaton type is not
/// <c>NORMAL</c>; use <c>CompiledAutomaton.GetTermsEnum()</c> for the special types. </exception>
public virtual TermsEnum Intersect(CompiledAutomaton compiled, BytesRef startTerm)
{
    // TODO: eventually we could support seekCeil/Exact on
    // the returned enum, instead of only being able to seek
    // at the start
    if (compiled.Type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL)
    {
        // Fixed message: refer to the .NET member name, not Java's getTermsEnum().
        throw new System.ArgumentException("please use CompiledAutomaton.GetTermsEnum() instead");
    }
    if (startTerm == null)
    {
        return new AutomatonTermsEnum(GetIterator(null), compiled);
    }
    else
    {
        return new AutomatonTermsEnumAnonymousInnerClassHelper(this, GetIterator(null), compiled, startTerm);
    }
}
/// <summary>
/// Returns a <see cref="TermsEnum"/> that iterates over all terms that
/// are accepted by the provided <see cref="CompiledAutomaton"/>.
/// If the <paramref name="startTerm"/> is provided then the returned enum
/// will only accept terms > <paramref name="startTerm"/>, but you still must
/// call <see cref="TermsEnum.Next()"/> first to get to the first term. Note that
/// the provided <paramref name="startTerm"/> must be accepted by the automaton.
///
/// <para><b>NOTE</b>: the returned <see cref="TermsEnum"/> cannot
/// seek.</para>
/// </summary>
/// <exception cref="System.ArgumentException"> if the automaton type is not
/// <c>NORMAL</c>; use <c>CompiledAutomaton.GetTermsEnum()</c> for the special types. </exception>
public virtual TermsEnum Intersect(CompiledAutomaton compiled, BytesRef startTerm)
{
    // TODO: eventually we could support seekCeil/Exact on
    // the returned enum, instead of only being able to seek
    // at the start
    if (compiled.Type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL)
    {
        // Fixed message: refer to the .NET member name, not Java's getTermsEnum().
        throw new System.ArgumentException("please use CompiledAutomaton.GetTermsEnum() instead");
    }
    if (startTerm == null)
    {
        return new AutomatonTermsEnum(Iterator(null), compiled);
    }
    else
    {
        return new AutomatonTermsEnumAnonymousInnerClassHelper(this, Iterator(null), compiled, startTerm);
    }
}
// Property-style check: for random regexps, the set of terms returned by
// Intersect must recognize exactly Intersection(termsAutomaton, automaton).
// Equivalence is verified by rebuilding an automaton from the collected terms
// (MakeStringUnion) and comparing languages with SameLanguage.
public virtual void TestIntersect() { for (int i = 0; i < numIterations; i++) { string reg = AutomatonTestUtil.RandomRegexp(Random); Automaton automaton = (new RegExp(reg, RegExpSyntax.NONE)).ToAutomaton(); CompiledAutomaton ca = new CompiledAutomaton(automaton, SpecialOperations.IsFinite(automaton), false); TermsEnum te = MultiFields.GetTerms(reader, "field").Intersect(ca, null); Automaton expected = BasicOperations.Intersection(termsAutomaton, automaton); JCG.SortedSet <BytesRef> found = new JCG.SortedSet <BytesRef>(); while (te.Next() != null) { found.Add(BytesRef.DeepCopyOf(te.Term)); } Automaton actual = BasicAutomata.MakeStringUnion(found); Assert.IsTrue(BasicOperations.SameLanguage(expected, actual)); } }
/// <summary>
/// Intersects each sub-reader's terms with the automaton and merges the
/// surviving enums into a single <c>MultiTermsEnum</c>; returns
/// <see cref="TermsEnum.EMPTY"/> when no sub-reader produced an enum.
/// </summary>
public override TermsEnum Intersect(CompiledAutomaton compiled, BytesRef startTerm)
{
    var matching = new List<MultiTermsEnum.TermsEnumIndex>();
    for (int sub = 0; sub < subs.Length; sub++)
    {
        TermsEnum sliceEnum = subs[sub].Intersect(compiled, startTerm);
        if (sliceEnum != null)
        {
            matching.Add(new MultiTermsEnum.TermsEnumIndex(sliceEnum, sub));
        }
    }
    if (matching.Count == 0)
    {
        return TermsEnum.EMPTY;
    }
    return new MultiTermsEnum(subSlices).Reset(matching.ToArray());
}
// Asserts two Terms instances are equivalent: both null or both non-null,
// matching statistics, full TermsEnum agreement, and equal seeking behavior.
// When 'deep', additionally duels Intersect(automaton, null) on a few random
// regexps (NORMAL automata only); start terms are intentionally not exercised
// (see the TODO inside).
// following code is almost an exact dup of code from TestDuelingCodecs: sorry! public virtual void AssertTerms(Terms leftTerms, Terms rightTerms, bool deep) { if (leftTerms == null || rightTerms == null) { Assert.IsNull(leftTerms); Assert.IsNull(rightTerms); return; } AssertTermsStatistics(leftTerms, rightTerms); // NOTE: we don't assert hasOffsets/hasPositions/hasPayloads because they are allowed to be different TermsEnum leftTermsEnum = leftTerms.GetIterator(null); TermsEnum rightTermsEnum = rightTerms.GetIterator(null); AssertTermsEnum(leftTermsEnum, rightTermsEnum, true); AssertTermsSeeking(leftTerms, rightTerms); if (deep) { int numIntersections = AtLeast(3); for (int i = 0; i < numIntersections; i++) { string re = AutomatonTestUtil.RandomRegexp(Random); CompiledAutomaton automaton = new CompiledAutomaton((new RegExp(re, RegExpSyntax.NONE)).ToAutomaton()); if (automaton.Type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { // TODO: test start term too TermsEnum leftIntersection = leftTerms.Intersect(automaton, null); TermsEnum rightIntersection = rightTerms.Intersect(automaton, null); AssertTermsEnum(leftIntersection, rightIntersection, Rarely()); } } } }
// Randomized end-to-end test of Terms.Intersect: indexes ~300 random terms
// (grouped into docs carrying an "id" field), builds an automaton accepting a
// random subset of them (MakeStringUnion, sometimes reduced), then for 100
// random (possibly null) start terms walks the intersection and checks each
// returned term and its doc id against an expected sequence derived via
// Array.BinarySearch over the sorted indexed terms filtered by the accept set.
// NOTE(review): startTerm is always drawn from the accepted terms, honoring the
// Intersect contract that startTerm must be accepted by the automaton.
public virtual void TestIntersectRandom() { Directory dir = NewDirectory(); RandomIndexWriter w = new RandomIndexWriter( #if FEATURE_INSTANCE_TESTDATA_INITIALIZATION this, #endif Random, dir); int numTerms = AtLeast(300); //final int numTerms = 50; HashSet <string> terms = new HashSet <string>(); ICollection <string> pendingTerms = new List <string>(); IDictionary <BytesRef, int?> termToID = new Dictionary <BytesRef, int?>(); int id = 0; while (terms.Count != numTerms) { string s = RandomString; if (!terms.Contains(s)) { terms.Add(s); pendingTerms.Add(s); if (Random.Next(20) == 7) { AddDoc(w, pendingTerms, termToID, id++); } } } AddDoc(w, pendingTerms, termToID, id++); BytesRef[] termsArray = new BytesRef[terms.Count]; HashSet <BytesRef> termsSet = new HashSet <BytesRef>(); { int upto = 0; foreach (string s in terms) { BytesRef b = new BytesRef(s); termsArray[upto++] = b; termsSet.Add(b); } Array.Sort(termsArray); } if (VERBOSE) { Console.WriteLine("\nTEST: indexed terms (unicode order):"); foreach (BytesRef t in termsArray) { Console.WriteLine(" " + t.Utf8ToString() + " -> id:" + termToID[t]); } } IndexReader r = w.GetReader(); w.Dispose(); // NOTE: intentional insanity!! FieldCache.Int32s docIDToID = FieldCache.DEFAULT.GetInt32s(SlowCompositeReaderWrapper.Wrap(r), "id", false); for (int iter = 0; iter < 10 * RANDOM_MULTIPLIER; iter++) { // TODO: can we also test infinite As here...? 
// From the random terms, pick some ratio and compile an // automaton: HashSet <string> acceptTerms = new HashSet <string>(); SortedSet <BytesRef> sortedAcceptTerms = new SortedSet <BytesRef>(); double keepPct = Random.NextDouble(); Automaton a; if (iter == 0) { if (VERBOSE) { Console.WriteLine("\nTEST: empty automaton"); } a = BasicAutomata.MakeEmpty(); } else { if (VERBOSE) { Console.WriteLine("\nTEST: keepPct=" + keepPct); } foreach (string s in terms) { string s2; if (Random.NextDouble() <= keepPct) { s2 = s; } else { s2 = RandomString; } acceptTerms.Add(s2); sortedAcceptTerms.Add(new BytesRef(s2)); } a = BasicAutomata.MakeStringUnion(sortedAcceptTerms); } if (Random.NextBoolean()) { if (VERBOSE) { Console.WriteLine("TEST: reduce the automaton"); } a.Reduce(); } CompiledAutomaton c = new CompiledAutomaton(a, true, false); BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.Count]; HashSet <BytesRef> acceptTermsSet = new HashSet <BytesRef>(); int upto = 0; foreach (string s in acceptTerms) { BytesRef b = new BytesRef(s); acceptTermsArray[upto++] = b; acceptTermsSet.Add(b); Assert.IsTrue(Accepts(c, b)); } Array.Sort(acceptTermsArray); if (VERBOSE) { Console.WriteLine("\nTEST: accept terms (unicode order):"); foreach (BytesRef t in acceptTermsArray) { Console.WriteLine(" " + t.Utf8ToString() + (termsSet.Contains(t) ? " (exists)" : "")); } Console.WriteLine(a.ToDot()); } for (int iter2 = 0; iter2 < 100; iter2++) { BytesRef startTerm = acceptTermsArray.Length == 0 || Random.NextBoolean() ? null : acceptTermsArray[Random.Next(acceptTermsArray.Length)]; if (VERBOSE) { Console.WriteLine("\nTEST: iter2=" + iter2 + " startTerm=" + (startTerm == null ? 
"<null>" : startTerm.Utf8ToString())); if (startTerm != null) { int state = c.RunAutomaton.InitialState; for (int idx = 0; idx < startTerm.Length; idx++) { int label = startTerm.Bytes[startTerm.Offset + idx] & 0xff; Console.WriteLine(" state=" + state + " label=" + label); state = c.RunAutomaton.Step(state, label); Assert.IsTrue(state != -1); } Console.WriteLine(" state=" + state); } } TermsEnum te = MultiFields.GetTerms(r, "f").Intersect(c, startTerm); int loc; if (startTerm == null) { loc = 0; } else { loc = Array.BinarySearch(termsArray, BytesRef.DeepCopyOf(startTerm)); if (loc < 0) { loc = -(loc + 1); } else { // startTerm exists in index loc++; } } while (loc < termsArray.Length && !acceptTermsSet.Contains(termsArray[loc])) { loc++; } DocsEnum docsEnum = null; while (loc < termsArray.Length) { BytesRef expected = termsArray[loc]; BytesRef actual = te.Next(); if (VERBOSE) { Console.WriteLine("TEST: next() expected=" + expected.Utf8ToString() + " actual=" + (actual == null ? "null" : actual.Utf8ToString())); } Assert.AreEqual(expected, actual); Assert.AreEqual(1, te.DocFreq); docsEnum = TestUtil.Docs(Random, te, null, docsEnum, DocsFlags.NONE); int docID = docsEnum.NextDoc(); Assert.IsTrue(docID != DocIdSetIterator.NO_MORE_DOCS); Assert.AreEqual(docIDToID.Get(docID), (int)termToID[expected]); do { loc++; } while (loc < termsArray.Length && !acceptTermsSet.Contains(termsArray[loc])); } Assert.IsNull(te.Next()); } } r.Dispose(); dir.Dispose(); }
// Constructor for the in-memory (DirectPostingsFormat) intersect enum: seeds the
// automaton state stack at the initial state, then — when startTerm is given —
// simultaneously walks startTerm's byte labels and the per-term skip data to
// position termOrd just before the first term Next() should return. Matching a
// label pushes a new automaton state (the automaton is required to accept
// startTerm); a mismatch either jumps via the skip table or advances/retreats
// termOrd directly; when skip data is exhausted it falls back to a linear scan
// bounded by minSkipCount.
// NOTE(review): the goto labels (nextLabelContinue/nextLabelBreak) emulate
// Java's labeled continue/break; statement order and the stateUpto/skipUpto
// bookkeeping are load-bearing — do not restructure without the original
// DirectPostingsFormat tests in place.
public DirectIntersectTermsEnum(DirectPostingsFormat.DirectField outerInstance, CompiledAutomaton compiled, BytesRef startTerm) { this.outerInstance = outerInstance; runAutomaton = compiled.RunAutomaton; compiledAutomaton = compiled; termOrd = -1; states = new State[1]; states[0] = new State(this); states[0].changeOrd = outerInstance.terms.Length; states[0].state = runAutomaton.InitialState; states[0].transitions = compiledAutomaton.SortedTransitions[states[0].state]; states[0].transitionUpto = -1; states[0].transitionMax = -1; //System.out.println("IE.init startTerm=" + startTerm); if (startTerm != null) { int skipUpto = 0; if (startTerm.Length == 0) { if (outerInstance.terms.Length > 0 && outerInstance.termOffsets[1] == 0) { termOrd = 0; } } else { termOrd++; for (int i = 0; i < startTerm.Length; i++) { int label = startTerm.Bytes[startTerm.Offset + i] & 0xFF; while (label > states[i].transitionMax) { states[i].transitionUpto++; Debug.Assert(states[i].transitionUpto < states[i].transitions.Length); states[i].transitionMin = states[i].transitions[states[i].transitionUpto].Min; states[i].transitionMax = states[i].transitions[states[i].transitionUpto].Max; Debug.Assert(states[i].transitionMin >= 0); Debug.Assert(states[i].transitionMin <= 255); Debug.Assert(states[i].transitionMax >= 0); Debug.Assert(states[i].transitionMax <= 255); } // Skip forwards until we find a term matching // the label at this position: while (termOrd < outerInstance.terms.Length) { int skipOffset = outerInstance.skipOffsets[termOrd]; int numSkips = outerInstance.skipOffsets[termOrd + 1] - skipOffset; int termOffset_i = outerInstance.termOffsets[termOrd]; int termLength = outerInstance.termOffsets[1 + termOrd] - termOffset_i; // if (DEBUG) { // System.out.println(" check termOrd=" + termOrd + " term=" + new BytesRef(termBytes, termOffset, termLength).utf8ToString() + " skips=" + Arrays.toString(skips) + " i=" + i); // } if (termOrd == states[stateUpto].changeOrd) { // if (DEBUG) { // 
System.out.println(" end push return"); // } stateUpto--; termOrd--; return; } if (termLength == i) { termOrd++; skipUpto = 0; // if (DEBUG) { // System.out.println(" term too short; next term"); // } } else if (label < (outerInstance.termBytes[termOffset_i + i] & 0xFF)) { termOrd--; // if (DEBUG) { // System.out.println(" no match; already beyond; return termOrd=" + termOrd); // } stateUpto -= skipUpto; Debug.Assert(stateUpto >= 0); return; } else if (label == (outerInstance.termBytes[termOffset_i + i] & 0xFF)) { // if (DEBUG) { // System.out.println(" label[" + i + "] matches"); // } if (skipUpto < numSkips) { Grow(); int nextState = runAutomaton.Step(states[stateUpto].state, label); // Automaton is required to accept startTerm: Debug.Assert(nextState != -1); stateUpto++; states[stateUpto].changeOrd = outerInstance.skips[skipOffset + skipUpto++]; states[stateUpto].state = nextState; states[stateUpto].transitions = compiledAutomaton.SortedTransitions[nextState]; states[stateUpto].transitionUpto = -1; states[stateUpto].transitionMax = -1; //System.out.println(" push " + states[stateUpto].transitions.length + " trans"); // if (DEBUG) { // System.out.println(" push skip; changeOrd=" + states[stateUpto].changeOrd); // } // Match next label at this same term: goto nextLabelContinue; } else { // if (DEBUG) { // System.out.println(" linear scan"); // } // Index exhausted: just scan now (the // number of scans required will be less // than the minSkipCount): int startTermOrd = termOrd; while (termOrd < outerInstance.terms.Length && outerInstance.Compare(termOrd, startTerm) <= 0) { Debug.Assert(termOrd == startTermOrd || outerInstance.skipOffsets[termOrd] == outerInstance.skipOffsets[termOrd + 1]); termOrd++; } Debug.Assert(termOrd - startTermOrd < outerInstance.minSkipCount); termOrd--; stateUpto -= skipUpto; // if (DEBUG) { // System.out.println(" end termOrd=" + termOrd); // } return; } } else { if (skipUpto < numSkips) { termOrd = outerInstance.skips[skipOffset + 
skipUpto]; // if (DEBUG) { // System.out.println(" no match; skip to termOrd=" + termOrd); // } } else { // if (DEBUG) { // System.out.println(" no match; next term"); // } termOrd++; } skipUpto = 0; } } // startTerm is >= last term so enum will not // return any terms: termOrd--; // if (DEBUG) { // System.out.println(" beyond end; no terms will match"); // } return; nextLabelContinue: ; } nextLabelBreak: ; } int termOffset = outerInstance.termOffsets[termOrd]; int termLen = outerInstance.termOffsets[1 + termOrd] - termOffset; if (termOrd >= 0 && !startTerm.Equals(new BytesRef(outerInstance.termBytes, termOffset, termLen))) { stateUpto -= skipUpto; termOrd--; } // if (DEBUG) { // System.out.println(" loop end; return termOrd=" + termOrd + " stateUpto=" + stateUpto); // } } }
/// <summary>
/// Binds the enclosing <see cref="Terms"/> instance and the start term to an
/// <c>AutomatonTermsEnum</c> specialization; all real work happens in the base class.
/// </summary>
public AutomatonTermsEnumAnonymousInnerClassHelper(Terms outerInstance, Lucene.Net.Index.TermsEnum iterator, CompiledAutomaton compiled, BytesRef startTerm)
    : base(iterator, compiled)
{
    this.startTerm = startTerm;
    this.outerInstance = outerInstance;
}
// Asserts two Terms instances expose the same terms API surface: statistics,
// offsets/positions/payloads flags, full TermsEnum agreement, and seeking.
// When 'deep', also duels Intersect(automaton, null) on random regexps
// (NORMAL automata only); start terms are not exercised (see TODO inside).
/// <summary> /// Terms api equivalency /// </summary> public void AssertTermsEquals(string info, IndexReader leftReader, Terms leftTerms, Terms rightTerms, bool deep) { if (leftTerms == null || rightTerms == null) { Assert.IsNull(leftTerms, info); Assert.IsNull(rightTerms, info); return; } AssertTermsStatisticsEquals(info, leftTerms, rightTerms); Assert.AreEqual(leftTerms.HasOffsets(), rightTerms.HasOffsets()); Assert.AreEqual(leftTerms.HasPositions(), rightTerms.HasPositions()); Assert.AreEqual(leftTerms.HasPayloads(), rightTerms.HasPayloads()); TermsEnum leftTermsEnum = leftTerms.Iterator(null); TermsEnum rightTermsEnum = rightTerms.Iterator(null); AssertTermsEnumEquals(info, leftReader, leftTermsEnum, rightTermsEnum, true); AssertTermsSeekingEquals(info, leftTerms, rightTerms); if (deep) { int numIntersections = AtLeast(3); for (int i = 0; i < numIntersections; i++) { string re = AutomatonTestUtil.RandomRegexp(Random()); CompiledAutomaton automaton = new CompiledAutomaton((new RegExp(re, RegExp.NONE)).ToAutomaton()); if (automaton.Type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { // TODO: test start term too TermsEnum leftIntersection = leftTerms.Intersect(automaton, null); TermsEnum rightIntersection = rightTerms.Intersect(automaton, null); AssertTermsEnumEquals(info, leftReader, leftIntersection, rightIntersection, Rarely()); } } } }
// Builds the per-edit-distance matchers: one ByteRunAutomaton per compiled
// automaton (index presumably corresponds to edit distance — confirm against
// InitAutomata), and caches the base term's bytes for exact-match checks.
// NOTE(review): the InstanceFieldsInitialized dance is a Java->C# port artifact
// for field initializers that depended on 'this'.
public AutomatonFuzzyTermsEnum(FuzzyTermsEnum outerInstance, TermsEnum tenum, CompiledAutomaton[] compiled) : base(tenum, false) { this.OuterInstance = outerInstance; if (!InstanceFieldsInitialized) { InitializeInstanceFields(); InstanceFieldsInitialized = true; } this.Matchers = new ByteRunAutomaton[compiled.Length]; for (int i = 0; i < compiled.Length; i++) { this.Matchers[i] = compiled[i].RunAutomaton; } TermRef = new BytesRef(outerInstance.Term_Renamed.Text()); }
/// <summary>
/// Captures the start term for a specialized <c>AutomatonTermsEnum</c>;
/// the base class drives the actual automaton-guided enumeration.
/// </summary>
public AutomatonTermsEnumAnonymousInnerClassHelper(TermsEnum iterator, CompiledAutomaton compiled, BytesRef startTerm)
    : base(iterator, compiled)
{
    // Remember where enumeration should resume from.
    this.startTerm = startTerm;
}
/// <summary>
/// Captures the enclosing <see cref="Terms"/> and the start term for a
/// specialized <c>AutomatonTermsEnum</c>; enumeration logic lives in the base class.
/// </summary>
public AutomatonTermsEnumAnonymousInnerClassHelper(Terms outerInstance, Lucene.Net.Index.TermsEnum iterator, CompiledAutomaton compiled, BytesRef startTerm)
    : base(iterator, compiled)
{
    this.StartTerm = startTerm;
    this.OuterInstance = outerInstance;
}
/// <summary>
/// Intersects via the wrapped <c>Terms</c> (<c>@in</c>) and returns an asserting
/// wrapper around the result; debug-asserts the delegate's contract first.
/// </summary>
public override TermsEnum Intersect(CompiledAutomaton automaton, BytesRef bytes)
{
    TermsEnum result = @in.Intersect(automaton, bytes);
    Debug.Assert(result != null);
    Debug.Assert(bytes == null || bytes.Valid);
    return new AssertingTermsEnum(result);
}
// Randomized end-to-end test of Terms.Intersect (older test API surface:
// Random(), w.Reader, FieldCache.Ints, DocsEnum.FLAG_NONE): indexes ~300 random
// terms grouped into docs with an "id" field, builds an automaton accepting a
// random subset (MakeStringUnion, sometimes reduced), then for 100 random
// (possibly null) start terms walks the intersection and checks each returned
// term and doc id against an expected sequence derived via Array.BinarySearch
// over the sorted indexed terms filtered by the accept set. startTerm is always
// drawn from the accepted terms, per the Intersect contract.
public virtual void TestIntersectRandom() { Directory dir = NewDirectory(); RandomIndexWriter w = new RandomIndexWriter(Random(), dir, Similarity, TimeZone); int numTerms = AtLeast(300); //final int numTerms = 50; HashSet<string> terms = new HashSet<string>(); ICollection<string> pendingTerms = new List<string>(); IDictionary<BytesRef, int?> termToID = new Dictionary<BytesRef, int?>(); int id = 0; while (terms.Count != numTerms) { string s = RandomString; if (!terms.Contains(s)) { terms.Add(s); pendingTerms.Add(s); if (Random().Next(20) == 7) { AddDoc(w, pendingTerms, termToID, id++); } } } AddDoc(w, pendingTerms, termToID, id++); BytesRef[] termsArray = new BytesRef[terms.Count]; HashSet<BytesRef> termsSet = new HashSet<BytesRef>(); { int upto = 0; foreach (string s in terms) { BytesRef b = new BytesRef(s); termsArray[upto++] = b; termsSet.Add(b); } Array.Sort(termsArray); } if (VERBOSE) { Console.WriteLine("\nTEST: indexed terms (unicode order):"); foreach (BytesRef t in termsArray) { Console.WriteLine(" " + t.Utf8ToString() + " -> id:" + termToID[t]); } } IndexReader r = w.Reader; w.Dispose(); // NOTE: intentional insanity!! FieldCache.Ints docIDToID = FieldCache.DEFAULT.GetInts(SlowCompositeReaderWrapper.Wrap(r), "id", false); for (int iter = 0; iter < 10 * RANDOM_MULTIPLIER; iter++) { // TODO: can we also test infinite As here...? 
// From the random terms, pick some ratio and compile an // automaton: HashSet<string> acceptTerms = new HashSet<string>(); SortedSet<BytesRef> sortedAcceptTerms = new SortedSet<BytesRef>(); double keepPct = Random().NextDouble(); Automaton a; if (iter == 0) { if (VERBOSE) { Console.WriteLine("\nTEST: empty automaton"); } a = BasicAutomata.MakeEmpty(); } else { if (VERBOSE) { Console.WriteLine("\nTEST: keepPct=" + keepPct); } foreach (string s in terms) { string s2; if (Random().NextDouble() <= keepPct) { s2 = s; } else { s2 = RandomString; } acceptTerms.Add(s2); sortedAcceptTerms.Add(new BytesRef(s2)); } a = BasicAutomata.MakeStringUnion(sortedAcceptTerms); } if (Random().NextBoolean()) { if (VERBOSE) { Console.WriteLine("TEST: reduce the automaton"); } a.Reduce(); } CompiledAutomaton c = new CompiledAutomaton(a, true, false); BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.Count]; HashSet<BytesRef> acceptTermsSet = new HashSet<BytesRef>(); int upto = 0; foreach (string s in acceptTerms) { BytesRef b = new BytesRef(s); acceptTermsArray[upto++] = b; acceptTermsSet.Add(b); Assert.IsTrue(Accepts(c, b)); } Array.Sort(acceptTermsArray); if (VERBOSE) { Console.WriteLine("\nTEST: accept terms (unicode order):"); foreach (BytesRef t in acceptTermsArray) { Console.WriteLine(" " + t.Utf8ToString() + (termsSet.Contains(t) ? " (exists)" : "")); } Console.WriteLine(a.ToDot()); } for (int iter2 = 0; iter2 < 100; iter2++) { BytesRef startTerm = acceptTermsArray.Length == 0 || Random().NextBoolean() ? null : acceptTermsArray[Random().Next(acceptTermsArray.Length)]; if (VERBOSE) { Console.WriteLine("\nTEST: iter2=" + iter2 + " startTerm=" + (startTerm == null ? 
"<null>" : startTerm.Utf8ToString())); if (startTerm != null) { int state = c.RunAutomaton.InitialState; for (int idx = 0; idx < startTerm.Length; idx++) { int label = startTerm.Bytes[startTerm.Offset + idx] & 0xff; Console.WriteLine(" state=" + state + " label=" + label); state = c.RunAutomaton.Step(state, label); Assert.IsTrue(state != -1); } Console.WriteLine(" state=" + state); } } TermsEnum te = MultiFields.GetTerms(r, "f").Intersect(c, startTerm); int loc; if (startTerm == null) { loc = 0; } else { loc = Array.BinarySearch(termsArray, BytesRef.DeepCopyOf(startTerm)); if (loc < 0) { loc = -(loc + 1); } else { // startTerm exists in index loc++; } } while (loc < termsArray.Length && !acceptTermsSet.Contains(termsArray[loc])) { loc++; } DocsEnum docsEnum = null; while (loc < termsArray.Length) { BytesRef expected = termsArray[loc]; BytesRef actual = te.Next(); if (VERBOSE) { Console.WriteLine("TEST: next() expected=" + expected.Utf8ToString() + " actual=" + (actual == null ? "null" : actual.Utf8ToString())); } Assert.AreEqual(expected, actual); Assert.AreEqual(1, te.DocFreq()); docsEnum = TestUtil.Docs(Random(), te, null, docsEnum, DocsEnum.FLAG_NONE); int docID = docsEnum.NextDoc(); Assert.IsTrue(docID != DocIdSetIterator.NO_MORE_DOCS); Assert.AreEqual(docIDToID.Get(docID), (int)termToID[expected]); do { loc++; } while (loc < termsArray.Length && !acceptTermsSet.Contains(termsArray[loc])); } Assert.IsNull(te.Next()); } } r.Dispose(); dir.Dispose(); }
/// <summary>
/// Creates the FST-driven intersection enum for this field, optionally
/// positioned just past <paramref name="startTerm"/>.
/// </summary>
public override TermsEnum Intersect(CompiledAutomaton compiled, BytesRef startTerm)
{
    return new IntersectTermsEnum(this, compiled, startTerm);
}
/// <summary>
/// Returns an enum over all terms accepted by <paramref name="compiled"/>,
/// optionally resuming after <paramref name="startTerm"/>.
/// </summary>
/// <exception cref="System.ArgumentException"> if the automaton type is not
/// <c>NORMAL</c>; use <c>CompiledAutomaton.GetTermsEnum()</c> for the special types. </exception>
public override TermsEnum Intersect(CompiledAutomaton compiled, BytesRef startTerm)
{
    if (compiled.Type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL)
    {
        // Fixed message: refer to the .NET member name, not Java's getTermsEnum().
        throw new System.ArgumentException("please use CompiledAutomaton.GetTermsEnum() instead");
    }
    return new IntersectEnum(this, compiled, startTerm);
}
/// <summary>
/// Pure delegation: the wrapped <c>Terms</c> performs the actual intersection.
/// </summary>
public override TermsEnum Intersect(CompiledAutomaton compiled, BytesRef startTerm)
{
    return _delegateTerms.Intersect(compiled, startTerm);
}
// Asserts two Terms instances are equivalent (older API surface: Iterator(null),
// Random(), RegExp.NONE): both null or both non-null, matching statistics, full
// TermsEnum agreement, equal seeking behavior, and — when 'deep' — random-regexp
// Intersect duels with a null start term (NORMAL automata only; see TODO inside).
// following code is almost an exact dup of code from TestDuelingCodecs: sorry! public virtual void AssertTerms(Terms leftTerms, Terms rightTerms, bool deep) { if (leftTerms == null || rightTerms == null) { Assert.IsNull(leftTerms); Assert.IsNull(rightTerms); return; } AssertTermsStatistics(leftTerms, rightTerms); // NOTE: we don't assert hasOffsets/hasPositions/hasPayloads because they are allowed to be different TermsEnum leftTermsEnum = leftTerms.Iterator(null); TermsEnum rightTermsEnum = rightTerms.Iterator(null); AssertTermsEnum(leftTermsEnum, rightTermsEnum, true); AssertTermsSeeking(leftTerms, rightTerms); if (deep) { int numIntersections = AtLeast(3); for (int i = 0; i < numIntersections; i++) { string re = AutomatonTestUtil.RandomRegexp(Random()); CompiledAutomaton automaton = new CompiledAutomaton((new RegExp(re, RegExp.NONE)).ToAutomaton()); if (automaton.Type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { // TODO: test start term too TermsEnum leftIntersection = leftTerms.Intersect(automaton, null); TermsEnum rightIntersection = rightTerms.Intersect(automaton, null); AssertTermsEnum(leftIntersection, rightIntersection, Rarely()); } } } }
/// <summary>
/// Tests Intersect when the field contains the empty-string term: with no
/// start term the enumerator must surface "" first; when "" itself is passed
/// as the start term, enumeration begins at the next term ("abc").
/// </summary>
public virtual void TestIntersectEmptyString()
{
    Directory dir = NewDirectory();
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()));
    // Deterministic merges so the index collapses into a single segment below.
    iwc.SetMergePolicy(new LogDocMergePolicy());
    RandomIndexWriter w = new RandomIndexWriter(Random(), dir, iwc);
    Document doc = new Document();
    doc.Add(NewStringField("field", "", Field.Store.NO));
    doc.Add(NewStringField("field", "abc", Field.Store.NO));
    w.AddDocument(doc);
    doc = new Document();
    // add empty string to both documents, so that singletonDocID == -1.
    // For a FST-based term dict, we'll expect to see the first arc is
    // flaged with HAS_FINAL_OUTPUT
    doc.Add(NewStringField("field", "abc", Field.Store.NO));
    doc.Add(NewStringField("field", "", Field.Store.NO));
    w.AddDocument(doc);
    w.ForceMerge(1);
    DirectoryReader r = w.Reader;
    w.Dispose();
    AtomicReader sub = GetOnlySegmentReader(r);
    Terms terms = sub.Fields.Terms("field");

    Automaton automaton = (new RegExp(".*", RegExp.NONE)).ToAutomaton(); // accept ALL
    CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);

    // No start term: enumeration begins at "", which both docs contain.
    TermsEnum te = terms.Intersect(ca, null);
    DocsEnum de;

    Assert.AreEqual("", te.Next().Utf8ToString());
    de = te.Docs(null, null, DocsEnum.FLAG_NONE);
    Assert.AreEqual(0, de.NextDoc());
    Assert.AreEqual(1, de.NextDoc());
    Assert.AreEqual("abc", te.Next().Utf8ToString());
    de = te.Docs(null, null, DocsEnum.FLAG_NONE);
    Assert.AreEqual(0, de.NextDoc());
    Assert.AreEqual(1, de.NextDoc());
    Assert.IsNull(te.Next());

    // pass empty string: the start term itself is skipped, so "abc" comes first
    te = terms.Intersect(ca, new BytesRef(""));
    Assert.AreEqual("abc", te.Next().Utf8ToString());
    de = te.Docs(null, null, DocsEnum.FLAG_NONE);
    Assert.AreEqual(0, de.NextDoc());
    Assert.AreEqual(1, de.NextDoc());
    Assert.IsNull(te.Next());

    r.Dispose();
    dir.Dispose();
}
/// <summary>
/// Intersects via the wrapped input <c>Terms</c>, then re-wraps the resulting
/// enumerator so postings it returns go through <c>docMap</c>.
/// </summary>
public override TermsEnum Intersect(CompiledAutomaton compiled, BytesRef startTerm)
{
    TermsEnum inner = m_input.Intersect(compiled, startTerm);
    return new SortingTermsEnum(inner, docMap, indexOptions);
}
/// <summary>
/// Returns an enumerator over this field's terms that are accepted by
/// <paramref name="compiled"/>, optionally positioned just past
/// <paramref name="startTerm"/>.
/// </summary>
public override TermsEnum Intersect(CompiledAutomaton compiled, BytesRef startTerm)
{
    return new DirectIntersectTermsEnum(this, compiled, startTerm);
}
/// <summary>
/// Forwards the intersect operation unchanged to the wrapped
/// <see cref="Terms"/> instance.
/// </summary>
public override TermsEnum Intersect(CompiledAutomaton compiled, BytesRef startTerm)
{
    return _delegateTerms.Intersect(compiled, startTerm);
}
// TODO: in some cases we can filter by length? eg
// regexp foo*bar must be at least length 6 bytes
/// <summary>
/// Initializes the block-tree intersect enumerator: clones the terms input,
/// allocates the frame stack, loads the root block frame from the FST index,
/// and (if given) seeks just past <paramref name="startTerm"/>.
/// </summary>
public IntersectEnum(BlockTreeTermsReader.FieldReader outerInstance, CompiledAutomaton compiled, BytesRef startTerm)
{
    this.OuterInstance = outerInstance;
    // if (DEBUG) {
    // System.out.println("\nintEnum.init seg=" + segment + " commonSuffix=" + brToString(compiled.commonSuffixRef));
    // }
    runAutomaton = compiled.RunAutomaton;
    CompiledAutomaton = compiled;
    // NOTE(review): the token below looks mangled by an e-mail obfuscator
    // ("[email protected]"); upstream this clones the field reader's terms
    // IndexInput (presumably outerInstance.@in.Clone()) — verify against the
    // original source before relying on it.
    @in = (IndexInput)[email protected]();
    Stack = new Frame[5];
    for (int idx = 0; idx < Stack.Length; idx++)
    {
        Stack[idx] = new Frame(this, idx);
    }
    // Pre-fill the arc buffer with reusable FST arcs.
    // NOTE(review): Arcs appears to be a field initialized elsewhere —
    // confirm it is non-null/sized before this runs.
    for (int arcIdx = 0; arcIdx < Arcs.Length; arcIdx++)
    {
        Arcs[arcIdx] = new FST<BytesRef>.Arc<BytesRef>();
    }
    if (outerInstance.Index == null)
    {
        FstReader = null;
    }
    else
    {
        FstReader = outerInstance.Index.BytesReader;
    }
    // TODO: if the automaton is "smallish" we really
    // should use the terms index to seek at least to
    // the initial term and likely to subsequent terms
    // (or, maybe just fallback to ATE for such cases).
    // Else the seek cost of loading the frames will be
    // too costly.
    // NOTE(review): Index was null-checked just above, yet it is
    // dereferenced unconditionally here; a null Index would throw.
    // Confirm callers guarantee a non-null Index on this path.
    FST<BytesRef>.Arc<BytesRef> arc = outerInstance.Index.GetFirstArc(Arcs[0]);
    // Empty string prefix must have an output in the index!
    Debug.Assert(arc.Final);
    // Special pushFrame since it's the first one:
    Frame f = Stack[0];
    f.Fp = f.FpOrig = outerInstance.RootBlockFP;
    f.Prefix = 0;
    f.State = runAutomaton.InitialState;
    f.Arc = arc;
    f.OutputPrefix = arc.Output;
    f.Load(outerInstance.RootCode);

    // for assert:
    Debug.Assert(SetSavedStartTerm(startTerm));

    CurrentFrame = f;
    if (startTerm != null)
    {
        SeekToStartTerm(startTerm);
    }
}
/// <summary>
/// Initializes the intersect enumerator over the field's FST term dictionary,
/// restricted to terms accepted by <paramref name="compiled"/>.
/// </summary>
/// <param name="outerInstance"> Reader whose FST dictionary is enumerated. </param>
/// <param name="compiled"> Compiled automaton; accepted terms are matches. </param>
/// <param name="startTerm"> If non-null, seek to the ceiling of this term
/// before enumerating; the start term itself is excluded from the results. </param>
internal IntersectTermsEnum(FSTTermsReader.TermsReader outerInstance, CompiledAutomaton compiled, BytesRef startTerm)
    : base(outerInstance)
{
    this.outerInstance = outerInstance;
    //if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
    this.fst = outerInstance.dict;
    this.fstReader = fst.GetBytesReader();
    this.fstOutputs = outerInstance.dict.Outputs;
    this.fsa = compiled.RunAutomaton;
    this.level = -1;
    this.stack = new Frame[16];
    for (int i = 0; i < stack.Length; i++)
    {
        this.stack[i] = new Frame(this);
    }

    Frame frame;
    // Only the side effect of loading the virtual (sentinel) frame matters;
    // its return value was immediately overwritten below.
    /*frame = */ LoadVirtualFrame(NewFrame()); // LUCENENET: IDE0059: Remove unnecessary value assignment
    this.level++;
    frame = LoadFirstFrame(NewFrame());
    PushFrame(frame);

    this.meta = null;
    this.metaUpto = 1;
    this.decoded = false;
    this.pending = false;

    if (startTerm == null)
    {
        pending = IsAccept(TopFrame());
    }
    else
    {
        DoSeekCeil(startTerm);
        // Pending only when we landed on a valid accepted term that is not
        // the (exclusive) start term itself.
        pending = !startTerm.Equals(term) && IsValid(TopFrame()) && IsAccept(TopFrame());
    }
}
/// <summary>
/// Runs the bytes of <paramref name="b"/> through the automaton's byte runner
/// and reports whether the final state is accepting. Asserts that no state
/// visited before a step is the dead state (-1).
/// </summary>
private bool Accepts(CompiledAutomaton c, BytesRef b)
{
    var runner = c.RunAutomaton;
    int state = runner.InitialState;
    int end = b.Offset + b.Length;
    for (int pos = b.Offset; pos < end; pos++)
    {
        Assert.IsTrue(state != -1);
        state = runner.Step(state, b.Bytes[pos] & 0xff);
    }
    return runner.IsAccept(state);
}