/// <summary>
/// Checks a set of algebraic identities against the given automaton:
/// double complement, self-union, self-intersection and self-difference,
/// plus an optional/empty-string round trip when the empty string is rejected.
/// </summary>
/// <param name="a">The automaton whose language is checked.</param>
private static void AssertAutomaton(Automaton a)
{
    Automaton copy = (Automaton)a.Clone();

    // complement(complement(a)) = a
    Automaton doubleComplement = BasicOperations.Complement(BasicOperations.Complement(a));
    Assert.IsTrue(BasicOperations.SameLanguage(a, doubleComplement));

    // a union a = a
    Automaton selfUnion = BasicOperations.Union(a, copy);
    Assert.IsTrue(BasicOperations.SameLanguage(a, selfUnion));

    // a intersect a = a
    Automaton selfIntersection = BasicOperations.Intersection(a, copy);
    Assert.IsTrue(BasicOperations.SameLanguage(a, selfIntersection));

    // a minus a = empty
    Automaton selfDifference = BasicOperations.Minus(a, copy);
    Assert.IsTrue(BasicOperations.IsEmpty(selfDifference));

    // The last identity only holds when a does not accept the empty string:
    // optional(a) - emptyString = a
    if (BasicOperations.Run(a, ""))
    {
        return;
    }

    Automaton optional = BasicOperations.Optional(a);
    Automaton withoutEmpty = BasicOperations.Minus(optional, BasicAutomata.MakeEmptyString());
    Assert.IsTrue(BasicOperations.SameLanguage(a, withoutEmpty));
}
/// <summary>
/// Creates a new automaton, flagged as deterministic, whose singleton string
/// is the text form of the code point <paramref name="c"/>.
/// </summary>
/// <param name="c">The code point to accept.</param>
/// <returns>A fresh singleton automaton.</returns>
public static Automaton MakeChar(int c)
{
    Automaton result = new Automaton
    {
        singleton = new string(Character.ToChars(c)),
        deterministic = true
    };
    return result;
}
/// <summary>
/// Indexes a batch of random unicode strings as keyword terms, records them in
/// <c>Terms</c>, builds their union automaton and opens a searcher over the index.
/// </summary>
public override void SetUp()
{
    base.SetUp();

    // The generated regexps are awful, which is good for testing.
    // With the preflex codec the test can be very slow, so use fewer iterations.
    if (Codec.Default.Name.Equals("Lucene3x"))
    {
        NumIterations = 10 * RANDOM_MULTIPLIER;
    }
    else
    {
        NumIterations = AtLeast(50);
    }

    Dir = NewDirectory();
    RandomIndexWriter indexWriter = new RandomIndexWriter(
        Random(),
        Dir,
        (IndexWriterConfig)NewIndexWriterConfig(
            TEST_VERSION_CURRENT,
            new MockAnalyzer(Random(), MockTokenizer.KEYWORD, false))
        .SetMaxBufferedDocs(TestUtil.NextInt(Random(), 50, 1000)));

    // A single document/field pair is reused; only the field value changes per add.
    Document document = new Document();
    Field termField = NewStringField("field", "", Field.Store.YES);
    document.Add(termField);

    Terms = new SortedSet<BytesRef>();
    for (int remaining = AtLeast(200); remaining > 0; remaining--)
    {
        string value = TestUtil.RandomUnicodeString(Random());
        termField.StringValue = value;
        Terms.Add(new BytesRef(value));
        indexWriter.AddDocument(document);
    }

    TermsAutomaton = BasicAutomata.MakeStringUnion(Terms);
    Reader = indexWriter.Reader;
    Searcher = NewSearcher(Reader);
    indexWriter.Dispose();
}
/// <summary>
/// Returns <c>true</c> if the language of the given automaton is finite.
/// A singleton automaton is finite by definition; otherwise the answer is
/// delegated to the state-level overload starting at the initial state.
/// </summary>
/// <param name="a">The automaton to inspect.</param>
public static bool IsFinite(Automaton a)
{
    if (!a.IsSingleton)
    {
        // Two scratch bit sets, one bit per state, handed to the recursive overload.
        State start = a.Initial;
        BitArray firstBits = new BitArray(a.NumberOfStates);
        BitArray secondBits = new BitArray(a.NumberOfStates);
        return IsFinite(start, firstBits, secondBits);
    }

    return true;
}
/// <summary>
/// Creates a new automaton, flagged as deterministic, that accepts every string:
/// a single accepting state with a self-loop covering the whole code point range.
/// </summary>
/// <returns>A fresh automaton accepting all strings.</returns>
public static Automaton MakeAnyString()
{
    Automaton result = new Automaton();

    // One accepting state that loops to itself on any code point.
    State loop = new State();
    loop.accept = true;
    loop.AddTransition(new Transition(Character.MIN_CODE_POINT, Character.MAX_CODE_POINT, loop));

    result.Initial = loop;
    result.deterministic = true;
    return result;
}
/// <summary>
/// Simple, original brics implementation of determinize().
/// Does nothing when the automaton is already deterministic or is a singleton;
/// otherwise runs the set-based overload seeded with the initial state.
/// </summary>
/// <param name="a">The automaton to determinize in place.</param>
public static void DeterminizeSimple(Automaton a)
{
    bool alreadyDone = a.Deterministic || a.Singleton != null;
    if (!alreadyDone)
    {
        HashSet<State> startSet = new HashSet<State> { a.InitialState };
        DeterminizeSimple(a, startSet);
    }
}
/// <summary>
/// Minimizes (and determinizes if not already deterministic) the given
/// automaton. Singleton automata are left untouched; every other automaton
/// is passed to <c>MinimizeHopcroft</c>.
/// </summary>
/// <param name="a">The automaton to minimize in place.</param>
public static void Minimize(Automaton a)
{
    if (a.IsSingleton)
    {
        return;
    }

    MinimizeHopcroft(a);
}
/// <summary>
/// Builds <c>jvmLetter</c>: a run automaton that matches repetitions of any
/// code point the runtime's <c>Character.IsLetter</c> classifies as a letter.
/// </summary>
public override void SetUp()
{
    base.SetUp();

    const int lastCodePoint = 0x10FFFF;

    // Two-state automaton: one transition per letter code point, start -> accepting.
    State start = new State();
    State letterSeen = new State { Accept = true };
    for (int codePoint = 0; codePoint <= lastCodePoint; codePoint++)
    {
        if (!Character.IsLetter(codePoint))
        {
            continue;
        }

        start.AddTransition(new Transition(codePoint, codePoint, letterSeen));
    }

    Automaton oneLetter = new Automaton(start);
    oneLetter.Reduce();
    Automaton manyLetters = BasicOperations.Repeat(oneLetter);
    jvmLetter = new CharacterRunAutomaton(manyLetters);
}
/// <summary>
/// Brute-force check of a DFA against <c>GetTDistance</c>: enumerates the binary
/// string form of every integer below 2^(input.Length + distance + 1) and asserts
/// that the DFA accepts a candidate exactly when its distance to
/// <paramref name="input"/> is at most <paramref name="distance"/>.
/// </summary>
private void AssertBruteForceT(string input, Automaton dfa, int distance)
{
    CharacterRunAutomaton runner = new CharacterRunAutomaton(dfa);
    int maxLen = input.Length + distance + 1;
    int candidateCount = (int)Math.Pow(2, maxLen);

    for (int value = 0; value < candidateCount; value++)
    {
        string candidate = Convert.ToString(value, 2);
        bool accepted = runner.Run(candidate);
        int actual = GetTDistance(input, candidate);

        // Acceptance and "within distance" must agree in both directions.
        Assert.IsTrue(accepted ? actual <= distance : actual > distance);
    }
}
/// <summary>
/// Repeatedly builds a one-transition automaton over a random code point range
/// and hands its <c>ByteRunAutomaton</c> to <c>TestOne</c>. Ranges whose two
/// endpoints are both surrogates are skipped and do not count as an iteration.
/// </summary>
public void TestRandomRanges()
{
    Random r = Random();
    int iterations = AtLeast(10);
    int iterationsPerDfa = AtLeast(100);

    int completed = 0;
    while (completed < iterations)
    {
        int first = GetCodeStart(r);
        int second = GetCodeStart(r);
        int startCode = Math.Min(first, second);
        int endCode = Math.Max(first, second);

        if (IsSurrogate(startCode) && IsSurrogate(endCode))
        {
            // Retry with a fresh range without consuming an iteration.
            continue;
        }

        var a = new Automaton();
        var end = new State { Accept = true };
        a.InitialState.AddTransition(new Transition(startCode, endCode, end));
        a.Deterministic = true;

        TestOne(r, new ByteRunAutomaton(a), startCode, endCode, iterationsPerDfa);
        completed++;
    }
}
/// <summary>
/// Stores the automaton and, unless it has a non-empty singleton string,
/// precomputes <c>LeadsToAccept</c> by walking backwards from the accept states.
/// </summary>
/// <param name="a">The automaton to sample from.</param>
public RandomAcceptedStrings(Automaton a)
{
    this.a = a;
    // NOTE(review): IsNullOrEmpty is also true for "", so an empty-string singleton
    // is not caught here and falls through to the state walk below - confirm intended.
    if (!String.IsNullOrEmpty(a.Singleton))
    {
        LeadsToAccept = null;
        return;
    }

    // must use IdentityHashmap because two Transitions w/
    // different start nodes can be considered the same
    LeadsToAccept = new IdentityHashMap<Transition, bool?>();
    IDictionary<State, IList<ArrivingTransition>> arrivingByState = new Dictionary<State, IList<ArrivingTransition>>();
    Queue<State> pending = new Queue<State>();
    HashSet<State> seen = new HashSet<State>();

    // Reverse-map the transitions so that all transitions arriving at a state
    // can be looked up quickly; accept states seed the search.
    foreach (State state in a.NumberedStates)
    {
        for (int i = 0; i < state.numTransitions; i++)
        {
            Transition transition = state.TransitionsArray[i];
            IList<ArrivingTransition> bucket;
            if (!arrivingByState.TryGetValue(transition.Dest, out bucket))
            {
                bucket = new List<ArrivingTransition>();
                arrivingByState[transition.Dest] = bucket;
            }
            bucket.Add(new ArrivingTransition(state, transition));
        }

        if (state.Accept)
        {
            pending.Enqueue(state);
            seen.Add(state);
        }
    }

    // Breadth-first search from the accept states, backwards.
    while (pending.Count > 0)
    {
        State current = pending.Dequeue();
        IList<ArrivingTransition> arriving;
        if (!arrivingByState.TryGetValue(current, out arriving))
        {
            continue;
        }

        foreach (ArrivingTransition at in arriving)
        {
            State from = at.From;
            if (seen.Contains(from))
            {
                continue;
            }

            pending.Enqueue(from);
            seen.Add(from);
            LeadsToAccept[at.t] = true;
        }
    }
}
/// <summary>
/// Builds the Lev and LevT automata of <paramref name="s"/> for every distance
/// from 0 to <paramref name="maxDistance"/> and checks structural properties
/// (non-null, deterministic, finite, no detached states), subset relations
/// between consecutive distances, and the accepted language for each distance.
/// </summary>
private void AssertLev(string s, int maxDistance)
{
    LevenshteinAutomata levBuilder = new LevenshteinAutomata(s, false);
    LevenshteinAutomata levTBuilder = new LevenshteinAutomata(s, true);
    int count = maxDistance + 1;
    Automaton[] automata = new Automaton[count];
    Automaton[] tautomata = new Automaton[count];

    for (int n = 0; n < count; n++)
    {
        Automaton lev = levBuilder.ToAutomaton(n);
        Automaton levT = levTBuilder.ToAutomaton(n);
        automata[n] = lev;
        tautomata[n] = levT;

        Assert.IsNotNull(lev);
        Assert.IsNotNull(levT);
        Assert.IsTrue(lev.Deterministic);
        Assert.IsTrue(levT.Deterministic);
        Assert.IsTrue(SpecialOperations.IsFinite(lev));
        Assert.IsTrue(SpecialOperations.IsFinite(levT));
        AutomatonTestUtil.AssertNoDetachedStates(lev);
        AutomatonTestUtil.AssertNoDetachedStates(levT);

        // The automata for n-1 must accept a subset of the automata for n.
        if (n > 0)
        {
            Automaton previousLev = automata[n - 1];
            Automaton previousLevT = tautomata[n - 1];
            Assert.IsTrue(previousLev.SubsetOf(lev));
            Assert.IsTrue(previousLev.SubsetOf(levT));
            Assert.IsTrue(previousLevT.SubsetOf(lev));
            Assert.IsTrue(previousLevT.SubsetOf(levT));
            Assert.AreNotSame(previousLev, lev);
        }

        // Lev(N) must be a subset of LevT(N).
        Assert.IsTrue(lev.SubsetOf(levT));

        // Distance-specific language checks.
        if (n == 0)
        {
            // easy, matches the string itself
            Assert.IsTrue(BasicOperations.SameLanguage(BasicAutomata.MakeString(s), lev));
            Assert.IsTrue(BasicOperations.SameLanguage(BasicAutomata.MakeString(s), levT));
        }
        else if (n == 1)
        {
            // generate a lev1 naively, and check the accepted lang is the same.
            Assert.IsTrue(BasicOperations.SameLanguage(NaiveLev1(s), lev));
            Assert.IsTrue(BasicOperations.SameLanguage(NaiveLev1T(s), levT));
        }
        else
        {
            AssertBruteForce(s, lev, n);
            AssertBruteForceT(s, levT, n);
        }
    }
}
/// <summary>
/// Returns <c>true</c> if the language of <paramref name="a1"/> is a subset of the
/// language of <paramref name="a2"/>. As a side effect, <paramref name="a2"/> is
/// determinized (via <c>a2.Determinize()</c>) on the non-singleton path.
/// <para/>
/// Complexity: quadratic in number of states.
/// </summary>
public static bool SubsetOf(Automaton a1, Automaton a2)
{
    if (a1 == a2)
    {
        return true;
    }

    if (a1.IsSingleton)
    {
        return a2.IsSingleton
            ? a1.singleton.Equals(a2.singleton, StringComparison.Ordinal)
            : BasicOperations.Run(a2, a1.singleton);
    }

    a2.Determinize();
    Transition[][] sorted1 = a1.GetSortedTransitions();
    Transition[][] sorted2 = a2.GetSortedTransitions();
    Queue<StatePair> pending = new Queue<StatePair>(); // LUCENENET specific - Queue is much more performant than LinkedList
    JCG.HashSet<StatePair> seen = new JCG.HashSet<StatePair>();

    StatePair start = new StatePair(a1.initial, a2.initial);
    pending.Enqueue(start);
    seen.Add(start);

    while (pending.Count > 0)
    {
        StatePair pair = pending.Dequeue();
        if (pair.s1.accept && !pair.s2.accept)
        {
            return false;
        }

        Transition[] out1 = sorted1[pair.s1.number];
        Transition[] out2 = sorted2[pair.s2.number];
        int first2 = 0;
        for (int i1 = 0; i1 < out1.Length; i1++)
        {
            Transition edge1 = out1[i1];

            // Skip a2 transitions that end before this a1 transition begins.
            while (first2 < out2.Length && out2[first2].max < edge1.min)
            {
                first2++;
            }

            // [uncoveredFrom, uncoveredTo] is the part of edge1 not yet matched by a2.
            int uncoveredFrom = edge1.min;
            int uncoveredTo = edge1.max;
            for (int i2 = first2; i2 < out2.Length && edge1.max >= out2[i2].min; i2++)
            {
                Transition edge2 = out2[i2];
                if (edge2.min > uncoveredFrom)
                {
                    return false;
                }

                if (edge2.max < Character.MaxCodePoint)
                {
                    uncoveredFrom = edge2.max + 1;
                }
                else
                {
                    // Make the remaining range empty without overflowing past the max code point.
                    uncoveredFrom = Character.MaxCodePoint;
                    uncoveredTo = Character.MinCodePoint;
                }

                StatePair next = new StatePair(edge1.to, edge2.to);
                if (seen.Add(next))
                {
                    pending.Enqueue(next);
                }
            }

            if (uncoveredFrom <= uncoveredTo)
            {
                return false;
            }
        }
    }

    return true;
}
/// <summary>
/// Adds epsilon transitions to the given automaton. This method adds extra
/// character interval transitions that are equivalent to the given set of
/// epsilon transitions.
/// <para/>
/// Note that <paramref name="pairs"/> is extended in place with the pairs
/// derived while computing the closure.
/// </summary>
/// <param name="a"> Automaton. </param>
/// <param name="pairs"> Collection of <see cref="StatePair"/> objects representing pairs of
///          source/destination states where epsilon transitions should be
///          added. </param>
public static void AddEpsilons(Automaton a, ICollection<StatePair> pairs)
{
    a.ExpandSingleton();

    // Index the requested pairs by source (forward) and by destination (back).
    Dictionary<State, JCG.HashSet<State>> forward = new Dictionary<State, JCG.HashSet<State>>();
    Dictionary<State, JCG.HashSet<State>> back = new Dictionary<State, JCG.HashSet<State>>();
    foreach (StatePair p in pairs)
    {
        if (!forward.TryGetValue(p.s1, out JCG.HashSet<State> to))
        {
            to = new JCG.HashSet<State>();
            forward[p.s1] = to;
        }
        to.Add(p.s2);
        if (!back.TryGetValue(p.s2, out JCG.HashSet<State> from))
        {
            from = new JCG.HashSet<State>();
            back[p.s2] = from;
        }
        from.Add(p.s1);
    }

    // calculate epsilon closure
    LinkedList<StatePair> worklist = new LinkedList<StatePair>(pairs);
    JCG.HashSet<StatePair> workset = new JCG.HashSet<StatePair>(pairs);
    while (worklist.Count > 0)
    {
        // Pop the head by position; Remove(p) would search the list by value.
        StatePair p = worklist.First.Value;
        worklist.RemoveFirst();
        workset.Remove(p);

        if (forward.TryGetValue(p.s2, out JCG.HashSet<State> to))
        {
            foreach (State s in to)
            {
                StatePair pp = new StatePair(p.s1, s);
                if (!pairs.Contains(pp))
                {
                    pairs.Add(pp);
                    forward[p.s1].Add(s);
                    back[s].Add(p.s1);
                    worklist.AddLast(pp);
                    workset.Add(pp);

                    // Re-queue every pair that ends at p.s1 so it can pick up the new pair.
                    if (back.TryGetValue(p.s1, out JCG.HashSet<State> from))
                    {
                        foreach (State q in from)
                        {
                            StatePair qq = new StatePair(q, p.s1);
                            if (!workset.Contains(qq))
                            {
                                worklist.AddLast(qq);
                                workset.Add(qq);
                            }
                        }
                    }
                }
            }
        }
    }

    // add transitions
    foreach (StatePair p in pairs)
    {
        p.s1.AddEpsilon(p.s2);
    }

    a.deterministic = false;
    //a.clearHashCode();
    a.ClearNumberedStates();
    a.CheckMinimizeAlways();
}
/// <summary>
/// Determinizes the given automaton in place using subset construction.
/// Returns immediately when the automaton is already deterministic or is a singleton.
/// <para/>
/// Worst case complexity: exponential in number of states.
/// </summary>
/// <param name="a">The automaton to determinize.</param>
public static void Determinize(Automaton a)
{
    if (a.IsDeterministic || a.IsSingleton)
    {
        return;
    }

    State[] allStates = a.GetNumberedStates();

    // subset construction
    bool initAccept = a.initial.accept;
    int initNumber = a.initial.number;
    a.initial = new State();
    SortedInt32Set.FrozenInt32Set initialset = new SortedInt32Set.FrozenInt32Set(initNumber, a.initial);

    LinkedList<SortedInt32Set.FrozenInt32Set> worklist = new LinkedList<SortedInt32Set.FrozenInt32Set>();
    IDictionary<SortedInt32Set.FrozenInt32Set, State> newstate = new Dictionary<SortedInt32Set.FrozenInt32Set, State>();

    worklist.AddLast(initialset);

    a.initial.accept = initAccept;
    newstate[initialset] = a.initial;

    int newStateUpto = 0;
    State[] newStatesArray = new State[5];
    newStatesArray[newStateUpto] = a.initial;
    a.initial.number = newStateUpto;
    newStateUpto++;

    // like Set<Integer,PointTransitions>
    PointTransitionSet points = new PointTransitionSet();

    // like SortedMap<Integer,Integer>
    SortedInt32Set statesSet = new SortedInt32Set(5);

    while (worklist.Count > 0)
    {
        // Pop the head by position. LinkedList.Remove(T) finds the node through
        // Equals, so if FrozenInt32Set equality ever fails to match, the head
        // would stay queued and this loop would never terminate.
        SortedInt32Set.FrozenInt32Set s = worklist.First.Value;
        worklist.RemoveFirst();

        // Collate all outgoing transitions by min/1+max:
        for (int i = 0; i < s.values.Length; i++)
        {
            State s0 = allStates[s.values[i]];
            for (int j = 0; j < s0.numTransitions; j++)
            {
                points.Add(s0.TransitionsArray[j]);
            }
        }

        if (points.count == 0)
        {
            // No outgoing transitions -- skip it
            continue;
        }

        points.Sort();

        int lastPoint = -1;
        int accCount = 0;

        State r = s.state;
        for (int i = 0; i < points.count; i++)
        {
            int point = points.points[i].point;

            if (statesSet.upto > 0)
            {
                Debug.Assert(lastPoint != -1);

                statesSet.ComputeHash();

                State q;
                if (!newstate.TryGetValue(statesSet.ToFrozenInt32Set(), out q) || q == null)
                {
                    // First time this subset is seen: create and register its state.
                    q = new State();
                    SortedInt32Set.FrozenInt32Set p = statesSet.Freeze(q);
                    worklist.AddLast(p);
                    if (newStateUpto == newStatesArray.Length)
                    {
                        State[] newArray = new State[ArrayUtil.Oversize(1 + newStateUpto, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
                        Array.Copy(newStatesArray, 0, newArray, 0, newStateUpto);
                        newStatesArray = newArray;
                    }
                    newStatesArray[newStateUpto] = q;
                    q.number = newStateUpto;
                    newStateUpto++;
                    q.accept = accCount > 0;
                    newstate[p] = q;
                }
                else
                {
                    Debug.Assert((accCount > 0) == q.accept, "accCount=" + accCount + " vs existing accept=" + q.accept + " states=" + statesSet);
                }

                r.AddTransition(new Transition(lastPoint, point - 1, q));
            }

            // process transitions that end on this point
            // (closes an overlapping interval)
            Transition[] transitions = points.points[i].ends.transitions;
            int limit = points.points[i].ends.count;
            for (int j = 0; j < limit; j++)
            {
                Transition t = transitions[j];
                int num = t.to.number;
                statesSet.Decr(num);
                accCount -= t.to.accept ? 1 : 0;
            }
            points.points[i].ends.count = 0;

            // process transitions that start on this point
            // (opens a new interval)
            transitions = points.points[i].starts.transitions;
            limit = points.points[i].starts.count;
            for (int j = 0; j < limit; j++)
            {
                Transition t = transitions[j];
                int num = t.to.number;
                statesSet.Incr(num);
                accCount += t.to.accept ? 1 : 0;
            }
            lastPoint = point;
            points.points[i].starts.count = 0;
        }
        points.Reset();
        Debug.Assert(statesSet.upto == 0, "upto=" + statesSet.upto);
    }
    a.deterministic = true;
    a.SetNumberedStates(newStatesArray, newStateUpto);
}
/// <summary>
/// Returns <c>true</c> if the given string is accepted by the automaton.
/// <para/>
/// Complexity: linear in the length of the string.
/// <para/>
/// <b>Note:</b> for full performance, use the <see cref="RunAutomaton"/> class.
/// </summary>
/// <param name="a">The automaton to run; may be singleton, deterministic or non-deterministic.</param>
/// <param name="s">The input string, consumed one code point at a time.</param>
public static bool Run(Automaton a, string s)
{
    if (a.IsSingleton)
    {
        return s.Equals(a.singleton, StringComparison.Ordinal);
    }

    if (a.deterministic)
    {
        State p = a.initial;
        int cp; // LUCENENET: Removed unnecessary assignment
        for (int i = 0; i < s.Length; i += Character.CharCount(cp))
        {
            State q = p.Step(cp = Character.CodePointAt(s, i));
            if (q is null)
            {
                return false;
            }
            p = q;
        }
        return p.accept;
    }
    else
    {
        // Non-deterministic case: track the set of states reachable after each
        // code point, using a bit set to avoid adding a state to the list twice.
        State[] states = a.GetNumberedStates();
        LinkedList<State> pp = new LinkedList<State>();
        LinkedList<State> pp_other = new LinkedList<State>();
        OpenBitSet bb = new OpenBitSet(states.Length);
        OpenBitSet bb_other = new OpenBitSet(states.Length);
        pp.AddLast(a.initial);
        JCG.List<State> dest = new JCG.List<State>();
        bool accept = a.initial.accept;
        int c; // LUCENENET: Removed unnecessary assignment
        for (int i = 0; i < s.Length; i += Character.CharCount(c))
        {
            c = Character.CodePointAt(s, i);
            accept = false;
            pp_other.Clear();
            // The end index of Clear is exclusive, so pass Length (not Length - 1);
            // otherwise the highest bit survives from an earlier step.
            bb_other.Clear(0, bb_other.Length);
            foreach (State p in pp)
            {
                dest.Clear();
                p.Step(c, dest);
                foreach (State q in dest)
                {
                    if (q.accept)
                    {
                        accept = true;
                    }
                    if (!bb_other.Get(q.number))
                    {
                        bb_other.Set(q.number);
                        pp_other.AddLast(q);
                    }
                }
            }

            // Swap current and scratch collections for the next code point.
            LinkedList<State> tp = pp;
            pp = pp_other;
            pp_other = tp;
            OpenBitSet tb = bb;
            bb = bb_other;
            bb_other = tb;
        }
        return accept;
    }
}
/// <summary>
/// When <c>unicodeAware</c> is set, converts the automaton with
/// <c>UTF32ToUTF8</c> and determinizes the result; otherwise returns
/// <paramref name="a"/> unchanged.
/// </summary>
protected internal override Automaton ConvertAutomaton(Automaton a)
{
    if (!unicodeAware)
    {
        return a;
    }

    UTF32ToUTF8 converter = new UTF32ToUTF8();
    Automaton converted = converter.Convert(a);
    BasicOperations.Determinize(converted);
    return converted;
}
/// <summary>
/// Builds a fuzzy automaton from every finite path of <paramref name="automaton"/>.
/// Paths no longer than <c>nonFuzzyPrefix</c> or shorter than <c>minFuzzyLength</c>
/// are matched exactly; for longer paths the first <c>nonFuzzyPrefix</c> symbols are
/// matched exactly and the remainder through a Levenshtein automaton with
/// <c>maxEdits</c> edits. The per-path automata are then unioned and determinized.
/// </summary>
/// <param name="automaton">The automaton whose finite strings are enumerated.</param>
/// <returns>An empty automaton when there are no paths, the single per-path
/// automaton when there is one, otherwise the determinized union.</returns>
internal Automaton ToLevenshteinAutomata(Automaton automaton)
{
    var @ref = SpecialOperations.GetFiniteStrings(automaton, -1);
    Automaton[] subs = new Automaton[@ref.Count];
    int upto = 0;
    foreach (IntsRef path in @ref)
    {
        if (path.Length <= nonFuzzyPrefix || path.Length < minFuzzyLength)
        {
            subs[upto] = BasicAutomata.MakeString(path.Ints, path.Offset, path.Length);
            upto++;
        }
        else
        {
            Automaton prefix = BasicAutomata.MakeString(path.Ints, path.Offset, nonFuzzyPrefix);
            int[] ints = new int[path.Length - nonFuzzyPrefix];
            Array.Copy(path.Ints, path.Offset + nonFuzzyPrefix, ints, 0, ints.Length);
            // TODO: maybe add alphaMin to LevenshteinAutomata,
            // and pass 1 instead of 0?  We probably don't want
            // to allow the trailing dedup bytes to be
            // edited... but then 0 byte is "in general" allowed
            // on input (but not in UTF8).
            // The alphabet upper bound comes from Character, not the built-in char type.
            LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
            Automaton levAutomaton = lev.ToAutomaton(maxEdits);
            Automaton combined = BasicOperations.Concatenate(Arrays.AsList(prefix, levAutomaton));
            combined.Deterministic = true; // its like the special case in concatenate itself, except we cloneExpanded already
            subs[upto] = combined;
            upto++;
        }
    }

    if (subs.Length == 0)
    {
        // automaton is empty, there is no accepted paths through it
        return BasicAutomata.MakeEmpty(); // matches nothing
    }
    else if (subs.Length == 1)
    {
        // no synonyms or anything: just a single path through the tokenstream
        return subs[0];
    }
    else
    {
        // multiple paths: this is really scary! is it slow?
        // maybe we should not do this and throw UOE?
        Automaton a = BasicOperations.Union(Arrays.AsList(subs));
        // TODO: we could call toLevenshteinAutomata() before det?
        // this only happens if you have multiple paths anyway (e.g. synonyms)
        BasicOperations.Determinize(a);

        return a;
    }
}
/// <summary>
/// Compiles the automaton. When <paramref name="simplify"/> is set, the automaton
/// is first tested for the simple forms NONE, ALL, SINGLE and PREFIX, in which case
/// no run automaton is created. Otherwise the type is NORMAL and the automaton is
/// converted through <c>UTF32ToUTF8</c> into a <c>ByteRunAutomaton</c>.
/// </summary>
/// <param name="automaton">The automaton to compile.</param>
/// <param name="finite">Whether the language is finite, or <c>null</c> to compute it
/// with <c>SpecialOperations.IsFinite</c>.</param>
/// <param name="simplify">Whether to test for the simple forms first. Note that on a
/// large automaton these tests could be costly.</param>
public CompiledAutomaton(Automaton automaton, bool? finite, bool simplify)
{
    if (simplify)
    {
        if (BasicOperations.IsEmpty(automaton))
        {
            // matches nothing
            Type = AUTOMATON_TYPE.NONE;
            Term = null;
            CommonSuffixRef = null;
            RunAutomaton = null;
            sortedTransitions = null;
            this.Finite = null;
            return;
        }

        if (BasicOperations.IsTotal(automaton))
        {
            // matches all possible strings
            Type = AUTOMATON_TYPE.ALL;
            Term = null;
            CommonSuffixRef = null;
            RunAutomaton = null;
            sortedTransitions = null;
            this.Finite = null;
            return;
        }

        // Work out the common prefix and, if the language is one fixed string, that string.
        string commonPrefix = null;
        string singleton = automaton.Singleton;
        if (singleton == null)
        {
            commonPrefix = SpecialOperations.GetCommonPrefix(automaton);
            bool isFixedString = commonPrefix.Length > 0
                && BasicOperations.SameLanguage(automaton, BasicAutomata.MakeString(commonPrefix));
            if (isFixedString)
            {
                singleton = commonPrefix;
            }
        }

        if (singleton != null)
        {
            // matches a fixed string in singleton or expanded representation
            Type = AUTOMATON_TYPE.SINGLE;
            Term = new BytesRef(singleton);
            CommonSuffixRef = null;
            RunAutomaton = null;
            sortedTransitions = null;
            this.Finite = null;
            return;
        }

        Automaton prefixThenAnything = BasicOperations.Concatenate(
            BasicAutomata.MakeString(commonPrefix), BasicAutomata.MakeAnyString());
        if (BasicOperations.SameLanguage(automaton, prefixThenAnything))
        {
            // matches a constant prefix
            Type = AUTOMATON_TYPE.PREFIX;
            Term = new BytesRef(commonPrefix);
            CommonSuffixRef = null;
            RunAutomaton = null;
            sortedTransitions = null;
            this.Finite = null;
            return;
        }
    }

    Type = AUTOMATON_TYPE.NORMAL;
    Term = null;
    this.Finite = finite ?? SpecialOperations.IsFinite(automaton);

    Automaton utf8 = (new UTF32ToUTF8()).Convert(automaton);
    CommonSuffixRef = this.Finite == true
        ? null
        : SpecialOperations.GetCommonSuffixBytesRef(utf8);
    RunAutomaton = new ByteRunAutomaton(utf8, true);
    sortedTransitions = utf8.GetSortedTransitions();
}
/// <summary>
/// Runs a mix of random unicode strings and randomly generated accepted strings
/// through both a <c>CharacterRunAutomaton</c> and a <c>ByteRunAutomaton</c> built
/// from the same automaton, asserting that the two always agree.
/// </summary>
/// <param name="automaton">The automaton both run automata are built from.</param>
private static void AssertAutomaton(Automaton automaton)
{
    var cra = new CharacterRunAutomaton(automaton);
    var bra = new ByteRunAutomaton(automaton);
    var ras = new AutomatonTestUtil.RandomAcceptedStrings(automaton);

    int num = AtLeast(1000);
    for (int i = 0; i < num; i++)
    {
        string s;
        if (Random().NextBoolean())
        {
            // likely not accepted
            s = TestUtil.RandomUnicodeString(Random());
        }
        else
        {
            // will be accepted
            int[] codepoints = ras.GetRandomAcceptedString(Random());
            try
            {
                s = UnicodeUtil.NewString(codepoints, 0, codepoints.Length);
            }
            catch (Exception)
            {
                // Dump the offending code points, then rethrow.
                Console.WriteLine(codepoints.Length + " codepoints:");
                for (int j = 0; j < codepoints.Length; j++)
                {
                    Console.WriteLine("  " + codepoints[j].ToString("x"));
                }
                // Bare throw keeps the original stack trace ("throw e" would reset it).
                throw;
            }
        }
        var bytes = s.GetBytes(Encoding.UTF8);
        Assert.AreEqual(cra.Run(s), bra.Run(bytes, 0, bytes.Length));
    }
}
/// <summary>
/// Compiles the automaton with simplification enabled and without a precomputed
/// finiteness flag (forwards <c>null</c> and <c>true</c> to the three-argument constructor).
/// </summary>
/// <param name="automaton">The automaton to compile.</param>
public CompiledAutomaton(Automaton automaton)
    : this(automaton, finite: null, simplify: true)
{
}
/// <summary>
/// Simple, original brics implementation of Determinize().
/// Determinizes the given automaton in place by subset construction, starting
/// from the given set of initial states.
/// </summary>
/// <param name="a">The automaton to determinize.</param>
/// <param name="initialset">The set of states that forms the new initial state.</param>
public static void DeterminizeSimple(Automaton a, ISet<State> initialset)
{
    int[] startPoints = a.GetStartPoints();

    // subset construction
    IDictionary<ISet<State>, ISet<State>> knownSets = new Dictionary<ISet<State>, ISet<State>>();
    Queue<ISet<State>> pending = new Queue<ISet<State>>(); // LUCENENET specific - Queue is much more performant than LinkedList
    IDictionary<ISet<State>, State> stateForSet = new Dictionary<ISet<State>, State>();

    knownSets[initialset] = initialset;
    pending.Enqueue(initialset);
    a.initial = new State();
    stateForSet[initialset] = a.initial;

    while (pending.Count > 0)
    {
        ISet<State> current = pending.Dequeue();
        State source = stateForSet[current];

        // The combined state accepts if any member state accepts.
        foreach (State member in current)
        {
            if (member.accept)
            {
                source.accept = true;
                break;
            }
        }

        for (int n = 0; n < startPoints.Length; n++)
        {
            int point = startPoints[n];

            // Collect every state reachable from the current set on this point.
            ISet<State> targets = new JCG.HashSet<State>();
            foreach (State member in current)
            {
                foreach (Transition t in member.GetTransitions())
                {
                    if (t.min <= point && point <= t.max)
                    {
                        targets.Add(t.to);
                    }
                }
            }

            if (!knownSets.ContainsKey(targets))
            {
                knownSets[targets] = targets;
                pending.Enqueue(targets);
                stateForSet[targets] = new State();
            }

            // The interval runs up to just before the next start point, or to the max code point.
            State destination = stateForSet[targets];
            int max = n + 1 < startPoints.Length
                ? startPoints[n + 1] - 1
                : Character.MaxCodePoint;
            source.AddTransition(new Transition(point, max, destination));
        }
    }

    a.deterministic = true;
    a.ClearNumberedStates();
    a.RemoveDeadTransitions();
}
/// <summary>
/// Creates a run automaton for <paramref name="a"/>, forwarding
/// <c>Character.MAX_CODE_POINT</c> and <c>false</c> to the base constructor.
/// </summary>
/// <param name="a">The automaton to wrap.</param>
public CharacterRunAutomaton(Automaton a)
    : base(a, Character.MAX_CODE_POINT, false)
{
}
/// <summary>
/// Returns true if the language of this automaton is finite.
/// <para/>
/// WARNING: this method is slow, it will blow up if the automaton is large.
/// This is only used to test the correctness of our faster implementation.
/// </summary>
/// <param name="a">The automaton to inspect.</param>
public static bool IsFiniteSlow(Automaton a)
{
    // Any singleton, including the one for the empty string, accepts exactly
    // one string. Test for null rather than null-or-empty so the empty-string
    // singleton is answered here as well.
    if (a.Singleton != null)
    {
        return true;
    }

    return IsFiniteSlow(a.InitialState, new HashSet<State>());
}
/// <summary>
/// Returns the common suffix of the automaton's language as bytes. For a singleton
/// this is the singleton string itself; otherwise a clone is reversed and
/// determinized, its common prefix is taken, and those bytes are reversed.
/// </summary>
/// <param name="a">The automaton to inspect; only a clone of it is reversed.</param>
public static BytesRef GetCommonSuffixBytesRef(Automaton a)
{
    if (!a.IsSingleton)
    {
        // Reverse the language, then reverse the common prefix of the reversed language.
        Automaton reversed = (Automaton)a.Clone();
        Reverse(reversed);
        reversed.Determinize();

        BytesRef suffix = SpecialOperations.GetCommonPrefixBytesRef(reversed);
        ReverseBytes(suffix);
        return suffix;
    }

    // If singleton, the suffix is the string itself.
    return new BytesRef(a.singleton);
}
/// <summary>
/// Checks that an automaton has no detached states that are unreachable
/// from the initial state, by comparing the state count before and after
/// clearing the cached numbered states.
/// </summary>
/// <param name="a">The automaton to check; its cached numbered states are cleared.</param>
public static void AssertNoDetachedStates(Automaton a)
{
    int before = a.NumberOfStates;
    a.ClearNumberedStates(); // force recomputation of cached numbered states
    int after = a.NumberOfStates;

    string message = "automaton has " + (before - after) + " detached states";
    Assert.True(before == after, message);
}
/// <summary>
/// Builds the Levenshtein automaton for <paramref name="lookupAutomaton"/>, runs it
/// through <c>ConvertAutomaton</c>, and intersects the result with
/// <paramref name="fst"/> to obtain the matching prefix paths.
/// </summary>
/// <param name="prefixPaths">Not used by this implementation.</param>
/// <param name="lookupAutomaton">The automaton describing what the user typed.</param>
/// <param name="fst">The FST to intersect with.</param>
protected internal override IList<FSTUtil.Path<Pair<long?, BytesRef>>> GetFullPrefixPaths(IList<FSTUtil.Path<Pair<long?, BytesRef>>> prefixPaths, Automaton lookupAutomaton, FST<Pair<long?, BytesRef>> fst)
{
    // TODO: right now there's no penalty for fuzzy/edits,
    // ie a completion whose prefix matched exactly what the
    // user typed gets no boost over completions that
    // required an edit, which get no boost over completions
    // requiring two edits.  I suspect a multiplicative
    // factor is appropriate (eg, say a fuzzy match must be at
    // least 2X better weight than the non-fuzzy match to
    // "compete") ... in which case I think the wFST needs
    // to be log weights or something ...

    // C# is case-sensitive: the method is ConvertAutomaton, not convertAutomaton.
    Automaton levA = ConvertAutomaton(ToLevenshteinAutomata(lookupAutomaton));
    return FSTUtil.IntersectPrefixPaths(levA, fst);
}
// TODO: this currently requires a determinized machine,
// but it need not -- we can speed it up by walking the
// NFA instead.  it'd still be fail fast.
/// <summary>
/// Returns the longest byte sequence that prefixes every accepted string. For a
/// singleton this is the singleton string itself; otherwise the method follows
/// the chain of non-accepting states that have exactly one single-value
/// transition, stopping at the first state that breaks the chain or was already visited.
/// </summary>
/// <param name="a">The automaton to walk from its initial state.</param>
public static BytesRef GetCommonPrefixBytesRef(Automaton a)
{
    if (a.IsSingleton)
    {
        return new BytesRef(a.singleton);
    }

    BytesRef @ref = new BytesRef(10);
    HashSet<State> visited = new HashSet<State>();
    State s = a.Initial;
    bool done;
    do
    {
        done = true;
        visited.Add(s);
        if (!s.accept && s.NumTransitions() == 1)
        {
            var iter = s.Transitions.GetEnumerator();
            iter.MoveNext();
            Transition t = iter.Current;
            if (t.Min_Renamed == t.Max_Renamed && !visited.Contains(t.To))
            {
                // Extend the prefix by one byte: bump the length, ensure capacity,
                // then store the transition's single value in the new last slot.
                @ref.Grow(++@ref.Length);
                @ref.Bytes[@ref.Length - 1] = (byte)t.Min_Renamed;
                s = t.To;
                done = false;
            }
        }
    } while (!done);

    return @ref;
}
/// <summary>
/// Builds one string automaton per entry (decoded with <c>Utf8ToString</c>) and
/// returns the union of all of them.
/// </summary>
/// <param name="strings">The terms to union.</param>
private static Automaton NaiveUnion(IList<BytesRef> strings)
{
    int count = strings.Count;
    Automaton[] perString = new Automaton[count];
    for (int index = 0; index < count; index++)
    {
        string text = strings[index].Utf8ToString();
        perString[index] = BasicAutomata.MakeString(text);
    }

    return BasicOperations.Union(perString);
}
// TODO: this is a dangerous method ... Automaton could be
// huge ... and it's better in general for caller to
// enumerate & process in a single walk:
/// <summary>
/// Returns the set of accepted strings, assuming that at most
/// <c>limit</c> strings are accepted. If more than <c>limit</c>
/// strings are accepted, the first limit strings found are returned.
/// If <c>limit</c> &lt; 0, then the limit is infinite.
/// </summary>
/// <param name="a">The automaton whose strings are enumerated.</param>
/// <param name="limit">Maximum number of strings to return; negative for no limit.</param>
/// <returns>The collected strings (possibly truncated when the limit is hit).</returns>
public static ISet<IntsRef> GetFiniteStrings(Automaton a, int limit)
{
    HashSet<IntsRef> strings = new HashSet<IntsRef>();
    if (a.IsSingleton)
    {
        // A singleton contributes its one string unless the caller asked for zero
        // strings. A negative limit means "no limit", so it must not suppress it.
        if (limit != 0)
        {
            strings.Add(Util.ToUTF32(a.Singleton, new IntsRef()));
        }
    }
    else
    {
        // The recursive walk fills 'strings'; whatever was collected is returned
        // regardless of the walk's boolean result.
        GetFiniteStrings(a.Initial, new HashSet<State>(), strings, new IntsRef(), limit);
    }

    return strings;
}
/// <summary>
/// Compute a DFA that accepts all strings within an edit distance of <paramref name="n"/>.
/// <para>
/// All automata have the following properties:
/// <list type="bullet">
///     <item><description>They are deterministic (DFA).</description></item>
///     <item><description>There are no transitions to dead states.</description></item>
///     <item><description>They are not minimal (some transitions could be combined).</description></item>
/// </list>
/// </para>
/// </summary>
/// <param name="n">The edit distance.</param>
/// <returns>The automaton, or <c>null</c> when no parametric description exists for
/// <paramref name="n"/>.</returns>
public virtual Automaton ToAutomaton(int n)
{
    if (n == 0)
    {
        return BasicAutomata.MakeString(word, 0, word.Length);
    }

    if (n >= descriptions.Length)
    {
        return null;
    }

    int range = 2 * n + 1;
    ParametricDescription description = descriptions[n];

    // Create all states (their count comes from the description) and mark
    // the accepting ones.
    State[] states = new State[description.Count];
    for (int i = 0; i < states.Length; i++)
    {
        states[i] = new State { number = i, Accept = description.IsAccept(i) };
    }

    // Create transitions from state to state.
    for (int k = 0; k < states.Length; k++)
    {
        State source = states[k];
        int xpos = description.GetPosition(k);
        if (xpos < 0)
        {
            continue;
        }

        int end = xpos + Math.Min(word.Length - xpos, range);

        foreach (int ch in alphabet)
        {
            // get the characteristic vector at this position wrt ch
            int cvec = GetVector(ch, xpos, end);
            int dest = description.Transition(k, xpos, cvec);
            if (dest >= 0)
            {
                source.AddTransition(new Transition(ch, states[dest]));
            }
        }

        // add transitions for all other chars in unicode
        // by definition, their characteristic vectors are always 0,
        // because they do not exist in the input string.
        int otherDest = description.Transition(k, xpos, 0); // by definition
        if (otherDest < 0)
        {
            continue;
        }

        for (int r = 0; r < numRanges; r++)
        {
            source.AddTransition(new Transition(rangeLower[r], rangeUpper[r], states[otherDest]));
        }
    }

    Automaton a = new Automaton(states[0]);
    a.IsDeterministic = true;
    // we create some useless unconnected states, and its a net-win overall to remove these,
    // as well as to combine any adjacent transitions (it makes later algorithms more efficient).
    // so, while we could set our numberedStates here, its actually best not to, and instead to
    // force a traversal in reduce, pruning the unconnected states while we combine adjacent transitions.
    a.Reduce();
    // we need not trim transitions to dead states, as they are not created.
    return a;
}
/// <summary>
/// Returns true if the given string is accepted by the automaton.
/// <para/>
/// Complexity: linear in the length of the string.
/// <para/>
/// <b>Note:</b> for full performance, use the <see cref="RunAutomaton"/> class.
/// </summary>
/// <param name="a">The automaton to run; may be singleton, deterministic or non-deterministic.</param>
/// <param name="s">The input string, consumed one code point at a time.</param>
public static bool Run(Automaton a, string s)
{
    if (a.IsSingleton)
    {
        return s.Equals(a.singleton);
    }

    if (a.deterministic)
    {
        // Deterministic case: follow exactly one state per code point.
        State current = a.Initial;
        int pos = 0;
        while (pos < s.Length)
        {
            int cp = Character.CodePointAt(s, pos);
            State next = current.Step(cp);
            if (next == null)
            {
                return false;
            }
            current = next;
            pos += Character.CharCount(cp);
        }
        return current.accept;
    }

    // Non-deterministic case: track the set of states reachable after each code
    // point, with a bit set guarding against adding a state to the list twice.
    State[] states = a.NumberedStates;
    LinkedList<State> frontier = new LinkedList<State>();
    LinkedList<State> nextFrontier = new LinkedList<State>();
    BitArray marks = new BitArray(states.Length);
    BitArray nextMarks = new BitArray(states.Length);
    frontier.AddLast(a.Initial);
    List<State> reached = new List<State>();
    bool accept = a.Initial.accept;

    int index = 0;
    while (index < s.Length)
    {
        int c = Character.CodePointAt(s, index);
        accept = false;
        nextFrontier.Clear();
        nextMarks.SetAll(false);

        foreach (State from in frontier)
        {
            reached.Clear();
            from.Step(c, reached);
            foreach (State to in reached)
            {
                if (to.accept)
                {
                    accept = true;
                }
                if (!nextMarks.SafeGet(to.number))
                {
                    nextMarks.SafeSet(to.number, true);
                    nextFrontier.AddLast(to);
                }
            }
        }

        // Swap current and scratch collections for the next code point.
        LinkedList<State> swapList = frontier;
        frontier = nextFrontier;
        nextFrontier = swapList;
        BitArray swapBits = marks;
        marks = nextMarks;
        nextMarks = swapBits;

        index += Character.CharCount(c);
    }

    return accept;
}
/// <summary>
/// Instance-level convenience for concatenation: forwards this automaton and
/// <paramref name="a"/> to <c>BasicOperations.Concatenate(this, a)</c>.
/// </summary>
/// <param name="a">The automaton passed as the second operand.</param>
/// <returns>Whatever <c>BasicOperations.Concatenate</c> returns.</returns>
public virtual Automaton Concatenate(Automaton a)
{
    Automaton concatenated = BasicOperations.Concatenate(this, a);
    return concatenated;
}
/// <summary>
/// Builds an automaton whose language is the intersection of the languages of
/// <paramref name="a1"/> and <paramref name="a2"/>. Never modifies the input automata languages.
/// <para/>
/// Complexity: quadratic in number of states.
/// </summary>
public static Automaton Intersection(Automaton a1, Automaton a2)
{
    // A singleton operand reduces the problem to one membership test on the other operand.
    if (a1.IsSingleton)
    {
        return BasicOperations.Run(a2, a1.singleton) ? a1.CloneIfRequired() : BasicAutomata.MakeEmpty();
    }
    if (a2.IsSingleton)
    {
        return BasicOperations.Run(a1, a2.singleton) ? a2.CloneIfRequired() : BasicAutomata.MakeEmpty();
    }
    if (a1 == a2)
    {
        return a1.CloneIfRequired();
    }

    Transition[][] sorted1 = a1.GetSortedTransitions();
    Transition[][] sorted2 = a2.GetSortedTransitions();
    Automaton product = new Automaton();
    Queue<StatePair> pending = new Queue<StatePair>(); // LUCENENET specific - Queue is much more performant than LinkedList
    Dictionary<StatePair, StatePair> known = new Dictionary<StatePair, StatePair>();

    StatePair current = new StatePair(product.initial, a1.initial, a2.initial);
    pending.Enqueue(current);
    known[current] = current;

    while (pending.Count > 0)
    {
        current = pending.Dequeue();
        current.s.accept = current.s1.accept && current.s2.accept;

        Transition[] left = sorted1[current.s1.number];
        Transition[] right = sorted2[current.s2.number];

        // firstOverlap only moves forward because both transition arrays are sorted.
        int firstOverlap = 0;
        for (int i = 0; i < left.Length; i++)
        {
            Transition lt = left[i];
            while (firstOverlap < right.Length && right[firstOverlap].max < lt.min)
            {
                firstOverlap++;
            }

            for (int j = firstOverlap; j < right.Length && lt.max >= right[j].min; j++)
            {
                Transition rt = right[j];
                if (rt.max < lt.min)
                {
                    continue;
                }

                StatePair target = new StatePair(lt.to, rt.to);
                if (!known.TryGetValue(target, out StatePair existing) || existing is null)
                {
                    target.s = new State();
                    pending.Enqueue(target);
                    known[target] = target;
                    existing = target;
                }

                // The overlapping interval is [larger of the mins, smaller of the maxes].
                int lo = lt.min;
                if (rt.min > lo)
                {
                    lo = rt.min;
                }
                int hi = lt.max;
                if (rt.max < hi)
                {
                    hi = rt.max;
                }
                current.s.AddTransition(new Transition(lo, hi, existing.s));
            }
        }
    }

    product.deterministic = a1.deterministic && a2.deterministic;
    product.RemoveDeadTransitions();
    product.CheckMinimizeAlways();
    return product;
}
/// <summary>
/// Instance-level convenience for language difference: forwards this automaton and
/// <paramref name="a"/> to <c>BasicOperations.Minus(this, a)</c>.
/// </summary>
/// <param name="a">The automaton passed as the second operand.</param>
/// <returns>Whatever <c>BasicOperations.Minus</c> returns.</returns>
public virtual Automaton Minus(Automaton a)
{
    Automaton difference = BasicOperations.Minus(this, a);
    return difference;
}
/// <summary>
/// Determinizes the given automaton in place using subset construction.
/// Returns immediately when the automaton is already flagged deterministic or is a singleton.
/// <para/>
/// Worst case complexity: exponential in number of states.
/// </summary>
/// <param name="a">The automaton to determinize; its initial state, numbered states and
/// deterministic flag are replaced by this method.</param>
public static void Determinize(Automaton a)
{
    if (a.IsDeterministic || a.IsSingleton)
    {
        return;
    }

    State[] allStates = a.GetNumberedStates();

    // subset construction
    // Remember the old initial state's properties before it is replaced by a fresh state.
    bool initAccept = a.initial.accept;
    int initNumber = a.initial.number;
    a.initial = new State();
    SortedInt32Set.FrozenInt32Set initialset = new SortedInt32Set.FrozenInt32Set(initNumber, a.initial);

    Queue <SortedInt32Set.FrozenInt32Set> worklist = new Queue <SortedInt32Set.FrozenInt32Set>(); // LUCENENET specific - Queue is much more performant than LinkedList
    // Maps each discovered set of old states to the new state that represents it.
    IDictionary <SortedInt32Set.FrozenInt32Set, State> newstate = new Dictionary <SortedInt32Set.FrozenInt32Set, State>();

    worklist.Enqueue(initialset);

    a.initial.accept = initAccept;
    newstate[initialset] = a.initial;

    // newStatesArray collects the new states in creation order; newStateUpto is the fill count.
    int newStateUpto = 0;
    State[] newStatesArray = new State[5];
    newStatesArray[newStateUpto] = a.initial;
    a.initial.number = newStateUpto;
    newStateUpto++;

    // like Set<Integer,PointTransitions>
    PointTransitionSet points = new PointTransitionSet();

    // like SortedMap<Integer,Integer>
    SortedInt32Set statesSet = new SortedInt32Set(5);

    while (worklist.Count > 0)
    {
        SortedInt32Set.FrozenInt32Set s = worklist.Dequeue();

        // Collate all outgoing transitions by min/1+max:
        for (int i = 0; i < s.values.Length; i++)
        {
            State s0 = allStates[s.values[i]];
            for (int j = 0; j < s0.numTransitions; j++)
            {
                points.Add(s0.TransitionsArray[j]);
            }
        }

        if (points.count == 0)
        {
            // No outgoing transitions -- skip it
            continue;
        }

        points.Sort();

        // Sweep the sorted points: statesSet holds the destinations of the transitions that are
        // currently "open", accCount counts how many of those destinations are accepting, and
        // lastPoint is where the current interval started.
        int lastPoint = -1;
        int accCount = 0;

        State r = s.state;
        for (int i = 0; i < points.count; i++)
        {
            int point = points.points[i].point;

            if (statesSet.upto > 0)
            {
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(lastPoint != -1);
                }

                statesSet.ComputeHash();

                // Reuse the new state for this set of destinations, or create and enqueue one.
                if (!newstate.TryGetValue(statesSet.ToFrozenInt32Set(), out State q) || q is null)
                {
                    q = new State();

                    SortedInt32Set.FrozenInt32Set p = statesSet.Freeze(q);
                    worklist.Enqueue(p);
                    if (newStateUpto == newStatesArray.Length)
                    {
                        // LUCENENET: Resize rather than copy
                        Array.Resize(ref newStatesArray, ArrayUtil.Oversize(1 + newStateUpto, RamUsageEstimator.NUM_BYTES_OBJECT_REF));
                    }
                    newStatesArray[newStateUpto] = q;
                    q.number = newStateUpto;
                    newStateUpto++;
                    q.accept = accCount > 0;
                    newstate[p] = q;
                }
                else
                {
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert((accCount > 0) == q.accept, "accCount={0} vs existing accept={1} states={2}", accCount, q.accept, statesSet);
                    }
                }

                // The interval [lastPoint, point - 1] leads to the state representing statesSet.
                r.AddTransition(new Transition(lastPoint, point - 1, q));
            }

            // process transitions that end on this point
            // (closes an overlapping interval)
            Transition[] transitions = points.points[i].ends.transitions;
            int limit = points.points[i].ends.count;
            for (int j = 0; j < limit; j++)
            {
                Transition t = transitions[j];
                int num = t.to.number;
                statesSet.Decr(num);
                accCount -= t.to.accept ? 1 : 0;
            }
            points.points[i].ends.count = 0;

            // process transitions that start on this point
            // (opens a new interval)
            transitions = points.points[i].starts.transitions;
            limit = points.points[i].starts.count;
            for (int j = 0; j < limit; j++)
            {
                Transition t = transitions[j];
                int num = t.to.number;
                statesSet.Incr(num);
                accCount += t.to.accept ? 1 : 0;
            }
            lastPoint = point;
            points.points[i].starts.count = 0;
        }
        // points is reused for the next worklist entry.
        points.Reset();
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(statesSet.upto == 0, "upto={0}", statesSet.upto);
        }
    }
    a.deterministic = true;
    a.SetNumberedStates(newStatesArray, newStateUpto);
}
/// <summary>
/// Instance-level convenience for intersection: forwards this automaton and
/// <paramref name="a"/> to <c>BasicOperations.Intersection(this, a)</c>.
/// </summary>
/// <param name="a">The automaton passed as the second operand.</param>
/// <returns>Whatever <c>BasicOperations.Intersection</c> returns.</returns>
public virtual Automaton Intersection(Automaton a)
{
    Automaton intersected = BasicOperations.Intersection(this, a);
    return intersected;
}
/// <summary>
/// Builds an automaton accepting the concatenation, in list order, of the languages of
/// the automata in <paramref name="l"/>. An empty list yields the empty-string automaton.
/// <para/>
/// Complexity: linear in total number of states.
/// </summary>
public static Automaton Concatenate(IList<Automaton> l)
{
    if (l.Count == 0)
    {
        return BasicAutomata.MakeEmptyString();
    }

    bool everySingleton = true;
    foreach (Automaton candidate in l)
    {
        if (!a_IsSingleton(candidate))
        {
            everySingleton = false;
            break;
        }
    }

    if (everySingleton)
    {
        // All operands are single strings: just join them.
        StringBuilder joined = new StringBuilder();
        foreach (Automaton piece in l)
        {
            joined.Append(piece.singleton);
        }
        return BasicAutomata.MakeString(joined.ToString());
    }

    // Any empty-language operand makes the whole concatenation empty.
    foreach (Automaton operand in l)
    {
        if (BasicOperations.IsEmpty(operand))
        {
            return BasicAutomata.MakeEmpty();
        }
    }

    // Fewer distinct hash codes than list entries is treated as "the list has aliases",
    // in which case every operand is cloned unconditionally.
    JCG.HashSet<int> hashes = new JCG.HashSet<int>();
    foreach (Automaton operand in l)
    {
        hashes.Add(operand.GetHashCode());
    }
    bool aliased = hashes.Count != l.Count;

    Automaton result = l[0];
    result = aliased ? result.CloneExpanded() : result.CloneExpandedIfRequired();
    ISet<State> openAccepts = result.GetAcceptStates();

    bool skipHead = true;
    foreach (Automaton operand in l)
    {
        if (skipHead)
        {
            // The first operand is already the base of the result.
            skipHead = false;
            continue;
        }
        if (operand.IsEmptyString)
        {
            continue;
        }

        Automaton part = aliased ? operand.CloneExpanded() : operand.CloneExpandedIfRequired();
        ISet<State> partAccepts = part.GetAcceptStates();
        foreach (State tail in openAccepts)
        {
            tail.accept = false;
            tail.AddEpsilon(part.initial);
            // Re-checked after AddEpsilon: a state that is accepting again stays an accept state.
            if (tail.accept)
            {
                partAccepts.Add(tail);
            }
        }
        openAccepts = partAccepts;
    }

    result.deterministic = false;
    //b.clearHashCode();
    result.ClearNumberedStates();
    result.CheckMinimizeAlways();
    return result;

    // Local helper keeping the singleton test in one readable place.
    bool a_IsSingleton(Automaton x)
    {
        return x.IsSingleton;
    }
}
/// <summary>
/// Instance-level convenience for the subset test: forwards this automaton and
/// <paramref name="a"/> to <c>BasicOperations.SubsetOf(this, a)</c>.
/// </summary>
/// <param name="a">The automaton passed as the second operand.</param>
/// <returns>Whatever <c>BasicOperations.SubsetOf</c> returns.</returns>
public virtual bool SubsetOf(Automaton a)
{
    bool isSubset = BasicOperations.SubsetOf(this, a);
    return isSubset;
}
/// <summary>
/// Simple, original brics implementation of determinize().
/// Rebuilds <paramref name="a"/> by subset construction, starting from the state set
/// <paramref name="initialset"/>.
/// </summary>
public static void DeterminizeSimple(Automaton a, ISet<State> initialset)
{
    int[] points = a.StartPoints;

    // subset construction
    IDictionary<ISet<State>, ISet<State>> seenSets = new Dictionary<ISet<State>, ISet<State>>();
    Queue<ISet<State>> queue = new Queue<ISet<State>>();
    IDictionary<ISet<State>, State> stateFor = new Dictionary<ISet<State>, State>();

    seenSets[initialset] = initialset;
    queue.Enqueue(initialset);
    a.Initial = new State();
    stateFor[initialset] = a.Initial;

    while (queue.Count > 0)
    {
        ISet<State> subset = queue.Dequeue();
        State merged = stateFor[subset];

        // The merged state accepts as soon as any member of the subset accepts.
        foreach (State member in subset)
        {
            if (member.Accept)
            {
                merged.Accept = true;
                break;
            }
        }

        for (int idx = 0; idx < points.Length; idx++)
        {
            int lower = points[idx];

            // Collect every state reachable from the subset on the code point 'lower'.
            ISet<State> reached = new ValueHashSet<State>();
            foreach (State member in subset)
            {
                foreach (Transition t in member.Transitions)
                {
                    if (t.Min <= lower && lower <= t.Max)
                    {
                        reached.Add(t.To);
                    }
                }
            }

            if (!seenSets.ContainsKey(reached))
            {
                seenSets[reached] = reached;
                queue.Enqueue(reached);
                stateFor[reached] = new State();
            }
            State target = stateFor[reached];

            // The interval runs up to just before the next start point, or to the last code point.
            int upper = (idx + 1 < points.Length) ? points[idx + 1] - 1 : Character.MAX_CODE_POINT;
            merged.AddTransition(new Transition(lower, upper, target));
        }
    }

    a.Deterministic = true;
    a.ClearNumberedStates();
    a.RemoveDeadTransitions();
}
/// <summary>
/// Instance-level convenience for union: forwards this automaton and
/// <paramref name="a"/> to <c>BasicOperations.Union(this, a)</c>.
/// </summary>
/// <param name="a">The automaton passed as the second operand.</param>
/// <returns>Whatever <c>BasicOperations.Union</c> returns.</returns>
public virtual Automaton Union(Automaton a)
{
    Automaton united = BasicOperations.Union(this, a);
    return united;
}
/// <summary>
/// Passes <paramref name="a"/> to <c>MinimizationOperations.Minimize</c> and then hands
/// the same instance back, so the call can be used inside an expression.
/// </summary>
/// <param name="a">The automaton to minimize.</param>
/// <returns>The automaton given as argument.</returns>
public static Automaton Minimize(Automaton a)
{
    Automaton sameInstance = a;
    MinimizationOperations.Minimize(sameInstance);
    return sameInstance;
}
/// <summary>
/// Creates a run automaton for <paramref name="a"/> by calling the base constructor with
/// <c>Character.MaxCodePoint</c> as its second argument and <c>false</c> as its third.
/// </summary>
/// <param name="a">The automaton handed to the base constructor.</param>
public CharacterRunAutomaton(Automaton a)
    : base(a, Character.MaxCodePoint, false)
{
    // All initialization happens in the base constructor.
}
// Below are original, unoptimized implementations of DFA operations for testing.
// These are from brics automaton, full license (BSD) below:

/*
 * dk.brics.automaton
 *
 * Copyright (c) 2001-2009 Anders Moeller
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/// <summary>
/// Simple, original brics implementation of Brzozowski minimize():
/// the automaton is reversed and determinized, twice in a row.
/// Does nothing when <c>a.Singleton</c> is a non-empty string.
/// </summary>
public static void MinimizeSimple(Automaton a)
{
    bool hasSingletonText = !String.IsNullOrEmpty(a.Singleton);
    if (hasSingletonText)
    {
        return;
    }

    // Two identical reverse-then-determinize passes.
    for (int pass = 0; pass < 2; pass++)
    {
        DeterminizeSimple(a, SpecialOperations.Reverse(a));
    }
}
/// <summary>
/// Creates the query for the field of <paramref name="term"/>: the term text is parsed as a
/// regular expression with the given <paramref name="flags"/> and converted to an automaton,
/// which is stored in <c>automaton</c>.
/// </summary>
internal DumbRegexpQuery(Term term, RegExpSyntax flags)
    : base(term.Field)
{
    automaton = new RegExp(term.Text, flags).ToAutomaton();
}
/// <summary>
/// Prepares the lookup structures for <paramref name="a"/>: a reverse index of arriving
/// transitions is built, then a backwards breadth-first search from the accept states
/// records transitions in <c>LeadsToAccept</c>.
/// When <c>a.Singleton</c> is a non-empty string, <c>LeadsToAccept</c> is left <c>null</c>.
/// </summary>
public RandomAcceptedStrings(Automaton a)
{
    this.a = a;
    if (!String.IsNullOrEmpty(a.Singleton))
    {
        LeadsToAccept = null;
        return;
    }

    // must use IdentityHashmap because two Transitions w/
    // different start nodes can be considered the same
    LeadsToAccept = new IdentityHashMap<Transition, bool?>();
    IDictionary<State, IList<ArrivingTransition>> arrivalsByState = new Dictionary<State, IList<ArrivingTransition>>();
    Queue<State> frontier = new Queue<State>();
    HashSet<State> visited = new HashSet<State>();

    // reverse map the transitions, so we can quickly look
    // up all arriving transitions to a given state
    foreach (State state in a.NumberedStates)
    {
        int outgoing = 0;
        while (outgoing < state.numTransitions)
        {
            Transition transition = state.TransitionsArray[outgoing];
            IList<ArrivingTransition> bucket;
            if (!arrivalsByState.TryGetValue(transition.Dest, out bucket) || bucket == null)
            {
                bucket = new List<ArrivingTransition>();
                arrivalsByState[transition.Dest] = bucket;
            }
            bucket.Add(new ArrivingTransition(state, transition));
            outgoing++;
        }

        if (state.Accept)
        {
            frontier.Enqueue(state);
            visited.Add(state);
        }
    }

    // Breadth-first search, from accept states,
    // backwards:
    while (frontier.Count > 0)
    {
        State reached = frontier.Dequeue();
        IList<ArrivingTransition> incoming;
        if (!arrivalsByState.TryGetValue(reached, out incoming) || incoming == null)
        {
            continue;
        }

        foreach (ArrivingTransition arrival in incoming)
        {
            State origin = arrival.From;
            // Only the transition through which a state is first discovered is recorded.
            if (!visited.Contains(origin))
            {
                frontier.Enqueue(origin);
                visited.Add(origin);
                LeadsToAccept[arrival.t] = true;
            }
        }
    }
}
/// <summary>
/// Minimizes the given automaton in place using Hopcroft's algorithm.
/// The automaton is determinized first; states are then partitioned into blocks that are
/// refined until no block needs splitting, and one new state is created per block.
/// </summary>
/// <param name="a">The automaton to minimize; its states and initial state are replaced.</param>
public static void MinimizeHopcroft(Automaton a)
{
    a.Determinize();
    // Shortcut: an initial state whose only transition loops back to itself over the whole
    // code point range is left as it is.
    if (a.initial.numTransitions == 1)
    {
        Transition t = a.initial.TransitionsArray[0];
        if (t.to == a.initial && t.min == Character.MinCodePoint && t.max == Character.MaxCodePoint)
        {
            return;
        }
    }
    // NOTE(review): the loops below dereference Step(...) without a null check, so they
    // presumably rely on Totalize() having given every state a transition for every start point.
    a.Totalize();

    // initialize data structures
    int[] sigma = a.GetStartPoints();
    State[] states = a.GetNumberedStates();
    int sigmaLen = sigma.Length, statesLen = states.Length;
    // reverse[q, x]: states that step to state number q on sigma[x] (null when there are none).
    List <State>[,] reverse = new List <State> [statesLen, sigmaLen];
    // partition[j]: members of block j; block[q]: block index of state number q.
    ISet <State>[] partition = new JCG.HashSet <State> [statesLen];
    List <State>[] splitblock = new List <State> [statesLen];
    int[] block = new int[statesLen];
    // active[j, x]: states of block j that have incoming edges on sigma[x];
    // active2[q, x]: the list node of state q inside that list, kept for removal.
    StateList[,] active = new StateList[statesLen, sigmaLen];
    StateListNode[,] active2 = new StateListNode[statesLen, sigmaLen];
    // pending is the work queue of (block, symbol) pairs; pending2 mirrors its membership as bits.
    LinkedList <Int32Pair> pending = new LinkedList <Int32Pair>();
    OpenBitSet pending2 = new OpenBitSet(sigmaLen * statesLen);
    OpenBitSet split = new OpenBitSet(statesLen), refine = new OpenBitSet(statesLen), refine2 = new OpenBitSet(statesLen);
    for (int q = 0; q < statesLen; q++)
    {
        splitblock[q] = new List <State>();
        partition[q] = new JCG.HashSet <State>();
        for (int x = 0; x < sigmaLen; x++)
        {
            active[q, x] = new StateList();
        }
    }

    // find initial partition and reverse edges
    for (int q = 0; q < statesLen; q++)
    {
        State qq = states[q];
        // Block 0 holds the accepting states, block 1 the non-accepting ones.
        int j = qq.accept ? 0 : 1;
        partition[j].Add(qq);
        block[q] = j;
        for (int x = 0; x < sigmaLen; x++)
        {
            // r is the number of the state reached from qq on sigma[x].
            var r = qq.Step(sigma[x]).number;
            if (reverse[r, x] == null)
            {
                reverse[r, x] = new List <State>();
            }
            reverse[r, x].Add(qq);
        }
    }

    // initialize active sets
    for (int j = 0; j <= 1; j++)
    {
        for (int x = 0; x < sigmaLen; x++)
        {
            foreach (State qq in partition[j])
            {
                if (reverse[qq.number, x] != null)
                {
                    active2[qq.number, x] = active[j, x].Add(qq);
                }
            }
        }
    }

    // initialize pending
    for (int x = 0; x < sigmaLen; x++)
    {
        // Start with whichever of the two initial blocks has the smaller active list.
        int j = (active[0, x].Count <= active[1, x].Count) ? 0 : 1;
        pending.AddLast(new Int32Pair(j, x));
        pending2.Set(x * statesLen + j);
    }

    // process pending until fixed point
    // k is the index of the next unused block.
    int k = 2;
    while (pending.Count > 0)
    {
        Int32Pair ip = pending.First.Value;
        pending.Remove(ip);
        int p = ip.N1;
        int x = ip.N2;
        pending2.Clear(x * statesLen + p);

        // find states that need to be split off their blocks
        for (StateListNode m = active[p, x].First; m != null; m = m.Next)
        {
            List <State> r = reverse[m.Q.number, x];
            if (r != null)
            {
                foreach (State s in r)
                {
                    int i = s.number;
                    if (!split.Get(i))
                    {
                        split.Set(i);
                        int j = block[i];
                        splitblock[j].Add(s);
                        if (!refine2.Get(j))
                        {
                            refine2.Set(j);
                            refine.Set(j);
                        }
                    }
                }
            }
        }

        // refine blocks
        for (int j = refine.NextSetBit(0); j >= 0; j = refine.NextSetBit(j + 1))
        {
            List <State> sb = splitblock[j];
            // Only split when some, but not all, members of block j were marked.
            if (sb.Count < partition[j].Count)
            {
                ISet <State> b1 = partition[j];
                ISet <State> b2 = partition[k];
                foreach (State s in sb)
                {
                    b1.Remove(s);
                    b2.Add(s);
                    block[s.number] = k;
                    for (int c = 0; c < sigmaLen; c++)
                    {
                        StateListNode sn = active2[s.number, c];
                        if (sn != null && sn.Sl == active[j, c])
                        {
                            sn.Remove();
                            active2[s.number, c] = active[k, c].Add(s);
                        }
                    }
                }

                // update pending
                for (int c = 0; c < sigmaLen; c++)
                {
                    int aj = active[j, c].Count, ak = active[k, c].Count, ofs = c * statesLen;
                    if (!pending2.Get(ofs + j) && 0 < aj && aj <= ak)
                    {
                        pending2.Set(ofs + j);
                        pending.AddLast(new Int32Pair(j, c));
                    }
                    else
                    {
                        pending2.Set(ofs + k);
                        pending.AddLast(new Int32Pair(k, c));
                    }
                }
                k++;
            }
            // Reset the per-round marks for block j.
            refine2.Clear(j);
            foreach (State s in sb)
            {
                split.Clear(s.number);
            }
            sb.Clear();
        }
        refine.Clear(0, refine.Length - 1);
    }

    // make a new state for each equivalence class, set initial state
    State[] newstates = new State[k];
    for (int n = 0; n < newstates.Length; n++)
    {
        State s = new State();
        newstates[n] = s;
        foreach (State q in partition[n])
        {
            if (q == a.initial)
            {
                a.initial = s;
            }
            s.accept = q.accept;
            // s.number temporarily holds the index of an old state of this block ...
            s.number = q.number; // select representative
            // ... and the old state's number is redirected to the index of its block.
            q.number = n;
        }
    }

    // build transitions and set acceptance
    for (int n = 0; n < newstates.Length; n++)
    {
        State s = newstates[n];
        s.accept = states[s.number].accept;
        foreach (Transition t in states[s.number].GetTransitions())
        {
            // t.to.number is now a block index (see above), so it indexes newstates directly.
            s.AddTransition(new Transition(t.min, t.max, newstates[t.to.number]));
        }
    }
    a.ClearNumberedStates();
    a.RemoveDeadTransitions();
}
/// <summary>
/// Simple, original brics implementation of determinize().
/// Determinizes the given automaton by subset construction, using the given set of initial states.
/// </summary>
/// <param name="a">The automaton to rebuild; its initial state and transitions are replaced.</param>
/// <param name="initialset">The states that together form the new initial state. The set is
/// copied, so it is not retained or modified by this method.</param>
public static void DeterminizeSimple(Automaton a, ISet<State> initialset)
{
    int[] points = a.StartPoints;

    // subset construction
    // HashSet<T> compares by reference by default, so the dictionary must be given a set
    // comparer; otherwise every freshly built subset looks new and the worklist never drains.
    IEqualityComparer<HashSet<State>> bySetContent = HashSet<State>.CreateSetComparer();
    Dictionary<HashSet<State>, State> newstate = new Dictionary<HashSet<State>, State>(bySetContent);
    LinkedList<HashSet<State>> worklist = new LinkedList<HashSet<State>>();

    // Private copy: dictionary keys must not change after insertion.
    HashSet<State> start = new HashSet<State>(initialset);
    worklist.AddLast(start);
    a.InitialState = new State();
    newstate[start] = a.InitialState;

    while (worklist.Count > 0)
    {
        HashSet<State> s = worklist.First.Value;
        worklist.RemoveFirst();
        State r = newstate[s];

        // The combined state accepts as soon as any member accepts.
        foreach (State q in s)
        {
            if (q.Accept)
            {
                r.Accept = true;
                break;
            }
        }

        for (int n = 0; n < points.Length; n++)
        {
            // Collect every state reachable from the subset on the code point points[n].
            HashSet<State> p = new HashSet<State>();
            foreach (State q in s)
            {
                foreach (Transition t in q.Transitions)
                {
                    if (t.Min <= points[n] && points[n] <= t.Max)
                    {
                        p.Add(t.Dest);
                    }
                }
            }

            // Reuse the state of an equal subset, or register and schedule a new one.
            State q_;
            if (!newstate.TryGetValue(p, out q_))
            {
                q_ = new State();
                newstate[p] = q_;
                worklist.AddLast(p);
            }

            // The interval runs up to just before the next start point, or to the last code point.
            int min = points[n];
            int max;
            if (n + 1 < points.Length)
            {
                max = points[n + 1] - 1;
            }
            else
            {
                max = Character.MAX_CODE_POINT;
            }
            r.AddTransition(new Transition(min, max, q_));
        }
    }

    a.Deterministic = true;
    a.ClearNumberedStates();
    a.RemoveDeadTransitions();
}
/// <summary>
/// Minimizes the given automaton in place using Hopcroft's algorithm.
/// The automaton is determinized first; states are then partitioned into blocks that are
/// refined until no block needs splitting, and one new state is created per block.
/// </summary>
/// <param name="a">The automaton to minimize; its states and initial state are replaced.</param>
public static void MinimizeHopcroft(Automaton a)
{
    a.Determinize();
    // Shortcut: an initial state whose only transition loops back to itself over the whole
    // code point range is left as it is.
    if (a.Initial.numTransitions == 1)
    {
        Transition t = a.Initial.TransitionsArray[0];
        if (t.To == a.Initial && t.Min_Renamed == Character.MIN_CODE_POINT && t.Max_Renamed == Character.MAX_CODE_POINT)
        {
            return;
        }
    }
    // NOTE(review): the loops below dereference Step(...) without a null check, so they
    // presumably rely on Totalize() having given every state a transition for every start point.
    a.Totalize();

    // initialize data structures
    int[] sigma = a.StartPoints;
    State[] states = a.NumberedStates;
    int sigmaLen = sigma.Length, statesLen = states.Length;
    // reverse[q, x]: states that step to state number q on sigma[x] (null when there are none).
    List<State>[,] reverse = new List<State>[statesLen, sigmaLen];
    // partition[j]: members of block j; block[q]: block index of state number q.
    HashSet<State>[] partition = new HashSet<State>[statesLen];
    List<State>[] splitblock = new List<State>[statesLen];
    int[] block = new int[statesLen];
    // active[j, x]: states of block j that have incoming edges on sigma[x];
    // active2[q, x]: the list node of state q inside that list, kept for removal.
    StateList[,] active = new StateList[statesLen, sigmaLen];
    StateListNode[,] active2 = new StateListNode[statesLen, sigmaLen];
    // pending is the work queue of (block, symbol) pairs; pending2 mirrors its membership as bits.
    LinkedList<IntPair> pending = new LinkedList<IntPair>();
    BitArray pending2 = new BitArray(sigmaLen * statesLen);
    BitArray split = new BitArray(statesLen), refine = new BitArray(statesLen), refine2 = new BitArray(statesLen);
    for (int q = 0; q < statesLen; q++)
    {
        splitblock[q] = new List<State>();
        partition[q] = new HashSet<State>();
        for (int x = 0; x < sigmaLen; x++)
        {
            active[q, x] = new StateList();
        }
    }

    // find initial partition and reverse edges
    for (int q = 0; q < statesLen; q++)
    {
        State qq = states[q];
        // Block 0 holds the accepting states, block 1 the non-accepting ones.
        int j = qq.accept ? 0 : 1;
        partition[j].Add(qq);
        block[q] = j;
        for (int x = 0; x < sigmaLen; x++)
        {
            // r is the number of the state reached from qq on sigma[x].
            var r = qq.Step(sigma[x]).number;
            if (reverse[r, x] == null)
            {
                reverse[r, x] = new List<State>();
            }
            reverse[r, x].Add(qq);
        }
    }

    // initialize active sets
    for (int j = 0; j <= 1; j++)
    {
        for (int x = 0; x < sigmaLen; x++)
        {
            foreach (State qq in partition[j])
            {
                if (reverse[qq.number, x] != null)
                {
                    active2[qq.number, x] = active[j, x].Add(qq);
                }
            }
        }
    }

    // initialize pending
    for (int x = 0; x < sigmaLen; x++)
    {
        // Start with whichever of the two initial blocks has the smaller active list.
        int j = (active[0, x].Size <= active[1, x].Size) ? 0 : 1;
        pending.AddLast(new IntPair(j, x));
        pending2.Set(x * statesLen + j, true);
    }

    // process pending until fixed point
    // k is the index of the next unused block.
    int k = 2;
    while (pending.Count > 0)
    {
        IntPair ip = pending.First.Value;
        pending.RemoveFirst();
        int p = ip.N1;
        int x = ip.N2;
        pending2.Set(x * statesLen + p, false);

        // find states that need to be split off their blocks
        for (StateListNode m = active[p, x].First; m != null; m = m.Next)
        {
            List<State> r = reverse[m.q.number, x];
            if (r != null)
            {
                foreach (State s in r)
                {
                    int i = s.number;
                    if (!split.Get(i))
                    {
                        split.Set(i, true);
                        int j = block[i];
                        splitblock[j].Add(s);
                        if (!refine2.Get(j))
                        {
                            refine2.Set(j, true);
                            refine.Set(j, true);
                        }
                    }
                }
            }
        }

        // refine blocks
        for (int j = Number.NextSetBit(refine, 0); j >= 0; j = Number.NextSetBit(refine, j + 1))
        {
            List<State> sb = splitblock[j];
            // Only split when some, but not all, members of block j were marked.
            if (sb.Count < partition[j].Count)
            {
                HashSet<State> b1 = partition[j];
                HashSet<State> b2 = partition[k];
                foreach (State s in sb)
                {
                    b1.Remove(s);
                    b2.Add(s);
                    block[s.number] = k;
                    for (int c = 0; c < sigmaLen; c++)
                    {
                        StateListNode sn = active2[s.number, c];
                        if (sn != null && sn.Sl == active[j, c])
                        {
                            sn.Remove();
                            active2[s.number, c] = active[k, c].Add(s);
                        }
                    }
                }

                // update pending
                for (int c = 0; c < sigmaLen; c++)
                {
                    int aj = active[j, c].Size, ak = active[k, c].Size, ofs = c * statesLen;
                    if (!pending2.Get(ofs + j) && 0 < aj && aj <= ak)
                    {
                        pending2.Set(ofs + j, true);
                        pending.AddLast(new IntPair(j, c));
                    }
                    else
                    {
                        pending2.Set(ofs + k, true);
                        pending.AddLast(new IntPair(k, c));
                    }
                }
                k++;
            }
            // Reset the per-round marks for block j.
            refine2.Set(j, false);
            foreach (State s in sb)
            {
                split.Set(s.number, false);
            }
            sb.Clear();
        }
        refine.SetAll(false);
    }

    // make a new state for each equivalence class, set initial state
    State[] newstates = new State[k];
    for (int n = 0; n < newstates.Length; n++)
    {
        State s = new State();
        newstates[n] = s;
        foreach (State q in partition[n])
        {
            if (q == a.Initial)
            {
                a.Initial = s;
            }
            s.accept = q.accept;
            // s.number temporarily holds the index of an old state of this block ...
            s.number = q.number; // select representative
            // ... and the old state's number is redirected to the index of its block.
            q.number = n;
        }
    }

    // build transitions and set acceptance
    for (int n = 0; n < newstates.Length; n++)
    {
        State s = newstates[n];
        s.accept = states[s.number].accept;
        foreach (Transition t in states[s.number].Transitions)
        {
            // t.To.number is now a block index (see above), so it indexes newstates directly.
            s.AddTransition(new Transition(t.Min_Renamed, t.Max_Renamed, newstates[t.To.number]));
        }
    }
    a.ClearNumberedStates();
    a.RemoveDeadTransitions();
}
/// <summary>
/// Returns the longest string that is a prefix of all accepted strings and
/// visits each state at most once.
/// </summary>
/// <param name="a">The automaton to inspect.</param>
/// <returns> common prefix </returns>
public static string GetCommonPrefix(Automaton a)
{
    if (a.IsSingleton)
    {
        return a.singleton;
    }

    StringBuilder b = new StringBuilder();
    HashSet<State> visited = new HashSet<State>();
    State s = a.Initial;
    bool done;
    do
    {
        done = true;
        visited.Add(s);
        // The prefix can only be extended through a non-accepting state with exactly one way out.
        if (!s.accept && s.NumTransitions() == 1)
        {
            // Fetch the single transition; foreach also disposes the enumerator.
            Transition t = null;
            foreach (Transition only in s.Transitions)
            {
                t = only;
                break;
            }

            if (t != null && t.Min_Renamed == t.Max_Renamed && !visited.Contains(t.To))
            {
                // Append the code point as text. StringBuilder.Append(int) would append its
                // decimal digits instead of the character.
                int codePoint = t.Min_Renamed;
                if (codePoint < 0x10000)
                {
                    b.Append((char)codePoint);
                }
                else
                {
                    // Supplementary code point: encode as a UTF-16 surrogate pair.
                    int offset = codePoint - 0x10000;
                    b.Append((char)(0xD800 + (offset >> 10)));
                    b.Append((char)(0xDC00 + (offset & 0x3FF)));
                }
                s = t.To;
                done = false;
            }
        }
    } while (!done);

    return b.ToString();
}
/// <summary>
/// Builds an automaton whose language is the intersection of the languages of
/// <paramref name="a1"/> and <paramref name="a2"/>. Never modifies the input automata languages.
/// <para/>
/// Complexity: quadratic in number of states.
/// </summary>
public static Automaton Intersection(Automaton a1, Automaton a2)
{
    // A singleton operand reduces the problem to one membership test on the other operand.
    if (a1.IsSingleton)
    {
        bool inSecond = BasicOperations.Run(a2, a1.singleton);
        if (!inSecond)
        {
            return BasicAutomata.MakeEmpty();
        }
        return a1.CloneIfRequired();
    }
    if (a2.IsSingleton)
    {
        bool inFirst = BasicOperations.Run(a1, a2.singleton);
        if (!inFirst)
        {
            return BasicAutomata.MakeEmpty();
        }
        return a2.CloneIfRequired();
    }
    if (a1 == a2)
    {
        return a1.CloneIfRequired();
    }

    Transition[][] outgoing1 = a1.SortedTransitions;
    Transition[][] outgoing2 = a2.SortedTransitions;
    Automaton result = new Automaton();
    Queue<StatePair> todo = new Queue<StatePair>();
    Dictionary<StatePair, StatePair> seenPairs = new Dictionary<StatePair, StatePair>();

    StatePair pair = new StatePair(result.Initial, a1.Initial, a2.Initial);
    todo.Enqueue(pair);
    seenPairs[pair] = pair;

    while (todo.Count > 0)
    {
        pair = todo.Dequeue();
        pair.s.accept = pair.S1.accept && pair.S2.accept;

        Transition[] first = outgoing1[pair.S1.number];
        Transition[] second = outgoing2[pair.S2.number];

        // 'skip' only moves forward because both transition arrays are sorted.
        int skip = 0;
        foreach (Transition ta in first)
        {
            while (skip < second.Length && second[skip].Max_Renamed < ta.Min_Renamed)
            {
                skip++;
            }

            int k = skip;
            while (k < second.Length && ta.Max_Renamed >= second[k].Min_Renamed)
            {
                Transition tb = second[k];
                if (tb.Max_Renamed >= ta.Min_Renamed)
                {
                    StatePair key = new StatePair(ta.To, tb.To);
                    StatePair found;
                    seenPairs.TryGetValue(key, out found);
                    if (found == null)
                    {
                        key.s = new State();
                        todo.Enqueue(key);
                        seenPairs[key] = key;
                        found = key;
                    }

                    // The overlapping interval is [larger of the mins, smaller of the maxes].
                    int from = ta.Min_Renamed;
                    if (tb.Min_Renamed > from)
                    {
                        from = tb.Min_Renamed;
                    }
                    int upTo = ta.Max_Renamed;
                    if (tb.Max_Renamed < upTo)
                    {
                        upTo = tb.Max_Renamed;
                    }
                    pair.s.AddTransition(new Transition(from, upTo, found.s));
                }
                k++;
            }
        }
    }

    result.deterministic = a1.deterministic && a2.deterministic;
    result.RemoveDeadTransitions();
    result.CheckMinimizeAlways();
    return result;
}
/// <summary>
/// Returns the longest string that is a suffix of all accepted strings and
/// visits each state at most once. The input automaton is not modified: the work is
/// done on a clone.
/// </summary>
/// <returns> common suffix </returns>
public static string GetCommonSuffix(Automaton a)
{
    // if singleton, the suffix is the string itself.
    if (a.IsSingleton)
    {
        return a.singleton;
    }

    // reverse the language of the automaton, then reverse its common prefix.
    Automaton reversed = (Automaton)a.Clone();
    Reverse(reversed);
    reversed.Determinize();

    string reversedPrefix = SpecialOperations.GetCommonPrefix(reversed);
    StringBuilder buffer = new StringBuilder(reversedPrefix);
    return buffer.Reverse().ToString();
}
/// <summary>
/// Converts this regular expression node to an automaton, recursing into sub-expressions.
/// Results of composite operations are minimized before being returned.
/// </summary>
/// <param name="automata">Optional map from identifier to named automaton; may be <c>null</c>.</param>
/// <param name="automaton_provider">Optional fallback used when a named automaton is not found
/// in <paramref name="automata"/>; may be <c>null</c>.</param>
/// <returns>The automaton for this node, or <c>null</c> if <c>kind</c> matches no case.</returns>
/// <exception cref="System.ArgumentException">A named automaton cannot be resolved, or the
/// provider fails with an <see cref="System.IO.IOException"/>.</exception>
private Automaton ToAutomaton(IDictionary<string, Automaton> automata, AutomatonProvider automaton_provider)
{
    IList<Automaton> list;
    Automaton a = null;
    switch (kind)
    {
        case Kind.REGEXP_UNION:
            // Flatten nested unions into one list so they are combined in a single operation.
            list = new List<Automaton>();
            FindLeaves(Exp1, Kind.REGEXP_UNION, list, automata, automaton_provider);
            FindLeaves(Exp2, Kind.REGEXP_UNION, list, automata, automaton_provider);
            a = BasicOperations.Union(list);
            MinimizationOperations.Minimize(a);
            break;

        case Kind.REGEXP_CONCATENATION:
            // Same flattening for nested concatenations.
            list = new List<Automaton>();
            FindLeaves(Exp1, Kind.REGEXP_CONCATENATION, list, automata, automaton_provider);
            FindLeaves(Exp2, Kind.REGEXP_CONCATENATION, list, automata, automaton_provider);
            a = BasicOperations.Concatenate(list);
            MinimizationOperations.Minimize(a);
            break;

        case Kind.REGEXP_INTERSECTION:
            a = Exp1.ToAutomaton(automata, automaton_provider).Intersection(Exp2.ToAutomaton(automata, automaton_provider));
            MinimizationOperations.Minimize(a);
            break;

        case Kind.REGEXP_OPTIONAL:
            a = Exp1.ToAutomaton(automata, automaton_provider).Optional();
            MinimizationOperations.Minimize(a);
            break;

        case Kind.REGEXP_REPEAT:
            a = Exp1.ToAutomaton(automata, automaton_provider).Repeat();
            MinimizationOperations.Minimize(a);
            break;

        case Kind.REGEXP_REPEAT_MIN:
            a = Exp1.ToAutomaton(automata, automaton_provider).Repeat(Min);
            MinimizationOperations.Minimize(a);
            break;

        case Kind.REGEXP_REPEAT_MINMAX:
            a = Exp1.ToAutomaton(automata, automaton_provider).Repeat(Min, Max);
            MinimizationOperations.Minimize(a);
            break;

        case Kind.REGEXP_COMPLEMENT:
            a = Exp1.ToAutomaton(automata, automaton_provider).Complement();
            MinimizationOperations.Minimize(a);
            break;

        case Kind.REGEXP_CHAR:
            a = BasicAutomata.MakeChar(c);
            break;

        case Kind.REGEXP_CHAR_RANGE:
            a = BasicAutomata.MakeCharRange(From, To);
            break;

        case Kind.REGEXP_ANYCHAR:
            a = BasicAutomata.MakeAnyChar();
            break;

        case Kind.REGEXP_EMPTY:
            a = BasicAutomata.MakeEmpty();
            break;

        case Kind.REGEXP_STRING:
            a = BasicAutomata.MakeString(s);
            break;

        case Kind.REGEXP_ANYSTRING:
            a = BasicAutomata.MakeAnyString();
            break;

        case Kind.REGEXP_AUTOMATON:
            Automaton aa = null;
            if (automata != null)
            {
                // TryGetValue leaves aa null on a miss. The indexer would throw
                // KeyNotFoundException and skip both the provider and the error below.
                automata.TryGetValue(s, out aa);
            }
            if (aa == null && automaton_provider != null)
            {
                try
                {
                    aa = automaton_provider.GetAutomaton(s);
                }
                catch (System.IO.IOException e)
                {
                    throw new System.ArgumentException(e.Message, e);
                }
            }
            if (aa == null)
            {
                throw new System.ArgumentException("'" + s + "' not found");
            }
            a = (Automaton)aa.Clone(); // always clone here (ignore allow_mutate)
            break;

        case Kind.REGEXP_INTERVAL:
            a = BasicAutomata.MakeInterval(Min, Max, Digits);
            break;
    }
    return a;
}
/// <summary>
/// Reverses the language of the given (non-singleton) automaton in place: every edge is
/// turned around, the old initial state becomes accepting, and a fresh initial state is
/// linked by epsilon to each former accept state.
/// </summary>
/// <returns>The former accept states, i.e. the set of new initial states.</returns>
public static ISet<State> Reverse(Automaton a)
{
    a.ExpandSingleton();

    // reverse all edges
    Dictionary<State, HashSet<Transition>> incomingByState = new Dictionary<State, HashSet<Transition>>();
    State[] numbered = a.NumberedStates;
    HashSet<State> formerAccepts = new HashSet<State>();

    // Remember which states accepted, then clear the flag and prepare an empty edge set per state.
    foreach (State state in numbered)
    {
        if (state.Accept)
        {
            formerAccepts.Add(state);
        }
    }
    foreach (State state in numbered)
    {
        incomingByState[state] = new HashSet<Transition>();
        state.accept = false;
    }

    // For each edge source -> target, register the flipped edge target -> source.
    foreach (State source in numbered)
    {
        foreach (Transition edge in source.Transitions)
        {
            Transition flipped = new Transition(edge.Min_Renamed, edge.Max_Renamed, source);
            incomingByState[edge.To].Add(flipped);
        }
    }

    // Only after all edges are collected may the states' own transitions be overwritten.
    foreach (State state in numbered)
    {
        HashSet<Transition> flippedEdges = incomingByState[state];
        state.Transitions = flippedEdges.ToArray(/*new Transition[tr.Count]*/);
    }

    // make new initial+final states
    a.Initial.accept = true;
    a.Initial = new State();
    foreach (State entry in formerAccepts)
    {
        a.Initial.AddEpsilon(entry); // ensures that all initial states are reachable
    }

    a.deterministic = false;
    a.ClearNumberedStates();
    return formerAccepts;
}
/// <summary>
/// Constructs a new <c>RunAutomaton</c> from an <c>Automaton</c>, which is determinized
/// first. The accept flags and the transition table are copied out of the automaton's
/// numbered states.
/// </summary>
/// <param name="a"> an automaton </param>
/// <param name="maxInterval">Stored as the maximum interval; also the highest index of the
/// class map when <paramref name="tableize"/> is set.</param>
/// <param name="tableize">When <c>true</c>, a class map with <c>maxInterval + 1</c> entries is
/// precomputed; otherwise the class map is <c>null</c>.</param>
protected RunAutomaton(Automaton a, int maxInterval, bool tableize)
{
    this._maxInterval = maxInterval;
    a.Determinize();
    _points = a.StartPoints;
    State[] states = a.NumberedStates;
    Initial = a.Initial.Number;
    _size = states.Length;
    Accept = new bool[_size];
    Transitions = new int[_size * _points.Length];

    // -1 marks "no transition" until a real target is filled in below.
    int cellCount = _size * _points.Length;
    int cell = 0;
    while (cell < cellCount)
    {
        Transitions[cell] = -1;
        cell++;
    }

    foreach (State state in states)
    {
        int row = state.number;
        Accept[row] = state.accept;
        for (int col = 0; col < _points.Length; col++)
        {
            State target = state.Step(_points[col]);
            if (target == null)
            {
                continue;
            }
            Transitions[row * _points.Length + col] = target.number;
        }
    }

    /*
     * Set alphabet table for optimal run performance.
     */
    if (!tableize)
    {
        _classmap = null;
        return;
    }

    _classmap = new int[maxInterval + 1];
    int classIndex = 0;
    for (int value = 0; value <= maxInterval; value++)
    {
        // Move on to the next class whenever 'value' reaches the next start point.
        if (classIndex + 1 < _points.Length && value == _points[classIndex + 1])
        {
            classIndex++;
        }
        _classmap[value] = classIndex;
    }
}
/// <summary>
/// Compute a DFA that accepts all strings within an edit distance of <paramref name="n"/>.
/// <para/>
/// All automata have the following properties:
/// <list type="bullet">
/// <item><description>They are deterministic (DFA).</description></item>
/// <item><description>There are no transitions to dead states.</description></item>
/// <item><description>They are not minimal (some transitions could be combined).</description></item>
/// </list>
/// </summary>
/// <param name="n">The edit distance.</param>
/// <returns>The automaton, or <c>null</c> when no description exists for
/// <paramref name="n"/> (that is, <c>n &gt;= Descriptions.Length</c>).</returns>
public virtual Automaton ToAutomaton(int n)
{
    if (n == 0)
    {
        return BasicAutomata.MakeString(Word, 0, Word.Length);
    }
    if (n >= Descriptions.Length)
    {
        return null;
    }

    int range = 2 * n + 1;
    ParametricDescription description = Descriptions[n];

    // the number of states is based on the length of the word and n
    State[] states = new State[description.Size()];

    // create all states, and mark as accept states if appropriate
    for (int index = 0; index < states.Length; index++)
    {
        State created = new State();
        states[index] = created;
        created.number = index;
        created.Accept = description.IsAccept(index);
    }

    // create transitions from state to state
    for (int from = 0; from < states.Length; from++)
    {
        int xpos = description.GetPosition(from);
        if (xpos < 0)
        {
            continue;
        }
        int end = xpos + Math.Min(Word.Length - xpos, range);

        for (int letter = 0; letter < Alphabet.Length; letter++)
        {
            int ch = Alphabet[letter];
            // get the characteristic vector at this position wrt ch
            int cvec = GetVector(ch, xpos, end);
            int target = description.Transition(from, xpos, cvec);
            if (target >= 0)
            {
                states[from].AddTransition(new Transition(ch, states[target]));
            }
        }

        // add transitions for all other chars in unicode
        // by definition, their characteristic vectors are always 0,
        // because they do not exist in the input string.
        int fallback = description.Transition(from, xpos, 0); // by definition
        if (fallback < 0)
        {
            continue;
        }
        for (int r = 0; r < NumRanges; r++)
        {
            states[from].AddTransition(new Transition(RangeLower[r], RangeUpper[r], states[fallback]));
        }
    }

    Automaton automaton = new Automaton(states[0]);
    automaton.Deterministic = true;
    // we create some useless unconnected states, and its a net-win overall to remove these,
    // as well as to combine any adjacent transitions (it makes later algorithms more efficient).
    // so, while we could set our numberedStates here, its actually best not to, and instead to
    // force a traversal in reduce, pruning the unconnected states while we combine adjacent transitions.
    //a.setNumberedStates(states);
    automaton.Reduce();
    // we need not trim transitions to dead states, as they are not created.
    //a.restoreInvariant();
    return automaton;
}
/// <summary>
/// Returns <c>true</c> if the language of <paramref name="a1"/> is a subset of the language
/// of <paramref name="a2"/>. As a side-effect, <c>Determinize()</c> is invoked on
/// <paramref name="a2"/> unless the two arguments are the same instance or
/// <paramref name="a1"/> is a singleton.
/// <para/>
/// Complexity: quadratic in number of states (each pair of states is expanded at most once).
/// </summary>
/// <param name="a1"> the automaton whose language is tested for containment </param>
/// <param name="a2"> the automaton whose language must contain that of <paramref name="a1"/> </param>
/// <returns> <c>true</c> when every string accepted by <paramref name="a1"/> is accepted by
/// <paramref name="a2"/>; otherwise <c>false</c> </returns>
public static bool SubsetOf(Automaton a1, Automaton a2)
{
    if (a1 == a2)
    {
        return true;
    }

    if (a1.IsSingleton)
    {
        return a2.IsSingleton
            ? a1.singleton.Equals(a2.singleton, StringComparison.Ordinal)
            : BasicOperations.Run(a2, a1.singleton);
    }

    a2.Determinize();
    Transition[][] sorted1 = a1.GetSortedTransitions();
    Transition[][] sorted2 = a2.GetSortedTransitions();

    // Breadth-first walk over pairs of states reached by the same input.
    LinkedList<StatePair> pending = new LinkedList<StatePair>();
    HashSet<StatePair> seen = new HashSet<StatePair>();
    StatePair start = new StatePair(a1.initial, a2.initial);
    pending.AddLast(start);
    seen.Add(start);

    while (pending.Count > 0)
    {
        StatePair pair = pending.First.Value;
        pending.RemoveFirst();

        // a1 accepts here but a2 does not: found a string outside a2's language.
        if (pair.S1.accept && !pair.S2.accept)
        {
            return false;
        }

        Transition[] out1 = sorted1[pair.S1.number];
        Transition[] out2 = sorted2[pair.S2.number];

        // firstCandidate only ever advances, since both arrays are sorted.
        int firstCandidate = 0;
        for (int i1 = 0; i1 < out1.Length; i1++)
        {
            Transition edge1 = out1[i1];

            // Skip a2 transitions that end before this a1 transition begins.
            while (firstCandidate < out2.Length && out2[firstCandidate].max < edge1.min)
            {
                firstCandidate++;
            }

            // [uncoveredMin, uncoveredMax] is the part of edge1's range that
            // no a2 transition has covered yet.
            int uncoveredMin = edge1.min;
            int uncoveredMax = edge1.max;

            for (int i2 = firstCandidate; i2 < out2.Length && edge1.max >= out2[i2].min; i2++)
            {
                Transition edge2 = out2[i2];

                // Gap before this a2 transition: some input of a1 has no match in a2.
                if (edge2.min > uncoveredMin)
                {
                    return false;
                }

                if (edge2.max < Character.MAX_CODE_POINT)
                {
                    uncoveredMin = edge2.max + 1;
                }
                else
                {
                    // Covered through the top of the code point range; make the
                    // remaining interval empty without overflowing.
                    uncoveredMin = Character.MAX_CODE_POINT;
                    uncoveredMax = Character.MIN_CODE_POINT;
                }

                StatePair next = new StatePair(edge1.to, edge2.to);
                if (seen.Add(next))
                {
                    pending.AddLast(next);
                }
            }

            // Part of edge1's range was left uncovered by a2.
            if (uncoveredMin <= uncoveredMax)
            {
                return false;
            }
        }
    }

    return true;
}