/// <summary>
/// Builds a utf8 <see cref="Automaton"/> that is equivalent to the given
/// utf32 <see cref="Automaton"/>. The input does not have to be
/// deterministic. The result is flagged as non-deterministic and will not
/// in general be deterministic, so callers that need a deterministic
/// automaton must determinize it themselves.
/// </summary>
/// <param name="utf32">The utf32 automaton to translate. A singleton automaton is expanded first.</param>
/// <returns>The newly built utf8 automaton.</returns>
public Automaton Convert(Automaton utf32)
{
    if (utf32.IsSingleton)
    {
        utf32 = utf32.CloneExpanded();
    }

    // Indexed by utf32 state number; a null slot means that state has not been reached yet.
    State[] map = new State[utf32.GetNumberedStates().Length];

    // Used as a stack: states are pushed at the end and popped from the end.
    JCG.List<State> pending = new JCG.List<State>();

    State sourceRoot = utf32.GetInitialState();
    pending.Add(sourceRoot);

    Automaton utf8 = new Automaton();
    utf8.IsDeterministic = false;
    State targetRoot = utf8.GetInitialState();

    // Reset the utf8 state bookkeeping and register the initial state as number 0.
    utf8States = new State[5];
    utf8StateCount = 0;
    targetRoot.number = utf8StateCount;
    utf8States[utf8StateCount] = targetRoot;
    utf8StateCount++;

    targetRoot.Accept = sourceRoot.Accept;
    map[sourceRoot.number] = targetRoot;

    while (pending.Count > 0)
    {
        int top = pending.Count - 1;
        State source = pending[top];
        pending.RemoveAt(top);

        State target = map[source.number];

        for (int edge = 0; edge < source.numTransitions; edge++)
        {
            Transition t = source.TransitionsArray[edge];
            State nextSource = t.to;
            State nextTarget = map[nextSource.number];

            if (nextTarget == null)
            {
                // First visit of this utf32 state: create its utf8 counterpart
                // and schedule the utf32 state for processing.
                nextTarget = NewUTF8State();
                nextTarget.accept = nextSource.accept;
                map[nextSource.number] = nextTarget;
                pending.Add(nextSource);
            }

            ConvertOneEdge(target, nextTarget, t.min, t.max);
        }
    }

    utf8.SetNumberedStates(utf8States, utf8StateCount);
    return utf8;
}
/// <summary>
/// Determinizes the given automaton in place using subset construction.
/// <para/>
/// Worst case complexity: exponential in number of states.
/// </summary>
/// <param name="a">
/// The automaton to determinize. Nothing happens if it is already
/// deterministic or is a singleton. Otherwise its initial state and
/// numbered states are replaced and it is marked deterministic.
/// </param>
public static void Determinize(Automaton a)
{
    if (a.IsDeterministic || a.IsSingleton)
    {
        return;
    }

    State[] sourceStates = a.GetNumberedStates();

    // Subset construction: seed with the set holding only the old initial state,
    // mapped to a brand-new initial state.
    bool startAccepts = a.initial.accept;
    int startNumber = a.initial.number;
    a.initial = new State();
    SortedInt32Set.FrozenInt32Set startSet = new SortedInt32Set.FrozenInt32Set(startNumber, a.initial);

    // LUCENENET specific - Queue is much more performant than LinkedList
    Queue<SortedInt32Set.FrozenInt32Set> worklist = new Queue<SortedInt32Set.FrozenInt32Set>();
    IDictionary<SortedInt32Set.FrozenInt32Set, State> subsetToState = new Dictionary<SortedInt32Set.FrozenInt32Set, State>();

    worklist.Enqueue(startSet);

    a.initial.accept = startAccepts;
    subsetToState[startSet] = a.initial;

    // Growable array of the states created so far; the new initial state is number 0.
    State[] builtStates = new State[5];
    int builtCount = 0;
    builtStates[builtCount] = a.initial;
    a.initial.number = builtCount;
    builtCount++;

    // like Set<Integer,PointTransitions>
    PointTransitionSet points = new PointTransitionSet();

    // like SortedMap<Integer,Integer>
    SortedInt32Set statesSet = new SortedInt32Set(5);

    while (worklist.Count > 0)
    {
        SortedInt32Set.FrozenInt32Set current = worklist.Dequeue();

        // Collate all outgoing transitions by min/1+max:
        for (int m = 0; m < current.values.Length; m++)
        {
            State member = sourceStates[current.values[m]];
            for (int k = 0; k < member.numTransitions; k++)
            {
                points.Add(member.TransitionsArray[k]);
            }
        }

        if (points.count == 0)
        {
            // No outgoing transitions -- skip it
            continue;
        }

        points.Sort();

        int lastPoint = -1;
        int accCount = 0;
        State origin = current.state;

        for (int i = 0; i < points.count; i++)
        {
            int point = points.points[i].point;

            if (statesSet.upto > 0)
            {
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(lastPoint != -1);
                }

                statesSet.ComputeHash();

                bool alreadyBuilt = newstateLookup(subsetToState, statesSet, out State target);
                if (alreadyBuilt)
                {
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert((accCount > 0) == target.accept, "accCount={0} vs existing accept={1} states={2}", accCount, target.accept, statesSet);
                    }
                }
                else
                {
                    // Unseen subset: create its state, queue it, and record it.
                    target = new State();
                    SortedInt32Set.FrozenInt32Set frozen = statesSet.Freeze(target);
                    worklist.Enqueue(frozen);

                    if (builtCount == builtStates.Length)
                    {
                        // LUCENENET: Resize rather than copy
                        Array.Resize(ref builtStates, ArrayUtil.Oversize(1 + builtCount, RamUsageEstimator.NUM_BYTES_OBJECT_REF));
                    }

                    builtStates[builtCount] = target;
                    target.number = builtCount;
                    builtCount++;
                    target.accept = accCount > 0;
                    subsetToState[frozen] = target;
                }

                // The interval since the previous point leads to the state for the current subset.
                origin.AddTransition(new Transition(lastPoint, point - 1, target));
            }

            // process transitions that end on this point
            // (closes an overlapping interval)
            Transition[] closing = points.points[i].ends.transitions;
            int closingCount = points.points[i].ends.count;
            for (int j = 0; j < closingCount; j++)
            {
                Transition t = closing[j];
                int num = t.to.number;
                statesSet.Decr(num);
                if (t.to.accept)
                {
                    accCount--;
                }
            }
            points.points[i].ends.count = 0;

            // process transitions that start on this point
            // (opens a new interval)
            Transition[] opening = points.points[i].starts.transitions;
            int openingCount = points.points[i].starts.count;
            for (int j = 0; j < openingCount; j++)
            {
                Transition t = opening[j];
                int num = t.to.number;
                statesSet.Incr(num);
                if (t.to.accept)
                {
                    accCount++;
                }
            }
            lastPoint = point;
            points.points[i].starts.count = 0;
        }

        points.Reset();

        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(statesSet.upto == 0, "upto={0}", statesSet.upto);
        }
    }

    a.deterministic = true;
    a.SetNumberedStates(builtStates, builtCount);

    // Looks up the state already created for the current contents of the working set.
    // Returns true only when an entry exists and is non-null.
    bool newstateLookup(IDictionary<SortedInt32Set.FrozenInt32Set, State> known, SortedInt32Set working, out State found)
    {
        return known.TryGetValue(working.ToFrozenInt32Set(), out found) && found != null;
    }
}
/// <summary>
/// Determinizes the given automaton in place using subset construction.
/// <para/>
/// Worst case complexity: exponential in number of states.
/// </summary>
/// <param name="a">
/// The automaton to determinize. Nothing happens if it is already
/// deterministic or is a singleton. Otherwise its initial state and
/// numbered states are replaced and it is marked deterministic.
/// </param>
public static void Determinize(Automaton a)
{
    if (a.IsDeterministic || a.IsSingleton)
    {
        return;
    }

    State[] allStates = a.GetNumberedStates();

    // subset construction
    bool initAccept = a.initial.accept;
    int initNumber = a.initial.number;
    a.initial = new State();
    SortedInt32Set.FrozenInt32Set initialset = new SortedInt32Set.FrozenInt32Set(initNumber, a.initial);

    LinkedList<SortedInt32Set.FrozenInt32Set> worklist = new LinkedList<SortedInt32Set.FrozenInt32Set>();
    IDictionary<SortedInt32Set.FrozenInt32Set, State> newstate = new Dictionary<SortedInt32Set.FrozenInt32Set, State>();

    worklist.AddLast(initialset);

    a.initial.accept = initAccept;
    newstate[initialset] = a.initial;

    // Growable array of the states created so far; the new initial state is number 0.
    int newStateUpto = 0;
    State[] newStatesArray = new State[5];
    newStatesArray[newStateUpto] = a.initial;
    a.initial.number = newStateUpto;
    newStateUpto++;

    // like Set<Integer,PointTransitions>
    PointTransitionSet points = new PointTransitionSet();

    // like SortedMap<Integer,Integer>
    SortedInt32Set statesSet = new SortedInt32Set(5);

    // LUCENENET NOTE: The problem here is almost certainly
    // due to the conversion to FrozenIntSet along with its
    // differing equality checking.

    while (worklist.Count > 0)
    {
        // Pop the head node directly. Remove(value) would search the list by
        // value equality, which makes dequeuing depend on FrozenInt32Set.Equals.
        SortedInt32Set.FrozenInt32Set s = worklist.First.Value;
        worklist.RemoveFirst();

        // Collate all outgoing transitions by min/1+max:
        for (int i = 0; i < s.values.Length; i++)
        {
            State s0 = allStates[s.values[i]];
            for (int j = 0; j < s0.numTransitions; j++)
            {
                points.Add(s0.TransitionsArray[j]);
            }
        }

        if (points.count == 0)
        {
            // No outgoing transitions -- skip it
            continue;
        }

        points.Sort();

        int lastPoint = -1;
        int accCount = 0;

        State r = s.state;
        for (int i = 0; i < points.count; i++)
        {
            int point = points.points[i].point;

            if (statesSet.upto > 0)
            {
                Debug.Assert(lastPoint != -1);

                statesSet.ComputeHash();

                State q;
                if (!newstate.TryGetValue(statesSet.ToFrozenInt32Set(), out q) || q == null)
                {
                    // Unseen subset: create its state, queue it, and record it.
                    q = new State();
                    SortedInt32Set.FrozenInt32Set p = statesSet.Freeze(q);
                    worklist.AddLast(p);
                    if (newStateUpto == newStatesArray.Length)
                    {
                        // Grow in place; Array.Resize copies the existing elements.
                        Array.Resize(ref newStatesArray, ArrayUtil.Oversize(1 + newStateUpto, RamUsageEstimator.NUM_BYTES_OBJECT_REF));
                    }
                    newStatesArray[newStateUpto] = q;
                    q.number = newStateUpto;
                    newStateUpto++;
                    q.accept = accCount > 0;
                    newstate[p] = q;
                }
                else
                {
                    Debug.Assert((accCount > 0) == q.accept, "accCount=" + accCount + " vs existing accept=" + q.accept + " states=" + statesSet);
                }

                // The interval since the previous point leads to the state for the current subset.
                r.AddTransition(new Transition(lastPoint, point - 1, q));
            }

            // process transitions that end on this point
            // (closes an overlapping interval)
            Transition[] transitions = points.points[i].ends.transitions;
            int limit = points.points[i].ends.count;
            for (int j = 0; j < limit; j++)
            {
                Transition t = transitions[j];
                int num = t.to.number;
                statesSet.Decr(num);
                accCount -= t.to.accept ? 1 : 0;
            }
            points.points[i].ends.count = 0;

            // process transitions that start on this point
            // (opens a new interval)
            transitions = points.points[i].starts.transitions;
            limit = points.points[i].starts.count;
            for (int j = 0; j < limit; j++)
            {
                Transition t = transitions[j];
                int num = t.to.number;
                statesSet.Incr(num);
                accCount += t.to.accept ? 1 : 0;
            }
            lastPoint = point;
            points.points[i].starts.count = 0;
        }

        points.Reset();
        Debug.Assert(statesSet.upto == 0, "upto=" + statesSet.upto);
    }

    a.deterministic = true;
    a.SetNumberedStates(newStatesArray, newStateUpto);
}
/// <summary>
/// Translates a utf32 automaton into an equivalent utf8 automaton. The
/// input is allowed to be non-deterministic. The output is flagged as
/// non-deterministic and will not in general be deterministic, so
/// determinize it afterwards if that is required.
/// </summary>
/// <param name="utf32">The utf32 automaton to translate. A singleton automaton is expanded first.</param>
/// <returns>The newly built utf8 automaton.</returns>
public Automaton Convert(Automaton utf32)
{
    if (utf32.IsSingleton)
    {
        utf32 = utf32.CloneExpanded();
    }

    // One slot per utf32 state number; null until the matching utf8 state exists.
    State[] map = new State[utf32.NumberedStates.Length];

    // Work stack of utf32 states whose outgoing edges still need converting.
    List<State> pending = new List<State>();

    State firstUtf32 = utf32.InitialState;
    pending.Add(firstUtf32);

    Automaton utf8 = new Automaton { Deterministic = false };
    State firstUtf8 = utf8.InitialState;

    // Start the utf8 state table over, with the initial state in slot 0.
    Utf8States = new State[5];
    Utf8StateCount = 0;
    firstUtf8.number = Utf8StateCount;
    Utf8States[Utf8StateCount] = firstUtf8;
    Utf8StateCount++;

    firstUtf8.Accept = firstUtf32.Accept;
    map[firstUtf32.number] = firstUtf8;

    while (pending.Count > 0)
    {
        int lastIndex = pending.Count - 1;
        State fromUtf32 = pending[lastIndex];
        pending.RemoveAt(lastIndex);

        State fromUtf8 = map[fromUtf32.number];

        for (int n = 0; n < fromUtf32.numTransitions; n++)
        {
            Transition t = fromUtf32.TransitionsArray[n];
            State toUtf32 = t.To;
            State toUtf8 = map[toUtf32.number];

            if (toUtf8 == null)
            {
                // Destination not seen before: allocate its utf8 twin and
                // push the utf32 state so its own edges get converted later.
                toUtf8 = NewUTF8State();
                toUtf8.accept = toUtf32.accept;
                map[toUtf32.number] = toUtf8;
                pending.Add(toUtf32);
            }

            ConvertOneEdge(fromUtf8, toUtf8, t.Min_Renamed, t.Max_Renamed);
        }
    }

    utf8.SetNumberedStates(Utf8States, Utf8StateCount);
    return utf8;
}