/// <summary>
/// Creates an enumerator that tokenizes the specified characters with the specified lexer.
/// </summary>
/// <param name="lexer">The state machine used to match tokens</param>
/// <param name="string">The characters to tokenize</param>
/// <param name="reportEndToken">Stored as-is; consulted elsewhere when the end of the input is reached</param>
/// <exception cref="ArgumentNullException"><paramref name="lexer"/> or <paramref name="string"/> is null</exception>
public TokenEnumerator(FA lexer, IEnumerable<char> @string, bool reportEndToken)
{
    // Fail with a descriptive exception instead of a NullReferenceException further down
    if (null == lexer)
    {
        throw new ArgumentNullException(nameof(lexer));
    }
    if (null == @string)
    {
        throw new ArgumentNullException(nameof(@string));
    }
    _lexer = lexer;
    _input = @string.GetEnumerator();
    _buffer = new StringBuilder();
    _reportEndToken = reportEndToken;
    // computed once up front so each lex can start from the same state set
    _initialStates = _lexer.FillEpsilonClosure();
    // Reset is used here to initialize the rest of the values
    Reset();
}
/// <summary>
/// Creates a copy of the expression whose start state can also jump directly to its first accepting state.
/// </summary>
/// <param name="expr">The expression to make optional; it is cloned, not modified</param>
/// <param name="accept">The accept symbol assigned to the first accepting state of the copy</param>
/// <returns>The new state machine</returns>
public static FA Optional(FA expr, int accept = -1)
{
    var machine = expr.Clone();
    var acceptState = machine.FirstAcceptingState;
    acceptState.AcceptSymbol = accept;
    // the epsilon edge from start to accept is what allows the expression to be skipped
    machine.EpsilonTransitions.Add(acceptState);
    return machine;
}
// Chains rhs onto lhs in place: the first accepting state of lhs stops
// accepting and receives an epsilon transition into rhs.
static void _Concat(FA lhs, FA rhs)
{
    var tail = lhs.FirstAcceptingState;
    tail.IsAccepting = false;
    tail.EpsilonTransitions.Add(rhs);
}
/// <summary>
/// Composes a lexer by hanging every supplied expression off a fresh start state.
/// </summary>
/// <param name="expressions">The FSMs/expressions the lexer is built from; they are referenced directly, not cloned</param>
/// <returns>A new start state with one epsilon transition per expression</returns>
public static FA ToLexer(IEnumerable<FA> expressions)
{
    var root = new FA();
    foreach (var expression in expressions)
    {
        root.EpsilonTransitions.Add(expression);
    }
    return root;
}
/// <summary>
/// Adds a transition on the specified input range, rejecting ranges that overlap an existing transition.
/// </summary>
/// <param name="range">The inclusive first/last pair of the input range</param>
/// <param name="dst">The destination state</param>
/// <exception cref="ArgumentException">The range intersects a range that already has a transition</exception>
public void AddInputTransition(KeyValuePair<int, int> range, FA dst)
{
    var overlaps = false;
    foreach (var existing in InputTransitions)
    {
        if (RangeUtility.Intersects(existing.Key, range))
        {
            overlaps = true;
            break;
        }
    }
    if (overlaps)
    {
        throw new ArgumentException("There already is a transition to a different state on at least part of the specified input range");
    }
    InputTransitions.Add(range, dst);
}
/// <summary>
/// Determines whether this state and another state are interchangeable.
/// </summary>
/// <param name="rhs">The state to compare with</param>
/// <returns>
/// True when both states agree on acceptance, epsilon transitions and input transitions
/// (and, if accepting, on the accept symbol), so that one can be removed without changing
/// the language of the machine; otherwise false. A null <paramref name="rhs"/> yields false.
/// </returns>
public bool IsDuplicate(FA rhs)
{
    if (null == rhs)
    {
        return false;
    }
    if (IsAccepting != rhs.IsAccepting)
    {
        return false;
    }
    if (!_SetComparer.Default.Equals(EpsilonTransitions, rhs.EpsilonTransitions))
    {
        return false;
    }
    if (!_SetComparer.Default.Equals(InputTransitions, rhs.InputTransitions))
    {
        return false;
    }
    // accept symbols only matter when the states actually accept
    return !IsAccepting || AcceptSymbol == rhs.AcceptSymbol;
}
/// <summary>
/// Builds a chain of states that matches the specified sequence of code points in order.
/// </summary>
/// <param name="string">The code points to match</param>
/// <param name="accept">The accept symbol given to each state created along the chain</param>
/// <returns>The start state of the chain</returns>
public static FA Literal(IEnumerable<int> @string, int accept = -1)
{
    var start = new FA();
    var tail = start;
    foreach (var codepoint in @string)
    {
        // the previous tail is no longer the end of the literal
        tail.IsAccepting = false;
        var appended = new FA(true, accept);
        tail.AddInputTransition(new KeyValuePair<int, int>(codepoint, codepoint), appended);
        tail = appended;
    }
    return start;
}
/// <summary>
/// Builds a two state machine that matches any single input covered by the specified ranges.
/// </summary>
/// <param name="ranges">The packed ranges, converted to pairs via RangeUtility.ToPairs</param>
/// <param name="accept">The accept symbol of the final state</param>
/// <returns>The start state, with one transition per normalized range to a shared final state</returns>
public static FA Set(int[] ranges, int accept = -1)
{
    var start = new FA();
    var end = new FA(true, accept);
    var sorted = new List<KeyValuePair<int, int>>(RangeUtility.ToPairs(ranges));
    // order by range start, as the normalize step below expects a sorted list
    sorted.Sort((a, b) => a.Key.CompareTo(b.Key));
    RangeUtility.NormalizeSortedRangeList(sorted);
    for (var i = 0; i < sorted.Count; ++i)
    {
        start.AddInputTransition(sorted[i], end);
    }
    return start;
}
// Attempts to step past a neutral state. On return, result is the first
// epsilon target of fa when fa is neutral and has one, otherwise fa itself.
// Returns true only if result differs from fa.
static bool _TryForwardNeutral(FA fa, out FA result)
{
    result = fa;
    if (!fa.IsNeutral)
    {
        return false;
    }
    foreach (var target in fa.EpsilonTransitions)
    {
        // only the first epsilon target is of interest
        result = target;
        break;
    }
    // false if circular
    return fa != result;
}
// Follows a chain of neutral states starting at fa and returns the state the
// chain ends on. _TryForwardNeutral only detects a state that points at
// itself, so visited states are tracked here to stop on longer cycles
// (A -> B -> A) instead of looping forever.
static FA _ForwardNeutrals(FA fa)
{
    if (null == fa)
    {
        throw new ArgumentNullException(nameof(fa));
    }
    var visited = new HashSet<FA>();
    var result = fa;
    visited.Add(result);
    FA next;
    while (_TryForwardNeutral(result, out next))
    {
        if (!visited.Add(next))
        {
            // already seen: the neutral states form a cycle, stop here
            break;
        }
        result = next;
    }
    return result;
}
/// <summary>
/// Builds a machine that matches the specified expressions one after another.
/// </summary>
/// <param name="exprs">The expressions to concatenate; null entries are skipped and every expression is cloned</param>
/// <param name="accept">The accept symbol assigned to the first accepting state of the result</param>
/// <returns>
/// The concatenated machine. When <paramref name="exprs"/> contains no non-null entries, a single
/// accepting state is returned (previously this case failed with a NullReferenceException).
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="exprs"/> is null</exception>
public static FA Concat(IEnumerable<FA> exprs, int accept = -1)
{
    if (null == exprs)
    {
        throw new ArgumentNullException(nameof(exprs));
    }
    FA result = null, left = null;
    foreach (var val in exprs)
    {
        if (null == val)
        {
            continue;
        }
        // one clone per operand is enough to keep the caller's machines untouched
        var nval = val.Clone();
        if (null == left)
        {
            // the first operand is the start of the result
            result = nval;
        }
        else
        {
            _Concat(left, nval);
        }
        left = nval;
    }
    if (null == result)
    {
        // nothing to concatenate: a lone accepting state
        return new FA(true, accept);
    }
    result.FirstAcceptingState.AcceptSymbol = accept;
    return result;
}
/// <summary>
/// Creates a copy of the expression in which every cased input range also matches its opposite case.
/// </summary>
/// <param name="expr">The expression to convert; it is cloned, not modified</param>
/// <param name="accept">Not referenced by this implementation; accept symbols of the clone are left as they are</param>
/// <returns>The new state machine</returns>
/// <exception cref="NotSupportedException">
/// A range starts on a lower (or upper) case character but does not end on a character of the same case
/// </exception>
public static FA CaseInsensitive(FA expr, int accept = -1)
{
    var result = expr.Clone();
    var closure = new List<FA>();
    result.FillClosure(closure);
    for (int ic = closure.Count, i = 0; i < ic; ++i)
    {
        var fa = closure[i];
        // work from a snapshot since the transitions are modified below
        var existing = new List<KeyValuePair<KeyValuePair<int, int>, FA>>(fa.InputTransitions);
        // ranges already present on this state. The mirrored range is frequently
        // there already (e.g. [A-Za-z]) or identical to the original when case
        // conversion is a no-op, and adding it again would be a duplicate key.
        var known = new HashSet<KeyValuePair<int, int>>();
        foreach (var trns in existing)
        {
            known.Add(trns.Key);
        }
        foreach (var trns in existing)
        {
            var first = trns.Key.Key;
            var last = trns.Key.Value;
            // char.ConvertFromUtf32 rejects surrogates and out of range values;
            // such endpoints have no case, so the original transition stands alone
            var firstIsScalar = 0 <= first && first <= 0x10ffff && (first < 0xd800 || first > 0xdfff);
            if (!firstIsScalar)
            {
                continue;
            }
            var f = char.ConvertFromUtf32(first);
            var isLower = char.IsLower(f, 0);
            var isUpper = !isLower && char.IsUpper(f, 0);
            if (!isLower && !isUpper)
            {
                continue;
            }
            var lastIsScalar = 0 <= last && last <= 0x10ffff && (last < 0xd800 || last > 0xdfff);
            if (!lastIsScalar)
            {
                throw new NotSupportedException("Attempt to make an invalid range case insensitive");
            }
            var l = char.ConvertFromUtf32(last);
            var sameCase = isLower ? char.IsLower(l, 0) : char.IsUpper(l, 0);
            if (!sameCase)
            {
                throw new NotSupportedException("Attempt to make an invalid range case insensitive");
            }
            if (isLower)
            {
                f = f.ToUpperInvariant();
                l = l.ToUpperInvariant();
            }
            else
            {
                f = f.ToLowerInvariant();
                l = l.ToLowerInvariant();
            }
            var mirrored = new KeyValuePair<int, int>(char.ConvertToUtf32(f, 0), char.ConvertToUtf32(l, 0));
            if (known.Add(mirrored))
            {
                fa.InputTransitions.Add(mirrored, trns.Value);
            }
        }
    }
    return result;
}
/// <summary>
/// Runs the DFA table over the input, starting at state 0, until no transition applies or the input ends.
/// </summary>
/// <param name="dfaTable">The DFA state table</param>
/// <param name="input">The code points to read; left positioned on the input that could not be matched, if any</param>
/// <param name="capture">Receives the text of every input that was consumed</param>
/// <param name="more">True if the run stopped on an unmatched input, false if the input was exhausted</param>
/// <returns>The accept symbol id recorded on the state where the run stopped</returns>
public static int Lex(DfaEntry[] dfaTable, IEnumerator<int> input, StringBuilder capture, out bool more)
{
    var current = 0;
    for (;;)
    {
        if (!input.MoveNext())
        {
            more = false;
            break;
        }
        more = true;
        var target = FA.Move(dfaTable, current, input.Current);
        if (-1 == target)
        {
            // no transition on this input; it is not captured
            break;
        }
        capture.Append(char.ConvertFromUtf32(input.Current));
        current = target;
    }
    return dfaTable[current].AcceptSymbolId;
}
/// <summary>
/// Produces a copy of this machine reshaped into a Generalized NFA.
/// </summary>
/// <param name="accept">The accept symbol for the resulting accepting state</param>
/// <returns>A new GNFA state machine that accepts the same language</returns>
/// <remarks>A generalized NFA has a single start state and a single accept state that is final.</remarks>
public FA ToGnfa(int accept = -1)
{
    var machine = Clone();
    var acceptStates = machine.FillAcceptingStates();
    if (acceptStates.Count <= 1)
    {
        foreach (var state in acceptStates)
        {
            state.AcceptSymbol = accept;
        }
    }
    else
    {
        // funnel every accepting state into one shared accepting state
        var sharedFinal = new FA(true, accept);
        foreach (var state in acceptStates)
        {
            state.IsAccepting = false;
            state.EpsilonTransitions.Add(sharedFinal);
        }
    }
    var tail = machine.FirstAcceptingState;
    if (!tail.IsFinal)
    {
        // the accepting state still has somewhere to go, so extend
        // the machine with a new accepting state that is final
        tail.IsAccepting = false;
        tail.EpsilonTransitions.Add(new FA(true, accept));
    }
    if (machine.IsNeutral)
    {
        return machine;
    }
    // put a neutral state in front of the machine
    var head = new FA();
    head.EpsilonTransitions.Add(machine);
    return head;
}
/// <summary>
/// Builds a machine that matches any one of the specified expressions.
/// </summary>
/// <param name="exprs">The alternatives; each is cloned. A null entry links the start state directly to the final state</param>
/// <param name="accept">The accept symbol of the shared final state</param>
/// <returns>The start state of the new machine</returns>
public static FA Or(IEnumerable<FA> exprs, int accept = -1)
{
    var start = new FA();
    var end = new FA(true, accept);
    foreach (var alternative in exprs)
    {
        if (null == alternative)
        {
            // a null alternative matches nothing; link start to end once
            if (!start.EpsilonTransitions.Contains(end))
            {
                start.EpsilonTransitions.Add(end);
            }
            continue;
        }
        var branch = alternative.Clone();
        start.EpsilonTransitions.Add(branch);
        // redirect the branch's accepting state into the shared final state
        var branchEnd = branch.FirstAcceptingState;
        branchEnd.IsAccepting = false;
        branchEnd.EpsilonTransitions.Add(end);
    }
    return start;
}
/// <summary>
/// Creates a tokenizer over the specified input using the specified lexer.
/// </summary>
/// <param name="lexer">The state machine used to match tokens</param>
/// <param name="input">The characters to tokenize</param>
/// <param name="reportEndToken">Stored as-is for use when the input is enumerated</param>
/// <exception cref="ArgumentNullException"><paramref name="lexer"/> or <paramref name="input"/> is null</exception>
public Tokenizer(FA lexer, IEnumerable<char> input, bool reportEndToken = false)
{
    // reject bad arguments here rather than failing later, far from the
    // call that supplied them
    if (null == lexer)
    {
        throw new ArgumentNullException(nameof(lexer));
    }
    if (null == input)
    {
        throw new ArgumentNullException(nameof(input));
    }
    _lexer = lexer;
    _input = input;
    _reportEndToken = reportEndToken;
}
/// <summary>
/// Matches the next token from the input. This is where the work happens.
/// </summary>
/// <returns>
/// The symbol that was matched, -1 ("#ERROR") if nothing matched, or -2 ("#EOS") once the end of the
/// stream was already reached. _state and _buffer are modified here, and the input is advanced
/// through _MoveNextInput.
/// </returns>
int _Lex()
{
    int symbol;
    var states = _initialStates;
    _buffer.Clear();
    if (-1 == _state)
    {
        // initial: prime the input
        if (!_MoveNextInput())
        {
            _state = -2;
            symbol = _GetAcceptingSymbol(states);
            return -1 < symbol ? symbol : -1; // -1 is "#ERROR"
        }
        _state = 0; // running
    }
    else if (-2 == _state)
    {
        // end of stream
        return -2; // "#EOS"
    }
    // Run the bulk of the match. FillMove runs one iteration of the NFA state machine.
    // Matching is greedy: keep going until nothing matches anymore, then report the
    // symbol of the last set of states reached, or an error if it does not accept.
    while (true)
    {
        var moved = FA.FillMove(states, _input.Current);
        if (0 == moved.Count)
        {
            // couldn't find any states
            break;
        }
        _buffer.Append(_input.Current);
        states = moved;
        if (_MoveNextInput())
        {
            continue;
        }
        // end of stream
        _state = -2;
        symbol = _GetAcceptingSymbol(states);
        return -1 < symbol ? symbol : -1; // do we accept?
    }
    symbol = _GetAcceptingSymbol(states);
    if (-1 < symbol)
    {
        return symbol;
    }
    // error condition: capture the offending input and move past it
    _buffer.Append(_input.Current);
    if (!_MoveNextInput())
    {
        _state = -2;
    }
    return -1; // "#ERROR"
}
/// <summary>
/// Parses a regular expression from the specified context into a state machine.
/// </summary>
/// <remarks>
/// Handles <c>.</c>, backslash escapes (<c>\p{..}</c>, <c>\P{..}</c>, <c>\d \D \s \S \w \W</c> and
/// whatever _ParseEscapePart accepts), groups, alternation, bracketed sets and literals (including
/// surrogate pairs). Parsing stops at the end of the input or at an unconsumed <c>)</c>.
/// </remarks>
/// <param name="pc">The context to read the expression from</param>
/// <param name="accept">The accept symbol passed to every machine that is built</param>
/// <returns>The parsed machine, or null if nothing was parsed before the expression ended</returns>
/// <exception cref="ExpectingException">
/// A high surrogate is not followed by more input, or a <c>\p</c>/<c>\P</c> escape names an unknown Unicode category
/// </exception>
internal static FA Parse(LexContext pc, int accept = -1)
{
    FA result = null, next = null;
    int ich;
    pc.EnsureStarted();
    while (true)
    {
        switch (pc.Current)
        {
            case -1:
#if MINIMIZE
                result = result.ToDfa();
                result.TrimDuplicates();
#endif
                return result;
            case '.':
                var dot = FA.Set(new int[] { 0, 0x10ffff }, accept);
                if (null == result)
                {
                    result = dot;
                }
                else
                {
                    result = FA.Concat(new FA[] { result, dot }, accept);
                }
                pc.Advance();
                result = _ParseModifier(result, pc, accept);
                break;
            case '\\':
                pc.Advance();
                pc.Expecting();
                var isNot = false;
                switch (pc.Current)
                {
                    case 'P':
                        isNot = true;
                        goto case 'p';
                    case 'p':
                        pc.Advance();
                        pc.Expecting('{');
                        var uc = new StringBuilder();
                        // remember where the category name starts for error reporting
                        int uli = pc.Line;
                        int uco = pc.Column;
                        long upo = pc.Position;
                        while (-1 != pc.Advance() && '}' != pc.Current)
                        {
                            uc.Append((char)pc.Current);
                        }
                        pc.Expecting('}');
                        pc.Advance();
                        // the indices follow the numeric values of System.Globalization.UnicodeCategory
                        int uci;
                        switch (uc.ToString())
                        {
                            case "Lu": uci = 0; break;
                            case "Ll": uci = 1; break;
                            case "Lt": uci = 2; break;
                            case "Lm": uci = 3; break;
                            case "Lo": uci = 4; break;
                            case "Mn": uci = 5; break;
                            case "Mc": uci = 6; break;
                            case "Me": uci = 7; break;
                            case "Nd": uci = 8; break;
                            case "Nl": uci = 9; break;
                            case "No": uci = 10; break;
                            case "Zs": uci = 11; break;
                            case "Zl": uci = 12; break;
                            case "Zp": uci = 13; break;
                            case "Cc": uci = 14; break;
                            case "Cf": uci = 15; break;
                            case "Cs": uci = 16; break;
                            case "Co": uci = 17; break;
                            // ConnectorPunctuation is 18; it used to share 19 with Pd
                            case "Pc": uci = 18; break;
                            case "Pd": uci = 19; break;
                            case "Ps": uci = 20; break;
                            case "Pe": uci = 21; break;
                            case "Pi": uci = 22; break;
                            case "Pf": uci = 23; break;
                            case "Po": uci = 24; break;
                            case "Sm": uci = 25; break;
                            case "Sc": uci = 26; break;
                            case "Sk": uci = 27; break;
                            case "So": uci = 28; break;
                            case "Cn": uci = 29; break;
                            default:
                                // an unknown name used to be treated as "Lu" silently
                                throw new ExpectingException("Unrecognized Unicode category \"" + uc.ToString() + "\"", uli, uco, upo, pc.FileOrUrl, "unicode-category");
                        }
                        // \P is the negated form, \p the plain one
                        if (isNot)
                        {
                            next = FA.Set(CharacterClasses.NotUnicodeCategories[uci], accept);
                        }
                        else
                        {
                            next = FA.Set(CharacterClasses.UnicodeCategories[uci], accept);
                        }
                        break;
                    case 'd':
                        next = FA.Set(CharacterClasses.digit, accept);
                        pc.Advance();
                        break;
                    case 'D':
                        next = FA.Set(RangeUtility.NotRanges(CharacterClasses.digit), accept);
                        pc.Advance();
                        break;
                    case 's':
                        next = FA.Set(CharacterClasses.space, accept);
                        pc.Advance();
                        break;
                    case 'S':
                        next = FA.Set(RangeUtility.NotRanges(CharacterClasses.space), accept);
                        pc.Advance();
                        break;
                    case 'w':
                        next = FA.Set(CharacterClasses.word, accept);
                        pc.Advance();
                        break;
                    case 'W':
                        next = FA.Set(RangeUtility.NotRanges(CharacterClasses.word), accept);
                        pc.Advance();
                        break;
                    default:
                        if (-1 != (ich = _ParseEscapePart(pc)))
                        {
                            next = FA.Literal(new int[] { ich }, accept);
                        }
                        else
                        {
                            pc.Expecting(); // throw an error
                            return null; // doesn't execute
                        }
                        break;
                }
                next = _ParseModifier(next, pc, accept);
                if (null != result)
                {
                    result = FA.Concat(new FA[] { result, next }, accept);
                }
                else
                {
                    result = next;
                }
                break;
            case ')':
                // left unconsumed so the caller that opened the group can verify it
#if MINIMIZE
                result = result.ToDfa();
                result.TrimDuplicates();
#endif
                return result;
            case '(':
                pc.Advance();
                pc.Expecting();
                next = Parse(pc, accept);
                pc.Expecting(')');
                pc.Advance();
                next = _ParseModifier(next, pc, accept);
                if (null == result)
                {
                    result = next;
                }
                else
                {
                    result = FA.Concat(new FA[] { result, next }, accept);
                }
                break;
            case '|':
                if (-1 != pc.Advance())
                {
                    next = Parse(pc, accept);
                    result = FA.Or(new FA[] { result, next }, accept);
                }
                else
                {
                    // a trailing '|' makes everything before it optional
                    result = FA.Optional(result, accept);
                }
                break;
            case '[':
                var seti = _ParseSet(pc);
                var set = seti.Value;
                if (seti.Key)
                {
                    // the key flags a negated set
                    set = RangeUtility.NotRanges(set);
                }
                next = FA.Set(set, accept);
                next = _ParseModifier(next, pc, accept);
                if (null == result)
                {
                    result = next;
                }
                else
                {
                    result = FA.Concat(new FA[] { result, next }, accept);
                }
                break;
            default:
                ich = pc.Current;
                if (char.IsHighSurrogate((char)ich))
                {
                    if (-1 == pc.Advance())
                    {
                        throw new ExpectingException("Expecting low surrogate in Unicode stream", pc.Line, pc.Column, pc.Position, pc.FileOrUrl, "low-surrogate");
                    }
                    // combine the pair into a single code point
                    ich = char.ConvertToUtf32((char)ich, (char)pc.Current);
                }
                next = FA.Literal(new int[] { ich }, accept);
                pc.Advance();
                next = _ParseModifier(next, pc, accept);
                if (null == result)
                {
                    result = next;
                }
                else
                {
                    result = FA.Concat(new FA[] { result, next }, accept);
                }
                break;
        }
    }
}
/// <summary>
/// Returns a DFA table that can be used to lex or match
/// </summary>
/// <param name="symbolTable">The symbol table to use, or null to just implicitly tag symbols with integer ids</param>
/// <param name="progress">An optional progress sink, passed on to the DFA conversion and duplicate trimming</param>
/// <returns>A DFA table that can be used to efficiently match or lex input</returns>
/// <exception cref="InvalidOperationException">
/// No symbol table was given and an accepting state carries a negative accept symbol
/// </exception>
public DfaEntry[] ToDfaStateTable(IList<int> symbolTable = null, IProgress<FAProgress> progress = null)
{
    // only convert to a DFA if we haven't already
    // ToDfa() already checks but it always copies
    // the state information so this performs better
    FA dfa;
    if (!IsDfa)
    {
        dfa = ToDfa(progress);
        dfa.TrimDuplicates(progress);
    }
    else
    {
        dfa = this;
    }
    var closure = new List<FA>();
    dfa.FillClosure(closure);
    // state -> index of its first occurrence in the closure, so destinations can be
    // resolved without a linear IndexOf() search per transition
    var stateIndices = new Dictionary<FA, int>(closure.Count);
    for (var si = 0; si < closure.Count; ++si)
    {
        if (!stateIndices.ContainsKey(closure[si]))
        {
            stateIndices.Add(closure[si], si);
        }
    }
    var symbolLookup = new Dictionary<int, int>();
    if (null == symbolTable)
    {
        // no symbol table: number the accept symbols in the order
        // they are first encountered in the closure
        var nextId = 0;
        for (int jc = closure.Count, j = 0; j < jc; ++j)
        {
            var fa = closure[j];
            if (fa.IsAccepting && !symbolLookup.ContainsKey(fa.AcceptSymbol))
            {
                if (0 > fa.AcceptSymbol)
                {
                    // report the offending state's index, not the state count
                    throw new InvalidOperationException("An accept symbol was never specified for state q" + j.ToString());
                }
                symbolLookup.Add(fa.AcceptSymbol, nextId);
                ++nextId;
            }
        }
    }
    else // build the symbol lookup from the symbol table
    {
        for (int ic = symbolTable.Count, i = 0; i < ic; ++i)
        {
            symbolLookup.Add(symbolTable[i], i);
        }
    }
    // build the root array
    var result = new DfaEntry[closure.Count];
    for (var i = 0; i < result.Length; i++)
    {
        var fa = closure[i];
#if DEBUG
        if (fa.IsAccepting)
        {
            System.Diagnostics.Debug.Assert(-1 < fa.AcceptSymbol, "Illegal accept symbol " + fa.AcceptSymbol.ToString() + " was found on state state q" + i.ToString());
        }
#endif
        // get all the transition ranges for each destination state
        var trgs = fa.FillInputTransitionRangesGroupedByState();
        // make a new transition entry array for our DFA state table
        var trns = new DfaTransitionEntry[trgs.Count];
        var j = 0;
        // for each transition range
        foreach (var trg in trgs)
        {
            // -1 mirrors what IndexOf() reports for a state that is not in the closure
            int destination;
            if (null == trg.Key || !stateIndices.TryGetValue(trg.Key, out destination))
            {
                destination = -1;
            }
            // add the transition entry using the packed ranges
            trns[j] = new DfaTransitionEntry(trg.Value, destination);
            ++j;
        }
        // now add the state entry for the state above
#if DEBUG
        if (fa.IsAccepting && !symbolLookup.ContainsKey(fa.AcceptSymbol))
        {
            // best effort diagnostic dump; failures to render are deliberately ignored
            try
            {
                dfa.RenderToFile(@"dfastatetable_crashdump_dfa.jpg");
            }
            catch
            {
            }
            System.Diagnostics.Debug.Assert(false, "The symbol table did not contain an entry for state q" + i.ToString());
        }
#endif
        result[i] = new DfaEntry(fa.IsAccepting ? symbolLookup[fa.AcceptSymbol] : -1, trns);
    }
    return result;
}
// Builds a new machine from fa in which each state stands for a set of fa's states
// (one new state per distinct set reachable on some input). Sets are compared with
// _SetComparer.Default. If progress is non-null it receives a DfaTransform report at
// the start, after every set that is processed and after every state visited during
// the final clean up pass.
static FA _Determinize(FA fa, IProgress <FAProgress> progress = null) {
	if (null != progress) {
		progress.Report(new FAProgress(FAStatus.DfaTransform, 0));
	}
	// collect the points at which the behavior of the machine can change:
	// zero, the start of every input range, and one past the end of every range
	var p = new HashSet <int>();
	var closure = new List <FA>();
	fa.FillClosure(closure);
	for (int ic = closure.Count, i = 0; i < ic; ++i) {
		var ffa = closure[i];
		p.Add(0);
		foreach (var t in ffa.InputTransitions) {
			p.Add(t.Key.Key);
			// nothing lies past 0x10ffff, so no point is added there
			if (t.Key.Value < 0x10ffff) {
				p.Add((t.Key.Value + 1));
			}
		}
	}
	var points = new int[p.Count];
	p.CopyTo(points, 0);
	Array.Sort(points);
	var comparer = _SetComparer.Default;
	// sets: every state set seen so far; working: sets still to be processed;
	// dfaMap: the new state created for each set
	var sets = new Dictionary <HashSet <FA>, HashSet <FA> >(comparer);
	var working = new Queue <HashSet <FA> >();
	var dfaMap = new Dictionary <HashSet <FA>, FA>(comparer);
	// the first set is seeded from fa through FillEpsilonClosure
	var initial = new HashSet <FA>();
	fa.FillEpsilonClosure(initial);
	sets.Add(initial, initial);
	working.Enqueue(initial);
	var result = new FA();
	// the new state accepts if any member accepts; the first accepting
	// member encountered supplies the accept symbol
	foreach (var afa in initial) {
		if (afa.IsAccepting) {
			result.IsAccepting = true;
			result.AcceptSymbol = afa.AcceptSymbol;
			break;
		}
	}
	dfaMap.Add(initial, result);
	// j is the running counter handed to the progress reports
	var j = 1;
	while (working.Count > 0) {
		var s = working.Dequeue();
		var ecs = FillEpsilonClosure(s);
		FA dfa;
		dfaMap.TryGetValue(s, out dfa);
		foreach (FA q in ecs) {
			if (q.IsAccepting) {
				dfa.IsAccepting = true;
				dfa.AcceptSymbol = q.AcceptSymbol;
				break;
			}
		}
		for (var i = 0; i < points.Length; i++) {
			// gather everything reachable from this set on input points[i]
			var set = new HashSet <FA>();
			foreach (FA c in ecs) {
				foreach (var trns in c.InputTransitions) {
					if (trns.Key.Key <= points[i] && points[i] <= trns.Key.Value) {
						foreach (var efa in trns.Value.FillEpsilonClosure()) {
							set.Add(efa);
						}
					}
				}
			}
			// a set not seen before gets its own new state and is queued for processing
			// (this includes the empty set; see the clean up pass below)
			if (!sets.ContainsKey(set)) {
				sets.Add(set, set);
				working.Enqueue(set);
				dfaMap.Add(set, new FA());
			}
			FA dst;
			dfaMap.TryGetValue(set, out dst);
			// the transition covers everything from this point up to just before
			// the next point, or up to 0x10ffff for the last point
			int first = points[i];
			int last;
			if (i + 1 < points.Length) {
				last = (points[i + 1] - 1);
			} else {
				last = 0x10ffff;
			}
			dfa.InputTransitions.Add(new KeyValuePair <int, int>(first, last), dst);
		}
		if (null != progress) {
			progress.Report(new FAProgress(FAStatus.DfaTransform, j));
		}
		++j;
	}
	// remove dead transitions
	// (those whose destination has no FirstAcceptingState)
	foreach (var ffa in result.FillClosure()) {
		// iterate over a copy because entries are removed from the original
		var itrns = new List <KeyValuePair <KeyValuePair <int, int>, FA> >(ffa.InputTransitions);
		foreach (var trns in itrns) {
			if (null == trns.Value.FirstAcceptingState) {
				ffa.InputTransitions.Remove(trns.Key);
			}
		}
		if (null != progress) {
			progress.Report(new FAProgress(FAStatus.DfaTransform, j));
		}
		++j;
	}
	return(result);
}
/// <summary>
/// Builds a machine that matches the specified expression repeated between a minimum and a maximum number of times.
/// </summary>
/// <param name="expr">The expression to repeat; it is cloned, not modified</param>
/// <param name="minOccurs">The minimum number of occurrences; -1 and 0 both mean no minimum</param>
/// <param name="maxOccurs">The maximum number of occurrences; -1 and 0 both mean no maximum</param>
/// <param name="accept">The accept symbol; not applied when both bounds are exactly 1, where the clone is returned as is</param>
/// <returns>The new state machine</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// Both bounds are positive and the minimum exceeds the maximum, or the maximum is 1 while the minimum is neither -1, 0 nor 1
/// </exception>
public static FA Repeat(FA expr, int minOccurs = -1, int maxOccurs = -1, int accept = -1)
{
    expr = expr.Clone();
    if (minOccurs > 0 && maxOccurs > 0 && minOccurs > maxOccurs)
    {
        throw new ArgumentOutOfRangeException(nameof(maxOccurs));
    }
    var noMinimum = -1 == minOccurs || 0 == minOccurs;
    var noMaximum = -1 == maxOccurs || 0 == maxOccurs;

    if (noMinimum)
    {
        if (noMaximum)
        {
            // zero or more: loop back from the end to the start, and allow skipping entirely
            var start = new FA();
            var end = new FA(true, accept);
            end.EpsilonTransitions.Add(start);
            foreach (var accepting in expr.FillAcceptingStates())
            {
                accepting.IsAccepting = false;
                accepting.EpsilonTransitions.Add(end);
            }
            start.EpsilonTransitions.Add(expr);
            start.EpsilonTransitions.Add(end);
            return start;
        }
        if (1 == maxOccurs)
        {
            return Optional(expr, accept);
        }
        // up to maxOccurs: a run of optional copies
        var optionals = new List<FA>();
        expr = Optional(expr);
        optionals.Add(expr);
        for (var n = 1; n < maxOccurs; ++n)
        {
            optionals.Add(expr.Clone());
        }
        return Concat(optionals, accept);
    }

    if (1 == minOccurs)
    {
        if (noMaximum)
        {
            // one or more: like zero or more, but without the shortcut past the expression
            var start = new FA();
            var end = new FA(true, accept);
            end.EpsilonTransitions.Add(start);
            foreach (var accepting in expr.FillAcceptingStates())
            {
                accepting.IsAccepting = false;
                accepting.EpsilonTransitions.Add(end);
            }
            start.EpsilonTransitions.Add(expr);
            return start;
        }
        if (1 == maxOccurs)
        {
            return expr;
        }
        // one required copy followed by up to maxOccurs - 1 optional ones
        return Concat(new FA[] { expr, Repeat(expr.Clone(), 0, maxOccurs - 1) }, accept);
    }

    if (noMaximum)
    {
        // exactly minOccurs copies followed by zero or more
        return Concat(new FA[] { Repeat(expr, minOccurs, minOccurs, accept), Repeat(expr, 0, 0, accept) }, accept);
    }
    if (1 == maxOccurs)
    {
        throw new ArgumentOutOfRangeException(nameof(maxOccurs));
    }
    if (minOccurs == maxOccurs)
    {
        // exactly minOccurs copies
        var copies = new List<FA>();
        copies.Add(expr);
        for (var n = 1; n < minOccurs; ++n)
        {
            copies.Add(expr.Clone());
        }
        return Concat(copies, accept);
    }
    // minOccurs required copies followed by the remaining optional ones
    return Concat(new FA[] { Repeat(expr.Clone(), minOccurs, minOccurs, accept), Repeat(Optional(expr.Clone()), maxOccurs - minOccurs, maxOccurs - minOccurs, accept) }, accept);
}