/// <summary>
/// Scans forward through the input for the next run of one or more ASCII letters (A-Z, a-z).
/// </summary>
/// <param name="context">The parse context to read from</param>
/// <returns>A match carrying the line, column and position where the run began plus the text captured for it, or null if the input ended before any letter was found</returns>
internal static RE.CharFAMatch Match(RE.ParseContext context)
{
    context.EnsureStarted();
    // candidate start of the match; re-read every time a non-matching character is skipped
    int startLine = context.Line;
    int startColumn = context.Column;
    long startPosition = context.Position;
    int captureStart = context.CaptureBuffer.Length;
    bool matched = false;
    while (!matched && -1 != context.Current)
    {
        if ((context.Current >= 'A' && context.Current <= 'Z') || (context.Current >= 'a' && context.Current <= 'z'))
        {
            // consume the whole run of letters
            do
            {
                context.CaptureCurrent();
                context.Advance();
            }
            while ((context.Current >= 'A' && context.Current <= 'Z') || (context.Current >= 'a' && context.Current <= 'z'));
            matched = true;
        }
        else
        {
            // not a letter: skip it without capturing and restart the candidate from here
            context.Advance();
            startLine = context.Line;
            startColumn = context.Column;
            startPosition = context.Position;
            captureStart = context.CaptureBuffer.Length;
        }
    }
    if (!matched)
    {
        return null;
    }
    return new RE.CharFAMatch(startLine, startColumn, startPosition, context.GetCapture(captureStart));
}
/// <summary>
/// Runs this state machine over the input in <paramref name="context"/> until it can go no further and reports what was matched.
/// </summary>
/// <param name="context">The <see cref="ParseContext"/> supplying the input; consumed characters are captured into it</param>
/// <param name="errorSymbol">The symbol returned when the input does not end in an accepting state</param>
/// <returns>The accept symbol of the state set reached, or <paramref name="errorSymbol"/> if that set does not accept</returns>
public TAccept Lex(ParseContext context, TAccept errorSymbol = default(TAccept))
{
    TAccept accepted;
    // start from the epsilon closure of this state
    var current = FillEpsilonClosure();
    context.EnsureStarted();
    for (; ; )
    {
        if (-1 != context.Current)
        {
            var next = FillMove(current, (char)context.Current);
            if (0 != next.Count)
            {
                // the character is part of the match: keep it and follow the transition
                context.CaptureCurrent();
                context.Advance();
                current = next;
                continue;
            }
            // no transition on this character: accept what we have if possible
            if (TryGetAnyAcceptSymbol(current, out accepted))
            {
                return accepted;
            }
            // otherwise consume the offending character as part of the error
            context.CaptureCurrent();
            context.Advance();
            return errorSymbol;
        }
        // end of input: accept if the current states allow it
        return TryGetAnyAcceptSymbol(current, out accepted) ? accepted : errorSymbol;
    }
}
/// <summary>
/// Runs a table-driven DFA over the input in <paramref name="context"/> until it can go no further and reports what was matched.
/// </summary>
/// <param name="dfaTable">The DFA state table to walk, starting at entry 0</param>
/// <param name="context">The <see cref="ParseContext"/> supplying the input; consumed characters are captured into it</param>
/// <param name="errorSymbol">The symbol id returned when the input does not end in an accepting state</param>
/// <returns>The accept symbol id of the state reached, or <paramref name="errorSymbol"/> if that state does not accept</returns>
public static int LexDfa(CharDfaEntry[] dfaTable, ParseContext context, int errorSymbol = -1)
{
    int current = 0;
    context.EnsureStarted();
    while (-1 != context.Current)
    {
        int next = MoveDfa(dfaTable, current, (char)context.Current);
        if (-1 == next)
        {
            // no transition on this character: accept what we have if possible
            int acceptId = dfaTable[current].AcceptSymbolId;
            if (-1 != acceptId)
            {
                return acceptId;
            }
            // otherwise consume the offending character as part of the error
            context.CaptureCurrent();
            context.Advance();
            return errorSymbol;
        }
        // the character is part of the match: keep it and follow the transition
        context.CaptureCurrent();
        context.Advance();
        current = next;
    }
    // end of input: accept if the final state allows it
    int finalId = dfaTable[current].AcceptSymbolId;
    return -1 != finalId ? finalId : errorSymbol;
}
/// <summary>
/// Runs this state machine, treated as a DFA, over the input in <paramref name="context"/> until it can go no further and reports what was matched.
/// </summary>
/// <param name="context">The <see cref="ParseContext"/> supplying the input; consumed characters are captured into it</param>
/// <param name="errorSymbol">The symbol returned when the input does not end in an accepting state</param>
/// <returns>The accept symbol of the state reached, or <paramref name="errorSymbol"/> if that state does not accept</returns>
/// <remarks>This method will not work properly on an NFA but will not error in that case, so take care to only use this with a DFA</remarks>
public TAccept LexDfa(ParseContext context, TAccept errorSymbol = default(TAccept))
{
    var current = this;
    context.EnsureStarted();
    while (-1 != context.Current)
    {
        var next = current.MoveDfa((char)context.Current);
        if (null == next)
        {
            // no transition on this character: accept what we have if possible
            if (current.IsAccepting)
            {
                return current.AcceptSymbol;
            }
            // otherwise consume the offending character as part of the error
            context.CaptureCurrent();
            context.Advance();
            return errorSymbol;
        }
        // the character is part of the match: keep it and follow the transition
        context.CaptureCurrent();
        context.Advance();
        current = next;
    }
    // end of input: accept if the final state allows it
    return current.IsAccepting ? current.AcceptSymbol : errorSymbol;
}
/// <summary>
/// Handles a character class escape (such as \d or \w) found inside a bracketed set.
/// </summary>
/// <param name="pc">The parse context, positioned on the escape letter; it is advanced past that letter</param>
/// <param name="cls">The character class name to add to the set</param>
/// <param name="result">The list of set entries being built</param>
/// <param name="next">The pending single-character entry, if any; flushed to <paramref name="result"/> and reset to null</param>
/// <param name="readDash">Whether a dash was read after the pending entry; reset to false</param>
static void _ParseCharClassEscape(ParseContext pc, string cls, List<RegexCharsetEntry> result, ref RegexCharsetEntry next, ref bool readDash)
{
    if (null != next)
    {
        // a class cannot be the end of a range, so the pending character stands on its own
        result.Add(next);
        if (readDash)
        {
            // the dash that followed it is therefore a literal '-', added exactly once;
            // without a dash no '-' must be added, or sets like [a\d] would match '-'
            result.Add(new RegexCharsetCharEntry('-'));
        }
    }
    pc.Advance();
    result.Add(new RegexCharsetClassEntry(cls));
    next = null;
    readDash = false;
}
/// <summary>
/// Lexes one token from the parse context using a hard-coded state machine.
/// Every character that is consumed is also captured into the context.
/// </summary>
/// <param name="context">The parse context to read from</param>
/// <returns>
/// 0 for a run of digits ('0'-'9'), 1 for a run of ASCII letters, 2 for a run of
/// whitespace characters, or 3 when the first character starts none of these
/// (that character is still captured and consumed).
/// </returns>
internal static int Lex(RE.ParseContext context) {
    context.EnsureStarted();
    // q0: the start state picks a token kind from the first character
    if (((context.Current >= '0') && (context.Current <= '9'))) {
        context.CaptureCurrent();
        context.Advance();
        goto q1;
    }
    if ((((context.Current >= 'A') && (context.Current <= 'Z')) || ((context.Current >= 'a') && (context.Current <= 'z')))) {
        context.CaptureCurrent();
        context.Advance();
        goto q2;
    }
    // NOTE(review): the upper bound of the range that starts at '\n' is an empty character
    // literal as it appears here; a non-printable character was presumably lost from the
    // generated source. Confirm the intended bound against the generator output.
    if (((((context.Current == '\t') || ((context.Current >= '\n') && (context.Current <= ''))) || (context.Current == '\r')) || (context.Current == ' '))) {
        context.CaptureCurrent();
        context.Advance();
        goto q3;
    }
    goto error;
q1:
    // q1: inside a run of digits
    if (((context.Current >= '0') && (context.Current <= '9'))) {
        context.CaptureCurrent();
        context.Advance();
        goto q1;
    }
    return(0);
q2:
    // q2: inside a run of letters
    if ((((context.Current >= 'A') && (context.Current <= 'Z')) || ((context.Current >= 'a') && (context.Current <= 'z')))) {
        context.CaptureCurrent();
        context.Advance();
        goto q2;
    }
    return(1);
q3:
    // q3: inside a run of whitespace (same test, and same review note, as in q0)
    if (((((context.Current == '\t') || ((context.Current >= '\n') && (context.Current <= ''))) || (context.Current == '\r')) || (context.Current == ' '))) {
        context.CaptureCurrent();
        context.Advance();
        goto q3;
    }
    return(2);
error:
    // nothing matched: capture and consume the current character, then report the error symbol
    context.CaptureCurrent();
    context.Advance();
    return(3);
}
/// <summary>
/// Parses the part of an escape sequence that follows the backslash inside a bracketed set.
/// </summary>
/// <param name="pc">The parse context, positioned on the character after the backslash</param>
/// <returns>
/// The character code the escape stands for, or -1 if the input has ended.
/// \f, \v, \t, \n and \r yield the matching control character; \x and \u read up to four
/// hexadecimal digits; any other character yields itself. On return the context is
/// positioned just past the last character that belongs to the escape.
/// </returns>
static int _ParseRangeEscapePart(ParseContext pc)
{
    if (-1 == pc.Current)
    {
        return -1;
    }
    switch (pc.Current)
    {
        case 'f':
            pc.Advance();
            return '\f';
        case 'v':
            pc.Advance();
            return '\v';
        case 't':
            pc.Advance();
            return '\t';
        case 'n':
            pc.Advance();
            return '\n';
        case 'r':
            pc.Advance();
            return '\r';
        case 'x':
            {
                // "\x" with no hex digit after it is just a literal 'x'
                if (-1 == pc.Advance() || !_IsHexChar((char)pc.Current))
                {
                    return 'x';
                }
                // accumulate in an int: four hex digits need 16 bits, which a byte cannot hold
                int value = 0;
                int digits = 0;
                do
                {
                    value = (value << 4) | _FromHexChar((char)pc.Current);
                    ++digits;
                    // the Advance() in the loop condition consumes the digit just read, so
                    // the fourth digit is consumed too and is not re-read as a literal
                }
                while (-1 != pc.Advance() && digits < 4 && _IsHexChar((char)pc.Current));
                return value;
            }
        case 'u':
            {
                // "\u" at the very end of the input is just a literal 'u'
                if (-1 == pc.Advance())
                {
                    return 'u';
                }
                int value = _FromHexChar((char)pc.Current);
                for (int digits = 1; digits < 4; ++digits)
                {
                    if (-1 == pc.Advance())
                    {
                        // input ended early: return the digits read so far, unshifted
                        return value;
                    }
                    value = (value << 4) | _FromHexChar((char)pc.Current);
                }
                // consume the fourth digit so it is not parsed again by the caller
                pc.Advance();
                return value;
            }
        default:
            {
                // any other character escapes to itself
                int code = pc.Current;
                pc.Advance();
                return (char)code;
            }
    }
}
/// <summary>
/// Wraps <paramref name="expr"/> in a repetition or optional expression if the context is positioned on a modifier.
/// </summary>
/// <param name="expr">The expression the modifier applies to</param>
/// <param name="pc">The parse context; advanced past the modifier if one is present</param>
/// <returns>
/// The wrapped expression for '*', '+', '?' or a "{min,max}" quantifier, located at the position
/// where the modifier began; otherwise <paramref name="expr"/> unchanged.
/// </returns>
static RegexExpression _ParseModifier(RegexExpression expr, ParseContext pc)
{
    var startLine = pc.Line;
    var startColumn = pc.Column;
    var startPosition = pc.Position;
    int modifier = pc.Current;

    if ('*' == modifier || '+' == modifier || '?' == modifier)
    {
        if ('*' == modifier)
        {
            expr = new RegexRepeatExpression(expr);
        }
        else if ('+' == modifier)
        {
            expr = new RegexRepeatExpression(expr, 1);
        }
        else
        {
            expr = new RegexOptionalExpression(expr);
        }
        expr.SetLocation(startLine, startColumn, startPosition);
        pc.Advance();
        return expr;
    }

    if ('{' != modifier)
    {
        // no modifier here
        return expr;
    }

    // "{min}", "{min,}", "{,max}" or "{min,max}"; -1 marks a bound that was not given
    pc.Advance();
    pc.TrySkipWhiteSpace();
    pc.Expecting('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ',', '}');
    int min = -1;
    int max = -1;
    if (',' != pc.Current && '}' != pc.Current)
    {
        var minStart = pc.CaptureBuffer.Length;
        pc.TryReadDigits();
        min = int.Parse(pc.GetCapture(minStart));
        pc.TrySkipWhiteSpace();
    }
    if (',' != pc.Current)
    {
        // no comma: a single count is both the minimum and the maximum
        max = min;
    }
    else
    {
        pc.Advance();
        pc.TrySkipWhiteSpace();
        pc.Expecting('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '}');
        if ('}' != pc.Current)
        {
            var maxStart = pc.CaptureBuffer.Length;
            pc.TryReadDigits();
            max = int.Parse(pc.GetCapture(maxStart));
            pc.TrySkipWhiteSpace();
        }
    }
    pc.Expecting('}');
    pc.Advance();
    expr = new RegexRepeatExpression(expr, min, max);
    expr.SetLocation(startLine, startColumn, startPosition);
    return expr;
}
/// <summary>
/// Parses the contents of a bracketed set up to, but not including, the closing ']'.
/// </summary>
/// <param name="pc">The parse context, positioned on the first character of the set contents</param>
/// <returns>
/// The entries of the set in the order they were read: single characters, character ranges
/// ("a-z") and named classes ("[:name:]" or escapes such as \d).
/// </returns>
static IList<RegexCharsetEntry> _ParseRanges(ParseContext pc)
{
    pc.EnsureStarted();
    var result = new List<RegexCharsetEntry>();
    // a single character that has been read but not yet emitted, because a following
    // "-x" could still turn it into the start of a range
    RegexCharsetEntry next = null;
    // true once a '-' has been read after the pending character
    bool readDash = false;
    while (-1 != pc.Current && ']' != pc.Current)
    {
        switch (pc.Current)
        {
            case '[':
                // named class of the form [:name:]
                if (null != next)
                {
                    // a class cannot end a range, so the pending character stands on its own
                    result.Add(next);
                    if (readDash)
                    {
                        // and the dash after it is a literal '-', added exactly once; without
                        // a dash no '-' must be added, or [a[:digit:]] would match '-'
                        result.Add(new RegexCharsetCharEntry('-'));
                    }
                }
                pc.Advance();
                pc.Expecting(':');
                pc.Advance();
                var nameStart = pc.CaptureBuffer.Length;
                pc.TryReadUntil(':', false);
                var className = pc.GetCapture(nameStart);
                pc.Advance();
                pc.Expecting(']');
                pc.Advance();
                result.Add(new RegexCharsetClassEntry(className));
                readDash = false;
                next = null;
                break;
            case '\\':
                pc.Advance();
                pc.Expecting();
                switch (pc.Current)
                {
                    case 'h':
                        // "blank" matches the class used for \h outside of a set
                        _ParseCharClassEscape(pc, "blank", result, ref next, ref readDash);
                        break;
                    case 'd':
                        _ParseCharClassEscape(pc, "digit", result, ref next, ref readDash);
                        break;
                    case 'D':
                        _ParseCharClassEscape(pc, "^digit", result, ref next, ref readDash);
                        break;
                    case 'l':
                        _ParseCharClassEscape(pc, "lower", result, ref next, ref readDash);
                        break;
                    case 's':
                        _ParseCharClassEscape(pc, "space", result, ref next, ref readDash);
                        break;
                    case 'S':
                        _ParseCharClassEscape(pc, "^space", result, ref next, ref readDash);
                        break;
                    case 'u':
                        _ParseCharClassEscape(pc, "upper", result, ref next, ref readDash);
                        break;
                    case 'w':
                        _ParseCharClassEscape(pc, "word", result, ref next, ref readDash);
                        break;
                    case 'W':
                        _ParseCharClassEscape(pc, "^word", result, ref next, ref readDash);
                        break;
                    default:
                        // an escaped single character behaves like an ordinary character
                        var escaped = (char)_ParseRangeEscapePart(pc);
                        if (null == next)
                        {
                            next = new RegexCharsetCharEntry(escaped);
                        }
                        else if (readDash)
                        {
                            result.Add(new RegexCharsetRangeEntry(((RegexCharsetCharEntry)next).Value, escaped));
                            next = null;
                            readDash = false;
                        }
                        else
                        {
                            result.Add(next);
                            next = new RegexCharsetCharEntry(escaped);
                        }
                        break;
                }
                break;
            case '-':
                pc.Advance();
                if (null == next)
                {
                    // a dash with nothing before it is a literal '-'
                    next = new RegexCharsetCharEntry('-');
                    readDash = false;
                }
                else
                {
                    if (readDash)
                    {
                        result.Add(next);
                    }
                    readDash = true;
                }
                break;
            default:
                if (null == next)
                {
                    next = new RegexCharsetCharEntry((char)pc.Current);
                }
                else if (readDash)
                {
                    // pending character, dash, and this character form a range
                    result.Add(new RegexCharsetRangeEntry(((RegexCharsetCharEntry)next).Value, (char)pc.Current));
                    next = null;
                    readDash = false;
                }
                else
                {
                    result.Add(next);
                    next = new RegexCharsetCharEntry((char)pc.Current);
                }
                pc.Advance();
                break;
        }
    }
    // flush whatever is still pending; a trailing dash is a literal '-'
    if (null != next)
    {
        result.Add(next);
        if (readDash)
        {
            next = new RegexCharsetCharEntry('-');
            result.Add(next);
        }
    }
    return result;
}
/// <summary>
/// Parses a regular expression from the specified <see cref="ParseContext"/>
/// </summary>
/// <param name="pc">The parse context to use</param>
/// <returns>
/// A new abstract syntax tree representing the expression, or null if nothing was parsed.
/// Parsing stops at the end of the input or at a ')' that this call did not open; that ')'
/// is left unconsumed for the caller.
/// </returns>
/// <remarks>
/// A modifier ('*', '+', '?', "{min,max}") binds only to the element immediately before it.
/// </remarks>
public static RegexExpression Parse(ParseContext pc)
{
    RegexExpression result = null, next = null;
    int ich;
    pc.EnsureStarted();
    // location where the element currently being parsed began
    var line = pc.Line;
    var column = pc.Column;
    var position = pc.Position;
    while (true)
    {
        // each case either returns, handles alternation itself, or leaves the element it
        // parsed in "next" for the shared modifier/concatenation step after the switch
        switch (pc.Current)
        {
            case -1:
            case ')':
                return result;
            case '|':
                if (-1 != pc.Advance())
                {
                    next = Parse(pc);
                    result = new RegexOrExpression(result, next);
                    result.SetLocation(line, column, position);
                }
                else
                {
                    // trailing '|': alternative with an empty right-hand side
                    result = new RegexOrExpression(result, null);
                    result.SetLocation(line, column, position);
                }
                line = pc.Line;
                column = pc.Column;
                position = pc.Position;
                continue;
            case '.':
                // any character
                next = new RegexCharsetExpression(new RegexCharsetEntry[] { new RegexCharsetRangeEntry(char.MinValue, char.MaxValue) }, false);
                next.SetLocation(line, column, position);
                pc.Advance();
                break;
            case '\\':
                pc.Advance();
                pc.Expecting();
                switch (pc.Current)
                {
                    case 'd':
                        next = new RegexCharsetExpression(new RegexCharsetEntry[] { new RegexCharsetClassEntry("digit") });
                        pc.Advance();
                        break;
                    case 'D':
                        next = new RegexCharsetExpression(new RegexCharsetEntry[] { new RegexCharsetClassEntry("digit") }, true);
                        pc.Advance();
                        break;
                    case 'h':
                        next = new RegexCharsetExpression(new RegexCharsetEntry[] { new RegexCharsetClassEntry("blank") });
                        pc.Advance();
                        break;
                    case 'l':
                        next = new RegexCharsetExpression(new RegexCharsetEntry[] { new RegexCharsetClassEntry("lower") });
                        pc.Advance();
                        break;
                    case 's':
                        next = new RegexCharsetExpression(new RegexCharsetEntry[] { new RegexCharsetClassEntry("space") });
                        pc.Advance();
                        break;
                    case 'S':
                        next = new RegexCharsetExpression(new RegexCharsetEntry[] { new RegexCharsetClassEntry("space") }, true);
                        pc.Advance();
                        break;
                    case 'u':
                        next = new RegexCharsetExpression(new RegexCharsetEntry[] { new RegexCharsetClassEntry("upper") });
                        pc.Advance();
                        break;
                    case 'w':
                        next = new RegexCharsetExpression(new RegexCharsetEntry[] { new RegexCharsetClassEntry("word") });
                        pc.Advance();
                        break;
                    case 'W':
                        next = new RegexCharsetExpression(new RegexCharsetEntry[] { new RegexCharsetClassEntry("word") }, true);
                        pc.Advance();
                        break;
                    default:
                        if (-1 != (ich = _ParseEscapePart(pc)))
                        {
                            next = new RegexLiteralExpression((char)ich);
                        }
                        else
                        {
                            pc.Expecting(); // throw an error
                            return null; // doesn't execute
                        }
                        break;
                }
                next.SetLocation(line, column, position);
                break;
            case '(':
                pc.Advance();
                pc.Expecting();
                next = Parse(pc);
                pc.Expecting(')');
                pc.Advance();
                break;
            case '[':
                pc.ClearCapture();
                pc.Advance();
                pc.Expecting();
                bool not = false;
                if ('^' == pc.Current)
                {
                    not = true;
                    pc.Advance();
                    pc.Expecting();
                }
                var ranges = _ParseRanges(pc);
                pc.Expecting(']');
                pc.Advance();
                next = new RegexCharsetExpression(ranges, not);
                next.SetLocation(line, column, position);
                break;
            default:
                // any other character is a literal
                ich = pc.Current;
                next = new RegexLiteralExpression((char)ich);
                next.SetLocation(line, column, position);
                pc.Advance();
                break;
        }

        // the modifier is applied to the new element alone, before it is appended, so that
        // "ab.*" repeats only the '.' and not everything parsed so far
        next = _ParseModifier(next, pc);
        if (null == result)
        {
            result = next;
        }
        else
        {
            result = new RegexConcatExpression(result, next);
            result.SetLocation(line, column, position);
        }
        line = pc.Line;
        column = pc.Column;
        position = pc.Position;
    }
}