// Current implementation relies on regular expressions. We could likely do
// much better, both in terms of allocation and in terms of pure speed of
// traversal, if we implemented a custom recognition engine. However,
// tokenization is currently only a very small portion of script compilation
// (about 2ms lex + parse on a typical script), so optimization efforts are
// best spent elsewhere.

/// <summary> A token recognized as a regular expression. </summary>
/// <param name="regularExpression"> Stored as <c>RegularExpression</c>. </param>
/// <param name="jsPattern"> Stored as <c>JsRegex</c>. </param>
/// <param name="maximumLength">
/// Stored as <c>MaximumLength</c>; when omitted, <see cref="int.MaxValue"/> is used.
/// </param>
/// <param name="startsWith"> Stored as-is in <c>_startsWith</c>; may be <c>null</c>. </param>
internal TokenDefinition(
    Regex regularExpression,
    JsRegex jsPattern,
    int? maximumLength = null,
    string startsWith = null)
{
    RegularExpression = regularExpression;
    JsRegex = jsPattern;
    _startsWith = startsWith;

    // No explicit bound means "unbounded".
    MaximumLength = maximumLength.GetValueOrDefault(int.MaxValue);
}
/// <summary> A token recognized as one of many strings. </summary>
/// <param name="strings">
/// The literal strings to recognize. Must contain at least one element, and
/// every element must be non-null and non-empty.
/// </param>
/// <param name="caseSensitive">
/// When <c>false</c>, the strings are matched ignoring case.
/// </param>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="strings"/> is <c>null</c>.
/// </exception>
/// <exception cref="System.ArgumentException">
/// <paramref name="strings"/> is empty, or contains a null or empty string.
/// </exception>
public TokenDefinition(IReadOnlyList<string> strings, bool caseSensitive = true)
{
    if (strings == null)
    {
        throw new System.ArgumentNullException(nameof(strings));
    }

    // Without these checks, an empty list fails inside Max() and an empty
    // element fails on the [0] indexer below, both with unhelpful messages.
    if (strings.Count == 0)
    {
        throw new System.ArgumentException("At least one string is required.", nameof(strings));
    }

    if (strings.Any(string.IsNullOrEmpty))
    {
        throw new System.ArgumentException("Strings must be non-null and non-empty.", nameof(strings));
    }

    var flags = RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.Multiline;
    if (!caseSensitive)
    {
        flags |= RegexOptions.IgnoreCase;
    }

    // Order strings by descending length, because C# regular expressions match "a|b" from
    // left to right rather than the longest match.
    var pattern = string.Join("|", strings.OrderByDescending(s => s.Length).Select(Regex.Escape));

    // "\G" anchors the match at the position where the search starts.
    RegularExpression = new Regex("\\G(" + pattern + ")", flags);
    MaximumLength = strings.Max(s => s.Length);

    // Collect every character a match may start with: the lower-case and
    // upper-case forms of each first character, plus the character as written
    // (a title-case letter is neither its own lower- nor upper-case form).
    // Both cases are collected even when caseSensitive is true, so this set may
    // be a superset of the characters that can actually start a match.
    var chars = new HashSet<char>(strings.Select(s => s.ToLowerInvariant()[0]));
    chars.UnionWith(strings.Select(s => s.ToUpperInvariant()[0]));
    chars.UnionWith(strings.Select(s => s[0]));
    _startsWith = new string(chars.ToArray());

    // The un-anchored alternation is handed to JsRegex, with the "i" flag
    // standing in for RegexOptions.IgnoreCase.
    JsRegex = new JsRegex(pattern, caseSensitive ? "" : "i");
}
/// <summary> Converts attribute to <see cref="TokenDefinition"/>. </summary>
/// <returns>
/// A definition whose C# regex is <c>Pattern</c> anchored with <c>\G</c>
/// (compiled, culture-invariant, and case-insensitive unless
/// <c>CaseSensitive</c> is set), paired with a <c>JsRegex</c> built from the
/// same pattern with every <c>\G</c> removed, and <c>Start</c> as the
/// starts-with hint.
/// </returns>
public TokenDefinition ToDefinition()
{
    var flags = RegexOptions.Compiled | RegexOptions.CultureInvariant;
    if (!CaseSensitive)
    {
        flags |= RegexOptions.IgnoreCase;
    }

    // A pattern that already begins with "\G" is used verbatim; otherwise it is
    // wrapped in a group and anchored. "\G" is regex syntax, not linguistic
    // text, so the prefix test must be ordinal rather than culture-sensitive.
    var csPattern = Pattern.StartsWith("\\G", System.StringComparison.Ordinal)
        ? Pattern
        : $"\\G({Pattern})";

    // Every "\G" occurrence is stripped from the pattern given to JsRegex.
    // NOTE(review): presumably because the JavaScript side has no "\G" anchor
    // and anchors differently — confirm against JsRegex.
    var jsPattern = new JsRegex(Pattern.Replace("\\G", ""), CaseSensitive ? "" : "i");

    return new TokenDefinition(new Regex(csPattern, flags), jsPattern, startsWith: Start);
}
/// <summary>
/// Initializes all static readonly members of this class from the attributes
/// on the <typeparamref name="TTok"/> enumeration.
/// </summary>
/// <exception cref="ArgumentException">
/// <typeparamref name="TTok"/> is not an enum, does not carry a
/// <see cref="TokensAttribute"/>, or carries one whose comment pattern is
/// null or empty.
/// </exception>
static ReflectionTokenReader()
{
    var t = typeof(TTok);
    if (!t.IsEnum)
    {
        throw new ArgumentException($"Type {t} is not an enum.", nameof(TTok));
    }

    // The TokensAttribute itself
    var tokensAttribute = t.GetCustomAttribute<TokensAttribute>();
    if (tokensAttribute == null)
    {
        throw new ArgumentException($"Enum {t} does not carry {nameof(TokensAttribute)}.", nameof(TTok));
    }

    // The first character of the comment pattern is inspected below, so an
    // absent pattern must be rejected explicitly instead of failing on the indexer.
    var comments = tokensAttribute.Comments;
    if (string.IsNullOrEmpty(comments))
    {
        throw new ArgumentException(
            $"Enum {t} carries a {nameof(TokensAttribute)} with an empty comment pattern.",
            nameof(TTok));
    }

    // The first character of the pattern is used as a starts-with hint, unless
    // it is one of the regex metacharacters listed here, in which case no hint
    // is given.
    // NOTE(review): other metacharacters or a top-level alternation would still
    // produce a hint — confirm this is acceptable for the patterns in use.
    var commentStart = comments[0];
    var commentStartsWith = "[]\\(.<".IndexOf(commentStart) > -1
        ? null
        : new string(commentStart, 1);

    // "\G" is regex syntax, so the prefix test is ordinal rather than culture-sensitive.
    var csComments = comments.StartsWith("\\G", StringComparison.Ordinal)
        ? comments
        : $"\\G({comments})";

    var jsComments = new JsRegex(comments.Replace("\\G", ""), "");

    StaticComments = new TokenDefinition(new Regex(csComments), jsComments, startsWith: commentStartsWith);
    StaticEscapeNewlines = tokensAttribute.EscapeNewlines;

    // The enumeration contents
    var names = t.GetEnumNames();
    var values = t.GetEnumValues();
    var pairs = names
        .Select((n, i) => new KeyValuePair<string, TTok>(n, (TTok)values.GetValue(i)))
        .ToArray();

    // Detect 'end', 'error', 'indent', 'end of line' and 'dedent', and extract
    // the structure and definition for the other rules.
    var definitions = new Dictionary<TTok, TokenDefinition>();
    var parent = new Dictionary<TTok, TTok>();
    var publicChild = new HashSet<TTok>();

    foreach (var kv in pairs)
    {
        var name = kv.Key;
        var tok = kv.Value;
        var mbr = t.GetMember(name)[0];

        // Special tokens: the first matching attribute wins, and such a member
        // receives no definition. The assignments stay in the constructor body
        // because the targets are static readonly.
        var endAttribute = mbr.GetCustomAttribute<EndAttribute>();
        if (endAttribute != null)
        {
            StaticEnd = tok;
            continue;
        }

        var errorAttribute = mbr.GetCustomAttribute<ErrorAttribute>();
        if (errorAttribute != null)
        {
            StaticError = tok;
            continue;
        }

        var indentAttribute = mbr.GetCustomAttribute<IndentAttribute>();
        if (indentAttribute != null)
        {
            StaticIndent = tok;
            continue;
        }

        var endOfLineAttribute = mbr.GetCustomAttribute<EndOfLineAttribute>();
        if (endOfLineAttribute != null)
        {
            StaticEndOfLine = tok;
            continue;
        }

        var dedentAttribute = mbr.GetCustomAttribute<DedentAttribute>();
        if (dedentAttribute != null)
        {
            StaticDedent = tok;
            continue;
        }

        var fromAttribute = mbr.GetCustomAttribute<FromAttribute>();
        if (fromAttribute != null)
        {
            parent.Add(tok, (TTok)(object)fromAttribute.Parent);
            if (!fromAttribute.IsPrivate)
            {
                publicChild.Add(tok);
            }

            // No continue: still need to determine definition
        }

        // Definition: Pattern takes precedence over Any, which takes precedence
        // over Ci. A member carrying none of them gets no definition.
        var patternAttribute = mbr.GetCustomAttribute<PatternAttribute>();
        if (patternAttribute != null)
        {
            definitions.Add(tok, patternAttribute.ToDefinition());
            continue;
        }

        var anyAttribute = mbr.GetCustomAttribute<AnyAttribute>();
        if (anyAttribute != null)
        {
            definitions.Add(tok, anyAttribute.ToDefinition());
            continue;
        }

        var ciAttribute = mbr.GetCustomAttribute<CiAttribute>();
        if (ciAttribute != null)
        {
            // Case-insensitive match on the enum member's own name.
            definitions.Add(tok, new AnyAttribute(name) { CaseSensitive = false }.ToDefinition());
        }
    }

    // Construct the actual rules. This algorithm is NOT optimal.
    // Only tokens without a parent become top-level rules; since publicChild is
    // only populated for tokens that have a parent, the third argument is
    // always false for these rules.
    var seen = new HashSet<TTok>();
    StaticRules = definitions
        .Where(kv => !parent.ContainsKey(kv.Key))
        .Select(kv => new LexerRule<TTok>(
            kv.Value,
            kv.Key,
            publicChild.Contains(kv.Key),
            SubRules(kv.Key, definitions, parent, publicChild, seen)))
        .ToArray();
}