Example #1
        // Current implementation relies on regular expressions. We could likely do
        // much better, both in terms of allocation and in terms of pure speed of
        // traversal, if we implemented a custom recognition engine. However,
        // tokenization is currently only a very small portion of script compilation
        // (about 2ms lex + parse on a typical script), so optimization efforts are
        // best spent elsewhere.

        /// <summary> A token recognized as a regular expression. </summary>
        internal TokenDefinition(Regex regularExpression, JsRegex jsPattern, int? maximumLength = null, string startsWith = null)
        {
            JsRegex           = jsPattern;
            RegularExpression = regularExpression;
            MaximumLength     = maximumLength ?? int.MaxValue;
            _startsWith       = startsWith;
        }
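For orientation, a minimal hypothetical call site for this constructor might look as follows. The token choice and character set are invented for illustration; the JsRegex(pattern, flags) shape mirrors its use in Example #2 below, and "\G" anchors each match at the current scan position.

        // Hypothetical: a definition for decimal integer literals.
        var csRegex = new Regex(@"\G([0-9]+)",
                                RegexOptions.Compiled | RegexOptions.CultureInvariant);
        var jsRegex = new JsRegex("[0-9]+", "");

        // Integers can only start with a digit, so a lexer may skip this rule
        // outright when the current character is not listed in 'startsWith'.
        var integer = new TokenDefinition(csRegex, jsRegex, startsWith: "0123456789");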
Example #2
        /// <summary> A token recognized as one of many strings. </summary>
        public TokenDefinition(IReadOnlyList<string> strings, bool caseSensitive = true)
        {
            var flags = RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.Multiline;

            if (!caseSensitive)
            {
                flags = flags | RegexOptions.IgnoreCase;
            }

            // Order strings by descending length: C# regular expressions try "a|b"
            // alternatives from left to right and stop at the first success, rather
            // than preferring the longest match.

            var pattern = string.Join("|", strings.OrderByDescending(s => s.Length).Select(Regex.Escape));

            RegularExpression = new Regex("\\G(" + pattern + ")", flags);
            MaximumLength     = strings.Select(s => s.Length).Max();

            // Collect first characters in both cases; an over-inclusive prefilter is
            // harmless, but a missing character would make the lexer skip valid matches.
            var chars = new HashSet<char>(strings.Select(s => s.ToLowerInvariant()[0]));

            chars.UnionWith(strings.Select(s => s.ToUpperInvariant()[0]));

            _startsWith = new string(chars.ToArray());

            JsRegex = new JsRegex(pattern, caseSensitive ? "" : "i");
        }
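The descending-length ordering above is load-bearing. A small illustration, assuming RegularExpression is accessible to the caller (the operator strings are invented for this sketch):

        // Unsorted, the pattern would be "\G(<|<=|=)" and input "<=" would match
        // only "<": .NET alternation is leftmost-first, not longest-match.
        // Sorted, the pattern becomes "\G(<=|<|=)" and "<=" matches in full.
        var ops   = new TokenDefinition(new[] { "<", "<=", "=" });
        var match = ops.RegularExpression.Match("<=");   // match.Value == "<="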
Example #3
        /// <summary> Converts attribute to <see cref="TokenDefinition"/>. </summary>
        public TokenDefinition ToDefinition()
        {
            var flags = RegexOptions.Compiled | RegexOptions.CultureInvariant;

            if (!CaseSensitive)
            {
                flags = flags | RegexOptions.IgnoreCase;
            }

            // Anchor the C# pattern at the current scan position. JavaScript regular
            // expressions have no "\G", so the anchor is stripped from the JS pattern.
            var csPattern = Pattern.StartsWith("\\G") ? Pattern : $"\\G({Pattern})";
            var jsPattern = new JsRegex(Pattern.Replace("\\G", ""), CaseSensitive ? "" : "i");

            return new TokenDefinition(new Regex(csPattern, flags), jsPattern, startsWith: Start);
        }
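The "\G" anchoring rule is easy to check in isolation. A standalone sketch (the Anchor helper is hypothetical, mirroring the conditional in ToDefinition):

        static string Anchor(string pattern) =>
            pattern.StartsWith("\\G") ? pattern : $"\\G({pattern})";

        // Anchor("[a-z]+")    == "\\G([a-z]+)"   -- anchor and capture group added
        // Anchor("\\G[a-z]+") == "\\G[a-z]+"     -- already anchored, left alone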
Example #4
        /// <summary>
        /// Initializes all static readonly members of this class from the attributes
        /// on the <typeparamref name="TTok"/> enumeration.
        /// </summary>
        static ReflectionTokenReader()
        {
            var t = typeof(TTok);

            if (!t.IsEnum)
            {
                throw new ArgumentException($"Type {t} is not an enum.", nameof(TTok));
            }

            // The TokensAttribute itself

            var tokensAttribute = t.GetCustomAttribute<TokensAttribute>();

            if (tokensAttribute == null)
            {
                throw new ArgumentException($"Enum {t} does not carry {nameof(TokensAttribute)}.", nameof(TTok));
            }

            // If the comment pattern starts with regex syntax rather than a literal
            // character, no single-character prefilter is possible.
            var commentStart      = tokensAttribute.Comments[0];
            var commentStartsWith = "[]\\(.<".IndexOf(commentStart) > -1 ? null : new string(commentStart, 1);

            var csComments = tokensAttribute.Comments.StartsWith("\\G")
                ? tokensAttribute.Comments
                : $"\\G({tokensAttribute.Comments})";

            var jsComments = new JsRegex(tokensAttribute.Comments.Replace("\\G", ""), "");

            StaticComments       = new TokenDefinition(new Regex(csComments), jsComments, startsWith: commentStartsWith);
            StaticEscapeNewlines = tokensAttribute.EscapeNewlines;

            // The enumeration contents
            var names  = t.GetEnumNames();
            var values = t.GetEnumValues();

            var pairs = names.Select((n, i) => new KeyValuePair<string, TTok>(n, (TTok)values.GetValue(i))).ToArray();

            // Detect 'end', 'indent', 'dedent' and 'error', and extract the structure and
            // definition for the other rules.

            var definitions = new Dictionary<TTok, TokenDefinition>();
            var parent      = new Dictionary<TTok, TTok>();
            var publicChild = new HashSet<TTok>();

            foreach (var kv in pairs)
            {
                var name = kv.Key;
                var tok  = kv.Value;
                var mbr  = t.GetMember(name)[0];

                var endAttribute = mbr.GetCustomAttribute<EndAttribute>();
                if (endAttribute != null)
                {
                    StaticEnd = tok;
                    continue;
                }

                var errorAttribute = mbr.GetCustomAttribute<ErrorAttribute>();
                if (errorAttribute != null)
                {
                    StaticError = tok;
                    continue;
                }

                var indentAttribute = mbr.GetCustomAttribute<IndentAttribute>();
                if (indentAttribute != null)
                {
                    StaticIndent = tok;
                    continue;
                }

                var endOfLineAttribute = mbr.GetCustomAttribute<EndOfLineAttribute>();
                if (endOfLineAttribute != null)
                {
                    StaticEndOfLine = tok;
                    continue;
                }

                var dedentAttribute = mbr.GetCustomAttribute<DedentAttribute>();
                if (dedentAttribute != null)
                {
                    StaticDedent = tok;
                    continue;
                }

                var fromAttribute = mbr.GetCustomAttribute<FromAttribute>();
                if (fromAttribute != null)
                {
                    parent.Add(tok, (TTok)(object)fromAttribute.Parent);
                    if (!fromAttribute.IsPrivate)
                    {
                        publicChild.Add(tok);
                    }
                    // No continue: still need to determine definition
                }

                var patternAttribute = mbr.GetCustomAttribute<PatternAttribute>();
                if (patternAttribute != null)
                {
                    definitions.Add(tok, patternAttribute.ToDefinition());
                    continue;
                }

                var anyAttribute = mbr.GetCustomAttribute<AnyAttribute>();
                if (anyAttribute != null)
                {
                    definitions.Add(tok, anyAttribute.ToDefinition());
                    continue;
                }

                var ciAttribute = mbr.GetCustomAttribute<CiAttribute>();
                if (ciAttribute != null)
                {
                    definitions.Add(tok, new AnyAttribute(name)
                    {
                        CaseSensitive = false
                    }.ToDefinition());
                }
            }

            // Construct the actual rules. This algorithm is NOT optimal.

            var seen = new HashSet<TTok>();

            StaticRules = definitions
                          .Where(kv => !parent.ContainsKey(kv.Key))
                          .Select(kv => new LexerRule<TTok>(
                                      kv.Value,
                                      kv.Key,
                                      publicChild.Contains(kv.Key),
                                      SubRules(kv.Key, definitions, parent, publicChild, seen)))
                          .ToArray();
        }
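To make the reflection walk concrete, here is a hypothetical token enumeration it could consume. The attribute names and their roles come straight from the code above; the constructor shapes and arguments are assumptions for illustration only.

        // Hypothetical grammar; attribute argument shapes are assumptions.
        [Tokens(Comments = "#[^\\n]*")]           // line comments starting with '#'
        public enum Tok
        {
            [End]       EndOfStream,              // becomes StaticEnd
            [Error]     Error,                    // becomes StaticError
            [Indent]    Indent,                   // becomes StaticIndent
            [Dedent]    Dedent,                   // becomes StaticDedent
            [EndOfLine] NewLine,                  // becomes StaticEndOfLine

            [Pattern("[a-z]+")]   Identifier,     // regex-defined rule
            [Any("<", "<=", "=")] Operator,       // literal alternatives
            [From(Tok.Identifier), Ci] If         // "if" in any case, refining Identifier
        }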