Example #1
0
        /// <summary>
        /// Builds the element for a start tag that begins at <c>startTagLocation</c>.
        /// </summary>
        /// <param name="tagName">Tag name as read from the input.</param>
        /// <returns>
        /// A new element of kind <c>OpeningClosing</c>. It is named after the descriptor found for
        /// <paramref name="tagName"/> when there is one (and it has a name), otherwise after
        /// <paramref name="tagName"/> itself. Its descriptor is the lookup result, possibly null.
        /// </returns>
        HtmlElement GetHtmlElement(string tagName)
        {
            var knownTag = HtmlTagDescriptor.Find(tagName);

            // Prefer the descriptor's own spelling of the name; fall back to the raw name
            var elementName = knownTag?.Name;
            if (elementName == null)
            {
                elementName = tagName;
            }

            return new HtmlElement(elementName)
            {
                Location   = startTagLocation,
                Descriptor = knownTag,
                Kind       = ElementKind.OpeningClosing
            };
        }
Example #2
0
        /// <summary>
        /// Tells whether the tag described by this descriptor may be placed inside the tag
        /// described by <paramref name="parentDescriptor"/>.
        /// </summary>
        /// <param name="parentDescriptor">Descriptor of the candidate parent; may be null.</param>
        /// <returns>
        /// <c>true</c> when there is no parent descriptor, when this tag's <c>Category</c> shares a flag with
        /// the parent's <c>AcceptContent</c>, when the parent's <c>AcceptContent</c> is <c>ContentKind.Any</c>,
        /// or when this tag's <c>Name</c> is listed in the parent's <c>AcceptContentTags</c>;
        /// <c>false</c> otherwise.
        /// </returns>
        public bool AcceptParent(HtmlTagDescriptor parentDescriptor)
        {
            // No descriptor for the parent: nothing to check against, so accept
            if (parentDescriptor == null)
            {
                return true;
            }

            // Accepted by content category
            var acceptedKinds = parentDescriptor.AcceptContent;
            if ((Category & acceptedKinds) != 0)
            {
                return true;
            }

            // Parent takes any content
            if (acceptedKinds == ContentKind.Any)
            {
                return true;
            }

            // Accepted by an explicit list of child tag names
            var acceptedTags = parentDescriptor.AcceptContentTags;
            return acceptedTags != null && Array.IndexOf(acceptedTags, Name) >= 0;
        }
Example #3
0
        /// <summary>
        /// Removes every element from the top of the stack down to, and including, the one at
        /// <paramref name="indexOfOpenTag"/>. A warning is logged for each element above that index
        /// that still required a closing tag.
        /// </summary>
        /// <param name="indexOfOpenTag">Stack index of the element being closed.</param>
        /// <param name="span">Not used by this method.</param>
        /// <param name="parentTag">Name of the tag being closed; only used in the warning text.</param>
        void CloseTags(int indexOfOpenTag, SourceLocation span, string parentTag)
        {
            var index = stack.Count - 1;
            while (index >= indexOfOpenTag)
            {
                var openElement = stack[index];
                var openDesc    = HtmlTagDescriptor.Find(openElement.Name);

                // Only elements sitting above the tag being closed can be "unbalanced"
                var isUnbalanced = false;
                if (index > indexOfOpenTag)
                {
                    if (openDesc == null)
                    {
                        // Tag without a descriptor: it needed its own closing tag
                        isUnbalanced = true;
                    }
                    else if (openDesc.AcceptContent != ContentKind.None)
                    {
                        // Tag with content: fine only if its end tag can be omitted here
                        isUnbalanced = openDesc.EndKind != TagEndKind.Omission ||
                                       !openDesc.CanOmitEndTag(openElement, null, true);
                    }
                }

                if (isUnbalanced)
                {
                    Warning(openElement.Location, $"Unbalanced tag [{openElement.Name}] within tag [{parentTag}] requiring a closing tag. Force closing it");
                }

                stack.RemoveAt(index);
                index--;
            }
        }
Example #4
0
        /// <summary>
        /// Parses an end tag whose name starts at the current character <c>c</c>.
        /// A well-formed end tag closes the matching open element on the stack. If no such element is open,
        /// the end tag is kept in the tree as a stand-alone element and a warning is logged.
        /// A malformed end tag (anything other than '&gt;' after the name and optional spaces) is reported
        /// as an error and handed to <c>AppendText</c>.
        /// </summary>
        void TryProcessEndTag()
        {
            // The current character is always taken as the first character of the name,
            // followed by every alphanumeric, '_', ':', '.' or '-' character after it.
            tempBuilder.Clear();
            tempBuilder.Append(c);
            for (c = NextChar(); c.IsAlphaNumeric() || c == '_' || c == ':' || c == '.' || c == '-'; c = NextChar())
            {
                tempBuilder.Append(c);
            }

            // Spaces are tolerated between the name and the closing '>'
            while (c.IsSpace())
            {
                c = NextChar();
            }

            var tagName = tempBuilder.ToString().ToLowerInvariant();

            // Anything but '>' at this point makes the end tag invalid
            if (c != '>')
            {
                string message = c == 0
                    ? $"Invalid EOF found while parsing </{tagName}>"
                    : $"Invalid character '{c}' found while parsing </{tagName}>";
                Error(message);

                AppendText(startTagLocation, position - 1);

                tempBuilder.Clear();
                return;
            }

            c = NextChar();

            // Look for the matching open element, starting from the top of the stack
            var indexOfOpenTag = stack.Count - 1;
            while (indexOfOpenTag >= 0 &&
                   !string.Equals(stack[indexOfOpenTag].Name, tagName, StringComparison.OrdinalIgnoreCase))
            {
                indexOfOpenTag--;
            }

            if (indexOfOpenTag < 0)
            {
                // Closing tag without an opening tag: this should be an error, but we assume
                // we can recover from it, so the tag is kept and only a warning is logged.
                var descriptor = HtmlTagDescriptor.Find(tagName);
                var orphanTag  = new HtmlElement(tagName)
                {
                    Location   = startTagLocation,
                    Descriptor = descriptor,
                    Kind       = ElementKind.Closing
                };

                CurrentParent.AppendChild(orphanTag);

                // With a single element on the stack (only <html>), attempt to create the missing structure
                if (stack.Count == 1)
                {
                    var currentParent = CurrentParent;
                    TryCreateOptionalStart(ref currentParent, orphanTag);
                }

                var isAutoSelfClosing = descriptor != null && descriptor.EndKind == TagEndKind.AutoSelfClosing;
                if (isAutoSelfClosing)
                {
                    orphanTag.Kind = ElementKind.SelfClosing;
                    Warning(startTagLocation, $"Invalid end tag </{tagName}> used instead of self closing tag <{tagName}/> or <{tagName}>");
                }
                else
                {
                    Warning(startTagLocation, $"Unable to find opening tag for closing tag </{tagName}>");
                }
            }
            else
            {
                // Opening tag found: close it along with everything opened after it
                CloseTags(indexOfOpenTag, startTagLocation, tagName);
            }

            tempBuilder.Clear();
        }
Example #5
0
        /// <summary>
        /// Parses a start tag (or a processing instruction when the current character is '?') beginning at
        /// the current character <c>c</c>: first the tag name, then zero or more attributes, up to the
        /// terminating "&gt;", "/&gt;" or "?&gt;".
        /// On success the new element is appended to a parent taken from the stack (parents whose end tag
        /// can be omitted are popped first) and is pushed on the stack when it can hold content.
        /// On failure an error is logged and <c>AppendText</c> is called for the span starting at
        /// <c>startTagLocation</c>.
        /// </summary>
        void TryProcessStartTag()
        {
            // https://www.w3.org/TR/html-markup/syntax.html#syntax-elements
            // start tags consist of the following parts, in exactly the following order:
            //
            //    A "<" character.
            //    The element’s tag name.
            //       tag names are used within element start tags and end tags to give the element’s name.
            //       HTML elements all have names that only use characters in the range 0–9, a–z, and A–Z.
            //    Optionally, one or more attributes, each of which must be preceded by one or more space characters.
            //    Optionally, one or more space characters.
            //    Optionally, a "/" character, which may be present only if the element is a void element.
            //    A ">" character.
            tempBuilder.Clear();

            var isProcessingInstruction = false;

            // A leading '?' marks a processing instruction and is not part of the name;
            // any other current character is taken as the first character of the name.
            if (c == '?')
            {
                isProcessingInstruction = true;
            }
            else
            {
                tempBuilder.Append(c);
            }

            // Accumulate the remaining characters of the tag name
            while (true)
            {
                c = NextChar();
                // TODO: not entirely correct for <? as we should only test for Alpha for the first char
                if (c.IsTagChar() || c == '_' || c == ':' || c == '.' || c == '-') // Plus some special characters not supported by default HTML but used and supported by browsers
                {
                    tempBuilder.Append(c);
                }
                else
                {
                    break;
                }
            }

            var tagName    = tempBuilder.ToString();
            var tag        = GetHtmlElement(tagName);
            var descriptor = tag.Descriptor;

            // Check processing is valid
            if (isProcessingInstruction)
            {
                if (descriptor == null)
                {
                    tag.Kind = ElementKind.ProcessingInstruction;
                }
                else
                {
                    // A tag that has a descriptor cannot be a processing instruction:
                    // report it and carry on parsing it as a regular tag
                    Error(startTagLocation, $"The HTML tag [{tagName}] cannot start with a processing instruction <?{tagName}...>");
                    isProcessingInstruction = false;
                }
            }

            // If an element is selfclosing, setup it by default
            if (descriptor != null && descriptor.EndKind == TagEndKind.AutoSelfClosing)
            {
                tag.Kind = ElementKind.SelfClosing;
            }

            // NOTE(review): GetHtmlElement already assigns Descriptor from Find(tagName); this second
            // lookup by tag.Name looks redundant - confirm before removing.
            tag.Descriptor = HtmlTagDescriptor.Find(tag.Name);

            tempBuilder.Clear();

            // True between the end of an attribute name and the end of its value (or the next attribute name):
            // it is what makes a following '=' legal
            bool hasAttribute = false;

            // Set only when the tag terminator (">", "/>" or "?>") has been consumed
            bool isValid      = false;
            // Extra text appended to the next "Invalid character" error message
            var  errorContext = string.Empty;

            // Attribute loop: one iteration per token (attribute name, '=' + value, terminator or invalid character)
            while (true)
            {
                var hasWhitespaces = false;

                // Skip any whitespaces
                while (c.IsSpace())
                {
                    c = NextChar();
                    hasWhitespaces = true;
                }

                switch (c)
                {
                case '\0':
                    // End of input before the tag was terminated: isValid stays false
                    goto exit;

                case '@':
                    // Shared error handler: reached for a literal '@' and through "goto case '@'"
                    // from the other cases whenever an unexpected character is met.
                    // Try to continue parsing the tag even if we have an error
                    // We may be able to recover from it
                    var postText = (string.IsNullOrEmpty(errorContext) ? string.Empty : " " + errorContext);
                    Error($"Invalid character '{c}' found while parsing <{tag.Name}>{postText}");
                    // Fake a whitespace instead
                    // (a '>' is kept as is outside of a processing instruction, so that the next
                    // iteration can still terminate the tag)
                    c            = !isProcessingInstruction && c == '>' ? '>' : ' ';
                    errorContext = null;
                    break;

                case '?':
                    // "?>" terminates a processing instruction; a '?' anywhere else is an error
                    if (isProcessingInstruction)
                    {
                        c = NextChar();
                        if (c == '>')
                        {
                            c       = NextChar();
                            isValid = true;
                            goto exit;
                        }
                    }
                    goto case '@';

                case '>':
                    // '>' terminates a regular tag; inside a processing instruction it is an error
                    if (!isProcessingInstruction)
                    {
                        c       = NextChar();
                        isValid = true;
                        goto exit;
                    }
                    goto case '@';

                case '/':
                    // "/>" terminates a regular tag and marks it as self closing
                    c = NextChar();
                    if (c == '>' && !isProcessingInstruction)
                    {
                        tag.Kind = ElementKind.SelfClosing;
                        c        = NextChar();
                        isValid  = true;
                        goto exit;
                    }
                    goto case '@';

                case '=':
                    // '=' is only legal right after an attribute name
                    if (!hasAttribute)
                    {
                        goto case '@';
                    }

                    // Skip any spaces after
                    while (true)
                    {
                        c = NextChar();
                        if (!c.IsSpace())
                        {
                            break;
                        }
                    }

                    tempBuilder.Clear();

                    // The value belongs to the attribute that was added last
                    var attrIndex = tag.Attributes.Count - 1;
                    var attr      = tag.Attributes[attrIndex];

                    // Parse a quoted string
                    if (c == '\'' || c == '\"')
                    {
                        var openingStringChar = c;
                        while (true)
                        {
                            c = NextChar();
                            if (c == '\0')
                            {
                                // End of input inside the quoted value: the tag is invalid
                                goto exit;
                            }
                            if (c != openingStringChar)
                            {
                                tempBuilder.Append(c);
                            }
                            else
                            {
                                break;
                            }
                        }
                        // Move past the closing quote
                        c = NextChar();
                    }
                    else
                    {
                        // Parse until we match a space or a special html character
                        int matchCount = 0;
                        while (true)
                        {
                            if (c == '\0')
                            {
                                goto exit;
                            }
                            if (c.IsSpace() || c == '"' || c == '\'' || c == '=' || c == '<' || c == '>' || c == '`')
                            {
                                break;
                            }
                            matchCount++;
                            tempBuilder.Append(c);
                            c = NextChar();
                        }

                        // We need at least one char after '='
                        if (matchCount == 0)
                        {
                            errorContext = $"and after attribute [{attr.Name}]. Expecting valid character after '='";
                            goto case '@';
                        }
                    }

                    attr.Value = tempBuilder.ToString();
                    tempBuilder.Clear();

                    hasAttribute = false;
                    break;

                default:
                    // Parse the attribute name
                    if (!hasWhitespaces)
                    {
                        Error($"Invalid character '{c}' found while parsing <{tag.Name}>. Expecting a whitespace before an attribute");
                        // still try to recover from this error
                    }

                    // Attribute names must consist of one or more characters other than
                    // the space characters,
                    // U +0000 NULL,
                    // U +0022 QUOTATION MARK ("),
                    // U +0027 APOSTROPHE ('),
                    // U +003E GREATER-THAN SIGN (>),
                    // U +002F SOLIDUS (/),
                    // and U+003D EQUALS SIGN (=) characters,
                    // the control characters,
                    // and any characters that are not defined by Unicode.


                    if (!c.IsAttributeNameChar())
                    {
                        goto case '@';
                    }

                    tempBuilder.Clear();
                    tempBuilder.Append(c);

                    while (true)
                    {
                        c = NextChar();
                        if (c.IsAttributeNameChar())
                        {
                            tempBuilder.Append(c);
                        }
                        else
                        {
                            break;
                        }
                    }

                    hasAttribute = true;
                    // The attribute list is created on first use
                    if (tag.Attributes == null)
                    {
                        tag.Attributes = new List <HtmlAttribute>();
                    }
                    // The value stays null unless an '=' follows
                    tag.Attributes.Add(new HtmlAttribute(tempBuilder.ToString(), null));

                    tempBuilder.Clear();
                    break;
                }
            }

exit:

            if (isValid)
            {
                // TODO: Process stack and check if we need to close them
                // Each iteration either attaches the tag to the current parent (and stops),
                // or pops that parent off the stack and retries with the next one.
                while (true)
                {
                    var parent = CurrentParent;

                    // If the parent has an AcceptContent == Transparent
                    // We need to find a higher parent that is not transparent and use its ContentKind
                    var nonTransparentParent     = parent;
                    var nonTransparentDescriptor = parent.Descriptor;
                    while (nonTransparentDescriptor != null && nonTransparentDescriptor.AcceptContent == ContentKind.Transparent)
                    {
                        // NOTE(review): Parent is not checked for null here; this assumes a chain of transparent
                        // elements always ends on an element that has a parent - confirm.
                        nonTransparentParent     = nonTransparentParent.Parent;
                        nonTransparentDescriptor = HtmlTagDescriptor.Find(nonTransparentParent.Name);
                    }

                    // Descriptor used for the "does the parent accept this tag" check below
                    var finalParentDescriptor = parent.Descriptor;
                    var parentIsTransparent   = parent.Descriptor != null && parent.Descriptor.AcceptContent == ContentKind.Transparent;
                    if (parentIsTransparent)
                    {
                        finalParentDescriptor = nonTransparentDescriptor;
                    }

                    // - If the parent has no descriptor, we assume that it is a non-HTML tag but it accepts children
                    // - If parent has a descriptor and is not closable by a new tag
                    // - If parent is supporting omission tag but is not closed by current opening tag
                    var isParentClosableByNewTag = parent.Descriptor != null && parent.Descriptor.EndKind == TagEndKind.Omission;
                    if (parent.Descriptor == null || !isParentClosableByNewTag || !parent.Descriptor.CanOmitEndTag(parent, tag, true))
                    {
                        if (parent.Descriptor != null && !isParentClosableByNewTag && descriptor != null)
                        {
                            // Check if the parent accepts the tag
                            // we will emit a warning just in case
                            if (!descriptor.AcceptParent(finalParentDescriptor))
                            {
                                // If a new parent was created, we don't need to log a warning
                                // (parent is passed by ref, so the call may replace it)
                                if (!TryCreateOptionalStart(ref parent, tag))
                                {
                                    Warning(tag.Location, $"The tag <{tag.Name}> is not a valid tag within the parent tag <{parent.Name}>");
                                }
                            }
                        }

                        parent.AppendChild(tag);

                        // Only a tag that is not self closing and that has no descriptor, or whose descriptor
                        // accepts some content, becomes the new current parent
                        if ((tag.Descriptor == null || tag.Descriptor.AcceptContent != ContentKind.None || tag.Descriptor.AcceptContentTags != null) && tag.Kind != ElementKind.SelfClosing)
                        {
                            PushStack(tag);
                        }
                        break;
                    }

                    // This should not happen, so throw an error if we got there
                    // (the error is logged, not thrown, and the tag is left unattached)
                    if (stack.Count == 1)
                    {
                        Error(tag.Location, $"The tag <{tag.Name}> is not a valid tag within the parent tag <{parent.Name}>");
                        break;
                    }

                    // The new tag implicitly closes the current parent: pop it and retry one level up
                    PopStack();
                }

                // The content of SCRIPT and STYLE are considered as kind of "CDATA"
                // and are expecting to mach either a </script> or </style>
                // so we parse the content immediately here
                // NOTE(review): this also runs when the loop above ended on its error branch, where the tag was
                // not pushed; the PopStack below would then remove another element - confirm this cannot happen.
                if (tag.Kind != ElementKind.SelfClosing && (tag.Name.Equals("script", StringComparison.OrdinalIgnoreCase) || tag.Name.Equals("style", StringComparison.OrdinalIgnoreCase)))
                {
                    ParseScriptOrStyleContent(tag);

                    // Remove the <script> or <style> element from the stack as we have parse it
                    PopStack();
                }
            }
            else
            {
                // Invalid tag: rebuild the name as it is shown in the error message
                if (isProcessingInstruction)
                {
                    tagName = "?" + tagName + "?";
                }

                Error(c == 0
                    ? $"Invalid EOF found while parsing <{tagName}>"
                    : $"Invalid character '{c}' found while parsing <{tagName}>");

                // presumably keeps the raw characters read so far as plain text - verify against AppendText
                AppendText(startTagLocation, position - 1);
            }
        }