/// <summary>
/// Creates the element for a start tag that is currently being parsed.
/// </summary>
/// <param name="tagName">The tag name as it was read from the input.</param>
/// <returns>
/// A new <c>HtmlElement</c> located at <c>startTagLocation</c>, of kind
/// <c>ElementKind.OpeningClosing</c>, carrying the descriptor returned by
/// <c>HtmlTagDescriptor.Find</c> (which may be null).
/// </returns>
HtmlElement GetHtmlElement(string tagName)
{
    var knownTag = HtmlTagDescriptor.Find(tagName);

    // Use the descriptor's name when there is one, otherwise keep the name as written.
    var elementName = knownTag?.Name ?? tagName;

    var element = new HtmlElement(elementName);
    element.Location = startTagLocation;
    element.Descriptor = knownTag;
    element.Kind = ElementKind.OpeningClosing;
    return element;
}
/// <summary>
/// Removes every element from the top of <c>stack</c> down to, and including,
/// the element at <paramref name="indexOfOpenTag"/>. A warning is logged for each
/// intermediate element (above <paramref name="indexOfOpenTag"/>) that is being
/// force-closed although it requires an explicit closing tag.
/// </summary>
/// <param name="indexOfOpenTag">Index in <c>stack</c> of the matching open tag.</param>
/// <param name="span">Not read by this method.</param>
/// <param name="parentTag">Name of the tag being closed; only used in the warning text.</param>
void CloseTags(int indexOfOpenTag, SourceLocation span, string parentTag)
{
    var index = stack.Count - 1;
    while (index >= indexOfOpenTag)
    {
        var current = stack[index];
        var currentDescriptor = HtmlTagDescriptor.Find(current.Name);

        // Only elements strictly above the matching open tag can be "unbalanced"
        if (index > indexOfOpenTag)
        {
            bool requiresClosingTag;
            if (currentDescriptor == null)
            {
                // Unknown tag: treated as needing its own closing tag
                requiresClosingTag = true;
            }
            else if (currentDescriptor.AcceptContent == ContentKind.None)
            {
                // No content accepted: nothing to balance
                requiresClosingTag = false;
            }
            else if (currentDescriptor.EndKind != TagEndKind.Omission)
            {
                requiresClosingTag = true;
            }
            else
            {
                requiresClosingTag = !currentDescriptor.CanOmitEndTag(current, null, true);
            }

            if (requiresClosingTag)
            {
                Warning(current.Location, $"Unbalanced tag [{current.Name}] within tag [{parentTag}] requiring a closing tag. Force closing it");
            }
        }

        stack.RemoveAt(index);
        index--;
    }
}
/// <summary>
/// Parses an end tag whose first name character is already in <c>c</c>.
/// When the name is followed by '&gt;' the matching open element is looked up in
/// <c>stack</c> and closed through <c>CloseTags</c>; if none is found, a stray closing
/// element is appended to <c>CurrentParent</c> and a warning is logged.
/// Any other terminator is reported through <c>Error</c> and
/// <c>AppendText(startTagLocation, position - 1)</c> is called.
/// </summary>
void TryProcessEndTag()
{
    // Gather the tag name: the current character plus every following name character
    tempBuilder.Clear();
    tempBuilder.Append(c);
    for (c = NextChar(); c.IsAlphaNumeric() || c == '_' || c == ':' || c == '.' || c == '-'; c = NextChar())
    {
        tempBuilder.Append(c);
    }

    // Spaces are tolerated between the name and the closing '>'
    while (c.IsSpace())
    {
        c = NextChar();
    }

    var tagName = tempBuilder.ToString().ToLowerInvariant();

    if (c != '>')
    {
        Error(c == 0 ? $"Invalid EOF found while parsing </{tagName}>" : $"Invalid character '{c}' found while parsing </{tagName}>");
        AppendText(startTagLocation, position - 1);
    }
    else
    {
        c = NextChar();

        // Search the stack from the innermost element outwards for the matching open tag
        var openIndex = stack.Count - 1;
        while (openIndex >= 0 && !string.Equals(stack[openIndex].Name, tagName, StringComparison.OrdinalIgnoreCase))
        {
            openIndex--;
        }

        if (openIndex < 0)
        {
            // Closing tag without an opening tag: keep it in the tree and warn
            // (that should be an error, but we assume we can recover from it)
            var descriptor = HtmlTagDescriptor.Find(tagName);
            var strayTag = new HtmlElement(tagName)
            {
                Location = startTagLocation,
                Descriptor = descriptor,
                Kind = ElementKind.Closing
            };
            CurrentParent.AppendChild(strayTag);

            // With a single element on the stack, attempt to create the missing structure
            if (stack.Count == 1)
            {
                var parent = CurrentParent;
                TryCreateOptionalStart(ref parent, strayTag);
            }

            var isAutoSelfClosing = descriptor != null && descriptor.EndKind == TagEndKind.AutoSelfClosing;
            if (!isAutoSelfClosing)
            {
                Warning(startTagLocation, $"Unable to find opening tag for closing tag </{tagName}>");
            }
            else
            {
                strayTag.Kind = ElementKind.SelfClosing;
                Warning(startTagLocation, $"Invalid end tag </{tagName}> used instead of self closing tag <{tagName}/> or <{tagName}>");
            }
        }
        else
        {
            CloseTags(openIndex, startTagLocation, tagName);
        }
    }

    tempBuilder.Clear();
}
// Parses a start tag (or a "<?" processing instruction) whose first character after '<'
// is already in `c`.
//
// Steps, in order:
//   1. Read the tag name into tempBuilder and build the element through GetHtmlElement.
//   2. Parse attributes and the tag terminator in a switch-driven loop. `case '@'` doubles
//      as the shared error handler: other cases jump to it with `goto case '@'`, it reports
//      the error and replaces the offending character so that parsing can continue.
//   3. At the `exit` label: when the tag was terminated correctly (isValid), attach it to a
//      parent (popping parents whose end tag may be omitted), push it on the stack if it can
//      hold content, and parse <script>/<style> content immediately. Otherwise report an
//      error and call AppendText(startTagLocation, position - 1).
void TryProcessStartTag()
{
    // https://www.w3.org/TR/html-markup/syntax.html#syntax-elements
    // start tags consist of the following parts, in exactly the following order:
    //
    // A "<" character.
    // The element’s tag name.
    //   tag names are used within element start tags and end tags to give the element’s name.
    //   HTML elements all have names that only use characters in the range 0–9, a–z, and A–Z.
    // Optionally, one or more attributes, each of which must be preceded by one or more space characters.
    // Optionally, one or more space characters.
    // Optionally, a "/" character, which may be present only if the element is a void element.
    // A ">" character.
    tempBuilder.Clear();

    // A leading '?' marks a processing instruction; it is not part of the name
    var isProcessingInstruction = false;
    if (c == '?')
    {
        isProcessingInstruction = true;
    }
    else
    {
        tempBuilder.Append(c);
    }

    // Accumulate the remaining characters of the tag name
    while (true)
    {
        c = NextChar();

        // TODO: not entirely correct for <? as we should only test for Alpha for the first char
        if (c.IsTagChar() || c == '_' || c == ':' || c == '.' || c == '-') // Plus some special characters not supported by default HTML but used and supported by browsers
        {
            tempBuilder.Append(c);
        }
        else
        {
            break;
        }
    }

    var tagName = tempBuilder.ToString();
    var tag = GetHtmlElement(tagName);
    var descriptor = tag.Descriptor;

    // Check that the processing instruction is valid: a name that has an HTML descriptor
    // cannot be used as a processing instruction, so it is demoted to a regular tag
    if (isProcessingInstruction)
    {
        if (descriptor == null)
        {
            tag.Kind = ElementKind.ProcessingInstruction;
        }
        else
        {
            Error(startTagLocation, $"The HTML tag [{tagName}] cannot start with a processing instruction <?{tagName}...>");
            isProcessingInstruction = false;
        }
    }

    // If an element is self-closing, set it up as such by default
    if (descriptor != null && descriptor.EndKind == TagEndKind.AutoSelfClosing)
    {
        tag.Kind = ElementKind.SelfClosing;
    }

    // NOTE(review): GetHtmlElement already assigned Descriptor; this second lookup by
    // tag.Name looks redundant - confirm before removing.
    tag.Descriptor = HtmlTagDescriptor.Find(tag.Name);
    tempBuilder.Clear();

    // hasAttribute: an attribute name was just parsed and may still receive "=value"
    // isValid: set only when the tag was closed by a valid terminator
    // errorContext: optional extra text appended to the next error reported by case '@'
    bool hasAttribute = false;
    bool isValid = false;
    var errorContext = string.Empty;
    while (true)
    {
        var hasWhitespaces = false;

        // Skip any whitespaces
        while (c.IsSpace())
        {
            c = NextChar();
            hasWhitespaces = true;
        }

        switch (c)
        {
            case '\0':
                // End of input inside the tag: leave with isValid == false
                goto exit;

            case '@':
                // Shared error handler (also reached through `goto case '@'`).
                // Try to continue parsing the tag even if we have an error
                // We may be able to recover from it
                var postText = (string.IsNullOrEmpty(errorContext) ? string.Empty : " " + errorContext);
                Error($"Invalid character '{c}' found while parsing <{tag.Name}>{postText}");

                // Fake a whitespace instead, except for a '>' on a regular tag,
                // which is kept so that the next iteration can close the tag
                c = !isProcessingInstruction && c == '>' ? '>' : ' ';
                errorContext = null;
                break;

            case '?':
                // "?>" terminates a processing instruction; anything else is an error
                if (isProcessingInstruction)
                {
                    c = NextChar();
                    if (c == '>')
                    {
                        c = NextChar();
                        isValid = true;
                        goto exit;
                    }
                }
                goto case '@';

            case '>':
                // '>' terminates a regular tag; inside a processing instruction it is an error
                if (!isProcessingInstruction)
                {
                    c = NextChar();
                    isValid = true;
                    goto exit;
                }
                goto case '@';

            case '/':
                // "/>" terminates a regular tag and marks it as self-closing
                c = NextChar();
                if (c == '>' && !isProcessingInstruction)
                {
                    tag.Kind = ElementKind.SelfClosing;
                    c = NextChar();
                    isValid = true;
                    goto exit;
                }
                goto case '@';

            case '=':
                // '=' is only valid right after an attribute name
                if (!hasAttribute)
                {
                    goto case '@';
                }

                // Skip any spaces after
                while (true)
                {
                    c = NextChar();
                    if (!c.IsSpace())
                    {
                        break;
                    }
                }

                tempBuilder.Clear();

                // The value belongs to the most recently added attribute
                var attrIndex = tag.Attributes.Count - 1;
                var attr = tag.Attributes[attrIndex];

                // Parse a quoted string
                if (c == '\'' || c == '\"')
                {
                    var openingStringChar = c;
                    while (true)
                    {
                        c = NextChar();
                        if (c == '\0')
                        {
                            // Unterminated quoted value: leave with isValid == false
                            goto exit;
                        }
                        if (c != openingStringChar)
                        {
                            tempBuilder.Append(c);
                        }
                        else
                        {
                            break;
                        }
                    }

                    // Move past the closing quote
                    c = NextChar();
                }
                else
                {
                    // Parse until we match a space or a special html character
                    int matchCount = 0;
                    while (true)
                    {
                        if (c == '\0')
                        {
                            goto exit;
                        }
                        if (c.IsSpace() || c == '"' || c == '\'' || c == '=' || c == '<' || c == '>' || c == '`')
                        {
                            break;
                        }
                        matchCount++;
                        tempBuilder.Append(c);
                        c = NextChar();
                    }

                    // We need at least one char after '='
                    if (matchCount == 0)
                    {
                        errorContext = $"and after attribute [{attr.Name}]. Expecting valid character after '='";
                        goto case '@';
                    }
                }

                attr.Value = tempBuilder.ToString();
                tempBuilder.Clear();
                hasAttribute = false;
                break;

            default:
                // Parse the attribute name
                if (!hasWhitespaces)
                {
                    Error($"Invalid character '{c}' found while parsing <{tag.Name}>. Expecting a whitespace before an attribute");
                    // still try to recover from this error
                }

                // Attribute names must consist of one or more characters other than
                // the space characters,
                // U+0000 NULL,
                // U+0022 QUOTATION MARK ("),
                // U+0027 APOSTROPHE ('),
                // U+003E GREATER-THAN SIGN (>),
                // U+002F SOLIDUS (/),
                // and U+003D EQUALS SIGN (=) characters,
                // the control characters,
                // and any characters that are not defined by Unicode.
                if (!c.IsAttributeNameChar())
                {
                    goto case '@';
                }

                tempBuilder.Clear();
                tempBuilder.Append(c);

                while (true)
                {
                    c = NextChar();
                    if (c.IsAttributeNameChar())
                    {
                        tempBuilder.Append(c);
                    }
                    else
                    {
                        break;
                    }
                }

                // Register the attribute with a null value; a following '=' fills it in
                hasAttribute = true;
                if (tag.Attributes == null)
                {
                    tag.Attributes = new List <HtmlAttribute>();
                }
                tag.Attributes.Add(new HtmlAttribute(tempBuilder.ToString(), null));
                tempBuilder.Clear();
                break;
        }
    }

    exit:
    if (isValid)
    {
        // TODO: Process stack and check if we need to close them
        while (true)
        {
            var parent = CurrentParent;

            // If the parent has an AcceptContent == Transparent
            // We need to find a higher parent that is not transparent and use its ContentKind
            var nonTransparentParent = parent;
            var nonTransparentDescriptor = parent.Descriptor;
            while (nonTransparentDescriptor != null && nonTransparentDescriptor.AcceptContent == ContentKind.Transparent)
            {
                // NOTE(review): assumes a transparent element always has a Parent - confirm
                nonTransparentParent = nonTransparentParent.Parent;
                nonTransparentDescriptor = HtmlTagDescriptor.Find(nonTransparentParent.Name);
            }

            var finalParentDescriptor = parent.Descriptor;
            var parentIsTransparent = parent.Descriptor != null && parent.Descriptor.AcceptContent == ContentKind.Transparent;
            if (parentIsTransparent)
            {
                finalParentDescriptor = nonTransparentDescriptor;
            }

            // The tag is attached to the current parent in any of these cases:
            // - If the parent has no descriptor, we assume that it is a non-HTML tag but it accepts children
            // - If parent has a descriptor and is not closable by a new tag
            // - If parent is supporting omission tag but is not closed by current opening tag
            var isParentClosableByNewTag = parent.Descriptor != null && parent.Descriptor.EndKind == TagEndKind.Omission;
            if (parent.Descriptor == null || !isParentClosableByNewTag || !parent.Descriptor.CanOmitEndTag(parent, tag, true))
            {
                if (parent.Descriptor != null && !isParentClosableByNewTag && descriptor != null)
                {
                    // Check if the parent accepts the tag
                    // we will emit a warning just in case
                    if (!descriptor.AcceptParent(finalParentDescriptor))
                    {
                        // If a new parent was created, we don't need to log a warning
                        if (!TryCreateOptionalStart(ref parent, tag))
                        {
                            Warning(tag.Location, $"The tag <{tag.Name}> is not a valid tag within the parent tag <{parent.Name}>");
                        }
                    }
                }

                parent.AppendChild(tag);

                // Only elements that can hold content and are not self-closing become the new current parent
                if ((tag.Descriptor == null || tag.Descriptor.AcceptContent != ContentKind.None || tag.Descriptor.AcceptContentTags != null) && tag.Kind != ElementKind.SelfClosing)
                {
                    PushStack(tag);
                }
                break;
            }

            // This should not happen, so report an error and stop if we got there
            if (stack.Count == 1)
            {
                Error(tag.Location, $"The tag <{tag.Name}> is not a valid tag within the parent tag <{parent.Name}>");
                break;
            }

            // The new tag implicitly closes the current parent: pop it and retry one level up
            PopStack();
        }

        // The content of SCRIPT and STYLE are considered as kind of "CDATA"
        // and are expecting to match either a </script> or </style>
        // so we parse the content immediately here
        if (tag.Kind != ElementKind.SelfClosing && (tag.Name.Equals("script", StringComparison.OrdinalIgnoreCase) || tag.Name.Equals("style", StringComparison.OrdinalIgnoreCase)))
        {
            ParseScriptOrStyleContent(tag);

            // Remove the <script> or <style> element from the stack as we have parsed it
            PopStack();
        }
    }
    else
    {
        // Invalid tag: rebuild the name as written for the error message
        if (isProcessingInstruction)
        {
            tagName = "?" + tagName + "?";
        }

        Error(c == 0 ? $"Invalid EOF found while parsing <{tagName}>" : $"Invalid character '{c}' found while parsing <{tagName}>");

        // NOTE(review): presumably emits the consumed characters as plain text - confirm in AppendText
        AppendText(startTagLocation, position - 1);
    }
}