/// <summary>
/// Reads forward to the next '&gt;' and stores everything from the starting position
/// up to (but excluding) that '&gt;' as the doctype.
/// </summary>
/// <returns>
/// True when a terminating '&gt;' was found; false, with <c>parseError</c> set to
/// <c>NodeNotClosed</c>, when the input ended first.
/// </returns>
private bool TryParseDoctype()
{
    int firstIndex = currentIndex;
    int lastIndex = -1;

    // Scan for the terminating '>'; the doctype text ends on the character before it.
    while (ReadNext())
    {
        if (currentChar != '>')
        {
            continue;
        }

        lastIndex = currentIndex - 1;
        break;
    }

    if (lastIndex != -1)
    {
        int length = lastIndex - firstIndex + 1;
        doctype = new HtmlDoctype(text.Substring(firstIndex, length));
        ReadAndSkipWhitespace();
        return true;
    }

    // No end of doctype found, e.g. "<!DOCTYPE html"
    parseError = HtmlParseError.NodeNotClosed;
    return false;
}
/// <summary>
/// Publishes a parse error event built from the given error and position.
/// Does nothing when <c>_events</c> is null.
/// </summary>
/// <param name="error">The error whose code and message are reported.</param>
/// <param name="position">The position of the error.</param>
public void RaiseErrorOccurred(HtmlParseError error, TextPosition position)
{
    if (_events == null)
    {
        return;
    }

    _events.Publish(new HtmlParseErrorEvent(error.GetCode(), error.GetMessage(), position));
}
/// <summary>
/// Parses markup that starts with "&lt;!": a comment of the form "&lt;!--text--&gt;" or,
/// when no root element has been parsed yet, a doctype (delegated to <c>TryParseDoctype</c>).
/// On success the comment is added to the current element.
/// </summary>
/// <returns>
/// True if the comment was added (or the doctype parse succeeded); otherwise false,
/// with <c>parseError</c> describing the failure.
/// </returns>
private bool TryParseComment()
{
    ReadNext();
    if (currentChar != '-')
    {
        // Invalid char after '!', e.g. "<!a", but maybe is a doctype if we haven't parsed anything yet
        if (rootElement == null)
        {
            return TryParseDoctype();
        }

        parseError = HtmlParseError.InvalidCommentStart;
        return false;
    }

    if (!ReadNext() || currentChar != '-')
    {
        // Invalid char after '-', e.g. "<!-", "<!-a"
        parseError = HtmlParseError.InvalidCommentStart;
        return false;
    }

    int commentStartIndex = currentIndex + 1;
    int commentEndIndex = -1;

    // Count of consecutive '-' characters read immediately before the current character.
    // Tracking the run (instead of looking ahead three characters at a time) means a
    // terminator preceded by extra dashes, e.g. "<!--a--->", is still recognised.
    int dashRun = 0;
    while (ReadNext())
    {
        if (currentChar == '-')
        {
            dashRun++;
        }
        else if (currentChar == '>' && dashRun >= 2)
        {
            // The comment text ends before the two dashes that precede '>'.
            commentEndIndex = currentIndex - 3;
            break;
        }
        else
        {
            dashRun = 0;
        }
    }

    if (commentEndIndex == -1)
    {
        // No end of comment, e.g. "<!--", "<!--abc", "<!--abc-", "<!--abc-a", "<!--abc--", "<!--abc--a"
        parseError = HtmlParseError.InvalidCommentEnd;
        return false;
    }

    string comment = text.Substring(commentStartIndex, commentEndIndex - commentStartIndex + 1);
    if (currentElement == null)
    {
        // Comment on its own, e.g. "<!--comment-->"
        parseError = HtmlParseError.LoneComment;
        return false;
    }

    currentElement.Add(new HtmlComment(comment));
    ReadAndSkipWhitespace();
    return true;
}
/// <summary>
/// Reads an attribute name starting at the current position and lower-cases it
/// with <c>ToAsciiLower</c>.
/// </summary>
/// <param name="name">The lower-cased attribute name, or null on failure.</param>
/// <param name="isExtendedAttribute">True when the name is followed by '='.</param>
/// <param name="isFinalAttribute">True when the name is followed by '/' or '&gt;'.</param>
/// <returns>
/// True if the end of the name was found; otherwise false, with <c>parseError</c>
/// set to <c>OpeningTagNotClosedAfterAttribute</c>.
/// </returns>
private bool TryParseAttributeName(out string name, out bool isExtendedAttribute, out bool isFinalAttribute)
{
    name = null;
    isExtendedAttribute = false;
    isFinalAttribute = false;

    int firstCharIndex = currentIndex;
    int lastCharIndex = -1;
    bool whitespaceSkipped = false;

    while (ReadNext())
    {
        if (!whitespaceSkipped && char.IsWhiteSpace(currentChar))
        {
            // Whitespace ends the name; move on to whatever follows it.
            lastCharIndex = currentIndex - 1;
            ReadAndSkipWhitespace();
            whitespaceSkipped = true;
        }

        bool closesTag = currentChar == '/' || currentChar == '>';
        bool startsValue = !closesTag && currentChar == '=';

        if (closesTag || startsValue)
        {
            if (lastCharIndex == -1)
            {
                lastCharIndex = currentIndex - 1;
            }

            if (closesTag)
            {
                isFinalAttribute = true;
            }
            else
            {
                isExtendedAttribute = true;
                ReadAndSkipWhitespace();
            }

            break;
        }

        if (lastCharIndex != -1)
        {
            // Found another attribute
            break;
        }
    }

    if (lastCharIndex == -1)
    {
        parseError = HtmlParseError.OpeningTagNotClosedAfterAttribute;
        return false;
    }

    name = text.ToAsciiLower(firstCharIndex, lastCharIndex - firstCharIndex + 1);
    return true;
}
/// <summary>
/// Parses the name of a closing tag such as "&lt;/abc&gt;", checks it (ASCII, case-insensitive)
/// against the tag of the current element and, on a match, makes that element's parent
/// the current element.
/// </summary>
/// <returns>
/// True if the closing tag was terminated and matches the current element; otherwise false,
/// with <c>parseError</c> describing the failure.
/// </returns>
private bool TryParseClosingTag()
{
    ReadAndSkipWhitespace();
    int tagStartIndex = currentIndex;
    int tagEndIndex = -1;

    // Non-void closing tag, e.g. "<abc></abc>"
    while (ReadNext())
    {
        if (currentChar == '>')
        {
            // Found terminating '>'
            tagEndIndex = currentIndex - 1;
            break;
        }

        if (char.IsWhiteSpace(currentChar))
        {
            tagEndIndex = currentIndex - 1;
            ReadAndSkipWhitespace();
            if (currentChar == '>')
            {
                break;
            }

            // Can only have '>' after whitespace, e.g. "<abc></abc "
            parseError = HtmlParseError.ClosingTagNotClosed;
            return false;
        }
    }

    if (tagEndIndex == -1)
    {
        // Invalid closing tag, e.g. "<abc></", "<abc></abc", "<abc></abc "
        parseError = HtmlParseError.ClosingTagNotClosed;
        return false;
    }

    if (currentElement == null)
    {
        // Closing tag with no open element to close, e.g. "</abc>" or "<abc></abc></abc>".
        // Report it as a mismatch instead of dereferencing a null element below.
        parseError = HtmlParseError.ClosingTagDoesntMatchOpeningTag;
        return false;
    }

    int tagLength = tagEndIndex - tagStartIndex + 1;
    if (!StringExtensions.EqualsAsciiOrdinalIgnoreCase(currentElement.Tag, 0, currentElement.Tag.Length, text, tagStartIndex, tagLength))
    {
        // Non matching closing tag, e.g. "<abc></def>"
        parseError = HtmlParseError.ClosingTagDoesntMatchOpeningTag;
        return false;
    }

    currentElement = currentElement.Parent;
    ReadAndSkipWhitespace();
    return true;
}
/// <summary>
/// Collects any text between the current position and the next '&lt;', adds it to the
/// current element as an <c>HtmlText</c> node, then continues with <c>TryParseOpeningTag</c>.
/// </summary>
/// <returns>
/// False, with <c>parseError</c> set to <c>InvalidTextAfterNode</c>, when text is found
/// but there is no current element; otherwise the result of <c>TryParseOpeningTag</c>.
/// </returns>
private bool TryParseInnerText()
{
    int textStart = currentIndex;

    // Advance until we sit on a '<' or there is nothing left to read.
    while (currentChar != '<')
    {
        if (!ReadNext())
        {
            break;
        }
    }

    int textLength = currentIndex - textStart;
    if (textLength != 0)
    {
        if (currentElement == null)
        {
            // Just text, e.g. "<!DOCTYPE>a"
            parseError = HtmlParseError.InvalidTextAfterNode;
            return false;
        }

        string content = text.Substring(textStart, textLength);
        currentElement.Add(new HtmlText(content));
    }

    return TryParseOpeningTag();
}
/// <summary>
/// Reports an error at the position of the given token by forwarding it to the tokenizer.
/// </summary>
/// <param name="code">The associated error code.</param>
/// <param name="token">The token whose position is reported.</param>
void RaiseErrorOccurred(HtmlParseError code, HtmlToken token)
{
    var position = token.Position;
    _tokenizer.RaiseErrorOccurred(code, position);
}
/// <summary>
/// Reports an error at the position returned by <c>GetCurrentPosition</c>.
/// </summary>
/// <param name="code">The associated error code.</param>
public void RaiseErrorOccurred(HtmlParseError code)
{
    var position = GetCurrentPosition();
    RaiseErrorOccurred(code, position);
}
/// <summary>
/// Gets the numeric value of the given parse error.
/// </summary>
/// <param name="code">The parse error to convert.</param>
/// <returns>The enum value cast to a 32-bit integer.</returns>
public static Int32 GetCode(this HtmlParseError code)
{
    Int32 numericCode = (Int32)code;
    return numericCode;
}
/// <summary>
/// Reports a parse error. In strict mode an <c>HtmlParseException</c> is thrown;
/// otherwise the <c>Error</c> handler, if any, is invoked with an <c>HtmlErrorEvent</c>.
/// </summary>
/// <param name="code">The associated error code.</param>
/// <param name="position">The position of the error.</param>
internal void RaiseErrorOccurred(HtmlParseError code, TextPosition position)
{
    // Take a snapshot of the handler before doing anything else.
    var listeners = Error;

    if (IsStrictMode)
    {
        throw new HtmlParseException(code.GetCode(), "Error while parsing the provided HTML document.", position);
    }

    if (listeners == null)
    {
        return;
    }

    listeners.Invoke(this, new HtmlErrorEvent(code, position));
}
/// <summary>
/// Creates a new error event carrying the given parse error and source position.
/// </summary>
/// <param name="code">The provided error code.</param>
/// <param name="position">The position in the source.</param>
public HtmlErrorEvent(HtmlParseError code, TextPosition position)
    : base(EventNames.Error)
{
    _position = position;
    _code = code;
}
/// <summary>
/// Creates a new error event carrying the given parse error and source position.
/// </summary>
/// <param name="code">The provided error code.</param>
/// <param name="position">The position in the source.</param>
public HtmlErrorEvent(HtmlParseError code, TextPosition position)
    : base(EventNames.ParseError)
{
    _position = position;
    _code = code;
}
/// <summary>
/// Parses a single attribute: its name and, when the name is followed by '=', its value.
/// The value may be wrapped in single quotes, wrapped in double quotes, or unquoted
/// (in which case its first character must satisfy <c>IsLetter</c>).
/// </summary>
/// <param name="attribute">The parsed attribute, or null when the name or value could not be parsed.</param>
/// <returns>
/// True if the attribute was parsed. For quoted values the result is that of the final
/// <c>ReadAndSkipWhitespace</c> call made after the closing quote.
/// </returns>
private bool TryParseAttribute(out HtmlAttribute attribute)
{
    attribute = null;

    string name;
    bool hasValue;
    bool endsTag;
    if (!TryParseAttributeName(out name, out hasValue, out endsTag))
    {
        // Invalid attribute, e.g. "<abc attribute"
        return false;
    }

    if (endsTag || !hasValue)
    {
        // Attribute without a value
        attribute = new HtmlAttribute(name);
        return true;
    }

    bool singleQuoted = currentChar == '\'';
    bool doubleQuoted = currentChar == '"';
    bool unquoted = IsLetter(currentChar);
    if (!(singleQuoted || doubleQuoted || unquoted))
    {
        // Invalid character after equals, e.g. "<abc attribute=" or "<abc attribute=!"
        parseError = HtmlParseError.NoAttributeValue;
        return false;
    }

    bool quoted = singleQuoted || doubleQuoted;
    char closingQuote = singleQuoted ? '\'' : '"';

    // A quoted value starts after the opening quote; an unquoted one starts right here.
    int valueStart = currentIndex;
    if (!unquoted)
    {
        valueStart++;
    }

    int valueEnd = -1;
    while (ReadNext())
    {
        if (quoted)
        {
            if (currentChar == closingQuote)
            {
                valueEnd = currentIndex - 1;
                break;
            }

            continue;
        }

        bool sawWhitespace = char.IsWhiteSpace(currentChar);
        if (sawWhitespace)
        {
            valueEnd = currentIndex - 1;
            ReadAndSkipWhitespace();
        }

        if (currentChar == '/' || currentChar == '>')
        {
            if (valueEnd == -1)
            {
                valueEnd = currentIndex - 1;
            }

            break;
        }

        if (sawWhitespace)
        {
            // Found another attribute
            break;
        }
    }

    if (valueEnd == -1)
    {
        // No end of attribute value, e.g. "<abc attribute=", "<abc attribute=', "<abc attribute="a, "<abc attribute='a or , "<abc attribute=a
        parseError = HtmlParseError.OpeningTagNotClosedAfterAttribute;
        return false;
    }

    int valueLength = valueEnd - valueStart + 1;
    attribute = new HtmlAttribute(name, text.Substring(valueStart, valueLength));

    if (unquoted)
    {
        return true;
    }

    // Step past the closing quote and any whitespace that follows it.
    return ReadAndSkipWhitespace();
}
/// <summary>
/// Parses the markup that starts at the current '&lt;': a closing tag, a comment or doctype,
/// or an opening tag with its attributes. For an opening tag a new element is created and
/// handed to <c>SetParsing</c>, after which parsing continues with the inner text where required.
/// </summary>
/// <returns>
/// True if parsing from this position succeeded; otherwise false, with <c>parseError</c>
/// set by this method or by the helper that failed.
/// </returns>
public bool TryParseOpeningTag()
{
    if (currentChar != '<')
    {
        // No opening tag, e.g. "abc", "<abc>", "<abc> " or "<abc>InnerText"
        parseError = HtmlParseError.NoOpeningTag;
        return false;
    }

    ReadAndSkipWhitespace();

    if (currentChar == '/')
    {
        if (!TryParseClosingTag())
        {
            return false;
        }

        // If the element we just finished parsing has a parent, that parent needs to have a closing tag too, so keep parsing.
        // If we have more text, then there may be more to parse.
        bool moreToParse = currentElement != null || currentIndex + 1 < text.Length;
        if (!moreToParse)
        {
            // Finished parsing
            return true;
        }

        return TryParseInnerText();
    }

    if (currentChar == '!')
    {
        if (!TryParseComment())
        {
            // Couldn't parse the comment
            return false;
        }

        if (currentIndex + 1 >= text.Length)
        {
            // Doctype or comment on its own, e.g. "<!--comment-->" or "<div><!--comment-->"
            parseError = HtmlParseError.LoneDoctype;
            return false;
        }

        // Got more to parse
        return TryParseInnerText();
    }

    if (!IsLetter(currentChar))
    {
        // No valid tag, e.g. "<>", "<1"
        parseError = HtmlParseError.InvalidTag;
        return false;
    }

    int nameStart = currentIndex;
    int nameEnd = -1;
    bool tagTerminated = false;
    while (ReadNext())
    {
        tagTerminated = currentChar == '/' || currentChar == '>';
        if (tagTerminated || char.IsWhiteSpace(currentChar))
        {
            nameEnd = currentIndex - 1;
            break;
        }
    }

    if (nameEnd == -1)
    {
        // No end of tag, e.g. "<abc", "<abc "
        parseError = HtmlParseError.OpeningTagNotClosed;
        return false;
    }

    string tag = text.ToAsciiLower(nameStart, nameEnd - nameStart + 1);

    HtmlObjectLinkedList<HtmlAttribute> attributes = null;
    if (!tagTerminated && !TryParseAttributes(out attributes))
    {
        // Could not parse attributes
        return false;
    }

    // A '/' at this point marks a void element, which must be followed by '>'.
    bool isVoid = currentChar == '/';
    if (isVoid)
    {
        ReadAndSkipWhitespace();
        if (currentChar != '>')
        {
            // No end of void tag, e.g. "<abc/", "<abc/a>"
            parseError = HtmlParseError.NodeNotClosed;
            return false;
        }
    }

    if (!isVoid || currentElement != null)
    {
        // Non-void elements always read on; void elements only when they are a child.
        ReadAndSkipWhitespace();
    }

    HtmlElement element;
    bool startsDocument = rootElement == null && tag == "html" && !isVoid;
    if (startsDocument)
    {
        element = new HtmlDocument() { Doctype = doctype };
    }
    else if (rootElement == null && parsingDocument)
    {
        // First tag of a document has to be an open html tag
        parseError = HtmlParseError.FirstElementInDocumentNotHtml;
        return false;
    }
    else
    {
        element = new HtmlElement(tag, isVoid);
    }

    if (attributes != null)
    {
        element._attributes = attributes;
    }

    SetParsing(element);

    if (element.IsVoid && element.Parent == null)
    {
        bool inputExhausted = !ReadNext() || (char.IsWhiteSpace(currentChar) && !ReadAndSkipWhitespace());
        if (inputExhausted)
        {
            // Valid void element without a parent, e.g. "<abc/>", "<abc/> "
            return true;
        }

        // Invalid text after a void element without a parent, e.g. "<abc/>a"
        parseError = HtmlParseError.InvalidTextAfterNode;
        return false;
    }

    // If the element has a parent, we need to make sure the parent has a closing tag.
    // If the element has no parent, but is non-void, we also need to make sure the element has a closing tag.
    bool needsClosingTag = element.Parent != null || !element.IsVoid;
    if (!needsClosingTag)
    {
        return true;
    }

    // Fails when the inner text cannot be parsed, e.g. "<abc>Inner<def></def>"
    return TryParseInnerText();
}
/// <summary>
/// Creates a new error event carrying the given parse error and source position.
/// </summary>
/// <param name="code">The provided error code.</param>
/// <param name="position">The position in the source.</param>
public HtmlErrorEvent(HtmlParseError code, TextPosition position)
{
    _position = position;
    _code = code;
}