Ejemplo n.º 1
0
            // Reads a doctype declaration up to its terminating '>' and stores the text that
            // precedes the '>' (starting at the index current on entry) in the doctype field.
            // Returns true when the '>' was found; otherwise returns false with parseError
            // set to HtmlParseError.NodeNotClosed.
            private bool TryParseDoctype()
            {
                int firstIndex = currentIndex;
                int lastIndex  = -1;

                while (ReadNext())
                {
                    if (currentChar != '>')
                    {
                        continue;
                    }
                    // The doctype text stops just before the terminating '>'
                    lastIndex = currentIndex - 1;
                    break;
                }
                if (lastIndex == -1)
                {
                    // ReadNext gave up before a '>' was seen, e.g. "<!DOCTYPE html"
                    parseError = HtmlParseError.NodeNotClosed;
                    return false;
                }

                int length = lastIndex - firstIndex + 1;

                doctype = new HtmlDoctype(text.Substring(firstIndex, length));
                ReadAndSkipWhitespace();
                return true;
            }
Ejemplo n.º 2
0
 /// <summary>
 /// Publishes a parse error event through <c>_events</c>; does nothing when
 /// <c>_events</c> is null.
 /// </summary>
 /// <param name="error">The associated error code.</param>
 /// <param name="position">The position of the error.</param>
 public void RaiseErrorOccurred(HtmlParseError error, TextPosition position)
 {
     if (_events == null)
     {
         return;
     }

     var code    = error.GetCode();
     var message = error.GetMessage();

     _events.Publish(new HtmlParseErrorEvent(code, message, position));
 }
Ejemplo n.º 3
0
            // Parses whatever follows "<!". When the next character is not '-' and no root
            // element has been set yet, parsing is handed over to TryParseDoctype; otherwise a
            // "<!--text-->" comment is read and added to the current element.
            // Returns true on success. On failure returns false with parseError set to
            // InvalidCommentStart, InvalidCommentEnd or LoneComment (or to whatever
            // TryParseDoctype reported).
            // NOTE(review): a terminator preceded by an extra dash ("--->") looks like it is
            // missed, because the third '-' is consumed while testing for '>' - confirm
            // against ReadNext semantics.
            private bool TryParseComment()
            {
                ReadNext();
                if (currentChar != '-')
                {
                    if (rootElement != null)
                    {
                        // Invalid char after '!', e.g. "<!a", once something was already parsed
                        parseError = HtmlParseError.InvalidCommentStart;
                        return false;
                    }
                    // Nothing parsed so far, so this may still be a doctype
                    return TryParseDoctype();
                }

                bool secondDashFound = ReadNext() && currentChar == '-';
                if (!secondDashFound)
                {
                    // Only a single dash after '!', e.g. "<!-", "<!-a"
                    parseError = HtmlParseError.InvalidCommentStart;
                    return false;
                }

                int textStart = currentIndex + 1;
                int textEnd   = -1;

                while (ReadNext())
                {
                    // Short-circuit evaluation keeps the reads in the order '-', '-', '>'
                    bool terminatorFound = currentChar == '-'
                                           && ReadNext() && currentChar == '-'
                                           && ReadNext() && currentChar == '>';
                    if (terminatorFound)
                    {
                        // currentIndex sits on '>', so the text ends three characters earlier
                        textEnd = currentIndex - 3;
                        break;
                    }
                }
                if (textEnd == -1)
                {
                    // Unterminated comment, e.g. "<!--", "<!--abc", "<!--abc-", "<!--abc--", "<!--abc--a"
                    parseError = HtmlParseError.InvalidCommentEnd;
                    return false;
                }

                int    textLength  = textEnd - textStart + 1;
                string commentText = text.Substring(textStart, textLength);

                if (currentElement == null)
                {
                    // No element to attach the comment to, e.g. "<!--comment-->"
                    parseError = HtmlParseError.LoneComment;
                    return false;
                }
                currentElement.Add(new HtmlComment(commentText));
                ReadAndSkipWhitespace();
                return true;
            }
Ejemplo n.º 4
0
            // Reads an attribute name starting at the current index and lower-cases it via
            // text.ToAsciiLower.
            //   name                - receives the name, or null on failure.
            //   isExtendedAttribute - set to true when the name is followed by '='.
            //   isFinalAttribute    - set to true when the name is followed by '/' or '>'.
            // Returns false with parseError set to OpeningTagNotClosedAfterAttribute when the
            // loop ends without ever determining where the name stops.
            private bool TryParseAttributeName(out string name, out bool isExtendedAttribute, out bool isFinalAttribute)
            {
                name                = null;
                isExtendedAttribute = false;
                isFinalAttribute    = false;

                int  firstIndex        = currentIndex;
                int  lastIndex         = -1;
                bool whitespaceSkipped = false;

                while (ReadNext())
                {
                    if (!whitespaceSkipped && char.IsWhiteSpace(currentChar))
                    {
                        // The name ends right before the first whitespace character
                        lastIndex = currentIndex - 1;
                        ReadAndSkipWhitespace();
                        whitespaceSkipped = true;
                    }

                    bool endsTag     = currentChar == '/' || currentChar == '>';
                    bool startsValue = !endsTag && currentChar == '=';

                    if (endsTag || startsValue)
                    {
                        if (lastIndex == -1)
                        {
                            // No whitespace was seen, so the name ends right before this character
                            lastIndex = currentIndex - 1;
                        }
                        if (endsTag)
                        {
                            isFinalAttribute = true;
                        }
                        else
                        {
                            isExtendedAttribute = true;
                            ReadAndSkipWhitespace();
                        }
                        break;
                    }
                    if (lastIndex != -1)
                    {
                        // The name already ended and something else follows: another attribute
                        break;
                    }
                }
                if (lastIndex == -1)
                {
                    parseError = HtmlParseError.OpeningTagNotClosedAfterAttribute;
                    return false;
                }

                name = text.ToAsciiLower(firstIndex, lastIndex - firstIndex + 1);
                return true;
            }
Ejemplo n.º 5
0
            // Parses a closing tag such as "</abc>" and, when its name matches the tag of the
            // current element (ASCII, case-insensitive), makes that element's parent the
            // current element.
            // Returns true on success. On failure returns false with parseError set to:
            //   ClosingTagNotClosed             - no terminating '>' was found, or something
            //                                     other than '>' follows whitespace after the name;
            //   ClosingTagDoesntMatchOpeningTag - the name differs from the current element's
            //                                     tag, or there is no current element at all.
            private bool TryParseClosingTag()
            {
                ReadAndSkipWhitespace();
                int tagStartIndex = currentIndex;
                int tagEndIndex   = -1;

                // Non-void closing tag, e.g. "<abc></abc>"
                while (ReadNext())
                {
                    if (currentChar == '>')
                    {
                        // Found terminating '>'
                        tagEndIndex = currentIndex - 1;
                        break;
                    }
                    else if (char.IsWhiteSpace(currentChar))
                    {
                        tagEndIndex = currentIndex - 1;
                        ReadAndSkipWhitespace();
                        if (currentChar == '>')
                        {
                            break;
                        }
                        else
                        {
                            // Can only have '>' after whitespace, e.g. <abc></abc "
                            parseError = HtmlParseError.ClosingTagNotClosed;
                            return(false);
                        }
                    }
                }
                if (tagEndIndex == -1)
                {
                    // Invalid closing tag, e.g. "<abc></", "<abc></abc", "<abc></abc  "
                    parseError = HtmlParseError.ClosingTagNotClosed;
                    return(false);
                }
                if (currentElement == null)
                {
                    // Closing tag with no open element to close, e.g. "</abc>" or
                    // "<abc></abc></abc>". Report a parse error instead of dereferencing null below.
                    parseError = HtmlParseError.ClosingTagDoesntMatchOpeningTag;
                    return(false);
                }
                int tagLength = tagEndIndex - tagStartIndex + 1;

                if (!StringExtensions.EqualsAsciiOrdinalIgnoreCase(currentElement.Tag, 0, currentElement.Tag.Length, text, tagStartIndex, tagLength))
                {
                    // Non matching closing tag, e.g. "<abc></def>"
                    parseError = HtmlParseError.ClosingTagDoesntMatchOpeningTag;
                    return(false);
                }
                // The element is closed, so continue with its parent as the current element
                currentElement = currentElement.Parent;
                ReadAndSkipWhitespace();
                return(true);
            }
Ejemplo n.º 6
0
            // Collects the text between the current index and the next '<', adds it to the
            // current element as an HtmlText node when it is non-empty, and then continues
            // with TryParseOpeningTag.
            // Returns false with parseError set to InvalidTextAfterNode when text is found
            // while there is no current element; otherwise returns the result of
            // TryParseOpeningTag.
            private bool TryParseInnerText()
            {
                int startIndex = currentIndex;

                // Advance until a '<' is current or ReadNext reports that it could not advance
                while (currentChar != '<')
                {
                    if (!ReadNext())
                    {
                        break;
                    }
                }

                int length = currentIndex - startIndex;

                if (length == 0)
                {
                    return TryParseOpeningTag();
                }
                if (currentElement == null)
                {
                    // Text without an enclosing element, e.g. "<!DOCTYPE>a"
                    parseError = HtmlParseError.InvalidTextAfterNode;
                    return false;
                }

                currentElement.Add(new HtmlText(text.Substring(startIndex, length)));
                return TryParseOpeningTag();
            }
Ejemplo n.º 7
0
 /// <summary>
 /// Forwards an error to the tokenizer, using the position of the given token.
 /// </summary>
 /// <param name="code">The associated error code.</param>
 /// <param name="token">The associated token; its <c>Position</c> is reported.</param>
 void RaiseErrorOccurred(HtmlParseError code, HtmlToken token)
 {
     var position = token.Position;

     _tokenizer.RaiseErrorOccurred(code, position);
 }
Ejemplo n.º 8
0
 /// <summary>
 /// Raises an error for the position returned by <c>GetCurrentPosition</c>.
 /// </summary>
 /// <param name="code">The associated error code.</param>
 public void RaiseErrorOccurred(HtmlParseError code)
 {
     var position = GetCurrentPosition();

     RaiseErrorOccurred(code, position);
 }
Ejemplo n.º 9
0
 /// <summary>
 /// Converts the error code to its underlying integer value.
 /// </summary>
 /// <param name="code">The error code to convert.</param>
 /// <returns>The enum value cast to a 32-bit integer.</returns>
 public static Int32 GetCode(this HtmlParseError code)
 {
     var numericCode = (Int32)code;

     return numericCode;
 }
Ejemplo n.º 10
0
        /// <summary>
        /// Reports a parse error: throws when <c>IsStrictMode</c> is set, otherwise notifies
        /// the subscribers of the <c>Error</c> event, if there are any.
        /// </summary>
        /// <param name="code">The associated error code.</param>
        /// <param name="position">The position of the error.</param>
        /// <exception cref="HtmlParseException">Thrown when <c>IsStrictMode</c> is true.</exception>
        internal void RaiseErrorOccurred(HtmlParseError code, TextPosition position)
        {
            // Take a snapshot of the event's delegate before anything else is evaluated
            var subscribers = Error;

            if (IsStrictMode)
            {
                throw new HtmlParseException(code.GetCode(), "Error while parsing the provided HTML document.", position);
            }
            if (subscribers == null)
            {
                return;
            }

            subscribers(this, new HtmlErrorEvent(code, position));
        }
Ejemplo n.º 11
0
 /// <summary>
 /// Creates a new HtmlErrorEvent that carries the given error code and source
 /// position; the base event is initialized with <c>EventNames.Error</c>.
 /// </summary>
 /// <param name="code">The provided error code.</param>
 /// <param name="position">The position in the source.</param>
 public HtmlErrorEvent(HtmlParseError code, TextPosition position)
     : base(EventNames.Error)
 {
     this._position = position;
     this._code     = code;
 }
Ejemplo n.º 12
0
 /// <summary>
 /// Creates a new HtmlErrorEvent that carries the given error code and source
 /// position; the base event is initialized with <c>EventNames.ParseError</c>.
 /// </summary>
 /// <param name="code">The provided error code.</param>
 /// <param name="position">The position in the source.</param>
 public HtmlErrorEvent(HtmlParseError code, TextPosition position)
     : base(EventNames.ParseError)
 {
     this._position = position;
     this._code = code;
 }
Ejemplo n.º 13
0
            // Parses one attribute: its name (via TryParseAttributeName) and, when the name is
            // followed by '=', its value. The value may be wrapped in single quotes, wrapped
            // in double quotes, or unquoted when it starts with a character accepted by IsLetter.
            //   attribute - receives the parsed attribute, or null when the name or value
            //               could not be parsed.
            // Returns false when TryParseAttributeName fails, when no valid value start
            // follows '=' (parseError = NoAttributeValue), or when the end of the value is
            // never found (parseError = OpeningTagNotClosedAfterAttribute). After a quoted
            // value the result is whatever ReadAndSkipWhitespace returns.
            private bool TryParseAttribute(out HtmlAttribute attribute)
            {
                attribute = null;

                string attributeName;
                bool   hasValue;
                bool   isLast;

                if (!TryParseAttributeName(out attributeName, out hasValue, out isLast))
                {
                    // Invalid attribute, e.g. "<abc attribute"
                    return false;
                }
                if (isLast || !hasValue)
                {
                    // Name-only attribute
                    attribute = new HtmlAttribute(attributeName);
                    return true;
                }

                bool singleQuoted = currentChar == '\'';
                bool doubleQuoted = currentChar == '"';
                bool unquoted     = IsLetter(currentChar);

                if (!(singleQuoted || doubleQuoted || unquoted))
                {
                    // Invalid character after equals, e.g. "<abc attribute=" or "<abc attribute=!"
                    parseError = HtmlParseError.NoAttributeValue;
                    return false;
                }

                // A quoted value starts after the opening quote, an unquoted one right here
                int valueStart = unquoted ? currentIndex : currentIndex + 1;
                int valueEnd   = -1;

                if (singleQuoted || doubleQuoted)
                {
                    char closingQuote = singleQuoted ? '\'' : '"';

                    while (ReadNext())
                    {
                        if (currentChar == closingQuote)
                        {
                            valueEnd = currentIndex - 1;
                            break;
                        }
                    }
                }
                else
                {
                    while (ReadNext())
                    {
                        bool sawWhitespace = char.IsWhiteSpace(currentChar);
                        if (sawWhitespace)
                        {
                            // The value ends right before the whitespace
                            valueEnd = currentIndex - 1;
                            ReadAndSkipWhitespace();
                        }

                        bool atTagEnd = currentChar == '/' || currentChar == '>';
                        if (atTagEnd && valueEnd == -1)
                        {
                            valueEnd = currentIndex - 1;
                        }
                        if (atTagEnd || sawWhitespace)
                        {
                            // Either the tag ends here or another attribute follows
                            break;
                        }
                    }
                }
                if (valueEnd == -1)
                {
                    // The value never ended, e.g. an unterminated quoted value or an
                    // unquoted value that runs to the end of the text
                    parseError = HtmlParseError.OpeningTagNotClosedAfterAttribute;
                    return false;
                }

                int valueLength = valueEnd - valueStart + 1;

                attribute = new HtmlAttribute(attributeName, text.Substring(valueStart, valueLength));
                if (unquoted)
                {
                    return true;
                }
                return ReadAndSkipWhitespace();
            }
Ejemplo n.º 14
0
            // Parses the construct that starts at the current '<' and then keeps parsing what follows it.
            // Three cases are handled, depending on the first non-skipped character after '<':
            //   '/'   - a closing tag (TryParseClosingTag), followed by more content if any is expected;
            //   '!'   - a comment or doctype (TryParseComment), which must be followed by more content;
            //   other - an opening tag: name, optional attributes, optional void marker "/>".
            // For an opening tag a new element is created (an HtmlDocument for a non-void root "html" tag),
            // handed to SetParsing, and parsing continues through TryParseInnerText, which calls back into
            // this method; the two methods are mutually recursive.
            // Returns true on success. The failure branches visible here return false after setting
            // parseError or after a helper reported failure.
            public bool TryParseOpeningTag()
            {
                if (currentChar != '<')
                {
                    // No opening tag, e.g. "abc", "<abc>", "<abc>  " or "<abc>InnerText"
                    parseError = HtmlParseError.NoOpeningTag;
                    return(false);
                }
                ReadAndSkipWhitespace();
                if (currentChar == '/')
                {
                    if (!TryParseClosingTag())
                    {
                        return(false);
                    }
                    if (currentElement != null || currentIndex + 1 < text.Length)
                    {
                        // If the element we just finished parsing has a parent, that parent needs to have a closing tag too, so keep parsing.
                        // If we have more text, then there may be more to parse.
                        return(TryParseInnerText());
                    }
                    // Finished parsing
                    return(true);
                }
                else if (currentChar == '!')
                {
                    if (!TryParseComment())
                    {
                        // Couldn't parse the comment
                        return(false);
                    }
                    if (currentIndex + 1 < text.Length)
                    {
                        // Got more to parse?
                        return(TryParseInnerText());
                    }
                    else
                    {
                        // Doctype or comment on its own, e.g. "<!--comment-->" or "<div><!--comment-->"
                        // Note that LoneDoctype is reported for both the doctype and the comment case.
                        parseError = HtmlParseError.LoneDoctype;
                        return(false);
                    }
                }
                if (!IsLetter(currentChar))
                {
                    // No valid tag, e.g. "<>", "<1"
                    parseError = HtmlParseError.InvalidTag;
                    return(false);
                }

                int  tagStartIndex = currentIndex;
                int  tagEndIndex   = -1;
                bool foundTagEnd   = false;

                // Scan to the end of the tag name: it stops at '/', '>' or the first whitespace character.
                while (ReadNext())
                {
                    foundTagEnd = currentChar == '/' || currentChar == '>';
                    if (foundTagEnd || char.IsWhiteSpace(currentChar))
                    {
                        tagEndIndex = currentIndex - 1;
                        break;
                    }
                }
                if (tagEndIndex == -1)
                {
                    // No end of tag, e.g. "<abc", "<abc "
                    parseError = HtmlParseError.OpeningTagNotClosed;
                    return(false);
                }

                string tag = text.ToAsciiLower(tagStartIndex, tagEndIndex - tagStartIndex + 1);
                HtmlObjectLinkedList <HtmlAttribute> attributes = null;

                // Attributes are only parsed when the name was ended by whitespace rather than by '/' or '>'.
                if (!foundTagEnd && !TryParseAttributes(out attributes))
                {
                    // Could not parse attributes
                    return(false);
                }

                bool isVoid = false;

                if (currentChar == '/')
                {
                    // Void element?
                    ReadAndSkipWhitespace();
                    if (currentChar != '>')
                    {
                        // No end of void tag, e.g. "<abc/", "<abc/a>"
                        parseError = HtmlParseError.NodeNotClosed;
                        return(false);
                    }
                    isVoid = true;
                    if (currentElement != null)
                    {
                        // Read on if this void element is a child
                        ReadAndSkipWhitespace();
                    }
                }
                else
                {
                    ReadAndSkipWhitespace();
                }

                HtmlElement element;

                if (rootElement == null)
                {
                    if (tag == "html" && !isVoid)
                    {
                        // A non-void root "html" tag becomes the document and receives the doctype field's value.
                        element = new HtmlDocument()
                        {
                            Doctype = doctype
                        };
                    }
                    else if (parsingDocument)
                    {
                        // First tag of a document has to be an open html tag
                        parseError = HtmlParseError.FirstElementInDocumentNotHtml;
                        return(false);
                    }
                    else
                    {
                        element = new HtmlElement(tag, isVoid);
                    }
                }
                else
                {
                    element = new HtmlElement(tag, isVoid);
                }
                if (attributes != null)
                {
                    element._attributes = attributes;
                }
                // NOTE(review): SetParsing is defined elsewhere; presumably it attaches the element to the
                // tree and updates rootElement/currentElement - confirm, since the checks below read element.Parent.
                SetParsing(element);

                if (element.IsVoid && element.Parent == null)
                {
                    if (!ReadNext() || (char.IsWhiteSpace(currentChar) && !ReadAndSkipWhitespace()))
                    {
                        // Valid void element without a parent, e.g. "<abc/>", "<abc/>  "
                        return(true);
                    }

                    // Invalid text after a void element without a parent, e.g. "<abc/>a"
                    parseError = HtmlParseError.InvalidTextAfterNode;
                    return(false);
                }
                // The void-without-parent case has already returned above, so this condition is expected to
                // hold whenever it is reached (assuming IsVoid and Parent do not change between the checks).
                if (element.Parent != null || !element.IsVoid)
                {
                    // If the element has a parent, we need to make sure the parent has a closing tag.
                    // If the element has no parent, but is non-void, we also need to make sure the element has a closing tag.
                    if (!TryParseInnerText())
                    {
                        // Couldn't parse inner text, e.g. "<abc>Inner<def></def>"
                        return(false);
                    }
                }

                return(true);
            }
Ejemplo n.º 15
0
 /// <summary>
 /// Creates a new HtmlErrorEvent that carries the given error code and source position.
 /// </summary>
 /// <param name="code">The provided error code.</param>
 /// <param name="position">The position in the source.</param>
 public HtmlErrorEvent(HtmlParseError code, TextPosition position)
 {
     this._position = position;
     this._code     = code;
 }