/// <summary>
/// Creates a builder over <paramref name="tokens"/> and prepares the target document:
/// an XML declaration, the page's DOCTYPE (when one is present and parseable) and an
/// empty <c>html</c> root element.
/// </summary>
/// <param name="tokens">Parser tokens the document will later be assembled from.</param>
private DocumentBuilder(List<HtmlParserToken> tokens)
{
    _tokens = tokens;

    // Use the raw text of the first DOCTYPE token, or nothing when the page has none.
    HtmlParserToken doctypeToken = tokens.FirstOrDefault(t => t.Type == TokenType.DocTypeDeclaration);
    string doctype = doctypeToken != null ? doctypeToken.Raw : string.Empty;

    try
    {
        _doc = XDocument.Parse(string.Format("<?xml version=\"1.0\"?>{0}<html />", doctype));
    }
    catch (XmlException)
    {
        // System.Xml.Linq.XDocument throws an XmlException if it encounters a DOCTYPE it
        // can't parse. If this occurs, do not use the DOCTYPE from the page.
        _doc = XDocument.Parse("<?xml version=\"1.0\"?><html />");
    }

    XDocumentType documentType = _doc.DocumentType;
    if (documentType != null)
    {
        // The internal subset is cleared on every compiler except the Mono C# compiler.
#if !__MonoCS__
        documentType.InternalSubset = null;
#endif
    }
}
/// <summary>
/// Tokenizes a closing tag at <c>context.Index</c>. When the closing-tag pattern matches,
/// a CloseElement token is emitted and the index is advanced; otherwise the whole
/// remainder of the input is emitted as a single Text token.
/// </summary>
/// <param name="context">Parser state holding the HTML text, the read position and the token list.</param>
private static void ReadCloseElement(ParserContext context)
{
    var closeMatch = RxReadCloseAttribute.Match(context.Html, context.Index);
    if (closeMatch.Success)
    {
        var closeToken = new HtmlParserToken { Type = TokenType.CloseElement, Raw = closeMatch.Value, A = closeMatch.Groups["name"].Value };
        context.Tokens.Add(closeToken);

        int advance;
        if (closeToken.A.IndexOf("<") >= 0)
        {
            // HACK: there might be a tag inside of a tag (incorrectly closed tag) like </strong</td>.
            // Locate the nested '<' in the match after dropping its first two characters
            // (presumably the "</" prefix - confirm against the pattern), cut the name there,
            // and only advance up to that '<' so it is read again as its own tag.
            var nestedAt = closeMatch.Value.Substring(2).IndexOf("<");
            closeToken.A = closeToken.A.Substring(0, nestedAt);
            advance = nestedAt + 2;
        }
        else
        {
            advance = closeMatch.Length;
        }

        context.Index += advance;
    }
    else
    {
        // Not a recognizable closing tag: treat everything that is left as text.
        var remainder = context.Html.Substring(context.Index);
        context.Tokens.Add(new HtmlParserToken { Type = TokenType.Text, A = context.AdjustForWhitespace(remainder), Raw = remainder });
        context.Index = context.Html.Length;
    }
}
/// <summary>
/// Tokenizes an opening tag that begins at <c>context.Index</c>: emits one Element token
/// for the tag name followed by one Attribute token per attribute. Reading stops after
/// a '>' or "/>", on a '&lt;' (which is left unconsumed), or at the end of the input.
/// </summary>
/// <param name="context">Parser state holding the HTML text, the read position and the token list.</param>
private static void ReadElement(ParserContext context)
{
    var tagStart = context.Index;

    // Step over the first character of the tag (presumably the opening '<' - confirm against the caller).
    context.Index++;

    var nameMatch = RxReadTagName.Match(context.Html, context.Index);
    context.Index += nameMatch.Length;

    var element = new HtmlParserToken { Type = TokenType.Element, A = nameMatch.Value };
    if (nameMatch.Value.ToLowerInvariant() == "script")
    {
        // Record on the context that a script tag has been opened.
        context.InScriptTag = true;
    }

    context.Tokens.Add(element);

    while (!context.EndOfString)
    {
        // read whitespace before an attribute name
        SkipWhiteSpace(context);

        bool atAttribute = !context.EndOfString && RxReadAttribute.IsMatch(context.CharAtIndex.ToString());
        if (!atAttribute)
        {
            bool selfClosing = context.Index < context.Html.Length - 1
                && string.CompareOrdinal(context.Html, context.Index, "/>", 0, 2) == 0;
            if (selfClosing)
            {
                context.Index += 2;
                break;
            }

            var current = context.CharAtIndex;
            if (current == '<')
            {
                // A new tag starts here; leave the '<' in place for the next read.
                break;
            }

            context.Index++;
            if (current == '>')
            {
                break;
            }

            // Any other character is skipped and scanning continues.
            continue;
        }

        var attribute = RxReadAttribute.Match(context.Html, context.Index);
        var attributeToken = new HtmlParserToken { Type = TokenType.Attribute, A = attribute.Groups["name"].Value, Raw = attribute.Value };

        // B is only assigned when the attribute actually carries a value.
        var valueGroup = attribute.Groups["value"];
        if (valueGroup.Success)
        {
            attributeToken.B = valueGroup.Value;
        }

        context.Tokens.Add(attributeToken);
        context.Index += attribute.Length;
    }

    // The raw text of the element spans everything consumed since the tag started.
    element.Raw = context.Html.Substring(tagStart, context.Index - tagStart);
}
/// <summary>
/// Consumes the consecutive Attribute tokens at the current token position and sets each
/// one with a valid name on <paramref name="current"/>.
/// </summary>
/// <param name="current">Element that receives the attributes.</param>
private void ReadAttributes(XElement current)
{
    while (_index < _tokens.Count)
    {
        if (_tokens[_index].Type != TokenType.Attribute)
        {
            // First non-attribute token ends the run; it is left for the caller.
            break;
        }

        HtmlParserToken attribute = _tokens[_index];
        _index++;

        // Names are lower-cased and ':' is replaced by '_'.
        string attributeName = attribute.A.ToLowerInvariant().Replace(':', '_');
        if (attributeName == "xmlns")
        {
            // A bare "xmlns" is renamed (presumably so it is not treated as a namespace declaration - confirm).
            attributeName += "_";
        }

        if (!RxValidAttrName.IsMatch(attributeName))
        {
            // Attributes whose name fails validation are consumed but not written.
            continue;
        }

        // A value-less attribute falls back to its own name as the value.
        string encodedValue = attribute.B ?? attribute.A ?? string.Empty;
        current.SetAttributeValue(attributeName, HttpUtility.HtmlDecode(encodedValue));
    }
}
/// <summary>
/// Reads a closing tag at <c>context.Index</c> and appends a CloseElement token for it.
/// If the closing-tag pattern does not match, the rest of the input becomes one Text token.
/// </summary>
/// <param name="context">Parser state holding the HTML text, the read position and the token list.</param>
private static void ReadCloseElement(ParserContext context)
{
    var closing = RxReadCloseAttribute.Match(context.Html, context.Index);

    if (!closing.Success)
    {
        // No closing tag here: emit the remaining input as text and finish the input.
        var rest = context.Html.Substring(context.Index);
        context.Tokens.Add(new HtmlParserToken { Type = TokenType.Text, A = context.AdjustForWhitespace(rest), Raw = rest });
        context.Index = context.Html.Length;
        return;
    }

    var token = new HtmlParserToken { Type = TokenType.CloseElement, Raw = closing.Value, A = closing.Groups["name"].Value };
    context.Tokens.Add(token);

    var hasNestedTag = token.A.IndexOf("<") > -1;
    if (!hasNestedTag)
    {
        context.Index += closing.Length;
        return;
    }

    // HACK: there might be a tag inside of a tag (incorrectly closed tag) like </strong</td>.
    // The offset of the nested '<' is measured after skipping the first two characters of
    // the match (presumably "</" - confirm against the pattern); the name is truncated to
    // that length and the index stops at the nested '<' instead of after the whole match.
    var cut = closing.Value.Substring(2).IndexOf("<");
    token.A = token.A.Substring(0, cut);
    context.Index += cut + 2;
}
/// <summary>
/// Reads an opening tag starting at <c>context.Index</c>. Adds an Element token carrying
/// the tag name, then an Attribute token for every attribute found, and finally stores
/// the consumed source text in the Element token's <c>Raw</c>.
/// </summary>
/// <param name="context">Parser state holding the HTML text, the read position and the token list.</param>
private static void ReadElement(ParserContext context)
{
    var elementStart = context.Index;

    // Skip one character before the tag name (presumably the '<' - confirm against the caller).
    context.Index++;
    var tagName = RxReadTagName.Match(context.Html, context.Index);
    context.Index += tagName.Length;

    var elementToken = new HtmlParserToken { Type = TokenType.Element, A = tagName.Value };
    if (tagName.Value.ToLowerInvariant() == "script")
    {
        context.InScriptTag = true;
    }
    context.Tokens.Add(elementToken);

    var reachedTagEnd = false;
    while (!reachedTagEnd && !context.EndOfString)
    {
        // read whitespace before an attribute name
        SkipWhiteSpace(context);

        if (!context.EndOfString && RxReadAttribute.IsMatch(context.CharAtIndex.ToString()))
        {
            var attributeMatch = RxReadAttribute.Match(context.Html, context.Index);
            var attributeToken = new HtmlParserToken { Type = TokenType.Attribute, A = attributeMatch.Groups["name"].Value, Raw = attributeMatch.Value };

            var valueGroup = attributeMatch.Groups["value"];
            if (valueGroup.Success)
            {
                attributeToken.B = valueGroup.Value;
            }

            context.Tokens.Add(attributeToken);
            context.Index += attributeMatch.Length;
        }
        else if (context.Index < context.Html.Length - 1 && context.Html.Substring(context.Index, 2) == "/>")
        {
            // Self-closing terminator: consume both characters and stop.
            context.Index += 2;
            reachedTagEnd = true;
        }
        else
        {
            // '>' is consumed and ends the tag; '<' ends the tag but is left unconsumed;
            // anything else is skipped.
            var next = context.CharAtIndex;
            if (next != '<')
            {
                context.Index++;
            }
            reachedTagEnd = next == '>' || next == '<';
        }
    }

    elementToken.Raw = context.Html.Substring(elementStart, context.Index - elementStart);
}
/// <summary>
/// Walks the remaining tokens and builds the element tree under the document root.
/// A stack tracks the currently open elements; new nodes are added to the element on
/// top of the stack, or to the document root when the stack is empty.
/// </summary>
private void Assemble()
{
    Stack<XElement> openElements = new Stack<XElement>();
    Func<XElement> currentParent = () => openElements.Count > 0 ? openElements.Peek() : _doc.Root;

    while (_index < _tokens.Count)
    {
        HtmlParserToken token = _tokens[_index];
        _index++;

        TokenType kind = token.Type;
        if (kind == TokenType.Element)
        {
            string elementName = SanitizeElementName(token.A);

            // Names listed in SiblingOnly trigger a close of that name before the new element is opened.
            if (SiblingOnly.Contains(elementName))
            {
                CloseElement(openElements, elementName);
            }

            XElement element;
            if (elementName != "html")
            {
                element = new XElement(elementName);
                currentParent().Add(element);
            }
            else
            {
                // An "html" tag reuses the current parent instead of creating a new element.
                element = currentParent();
            }

            // Consumes the Attribute tokens that follow this element token.
            ReadAttributes(element);

            // Names listed in SelfClosing never become a parent for later nodes.
            if (!SelfClosing.Contains(elementName))
            {
                openElements.Push(element);
            }
        }
        else if (kind == TokenType.CloseElement)
        {
            CloseElement(openElements, SanitizeElementName(token.A));
        }
        else if (kind == TokenType.Comment)
        {
            currentParent().Add(new XComment(token.A));
        }
        else if (kind == TokenType.Cdata)
        {
            currentParent().Add(new XCData(token.A));
        }
        else if (kind == TokenType.Text)
        {
            XElement parent = currentParent();
            string parentName = parent.Name.LocalName;

            // Inside textarea and pre the raw text is used; elsewhere the token's A value is used.
            bool useRawText = parentName.Equals("textarea", StringComparison.InvariantCultureIgnoreCase)
                || parentName.Equals("pre", StringComparison.InvariantCultureIgnoreCase);
            parent.Add(new XText(useRawText ? token.Raw : token.A));
        }
        // Tokens of any other type are ignored.
    }
}