Ejemplo n.º 1
0
        /// <summary>
        /// Stores the token list and seeds the backing XML document with an empty <c>html</c> root,
        /// prefixed by the raw text of the first DOCTYPE declaration token when one is present.
        /// </summary>
        /// <param name="tokens">Parsed tokens; searched for the first DOCTYPE declaration token.</param>
        private DocumentBuilder(List <HtmlParserToken> tokens)
        {
            _tokens = tokens;

            // An absent DOCTYPE token simply contributes nothing to the prolog.
            HtmlParserToken declaration = tokens.FirstOrDefault(t => t.Type == TokenType.DocTypeDeclaration);
            string          prolog      = declaration == null ? string.Empty : declaration.Raw;

            try
            {
                _doc = XDocument.Parse(string.Format("<?xml version=\"1.0\"?>{0}<html />", prolog));
            }
            catch (XmlException)
            {
                // XDocument rejects DOCTYPEs it cannot parse; in that case the page's DOCTYPE is
                // dropped and the document is built from the bare skeleton instead.
                _doc = XDocument.Parse("<?xml version=\"1.0\"?><html />");
            }

            if (_doc.DocumentType != null)
            {
                // The internal subset is cleared everywhere except under the Mono C# compiler.
#if !__MonoCS__
                _doc.DocumentType.InternalSubset = null;
#endif
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Reads a closing tag at the current parser position and appends the resulting token to
        /// <c>context.Tokens</c>.
        /// </summary>
        /// <remarks>
        /// When no closing tag can be matched, the rest of the input is emitted as one text token and
        /// the position is moved to the end of the input. When the matched tag name itself contains a
        /// '&lt;' (an unterminated closing tag such as <c>&lt;/strong&lt;/td&gt;</c>), the name is cut
        /// at that character and the position is advanced only up to it, so the nested tag is read on
        /// the next pass.
        /// </remarks>
        /// <param name="context">Parser state providing the input, the current index and the token list.</param>
        private static void ReadCloseElement(ParserContext context)
        {
            var match = RxReadCloseAttribute.Match(context.Html, context.Index);

            if (!match.Success)
            {
                var str = context.Html.Substring(context.Index);
                context.Tokens.Add(new HtmlParserToken {
                    Type = TokenType.Text, A = context.AdjustForWhitespace(str), Raw = str
                });
                context.Index = context.Html.Length;
                return;
            }

            var newToken = new HtmlParserToken {
                Type = TokenType.CloseElement, Raw = match.Value, A = match.Groups["name"].Value
            };
            context.Tokens.Add(newToken);

            // Ordinal character search: a markup delimiter must never be matched linguistically.
            if (newToken.A.IndexOf('<') < 0)
            {
                context.Index += match.Length;
                return;
            }

            // HACK there might be a tag inside of a tag (incorrectly closed tag) like </strong</td>.
            // Locate the nested '<' in the matched text, skipping its first two characters
            // (presumably the leading "</" - the pattern is defined elsewhere), and adjust.
            var nestedAt = match.Value.IndexOf('<', 2);
            newToken.A     = newToken.A.Substring(0, nestedAt - 2);
            context.Index += nestedAt;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Reads an opening tag starting at the current parser position: emits an element token for
        /// the tag name followed by one attribute token per attribute, and advances the position past
        /// the tag.
        /// </summary>
        /// <remarks>
        /// Reading stops after a closing <c>&gt;</c> or <c>/&gt;</c>, in front of a <c>&lt;</c> (which
        /// is left unconsumed), or when the input runs out. The element token's <c>Raw</c> is set to
        /// the full consumed text once the end of the tag is known.
        /// </remarks>
        /// <param name="context">Parser state providing the input, the current index and the token list.</param>
        private static void ReadElement(ParserContext context)
        {
            var start = context.Index;

            // Step over the first character of the tag before matching the tag name.
            context.Index++;
            var match = RxReadTagName.Match(context.Html, context.Index);

            context.Index += match.Length;
            var elementToken = new HtmlParserToken {
                Type = TokenType.Element, A = match.Value
            };

            if (match.Value.ToLowerInvariant() == "script")
            {
                context.InScriptTag = true;
            }
            context.Tokens.Add(elementToken);

            while (!context.EndOfString)
            {
                // read whitespace before an attribute name
                SkipWhiteSpace(context);

                // Skipping whitespace can consume the rest of the input (an unterminated tag that
                // ends in whitespace). Stop here instead of reading CharAtIndex past the end.
                if (context.EndOfString)
                {
                    break;
                }

                if (RxReadAttribute.IsMatch(context.CharAtIndex.ToString()))
                {
                    var attrMatch = RxReadAttribute.Match(context.Html, context.Index);
                    var token     = new HtmlParserToken {
                        Type = TokenType.Attribute, A = attrMatch.Groups["name"].Value, Raw = attrMatch.Value
                    };
                    var valgrp = attrMatch.Groups["value"];
                    if (valgrp.Success)
                    {
                        token.B = valgrp.Value;
                    }
                    context.Tokens.Add(token);
                    context.Index += attrMatch.Length;
                    continue;
                }

                // Self-closing terminator: consume both characters and finish.
                if (context.Index < context.Html.Length - 1 &&
                    string.CompareOrdinal(context.Html, context.Index, "/>", 0, 2) == 0)
                {
                    context.Index += 2;
                    break;
                }

                var ch = context.CharAtIndex;
                if (ch != '<')
                {
                    // Anything except '<' is consumed; a '<' is left in place for the caller.
                    context.Index++;
                }
                if (ch == '>' || ch == '<')
                {
                    break;
                }
            }

            elementToken.Raw = context.Html.Substring(start, context.Index - start);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Consumes the run of attribute tokens at the current token index and applies each one to
        /// <paramref name="current"/>.
        /// </summary>
        /// <remarks>
        /// Names are lower-cased, ':' is replaced with '_', and a bare <c>xmlns</c> becomes
        /// <c>xmlns_</c>. Attributes whose resulting name does not satisfy <c>RxValidAttrName</c> are
        /// skipped. The value is the HTML-decoded token value, falling back to the attribute's
        /// original name when the token carries no value.
        /// </remarks>
        /// <param name="current">Element that receives the attributes.</param>
        private void ReadAttributes(XElement current)
        {
            while (true)
            {
                if (this._index >= this._tokens.Count)
                {
                    return;
                }

                HtmlParserToken attribute = this._tokens[this._index];
                if (attribute.Type != TokenType.Attribute)
                {
                    // First non-attribute token ends the run and is left for the caller.
                    return;
                }

                this._index++;

                string attrName = attribute.A.ToLowerInvariant().Replace(':', '_');
                if (attrName == "xmlns")
                {
                    attrName += "_";
                }

                if (!RxValidAttrName.IsMatch(attrName))
                {
                    continue;
                }

                string rawValue = attribute.B ?? attribute.A ?? string.Empty;
                current.SetAttributeValue(attrName, HttpUtility.HtmlDecode(rawValue));
            }
        }
Ejemplo n.º 5
0
		/// <summary>
		/// Reads a closing tag at the current parser position and appends the resulting token to
		/// <c>context.Tokens</c>.
		/// </summary>
		/// <remarks>
		/// When no closing tag can be matched, the rest of the input is emitted as one text token and
		/// the position is moved to the end of the input. When the matched tag name itself contains a
		/// '&lt;' (an unterminated closing tag such as <c>&lt;/strong&lt;/td&gt;</c>), the name is cut
		/// at that character and the position is advanced only up to it, so the nested tag is read on
		/// the next pass.
		/// </remarks>
		/// <param name="context">Parser state providing the input, the current index and the token list.</param>
		private static void ReadCloseElement(ParserContext context)
		{
			var match = RxReadCloseAttribute.Match(context.Html, context.Index);
			if(!match.Success)
			{
				var str = context.Html.Substring(context.Index);
				context.Tokens.Add(new HtmlParserToken { Type = TokenType.Text, A = context.AdjustForWhitespace(str), Raw = str });
				context.Index = context.Html.Length;
				return;
			}

			var newToken = new HtmlParserToken { Type = TokenType.CloseElement, Raw = match.Value, A = match.Groups["name"].Value };
			context.Tokens.Add(newToken);

			// Ordinal character search: a markup delimiter must never be matched linguistically.
			if(newToken.A.IndexOf('<') < 0)
			{
				context.Index += match.Length;
				return;
			}

			// HACK there might be a tag inside of a tag (incorrectly closed tag) like </strong</td>.
			// Locate the nested '<' in the matched text, skipping its first two characters
			// (presumably the leading "</" - the pattern is defined elsewhere), and adjust.
			var nestedAt = match.Value.IndexOf('<', 2);
			newToken.A = newToken.A.Substring(0, nestedAt - 2);
			context.Index += nestedAt;
		}
Ejemplo n.º 6
0
		/// <summary>
		/// Reads an opening tag starting at the current parser position: emits an element token for
		/// the tag name followed by one attribute token per attribute, and advances the position past
		/// the tag.
		/// </summary>
		/// <remarks>
		/// Reading stops after a closing <c>&gt;</c> or <c>/&gt;</c>, in front of a <c>&lt;</c> (which
		/// is left unconsumed), or when the input runs out. The element token's <c>Raw</c> is set to
		/// the full consumed text once the end of the tag is known.
		/// </remarks>
		/// <param name="context">Parser state providing the input, the current index and the token list.</param>
		private static void ReadElement(ParserContext context)
		{
			var start = context.Index;
			// Step over the first character of the tag before matching the tag name.
			context.Index++;
			var match = RxReadTagName.Match(context.Html, context.Index);
			context.Index += match.Length;
			var elementToken = new HtmlParserToken { Type = TokenType.Element, A = match.Value };
			if(match.Value.ToLowerInvariant() == "script")
				context.InScriptTag = true;
			context.Tokens.Add(elementToken);

			while(!context.EndOfString)
			{
				// read whitespace before an attribute name
				SkipWhiteSpace(context);

				// Skipping whitespace can consume the rest of the input (an unterminated tag that
				// ends in whitespace). Stop here instead of reading CharAtIndex past the end.
				if(context.EndOfString)
					break;

				if(RxReadAttribute.IsMatch(context.CharAtIndex.ToString()))
				{
					var attrMatch = RxReadAttribute.Match(context.Html, context.Index);
					var token = new HtmlParserToken { Type = TokenType.Attribute, A = attrMatch.Groups["name"].Value, Raw = attrMatch.Value };
					var valgrp = attrMatch.Groups["value"];
					if(valgrp.Success)
						token.B = valgrp.Value;
					context.Tokens.Add(token);
					context.Index += attrMatch.Length;
					continue;
				}

				// Self-closing terminator: consume both characters and finish.
				if(context.Index < context.Html.Length - 1 && string.CompareOrdinal(context.Html, context.Index, "/>", 0, 2) == 0)
				{
					context.Index += 2;
					break;
				}

				var ch = context.CharAtIndex;
				// Anything except '<' is consumed; a '<' is left in place for the caller.
				if(ch != '<')
					context.Index++;
				if(ch == '>' || ch == '<')
					break;
			}

			elementToken.Raw = context.Html.Substring(start, context.Index - start);
		}
Ejemplo n.º 7
0
        /// <summary>
        /// Walks the remaining tokens from the current token index and builds the element tree under
        /// the document root.
        /// </summary>
        /// <remarks>
        /// A stack tracks the currently open elements; new nodes are added to the element on top of
        /// the stack, or to the document root when the stack is empty. Token types other than
        /// element, close-element, comment, CDATA and text are ignored.
        /// </remarks>
        private void Assemble()
        {
            Stack <XElement> open          = new Stack <XElement>();
            Func <XElement>  currentParent = () => open.Count == 0 ? this._doc.Root : open.Peek();

            while (this._index < this._tokens.Count)
            {
                HtmlParserToken token = this._tokens[this._index++];
                TokenType       kind  = token.Type;

                if (kind == TokenType.Element)
                {
                    string name = this.SanitizeElementName(token.A);

                    // Names listed in SiblingOnly trigger a close for that name before opening
                    // (presumably ending a still-open element of the same name).
                    if (SiblingOnly.Contains(name))
                    {
                        this.CloseElement(open, name);
                    }

                    XElement element;
                    if (name == "html")
                    {
                        // An <html> tag never creates a node; it maps onto the current parent.
                        element = currentParent();
                    }
                    else
                    {
                        element = new XElement(name);
                        currentParent().Add(element);
                    }

                    this.ReadAttributes(element);

                    // Names listed in SelfClosing are not pushed, so they never become a parent.
                    if (!SelfClosing.Contains(name))
                    {
                        open.Push(element);
                    }
                }
                else if (kind == TokenType.CloseElement)
                {
                    this.CloseElement(open, this.SanitizeElementName(token.A));
                }
                else if (kind == TokenType.Comment)
                {
                    currentParent().Add(new XComment(token.A));
                }
                else if (kind == TokenType.Cdata)
                {
                    currentParent().Add(new XCData(token.A));
                }
                else if (kind == TokenType.Text)
                {
                    XElement parent    = currentParent();
                    string   parentTag = parent.Name.LocalName;

                    // Inside <textarea> and <pre> the raw text is used instead of token.A.
                    bool keepRaw =
                        parentTag.Equals("textarea", StringComparison.InvariantCultureIgnoreCase) ||
                        parentTag.Equals("pre", StringComparison.InvariantCultureIgnoreCase);

                    parent.Add(new XText(keepRaw ? token.Raw : token.A));
                }
            }
        }