Пример #1
0
        /// <summary>
        /// Tokenizes a closing tag at <c>context.Index</c> using <c>RxReadCloseAttribute</c>.
        /// </summary>
        /// <remarks>
        /// When the pattern does not match, the rest of the input is emitted as a single text token and
        /// the index is moved to the end of the input. Otherwise a close-element token is emitted whose
        /// <c>A</c> is the captured <c>name</c> group and whose <c>Raw</c> is the full match.
        /// If the captured name contains a '&lt;' (an incorrectly closed tag such as
        /// <c>&lt;/strong&lt;/td&gt;</c>), the name is cut at that character and the index is advanced
        /// only up to it, so the nested tag is tokenized on its own afterwards.
        /// </remarks>
        /// <param name="context">Parser state: input text, current index and the token list being built.</param>
        private static void ReadCloseElement(ParserContext context)
        {
            Match match = RxReadCloseAttribute.Match(context.Html, context.Index);

            if (!match.Success)
            {
                // Nothing that looks like a closing tag: hand the remainder over as text.
                string rest = context.Html.Substring(context.Index);
                context.Tokens.Add(new HtmlParserToken {
                    Type = TokenType.Text, A = context.AdjustForWhitespace(rest), Raw = rest
                });
                context.Index = context.Html.Length;
                return;
            }

            Group nameGroup = match.Groups["name"];
            HtmlParserToken closeToken = new HtmlParserToken {
                Type = TokenType.CloseElement, Raw = match.Value, A = nameGroup.Value
            };
            context.Tokens.Add(closeToken);

            // HACK: there might be a tag inside of a tag (incorrectly closed tag) like </strong</td>.
            // Ordinal char search: markup must not be subject to culture-sensitive matching rules.
            int nestedTagStart = nameGroup.Value.IndexOf('<');
            if (nestedTagStart < 0)
            {
                context.Index += match.Length;
                return;
            }

            // Cut the name at the nested '<' and advance only up to that character. The advance is
            // derived from where the name group actually sits inside the match instead of assuming
            // it starts exactly two characters ("</") into it.
            closeToken.A   = nameGroup.Value.Substring(0, nestedTagStart);
            context.Index += (nameGroup.Index - match.Index) + nestedTagStart;
        }
Пример #2
0
        /// <summary>
        /// Consumes the run of consecutive attribute tokens at the current token position and
        /// sets each one on <paramref name="current"/>.
        /// </summary>
        /// <remarks>
        /// Attribute names are lower-cased, ':' is replaced by '_', and the name <c>xmlns</c> becomes
        /// <c>xmlns_</c>. Names rejected by <c>RxValidAttrName</c> are consumed but not set. The value is
        /// the HTML-decoded <c>B</c> of the token, falling back to <c>A</c>.
        /// </remarks>
        /// <param name="current">Element that receives the attributes.</param>
        private void ReadAttributes(XElement current)
        {
            while (this._index < this._tokens.Count)
            {
                HtmlParserToken attribute = this._tokens[this._index];
                if (attribute.Type != TokenType.Attribute)
                {
                    // First non-attribute token ends the run; leave it for the caller.
                    break;
                }

                this._index++;

                string attrName = attribute.A.ToLowerInvariant().Replace(':', '_');
                if (attrName == "xmlns")
                {
                    attrName = attrName + "_";
                }

                if (!RxValidAttrName.IsMatch(attrName))
                {
                    continue;
                }

                string rawValue = attribute.B ?? attribute.A ?? string.Empty;
                current.SetAttributeValue(attrName, HttpUtility.HtmlDecode(rawValue));
            }
        }
Пример #3
0
        /// <summary>
        /// Walks the remaining tokens from the current token position and builds the element tree
        /// under <c>this._doc.Root</c>.
        /// </summary>
        /// <remarks>
        /// A stack tracks the currently open elements; new nodes are added to the element on top of the
        /// stack, or to the document root when the stack is empty. Token types not handled below
        /// are consumed and ignored.
        /// </remarks>
        private void Assemble()
        {
            Stack <XElement> openElements = new Stack <XElement>();
            Func <XElement>  parentNode   = () => openElements.Count > 0 ? openElements.Peek() : this._doc.Root;

            while (this._index < this._tokens.Count)
            {
                HtmlParserToken token = this._tokens[this._index];
                this._index++;

                TokenType kind = token.Type;

                if (kind == TokenType.Element)
                {
                    string tagName = this.SanitizeElementName(token.A);
                    if (SiblingOnly.Contains(tagName))
                    {
                        this.CloseElement(openElements, tagName);
                    }

                    XElement element;
                    if (name_IsHtml(tagName))
                    {
                        // An "html" tag does not create a node; it reuses the current parent.
                        element = parentNode();
                    }
                    else
                    {
                        element = new XElement(tagName);
                        parentNode().Add(element);
                    }

                    this.ReadAttributes(element);
                    if (!SelfClosing.Contains(tagName))
                    {
                        openElements.Push(element);
                    }
                }
                else if (kind == TokenType.CloseElement)
                {
                    this.CloseElement(openElements, this.SanitizeElementName(token.A));
                }
                else if (kind == TokenType.Comment)
                {
                    parentNode().Add(new XComment(token.A));
                }
                else if (kind == TokenType.Cdata)
                {
                    parentNode().Add(new XCData(token.A));
                }
                else if (kind == TokenType.Text)
                {
                    XElement owner     = parentNode();
                    string   ownerName = owner.Name.LocalName;

                    // Inside textarea/pre the raw text is used instead of the token's A value.
                    bool useRawText =
                        ownerName.Equals("textarea", StringComparison.InvariantCultureIgnoreCase) ||
                        ownerName.Equals("pre", StringComparison.InvariantCultureIgnoreCase);

                    owner.Add(new XText(useRawText ? token.Raw : token.A));
                }
            }
        }

        // True when the sanitized tag name is exactly "html".
        private static bool name_IsHtml(string tagName)
        {
            return tagName == "html";
        }
Пример #4
0
        /// <summary>
        /// Tokenizes an opening tag that starts at <c>context.Index</c>: emits one element token, then
        /// one attribute token per attribute matched by <c>RxReadAttribute</c>.
        /// </summary>
        /// <remarks>
        /// The character at the starting index is skipped without being inspected (presumably the
        /// opening '&lt;' - confirm against the caller). The tag name is whatever <c>RxReadTagName</c>
        /// matches next; a name of "script" (case-insensitive) sets <c>context.InScriptTag</c>.
        /// Scanning stops after "/&gt;" or '&gt;' (both consumed), in front of a '&lt;' (left in place),
        /// or at the end of the input. The element token's <c>Raw</c> is the text between the starting
        /// index and the final index.
        /// </remarks>
        /// <param name="context">Parser state: input text, current index and the token list being built.</param>
        private static void ReadElement(ParserContext context)
        {
            int start = context.Index;

            context.Index++;
            Match match = RxReadTagName.Match(context.Html, context.Index);

            context.Index += match.Length;
            HtmlParserToken elementToken = new HtmlParserToken {
                Type = TokenType.Element, A = match.Value
            };

            if (match.Value.ToLowerInvariant() == "script")
            {
                context.InScriptTag = true;
            }

            context.Tokens.Add(elementToken);

            while (!context.EndOfString)
            {
                // read whitespace before an attribute name
                SkipWhiteSpace(context);

                // Skipping whitespace may have consumed the rest of the input (e.g. "<div   ").
                // Stop here: reading the current character or advancing further would push the
                // index past the end and make the Substring call below go out of range.
                if (context.EndOfString)
                {
                    break;
                }

                if (RxReadAttribute.IsMatch(context.CharAtIndex.ToString()))
                {
                    Match           attrMatch = RxReadAttribute.Match(context.Html, context.Index);
                    HtmlParserToken token     = new HtmlParserToken {
                        Type = TokenType.Attribute, A = attrMatch.Groups["name"].Value, Raw = attrMatch.Value
                    };
                    Group valgrp = attrMatch.Groups["value"];
                    if (valgrp.Success)
                    {
                        // B stays unset for attributes without a value.
                        token.B = valgrp.Value;
                    }

                    context.Tokens.Add(token);
                    context.Index += attrMatch.Length;
                }
                else
                {
                    // Self-closing tag end; compared in place to avoid allocating a substring per pass.
                    if (context.Index < context.Html.Length - 1 &&
                        string.CompareOrdinal(context.Html, context.Index, "/>", 0, 2) == 0)
                    {
                        context.Index += 2;
                        break;
                    }

                    char ch = context.CharAtIndex;
                    if (ch != '<')
                    {
                        // Consume '>' and any character that cannot start an attribute;
                        // a '<' is left in place so the next tag is read from its start.
                        context.Index++;
                    }

                    if (ch == '>' || ch == '<')
                    {
                        break;
                    }
                }
            }

            elementToken.Raw = context.Html.Substring(start, context.Index - start);
        }