Пример #1
0
        /// <summary>
        /// Parses a string of HTML into a collection of element and text nodes.
        /// </summary>
        /// <remarks>
        /// The input is run through <c>PreprocessScript</c> (for "script" and "style"),
        /// <c>RemoveComments</c> and <c>RemoveSGMLComments</c>, then split by <c>GetTokens</c>.
        /// The token list is walked once: a "&lt;" token starts an <c>XhtmlSection</c> with its
        /// attributes, a "&lt;/" token looks up a node index via <c>FindTagOpenNodeIndex</c> and,
        /// when one is found, hands the nodes after it to <c>MoveNodesDown</c>; any other token
        /// (except a stray "&gt;") becomes an <c>XhtmlText</c> node.
        /// </remarks>
        /// <param name="html">The HTML to be parsed.</param>
        /// <param name="isRemoveEmptyElementText">When true, every text token is passed through <c>RemoveWhitespace</c> and text that ends up empty is not added to the result. This avoids the many whitespace-only text nodes (carriage returns etc.) that a pure DOM extraction would otherwise contain.</param>
        /// <returns>The collection of nodes built from the tokens.</returns>
        public static XhtmlTCollection <XhtmlElement> Parse(string html, bool isRemoveEmptyElementText)
        {
            XhtmlTCollection <XhtmlElement> result = new XhtmlTCollection <XhtmlElement>();

            // Pre-processing passes, applied in this order before tokenizing.
            html = PreprocessScript(html, "script");
            html = PreprocessScript(html, "style");
            html = RemoveComments(html);
            html = RemoveSGMLComments(html);

            StringCollection tokens = GetTokens(html);

            // Section created by the most recent open tag. It is reset to null after a
            // self-closing tag or after any close tag, and is passed as the first
            // constructor argument of the text nodes that follow it.
            XhtmlSection current = null;
            int pos = 0;

            while (pos < tokens.Count)
            {
                string token = tokens[pos];

                if (token == ">")
                {
                    // Stray tag terminator outside of a tag: skip it.
                    pos++;
                    continue;
                }

                if (token == "<")
                {
                    // Open tag: the next token is the tag name.
                    pos++;
                    if (pos >= tokens.Count)
                    {
                        // Input ended right after "<"; nothing more to read.
                        return result;
                    }

                    string tagName = tokens[pos];
                    pos++;
                    current = new XhtmlSection(tagName);

                    // Attributes run until the tag terminator (">" or "/>") or end of input.
                    while (pos < tokens.Count && tokens[pos] != ">" && tokens[pos] != "/>")
                    {
                        string attributeName = tokens[pos];
                        pos++;

                        bool hasFollowingToken = pos < tokens.Count;
                        if (hasFollowingToken && tokens[pos] == "=")
                        {
                            // name = value; the value is null when input ends after "=".
                            pos++;
                            string rawValue = pos < tokens.Count ? tokens[pos] : null;
                            pos++;

                            XhtmlAttribute valued = new XhtmlAttribute(attributeName, XhtmlEncoder.Decode(rawValue));
                            current.Attributes.Add(valued);
                        }
                        else if (hasFollowingToken)
                        {
                            // Attribute without a value. Note that it is only recorded
                            // when at least one more token follows the name.
                            XhtmlAttribute valueless = new XhtmlAttribute(attributeName, null);
                            current.Attributes.Add(valueless);
                        }
                    }

                    result.Add(current);

                    if (pos < tokens.Count)
                    {
                        string terminator = tokens[pos];
                        if (terminator == "/>")
                        {
                            // Self-closing tag: mark it terminated and forget it.
                            current.IsTerminated = true;
                            pos++;
                            current = null;
                        }
                        else if (terminator == ">")
                        {
                            pos++;
                        }
                    }

                    continue;
                }

                if (token == "</")
                {
                    // Close tag: the next token is the tag name.
                    pos++;
                    if (pos >= tokens.Count)
                    {
                        // Input ended right after "</"; nothing more to read.
                        return result;
                    }

                    string closingName = tokens[pos];
                    pos++;

                    // A close tag for which no index is found (-1) is silently ignored.
                    int openAt = FindTagOpenNodeIndex(result, closingName);
                    if (openAt != -1)
                    {
                        MoveNodesDown(ref result, openAt + 1, (XhtmlElement)result[openAt]);
                    }

                    // Discard everything up to and including the closing ">".
                    while (pos < tokens.Count && tokens[pos] != ">")
                    {
                        pos++;
                    }
                    if (pos < tokens.Count && tokens[pos] == ">")
                    {
                        pos++;
                    }

                    current = null;
                    continue;
                }

                // Anything else is text content.
                string text = token;
                if (isRemoveEmptyElementText)
                {
                    text = RemoveWhitespace(text);
                }
                text = DecodeScript(text);

                if (!isRemoveEmptyElementText || text.Length != 0)
                {
                    // Entity decoding is skipped when the current section has NoEscaping set.
                    bool keepRaw = current != null && current.NoEscaping;
                    if (!keepRaw)
                    {
                        text = XhtmlEncoder.Decode(text);
                    }

                    XhtmlText textNode = new XhtmlText(current, text);
                    result.Add(textNode);
                }

                pos++;
            }

            return result;
        }
Пример #2
0
        /// <summary>
        /// Creates a text node.
        /// </summary>
        /// <param name="text">The text of the text node.</param>
        /// <returns>A new <c>XhtmlText</c> constructed from this instance and <paramref name="text"/>.</returns>
        public XhtmlText CreateXhtmlText(string text)
        {
            // This instance is passed as the first constructor argument of the new node.
            return new XhtmlText(this, text);
        }