예제 #1
0
        /// <summary>
        /// Removes every substring matching the tag pattern from the given HTML and
        /// decodes what is left with <c>HtmlEncoder.DecodeValue</c>.
        /// </summary>
        /// <param name="html">The HTML text to strip.</param>
        /// <param name="trimWhiteSpace">
        /// When true, the decoded text is additionally passed through <c>TrimWhiteSpace</c>.
        /// </param>
        /// <returns>The decoded text with all pattern matches removed.</returns>
        public static string ExtractTextFromHtml(string html, bool trimWhiteSpace)
        {
            const RegexOptions options = RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline | RegexOptions.IgnoreCase;

            // Matches "<", any number of "!", then everything up to the next ">" that contains no angle bracket
            string withoutTags = Regex.Replace(html, "<!*[^<>]*>", string.Empty, options);
            string decoded     = HtmlEncoder.DecodeValue(withoutTags);

            return trimWhiteSpace ? TrimWhiteSpace(decoded) : decoded;
        }
예제 #2
0
        /// <summary>
        /// Parses a string containing HTML and builds a collection of nodes from its tokens.
        /// </summary>
        /// <param name="html">The HTML to be parsed</param>
        /// <returns>
        /// The nodes built from the token stream. Parsing stops early, returning what has been
        /// collected so far, if the tokens end immediately after a "&lt;" or "&lt;/" marker.
        /// </returns>
        public HtmlNodeCollection Parse(string html)
        {
            HtmlNodeCollection nodes = new HtmlNodeCollection(null);

            // Run the script/style pre-processing and both comment-removal passes before tokenizing
            html = PreprocessScript(html, "script");
            html = PreprocessScript(html, "style");
            html = RemoveComments(html);
            html = RemoveSGMLComments(html);

            StringCollection tokens = GetTokens(html);
            int position = 0;

            // The element whose open tag was read last; reset to null by "/>" and by any close tag
            HtmlElement current = null;

            while (position < tokens.Count)
            {
                string token = tokens[position];

                switch (token)
                {
                    case "<":
                    {
                        // Open tag: the next token is the tag name
                        position++;
                        if (position >= tokens.Count)
                        {
                            return nodes;
                        }

                        current = new HtmlElement(tokens[position]);
                        position++;

                        // Attributes run until the tag is closed by ">" or "/>"
                        while (position < tokens.Count && tokens[position] != ">" && tokens[position] != "/>")
                        {
                            string attributeName = tokens[position];
                            position++;

                            if (position >= tokens.Count)
                            {
                                // Tokens ended right after the name: nothing is recorded for it
                                break;
                            }

                            if (tokens[position] == "=")
                            {
                                position++;
                                string rawValue = position < tokens.Count ? tokens[position] : null;
                                position++;

                                HtmlAttribute valued = new HtmlAttribute(attributeName, HtmlEncoder.DecodeValue(rawValue));
                                current.Attributes.Add(valued);
                            }
                            else
                            {
                                // Attribute without a value
                                HtmlAttribute bare = new HtmlAttribute(attributeName, null);
                                current.Attributes.Add(bare);
                            }
                        }

                        nodes.Add(current);

                        if (position < tokens.Count)
                        {
                            if (tokens[position] == "/>")
                            {
                                current.IsTerminated = true;
                                position++;
                                current = null;
                            }
                            else if (tokens[position] == ">")
                            {
                                position++;
                            }
                        }
                        break;
                    }

                    case ">":
                    {
                        // Tag terminator met outside of a tag: skip it
                        position++;
                        break;
                    }

                    case "</":
                    {
                        // Close tag: the next token is the tag name
                        position++;
                        if (position >= tokens.Count)
                        {
                            return nodes;
                        }

                        string closingName = tokens[position];
                        position++;

                        int openIndex = FindTagOpenNodeIndex(nodes, closingName);
                        if (openIndex != -1)
                        {
                            // NOTE(review): presumably re-parents the nodes after the opener under it - confirm in MoveNodesDown
                            MoveNodesDown(ref nodes, openIndex + 1, (HtmlElement)nodes[openIndex]);
                        }
                        // A close tag with no matching open tag is ignored

                        // Skip whatever remains of the close tag, including its ">"
                        while (position < tokens.Count && tokens[position] != ">")
                        {
                            position++;
                        }
                        if (position < tokens.Count)
                        {
                            position++;
                        }

                        current = null;
                        break;
                    }

                    default:
                    {
                        // Anything else is text content
                        string text = token;
                        if (mRemoveEmptyElementText)
                        {
                            text = RemoveWhitespace(text);
                        }
                        text = DecodeScript(text);

                        if (!mRemoveEmptyElementText || text.Length != 0)
                        {
                            bool skipDecoding = current != null && current.NoEscaping;
                            if (!skipDecoding)
                            {
                                text = HtmlEncoder.DecodeValue(text);
                            }
                            nodes.Add(new HtmlText(text));
                        }

                        position++;
                        break;
                    }
                }
            }

            return nodes;
        }
예제 #3
0
        /// <summary>
        /// Tokenizes the given HTML and assembles the tokens into a node collection.
        /// </summary>
        /// <param name="html">The HTML text to parse.</param>
        /// <returns>
        /// The nodes built from the tokens; returned early if the tokens end directly
        /// after a "&lt;" or "&lt;/" marker.
        /// </returns>
        public HtmlNodeCollection Parse(string html)
        {
            HtmlNodeCollection nodes = new HtmlNodeCollection((HtmlElement)null);

            html = this.PreprocessScript(html, "script");
            html = this.PreprocessScript(html, "style");
            html = this.RemoveComments(html);
            html = this.RemoveSGMLComments(html);

            StringCollection tokens = this.GetTokens(html);
            int cursor = 0;

            // Element from the most recent open tag; cleared by "/>" and by every close tag
            HtmlElement lastOpened = null;

            while (cursor < tokens.Count)
            {
                string token = tokens[cursor];

                if (token == ">")
                {
                    // Terminator outside of a tag: nothing to do but move on
                    cursor++;
                    continue;
                }

                if (token == "<")
                {
                    // Open tag: tag name follows
                    cursor++;
                    if (cursor >= tokens.Count)
                    {
                        return nodes;
                    }

                    lastOpened = new HtmlElement(tokens[cursor]);
                    cursor++;

                    // Collect attributes until the tag is closed by ">" or "/>"
                    while (cursor < tokens.Count && tokens[cursor] != ">" && tokens[cursor] != "/>")
                    {
                        string attributeName = tokens[cursor];
                        cursor++;

                        if (cursor >= tokens.Count)
                        {
                            // No token follows the name, so no attribute is recorded
                            break;
                        }

                        if (tokens[cursor] == "=")
                        {
                            cursor++;

                            string rawValue;
                            if (cursor < tokens.Count)
                            {
                                rawValue = tokens[cursor];
                            }
                            else
                            {
                                rawValue = (string)null;
                            }
                            cursor++;

                            HtmlAttribute valued = new HtmlAttribute(attributeName, HtmlEncoder.DecodeValue(rawValue));
                            lastOpened.Attributes.Add(valued);
                        }
                        else
                        {
                            // Attribute with no value
                            HtmlAttribute bare = new HtmlAttribute(attributeName, (string)null);
                            lastOpened.Attributes.Add(bare);
                        }
                    }

                    nodes.Add((HtmlNode)lastOpened);

                    if (cursor < tokens.Count)
                    {
                        if (tokens[cursor] == "/>")
                        {
                            lastOpened.IsTerminated = true;
                            cursor++;
                            lastOpened = null;
                        }
                        else if (tokens[cursor] == ">")
                        {
                            cursor++;
                        }
                    }
                    continue;
                }

                if (token == "</")
                {
                    // Close tag: tag name follows
                    cursor++;
                    if (cursor >= tokens.Count)
                    {
                        return nodes;
                    }

                    string closingName = tokens[cursor];
                    cursor++;

                    int openIndex = this.FindTagOpenNodeIndex(nodes, closingName);
                    if (openIndex != -1)
                    {
                        // NOTE(review): presumably nests the nodes after the opener beneath it - confirm in MoveNodesDown
                        this.MoveNodesDown(ref nodes, openIndex + 1, (HtmlElement)nodes[openIndex]);
                    }

                    // Discard the rest of the close tag, then its ">"
                    while (cursor < tokens.Count && tokens[cursor] != ">")
                    {
                        cursor++;
                    }
                    if (cursor < tokens.Count)
                    {
                        cursor++;
                    }

                    lastOpened = null;
                    continue;
                }

                // Any other token is text content
                string text = token;
                if (this.mRemoveEmptyElementText)
                {
                    text = this.RemoveWhitespace(text);
                }
                text = HtmlParser.DecodeScript(text);

                bool keepText = !this.mRemoveEmptyElementText || text.Length != 0;
                if (keepText)
                {
                    bool skipDecoding = lastOpened != null && lastOpened.NoEscaping;
                    if (!skipDecoding)
                    {
                        text = HtmlEncoder.DecodeValue(text);
                    }
                    nodes.Add((HtmlNode)new HtmlText(text));
                }
                cursor++;
            }

            return nodes;
        }
예제 #4
0
        /// <summary>
        /// Converts the given HTML into a collection of nodes by walking its token stream.
        /// </summary>
        /// <param name="html">The HTML text to parse.</param>
        /// <returns>
        /// The collected nodes. If the tokens run out right after a "&lt;" or "&lt;/" marker,
        /// the nodes gathered up to that point are returned.
        /// </returns>
        public HtmlNodeCollection Parse(string html)
        {
            HtmlNodeCollection parsed = new HtmlNodeCollection(null);

            html = this.PreprocessScript(html, "script");
            html = this.PreprocessScript(html, "style");
            html = this.RemoveComments(html);
            html = this.RemoveSGMLComments(html);

            StringCollection tokens = this.GetTokens(html);

            // Element created by the latest open tag; reset by "/>" and by every close tag
            HtmlElement pending = null;
            int pos = 0;

            while (pos < tokens.Count)
            {
                // Take the current token and step past it; each branch advances further as needed
                string token = tokens[pos++];

                if (string.Equals(token, "<"))
                {
                    // Open tag: the tag name must follow
                    if (pos >= tokens.Count)
                    {
                        return parsed;
                    }

                    pending = new HtmlElement(tokens[pos++]);

                    // Attributes continue until ">" or "/>"
                    while (pos < tokens.Count
                           && !string.Equals(tokens[pos], ">")
                           && !string.Equals(tokens[pos], "/>"))
                    {
                        string attributeName = tokens[pos++];
                        bool hasNext = pos < tokens.Count;

                        if (hasNext && string.Equals(tokens[pos], "="))
                        {
                            pos++;

                            string rawValue = null;
                            if (pos < tokens.Count)
                            {
                                rawValue = tokens[pos];
                            }
                            pos++;

                            HtmlAttribute valued = new HtmlAttribute(attributeName, HtmlEncoder.DecodeValue(rawValue));
                            pending.Attributes.Add(valued);
                        }
                        else if (hasNext)
                        {
                            // Attribute with no value
                            HtmlAttribute bare = new HtmlAttribute(attributeName, null);
                            pending.Attributes.Add(bare);
                        }
                    }

                    parsed.Add(pending);

                    if (pos < tokens.Count)
                    {
                        if (string.Equals(tokens[pos], "/>"))
                        {
                            pending.IsTerminated = true;
                            pos++;
                            pending = null;
                        }
                        else if (string.Equals(tokens[pos], ">"))
                        {
                            pos++;
                        }
                    }
                }
                else if (string.Equals(token, "</"))
                {
                    // Close tag: the tag name must follow
                    if (pos >= tokens.Count)
                    {
                        return parsed;
                    }

                    string closingName = tokens[pos++];

                    int openIndex = this.FindTagOpenNodeIndex(parsed, closingName);
                    if (openIndex != -1)
                    {
                        // NOTE(review): presumably moves the nodes after the opener underneath it - confirm in MoveNodesDown
                        this.MoveNodesDown(ref parsed, openIndex + 1, (HtmlElement)parsed[openIndex]);
                    }

                    // Skip the remainder of the close tag and its ">"
                    while (pos < tokens.Count && !string.Equals(tokens[pos], ">"))
                    {
                        pos++;
                    }
                    if (pos < tokens.Count)
                    {
                        pos++;
                    }

                    pending = null;
                }
                else if (!string.Equals(token, ">"))
                {
                    // Text content (a lone ">" is simply skipped and never reaches this branch)
                    string text = token;
                    if (this.mRemoveEmptyElementText)
                    {
                        text = this.RemoveWhitespace(text);
                    }
                    text = HtmlParser.DecodeScript(text);

                    if (!this.mRemoveEmptyElementText || text.Length != 0)
                    {
                        bool skipDecoding = pending != null && pending.NoEscaping;
                        if (!skipDecoding)
                        {
                            text = HtmlEncoder.DecodeValue(text);
                        }

                        HtmlText textNode = new HtmlText(text);
                        parsed.Add(textNode);
                    }
                }
            }

            return parsed;
        }