/// <summary>
/// Removes everything matching the tag pattern <c>&lt;!*[^&lt;&gt;]*&gt;</c> from the
/// supplied HTML and passes the remainder through <c>HtmlEncoder.DecodeValue</c>.
/// </summary>
/// <param name="html">The HTML to strip tags from.</param>
/// <param name="trimWhiteSpace">
/// When true, the decoded text is additionally passed through <c>TrimWhiteSpace</c>
/// before being returned.
/// </param>
/// <returns>The decoded text that remains after tag removal.</returns>
public static string ExtractTextFromHtml(string html, bool trimWhiteSpace)
{
    Regex tagPattern = new Regex(
        "<!*[^<>]*>",
        RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline | RegexOptions.IgnoreCase);

    // Decoding happens after tag removal in both modes; only the final trim is optional.
    string decoded = HtmlEncoder.DecodeValue(tagPattern.Replace(html, string.Empty));

    return trimWhiteSpace ? TrimWhiteSpace(decoded) : decoded;
}
/// <summary>
/// Parses a string containing HTML and builds the node collection it describes.
/// </summary>
/// <param name="html">The HTML to be parsed</param>
/// <returns>The node collection assembled from the tokenised markup</returns>
public HtmlNodeCollection Parse(string html)
{
    HtmlNodeCollection nodes = new HtmlNodeCollection(null);

    // Pre-processing passes that run before the markup is split into tokens.
    html = PreprocessScript(html, "script");
    html = PreprocessScript(html, "style");
    html = RemoveComments(html);
    html = RemoveSGMLComments(html);

    StringCollection tokens = GetTokens(html);
    int pos = 0;

    // Element whose open tag was read most recently. Reset to null by a
    // self-closing tag or by any close tag; only consulted when deciding
    // whether a text token should be decoded.
    HtmlElement current = null;

    while (pos < tokens.Count)
    {
        switch (tokens[pos])
        {
            case "<":
            {
                // Open tag: the next token is the tag name.
                pos++;
                if (pos >= tokens.Count)
                {
                    return nodes;
                }

                string tagName = tokens[pos];
                pos++;
                current = new HtmlElement(tagName);

                // Attributes run until the tag is closed by ">" or "/>".
                while (pos < tokens.Count && tokens[pos] != ">" && tokens[pos] != "/>")
                {
                    string attrName = tokens[pos];
                    pos++;

                    if (pos >= tokens.Count)
                    {
                        // The name was the very last token: no attribute is recorded.
                        continue;
                    }

                    if (tokens[pos] == "=")
                    {
                        pos++;
                        string rawValue = pos < tokens.Count ? tokens[pos] : null;
                        pos++;

                        HtmlAttribute valued = new HtmlAttribute(attrName, HtmlEncoder.DecodeValue(rawValue));
                        current.Attributes.Add(valued);
                    }
                    else
                    {
                        // Null-value attribute
                        HtmlAttribute bare = new HtmlAttribute(attrName, null);
                        current.Attributes.Add(bare);
                    }
                }

                nodes.Add(current);

                if (pos < tokens.Count)
                {
                    if (tokens[pos] == "/>")
                    {
                        current.IsTerminated = true;
                        pos++;
                        current = null;
                    }
                    else if (tokens[pos] == ">")
                    {
                        pos++;
                    }
                }

                break;
            }

            case ">":
            {
                // Stray closing bracket: skip it.
                pos++;
                break;
            }

            case "</":
            {
                // Close tag: the next token is the tag name.
                pos++;
                if (pos >= tokens.Count)
                {
                    return nodes;
                }

                string closingName = tokens[pos];
                pos++;

                int openIndex = FindTagOpenNodeIndex(nodes, closingName);
                if (openIndex != -1)
                {
                    MoveNodesDown(ref nodes, openIndex + 1, (HtmlElement)nodes[openIndex]);
                }
                // Otherwise there is a close tag without an opening tag; it is ignored.

                // Skip to the end of this tag
                while (pos < tokens.Count && tokens[pos] != ">")
                {
                    pos++;
                }
                if (pos < tokens.Count && tokens[pos] == ">")
                {
                    pos++;
                }

                current = null;
                break;
            }

            default:
            {
                // Anything else is text content.
                string text = tokens[pos];
                if (mRemoveEmptyElementText)
                {
                    text = RemoveWhitespace(text);
                }
                text = DecodeScript(text);

                bool keepText = !mRemoveEmptyElementText || text.Length != 0;
                if (keepText)
                {
                    bool leaveEncoded = current != null && current.NoEscaping;
                    if (!leaveEncoded)
                    {
                        text = HtmlEncoder.DecodeValue(text);
                    }
                    nodes.Add(new HtmlText(text));
                }

                pos++;
                break;
            }
        }
    }

    return nodes;
}
/// <summary>
/// Tokenises the supplied HTML and assembles the resulting tokens into a node collection.
/// </summary>
/// <param name="html">The HTML text to parse.</param>
/// <returns>The node collection built from the token stream.</returns>
public HtmlNodeCollection Parse(string html)
{
    HtmlNodeCollection nodes = new HtmlNodeCollection((HtmlElement)null);

    // Script/style pre-processing and comment removal run before tokenising.
    string prepared = this.PreprocessScript(html, "script");
    prepared = this.PreprocessScript(prepared, "style");
    prepared = this.RemoveComments(prepared);
    prepared = this.RemoveSGMLComments(prepared);
    StringCollection tokens = this.GetTokens(prepared);

    int cursor = 0;
    // Last element opened; cleared by "/>" and by every close tag.
    HtmlElement lastOpened = (HtmlElement)null;

    while (cursor < tokens.Count)
    {
        string token = tokens[cursor];

        // Stray ">" outside a tag: nothing to do but step over it.
        if (">".Equals(token))
        {
            cursor++;
            continue;
        }

        // Open tag: "<" name [attributes] (">" | "/>")
        if ("<".Equals(token))
        {
            cursor++;
            if (cursor >= tokens.Count)
            {
                return nodes;
            }

            string elementName = tokens[cursor];
            cursor++;
            lastOpened = new HtmlElement(elementName);

            while (cursor < tokens.Count)
            {
                string attributeName = tokens[cursor];
                if (">".Equals(attributeName) || "/>".Equals(attributeName))
                {
                    break;
                }
                cursor++;

                // An attribute name that is the final token is not recorded.
                bool hasFollower = cursor < tokens.Count;
                if (hasFollower && "=".Equals(tokens[cursor]))
                {
                    cursor++;
                    string rawValue = (string)null;
                    if (cursor < tokens.Count)
                    {
                        rawValue = tokens[cursor];
                    }
                    cursor++;

                    HtmlAttribute withValue = new HtmlAttribute(attributeName, HtmlEncoder.DecodeValue(rawValue));
                    lastOpened.Attributes.Add(withValue);
                }
                else if (hasFollower)
                {
                    HtmlAttribute withoutValue = new HtmlAttribute(attributeName, (string)null);
                    lastOpened.Attributes.Add(withoutValue);
                }
            }

            nodes.Add((HtmlNode)lastOpened);

            if (cursor < tokens.Count && "/>".Equals(tokens[cursor]))
            {
                // Self-terminated element: mark it and forget it.
                lastOpened.IsTerminated = true;
                cursor++;
                lastOpened = (HtmlElement)null;
            }
            else if (cursor < tokens.Count && ">".Equals(tokens[cursor]))
            {
                cursor++;
            }
            continue;
        }

        // Close tag: "</" name ... ">"
        if ("</".Equals(token))
        {
            cursor++;
            if (cursor >= tokens.Count)
            {
                return nodes;
            }

            string closingName = tokens[cursor];
            cursor++;

            // A close tag with no matching open node (-1) is silently ignored.
            int openerIndex = this.FindTagOpenNodeIndex(nodes, closingName);
            if (openerIndex != -1)
            {
                this.MoveNodesDown(ref nodes, openerIndex + 1, (HtmlElement)nodes[openerIndex]);
            }

            // Discard whatever remains of the close tag, including its ">".
            while (cursor < tokens.Count && !">".Equals(tokens[cursor]))
            {
                cursor++;
            }
            if (cursor < tokens.Count && ">".Equals(tokens[cursor]))
            {
                cursor++;
            }

            lastOpened = (HtmlElement)null;
            continue;
        }

        // Everything else is text.
        string content = token;
        if (this.mRemoveEmptyElementText)
        {
            content = this.RemoveWhitespace(content);
        }
        content = HtmlParser.DecodeScript(content);

        if (!this.mRemoveEmptyElementText || content.Length != 0)
        {
            // Text is decoded unless the last opened element has NoEscaping set.
            bool skipDecoding = lastOpened != null && lastOpened.NoEscaping;
            if (!skipDecoding)
            {
                content = HtmlEncoder.DecodeValue(content);
            }
            HtmlText textNode = new HtmlText(content);
            nodes.Add((HtmlNode)textNode);
        }
        cursor++;
    }

    return nodes;
}
/// <summary>
/// Converts an HTML string into a node collection by pre-processing it, splitting it
/// into tokens and walking those tokens once from start to end.
/// </summary>
/// <param name="html">The HTML text to parse.</param>
/// <returns>The node collection produced from the tokens.</returns>
public HtmlNodeCollection Parse(string html)
{
    HtmlNodeCollection parsed = new HtmlNodeCollection(null);

    html = this.PreprocessScript(html, "script");
    html = this.PreprocessScript(html, "style");
    html = this.RemoveComments(html);
    html = this.RemoveSGMLComments(html);
    StringCollection tokens = this.GetTokens(html);

    // Set when an open tag is read; cleared again by "/>" or by any close tag.
    HtmlElement openElement = null;

    // The position is advanced inside each branch, so the loop has no iterator clause.
    for (int pos = 0; pos < tokens.Count; )
    {
        string token = tokens[pos];

        if (string.Equals(token, "<"))
        {
            // Open tag: tag name first, then attributes up to ">" or "/>".
            pos++;
            if (pos >= tokens.Count)
            {
                return parsed;
            }

            string tagName = tokens[pos];
            pos++;
            openElement = new HtmlElement(tagName);

            while (pos < tokens.Count
                   && !(string.Equals(tokens[pos], ">") || string.Equals(tokens[pos], "/>")))
            {
                string attributeName = tokens[pos];
                pos++;

                // An attribute name with no token after it is not recorded.
                if (pos < tokens.Count)
                {
                    HtmlAttribute attribute;
                    if (string.Equals(tokens[pos], "="))
                    {
                        pos++;
                        string rawValue = pos < tokens.Count ? tokens[pos] : null;
                        pos++;
                        attribute = new HtmlAttribute(attributeName, HtmlEncoder.DecodeValue(rawValue));
                    }
                    else
                    {
                        // Attribute written without a value.
                        attribute = new HtmlAttribute(attributeName, null);
                    }
                    openElement.Attributes.Add(attribute);
                }
            }

            parsed.Add(openElement);

            if (pos < tokens.Count && string.Equals(tokens[pos], "/>"))
            {
                openElement.IsTerminated = true;
                pos++;
                openElement = null;
            }
            else if (pos < tokens.Count && string.Equals(tokens[pos], ">"))
            {
                pos++;
            }
        }
        else if (string.Equals(token, ">"))
        {
            // Unattached ">" token: skip.
            pos++;
        }
        else if (string.Equals(token, "</"))
        {
            // Close tag: tag name first, then everything up to and including ">".
            pos++;
            if (pos >= tokens.Count)
            {
                return parsed;
            }

            string closingName = tokens[pos];
            pos++;

            // -1 means no open node was found for this name; the close tag is then ignored.
            int openerIndex = this.FindTagOpenNodeIndex(parsed, closingName);
            if (openerIndex != -1)
            {
                this.MoveNodesDown(ref parsed, openerIndex + 1, (HtmlElement)parsed[openerIndex]);
            }

            while (pos < tokens.Count && !string.Equals(tokens[pos], ">"))
            {
                pos++;
            }
            if (pos < tokens.Count && string.Equals(tokens[pos], ">"))
            {
                pos++;
            }

            openElement = null;
        }
        else
        {
            // Text token.
            string content = token;
            if (this.mRemoveEmptyElementText)
            {
                content = this.RemoveWhitespace(content);
            }
            content = HtmlParser.DecodeScript(content);

            bool dropEmpty = this.mRemoveEmptyElementText && content.Length == 0;
            if (!dropEmpty)
            {
                // Decoding is skipped only when the open element has NoEscaping set.
                if (openElement == null || !openElement.NoEscaping)
                {
                    content = HtmlEncoder.DecodeValue(content);
                }
                HtmlText textNode = new HtmlText(content);
                parsed.Add(textNode);
            }

            pos++;
        }
    }

    return parsed;
}