/// <summary>
/// Parses a string containing HTML and produces a collection of nodes
/// representing its DOM structure.
/// </summary>
/// <param name="html">The HTML to be parsed.</param>
/// <param name="isRemoveEmptyElementText">
/// When <c>true</c>, every text token is passed through <c>RemoveWhitespace</c>
/// and text that ends up empty is not added to the result. A plain DOM
/// extraction yields many text nodes holding only whitespace (carriage returns
/// etc.); this option drops them.
/// </param>
/// <returns>The collection of parsed nodes.</returns>
public static XhtmlTCollection<XhtmlElement> Parse(string html, bool isRemoveEmptyElementText)
{
    XhtmlTCollection<XhtmlElement> nodes = new XhtmlTCollection<XhtmlElement>();

    // Pre-process script/style sections and strip comments before tokenizing.
    string prepared = PreprocessScript(html, "script");
    prepared = PreprocessScript(prepared, "style");
    prepared = RemoveComments(prepared);
    prepared = RemoveSGMLComments(prepared);

    StringCollection tokens = GetTokens(prepared);

    // The most recently opened (and not yet closed) section; used as the
    // owner passed to text nodes that follow it.
    XhtmlSection current = null;
    int cursor = 0;

    while (cursor < tokens.Count)
    {
        switch (tokens[cursor])
        {
            case "<":
            {
                // Opening tag: the next token is the tag name.
                cursor++;
                if (cursor >= tokens.Count)
                {
                    return nodes;
                }

                string openTagName = tokens[cursor];
                cursor++;
                current = new XhtmlSection(openTagName);

                // Consume attributes until the tag terminator (or end of input).
                while (cursor < tokens.Count
                       && tokens[cursor] != ">"
                       && tokens[cursor] != "/>")
                {
                    string attributeName = tokens[cursor];
                    cursor++;

                    bool hasMoreTokens = cursor < tokens.Count;
                    if (hasMoreTokens && tokens[cursor] == "=")
                    {
                        // name = value; the value is null when input ends after '='.
                        cursor++;
                        string rawValue = cursor < tokens.Count ? tokens[cursor] : null;
                        cursor++;
                        current.Attributes.Add(
                            new XhtmlAttribute(attributeName, XhtmlEncoder.Decode(rawValue)));
                    }
                    else if (hasMoreTokens)
                    {
                        // Attribute without a value.
                        current.Attributes.Add(new XhtmlAttribute(attributeName, null));
                    }
                }

                nodes.Add(current);

                // The attribute loop stops only at ">" , "/>" or end of input,
                // so any remaining token here is one of the two terminators.
                if (cursor < tokens.Count)
                {
                    if (tokens[cursor] == "/>")
                    {
                        // Self-terminated element: it cannot own following text.
                        current.IsTerminated = true;
                        current = null;
                    }
                    cursor++;
                }
                break;
            }

            case ">":
            {
                // Stray terminator; skip it.
                cursor++;
                break;
            }

            case "</":
            {
                // Closing tag: the next token is the tag name.
                cursor++;
                if (cursor >= tokens.Count)
                {
                    return nodes;
                }

                string closeTagName = tokens[cursor];
                cursor++;

                int openIndex = FindTagOpenNodeIndex(nodes, closeTagName);
                if (openIndex != -1)
                {
                    MoveNodesDown(ref nodes, openIndex + 1, (XhtmlElement)nodes[openIndex]);
                }
                // A close tag with no matching open tag is silently ignored.

                // Skip everything up to and including the closing ">".
                while (cursor < tokens.Count && tokens[cursor] != ">")
                {
                    cursor++;
                }
                if (cursor < tokens.Count)
                {
                    cursor++;
                }

                current = null;
                break;
            }

            default:
            {
                // Text content.
                string text = tokens[cursor];
                if (isRemoveEmptyElementText)
                {
                    text = RemoveWhitespace(text);
                }
                text = DecodeScript(text);

                if (!isRemoveEmptyElementText || text.Length != 0)
                {
                    // Entity-decode unless the owning section opts out of escaping.
                    if (current == null || !current.NoEscaping)
                    {
                        text = XhtmlEncoder.Decode(text);
                    }
                    nodes.Add(new XhtmlText(current, text));
                }

                cursor++;
                break;
            }
        }
    }

    return nodes;
}
/// <summary>
/// Creates a text node.
/// </summary>
/// <param name="text">The text of the text node.</param>
/// <returns>A new <c>XhtmlText</c> constructed from this instance and <paramref name="text"/>.</returns>
public XhtmlText CreateXhtmlText(string text)
{
    return new XhtmlText(this, text);
}