/// <summary>
/// Creates an HTML element carrying the given tag name. The element starts out
/// with fresh child-node and attribute collections (each constructed with this
/// element as its argument) and is flagged as not terminated.
/// </summary>
/// <param name="name">The tag name to give this element.</param>
public HtmlElement(string name)
{
    this.mNodes = new HtmlNodeCollection(this);
    this.mAttributes = new HtmlAttributeCollection(this);
    this.mName = name;
    this.mIsTerminated = false;
}
/// <summary>
/// Collects every element in <paramref name="nodes"/>, and in all of their
/// descendants, whose tag name matches <paramref name="name"/> and, optionally,
/// which carries a given attribute value. Matches are returned in document
/// (pre-order) order.
/// </summary>
/// <param name="nodes">The nodes at which the search starts.</param>
/// <param name="name">Tag name to look for. Compared ignoring case, because
/// HTML tag names are case-insensitive and the parser keeps them as written.</param>
/// <param name="attrname">Attribute that must be present, or null to accept
/// every element with a matching tag name.</param>
/// <param name="attrvalue">Exact (case-sensitive) value the attribute must
/// have. Ignored when <paramref name="attrname"/> is null.</param>
/// <returns>A new list holding all matching elements; empty when none match.</returns>
public static List<HtmlElement> FindAll(HtmlNodeCollection nodes, string name, string attrname, string attrvalue)
{
    List<HtmlElement> matches = new List<HtmlElement>();
    CollectMatchingElements(nodes, name, attrname, attrvalue, matches);
    return matches;
}

// Recursive worker for FindAll: appends matches to one shared list so that no
// intermediate list is allocated and copied at every level of the tree.
private static void CollectMatchingElements(HtmlNodeCollection nodes, string name, string attrname, string attrvalue, List<HtmlElement> matches)
{
    foreach (HtmlNode node in nodes)
    {
        HtmlElement elt = node as HtmlElement;
        if (elt == null)
        {
            // Only elements can match or have children worth searching.
            continue;
        }

        // is this a match?
        if (string.Equals(elt.Name, name, System.StringComparison.OrdinalIgnoreCase))
        {
            if (attrname == null)
            {
                matches.Add(elt);
            }
            else
            {
                HtmlAttribute attr = elt.Attributes.FindByName(attrname);
                if (attr != null && attr.Value == attrvalue)
                {
                    matches.Add(elt);
                }
            }
        }

        // look down the branch (a matching element's descendants are searched too)
        CollectMatchingElements(elt.Nodes, name, attrname, attrvalue, matches);
    }
}
/// <summary>
/// Flattens a node collection into a list of every element it contains, at any
/// depth. Each element is listed before its own descendants; nodes that are not
/// elements are skipped.
/// </summary>
/// <param name="coll">The nodes to walk.</param>
/// <returns>A new list of all elements found; empty when there are none.</returns>
public static List<HtmlElement> GetAllElements(HtmlNodeCollection coll)
{
    List<HtmlElement> collected = new List<HtmlElement>();

    foreach (HtmlNode child in coll)
    {
        HtmlElement element = child as HtmlElement;
        if (element == null)
        {
            continue;
        }

        collected.Add(element);

        List<HtmlElement> descendants = GetAllElements(element.Nodes);
        collected.AddRange(descendants);
    }

    return collected;
}
/// <summary>
/// Locates a form and returns the name/value pairs of its <c>input</c> elements.
/// The form is looked up first by its <c>name</c> attribute and, failing that,
/// by its <c>class</c> attribute.
/// </summary>
/// <param name="nodes">The nodes to search for the form.</param>
/// <param name="name">The value of the form's <c>name</c> (or <c>class</c>) attribute.</param>
/// <param name="submits">Receives the name/value pairs of inputs whose type is
/// <c>submit</c>, <c>button</c> or <c>image</c>; null when the form is not found.</param>
/// <param name="action">Receives the value of the form's <c>action</c> attribute;
/// null when the form is not found.</param>
/// <returns>The name/value pairs of all other named inputs, or null when no
/// matching form exists. Only <c>input</c> elements are considered.</returns>
public static Dictionary<string, string> FindForm(HtmlNodeCollection nodes, string name, out Dictionary<string, string> submits, out string action)
{
    HtmlElement form = FindOne(nodes, "form", "name", name);
    if (form == null)
    {
        // try by class
        form = FindOne(nodes, "form", "class", name);
    }
    if (form == null)
    {
        // failed!
        submits = null;
        action = null;
        return null;
    }

    // A form without an action attribute is legal HTML; do not dereference a
    // missing attribute. NOTE(review): assumes the indexer yields null for an
    // absent attribute (as FindByName does) - confirm.
    HtmlAttribute actionAttr = form.Attributes["action"];
    action = actionAttr != null ? actionAttr.Value : null;

    List<HtmlElement> inputs = FindAll(form.Nodes, "input", null, null);

    // fill out dictionaries
    Dictionary<string, string> values = new Dictionary<string, string>();
    submits = new Dictionary<string, string>();
    foreach (HtmlElement input in inputs)
    {
        // Inputs without a usable name contribute nothing; a null name value
        // could not be used as a dictionary key anyway.
        HtmlAttribute attrname = input.Attributes.FindByName("name");
        if (attrname == null || attrname.Value == null)
        {
            continue;
        }

        // A missing value attribute counts as the empty string.
        HtmlAttribute attrvalue = input.Attributes.FindByName("value");
        string value = attrvalue != null ? attrvalue.Value : "";

        // The type attribute is optional in HTML and defaults to "text";
        // previously a missing type caused a NullReferenceException here.
        HtmlAttribute attrtype = input.Attributes.FindByName("type");
        string type = (attrtype != null && attrtype.Value != null) ? attrtype.Value : "text";

        // Ordinal ignore-case comparison avoids culture-dependent lower-casing
        // (e.g. "IMAGE" does not lower-case to "image" under Turkish rules).
        bool isSubmit =
            string.Equals(type, "submit", System.StringComparison.OrdinalIgnoreCase) ||
            string.Equals(type, "button", System.StringComparison.OrdinalIgnoreCase) ||
            string.Equals(type, "image", System.StringComparison.OrdinalIgnoreCase);

        if (isSubmit)
        {
            submits[attrname.Value] = value;
        }
        else
        {
            values[attrname.Value] = value;
        }
    }

    return values;
}
/// <summary>
/// Builds a document by running a fresh <c>HtmlParser</c> over the supplied
/// HTML and keeping the node collection it returns.
/// </summary>
/// <param name="html">The HTML to parse.</param>
/// <param name="wantSpaces">When false, the parser's
/// <c>RemoveEmptyElementText</c> option is switched on; when true it is
/// switched off.</param>
internal HtmlDocument(string html, bool wantSpaces)
{
    bool dropEmptyText = !wantSpaces;
    HtmlParser htmlParser = new HtmlParser { RemoveEmptyElementText = dropEmptyText };
    this.mNodes = htmlParser.Parse(html);
}
/// <summary>
/// This will parse a string containing HTML and will produce a tree of nodes.
/// The input is pre-processed (script and style handling, comment removal),
/// split into tokens, and the tokens are then read in a single pass. Nodes are
/// first appended to one flat collection; whenever a close tag is read, the
/// nodes that follow its matching open tag are moved underneath that element.
/// </summary>
/// <param name="html">The HTML to be parsed</param>
/// <returns>The collection of top-level nodes of the resulting tree</returns>
public HtmlNodeCollection Parse(string html)
{
    HtmlNodeCollection nodes = new HtmlNodeCollection(null);

    // Pre-processing passes over the raw text before it is tokenized.
    html = PreprocessScript( html ,"script" );
    html = PreprocessScript( html ,"style" );
    html = RemoveComments( html );
    html = RemoveSGMLComments( html );
    StringCollection tokens = GetTokens( html );

    int index = 0;
    // The element whose open tag was read most recently. It is reset to null
    // after a self-closing tag or a close tag, and is consulted below to decide
    // whether following text should skip HTML decoding (NoEscaping).
    HtmlElement element = null;
    while( index < tokens.Count )
    {
        if( "<".Equals( tokens[index] ) )
        {
            // Read open tag
            index++;
            // Input ended right after "<": stop parsing.
            if( index >= tokens.Count ) break;
            string tag_name = tokens[index];
            index++;
            element = new HtmlElement( tag_name );
            // read the attributes and values, up to the closing ">" or "/>"
            while( index < tokens.Count && ! ">".Equals( tokens[index] ) && ! "/>".Equals( tokens[index] ) )
            {
                string attribute_name = tokens[ index ];
                index++;
                if( index < tokens.Count && "=".Equals( tokens[ index ] ) )
                {
                    // name = value form
                    index++;
                    string attribute_value;
                    if( index < tokens.Count )
                    {
                        attribute_value = tokens[ index ];
                    }
                    else
                    {
                        // "=" was the last token: there is no value to read
                        attribute_value = null;
                    }
                    index++;
                    HtmlAttribute attribute = new HtmlAttribute( attribute_name , HtmlEncoder.DecodeValue( attribute_value ) );
                    element.Attributes.Add( attribute );
                }
                else if( index < tokens.Count )
                {
                    // Null-value attribute (a bare name with no "=")
                    HtmlAttribute attribute = new HtmlAttribute( attribute_name , null );
                    element.Attributes.Add( attribute );
                }
            }
            // The element is appended to the flat list now; its children are
            // attached later, when (and if) its close tag is read.
            nodes.Add( element );
            if( index < tokens.Count && "/>".Equals( tokens[ index ] ) )
            {
                // Self-closing tag: mark it terminated and stop tracking it
                element.IsTerminated = true;
                index++;
                element = null;
            }
            else if( index < tokens.Count && ">".Equals( tokens[ index ] ) )
            {
                index++;
            }
        }
        else if( ">".Equals( tokens[index] ) )
        {
            // A ">" outside of any tag is skipped
            index++;
        }
        else if( "</".Equals( tokens[index] ) )
        {
            // Read close tag
            index++;
            // Input ended right after "</": stop parsing.
            if( index >= tokens.Count ) break;
            string tag_name = tokens[index];
            index++;

            int open_index = FindTagOpenNodeIndex( nodes , tag_name );
            if( open_index != -1 )
            {
                // Everything read after the open tag becomes its children
                MoveNodesDown( ref nodes , open_index + 1 , (HtmlElement)nodes[open_index] );
            }
            else
            {
                // There is a close tag without an opening tag; it is ignored
            }

            // Skip to the end of this tag
            while( index < tokens.Count && ! ">".Equals( tokens[ index ] ) )
            {
                index++;
            }
            if( index < tokens.Count && ">".Equals( tokens[ index ] ) )
            {
                index++;
            }

            element = null;
        }
        else
        {
            // Read text
            string value = tokens[ index ];
            if( mRemoveEmptyElementText )
            {
                value = RemoveWhitespace( value );
            }
            value = DecodeScript( value );

            if( mRemoveEmptyElementText && value.Length == 0 )
            {
                // Empty text is dropped when empty-text removal is enabled
            }
            else
            {
                // Text is HTML-decoded unless it directly follows the open tag
                // of an element flagged NoEscaping
                if( ! ( element != null && element.NoEscaping ) )
                {
                    value = HtmlEncoder.DecodeValue( value );
                }
                HtmlText node = new HtmlText( value );
                nodes.Add( node );
            }
            index++;
        }
    }
    return nodes;
}
/// <summary>
/// This will find the corresponding opening tag for the named one. This is identified as
/// the most recently read element with the same name (ignoring case) that has no child
/// nodes and has not been marked as terminated.
/// </summary>
/// <param name="nodes">The collection of nodes</param>
/// <param name="name">The name of the tag</param>
/// <returns>The index of the opening tag, or -1 if it was not found</returns>
private int FindTagOpenNodeIndex(HtmlNodeCollection nodes,string name)
{
    // Walk backwards so the most recently read candidate wins.
    for( int index = nodes.Count - 1 ; index >= 0 ; index-- )
    {
        HtmlElement candidate = nodes[index] as HtmlElement;
        if( candidate == null )
        {
            continue;
        }

        // Ordinal ignore-case comparison: tag names are not linguistic text, and
        // culture-sensitive lower-casing mis-compares e.g. "TITLE" and "title"
        // under Turkish casing rules.
        bool sameName = string.Equals( candidate.Name , name , System.StringComparison.OrdinalIgnoreCase );
        if( sameName && candidate.Nodes.Count == 0 && candidate.IsTerminated == false )
        {
            return index;
        }
    }
    return -1;
}
/// <summary>
/// Re-homes the tail of a node collection: every node from the given index to the
/// end is appended to the new parent's child nodes and has its parent set, and is
/// then removed from the original collection. Finally the new parent is flagged as
/// explicitly terminated.
/// </summary>
/// <param name="nodes">The collection the nodes are taken from</param>
/// <param name="node_index">The index (in the above collection) of the first node to move</param>
/// <param name="new_parent">The element that receives the moved nodes as children</param>
private void MoveNodesDown(ref HtmlNodeCollection nodes,int node_index,HtmlElement new_parent)
{
    // Pass 1: attach each trailing node to its new parent.
    int position = node_index;
    while( position < nodes.Count )
    {
        var moved = nodes[position];
        new_parent.Nodes.Add( moved );
        moved.SetParent( new_parent );
        position++;
    }

    // Pass 2: drop the moved nodes from the source collection. Removal always
    // happens at node_index because the remaining nodes shift down each time.
    int remaining = nodes.Count - node_index;
    while( remaining > 0 )
    {
        nodes.RemoveAt( node_index );
        remaining--;
    }

    new_parent.IsExplicitlyTerminated = true;
}
/// <summary>
/// Renders a node collection as plain text. Text nodes have carriage returns,
/// line feeds and tabs turned into spaces and every run of such whitespace
/// collapsed to a single space; elements contribute the text of their children,
/// and a line break is emitted after each <c>br</c> and <c>div</c> element.
/// A piece of text never adds a leading space directly after a trailing space.
/// </summary>
/// <param name="nodes">The nodes to render.</param>
/// <returns>The concatenated plain text; empty when there are no nodes.</returns>
public static string ToText(HtmlNodeCollection nodes)
{
    StringBuilder result = new StringBuilder();
    foreach (HtmlNode node in nodes)
    {
        HtmlElement elt = node as HtmlElement;
        if (elt != null)
        {
            AppendAvoidingDoubleSpace(result, ToText(elt.Nodes));

            // Tag names are compared ordinally, ignoring case, so the result
            // does not depend on the current culture's casing rules.
            if (string.Equals(elt.Name, "br", System.StringComparison.OrdinalIgnoreCase) ||
                string.Equals(elt.Name, "div", System.StringComparison.OrdinalIgnoreCase))
            {
                result.Append("\n");
            }
        }
        else
        {
            AppendAvoidingDoubleSpace(result, CollapseWhitespace(node.ToString()));
        }
    }
    return result.ToString();
}

// Turns every run of spaces, CRs, LFs and tabs into one space, in a single pass.
// The previous implementation replaced a single space with a single space, which
// changed nothing, so runs of whitespace were never actually collapsed.
private static string CollapseWhitespace(string text)
{
    StringBuilder collapsed = new StringBuilder(text.Length);
    bool previousWasSpace = false;
    foreach (char c in text)
    {
        bool isSpace = c == ' ' || c == '\r' || c == '\n' || c == '\t';
        if (!isSpace)
        {
            collapsed.Append(c);
        }
        else if (!previousWasSpace)
        {
            collapsed.Append(' ');
        }
        previousWasSpace = isSpace;
    }
    return collapsed.ToString();
}

// Appends text to result, dropping the text's leading space when result already
// ends in one. Inspects the builder's last character directly instead of
// converting the whole builder to a string on every call.
private static void AppendAvoidingDoubleSpace(StringBuilder result, string text)
{
    bool textStartsWithSpace = text.Length > 0 && text[0] == ' ';
    bool resultEndsWithSpace = result.Length > 0 && result[result.Length - 1] == ' ';
    if (textStartsWithSpace && resultEndsWithSpace)
    {
        text = text.Substring(1);
    }
    result.Append(text);
}
/// <summary>
/// Finds the first element, in document (pre-order) order, whose tag name
/// matches <paramref name="name"/> and, optionally, which carries a given
/// attribute value. The search covers the supplied nodes and all of their
/// descendants.
/// </summary>
/// <param name="nodes">The nodes at which the search starts.</param>
/// <param name="name">Tag name to look for. Compared ignoring case, because
/// HTML tag names are case-insensitive and the parser keeps them as written.</param>
/// <param name="attrname">Attribute that must be present, or null to accept
/// the first element with a matching tag name.</param>
/// <param name="attrvalue">Exact (case-sensitive) value the attribute must
/// have. Ignored when <paramref name="attrname"/> is null.</param>
/// <returns>The first matching element, or null when there is none.</returns>
public static HtmlElement FindOne(HtmlNodeCollection nodes, string name, string attrname, string attrvalue)
{
    foreach (HtmlNode node in nodes)
    {
        HtmlElement elt = node as HtmlElement;
        if (elt == null)
        {
            // Only elements can match or have children worth searching.
            continue;
        }

        // is this a match?
        if (string.Equals(elt.Name, name, System.StringComparison.OrdinalIgnoreCase))
        {
            if (attrname == null)
            {
                return elt;
            }

            HtmlAttribute attr = elt.Attributes.FindByName(attrname);
            if (attr != null && attr.Value == attrvalue)
            {
                return elt;
            }
        }

        // look down the branch
        HtmlElement result = FindOne(elt.Nodes, name, attrname, attrvalue);
        if (result != null)
        {
            return result;
        }
    }

    // not found in this branch
    return null;
}
/// <summary>
/// This will search through this collection of nodes for all elements that have
/// an attribute with the specified name and value. Both the name and the value
/// are compared ignoring case. For each element only the first attribute with a
/// matching name is considered.
/// </summary>
/// <param name="attributeName">The name of the attribute to look for</param>
/// <param name="attributeValue">The value the attribute must have. Null matches
/// attributes that have no value.</param>
/// <param name="searchChildren">True if you want to search sub-nodes, False to
/// only search this collection.</param>
/// <returns>A new collection of all the nodes that match, each element being
/// listed before any matches among its descendants.</returns>
public HtmlNodeCollection FindByAttributeNameValue(string attributeName,string attributeValue,bool searchChildren)
{
    HtmlNodeCollection results = new HtmlNodeCollection(null);
    foreach( HtmlNode node in base.List )
    {
        HtmlElement element = node as HtmlElement;
        if( element == null )
        {
            continue;
        }

        foreach( HtmlAttribute attribute in element.Attributes )
        {
            // The static string.Equals is null-safe: attributes written without
            // a value are stored with a null value, which used to make the
            // value comparison throw a NullReferenceException.
            if( string.Equals( attribute.Name , attributeName , System.StringComparison.OrdinalIgnoreCase ) )
            {
                if( string.Equals( attribute.Value , attributeValue , System.StringComparison.OrdinalIgnoreCase ) )
                {
                    results.Add( node );
                }
                // Only the first attribute with this name decides the match.
                break;
            }
        }

        if( searchChildren )
        {
            foreach( HtmlNode matchedChild in element.Nodes.FindByAttributeNameValue( attributeName , attributeValue , searchChildren ) )
            {
                results.Add( matchedChild );
            }
        }
    }
    return results;
}
/// <summary>
/// This will search through this collection of nodes for all elements with the
/// specified name, ignoring case. If you want to search the subnodes recursively,
/// you should pass True as the parameter in searchChildren. This search is
/// guaranteed to return nodes in the order in which they are found in the document.
/// </summary>
/// <param name="name">The name of the element to find</param>
/// <param name="searchChildren">True if you want to search sub-nodes, False to
/// only search this collection.</param>
/// <returns>A new collection of all the nodes that match.</returns>
public HtmlNodeCollection FindByName(string name,bool searchChildren)
{
    HtmlNodeCollection results = new HtmlNodeCollection(null);
    foreach( HtmlNode node in base.List )
    {
        HtmlElement element = node as HtmlElement;
        if( element == null )
        {
            continue;
        }

        // Ordinal ignore-case comparison: element names are identifiers, not
        // linguistic text, so the result must not depend on the current culture
        // (culture-sensitive lower-casing mis-compares "TITLE" and "title"
        // under Turkish casing rules).
        if( string.Equals( element.Name , name , System.StringComparison.OrdinalIgnoreCase ) )
        {
            results.Add( node );
        }

        if( searchChildren )
        {
            foreach( HtmlNode matchedChild in element.Nodes.FindByName( name , searchChildren ) )
            {
                results.Add( matchedChild );
            }
        }
    }
    return results;
}