Beispiel #1
0
 /// <summary>
 /// Creates an element carrying the given tag name. The element receives a new
 /// child-node collection and a new attribute collection, both constructed with
 /// this element as their argument, and starts out flagged as not terminated.
 /// </summary>
 /// <param name="name">Tag name to store on the element</param>
 public HtmlElement(string name)
 {
     // Collections are created first, each one handed a reference to this element.
     this.mNodes = new HtmlNodeCollection(this);
     this.mAttributes = new HtmlAttributeCollection(this);

     // Plain state.
     this.mIsTerminated = false;
     this.mName = name;
 }
Beispiel #2
0
        /// <summary>
        /// Walks the node tree and gathers every element whose name equals
        /// <paramref name="name"/> (exact, case-sensitive comparison). When
        /// <paramref name="attrname"/> is non-null the element must also carry that
        /// attribute with a value equal to <paramref name="attrvalue"/>.
        /// An element is listed before any matches found beneath it.
        /// </summary>
        /// <param name="nodes">Nodes to search; children of elements are searched recursively</param>
        /// <param name="name">Element name to look for</param>
        /// <param name="attrname">Attribute that must be present, or null to match on name alone</param>
        /// <param name="attrvalue">Required value of that attribute (ignored when attrname is null)</param>
        /// <returns>All matching elements; empty when there are none</returns>
        public static List<HtmlElement> FindAll(HtmlNodeCollection nodes, string name, string attrname, string attrvalue)
        {
            List<HtmlElement> matches = new List<HtmlElement>();

            foreach (HtmlNode candidate in nodes)
            {
                // Only elements can match or have children worth descending into.
                if (!(candidate is HtmlElement))
                    continue;

                HtmlElement element = (HtmlElement)candidate;

                bool isMatch = false;
                if (element.Name == name)
                {
                    if (attrname == null)
                    {
                        isMatch = true;
                    }
                    else
                    {
                        HtmlAttribute found = element.Attributes.FindByName(attrname);
                        isMatch = found != null && found.Value == attrvalue;
                    }
                }

                if (isMatch)
                    matches.Add(element);

                // Descend regardless of whether this element itself matched.
                List<HtmlElement> nested = FindAll(element.Nodes, name, attrname, attrvalue);
                matches.AddRange(nested);
            }

            return matches;
        }
Beispiel #3
0
 /// <summary>
 /// Flattens a node tree into a list of its elements. Each element is added
 /// before the elements found in its own child collection; nodes that are not
 /// elements are skipped.
 /// </summary>
 /// <param name="coll">Nodes to flatten</param>
 /// <returns>Every element reachable from <paramref name="coll"/></returns>
 public static List<HtmlElement> GetAllElements(HtmlNodeCollection coll)
 {
     List<HtmlElement> collected = new List<HtmlElement>();
     foreach (HtmlNode current in coll)
     {
         if (!(current is HtmlElement))
         {
             continue;
         }

         HtmlElement element = (HtmlElement)current;
         collected.Add(element);

         List<HtmlElement> descendants = GetAllElements(element.Nodes);
         collected.AddRange(descendants);
     }
     return collected;
 }
Beispiel #4
0
        /// <summary>
        /// Finds a form and returns the name/value pairs of its input elements.
        /// The form is looked up first by its "name" attribute and, failing that,
        /// by its "class" attribute.
        /// </summary>
        /// <param name="nodes">Nodes to search for the form</param>
        /// <param name="name">Value of the form's "name" (or "class") attribute</param>
        /// <param name="submits">
        /// Receives the inputs whose type is submit, button or image, keyed by input
        /// name; null when the form is not found
        /// </param>
        /// <param name="action">
        /// Receives the form's "action" attribute value; null when the form is not
        /// found or the attribute lookup yields null
        /// </param>
        /// <returns>
        /// All other named inputs keyed by name, or null when no form matched.
        /// If an input name occurs more than once, the last occurrence wins.
        /// </returns>
        public static Dictionary<string, string> FindForm(HtmlNodeCollection nodes, string name, out Dictionary<string, string> submits, out string action)
        {
            HtmlElement form = FindOne(nodes, "form", "name", name);
            if (form == null)
            {
                // try by class
                form = FindOne(nodes, "form", "class", name);
            }
            if (form == null)
            {
                submits = null;
                action = null;
                return null;    // failed!
            }

            // "action" is optional on a form, so guard the dereference.
            // NOTE(review): assumes the indexer yields null for a missing attribute,
            // as FindByName does - confirm against HtmlAttributeCollection.
            HtmlAttribute attraction = form.Attributes["action"];
            action = attraction == null ? null : attraction.Value;

            List<HtmlElement> inputs = FindAll(form.Nodes, "input", null, null);

            // fill out dictionary
            Dictionary<string, string> values = new Dictionary<string, string>();
            submits = new Dictionary<string, string>();
            foreach (HtmlElement input in inputs)
            {
                // Inputs without a usable name cannot be submitted; a null name
                // would also be rejected as a dictionary key.
                HtmlAttribute attrname = input.Attributes.FindByName("name");
                if (attrname == null || attrname.Value == null)
                    continue;

                HtmlAttribute attrvalue = input.Attributes.FindByName("value");
                string value = attrvalue == null ? "" : attrvalue.Value;

                // An input with no type attribute behaves as a text input in HTML.
                HtmlAttribute attrtype = input.Attributes.FindByName("type");
                string type = (attrtype == null || attrtype.Value == null) ? "text" : attrtype.Value;

                // Ordinal comparison: attribute keywords are not linguistic text, and
                // culture-sensitive lower-casing breaks "IMAGE" under Turkish rules.
                bool isSubmitLike =
                    string.Equals(type, "submit", System.StringComparison.OrdinalIgnoreCase) ||
                    string.Equals(type, "button", System.StringComparison.OrdinalIgnoreCase) ||
                    string.Equals(type, "image", System.StringComparison.OrdinalIgnoreCase);

                if (isSubmitLike)
                    submits[attrname.Value] = value;
                else
                    values[attrname.Value] = value;
            }

            return values;
        }
Beispiel #5
0
		/// <summary>
		/// Builds a document by running the supplied HTML through a newly created
		/// <see cref="HtmlParser"/> and keeping the node collection it returns.
		/// </summary>
		/// <param name="html">The HTML to parse.</param>
		/// <param name="wantSpaces">
		/// The parser's RemoveEmptyElementText option is set to the negation of this
		/// flag: pass true to leave that option off, false to turn it on.
		/// </param>
		internal HtmlDocument(string html,bool wantSpaces)
		{
			bool removeEmptyText = !wantSpaces;

			HtmlParser htmlParser = new HtmlParser();
			htmlParser.RemoveEmptyElementText = removeEmptyText;

			mNodes = htmlParser.Parse( html );
		}
Beispiel #6
0
		/// <summary>
		/// This will parse a string containing HTML and will produce a document tree.
		/// The input is first passed through PreprocessScript (for "script" and
		/// "style"), RemoveComments and RemoveSGMLComments, then split into tokens.
		/// The token list is walked once: open tags and text are appended to a flat
		/// node list, and each close tag moves the nodes that follow its matching
		/// open element underneath that element.
		/// </summary>
		/// <param name="html">The HTML to be parsed</param>
		/// <returns>A tree representing the elements</returns>
		public HtmlNodeCollection Parse(string html)
		{
			HtmlNodeCollection nodes = new HtmlNodeCollection(null);

			html = PreprocessScript( html ,"script" );
			html = PreprocessScript( html ,"style" );

			html = RemoveComments( html );
			html = RemoveSGMLComments( html );
			StringCollection tokens = GetTokens( html );

			int index = 0;
			// The most recently opened element that was neither self-closed nor followed
			// by a close tag; consulted below to decide whether text gets decoded.
			HtmlElement element = null;
			while( index < tokens.Count )
			{
				if( "<".Equals( tokens[index] ) )
				{
					// Read open tag

					index++;
					if( index >= tokens.Count ) break;
					string tag_name = tokens[index];
					index++;
					element = new HtmlElement( tag_name );
					// read the attributes and values

					while( index < tokens.Count && ! ">".Equals( tokens[index] ) && ! "/>".Equals( tokens[index] ) )
					{
						string attribute_name = tokens[ index ];
						index++;
						if( index < tokens.Count && "=".Equals( tokens[ index ] ) )
						{
							index++;
							string attribute_value;
							if( index < tokens.Count )
							{
								attribute_value = tokens[ index ];
							}
							else
							{
								// Input ended right after '=': there is no value token to read.
								attribute_value = null;
							}
							index++;
							HtmlAttribute attribute = new HtmlAttribute( attribute_name , HtmlEncoder.DecodeValue( attribute_value ) );
							element.Attributes.Add( attribute );
						}
						else if( index < tokens.Count )
						{
							// Null-value attribute (name with no '=' after it). A name that is the
							// very last token of the input is dropped because of the bounds check.
							HtmlAttribute attribute = new HtmlAttribute( attribute_name , null );
							element.Attributes.Add( attribute );
						}
					}
					// The element is appended flat; nesting is only established when a close tag is read.
					nodes.Add( element );
					if( index < tokens.Count && "/>".Equals( tokens[ index ] ) )
					{
						// Self-closing tag: mark it terminated and stop tracking it as the open element.
						element.IsTerminated = true;
						index++;
						element = null;
					}
					else if( index < tokens.Count && ">".Equals( tokens[ index ] ) )
					{
						index++;
					}
				}
				else if( ">".Equals( tokens[index] ) )
				{
					// A '>' outside of any tag is skipped.
					index++;
				}
				else if( "</".Equals( tokens[index] ) )
				{
					// Read close tag
					index++;
					if( index >= tokens.Count ) break;
					string tag_name = tokens[index];
					index++;

					int open_index = FindTagOpenNodeIndex( nodes , tag_name );
					if( open_index != -1 )
					{
						// Everything after the open element in the flat list becomes its children.
						MoveNodesDown( ref nodes , open_index + 1 , (HtmlElement)nodes[open_index] );
					}
					else
					{
						// Close tag with no matching open element: nothing is moved.
					}

					// Skip to the end of this tag
					while( index < tokens.Count && ! ">".Equals( tokens[ index ] ) )
					{
						index++;
					}
					if( index < tokens.Count && ">".Equals( tokens[ index ] ) )
					{
						index++;
					}

					element = null;
				}
				else
				{
					// Read text
					string value = tokens[ index ];
					if( mRemoveEmptyElementText )
					{
						value = RemoveWhitespace( value );
					}
					// NOTE(review): presumably undoes the encoding applied by PreprocessScript - confirm.
					value = DecodeScript( value );

					if( mRemoveEmptyElementText && value.Length == 0 )
					{
						// We do nothing
					}
					else
					{
						// Entity decoding is skipped only when the current open element has NoEscaping set.
						if( ! ( element != null && element.NoEscaping ) )
						{
							value = HtmlEncoder.DecodeValue( value );
						}
						HtmlText node = new HtmlText( value );
						nodes.Add( node );
					}
					index++;
				}
			}
			return nodes;
		}
Beispiel #7
0
		/// <summary>
		/// This will find the corresponding opening tag for the named one. This is identified as
		/// the most recently read element with the same name (ignoring case) that has no child
		/// nodes and is not marked as terminated.
		/// </summary>
		/// <param name="nodes">The collection of nodes</param>
		/// <param name="name">The name of the tag</param>
		/// <returns>The index of the opening tag, or -1 if it was not found</returns>
		private int FindTagOpenNodeIndex(HtmlNodeCollection nodes,string name)
		{
			// Scan backwards so the most recently added candidate wins.
			for( int index = nodes.Count - 1 ; index >= 0 ; index-- )
			{
				HtmlElement candidate = nodes[index] as HtmlElement;
				if( candidate == null )
				{
					continue;
				}

				// Tag names are identifiers, not linguistic text: compare ordinally so the
				// result does not depend on the current culture (e.g. Turkish 'I' casing).
				bool sameName = string.Equals( candidate.Name , name , System.StringComparison.OrdinalIgnoreCase );
				if( sameName && candidate.Nodes.Count == 0 && candidate.IsTerminated == false )
				{
					return index;
				}
			}
			return -1;
		}
Beispiel #8
0
		/// <summary>
		/// Re-homes the tail of a node collection under a new parent element: every node
		/// from <paramref name="node_index"/> to the end is added to the parent's child
		/// collection and has its parent set, then that tail is removed from
		/// <paramref name="nodes"/>. Finally the parent is flagged as explicitly terminated.
		/// </summary>
		/// <param name="nodes">The collection of nodes</param>
		/// <param name="node_index">The index of the first node (in the above collection) to move</param>
		/// <param name="new_parent">The node which will become the parent of the moved nodes</param>
		private void MoveNodesDown(ref HtmlNodeCollection nodes,int node_index,HtmlElement new_parent)
		{
			// First pass: attach each trailing node to the new parent.
			int position = node_index;
			while( position < nodes.Count )
			{
				new_parent.Nodes.Add( nodes[position] );
				nodes[position].SetParent( new_parent );
				position++;
			}

			// Second pass: drop the tail. The count is fixed up front and the removal
			// always targets the same slot, since later nodes shift down into it.
			int remaining = nodes.Count - node_index;
			while( remaining > 0 )
			{
				nodes.RemoveAt( node_index );
				remaining--;
			}

			new_parent.IsExplicitlyTerminated = true;
		}
Beispiel #9
0
        /// <summary>
        /// Renders a node tree as plain text. Elements contribute the text of their
        /// children, followed by a newline for "br" and "div" (case-insensitive).
        /// Other nodes contribute their ToString() value with carriage returns, line
        /// feeds and tabs turned into spaces and runs of spaces collapsed to one.
        /// A piece that begins with a space is trimmed by one character when the
        /// output so far already ends with a space.
        /// </summary>
        /// <param name="nodes">Nodes to render</param>
        /// <returns>The accumulated text</returns>
        public static string ToText(HtmlNodeCollection nodes)
        {
            StringBuilder result = new StringBuilder();

            foreach (HtmlNode node in nodes)
            {
                HtmlElement elt = node as HtmlElement;
                if (elt != null)
                {
                    AppendAvoidingDoubleSpace(result, ToText(elt.Nodes));

                    // Ordinal comparison: tag names are identifiers, so the check must
                    // not depend on the current culture's casing rules.
                    if (string.Equals(elt.Name, "br", System.StringComparison.OrdinalIgnoreCase) ||
                        string.Equals(elt.Name, "div", System.StringComparison.OrdinalIgnoreCase))
                    {
                        result.Append("\n");
                    }
                }
                else
                {
                    string text = node.ToString();
                    text = text.Replace("\r", " ").Replace("\n", " ").Replace("\t", " ");

                    // One Replace pass only halves a run of spaces, so repeat until stable.
                    while (text.Contains("  "))
                    {
                        text = text.Replace("  ", " ");
                    }

                    AppendAvoidingDoubleSpace(result, text);
                }
            }

            return result.ToString();
        }

        /// <summary>
        /// Appends text to the buffer, skipping its leading space when the buffer
        /// already ends with one.
        /// </summary>
        private static void AppendAvoidingDoubleSpace(StringBuilder result, string text)
        {
            // Inspect the last buffered character directly rather than materialising
            // the whole buffer with ToString() for every node.
            bool bufferEndsWithSpace = result.Length > 0 && result[result.Length - 1] == ' ';
            int start = (bufferEndsWithSpace && text.Length > 0 && text[0] == ' ') ? 1 : 0;
            result.Append(text, start, text.Length - start);
        }
Beispiel #10
0
        /// <summary>
        /// Returns the first element named <paramref name="name"/> (exact,
        /// case-sensitive comparison) found in the tree. When
        /// <paramref name="attrname"/> is non-null the element must also carry that
        /// attribute with a value equal to <paramref name="attrvalue"/>. Each element
        /// is tested before its own children, and its children before its later siblings.
        /// </summary>
        /// <param name="nodes">Nodes to search; children of elements are searched recursively</param>
        /// <param name="name">Element name to look for</param>
        /// <param name="attrname">Attribute that must be present, or null to match on name alone</param>
        /// <param name="attrvalue">Required value of that attribute (ignored when attrname is null)</param>
        /// <returns>The first matching element, or null when there is none</returns>
        public static HtmlElement FindOne(HtmlNodeCollection nodes, string name, string attrname, string attrvalue)
        {
            foreach (HtmlNode candidate in nodes)
            {
                // Anything that is not an element can neither match nor contain a match.
                if (!(candidate is HtmlElement))
                    continue;

                HtmlElement element = (HtmlElement)candidate;
                bool nameMatches = element.Name == name;

                if (nameMatches && attrname == null)
                    return element;

                if (nameMatches)
                {
                    HtmlAttribute found = element.Attributes.FindByName(attrname);
                    if (found != null && found.Value == attrvalue)
                        return element;
                }

                // No match here: try this element's subtree before moving on.
                HtmlElement nested = FindOne(element.Nodes, name, attrname, attrvalue);
                if (nested != null)
                    return nested;
            }

            return null;
        }
Beispiel #11
0
		/// <summary>
		/// This will search through this collection of nodes for all elements that carry
		/// an attribute with the specified name and value. Both comparisons ignore case.
		/// Only the first attribute of an element whose name matches is examined.
		/// </summary>
		/// <param name="attributeName">The name of the attribute to look for</param>
		/// <param name="attributeValue">
		/// The value the attribute must have. A null value matches an attribute whose
		/// value is null.
		/// </param>
		/// <param name="searchChildren">True if you want to search sub-nodes, False to
		/// only search this collection.</param>
		/// <returns>A collection of all the nodes that match.</returns>
		public HtmlNodeCollection FindByAttributeNameValue(string attributeName,string attributeValue,bool searchChildren)
		{
			HtmlNodeCollection results = new HtmlNodeCollection(null);
			foreach( HtmlNode node in base.List )
			{
				HtmlElement element = node as HtmlElement;
				if( element == null )
				{
					continue;
				}

				foreach( HtmlAttribute attribute in element.Attributes )
				{
					// The static string.Equals tolerates null operands, so an attribute with a
					// null value no longer throws; ordinal comparison keeps the result
					// independent of the current culture.
					if( string.Equals( attribute.Name , attributeName , System.StringComparison.OrdinalIgnoreCase ) )
					{
						if( string.Equals( attribute.Value , attributeValue , System.StringComparison.OrdinalIgnoreCase ) )
						{
							results.Add( node );
						}
						// Stop at the first attribute with this name, matched or not.
						break;
					}
				}

				if( searchChildren )
				{
					foreach( HtmlNode matchedChild in element.Nodes.FindByAttributeNameValue( attributeName , attributeValue , searchChildren ) )
					{
						results.Add( matchedChild );
					}
				}
			}
			return results;
		}
Beispiel #12
0
		/// <summary>
		/// This will search through this collection of nodes for all elements with the
		/// specified name (ignoring case). If you want to search the subnodes recursively,
		/// you should pass True as the parameter in searchChildren. Each matching element
		/// is returned before any matches found among its own children.
		/// </summary>
		/// <param name="name">The name of the element to find</param>
		/// <param name="searchChildren">True if you want to search sub-nodes, False to
		/// only search this collection.</param>
		/// <returns>A collection of all the nodes that match.</returns>
		public HtmlNodeCollection FindByName(string name,bool searchChildren)
		{
			HtmlNodeCollection results = new HtmlNodeCollection(null);
			foreach( HtmlNode node in base.List )
			{
				HtmlElement element = node as HtmlElement;
				if( element == null )
				{
					continue;
				}

				// Element names are identifiers: compare ordinally so matching does not
				// depend on the current culture, and avoid lower-casing both strings per node.
				if( string.Equals( element.Name , name , System.StringComparison.OrdinalIgnoreCase ) )
				{
					results.Add( node );
				}

				if( searchChildren )
				{
					foreach( HtmlNode matchedChild in element.Nodes.FindByName( name , searchChildren ) )
					{
						results.Add( matchedChild );
					}
				}
			}
			return results;
		}