Beispiel #1
0
        /// <summary>
        /// Parses a string containing HTML and produces a collection of nodes
        /// (elements and text) representing it.
        /// </summary>
        /// <remarks>
        /// The input is passed through PreprocessScript (for "script" and "style"),
        /// RemoveComments and RemoveSGMLComments, and is then split into tokens by
        /// GetTokens. The token stream is walked once:
        /// "&lt;" starts an open tag (name followed by attributes), "&lt;/" starts a
        /// close tag, a stray "&gt;" is skipped, and anything else is treated as text.
        /// Malformed or truncated input is tolerated: parsing simply stops or skips
        /// ahead rather than throwing from this method's own logic.
        /// </remarks>
        /// <param name="html">The HTML to be parsed</param>
        /// <returns>The collection of nodes built from the HTML</returns>
        public HtmlNodeCollection Parse(string html)
        {
            HtmlNodeCollection nodes = new HtmlNodeCollection(null);

            html = PreprocessScript(html, "script");
            html = PreprocessScript(html, "style");

            html = RemoveComments(html);
            html = RemoveSGMLComments(html);
            StringCollection tokens = GetTokens(html);

            int index = 0;

            // The element whose open tag was read most recently and has not been
            // followed by "/>" or a close tag. Only consulted to decide whether the
            // text that follows should be entity-decoded (see NoEscaping below).
            HtmlElement element = null;

            while (index < tokens.Count)
            {
                string token = tokens[index];

                if ("<".Equals(token))
                {
                    // Read open tag: "<" name (attribute ("=" value)?)* (">" | "/>")
                    index++;
                    if (index >= tokens.Count)
                    {
                        // Input ended right after "<"; nothing more to parse.
                        break;
                    }
                    string tagName = tokens[index];
                    index++;
                    element = new HtmlElement(tagName);

                    // Read the attributes and values up to the end of the tag.
                    while (index < tokens.Count && !">".Equals(tokens[index]) && !"/>".Equals(tokens[index]))
                    {
                        string attributeName = tokens[index];
                        index++;
                        if (index < tokens.Count && "=".Equals(tokens[index]))
                        {
                            index++;

                            // The value is null when "=" was the last token of the input.
                            // NOTE(review): confirm HtmlEncoder.DecodeValue accepts null.
                            string attributeValue = index < tokens.Count ? tokens[index] : null;
                            index++;
                            element.Attributes.Add(new HtmlAttribute(attributeName, HtmlEncoder.DecodeValue(attributeValue)));
                        }
                        else
                        {
                            // Null-value attribute. This is also reached when the attribute
                            // name is the very last token, so a trailing valueless attribute
                            // on truncated input is kept instead of being silently dropped.
                            element.Attributes.Add(new HtmlAttribute(attributeName, null));
                        }
                    }

                    nodes.Add(element);

                    if (index < tokens.Count && "/>".Equals(tokens[index]))
                    {
                        // Self-closed tag: no following text belongs to it.
                        element.IsTerminated = true;
                        index++;
                        element = null;
                    }
                    else if (index < tokens.Count && ">".Equals(tokens[index]))
                    {
                        index++;
                    }
                }
                else if (">".Equals(token))
                {
                    // Stray ">" outside of any tag: skip it.
                    index++;
                }
                else if ("</".Equals(token))
                {
                    // Read close tag
                    index++;
                    if (index >= tokens.Count)
                    {
                        // Input ended right after "</"; nothing more to parse.
                        break;
                    }
                    string tagName = tokens[index];
                    index++;

                    // A close tag without a matching open tag (-1) is ignored.
                    int openIndex = FindTagOpenNodeIndex(nodes, tagName);
                    if (openIndex != -1)
                    {
                        // Hand the nodes that follow the matching open tag to
                        // MoveNodesDown together with that element.
                        MoveNodesDown(ref nodes, openIndex + 1, (HtmlElement)nodes[openIndex]);
                    }

                    // Skip to the end of this tag, consuming the ">" if present.
                    while (index < tokens.Count && !">".Equals(tokens[index]))
                    {
                        index++;
                    }
                    if (index < tokens.Count)
                    {
                        index++;
                    }

                    element = null;
                }
                else
                {
                    // Read text
                    string text = token;
                    if (mRemoveEmptyElementText)
                    {
                        text = RemoveWhitespace(text);
                    }
                    text = DecodeScript(text);

                    // When empty text is being removed, text that came out empty
                    // produces no node at all.
                    bool skipText = mRemoveEmptyElementText && text.Length == 0;
                    if (!skipText)
                    {
                        // Text directly following an element flagged NoEscaping is kept
                        // verbatim; everything else goes through HtmlEncoder.DecodeValue.
                        if (element == null || !element.NoEscaping)
                        {
                            text = HtmlEncoder.DecodeValue(text);
                        }
                        nodes.Add(new HtmlText(text));
                    }
                    index++;
                }
            }

            return nodes;
        }
Beispiel #2
0
		/// <summary>
		/// Parses a string containing HTML and produces a collection of nodes
		/// (elements and text) representing it.
		/// </summary>
		/// <remarks>
		/// Before tokenising, the input goes through PreprocessScript (for "script"
		/// and "style"), RemoveComments and RemoveSGMLComments. GetTokens then yields
		/// the token list, which is consumed in a single pass: "&lt;" begins an open
		/// tag, "&lt;/" begins a close tag, a lone "&gt;" is skipped and every other
		/// token is handled as text. Truncated input ends the pass early instead of
		/// raising an error from this method's own logic.
		/// </remarks>
		/// <param name="html">The HTML to be parsed</param>
		/// <returns>The collection of nodes built from the HTML</returns>
		public HtmlNodeCollection Parse(string html)
		{
			HtmlNodeCollection nodes = new HtmlNodeCollection( null );

			html = PreprocessScript( html , "script" );
			html = PreprocessScript( html , "style" );

			html = RemoveComments( html );
			html = RemoveSGMLComments( html );
			StringCollection tokens = GetTokens( html );

			int index = 0;

			// Element from the most recent open tag that has been neither self-closed
			// nor followed by a close tag; used only for the NoEscaping check on text.
			HtmlElement element = null;

			while( index < tokens.Count )
			{
				string current = tokens[ index ];

				if( "<".Equals( current ) )
				{
					// Read open tag
					index++;
					if( index >= tokens.Count )
					{
						// Nothing follows the "<": stop parsing.
						break;
					}
					string tagName = tokens[ index ];
					index++;
					element = new HtmlElement( tagName );

					// Read the attributes and values until the tag is closed.
					while( index < tokens.Count && ! ">".Equals( tokens[ index ] ) && ! "/>".Equals( tokens[ index ] ) )
					{
						string attributeName = tokens[ index ];
						index++;

						bool hasEquals = index < tokens.Count && "=".Equals( tokens[ index ] );
						if( hasEquals )
						{
							index++;

							// No token after "=" means the input was cut off; the raw value is null.
							// NOTE(review): confirm HtmlEncoder.DecodeValue accepts null.
							string attributeValue = null;
							if( index < tokens.Count )
							{
								attributeValue = tokens[ index ];
							}
							index++;

							HtmlAttribute attribute = new HtmlAttribute( attributeName , HtmlEncoder.DecodeValue( attributeValue ) );
							element.Attributes.Add( attribute );
						}
						else
						{
							// Null-value attribute. Taken even when the name was the final
							// token, so a valueless attribute at the end of truncated input
							// is recorded rather than lost.
							HtmlAttribute attribute = new HtmlAttribute( attributeName , null );
							element.Attributes.Add( attribute );
						}
					}

					nodes.Add( element );

					if( index < tokens.Count && "/>".Equals( tokens[ index ] ) )
					{
						// Self-closed tag: following text does not belong to it.
						element.IsTerminated = true;
						index++;
						element = null;
					}
					else if( index < tokens.Count && ">".Equals( tokens[ index ] ) )
					{
						index++;
					}
				}
				else if( ">".Equals( current ) )
				{
					// Unmatched ">" between tags: ignore it.
					index++;
				}
				else if( "</".Equals( current ) )
				{
					// Read close tag
					index++;
					if( index >= tokens.Count )
					{
						// Nothing follows the "</": stop parsing.
						break;
					}
					string tagName = tokens[ index ];
					index++;

					// -1 means there is a close tag without an opening tag; it is ignored.
					int openIndex = FindTagOpenNodeIndex( nodes , tagName );
					if( openIndex != -1 )
					{
						// Pass the nodes after the matching open tag, and the open
						// element itself, to MoveNodesDown.
						MoveNodesDown( ref nodes , openIndex + 1 , (HtmlElement)nodes[ openIndex ] );
					}

					// Skip to the end of this tag and step over the ">" if there is one.
					while( index < tokens.Count && ! ">".Equals( tokens[ index ] ) )
					{
						index++;
					}
					if( index < tokens.Count )
					{
						index++;
					}

					element = null;
				}
				else
				{
					// Read text
					string text = current;
					if( mRemoveEmptyElementText )
					{
						text = RemoveWhitespace( text );
					}
					text = DecodeScript( text );

					// With empty-text removal enabled, empty text yields no node.
					if( ! mRemoveEmptyElementText || text.Length != 0 )
					{
						// Text right after an element flagged NoEscaping stays as-is;
						// all other text is run through HtmlEncoder.DecodeValue.
						bool keepRaw = element != null && element.NoEscaping;
						if( ! keepRaw )
						{
							text = HtmlEncoder.DecodeValue( text );
						}
						nodes.Add( new HtmlText( text ) );
					}
					index++;
				}
			}

			return nodes;
		}