/// <summary>
/// Turns the flat, in-document-order list of parsed elements into a nested tree
/// rooted at a new <see cref="DOMContainer"/>.
/// </summary>
/// <remarks>
/// Closing tags are not stored in the tree; they only pop the open-tag stack.
/// Problems found along the way are recorded in the container's Warnings and
/// Errors lists. Tag names are compared case-insensitively throughout.
/// </remarks>
/// <param name="rawResults">
/// Flat list of elements. It is consumed: redundant closing tags of void elements
/// are removed from it and the list is empty when the method returns.
/// </param>
/// <returns>The container holding the resulting hierarchy.</returns>
private DOMContainer Raw2Hierarchy(List <DOMElement> rawResults)
        {
            // The container is both the root of the tree and the sink for warnings/errors
            DOMContainer domObject = new DOMContainer();

            // Innermost tag that is still open; new children are attached to it.
            // Invariant: currentParent is always the last entry of openTagStack.
            HTMLTag currentParent = domObject;

            // Stack of open tags. The root lives at index 0 and is never popped.
            List <HTMLTag> openTagStack = new List <HTMLTag>()
            {
                currentParent
            };

            // Running references to the active <form> and <select>, if any
            HTMLForm   currentForm   = null;
            HTMLSelect currentSelect = null;

            // Pre-processing - mark void elements like <IMG> and <BR> as self-closed and collect
            // their meaningless closing tags (</IMG>, </BR>) for removal
            List <DOMElement> elementsToRemove = new List <DOMElement>();

            foreach (DOMElement element in rawResults)
            {
                HTMLTag elementTag = element as HTMLTag;
                if (elementTag == null)
                {
                    continue;
                }

                // Invariant casing: a culture-sensitive ToUpper() maps "img" to "İMG" under
                // Turkish cultures, which would silently skip the cases below.
                switch (elementTag.TagName.ToUpperInvariant())
                {
                case "AREA":
                case "BASE":
                case "BR":
                case "COL":
                case "COMMAND":
                case "EMBED":
                case "HR":
                case "IMG":
                case "INPUT":
                case "KEYGEN":
                case "LINK":
                case "META":
                case "PARAM":
                case "SOURCE":
                case "TRACK":
                case "WBR":
                    if (elementTag.IsClosingTag)
                    {
                        // </IMG> and </BR> and so on are invalid closing tags...
                        domObject.Warnings.Add("Marking " + element + " to be deleted (REASON: Self-closed tag)");
                        elementsToRemove.Add(elementTag);
                    }
                    else if (!elementTag.SelfClosed)
                    {
                        // <IMG> and <BR> and so on are self-closed tags...
                        domObject.Warnings.Add("Marking " + element + " as self-closed (REASON: Self-closed tag)");
                        elementTag.SelfClosed = true;
                    }
                    break;
                }
            }

            // Remove the bad closing tags (done after the scan so the list is not
            // modified while it is being enumerated)
            foreach (DOMElement element in elementsToRemove)
            {
                Console.WriteLine("Removing " + element);
                rawResults.Remove(element);
            }

            // Walk the elements in order. Iterating (and clearing once at the end) avoids the
            // quadratic cost of repeatedly removing the first list entry.
            foreach (DOMElement nextElement in rawResults)
            {
                HTMLTag nextElementTag = nextElement as HTMLTag;

                if (nextElementTag == null)
                {
                    // Plain content goes into the current parent
                    currentParent.Children.Add(nextElement);
                    continue;
                }

                if (nextElementTag.IsClosingTag)
                {
                    // A closing </form> or </select> ends the running reference, whether or
                    // not the tag turns out to be properly nested
                    if (string.Equals(nextElementTag.TagName, "form", StringComparison.OrdinalIgnoreCase))
                    {
                        currentForm = null;
                    }
                    else if (string.Equals(nextElementTag.TagName, "select", StringComparison.OrdinalIgnoreCase))
                    {
                        currentSelect = null;
                    }

                    // Well-formed case: the closing tag matches the innermost open tag
                    // (<td><p></p></td> and not malformed HTML like <p><td></p></td>).
                    // Count > 1 keeps the root container on the stack no matter what.
                    bool closesCurrentParent =
                        (openTagStack.Count > 1) &&
                        string.Equals(nextElementTag.TagName, currentParent.TagName, StringComparison.OrdinalIgnoreCase);

                    if (closesCurrentParent)
                    {
                        // Pop the stack; the closing tag itself is not added to the tree
                        // because it can be inferred from the hierarchy
                        openTagStack.RemoveAt(openTagStack.Count - 1);
                        currentParent = openTagStack[openTagStack.Count - 1];
                    }
                    else
                    {
                        // Malformed HTML detected: look for a matching open tag further out,
                        // innermost first, skipping the root at index 0
                        bool foundStackSearchMatch = false;
                        for (int j = openTagStack.Count - 1; j >= 1; j--)
                        {
                            if (string.Equals(openTagStack[j].TagName, nextElementTag.TagName, StringComparison.OrdinalIgnoreCase))
                            {
                                domObject.Warnings.Add(nextElementTag + " was out of sequence. Current parent tag is " + currentParent + " but matching " + openTagStack[j] + " was found further outside.");
                                foundStackSearchMatch = true;

                                // Only the matched ancestor is taken off the stack;
                                // currentParent (the unclosed inner tag) stays as it is
                                openTagStack.RemoveAt(j);
                                break;
                            }
                        }

                        if (!foundStackSearchMatch)
                        {
                            // Stray closing tag - record it and drop it
                            domObject.Errors.Add(nextElementTag + " did not match up to any open tag! Position in HTML: " + nextElementTag.StartPosition);
                        }
                    }
                }
                else if (nextElementTag.SelfClosed)
                {
                    // Self-closed tag: becomes a child but never a parent
                    currentParent.Children.Add(nextElement);

                    // <input>s
                    if ((nextElement is HTMLInput) && (currentForm != null))
                    {
                        currentForm.Inputs.Add((HTMLInput)nextElement);
                        if (((HTMLInput)nextElement).Name != null)
                        {
                            currentForm.NamedInputs.Add((HTMLInput)nextElement);
                        }
                    }
                    // <option />s
                    else if ((nextElement is HTMLSelectOption) && (currentSelect != null))
                    {
                        currentSelect.Options.Add((HTMLSelectOption)nextElement);
                    }
                }
                else
                {
                    // Open tag: attach to the current parent, then make it the new parent
                    currentParent.Children.Add(nextElementTag);

                    // <select>s and <textarea>s
                    if ((nextElement is HTMLInput) && (currentForm != null))
                    {
                        currentForm.Inputs.Add((HTMLInput)nextElement);
                        if (((HTMLInput)nextElement).Name != null)
                        {
                            currentForm.NamedInputs.Add((HTMLInput)nextElement);
                        }
                    }
                    // <option>s
                    else if ((nextElement is HTMLSelectOption) && (currentSelect != null))
                    {
                        currentSelect.Options.Add((HTMLSelectOption)nextElement);
                    }

                    // Remember the active <select> for <option> association. This is tracked
                    // independently of any <form> so that options of a stand-alone <select>
                    // are associated as well.
                    if (nextElement is HTMLSelect)
                    {
                        currentSelect = (HTMLSelect)nextElement;
                    }

                    // Push onto the stack and make this the new currentParent
                    openTagStack.Add(nextElementTag);
                    currentParent = nextElementTag;

                    // Initialize children list
                    currentParent.Children = new List <DOMElement>();

                    // Update current form
                    if (currentParent is HTMLForm)
                    {
                        currentForm = (HTMLForm)currentParent;
                    }
                }
            }

            // Every element has been consumed; leave the input list empty for the caller
            rawResults.Clear();

            // Return final result
            return(domObject);
        }
Пример #2
0
        /// <summary>
        /// Parses one tag from the HTML buffer, starting at the '&lt;' found at
        /// <paramref name="startPosition"/>.
        /// </summary>
        /// <remarks>
        /// Uses and overwrites the <c>tagParserState</c> member while running.
        /// Tag and attribute names are re-cased according to <c>_transforms</c> once the
        /// end of the tag has been reached.
        /// </remarks>
        /// <param name="startPosition">Index of the opening '&lt;' in the HTML buffer.</param>
        /// <returns>
        /// A DOMElement rather than an HTMLTag, because an HTML comment is returned as
        /// HTMLContent. Otherwise the parsed tag (a form/input/select/option/textarea
        /// specific subtype where applicable). If the buffer ends before the tag is
        /// complete, the partially populated tag is returned.
        /// </returns>
        private DOMElement _ParseTag(int startPosition)
        {
            // Initialize new Tag and empty Attribute
            HTMLTag          tag = new HTMLTag(startPosition);
            HTMLTagAttribute currentAttribute = null;

            // Start looping through the HTML (skip 1 char since we're already at the '<')
            tagParserState = TagParserState.ExpectingTagName;
            int currentPosition = startPosition + 1;

            while (currentPosition < _HTML.Length)
            {
                char chr = _HTML[currentPosition];

                switch (tagParserState)
                {
                // Waiting for the tag name: an optional '/' and/or a name,
                // e.g. the "DIV" in <DIV ATTRIBUTE="FOO"> or the "/DIV>" in </DIV>
                case TagParserState.ExpectingTagName:
                {
                    if (isAlphaNumericChar(chr))
                    {
                        // Start of the tag name - read the whole word
                        tag.TagName = _readAlphaNumericWord(currentPosition);

                        if (tag.TagName.StartsWith("!--", StringComparison.Ordinal))
                        {
                            // HTML comment - returned as content, not as a tag
                            HTMLContent comment = new HTMLContent(startPosition, _readUntil(startPosition, "-->"));
                            return(comment);
                        }

                        // Swap in a specialized tag type where one exists. Invariant casing:
                        // a culture-sensitive ToLower() turns "INPUT" into "ınput" under
                        // Turkish cultures and the case labels would never match.
                        switch (tag.TagName.ToLowerInvariant())
                        {
                        case "form":
                            tag = new HTMLForm(tag.StartPosition)
                            {
                                TagName = tag.TagName
                            };
                            break;

                        case "input":
                            tag = new HTMLInput(tag.StartPosition)
                            {
                                TagName = tag.TagName
                            };
                            break;

                        case "select":
                            tag = new HTMLSelect(tag.StartPosition)
                            {
                                TagName = tag.TagName
                            };
                            break;

                        case "option":
                            tag = new HTMLSelectOption(tag.StartPosition)
                            {
                                TagName = tag.TagName
                            };
                            break;

                        case "textarea":
                            tag = new HTMLTextarea(tag.StartPosition)
                            {
                                TagName = tag.TagName
                            };
                            break;
                        }

                        // Advance position by name length
                        currentPosition += tag.TagName.Length;
                        tagParserState   = TagParserState.ExpectingTagContentsOrEnd;
                    }
                    else if (chr == '/')
                    {
                        // This is a closing tag like </div> - read the tag name and close it
                        tag.IsClosingTag = true;

                        // Advance to the start of the tag name and read it
                        currentPosition  = this._indexOfNextNonWhitespaceChar(currentPosition + 1);
                        tag.TagName      = _readAlphaNumericWord(currentPosition);
                        currentPosition += tag.TagName.Length;

                        // Advance to end of tag '>'
                        currentPosition += _readUntil(currentPosition, '>').Length - 1;
                        tagParserState   = TagParserState.TagEnded;
                    }
                    else
                    {
                        // Neither a name character nor '/': skip it. Without this step the
                        // position never changes and the loop spins forever on input such
                        // as "< div>".
                        currentPosition++;
                    }
                }
                break;

                // Inside the tag: expecting an attribute, a '/' self-closing flag,
                // or the closing '>'
                case TagParserState.ExpectingTagContentsOrEnd:

                    // Advance to the next non-whitespace char
                    currentPosition = _indexOfNextNonWhitespaceChar(currentPosition);

                    // NOTE(review): the helper's end-of-input result is not visible here;
                    // guard both plausible conventions (negative / past the end) and hand
                    // back the unfinished tag, like the fall-through at the bottom does.
                    if ((currentPosition < 0) || (currentPosition >= _HTML.Length))
                    {
                        return(tag);
                    }
                    chr = _HTML[currentPosition];

                    if (chr == '/')
                    {
                        // Self-closing tag, e.g. the "/>" of <IMG />
                        tag.SelfClosed = true;

                        // Advance to end of tag '>'
                        currentPosition += _readUntil(currentPosition, '>').Length - 1;
                        tagParserState   = TagParserState.TagEnded;
                    }
                    else if (chr == '>')
                    {
                        // End of tag, e.g. the '>' of <DIV>
                        tagParserState = TagParserState.TagEnded;
                    }
                    else if ((chr == '"') || (chr == '\''))
                    {
                        // Unnamed, quoted attribute value, like a DOCTYPE dtd path <!DOCTYPE html "blah blah">

                        // Read the quoted value
                        string attributeValue = _readValue(currentPosition);

                        // Build a new attribute
                        currentAttribute = new HTMLTagAttribute(currentPosition, null, attributeValue, chr.ToString());

                        // Advance the position; always move at least one char so an empty
                        // result cannot stall the loop on the same quote
                        currentPosition += Math.Max(1, attributeValue.Length);

                        // Finish the attribute and clear it
                        currentAttribute.EndPosition = currentPosition;
                        tag.Attributes.Add(currentAttribute);
                        currentAttribute = null;
                    }
                    else if (isAlphaChar(chr))
                    {
                        // Start of an attribute name, e.g. ATTRIBUTE and ATTR in
                        // <DIV ATTRIBUTE="FOO" ATTR = 'BAR'>
                        string attributeName = _readAlphaNumericWord(currentPosition);
                        currentAttribute = new HTMLTagAttribute(currentPosition, attributeName);

                        // Advance position to the end of the name
                        currentPosition += attributeName.Length;

                        // Do we have an attribute value?
                        int  nextNonWhitespaceChar = _indexOfNextNonWhitespaceChar(currentPosition);
                        bool hasValue =
                            (nextNonWhitespaceChar >= 0) &&
                            (nextNonWhitespaceChar < _HTML.Length) &&
                            (_HTML[nextNonWhitespaceChar] == '=');

                        if (hasValue)
                        {
                            currentPosition = nextNonWhitespaceChar + 1;

                            // Skip whitespace after '=' (space-separated values like
                            // 'foo = "bar"') and read the value from there. Previously the
                            // index was computed but the value was still read from the
                            // position right after the '='.
                            nextNonWhitespaceChar = _indexOfNextNonWhitespaceChar(currentPosition);
                            if ((nextNonWhitespaceChar >= 0) && (nextNonWhitespaceChar < _HTML.Length))
                            {
                                currentPosition = nextNonWhitespaceChar;
                            }

                            string rawAttributeValue = _readValue(currentPosition);
                            currentAttribute.Value = rawAttributeValue;

                            // Advance position to end of the value
                            currentPosition += rawAttributeValue.Length;
                        }
                        // else: a standalone attribute, like the "html" in <!DOCTYPE html "foobar">

                        // End of attribute - mark the end position and add to the tag
                        currentAttribute.EndPosition = currentPosition;
                        tag.Attributes.Add(currentAttribute);

                        // Reset attribute
                        currentAttribute = null;
                    }
                    else
                    {
                        // Unexpected character inside the tag (digit, '=', '#', ...): skip it
                        // so the parser keeps moving instead of looping on it forever
                        currentPosition++;
                    }
                    break;
                }

                // End the tag?
                if (tagParserState == TagParserState.TagEnded)
                {
                    // Apply name transformations (culture-invariant, so the result does not
                    // depend on the machine's locale)
                    if (_transforms == Transformations.LowercaseNames)
                    {
                        tag.TagName = tag.TagName.ToLowerInvariant();
                        foreach (HTMLTagAttribute attr in tag.Attributes)
                        {
                            if (attr.Name != null)
                            {
                                attr.Name = attr.Name.ToLowerInvariant();
                            }
                        }
                    }
                    else if (_transforms == Transformations.UppercaseNames)
                    {
                        tag.TagName = tag.TagName.ToUpperInvariant();
                        foreach (HTMLTagAttribute attr in tag.Attributes)
                        {
                            if (attr.Name != null)
                            {
                                attr.Name = attr.Name.ToUpperInvariant();
                            }
                        }
                    }

                    // Remove empty attributes list
                    if (tag.Attributes.Count == 0)
                    {
                        tag.Attributes = null;
                    }

                    // Mark the end position of the tag and return it
                    tag.MarkEndPosition(currentPosition);
                    return(tag);
                }
            }

            // Ran out of input before the tag was closed - return what was parsed so far
            return(tag);
        }
Пример #3
0
 /// <summary>
 /// Entry point that simply forwards to <c>HTMLForm.runQueuedEvent()</c>.
 /// </summary>
 public void run() => HTMLForm.runQueuedEvent();