Пример #1
0
        private HTMLParser(string HTML, bool CaptureWhitespace, Transformations transforms, bool processRaw)
        {
            _HTML = HTML;
            _CaptureWhitespace = CaptureWhitespace;
            _transforms        = transforms;

            // Parse into raw results first
            RawResults = _ParseRaw();

            // Now process raw results into a hierarchy
            if (processRaw)
            {
                Results = Raw2Hierarchy(RawResults);
            }
        }
        private DOMContainer Raw2Hierarchy(List <DOMElement> rawResults)
        {
            // Create a DOM container
            DOMContainer domObject = new DOMContainer();

            // Define the current parent element
            HTMLTag currentParent = domObject;

            // Define our starting stack
            List <HTMLTag> openTagStack = new List <HTMLTag>()
            {
                currentParent
            };

            // Keep a running reference to any active <form> tags and <select> tags
            HTMLForm   currentForm   = null;
            HTMLSelect currentSelect = null;

            // Pre-processing - find and update self-closing tags like <IMG> and <BR> and eliminate any
            // unnecessary closing tags like </IMG> or </BR>
            List <DOMElement> elementsToRemove = new List <DOMElement>();

            foreach (DOMElement element in rawResults)
            {
                if (element is HTMLTag)
                {
                    HTMLTag elementTag     = (HTMLTag)element;
                    string  elementTagName = elementTag.TagName.ToUpper();
                    switch (elementTagName)
                    {
                    case "AREA":
                    case "BASE":
                    case "BR":
                    case "COL":
                    case "COMMAND":
                    case "EMBED":
                    case "HR":
                    case "IMG":
                    case "INPUT":
                    case "KEYGEN":
                    case "LINK":
                    case "META":
                    case "PARAM":
                    case "SOURCE":
                    case "TRACK":
                    case "WBR":
                        if (elementTag.IsClosingTag)
                        {
                            // </IMG> and </BR> and so on are invalid closing tags...
                            domObject.Warnings.Add("Marking " + element + " to be deleted (REASON: Self-closed tag)");
                            elementsToRemove.Add(elementTag);
                        }
                        else if (!elementTag.SelfClosed)
                        {
                            // <IMG> and <BR> and so on are self-closed tags...
                            domObject.Warnings.Add("Marking " + element + " as self-closed (REASON: Self-closed tag)");
                            elementTag.SelfClosed = true;
                        }
                        break;
                    }
                }
            }

            // Remove bad tags
            if (elementsToRemove.Count > 0)
            {
                foreach (DOMElement element in elementsToRemove)
                {
                    Console.WriteLine("Removing " + element);
                    rawResults.Remove(element);
                }
            }

            List <string> domErrors   = new List <string>();
            List <string> domWarnings = new List <string>();

            while (rawResults.Count > 0)
            {
                // Shift off the beginning
                DOMElement nextElement = rawResults[0];
                rawResults.RemoveAt(0);

                // string indent = "".PadLeft(openTagStack.Count,'\t');

                // If it's an opening tag, let's update the parentElement
                if (nextElement is HTMLTag)
                {
                    // Cast once
                    HTMLTag nextElementTag = (HTMLTag)nextElement;

                    // Console.Write(indent + nextElementTag + ": ");

                    // If it's an opening tag
                    if (nextElementTag.IsClosingTag)
                    {
                        // Closing tag - try to match to parent
                        // Console.Write("Closing tag, trying to match to current parent " + currentParent + "... ");

                        // If this is a closing </form>, then null out currentForm
                        if (nextElementTag.TagName == "form")
                        {
                            currentForm = null;
                        }
                        // Else If this is a closing </select>, then null out currentSelect
                        else if (nextElementTag.TagName == "select")
                        {
                            currentSelect = null;
                        }

                        // Check to see if the current parent matches the current element (<td><p></p></td> and not malformed HTML like <p><td></p></td>)
                        if (nextElementTag.TagName == currentParent.TagName)
                        {
                            // Mark current parent as successfully closing
                            // currentParent.Closes = true;

                            // Closing tag - pop the stack
                            openTagStack.Remove(currentParent);
                            currentParent = openTagStack.Last();

                            // Console.WriteLine("Match - popped the stack and adding to end of new currentParent " + currentParent + ".");

                            // REMOVED - So the hierarchy only lists the open node and the closed can be
                            //           inferred from the hierarchy.
                            // Move to current parent
                            // currentParent.Children.Add(nextElement);
                        }
                        else
                        {
                            // Console.WriteLine("Not a match - searching stack for a match...");

                            // Malformed HTML detected, try to find a matching open parent from the bottom to the top
                            bool foundStackSearchMatch = false;
                            for (int j = openTagStack.Count - 1; j >= 0; j--)
                            {
                                // Found it!
                                // Console.Write(indent + "  " + nextElementTag + " == " + openTagStack[j] + " ? ");
                                if (openTagStack[j].TagName == nextElementTag.TagName)
                                {
                                    domObject.Warnings.Add(nextElementTag + " was out of sequence. Current parent tag is " + currentParent + " but matching " + openTagStack[j] + " was found further outside.");

                                    // Console.WriteLine("Match! Moving to its parent.");
                                    foundStackSearchMatch = true;

                                    // REMOVED - See above reason.
                                    // Add to parent
                                    // openTagStack[j - 1].Children.Add(nextElement);
                                    // openTagStack[j - 1].Closes = true;

                                    // Remove that element from the open stack
                                    openTagStack.RemoveAt(j);
                                    break;
                                }
                            }
                            if (!foundStackSearchMatch)
                            {
                                // Uh-oh.... add it to the current parent
                                domObject.Errors.Add(nextElementTag + " did not match up to any open tag! Position in HTML: " + nextElementTag.StartPosition);
                                // currentParent.Children.Add(nextElement);
                                // Console.Write(indent + "  No matches found - adding to the currentParent " + currentParent);
                            }
                        }
                    }
                    else if (nextElementTag.SelfClosed)
                    {
                        // Self-closed tag
                        // Console.WriteLine("Self-closed tag, adding to current parent " + currentParent + "");

                        // Move to current parent
                        currentParent.Children.Add(nextElement);

                        // <input>s
                        if ((nextElement is HTMLInput) && (currentForm != null))
                        {
                            currentForm.Inputs.Add((HTMLInput)nextElement);
                            if (((HTMLInput)nextElement).Name != null)
                            {
                                currentForm.NamedInputs.Add((HTMLInput)nextElement);
                            }
                        }

                        // <option />s
                        else if ((nextElement is HTMLSelectOption) && (currentSelect != null))
                        {
                            currentSelect.Options.Add((HTMLSelectOption)nextElement);
                        }
                    }
                    else
                    {
                        // Open tag - push onto the stack
                        // Console.WriteLine("Open tag, adding to currentParent " + currentParent + ", adding to stack, and setting as new currentParent.");

                        // Move to current parent
                        currentParent.Children.Add(nextElementTag);

                        // <select>s and <textarea>s
                        if ((nextElement is HTMLInput) && (currentForm != null))
                        {
                            currentForm.Inputs.Add((HTMLInput)nextElement);
                            if (((HTMLInput)nextElement).Name != null)
                            {
                                currentForm.NamedInputs.Add((HTMLInput)nextElement);
                            }

                            // Indicate we're in a <select> (for easier <option> association)
                            if (nextElement is HTMLSelect)
                            {
                                currentSelect = (HTMLSelect)nextElement;
                            }
                        }
                        // <option />s
                        else if ((nextElement is HTMLSelectOption) && (currentSelect != null))
                        {
                            currentSelect.Options.Add((HTMLSelectOption)nextElement);
                        }

                        // Make this the new currentParent
                        openTagStack.Add(nextElementTag);
                        currentParent = nextElementTag;

                        // Initialize children list
                        currentParent.Children = new List <DOMElement>();

                        // Update current form
                        if (currentParent is HTMLForm)
                        {
                            currentForm = (HTMLForm)currentParent;
                        }
                    }
                }
                else
                {
                    // Content goes into current parent
                    currentParent.Children.Add(nextElement);
                }
            }

            // Return final result
            return(domObject);
        }