// Names of tags that never have content or a closing tag (e.g. <img>, <br>).
// Compared case-insensitively, because HTML tag names are case-insensitive.
private static readonly HashSet<string> _voidTagNames = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
    "AREA", "BASE", "BR", "COL", "COMMAND", "EMBED", "HR", "IMG",
    "INPUT", "KEYGEN", "LINK", "META", "PARAM", "SOURCE", "TRACK", "WBR"
};

/// <summary>
/// Converts the flat, document-ordered list of parsed elements into a nested hierarchy
/// rooted at a newly created DOMContainer.
/// </summary>
/// <remarks>
/// Works in two passes. The first pass marks void tags (such as &lt;img&gt; or &lt;br&gt;) as
/// self-closed and drops their invalid closing counterparts (&lt;/img&gt;, &lt;/br&gt;), recording a
/// warning for each. The second pass walks the remaining elements while maintaining a stack of
/// currently open tags: opening tags are pushed, closing tags pop their matching opener, and
/// everything else is appended to the children of the innermost open tag. Closing tags are not
/// stored in the hierarchy. Inputs are registered with the enclosing form and options with the
/// enclosing select as they are encountered.
/// </remarks>
/// <param name="rawResults">
/// Flat list of parsed elements in document order. The list is consumed: it is empty when the
/// method returns.
/// </param>
/// <returns>The root container holding the hierarchy plus any warnings and errors recorded.</returns>
private DOMContainer Raw2Hierarchy(List<DOMElement> rawResults)
{
    DOMContainer domObject = new DOMContainer();

    // Pass 1: normalise void tags and discard their meaningless closing tags.
    _normalizeVoidTags(rawResults, domObject);

    // Pass 2: build the hierarchy. Index 0 of the stack is always the root container.
    HTMLTag currentParent = domObject;
    List<HTMLTag> openTagStack = new List<HTMLTag>() { currentParent };

    // Running references to the innermost active <form> and <select>.
    HTMLForm currentForm = null;
    HTMLSelect currentSelect = null;

    foreach (DOMElement nextElement in rawResults)
    {
        HTMLTag nextTag = nextElement as HTMLTag;

        // Plain content simply goes into the current parent.
        if (nextTag == null)
        {
            currentParent.Children.Add(nextElement);
            continue;
        }

        if (nextTag.IsClosingTag)
        {
            // Leaving a </form> or </select> ends the corresponding association scope.
            if (_tagNamesMatch(nextTag.TagName, "form"))
            {
                currentForm = null;
            }
            else if (_tagNamesMatch(nextTag.TagName, "select"))
            {
                currentSelect = null;
            }

            // Well-formed case: the closing tag matches the innermost open tag, so pop it.
            // The root container (stack index 0) is never popped.
            if ((openTagStack.Count > 1) && _tagNamesMatch(nextTag.TagName, currentParent.TagName))
            {
                openTagStack.RemoveAt(openTagStack.Count - 1);
                currentParent = openTagStack[openTagStack.Count - 1];
                continue;
            }

            // Malformed HTML (e.g. <p><td></p></td>): look outward through the open tags for
            // a matching opener, innermost first, never considering the root container.
            int matchIndex = -1;
            for (int j = openTagStack.Count - 1; j >= 1; j--)
            {
                if (_tagNamesMatch(openTagStack[j].TagName, nextTag.TagName))
                {
                    matchIndex = j;
                    break;
                }
            }

            if (matchIndex >= 0)
            {
                domObject.Warnings.Add(nextTag + " was out of sequence. Current parent tag is " + currentParent + " but matching " + openTagStack[matchIndex] + " was found further outside.");

                // Only the matched opener leaves the stack; the current parent stays open.
                openTagStack.RemoveAt(matchIndex);
            }
            else
            {
                domObject.Errors.Add(nextTag + " did not match up to any open tag! Position in HTML: " + nextTag.StartPosition);
            }

            continue;
        }

        // Opening or self-closed tag: attach to the current parent and register it with the
        // surrounding form/select where applicable.
        currentParent.Children.Add(nextElement);
        _associateFormControl(nextElement, currentForm, currentSelect);

        if (nextTag.SelfClosed)
        {
            continue;
        }

        // Track the active <select> whether or not it sits inside a <form>, so that its
        // <option>s are always associated with it.
        HTMLSelect openedSelect = nextElement as HTMLSelect;
        if (openedSelect != null)
        {
            currentSelect = openedSelect;
        }

        // Push the tag and make it the new parent with a fresh children list.
        openTagStack.Add(nextTag);
        currentParent = nextTag;
        currentParent.Children = new List<DOMElement>();

        HTMLForm openedForm = currentParent as HTMLForm;
        if (openedForm != null)
        {
            currentForm = openedForm;
        }
    }

    // Preserve the consuming contract: the input list is empty once processing is done.
    rawResults.Clear();

    return domObject;
}

// Marks void tags as self-closed and removes their invalid closing tags from rawResults,
// recording a warning on domObject for every change.
private static void _normalizeVoidTags(List<DOMElement> rawResults, DOMContainer domObject)
{
    HashSet<DOMElement> elementsToRemove = new HashSet<DOMElement>();

    foreach (DOMElement element in rawResults)
    {
        HTMLTag elementTag = element as HTMLTag;
        if ((elementTag == null) || (elementTag.TagName == null) || !_voidTagNames.Contains(elementTag.TagName))
        {
            continue;
        }

        if (elementTag.IsClosingTag)
        {
            // </IMG> and </BR> and so on are invalid closing tags...
            domObject.Warnings.Add("Marking " + element + " to be deleted (REASON: Self-closed tag)");
            elementsToRemove.Add(elementTag);
        }
        else if (!elementTag.SelfClosed)
        {
            // <IMG> and <BR> and so on are self-closed tags...
            domObject.Warnings.Add("Marking " + element + " as self-closed (REASON: Self-closed tag)");
            elementTag.SelfClosed = true;
        }
    }

    // Single sweep instead of one linear List.Remove per discarded element.
    if (elementsToRemove.Count > 0)
    {
        rawResults.RemoveAll(elementsToRemove.Contains);
    }
}

// Compares two tag names the way HTML does: ignoring case and independent of culture.
private static bool _tagNamesMatch(string left, string right)
{
    return string.Equals(left, right, StringComparison.OrdinalIgnoreCase);
}

// Registers an input with the active form (also as a named input when it has a name), or
// otherwise an option with the active select. Does nothing when no matching scope is active.
private static void _associateFormControl(DOMElement element, HTMLForm currentForm, HTMLSelect currentSelect)
{
    HTMLInput input = element as HTMLInput;
    if ((input != null) && (currentForm != null))
    {
        currentForm.Inputs.Add(input);
        if (input.Name != null)
        {
            currentForm.NamedInputs.Add(input);
        }
        return;
    }

    HTMLSelectOption option = element as HTMLSelectOption;
    if ((option != null) && (currentSelect != null))
    {
        currentSelect.Options.Add(option);
    }
}
/// <summary>
/// Parses the tag whose opening '&lt;' is located at <paramref name="startPosition"/> in the
/// HTML being parsed.
/// </summary>
/// <remarks>
/// Drives a small state machine (tag name, then attributes/self-close flag, then end of tag).
/// Tags named form, input, select, option and textarea are returned as their specialised tag
/// types. Characters that do not fit the current state are skipped so that malformed markup
/// cannot stall the parser.
/// </remarks>
/// <param name="startPosition">Index of the tag's opening '&lt;' character.</param>
/// <returns>
/// The parsed tag, or an HTMLContent when the tag is an HTML comment - which is why the return
/// type is DOMElement rather than a tag type. If the input ends before the tag does, the
/// partially populated tag is returned without an end position.
/// </returns>
private DOMElement _ParseTag(int startPosition)
{
    HTMLTag tag = new HTMLTag(startPosition);

    // Start right after the '<' we are already positioned on.
    tagParserState = TagParserState.ExpectingTagName;
    int currentPosition = startPosition + 1;

    while (currentPosition < _HTML.Length)
    {
        char chr = _HTML[currentPosition];

        switch (tagParserState)
        {
            /*
             * Looking for an optional '/' and the tag name.
             * MATCHES:  <DIV ATTRIBUTE="FOO" ATTR = 'BAR'>  or  </DIV>
             *            ^^^                                     ^^^^
             */
            case TagParserState.ExpectingTagName:
                if (isAlphaNumericChar(chr))
                {
                    tag.TagName = _readAlphaNumericWord(currentPosition);

                    if (tag.TagName.StartsWith("!--", StringComparison.Ordinal))
                    {
                        // HTML comment - returned as content, not as a tag.
                        return new HTMLContent(startPosition, _readUntil(startPosition, "-->"));
                    }

                    // Swap in a specialised tag type where one exists. Invariant casing keeps
                    // the lookup independent of the current culture.
                    switch (tag.TagName.ToLowerInvariant())
                    {
                        case "form":
                            tag = new HTMLForm(tag.StartPosition) { TagName = tag.TagName };
                            break;
                        case "input":
                            tag = new HTMLInput(tag.StartPosition) { TagName = tag.TagName };
                            break;
                        case "select":
                            tag = new HTMLSelect(tag.StartPosition) { TagName = tag.TagName };
                            break;
                        case "option":
                            tag = new HTMLSelectOption(tag.StartPosition) { TagName = tag.TagName };
                            break;
                        case "textarea":
                            tag = new HTMLTextarea(tag.StartPosition) { TagName = tag.TagName };
                            break;
                    }

                    // Advance position by name length
                    currentPosition += tag.TagName.Length;
                    tagParserState = TagParserState.ExpectingTagContentsOrEnd;
                }
                else if (chr == '/')
                {
                    // Closing tag like </div> - read the tag name and jump to its '>'.
                    tag.IsClosingTag = true;

                    currentPosition = this._indexOfNextNonWhitespaceChar(currentPosition + 1);
                    tag.TagName = _readAlphaNumericWord(currentPosition);
                    currentPosition += tag.TagName.Length;

                    // NOTE(review): the "- 1" presumes _readUntil's result includes the
                    // terminating '>' - confirm against the helper.
                    currentPosition += _readUntil(currentPosition, '>').Length - 1;
                    tagParserState = TagParserState.TagEnded;
                }
                else if (chr == '>')
                {
                    // Nameless tag such as "<>" or "<  >": end it here with an empty name
                    // rather than consuming the text that follows as a tag name.
                    if (tag.TagName == null)
                    {
                        tag.TagName = string.Empty;
                    }
                    tagParserState = TagParserState.TagEnded;
                }
                else
                {
                    // Anything else (e.g. whitespace after '<') is skipped; without this the
                    // position would never advance and the loop would never terminate.
                    currentPosition++;
                }
                break;

            /*
             * Inside the tag: an attribute, a '/' self-closing flag, or the closing '>'.
             */
            case TagParserState.ExpectingTagContentsOrEnd:
            {
                // Advance to the next non-whitespace char
                int contentPosition = _indexOfNextNonWhitespaceChar(currentPosition);
                if ((contentPosition < 0) || (contentPosition >= _HTML.Length))
                {
                    // Input ended inside the tag - same outcome as running off the loop.
                    // NOTE(review): assumes the helper signals "none found" with an
                    // out-of-range index - confirm.
                    return tag;
                }
                currentPosition = contentPosition;
                chr = _HTML[currentPosition];

                if (chr == '/')
                {
                    // Self-closing tag.  MATCHES: <IMG />
                    tag.SelfClosed = true;

                    // Advance to end of tag '>'
                    currentPosition += _readUntil(currentPosition, '>').Length - 1;
                    tagParserState = TagParserState.TagEnded;
                }
                else if (chr == '>')
                {
                    // End of tag.  MATCHES: <DIV>
                    tagParserState = TagParserState.TagEnded;
                }
                else if ((chr == '"') || (chr == '\''))
                {
                    // Unnamed, quoted attribute value, like a DOCTYPE dtd path
                    // <!DOCTYPE html "blah blah">
                    string attributeValue = _readValue(currentPosition);
                    HTMLTagAttribute unnamedAttribute = new HTMLTagAttribute(currentPosition, null, attributeValue, chr.ToString());

                    currentPosition += attributeValue.Length;

                    unnamedAttribute.EndPosition = currentPosition;
                    tag.Attributes.Add(unnamedAttribute);
                }
                else if (isAlphaChar(chr))
                {
                    /*
                     * MATCHES:  <DIV ATTRIBUTE="FOO" ATTR = 'BAR'>
                     *                ^^^^^^^^^       ^^^^
                     */
                    string attributeName = _readAlphaNumericWord(currentPosition);
                    HTMLTagAttribute namedAttribute = new HTMLTagAttribute(currentPosition, attributeName);

                    // Advance position to the end of the name
                    currentPosition += attributeName.Length;

                    // Do we have an attribute value? (Otherwise it is a standalone attribute,
                    // like the "html" in <!DOCTYPE html "foobar">.)
                    int equalsPosition = _indexOfNextNonWhitespaceChar(currentPosition);
                    if ((equalsPosition >= 0) && (equalsPosition < _HTML.Length) && (_HTML[equalsPosition] == '='))
                    {
                        currentPosition = equalsPosition + 1;

                        // Skip whitespace between '=' and the value so that space-separated
                        // forms like foo = "bar" are read from the value itself.
                        int valuePosition = _indexOfNextNonWhitespaceChar(currentPosition);
                        if ((valuePosition >= 0) && (valuePosition < _HTML.Length))
                        {
                            currentPosition = valuePosition;

                            string rawAttributeValue = _readValue(currentPosition);
                            namedAttribute.Value = rawAttributeValue;

                            // Advance position to end of the value
                            currentPosition += rawAttributeValue.Length;
                        }
                    }

                    // End of attribute - mark the end position and add to the tag
                    namedAttribute.EndPosition = currentPosition;
                    tag.Attributes.Add(namedAttribute);
                }
                else
                {
                    // Unexpected character inside the tag: skip it so the parser always
                    // makes progress instead of looping on the same position forever.
                    currentPosition++;
                }
                break;
            }
        }

        // End the tag?
        if (tagParserState == TagParserState.TagEnded)
        {
            _applyNameTransformations(tag);

            // Remove empty attributes list
            if (tag.Attributes.Count == 0)
            {
                tag.Attributes = null;
            }

            // Mark the end position of the tag and return it
            tag.MarkEndPosition(currentPosition);
            return tag;
        }
    }

    // Input ended before the tag was closed.
    return tag;
}

// Applies the configured upper/lower-casing to the tag name and attribute names, using
// invariant casing so the result does not depend on the current culture.
private void _applyNameTransformations(HTMLTag tag)
{
    bool toLower = (_transforms == Transformations.LowercaseNames);
    bool toUpper = (_transforms == Transformations.UppercaseNames);
    if (!toLower && !toUpper)
    {
        return;
    }

    if (tag.TagName != null)
    {
        tag.TagName = toLower ? tag.TagName.ToLowerInvariant() : tag.TagName.ToUpperInvariant();
    }

    foreach (HTMLTagAttribute attr in tag.Attributes)
    {
        if (attr.Name != null)
        {
            attr.Name = toLower ? attr.Name.ToLowerInvariant() : attr.Name.ToUpperInvariant();
        }
    }
}
/// <summary>
/// Invokes <c>HTMLForm.runQueuedEvent()</c>; this method adds no logic of its own.
/// </summary>
public void run()
{
    HTMLForm.runQueuedEvent();
}