/// <summary>
/// Synthesizes an end tag named after the tag being scanned and stores it in <c>endTag</c>.
/// </summary>
/// <param name="pos">Offset used as the beginning of the synthesized end tag.</param>
private void CreateCorrectionEndTagBefore(int pos)
{
    string name = tag.TagName;

    // End offset = begin + name length + 2.
    // NOTE(review): presumably the index of the last character of "</name>" - confirm.
    TagData correctionData = new TagData(pos, pos + name.Length + 2, name, currLine);
    endTag = new EndTag(correctionData);
}
/// <summary>
/// Verifies that <c>Tag.Find</c> returns a non-null tag for an AREA element whose
/// attribute list contains a line break.
/// </summary>
public void TagExtraction()
{
    string markup = "<AREA \n coords=0,0,52,52 href=\"http://www.yahoo.com/r/c1\" shape=RECT>";
    CreateParser(markup);

    Tag found = Tag.Find(parser.Reader, markup, 0);

    Assert.IsNotNull(found);
}
/// <summary>
/// If <c>IsEndTagMissing</c> reports true for the given tag, creates a corrective end tag
/// before it and marks the end tag as found.
/// </summary>
/// <param name="possibleEndTagCauser">Tag that may force the current tag to be closed.</param>
private void DoForceCorrectionCheckOn(Tag possibleEndTagCauser)
{
    if (!IsEndTagMissing(possibleEndTagCauser))
    {
        return;
    }

    CreateCorrectionEndTagBefore(possibleEndTagCauser);
    endTagFound = true;
}
/// <summary>
/// Builds a <c>MetaTag</c> from the NAME, CONTENT and HTTP-EQUIV attributes of the scanned tag.
/// </summary>
/// <param name="tagData">Tag data forwarded to the <c>MetaTag</c> constructor.</param>
/// <param name="tag">Tag whose attribute table is read.</param>
/// <param name="url">Not used by this implementation.</param>
/// <returns>The newly created <c>MetaTag</c>.</returns>
protected override Tag CreateTag(TagData tagData, Tag tag, string url)
{
    System.Collections.Hashtable attributes = tag.Attributes;

    string name = (string)attributes["NAME"];
    string contents = (string)attributes["CONTENT"];
    string equivHeader = (string)attributes["HTTP-EQUIV"];

    MetaTag created = new MetaTag(tagData, equivHeader, name, contents);
    return created;
}
/// <summary>
/// Verifies that <c>LinkScanner.ExtractLink</c> returns the unquoted href value
/// of an anchor tag as-is.
/// </summary>
public void ExtractLinkInvertedCommasBug()
{
    Tag anchor = new Tag(new TagData(0, 0, "a href=r/anorth/top.html", ""));
    string pageLocation = "c:\\cvs\\html\\binaries\\yahoo.htm";
    LinkScanner scanner = new LinkScanner("-l");

    string extracted = scanner.ExtractLink(anchor, pageLocation);

    Assert.AreEqual("r/anorth/top.html", extracted, "Extracted Link");
}
/// <summary>
/// Returns a copy of <paramref name="currentLine"/> in which the span occupied by
/// <paramref name="tag"/> (from <c>ElementBegin</c> through <c>ElementEnd</c>, inclusive)
/// is replaced by an end tag built from the tag's name.
/// </summary>
/// <param name="tag">Tag whose span is replaced.</param>
/// <param name="currentLine">Line containing the tag.</param>
/// <returns>The line with <c>&lt;/TagName&gt;</c> substituted for the tag's span.</returns>
public virtual string ReplaceFaultyTagWithEndTag(Tag tag, string currentLine)
{
    string beforeTag = currentLine.Substring(0, tag.ElementBegin);
    string closingTag = "</" + tag.TagName + ">";

    // Everything after the last character of the tag.
    string afterTag = currentLine.Substring(tag.ElementEnd + 1);

    return beforeTag + closingTag + afterTag;
}
/// <summary>
/// Verifies that <c>LinkScanner.InsertEndTagBeforeNode</c> puts an end tag in front of
/// the given tag when the tag starts at offset 0 of the line.
/// </summary>
public void InsertEndTagBeforeTag()
{
    string line = "<a href=s/7509><b>Yahoo! Movies</b></a>";
    Tag anchor = new Tag(new TagData(0, 14, "a href=s/7509", line));
    LinkScanner scanner = new LinkScanner();

    string corrected = scanner.InsertEndTagBeforeNode(anchor, line);

    Assert.AreEqual("</A><a href=s/7509><b>Yahoo! Movies</b></a>", corrected, "Expected insertion");
}
/// <summary>
/// Asserts that two tags have the same tag name, prefixing the caller's message with a
/// description of the mismatch.
/// </summary>
/// <param name="nextExpectedTag">Tag supplying the expected name.</param>
/// <param name="nextActualTag">Tag supplying the actual name.</param>
/// <param name="displayMessage">Text appended to the failure message.</param>
private void AssertTagNameMatches(Tag nextExpectedTag, Tag nextActualTag, string displayMessage)
{
    string expectedName = nextExpectedTag.TagName;
    string actualName = nextActualTag.TagName;

    string failureText = "The tag names did not match: Expected " + expectedName
        + " \nbut was " + actualName
        + displayMessage;

    AssertStringEquals(failureText, expectedName, actualName);
}
/// <summary>
/// Synthesizes an end tag (named after the tag being scanned) at the position where
/// <paramref name="possibleEndTagCauser"/> begins, moves that tag's <c>TagBegin</c> past the
/// synthesized end tag, and hands it back to the reader via <c>AddNextParsedNode</c>.
/// </summary>
/// <param name="possibleEndTagCauser">Tag in front of which the end tag is created.</param>
private void CreateCorrectionEndTagBefore(Tag possibleEndTagCauser)
{
    string name = tag.TagName;
    int correctionBegin = possibleEndTagCauser.ElementBegin;
    int correctionEnd = correctionBegin + name.Length + 2;

    // Shift the causing tag to start right after the synthesized end tag, then re-queue it.
    possibleEndTagCauser.TagBegin = correctionEnd + 1;
    reader.AddNextParsedNode(possibleEndTagCauser);

    TagData correctionData = new TagData(correctionBegin, correctionEnd, name, currLine);
    endTag = new EndTag(correctionData);
}
/// <summary>
/// Returns the upper-cased value of the tag's METHOD attribute, falling back to
/// <c>FormTag.GET</c> when the attribute lookup yields null.
/// </summary>
/// <param name="tag">Tag whose METHOD attribute is read.</param>
/// <returns>The method name in upper case.</returns>
public virtual string ExtractFormMethod(Tag tag)
{
    string declared = tag["METHOD"];
    string effective = declared == null ? FormTag.GET : declared;
    return effective.ToUpper();
}
/// <summary>
/// When <paramref name="expected"/> is a <c>Tag</c>, asserts that <paramref name="actual"/>
/// has the same tag name and matching attributes. Does nothing for other node types.
/// </summary>
/// <param name="expected">Node holding the expected values.</param>
/// <param name="actual">Node under test; cast to <c>Tag</c> when <paramref name="expected"/> is one.</param>
/// <param name="displayMessage">Message forwarded to the individual assertions.</param>
public void AssertTagEquals(Node expected, Node actual, string displayMessage)
{
    if (!(expected is Tag))
    {
        return;
    }

    Tag wanted = (Tag)expected;
    Tag got = (Tag)actual;

    AssertTagNameMatches(wanted, got, displayMessage);
    AssertAttributesMatch(wanted, got, displayMessage);
}
/// <summary>
/// Switches to <c>TAG_BEGIN_PARSING_STATE</c> when a '&lt;' is seen while in
/// <c>TAG_BEFORE_PARSING_STATE</c> or <c>TAG_ILLEGAL_STATE</c>, recording the current
/// index as the tag's beginning.
/// </summary>
/// <param name="i">Index of <paramref name="ch"/>.</param>
/// <param name="state">Current parsing state.</param>
/// <param name="ch">Character being examined.</param>
/// <param name="tag">Tag whose <c>TagBegin</c> is set on a transition.</param>
/// <returns>The new state on a transition; otherwise <paramref name="state"/> unchanged.</returns>
private int CheckBeginParsingState(int i, int state, char ch, Tag tag)
{
    bool mayOpenTag = state == TAG_BEFORE_PARSING_STATE || state == TAG_ILLEGAL_STATE;
    if (ch != '<' || !mayOpenTag)
    {
        return state;
    }

    // Transition from State 0 to State 1 - record data till > is encountered.
    tag.TagBegin = i;
    return TAG_BEGIN_PARSING_STATE;
}
/// <summary>
/// Switches to <c>TAG_ILLEGAL_STATE</c> when the current character is '/' immediately
/// preceded by '&lt;' in the tag's line, unless the state is <c>TAG_IGNORE_DATA_STATE</c>
/// or <c>TAG_IGNORE_BEGIN_TAG_STATE</c>.
/// </summary>
/// <param name="i">Index of <paramref name="ch"/> within <c>tag.TagLine</c>.</param>
/// <param name="state">Current parsing state.</param>
/// <param name="ch">Character being examined.</param>
/// <param name="tag">Tag whose <c>TagLine</c> is inspected.</param>
/// <returns>The new state on a transition; otherwise <paramref name="state"/> unchanged.</returns>
private int CheckIllegalState(int i, int state, char ch, Tag tag)
{
    if (ch != '/' || i <= 0)
    {
        return state;
    }

    // Only read the previous character once we know index i - 1 is non-negative.
    if (tag.TagLine[i - 1] != '<')
    {
        return state;
    }

    if (state == TAG_IGNORE_DATA_STATE || state == TAG_IGNORE_BEGIN_TAG_STATE)
    {
        return state;
    }

    return TAG_ILLEGAL_STATE;
}
/// <summary>
/// If <paramref name="currentNode"/> is a <c>Tag</c> and <c>IsXmlEndTag</c> returns true for
/// the tag being scanned, records <paramref name="currentNode"/> as the end tag and marks
/// the end tag as found.
/// </summary>
/// <param name="currentNode">Node that becomes the end tag when the check succeeds.</param>
private void DoEmptyXmlTagCheckOn(Node currentNode)
{
    if (!(currentNode is Tag))
    {
        return;
    }

    Tag candidate = (Tag)currentNode;

    // NOTE(review): the check is made on the field `tag`, not on `candidate` - confirm
    // that callers only pass the tag being scanned.
    if (IsXmlEndTag(tag))
    {
        endTag = candidate;
        endTagFound = true;
    }
}
/// <summary>
/// Verifies that <c>LinkScanner.ReplaceFaultyTagWithEndTag</c> swaps a stray opening
/// anchor tag (offsets 85-87 of the line) for a closing anchor tag.
/// </summary>
public void ReplaceFaultyTagWithEndTag()
{
    string line = "<p>Site Comments?<br><a href=\"mailto:[email protected]?subject=Site Comments\">Mail Us<a></p>";
    Tag strayAnchor = new Tag(new TagData(85, 87, "a", line));
    LinkScanner scanner = new LinkScanner();

    string repaired = scanner.ReplaceFaultyTagWithEndTag(strayAnchor, line);

    Assert.AreEqual(
        "<p>Site Comments?<br><a href=\"mailto:[email protected]?subject=Site Comments\">Mail Us</A></p>",
        repaired,
        "Expected replacement");
}
/// <summary>
/// Parses an anchor that wraps an image followed by empty font/bold markup and verifies
/// the resulting link tag: its URL, its (empty) link text, and the type and contents of
/// each of its five contained nodes.
/// </summary>
public void LinkDataContents()
{
    CreateParser(
        "<a href=\"http://transfer.go.com/cgi/atransfer.pl?goto=http://www.signs.movies.com&name=114332&srvc=nws&context=283&guid=4AD5723D-C802-4310-A388-0B24E1A79689\" target=\"_new\"><img src=\"http://ad.abcnews.com/ad/sponsors/buena_vista_pictures/bvpi-ban0003.gif\" width=468 height=60 border=\"0\" alt=\"See Signs in Theaters 8-2 - Starring Mel Gibson\" align=><font face=\"verdana,arial,helvetica\" SIZE=\"1\"><b></b></font></a>",
        "http://transfer.go.com");

    // Register the link scanner and the image scanner it creates.
    LinkScanner linkScanner = new LinkScanner("-l");
    parser.AddScanner(linkScanner);
    parser.AddScanner(linkScanner.CreateImageScanner("-i"));

    ParseAndAssertNodeCount(1);
    Assert.IsTrue(node[0] is LinkTag, "Node 0 should be a link tag");
    LinkTag linkTag = (LinkTag)node[0];
    Assert.AreEqual(
        "http://transfer.go.com/cgi/atransfer.pl?goto=http://www.signs.movies.com&name=114332&srvc=nws&context=283&guid=4AD5723D-C802-4310-A388-0B24E1A79689",
        linkTag.Link,
        "Link URL");
    Assert.AreEqual("", linkTag.LinkText, "Link Text");

    // Collect the nodes nested inside the link so they can be checked by position.
    Node[] containedNodes = new AbstractNode[10];
    int i = 0;
    foreach (Node nestedNode in linkTag)
    {
        containedNodes[i++] = nestedNode;
    }
    Assert.AreEqual(5, i, "There should be 5 contained nodes in the link tag");

    Assert.IsTrue(containedNodes[0] is ImageTag, "First contained node should be an image tag");
    ImageTag imageTag = (ImageTag)containedNodes[0];
    Assert.AreEqual("http://ad.abcnews.com/ad/sponsors/buena_vista_pictures/bvpi-ban0003.gif", imageTag.ImageURL, "Image Location");
    Assert.AreEqual("60", imageTag["HEIGHT"], "Image Height");
    Assert.AreEqual("468", imageTag["WIDTH"], "Image Width");
    Assert.AreEqual("0", imageTag["BORDER"], "Image Border");
    Assert.AreEqual("See Signs in Theaters 8-2 - Starring Mel Gibson", imageTag["ALT"], "Image Alt");

    Assert.IsTrue(containedNodes[1] is Tag, "Second contained node should be Tag");
    Tag tag1 = (Tag)containedNodes[1];
    // The message literal was previously split across a physical line break, which a
    // regular C# string literal does not allow; it is a single literal here.
    Assert.AreEqual("font face=\"verdana,arial,helvetica\" SIZE=\"1\"", tag1.Text, "Tag Contents");

    Assert.IsTrue(containedNodes[2] is Tag, "Third contained node should be Tag");
    Tag tag2 = (Tag)containedNodes[2];
    Assert.AreEqual("b", tag2.Text, "Tag Contents");

    Assert.IsTrue(containedNodes[3] is EndTag, "Fourth contained node should be HTMLEndTag");
    EndTag endTag1 = (EndTag)containedNodes[3];
    Assert.AreEqual("b", endTag1.Text, "Fourth Tag contents");

    Assert.IsTrue(containedNodes[4] is EndTag, "Fifth contained node should be HTMLEndTag");
    EndTag endTag2 = (EndTag)containedNodes[4];
    Assert.AreEqual("font", endTag2.Text, "Fifth Tag contents");
}
/// <summary>
/// Builds a <c>BaseHrefTag</c> from the tag's HREF attribute. When the attribute is
/// present and non-empty, it is trimmed, passed through <c>LinkProcessor.RemoveLastSlash</c>,
/// and also stored as <c>processor.BaseUrl</c>; otherwise the tag is created with an
/// empty base URL and the processor is left untouched.
/// </summary>
/// <param name="tagData">Tag data forwarded to the <c>BaseHrefTag</c> constructor.</param>
/// <param name="tag">Tag whose HREF attribute is read.</param>
/// <param name="url">Not used by this implementation.</param>
/// <returns>The newly created <c>BaseHrefTag</c>.</returns>
protected override Tag CreateTag(TagData tagData, Tag tag, string url)
{
    string href = tag["HREF"];
    if (href == null || href.Length == 0)
    {
        return new BaseHrefTag(tagData, "");
    }

    string normalizedBase = LinkProcessor.RemoveLastSlash(href.Trim());
    processor.BaseUrl = normalizedBase;
    return new BaseHrefTag(tagData, normalizedBase);
}
/// <summary>
/// Appends <paramref name="ch"/> to <paramref name="tag"/> when the state is
/// <c>TAG_IGNORE_DATA_STATE</c>, <c>TAG_BEGIN_PARSING_STATE</c> or
/// <c>TAG_IGNORE_BEGIN_TAG_STATE</c>, flagging <paramref name="encounteredQuery"/> first
/// if the character is '?'.
/// </summary>
/// <param name="encounteredQuery">Holder whose <c>Boolean</c> is set to true when a '?' is appended.</param>
/// <param name="state">Current parsing state.</param>
/// <param name="ch">Character being examined.</param>
/// <param name="tag">Tag receiving the character.</param>
private void CheckIfAppendable(Bool encounteredQuery, int state, char ch, Tag tag)
{
    bool appendable = state == TAG_IGNORE_DATA_STATE
        || state == TAG_BEGIN_PARSING_STATE
        || state == TAG_IGNORE_BEGIN_TAG_STATE;
    if (!appendable)
    {
        return;
    }

    if (ch == '?')
    {
        encounteredQuery.Boolean = true;
    }

    tag.Append(ch);
}
/// <summary>
/// Stores the start and end tags and copies the given children into a new
/// <c>NodeList</c> owned by this instance.
/// </summary>
/// <param name="startTag">Start tag of the composite.</param>
/// <param name="endTag">End tag of the composite.</param>
/// <param name="children">Child nodes to copy; null results in an empty list.</param>
public CompositeTagData(Tag startTag, Tag endTag, NodeList children)
{
    this.startTag = startTag;
    this.endTag = endTag;
    this.children = new NodeList();

    if (children == null)
    {
        return;
    }

    foreach (Node node in children)
    {
        this.children.Add(node);
    }
}
/// <summary>
/// Captures the scanning context and resets the helper's working state: no end tag,
/// an empty node list, and <c>endTagFound</c> set to false.
/// </summary>
/// <param name="scanner">Scanner this helper works for.</param>
/// <param name="tag">Tag being scanned.</param>
/// <param name="url">URL stored for later use.</param>
/// <param name="reader">Reader stored for later use.</param>
/// <param name="currLine">Current line stored for later use.</param>
/// <param name="balance_quotes">Flag stored for later use.</param>
public CompositeTagScannerHelper(CompositeTagScanner scanner, Tag tag, string url, NodeReader reader, string currLine, bool balance_quotes)
{
    // Context supplied by the caller.
    this.scanner = scanner;
    this.tag = tag;
    this.url = url;
    this.reader = reader;
    this.currLine = currLine;

    // Working state filled in while scanning.
    endTag = null;
    nodeList = new NodeList();
    endTagFound = false;

    this.balance_quotes = balance_quotes;
}
/// <summary>
/// Reports whether the given tag's name is listed in the relevant set:
/// <c>endTagEnderSet</c> for an <c>EndTag</c>, <c>tagEnderSet</c> for any other tag.
/// </summary>
/// <param name="tag">Tag to look up.</param>
/// <returns>True when the applicable set contains the tag's name.</returns>
public bool IsTagToBeEndedFor(Tag tag)
{
    bool closesElement = tag is EndTag;
    string name = tag.TagName;

    if (closesElement)
    {
        return endTagEnderSet.Contains(name);
    }

    return tagEnderSet.Contains(name);
}
/// <summary>
/// Reports whether <paramref name="node"/> is a <c>Tag</c> whose upper-cased text has
/// <paramref name="tagName"/> at index 0.
/// </summary>
/// <param name="node">Node to inspect.</param>
/// <param name="tagName">Name to look for; compared against the upper-cased tag text as given.</param>
/// <returns>True on a match; false otherwise, including when the node is not a <c>Tag</c>.</returns>
public static bool IsXMLTagFound(Node node, string tagName)
{
    if (!(node is Tag))
    {
        return false;
    }

    string upperText = ((Tag)node).Text.ToUpper();
    return upperText.IndexOf(tagName) == 0;
}
/// <summary>
/// Classifies <paramref name="currentNode"/>: an <c>EndTag</c> accepted by
/// <c>IsExpectedEndTag</c> is recorded as the end tag; anything else is added to the node
/// list and reported to the scanner as a child.
/// </summary>
/// <param name="currentNode">Node just read.</param>
private void DoChildAndEndTagCheckOn(Node currentNode)
{
    bool closesCurrentTag = currentNode is EndTag && IsExpectedEndTag((EndTag)currentNode);

    if (!closesCurrentTag)
    {
        nodeList.Add(currentNode);
        scanner.ChildNodeEncountered(currentNode);
        return;
    }

    endTagFound = true;
    endTag = (EndTag)currentNode;
}
/// <summary>
/// For a <c>Tag</c> whose <c>EmptyXmlTag</c> is true, inserts a matching end tag into the
/// reader's current line just after the last read position and replaces the line via
/// <c>ChangeLine</c>.
/// </summary>
/// <param name="parser">Parser whose reader supplies and receives the line.</param>
/// <param name="node">Node to inspect; ignored unless it is an empty XML tag.</param>
private void FixIfXmlEndTag(Parser parser, Node node)
{
    if (!(node is Tag))
    {
        return;
    }

    Tag candidate = (Tag)node;
    if (!candidate.EmptyXmlTag)
    {
        return;
    }

    // Add end tag
    string line = parser.Reader.CurrentLine;
    int insertAt = parser.Reader.LastReadPosition + 1;
    string patched = line.Substring(0, insertAt)
        + "</" + candidate.TagName + ">"
        + line.Substring(insertAt);

    parser.Reader.ChangeLine(patched);
}
/// <summary>
/// Rewrites the tag's text: every double-quote character is removed, and the remainder
/// is passed through <c>InsertInvertedCommasCorrectly</c> before being stored back.
/// </summary>
/// <param name="tag">Tag whose <c>Text</c> is replaced.</param>
public virtual void CorrectTag(Tag tag)
{
    // Strip all existing double quotes.
    StringBuilder unquoted = new StringBuilder(tag.Text.Replace("\"", ""));

    // Go into the next stage.
    tag.Text = InsertInvertedCommasCorrectly(unquoted).ToString();
}
/// <summary>
/// Extracts the image location from the SRC attribute of <paramref name="tag"/>.
/// Line-feed and carriage-return characters are removed from the value. If no usable value
/// is found but the tag text contains "SRC" (case-insensitively), an "=" is inserted right
/// after it, the attributes are re-parsed, and SRC is read again.
/// </summary>
/// <param name="tag">Image tag to read; its <c>Text</c> may be rewritten by the repair step.</param>
/// <param name="url">URL of the web page being parsed, passed to <c>processor.Extract</c>.</param>
/// <returns>
/// An empty string when no SRC value is available; otherwise the result of
/// <c>processor.Extract</c> for the SRC value and <paramref name="url"/>.
/// </returns>
/// <exception cref="ParserException">Wraps any exception raised during extraction.</exception>
public virtual string ExtractImageLocn(Tag tag, string url)
{
    string relativeLink = null;
    try
    {
        table = tag.Attributes;
        relativeLink = (string)table["SRC"];
        if (relativeLink != null)
        {
            relativeLink = ParserUtils.RemoveChars(relativeLink, '\n');
            relativeLink = ParserUtils.RemoveChars(relativeLink, '\r');
        }

        bool needsRepair = relativeLink == null || relativeLink.Length == 0;
        if (needsRepair)
        {
            // try fix
            int srcAt = tag.Text.ToUpper().IndexOf("SRC");
            if (srcAt != -1)
            {
                // There is a missing equals.
                int afterSrc = srcAt + 3;
                string text = tag.Text;
                tag.Text = text.Substring(0, afterSrc) + "=" + text.Substring(afterSrc);
                table = tag.RedoParseAttributes();
                relativeLink = (string)table["SRC"];
            }
        }

        if (relativeLink != null)
        {
            return processor.Extract(relativeLink, url);
        }

        return "";
    }
    catch (System.Exception e)
    {
        throw new ParserException(
            "HTMLImageScanner.ExtractImageLocn() : Error in extracting image location, relativeLink = " + relativeLink + ", url = " + url,
            e);
    }
}
/// <summary>
/// Parses a custom composite tag containing a single child tag and verifies the child
/// count, that the composite is not an empty XML tag, the start-tag and composite
/// offsets, and the child's type and HTML rendering.
/// </summary>
public void CompositeTagWithTagChild()
{
    CreateParser("<Custom>" + "<Hello>" + "</Custom>");
    CustomTag customTag = ParseCustomTag(1);

    Assert.AreEqual(1, customTag.ChildCount, "child count");
    Assert.IsFalse(customTag.EmptyXmlTag, "custom tag should not be xml end tag");
    Assert.AreEqual(0, customTag.StartTag.ElementBegin, "starting loc");
    Assert.AreEqual(7, customTag.StartTag.ElementEnd, "ending loc");
    Assert.AreEqual(0, customTag.ElementBegin, "custom tag starting loc");
    Assert.AreEqual(23, customTag.ElementEnd, "custom tag ending loc");

    Node child = customTag[0];
    AssertType("child", typeof(Tag), child);

    // Render through the cast reference so the cast result is actually used.
    Tag tag = (Tag)child;
    AssertStringEquals("child html", "<HELLO>", tag.ToHtml());
}
/// <summary>
/// Verifies that an anchor without an href, parsed with a link scanner registered,
/// yields three separate nodes: a plain tag, a string node, and an end tag.
/// </summary>
public void FreshMeatBug()
{
    CreateParser("<a>Revision</a>", "http://www.yahoo.com");

    // Register the link scanner
    parser.AddScanner(new LinkScanner("-l"));
    ParseAndAssertNodeCount(3);

    Assert.IsTrue(node[0] is Tag, "Node 0 should be a tag");
    Tag tag = (Tag)node[0];
    Assert.AreEqual("a", tag.Text, "Tag Contents");

    Assert.IsTrue(node[1] is StringNode, "Node 1 should be a string node");
    StringNode stringNode = (StringNode)node[1];
    Assert.AreEqual("Revision", stringNode.Text, "StringNode Contents");

    // The failure message previously said "string node" although the check is for EndTag.
    Assert.IsTrue(node[2] is EndTag, "Node 2 should be an end tag");
    EndTag endTag = (EndTag)node[2];
    Assert.AreEqual("a", endTag.Text, "End Tag Contents");
}
/// <summary>
/// Produces a multi-line description of a tag: a header with the value stored under
/// <c>Tag.TAGNAME</c>, followed by one "key : value" line for every other attribute
/// whose value is non-empty.
/// </summary>
/// <param name="tag">Tag to describe.</param>
/// <returns>The formatted description.</returns>
public static string ToString(Tag tag)
{
    System.Text.StringBuilder text = new System.Text.StringBuilder(tag[Tag.TAGNAME]);
    System.Collections.Hashtable attributes = tag.Attributes;

    text.Append(" TAG\n");
    text.Append("--------\n");

    foreach (string key in attributes.Keys)
    {
        string value = (string)attributes[key];

        // The tag-name entry is already shown in the header.
        if (key.ToUpper().Equals(Tag.TAGNAME.ToUpper()))
        {
            continue;
        }

        if (value.Length <= 0)
        {
            continue;
        }

        text.Append(key).Append(" : ").Append(value).Append("\n");
    }

    return text.ToString();
}
/// <summary>
/// Advances the character index. When the index is on the last character of the tag's
/// current line while the state is <c>TAG_BEGIN_PARSING_STATE</c>,
/// <c>TAG_IGNORE_DATA_STATE</c> or <c>TAG_IGNORE_BEGIN_TAG_STATE</c>, the next non-empty
/// line is pulled from <paramref name="reader"/>, assigned to <c>tag.TagLine</c>, and the
/// index restarts at 0.
/// </summary>
/// <param name="i">Current index into <c>tag.TagLine</c>.</param>
/// <param name="reader">Source of further lines.</param>
/// <param name="state">Current parsing state.</param>
/// <param name="tag">Tag being assembled.</param>
/// <returns>0 after moving to a new line; otherwise <paramref name="i"/> + 1.</returns>
public virtual int IncrementCounter(int i, NodeReader reader, int state, Tag tag)
{
    bool insideTag = state == TAG_BEGIN_PARSING_STATE
        || state == TAG_IGNORE_DATA_STATE
        || state == TAG_IGNORE_BEGIN_TAG_STATE;

    if (!insideTag || i != tag.TagLine.Length - 1)
    {
        return i + 1;
    }

    // Skipping empty lines is a bug fix contributed by Annette Doyle - see testcase
    // HTMLImageScannerTest.testImageTagOnMultipleLines().
    // Further modified by Somik Raha, to remove bug - HTMLTagTest.testBrokenTag
    string following = reader.GetNextLine();
    int linesConsumed = 1;
    while (following != null && following.Length == 0)
    {
        following = reader.GetNextLine();
        linesConsumed++;
    }

    if (following == null)
    {
        // This means we have a broken tag. Fill in an end tag symbol here.
        following = ">";
    }
    else
    {
        // This means this is just a new line, hence add the new line character.
        tag.Append(Parser.LineSeparator);
    }

    // Ensure blank lines are included in the tag's lines: one empty assignment per
    // skipped blank line (linesConsumed - 1 in total).
    for (int blank = 1; blank < linesConsumed; blank++)
    {
        tag.TagLine = "";
    }

    // We need to continue parsing on the next line, starting from its first character.
    tag.TagLine = following;
    return 0;
}