/// <summary>
/// Parses a link that wraps an image followed by a text run and checks that the
/// resulting link tag yields exactly those two children, with the image URL
/// resolved against the base URL handed to CreateParser.
/// </summary>
public void Scan()
{
    CreateParser(
        "<A HREF=\"mytest.html\"><IMG SRC=\"abcd.jpg\">Hello World</A>",
        "http://www.yahoo.com");

    // Register the link scanner together with the image scanner it creates
    LinkScanner scanner = new LinkScanner("-l");
    parser.AddScanner(scanner);
    parser.AddScanner(scanner.CreateImageScanner("-i"));

    ParseAndAssertNodeCount(1);
    Assert.IsTrue(node[0] is LinkTag, "Node should be a link node");
    LinkTag link = (LinkTag)node[0];

    // Copy the children of the link into an array so they can be checked by position
    Node[] children = new AbstractNode[10];
    int childCount = 0;
    foreach (Node child in link)
    {
        children[childCount] = child;
        childCount++;
    }

    Assert.AreEqual(2, childCount, "Number of data nodes");
    Assert.IsTrue(children[0] is ImageTag, "First data node should be an Image Node");
    Assert.IsTrue(children[1] is StringNode, "Second data node shouls be a String Node");

    // Verify what each child carries
    ImageTag image = (ImageTag)children[0];
    Assert.AreEqual("http://www.yahoo.com/abcd.jpg", image.ImageURL, "Image URL");

    StringNode label = (StringNode)children[1];
    Assert.AreEqual("Hello World", label.Text, "String Contents");
}
/// <summary>
/// Parses two consecutive custom tags where the first one contains an
/// "another" tag that is never explicitly closed, and checks that the first
/// custom tag still ends up with a single, properly closed "another" child
/// and that the second custom tag is parsed independently.
/// </summary>
public void CompositeTagWithDeadlock()
{
    CreateParser(
        "<custom>" +
        "<another>something" +
        "</custom>" +
        "<custom>" +
        "<another>else</another>" +
        "</custom>");
    parser.AddScanner(new AnotherScanner(true));
    CustomTag customTag = ParseCustomTag(2);

    // Structure and position of the first custom tag
    Assert.AreEqual(1, customTag.ChildCount, "child count");
    Assert.IsFalse(customTag.EmptyXmlTag, "custom tag should not be xml end tag");
    Assert.AreEqual(0, customTag.StartTag.ElementBegin, "starting loc");
    Assert.AreEqual(7, customTag.StartTag.ElementEnd, "ending loc");
    Assert.AreEqual(1, customTag.tagData.StartLine, "starting line position");
    Assert.AreEqual(1, customTag.tagData.EndLine, "ending line position");

    // The unclosed "another" tag must hold only the text that followed it
    AnotherTag anotherTag = (AnotherTag)customTag[0];
    Assert.AreEqual(1, anotherTag.ChildCount, "anotherTag child count");
    StringNode stringNode = (StringNode)anotherTag[0];
    AssertStringEquals("anotherTag child text", "something", stringNode.ToPlainTextString());
    AssertStringEquals(
        "first custom tag html",
        "<CUSTOM><ANOTHER>something</ANOTHER></CUSTOM>",
        customTag.ToHtml());

    // The second custom tag is the second top-level node
    customTag = (CustomTag)node[1];
    AssertStringEquals(
        "second custom tag html",
        "<CUSTOM><ANOTHER>else</ANOTHER></CUSTOM>",
        customTag.ToHtml());
}
/// <summary>
/// Appends the text of the visited string node to the text accumulator.
/// While the preTagBeingProcessed flag is set the text is appended untouched;
/// otherwise it is first passed through Translate.Decode and then through
/// ReplaceNonBreakingSpaceWithOrdinarySpace.
/// </summary>
/// <param name="stringNode">The string node being visited.</param>
public override void VisitStringNode(StringNode stringNode)
{
    if (preTagBeingProcessed)
    {
        // Text is taken verbatim while this flag is set
        textAccumulator.Append(stringNode.Text);
        return;
    }

    string decoded = Translate.Decode(stringNode.Text);
    textAccumulator.Append(ReplaceNonBreakingSpaceWithOrdinarySpace(decoded));
}
/// <summary>
/// Checks that an empty tag ("&lt;&gt;") embedded in text is kept as part of
/// the surrounding string node instead of being split out as a tag.
/// </summary>
public void EmptyTag3()
{
    CreateParser("<html><body>text<>text</body></html>");
    parser.RegisterScanners();
    ParseAndAssertNodeCount(5);

    Assert.IsTrue(node[2] is StringNode, "Third node should be a string node");
    StringNode third = (StringNode)node[2];
    Assert.AreEqual("text<>text", third.Text, "Third node has incorrect text");
}
/// <summary>
/// Checks that an empty tag ("&lt;&gt;") followed by a line break stays inside
/// the surrounding string node, and that the line break shows up in the node
/// text as the configured line separator.
/// </summary>
public void EmptyTag6()
{
    CreateParser("<html><body>text<>\ntext</body></html>");
    parser.RegisterScanners();

    // Set through the Parser type rather than the parser instance (static member)
    Parser.LineSeparator = "\r\n";
    ParseAndAssertNodeCount(5);

    Assert.IsTrue(node[2] is StringNode, "Third node should be a string node");
    StringNode third = (StringNode)node[2];
    Assert.AreEqual("text<>\r\ntext", third.Text, "Third node has incorrect text");
}
/// <summary>
/// Parses a paragraph whose link is "closed" with another opening &lt;a&gt;
/// instead of an end tag, and checks the node count and the first three nodes
/// (tag, text, tag).
/// </summary>
public void ErroneousLinkBug()
{
    CreateParser(
        "<p>Site Comments?<br>" +
        "<a href=\"mailto:[email protected]?subject=Site Comments\">" +
        "Mail Us" +
        "<a>" +
        "</p>");
    parser.RegisterScanners();
    ParseAndAssertNodeCount(6);

    // Node 0: the opening tag
    Assert.IsTrue(node[0] is Tag, "First node should be a Tag");

    // Node 1: the text that follows it
    Assert.IsTrue(node[1] is StringNode, "Second node should be a HTMLStringNode");
    StringNode leadingText = (StringNode)node[1];
    Assert.AreEqual("Site Comments?", leadingText.Text, "Text of the StringNode");

    // Node 2: the next tag
    Assert.IsTrue(node[2] is Tag, "Third node should be a tag");
}
/// <summary>
/// Parses a link whose tag and HREF value are broken across several input
/// lines and checks that the link URL is reported without the line breaks,
/// and that the link text and the trailing text are reported correctly.
/// </summary>
public void MultipleLineBug()
{
    CreateParser(
        "<LI><font color=\"FF0000\" size=-1><b>Tech Samachar:</b></font><a \n" +
        "href=\"http://ads.samachar.com/bin/redirect/tech.txt?http://www.samachar.com/tech\n" +
        "nical.html\"> Journalism 3.0</a> by Rajesh Jain");
    Parser.LineSeparator = "\r\n";
    parser.AddScanner(new LinkScanner("-l"));
    ParseAndAssertNodeCount(8);

    // The link is the seventh node
    Assert.IsTrue(node[6] is LinkTag, "Seventh node should be a link tag");
    LinkTag link = (LinkTag)node[6];
    string expectedUrl =
        "http://ads.samachar.com/bin/redirect/tech.txt?http://www.samachar.com/technical.html";
    AssertStringEquals("Link URL of link tag", expectedUrl, link.Link);
    Assert.AreEqual(" Journalism 3.0", link.LinkText, "Link Text of link tag");

    // The text after the link is the eighth node
    Assert.IsTrue(node[7] is StringNode, "Eight node should be a string node");
    StringNode trailingText = (StringNode)node[7];
    Assert.AreEqual(" by Rajesh Jain", trailingText.Text, "String node contents");
}
/// <summary>
/// Parses an anchor that has no HREF attribute with the link scanner
/// registered, and checks that it comes back as three plain nodes: a start
/// tag, the enclosed text and an end tag.
/// </summary>
public void FreshMeatBug()
{
    CreateParser("<a>Revision</a>", "http://www.yahoo.com");

    // Register the link scanner
    parser.AddScanner(new LinkScanner("-l"));
    ParseAndAssertNodeCount(3);

    Assert.IsTrue(node[0] is Tag, "Node 0 should be a tag");
    Tag tag = (Tag)node[0];
    Assert.AreEqual("a", tag.Text, "Tag Contents");

    Assert.IsTrue(node[1] is StringNode, "Node 1 should be a string node");
    StringNode stringNode = (StringNode)node[1];
    Assert.AreEqual("Revision", stringNode.Text, "StringNode Contents");

    // The failure message now names the type that is actually being checked
    Assert.IsTrue(node[2] is EndTag, "Node 2 should be an end tag");
    EndTag endTag = (EndTag)node[2];
    Assert.AreEqual("a", endTag.Text, "End Tag Contents");
}
/// <summary>
/// Parses a custom tag that contains only text and checks the tag's child
/// count, positions and line numbers, and that its single child is a string
/// node carrying that text.
/// </summary>
public void CompositeTagWithOneTextChild()
{
    CreateParser(
        "<Custom>" +
        "Hello" +
        "</Custom>");
    CustomTag customTag = ParseCustomTag(1);

    Assert.AreEqual(1, customTag.ChildCount, "child count");
    Assert.IsFalse(customTag.EmptyXmlTag, "custom tag should not be xml end tag");
    Assert.AreEqual(0, customTag.StartTag.ElementBegin, "starting loc");
    Assert.AreEqual(7, customTag.StartTag.ElementEnd, "ending loc");
    Assert.AreEqual(1, customTag.tagData.StartLine, "starting line position");
    Assert.AreEqual(1, customTag.tagData.EndLine, "ending line position");

    // The type of the only child is verified by AssertType, so no cast is needed
    Node child = customTag[0];
    AssertType("child", typeof(StringNode), child);
    AssertStringEquals("child text", "Hello", child.ToPlainTextString());
}
/// <summary>
/// Parses a custom tag that nests an "another" tag and an empty custom tag,
/// followed by a second top-level empty custom tag, and checks that two
/// top-level custom tags are produced and that the nested "another" tag
/// holds the expected text.
/// </summary>
public void CompositeTagWithNestedTag()
{
    CreateParser(
        "<Custom>" +
        "<Another>" +
        "Hello" +
        "</Another>" +
        "<Custom/>" +
        "</Custom>" +
        "<Custom/>");
    parser.AddScanner(new CustomScanner(this));
    parser.AddScanner(new AnotherScanner());
    ParseAndAssertNodeCount(2);

    // Both top-level nodes must be custom tags
    AssertType("first node", typeof(CustomTag), node[0]);
    AssertType("second node", typeof(CustomTag), node[1]);

    // First child of the outer custom tag is the "another" tag
    CustomTag outer = (CustomTag)node[0];
    Node firstChild = outer[0];
    AssertType("first child", typeof(AnotherTag), firstChild);
    AnotherTag inner = (AnotherTag)firstChild;
    Assert.AreEqual(1, inner.ChildCount, "another tag children count");

    // ...and its only child is the text
    Node nestedChild = inner[0];
    AssertType("nested child", typeof(StringNode), nestedChild);
    StringNode text = (StringNode)nestedChild;
    Assert.AreEqual("Hello", text.ToPlainTextString(), "text");
}
/// <summary>
/// Searches the upper-cased text of the visited string node for stringToFind
/// and updates stringFound and foundCount.
/// When multipleSearchesWithinStrings is false, a node counts at most once.
/// When it is true, every occurrence (including overlapping ones) is counted.
/// In both modes stringFound is set as soon as at least one occurrence is seen.
/// </summary>
/// <remarks>
/// The node text is upper-cased before searching; presumably stringToFind is
/// already stored upper-cased by the owner of this visitor - verify there.
/// </remarks>
/// <param name="stringNode">The string node being visited.</param>
public override void VisitStringNode(StringNode stringNode)
{
    string stringToBeSearched = stringNode.Text.ToUpper();

    if (!multipleSearchesWithinStrings)
    {
        if (stringToBeSearched.IndexOf(stringToFind) != -1)
        {
            stringFound = true;
            foundCount++;
        }
        return;
    }

    // Restart the search one character after each hit so overlapping
    // occurrences are counted. The bound on searchFrom keeps IndexOf from
    // being called with a start index beyond the end of the text, which would
    // throw when stringToFind is empty (an empty string matches at every index).
    int searchFrom = 0;
    while (searchFrom <= stringToBeSearched.Length)
    {
        int index = stringToBeSearched.IndexOf(stringToFind, searchFrom);
        if (index == -1)
        {
            break;
        }

        // Previously only the single-search branch recorded that a match exists
        stringFound = true;
        foundCount++;
        searchFrom = index + 1;
    }
}
/// <summary>
/// Collects the text that follows an XML-style tag. If IsXMLTagFound accepts
/// <paramref name="node"/> for <paramref name="tagName"/>, elements are read
/// from <paramref name="reader"/> for as long as they are string nodes, and
/// their text is joined with single spaces.
/// </summary>
/// <param name="node">Node that is checked against the tag name.</param>
/// <param name="tagName">Tag name to look for; also compared with the text of
/// a closing end tag.</param>
/// <param name="reader">Reader that supplies the elements following the tag.</param>
/// <returns>
/// The collected text, or <c>null</c> when the tag was not found, when the
/// element that stopped the reading is neither a string node nor an end tag,
/// or when it is an end tag whose text differs from <paramref name="tagName"/>.
/// If the reader returns <c>null</c>, the text collected so far is returned.
/// </returns>
/// <exception cref="ParserException">Wraps any exception raised while
/// checking the tag or reading from the reader.</exception>
public static string ExtractXMLData(Node node, string tagName, NodeReader reader)
{
    try
    {
        if (!IsXMLTagFound(node, tagName))
        {
            return(null);
        }

        string collected = "";
        bool stillValid = true;
        try
        {
            do
            {
                node = reader.ReadElement();
                if (node is StringNode)
                {
                    if (collected.Length > 0)
                    {
                        collected += " ";
                    }
                    collected += ((StringNode)node).Text;
                }
                else if (node != null && !(node is org.htmlparser.tags.EndTag))
                {
                    // Some other element interrupted the text: give up on this tag
                    stillValid = false;
                }
            } while (node is StringNode);
        }
        catch (System.Exception e)
        {
            throw new ParserException(
                "HTMLTagScanner.extractXMLData() : error while trying to find xml tag", e);
        }

        // An end tag only closes the data if it carries the expected name
        if (stillValid && node is org.htmlparser.tags.EndTag)
        {
            org.htmlparser.tags.EndTag closing = (org.htmlparser.tags.EndTag)node;
            if (!closing.Text.Equals(tagName))
            {
                stillValid = false;
            }
        }

        return(stillValid ? collected : null);
    }
    catch (System.Exception e)
    {
        throw new ParserException(
            "HTMLTagScanner.extractXMLData() : Error occurred while trying to extract xml tag", e);
    }
}
/// <summary> Locate the StringNode within the input string, by parsing from the given position.
/// Characters are accumulated until a '&lt;' that BeginTag accepts is reached, or
/// until the reader has no more lines. When the end of the current line is reached
/// while text is being collected, following lines are pulled from the reader and
/// scanning continues on them, with Parser.LineSeparator inserted between lines.
/// </summary>
/// <param name="reader">HTML reader to be provided so as to allow reading of next line
/// </param>
/// <param name="input">Input String
/// </param>
/// <param name="position">Position to start parsing from
/// </param>
/// <param name="balance_quotes">If <code>true</code> enter ignoring state on
/// encountering quotes. While in the ignoring state a '&lt;' is not tested as a
/// tag start; the state is left when the same quote character is seen again.
/// </param>
/// <returns>The node produced by StringNode.CreateStringNode from the accumulated
/// text, the begin/end positions and the reader's parser ShouldDecodeNodes setting.
/// </returns>
public virtual Node Find(NodeReader reader, string input, int position, bool balance_quotes)
{
    StringBuilder textBuffer = new StringBuilder();
    int state = BEFORE_PARSE_BEGINS_STATE;
    int textBegin = position;
    int textEnd = position;
    int inputLen = input.Length;
    char ch;
    // Quote character that will end the current ignoring state
    char ignore_ender = '\"';
    for (int i = position; (i < inputLen && state != PARSE_COMPLETED_STATE); i++)
    {
        ch = input[i];
        // A '<' outside the ignoring state ends the text if it starts a tag
        if (ch == '<' && state != PARSE_IGNORE_STATE)
        {
            if (BeginTag(input, i))
            {
                state = PARSE_COMPLETED_STATE;
                // The text ends on the character before the '<'
                textEnd = i - 1;
            }
        }
        if (balance_quotes && (ch == '\'' || ch == '"'))
        {
            if (state == PARSE_IGNORE_STATE)
            {
                // Only the quote character that opened the ignoring state closes it
                if (ch == ignore_ender)
                {
                    state = PARSE_HAS_BEGUN_STATE;
                }
            }
            else
            {
                ignore_ender = ch;
                state = PARSE_IGNORE_STATE;
            }
        }
        if (state == BEFORE_PARSE_BEGINS_STATE)
        {
            state = PARSE_HAS_BEGUN_STATE;
        }
        // The current character is part of the text (quote characters included)
        if (state == PARSE_HAS_BEGUN_STATE || state == PARSE_IGNORE_STATE)
        {
            textBuffer.Append(input[i]);
        }
        // Patch by Cedric Rosa
        // NOTE(review): this condition looks unreachable, because the state has
        // already been moved out of BEFORE_PARSE_BEGINS_STATE a few lines above - confirm.
        if (state == BEFORE_PARSE_BEGINS_STATE && i == inputLen - 1)
        {
            state = PARSE_HAS_BEGUN_STATE;
        }
        // End of the current line reached while collecting text: continue on the next line
        if (state == PARSE_HAS_BEGUN_STATE && i == inputLen - 1)
        {
            do
            {
                input = reader.GetNextLine();
                // Each empty line contributes just a line separator
                if (input != null && input.Length == 0)
                {
                    textBuffer.Append(Parser.LineSeparator);
                }
            } while (input != null && input.Length == 0);
            if (input == null)
            {
                // No more lines: the text ends at the last character scanned
                textEnd = i;
                state = PARSE_COMPLETED_STATE;
            }
            else
            {
                textBuffer.Append(Parser.LineSeparator);
                inputLen = input.Length;
                // The loop increment brings i to 0, the start of the new line
                i = -1;
            }
        }
    }
    return(StringNode.CreateStringNode(textBuffer, textBegin, textEnd, reader.Parser.ShouldDecodeNodes));
}
/// <summary>
/// Visits a string node. This implementation does nothing; the method is
/// virtual so that derived visitors can override it.
/// </summary>
/// <param name="stringNode">The string node being visited.</param>
public virtual void VisitStringNode(StringNode stringNode)
{
}