public void Scan()
        {
            // Verifies that a link wrapping an image and some text is parsed into a single LinkTag
            // whose children are an ImageTag (with its URL resolved against the page URL) and a StringNode.
            CreateParser("<A HREF=\"mytest.html\"><IMG SRC=\"abcd.jpg\">Hello World</A>", "http://www.yahoo.com");

            // Register the link scanner together with the image scanner it creates
            LinkScanner linkScanner = new LinkScanner("-l");

            parser.AddScanner(linkScanner);
            parser.AddScanner(linkScanner.CreateImageScanner("-i"));

            ParseAndAssertNodeCount(1);
            Assert.IsTrue(node[0] is LinkTag, "Node should be a link node");

            LinkTag linkTag = (LinkTag)node[0];

            // Get the link data and cross-check.
            // The buffer is a real Node[] (not a covariant AbstractNode[]), and only the two nodes
            // inspected below are stored. Every child is still counted, so an unexpected child count
            // is reported by the assertion instead of an IndexOutOfRangeException.
            Node[] dataNode = new Node[2];
            int    i        = 0;

            foreach (Node nestedNode in linkTag)
            {
                if (i < dataNode.Length)
                {
                    dataNode[i] = nestedNode;
                }
                i++;
            }
            Assert.AreEqual(2, i, "Number of data nodes");
            Assert.IsTrue(dataNode[0] is ImageTag, "First data node should be an Image Node");
            Assert.IsTrue(dataNode[1] is StringNode, "Second data node shouls be a String Node");

            // Check the contents of each data node
            ImageTag imageTag = (ImageTag)dataNode[0];

            Assert.AreEqual("http://www.yahoo.com/abcd.jpg", imageTag.ImageURL, "Image URL");
            StringNode stringNode = (StringNode)dataNode[1];

            Assert.AreEqual("Hello World", stringNode.Text, "String Contents");
        }
        public void CompositeTagWithDeadlock()
        {
            // The first <custom> holds an <another> that is never closed before </custom>; the second
            // <custom> is well formed. Both are expected to come back as complete composite tags.
            CreateParser("<custom>" + "<another>something" + "</custom>" + "<custom>" + "<another>else</another>" +
                         "</custom>");
            parser.AddScanner(new AnotherScanner(true));
            CustomTag customTag = ParseCustomTag(2);

            // First custom tag: one child, located at the very start of the input
            Assert.AreEqual(1, customTag.ChildCount, "child count");
            Assert.IsFalse(customTag.EmptyXmlTag, "custom tag should not be xml end tag");
            Assert.AreEqual(0, customTag.StartTag.ElementBegin, "starting loc");
            Assert.AreEqual(7, customTag.StartTag.ElementEnd, "ending loc");
            Assert.AreEqual(1, customTag.tagData.StartLine, "starting line position");
            Assert.AreEqual(1, customTag.tagData.EndLine, "ending line position");
            AnotherTag anotherTag = (AnotherTag)customTag[0];

            // The unclosed <another> must still own its text child
            Assert.AreEqual(1, anotherTag.ChildCount, "anotherTag child count");
            StringNode stringNode = (StringNode)anotherTag[0];

            AssertStringEquals("anotherTag child text", "something", stringNode.ToPlainTextString());
            AssertStringEquals("first custom tag html", "<CUSTOM><ANOTHER>something</ANOTHER></CUSTOM>",
                               customTag.ToHtml());

            // Second custom tag
            customTag = (CustomTag)node[1];
            AssertStringEquals("second custom tag html", "<CUSTOM><ANOTHER>else</ANOTHER></CUSTOM>", customTag.ToHtml());
        }
        /// <summary>
        /// Appends the text of <paramref name="stringNode"/> to <c>textAccumulator</c>. Unless
        /// <c>preTagBeingProcessed</c> is set, the text is first run through <c>Translate.Decode</c>
        /// and then <c>ReplaceNonBreakingSpaceWithOrdinarySpace</c>.
        /// </summary>
        /// <param name="stringNode">Node whose <c>Text</c> is appended.</param>
        public override void VisitStringNode(StringNode stringNode)
        {
            string rawText = stringNode.Text;

            // While the pre-tag flag is set the text is appended untouched
            if (preTagBeingProcessed)
            {
                textAccumulator.Append(rawText);
                return;
            }

            string decoded    = Translate.Decode(rawText);
            string normalized = ReplaceNonBreakingSpaceWithOrdinarySpace(decoded);

            textAccumulator.Append(normalized);
        }
        public void EmptyTag3()
        {
            // An empty "<>" in the middle of text is expected to stay inside the text node
            // rather than being split off as a tag.
            CreateParser("<html><body>text<>text</body></html>");
            parser.RegisterScanners();

            ParseAndAssertNodeCount(5);

            // The node after <html> and <body> carries the text
            Assert.IsTrue(node[2] is StringNode, "Third node should be a string node");
            Assert.AreEqual("text<>text", ((StringNode)node[2]).Text, "Third node has incorrect text");
        }
        public void EmptyTag6()
        {
            // An empty "<>" followed by a line break is expected to stay inside the text node, with the
            // line break rendered using the configured line separator.
            string testHTML = "<html><body>text<>\ntext</body></html>";

            CreateParser(testHTML);
            parser.RegisterScanners();

            // LineSeparator is a static property on Parser, so the value set here would otherwise
            // remain in effect for every test that runs afterwards. Remember it and put it back.
            string previousSeparator = Parser.LineSeparator;

            Parser.LineSeparator = "\r\n";
            try
            {
                ParseAndAssertNodeCount(5);
                Assert.IsTrue(node[2] is StringNode, "Third node should be a string node");
                StringNode stringNode = (StringNode)node[2];
                string     actual     = stringNode.Text;

                Assert.AreEqual("text<>\r\ntext", actual, "Third node has incorrect text");
            }
            finally
            {
                Parser.LineSeparator = previousSeparator;
            }
        }
        public void ErroneousLinkBug()
        {
            // Markup under test: a mailto link whose closing tag was mistyped as a second "<a>".
            // NOTE(review): the address in the href looks like an obfuscation placeholder - confirm
            // it is the intended test data.
            string markup = "<p>Site Comments?<br>" + "<a href=\"mailto:[email protected]?subject=Site Comments\">" +
                            "Mail Us" + "<a>" + "</p>";

            CreateParser(markup);
            parser.RegisterScanners();
            ParseAndAssertNodeCount(6);

            // Node 0: a tag
            Assert.IsTrue(node[0] is Tag, "First node should be a Tag");

            // Node 1: the text that follows it
            Assert.IsTrue(node[1] is StringNode, "Second node should be a HTMLStringNode");
            Assert.AreEqual("Site Comments?", ((StringNode)node[1]).Text, "Text of the StringNode");

            // Node 2: a tag again
            Assert.IsTrue(node[2] is Tag, "Third node should be a tag");
        }
        public void MultipleLineBug()
        {
            // A link whose tag, and whose href value, are both broken across physical lines. The
            // expected link URL below has the line break inside the href removed.
            CreateParser("<LI><font color=\"FF0000\" size=-1><b>Tech Samachar:</b></font><a \n" +
                         "href=\"http://ads.samachar.com/bin/redirect/tech.txt?http://www.samachar.com/tech\n" +
                         "nical.html\"> Journalism 3.0</a> by Rajesh Jain");

            // LineSeparator is a static property on Parser, so the value set here would otherwise
            // remain in effect for every test that runs afterwards. Remember it and put it back.
            string previousSeparator = Parser.LineSeparator;

            Parser.LineSeparator = "\r\n";
            try
            {
                parser.AddScanner(new LinkScanner("-l"));
                ParseAndAssertNodeCount(8);
                Assert.IsTrue(node[6] is LinkTag, "Seventh node should be a link tag");
                LinkTag linkTag = (LinkTag)node[6];
                string  exp     = "http://ads.samachar.com/bin/redirect/tech.txt?http://www.samachar.com/technical.html";

                AssertStringEquals("Link URL of link tag", exp, linkTag.Link);
                Assert.AreEqual(" Journalism 3.0", linkTag.LinkText, "Link Text of link tag");
                Assert.IsTrue(node[7] is StringNode, "Eight node should be a string node");
                StringNode stringNode = (StringNode)node[7];

                Assert.AreEqual(" by Rajesh Jain", stringNode.Text, "String node contents");
            }
            finally
            {
                Parser.LineSeparator = previousSeparator;
            }
        }
        public void FreshMeatBug()
        {
            // An <a> without an href is expected to come back as three plain nodes
            // (tag, text, end tag) even though a link scanner is registered.
            CreateParser("<a>Revision</a>", "http://www.yahoo.com");
            // Register the link scanner
            parser.AddScanner(new LinkScanner("-l"));

            ParseAndAssertNodeCount(3);
            Assert.IsTrue(node[0] is Tag, "Node 0 should be a tag");
            Tag tag = (Tag)node[0];

            Assert.AreEqual("a", tag.Text, "Tag Contents");
            Assert.IsTrue(node[1] is StringNode, "Node 1 should be a string node");
            StringNode stringNode = (StringNode)node[1];

            Assert.AreEqual("Revision", stringNode.Text, "StringNode Contents");
            // The failure message now names the type that is actually being asserted
            Assert.IsTrue(node[2] is EndTag, "Node 2 should be an end tag");
            EndTag endTag = (EndTag)node[2];

            Assert.AreEqual("a", endTag.Text, "End Tag Contents");
        }
        public void CompositeTagWithOneTextChild()
        {
            // A custom composite tag wrapping plain text should expose that text as its only child.
            CreateParser("<Custom>" + "Hello" + "</Custom>");
            CustomTag customTag = ParseCustomTag(1);

            Assert.AreEqual(1, customTag.ChildCount, "child count");
            Assert.IsFalse(customTag.EmptyXmlTag, "custom tag should not be xml end tag");
            Assert.AreEqual(0, customTag.StartTag.ElementBegin, "starting loc");
            Assert.AreEqual(7, customTag.StartTag.ElementEnd, "ending loc");
            Assert.AreEqual(1, customTag.tagData.StartLine, "starting line position");
            Assert.AreEqual(1, customTag.tagData.EndLine, "ending line position");

            Node child = customTag[0];

            AssertType("child", typeof(StringNode), child);
            StringNode text = (StringNode)child;

            // Read the text through the typed reference, as the sibling composite-tag tests do
            AssertStringEquals("child text", "Hello", text.ToPlainTextString());
        }
        public void CompositeTagWithNestedTag()
        {
            // Input: a <Custom> containing an <Another> (with text) and an empty <Custom/>,
            // followed by a second, top-level empty <Custom/>.
            CreateParser("<Custom>" + "<Another>" + "Hello" + "</Another>" + "<Custom/>" + "</Custom>" + "<Custom/>");
            parser.AddScanner(new CustomScanner(this));
            parser.AddScanner(new AnotherScanner());
            ParseAndAssertNodeCount(2);

            // Both top-level nodes must be custom tags
            AssertType("first node", typeof(CustomTag), this.node[0]);
            AssertType("second node", typeof(CustomTag), this.node[1]);

            // Walk down: custom tag -> another tag -> text
            CustomTag outerTag   = (CustomTag)this.node[0];
            Node      firstChild = outerTag[0];

            AssertType("first child", typeof(AnotherTag), firstChild);
            AnotherTag innerTag = (AnotherTag)firstChild;

            Assert.AreEqual(1, innerTag.ChildCount, "another tag children count");

            Node nestedChild = innerTag[0];

            AssertType("nested child", typeof(StringNode), nestedChild);
            StringNode helloText = (StringNode)nestedChild;

            Assert.AreEqual("Hello", helloText.ToPlainTextString(), "text");
        }
        /// <summary>
        /// Searches the upper-cased text of <paramref name="stringNode"/> for <c>stringToFind</c> and
        /// updates <c>stringFound</c> and <c>foundCount</c>.
        /// When <c>multipleSearchesWithinStrings</c> is false, a node containing the search string
        /// counts once; when it is true, every occurrence (found by restarting one character after
        /// the previous hit) is counted.
        /// </summary>
        /// <remarks>
        /// Matching is ordinal, so the result does not depend on the current culture's comparison rules.
        /// NOTE(review): <c>stringToFind</c> is presumably upper-cased where it is assigned - confirm it
        /// uses the same casing rules as the <c>ToUpper()</c> call here.
        /// </remarks>
        /// <param name="stringNode">Node whose <c>Text</c> is searched.</param>
        public override void VisitStringNode(StringNode stringNode)
        {
            string stringToBeSearched = stringNode.Text.ToUpper();

            if (!multipleSearchesWithinStrings)
            {
                if (stringToBeSearched.IndexOf(stringToFind, System.StringComparison.Ordinal) != -1)
                {
                    stringFound = true;
                    foundCount++;
                }
                return;
            }

            int index = stringToBeSearched.IndexOf(stringToFind, System.StringComparison.Ordinal);

            while (index != -1)
            {
                // A hit in multiple-search mode also marks the string as found
                stringFound = true;
                foundCount++;

                // An empty search string matches at every position up to and including Length;
                // stop there, because a start index beyond Length makes IndexOf throw.
                if (index >= stringToBeSearched.Length)
                {
                    break;
                }
                index = stringToBeSearched.IndexOf(stringToFind, index + 1, System.StringComparison.Ordinal);
            }
        }
        /// <summary>
        /// Reads the text content that follows an XML-style tag. If <c>IsXMLTagFound</c> accepts
        /// <paramref name="node"/> for <paramref name="tagName"/>, consecutive string nodes are read
        /// from <paramref name="reader"/> and their text joined with single spaces.
        /// </summary>
        /// <param name="node">Node passed to <c>IsXMLTagFound</c> together with <paramref name="tagName"/>.</param>
        /// <param name="tagName">Tag name; also compared with the text of the end tag that stops the read.</param>
        /// <param name="reader">Source of the nodes that follow <paramref name="node"/>.</param>
        /// <returns>
        /// The joined text when the run of string nodes is ended by the end of input or by an end tag
        /// whose text equals <paramref name="tagName"/>; otherwise <c>null</c>.
        /// </returns>
        /// <exception cref="ParserException">Wraps any exception raised while extracting.</exception>
        public static string ExtractXMLData(Node node, string tagName, NodeReader reader)
        {
            try
            {
                if (!IsXMLTagFound(node, tagName))
                {
                    return(null);
                }

                string collectedText = "";

                try
                {
                    // Consume the run of string nodes; the first node of any other kind
                    // (or the end of input) stops the loop and is examined below.
                    for (node = reader.ReadElement(); node is StringNode; node = reader.ReadElement())
                    {
                        StringNode textNode = (StringNode)node;

                        if (collectedText.Length > 0)
                        {
                            collectedText += " ";
                        }
                        collectedText += textNode.Text;
                    }
                }
                catch (System.Exception e)
                {
                    throw new ParserException(
                              "HTMLTagScanner.extractXMLData() : error while trying to find xml tag", e);
                }

                // End of input: whatever was gathered is returned
                if (node == null)
                {
                    return(collectedText);
                }

                // Anything other than an end tag means the data was not cleanly terminated
                if (!(node is org.htmlparser.tags.EndTag))
                {
                    return(null);
                }

                org.htmlparser.tags.EndTag closingTag = (org.htmlparser.tags.EndTag)node;

                return(closingTag.Text.Equals(tagName) ? collectedText : null);
            }
            catch (System.Exception e)
            {
                throw new ParserException(
                          "HTMLTagScanner.extractXMLData() : Error occurred while trying to extract xml tag", e);
            }
        }
Example #13
0
        /// <summary> Locate the StringNode within the input string, by parsing from the given position.
        /// Characters are accumulated until a '&lt;' for which <c>BeginTag</c> returns true is reached
        /// (such a '&lt;' is not looked for while inside a quoted run when
        /// <paramref name="balance_quotes"/> is set), or until <paramref name="reader"/> has no more lines.
        /// </summary>
        /// <param name="reader">HTML reader to be provided so as to allow reading of next line
        /// </param>
        /// <param name="input">Input String
        /// </param>
        /// <param name="position">Position to start parsing from
        /// </param>
        /// <param name="balance_quotes">If <code>true</code> enter ignoring state on
        /// encountering quotes.
        ///
        /// </param>
        /// <returns>The node returned by <c>StringNode.CreateStringNode</c> for the accumulated text,
        /// the begin/end positions and <c>reader.Parser.ShouldDecodeNodes</c>.
        /// </returns>
        public virtual Node Find(NodeReader reader, string input, int position, bool balance_quotes)
        {
            StringBuilder textBuffer = new StringBuilder();
            int           state      = BEFORE_PARSE_BEGINS_STATE;
            int           textBegin  = position;
            int           textEnd    = position;
            int           inputLen   = input.Length;
            char          ch;
            // Quote character that will close the current ignore (quoted) run
            char          ignore_ender = '\"';

            for (int i = position; (i < inputLen && state != PARSE_COMPLETED_STATE); i++)
            {
                ch = input[i];
                // A '<' outside a quoted run ends the text, but only if BeginTag accepts it
                if (ch == '<' && state != PARSE_IGNORE_STATE)
                {
                    if (BeginTag(input, i))
                    {
                        state   = PARSE_COMPLETED_STATE;
                        // The text stops on the character before the '<'. Note that i indexes the
                        // current line, which may be a later line than the one textBegin refers to.
                        textEnd = i - 1;
                    }
                }
                // Quote balancing: an opening quote enters the ignore state, and only the same
                // quote character leaves it again
                if (balance_quotes && (ch == '\'' || ch == '"'))
                {
                    if (state == PARSE_IGNORE_STATE)
                    {
                        if (ch == ignore_ender)
                        {
                            state = PARSE_HAS_BEGUN_STATE;
                        }
                    }
                    else
                    {
                        ignore_ender = ch;
                        state        = PARSE_IGNORE_STATE;
                    }
                }
                // First character that neither completed the parse nor opened a quote: start collecting
                if (state == BEFORE_PARSE_BEGINS_STATE)
                {
                    state = PARSE_HAS_BEGUN_STATE;
                }
                // Collect the character (quote characters themselves are collected too)
                if (state == PARSE_HAS_BEGUN_STATE || state == PARSE_IGNORE_STATE)
                {
                    textBuffer.Append(input[i]);
                }
                // Patch by Cedric Rosa
                // NOTE(review): the check a few lines above has already moved state out of
                // BEFORE_PARSE_BEGINS_STATE, so this condition looks unreachable - confirm before removing.
                if (state == BEFORE_PARSE_BEGINS_STATE && i == inputLen - 1)
                {
                    state = PARSE_HAS_BEGUN_STATE;
                }
                // End of the current line reached while still collecting: continue on following lines.
                // NOTE(review): only PARSE_HAS_BEGUN_STATE triggers this read-ahead. If the line ends
                // while in PARSE_IGNORE_STATE the loop simply ends and textEnd keeps its earlier value -
                // confirm this is intended.
                if (state == PARSE_HAS_BEGUN_STATE && i == inputLen - 1)
                {
                    // Skip empty lines, recording one line separator for each of them
                    do
                    {
                        input = reader.GetNextLine();
                        if (input != null && input.Length == 0)
                        {
                            textBuffer.Append(Parser.LineSeparator);
                        }
                    } while (input != null && input.Length == 0);

                    if (input == null)
                    {
                        // No more lines: the text ends on the last character of the current line
                        textEnd = i;
                        state   = PARSE_COMPLETED_STATE;
                    }
                    else
                    {
                        // Record the line break before the new line, then restart scanning at its
                        // first character (i becomes 0 after the loop increment)
                        textBuffer.Append(Parser.LineSeparator);
                        inputLen = input.Length;
                        i        = -1;
                    }
                }
            }
            return(StringNode.CreateStringNode(textBuffer, textBegin, textEnd, reader.Parser.ShouldDecodeNodes));
        }
Example #14
0
 /// <summary>
 /// Overridable hook invoked with a <see cref="StringNode"/>; this implementation performs no work.
 /// </summary>
 /// <param name="stringNode">The string node handed to the hook.</param>
 public virtual void VisitStringNode(StringNode stringNode)
 {
     // Intentionally empty: derived classes override this to act on string nodes.
 }