Esempio n. 1
0
        /// <summary> Builds a corrective end tag named after the tag being scanned and stores it in
        /// <c>endTag</c>. The new end tag begins at <c>pos</c> and is created against <c>currLine</c>.
        /// </summary>
        /// <param name="pos">Position at which the synthesized end tag begins
        /// </param>
        private void CreateCorrectionEndTagBefore(int pos)
        {
            string correctionName = tag.TagName;
            // End position = begin + name length + 2 (presumably the room taken by the
            // surrounding end-tag delimiters - confirm against TagData).
            int correctionEnd = pos + correctionName.Length + 2;
            TagData correctionData = new TagData(pos, correctionEnd, correctionName, currLine);

            endTag = new EndTag(correctionData);
        }
Esempio n. 2
0
        /// <summary> Checks that <c>Tag.Find</c> yields a tag for an AREA element whose attributes
        /// continue after a line break.
        /// </summary>
        public void TagExtraction()
        {
            string areaMarkup = "<AREA \n coords=0,0,52,52 href=\"http://www.yahoo.com/r/c1\" shape=RECT>";

            CreateParser(areaMarkup);

            Assert.IsNotNull(Tag.Find(parser.Reader, areaMarkup, 0));
        }
Esempio n. 3
0
        /// <summary> When <c>IsEndTagMissing</c> reports true for the given tag, creates a corrective
        /// end tag in front of it and marks the end tag as found.
        /// </summary>
        /// <param name="possibleEndTagCauser">Tag that may force the current tag to be closed
        /// </param>
        private void DoForceCorrectionCheckOn(Tag possibleEndTagCauser)
        {
            if (!IsEndTagMissing(possibleEndTagCauser))
            {
                // Nothing to correct for this tag.
                return;
            }

            CreateCorrectionEndTagBefore(possibleEndTagCauser);
            endTagFound = true;
        }
Esempio n. 4
0
        /// <summary> Creates a <c>MetaTag</c> from the HTTP-EQUIV, NAME and CONTENT attributes of the
        /// scanned tag.
        /// </summary>
        /// <param name="tagData">Tag data passed on to the MetaTag constructor
        /// </param>
        /// <param name="tag">Tag whose attribute table is read
        /// </param>
        /// <param name="url">Not used by this implementation
        /// </param>
        /// <returns>The newly created MetaTag
        /// </returns>
        protected override Tag CreateTag(TagData tagData, Tag tag, string url)
        {
            System.Collections.Hashtable attributes = tag.Attributes;

            return new MetaTag(
                tagData,
                (string)attributes["HTTP-EQUIV"],
                (string)attributes["NAME"],
                (string)attributes["CONTENT"]);
        }
        /// <summary> Checks that <c>LinkScanner.ExtractLink</c> returns the href value of an anchor
        /// whose href attribute is written without quotes.
        /// </summary>
        public void ExtractLinkInvertedCommasBug()
        {
            Tag         unquotedAnchor = new Tag(new TagData(0, 0, "a href=r/anorth/top.html", ""));
            LinkScanner linkScanner    = new LinkScanner("-l");

            string extracted = linkScanner.ExtractLink(unquotedAnchor, "c:\\cvs\\html\\binaries\\yahoo.htm");

            Assert.AreEqual("r/anorth/top.html", extracted, "Extracted Link");
        }
        /// <summary> Returns a copy of <c>currentLine</c> in which the characters from
        /// <c>tag.ElementBegin</c> through <c>tag.ElementEnd</c> are replaced by an end tag
        /// carrying <c>tag.TagName</c>.
        /// </summary>
        /// <param name="tag">Tag whose span in the line is replaced
        /// </param>
        /// <param name="currentLine">Line that contains the tag
        /// </param>
        /// <returns>The rewritten line
        /// </returns>
        public virtual string ReplaceFaultyTagWithEndTag(Tag tag, string currentLine)
        {
            string beforeTag  = currentLine.Substring(0, tag.ElementBegin);
            string closingTag = "</" + tag.TagName + ">";
            // Everything after the last character of the faulty tag.
            string afterTag   = currentLine.Substring(tag.ElementEnd + 1);

            return beforeTag + closingTag + afterTag;
        }
        /// <summary> Checks that <c>LinkScanner.InsertEndTagBeforeNode</c> places an end tag in front
        /// of an anchor that starts at the beginning of the line.
        /// </summary>
        public void InsertEndTagBeforeTag()
        {
            string line   = "<a href=s/7509><b>Yahoo! Movies</b></a>";
            Tag    anchor = new Tag(new TagData(0, 14, "a href=s/7509", line));

            string corrected = new LinkScanner().InsertEndTagBeforeNode(anchor, line);

            Assert.AreEqual("</A><a href=s/7509><b>Yahoo! Movies</b></a>", corrected, "Expected insertion");
        }
Esempio n. 8
0
        /// <summary> Asserts that both tags have the same tag name. The failure text is a fixed
        /// mismatch description followed by the caller-supplied message.
        /// </summary>
        /// <param name="nextExpectedTag">Tag providing the expected name
        /// </param>
        /// <param name="nextActualTag">Tag providing the actual name
        /// </param>
        /// <param name="displayMessage">Text appended to the mismatch description
        /// </param>
        private void AssertTagNameMatches(Tag nextExpectedTag, Tag nextActualTag, string displayMessage)
        {
            string wanted = nextExpectedTag.TagName;
            string found  = nextActualTag.TagName;
            string failureText =
                "The tag names did not match: Expected " + wanted + " \nbut was " + found + displayMessage;

            AssertStringEquals(failureText, wanted, found);
        }
Esempio n. 9
0
        /// <summary> Builds a corrective end tag (named after the tag being scanned) at the position
        /// where <c>possibleEndTagCauser</c> begins, moves that tag's <c>TagBegin</c> behind the new
        /// end tag, hands the tag back to the reader and stores the end tag in <c>endTag</c>.
        /// </summary>
        /// <param name="possibleEndTagCauser">Tag in front of which the end tag is created
        /// </param>
        private void CreateCorrectionEndTagBefore(Tag possibleEndTagCauser)
        {
            string correctionName  = tag.TagName;
            // Read the begin position before TagBegin is reassigned below.
            int    correctionBegin = possibleEndTagCauser.ElementBegin;
            int    correctionEnd   = correctionBegin + correctionName.Length + 2;

            // The causing tag now starts right after the synthesized end tag.
            possibleEndTagCauser.TagBegin = correctionEnd + 1;
            reader.AddNextParsedNode(possibleEndTagCauser);

            TagData correctionData = new TagData(correctionBegin, correctionEnd, correctionName, currLine);
            endTag = new EndTag(correctionData);
        }
        /// <summary> Returns the upper-cased METHOD attribute of the tag, using
        /// <c>FormTag.GET</c> when the attribute is absent.
        /// </summary>
        /// <param name="tag">Tag whose METHOD attribute is read
        /// </param>
        /// <returns>The form method in upper case
        /// </returns>
        public virtual string ExtractFormMethod(Tag tag)
        {
            string declared  = tag["METHOD"];
            string effective = (declared != null) ? declared : FormTag.GET;

            return effective.ToUpper();
        }
Esempio n. 11
0
 /// <summary> When <c>expected</c> is a Tag, asserts that <c>actual</c> has the same tag name and
 /// matching attributes. Does nothing when <c>expected</c> is not a Tag.
 /// </summary>
 /// <param name="expected">Node holding the expected tag
 /// </param>
 /// <param name="actual">Node cast to Tag for the comparison
 /// </param>
 /// <param name="displayMessage">Message handed on to the individual assertions
 /// </param>
 public void AssertTagEquals(Node expected, Node actual, string displayMessage)
 {
     if (!(expected is Tag))
     {
         return;
     }

     Tag wantedTag = (Tag)expected;
     Tag foundTag  = (Tag)actual;

     AssertTagNameMatches(wantedTag, foundTag, displayMessage);
     AssertAttributesMatch(wantedTag, foundTag, displayMessage);
 }
 /// <summary> Starts tag parsing when a '&lt;' is met in the before-parsing or illegal state:
 /// records <c>i</c> as the tag's begin position and returns the begin-parsing state.
 /// In every other case the state is returned unchanged.
 /// </summary>
 /// <param name="i">Index of the current character
 /// </param>
 /// <param name="state">Current parsing state
 /// </param>
 /// <param name="ch">Current character
 /// </param>
 /// <param name="tag">Tag whose begin position is recorded
 /// </param>
 /// <returns>The resulting parsing state
 /// </returns>
 private int CheckBeginParsingState(int i, int state, char ch, Tag tag)
 {
     bool opensTag       = ch == '<';
     bool mayStartInHere = state == TAG_BEFORE_PARSING_STATE || state == TAG_ILLEGAL_STATE;

     if (!opensTag || !mayStartInHere)
     {
         return state;
     }

     // From here on data is recorded until '>' is encountered.
     tag.TagBegin = i;
     return TAG_BEGIN_PARSING_STATE;
 }
        /// <summary> Returns the illegal state when <c>ch</c> is a '/' directly preceded by '&lt;' in
        /// the tag's line, unless the current state is one of the two ignore states. Otherwise the
        /// state is returned unchanged.
        /// </summary>
        /// <param name="i">Index of the current character in <c>tag.TagLine</c>
        /// </param>
        /// <param name="state">Current parsing state
        /// </param>
        /// <param name="ch">Current character
        /// </param>
        /// <param name="tag">Tag whose line is inspected
        /// </param>
        /// <returns>The resulting parsing state
        /// </returns>
        private int CheckIllegalState(int i, int state, char ch, Tag tag)
        {
            // Only a '/' that has a predecessor can form "</".
            if (ch != '/' || i <= 0)
            {
                return state;
            }
            if (tag.TagLine[i - 1] != '<')
            {
                return state;
            }
            if (state == TAG_IGNORE_DATA_STATE || state == TAG_IGNORE_BEGIN_TAG_STATE)
            {
                return state;
            }

            return TAG_ILLEGAL_STATE;
        }
Esempio n. 14
0
 /// <summary> When <c>currentNode</c> is a Tag and <c>IsXmlEndTag</c> reports true for the tag
 /// being scanned, records <c>currentNode</c> as the end tag and marks the end tag as found.
 /// </summary>
 /// <param name="currentNode">Node that may serve as the end tag
 /// </param>
 private void DoEmptyXmlTagCheckOn(Node currentNode)
 {
     if (!(currentNode is Tag))
     {
         return;
     }

     // NOTE(review): the check runs on the 'tag' field, not on currentNode - confirm this is intended.
     if (IsXmlEndTag(tag))
     {
         endTag      = (Tag)currentNode;
         endTagFound = true;
     }
 }
        /// <summary> Checks that <c>LinkScanner.ReplaceFaultyTagWithEndTag</c> turns the stray
        /// opening anchor near the end of the line into an end tag.
        /// </summary>
        public void ReplaceFaultyTagWithEndTag()
        {
            string line =
                "<p>Site Comments?<br><a href=\"mailto:[email protected]?subject=Site Comments\">Mail Us<a></p>";
            string expectedLine =
                "<p>Site Comments?<br><a href=\"mailto:[email protected]?subject=Site Comments\">Mail Us</A></p>";

            // The faulty tag is described as spanning positions 85 to 87 of the line.
            Tag    faultyAnchor = new Tag(new TagData(85, 87, "a", line));
            string actualLine   = new LinkScanner().ReplaceFaultyTagWithEndTag(faultyAnchor, line);

            Assert.AreEqual(expectedLine, actualLine, "Expected replacement");
        }
        /// <summary> Parses a link that wraps an image, a font tag and a bold tag, then checks the
        /// link URL, the (empty) link text and each of the five nodes contained in the link tag.
        /// </summary>
        public void LinkDataContents()
        {
            string expectedLink =
                "http://transfer.go.com/cgi/atransfer.pl?goto=http://www.signs.movies.com&name=114332&srvc=nws&context=283&guid=4AD5723D-C802-4310-A388-0B24E1A79689";
            string expectedImage = "http://ad.abcnews.com/ad/sponsors/buena_vista_pictures/bvpi-ban0003.gif";

            CreateParser(
                "<a href=\"http://transfer.go.com/cgi/atransfer.pl?goto=http://www.signs.movies.com&name=114332&srvc=nws&context=283&guid=4AD5723D-C802-4310-A388-0B24E1A79689\" target=\"_new\"><img src=\"http://ad.abcnews.com/ad/sponsors/buena_vista_pictures/bvpi-ban0003.gif\" width=468 height=60 border=\"0\" alt=\"See Signs in Theaters 8-2 - Starring Mel Gibson\" align=><font face=\"verdana,arial,helvetica\" SIZE=\"1\"><b></b></font></a>",
                "http://transfer.go.com");

            // Register the link scanner together with the image scanner it creates.
            LinkScanner linkScanner = new LinkScanner("-l");
            parser.AddScanner(linkScanner);
            parser.AddScanner(linkScanner.CreateImageScanner("-i"));

            ParseAndAssertNodeCount(1);
            Assert.IsTrue(node[0] is LinkTag, "Node 0 should be a link tag");
            LinkTag link = (LinkTag)node[0];

            Assert.AreEqual(expectedLink, link.Link, "Link URL");
            Assert.AreEqual("", link.LinkText, "Link Text");

            // Collect the nodes nested in the link so they can be addressed by position.
            Node[] nested      = new AbstractNode[10];
            int    nestedCount = 0;
            foreach (Node nestedNode in link)
            {
                nested[nestedCount] = nestedNode;
                nestedCount++;
            }
            Assert.AreEqual(5, nestedCount, "There should be 5 contained nodes in the link tag");

            Assert.IsTrue(nested[0] is ImageTag, "First contained node should be an image tag");
            ImageTag banner = (ImageTag)nested[0];
            Assert.AreEqual(expectedImage, banner.ImageURL, "Image Location");
            Assert.AreEqual("60", banner["HEIGHT"], "Image Height");
            Assert.AreEqual("468", banner["WIDTH"], "Image Width");
            Assert.AreEqual("0", banner["BORDER"], "Image Border");
            Assert.AreEqual("See Signs in Theaters 8-2 - Starring Mel Gibson", banner["ALT"], "Image Alt");

            Assert.IsTrue(nested[1] is Tag, "Second contained node should be Tag");
            Tag fontTag = (Tag)nested[1];
            Assert.AreEqual("font face=\"verdana,arial,helvetica\" SIZE=\"1\"", fontTag.Text, "Tag Contents");

            Assert.IsTrue(nested[2] is Tag, "Third contained node should be Tag");
            Tag boldTag = (Tag)nested[2];
            Assert.AreEqual("b", boldTag.Text, "Tag Contents");

            Assert.IsTrue(nested[3] is EndTag, "Fourth contained node should be HTMLEndTag");
            EndTag boldEnd = (EndTag)nested[3];
            Assert.AreEqual("b", boldEnd.Text, "Fourth Tag contents");

            Assert.IsTrue(nested[4] is EndTag, "Fifth contained node should be HTMLEndTag");
            EndTag fontEnd = (EndTag)nested[4];
            Assert.AreEqual("font", fontEnd.Text, "Fifth Tag contents");
        }
        /// <summary> Creates a <c>BaseHrefTag</c> from the HREF attribute of the scanned tag. A
        /// non-empty HREF is trimmed, passed through <c>LinkProcessor.RemoveLastSlash</c> and also
        /// stored as the processor's base URL; otherwise the tag is created with an empty URL.
        /// </summary>
        /// <param name="tagData">Tag data passed on to the BaseHrefTag constructor
        /// </param>
        /// <param name="tag">Tag whose HREF attribute is read
        /// </param>
        /// <param name="url">Not used by this implementation
        /// </param>
        /// <returns>The newly created BaseHrefTag
        /// </returns>
        protected override Tag CreateTag(TagData tagData, Tag tag, string url)
        {
            string href = tag["HREF"];

            if (href == null || href.Length == 0)
            {
                // No usable HREF: the processor's base URL is left untouched.
                return new BaseHrefTag(tagData, "");
            }

            string cleanedBase = LinkProcessor.RemoveLastSlash(href.Trim());
            processor.BaseUrl = cleanedBase;

            return new BaseHrefTag(tagData, cleanedBase);
        }
 /// <summary> Appends <c>ch</c> to the tag while the parser is in the ignore-data, begin-parsing
 /// or ignore-begin-tag state. A '?' additionally sets <c>encounteredQuery.Boolean</c>.
 /// </summary>
 /// <param name="encounteredQuery">Flag holder set to true when a '?' is appended
 /// </param>
 /// <param name="state">Current parsing state
 /// </param>
 /// <param name="ch">Current character
 /// </param>
 /// <param name="tag">Tag receiving the character
 /// </param>
 private void CheckIfAppendable(Bool encounteredQuery, int state, char ch, Tag tag)
 {
     bool collecting = state == TAG_IGNORE_DATA_STATE ||
                       state == TAG_BEGIN_PARSING_STATE ||
                       state == TAG_IGNORE_BEGIN_TAG_STATE;

     if (!collecting)
     {
         return;
     }

     if (ch == '?')
     {
         encounteredQuery.Boolean = true;
     }
     tag.Append(ch);
 }
Esempio n. 19
0
 /// <summary> Stores the start and end tag and copies the given children into a new
 /// <c>NodeList</c>. A null <c>children</c> argument results in an empty list.
 /// </summary>
 /// <param name="startTag">Start tag of the composite tag
 /// </param>
 /// <param name="endTag">End tag of the composite tag
 /// </param>
 /// <param name="children">Child nodes to copy; may be null
 /// </param>
 public CompositeTagData(Tag startTag, Tag endTag, NodeList children)
 {
     this.startTag = startTag;
     this.endTag   = endTag;

     NodeList copiedChildren = new NodeList();
     if (children != null)
     {
         foreach (Node child in children)
         {
             copiedChildren.Add(child);
         }
     }
     this.children = copiedChildren;
 }
Esempio n. 20
0
 /// <summary> Stores the constructor arguments in the corresponding fields and resets the scan
 /// state: no end tag, an empty node list and <c>endTagFound</c> set to false.
 /// </summary>
 /// <param name="scanner">Stored in the <c>scanner</c> field
 /// </param>
 /// <param name="tag">Stored in the <c>tag</c> field
 /// </param>
 /// <param name="url">Stored in the <c>url</c> field
 /// </param>
 /// <param name="reader">Stored in the <c>reader</c> field
 /// </param>
 /// <param name="currLine">Stored in the <c>currLine</c> field
 /// </param>
 /// <param name="balance_quotes">Stored in the <c>balance_quotes</c> field
 /// </param>
 public CompositeTagScannerHelper(CompositeTagScanner scanner, Tag tag, string url, NodeReader reader,
                                  string currLine, bool balance_quotes)
 {
     // Values supplied by the caller.
     this.scanner        = scanner;
     this.tag            = tag;
     this.url            = url;
     this.reader         = reader;
     this.currLine       = currLine;
     this.balance_quotes = balance_quotes;

     // Initial scan state.
     this.nodeList    = new NodeList();
     this.endTag      = null;
     this.endTagFound = false;
 }
        /// <summary> Tells whether the name of the given tag is listed in the matching ender set:
        /// <c>endTagEnderSet</c> for an <c>EndTag</c>, <c>tagEnderSet</c> for any other tag.
        /// </summary>
        /// <param name="tag">Tag whose name is looked up
        /// </param>
        /// <returns>true when the tag name is contained in the matching set
        /// </returns>
        public bool IsTagToBeEndedFor(Tag tag)
        {
            string name = tag.TagName;

            if (tag is EndTag)
            {
                return endTagEnderSet.Contains(name);
            }
            return tagEnderSet.Contains(name);
        }
        /// <summary> Tells whether <c>node</c> is a Tag whose upper-cased text begins with
        /// <c>tagName</c>. The given name is compared as passed in; it is not upper-cased here.
        /// </summary>
        /// <param name="node">Node to inspect
        /// </param>
        /// <param name="tagName">Name expected at the start of the tag text
        /// </param>
        /// <returns>true when the node is a matching tag, false otherwise
        /// </returns>
        public static bool IsXMLTagFound(Node node, string tagName)
        {
            Tag candidate = node as Tag;

            if (candidate == null)
            {
                return false;
            }
            return candidate.Text.ToUpper().IndexOf(tagName) == 0;
        }
Esempio n. 23
0
 /// <summary> Records <c>currentNode</c> as the end tag when it is an <c>EndTag</c> accepted by
 /// <c>IsExpectedEndTag</c>. Any other node is added to the node list and reported to the
 /// scanner as a child.
 /// </summary>
 /// <param name="currentNode">Node that was just read
 /// </param>
 private void DoChildAndEndTagCheckOn(Node currentNode)
 {
     EndTag closer = currentNode as EndTag;

     if (closer != null && IsExpectedEndTag(closer))
     {
         endTag      = closer;
         endTagFound = true;
     }
     else
     {
         nodeList.Add(currentNode);
         scanner.ChildNodeEncountered(currentNode);
     }
 }
Esempio n. 24
0
 /// <summary> When <c>node</c> is a Tag flagged as an empty XML tag, inserts a matching end tag
 /// into the reader's current line right after the last read position and gives the changed
 /// line back to the reader.
 /// </summary>
 /// <param name="parser">Parser whose reader supplies and receives the line
 /// </param>
 /// <param name="node">Node that was just parsed
 /// </param>
 private void FixIfXmlEndTag(Parser parser, Node node)
 {
     if (!(node is Tag))
     {
         return;
     }

     Tag tag = (Tag)node;
     if (!tag.EmptyXmlTag)
     {
         return;
     }

     // Add end tag
     string line    = parser.Reader.CurrentLine;
     int    splitAt = parser.Reader.LastReadPosition + 1;
     string head    = line.Substring(0, splitAt);
     string tail    = line.Substring(splitAt);

     parser.Reader.ChangeLine(head + "</" + tag.TagName + ">" + tail);
 }
        /// <summary> Rewrites the tag text: all double quotes are removed first, then
        /// <c>InsertInvertedCommasCorrectly</c> produces the text that is stored back in the tag.
        /// </summary>
        /// <param name="tag">Tag whose text is corrected in place
        /// </param>
        public virtual void CorrectTag(Tag tag)
        {
            // Strip every double quote before the next stage re-inserts them.
            StringBuilder unquoted = new StringBuilder(tag.Text.Replace("\"", ""));

            tag.Text = InsertInvertedCommasCorrectly(unquoted).ToString();
        }
Esempio n. 26
0
        /// <summary> Extracts the location of the image from the SRC attribute of the given tag and
        /// resolves it through the link processor against the url of the page being parsed.
        /// If no usable SRC value is found but the tag text contains "SRC", an equals sign is
        /// inserted after it and the attributes are parsed again.
        /// </summary>
        /// <param name="tag">Image tag to read the SRC attribute from
        /// </param>
        /// <param name="url">URL of web page being parsed
        /// </param>
        /// <returns>The processed image location, or an empty string when there is no SRC value
        /// </returns>
        /// <exception cref="ParserException">Wraps any exception raised during extraction
        /// </exception>
        public virtual string ExtractImageLocn(Tag tag, string url)
        {
            string relativeLink = null;

            try
            {
                table        = tag.Attributes;
                relativeLink = (string)table["SRC"];
                if (relativeLink != null)
                {
                    // Drop line breaks embedded in the attribute value.
                    relativeLink = ParserUtils.RemoveChars(relativeLink, '\n');
                    relativeLink = ParserUtils.RemoveChars(relativeLink, '\r');
                }

                bool srcUnusable = relativeLink == null || relativeLink.Length == 0;
                if (srcUnusable)
                {
                    // try fix
                    int srcAt = tag.Text.ToUpper().IndexOf("SRC");
                    if (srcAt != -1)
                    {
                        // There is a missing equals.
                        string text     = tag.Text;
                        int    afterSrc = srcAt + 3;
                        tag.Text     = text.Substring(0, afterSrc) + "=" + text.Substring(afterSrc);
                        table        = tag.RedoParseAttributes();
                        relativeLink = (string)table["SRC"];
                    }
                }

                if (relativeLink != null)
                {
                    return processor.Extract(relativeLink, url);
                }
                return "";
            }
            catch (System.Exception e)
            {
                string failure =
                    "HTMLImageScanner.ExtractImageLocn() : Error in extracting image location, relativeLink = " +
                    relativeLink + ", url = " + url;
                throw new ParserException(failure, e);
            }
        }
        /// <summary> Parses a custom composite tag containing a single child tag and checks the
        /// child count, the empty-XML flag, the element positions of the start tag and of the
        /// composite tag, and the HTML rendered for the child.
        /// </summary>
        public void CompositeTagWithTagChild()
        {
            CreateParser("<Custom>" + "<Hello>" + "</Custom>");
            CustomTag customTag = ParseCustomTag(1);

            Assert.AreEqual(1, customTag.ChildCount, "child count");
            Assert.IsFalse(customTag.EmptyXmlTag, "custom tag should not be xml end tag");
            Assert.AreEqual(0, customTag.StartTag.ElementBegin, "starting loc");
            Assert.AreEqual(7, customTag.StartTag.ElementEnd, "ending loc");
            Assert.AreEqual(0, customTag.ElementBegin, "custom tag starting loc");
            Assert.AreEqual(23, customTag.ElementEnd, "custom tag ending loc");

            Node child = customTag[0];

            // AssertType already verifies the child's type, so no cast into an unused local is needed.
            AssertType("child", typeof(Tag), child);
            AssertStringEquals("child html", "<HELLO>", child.ToHtml());
        }
        /// <summary> Parses an anchor without an href while a link scanner is registered and checks
        /// that it comes out as three plain nodes: a tag, a string node and an end tag.
        /// </summary>
        public void FreshMeatBug()
        {
            CreateParser("<a>Revision</a>", "http://www.yahoo.com");
            // Register the link scanner
            parser.AddScanner(new LinkScanner("-l"));

            ParseAndAssertNodeCount(3);
            Assert.IsTrue(node[0] is Tag, "Node 0 should be a tag");
            Tag tag = (Tag)node[0];

            Assert.AreEqual("a", tag.Text, "Tag Contents");
            Assert.IsTrue(node[1] is StringNode, "Node 1 should be a string node");
            StringNode stringNode = (StringNode)node[1];

            Assert.AreEqual("Revision", stringNode.Text, "StringNode Contents");
            // The failure message now names the type that is actually checked.
            Assert.IsTrue(node[2] is EndTag, "Node 2 should be an end tag");
            EndTag endTag = (EndTag)node[2];

            Assert.AreEqual("a", endTag.Text, "End Tag Contents");
        }
Esempio n. 29
0
        /// <summary> Renders a tag as text: a header line made of the tag name followed by " TAG",
        /// a separator line, and one "key : value" line for every attribute other than the tag
        /// name entry that has a non-empty value.
        /// </summary>
        /// <param name="tag">Tag to describe
        /// </param>
        /// <returns>The textual description of the tag
        /// </returns>
        public static string ToString(Tag tag)
        {
            string tagName = tag[Tag.TAGNAME];

            System.Collections.Hashtable attrs = tag.Attributes;

            System.Text.StringBuilder lString = new System.Text.StringBuilder(tagName);
            lString.Append(" TAG\n");
            lString.Append("--------\n");

            // The key to skip does not change during the loop, so upper-case it once.
            string tagNameKey = Tag.TAGNAME.ToUpper();

            foreach (string key in attrs.Keys)
            {
                string value = (string)attrs[key];

                // A Hashtable may hold null values; treat them like empty ones instead of
                // failing on value.Length.
                if (value == null || value.Length == 0)
                {
                    continue;
                }
                if (key.ToUpper().Equals(tagNameKey))
                {
                    continue;
                }
                lString.Append(key).Append(" : ").Append(value).Append("\n");
            }

            return(lString.ToString());
        }
        /// <summary> Advances the character index. When the parser is inside a tag (begin-parsing,
        /// ignore-data or ignore-begin-tag state) and <c>i</c> is the last index of the tag's
        /// current line, the next non-empty line is fetched from the reader and installed as the
        /// tag's line, and the index restarts at 0.
        /// </summary>
        /// <param name="i">Index of the character just processed
        /// </param>
        /// <param name="reader">Reader that supplies the following lines
        /// </param>
        /// <param name="state">Current parsing state
        /// </param>
        /// <param name="tag">Tag being assembled
        /// </param>
        /// <returns>The index of the next character: <c>i + 1</c>, or 0 after switching lines
        /// </returns>
        public virtual int IncrementCounter(int i, NodeReader reader, int state, Tag tag)
        {
            string nextLine = null;

            if ((state == TAG_BEGIN_PARSING_STATE || state == TAG_IGNORE_DATA_STATE ||
                 state == TAG_IGNORE_BEGIN_TAG_STATE) && i == tag.TagLine.Length - 1)
            {
                // The while loop below is a bug fix contributed by
                // Annette Doyle - see testcase HTMLImageScannerTest.testImageTagOnMultipleLines()
                // Further modified by Somik Raha, to remove bug - HTMLTagTest.testBrokenTag
                // Skip over empty lines, counting every line that was read.
                int numLinesAdvanced = 0;
                do
                {
                    nextLine = reader.GetNextLine();
                    numLinesAdvanced++;
                } while (nextLine != null && nextLine.Length == 0);
                if (nextLine == null)
                {
                    // This means we have a broken tag. Fill in an end tag symbol here.
                    nextLine = ">";
                }
                else
                {
                    // This means this is just a new line, hence add the new line character
                    tag.Append(Parser.LineSeparator);
                }

                // Ensure blank lines are included in tag's 'tagLines'
                // One empty assignment per skipped line, i.e. numLinesAdvanced - 1 times.
                // (presumably the TagLine setter records every assigned line - TODO confirm)
                while (--numLinesAdvanced > 0)
                {
                    tag.TagLine = "";
                }

                // We need to continue parsing to the next line
                tag.TagLine = nextLine;
                // -1 so that the pre-increment in the return statement yields index 0.
                i           = -1;
            }
            return(++i);
        }