/// <summary> Drives the tag automaton over <paramref name="input"/>, starting at
        /// <paramref name="position"/>, until it reports a finished or illegal state or the
        /// current tag line is exhausted.
        /// </summary>
        /// <param name="reader">Reader supplying the current line number; also handed to
        /// IncrementCounter, which may pull further lines from it.
        /// </param>
        /// <param name="input">Line of text the tag is searched in.
        /// </param>
        /// <param name="position">Index in <paramref name="input"/> at which parsing starts.
        /// </param>
        /// <returns>The parsed tag when the automaton ends in TAG_FINISHED_PARSING_STATE,
        /// otherwise <c>null</c>.
        /// </returns>
        public virtual Tag Find(NodeReader reader, string input, int position)
        {
            // One-slot buffer handed to AutomataInput; holds the character being looked for
            // while in TAG_IGNORE_DATA_STATE.
            char[] pendingDelimiter = new char[1];
            Tag    candidate        = new Tag(new TagData(position, 0, reader.LastLineNumber, 0, "", input, "", false));
            Bool   sawQuery         = new Bool(false);

            int parserState = TAG_BEFORE_PARSING_STATE;
            int cursor      = position;

            // TagLine is re-read on every pass because IncrementCounter may replace it.
            while (cursor < candidate.TagLine.Length &&
                   parserState != TAG_FINISHED_PARSING_STATE &&
                   parserState != TAG_ILLEGAL_STATE)
            {
                char current = candidate.TagLine[cursor];
                parserState = AutomataInput(sawQuery, cursor, parserState, current, candidate, cursor, pendingDelimiter);
                cursor      = IncrementCounter(cursor, reader, parserState, candidate);
            }

            if (parserState != TAG_FINISHED_PARSING_STATE)
            {
                return null;
            }

            // The cursor sits one past the last consumed character, so cursor - 2 is the
            // character just before it; a '/' there marks the tag as an empty XML tag.
            string finalLine = candidate.TagLine;
            if (cursor > 1 && finalLine[cursor - 2] == '/')
            {
                candidate.EmptyXmlTag = true;
                string body = candidate.Text;
                // Drop the last character of the collected tag text.
                candidate.Text = body.Substring(0, body.Length - 1);
            }
            return candidate;
        }
        /// <summary> Scans a composite tag by delegating to a freshly created
        /// CompositeTagScannerHelper, passing along this scanner and its balance_quotes setting.
        /// </summary>
        /// <param name="tag">Tag that triggered this scanner.</param>
        /// <param name="url">URL forwarded to the helper.</param>
        /// <param name="reader">Reader forwarded to the helper.</param>
        /// <param name="currLine">Current line forwarded to the helper.</param>
        /// <returns>Whatever the helper's Scan() returns.</returns>
        public override Tag Scan(Tag tag, string url, NodeReader reader, string currLine)
        {
            return new CompositeTagScannerHelper(this, tag, url, reader, currLine, balance_quotes).Scan();
        }
        /// <summary> Rewrites the reader's current line with the result of
        /// InsertEndTagBeforeNode and returns a new EndTag for <paramref name="tag"/>.
        /// </summary>
        /// <param name="tag">Tag whose ElementBegin and TagName describe the new end tag.</param>
        /// <param name="reader">Reader whose line is replaced via ChangeLine.</param>
        /// <param name="currentLine">The line as it was before the rewrite; also stored in the
        /// returned end tag's TagData.</param>
        /// <returns>An EndTag spanning tag.ElementBegin to tag.ElementBegin + 3.</returns>
        protected virtual Tag GetInsertedEndTag(Tag tag, NodeReader reader, string currentLine)
        {
            // Push the patched line to the reader first, then describe the new end tag.
            reader.ChangeLine(InsertEndTagBeforeNode(tag, currentLine));

            TagData endTagData = new TagData(tag.ElementBegin, tag.ElementBegin + 3, tag.TagName, currentLine);
            return new EndTag(endTagData);
        }
        /// <summary> Rewrites the reader's current line with the result of
        /// ReplaceFaultyTagWithEndTag and returns a new EndTag for <paramref name="tag"/>.
        /// </summary>
        /// <param name="tag">Tag being replaced; its ElementBegin and TagName describe the
        /// new end tag.</param>
        /// <param name="reader">Reader whose line is replaced via ChangeLine.</param>
        /// <param name="currentLine">The line as it was before the rewrite; also stored in the
        /// returned end tag's TagData.</param>
        /// <returns>An EndTag spanning tag.ElementBegin to tag.ElementBegin + 3.</returns>
        protected virtual Tag getReplacedEndTag(Tag tag, NodeReader reader, string currentLine)
        {
            // The faulty tag (e.g. a stray opening tag such as <A>) is swapped for its
            // closing form in the line text before the end tag node is built.
            reader.ChangeLine(ReplaceFaultyTagWithEndTag(tag, currentLine));

            TagData endTagData = new TagData(tag.ElementBegin, tag.ElementBegin + 3, tag.TagName, currentLine);
            return new EndTag(endTagData);
        }
        /// <summary> Runs <c>Scan</c> and stamps the resulting tag with this scanner and with
        /// the attributes of the tag that was scanned.
        /// </summary>
        /// <param name="tag">Source tag; its Attributes are copied onto the result.</param>
        /// <param name="url">URL forwarded to Scan.</param>
        /// <param name="reader">Reader forwarded to Scan.</param>
        /// <param name="currLine">Current line forwarded to Scan.</param>
        /// <returns>The tag produced by Scan, with ThisScanner and Attributes set.</returns>
        public Tag CreateScannedNode(Tag tag, string url, NodeReader reader, string currLine)
        {
            Tag scanned = Scan(tag, url, reader, currLine);

            // The result is used without a null check, so a Scan override that returns
            // null makes the next statement throw.
            scanned.ThisScanner = this;
            scanned.Attributes  = tag.Attributes;
            return scanned;
        }
Beispiel #6
0
        /// <summary> Captures the parser's current scanner table and then flushes the
        /// parser's scanners, so the caller can put the captured table back later.
        /// </summary>
        /// <param name="reader">Reader whose Parser owns the scanners.</param>
        /// <returns>The scanner table that was registered before the flush.</returns>
        public static IDictionary AdjustScanners(NodeReader reader)
        {
            // Take the reference before flushing. The previous code allocated a throw-away
            // Hashtable here that was overwritten on the very next line.
            // NOTE(review): this relies on FlushScanners replacing the table rather than
            // clearing it in place - confirm against the Parser implementation.
            IDictionary savedScanners = reader.Parser.Scanners;

            // Remove all existing scanners
            reader.Parser.FlushScanners();
            return savedScanners;
        }
Beispiel #7
0
 /// <summary> Creates a helper for one composite-tag scan, storing the supplied arguments
 /// and resetting the scan results (no end tag found, empty node list).
 /// </summary>
 /// <param name="scanner">Scanner on whose behalf the scan runs.</param>
 /// <param name="tag">Tag that started the scan.</param>
 /// <param name="url">URL associated with the scan.</param>
 /// <param name="reader">Reader used for the scan.</param>
 /// <param name="currLine">Line being processed when the scan starts.</param>
 /// <param name="balance_quotes">Quote-balancing flag stored for later use.</param>
 public CompositeTagScannerHelper(CompositeTagScanner scanner, Tag tag, string url, NodeReader reader,
                                  string currLine, bool balance_quotes)
 {
     // Inputs supplied by the caller.
     this.scanner        = scanner;
     this.tag            = tag;
     this.url            = url;
     this.reader         = reader;
     this.currLine       = currLine;
     this.balance_quotes = balance_quotes;

     // Results, filled in as the scan progresses.
     endTag      = null;
     endTagFound = false;
     nodeList    = new NodeList();
 }
        /// <summary> Looks up a scanner for this tag, keyed by the first word of the tag
        /// contents, and lets it build a specialised node when it accepts the tag.
        /// </summary>
        /// <param name="scanners">Table of registered scanners, indexed by the word returned
        /// by ExtractWord for the tag contents.
        /// </param>
        /// <param name="url">URL at which HTML page is located
        /// </param>
        /// <param name="reader">The NodeReader that is to be used for reading the url
        /// </param>
        /// <returns>This tag when its contents are empty, no scanner is registered, or the
        /// scanner's Evaluate rejects it; otherwise the node returned by the scanner's
        /// CreateScannedNode.
        /// </returns>
        /// <exception cref="ParserException">Wraps any exception raised while looking up or
        /// running the scanner.
        /// </exception>
        public virtual AbstractNode Scan(IDictionary scanners, string url, NodeReader reader)
        {
            if (tagContents.Length == 0)
            {
                return(this);
            }
            try
            {
                string contents = tagContents.ToString();

                // Find the first word of the tag and the scanner registered for it.
                string     firstWord = ExtractWord(contents);
                TagScanner scanner   = (TagScanner)scanners[firstWord];

                // Now do a deep check
                if (scanner == null || !scanner.Evaluate(contents, reader.PreviousOpenScanner))
                {
                    return(this);
                }

                // Make this scanner the reader's "previous open scanner" for the duration of
                // the scan. The finally block restores the old value even when the scan
                // throws, so a failed scan no longer leaves the reader pointing at it.
                TagScanner save = reader.PreviousOpenScanner;
                reader.PreviousOpenScanner = scanner;
                try
                {
                    return(scanner.CreateScannedNode(this, url, reader, tagLine));
                }
                finally
                {
                    reader.PreviousOpenScanner = save;
                }
            }
            catch (System.Exception e)
            {
                string errorMsg = (tagContents != null) ? tagContents.ToString() : "null";
                throw new ParserException(
                          "Tag.scan() : Error while scanning tag, tag contents = " + errorMsg + ", tagLine = " + tagLine, e);
            }
        }
        /// <summary> Computes the next character index for the tag automaton, moving the tag
        /// on to the next non-empty line of the reader when an unfinished tag has reached the
        /// last character of its current line.
        /// </summary>
        /// <param name="i">Index of the character that was just processed.</param>
        /// <param name="reader">Reader from which following lines are fetched.</param>
        /// <param name="state">Automaton state after processing character <paramref name="i"/>.</param>
        /// <param name="tag">Tag being built; its TagLine is replaced when a new line is read.</param>
        /// <returns><c>i + 1</c> normally, or <c>0</c> after switching to a new line.</returns>
        public virtual int IncrementCounter(int i, NodeReader reader, int state, Tag tag)
        {
            bool tagStillOpen = state == TAG_BEGIN_PARSING_STATE ||
                                state == TAG_IGNORE_DATA_STATE ||
                                state == TAG_IGNORE_BEGIN_TAG_STATE;

            // TagLine is only consulted for states that can continue onto another line.
            if (!tagStillOpen || i != tag.TagLine.Length - 1)
            {
                return i + 1;
            }

            // Line-skipping logic originates from a fix contributed by Annette Doyle
            // (testcase HTMLImageScannerTest.testImageTagOnMultipleLines()), later adjusted
            // by Somik Raha for HTMLTagTest.testBrokenTag.
            int    linesRead = 0;
            string following;
            while (true)
            {
                following = reader.GetNextLine();
                linesRead++;
                if (following == null || following.Length != 0)
                {
                    break;
                }
            }

            if (following == null)
            {
                // No more input: the tag is broken, so supply the closing symbol ourselves.
                following = ">";
            }
            else
            {
                // The tag simply continues on another line; record the line break.
                tag.Append(Parser.LineSeparator);
            }

            // Assign an empty TagLine once per blank line that was skipped, so blank lines
            // are included in the tag's 'tagLines'.
            for (int blank = 1; blank < linesRead; blank++)
            {
                tag.TagLine = "";
            }

            // Continue parsing at the start of the newly fetched line.
            tag.TagLine = following;
            return 0;
        }
Beispiel #10
0
 /// <summary> Installs <paramref name="tempScanners"/> as the scanner table of the
 /// parser that owns <paramref name="reader"/>.
 /// </summary>
 /// <param name="reader">Reader whose Parser receives the scanner table.</param>
 /// <param name="tempScanners">Scanner table to install.</param>
 public static void RestoreScanners(NodeReader reader, IDictionary tempScanners)
 {
     // Put the supplied scanner table back on the parser.
     var parser = reader.Parser;
     parser.Scanners = tempScanners;
 }
 /// <summary> Installs <paramref name="tempScanners"/> as the scanner table of the
 /// parser that owns <paramref name="pReader"/>.
 /// </summary>
 /// <param name="pReader">Reader whose Parser receives the scanner table.</param>
 /// <param name="tempScanners">Scanner table to install.</param>
 public static void RestoreScanners(NodeReader pReader, System.Collections.Hashtable tempScanners)
 {
     // Put the supplied scanner table back on the parser.
     var parser = pReader.Parser;
     parser.Scanners = tempScanners;
 }
 /// <summary> Builds the scanner-specific tag for an already identified HTML tag.
 /// This implementation packs the tag's begin and end offsets, its text and the current
 /// line into a TagData and hands it to CreateTag together with the original tag and the
 /// url. It does not use <paramref name="reader"/> itself.
 /// </summary>
 /// <param name="tag">HTML Tag to be scanned for identification
 /// </param>
 /// <param name="url">The initiating url of the scan (Where the html page lies)
 /// </param>
 /// <param name="reader">The reader object responsible for reading the html page
 /// </param>
 /// <param name="currLine">The line currently being processed; stored in the TagData.
 /// </param>
 /// <returns>The tag returned by CreateTag.</returns>
 public virtual Tag Scan(Tag tag, string url, NodeReader reader, string currLine)
 {
     TagData scannedData = new TagData(tag.ElementBegin, tag.ElementEnd, tag.Text, currLine);
     return CreateTag(scannedData, tag, url);
 }
        /// <summary> Collects the text that follows an XML tag. When
        /// IsXMLTagFound(node, tagName) holds, consecutive StringNodes are read from the
        /// reader and their texts joined with single spaces; the node that ends the run
        /// decides whether the collected text is returned.
        /// </summary>
        /// <param name="node">Node tested with IsXMLTagFound.</param>
        /// <param name="tagName">Tag name the opening node and the closing end tag must match.</param>
        /// <param name="reader">Reader from which the following nodes are read.</param>
        /// <returns>The collected text when the run of string nodes ends at the end of input
        /// or at an EndTag whose Text equals <paramref name="tagName"/>; otherwise <c>null</c>.
        /// </returns>
        /// <exception cref="ParserException">Wraps any exception raised during extraction.
        /// </exception>
        public static string ExtractXMLData(Node node, string tagName, NodeReader reader)
        {
            try
            {
                if (!IsXMLTagFound(node, tagName))
                {
                    return null;
                }

                string collected = "";
                try
                {
                    // Consume nodes until something other than a StringNode (or nothing) comes back.
                    while (true)
                    {
                        node = reader.ReadElement();
                        StringNode textNode = node as StringNode;
                        if (textNode == null)
                        {
                            break;
                        }
                        if (collected.Length > 0)
                        {
                            collected += " ";
                        }
                        collected += textNode.Text;
                    }
                }
                catch (System.Exception e)
                {
                    // This exception is wrapped a second time by the outer handler below.
                    throw new ParserException(
                              "HTMLTagScanner.extractXMLData() : error while trying to find xml tag", e);
                }

                // Reader ran out of nodes: whatever was gathered is the result.
                if (node == null)
                {
                    return collected;
                }

                // Any terminator other than an end tag invalidates the extraction.
                org.htmlparser.tags.EndTag closing = node as org.htmlparser.tags.EndTag;
                if (closing == null)
                {
                    return null;
                }

                // The end tag must close the tag we started with.
                return closing.Text.Equals(tagName) ? collected : null;
            }
            catch (System.Exception e)
            {
                throw new ParserException(
                          "HTMLTagScanner.extractXMLData() : Error occurred while trying to extract xml tag", e);
            }
        }
 /// <summary> Locates the tag within the input string, parsing from the given position.
 /// The work is delegated to the shared <c>tagParser</c>.
 /// </summary>
 /// <param name="reader">HTML reader to be provided so as to allow reading of next line
 /// </param>
 /// <param name="input">Input String
 /// </param>
 /// <param name="position">Position to start parsing from
 /// </param>
 /// <returns>Whatever <c>tagParser.Find</c> returns for these arguments.</returns>
 public static Tag Find(NodeReader reader, string input, int position)
 {
     Tag located = tagParser.Find(reader, input, position);
     return located;
 }
Beispiel #15
0
        /// <summary> Locate the StringNode within the input string, by parsing from the given position.
        /// Characters are copied into a buffer until a '&lt;' that BeginTag accepts is met outside
        /// the quote-ignoring state, or until the reader has no further lines.
        /// </summary>
        /// <param name="reader">HTML reader to be provided so as to allow reading of next line
        /// </param>
        /// <param name="input">Input String
        /// </param>
        /// <param name="position">Position to start parsing from
        /// </param>
        /// <param name="balance_quotes">If <code>true</code> enter ignoring state on
        /// encountering quotes.
        /// </param>
        /// <returns>The node built by StringNode.CreateStringNode from the collected text,
        /// the start position, the end position and the parser's ShouldDecodeNodes flag.
        /// </returns>
        public virtual Node Find(NodeReader reader, string input, int position, bool balance_quotes)
        {
            StringBuilder textBuffer = new StringBuilder();
            int           state      = BEFORE_PARSE_BEGINS_STATE;
            int           textBegin  = position;
            // textEnd keeps this initial value unless one of the two completion paths below sets it.
            int           textEnd    = position;
            int           inputLen   = input.Length;
            char          ch;
            // Quote character that ends the current ignore state; overwritten whenever an
            // ignore state is entered.
            char          ignore_ender = '\"';

            // i indexes the current line; it is reset to -1 (so the increment yields 0) each
            // time a following line is loaded below.
            for (int i = position; (i < inputLen && state != PARSE_COMPLETED_STATE); i++)
            {
                ch = input[i];
                // A '<' only ends the text when we are not inside quotes and BeginTag
                // accepts what follows; the '<' itself is not added to the buffer.
                if (ch == '<' && state != PARSE_IGNORE_STATE)
                {
                    if (BeginTag(input, i))
                    {
                        state   = PARSE_COMPLETED_STATE;
                        textEnd = i - 1;
                    }
                }
                // Quote balancing: a quote opens the ignore state, and only the same quote
                // character closes it again. The other quote character is plain text meanwhile.
                if (balance_quotes && (ch == '\'' || ch == '"'))
                {
                    if (state == PARSE_IGNORE_STATE)
                    {
                        if (ch == ignore_ender)
                        {
                            state = PARSE_HAS_BEGUN_STATE;
                        }
                    }
                    else
                    {
                        ignore_ender = ch;
                        state        = PARSE_IGNORE_STATE;
                    }
                }
                // The first character that neither completes the parse nor opens a quote
                // starts the text.
                if (state == BEFORE_PARSE_BEGINS_STATE)
                {
                    state = PARSE_HAS_BEGUN_STATE;
                }
                if (state == PARSE_HAS_BEGUN_STATE || state == PARSE_IGNORE_STATE)
                {
                    textBuffer.Append(input[i]);
                }
                // Patch by Cedric Rosa
                // NOTE(review): the check a few lines above has already moved the state out of
                // BEFORE_PARSE_BEGINS_STATE, so this condition cannot be true here.
                if (state == BEFORE_PARSE_BEGINS_STATE && i == inputLen - 1)
                {
                    state = PARSE_HAS_BEGUN_STATE;
                }
                // End of the current line while collecting text: continue on the next
                // non-empty line, adding a line separator for every line boundary crossed.
                // NOTE(review): this is skipped in PARSE_IGNORE_STATE, so a line ending
                // inside a quote ends the loop with textEnd not updated for that line.
                if (state == PARSE_HAS_BEGUN_STATE && i == inputLen - 1)
                {
                    do
                    {
                        input = reader.GetNextLine();
                        if (input != null && input.Length == 0)
                        {
                            textBuffer.Append(Parser.LineSeparator);
                        }
                    } while (input != null && input.Length == 0);

                    if (input == null)
                    {
                        // No further lines: the text ends at the last character processed.
                        textEnd = i;
                        state   = PARSE_COMPLETED_STATE;
                    }
                    else
                    {
                        textBuffer.Append(Parser.LineSeparator);
                        inputLen = input.Length;
                        // Restart at the beginning of the new line (the loop increment makes this 0).
                        i        = -1;
                    }
                }
            }
            return(StringNode.CreateStringNode(textBuffer, textBegin, textEnd, reader.Parser.ShouldDecodeNodes));
        }
Beispiel #16
0
 /// <summary> Scan override that never produces a tag. </summary>
 /// <param name="tag">Ignored.</param>
 /// <param name="url">Ignored.</param>
 /// <param name="reader">Ignored.</param>
 /// <param name="currLine">Ignored.</param>
 /// <returns>Always <c>null</c>.</returns>
 public override Tag Scan(Tag tag, string url, NodeReader reader, string currLine)
 {
     // Produces no tag, whatever the arguments.
     return null;
 }