예제 #1
0
        /// <summary>
        /// Builds the reader, parser and node array used by a test from an
        /// in-memory HTML string.
        /// </summary>
        /// <param name="inputHTML">The HTML text the parser should read.</param>
        protected void CreateParser(string inputHTML)
        {
            // Feed the reader from a fresh copy of the characters rather than
            // from the caller's string instance.
            char[] htmlChars = inputHTML.ToCharArray();
            System.IO.StringReader htmlSource = new System.IO.StringReader(new string(htmlChars));

            reader = new NodeReader(htmlSource, 5000);
            parser = new Parser(reader, new NoFeedback());
            node   = new AbstractNode[40];
        }
예제 #2
0
        /// <summary>
        /// Builds the reader, parser and node array used by a test from an
        /// in-memory HTML string, passing a URL to the reader and sizing the
        /// node array explicitly.
        /// </summary>
        /// <param name="inputHTML">The HTML text the parser should read.</param>
        /// <param name="url">Value handed to the <c>NodeReader</c> constructor together with the text reader.</param>
        /// <param name="numNodes">Length of the node array that is allocated.</param>
        protected void CreateParser(string inputHTML, string url, int numNodes)
        {
            // Feed the reader from a fresh copy of the characters rather than
            // from the caller's string instance.
            char[] htmlChars = inputHTML.ToCharArray();
            System.IO.StringReader htmlSource = new System.IO.StringReader(new string(htmlChars));

            reader = new NodeReader(htmlSource, url);
            parser = new Parser(reader, new DefaultParserFeedback());
            node   = new AbstractNode[numNodes];
        }
예제 #3
0
        //
        // Constructors
        //

        /// <summary> Zero argument constructor.
        /// The parser is in a safe but useless state.
        /// Set the reader or connection using <see cref="Reader"/> or <see cref="Connection"/>.
        /// </summary>
        /// <seealso cref="Reader"/>
        /// <seealso cref="Connection"/>
        public Parser()
        {
            InitBlock();
            // Feedback and Scanners go through their property setters; the
            // remaining members are reset directly as fields.
            Feedback      = null;
            Scanners      = null;
            resourceLocn  = null;
            reader        = null;
            character_set = DEFAULT_CHARSET;
            url           = null;
            input         = null;
            // Reads the Feedback property back after it was assigned null above.
            // NOTE(review): whether the setter substitutes a default for null is
            // not visible here. Tag.TagParser looks like a static member of Tag;
            // if so, this replaces the tag parser shared by all parsers - confirm.
            Tag.TagParser = new TagParser(Feedback);
        }
예제 #4
0
 /// <summary> Constructor for custom HTTP access.
 /// </summary>
 /// <param name="connection">A fully conditioned connection.
 /// </param>
 /// <param name="fb">The object to use for message communication.
 /// </param>
 public Parser(WebRequest connection, ParserFeedback fb)
 {
     InitBlock();
     // Feedback and Scanners go through their property setters; the
     // remaining members are reset directly as fields.
     Feedback      = fb;
     Scanners      = null;
     resourceLocn  = null;
     reader        = null;
     character_set = DEFAULT_CHARSET;
     url           = null;
     input         = null;
     // Uses the feedback field (not the fb argument), i.e. whatever the
     // Feedback setter stored. NOTE(review): Tag.TagParser looks like a static
     // member of Tag, shared across parser instances - confirm.
     Tag.TagParser = new TagParser(feedback);
     // Assigned last, through the Connection property, after every other
     // member has been reset. NOTE(review): the setter's side effects are not
     // visible here - presumably it prepares the reader; confirm.
     Connection    = connection;
 }
예제 #5
0
 /// <summary> This constructor enables the construction of test cases, with readers
 /// associated with test string buffers. It can also be used with readers of the user's choice
 /// streaming data into the parser.<p/>
 /// <b>Important:</b> If you are using this constructor, and you would like to use the parser
 /// to parse multiple times (multiple calls to parser.GetEnumerator()), you must ensure the following:<br/>
 /// <ul>
 /// <li>Before the first parse, you must mark the reader for a length that you anticipate (the size of the stream).</li>
 /// <li>After the first parse, calls to GetEnumerator() must be preceded by calls to:
 /// <pre>
 /// parser.getReader().Reset();
 /// </pre>
 /// </li>
 /// </ul>
 /// </summary>
 /// <param name="rd">The reader to draw characters from.
 /// </param>
 /// <param name="fb">The object to use when information,
 /// warning and error messages are produced. If <c>null</c> no feedback
 /// is provided.
 /// </param>
 public Parser(NodeReader rd, ParserFeedback fb)
 {
     InitBlock();
     // Feedback and Scanners go through their property setters; the
     // remaining members are reset directly as fields.
     Feedback      = fb;
     Scanners      = null;
     resourceLocn  = null;
     reader        = null;
     character_set = DEFAULT_CHARSET;
     url           = null;
     input         = null;
     // The reader field is cleared above and the supplied reader is then
     // installed through the Reader property setter.
     Reader        = rd;
     // Uses the feedback field (not the fb argument), i.e. whatever the
     // Feedback setter stored. NOTE(review): Tag.TagParser looks like a static
     // member of Tag, shared across parser instances - confirm.
     Tag.TagParser = new TagParser(feedback);
 }
예제 #6
0
        /// <summary>
        /// Opens the response stream of the current connection and builds a new
        /// node reader on top of it.
        /// The raw stream is wrapped in a <see cref="BufferedStream"/>, the
        /// character reader is obtained from <c>CreateInputStreamReader()</c>,
        /// and the resulting reader is linked back to this parser.
        /// </summary>
        /// <exception cref="System.IO.IOException">If there is a problem constructing the reader.</exception>
        protected virtual void CreateReader()
        {
            Stream responseStream = url.GetResponse().GetResponseStream();

            input = new BufferedStream(responseStream);
            if (responseStream.CanSeek)
            {
                // Remember where the content starts so the stream can be
                // repositioned there later.
                markedPosition = input.Position;
            }

            StreamReader characterReader = CreateInputStreamReader();
            reader        = new NodeReader(characterReader, resourceLocn);
            reader.Parser = this;
        }
예제 #7
0
 /// <summary>
 /// Builds a new node reader while reusing the input stream that is already open.
 /// When no input stream exists yet this simply calls <c>CreateReader()</c>.
 /// Otherwise a seekable stream is moved back to the remembered position
 /// before a new character reader is obtained from
 /// <c>CreateInputStreamReader()</c> and wrapped in a node reader linked to
 /// this parser.
 /// </summary>
 /// <exception cref="System.IO.IOException">If there is a problem constructing the reader.</exception>
 protected virtual void RecreateReader()
 {
     // Nothing to reuse: build stream and reader from scratch.
     if (input == null)
     {
         CreateReader();
         return;
     }

     if (input.CanSeek)
     {
         // Rewind to the remembered position, then record the position the
         // stream reports as the new mark.
         input.Position = markedPosition;
         markedPosition = input.Position;
     }

     StreamReader characterReader = CreateInputStreamReader();
     reader        = new NodeReader(characterReader, resourceLocn);
     reader.Parser = this;
 }
예제 #8
0
 /// <summary> This constructor is present to enable users to plug in their own readers.
 /// A DefaultParserFeedback object is used for feedback. It can also be used with readers of the user's choice
 /// streaming data into the parser.<p/>
 /// <b>Important:</b> If you are using this constructor, and you would like to use the parser
 /// to parse multiple times (multiple calls to parser.GetEnumerator()), you must ensure the following:<br/>
 /// <ul>
 /// <li>Before the first parse, you must mark the reader for a length that you anticipate (the size of the stream).</li>
 /// <li>After the first parse, calls to GetEnumerator() must be preceded by calls to:
 /// <pre>
 /// parser.getReader().Reset();
 /// </pre>
 /// </li>
 /// </ul>
 /// </summary>
 /// <param name="reader">The source for HTML to be parsed.
 /// </param>
 // Delegates to the (NodeReader, ParserFeedback) constructor, passing stdout
 // as the feedback object. NOTE(review): stdout is declared elsewhere -
 // presumably the shared DefaultParserFeedback mentioned above; confirm.
 public Parser(NodeReader reader) : this(reader, stdout)
 {
 }
예제 #9
0
        /// <summary> Locate the remark tag within the input string, by parsing from the given position.
        /// The input is scanned one character at a time with a small state machine.
        /// When the end of the current line is reached while the remark is still open,
        /// further lines are pulled from <paramref name="reader"/> and scanning continues there.
        /// </summary>
        /// <param name="reader">HTML reader to be provided so as to allow reading of next line
        /// </param>
        /// <param name="input">Input String
        /// </param>
        /// <param name="position">Position to start parsing from
        /// </param>
        /// <returns>A <c>RemarkNode</c> built from the index of the opening <c>&lt;</c>, the index of the
        /// closing <c>&gt;</c> and the accumulated contents when the accepted state is reached;
        /// <c>null</c> when the illegal state is reached or the input runs out first.
        /// </returns>
        public virtual RemarkNode Find(NodeReader reader, string input, int position)
        {
            int state = REMARK_NODE_BEFORE_PARSING_STATE;

            System.Text.StringBuilder tagContents = new System.Text.StringBuilder();
            // tagBegin is an index into the line passed in; tagEnd is an index into
            // whichever line contains the closing '>' (i restarts at 0 for every
            // line fetched from the reader).
            int  tagBegin = 0;
            int  tagEnd = 0;
            int  i = position;
            int  inputLen = input.Length;
            char ch, prevChar = ' ';

            // The state blocks below are tested roughly in reverse order of the
            // forward transitions, so a state entered by a forward transition is not
            // handled again with the same character. The rollbacks to the accepting
            // state are the exception: they fall through to the accepting-state
            // blocks so the current character is still processed.
            // NOTE(review): the loop condition relies on the numeric ordering of the
            // state constants, which are declared outside this method.
            while (i < inputLen && state < REMARK_NODE_ACCEPTED_STATE)
            {
                ch = input[i];
                if (state == REMARK_NODE_CLOSING_SECOND_DASH_RECEIVED_STATE)
                {
                    if (ch == '>')
                    {
                        // "-->" seen: the remark is complete.
                        state  = REMARK_NODE_ACCEPTED_STATE;
                        tagEnd = i;
                    }
                    else if (ch == '-')
                    {
                        // A further dash: keep waiting for '>' and add the
                        // previous character to the contents.
                        tagContents.Append(prevChar);
                    }
                    else
                    {
                        // Rollback last 2 characters (assumed same)
                        state = REMARK_NODE_ACCEPTING_STATE;
                        tagContents.Append(prevChar);
                        tagContents.Append(prevChar);
                    }
                }

                if (state == REMARK_NODE_CLOSING_FIRST_DASH_RECEIVED_STATE)
                {
                    if (ch == '-')
                    {
                        state = REMARK_NODE_CLOSING_SECOND_DASH_RECEIVED_STATE;
                    }
                    else
                    {
                        // Rollback
                        state = REMARK_NODE_ACCEPTING_STATE;
                        tagContents.Append(prevChar);
                    }
                }
                if (state == REMARK_NODE_ACCEPTING_STATE)
                {
                    // A dash inside the remark may start the closing sequence; it
                    // is withheld from the contents until that is decided.
                    if (ch == '-')
                    {
                        state = REMARK_NODE_CLOSING_FIRST_DASH_RECEIVED_STATE;
                    }
                }
                if (state == REMARK_NODE_ACCEPTING_STATE)
                {
                    // We can append contents now
                    tagContents.Append(ch);
                }

                if (state == REMARK_NODE_FIRST_DASH_RECEIVED_STATE)
                {
                    if (ch == '-')
                    {
                        state = REMARK_NODE_ACCEPTING_STATE;
                        // Do a lookahead and see if the next char is >
                        if (input.Length > i + 1 && input[i + 1] == '>')
                        {
                            state  = REMARK_NODE_ACCEPTED_STATE;
                            tagEnd = i + 1;
                        }
                    }
                    else
                    {
                        state = REMARK_NODE_ILLEGAL_STATE;
                    }
                }
                if (state == REMARK_NODE_EXCLAMATION_RECEIVED_STATE)
                {
                    if (ch == '-')
                    {
                        state = REMARK_NODE_FIRST_DASH_RECEIVED_STATE;
                    }
                    else if (ch == '>')
                    {
                        // "<!>" is accepted as a remark with empty contents.
                        state  = REMARK_NODE_ACCEPTED_STATE;
                        tagEnd = i;
                    }
                    else
                    {
                        state = REMARK_NODE_ILLEGAL_STATE;
                    }
                }
                if (state == REMARK_NODE_OPENING_ANGLE_BRACKET_STATE)
                {
                    if (ch == '!')
                    {
                        state = REMARK_NODE_EXCLAMATION_RECEIVED_STATE;
                    }
                    else
                    {
                        state = REMARK_NODE_ILLEGAL_STATE;
                    }
                    // This is not a remark tag
                }
                if (state == REMARK_NODE_BEFORE_PARSING_STATE)
                {
                    if (ch == '<')
                    {
                        // Transition from State 0 to State 1 - Record data till > is encountered
                        tagBegin = i;
                        state    = REMARK_NODE_OPENING_ANGLE_BRACKET_STATE;
                    }
                    else if (ch != ' ')
                    {
                        // Its not a space, hence this is probably a string node, not a remark node
                        state = REMARK_NODE_ILLEGAL_STATE;
                    }
                }
                // End of the current line reached while the remark is still open.
                // NOTE(review): which states count as "open" depends on the numeric
                // ordering of the state constants declared elsewhere.
                if (state >= REMARK_NODE_ACCEPTING_STATE && state < REMARK_NODE_ACCEPTED_STATE && i == input.Length - 1)
                {
                    // We need to continue parsing to the next line
                    tagContents.Append(Parser.LineSeparator);
                    // Skip empty lines; only the single separator appended above
                    // ends up in the contents.
                    do
                    {
                        input = reader.GetNextLine();
                    } while (input != null && input.Length == 0);
                    if (input != null)
                    {
                        inputLen = input.Length;
                    }
                    else
                    {
                        // No more lines: a negative length makes the loop
                        // condition fail once i has been advanced to 0 below.
                        inputLen = -1;
                    }
                    // Becomes 0 after the increment at the bottom of the loop.
                    i = -1;
                }
                if (state == REMARK_NODE_ILLEGAL_STATE)
                {
                    return(null);
                }
                i++;
                prevChar = ch;
            }
            if (state == REMARK_NODE_ACCEPTED_STATE)
            {
                return(new RemarkNode(tagBegin, tagEnd, tagContents.ToString()));
            }
            else
            {
                // Input was exhausted before the remark was closed.
                return(null);
            }
        }