/// <summary>
/// Sets up the <c>reader</c>, <c>parser</c> and <c>node</c> members for parsing
/// the supplied HTML text from memory.
/// </summary>
/// <remarks>
/// The parser is created with a <c>NoFeedback</c> instance and <c>node</c> is
/// replaced by a fresh 40-element array. The value 5000 is passed as the second
/// argument of the <c>NodeReader</c> constructor (presumably a length/mark size;
/// confirm against <c>NodeReader</c>).
/// </remarks>
/// <param name="inputHTML">The HTML text to parse.</param>
protected void CreateParser(string inputHTML)
{
    // Work on a separate copy of the characters, exactly as before.
    char[] htmlChars = inputHTML.ToCharArray();
    System.IO.StringReader htmlSource = new System.IO.StringReader(new string(htmlChars));

    reader = new NodeReader(htmlSource, 5000);
    parser = new Parser(reader, new NoFeedback());
    node = new AbstractNode[40];
}
/// <summary>
/// Sets up the <c>reader</c>, <c>parser</c> and <c>node</c> members for parsing
/// the supplied HTML text from memory, associating the reader with a URL.
/// </summary>
/// <remarks>
/// The parser is created with a <c>DefaultParserFeedback</c> instance and
/// <c>node</c> is replaced by a fresh array of <paramref name="numNodes"/> elements.
/// </remarks>
/// <param name="inputHTML">The HTML text to parse.</param>
/// <param name="url">Passed as the second argument of the <c>NodeReader</c> constructor.</param>
/// <param name="numNodes">Length of the newly allocated <c>node</c> array.</param>
protected void CreateParser(string inputHTML, string url, int numNodes)
{
    // Work on a separate copy of the characters, exactly as before.
    char[] htmlChars = inputHTML.ToCharArray();
    System.IO.StringReader htmlSource = new System.IO.StringReader(new string(htmlChars));

    reader = new NodeReader(htmlSource, url);
    parser = new Parser(reader, new DefaultParserFeedback());
    node = new AbstractNode[numNodes];
}
//
// Constructors
//

/// <summary>
/// Zero argument constructor.
/// The parser is left in a safe but useless state: no reader, no connection
/// and no input stream. Supply a source afterwards through
/// <see cref="Reader"/> or <see cref="Connection"/>.
/// </summary>
/// <seealso cref="Reader"/>
/// <seealso cref="Connection"/>
public Parser()
{
    InitBlock();

    // Property setters first, in their original order.
    Feedback = null;
    Scanners = null;

    // Nothing to read from yet.
    reader = null;
    input = null;
    url = null;
    resourceLocn = null;
    character_set = DEFAULT_CHARSET;

    Tag.TagParser = new TagParser(Feedback);
}
/// <summary>
/// Constructor for custom HTTP access.
/// </summary>
/// <remarks>
/// All source-related members are cleared and the character set is set to
/// <c>DEFAULT_CHARSET</c> before <paramref name="connection"/> is assigned to
/// <see cref="Connection"/> as the final step.
/// </remarks>
/// <param name="connection">A fully conditioned connection.</param>
/// <param name="fb">The object to use for message communication.</param>
public Parser(WebRequest connection, ParserFeedback fb)
{
    InitBlock();

    // Property setters first, in their original order.
    Feedback = fb;
    Scanners = null;

    // Start from a clean slate before the connection is attached.
    reader = null;
    input = null;
    url = null;
    resourceLocn = null;
    character_set = DEFAULT_CHARSET;

    Tag.TagParser = new TagParser(feedback);

    // Assigned last, after every other member has been reset.
    Connection = connection;
}
/// <summary>
/// Creates a parser that draws its characters from the given reader.
/// This enables test cases whose readers wrap in-memory string buffers, and it
/// can equally be used with any reader of the caller's choice streaming data
/// into the parser.
/// </summary>
/// <remarks>
/// <para>
/// <b>Important:</b> to parse more than once with a parser built this way
/// (multiple calls to <c>GetEnumerator()</c>), the caller must ensure that:
/// </para>
/// <list type="bullet">
/// <item><description>before the first parse, the reader is marked for the
/// length that is anticipated (the size of the stream);</description></item>
/// <item><description>after the first parse, the parser's reader is reset
/// before every further call to <c>GetEnumerator()</c>.</description></item>
/// </list>
/// </remarks>
/// <param name="rd">The reader to draw characters from.</param>
/// <param name="fb">The object to use when information, warning and error
/// messages are produced. If <em>null</em> no feedback is provided.</param>
public Parser(NodeReader rd, ParserFeedback fb)
{
    InitBlock();

    // Property setters first, in their original order.
    Feedback = fb;
    Scanners = null;

    // Clear every source-related member before the reader is attached.
    reader = null;
    input = null;
    url = null;
    resourceLocn = null;
    character_set = DEFAULT_CHARSET;

    Reader = rd;
    Tag.TagParser = new TagParser(feedback);
}
/// <summary>
/// Creates a new reader for the current connection (<c>url</c>).
/// The response stream is wrapped in a <see cref="BufferedStream"/> stored in
/// <c>input</c>; the character reader returned by
/// <see cref="CreateInputStreamReader"/> is then wrapped in a new
/// <c>NodeReader</c> whose <c>Parser</c> is set to this instance.
/// </summary>
/// <remarks>
/// When the response stream is seekable, the buffered stream's current position
/// is stored in <c>markedPosition</c>.
/// </remarks>
/// <exception cref="IOException">If there is a problem constructing the reader.</exception>
/// <seealso cref="CreateInputStreamReader"/>
protected virtual void CreateReader()
{
    Stream responseStream = url.GetResponse().GetResponseStream();
    input = new BufferedStream(responseStream);

    // Record the starting position only when the source can seek.
    if (responseStream.CanSeek)
    {
        markedPosition = input.Position;
    }

    StreamReader characterReader = CreateInputStreamReader();
    reader = new NodeReader(characterReader, resourceLocn);
    reader.Parser = this;
}
/// <summary>
/// Creates a new reader while reusing the existing input stream.
/// Falls back to <see cref="CreateReader"/> when there is no input stream yet.
/// </summary>
/// <remarks>
/// When the existing stream is seekable it is repositioned to
/// <c>markedPosition</c>, and <c>markedPosition</c> is then refreshed from the
/// stream. A new <c>NodeReader</c> is built around the reader returned by
/// <see cref="CreateInputStreamReader"/> and its <c>Parser</c> is set to this
/// instance.
/// </remarks>
/// <exception cref="IOException">If there is a problem constructing the reader.</exception>
/// <seealso cref="CreateReader"/>
/// <seealso cref="CreateInputStreamReader"/>
protected virtual void RecreateReader()
{
    // No stream to reuse: build everything from scratch.
    if (null == input)
    {
        CreateReader();
        return;
    }

    if (input.CanSeek)
    {
        // Rewind to the remembered position, then re-read it from the stream.
        input.Position = markedPosition;
        markedPosition = input.Position;
    }

    StreamReader characterReader = CreateInputStreamReader();
    reader = new NodeReader(characterReader, resourceLocn);
    reader.Parser = this;
}
/// <summary>
/// This constructor is present to enable users to plug in their own readers.
/// It delegates to the two-argument constructor, passing <c>stdout</c> as the
/// feedback object (described by the original documentation as a
/// DefaultParserFeedback; confirm against the declaration of <c>stdout</c>).
/// </summary>
/// <remarks>
/// <para>
/// <b>Important:</b> to parse more than once with a parser built this way
/// (multiple calls to <c>GetEnumerator()</c>), the caller must ensure that:
/// </para>
/// <list type="bullet">
/// <item><description>before the first parse, the reader is marked for the
/// length that is anticipated (the size of the stream);</description></item>
/// <item><description>after the first parse, the parser's reader is reset
/// before every further call to <c>GetEnumerator()</c>.</description></item>
/// </list>
/// </remarks>
/// <param name="reader">The source for HTML to be parsed.</param>
public Parser(NodeReader reader) : this(reader, stdout) { }
/// <summary>
/// Locates a remark (comment) tag within the input string, parsing from the
/// given position with a small character-driven state machine.
/// </summary>
/// <remarks>
/// Only spaces may precede the opening <c>&lt;</c>; any other character makes
/// the scan give up. When the end of a line is reached while the remark is
/// still open, further lines are pulled from <paramref name="reader"/> and the
/// scan continues on them, so the recorded end index refers to the line that
/// was being scanned when the remark closed.
/// NOTE(review): the relational comparisons on <c>state</c> assume the
/// REMARK_NODE_* constants are ordered numerically by parsing progress, with
/// the accepted state above all in-progress states - confirm at their
/// declaration.
/// </remarks>
/// <param name="reader">HTML reader, used to fetch the next line when the remark
/// spans several lines.</param>
/// <param name="input">The line of text to scan.</param>
/// <param name="position">Index in <paramref name="input"/> to start parsing from.</param>
/// <returns>
/// A <c>RemarkNode</c> built from the begin index, end index and collected
/// contents when a complete remark was recognised; otherwise <c>null</c>.
/// </returns>
public virtual RemarkNode Find(NodeReader reader, string input, int position)
{
    int state = REMARK_NODE_BEFORE_PARSING_STATE;
    System.Text.StringBuilder tagContents = new System.Text.StringBuilder();
    int tagBegin = 0;
    int tagEnd = 0;
    int i = position;
    int inputLen = input.Length;
    char ch, prevChar = ' ';
    // The state checks below are listed from the latest state to the earliest,
    // so a forward transition made for the current character is not processed
    // again in the same iteration. The rollback branches are the exception:
    // they switch back to the accepting state on purpose so that the checks
    // further down handle the current character as ordinary content.
    while (i < inputLen && state < REMARK_NODE_ACCEPTED_STATE)
    {
        ch = input[i];
        // "--" already seen inside the remark body: '>' closes the remark.
        if (state == REMARK_NODE_CLOSING_SECOND_DASH_RECEIVED_STATE)
        {
            if (ch == '>')
            {
                state = REMARK_NODE_ACCEPTED_STATE;
                tagEnd = i;
            }
            else if (ch == '-')
            {
                // Yet another dash: emit the oldest held-back dash and keep
                // waiting for '>' with two dashes still pending.
                tagContents.Append(prevChar);
            }
            else
            {
                // Rollback last 2 characters (assumed same): both held-back
                // characters are written to the contents and normal
                // accumulation resumes.
                state = REMARK_NODE_ACCEPTING_STATE;
                tagContents.Append(prevChar);
                tagContents.Append(prevChar);
            }
        }
        // One '-' already seen inside the remark body.
        if (state == REMARK_NODE_CLOSING_FIRST_DASH_RECEIVED_STATE)
        {
            if (ch == '-')
            {
                state = REMARK_NODE_CLOSING_SECOND_DASH_RECEIVED_STATE;
            }
            else
            {
                // Rollback: the held-back character was content after all.
                state = REMARK_NODE_ACCEPTING_STATE;
                tagContents.Append(prevChar);
            }
        }
        // Inside the remark body: a dash may start the closing sequence, so it
        // is held back instead of being appended.
        if (state == REMARK_NODE_ACCEPTING_STATE)
        {
            if (ch == '-')
            {
                state = REMARK_NODE_CLOSING_FIRST_DASH_RECEIVED_STATE;
            }
        }
        if (state == REMARK_NODE_ACCEPTING_STATE)
        {
            // We can append contents now
            tagContents.Append(ch);
        }
        // "<!-" seen so far: a second dash opens the remark body.
        if (state == REMARK_NODE_FIRST_DASH_RECEIVED_STATE)
        {
            if (ch == '-')
            {
                state = REMARK_NODE_ACCEPTING_STATE;
                // Do a lookahead and see if the next char is >
                // (an empty remark closed immediately after the opening dashes)
                if (input.Length > i + 1 && input[i + 1] == '>')
                {
                    state = REMARK_NODE_ACCEPTED_STATE;
                    tagEnd = i + 1;
                }
            }
            else
            {
                state = REMARK_NODE_ILLEGAL_STATE;
            }
        }
        // "<!" seen so far: either a dash follows, or '>' ends the tag at once.
        if (state == REMARK_NODE_EXCLAMATION_RECEIVED_STATE)
        {
            if (ch == '-')
            {
                state = REMARK_NODE_FIRST_DASH_RECEIVED_STATE;
            }
            else if (ch == '>')
            {
                state = REMARK_NODE_ACCEPTED_STATE;
                tagEnd = i;
            }
            else
            {
                state = REMARK_NODE_ILLEGAL_STATE;
            }
        }
        // '<' seen so far: only '!' keeps this a candidate remark.
        if (state == REMARK_NODE_OPENING_ANGLE_BRACKET_STATE)
        {
            if (ch == '!')
            {
                state = REMARK_NODE_EXCLAMATION_RECEIVED_STATE;
            }
            else
            {
                // This is not a remark tag
                state = REMARK_NODE_ILLEGAL_STATE;
            }
        }
        // Nothing recognised yet: skip spaces until '<' shows up.
        if (state == REMARK_NODE_BEFORE_PARSING_STATE)
        {
            if (ch == '<')
            {
                // Transition from State 0 to State 1 - Record data till > is encountered
                tagBegin = i;
                state = REMARK_NODE_OPENING_ANGLE_BRACKET_STATE;
            }
            else if (ch != ' ')
            {
                // Its not a space, hence this is probably a string node, not a remark node
                state = REMARK_NODE_ILLEGAL_STATE;
            }
        }
        // Line continuation applies only once the remark body has been entered
        // and the last character of the current line has just been consumed.
        if (state >= REMARK_NODE_ACCEPTING_STATE && state < REMARK_NODE_ACCEPTED_STATE && i == input.Length - 1)
        {
            // We need to continue parsing to the next line
            tagContents.Append(Parser.LineSeparator);
            // Skip over empty lines; a null line ends the search for more input.
            do
            {
                input = reader.GetNextLine();
            }
            while (input != null && input.Length == 0);
            if (input != null)
            {
                inputLen = input.Length;
            }
            else
            {
                // No more lines: -1 makes the loop condition fail next time round.
                inputLen = -1;
            }
            // Becomes 0 after the increment at the bottom of the loop.
            i = -1;
        }
        if (state == REMARK_NODE_ILLEGAL_STATE)
        {
            return(null);
        }
        i++;
        prevChar = ch;
    }
    // Input exhausted without reaching the accepted state means no remark.
    if (state == REMARK_NODE_ACCEPTED_STATE)
    {
        return(new RemarkNode(tagBegin, tagEnd, tagContents.ToString()));
    }
    else
    {
        return(null);
    }
}