Пример #1
0
		/// <summary>
		/// Parses a Document from an input stream, using the provided Parser.
		/// </summary>
		/// <param name="input">Input stream to parse. This method closes it before returning, whether parsing succeeds or throws.</param>
		/// <param name="charsetName">Character set of input; passed through unchanged to ParseByteData.</param>
		/// <param name="baseUri">Base URI of document, to resolve relative links against</param>
		/// <param name="parser">Alternate parser to use</param>
		/// <returns>The Document that ParseByteData builds from the bytes read from <paramref name="input"/>.</returns>
		public static Document Load(Stream input, string charsetName, string baseUri, Parser parser)
		{
			try
			{
				byte[] data = ReadToByteBuffer(input);

				return ParseByteData(data, charsetName, baseUri, parser);
			}
			finally
			{
				// Close in finally so the stream is released even when reading or parsing throws.
				// The null guard keeps a failure caused by a null stream from being masked here.
				if (input != null)
				{
					input.Close();
				}
			}
		}
Пример #2
0
        public void noErrorsByDefault()
        {
            // The parser is created without any SetTrackErrors call; the test expects
            // its error list to be empty even though the markup below is malformed.
            const string malformedHtml = "<p>One</p href='no'>&arrgh;<font /><br /><foo";

            NSoup.Parse.Parser htmlParser = NSoup.Parse.Parser.HtmlParser();

            // Only the parser's recorded errors matter here, so the returned Document is discarded.
            NSoupClient.Parse(malformedHtml, "http://example.com", htmlParser);

            List<ParseError> recorded = htmlParser.GetErrors();
            Assert.AreEqual(0, recorded.Count);
        }
Пример #3
0
		// Decodes an in-memory byte buffer and parses it into a Document. The bytes are buffered first (rather than
		// streamed) so the charset can be switched after a meta tag declares it.
		//
		// data:        raw bytes of the document.
		// charsetName: charset used to decode the bytes, or null to detect it from a
		//              <meta http-equiv="Content-Type"> / <meta charset> tag, keeping the default encoding when
		//              no usable charset is declared. An empty string is rejected with an Exception.
		// baseUri:     passed to parser.ParseInput together with the decoded text.
		// parser:      parser that turns the decoded text into the Document.
		//
		// Returns the parsed Document. When the bytes were decoded with an explicit or detected charset, that charset
		// name is also set on the document's output settings.
		public static Document ParseByteData(byte[] data, string charsetName, string baseUri, Parser parser)
		{
			string docData;
			Document doc = null;

			if (charsetName == null)
			{
				// determine from meta. safe parse with the default encoding (presumably UTF-8 - confirm against
				// the _defaultEncoding declaration). The BOM is stripped here too, so the detection pass sees the
				// same text a re-parse would.
				docData = StripLeadingByteOrderMark(_defaultEncoding.GetString(data));
				doc = parser.ParseInput(docData, baseUri);

				// look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
				Element meta = doc.Select("meta[http-equiv=content-type], meta[charset]").FirstOrDefault();

				if (meta != null)
				{
					// if not found, will keep the default encoding as best attempt
					string foundCharset = meta.HasAttr("http-equiv") ? GetCharsetFromContentType(meta.Attr("content")) : meta.Attr("charset");

					// Charset names are case-insensitive; an ordinal ignore-case comparison avoids a pointless
					// second decode + parse when the page declares e.g. "utf-8" in lower case.
					if (!string.IsNullOrEmpty(foundCharset) && !string.Equals(foundCharset, _defaultEncoding.WebName, StringComparison.OrdinalIgnoreCase))
					{
						Encoding foundEncoding = null;

						try
						{
							foundEncoding = Encoding.GetEncoding(foundCharset);
						}
						catch (ArgumentException)
						{
							// The charset comes from the document itself and may be unknown or garbage.
							// Deliberately ignored: keep the document already parsed with the default encoding.
						}
						catch (NotSupportedException)
						{
							// Same best-effort fallback when the platform does not support the encoding.
						}

						if (foundEncoding != null)
						{
							// need to re-decode
							charsetName = foundCharset;

							docData = foundEncoding.GetString(data);
							doc = null;
						}
					}
				}
			}
			else
			{
				// specified by content type header (or by user on file load)
				if (string.IsNullOrEmpty(charsetName))
				{
					throw new Exception("Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
				}

				docData = Encoding.GetEncoding(charsetName).GetString(data);
			}

			if (doc == null)
			{
				// there are times where there is a spurious byte-order-mark at the start of the text. If after
				// decoding there is a BOM, strip it; otherwise it will disturb the parser's initial state.
				docData = StripLeadingByteOrderMark(docData);

				doc = parser.ParseInput(docData, baseUri);
				doc.OutputSettings().SetEncoding(charsetName);
			}

			return doc;
		}

		// Returns text without a single leading byte-order mark (U+FEFF, decimal 65279), or text unchanged when
		// it does not start with one.
		private static string StripLeadingByteOrderMark(string text)
		{
			if (text.Length > 0 && text[0] == '\uFEFF')
			{
				return text.Substring(1);
			}

			return text;
		}
Пример #4
0
        public void tracksLimitedErrorsWhenRequested()
        {
            // Messages the test expects, in the order the parser records them. The parser is
            // configured with SetTrackErrors(3), and exactly this many errors are expected back.
            string[] expectedMessages =
            {
                "20: Attributes incorrectly present on end tag",
                "35: Unexpected token [Doctype] when in state [InBody]",
                "36: Invalid character reference: Invalid named referenece 'arrgh'"
            };

            const string malformedHtml = "<p>One</p href='no'><!DOCTYPE html>&arrgh;<font /><br /><foo";

            NSoup.Parse.Parser limitedParser = NSoup.Parse.Parser.HtmlParser().SetTrackErrors(3);

            // Only the recorded errors are inspected, so the returned Document is discarded.
            limitedParser.ParseInput(malformedHtml, "http://example.com");

            List<ParseError> recorded = limitedParser.GetErrors();

            Assert.AreEqual(expectedMessages.Length, recorded.Count);
            for (int i = 0; i < expectedMessages.Length; i++)
            {
                Assert.AreEqual(expectedMessages[i], recorded[i].ToString());
            }
        }
Пример #5
0
        public void tracksErrorsWhenRequested()
        {
            // Messages the test expects, in the order the parser records them. The tracking
            // limit of 500 is far above the five errors this input is expected to produce.
            string[] expectedMessages =
            {
                "20: Attributes incorrectly present on end tag",
                "35: Unexpected token [Doctype] when in state [InBody]",
                "36: Invalid character reference: Invalid named referenece 'arrgh'",
                "50: Self closing flag not acknowledged",
                "61: Unexpectedly reached end of file (EOF) in input state [TagName]"
            };

            const string malformedHtml = "<p>One</p href='no'><!DOCTYPE html>&arrgh;<font /><br /><foo";

            NSoup.Parse.Parser trackingParser = NSoup.Parse.Parser.HtmlParser().SetTrackErrors(500);

            // Only the recorded errors are inspected, so the returned Document is discarded.
            NSoupClient.Parse(malformedHtml, "http://example.com", trackingParser);

            List<ParseError> recorded = trackingParser.GetErrors();

            Assert.AreEqual(expectedMessages.Length, recorded.Count);
            for (int i = 0; i < expectedMessages.Length; i++)
            {
                Assert.AreEqual(expectedMessages[i], recorded[i].ToString());
            }
        }
Пример #6
0
 /// <summary>
 /// Parses an HTML string into a Document by delegating to the supplied Parser. Passing a different
 /// parser (for example a simple XML, non-HTML parser) changes how the input is interpreted.
 /// </summary>
 /// <param name="html">Markup to parse.</param>
 /// <param name="baseUri">URL the markup was retrieved from; handed to the parser so relative URLs that occur before any &lt;base href&gt; tag can be resolved to absolute ones.</param>
 /// <param name="parser">Parser whose ParseInput method performs the parse.</param>
 /// <returns>The Document returned by <paramref name="parser"/>.</returns>
 public static Document Parse(string html, string baseUri, Parser parser)
 {
     Document parsed = parser.ParseInput(html, baseUri);

     return parsed;
 }
Пример #7
0
 /// <summary>
 /// Reads an input stream and parses its contents into a Document by delegating to DataUtil.Load.
 /// An alternate parser, such as a simple XML (non-HTML) parser, can be supplied.
 /// </summary>
 /// <param name="input">Input stream to read. NOTE(review): the Load(Stream, string, string, Parser) implementation visible in this file closes the stream itself; confirm that is the method called here before relying on the stream staying open.</param>
 /// <param name="charsetName">(Optional) Character set of the stream contents. Set to null to determine it from an http-equiv meta tag, if
 /// present, or fall back to UTF-8 (which is often safe to do).</param>
 /// <param name="baseUri">The URL where the HTML was retrieved from, to resolve relative links against.</param>
 /// <param name="parser">Alternate parser to use.</param>
 /// <returns>The Document produced by DataUtil.Load.</returns>
 /// <exception cref="IOException">If the stream could not be read, or if the charsetName is invalid.</exception>
 public static Document Parse(Stream input, string charsetName, string baseUri, Parser parser)
 {
     Document loaded = DataUtil.Load(input, charsetName, baseUri, parser);

     return loaded;
 }