/// <summary>
/// Parses a Document from an input stream, using the provided Parser.
/// </summary>
/// <remarks>
/// The stream is read fully into memory, parsed, and then closed by this method.
/// It is closed even when reading or parsing throws, so callers do not need to close it again.
/// </remarks>
/// <param name="input">Input stream to parse. It is closed before this method returns or throws.</param>
/// <param name="charsetName">Character set of input; passed through unchanged to <c>ParseByteData</c>.</param>
/// <param name="baseUri">Base URI of document, to resolve relative links against</param>
/// <param name="parser">Alternate parser to use</param>
/// <returns>The document produced by <c>ParseByteData</c> from the stream's bytes.</returns>
public static Document Load(Stream input, string charsetName, string baseUri, Parser parser)
{
    try
    {
        byte[] data = ReadToByteBuffer(input);
        return ParseByteData(data, charsetName, baseUri, parser);
    }
    finally
    {
        // Release the stream on every path; previously a failure while reading or parsing left it open.
        // The null guard keeps the original failure visible instead of masking it with a second exception.
        if (input != null)
        {
            input.Close();
        }
    }
}
/// <summary>
/// A parser created without error tracking reports no errors, even for malformed markup.
/// </summary>
public void noErrorsByDefault()
{
    const string malformedMarkup = "<p>One</p href='no'>&arrgh;<font /><br /><foo";
    NSoup.Parse.Parser defaultParser = NSoup.Parse.Parser.HtmlParser();

    // Only the parser's error list is inspected afterwards, so the returned document is discarded.
    NSoupClient.Parse(malformedMarkup, "http://example.com", defaultParser);

    List<ParseError> recorded = defaultParser.GetErrors();
    Assert.AreEqual(0, recorded.Count);
}
/// <summary>
/// Decodes raw document bytes to text and parses the text into a Document.
/// The bytes are buffered first and decoded afterwards so the charset can be switched
/// when a meta http-equiv (or HTML5 meta charset) tag declares one.
/// </summary>
/// <param name="data">Raw bytes of the document.</param>
/// <param name="charsetName">
/// Character set to decode with. Pass null to decode with <c>_defaultEncoding</c> and then
/// look for a charset declared in a meta tag.
/// </param>
/// <param name="baseUri">Base URI handed to the parser.</param>
/// <param name="parser">Parser used to build the document.</param>
/// <returns>The parsed document.</returns>
/// <exception cref="ArgumentException">
/// <paramref name="charsetName"/> is an empty string, or names an encoding that
/// <c>Encoding.GetEncoding</c> does not recognise.
/// </exception>
public static Document ParseByteData(byte[] data, string charsetName, string baseUri, Parser parser)
{
    var docData = string.Empty;
    Document doc = null;

    if (charsetName == null)
    {
        // determine from meta. safe parse with the default encoding first
        // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
        docData = _defaultEncoding.GetString(data);
        doc = parser.ParseInput(docData, baseUri);
        Element meta = doc.Select("meta[http-equiv=content-type], meta[charset]").FirstOrDefault();

        if (meta != null)
        {
            // if no usable charset is found, keep the default-encoding parse as best attempt
            string foundCharset = meta.HasAttr("http-equiv")
                ? GetCharsetFromContentType(meta.Attr("content"))
                : meta.Attr("charset");

            if (foundCharset != null && foundCharset.Length != 0 && !foundCharset.Equals(_defaultEncoding.WebName.ToUpperInvariant()))
            {
                try
                {
                    // need to re-decode; commit the switch only once the encoding lookup has succeeded
                    string redecoded = Encoding.GetEncoding(foundCharset).GetString(data);
                    charsetName = foundCharset;
                    docData = redecoded;
                    doc = null;
                }
                catch (ArgumentException)
                {
                    // The charset name comes from the document itself and may be bogus or unsupported.
                    // Deliberate best effort: keep the document already parsed with the default encoding.
                }
            }
        }
    }
    else
    {
        // specified by content type header (or by user on file load)
        if (string.IsNullOrEmpty(charsetName))
        {
            throw new ArgumentException("Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
        }

        docData = Encoding.GetEncoding(charsetName).GetString(data);
    }

    if (doc == null)
    {
        // there are times where there is a spurious byte-order-mark at the start of the text. Shouldn't be present
        // in utf-8. If after decoding, there is a BOM (U+FEFF, decimal 65279), strip it; otherwise will cause the
        // parser to go straight into head mode
        if (docData.Length > 0 && docData[0] == 65279)
        {
            docData = docData.Substring(1);
        }

        doc = parser.ParseInput(docData, baseUri);
        doc.OutputSettings().SetEncoding(charsetName);
    }

    return doc;
}
/// <summary>
/// With error tracking capped at three, exactly three parse errors are recorded,
/// and they carry the expected position-prefixed messages.
/// </summary>
public void tracksLimitedErrorsWhenRequested()
{
    const string malformedMarkup = "<p>One</p href='no'><!DOCTYPE html>&arrgh;<font /><br /><foo";
    string[] expectedMessages =
    {
        "20: Attributes incorrectly present on end tag",
        "35: Unexpected token [Doctype] when in state [InBody]",
        "36: Invalid character reference: Invalid named referenece 'arrgh'"
    };

    NSoup.Parse.Parser cappedParser = NSoup.Parse.Parser.HtmlParser().SetTrackErrors(3);

    // Only the parser's error list is inspected afterwards, so the returned document is discarded.
    cappedParser.ParseInput(malformedMarkup, "http://example.com");

    List<ParseError> recorded = cappedParser.GetErrors();
    Assert.AreEqual(3, recorded.Count);

    for (int i = 0; i < expectedMessages.Length; i++)
    {
        Assert.AreEqual(expectedMessages[i], recorded[i].ToString());
    }
}
/// <summary>
/// With a generous tracking limit (500), all five parse errors in the malformed sample
/// are recorded in order, with the expected position-prefixed messages.
/// </summary>
public void tracksErrorsWhenRequested()
{
    const string malformedMarkup = "<p>One</p href='no'><!DOCTYPE html>&arrgh;<font /><br /><foo";
    string[] expectedMessages =
    {
        "20: Attributes incorrectly present on end tag",
        "35: Unexpected token [Doctype] when in state [InBody]",
        "36: Invalid character reference: Invalid named referenece 'arrgh'",
        "50: Self closing flag not acknowledged",
        "61: Unexpectedly reached end of file (EOF) in input state [TagName]"
    };

    NSoup.Parse.Parser trackingParser = NSoup.Parse.Parser.HtmlParser().SetTrackErrors(500);

    // Only the parser's error list is inspected afterwards, so the returned document is discarded.
    NSoupClient.Parse(malformedMarkup, "http://example.com", trackingParser);

    List<ParseError> recorded = trackingParser.GetErrors();
    Assert.AreEqual(5, recorded.Count);

    for (int i = 0; i < expectedMessages.Length; i++)
    {
        Assert.AreEqual(expectedMessages[i], recorded[i].ToString());
    }
}
/// <summary>
/// Parses an HTML string into a Document by delegating to the supplied parser.
/// Supplying a different parser (for example a simple XML, non-HTML parser) changes how the input is interpreted.
/// </summary>
/// <param name="html">Markup to parse.</param>
/// <param name="baseUri">
/// The URL the markup was retrieved from. Used to resolve relative URLs to absolute URLs
/// that occur before the markup declares a &lt;base href&gt; tag.
/// </param>
/// <param name="parser">Parser that performs the actual parsing.</param>
/// <returns>The document returned by the parser's <c>ParseInput</c>.</returns>
public static Document Parse(string html, string baseUri, Parser parser)
{
    Document parsed = parser.ParseInput(html, baseUri);
    return parsed;
}
/// <summary>
/// Reads an input stream and parses its contents into a Document by delegating to <c>DataUtil.Load</c>.
/// Supplying a different parser (for example a simple XML, non-HTML parser) changes how the input is interpreted.
/// </summary>
/// <param name="input">
/// Input stream to read.
/// NOTE(review): the original documentation asks the caller to close the stream after parsing;
/// confirm whether <c>DataUtil.Load</c> already closes it.
/// </param>
/// <param name="charsetName">
/// (Optional) Character set of the contents. Set to null to determine it from an http-equiv meta tag,
/// if present, or fall back to UTF-8 (which is often safe to do).
/// </param>
/// <param name="baseUri">The URL the markup was retrieved from, to resolve relative links against.</param>
/// <param name="parser">Parser that performs the actual parsing.</param>
/// <returns>The document returned by <c>DataUtil.Load</c>.</returns>
/// <exception cref="IOException">
/// Documented by the original author for unreadable input or an invalid charsetName.
/// NOTE(review): the exact exception types depend on <c>DataUtil.Load</c> — confirm.
/// </exception>
public static Document Parse(Stream input, string charsetName, string baseUri, Parser parser)
{
    Document loaded = DataUtil.Load(input, charsetName, baseUri, parser);
    return loaded;
}