/// <summary>
/// Given a Stream of HTML, create a new IDomDocument from the input.
/// </summary>
///
/// <remarks>
/// The first <c>preprocessorBlockBytes</c> bytes of the input are cached in memory so that
/// parsing can be restarted from the beginning if the tokenizer requests a different
/// character encoding (<c>ReEncode == ReEncodeAction.ReEncode</c>) after the first pass.
/// A byte-order mark found in the cached block is stripped; it replaces the caller's
/// encoding when no encoding was supplied or when the BOM identifies utf-8 or utf-16.
/// </remarks>
///
/// <exception cref="InvalidDataException">
/// Thrown when an invalid data error condition occurs. Not thrown directly by this method;
/// presumably propagated from tokenization (TODO confirm).
/// </exception>
/// <exception cref="InvalidOperationException">
/// Thrown when the requested operation is invalid. Not thrown directly by this method;
/// presumably propagated from tokenization (TODO confirm).
/// </exception>
///
/// <param name="inputStream">
/// The HTML input.
/// </param>
/// <param name="encoding">
/// The encoding to use, or null to fall back to the BOM encoding (if any) and then UTF-8.
/// </param>
///
/// <returns>
/// A populated IDomDocument, or an empty DomFragment when the input contains no bytes.
/// </returns>
public IDomDocument Parse(Stream inputStream, Encoding encoding)
{
    ActiveStream = inputStream;
    ActiveEncoding = encoding;

    // Split the input into two streams so we can restart if needed without having to
    // re-read the entire stream. Stream.Read may return fewer bytes than requested even
    // when more data is available, so keep reading until the block is full or the
    // stream is exhausted; otherwise part of the first block would not be cached.
    byte[] part1bytes = new byte[preprocessorBlockBytes];
    int part1size = 0;
    while (part1size < preprocessorBlockBytes)
    {
        int bytesRead = inputStream.Read(part1bytes, part1size, preprocessorBlockBytes - part1size);
        if (bytesRead <= 0)
        {
            break;
        }
        part1size += bytesRead;
    }

    // Expose only the bytes actually read, not the zero-filled remainder of the buffer.
    MemoryStream part1stream = new MemoryStream(part1bytes, 0, part1size);

    if (part1stream.Length == 0)
    {
        return new DomFragment();
    }

    // A combined stream is created from the pre-fetched part and the remainder (whose
    // position will be wherever it was left after reading the part 1 block).
    Stream stream;

    // The official order of precedence for character set processing is as follows:
    //
    // HTTP Content-Type header
    // byte-order mark (BOM)
    // XML declaration
    // meta element
    // link charset attribute
    //
    // http://www.w3.org/International/questions/qa-html-encoding-declarations#precedence
    //
    // Chrome does this:
    //
    // A UTF-16 or UTF-8 BOM overrides the HTTP declaration for Internet Explorer, Safari
    // and Chrome browsers.
    //
    // We act like chrome.
    var bomReader = new BOMReader(part1stream);

    if (bomReader.IsBOM)
    {
        // If there is a BOM encoding, and there's either no active encoding specified
        // already, or the BOM is utf-8/utf-16, then use it.
        var bomEncoding = bomReader.Encoding;

        if (ActiveEncoding == null ||
            (bomEncoding != null &&
                (bomEncoding.WebName == "utf-8" || bomEncoding.WebName == "utf-16")))
        {
            ActiveEncoding = bomEncoding;
        }

        // Either way, strip the BOM.
        stream = new CombinedStream(bomReader.StreamWithoutBOM, inputStream);
    }
    else
    {
        // No BOM, just rewind the cached block.
        part1stream.Position = 0;
        stream = new CombinedStream(part1stream, inputStream);
    }

    // BOM detection is disabled here because the BOM has already been handled above.
    ActiveStreamReader = new StreamReader(stream, ActiveEncoding ?? Encoding.UTF8, false);

    if (HtmlParsingMode == HtmlParsingMode.Auto ||
        ((HtmlParsingMode == HtmlParsingMode.Fragment) && String.IsNullOrEmpty(FragmentContext)))
    {
        string ctx;
        ActiveStreamReader = GetContextFromStream(ActiveStreamReader, out ctx);

        if (HtmlParsingMode == HtmlParsingMode.Auto)
        {
            // Resolve Auto into a concrete parsing mode based on the detected context.
            switch (ctx)
            {
                case "document":
                    HtmlParsingMode = HtmlParsingMode.Document;
                    ctx = "";
                    break;
                case "html":
                    HtmlParsingMode = HtmlParsingMode.Content;
                    break;
                default:
                    HtmlParsingMode = HtmlParsingMode.Fragment;
                    HtmlParsingOptions = HtmlParsingOptions.AllowSelfClosingTags;
                    break;
            }
        }

        if (HtmlParsingMode == HtmlParsingMode.Fragment)
        {
            FragmentContext = ctx;
        }
    }

    Reset();
    Tokenize();

    // If the character set was declared within the first block, parse again with it.
    if (ReEncode == ReEncodeAction.ReEncode)
    {
        AlreadyReEncoded = true;

        if (ActiveStreamOffset >= preprocessorBlockBytes)
        {
            // This should never happen, since we test this when accepting an alternate
            // encoding and should have already decided to change the encoding midstream
            // instead of restarting. As a failsafe, just switch midstream if we end up
            // here for some reason to keep things going.
            ActiveStreamReader = new StreamReader(ActiveStream, ActiveEncoding);
        }
        else
        {
            // Replay only the bytes that were actually read; wrapping the whole buffer
            // would append NUL padding to inputs shorter than the pre-fetch block.
            part1stream = new MemoryStream(part1bytes, 0, part1size);

            // If the 2nd stream has already been closed, then the whole thing is less
            // than the preprocessor block size; just restart the cached stream.
            if (inputStream.CanRead)
            {
                stream = new CombinedStream(part1stream, inputStream);
            }
            else
            {
                stream = part1stream;
            }

            // Assign the re-mapped stream to the source and start again.
            // NOTE(review): the replayed bytes still include any BOM; this StreamReader
            // overload leaves BOM detection enabled - confirm that is the intent.
            ActiveStreamReader = new StreamReader(stream, ActiveEncoding);
        }

        Reset();
        Tokenize();
    }

    // Set this before returning the document to the client to improve performance
    // during DOM alteration.
    IDomIndexQueue indexQueue = treeBuilder.Document.DocumentIndex as IDomIndexQueue;
    if (indexQueue != null)
    {
        indexQueue.QueueChanges = true;
    }

    return treeBuilder.Document;
}
/// <summary>
/// Given a Stream of HTML, create a new IDomDocument from the input.
/// </summary>
///
/// <remarks>
/// The first <c>preprocessorBlockBytes</c> bytes of the input are cached in memory so that
/// parsing can be restarted from the beginning if the tokenizer requests a different
/// character encoding (<c>ReEncode == ReEncodeAction.ReEncode</c>) after the first pass.
/// A byte-order mark found in the cached block is stripped; it replaces the caller's
/// encoding when no encoding was supplied or when the BOM identifies utf-8 or utf-16.
/// </remarks>
///
/// <exception cref="InvalidDataException">
/// Thrown when an invalid data error condition occurs. Not thrown directly by this method;
/// presumably propagated from tokenization (TODO confirm).
/// </exception>
/// <exception cref="InvalidOperationException">
/// Thrown when the requested operation is invalid. Not thrown directly by this method;
/// presumably propagated from tokenization (TODO confirm).
/// </exception>
///
/// <param name="inputStream">
/// The HTML input.
/// </param>
/// <param name="encoding">
/// The encoding to use, or null to fall back to the BOM encoding (if any) and then UTF-8.
/// </param>
///
/// <returns>
/// A populated IDomDocument, or an empty DomFragment when the input contains no bytes.
/// </returns>
public IDomDocument Parse(Stream inputStream, Encoding encoding)
{
    ActiveStream = inputStream;
    ActiveEncoding = encoding;

    // Split the input into two streams so we can restart if needed without having to
    // re-read the entire stream. Stream.Read may return fewer bytes than requested even
    // when more data is available, so keep reading until the block is full or the
    // stream is exhausted; otherwise part of the first block would not be cached.
    byte[] part1bytes = new byte[preprocessorBlockBytes];
    int part1size = 0;
    while (part1size < preprocessorBlockBytes)
    {
        int bytesRead = inputStream.Read(part1bytes, part1size, preprocessorBlockBytes - part1size);
        if (bytesRead <= 0)
        {
            break;
        }
        part1size += bytesRead;
    }

    // Expose only the bytes actually read. Wrapping the whole buffer would make the
    // stream length always equal the block size, so the empty-input check below could
    // never succeed and short inputs would be parsed with trailing NUL padding.
    MemoryStream part1stream = new MemoryStream(part1bytes, 0, part1size);

    if (part1stream.Length == 0)
    {
        return new DomFragment();
    }

    // A combined stream is created from the pre-fetched part and the remainder (whose
    // position will be wherever it was left after reading the part 1 block).
    Stream stream;

    // The official order of precedence for character set processing is as follows:
    //
    // HTTP Content-Type header
    // byte-order mark (BOM)
    // XML declaration
    // meta element
    // link charset attribute
    //
    // http://www.w3.org/International/questions/qa-html-encoding-declarations#precedence
    //
    // Chrome does this:
    //
    // A UTF-16 or UTF-8 BOM overrides the HTTP declaration for Internet Explorer, Safari
    // and Chrome browsers.
    //
    // We act like chrome.
    var bomReader = new BOMReader(part1stream);

    if (bomReader.IsBOM)
    {
        // If there is a BOM encoding, and there's either no active encoding specified
        // already, or the BOM is utf-8/utf-16, then use it.
        var bomEncoding = bomReader.Encoding;

        if (ActiveEncoding == null ||
            (bomEncoding != null &&
                (bomEncoding.WebName == "utf-8" || bomEncoding.WebName == "utf-16")))
        {
            ActiveEncoding = bomEncoding;
        }

        // Either way, strip the BOM.
        stream = new CombinedStream(bomReader.StreamWithoutBOM, inputStream);
    }
    else
    {
        // No BOM, just rewind the cached block.
        part1stream.Position = 0;
        stream = new CombinedStream(part1stream, inputStream);
    }

    // BOM detection is disabled here because the BOM has already been handled above.
    ActiveStreamReader = new StreamReader(stream, ActiveEncoding ?? Encoding.UTF8, false);

    if (HtmlParsingMode == HtmlParsingMode.Auto ||
        ((HtmlParsingMode == HtmlParsingMode.Fragment) && String.IsNullOrEmpty(FragmentContext)))
    {
        string ctx;
        ActiveStreamReader = GetContextFromStream(ActiveStreamReader, out ctx);

        if (HtmlParsingMode == HtmlParsingMode.Auto)
        {
            // Resolve Auto into a concrete parsing mode based on the detected context.
            switch (ctx)
            {
                case "document":
                    HtmlParsingMode = HtmlParsingMode.Document;
                    ctx = "";
                    break;
                case "html":
                    HtmlParsingMode = HtmlParsingMode.Content;
                    break;
                default:
                    HtmlParsingMode = HtmlParsingMode.Fragment;
                    HtmlParsingOptions = HtmlParsingOptions.AllowSelfClosingTags;
                    break;
            }
        }

        if (HtmlParsingMode == HtmlParsingMode.Fragment)
        {
            FragmentContext = ctx;
        }
    }

    Reset();
    Tokenize();

    // If the character set was declared within the first block, parse again with it.
    if (ReEncode == ReEncodeAction.ReEncode)
    {
        AlreadyReEncoded = true;

        if (ActiveStreamOffset >= preprocessorBlockBytes)
        {
            // This should never happen, since we test this when accepting an alternate
            // encoding and should have already decided to change the encoding midstream
            // instead of restarting. As a failsafe, just switch midstream if we end up
            // here for some reason to keep things going.
            ActiveStreamReader = new StreamReader(ActiveStream, ActiveEncoding);
        }
        else
        {
            // Replay only the bytes that were actually read; wrapping the whole buffer
            // would append NUL padding to inputs shorter than the pre-fetch block.
            part1stream = new MemoryStream(part1bytes, 0, part1size);

            // If the 2nd stream has already been closed, then the whole thing is less
            // than the preprocessor block size; just restart the cached stream.
            if (inputStream.CanRead)
            {
                stream = new CombinedStream(part1stream, inputStream);
            }
            else
            {
                stream = part1stream;
            }

            // Assign the re-mapped stream to the source and start again.
            // NOTE(review): the replayed bytes still include any BOM; this StreamReader
            // overload leaves BOM detection enabled - confirm that is the intent.
            ActiveStreamReader = new StreamReader(stream, ActiveEncoding);
        }

        Reset();
        Tokenize();
    }

    // Set this before returning the document to the client to improve performance
    // during DOM alteration.
    IDomIndexQueue indexQueue = treeBuilder.Document.DocumentIndex as IDomIndexQueue;
    if (indexQueue != null)
    {
        indexQueue.QueueChanges = true;
    }

    return treeBuilder.Document;
}