/// <summary>
/// Given a Stream, create a new IDomDocument from the input.
/// </summary>
///
/// <remarks>
/// The first <c>preprocessorBlockBytes</c> bytes of the input are cached in memory so that parsing can
/// be restarted from the beginning if the tokenizer requests a re-encode (ReEncodeAction.ReEncode).
/// A byte-order mark, when present, is stripped before the first parsing pass.
/// </remarks>
///
/// <exception cref="InvalidDataException">
/// Not thrown directly by this method; presumably may propagate from tokenization -- TODO confirm.
/// </exception>
/// <exception cref="InvalidOperationException">
/// Not thrown directly by this method; presumably may propagate from tokenization -- TODO confirm.
/// </exception>
///
/// <param name="inputStream">
/// The HTML input.
/// </param>
/// <param name="encoding">
/// The encoding declared by the caller, or null if unknown. When null and no BOM supplies one,
/// UTF-8 is used for the first pass.
/// </param>
///
/// <returns>
/// A populated IDomDocument, or an empty DomFragment when the input contains no bytes.
/// </returns>
public IDomDocument Parse(Stream inputStream, Encoding encoding)
{
    ActiveStream = inputStream;
    ActiveEncoding = encoding;

    // Split into two streams so we can restart if needed without having to re-parse the
    // entire stream. Stream.Read may return fewer bytes than requested before the end of
    // the stream, so keep reading until the block is full or the stream is exhausted.
    byte[] part1bytes = new byte[preprocessorBlockBytes];
    int part1size = 0;
    while (part1size < preprocessorBlockBytes)
    {
        int bytesRead = inputStream.Read(part1bytes, part1size, preprocessorBlockBytes - part1size);
        if (bytesRead <= 0)
        {
            break;
        }
        part1size += bytesRead;
    }

    MemoryStream part1stream = new MemoryStream(part1bytes, 0, part1size);

    if (part1stream.Length == 0)
    {
        return new DomFragment();
    }

    // Create a combined stream from the pre-fetched part, and the remainder (whose position
    // will be wherever it was left after reading the part 1 block).
    Stream stream;

    // The official order of precedence for character set processing is as follows:
    //
    // HTTP Content-Type header
    // byte-order mark (BOM)
    // XML declaration
    // meta element
    // link charset attribute
    //
    // http://www.w3.org/International/questions/qa-html-encoding-declarations#precedence
    //
    // Chrome does this:
    //
    // A UTF-16 or UTF-8 BOM overrides the HTTP declaration for Internet Explorer, Safari and Chrome browsers.
    //
    // We act like chrome.
    var bomReader = new BOMReader(part1stream);

    if (bomReader.IsBOM)
    {
        // If there is a BOM encoding, and there's either no active encoding specified already,
        // or the BOM is utf-8/utf-16, then use it.
        // NOTE(review): the standard big-endian UTF-16 encoding reports WebName "utf-16BE", which
        // would not match here -- confirm what BOMReader returns for a big-endian BOM.
        var bomEncoding = bomReader.Encoding;

        bool bomIsUnicode = bomEncoding != null &&
            (bomEncoding.WebName == "utf-8" || bomEncoding.WebName == "utf-16");

        if (ActiveEncoding == null || bomIsUnicode)
        {
            ActiveEncoding = bomEncoding;
        }

        // Either way strip the BOM.
        stream = new CombinedStream(bomReader.StreamWithoutBOM, inputStream);
    }
    else
    {
        // No BOM, just reset the input stream.
        part1stream.Position = 0;
        stream = new CombinedStream(part1stream, inputStream);
    }

    ActiveStreamReader = new StreamReader(stream, ActiveEncoding ?? Encoding.UTF8, false);

    if (HtmlParsingMode == HtmlParsingMode.Auto ||
        ((HtmlParsingMode == HtmlParsingMode.Fragment) && String.IsNullOrEmpty(FragmentContext)))
    {
        string ctx;
        ActiveStreamReader = GetContextFromStream(ActiveStreamReader, out ctx);

        if (HtmlParsingMode == HtmlParsingMode.Auto)
        {
            switch (ctx)
            {
                case "document":
                    HtmlParsingMode = HtmlParsingMode.Document;
                    ctx = "";
                    break;
                case "html":
                    HtmlParsingMode = HtmlParsingMode.Content;
                    break;
                default:
                    HtmlParsingMode = HtmlParsingMode.Fragment;
                    HtmlParsingOptions = HtmlParsingOptions.AllowSelfClosingTags;
                    break;
            }
        }

        if (HtmlParsingMode == HtmlParsingMode.Fragment)
        {
            FragmentContext = ctx;
        }
    }

    Reset();
    Tokenize();

    // If the character set was declared within the first block
    if (ReEncode == ReEncodeAction.ReEncode)
    {
        AlreadyReEncoded = true;

        if (ActiveStreamOffset >= preprocessorBlockBytes)
        {
            // This should never happen, since we test this when accepting an alternate encoding and should
            // have already decided to change the encoding midstream instead of restart. But as a failsafe
            // in case there's some part of the parser abort sequence I don't understand, just switch
            // midstream if we end up here for some reason to keep things going.
            ActiveStreamReader = new StreamReader(ActiveStream, ActiveEncoding);
        }
        else
        {
            // Replay only the bytes that were actually read; wrapping the whole buffer would
            // append the unused (zero) tail of the block to short documents.
            part1stream = new MemoryStream(part1bytes, 0, part1size);

            // If the 2nd stream has already been closed, then the whole thing is less than the
            // preprocessor block size; just restart the cached stream.
            if (inputStream.CanRead)
            {
                stream = new CombinedStream(part1stream, inputStream);
            }
            else
            {
                stream = part1stream;
            }

            // Assign the re-mapped stream to the source and start again.
            ActiveStreamReader = new StreamReader(stream, ActiveEncoding);
        }

        Reset();
        Tokenize();
    }

    // Set this before returning document to the client to improve performance during DOM alteration.
    IDomIndexQueue indexQueue = treeBuilder.Document.DocumentIndex as IDomIndexQueue;
    if (indexQueue != null)
    {
        indexQueue.QueueChanges = true;
    }

    return treeBuilder.Document;
}
/// <summary>
/// Given a Stream, create a new IDomDocument from the input.
/// </summary>
///
/// <remarks>
/// The first <c>preprocessorBlockSize</c> bytes of the input are cached in memory so that parsing can
/// be restarted from the beginning when the tokenizer sets the <c>reEncode</c> flag.
/// </remarks>
///
/// <exception cref="InvalidDataException">
/// Thrown when a re-encode is requested but the underlying stream has already advanced beyond the
/// cached first block, so the input cannot be replayed.
/// </exception>
/// <exception cref="InvalidOperationException">
/// Thrown when the <c>reEncode</c> flag is still set after the final parsing pass.
/// </exception>
///
/// <param name="html">
/// The HTML input.
/// </param>
/// <param name="encoding">
/// The encoding, or null to let the reader detect it from byte-order marks.
/// </param>
///
/// <returns>
/// A populated IDomDocument, or an empty DomFragment when the input contains no bytes.
/// </returns>
public IDomDocument Parse(Stream html, Encoding encoding)
{
    // Split into two streams so we can restart if needed without having to re-parse the
    // entire stream. Stream.Read may return fewer bytes than requested before the end of
    // the stream, so keep reading until the block is full or the stream is exhausted.
    byte[] part1bytes = new byte[preprocessorBlockSize];
    int part1size = 0;
    while (part1size < preprocessorBlockSize)
    {
        int bytesRead = html.Read(part1bytes, part1size, preprocessorBlockSize - part1size);
        if (bytesRead <= 0)
        {
            break;
        }
        part1size += bytesRead;
    }

    // Expose only the bytes actually read. Wrapping the whole buffer would make the empty-input
    // check below unreachable and would pad short documents with the unused (zero) tail.
    MemoryStream part1stream = new MemoryStream(part1bytes, 0, part1size);

    if (part1stream.Length == 0)
    {
        return new DomFragment();
    }

    // Create a combined stream from the pre-fetched part, and the remainder (whose position
    // will be wherever it was left after reading the part 1 block).
    Stream stream = new CombinedStream(part1stream, html);

    TextReader source;
    if (encoding == null)
    {
        source = new StreamReader(stream, true);
    }
    else
    {
        source = new StreamReader(stream, encoding);
    }

    charSetEncoding = ((StreamReader)source).CurrentEncoding;

    if (HtmlParsingMode == HtmlParsingMode.Auto ||
        ((HtmlParsingMode == HtmlParsingMode.Fragment) && String.IsNullOrEmpty(FragmentContext)))
    {
        string ctx;
        source = GetContextFromStream(source, out ctx);

        if (HtmlParsingMode == HtmlParsingMode.Auto)
        {
            switch (ctx)
            {
                case "document":
                    HtmlParsingMode = HtmlParsingMode.Document;
                    ctx = "";
                    break;
                case "html":
                    HtmlParsingMode = HtmlParsingMode.Content;
                    break;
                default:
                    HtmlParsingMode = HtmlParsingMode.Fragment;
                    HtmlParsingOptions = HtmlParsingOptions.AllowSelfClosingTags;
                    break;
            }
        }

        if (HtmlParsingMode == HtmlParsingMode.Fragment)
        {
            FragmentContext = ctx;
        }
    }

    Reset();
    Tokenize(source);

    if (reEncode)
    {
        // When this happens, the 2nd stream should still be at position zero (it should not have
        // advanced beyond the first block), since the charset declaration must occur within it.
        // Position is only meaningful on seekable streams; reading it on a non-seekable stream
        // throws NotSupportedException, so the check is skipped there.
        if (part1size == preprocessorBlockSize && html.CanRead && html.CanSeek && html.Position > preprocessorBlockSize)
        {
            throw new InvalidDataException(
                String.Format("The document contained a meta http-equiv Content-Type header after the first {0} bytes. It cannot be parsed.", preprocessorBlockSize)
            );
        }

        part1stream = new MemoryStream(part1bytes, 0, part1size);

        // If the 2nd stream has already been closed, then the whole thing is less than the block size.
        if (html.CanRead)
        {
            stream = new CombinedStream(part1stream, html);
        }
        else
        {
            stream = part1stream;
        }

        // Decode the raw bytes again using the newly declared encoding. Decoding with the old
        // encoding and then re-encoding/decoding with the new one would just round-trip the same
        // (mis-decoded) characters, so the original bytes must be reinterpreted directly.
        source = new StreamReader(stream, charSetEncoding);

        Reset();
        Tokenize(source);
    }

    if (reEncode)
    {
        throw new InvalidOperationException("The character set encoding changed twice, something seems to be wrong.");
    }

    return treeBuilder.Document;
}
/// <summary>
/// Given a Stream, create a new IDomDocument from the input.
/// </summary>
///
/// <remarks>
/// The first <c>preprocessorBlockSize</c> bytes of the input are cached in memory so that parsing can
/// be restarted from the beginning if the tokenizer requests a re-encode (ReEncodeAction.ReEncode).
/// </remarks>
///
/// <exception cref="InvalidDataException">
/// Not thrown directly by this method; presumably may propagate from tokenization -- TODO confirm.
/// </exception>
/// <exception cref="InvalidOperationException">
/// Thrown when ReEncode is anything other than ReEncodeAction.None after the final parsing pass.
/// </exception>
///
/// <param name="inputStream">
/// The HTML input.
/// </param>
/// <param name="encoding">
/// The encoding, or null to let the reader detect it from byte-order marks.
/// </param>
///
/// <returns>
/// A populated IDomDocument, or an empty DomFragment when the input contains no bytes.
/// </returns>
public IDomDocument Parse(Stream inputStream, Encoding encoding)
{
    ActiveStream = inputStream;
    ActiveEncoding = encoding;

    // Split into two streams so we can restart if needed without having to re-parse the
    // entire stream. Stream.Read may return fewer bytes than requested before the end of
    // the stream, so keep reading until the block is full or the stream is exhausted.
    byte[] part1bytes = new byte[preprocessorBlockSize];
    int part1size = 0;
    while (part1size < preprocessorBlockSize)
    {
        int bytesRead = inputStream.Read(part1bytes, part1size, preprocessorBlockSize - part1size);
        if (bytesRead <= 0)
        {
            break;
        }
        part1size += bytesRead;
    }

    // Expose only the bytes actually read. Wrapping the whole buffer would make the empty-input
    // check below unreachable and would pad short documents with the unused (zero) tail.
    MemoryStream part1stream = new MemoryStream(part1bytes, 0, part1size);

    if (part1stream.Length == 0)
    {
        return new DomFragment();
    }

    // Create a combined stream from the pre-fetched part, and the remainder (whose position
    // will be wherever it was left after reading the part 1 block).
    Stream stream = new CombinedStream(part1stream, inputStream);

    if (ActiveEncoding == null)
    {
        ActiveStreamReader = new StreamReader(stream, true);
    }
    else
    {
        ActiveStreamReader = new StreamReader(stream, encoding, false);
    }

    ActiveEncoding = ((StreamReader)ActiveStreamReader).CurrentEncoding;

    if (HtmlParsingMode == HtmlParsingMode.Auto ||
        ((HtmlParsingMode == HtmlParsingMode.Fragment) && String.IsNullOrEmpty(FragmentContext)))
    {
        string ctx;
        ActiveStreamReader = GetContextFromStream(ActiveStreamReader, out ctx);

        if (HtmlParsingMode == HtmlParsingMode.Auto)
        {
            switch (ctx)
            {
                case "document":
                    HtmlParsingMode = HtmlParsingMode.Document;
                    ctx = "";
                    break;
                case "html":
                    HtmlParsingMode = HtmlParsingMode.Content;
                    break;
                default:
                    HtmlParsingMode = HtmlParsingMode.Fragment;
                    HtmlParsingOptions = HtmlParsingOptions.AllowSelfClosingTags;
                    break;
            }
        }

        if (HtmlParsingMode == HtmlParsingMode.Fragment)
        {
            FragmentContext = ctx;
        }
    }

    Reset();
    Tokenize();

    if (ReEncode == ReEncodeAction.ReEncode)
    {
        AlreadyReEncoded = true;

        if (ActiveStreamOffset >= preprocessorBlockSize)
        {
            // This should never happen, since we test this when accepting an alternate encoding and should
            // have already decided to change the encoding midstream instead of restart. But as a failsafe
            // in case there's some part of the parser abort sequence I don't understand, just switch
            // midstream if we end up here for some reason to keep things going.
            ActiveStreamReader = new StreamReader(ActiveStream, ActiveEncoding);
        }
        else
        {
            // Replay only the bytes that were actually read.
            part1stream = new MemoryStream(part1bytes, 0, part1size);

            // If the 2nd stream has already been closed, then the whole thing is less than the
            // preprocessor block size; just restart the cached stream.
            if (inputStream.CanRead)
            {
                stream = new CombinedStream(part1stream, inputStream);
            }
            else
            {
                stream = part1stream;
            }

            // Re-read the raw bytes with the new encoding and start again.
            ActiveStreamReader = new StreamReader(stream, ActiveEncoding);
        }

        Reset();
        Tokenize();
    }

    if (ReEncode != ReEncodeAction.None)
    {
        throw new InvalidOperationException("The character set encoding changed twice, something seems to be wrong.");
    }

    return treeBuilder.Document;
}