A class to parse and expose information about the byte order marks (BOM) for a stream.
Exemplo n.º 1
0
        /// <summary>
        /// Given a Stream, create a new IDomDocument from the input.
        /// </summary>
        ///
        /// <remarks>
        /// The first <c>preprocessorBlockBytes</c> bytes of the stream are cached in memory so that
        /// parsing can be restarted from the beginning with a different encoding when the tokenizer
        /// requests it (<c>ReEncode == ReEncodeAction.ReEncode</c>), without requiring the input
        /// stream to be seekable.
        /// </remarks>
        ///
        /// <exception cref="InvalidDataException">
        /// Thrown when an invalid data error condition occurs.
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// Thrown when the requested operation is invalid.
        /// </exception>
        ///
        /// <param name="inputStream">
        /// The HTML input.
        /// </param>
        /// <param name="encoding">
        /// The encoding. May be null, in which case a BOM encoding (if any) or UTF-8 is used
        /// for the initial pass.
        /// </param>
        ///
        /// <returns>
        /// A populated IDomDocument, or an empty DomFragment when the stream yields no bytes.
        /// </returns>

        public IDomDocument Parse(Stream inputStream, Encoding encoding)
        {
            ActiveStream = inputStream;
            ActiveEncoding = encoding;

            // split into two streams so we can restart if needed
            // without having to re-parse the entire stream.

            byte[] part1bytes = new byte[preprocessorBlockBytes];
            int part1size = 0;

            // Stream.Read is allowed to return fewer bytes than requested before the end of the
            // stream, so keep reading until the block is full or the stream reports end (0 bytes).
            // The restart logic below relies on the first block being fully cached.

            while (part1size < preprocessorBlockBytes)
            {
                int bytesRead = inputStream.Read(part1bytes, part1size, preprocessorBlockBytes - part1size);
                if (bytesRead == 0)
                {
                    break;
                }
                part1size += bytesRead;
            }

            if (part1size == 0)
            {
                return new DomFragment();
            }

            // only expose the bytes actually read, not the unused tail of the buffer

            MemoryStream part1stream = new MemoryStream(part1bytes, 0, part1size);

            // create a combined stream from the pre-fetched part, and the remainder (whose position
            // will be wherever it was left after reading the part 1 block).

            Stream stream;

            // The official order of precedence for character set processing is as follows:
            //
            // HTTP Content-Type header
            // byte-order mark (BOM)
            // XML declaration
            // meta element
            // link charset attribute
            //
            // http://www.w3.org/International/questions/qa-html-encoding-declarations#precedence
            //
            // Chrome does this:
            //
            // A UTF-16 or UTF-8 BOM overrides the HTTP declaration for Internet Explorer, Safari and Chrome browsers.
            //
            // We act like chrome.

            var bomReader = new BOMReader(part1stream);

            if (bomReader.IsBOM)
            {
                // if there is a BOM encoding, and there's either no active encoding specified already, or it's utf-8/utf-16
                // then use it.

                var bomEncoding = bomReader.Encoding;

                if (ActiveEncoding == null ||
                    (bomEncoding != null &&
                        (bomEncoding.WebName == "utf-8" || bomEncoding.WebName == "utf-16")))
                {
                    ActiveEncoding = bomEncoding;
                }

                // either way strip the BOM.

                stream = new CombinedStream(bomReader.StreamWithoutBOM, inputStream);
            }
            else
            {
                // no BOM, just reset the input stream

                part1stream.Position = 0;
                stream = new CombinedStream(part1stream, inputStream);
            }

            // BOM detection is disabled on the reader (third argument) because it was handled above.

            ActiveStreamReader = new StreamReader(stream, ActiveEncoding ?? Encoding.UTF8, false);

            if (HtmlParsingMode == HtmlParsingMode.Auto ||
                (HtmlParsingMode == HtmlParsingMode.Fragment
                    && String.IsNullOrEmpty(FragmentContext)))
            {
                string ctx;
                ActiveStreamReader = GetContextFromStream(ActiveStreamReader, out ctx);

                if (HtmlParsingMode == HtmlParsingMode.Auto)
                {
                    switch (ctx)
                    {
                        case "document":
                            HtmlParsingMode = HtmlParsingMode.Document;
                            ctx = "";
                            break;
                        case "html":
                            HtmlParsingMode = HtmlParsingMode.Content;
                            break;
                        default:
                            HtmlParsingMode = HtmlParsingMode.Fragment;
                            HtmlParsingOptions = HtmlParsingOptions.AllowSelfClosingTags;
                            break;
                    }
                }

                if (HtmlParsingMode == HtmlParsingMode.Fragment)
                {
                    FragmentContext = ctx;
                }
            }

            Reset();

            Tokenize();

            // If the character set was declared within the first block

            if (ReEncode == ReEncodeAction.ReEncode)
            {
                AlreadyReEncoded = true;

                if (ActiveStreamOffset >= preprocessorBlockBytes)
                {
                    // this should never happen, since we test this when accepting an alternate encoding and should
                    // have already decided to change the encoding midstream instead of restart. But as a failsafe
                    // in case there's some part of the parser abort sequence I don't understand, just switch
                    // midstream if we end up here for some reason to keep things going.

                    ActiveStreamReader = new StreamReader(ActiveStream, ActiveEncoding);
                }
                else
                {
                    // rebuild the cached block from only the bytes that were actually read; using the
                    // whole buffer would feed trailing zero bytes to the parser for short inputs.

                    part1stream = new MemoryStream(part1bytes, 0, part1size);

                    // if the 2nd stream has already been closed, then the whole thing is less than the
                    // preprocessor block size; just restart the cached stream..

                    if (inputStream.CanRead)
                    {
                        stream = new CombinedStream(part1stream, inputStream);
                    }
                    else
                    {
                        stream = part1stream;
                    }

                    // assign the re-mapped stream to the source and start again
                    ActiveStreamReader = new StreamReader(stream, ActiveEncoding);
                }

                Reset();
                Tokenize();
            }

            // set this before returning document to the client to improve performance during DOM alteration

            IDomIndexQueue indexQueue = treeBuilder.Document.DocumentIndex as IDomIndexQueue;
            if (indexQueue != null)
            {
                indexQueue.QueueChanges = true;
            }

            return treeBuilder.Document;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Given a Stream, create a new IDomDocument from the input.
        /// </summary>
        ///
        /// <remarks>
        /// The first <c>preprocessorBlockBytes</c> bytes of the stream are cached in memory so that
        /// parsing can be restarted from the beginning with a different encoding when the tokenizer
        /// requests it (<c>ReEncode == ReEncodeAction.ReEncode</c>), without requiring the input
        /// stream to be seekable.
        /// </remarks>
        ///
        /// <exception cref="InvalidDataException">
        /// Thrown when an invalid data error condition occurs.
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// Thrown when the requested operation is invalid.
        /// </exception>
        ///
        /// <param name="inputStream">
        /// The HTML input.
        /// </param>
        /// <param name="encoding">
        /// The encoding. May be null, in which case a BOM encoding (if any) or UTF-8 is used
        /// for the initial pass.
        /// </param>
        ///
        /// <returns>
        /// A populated IDomDocument, or an empty DomFragment when the stream yields no bytes.
        /// </returns>

        public IDomDocument Parse(Stream inputStream, Encoding encoding)
        {
            ActiveStream   = inputStream;
            ActiveEncoding = encoding;

            // split into two streams so we can restart if needed
            // without having to re-parse the entire stream.

            byte[] part1bytes = new byte[preprocessorBlockBytes];
            int    part1size  = 0;

            // Stream.Read is allowed to return fewer bytes than requested before the end of the
            // stream, so keep reading until the block is full or the stream reports end (0 bytes).
            // The restart logic below relies on the first block being fully cached.

            while (part1size < preprocessorBlockBytes)
            {
                int bytesRead = inputStream.Read(part1bytes, part1size, preprocessorBlockBytes - part1size);

                if (bytesRead == 0)
                {
                    break;
                }

                part1size += bytesRead;
            }

            if (part1size == 0)
            {
                return(new DomFragment());
            }

            // Wrap only the bytes actually read. Wrapping the whole buffer would make the stream
            // length always equal the block size (so the empty-input check could never succeed)
            // and would feed trailing zero bytes to the parser for short inputs.

            MemoryStream part1stream = new MemoryStream(part1bytes, 0, part1size);

            // create a combined stream from the pre-fetched part, and the remainder (whose position
            // will be wherever it was left after reading the part 1 block).

            Stream stream;

            // The official order of precedence for character set processing is as follows:
            //
            // HTTP Content-Type header
            // byte-order mark (BOM)
            // XML declaration
            // meta element
            // link charset attribute
            //
            // http://www.w3.org/International/questions/qa-html-encoding-declarations#precedence
            //
            // Chrome does this:
            //
            // A UTF-16 or UTF-8 BOM overrides the HTTP declaration for Internet Explorer, Safari and Chrome browsers.
            //
            // We act like chrome.

            var bomReader = new BOMReader(part1stream);

            if (bomReader.IsBOM)
            {
                // if there is a BOM encoding, and there's either no active encoding specified already, or it's utf-8/utf-16
                // then use it.

                var bomEncoding = bomReader.Encoding;

                if (ActiveEncoding == null ||
                    (bomEncoding != null &&
                     (bomEncoding.WebName == "utf-8" || bomEncoding.WebName == "utf-16")))
                {
                    ActiveEncoding = bomEncoding;
                }

                // either way strip the BOM.

                stream = new CombinedStream(bomReader.StreamWithoutBOM, inputStream);
            }
            else
            {
                // no BOM, just reset the input stream

                part1stream.Position = 0;
                stream = new CombinedStream(part1stream, inputStream);
            }

            // BOM detection is disabled on the reader (third argument) because it was handled above.

            ActiveStreamReader = new StreamReader(stream, ActiveEncoding ?? Encoding.UTF8, false);

            if (HtmlParsingMode == HtmlParsingMode.Auto ||
                ((HtmlParsingMode == HtmlParsingMode.Fragment) &&
                 String.IsNullOrEmpty(FragmentContext)))
            {
                string ctx;
                ActiveStreamReader = GetContextFromStream(ActiveStreamReader, out ctx);

                if (HtmlParsingMode == HtmlParsingMode.Auto)
                {
                    switch (ctx)
                    {
                    case "document":
                        HtmlParsingMode = HtmlParsingMode.Document;
                        ctx             = "";
                        break;

                    case "html":
                        HtmlParsingMode = HtmlParsingMode.Content;
                        break;

                    default:
                        HtmlParsingMode    = HtmlParsingMode.Fragment;
                        HtmlParsingOptions = HtmlParsingOptions.AllowSelfClosingTags;
                        break;
                    }
                }

                if (HtmlParsingMode == HtmlParsingMode.Fragment)
                {
                    FragmentContext = ctx;
                }
            }

            Reset();

            Tokenize();

            // If the character set was declared within the first block

            if (ReEncode == ReEncodeAction.ReEncode)
            {
                AlreadyReEncoded = true;

                if (ActiveStreamOffset >= preprocessorBlockBytes)
                {
                    // this should never happen, since we test this when accepting an alternate encoding and should
                    // have already decided to change the encoding midstream instead of restart. But as a failsafe
                    // in case there's some part of the parser abort sequence I don't understand, just switch
                    // midstream if we end up here for some reason to keep things going.

                    ActiveStreamReader = new StreamReader(ActiveStream, ActiveEncoding);
                }
                else
                {
                    // rebuild the cached block from only the bytes that were actually read

                    part1stream = new MemoryStream(part1bytes, 0, part1size);

                    // if the 2nd stream has already been closed, then the whole thing is less than the
                    // preprocessor block size; just restart the cached stream..

                    if (inputStream.CanRead)
                    {
                        stream = new CombinedStream(part1stream, inputStream);
                    }
                    else
                    {
                        stream = part1stream;
                    }

                    // assign the re-mapped stream to the source and start again
                    ActiveStreamReader = new StreamReader(stream, ActiveEncoding);
                }

                Reset();
                Tokenize();
            }

            // set this before returning document to the client to improve performance during DOM alteration

            IDomIndexQueue indexQueue = treeBuilder.Document.DocumentIndex as IDomIndexQueue;

            if (indexQueue != null)
            {
                indexQueue.QueueChanges = true;
            }


            return(treeBuilder.Document);
        }