示例#1
0
        /// <summary>
        /// Given a Stream, create a new IDomDocument from the input.
        /// </summary>
        ///
        /// <exception cref="InvalidDataException">
        /// Thrown when an invalid data error condition occurs.
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// Thrown when the requested operation is invalid.
        /// </exception>
        ///
        /// <param name="inputStream">
        /// The HTML input.
        /// </param>
        /// <param name="encoding">
        /// The encoding.
        /// </param>
        ///
        /// <returns>
        /// A populated IDomDocument.
        /// </returns>

        public IDomDocument Parse(Stream inputStream, Encoding encoding)
        {
            ActiveStream = inputStream;
            ActiveEncoding = encoding;

            // Pre-fetch the first block of the input into memory so that parsing can be
            // restarted from the beginning (e.g. after a charset declaration is found)
            // without needing a seekable input stream.
            //
            // Stream.Read may return fewer bytes than requested even when more data is
            // still to come, so keep reading until the block is full or the stream ends.
            // Otherwise the cached block could be shorter than the restart logic assumes.

            byte[] part1bytes = new byte[preprocessorBlockBytes];
            int part1size = 0;
            while (part1size < preprocessorBlockBytes)
            {
                int bytesRead = inputStream.Read(part1bytes, part1size, preprocessorBlockBytes - part1size);
                if (bytesRead <= 0)
                {
                    break;
                }
                part1size += bytesRead;
            }

            // Only expose the bytes that were actually read, not the whole buffer.
            MemoryStream part1stream = new MemoryStream(part1bytes, 0, part1size);

            if (part1stream.Length == 0)
            {
                // Nothing to parse: return an empty fragment.
                return new DomFragment();
            }

            // create a combined stream from the pre-fetched part, and the remainder (whose position
            // will be wherever it was left after reading the part 1 block).

            Stream stream;

            // The official order of precedence for character set processing is as follows:
            //
            // HTTP Content-Type header
            // byte-order mark (BOM)
            // XML declaration
            // meta element
            // link charset attribute
            //
            // http://www.w3.org/International/questions/qa-html-encoding-declarations#precedence
            //
            // Chrome does this:
            //
            // A UTF-16 or UTF-8 BOM overrides the HTTP declaration for Internet Explorer, Safari and Chrome browsers.
            //
            // We act like chrome.

            var bomReader = new BOMReader(part1stream);

            if (bomReader.IsBOM)
            {
                // if there is a BOM encoding, and there's either no active encoding specified already, or it's utf-8/utf-16
                // then use it.

                var bomEncoding = bomReader.Encoding;

                if (ActiveEncoding == null ||
                    (bomEncoding != null &&
                        (bomEncoding.WebName == "utf-8" || bomEncoding.WebName == "utf-16")
                    )
                )
                {
                    ActiveEncoding = bomEncoding;
                }

                // either way strip the BOM.

                stream = new CombinedStream(bomReader.StreamWithoutBOM, inputStream);
            }
            else
            {
                // no BOM, just reset the input stream

                part1stream.Position = 0;
                stream = new CombinedStream(part1stream, inputStream);
            }

            // BOM detection is disabled here because the BOM (if any) was handled above.
            ActiveStreamReader = new StreamReader(stream, ActiveEncoding ?? Encoding.UTF8, false);

            // When the parsing mode is Auto, or Fragment without an explicit context, inspect the
            // start of the input to decide how it should be parsed.

            if (HtmlParsingMode == HtmlParsingMode.Auto ||
                ((HtmlParsingMode == HtmlParsingMode.Fragment)
                    && String.IsNullOrEmpty(FragmentContext)))
            {

                string ctx;
                ActiveStreamReader = GetContextFromStream(ActiveStreamReader, out ctx);

                if (HtmlParsingMode == HtmlParsingMode.Auto)
                {
                    switch (ctx)
                    {
                        case "document":
                            HtmlParsingMode = HtmlParsingMode.Document;
                            ctx = "";
                            break;
                        case "html":
                            HtmlParsingMode = HtmlParsingMode.Content;
                            break;
                        default:
                            HtmlParsingMode = HtmlParsingMode.Fragment;
                            HtmlParsingOptions = HtmlParsingOptions.AllowSelfClosingTags;
                            break;
                    }
                }

                if (HtmlParsingMode == HtmlParsingMode.Fragment)
                {
                    FragmentContext = ctx;
                }
            }

            Reset();

            Tokenize();

            // If the character set was declared within the first block

            if (ReEncode == ReEncodeAction.ReEncode)
            {

                AlreadyReEncoded = true;

                if (ActiveStreamOffset >= preprocessorBlockBytes)
                {
                    // this should never happen, since we test this when accepting an alternate encoding and should
                    // have already decided to change the encoding midstream instead of restart. But as a failsafe
                    // in case there's some part of the parser abort sequence I don't understand, just switch
                    // midstream if we end up here for some reason to keep things going.

                    ActiveStreamReader = new StreamReader(ActiveStream, ActiveEncoding);
                }
                else
                {
                    // Rewind to the cached first block. Limit the stream to the bytes actually read;
                    // wrapping the whole buffer would append trailing zero bytes to short inputs.

                    part1stream = new MemoryStream(part1bytes, 0, part1size);

                    // if the 2nd stream has already been closed, then the whole thing is less than the
                    // preprocessor block size; just restart the cached stream..

                    if (inputStream.CanRead)
                    {
                        stream = new CombinedStream(part1stream, inputStream);
                    }
                    else
                    {
                        stream = part1stream;
                    }

                    // assign the re-mapped stream to the source and start again
                    ActiveStreamReader = new StreamReader(stream, ActiveEncoding);
                }

                Reset();
                Tokenize();

            }

            // set this before returning document to the client to improve performance during DOM alteration

            IDomIndexQueue indexQueue = treeBuilder.Document.DocumentIndex as IDomIndexQueue;
            if (indexQueue != null)
            {
                indexQueue.QueueChanges = true;
            }


            return treeBuilder.Document;
        }
示例#2
0
        /// <summary>
        /// Given a Stream, create a new IDomDocument from the input.
        /// </summary>
        ///
        /// <exception cref="InvalidDataException">
        /// Thrown when an invalid data error condition occurs.
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// Thrown when the requested operation is invalid.
        /// </exception>
        ///
        /// <param name="html">
        /// The HTML input.
        /// </param>
        /// <param name="encoding">
        /// The encoding.
        /// </param>
        ///
        /// <returns>
        /// A populated IDomDocument.
        /// </returns>

        public IDomDocument Parse(Stream html, Encoding encoding)
        {
            // Pre-fetch the first block of the input into memory so that parsing can be
            // restarted from the beginning if a character set declaration is encountered,
            // without having to re-read the entire stream.
            //
            // Stream.Read may return fewer bytes than requested even when more data is
            // still to come, so keep reading until the block is full or the stream ends.

            byte[] part1bytes = new byte[preprocessorBlockSize];
            int part1size = 0;
            while (part1size < preprocessorBlockSize)
            {
                int bytesRead = html.Read(part1bytes, part1size, preprocessorBlockSize - part1size);
                if (bytesRead <= 0)
                {
                    break;
                }
                part1size += bytesRead;
            }

            // Only expose the bytes that were actually read. Wrapping the whole buffer would make
            // the empty-input check below unreachable and pad short inputs with zero bytes.
            MemoryStream part1stream = new MemoryStream(part1bytes, 0, part1size);

            if (part1stream.Length == 0)
            {
                // Nothing to parse: return an empty fragment.
                return new DomFragment();
            }

            // create a combined stream from the pre-fetched part, and the remainder (whose position
            // will be wherever it was left after reading the part 1 block).

            Stream stream = new CombinedStream(part1stream, html);

            // With no explicit encoding, let the reader detect it from a byte order mark.
            TextReader source;
            if (encoding == null)
            {
                source = new StreamReader(stream, true);
            }
            else
            {
                source = new StreamReader(stream, encoding);
            }

            // NOTE(review): StreamReader.CurrentEncoding can change after the first read when
            // byte-order-mark detection is enabled; confirm that sampling it before any read is intended.
            charSetEncoding = ((StreamReader)source).CurrentEncoding;
            var originalCharSetEncoding = charSetEncoding;

            // When the parsing mode is Auto, or Fragment without an explicit context, inspect the
            // start of the input to decide how it should be parsed.

            if (HtmlParsingMode == HtmlParsingMode.Auto ||
                ((HtmlParsingMode == HtmlParsingMode.Fragment)
                    && String.IsNullOrEmpty(FragmentContext)))
            {

                string ctx;
                source = GetContextFromStream(source, out ctx);

                if (HtmlParsingMode == HtmlParsingMode.Auto)
                {
                    switch (ctx)
                    {
                        case "document":
                            HtmlParsingMode = HtmlParsingMode.Document;
                            ctx = "";
                            break;
                        case "html":
                            HtmlParsingMode = HtmlParsingMode.Content;
                            break;
                        default:
                            HtmlParsingMode = HtmlParsingMode.Fragment;
                            HtmlParsingOptions = HtmlParsingOptions.AllowSelfClosingTags;
                            break;
                    }
                }

                if (HtmlParsingMode == HtmlParsingMode.Fragment)
                {
                    FragmentContext = ctx;
                }
            }

            Reset();
            Tokenize(source);

            if (reEncode)
            {
                // when this happens, the 2nd stream should still be at position zero (it should not have
                // advanced beyond the 1k mark)
                // since the charset encoding must occur within the first 1k.
                //
                // Stream.Position is only supported on seekable streams, so the sanity check is
                // skipped for non-seekable input rather than failing with NotSupportedException.

                if (part1size == preprocessorBlockSize
                    && html.CanRead
                    && html.CanSeek
                    && html.Position > preprocessorBlockSize)
                {
                    throw new InvalidDataException(
                        String.Format("The document contained a meta http-equiv Content-Type header after the first {0} bytes. It cannot be parsed.",preprocessorBlockSize)
                        );
                }

                // Rewind to the cached first block, limited to the bytes actually read.
                part1stream = new MemoryStream(part1bytes, 0, part1size);

                // if the 2nd stream has already been closed, then the whole thing is less than the block size.

                if (html.CanRead)
                {
                    stream = new CombinedStream(part1stream, html);
                }
                else
                {
                    stream = part1stream;
                }


                // re-encode the entire stream: decode with the original encoding, then write the
                // text back out using the newly selected encoding.

                TextReader tempReader = new StreamReader(stream, originalCharSetEncoding);

                MemoryStream encoded = new MemoryStream();
                var writer = new StreamWriter(encoded, charSetEncoding);
                writer.Write(tempReader.ReadToEnd());
                writer.Flush();

                encoded.Position = 0;

                // assign the re-mapped stream to the source and start again
                source = new StreamReader(encoded, charSetEncoding);

                Reset();
                Tokenize(source);

            }

            // If the flag is still set here, the encoding changed again during the second pass
            // (or was never cleared); a further restart is not attempted.
            if (reEncode)
            {
                throw new InvalidOperationException("The character set encoding changed twice, something seems to be wrong.");
            }


            return treeBuilder.Document;
        }
示例#3
0
        /// <summary>
        /// Given a Stream, create a new IDomDocument from the input.
        /// </summary>
        ///
        /// <exception cref="InvalidDataException">
        /// Thrown when an invalid data error condition occurs.
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// Thrown when the requested operation is invalid.
        /// </exception>
        ///
        /// <param name="inputStream">
        /// The HTML input.
        /// </param>
        /// <param name="encoding">
        /// The encoding.
        /// </param>
        ///
        /// <returns>
        /// A populated IDomDocument.
        /// </returns>

        public IDomDocument Parse(Stream inputStream, Encoding encoding)
        {
            ActiveStream = inputStream;
            ActiveEncoding = encoding;

            // Pre-fetch the first block of the input into memory so that parsing can be
            // restarted from the beginning if a character set declaration is encountered,
            // without having to re-read the entire stream.
            //
            // Stream.Read may return fewer bytes than requested even when more data is
            // still to come, so keep reading until the block is full or the stream ends.

            byte[] part1bytes = new byte[preprocessorBlockSize];
            int part1size = 0;
            while (part1size < preprocessorBlockSize)
            {
                int bytesRead = inputStream.Read(part1bytes, part1size, preprocessorBlockSize - part1size);
                if (bytesRead <= 0)
                {
                    break;
                }
                part1size += bytesRead;
            }

            // Only expose the bytes that were actually read. Wrapping the whole buffer would make
            // the empty-input check below unreachable and pad short inputs with zero bytes.
            MemoryStream part1stream = new MemoryStream(part1bytes, 0, part1size);

            if (part1stream.Length == 0)
            {
                // Nothing to parse: return an empty fragment.
                return new DomFragment();
            }

            // create a combined stream from the pre-fetched part, and the remainder (whose position
            // will be wherever it was left after reading the part 1 block).

            Stream stream = new CombinedStream(part1stream, inputStream);

            // With no explicit encoding, let the reader detect it from a byte order mark;
            // with an explicit encoding, byte-order-mark detection is turned off.
            if (ActiveEncoding == null)
            {
                ActiveStreamReader = new StreamReader(stream, true);
            }
            else
            {
                ActiveStreamReader = new StreamReader(stream, encoding, false);
            }

            // NOTE(review): StreamReader.CurrentEncoding can change after the first read when
            // byte-order-mark detection is enabled; confirm that sampling it before any read is intended.
            ActiveEncoding = ((StreamReader)ActiveStreamReader).CurrentEncoding;

            // When the parsing mode is Auto, or Fragment without an explicit context, inspect the
            // start of the input to decide how it should be parsed.

            if (HtmlParsingMode == HtmlParsingMode.Auto ||
                ((HtmlParsingMode == HtmlParsingMode.Fragment)
                    && String.IsNullOrEmpty(FragmentContext)))
            {

                string ctx;
                ActiveStreamReader = GetContextFromStream(ActiveStreamReader, out ctx);

                if (HtmlParsingMode == HtmlParsingMode.Auto)
                {
                    switch (ctx)
                    {
                        case "document":
                            HtmlParsingMode = HtmlParsingMode.Document;
                            ctx = "";
                            break;
                        case "html":
                            HtmlParsingMode = HtmlParsingMode.Content;
                            break;
                        default:
                            HtmlParsingMode = HtmlParsingMode.Fragment;
                            HtmlParsingOptions = HtmlParsingOptions.AllowSelfClosingTags;
                            break;
                    }
                }

                if (HtmlParsingMode == HtmlParsingMode.Fragment)
                {
                    FragmentContext = ctx;
                }
            }

            Reset();

            Tokenize();

            if (ReEncode == ReEncodeAction.ReEncode)
            {

                AlreadyReEncoded = true;

                if (ActiveStreamOffset >= preprocessorBlockSize)
                {
                    // this should never happen, since we test this when accepting an alternate encoding and should
                    // have already decided to change the encoding midstream instead of restart. But as a failsafe
                    // in case there's some part of the parser abort sequence I don't understand, just switch
                    // midstream if we end up here for some reason to keep things going.

                    ActiveStreamReader = new StreamReader(ActiveStream, ActiveEncoding);
                }
                else
                {
                    // Rewind to the cached first block, limited to the bytes actually read.

                    part1stream = new MemoryStream(part1bytes, 0, part1size);

                    // if the 2nd stream has already been closed, then the whole thing is less than the
                    // preprocessor block size; just restart the cached stream..

                    if (inputStream.CanRead)
                    {
                        stream = new CombinedStream(part1stream, inputStream);
                    }
                    else
                    {
                        stream = part1stream;
                    }

                    // assign the re-mapped stream to the source and start again
                    ActiveStreamReader = new StreamReader(stream, ActiveEncoding);
                }

                Reset();
                Tokenize();

            }

            // Any re-encode request still pending after the second pass is treated as an error;
            // a further restart is not attempted.
            if (ReEncode != ReEncodeAction.None)
            {
                throw new InvalidOperationException("The character set encoding changed twice, something seems to be wrong.");
            }


            return treeBuilder.Document;
        }