コード例 #1
0
        /// <summary>
        /// Creates a reader over <paramref name="stream"/>. The encoding and confidence the
        /// reader was created with are recorded as the "initial" values before all three
        /// arguments are handed to <c>Init</c>.
        /// </summary>
        /// <param name="stream">The stream to read from.</param>
        /// <param name="encoding">The encoding to start reading with.</param>
        /// <param name="encodingConfidence">The confidence associated with <paramref name="encoding"/>.</param>
        public HtmlTextReader(Stream stream, Encoding encoding, EncodingConfidence encodingConfidence)
        {
            _initialEncodingConfidence = encodingConfidence;
            _initialEncoding = encoding;

            Init(stream, encoding, encodingConfidence);
        }
コード例 #2
0
 /// <summary>
 /// Creates a text source whose content is the given <see cref="StringBuilder"/>.
 /// No underlying stream is involved, so the source is marked as finished right away
 /// and the encoding confidence is set to irrelevant.
 /// </summary>
 /// <param name="source">The builder that is used directly as the content.</param>
 public TextSource(StringBuilder source)
     : this(TextEncoding.Utf8)
 {
     _content = source;
     _confidence = EncodingConfidence.Irrelevant;
     _finished = true;
 }
コード例 #3
0
ファイル: TextSource.cs プロジェクト: tsu1980/AngleSharp
 /// <summary>
 /// Creates a new text source that reads from a stream. The stream is treated as
 /// a data source of unknown encoding, hence the confidence starts as tentative.
 /// </summary>
 /// <param name="baseStream">
 /// The underlying stream as data source.
 /// </param>
 /// <param name="encoding">
 /// The initial encoding, forwarded to the chained constructor (documented as
 /// falling back to UTF-8 when omitted).
 /// </param>
 public TextSource(Stream baseStream, Encoding encoding = null)
     : this(encoding)
 {
     _confidence = EncodingConfidence.Tentative;
     _content = Pool.NewStringBuilder();
     _baseStream = baseStream;
 }
コード例 #4
0
 /// <summary>
 /// Creates a new text source from a string; no underlying stream is used.
 /// Every CR LF pair in the input is replaced by a single LF before the text
 /// is appended to the content.
 /// </summary>
 /// <param name="source">The data source.</param>
 public TextSource(String source)
     : this(null, Encoding.UTF8)
 {
     var normalized = source.Replace("\r\n", "\n");

     _content.Append(normalized);
     _confidence = EncodingConfidence.Irrelevant;
     _finished = true;
 }
コード例 #5
0
 /// <summary>
 /// Creates a new text source from a string; no underlying stream is used.
 /// The text is appended to the content unchanged and the source is marked
 /// as finished.
 /// </summary>
 /// <param name="source">The data source.</param>
 public TextSource(String source)
     : this(null, TextEncoding.Utf8)
 {
     _content.Append(source);
     _confidence = EncodingConfidence.Irrelevant;
     _finished = true;
 }
コード例 #6
0
ファイル: TextSource.cs プロジェクト: tsu1980/AngleSharp
 /// <summary>
 /// Creates a new text source from a string; no underlying stream is used.
 /// Every CR LF pair in the input is replaced by a single LF before the text
 /// is appended to the content.
 /// </summary>
 /// <param name="source">The data source.</param>
 public TextSource(String source)
     : this(null, TextEncoding.Utf8)
 {
     var normalized = source.Replace("\r\n", "\n");

     _content.Append(normalized);
     _confidence = EncodingConfidence.Irrelevant;
     _finished = true;
 }
コード例 #7
0
 /// <summary>
 /// Creates a new text source that reads from a stream. The stream is treated as
 /// a data source of unknown encoding, hence the confidence starts as tentative.
 /// </summary>
 /// <param name="baseStream">
 /// The underlying stream as data source.
 /// </param>
 /// <param name="encoding">
 /// The initial encoding, forwarded to the chained constructor (documented as
 /// falling back to UTF-8 when omitted).
 /// </param>
 public TextSource(Stream baseStream, Encoding encoding = null)
     : this(encoding)
 {
     _confidence = EncodingConfidence.Tentative;
     _content = StringBuilderPool.Obtain();
     _baseStream = baseStream;
 }
コード例 #8
0
        /// <summary>
        /// Fetches <paramref name="url"/> as an <see cref="HtmlTextReader"/>, parses it into an
        /// <see cref="XmlDocument"/>, logs the encoding and confidence observed before and after
        /// parsing, and asserts that the document title equals <paramref name="expectedTitle"/>.
        /// </summary>
        /// <param name="url">The page to download; also used as the parser's base URL.</param>
        /// <param name="expectedTitle">The title text the parsed page must have.</param>
        /// <param name="options">Client options; <c>HtmlClient.Options</c> is used when null.</param>
        public async Task TestGetTextReaderAsync_ForEncoding(string url, string expectedTitle, ClientOptions options = null)
        {
            ClientOptions optionsToUse = options ?? HtmlClient.Options;
            var doc1 = new XmlDocument();

            System.Text.Encoding initialEncoding = null;
            System.Text.Encoding finalEncoding = null;
            EncodingConfidence initialConfidence = EncodingConfidence.Tentative;
            EncodingConfidence finalConfidence = EncodingConfidence.Tentative;

            // Get the Html asynchronously and parse it into an Xml document
            using (HtmlTextReader textReader = await HtmlClient.GetHtmlTextReaderAsync(url, optionsToUse)) {
                // Snapshot of the reader's state before the parser has consumed anything
                initialEncoding = textReader.CurrentEncoding;
                initialConfidence = textReader.CurrentEncodingConfidence;

                var parserOptions = new ParserOptions { BaseUrl = url };
                HtmlParser.DefaultParser.Parse(doc1, textReader, parserOptions);

                // Snapshot after parsing
                finalEncoding = textReader.CurrentEncoding;
                finalConfidence = textReader.CurrentEncodingConfidence;
            }

            var titleNode = doc1.SelectSingleNode("//title/text()");
            string title1 = titleNode.InnerText;

            string report = "Crawled: " + url
                + ", title: " + title1
                + ", default: " + optionsToUse.DefaultEncoding.WebName
                + " (detect=" + optionsToUse.DetectEncoding
                + "), inital: " + initialEncoding.WebName
                + " (" + initialConfidence
                + "), final: " + finalEncoding.WebName
                + " (" + finalConfidence + ")";
            Console.WriteLine(report);

            // The title only matches when the page was decoded with the right encoding
            Assert.AreEqual(expectedTitle, title1);
        }
コード例 #9
0
 /// <summary>
 /// Creates a new text source from a string; no underlying stream is used.
 /// The text is placed into a builder obtained from the pool and the source
 /// is marked as finished.
 /// </summary>
 /// <param name="source">The data source.</param>
 public TextSource(String source)
     : this(TextEncoding.Utf8)
 {
     _confidence = EncodingConfidence.Irrelevant;
     _content = Pool.NewStringBuilder().Append(source);
     _finished = true;
 }
コード例 #10
0
 /// <summary>
 /// Sets up the reader state for reading from <paramref name="stream"/>: records the
 /// confidence, wraps the stream in a <see cref="StreamReader"/>, resets the token
 /// buffer and parse state, and reads the first character into the peek slot.
 /// </summary>
 /// <param name="stream">The stream to read; kept as an HtmlStream when it is one.</param>
 /// <param name="encoding">The encoding handed to the <see cref="StreamReader"/>.</param>
 /// <param name="encodingConfidence">The confidence associated with <paramref name="encoding"/>.</param>
 private void Init(Stream stream, Encoding encoding, EncodingConfidence encodingConfidence)
 {
     // The StreamReader is only asked to look for a byte order mark while the encoding is tentative.
     bool detectEncodingFromBom = encodingConfidence == EncodingConfidence.Tentative;

     _currentEncodingConfidence = encodingConfidence;
     _htmlStream = stream as HtmlStream;
     _reader = new StreamReader(stream, encoding, detectEncodingFromBom);
     _parseState = ParseState.Text;
     _currTok = new StringBuilder();
     _peekChar = _reader.Read();
 }
コード例 #11
0
 /// <summary>
 /// Sets up the reader state for reading from an existing <see cref="TextReader"/>:
 /// records the initial encoding and confidence, resets the token buffer and parse
 /// state, and reads the first character into the peek slot.
 /// </summary>
 /// <param name="reader">
 /// The reader to consume. When it is a <see cref="StreamReader"/> its current encoding
 /// is recorded as the initial encoding; otherwise the initial encoding is null.
 /// A <see cref="StringReader"/> yields an irrelevant confidence, anything else a
 /// tentative one.
 /// </param>
 private void Init(TextReader reader)
 {
     // Inspect the incoming argument, not the _reader field: the field is only
     // assigned further down, so reading it here would see a stale or null reader.
     var streamReader = reader as StreamReader;

     _initialEncoding           = streamReader != null ? streamReader.CurrentEncoding : null;
     _currentEncodingConfidence = reader is StringReader ? EncodingConfidence.Irrelevant : EncodingConfidence.Tentative;
     _initialEncodingConfidence = _currentEncodingConfidence;
     _htmlStream = null;
     _reader     = reader;
     _currTok    = new StringBuilder();
     _parseState = ParseState.Text;
     _peekChar   = _reader.Read();
 }
コード例 #12
0
        /// <summary>
        /// Reads the first chunk of the base stream and checks whether it starts with a
        /// known byte order mark (UTF-8, UTF-32 LE/BE, UTF-16 BE/LE or GB18030). When one
        /// is found the encoding is switched accordingly, the mark is removed from the
        /// buffer, a matching decoder is created and the confidence becomes certain.
        /// The remaining bytes are then appended to the content.
        /// </summary>
        /// <param name="cancellationToken">Token used to cancel the pending stream read.</param>
        /// <returns>A task that completes once the first chunk has been processed.</returns>
        private async Task DetectByteOrderMarkAsync(CancellationToken cancellationToken)
        {
            // Forward the token so that a cancellation request actually aborts the read.
            var count = await _baseStream.ReadAsync(_buffer, 0, BufferSize, cancellationToken).ConfigureAwait(false);

            var offset = 0;

            if ((count > 2) && (_buffer[0] == 0xef) && (_buffer[1] == 0xbb) && (_buffer[2] == 0xbf))
            {
                _encoding = TextEncoding.Utf8;
                offset    = 3;
            }
            // UTF-32 LE must be tested before UTF-16 LE: both marks start with FF FE.
            else if ((count > 3) && (_buffer[0] == 0xff) && (_buffer[1] == 0xfe) && (_buffer[2] == 0x0) &&
                     (_buffer[3] == 0x0))
            {
                _encoding = TextEncoding.Utf32Le;
                offset    = 4;
            }
            else if ((count > 3) && (_buffer[0] == 0x0) && (_buffer[1] == 0x0) && (_buffer[2] == 0xfe) &&
                     (_buffer[3] == 0xff))
            {
                _encoding = TextEncoding.Utf32Be;
                offset    = 4;
            }
            else if ((count > 1) && (_buffer[0] == 0xfe) && (_buffer[1] == 0xff))
            {
                _encoding = TextEncoding.Utf16Be;
                offset    = 2;
            }
            else if ((count > 1) && (_buffer[0] == 0xff) && (_buffer[1] == 0xfe))
            {
                _encoding = TextEncoding.Utf16Le;
                offset    = 2;
            }
            else if ((count > 3) && (_buffer[0] == 0x84) && (_buffer[1] == 0x31) && (_buffer[2] == 0x95) &&
                     (_buffer[3] == 0x33))
            {
                _encoding = TextEncoding.Gb18030;
                offset    = 4;
            }

            if (offset > 0)
            {
                // Drop the mark by shifting the payload to the front of the buffer.
                count -= offset;
                Array.Copy(_buffer, offset, _buffer, 0, count);
                _decoder    = _encoding.GetDecoder();
                _confidence = EncodingConfidence.Certain;
            }

            AppendContentFromBuffer(count);
        }
コード例 #13
0
        /// <summary>
        /// Reads the first chunk of the base stream and checks whether it starts with a
        /// known byte order mark (UTF-8, UTF-32 LE/BE, UTF-16 BE/LE or GB18030). When one
        /// is found the encoding is switched accordingly, the mark is removed from the
        /// buffer, a matching decoder is created and the confidence becomes certain.
        /// The remaining bytes are then appended to the content.
        /// </summary>
        void DetectByteOrderMark()
        {
            var read = _baseStream.Read(_buffer, 0, _bufferSize);
            var skip = 0;

            // True when at least signature.Length bytes were read and they equal the signature.
            Func<Byte[], Boolean> startsWith = signature =>
            {
                if (read < signature.Length)
                {
                    return false;
                }

                for (var i = 0; i < signature.Length; i++)
                {
                    if (_buffer[i] != signature[i])
                    {
                        return false;
                    }
                }

                return true;
            };

            // The order matters: the UTF-32 LE mark begins with the UTF-16 LE mark.
            if (startsWith(new Byte[] { 0xef, 0xbb, 0xbf }))
            {
                _encoding = TextEncoding.Utf8;
                skip = 3;
            }
            else if (startsWith(new Byte[] { 0xff, 0xfe, 0x0, 0x0 }))
            {
                _encoding = TextEncoding.Utf32Le;
                skip = 4;
            }
            else if (startsWith(new Byte[] { 0x0, 0x0, 0xfe, 0xff }))
            {
                _encoding = TextEncoding.Utf32Be;
                skip = 4;
            }
            else if (startsWith(new Byte[] { 0xfe, 0xff }))
            {
                _encoding = TextEncoding.Utf16Be;
                skip = 2;
            }
            else if (startsWith(new Byte[] { 0xff, 0xfe }))
            {
                _encoding = TextEncoding.Utf16Le;
                skip = 2;
            }
            else if (startsWith(new Byte[] { 0x84, 0x31, 0x95, 0x33 }))
            {
                _encoding = TextEncoding.Gb18030;
                skip = 4;
            }

            if (skip > 0)
            {
                // Drop the mark by shifting the payload to the front of the buffer.
                read -= skip;
                Array.Copy(_buffer, skip, _buffer, 0, read);
                _decoder = _encoding.GetDecoder();
                _confidence = EncodingConfidence.Certain;
            }

            AppendContentFromBuffer(read);
        }
コード例 #14
0
        /// <summary>
        /// Creates a new text source that reads from a stream. The stream is treated as
        /// a data source of unknown encoding, hence the confidence starts as tentative.
        /// </summary>
        /// <remarks>
        /// For seekable streams the read buffer is sized to half of the stream length.
        /// When that value would be unusable (less than one byte, or too large for an
        /// <see cref="Int32"/>), or the stream is not seekable, the default buffer size
        /// is used instead.
        /// </remarks>
        /// <param name="baseStream">
        /// The underlying stream as data source.
        /// </param>
        /// <param name="encoding">
        /// The initial encoding, forwarded to the chained constructor (documented as
        /// falling back to UTF-8 when omitted).
        /// </param>
        public TextSource(Stream baseStream, Encoding encoding = null)
            : this(encoding)
        {
            var bufferSize = DefaultBufferSize;

            if (baseStream.CanSeek)
            {
                var half = baseStream.Length / 2;

                // A zero-length buffer would make every read return 0 bytes, and the
                // unchecked cast of an oversized value would wrap around. The upper
                // bound is exclusive because _chars needs bufferSize + 1 elements.
                if (half >= 1 && half < Int32.MaxValue)
                {
                    bufferSize = (int)half;
                }
            }

            _bufferSize = bufferSize;
            _buffer     = new Byte[_bufferSize];
            _chars      = new Char[_bufferSize + 1];
            _raw        = new MemoryStream();
            _baseStream = baseStream;
            _content    = Pool.NewStringBuilder();
            _confidence = EncodingConfidence.Tentative;
        }
コード例 #15
0
ファイル: HtmlClient.cs プロジェクト: jrsell/XHtmlKit
        /// <summary>
        /// Opens the body of <paramref name="content"/> as an <see cref="HtmlTextReader"/>.
        /// </summary>
        /// <remarks>
        /// When <paramref name="detectEncoding"/> is true, the charset of the Content-Type
        /// header is looked up; if it resolves to an encoding, that encoding is used with
        /// certain confidence, otherwise <paramref name="defaultEncoding"/> is used with
        /// tentative confidence. When <paramref name="detectEncoding"/> is false,
        /// <paramref name="defaultEncoding"/> is used with certain confidence. A tentative
        /// stream is wrapped in an <c>HtmlStream</c> before it is handed to the reader.
        /// </remarks>
        /// <param name="content">The HTTP content whose body is read.</param>
        /// <param name="defaultEncoding">The encoding used when none is taken from the headers.</param>
        /// <param name="detectEncoding">Whether the response headers are consulted for a charset.</param>
        /// <returns>A reader over the content stream with the encoding determined so far.</returns>
        public static async Task <HtmlTextReader> GetHtmlTextReaderAsync(this HttpContent content, Encoding defaultEncoding, bool detectEncoding)
        {
            // Try to get the stream's encoding from the response headers, or fall back on the default.
            // If the headers supply nothing the confidence stays tentative, so that later stages
            // (byte order mark, <meta> tags in the html itself) may still change the encoding.
            Encoding encoding = defaultEncoding;

            // Try to detect the encoding from Http headers
            bool gotEncodingFromHttpHeaders = false;

            if (detectEncoding)
            {
                var    contentHeaders = content.Headers;
                string charset        = (contentHeaders.ContentType != null) ? contentHeaders.ContentType.CharSet : null;
                Encoding headerEncoding = EncodingUtils.GetEncoding(charset);
                gotEncodingFromHttpHeaders = headerEncoding != null;
                encoding = headerEncoding ?? defaultEncoding;
                System.Diagnostics.Debug.WriteLine("Detected encoding: charset: " + charset + ", got encoding from headers: " + gotEncodingFromHttpHeaders);
            }

            // Out of band encoding can be either passed in by clients, or found in the http headers...
            bool gotEncodingFromOutOfBandSource   = !detectEncoding || gotEncodingFromHttpHeaders;
            EncodingConfidence encodingConfidence = gotEncodingFromOutOfBandSource ? EncodingConfidence.Certain : EncodingConfidence.Tentative;

            // Get the stream from the network
            Stream networkStream = await content.ReadAsStreamAsync().ConfigureAwait(false);

            // If we are still tentative about the encoding, pop the stream into a wrapper that lets us re-wind.
            Stream baseStream = (encodingConfidence == EncodingConfidence.Tentative) ? new HtmlStream(networkStream) : networkStream;

            // Return a HtmlTextReader with the encoding as detected so far...
            HtmlTextReader htmlReader = new HtmlTextReader(baseStream, encoding, encodingConfidence);

            return(htmlReader);
        }
コード例 #16
0
ファイル: TextSource.cs プロジェクト: tsu1980/AngleSharp
        /// <summary>
        /// Reads the first chunk of the base stream and checks whether it starts with a
        /// known byte order mark (UTF-8, UTF-32 LE/BE, UTF-16 BE/LE or GB18030). When one
        /// is found the encoding is switched accordingly, the mark is removed from the
        /// buffer, a matching decoder is created and the confidence becomes certain.
        /// The remaining bytes are then appended to the content.
        /// </summary>
        /// <param name="cancellationToken">Token used to cancel the pending stream read.</param>
        /// <returns>A task that completes once the first chunk has been processed.</returns>
        async Task DetectByteOrderMarkAsync(CancellationToken cancellationToken)
        {
            // Forward the token so that a cancellation request actually aborts the read.
            var count = await _baseStream.ReadAsync(_buffer, 0, BufferSize, cancellationToken).ConfigureAwait(false);
            var offset = 0;

            if (count > 2 && _buffer[0] == 0xef && _buffer[1] == 0xbb && _buffer[2] == 0xbf)
            {
                _encoding = TextEncoding.Utf8;
                offset = 3;
            }
            // UTF-32 LE must be tested before UTF-16 LE: both marks start with FF FE.
            else if (count > 3 && _buffer[0] == 0xff && _buffer[1] == 0xfe && _buffer[2] == 0x0 && _buffer[3] == 0x0)
            {
                _encoding = TextEncoding.Utf32Le;
                offset = 4;
            }
            else if (count > 3 && _buffer[0] == 0x0 && _buffer[1] == 0x0 && _buffer[2] == 0xfe && _buffer[3] == 0xff)
            {
                _encoding = TextEncoding.Utf32Be;
                offset = 4;
            }
            else if (count > 1 && _buffer[0] == 0xfe && _buffer[1] == 0xff)
            {
                _encoding = TextEncoding.Utf16Be;
                offset = 2;
            }
            else if (count > 1 && _buffer[0] == 0xff && _buffer[1] == 0xfe)
            {
                _encoding = TextEncoding.Utf16Le;
                offset = 2;
            }
            else if (count > 3 && _buffer[0] == 0x84 && _buffer[1] == 0x31 && _buffer[2] == 0x95 && _buffer[3] == 0x33)
            {
                _encoding = TextEncoding.Gb18030;
                offset = 4;
            }

            if (offset > 0)
            {
                // Drop the mark by shifting the payload to the front of the buffer.
                count -= offset;
                Array.Copy(_buffer, offset, _buffer, 0, count);
                _decoder = _encoding.GetDecoder();
                _confidence = EncodingConfidence.Certain;
            }

            AppendContentFromBuffer(count);
        }
コード例 #17
0
ファイル: InputStream.cs プロジェクト: bakera/Test
 // Encoding
 // Sets Encoding and EncodingConfidence, then creates the text reader over the stream.
 // Throws InvalidOperationException when a text reader has already been created,
 // because the encoding cannot be changed once reading has been set up.
 public void SetEncoding(Encoding enc, EncodingConfidence conf)
 {
     if(myTextReader != null){
         // TODO: left open by the original author.
         // Message reads: "The encoding cannot be changed afterwards. Please re-initialize the InputStream."
         throw new InvalidOperationException("Encodingをあとから変更することはできません。InputStreamを初期化してください。");
     }
     this.Encoding = enc;
     this.EncodingConfidence = conf;
     myTextReader = new StreamReader(myStream, this.Encoding);
 }