/// <summary>
/// Creates a reader over <paramref name="stream"/>. The supplied encoding and confidence
/// are remembered as the initial values before the reader state is set up by <c>Init</c>.
/// </summary>
/// <param name="stream">The stream to read from.</param>
/// <param name="encoding">The encoding to start reading with.</param>
/// <param name="encodingConfidence">How reliable <paramref name="encoding"/> is.</param>
public HtmlTextReader(Stream stream, Encoding encoding, EncodingConfidence encodingConfidence)
{
    _initialEncodingConfidence = encodingConfidence;
    _initialEncoding = encoding;

    Init(stream, encoding, encodingConfidence);
}
/// <summary>
/// Creates a new text source backed by the given <see cref="StringBuilder"/>.
/// No underlying stream is used.
/// </summary>
/// <param name="source">
/// The builder holding the content. It is stored as-is, not copied.
/// </param>
public TextSource(StringBuilder source)
    : this(TextEncoding.Utf8)
{
    _content = source;
    _confidence = EncodingConfidence.Irrelevant;

    // There is no stream to read from, so the source is complete from the start.
    _finished = true;
}
/// <summary>
/// Creates a new text source from a stream. The underlying stream is
/// treated as a data source with a tentative encoding.
/// </summary>
/// <param name="baseStream">
/// The underlying stream serving as data source.
/// </param>
/// <param name="encoding">
/// The initial encoding, forwarded to the encoding-only constructor
/// (documented as falling back to UTF-8 when omitted).
/// </param>
public TextSource(Stream baseStream, Encoding encoding = null)
    : this(encoding)
{
    _confidence = EncodingConfidence.Tentative;
    _content = Pool.NewStringBuilder();
    _baseStream = baseStream;
}
/// <summary>
/// Creates a new text source from a string. No underlying stream is used.
/// </summary>
/// <param name="source">
/// The text to expose. Every "\r\n" sequence is replaced by "\n" before
/// the text is stored.
/// </param>
public TextSource(String source)
    : this(null, Encoding.UTF8)
{
    // No stream is attached (null was passed on), so nothing is left to read.
    _finished = true;

    // _content is expected to have been created by the chained constructor.
    var normalized = source.Replace("\r\n", "\n");
    _content.Append(normalized);

    _confidence = EncodingConfidence.Irrelevant;
}
/// <summary>
/// Creates a new text source from a string. No underlying stream is used.
/// </summary>
/// <param name="source">The text to expose; it is appended unchanged.</param>
public TextSource(String source)
    : this(null, TextEncoding.Utf8)
{
    // No stream is attached (null was passed on), so nothing is left to read.
    _finished = true;
    _confidence = EncodingConfidence.Irrelevant;

    // _content is expected to have been created by the chained constructor.
    _content.Append(source);
}
/// <summary>
/// Creates a new text source from a string. No underlying stream is used.
/// </summary>
/// <param name="source">
/// The text to expose. Every "\r\n" sequence is replaced by "\n" before
/// the text is stored.
/// </param>
public TextSource(String source)
    : this(null, TextEncoding.Utf8)
{
    // No stream is attached (null was passed on), so nothing is left to read.
    _finished = true;

    // _content is expected to have been created by the chained constructor.
    var normalized = source.Replace("\r\n", "\n");
    _content.Append(normalized);

    _confidence = EncodingConfidence.Irrelevant;
}
/// <summary>
/// Creates a new text source from a stream. The underlying stream is
/// treated as a data source with a tentative encoding.
/// </summary>
/// <param name="baseStream">
/// The underlying stream serving as data source.
/// </param>
/// <param name="encoding">
/// The initial encoding, forwarded to the encoding-only constructor
/// (documented as falling back to UTF-8 when omitted).
/// </param>
public TextSource(Stream baseStream, Encoding encoding = null)
    : this(encoding)
{
    _confidence = EncodingConfidence.Tentative;
    _content = StringBuilderPool.Obtain();
    _baseStream = baseStream;
}
/// <summary>
/// Fetches <paramref name="url"/> through <c>HtmlClient</c>, parses the response into an
/// <see cref="XmlDocument"/> and asserts that the document title equals
/// <paramref name="expectedTitle"/>. The encoding and confidence reported by the reader
/// before and after parsing are written to the console.
/// </summary>
/// <param name="url">The page to download; also used as the parser's base URL.</param>
/// <param name="expectedTitle">The title text the parsed page must have.</param>
/// <param name="options">
/// Client options to use; <c>HtmlClient.Options</c> is used when this is <c>null</c>.
/// </param>
public async Task TestGetTextReaderAsync_ForEncoding(string url, string expectedTitle, ClientOptions options = null)
{
    ClientOptions effectiveOptions = options ?? HtmlClient.Options;
    XmlDocument document = new XmlDocument();

    System.Text.Encoding encodingBefore = null;
    System.Text.Encoding encodingAfter = null;
    EncodingConfidence confidenceBefore = EncodingConfidence.Tentative;
    EncodingConfidence confidenceAfter = EncodingConfidence.Tentative;

    // Download the HTML asynchronously and parse it into the XML document,
    // sampling the reader's encoding state on both sides of the parse.
    using (HtmlTextReader reader = await HtmlClient.GetHtmlTextReaderAsync(url, effectiveOptions))
    {
        encodingBefore = reader.CurrentEncoding;
        confidenceBefore = reader.CurrentEncodingConfidence;

        HtmlParser.DefaultParser.Parse(document, reader, new ParserOptions { BaseUrl = url });

        encodingAfter = reader.CurrentEncoding;
        confidenceAfter = reader.CurrentEncodingConfidence;
    }

    string actualTitle = document.SelectSingleNode("//title/text()").InnerText;

    string report =
        "Crawled: " + url +
        ", title: " + actualTitle +
        ", default: " + effectiveOptions.DefaultEncoding.WebName +
        " (detect=" + effectiveOptions.DetectEncoding +
        "), inital: " + encodingBefore.WebName +
        " (" + confidenceBefore +
        "), final: " + encodingAfter.WebName +
        " (" + confidenceAfter + ")";
    Console.WriteLine(report);

    // Compare the parsed title with the expected one to check that the encoding was picked up correctly.
    Assert.AreEqual(expectedTitle, actualTitle);
}
/// <summary>
/// Creates a new text source from a string. No underlying stream is used.
/// </summary>
/// <param name="source">
/// The text to expose; it is appended unchanged to a builder taken from the pool.
/// </param>
public TextSource(String source)
    : this(TextEncoding.Utf8)
{
    // Append returns the builder it was called on, so the pooled instance is stored.
    _content = Pool.NewStringBuilder().Append(source);
    _confidence = EncodingConfidence.Irrelevant;

    // There is no stream to read from, so the source is complete from the start.
    _finished = true;
}
/// <summary>
/// Sets up the reader state for a stream: records the confidence, creates the
/// <see cref="StreamReader"/>, resets the token buffer and parse state, and reads
/// the first character into the peek slot.
/// </summary>
/// <param name="stream">The stream to read from.</param>
/// <param name="encoding">The encoding handed to the <see cref="StreamReader"/>.</param>
/// <param name="encodingConfidence">How reliable <paramref name="encoding"/> is.</param>
private void Init(Stream stream, Encoding encoding, EncodingConfidence encodingConfidence)
{
    // The StreamReader only looks for a byte order mark while the encoding is tentative.
    bool detectFromByteOrderMark = encodingConfidence == EncodingConfidence.Tentative;

    // Keep a typed reference when the stream is an HtmlStream; null otherwise.
    _htmlStream = stream as HtmlStream;
    _currentEncodingConfidence = encodingConfidence;

    _reader = new StreamReader(stream, encoding, detectFromByteOrderMark);
    _parseState = ParseState.Text;
    _currTok = new StringBuilder();

    // Prime the one-character look-ahead.
    _peekChar = _reader.Read();
}
/// <summary>
/// Sets up the reader state for an arbitrary <see cref="TextReader"/>: records the
/// initial encoding and confidence, stores the reader, resets the token buffer and
/// parse state, and reads the first character into the peek slot.
/// </summary>
/// <param name="reader">
/// The reader to consume. A <see cref="StreamReader"/> supplies the initial encoding;
/// a <see cref="StringReader"/> makes the encoding confidence irrelevant.
/// </param>
private void Init(TextReader reader)
{
    // Inspect the reader being installed. The _reader field is only assigned further
    // down, so testing it here would report the previous reader (or null) instead.
    StreamReader streamReader = reader as StreamReader;
    _initialEncoding = streamReader != null ? streamReader.CurrentEncoding : null;

    // Text that already exists as a string has no encoding question; anything else is tentative.
    _currentEncodingConfidence = reader is StringReader ? EncodingConfidence.Irrelevant : EncodingConfidence.Tentative;
    _initialEncodingConfidence = _currentEncodingConfidence;

    _htmlStream = null;
    _reader = reader;
    _currTok = new StringBuilder();
    _parseState = ParseState.Text;

    // Prime the one-character look-ahead.
    _peekChar = _reader.Read();
}
/// <summary>
/// Reads the first chunk of the base stream and looks for a byte order mark (BOM).
/// When one is found, the matching encoding is selected, the BOM bytes are removed
/// from the buffer, a decoder for that encoding is created and the confidence becomes
/// <see cref="EncodingConfidence.Certain"/>. The remaining bytes are then handed to
/// <c>AppendContentFromBuffer</c>.
/// </summary>
/// <param name="cancellationToken">Token that cancels the pending stream read.</param>
/// <returns>A task that completes once the first chunk has been processed.</returns>
private async Task DetectByteOrderMarkAsync(CancellationToken cancellationToken)
{
    // Forward the token so that a cancellation request actually interrupts the read.
    var count = await _baseStream.ReadAsync(_buffer, 0, BufferSize, cancellationToken).ConfigureAwait(false);
    var offset = 0;

    if ((count > 2) && (_buffer[0] == 0xef) && (_buffer[1] == 0xbb) && (_buffer[2] == 0xbf))
    {
        _encoding = TextEncoding.Utf8;
        offset = 3;
    }
    else if ((count > 3) && (_buffer[0] == 0xff) && (_buffer[1] == 0xfe) && (_buffer[2] == 0x0) && (_buffer[3] == 0x0))
    {
        // Must be tested before UTF-16 LE, whose two-byte BOM is a prefix of this one.
        _encoding = TextEncoding.Utf32Le;
        offset = 4;
    }
    else if ((count > 3) && (_buffer[0] == 0x0) && (_buffer[1] == 0x0) && (_buffer[2] == 0xfe) && (_buffer[3] == 0xff))
    {
        _encoding = TextEncoding.Utf32Be;
        offset = 4;
    }
    else if ((count > 1) && (_buffer[0] == 0xfe) && (_buffer[1] == 0xff))
    {
        _encoding = TextEncoding.Utf16Be;
        offset = 2;
    }
    else if ((count > 1) && (_buffer[0] == 0xff) && (_buffer[1] == 0xfe))
    {
        _encoding = TextEncoding.Utf16Le;
        offset = 2;
    }
    else if ((count > 3) && (_buffer[0] == 0x84) && (_buffer[1] == 0x31) && (_buffer[2] == 0x95) && (_buffer[3] == 0x33))
    {
        _encoding = TextEncoding.Gb18030;
        offset = 4;
    }

    if (offset > 0)
    {
        // Drop the BOM by shifting the payload to the start of the buffer.
        count -= offset;
        Array.Copy(_buffer, offset, _buffer, 0, count);
        _decoder = _encoding.GetDecoder();
        _confidence = EncodingConfidence.Certain;
    }

    AppendContentFromBuffer(count);
}
/// <summary>
/// Reads the first chunk of the base stream and looks for a byte order mark (BOM).
/// When one is found, the matching encoding is selected, the BOM bytes are removed
/// from the buffer, a decoder for that encoding is created and the confidence becomes
/// <see cref="EncodingConfidence.Certain"/>. The remaining bytes are then handed to
/// <c>AppendContentFromBuffer</c>.
/// </summary>
void DetectByteOrderMark()
{
    var available = _baseStream.Read(_buffer, 0, _bufferSize);
    var bomLength = 0;

    // Every recognised BOM is at least two bytes long and the two-byte prefixes are
    // mutually exclusive, so dispatch on the prefix first and verify the tail after.
    if (available >= 2)
    {
        var lead = _buffer[0];
        var next = _buffer[1];
        var hasFourBytes = available >= 4;

        if (lead == 0xef && next == 0xbb)
        {
            if (available >= 3 && _buffer[2] == 0xbf)
            {
                _encoding = TextEncoding.Utf8;
                bomLength = 3;
            }
        }
        else if (lead == 0xff && next == 0xfe)
        {
            // FF FE 00 00 is UTF-32 LE; a bare FF FE is UTF-16 LE.
            if (hasFourBytes && _buffer[2] == 0x0 && _buffer[3] == 0x0)
            {
                _encoding = TextEncoding.Utf32Le;
                bomLength = 4;
            }
            else
            {
                _encoding = TextEncoding.Utf16Le;
                bomLength = 2;
            }
        }
        else if (lead == 0x0 && next == 0x0)
        {
            if (hasFourBytes && _buffer[2] == 0xfe && _buffer[3] == 0xff)
            {
                _encoding = TextEncoding.Utf32Be;
                bomLength = 4;
            }
        }
        else if (lead == 0xfe && next == 0xff)
        {
            _encoding = TextEncoding.Utf16Be;
            bomLength = 2;
        }
        else if (lead == 0x84 && next == 0x31)
        {
            if (hasFourBytes && _buffer[2] == 0x95 && _buffer[3] == 0x33)
            {
                _encoding = TextEncoding.Gb18030;
                bomLength = 4;
            }
        }
    }

    var remaining = available - bomLength;

    if (bomLength > 0)
    {
        // Drop the BOM by shifting the payload to the start of the buffer.
        Array.Copy(_buffer, bomLength, _buffer, 0, remaining);
        _decoder = _encoding.GetDecoder();
        _confidence = EncodingConfidence.Certain;
    }

    AppendContentFromBuffer(remaining);
}
/// <summary>
/// Creates a new text source from a stream. The underlying stream is
/// treated as a data source with a tentative encoding.
/// </summary>
/// <param name="baseStream">
/// The underlying stream serving as data source. For a seekable stream the read
/// buffer is sized to half the stream length (within safe bounds); otherwise, or
/// when the stream is <c>null</c>, the default buffer size is used.
/// </param>
/// <param name="encoding">
/// The initial encoding, forwarded to the encoding-only constructor
/// (documented as falling back to UTF-8 when omitted).
/// </param>
public TextSource(Stream baseStream, Encoding encoding = null)
    : this(encoding)
{
    // A buffer shorter than four bytes cannot hold the longest byte order marks
    // (UTF-32, GB18030), and a zero-length buffer can never read anything.
    const int smallestUsefulBuffer = 4;

    var bufferSize = DefaultBufferSize;

    if (baseStream != null && baseStream.CanSeek)
    {
        // Work in 64-bit first: casting Length / 2 straight to int wraps for very
        // large streams. The upper bound keeps bufferSize + 1 below representable.
        var half = baseStream.Length / 2;
        var capped = Math.Min(half, Int32.MaxValue - 1);
        bufferSize = (int)Math.Max(capped, smallestUsefulBuffer);
    }

    _bufferSize = bufferSize;
    _buffer = new Byte[_bufferSize];
    _chars = new Char[_bufferSize + 1];
    _raw = new MemoryStream();
    _baseStream = baseStream;
    _content = Pool.NewStringBuilder();
    _confidence = EncodingConfidence.Tentative;
}
/// <summary>
/// Opens the HTTP content as a stream and wraps it in an <see cref="HtmlTextReader"/>.
/// The starting encoding comes from the Content-Type charset when
/// <paramref name="detectEncoding"/> is set and the charset resolves to an encoding;
/// otherwise <paramref name="defaultEncoding"/> is used. If the encoding is still
/// tentative, later stages (byte order mark, <c>&lt;meta&gt;</c> tags) are expected to
/// refine it.
/// </summary>
/// <param name="content">The HTTP content to read.</param>
/// <param name="defaultEncoding">The encoding used when none is taken from the headers.</param>
/// <param name="detectEncoding">
/// <c>true</c> to consult the response headers; <c>false</c> to treat
/// <paramref name="defaultEncoding"/> as certain.
/// </param>
/// <returns>A reader over the content with the encoding determined so far.</returns>
public static async Task<HtmlTextReader> GetHtmlTextReaderAsync(this HttpContent content, Encoding defaultEncoding, bool detectEncoding)
{
    Encoding encoding = defaultEncoding;

    // Try to detect the encoding from the HTTP headers.
    bool gotEncodingFromHttpHeaders = false;
    if (detectEncoding)
    {
        var contentHeaders = content.Headers;
        string charset = (contentHeaders.ContentType != null) ? contentHeaders.ContentType.CharSet : null;

        // A null result is treated as "the headers did not supply a usable encoding".
        Encoding headerEncoding = EncodingUtils.GetEncoding(charset);
        gotEncodingFromHttpHeaders = headerEncoding != null;
        encoding = headerEncoding ?? defaultEncoding;

        System.Diagnostics.Debug.WriteLine("Detected encoding: charset: " + charset + ", got encoding from headers: " + gotEncodingFromHttpHeaders);
    }

    // An out-of-band encoding is either imposed by the caller (detection switched off)
    // or found in the HTTP headers; only then is the encoding considered certain.
    bool gotEncodingFromOutOfBandSource = !detectEncoding || gotEncodingFromHttpHeaders;
    EncodingConfidence encodingConfidence = gotEncodingFromOutOfBandSource ? EncodingConfidence.Certain : EncodingConfidence.Tentative;

    // Get the stream from the network.
    Stream networkStream = await content.ReadAsStreamAsync().ConfigureAwait(false);

    // If we are still tentative about the encoding, put the stream into a wrapper that lets us rewind.
    Stream baseStream = (encodingConfidence == EncodingConfidence.Tentative) ? new HtmlStream(networkStream) : networkStream;

    // Return a reader with the encoding as detected so far.
    return new HtmlTextReader(baseStream, encoding, encodingConfidence);
}
/// <summary>
/// Reads the first chunk of the base stream and looks for a byte order mark (BOM).
/// When one is found, the matching encoding is selected, the BOM bytes are removed
/// from the buffer, a decoder for that encoding is created and the confidence becomes
/// <see cref="EncodingConfidence.Certain"/>. The remaining bytes are then handed to
/// <c>AppendContentFromBuffer</c>.
/// </summary>
/// <param name="cancellationToken">Token that cancels the pending stream read.</param>
/// <returns>A task that completes once the first chunk has been processed.</returns>
async Task DetectByteOrderMarkAsync(CancellationToken cancellationToken)
{
    // Forward the token so that a cancellation request actually interrupts the read.
    var count = await _baseStream.ReadAsync(_buffer, 0, BufferSize, cancellationToken).ConfigureAwait(false);
    var offset = 0;

    if (count > 2 && _buffer[0] == 0xef && _buffer[1] == 0xbb && _buffer[2] == 0xbf)
    {
        _encoding = TextEncoding.Utf8;
        offset = 3;
    }
    else if (count > 3 && _buffer[0] == 0xff && _buffer[1] == 0xfe && _buffer[2] == 0x0 && _buffer[3] == 0x0)
    {
        // Must be tested before UTF-16 LE, whose two-byte BOM is a prefix of this one.
        _encoding = TextEncoding.Utf32Le;
        offset = 4;
    }
    else if (count > 3 && _buffer[0] == 0x0 && _buffer[1] == 0x0 && _buffer[2] == 0xfe && _buffer[3] == 0xff)
    {
        _encoding = TextEncoding.Utf32Be;
        offset = 4;
    }
    else if (count > 1 && _buffer[0] == 0xfe && _buffer[1] == 0xff)
    {
        _encoding = TextEncoding.Utf16Be;
        offset = 2;
    }
    else if (count > 1 && _buffer[0] == 0xff && _buffer[1] == 0xfe)
    {
        _encoding = TextEncoding.Utf16Le;
        offset = 2;
    }
    else if (count > 3 && _buffer[0] == 0x84 && _buffer[1] == 0x31 && _buffer[2] == 0x95 && _buffer[3] == 0x33)
    {
        _encoding = TextEncoding.Gb18030;
        offset = 4;
    }

    if (offset > 0)
    {
        // Drop the BOM by shifting the payload to the start of the buffer.
        count -= offset;
        Array.Copy(_buffer, offset, _buffer, 0, count);
        _decoder = _encoding.GetDecoder();
        _confidence = EncodingConfidence.Certain;
    }

    AppendContentFromBuffer(count);
}
// Encoding
// Sets Encoding and EncodingConfidence, and initializes the text reader.
// enc:  the encoding stored in this.Encoding and used for the new StreamReader.
// conf: the confidence stored in this.EncodingConfidence.
public void SetEncoding(Encoding enc, EncodingConfidence conf) {
    if(myTextReader != null){
        // TODO: throw an exception here - the encoding cannot be changed afterwards;
        // the InputStream has to be re-initialized instead. Until that is done, an
        // existing reader is simply replaced below.
    }
    this.Encoding = enc;
    this.EncodingConfidence = conf;
    // The new reader is created over the same underlying stream (myStream).
    myTextReader = new StreamReader(myStream, this.Encoding);
}