/// <summary>
/// Reads <paramref name="stream"/> to its end, feeding the bytes to a charset detector
/// along the way, and returns the whole content decoded as text.
/// </summary>
/// <param name="stream">Source stream; only <c>Read</c> is called on it, until it returns 0.</param>
/// <param name="default_encoding">
/// Encoding used when the detector reports no charset. May be null, in which case
/// the <c>Default</c> member visible from this type is used.
/// </param>
/// <returns>The decoded content of the stream.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="stream"/> is null.</exception>
/// <exception cref="System.ArgumentException">
/// The detected charset name is not accepted by <c>Encoding.GetEncoding</c>.
/// </exception>
public static string DetectAndReadToEnd(Stream stream, Encoding default_encoding)
{
    if (stream == null)
        throw new ArgumentNullException("stream");

    const int buffer_size = 4096;
    byte[] buffer = new byte[buffer_size];
    UniversalDetector detector = new UniversalDetector(null);
    bool detect_done = false;

    // The payload is buffered in memory so it can be decoded after detection finishes.
    var ms = new MemoryStream();
    int cur;
    while ((cur = stream.Read(buffer, 0, buffer_size)) > 0)
    {
        ms.Write(buffer, 0, cur);

        // Stop feeding the detector once it reports it is done; keep copying the data.
        if (!detect_done)
        {
            detector.HandleData(buffer, 0, cur);
            detect_done = detector.IsDone();
        }
    }
    detector.DataEnd();

    // Ask for the charset only after DataEnd(): the detector may settle on its best
    // candidate at that point without IsDone() ever having been true inside the loop.
    string charset = detector.GetDetectedCharset();

    Encoding encoding;
    if (!string.IsNullOrEmpty(charset))
        encoding = Encoding.GetEncoding(charset);
    else if (default_encoding != null)
        encoding = default_encoding;
    else
        encoding = Default;

    ms.Seek(0, SeekOrigin.Begin);

    // Disposing the reader also disposes the underlying MemoryStream.
    using (var sr = new StreamReader(ms, encoding))
        return sr.ReadToEnd();
}
/// <summary>
/// Detects the text encoding of a seekable stream by reading from its current position,
/// then moves the stream back to the position it had on entry.
/// </summary>
/// <param name="seekable_stream">Stream to inspect; must support seeking.</param>
/// <returns>The detected encoding, or null when the detector reports no charset.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="seekable_stream"/> is null.</exception>
/// <exception cref="System.NotSupportedException">The stream cannot seek.</exception>
/// <exception cref="System.ArgumentException">
/// The detected charset name is not accepted by <c>Encoding.GetEncoding</c>.
/// </exception>
public static Encoding Detect(Stream seekable_stream)
{
    if (seekable_stream == null)
        throw new ArgumentNullException("seekable_stream");
    if (!seekable_stream.CanSeek)
        throw new NotSupportedException("Detect encoding error: stream can't seek.");

    long ori_pos = seekable_stream.Position;
    UniversalDetector detector = new UniversalDetector(null);

    try
    {
        byte[] buffer = new byte[4096];
        int cur;

        // Checking IsDone() first avoids one pointless read after the detector has decided.
        while (!detector.IsDone() && (cur = seekable_stream.Read(buffer, 0, buffer.Length)) > 0)
            detector.HandleData(buffer, 0, cur);

        detector.DataEnd();
    }
    finally
    {
        // Put the stream back where the caller left it, even if reading failed.
        seekable_stream.Seek(ori_pos, SeekOrigin.Begin);
    }

    // Query the charset rather than IsDone(): the detector may pick its best candidate
    // during DataEnd() without flagging itself as done.
    string charset = detector.GetDetectedCharset();
    if (string.IsNullOrEmpty(charset))
        return null;

    return Encoding.GetEncoding(charset);
}
/// <summary>
/// Detects the encoding using the UniversalCharDet algorithm.
/// </summary>
/// <param name="bytes">
/// Not read by this method; detection runs over <c>this.MemoryStream</c> from its current position.
/// NOTE(review): presumably that stream wraps these bytes — confirm against callers.
/// </param>
/// <returns>The detected encoding, or null when the detector reports no charset.</returns>
private Encoding GetEncodingByUniversalCharDet(byte[] bytes)
{
    var detector = new UniversalDetector(null);
    var detectBuffer = new byte[4096];

    int read;
    while ((read = this.MemoryStream.Read(detectBuffer, 0, detectBuffer.Length)) > 0 && !detector.IsDone())
    {
        // Hand over only the bytes actually read; a short final read would otherwise
        // feed leftover bytes from the previous chunk (or zeros) into the detector.
        detector.HandleData(detectBuffer, 0, read);
    }
    detector.DataEnd();

    string charset = detector.GetDetectedCharset();
    if (string.IsNullOrEmpty(charset))
    {
        return null;
    }

    return Encoding.GetEncoding(charset);
}
/// <summary>
/// Click handler: downloads the page at the URL in <c>UrlBox</c>, detects its charset,
/// and shows the charset in <c>CharSetBox</c> and up to the first 2000 decoded
/// characters in <c>PageBox</c>. The button is disabled while the handler runs.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private async void button_Click(object sender, RoutedEventArgs e)
{
    CharSetBox.Text = "";
    PageBox.Text = "";
    button.IsEnabled = false;
    try
    {
        HttpWebRequest hwr = (HttpWebRequest)HttpWebRequest.Create(UrlBox.Text);
        HttpWebResponse res;
        try
        {
            res = (HttpWebResponse)await hwr.GetResponseAsync();
        }
        catch
        {
            CharSetBox.Text = "网页获取错误!";
            return;
        }

        byte[] PageBytes;

        // Dispose the response on every path, including non-OK statuses and read failures.
        using (res)
        {
            if (res.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            using (Stream mystream = res.GetResponseStream())
            using (MemoryStream msTemp = new MemoryStream())
            {
                // Copy asynchronously so the UI thread is not blocked during the download.
                await mystream.CopyToAsync(msTemp);
                PageBytes = msTemp.ToArray();
            }
        }

        if (PageBytes.Length == 0)
        {
            return;
        }

        UniversalDetector Det = new UniversalDetector(null);
        byte[] DetectBuff = new byte[4096];
        using (MemoryStream detectSource = new MemoryStream(PageBytes, false))
        {
            int DetLen;
            while ((DetLen = detectSource.Read(DetectBuff, 0, DetectBuff.Length)) > 0 && !Det.IsDone())
            {
                // Pass the number of bytes actually read; the last chunk is usually
                // shorter than the buffer and the tail would hold stale data.
                Det.HandleData(DetectBuff, 0, DetLen);
            }
        }
        Det.DataEnd();

        string charset = Det.GetDetectedCharset();
        if (charset != null)
        {
            CharSetBox.Text = "OK! CharSet=" + charset;
            string page = System.Text.Encoding.GetEncoding(charset).GetString(PageBytes);
            if (page.Length > 2000)
            {
                page = page.Substring(0, 2000);
            }
            PageBox.Text = page;
        }
    }
    catch (System.Exception ex)
    {
        // An async void handler must not let exceptions escape, so keep the catch-all,
        // but record the failure instead of dropping it silently.
        System.Diagnostics.Debug.WriteLine(ex);
        if (string.IsNullOrEmpty(CharSetBox.Text))
        {
            CharSetBox.Text = ex.Message;
        }
    }
    finally
    {
        button.IsEnabled = true;
    }
}