/// <summary>
/// Reads <paramref name="stream"/> to its end while feeding the bytes to a charset
/// detector, then decodes everything that was read and returns it as text.
/// </summary>
/// <param name="stream">Stream to read from its current position. It is not closed here.</param>
/// <param name="default_encoding">
/// Encoding used when the detector reports no charset. When null, <c>Default</c> is used.
/// </param>
/// <returns>The decoded content of the stream.</returns>
/// <remarks>
/// <c>Encoding.GetEncoding</c> is called with the detected charset name and may throw
/// if the platform does not know that name.
/// </remarks>
public static string DetectAndReadToEnd(Stream stream, Encoding default_encoding)
{
    const int BufferSize = 4096;
    byte[] buffer = new byte[BufferSize];
    UniversalDetector detector = new UniversalDetector(null);
    bool detectDone = false;

    using (var ms = new MemoryStream())
    {
        int cur;
        while ((cur = stream.Read(buffer, 0, BufferSize)) > 0)
        {
            // Keep a copy of every byte so the text can be decoded after detection.
            ms.Write(buffer, 0, cur);

            // Stop feeding the detector once it has made up its mind.
            if (!detectDone)
            {
                detector.HandleData(buffer, 0, cur);
                detectDone = detector.IsDone();
            }
        }
        detector.DataEnd();

        // Ask for the charset only after DataEnd(): the detector may settle on its
        // best guess there even though IsDone() never became true while streaming.
        string charset = detector.GetDetectedCharset();

        Encoding encoding;
        if (!string.IsNullOrEmpty(charset))
        {
            encoding = Encoding.GetEncoding(charset);
        }
        else if (default_encoding != null)
        {
            encoding = default_encoding;
        }
        else
        {
            encoding = Default;
        }

        ms.Seek(0, SeekOrigin.Begin);
        using (var sr = new StreamReader(ms, encoding))
        {
            return sr.ReadToEnd();
        }
    }
}
/// <summary>
/// Detects the character encoding of a seekable stream by reading it from its current
/// position, then puts the stream back at the position it had on entry.
/// </summary>
/// <param name="seekable_stream">Stream to inspect; must support seeking.</param>
/// <returns>The detected encoding, or null when the detector reports no charset.</returns>
/// <exception cref="NotSupportedException">The stream cannot seek.</exception>
/// <remarks>
/// <c>Encoding.GetEncoding</c> is called with the detected charset name and may throw
/// if the platform does not know that name.
/// </remarks>
public static Encoding Detect(Stream seekable_stream)
{
    if (!seekable_stream.CanSeek)
    {
        throw new NotSupportedException("Detect encoding error: stream can't seek.");
    }

    const int BufferSize = 4096;
    long originalPosition = seekable_stream.Position;
    byte[] buffer = new byte[BufferSize];
    UniversalDetector detector = new UniversalDetector(null);

    try
    {
        int cur;
        while ((cur = seekable_stream.Read(buffer, 0, BufferSize)) > 0 && !detector.IsDone())
        {
            detector.HandleData(buffer, 0, cur);
        }
        detector.DataEnd();
    }
    finally
    {
        // Restore the caller's position even if reading or detection failed.
        seekable_stream.Seek(originalPosition, SeekOrigin.Begin);
    }

    // Ask for the charset after DataEnd(): the detector may settle on its best guess
    // there even though IsDone() is still false.
    string charset = detector.GetDetectedCharset();
    return string.IsNullOrEmpty(charset) ? null : Encoding.GetEncoding(charset);
}
/// <summary>Gets the character encoding of a file</summary>
/// <param name="File">The absolute path to a file</param>
/// <returns>
/// The encoding indicated by a leading byte order mark; otherwise Shift_JIS when the
/// charset detector reports "SHIFT_JIS"; otherwise unknown. Unknown is also returned
/// when the file cannot be read or detection fails.
/// </returns>
internal static Encoding GetEncodingFromFile(string File)
{
    try
    {
        byte[] Data = System.IO.File.ReadAllBytes(File);

        // Test the 4-byte marks first: the UTF-32 LE mark (FF FE 00 00) begins with
        // the UTF-16 LE mark (FF FE), so checking UTF-16 first would hide UTF-32 LE.
        if (Data.Length >= 4)
        {
            if (Data[0] == 0x00 && Data[1] == 0x00 && Data[2] == 0xFE && Data[3] == 0xFF)
            {
                return Encoding.Utf32Be;
            }
            if (Data[0] == 0xFF && Data[1] == 0xFE && Data[2] == 0x00 && Data[3] == 0x00)
            {
                return Encoding.Utf32Le;
            }
        }

        if (Data.Length >= 3)
        {
            if (Data[0] == 0xEF && Data[1] == 0xBB && Data[2] == 0xBF)
            {
                return Encoding.Utf8;
            }
            if (Data[0] == 0x2b && Data[1] == 0x2f && Data[2] == 0x76)
            {
                return Encoding.Utf7;
            }
        }

        if (Data.Length >= 2)
        {
            if (Data[0] == 0xFE && Data[1] == 0xFF)
            {
                return Encoding.Utf16Be;
            }
            if (Data[0] == 0xFF && Data[1] == 0xFE)
            {
                return Encoding.Utf16Le;
            }
        }

        // No byte order mark: fall back to statistical detection over the whole file.
        UniversalDetector Det = new UniversalDetector(null);
        Det.HandleData(Data, 0, Data.Length);
        Det.DataEnd();
        switch (Det.GetDetectedCharset())
        {
            case "SHIFT_JIS":
                return Encoding.Shift_JIS;
        }
        return Encoding.Unknown;
    }
    catch
    {
        // Best effort: any I/O or detection failure is reported as an unknown encoding.
        return Encoding.Unknown;
    }
}
/// <summary>
/// Click handler: downloads the page at the URL typed into UrlBox, runs charset
/// detection over the downloaded bytes and shows the detected charset in CharSetBox
/// and the decoded page in PageBox.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Check_Click(object sender, EventArgs e)
{
    CharSetBox.Text = "";
    PageBox.Text = "";

    HttpWebResponse res;
    try
    {
        // Creating the request is inside the try as well, so a malformed URL is
        // reported like any other fetch failure instead of crashing the handler.
        HttpWebRequest hwr = (HttpWebRequest)HttpWebRequest.Create(UrlBox.Text);
        res = (HttpWebResponse)hwr.GetResponse();
    }
    catch
    {
        CharSetBox.Text = "网页获取错误!";
        return;
    }

    byte[] PageBytes;
    using (res)
    {
        if (res.StatusCode != HttpStatusCode.OK)
        {
            return;
        }

        using (Stream mystream = res.GetResponseStream())
        using (MemoryStream msTemp = new MemoryStream())
        {
            byte[] buff = new byte[512];
            int len;
            while ((len = mystream.Read(buff, 0, buff.Length)) > 0)
            {
                msTemp.Write(buff, 0, len);
            }
            PageBytes = msTemp.ToArray();
        }
    }

    if (PageBytes.Length == 0)
    {
        return;
    }

    UniversalDetector Det = new UniversalDetector(null);
    Det.HandleData(PageBytes, 0, PageBytes.Length);
    Det.DataEnd();

    string charset = Det.GetDetectedCharset();
    if (charset != null)
    {
        CharSetBox.Text = "OK! CharSet=" + charset;
        try
        {
            PageBox.Text = System.Text.Encoding.GetEncoding(charset).GetString(PageBytes);
        }
        catch (ArgumentException)
        {
            // The detector named a charset this platform has no Encoding for;
            // keep the charset name on screen and leave the page box empty.
        }
    }
}
/// <summary>
/// Identifies the encoding using the UniversalCharDet algorithm.
/// </summary>
/// <param name="bytes">
/// Not read by this method; the data is taken from <c>this.MemoryStream</c>.
/// NOTE(review): presumably holds the same content as the stream - confirm with callers.
/// </param>
/// <returns>The detected encoding, or null when the detector reports no charset.</returns>
private Encoding GetEncodingByUniversalCharDet(byte[] bytes)
{
    var detector = new UniversalDetector(null);
    var detectBuffer = new byte[4096];

    int read;
    while ((read = this.MemoryStream.Read(detectBuffer, 0, detectBuffer.Length)) > 0 && !detector.IsDone())
    {
        // Feed only the bytes actually read; the tail of the buffer still holds
        // data from the previous chunk (or zeros) after a short read.
        detector.HandleData(detectBuffer, 0, read);
    }
    detector.DataEnd();

    string charset = detector.GetDetectedCharset();
    if (!string.IsNullOrEmpty(charset))
    {
        return Encoding.GetEncoding(charset);
    }
    return null;
}
/// <summary>
/// Click handler: downloads the page at the URL typed into UrlBox without blocking
/// the UI thread, detects its charset and shows the charset in CharSetBox and up to
/// the first 2000 characters of the decoded page in PageBox. The button is disabled
/// while the handler runs and re-enabled when it finishes.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private async void button_Click(object sender, RoutedEventArgs e)
{
    CharSetBox.Text = "";
    PageBox.Text = "";
    button.IsEnabled = false;
    try
    {
        HttpWebRequest hwr = (HttpWebRequest)HttpWebRequest.Create(UrlBox.Text);
        HttpWebResponse res;
        try
        {
            res = (HttpWebResponse)await hwr.GetResponseAsync();
        }
        catch
        {
            CharSetBox.Text = "网页获取错误!";
            return;
        }

        byte[] PageBytes;
        using (res)
        {
            if (res.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            using (Stream mystream = res.GetResponseStream())
            using (MemoryStream msTemp = new MemoryStream())
            {
                // Asynchronous copy so the network read does not freeze the UI.
                await mystream.CopyToAsync(msTemp);
                PageBytes = msTemp.ToArray();
            }
        }

        if (PageBytes.Length == 0)
        {
            return;
        }

        UniversalDetector Det = new UniversalDetector(null);
        using (MemoryStream detectStream = new MemoryStream(PageBytes))
        {
            byte[] DetectBuff = new byte[4096];
            int DetLen;
            while ((DetLen = detectStream.Read(DetectBuff, 0, DetectBuff.Length)) > 0 && !Det.IsDone())
            {
                // Feed only the bytes actually read; the last chunk is usually
                // shorter than the buffer and its tail holds stale data.
                Det.HandleData(DetectBuff, 0, DetLen);
            }
        }
        Det.DataEnd();

        string charset = Det.GetDetectedCharset();
        if (charset != null)
        {
            CharSetBox.Text = "OK! CharSet=" + charset;
            string page = System.Text.Encoding.GetEncoding(charset).GetString(PageBytes);
            if (page.Length > 2000)
            {
                page = page.Substring(0, 2000);
            }
            PageBox.Text = page;
        }
    }
    catch (System.Exception ex)
    {
        // An exception escaping an async void handler would take the application
        // down, so catch everything here - but show the reason instead of hiding it.
        CharSetBox.Text = ex.Message;
    }
    finally
    {
        button.IsEnabled = true;
    }
}