/// <summary>
/// Detects the character encoding of a seekable stream, reading from its current position,
/// and puts the stream back at that position before returning.
/// </summary>
/// <param name="seekable_stream">Stream to inspect; it must support seeking.</param>
/// <returns>The detected encoding, or <c>null</c> when the detector reports no charset.</returns>
/// <exception cref="ArgumentNullException"><paramref name="seekable_stream"/> is null.</exception>
/// <exception cref="NotSupportedException">The stream cannot seek.</exception>
public static Encoding Detect(Stream seekable_stream)
{
    if (seekable_stream == null)
    {
        throw new ArgumentNullException("seekable_stream");
    }

    if (!seekable_stream.CanSeek)
    {
        throw new NotSupportedException("Detect encoding error: stream can't seek.");
    }

    long ori_pos = seekable_stream.Position;
    UniversalDetector detector = new UniversalDetector(null);

    try
    {
        byte[] buffer = new byte[4096];
        int cur;

        // Ask the detector first so no chunk is read only to be thrown away.
        while (!detector.IsDone() && (cur = seekable_stream.Read(buffer, 0, buffer.Length)) > 0)
        {
            detector.HandleData(buffer, 0, cur);
        }

        detector.DataEnd();
    }
    finally
    {
        // Restore the caller's position even when reading or detection fails.
        seekable_stream.Seek(ori_pos, SeekOrigin.Begin);
    }

    // NOTE(review): the previous version returned null unless IsDone() was true. In the
    // juniversalchardet-derived ports DataEnd() can settle on a best-guess charset without
    // setting the done flag, so the detected charset itself is checked here - confirm
    // against the detector version in use.
    string charset = detector.GetDetectedCharset();
    if (string.IsNullOrEmpty(charset))
    {
        return null;
    }

    return Encoding.GetEncoding(charset);
}
/// <summary>
/// Detects the encoding of a stream by scanning it from the beginning; the stream's
/// original position is restored afterwards.
/// </summary>
/// <param name="stream">Stream to inspect. Its Length and Position are used, so it has to be seekable.</param>
/// <returns>
/// The detected encoding, or <see cref="Encoding.Default"/> when the stream is null, empty,
/// or no charset was detected.
/// </returns>
private static Encoding GetEncoding(Stream stream)
{
    if (stream == null || stream.Length <= 0)
    {
        return Encoding.Default;
    }

    // Feed the detector in 1024-byte chunks.
    var buffer = new byte[1024];
    long originalPosition = stream.Position;
    var ud = new UniversalDetector(null);

    try
    {
        stream.Seek(0, SeekOrigin.Begin);

        int read;
        while (!ud.IsDone() && (read = stream.Read(buffer, 0, buffer.Length)) > 0)
        {
            // Pass only the bytes actually read; the tail of the buffer holds stale data
            // (or zeros) whenever Read returns a short chunk.
            ud.HandleData(buffer, 0, read);
        }

        ud.DataEnd();
    }
    finally
    {
        stream.Seek(originalPosition, SeekOrigin.Begin);
    }

    string charset = ud.GetDetectedCharset();
    if (charset == null)
    {
        return Encoding.Default;
    }

    // The two unusual UCS-4 byte orders are mapped to "UTF-32" before the lookup.
    if (charset == Constants.CHARSET_X_ISO_10646_UCS_4_2143 || charset == Constants.CHARSET_X_ISO_10646_UCS_4_3412)
    {
        charset = "UTF-32";
    }

    return Encoding.GetEncoding(charset);
}
/// <summary>
/// Reads a stream to its end, detects the text encoding while reading, and returns the
/// content decoded as a string.
/// </summary>
/// <param name="stream">Stream to read; it is consumed from its current position to the end.</param>
/// <param name="default_encoding">Encoding used when no charset is detected; may be null.</param>
/// <returns>The decoded text.</returns>
/// <remarks>
/// When no charset is detected and <paramref name="default_encoding"/> is null, the
/// <c>Default</c> member of the enclosing scope (declared outside this block) is used.
/// </remarks>
public static string DetectAndReadToEnd(Stream stream, Encoding default_encoding)
{
    using (var ms = new MemoryStream())
    {
        byte[] buffer = new byte[4096];
        int cur;
        bool detect_done = false;
        UniversalDetector detector = new UniversalDetector(null);

        while ((cur = stream.Read(buffer, 0, buffer.Length)) > 0)
        {
            ms.Write(buffer, 0, cur);

            // Stop feeding the detector once it is certain, but keep buffering the data.
            if (!detect_done)
            {
                detector.HandleData(buffer, 0, cur);
                detect_done = detector.IsDone();
            }
        }

        detector.DataEnd();

        // NOTE(review): the previous version trusted only the IsDone() flag sampled inside
        // the loop, so a charset chosen by DataEnd() was ignored and the default was used
        // instead. The detected charset is now checked directly - confirm against the
        // detector version in use.
        string charset = detector.GetDetectedCharset();

        Encoding encoding;
        if (!string.IsNullOrEmpty(charset))
        {
            encoding = Encoding.GetEncoding(charset);
        }
        else if (default_encoding != null)
        {
            encoding = default_encoding;
        }
        else
        {
            encoding = Default;
        }

        ms.Seek(0, SeekOrigin.Begin);
        using (var sr = new StreamReader(ms, encoding))
        {
            return sr.ReadToEnd();
        }
    }
}
/// <summary>
/// Detects an encoding with the UniversalCharDet algorithm.
/// </summary>
/// <param name="bytes">
/// Not read by this method; the data is taken from <c>this.MemoryStream</c>.
/// NOTE(review): presumably that stream wraps these bytes - confirm against the caller.
/// </param>
/// <returns>The detected encoding, or <c>null</c> when the detector reports no charset.</returns>
private Encoding GetEncodingByUniversalCharDet(byte[] bytes)
{
    var detector = new UniversalDetector(null);
    var detectBuffer = new byte[4096];
    int read;

    // The read-then-check order is kept so the member stream ends at the same position as before.
    while ((read = this.MemoryStream.Read(detectBuffer, 0, detectBuffer.Length)) > 0 && !detector.IsDone())
    {
        // Only the bytes just read are valid; the rest of the buffer is stale or zero-filled.
        detector.HandleData(detectBuffer, 0, read);
    }

    detector.DataEnd();

    string charset = detector.GetDetectedCharset();
    if (string.IsNullOrEmpty(charset))
    {
        return null;
    }

    return Encoding.GetEncoding(charset);
}
/// <summary>
/// Detects the charset of a file and prints the charset name and the matching .NET
/// encoding name to the console. Nothing is printed for an empty file.
/// </summary>
/// <param name="filePath">Path of the file to inspect.</param>
static void ProcessFile(String filePath)
{
    // Read access is enough for detection; the using block closes the handle on every path.
    using (var fileStream = new FileStream(filePath, FileMode.Open, FileAccess.Read))
    {
        if (fileStream.Length <= 0)
        {
            return;
        }

        var detectionBuffer = new Byte[4096];
        var universalDetector = new UniversalDetector(null);
        int detectionLength;

        while (!universalDetector.IsDone() && (detectionLength = fileStream.Read(detectionBuffer, 0, detectionBuffer.Length)) > 0)
        {
            // Feed only the bytes actually read, not the whole buffer.
            universalDetector.HandleData(detectionBuffer, 0, detectionLength);
        }

        universalDetector.DataEnd();

        string charset = universalDetector.GetDetectedCharset();
        if (charset != null)
        {
            Console.WriteLine("Charset: " + charset + ". Encoding: " + System.Text.Encoding.GetEncoding(charset).EncodingName);
            Console.WriteLine();
        }
        else
        {
            // No charset reported: print ASCII. EncodingName is used here as well so both
            // branches print a name; the previous code printed the Encoding object itself.
            Console.WriteLine("Charset: " + "ASCII" + ". Encoding: " + System.Text.Encoding.GetEncoding("ASCII").EncodingName);
            Console.WriteLine();
        }
    }
}
/// <summary>
/// Returns the text encoding detected for a file.
/// </summary>
/// <param name="streamName">Path of the file to open and inspect.</param>
/// <returns>
/// The detected encoding, or <see cref="Encoding.Default"/> when the file is empty or no
/// charset was detected.
/// </returns>
private static Encoding getEncoding(string streamName)
{
    Encoding encoding = Encoding.Default;

    // The detector is fed straight from the file; buffering the whole file in memory first
    // served no purpose.
    using (Stream stream = new FileStream(streamName, FileMode.Open, FileAccess.Read))
    {
        UniversalDetector detector = new UniversalDetector(null);
        byte[] detectBuff = new byte[4096];
        int detLen;

        while (!detector.IsDone() && (detLen = stream.Read(detectBuff, 0, detectBuff.Length)) > 0)
        {
            // Only the bytes just read are passed on, never the stale tail of the buffer.
            detector.HandleData(detectBuff, 0, detLen);
        }

        detector.DataEnd();

        string charset = detector.GetDetectedCharset();
        if (charset != null)
        {
            encoding = Encoding.GetEncoding(charset);
        }
    }

    return encoding;
}
/// <summary>
/// Downloads a page with HttpWebRequest and decodes it to a string. When no encoding name
/// is supplied the charset is detected from the response body; if nothing is detected,
/// UTF-8 is used.
/// </summary>
/// <param name="url">Address to fetch; it must contain "http".</param>
/// <param name="encoding">Name of the encoding to decode with; null or "" requests auto-detection.</param>
/// <returns>The decoded response body.</returns>
/// <exception cref="System.ArgumentException">
/// <paramref name="url"/> does not contain "http", or the encoding name is not supported.
/// </exception>
private static string GetWeb(string url, string encoding)
{
    // Reject addresses without an HTTP scheme.
    if (url.IndexOf("http", StringComparison.Ordinal) == -1)
    {
        throw new ArgumentException("请提供完整的HTTP地址");
    }

    System.Net.HttpWebRequest myrequest = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
    myrequest.Timeout = 600000; // 10 minutes
    // Ask intermediaries not to serve a cached copy.
    myrequest.Headers.Set("Pragma", "no-cache");

    // Buffer used to hold the complete response body.
    using (System.IO.MemoryStream memoryStream = new System.IO.MemoryStream())
    {
        // Response and its stream are released as soon as the body has been copied.
        using (System.Net.HttpWebResponse myresponse = (System.Net.HttpWebResponse)myrequest.GetResponse())
        using (System.IO.Stream mystream = myresponse.GetResponseStream())
        {
            byte[] buff = new byte[512];
            int len;
            while ((len = mystream.Read(buff, 0, buff.Length)) > 0)
            {
                memoryStream.Write(buff, 0, len);
            }
        }

        memoryStream.Seek(0, System.IO.SeekOrigin.Begin);

        // Detection is only needed when the caller did not name an encoding.
        if (string.IsNullOrEmpty(encoding))
        {
            string detected = null;

            if (memoryStream.Length > 0)
            {
                byte[] detectBuff = new byte[4096];
                int detLen;
                UniversalDetector det = new UniversalDetector(null);

                while (!det.IsDone() && (detLen = memoryStream.Read(detectBuff, 0, detectBuff.Length)) > 0)
                {
                    // Feed only the bytes read in this pass.
                    det.HandleData(detectBuff, 0, detLen);
                }

                det.DataEnd();
                detected = det.GetDetectedCharset();

                // Rewind so the reader below starts from the first byte.
                memoryStream.Seek(0, System.IO.SeekOrigin.Begin);
            }

            // Fall back to UTF-8; previously an empty name reached Encoding.GetEncoding
            // and threw.
            encoding = detected ?? "utf-8";
        }

        System.Text.Encoding myencoding = System.Text.Encoding.GetEncoding(encoding);
        using (System.IO.StreamReader mystreamreader = new System.IO.StreamReader(memoryStream, myencoding))
        {
            return mystreamreader.ReadToEnd();
        }
    }
}
/// <summary>
/// Issues an HTTP GET request, decodes the body using a detected charset (GBK when none is
/// detected), appends a diagnostic trailer (URL, status code, cookies, request and response
/// headers) and optionally follows Location / Refresh redirects by calling itself.
/// </summary>
/// <param name="url">Address to request; it is first passed through <c>getDealUrl</c>.</param>
/// <param name="cookies">Cookie collection sent with the request and updated from the response.</param>
/// <param name="refrere">Value of the Referer header.</param>
/// <param name="encoding">
/// Described by the original author as 1 = GBK, 2 = UTF-8, 3 = auto. The body below never
/// reads it; decoding is always automatic. Recursive calls pass 3.
/// </param>
/// <param name="timeout">Request timeout in seconds (multiplied by 1000 for the request). Recursive calls pass 10.</param>
/// <param name="isRedirect">When true, Location / Refresh headers and meta Refresh tags are followed.</param>
/// <returns>
/// The decoded body followed by the diagnostic trailer, prefixed by the result of any
/// followed redirect. When an exception is thrown, the exception message is returned instead.
/// </returns>
public string httpGET(string url, ref CookieCollection cookies, string refrere, int encoding, int timeout, bool isRedirect)
{
    url = getDealUrl(url);
    Stream stream = null;
    HttpWebResponse httpWebResponse = null;
    HttpWebRequest httpWebRequest = null;
    string result;
    try
    {
        ServicePointManager.Expect100Continue = false;
        ServicePointManager.DefaultConnectionLimit = 1000;
        ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
        httpWebRequest = (HttpWebRequest)WebRequest.Create(url);
        httpWebRequest.Headers.Clear();
        httpWebRequest.AutomaticDecompression = DecompressionMethods.GZip;
        httpWebRequest.CookieContainer = xkCookies.CookieContainer(cookies, url);
        httpWebRequest.KeepAlive = true;
        httpWebRequest.ProtocolVersion = HttpVersion.Version10;
        httpWebRequest.Method = "GET";
        httpWebRequest.Referer = refrere;
        httpWebRequest.Timeout = timeout * 1000;
        // Redirects are handled manually further down so cookies can be tracked per hop.
        httpWebRequest.AllowAutoRedirect = false;
        httpWebRequest.Accept = "image/jpeg, application/x-ms-application, image/gif, application/xaml+xml, image/pjpeg, application/x-ms-xbap, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*";
        httpWebRequest.Headers.Add("Accept-Language", "zh-cn");
        httpWebRequest.UserAgent = useragent;
        // Snapshot of the request headers for the diagnostic trailer.
        string text = httpWebRequest.Headers.ToString();
        httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse();
        stream = httpWebResponse.GetResponseStream();
        xkCookies.UpCookie(ref cookies, url, httpWebResponse.Headers["Set-Cookie"], httpWebResponse.Cookies);
        string tmp_result = "";
        if (httpWebResponse.ContentEncoding.ToLower().Contains("gzip"))
        {
            stream = new GZipStream(stream, CompressionMode.Decompress);
        }
        else
        {
            if (httpWebResponse.ContentEncoding.ToLower().Contains("deflate"))
            {
                stream = new DeflateStream(stream, CompressionMode.Decompress);
            }
        }

        // NOTE(review): the body is read from a fresh GetResponseStream() call, not from
        // the (possibly decompressing) `stream` set up above, so those wrappers are only
        // closed in the finally block. Left unchanged - confirm whether that is intended.
        Stream mystream = httpWebResponse.GetResponseStream();
        MemoryStream msTemp = new MemoryStream();
        int len = 0;
        byte[] buff = new byte[512];
        while ((len = mystream.Read(buff, 0, 512)) > 0)
        {
            msTemp.Write(buff, 0, len);
        }
        httpWebResponse.Close();

        if (msTemp.Length > 0)
        {
            msTemp.Seek(0, SeekOrigin.Begin);
            byte[] PageBytes = new byte[msTemp.Length];
            msTemp.Read(PageBytes, 0, PageBytes.Length);
            msTemp.Seek(0, SeekOrigin.Begin);
            int DetLen = 0;
            byte[] DetectBuff = new byte[4096];
            UniversalDetector Det = new UniversalDetector(null);
            while ((DetLen = msTemp.Read(DetectBuff, 0, DetectBuff.Length)) > 0 && !Det.IsDone())
            {
                // Fix: feed only the DetLen bytes just read. Passing the full buffer length
                // handed stale or zero bytes to the detector on every short read.
                Det.HandleData(DetectBuff, 0, DetLen);
            }
            Det.DataEnd();
            if (Det.GetDetectedCharset() != null)
            {
                tmp_result = System.Text.Encoding.GetEncoding(Det.GetDetectedCharset()).GetString(PageBytes);
            }
            else
            {
                // No charset detected: decode as GBK.
                tmp_result = System.Text.Encoding.GetEncoding("GBK").GetString(PageBytes);
            }
        }

        // Append the diagnostic trailer to the decoded body.
        tmp_result = string.Concat(new object[] { tmp_result, "\r\n\r\n=================================================\r\n\r\n本次请求:", url, " 响应结果:", httpWebResponse.StatusCode, "\r\n\r\nCookie数量", httpWebRequest.CookieContainer.Count, "\r\n", httpWebRequest.CookieContainer.GetCookieHeader(new Uri(url)), "\r\nrequest:\r\n", text, "\r\nresponse:\r\n", httpWebResponse.Headers.ToString(), "\r\n\r\n=================================================\r\n\r\n" });

        if (isRedirect)
        {
            if (httpWebResponse.Headers["Location"] != null && httpWebResponse.Headers["Location"].Length > 2)
            {
                string url_redirect = "";
                // NOTE(review): only "http://" counts as absolute here; any other target
                // (including "https://") is passed to geturl - confirm geturl handles it.
                if (httpWebResponse.Headers["Location"].ToLower().Contains("http://"))
                {
                    url_redirect = httpWebResponse.Headers["Location"];
                }
                else
                {
                    url_redirect = geturl(httpWebResponse.Headers["Location"], url);
                }
                tmp_result = httpGET(url_redirect, ref cookies, url, 3, 10, isRedirect) + tmp_result;
            }
            else
            {
                if (httpWebResponse.Headers["Refresh"] != null && httpWebResponse.Headers["Refresh"].Length > 2)
                {
                    // Take whatever follows "url=" in the Refresh header.
                    string text3 = httpWebResponse.Headers["Refresh"].ToLower().Replace("url=", "`").Split('`')[1];
                    if (!text3.Contains("http://"))
                    {
                        text3 = geturl(text3, url);
                    }
                    tmp_result = httpGET(text3, ref cookies, url, 3, 10, isRedirect) + tmp_result;
                }
            }

            // Look for <meta http-equiv="Refresh"> tags in the accumulated text.
            if (tmp_result.Contains("Refresh"))
            {
                Winista.Text.HtmlParser.Util.NodeList htmlNodes = new Parser(new Lexer(tmp_result)).Parse(new TagNameFilter("meta"));
                if (htmlNodes.Count > 1)
                {
                    for (int i = 0; i < htmlNodes.Count; i++)
                    {
                        MetaTag option = (MetaTag)htmlNodes.ElementAt(i);
                        if (option.GetAttribute("http-equiv") == "Refresh")
                        {
                            string content = option.GetAttribute("content");
                            string text3 = content.ToLower().Replace("url=", "`").Split('`')[1];
                            if (!text3.Contains("http://"))
                            {
                                text3 = geturl(text3, url);
                            }
                            tmp_result = httpGET(text3, ref cookies, url, 3, 10, isRedirect) + tmp_result;
                        }
                    }
                }
            }
        }

        httpWebResponse.Close();
        httpWebRequest.Abort();
        result = tmp_result;

        // Success is not echoed for URLs containing any of these markers.
        if (!url.Contains(":8888") && !url.Contains("renzhe") && !url.Contains("zq535228") && !url.Contains("whoissoft") && !url.Contains("chinaz"))
        {
            EchoHelper.Echo(string.Format("成功获取:{0}的HTML内容。", url), null, EchoHelper.EchoType.普通信息);
        }
    }
    catch (Exception ex)
    {
        // Failures are reported to the caller through the return value.
        result = ex.Message;
    }
    finally
    {
        if (stream != null)
        {
            stream.Close();
        }
        if (httpWebResponse != null)
        {
            httpWebResponse.Close();
        }
        if (httpWebRequest != null)
        {
            httpWebRequest.Abort();
        }
    }
    return result;
}
/// <summary>
/// Downloads a page with HttpWebRequest and decodes it to a string. When no encoding name
/// is supplied the charset is detected from the response body; if nothing is detected,
/// UTF-8 is used. Errors are written to the debug output and an empty string is returned.
/// </summary>
/// <param name="url">Address to fetch; it must contain "http".</param>
/// <param name="encoding">Name of the encoding to decode with; null or "" requests auto-detection.</param>
/// <returns>The decoded response body, or an empty string when the request or decoding fails.</returns>
private static string GetWeb(string url, string encoding)
{
    string strHtmlContent = "";
    System.IO.Stream mystream = new System.IO.MemoryStream();
    System.Net.HttpWebRequest myrequest = null;
    try
    {
        // Check the address before creating the request, so this message (not a URI
        // parsing error) is what gets logged for an address without HTTP.
        if (url.IndexOf("http", StringComparison.Ordinal) == -1)
        {
            throw new ArgumentException("请提供完整的HTTP地址");
        }

        myrequest = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
        myrequest.Timeout = 20 * 1000; // 20 seconds
        // Ask intermediaries not to serve a cached copy.
        myrequest.Headers.Set("Pragma", "no-cache");

        System.Net.HttpWebResponse myresponse = null;
        if (myrequest.KeepAlive)
        {
            try
            {
                myresponse = (System.Net.HttpWebResponse)myrequest.GetResponse();
                mystream = myresponse.GetResponseStream();
            }
            catch (Exception ex)
            {
                System.Diagnostics.Debug.WriteLine(DateTime.Now + "获取网页内容出错:url:" + url + "\r\n" + ex.Message + " " + (ex.StackTrace == null ? " " : " " + ex.StackTrace));
                return strHtmlContent;
            }
        }

        // Buffer used to hold the complete response body.
        using (System.IO.MemoryStream memoryStream = new System.IO.MemoryStream())
        {
            byte[] buff = new byte[512];
            int len;
            while ((len = mystream.Read(buff, 0, buff.Length)) > 0)
            {
                memoryStream.Write(buff, 0, len);
            }

            memoryStream.Seek(0, System.IO.SeekOrigin.Begin);

            // Detection is only needed when the caller did not name an encoding.
            if (string.IsNullOrEmpty(encoding))
            {
                string detected = null;

                if (memoryStream.Length > 0)
                {
                    byte[] detectBuff = new byte[4096];
                    int detLen;
                    UniversalDetector det = new UniversalDetector(null);

                    while (!det.IsDone() && (detLen = memoryStream.Read(detectBuff, 0, detectBuff.Length)) > 0)
                    {
                        // Feed only the bytes read in this pass.
                        det.HandleData(detectBuff, 0, detLen);
                    }

                    det.DataEnd();
                    detected = det.GetDetectedCharset();

                    // Rewind so the reader below starts from the first byte.
                    memoryStream.Seek(0, System.IO.SeekOrigin.Begin);
                }

                // Fall back to UTF-8; previously an empty name reached Encoding.GetEncoding,
                // threw, and the page was lost.
                encoding = detected ?? "utf-8";
            }

            System.Text.Encoding myencoding = System.Text.Encoding.GetEncoding(encoding);
            using (System.IO.StreamReader mystreamreader = new System.IO.StreamReader(memoryStream, myencoding))
            {
                strHtmlContent = mystreamreader.ReadToEnd();
            }
        }

        if (myresponse != null)
        {
            myresponse.Close();
        }
    }
    catch (Exception ex)
    {
        System.Diagnostics.Debug.WriteLine(DateTime.Now + "获取网页内容出错:url:" + url + "\r\n" + ex.Message + " " + (ex.StackTrace == null ? " " : " " + ex.StackTrace));
    }
    finally
    {
        mystream.Close();
        mystream.Dispose();
        // The request is aborted explicitly to close the connection.
        if (myrequest != null)
        {
            myrequest.Abort();
        }
    }
    return strHtmlContent;
}
/// <summary>
/// Reads a stream to its end, detects its charset and decodes the content.
/// </summary>
/// <param name="stream">Stream to read; when null the outputs keep their defaults.</param>
/// <param name="htmlText">Decoded content, or an empty string when nothing was detected or an error occurred.</param>
/// <param name="enc">Detected encoding, or <see cref="Encoding.Default"/> when nothing was detected or an error occurred.</param>
private void DetectedCharset(Stream stream, out string htmlText, out Encoding enc)
{
    htmlText = "";
    enc = Encoding.Default;

    if (stream == null)
    {
        return;
    }

    try
    {
        using (MemoryStream msTemp = new MemoryStream())
        {
            byte[] buff = new byte[512];
            int len;
            while ((len = stream.Read(buff, 0, buff.Length)) > 0)
            {
                msTemp.Write(buff, 0, len);
            }

            if (msTemp.Length == 0)
            {
                return;
            }

            byte[] pageBytes = msTemp.ToArray();
            msTemp.Seek(0, SeekOrigin.Begin);

            byte[] detectBuff = new byte[4096];
            int detLen;
            UniversalDetector det = new UniversalDetector(null);
            while (!det.IsDone() && (detLen = msTemp.Read(detectBuff, 0, detectBuff.Length)) > 0)
            {
                // Feed only the bytes just read; the rest of the buffer is stale or zero.
                det.HandleData(detectBuff, 0, detLen);
            }
            det.DataEnd();

            string charset = det.GetDetectedCharset();
            if (charset != null)
            {
                Encoding detected = Encoding.GetEncoding(charset);
                // Decoded page content.
                htmlText = detected.GetString(pageBytes);
                // Encoding of the page content.
                enc = detected;
            }
        }
    }
    catch (Exception ex)
    {
        // Still best-effort (callers get the defaults), but the failure is no longer
        // invisible during debugging.
        System.Diagnostics.Debug.WriteLine("DetectedCharset failed: " + ex);
    }
}
/// <summary>
/// Reads a file, detects its charset and returns the decoded text.
/// </summary>
/// <param name="file">File to read.</param>
/// <returns>
/// The decoded content; an empty string when the file is empty or no charset is detected
/// (in the latter case a message is sent through <c>EchoHelper.Echo</c>).
/// </returns>
public static string Read_File(FileInfo file)
{
    string tmp_result = "";

    // Both streams are disposed here; the file handle used to stay open after the call.
    using (Stream mystream = file.OpenRead())
    using (MemoryStream msTemp = new MemoryStream())
    {
        byte[] buff = new byte[512];
        int len;
        while ((len = mystream.Read(buff, 0, buff.Length)) > 0)
        {
            msTemp.Write(buff, 0, len);
        }

        if (msTemp.Length > 0)
        {
            byte[] pageBytes = msTemp.ToArray();
            msTemp.Seek(0, SeekOrigin.Begin);

            byte[] detectBuff = new byte[4096];
            int detLen;
            UniversalDetector det = new UniversalDetector(null);
            while (!det.IsDone() && (detLen = msTemp.Read(detectBuff, 0, detectBuff.Length)) > 0)
            {
                // Feed only the bytes just read; the rest of the buffer is stale or zero.
                det.HandleData(detectBuff, 0, detLen);
            }
            det.DataEnd();

            string charset = det.GetDetectedCharset();
            if (charset != null)
            {
                tmp_result = System.Text.Encoding.GetEncoding(charset).GetString(pageBytes);
            }
            else
            {
                // Detection failed: tell the user to convert the file manually.
                EchoHelper.Echo("编码识别失败,请手工转码为UTF8保存到任务文件夹。文件:" + file.Name.ToLower(), "编码识别", EchoHelper.EchoType.任务信息);
            }
        }
    }

    return tmp_result;
}
/// <summary>
/// Click handler: downloads the page named in <c>UrlBox</c>, detects its charset, and shows
/// the charset in <c>CharSetBox</c> and up to the first 2000 characters of the decoded page
/// in <c>PageBox</c>. The button is disabled while the handler runs.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private async void button_Click(object sender, RoutedEventArgs e)
{
    CharSetBox.Text = "";
    PageBox.Text = "";
    button.IsEnabled = false;
    try
    {
        HttpWebRequest hwr = (HttpWebRequest)HttpWebRequest.Create(UrlBox.Text);
        HttpWebResponse res;
        try
        {
            res = (HttpWebResponse)await hwr.GetResponseAsync();
        }
        catch
        {
            CharSetBox.Text = "网页获取错误!";
            return;
        }

        // The response is disposed on every path, including a non-OK status or a failed read.
        using (res)
        using (MemoryStream msTemp = new MemoryStream())
        {
            if (res.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            using (Stream mystream = res.GetResponseStream())
            {
                // Copy asynchronously so the download does not block the thread running this handler.
                await mystream.CopyToAsync(msTemp);
            }

            if (msTemp.Length == 0)
            {
                return;
            }

            byte[] pageBytes = msTemp.ToArray();
            msTemp.Seek(0, SeekOrigin.Begin);

            byte[] detectBuff = new byte[4096];
            int detLen;
            UniversalDetector det = new UniversalDetector(null);
            while (!det.IsDone() && (detLen = msTemp.Read(detectBuff, 0, detectBuff.Length)) > 0)
            {
                // Feed only the bytes just read; the rest of the buffer is stale or zero.
                det.HandleData(detectBuff, 0, detLen);
            }
            det.DataEnd();

            string charset = det.GetDetectedCharset();
            if (charset != null)
            {
                CharSetBox.Text = "OK! CharSet=" + charset;
                string page = System.Text.Encoding.GetEncoding(charset).GetString(pageBytes);
                if (page.Length > 2000)
                {
                    page = page.Substring(0, 2000);
                }
                PageBox.Text = page;
            }
        }
    }
    catch (Exception ex)
    {
        // The UI stays unchanged on failure, but the error is now visible during debugging.
        System.Diagnostics.Debug.WriteLine("button_Click failed: " + ex);
    }
    finally
    {
        button.IsEnabled = true;
    }
}
/// <summary>
/// Detects the encoding of a byte buffer and decodes it to a string.
/// </summary>
/// <param name="buffer">Bytes to decode; may be null.</param>
/// <returns>
/// The decoded text, or an empty string when the buffer is null or empty, no charset is
/// detected, or <c>Encoding.GetEncoding</c> rejects the detected charset name.
/// </returns>
public string GetString(byte[] buffer)
{
    string result = string.Empty;
    if (buffer == null || buffer.Length == 0)
    {
        return result;
    }

    UniversalDetector det = new UniversalDetector(null);

    using (MemoryStream msTemp = new MemoryStream(buffer))
    {
        byte[] detectBuff = new byte[4096];
        int detLen;
        while (!det.IsDone() && (detLen = msTemp.Read(detectBuff, 0, detectBuff.Length)) > 0)
        {
            // Feed only the bytes just read; passing the full buffer length handed stale
            // or zero bytes to the detector on the final short chunk.
            det.HandleData(detectBuff, 0, detLen);
        }
    }

    det.DataEnd();

    string charset = det.GetDetectedCharset();
    if (charset != null)
    {
        try
        {
            result = System.Text.Encoding.GetEncoding(charset).GetString(buffer);
        }
        catch (ArgumentException)
        {
            // The detected charset name is not accepted by Encoding.GetEncoding; keep the
            // empty result.
        }
    }

    return result;
}