/// <summary>
/// Detects the text encoding of a stream by feeding its bytes to a <c>UniversalDetector</c>.
/// </summary>
/// <param name="stream">
/// Stream to inspect. It is rewound to the beginning for detection and its original
/// position is restored afterwards, so it must support <c>Length</c> and seeking.
/// </param>
/// <returns>
/// The encoding for the detected charset, or <see cref="Encoding.Default"/> when the stream
/// is null, empty, or no charset was detected.
/// </returns>
private static Encoding GetEncoding(Stream stream)
{
    if (stream == null || stream.Length <= 0)
    {
        return Encoding.Default;
    }

    // The detector is fed in 1024-byte chunks until it reports it is done.
    var buffer = new byte[1024];
    var detector = new UniversalDetector(null);
    var originalPosition = stream.Position;
    try
    {
        stream.Seek(0, SeekOrigin.Begin);

        int bytesRead;
        while (!detector.IsDone() && (bytesRead = stream.Read(buffer, 0, buffer.Length)) > 0)
        {
            // Pass only the bytes actually read: the tail of the buffer still holds
            // data from the previous chunk (or zeros) and would skew detection.
            detector.HandleData(buffer, 0, bytesRead);
        }

        detector.DataEnd();
    }
    finally
    {
        // Always hand the stream back at the position the caller left it.
        stream.Seek(originalPosition, SeekOrigin.Begin);
    }

    var charset = detector.GetDetectedCharset();
    if (charset == null)
    {
        return Encoding.Default;
    }

    // These two UCS-4 byte-order variants are mapped to "UTF-32" before the lookup.
    if (charset == Constants.CHARSET_X_ISO_10646_UCS_4_2143 ||
        charset == Constants.CHARSET_X_ISO_10646_UCS_4_3412)
    {
        charset = "UTF-32";
    }

    return Encoding.GetEncoding(charset);
}
/// <summary>
/// Detects the encoding of a seekable stream, reading from its current position,
/// and restores that position before returning.
/// </summary>
/// <param name="seekable_stream">Stream to inspect; must support seeking.</param>
/// <returns>
/// The detected encoding, or <c>null</c> when the detector does not report completion
/// or reports no charset name.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="seekable_stream"/> is null.</exception>
/// <exception cref="NotSupportedException">The stream cannot seek.</exception>
public static Encoding Detect(Stream seekable_stream)
{
    if (seekable_stream == null)
    {
        throw new ArgumentNullException(nameof(seekable_stream));
    }

    if (!seekable_stream.CanSeek)
    {
        throw new NotSupportedException("Detect encoding error: stream can't seek.");
    }

    const int bufferSize = 4096;
    byte[] buffer = new byte[bufferSize];
    UniversalDetector detector = new UniversalDetector(null);
    long originalPosition = seekable_stream.Position;

    try
    {
        int bytesRead;
        while ((bytesRead = seekable_stream.Read(buffer, 0, bufferSize)) > 0 && !detector.IsDone())
        {
            detector.HandleData(buffer, 0, bytesRead);
        }

        detector.DataEnd();
    }
    finally
    {
        // Restore the caller's position even when reading or detection throws.
        seekable_stream.Seek(originalPosition, SeekOrigin.Begin);
    }

    // NOTE(review): only a detector that reports IsDone() yields a result here, as in the
    // original; whether DataEnd() can produce a charset without setting "done" depends on
    // the UniversalDetector implementation - confirm.
    if (!detector.IsDone())
    {
        return null;
    }

    string charset = detector.GetDetectedCharset();
    return charset == null ? null : Encoding.GetEncoding(charset);
}
/// <summary>
/// Determines the encoding of a file by running all of its bytes through a
/// <c>UniversalDetector</c>.
/// </summary>
/// <param name="filePath">Path of the file to read.</param>
/// <returns>
/// The encoding matching the detected charset name, or <c>defaultEncoding</c> when the
/// detector reports no charset name.
/// </returns>
public Encoding ResolveFileEncoding(string filePath)
{
    var detector = new UniversalDetector();

    // The whole file is loaded and handed to the detector in a single call.
    detector.HandleData(File.ReadAllBytes(filePath));
    detector.DataEnd();

    var detectedName = detector.DetectedCharsetName;
    if (detectedName == null)
    {
        return defaultEncoding;
    }

    return Encoding.GetEncoding(detectedName);
}
/// <summary>
/// Runs charset detection over the first <paramref name="DetectLen"/> bytes of a buffer.
/// </summary>
/// <param name="DetectBuff">Buffer holding the bytes to analyse.</param>
/// <param name="DetectLen">Number of bytes, counted from index 0, to analyse.</param>
/// <returns>The detected charset name, or the literal <c>"default"</c> when none is detected.</returns>
private static string DetectEncoding_Bytes(byte[] DetectBuff, int DetectLen)
{
    var detector = new UniversalDetector(null);
    detector.HandleData(DetectBuff, 0, DetectLen);
    detector.DataEnd();

    string charset = detector.GetDetectedCharset();
    return charset ?? "default";
}
/// <summary>
/// Reads a stream to its end, detects the charset of the content and decodes it.
/// Best effort: any failure leaves the out parameters at the values assigned so far.
/// </summary>
/// <param name="stream">Stream holding the encoded text; when null the defaults are returned.</param>
/// <param name="htmlText">Receives the decoded text, or an empty string when nothing was decoded.</param>
/// <param name="enc">Receives the detected encoding, or <see cref="Encoding.Default"/> when none was detected.</param>
private void DetectedCharset(Stream stream, out string htmlText, out Encoding enc)
{
    htmlText = "";
    enc = Encoding.Default;

    if (stream == null)
    {
        return;
    }

    try
    {
        using (MemoryStream buffered = new MemoryStream())
        {
            // Copy the whole source into memory so it can be both analysed and decoded.
            byte[] copyBuffer = new byte[512];
            int copied;
            while ((copied = stream.Read(copyBuffer, 0, copyBuffer.Length)) > 0)
            {
                buffered.Write(copyBuffer, 0, copied);
            }

            if (buffered.Length == 0)
            {
                return;
            }

            byte[] pageBytes = buffered.ToArray();
            buffered.Seek(0, SeekOrigin.Begin);

            byte[] detectBuffer = new byte[4096];
            UniversalDetector detector = new UniversalDetector(null);
            int detectLen;
            while ((detectLen = buffered.Read(detectBuffer, 0, detectBuffer.Length)) > 0 && !detector.IsDone())
            {
                // Feed only the bytes read in this pass, not the whole 4 KB buffer.
                detector.HandleData(detectBuffer, 0, detectLen);
            }

            detector.DataEnd();

            string charset = detector.GetDetectedCharset();
            if (charset != null)
            {
                // Encoding of the page content.
                enc = Encoding.GetEncoding(charset);
                // Decoded page content.
                htmlText = enc.GetString(pageBytes);
            }
        }
    }
    catch (System.Exception ex)
    {
        // Still best effort, but leave a trace instead of swallowing the failure silently.
        System.Diagnostics.Debug.WriteLine(ex);
    }
}
/// <summary>
/// Identifies the character set of a text file.
/// </summary>
/// <param name="filename">Path of the file to analyse.</param>
/// <returns>
/// The charset name reported by the detector, or <c>null</c> when none is detected or
/// any exception occurs while reading the file.
/// </returns>
public static string GetCharSet(string filename)
{
    try
    {
        // Dispose the stream and reader deterministically so the file handle is released
        // as soon as the bytes have been read.
        using (FileStream fs = new FileStream(filename, FileMode.Open, FileAccess.Read))
        using (BinaryReader reader = new BinaryReader(fs))
        {
            byte[] content = reader.ReadBytes((int)fs.Length);

            UniversalDetector detector = new UniversalDetector(null);
            detector.HandleData(content, 0, content.Length);
            detector.DataEnd();
            return detector.GetDetectedCharset();
        }
    }
    catch
    {
        // Deliberate best effort: every failure is reported to the caller as "unknown".
        return null;
    }
}
/// <summary>
/// Detects the encoding using the UniversalCharDet algorithm.
/// </summary>
/// <param name="bytes">
/// Not read by this method; the data is taken from <c>this.MemoryStream</c>.
/// NOTE(review): confirm whether detection was meant to run over this argument instead.
/// </param>
/// <returns>The detected encoding, or <c>null</c> when no charset is detected.</returns>
private Encoding GetEncodingByUniversalCharDet(byte[] bytes)
{
    var detector = new UniversalDetector(null);
    var detectBuffer = new byte[4096];

    // NOTE(review): reading starts at the stream's current position and the position is
    // not restored afterwards - confirm callers expect that.
    int bytesRead;
    while ((bytesRead = this.MemoryStream.Read(detectBuffer, 0, detectBuffer.Length)) > 0 && !detector.IsDone())
    {
        // Feed only the bytes read in this pass; the rest of the buffer is stale.
        detector.HandleData(detectBuffer, 0, bytesRead);
    }

    detector.DataEnd();

    string charset = detector.GetDetectedCharset();
    if (string.IsNullOrEmpty(charset))
    {
        return null;
    }

    return Encoding.GetEncoding(charset);
}
/// <summary>
/// Reads a stream to its end while running charset detection on the data, then decodes
/// the captured bytes to a string.
/// </summary>
/// <param name="stream">Source stream, read from its current position.</param>
/// <param name="default_encoding">
/// Encoding used when the detector did not finish; when null, the class-level
/// <c>Default</c> is used.
/// </param>
/// <returns>The decoded text.</returns>
public static string DetectAndReadToEnd(Stream stream, Encoding default_encoding)
{
    const int ChunkSize = 4096;

    var captured = new MemoryStream();
    var chunk = new byte[ChunkSize];
    var detector = new UniversalDetector(null);
    bool charsetKnown = false;

    for (int read = stream.Read(chunk, 0, ChunkSize); read > 0; read = stream.Read(chunk, 0, ChunkSize))
    {
        captured.Write(chunk, 0, read);

        // Once the detector has finished, the remaining data is only captured.
        if (charsetKnown)
        {
            continue;
        }

        detector.HandleData(chunk, 0, read);
        charsetKnown = detector.IsDone();
    }

    detector.DataEnd();

    Encoding chosen;
    if (charsetKnown)
    {
        chosen = Encoding.GetEncoding(detector.GetDetectedCharset());
    }
    else if (default_encoding != null)
    {
        chosen = default_encoding;
    }
    else
    {
        chosen = Default;
    }

    captured.Seek(0, SeekOrigin.Begin);
    using (var reader = new StreamReader(captured, chosen))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Reads a file, detects its charset and returns the decoded text.
/// </summary>
/// <param name="file">File to read.</param>
/// <returns>
/// The decoded content, or an empty string when the file is empty or no charset could be
/// detected (in the latter case a message is emitted through <c>EchoHelper.Echo</c>).
/// </returns>
public static string Read_File(FileInfo file)
{
    string result = "";

    // Both streams are disposed deterministically so the file handle is not left open.
    using (Stream input = file.OpenRead())
    using (MemoryStream buffered = new MemoryStream())
    {
        byte[] copyBuffer = new byte[512];
        int copied;
        while ((copied = input.Read(copyBuffer, 0, copyBuffer.Length)) > 0)
        {
            buffered.Write(copyBuffer, 0, copied);
        }

        if (buffered.Length > 0)
        {
            byte[] pageBytes = buffered.ToArray();
            buffered.Seek(0, SeekOrigin.Begin);

            byte[] detectBuffer = new byte[4096];
            UniversalDetector detector = new UniversalDetector(null);
            int detectLen;
            while ((detectLen = buffered.Read(detectBuffer, 0, detectBuffer.Length)) > 0 && !detector.IsDone())
            {
                // Feed only the bytes read in this pass, not the whole 4 KB buffer.
                detector.HandleData(detectBuffer, 0, detectLen);
            }

            detector.DataEnd();

            string charset = detector.GetDetectedCharset();
            if (charset != null)
            {
                result = System.Text.Encoding.GetEncoding(charset).GetString(pageBytes);
            }
            else
            {
                EchoHelper.Echo("编码识别失败,请手工转码为UTF8保存到任务文件夹。文件:" + file.Name.ToLower(), "编码识别", EchoHelper.EchoType.任务信息);
            }
        }
    }

    return result;
}
/// <summary>
/// Decodes the bytes of a resource to text, using the charset reported by a
/// <c>UniversalDetector</c> and falling back to UTF-8 (code page 65001).
/// </summary>
/// <param name="resource">Resource whose data is obtained through <c>KFN.GetDataFromResource</c>.</param>
/// <returns>The decoded text.</returns>
private string GetResourceText(KFN.ResourceFile resource)
{
    const int Utf8CodePage = 65001;

    byte[] raw = KFN.GetDataFromResource(resource);

    UniversalDetector detector = new UniversalDetector(null);
    detector.HandleData(raw, 0, raw.Length);
    detector.DataEnd();
    string charsetName = detector.GetDetectedCharset();

    // A remap of KOI8-R / X-MAC-CYRILLIC to WINDOWS-1251 exists only as commented-out
    // code in the original and remains disabled here.
    bool usable = charsetName != null && charsetName != "Not supported";
    int codePage = usable ? Encoding.GetEncoding(charsetName).CodePage : Utf8CodePage;

    char[] decoded = Encoding.GetEncoding(codePage).GetChars(raw);
    return new string(decoded);
}
/// <summary>
/// Detects the charset of a byte buffer and decodes it to a string.
/// </summary>
/// <param name="buffer">Bytes to decode.</param>
/// <returns>
/// The decoded text, or an empty string when the buffer is null or empty, no charset is
/// detected, or the detected charset name is rejected by <c>Encoding.GetEncoding</c>.
/// </returns>
public string GetString(byte[] buffer)
{
    if (buffer == null || buffer.Length == 0)
    {
        return string.Empty;
    }

    UniversalDetector detector = new UniversalDetector(null);
    using (MemoryStream source = new MemoryStream(buffer))
    {
        byte[] detectBuffer = new byte[4096];
        int detectLen;
        while ((detectLen = source.Read(detectBuffer, 0, detectBuffer.Length)) > 0 && !detector.IsDone())
        {
            // Feed only the bytes read in this pass; the rest of the buffer is stale or zero.
            detector.HandleData(detectBuffer, 0, detectLen);
        }
    }

    detector.DataEnd();

    string charset = detector.GetDetectedCharset();
    if (charset == null)
    {
        return string.Empty;
    }

    try
    {
        return System.Text.Encoding.GetEncoding(charset).GetString(buffer);
    }
    catch (ArgumentException)
    {
        // The detector produced a name this runtime has no encoding for.
        return string.Empty;
    }
}
/// <summary>
/// Opens a viewer window for the resource selected in <c>resourcesView</c>: a text
/// viewer for "Text" resources and an image viewer for "Image" resources.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
public void ViewResourceButtonClick(object sender, RoutedEventArgs e)
{
    KFN.ResourceFile resource = resourcesView.SelectedItem as KFN.ResourceFile;
    if (resource == null)
    {
        // Nothing selected, or the selected item is not a resource: nothing to show.
        return;
    }

    if (resource.FileType == "Text")
    {
        byte[] data = KFN.GetDataFromResource(resource);

        // Default to UTF-8 (code page 65001) unless the detector reports a usable charset.
        int detEncoding = 65001;
        UniversalDetector Det = new UniversalDetector(null);
        Det.HandleData(data, 0, data.Length);
        Det.DataEnd();
        string enc = Det.GetDetectedCharset();
        if (enc != null && enc != "Not supported")
        {
            // A remap of KOI8-R / X-MAC-CYRILLIC to WINDOWS-1251 exists only as
            // commented-out code in the original and remains disabled here.
            Encoding denc = Encoding.GetEncoding(enc);
            detEncoding = denc.CodePage;
        }

        string text = new string(Encoding.GetEncoding(detEncoding).GetChars(data));
        string encodingDisplayName = Encoding.GetEncodings()
            .First(en => en.CodePage == detEncoding)
            .DisplayName;

        Window viewWindow = new ViewWindow(resource.FileName, text, encodingDisplayName);
        viewWindow.Show();
    }
    else if (resource.FileType == "Image")
    {
        byte[] data = KFN.GetDataFromResource(resource);
        Window viewWindow = new ImageWindow(resource.FileName, data);
        viewWindow.Show();
    }
}
/// <summary>
/// Returns the text encoding of a file.
/// </summary>
/// <param name="streamName">Path of the file to analyse.</param>
/// <returns>
/// The encoding for the detected charset, or <see cref="Encoding.Default"/> when the file
/// is empty or no charset is detected.
/// </returns>
private static Encoding getEncoding(string streamName)
{
    UniversalDetector detector = new UniversalDetector(null);

    // Open read-only: detection never writes, and the default access for this
    // constructor overload (read/write) fails on read-only files.
    using (Stream stream = new FileStream(streamName, FileMode.Open, FileAccess.Read))
    {
        byte[] detectBuffer = new byte[4096];
        int detectLen;
        while ((detectLen = stream.Read(detectBuffer, 0, detectBuffer.Length)) > 0 && !detector.IsDone())
        {
            // Feed only the bytes read in this pass, not the whole buffer.
            detector.HandleData(detectBuffer, 0, detectLen);
        }
    }

    detector.DataEnd();

    string charset = detector.GetDetectedCharset();
    return charset != null ? Encoding.GetEncoding(charset) : Encoding.Default;
}
/// <summary>
/// Detects the charset of a file and, when it is detected and is not "UTF-8",
/// rewrites the file in place as UTF-8. Progress is reported on the console.
/// </summary>
/// <param name="file">Path of the file to convert.</param>
static void convert(string file)
{
    string decoded = null;

    using (BinaryReader reader = new BinaryReader(File.OpenRead(file)))
    {
        int byteCount = (int)reader.BaseStream.Length;
        byte[] content = reader.ReadBytes(byteCount);

        UniversalDetector detector = new UniversalDetector(null);
        detector.HandleData(content, 0, byteCount);
        detector.DataEnd();

        string charset = detector.GetDetectedCharset();
        if (!string.IsNullOrEmpty(charset))
        {
            Console.WriteLine("Detected: {0} - {1}", file, charset);

            // Files already reported as UTF-8 are left untouched.
            if (charset != "UTF-8")
            {
                decoded = Encoding.GetEncoding(charset).GetString(content);
            }
        }
        else
        {
            Console.WriteLine("Warning: {0} not detected", file);
        }
    }

    if (string.IsNullOrEmpty(decoded))
    {
        return;
    }

    using (StreamWriter writer = new StreamWriter(File.Open(file, FileMode.Create), Encoding.UTF8))
    {
        writer.Write(decoded);
    }
}
/// <summary>
/// Detects the charset of a file and prints the charset and encoding name to the
/// console. Nothing is printed for an empty file.
/// </summary>
/// <param name="filePath">Path of the file to analyse.</param>
static void ProcessFile(String filePath)
{
    var universalDetector = new UniversalDetector(null);

    // Read-only access is sufficient, and "using" releases the handle even on failure.
    using (var fileStream = new FileStream(filePath, FileMode.Open, FileAccess.Read))
    {
        if (fileStream.Length <= 0)
        {
            return;
        }

        var detectionBuffer = new Byte[4096];
        int detectionLength;
        while ((detectionLength = fileStream.Read(detectionBuffer, 0, detectionBuffer.Length)) > 0 && !universalDetector.IsDone())
        {
            // Feed only the bytes read in this pass, not the whole buffer.
            universalDetector.HandleData(detectionBuffer, 0, detectionLength);
        }
    }

    universalDetector.DataEnd();

    var charset = universalDetector.GetDetectedCharset();
    if (charset != null)
    {
        Console.WriteLine("Charset: " + charset + ". Encoding: " + System.Text.Encoding.GetEncoding(charset).EncodingName);
        Console.WriteLine();
    }
    else
    {
        // No charset detected: report ASCII, printing the encoding's name like the branch
        // above rather than the Encoding object's type name.
        Console.WriteLine("Charset: " + "ASCII" + ". Encoding: " + System.Text.Encoding.GetEncoding("ASCII").EncodingName);
        Console.WriteLine();
    }
}
/// <summary>
/// Parses the header of the KFN file at <c>this.fullFileName</c>: checks the "KFNB"
/// signature, reads the property blocks into <c>properties</c> / <c>unknownProperties</c>,
/// then reads the resource directory into <c>resources</c>. On a format problem,
/// <c>this.error</c> is set and the method returns early.
/// </summary>
/// <param name="filesEncoding">
/// Code page used to decode resource names. When 0, <c>resourceNamesEncodingAuto</c> is
/// used, and is first auto-detected from a resource name while it still equals 20127.
/// </param>
public void ReadFile(int filesEncoding = 0)
{
    // Reset all state left over from a previous parse.
    this.error = null;
    this.properties.Clear();
    this.unknownProperties.Clear();
    this.resources.Clear();

    using (FileStream fs = new FileStream(this.fullFileName, FileMode.Open, FileAccess.Read))
    {
        // The file must start with the 4-byte signature "KFNB".
        // NOTE(review): the return value of fs.Read is ignored throughout this method, so
        // a truncated file leaves zero bytes in the buffers instead of failing - confirm
        // that this is acceptable.
        byte[] signature = new byte[4];
        fs.Read(signature, 0, signature.Length);
        string sign = new string(Encoding.UTF8.GetChars(signature));
        if (sign != "KFNB")
        {
            this.error = "Invalid KFN signature!";
            return;
        }

        // Each property block starts with 5 bytes: a 4-character name and a type byte.
        byte[] prop = new byte[5];
        byte[] propValue = new byte[4];
        // Upper bound on the number of property blocks that will be read.
        int maxProps = 40;
        while (maxProps > 0)
        {
            fs.Read(prop, 0, prop.Length);
            string propName = new string(Encoding.UTF8.GetChars(new ArraySegment <byte>(prop, 0, 4).ToArray()));
            if (propName == "ENDH")
            {
                // End-of-header marker: skip 4 more bytes and stop reading properties.
                fs.Position += 4;
                break;
            }
            string SpropName = this.GetPropDesc(propName);
            if (prop[4] == 1)
            {
                // Type 1: the value is a 4-byte unsigned integer.
                fs.Read(propValue, 0, propValue.Length);
                if (SpropName == "Genre" && BitConverter.ToUInt32(propValue, 0) == 0xffffffff)
                {
                    this.properties.Add(SpropName, "Not set");
                }
                else
                {
                    if (SpropName.Contains("unknown"))
                    {
                        this.unknownProperties.Add(SpropName + ": " + BitConverter.ToUInt32(propValue, 0));
                    }
                    // Only stored when GetPropDesc returned something other than the raw name.
                    if (propName != SpropName)
                    {
                        this.properties.Add(SpropName, BitConverter.ToUInt32(propValue, 0).ToString());
                    }
                }
            }
            else if (prop[4] == 2)
            {
                // Type 2: a 4-byte length followed by that many bytes of data.
                // NOTE(review): the length comes straight from the file and is used to size
                // the allocation without validation - confirm inputs are trusted.
                fs.Read(propValue, 0, propValue.Length);
                byte[] value = new byte[BitConverter.ToUInt32(propValue, 0)];
                fs.Read(value, 0, value.Length);
                if (SpropName == "AES-ECB-128 Key")
                {
                    // A key whose bytes sum to zero is reported as absent; otherwise the
                    // bytes are shown as upper-case hex.
                    string val = (value.Select(b => (int)b).Sum() == 0)
                        ? "Not present"
                        : value.Select(b => b.ToString("X2")).Aggregate((s1, s2) => s1 + s2);
                    this.properties.Add(SpropName, val);
                }
                else
                {
                    // Other type-2 values are decoded as UTF-8 text.
                    if (SpropName.Contains("unknown"))
                    {
                        this.unknownProperties.Add(SpropName + ": " + new string(Encoding.UTF8.GetChars(value)));
                    }
                    if (propName != SpropName)
                    {
                        this.properties.Add(SpropName, new string(Encoding.UTF8.GetChars(value)));
                    }
                }
            }
            else
            {
                this.error = "unknown property block type - " + prop[4];
                return;
            }
            maxProps--;
        }
        this.endOfPropsOffset = fs.Position;

        // Resource directory: a 4-byte entry count followed by the entries.
        byte[] numOfResources = new byte[4];
        fs.Read(numOfResources, 0, numOfResources.Length);
        int resourcesCount = BitConverter.ToInt32(numOfResources, 0);
        while (resourcesCount > 0)
        {
            byte[] resourceNameLenght = new byte[4];
            byte[] resourceType = new byte[4];
            byte[] resourceLenght = new byte[4];
            byte[] resourceEncryptedLenght = new byte[4];
            byte[] resourceOffset = new byte[4];
            byte[] resourceEncrypted = new byte[4];

            // Entry layout, in read order: name length, name bytes, type, length, offset,
            // encrypted length, encrypted flag.
            fs.Read(resourceNameLenght, 0, resourceNameLenght.Length);
            byte[] resourceName = new byte[BitConverter.ToUInt32(resourceNameLenght, 0)];
            fs.Read(resourceName, 0, resourceName.Length);
            fs.Read(resourceType, 0, resourceType.Length);
            fs.Read(resourceLenght, 0, resourceLenght.Length);
            fs.Read(resourceOffset, 0, resourceOffset.Length);
            fs.Read(resourceEncryptedLenght, 0, resourceEncryptedLenght.Length);
            fs.Read(resourceEncrypted, 0, resourceEncrypted.Length);
            int encrypted = BitConverter.ToInt32(resourceEncrypted, 0);

            // Auto-detect the name encoding only when the caller did not force one and the
            // auto value still equals 20127.
            // NOTE(review): 20127 is presumably the US-ASCII code page used as the
            // "not yet detected" value - confirm against the field's initialiser.
            if (filesEncoding == 0 && resourceNamesEncodingAuto == 20127)
            {
                UniversalDetector Det = new UniversalDetector(null);
                Det.HandleData(resourceName, 0, resourceName.Length);
                Det.DataEnd();
                string enc = Det.GetDetectedCharset();
                if (enc != null && enc != "Not supported")
                {
                    // Treat KOI8-R and X-MAC-CYRILLIC detections as WINDOWS-1251
                    // (fix for 1251 upper-case and MAC detections).
                    if (enc == "KOI8-R" || enc == "X-MAC-CYRILLIC")
                    {
                        enc = "WINDOWS-1251";
                    }
                    Encoding denc = Encoding.GetEncoding(enc);
                    resourceNamesEncodingAuto = denc.CodePage;
                    this.autoDetectEncoding = denc.CodePage + ": " + denc.EncodingName;
                }
                else if (enc == null)
                {
                    // Nothing detected: keep the current auto code page and report it.
                    Encoding denc = Encoding.GetEncoding(resourceNamesEncodingAuto);
                    this.autoDetectEncoding = denc.CodePage + ": " + denc.EncodingName;
                }
                else
                {
                    // The detector answered "Not supported".
                    this.autoDetectEncoding = "No supported: use " + Encoding.GetEncoding(resourceNamesEncodingAuto).EncodingName;
                }
            }

            // An explicit caller-supplied code page wins over the auto-detected one.
            int useEncoding = (filesEncoding != 0) ? filesEncoding : resourceNamesEncodingAuto;
            string fName = new string(Encoding.GetEncoding(useEncoding).GetChars(resourceName));

            this.resources.Add(new KFN.ResourceFile(
                this.GetFileType(resourceType),
                fName,
                BitConverter.ToInt32(resourceEncryptedLenght, 0),
                BitConverter.ToInt32(resourceLenght, 0),
                BitConverter.ToInt32(resourceOffset, 0),
                (encrypted == 0) ? false : true,
                (fName == this.GetAudioSourceName()) ? true : false
            ));

            resourcesCount--;
        }
        this.endOfHeaderOffset = fs.Position;
    }
}
/// <summary>Gets the character encoding of a file.</summary>
/// <param name="File">The absolute path to a file.</param>
/// <returns>
/// The character encoding identified from a byte-order mark or from charset detection, or
/// <c>Encoding.Unknown</c> when the file does not exist, cannot be read, or the detected
/// charset is not one of the handled names.
/// </returns>
internal static Encoding GetEncodingFromFile(string File)
{
    if (File == null || !System.IO.File.Exists(File))
    {
        return Encoding.Unknown;
    }

    try
    {
        System.IO.FileInfo fInfo = new FileInfo(File);
        byte[] Data = System.IO.File.ReadAllBytes(File);

        // Check the 4-byte BOMs first: the UTF-32 LE mark (FF FE 00 00) begins with the
        // UTF-16 LE mark (FF FE), so testing UTF-16 first would make UTF-32 LE unreachable.
        if (Data.Length >= 4)
        {
            if (Data[0] == 0x00 && Data[1] == 0x00 && Data[2] == 0xFE && Data[3] == 0xFF)
            {
                return Encoding.Utf32Be;
            }
            if (Data[0] == 0xFF && Data[1] == 0xFE && Data[2] == 0x00 && Data[3] == 0x00)
            {
                return Encoding.Utf32Le;
            }
        }

        if (Data.Length >= 3)
        {
            if (Data[0] == 0xEF && Data[1] == 0xBB && Data[2] == 0xBF)
            {
                return Encoding.Utf8;
            }
            if (Data[0] == 0x2b && Data[1] == 0x2f && Data[2] == 0x76)
            {
                return Encoding.Utf7;
            }
        }

        if (Data.Length >= 2)
        {
            if (Data[0] == 0xFE && Data[1] == 0xFF)
            {
                return Encoding.Utf16Be;
            }
            if (Data[0] == 0xFF && Data[1] == 0xFE)
            {
                return Encoding.Utf16Le;
            }
        }

        // No BOM: fall back to statistical charset detection over the whole file.
        UniversalDetector Det = new UniversalDetector(null);
        Det.HandleData(Data, 0, Data.Length);
        Det.DataEnd();
        switch (Det.GetDetectedCharset())
        {
            case "SHIFT_JIS":
                return Encoding.Shift_JIS;
            case "UTF-8":
                return Encoding.Utf8;
            case "UTF-7":
                return Encoding.Utf7;
            case "WINDOWS-1252":
                return Encoding.Windows1252;
            case "BIG5":
                if (Path.GetFileName(File).ToLowerInvariant() == "stoklosy.b3d" && fInfo.Length == 18256)
                {
                    // Polish Warsaw metro object file uses diacritics in filenames
                    return Encoding.Windows1252;
                }
                return Encoding.Big5;
            case "EUC-KR":
                return Encoding.EUC_KR;
        }

        return Encoding.Unknown;
    }
    catch
    {
        // Deliberate best effort: any I/O or detection failure is reported as unknown.
        return Encoding.Unknown;
    }
}
/// <summary>
/// Downloads the page at the URL in <c>UrlBox</c>, detects its charset and shows the
/// charset in <c>CharSetBox</c> and up to the first 2000 decoded characters in
/// <c>PageBox</c>. The button is disabled while the request is running.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private async void button_Click(object sender, RoutedEventArgs e)
{
    CharSetBox.Text = "";
    PageBox.Text = "";
    button.IsEnabled = false;
    try
    {
        HttpWebRequest hwr = (HttpWebRequest)HttpWebRequest.Create(UrlBox.Text);
        HttpWebResponse res;
        try
        {
            res = (HttpWebResponse)await hwr.GetResponseAsync();
        }
        catch
        {
            CharSetBox.Text = "网页获取错误!";
            return;
        }

        byte[] pageBytes;
        // Dispose the response on every path, including non-OK status codes.
        using (res)
        {
            if (res.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            using (Stream body = res.GetResponseStream())
            using (MemoryStream collected = new MemoryStream())
            {
                // Copy asynchronously so the download does not block the UI thread.
                await body.CopyToAsync(collected);
                pageBytes = collected.ToArray();
            }
        }

        if (pageBytes.Length == 0)
        {
            return;
        }

        UniversalDetector Det = new UniversalDetector(null);
        using (MemoryStream detectSource = new MemoryStream(pageBytes))
        {
            byte[] detectBuffer = new byte[4096];
            int detectLen;
            while ((detectLen = detectSource.Read(detectBuffer, 0, detectBuffer.Length)) > 0 && !Det.IsDone())
            {
                // Feed only the bytes read in this pass, not the whole 4 KB buffer.
                Det.HandleData(detectBuffer, 0, detectLen);
            }
        }

        Det.DataEnd();

        string charset = Det.GetDetectedCharset();
        if (charset != null)
        {
            CharSetBox.Text = "OK! CharSet=" + charset;
            string page = System.Text.Encoding.GetEncoding(charset).GetString(pageBytes);
            if (page.Length > 2000)
            {
                page = page.Substring(0, 2000);
            }
            PageBox.Text = page;
        }
    }
    catch (System.Exception ex)
    {
        // The handler stays best effort, but the failure is traced instead of vanishing.
        System.Diagnostics.Debug.WriteLine(ex);
    }
    finally
    {
        button.IsEnabled = true;
    }
}
/// <summary>Gets the character encoding of a file.</summary>
/// <param name="File">The absolute path to a file.</param>
/// <returns>
/// The character encoding identified from a byte-order mark or from charset detection, or
/// <c>Encoding.Unknown</c> when the file cannot be read or the detected charset is not one
/// of the handled names.
/// </returns>
internal static Encoding GetEncodingFromFile(string File)
{
    try
    {
        byte[] Data = System.IO.File.ReadAllBytes(File);

        // Check the 4-byte BOMs first: the UTF-32 LE mark (FF FE 00 00) begins with the
        // UTF-16 LE mark (FF FE), so testing UTF-16 first would make UTF-32 LE unreachable.
        if (Data.Length >= 4)
        {
            if (Data[0] == 0x00 && Data[1] == 0x00 && Data[2] == 0xFE && Data[3] == 0xFF)
            {
                return Encoding.Utf32Be;
            }
            if (Data[0] == 0xFF && Data[1] == 0xFE && Data[2] == 0x00 && Data[3] == 0x00)
            {
                return Encoding.Utf32Le;
            }
        }

        if (Data.Length >= 3)
        {
            if (Data[0] == 0xEF && Data[1] == 0xBB && Data[2] == 0xBF)
            {
                return Encoding.Utf8;
            }
            if (Data[0] == 0x2b && Data[1] == 0x2f && Data[2] == 0x76)
            {
                return Encoding.Utf7;
            }
        }

        if (Data.Length >= 2)
        {
            if (Data[0] == 0xFE && Data[1] == 0xFF)
            {
                return Encoding.Utf16Be;
            }
            if (Data[0] == 0xFF && Data[1] == 0xFE)
            {
                return Encoding.Utf16Le;
            }
        }

        // No BOM: fall back to statistical charset detection over the whole file.
        UniversalDetector Det = new UniversalDetector(null);
        Det.HandleData(Data, 0, Data.Length);
        Det.DataEnd();
        switch (Det.GetDetectedCharset())
        {
            case "SHIFT_JIS":
                return Encoding.Shift_JIS;
            case "UTF-8":
                return Encoding.Utf8;
            case "UTF-7":
                return Encoding.Utf7;
            case "WINDOWS-1252":
                return Encoding.Windows1252;
            case "BIG5":
                return Encoding.Big5;
        }

        return Encoding.Unknown;
    }
    catch
    {
        // Deliberate best effort: any I/O or detection failure is reported as unknown.
        return Encoding.Unknown;
    }
}
/// <summary>
/// Downloads the page at the URL in <c>UrlBox</c>, detects its charset and shows the
/// charset in <c>CharSetBox</c> and the decoded page in <c>PageBox</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Check_Click(object sender, EventArgs e)
{
    CharSetBox.Text = "";
    PageBox.Text = "";

    HttpWebRequest hwr = (HttpWebRequest)HttpWebRequest.Create(UrlBox.Text);
    HttpWebResponse res;
    try
    {
        res = (HttpWebResponse)hwr.GetResponse();
    }
    catch
    {
        CharSetBox.Text = "网页获取错误!";
        return;
    }

    byte[] pageBytes;
    // Release the response on every path, including non-OK status codes.
    using (res)
    {
        if (res.StatusCode != HttpStatusCode.OK)
        {
            return;
        }

        using (Stream body = res.GetResponseStream())
        using (MemoryStream collected = new MemoryStream())
        {
            byte[] copyBuffer = new byte[512];
            int copied;
            while ((copied = body.Read(copyBuffer, 0, copyBuffer.Length)) > 0)
            {
                collected.Write(copyBuffer, 0, copied);
            }

            pageBytes = collected.ToArray();
        }
    }

    if (pageBytes.Length == 0)
    {
        return;
    }

    // The whole page is handed to the detector in one call.
    UniversalDetector Det = new UniversalDetector(null);
    Det.HandleData(pageBytes, 0, pageBytes.Length);
    Det.DataEnd();

    string charset = Det.GetDetectedCharset();
    if (charset != null)
    {
        CharSetBox.Text = "OK! CharSet=" + charset;
        PageBox.Text = System.Text.Encoding.GetEncoding(charset).GetString(pageBytes);
    }
}
/// <summary>
/// Downloads a web page with HttpWebRequest and decodes it. The charset is detected
/// automatically when none is given; if no charset can be found, UTF-8 is used.
/// </summary>
/// <param name="url">Address to fetch; must contain "http".</param>
/// <param name="encoding">
/// Name of the encoding to decode with. When null or empty, the detected charset is used.
/// </param>
/// <returns>The decoded page content.</returns>
private static string GetWeb(string url, string encoding)
{
    // Reject addresses without an HTTP scheme.
    if (url.IndexOf("http") == -1)
    {
        throw new Exception("请提供完整的HTTP地址");
    }

    System.Net.HttpWebRequest myrequest = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
    myrequest.Timeout = 600000; // 10 minutes
    // Ask intermediaries not to serve a cached copy.
    myrequest.Headers.Set("Pragma", "no-cache");

    // The response, its stream and the in-memory copy are all released on every path.
    using (System.Net.HttpWebResponse myresponse = (System.Net.HttpWebResponse)myrequest.GetResponse())
    using (System.IO.Stream mystream = myresponse.GetResponseStream())
    using (System.IO.MemoryStream memoryStream = new System.IO.MemoryStream())
    {
        // Buffer the whole body so it can be analysed first and decoded afterwards.
        byte[] buff = new byte[512];
        int len;
        while ((len = mystream.Read(buff, 0, buff.Length)) > 0)
        {
            memoryStream.Write(buff, 0, len);
        }

        if (memoryStream.Length > 0)
        {
            memoryStream.Seek(0, System.IO.SeekOrigin.Begin);

            // Detection works on 4 KB chunks and stops once the detector is done.
            byte[] DetectBuff = new byte[4096];
            UniversalDetector Det = new UniversalDetector(null);
            int DetLen;
            while ((DetLen = memoryStream.Read(DetectBuff, 0, DetectBuff.Length)) > 0 && !Det.IsDone())
            {
                // Feed only the bytes read in this pass, not the whole buffer.
                Det.HandleData(DetectBuff, 0, DetLen);
            }

            Det.DataEnd();

            string detected = Det.GetDetectedCharset();
            if (detected != null && string.IsNullOrEmpty(encoding))
            {
                encoding = detected;
            }

            // Rewind for decoding.
            memoryStream.Seek(0, System.IO.SeekOrigin.Begin);
        }

        System.Text.Encoding myencoding;
        try
        {
            // Fall back to UTF-8 when no charset is known, as the summary promises.
            myencoding = string.IsNullOrEmpty(encoding)
                ? System.Text.Encoding.UTF8
                : System.Text.Encoding.GetEncoding(encoding);
        }
        catch (ArgumentException)
        {
            // The named charset is not available on this runtime.
            myencoding = System.Text.Encoding.UTF8;
        }

        using (System.IO.StreamReader mystreamreader = new System.IO.StreamReader(memoryStream, myencoding))
        {
            return mystreamreader.ReadToEnd();
        }
    }
}
/// <summary>
/// Downloads a web page with HttpWebRequest and decodes it. The charset is detected
/// automatically when none is given; if no charset can be found, UTF-8 is used.
/// Failures are written to the debug output and an empty string is returned.
/// </summary>
/// <param name="url">Address to fetch; must contain "http".</param>
/// <param name="encoding">
/// Name of the encoding to decode with. When null or empty, the detected charset is used.
/// </param>
/// <returns>The decoded page content, or an empty string on failure.</returns>
private static string GetWeb(string url, string encoding)
{
    string strHtmlContent = "";
    System.IO.Stream mystream = new System.IO.MemoryStream();
    System.Net.HttpWebRequest myrequest = null;
    // Declared outside the try block so the finally block can always release it.
    System.Net.HttpWebResponse myresponse = null;
    try
    {
        myrequest = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);

        // Reject addresses without an HTTP scheme.
        if (url.IndexOf("http") == -1)
        {
            throw new Exception("请提供完整的HTTP地址");
        }

        myrequest.Timeout = 20 * 1000; // 20 seconds
        // Ask intermediaries not to serve a cached copy.
        myrequest.Headers.Set("Pragma", "no-cache");

        // NOTE(review): the response is only requested when KeepAlive is true; otherwise
        // the empty placeholder stream is read - confirm this condition is intended.
        if (myrequest.KeepAlive)
        {
            try
            {
                myresponse = (System.Net.HttpWebResponse)myrequest.GetResponse();
                mystream = myresponse.GetResponseStream();
            }
            catch (Exception ex)
            {
                System.Diagnostics.Debug.WriteLine(DateTime.Now + "获取网页内容出错:url:" + url + "\r\n" + ex.Message + " " + (ex.StackTrace == null ? " " : " " + ex.StackTrace));
                return strHtmlContent;
            }
        }

        using (System.IO.MemoryStream memoryStream = new System.IO.MemoryStream())
        {
            // Buffer the whole body so it can be analysed first and decoded afterwards.
            byte[] buff = new byte[512];
            int len;
            while ((len = mystream.Read(buff, 0, buff.Length)) > 0)
            {
                memoryStream.Write(buff, 0, len);
            }

            if (memoryStream.Length > 0)
            {
                memoryStream.Seek(0, System.IO.SeekOrigin.Begin);

                // Detection works on 4 KB chunks and stops once the detector is done.
                byte[] DetectBuff = new byte[4096];
                UniversalDetector Det = new UniversalDetector(null);
                int DetLen;
                while ((DetLen = memoryStream.Read(DetectBuff, 0, DetectBuff.Length)) > 0 && !Det.IsDone())
                {
                    // Feed only the bytes read in this pass, not the whole buffer.
                    Det.HandleData(DetectBuff, 0, DetLen);
                }

                Det.DataEnd();

                string detected = Det.GetDetectedCharset();
                if (detected != null && string.IsNullOrEmpty(encoding))
                {
                    encoding = detected;
                }

                // Rewind for decoding.
                memoryStream.Seek(0, System.IO.SeekOrigin.Begin);
            }

            // Fall back to UTF-8 when no charset is known, as the summary promises.
            System.Text.Encoding myencoding = string.IsNullOrEmpty(encoding)
                ? System.Text.Encoding.UTF8
                : System.Text.Encoding.GetEncoding(encoding);

            using (System.IO.StreamReader mystreamreader = new System.IO.StreamReader(memoryStream, myencoding))
            {
                strHtmlContent = mystreamreader.ReadToEnd();
            }
        }
    }
    catch (Exception ex)
    {
        System.Diagnostics.Debug.WriteLine(DateTime.Now + "获取网页内容出错:url:" + url + "\r\n" + ex.Message + " " + (ex.StackTrace == null ? " " : " " + ex.StackTrace));
    }
    finally
    {
        mystream.Close();
        mystream.Dispose();

        // Close the response here so it is released even when decoding fails.
        if (myresponse != null)
        {
            myresponse.Close();
        }

        // HttpWebRequest does not clean itself up; abort to close the connection.
        if (myrequest != null)
        {
            myrequest.Abort();
        }
    }

    return strHtmlContent;
}
/// <summary>
/// Performs an HTTP GET, decodes the body using charset detection (falling back to GBK)
/// and appends a diagnostic trailer with the request and response details. Optionally
/// follows "Location" / "Refresh" headers and meta-refresh tags by calling itself.
/// </summary>
/// <param name="url">Address to fetch; first normalised through <c>getDealUrl</c>.</param>
/// <param name="cookies">Cookie collection sent with the request and updated from the response.</param>
/// <param name="refrere">Value for the Referer header.</param>
/// <param name="encoding">
/// Documented as "1gbk,2utf8,3auto". NOTE(review): the body never reads this parameter;
/// decoding always uses charset detection - confirm whether that is intended.
/// </param>
/// <param name="timeout">Request timeout in seconds.</param>
/// <param name="isRedirect">When true, redirects and refreshes are followed manually.</param>
/// <returns>
/// The decoded content plus the diagnostic trailer, or the exception message when any
/// exception is thrown.
/// </returns>
public string httpGET(string url, ref CookieCollection cookies, string refrere, int encoding, int timeout, bool isRedirect)
{
    url = getDealUrl(url);
    Stream stream = null;
    HttpWebResponse httpWebResponse = null;
    HttpWebRequest httpWebRequest = null;
    string result;
    try
    {
        // NOTE(review): these ServicePointManager settings are process-wide, and the
        // certificate callback is replaced on every call.
        ServicePointManager.Expect100Continue = false;
        ServicePointManager.DefaultConnectionLimit = 1000;
        ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);

        httpWebRequest = (HttpWebRequest)WebRequest.Create(url);
        httpWebRequest.Headers.Clear();
        httpWebRequest.AutomaticDecompression = DecompressionMethods.GZip;
        httpWebRequest.CookieContainer = xkCookies.CookieContainer(cookies, url);
        httpWebRequest.KeepAlive = true;
        httpWebRequest.ProtocolVersion = HttpVersion.Version10;
        httpWebRequest.Method = "GET";
        httpWebRequest.Referer = refrere;
        httpWebRequest.Timeout = timeout * 1000;
        // Redirects are handled manually further down so cookies can be carried along.
        httpWebRequest.AllowAutoRedirect = false;
        httpWebRequest.Accept = "image/jpeg, application/x-ms-application, image/gif, application/xaml+xml, image/pjpeg, application/x-ms-xbap, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*";
        httpWebRequest.Headers.Add("Accept-Language", "zh-cn");
        httpWebRequest.UserAgent = useragent;

        // Snapshot of the request headers for the diagnostic trailer.
        string text = httpWebRequest.Headers.ToString();

        httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse();
        stream = httpWebResponse.GetResponseStream();
        xkCookies.UpCookie(ref cookies, url, httpWebResponse.Headers["Set-Cookie"], httpWebResponse.Cookies);
        string tmp_result = "";

        // NOTE(review): the decompression wrappers assigned to "stream" are never read
        // from; the body below is read from GetResponseStream() again. Left unchanged
        // because AutomaticDecompression is also enabled above - confirm before altering.
        if (httpWebResponse.ContentEncoding.ToLower().Contains("gzip"))
        {
            stream = new GZipStream(stream, CompressionMode.Decompress);
        }
        else
        {
            if (httpWebResponse.ContentEncoding.ToLower().Contains("deflate"))
            {
                stream = new DeflateStream(stream, CompressionMode.Decompress);
            }
        }

        // Buffer the whole body in memory.
        Stream mystream = httpWebResponse.GetResponseStream();
        MemoryStream msTemp = new MemoryStream();
        int len = 0;
        byte[] buff = new byte[512];
        while ((len = mystream.Read(buff, 0, 512)) > 0)
        {
            msTemp.Write(buff, 0, len);
        }
        httpWebResponse.Close();

        if (msTemp.Length > 0)
        {
            msTemp.Seek(0, SeekOrigin.Begin);
            byte[] PageBytes = new byte[msTemp.Length];
            msTemp.Read(PageBytes, 0, PageBytes.Length);

            msTemp.Seek(0, SeekOrigin.Begin);
            int DetLen = 0;
            byte[] DetectBuff = new byte[4096];
            UniversalDetector Det = new UniversalDetector(null);
            while ((DetLen = msTemp.Read(DetectBuff, 0, DetectBuff.Length)) > 0 && !Det.IsDone())
            {
                // Feed only the bytes read in this pass; the rest of the 4 KB buffer holds
                // stale or zero bytes that would skew detection.
                Det.HandleData(DetectBuff, 0, DetLen);
            }
            Det.DataEnd();
            if (Det.GetDetectedCharset() != null)
            {
                tmp_result = System.Text.Encoding.GetEncoding(Det.GetDetectedCharset()).GetString(PageBytes);
            }
            else
            {
                // No charset detected: decode as GBK.
                tmp_result = System.Text.Encoding.GetEncoding("GBK").GetString(PageBytes);
            }
        }

        // Append the diagnostic trailer (URL, status, cookies, request/response headers).
        // NOTE(review): the response was already closed above - confirm StatusCode and
        // Headers remain accessible on the target framework.
        tmp_result = string.Concat(new object[]
        {
            tmp_result,
            "\r\n\r\n=================================================\r\n\r\n本次请求:",
            url,
            " 响应结果:",
            httpWebResponse.StatusCode,
            "\r\n\r\nCookie数量",
            httpWebRequest.CookieContainer.Count,
            "\r\n",
            httpWebRequest.CookieContainer.GetCookieHeader(new Uri(url)),
            "\r\nrequest:\r\n",
            text,
            "\r\nresponse:\r\n",
            httpWebResponse.Headers.ToString(),
            "\r\n\r\n=================================================\r\n\r\n"
        });

        if (isRedirect)
        {
            if (httpWebResponse.Headers["Location"] != null && httpWebResponse.Headers["Location"].Length > 2)
            {
                // "Location" header: resolve relative targets against the current URL.
                string url_redirect = "";
                if (httpWebResponse.Headers["Location"].ToLower().Contains("http://"))
                {
                    url_redirect = httpWebResponse.Headers["Location"];
                }
                else
                {
                    url_redirect = geturl(httpWebResponse.Headers["Location"], url);
                }
                tmp_result = httpGET(url_redirect, ref cookies, url, 3, 10, isRedirect) + tmp_result;
            }
            else
            {
                if (httpWebResponse.Headers["Refresh"] != null && httpWebResponse.Headers["Refresh"].Length > 2)
                {
                    // "Refresh" header: the target is whatever follows "url=".
                    string text3 = httpWebResponse.Headers["Refresh"].ToLower().Replace("url=", "`").Split('`')[1];
                    if (!text3.Contains("http://"))
                    {
                        text3 = geturl(text3, url);
                    }
                    tmp_result = httpGET(text3, ref cookies, url, 3, 10, isRedirect) + tmp_result;
                }
            }

            // Meta-refresh tags inside the accumulated text are followed as well.
            if (tmp_result.Contains("Refresh"))
            {
                Winista.Text.HtmlParser.Util.NodeList htmlNodes = new Parser(new Lexer(tmp_result)).Parse(new TagNameFilter("meta"));
                if (htmlNodes.Count > 1)
                {
                    for (int i = 0; i < htmlNodes.Count; i++)
                    {
                        MetaTag option = (MetaTag)htmlNodes.ElementAt(i);
                        if (option.GetAttribute("http-equiv") == "Refresh")
                        {
                            string content = option.GetAttribute("content");
                            string text3 = content.ToLower().Replace("url=", "`").Split('`')[1];
                            if (!text3.Contains("http://"))
                            {
                                text3 = geturl(text3, url);
                            }
                            tmp_result = httpGET(text3, ref cookies, url, 3, 10, isRedirect) + tmp_result;
                        }
                    }
                }
            }
        }

        httpWebResponse.Close();
        httpWebRequest.Abort();
        result = tmp_result;

        // The success message is skipped for URLs containing any of these substrings.
        if (!url.Contains(":8888") && !url.Contains("renzhe") && !url.Contains("zq535228") && !url.Contains("whoissoft") && !url.Contains("chinaz"))
        {
            EchoHelper.Echo(string.Format("成功获取:{0}的HTML内容。", url), null, EchoHelper.EchoType.普通信息);
        }
    }
    catch (Exception ex)
    {
        // Any failure is reported to the caller as the exception message.
        result = ex.Message;
    }
    finally
    {
        if (stream != null)
        {
            stream.Close();
        }
        if (httpWebResponse != null)
        {
            httpWebResponse.Close();
        }
        if (httpWebRequest != null)
        {
            httpWebRequest.Abort();
        }
    }
    return result;
}