/// <summary>
/// Decodes a byte array into text, using the charset reported by <c>charsetDetector</c>
/// and falling back to <c>Encoding.Default</c> when that does not produce a result
/// </summary>
/// <param name="bytes">The raw bytes to decode (may be null)</param>
/// <returns>The decoded text, or an empty string if the input was null or could not be decoded</returns>
public static string GuessedStringEncoding(byte[] bytes)
{
	// Nothing to decode.
	if (bytes == null)
	{
		return "";
	}

	// Run the detector over the whole buffer.
	charsetDetector.Reset();
	charsetDetector.Feed(bytes, 0, bytes.Length);
	charsetDetector.DataEnd();

	string decoded = null;

	// First attempt: the charset named by the detector.
	try
	{
		string detectedName = charsetDetector.Charset;
		Encoding detected = detectedName == null ? null : Encoding.GetEncoding(detectedName);
		if (detected != null)
		{
			decoded = detected.GetString(bytes);
		}
	}
	catch (Exception ex)
	{
		// Best effort only: report the problem and continue with the fallback.
		Console.Error.WriteLine(ex.Message);
	}

	// Second attempt: the platform default encoding.
	if (decoded == null)
	{
		try
		{
			decoded = Encoding.Default.GetString(bytes);
		}
		catch (Exception ex)
		{
			Console.Error.WriteLine(ex.Message);
		}
	}

	return decoded ?? "";
}
/// <summary>Gets the character encoding of a file</summary>
/// <param name="File">The absolute path to a file</param>
/// <returns>
/// The character encoding, or <c>Encoding.Unknown</c> if the path is null, the file does not exist,
/// an exception occurs whilst reading / detecting, or the detected charset has no mapping
/// </returns>
/// <remarks>
/// A byte order mark takes precedence. Otherwise the charset detector is used, with a handful of
/// per-file overrides (matched on file name and / or size) for files known to be mis-detected.
/// </remarks>
public static Encoding GetEncodingFromFile(string File)
{
	if (File == null || !System.IO.File.Exists(File))
	{
		return Encoding.Unknown;
	}
	try
	{
		byte[] Data = System.IO.File.ReadAllBytes(File);

		// --- Byte order marks ---
		// The 4-byte marks must be tested before the 2-byte ones: the UTF-32 LE mark (FF FE 00 00)
		// begins with the UTF-16 LE mark (FF FE) and would otherwise never be reached.
		if (Data.Length >= 4)
		{
			if (Data[0] == 0x00 && Data[1] == 0x00 && Data[2] == 0xFE && Data[3] == 0xFF)
			{
				return Encoding.UTF32_BE;
			}
			if (Data[0] == 0xFF && Data[1] == 0xFE && Data[2] == 0x00 && Data[3] == 0x00)
			{
				return Encoding.UTF32_LE;
			}
		}
		if (Data.Length >= 3)
		{
			if (Data[0] == 0xEF && Data[1] == 0xBB && Data[2] == 0xBF)
			{
				return Encoding.UTF8;
			}
			if (Data[0] == 0x2b && Data[1] == 0x2f && Data[2] == 0x76)
			{
				return Encoding.UTF7;
			}
		}
		if (Data.Length >= 2)
		{
			if (Data[0] == 0xFE && Data[1] == 0xFF)
			{
				return Encoding.UTF16_BE;
			}
			if (Data[0] == 0xFF && Data[1] == 0xFE)
			{
				return Encoding.UTF16_LE;
			}
		}

		// --- Statistical detection ---
		CharsetDetector Det = new CharsetDetector();
		Det.Feed(Data, 0, Data.Length);
		Det.DataEnd();
		if (Det.Charset == null)
		{
			return Encoding.Unknown;
		}

		// Values used by the per-file overrides below. The size is taken from the bytes that
		// were actually read, so it always matches the content that was analysed.
		string fileName = System.IO.Path.GetFileName(File).ToLowerInvariant();
		int fileLength = Data.Length;

		switch (Det.Charset)
		{
			case Charsets.IBM855:
				return Encoding.IBM855;
			case Charsets.IBM866:
				return Encoding.IBM866;
			case Charsets.SHIFT_JIS:
				return Encoding.SHIFT_JIS;
			case Charsets.EUCKR:
				return Encoding.EUC_KR;
			case Charsets.BIG5:
				if (fileName == "stoklosy.b3d" && fileLength == 18256)
				{
					//Polish Warsaw metro object file uses diacritics in filenames
					return Encoding.WIN1252;
				}
				return Encoding.BIG5;
			case Charsets.UTF16_LE:
				return Encoding.UTF16_LE;
			case Charsets.UTF16_BE:
				return Encoding.UTF16_BE;
			case Charsets.WIN1251:
				if (fileName == "585tc1.csv" && fileLength == 37302)
				{
					return Encoding.SHIFT_JIS;
				}
				return Encoding.WIN1251;
			case Charsets.WIN1252:
				if (fileLength == 62861)
				{
					//HK tram route. Comes in a non-unicode zip, so filename may be subject to mangling
					return Encoding.BIG5;
				}
				return Encoding.WIN1252;
			case Charsets.WIN1253:
				return Encoding.WIN1253;
			case Charsets.WIN1255:
				if (fileName == "xdbetulasmall.csv" && fileLength == 406)
				{
					//Hungarian birch tree; Actually loads OK with 1255, but use the correct one
					return Encoding.WIN1252;
				}
				return Encoding.WIN1255;
			case Charsets.MAC_CYRILLIC:
				if (fileName == "exit01.csv" && fileLength == 752)
				{
					//hira2
					return Encoding.SHIFT_JIS;
				}
				return Encoding.MAC_CYRILLIC;
			case Charsets.UTF32_LE:
				return Encoding.UTF32_LE;
			case Charsets.UTF32_BE:
				return Encoding.UTF32_BE;
			case Charsets.ASCII:
				return Encoding.ASCII;
			case Charsets.KOI8R:
				return Encoding.KOI8_R;
			case Charsets.EUCJP:
				if (fileName == "xsara.b3d" && fileLength == 3429)
				{
					//Uses an odd character in the comments, ASCII works just fine
					return Encoding.ASCII;
				}
				return Encoding.EUC_JP;
			case Charsets.ISO8859_2:
				return Encoding.ISO8859_2;
			case Charsets.ISO8859_5:
				return Encoding.ISO8859_5;
			case Charsets.ISO_8859_7:
				return Encoding.ISO8859_7;
			case Charsets.ISO8859_8:
				return Encoding.ISO8859_8;
			case Charsets.ISO2022_JP:
				return Encoding.ISO2022_JP;
			case Charsets.ISO2022_KR:
				return Encoding.ISO2022_KR;
			case Charsets.ISO2022_CN:
				return Encoding.ISO2022_CN;
			case Charsets.HZ_GB_2312:
				return Encoding.HZ_GB_2312;
			case Charsets.GB18030:
				//Extended new Chinese charset
				if (fileName == "people6.b3d" && fileLength == 377)
				{
					//Polish Warsaw metro object file uses diacritics in filenames
					// Previously both branches returned GB18030, which made this override a no-op.
					return Encoding.WIN1252;
				}
				return Encoding.GB18030;
			case Charsets.UTF8:
				return Encoding.UTF8;
		}

		// The detector reported a charset that has no mapping above.
		Det.Reset();
		return Encoding.Unknown;
	}
	catch
	{
		// Deliberate best effort: any I/O or detector failure is reported as an unknown encoding.
		return Encoding.Unknown;
	}
}
/// <summary>
/// Gets the character encoding of the bytes array
/// </summary>
/// <param name="Data">The bytes array</param>
/// <returns>
/// The character encoding, or <c>Encoding.Unknown</c> if the array is null, the detector
/// reports no charset, or the reported charset has no mapping
/// </returns>
/// <remarks>A byte order mark takes precedence over the charset detector.</remarks>
public static Encoding GetEncodingFromBytes(byte[] Data)
{
	// A null array carries no information; report it as unknown rather than throwing.
	if (Data == null)
	{
		return Encoding.Unknown;
	}

	// --- Byte order marks ---
	// The 4-byte marks must be tested before the 2-byte ones: the UTF-32 LE mark (FF FE 00 00)
	// begins with the UTF-16 LE mark (FF FE) and would otherwise never be reached.
	if (Data.Length >= 4)
	{
		if (Data[0] == 0x00 && Data[1] == 0x00 && Data[2] == 0xFE && Data[3] == 0xFF)
		{
			return Encoding.UTF32_BE;
		}
		if (Data[0] == 0xFF && Data[1] == 0xFE && Data[2] == 0x00 && Data[3] == 0x00)
		{
			return Encoding.UTF32_LE;
		}
	}
	if (Data.Length >= 3)
	{
		if (Data[0] == 0xEF && Data[1] == 0xBB && Data[2] == 0xBF)
		{
			return Encoding.UTF8;
		}
		if (Data[0] == 0x2b && Data[1] == 0x2f && Data[2] == 0x76)
		{
			return Encoding.UTF7;
		}
	}
	if (Data.Length >= 2)
	{
		if (Data[0] == 0xFE && Data[1] == 0xFF)
		{
			return Encoding.UTF16_BE;
		}
		if (Data[0] == 0xFF && Data[1] == 0xFE)
		{
			return Encoding.UTF16_LE;
		}
	}

	// --- Statistical detection ---
	CharsetDetector Det = new CharsetDetector();
	Det.Feed(Data, 0, Data.Length);
	Det.DataEnd();
	if (Det.Charset == null)
	{
		return Encoding.Unknown;
	}

	// Map the detector's charset name onto the Encoding enumeration.
	switch (Det.Charset)
	{
		case Charsets.IBM855:
			return Encoding.IBM855;
		case Charsets.IBM866:
			return Encoding.IBM866;
		case Charsets.SHIFT_JIS:
			return Encoding.SHIFT_JIS;
		case Charsets.EUCKR:
			return Encoding.EUC_KR;
		case Charsets.BIG5:
			return Encoding.BIG5;
		case Charsets.UTF16_LE:
			return Encoding.UTF16_LE;
		case Charsets.UTF16_BE:
			return Encoding.UTF16_BE;
		case Charsets.WIN1251:
			return Encoding.WIN1251;
		case Charsets.WIN1252:
			return Encoding.WIN1252;
		case Charsets.WIN1253:
			return Encoding.WIN1253;
		case Charsets.WIN1255:
			return Encoding.WIN1255;
		case Charsets.MAC_CYRILLIC:
			return Encoding.MAC_CYRILLIC;
		case Charsets.UTF32_LE:
			return Encoding.UTF32_LE;
		case Charsets.UTF32_BE:
			return Encoding.UTF32_BE;
		case Charsets.ASCII:
			return Encoding.ASCII;
		case Charsets.KOI8R:
			return Encoding.KOI8_R;
		case Charsets.EUCJP:
			return Encoding.EUC_JP;
		case Charsets.ISO8859_2:
			return Encoding.ISO8859_2;
		case Charsets.ISO8859_5:
			return Encoding.ISO8859_5;
		case Charsets.ISO_8859_7:
			return Encoding.ISO8859_7;
		case Charsets.ISO8859_8:
			return Encoding.ISO8859_8;
		case Charsets.ISO2022_JP:
			return Encoding.ISO2022_JP;
		case Charsets.ISO2022_KR:
			return Encoding.ISO2022_KR;
		case Charsets.ISO2022_CN:
			return Encoding.ISO2022_CN;
		case Charsets.HZ_GB_2312:
			return Encoding.HZ_GB_2312;
		case Charsets.GB18030:
			return Encoding.GB18030;
		case Charsets.UTF8:
			return Encoding.UTF8;
	}

	// The detector reported a charset that has no mapping above.
	Det.Reset();
	return Encoding.Unknown;
}
/// <summary>Gets the character encoding of a file</summary>
/// <param name="File">The absolute path to a file</param>
/// <returns>
/// The character encoding, or <c>Encoding.Unknown</c> if the path is null, the file does not exist,
/// an exception occurs whilst reading / detecting, or the detected charset has no mapping
/// </returns>
/// <remarks>
/// A byte order mark takes precedence. Otherwise the charset detector is used, with a handful of
/// per-file overrides (matched on file name and / or size) for files known to be mis-detected.
/// </remarks>
public static Encoding GetEncodingFromFile(string File)
{
	if (File == null || !System.IO.File.Exists(File))
	{
		return Encoding.Unknown;
	}
	try
	{
		byte[] Data = System.IO.File.ReadAllBytes(File);

		// --- Byte order marks ---
		// The 4-byte marks must be tested before the 2-byte ones: the UTF-32 LE mark (FF FE 00 00)
		// begins with the UTF-16 LE mark (FF FE) and would otherwise never be reached.
		if (Data.Length >= 4)
		{
			if (Data[0] == 0x00 && Data[1] == 0x00 && Data[2] == 0xFE && Data[3] == 0xFF)
			{
				return Encoding.Utf32Be;
			}
			if (Data[0] == 0xFF && Data[1] == 0xFE && Data[2] == 0x00 && Data[3] == 0x00)
			{
				return Encoding.Utf32Le;
			}
		}
		if (Data.Length >= 3)
		{
			if (Data[0] == 0xEF && Data[1] == 0xBB && Data[2] == 0xBF)
			{
				return Encoding.Utf8;
			}
			if (Data[0] == 0x2b && Data[1] == 0x2f && Data[2] == 0x76)
			{
				return Encoding.Utf7;
			}
		}
		if (Data.Length >= 2)
		{
			if (Data[0] == 0xFE && Data[1] == 0xFF)
			{
				return Encoding.Utf16Be;
			}
			if (Data[0] == 0xFF && Data[1] == 0xFE)
			{
				return Encoding.Utf16Le;
			}
		}

		// --- Statistical detection ---
		CharsetDetector Det = new CharsetDetector();
		Det.Feed(Data, 0, Data.Length);
		Det.DataEnd();
		if (Det.Charset == null)
		{
			return Encoding.Unknown;
		}

		// Values used by the per-file overrides below. The size is taken from the bytes that
		// were actually read, so it always matches the content that was analysed.
		string fileName = System.IO.Path.GetFileName(File).ToLowerInvariant();
		int fileLength = Data.Length;

		switch (Det.Charset.ToUpperInvariant())
		{
			case "SHIFT-JIS":
			case "SHIFT_JIS":
				return Encoding.Shift_JIS;
			case "UTF-8":
				return Encoding.Utf8;
			case "UTF-7":
				return Encoding.Utf7;
			case "WINDOWS-1251":
				if (fileName == "585tc1.csv" && fileLength == 37302)
				{
					return Encoding.Shift_JIS;
				}
				// NOTE(review): a WINDOWS-1251 detection is mapped to Windows1252 here - confirm this is intended
				return Encoding.Windows1252;
			case "WINDOWS-1252":
				if (fileLength == 62861)
				{
					//HK tram route. Comes in a non-unicode zip, so filename may be subject to mangling
					return Encoding.Big5;
				}
				return Encoding.Windows1252;
			case "WINDOWS-1255":
				if (fileName == "xdbetulasmall.csv" && fileLength == 406)
				{
					//Hungarian birch tree; Actually loads OK with 1255, but use the correct one
					return Encoding.Windows1252;
				}
				// NOTE(review): a WINDOWS-1255 detection is mapped to Big5 here - confirm this is intended
				return Encoding.Big5;
			case "BIG5":
				if (fileName == "stoklosy.b3d" && fileLength == 18256)
				{
					//Polish Warsaw metro object file uses diacritics in filenames
					return Encoding.Windows1252;
				}
				return Encoding.Big5;
			case "EUC-KR":
				return Encoding.EUC_KR;
			case "ASCII":
				return Encoding.ASCII;
			case "IBM866":
				return Encoding.OEM866;
			case "X-MAC-CYRILLIC":
				if (fileName == "exit01.csv" && fileLength == 752)
				{
					//hira2
					return Encoding.Shift_JIS;
				}
				break;
			case "GB18030":
				//Extended new Chinese charset
				if (fileName == "people6.b3d" && fileLength == 377)
				{
					//Polish Warsaw metro object file uses diacritics in filenames
					return Encoding.Windows1252;
				}
				break;
			case "EUC-JP":
				if (fileName == "xsara.b3d" && fileLength == 3429)
				{
					//Uses an odd character in the comments, ASCII works just fine
					return Encoding.ASCII;
				}
				break;
		}

		// Either the charset has no mapping, or it is only handled for specific known files.
		Det.Reset();
		return Encoding.Unknown;
	}
	catch
	{
		// Deliberate best effort: any I/O or detector failure is reported as an unknown encoding.
		return Encoding.Unknown;
	}
}