Пример #1
0
        /// <summary>
        /// Decodes a byte array into a string, using the charset reported by
        /// the class-level charset detector when one is available.
        /// </summary>
        /// <param name="bytes">Bytes to decode; may be null</param>
        /// <returns>
        /// The decoded string. Falls back to <c>Encoding.Default</c> when no charset
        /// is reported or decoding with it fails, and to an empty string when
        /// <paramref name="bytes"/> is null or every decoding attempt fails.
        /// </returns>
        public static string GuessedStringEncoding(byte[] bytes)
        {
            // Nothing to decode.
            if (bytes == null)
            {
                return "";
            }

            // Run the whole buffer through the detector in a single pass.
            charsetDetector.Reset();
            charsetDetector.Feed(bytes, 0, bytes.Length);
            charsetDetector.DataEnd();

            string decoded = null;

            // First attempt: the charset the detector reported, if any.
            try
            {
                string detectedName = charsetDetector.Charset;
                Encoding detected = (detectedName == null) ? null : Encoding.GetEncoding(detectedName);
                if (detected != null)
                {
                    decoded = detected.GetString(bytes);
                }
            }
            catch (Exception ex)
            {
                // Best effort: report the problem and fall through to the default encoding.
                Console.Error.WriteLine(ex.Message);
            }

            // Second attempt: the system default encoding.
            if (decoded == null)
            {
                try
                {
                    decoded = Encoding.Default.GetString(bytes);
                }
                catch (Exception ex)
                {
                    Console.Error.WriteLine(ex.Message);
                }
            }

            // Never hand null back to the caller.
            return decoded ?? "";
        }
Пример #2
0
        /// <summary>Gets the character encoding of a file</summary>
        /// <param name="File">The absolute path to a file</param>
        /// <returns>
        /// The character encoding, or <c>Encoding.Unknown</c> when the path is null, the file
        /// does not exist, reading or detection throws, or the detected charset has no mapping
        /// </returns>
        /// <remarks>
        /// A byte order mark at the start of the file takes precedence. Otherwise the whole
        /// file is fed to a charset detector, and a handful of known files (matched by name
        /// and/or byte length) override the detected result.
        /// </remarks>
        public static Encoding GetEncodingFromFile(string File)
        {
            if (File == null || !System.IO.File.Exists(File))
            {
                return Encoding.Unknown;
            }

            try
            {
                System.IO.FileInfo fInfo = new System.IO.FileInfo(File);
                byte[] Data = System.IO.File.ReadAllBytes(File);

                // The UTF-32 marks must be tested before the UTF-16 ones: the UTF-32 LE mark
                // (FF FE 00 00) starts with the UTF-16 LE mark (FF FE), so testing UTF-16
                // first would make the UTF-32 LE branch unreachable.
                if (Data.Length >= 4)
                {
                    if (Data[0] == 0x00 && Data[1] == 0x00 && Data[2] == 0xFE && Data[3] == 0xFF)
                    {
                        return Encoding.UTF32_BE;
                    }

                    if (Data[0] == 0xFF && Data[1] == 0xFE && Data[2] == 0x00 && Data[3] == 0x00)
                    {
                        return Encoding.UTF32_LE;
                    }
                }

                if (Data.Length >= 3)
                {
                    if (Data[0] == 0xEF && Data[1] == 0xBB && Data[2] == 0xBF)
                    {
                        return Encoding.UTF8;
                    }

                    if (Data[0] == 0x2b && Data[1] == 0x2f && Data[2] == 0x76)
                    {
                        return Encoding.UTF7;
                    }
                }

                if (Data.Length >= 2)
                {
                    if (Data[0] == 0xFE && Data[1] == 0xFF)
                    {
                        return Encoding.UTF16_BE;
                    }

                    if (Data[0] == 0xFF && Data[1] == 0xFE)
                    {
                        return Encoding.UTF16_LE;
                    }
                }

                // No byte order mark: let the detector inspect the complete contents.
                CharsetDetector Det = new CharsetDetector();
                Det.Feed(Data, 0, Data.Length);
                Det.DataEnd();

                if (Det.Charset == null)
                {
                    return Encoding.Unknown;
                }

                // Lower-cased once; used by the per-file overrides below.
                string fileName = System.IO.Path.GetFileName(File).ToLowerInvariant();

                switch (Det.Charset)
                {
                case Charsets.IBM855:
                    return Encoding.IBM855;

                case Charsets.IBM866:
                    return Encoding.IBM866;

                case Charsets.SHIFT_JIS:
                    return Encoding.SHIFT_JIS;

                case Charsets.EUCKR:
                    return Encoding.EUC_KR;

                case Charsets.BIG5:
                    if (fileName == "stoklosy.b3d" && fInfo.Length == 18256)
                    {
                        //Polish Warsaw metro object file uses diacritics in filenames
                        return Encoding.WIN1252;
                    }

                    return Encoding.BIG5;

                case Charsets.UTF16_LE:
                    return Encoding.UTF16_LE;

                case Charsets.UTF16_BE:
                    return Encoding.UTF16_BE;

                case Charsets.WIN1251:
                    if (fileName == "585tc1.csv" && fInfo.Length == 37302)
                    {
                        //Known file which is detected as Windows-1251 but is treated as Shift-JIS
                        return Encoding.SHIFT_JIS;
                    }

                    return Encoding.WIN1251;

                case Charsets.WIN1252:
                    if (fInfo.Length == 62861)
                    {
                        //HK tram route. Comes in a non-unicode zip, so filename may be subject to mangling
                        //(hence this override is matched on the byte length alone)
                        return Encoding.BIG5;
                    }

                    return Encoding.WIN1252;

                case Charsets.WIN1253:
                    return Encoding.WIN1253;

                case Charsets.WIN1255:
                    if (fileName == "xdbetulasmall.csv" && fInfo.Length == 406)
                    {
                        //Hungarian birch tree; Actually loads OK with 1255, but use the correct one
                        return Encoding.WIN1252;
                    }

                    return Encoding.WIN1255;

                case Charsets.MAC_CYRILLIC:
                    if (fileName == "exit01.csv" && fInfo.Length == 752)
                    {
                        //hira2
                        return Encoding.SHIFT_JIS;
                    }

                    return Encoding.MAC_CYRILLIC;

                case Charsets.UTF32_LE:
                    return Encoding.UTF32_LE;

                case Charsets.UTF32_BE:
                    return Encoding.UTF32_BE;

                case Charsets.ASCII:
                    return Encoding.ASCII;

                case Charsets.KOI8R:
                    return Encoding.KOI8_R;

                case Charsets.EUCJP:
                    if (fileName == "xsara.b3d" && fInfo.Length == 3429)
                    {
                        //Uses an odd character in the comments, ASCII works just fine
                        return Encoding.ASCII;
                    }

                    return Encoding.EUC_JP;

                case Charsets.ISO8859_2:
                    return Encoding.ISO8859_2;

                case Charsets.ISO8859_5:
                    return Encoding.ISO8859_5;

                case Charsets.ISO_8859_7:
                    return Encoding.ISO8859_7;

                case Charsets.ISO8859_8:
                    return Encoding.ISO8859_8;

                case Charsets.ISO2022_JP:
                    return Encoding.ISO2022_JP;

                case Charsets.ISO2022_KR:
                    return Encoding.ISO2022_KR;

                case Charsets.ISO2022_CN:
                    return Encoding.ISO2022_CN;

                case Charsets.HZ_GB_2312:
                    return Encoding.HZ_GB_2312;

                case Charsets.GB18030:
                    //Extended new Chinese charset
                    if (fileName == "people6.b3d" && fInfo.Length == 377)
                    {
                        //Polish Warsaw metro object file uses diacritics in filenames.
                        //This override previously returned GB18030, identical to the default below,
                        //which made it a no-op; Windows-1252 matches the other Warsaw metro override.
                        return Encoding.WIN1252;
                    }

                    return Encoding.GB18030;

                case Charsets.UTF8:
                    return Encoding.UTF8;
                }

                // The detector reported a charset that has no mapping here.
                Det.Reset();
                return Encoding.Unknown;
            }
            catch
            {
                // Deliberate best effort: any I/O or detection failure is reported as Unknown.
                return Encoding.Unknown;
            }
        }
Пример #3
0
        /// <summary>
        /// Gets the character encoding of the bytes array
        /// </summary>
        /// <param name="Data">The bytes array; may be null</param>
        /// <returns>
        /// The character encoding, or <c>Encoding.Unknown</c> when <paramref name="Data"/> is
        /// null, no charset is detected, or the detected charset has no mapping
        /// </returns>
        /// <remarks>
        /// A byte order mark at the start of the data takes precedence; otherwise the
        /// complete array is fed to a charset detector.
        /// </remarks>
        public static Encoding GetEncodingFromBytes(byte[] Data)
        {
            // Without data there is nothing to inspect; report Unknown rather than throwing.
            if (Data == null)
            {
                return Encoding.Unknown;
            }

            // The UTF-32 marks must be tested before the UTF-16 ones: the UTF-32 LE mark
            // (FF FE 00 00) starts with the UTF-16 LE mark (FF FE), so testing UTF-16
            // first would make the UTF-32 LE branch unreachable.
            if (Data.Length >= 4)
            {
                if (Data[0] == 0x00 && Data[1] == 0x00 && Data[2] == 0xFE && Data[3] == 0xFF)
                {
                    return Encoding.UTF32_BE;
                }

                if (Data[0] == 0xFF && Data[1] == 0xFE && Data[2] == 0x00 && Data[3] == 0x00)
                {
                    return Encoding.UTF32_LE;
                }
            }

            if (Data.Length >= 3)
            {
                if (Data[0] == 0xEF && Data[1] == 0xBB && Data[2] == 0xBF)
                {
                    return Encoding.UTF8;
                }

                if (Data[0] == 0x2b && Data[1] == 0x2f && Data[2] == 0x76)
                {
                    return Encoding.UTF7;
                }
            }

            if (Data.Length >= 2)
            {
                if (Data[0] == 0xFE && Data[1] == 0xFF)
                {
                    return Encoding.UTF16_BE;
                }

                if (Data[0] == 0xFF && Data[1] == 0xFE)
                {
                    return Encoding.UTF16_LE;
                }
            }

            // No byte order mark: let the detector inspect the complete contents.
            CharsetDetector Det = new CharsetDetector();

            Det.Feed(Data, 0, Data.Length);
            Det.DataEnd();

            if (Det.Charset == null)
            {
                return Encoding.Unknown;
            }

            switch (Det.Charset)
            {
            case Charsets.IBM855:
                return Encoding.IBM855;

            case Charsets.IBM866:
                return Encoding.IBM866;

            case Charsets.SHIFT_JIS:
                return Encoding.SHIFT_JIS;

            case Charsets.EUCKR:
                return Encoding.EUC_KR;

            case Charsets.BIG5:
                return Encoding.BIG5;

            case Charsets.UTF16_LE:
                return Encoding.UTF16_LE;

            case Charsets.UTF16_BE:
                return Encoding.UTF16_BE;

            case Charsets.WIN1251:
                return Encoding.WIN1251;

            case Charsets.WIN1252:
                return Encoding.WIN1252;

            case Charsets.WIN1253:
                return Encoding.WIN1253;

            case Charsets.WIN1255:
                return Encoding.WIN1255;

            case Charsets.MAC_CYRILLIC:
                return Encoding.MAC_CYRILLIC;

            case Charsets.UTF32_LE:
                return Encoding.UTF32_LE;

            case Charsets.UTF32_BE:
                return Encoding.UTF32_BE;

            case Charsets.ASCII:
                return Encoding.ASCII;

            case Charsets.KOI8R:
                return Encoding.KOI8_R;

            case Charsets.EUCJP:
                return Encoding.EUC_JP;

            case Charsets.ISO8859_2:
                return Encoding.ISO8859_2;

            case Charsets.ISO8859_5:
                return Encoding.ISO8859_5;

            case Charsets.ISO_8859_7:
                return Encoding.ISO8859_7;

            case Charsets.ISO8859_8:
                return Encoding.ISO8859_8;

            case Charsets.ISO2022_JP:
                return Encoding.ISO2022_JP;

            case Charsets.ISO2022_KR:
                return Encoding.ISO2022_KR;

            case Charsets.ISO2022_CN:
                return Encoding.ISO2022_CN;

            case Charsets.HZ_GB_2312:
                return Encoding.HZ_GB_2312;

            case Charsets.GB18030:
                return Encoding.GB18030;

            case Charsets.UTF8:
                return Encoding.UTF8;
            }

            // The detector reported a charset that has no mapping here.
            Det.Reset();
            return Encoding.Unknown;
        }
Пример #4
0
        /// <summary>Gets the character encoding of a file</summary>
        /// <param name="File">The absolute path to a file</param>
        /// <returns>
        /// The character encoding, or <c>Encoding.Unknown</c> when the path is null, the file
        /// does not exist, reading or detection throws, or the detected charset has no mapping
        /// </returns>
        /// <remarks>
        /// A byte order mark at the start of the file takes precedence. Otherwise the whole
        /// file is fed to a charset detector, and a handful of known files (matched by name
        /// and/or byte length) override the detected result.
        /// </remarks>
        public static Encoding GetEncodingFromFile(string File)
        {
            if (File == null || !System.IO.File.Exists(File))
            {
                return Encoding.Unknown;
            }

            try
            {
                System.IO.FileInfo fInfo = new System.IO.FileInfo(File);
                byte[] Data = System.IO.File.ReadAllBytes(File);

                // The UTF-32 marks must be tested before the UTF-16 ones: the UTF-32 LE mark
                // (FF FE 00 00) starts with the UTF-16 LE mark (FF FE), so testing UTF-16
                // first would make the UTF-32 LE branch unreachable.
                if (Data.Length >= 4)
                {
                    if (Data[0] == 0x00 && Data[1] == 0x00 && Data[2] == 0xFE && Data[3] == 0xFF)
                    {
                        return Encoding.Utf32Be;
                    }

                    if (Data[0] == 0xFF && Data[1] == 0xFE && Data[2] == 0x00 && Data[3] == 0x00)
                    {
                        return Encoding.Utf32Le;
                    }
                }

                if (Data.Length >= 3)
                {
                    if (Data[0] == 0xEF && Data[1] == 0xBB && Data[2] == 0xBF)
                    {
                        return Encoding.Utf8;
                    }

                    if (Data[0] == 0x2b && Data[1] == 0x2f && Data[2] == 0x76)
                    {
                        return Encoding.Utf7;
                    }
                }

                if (Data.Length >= 2)
                {
                    if (Data[0] == 0xFE && Data[1] == 0xFF)
                    {
                        return Encoding.Utf16Be;
                    }

                    if (Data[0] == 0xFF && Data[1] == 0xFE)
                    {
                        return Encoding.Utf16Le;
                    }
                }

                // No byte order mark: let the detector inspect the complete contents.
                CharsetDetector Det = new CharsetDetector();
                Det.Feed(Data, 0, Data.Length);
                Det.DataEnd();

                if (Det.Charset == null)
                {
                    return Encoding.Unknown;
                }

                // Lower-cased once; used by the per-file overrides below.
                string fileName = System.IO.Path.GetFileName(File).ToLowerInvariant();

                // The charset name is upper-cased so the labels below match regardless of case.
                switch (Det.Charset.ToUpperInvariant())
                {
                case "SHIFT-JIS":
                case "SHIFT_JIS":
                    return Encoding.Shift_JIS;

                case "UTF-8":
                    return Encoding.Utf8;

                case "UTF-7":
                    return Encoding.Utf7;

                case "WINDOWS-1251":
                    if (fileName == "585tc1.csv" && fInfo.Length == 37302)
                    {
                        //Known file which is detected as Windows-1251 but is treated as Shift-JIS
                        return Encoding.Shift_JIS;
                    }
                    //NOTE(review): a Windows-1251 detection is mapped to Windows1252 - confirm this is intentional
                    return Encoding.Windows1252;

                case "WINDOWS-1252":
                    if (fInfo.Length == 62861)
                    {
                        //HK tram route. Comes in a non-unicode zip, so filename may be subject to mangling
                        //(hence this override is matched on the byte length alone)
                        return Encoding.Big5;
                    }
                    return Encoding.Windows1252;

                case "WINDOWS-1255":
                    if (fileName == "xdbetulasmall.csv" && fInfo.Length == 406)
                    {
                        //Hungarian birch tree; Actually loads OK with 1255, but use the correct one
                        return Encoding.Windows1252;
                    }
                    //NOTE(review): a Windows-1255 detection is mapped to Big5 - confirm this is intentional
                    return Encoding.Big5;

                case "BIG5":
                    if (fileName == "stoklosy.b3d" && fInfo.Length == 18256)
                    {
                        //Polish Warsaw metro object file uses diacritics in filenames
                        return Encoding.Windows1252;
                    }
                    return Encoding.Big5;

                case "EUC-KR":
                    return Encoding.EUC_KR;

                case "ASCII":
                    return Encoding.ASCII;

                case "IBM866":
                    return Encoding.OEM866;

                case "X-MAC-CYRILLIC":
                    if (fileName == "exit01.csv" && fInfo.Length == 752)
                    {
                        //hira2
                        return Encoding.Shift_JIS;
                    }
                    //Any other file with this charset falls through to Unknown
                    break;

                case "GB18030":
                    //Extended new Chinese charset
                    if (fileName == "people6.b3d" && fInfo.Length == 377)
                    {
                        //Polish Warsaw metro object file uses diacritics in filenames
                        return Encoding.Windows1252;
                    }
                    //Any other file with this charset falls through to Unknown
                    break;

                case "EUC-JP":
                    if (fileName == "xsara.b3d" && fInfo.Length == 3429)
                    {
                        //Uses an odd character in the comments, ASCII works just fine
                        return Encoding.ASCII;
                    }
                    //Any other file with this charset falls through to Unknown
                    break;
                }

                // The detected charset has no mapping here, or only a per-file override that did not match.
                Det.Reset();
                return Encoding.Unknown;
            }
            catch
            {
                // Deliberate best effort: any I/O or detection failure is reported as Unknown.
                return Encoding.Unknown;
            }
        }