예제 #1
0
        /// <summary>
        /// Guesses the text encoding of a file by loading all of its bytes and
        /// handing them to <see cref="TextEncodingDetect"/>.
        /// </summary>
        /// <param name="pFilePath">Path of the file to inspect.</param>
        /// <returns>
        /// <see cref="Encoding.UTF8"/> when the detector reports UTF-8 (with or without a BOM);
        /// <see cref="Encoding.Default"/> for every other detection result.
        /// </returns>
        public static Encoding DetectTextFileEncoding(string pFilePath)
        {
            TextEncodingDetect detector  = new TextEncodingDetect();
            byte[]             fileBytes = File.ReadAllBytes(pFilePath);

            TextEncodingDetect.Encoding detected = detector.DetectEncoding(fileBytes, fileBytes.Length);

            // Only the two UTF-8 results are mapped explicitly; anything else uses the system default.
            bool isUtf8 = detected == TextEncodingDetect.Encoding.Utf8Bom ||
                          detected == TextEncodingDetect.Encoding.Utf8Nobom;

            return isUtf8 ? Encoding.UTF8 : Encoding.Default;
        }
예제 #2
0
    /// <summary>
    /// Command-line entry point: reads the file named by the single argument,
    /// runs <see cref="TextEncodingDetect"/> over its bytes and prints the result.
    /// </summary>
    /// <param name="args">Command-line arguments; exactly one file name is expected.</param>
    /// <returns>0 on success; 1 on a usage error or when the file cannot be read.</returns>
    public static int Main(string[] args)
    {
        Console.WriteLine();

        if (args.Length != 1)
        {
            Console.WriteLine("Usage: TextEncodingDetect.exe <filename>");
            return 1;
        }

        // Load the raw bytes; any read failure is reported and ends the run.
        byte[] fileBytes;
        try
        {
            fileBytes = File.ReadAllBytes(args[0]);
        }
        catch (Exception readError)
        {
            Console.WriteLine(readError.Message);
            return 1;
        }

        TextEncodingDetect          detector = new TextEncodingDetect();
        TextEncodingDetect.Encoding detected = detector.DetectEncoding(fileBytes, fileBytes.Length);

        Console.Write("Encoding: ");
        switch (detected)
        {
        case TextEncodingDetect.Encoding.None:
            Console.WriteLine("Binary");
            break;

        case TextEncodingDetect.Encoding.Ascii:
            Console.WriteLine("ASCII (chars in the 0-127 range)");
            break;

        case TextEncodingDetect.Encoding.Ansi:
            Console.WriteLine("ANSI (chars in the range 0-255 range)");
            break;

        case TextEncodingDetect.Encoding.Utf8Bom:
        case TextEncodingDetect.Encoding.Utf8Nobom:
            Console.WriteLine("UTF-8");
            break;

        case TextEncodingDetect.Encoding.Utf16LeBom:
        case TextEncodingDetect.Encoding.Utf16LeNoBom:
            Console.WriteLine("UTF-16 Little Endian");
            break;

        case TextEncodingDetect.Encoding.Utf16BeBom:
        case TextEncodingDetect.Encoding.Utf16BeNoBom:
            Console.WriteLine("UTF-16 Big Endian");
            break;
        }

        return 0;
    }
예제 #3
0
        /// <summary>
        /// Click handler: scans every file in the selected module's "SQL" sub-folder and
        /// reports the first one the detector classifies as UTF-8 with a BOM.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button7_Click(object sender, EventArgs e)
        {
            int selected = comboBoxEdit1.SelectedIndex;

            // Index 0 and "nothing selected" are both rejected here.
            if (selected <= 0)
            {
                MessageBox.Show("請先選擇模組!");
                return;
            }

            string moduleName = comboBoxEdit1.Properties.Items[selected].ToString();
            string sqlFolder  = s_folderpath + "\\" + moduleName + "\\SQL\\";

            foreach (string filePath in Directory.GetFiles(sqlFolder))
            {
                byte[] content = File.ReadAllBytes(filePath);

                TextEncodingDetect          detector = new TextEncodingDetect();
                TextEncodingDetect.Encoding detected = detector.DetectEncoding(content, content.Length);

                // Stop at the first file that carries a UTF-8 BOM.
                if (detected == TextEncodingDetect.Encoding.UTF8_BOM)
                {
                    MessageBox.Show(filePath + " 是UTF8格式,請轉成無BOM格式!");
                    return;
                }
            }

            MessageBox.Show("檢查完成,全部都是UTF8無BOM格式");
        }
예제 #4
0
        /// <summary>
        /// Detects the encoding of a text file from its leading bytes.
        /// </summary>
        /// <remarks>
        /// Byte order marks recognised (longest patterns are tested first so that the
        /// UTF-32 little-endian mark is not mistaken for the UTF-16 little-endian one,
        /// which is its two-byte prefix):
        /// <list type="bullet">
        /// <item><description>UTF-32 big-endian: 00 00 FE FF</description></item>
        /// <item><description>UTF-32 little-endian: FF FE 00 00</description></item>
        /// <item><description>UTF-8: EF BB BF</description></item>
        /// <item><description>UTF-16 big-endian: FE FF</description></item>
        /// <item><description>UTF-16 little-endian: FF FE</description></item>
        /// </list>
        /// When no BOM is present, up to the first 4096 bytes are passed to
        /// <see cref="TextEncodingDetect"/> to recognise UTF-8 without a BOM.
        /// </remarks>
        /// <param name="strFilename">Path of the file to inspect.</param>
        /// <param name="default_encoding">
        /// Value returned when nothing is recognised or when the file cannot be read.
        /// </param>
        /// <returns>The detected encoding, or <paramref name="default_encoding"/>.</returns>
        public static Encoding DetectTextFileEncoding(string strFilename,
                                                      Encoding default_encoding)
        {
            try
            {
                using (FileStream file = File.Open(
                           strFilename,
                           FileMode.Open,
                           FileAccess.Read,
                           FileShare.ReadWrite))
                {
                    // Read the head of the file once. Stream.Read may return fewer bytes
                    // than requested, so keep reading until the buffer is full or EOF.
                    byte[] head   = new byte[4096];
                    int    length = 0;
                    int    read;
                    while (length < head.Length &&
                           (read = file.Read(head, length, head.Length - length)) > 0)
                    {
                        length += read;
                    }

                    if (length >= 4)
                    {
                        if (head[0] == 0x00 && head[1] == 0x00 && head[2] == 0xfe && head[3] == 0xff)
                        {
                            // UTF-32 big-endian, BOM emitted on write.
                            return(new UTF32Encoding(true, true));
                        }

                        if (head[0] == 0xff && head[1] == 0xfe && head[2] == 0x00 && head[3] == 0x00)
                        {
                            return(Encoding.UTF32);    // little-endian
                        }
                    }

                    if (length >= 3 && head[0] == 0xef && head[1] == 0xbb && head[2] == 0xbf)
                    {
                        return(Encoding.UTF8);
                    }

                    if (length >= 2)
                    {
                        if (head[0] == 0xff && head[1] == 0xfe)
                        {
                            return(Encoding.Unicode);    // UTF-16 little-endian
                        }

                        if (head[0] == 0xfe && head[1] == 0xff)
                        {
                            return(Encoding.BigEndianUnicode);
                        }
                    }

                    // No BOM: check whether the content is UTF-8 without a BOM.
                    TextEncodingDetect          detector = new TextEncodingDetect();
                    TextEncodingDetect.Encoding encoding = detector.DetectEncoding(head, length);
                    switch (encoding)
                    {
                    case TextEncodingDetect.Encoding.Utf8Bom:
                    case TextEncodingDetect.Encoding.Utf8Nobom:
                        return(Encoding.UTF8);
                    }
                }
            }
            catch (Exception)
            {
                // Deliberate best-effort: any failure (missing file, access denied,
                // detector error) falls through to the caller-supplied default.
            }

            return(default_encoding);    // default
        }
예제 #5
0
    /// <summary>
    /// Reads a file, detects its text encoding with <see cref="TextEncodingDetect"/>,
    /// decodes it accordingly and gathers line-break statistics.
    /// </summary>
    /// <param name="filename">Path of the file to analyse.</param>
    /// <returns>
    /// A <see cref="FileStateModel"/> holding the detected encoding, the total line-break
    /// count, the CRLF count, and the LF count (computed as total minus CRLF).
    /// All counts are 0 when the detector reports <c>None</c>.
    /// </returns>
    /// <remarks>
    /// Any exception raised while reading the file is written to the console (message only)
    /// and then rethrown unchanged.
    /// </remarks>
    public static FileStateModel GetFileState(string filename)
    {
        // Read in the file in binary
        byte[] buffer;

        try
        {
            buffer = File.ReadAllBytes(filename);
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.Message);
            // Rethrow with "throw;" so the original stack trace is preserved.
            throw;
        }

        // Detect encoding
        var textDetect = new TextEncodingDetect();

        TextEncodingDetect.Encoding encoding = textDetect.DetectEncoding(buffer, buffer.Length);

        // Decode with the encoding that matches the detection result.
        // For None (and any unlisted result) the text stays empty.
        string text = "";

        switch (encoding)
        {
        case TextEncodingDetect.Encoding.Ascii:
            text = Encoding.ASCII.GetString(buffer);
            break;

        case TextEncodingDetect.Encoding.Ansi:
            text = Encoding.Default.GetString(buffer);
            break;

        case TextEncodingDetect.Encoding.Utf8Bom:
        case TextEncodingDetect.Encoding.Utf8Nobom:
            text = Encoding.UTF8.GetString(buffer);
            break;

        case TextEncodingDetect.Encoding.Utf16LeBom:
        case TextEncodingDetect.Encoding.Utf16LeNoBom:
            text = Encoding.Unicode.GetString(buffer);
            break;

        case TextEncodingDetect.Encoding.Utf16BeBom:
        case TextEncodingDetect.Encoding.Utf16BeNoBom:
            text = Encoding.BigEndianUnicode.GetString(buffer);
            break;
        }

        int allLines  = 0;
        int crlfCount = 0;
        int lfCount   = 0;

        if (encoding != TextEncodingDetect.Encoding.None)
        {
            // NOTE(review): LineBreakCount is defined elsewhere; presumably the single-argument
            // overload counts every kind of line break - confirm against its definition.
            allLines  = LineBreakCount(text);
            crlfCount = LineBreakCount(text, new[] { "\r\n" });
            lfCount   = allLines - crlfCount;
        }

        return(new FileStateModel()
        {
            Encoding = encoding,
            Lines = allLines,
            CRLFs = crlfCount,
            LFs = lfCount
        });
    }
예제 #6
0
        /// <summary>
        /// Decides whether the window <paramref name="buffer"/>[<paramref name="offset"/> ..
        /// <paramref name="offset"/> + <paramref name="count"/>) looks like text.
        /// </summary>
        /// <remarks>
        /// Three stages are applied:
        /// 1) a UTF-8 / UTF-16 byte order mark at the start of the window means text;
        /// 2) the window must decode under the default ANSI code page with exception fallbacks;
        /// 3) the window is re-checked with <see cref="TextEncodingDetect"/>; binary and any
        ///    Unicode result are treated as "not text" at this stage.
        /// </remarks>
        /// <param name="buffer">Bytes to inspect.</param>
        /// <param name="offset">Index of the first byte of the window.</param>
        /// <param name="count">Number of bytes in the window.</param>
        /// <returns><c>true</c> when the window is considered text; otherwise <c>false</c>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="offset"/> or <paramref name="count"/> is negative, or the window
        /// does not fit inside <paramref name="buffer"/>.
        /// </exception>
        public static bool IsText(byte[] buffer, int offset, int count)
        {
            if (buffer == null)
            {
                throw new ArgumentNullException(nameof(buffer));
            }
            if (offset < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(offset));
            }
            if (count < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(count));
            }
            // Written as a subtraction so that offset + count cannot overflow.
            if (buffer.Length - offset < count)
            {
                throw new ArgumentOutOfRangeException(nameof(buffer));
            }

            // [Stage 1] Contains unicode BOM -> text
            // The guards compare against count (not offset + count) so that only bytes
            // inside the requested window are ever read.
            if (3 <= count &&
                buffer[offset] == Utf8Bom[0] && buffer[offset + 1] == Utf8Bom[1] && buffer[offset + 2] == Utf8Bom[2])
            {
                return(true);
            }
            if (2 <= count)
            {
                if (buffer[offset] == Utf16LeBom[0] && buffer[offset + 1] == Utf16LeBom[1])
                {
                    return(true);
                }
                if (buffer[offset] == Utf16BeBom[0] && buffer[offset + 1] == Utf16BeBom[1])
                {
                    return(true);
                }
            }

            // [Stage 2] Check if a chunk can be decoded as system default ANSI locale.
            // Many multibyte encodings have 'unused area'. If a file contains one of these area, treat it as a binary.
            // Ex) EUC-KR's layout : https://en.wikipedia.org/wiki/CP949#/media/File:Unified_Hangul_Code.svg
            bool     isText  = true;
            Encoding ansiEnc = Encoding.GetEncoding(DefaultAnsi.CodePage, new EncoderExceptionFallback(), new DecoderExceptionFallback());

            try
            {
                // ReSharper disable once ReturnValueOfPureMethodIsNotUsed
                ansiEnc.GetChars(buffer, offset, count);
            }
            catch (DecoderFallbackException)
            { // Failure
                isText = false;
            }

            // [Stage 3]
            // Problem: Some encodings make use of 128-255 area, so every byte is valid. (e.g. Windows-1252 / CP437)
            // To counter this issue, if the file seems to be text, check again with TextEncodingDetect.
            if (isText)
            {
                TextEncodingDetect detect = new TextEncodingDetect();
                byte[]             idxZeroBuffer;
                if (offset == 0)
                {
                    // Window already starts at index 0: reuse the buffer, no copy needed.
                    idxZeroBuffer = buffer;
                }
                else
                {
                    idxZeroBuffer = new byte[count];
                    Array.Copy(buffer, offset, idxZeroBuffer, 0, count);
                }

                // Pass count rather than the array length: when the buffer is reused it
                // may be longer than the requested window.
                switch (detect.DetectEncoding(idxZeroBuffer, count))
                {
                // Binary
                case TextEncodingDetect.Encoding.None:
                // PEBakery mandates unicode text to have BOM.
                // They must have been filtered out in stage 1.
                case TextEncodingDetect.Encoding.Utf16LeBom:
                case TextEncodingDetect.Encoding.Utf16BeBom:
                case TextEncodingDetect.Encoding.Utf8Bom:
                // Treat unicode text file without a BOM as a binary.
                case TextEncodingDetect.Encoding.Utf16LeNoBom:
                case TextEncodingDetect.Encoding.Utf16BeNoBom:
                case TextEncodingDetect.Encoding.Utf8NoBom:
                    isText = false;
                    break;
                }
            }

            return(isText);
        }