Exemplo n.º 1
0
        // Prepares shared state before the run: calls Program.NativeGlobalInit(),
        // resolves the binary/sample directories, opens the magic database and
        // loads every file named in SrcFileNames into SrcFiles as a byte array.
        public void GlobalSetup()
        {
            Program.NativeGlobalInit();
            _binaryDir     = Path.GetFullPath(AppDomain.CurrentDomain.BaseDirectory);
            _sampleBaseDir = Path.GetFullPath(Path.Combine(_binaryDir, "..", "..", "..", "..", "..", "..", "..", "Samples"));

            _sampleDir    = Path.Combine(_sampleBaseDir, "EncDetect");
            _magicFile    = Path.Combine(_binaryDir, "magic.mgc");
            _magic        = Magic.Open(_magicFile);
            _autoitDetect = new AdvTextEncDetect();

            foreach (string srcFileName in SrcFileNames)
            {
                string srcFile = Path.Combine(_sampleDir, srcFileName);

                // A single Stream.Read call is allowed to return fewer bytes than requested,
                // which would leave the tail of the buffer zero-filled. File.ReadAllBytes
                // keeps reading until the whole file has been consumed.
                SrcFiles[srcFileName] = File.ReadAllBytes(srcFile);
            }
        }
Exemplo n.º 2
0
 /// <summary>
 /// Runs <c>AdvDetect.DetectEncoding</c> on <paramref name="span"/> and converts the
 /// result with <c>AdvTextEncDetect.TextEncodingToBclEncoding</c>.
 /// </summary>
 /// <param name="span">Bytes to inspect.</param>
 /// <param name="textEnc">Receives the raw detection result.</param>
 /// <returns>The value produced by <c>TextEncodingToBclEncoding</c> for the detection result.</returns>
 public static Encoding DetectEncoding(ReadOnlySpan<byte> span, out TextEncoding textEnc)
 {
     TextEncoding detected = AdvDetect.DetectEncoding(span);
     textEnc = detected;
     return AdvTextEncDetect.TextEncodingToBclEncoding(detected);
 }
Exemplo n.º 3
0
        /// <summary>
        /// Heuristically decides whether the region
        /// [<paramref name="offset"/>, <paramref name="offset"/> + <paramref name="count"/>)
        /// of <paramref name="buffer"/> holds text.
        /// </summary>
        /// <param name="buffer">Bytes to inspect.</param>
        /// <param name="offset">Index of the first byte of the region.</param>
        /// <param name="count">Number of bytes in the region.</param>
        /// <returns>
        /// <c>true</c> if the region starts with a UTF-8 / UTF-16 LE / UTF-16 BE BOM, or if it decodes
        /// with the <c>DefaultAnsi</c> code page and the encoding detector does not report it as binary
        /// or as Unicode; otherwise <c>false</c>.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="offset"/> or <paramref name="count"/> is negative, or the region does not fit in <paramref name="buffer"/>.
        /// </exception>
        public static bool IsText(byte[] buffer, int offset, int count)
        {
            if (buffer == null)
            {
                throw new ArgumentNullException(nameof(buffer));
            }
            if (offset < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(offset));
            }
            if (count < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(count));
            }
            // Written as a subtraction so that "offset + count" cannot overflow.
            if (buffer.Length - offset < count)
            {
                throw new ArgumentOutOfRangeException(nameof(buffer));
            }

            // [Stage 1] Contains unicode BOM -> text
            // The guards compare against count (not offset + count) so that no byte
            // outside of the requested region is ever inspected.
            if (3 <= count &&
                buffer[offset] == Utf8Bom[0] && buffer[offset + 1] == Utf8Bom[1] && buffer[offset + 2] == Utf8Bom[2])
            {
                return true;
            }
            if (2 <= count)
            {
                if (buffer[offset] == Utf16LeBom[0] && buffer[offset + 1] == Utf16LeBom[1])
                {
                    return true;
                }
                if (buffer[offset] == Utf16BeBom[0] && buffer[offset + 1] == Utf16BeBom[1])
                {
                    return true;
                }
            }

            // [Stage 2] Check if the chunk can be decoded with the system default ANSI code page.
            // Many multi-byte encodings have an 'unused area'. If the data contains a byte sequence
            // from such an area, treat it as binary.
            // Ex) EUC-KR's layout : https://en.wikipedia.org/wiki/CP949#/media/File:Unified_Hangul_Code.svg
            bool     isText  = true;
            Encoding ansiEnc = Encoding.GetEncoding(DefaultAnsi.CodePage, new EncoderExceptionFallback(), new DecoderExceptionFallback());

            try
            {
                // Only the exception matters here; the decoded chars are discarded.
                // ReSharper disable once ReturnValueOfPureMethodIsNotUsed
                ansiEnc.GetChars(buffer, offset, count);
            }
            catch (DecoderFallbackException)
            { // Failure
                isText = false;
            }

            // [Stage 3]
            // Problem: Some encodings make use of the 128-255 area, so every byte is valid. (e.g. Windows-1252 / CP437)
            // To counter this issue, if the data seems to be text, check again with the encoding detector.
            if (isText)
            {
                AdvTextEncDetect detect = new AdvTextEncDetect();
                byte[]           idxZeroBuffer;
                if (offset == 0)
                {
                    idxZeroBuffer = buffer;
                }
                else
                {
                    idxZeroBuffer = new byte[count];
                    Array.Copy(buffer, offset, idxZeroBuffer, 0, count);
                }

                // Pass count rather than idxZeroBuffer.Length: when offset == 0 the original array
                // is reused and may be longer than the requested region.
                switch (detect.DetectEncoding(idxZeroBuffer, 0, count))
                {
                // Binary
                case TextEncoding.None:
                // PEBakery mandates unicode text to have BOM.
                // They must have been filtered out in stage 1.
                case TextEncoding.Utf16LeBom:
                case TextEncoding.Utf16BeBom:
                case TextEncoding.Utf8Bom:
                // Treat unicode text file without a BOM as a binary.
                case TextEncoding.Utf16LeNoBom:
                case TextEncoding.Utf16BeNoBom:
                case TextEncoding.Utf8NoBom:
                    isText = false;
                    break;
                }
            }

            return isText;
        }
Exemplo n.º 4
0
 /// <summary>
 /// Runs <c>AdvDetect.DetectEncoding</c> on the given region of <paramref name="buffer"/> and
 /// converts the result with <c>AdvTextEncDetect.TextEncodingToBclEncoding</c>.
 /// </summary>
 /// <param name="buffer">Bytes to inspect.</param>
 /// <param name="offset">Offset forwarded to the detector.</param>
 /// <param name="count">Count forwarded to the detector.</param>
 /// <param name="textEnc">Receives the raw detection result.</param>
 /// <returns>The value produced by <c>TextEncodingToBclEncoding</c> for the detection result.</returns>
 public static Encoding DetectEncoding(byte[] buffer, int offset, int count, out TextEncoding textEnc)
 {
     TextEncoding detected = AdvDetect.DetectEncoding(buffer, offset, count);
     textEnc = detected;
     return AdvTextEncDetect.TextEncodingToBclEncoding(detected);
 }