/// <summary>
/// Detects the character encoding of a byte buffer.
/// </summary>
/// <param name="arrayByte">Raw bytes to analyse; <c>null</c> or an empty array yields an empty list.</param>
/// <returns>
/// A list holding the detected encoding when the detector reports a charset with a
/// confidence above 0.8 and that charset can be resolved by <c>Encoding.GetEncoding</c>;
/// otherwise an empty list.
/// </returns>
public static List <Encoding> GetFileEncoding(byte[] arrayByte)
        {
            List <Encoding> result = new List <Encoding>();

            // Nothing to analyse: return early instead of dereferencing a null buffer below.
            if (arrayByte == null || arrayByte.Length == 0)
            {
                return result;
            }

            CharsetDetector detector = new CharsetDetector();

            detector.Feed(arrayByte, 0, arrayByte.Length);
            detector.DataEnd();

            // Only accept detections that name a charset and clear the 0.8 confidence bar.
            if (!string.IsNullOrEmpty(detector.Charset) && detector.Confidence > 0.8f)
            {
                try
                {
                    result.Add(Encoding.GetEncoding(detector.Charset));
                }
                catch (Exception ex)
                {
                    // The charset name could not be resolved; report it and fall through
                    // to return the (empty) list.
                    DTEHelper.WriteExceptionToOutput(null, ex);

#if DEBUG
                    if (System.Diagnostics.Debugger.IsAttached)
                    {
                        System.Diagnostics.Debugger.Break();
                    }
#endif
                }
            }

            return result;
        }
예제 #2
0
파일: Udetect.cs 프로젝트: Gpower2/AcTools
        /// <summary>
        /// Console sample: prints the charset detected for the file named on the command line.
        /// </summary>
        /// <param name="args">command-line arguments; only the first one (a file name) is used</param>
        public static void Main(String[] args)
        {
            if (args.Length < 1)
            {
                Console.WriteLine("Usage: udetect <filename>");
                return;
            }

            using (FileStream input = File.OpenRead(args[0]))
            {
                ICharsetDetector detector = new CharsetDetector();
                detector.Feed(input);
                detector.DataEnd();

                // No charset name means the detector could not decide.
                if (detector.Charset == null)
                {
                    Console.WriteLine("Detection failed.");
                    return;
                }

                Console.WriteLine("Charset: {0}, confidence: {1}",
                                  detector.Charset, detector.Confidence);
            }
        }
예제 #3
0
        /// <summary>
        /// Detects the charset of an in-memory byte buffer.
        /// </summary>
        /// <param name="bytes">Bytes to analyse; may be <c>null</c>.</param>
        /// <returns>
        /// The charset name reported by the detector, or <c>null</c> when
        /// <paramref name="bytes"/> is <c>null</c>, no charset was detected,
        /// or an <see cref="IOException"/> was raised while reading.
        /// </returns>
        private string DetectCharset(byte[] bytes)
        {
            // MemoryStream rejects a null buffer with ArgumentNullException, which the
            // IOException handler below would not catch; treat it as "nothing detected".
            if (bytes == null)
            {
                return null;
            }

            try
            {
                using (var ms = new MemoryStream(bytes))
                {
                    var detector = new CharsetDetector();
                    detector.Feed(ms);
                    detector.DataEnd();

                    return detector.Charset;
                }
            }
            catch (IOException ex)
            {
                _logger.ErrorException("Error attempting to determine web socket message charset", ex);
            }

            return null;
        }
예제 #4
0
        /// <summary>
        /// Detects the charset of the file at <paramref name="path"/>.
        /// </summary>
        /// <param name="path">Path of the file to analyse.</param>
        /// <returns>
        /// The charset name reported by the detector, or <c>null</c> when nothing was
        /// detected or an <see cref="IOException"/> occurred while reading the file.
        /// </returns>
        private string DetectCharset(string path)
        {
            try
            {
                // Detection only reads. The two-argument FileStream constructor asks for
                // read/write access, which fails on read-only files, so request read access
                // explicitly (sharing stays at Read, as before).
                using (var file = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read))
                {
                    var detector = new CharsetDetector();
                    detector.Feed(file);
                    detector.DataEnd();

                    var charset = detector.Charset;

                    if (!string.IsNullOrWhiteSpace(charset))
                    {
                        _logger.Info("UniversalDetector detected charset {0} for {1}", charset, path);
                    }

                    return charset;
                }
            }
            catch (IOException ex)
            {
                _logger.ErrorException("Error attempting to determine subtitle charset from {0}", ex, path);
            }

            return null;
        }
예제 #5
0
        /// <summary>
        /// Tries to detect the encoding of a byte buffer.
        /// </summary>
        /// <param name="data">Bytes to analyse; <c>null</c> is treated as a failed detection.</param>
        /// <param name="encoding">
        /// The detected encoding on success; <see cref="Encoding.Default"/> on failure.
        /// </param>
        /// <returns>
        /// <c>true</c> when a charset was detected and resolved to an <see cref="Encoding"/>;
        /// otherwise <c>false</c>.
        /// </returns>
        public static bool TryGetEncoding(byte[] data, out Encoding encoding)
        {
            encoding = Encoding.Default;

            if (data == null)
            {
                return false;
            }

            ICharsetDetector cdet = new CharsetDetector();

            cdet.Feed(data, 0, data.Length);
            cdet.DataEnd();

            string charset = cdet.Charset;
            if (charset == null)
            {
                return false;
            }

            // Map the detector's "big-5" spelling to the "big5" name handed to Encoding.GetEncoding.
            if (string.Equals(charset, "big-5", StringComparison.OrdinalIgnoreCase))
            {
                charset = "big5";
            }

            // Both lookups go through the same guarded call so that this Try-method
            // reports an unresolvable name as false instead of throwing.
            try
            {
                encoding = Encoding.GetEncoding(charset);
                return true;
            }
            catch (ArgumentException)
            {
                return false;
            }
            catch (NotSupportedException)
            {
                return false;
            }
        }
        /// <summary>
        /// Detects the encoding of a byte buffer.
        /// </summary>
        /// <param name="data">Bytes to analyse; <c>null</c> is treated as "nothing detected".</param>
        /// <returns>
        /// The detected encoding when the detector names a charset with confidence above 0.5;
        /// <see cref="Encoding.Default"/> when that charset name cannot be resolved;
        /// <c>null</c> when nothing was detected with enough confidence.
        /// </returns>
        public static Encoding GetEncoding(byte[] data)
        {
            if (data == null)
            {
                return null;
            }

            ICharsetDetector cdet = new CharsetDetector();

            cdet.Feed(data, 0, data.Length);
            cdet.DataEnd();

            string charset = cdet.Charset;
            if (charset == null || !(cdet.Confidence > 0.5))
            {
                return null;
            }

            // Map the detector's "big-5" spelling to the "big5" name handed to Encoding.GetEncoding.
            if (string.Equals(charset, "big-5", StringComparison.OrdinalIgnoreCase))
            {
                charset = "big5";
            }

            // Resolve both the remapped and the plain name under the same fallback, so an
            // unsupported "big5" behaves like any other unsupported charset.
            try
            {
                return Encoding.GetEncoding(charset);
            }
            catch (ArgumentException)
            {
                return Encoding.Default;
            }
            catch (NotSupportedException)
            {
                return Encoding.Default;
            }
        }
예제 #7
0
        /// <summary>
        /// Opens the stream returned by <c>GetStream</c> for <paramref name="path"/> and runs
        /// charset detection over it.
        /// </summary>
        /// <param name="path">Location passed to <c>GetStream</c> and included in log messages.</param>
        /// <param name="protocol">Protocol passed through to <c>GetStream</c>.</param>
        /// <param name="cancellationToken">Token passed through to <c>GetStream</c>.</param>
        /// <returns>
        /// The detected charset name, or <c>null</c> when nothing was detected or an
        /// <see cref="IOException"/> occurred.
        /// </returns>
        private async Task <string> DetectCharset(string path, MediaProtocol protocol, CancellationToken cancellationToken)
        {
            try
            {
                using (var input = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false))
                {
                    var sniffer = new CharsetDetector();
                    sniffer.Feed(input);
                    sniffer.DataEnd();

                    string detected = sniffer.Charset;
                    bool hasCharset = !string.IsNullOrWhiteSpace(detected);

                    if (hasCharset)
                    {
                        _logger.Info("UniversalDetector detected charset {0} for {1}", detected, path);
                    }

                    return detected;
                }
            }
            catch (IOException ioError)
            {
                _logger.ErrorException("Error attempting to determine subtitle charset from {0}", ioError, path);
                return null;
            }
        }
        /// <summary>
        /// Runs charset detection on a file and asks <c>EncodingChecker.isAscii</c> about the result,
        /// retrying when the file cannot be read.
        /// </summary>
        /// <param name="fileName">Path of the file to inspect.</param>
        /// <returns>
        /// The value returned by <c>EncodingChecker.isAscii</c>, or <c>false</c> when every
        /// attempt ended in an <see cref="IOException"/>.
        /// </returns>
        private bool isAscii(string fileName)
        {
            const int maxAttempts = 10;
            const int retryDelayMilliseconds = 1000;

            for (int attempt = 1; attempt <= maxAttempts; attempt++)
            {
                try
                {
                    lock (SyncRoot)
                    {
                        using (FileStream fs = File.OpenRead(fileName))
                        {
                            var cdet = new CharsetDetector();
                            cdet.Feed(fs);
                            cdet.DataEnd();

                            return EncodingChecker.isAscii(cdet);
                        }
                    }
                }
                catch (IOException)
                {
                    // Wait before the next attempt only; after the last failure there is
                    // nothing left to wait for.
                    if (attempt < maxAttempts)
                    {
                        Thread.Sleep(retryDelayMilliseconds);
                    }
                }
            }

            return false;
        }
예제 #9
0
        /// <summary>
        ///     Console sample: prints, in colour, the charset detected for the file named
        ///     on the command line, then calls <c>Exit</c>.
        /// </summary>
        /// <param name="args">command-line arguments; only the first one (a file name) is used</param>
        public static void Main(string[] args)
        {
            if (args.Length < 1)
            {
                Console.WriteLine("Usage: udetect <filename>");
                return;
            }

            using (var input = File.OpenRead(args[0]))
            {
                ICharsetDetector detector = new CharsetDetector();
                detector.Feed(input);
                detector.DataEnd();

                Console.WriteLine();

                // Green for a successful detection, red for a failure.
                bool detected = detector.Charset != null;
                Console.ForegroundColor = detected ? ConsoleColor.Green : ConsoleColor.Red;

                if (detected)
                {
                    Console.WriteLine("Charset: {0}, confidence: {1}", detector.Charset, detector.Confidence);
                }
                else
                {
                    Console.WriteLine("Detection failed.");
                }

                Console.ResetColor();
            }

            Exit();
        }
        /// <summary>
        /// Rewrites a file as UTF-8, decoding it with the charset reported by the detector.
        /// </summary>
        /// <param name="fileName">Path of the file to detect, read and overwrite.</param>
        /// <returns>
        /// <c>false</c> when <c>isAscii</c> rejects the detection result (the file is left
        /// untouched); <c>true</c> after the file has been rewritten.
        /// </returns>
        /// <remarks>
        /// NOTE(review): the detected charset is handed to <c>Encoding.GetEncoding</c> without
        /// a null check - confirm that a <c>true</c> result from <c>isAscii</c> implies a
        /// non-null charset.
        /// </remarks>
        private bool convertToUtf8(string fileName)
        {
            string detectedName;
            bool   accepted;

            // Detection runs under the shared lock; the rewrite below does not.
            lock (SyncRoot)
            {
                using (FileStream input = File.OpenRead(fileName))
                {
                    var detector = new CharsetDetector();
                    detector.Feed(input);
                    detector.DataEnd();

                    detectedName = detector.Charset;
                    accepted     = isAscii(detector);
                }
            }

            if (!accepted)
            {
                return false;
            }

            Encoding source  = Encoding.GetEncoding(detectedName);
            string   content = File.ReadAllText(fileName, source);

            File.WriteAllText(fileName, content, Encoding.UTF8);
            return true;
        }
예제 #11
0
    /// <summary>
    /// Reads a whole file as text, decoding it with the detected charset when possible
    /// and with <c>CurrentEncoding</c> otherwise.
    /// </summary>
    /// <param name="fileInfo">File to open and read.</param>
    /// <returns>The complete decoded content of the file.</returns>
    public static string LoadFile(FileInfo fileInfo)
    {
        Encoding chosen = CurrentEncoding;

        using (Stream input = fileInfo.OpenRead())
        {
            CharsetDetector sniffer = new CharsetDetector();
            sniffer.Feed(input);
            sniffer.DataEnd();

            // Rewind so the reader below starts from the first byte again.
            input.Position = 0;

            if (sniffer.Charset == null)
            {
                Debug.LogFormat("Failed to detect charset, will use default encoding.");
            }
            else
            {
                Debug.LogFormat("Detected charset of file: {0}", sniffer.Charset);
                try
                {
                    chosen = Encoding.GetEncoding(sniffer.Charset);
                }
                catch
                {
                    Debug.LogWarning("Failed to load encoding, will use default encoding.");
                    chosen = CurrentEncoding;
                }
            }

            using (StreamReader reader = new StreamReader(input, chosen))
            {
                return reader.ReadToEnd();
            }
        }
    }
예제 #12
0
        /// <summary>
        /// Runs charset detection over <paramref name="stream"/> and converts the resulting
        /// charset name with <c>Encoding.GetEncoding</c> through the <c>Return</c> helper.
        /// </summary>
        /// <param name="stream">Stream fed to the detector.</param>
        /// <returns>Whatever <c>Return</c> yields for the detected charset name, with <c>null</c> as its fallback argument.</returns>
        public Encoding Detect(Stream stream)
        {
            var sniffer = new CharsetDetector();
            sniffer.Feed(stream);
            sniffer.DataEnd();

            var charsetName = sniffer.Charset;
            return charsetName.Return(Encoding.GetEncoding, null);
        }
예제 #13
0
 /// <summary>
 /// Detects the encoding of the file at <paramref name="path"/>.
 /// </summary>
 /// <param name="path">Path of the file to open for reading.</param>
 /// <returns>The encoding for the detected charset, or <c>null</c> when no charset was detected.</returns>
 private static Encoding GetEncoding(string path)
 {
     using (var input = File.OpenRead(path))
     {
         var sniffer = new CharsetDetector();
         sniffer.Feed(input);
         sniffer.DataEnd();

         if (sniffer.Charset == null)
         {
             return null;
         }

         return Encoding.GetEncoding(sniffer.Charset);
     }
 }
예제 #14
0
 /// <summary>
 /// Detects the encoding of a file, defaulting to UTF-8.
 /// </summary>
 /// <param name="filename">Path of the file to open for reading.</param>
 /// <returns>The encoding for the detected charset, or <see cref="Encoding.UTF8"/> when no charset was detected.</returns>
 public static Encoding DetectEncoding(string filename)
 {
     using (var input = File.OpenRead(filename))
     {
         var sniffer = new CharsetDetector();
         sniffer.Feed(input);
         sniffer.DataEnd();

         if (sniffer.Charset == null)
         {
             return Encoding.UTF8;
         }

         return Encoding.GetEncoding(sniffer.Charset);
     }
 }
예제 #15
0
        /// <summary>
        /// <see cref="BackgroundWorker"/> handler: walks the files under the requested directory,
        /// detects each matching file's charset and reports it as progress.
        /// </summary>
        /// <param name="sender">The <see cref="BackgroundWorker"/> running this handler.</param>
        /// <param name="e">Carries the <c>WorkerArgs</c> in <c>Argument</c>; <c>Cancel</c> is set when cancellation is pending.</param>
        private static void ActionWorkerDoWork(object sender, DoWorkEventArgs e)
        {
            var worker  = (BackgroundWorker)sender;
            var request = (WorkerArgs)e.Argument;

            string[] candidates = Directory.GetFiles(
                request.BaseDirectory,
                "*.*",
                request.IncludeSubdirectories ? SearchOption.AllDirectories : SearchOption.TopDirectoryOnly);

            IEnumerable <Regex> masks = GenerateMaskPatterns(request.FileMasks);

            int total = candidates.Length;
            for (int index = 0; index < total; index++)
            {
                if (worker.CancellationPending)
                {
                    e.Cancel = true;
                    return;
                }

                string fullPath = candidates[index];
                string name     = Path.GetFileName(fullPath);
                if (!SatisfiesMaskPatterns(name, masks))
                {
                    continue;
                }

                var detector = new CharsetDetector();
                using (var input = new FileStream(fullPath, FileMode.Open, FileAccess.Read))
                {
                    detector.Feed(input);
                    detector.DataEnd();
                }
                string charset = detector.Charset;

                // When validating, files with an unknown charset or one of the
                // accepted charsets are not reported.
                bool validating = request.Action == CurrentAction.Validate;
                if (validating && (charset == null || request.ValidCharsets.Contains(charset)))
                {
                    continue;
                }

                var progress = new WorkerProgress
                {
                    Charset       = charset ?? "(Unknown)",
                    FileName      = name,
                    DirectoryName = Path.GetDirectoryName(fullPath)
                };
                worker.ReportProgress((index * 100) / total, progress);
            }
        }
        /// <summary>
        /// Detects the charset of a file and maps it to an encoding via
        /// <c>EncodingHelper.GetEncodingFromCharSet</c>.
        /// </summary>
        /// <param name="fullPath">Path of the file to open for reading.</param>
        /// <returns>The value returned by <c>EncodingHelper.GetEncodingFromCharSet</c> for the detected charset.</returns>
        public static Encoding GetFileEncoding(string fullPath)
        {
            string detectedName;

            // The file is closed again before the charset is mapped to an encoding.
            using (var input = File.OpenRead(fullPath))
            {
                var sniffer = new CharsetDetector();
                sniffer.Feed(input);
                sniffer.DataEnd();
                detectedName = sniffer.Charset;
            }

            return EncodingHelper.GetEncodingFromCharSet(detectedName);
        }
예제 #17
0
        /// <summary>
        /// Detects the encoding of a byte buffer.
        /// </summary>
        /// <param name="data">Bytes fed to the detector in full.</param>
        /// <returns>
        /// The encoding for the detected charset when the confidence exceeds 0.5;
        /// otherwise <c>null</c>.
        /// </returns>
        private static Encoding GetEncoding(byte[] data)
        {
            ICharsetDetector sniffer = new CharsetDetector();
            sniffer.Feed(data, 0, data.Length);
            sniffer.DataEnd();

            // Reject missing or low-confidence detections up front.
            if (sniffer.Charset == null || !(sniffer.Confidence > 0.5))
            {
                return null;
            }

            return Encoding.GetEncoding(sniffer.Charset);
        }
예제 #18
0
        /// <summary>
        /// Detects the charset of the file at <paramref name="filePath"/>.
        /// </summary>
        /// <param name="filePath">Path of the file to open for reading.</param>
        /// <returns>The charset name reported by the detector.</returns>
        private static string DetectFileCharset(string filePath)
        {
            using (var input = File.OpenRead(filePath))
            {
                var sniffer = new CharsetDetector();
                sniffer.Feed(input);
                sniffer.DataEnd();

                return sniffer.Charset;
            }
        }
예제 #19
0
        /// <summary>
        /// Creates a detector and feeds it the whole content of <paramref name="file"/>.
        /// </summary>
        /// <param name="file">Path of the file to open for reading.</param>
        /// <returns>The detector after <c>DataEnd</c> has been called, ready to be queried.</returns>
        private static ICharsetDetector GetCharsetDetector(string file)
        {
            using (var input = File.OpenRead(file))
            {
                ICharsetDetector detector = new CharsetDetector();
                detector.Feed(input);
                detector.DataEnd();

                return detector;
            }
        }
예제 #20
0
        /// <summary>
        /// Estimates the character encoding of the given byte array.
        /// </summary>
        /// <param name="bytes">Byte array to inspect.</param>
        /// <param name="verifiedLength">Number of leading bytes to examine.</param>
        /// <returns>
        /// The detected encoding, or <c>null</c> when the array is empty or no method
        /// could determine an encoding.
        /// </returns>
        public static Encoding DetectEncoding(byte[] bytes, int verifiedLength)
        {
            const int MinimumLength = 256;

            int originalLength = bytes.Length;

            // An empty array has no encoding.
            if (originalLength == 0)
            {
                return null;
            }

            // A byte-order mark, when present, decides the result.
            var fromBom = DetectEncodingByBom(bytes);
            if (fromBom != null)
            {
                return fromBom;
            }

            // Shape the sample that is handed to the detector.
            byte[] sample = bytes;
            if (originalLength < MinimumLength)
            {
                // Too short: repeat the content until the sample exceeds the minimum length.
                int repeats = MinimumLength / originalLength + 1;
                sample = new byte[repeats * originalLength];
                for (int offset = 0; offset < sample.Length; offset += originalLength)
                {
                    Array.Copy(bytes, 0, sample, offset, originalLength);
                }
            }
            else if (verifiedLength >= MinimumLength && verifiedLength < originalLength)
            {
                // A usable length was requested: keep only that many leading bytes.
                sample = new byte[verifiedLength];
                Array.Copy(bytes, 0, sample, 0, verifiedLength);
            }

            // Detection with the Mozilla Universal Charset Detector.
            var detector = new CharsetDetector();
            detector.Feed(sample, 0, sample.Length);
            detector.DataEnd();

            // No charset from any method means null.
            return string.IsNullOrEmpty(detector.Charset)
                ? null
                : Encoding.GetEncoding(detector.Charset);
        }
예제 #21
0
        /// <summary>
        /// Detects the encoding of an external file.
        /// </summary>
        /// <param name="path">Path of the file to open for reading.</param>
        /// <returns>The encoding for the detected charset, or <c>null</c> when no charset was detected.</returns>
        public static Encoding GetEncoding(string path)
        {
            using (var input = File.OpenRead(path))
            {
                var sniffer = new CharsetDetector();
                sniffer.Feed(input);
                sniffer.DataEnd();

                return string.IsNullOrEmpty(sniffer.Charset)
                    ? null
                    : Encoding.GetEncoding(sniffer.Charset);
            }
        }
예제 #22
0
        /// <summary>
        /// Detects the charset of <paramref name="file"/> and resolves it to an
        /// <see cref="Encoding"/>, unless the file already appears to use
        /// <paramref name="targetEncoding"/>.
        /// </summary>
        /// <param name="file">File whose content is fed to the detector.</param>
        /// <param name="targetEncoding">Encoding whose <c>WebName</c> is compared with the detected charset.</param>
        /// <returns>
        /// The resolved encoding, or <c>null</c> when the detected charset equals the target,
        /// nothing was detected, the confidence is below 0.9, or the charset name is not
        /// recognised by <c>Encoding.GetEncoding</c>. Every outcome except "equals the
        /// target" is reported on the console.
        /// </returns>
        private static Encoding DetectWithUDE(FileInfo file, Encoding targetEncoding)
        {
            const double MinimumConfidence = 0.9;

            var detector = new CharsetDetector();

            using (var stream = file.OpenRead())
            {
                detector.Feed(stream);
            }
            detector.DataEnd();

            var detected = detector.Charset;

            // Charset names are identifiers, so compare them ordinally rather than
            // with culture-sensitive rules.
            if (targetEncoding.WebName.Equals(detected, StringComparison.OrdinalIgnoreCase))
            {
                return null;
            }

            if (string.IsNullOrEmpty(detected))
            {
                Console.WriteLine($"[-] Charset undetected in file '{file.Name}'- skipped!");
                return null;
            }

            if (detector.Confidence < MinimumConfidence)
            {
                Console.WriteLine($"[-] Charset detected as '{detected}' with confidence {Math.Round(detector.Confidence * 100)}% in file '{file.Name}' - skipped - too low confidence!");
                return null;
            }

            Console.WriteLine($"[+] Charset detected as '{detected}' with confidence {Math.Round(detector.Confidence * 100)}% in file '{file.Name}'");

            var charset = detected;

            if (charset == "windows-1252") // incorrect detection in our case - may not be correct in all cases
            {
                charset = "windows-1250";
            }

            try
            {
                return Encoding.GetEncoding(charset);
            }
            catch (ArgumentException)
            {
                // Report the name that was actually looked up, which differs from the
                // detected one when the remap above applied.
                Console.WriteLine($"[!] Encoding '{charset}' not recognized by .NET Framework");
                return null;
            }
        }
예제 #23
0
    /// <summary>
    /// Detects the encoding of a file, defaulting to ASCII.
    /// </summary>
    /// <param name="path">Path of the file to open for reading.</param>
    /// <returns>The encoding for the detected charset, or <see cref="Encoding.ASCII"/> when no charset was detected.</returns>
    private static Encoding findEncoding(string path)
    {
        using (FileStream input = File.OpenRead(path))
        {
            CharsetDetector sniffer = new CharsetDetector();
            sniffer.Feed(input);
            sniffer.DataEnd();

            return sniffer.Charset != null
                ? Encoding.GetEncoding(sniffer.Charset)
                : Encoding.ASCII;
        }
    }
        /// <summary>
        /// Detects the encoding of the content of <paramref name="stream"/>.
        /// </summary>
        /// <param name="stream">Stream fed to the detector; may be <c>null</c>.</param>
        /// <returns>
        /// The encoding for the detected charset, or <c>null</c> when the stream is
        /// <c>null</c> or no charset was detected.
        /// </returns>
        public static Encoding DetectEncoding(Stream stream)
        {
            if (stream != null)
            {
                var sniffer = new CharsetDetector();
                sniffer.Feed(stream);
                sniffer.DataEnd();

                if (sniffer.Charset != null)
                {
                    return Encoding.GetEncoding(sniffer.Charset);
                }
            }

            return null;
        }
예제 #25
0
        /// <summary>
        /// Detects the encoding of a byte buffer, defaulting to ASCII.
        /// </summary>
        /// <param name="bytes">Bytes fed to the detector.</param>
        /// <returns>
        /// The encoding for the detected charset when the detector names one with a
        /// confidence above 0.5; otherwise <see cref="Encoding.ASCII"/>.
        /// </returns>
        public static Encoding DetectCharset(Byte[] bytes)
        {
            CharsetDetector detector = new CharsetDetector();

            detector.Feed(bytes);
            detector.DataEnd();

            // Require a charset name as well as the confidence: handing a null name to
            // Encoding.GetEncoding would throw instead of falling back to ASCII.
            if (detector.Confidence > 0.5 && !string.IsNullOrEmpty(detector.Charset))
            {
                return Encoding.GetEncoding(detector.Charset);
            }

            return Encoding.ASCII;
        }
예제 #26
0
        /// <summary>
        /// Reads a whole file into memory, detects its charset and resolves it to an encoding.
        /// A detected "windows-1255" is replaced by "windows-1251" before the lookup.
        /// </summary>
        /// <param name="FullFileName">Path of the file to read.</param>
        /// <returns>The encoding resolved by <c>Encoding.GetEncoding</c>.</returns>
        /// <remarks>
        /// NOTE(review): the detected charset is not checked for <c>null</c> before it is
        /// passed to <c>Encoding.GetEncoding</c> - confirm how callers expect a failed
        /// detection to surface.
        /// </remarks>
        private static Encoding GetEncoding(string FullFileName)
        {
            var sniffer = new CharsetDetector();

            byte[] content = File.ReadAllBytes(FullFileName);
            sniffer.Feed(content, 0, content.Length);
            sniffer.DataEnd();

            string charsetName = sniffer.Charset == "windows-1255"
                ? "windows-1251"
                : sniffer.Charset;

            return Encoding.GetEncoding(charsetName);
        }
예제 #27
0
        /// <summary>
        /// Detects the charset of a byte buffer and returns its name in lower case.
        /// </summary>
        /// <param name="bytes">Bytes fed to the detector in full.</param>
        /// <returns>
        /// The lower-cased (invariant) charset name, or <c>null</c> when nothing was
        /// detected or any exception was thrown during detection.
        /// </returns>
        public static string DetectCharset(byte[] bytes)
        {
            try
            {
                var sniffer = new CharsetDetector();
                sniffer.Feed(bytes, 0, bytes.Length);
                sniffer.DataEnd();

                if (sniffer.Charset == null)
                {
                    return null;
                }

                return sniffer.Charset.ToLowerInvariant();
            }
            catch (Exception)
            {
                return null;
            }
        }
 /// <summary>
 /// Detects the charset of a file and reports the detector's confidence.
 /// </summary>
 /// <param name="filename">Path of the file to open for reading.</param>
 /// <param name="confidence">The detector's confidence on success; 0 when detection failed.</param>
 /// <returns>The detected charset name, or "UTF-8" when detection failed.</returns>
 public static string GetEncoding(string filename, out float confidence)
 {
     confidence = 0.0f;

     using (var input = File.OpenRead(filename))
     {
         ICharsetDetector sniffer = new CharsetDetector();
         sniffer.Feed(input);
         sniffer.DataEnd();

         if (sniffer.Charset == null)
         {
             Logger.Log($"{filename}: Detection failed.");
             return "UTF-8";
         }

         Logger.Log($"Charset: {sniffer.Charset}, confidence: {sniffer.Confidence}");
         confidence = sniffer.Confidence;
         return sniffer.Charset;
     }
 }
예제 #29
0
        /// <summary>
        /// Load handler: detects the charset of <c>CsvFile</c> and creates the
        /// <c>tfp</c> parser with the matching encoding, or ASCII when none was detected.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void CSVImport_Load(object sender, EventArgs e)
        {
            using (FileStream input = File.OpenRead(CsvFile))
            {
                CharsetDetector sniffer = new CharsetDetector();
                sniffer.Feed(input);
                sniffer.DataEnd();

                Encoding csvEncoding = sniffer.Charset == null
                    ? Encoding.ASCII
                    : Encoding.GetEncoding(sniffer.Charset);

                tfp = new TextFieldParser(CsvFile, csvEncoding);
            }
        }
예제 #30
0
 /// <summary>
 /// Detects the encoding of a file, opening it with read access and read/write sharing.
 /// </summary>
 /// <param name="path">Path of the file to open.</param>
 /// <returns>
 /// The encoding for the detected charset, or <c>null</c> when no charset was
 /// detected or any exception was thrown (for example because the file is locked).
 /// </returns>
 private static Encoding GetEncoding(string path)
 {
     try
     {
         using (var input = File.Open(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
         {
             var sniffer = new CharsetDetector();
             sniffer.Feed(input);
             sniffer.DataEnd();

             if (sniffer.Charset == null)
             {
                 return null;
             }

             return Encoding.GetEncoding(sniffer.Charset);
         }
     }
     catch
     {
         // File locks and similar failures count as "unknown encoding".
         return null;
     }
 }
예제 #31
0
파일: Udetect.cs 프로젝트: henricj/FixEol
        /// <summary>
        ///     Console sample: prints the charset detected for the file named on the command line.
        /// </summary>
        /// <param name="args">command-line arguments; only the first one (a file name) is used</param>
        public static void Main(String[] args)
        {
            if (args.Length < 1)
            {
                Console.WriteLine("Usage: udetect <filename>");
                return;
            }

            using (var input = File.OpenRead(args[0]))
            {
                ICharsetDetector detector = new CharsetDetector();
                detector.Feed(input);
                detector.DataEnd();

                // No charset name means the detector could not decide.
                if (detector.Charset == null)
                {
                    Console.WriteLine("Detection failed.");
                }
                else
                {
                    Console.WriteLine("Charset: {0}, confidence: {1}",
                        detector.Charset, detector.Confidence);
                }
            }
        }
예제 #32
0
        /// <summary>
        /// Opens the stream returned by <c>GetStream</c> for <paramref name="path"/> and runs
        /// charset detection over it, discarding an "x-mac-cyrillic" result when a
        /// language is known.
        /// </summary>
        /// <param name="path">Location passed to <c>GetStream</c> and included in log messages.</param>
        /// <param name="language">Language hint; when non-blank, an "x-mac-cyrillic" detection is discarded.</param>
        /// <param name="protocol">Protocol passed through to <c>GetStream</c>.</param>
        /// <param name="cancellationToken">Token passed through to <c>GetStream</c>.</param>
        /// <returns>
        /// The detected charset name, or <c>null</c> when nothing was detected, the result
        /// was discarded, or an <see cref="IOException"/> occurred.
        /// </returns>
        private async Task<string> DetectCharset(string path, string language, MediaProtocol protocol, CancellationToken cancellationToken)
        {
            try
            {
                using (var input = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false))
                {
                    var sniffer = new CharsetDetector();
                    sniffer.Feed(input);
                    sniffer.DataEnd();

                    string detected = sniffer.Charset;

                    if (!string.IsNullOrWhiteSpace(detected))
                    {
                        _logger.Info("UniversalDetector detected charset {0} for {1}", detected, path);
                    }

                    // This charset is often detected incorrectly. When a language is known,
                    // return null so that other techniques can be used instead.
                    bool discard = string.Equals("x-mac-cyrillic", detected, StringComparison.OrdinalIgnoreCase)
                                   && !string.IsNullOrWhiteSpace(language);

                    return discard ? null : detected;
                }
            }
            catch (IOException ioError)
            {
                _logger.ErrorException("Error attempting to determine subtitle charset from {0}", ioError, path);
                return null;
            }
        }
예제 #33
0
        /// <summary>
        /// Runs charset detection over the stream that <c>GetStream</c> returns for
        /// <paramref name="path"/>.
        /// </summary>
        /// <param name="path">Location passed to <c>GetStream</c> and included in log messages.</param>
        /// <param name="protocol">Protocol passed through to <c>GetStream</c>.</param>
        /// <param name="cancellationToken">Token passed through to <c>GetStream</c>.</param>
        /// <returns>
        /// The detected charset name, or <c>null</c> when nothing was detected or an
        /// <see cref="IOException"/> occurred.
        /// </returns>
        private async Task<string> DetectCharset(string path, MediaProtocol protocol, CancellationToken cancellationToken)
        {
            try
            {
                using (var source = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false))
                {
                    var charsetDetector = new CharsetDetector();
                    charsetDetector.Feed(source);
                    charsetDetector.DataEnd();

                    string name = charsetDetector.Charset;

                    // Only a non-blank result is worth logging.
                    if (string.IsNullOrWhiteSpace(name))
                    {
                        return name;
                    }

                    _logger.Info("UniversalDetector detected charset {0} for {1}", name, path);
                    return name;
                }
            }
            catch (IOException readError)
            {
                _logger.ErrorException("Error attempting to determine subtitle charset from {0}", readError, path);
                return null;
            }
        }