/// <summary>
        /// Runs encoding detection over files found recursively under <c>_tempDir</c>,
        /// reports each file name with its detected encoding through <c>WriteToConsole</c>,
        /// and prints how many files had an encoding detected.
        /// </summary>
        /// <param name="encDetectorOptions">Detector flags passed through to <c>EncodingDetector.Detect</c></param>
        private void RunTest(EncodingDetector.Options encDetectorOptions)
        {
            // The loop stops once more than this many files have been processed.
            const int fileCap = 10;

            int processed = 0;
            int detected  = 0;

            foreach (string path in Directory.EnumerateFiles(_tempDir, "*.*", SearchOption.AllDirectories))
            {
                Encoding found = EncodingDetector.Detect(Utils.ReadFileContentSample(path), encDetectorOptions);
                processed++;

                if (found != null)
                {
                    detected++;
                }

                string report = Path.GetFileName(path) + ": " + found;
                WriteToConsole(report);

                if (processed > fileCap)
                {
                    break;
                }
            }

            Console.WriteLine("Found Encoding in:" + detected + " out of " + processed);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Tries the detectors enabled in <paramref name="opts"/> in order (BOM, heuristics, MLang),
        /// timing each attempt, and returns the first non-null result.
        /// </summary>
        /// <param name="bytes">Bytes to inspect</param>
        /// <param name="opts">Flags selecting which detectors to run</param>
        /// <param name="defaultEncoding">Value returned when no enabled detector yields an encoding</param>
        /// <returns>The detected encoding, or <paramref name="defaultEncoding"/> when nothing was detected</returns>
        public static Encoding Detect(byte[] bytes, EncodingDetector.Options opts = Options.KlerkSoftBom | Options.MLang, Encoding defaultEncoding = null)
        {
            const string bomTimer        = "DetectEncoding: UsingKlerksSoftBom";
            const string heuristicsTimer = "DetectEncoding: UsingKlerksSoftHeuristics";
            const string mlangTimer      = "DetectEncoding: UsingMLang";

            Encoding detected = null;

            bool useBom = (opts & Options.KlerkSoftBom) == Options.KlerkSoftBom;
            if (useBom)
            {
                StopWatch.Start(bomTimer);
                detected = DetectEncodingUsingKlerksSoftBom(bytes);
                StopWatch.Stop(bomTimer);

                if (detected != null)
                {
                    return detected;
                }
            }

            bool useHeuristics = (opts & Options.KlerkSoftHeuristics) == Options.KlerkSoftHeuristics;
            if (useHeuristics)
            {
                StopWatch.Start(heuristicsTimer);
                detected = DetectEncodingUsingKlerksSoftHeuristics(bytes);
                StopWatch.Stop(heuristicsTimer);

                if (detected != null)
                {
                    return detected;
                }
            }

            bool useMLang = (opts & Options.MLang) == Options.MLang;
            if (useMLang)
            {
                StopWatch.Start(mlangTimer);
                detected = DetectEncodingUsingMLang(bytes);
                StopWatch.Stop(mlangTimer);
            }

            // Fall back to the caller-supplied default only when nothing was detected.
            return detected ?? defaultEncoding;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Maps a performance setting to the set of encoding detectors that should be used.
        /// </summary>
        /// <param name="performanceSetting">Performance setting to translate</param>
        /// <returns>Bit flags of the selected detectors as EncodingDetector.Options</returns>
        /// <history>
        /// [Curtis_Beard]	   05/26/2015	FIX: 69, add performance setting for file detection
        /// </history>
        public static EncodingDetector.Options GetEncodingDetectorOptionsByPerformance(Performance performanceSetting)
        {
            // Every setting uses at least the BOM and WinMerge detectors.
            EncodingDetector.Options selected = EncodingDetector.Options.KlerkSoftBom | EncodingDetector.Options.WinMerge;

            if (performanceSetting == Performance.Speed)
            {
                return selected;
            }

            // Default, Accuracy and any unrecognized setting also use MLang.
            selected |= EncodingDetector.Options.MLang;

            if (performanceSetting == Performance.Accuracy)
            {
                selected |= EncodingDetector.Options.MozillaUCD;
            }

            return selected;
        }
Exemplo n.º 4
0
        /// <summary>
        /// Runs encoding detection over files found recursively under <paramref name="dir"/>,
        /// skipping files that <c>Utils.IsBinaryFile</c> flags, timing each stage with
        /// <c>StopWatch</c>, and printing a summary followed by the collected timings.
        /// </summary>
        /// <param name="encDetectorOptions">Detector flags passed through to <c>EncodingDetector.Detect</c>; their string form is also the timer key for the detection stage</param>
        /// <param name="dir">Root directory to enumerate</param>
        private void RunTest(EncodingDetector.Options encDetectorOptions, string dir)
        {
            // The loop stops once more than this many non-binary files have been processed.
            const int fileCap = 10;

            int processed = 0;
            int detected  = 0;

            var overallTimer = new StopWatch();
            overallTimer.Start();

            string detectTimerKey = encDetectorOptions.ToString();

            foreach (string path in Directory.EnumerateFiles(dir, "*.*", SearchOption.AllDirectories))
            {
                StopWatch.Start("ReadFileSample");
                var sample = Utils.ReadFileContentSample(path);
                StopWatch.Stop("ReadFileSample");

                // Capture the result first so the timer is stopped in one place on both paths.
                StopWatch.Start("IsBinaryFile");
                bool isBinary = Utils.IsBinaryFile(sample);
                StopWatch.Stop("IsBinaryFile");

                if (isBinary)
                {
                    // Binary files are neither counted nor reported.
                    continue;
                }

                StopWatch.Start(detectTimerKey);

                Encoding found = EncodingDetector.Detect(sample, encDetectorOptions);
                processed++;

                if (found != null)
                {
                    detected++;
                }

                StopWatch.Stop(detectTimerKey);

                string report = path + ": " + found;
                WriteToConsole(report);

                if (processed > fileCap)
                {
                    break;
                }
            }

            Console.WriteLine("Found Encoding in:" + detected + " out of " + processed);

            overallTimer.Stop();

            StopWatch.PrintCollection(overallTimer.Milliseconds);
            StopWatch.Collection.Clear();
        }
Exemplo n.º 5
0
        /// <summary>
        /// Detects the byte array's encoding by trying each detector enabled in <paramref name="opts"/>
        /// in a fixed order and returning the first result found.
        /// </summary>
        /// <param name="bytes">File sample byte array</param>
        /// <param name="usedEncoder">Receives the name of the detector option that produced the result, or "Default" when no detector found an encoding</param>
        /// <param name="opts">Flags of encoding detectors to use</param>
        /// <param name="defaultEncoding">Default encoding if nothing detected</param>
        /// <returns>Encoding detected or default if not detected</returns>
        /// <history>
        /// [Curtis_Beard]		02/12/2014	Created
        /// [Curtis_Beard]		12/01/2014	ADD: support for Mozilla encoding detection, remove KlerkSoftHeuristics as a default
        /// </history>
        public static Encoding Detect(byte[] bytes, out string usedEncoder, EncodingDetector.Options opts = Options.KlerkSoftBom | Options.WinMerge | Options.MozillaUCD | Options.MLang, Encoding defaultEncoding = null)
        {
            Encoding encoding;

            // NOTE: this order determines which is run first, usually Mozilla is better than MLang
            if ((opts & Options.KlerkSoftBom) == Options.KlerkSoftBom)
            {
                encoding = DetectEncodingUsingKlerksSoftBom(bytes);

                if (encoding != null)
                {
                    usedEncoder = Options.KlerkSoftBom.ToString();
                    return encoding;
                }
            }

            if ((opts & Options.KlerkSoftHeuristics) == Options.KlerkSoftHeuristics)
            {
                encoding = DetectEncodingUsingKlerksSoftHeuristics(bytes);

                if (encoding != null)
                {
                    usedEncoder = Options.KlerkSoftHeuristics.ToString();
                    return encoding;
                }
            }

            if ((opts & Options.WinMerge) == Options.WinMerge)
            {
                encoding = DetectEncodingUsingWinMerge(bytes);

                if (encoding != null)
                {
                    usedEncoder = Options.WinMerge.ToString();
                    return encoding;
                }
            }

            if ((opts & Options.MozillaUCD) == Options.MozillaUCD)
            {
                encoding = DetectEncodingUsingMozillaUCD(bytes);

                if (encoding != null)
                {
                    usedEncoder = Options.MozillaUCD.ToString();
                    return encoding;
                }
            }

            if ((opts & Options.MLang) == Options.MLang)
            {
                encoding = DetectEncodingUsingMLang(bytes);

                if (encoding != null)
                {
                    usedEncoder = Options.MLang.ToString();
                    return encoding;
                }
            }

            // Every detector branch above returns as soon as it finds an encoding, so
            // reaching this point means nothing was detected; no further null check is needed.
            usedEncoder = "Default";
            return defaultEncoding;
        }