/// <summary>
/// Runs encoding detection against files found under <c>_tempDir</c> (searched recursively)
/// and prints how many of the examined files yielded a non-null encoding.
/// </summary>
/// <param name="encDetectorOptions">Detector flags handed to <c>EncodingDetector.Detect</c>.</param>
private void RunTest(EncodingDetector.Options encDetectorOptions)
{
    int examined = 0;
    int detected = 0;

    foreach (string path in Directory.EnumerateFiles(_tempDir, "*.*", SearchOption.AllDirectories))
    {
        var sample = Utils.ReadFileContentSample(path);
        Encoding found = EncodingDetector.Detect(sample, encDetectorOptions);

        examined++;
        detected += (found != null) ? 1 : 0;

        // Only the file name (not the full path) is reported for each file.
        WriteToConsole(Path.GetFileName(path) + ": " + found);

        // Stop once the running file count exceeds 10.
        if (examined > 10)
        {
            break;
        }
    }

    Console.WriteLine("Found Encoding in:" + detected + " out of " + examined);
}
/// <summary>
/// Detects the encoding of a byte sample by running the enabled detectors in a fixed order
/// (KlerkSoftBom, KlerkSoftHeuristics, MLang) and returning the first non-null result.
/// Each detector call is bracketed by a named <c>StopWatch</c> start/stop pair.
/// </summary>
/// <param name="bytes">Byte sample to inspect.</param>
/// <param name="opts">Flags selecting which detectors are run.</param>
/// <param name="defaultEncoding">Value returned when no enabled detector produces an encoding.</param>
/// <returns>The first detected encoding, otherwise <paramref name="defaultEncoding"/>.</returns>
public static Encoding Detect(byte[] bytes, EncodingDetector.Options opts = Options.KlerkSoftBom | Options.MLang, Encoding defaultEncoding = null)
{
    const string bomTimer = "DetectEncoding: UsingKlerksSoftBom";
    const string heuristicsTimer = "DetectEncoding: UsingKlerksSoftHeuristics";
    const string mlangTimer = "DetectEncoding: UsingMLang";

    Encoding detected;

    if ((opts & Options.KlerkSoftBom) == Options.KlerkSoftBom)
    {
        StopWatch.Start(bomTimer);
        detected = DetectEncodingUsingKlerksSoftBom(bytes);
        StopWatch.Stop(bomTimer);

        if (detected != null)
        {
            return detected;
        }
    }

    if ((opts & Options.KlerkSoftHeuristics) == Options.KlerkSoftHeuristics)
    {
        StopWatch.Start(heuristicsTimer);
        detected = DetectEncodingUsingKlerksSoftHeuristics(bytes);
        StopWatch.Stop(heuristicsTimer);

        if (detected != null)
        {
            return detected;
        }
    }

    if ((opts & Options.MLang) == Options.MLang)
    {
        StopWatch.Start(mlangTimer);
        detected = DetectEncodingUsingMLang(bytes);
        StopWatch.Stop(mlangTimer);

        if (detected != null)
        {
            return detected;
        }
    }

    // Nothing was detected (or no detector was enabled): fall back to the caller's default.
    return defaultEncoding;
}
/// <summary>
/// Maps a performance setting to the set of encoding detectors that should be used.
/// </summary>
/// <param name="performanceSetting">Current performance setting.</param>
/// <returns>
/// Bit-flag combination of detectors: KlerkSoftBom and WinMerge for <c>Speed</c>;
/// those plus MozillaUCD and MLang for <c>Accuracy</c>; KlerkSoftBom, WinMerge and MLang
/// for <c>Default</c> and any other value.
/// </returns>
/// <history>
/// [Curtis_Beard]	   05/26/2015	FIX: 69, add performance setting for file detection
/// </history>
public static EncodingDetector.Options GetEncodingDetectorOptionsByPerformance(Performance performanceSetting)
{
    // Detectors common to every performance setting.
    EncodingDetector.Options common = EncodingDetector.Options.KlerkSoftBom | EncodingDetector.Options.WinMerge;

    switch (performanceSetting)
    {
        case Performance.Speed:
            return common;

        case Performance.Accuracy:
            return common | EncodingDetector.Options.MozillaUCD | EncodingDetector.Options.MLang;

        case Performance.Default:
        default:
            return common | EncodingDetector.Options.MLang;
    }
}
/// <summary>
/// Runs encoding detection against the non-binary files found under <paramref name="dir"/>
/// (searched recursively), timing each phase, then prints the detection count and the
/// collected timings before clearing the timing collection.
/// </summary>
/// <param name="encDetectorOptions">
/// Detector flags handed to <c>EncodingDetector.Detect</c>; their string form is also used as the timer name.
/// </param>
/// <param name="dir">Root directory whose files are examined.</param>
private void RunTest(EncodingDetector.Options encDetectorOptions, string dir)
{
    int examined = 0;
    int detected = 0;

    StopWatch overall = new StopWatch();
    overall.Start();

    string timerKey = encDetectorOptions.ToString();

    foreach (string path in Directory.EnumerateFiles(dir, "*.*", SearchOption.AllDirectories))
    {
        StopWatch.Start("ReadFileSample");
        var sample = Utils.ReadFileContentSample(path);
        StopWatch.Stop("ReadFileSample");

        StopWatch.Start("IsBinaryFile");
        bool binary = Utils.IsBinaryFile(sample);
        StopWatch.Stop("IsBinaryFile");

        // Binary files are skipped and are not counted in the totals.
        if (binary)
        {
            continue;
        }

        StopWatch.Start(timerKey);
        Encoding found = EncodingDetector.Detect(sample, encDetectorOptions);
        examined++;
        detected += (found != null) ? 1 : 0;
        StopWatch.Stop(timerKey);

        WriteToConsole(path + ": " + found);

        // Stop once the running count of examined files exceeds 10.
        if (examined > 10)
        {
            break;
        }
    }

    Console.WriteLine("Found Encoding in:" + detected + " out of " + examined);

    overall.Stop();
    StopWatch.PrintCollection(overall.Milliseconds);
    StopWatch.Collection.Clear();
}
/// <summary>
/// Detects the encoding of a byte array using the detectors selected in <paramref name="opts"/>.
/// </summary>
/// <param name="bytes">File sample byte array.</param>
/// <param name="usedEncoder">
/// Receives the name of the detector flag that produced the result, or "Default" when no
/// enabled detector produced one.
/// </param>
/// <param name="opts">Flags of encoding detectors to use.</param>
/// <param name="defaultEncoding">Default encoding if nothing is detected.</param>
/// <returns>The detected encoding, or <paramref name="defaultEncoding"/> if nothing was detected.</returns>
/// <history>
/// [Curtis_Beard]	   02/12/2014	Created
/// [Curtis_Beard]	   12/01/2014	ADD: support for Mozilla encoding detection, remove KlerkSoftHeuristics as a default
/// </history>
public static Encoding Detect(byte[] bytes, out string usedEncoder, EncodingDetector.Options opts = Options.KlerkSoftBom | Options.WinMerge | Options.MozillaUCD | Options.MLang, Encoding defaultEncoding = null)
{
    Encoding result;

    // NOTE: the order of these checks determines which detector runs first
    // (original author's note: usually Mozilla is better than MLang).
    // The short-circuit && ensures a detector is only invoked when its flag is set.
    if ((opts & Options.KlerkSoftBom) == Options.KlerkSoftBom
        && (result = DetectEncodingUsingKlerksSoftBom(bytes)) != null)
    {
        usedEncoder = Options.KlerkSoftBom.ToString();
        return result;
    }

    if ((opts & Options.KlerkSoftHeuristics) == Options.KlerkSoftHeuristics
        && (result = DetectEncodingUsingKlerksSoftHeuristics(bytes)) != null)
    {
        usedEncoder = Options.KlerkSoftHeuristics.ToString();
        return result;
    }

    if ((opts & Options.WinMerge) == Options.WinMerge
        && (result = DetectEncodingUsingWinMerge(bytes)) != null)
    {
        usedEncoder = Options.WinMerge.ToString();
        return result;
    }

    if ((opts & Options.MozillaUCD) == Options.MozillaUCD
        && (result = DetectEncodingUsingMozillaUCD(bytes)) != null)
    {
        usedEncoder = Options.MozillaUCD.ToString();
        return result;
    }

    if ((opts & Options.MLang) == Options.MLang
        && (result = DetectEncodingUsingMLang(bytes)) != null)
    {
        usedEncoder = Options.MLang.ToString();
        return result;
    }

    // No enabled detector produced an encoding: report the fallback and return the default.
    usedEncoder = "Default";
    return defaultEncoding;
}