/// <summary>
/// Checks a CUE sheet: the detected encoding must be "UTF-8" and the file must
/// start with the UTF-8 byte order mark (EF BB BF).
/// </summary>
/// <returns>
/// <c>true</c> when the encoding is detected as "UTF-8" and the BOM is present
/// (in that case <c>State</c> is <c>InValidCue</c> if <c>CueCurer.CueMatchCheck</c>
/// fails, otherwise it stays <c>ValidFile</c>); <c>false</c> when the encoding is
/// not "UTF-8" (<c>State = InValidEncode</c>) or the BOM is missing
/// (<c>State = NonUTF8WBOM</c>).
/// </returns>
private bool CheckCUE()
{
    State = FileState.ValidFile;

    Encode = EncodingDetector.GetEncoding(FullPath, out _confidence);
    if (Encode != "UTF-8")
    {
        State = FileState.InValidEncode;
        return false;
    }

    using (var fs = File.OpenRead(FullPath))
    {
        var bom = new byte[3];
        var filled = 0;

        // Stream.Read is allowed to return fewer bytes than requested, so keep
        // reading until the buffer is full or the end of the file is reached.
        while (filled < bom.Length)
        {
            var read = fs.Read(bom, filled, bom.Length - filled);
            if (read == 0)
            {
                break;
            }

            filled += read;
        }

        var hasUtf8Bom = filled == bom.Length
                         && bom[0] == 0xEF
                         && bom[1] == 0xBB
                         && bom[2] == 0xBF;
        if (hasUtf8Bom)
        {
            // The encoding check passed; a failed match check only downgrades
            // the state, the method still reports success.
            if (!CueCurer.CueMatchCheck(this))
            {
                State = FileState.InValidCue;
            }

            return true;
        }
    }

    State = FileState.NonUTF8WBOM;
    return false;
}
/// <summary>
/// Entry point: parses the configured data file, analyses it and writes the
/// result to "report.txt" and "report.xlsx".
/// </summary>
/// <param name="args">Command-line arguments; not read directly by this method.</param>
static void Main(string[] args)
{
    var csvParser = new CsvParser();
    var settings = BuildConfiguration();

    // Register the code-pages provider before any encoding detection happens.
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

    var fileEncoding = EncodingDetector.GetEncoding(settings["data-file-path"]);
    var parsedData = csvParser.ParseFile(settings["data-file-path"], fileEncoding);

    // A parse error is only reported on the console; processing continues afterwards.
    if (csvParser.ErrorMessage != null)
    {
        Console.WriteLine("Parsing failed");
        Console.WriteLine(csvParser.ErrorMessage);
    }

    var advancedStatThreshold = Convert.ToUInt32(settings["min-number-for-adv-stat"]);
    var analysis = DataAnalyser.Analyze(parsedData, advancedStatThreshold);

    // Plain-text report.
    using (var textOutput = new FileStream("report.txt", FileMode.Create))
    {
        TextReporter.ToStream(textOutput, analysis);
    }

    // Excel report.
    var excelReporter = new ExcelReport.ExcelReporter();
    using (var excelOutput = new FileStream("report.xlsx", FileMode.Create))
    {
        excelReporter.ToStream(excelOutput, analysis);
    }
}
/// <summary>
/// Prints, for every file in the "[Encode Sample]" directory, whether its
/// detected encoding is "UTF-8" together with the detection confidence.
/// </summary>
public void IsUTF8Test()
{
    var samplePaths = Directory.GetFiles(@"..\..\[Encode Sample]");

    foreach (var samplePath in samplePaths)
    {
        string detected = EncodingDetector.GetEncoding(samplePath, out float confidence);
        bool isUtf8 = detected == "UTF-8";

        Console.WriteLine($"{Path.GetFileName(samplePath)}: {isUtf8} confidence: {confidence:F3}");
    }
}
/// <summary>
/// Encodes the shared test text with the given code page and verifies that
/// the detector reports an encoding with that same code page.
/// </summary>
/// <param name="codepage">Code page used to encode the test text.</param>
public void TestDetectingCodepages(int codepage)
{
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

    // Build the input stream from the test text in the requested code page.
    var sampleText = Common.TestResultsDataString;
    var sourceEncoding = Encoding.GetEncoding(codepage);
    var input = Common.CreateStreamFromText(sampleText, sourceEncoding);

    var detected = EncodingDetector.GetEncoding(input);

    Assert.Equal(codepage, detected.CodePage);
}
/// <summary>
/// Prints the detected encoding and confidence for every file in the
/// "[Encode Sample]" directory, then for every file in "[Encoding All Star]".
/// </summary>
public void EncodeTest()
{
    string[] sampleDirectories =
    {
        @"..\..\[Encode Sample]",
        @"..\..\[Encoding All Star]",
    };

    foreach (var directory in sampleDirectories)
    {
        foreach (var file in Directory.GetFiles(directory))
        {
            var fileName = Path.GetFileName(file);
            var detected = EncodingDetector.GetEncoding(file, out float confidence);

            Console.WriteLine($"{fileName}: {detected} ({confidence:F3})");
        }
    }
}
/// <summary>
/// Runs the extension-specific validation for this file and records the
/// outcome in <c>State</c>, <c>Suffix</c> and <c>Encode</c>.
/// </summary>
/// <remarks>
/// Nothing is checked when <c>BaseValidation()</c> returns <c>true</c>. The
/// ".flac", ".cue", ".log" and default branches are each gated by their own
/// switch in <c>GlobalConfiguration.Instance().InspectionOptions</c>; the
/// ".png" branch is not gated.
/// </remarks>
private void FileValidation()
{
    if (BaseValidation())
    {
        return;
    }

    switch (Extension)
    {
        case ".flac":
        {
            if (!GlobalConfiguration.Instance().InspectionOptions.FLACCompressRate)
            {
                break;
            }

            Flac = FlacData.GetMetadataFromFlac(FullPath);

            // Build the display suffix: compression percentage plus optional tags.
            Suffix += $"[{Flac.CompressRate * 100:00.00}%]";
            if (Flac.IsHiRes)
            {
                Suffix += "[HR]";
            }

            if (Flac.HasCover)
            {
                Suffix += "[图]";
            }

            Encode = Flac.Encoder;

            // A rate above 0.9 suggests the file may be uncompressed.
            if (Flac.CompressRate > 0.9)
            {
                State = FileState.InValidFlacLevel;
            }

            break;
        }

        case ".cue":
        {
            if (!GlobalConfiguration.Instance().InspectionOptions.CUEEncoding)
            {
                break;
            }

            CheckCUE();
            break;
        }

        case ".log":
        {
            if (!GlobalConfiguration.Instance().InspectionOptions.LogValidation)
            {
                break;
            }

            Logger.Log(Logger.Level.Info, $"Log check for '{FullPath}'");

            Encode = EncodingDetector.GetEncoding(FullPath, out var confidence);
            if (confidence < 0.9)
            {
                // Detection is not confident enough to decode the log text.
                break;
            }

            var text = File.ReadAllText(FullPath, System.Text.Encoding.GetEncoding(Encode));
            var index = 1;

            foreach (var (_, oldSignature, actualSignature) in LogChecker.Core.eac_verify(text))
            {
                if (oldSignature == "")
                {
                    Logger.Log(Logger.Level.Debug, $"No signature found, it could be '{actualSignature}'");
                    continue;
                }

                if (oldSignature != actualSignature)
                {
                    Logger.Log(Logger.Level.Debug, $"Expect signature '{actualSignature}', but get '{oldSignature}'");
                    State = FileState.TamperedLog;
                }
                else
                {
                    // The counter only advances for entries whose signature matches.
                    Logger.Log(Logger.Level.Fine, $"{index++}. Log entry is fine!");
                }
            }

            break;
        }

        case ".png":
        {
            Logger.Log(Logger.Level.Info, $"Png check for '{FullPath}'");

            var pngInfo = PngData.GetMetadataFrom(FullPath);
            Suffix += $"[{pngInfo.CompressRate * 100:00.00}%]";

            // A rate above 0.9 suggests the file may be uncompressed.
            // NOTE(review): this reuses the FLAC state value for PNG files — confirm it is intended.
            if (pngInfo.CompressRate > 0.9)
            {
                State = FileState.InValidFlacLevel;
            }

            break;
        }

        default:
        {
            if (!GlobalConfiguration.Instance().InspectionOptions.FileHeader)
            {
                break;
            }

            if (!FileHeader.Check(FullPath))
            {
                State = FileState.InValidFileSignature;
            }

            break;
        }
    }
}
/// <summary>
/// Verifies that data starting with a byte order mark is reported as having
/// a BOM and as the expected encoding.
/// </summary>
/// <param name="bom">BOM bytes written over the start of the sample data.</param>
/// <param name="expectedEncoding">Encoding the detector is expected to report.</param>
public void GetEncoding_WhenDataHasBom(byte[] bom, EncodingType expectedEncoding)
{
    // Arrange: overwrite the beginning of the sample data with the BOM.
    byte[] payload = GetData();
    bom.CopyTo(payload, 0);
    CreateSut();

    // Act
    (var detectedEncoding, bool bomFound) = sut.GetEncoding(payload, payload.Length);

    // Assert
    Assert.True(bomFound);
    Assert.Equal(expectedEncoding, detectedEncoding);
}
/// <summary>
/// Determines the charset name for a file by combining a charset declared in
/// an HTML <c>&lt;meta&gt;</c> tag with the result of <c>EncodingDetector</c>.
/// </summary>
/// <param name="filePath">Path of the file to inspect.</param>
/// <param name="defaultEncoding">
/// Charset name used when neither the HTML scan nor the detector yields one.
/// </param>
/// <returns>
/// The upper-cased charset name. The detector's result replaces the declared
/// charset when nothing was declared or when the detector reports "UTF-8".
/// </returns>
private string DetectEncoding(string filePath, string defaultEncoding)
{
    string declaredCharset = null;
    var nonEmptyLines = 0;
    var insideHtml = false;

    foreach (var rawLine in File.ReadLines(filePath))
    {
        // Charset names and tag names are ASCII identifiers; invariant casing
        // keeps them stable regardless of the current culture (e.g. Turkish I).
        var line = rawLine.ToLowerInvariant().Trim();

        // Skip empty lines.
        if (line.Length == 0)
        {
            continue;
        }

        // Give up when no <html> has shown up early in the file. Because of the
        // post-increment comparison this trips on the 12th non-empty line.
        if (nonEmptyLines++ > 10 && !insideHtml)
        {
            break;
        }

        // Found <html>.
        if (!insideHtml && line.Contains("<html"))
        {
            insideHtml = true;
        }

        // Reached <body>: stop scanning.
        if (line.Contains("<body"))
        {
            break;
        }

        // Inside <html>: look for a <meta> charset declaration.
        if (insideHtml)
        {
            var match = Html4CharsetRegex.Match(line);
            if (match.Success && match.Groups.Count == 2)
            {
                declaredCharset = match.Groups[1].Value;
                break;
            }

            match = Html5CharsetRegex.Match(line);
            if (match.Success && match.Groups.Count == 2)
            {
                declaredCharset = match.Groups[1].Value;
                break;
            }
        }
    }

    var autoDetectedCharset = EncodingDetector.GetEncoding(filePath);
    if (declaredCharset == null || autoDetectedCharset == "UTF-8")
    {
        declaredCharset = autoDetectedCharset;
        if (declaredCharset == null)
        {
            declaredCharset = defaultEncoding;
        }
    }

    return declaredCharset.ToUpperInvariant();
}