/// <summary>
/// Runs <c>CharsetDetector</c> over a byte buffer and resolves the reported charset to an <see cref="Encoding"/>.
/// </summary>
/// <param name="arrayByte">Raw bytes to analyse.</param>
/// <returns>
/// A list containing the resolved encoding when a charset is reported with confidence above 0.8
/// and <see cref="Encoding.GetEncoding(string)"/> succeeds; otherwise an empty list.
/// </returns>
public static List<Encoding> GetFileEncoding(byte[] arrayByte)
{
    List<Encoding> encodings = new List<Encoding>();

    CharsetDetector charsetDetector = new CharsetDetector();
    charsetDetector.Feed(arrayByte, 0, arrayByte.Length);
    charsetDetector.DataEnd();

    // Nothing to resolve when no charset was reported or the guess is not confident enough.
    if (string.IsNullOrEmpty(charsetDetector.Charset) || !(charsetDetector.Confidence > 0.8f))
    {
        return encodings;
    }

    try
    {
        encodings.Add(Encoding.GetEncoding(charsetDetector.Charset));
    }
    catch (Exception ex)
    {
        // A failed lookup is reported to the output and otherwise ignored.
        DTEHelper.WriteExceptionToOutput(null, ex);
#if DEBUG
        if (System.Diagnostics.Debugger.IsAttached)
        {
            System.Diagnostics.Debugger.Break();
        }
#endif
    }

    return encodings;
}
/// <summary>
/// Command-line sample: prints the charset detected for the file named by the first argument.
/// </summary>
/// <param name="args">Command-line arguments; the first one is the file name.</param>
public static void Main(String[] args)
{
    if (args.Length == 0)
    {
        Console.WriteLine("Usage: udetect <filename>");
        return;
    }

    using (FileStream input = File.OpenRead(args[0]))
    {
        ICharsetDetector detector = new CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        if (detector.Charset == null)
        {
            Console.WriteLine("Detection failed.");
        }
        else
        {
            Console.WriteLine("Charset: {0}, confidence: {1}", detector.Charset, detector.Confidence);
        }
    }
}
/// <summary>
/// Wraps a byte buffer in a <see cref="MemoryStream"/> and runs <c>CharsetDetector</c> over it.
/// </summary>
/// <param name="bytes">Raw bytes to analyse.</param>
/// <returns>
/// The detector's <c>Charset</c> value as-is, or <c>null</c> when an <see cref="IOException"/>
/// was caught and logged.
/// </returns>
private string DetectCharset(byte[] bytes)
{
    try
    {
        using (var buffer = new MemoryStream(bytes))
        {
            var charsetDetector = new CharsetDetector();
            charsetDetector.Feed(buffer);
            charsetDetector.DataEnd();

            return charsetDetector.Charset;
        }
    }
    catch (IOException ioError)
    {
        _logger.ErrorException("Error attempting to determine web socket message charset", ioError);
        return null;
    }
}
/// <summary>
/// Opens the file at <paramref name="path"/> and runs <c>CharsetDetector</c> over its contents.
/// </summary>
/// <param name="path">Path of the file to analyse.</param>
/// <returns>
/// The detector's <c>Charset</c> value as-is, or <c>null</c> when an <see cref="IOException"/>
/// was caught and logged.
/// </returns>
private string DetectCharset(string path)
{
    try
    {
        // Open read-only. The two-argument FileStream constructor asks for read/write access,
        // which fails on read-only files even though the file is only read here.
        using (var file = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read))
        {
            var detector = new CharsetDetector();
            detector.Feed(file);
            detector.DataEnd();

            var charset = detector.Charset;

            if (!string.IsNullOrWhiteSpace(charset))
            {
                _logger.Info("UniversalDetector detected charset {0} for {1}", charset, path);
            }

            return charset;
        }
    }
    catch (IOException ex)
    {
        _logger.ErrorException("Error attempting to determine subtitle charset from {0}", ex, path);
    }

    return null;
}
/// <summary>
/// Tries to detect the encoding of a byte buffer with <c>CharsetDetector</c>.
/// </summary>
/// <param name="data">Raw bytes to analyse.</param>
/// <param name="encoding">
/// The resolved encoding on success; <see cref="Encoding.Default"/> when the method returns <c>false</c>.
/// </param>
/// <returns>
/// <c>true</c> when a charset was reported and resolved; <c>false</c> when no charset was reported
/// or the reported name could not be resolved.
/// </returns>
public static bool TryGetEncoding(byte[] data, out Encoding encoding)
{
    ICharsetDetector detector = new CharsetDetector();
    detector.Feed(data, 0, data.Length);
    detector.DataEnd();

    string charset = detector.Charset;
    if (charset == null)
    {
        encoding = Encoding.Default;
        return false;
    }

    // A reported "big-5" is looked up under the name "big5" instead.
    if (charset.ToLowerInvariant() == "big-5")
    {
        encoding = Encoding.GetEncoding("big5");
        return true;
    }

    try
    {
        encoding = Encoding.GetEncoding(charset);
        return true;
    }
    catch
    {
        encoding = Encoding.Default;
        return false;
    }
}
/// <summary>
/// Detects the encoding of a byte buffer with <c>CharsetDetector</c>.
/// </summary>
/// <param name="data">Raw bytes to analyse.</param>
/// <returns>
/// <c>null</c> when no charset is reported or the confidence is not above 0.5;
/// <see cref="Encoding.Default"/> when the reported name cannot be resolved;
/// otherwise the resolved encoding.
/// </returns>
public static Encoding GetEncoding(byte[] data)
{
    ICharsetDetector detector = new CharsetDetector();
    detector.Feed(data, 0, data.Length);
    detector.DataEnd();

    if (detector.Charset == null || !(detector.Confidence > 0.5))
    {
        return null;
    }

    // A reported "big-5" is looked up under the name "big5" instead.
    if (detector.Charset.ToLowerInvariant() == "big-5")
    {
        return Encoding.GetEncoding("big5");
    }

    try
    {
        return Encoding.GetEncoding(detector.Charset);
    }
    catch
    {
        return Encoding.Default;
    }
}
/// <summary>
/// Obtains a stream for <paramref name="path"/> via <c>GetStream</c> and runs <c>CharsetDetector</c> over it.
/// </summary>
/// <param name="path">Location of the content to analyse.</param>
/// <param name="protocol">Protocol passed through to <c>GetStream</c>.</param>
/// <param name="cancellationToken">Token passed through to <c>GetStream</c>.</param>
/// <returns>
/// The detector's <c>Charset</c> value as-is, or <c>null</c> when an <see cref="IOException"/>
/// was caught and logged.
/// </returns>
private async Task<string> DetectCharset(string path, MediaProtocol protocol, CancellationToken cancellationToken)
{
    try
    {
        using (var source = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false))
        {
            var charsetDetector = new CharsetDetector();
            charsetDetector.Feed(source);
            charsetDetector.DataEnd();

            string detected = charsetDetector.Charset;
            bool hasCharset = !string.IsNullOrWhiteSpace(detected);
            if (hasCharset)
            {
                _logger.Info("UniversalDetector detected charset {0} for {1}", detected, path);
            }

            return detected;
        }
    }
    catch (IOException ioError)
    {
        _logger.ErrorException("Error attempting to determine subtitle charset from {0}", ioError, path);
        return null;
    }
}
/// <summary>
/// Runs <c>CharsetDetector</c> over a file and returns what <c>EncodingChecker.isAscii</c> reports for it.
/// </summary>
/// <param name="fileName">Path of the file to analyse.</param>
/// <returns>
/// The result of <c>EncodingChecker.isAscii</c>, or <c>false</c> when all ten attempts
/// ended in an <see cref="IOException"/>.
/// </returns>
private bool isAscii(string fileName)
{
    int attempt = 0;
    while (attempt < 10)
    {
        try
        {
            lock (SyncRoot)
            {
                using (FileStream input = File.OpenRead(fileName))
                {
                    var detector = new CharsetDetector();
                    detector.Feed(input);
                    detector.DataEnd();

                    return EncodingChecker.isAscii(detector);
                }
            }
        }
        catch (IOException)
        {
            // Wait one second before the next attempt.
            Thread.Sleep(1000);
        }

        attempt++;
    }

    return false;
}
/// <summary>
/// Command-line sample: prints the charset detected for the file named by the first argument,
/// in green on success and in red on failure, then calls <c>Exit</c>.
/// </summary>
/// <param name="args">Command-line arguments; the first one is the file name.</param>
public static void Main(string[] args)
{
    if (args.Length == 0)
    {
        Console.WriteLine("Usage: udetect <filename>");
        return;
    }

    using (FileStream input = File.OpenRead(args[0]))
    {
        ICharsetDetector detector = new CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        Console.WriteLine();

        if (detector.Charset == null)
        {
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine("Detection failed.");
        }
        else
        {
            Console.ForegroundColor = ConsoleColor.Green;
            Console.WriteLine("Charset: {0}, confidence: {1}", detector.Charset, detector.Confidence);
        }

        Console.ResetColor();
    }

    Exit();
}
/// <summary>
/// Rewrites a file as UTF-8, decoding it with the charset reported by <c>CharsetDetector</c>.
/// </summary>
/// <param name="fileName">Path of the file to convert in place.</param>
/// <returns>
/// <c>false</c>, leaving the file untouched, when <c>isAscii</c> returns <c>false</c> for the detector;
/// <c>true</c> after the file has been rewritten.
/// </returns>
private bool convertToUtf8(string fileName)
{
    string detectedCharset;

    lock (SyncRoot)
    {
        using (FileStream input = File.OpenRead(fileName))
        {
            var detector = new CharsetDetector();
            detector.Feed(input);
            detector.DataEnd();

            detectedCharset = detector.Charset;

            if (!isAscii(detector))
            {
                return false;
            }
        }
    }

    // The detection stream is closed by now, so the file can be read and overwritten.
    Encoding sourceEncoding = Encoding.GetEncoding(detectedCharset);
    string content = File.ReadAllText(fileName, sourceEncoding);
    File.WriteAllText(fileName, content, Encoding.UTF8);

    return true;
}
/// <summary>
/// Reads a whole file as text, decoding it with the charset reported by <c>CharsetDetector</c>.
/// </summary>
/// <param name="fileInfo">File to read.</param>
/// <returns>The file contents as a string.</returns>
/// <remarks>
/// <c>CurrentEncoding</c> is used when no charset is reported or the reported name cannot be resolved.
/// </remarks>
public static string LoadFile(FileInfo fileInfo)
{
    Encoding encoding = CurrentEncoding;

    using (Stream stream = fileInfo.OpenRead())
    {
        CharsetDetector detector = new CharsetDetector();
        detector.Feed(stream);
        detector.DataEnd();

        // Rewind so the reader below starts at the beginning of the file.
        stream.Position = 0;

        string charset = detector.Charset;
        if (charset == null)
        {
            Debug.LogFormat("Failed to detect charset, will use default encoding.");
        }
        else
        {
            Debug.LogFormat("Detected charset of file: {0}", charset);
            try
            {
                encoding = Encoding.GetEncoding(charset);
            }
            catch
            {
                Debug.LogWarning("Failed to load encoding, will use default encoding.");
                encoding = CurrentEncoding;
            }
        }

        using (StreamReader reader = new StreamReader(stream, encoding))
        {
            return reader.ReadToEnd();
        }
    }
}
/// <summary>
/// Runs <c>CharsetDetector</c> over a stream and maps the reported charset name to an encoding.
/// </summary>
/// <param name="stream">Stream to analyse.</param>
/// <returns>
/// The result of calling <c>Return</c> on the reported charset with <c>Encoding.GetEncoding</c>
/// and a <c>null</c> fallback.
/// NOTE(review): presumably <c>Return</c> yields the fallback when the charset is null — confirm against its definition.
/// </returns>
public Encoding Detect(Stream stream)
{
    var charsetDetector = new CharsetDetector();
    charsetDetector.Feed(stream);
    charsetDetector.DataEnd();

    string charsetName = charsetDetector.Charset;
    return charsetName.Return(Encoding.GetEncoding, null);
}
/// <summary>
/// Runs <c>CharsetDetector</c> over the file at <paramref name="path"/> and resolves the reported charset.
/// </summary>
/// <param name="path">Path of the file to analyse.</param>
/// <returns>The resolved encoding, or <c>null</c> when no charset is reported.</returns>
private static Encoding GetEncoding(string path)
{
    using (FileStream input = File.OpenRead(path))
    {
        var charsetDetector = new CharsetDetector();
        charsetDetector.Feed(input);
        charsetDetector.DataEnd();

        if (charsetDetector.Charset == null)
        {
            return null;
        }

        return Encoding.GetEncoding(charsetDetector.Charset);
    }
}
/// <summary>
/// Runs <c>CharsetDetector</c> over a file and resolves the reported charset.
/// </summary>
/// <param name="filename">Path of the file to analyse.</param>
/// <returns>The resolved encoding, or <see cref="Encoding.UTF8"/> when no charset is reported.</returns>
public static Encoding DetectEncoding(string filename)
{
    using (FileStream input = File.OpenRead(filename))
    {
        var detector = new CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        if (detector.Charset == null)
        {
            return Encoding.UTF8;
        }

        return Encoding.GetEncoding(detector.Charset);
    }
}
/// <summary>
/// Background-worker body: detects the charset of every file under the base directory whose name
/// satisfies the mask patterns, and reports each result as progress.
/// </summary>
/// <param name="sender">The <see cref="BackgroundWorker"/> running this handler.</param>
/// <param name="e">Event data; <c>Argument</c> carries the <c>WorkerArgs</c>, and <c>Cancel</c> is set on cancellation.</param>
/// <remarks>
/// When the action is <c>CurrentAction.Validate</c>, files with no reported charset or with a charset
/// contained in <c>ValidCharsets</c> are not reported.
/// </remarks>
private static void ActionWorkerDoWork(object sender, DoWorkEventArgs e)
{
    var worker = (BackgroundWorker)sender;
    var args = (WorkerArgs)e.Argument;

    string[] allFiles = Directory.GetFiles(
        args.BaseDirectory,
        "*.*",
        args.IncludeSubdirectories ? SearchOption.AllDirectories : SearchOption.TopDirectoryOnly);
    IEnumerable<Regex> maskPatterns = GenerateMaskPatterns(args.FileMasks);

    for (int index = 0; index < allFiles.Length; index++)
    {
        if (worker.CancellationPending)
        {
            e.Cancel = true;
            break;
        }

        string path = allFiles[index];
        string fileName = Path.GetFileName(path);
        if (!SatisfiesMaskPatterns(fileName, maskPatterns))
        {
            continue;
        }

        var detector = new CharsetDetector();
        using (var input = new FileStream(path, FileMode.Open, FileAccess.Read))
        {
            detector.Feed(input);
            detector.DataEnd();
        }

        // In validation mode only files with a reported charset outside the valid set are reported.
        if (args.Action == CurrentAction.Validate
            && (detector.Charset == null || args.ValidCharsets.Contains(detector.Charset)))
        {
            continue;
        }

        string directoryName = Path.GetDirectoryName(path);
        int percentageCompleted = (index * 100) / allFiles.Length;

        var progress = new WorkerProgress();
        progress.Charset = detector.Charset ?? "(Unknown)";
        progress.FileName = fileName;
        progress.DirectoryName = directoryName;

        worker.ReportProgress(percentageCompleted, progress);
    }
}
/// <summary>
/// Runs <c>CharsetDetector</c> over a file and maps the reported charset to an encoding
/// via <c>EncodingHelper.GetEncodingFromCharSet</c>.
/// </summary>
/// <param name="fullPath">Path of the file to analyse.</param>
/// <returns>Whatever <c>EncodingHelper.GetEncodingFromCharSet</c> returns for the reported charset.</returns>
public static Encoding GetFileEncoding(string fullPath)
{
    string charset;

    // The using block disposes the stream; no explicit Close() call is needed.
    using (var fs = File.OpenRead(fullPath))
    {
        var charsetDetector = new CharsetDetector();
        charsetDetector.Feed(fs);
        charsetDetector.DataEnd();
        charset = charsetDetector.Charset;
    }

    // Resolve after the file handle has been released, as the original ordering did.
    return EncodingHelper.GetEncodingFromCharSet(charset);
}
/// <summary>
/// Detects the encoding of a byte buffer with <c>CharsetDetector</c>.
/// </summary>
/// <param name="data">Raw bytes to analyse.</param>
/// <returns>
/// The resolved encoding, or <c>null</c> when no charset is reported or the confidence is not above 0.5.
/// </returns>
private static Encoding GetEncoding(byte[] data)
{
    ICharsetDetector detector = new CharsetDetector();
    detector.Feed(data, 0, data.Length);
    detector.DataEnd();

    bool confident = detector.Charset != null && detector.Confidence > 0.5;
    if (!confident)
    {
        return null;
    }

    return Encoding.GetEncoding(detector.Charset);
}
/// <summary>
/// Runs <c>CharsetDetector</c> over a file and returns the charset name it reports.
/// </summary>
/// <param name="filePath">Path of the file to analyse.</param>
/// <returns>The detector's <c>Charset</c> value as-is.</returns>
private static string DetectFileCharset(string filePath)
{
    using (FileStream input = File.OpenRead(filePath))
    {
        var detector = new CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        return detector.Charset;
    }
}
/// <summary>
/// Creates a <c>CharsetDetector</c>, feeds it the contents of a file and finalises detection.
/// </summary>
/// <param name="file">Path of the file to analyse.</param>
/// <returns>The detector after <c>DataEnd</c> has been called; the file stream is closed by then.</returns>
private static ICharsetDetector GetCharsetDetector(string file)
{
    using (FileStream input = File.OpenRead(file))
    {
        ICharsetDetector detector = new CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        return detector;
    }
}
/// <summary>
/// Estimates the character encoding of the given byte array.
/// </summary>
/// <param name="bytes">The byte array.</param>
/// <param name="verifiedLength">Number of leading bytes to examine.</param>
/// <returns>The estimated encoding, or <c>null</c> when the array is empty or nothing could be determined.</returns>
public static Encoding DetectEncoding(byte[] bytes, int verifiedLength)
{
    const int MinSampleLength = 256;

    // Empty input yields null.
    if (bytes.Length == 0)
    {
        return null;
    }

    // Detection by byte order mark takes precedence.
    var bomEncoding = DetectEncodingByBom(bytes);
    if (bomEncoding != null)
    {
        return bomEncoding;
    }

    // Shape the sample handed to the detector.
    byte[] sample = bytes;
    if (bytes.Length < MinSampleLength)
    {
        // Too short: repeat the input back to back to pad the sample.
        var repeatCount = MinSampleLength / bytes.Length + 1;
        var padded = new byte[repeatCount * bytes.Length];
        for (var offset = 0; offset < padded.Length; offset += bytes.Length)
        {
            Array.Copy(bytes, 0, padded, offset, bytes.Length);
        }

        sample = padded;
    }
    else if (MinSampleLength <= verifiedLength && verifiedLength < bytes.Length)
    {
        // A usable length was specified: keep only that many leading bytes.
        var head = new byte[verifiedLength];
        Array.Copy(bytes, head, verifiedLength);
        sample = head;
    }

    // Detection by CharsetDetector.
    var detector = new CharsetDetector();
    detector.Feed(sample, 0, sample.Length);
    detector.DataEnd();

    // Null when neither method could determine the encoding.
    return string.IsNullOrEmpty(detector.Charset) ? null : Encoding.GetEncoding(detector.Charset);
}
/// <summary>
/// Detects the encoding of an external file.
/// </summary>
/// <param name="path">Path of the file to analyse.</param>
/// <returns>The resolved encoding, or <c>null</c> when no charset is reported.</returns>
public static Encoding GetEncoding(string path)
{
    using (FileStream input = File.OpenRead(path))
    {
        CharsetDetector charsetDetector = new CharsetDetector();
        charsetDetector.Feed(input);
        charsetDetector.DataEnd();

        return string.IsNullOrEmpty(charsetDetector.Charset)
            ? null
            : Encoding.GetEncoding(charsetDetector.Charset);
    }
}
/// <summary>
/// Detects a file's encoding with <c>CharsetDetector</c> and writes the outcome to the console.
/// </summary>
/// <param name="file">File to analyse.</param>
/// <param name="targetEncoding">Encoding whose <c>WebName</c> is compared with the reported charset.</param>
/// <returns>
/// The resolved encoding, or <c>null</c> when the reported charset equals the target's web name,
/// nothing is reported, the confidence is below 0.9, or the name cannot be resolved.
/// </returns>
private static Encoding DetectWithUDE(FileInfo file, Encoding targetEncoding)
{
    var detector = new CharsetDetector();
    using (var stream = file.OpenRead())
    {
        detector.Feed(stream);
    }

    detector.DataEnd();

    string detected = detector.Charset;

    // Return null without any output when the file is already reported as the target charset.
    if (targetEncoding.WebName.Equals(detected, StringComparison.InvariantCultureIgnoreCase))
    {
        return null;
    }

    if (string.IsNullOrEmpty(detected))
    {
        Console.WriteLine($"[-] Charset undetected in file '{file.Name}'- skipped!");
        return null;
    }

    if (detector.Confidence < 0.9)
    {
        Console.WriteLine($"[-] Charset detected as '{detected}' with confidence {Math.Round(detector.Confidence * 100)}% in file '{file.Name}' - skipped - too low confidence!");
        return null;
    }

    Console.WriteLine($"[+] Charset detected as '{detected}' with confidence {Math.Round(detector.Confidence * 100)}% in file '{file.Name}'");

    // incorrect detection in our case - may not be correct in all cases
    string lookupName = detected == "windows-1252" ? "windows-1250" : detected;

    try
    {
        return Encoding.GetEncoding(lookupName);
    }
    catch (ArgumentException)
    {
        Console.WriteLine($"[!] Encoding '{detected}' not recognized by .NET Framework");
        return null;
    }
}
/// <summary>
/// Runs <c>CharsetDetector</c> over a file and resolves the reported charset.
/// </summary>
/// <param name="path">Path of the file to analyse.</param>
/// <returns>The resolved encoding, or <see cref="Encoding.ASCII"/> when no charset is reported.</returns>
private static Encoding findEncoding(string path)
{
    using (FileStream input = File.OpenRead(path))
    {
        CharsetDetector detector = new CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        string charset = detector.Charset;
        return charset == null ? Encoding.ASCII : Encoding.GetEncoding(charset);
    }
}
/// <summary>
/// Runs <c>CharsetDetector</c> over a stream and resolves the reported charset.
/// </summary>
/// <param name="stream">Stream to analyse; may be <c>null</c>.</param>
/// <returns>
/// The resolved encoding, or <c>null</c> when <paramref name="stream"/> is <c>null</c>
/// or no charset is reported.
/// </returns>
public static Encoding DetectEncoding(Stream stream)
{
    if (stream == null)
    {
        return null;
    }

    var charsetDetector = new CharsetDetector();
    charsetDetector.Feed(stream);
    charsetDetector.DataEnd();

    string charset = charsetDetector.Charset;
    return charset == null ? null : Encoding.GetEncoding(charset);
}
/// <summary>
/// Detects the encoding of a byte buffer with <c>CharsetDetector</c>.
/// </summary>
/// <param name="bytes">Raw bytes to analyse.</param>
/// <returns>
/// The encoding resolved from the reported charset when the confidence is above 0.5;
/// otherwise <see cref="Encoding.ASCII"/>.
/// </returns>
public static Encoding DetectCharset(Byte[] bytes)
{
    CharsetDetector charsetDetector = new CharsetDetector();
    charsetDetector.Feed(bytes);
    charsetDetector.DataEnd();

    if (charsetDetector.Confidence > 0.5)
    {
        return Encoding.GetEncoding(charsetDetector.Charset);
    }

    return Encoding.ASCII;
}
/// <summary>
/// Reads a whole file into memory, runs <c>CharsetDetector</c> over it and resolves the reported charset.
/// </summary>
/// <param name="FullFileName">Path of the file to analyse.</param>
/// <returns>The encoding resolved from the reported charset name.</returns>
/// <remarks>
/// A charset reported as "windows-1255" is looked up as "windows-1251" instead.
/// NOTE(review): the reported charset is passed to <c>Encoding.GetEncoding</c> without a null check — confirm callers expect an exception when nothing is detected.
/// </remarks>
private static Encoding GetEncoding(string FullFileName)
{
    var charsetDetector = new CharsetDetector();

    byte[] content = File.ReadAllBytes(FullFileName);
    charsetDetector.Feed(content, 0, content.Length);
    charsetDetector.DataEnd();

    string reported = charsetDetector.Charset;
    string lookupName = reported == "windows-1255" ? "windows-1251" : reported;

    return Encoding.GetEncoding(lookupName);
}
/// <summary>
/// Runs <c>CharsetDetector</c> over a byte buffer and returns the reported charset name in lower case.
/// </summary>
/// <param name="bytes">Raw bytes to analyse.</param>
/// <returns>
/// The lower-cased (invariant) charset name, or <c>null</c> when no charset is reported
/// or any exception is thrown during detection.
/// </returns>
public static string DetectCharset(byte[] bytes)
{
    try
    {
        var detector = new CharsetDetector();
        detector.Feed(bytes, 0, bytes.Length);
        detector.DataEnd();

        if (detector.Charset == null)
        {
            return null;
        }

        return detector.Charset.ToLowerInvariant();
    }
    catch (Exception)
    {
        // Any failure is reported to the caller as null.
        return null;
    }
}
/// <summary>
/// Runs <c>CharsetDetector</c> over a file, logs the outcome and returns the reported charset name.
/// </summary>
/// <param name="filename">Path of the file to analyse.</param>
/// <param name="confidence">The detector's confidence when a charset is reported; otherwise 0.</param>
/// <returns>The reported charset name, or "UTF-8" when no charset is reported.</returns>
public static string GetEncoding(string filename, out float confidence)
{
    confidence = 0.0f;

    using (FileStream input = File.OpenRead(filename))
    {
        ICharsetDetector detector = new CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        if (detector.Charset == null)
        {
            Logger.Log($"{filename}: Detection failed.");
            return "UTF-8";
        }

        Logger.Log($"Charset: {detector.Charset}, confidence: {detector.Confidence}");
        confidence = detector.Confidence;
        return detector.Charset;
    }
}
/// <summary>
/// Load handler: detects the charset of <c>CsvFile</c> and creates the <c>tfp</c> parser with the matching encoding.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
/// <remarks><see cref="Encoding.ASCII"/> is used when no charset is reported.</remarks>
private void CSVImport_Load(object sender, EventArgs e)
{
    using (FileStream input = File.OpenRead(CsvFile))
    {
        CharsetDetector detector = new CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        Encoding csvEncoding = detector.Charset != null
            ? Encoding.GetEncoding(detector.Charset)
            : Encoding.ASCII;

        tfp = new TextFieldParser(CsvFile, csvEncoding);
    }
}
/// <summary>
/// Opens a file for shared reading, runs <c>CharsetDetector</c> over it and resolves the reported charset.
/// </summary>
/// <param name="path">Path of the file to analyse.</param>
/// <returns>
/// The resolved encoding, or <c>null</c> when no charset is reported or any exception is thrown.
/// </returns>
private static Encoding GetEncoding(string path)
{
    try
    {
        using (FileStream input = File.Open(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
        {
            var charsetDetector = new CharsetDetector();
            charsetDetector.Feed(input);
            charsetDetector.DataEnd();

            string charset = charsetDetector.Charset;
            if (charset == null)
            {
                return null;
            }

            return Encoding.GetEncoding(charset);
        }
    }
    catch
    {
        // File locks, etc.
        return null;
    }
}
/// <summary>
/// Command-line sample: prints the charset detected for the file named by the first argument.
/// </summary>
/// <param name="args">Command-line arguments; the first one is the file name.</param>
public static void Main(String[] args)
{
    if (args.Length == 0)
    {
        Console.WriteLine("Usage: udetect <filename>");
        return;
    }

    string inputPath = args[0];
    using (FileStream input = File.OpenRead(inputPath))
    {
        ICharsetDetector detector = new CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        if (detector.Charset == null)
        {
            Console.WriteLine("Detection failed.");
            return;
        }

        Console.WriteLine("Charset: {0}, confidence: {1}", detector.Charset, detector.Confidence);
    }
}
/// <summary>
/// Obtains a stream for <paramref name="path"/> via <c>GetStream</c> and runs <c>CharsetDetector</c> over it.
/// </summary>
/// <param name="path">Location of the content to analyse.</param>
/// <param name="language">Language hint; when non-blank, an "x-mac-cyrillic" result is discarded.</param>
/// <param name="protocol">Protocol passed through to <c>GetStream</c>.</param>
/// <param name="cancellationToken">Token passed through to <c>GetStream</c>.</param>
/// <returns>
/// The reported charset name; <c>null</c> when "x-mac-cyrillic" is reported and a language is given,
/// or when an <see cref="IOException"/> was caught and logged.
/// </returns>
private async Task<string> DetectCharset(string path, string language, MediaProtocol protocol, CancellationToken cancellationToken)
{
    try
    {
        using (var source = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false))
        {
            var charsetDetector = new CharsetDetector();
            charsetDetector.Feed(source);
            charsetDetector.DataEnd();

            string detected = charsetDetector.Charset;

            if (!string.IsNullOrWhiteSpace(detected))
            {
                _logger.Info("UniversalDetector detected charset {0} for {1}", detected, path);
            }

            // x-mac-cyrillic is often a misdetection. With a language available, return null
            // so the caller can try other techniques instead.
            bool suspectResult = string.Equals("x-mac-cyrillic", detected, StringComparison.OrdinalIgnoreCase);
            if (suspectResult && !string.IsNullOrWhiteSpace(language))
            {
                return null;
            }

            return detected;
        }
    }
    catch (IOException ioError)
    {
        _logger.ErrorException("Error attempting to determine subtitle charset from {0}", ioError, path);
        return null;
    }
}
/// <summary>
/// Obtains a stream for <paramref name="path"/> via <c>GetStream</c> and runs <c>CharsetDetector</c> over it.
/// </summary>
/// <param name="path">Location of the content to analyse.</param>
/// <param name="protocol">Protocol passed through to <c>GetStream</c>.</param>
/// <param name="cancellationToken">Token passed through to <c>GetStream</c>.</param>
/// <returns>
/// The detector's <c>Charset</c> value as-is, or <c>null</c> when an <see cref="IOException"/>
/// was caught and logged.
/// </returns>
private async Task<string> DetectCharset(string path, MediaProtocol protocol, CancellationToken cancellationToken)
{
    string detected = null;

    try
    {
        using (var source = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false))
        {
            var charsetDetector = new CharsetDetector();
            charsetDetector.Feed(source);
            charsetDetector.DataEnd();

            detected = charsetDetector.Charset;

            if (!string.IsNullOrWhiteSpace(detected))
            {
                _logger.Info("UniversalDetector detected charset {0} for {1}", detected, path);
            }

            return detected;
        }
    }
    catch (IOException ioError)
    {
        _logger.ErrorException("Error attempting to determine subtitle charset from {0}", ioError, path);
    }

    return null;
}