/// <summary>
/// Detects the charset of a file and reports whether that charset is grep-able.
/// </summary>
/// <remarks>
/// This lives in the API because certain encodings are not grep-able under tools we
/// use heavily, such as MINGW32's grep (it is not known whether other tools are affected).
/// </remarks>
/// <param name="file">File whose content is fed to the charset detector.</param>
/// <param name="charSet">Receives the detector's charset name (null when none was reported).</param>
/// <returns><c>false</c> only when the detected charset is "UTF-16LE"; otherwise <c>true</c>.</returns>
public static bool IsEncodingGrepable(FileInfo file, out string charSet)
{
    using (FileStream input = File.OpenRead(file.FullName))
    {
        var detector = new Ude.CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        charSet = detector.Charset;
        return charSet != "UTF-16LE";
    }
}
/// <summary>
/// Detects the encoding of textual data of the specified input data.
/// </summary>
/// <param name="inputData">The input data.</param>
/// <param name="start">Index of the first byte to examine.</param>
/// <param name="count">Number of bytes to examine, beginning at <paramref name="start"/>.</param>
/// <returns>Detected encoding name</returns>
public string Detect(byte[] inputData, int start, int count)
{
    if (Done)
    {
        return EncodingName;
    }

    if (!_started)
    {
        Reset();
        _started = true;

        if (!CheckForTextualData(inputData, start, count))
        {
            IsText = false;
            Done = true;
            return EncodingName;
        }

        HasByteOrderMark = CheckForByteOrderMark(inputData, start);
        IsText = true;
    }

    // execute charset detector
    ude.Feed(inputData, start, count);
    ude.DataEnd();
    if (ude.IsDone() && !String.IsNullOrEmpty(ude.Charset))
    {
        IncrementFrequency(ude.Charset);
        Done = true;
        return EncodingName;
    }

    // singular buffer detection
    // count is a length (it is passed as such to Feed above), so the scanned
    // window is [start, start + count); the chunk loop must use the same bounds.
    var singleUde = new Ude.CharsetDetector();
    const int udeFeedSize = 4 * 1024;
    int end = start + count;
    int step = Math.Min(count, udeFeedSize);
    for (var pos = start; pos < end; pos += step)
    {
        singleUde.Reset();
        singleUde.Feed(inputData, pos, Math.Min(step, end - pos));
        singleUde.DataEnd();

        // update encoding frequency
        if (singleUde.Confidence > 0.3 && !String.IsNullOrEmpty(singleUde.Charset))
        {
            IncrementFrequency(singleUde.Charset);
        }
    }

    // vote for best encoding and update current encoding name
    EncodingName = GetCurrentEncoding();
    return EncodingName;
}
/// <summary>
/// Detects the encoding of textual data of the specified input data.
/// </summary>
/// <param name="inputData">The input data.</param>
/// <param name="start">Index of the first byte to examine.</param>
/// <param name="count">Number of bytes to examine, beginning at <paramref name="start"/>.</param>
/// <returns>Detected encoding name</returns>
public string Detect(byte[] inputData, int start, int count)
{
    if (Done)
    {
        return EncodingName;
    }

    if (!_started)
    {
        Reset();
        _started = true;

        if (!CheckForTextualData(inputData, start, count))
        {
            IsText = false;
            Done = true;
            return EncodingName;
        }

        HasByteOrderMark = CheckForByteOrderMark(inputData, start);
        IsText = true;
    }

    // execute charset detector
    ude.Feed(inputData, start, count);
    ude.DataEnd();
    if (ude.IsDone() && !String.IsNullOrEmpty(ude.Charset))
    {
        Done = true;
        return EncodingName;
    }

    const int bufferSize = 4 * 1024;

    // singular buffer detection; no new chunks are scanned once 2000 votes exist
    if (singleEncodings.Count < 2000)
    {
        // count is a length (it is passed as such to Feed above), so the scanned
        // window is [start, start + count); the chunk loop must use the same bounds.
        var u = new Ude.CharsetDetector();
        int end = start + count;
        int step = Math.Min(count, bufferSize);
        for (var i = start; i < end; i += step)
        {
            u.Reset();
            u.Feed(inputData, i, Math.Min(step, end - i));
            u.DataEnd();

            if (u.Confidence > 0.3 && !String.IsNullOrEmpty(u.Charset))
            {
                singleEncodings.Add(u.Charset);
            }
        }
    }

    return EncodingName;
}
/// <summary>
/// Detects the charset of the given file, reads the whole file with that charset and
/// returns its content lower-cased.
/// </summary>
/// <param name="fi">File to read.</param>
/// <returns>The lower-cased text content of the file.</returns>
public override string Recovery(FileInfo fi)
{
    using (FileStream fs = File.OpenRead(fi.FullName))
    {
        var cdet = new Ude.CharsetDetector();
        cdet.Feed(fs);
        cdet.DataEnd();
        if (cdet.Charset != null)
        {
            _encoding = cdet.Charset;
        }
        else
        {
            // _encoding keeps whatever value it had before this call.
            Console.WriteLine("Detection failed.");
        }
    }

    // Path.Combine instead of a hard-coded "\\" so the separator is always valid.
    string path = Path.Combine(fi.DirectoryName, fi.Name);
    Console.WriteLine(path);

    // using guarantees the reader is closed even if ReadToEnd throws.
    using (var reader = new StreamReader(path, Encoding.GetEncoding(_encoding)))
    {
        string polpi = reader.ReadToEnd();
        return polpi.ToLower();
    }
}
/// <summary>
/// Guesses the encoding used for entry names in a zip archive.
/// </summary>
/// <param name="zipFile">Archive whose entry names are inspected.</param>
/// <returns>
/// UTF-8 as soon as an entry is flagged as Unicode text; otherwise the detected encoding
/// once the detector is at least 90% confident and min(entry count, 50) entries have
/// been read; UTF-8 when no guess was accepted.
/// </returns>
public static Encoding DetectFileEncoding(ZipFile zipFile)
{
    var detector = new Ude.CharsetDetector();
    long entriesSeen = 0;

    foreach (ZipEntry entry in zipFile.OfType<ZipEntry>())
    {
        entriesSeen++;

        if (entry.IsUnicodeText)
        {
            return Encoding.UTF8;
        }

        // The same detector instance is fed every entry name in turn.
        var guess = Common.Extensions.IgnoreExceptions(() =>
        {
            byte[] nameBytes = Encoding.GetEncoding(Constants.Filesystem.ExtendedAsciiCodePage).GetBytes(entry.Name);
            detector.Feed(nameBytes, 0, nameBytes.Length);
            detector.DataEnd();

            return detector.Charset != null && detector.Confidence >= 0.9 && entriesSeen >= Math.Min(zipFile.Count, 50)
                ? Encoding.GetEncoding(detector.Charset)
                : null;
        });

        if (guess != null)
        {
            return guess;
        }
    }

    return Encoding.UTF8;
}
/// <summary>
/// Re-decodes a string whose bytes were interpreted with the wrong encoding, using an
/// auto-detected charset.
/// </summary>
/// <param name="str">Data for reencoding</param>
/// <param name="from">Known Encoding for current string</param>
/// <returns>
/// The string re-decoded with the detected charset, or the original string when it is
/// null/empty, no charset was detected, or the confidence is below 0.92.
/// </returns>
protected virtual string reEncodeString(string str, Encoding from)
{
    if (String.IsNullOrEmpty(str))
    {
        return str;
    }

    byte[] raw = from.GetBytes(str);

    var detector = new Ude.CharsetDetector();
    detector.Feed(raw, 0, raw.Length);
    detector.DataEnd();

    if (detector.Charset == null)
    {
        Log.Debug("reEncodeString: Problem with detection... use the original");
        return str;
    }

    Log.Debug("reEncodeString: charset '{0}' confidence: '{1}'", detector.Charset, detector.Confidence);

    bool confidentEnough = !(detector.Confidence < 0.92f);
    if (!confidentEnough)
    {
        Log.Debug("reEncodeString: Confidence < 0.92 /use the original");
        return str;
    }

    Encoding target = Encoding.GetEncoding(detector.Charset);
    Log.Debug("reEncodeString: '{0}' -> '{1}'", from.EncodingName, target.EncodingName);
    Log.Trace("reEncodeString: original - '{0}'", str);

    string reDecoded = target.GetString(raw);
    return reDecoded;
}
/// <summary>
/// Runs charset detection on the buffer and decodes it to a string.
/// </summary>
/// <remarks>
/// The text is always decoded as UTF-8; the detected charset is only consulted to spot
/// UTF-16 data that was reported as ASCII (the result then contains NUL characters),
/// in which case the buffer is decoded as UTF-16 instead.
/// </remarks>
/// <param name="buffer">text data</param>
/// <returns>decoded string from buffer, or null if no charset was detected or decoding threw</returns>
public static string Decode(byte[] buffer)
{
    var detector = new Ude.CharsetDetector();
    using (var input = new MemoryStream(buffer))
    {
        detector.Feed(input);
        detector.DataEnd();
    }

    if (detector.Charset == null)
    {
        return null;
    }

    try
    {
        string text = Encoding.UTF8.GetString(buffer);

        // The detector has probably mistakenly identified utf-16 as ASCII.
        bool looksLikeUtf16 = detector.Charset == "ASCII" && text.Contains('\0');
        if (looksLikeUtf16)
        {
            text = Encoding.GetEncoding("utf-16").GetString(buffer);
        }

        return text;
    }
    catch (Exception)
    {
        return null;
    }
}
/// <summary>
/// Determines the charset of raw e-mail bytes.
/// </summary>
/// <param name="databytes">Raw bytes to inspect.</param>
/// <returns>
/// The result of <c>EncodingHelper.EmailEncoding.ScanCharSet</c> when it is non-null;
/// otherwise a new result carrying the detector's charset, or null when the detector
/// reports no (or a blank) charset.
/// </returns>
public static EmailEncodingResult GetEmailEncoding(byte[] databytes)
{
    var scanned = EncodingHelper.EmailEncoding.ScanCharSet(databytes);
    if (scanned != null)
    {
        return scanned;
    }

    var charsetDetector = new Ude.CharsetDetector();
    charsetDetector.Feed(databytes, 0, databytes.Length);
    charsetDetector.DataEnd();

    if (string.IsNullOrWhiteSpace(charsetDetector.Charset))
    {
        return null;
    }

    var fallback = new EmailEncodingResult();
    fallback.Charset = charsetDetector.Charset;
    return fallback;
}
/// <summary>
/// Chooses between UTF-8 and the system default encoding for a file.
/// </summary>
/// <param name="path">Path of the file to inspect.</param>
/// <returns>
/// <see cref="Encoding.UTF8"/> when the detector reports "UTF-8"; <see cref="Encoding.Default"/>
/// in every other case, including detection failure and any exception.
/// </returns>
private Encoding GetEncoding(string path)
{
    try
    {
        using (FileStream input = File.OpenRead(path))
        {
            var detector = new Ude.CharsetDetector();
            detector.Feed(input);
            detector.DataEnd();

            // Every charset other than UTF-8 (ASCII included) maps to the default encoding.
            return detector.Charset == "UTF-8" ? Encoding.UTF8 : Encoding.Default;
        }
    }
    catch
    {
        return Encoding.Default;
    }
}
/// <summary>
/// Collects encoding metadata for a file: whether it has a byte order mark, the charset
/// and confidence reported by the charset detector, and the encoding a
/// <see cref="StreamReader"/> selects for it.
/// </summary>
/// <param name="sourceFile">Path of the file to inspect.</param>
/// <returns>A TextEncodingMetadata object</returns>
public static TextEncodingMetadata GetFileEncoding(string sourceFile)
{
    var metaData = new TextEncodingMetadata();

    EncodingMetaInfo metaInfo = DetectBom(sourceFile);
    metaData.HasBom = metaInfo.HasBom;

    using (FileStream fs = File.OpenRead(sourceFile))
    {
        var cdet = new Ude.CharsetDetector();
        cdet.Feed(fs);
        cdet.DataEnd();
        if (cdet.Charset != null)
        {
            metaData.CharacterSet = cdet.Charset;
            metaData.DetectionConfidence = cdet.Confidence;
        }
        else
        {
            Console.WriteLine("Detection failed.");
        }
    }

    using (var sr = new StreamReader(sourceFile))
    {
        // StreamReader performs BOM auto-detection on the first read, so CurrentEncoding
        // only reflects the file after something has been read. Peek triggers that
        // without consuming data.
        sr.Peek();
        metaData.FileEncoding = sr.CurrentEncoding;
    }

    return metaData;
}
/// <summary>
/// Rewrites a file in place as UTF-8, decoding it with the auto-detected charset.
/// Does nothing when no charset is detected.
/// </summary>
/// <param name="fileName">Path of the file to convert.</param>
private void ConvertToUtf8(string fileName)
{
    string detected;
    using (FileStream input = File.OpenRead(fileName))
    {
        var detector = new Ude.CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();
        detected = detector.Charset;
    }

    if (detected == null)
    {
        return;
    }

    string content;
    using (var source = new StreamReader(fileName, Encoding.GetEncoding(detected)))
    {
        content = source.ReadToEnd();
    }

    // The writer is opened (append: false) even for empty content, as before.
    using (var target = new StreamWriter(fileName, false, Encoding.UTF8))
    {
        if (content.Length != 0)
        {
            target.Write(content);
        }
    }
}
/// <summary>
/// Reads the tab's original file with its auto-detected encoding and pushes the content
/// to the tab; optionally stores the detected encoding on the tab.
/// </summary>
/// <param name="ReplaceEncoding">When true, the tab's encoding and BOM flag are updated from the detection result.</param>
/// <returns>True on success; false when an exception occurred (an error dialog is shown and the tab is deleted).</returns>
public async Task<bool> ReadFile(bool ReplaceEncoding)
{
    try
    {
        StorageFile file = AsyncHelpers.RunSync(() => StorageFile.GetFileFromPathAsync(Tab.TabOriginalPathContent).AsTask());
        string encode_type = "";
        bool encode_bom = true;

        await Task.Run(() =>
        {
            using (FileStream fs = File.OpenRead(Tab.TabOriginalPathContent))
            {
                var cdet = new Ude.CharsetDetector();
                cdet.Feed(fs);
                cdet.DataEnd();
                if (cdet.Charset != null)
                {
                    encode_type = cdet.Charset;
                }
            }
        });

        // Apply the fallback BEFORE the first Encoding.GetEncoding call:
        // GetEncoding("") throws, which made every failed detection an error.
        if (encode_type == "")
        {
            encode_type = "utf-8";
        }

        Encoding encoding = Encoding.GetEncoding(encode_type);
        if (Encoding.UTF8.CodePage == encoding.CodePage)
        {
            encode_bom = false;
        }

        using (var st = new StreamReader(await file.OpenStreamForReadAsync(), encoding))
        {
            await TabsWriteManager.PushTabContentViaIDAsync(new TabID { ID_Tab = Tab.ID, ID_TabsList = ListTabsID }, st.ReadToEnd(), true);

            if (ReplaceEncoding)
            {
                Tab.TabEncoding = encoding.CodePage;
                Tab.TabEncodingWithBOM = encode_bom;
                await TabsWriteManager.PushUpdateTabAsync(Tab, ListTabsID, true);
            }
        }

        return true;
    }
    catch (Exception e)
    {
        await DispatcherHelper.ExecuteOnUIThreadAsync(async () =>
        {
            await new MessageDialog(e.Message, new ResourceLoader().GetString("popup-errorreadingfile")).ShowAsync();
            await TabsWriteManager.DeleteTabAsync(new TabID { ID_Tab = Tab.ID, ID_TabsList = ListTabsID });
        });

        return false;
    }
}
/// <summary>
/// Reads the tab's file with its auto-detected encoding and returns the content.
/// </summary>
/// <returns>The full text content of the file.</returns>
public async Task<string> ReadFileAndGetContent()
{
    StorageFile file = AsyncHelpers.RunSync(() => StorageFile.GetFileFromPathAsync(Tab.PathContent).AsTask());
    string encode_type = "";

    await Task.Run(() =>
    {
        using (FileStream fs = File.OpenRead(Tab.PathContent))
        {
            var cdet = new Ude.CharsetDetector();
            cdet.Feed(fs);
            cdet.DataEnd();
            if (cdet.Charset != null)
            {
                encode_type = cdet.Charset;
            }
        }
    });

    // Encoding.GetEncoding("") throws, so fall back to UTF-8 when nothing was detected.
    if (encode_type == "")
    {
        encode_type = "utf-8";
    }

    using (var st = new StreamReader(await file.OpenStreamForReadAsync(), Encoding.GetEncoding(encode_type)))
    {
        return st.ReadToEnd();
    }
}
/// <summary>
/// Determines the encoding name of a subtitle file.
/// </summary>
/// <remarks>
/// Resolution order: byte order mark, then the ANSI code page of the neutral culture
/// matching <paramref name="subtitleLanguage"/>, then charset detection on the file
/// content (confidence of at least 0.1), then the system default encoding.
/// </remarks>
/// <param name="subtitleSource">Path of the subtitle file.</param>
/// <param name="subtitleLanguage">Two-letter ISO language name, or null/empty to skip the language lookup.</param>
/// <returns>The upper-case encoding name, or null when <paramref name="subtitleSource"/> is null or empty.</returns>
protected string GetSubtitleEncoding(string subtitleSource, string subtitleLanguage)
{
    if (string.IsNullOrEmpty(subtitleSource))
    {
        return null;
    }

    byte[] buffer = File.ReadAllBytes(subtitleSource);
    int length = buffer.Length;

    // Use byte order mark if any. Each probe checks the length first so empty or
    // very short files no longer raise IndexOutOfRangeException.
    if (length >= 4 && buffer[0] == 0x00 && buffer[1] == 0x00 && buffer[2] == 0XFE && buffer[3] == 0XFF)
    {
        return "UTF-32";
    }
    else if (length >= 4 && buffer[0] == 0XFF && buffer[1] == 0XFE && buffer[2] == 0x00 && buffer[3] == 0x00)
    {
        return "UTF-32";
    }
    else if (length >= 2 && buffer[0] == 0XFE && buffer[1] == 0XFF)
    {
        return "UNICODEBIG";
    }
    else if (length >= 2 && buffer[0] == 0XFF && buffer[1] == 0XFE)
    {
        return "UNICODELITTLE";
    }
    else if (length >= 3 && buffer[0] == 0XEF && buffer[1] == 0XBB && buffer[2] == 0XBF)
    {
        return "UTF-8";
    }
    else if (length >= 3 && buffer[0] == 0X2B && buffer[1] == 0X2F && buffer[2] == 0x76)
    {
        return "UTF-7";
    }

    // Detect encoding from language
    if (!string.IsNullOrEmpty(subtitleLanguage))
    {
        string wantedLanguage = subtitleLanguage.ToUpperInvariant();
        foreach (CultureInfo culture in CultureInfo.GetCultures(CultureTypes.NeutralCultures))
        {
            if (culture.TwoLetterISOLanguageName.ToUpperInvariant() == wantedLanguage)
            {
                return Encoding.GetEncoding(culture.TextInfo.ANSICodePage).BodyName.ToUpperInvariant();
            }
        }
    }

    // Detect encoding from file
    Ude.CharsetDetector cdet = new Ude.CharsetDetector();
    cdet.Feed(buffer, 0, buffer.Length);
    cdet.DataEnd();
    if (cdet.Charset != null && cdet.Confidence >= 0.1)
    {
        return Encoding.GetEncoding(cdet.Charset).BodyName.ToUpperInvariant();
    }

    // Use windows encoding
    return Encoding.Default.BodyName.ToUpperInvariant();
}
/// <summary>
/// Detects the text encoding of the data in a stream.
/// </summary>
/// <param name="stream">Stream fed to the charset detector.</param>
/// <returns>The encoding for the detected charset, or null when no charset was detected.</returns>
public static Encoding GetEncoding(Stream stream)
{
    var detector = new Ude.CharsetDetector();
    detector.Feed(stream);
    detector.DataEnd();

    if (detector.Charset == null)
    {
        return null;
    }

    return Encoding.GetEncoding(detector.Charset);
}
/// <summary>
/// Detects the text encoding of a slice of a byte array.
/// </summary>
/// <param name="bytes">Buffer containing the data.</param>
/// <param name="offset">Offset passed to the detector's Feed call.</param>
/// <param name="length">Length passed to the detector's Feed call.</param>
/// <returns>The encoding for the detected charset, or null when no charset was detected.</returns>
public static Encoding GetEncoding(byte[] bytes, int offset, int length)
{
    var detector = new Ude.CharsetDetector();
    detector.Feed(bytes, offset, length);
    detector.DataEnd();

    if (detector.Charset == null)
    {
        return null;
    }

    return Encoding.GetEncoding(detector.Charset);
}
/// <summary>
/// Detects the charset of <c>_file</c> and stores the matching encoding in
/// <c>_encoding</c>, falling back to UTF-8 when no charset is detected.
/// </summary>
private void GuessEncoding()
{
    using var input = File.OpenRead(_file);

    var detector = new Ude.CharsetDetector();
    detector.Feed(input);
    detector.DataEnd();

    if (detector.Charset == null)
    {
        _encoding = Encoding.UTF8;
    }
    else
    {
        _encoding = Encoding.GetEncoding(detector.Charset);
    }
}
/// <summary>
/// Runs charset detection over a stream.
/// </summary>
/// <param name="stream">Stream fed to the charset detector.</param>
/// <returns>The charset name reported by the detector (may be null).</returns>
private string GetCharset(Stream stream)
{
    var charsetDetector = new Ude.CharsetDetector();
    charsetDetector.Feed(stream);
    charsetDetector.DataEnd();

    string detected = charsetDetector.Charset;
    return detected;
}
/// <summary>
/// Runs charset detection over the UTF-8 bytes of a string.
/// </summary>
/// <remarks>
/// Per the original author's note, this only returns ASCII or UTF-8 and is currently
/// used for e-mail only.
/// </remarks>
/// <param name="text">Text to inspect.</param>
/// <returns>The charset name reported by the detector.</returns>
public static string GetTextCharset(string text)
{
    byte[] utf8Bytes = System.Text.Encoding.UTF8.GetBytes(text);

    var charsetDetector = new Ude.CharsetDetector();
    charsetDetector.Feed(utf8Bytes, 0, utf8Bytes.Length);
    charsetDetector.DataEnd();

    string detected = charsetDetector.Charset;
    return detected;
}
/// <summary>
/// Runs charset detection over an open file stream.
/// </summary>
/// <param name="fs">Stream fed to the charset detector.</param>
/// <returns>The detected charset name, or the literal string "Error" when none was detected.</returns>
public static string DetectEncoding(FileStream fs)
{
    var detector = new Ude.CharsetDetector();
    detector.Feed(fs);
    detector.DataEnd();

    return detector.Charset ?? "Error";
}
/// <summary>
/// Detects the text encoding of a file.
/// </summary>
/// <param name="fileName">Path of the file to inspect.</param>
/// <returns>The encoding for the detected charset, or <c>_DEFAULT_ENCODING</c> when none was detected.</returns>
private Encoding GetEncoding(string fileName)
{
    using (FileStream input = File.OpenRead(fileName))
    {
        Ude.ICharsetDetector detector = new Ude.CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        return detector.Charset == null
            ? _DEFAULT_ENCODING
            : Encoding.GetEncoding(detector.Charset);
    }
}
/// <summary>
/// Runs charset detection over a file.
/// </summary>
/// <param name="fileName">Path of the file to inspect.</param>
/// <returns>The detected charset name, or an empty string when none was detected.</returns>
public static string GetCharset(string fileName)
{
    using (FileStream input = File.OpenRead(fileName))
    {
        var detector = new Ude.CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        return detector.Charset ?? "";
    }
}
/// <summary>
/// Lets the user pick an SRT file, detects its character encoding and adds it to the
/// SRT subtitle list. The warning visibility is refreshed whether or not a file was picked.
/// </summary>
private void AddSrtSubtitleImpl()
{
    string srtFile = FileService.Instance.GetFileNameLoad(
        Config.RememberPreviousFiles ? Config.LastSrtFolder : null,
        SubtitleRes.SrtFilePickerText,
        Utilities.GetFilePickerFilter("srt"));

    if (srtFile == null)
    {
        this.UpdateWarningVisibility();
        return;
    }

    if (Config.RememberPreviousFiles)
    {
        Config.LastSrtFolder = Path.GetDirectoryName(srtFile);
    }

    string characterCode = null;
    using (FileStream srtFileStream = File.OpenRead(srtFile))
    {
        var charsetDetector = new Ude.CharsetDetector();
        charsetDetector.Feed(srtFileStream);
        charsetDetector.DataEnd();

        if (charsetDetector.Charset != null)
        {
            this.logger.Log($"Detected encoding {charsetDetector.Charset} for {srtFile} with confidence {charsetDetector.Confidence}.");

            // Map the detector's charset name onto one of the available character codes.
            characterCode = CharCode.FromUdeCode(charsetDetector.Charset);
            if (characterCode != null)
            {
                this.logger.Log("Picked encoding " + characterCode);
            }
            else
            {
                this.logger.Log("Detected encoding does not match with any available encoding.");
            }
        }

        // Either detection failed or the charset could not be mapped: tell the user and use UTF-8.
        if (characterCode == null)
        {
            Ioc.Get<IMessageBoxService>().Show(this, SubtitleRes.SubtitleCharsetDetectionFailedMessage);
            characterCode = "UTF-8";
        }
    }

    var subtitle = new SrtSubtitle
    {
        FileName = srtFile,
        Default = false,
        CharacterCode = characterCode,
        LanguageCode = LanguageUtilities.GetDefaultLanguageCode(),
        Offset = 0
    };
    this.srtSubtitles.Add(new SrtSubtitleViewModel(this, subtitle));

    this.UpdateWarningVisibility();
}
/// <summary>
/// Runs charset detection over a file.
/// </summary>
/// <param name="filename">Path of the file to inspect.</param>
/// <returns>The charset name reported by the detector (may be null).</returns>
private static string GetEncoding(string filename)
{
    using (var input = File.OpenRead(filename))
    {
        var detector = new Ude.CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        string detected = detector.Charset;
        return detected;
    }
}
/// <summary>
/// Detects the encoding of a stream and rewinds the stream to position 0 afterwards.
/// </summary>
/// <param name="stream">Stream to inspect; its Position is set to 0 before returning.</param>
/// <returns>The encoding mapped from the detected charset, or UTF-8 when none was detected.</returns>
private Encoding DetectEncoding(Stream stream)
{
    var detector = new Ude.CharsetDetector();
    detector.Feed(stream);
    detector.DataEnd();

    Encoding detected = detector.Charset == null
        ? Encoding.UTF8
        : GetEncodingFromString(detector.Charset);

    stream.Position = 0;
    return detected;
}
/// <summary>
/// Detects the encoding of textual data of the specified input data.
/// </summary>
/// <param name="inputData">The input data.</param>
/// <param name="start">Index of the first byte to examine.</param>
/// <param name="count">Number of bytes to examine, beginning at <paramref name="start"/>.</param>
/// <returns>Detected encoding name</returns>
private string Detect(byte[] inputData, int start, int count)
{
    if (_done)
    {
        return _encodingName;
    }

    if (!_started)
    {
        Reset();
        _started = true;

        if (!CheckForTextualData(inputData, start, count))
        {
            _done = true;
            return _encodingName;
        }
    }

    // execute charset detector
    _ude.Feed(inputData, start, count);
    _ude.DataEnd();
    if (_ude.IsDone() && !string.IsNullOrEmpty(_ude.Charset))
    {
        IncrementFrequency(_ude.Charset);
        _done = true;
        return _encodingName;
    }

    // singular buffer detection
    // NOTE(review): the detector is reset once for the whole pass, not once per chunk — confirm this is intended.
    _singleUde.Reset();
    const int udeFeedSize = 4 * 1024;

    // count is a length (it is passed as such to Feed above), so the scanned
    // window is [start, start + count); the chunk loop must use the same bounds.
    int end = start + count;
    int step = count < udeFeedSize ? count : udeFeedSize;
    for (int pos = start; pos < end; pos += step)
    {
        _singleUde.Feed(inputData, pos, pos + step > end ? end - pos : step);
        _singleUde.DataEnd();

        // update encoding frequency
        if (_singleUde.Confidence > 0.3 && !string.IsNullOrEmpty(_singleUde.Charset))
        {
            IncrementFrequency(_singleUde.Charset);
        }
    }

    // vote for best encoding and update current encoding name
    _encodingName = GetCurrentEncoding();
    return _encodingName;
}
/// <summary>
/// Detects the text encoding of a file.
/// </summary>
/// <param name="fileName">Path of the file to inspect.</param>
/// <returns>The encoding for the detected charset.</returns>
/// <exception cref="ArgumentException">No charset could be detected.</exception>
private static Encoding GetFileEncoding(string fileName)
{
    using (var input = File.OpenRead(fileName))
    {
        var detector = new Ude.CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        if (detector.Charset != null)
        {
            return Encoding.GetEncoding(detector.Charset);
        }

        throw new ArgumentException("Error in reading charset.");
    }
}
/// <summary>
/// Detects the encoding of a CSV stream and rewinds the stream to its beginning.
/// </summary>
/// <param name="fs">Seekable stream to inspect; it is positioned at offset 0 before returning.</param>
/// <returns>
/// The encoding for the detected charset, or <see cref="System.Text.Encoding.Default"/>
/// when no charset was detected or the detected charset name is not supported.
/// </returns>
public static System.Text.Encoding DetectCsvEncoding(Stream fs)
{
    Ude.CharsetDetector cdet = new Ude.CharsetDetector();
    cdet.Feed(fs);
    cdet.DataEnd();
    fs.Seek(0, SeekOrigin.Begin);

    if (cdet.Charset == null)
    {
        return System.Text.Encoding.Default;
    }

    s_logger.Debug("Charset: {}, confidence: {}", cdet.Charset, cdet.Confidence);

    try
    {
        return System.Text.Encoding.GetEncoding(cdet.Charset);
    }
    catch (System.ArgumentException)
    {
        // GetEncoding throws for unknown/unsupported names instead of returning null,
        // so the intended fallback to the default encoding has to happen here.
        return System.Text.Encoding.Default;
    }
}
/// <summary>
/// Console entry point: prints the detected charset and confidence for "ansi.txt" and
/// "utf8.txt", then waits for a line of input.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    var files = new List<string> { "ansi.txt", "utf8.txt" };

    foreach (string file in files)
    {
        Console.Write(file);

        using (FileStream input = File.OpenRead(file))
        {
            var detector = new Ude.CharsetDetector();
            detector.Feed(input);
            detector.DataEnd();

            if (detector.Charset == null)
            {
                Console.WriteLine("Detection failed.");
            }
            else
            {
                Console.WriteLine("Charset: {0}, confidence: {1}", detector.Charset, detector.Confidence);
            }
        }
    }

    Console.ReadLine();
}
/// <summary>
/// Parses the BMS file at the given path.
/// </summary>
/// <remarks>
/// The file is always decoded with code page 932. A charset-detection pass used to run
/// first, but its result was never applied (the code using it was commented out), so
/// that extra full read of the file has been removed.
/// </remarks>
/// <param name="path">Path of the BMS file.</param>
/// <returns>The parsed <c>BMS</c> object, or null when the file does not exist.</returns>
public BMS Parse(string path)
{
    if (!File.Exists(path))
    {
        return null;
    }

    BMS bms = new BMS();

    //default encoding: Shift-JIS?
    Encoding encoding = Encoding.GetEncoding(932);

    using (StreamReader sr = new StreamReader(path, encoding))
    {
        bms.path = Directory.GetParent(path).FullName;

        String line;
        while ((line = sr.ReadLine()) != null)
        {
            ProcessBMSLine(line.Trim(), bms);
        }
    }

    SetSubtitle(bms.info);
    CalculatePulse(bms);
    FillRealTime(bms);
    return bms;
}
/// <summary>
/// Uses the third-party component "Ude" to work out the correct encoding of the document.
/// </summary>
/// <param name="context">Stream fed to the charset detector.</param>
/// <returns>The encoding for the detected charset, or null when no charset was detected.</returns>
private static Encoding GetUdeEncode(Stream context)
{
    var charsetDetector = new Ude.CharsetDetector();
    charsetDetector.Feed(context);
    charsetDetector.DataEnd();

    if (charsetDetector.Charset == null)
    {
        // No value means nothing was detected.
        return null;
    }

    return Encoding.GetEncoding(ConvertToCorrectEncodingString(charsetDetector.Charset));
}
/// <summary>
/// Detects the text encoding of a file.
/// </summary>
/// <param name="filename">Path of the file to inspect.</param>
/// <returns>The encoding for the detected charset, or <see cref="Encoding.Default"/> when none was detected.</returns>
private Encoding GetEncoding(string filename)
{
    using (FileStream input = File.OpenRead(filename))
    {
        var detector = new Ude.CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        return detector.Charset == null
            ? Encoding.Default
            : Encoding.GetEncoding(detector.Charset);
    }
}
/// <summary>
/// Auto-detects the encoding of a file. For UTF-8 the returned encoding also records
/// whether the file starts with the UTF-8 byte order mark (EF BB BF).
/// </summary>
/// <param name="file">Path of the file to inspect.</param>
/// <returns>
/// <c>defaultEncoding</c> when detection fails; a <see cref="UTF8Encoding"/> with the
/// matching BOM flag for UTF-8; otherwise the encoding for the detected charset.
/// </returns>
protected virtual Encoding detectEncodingFromFile(string file)
{
    using (FileStream fs = File.OpenRead(file))
    {
        var detector = new Ude.CharsetDetector();
        detector.Feed(fs);
        detector.DataEnd();

        if (detector.Charset == null)
        {
            Log.Warn("Problem with detection of encoding for '{0}'", file);
            return defaultEncoding; // good luck
        }

        Log.Debug("Ude: charset '{0}' confidence: '{1}'", detector.Charset, detector.Confidence);

        Encoding enc = Encoding.GetEncoding(detector.Charset);
        if (enc != Encoding.UTF8)
        {
            return enc;
        }

        // Re-read the first bytes to find out whether the file carries a BOM.
        fs.Seek(0, SeekOrigin.Begin);
        bool hasBom = fs.ReadByte() == 0xEF && fs.ReadByte() == 0xBB && fs.ReadByte() == 0xBF;
        if (hasBom)
        {
            return new UTF8Encoding(true);
        }

        return new UTF8Encoding(false);
    }
}
/// <summary>
/// Detects the encoding of textual data of the specified input data.
/// </summary>
/// <param name="inputData">The input data.</param>
/// <param name="start">Index of the first byte to examine.</param>
/// <param name="count">Number of bytes to examine, beginning at <paramref name="start"/>.</param>
/// <returns>Detected encoding name</returns>
public string Detect(byte[] inputData, int start, int count)
{
    if (Done)
    {
        return EncodingName;
    }

    if (!_started)
    {
        Reset();
        _started = true;

        if (!CheckForTextualData(inputData, start, count))
        {
            IsText = false;
            Done = true;
            return EncodingName;
        }

        HasByteOrderMark = CheckForByteOrderMark(inputData, start);
        IsText = true;
    }

    // execute charset detector
    ude.Feed(inputData, start, count);
    ude.DataEnd();
    if (ude.IsDone() && !String.IsNullOrEmpty(ude.Charset))
    {
        Done = true;
        return EncodingName;
    }

    const int bufferSize = 4 * 1024;

    // singular buffer detection; no new chunks are scanned once 2000 votes exist
    if (singleEncodings.Count < 2000)
    {
        // count is a length (it is passed as such to Feed above), so the scanned
        // window is [start, start + count); the chunk loop must use the same bounds.
        var u = new Ude.CharsetDetector();
        int end = start + count;
        int step = Math.Min(count, bufferSize);
        for (var i = start; i < end; i += step)
        {
            u.Reset();
            u.Feed(inputData, i, Math.Min(step, end - i));
            u.DataEnd();

            if (u.Confidence > 0.3 && !String.IsNullOrEmpty(u.Charset))
            {
                singleEncodings.Add(u.Charset);
            }
        }
    }

    return EncodingName;
}
/// <summary>
/// Rewrites a file in place as UTF-8, decoding it with the auto-detected charset.
/// Does nothing when no charset is detected.
/// </summary>
/// <param name="fileName">Path of the file to convert.</param>
private void ConvertToUtf8(string fileName)
{
    string detected;
    using (FileStream input = File.OpenRead(fileName))
    {
        var detector = new Ude.CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();
        detected = detector.Charset;
    }

    if (detected == null)
    {
        return;
    }

    string content;
    using (var source = new StreamReader(fileName, Encoding.GetEncoding(detected)))
    {
        content = source.ReadToEnd();
    }

    // The writer is opened (append: false) even for empty content, as before.
    using (var target = new StreamWriter(fileName, false, Encoding.UTF8))
    {
        if (content.Length != 0)
        {
            target.Write(content);
        }
    }
}
/// <summary>
/// Detects encoding using mozilla universal character detector.
/// </summary>
/// <param name="bytes">sample data</param>
/// <returns>Detected encoding, or null if not detected or if detection threw</returns>
/// <history>
/// [Curtis_Beard]	   12/01/2014	Created
/// </history>
private static Encoding DetectEncodingUsingMozillaUCD(Byte[] bytes)
{
    try
    {
        Ude.ICharsetDetector detector = new Ude.CharsetDetector();
        detector.Feed(bytes, 0, bytes.Length);
        detector.DataEnd();

        return detector.Charset != null
            ? Encoding.GetEncoding(detector.Charset)
            : null;
    }
    catch
    {
        // Best effort: any failure is reported as "not detected".
        return null;
    }
}
/// <summary>
/// Detects the encoding of the CSV file at <paramref name="path"/> +
/// <c>DataLoaderConstants.FileExtCsv</c> and stores it in <c>enc</c>.
/// </summary>
/// <remarks>
/// Only UTF-8, UTF-16LE and UTF-16BE are mapped explicitly; every other charset, and a
/// failed detection, results in <see cref="Encoding.Default"/>.
/// </remarks>
/// <param name="path">File path, without the CSV extension</param>
private void GetEncoding(string path)
{
    string charset;
    using (FileStream input = File.OpenRead(path + DataLoaderConstants.FileExtCsv))
    {
        var detector = new Ude.CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();
        charset = detector.Charset ?? "failed";
    }

    if (charset == "failed")
    {
        enc = Encoding.Default;
        return;
    }

    string normalized = charset.ToLower();
    if (normalized == "utf-8")
    {
        enc = Encoding.UTF8;
    }
    else if (normalized == "utf-16le")
    {
        enc = Encoding.Unicode;
    }
    else if (normalized == "utf-16be")
    {
        enc = Encoding.BigEndianUnicode;
    }
    else
    {
        // "windows-1252" and anything unrecognized use the default encoding.
        enc = Encoding.Default;
    }
}
/// <summary>
/// Auto-detects the encoding of a file.
/// </summary>
/// <param name="file">Path of the file to inspect.</param>
/// <returns>The encoding for the detected charset, or UTF-8 when detection fails.</returns>
protected virtual Encoding detectEncodingFromFile(string file)
{
    using (FileStream input = File.OpenRead(file))
    {
        var detector = new Ude.CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        if (detector.Charset != null)
        {
            Log.Debug("Ude: charset '{0}' confidence: '{1}'", detector.Charset, detector.Confidence);
            return Encoding.GetEncoding(detector.Charset);
        }

        Log.Warn("Problem with detection of encoding for '{0}'", file);
        return Encoding.UTF8; // good luck
    }
}
/// <summary>
/// Converts the ".cs" and ".xaml" files directly inside a directory from Windows-1251
/// to UTF-8.
/// </summary>
/// <remarks>
/// Files detected as ASCII or UTF-8, and files with no detected charset, are left
/// untouched. Files detected as "x-mac-cyrillic" or "windows-1251" are re-read with code
/// page 1251 and rewritten as UTF-8. Any other charset is reported and skipped.
/// </remarks>
/// <param name="d">Directory whose files are processed.</param>
private static void ProcessFilesInDir(string d)
{
    var candidates = Directory.GetFiles(d)
        .Where(p => Path.GetExtension(p) == ".cs" || Path.GetExtension(p) == ".xaml");

    foreach (string f in candidates)
    {
        string charset;
        using (var input = File.OpenRead(f))
        {
            var detector = new Ude.CharsetDetector();
            detector.Feed(input);
            detector.DataEnd();
            charset = detector.Charset;
        }

        if (charset == null || charset == "ASCII" || charset == "UTF-8")
        {
            continue;
        }

        if (charset != "x-mac-cyrillic" && charset != "windows-1251")
        {
            Console.Out.WriteLine($"{charset} - {f} - Skipped");
            continue;
        }

        Encoding cyrillic = Encoding.GetEncoding(1251);
        string content = File.ReadAllText(f, cyrillic);
        File.WriteAllText(f, content, Encoding.UTF8);
    }
}
/// <summary>
/// Acquires encoding related info on one read.
/// </summary>
/// <remarks>
/// The file is read once into memory. A leading byte order mark decides the encoding
/// when present; otherwise the bytes are checked with ContainsInvalidUTF8Bytes and,
/// if that reports invalid bytes, handed to the Ude charset detector. Any exception
/// yields a fresh, empty EncodingFileInfo.
/// </remarks>
/// <param name="file">Path of the file to inspect.</param>
/// <returns>
/// The collected info; left at its freshly-constructed state when the file does not
/// exist or an exception occurred.
/// </returns>
public static EncodingFileInfo GetEncodingFileInfo(String file)
{
    // Number of leading BOM bytes to skip when decoding the contents.
    Int32 startIndex = 0;
    EncodingFileInfo info = new EncodingFileInfo();
    try
    {
        if (File.Exists(file))
        {
            Byte[] bytes = File.ReadAllBytes(file);
            // BOM probes. Order matters: FF FE 00 00 must be tested before the
            // shorter FF FE prefix, and the 5-byte UTF-7 form before the 4-byte one.
            if (bytes.Length > 2 && (bytes[0] == 0xef && bytes[1] == 0xbb && bytes[2] == 0xbf))
            {
                // EF BB BF -> UTF-8
                startIndex = 3;
                info.BomLength = 3;
                info.ContainsBOM = true;
                info.Charset = Encoding.UTF8.WebName;
                info.CodePage = Encoding.UTF8.CodePage;
            }
            else if (bytes.Length > 3 && (bytes[0] == 0xff && bytes[1] == 0xfe && bytes[2] == 0x00 && bytes[3] == 0x00))
            {
                // FF FE 00 00 -> UTF-32
                startIndex = 4;
                info.BomLength = 4;
                info.ContainsBOM = true;
                info.Charset = Encoding.UTF32.WebName;
                info.CodePage = Encoding.UTF32.CodePage;
            }
            else if (bytes.Length > 4 && ((bytes[0] == 0x2b && bytes[1] == 0x2f && bytes[2] == 0x76) && (bytes[3] == 0x38 || bytes[3] == 0x39 || bytes[3] == 0x2b || bytes[3] == 0x2f) && bytes[4] == 0x2D))
            {
                // 2B 2F 76 (38|39|2B|2F) 2D -> UTF-7, five-byte form
                startIndex = 5;
                info.BomLength = 5;
                info.ContainsBOM = true;
                info.Charset = Encoding.UTF7.WebName;
                info.CodePage = Encoding.UTF7.CodePage;
            }
            else if (bytes.Length > 3 && ((bytes[0] == 0x2b && bytes[1] == 0x2f && bytes[2] == 0x76) && (bytes[3] == 0x38 || bytes[3] == 0x39 || bytes[3] == 0x2b || bytes[3] == 0x2f)))
            {
                // 2B 2F 76 (38|39|2B|2F) -> UTF-7, four-byte form
                startIndex = 4;
                info.BomLength = 4;
                info.ContainsBOM = true;
                info.Charset = Encoding.UTF7.WebName;
                info.CodePage = Encoding.UTF7.CodePage;
            }
            else if (bytes.Length > 1 && (bytes[0] == 0xff && bytes[1] == 0xfe))
            {
                // FF FE -> UTF-16 (Encoding.Unicode)
                startIndex = 2;
                info.BomLength = 2;
                info.ContainsBOM = true;
                info.Charset = Encoding.Unicode.WebName;
                info.CodePage = Encoding.Unicode.CodePage;
            }
            else if (bytes.Length > 1 && (bytes[0] == 0xfe && bytes[1] == 0xff))
            {
                // FE FF -> UTF-16 big endian
                startIndex = 2;
                info.BomLength = 2;
                info.ContainsBOM = true;
                info.Charset = Encoding.BigEndianUnicode.WebName;
                info.CodePage = Encoding.BigEndianUnicode.CodePage;
            }
            else
            {
                // No BOM matched: BOM-related fields are not assigned on this path.
                if (!ContainsInvalidUTF8Bytes(bytes))
                {
                    info.Charset = Encoding.UTF8.WebName;
                    info.CodePage = Encoding.UTF8.CodePage;
                }
                else // Try detecting using Ude...
                {
                    Ude.CharsetDetector detector = new Ude.CharsetDetector();
                    detector.Feed(bytes, 0, bytes.Length);
                    detector.DataEnd();
                    if (detector.Charset != null)
                    {
                        Encoding encoding = Encoding.GetEncoding(detector.Charset);
                        info.Charset = encoding.WebName;
                        info.CodePage = encoding.CodePage;
                    }
                    else
                    {
                        // Detection failed: fall back to the system default encoding.
                        info.Charset = Encoding.Default.WebName;
                        info.CodePage = Encoding.Default.CodePage;
                    }
                }
            }
            Int32 contentLength = bytes.Length - startIndex;
            // Contents is only assigned when there are bytes beyond the BOM.
            if (bytes.Length > 0 && bytes.Length > startIndex)
            {
                Encoding encoding = Encoding.GetEncoding(info.CodePage);
                info.Contents = encoding.GetString(bytes, startIndex, contentLength);
            }
        }
    }
    catch (Exception)
    {
        // Any failure discards partial results and returns an empty info object.
        info = new EncodingFileInfo();
    }
    return info;
}
/// <summary>
/// Re-decodes a string whose bytes were interpreted with the wrong encoding, using an
/// auto-detected charset.
/// </summary>
/// <param name="str">Data for reencoding</param>
/// <param name="from">Known Encoding for current string</param>
/// <returns>
/// The string re-decoded with the detected charset, or the original string when it is
/// null/empty, no charset was detected, or the confidence is below 0.92.
/// </returns>
protected virtual string reEncodeString(string str, Encoding from)
{
    if (String.IsNullOrEmpty(str))
    {
        return str;
    }

    byte[] raw = from.GetBytes(str);

    var detector = new Ude.CharsetDetector();
    detector.Feed(raw, 0, raw.Length);
    detector.DataEnd();

    if (detector.Charset == null)
    {
        Log.Debug("reEncodeString: Problem with detection... use the original");
        return str;
    }

    Log.Debug("reEncodeString: charset '{0}' confidence: '{1}'", detector.Charset, detector.Confidence);

    bool confidentEnough = !(detector.Confidence < 0.92f);
    if (!confidentEnough)
    {
        Log.Debug("reEncodeString: Confidence < 0.92 /use the original");
        return str;
    }

    Encoding target = Encoding.GetEncoding(detector.Charset);
    Log.Debug("reEncodeString: '{0}' -> '{1}'", from.EncodingName, target.EncodingName);
    Log.Trace("reEncodeString: original - '{0}'", str);

    string reDecoded = target.GetString(raw);
    return reDecoded;
}
/// <summary>
/// Detects the encoding of a file.
/// </summary>
/// <remarks>
/// Only UTF-8, UTF-16LE and UTF-16BE are mapped explicitly; every other charset, and a
/// failed detection, results in <see cref="Encoding.Default"/>.
/// </remarks>
/// <param name="path">File path</param>
/// <returns>File Encoding</returns>
private static Encoding GetEncoding(string path)
{
    string charset;
    using (FileStream input = File.OpenRead(path))
    {
        var detector = new Ude.CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();
        charset = detector.Charset ?? "failed";
        input.Close();
    }

    if (charset == "failed")
    {
        return Encoding.Default;
    }

    string normalized = charset.ToLower();
    if (normalized == "utf-8")
    {
        return Encoding.UTF8;
    }

    if (normalized == "utf-16le")
    {
        return Encoding.Unicode;
    }

    if (normalized == "utf-16be")
    {
        return Encoding.BigEndianUnicode;
    }

    // "windows-1252" and anything unrecognized use the default encoding.
    return Encoding.Default;
}
/// <summary>
/// Detects the encoding of textual data of the specified input data.
/// </summary>
/// <param name="inputData">The input data.</param>
/// <param name="start">Index of the first byte to examine.</param>
/// <param name="count">Number of bytes to examine, beginning at <paramref name="start"/>.</param>
/// <returns>Detected encoding name</returns>
public string Detect(byte[] inputData, int start, int count)
{
    if (Done)
    {
        return EncodingName;
    }

    if (!_started)
    {
        Reset();
        _started = true;

        if (!CheckForTextualData(inputData, start, count))
        {
            IsText = false;
            Done = true;
            return EncodingName;
        }

        HasByteOrderMark = CheckForByteOrderMark(inputData, start);
        IsText = true;
    }

    // execute charset detector
    ude.Feed(inputData, start, count);
    ude.DataEnd();
    if (ude.IsDone() && !String.IsNullOrEmpty(ude.Charset))
    {
        IncrementFrequency(ude.Charset);
        Done = true;
        return EncodingName;
    }

    // singular buffer detection
    // count is a length (it is passed as such to Feed above), so the scanned
    // window is [start, start + count); the chunk loop must use the same bounds.
    var singleUde = new Ude.CharsetDetector();
    const int udeFeedSize = 4 * 1024;
    int end = start + count;
    int step = Math.Min(count, udeFeedSize);
    for (var pos = start; pos < end; pos += step)
    {
        singleUde.Reset();
        singleUde.Feed(inputData, pos, Math.Min(step, end - pos));
        singleUde.DataEnd();

        // update encoding frequency
        if (singleUde.Confidence > 0.3 && !String.IsNullOrEmpty(singleUde.Charset))
        {
            IncrementFrequency(singleUde.Charset);
        }
    }

    // vote for best encoding and update current encoding name
    EncodingName = GetCurrentEncoding();
    return EncodingName;
}