// This check lives in the API because certain encodings are not grep-able under
// tools we use heavily, like MINGW32's grep in this case (other tools may be
// affected as well).
/// <summary>
/// Reports whether the charset detected for a file can be searched with grep.
/// </summary>
/// <param name="file">File whose bytes are fed to the charset detector.</param>
/// <param name="charSet">Receives the charset name reported by the detector (may be null).</param>
/// <returns><c>false</c> only when the detected charset is "UTF-16LE"; otherwise <c>true</c>.</returns>
public static bool IsEncodingGrepable(FileInfo file, out string charSet)
{
    using (FileStream input = File.OpenRead(file.FullName))
    {
        var detector = new Ude.CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        charSet = detector.Charset;
        // A null charset can never equal the literal, so no separate null test is needed.
        return charSet != "UTF-16LE";
    }
}
/// <summary>
/// Reads a BMS file line by line and builds the corresponding <see cref="BMS"/> object.
/// </summary>
/// <param name="path">Path of the BMS file.</param>
/// <returns>The populated object, or <c>null</c> when <paramref name="path"/> does not exist.</returns>
public BMS Parse(string path)
{
    BMS chart = new BMS();

    // The file is always decoded with code page 932 (Shift-JIS); the detector
    // result below is not applied to the reader.
    Encoding sourceEncoding = Encoding.GetEncoding(932);

    if (!File.Exists(path))
    {
        return null;
    }

    using (FileStream probe = File.OpenRead(path))
    {
        // The detector is run over the file, but its outcome is currently unused.
        Ude.CharsetDetector detector = new Ude.CharsetDetector();
        detector.Feed(probe);
        detector.DataEnd();
    }

    using (StreamReader reader = new StreamReader(path, sourceEncoding))
    {
        chart.path = Directory.GetParent(path).FullName;

        for (String raw = reader.ReadLine(); raw != null; raw = reader.ReadLine())
        {
            ProcessBMSLine(raw.Trim(), chart);
        }
    }

    // Post-processing steps run once the whole file has been consumed.
    SetSubtitle(chart.info);
    CalculatePulse(chart);
    FillRealTime(chart);
    return chart;
}
/// <summary>
/// Guesses the encoding used for the entry names of an archive.
/// </summary>
/// <remarks>
/// The archive is opened with code page 1252, every entry key is turned back into
/// bytes with that same encoding, and those bytes are fed to the charset detector.
/// </remarks>
/// <param name="path">Path of the archive file.</param>
/// <returns>
/// The detected encoding, or the code page 1252 encoding when detection fails or
/// the detected charset name is not supported by the runtime.
/// </returns>
static Encoding DetectArchiveFileEncoding(string path)
{
    Encoding fallback = Encoding.GetEncoding(1252);
    var options = new ReaderOptions { ArchiveEncoding = new ArchiveEncoding(fallback, fallback) };
    var detector = new Ude.CharsetDetector();

    using (var archive = ArchiveFactory.Open(path, options))
    {
        foreach (var entry in archive.Entries)
        {
            byte[] nameBytes = fallback.GetBytes(entry.Key);
            detector.Feed(nameBytes, 0, nameBytes.Length);
        }
    }

    detector.DataEnd();
    Log.Information($"{path} charset: {detector.Charset} confidence: {detector.Confidence}");

    // Charset stays null when the detector could not decide (e.g. an archive with
    // no entries); Encoding.GetEncoding(null) would throw.
    if (string.IsNullOrEmpty(detector.Charset))
    {
        return fallback;
    }

    try
    {
        return Encoding.GetEncoding(detector.Charset);
    }
    catch (ArgumentException)
    {
        // The detector can report names the runtime has no encoding for.
        return fallback;
    }
}
/// <summary>
/// Builds an <see cref="SrtSubtitle"/> for the given file, detecting its character code.
/// </summary>
/// <param name="srtPath">Path of the SRT file.</param>
/// <returns>
/// The subtitle description; <c>null</c> when any exception occurs (the error is logged).
/// When no usable character code is found the user is notified and "UTF-8" is used.
/// </returns>
public SrtSubtitle LoadSrtSubtitle(string srtPath)
{
    try
    {
        string characterCode = null;

        using (FileStream input = File.OpenRead(srtPath))
        {
            Ude.CharsetDetector sniffer = new Ude.CharsetDetector();
            sniffer.Feed(input);
            sniffer.DataEnd();

            if (sniffer.Charset != null)
            {
                this.logger.Log($"Detected encoding {sniffer.Charset} for {srtPath} with confidence {sniffer.Confidence}.");

                // Map the Ude name onto the application's own character codes.
                characterCode = CharCode.FromUdeCode(sniffer.Charset);
                if (characterCode != null)
                {
                    this.logger.Log("Picked encoding " + characterCode);
                }
                else
                {
                    this.logger.Log("Detected encoding does not match with any available encoding.");
                }
            }

            if (characterCode == null)
            {
                // Either detection failed or the name could not be mapped.
                StaticResolver.Resolve <IMessageBoxService>().Show(this, SubtitleRes.SubtitleCharsetDetectionFailedMessage);
                characterCode = "UTF-8";
            }
        }

        var subtitle = new SrtSubtitle
        {
            FileName = srtPath,
            Default = false,
            CharacterCode = characterCode,
            LanguageCode = LanguageUtilities.GetDefaultLanguageCode(),
            Offset = 0
        };
        return subtitle;
    }
    catch (Exception exception)
    {
        this.logger.LogError("Could not load SRT file: " + exception);
        return null;
    }
}
/// <summary>
/// Reads a text file and decodes it with the charset reported by the Ude detector.
/// </summary>
/// <param name="_path">Path of the file to read.</param>
/// <returns>
/// The decoded text without a leading byte order mark. UTF-8 is used when detection
/// fails or when the detected charset name is not supported by the runtime.
/// </returns>
public static string LoadContentFromFile(string _path)
{
    // ReadAllBytes guarantees the whole file is read; a single Stream.Read call
    // is allowed to return fewer bytes than requested.
    byte[] raw = File.ReadAllBytes(_path);

    string charset;
    using (MemoryStream probe = new MemoryStream(raw))
    {
        Ude.CharsetDetector cdet = new Ude.CharsetDetector();
        cdet.Feed(probe);
        cdet.DataEnd();
        charset = cdet.Charset;
    }

    System.Text.Encoding encoding = System.Text.Encoding.UTF8;
    if (charset != null)
    {
        try
        {
            encoding = System.Text.Encoding.GetEncoding(charset);
        }
        catch (ArgumentException)
        {
            // Detector reported a name the runtime cannot map; keep UTF-8.
        }
    }

    string fileContents = encoding.GetString(raw);

    // Strip a decoded byte order mark. The previous StartsWith("") test was always
    // true, so it removed three characters from every file and threw on content
    // shorter than three characters.
    if (fileContents.Length > 0 && fileContents[0] == '\uFEFF')
    {
        fileContents = fileContents.Substring(1);
    }

    // No further conversion is needed: a .NET string is already UTF-16, so encoding
    // it to UTF-8 bytes and decoding it again returns the same text.
    return fileContents;
}
/// <summary>
/// Classifies a file by its detected charset.
/// </summary>
/// <param name="path">Path of the file to inspect.</param>
/// <returns>
/// "1" when the detected charset name contains "windows" (and not "UTF");
/// "0" for UTF charsets, any other charset, and when detection fails.
/// </returns>
public static string GetCode(string path)
{
    string charset;
    using (FileStream fs = File.OpenRead(path))
    {
        Ude.CharsetDetector cdet = new Ude.CharsetDetector();
        cdet.Feed(fs);
        cdet.DataEnd();
        charset = cdet.Charset;
    }

    // Charset is null when the detector cannot decide (e.g. an empty file);
    // calling Contains on it used to throw a NullReferenceException.
    if (charset == null || charset.Contains("UTF"))
    {
        return "0";
    }

    return charset.Contains("windows") ? "1" : "0";
}
/// <summary>
/// Decodes a byte buffer with the charset reported by the Ude detector.
/// </summary>
/// <param name="text">Raw bytes of the text.</param>
/// <returns>
/// The decoded text without a leading byte order mark. UTF-8 is used when detection
/// fails or when the detected charset name is not supported by the runtime.
/// </returns>
public static string LoadContentFromText(byte[] text)
{
    string charset;
    using (MemoryStream probe = new MemoryStream(text))
    {
        Ude.CharsetDetector cdet = new Ude.CharsetDetector();
        cdet.Feed(probe);
        cdet.DataEnd();
        charset = cdet.Charset;
    }

    System.Text.Encoding encoding = System.Text.Encoding.UTF8;
    if (charset != null)
    {
        try
        {
            encoding = System.Text.Encoding.GetEncoding(charset);
        }
        catch (ArgumentException)
        {
            // Detector reported a name the runtime cannot map; keep UTF-8.
        }
    }

    // The buffer is decoded directly; copying it through the stream first is unnecessary.
    string textContents = encoding.GetString(text);

    // Strip a decoded byte order mark. The previous StartsWith("") test was always
    // true, so it removed three characters from every input and threw on content
    // shorter than three characters.
    if (textContents.Length > 0 && textContents[0] == '\uFEFF')
    {
        textContents = textContents.Substring(1);
    }

    // No further conversion is needed: a .NET string is already UTF-16, so encoding
    // it to UTF-8 bytes and decoding it again returns the same text.
    return textContents;
}
/// <summary>
/// Determines the encoding of a file, first with Ude and then with StreamReader's own detection.
/// </summary>
/// <param name="file">File to inspect.</param>
/// <returns>The detected encoding, or <c>null</c> when both attempts fail.</returns>
public static Encoding DetectEncoding(FileInfo file)
{
    string fullPath = file.FullName;
    Encoding detected = null;

    // First attempt: Ude. Any failure here is ignored on purpose.
    try
    {
        using (FileStream input = File.OpenRead(fullPath))
        {
            Ude.CharsetDetector sniffer = new Ude.CharsetDetector();
            sniffer.Feed(input);
            sniffer.DataEnd();

            string charset = sniffer.Charset;
            if (charset != null)
            {
                detected = GetEncodingFromUdeCharset(charset);
            }
        }
    }
    catch (Exception)
    {
        // leave as null
    }

    if (detected != null)
    {
        return detected;
    }

    // Second attempt: let StreamReader settle on an encoding by reading one character.
    try
    {
        using (StreamReader reader = new StreamReader(fullPath))
        {
            reader.Read();
            detected = reader.CurrentEncoding;
        }
    }
    catch (IOException)
    {
        // just return null
    }

    return detected;
}
/// <summary>
/// Gets the encoding of a file.
/// </summary>
/// <param name="path">Path of the file.</param>
/// <returns>
/// UTF-8, UTF-16 LE or UTF-16 BE when the detector reports one of them;
/// <see cref="Encoding.Default"/> for every other charset and when detection fails.
/// </returns>
private static Encoding GetEncoding(string path)
{
    string charset;
    using (FileStream input = File.OpenRead(path))
    {
        Ude.CharsetDetector detector = new Ude.CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();
        charset = detector.Charset;
    }

    if (charset == null)
    {
        // Detection failed.
        return Encoding.Default;
    }

    string key = charset.ToLower();
    if (key == "utf-8")
    {
        return Encoding.UTF8;
    }
    if (key == "utf-16le")
    {
        return Encoding.Unicode;
    }
    if (key == "utf-16be")
    {
        return Encoding.BigEndianUnicode;
    }

    // "windows-1252" and anything else map to the system default.
    return Encoding.Default;
}
/// <summary>
/// Determines the encoding of a response body.
/// </summary>
/// <remarks>
/// Sources are tried in order: the charset declared in the Content-Type value,
/// <c>W3Encoding.PreScanEncoding</c>, the Ude detector, and finally
/// <c>W3Encoding.SystemDefaultEncoding</c>. A charset name the runtime does not
/// support is skipped instead of raising an exception.
/// </remarks>
/// <param name="databytes">Raw body bytes.</param>
/// <param name="ContentType">Optional Content-Type header value.</param>
/// <returns>The first encoding that could be resolved.</returns>
public static Encoding GetEncoding(ref byte[] databytes, string ContentType = null)
{
    Encoding encoding = null;

    if (!string.IsNullOrEmpty(ContentType))
    {
        var charset = Kooboo.Lib.Helper.W3Encoding.ExtractCharset(ContentType);
        if (!string.IsNullOrEmpty(charset))
        {
            try
            {
                return System.Text.Encoding.GetEncoding(charset);
            }
            catch (ArgumentException)
            {
                // The header is external input and may name an unknown charset;
                // fall through to content-based detection.
            }
        }
    }

    encoding = W3Encoding.PreScanEncoding(databytes);
    if (encoding != null)
    {
        return encoding;
    }

    Ude.CharsetDetector detector = new Ude.CharsetDetector();
    detector.Feed(databytes, 0, databytes.Length);
    detector.DataEnd();

    if (!string.IsNullOrWhiteSpace(detector.Charset))
    {
        try
        {
            encoding = System.Text.Encoding.GetEncoding(detector.Charset);
        }
        catch (ArgumentException)
        {
            // Detector reported a name the runtime cannot map.
            encoding = null;
        }
    }

    if (encoding == null)
    {
        encoding = System.Text.Encoding.GetEncoding(W3Encoding.SystemDefaultEncoding);
    }

    return encoding;
}
/// <summary>
/// Loads the file behind the current tab and pushes its decoded content to the tab.
/// </summary>
/// <param name="ReplaceEncoding">
/// When true, the tab's encoding is replaced by the code page used for reading and the tab is updated.
/// </param>
/// <returns>Always <c>true</c> once the content has been pushed.</returns>
public async Task <bool> ReadFile(bool ReplaceEncoding)
{
    // Awaited directly instead of blocking the calling thread on the lookup.
    StorageFile file = await StorageFile.GetFileFromPathAsync(Tab.PathContent);

    // Charset detection reads the whole file, so it runs on a worker thread.
    string encode_type = await Task.Run(() =>
    {
        using (FileStream fs = File.OpenRead(Tab.PathContent))
        {
            var cdet = new Ude.CharsetDetector();
            cdet.Feed(fs);
            cdet.DataEnd();
            return cdet.Charset;
        }
    });

    if (string.IsNullOrEmpty(encode_type))
    {
        encode_type = "utf-8";
    }

    Encoding encoding;
    try
    {
        encoding = Encoding.GetEncoding(encode_type);
    }
    catch (ArgumentException)
    {
        // Detector reported a name the runtime cannot map; read as UTF-8 instead of failing.
        encoding = Encoding.UTF8;
    }

    using (var st = new StreamReader(await file.OpenStreamForReadAsync(), encoding))
    {
        string content = await st.ReadToEndAsync();
        await TabsWriteManager.PushTabContentViaIDAsync(new TabID { ID_Tab = Tab.ID, ID_TabsList = ListTabsID }, content, true);

        if (ReplaceEncoding)
        {
            Tab.TabEncoding = encoding.CodePage;
            await TabsWriteManager.PushUpdateTabAsync(Tab, ListTabsID);
        }
    }

    return true;
}
/// <summary>
/// Opens a text reader for a file, using the charset detected from its first bytes.
/// </summary>
/// <param name="fname">Path of the file.</param>
/// <returns>
/// A reader using the detected encoding; when detection fails or the encoding
/// cannot be used, a reader that relies on byte-order-mark detection.
/// </returns>
StreamReader GetStreamReader(string fname)
{
    const int SampleSize = 5000;

    byte[] sample;
    using (BinaryReader reader = new BinaryReader(File.OpenRead(fname)))
    {
        // ReadBytes returns fewer bytes when the file is shorter than the sample size.
        sample = reader.ReadBytes(SampleSize);
    }

    /* https://github.com/errepi/ude/tree/master/src/Library */
    Ude.CharsetDetector detector = new Ude.CharsetDetector();
    // Feed only what was actually read; a fixed length of 5000 overran the buffer
    // for files smaller than the sample size.
    detector.Feed(sample, 0, sample.Length);
    detector.DataEnd();

    if (detector.Charset != null)
    {
        try
        {
            return new StreamReader(fname, Portable.Text.Encoding.GetEncoding(detector.Charset));
        }
        catch (Exception)
        {
            // Best effort: fall back to the BOM-detecting reader below.
        }
    }

    return new StreamReader(fname, true);
}
/// <summary>
/// Attempts to repair a string that was decoded with the wrong encoding.
/// </summary>
/// <param name="input">Text to repair.</param>
/// <param name="container">Encoding used to turn <paramref name="input"/> back into bytes.</param>
/// <param name="confidence">Minimum detector confidence that is accepted.</param>
/// <returns>
/// <paramref name="input"/> itself when it is null, empty or whitespace; <c>null</c> when
/// no charset is detected or the confidence is below the limit; otherwise the bytes
/// decoded with the detected charset.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="container"/> is null and the input is not blank.</exception>
public string FixEncoding(string input, Encoding container, float confidence = 0.92f)
{
    if (string.IsNullOrWhiteSpace(input))
    {
        return input;
    }

    if (container == null)
    {
        throw new ArgumentNullException(nameof(container));
    }

    byte[] raw = container.GetBytes(input);

    var detector = new Ude.CharsetDetector();
    detector.Feed(raw, 0, raw.Length);
    detector.DataEnd();

    if (detector.Charset == null)
    {
        return null;
    }

    Log.Debug($"{nameof(FixEncoding)}: charset '{detector.Charset}' confidence: '{detector.Confidence}'");

    bool tooUncertain = detector.Confidence < confidence;
    if (tooUncertain)
    {
        Log.Debug($"{nameof(FixEncoding)}: Confidence < {confidence}");
        return null;
    }

    Encoding target = Encoding.GetEncoding(detector.Charset);
    Log.Debug($"ReEncodeString: '{container.EncodingName}' -> '{target.EncodingName}'");
    Log.Trace($"ReEncodeString: original - '{input}'");

    string repaired = target.GetString(raw);
    return repaired;
}
/// <summary>
/// Detects the encoding of the CSV file at <paramref name="path"/> and stores it in <c>enc</c>.
/// </summary>
/// <param name="path">File path without the CSV extension; <c>DataLoaderConstants.FileExtCsv</c> is appended.</param>
private void GetEncoding(string path)
{
    string charset;
    using (FileStream input = File.OpenRead(path + DataLoaderConstants.FileExtCsv))
    {
        Ude.CharsetDetector detector = new Ude.CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();
        charset = detector.Charset;
    }

    // Detection failure, "windows-1252" and every unlisted charset use the system default.
    Encoding chosen = Encoding.Default;
    if (charset != null)
    {
        string key = charset.ToLower();
        if (key == "utf-8")
        {
            chosen = Encoding.UTF8;
        }
        else if (key == "utf-16le")
        {
            chosen = Encoding.Unicode;
        }
        else if (key == "utf-16be")
        {
            chosen = Encoding.BigEndianUnicode;
        }
    }

    enc = chosen;
}
/// <summary>
/// Detects the encoding of the data in a stream.
/// </summary>
/// <param name="stream">Input stream; may be null.</param>
/// <param name="confidence">Receives the detector's confidence, or 0 when nothing was detected.</param>
/// <returns>The detected encoding, or <c>null</c> when the stream is null or no charset is found.</returns>
public Encoding Detect(Stream stream, out float confidence)
{
    confidence = 0;

    if (stream != null)
    {
        var detector = new Ude.CharsetDetector();
        detector.Feed(stream);
        detector.DataEnd();

        if (detector.Charset != null)
        {
            confidence = detector.Confidence;
            Log.Debug($"Detected charset '{detector.Charset}' confidence: '{detector.Confidence}'");
            return Encoding.GetEncoding(detector.Charset);
        }
    }

    return null;
}
/// <summary>
/// Detects the text encoding of a file with a caller-supplied detector.
/// </summary>
/// <param name="filename">Path of the file.</param>
/// <param name="charsetDetector">Detector instance; it is reset before use.</param>
/// <param name="srcEncoding">Encoding returned when the detected charset cannot be resolved.</param>
/// <returns>The detected encoding, or <paramref name="srcEncoding"/> on failure (a message is written to the console).</returns>
private static Encoding GetTextEncoding(string filename, Ude.CharsetDetector charsetDetector, Encoding srcEncoding)
{
    using (FileStream input = File.OpenRead(filename))
    {
        charsetDetector.Reset();
        charsetDetector.Feed(input);
        charsetDetector.DataEnd();

        try
        {
            // Throws when the charset is null or unknown; handled below.
            return Encoding.GetEncoding(charsetDetector.Charset);
        }
        catch (Exception ex)
        {
            Console.WriteLine("Failed to obtain encoding of the file: {0} to {1}. Using source encoding instead. Corrupted characters may occur. {2}", filename, charsetDetector.Charset, ex.Message);
            return srcEncoding;
        }
    }
}
/// <summary>
/// Rewrites the .cs and .xaml files of a directory as UTF-8 when they are detected as Cyrillic code pages.
/// </summary>
/// <remarks>
/// Files detected as "ASCII" or "UTF-8", and files with no detected charset, are left
/// untouched. "x-mac-cyrillic" and "windows-1251" are read as code page 1251. Any other
/// charset is reported on the console and skipped.
/// </remarks>
/// <param name="d">Directory to scan (top level only).</param>
private static void ProcessFilesInDir(string d)
{
    var candidates = Directory.GetFiles(d).Where(candidate =>
    {
        string extension = Path.GetExtension(candidate);
        return extension == ".cs" || extension == ".xaml";
    });

    foreach (string file in candidates)
    {
        Encoding source;
        using (var input = File.OpenRead(file))
        {
            Ude.CharsetDetector detector = new Ude.CharsetDetector();
            detector.Feed(input);
            detector.DataEnd();

            string charset = detector.Charset;
            if (charset == null || charset == "ASCII" || charset == "UTF-8")
            {
                // Nothing detected, or nothing to convert.
                continue;
            }

            if (charset != "x-mac-cyrillic" && charset != "windows-1251")
            {
                Console.Out.WriteLine($"{charset} - {file} - Skipped");
                continue;
            }

            source = Encoding.GetEncoding(1251);
        }

        string content = File.ReadAllText(file, source);
        File.WriteAllText(file, content, Encoding.UTF8);
    }
}
/// <summary>
/// Writes the tab's content to the file at <c>Tab.TabOriginalPathContent</c> using the
/// tab's encoding, then stores the file's modification date on the tab.
/// </summary>
/// <remarks>
/// When the encoding used has the ASCII code page and the tab's replacing request is not
/// <c>Never</c>, the content is run through the Ude detector; if the result resolves to the
/// UTF-8 code page, a dialog offers to switch the tab's encoding. Any exception leads to
/// <c>CreateFile()</c> and, when that reports success, to another call of this method.
/// </remarks>
public async Task WriteFile()
{
    // The whole operation is dispatched through DispatcherHelper.ExecuteOnUIThreadAsync.
    await DispatcherHelper.ExecuteOnUIThreadAsync(async() =>
    {
        try
        {
            StorageFile file = await StorageFile.GetFileFromPathAsync(Tab.TabOriginalPathContent);
            if (file != null)
            {
                // Empty the file before the new content is written through a stream.
                await FileIO.WriteTextAsync(file, string.Empty);
                Encoding TempEncoding = Encoding.GetEncoding(Tab.TabEncoding);
                // UTF-8 without BOM needs its own encoding instance.
                // NOTE(review): this is a reference comparison against Encoding.UTF8 - confirm
                // GetEncoding returns that same instance on the target platform.
                if (TempEncoding == Encoding.UTF8 && !Tab.TabEncodingWithBOM)
                {
                    TempEncoding = new UTF8Encoding(false);
                }
                string Content = await TabsAccessManager.GetTabContentViaIDAsync(new TabID { ID_Tab = Tab.ID, ID_TabsList = ListTabsID });
                using (var rd = new StreamWriter(await file.OpenStreamForWriteAsync(), TempEncoding))
                {
                    rd.Write(Content);
                    rd.Flush();
                    // Explicit Dispose; the using statement disposes the writer again on exit.
                    rd.Dispose();
                }
                //Update DateModified (updated push with "PushUpdateTabAsync" in StorageRouter.WriteFile())
                BasicProperties properties = await file.GetBasicPropertiesAsync();
                Tab.TabDateModified = properties.DateModified.ToString();
                // Only for ASCII tabs, and only if the user has not answered "never" before.
                if (TempEncoding.CodePage == Encoding.ASCII.CodePage && Tab.TabEncodingReplacingRequest != EncodingReplacingRequest.Never)
                {
                    // Copy the content into memory (StreamWriter's default encoding) so the detector can read it.
                    var stream = new MemoryStream();
                    var writer = new StreamWriter(stream);
                    writer.Write(Content);
                    writer.Flush();
                    stream.Position = 0;
                    using (MemoryStream str = stream)
                    {
                        var cdet = new Ude.CharsetDetector();
                        cdet.Reset();
                        cdet.Feed(str);
                        cdet.DataEnd();
                        if (cdet.Charset != null)
                        {
                            // Offer the switch only when the detected charset resolves to the UTF-8 code page.
                            if (Encoding.GetEncoding(cdet.Charset).CodePage == Encoding.UTF8.CodePage)
                            {
                                await DispatcherHelper.ExecuteOnUIThreadAsync(async() =>
                                {
                                    MessageDialog Dialog = new MessageDialog(new ResourceLoader().GetString("popup-changeencodingcontent"), string.Format(new ResourceLoader().GetString("popup-changeencodingtitle"), TempEncoding.EncodingName, cdet.Charset));
                                    // Accept: switch the tab to the detected code page, without BOM.
                                    Dialog.Commands.Add(new UICommand { Label = new ResourceLoader().GetString("popup-changeencodingaccept"), Invoked = async(e) => { Tab.TabEncoding = Encoding.GetEncoding(cdet.Charset).CodePage; Tab.TabEncodingWithBOM = false; Tab.TabEncodingReplacingRequest = EncodingReplacingRequest.NotRequested; await TabsWriteManager.PushUpdateTabAsync(Tab, ListTabsID, false); } });
                                    // Later: record MaybeLater on the tab.
                                    Dialog.Commands.Add(new UICommand { Label = new ResourceLoader().GetString("popup-changeencodinglater"), Invoked = async(e) => { Tab.TabEncodingReplacingRequest = EncodingReplacingRequest.MaybeLater; await TabsWriteManager.PushUpdateTabAsync(Tab, ListTabsID, false); } });
                                    // No: record Never, which disables this check on later writes.
                                    Dialog.Commands.Add(new UICommand { Label = new ResourceLoader().GetString("popup-changeencodingno"), Invoked = async(e) => { Tab.TabEncodingReplacingRequest = EncodingReplacingRequest.Never; await TabsWriteManager.PushUpdateTabAsync(Tab, ListTabsID, false); } });
                                    await Dialog.ShowAsync();
                                });
                            }
                        }
                    }
                }
            }
        }
        catch
        {
            // Any failure: create the file and, if that succeeded, try writing again.
            // NOTE(review): the retry has no visible limit in this method - confirm CreateFile
            // cannot keep returning true while writing keeps failing.
            await CreateFile().ContinueWith(async(e) => { if (e.Result) { await WriteFile(); } });
        }
    });
}
/// <summary>
/// Detects the encoding of textual data of the specified input data.
/// </summary>
/// <remarks>
/// The data is fed to the shared detector first. If that does not produce a final
/// answer, the same range is scanned again in 4 KiB chunks and every chunk-level
/// charset with a confidence above 0.3 is added to <c>singleEncodings</c>.
/// </remarks>
/// <param name="inputData">The input data.</param>
/// <param name="start">Offset of the first byte to examine.</param>
/// <param name="count">Number of bytes to examine, starting at <paramref name="start"/>.</param>
/// <returns>Detected encoding name</returns>
public string Detect(byte[] inputData, int start, int count)
{
    if (Done)
    {
        return EncodingName;
    }

    if (!_started)
    {
        Reset ();
        _started = true;

        if (!CheckForTextualData (inputData, start, count))
        {
            IsText = false;
            Done = true;
            return EncodingName;
        }

        HasByteOrderMark = CheckForByteOrderMark (inputData, start);
        IsText = true;
    }

    // execute charset detector
    ude.Feed (inputData, start, count);
    ude.DataEnd ();
    if (ude.IsDone () && !String.IsNullOrEmpty (ude.Charset))
    {
        Done = true;
        return EncodingName;
    }

    const int bufferSize = 4 * 1024;

    // singular buffer detection
    if (singleEncodings.Count < 2000)
    {
        var u = new Ude.CharsetDetector ();

        // The examined range is [start, start + count), matching ude.Feed above.
        // The loop previously stopped at index `count`, which skipped data (or
        // covered the wrong range) whenever start was greater than zero.
        int end = start + count;
        for (var i = start; i < end; i += bufferSize)
        {
            u.Reset ();
            u.Feed (inputData, i, Math.Min (bufferSize, end - i));
            u.DataEnd ();

            if (u.Confidence > 0.3 && !String.IsNullOrEmpty (u.Charset))
            {
                singleEncodings.Add (u.Charset);
            }
        }
    }

    return EncodingName;
}
/// <summary>
/// Detects encoding using the Mozilla universal character detector (Ude).
/// </summary>
/// <param name="bytes">Sample data.</param>
/// <returns>Detected encoding, or null if nothing was detected or any exception occurred.</returns>
/// <history>
/// [Curtis_Beard]	   12/01/2014	Created
/// </history>
private static Encoding DetectEncodingUsingMozillaUCD(Byte[] bytes)
{
    Encoding detected = null;

    try
    {
        Ude.ICharsetDetector detector = new Ude.CharsetDetector();
        detector.Feed(bytes, 0, bytes.Length);
        detector.DataEnd();

        string charset = detector.Charset;
        if (charset != null)
        {
            detected = Encoding.GetEncoding(charset);
        }
    }
    catch
    {
        // Best effort: every failure is reported to the caller as "not detected".
    }

    return detected;
}
/// <summary>
/// Detects the encoding of a file, distinguishing UTF-8 with and without a byte order mark.
/// </summary>
/// <param name="file">Path of the file.</param>
/// <returns>
/// <c>defaultEncoding</c> when detection fails; a <see cref="UTF8Encoding"/> whose BOM flag
/// mirrors the file's first three bytes when the result is <see cref="Encoding.UTF8"/>;
/// otherwise the detected encoding.
/// </returns>
protected virtual Encoding detectEncodingFromFile(string file)
{
    using(FileStream input = File.OpenRead(file))
    {
        Ude.CharsetDetector detector = new Ude.CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        if(detector.Charset == null) {
            Log.Warn("Problem with detection of encoding for '{0}'", file);
            return defaultEncoding; // good luck
        }

        Log.Debug("Ude: charset '{0}' confidence: '{1}'", detector.Charset, detector.Confidence);
        Encoding resolved = Encoding.GetEncoding(detector.Charset);

        if(resolved != Encoding.UTF8) {
            return resolved;
        }

        // Look at the first three bytes to find out whether the file starts with EF BB BF.
        input.Seek(0, SeekOrigin.Begin);
        bool startsWithBom = input.ReadByte() == 0xEF
                             && input.ReadByte() == 0xBB
                             && input.ReadByte() == 0xBF;
        return new UTF8Encoding(startsWithBom);
    }
}
/// <summary>
/// Re-decodes a string with an auto-detected charset.
/// </summary>
/// <param name="str">Text to re-encode.</param>
/// <param name="from">Encoding used to turn <paramref name="str"/> back into bytes.</param>
/// <returns>
/// The re-decoded text; the original string when it is null or empty, when no
/// charset is detected, or when the detector's confidence is below 0.92.
/// </returns>
protected virtual string reEncodeString(string str, Encoding from)
{
    if(String.IsNullOrEmpty(str)) {
        return str;
    }

    byte[] raw = from.GetBytes(str);

    Ude.CharsetDetector detector = new Ude.CharsetDetector();
    detector.Feed(raw, 0, raw.Length);
    detector.DataEnd();

    if(detector.Charset == null) {
        Log.Debug("reEncodeString: Problem with detection... use the original");
        return str;
    }

    Log.Debug("reEncodeString: charset '{0}' confidence: '{1}'", detector.Charset, detector.Confidence);

    bool confidentEnough = !(detector.Confidence < 0.92f);
    if(!confidentEnough) {
        Log.Debug("reEncodeString: Confidence < 0.92 /use the original");
        return str;
    }

    Encoding target = Encoding.GetEncoding(detector.Charset);
    Log.Debug("reEncodeString: '{0}' -> '{1}'", from.EncodingName, target.EncodingName);
    Log.Trace("reEncodeString: original - '{0}'", str);

    string decoded = target.GetString(raw);
    return decoded;
}
/// <summary>
/// Adds a document from a BLOB.
/// </summary>
/// <remarks>
/// The first bytes of the stream decide how the content is handled: a "%PDF"
/// signature is indexed as PDF, content containing "&lt;html" goes through
/// <c>WebDataSource.FromHtml</c>, and anything else is read as text with the
/// charset reported by the Ude detector.
/// </remarks>
/// <param name="name">Record name.</param>
/// <param name="stream">Seekable stream with the document content.</param>
/// <param name="fields">Optional header fields; may be null.</param>
/// <param name="queue">Queue that receives the created document.</param>
/// <exception cref="InvalidDataException">The index is not of type <c>IndexType.Blob</c>.</exception>
public virtual void AddRecord(string name, Stream stream, string fields, ConcurrentQueue <IIndexDocument> queue)
{
    IIndexDocument doc = null;
    AddRecordBase(name, fields);

    if (indexType != IndexType.Blob)
    {
        throw new InvalidDataException("Adding record of wrong IndexType");
    }

    // Read the header directly from the stream. The previous BinaryReader was
    // disposed right after reading, which also closed the caller's stream
    // before any branch below could use it.
    byte[] buff = new byte[4000];
    int filled = 0;
    int got;
    while (filled < buff.Length && (got = stream.Read(buff, filled, buff.Length - filled)) > 0)
    {
        // A single Read may return fewer bytes than requested.
        filled += got;
    }

    String det = Encoding.UTF8.GetString(buff, 0, filled);
    stream.Seek(0, SeekOrigin.Begin);

    // detect type
    if ((buff[0] == '%') && (buff[1] == 'P') && (buff[2] == 'D') && (buff[3] == 'F'))
    {
        DocumentsDataSource.IndexPDFDocument pdf = new DocumentsDataSource.IndexPDFDocument(name, stream, this);
        if (fields != null)
        {
            pdf.headers = () => { return(fields); };
        }
        doc = pdf;
    }
    else if (det.Contains("<html"))
    {
        // NOTE(review): the HTML document is never assigned to `doc`, so it is not
        // enqueued by this method. Confirm whether FromHtml registers it elsewhere
        // or whether `doc = file;` is missing.
        IndexPagedTextFile file = WebDataSource.FromHtml(stream, name, Name);
        if (fields != null)
        {
            file.SetHeaders(fields);
        }
    }
    else
    {
        // detect charset from the bytes actually read, not from the zero padding
        Ude.CharsetDetector detector = new Ude.CharsetDetector();
        detector.Feed(buff, 0, filled);
        detector.DataEnd();
        if (detector.Charset != null)
        {
            Encoding enc = Portable.Text.Encoding.GetEncoding(detector.Charset);
            using (StreamReader sreader = new StreamReader(stream, enc, false))
            {
                doc = new IndexPagedTextFile("", sreader.ReadToEnd(), fields != null ? fields : "");
            }
        }
    }

    if (doc != null)
    {
        Enqueue(queue, doc);
    }
}
/// <summary>
/// Rewrites a file in place as UTF-8, decoding it with its detected charset.
/// </summary>
/// <remarks>Nothing is done when no charset is detected.</remarks>
/// <param name="fileName">Path of the file to convert.</param>
private void ConvertToUtf8(string fileName)
{
    string detectedName;
    using (FileStream input = File.OpenRead(fileName))
    {
        var detector = new Ude.CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();
        detectedName = detector.Charset;
    }

    if (detectedName == null)
    {
        return;
    }

    string content;
    using (var source = new StreamReader(fileName, Encoding.GetEncoding(detectedName)))
    {
        content = source.ReadToEnd();
    }

    // Opening the writer with append = false truncates the file even when there is no text.
    using (var target = new StreamWriter(fileName, false, Encoding.UTF8))
    {
        if (content.Length > 0)
        {
            target.Write(content);
        }
    }
}
/// <summary>
/// Detects the encoding of a file with the Ude detector.
/// </summary>
/// <param name="file">Path of the file.</param>
/// <returns>The detected encoding; <see cref="Encoding.UTF8"/> when detection fails (a warning is logged).</returns>
protected virtual Encoding detectEncodingFromFile(string file)
{
    using(FileStream input = File.OpenRead(file))
    {
        Ude.CharsetDetector detector = new Ude.CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        if(detector.Charset != null) {
            Log.Debug("Ude: charset '{0}' confidence: '{1}'", detector.Charset, detector.Confidence);
            return Encoding.GetEncoding(detector.Charset);
        }

        Log.Warn("Problem with detection of encoding for '{0}'", file);
        return Encoding.UTF8; // good luck
    }
}
/// <summary>
/// Detects the encoding of the CSV file at <paramref name="path"/> and stores it in <c>enc</c>.
/// </summary>
/// <param name="path">File path without the CSV extension; <c>DataLoaderConstants.FileExtCsv</c> is appended.</param>
private void GetEncoding(string path)
{
    string detectedName = null;
    using (FileStream csv = File.OpenRead(path + DataLoaderConstants.FileExtCsv))
    {
        Ude.CharsetDetector sniffer = new Ude.CharsetDetector();
        sniffer.Feed(csv);
        sniffer.DataEnd();
        detectedName = sniffer.Charset;
    }

    if (detectedName == null)
    {
        // Detection failed.
        enc = Encoding.Default;
        return;
    }

    string normalized = detectedName.ToLower();
    if (normalized == "utf-8")
    {
        enc = Encoding.UTF8;
    }
    else if (normalized == "utf-16le")
    {
        enc = Encoding.Unicode;
    }
    else if (normalized == "utf-16be")
    {
        enc = Encoding.BigEndianUnicode;
    }
    else
    {
        // "windows-1252" and every unlisted charset use the system default.
        enc = Encoding.Default;
    }
}
/// <summary>
/// Scans every *.txt file below a directory, registers the words found and their
/// occurrences, and finally sends all occurrences to the database.
/// </summary>
/// <param name="di">Root directory; sub-directories are included.</param>
public WordOnTxt(DirectoryInfo di)
{
    // Declared outside the loop: when detection fails for a file, the value left by the
    // previous file (initially "1252") is what gets passed to Encoding.GetEncoding.
    // NOTE(review): confirm "1252" is accepted as an encoding name on the target runtime.
    string _encoding = "1252";
    foreach (var fi in di.GetFiles("*.txt", SearchOption.AllDirectories))
    {
        K_Google.AddFile(fi);
        // Distinct words of the current file.
        List <string> lstWord = new List <string>();
        using (FileStream fs = File.OpenRead(fi.FullName))
        {
            Ude.CharsetDetector cdet = new Ude.CharsetDetector();
            cdet.Feed(fs);
            cdet.DataEnd();
            if (cdet.Charset != null)
            {
                _encoding = cdet.Charset;
                //Console.WriteLine(_encoding);
            }
            else
            {
                Console.WriteLine("Detection failed.");
            }
        }
        // Path rebuilt from directory and file name.
        string path = fi.DirectoryName + "\\" + fi.Name;
        Console.WriteLine(path);
        // NOTE(review): the reader is closed explicitly and not wrapped in a using
        // statement, so it stays open if ReadToEnd throws.
        StreamReader Reader = new StreamReader(path, Encoding.GetEncoding(_encoding));
        string polpi = Reader.ReadToEnd();
        Reader.Close();
        // Words are compared in lower case.
        polpi = polpi.ToLower();
        // Characters that separate words.
        Char[] delimiter = new char[] { '[', ']', '#', '^', '¦', '|', '£', '<', '>', '_', '$', '\n', '\r', '.', ' ', ',', '\'', '!', '?', '(', ')', '%', '&', '"', '=', '+', '{', '}', '*', ';', ':', '\\', '-', '/' };
        String[] substrings = polpi.Split(delimiter);
        //List<string> lstSub = new List<string>(substrings);
        // All non-empty tokens of the file, duplicates included.
        List <string> lstSub = new List <string>();
        foreach (string word in substrings)
        {
            if (word.Count() > 0)
            {
                lstSub.Add(word);
            }
        }
        lstSub.Sort();
        // Reduce the tokens to distinct words.
        foreach (var substring in lstSub)
        {
            if (!lstWord.Contains(substring))
            {
                lstWord.Add(substring);
            }
        }
        lstSub.Sort();
        foreach (var word in lstWord)
        {
            AddWord(word);
        }
        // Words of this file for which an occurrence entry was added in this pass.
        List <string> lstWord2 = new List <string>();
        foreach (var substring in lstSub)
        {
            if (!lstWord2.Contains(substring))
            {
                // First sighting in this file: add an occurrence unless lstOccurence
                // already contains an equal one.
                // NOTE(review): what counts as "equal" depends on Ocurrence.Equals, which is not visible here.
                Ocurrence ocucu = new Ocurrence(fi, substring, null);
                if (!lstOccurence.Contains(ocucu))
                {
                    lstOccurence.Add(ocucu);
                    lstWord2.Add(substring);
                }
            }
            else
            {
                // Repeated token: increment every stored occurrence with this word.
                // The match is by word only, not by file.
                foreach (Ocurrence occurence in lstOccurence)
                {
                    if (occurence.Word == substring)
                    {
                        occurence.IncreamentOccurence();
                    }
                }
            }
        }
    }
    // Persist everything once all files have been processed.
    foreach (Ocurrence ocu in lstOccurence)
    {
        ocu.SendToDataBase();
    }
}
/// <summary>
/// Acquires encoding related info on one read.
/// </summary>
/// <remarks>
/// A byte order mark (UTF-8, UTF-32 LE, UTF-7, UTF-16 LE, UTF-16 BE) decides the encoding
/// when present. Without one, content with no invalid UTF-8 bytes is treated as UTF-8;
/// otherwise the Ude detector is asked and <see cref="Encoding.Default"/> is the last resort.
/// </remarks>
/// <param name="file">Path of the file to inspect.</param>
/// <returns>
/// The collected information; an empty <c>EncodingFileInfo</c> when the file does not
/// exist or any exception occurs.
/// </returns>
public static EncodingFileInfo GetEncodingFileInfo(String file)
{
    EncodingFileInfo info = new EncodingFileInfo();
    try
    {
        if (!File.Exists(file))
        {
            return info;
        }

        Byte[] data = File.ReadAllBytes(file);
        Int32 size = data.Length;

        // UTF-7 marks share the prefix 2B 2F 76 followed by one of 38, 39, 2B, 2F.
        Boolean utf7Mark = size > 3
            && data[0] == 0x2b && data[1] == 0x2f && data[2] == 0x76
            && (data[3] == 0x38 || data[3] == 0x39 || data[3] == 0x2b || data[3] == 0x2f);

        Int32 bomLength = 0;
        Encoding resolved;

        if (size > 2 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf)
        {
            resolved = Encoding.UTF8;
            bomLength = 3;
        }
        else if (size > 3 && data[0] == 0xff && data[1] == 0xfe && data[2] == 0x00 && data[3] == 0x00)
        {
            // Must be tested before UTF-16 LE, whose mark is a prefix of this one.
            resolved = Encoding.UTF32;
            bomLength = 4;
        }
        else if (utf7Mark)
        {
            resolved = Encoding.UTF7;
            // The mark may be followed by a 2D byte that belongs to it.
            bomLength = (size > 4 && data[4] == 0x2D) ? 5 : 4;
        }
        else if (size > 1 && data[0] == 0xff && data[1] == 0xfe)
        {
            resolved = Encoding.Unicode;
            bomLength = 2;
        }
        else if (size > 1 && data[0] == 0xfe && data[1] == 0xff)
        {
            resolved = Encoding.BigEndianUnicode;
            bomLength = 2;
        }
        else if (!ContainsInvalidUTF8Bytes(data))
        {
            resolved = Encoding.UTF8;
        }
        else // Try detecting using Ude...
        {
            Ude.CharsetDetector detector = new Ude.CharsetDetector();
            detector.Feed(data, 0, data.Length);
            detector.DataEnd();
            resolved = detector.Charset != null
                ? Encoding.GetEncoding(detector.Charset)
                : Encoding.Default;
        }

        if (bomLength > 0)
        {
            info.BomLength = bomLength;
            info.ContainsBOM = true;
        }
        info.Charset = resolved.WebName;
        info.CodePage = resolved.CodePage;

        // Decode everything after the byte order mark, if anything is left.
        if (size > bomLength)
        {
            Encoding decoder = Encoding.GetEncoding(info.CodePage);
            info.Contents = decoder.GetString(data, bomLength, size - bomLength);
        }
    }
    catch (Exception)
    {
        info = new EncodingFileInfo();
    }
    return info;
}
/// <summary>
/// Detects the encoding of textual data of the specified input data.
/// </summary>
/// <remarks>
/// The data is fed to the shared detector first. If that does not produce a final
/// answer, the same range is scanned again in 4 KiB chunks; every chunk-level charset
/// with a confidence above 0.3 counts as a vote, and the current best encoding is
/// taken from <c>GetCurrentEncoding</c>.
/// </remarks>
/// <param name="inputData">The input data.</param>
/// <param name="start">Offset of the first byte to examine.</param>
/// <param name="count">Number of bytes to examine, starting at <paramref name="start"/>.</param>
/// <returns>Detected encoding name</returns>
public string Detect (byte[] inputData, int start, int count)
{
    if (Done)
    {
        return EncodingName;
    }

    if (!_started)
    {
        Reset ();
        _started = true;

        if (!CheckForTextualData (inputData, start, count))
        {
            IsText = false;
            Done = true;
            return EncodingName;
        }

        HasByteOrderMark = CheckForByteOrderMark (inputData, start);
        IsText = true;
    }

    // execute charset detector
    ude.Feed (inputData, start, count);
    ude.DataEnd ();
    if (ude.IsDone () && !String.IsNullOrEmpty (ude.Charset))
    {
        IncrementFrequency (ude.Charset);
        Done = true;
        return EncodingName;
    }

    // singular buffer detection
    var singleUde = new Ude.CharsetDetector ();
    const int udeFeedSize = 4 * 1024;

    // The examined range is [start, start + count), matching ude.Feed above.
    // The loop previously stopped at index `count`, which skipped data (or
    // covered the wrong range) whenever start was greater than zero.
    int end = start + count;
    for (var pos = start; pos < end; pos += udeFeedSize)
    {
        singleUde.Reset ();
        singleUde.Feed (inputData, pos, Math.Min (udeFeedSize, end - pos));
        singleUde.DataEnd ();

        // update encoding frequency
        if (singleUde.Confidence > 0.3 && !String.IsNullOrEmpty (singleUde.Charset))
        {
            IncrementFrequency (singleUde.Charset);
        }
    }

    // vote for best encoding
    EncodingName = GetCurrentEncoding ();

    // update current encoding name
    return EncodingName;
}
/// <summary>
/// Gets the encoding of a file.
/// </summary>
/// <param name="path">Path of the file.</param>
/// <returns>
/// UTF-8, UTF-16 LE or UTF-16 BE when the detector reports one of them;
/// <see cref="Encoding.Default"/> for every other charset and when detection fails.
/// </returns>
private static Encoding GetEncoding(string path)
{
    string detectedName = null;
    using (FileStream source = File.OpenRead(path))
    {
        Ude.CharsetDetector sniffer = new Ude.CharsetDetector();
        sniffer.Feed(source);
        sniffer.DataEnd();
        detectedName = sniffer.Charset;
    }

    // Detection failure falls through to the system default below.
    string normalized = detectedName == null ? null : detectedName.ToLower();

    Encoding result;
    if (normalized == "utf-8")
    {
        result = Encoding.UTF8;
    }
    else if (normalized == "utf-16le")
    {
        result = Encoding.Unicode;
    }
    else if (normalized == "utf-16be")
    {
        result = Encoding.BigEndianUnicode;
    }
    else
    {
        // "windows-1252", unlisted charsets and failed detection.
        result = Encoding.Default;
    }

    return result;
}
/// <summary>
/// Converts every selected file below the input directory to the requested output encoding.
/// </summary>
/// <remarks>
/// Files are selected by extension: the white list (or "*") must match and the black
/// list must not. A file is converted only when its encoding is detected and differs
/// from the output encoding; the result is written to the same relative location below
/// the output directory, which defaults to the input directory.
/// </remarks>
/// <param name="options">Directories, output encoding name, extension lists and logging switch.</param>
public Action(Options options)
{
    OpenOrCreateDirectory(options.InputDirectory, false);
    if (string.IsNullOrEmpty(options.OutputDirectory))
    {
        options.OutputDirectory = options.InputDirectory;
    }
    OpenOrCreateDirectory(options.OutputDirectory, true);

    string inputDirectory = new DirectoryInfo(options.InputDirectory).FullName;
    string outputDirectory = new DirectoryInfo(options.OutputDirectory).FullName;
    Encoding outputEncoding = Encoding.GetEncoding(options.OutputEncoding);

    string[] allFiles = Directory.GetFiles(options.InputDirectory, "*", SearchOption.AllDirectories);

    HashSet <string> extensionWhiteList = options.ExtensionWhiteList == null
        ? new HashSet <string>()
        : new HashSet <string>(options.ExtensionWhiteList);
    bool allExtension = extensionWhiteList.Contains("*");
    HashSet <string> extensionBlackList = options.ExtensionBlackList == null
        ? new HashSet <string>()
        : new HashSet <string>(options.ExtensionBlackList);

    List <FileInfo> inputFiles = new List <FileInfo>();
    foreach (string candidate in allFiles)
    {
        FileInfo candidateInfo = new FileInfo(candidate);
        bool accepted = (allExtension || extensionWhiteList.Contains(candidateInfo.Extension))
                        && !extensionBlackList.Contains(candidateInfo.Extension);
        if (accepted)
        {
            inputFiles.Add(candidateInfo);
        }
        else if (options.OutputLog)
        {
            Console.WriteLine("Ignore file: " + candidateInfo.FullName);
        }
    }

    int convertedFileCount = 0;
    foreach (FileInfo iterFile in inputFiles)
    {
        Encoding inputEncoding = null;
        using (FileStream fs = File.OpenRead(iterFile.FullName))
        {
            Ude.CharsetDetector cdet = new Ude.CharsetDetector();
            cdet.Feed(fs);
            cdet.DataEnd();
            if (cdet.Charset != null)
            {
                inputEncoding = Encoding.GetEncoding(cdet.Charset);
            }
            else if (options.OutputLog)
            {
                Console.WriteLine(string.Format("{0} Cant detector file encoding", iterFile.FullName));
            }
        }

        if (inputEncoding == null)
        {
            // Detection failed and was reported above; the "already is" message
            // below would be wrong for this file.
            continue;
        }

        // Value comparison: two Encoding instances for the same charset are not
        // guaranteed to be the same object.
        if (inputEncoding.Equals(outputEncoding))
        {
            if (options.OutputLog)
            {
                Console.WriteLine(string.Format("File encoding already is ({0}): {1}", options.OutputEncoding, iterFile.FullName));
            }
            continue;
        }

        string targetPath = iterFile.FullName.Replace(inputDirectory, outputDirectory);

        // Files come from all sub-directories; the matching sub-directory may not
        // exist yet below a separate output directory.
        string targetDirectory = Path.GetDirectoryName(targetPath);
        if (!string.IsNullOrEmpty(targetDirectory))
        {
            Directory.CreateDirectory(targetDirectory);
        }

        File.WriteAllBytes(targetPath, Encoding.Convert(inputEncoding, outputEncoding, File.ReadAllBytes(iterFile.FullName)));
        convertedFileCount++;
        Console.WriteLine("Converted File: " + iterFile.FullName);
    }

    Console.WriteLine(string.Format("Converted {0} files", convertedFileCount));
}
/// <summary>
///     Wraps the given <paramref name="inputFile"/> in HTML pre tags
/// </summary>
/// <remarks>
///     The text is read line by line, HTML encoded and written as a UTF-16 document
///     (with byte order mark) to the file returned by <c>GetTempFile</c>. The pre
///     element is styled with the <c>WhiteSpace</c>, <c>FontFamily</c>,
///     <c>FontStyle</c> and <c>FontSize</c> members; each font property is only
///     emitted when its own value is not blank.
/// </remarks>
/// <param name="inputFile">The input file</param>
/// <param name="encoding">
///     The encoding used in the input file. When <c>null</c> the encoding is detected
///     from the file content, falling back to <see cref="Encoding.Default"/>.
/// </param>
/// <returns>The wrapped HTML file</returns>
public string WrapFile(string inputFile, Encoding encoding)
{
    var temp = Path.GetFileName(inputFile) ?? string.Empty;
    var title = WebUtility.HtmlEncode(temp);
    var tempFile = GetTempFile;

    WriteToLog($"Reading text file '{inputFile}'");

    if (encoding == null)
    {
        var charsetDetector = new Ude.CharsetDetector();
        using (var fileStream = File.OpenRead(inputFile))
        {
            WriteToLog("Trying to detect encoding");
            charsetDetector.Feed(fileStream);
            charsetDetector.DataEnd();
        }

        if (charsetDetector.Charset != null)
        {
            try
            {
                encoding = Encoding.GetEncoding(charsetDetector.Charset);
            }
            catch (Exception exception) when (exception is ArgumentException || exception is NotSupportedException)
            {
                // The detected charset name is not available on this runtime;
                // handled by the fallback below.
                encoding = null;
            }
        }

        if (encoding == null)
        {
            Console.WriteLine("Detection failed assuming standard encoding");
            encoding = Encoding.Default;
        }
    }

    // The reader is owned by a using block from the moment it exists, so it is
    // released even when logging or opening the temporary file fails.
    using (var streamReader = new StreamReader(inputFile, encoding))
    {
        WriteToLog($"File is '{streamReader.CurrentEncoding.WebName}' encoded");

        // UTF-16 in the platform's native byte order, with a byte order mark.
        var writeEncoding = new UnicodeEncoding(!BitConverter.IsLittleEndian, true);

        using (var writer = new StreamWriter(tempFile, false, writeEncoding))
        {
            writer.WriteLine("<html>");
            writer.WriteLine("<head>");
            writer.WriteLine($" <meta charset=\"{writeEncoding.WebName}\">");
            writer.WriteLine($"<title>{title}</title>");
            writer.WriteLine("<style>");
            writer.WriteLine(" pre {");
            writer.WriteLine($" white-space: { WhiteSpace };");

            if (!string.IsNullOrWhiteSpace(FontFamily))
            {
                writer.WriteLine($" font-family: { FontFamily };");
            }

            // Each property is guarded by its own value (previously all three checks
            // tested FontFamily). The value is rendered through interpolation first so
            // the check does not depend on the declared type of the member.
            var fontStyle = $"{ FontStyle }";
            if (!string.IsNullOrWhiteSpace(fontStyle))
            {
                writer.WriteLine($" font-style: { fontStyle };");
            }

            var fontSize = $"{ FontSize }";
            if (!string.IsNullOrWhiteSpace(fontSize))
            {
                writer.WriteLine($" font-size: { fontSize };");
            }

            writer.WriteLine(" }");
            writer.WriteLine("</style>");
            writer.WriteLine("</head>");
            writer.WriteLine("<body>");
            writer.WriteLine("<pre>");

            string line;
            while ((line = streamReader.ReadLine()) != null)
            {
                writer.WriteLine(HttpUtility.HtmlEncode(line));
            }

            writer.WriteLine("</pre>");
            writer.WriteLine("</body>");
            writer.WriteLine("</html>");
        }
    }

    WriteToLog($"File pre wrapped and written to temporary file '{tempFile}'");
    return tempFile;
}
/// <summary>
/// Acquires encoding related info on one read.
/// </summary>
/// <remarks>
/// The file is read once into memory. A leading byte order mark decides the encoding
/// when present; otherwise the content is treated as UTF-8 unless
/// <c>ContainsInvalidUTF8Bytes</c> reports invalid bytes, in which case Ude is asked to
/// detect the charset and <see cref="Encoding.Default"/> is used when that yields
/// nothing usable. A missing file leaves the returned info as constructed, and any
/// exception results in a freshly constructed info object.
/// </remarks>
/// <param name="file">Path of the file to inspect.</param>
/// <returns>The charset, code page, BOM details and decoded contents of the file.</returns>
public static EncodingFileInfo GetEncodingFileInfo(String file)
{
    Int32 startIndex = 0;
    EncodingFileInfo info = new EncodingFileInfo();
    try
    {
        if (File.Exists(file))
        {
            Byte[] bytes = File.ReadAllBytes(file);

            // Order matters below: longer signatures are tested before shorter ones
            // that share the same leading bytes.
            if (bytes.Length > 2 && (bytes[0] == 0xef && bytes[1] == 0xbb && bytes[2] == 0xbf))
            {
                // UTF-8 BOM
                startIndex = 3;
                info.BomLength = 3;
                info.ContainsBOM = true;
                info.Charset = Encoding.UTF8.WebName;
                info.CodePage = Encoding.UTF8.CodePage;
            }
            else if (bytes.Length > 3 && (bytes[0] == 0xff && bytes[1] == 0xfe && bytes[2] == 0x00 && bytes[3] == 0x00))
            {
                // UTF-32 (little endian) BOM; must precede the UTF-16 LE test (FF FE).
                startIndex = 4;
                info.BomLength = 4;
                info.ContainsBOM = true;
                info.Charset = Encoding.UTF32.WebName;
                info.CodePage = Encoding.UTF32.CodePage;
            }
            else if (bytes.Length > 4 && ((bytes[0] == 0x2b && bytes[1] == 0x2f && bytes[2] == 0x76) && (bytes[3] == 0x38 || bytes[3] == 0x39 || bytes[3] == 0x2b || bytes[3] == 0x2f) && bytes[4] == 0x2D))
            {
                // UTF-7 signature followed by '-' (0x2D): five bytes are skipped.
                startIndex = 5;
                info.BomLength = 5;
                info.ContainsBOM = true;
                info.Charset = Encoding.UTF7.WebName;
                info.CodePage = Encoding.UTF7.CodePage;
            }
            else if (bytes.Length > 3 && ((bytes[0] == 0x2b && bytes[1] == 0x2f && bytes[2] == 0x76) && (bytes[3] == 0x38 || bytes[3] == 0x39 || bytes[3] == 0x2b || bytes[3] == 0x2f)))
            {
                // UTF-7 signature without the trailing '-': four bytes are skipped.
                startIndex = 4;
                info.BomLength = 4;
                info.ContainsBOM = true;
                info.Charset = Encoding.UTF7.WebName;
                info.CodePage = Encoding.UTF7.CodePage;
            }
            else if (bytes.Length > 1 && (bytes[0] == 0xff && bytes[1] == 0xfe))
            {
                // UTF-16 little endian BOM
                startIndex = 2;
                info.BomLength = 2;
                info.ContainsBOM = true;
                info.Charset = Encoding.Unicode.WebName;
                info.CodePage = Encoding.Unicode.CodePage;
            }
            else if (bytes.Length > 1 && (bytes[0] == 0xfe && bytes[1] == 0xff))
            {
                // UTF-16 big endian BOM
                startIndex = 2;
                info.BomLength = 2;
                info.ContainsBOM = true;
                info.Charset = Encoding.BigEndianUnicode.WebName;
                info.CodePage = Encoding.BigEndianUnicode.CodePage;
            }
            else
            {
                if (!ContainsInvalidUTF8Bytes(bytes))
                {
                    info.Charset = Encoding.UTF8.WebName;
                    info.CodePage = Encoding.UTF8.CodePage;
                }
                else // Try detecting using Ude...
                {
                    Encoding detected = Encoding.Default;
                    Ude.CharsetDetector detector = new Ude.CharsetDetector();
                    detector.Feed(bytes, 0, bytes.Length);
                    detector.DataEnd();
                    if (detector.Charset != null)
                    {
                        try
                        {
                            detected = Encoding.GetEncoding(detector.Charset);
                        }
                        catch (ArgumentException)
                        {
                            // Ude named a charset this runtime has no Encoding for.
                            // Use the same fallback as the "nothing detected" case
                            // instead of discarding the file via the outer catch.
                            detected = Encoding.Default;
                        }
                    }
                    info.Charset = detected.WebName;
                    info.CodePage = detected.CodePage;
                }
            }

            // Decode everything after the BOM (if any); nothing to decode when the
            // file is empty or consists of the BOM only.
            Int32 contentLength = bytes.Length - startIndex;
            if (contentLength > 0)
            {
                Encoding encoding = Encoding.GetEncoding(info.CodePage);
                info.Contents = encoding.GetString(bytes, startIndex, contentLength);
            }
        }
    }
    catch (Exception)
    {
        // Best effort: on any failure hand back a blank info object.
        info = new EncodingFileInfo();
    }
    return info;
}