/// <summary>
/// Returns the detection state held by this instance to its initial values.
/// </summary>
private void Reset()
{
    // Progress flags.
    _started = false;
    _done = false;

    // Votes collected so far.
    encodingFrequency.Clear();

    // Both underlying charset detectors.
    _ude.Reset();
    _singleUde.Reset();

    // Last detected encoding name.
    _encodingName = null;
}
/// <summary>
/// Detects the encoding of textual data of the specified input data.
/// </summary>
/// <remarks>
/// The data is first fed to the shared detector. If that detector does not reach a
/// final answer, the same range is examined again in chunks of at most 4 KiB with a
/// separate detector, and every sufficiently confident chunk result is counted as a vote.
/// </remarks>
/// <param name="inputData">The buffer that holds the data to examine.</param>
/// <param name="start">The offset in <paramref name="inputData"/> of the first byte to examine.</param>
/// <param name="count">The number of bytes to examine, beginning at <paramref name="start"/>.</param>
/// <returns>Detected encoding name</returns>
public string Detect(byte[] inputData, int start, int count)
{
    if (Done)
    {
        return EncodingName;
    }

    if (!_started)
    {
        Reset();
        _started = true;

        if (!CheckForTextualData(inputData, start, count))
        {
            IsText = false;
            Done = true;
            return EncodingName;
        }

        HasByteOrderMark = CheckForByteOrderMark(inputData, start);
        IsText = true;
    }

    // execute charset detector
    ude.Feed(inputData, start, count);
    ude.DataEnd();

    if (ude.IsDone() && !String.IsNullOrEmpty(ude.Charset))
    {
        IncrementFrequency(ude.Charset);
        Done = true;

        // NOTE(review): EncodingName is not refreshed from the vote on this path - confirm that is intended.
        return EncodingName;
    }

    // singular buffer detection
    var singleUde = new Ude.CharsetDetector();
    const int udeFeedSize = 4 * 1024;

    // 'count' is a length (it is passed to Feed as one above), so the scanned range
    // ends at start + count and not at index 'count'.
    int end = start + count;

    for (var pos = start; pos < end; pos += udeFeedSize)
    {
        int remaining = end - pos;
        int length = remaining < udeFeedSize ? remaining : udeFeedSize;

        singleUde.Reset();
        singleUde.Feed(inputData, pos, length);
        singleUde.DataEnd();

        // update encoding frequency
        if (singleUde.Confidence > 0.3 && !String.IsNullOrEmpty(singleUde.Charset))
        {
            IncrementFrequency(singleUde.Charset);
        }
    }

    // vote for best encoding and update current encoding name
    EncodingName = GetCurrentEncoding();

    return EncodingName;
}
/// <summary>
/// Resets this instance: clears the progress flags, the byte-order-mark flag,
/// the collected encoding frequencies, the charset detector and the encoding name.
/// </summary>
public void Reset()
{
    // Progress and result flags.
    _started = false;
    Done = false;
    HasByteOrderMark = false;

    // Collected votes and detector state.
    encodingFrequency.Clear();
    ude.Reset();

    // No encoding is known after a reset.
    EncodingName = null;
}
/// <summary>
/// Resets this instance: clears the progress flags, the byte-order-mark flag,
/// the collected per-chunk encodings, the charset detector and the encoding name.
/// </summary>
public void Reset()
{
    // Progress and result flags.
    _started = false;
    Done = false;
    HasByteOrderMark = false;

    // Collected per-chunk results and detector state.
    singleEncodings.Clear();
    ude.Reset();

    // No encoding is known after a reset.
    EncodingName = null;
}
/// <summary>
/// Detects the encoding of textual data of the specified input data.
/// </summary>
/// <remarks>
/// The data is first fed to the shared detector. If that detector does not reach a
/// final answer and fewer than 2000 per-chunk results have been collected, the same
/// range is examined again in chunks of at most 4 KiB with a separate detector and
/// every sufficiently confident chunk result is appended to the collected results.
/// </remarks>
/// <param name="inputData">The buffer that holds the data to examine.</param>
/// <param name="start">The offset in <paramref name="inputData"/> of the first byte to examine.</param>
/// <param name="count">The number of bytes to examine, beginning at <paramref name="start"/>.</param>
/// <returns>Detected encoding name</returns>
public string Detect(byte[] inputData, int start, int count)
{
    if (Done)
    {
        return EncodingName;
    }

    if (!_started)
    {
        Reset();
        _started = true;

        if (!CheckForTextualData(inputData, start, count))
        {
            IsText = false;
            Done = true;
            return EncodingName;
        }

        HasByteOrderMark = CheckForByteOrderMark(inputData, start);
        IsText = true;
    }

    // execute charset detector
    ude.Feed(inputData, start, count);
    ude.DataEnd();

    if (ude.IsDone() && !String.IsNullOrEmpty(ude.Charset))
    {
        Done = true;
        return EncodingName;
    }

    const int bufferSize = 4 * 1024;

    // singular buffer detection
    if (singleEncodings.Count < 2000)
    {
        var u = new Ude.CharsetDetector();

        // 'count' is a length (it is passed to Feed as one above), so the scanned range
        // ends at start + count and not at index 'count'.
        int end = start + count;

        for (var i = start; i < end; i += bufferSize)
        {
            int remaining = end - i;
            int length = remaining < bufferSize ? remaining : bufferSize;

            u.Reset();
            u.Feed(inputData, i, length);
            u.DataEnd();

            if (u.Confidence > 0.3 && !String.IsNullOrEmpty(u.Charset))
            {
                singleEncodings.Add(u.Charset);
            }
        }
    }

    return EncodingName;
}
/// <summary>
/// Feeds the content of a file to the given charset detector and resolves the
/// detected charset name to an <see cref="Encoding"/>.
/// </summary>
/// <param name="filename">Path of the file to read.</param>
/// <param name="charsetDetector">Detector to use; it is reset before the file is fed to it.</param>
/// <param name="srcEncoding">Encoding returned when the detected charset cannot be resolved.</param>
/// <returns>
/// The encoding for the detected charset, or <paramref name="srcEncoding"/> when
/// resolving the charset name throws.
/// </returns>
private static Encoding GetTextEncoding(string filename, Ude.CharsetDetector charsetDetector, Encoding srcEncoding)
{
    using (FileStream input = File.OpenRead(filename))
    {
        charsetDetector.Reset();
        charsetDetector.Feed(input);
        charsetDetector.DataEnd();

        try
        {
            return Encoding.GetEncoding(charsetDetector.Charset);
        }
        catch (Exception ex)
        {
            // Resolution failed: report it and fall back to the caller's encoding.
            Console.WriteLine(
                "Failed to obtain encoding of the file: {0} to {1}. Using source encoding instead. Corrupted characters may occur. {2}",
                filename,
                charsetDetector.Charset,
                ex.Message);

            return srcEncoding;
        }
    }
}
/// <summary>
/// Detects the encoding of textual data of the specified input data.
/// </summary>
/// <remarks>
/// The data is first fed to the shared detector. If that detector does not reach a
/// final answer and fewer than 2000 per-chunk results have been collected, the same
/// range is examined again in chunks of at most 4 KiB with a separate detector and
/// every sufficiently confident chunk result is appended to the collected results.
/// </remarks>
/// <param name="inputData">The buffer that holds the data to examine.</param>
/// <param name="start">The offset in <paramref name="inputData"/> of the first byte to examine.</param>
/// <param name="count">The number of bytes to examine, beginning at <paramref name="start"/>.</param>
/// <returns>Detected encoding name</returns>
public string Detect (byte[] inputData, int start, int count)
{
    if (Done)
        return EncodingName;

    if (!_started) {
        Reset ();
        _started = true;

        if (!CheckForTextualData (inputData, start, count)) {
            IsText = false;
            Done = true;
            return EncodingName;
        }

        HasByteOrderMark = CheckForByteOrderMark (inputData, start);
        IsText = true;
    }

    // execute charset detector
    ude.Feed (inputData, start, count);
    ude.DataEnd ();

    if (ude.IsDone () && !String.IsNullOrEmpty (ude.Charset)) {
        Done = true;
        return EncodingName;
    }

    const int bufferSize = 4 * 1024;

    // singular buffer detection
    if (singleEncodings.Count < 2000) {
        var u = new Ude.CharsetDetector ();

        // 'count' is a length (it is passed to Feed as one above), so the scanned range
        // ends at start + count and not at index 'count'.
        int end = start + count;

        for (var i = start; i < end; i += bufferSize) {
            int remaining = end - i;
            int length = remaining < bufferSize ? remaining : bufferSize;

            u.Reset ();
            u.Feed (inputData, i, length);
            u.DataEnd ();

            if (u.Confidence > 0.3 && !String.IsNullOrEmpty (u.Charset))
                singleEncodings.Add (u.Charset);
        }
    }

    return EncodingName;
}
/// <summary>
/// Writes the content of the tab to the file at <c>Tab.TabOriginalPathContent</c>
/// using the encoding configured on the tab, then refreshes the tab's modification date.
/// </summary>
/// <remarks>
/// When the file was written with the ASCII code page and the tab has not opted out
/// (<c>EncodingReplacingRequest.Never</c>), the content is run through a charset detector;
/// if it is recognised as UTF-8, a dialog offers to switch the tab's encoding.
/// If anything in the write path throws, the method calls <c>CreateFile()</c> and,
/// when that returns <c>true</c>, runs the whole write again. That retry is awaited,
/// so the returned task completes only after the retry has finished.
/// </remarks>
/// <returns>A task that completes when the write (and any retry) has finished.</returns>
public async Task WriteFile()
{
    await DispatcherHelper.ExecuteOnUIThreadAsync(async () =>
    {
        try
        {
            StorageFile file = await StorageFile.GetFileFromPathAsync(Tab.TabOriginalPathContent);

            if (file != null)
            {
                // Empty the file before writing - presumably so that no bytes of a longer,
                // older version remain after the new content is written.
                await FileIO.WriteTextAsync(file, string.Empty);

                Encoding TempEncoding = Encoding.GetEncoding(Tab.TabEncoding);

                // Compare code pages rather than object references so that any UTF-8
                // encoding instance is recognised; a BOM-less encoder is used on request.
                if (TempEncoding.CodePage == Encoding.UTF8.CodePage && !Tab.TabEncodingWithBOM)
                {
                    TempEncoding = new UTF8Encoding(false);
                }

                string Content = await TabsAccessManager.GetTabContentViaIDAsync(new TabID { ID_Tab = Tab.ID, ID_TabsList = ListTabsID });

                // The using statement disposes (and thereby closes) the writer.
                using (var fileWriter = new StreamWriter(await file.OpenStreamForWriteAsync(), TempEncoding))
                {
                    fileWriter.Write(Content);
                    fileWriter.Flush();
                }

                //Update DateModified (updated push with "PushUpdateTabAsync" in StorageRouter.WriteFile())
                BasicProperties properties = await file.GetBasicPropertiesAsync();
                Tab.TabDateModified = properties.DateModified.ToString();

                if (TempEncoding.CodePage == Encoding.ASCII.CodePage && Tab.TabEncodingReplacingRequest != EncodingReplacingRequest.Never)
                {
                    // Serialise the content into memory and let the detector inspect it.
                    using (var stream = new MemoryStream())
                    using (var writer = new StreamWriter(stream))
                    {
                        writer.Write(Content);
                        writer.Flush();
                        stream.Position = 0;

                        var cdet = new Ude.CharsetDetector();
                        cdet.Reset();
                        cdet.Feed(stream);
                        cdet.DataEnd();

                        if (cdet.Charset != null && Encoding.GetEncoding(cdet.Charset).CodePage == Encoding.UTF8.CodePage)
                        {
                            await DispatcherHelper.ExecuteOnUIThreadAsync(async () =>
                            {
                                var resources = new ResourceLoader();

                                MessageDialog Dialog = new MessageDialog(
                                    resources.GetString("popup-changeencodingcontent"),
                                    string.Format(resources.GetString("popup-changeencodingtitle"), TempEncoding.EncodingName, cdet.Charset));

                                // Accept: switch the tab to the detected encoding (without BOM).
                                Dialog.Commands.Add(new UICommand
                                {
                                    Label = resources.GetString("popup-changeencodingaccept"),
                                    Invoked = async (e) =>
                                    {
                                        Tab.TabEncoding = Encoding.GetEncoding(cdet.Charset).CodePage;
                                        Tab.TabEncodingWithBOM = false;
                                        Tab.TabEncodingReplacingRequest = EncodingReplacingRequest.NotRequested;
                                        await TabsWriteManager.PushUpdateTabAsync(Tab, ListTabsID, false);
                                    }
                                });

                                // Later: remember that the user postponed the decision.
                                Dialog.Commands.Add(new UICommand
                                {
                                    Label = resources.GetString("popup-changeencodinglater"),
                                    Invoked = async (e) =>
                                    {
                                        Tab.TabEncodingReplacingRequest = EncodingReplacingRequest.MaybeLater;
                                        await TabsWriteManager.PushUpdateTabAsync(Tab, ListTabsID, false);
                                    }
                                });

                                // No: never ask again for this tab.
                                Dialog.Commands.Add(new UICommand
                                {
                                    Label = resources.GetString("popup-changeencodingno"),
                                    Invoked = async (e) =>
                                    {
                                        Tab.TabEncodingReplacingRequest = EncodingReplacingRequest.Never;
                                        await TabsWriteManager.PushUpdateTabAsync(Tab, ListTabsID, false);
                                    }
                                });

                                await Dialog.ShowAsync();
                            });
                        }
                    }
                }
            }
        }
        catch
        {
            // Any failure above triggers one create-and-retry cycle. The retry is awaited
            // directly: ContinueWith with an async lambda yields a Task<Task>, which would
            // leave the inner write un-awaited and its failures unobserved.
            // NOTE(review): if CreateFile() keeps returning true while the write keeps
            // failing, this recursion does not terminate - confirm CreateFile's contract.
            if (await CreateFile())
            {
                await WriteFile();
            }
        }
    });
}
/// <summary>
/// Detects the encoding of textual data of the specified input data.
/// </summary>
/// <remarks>
/// The data is first fed to the shared detector. If that detector does not reach a
/// final answer, the same range is examined again in chunks of at most 4 KiB with a
/// separate detector, and every sufficiently confident chunk result is counted as a vote.
/// </remarks>
/// <param name="inputData">The buffer that holds the data to examine.</param>
/// <param name="start">The offset in <paramref name="inputData"/> of the first byte to examine.</param>
/// <param name="count">The number of bytes to examine, beginning at <paramref name="start"/>.</param>
/// <returns>Detected encoding name</returns>
public string Detect (byte[] inputData, int start, int count)
{
    if (Done)
        return EncodingName;

    if (!_started) {
        Reset ();
        _started = true;

        if (!CheckForTextualData (inputData, start, count)) {
            IsText = false;
            Done = true;
            return EncodingName;
        }

        HasByteOrderMark = CheckForByteOrderMark (inputData, start);
        IsText = true;
    }

    // execute charset detector
    ude.Feed (inputData, start, count);
    ude.DataEnd ();

    if (ude.IsDone () && !String.IsNullOrEmpty (ude.Charset)) {
        IncrementFrequency (ude.Charset);
        Done = true;

        // NOTE(review): EncodingName is not refreshed from the vote on this path - confirm that is intended.
        return EncodingName;
    }

    // singular buffer detection
    var singleUde = new Ude.CharsetDetector ();
    const int udeFeedSize = 4 * 1024;

    // 'count' is a length (it is passed to Feed as one above), so the scanned range
    // ends at start + count and not at index 'count'.
    int end = start + count;

    for (var pos = start; pos < end; pos += udeFeedSize) {
        int remaining = end - pos;
        int length = remaining < udeFeedSize ? remaining : udeFeedSize;

        singleUde.Reset ();
        singleUde.Feed (inputData, pos, length);
        singleUde.DataEnd ();

        // update encoding frequency
        if (singleUde.Confidence > 0.3 && !String.IsNullOrEmpty (singleUde.Charset))
            IncrementFrequency (singleUde.Charset);
    }

    // vote for best encoding and update current encoding name
    EncodingName = GetCurrentEncoding ();

    return EncodingName;
}