예제 #1
0
 /// <summary>
 /// Puts this instance back into its initial, not-yet-started state.
 /// </summary>
 private void Reset()
 {
     // Both progress flags go back to false together.
     _started = _done = false;

     // Drop the collected encoding frequencies, then reset both detector helpers.
     encodingFrequency.Clear();
     _ude.Reset();
     _singleUde.Reset();

     // No encoding name is known after a reset.
     _encodingName = null;
 }
예제 #2
0
        /// <summary>
        /// Detects the encoding of the textual data in the given slice of <paramref name="inputData"/>.
        /// </summary>
        /// <remarks>
        /// On the first call after a reset the slice is checked for textual data and for a byte order mark.
        /// Every call feeds the slice to the shared detector <c>ude</c>. While that detector has not reported
        /// a final charset, the slice is additionally analysed in chunks of at most 4 KiB; every chunk result
        /// with a confidence above 0.3 is passed to <c>IncrementFrequency</c>, and <c>EncodingName</c> is then
        /// refreshed from <c>GetCurrentEncoding</c>.
        /// </remarks>
        /// <param name="inputData">The buffer that contains the data to analyse.</param>
        /// <param name="start">The index in <paramref name="inputData"/> of the first byte to analyse.</param>
        /// <param name="count">The number of bytes to analyse, beginning at <paramref name="start"/>.</param>
        /// <returns>The value of <c>EncodingName</c> after this call.</returns>
        public string Detect(byte[] inputData, int start, int count)
        {
            if (Done)
            {
                return EncodingName;
            }

            if (!_started)
            {
                Reset();
                _started = true;
                if (!CheckForTextualData(inputData, start, count))
                {
                    // Not text: finish immediately without running any detector.
                    IsText = false;
                    Done = true;
                    return EncodingName;
                }
                HasByteOrderMark = CheckForByteOrderMark(inputData, start);
                IsText = true;
            }

            // Execute the shared charset detector on the whole slice.
            ude.Feed(inputData, start, count);
            ude.DataEnd();
            if (ude.IsDone() && !String.IsNullOrEmpty(ude.Charset))
            {
                IncrementFrequency(ude.Charset);
                Done = true;
                // NOTE(review): EncodingName is not refreshed from GetCurrentEncoding() on this path,
                // so the value returned here is whatever was stored earlier - confirm this is intended.
                return EncodingName;
            }

            // Singular buffer detection: analyse the slice chunk by chunk with a separate detector.
            const int udeFeedSize = 4 * 1024;
            var singleUde = new Ude.CharsetDetector();

            // 'count' is a length (it is passed as such to ude.Feed above), so the slice ends at
            // start + count. Using 'count' itself as the end index skipped the tail of the slice
            // whenever start was greater than zero.
            int end = start + count;

            for (int pos = start; pos < end; pos += udeFeedSize)
            {
                // The last chunk may be shorter than udeFeedSize.
                int length = Math.Min(udeFeedSize, end - pos);

                singleUde.Reset();
                singleUde.Feed(inputData, pos, length);
                singleUde.DataEnd();

                // Update the encoding frequency only for sufficiently confident results.
                if (singleUde.Confidence > 0.3 && !String.IsNullOrEmpty(singleUde.Charset))
                {
                    IncrementFrequency(singleUde.Charset);
                }
            }

            // Vote for the best encoding and store it as the current encoding name.
            EncodingName = GetCurrentEncoding();
            return EncodingName;
        }
예제 #3
0
 /// <summary>
 /// Restores this instance to its initial state so that a new detection run can begin.
 /// </summary>
 public void Reset()
 {
     // Progress and result flags.
     _started = false;
     Done = false;
     HasByteOrderMark = false;

     // Collected encoding frequencies and the detector itself.
     encodingFrequency.Clear();
     ude.Reset();

     // No encoding name is known after a reset.
     EncodingName = null;
 }
예제 #4
0
 /// <summary>
 /// Restores this instance to its initial state so that a new detection run can begin.
 /// </summary>
 public void Reset()
 {
     // Progress and result flags.
     _started = false;
     Done = false;
     HasByteOrderMark = false;

     // Per-chunk results gathered so far and the detector itself.
     singleEncodings.Clear();
     ude.Reset();

     // No encoding name is known after a reset.
     EncodingName = null;
 }
예제 #5
0
        /// <summary>
        /// Detects the encoding of the textual data in the given slice of <paramref name="inputData"/>.
        /// </summary>
        /// <remarks>
        /// On the first call after a reset the slice is checked for textual data and for a byte order mark.
        /// Every call feeds the slice to the shared detector <c>ude</c>. While that detector has not reported
        /// a final charset and fewer than 2000 entries are stored in <c>singleEncodings</c>, the slice is
        /// additionally analysed in chunks of at most 4 KiB and every chunk charset detected with a
        /// confidence above 0.3 is appended to <c>singleEncodings</c>.
        /// </remarks>
        /// <param name="inputData">The buffer that contains the data to analyse.</param>
        /// <param name="start">The index in <paramref name="inputData"/> of the first byte to analyse.</param>
        /// <param name="count">The number of bytes to analyse, beginning at <paramref name="start"/>.</param>
        /// <returns>The value of <c>EncodingName</c> after this call.</returns>
        public string Detect(byte[] inputData, int start, int count)
        {
            if (Done)
            {
                return EncodingName;
            }

            if (!_started)
            {
                Reset();
                _started = true;
                if (!CheckForTextualData(inputData, start, count))
                {
                    // Not text: finish immediately without running any detector.
                    IsText = false;
                    Done = true;
                    return EncodingName;
                }
                HasByteOrderMark = CheckForByteOrderMark(inputData, start);
                IsText = true;
            }

            // Execute the shared charset detector on the whole slice.
            ude.Feed(inputData, start, count);
            ude.DataEnd();
            if (ude.IsDone() && !String.IsNullOrEmpty(ude.Charset))
            {
                // NOTE(review): nothing is stored here; presumably EncodingName reads ude.Charset - confirm.
                Done = true;
                return EncodingName;
            }

            const int bufferSize = 4 * 1024;

            // Singular buffer detection, skipped once 2000 chunk results have been collected.
            if (singleEncodings.Count < 2000)
            {
                var u = new Ude.CharsetDetector();

                // 'count' is a length (it is passed as such to ude.Feed above), so the slice ends at
                // start + count. Using 'count' itself as the end index skipped the tail of the slice
                // whenever start was greater than zero.
                int end = start + count;

                for (int i = start; i < end; i += bufferSize)
                {
                    // The last chunk may be shorter than bufferSize.
                    int length = Math.Min(bufferSize, end - i);

                    u.Reset();
                    u.Feed(inputData, i, length);
                    u.DataEnd();

                    // Keep only sufficiently confident chunk results.
                    if (u.Confidence > 0.3 && !String.IsNullOrEmpty(u.Charset))
                    {
                        singleEncodings.Add(u.Charset);
                    }
                }
            }

            return EncodingName;
        }
예제 #6
0
        /// <summary>
        /// Runs <paramref name="charsetDetector"/> over the file and maps the detected charset name
        /// to an <see cref="Encoding"/>.
        /// </summary>
        /// <param name="filename">Path of the file to read.</param>
        /// <param name="charsetDetector">Detector to use; it is reset before the file is fed to it.</param>
        /// <param name="srcEncoding">Encoding returned when the detected charset cannot be resolved.</param>
        /// <returns>
        /// The encoding that matches the detected charset, or <paramref name="srcEncoding"/> when
        /// <see cref="Encoding.GetEncoding(string)"/> throws for the detector's result.
        /// </returns>
        private static Encoding GetTextEncoding(string filename, Ude.CharsetDetector charsetDetector, Encoding srcEncoding)
        {
            using (FileStream input = File.OpenRead(filename))
            {
                // Start from a clean detector state and analyse the whole stream.
                charsetDetector.Reset();
                charsetDetector.Feed(input);
                charsetDetector.DataEnd();

                try
                {
                    return Encoding.GetEncoding(charsetDetector.Charset);
                }
                catch (Exception error)
                {
                    // The charset name could not be turned into an Encoding: report it and fall back.
                    Console.WriteLine("Failed to obtain encoding of the file: {0} to {1}. Using source encoding instead. Corrupted characters may occur. {2}", filename, charsetDetector.Charset, error.Message);

                    return srcEncoding;
                }
            }
        }
예제 #7
0
        /// <summary>
        /// Detects the encoding of the textual data in the given slice of <paramref name="inputData"/>.
        /// </summary>
        /// <remarks>
        /// On the first call after a reset the slice is checked for textual data and for a byte order mark.
        /// Every call feeds the slice to the shared detector <c>ude</c>. While that detector has not reported
        /// a final charset and fewer than 2000 entries are stored in <c>singleEncodings</c>, the slice is
        /// additionally analysed in chunks of at most 4 KiB and every chunk charset detected with a
        /// confidence above 0.3 is appended to <c>singleEncodings</c>.
        /// </remarks>
        /// <param name="inputData">The buffer that contains the data to analyse.</param>
        /// <param name="start">The index in <paramref name="inputData"/> of the first byte to analyse.</param>
        /// <param name="count">The number of bytes to analyse, beginning at <paramref name="start"/>.</param>
        /// <returns>The value of <c>EncodingName</c> after this call.</returns>
        public string Detect(byte[] inputData, int start, int count)
        {
            if (Done)
                return EncodingName;
            if (!_started)
            {
                Reset ();
                _started = true;
                if (!CheckForTextualData (inputData, start, count))
                {
                    // Not text: finish immediately without running any detector.
                    IsText = false;
                    Done = true;
                    return EncodingName;
                }
                HasByteOrderMark = CheckForByteOrderMark (inputData, start);
                IsText = true;
            }

            // execute charset detector on the whole slice
            ude.Feed (inputData, start, count);
            ude.DataEnd ();
            if (ude.IsDone () && !String.IsNullOrEmpty (ude.Charset))
            {
                // NOTE(review): nothing is stored here; presumably EncodingName reads ude.Charset - confirm.
                Done = true;
                return EncodingName;
            }

            const int bufferSize = 4 * 1024;

            // singular buffer detection, skipped once 2000 chunk results have been collected
            if (singleEncodings.Count < 2000)
            {
                var u = new Ude.CharsetDetector ();

                // 'count' is a length (it is passed as such to ude.Feed above), so the slice ends at
                // start + count. Using 'count' itself as the end index skipped the tail of the slice
                // whenever start was greater than zero.
                int end = start + count;
                for (int i = start; i < end; i += bufferSize)
                {
                    // the last chunk may be shorter than bufferSize
                    int length = Math.Min (bufferSize, end - i);
                    u.Reset ();
                    u.Feed (inputData, i, length);
                    u.DataEnd ();
                    // keep only sufficiently confident chunk results
                    if (u.Confidence > 0.3 && !String.IsNullOrEmpty (u.Charset))
                        singleEncodings.Add (u.Charset);
                }
            }
            return EncodingName;
        }
예제 #8
0
        /// <summary>
        /// Writes the content of the current tab to the file at <c>Tab.TabOriginalPathContent</c>
        /// using the encoding identified by <c>Tab.TabEncoding</c>.
        /// </summary>
        /// <remarks>
        /// The whole operation is dispatched through <c>DispatcherHelper.ExecuteOnUIThreadAsync</c>.
        /// After the write, when the tab encoding is ASCII and the replacing request is not
        /// <c>EncodingReplacingRequest.Never</c>, the content is run through a charset detector; if the
        /// result resolves to UTF-8 a dialog offers to switch the tab's encoding.
        /// If any step throws, <c>CreateFile</c> is called and, when it returns <c>true</c>, the write is
        /// retried and awaited.
        /// </remarks>
        /// <returns>A task representing the dispatched write, including any retry.</returns>
        public async Task WriteFile()
        {
            await DispatcherHelper.ExecuteOnUIThreadAsync(async () =>
            {
                try
                {
                    StorageFile file = await StorageFile.GetFileFromPathAsync(Tab.TabOriginalPathContent);
                    if (file == null)
                    {
                        return;
                    }

                    // Empty the file first; presumably so that no stale bytes remain when the new
                    // content is shorter than the old one.
                    await FileIO.WriteTextAsync(file, string.Empty);

                    Encoding encoding = Encoding.GetEncoding(Tab.TabEncoding);

                    // Compare by code page (as the checks below do) rather than by reference, so the
                    // BOM is dropped even if GetEncoding returns a different UTF-8 instance.
                    if (encoding.CodePage == Encoding.UTF8.CodePage && !Tab.TabEncodingWithBOM)
                    {
                        encoding = new UTF8Encoding(false);
                    }

                    string content = await TabsAccessManager.GetTabContentViaIDAsync(new TabID
                    {
                        ID_Tab = Tab.ID,
                        ID_TabsList = ListTabsID
                    });

                    // Disposing the writer flushes it and closes the underlying file stream.
                    using (var fileWriter = new StreamWriter(await file.OpenStreamForWriteAsync(), encoding))
                    {
                        fileWriter.Write(content);
                    }

                    //Update DateModified (updated push with "PushUpdateTabAsync" in StorageRouter.WriteFile())
                    BasicProperties properties = await file.GetBasicPropertiesAsync();
                    Tab.TabDateModified = properties.DateModified.ToString();

                    // Only ASCII tabs whose owner has not opted out are checked for a better encoding.
                    if (encoding.CodePage != Encoding.ASCII.CodePage || Tab.TabEncodingReplacingRequest == EncodingReplacingRequest.Never)
                    {
                        return;
                    }

                    // Feed the content to the charset detector through an in-memory stream.
                    using (var stream = new MemoryStream())
                    using (var writer = new StreamWriter(stream))
                    {
                        writer.Write(content);
                        writer.Flush();
                        stream.Position = 0;

                        var cdet = new Ude.CharsetDetector();
                        cdet.Reset();
                        cdet.Feed(stream);
                        cdet.DataEnd();

                        if (cdet.Charset != null && Encoding.GetEncoding(cdet.Charset).CodePage == Encoding.UTF8.CodePage)
                        {
                            await DispatcherHelper.ExecuteOnUIThreadAsync(async () =>
                            {
                                var resources = new ResourceLoader();

                                MessageDialog dialog = new MessageDialog(resources.GetString("popup-changeencodingcontent"), string.Format(resources.GetString("popup-changeencodingtitle"), encoding.EncodingName, cdet.Charset));

                                // Accept: switch the tab to the detected encoding (without BOM).
                                dialog.Commands.Add(new UICommand
                                {
                                    Label = resources.GetString("popup-changeencodingaccept"),
                                    Invoked = async (e) =>
                                    {
                                        Tab.TabEncoding = Encoding.GetEncoding(cdet.Charset).CodePage;
                                        Tab.TabEncodingWithBOM = false;
                                        Tab.TabEncodingReplacingRequest = EncodingReplacingRequest.NotRequested;
                                        await TabsWriteManager.PushUpdateTabAsync(Tab, ListTabsID, false);
                                    }
                                });

                                // Later: remember that the user postponed the decision.
                                dialog.Commands.Add(new UICommand
                                {
                                    Label = resources.GetString("popup-changeencodinglater"),
                                    Invoked = async (e) =>
                                    {
                                        Tab.TabEncodingReplacingRequest = EncodingReplacingRequest.MaybeLater;
                                        await TabsWriteManager.PushUpdateTabAsync(Tab, ListTabsID, false);
                                    }
                                });

                                // No: remember the opt-out so this check is skipped from now on.
                                dialog.Commands.Add(new UICommand
                                {
                                    Label = resources.GetString("popup-changeencodingno"),
                                    Invoked = async (e) =>
                                    {
                                        Tab.TabEncodingReplacingRequest = EncodingReplacingRequest.Never;
                                        await TabsWriteManager.PushUpdateTabAsync(Tab, ListTabsID, false);
                                    }
                                });

                                await dialog.ShowAsync();
                            });
                        }
                    }
                }
                catch
                {
                    // Best-effort recovery: create the file and retry the write. The retry is awaited
                    // directly; ContinueWith with an async lambda produced a Task<Task> whose inner
                    // task (the retry) was never awaited, so callers could continue before it finished.
                    // NOTE(review): if the write keeps failing while CreateFile keeps succeeding this
                    // retries indefinitely, as the original did - consider bounding the retries.
                    if (await CreateFile())
                    {
                        await WriteFile();
                    }
                }
            });
        }
예제 #9
0
        /// <summary>
        /// Detects the encoding of the textual data in the given slice of <paramref name="inputData"/>.
        /// </summary>
        /// <remarks>
        /// On the first call after a reset the slice is checked for textual data and for a byte order mark.
        /// Every call feeds the slice to the shared detector <c>ude</c>. While that detector has not reported
        /// a final charset, the slice is additionally analysed in chunks of at most 4 KiB; every chunk result
        /// with a confidence above 0.3 is passed to <c>IncrementFrequency</c>, and <c>EncodingName</c> is then
        /// refreshed from <c>GetCurrentEncoding</c>.
        /// </remarks>
        /// <param name="inputData">The buffer that contains the data to analyse.</param>
        /// <param name="start">The index in <paramref name="inputData"/> of the first byte to analyse.</param>
        /// <param name="count">The number of bytes to analyse, beginning at <paramref name="start"/>.</param>
        /// <returns>The value of <c>EncodingName</c> after this call.</returns>
        public string Detect (byte[] inputData, int start, int count)
        {
            if (Done)
                return EncodingName;
            if (!_started)
            {
                Reset ();
                _started = true;
                if (!CheckForTextualData (inputData, start, count))
                {
                    // Not text: finish immediately without running any detector.
                    IsText = false;
                    Done = true;
                    return EncodingName;
                }
                HasByteOrderMark = CheckForByteOrderMark (inputData, start);
                IsText = true;
            }

            // execute charset detector on the whole slice
            ude.Feed (inputData, start, count);
            ude.DataEnd ();
            if (ude.IsDone () && !String.IsNullOrEmpty (ude.Charset))
            {
                IncrementFrequency (ude.Charset);
                Done = true;
                // NOTE(review): EncodingName is not refreshed from GetCurrentEncoding () on this path,
                // so the value returned here is whatever was stored earlier - confirm this is intended.
                return EncodingName;
            }

            // singular buffer detection
            var singleUde = new Ude.CharsetDetector ();
            const int udeFeedSize = 4 * 1024;

            // 'count' is a length (it is passed as such to ude.Feed above), so the slice ends at
            // start + count. Using 'count' itself as the end index skipped the tail of the slice
            // whenever start was greater than zero.
            int end = start + count;
            for (int pos = start; pos < end; pos += udeFeedSize)
            {
                // the last chunk may be shorter than udeFeedSize
                int length = Math.Min (udeFeedSize, end - pos);
                singleUde.Reset ();
                singleUde.Feed (inputData, pos, length);
                singleUde.DataEnd ();
                // update encoding frequency only for sufficiently confident results
                if (singleUde.Confidence > 0.3 && !String.IsNullOrEmpty (singleUde.Charset))
                    IncrementFrequency (singleUde.Charset);
            }
            // vote for best encoding and store it as the current encoding name
            EncodingName = GetCurrentEncoding ();
            return EncodingName;
        }