// the reason to create this method in the API is that certain encodings are not
        // grep-able under certain tools that we use heavily, like MINGW32's grep in this case
        // (not sure if more tools are affected)
        /// <summary>
        /// Reports whether the charset detected for <paramref name="file"/> is considered
        /// grep-able; only a detected "UTF-16LE" is treated as not grep-able.
        /// </summary>
        /// <param name="file">File whose content is fed to the Ude charset detector.</param>
        /// <param name="charSet">Receives the detected charset name, or null when nothing was detected.</param>
        /// <returns>false when the detected charset is "UTF-16LE"; otherwise true.</returns>
        public static bool IsEncodingGrepable(FileInfo file, out string charSet)
        {
            using (FileStream input = File.OpenRead(file.FullName))
            {
                var detector = new Ude.CharsetDetector();
                detector.Feed(input);
                detector.DataEnd();

                charSet = detector.Charset;
            }

            // A null charset also compares unequal, so undetected files count as grep-able.
            return charSet != "UTF-16LE";
        }
Example #2
0
        /// <summary>
        /// Runs charset detection over <paramref name="buffer"/> and, when a charset is
        /// reported, decodes the bytes to a string.
        /// </summary>
        /// <remarks>
        /// The detected charset only gates the result: decoding itself always uses UTF-8,
        /// except when the detector reports "ASCII" and the decoded text contains NUL
        /// characters, in which case the bytes are decoded again as UTF-16.
        /// </remarks>
        /// <param name="buffer">text data</param>
        /// <returns>decoded string from buffer, or null if no charset was detected or decoding threw</returns>
        public static string Decode(byte[] buffer)
        {
            var detector = new Ude.CharsetDetector();
            using (var source = new MemoryStream(buffer))
            {
                detector.Feed(source);
                detector.DataEnd();
            }

            string detected = detector.Charset;
            if (detected == null)
            {
                return null;
            }

            try
            {
                string text = Encoding.UTF8.GetString(buffer);

                // NUL characters in supposedly ASCII text suggest the detector mistook UTF-16 for ASCII.
                bool looksLikeUtf16 = detected == "ASCII" && text.Contains('\0');
                return looksLikeUtf16
                    ? Encoding.GetEncoding("utf-16").GetString(buffer)
                    : text;
            }
            catch (Exception)
            {
                return null;
            }
        }
Example #3
0
        /// <summary>
        /// Collects encoding metadata for a file: whether it has a byte order mark, the
        /// character set guessed by the Ude detector (with its confidence), and the
        /// encoding a <see cref="StreamReader"/> settles on for the file.
        /// </summary>
        /// <param name="sourceFile">Path of the file to inspect.</param>
        /// <returns>A TextEncodingMetadata object</returns>
        public static TextEncodingMetadata GetFileEncoding(string sourceFile)
        {
            var metaData = new TextEncodingMetadata();

            EncodingMetaInfo metaInfo = DetectBom(sourceFile);

            metaData.HasBom = metaInfo.HasBom;

            using (FileStream fs = File.OpenRead(sourceFile))
            {
                var cdet = new Ude.CharsetDetector();
                cdet.Feed(fs);
                cdet.DataEnd();

                if (cdet.Charset != null)
                {
                    metaData.CharacterSet        = cdet.Charset;
                    metaData.DetectionConfidence = cdet.Confidence;
                }
                else
                {
                    // CharacterSet and DetectionConfidence keep their initial values.
                    Console.WriteLine("Detection failed.");
                }
            }

            using (var sr = new StreamReader(sourceFile))
            {
                // StreamReader only performs its byte-order-mark detection on the first
                // read. Without this Peek, CurrentEncoding would always be the
                // constructor default regardless of the file's content.
                sr.Peek();
                metaData.FileEncoding = sr.CurrentEncoding;
            }

            return(metaData);
        }
Example #4
0
        /// <summary>
        /// Rewrites a file in place as UTF-8, decoding it with the charset the Ude detector
        /// reports. The file is left untouched when no charset is detected.
        /// </summary>
        /// <param name="fileName">Path of the file to convert.</param>
        private void ConvertToUtf8(string fileName)
        {
            string detectedCharset;
            using (FileStream input = File.OpenRead(fileName))
            {
                var detector = new Ude.CharsetDetector();
                detector.Feed(input);
                detector.DataEnd();
                detectedCharset = detector.Charset;
            }

            if (detectedCharset == null)
            {
                return;
            }

            string content;
            using (var reader = new StreamReader(fileName, Encoding.GetEncoding(detectedCharset)))
            {
                content = reader.ReadToEnd();
            }

            // The writer is opened in overwrite mode even when there is no text to write back.
            using (var writer = new StreamWriter(fileName, false, Encoding.UTF8))
            {
                if (content.Length > 0)
                {
                    writer.Write(content);
                }
            }
        }
Example #5
0
        /// <summary>
        /// Guesses the encoding used for the entry names of a zip archive.
        /// </summary>
        /// <remarks>
        /// Entries are visited in order. The first entry flagged as Unicode text yields UTF-8
        /// immediately. Otherwise each entry name is converted to bytes with the extended
        /// ASCII code page and fed to one shared detector; a guess is accepted only when
        /// confidence is at least 0.9 and at least min(entry count, 50) entries have been seen.
        /// </remarks>
        /// <param name="zipFile">Archive whose entry names are examined.</param>
        /// <returns>The guessed encoding, or UTF-8 when no guess is accepted.</returns>
        public static Encoding DetectFileEncoding(ZipFile zipFile)
        {
            var  detector = new Ude.CharsetDetector();
            long seen     = 0;

            foreach (var entry in zipFile.OfType<ZipEntry>())
            {
                seen++;
                if (entry.IsUnicodeText)
                {
                    return Encoding.UTF8;
                }

                // NOTE(review): IgnoreExceptions presumably swallows exceptions thrown by the
                // callback and yields null instead - confirm against its definition.
                var guess = Common.Extensions.IgnoreExceptions(() =>
                {
                    byte[] nameBytes = Encoding.GetEncoding(Constants.Filesystem.ExtendedAsciiCodePage).GetBytes(entry.Name);
                    detector.Feed(nameBytes, 0, nameBytes.Length);
                    detector.DataEnd();

                    bool confident = detector.Charset != null && detector.Confidence >= 0.9;
                    if (confident && seen >= Math.Min(zipFile.Count, 50))
                    {
                        return Encoding.GetEncoding(detector.Charset);
                    }
                    return null;
                });

                if (guess != null)
                {
                    return guess;
                }
            }

            return Encoding.UTF8;
        }
Example #6
0
        /// <summary>
        /// Attempts to repair a string that was decoded with the wrong encoding: the string is
        /// turned back into bytes with <paramref name="from"/>, the charset of those bytes is
        /// auto-detected, and the bytes are decoded again with the detected charset.
        /// </summary>
        /// <param name="str">Data for reencoding</param>
        /// <param name="from">Known Encoding for current string</param>
        /// <returns>
        /// The re-decoded string; or <paramref name="str"/> unchanged when it is null or empty,
        /// when detection fails, or when the detection confidence is below 0.92.
        /// </returns>
        protected virtual string reEncodeString(string str, Encoding from)
        {
            if (String.IsNullOrEmpty(str))
            {
                return str;
            }

            byte[] raw = from.GetBytes(str);

            var detector = new Ude.CharsetDetector();
            detector.Feed(raw, 0, raw.Length);
            detector.DataEnd();

            string charset = detector.Charset;
            if (charset == null)
            {
                Log.Debug("reEncodeString: Problem with detection... use the original");
                return str;
            }

            var confidence = detector.Confidence;
            Log.Debug("reEncodeString: charset '{0}' confidence: '{1}'", charset, confidence);

            // Low-confidence guesses are more likely to corrupt the text than to fix it.
            if (confidence < 0.92f)
            {
                Log.Debug("reEncodeString: Confidence < 0.92 /use the original");
                return str;
            }

            Encoding to = Encoding.GetEncoding(charset);

            Log.Debug("reEncodeString: '{0}' -> '{1}'", from.EncodingName, to.EncodingName);
            Log.Trace("reEncodeString: original - '{0}'", str);
            return to.GetString(raw);
        }
Example #7
0
        /// <summary>
        /// Loads the tab's original file into the tab content, decoding it with the charset
        /// detected by Ude, or with UTF-8 when detection finds nothing.
        /// </summary>
        /// <param name="ReplaceEncoding">
        /// When true, the tab's stored code page and BOM flag are overwritten with the values
        /// derived from the detected encoding and the tab update is pushed.
        /// </param>
        /// <returns>
        /// true on success; false when any exception occurred, in which case the error message
        /// is shown in a dialog and the tab is deleted.
        /// </returns>
        public async Task <bool> ReadFile(bool ReplaceEncoding)
        {
            try
            {
                StorageFile file = AsyncHelpers.RunSync(() => StorageFile.GetFileFromPathAsync(Tab.TabOriginalPathContent).AsTask());
                string      encode_type = "";

                // Charset detection reads the whole file, so it is pushed off the calling thread.
                await Task.Run(() =>
                {
                    using (FileStream fs = File.OpenRead(Tab.TabOriginalPathContent))
                    {
                        var cdet = new Ude.CharsetDetector();
                        cdet.Feed(fs);
                        cdet.DataEnd();
                        if (cdet.Charset != null)
                        {
                            encode_type = cdet.Charset;
                        }
                    }
                });

                // The fallback must be applied before the first Encoding.GetEncoding call:
                // an empty name makes GetEncoding throw, which would send every failed
                // detection to the error path below instead of reading the file as UTF-8.
                if (encode_type == "")
                {
                    encode_type = "utf-8";
                }

                // Resolve the encoding once; the BOM flag is cleared for UTF-8 (including
                // the fallback) and set for every other code page.
                Encoding encoding   = Encoding.GetEncoding(encode_type);
                bool     encode_bom = encoding.CodePage != Encoding.UTF8.CodePage;

                using (var st = new StreamReader(await file.OpenStreamForReadAsync(), encoding))
                {
                    await TabsWriteManager.PushTabContentViaIDAsync(new TabID { ID_Tab = Tab.ID, ID_TabsList = ListTabsID }, st.ReadToEnd(), true);

                    if (ReplaceEncoding)
                    {
                        Tab.TabEncoding        = encoding.CodePage;
                        Tab.TabEncodingWithBOM = encode_bom;
                        await TabsWriteManager.PushUpdateTabAsync(Tab, ListTabsID, true);
                    }
                }

                return(true);
            }
            catch (Exception e)
            {
                await DispatcherHelper.ExecuteOnUIThreadAsync(async() =>
                {
                    await new MessageDialog(e.Message, new ResourceLoader().GetString("popup-errorreadingfile")).ShowAsync();
                    await TabsWriteManager.DeleteTabAsync(new TabID {
                        ID_Tab = Tab.ID, ID_TabsList = ListTabsID
                    });
                });

                return(false);
            }
        }
        /// <summary>
        /// Reads the tab's content file and returns its text, decoded with the charset
        /// detected by Ude, or with UTF-8 when detection finds nothing.
        /// </summary>
        /// <returns>The full text of the file.</returns>
        public async Task <string> ReadFileAndGetContent()
        {
            StorageFile file = AsyncHelpers.RunSync(() => StorageFile.GetFileFromPathAsync(Tab.PathContent).AsTask());
            string      encode_type = "";

            // Charset detection reads the whole file, so it is pushed off the calling thread.
            await Task.Run(() =>
            {
                using (FileStream fs = File.OpenRead(Tab.PathContent))
                {
                    var cdet = new Ude.CharsetDetector();
                    cdet.Feed(fs);
                    cdet.DataEnd();
                    if (cdet.Charset != null)
                    {
                        encode_type = cdet.Charset;
                    }
                }
            });

            // An empty name makes Encoding.GetEncoding throw, so fall back to UTF-8 when
            // the detector reported nothing.
            if (encode_type == "")
            {
                encode_type = "utf-8";
            }

            using (var st = new StreamReader(await file.OpenStreamForReadAsync(), Encoding.GetEncoding(encode_type)))
            {
                return(st.ReadToEnd());
            }
        }
        /// <summary>
        /// Determines the name of the character encoding of a subtitle file.
        /// </summary>
        /// <remarks>
        /// Resolution order: a byte order mark at the start of the file; then the ANSI code
        /// page of the first neutral culture whose two-letter ISO code matches
        /// <paramref name="subtitleLanguage"/>; then Ude content detection (accepted at a
        /// confidence of 0.1 or higher); finally the system default encoding.
        /// </remarks>
        /// <param name="subtitleSource">Path of the subtitle file.</param>
        /// <param name="subtitleLanguage">Two-letter ISO language code; null or empty skips the language lookup.</param>
        /// <returns>An upper-case encoding name, or null when <paramref name="subtitleSource"/> is null or empty.</returns>
        protected string GetSubtitleEncoding(string subtitleSource, string subtitleLanguage)
        {
            if (string.IsNullOrEmpty(subtitleSource))
            {
                return(null);
            }

            byte[] buffer = File.ReadAllBytes(subtitleSource);
            int    length = buffer.Length;

            //Use byte order mark if any. Each test checks the length first so that files
            //shorter than the mark being probed (including empty files) do not index past
            //the end of the buffer. The UTF-32 little-endian mark is tested before the
            //UTF-16 little-endian mark because it begins with the same two bytes.
            if (length >= 4 && buffer[0] == 0x00 && buffer[1] == 0x00 && buffer[2] == 0XFE && buffer[3] == 0XFF)
            {
                return("UTF-32");
            }
            else if (length >= 4 && buffer[0] == 0XFF && buffer[1] == 0XFE && buffer[2] == 0x00 && buffer[3] == 0x00)
            {
                return("UTF-32");
            }
            else if (length >= 2 && buffer[0] == 0XFE && buffer[1] == 0XFF)
            {
                return("UNICODEBIG");
            }
            else if (length >= 2 && buffer[0] == 0XFF && buffer[1] == 0XFE)
            {
                return("UNICODELITTLE");
            }
            else if (length >= 3 && buffer[0] == 0XEF && buffer[1] == 0XBB && buffer[2] == 0XBF)
            {
                return("UTF-8");
            }
            else if (length >= 3 && buffer[0] == 0X2B && buffer[1] == 0X2F && buffer[2] == 0x76)
            {
                return("UTF-7");
            }

            //Detect encoding from language
            if (string.IsNullOrEmpty(subtitleLanguage) == false)
            {
                CultureInfo[] cultures = CultureInfo.GetCultures(CultureTypes.NeutralCultures);
                foreach (CultureInfo culture in cultures)
                {
                    // Case-insensitive match without allocating upper-cased copies per culture.
                    if (string.Equals(culture.TwoLetterISOLanguageName, subtitleLanguage, StringComparison.OrdinalIgnoreCase))
                    {
                        return(Encoding.GetEncoding(culture.TextInfo.ANSICodePage).BodyName.ToUpperInvariant());
                    }
                }
            }

            //Detect encoding from file
            Ude.CharsetDetector cdet = new Ude.CharsetDetector();
            cdet.Feed(buffer, 0, buffer.Length);
            cdet.DataEnd();
            if (cdet.Charset != null && cdet.Confidence >= 0.1)
            {
                return(Encoding.GetEncoding(cdet.Charset).BodyName.ToUpperInvariant());
            }

            //Use windows encoding
            return(Encoding.Default.BodyName.ToUpperInvariant());
        }
Example #10
0
        /// <summary>
        /// Reads a file as text using the charset detected by Ude and returns the text
        /// lower-cased.
        /// </summary>
        /// <param name="fi">File to read.</param>
        /// <returns>The file's text after ToLower().</returns>
        public override string Recovery(FileInfo fi)
        {
            using (FileStream fs = File.OpenRead(fi.FullName))
            {
                Ude.CharsetDetector cdet = new Ude.CharsetDetector();
                cdet.Feed(fs);
                cdet.DataEnd();
                if (cdet.Charset != null)
                {
                    _encoding = cdet.Charset;
                }
                else
                {
                    // _encoding is left untouched in this case.
                    Console.WriteLine("Detection failed.");
                }
            }

            // Path.Combine picks the platform's separator instead of a hard-coded backslash
            // and does not double it when the directory already ends with one.
            string path = Path.Combine(fi.DirectoryName, fi.Name);

            Console.WriteLine(path);

            // When detection failed and no charset name is stored, use the system default
            // rather than passing null to Encoding.GetEncoding (which throws).
            Encoding encoding = _encoding != null ? Encoding.GetEncoding(_encoding) : Encoding.Default;

            // using guarantees the reader is closed even when ReadToEnd throws.
            using (StreamReader reader = new StreamReader(path, encoding))
            {
                return reader.ReadToEnd().ToLower();
            }
        }
Example #11
0
        /// <summary>
        /// Determines the charset of raw email bytes: first via EmailEncoding.ScanCharSet and,
        /// if that yields nothing, via Ude content detection.
        /// </summary>
        /// <param name="databytes">Raw bytes to inspect.</param>
        /// <returns>
        /// The scan result when there is one; otherwise a new result carrying the detected
        /// charset; or null when the detector reports no charset or a blank one.
        /// </returns>
        public static EmailEncodingResult GetEmailEncoding(byte[] databytes)
        {
            var scanned = EncodingHelper.EmailEncoding.ScanCharSet(databytes);
            if (scanned != null)
            {
                return scanned;
            }

            var udeDetector = new Ude.CharsetDetector();
            udeDetector.Feed(databytes, 0, databytes.Length);
            udeDetector.DataEnd();

            string detected = udeDetector.Charset;
            if (string.IsNullOrWhiteSpace(detected))
            {
                return null;
            }

            return new EmailEncodingResult { Charset = detected };
        }
Example #12
0
 /// <summary>
 /// Chooses between UTF-8 and the system default encoding for a file, based on the
 /// charset Ude detects in it.
 /// </summary>
 /// <param name="path">Path of the file to inspect.</param>
 /// <returns>
 /// Encoding.UTF8 when the detected charset is "UTF-8"; Encoding.Default in every other
 /// case, including when reading or detection throws.
 /// </returns>
 private Encoding GetEncoding(string path)
 {
     try
     {
         using (FileStream input = File.OpenRead(path))
         {
             var detector = new Ude.CharsetDetector();
             detector.Feed(input);
             detector.DataEnd();

             // "ASCII", any other charset and a failed detection all map to the default.
             return detector.Charset == "UTF-8" ? Encoding.UTF8 : Encoding.Default;
         }
     }
     catch
     {
         return Encoding.Default;
     }
 }
Example #13
0
        /// <summary>
        /// Detects the text encoding of a stream's content using Ude.
        /// </summary>
        /// <param name="stream">Stream fed to the detector; this method does not reposition it afterwards.</param>
        /// <returns>The encoding for the detected charset, or null when no charset was detected.</returns>
        public static Encoding GetEncoding(Stream stream)
        {
            var detector = new Ude.CharsetDetector();
            detector.Feed(stream);
            detector.DataEnd();

            string charset = detector.Charset;
            if (charset == null)
            {
                return null;
            }

            return Encoding.GetEncoding(charset);
        }
Example #14
0
        /// <summary>
        /// Detects the text encoding of a slice of a byte array using Ude.
        /// </summary>
        /// <param name="bytes">Buffer holding the data.</param>
        /// <param name="offset">Offset passed to the detector as the start of the slice.</param>
        /// <param name="length">Length passed to the detector for the slice.</param>
        /// <returns>The encoding for the detected charset, or null when no charset was detected.</returns>
        public static Encoding GetEncoding(byte[] bytes, int offset, int length)
        {
            var detector = new Ude.CharsetDetector();
            detector.Feed(bytes, offset, length);
            detector.DataEnd();

            string charset = detector.Charset;
            if (charset == null)
            {
                return null;
            }

            return Encoding.GetEncoding(charset);
        }
Example #15
0
        /// <summary>
        /// Detects the encoding of textual data of the specified input data.
        /// </summary>
        /// <remarks>
        /// Once a result is final (Done), later calls return the stored name. The first call
        /// also checks whether the data is textual and whether it starts with a byte order
        /// mark. If the shared detector does not reach a final answer, the range is examined
        /// again in independent 4 KiB chunks; each chunk result with a confidence above 0.3
        /// is counted, and the stored name is updated from those counts.
        /// </remarks>
        /// <param name="inputData">The input data.</param>
        /// <param name="start">Index of the first byte to examine.</param>
        /// <param name="count">Number of bytes to examine, starting at <paramref name="start"/>.</param>
        /// <returns>Detected encoding name</returns>
        public string Detect(byte[] inputData, int start, int count)
        {
            if (Done)
            {
                return(EncodingName);
            }
            if (!_started)
            {
                Reset();
                _started = true;
                if (!CheckForTextualData(inputData, start, count))
                {
                    IsText = false;
                    Done   = true;
                    return(EncodingName);
                }
                HasByteOrderMark = CheckForByteOrderMark(inputData, start);
                IsText           = true;
            }

            // execute charset detector
            ude.Feed(inputData, start, count);
            ude.DataEnd();
            if (ude.IsDone() && !String.IsNullOrEmpty(ude.Charset))
            {
                IncrementFrequency(ude.Charset);
                Done = true;
                return(EncodingName);
            }

            // singular buffer detection
            var       singleUde   = new Ude.CharsetDetector();
            const int udeFeedSize = 4 * 1024;

            // count is a length (it is passed to Feed as one above), so the chunked scan
            // covers [start, start + count). Using count as the end index would cut the
            // scan short, or skip it entirely, whenever start is non-zero.
            int end = start + count;

            for (var pos = start; pos < end; pos += udeFeedSize)
            {
                singleUde.Reset();
                // The final chunk may be shorter than udeFeedSize.
                singleUde.Feed(inputData, pos, Math.Min(udeFeedSize, end - pos));
                singleUde.DataEnd();
                // update encoding frequency
                if (singleUde.Confidence > 0.3 && !String.IsNullOrEmpty(singleUde.Charset))
                {
                    IncrementFrequency(singleUde.Charset);
                }
            }
            // vote for best encoding
            EncodingName = GetCurrentEncoding();
            // update current encoding name
            return(EncodingName);
        }
Example #16
0
        /// <summary>
        /// Sets _encoding from the charset Ude detects in _file, defaulting to UTF-8 when
        /// nothing is detected.
        /// </summary>
        private void GuessEncoding()
        {
            using var input = File.OpenRead(_file);

            var detector = new Ude.CharsetDetector();
            detector.Feed(input);
            detector.DataEnd();

            string charset = detector.Charset;
            _encoding = charset == null ? Encoding.UTF8 : Encoding.GetEncoding(charset);
        }
Example #17
0
        /// <summary>
        /// Resets this instance's stream-length, detector and encoding bookkeeping fields to
        /// their cleared values (zero, null or false).
        /// </summary>
        protected virtual void Reset()
        {
            maxStreamLength = 0;

            // Drop the detector and any previously determined encoding.
            detector                 = null;
            encoding                 = null;
            encodingBytesRead        = 0;
            lineAtBufferEndCompleted = false;
        }
        /// <summary>
        /// Feeds a stream to a fresh Ude detector and returns the charset name it reports.
        /// </summary>
        /// <param name="stream">Stream whose content is examined.</param>
        /// <returns>The detected charset name, or null when nothing was detected.</returns>
        private string GetCharset(Stream stream)
        {
            var udeDetector = new Ude.CharsetDetector();
            udeDetector.Feed(stream);
            udeDetector.DataEnd();

            string charset = udeDetector.Charset;
            return charset;
        }
Example #19
0
        /// <summary>
        /// Encodes <paramref name="text"/> as UTF-8 and returns the charset name Ude detects
        /// for those bytes.
        /// </summary>
        /// <remarks>
        /// Per the original author's note, this only returns ascii or utf8 and is currently
        /// used for email only.
        /// </remarks>
        /// <param name="text">Text to examine.</param>
        /// <returns>The charset name reported by the detector.</returns>
        public static string GetTextCharset(string text)
        {
            byte[] utf8Bytes = System.Text.Encoding.UTF8.GetBytes(text);

            var udeDetector = new Ude.CharsetDetector();
            udeDetector.Feed(utf8Bytes, 0, utf8Bytes.Length);
            udeDetector.DataEnd();

            string charset = udeDetector.Charset;
            return charset;
        }
Example #20
0
        /// <summary>
        /// Detects the encoding of textual data of the specified input data.
        /// </summary>
        /// <remarks>
        /// Once a result is final (Done), later calls return the stored name. The first call
        /// also checks whether the data is textual and whether it starts with a byte order
        /// mark. If the shared detector does not reach a final answer, the range is examined
        /// again in independent 4 KiB chunks and each chunk result with a confidence above
        /// 0.3 is appended to singleEncodings, as long as that list holds fewer than 2000 entries.
        /// </remarks>
        /// <param name="inputData">The input data.</param>
        /// <param name="start">Index of the first byte to examine.</param>
        /// <param name="count">Number of bytes to examine, starting at <paramref name="start"/>.</param>
        /// <returns>Detected encoding name</returns>
        public string Detect(byte[] inputData, int start, int count)
        {
            if (Done)
            {
                return(EncodingName);
            }
            if (!_started)
            {
                Reset();
                _started = true;
                if (!CheckForTextualData(inputData, start, count))
                {
                    IsText = false;
                    Done   = true;
                    return(EncodingName);
                }
                HasByteOrderMark = CheckForByteOrderMark(inputData, start);
                IsText           = true;
            }

            // execute charset detector
            ude.Feed(inputData, start, count);
            ude.DataEnd();
            if (ude.IsDone() && !String.IsNullOrEmpty(ude.Charset))
            {
                Done = true;
                return(EncodingName);
            }

            const int bufferSize = 4 * 1024;

            // singular buffer detection
            if (singleEncodings.Count < 2000)
            {
                var u = new Ude.CharsetDetector();

                // count is a length (it is passed to Feed as one above), so the chunked
                // scan covers [start, start + count). Using count as the end index would
                // cut the scan short, or skip it entirely, whenever start is non-zero.
                int end = start + count;

                for (var i = start; i < end; i += bufferSize)
                {
                    u.Reset();
                    // The final chunk may be shorter than bufferSize.
                    u.Feed(inputData, i, Math.Min(bufferSize, end - i));
                    u.DataEnd();
                    if (u.Confidence > 0.3 && !String.IsNullOrEmpty(u.Charset))
                    {
                        singleEncodings.Add(u.Charset);
                    }
                }
            }
            return(EncodingName);
        }
Example #21
0
 /// <summary>
 /// Returns the charset name Ude detects in a file stream.
 /// </summary>
 /// <param name="fs">Open file stream to examine.</param>
 /// <returns>The detected charset name, or the literal "Error" when nothing was detected.</returns>
 public static string DetectEncoding(FileStream fs)
 {
     var detector = new Ude.CharsetDetector();
     detector.Feed(fs);
     detector.DataEnd();

     string charset = detector.Charset;
     return charset != null ? charset : "Error";
 }
Example #22
0
 /// <summary>
 /// Detects the encoding of a file with Ude.
 /// </summary>
 /// <param name="fileName">Path of the file to inspect.</param>
 /// <returns>The encoding for the detected charset, or _DEFAULT_ENCODING when nothing was detected.</returns>
 private Encoding GetEncoding(string fileName)
 {
     using (FileStream input = File.OpenRead(fileName))
     {
         Ude.ICharsetDetector detector = new Ude.CharsetDetector();
         detector.Feed(input);
         detector.DataEnd();

         string charset = detector.Charset;
         return charset == null ? _DEFAULT_ENCODING : Encoding.GetEncoding(charset);
     }
 }
Example #23
0
        /// <summary>
        /// Lets the user pick an SRT file, detects its character encoding with Ude and adds it
        /// to the SRT subtitle list. When the encoding cannot be determined or mapped, a
        /// message box is shown and "UTF-8" is used. Warning visibility is refreshed whether
        /// or not a file was picked.
        /// </summary>
        private void AddSrtSubtitleImpl()
        {
            string srtFile = FileService.Instance.GetFileNameLoad(
                Config.RememberPreviousFiles ? Config.LastSrtFolder : null,
                SubtitleRes.SrtFilePickerText,
                Utilities.GetFilePickerFilter("srt"));

            if (srtFile != null)
            {
                if (Config.RememberPreviousFiles)
                {
                    Config.LastSrtFolder = Path.GetDirectoryName(srtFile);
                }

                string characterCode = null;
                using (FileStream srtFileStream = File.OpenRead(srtFile))
                {
                    var udeDetector = new Ude.CharsetDetector();
                    udeDetector.Feed(srtFileStream);
                    udeDetector.DataEnd();

                    string udeCharset = udeDetector.Charset;
                    if (udeCharset != null)
                    {
                        this.logger.Log($"Detected encoding {udeCharset} for {srtFile} with confidence {udeDetector.Confidence}.");

                        // Map the Ude charset name onto one of the application's character codes.
                        characterCode = CharCode.FromUdeCode(udeCharset);
                        this.logger.Log(characterCode == null
                            ? "Detected encoding does not match with any available encoding."
                            : "Picked encoding " + characterCode);
                    }

                    // Covers both a failed detection and a charset with no matching character code.
                    if (characterCode == null)
                    {
                        Ioc.Get<IMessageBoxService>().Show(this, SubtitleRes.SubtitleCharsetDetectionFailedMessage);
                        characterCode = "UTF-8";
                    }
                }

                var newSubtitle = new SrtSubtitle
                {
                    FileName      = srtFile,
                    Default       = false,
                    CharacterCode = characterCode,
                    LanguageCode  = LanguageUtilities.GetDefaultLanguageCode(),
                    Offset        = 0
                };
                this.srtSubtitles.Add(new SrtSubtitleViewModel(this, newSubtitle));
            }

            this.UpdateWarningVisibility();
        }
 /// <summary>
 /// Returns the charset name Ude detects in a file.
 /// </summary>
 /// <param name="fileName">Path of the file to inspect.</param>
 /// <returns>The detected charset name, or an empty string when nothing was detected.</returns>
 public static string GetCharset(string fileName)
 {
     using (FileStream input = File.OpenRead(fileName))
     {
         var detector = new Ude.CharsetDetector();
         detector.Feed(input);
         detector.DataEnd();

         string charset = detector.Charset;
         return charset != null ? charset : "";
     }
 }
Example #25
0
 /// <summary>
 /// Returns the charset name Ude detects in a file.
 /// </summary>
 /// <param name="filename">Path of the file to inspect.</param>
 /// <returns>The detected charset name, or null when nothing was detected.</returns>
 private static string GetEncoding(string filename)
 {
     using (FileStream input = File.OpenRead(filename))
     {
         var detector = new Ude.CharsetDetector();
         detector.Feed(input);
         detector.DataEnd();

         string charset = detector.Charset;
         return charset;
     }
 }
        /// <summary>
        /// Detects the encoding of a stream with Ude and then sets the stream position back to 0.
        /// </summary>
        /// <param name="stream">Stream to examine; its Position is assigned, so it must support seeking.</param>
        /// <returns>
        /// The result of GetEncodingFromString for the detected charset, or UTF-8 when nothing
        /// was detected.
        /// </returns>
        private Encoding DetectEncoding(Stream stream)
        {
            var detector = new Ude.CharsetDetector();
            detector.Feed(stream);
            detector.DataEnd();

            string   charset  = detector.Charset;
            Encoding detected = charset != null ? GetEncodingFromString(charset) : Encoding.UTF8;

            // Reset the position so the stream can be read again from the start.
            stream.Position = 0;
            return detected;
        }
Example #27
0
        /// <summary>
        /// Detects the encoding of a file with Ude.
        /// </summary>
        /// <param name="fileName">Path of the file to inspect.</param>
        /// <returns>The encoding for the detected charset.</returns>
        /// <exception cref="ArgumentException">No charset could be detected.</exception>
        private static Encoding GetFileEncoding(string fileName)
        {
            string charset;
            using (var input = File.OpenRead(fileName))
            {
                var detector = new Ude.CharsetDetector();
                detector.Feed(input);
                detector.DataEnd();
                charset = detector.Charset;
            }

            if (charset == null)
            {
                throw new ArgumentException("Error in reading charset.");
            }

            return Encoding.GetEncoding(charset);
        }
 /// <summary>
 /// Detects the encoding of a stream with Ude and seeks the stream back to its beginning.
 /// </summary>
 /// <param name="fs">Stream to examine; Seek is called on it, so it must support seeking.</param>
 /// <returns>The encoding for the detected charset, or the system default when nothing was detected.</returns>
 public static System.Text.Encoding DetectCsvEncoding(Stream fs)
 {
     var detector = new Ude.CharsetDetector();
     detector.Feed(fs);
     detector.DataEnd();

     // Seek back to the beginning before returning, whatever the detection outcome.
     fs.Seek(0, SeekOrigin.Begin);

     string charset = detector.Charset;
     if (charset == null)
     {
         return System.Text.Encoding.Default;
     }

     s_logger.Debug("Charset: {}, confidence: {}", charset, detector.Confidence);
     return System.Text.Encoding.GetEncoding(charset) ?? System.Text.Encoding.Default;
 }
Example #29
0
        /// <summary>
        /// Demo entry point: runs Ude charset detection on "ansi.txt" and "utf8.txt", prints
        /// each result to the console, then waits for a line of input.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            var files = new List<string> { "ansi.txt", "utf8.txt" };

            foreach (string file in files)
            {
                // The file name is written without a newline; the result follows on the same line.
                Console.Write(file);

                using (FileStream input = File.OpenRead(file))
                {
                    var detector = new Ude.CharsetDetector();
                    detector.Feed(input);
                    detector.DataEnd();

                    if (detector.Charset == null)
                    {
                        Console.WriteLine("Detection failed.");
                    }
                    else
                    {
                        Console.WriteLine("Charset: {0}, confidence: {1}",
                                          detector.Charset, detector.Confidence);
                    }
                }
            }

            Console.ReadLine();
        }
Example #30
0
        /// <summary>
        /// Uses the third-party "Ude" component to work out the correct encoding of the content.
        /// </summary>
        /// <param name="context">Stream holding the content to examine.</param>
        /// <returns>The detected encoding, or null when Ude reports no charset.</returns>
        private static Encoding GetUdeEncode(Stream context)
        {
            var udeDetector = new Ude.CharsetDetector();
            udeDetector.Feed(context);
            udeDetector.DataEnd();

            // A missing charset is treated as "nothing detected".
            string charset = udeDetector.Charset;
            return charset == null
                ? null
                : Encoding.GetEncoding(ConvertToCorrectEncodingString(charset));
        }
 /// <summary>
 /// Detects the encoding of a file with Ude.
 /// </summary>
 /// <param name="filename">Path of the file to inspect.</param>
 /// <returns>The encoding for the detected charset, or the system default when nothing was detected.</returns>
 private Encoding GetEncoding(string filename)
 {
     using (FileStream input = File.OpenRead(filename))
     {
         var detector = new Ude.CharsetDetector();
         detector.Feed(input);
         detector.DataEnd();

         string charset = detector.Charset;
         return charset == null ? Encoding.Default : Encoding.GetEncoding(charset);
     }
 }
Example #32
0
        /// <summary>
        /// Parses a BMS file into a BMS object.
        /// </summary>
        /// <remarks>
        /// The file is always decoded with code page 932 (Shift-JIS). Each line is trimmed and
        /// handed to ProcessBMSLine; SetSubtitle, CalculatePulse and FillRealTime then run on
        /// the result.
        /// </remarks>
        /// <param name="path">Path of the BMS file.</param>
        /// <returns>The parsed BMS, or null when <paramref name="path"/> does not exist.</returns>
        public BMS Parse(string path)
        {
            BMS bms = new BMS();
            //default encoding: Shift-JIS?
            Encoding encoding = Encoding.GetEncoding(932);
            string line;

            if (!File.Exists(path))
            {
                return null;
            }

            // Charset detection is intentionally not run here: its result was never applied
            // to the reader, so it only cost an extra full read of the file.
            using (StreamReader sr = new StreamReader(path, encoding))
            {
                bms.path = Directory.GetParent(path).FullName;

                while((line = sr.ReadLine()) != null)
                {
                    ProcessBMSLine(line.Trim(), bms);
                }
            }
            SetSubtitle(bms.info);
            CalculatePulse(bms);
            FillRealTime(bms);

            return bms;
        }
Example #33
0
        /// <summary>
        /// Detects the encoding of textual data of the specified input data.
        /// </summary>
        /// <remarks>
        /// Once a result is final (Done), later calls return the stored name. The first call
        /// also checks whether the data is textual and whether it starts with a byte order
        /// mark. If the shared detector does not reach a final answer, the range is examined
        /// again in independent 4 KiB chunks and each chunk result with a confidence above
        /// 0.3 is appended to singleEncodings, as long as that list holds fewer than 2000 entries.
        /// </remarks>
        /// <param name="inputData">The input data.</param>
        /// <param name="start">Index of the first byte to examine.</param>
        /// <param name="count">Number of bytes to examine, starting at <paramref name="start"/>.</param>
        /// <returns>Detected encoding name</returns>
        public string Detect(byte[] inputData, int start, int count)
        {
            if (Done)
                return EncodingName;
            if (!_started)
            {
                Reset ();
                _started = true;
                if (!CheckForTextualData (inputData, start, count))
                {
                    IsText = false;
                    Done = true;
                    return EncodingName;
                }
                HasByteOrderMark = CheckForByteOrderMark (inputData, start);
                IsText = true;
            }

            // execute charset detector
            ude.Feed (inputData, start, count);
            ude.DataEnd ();
            if (ude.IsDone () && !String.IsNullOrEmpty (ude.Charset))
            {
                Done = true;
                return EncodingName;
            }

            const int bufferSize = 4 * 1024;

            // singular buffer detection
            if (singleEncodings.Count < 2000)
            {
                var u = new Ude.CharsetDetector ();

                // count is a length (it is passed to Feed as one above), so the chunked
                // scan covers [start, start + count). Using count as the end index would
                // cut the scan short, or skip it entirely, whenever start is non-zero.
                int end = start + count;

                for (var i = start; i < end; i += bufferSize)
                {
                    u.Reset ();
                    // The final chunk may be shorter than bufferSize.
                    u.Feed (inputData, i, Math.Min (bufferSize, end - i));
                    u.DataEnd ();
                    if (u.Confidence > 0.3 && !String.IsNullOrEmpty (u.Charset))
                        singleEncodings.Add (u.Charset);
                }
            }
            return EncodingName;
        }
Example #34
0
        /// <summary>
        /// Auto detecting encoding from the file.
        /// </summary>
        /// <param name="file">Path to the file to inspect.</param>
        /// <returns>
        /// The detected encoding. For UTF-8 a new <see cref="UTF8Encoding"/> is returned whose
        /// BOM flag reflects whether the file starts with EF BB BF. When detection fails or the
        /// detected charset cannot be resolved, <c>defaultEncoding</c> is returned.
        /// </returns>
        protected virtual Encoding detectEncodingFromFile(string file)
        {
            using(FileStream fs = File.OpenRead(file))
            {
                Ude.CharsetDetector cdet = new Ude.CharsetDetector();
                cdet.Feed(fs);
                cdet.DataEnd();

                if(cdet.Charset == null) {
                    Log.Warn("Problem with detection of encoding for '{0}'", file);
                    return defaultEncoding; // good luck
                }

                Log.Debug("Ude: charset '{0}' confidence: '{1}'", cdet.Charset, cdet.Confidence);

                // The detector may report a charset name that Encoding.GetEncoding cannot resolve;
                // treat that like a failed detection instead of letting the exception escape.
                Encoding enc;
                try {
                    enc = Encoding.GetEncoding(cdet.Charset);
                }
                catch(ArgumentException) {
                    enc = null;
                }
                catch(NotSupportedException) {
                    enc = null;
                }

                if(enc == null) {
                    Log.Warn("Detected charset is not supported, problem with encoding for '{0}'", file);
                    return defaultEncoding;
                }

                // Compare by code page rather than by reference to the Encoding.UTF8 singleton.
                if(enc.CodePage == Encoding.UTF8.CodePage) {
                    fs.Seek(0, SeekOrigin.Begin);
                    bool hasBom = fs.ReadByte() == 0xEF &&
                                  fs.ReadByte() == 0xBB &&
                                  fs.ReadByte() == 0xBF;
                    return new UTF8Encoding(hasBom);
                }

                return enc;
            }
        }
Example #35
0
        /// <summary>
        /// Runs the Mozilla universal charset detector (Ude) over a sample and maps the
        /// reported charset name to an <see cref="Encoding"/>.
        /// </summary>
        /// <param name="bytes">sample data</param>
        /// <returns>Detected encoding or null if not detected</returns>
        /// <history>
        /// [Curtis_Beard]		12/01/2014	Created
        /// </history>
        private static Encoding DetectEncodingUsingMozillaUCD(Byte[] bytes)
        {
            Encoding detected = null;

            try
            {
                Ude.ICharsetDetector detector = new Ude.CharsetDetector();
                detector.Feed(bytes, 0, bytes.Length);
                detector.DataEnd();

                string charsetName = detector.Charset;
                if (charsetName != null)
                {
                    detected = Encoding.GetEncoding(charsetName);
                }
            }
            catch
            {
                // Best effort: any failure is reported to the caller as "not detected".
            }

            return detected;
        }
 /// <summary>
 /// Detects the encoding of a file and stores the result in <c>enc</c>.
 /// </summary>
 /// <param name="path">
 /// File path; <c>DataLoaderConstants.FileExtCsv</c> is appended to it before the file is opened.
 /// </param>
 /// <remarks>
 /// Only UTF-8, UTF-16LE and UTF-16BE are mapped explicitly. Every other result, including a
 /// failed detection, leaves <c>enc</c> set to <see cref="Encoding.Default"/>.
 /// </remarks>
 private void GetEncoding(string path)
 {
     string charset;
     using (FileStream fs = File.OpenRead(path + DataLoaderConstants.FileExtCsv))
     {
         Ude.CharsetDetector cdet = new Ude.CharsetDetector();
         cdet.Feed(fs);
         cdet.DataEnd();
         charset = cdet.Charset;
     }

     // Detection failed: fall back to the default encoding.
     if (charset == null)
     {
         enc = Encoding.Default;
         return;
     }

     // Charset names are identifiers, not linguistic text, so lower-case them culture-invariantly.
     switch (charset.ToLowerInvariant())
     {
         case "utf-8": enc = Encoding.UTF8; break;
         case "utf-16le": enc = Encoding.Unicode; break;
         case "utf-16be": enc = Encoding.BigEndianUnicode; break;
         // "windows-1252" and anything else map to the default encoding
         default: enc = Encoding.Default; break;
     }
 }
        /// <summary>
        /// Auto detecting the encoding from the file
        /// </summary>
        /// <param name="file">Path to the file to inspect.</param>
        /// <returns>
        /// The detected encoding, or <see cref="Encoding.UTF8"/> when detection fails or the
        /// detected charset cannot be resolved by <see cref="Encoding.GetEncoding(string)"/>.
        /// </returns>
        protected virtual Encoding detectEncodingFromFile(string file)
        {
            using(FileStream fs = File.OpenRead(file))
            {
                Ude.CharsetDetector cdet = new Ude.CharsetDetector();
                cdet.Feed(fs);
                cdet.DataEnd();

                if(cdet.Charset == null) {
                    Log.Warn("Problem with detection of encoding for '{0}'", file);
                    return Encoding.UTF8; // good luck
                }

                Log.Debug("Ude: charset '{0}' confidence: '{1}'", cdet.Charset, cdet.Confidence);

                // The detector may report a charset name that this platform cannot resolve;
                // handle it like a failed detection instead of letting the exception escape.
                try {
                    return Encoding.GetEncoding(cdet.Charset);
                }
                catch(ArgumentException) {
                    Log.Warn("Detected charset is not supported, problem with encoding for '{0}'", file);
                    return Encoding.UTF8;
                }
                catch(NotSupportedException) {
                    Log.Warn("Detected charset is not supported, problem with encoding for '{0}'", file);
                    return Encoding.UTF8;
                }
            }
        }
Example #38
0
        /// <summary>
        /// Detects the charset of a file and rewrites the file in place using <see cref="Encoding.UTF8"/>.
        /// The file is left untouched when no charset is detected.
        /// </summary>
        /// <param name="fileName">Path of the file to convert.</param>
        private void ConvertToUtf8(string fileName)
        {
            string detected;
            using (FileStream input = File.OpenRead(fileName))
            {
                var detector = new Ude.CharsetDetector();
                detector.Feed(input);
                detector.DataEnd();
                detected = detector.Charset;
            }

            // Nothing detected: do not touch the file.
            if (detected == null)
            {
                return;
            }

            // Read everything with the detected encoding before the file is reopened for writing.
            string content;
            using (var reader = new StreamReader(fileName, Encoding.GetEncoding(detected)))
            {
                content = reader.ReadToEnd();
            }

            using (var writer = new StreamWriter(fileName, false, Encoding.UTF8))
            {
                if (content.Length != 0)
                {
                    writer.Write(content);
                }
            }
        }
Example #39
0
        /// <summary>
        /// Auto-detector for fixing an incorrectly decoded string.
        /// </summary>
        /// <param name="str">Data for reencoding</param>
        /// <param name="from">Known Encoding for current string</param>
        /// <returns>
        /// The string re-decoded with the auto-detected charset. The original string is returned
        /// when it is null or empty, when detection fails, when the confidence is below 0.92, or
        /// when the detected charset cannot be resolved to an <see cref="Encoding"/>.
        /// </returns>
        protected virtual string reEncodeString(string str, Encoding from)
        {
            if(String.IsNullOrEmpty(str)) {
                return str;
            }

            // Recover the raw bytes the string was decoded from.
            byte[] bytes = from.GetBytes(str);

            Ude.CharsetDetector cdet = new Ude.CharsetDetector();
            cdet.Feed(bytes, 0, bytes.Length);
            cdet.DataEnd();

            if(cdet.Charset == null) {
                Log.Debug("reEncodeString: Problem with detection... use the original");
                return str;
            }
            Log.Debug("reEncodeString: charset '{0}' confidence: '{1}'", cdet.Charset, cdet.Confidence);

            if(cdet.Confidence < 0.92f) {
                Log.Debug("reEncodeString: Confidence < 0.92 /use the original");
                return str;
            }

            // The detector may report a charset name that this platform cannot resolve;
            // keep the original string in that case, like every other failure path here.
            Encoding to;
            try {
                to = Encoding.GetEncoding(cdet.Charset);
            }
            catch(ArgumentException) {
                to = null;
            }
            catch(NotSupportedException) {
                to = null;
            }

            if(to == null) {
                Log.Debug("reEncodeString: Detected charset is not supported /use the original");
                return str;
            }

            Log.Debug("reEncodeString: '{0}' -> '{1}'", from.EncodingName, to.EncodingName);
            Log.Trace("reEncodeString: original - '{0}'", str);
            return to.GetString(bytes);
        }
 /// <summary>
 /// Detects the encoding of a file.
 /// </summary>
 /// <param name="path">Path of the file to inspect.</param>
 /// <returns>
 /// <see cref="Encoding.UTF8"/>, <see cref="Encoding.Unicode"/> or
 /// <see cref="Encoding.BigEndianUnicode"/> when the matching charset is detected;
 /// <see cref="Encoding.Default"/> for every other result, including a failed detection.
 /// </returns>
 private static Encoding GetEncoding(string path)
 {
     string charset;
     // Disposing the stream at the end of the using block closes it; no explicit Close() is needed.
     using (FileStream fs = File.OpenRead(path))
     {
         Ude.CharsetDetector cdet = new Ude.CharsetDetector();
         cdet.Feed(fs);
         cdet.DataEnd();
         charset = cdet.Charset;
     }

     // Detection failed: fall back to the default encoding.
     if (charset == null)
     {
         return Encoding.Default;
     }

     // Charset names are identifiers, not linguistic text, so lower-case them culture-invariantly.
     switch (charset.ToLowerInvariant())
     {
         case "utf-8": return Encoding.UTF8;
         case "utf-16le": return Encoding.Unicode;
         case "utf-16be": return Encoding.BigEndianUnicode;
         // "windows-1252" and anything else map to the default encoding
         default: return Encoding.Default;
     }
 }
        /// <summary>
        /// Detects the encoding of textual data in a slice of the specified input data.
        /// </summary>
        /// <param name="inputData">The buffer that holds the data to analyse.</param>
        /// <param name="start">Index of the first byte of the slice.</param>
        /// <param name="count">Number of bytes in the slice, counted from <paramref name="start"/>.</param>
        /// <returns>Detected encoding name</returns>
        public string Detect (byte[] inputData, int start, int count)
        {
            if (Done)
                return EncodingName;
            if (!_started)
            {
                Reset ();
                _started = true;
                if (!CheckForTextualData (inputData, start, count))
                {
                    // not text at all: stop looking
                    IsText = false;
                    Done = true;
                    return EncodingName;
                }
                HasByteOrderMark = CheckForByteOrderMark (inputData, start);
                IsText = true;
            }

            // execute charset detector
            ude.Feed (inputData, start, count);
            ude.DataEnd ();
            if (ude.IsDone () && !String.IsNullOrEmpty (ude.Charset))
            {
                IncrementFrequency (ude.Charset);
                Done = true;
                return EncodingName;
            }

            // singular buffer detection
            var singleUde = new Ude.CharsetDetector ();
            const int udeFeedSize = 4 * 1024;
            // count is a length (it is passed as such to ude.Feed above), so the
            // slice ends at start + count, not at index count.
            int end = start + count;
            for (var pos = start; pos < end; pos += udeFeedSize)
            {
                singleUde.Reset ();
                // the last chunk may be shorter than udeFeedSize
                singleUde.Feed (inputData, pos, Math.Min (udeFeedSize, end - pos));
                singleUde.DataEnd ();
                // update encoding frequency
                if (singleUde.Confidence > 0.3 && !String.IsNullOrEmpty (singleUde.Charset))
                    IncrementFrequency (singleUde.Charset);
            }
            // vote for best encoding and update current encoding name
            EncodingName = GetCurrentEncoding ();
            return EncodingName;
        }
Example #42
0
        /// <summary>
        /// Re-saves the ".cs" and ".xaml" files found directly in a directory as UTF-8 when their
        /// detected charset is "windows-1251" or "x-mac-cyrillic" (both are read with code page 1251).
        /// Files detected as "ASCII" or "UTF-8", and files whose detection fails, are left untouched;
        /// any other charset is reported on the console and skipped.
        /// </summary>
        /// <param name="d">Directory whose files are processed.</param>
        private static void ProcessFilesInDir(string d)
        {
            var candidates = Directory.GetFiles(d).Where(p =>
            {
                string ext = Path.GetExtension(p);
                return ext == ".cs" || ext == ".xaml";
            });

            foreach (string f in candidates)
            {
                string charset;
                using (var stream = File.OpenRead(f))
                {
                    var detector = new Ude.CharsetDetector();
                    detector.Feed(stream);
                    detector.DataEnd();
                    charset = detector.Charset;
                }

                // Detection failed, or the file needs no conversion.
                if (charset == null || charset == "ASCII" || charset == "UTF-8")
                {
                    continue;
                }

                if (charset != "x-mac-cyrillic" && charset != "windows-1251")
                {
                    Console.Out.WriteLine($"{charset} - {f} - Skipped");
                    continue;
                }

                Encoding sourceEncoding = Encoding.GetEncoding(1251);
                string content = File.ReadAllText(f, sourceEncoding);
                File.WriteAllText(f, content, Encoding.UTF8);
            }
        }
Example #43
0
 /// <summary>
 /// Reads a file once and gathers its charset, code page, BOM details and decoded contents.
 /// </summary>
 /// <param name="file">Path of the file to inspect.</param>
 /// <returns>
 /// The populated info. A fresh, unpopulated <see cref="EncodingFileInfo"/> is returned when
 /// the file does not exist or when any exception is raised while reading or decoding it.
 /// </returns>
 public static EncodingFileInfo GetEncodingFileInfo(String file)
 {
     EncodingFileInfo info = new EncodingFileInfo();
     try
     {
         if (!File.Exists(file))
         {
             return info;
         }

         Byte[] bytes = File.ReadAllBytes(file);
         Int32 size = bytes.Length;

         // Step 1: look for a byte order mark. The order matters: the 4-byte FF FE 00 00
         // mark has to be tested before the 2-byte FF FE mark it starts with.
         Encoding bomEncoding = null;
         Int32 bomLength = 0;
         Boolean utf7Mark = size > 3
             && bytes[0] == 0x2b && bytes[1] == 0x2f && bytes[2] == 0x76
             && (bytes[3] == 0x38 || bytes[3] == 0x39 || bytes[3] == 0x2b || bytes[3] == 0x2f);

         if (size > 2 && bytes[0] == 0xef && bytes[1] == 0xbb && bytes[2] == 0xbf)
         {
             bomEncoding = Encoding.UTF8;
             bomLength = 3;
         }
         else if (size > 3 && bytes[0] == 0xff && bytes[1] == 0xfe && bytes[2] == 0x00 && bytes[3] == 0x00)
         {
             bomEncoding = Encoding.UTF32;
             bomLength = 4;
         }
         else if (utf7Mark)
         {
             bomEncoding = Encoding.UTF7;
             // a trailing 0x2D directly after the 4-byte mark is counted as part of it
             bomLength = (size > 4 && bytes[4] == 0x2D) ? 5 : 4;
         }
         else if (size > 1 && bytes[0] == 0xff && bytes[1] == 0xfe)
         {
             bomEncoding = Encoding.Unicode;
             bomLength = 2;
         }
         else if (size > 1 && bytes[0] == 0xfe && bytes[1] == 0xff)
         {
             bomEncoding = Encoding.BigEndianUnicode;
             bomLength = 2;
         }

         // Step 2: record the charset, either from the mark or from the content itself.
         if (bomEncoding != null)
         {
             info.BomLength = bomLength;
             info.ContainsBOM = true;
             info.Charset = bomEncoding.WebName;
             info.CodePage = bomEncoding.CodePage;
         }
         else if (!ContainsInvalidUTF8Bytes(bytes))
         {
             info.Charset = Encoding.UTF8.WebName;
             info.CodePage = Encoding.UTF8.CodePage;
         }
         else
         {
             // Try detecting using Ude...
             Ude.CharsetDetector detector = new Ude.CharsetDetector();
             detector.Feed(bytes, 0, bytes.Length);
             detector.DataEnd();

             Encoding guessed = detector.Charset != null
                 ? Encoding.GetEncoding(detector.Charset)
                 : Encoding.Default;
             info.Charset = guessed.WebName;
             info.CodePage = guessed.CodePage;
         }

         // Step 3: decode everything that follows the mark (if there is anything).
         if (size > bomLength)
         {
             Encoding decoder = Encoding.GetEncoding(info.CodePage);
             info.Contents = decoder.GetString(bytes, bomLength, size - bomLength);
         }
     }
     catch (Exception)
     {
         // Any failure discards the partial result.
         info = new EncodingFileInfo();
     }
     return info;
 }