// the reason to create this method in the API is that certain encodings are not
        // grep-able under certain tools that we use heavily, like MINGW32's grep in this case
        // (not sure if more tools are affected)
        /// <summary>
        /// Reports whether the detected encoding of <paramref name="file"/> is one that
        /// grep-like tools can search.
        /// </summary>
        /// <param name="file">File whose contents are fed to the charset detector.</param>
        /// <param name="charSet">Receives the detected charset name, or null when detection fails.</param>
        /// <returns>false only when the detected charset is exactly "UTF-16LE"; true otherwise.</returns>
        public static bool IsEncodingGrepable(FileInfo file, out string charSet)
        {
            using (FileStream input = File.OpenRead(file.FullName))
            {
                var detector = new Ude.CharsetDetector();
                detector.Feed(input);
                detector.DataEnd();
                charSet = detector.Charset;
            }

            // A failed detection (null) is treated as grepable; only UTF-16LE is rejected.
            return charSet != "UTF-16LE";
        }
Example #2
0
        /// <summary>
        /// Reads the BMS file at <paramref name="path"/> line by line and builds a <c>BMS</c> object from it.
        /// </summary>
        /// <param name="path">Path of the BMS file to read.</param>
        /// <returns>The populated BMS object, or null when the file does not exist.</returns>
        public BMS Parse(string path)
        {
            BMS bms = new BMS();
            // The file is always decoded as code page 932 (Shift-JIS).
            // A previous charset-detection pass streamed the whole file through a detector
            // but its result was never applied, so that dead pass has been removed.
            Encoding encoding = Encoding.GetEncoding(932);

            if (!File.Exists(path))
            {
                return null;
            }

            using (StreamReader sr = new StreamReader(path, encoding))
            {
                bms.path = Directory.GetParent(path).FullName;

                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    ProcessBMSLine(line.Trim(), bms);
                }
            }

            // Post-processing steps run once every line has been consumed.
            SetSubtitle(bms.info);
            CalculatePulse(bms);
            FillRealTime(bms);

            return bms;
        }
Example #3
0
        /// <summary>
        /// Guesses the encoding of the entry names stored in the archive at <paramref name="path"/>.
        /// </summary>
        /// <param name="path">Path of the archive to open.</param>
        /// <returns>
        /// The encoding named by the charset detector, or the code page 1252 encoding when
        /// detection fails or the detected charset name is not supported by the runtime.
        /// </returns>
        static Encoding DetectArchiveFileEncoding(string path)
        {
            // The archive is opened with code page 1252 for names, and each name is converted
            // back to bytes with that same encoding before being handed to the detector.
            Encoding encoding = Encoding.GetEncoding(1252);
            var      options  = new ReaderOptions {
                ArchiveEncoding = new ArchiveEncoding(encoding, encoding)
            };
            var detector = new Ude.CharsetDetector();

            using (var archive = ArchiveFactory.Open(path, options)) {
                foreach (var entry in archive.Entries)
                {
                    // GetBytes throws on a null string, so entries without a key are skipped.
                    if (entry.Key == null)
                    {
                        continue;
                    }

                    byte[] buffer = encoding.GetBytes(entry.Key);
                    detector.Feed(buffer, 0, buffer.Length);
                }
            }
            detector.DataEnd();
            Log.Information($"{path} charset: {detector.Charset} confidence: {detector.Confidence}");

            // Charset is null when nothing could be detected (e.g. an empty archive).
            if (detector.Charset == null)
            {
                return(encoding);
            }

            try
            {
                return(Encoding.GetEncoding(detector.Charset));
            }
            catch (ArgumentException)
            {
                // The detector can report names the runtime has no encoding for.
                return(encoding);
            }
        }
Example #4
0
        /// <summary>
        /// Builds an <c>SrtSubtitle</c> for the file at <paramref name="srtPath"/>, choosing its
        /// character code from the detected charset of the file.
        /// </summary>
        /// <param name="srtPath">Path of the SRT file.</param>
        /// <returns>
        /// The subtitle description, or null when any exception is raised while loading
        /// (the exception is logged).
        /// </returns>
        public SrtSubtitle LoadSrtSubtitle(string srtPath)
        {
            try
            {
                string pickedCode = null;
                using (FileStream input = File.OpenRead(srtPath))
                {
                    var charsetDetector = new Ude.CharsetDetector();
                    charsetDetector.Feed(input);
                    charsetDetector.DataEnd();

                    string detected = charsetDetector.Charset;
                    if (detected != null)
                    {
                        this.logger.Log($"Detected encoding {detected} for {srtPath} with confidence {charsetDetector.Confidence}.");
                        pickedCode = CharCode.FromUdeCode(detected);

                        string outcome = pickedCode == null
                            ? "Detected encoding does not match with any available encoding."
                            : "Picked encoding " + pickedCode;
                        this.logger.Log(outcome);
                    }

                    // No usable code: tell the user and fall back to UTF-8.
                    if (pickedCode == null)
                    {
                        StaticResolver.Resolve <IMessageBoxService>().Show(this, SubtitleRes.SubtitleCharsetDetectionFailedMessage);
                        pickedCode = "UTF-8";
                    }
                }

                var subtitle = new SrtSubtitle();
                subtitle.FileName      = srtPath;
                subtitle.Default       = false;
                subtitle.CharacterCode = pickedCode;
                subtitle.LanguageCode  = LanguageUtilities.GetDefaultLanguageCode();
                subtitle.Offset        = 0;
                return subtitle;
            }
            catch (Exception exception)
            {
                this.logger.LogError("Could not load SRT file: " + exception);
                return null;
            }
        }
Example #5
0
        /// <summary>
        /// Reads the whole file at <paramref name="_path"/> and decodes it with the charset
        /// reported by the detector (UTF-8 when detection fails).
        /// </summary>
        /// <param name="_path">Path of the file to read.</param>
        /// <returns>The decoded text with any leading byte-order mark removed.</returns>
        public static string LoadContentFromFile(string _path)
        {
            string fileContents;

            System.Text.Encoding encoding;
            FileInfo             _file = new FileInfo(_path);

            using (FileStream fs = _file.OpenRead())
            {
                Ude.CharsetDetector cdet = new Ude.CharsetDetector();
                cdet.Feed(fs);
                cdet.DataEnd();
                encoding = cdet.Charset != null
                    ? System.Text.Encoding.GetEncoding(cdet.Charset)
                    : System.Text.Encoding.UTF8;

                // Rewind: the detector has consumed the stream.
                fs.Position = 0;

                // Stream.Read may return fewer bytes than requested, so loop until the
                // buffer is full or the stream ends.
                byte[] ar     = new byte[_file.Length];
                int    filled = 0;
                while (filled < ar.Length)
                {
                    int read = fs.Read(ar, filled, ar.Length - filled);
                    if (read <= 0)
                    {
                        break;
                    }
                    filled += read;
                }
                fileContents = encoding.GetString(ar, 0, filled);
            }

            // Strip a leading byte-order mark. The previous check compared against a literal
            // that matched every string, so three characters were removed from all content
            // (and content shorter than three characters threw).
            if (fileContents.StartsWith("\uFEFF", System.StringComparison.Ordinal))
            {
                // BOM decoded as a single U+FEFF character.
                fileContents = fileContents.Substring(1);
            }
            else if (fileContents.StartsWith("\u00EF\u00BB\u00BF", System.StringComparison.Ordinal))
            {
                // UTF-8 BOM bytes decoded by a single-byte code page.
                fileContents = fileContents.Substring(3);
            }

            if (encoding != System.Text.Encoding.UTF8)
            {
                // Round-trip through UTF-8, kept from the original implementation.
                var datas = System.Text.Encoding.UTF8.GetBytes(fileContents);
                fileContents = System.Text.Encoding.UTF8.GetString(datas);
            }

            return(fileContents);
        }
Example #6
0
 /// <summary>
 /// Classifies the detected charset of the file at <paramref name="path"/> as a code string.
 /// </summary>
 /// <param name="path">Path of the file to inspect.</param>
 /// <returns>
 /// "1" when the detected charset name contains "windows" (and not "UTF");
 /// "0" in every other case, including when no charset could be detected.
 /// </returns>
 public static string GetCode(string path)
 {
     string charset;
     using (FileStream fs = File.OpenRead(path))
     {
         Ude.CharsetDetector cdet = new Ude.CharsetDetector();
         cdet.Feed(fs);
         cdet.DataEnd();
         charset = cdet.Charset;
     }

     // Charset is null when detection fails; previously this caused a NullReferenceException.
     if (charset == null || charset.Contains("UTF"))
     {
         return("0");
     }

     return(charset.Contains("windows") ? "1" : "0");
 }
Example #7
0
        /// <summary>
        /// Decodes <paramref name="text"/> with the charset reported by the detector
        /// (UTF-8 when detection fails).
        /// </summary>
        /// <param name="text">Raw bytes of the text.</param>
        /// <returns>The decoded text with any leading byte-order mark removed.</returns>
        public static string LoadContentFromText(byte[] text)
        {
            System.Text.Encoding encoding;

            using (MemoryStream fs = new MemoryStream(text))
            {
                Ude.CharsetDetector cdet = new Ude.CharsetDetector();
                cdet.Feed(fs);
                cdet.DataEnd();
                encoding = cdet.Charset != null
                    ? System.Text.Encoding.GetEncoding(cdet.Charset)
                    : System.Text.Encoding.UTF8;
            }

            // The bytes are already in memory, so decode them directly.
            string textContents = encoding.GetString(text);

            // Strip a leading byte-order mark. The previous check compared against a literal
            // that matched every string, so three characters were removed from all content
            // (and content shorter than three characters threw).
            if (textContents.StartsWith("\uFEFF", System.StringComparison.Ordinal))
            {
                // BOM decoded as a single U+FEFF character.
                textContents = textContents.Substring(1);
            }
            else if (textContents.StartsWith("\u00EF\u00BB\u00BF", System.StringComparison.Ordinal))
            {
                // UTF-8 BOM bytes decoded by a single-byte code page.
                textContents = textContents.Substring(3);
            }

            if (encoding != System.Text.Encoding.UTF8)
            {
                // Round-trip through UTF-8, kept from the original implementation.
                var datas = System.Text.Encoding.UTF8.GetBytes(textContents);
                textContents = System.Text.Encoding.UTF8.GetString(datas);
            }

            return(textContents);
        }
Example #8
0
        /// <summary>
        /// Determines the encoding of <paramref name="file"/>, first with the charset detector
        /// and, if that yields nothing, from what a <see cref="StreamReader"/> reports after one read.
        /// </summary>
        /// <param name="file">File to inspect.</param>
        /// <returns>The encoding found, or null when neither approach produced one.</returns>
        public static Encoding DetectEncoding(FileInfo file)
        {
            string   fullPath = file.FullName;
            Encoding result   = null;

            // First attempt: charset detection. Any failure here just leaves result null.
            try
            {
                using (FileStream input = File.OpenRead(fullPath))
                {
                    var detector = new Ude.CharsetDetector();
                    detector.Feed(input);
                    detector.DataEnd();

                    string charset = detector.Charset;
                    if (charset != null)
                    {
                        result = GetEncodingFromUdeCharset(charset);
                    }
                }
            }
            catch (Exception)
            {
            }

            if (result != null)
            {
                return result;
            }

            // Second attempt: let StreamReader decide after reading one character.
            try
            {
                using (var probe = new StreamReader(fullPath))
                {
                    probe.Read();
                    result = probe.CurrentEncoding;
                }
            }
            catch (IOException)
            {
                // result stays as it was
            }

            return result;
        }
        /// <summary>
        /// Determines the text encoding of a file from its detected charset.
        /// </summary>
        /// <param name="path">Path of the file to inspect.</param>
        /// <returns>
        /// UTF-8, UTF-16 little-endian or UTF-16 big-endian when the detector reports one of
        /// those; <see cref="Encoding.Default"/> for anything else, including a failed detection.
        /// </returns>
        private static Encoding GetEncoding(string path)
        {
            string charset;

            using (FileStream input = File.OpenRead(path))
            {
                var detector = new Ude.CharsetDetector();
                detector.Feed(input);
                detector.DataEnd();
                charset = detector.Charset;
            }

            // Detection failed: use the default encoding.
            if (charset == null)
            {
                return Encoding.Default;
            }

            if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase))
            {
                return Encoding.UTF8;
            }

            if (string.Equals(charset, "utf-16le", StringComparison.OrdinalIgnoreCase))
            {
                return Encoding.Unicode;
            }

            if (string.Equals(charset, "utf-16be", StringComparison.OrdinalIgnoreCase))
            {
                return Encoding.BigEndianUnicode;
            }

            // "windows-1252" and every other charset map to the default encoding.
            return Encoding.Default;
        }
Example #10
0
        /// <summary>
        /// Chooses an encoding for <paramref name="databytes"/>, trying in order: the charset
        /// declared in <paramref name="ContentType"/>, the result of <c>W3Encoding.PreScanEncoding</c>,
        /// the charset detector, and finally the system default encoding.
        /// </summary>
        /// <param name="databytes">Raw content bytes.</param>
        /// <param name="ContentType">Optional Content-Type value that may carry a charset.</param>
        /// <returns>The first encoding that one of the steps above produces.</returns>
        public static Encoding GetEncoding(ref byte[] databytes, string ContentType = null)
        {
            Encoding encoding = null;

            if (!string.IsNullOrEmpty(ContentType))
            {
                var charset = Kooboo.Lib.Helper.W3Encoding.ExtractCharset(ContentType);
                if (!string.IsNullOrEmpty(charset))
                {
                    // Encoding.GetEncoding throws for unknown names rather than returning null;
                    // a bogus declared charset must fall through to the next strategy.
                    try
                    {
                        encoding = System.Text.Encoding.GetEncoding(charset);
                    }
                    catch (System.ArgumentException)
                    {
                        encoding = null;
                    }

                    if (encoding != null)
                    {
                        return(encoding);
                    }
                }
            }

            encoding = W3Encoding.PreScanEncoding(databytes);
            if (encoding != null)
            {
                return(encoding);
            }

            Ude.CharsetDetector detector = new Ude.CharsetDetector();
            detector.Feed(databytes, 0, databytes.Length);
            detector.DataEnd();

            if (!string.IsNullOrWhiteSpace(detector.Charset))
            {
                // The detector can report names the runtime has no encoding for.
                try
                {
                    encoding = System.Text.Encoding.GetEncoding(detector.Charset);
                }
                catch (System.ArgumentException)
                {
                    encoding = null;
                }
            }

            if (encoding == null)
            {
                encoding = System.Text.Encoding.GetEncoding(W3Encoding.SystemDefaultEncoding);
            }

            return(encoding);
        }
Example #11
0
        /// <summary>
        /// Reads the file at <c>Tab.PathContent</c>, decodes it with the detected charset
        /// (UTF-8 when detection yields nothing) and pushes the text as the tab content.
        /// </summary>
        /// <param name="ReplaceEncoding">
        /// When true, the tab's stored encoding is replaced by the code page of the encoding
        /// used for reading, and the tab update is pushed.
        /// </param>
        /// <returns>Always true when the method completes without throwing.</returns>
        public async Task <bool> ReadFile(bool ReplaceEncoding)
        {
            // Awaited instead of blocking the calling thread on the async operation.
            StorageFile file = await StorageFile.GetFileFromPathAsync(Tab.PathContent);

            // Charset detection reads the whole file, so it is run off the calling thread.
            string detected = await Task.Run(() =>
            {
                using (FileStream fs = File.OpenRead(Tab.PathContent))
                {
                    var cdet = new Ude.CharsetDetector();
                    cdet.Feed(fs);
                    cdet.DataEnd();
                    return cdet.Charset;
                }
            });

            string   encode_type = string.IsNullOrEmpty(detected) ? "utf-8" : detected;
            Encoding encoding    = Encoding.GetEncoding(encode_type);

            using (var st = new StreamReader(await file.OpenStreamForReadAsync(), encoding))
            {
                await TabsWriteManager.PushTabContentViaIDAsync(new TabID { ID_Tab = Tab.ID, ID_TabsList = ListTabsID }, await st.ReadToEndAsync(), true);

                if (ReplaceEncoding)
                {
                    Tab.TabEncoding = encoding.CodePage;
                    await TabsWriteManager.PushUpdateTabAsync(Tab, ListTabsID);
                }
            }

            return(true);
        }
Example #12
0
            /// <summary>
            /// Opens <paramref name="fname"/> for reading with the encoding detected from
            /// (at most) its first 5000 bytes.
            /// </summary>
            /// <param name="fname">Path of the file to open.</param>
            /// <returns>
            /// A reader using the detected encoding, or a reader with byte-order-mark detection
            /// enabled when no charset is detected or the detected one cannot be used.
            /// </returns>
            StreamReader GetStreamReader(string fname)
            {
                byte[] bytes;
                using (BinaryReader reader = new BinaryReader(File.OpenRead(fname)))
                {
                    // ReadBytes returns fewer than 5000 bytes for short files.
                    bytes = reader.ReadBytes(5000);
                }

                /* https://github.com/errepi/ude/tree/master/src/Library */
                Ude.CharsetDetector detector = new Ude.CharsetDetector();
                // Feed exactly what was read; a fixed length of 5000 overran short buffers.
                detector.Feed(bytes, 0, bytes.Length);
                detector.DataEnd();
                if (detector.Charset != null)
                {
                    try
                    {
                        return(new StreamReader(fname, Portable.Text.Encoding.GetEncoding(detector.Charset)));
                    }
                    catch (Exception)
                    {
                        // Best effort: fall through to the byte-order-mark based reader below.
                    }
                }
                return(new StreamReader(fname, true));
            }
Example #13
0
        /// <summary>
        /// Attempts to repair a string that was decoded with the wrong encoding.
        /// </summary>
        /// <param name="input">Text to repair.</param>
        /// <param name="container">Encoding used to turn <paramref name="input"/> back into bytes.</param>
        /// <param name="confidence">Minimum detector confidence required to accept the detected charset.</param>
        /// <returns>
        /// <paramref name="input"/> itself when it is null, empty or whitespace; null when no charset
        /// is detected or the detector's confidence is below <paramref name="confidence"/>;
        /// otherwise the bytes decoded with the detected charset.
        /// </returns>
        public string FixEncoding(string input, Encoding container, float confidence = 0.92f)
        {
            if (string.IsNullOrWhiteSpace(input))
            {
                return input;
            }

            if (container == null)
            {
                throw new ArgumentNullException(nameof(container));
            }

            byte[] raw      = container.GetBytes(input);
            var    detector = new Ude.CharsetDetector();

            detector.Feed(raw, 0, raw.Length);
            detector.DataEnd();

            string charset = detector.Charset;
            if (charset == null)
            {
                return null;
            }

            float detected = detector.Confidence;
            Log.Debug($"{nameof(FixEncoding)}: charset '{charset}' confidence: '{detected}'");

            // Reject guesses the detector is not sure enough about.
            if (detected < confidence)
            {
                Log.Debug($"{nameof(FixEncoding)}: Confidence < {confidence}");
                return null;
            }

            Encoding target = Encoding.GetEncoding(charset);

            Log.Debug($"ReEncodeString: '{container.EncodingName}' -> '{target.EncodingName}'");
            Log.Trace($"ReEncodeString: original - '{input}'");
            return target.GetString(raw);
        }
Example #14
0
        /// <summary>
        /// Detects the charset of the CSV file at <paramref name="path"/> (with
        /// <c>DataLoaderConstants.FileExtCsv</c> appended) and stores the matching encoding in <c>enc</c>.
        /// </summary>
        /// <param name="path">Path of the file, without the CSV extension.</param>
        private void GetEncoding(string path)
        {
            string charset;

            using (FileStream input = File.OpenRead(path + DataLoaderConstants.FileExtCsv))
            {
                var detector = new Ude.CharsetDetector();
                detector.Feed(input);
                detector.DataEnd();
                charset = detector.Charset;
            }

            if (charset == null)
            {
                // Detection failed: use the default encoding.
                enc = Encoding.Default;
            }
            else if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase))
            {
                enc = Encoding.UTF8;
            }
            else if (string.Equals(charset, "utf-16le", StringComparison.OrdinalIgnoreCase))
            {
                enc = Encoding.Unicode;
            }
            else if (string.Equals(charset, "utf-16be", StringComparison.OrdinalIgnoreCase))
            {
                enc = Encoding.BigEndianUnicode;
            }
            else
            {
                // "windows-1252" and every other charset map to the default encoding.
                enc = Encoding.Default;
            }
        }
Example #15
0
        /// <summary>
        /// Runs the charset detector over <paramref name="stream"/> and returns the matching encoding.
        /// </summary>
        /// <param name="stream">Stream to inspect; may be null.</param>
        /// <param name="confidence">
        /// Receives the detector's confidence, or 0 when the stream is null or nothing was detected.
        /// </param>
        /// <returns>The detected encoding, or null when the stream is null or detection fails.</returns>
        public Encoding Detect(Stream stream, out float confidence)
        {
            confidence = 0;

            if (stream == null)
            {
                return null;
            }

            var detector = new Ude.CharsetDetector();
            detector.Feed(stream);
            detector.DataEnd();

            string charset = detector.Charset;
            if (charset == null)
            {
                return null;
            }

            confidence = detector.Confidence;

            Log.Debug($"Detected charset '{charset}' confidence: '{confidence}'");
            return Encoding.GetEncoding(charset);
        }
Example #16
0
        /// <summary>
        /// Detects the encoding of <paramref name="filename"/> with the supplied detector.
        /// </summary>
        /// <param name="filename">Path of the file to inspect.</param>
        /// <param name="charsetDetector">Detector to use; it is reset before the file is fed to it.</param>
        /// <param name="srcEncoding">Encoding returned when the detected charset cannot be resolved.</param>
        /// <returns>
        /// The encoding for the detected charset, or <paramref name="srcEncoding"/> when resolving
        /// it throws (a message is written to the console in that case).
        /// </returns>
        private static Encoding GetTextEncoding(string filename, Ude.CharsetDetector charsetDetector, Encoding srcEncoding)
        {
            using (FileStream input = File.OpenRead(filename))
            {
                charsetDetector.Reset();
                charsetDetector.Feed(input);
                charsetDetector.DataEnd();
            }

            try
            {
                return Encoding.GetEncoding(charsetDetector.Charset);
            }
            catch (Exception ex)
            {
                Console.WriteLine("Failed to obtain encoding of the file: {0} to {1}. Using source encoding instead. Corrupted characters may occur. {2}", filename, charsetDetector.Charset, ex.Message);
                return srcEncoding;
            }
        }
Example #17
0
        /// <summary>
        /// Rewrites every ".cs" and ".xaml" file directly inside <paramref name="d"/> that is
        /// detected as "windows-1251" or "x-mac-cyrillic" as UTF-8, reading it with code page 1251.
        /// Files detected as "ASCII" or "UTF-8", and files with no detected charset, are left alone;
        /// any other charset is reported on the console and skipped.
        /// </summary>
        /// <param name="d">Directory whose files are processed.</param>
        private static void ProcessFilesInDir(string d)
        {
            foreach (string candidate in Directory.GetFiles(d))
            {
                string extension = Path.GetExtension(candidate);
                if (extension != ".cs" && extension != ".xaml")
                {
                    continue;
                }

                string charset;
                using (var input = File.OpenRead(candidate))
                {
                    var detector = new Ude.CharsetDetector();
                    detector.Feed(input);
                    detector.DataEnd();
                    charset = detector.Charset;
                }

                // Nothing detected, or already in an acceptable encoding.
                if (charset == null || charset == "ASCII" || charset == "UTF-8")
                {
                    continue;
                }

                if (charset != "x-mac-cyrillic" && charset != "windows-1251")
                {
                    Console.Out.WriteLine($"{charset} - {candidate} - Skipped");
                    continue;
                }

                Encoding cyrillic = Encoding.GetEncoding(1251);
                string   contents = File.ReadAllText(candidate, cyrillic);
                File.WriteAllText(candidate, contents, Encoding.UTF8);
            }
        }
Example #18
0
        /// <summary>
        /// Writes the tab's content to the file at <c>Tab.TabOriginalPathContent</c> using the
        /// encoding whose code page is stored in <c>Tab.TabEncoding</c>, then refreshes the tab's
        /// modified date. When the content was written with the ASCII code page but the charset
        /// detector sees it as UTF-8, a dialog offers to change the tab's encoding.
        /// The whole operation is dispatched through <c>DispatcherHelper.ExecuteOnUIThreadAsync</c>.
        /// </summary>
        /// <remarks>
        /// Any exception raised while writing leads to <c>CreateFile()</c> being called; if it
        /// reports success, <c>WriteFile()</c> is invoked again.
        /// </remarks>
        public async Task WriteFile()
        {
            await DispatcherHelper.ExecuteOnUIThreadAsync(async() =>
            {
                try
                {
                    StorageFile file = await StorageFile.GetFileFromPathAsync(Tab.TabOriginalPathContent);

                    if (file != null)
                    {
                        // Empty the file before the real content is written through the stream below.
                        await FileIO.WriteTextAsync(file, string.Empty);

                        Encoding TempEncoding = Encoding.GetEncoding(Tab.TabEncoding);

                        // UTF-8 without BOM requested: use an encoder that emits no preamble.
                        // NOTE(review): "==" compares Encoding references here - confirm that
                        // GetEncoding returns the same instance as Encoding.UTF8 on the target runtime.
                        if (TempEncoding == Encoding.UTF8 && !Tab.TabEncodingWithBOM)
                        {
                            TempEncoding = new UTF8Encoding(false);
                        }

                        string Content = await TabsAccessManager.GetTabContentViaIDAsync(new TabID {
                            ID_Tab = Tab.ID, ID_TabsList = ListTabsID
                        });

                        using (var rd = new StreamWriter(await file.OpenStreamForWriteAsync(), TempEncoding))
                        {
                            rd.Write(Content);
                            rd.Flush(); rd.Dispose();
                        }

                        //Update DateModified (updated push with "PushUpdateTabAsync" in StorageRouter.WriteFile())
                        BasicProperties properties = await file.GetBasicPropertiesAsync();
                        Tab.TabDateModified        = properties.DateModified.ToString();

                        // Only for ASCII-encoded tabs, and only if the user has not answered "never":
                        // check whether the content actually looks like UTF-8.
                        if (TempEncoding.CodePage == Encoding.ASCII.CodePage && Tab.TabEncodingReplacingRequest != EncodingReplacingRequest.Never)
                        {
                            // Serialize the content into memory (StreamWriter's default encoding) for the detector.
                            var stream = new MemoryStream();
                            var writer = new StreamWriter(stream);
                            writer.Write(Content);
                            writer.Flush();
                            stream.Position = 0;

                            using (MemoryStream str = stream)
                            {
                                var cdet = new Ude.CharsetDetector();
                                cdet.Reset();
                                cdet.Feed(str);
                                cdet.DataEnd();
                                if (cdet.Charset != null)
                                {
                                    if (Encoding.GetEncoding(cdet.Charset).CodePage == Encoding.UTF8.CodePage)
                                    {
                                        await DispatcherHelper.ExecuteOnUIThreadAsync(async() =>
                                        {
                                            // Three choices: accept the detected encoding, ask again later, or never ask.
                                            // Each handler records the choice on the tab and pushes the tab update.
                                            MessageDialog Dialog = new MessageDialog(new ResourceLoader().GetString("popup-changeencodingcontent"), string.Format(new ResourceLoader().GetString("popup-changeencodingtitle"), TempEncoding.EncodingName, cdet.Charset));
                                            Dialog.Commands.Add(new UICommand {
                                                Label = new ResourceLoader().GetString("popup-changeencodingaccept"), Invoked = async(e) => { Tab.TabEncoding = Encoding.GetEncoding(cdet.Charset).CodePage; Tab.TabEncodingWithBOM = false; Tab.TabEncodingReplacingRequest = EncodingReplacingRequest.NotRequested; await TabsWriteManager.PushUpdateTabAsync(Tab, ListTabsID, false); }
                                            });
                                            Dialog.Commands.Add(new UICommand {
                                                Label = new ResourceLoader().GetString("popup-changeencodinglater"), Invoked = async(e) => { Tab.TabEncodingReplacingRequest = EncodingReplacingRequest.MaybeLater; await TabsWriteManager.PushUpdateTabAsync(Tab, ListTabsID, false); }
                                            });
                                            Dialog.Commands.Add(new UICommand {
                                                Label = new ResourceLoader().GetString("popup-changeencodingno"), Invoked = async(e) => { Tab.TabEncodingReplacingRequest = EncodingReplacingRequest.Never; await TabsWriteManager.PushUpdateTabAsync(Tab, ListTabsID, false); }
                                            });
                                            await Dialog.ShowAsync();
                                        });
                                    }
                                }
                            }
                        }
                    }
                }
                catch
                {
                    // Any failure above: try to create the file, and retry the write if that succeeds.
                    // NOTE(review): the retry is recursive and unbounded if writing keeps failing
                    // after a successful CreateFile - confirm this cannot loop.
                    await CreateFile().ContinueWith(async(e) =>
                    {
                        if (e.Result)
                        {
                            await WriteFile();
                        }
                    });
                }
            });
        }
Example #19
0
        /// <summary>
        /// Detects the encoding of textual data of the specified input data.
        /// </summary>
        /// <param name="inputData">The input data.</param>
        /// <param name="start">Offset of the first byte to examine.</param>
        /// <param name="count">Number of bytes to examine, starting at <paramref name="start"/>.</param>
        /// <returns>Detected encoding name</returns>
        public string Detect(byte[] inputData, int start, int count)
        {
            if (Done)
                return EncodingName;
            if (!_started)
            {
                // First call: reset state and classify the data before running any detector.
                Reset ();
                _started = true;
                if (!CheckForTextualData (inputData, start, count))
                {
                    IsText = false;
                    Done = true;
                    return EncodingName;
                }
                HasByteOrderMark = CheckForByteOrderMark (inputData, start);
                IsText = true;
            }

            // execute charset detector
            ude.Feed (inputData, start, count);
            ude.DataEnd ();
            if (ude.IsDone () && !String.IsNullOrEmpty (ude.Charset))
            {
                Done = true;
                return EncodingName;
            }

            const int bufferSize = 4 * 1024;

            // singular buffer detection
            if (singleEncodings.Count < 2000)
            {
                var u = new Ude.CharsetDetector ();
                // count is a length (that is how ude.Feed above consumes it), so the range to
                // scan is [start, start + count). The loop previously used count as the end
                // index, which skipped the tail of the range whenever start was non-zero.
                int end = start + count;
                int step = count < bufferSize ? count : bufferSize;
                for (var i = start; i < end; i += step)
                {
                    u.Reset ();
                    if (i + step > end)
                        u.Feed (inputData, i, end - i);
                    else
                        u.Feed (inputData, i, step);
                    u.DataEnd ();
                    // Record only guesses with some confidence behind them.
                    if (u.Confidence > 0.3 && !String.IsNullOrEmpty (u.Charset))
                        singleEncodings.Add (u.Charset);
                }
            }
            return EncodingName;
        }
Example #20
0
        /// <summary>
        /// Detects encoding using mozilla universal character detector.
        /// </summary>
        /// <param name="bytes">sample data</param>
        /// <returns>
        /// Detected encoding, or null if nothing was detected or any exception was raised
        /// during detection.
        /// </returns>
        /// <history>
        /// [Curtis_Beard]		12/01/2014	Created
        /// </history>
        private static Encoding DetectEncodingUsingMozillaUCD(Byte[] bytes)
        {
            try
            {
                Ude.ICharsetDetector detector = new Ude.CharsetDetector();
                detector.Feed(bytes, 0, bytes.Length);
                detector.DataEnd();

                string charset = detector.Charset;
                return charset == null ? null : Encoding.GetEncoding(charset);
            }
            catch
            {
                // Any failure counts as "not detected".
                return null;
            }
        }
Example #21
0
        /// <summary>
        /// Detects the encoding of <paramref name="file"/> with the charset detector.
        /// </summary>
        /// <param name="file">Path of the file to inspect.</param>
        /// <returns>
        /// <c>defaultEncoding</c> when nothing is detected (a warning is logged); for UTF-8, a
        /// <see cref="UTF8Encoding"/> whose BOM flag reflects whether the file starts with
        /// EF BB BF; otherwise the encoding for the detected charset.
        /// </returns>
        protected virtual Encoding detectEncodingFromFile(string file)
        {
            using(FileStream input = File.OpenRead(file))
            {
                var detector = new Ude.CharsetDetector();
                detector.Feed(input);
                detector.DataEnd();

                string charset = detector.Charset;
                if(charset == null) {
                    Log.Warn("Problem with detection of encoding for '{0}'", file);
                    return defaultEncoding;
                }

                Log.Debug("Ude: charset '{0}' confidence: '{1}'", charset, detector.Confidence);
                Encoding resolved = Encoding.GetEncoding(charset);

                if(resolved != Encoding.UTF8) {
                    return resolved;
                }

                // UTF-8: rewind and look at the first three bytes to see whether a BOM is present.
                input.Seek(0, SeekOrigin.Begin);
                bool startsWithBom = input.ReadByte() == 0xEF
                                     && input.ReadByte() == 0xBB
                                     && input.ReadByte() == 0xBF;
                return new UTF8Encoding(startsWithBom);
            }
        }
Example #22
0
        /// <summary>
        /// Re-decodes a string with an auto-detected charset.
        /// </summary>
        /// <param name="str">Text to re-encode.</param>
        /// <param name="from">Encoding used to turn <paramref name="str"/> back into bytes.</param>
        /// <returns>
        /// The bytes decoded with the detected charset, or <paramref name="str"/> unchanged when it
        /// is null or empty, nothing is detected, or the detector's confidence is below 0.92.
        /// </returns>
        protected virtual string reEncodeString(string str, Encoding from)
        {
            if(String.IsNullOrEmpty(str)) {
                return str;
            }

            byte[] raw   = from.GetBytes(str);
            var detector = new Ude.CharsetDetector();
            detector.Feed(raw, 0, raw.Length);
            detector.DataEnd();

            string charset = detector.Charset;
            if(charset == null) {
                Log.Debug("reEncodeString: Problem with detection... use the original");
                return str;
            }

            float detected = detector.Confidence;
            Log.Debug("reEncodeString: charset '{0}' confidence: '{1}'", charset, detected);

            // Keep the original text unless the detector is sure enough.
            if(detected < 0.92f) {
                Log.Debug("reEncodeString: Confidence < 0.92 /use the original");
                return str;
            }

            Encoding target = Encoding.GetEncoding(charset);
            Log.Debug("reEncodeString: '{0}' -> '{1}'", from.EncodingName, target.EncodingName);
            Log.Trace("reEncodeString: original - '{0}'", str);
            return target.GetString(raw);
        }
Example #23
0
        /// <summary>
        /// Adds a document from a BLOB. The first bytes of <paramref name="stream"/> are sniffed
        /// to decide whether it is a PDF, an HTML page or plain text; the resulting document,
        /// if any, is enqueued on <paramref name="queue"/>.
        /// </summary>
        /// <param name="name">Name of the record.</param>
        /// <param name="stream">Seekable stream holding the document bytes.</param>
        /// <param name="fields">Optional header fields attached to the document; may be null.</param>
        /// <param name="queue">Queue that receives the created document.</param>
        /// <exception cref="InvalidDataException">The index is not of type <c>IndexType.Blob</c>.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
        public virtual void AddRecord(string name, Stream stream, string fields, ConcurrentQueue <IIndexDocument> queue)
        {
            IIndexDocument doc = null;

            AddRecordBase(name, fields);

            if (indexType != IndexType.Blob)
            {
                throw new InvalidDataException("Adding record of wrong IndexType");
            }

            if (stream == null)
            {
                throw new ArgumentNullException("stream");
            }

            // Sniff up to 4000 bytes. The stream is read directly: wrapping it in a BinaryReader
            // and disposing that reader closed the stream before the branches below could use it.
            // Stream.Read may return fewer bytes than asked for, so loop until full or exhausted.
            byte[] buff   = new byte[4000];
            int    filled = 0;
            while (filled < buff.Length)
            {
                int read = stream.Read(buff, filled, buff.Length - filled);
                if (read <= 0)
                {
                    break;
                }
                filled += read;
            }

            // Only the bytes actually read are decoded, not the zero padding after them.
            String det = Encoding.UTF8.GetString(buff, 0, filled);

            stream.Seek(0, SeekOrigin.Begin);

            // detect type
            if ((buff[0] == '%') && (buff[1] == 'P') && (buff[2] == 'D') && (buff[3] == 'F'))
            {
                DocumentsDataSource.IndexPDFDocument pdf = new DocumentsDataSource.IndexPDFDocument(name, stream, this);
                if (fields != null)
                {
                    pdf.headers = () => { return(fields); };
                }
                doc = pdf;
            }
            else if (det.Contains("<html"))
            {
                IndexPagedTextFile file = WebDataSource.FromHtml(stream, name, Name);
                if (fields != null)
                {
                    file.SetHeaders(fields);
                }
                // NOTE(review): the HTML document was previously built and then dropped, so it
                // was never enqueued; it is now handed on like the other document kinds.
                doc = file;
            }
            else
            {
                // detect charset
                Ude.CharsetDetector detector = new Ude.CharsetDetector();
                detector.Feed(buff, 0, filled);
                detector.DataEnd();
                if (detector.Charset != null)
                {
                    Encoding enc = Portable.Text.Encoding.GetEncoding(detector.Charset);
                    using (StreamReader sreader = new StreamReader(stream, enc, false)) {
                        doc = new IndexPagedTextFile("", sreader.ReadToEnd(), fields != null ? fields : "");
                    }
                }
            }

            if (doc != null)
            {
                Enqueue(queue, doc);
            }
        }
Example #24
0
        /// <summary>
        /// Rewrites a file in place as UTF-8, decoding it with the charset Ude detects.
        /// The file is left untouched when no charset is detected.
        /// </summary>
        /// <param name="fileName">Path of the file to convert.</param>
        private void ConvertToUtf8(string fileName)
        {
            string detectedCharset;
            using (FileStream input = File.OpenRead(fileName))
            {
                var detector = new Ude.CharsetDetector();
                detector.Feed(input);
                detector.DataEnd();
                detectedCharset = detector.Charset;
            }

            if (detectedCharset == null)
            {
                return;
            }

            // Read everything before reopening the same path for writing.
            string content;
            using (var source = new StreamReader(fileName, Encoding.GetEncoding(detectedCharset)))
            {
                content = source.ReadToEnd();
            }

            using (var target = new StreamWriter(fileName, false, Encoding.UTF8))
            {
                if (content.Length != 0)
                {
                    target.Write(content);
                }
            }
        }
        /// <summary>
        /// Detects the encoding of a file with Ude.
        /// </summary>
        /// <param name="file">Path of the file to inspect.</param>
        /// <returns>
        /// The encoding named by the detected charset, or UTF-8 (after logging a warning)
        /// when Ude reports no charset.
        /// </returns>
        protected virtual Encoding detectEncodingFromFile(string file)
        {
            using(FileStream input = File.OpenRead(file))
            {
                var detector = new Ude.CharsetDetector();
                detector.Feed(input);
                detector.DataEnd();

                if(detector.Charset != null) {
                    Log.Debug("Ude: charset '{0}' confidence: '{1}'", detector.Charset, detector.Confidence);
                    return Encoding.GetEncoding(detector.Charset);
                }

                // No charset reported: warn and fall back to UTF-8.
                Log.Warn("Problem with detection of encoding for '{0}'", file);
                return Encoding.UTF8;
            }
        }
 /// <summary>
 /// Detects the charset of the CSV file at <paramref name="path"/> plus
 /// DataLoaderConstants.FileExtCsv and stores the matching encoding in <c>enc</c>.
 /// </summary>
 /// <remarks>
 /// UTF-8, UTF-16LE and UTF-16BE map to their framework encodings; every other result,
 /// including a failed detection, selects Encoding.Default.
 /// </remarks>
 /// <param name="path">File path without the CSV extension.</param>
 private void GetEncoding(string path)
 {
     string charset;
     using (FileStream csv = File.OpenRead(path + DataLoaderConstants.FileExtCsv))
     {
         var detector = new Ude.CharsetDetector();
         detector.Feed(csv);
         detector.DataEnd();
         charset = detector.Charset;
     }

     // Detection failed (or literally reported "failed"): use the system default.
     if (charset == null || charset == "failed")
     {
         enc = Encoding.Default;
         return;
     }

     string key = charset.ToLower();
     if (key == "utf-8")
     {
         enc = Encoding.UTF8;
     }
     else if (key == "utf-16le")
     {
         enc = Encoding.Unicode;
     }
     else if (key == "utf-16be")
     {
         enc = Encoding.BigEndianUnicode;
     }
     else
     {
         // windows-1252 and anything unrecognized
         enc = Encoding.Default;
     }
 }
Example #27
0
        /// <summary>
        /// Scans every *.txt file below <paramref name="di"/> (recursively), registers each file
        /// and each distinct lower-cased word it contains, tracks per-file word occurrences in
        /// <c>lstOccurence</c>, and finally sends all occurrences to the database.
        /// </summary>
        /// <param name="di">Root directory to scan.</param>
        public WordOnTxt(DirectoryInfo di)
        {
            // Characters that separate words; built once instead of once per file.
            Char[] delimiter = new char[] { '[', ']', '#', '^', '¦', '|', '£', '<', '>', '_', '$', '\n', '\r', '.', ' ', ',', '\'', '!', '?', '(', ')', '%', '&', '"', '=', '+', '{', '}', '*', ';', ':', '\\', '-', '/' };

            foreach (var fi in di.GetFiles("*.txt", SearchOption.AllDirectories))
            {
                K_Google.AddFile(fi);

                // Reset the fallback for every file so a failed detection does not silently
                // reuse the charset detected for the previous file.
                string _encoding = "1252";

                using (FileStream fs = File.OpenRead(fi.FullName))
                {
                    Ude.CharsetDetector cdet = new Ude.CharsetDetector();
                    cdet.Feed(fs);
                    cdet.DataEnd();
                    if (cdet.Charset != null)
                    {
                        _encoding = cdet.Charset;
                    }
                    else
                    {
                        Console.WriteLine("Detection failed.");
                    }
                }

                // FullName avoids hand-building the path with a hard-coded backslash.
                string path = fi.FullName;
                Console.WriteLine(path);

                string polpi;
                using (StreamReader reader = new StreamReader(path, Encoding.GetEncoding(_encoding)))
                {
                    polpi = reader.ReadToEnd();
                }
                polpi = polpi.ToLower();

                // Keep only non-empty tokens, in sorted order.
                List <string> lstSub = new List <string>();
                foreach (string word in polpi.Split(delimiter))
                {
                    if (word.Length > 0)
                    {
                        lstSub.Add(word);
                    }
                }
                lstSub.Sort();

                // Register each distinct word once, in sorted order.
                HashSet <string> registered = new HashSet <string>();
                foreach (var substring in lstSub)
                {
                    if (registered.Add(substring))
                    {
                        AddWord(substring);
                    }
                }

                // First sighting of a word in this file creates an occurrence entry;
                // later sightings bump every stored occurrence carrying that word.
                HashSet <string> counted = new HashSet <string>();
                foreach (var substring in lstSub)
                {
                    if (!counted.Contains(substring))
                    {
                        Ocurrence ocucu = new Ocurrence(fi, substring, null);
                        if (!lstOccurence.Contains(ocucu))
                        {
                            lstOccurence.Add(ocucu);
                            counted.Add(substring);
                        }
                    }
                    else
                    {
                        foreach (Ocurrence occurence in lstOccurence)
                        {
                            if (occurence.Word == substring)
                            {
                                occurence.IncreamentOccurence();
                            }
                        }
                    }
                }
            }

            foreach (Ocurrence ocu in lstOccurence)
            {
                ocu.SendToDataBase();
            }
        }
Example #28
0
 /// <summary>
 /// Reads the file once and reports its charset, code page, BOM details and decoded text.
 /// </summary>
 /// <param name="file">Path of the file to inspect.</param>
 /// <returns>
 /// The gathered info; a freshly constructed <see cref="EncodingFileInfo"/> when the file
 /// does not exist or any exception is raised while inspecting it.
 /// </returns>
 public static EncodingFileInfo GetEncodingFileInfo(String file)
 {
     EncodingFileInfo result = new EncodingFileInfo();
     try
     {
         if (!File.Exists(file))
         {
             return result;
         }

         Byte[] data = File.ReadAllBytes(file);
         Int32 size = data.Length;
         Encoding fromBom = null;
         Int32 bomSize = 0;

         // "+/v" followed by one of 8, 9, + or / is the UTF-7 signature.
         Boolean utf7Mark = size > 3
             && data[0] == 0x2b && data[1] == 0x2f && data[2] == 0x76
             && (data[3] == 0x38 || data[3] == 0x39 || data[3] == 0x2b || data[3] == 0x2f);

         if (size > 2 && data[0] == 0xef && data[1] == 0xbb && data[2] == 0xbf)
         {
             fromBom = Encoding.UTF8;
             bomSize = 3;
         }
         else if (size > 3 && data[0] == 0xff && data[1] == 0xfe && data[2] == 0x00 && data[3] == 0x00)
         {
             // Must be tested before UTF-16 LE, which shares the first two bytes.
             fromBom = Encoding.UTF32;
             bomSize = 4;
         }
         else if (utf7Mark)
         {
             // A trailing '-' belongs to the signature when present.
             fromBom = Encoding.UTF7;
             bomSize = (size > 4 && data[4] == 0x2D) ? 5 : 4;
         }
         else if (size > 1 && data[0] == 0xff && data[1] == 0xfe)
         {
             fromBom = Encoding.Unicode;
             bomSize = 2;
         }
         else if (size > 1 && data[0] == 0xfe && data[1] == 0xff)
         {
             fromBom = Encoding.BigEndianUnicode;
             bomSize = 2;
         }

         if (fromBom != null)
         {
             result.BomLength = bomSize;
             result.ContainsBOM = true;
             result.Charset = fromBom.WebName;
             result.CodePage = fromBom.CodePage;
         }
         else
         {
             // No BOM: accept UTF-8 if the bytes validate, otherwise ask Ude,
             // and fall back to the system default when Ude has no answer.
             Encoding guessed;
             if (!ContainsInvalidUTF8Bytes(data))
             {
                 guessed = Encoding.UTF8;
             }
             else
             {
                 Ude.CharsetDetector detector = new Ude.CharsetDetector();
                 detector.Feed(data, 0, size);
                 detector.DataEnd();
                 guessed = detector.Charset != null
                     ? Encoding.GetEncoding(detector.Charset)
                     : Encoding.Default;
             }
             result.Charset = guessed.WebName;
             result.CodePage = guessed.CodePage;
         }

         // Decode whatever follows the BOM, if anything.
         if (size > bomSize)
         {
             Encoding decoder = Encoding.GetEncoding(result.CodePage);
             result.Contents = decoder.GetString(data, bomSize, size - bomSize);
         }
     }
     catch (Exception)
     {
         result = new EncodingFileInfo();
     }
     return result;
 }
        /// <summary>
        /// Detects the encoding of textual data of the specified input data.
        /// </summary>
        /// <remarks>
        /// On the first call the detector state is reset and the data is checked for being
        /// textual and for a byte order mark. The whole range is then fed to the shared Ude
        /// detector; if that gives no final answer, the range is re-scanned in chunks of up
        /// to 4 KiB and every chunk result with confidence above 0.3 votes for its charset.
        /// </remarks>
        /// <param name="inputData">The input data.</param>
        /// <param name="start">Index of the first byte to examine.</param>
        /// <param name="count">Number of bytes to examine, starting at <paramref name="start"/>.</param>
        /// <returns>Detected encoding name</returns>
        public string Detect (byte[] inputData, int start, int count)
        {
            if (Done)
                return EncodingName;
            if (!_started)
            {
                Reset ();
                _started = true;
                if (!CheckForTextualData (inputData, start, count))
                {
                    IsText = false;
                    Done = true;
                    return EncodingName;
                }
                HasByteOrderMark = CheckForByteOrderMark (inputData, start);
                IsText = true;
            }

            // execute charset detector
            ude.Feed (inputData, start, count);
            ude.DataEnd ();
            if (ude.IsDone () && !String.IsNullOrEmpty (ude.Charset))
            {
                IncrementFrequency (ude.Charset);
                Done = true;
                return EncodingName;
            }

            // singular buffer detection
            var singleUde = new Ude.CharsetDetector ();
            const int udeFeedSize = 4 * 1024;
            // count is a length (as passed to Feed above), so the scanned range is
            // [start, start + count). The previous loop used count as the end index and
            // scanned the wrong range whenever start was non-zero.
            int end = start + count;
            int step = count < udeFeedSize ? count : udeFeedSize;
            for (var pos = start; pos < end; pos += step)
            {
                int remaining = end - pos;
                singleUde.Reset ();
                singleUde.Feed (inputData, pos, remaining < step ? remaining : step);
                singleUde.DataEnd ();
                // update encoding frequency
                if (singleUde.Confidence > 0.3 && !String.IsNullOrEmpty (singleUde.Charset))
                    IncrementFrequency (singleUde.Charset);
            }
            // vote for best encoding
            EncodingName = GetCurrentEncoding ();
            // update current encoding name
            return EncodingName;
        }
 /// <summary>
 /// Detects the charset of a file with Ude and maps it to a framework encoding.
 /// </summary>
 /// <param name="path">Path of the file to inspect.</param>
 /// <returns>
 /// UTF-8, UTF-16LE or UTF-16BE when detected; Encoding.Default for every other
 /// result, including a failed detection.
 /// </returns>
 private static Encoding GetEncoding(string path)
 {
     string charset;
     using (FileStream input = File.OpenRead(path))
     {
         var detector = new Ude.CharsetDetector();
         detector.Feed(input);
         detector.DataEnd();
         charset = detector.Charset;
     }

     // Detection failed (or literally reported "failed"): use the system default.
     if (charset == null || charset == "failed")
     {
         return Encoding.Default;
     }

     string key = charset.ToLower();
     if (key == "utf-8")
     {
         return Encoding.UTF8;
     }
     if (key == "utf-16le")
     {
         return Encoding.Unicode;
     }
     if (key == "utf-16be")
     {
         return Encoding.BigEndianUnicode;
     }

     // windows-1252 and anything unrecognized
     return Encoding.Default;
 }
Example #31
0
        /// <summary>
        /// Converts the files below the input directory to the requested output encoding.
        /// </summary>
        /// <remarks>
        /// Files are selected by the extension white list ("*" selects all) minus the black
        /// list. Each selected file has its encoding detected with Ude and, when it differs
        /// from the output encoding, is re-encoded to the same relative path under the output
        /// directory (which defaults to the input directory).
        /// </remarks>
        /// <param name="options">Directories, output encoding name, extension filters and logging switch.</param>
        public Action(Options options)
        {
            OpenOrCreateDirectory(options.InputDirectory, false);
            if (string.IsNullOrEmpty(options.OutputDirectory))
            {
                options.OutputDirectory = options.InputDirectory;
            }
            OpenOrCreateDirectory(options.OutputDirectory, true);
            string inputDirectory  = new DirectoryInfo(options.InputDirectory).FullName;
            string outputDirectory = new DirectoryInfo(options.OutputDirectory).FullName;

            Encoding outputEncoding = Encoding.GetEncoding(options.OutputEncoding);

            string[]         allFiles           = Directory.GetFiles(options.InputDirectory, "*", SearchOption.AllDirectories);
            HashSet <string> extensionWhiteList = options.ExtensionWhiteList == null
                ? new HashSet <string>()
                : new HashSet <string>(options.ExtensionWhiteList);
            bool             allExtension       = extensionWhiteList.Contains("*");
            HashSet <string> extensionBlackList = options.ExtensionBlackList == null
                ? new HashSet <string>()
                : new HashSet <string>(options.ExtensionBlackList);
            List <FileInfo> inputFiles = new List <FileInfo>();

            foreach (string candidate in allFiles)
            {
                FileInfo candidateInfo = new FileInfo(candidate);
                if ((allExtension || extensionWhiteList.Contains(candidateInfo.Extension)) &&
                    !extensionBlackList.Contains(candidateInfo.Extension))
                {
                    inputFiles.Add(candidateInfo);
                }
                else if (options.OutputLog)
                {
                    Console.WriteLine("Ignore file: " + candidateInfo.FullName);
                }
            }

            int convertedFileCount = 0;

            foreach (FileInfo iterFile in inputFiles)
            {
                Encoding inputEncoding = null;
                using (FileStream fs = File.OpenRead(iterFile.FullName))
                {
                    Ude.CharsetDetector cdet = new Ude.CharsetDetector();
                    cdet.Feed(fs);
                    cdet.DataEnd();
                    if (cdet.Charset != null)
                    {
                        inputEncoding = Encoding.GetEncoding(cdet.Charset);
                    }
                    else if (options.OutputLog)
                    {
                        Console.WriteLine(string.Format("{0} Cant detector file encoding", iterFile.FullName));
                    }
                }

                if (inputEncoding == null)
                {
                    // Detection failure was already reported above; do not also claim
                    // that the file is in the output encoding.
                    continue;
                }

                // Value equality: two Encoding instances for the same code page need not
                // be the same object.
                if (!inputEncoding.Equals(outputEncoding))
                {
                    // Swap only the leading directory prefix; String.Replace would also rewrite
                    // any later occurrence of the input directory text inside the path.
                    string sourcePath = iterFile.FullName;
                    string targetPath = sourcePath.StartsWith(inputDirectory, StringComparison.Ordinal)
                        ? outputDirectory + sourcePath.Substring(inputDirectory.Length)
                        : sourcePath.Replace(inputDirectory, outputDirectory);

                    // Sub-directories do not exist yet when writing to a separate output tree.
                    string targetDirectory = Path.GetDirectoryName(targetPath);
                    if (!string.IsNullOrEmpty(targetDirectory))
                    {
                        Directory.CreateDirectory(targetDirectory);
                    }

                    byte[] converted = Encoding.Convert(inputEncoding, outputEncoding, File.ReadAllBytes(sourcePath));
                    File.WriteAllBytes(targetPath, converted);
                    convertedFileCount++;
                    Console.WriteLine("Converted File: " + iterFile.FullName);
                }
                else if (options.OutputLog)
                {
                    Console.WriteLine(string.Format("File encoding already is ({0}): {1}", options.OutputEncoding, iterFile.FullName));
                }
            }
            Console.WriteLine(string.Format("Converted {0} files", convertedFileCount));
        }
Example #32
0
        /// <summary>
        ///     Wraps the given <paramref name="inputFile"/> in HTML pre tags
        /// </summary>
        /// <param name="inputFile">The input file</param>
        /// <param name="encoding">The encoding used in the input file; when <c>null</c> the encoding
        /// is detected with Ude, falling back to <see cref="Encoding.Default"/> when detection fails</param>
        /// <returns>The path of the temporary file that contains the wrapped HTML</returns>
        public string WrapFile(string inputFile, Encoding encoding)
        {
            var temp     = Path.GetFileName(inputFile) ?? string.Empty;
            var title    = WebUtility.HtmlEncode(temp);
            var tempFile = GetTempFile;

            WriteToLog($"Reading text file '{inputFile}'");

            if (encoding == null)
            {
                var charsetDetector = new Ude.CharsetDetector();
                using (var fileStream = File.OpenRead(inputFile))
                {
                    WriteToLog("Trying to detect encoding");
                    charsetDetector.Feed(fileStream);
                    charsetDetector.DataEnd();
                    if (charsetDetector.Charset != null)
                    {
                        try
                        {
                            encoding = Encoding.GetEncoding(charsetDetector.Charset);
                        }
                        catch
                        {
                            // Charset name unknown to the framework: best-effort fallback.
                            Console.WriteLine("Detection failed assuming standard encoding");
                            encoding = Encoding.Default;
                        }
                    }
                    else
                    {
                        Console.WriteLine("Detection failed assuming standard encoding");
                        encoding = Encoding.Default;
                    }
                }
            }

            // UTF-16 in the machine's native byte order, with a BOM.
            var writeEncoding = new UnicodeEncoding(!BitConverter.IsLittleEndian, true);

            // The reader is owned by a using block from the moment it is created, so it is
            // released even when creating the writer below throws.
            using (var streamReader = new StreamReader(inputFile, encoding))
            {
                WriteToLog($"File is '{streamReader.CurrentEncoding.WebName}' encoded");

                using (var writer = new StreamWriter(tempFile, false, writeEncoding))
                {
                    writer.WriteLine("<html>");
                    writer.WriteLine("<head>");
                    writer.WriteLine($"   <meta charset=\"{writeEncoding.WebName}\">");
                    writer.WriteLine($"<title>{title}</title>");
                    writer.WriteLine("<style>");
                    writer.WriteLine("  pre {");
                    writer.WriteLine($"  white-space: { WhiteSpace };");
                    if (!string.IsNullOrWhiteSpace(FontFamily))
                    {
                        writer.WriteLine($"  font-family: { FontFamily };");
                    }
                    // Each rule is guarded by its own property; the style and size rules used to
                    // test FontFamily. NOTE(review): assumes FontStyle and FontSize are strings
                    // like FontFamily - confirm against their declarations.
                    if (!string.IsNullOrWhiteSpace(FontStyle))
                    {
                        writer.WriteLine($"  font-style: { FontStyle };");
                    }
                    if (!string.IsNullOrWhiteSpace(FontSize))
                    {
                        writer.WriteLine($"  font-size: { FontSize };");
                    }
                    writer.WriteLine("  }");
                    writer.WriteLine("</style>");
                    writer.WriteLine("</head>");
                    writer.WriteLine("<body>");
                    writer.WriteLine("<pre>");

                    string line;
                    while ((line = streamReader.ReadLine()) != null)
                    {
                        writer.WriteLine(HttpUtility.HtmlEncode(line));
                    }

                    writer.WriteLine("</pre>");
                    writer.WriteLine("</body>");
                    writer.WriteLine("</html>");
                }
            }

            WriteToLog($"File pre wrapped and written to temporary file '{tempFile}'");

            return(tempFile);
        }
Example #33
0
        /// <summary>
        /// Collects charset, code page, BOM information and decoded contents of a file in one read.
        /// </summary>
        /// <param name="file">Path of the file to inspect.</param>
        /// <returns>
        /// The collected info; a freshly constructed <see cref="EncodingFileInfo"/> when the
        /// file does not exist or any exception occurs during inspection.
        /// </returns>
        public static EncodingFileInfo GetEncodingFileInfo(String file)
        {
            EncodingFileInfo info = new EncodingFileInfo();

            try
            {
                if (!File.Exists(file))
                {
                    return(info);
                }

                Byte[]   bytes    = File.ReadAllBytes(file);
                Encoding marked   = null;
                Int32    skip     = 0;

                if (HasBytePrefix(bytes, 0xef, 0xbb, 0xbf))
                {
                    marked = Encoding.UTF8;
                    skip   = 3;
                }
                else if (HasBytePrefix(bytes, 0xff, 0xfe, 0x00, 0x00))
                {
                    // Checked ahead of UTF-16 LE because both start with FF FE.
                    marked = Encoding.UTF32;
                    skip   = 4;
                }
                else if (HasBytePrefix(bytes, 0x2b, 0x2f, 0x76) && bytes.Length > 3 &&
                         (bytes[3] == 0x38 || bytes[3] == 0x39 || bytes[3] == 0x2b || bytes[3] == 0x2f))
                {
                    // UTF-7 signature; an immediately following '-' is counted as part of it.
                    marked = Encoding.UTF7;
                    skip   = (bytes.Length > 4 && bytes[4] == 0x2D) ? 5 : 4;
                }
                else if (HasBytePrefix(bytes, 0xff, 0xfe))
                {
                    marked = Encoding.Unicode;
                    skip   = 2;
                }
                else if (HasBytePrefix(bytes, 0xfe, 0xff))
                {
                    marked = Encoding.BigEndianUnicode;
                    skip   = 2;
                }

                if (marked != null)
                {
                    info.BomLength   = skip;
                    info.ContainsBOM = true;
                    info.Charset     = marked.WebName;
                    info.CodePage    = marked.CodePage;
                }
                else if (!ContainsInvalidUTF8Bytes(bytes))
                {
                    info.Charset  = Encoding.UTF8.WebName;
                    info.CodePage = Encoding.UTF8.CodePage;
                }
                else
                {
                    // Not valid UTF-8: ask Ude, defaulting to the system encoding.
                    Ude.CharsetDetector detector = new Ude.CharsetDetector();
                    detector.Feed(bytes, 0, bytes.Length);
                    detector.DataEnd();

                    Encoding fallback = detector.Charset != null
                        ? Encoding.GetEncoding(detector.Charset)
                        : Encoding.Default;
                    info.Charset  = fallback.WebName;
                    info.CodePage = fallback.CodePage;
                }

                // Decode the payload that follows the BOM, when there is any.
                if (bytes.Length > skip)
                {
                    info.Contents = Encoding.GetEncoding(info.CodePage).GetString(bytes, skip, bytes.Length - skip);
                }
            }
            catch (Exception)
            {
                info = new EncodingFileInfo();
            }
            return(info);
        }

        /// <summary>
        /// Tells whether <paramref name="data"/> begins with the given byte sequence.
        /// </summary>
        private static Boolean HasBytePrefix(Byte[] data, params Byte[] prefix)
        {
            if (data.Length < prefix.Length)
            {
                return false;
            }
            for (Int32 i = 0; i < prefix.Length; i++)
            {
                if (data[i] != prefix[i])
                {
                    return false;
                }
            }
            return true;
        }