예제 #1
0
        /// <summary>
        /// Opens a subtitle as a stream, transcoding it to UTF-8 when a charset can be detected.
        /// </summary>
        /// <param name="path">Location of the subtitle; passed to <c>GetBytes</c> and <c>File.OpenRead</c>.</param>
        /// <param name="language">Not used by this method.</param>
        /// <param name="protocol">Protocol forwarded to <c>GetBytes</c>.</param>
        /// <param name="requiresCharset">When <c>false</c>, the file is opened directly without detection.</param>
        /// <param name="cancellationToken">Token forwarded to <c>GetBytes</c>.</param>
        /// <returns>
        /// An in-memory UTF-8 stream when a charset was detected, otherwise the stream returned by
        /// <c>File.OpenRead</c> for <paramref name="path"/>.
        /// </returns>
        private async Task<Stream> GetSubtitleStream(string path, string language, MediaProtocol protocol, bool requiresCharset, CancellationToken cancellationToken)
        {
            if (!requiresCharset)
            {
                return File.OpenRead(path);
            }

            var rawBytes     = await GetBytes(path, protocol, cancellationToken).ConfigureAwait(false);
            var detectedName = CharsetDetector.DetectFromBytes(rawBytes).Detected?.EncodingName;

            _logger.LogDebug("charset {CharSet} detected for {Path}", detectedName ?? "null", path);

            if (string.IsNullOrEmpty(detectedName))
            {
                // Nothing usable was detected: fall back to the raw file.
                return File.OpenRead(path);
            }

            // Register the code-page provider so that legacy charsets can be resolved by name.
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

            using (var source = new MemoryStream(rawBytes))
            using (var decoder = new StreamReader(source, Encoding.GetEncoding(detectedName)))
            {
                var decodedText = await decoder.ReadToEndAsync().ConfigureAwait(false);
                var utf8Bytes   = Encoding.UTF8.GetBytes(decodedText);

                return new MemoryStream(utf8Bytes);
            }
        }
예제 #2
0
        /// <summary>
        /// Asserts that a buffer starting with FF FE 00 00 is reported as
        /// <c>Charsets.UTF32_LE</c> with a confidence of 1.0.
        /// </summary>
        public void TestBomUTF32_LE()
        {
            var input    = new byte[] { 0xFF, 0xFE, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00 };
            var detected = CharsetDetector.DetectFromBytes(input).Detected;

            Assert.AreEqual(Charsets.UTF32_LE, detected.EncodingName);
            Assert.AreEqual(1.0f, detected.Confidence);
        }
예제 #3
0
        /// <summary>
        /// Asserts that the digits-only text "1234567890" is reported as
        /// <c>Charsets.ASCII</c> with a confidence of 1.0.
        /// </summary>
        public void TestOutOfRange2()
        {
            var input    = Encoding.UTF8.GetBytes("1234567890");
            var detected = CharsetDetector.DetectFromBytes(input).Detected;

            Assert.AreEqual(Charsets.ASCII, detected.EncodingName);
            Assert.AreEqual(1.0f, detected.Confidence);
        }
예제 #4
0
        /// <summary>
        /// Asserts that the single-character text "3" is reported as
        /// <c>CodepageName.ASCII</c> with a confidence of 1.0.
        /// </summary>
        public void TestIssue3()
        {
            var input    = Encoding.UTF8.GetBytes("3");
            var detected = CharsetDetector.DetectFromBytes(input).Detected;

            Assert.AreEqual(CodepageName.ASCII, detected.EncodingName);
            Assert.AreEqual(1.0f, detected.Confidence);
        }
예제 #5
0
        /// <summary>
        /// Asserts that the supplied buffer is reported as <c>CodepageName.UTF7</c>
        /// with a confidence of 1.0.
        /// </summary>
        /// <param name="bufferBytes">Bytes handed to the detector.</param>
        public void TestCaseBomUtf7(byte[] bufferBytes)
        {
            var detection = CharsetDetector.DetectFromBytes(bufferBytes);
            var detected  = detection.Detected;

            Assert.AreEqual(CodepageName.UTF7, detected.EncodingName);
            Assert.AreEqual(1.0f, detected.Confidence);
        }
예제 #6
0
        /// <summary>
        /// Asserts that the single-character text "3" is reported as
        /// <c>Charsets.ASCII</c> with a confidence of 1.0.
        /// </summary>
        public void TestSingleChar()
        {
            byte[] buf    = Encoding.UTF8.GetBytes("3");
            var    result = CharsetDetector.DetectFromBytes(buf);

            Assert.AreEqual(Charsets.ASCII, result.Detected.EncodingName);

            // Use a float literal, as the sibling tests do, so the expected value has the
            // same type as the confidence and no int/float overload mixing is involved.
            Assert.AreEqual(1.0f, result.Detected.Confidence);
        }
예제 #7
0
        /// <summary>
        /// Asserts that the two-byte buffer FF FE is reported as
        /// <c>CodepageName.UTF16_LE</c> with a confidence of 1.0.
        /// </summary>
        public void Test2byteArrayBomUTF16_LE()
        {
            var input    = new byte[] { 0xFF, 0xFE };
            var detected = CharsetDetector.DetectFromBytes(input).Detected;

            Assert.AreEqual(CodepageName.UTF16_LE, detected.EncodingName);
            Assert.AreEqual(1.0f, detected.Confidence);
        }
예제 #8
0
        /// <summary>
        /// Asserts that a buffer starting with EF BB BF is reported as "UTF-8"
        /// with a confidence of 1.0.
        /// </summary>
        public void TestBomUtf8()
        {
            var input    = new byte[] { 0xEF, 0xBB, 0xBF, 0x68, 0x65, 0x6C, 0x6C, 0x6F, 0x21 };
            var detected = CharsetDetector.DetectFromBytes(input).Detected;

            Assert.AreEqual("UTF-8", detected.EncodingName);
            Assert.AreEqual(1.0f, detected.Confidence);
        }
예제 #9
0
        /// <summary>
        /// Reads roughly the first 5 MB of a file on a <see cref="Task.Run(Action)"/> task,
        /// detects its encoding, builds a <c>TextDocument</c> from it and publishes the result
        /// through <c>Dispatcher.BeginInvoke</c>. The work is abandoned as soon as
        /// <c>_disposed</c> is observed to be set.
        /// </summary>
        /// <param name="path">Path of the file to load.</param>
        private void LoadFileAsync(string path)
        {
            Task.Run(() =>
            {
                const int maxLength = 5 * 1024 * 1024;
                bool   tooLong;
                byte[] bufferCopy;

                // "using" guarantees the memory stream is released on every exit path,
                // including the early return below.
                using (var buffer = new MemoryStream())
                {
                    using (var s = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                    {
                        tooLong = s.Length > maxLength;

                        // One scratch buffer reused for every chunk instead of a new array per iteration.
                        var lb = new byte[8192];

                        while (s.Position < s.Length && buffer.Length < maxLength)
                        {
                            if (_disposed)
                            {
                                break;
                            }

                            var count = s.Read(lb, 0, lb.Length);
                            if (count == 0)
                            {
                                // The file is opened with FileShare.ReadWrite, so it may change
                                // underneath us; a zero-byte read means there is nothing more to
                                // get and must end the loop, otherwise it would never terminate.
                                break;
                            }

                            buffer.Write(lb, 0, count);
                        }
                    }

                    if (_disposed)
                    {
                        return;
                    }

                    if (tooLong)
                    {
                        _context.Title += " (0 ~ 5MB)";
                    }

                    bufferCopy = buffer.ToArray();
                }

                var encoding = CharsetDetector.DetectFromBytes(bufferCopy).Detected?.Encoding ??
                               Encoding.Default;

                var doc = new TextDocument(encoding.GetString(bufferCopy));
                doc.SetOwnerThread(Dispatcher.Thread);

                if (_disposed)
                {
                    return;
                }

                Dispatcher.BeginInvoke(new Action(() =>
                {
                    Encoding           = encoding;
                    SyntaxHighlighting = HighlightingManager.Instance.GetDefinitionByExtension(Path.GetExtension(path));
                    Document           = doc;

                    _context.IsBusy = false;
                }), DispatcherPriority.Render);
            });
        }
예제 #10
0
        /// <summary>
        /// Asserts that the buffer 84 31 95 33 is reported as <c>CodepageName.GB18030</c>
        /// with a confidence of 1.0.
        /// </summary>
        public void TestBomGb18030()
        {
            byte[] input     = { 0x84, 0x31, 0x95, 0x33 };
            var    detection = CharsetDetector.DetectFromBytes(input);
            var    detected  = detection.Detected;

            Assert.AreEqual(CodepageName.GB18030, detected.EncodingName);
            Assert.AreEqual(1.0f, detected.Confidence);
        }
예제 #11
0
        /// <summary>
        /// Asserts that a buffer starting with FE FF 00 00 is reported as
        /// <c>CodepageName.X_ISO_10646_UCS_4_3412</c> with a confidence of 1.0.
        /// </summary>
        public void TestBomX_ISO_10646_UCS_4_3412()
        {
            var input    = new byte[] { 0xFE, 0xFF, 0x00, 0x00, 0x65 };
            var detected = CharsetDetector.DetectFromBytes(input).Detected;

            Assert.AreEqual(CodepageName.X_ISO_10646_UCS_4_3412, detected.EncodingName);
            Assert.AreEqual(1.0f, detected.Confidence);
        }
예제 #12
0
        /// <summary>
        /// Asserts that a buffer starting with 00 00 FF FE is reported as
        /// "X-ISO-10646-UCS-4-2143" with a confidence of 1.0.
        /// </summary>
        public void TestBomX_ISO_10646_UCS_4_2143()
        {
            var input    = new byte[] { 0x00, 0x00, 0xFF, 0xFE, 0x00, 0x65 };
            var detected = CharsetDetector.DetectFromBytes(input).Detected;

            Assert.AreEqual("X-ISO-10646-UCS-4-2143", detected.EncodingName);
            Assert.AreEqual(1.0f, detected.Confidence);
        }
예제 #13
0
        /// <summary>
        /// Decodes a slice of a byte array using the encoding detected for that slice,
        /// falling back to <see cref="Encoding.Default"/> when none is detected.
        /// </summary>
        /// <param name="bytes">Source array.</param>
        /// <param name="index">Offset of the first byte of the slice.</param>
        /// <param name="count">Number of bytes in the slice.</param>
        /// <returns>The decoded text of the slice.</returns>
        public string Chardet(byte[] bytes, int index, int count)
        {
            var slice = new byte[count];
            Array.Copy(bytes, index, slice, 0, count);

            var detected = CharsetDetector.DetectFromBytes(slice).Detected;
            var encoding = detected?.Encoding;
            if (encoding == null)
            {
                encoding = Encoding.Default;
            }

            return encoding.GetString(slice);
        }
예제 #14
0
        /// <summary>
        /// Detects the character set of a subtitle from its raw bytes.
        /// </summary>
        /// <param name="path">Location of the subtitle; forwarded to <c>GetBytes</c>.</param>
        /// <param name="language">Not used by this method.</param>
        /// <param name="protocol">Protocol forwarded to <c>GetBytes</c>.</param>
        /// <param name="cancellationToken">Token forwarded to <c>GetBytes</c>.</param>
        /// <returns>The detected encoding name, or <c>null</c> when nothing was detected.</returns>
        public async Task<string> GetSubtitleFileCharacterSet(string path, string language, MediaProtocol protocol, CancellationToken cancellationToken)
        {
            var bytes = await GetBytes(path, protocol, cancellationToken).ConfigureAwait(false);

            var charset = CharsetDetector.DetectFromBytes(bytes).Detected?.EncodingName;

            // Named placeholders only: the previous template mixed a positional "{0}" with
            // the named "{Path}", which records the charset under a property called "0".
            // The rendered message text is unchanged.
            _logger.LogDebug("charset {CharSet} detected for {Path}", charset ?? "null", path);

            return charset;
        }
예제 #15
0
        /// <summary>
        /// Reads a file, decodes it with the detected encoding (or <see cref="Encoding.Default"/>),
        /// HTML-encodes the text and substitutes it for the "{{content}}" token of
        /// <c>Resources.md2html</c>.
        /// </summary>
        /// <param name="path">Path of the file to read.</param>
        /// <returns>The template text with the encoded file content inserted.</returns>
        private string GenerateMarkdownHtml(string path)
        {
            var raw      = File.ReadAllBytes(path);
            var detected = CharsetDetector.DetectFromBytes(raw).Detected;
            var encoding = detected?.Encoding ?? Encoding.Default;

            // Escape the text before it is placed into the HTML template.
            var escaped = WebUtility.HtmlEncode(encoding.GetString(raw));

            return Resources.md2html.Replace("{{content}}", escaped);
        }
예제 #16
0
        /// <summary>
        /// Asserts that a UTF-8 encoded multi-sentence Japanese text is reported as "UTF-8"
        /// with a confidence of 1.0.
        /// </summary>
        public void TestUTF8_1()
        {
            var sample = "ウィキペディアはオープンコンテントの百科事典です。基本方針に賛同し" +
                         "ていただけるなら、誰でも記事を編集したり新しく作成したりできます。" +
                         "ガイドブックを読んでから、サンドボックスで練習してみましょう。質問は" +
                         "利用案内でどうぞ。";

            var input    = Encoding.UTF8.GetBytes(sample);
            var detected = CharsetDetector.DetectFromBytes(input).Detected;

            Assert.AreEqual("UTF-8", detected.EncodingName);
            Assert.AreEqual(1.0f, detected.Confidence);
        }
예제 #17
0
        /// <summary>
        /// Detects the encoding of a file from at most its first 1 MB, assigns it to
        /// <c>viewer.Encoding</c>, then loads the file into the viewer and selects syntax
        /// highlighting by file extension.
        /// </summary>
        /// <param name="path">Path of the file to load.</param>
        private void LoadFile(string path)
        {
            using (var s = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                const int bufferLength = 1 * 1024 * 1024;

                // Size the sample to the file so that short files are not padded with zero
                // bytes, which would otherwise be fed to the detector as file content.
                var sampleLength = (int)(s.Length < bufferLength ? s.Length : bufferLength);
                var buffer       = new byte[sampleLength];
                var filled       = 0;

                // Stream.Read may return fewer bytes than requested, so keep reading until the
                // sample is full or the end of the stream is reached.
                while (filled < sampleLength)
                {
                    var read = s.Read(buffer, filled, sampleLength - filled);
                    if (read == 0)
                    {
                        break;
                    }

                    filled += read;
                }

                if (filled < sampleLength)
                {
                    System.Array.Resize(ref buffer, filled);
                }

                // Nothing to analyse for an empty file: keep the default encoding.
                viewer.Encoding = filled == 0
                    ? Encoding.Default
                    : CharsetDetector.DetectFromBytes(buffer).Detected?.Encoding ?? Encoding.Default;
            }

            viewer.Load(path);
            viewer.SyntaxHighlighting = HighlightingManager.Instance.GetDefinitionByExtension(Path.GetExtension(path));
        }
예제 #18
0
        /// <summary>
        ///   Guesses the code page of a byte buffer, falling back to UTF-8 whenever no
        ///   usable guess is available.
        /// </summary>
        /// <param name="buff">The buff containing the characters.</param>
        /// <returns>
        ///   The detected <see cref="Encoding" />, or <see cref="Encoding.UTF8" /> when the buffer
        ///   is null or empty, nothing was detected, the confidence is below 0.2, or the detection
        ///   carries no <see cref="Encoding" /> instance.
        /// </returns>
        public static Encoding GuessEncodingNoBom([CanBeNull] byte[] buff)
        {
            if (buff == null || buff.Length < 1)
            {
                return Encoding.UTF8;
            }

            var results = CharsetDetector.DetectFromBytes(buff);

            if (results.Detected == null || results.Detected.Confidence < 0.2)
            {
                return Encoding.UTF8;
            }

            // The detection can name a charset without providing an Encoding instance;
            // never hand null back to the caller.
            return results.Detected.Encoding ?? Encoding.UTF8;
        }
예제 #19
0
        /// <summary>
        /// Runs detection on a window of a UTF-8 encoded sample text and asserts the reported
        /// encoding name and a confidence of 1.0.
        /// </summary>
        /// <param name="offset">Start of the window inside the encoded sample.</param>
        /// <param name="len">Length of the window in bytes.</param>
        /// <param name="detectedCodepage">Encoding name expected for that window.</param>
        public void DetectFromByteArray(int offset, int len, string detectedCodepage)
        {
            // Arrange
            var sample = "UTF-Unknown은 파일, 스트림, 그 외 바이트 배열의 캐릭터 셋을 탐지하는 라이브러리입니다." +
                         "대한민국 (大韓民國, Republic of Korea)";
            var encoded = Encoding.UTF8.GetBytes(sample);

            // Act
            var detected = CharsetDetector.DetectFromBytes(encoded, offset, len).Detected;

            // Assert
            Assert.AreEqual(detectedCodepage, detected.EncodingName);
            Assert.AreEqual(1.0f, detected.Confidence);
        }
예제 #20
0
        /// <summary>
        /// Reads roughly the first 50 MB of a file on a <see cref="Task.Run(Action)"/> task,
        /// detects its encoding, builds a <c>TextDocument</c> from it and publishes the result to
        /// <c>nevise</c> through <c>Dispatcher.BeginInvoke</c>. On failure the busy indicator is
        /// cleared and the error is otherwise ignored (best effort, as before).
        /// </summary>
        /// <param name="path">Path of the file to load.</param>
        private void LoadFileAsync(string path)
        {
            Task.Run(() =>
            {
                // The try/catch lives inside the task: a handler wrapped around Task.Run itself
                // never sees exceptions thrown by the delegate, so the busy indicator would stay
                // switched on forever after a failed load.
                try
                {
                    const int maxLength = 50 * 1024 * 1024;
                    byte[] bufferCopy;

                    // NOTE(review): this assignment runs on the task's thread, not the dispatcher
                    // thread - confirm MyBusyIndicator tolerates that.
                    MyBusyIndicator.IsBusy = true;

                    using (var buffer = new MemoryStream())
                    {
                        using (var s = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                        {
                            // One scratch buffer reused for every chunk.
                            var lb = new byte[8192];

                            while (s.Position < s.Length && buffer.Length < maxLength)
                            {
                                var count = s.Read(lb, 0, lb.Length);
                                if (count == 0)
                                {
                                    // The file is shared for writing and may change underneath
                                    // us; a zero-byte read must end the loop.
                                    break;
                                }

                                buffer.Write(lb, 0, count);
                            }
                        }

                        // Files longer than maxLength are silently truncated; the title suffix
                        // that used to announce this (" (0 ~ 50MB)") is currently disabled.
                        bufferCopy = buffer.ToArray();
                    }

                    var encoding = CharsetDetector.DetectFromBytes(bufferCopy).Detected?.Encoding ??
                                   Encoding.Default;

                    var doc = new TextDocument(encoding.GetString(bufferCopy));
                    doc.SetOwnerThread(Dispatcher.Thread);

                    Dispatcher.BeginInvoke(new Action(() =>
                    {
                        nevise.Encoding           = encoding;
                        nevise.SyntaxHighlighting = HighlightingManager.Instance.GetDefinitionByExtension(System.IO.Path.GetExtension(path));
                        nevise.Document           = doc;

                        MyBusyIndicator.IsBusy = false;
                    }), DispatcherPriority.Render);
                }
                catch
                {
                    // Best effort: give the UI back to the user even though loading failed.
                    Dispatcher.BeginInvoke(new Action(() =>
                    {
                        MyBusyIndicator.IsBusy = false;
                    }), DispatcherPriority.Render);
                }
            });
        }
예제 #21
0
        /// <summary>
        /// Handles a received binary payload: records the activity time and, when an
        /// <c>OnReceive</c> handler is set, decodes the bytes and forwards the text to the
        /// string overload.
        /// </summary>
        /// <param name="bytes">The bytes.</param>
        private void OnReceiveInternal(byte[] bytes)
        {
            LastActivityDate = DateTime.UtcNow;

            if (OnReceive != null)
            {
                var detectedName = CharsetDetector.DetectFromBytes(bytes).Detected?.EncodingName;
                var isUtf8       = string.Equals(detectedName, "utf-8", StringComparison.OrdinalIgnoreCase);

                // Anything not detected as UTF-8 is decoded as ASCII.
                var decoder = isUtf8 ? Encoding.UTF8 : Encoding.ASCII;

                OnReceiveInternal(decoder.GetString(bytes, 0, bytes.Length));
            }
        }
예제 #22
0
        /// <summary>
        /// Decodes a web socket message, deserializes it and dispatches it to every registered
        /// web socket handler, waiting for all handlers to finish.
        /// </summary>
        /// <param name="messageBytes">Raw message bytes; decoded as UTF-8 when detected as such, otherwise as ASCII.</param>
        /// <param name="taskCompletionSource">Completion source passed through to each handler.</param>
        /// <returns>A task that completes once every handler has finished processing the message.</returns>
        private async Task ProcessMessage(byte[] messageBytes, TaskCompletionSource<bool> taskCompletionSource)
        {
            var charset = CharsetDetector.DetectFromBytes(messageBytes).Detected?.EncodingName;
            var message = string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase)
                ? Encoding.UTF8.GetString(messageBytes, 0, messageBytes.Length)
                : Encoding.ASCII.GetString(messageBytes, 0, messageBytes.Length);

            // All messages are expected to be valid JSON objects
            if (!message.StartsWith("{", StringComparison.OrdinalIgnoreCase))
            {
                _logger.LogDebug("Received web socket message that is not a json structure: {Message}", message);
                return;
            }

            try
            {
                var info = _jsonSerializer.DeserializeFromString<WebSocketMessage<object>>(message);

                _logger.LogDebug("Websocket message received: {MessageType}", info.MessageType);

                // The handler call must be awaited inside the delegate. Previously only
                // ConfigureAwait(false) was invoked on the returned task and the result was
                // discarded, so handler failures never reached the catch below and
                // Task.WhenAll completed before the handlers had actually run to completion.
                // NOTE(review): callers now wait for the handlers; confirm no handler blocks on
                // taskCompletionSource being completed by the caller of this method.
                var tasks = _webSocketHandlers.Select(handler => Task.Run(async () =>
                {
                    try
                    {
                        await handler.ProcessMessage(info, taskCompletionSource).ConfigureAwait(false);
                    }
                    catch (Exception ex)
                    {
                        _logger.LogError(ex, "{HandlerType} failed processing WebSocket message {MessageType}",
                                         handler.GetType().Name, info.MessageType ?? string.Empty);
                    }
                }));

                await Task.WhenAll(tasks).ConfigureAwait(false);
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error processing web socket message");
            }
        }
예제 #23
0
        /// <summary>
        /// Determines the encoding of a text file from a sample taken at its start.
        /// </summary>
        /// <param name="filename">Path of the file to inspect.</param>
        /// <param name="taster">
        /// Minimum number of bytes to sample. Data is read in 8192-byte chunks, so the sample
        /// may be larger than this value.
        /// </param>
        /// <returns>
        /// The detected encoding when the confidence is above 0.5; otherwise (or when the
        /// detection carries no <see cref="Encoding"/> instance) <see cref="Encoding.UTF8"/>.
        /// </returns>
        public static Encoding GetEncoding(string filename, int taster = 1000)
        {
            /* Changed to default to UTF-8; the odd workaround below is kept for reference.
             * // On unix the detection may fail, so use the odd workaround: common source code defaults to utf-8
             * // Auto-detect only for the blacklist
             * string[] black_list = { ".txt" };
             * if(!black_list.Any(filename.ToLower().EndsWith))
             * {
             *  return Encoding.UTF8;
             * }
             */
            byte[] bufferCopy;

            using (var buffer = new MemoryStream())
            using (FileStream fs = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
            {
                // One scratch buffer reused for every chunk.
                var lb = new byte[8192];

                while (fs.Position < fs.Length && buffer.Length < taster)
                {
                    int len = fs.Read(lb, 0, lb.Length);
                    if (len == 0)
                    {
                        // The file is shared for writing and may change underneath us;
                        // a zero-byte read must end the loop.
                        break;
                    }

                    buffer.Write(lb, 0, len);
                }

                bufferCopy = buffer.ToArray();
            }

            var detected = CharsetDetector.DetectFromBytes(bufferCopy).Detected;

            Debug.WriteLine("Confidence = " + detected?.Confidence);
            if (detected?.Confidence > 0.5)
            {
                // Return the null-safe value; the previous code computed this fallback and
                // then returned detected.Encoding directly, which can be null.
                var encoding = detected.Encoding ?? Encoding.UTF8;
                Debug.WriteLine("UTF-UNKNOWN Charset = " + encoding.EncodingName);
                return encoding;
            }

            return Encoding.UTF8;
        }
        /// <summary>
        /// Produces the encodings the detector considers candidates for the given data,
        /// most confident first, limited to <paramref name="maxEncodings"/> entries.
        /// </summary>
        /// <param name="input">array containing the raw data</param>
        /// <param name="maxEncodings">maximum number of encodings to detect</param>
        /// <returns>an array of Encoding with assumed encodings</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="maxEncodings"/> is less than 1.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
        private static Encoding[] DetectInputCodePages(byte[] input, int maxEncodings)
        {
            if (maxEncodings < 1)
            {
                throw new ArgumentOutOfRangeException(nameof(maxEncodings), "at least one encoding must be returned");
            }

            if (input == null)
            {
                throw new ArgumentNullException(nameof(input));
            }

            if (input.Length == 0)
            {
                // An empty input can always be represented as ASCII.
                return new[] { Encoding.ASCII };
            }

            // Ask UTF.Unknown for every candidate, then rank them by confidence.
            var candidates = CharsetDetector.DetectFromBytes(input).Details;
            var ranked     = from candidate in candidates
                             orderby candidate.Confidence descending
                             select candidate.Encoding;

            return ranked.Take(maxEncodings).ToArray();
        }
예제 #25
0
        /// <summary>
        /// Reads a subtitle file as text and parses it according to its file extension.
        /// </summary>
        /// <param name="path">Path of the subtitle file.</param>
        /// <param name="encode">
        /// Encoding name to read the file with. "자동" selects automatic detection and "ANSI" is
        /// mapped to "ks_c_5601-1987".
        /// </param>
        /// <param name="noNoFallback">
        /// When <c>true</c>, the encoding chosen after the full-file detection is created with
        /// <c>MainWindow.EcdFallback</c> / <c>MainWindow.DcdFallback</c>; otherwise the detector's
        /// own <see cref="Encoding"/> instance is used.
        /// </param>
        /// <returns>The parsed subtitle: "SMI" for .smi, "ASS" for .ass/.ssa, "SRT" otherwise.</returns>
        /// <exception cref="Exception">Automatic detection failed, or the file yielded no text.</exception>
        public static Subtitle Open(string path, string encode, bool noNoFallback = false)
        {
            string text;

            if (encode == "자동")
            {
                var bytes = File.ReadAllBytes(path);

                // For performance, inspect only the first 30 bytes on the first attempt.
                var      result = CharsetDetector.DetectFromBytes(bytes.Take(30).ToArray());
                Encoding encoding;
                try
                {
                    if (result.Detected == null)
                    {
                        // Route "nothing detected" to the full-file retry in the catch block.
                        throw new DecoderFallbackException();
                    }
                    encoding = Encoding.GetEncoding(result.Detected.EncodingName, MainWindow.EcdFallback, MainWindow.DcdFallback);
                    text     = encoding.GetString(bytes);
                }
                catch (DecoderFallbackException)
                {
                    // Second attempt: run detection over the whole file.
                    result = CharsetDetector.DetectFromBytes(bytes);
                    if (result.Detected == null)
                    {
                        // Report the detection failure explicitly instead of failing with a
                        // NullReferenceException on the dereferences below.
                        throw new Exception("인코딩 자동 감지 실패");
                    }

                    if (noNoFallback)
                    {
                        encoding = Encoding.GetEncoding(result.Detected.EncodingName, MainWindow.EcdFallback, MainWindow.DcdFallback);
                    }
                    else
                    {
                        encoding = result.Detected.Encoding;
                    }

                    if (encoding == null)
                    {
                        // A charset was named but no Encoding instance is available for it.
                        throw new Exception("인코딩 자동 감지 실패");
                    }

                    try
                    {
                        text = encoding.GetString(bytes);
                    }
                    catch (DecoderFallbackException)
                    {
                        throw new Exception("인코딩 자동 감지 실패");
                    }
                }
            }
            else
            {
                encode = encode == "ANSI" ? "ks_c_5601-1987" : encode;
                using (var sr = new StreamReader(path, Encoding.GetEncoding(encode), true))
                {
                    text = sr.ReadToEnd();
                }
            }
            if (String.IsNullOrEmpty(text))
            {
                throw new Exception("파일 읽기 실패");
            }

            // Check the formats most likely to be used first so the conditional exits early.
            // Extensions are compared ordinally and case-insensitively so that e.g. ".SMI" is
            // not mistaken for SRT.
            if (path.EndsWith(".smi", StringComparison.OrdinalIgnoreCase))
            {
                return Parser.Parse(text, "SMI");
            }
            else if (path.EndsWith(".ass", StringComparison.OrdinalIgnoreCase) ||
                     path.EndsWith(".ssa", StringComparison.OrdinalIgnoreCase))
            {
                return Parser.Parse(text, "ASS");
            }
            else
            {
                return Parser.Parse(text, "SRT");
            }
        }
예제 #26
0
 /// <summary>
 /// Returns the encoding detected for a buffer, or <see cref="Encoding.UTF8"/> when the
 /// detection yields no encoding.
 /// </summary>
 /// <param name="buffer">Bytes to analyse.</param>
 /// <returns>The detected encoding, or UTF-8 as the fallback.</returns>
 public static Encoding GetEncoding(byte[] buffer)
 {
     var detected = CharsetDetector.DetectFromBytes(buffer).Detected;
     var encoding = detected?.Encoding;

     return encoding ?? Encoding.UTF8;
 }
예제 #27
0
        /// <summary>
        /// Detects the character set of the given content.
        /// </summary>
        /// <param name="ctx">Context used to convert <paramref name="content"/> to bytes.</param>
        /// <param name="content">Content to analyse.</param>
        /// <returns>The detected encoding name, or <c>null</c> when nothing was detected.</returns>
        public static string detect(Context ctx, PhpString content)
        {
            var raw      = content.ToBytes(ctx);
            var detected = CharsetDetector.DetectFromBytes(raw).Detected;

            return detected?.EncodingName;
        }