/// <summary>
/// Opens a subtitle as a stream, re-encoding it to UTF-8 when a character set is required and one is detected.
/// </summary>
/// <param name="path">Location of the subtitle.</param>
/// <param name="language">Not used by this method.</param>
/// <param name="protocol">Protocol forwarded to <c>GetBytes</c>.</param>
/// <param name="requiresCharset">When <c>true</c>, the content is sniffed for its character set and transcoded to UTF-8.</param>
/// <param name="cancellationToken">Token forwarded to <c>GetBytes</c>.</param>
/// <returns>
/// An in-memory UTF-8 stream when a charset was detected; otherwise a file stream opened on <paramref name="path"/>.
/// </returns>
private async Task<Stream> GetSubtitleStream(string path, string language, MediaProtocol protocol, bool requiresCharset, CancellationToken cancellationToken)
{
    if (!requiresCharset)
    {
        return File.OpenRead(path);
    }

    var rawBytes = await GetBytes(path, protocol, cancellationToken).ConfigureAwait(false);
    var detectedName = CharsetDetector.DetectFromBytes(rawBytes).Detected?.EncodingName;
    _logger.LogDebug("charset {CharSet} detected for {Path}", detectedName ?? "null", path);

    if (string.IsNullOrEmpty(detectedName))
    {
        // Nothing usable was detected: hand back the file untouched.
        return File.OpenRead(path);
    }

    // Make sure we have all the code pages we can get
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

    using (var source = new MemoryStream(rawBytes))
    using (var reader = new StreamReader(source, Encoding.GetEncoding(detectedName)))
    {
        var decoded = await reader.ReadToEndAsync().ConfigureAwait(false);
        return new MemoryStream(Encoding.UTF8.GetBytes(decoded));
    }
}
/// <summary>
/// A buffer starting with FF FE 00 00 must be reported as UTF-32 LE with full confidence.
/// </summary>
public void TestBomUTF32_LE()
{
    var input = new byte[] { 0xFF, 0xFE, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00 };

    var detected = CharsetDetector.DetectFromBytes(input).Detected;

    Assert.AreEqual(Charsets.UTF32_LE, detected.EncodingName);
    Assert.AreEqual(1.0f, detected.Confidence);
}
/// <summary>
/// A short run of digits must be reported as ASCII with full confidence.
/// </summary>
public void TestOutOfRange2()
{
    var input = Encoding.UTF8.GetBytes("1234567890");

    var detected = CharsetDetector.DetectFromBytes(input).Detected;

    Assert.AreEqual(Charsets.ASCII, detected.EncodingName);
    Assert.AreEqual(1.0f, detected.Confidence);
}
/// <summary>
/// A single digit character must be reported as ASCII with full confidence.
/// </summary>
public void TestIssue3()
{
    var input = Encoding.UTF8.GetBytes("3");

    var detected = CharsetDetector.DetectFromBytes(input).Detected;

    Assert.AreEqual(CodepageName.ASCII, detected.EncodingName);
    Assert.AreEqual(1.0f, detected.Confidence);
}
/// <summary>
/// The supplied buffer must be reported as UTF-7 with full confidence.
/// </summary>
/// <param name="bufferBytes">Bytes handed to the detector.</param>
public void TestCaseBomUtf7(byte[] bufferBytes)
{
    var detection = CharsetDetector.DetectFromBytes(bufferBytes);
    var detected = detection.Detected;

    Assert.AreEqual(CodepageName.UTF7, detected.EncodingName);
    Assert.AreEqual(1.0f, detected.Confidence);
}
/// <summary>
/// A single digit character must be reported as ASCII with full confidence.
/// </summary>
public void TestSingleChar()
{
    byte[] buf = Encoding.UTF8.GetBytes("3");

    var result = CharsetDetector.DetectFromBytes(buf);

    Assert.AreEqual(Charsets.ASCII, result.Detected.EncodingName);
    // Compare against a float literal, like the sibling tests, so expected and actual
    // have the same type instead of relying on an int-to-float numeric comparison.
    Assert.AreEqual(1.0f, result.Detected.Confidence);
}
/// <summary>
/// A two-byte buffer containing only FF FE must be reported as UTF-16 LE with full confidence.
/// </summary>
public void Test2byteArrayBomUTF16_LE()
{
    var input = new byte[] { 0xFF, 0xFE };

    var detected = CharsetDetector.DetectFromBytes(input).Detected;

    Assert.AreEqual(CodepageName.UTF16_LE, detected.EncodingName);
    Assert.AreEqual(1.0f, detected.Confidence);
}
/// <summary>
/// A buffer starting with EF BB BF must be reported as "UTF-8" with full confidence.
/// </summary>
public void TestBomUtf8()
{
    var input = new byte[] { 0xEF, 0xBB, 0xBF, 0x68, 0x65, 0x6C, 0x6C, 0x6F, 0x21 };

    var detected = CharsetDetector.DetectFromBytes(input).Detected;

    Assert.AreEqual("UTF-8", detected.EncodingName);
    Assert.AreEqual(1.0f, detected.Confidence);
}
/// <summary>
/// Reads up to roughly 5 MB of a file on a background task, detects its encoding and
/// posts the resulting document to the dispatcher.
/// </summary>
/// <param name="path">File to load.</param>
private void LoadFileAsync(string path)
{
    Task.Run(() =>
    {
        const int maxLength = 5 * 1024 * 1024;
        bool tooLong;
        byte[] content;

        // The using block guarantees the buffer is released on every exit path,
        // including the early return taken when the control has been disposed.
        using (var buffer = new MemoryStream())
        {
            using (var s = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
            {
                tooLong = s.Length > maxLength;

                // One scratch buffer reused for every read instead of a new array per iteration.
                var chunk = new byte[8192];
                while (s.Position < s.Length && buffer.Length < maxLength)
                {
                    if (_disposed)
                    {
                        break;
                    }

                    var count = s.Read(chunk, 0, chunk.Length);
                    if (count <= 0)
                    {
                        // No more data available; stop rather than spin.
                        break;
                    }

                    buffer.Write(chunk, 0, count);
                }
            }

            if (_disposed)
            {
                return;
            }

            if (tooLong)
            {
                _context.Title += " (0 ~ 5MB)";
            }

            content = buffer.ToArray();
        }

        var encoding = CharsetDetector.DetectFromBytes(content).Detected?.Encoding ?? Encoding.Default;

        var doc = new TextDocument(encoding.GetString(content));
        doc.SetOwnerThread(Dispatcher.Thread);

        if (_disposed)
        {
            return;
        }

        Dispatcher.BeginInvoke(new Action(() =>
        {
            Encoding = encoding;
            SyntaxHighlighting = HighlightingManager.Instance.GetDefinitionByExtension(Path.GetExtension(path));
            Document = doc;

            _context.IsBusy = false;
        }), DispatcherPriority.Render);
    });
}
/// <summary>
/// The four bytes 84 31 95 33 must be reported as GB18030 with full confidence.
/// </summary>
public void TestBomGb18030()
{
    byte[] input = { 0x84, 0x31, 0x95, 0x33 };

    var detection = CharsetDetector.DetectFromBytes(input);
    var detected = detection.Detected;

    Assert.AreEqual(CodepageName.GB18030, detected.EncodingName);
    Assert.AreEqual(1.0f, detected.Confidence);
}
/// <summary>
/// A buffer starting with FE FF 00 00 must be reported as X-ISO-10646-UCS-4-3412 with full confidence.
/// </summary>
public void TestBomX_ISO_10646_UCS_4_3412()
{
    var input = new byte[] { 0xFE, 0xFF, 0x00, 0x00, 0x65 };

    var detected = CharsetDetector.DetectFromBytes(input).Detected;

    Assert.AreEqual(CodepageName.X_ISO_10646_UCS_4_3412, detected.EncodingName);
    Assert.AreEqual(1.0f, detected.Confidence);
}
/// <summary>
/// A buffer starting with 00 00 FF FE must be reported as "X-ISO-10646-UCS-4-2143" with full confidence.
/// </summary>
public void TestBomX_ISO_10646_UCS_4_2143()
{
    var input = new byte[] { 0x00, 0x00, 0xFF, 0xFE, 0x00, 0x65 };

    var detected = CharsetDetector.DetectFromBytes(input).Detected;

    Assert.AreEqual("X-ISO-10646-UCS-4-2143", detected.EncodingName);
    Assert.AreEqual(1.0f, detected.Confidence);
}
/// <summary>
/// Decodes a slice of a byte array using the encoding detected for that slice.
/// </summary>
/// <param name="bytes">Source array.</param>
/// <param name="index">Offset of the first byte of the slice.</param>
/// <param name="count">Number of bytes in the slice.</param>
/// <returns>The slice decoded with the detected encoding, or with <see cref="Encoding.Default"/> when none is available.</returns>
public string Chardet(byte[] bytes, int index, int count)
{
    var slice = new byte[count];
    Array.Copy(bytes, index, slice, 0, count);

    var detected = CharsetDetector.DetectFromBytes(slice).Detected;
    var encoding = detected?.Encoding ?? Encoding.Default;

    return encoding.GetString(slice);
}
/// <summary>
/// Detects the character set of a subtitle file.
/// </summary>
/// <param name="path">Location of the subtitle.</param>
/// <param name="language">Not used by this method.</param>
/// <param name="protocol">Protocol forwarded to <c>GetBytes</c>.</param>
/// <param name="cancellationToken">Token forwarded to <c>GetBytes</c>.</param>
/// <returns>The detected encoding name, or <c>null</c> when nothing was detected.</returns>
public async Task<string> GetSubtitleFileCharacterSet(string path, string language, MediaProtocol protocol, CancellationToken cancellationToken)
{
    var bytes = await GetBytes(path, protocol, cancellationToken).ConfigureAwait(false);

    var charset = CharsetDetector.DetectFromBytes(bytes).Detected?.EncodingName;

    // Use named placeholders only: mixing positional ({0}) and named ({Path}) holes in one
    // template is discouraged for structured logging; this matches the other charset log message.
    _logger.LogDebug("charset {CharSet} detected for {Path}", charset ?? "null", path);

    return charset;
}
/// <summary>
/// Builds the HTML page for a markdown file by HTML-encoding its text and substituting it
/// for the <c>{{content}}</c> placeholder of the <c>md2html</c> resource.
/// </summary>
/// <param name="path">Markdown file to read.</param>
/// <returns>The resulting HTML.</returns>
private string GenerateMarkdownHtml(string path)
{
    var raw = File.ReadAllBytes(path);

    var detected = CharsetDetector.DetectFromBytes(raw).Detected;
    var encoding = detected?.Encoding ?? Encoding.Default;

    // Encode before embedding so the markdown source cannot inject markup into the template.
    var encodedMarkdown = WebUtility.HtmlEncode(encoding.GetString(raw));

    return Resources.md2html.Replace("{{content}}", encodedMarkdown);
}
/// <summary>
/// UTF-8 encoded Japanese text must be reported as "UTF-8" with full confidence.
/// </summary>
public void TestUTF8_1()
{
    string text =
        "ウィキペディアはオープンコンテントの百科事典です。基本方針に賛同し" +
        "ていただけるなら、誰でも記事を編集したり新しく作成したりできます。" +
        "ガイドブックを読んでから、サンドボックスで練習してみましょう。質問は" +
        "利用案内でどうぞ。";
    var input = Encoding.UTF8.GetBytes(text);

    var detected = CharsetDetector.DetectFromBytes(input).Detected;

    Assert.AreEqual("UTF-8", detected.EncodingName);
    Assert.AreEqual(1.0f, detected.Confidence);
}
/// <summary>
/// Detects the encoding of a file from at most its first 1 MB, then loads the file into the viewer
/// and selects syntax highlighting by file extension.
/// </summary>
/// <param name="path">File to load.</param>
private void LoadFile(string path)
{
    using (var s = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        const int bufferLength = 1 * 1024 * 1024;
        var buffer = new byte[bufferLength];

        // Stream.Read may return fewer bytes than requested, so keep reading until the
        // sample is full or the end of the file is reached.
        var total = 0;
        int read;
        while (total < bufferLength && (read = s.Read(buffer, total, bufferLength - total)) > 0)
        {
            total += read;
        }

        // Trim the sample to the bytes actually read; otherwise files shorter than the
        // buffer would be analysed together with trailing zero padding.
        if (total < bufferLength)
        {
            Array.Resize(ref buffer, total);
        }

        var detected = total > 0 ? CharsetDetector.DetectFromBytes(buffer).Detected : null;
        viewer.Encoding = detected?.Encoding ?? Encoding.Default;
    }

    viewer.Load(path);
    viewer.SyntaxHighlighting = HighlightingManager.Instance.GetDefinitionByExtension(Path.GetExtension(path));
}
/// <summary>
/// Guesses the code page of a byte buffer.
/// </summary>
/// <param name="buff">The buffer containing the characters.</param>
/// <returns>
/// The detected <see cref="Encoding" />, or <see cref="Encoding.UTF8" /> when the buffer is null or empty,
/// nothing was detected, the confidence is below 0.2, or the detection carries no usable encoding.
/// </returns>
public static Encoding GuessEncodingNoBom([CanBeNull] byte[] buff)
{
    if (buff == null || buff.Length < 1)
    {
        return Encoding.UTF8;
    }

    var results = CharsetDetector.DetectFromBytes(buff);

    if (results.Detected == null || results.Detected.Confidence < 0.2)
    {
        return Encoding.UTF8;
    }

    // The detector can name a charset without providing an Encoding instance for it;
    // use the same UTF-8 default as the other fallbacks rather than returning null.
    return results.Detected.Encoding ?? Encoding.UTF8;
}
/// <summary>
/// Detection over a sub-range of a UTF-8 encoded byte array must report the expected code page
/// with full confidence.
/// </summary>
/// <param name="offset">Start of the range handed to the detector.</param>
/// <param name="len">Length of the range handed to the detector.</param>
/// <param name="detectedCodepage">Expected encoding name.</param>
public void DetectFromByteArray(int offset, int len, string detectedCodepage)
{
    // Arrange
    string text =
        "UTF-Unknown은 파일, 스트림, 그 외 바이트 배열의 캐릭터 셋을 탐지하는 라이브러리입니다." +
        "대한민국 (大韓民國, Republic of Korea)";
    var encoded = Encoding.UTF8.GetBytes(text);

    // Act
    var detected = CharsetDetector.DetectFromBytes(encoded, offset, len).Detected;

    // Assert
    Assert.AreEqual(detectedCodepage, detected.EncodingName);
    Assert.AreEqual(1.0f, detected.Confidence);
}
/// <summary>
/// Reads up to roughly 50 MB of a file on a background task, detects its encoding and
/// posts the resulting document to the dispatcher. Content beyond the limit is dropped silently.
/// </summary>
/// <param name="path">File to load.</param>
private void LoadFileAsync(string path)
{
    Task.Run(() =>
    {
        // The try/catch lives inside the task body: a handler wrapped around Task.Run itself
        // never observes exceptions thrown by the work item, so the busy flag would stay set.
        try
        {
            const int maxLength = 50 * 1024 * 1024;
            byte[] content;

            // NOTE(review): this runs on the worker thread, as in the original; confirm that
            // MyBusyIndicator tolerates cross-thread access.
            MyBusyIndicator.IsBusy = true;

            using (var buffer = new MemoryStream())
            using (var s = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
            {
                // One scratch buffer reused for every read instead of a new array per iteration.
                var chunk = new byte[8192];
                while (s.Position < s.Length && buffer.Length < maxLength)
                {
                    var count = s.Read(chunk, 0, chunk.Length);
                    if (count <= 0)
                    {
                        // No more data available; stop rather than spin.
                        break;
                    }

                    buffer.Write(chunk, 0, count);
                }

                content = buffer.ToArray();
            }

            var encoding = CharsetDetector.DetectFromBytes(content).Detected?.Encoding ?? Encoding.Default;

            var doc = new TextDocument(encoding.GetString(content));
            doc.SetOwnerThread(Dispatcher.Thread);

            Dispatcher.BeginInvoke(new Action(() =>
            {
                nevise.Encoding = encoding;
                nevise.SyntaxHighlighting = HighlightingManager.Instance.GetDefinitionByExtension(System.IO.Path.GetExtension(path));
                nevise.Document = doc;
                MyBusyIndicator.IsBusy = false;
            }), DispatcherPriority.Render);
        }
        catch
        {
            // Best effort, as before: swallow the failure but always clear the busy flag,
            // marshalled through the dispatcher because this handler runs on the worker thread.
            Dispatcher.BeginInvoke(new Action(() =>
            {
                MyBusyIndicator.IsBusy = false;
            }), DispatcherPriority.Render);
        }
    });
}
/// <summary>
/// Handles a received binary payload: records the activity time and, when a receive handler is
/// attached, decodes the bytes and forwards the text to the string overload.
/// </summary>
/// <param name="bytes">The bytes.</param>
private void OnReceiveInternal(byte[] bytes)
{
    LastActivityDate = DateTime.UtcNow;

    if (OnReceive == null)
    {
        return;
    }

    var detectedName = CharsetDetector.DetectFromBytes(bytes).Detected?.EncodingName;

    // Only UTF-8 is honoured; every other detection result is decoded as ASCII.
    var decoder = string.Equals(detectedName, "utf-8", StringComparison.OrdinalIgnoreCase)
        ? Encoding.UTF8
        : Encoding.ASCII;

    OnReceiveInternal(decoder.GetString(bytes, 0, bytes.Length));
}
/// <summary>
/// Decodes a web socket message, deserializes it and dispatches it to every registered handler.
/// </summary>
/// <param name="messageBytes">Raw message payload.</param>
/// <param name="taskCompletionSource">Completion source forwarded to each handler.</param>
/// <returns>A task that completes once every handler has finished processing the message.</returns>
private async Task ProcessMessage(byte[] messageBytes, TaskCompletionSource<bool> taskCompletionSource)
{
    var charset = CharsetDetector.DetectFromBytes(messageBytes).Detected?.EncodingName;

    // Only UTF-8 is honoured; every other detection result is decoded as ASCII.
    var message = string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase)
        ? Encoding.UTF8.GetString(messageBytes, 0, messageBytes.Length)
        : Encoding.ASCII.GetString(messageBytes, 0, messageBytes.Length);

    // All messages are expected to be valid JSON objects
    if (!message.StartsWith("{", StringComparison.OrdinalIgnoreCase))
    {
        _logger.LogDebug("Received web socket message that is not a json structure: {Message}", message);
        return;
    }

    try
    {
        var info = _jsonSerializer.DeserializeFromString<WebSocketMessage<object>>(message);

        _logger.LogDebug("Websocket message received: {0}", info.MessageType);

        var tasks = _webSocketHandlers.Select(handler => Task.Run(async () =>
        {
            try
            {
                // Await the handler: without it the returned task is dropped, so asynchronous
                // failures bypass the catch below and WhenAll completes before the handler does.
                await handler.ProcessMessage(info, taskCompletionSource).ConfigureAwait(false);
            }
            catch (Exception ex)
            {
                _logger.LogError(
                    ex,
                    "{HandlerType} failed processing WebSocket message {MessageType}",
                    handler.GetType().Name,
                    info.MessageType ?? string.Empty);
            }
        }));

        await Task.WhenAll(tasks).ConfigureAwait(false);
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Error processing web socket message");
    }
}
/// <summary>
/// Determines the encoding of the text file being read.
/// </summary>
/// <param name="filename">File to inspect.</param>
/// <param name="taster">Reading stops once at least this many bytes have been sampled (reads are done in 8 KB chunks).</param>
/// <returns>
/// The detected encoding when the detector's confidence exceeds 0.5 and an encoding is available;
/// otherwise <see cref="Encoding.UTF8"/>.
/// </returns>
public static Encoding GetEncoding(string filename, int taster = 1000)
{
    /* Switched to defaulting to UTF-8; the old workaround is kept here for reference.
     * // On unix detection may fail, so use a workaround: common source code defaults to utf-8.
     * // Blacklist: only these extensions are auto-detected.
     * string[] black_list = { ".txt" };
     * if(!black_list.Any(filename.ToLower().EndsWith))
     * {
     *     return Encoding.UTF8;
     * }
     */
    byte[] sample;
    using (var buffer = new MemoryStream())
    using (var fs = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
    {
        // One scratch buffer reused for every read instead of a new array per iteration.
        var chunk = new byte[8192];
        while (fs.Position < fs.Length && buffer.Length < taster)
        {
            int len = fs.Read(chunk, 0, chunk.Length);
            if (len <= 0)
            {
                // No more data available; stop rather than spin.
                break;
            }

            buffer.Write(chunk, 0, len);
        }

        sample = buffer.ToArray();
    }

    var detected = CharsetDetector.DetectFromBytes(sample).Detected;
    Debug.WriteLine("Confidence = " + detected?.Confidence);

    if (detected?.Confidence > 0.5)
    {
        // Return the null-coalesced value: returning detected.Encoding directly would hand
        // callers null whenever the detector has no Encoding instance for the charset.
        var encoding = detected.Encoding ?? Encoding.UTF8;
        Debug.WriteLine("UTF-UNKNOWN Charset = " + encoding.EncodingName);
        return encoding;
    }

    return Encoding.UTF8;
}
/// <summary>
/// Returns up to maxEncodings code pages that are assumed to be appropriate
/// </summary>
/// <param name="input">array containing the raw data</param>
/// <param name="maxEncodings">maximum number of encodings to detect</param>
/// <returns>an array of Encoding with assumed encodings, ordered by descending confidence; never contains null entries</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="maxEncodings"/> is less than 1.</exception>
/// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
private static Encoding[] DetectInputCodePages(byte[] input, int maxEncodings)
{
    if (maxEncodings < 1)
    {
        throw new ArgumentOutOfRangeException(nameof(maxEncodings), "at least one encoding must be returned");
    }

    if (input == null)
    {
        throw new ArgumentNullException(nameof(input));
    }

    // empty strings can always be encoded as ASCII
    if (input.Length == 0)
    {
        return new[] { Encoding.ASCII };
    }

    // use UTF.Unknown to detect from input byte string
    var detectionResult = CharsetDetector.DetectFromBytes(input);

    return detectionResult.Details
        .OrderByDescending(p => p.Confidence)
        .Select(p => p.Encoding)
        // A detail can lack an Encoding instance; drop those before applying the limit
        // so callers never receive null entries.
        .Where(encoding => encoding != null)
        .Take(maxEncodings)
        .ToArray();
}
/// <summary>
/// Reads a subtitle file as text and parses it according to its file extension.
/// </summary>
/// <param name="path">Subtitle file to open.</param>
/// <param name="encode">
/// Encoding name to read the file with. The value "자동" selects automatic detection and
/// "ANSI" is mapped to "ks_c_5601-1987".
/// </param>
/// <param name="noNoFallback">
/// When <c>true</c>, the second detection attempt also decodes with the exception fallbacks from
/// <c>MainWindow</c>; otherwise it uses the detector's own <see cref="Encoding"/> instance.
/// </param>
/// <returns>The parsed subtitle (SMI for ".smi", ASS for ".ass"/".ssa", SRT otherwise).</returns>
/// <exception cref="Exception">Automatic detection failed, or the file produced no text.</exception>
public static Subtitle Open(string path, string encode, bool noNoFallback = false)
{
    string text;

    if (encode == "자동")
    {
        var bytes = File.ReadAllBytes(path);

        // For performance, inspect only the first 30 bytes at first.
        var result = CharsetDetector.DetectFromBytes(bytes.Take(30).ToArray());
        Encoding encoding;
        try
        {
            if (result.Detected == null)
            {
                // Force the full-buffer retry in the catch block below.
                throw new DecoderFallbackException();
            }

            encoding = Encoding.GetEncoding(result.Detected.EncodingName, MainWindow.EcdFallback, MainWindow.DcdFallback);
            text = encoding.GetString(bytes);
        }
        catch (DecoderFallbackException)
        {
            // Retry detection against the full file contents.
            result = CharsetDetector.DetectFromBytes(bytes);

            // The full scan can still come back empty; report that as a detection failure
            // instead of letting a NullReferenceException escape.
            if (result.Detected == null)
            {
                throw new Exception("인코딩 자동 감지 실패");
            }

            encoding = noNoFallback
                ? Encoding.GetEncoding(result.Detected.EncodingName, MainWindow.EcdFallback, MainWindow.DcdFallback)
                : result.Detected.Encoding;

            // The detector may not provide an Encoding instance for the charset it named.
            if (encoding == null)
            {
                throw new Exception("인코딩 자동 감지 실패");
            }

            try
            {
                text = encoding.GetString(bytes);
            }
            catch (DecoderFallbackException)
            {
                throw new Exception("인코딩 자동 감지 실패");
            }
        }
    }
    else
    {
        encode = encode == "ANSI" ? "ks_c_5601-1987" : encode;
        using (var sr = new StreamReader(path, Encoding.GetEncoding(encode), true))
        {
            text = sr.ReadToEnd();
        }
    }

    if (String.IsNullOrEmpty(text))
    {
        throw new Exception("파일 읽기 실패");
    }

    // For performance, check the most commonly used formats first so the chain can exit early.
    // Extensions are compared ordinally and case-insensitively so that e.g. ".SMI" is not
    // misparsed as SRT and the result does not depend on the current culture.
    if (path.EndsWith(".smi", StringComparison.OrdinalIgnoreCase))
    {
        return Parser.Parse(text, "SMI");
    }
    else if (path.EndsWith(".ass", StringComparison.OrdinalIgnoreCase) || path.EndsWith(".ssa", StringComparison.OrdinalIgnoreCase))
    {
        return Parser.Parse(text, "ASS");
    }
    else
    {
        return Parser.Parse(text, "SRT");
    }
}
/// <summary>
/// Detects the encoding of a byte buffer.
/// </summary>
/// <param name="buffer">Bytes to analyse.</param>
/// <returns>The detected encoding, or <see cref="Encoding.UTF8"/> when none is available.</returns>
public static Encoding GetEncoding(byte[] buffer)
{
    var detected = CharsetDetector.DetectFromBytes(buffer).Detected;
    var encoding = detected?.Encoding;

    return encoding ?? Encoding.UTF8;
}
/// <summary>
/// Detects the character set of a PHP string's bytes.
/// </summary>
/// <param name="ctx">Context used to convert the string to bytes.</param>
/// <param name="content">String to analyse.</param>
/// <returns>The detected encoding name, or <c>null</c> when nothing was detected.</returns>
public static string detect(Context ctx, PhpString content)
{
    var raw = content.ToBytes(ctx);
    var detected = CharsetDetector.DetectFromBytes(raw).Detected;

    return detected?.EncodingName;
}