/// <summary>
/// Runs charset detection over the leading bytes of <paramref name="rawData"/>.
/// </summary>
/// <param name="rawData">Buffer holding the bytes to inspect.</param>
/// <param name="sizeLimit">Upper bound on the number of bytes handed to the detector; the buffer length is used when it is smaller.</param>
/// <returns>The result reported by <c>CharsetDetector.DetectFromStream</c>.</returns>
public DetectionResult DetectUtfUnknown(byte[] rawData, int sizeLimit)
{
    // Never expose more bytes than the buffer actually contains.
    int bytesToInspect = Math.Min(sizeLimit, rawData.Length);

    using (var window = new MemoryStream(rawData, 0, bytesToInspect))
    {
        DetectionResult detection = CharsetDetector.DetectFromStream(window);
        return detection;
    }
}
/// <summary>
/// Attempts to determine the text encoding of <paramref name="stream"/>.
/// </summary>
/// <param name="stream">The stream to inspect.</param>
/// <param name="encoding">Receives the value produced by <c>AnalyzeAndGuessEncoding</c> on success; <c>null</c> otherwise.</param>
/// <returns><c>true</c> when the detector reported an encoding; <c>false</c> when it did not or when an exception was thrown.</returns>
public static bool TryGuessEncoding(Stream stream, out Encoding encoding)
{
    encoding = null;

    try
    {
        var detection = CharsetDetector.DetectFromStream(stream);

        // Detected can be null
        bool hasEncoding = detection.Detected?.Encoding != null;
        if (!hasEncoding)
        {
            // We do not care about empty file
            if (stream.Length > 0)
            {
                Analytics.TrackEvent("UnableToDetectEncoding");
            }

            return false;
        }

        encoding = AnalyzeAndGuessEncoding(detection);
        return true;
    }
    catch (Exception ex)
    {
        // Any failure is reported to analytics and surfaced to the caller as "not detected".
        var properties = new Dictionary<string, string>()
        {
            { "Exception", ex.ToString() },
            { "Message", ex.Message }
        };
        Analytics.TrackEvent("TryGuessEncodingFailedWithException", properties);
        return false;
    }
}
/// <inheritdoc />
public async Task<string> GetSubtitleFileCharacterSet(string path, string language, MediaProtocol protocol, CancellationToken cancellationToken)
{
    using (var stream = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false))
    {
        // Detected may be null, in which case charset is null and is returned as such.
        var charset = CharsetDetector.DetectFromStream(stream).Detected?.EncodingName;

        // Use named placeholders only: mixing a numeric placeholder with a named one
        // violates CA2253 and yields a meaningless structured-log property name.
        _logger.LogDebug("charset {CharSet} detected for {Path}", charset ?? "null", path);

        return charset;
    }
}
/// <summary>
/// Verifies where the stream position ends up after detection with an optional byte limit.
/// </summary>
/// <param name="maxBytes">Byte limit passed to the detector; <c>null</c> means no limit is passed.</param>
/// <param name="expectedPosition">Stream position expected once detection has returned.</param>
/// <param name="start">Position the stream is moved to before detection starts.</param>
public void DetectFromStreamMaxBytes(int? maxBytes, int expectedPosition, int start = 0)
{
    // Arrange
    var text = new string('a', 10000);
    var stream = AsciiToSteam(text);

    // Dispose the stream when the test is done, as the sibling TestAscii does.
    using (stream)
    {
        stream.Position = start;

        // Act
        CharsetDetector.DetectFromStream(stream, maxBytes);

        // Assert
        Assert.AreEqual(expectedPosition, stream.Position);
    }
}
/// <summary>
/// Gets the <see cref="System.Text.Encoding"/> of a file.
/// </summary>
/// <param name="filePath">Path to the file.</param>
/// <param name="maxBytesToRead">Maximum number of bytes to read from <paramref name="filePath"/>. If <c>null</c>, there is no maximum.</param>
/// <returns>The detected encoding (can be <c>null</c> if not available or not supported by .NET).</returns>
public static Encoding GetFileEncoding(string filePath, int? maxBytesToRead)
{
    using (var file = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
    {
        // First pass: look for UTF-16 (LE or BE).
        Encoding utf16 = Utf16Detector.DetectFromStream(file, maxBytesToRead);
        if (utf16 == null)
        {
            // Second pass: rewind and let UTF-unknown decide.
            // https://github.com/CharsetDetector/UTF-unknown
            file.Position = 0L;
            return CharsetDetector.DetectFromStream(file, maxBytesToRead).Detected?.Encoding;
        }

        return utf16;
    }
}
/// <summary>
/// Checks that plain English ASCII text is reported as ASCII with full confidence.
/// </summary>
public void TestAscii()
{
    const string text =
        "The Documentation of the libraries is not complete " +
        "and your contributions would be greatly appreciated " +
        "the documentation you want to contribute to and " +
        "click on the [Edit] link to start writing";

    using (var stream = AsciiToSteam(text))
    {
        var detected = CharsetDetector.DetectFromStream(stream).Detected;

        Assert.AreEqual(Charsets.ASCII, detected.EncodingName);
        Assert.AreEqual(1.0f, detected.Confidence);
    }
}
/// <summary>
/// Checks that plain English ASCII text is reported as ASCII with full confidence.
/// </summary>
public void TestAscii()
{
    string s =
        "The Documentation of the libraries is not complete " +
        "and your contributions would be greatly appreciated " +
        "the documentation you want to contribute to and " +
        "click on the [Edit] link to start writing";
    byte[] asciiBytes = Encoding.ASCII.GetBytes(s);

    using (var input = new MemoryStream(asciiBytes))
    {
        var detected = CharsetDetector.DetectFromStream(input).Detected;

        Assert.Equal(Charsets.ASCII, detected.EncodingName);
        Assert.Equal(1.0f, detected.Confidence);
    }
}
/// <inheritdoc />
public async Task<string> GetSubtitleFileCharacterSet(string path, string language, MediaProtocol protocol, CancellationToken cancellationToken)
{
    using (var stream = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false))
    {
        // Detected may be null, in which case charset is null.
        var charset = CharsetDetector.DetectFromStream(stream).Detected?.EncodingName;

        // File extensions are compared case-insensitively so that e.g. "movie.SRT" is
        // treated the same as "movie.srt".
        bool isTextSubtitle = path.EndsWith(".ass", StringComparison.OrdinalIgnoreCase)
            || path.EndsWith(".ssa", StringComparison.OrdinalIgnoreCase)
            || path.EndsWith(".srt", StringComparison.OrdinalIgnoreCase);

        bool isUtf16 = string.Equals(charset, "utf-16le", StringComparison.OrdinalIgnoreCase)
            || string.Equals(charset, "utf-16be", StringComparison.OrdinalIgnoreCase);

        // UTF16 is automatically converted to UTF8 by FFmpeg, do not specify a character encoding
        if (isTextSubtitle && isUtf16)
        {
            charset = string.Empty;
        }

        // Use named placeholders only: mixing a numeric placeholder with a named one
        // violates CA2253 and yields a meaningless structured-log property name.
        _logger.LogDebug("charset {CharSet} detected for {Path}", charset ?? "null", path);

        return charset;
    }
}
/// <summary>
/// Opens a subtitle file, optionally re-encoding it to UTF-8 based on charset detection.
/// </summary>
/// <param name="path">Location of the subtitle file.</param>
/// <param name="protocol">Protocol passed to <c>GetStream</c> to open <paramref name="path"/>.</param>
/// <param name="requiresCharset">When <c>true</c>, the charset is detected and the content is returned as UTF-8 bytes.</param>
/// <param name="cancellationToken">Token passed to <c>GetStream</c>.</param>
/// <returns>
/// A <see cref="MemoryStream"/> with the UTF-8 encoded text when a charset was detected;
/// otherwise the result of <c>File.OpenRead(path)</c>.
/// </returns>
private async Task<Stream> GetSubtitleStream(string path, MediaProtocol protocol, bool requiresCharset, CancellationToken cancellationToken)
{
    if (requiresCharset)
    {
        using (var stream = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false))
        {
            var result = CharsetDetector.DetectFromStream(stream).Detected;
            if (result != null)
            {
                _logger.LogDebug("charset {CharSet} detected for {Path}", result.EncodingName, path);

                // Detection advances the stream position. Rewind so the reader decodes the
                // whole file rather than only what the detector left unread.
                if (stream.CanSeek)
                {
                    stream.Position = 0;
                }

                using var reader = new StreamReader(stream, result.Encoding);
                var text = await reader.ReadToEndAsync().ConfigureAwait(false);

                return new MemoryStream(Encoding.UTF8.GetBytes(text));
            }
        }
    }

    // No charset handling requested, or nothing detected: hand back the raw file.
    return File.OpenRead(path);
}
/// <summary>
/// Opens the subtitle described by <paramref name="fileInfo"/>, re-encoding external files to UTF-8 when a charset is detected.
/// </summary>
/// <param name="fileInfo">Describes the subtitle: its path, protocol and whether it is external.</param>
/// <param name="cancellationToken">Token passed to <c>GetStream</c>.</param>
/// <returns>
/// A <see cref="MemoryStream"/> with the UTF-8 encoded text when the subtitle is external and a charset was detected;
/// otherwise the result of <c>AsyncFile.OpenRead</c> for the subtitle path.
/// </returns>
private async Task<Stream> GetSubtitleStream(SubtitleInfo fileInfo, CancellationToken cancellationToken)
{
    if (!fileInfo.IsExternal)
    {
        return AsyncFile.OpenRead(fileInfo.Path);
    }

    using (var source = await GetStream(fileInfo.Path, fileInfo.Protocol, cancellationToken).ConfigureAwait(false))
    {
        var detected = CharsetDetector.DetectFromStream(source).Detected;

        // Rewind after detection so the reader below starts from the beginning.
        source.Position = 0;

        if (detected != null)
        {
            _logger.LogDebug("charset {CharSet} detected for {Path}", detected.EncodingName, fileInfo.Path);

            using (var reader = new StreamReader(source, detected.Encoding))
            {
                var decoded = await reader.ReadToEndAsync().ConfigureAwait(false);
                byte[] utf8Bytes = Encoding.UTF8.GetBytes(decoded);
                return new MemoryStream(utf8Bytes);
            }
        }
    }

    // External file without a detectable charset: fall back to the raw file.
    return AsyncFile.OpenRead(fileInfo.Path);
}
/// <summary>
/// Gets an <see cref="Encoding"/> from a given memory stream.
/// </summary>
/// <remarks>
/// Byte order marks are checked first. When none matches, the UTF-unknown library is consulted, and
/// <c>FallBackEncoding</c> is returned when that yields nothing, yields US-ASCII, or throws.
/// </remarks>
/// <param name="stream">The stream to get the encoding from.</param>
/// <returns>The detected <see cref="Encoding"/>.</returns>
public static Encoding FromStream(MemoryStream stream)
{
    byte[] bytes = GetEncodingComparisonBytes(stream);

    if (ByteMatch(bytes, Utf7Bom5) ||
        ByteMatch(bytes, Utf7Bom4) ||
        ByteMatch(bytes, Utf7Bom3) ||
        ByteMatch(bytes, Utf7Bom2) ||
        ByteMatch(bytes, Utf7Bom1))
    {
        #pragma warning disable 618
        #pragma warning disable SYSLIB0001 // Type or member is obsolete
        // the UTF7 encoding is required to access legacy files..
        return new UTF7Encoding(false);
        #pragma warning restore SYSLIB0001 // Type or member is obsolete
        #pragma warning restore 618
    }

    if (ByteMatch(bytes, Utf8Bom))
    {
        return new UTF8Encoding(true, true);
    }

    // The UTF-32 BOMs must be tested before the UTF-16 ones: the UTF-32 LE BOM (FF FE 00 00)
    // begins with the UTF-16 LE BOM (FF FE), so testing UTF-16 first would shadow UTF-32 LE.
    // NOTE(review): assumes ByteMatch compares a leading prefix - confirm against its definition.
    if (ByteMatch(bytes, Utf32BigEndianBom))
    {
        return new UTF32Encoding(true, true, true);
    }

    if (ByteMatch(bytes, Utf32LittleEndianBom))
    {
        return new UTF32Encoding(false, true, true);
    }

    if (ByteMatch(bytes, Utf16BigEndianBom))
    {
        return new UnicodeEncoding(true, true, true);
    }

    if (ByteMatch(bytes, Utf16LittleEndianBom))
    {
        return new UnicodeEncoding(false, true, true);
    }

    try // use the UTF-unknown (C: https://github.com/CharsetDetector/UTF-unknown) library..
    {
        // Start detection from the beginning of the stream.
        stream.Position = 0;

        var result = CharsetDetector.DetectFromStream(stream);
        var encoding = GetPrimaryFromCharsetDetector(result);

        if (encoding != null)
        {
            // US-ASCII seems to be the library default, so use the default instead..
            return encoding.CodePage == 20127 ? FallBackEncoding : encoding;
        }
    }
    catch (Exception ex)
    {
        // log the exception..
        ExceptionLogAction?.Invoke(ex);
    }

    return FallBackEncoding;
}