Пример #1
0
 /// <summary>
 /// Runs charset detection over the leading portion of a byte buffer.
 /// </summary>
 /// <param name="rawData">Buffer whose content is inspected.</param>
 /// <param name="sizeLimit">Maximum number of bytes to inspect; capped at the length of <paramref name="rawData"/>.</param>
 /// <returns>The value returned by <c>CharsetDetector.DetectFromStream</c> for the inspected window.</returns>
 public DetectionResult DetectUtfUnknown(byte[] rawData, int sizeLimit)
 {
     // Never look past the end of the buffer, even when the caller asks for more.
     int inspectedLength = Math.Min(sizeLimit, rawData.Length);

     using (var window = new MemoryStream(rawData, 0, inspectedLength))
     {
         var detection = CharsetDetector.DetectFromStream(window);
         return detection;
     }
 }
Пример #2
0
        /// <summary>
        /// Attempts to determine the text encoding of <paramref name="stream"/>.
        /// </summary>
        /// <param name="stream">Stream handed to the charset detector.</param>
        /// <param name="encoding">
        /// Set to the value returned by <c>AnalyzeAndGuessEncoding</c> when the detector reports an encoding;
        /// <c>null</c> otherwise.
        /// </param>
        /// <returns>
        /// <c>true</c> when the detector reported an encoding; <c>false</c> when it did not or when an
        /// exception was caught (exceptions are reported to analytics, not rethrown).
        /// </returns>
        public static bool TryGuessEncoding(Stream stream, out Encoding encoding)
        {
            encoding = null;

            try
            {
                var detection = CharsetDetector.DetectFromStream(stream);

                // Detected itself may be null, hence the null-conditional access.
                bool nothingDetected = detection.Detected?.Encoding == null;
                if (nothingDetected)
                {
                    // Empty input is not interesting enough to report.
                    if (stream.Length > 0)
                    {
                        Analytics.TrackEvent("UnableToDetectEncoding");
                    }

                    return false;
                }

                encoding = AnalyzeAndGuessEncoding(detection);
                return true;
            }
            catch (Exception ex)
            {
                var properties = new Dictionary<string, string>()
                {
                    { "Exception", ex.ToString() },
                    { "Message", ex.Message }
                };

                Analytics.TrackEvent("TryGuessEncodingFailedWithException", properties);
            }

            return false;
        }
Пример #3
0
        /// <inheritdoc />
        public async Task<string> GetSubtitleFileCharacterSet(string path, string language, MediaProtocol protocol, CancellationToken cancellationToken)
        {
            using (var subtitleStream = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false))
            {
                // Detected is null when the detector could not settle on a charset.
                var bestMatch = CharsetDetector.DetectFromStream(subtitleStream).Detected;
                var encodingName = bestMatch?.EncodingName;

                _logger.LogDebug("charset {0} detected for {Path}", encodingName ?? "null", path);

                return encodingName;
            }
        }
Пример #4
0
        /// <summary>
        /// Verifies the stream position left behind by <c>CharsetDetector.DetectFromStream</c>
        /// when it is called with a byte limit.
        /// </summary>
        /// <param name="maxBytes">Byte limit passed to the detector; <c>null</c> is passed through unchanged.</param>
        /// <param name="expectedPosition">Stream position expected after detection.</param>
        /// <param name="start">Stream position set before detection starts.</param>
        public void DetectFromStreamMaxBytes(int? maxBytes, int expectedPosition, int start = 0)
        {
            // Arrange
            var text = new string('a', 10000);

            // The stream is owned by this test, so release it deterministically
            // (the original never disposed it).
            using (var stream = AsciiToSteam(text))
            {
                stream.Position = start;

                // Act
                CharsetDetector.DetectFromStream(stream, maxBytes);

                // Assert
                Assert.AreEqual(expectedPosition, stream.Position);
            }
        }
Пример #5
0
 /// <summary>
 /// Determines the <see cref="System.Text.Encoding"/> of a file on disk.
 /// </summary>
 /// <param name="filePath">Path of the file to inspect.</param>
 /// <param name="maxBytesToRead">Upper bound on the bytes read from <paramref name="filePath"/>; <c>null</c> means no limit.</param>
 /// <returns>The detected encoding, or <c>null</c> when none is available or .NET does not support it.</returns>
 public static Encoding GetFileEncoding(string filePath, int? maxBytesToRead)
 {
     using (var input = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
     {
         // UTF-16 (little or big endian) is probed first.
         Encoding utf16 = Utf16Detector.DetectFromStream(input, maxBytesToRead);
         if (utf16 == null)
         {
             // Rewind, then fall back to UTF-unknown:
             // https://github.com/CharsetDetector/UTF-unknown
             input.Position = 0L;
             var detection = CharsetDetector.DetectFromStream(input, maxBytesToRead);
             return detection.Detected?.Encoding;
         }

         return utf16;
     }
 }
Пример #6
0
        /// <summary>
        /// Plain ASCII text must be reported as ASCII with full confidence.
        /// </summary>
        public void TestAscii()
        {
            const string text = "The Documentation of the libraries is not complete " +
                                "and your contributions would be greatly appreciated " +
                                "the documentation you want to contribute to and " +
                                "click on the [Edit] link to start writing";

            using (var input = AsciiToSteam(text))
            {
                var detection = CharsetDetector.DetectFromStream(input);

                Assert.AreEqual(Charsets.ASCII, detection.Detected.EncodingName);
                Assert.AreEqual(1.0f, detection.Detected.Confidence);
            }
        }
Пример #7
0
        /// <summary>
        /// Plain ASCII text must be reported as ASCII with full confidence.
        /// </summary>
        public void TestAscii()
        {
            string s =
                "The Documentation of the libraries is not complete " +
                "and your contributions would be greatly appreciated " +
                "the documentation you want to contribute to and " +
                "click on the [Edit] link to start writing";

            byte[] asciiBytes = Encoding.ASCII.GetBytes(s);

            using (var input = new MemoryStream(asciiBytes))
            {
                var detection = CharsetDetector.DetectFromStream(input);

                Assert.Equal(Charsets.ASCII, detection.Detected.EncodingName);
                Assert.Equal(1.0f, detection.Detected.Confidence);
            }
        }
Пример #8
0
        /// <inheritdoc />
        public async Task<string> GetSubtitleFileCharacterSet(string path, string language, MediaProtocol protocol, CancellationToken cancellationToken)
        {
            using (var stream = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false))
            {
                // Detected is null when no charset could be determined, so charset may be null here.
                var charset = CharsetDetector.DetectFromStream(stream).Detected?.EncodingName;

                // File extensions are matched case-insensitively so that e.g. "movie.SRT" is
                // treated the same as "movie.srt".
                bool isTextSubtitle = path.EndsWith(".ass", StringComparison.OrdinalIgnoreCase)
                    || path.EndsWith(".ssa", StringComparison.OrdinalIgnoreCase)
                    || path.EndsWith(".srt", StringComparison.OrdinalIgnoreCase);

                bool isUtf16 = string.Equals(charset, "utf-16le", StringComparison.OrdinalIgnoreCase)
                    || string.Equals(charset, "utf-16be", StringComparison.OrdinalIgnoreCase);

                // UTF16 is automatically converted to UTF8 by FFmpeg, do not specify a character encoding
                if (isTextSubtitle && isUtf16)
                {
                    charset = string.Empty;
                }

                _logger.LogDebug("charset {0} detected for {Path}", charset ?? "null", path);

                return charset;
            }
        }
Пример #9
0
        /// <summary>
        /// Opens a subtitle for reading, transcoding it to UTF-8 when charset detection is requested
        /// and succeeds.
        /// </summary>
        /// <param name="path">Location of the subtitle.</param>
        /// <param name="protocol">Protocol passed to <c>GetStream</c> to open <paramref name="path"/>.</param>
        /// <param name="requiresCharset">When <c>true</c>, the content is probed for its charset and re-encoded as UTF-8.</param>
        /// <param name="cancellationToken">Token passed to <c>GetStream</c>.</param>
        /// <returns>
        /// An in-memory UTF-8 stream when a charset was detected; otherwise the result of
        /// <c>File.OpenRead(path)</c>.
        /// </returns>
        private async Task<Stream> GetSubtitleStream(string path, MediaProtocol protocol, bool requiresCharset, CancellationToken cancellationToken)
        {
            if (requiresCharset)
            {
                using (var stream = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false))
                {
                    var result = CharsetDetector.DetectFromStream(stream).Detected;

                    if (result != null)
                    {
                        _logger.LogDebug("charset {CharSet} detected for {Path}", result.EncodingName, path);

                        // Detection has already consumed bytes from the stream. Rewind so the reader
                        // decodes from the first byte instead of from wherever the detector stopped.
                        // Guarded by CanSeek so a non-seekable source does not start throwing here.
                        if (stream.CanSeek)
                        {
                            stream.Position = 0;
                        }

                        using var reader = new StreamReader(stream, result.Encoding);
                        var text = await reader.ReadToEndAsync().ConfigureAwait(false);

                        return new MemoryStream(Encoding.UTF8.GetBytes(text));
                    }
                }
            }

            return File.OpenRead(path);
        }
Пример #10
0
        /// <summary>
        /// Opens the subtitle described by <paramref name="fileInfo"/>, re-encoding external
        /// subtitles as UTF-8 when their charset can be detected.
        /// </summary>
        /// <param name="fileInfo">Describes the subtitle (path, protocol, and whether it is external).</param>
        /// <param name="cancellationToken">Token passed to <c>GetStream</c>.</param>
        /// <returns>
        /// An in-memory UTF-8 stream when the subtitle is external and a charset was detected;
        /// otherwise the result of <c>AsyncFile.OpenRead</c> on the subtitle path.
        /// </returns>
        private async Task<Stream> GetSubtitleStream(SubtitleInfo fileInfo, CancellationToken cancellationToken)
        {
            if (fileInfo.IsExternal)
            {
                using var source = await GetStream(fileInfo.Path, fileInfo.Protocol, cancellationToken).ConfigureAwait(false);

                var detected = CharsetDetector.DetectFromStream(source).Detected;

                // Detection consumed part of the stream; start over before decoding.
                source.Position = 0;

                if (detected != null)
                {
                    _logger.LogDebug("charset {CharSet} detected for {Path}", detected.EncodingName, fileInfo.Path);

                    using var decoder = new StreamReader(source, detected.Encoding);
                    var content = await decoder.ReadToEndAsync().ConfigureAwait(false);
                    byte[] utf8Bytes = Encoding.UTF8.GetBytes(content);

                    return new MemoryStream(utf8Bytes);
                }
            }

            return AsyncFile.OpenRead(fileInfo.Path);
        }
Пример #11
0
        /// <summary>
        /// Gets an <see cref="Encoding"/> from a given memory stream.
        /// </summary>
        /// <remarks>
        /// Byte order marks are checked first. If none matches, the UTF-unknown library is consulted,
        /// and <c>FallBackEncoding</c> is returned when that yields nothing usable or throws.
        /// </remarks>
        /// <param name="stream">The stream to get the encoding from.</param>
        /// <returns>The detected <see cref="Encoding"/>.</returns>
        public static Encoding FromStream(MemoryStream stream)
        {
            byte[] bytes = GetEncodingComparisonBytes(stream);

            if (ByteMatch(bytes, Utf7Bom5) ||
                ByteMatch(bytes, Utf7Bom4) ||
                ByteMatch(bytes, Utf7Bom3) ||
                ByteMatch(bytes, Utf7Bom2) ||
                ByteMatch(bytes, Utf7Bom1))
            {
#pragma warning disable 618
#pragma warning disable SYSLIB0001 // Type or member is obsolete
                // the UTF7 encoding is required to access legacy files..
                return new UTF7Encoding(false);

#pragma warning restore SYSLIB0001 // Type or member is obsolete
#pragma warning restore 618
            }

            if (ByteMatch(bytes, Utf8Bom))
            {
                return new UTF8Encoding(true, true);
            }

            // UTF-32 must be tested before UTF-16: the standard UTF-32 LE signature (FF FE 00 00)
            // begins with the UTF-16 LE signature (FF FE), so testing UTF-16 first would shadow it.
            // NOTE(review): assumes the Utf*Bom constants hold the standard signatures and that
            // ByteMatch compares leading bytes - confirm against their definitions.
            if (ByteMatch(bytes, Utf32BigEndianBom))
            {
                return new UTF32Encoding(true, true, true);
            }

            if (ByteMatch(bytes, Utf32LittleEndianBom))
            {
                return new UTF32Encoding(false, true, true);
            }

            if (ByteMatch(bytes, Utf16BigEndianBom))
            {
                return new UnicodeEncoding(true, true, true);
            }

            if (ByteMatch(bytes, Utf16LittleEndianBom))
            {
                return new UnicodeEncoding(false, true, true);
            }

            try // use the UTF-unknown (see: https://github.com/CharsetDetector/UTF-unknown) library..
            {
                // rewind so the detector sees the content from the first byte..
                stream.Position = 0;
                var result = CharsetDetector.DetectFromStream(stream);
                var encoding = GetPrimaryFromCharsetDetector(result);
                if (encoding != null)
                {
                    // US-ASCII (code page 20127) seems to be the library default, so use the default instead..
                    return encoding.CodePage == 20127 ? FallBackEncoding : encoding;
                }
            }
            catch (Exception ex)
            {
                // log the exception and fall through to the fall-back encoding..
                ExceptionLogAction?.Invoke(ex);
            }

            return FallBackEncoding;
        }