CharsetDetector C# (CSharp) Code Examples

Example #1

0

Show file

        public void TestC1Bytes()
        {
            // Two English samples: one plain, one that additionally carries curly
            // quotes, which the error messages below refer to as "C1 bytes".
            String plainText  = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
            String textWithC1 = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \u201CC1\u201D bytes.";

            byte[] isoBytes     = ILOG.J2CsMapping.Util.StringUtil.GetBytes(plainText, "ISO-8859-1");
            byte[] windowsBytes = ILOG.J2CsMapping.Util.StringUtil.GetBytes(textWithC1, "windows-1252");

            CharsetDetector detector = new CharsetDetector();

            // The sample with C1 bytes must be reported as windows-1252.
            detector.SetText(windowsBytes);
            CharsetMatch windowsMatch = detector.Detect();
            if (windowsMatch.GetName() != "windows-1252")
            {
                Errln("Text with C1 bytes not correctly detected as windows-1252.");
                return;
            }

            // The plain sample must be reported as ISO-8859-1.
            detector.SetText(isoBytes);
            CharsetMatch isoMatch = detector.Detect();
            if (isoMatch.GetName() != "ISO-8859-1")
            {
                Errln("Text without C1 bytes not correctly detected as ISO-8859-1.");
            }
        }

Example #2

0

Show file

        public void TestShortInput()
        {
            // Smoke test: detection on very short byte strings must not crash.
            // The shortest input that should produce a positive detection result
            // is two bytes, a UTF-16 BOM.
            // TODO: Detector confidence levels need to be refined for very short
            // input. They are too high now for some charsets that happen to be
            // compatible with a few bytes of input.
            byte[][] samples =
            {
                new byte[] {},
                new byte[] { (byte)0x0a },
                new byte[] { (byte)'A', (byte)'B' },
                new byte[] { (byte)'A', (byte)'B', (byte)'C' },
                new byte[] { (byte)'A', (byte)'B', (byte)'C', (byte)'D' }
            };

            CharsetDetector detector = new CharsetDetector();

            foreach (byte[] sample in samples)
            {
                detector.SetText(sample);

                // Only the absence of an exception matters; the match is not inspected.
                CharsetMatch ignoredMatch = detector.Detect();
            }
        }

Example #3

0

Show file

        public void TestInputFilter()
        {
            // English words wrapped in markup around a short French sentence.
            String sample = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>";

            byte[]          encoded  = ILOG.J2CsMapping.Util.StringUtil.GetBytes(sample, "ISO-8859-1");
            CharsetDetector detector = new CharsetDetector();

            // Filter on: the expected language is French.
            detector.EnableInputFilter(true);
            if (!detector.InputFilterEnabled())
            {
                Errln("input filter should be enabled");
            }

            detector.SetText(encoded);
            CharsetMatch filteredMatch = detector.Detect();
            if (!filteredMatch.GetLanguage().Equals("fr"))
            {
                Errln("input filter did not strip markup!");
            }

            // Filter off: the same bytes are expected to be reported as English.
            detector.EnableInputFilter(false);
            detector.SetText(encoded);
            CharsetMatch unfilteredMatch = detector.Detect();
            if (!unfilteredMatch.GetLanguage().Equals("en"))
            {
                Errln("unfiltered input did not detect as English!");
            }
        }

Example #4

0

Show file

File: SongInfoLoader.cs Project: noeticwxb/BMP-U

    /// <summary>
    /// Reads the whole file as text, decoding it with the charset reported by
    /// the detector and falling back to <c>CurrentEncoding</c> when detection
    /// yields nothing or the reported charset cannot be resolved.
    /// </summary>
    /// <param name="fileInfo">The file to read.</param>
    /// <returns>The decoded file contents.</returns>
    public static string LoadFile(FileInfo fileInfo)
    {
        Encoding chosenEncoding = CurrentEncoding;

        using (Stream input = fileInfo.OpenRead())
        {
            CharsetDetector charsetDetector = new CharsetDetector();
            charsetDetector.Feed(input);
            charsetDetector.DataEnd();

            // Feeding consumed the stream; rewind so the reader starts at the beginning.
            input.Position = 0;

            if (charsetDetector.Charset == null)
            {
                Debug.LogFormat("Failed to detect charset, will use default encoding.");
            }
            else
            {
                Debug.LogFormat("Detected charset of file: {0}", charsetDetector.Charset);
                try
                {
                    chosenEncoding = Encoding.GetEncoding(charsetDetector.Charset);
                }
                catch
                {
                    // The runtime does not know the reported charset: keep the default.
                    Debug.LogWarning("Failed to load encoding, will use default encoding.");
                    chosenEncoding = CurrentEncoding;
                }
            }

            using (StreamReader reader = new StreamReader(input, chosenEncoding))
            {
                return reader.ReadToEnd();
            }
        }
    }

Example #5

0

Show file

File: FileSystemUtility.cs Project: esbiete/Notepads

        /// <summary>
        /// Attempts to determine the encoding of <paramref name="stream"/>.
        /// </summary>
        /// <param name="stream">The stream to analyze.</param>
        /// <param name="encoding">
        /// The encoding chosen by <c>AnalyzeAndGuessEncoding</c> on success; otherwise null.
        /// </param>
        /// <returns>True when an encoding was determined; false otherwise.</returns>
        public static bool TryGuessEncoding(Stream stream, out Encoding encoding)
        {
            encoding = null;

            try
            {
                var detection = CharsetDetector.DetectFromStream(stream);

                // Detected can be null
                bool hasEncoding = detection.Detected?.Encoding != null;
                if (hasEncoding)
                {
                    encoding = AnalyzeAndGuessEncoding(detection);
                    return true;
                }

                // We do not care about empty files
                if (stream.Length > 0)
                {
                    Analytics.TrackEvent("UnableToDetectEncoding");
                }
            }
            catch (Exception ex)
            {
                var properties = new Dictionary <string, string>()
                {
                    { "Exception", ex.ToString() },
                    { "Message", ex.Message }
                };
                Analytics.TrackEvent("TryGuessEncodingFailedWithException", properties);
            }

            return false;
        }

Example #6

0

Show file

File: EncodingTests.cs Project: askazakov/csharpier

        public void RunTest(string fileName)
        {
            // Formats EncodingTests/<fileName>.cst, writes the result to
            // <fileName>.actual.cst using the encoding detected for the input, and
            // checks that the written file is detected with that same encoding.
            var filePath = Path.Combine(
                this.rootDirectory.FullName,
                "EncodingTests",
                fileName + ".cst"
                );

            string code;
            using (var reader = new StreamReader(filePath, Encoding.UTF8, true))
            {
                code = reader.ReadToEnd();
            }

            var detectionResult = CharsetDetector.DetectFromFile(filePath);
            var encoding        = detectionResult.Detected.Encoding;

            var formatter = new CodeFormatter();
            var result    = formatter.Format(code, new Options());

            var actualFilePath = filePath.Replace(".cst", ".actual.cst");

            // Close the writer before detecting, so the output is flushed to disk
            // and the file is no longer held open by this method.
            using (var stream = File.Open(actualFilePath, FileMode.Create))
            using (var writer = new StreamWriter(stream, encoding))
            {
                writer.Write(result.Code);
            }

            // Detect on the file that was just written. Detecting on the input
            // again would compare the input with itself and always pass.
            var actualDetectionResult = CharsetDetector.DetectFromFile(actualFilePath);
            var actualEncoding        = actualDetectionResult.Detected.Encoding;

            encoding.Should().Be(actualEncoding);
        }

Example #7

0

Show file

File: SubtitleEncoder.cs Project: wangjiataoG/jellyfin

        /// <summary>
        /// Opens the subtitle at <paramref name="path"/>. When
        /// <paramref name="requiresCharset"/> is set and a charset is detected, the
        /// content is decoded with it and returned re-encoded as UTF-8 in memory;
        /// in every other case the file is opened directly.
        /// </summary>
        private async Task <Stream> GetSubtitleStream(string path, string language, MediaProtocol protocol, bool requiresCharset, CancellationToken cancellationToken)
        {
            if (!requiresCharset)
            {
                return File.OpenRead(path);
            }

            var bytes = await GetBytes(path, protocol, cancellationToken).ConfigureAwait(false);

            var detection = CharsetDetector.DetectFromBytes(bytes);
            var charset   = detection.Detected?.EncodingName;
            _logger.LogDebug("charset {CharSet} detected for {Path}", charset ?? "null", path);

            if (string.IsNullOrEmpty(charset))
            {
                return File.OpenRead(path);
            }

            // Make sure we have all the code pages we can get
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

            using (var inputStream = new MemoryStream(bytes))
            using (var reader = new StreamReader(inputStream, Encoding.GetEncoding(charset)))
            {
                var text = await reader.ReadToEndAsync().ConfigureAwait(false);

                return new MemoryStream(Encoding.UTF8.GetBytes(text));
            }
        }

Example #8

0

Show file

        /// <summary>
        ///     Command line example: prints the charset detected for the file named
        ///     by the first argument, or a failure message when nothing is detected.
        /// </summary>
        /// <param name="args">a filename</param>
        public static void Main(string[] args)
        {
            if (args.Length == 0)
            {
                Console.WriteLine("Usage: udetect <filename>");
                return;
            }

            var filename = args[0];

            using (var input = File.OpenRead(filename))
            {
                ICharsetDetector detector = new CharsetDetector();
                detector.Feed(input);
                detector.DataEnd();

                Console.WriteLine();

                // Green for a successful detection, red for a failure.
                var detected = detector.Charset != null;
                Console.ForegroundColor = detected ? ConsoleColor.Green : ConsoleColor.Red;
                if (detected)
                {
                    Console.WriteLine("Charset: {0}, confidence: {1}", detector.Charset, detector.Confidence);
                }
                else
                {
                    Console.WriteLine("Detection failed.");
                }

                Console.ResetColor();
            }

            Exit();
        }

Example #9

0

Show file

File: EncodingChecker.cs Project: NikolayHD/DirectoryFilesValidation

        /// <summary>
        /// Rewrites <paramref name="fileName"/> as UTF-8, decoding it with the
        /// charset reported by the detector.
        /// </summary>
        /// <param name="fileName">Path of the file to convert in place.</param>
        /// <returns>
        /// False when <c>isAscii</c> rejects the detection result (the file is left
        /// untouched); true once the file has been rewritten.
        /// </returns>
        private bool convertToUtf8(string fileName)
        {
            string detectedCharset;

            // Detection happens under the shared lock; the rewrite below does not.
            lock (SyncRoot)
            {
                using (FileStream input = File.OpenRead(fileName))
                {
                    var detector = new CharsetDetector();
                    detector.Feed(input);
                    detector.DataEnd();

                    detectedCharset = detector.Charset;

                    if (!isAscii(detector))
                    {
                        return false;
                    }
                }
            }

            var sourceEncoding = Encoding.GetEncoding(detectedCharset);
            var contents       = File.ReadAllText(fileName, sourceEncoding);

            File.WriteAllText(fileName, contents, Encoding.UTF8);
            return true;
        }

Example #10

0

Show file

        /// <summary>
        /// Runs the charset detector over the file at <paramref name="path"/>.
        /// </summary>
        /// <param name="path">Path of the file to inspect.</param>
        /// <returns>
        /// The charset name reported by the detector, or null when an
        /// <see cref="IOException"/> occurs while reading the file.
        /// </returns>
        private string DetectCharset(string path)
        {
            try
            {
                // Detection only reads the file, so ask for read-only access. The
                // two-argument FileStream(path, FileMode.Open) constructor requests
                // read/write access and therefore fails on read-only files.
                using (var file = new FileStream(path, FileMode.Open, FileAccess.Read))
                {
                    var detector = new CharsetDetector();
                    detector.Feed(file);
                    detector.DataEnd();

                    var charset = detector.Charset;

                    if (!string.IsNullOrWhiteSpace(charset))
                    {
                        _logger.Info("UniversalDetector detected charset {0} for {1}", charset, path);
                    }

                    return charset;
                }
            }
            catch (IOException ex)
            {
                _logger.ErrorException("Error attempting to determine subtitle charset from {0}", ex, path);
            }

            return null;
        }

Example #11

0

Show file

File: EncodingChecker.cs Project: NikolayHD/DirectoryFilesValidation

        /// <summary>
        /// Runs the charset detector over <paramref name="fileName"/> and returns
        /// the verdict of <c>EncodingChecker.isAscii</c> for it. An
        /// <see cref="IOException"/> triggers a retry after one second, for at most
        /// ten attempts.
        /// </summary>
        /// <param name="fileName">Path of the file to inspect.</param>
        /// <returns>The detector-based verdict, or false when every attempt failed.</returns>
        private bool isAscii(string fileName)
        {
            const int maxAttempts = 10;

            int attempt = 0;
            while (attempt < maxAttempts)
            {
                try
                {
                    lock (SyncRoot)
                    {
                        using (FileStream input = File.OpenRead(fileName))
                        {
                            var detector = new CharsetDetector();
                            detector.Feed(input);
                            detector.DataEnd();

                            return EncodingChecker.isAscii(detector);
                        }
                    }
                }
                catch (IOException)
                {
                    // Wait before trying again.
                    Thread.Sleep(1000);
                }

                attempt++;
            }

            return false;
        }

Example #12

0

Show file

File: WebSocketConnection.cs Project: jrags56/MediaBrowser

        /// <summary>
        /// Runs the charset detector over an in-memory message.
        /// </summary>
        /// <param name="bytes">The raw message bytes.</param>
        /// <returns>
        /// The charset name reported by the detector, or null when an
        /// <see cref="IOException"/> occurs.
        /// </returns>
        private string DetectCharset(byte[] bytes)
        {
            try
            {
                using (var buffer = new MemoryStream(bytes))
                {
                    var charsetDetector = new CharsetDetector();
                    charsetDetector.Feed(buffer);
                    charsetDetector.DataEnd();

                    // A successful detection is not logged on this path.
                    return charsetDetector.Charset;
                }
            }
            catch (IOException ex)
            {
                _logger.ErrorException("Error attempting to determine web socket message charset", ex);
                return null;
            }
        }

Example #13

0

Show file

        /// <summary>
        /// Reads the source files in consecutive groups of <paramref name="mergeOption"/>
        /// files and concatenates each group into one text, every file followed by a
        /// separator line. All files are decoded with the charset guessed from the
        /// first file.
        /// </summary>
        /// <param name="sourceFiles">Paths of the files to merge, in order.</param>
        /// <param name="mergeOption">Number of files per merged group; must be at least 1.</param>
        /// <param name="mergedFileNames">
        /// One output name per group: the name of the group's first file, followed by
        /// " - " and the name of its last file when the group holds more than one file.
        /// </param>
        /// <returns>The merged text of each group, in the same order as <paramref name="mergedFileNames"/>.</returns>
        /// <exception cref="System.ArgumentOutOfRangeException">
        /// <paramref name="mergeOption"/> is less than 1.
        /// </exception>
        private string[] mergeSourceFiles(string[] sourceFiles, int mergeOption, out string[] mergedFileNames)
        {
            // A step below 1 would never advance past the end of the file list and
            // would keep appending empty entries until memory runs out.
            if (mergeOption < 1)
            {
                throw new System.ArgumentOutOfRangeException("mergeOption", mergeOption, "mergeOption must be at least 1.");
            }

            string        charsetName            = CharsetDetector.GuessCharsetOfFile(sourceFiles[0]);
            Encoding      sourceEncoding         = Encoding.GetEncoding(charsetName); // resolved once for all files
            List <string> mergedContents         = new List <string>();
            List <string> mergedNames            = new List <string>();
            StringBuilder contentBuilder         = new StringBuilder();
            StringBuilder nameBuilder            = new StringBuilder();
            bool          changeFileName         = changeFileNameCheckBox.Checked;
            int           fileCount              = sourceFiles.Length;
            bool          needMarkChapterHeaders = outputTypeComboBox.SelectedIndex == 2 || outputTypeComboBox.SelectedIndex == 4 || outputTypeComboBox.SelectedIndex == 6;

            for (int groupStart = 0; groupStart < fileCount; groupStart += mergeOption)
            {
                contentBuilder.Length = 0;
                nameBuilder.Length    = 0;

                for (int offset = 0; offset < mergeOption && groupStart + offset < fileCount; offset++)
                {
                    int fileIndex = groupStart + offset;

                    contentBuilder.Append(Util.NormalizeTextAndRemoveIgnoredChinesePhrases(readFile(sourceFiles[fileIndex], sourceEncoding, needMarkChapterHeaders))).Append("\n\n----------oOo----------\n\n");

                    if (offset == 0)
                    {
                        // First file of the group starts the output name.
                        nameBuilder.Append(getOutputFileName(sourceFiles[fileIndex], fileIndex, fileCount, changeFileName));
                    }
                    else if (mergeOption != 1 && (offset == mergeOption - 1 || fileIndex == fileCount - 1))
                    {
                        // Last file of the group (or of the whole list) ends it.
                        nameBuilder.Append(" - ").Append(getOutputFileName(sourceFiles[fileIndex], fileIndex, fileCount, changeFileName));
                    }
                }

                mergedContents.Add(contentBuilder.ToString());
                mergedNames.Add(nameBuilder.ToString());
            }

            mergedFileNames = mergedNames.ToArray();
            return mergedContents.ToArray();
        }

Example #14

0

Show file

File: ContentComparerHelper.cs Project: PonomarevDmitry/CrmDeveloperHelper

        /// <summary>
        /// Detects the encoding of a byte array.
        /// </summary>
        /// <param name="arrayByte">The raw bytes to analyze.</param>
        /// <returns>
        /// A list holding the detected encoding when a charset is reported with
        /// confidence above 0.8 and can be resolved; otherwise an empty list.
        /// </returns>
        public static List <Encoding> GetFileEncoding(byte[] arrayByte)
        {
            List <Encoding> encodings = new List <Encoding>();

            CharsetDetector charsetDetector = new CharsetDetector();
            charsetDetector.Feed(arrayByte, 0, arrayByte.Length);
            charsetDetector.DataEnd();

            bool confident = !string.IsNullOrEmpty(charsetDetector.Charset) && charsetDetector.Confidence > 0.8f;
            if (!confident)
            {
                return encodings;
            }

            try
            {
                encodings.Add(Encoding.GetEncoding(charsetDetector.Charset));
            }
            catch (Exception ex)
            {
                DTEHelper.WriteExceptionToOutput(null, ex);

#if DEBUG
                if (System.Diagnostics.Debugger.IsAttached)
                {
                    System.Diagnostics.Debugger.Break();
                }
#endif
            }

            return encodings;
        }

Example #15

0

Show file

File: FileDisplayHandler.cs Project: jiangzm/Bonobo-Git-Server

        /// <summary>
        /// Detects the encoding of <paramref name="data"/>.
        /// </summary>
        /// <param name="data">The raw bytes to analyze.</param>
        /// <returns>
        /// Null when no charset is reported or the confidence is not above 0.5;
        /// the matching encoding otherwise, or <see cref="Encoding.Default"/> when
        /// the reported charset name cannot be resolved.
        /// </returns>
        public static Encoding GetEncoding(byte[] data)
        {
            ICharsetDetector detector = new CharsetDetector();
            detector.Feed(data, 0, data.Length);
            detector.DataEnd();

            if (detector.Charset == null || !(detector.Confidence > 0.5))
            {
                return null;
            }

            // The detector's "big-5" name is looked up as "big5".
            if (detector.Charset.ToLowerInvariant() == "big-5")
            {
                return Encoding.GetEncoding("big5");
            }

            try
            {
                return Encoding.GetEncoding(detector.Charset);
            }
            catch
            {
                return Encoding.Default;
            }
        }

Example #16

0

Show file

File: YtXmlParser.cs Project: Cryental/Kayla.NET

        /// <summary>
        /// Parses an XML subtitle file whose <c>text</c> elements carry
        /// <c>start</c> and <c>dur</c> attributes (in seconds).
        /// </summary>
        /// <param name="path">Path of the XML file.</param>
        /// <param name="result">
        /// The parsed items after duplicate removal on success; null otherwise.
        /// </param>
        /// <returns>
        /// True when at least one item was parsed; false when there are none or
        /// any node fails to parse.
        /// </returns>
        public bool ParseFormat(string path, out List <SubtitleItem> result)
        {
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

            var detect   = CharsetDetector.DetectFromFile(path);
            var encoding = Encoding.GetEncoding(detect.Detected.EncodingName);

            var items  = new List <SubtitleItem>();
            var xmlDoc = new XmlDocument();

            // Dispose the reader (and with it the underlying file stream) once the
            // document is loaded, so the file handle is not leaked.
            // NOTE(review): the document is loaded from the raw base stream, so the
            // detected encoding does not appear to influence parsing; confirm
            // whether loading through the reader was intended.
            using (var reader = new StreamReader(path, encoding))
            {
                var xmlStream = reader.BaseStream;

                // rewind the stream
                xmlStream.Position = 0;

                // parse xml stream
                xmlDoc.Load(xmlStream);
            }

            if (xmlDoc.DocumentElement != null)
            {
                var nodeList = xmlDoc.DocumentElement.SelectNodes("//text");

                if (nodeList != null)
                {
                    for (var i = 0; i < nodeList.Count; i++)
                    {
                        var node = nodeList[i];
                        try
                        {
                            var startString = node.Attributes["start"].Value;
                            var start       = float.Parse(startString, CultureInfo.InvariantCulture);
                            var durString   = node.Attributes["dur"].Value;
                            var duration    = float.Parse(durString, CultureInfo.InvariantCulture);
                            var text        = node.InnerText;

                            // Times are stored in milliseconds.
                            items.Add(new SubtitleItem
                            {
                                StartTime = (int)(start * 1000),
                                EndTime   = (int)((start + duration) * 1000),
                                Text      = ConvertString(text)
                            });
                        }
                        catch
                        {
                            // A single malformed node invalidates the whole file.
                            result = null;
                            return false;
                        }
                    }
                }
            }

            if (items.Any())
            {
                result = Filters.RemoveDuplicateItems(items);
                return true;
            }

            result = null;
            return false;
        }

Example #17

0

Show file

File: SubtitleEncoder.cs Project: mcanthony/Emby

        /// <summary>
        /// Runs the charset detector over the stream obtained for <paramref name="path"/>.
        /// </summary>
        /// <returns>
        /// The charset name reported by the detector, or null when an
        /// <see cref="IOException"/> occurs.
        /// </returns>
        private async Task <string> DetectCharset(string path, MediaProtocol protocol, CancellationToken cancellationToken)
        {
            try
            {
                using (var input = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false))
                {
                    var charsetDetector = new CharsetDetector();
                    charsetDetector.Feed(input);
                    charsetDetector.DataEnd();

                    var detected = charsetDetector.Charset;

                    var hasCharset = !string.IsNullOrWhiteSpace(detected);
                    if (hasCharset)
                    {
                        _logger.Info("UniversalDetector detected charset {0} for {1}", detected, path);
                    }

                    return detected;
                }
            }
            catch (IOException ex)
            {
                _logger.ErrorException("Error attempting to determine subtitle charset from {0}", ex, path);
                return null;
            }
        }

Example #18

0

Show file

        /// <summary>
        /// Command line example: detects the encoding of the given file and prints
        /// the outcome to the console.
        /// </summary>
        /// <param name="filename">a filename</param>
        public static void DetectDemo(string filename)
        {
            // Detect from the file and take the best detection; it may be null.
            DetectionDetail best = CharsetDetector.DetectFromFile(filename).Detected;

            if (best == null)
            {
                Console.WriteLine($"Detection failed: {filename}");
                return;
            }

            // Alias of the found encoding
            string encodingName = best.EncodingName;
            // System.Text.Encoding of the found encoding (can be null if not available)
            Encoding encoding = best.Encoding;
            // Confidence of the found encoding (between 0 and 1)
            float confidence = best.Confidence;

            Console.WriteLine($"Detection completed: {filename}");

            string details = encoding != null
                ? $"EncodingWebName: {encoding.WebName}{Environment.NewLine}Confidence: {confidence}"
                : $"(Encoding is null){Environment.NewLine}EncodingName: {encodingName}{Environment.NewLine}Confidence: {confidence}";
            Console.WriteLine(details);
        }

Example #19

0

Show file

File: Udetect.cs Project: Gpower2/AcTools

        /// <summary>
        /// Command line example: prints the charset and confidence detected for
        /// the file named by the first argument.
        /// </summary>
        /// <param name="args">a filename</param>
        public static void Main(String[] args)
        {
            if (args.Length == 0)
            {
                Console.WriteLine("Usage: udetect <filename>");
                return;
            }

            string filename = args[0];

            using (FileStream input = File.OpenRead(filename))
            {
                ICharsetDetector detector = new CharsetDetector();
                detector.Feed(input);
                detector.DataEnd();

                if (detector.Charset == null)
                {
                    Console.WriteLine("Detection failed.");
                }
                else
                {
                    Console.WriteLine("Charset: {0}, confidence: {1}", detector.Charset, detector.Confidence);
                }
            }
        }

Example #20

0

Show file

 /// <summary>
 /// Runs charset detection over at most <paramref name="sizeLimit"/> leading
 /// bytes of <paramref name="rawData"/>.
 /// </summary>
 /// <param name="rawData">The bytes to analyze.</param>
 /// <param name="sizeLimit">Upper bound on the number of bytes handed to the detector.</param>
 /// <returns>The detection result for the inspected prefix.</returns>
 public DetectionResult DetectUtfUnknown(byte[] rawData, int sizeLimit)
 {
     int inspectedLength = Math.Min(sizeLimit, rawData.Length);

     using (MemoryStream prefix = new MemoryStream(rawData, 0, inspectedLength))
     {
         DetectionResult detection = CharsetDetector.DetectFromStream(prefix);
         return detection;
     }
 }

Example #21

0

Show file

        /// <summary>
        /// Attempts to detect the encoding of <paramref name="data"/>.
        /// </summary>
        /// <param name="data">The raw bytes to analyze.</param>
        /// <param name="encoding">
        /// The detected encoding on success; <see cref="Encoding.Default"/> on failure.
        /// </param>
        /// <returns>
        /// True when a charset was reported and resolved to an encoding; false otherwise.
        /// </returns>
        public static bool TryGetEncoding(byte[] data, out Encoding encoding)
        {
            ICharsetDetector detector = new CharsetDetector();
            detector.Feed(data, 0, data.Length);
            detector.DataEnd();

            if (detector.Charset == null)
            {
                encoding = Encoding.Default;
                return false;
            }

            // The detector's "big-5" name is looked up as "big5".
            if (detector.Charset.ToLowerInvariant() == "big-5")
            {
                encoding = Encoding.GetEncoding("big5");
                return true;
            }

            try
            {
                encoding = Encoding.GetEncoding(detector.Charset);
                return true;
            }
            catch
            {
                encoding = Encoding.Default;
                return false;
            }
        }

Example #22

0

Show file

File: CharsetDetectorTest.cs Project: Bond-009/UTF-unknown

        public void TestBomUTF32_LE()
        {
            // FF FE 00 00 is the UTF-32 LE byte order mark; 68 00 00 00 is 'h'.
            var input    = new byte[] { 0xFF, 0xFE, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00 };
            var detected = CharsetDetector.DetectFromBytes(input).Detected;

            Assert.AreEqual(Charsets.UTF32_LE, detected.EncodingName);
            Assert.AreEqual(1.0f, detected.Confidence);
        }

Example #23

0

Show file

File: CharsetDetectorTest.cs Project: Bond-009/UTF-unknown

        public void TestBomUtf8()
        {
            // EF BB BF is the UTF-8 byte order mark; the remaining bytes spell "hello!".
            var input    = new byte[] { 0xEF, 0xBB, 0xBF, 0x68, 0x65, 0x6C, 0x6C, 0x6F, 0x21 };
            var detected = CharsetDetector.DetectFromBytes(input).Detected;

            Assert.AreEqual("UTF-8", detected.EncodingName);
            Assert.AreEqual(1.0f, detected.Confidence);
        }

Example #24

0

Show file

File: CharsetDetectorTest.cs Project: Bond-009/UTF-unknown

        public void TestSingleChar()
        {
            // A single ASCII digit must be reported as ASCII with full confidence.
            byte[] input    = Encoding.UTF8.GetBytes("3");
            var    detected = CharsetDetector.DetectFromBytes(input).Detected;

            Assert.AreEqual(Charsets.ASCII, detected.EncodingName);
            Assert.AreEqual(1, detected.Confidence);
        }

Example #25

0

Show file

        public void TestIssue3()
        {
            // A single ASCII digit must be reported as ASCII with full confidence.
            byte[] input    = Encoding.UTF8.GetBytes("3");
            var    detected = CharsetDetector.DetectFromBytes(input).Detected;

            Assert.AreEqual(CodepageName.ASCII, detected.EncodingName);
            Assert.AreEqual(1.0f, detected.Confidence);
        }

Example #26

0

Show file

        public void TestCaseBomUtf7(byte[] bufferBytes)
        {
            // Every supplied buffer is expected to be reported as UTF-7 with full confidence.
            var detection = CharsetDetector.DetectFromBytes(bufferBytes);
            var best      = detection.Detected;

            Assert.AreEqual(CodepageName.UTF7, best.EncodingName);
            Assert.AreEqual(1.0f, best.Confidence);
        }

Example #27

0

Show file

        /// <summary>
        /// Runs the charset detector over <paramref name="stream"/> and maps the
        /// reported charset name to an <see cref="Encoding"/>.
        /// </summary>
        /// <param name="stream">The stream to analyze.</param>
        /// <returns>
        /// The result of passing the charset name through the <c>Return</c> helper
        /// with <c>Encoding.GetEncoding</c> and a null fallback.
        /// NOTE(review): presumably null when no charset was detected; confirm
        /// against the definition of <c>Return</c>.
        /// </returns>
        public Encoding Detect(Stream stream)
        {
            var charsetDetector = new CharsetDetector();
            charsetDetector.Feed(stream);
            charsetDetector.DataEnd();

            var charsetName = charsetDetector.Charset;
            return charsetName.Return(Encoding.GetEncoding, null);
        }

Example #28

0

Show file

        public void Test2byteArrayBomUTF16_LE()
        {
            // The input consists of nothing but the two-byte UTF-16 LE byte order mark.
            var input    = new byte[] { 0xFF, 0xFE };
            var detected = CharsetDetector.DetectFromBytes(input).Detected;

            Assert.AreEqual(CodepageName.UTF16_LE, detected.EncodingName);
            Assert.AreEqual(1.0f, detected.Confidence);
        }

Example #29

0

Show file

File: CharsetDetectorTest.cs Project: Bond-009/UTF-unknown

        public void TestOutOfRange2()
        {
            // A run of ASCII digits must be reported as ASCII with full confidence.
            byte[] input    = Encoding.UTF8.GetBytes("1234567890");
            var    detected = CharsetDetector.DetectFromBytes(input).Detected;

            Assert.AreEqual(Charsets.ASCII, detected.EncodingName);
            Assert.AreEqual(1.0f, detected.Confidence);
        }

Example #30

0

Show file

        /// <summary>
        /// Loads up to 5 MB of the file on a background task, detects its encoding
        /// (falling back to <c>Encoding.Default</c>), and publishes the resulting
        /// document through the dispatcher. Work is abandoned as soon as
        /// <c>_disposed</c> is observed.
        /// </summary>
        /// <param name="path">Path of the file to load.</param>
        private void LoadFileAsync(string path)
        {
            Task.Run(() =>
            {
                const int maxLength = 5 * 1024 * 1024;
                byte[] bufferCopy;
                bool tooLong;

                // The using block releases the buffer on every exit path,
                // including the early returns below.
                using (var buffer = new MemoryStream())
                {
                    using (var s = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                    {
                        tooLong = s.Length > maxLength;

                        // One scratch array reused across all reads.
                        var chunk = new byte[8192];
                        while (s.Position < s.Length && buffer.Length < maxLength)
                        {
                            if (_disposed)
                            {
                                break;
                            }

                            // Never request more than what is left of the cap, so a
                            // short read cannot push the buffer past maxLength.
                            var wanted = (int)Math.Min(chunk.Length, maxLength - buffer.Length);
                            var count  = s.Read(chunk, 0, wanted);
                            if (count <= 0)
                            {
                                // The file is opened with FileShare.ReadWrite and may
                                // change underneath us; a zero-byte read means no more data.
                                break;
                            }

                            buffer.Write(chunk, 0, count);
                        }
                    }

                    if (_disposed)
                    {
                        return;
                    }

                    if (tooLong)
                    {
                        _context.Title += " (0 ~ 5MB)";
                    }

                    bufferCopy = buffer.ToArray();
                }

                var encoding = CharsetDetector.DetectFromBytes(bufferCopy).Detected?.Encoding ??
                               Encoding.Default;

                var doc = new TextDocument(encoding.GetString(bufferCopy));
                doc.SetOwnerThread(Dispatcher.Thread);

                if (_disposed)
                {
                    return;
                }

                Dispatcher.BeginInvoke(new Action(() =>
                {
                    Encoding           = encoding;
                    SyntaxHighlighting = HighlightingManager.Instance.GetDefinitionByExtension(Path.GetExtension(path));
                    Document           = doc;

                    _context.IsBusy = false;
                }), DispatcherPriority.Render);
            });
        }

Example #31

0

Show file

File: Udetect.cs Project: henricj/FixEol

        /// <summary>
        ///     Command line example: reports the charset and confidence detected
        ///     for the file named by the first argument.
        /// </summary>
        /// <param name="args">a filename</param>
        public static void Main(String[] args)
        {
            if (args.Length == 0)
            {
                Console.WriteLine("Usage: udetect <filename>");
                return;
            }

            var filename = args[0];
            using (var input = File.OpenRead(filename))
            {
                ICharsetDetector detector = new CharsetDetector();
                detector.Feed(input);
                detector.DataEnd();

                var detected = detector.Charset != null;
                if (!detected)
                {
                    Console.WriteLine("Detection failed.");
                }
                else
                {
                    Console.WriteLine("Charset: {0}, confidence: {1}",
                        detector.Charset, detector.Confidence);
                }
            }
        }

Example #32

0

Show file

File: sResponse.cs Project: s7loves/pesta

        /// <summary>
        /// Attempts to determine the encoding of the body. The charset named in the
        /// Content-Type header wins; otherwise the body bytes are examined. If
        /// nothing can be determined, DEFAULT_ENCODING is used instead.
        /// </summary>
        /// <param name="headers">
        /// Response headers. When the charset had to be detected from the body and a
        /// Content-Type header is present, that header is rewritten to include it.
        /// </param>
        /// <param name="body">The raw response body; may be null or empty.</param>
        /// <returns>The upper-cased charset name, or DEFAULT_ENCODING.</returns>
        private static String getAndUpdateEncoding(NameValueCollection headers, byte[] body)
        {
            String contentType = headers["Content-Type"];
            if (String.IsNullOrEmpty(contentType))
            {
                contentType = null;
            }

            if (contentType != null)
            {
                String[] parts = contentType.Split(';');
                if (BINARY_CONTENT_TYPES.Contains(parts[0]))
                {
                    return DEFAULT_ENCODING;
                }
                if (parts.Length == 2)
                {
                    int offset = parts[1].IndexOf("charset=", StringComparison.Ordinal);
                    if (offset != -1)
                    {
                        // Some servers include quotes around the charset:
                        //   Content-Type: text/html; charset="UTF-8"
                        // Strip both quotes and surrounding whitespace. Upper-casing is
                        // culture-invariant so that e.g. "iso-8859-1" is not mangled
                        // under a Turkish locale.
                        String charset = parts[1].Substring(offset + 8).Trim().Trim('"').ToUpperInvariant();
                        if (charset.Length > 0)
                        {
                            return charset;
                        }
                        // An empty charset value carries no information; fall through
                        // to content-based detection.
                    }
                }
            }

            if (body == null || body.Length == 0)
            {
                return DEFAULT_ENCODING;
            }

            // If the header doesn't specify the charset, try to determine it by examining the content.
            CharsetDetector detector = new CharsetDetector();
            detector.setText(body);
            CharsetMatch match = detector.detect();
            if (match == null)
            {
                // Defensive: nothing matched, so there is no name to report.
                return DEFAULT_ENCODING;
            }

            String detected = match.getName().ToUpperInvariant();

            if (contentType != null)
            {
                // Record the charset in the content-type header so that its value can be cached
                // and re-used. This is a BIG performance win.
                // The indexer replaces the header; Add would append a second,
                // comma-joined value to the existing one.
                headers["Content-Type"] = contentType + "; charset=" + detected;
            }
            return detected;
        }

Example #33

0

Show file

File: SubtitleEncoder.cs Project: t-andre/Emby

        /// <summary>
        /// Runs the charset detector over the stream obtained for <paramref name="path"/>.
        /// </summary>
        /// <returns>
        /// The charset name reported by the detector; null when an
        /// <see cref="IOException"/> occurs, or when "x-mac-cyrillic" is reported
        /// while a <paramref name="language"/> is available.
        /// </returns>
        private async Task<string> DetectCharset(string path, string language, MediaProtocol protocol, CancellationToken cancellationToken)
        {
            try
            {
                using (var input = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false))
                {
                    var charsetDetector = new CharsetDetector();
                    charsetDetector.Feed(input);
                    charsetDetector.DataEnd();

                    var detected = charsetDetector.Charset;

                    if (!string.IsNullOrWhiteSpace(detected))
                    {
                        _logger.Info("UniversalDetector detected charset {0} for {1}", detected, path);
                    }

                    // This charset is often reported incorrectly. When a language is
                    // known, return nothing so that other techniques can be used instead.
                    var suspectCyrillic = string.Equals("x-mac-cyrillic", detected, StringComparison.OrdinalIgnoreCase);
                    if (suspectCyrillic && !string.IsNullOrWhiteSpace(language))
                    {
                        return null;
                    }

                    return detected;
                }
            }
            catch (IOException ex)
            {
                _logger.ErrorException("Error attempting to determine subtitle charset from {0}", ex, path);
                return null;
            }
        }

Example #34

0

Show file

File: SubtitleEncoder.cs Project: ratanparai/Emby

        /// <summary>
        /// Runs the charset detector over the stream obtained for <paramref name="path"/>.
        /// </summary>
        /// <returns>
        /// The charset name reported by the detector, or null when an
        /// <see cref="IOException"/> occurs.
        /// </returns>
        private async Task<string> DetectCharset(string path, MediaProtocol protocol, CancellationToken cancellationToken)
        {
            string detected = null;

            try
            {
                using (var source = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false))
                {
                    var charsetDetector = new CharsetDetector();
                    charsetDetector.Feed(source);
                    charsetDetector.DataEnd();

                    detected = charsetDetector.Charset;

                    if (!string.IsNullOrWhiteSpace(detected))
                    {
                        _logger.Info("UniversalDetector detected charset {0} for {1}", detected, path);
                    }
                }
            }
            catch (IOException ex)
            {
                _logger.ErrorException("Error attempting to determine subtitle charset from {0}", ex, path);

                // A failure while reading or closing the stream discards any result.
                detected = null;
            }

            return detected;
        }

C# (CSharp) CharsetDetector Examples