/// <summary>
/// Verifies that English text encoded as windows-1252 (including curly
/// quotes) is reported as windows-1252, while plain English text encoded as
/// ISO-8859-1 is reported as ISO-8859-1.
/// </summary>
public void TestC1Bytes()
{
    String plainText = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
    String textWithC1 = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \u201CC1\u201D bytes.";

    byte[] isoBytes = ILOG.J2CsMapping.Util.StringUtil.GetBytes(plainText, "ISO-8859-1");
    byte[] windowsBytes = ILOG.J2CsMapping.Util.StringUtil.GetBytes(textWithC1, "windows-1252");

    CharsetDetector detector = new CharsetDetector();

    // First pass: the sample that contains the curly quotes.
    detector.SetText(windowsBytes);
    CharsetMatch match = detector.Detect();
    if (match.GetName() != "windows-1252")
    {
        Errln("Text with C1 bytes not correctly detected as windows-1252.");
        return;
    }

    // Second pass: the sample without them.
    detector.SetText(isoBytes);
    match = detector.Detect();
    if (match.GetName() != "ISO-8859-1")
    {
        Errln("Text without C1 bytes not correctly detected as ISO-8859-1.");
    }
}
/// <summary>
/// Smoke test: running detection on very short byte strings must not throw.
/// The detection results themselves are not inspected.
/// </summary>
public void TestShortInput()
{
    // The shortest input that should produce a positive detection result is
    // two bytes, a UTF-16 BOM.
    // TODO: Detector confidence levels need to be refined for very short
    // input. They are too high now for some charsets that happen to be
    // compatible with a few bytes of input.
    byte[][] samples = new byte[][]
    {
        new byte[] { },
        new byte[] { (byte)0x0a },
        new byte[] { (byte)'A', (byte)'B' },
        new byte[] { (byte)'A', (byte)'B', (byte)'C' },
        new byte[] { (byte)'A', (byte)'B', (byte)'C', (byte)'D' }
    };

    CharsetDetector detector = new CharsetDetector();
    foreach (byte[] sample in samples)
    {
        detector.SetText(sample);
        detector.Detect();
    }
}
/// <summary>
/// Verifies the detector's input filter: with the filter enabled the sample
/// (French text surrounded by English-looking markup tags) must be reported
/// as French; with the filter disabled it must be reported as English.
/// </summary>
public void TestInputFilter()
{
    String markedUpText = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>";
    byte[] encoded = ILOG.J2CsMapping.Util.StringUtil.GetBytes(markedUpText, "ISO-8859-1");
    CharsetDetector detector = new CharsetDetector();

    detector.EnableInputFilter(true);
    bool filterOn = detector.InputFilterEnabled();
    if (!filterOn)
    {
        Errln("input filter should be enabled");
    }

    // Filtered pass.
    detector.SetText(encoded);
    CharsetMatch match = detector.Detect();
    bool sawFrench = match.GetLanguage().Equals("fr");
    if (!sawFrench)
    {
        Errln("input filter did not strip markup!");
    }

    // Unfiltered pass over the same bytes.
    detector.EnableInputFilter(false);
    detector.SetText(encoded);
    match = detector.Detect();
    bool sawEnglish = match.GetLanguage().Equals("en");
    if (!sawEnglish)
    {
        Errln("unfiltered input did not detect as English!");
    }
}
/// <summary>
/// Reads the whole file as text, decoding it with the detected charset when
/// one is found and resolvable, and with <c>CurrentEncoding</c> otherwise.
/// </summary>
/// <param name="fileInfo">The file to read.</param>
/// <returns>The complete decoded file content.</returns>
public static string LoadFile(FileInfo fileInfo)
{
    Encoding encoding = CurrentEncoding;

    using (Stream stream = fileInfo.OpenRead())
    {
        CharsetDetector detector = new CharsetDetector();
        detector.Feed(stream);
        detector.DataEnd();

        // The detector consumed the stream; rewind before the real read.
        stream.Position = 0;

        if (detector.Charset == null)
        {
            Debug.LogFormat("Failed to detect charset, will use default encoding.");
        }
        else
        {
            Debug.LogFormat("Detected charset of file: {0}", detector.Charset);
            try
            {
                encoding = Encoding.GetEncoding(detector.Charset);
            }
            catch
            {
                // The detected name could not be turned into an Encoding.
                Debug.LogWarning("Failed to load encoding, will use default encoding.");
                encoding = CurrentEncoding;
            }
        }

        using (StreamReader reader = new StreamReader(stream, encoding))
        {
            return reader.ReadToEnd();
        }
    }
}
/// <summary>
/// Tries to determine the encoding of <paramref name="stream"/>.
/// </summary>
/// <param name="stream">The stream to inspect.</param>
/// <param name="encoding">
/// Receives the result of <c>AnalyzeAndGuessEncoding</c> on success;
/// <c>null</c> otherwise.
/// </param>
/// <returns>
/// <c>true</c> when the detector produced an encoding; <c>false</c> when it
/// did not or when any exception was thrown (the exception is reported to
/// analytics and swallowed).
/// </returns>
public static bool TryGuessEncoding(Stream stream, out Encoding encoding)
{
    encoding = null;

    try
    {
        var detection = CharsetDetector.DetectFromStream(stream);

        // Detected can be null.
        if (detection.Detected?.Encoding == null)
        {
            // Empty files are not worth reporting.
            if (stream.Length > 0)
            {
                Analytics.TrackEvent("UnableToDetectEncoding");
            }

            return false;
        }

        encoding = AnalyzeAndGuessEncoding(detection);
        return true;
    }
    catch (Exception ex)
    {
        var details = new Dictionary<string, string>()
        {
            { "Exception", ex.ToString() },
            { "Message", ex.Message }
        };
        Analytics.TrackEvent("TryGuessEncodingFailedWithException", details);
    }

    return false;
}
/// <summary>
/// Formats the <c>.cst</c> test file named <paramref name="fileName"/>,
/// writes the result next to it as <c>.actual.cst</c> using the encoding
/// detected for the input, and asserts that the written file is detected
/// with that same encoding.
/// </summary>
/// <param name="fileName">Test file name without the <c>.cst</c> extension.</param>
public void RunTest(string fileName)
{
    var filePath = Path.Combine(
        this.rootDirectory.FullName,
        "EncodingTests",
        fileName + ".cst"
    );

    string code;
    using (var reader = new StreamReader(filePath, Encoding.UTF8, true))
    {
        code = reader.ReadToEnd();
    }

    var encoding = CharsetDetector.DetectFromFile(filePath).Detected.Encoding;

    var formatter = new CodeFormatter();
    var result = formatter.Format(code, new Options());

    // Only swap the file's own extension; a plain string Replace would also
    // rewrite any ".cst" occurring in a directory name.
    var actualFilePath = Path.ChangeExtension(filePath, ".actual.cst");

    // The writer must be flushed and closed before the file is inspected,
    // otherwise detection would run on an empty or partial file.
    using (var stream = File.Open(actualFilePath, FileMode.Create))
    using (var writer = new StreamWriter(stream, encoding))
    {
        writer.Write(result.Code);
    }

    // Detect on the file that was just written, not on the input again.
    var actualEncoding = CharsetDetector.DetectFromFile(actualFilePath).Detected.Encoding;

    actualEncoding.Should().Be(encoding);
}
/// <summary>
/// Opens the subtitle at <paramref name="path"/>. When
/// <paramref name="requiresCharset"/> is set and a charset is detected, the
/// content is transcoded to UTF-8 and returned as an in-memory stream;
/// otherwise the file is opened directly for reading.
/// </summary>
/// <param name="path">Location of the subtitle.</param>
/// <param name="language">Subtitle language (not used by this method).</param>
/// <param name="protocol">Protocol passed to <c>GetBytes</c>.</param>
/// <param name="requiresCharset">Whether charset detection should be attempted.</param>
/// <param name="cancellationToken">Token passed to <c>GetBytes</c>.</param>
/// <returns>A readable stream over the subtitle content.</returns>
private async Task<Stream> GetSubtitleStream(string path, string language, MediaProtocol protocol, bool requiresCharset, CancellationToken cancellationToken)
{
    if (!requiresCharset)
    {
        return File.OpenRead(path);
    }

    var bytes = await GetBytes(path, protocol, cancellationToken).ConfigureAwait(false);
    var charset = CharsetDetector.DetectFromBytes(bytes).Detected?.EncodingName;
    _logger.LogDebug("charset {CharSet} detected for {Path}", charset ?? "null", path);

    if (string.IsNullOrEmpty(charset))
    {
        return File.OpenRead(path);
    }

    // Make sure we have all the code pages we can get
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

    using (var inputStream = new MemoryStream(bytes))
    using (var reader = new StreamReader(inputStream, Encoding.GetEncoding(charset)))
    {
        var text = await reader.ReadToEndAsync().ConfigureAwait(false);
        var utf8Bytes = Encoding.UTF8.GetBytes(text);
        return new MemoryStream(utf8Bytes);
    }
}
/// <summary>
/// Command line example: detects the encoding of the given file and prints
/// the outcome in green (success) or red (failure).
/// </summary>
/// <param name="args">a filename</param>
public static void Main(string[] args)
{
    if (args.Length == 0)
    {
        Console.WriteLine("Usage: udetect <filename>");
        return;
    }

    var filename = args[0];
    using (var fs = File.OpenRead(filename))
    {
        ICharsetDetector cdet = new CharsetDetector();
        cdet.Feed(fs);
        cdet.DataEnd();

        Console.WriteLine();
        if (cdet.Charset == null)
        {
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine("Detection failed.");
        }
        else
        {
            Console.ForegroundColor = ConsoleColor.Green;
            Console.WriteLine("Charset: {0}, confidence: {1}", cdet.Charset, cdet.Confidence);
        }

        Console.ResetColor();
    }

    Exit();
}
/// <summary>
/// Rewrites <paramref name="fileName"/> as UTF-8, decoding it with the
/// charset reported by the detector.
/// </summary>
/// <param name="fileName">Path of the file to convert in place.</param>
/// <returns>
/// <c>false</c> when <c>isAscii</c> rejects the detector result (the file is
/// left untouched); <c>true</c> after the file has been rewritten.
/// </returns>
private bool convertToUtf8(string fileName)
{
    string detectedCharset;

    // Only the detection pass runs under the lock; the rewrite below does not.
    lock (SyncRoot)
    {
        using (FileStream input = File.OpenRead(fileName))
        {
            var detector = new CharsetDetector();
            detector.Feed(input);
            detector.DataEnd();
            detectedCharset = detector.Charset;

            bool accepted = isAscii(detector);
            if (!accepted)
            {
                return false;
            }
        }
    }

    var sourceEncoding = Encoding.GetEncoding(detectedCharset);
    var content = File.ReadAllText(fileName, sourceEncoding);
    File.WriteAllText(fileName, content, Encoding.UTF8);
    return true;
}
/// <summary>
/// Detects the charset of the file at <paramref name="path"/>.
/// </summary>
/// <param name="path">Path of the file to inspect.</param>
/// <returns>
/// The charset name reported by the detector (may be <c>null</c> when nothing
/// was detected), or <c>null</c> when an <see cref="IOException"/> occurred;
/// the exception is logged.
/// </returns>
private string DetectCharset(string path)
{
    try
    {
        // Detection only reads. The FileStream(path, FileMode) constructor
        // requests read/write access, which fails on read-only files with an
        // UnauthorizedAccessException that the handler below does not catch,
        // so ask for read access explicitly.
        using (var file = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read))
        {
            var detector = new CharsetDetector();
            detector.Feed(file);
            detector.DataEnd();

            var charset = detector.Charset;

            if (!string.IsNullOrWhiteSpace(charset))
            {
                _logger.Info("UniversalDetector detected charset {0} for {1}", charset, path);
            }

            return charset;
        }
    }
    catch (IOException ex)
    {
        _logger.ErrorException("Error attempting to determine subtitle charset from {0}", ex, path);
    }

    return null;
}
/// <summary>
/// Runs charset detection on <paramref name="fileName"/> and returns the
/// verdict of <c>EncodingChecker.isAscii</c>. An <see cref="IOException"/>
/// triggers a one-second pause and a retry, up to ten attempts in total.
/// </summary>
/// <param name="fileName">Path of the file to inspect.</param>
/// <returns>
/// The checker's verdict, or <c>false</c> when all ten attempts failed with
/// an <see cref="IOException"/>.
/// </returns>
private bool isAscii(string fileName)
{
    const int maxAttempts = 10;
    const int retryDelayMilliseconds = 1000;

    for (int attempt = 0; attempt < maxAttempts; attempt++)
    {
        try
        {
            lock (SyncRoot)
            {
                using (FileStream input = File.OpenRead(fileName))
                {
                    var detector = new CharsetDetector();
                    detector.Feed(input);
                    detector.DataEnd();
                    return EncodingChecker.isAscii(detector);
                }
            }
        }
        catch (IOException)
        {
            Thread.Sleep(retryDelayMilliseconds);
        }
    }

    return false;
}
/// <summary>
/// Detects the charset of an in-memory message.
/// </summary>
/// <param name="bytes">The raw message bytes.</param>
/// <returns>
/// The charset name reported by the detector (may be <c>null</c>), or
/// <c>null</c> when an <see cref="IOException"/> occurred; the exception is
/// logged.
/// </returns>
private string DetectCharset(byte[] bytes)
{
    try
    {
        using (var messageStream = new MemoryStream(bytes))
        {
            var charsetDetector = new CharsetDetector();
            charsetDetector.Feed(messageStream);
            charsetDetector.DataEnd();

            // Successful detections are intentionally not logged here.
            return charsetDetector.Charset;
        }
    }
    catch (IOException ex)
    {
        _logger.ErrorException("Error attempting to determine web socket message charset", ex);
    }

    return null;
}
/// <summary>
/// Reads the source files and concatenates them in consecutive groups of
/// <paramref name="mergeOption"/> files. All files are decoded with the
/// charset guessed from the first file.
/// </summary>
/// <param name="sourceFiles">Paths of the files to merge, in order.</param>
/// <param name="mergeOption">Number of files per merged group; must be at least 1.</param>
/// <param name="mergedFileNames">
/// Receives one display name per group: the output name of the group's first
/// file, followed by " - " and the output name of its last file when the
/// group holds more than one file.
/// </param>
/// <returns>The merged text of each group, in the same order as <paramref name="mergedFileNames"/>.</returns>
/// <exception cref="System.ArgumentOutOfRangeException">
/// <paramref name="mergeOption"/> is less than 1 (the loop would otherwise never terminate).
/// </exception>
private string[] mergeSourceFiles(string[] sourceFiles, int mergeOption, out string[] mergedFileNames)
{
    if (mergeOption < 1)
    {
        throw new System.ArgumentOutOfRangeException(nameof(mergeOption), mergeOption, "mergeOption must be at least 1.");
    }

    int total = sourceFiles.Length;
    if (total == 0)
    {
        // Nothing to merge; there is no first file to guess a charset from.
        mergedFileNames = new string[0];
        return new string[0];
    }

    // Resolve the encoding once instead of once per file.
    string charsetName = CharsetDetector.GuessCharsetOfFile(sourceFiles[0]);
    Encoding sourceEncoding = Encoding.GetEncoding(charsetName);

    bool renameFiles = changeFileNameCheckBox.Checked;
    int outputType = outputTypeComboBox.SelectedIndex;
    bool needMarkChapterHeaders = outputType == 2 || outputType == 4 || outputType == 6;

    List<string> mergedContents = new List<string>();
    List<string> mergedNames = new List<string>();
    StringBuilder content = new StringBuilder();
    StringBuilder groupName = new StringBuilder();

    int groupStart = 0;
    while (groupStart < total)
    {
        content.Length = 0;
        groupName.Length = 0;

        // The final group may hold fewer than mergeOption files.
        int remaining = total - groupStart;
        int groupSize = mergeOption < remaining ? mergeOption : remaining;
        int groupEnd = groupStart + groupSize;

        for (int index = groupStart; index < groupEnd; index++)
        {
            string rawText = readFile(sourceFiles[index], sourceEncoding, needMarkChapterHeaders);
            content.Append(Util.NormalizeTextAndRemoveIgnoredChinesePhrases(rawText))
                   .Append("\n\n----------oOo----------\n\n");

            if (index == groupStart)
            {
                groupName.Append(getOutputFileName(sourceFiles[index], index, total, renameFiles));
            }
            else if (index == groupEnd - 1)
            {
                // Files between the first and the last contribute no name part.
                groupName.Append(" - ").Append(getOutputFileName(sourceFiles[index], index, total, renameFiles));
            }
        }

        mergedContents.Add(content.ToString());
        mergedNames.Add(groupName.ToString());

        // Stepping by the actual group size cannot overflow past total.
        groupStart = groupEnd;
    }

    mergedFileNames = mergedNames.ToArray();
    return mergedContents.ToArray();
}
/// <summary>
/// Detects the encoding of a byte array.
/// </summary>
/// <param name="arrayByte">The bytes to inspect.</param>
/// <returns>
/// A list holding the detected encoding when the detector reported a charset
/// with confidence above 0.8 and that charset could be resolved; an empty
/// list otherwise.
/// </returns>
public static List<Encoding> GetFileEncoding(byte[] arrayByte)
{
    List<Encoding> encodings = new List<Encoding>();

    CharsetDetector charsetDetector = new CharsetDetector();
    charsetDetector.Feed(arrayByte, 0, arrayByte.Length);
    charsetDetector.DataEnd();

    bool confident = !string.IsNullOrEmpty(charsetDetector.Charset) && charsetDetector.Confidence > 0.8f;
    if (!confident)
    {
        return encodings;
    }

    try
    {
        encodings.Add(Encoding.GetEncoding(charsetDetector.Charset));
    }
    catch (Exception ex)
    {
        // The failure is reported and the list stays empty.
        DTEHelper.WriteExceptionToOutput(null, ex);
#if DEBUG
        if (System.Diagnostics.Debugger.IsAttached)
        {
            System.Diagnostics.Debugger.Break();
        }
#endif
    }

    return encodings;
}
/// <summary>
/// Detects the encoding of <paramref name="data"/>.
/// </summary>
/// <param name="data">The bytes to inspect.</param>
/// <returns>
/// <c>null</c> when no charset was detected or the confidence is not above
/// 0.5; the "big5" encoding when the detector reports "big-5" (in any
/// casing); otherwise the encoding named by the detector, or
/// <see cref="Encoding.Default"/> when that name cannot be resolved.
/// </returns>
public static Encoding GetEncoding(byte[] data)
{
    ICharsetDetector detector = new CharsetDetector();
    detector.Feed(data, 0, data.Length);
    detector.DataEnd();

    if (detector.Charset == null || !(detector.Confidence > 0.5))
    {
        return null;
    }

    var charsetName = detector.Charset;

    // The detector's "big-5" label is mapped to the name "big5" before lookup.
    if (charsetName.ToLowerInvariant() == "big-5")
    {
        return Encoding.GetEncoding("big5");
    }

    try
    {
        return Encoding.GetEncoding(charsetName);
    }
    catch
    {
        return Encoding.Default;
    }
}
/// <summary>
/// Parses an XML subtitle file whose cues are <c>text</c> elements carrying
/// <c>start</c> and <c>dur</c> attributes (seconds, invariant culture).
/// </summary>
/// <param name="path">Path of the subtitle file.</param>
/// <param name="result">
/// Receives the parsed items (after <c>Filters.RemoveDuplicateItems</c>) on
/// success; <c>null</c> otherwise.
/// </param>
/// <returns>
/// <c>true</c> when at least one item was parsed; <c>false</c> when the
/// charset could not be detected, any cue failed to parse, or no cue was
/// found.
/// </returns>
public bool ParseFormat(string path, out List<SubtitleItem> result)
{
    result = null;

    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

    // Detected is dereferenced below, so bail out instead of throwing a
    // NullReferenceException when detection finds nothing.
    var detected = CharsetDetector.DetectFromFile(path).Detected;
    if (detected == null)
    {
        return false;
    }

    var encoding = Encoding.GetEncoding(detected.EncodingName);

    var xmlDoc = new XmlDocument();

    // Dispose the reader (and with it the file handle) once the document is
    // loaded; previously the reader was never closed.
    // NOTE(review): the document is loaded from the reader's raw BaseStream,
    // so the detected encoding does not appear to influence decoding —
    // confirm whether xmlDoc.Load(reader) was intended.
    using (var reader = new StreamReader(path, encoding))
    {
        var xmlStream = reader.BaseStream;

        // rewind the stream
        xmlStream.Position = 0;

        // parse xml stream
        xmlDoc.Load(xmlStream);
    }

    var items = new List<SubtitleItem>();

    if (xmlDoc.DocumentElement != null)
    {
        var nodeList = xmlDoc.DocumentElement.SelectNodes("//text");
        if (nodeList != null)
        {
            for (var i = 0; i < nodeList.Count; i++)
            {
                var node = nodeList[i];
                try
                {
                    var start = float.Parse(node.Attributes["start"].Value, CultureInfo.InvariantCulture);
                    var duration = float.Parse(node.Attributes["dur"].Value, CultureInfo.InvariantCulture);

                    items.Add(new SubtitleItem
                    {
                        // Seconds to milliseconds.
                        StartTime = (int)(start * 1000),
                        EndTime = (int)((start + duration) * 1000),
                        Text = ConvertString(node.InnerText)
                    });
                }
                catch
                {
                    // A single malformed cue rejects the whole file.
                    return false;
                }
            }
        }
    }

    if (items.Count == 0)
    {
        return false;
    }

    result = Filters.RemoveDuplicateItems(items);
    return true;
}
/// <summary>
/// Detects the charset of the subtitle obtained through <c>GetStream</c>.
/// </summary>
/// <param name="path">Location of the subtitle.</param>
/// <param name="protocol">Protocol passed to <c>GetStream</c>.</param>
/// <param name="cancellationToken">Token passed to <c>GetStream</c>.</param>
/// <returns>
/// The charset name reported by the detector (may be <c>null</c>), or
/// <c>null</c> when an <see cref="IOException"/> occurred; the exception is
/// logged.
/// </returns>
private async Task<string> DetectCharset(string path, MediaProtocol protocol, CancellationToken cancellationToken)
{
    try
    {
        using (var subtitleStream = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false))
        {
            var charsetDetector = new CharsetDetector();
            charsetDetector.Feed(subtitleStream);
            charsetDetector.DataEnd();

            var detectedCharset = charsetDetector.Charset;
            bool hasCharset = !string.IsNullOrWhiteSpace(detectedCharset);
            if (hasCharset)
            {
                _logger.Info("UniversalDetector detected charset {0} for {1}", detectedCharset, path);
            }

            return detectedCharset;
        }
    }
    catch (IOException ex)
    {
        _logger.ErrorException("Error attempting to determine subtitle charset from {0}", ex, path);
    }

    return null;
}
/// <summary>
/// Command line example: detects the encoding of the given file and prints
/// the best match with its confidence, or a failure message.
/// </summary>
/// <param name="filename">a filename</param>
public static void DetectDemo(string filename)
{
    // Detect from file and take the best detection; it may be null.
    DetectionDetail best = CharsetDetector.DetectFromFile(filename).Detected;

    if (best == null)
    {
        Console.WriteLine($"Detection failed: {filename}");
        return;
    }

    // Alias of the found encoding.
    string encodingName = best.EncodingName;

    // System.Text.Encoding of the found encoding (can be null if not available).
    Encoding encoding = best.Encoding;

    // Confidence of the found encoding (between 0 and 1).
    float confidence = best.Confidence;

    Console.WriteLine($"Detection completed: {filename}");

    if (encoding == null)
    {
        Console.WriteLine($"(Encoding is null){Environment.NewLine}EncodingName: {encodingName}{Environment.NewLine}Confidence: {confidence}");
    }
    else
    {
        Console.WriteLine($"EncodingWebName: {encoding.WebName}{Environment.NewLine}Confidence: {confidence}");
    }
}
/// <summary>
/// Command line example: detects the encoding of the given file and prints
/// the charset with its confidence, or a failure message.
/// </summary>
/// <param name="args">a filename</param>
public static void Main(String[] args)
{
    if (args.Length == 0)
    {
        Console.WriteLine("Usage: udetect <filename>");
        return;
    }

    string filename = args[0];
    using (FileStream input = File.OpenRead(filename))
    {
        ICharsetDetector detector = new CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        if (detector.Charset == null)
        {
            Console.WriteLine("Detection failed.");
        }
        else
        {
            Console.WriteLine("Charset: {0}, confidence: {1}", detector.Charset, detector.Confidence);
        }
    }
}
/// <summary>
/// Runs charset detection on at most the first <paramref name="sizeLimit"/>
/// bytes of <paramref name="rawData"/>.
/// </summary>
/// <param name="rawData">The bytes to inspect.</param>
/// <param name="sizeLimit">Upper bound on the number of bytes handed to the detector.</param>
/// <returns>The result of <c>CharsetDetector.DetectFromStream</c>.</returns>
public DetectionResult DetectUtfUnknown(byte[] rawData, int sizeLimit)
{
    int byteCount = Math.Min(sizeLimit, rawData.Length);

    using (MemoryStream window = new MemoryStream(rawData, 0, byteCount))
    {
        DetectionResult detection = CharsetDetector.DetectFromStream(window);
        return detection;
    }
}
/// <summary>
/// Tries to detect the encoding of <paramref name="data"/>.
/// </summary>
/// <param name="data">The bytes to inspect.</param>
/// <param name="encoding">
/// Receives the detected encoding on success ("big-5" is looked up as
/// "big5"); <see cref="Encoding.Default"/> on failure.
/// </param>
/// <returns>
/// <c>true</c> when a charset was detected and resolved; <c>false</c> when
/// nothing was detected or the detected name could not be resolved.
/// </returns>
public static bool TryGetEncoding(byte[] data, out Encoding encoding)
{
    ICharsetDetector detector = new CharsetDetector();
    detector.Feed(data, 0, data.Length);
    detector.DataEnd();

    if (detector.Charset == null)
    {
        encoding = Encoding.Default;
        return false;
    }

    // The detector's "big-5" label is mapped to the name "big5" before lookup.
    if (detector.Charset.ToLowerInvariant() == "big-5")
    {
        encoding = Encoding.GetEncoding("big5");
        return true;
    }

    try
    {
        encoding = Encoding.GetEncoding(detector.Charset);
        return true;
    }
    catch
    {
        encoding = Encoding.Default;
        return false;
    }
}
/// <summary>
/// A buffer starting with FF FE 00 00 must be detected as UTF-32 LE with
/// full confidence.
/// </summary>
public void TestBomUTF32_LE()
{
    byte[] input = new byte[] { 0xFF, 0xFE, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00 };

    var detection = CharsetDetector.DetectFromBytes(input);

    Assert.AreEqual(Charsets.UTF32_LE, detection.Detected.EncodingName);
    Assert.AreEqual(1.0f, detection.Detected.Confidence);
}
/// <summary>
/// A buffer starting with EF BB BF must be detected as UTF-8 with full
/// confidence.
/// </summary>
public void TestBomUtf8()
{
    byte[] input = new byte[] { 0xEF, 0xBB, 0xBF, 0x68, 0x65, 0x6C, 0x6C, 0x6F, 0x21 };

    var detection = CharsetDetector.DetectFromBytes(input);

    Assert.AreEqual("UTF-8", detection.Detected.EncodingName);
    Assert.AreEqual(1.0f, detection.Detected.Confidence);
}
/// <summary>
/// A single digit must be detected as ASCII with a confidence of 1.
/// </summary>
public void TestSingleChar()
{
    byte[] input = Encoding.UTF8.GetBytes("3");

    var detection = CharsetDetector.DetectFromBytes(input);

    Assert.AreEqual(Charsets.ASCII, detection.Detected.EncodingName);
    Assert.AreEqual(1, detection.Detected.Confidence);
}
/// <summary>
/// A single digit must be detected as ASCII with full confidence.
/// </summary>
public void TestIssue3()
{
    byte[] input = Encoding.UTF8.GetBytes("3");

    var detection = CharsetDetector.DetectFromBytes(input);

    Assert.AreEqual(CodepageName.ASCII, detection.Detected.EncodingName);
    Assert.AreEqual(1.0f, detection.Detected.Confidence);
}
/// <summary>
/// The supplied buffer must be detected as UTF-7 with full confidence.
/// </summary>
/// <param name="bufferBytes">The bytes handed to the detector.</param>
public void TestCaseBomUtf7(byte[] bufferBytes)
{
    var detection = CharsetDetector.DetectFromBytes(bufferBytes);
    var best = detection.Detected;

    Assert.AreEqual(CodepageName.UTF7, best.EncodingName);
    Assert.AreEqual(1.0f, best.Confidence);
}
/// <summary>
/// Feeds <paramref name="stream"/> to a charset detector and converts the
/// reported charset name into an <see cref="Encoding"/>.
/// </summary>
/// <param name="stream">The stream to inspect.</param>
/// <returns>
/// The result of applying <c>Return</c> to the charset name with
/// <see cref="Encoding.GetEncoding(string)"/> and a <c>null</c> fallback.
/// NOTE(review): <c>Return</c> presumably yields the fallback when the name
/// is null — confirm against the helper's definition.
/// </returns>
public Encoding Detect(Stream stream)
{
    var charsetDetector = new CharsetDetector();
    charsetDetector.Feed(stream);
    charsetDetector.DataEnd();

    var charsetName = charsetDetector.Charset;
    return charsetName.Return(Encoding.GetEncoding, null);
}
/// <summary>
/// A two-byte buffer holding only FF FE must be detected as UTF-16 LE with
/// full confidence.
/// </summary>
public void Test2byteArrayBomUTF16_LE()
{
    byte[] input = new byte[] { 0xFF, 0xFE };

    var detection = CharsetDetector.DetectFromBytes(input);

    Assert.AreEqual(CodepageName.UTF16_LE, detection.Detected.EncodingName);
    Assert.AreEqual(1.0f, detection.Detected.Confidence);
}
/// <summary>
/// A short run of digits must be detected as ASCII with full confidence.
/// </summary>
public void TestOutOfRange2()
{
    byte[] input = Encoding.UTF8.GetBytes("1234567890");

    var detection = CharsetDetector.DetectFromBytes(input);

    Assert.AreEqual(Charsets.ASCII, detection.Detected.EncodingName);
    Assert.AreEqual(1.0f, detection.Detected.Confidence);
}
/// <summary>
/// Starts a background task that loads at most the first 5 MB of the file,
/// detects its encoding (falling back to <see cref="Encoding.Default"/>),
/// builds a <c>TextDocument</c> and publishes it through the dispatcher.
/// The task stops early whenever <c>_disposed</c> is observed.
/// </summary>
/// <param name="path">Path of the file to load.</param>
private void LoadFileAsync(string path)
{
    Task.Run(() =>
    {
        const int maxLength = 5 * 1024 * 1024;

        byte[] content;
        using (var buffer = new MemoryStream())
        {
            bool tooLong;
            using (var s = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
            {
                tooLong = s.Length > maxLength;

                // One scratch buffer for the whole copy instead of a fresh
                // allocation on every iteration.
                var chunk = new byte[8192];
                while (s.Position < s.Length && buffer.Length < maxLength)
                {
                    if (_disposed)
                    {
                        break;
                    }

                    // Never read past the cap, even after short reads.
                    var wanted = (int)Math.Min(chunk.Length, maxLength - buffer.Length);
                    var count = s.Read(chunk, 0, wanted);
                    if (count <= 0)
                    {
                        // End of data reached; avoid spinning on empty reads.
                        break;
                    }

                    buffer.Write(chunk, 0, count);
                }
            }

            if (_disposed)
            {
                return;
            }

            if (tooLong)
            {
                _context.Title += " (0 ~ 5MB)";
            }

            content = buffer.ToArray();
        }

        var encoding = CharsetDetector.DetectFromBytes(content).Detected?.Encoding ?? Encoding.Default;

        var doc = new TextDocument(encoding.GetString(content));
        doc.SetOwnerThread(Dispatcher.Thread);

        if (_disposed)
        {
            return;
        }

        Dispatcher.BeginInvoke(new Action(() =>
        {
            Encoding = encoding;
            SyntaxHighlighting = HighlightingManager.Instance.GetDefinitionByExtension(Path.GetExtension(path));
            Document = doc;
            _context.IsBusy = false;
        }), DispatcherPriority.Render);
    });
}
/// <summary>
/// Command line example: detects the encoding of the given file and prints
/// the charset with its confidence, or a failure message.
/// </summary>
/// <param name="args">a filename</param>
public static void Main(String[] args)
{
    if (args.Length == 0)
    {
        Console.WriteLine("Usage: udetect <filename>");
        return;
    }

    var filename = args[0];
    using (var input = File.OpenRead(filename))
    {
        ICharsetDetector detector = new CharsetDetector();
        detector.Feed(input);
        detector.DataEnd();

        bool detected = detector.Charset != null;
        if (!detected)
        {
            Console.WriteLine("Detection failed.");
        }
        else
        {
            Console.WriteLine("Charset: {0}, confidence: {1}", detector.Charset, detector.Confidence);
        }
    }
}
/// <summary>
/// Attempts to determine the encoding of the body. The charset declared in
/// the Content-Type header wins; otherwise the body is examined. If neither
/// yields a charset, DEFAULT_ENCODING is used instead.
/// </summary>
/// <param name="headers">
/// Response headers. When a Content-Type header exists without a usable
/// charset and the body is examined, the header is rewritten to include the
/// detected charset.
/// </param>
/// <param name="body">Raw response body; may be null or empty.</param>
/// <returns>The upper-cased charset name, or DEFAULT_ENCODING.</returns>
private static String getAndUpdateEncoding(NameValueCollection headers, byte[] body)
{
    String contentType = headers["Content-Type"];
    if (String.IsNullOrEmpty(contentType))
    {
        contentType = null;
    }

    if (contentType != null)
    {
        String[] parts = contentType.Split(';');
        if (BINARY_CONTENT_TYPES.Contains(parts[0]))
        {
            return DEFAULT_ENCODING;
        }

        if (parts.Length == 2)
        {
            int offset = parts[1].IndexOf("charset=", StringComparison.Ordinal);
            if (offset != -1)
            {
                // Some servers include quotes around the charset:
                //   Content-Type: text/html; charset="UTF-8"
                // Trim handles both quotes and an empty value. The previous
                // Substring(1, Length) call always threw, because the second
                // argument of String.Substring is a length, not an end index.
                // Upper-casing is culture-invariant so that e.g. "iso-8859-1"
                // is not mangled under a Turkish culture.
                String charset = parts[1].Substring(offset + 8).Trim().Trim('"').ToUpperInvariant();
                return charset.Length == 0 ? DEFAULT_ENCODING : charset;
            }
        }
    }

    if (body == null || body.Length == 0)
    {
        return DEFAULT_ENCODING;
    }

    // If the header doesn't specify the charset, try to determine it by examining the content.
    CharsetDetector detector = new CharsetDetector();
    detector.setText(body);
    CharsetMatch match = detector.detect();
    if (match == null)
    {
        // NOTE(review): assumes detect() can return null when nothing matches — confirm.
        return DEFAULT_ENCODING;
    }

    String detected = match.getName().ToUpperInvariant();
    if (contentType != null)
    {
        // Record the charset in the content-type header so that its value can be cached
        // and re-used. This is a BIG performance win.
        // Set replaces the existing value; Add would append a second value
        // to the same header instead.
        headers.Set("Content-Type", contentType + "; charset=" + detected);
    }

    return detected;
}
/// <summary>
/// Detects the charset of the subtitle obtained through <c>GetStream</c>,
/// discarding an "x-mac-cyrillic" result when a language is known.
/// </summary>
/// <param name="path">Location of the subtitle.</param>
/// <param name="language">Subtitle language; may be null or blank.</param>
/// <param name="protocol">Protocol passed to <c>GetStream</c>.</param>
/// <param name="cancellationToken">Token passed to <c>GetStream</c>.</param>
/// <returns>
/// The detected charset name (may be <c>null</c>); <c>null</c> when the
/// result is "x-mac-cyrillic" and <paramref name="language"/> is not blank,
/// or when an <see cref="IOException"/> occurred (the exception is logged).
/// </returns>
private async Task<string> DetectCharset(string path, string language, MediaProtocol protocol, CancellationToken cancellationToken)
{
    try
    {
        using (var subtitleStream = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false))
        {
            var charsetDetector = new CharsetDetector();
            charsetDetector.Feed(subtitleStream);
            charsetDetector.DataEnd();

            var detectedCharset = charsetDetector.Charset;
            if (!string.IsNullOrWhiteSpace(detectedCharset))
            {
                _logger.Info("UniversalDetector detected charset {0} for {1}", detectedCharset, path);
            }

            // This charset is often a false detection. When a language is
            // available, report nothing so other techniques can be tried.
            bool suspicious = string.Equals("x-mac-cyrillic", detectedCharset, StringComparison.OrdinalIgnoreCase);
            if (suspicious && !string.IsNullOrWhiteSpace(language))
            {
                return null;
            }

            return detectedCharset;
        }
    }
    catch (IOException ex)
    {
        _logger.ErrorException("Error attempting to determine subtitle charset from {0}", ex, path);
    }

    return null;
}
/// <summary>
/// Detects the charset of the subtitle obtained through <c>GetStream</c>.
/// </summary>
/// <param name="path">Location of the subtitle.</param>
/// <param name="protocol">Protocol passed to <c>GetStream</c>.</param>
/// <param name="cancellationToken">Token passed to <c>GetStream</c>.</param>
/// <returns>
/// The charset name reported by the detector (may be <c>null</c>), or
/// <c>null</c> when an <see cref="IOException"/> occurred; the exception is
/// logged.
/// </returns>
private async Task<string> DetectCharset(string path, MediaProtocol protocol, CancellationToken cancellationToken)
{
    try
    {
        using (var source = await GetStream(path, protocol, cancellationToken).ConfigureAwait(false))
        {
            var universalDetector = new CharsetDetector();
            universalDetector.Feed(source);
            universalDetector.DataEnd();

            var name = universalDetector.Charset;
            bool nameIsBlank = string.IsNullOrWhiteSpace(name);
            if (!nameIsBlank)
            {
                _logger.Info("UniversalDetector detected charset {0} for {1}", name, path);
            }

            return name;
        }
    }
    catch (IOException ex)
    {
        _logger.ErrorException("Error attempting to determine subtitle charset from {0}", ex, path);
    }

    return null;
}