/// <summary>
/// Checks that the detector tells windows-1252 apart from ISO-8859-1:
/// English text containing curly quotes (the "C1" bytes once encoded as
/// windows-1252) must be reported as windows-1252, while plain English
/// text must be reported as ISO-8859-1.
/// </summary>
public void TestC1Bytes() {
    const String Latin1Name = "ISO-8859-1";
    const String Cp1252Name = "windows-1252";

    String plainText = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
    String textWithC1 = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \u201CC1\u201D bytes.";

    // Both samples are encoded up front, before any detection is attempted.
    byte[] latin1Bytes = ILOG.J2CsMapping.Util.StringUtil.GetBytes(plainText, Latin1Name);
    byte[] cp1252Bytes = ILOG.J2CsMapping.Util.StringUtil.GetBytes(textWithC1, Cp1252Name);

    CharsetDetector detector = new CharsetDetector();

    // First pass: the sample that contains the curly quotes.
    detector.SetText(cp1252Bytes);
    CharsetMatch match = detector.Detect();
    if (match.GetName() != Cp1252Name) {
        Errln("Text with C1 bytes not correctly detected as windows-1252.");
        // A failure here ends the test; the ISO-8859-1 check is skipped.
        return;
    }

    // Second pass: the sample without any curly quotes.
    detector.SetText(latin1Bytes);
    match = detector.Detect();
    if (match.GetName() != Latin1Name) {
        Errln("Text without C1 bytes not correctly detected as ISO-8859-1.");
    }
}
/// <summary>
/// Exercises the detector's input filter on a French sentence that is
/// surrounded by English words wrapped in angle brackets. With the filter
/// enabled the detected language is expected to be "fr"; with it disabled
/// the same bytes are expected to be detected as "en".
/// </summary>
public void TestInputFilter() {
    String markedUpText = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>";
    byte[] encoded = ILOG.J2CsMapping.Util.StringUtil.GetBytes(markedUpText, "ISO-8859-1");

    CharsetDetector detector = new CharsetDetector();

    // Phase 1: filter on. Confirm the setting took effect before detecting.
    detector.EnableInputFilter(true);
    bool filterIsOn = detector.InputFilterEnabled();
    if (!filterIsOn) {
        Errln("input filter should be enabled");
    }

    detector.SetText(encoded);
    CharsetMatch filteredMatch = detector.Detect();
    bool sawFrench = filteredMatch.GetLanguage().Equals("fr");
    if (!sawFrench) {
        Errln("input filter did not strip markup!");
    }

    // Phase 2: filter off, same input bytes.
    detector.EnableInputFilter(false);
    detector.SetText(encoded);
    CharsetMatch unfilteredMatch = detector.Detect();
    bool sawEnglish = unfilteredMatch.GetLanguage().Equals("en");
    if (!sawEnglish) {
        Errln("unfiltered input did not detect as English!");
    }
}
/// <summary>
/// Smoke test: runs detection over inputs of zero to four bytes to make
/// sure the detector copes with very short byte strings. Nothing is
/// asserted about the detection results themselves.
/// </summary>
public void TestShortInput() {
    // The shortest input that should yield a positive detection result is
    // two bytes (a UTF-16 BOM).
    // TODO: Detector confidence levels need to be refined for very short
    // input. They are currently too high for some charsets that merely
    // happen to be compatible with a few bytes of input.
    byte[][] tinyInputs = new byte[][] {
        new byte[0],
        new byte[] { 0x0a },
        new byte[] { (byte)'A', (byte)'B' },
        new byte[] { (byte)'A', (byte)'B', (byte)'C' },
        new byte[] { (byte)'A', (byte)'B', (byte)'C', (byte)'D' }
    };

    CharsetDetector detector = new CharsetDetector();
    foreach (byte[] input in tinyInputs) {
        detector.SetText(input);
        // Only completing without an exception matters; the match is not inspected.
        detector.Detect();
    }
}
/// <summary>
/// Encodes an Arabic sample string as big-endian and little-endian UTF-16
/// and verifies that the detector names each encoding correctly. The
/// confidence and match type of the little-endian match are then checked
/// as well (Jitterbug 4451, for coverage).
/// </summary>
public void TestUTF16() {
    // Arabic sample text, written entirely with \u escapes. The first
    // character is U+0623 (ARABIC LETTER ALEF WITH HAMZA ABOVE).
    String source = "\u0623\u0648\u0631\u0648\u0628\u0627, \u0628\u0631\u0645\u062c\u064a\u0627\u062a "
            + "\u0627\u0644\u062d\u0627\u0633\u0648\u0628 \u002b\u0020\u0627\u0646\u062a\u0631\u0646\u064a\u062a";

    byte[] beBytes = ILOG.J2CsMapping.Util.StringUtil.GetBytes(source, "UnicodeBig");
    byte[] leBytes = ILOG.J2CsMapping.Util.StringUtil.GetBytes(source, "UnicodeLittle");

    CharsetDetector det = new CharsetDetector();
    CharsetMatch m;

    // Big-endian input.
    det.SetText(beBytes);
    m = det.Detect();
    if (!m.GetName().Equals("UTF-16BE")) {
        Errln("Encoding detection failure: expected UTF-16BE, got " + m.GetName());
    }

    // Little-endian input.
    det.SetText(leBytes);
    m = det.Detect();
    if (!m.GetName().Equals("UTF-16LE")) {
        Errln("Encoding detection failure: expected UTF-16LE, got " + m.GetName());
    }

    // Jitterbug 4451, for coverage: these checks apply to the
    // little-endian match obtained just above.
    int confidence = m.GetConfidence();
    if (confidence != 100) {
        Errln("Did not get the expected confidence level " + confidence);
    }

    int matchType = m.GetMatchType();
    if (matchType != 0) {
        Errln("Did not get the expected matchType level " + matchType);
    }
}
/// <summary>
/// Converts <paramref name="testString"/> to bytes in the requested
/// encoding and runs CheckMatch on a detector fed those bytes, first as a
/// byte array and then as a MemoryStream.
/// </summary>
/// <param name="testString">Text to encode and hand to the detector.</param>
/// <param name="encoding">
/// Encoding name, optionally followed by "/" and a language; the two parts
/// are passed to CheckMatch separately (language is null when absent).
/// </param>
/// <param name="id">Identifier passed to CheckMatch and used as the prefix of error reports.</param>
private void CheckEncoding(String testString, String encoding, String id) {
    // "name" or "name/language"
    String[] parts = ILOG.J2CsMapping.Text.RegExUtil.Split(encoding, "/");
    String enc = parts[0];
    String lang = (parts.Length > 1) ? parts[1] : null;

    try {
        CharsetDetector detector = new CharsetDetector();

        // NOTE: a dedicated UTF-32 conversion path that used to sit here is
        // disabled; every encoding goes through StringUtil.GetBytes.
        String converterName = enc;
        byte[] encoded = null;
        bool converted = false;
        do {
            try {
                encoded = ILOG.J2CsMapping.Util.StringUtil.GetBytes(testString, converterName);
                converted = true;
            } catch (NotSupportedException) {
                // In some runtimes, the ISO-2022-CN converter only converts
                // *to* Unicode - x-ISO-2022-CN-GB has to be used to convert
                // *from* Unicode, so retry once with that name.
                if (!converterName.Equals("ISO-2022-CN")) {
                    // Ignore any other converters that can't convert from
                    // Unicode.
                    return;
                }
                converterName = "x-ISO-2022-CN-GB";
            } catch (IOException) {
                // Ignore any encodings that this runtime doesn't support.
                return;
            }
        } while (!converted);

        // Same check twice: byte-array input, then stream input.
        detector.SetText(encoded);
        CheckMatch(detector, testString, enc, lang, id);

        detector.SetText(new MemoryStream(encoded));
        CheckMatch(detector, testString, enc, lang, id);
    } catch (Exception e) {
        // Report the failure through the test harness and dump the stack
        // trace to standard error.
        Errln(id + ": " + e.ToString() + "enc=" + enc);
        Console.Error.WriteLine(e.StackTrace);
    }
}