/// <summary>
/// Encodes one English sample as ISO-8859-1 and another (which adds curly
/// quotes around "C1") as windows-1252, then checks that the detector names
/// each encoding correctly. Stops after the first failure.
/// </summary>
public void TestC1Bytes()
{
    String plainText = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
    String textWithC1 = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \u201CC1\u201D bytes.";

    // Both samples are encoded up front, before any detection is attempted.
    byte[] isoBytes = ILOG.J2CsMapping.Util.StringUtil.GetBytes(plainText, "ISO-8859-1");
    byte[] windowsBytes = ILOG.J2CsMapping.Util.StringUtil.GetBytes(textWithC1, "windows-1252");

    CharsetDetector detector = new CharsetDetector();

    // Phase 1: the sample with the curly quotes must come back as windows-1252.
    detector.SetText(windowsBytes);
    CharsetMatch windowsMatch = detector.Detect();
    if (windowsMatch.GetName() != "windows-1252")
    {
        Errln("Text with C1 bytes not correctly detected as windows-1252.");
        return;
    }

    // Phase 2: the plain sample must come back as ISO-8859-1.
    detector.SetText(isoBytes);
    CharsetMatch isoMatch = detector.Detect();
    bool detectedAsIso = isoMatch.GetName() == "ISO-8859-1";
    if (!detectedAsIso)
    {
        Errln("Text without C1 bytes not correctly detected as ISO-8859-1.");
    }
}
/// <summary>
/// Feeds the detector a short French sentence surrounded by English words
/// wrapped in angle brackets. With the input filter enabled the detected
/// language is expected to be "fr"; with it disabled, "en".
/// </summary>
public void TestInputFilter()
{
    String markedUpText = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>";
    byte[] encoded = ILOG.J2CsMapping.Util.StringUtil.GetBytes(markedUpText, "ISO-8859-1");
    CharsetDetector detector = new CharsetDetector();

    // --- Filter on ---
    detector.EnableInputFilter(true);
    bool filterIsOn = detector.InputFilterEnabled();
    if (!filterIsOn)
    {
        Errln("input filter should be enabled");
    }

    detector.SetText(encoded);
    CharsetMatch filteredMatch = detector.Detect();
    String filteredLanguage = filteredMatch.GetLanguage();
    if (!filteredLanguage.Equals("fr"))
    {
        Errln("input filter did not strip markup!");
    }

    // --- Filter off: same bytes, detector re-primed ---
    detector.EnableInputFilter(false);
    detector.SetText(encoded);
    CharsetMatch unfilteredMatch = detector.Detect();
    String unfilteredLanguage = unfilteredMatch.GetLanguage();
    if (!unfilteredLanguage.Equals("en"))
    {
        Errln("unfiltered input did not detect as English!");
    }
}
/// <summary>
/// Smoke test: runs detection over byte strings of length 0 to 4 and only
/// requires that nothing throws. The detection results are not inspected.
/// </summary>
public void TestShortInput()
{
    // The shortest input that should yield a positive detection result is
    // two bytes (a UTF-16 BOM).
    // TODO: Detector confidence levels need refining for very short input;
    // they are currently too high for some charsets that merely happen to
    // be compatible with a few bytes of input.
    byte[][] samples = new byte[][]
    {
        new byte[] {},
        new byte[] { (byte)0x0a },
        new byte[] { (byte)'A', (byte)'B' },
        new byte[] { (byte)'A', (byte)'B', (byte)'C' },
        new byte[] { (byte)'A', (byte)'B', (byte)'C', (byte)'D' }
    };

    CharsetDetector detector = new CharsetDetector();

    foreach (byte[] sample in samples)
    {
        detector.SetText(sample);
        // Result intentionally discarded; only the absence of a crash matters.
        detector.Detect();
    }
}
/// <summary>
/// Encodes an Arabic sample string with the "UnicodeBig" and "UnicodeLittle"
/// encodings and checks that the detector reports UTF-16BE and UTF-16LE
/// respectively. For the little-endian match it also checks that the
/// confidence is 100 and the match type is 0.
/// </summary>
public void TestUTF16()
{
    // Every character of the sample is spelled as a \u escape; the leading
    // one must be the escape for U+0623, not the plain ASCII text "u0623".
    String source = "\u0623\u0648\u0631\u0648\u0628\u0627, \u0628\u0631\u0645\u062c\u064a\u0627\u062a "
        + "\u0627\u0644\u062d\u0627\u0633\u0648\u0628 \u002b\u0020\u0627\u0646\u062a\u0631\u0646\u064a\u062a";

    byte[] beBytes = ILOG.J2CsMapping.Util.StringUtil.GetBytes(source, "UnicodeBig");
    byte[] leBytes = ILOG.J2CsMapping.Util.StringUtil.GetBytes(source, "UnicodeLittle");

    CharsetDetector det = new CharsetDetector();
    CharsetMatch m;

    // Big-endian sample.
    det.SetText(beBytes);
    m = det.Detect();
    if (!m.GetName().Equals("UTF-16BE"))
    {
        Errln("Encoding detection failure: expected UTF-16BE, got " + m.GetName());
    }

    // Little-endian sample.
    det.SetText(leBytes);
    m = det.Detect();
    if (!m.GetName().Equals("UTF-16LE"))
    {
        Errln("Encoding detection failure: expected UTF-16LE, got " + m.GetName());
    }

    // Jitterbug 4451, for coverage: these accessors are exercised on the
    // little-endian match, which is the last value assigned to m.
    int confidence = m.GetConfidence();
    if (confidence != 100)
    {
        Errln("Did not get the expected confidence level " + confidence);
    }

    int matchType = m.GetMatchType();
    if (matchType != 0)
    {
        Errln("Did not get the expected matchType level " + matchType);
    }
}
/// <summary>
/// Runs detection on an already-primed detector and reports, through
/// Errln, any disagreement with the expected encoding, the expected
/// language, or the original text.
/// </summary>
/// <param name="det">Detector whose input has already been set by the caller.</param>
/// <param name="testString">Text the decoded result is compared against.</param>
/// <param name="encoding">Expected charset name of the best match.</param>
/// <param name="language">Expected language of the best match; null when no language is expected.</param>
/// <param name="id">Label prefixed to every error message.</param>
private void CheckMatch(CharsetDetector det, String testString, String encoding, String language, String id)
{
    CharsetMatch m = det.Detect();

    // A wrong encoding makes the remaining checks meaningless, so stop here.
    if (!m.GetName().Equals(encoding))
    {
        Errln(id + ": encoding detection failure - expected " + encoding + ", got " + m.GetName());
        return;
    }

    String charsetMatchLanguage = m.GetLanguage();

    // The static String.Equals treats two nulls as equal and null versus
    // non-null as different, so a null detected language is reported as a
    // mismatch instead of throwing NullReferenceException.
    if (!String.Equals(charsetMatchLanguage, language))
    {
        Errln(id + ", " + encoding + ": language detection failure - expected " + language + ", got " + charsetMatchLanguage);
    }

    // The decoding checks below are skipped for UTF-32 encodings.
    // Ordinal comparison: charset names are identifiers, not linguistic text.
    if (encoding.StartsWith("UTF-32", StringComparison.Ordinal))
    {
        return;
    }

    String decodedFromString = m.GetString();
    if (!testString.Equals(decodedFromString))
    {
        Errln(id + ", " + encoding + ": getString() didn't return the original string!");
    }

    String decodedFromReader = StringFromReader(m.GetReader());
    if (!testString.Equals(decodedFromReader))
    {
        Errln(id + ", " + encoding + ": getReader() didn't yield the original string!");
    }
}