Esempio n. 1
0
        /// <summary>
        /// Verifies that English text encoded with curly quotes is detected as
        /// windows-1252, and that the same kind of text without them is detected
        /// as ISO-8859-1.
        /// </summary>
        public void TestC1Bytes()
        {
            String plainText = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";

            // The curly quotes (U+201C / U+201D) are what yield the "C1" bytes
            // the sample text refers to once encoded as windows-1252.
            String quotedText = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \u201CC1\u201D bytes.";

            byte[] isoBytes     = ILOG.J2CsMapping.Util.StringUtil.GetBytes(plainText, "ISO-8859-1");
            byte[] windowsBytes = ILOG.J2CsMapping.Util.StringUtil.GetBytes(quotedText, "windows-1252");

            CharsetDetector detector = new CharsetDetector();

            // Step 1: input containing C1 bytes. A failure here ends the test early.
            detector.SetText(windowsBytes);
            CharsetMatch windowsMatch = detector.Detect();

            if (windowsMatch.GetName() != "windows-1252")
            {
                Errln("Text with C1 bytes not correctly detected as windows-1252.");
                return;
            }

            // Step 2: input without C1 bytes, using the same detector instance.
            detector.SetText(isoBytes);
            CharsetMatch isoMatch = detector.Detect();

            if (isoMatch.GetName() == "ISO-8859-1")
            {
                return;
            }

            Errln("Text without C1 bytes not correctly detected as ISO-8859-1.");
        }
Esempio n. 2
0
        /// <summary>
        /// Exercises the detector's input filter with a short French sentence
        /// surrounded by angle-bracket tags containing English words. With the
        /// filter enabled the detected language is expected to be "fr"; with it
        /// disabled, "en".
        /// </summary>
        public void TestInputFilter()
        {
            String s = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>";

            byte[]          bytes = ILOG.J2CsMapping.Util.StringUtil.GetBytes(s, "ISO-8859-1");
            CharsetDetector det   = new CharsetDetector();
            CharsetMatch    m;

            det.EnableInputFilter(true);
            if (!det.InputFilterEnabled())
            {
                Errln("input filter should be enabled");
            }

            det.SetText(bytes);
            m = det.Detect();

            // Static String.Equals is null-safe: a match that reports no language
            // is logged as a failure instead of throwing NullReferenceException.
            if (!String.Equals(m.GetLanguage(), "fr"))
            {
                Errln("input filter did not strip markup!");
            }

            // Same bytes again, this time with the filter switched off.
            det.EnableInputFilter(false);
            det.SetText(bytes);
            m = det.Detect();

            if (!String.Equals(m.GetLanguage(), "en"))
            {
                Errln("unfiltered input did not detect as English!");
            }
        }
Esempio n. 3
0
        /// <summary>
        /// Smoke test: runs detection over inputs of zero to four bytes and only
        /// requires that nothing throws. The detection results are not inspected.
        /// </summary>
        public void TestShortInput()
        {
            // The shortest input that should produce a positive detection result
            // is two bytes (a UTF-16 BOM).
            // TODO: Detector confidence levels need to be refined for very short
            // input. They are too high now for some charsets that happen to be
            // compatible with a few bytes of input.
            byte[][] tinyInputs = new byte[][] {
                new byte[] {},
                new byte[] { (byte)0x0a },
                new byte[] { (byte)'A', (byte)'B' },
                new byte[] { (byte)'A', (byte)'B', (byte)'C' },
                new byte[] { (byte)'A', (byte)'B', (byte)'C', (byte)'D' }
            };

            CharsetDetector detector = new CharsetDetector();

            foreach (byte[] input in tinyInputs)
            {
                detector.SetText(input);

                // The match is deliberately discarded; surviving the call is the test.
                detector.Detect();
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Encodes an Arabic sample string as big-endian and little-endian UTF-16
        /// and checks that the detector names each encoding correctly. Also reads
        /// the confidence and match type of the little-endian match for coverage.
        /// </summary>
        public void TestUTF16()
        {
            // The first escape previously lacked its backslash ("u0623"), which
            // put the ASCII letters "u0623" into the sample instead of U+0623.
            String source = "\u0623\u0648\u0631\u0648\u0628\u0627, \u0628\u0631\u0645\u062c\u064a\u0627\u062a "
                            + "\u0627\u0644\u062d\u0627\u0633\u0648\u0628 \u002b\u0020\u0627\u0646\u062a\u0631\u0646\u064a\u062a";

            byte[]          beBytes = ILOG.J2CsMapping.Util.StringUtil.GetBytes(source, "UnicodeBig");
            byte[]          leBytes = ILOG.J2CsMapping.Util.StringUtil.GetBytes(source, "UnicodeLittle");
            CharsetDetector det     = new CharsetDetector();
            CharsetMatch    m;

            det.SetText(beBytes);
            m = det.Detect();

            if (!m.GetName().Equals("UTF-16BE"))
            {
                Errln("Encoding detection failure: expected UTF-16BE, got "
                      + m.GetName());
            }

            det.SetText(leBytes);
            m = det.Detect();

            if (!m.GetName().Equals("UTF-16LE"))
            {
                Errln("Encoding detection failure: expected UTF-16LE, got "
                      + m.GetName());
            }

            // Jitterbug 4451, for coverage. These checks apply to the
            // little-endian match, which is the last one detected.
            int confidence = m.GetConfidence();

            if (confidence != 100)
            {
                Errln("Did not get the expected confidence level " + confidence);
            }

            int matchType = m.GetMatchType();

            if (matchType != 0)
            {
                Errln("Did not get the expected matchType level " + matchType);
            }
        }
Esempio n. 5
0
        /// <summary>
        /// Runs detection on <paramref name="det"/> and reports, via Errln, any
        /// mismatch in the detected encoding name, the detected language, or the
        /// text decoded from the match.
        /// </summary>
        /// <param name="det">Detector to query; this method calls Detect() without
        /// setting any text, so the caller is expected to have supplied the input.</param>
        /// <param name="testString">Text the decoded match is expected to equal.</param>
        /// <param name="encoding">Expected encoding name.</param>
        /// <param name="language">Expected language, or null when the match is
        /// expected to report no language.</param>
        /// <param name="id">Label prefixed to every error message.</param>
        private void CheckMatch(CharsetDetector det, String testString,
                                String encoding, String language, String id)
        {
            CharsetMatch m = det.Detect();
            String       decoded;

            if (!m.GetName().Equals(encoding))
            {
                Errln(id + ": encoding detection failure - expected " + encoding
                      + ", got " + m.GetName());
                // With the wrong encoding the remaining checks are not meaningful.
                return;
            }

            String charsetMatchLanguage = m.GetLanguage();

            // Static String.Equals handles null on either side. The previous
            // instance call dereferenced a null detected language before the
            // explicit null comparisons could run.
            if (!String.Equals(charsetMatchLanguage, language))
            {
                Errln(id + ", " + encoding
                      + ": language detection failure - expected " + language
                      + ", got " + charsetMatchLanguage);
            }

            // The decode round-trip checks are skipped for UTF-32 encodings.
            // NOTE(review): presumably because decoding UTF-32 is not supported
            // here - confirm. Ordinal comparison keeps the prefix test
            // independent of the current culture.
            if (encoding.StartsWith("UTF-32", StringComparison.Ordinal))
            {
                return;
            }

            decoded = m.GetString();

            if (!testString.Equals(decoded))
            {
                Errln(id + ", " + encoding
                      + ": getString() didn't return the original string!");
            }

            decoded = StringFromReader(m.GetReader());

            if (!testString.Equals(decoded))
            {
                Errln(id + ", " + encoding
                      + ": getReader() didn't yield the original string!");
            }
        }