Esempi in C# (CSharp) per CharsetDetector.SetText

Linguaggio di programmazione: C# (CSharp)

Classe/tipologia: CharsetDetector

Metodo/funzione: SetText

Esempi su hotexamples.com: 5

CharsetDetector.SetText in C# (CSharp): 5 esempi trovati. Questi sono i migliori esempi reali in C# (CSharp) per CharsetDetector.SetText, estratti da progetti open source. Li puoi valutare, per aiutarci a migliorare la qualità dei nostri esempi.

Metodi utilizzati di frequente

Mostra / Nascondi

Feed(30)

DataEnd(30)

DetectFromBytes(27)

DetectFromFile(21)

DetectFromStream(11)

GuessCharsetOfFile(7)

SetText(5)

Detect(5)

Reset(4)

detect(1)

SetDeclaredEncoding(1)

GetString(1)

IsDone(1)

InputFilterEnabled(1)

GetReader(1)

FeedAsync(1)

EnableInputFilter(1)

setText(1)

Esempio n. 1

Mostra file

        /// <summary>
        /// Checks that the detector distinguishes windows-1252 from ISO-8859-1.
        /// The windows-1252 sample contains curly quotes (U+201C / U+201D), which
        /// the failure messages refer to as "C1" bytes; the ISO-8859-1 sample is
        /// plain English text without them.
        /// </summary>
        public void TestC1Bytes()
        {
            String plainText = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
            String quotedText = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \u201CC1\u201D bytes.";

            byte[] isoBytes = ILOG.J2CsMapping.Util.StringUtil.GetBytes(plainText, "ISO-8859-1");
            byte[] windowsBytes = ILOG.J2CsMapping.Util.StringUtil.GetBytes(quotedText, "windows-1252");

            CharsetDetector detector = new CharsetDetector();

            // First pass: the sample with curly quotes must come back as windows-1252.
            detector.SetText(windowsBytes);
            CharsetMatch windowsMatch = detector.Detect();
            if (windowsMatch.GetName() != "windows-1252")
            {
                Errln("Text with C1 bytes not correctly detected as windows-1252.");

                // Stop here; the ISO-8859-1 check is skipped after a first failure.
                return;
            }

            // Second pass: the same detector instance is reused for the plain sample.
            detector.SetText(isoBytes);
            CharsetMatch isoMatch = detector.Detect();
            if (isoMatch.GetName() != "ISO-8859-1")
            {
                Errln("Text without C1 bytes not correctly detected as ISO-8859-1.");
            }
        }

Esempio n. 2

Mostra file

        /// <summary>
        /// Exercises the detector's input filter on text that is mostly
        /// angle-bracketed English words wrapped around a short French sentence.
        /// With the filter enabled the detected language is expected to be "fr";
        /// with it disabled the same bytes are expected to be detected as "en".
        /// </summary>
        public void TestInputFilter()
        {
            String markup = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>";
            byte[] encoded = ILOG.J2CsMapping.Util.StringUtil.GetBytes(markup, "ISO-8859-1");
            CharsetDetector detector = new CharsetDetector();

            // Turn the filter on and confirm the detector reports it as enabled.
            detector.EnableInputFilter(true);
            if (!detector.InputFilterEnabled())
            {
                Errln("input filter should be enabled");
            }

            // Filtered pass.
            detector.SetText(encoded);
            CharsetMatch filteredMatch = detector.Detect();
            if (!filteredMatch.GetLanguage().Equals("fr"))
            {
                Errln("input filter did not strip markup!");
            }

            // Unfiltered pass over the very same bytes.
            detector.EnableInputFilter(false);
            detector.SetText(encoded);
            CharsetMatch unfilteredMatch = detector.Detect();
            if (!unfilteredMatch.GetLanguage().Equals("en"))
            {
                Errln("unfiltered input did not detect as English!");
            }
        }

Esempio n. 3

Mostra file

        /// <summary>
        /// Smoke test: feeds byte arrays of length 0, 1, 2, 3 and 4 to a single
        /// detector and runs detection on each. Nothing is asserted about the
        /// results; the test only requires that the calls complete.
        /// </summary>
        public void TestShortInput()
        {
            // The shortest input that should give a positive detection result is
            // two bytes (a UTF-16 BOM).
            // TODO: Detector confidence levels need to be refined for very short
            // input. They are too high now for some charsets that happen to be
            // compatible with a few bytes of input.
            byte[][] samples =
            {
                new byte[0],
                new byte[] { 0x0a },
                new byte[] { (byte)'A', (byte)'B' },
                new byte[] { (byte)'A', (byte)'B', (byte)'C' },
                new byte[] { (byte)'A', (byte)'B', (byte)'C', (byte)'D' }
            };

            CharsetDetector detector = new CharsetDetector();

            foreach (byte[] sample in samples)
            {
                detector.SetText(sample);

                // The match itself is intentionally not inspected.
                CharsetMatch ignoredMatch = detector.Detect();
            }
        }

Esempio n. 4

Mostra file

        /// <summary>
        /// Encodes an Arabic sample string with the "UnicodeBig" and
        /// "UnicodeLittle" converters and checks that the detector names the
        /// results "UTF-16BE" and "UTF-16LE" respectively. It then checks the
        /// confidence (expected 100) and match type (expected 0) of the last
        /// match, i.e. the one obtained for the little-endian bytes.
        /// </summary>
        public void TestUTF16()
        {
            // Every character of the Arabic text is written as a Unicode escape;
            // the leading escape needs its backslash like all the others,
            // otherwise the sample starts with the ASCII characters "u0623".
            String source = "\u0623\u0648\u0631\u0648\u0628\u0627, \u0628\u0631\u0645\u062c\u064a\u0627\u062a "
                            + "\u0627\u0644\u062d\u0627\u0633\u0648\u0628 \u002b\u0020\u0627\u0646\u062a\u0631\u0646\u064a\u062a";

            byte[]          beBytes = ILOG.J2CsMapping.Util.StringUtil.GetBytes(source, "UnicodeBig");
            byte[]          leBytes = ILOG.J2CsMapping.Util.StringUtil.GetBytes(source, "UnicodeLittle");
            CharsetDetector det     = new CharsetDetector();
            CharsetMatch    m;

            // Big-endian pass.
            det.SetText(beBytes);
            m = det.Detect();

            if (!m.GetName().Equals("UTF-16BE"))
            {
                Errln("Encoding detection failure: expected UTF-16BE, got "
                      + m.GetName());
            }

            // Little-endian pass, reusing the same detector.
            det.SetText(leBytes);
            m = det.Detect();

            if (!m.GetName().Equals("UTF-16LE"))
            {
                Errln("Encoding detection failure: expected UTF-16LE, got "
                      + m.GetName());
            }

            // Jitterbug 4451, for coverage: exercise the accessors on the
            // little-endian match.
            int confidence = m.GetConfidence();

            if (confidence != 100)
            {
                Errln("Did not get the expected confidence level " + confidence);
            }

            int matchType = m.GetMatchType();

            if (matchType != 0)
            {
                Errln("Did not get the expected matchType level " + matchType);
            }
        }

Esempio n. 5

Mostra file

        /// <summary>
        /// Encodes <paramref name="testString"/> with the requested charset and
        /// verifies detection twice through <c>CheckMatch</c>: once with the bytes
        /// handed to the detector as an array and once as a stream.
        /// </summary>
        /// <param name="testString">Text to encode and then detect.</param>
        /// <param name="encoding">
        /// Charset name, optionally followed by "/" and a language code
        /// (the part after the first "/" is passed on as the language).
        /// </param>
        /// <param name="id">Identifier prefixed to any failure message.</param>
        /// <remarks>
        /// The check is skipped silently when the text cannot be encoded with the
        /// requested charset (NotSupportedException or IOException from the
        /// encoder). Any other exception is reported through <c>Errln</c> and its
        /// stack trace is written to standard error.
        /// </remarks>
        private void CheckEncoding(String testString, String encoding, String id)
        {
            String[] split = ILOG.J2CsMapping.Text.RegExUtil.Split(encoding, "/");
            String enc = split[0];
            String lang = (split.Length > 1) ? split[1] : null;

            try
            {
                CharsetDetector det = new CharsetDetector();
                byte[] bytes;
                String from = enc;

                // NOTE: the code this was ported from had a disabled special case
                // for "UTF-32" encodings here; all charsets go through GetBytes.
                while (true)
                {
                    try
                    {
                        bytes = ILOG.J2CsMapping.Util.StringUtil.GetBytes(testString, from);
                        break;
                    }
                    catch (NotSupportedException)
                    {
                        // In some runtimes, the ISO-2022-CN converter
                        // only converts *to* Unicode - we have to use
                        // x-ISO-2022-CN-GB to convert *from* Unicode.
                        // Ignore any other converters that can't
                        // convert from Unicode.
                        if (!from.Equals("ISO-2022-CN"))
                        {
                            return;
                        }

                        // Retry once with the alternate converter name; a second
                        // failure falls into the return above.
                        from = "x-ISO-2022-CN-GB";
                    }
                    catch (IOException)
                    {
                        // Ignore any encodings that this runtime
                        // doesn't support.
                        return;
                    }
                }

                // Array-based input.
                det.SetText(bytes);
                CheckMatch(det, testString, enc, lang, id);

                // Stream-based input; the stream stays open until CheckMatch is
                // done in case the detector reads from it lazily.
                using (MemoryStream stream = new MemoryStream(bytes))
                {
                    det.SetText(stream);
                    CheckMatch(det, testString, enc, lang, id);
                }
            }
            catch (Exception e)
            {
                // A separating space keeps "enc=" from being glued to the end of
                // the exception text.
                Errln(id + ": " + e.ToString() + " enc=" + enc);
                Console.Error.WriteLine(e.StackTrace);
            }
        }