Exemple #1
0
        /// <summary>
        /// Detects the charset name of a byte array.
        /// </summary>
        /// <param name="bytes">The bytes to inspect.</param>
        /// <param name="language">The language set handed to the detector.</param>
        /// <param name="maxLength">
        /// Maximum number of bytes passed to the detector per call. Defaults to 1024;
        /// zero or a negative value (e.g. -1) passes the whole array in a single call.
        /// </param>
        /// <returns>
        /// <c>null</c> when <paramref name="bytes"/> is <c>null</c> or empty; otherwise the
        /// charset value left by the detector, which starts out as an empty string.
        /// </returns>
        public static string RecogCharset(byte[] bytes, NChardetLanguage language = NChardetLanguage.ALL, int maxLength = 1024)
        {
            if (bytes == null || bytes.Length == 0)
                return null;

            PSMDetector detector = new PSMDetector(language);
            string charset = String.Empty;

            if (maxLength <= 0)
            {
                // No chunking requested: hand over the whole buffer at once.
                detector.HandleData(bytes, bytes.Length, ref charset);
                return charset;
            }

            // Walk the buffer by offset instead of re-running Skip/Take/Count/ToArray
            // for every chunk; the offset never exceeds bytes.Length, so it cannot overflow.
            int offset = 0;
            while (offset < bytes.Length)
            {
                int chunkLength = Math.Min(maxLength, bytes.Length - offset);

                // A fresh, exactly-sized array per call, as the detector received before.
                byte[] chunk = new byte[chunkLength];
                Array.Copy(bytes, offset, chunk, 0, chunkLength);

                detector.HandleData(chunk, chunkLength, ref charset);

                // Stop as soon as the detector has written a charset name.
                if (!string.IsNullOrEmpty(charset))
                    break;

                offset += chunkLength;
            }

            return charset;
        }
Exemple #2
0
        /// <summary>
        /// Detects the <see cref="Encoding"/> of a byte array.
        /// </summary>
        /// <param name="bytes">The bytes to inspect.</param>
        /// <param name="language">The language set forwarded to <c>RecogCharset</c>.</param>
        /// <returns>
        /// The encoding looked up by the detected charset name, or
        /// <see cref="Encoding.Default"/> when the detected name is null or empty.
        /// </returns>
        /// <exception cref="ArgumentException">
        /// Raised by <see cref="Encoding.GetEncoding(string)"/> when the detected name is not
        /// a valid or supported code page name; this method does not catch it.
        /// </exception>
        public static Encoding RecogEncoding(byte[] bytes, NChardetLanguage language = NChardetLanguage.ALL)
        {
            string detectedName = RecogCharset(bytes, language);
            bool nothingDetected = string.IsNullOrEmpty(detectedName);

            return nothingDetected ? Encoding.Default : Encoding.GetEncoding(detectedName);
        }
        /// <summary>
        /// Maps the charset detected in a byte array to an <see cref="Encoding"/>.
        /// </summary>
        /// <param name="bytes">The bytes to inspect.</param>
        /// <param name="language">The language set forwarded to <c>RecogCharset</c>.</param>
        /// <returns>
        /// <see cref="Encoding.Default"/> when no charset name was detected (null or empty);
        /// otherwise the encoding registered under the detected name.
        /// </returns>
        /// <exception cref="ArgumentException">
        /// Propagated from <see cref="Encoding.GetEncoding(string)"/> when the detected name
        /// is not a valid or supported code page name.
        /// </exception>
        public static Encoding RecogEncoding(byte[] bytes, NChardetLanguage language = NChardetLanguage.ALL)
        {
            string name = RecogCharset(bytes, language);

            // Guard first: without a detected name there is nothing to look up.
            if (string.IsNullOrEmpty(name))
            {
                return Encoding.Default;
            }

            return Encoding.GetEncoding(name);
        }
        /// <summary>
        /// Detects the charset name of a byte array.
        /// </summary>
        /// <param name="bytes">The bytes to inspect.</param>
        /// <param name="language">The language set handed to the detector.</param>
        /// <param name="maxLength">
        /// Maximum number of bytes passed to the detector per call. Defaults to 1024;
        /// zero or a negative value (e.g. -1) passes the whole array in a single call.
        /// </param>
        /// <returns>
        /// <c>null</c> when <paramref name="bytes"/> is <c>null</c> or empty; otherwise the
        /// charset value left by the detector, which starts out as an empty string.
        /// </returns>
        public static string RecogCharset(byte[] bytes, NChardetLanguage language = NChardetLanguage.ALL, int maxLength = 1024)
        {
            if (bytes == null || bytes.Length == 0)
            {
                return null;
            }

            PSMDetector detector = new PSMDetector(language);
            string      charset  = String.Empty;

            if (maxLength > 0)
            {
                // Advance by offset rather than Skip/Take: the LINQ form re-enumerated the
                // query for each Count()/ToArray() call and multiplied maxLength * count,
                // which can overflow on very large inputs.
                for (int offset = 0; offset < bytes.Length;)
                {
                    int chunkLength = Math.Min(maxLength, bytes.Length - offset);

                    // A fresh, exactly-sized array per call, as the detector received before.
                    byte[] chunk = new byte[chunkLength];
                    Array.Copy(bytes, offset, chunk, 0, chunkLength);

                    detector.HandleData(chunk, chunkLength, ref charset);

                    // Stop feeding once the detector has written a charset name.
                    if (!string.IsNullOrEmpty(charset))
                    {
                        break;
                    }

                    offset += chunkLength;
                }
            }
            else
            {
                // No chunking requested: hand over the whole buffer at once.
                detector.HandleData(bytes, bytes.Length, ref charset);
            }

            return charset;
        }
Exemple #5
0
        /// <summary>
        /// Fills <c>mVerifier</c> (and, for some language sets, <c>mStatisticsData</c>) with the
        /// tables for the requested language set, then derives <c>mClassRunSampler</c> and
        /// <c>mClassItems</c> from them.
        /// </summary>
        /// <param name="currVerSet">
        /// Requested language set. Values below zero or not below
        /// <c>NChardetLanguage.NO_OF_LANGUAGES</c> are treated as <c>NChardetLanguage.ALL</c>.
        /// </param>
        protected void initVerifiers(NChardetLanguage currVerSet)
        {
            bool withinRange = currVerSet >= 0 && currVerSet < NChardetLanguage.NO_OF_LANGUAGES;
            NChardetLanguage selectedSet = withinRange ? currVerSet : NChardetLanguage.ALL;

            mVerifier = null;
            mStatisticsData = null;

            // Where a statistics table is present, each non-null entry sits at the same
            // index as the like-named verifier.
            switch (selectedSet)
            {
                case NChardetLanguage.TRADITIONAL_CHINESE:
                    mVerifier = new Verifier[]
                    {
                        new UTF8Verifier(),
                        new BIG5Verifier(),
                        new ISO2022CNVerifier(),
                        new EUCTWVerifier(),
                        new CP1252Verifier(),
                        new UCS2BEVerifier(),
                        new UCS2LEVerifier()
                    };
                    mStatisticsData = new EUCStatistics[]
                    {
                        null,
                        new Big5Statistics(),
                        null,
                        new EUCTWStatistics(),
                        null,
                        null,
                        null
                    };
                    break;

                case NChardetLanguage.KOREAN:
                    mVerifier = new Verifier[]
                    {
                        new UTF8Verifier(),
                        new EUCKRVerifier(),
                        new ISO2022KRVerifier(),
                        new CP1252Verifier(),
                        new UCS2BEVerifier(),
                        new UCS2LEVerifier()
                    };
                    break;

                case NChardetLanguage.SIMPLIFIED_CHINESE:
                    mVerifier = new Verifier[]
                    {
                        new UTF8Verifier(),
                        new GB2312Verifier(),
                        new GB18030Verifier(),
                        new ISO2022CNVerifier(),
                        new HZVerifier(),
                        new CP1252Verifier(),
                        new UCS2BEVerifier(),
                        new UCS2LEVerifier()
                    };
                    break;

                case NChardetLanguage.JAPANESE:
                    mVerifier = new Verifier[]
                    {
                        new UTF8Verifier(),
                        new SJISVerifier(),
                        new EUCJPVerifier(),
                        new ISO2022JPVerifier(),
                        new CP1252Verifier(),
                        new UCS2BEVerifier(),
                        new UCS2LEVerifier()
                    };
                    break;

                case NChardetLanguage.CHINESE:
                    mVerifier = new Verifier[]
                    {
                        new UTF8Verifier(),
                        new GB2312Verifier(),
                        new GB18030Verifier(),
                        new BIG5Verifier(),
                        new ISO2022CNVerifier(),
                        new HZVerifier(),
                        new EUCTWVerifier(),
                        new CP1252Verifier(),
                        new UCS2BEVerifier(),
                        new UCS2LEVerifier()
                    };
                    mStatisticsData = new EUCStatistics[]
                    {
                        null,
                        new GB2312Statistics(),
                        null,
                        new Big5Statistics(),
                        null,
                        null,
                        new EUCTWStatistics(),
                        null,
                        null,
                        null
                    };
                    break;

                case NChardetLanguage.ALL:
                    mVerifier = new Verifier[]
                    {
                        new UTF8Verifier(),
                        new SJISVerifier(),
                        new EUCJPVerifier(),
                        new ISO2022JPVerifier(),
                        new EUCKRVerifier(),
                        new ISO2022KRVerifier(),
                        new BIG5Verifier(),
                        new EUCTWVerifier(),
                        new GB2312Verifier(),
                        new GB18030Verifier(),
                        new ISO2022CNVerifier(),
                        new HZVerifier(),
                        new CP1252Verifier(),
                        new UCS2BEVerifier(),
                        new UCS2LEVerifier()
                    };
                    mStatisticsData = new EUCStatistics[]
                    {
                        null,
                        null,
                        new EUCJPStatistics(),
                        null,
                        new EUCKRStatistics(),
                        null,
                        new Big5Statistics(),
                        new EUCTWStatistics(),
                        new GB2312Statistics(),
                        null,
                        null,
                        null,
                        null,
                        null,
                        null
                    };
                    break;
            }

            // The sampler flag is on only for sets that supplied a statistics table.
            mClassRunSampler = (mStatisticsData != null);
            mClassItems = mVerifier.Length;
        }
Exemple #6
0
 /// <summary>
 /// Creates a detector for the given language set: runs initVerifiers with
 /// <paramref name="langFlag"/> and then calls Reset().
 /// </summary>
 /// <param name="langFlag">Language set forwarded unchanged to initVerifiers.</param>
 public PSMDetector(NChardetLanguage langFlag)
 {
     initVerifiers(langFlag);
     Reset();
 }
        /// <summary>
        /// Fills <c>mVerifier</c> and <c>mStatisticsData</c> with the tables for the requested
        /// language set, then derives <c>mClassRunSampler</c> and <c>mClassItems</c> from them.
        /// </summary>
        /// <param name="currVerSet">
        /// Requested language set. <c>NChardetLanguage.CHINESE</c> selects the reduced Chinese
        /// tables; every other value, including out-of-range values, selects the
        /// <c>NChardetLanguage.ALL</c> tables.
        /// </param>
        protected void initVerifiers(NChardetLanguage currVerSet)
        {
            NChardetLanguage currVerifierSet;

            if (currVerSet >= 0 && currVerSet < NChardetLanguage.NO_OF_LANGUAGES)
            {
                currVerifierSet = currVerSet;
            }
            else
            {
                currVerifierSet = NChardetLanguage.ALL;
            }

            mVerifier       = null;
            mStatisticsData = null;

            // Each non-null statistics entry sits at the same index as the like-named verifier.
            if (currVerifierSet == NChardetLanguage.CHINESE)
            {
                mVerifier = new Verifier[] {
                    new UTF8Verifier(),
                    new GB2312Verifier(),
                    new UCS2BEVerifier(),
                    new UCS2LEVerifier()
                };
                mStatisticsData = new EUCStatistics[] {
                    null,
                    new GB2312Statistics(),
                    null,
                    null
                };
            }
            else
            {
                // ALL, plus any in-range language without a dedicated table here. Such a
                // value previously left mVerifier null and crashed on mVerifier.Length
                // below; it now gets the same ALL fallback as out-of-range values.
                mVerifier = new Verifier[] {
                    new UTF8Verifier(),
                    new SJISVerifier(),
                    new EUCJPVerifier(),
                    new ISO2022JPVerifier(),
                    new EUCKRVerifier(),
                    new ISO2022KRVerifier(),
                    new BIG5Verifier(),
                    new EUCTWVerifier(),
                    new GB2312Verifier(),
                    new GB18030Verifier(),
                    new ISO2022CNVerifier(),
                    new HZVerifier(),
                    new CP1252Verifier(),
                    new UCS2BEVerifier(),
                    new UCS2LEVerifier()
                };
                mStatisticsData = new EUCStatistics[] {
                    null,
                    null,
                    new EUCJPStatistics(),
                    null,
                    new EUCKRStatistics(),
                    null,
                    new Big5Statistics(),
                    new EUCTWStatistics(),
                    new GB2312Statistics(),
                    null,
                    null,
                    null,
                    null,
                    null,
                    null
                };
            }

            mClassRunSampler = (mStatisticsData != null);
            mClassItems      = mVerifier.Length;
        }
 /// <summary>
 /// Creates a detector for the given language set: runs initVerifiers with
 /// <paramref name="langFlag"/> and then calls Reset().
 /// </summary>
 /// <param name="langFlag">Language set forwarded unchanged to initVerifiers.</param>
 public PSMDetector(NChardetLanguage langFlag)
 {
     initVerifiers(langFlag);
     Reset();
 }