/// <summary>
/// Recognizes the charset of a byte array by feeding it to a <c>PSMDetector</c>.
/// </summary>
/// <param name="bytes">The byte array to inspect.</param>
/// <param name="language">The language used to construct the detector.</param>
/// <param name="maxLength">
/// Maximum number of bytes handed to the detector per call. The default is 1024;
/// zero or a negative value (e.g. -1) passes the whole array in a single call.
/// </param>
/// <returns>
/// The charset name written by the detector; an empty string when the detector wrote none;
/// <c>null</c> when <paramref name="bytes"/> is null or empty.
/// </returns>
public static string RecogCharset(byte[] bytes, NChardetLanguage language = NChardetLanguage.ALL, int maxLength = 1024)
{
    if (bytes == null || bytes.Length == 0)
    {
        return null;
    }

    PSMDetector detector = new PSMDetector(language);
    string charset = String.Empty;

    if (maxLength <= 0)
    {
        // No chunking requested: hand over the whole buffer at once.
        detector.HandleData(bytes, bytes.Length, ref charset);
        return charset;
    }

    // Walk the buffer by offset instead of Skip/Take so each chunk is copied exactly once
    // and the position can never overflow (offset + chunkLength <= bytes.Length).
    int offset = 0;
    while (offset < bytes.Length)
    {
        int chunkLength = Math.Min(maxLength, bytes.Length - offset);

        // The detector receives its own exact-length array for every chunk.
        byte[] chunk = new byte[chunkLength];
        Array.Copy(bytes, offset, chunk, 0, chunkLength);

        detector.HandleData(chunk, chunkLength, ref charset);
        if (!string.IsNullOrEmpty(charset))
        {
            // Stop as soon as the detector has written a charset name.
            break;
        }

        offset += chunkLength;
    }

    return charset;
}
/// <summary>
/// Recognizes the <c>Encoding</c> of a byte array.
/// </summary>
/// <param name="bytes">The byte array to inspect.</param>
/// <param name="language">The language forwarded to <c>RecogCharset</c>.</param>
/// <returns>
/// The encoding looked up from the recognized charset name, or <c>Encoding.Default</c>
/// when <c>RecogCharset</c> yields a null or empty name.
/// </returns>
/// <exception cref="ArgumentException">
/// Propagated from <c>Encoding.GetEncoding</c> when the recognized name is not a supported encoding.
/// </exception>
public static Encoding RecogEncoding(byte[] bytes, NChardetLanguage language = NChardetLanguage.ALL)
{
    string detectedName = RecogCharset(bytes, language);

    return string.IsNullOrEmpty(detectedName)
        ? Encoding.Default
        : Encoding.GetEncoding(detectedName);
}
/// <summary>
/// Maps the charset recognized in a byte array to an <c>Encoding</c> instance.
/// </summary>
/// <param name="bytes">The byte array to inspect.</param>
/// <param name="language">The language forwarded to <c>RecogCharset</c>.</param>
/// <returns>
/// <c>Encoding.Default</c> when no charset name was recognized (null or empty);
/// otherwise the encoding returned by <c>Encoding.GetEncoding</c> for that name.
/// </returns>
/// <exception cref="ArgumentException">
/// Propagated from <c>Encoding.GetEncoding</c> when the recognized name is not a supported encoding.
/// </exception>
public static Encoding RecogEncoding(byte[] bytes, NChardetLanguage language = NChardetLanguage.ALL)
{
    string name = RecogCharset(bytes, language);

    // Nothing recognized: fall back to the default encoding.
    if (string.IsNullOrEmpty(name))
    {
        return Encoding.Default;
    }

    return Encoding.GetEncoding(name);
}
/// <summary>
/// Recognizes the charset of a byte array by feeding it to a <c>PSMDetector</c>.
/// </summary>
/// <param name="bytes">The byte array to inspect.</param>
/// <param name="language">The language used to construct the detector.</param>
/// <param name="maxLength">
/// Maximum number of bytes handed to the detector per call. The default is 1024;
/// zero or a negative value (e.g. -1) passes the whole array in a single call.
/// </param>
/// <returns>
/// The charset name written by the detector; an empty string when the detector wrote none;
/// <c>null</c> when <paramref name="bytes"/> is null or empty.
/// </returns>
public static string RecogCharset(byte[] bytes, NChardetLanguage language = NChardetLanguage.ALL, int maxLength = 1024)
{
    if (bytes == null || bytes.Length == 0)
    {
        return null;
    }

    PSMDetector detector = new PSMDetector(language);
    string charset = String.Empty;

    if (maxLength <= 0)
    {
        // No chunking requested: hand over the whole buffer at once.
        detector.HandleData(bytes, bytes.Length, ref charset);
        return charset;
    }

    // Advance by offset rather than Skip(maxLength * count): the multiplication could
    // overflow for very large chunk sizes, and the lazy Skip/Take sequence was being
    // enumerated several times per chunk.
    for (int offset = 0; offset < bytes.Length; )
    {
        int chunkLength = Math.Min(maxLength, bytes.Length - offset);

        // The detector receives its own exact-length array for every chunk.
        byte[] chunk = new byte[chunkLength];
        Array.Copy(bytes, offset, chunk, 0, chunkLength);

        detector.HandleData(chunk, chunkLength, ref charset);
        if (!string.IsNullOrEmpty(charset))
        {
            // Stop as soon as the detector has written a charset name.
            break;
        }

        offset += chunkLength;
    }

    return charset;
}
/// <summary>
/// Selects the verifier table (and, for some languages, the statistics table) for a language.
/// </summary>
/// <remarks>
/// Values outside the range [0, <c>NO_OF_LANGUAGES</c>) are treated as <c>ALL</c>.
/// Where a statistics table is assigned it has the same length as the verifier table,
/// with non-null entries placed at the index of the matching verifier.
/// </remarks>
/// <param name="currVerSet">The requested language.</param>
protected void initVerifiers(NChardetLanguage currVerSet)
{
    bool inRange = currVerSet >= 0 && currVerSet < NChardetLanguage.NO_OF_LANGUAGES;
    NChardetLanguage selected = inRange ? currVerSet : NChardetLanguage.ALL;

    mVerifier = null;
    mStatisticsData = null;

    switch (selected)
    {
        case NChardetLanguage.TRADITIONAL_CHINESE:
            mVerifier = new Verifier[]
            {
                new UTF8Verifier(),
                new BIG5Verifier(),
                new ISO2022CNVerifier(),
                new EUCTWVerifier(),
                new CP1252Verifier(),
                new UCS2BEVerifier(),
                new UCS2LEVerifier()
            };
            mStatisticsData = new EUCStatistics[]
            {
                null,
                new Big5Statistics(),
                null,
                new EUCTWStatistics(),
                null,
                null,
                null
            };
            break;

        case NChardetLanguage.KOREAN:
            mVerifier = new Verifier[]
            {
                new UTF8Verifier(),
                new EUCKRVerifier(),
                new ISO2022KRVerifier(),
                new CP1252Verifier(),
                new UCS2BEVerifier(),
                new UCS2LEVerifier()
            };
            break;

        case NChardetLanguage.SIMPLIFIED_CHINESE:
            mVerifier = new Verifier[]
            {
                new UTF8Verifier(),
                new GB2312Verifier(),
                new GB18030Verifier(),
                new ISO2022CNVerifier(),
                new HZVerifier(),
                new CP1252Verifier(),
                new UCS2BEVerifier(),
                new UCS2LEVerifier()
            };
            break;

        case NChardetLanguage.JAPANESE:
            mVerifier = new Verifier[]
            {
                new UTF8Verifier(),
                new SJISVerifier(),
                new EUCJPVerifier(),
                new ISO2022JPVerifier(),
                new CP1252Verifier(),
                new UCS2BEVerifier(),
                new UCS2LEVerifier()
            };
            break;

        case NChardetLanguage.CHINESE:
            mVerifier = new Verifier[]
            {
                new UTF8Verifier(),
                new GB2312Verifier(),
                new GB18030Verifier(),
                new BIG5Verifier(),
                new ISO2022CNVerifier(),
                new HZVerifier(),
                new EUCTWVerifier(),
                new CP1252Verifier(),
                new UCS2BEVerifier(),
                new UCS2LEVerifier()
            };
            mStatisticsData = new EUCStatistics[]
            {
                null,
                new GB2312Statistics(),
                null,
                new Big5Statistics(),
                null,
                null,
                new EUCTWStatistics(),
                null,
                null,
                null
            };
            break;

        case NChardetLanguage.ALL:
            mVerifier = new Verifier[]
            {
                new UTF8Verifier(),
                new SJISVerifier(),
                new EUCJPVerifier(),
                new ISO2022JPVerifier(),
                new EUCKRVerifier(),
                new ISO2022KRVerifier(),
                new BIG5Verifier(),
                new EUCTWVerifier(),
                new GB2312Verifier(),
                new GB18030Verifier(),
                new ISO2022CNVerifier(),
                new HZVerifier(),
                new CP1252Verifier(),
                new UCS2BEVerifier(),
                new UCS2LEVerifier()
            };
            mStatisticsData = new EUCStatistics[]
            {
                null,
                null,
                new EUCJPStatistics(),
                null,
                new EUCKRStatistics(),
                null,
                new Big5Statistics(),
                new EUCTWStatistics(),
                new GB2312Statistics(),
                null,
                null,
                null,
                null,
                null,
                null
            };
            break;
    }

    // Sampling is enabled only when a statistics table was assigned above.
    mClassRunSampler = (mStatisticsData != null);
    mClassItems = mVerifier.Length;
}
/// <summary>
/// Creates a detector for the given language: sets up the verifier tables
/// through <c>initVerifiers</c>, then calls <c>Reset</c>.
/// </summary>
/// <param name="langFlag">The language passed to <c>initVerifiers</c>.</param>
public PSMDetector(NChardetLanguage langFlag)
{
    // Verifier tables are assigned first, then Reset() runs.
    initVerifiers(langFlag);
    Reset();
}
/// <summary>
/// Selects the verifier table and the statistics table for a language.
/// </summary>
/// <remarks>
/// Only <c>CHINESE</c> has a dedicated table here. Every other value, including values
/// outside the range [0, <c>NO_OF_LANGUAGES</c>), uses the <c>ALL</c> table, so
/// <c>mVerifier</c> is never left null. Each statistics table has the same length as its
/// verifier table, with non-null entries placed at the index of the matching verifier.
/// </remarks>
/// <param name="currVerSet">The requested language.</param>
protected void initVerifiers(NChardetLanguage currVerSet)
{
    NChardetLanguage currVerifierSet;

    if (currVerSet >= 0 && currVerSet < NChardetLanguage.NO_OF_LANGUAGES)
    {
        currVerifierSet = currVerSet;
    }
    else
    {
        currVerifierSet = NChardetLanguage.ALL;
    }

    mVerifier = null;
    mStatisticsData = null;

    if (currVerifierSet == NChardetLanguage.CHINESE)
    {
        mVerifier = new Verifier[]
        {
            new UTF8Verifier(),
            new GB2312Verifier(),
            new UCS2BEVerifier(),
            new UCS2LEVerifier()
        };
        mStatisticsData = new EUCStatistics[]
        {
            null,
            new GB2312Statistics(),
            null,
            null
        };
    }
    else
    {
        // ALL, plus any in-range language without its own table. Previously such a
        // value matched no branch and mVerifier.Length below threw NullReferenceException.
        mVerifier = new Verifier[]
        {
            new UTF8Verifier(),
            new SJISVerifier(),
            new EUCJPVerifier(),
            new ISO2022JPVerifier(),
            new EUCKRVerifier(),
            new ISO2022KRVerifier(),
            new BIG5Verifier(),
            new EUCTWVerifier(),
            new GB2312Verifier(),
            new GB18030Verifier(),
            new ISO2022CNVerifier(),
            new HZVerifier(),
            new CP1252Verifier(),
            new UCS2BEVerifier(),
            new UCS2LEVerifier()
        };
        mStatisticsData = new EUCStatistics[]
        {
            null,
            null,
            new EUCJPStatistics(),
            null,
            new EUCKRStatistics(),
            null,
            new Big5Statistics(),
            new EUCTWStatistics(),
            new GB2312Statistics(),
            null,
            null,
            null,
            null,
            null,
            null
        };
    }

    mClassRunSampler = (mStatisticsData != null);
    mClassItems = mVerifier.Length;
}