示例#1
0
        /// <summary>
        /// Detects the text encoding of a file by loading its whole content into memory
        /// and running it through the NChardet detector.
        /// </summary>
        /// <param name="filename">Path of the file to inspect.</param>
        /// <returns>
        /// <see cref="System.Text.Encoding.ASCII"/> when the detector classifies the content as ASCII;
        /// the encoding named by the detector when it reports one that the runtime supports;
        /// otherwise <see cref="System.Text.Encoding.Default"/> (also returned for an empty file).
        /// </returns>
        /// <remarks>
        /// Exceptions raised while opening or reading the file are not caught here and
        /// propagate to the caller.
        /// </remarks>
        public System.Text.Encoding GetEncodingOfFile(string filename)
        {
            byte[] buf;
            int    count = 0;

            using (System.IO.FileStream fs = new System.IO.FileStream(filename, System.IO.FileMode.Open, System.IO.FileAccess.Read, System.IO.FileShare.Read))
            {
                buf = new byte[fs.Length];

                // Stream.Read may return fewer bytes than requested, so keep reading until
                // the buffer is full or the end of the file is reached.
                while (count < buf.Length)
                {
                    int read = fs.Read(buf, count, buf.Length - count);
                    if (read == 0)
                    {
                        break;
                    }
                    count += read;
                }
            }

            // Nothing to analyse: fall back to the system default encoding.
            if (count < 1)
            {
                return System.Text.Encoding.Default;
            }

            NChardet.Detector        detect = new NChardet.Detector();
            CharsetDetectionObserver cdo    = new CharsetDetectionObserver();

            detect.Init(cdo);
            if (detect.isAscii(buf, count))
            {
                return System.Text.Encoding.ASCII;
            }

            // The last argument is "oDontFeedMe" in jchardet, which NChardet is ported from.
            // NOTE(review): passing true there is understood to make DoIt return without
            // examining the buffer, so false is passed to actually feed the data - confirm
            // against the NChardet version in use.
            detect.DoIt(buf, count, false);
            detect.DataEnd();

            if (string.IsNullOrEmpty(cdo.Charset))
            {
                return System.Text.Encoding.Default;
            }

            try
            {
                return System.Text.Encoding.GetEncoding(cdo.Charset);
            }
            catch (System.ArgumentException)
            {
                // The detector reported a charset name this runtime does not know or support;
                // treat it like an undetermined result instead of failing the whole call.
                return System.Text.Encoding.Default;
            }
        }
示例#2
0
        /// <summary>
        /// Reads <paramref name="stream"/> in 1024-byte chunks, feeds them to the NChardet
        /// detector and returns the name of the charset it settles on.
        /// </summary>
        /// <param name="stream">
        /// Stream to analyse. It is disposed by this method before it returns.
        /// </param>
        /// <returns>
        /// The body name of <see cref="Encoding.ASCII"/> when every chunk read was classified as ASCII;
        /// the charset reported to the observer when there is one; otherwise the first probable
        /// charset (lower-cased) if it is "gb2312" or "utf-8"; otherwise the body name of
        /// <see cref="Encoding.Default"/>.
        /// </returns>
        public static string DetectStream(Stream stream)
        {
            // Language hint handed to the detector.
            // NOTE(review): 2 presumably selects Chinese in NChardet - confirm.
            int lang = 2;

            NChardet.Detector det = new NChardet.Detector(lang);

            CharsetDetectionObserver cdo = new CharsetDetectionObserver();

            det.Init(cdo);

            byte[] buf     = new byte[1024];
            bool   done    = false;
            bool   isAscii = true;
            int    len;

            using (stream)
            {
                while ((len = stream.Read(buf, 0, buf.Length)) != 0)
                {
                    // Keep checking for pure ASCII until a chunk fails the check.
                    if (isAscii)
                    {
                        isAscii = det.isAscii(buf, len);
                    }

                    // Not ASCII and the charset is not settled yet: keep feeding the detector.
                    if (!isAscii && !done)
                    {
                        done = det.DoIt(buf, len, false);
                    }

                    // done is only ever set once isAscii is false, so from here on no
                    // iteration would do any work; stop reading the rest of the stream.
                    if (done)
                    {
                        break;
                    }
                }
            }

            // Signal the end of the data. Per the original author's note, if the engine
            // believes it has identified the encoding, it calls the observer's Notify
            // method at this point.
            det.DataEnd();

            string charset = Encoding.Default.BodyName;

            if (isAscii)
            {
                charset = Encoding.ASCII.BodyName;
            }
            else if (!string.IsNullOrEmpty(cdo.Charset))
            {
                charset = cdo.Charset;
            }
            else
            {
                string[] probable = det.getProbableCharsets();
                if (probable != null && probable.Length >= 1)
                {
                    // Charset names are identifiers, not linguistic text: lower-case them
                    // independently of the current culture.
                    string probableCharset = probable[0].ToLowerInvariant();
                    if (probableCharset == "gb2312" || probableCharset == "utf-8")
                    {
                        charset = probableCharset;
                    }
                }
            }
            return charset;
        }