Ejemplo n.º 1
0
        /// <summary>Detects the possible code pages by reading a specified stream.</summary>
        /// <param name="stream">The stream to inspect. It is read from its current position and must support seeking, because it is rewound afterwards.</param>
        /// <remarks>After reading, the stream position is set to the beginning of the stream (not to the position it had on entry).</remarks>
        /// <returns>The detected code pages, which will have zero length if none was detected.</returns>
        internal static int[] DetectCodePages(Stream stream)
        {
            const int  BUFFERSIZE = 1024;
            nsDetector detector   = new nsDetector(nsPSMDetector.ALL);

            detector.Init(null);

            byte[] buffer   = new byte[BUFFERSIZE];
            bool   finished = false;

            /* Feed the detector chunk by chunk until it reports it is done or the stream is exhausted */
            while (!finished)
            {
                int readLength = stream.Read(buffer, 0, buffer.Length);
                if (readLength == 0)
                {
                    break;
                }

                finished = detector.DoIt(buffer, readLength, false);
            }
            detector.Done();
            stream.Seek(0, SeekOrigin.Begin);

            /* Normalize a missing result to an empty array so logging and indexing below cannot throw */
            string[] detectedEncodings = detector.getProbableCharsets() ?? new string[0];
            Logger.Info("[FileInputOutput] Detected encodings: {0}", String.Join(", ", detectedEncodings));

            /* Check if no encoding was detected (empty result or the "nomatch" sentinel) */
            if (detectedEncodings.Length == 0 || detectedEncodings[0] == "nomatch")
            {
                return(new int[0]);
            }

            return(GetCodePages(detectedEncodings));
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Guesses the charset of a file from its first 1024 bytes and returns the value left in
        /// <c>CharsetDetector.DetectedCharset</c>.
        /// </summary>
        /// <param name="filePath">Path of the file to inspect.</param>
        /// <returns>
        /// "ASCII" when the sampled bytes are pure ASCII, "GB2312" when the file text contains an explicit
        /// gb2312 HTML meta declaration, otherwise whatever <c>CharsetDetector.DetectedCharset</c> holds after
        /// detection (it starts out as "GB2312").
        /// </returns>
        public static string DetectChineseCharset(string filePath)
        {
            // Default used when the detector does not report anything else.
            CharsetDetector.DetectedCharset = "GB2312";

            // NOTE(review): 3 presumably selects a Chinese-specific detector set - confirm against nsPSMDetector.
            nsDetector detector = new nsDetector(3);
            Notifier   observer = new Notifier();

            detector.Init(observer);

            byte[] buffer = new byte[1024];
            int    bytesRead;

            // Dispose the stream deterministically; the original left the file handle open.
            using (FileStream input = File.OpenRead(filePath))
            {
                bytesRead = input.Read(buffer, 0, buffer.Length);
            }

            bool isAscii = detector.isAscii(buffer, bytesRead);

            if (!isAscii)
            {
                detector.DoIt(buffer, bytesRead, false);
            }
            detector.DataEnd();

            // Set after DataEnd() so that a pure-ASCII sample always wins over anything assigned during detection.
            if (isAscii)
            {
                CharsetDetector.DetectedCharset = "ASCII";
            }

            // An explicit gb2312 meta declaration in the file overrides the detection result.
            if (File.ReadAllText(filePath).Contains("CONTENT=\"text/html; charset=gb2312\""))
            {
                CharsetDetector.DetectedCharset = "GB2312";
            }
            return(CharsetDetector.DetectedCharset);
        }
Ejemplo n.º 3
0
 // Runs charset detection over the first `length` bytes of `source` and returns
 // the resulting encoding, or null when the result is unusable.
 //
 // This object is handed to the detector via Init(this); `returnValue` is a
 // field declared outside this method.
 // NOTE(review): presumably the detector assigns `returnValue` through a
 // callback on this object - confirm against the enclosing class.
 public Encoding sniff()
 {
     nsDetector charsetDetector = new nsDetector(nsPSMDetector.ALL);
     charsetDetector.Init(this);
     charsetDetector.DoIt(source, length, false);
     charsetDetector.DataEnd();

     // Reject a missing result, WINDOWS1252, and anything that is not an ASCII superset.
     if (returnValue == null || returnValue == Encoding.WINDOWS1252 || !returnValue.isAsciiSuperset()) {
         return null;
     }
     return returnValue;
 }
Ejemplo n.º 4
0
        /// <summary>Detects the text encoding of a file by feeding its content to an <c>nsDetector</c>.</summary>
        /// <param name="filePath">Path of the file to inspect.</param>
        /// <returns>
        /// <see cref="Encoding.Unicode"/> when the file starts with the bytes FF FE,
        /// <see cref="Encoding.ASCII"/> when every byte read is ASCII, otherwise the encoding
        /// obtained from <c>Encoding.GetEncoding(encodingName)</c>.
        /// </returns>
        /// <remarks>
        /// <c>encodingFound</c> and <c>encodingName</c> are declared outside this method.
        /// NOTE(review): presumably the <c>Notifier</c> observer sets them when the detector reports a charset,
        /// and <c>encodingFound</c> is never reset here - confirm against the enclosing class.
        /// </remarks>
        public static Encoding DetectEncoding(string filePath)
        {
            Encoding   encoding = null;
            nsDetector det;
            var        isAscii  = true;

            // The using block guarantees the file handle is released even if reading or detection throws.
            using (var fs = new FileStream(filePath, FileMode.Open, FileAccess.Read))
            {
                // NOTE(review): 2 presumably selects a language-specific detector set - confirm against nsPSMDetector.
                det = new nsDetector(2);
                det.Init(new Notifier());

                var done = false;

                var buf = new byte[1024];
                int len = fs.Read(buf, 0, buf.Length);

                //For some reason NCharDet can't detect Unicode.
                //Manual detect Unicode here.
                if (len >= 2 && buf[0] == 0xFF && buf[1] == 0xFE)
                {
                    det.DataEnd();
                    return(Encoding.Unicode);
                }

                while (len > 0)
                {
                    // Check if the stream is only ascii.
                    if (isAscii)
                    {
                        isAscii = det.isAscii(buf, len);
                    }

                    // DoIt if non-ascii and not done yet.
                    if (!isAscii && !done)
                    {
                        done = det.DoIt(buf, len, false);
                    }

                    len = fs.Read(buf, 0, buf.Length);
                }
            }
            det.DataEnd();

            if (isAscii)
            {
                encodingFound = true;
                encoding      = Encoding.ASCII;
            }

            // No charset was reported: fall back to the detector's most probable guess.
            if (!encodingFound)
            {
                string[] prob = det.getProbableCharsets();
                encodingName = prob[0];
            }

            if (encoding == null)
            {
                encoding = Encoding.GetEncoding(encodingName);
            }

            return(encoding);
        }
Ejemplo n.º 5
0
        public static string detEncoding = string.Empty;        // Automatically detected file type (encoding name)

        /// <summary>
        /// Re-encodes a text file: reads <paramref name="srcFullName"/> using the source encoding and writes
        /// the text to <paramref name="dstFullName"/> using <paramref name="dstEncoding"/>.
        /// </summary>
        /// <param name="srcFullName">Path of the file to read.</param>
        /// <param name="dstFullName">Path of the file to write.</param>
        /// <param name="dstEncoding">Name of the target encoding, resolved with <c>Encoding.GetEncoding</c>.</param>
        /// <param name="IsAutoDet">
        /// When true, the source encoding is detected from the file content and stored in
        /// <see cref="detEncoding"/>; when false, <paramref name="srcEncoding"/> is used as given.
        /// </param>
        /// <param name="srcEncoding">
        /// Name of the source encoding. It is always resolved up front, and it is also the fallback when
        /// auto-detection yields no usable candidate.
        /// </param>
        /// <remarks>
        /// <c>found</c> is declared outside this method.
        /// NOTE(review): presumably the <c>Notifier</c> observer sets it (and <see cref="detEncoding"/>) when the
        /// detector reports a charset - confirm against the enclosing class.
        /// </remarks>
        public void TransEncoding(string srcFullName, string dstFullName,
                                  string dstEncoding = "utf8",
                                  bool IsAutoDet     = true, string srcEncoding = "gb2312")
        {
            Encoding Edst = Encoding.GetEncoding(dstEncoding);
            Encoding Esrc = Encoding.GetEncoding(srcEncoding);

            if (IsAutoDet)
            {
                #region Detect encoding
                nsDetector det = new nsDetector();
                Notifier   not = new Notifier();
                det.Init(not);

                byte[] buf = new byte[1024];
                int    len;
                bool   done    = false;
                bool   isAscii = true;

                // The using block releases the file handle even if reading or detection throws.
                using (FileStream fs = File.OpenRead(srcFullName))
                {
                    len = fs.Read(buf, 0, buf.Length);
                    while (len > 0)
                    {
                        // Check if the stream is only ascii.
                        if (isAscii)
                        {
                            isAscii = det.isAscii(buf, len);
                        }

                        // DoIt if non-ascii and not done yet.
                        if (!isAscii && !done)
                        {
                            done = det.DoIt(buf, len, false);
                        }

                        len = fs.Read(buf, 0, buf.Length);
                    }
                    det.DataEnd();
                }

                if (isAscii)
                {
                    found       = true;
                    detEncoding = "ASCII";
                }

                if (!found)
                {
                    string[] prob = det.getProbableCharsets();

                    // "nomatch" is the detector's no-result sentinel; passing it to Encoding.GetEncoding
                    // would throw, so treat it like an empty result and fall back to the caller's encoding.
                    bool hasCandidate = prob != null && prob.Length > 0 && prob[0] != "nomatch";
                    detEncoding = hasCandidate ? prob[0] : srcEncoding;
                }
                #endregion

                Esrc = Encoding.GetEncoding(detEncoding);
            }

            #region Convert encoding
            string strAll = File.ReadAllText(srcFullName, Esrc);
            File.WriteAllText(dstFullName, strAll, Edst);
            #endregion
        }