Exemplo n.º 1
0
	/// <summary>Runs the charset detector over a stream and maps its guesses to code pages.</summary>
	/// <remarks>
	/// The stream is consumed in 1024-byte chunks until it is exhausted or the detector's
	/// <c>DoIt</c> call returns true. Afterwards the stream is repositioned to its beginning
	/// with <c>Seek(0, SeekOrigin.Begin)</c>.
	/// </remarks>
	/// <param name="stream">The stream to sample.</param>
	/// <returns>
	/// The result of <c>GetCodePages</c> for the detected charsets, or a zero-length array
	/// when the detector's first reported charset is "nomatch".
	/// </returns>
	internal static int[] DetectCodePages (Stream stream){
		const int BUFFERSIZE = 1024;
		byte[] chunk = new byte[BUFFERSIZE];

		nsDetector charsetDetector = new nsDetector(nsPSMDetector.ALL);
		charsetDetector.Init(null);

		// Feed chunk after chunk; stop on end of stream or once DoIt returns true.
		bool detectorSatisfied = false;
		do {
			int bytesRead = stream.Read(chunk, 0, chunk.Length);
			if (bytesRead == 0)
				break;

			detectorSatisfied = charsetDetector.DoIt(chunk, bytesRead, false);
		} while (!detectorSatisfied);

		charsetDetector.Done();
		stream.Seek(0, SeekOrigin.Begin);

		string[] candidates = charsetDetector.getProbableCharsets();
		VerboseConsole.WriteLine(DetectedEncodingsToString(candidates));

		/* "nomatch" in the first slot means no encoding was detected */
		bool nothingDetected = candidates[0] == "nomatch";
		return nothingDetected ? new int[0] : GetCodePages(candidates);
	}
Exemplo n.º 2
0
		public static string detEncoding = string.Empty;// Automatically detected file type (charset name found for the source file)
		/// <summary>
		/// Re-encodes a text file: reads <paramref name="srcFullName"/> with the source
		/// encoding and writes the text to <paramref name="dstFullName"/> in the destination
		/// encoding.
		/// </summary>
		/// <param name="srcFullName">Path of the file to read.</param>
		/// <param name="dstFullName">Path of the file to write.</param>
		/// <param name="dstEncoding">Encoding name for the output, resolved with <c>Encoding.GetEncoding</c>.</param>
		/// <param name="IsAutoDet">
		/// When true the source encoding is detected from the file contents and the detected
		/// name is stored in <c>detEncoding</c>; when false <paramref name="srcEncoding"/> is used as is.
		/// </param>
		/// <param name="srcEncoding">
		/// Encoding name of the input; also the fallback when detection yields no candidate.
		/// </param>
		public void TransEncoding(string srcFullName, string dstFullName,
			string dstEncoding = "utf8",
			bool IsAutoDet = true, string srcEncoding = "gb2312")
		{
			Encoding Edst = Encoding.GetEncoding(dstEncoding);
			Encoding Esrc = Encoding.GetEncoding(srcEncoding);

			if (IsAutoDet)
			{
				// --- Detect the source encoding ---
				nsDetector det = new nsDetector();
				Notifier not = new Notifier();
				det.Init(not);

				byte[] buf = new byte[1024];
				bool done = false;
				bool isAscii = true;

				// 'using' guarantees the file handle is released even if reading or
				// the detector throws part-way through.
				using (FileStream fs = File.OpenRead(srcFullName))
				{
					int len;
					while ((len = fs.Read(buf, 0, buf.Length)) > 0)
					{
						// Check if the stream is only ascii.
						if (isAscii)
							isAscii = det.isAscii(buf, len);

						// DoIt if non-ascii and not done yet.
						if (!isAscii && !done)
							done = det.DoIt(buf, len, false);
					}
					det.DataEnd();
				}

				// NOTE(review): 'found' is declared outside this method; presumably the
				// Notifier sets it when the detector reports a definite match. Confirm,
				// and confirm whether it must be reset between calls.
				if (isAscii)
				{
					found = true;
					detEncoding = "ASCII";
				}

				if (!found)
				{
					string[] prob = det.getProbableCharsets();

					// The detector signals "nothing detected" with the single entry
					// "nomatch" rather than an empty array, so a length check alone
					// never reaches the fallback and GetEncoding("nomatch") would throw.
					bool hasGuess = prob != null && prob.Length > 0 && prob[0] != "nomatch";
					detEncoding = hasGuess ? prob[0] : srcEncoding;
				}

				Esrc = Encoding.GetEncoding(detEncoding);
			}

			// --- Convert the encoding ---
			string strAll = File.ReadAllText(srcFullName, Esrc);
			File.WriteAllText(dstFullName, strAll, Edst);
		}
Exemplo n.º 3
0
        /// <summary>
        /// Detects the text encoding of a file by feeding its bytes to the charset detector.
        /// </summary>
        /// <param name="filePath">Path of the file to inspect.</param>
        /// <returns>
        /// <c>Encoding.Unicode</c> when the file starts with the bytes FF FE,
        /// <c>Encoding.ASCII</c> when every chunk is reported as ASCII, otherwise the
        /// encoding resolved from <c>encodingName</c> via <c>Encoding.GetEncoding</c>.
        /// </returns>
        public static Encoding DetectEncoding(string filePath)
        {
            Encoding encoding = null;
            nsDetector det;
            bool isAscii = true;

            // 'using' releases the file handle on every exit path, including
            // exceptions thrown by Read or by the detector.
            using (FileStream fs = new FileStream(filePath, FileMode.Open, FileAccess.Read))
            {
                // NOTE(review): 2 is a language hint for the detector; presumably
                // "Chinese" - confirm against the nsPSMDetector constants.
                det = new nsDetector(2);
                Notifier not = new Notifier();
                det.Init(not);

                bool done = false;

                byte[] buf = new byte[1024];
                int len = fs.Read(buf, 0, buf.Length);

                //For some reason NCharDet can't detect Unicode.
                //Manually detect the UTF-16 little-endian byte order mark here.
                if (len >= 2 && buf[0] == 0xFF && buf[1] == 0xFE)
                {
                    det.DataEnd();
                    return Encoding.Unicode;
                }

                while (len > 0)
                {
                    // Check if the stream is only ascii.
                    if (isAscii)
                        isAscii = det.isAscii(buf, len);

                    // DoIt if non-ascii and not done yet.
                    if (!isAscii && !done)
                        done = det.DoIt(buf, len, false);

                    len = fs.Read(buf, 0, buf.Length);
                }
            }
            det.DataEnd();

            // NOTE(review): 'encodingFound' and 'encodingName' are declared outside this
            // method; presumably the Notifier fills them in on a definite match - confirm.
            if (isAscii)
            {
                encodingFound = true;
                encoding = Encoding.ASCII;
            }

            if (!encodingFound)
            {
                // NOTE(review): if the detector has no candidate, the first entry may be a
                // placeholder name that GetEncoding below rejects - confirm desired handling.
                String[] prob = det.getProbableCharsets();
                encodingName = prob[0];
            }

            if (encoding == null)
            {
                encoding = Encoding.GetEncoding(encodingName);
            }

            return encoding;
        }
Exemplo n.º 4
0
        /// <summary>
        /// Command-line entry point: downloads the given URL and prints the charset, or the
        /// list of probable charsets, that the detector reports for the response body.
        /// </summary>
        /// <param name="argv">
        /// <c>argv[0]</c> is the URL to fetch. The optional <c>argv[1]</c> is an integer
        /// language hint handed to the detector (see the usage text printed below).
        /// </param>
        public static void Main(String[] argv)
        {
            if (argv.Length != 1 && argv.Length != 2)
            {
                Console.Out.WriteLine(
                      "Usage: HtmlCharsetDetector <url> [<languageHint>]");

                Console.Out.WriteLine("");
                Console.Out.WriteLine("Where <url> is http://...");
                Console.Out.WriteLine("For optional <languageHint>. Use following...");
                Console.Out.WriteLine("		1 => Japanese");
                Console.Out.WriteLine("		2 => Chinese");
                Console.Out.WriteLine("		3 => Simplified Chinese");
                Console.Out.WriteLine("		4 => Traditional Chinese");
                Console.Out.WriteLine("		5 => Korean");
                Console.Out.WriteLine("		6 => Dont know (default)");

                return;
            }

            // Initialize the detector with the language hint, if one was given.
            int lang = (argv.Length == 2) ? Convert.ToInt32(argv[1])
                             : nsPSMDetector.ALL;
            nsDetector det = new nsDetector(lang);

            // Set an observer; its Notify() will be called when a matching charset is found.
            Notifier not = new Notifier();
            det.Init(not);

            Uri url = new Uri(argv[0]);
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url.AbsoluteUri);

            System.Net.HttpWebResponse imp;
            try
            {
                imp = (HttpWebResponse)req.GetResponse();
            }
            catch (System.Net.WebException we)
            {
                //remote url not found, 404 ?
                Console.Out.WriteLine("Web Request Error " + we.Message);

                // Without a response there is nothing to analyse. Falling through
                // would dereference a null response below.
                return;
            }

            bool isAscii = true;

            // Dispose both the response and its body stream when done, and fetch the
            // stream once instead of on every loop iteration.
            using (imp)
            using (System.IO.Stream body = imp.GetResponseStream())
            {
                byte[] buf = new byte[1024];
                bool done = false;
                int len;

                while ((len = body.Read(buf, 0, buf.Length)) > 0)
                {
                    // Check if the stream is only ascii.
                    if (isAscii)
                        isAscii = det.isAscii(buf, len);

                    // DoIt if non-ascii and not done yet.
                    if (!isAscii && !done)
                        done = det.DoIt(buf, len, false);
                }
            }
            det.DataEnd();

            // NOTE(review): 'found' is declared outside this method; presumably the
            // Notifier sets it when it prints a definite match - confirm.
            if (isAscii)
            {
                Console.Out.WriteLine("CHARSET = ASCII");
                found = true;
            }

            if (!found)
            {
                String[] prob = det.getProbableCharsets();
                for (int i = 0; i < prob.Length; i++)
                {
                    Console.Out.WriteLine("Probable Charset = " + prob[i]);
                }
            }
        }
Exemplo n.º 5
0
        /// <summary>
        /// Program entry point. Fetches the URL named on the command line, streams the
        /// response through the charset detector and writes the outcome to the console.
        /// </summary>
        /// <param name="argv">
        /// One or two arguments: the URL to download, optionally followed by an integer
        /// language hint for the detector (the accepted values are listed in the usage text).
        /// </param>
        public static void Main(String[] argv)
        {
            if (argv.Length != 1 && argv.Length != 2)
            {
                Console.Out.WriteLine(
                    "Usage: HtmlCharsetDetector <url> [<languageHint>]");

                Console.Out.WriteLine("");
                Console.Out.WriteLine("Where <url> is http://...");
                Console.Out.WriteLine("For optional <languageHint>. Use following...");
                Console.Out.WriteLine("		1 => Japanese");
                Console.Out.WriteLine("		2 => Chinese");
                Console.Out.WriteLine("		3 => Simplified Chinese");
                Console.Out.WriteLine("		4 => Traditional Chinese");
                Console.Out.WriteLine("		5 => Korean");
                Console.Out.WriteLine("		6 => Dont know (default)");

                return;
            }


            // Initialize the detector, using the language hint when supplied.
            int lang = (argv.Length == 2) ? Convert.ToInt32(argv[1])
                             : nsPSMDetector.ALL;
            nsDetector det = new nsDetector(lang);

            // Register an observer; its Notify() is called when a matching charset is found.
            Notifier not = new Notifier();

            det.Init(not);

            Uri            url = new Uri(argv[0]);
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url.AbsoluteUri);

            System.Net.HttpWebResponse imp;
            try
            {
                imp = (HttpWebResponse)req.GetResponse();
            }
            catch (System.Net.WebException we)
            {
                //remote url not found, 404 ?
                Console.Out.WriteLine("Web Request Error " + we.Message);

                // Stop here: there is no response to read, and continuing would hit
                // a null reference when the response stream is requested.
                return;
            }

            bool isAscii = true;

            // The response and its stream are disposable; release them deterministically.
            // The stream is obtained once rather than re-requested for every read.
            using (imp)
            using (System.IO.Stream responseStream = imp.GetResponseStream())
            {
                byte[] buf  = new byte[1024];
                bool   done = false;
                int    len  = responseStream.Read(buf, 0, buf.Length);

                while (len > 0)
                {
                    // Check if the stream is only ascii.
                    if (isAscii)
                    {
                        isAscii = det.isAscii(buf, len);
                    }

                    // DoIt if non-ascii and not done yet.
                    if (!isAscii && !done)
                    {
                        done = det.DoIt(buf, len, false);
                    }

                    len = responseStream.Read(buf, 0, buf.Length);
                }
            }
            det.DataEnd();

            // NOTE(review): 'found' lives outside this method; presumably the Notifier
            // flips it when a definite charset is reported - confirm.
            if (isAscii)
            {
                Console.Out.WriteLine("CHARSET = ASCII");
                found = true;
            }

            if (!found)
            {
                String[] prob = det.getProbableCharsets();
                for (int i = 0; i < prob.Length; i++)
                {
                    Console.Out.WriteLine("Probable Charset = " + prob[i]);
                }
            }
        }