Пример #1
0
        /// <summary>
        /// Command-line entry point: downloads the resource at the given URL, feeds its
        /// bytes to an <c>nsDetector</c>, and prints either "CHARSET = ASCII" or, when
        /// <c>found</c> is still false afterwards, the detector's probable charsets.
        /// </summary>
        /// <param name="argv">
        /// <c>argv[0]</c> is the URL to fetch; the optional <c>argv[1]</c> is an integer
        /// language hint that is passed to the <c>nsDetector</c> constructor.
        /// </param>
        public static void Main(String[] argv)
        {
            if (argv.Length != 1 && argv.Length != 2)
            {
                Console.Out.WriteLine(
                      "Usage: HtmlCharsetDetector <url> [<languageHint>]");

                Console.Out.WriteLine("");
                Console.Out.WriteLine("Where <url> is http://...");
                Console.Out.WriteLine("For optional <languageHint>. Use following...");
                Console.Out.WriteLine("		1 => Japanese");
                Console.Out.WriteLine("		2 => Chinese");
                Console.Out.WriteLine("		3 => Simplified Chinese");
                Console.Out.WriteLine("		4 => Traditional Chinese");
                Console.Out.WriteLine("		5 => Korean");
                Console.Out.WriteLine("		6 => Dont know (default)");

                return;
            }

            // Initialize the detector; without an explicit hint use nsPSMDetector.ALL.
            int lang = (argv.Length == 2) ? Convert.ToInt32(argv[1])
                             : nsPSMDetector.ALL;
            nsDetector det = new nsDetector(lang);

            // Register the observer. The Java original passed an anonymous
            // nsICharsetDetectionObserver here; C# has no anonymous interface
            // implementations, so a named Notifier class is used instead.
            // NOTE(review): Notifier presumably sets `found` and prints the charset,
            // as the Java observer did - confirm against its definition.
            Notifier not = new Notifier();
            det.Init(not);

            Uri url = new Uri(argv[0]);
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url.AbsoluteUri);

            System.Net.HttpWebResponse imp;
            try
            {
                imp = (HttpWebResponse)req.GetResponse();
            }
            catch (System.Net.WebException we)
            {
                // Remote URL not reachable (404, DNS failure, ...).
                Console.Out.WriteLine("Web Request Error " + we.Message);

                // There is no response to analyse; continuing would dereference null.
                return;
            }

            bool done = false;
            bool isAscii = true;

            // Dispose the response and its stream even if detection throws.
            using (imp)
            using (System.IO.Stream body = imp.GetResponseStream())
            {
                byte[] buf = new byte[1024];
                int len = body.Read(buf, 0, buf.Length);

                // Read returns 0 at end of stream.
                while (len > 0)
                {
                    // Check if the stream is only ascii.
                    if (isAscii)
                    {
                        isAscii = det.isAscii(buf, len);
                    }

                    // DoIt if non-ascii and not done yet.
                    if (!isAscii && !done)
                    {
                        done = det.DoIt(buf, len, false);
                    }

                    len = body.Read(buf, 0, buf.Length);
                }
            }
            det.DataEnd();

            if (isAscii)
            {
                Console.Out.WriteLine("CHARSET = ASCII");
                found = true;
            }

            // Nothing was confirmed: list the detector's candidates instead.
            if (!found)
            {
                String[] prob = det.getProbableCharsets();
                for (int i = 0; i < prob.Length; i++)
                {
                    Console.Out.WriteLine("Probable Charset = " + prob[i]);
                }
            }
        }
Пример #2
0
        /// <summary>
        /// Detects the text encoding of the file at <paramref name="filePath"/>.
        /// </summary>
        /// <remarks>
        /// A leading FF FE byte pair is reported as <see cref="Encoding.Unicode"/> without
        /// consulting the detector. Otherwise the file is streamed through an
        /// <c>nsDetector</c>; pure-ASCII content yields <see cref="Encoding.ASCII"/>, and
        /// anything else is resolved from <c>encodingName</c> via
        /// <see cref="Encoding.GetEncoding(string)"/>.
        /// </remarks>
        /// <param name="filePath">Path of the file to inspect.</param>
        /// <returns>The detected encoding.</returns>
        public static Encoding DetectEncoding(string filePath)
        {
            // encodingFound lives outside this method and is only ever set to true here,
            // so without a reset the outcome of an earlier call would suppress the
            // probable-charset fallback below.
            // NOTE(review): assumes Notifier sets encodingFound/encodingName when the
            // detector reports a match - confirm against its definition.
            encodingFound = false;

            Encoding encoding = null;
            bool done = false;
            bool isAscii = true;
            nsDetector det;

            // The using block guarantees the file handle is released even if reading
            // or detection throws.
            using (FileStream fs = new FileStream(filePath, FileMode.Open, FileAccess.Read))
            {
                // NOTE(review): 2 is presumably a language hint for the detector -
                // confirm against the nsPSMDetector constants.
                det = new nsDetector(2);
                Notifier not = new Notifier();
                det.Init(not);

                byte[] buf = new byte[1024];
                int len = fs.Read(buf, 0, buf.Length);

                //For some reason NCharDet can't detect Unicode.
                //Manual detect Unicode here.
                if (len >= 2 && buf[0] == 0xFF && buf[1] == 0xFE)
                {
                    det.DataEnd();
                    return Encoding.Unicode;
                }

                // Read returns 0 at end of file.
                while (len > 0)
                {
                    // Check if the stream is only ascii.
                    if (isAscii)
                    {
                        isAscii = det.isAscii(buf, len);
                    }

                    // DoIt if non-ascii and not done yet.
                    if (!isAscii && !done)
                    {
                        done = det.DoIt(buf, len, false);
                    }

                    len = fs.Read(buf, 0, buf.Length);
                }
            }
            det.DataEnd();

            if (isAscii)
            {
                encodingFound = true;
                encoding = Encoding.ASCII;
            }

            // No confirmed match: fall back to the detector's first candidate.
            if (!encodingFound)
            {
                String[] prob = det.getProbableCharsets();
                encodingName = prob[0];
            }

            if (encoding == null)
            {
                encoding = Encoding.GetEncoding(encodingName);
            }

            return encoding;
        }
Пример #3
0
        /// <summary>
        /// Command-line entry point: fetches the given URL, streams the response body
        /// through an <c>nsDetector</c>, and prints "CHARSET = ASCII" for pure-ASCII
        /// content or, when <c>found</c> is still false afterwards, every probable charset.
        /// </summary>
        /// <param name="argv">
        /// <c>argv[0]</c> is the URL to fetch; the optional <c>argv[1]</c> is an integer
        /// language hint handed to the <c>nsDetector</c> constructor.
        /// </param>
        public static void Main(String[] argv)
        {
            if (argv.Length != 1 && argv.Length != 2)
            {
                Console.Out.WriteLine(
                    "Usage: HtmlCharsetDetector <url> [<languageHint>]");

                Console.Out.WriteLine("");
                Console.Out.WriteLine("Where <url> is http://...");
                Console.Out.WriteLine("For optional <languageHint>. Use following...");
                Console.Out.WriteLine("		1 => Japanese");
                Console.Out.WriteLine("		2 => Chinese");
                Console.Out.WriteLine("		3 => Simplified Chinese");
                Console.Out.WriteLine("		4 => Traditional Chinese");
                Console.Out.WriteLine("		5 => Korean");
                Console.Out.WriteLine("		6 => Dont know (default)");

                return;
            }

            // Initialize the detector; with no hint supplied use nsPSMDetector.ALL.
            int lang = (argv.Length == 2) ? Convert.ToInt32(argv[1])
                             : nsPSMDetector.ALL;
            nsDetector det = new nsDetector(lang);

            // Set an observer. The Java original used an anonymous
            // nsICharsetDetectionObserver; C# cannot implement an interface
            // anonymously, so the named Notifier class takes its place.
            // NOTE(review): Notifier presumably sets `found` and prints the charset,
            // as the Java observer did - confirm against its definition.
            Notifier not = new Notifier();
            det.Init(not);

            Uri            url = new Uri(argv[0]);
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url.AbsoluteUri);

            System.Net.HttpWebResponse imp;
            try
            {
                imp = (HttpWebResponse)req.GetResponse();
            }
            catch (System.Net.WebException we)
            {
                // Remote URL not reachable (404, DNS failure, ...).
                Console.Out.WriteLine("Web Request Error " + we.Message);

                // Without a response there is nothing to read; going on would
                // dereference a null response object.
                return;
            }

            bool done    = false;
            bool isAscii = true;

            // Release the connection and the stream even when detection throws.
            using (imp)
            using (System.IO.Stream body = imp.GetResponseStream())
            {
                byte[] buf = new byte[1024];
                int    len = body.Read(buf, 0, buf.Length);

                // Read returns 0 once the body is exhausted.
                while (len > 0)
                {
                    // Check if the stream is only ascii.
                    if (isAscii)
                    {
                        isAscii = det.isAscii(buf, len);
                    }

                    // DoIt if non-ascii and not done yet.
                    if (!isAscii && !done)
                    {
                        done = det.DoIt(buf, len, false);
                    }

                    len = body.Read(buf, 0, buf.Length);
                }
            }
            det.DataEnd();

            if (isAscii)
            {
                Console.Out.WriteLine("CHARSET = ASCII");
                found = true;
            }

            // No charset was confirmed: print the detector's candidates.
            if (!found)
            {
                String[] prob = det.getProbableCharsets();
                for (int i = 0; i < prob.Length; i++)
                {
                    Console.Out.WriteLine("Probable Charset = " + prob[i]);
                }
            }
        }