/// <summary>
/// Command-line entry point: downloads the resource at the given URL, feeds
/// its bytes to an <c>nsDetector</c> and prints the detected charset, or the
/// list of probable charsets when no definite match was reported.
/// </summary>
/// <param name="argv">
/// <c>argv[0]</c> is the URL to fetch. The optional <c>argv[1]</c> is a
/// numeric language hint handed to the <c>nsDetector</c> constructor; when it
/// is omitted <c>nsPSMDetector.ALL</c> is used.
/// </param>
public static void Main(String[] argv)
{
    if (argv.Length != 1 && argv.Length != 2)
    {
        Console.Out.WriteLine("Usage: HtmlCharsetDetector <url> [<languageHint>]");
        Console.Out.WriteLine("");
        Console.Out.WriteLine("Where <url> is http://...");
        Console.Out.WriteLine("For optional <languageHint>. Use following...");
        Console.Out.WriteLine(" 1 => Japanese");
        Console.Out.WriteLine(" 2 => Chinese");
        Console.Out.WriteLine(" 3 => Simplified Chinese");
        Console.Out.WriteLine(" 4 => Traditional Chinese");
        Console.Out.WriteLine(" 5 => Korean");
        Console.Out.WriteLine(" 6 => Dont know (default)");
        return;
    }

    // Initialize the detector with the requested language hint.
    int lang = (argv.Length == 2) ? Convert.ToInt32(argv[1]) : nsPSMDetector.ALL;
    nsDetector det = new nsDetector(lang);

    // Register an observer. The Java original used an anonymous inner class
    // whose Notify(charset) set `found` and printed "CHARSET = ..."; here a
    // named Notifier type is used instead.
    // NOTE(review): Notifier is declared elsewhere - presumably it does the same.
    Notifier not = new Notifier();
    det.Init(not);

    Uri url = new Uri(argv[0]);
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url.AbsoluteUri);

    HttpWebResponse imp;
    try
    {
        imp = (HttpWebResponse)req.GetResponse();
    }
    catch (System.Net.WebException we)
    {
        // Remote URL not reachable (404, DNS failure, ...). There is nothing
        // to analyse, so stop here instead of dereferencing a null response.
        Console.Out.WriteLine("Web Request Error " + we.Message);
        return;
    }

    byte[] buf = new byte[1024];
    bool done = false;
    bool isAscii = true;

    // Dispose both the response and its stream, even if detection throws.
    using (imp)
    using (System.IO.Stream body = imp.GetResponseStream())
    {
        int len = body.Read(buf, 0, buf.Length);

        // Once `done` is true the detector is never fed again and isAscii is
        // already false, so reading the rest of the response would be wasted.
        while (len > 0 && !done)
        {
            // Check if the stream is only ASCII so far.
            if (isAscii)
            {
                isAscii = det.isAscii(buf, len);
            }

            // Run the detector only on non-ASCII data.
            if (!isAscii)
            {
                done = det.DoIt(buf, len, false);
            }

            len = body.Read(buf, 0, buf.Length);
        }
    }

    det.DataEnd();

    if (isAscii)
    {
        Console.Out.WriteLine("CHARSET = ASCII");
        found = true;
    }

    // No definite answer was reported: list every candidate instead.
    if (!found)
    {
        String[] prob = det.getProbableCharsets();
        for (int i = 0; i < prob.Length; i++)
        {
            Console.Out.WriteLine("Probable Charset = " + prob[i]);
        }
    }
}
/// <summary>
/// Determines the text encoding of a file by checking for a UTF-16 byte order
/// mark and otherwise feeding the file's bytes to an <c>nsDetector</c>.
/// </summary>
/// <param name="filePath">Path of the file to inspect; opened read-only.</param>
/// <returns>
/// <see cref="Encoding.Unicode"/> or <see cref="Encoding.BigEndianUnicode"/>
/// when the file starts with the matching UTF-16 BOM,
/// <see cref="Encoding.ASCII"/> when every byte read is ASCII, otherwise the
/// encoding looked up from the detected (or first probable) charset name.
/// </returns>
/// <exception cref="ArgumentException">
/// Thrown by <see cref="Encoding.GetEncoding(string)"/> when the charset name
/// is not a supported encoding name.
/// </exception>
public static Encoding DetectEncoding(string filePath)
{
    // Forget the outcome of any earlier call so a stale flag cannot suppress
    // the probable-charset fallback below.
    // NOTE(review): encodingFound / encodingName are declared elsewhere and
    // are presumably set by Notifier when the detector reports a match.
    encodingFound = false;

    // 2 is the "Chinese" language hint in the usage text of the companion
    // command-line tool.
    nsDetector det = new nsDetector(2);
    Notifier not = new Notifier();
    det.Init(not);

    bool done = false;
    bool isAscii = true;
    byte[] buf = new byte[1024];

    // `using` closes the file on every path, including exceptions.
    using (FileStream fs = new FileStream(filePath, FileMode.Open, FileAccess.Read))
    {
        int len = fs.Read(buf, 0, buf.Length);

        // The detector does not recognise UTF-16, so check the BOM manually.
        if (len >= 2 && buf[0] == 0xFF && buf[1] == 0xFE)
        {
            det.DataEnd();
            return Encoding.Unicode;
        }

        // Same manual check for the big-endian UTF-16 BOM.
        if (len >= 2 && buf[0] == 0xFE && buf[1] == 0xFF)
        {
            det.DataEnd();
            return Encoding.BigEndianUnicode;
        }

        // Once `done` is true the detector is never fed again and isAscii is
        // already false, so the rest of the file does not need to be read.
        while (len > 0 && !done)
        {
            // Check if the stream is only ASCII so far.
            if (isAscii)
            {
                isAscii = det.isAscii(buf, len);
            }

            // Run the detector only on non-ASCII data.
            if (!isAscii)
            {
                done = det.DoIt(buf, len, false);
            }

            len = fs.Read(buf, 0, buf.Length);
        }
    }

    det.DataEnd();

    if (isAscii)
    {
        encodingFound = true;
        return Encoding.ASCII;
    }

    // No definite match was reported: fall back to the most probable charset.
    if (!encodingFound)
    {
        String[] prob = det.getProbableCharsets();
        encodingName = prob[0];
    }

    return Encoding.GetEncoding(encodingName);
}
/// <summary>
/// Command-line entry point: fetches the given URL, runs the response bytes
/// through an <c>nsDetector</c> and writes either the detected charset or all
/// probable charsets to standard output.
/// </summary>
/// <param name="argv">
/// One or two arguments: the URL to fetch, optionally followed by a numeric
/// language hint for the <c>nsDetector</c> constructor
/// (<c>nsPSMDetector.ALL</c> is used when the hint is absent).
/// </param>
public static void Main(String[] argv)
{
    if (argv.Length != 1 && argv.Length != 2)
    {
        Console.Out.WriteLine("Usage: HtmlCharsetDetector <url> [<languageHint>]");
        Console.Out.WriteLine("");
        Console.Out.WriteLine("Where <url> is http://...");
        Console.Out.WriteLine("For optional <languageHint>. Use following...");
        Console.Out.WriteLine(" 1 => Japanese");
        Console.Out.WriteLine(" 2 => Chinese");
        Console.Out.WriteLine(" 3 => Simplified Chinese");
        Console.Out.WriteLine(" 4 => Traditional Chinese");
        Console.Out.WriteLine(" 5 => Korean");
        Console.Out.WriteLine(" 6 => Dont know (default)");
        return;
    }

    // Initialize the detector with the requested language hint.
    int lang = (argv.Length == 2) ? Convert.ToInt32(argv[1]) : nsPSMDetector.ALL;
    nsDetector det = new nsDetector(lang);

    // Set an observer. In the Java original this was an anonymous inner class
    // whose Notify(charset) set `found` and printed "CHARSET = ..."; this port
    // passes a named Notifier instance instead.
    // NOTE(review): Notifier lives outside this block - confirm it sets `found`.
    Notifier not = new Notifier();
    det.Init(not);

    Uri url = new Uri(argv[0]);
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url.AbsoluteUri);

    HttpWebResponse imp;
    try
    {
        imp = (HttpWebResponse)req.GetResponse();
    }
    catch (System.Net.WebException we)
    {
        // The request failed (e.g. 404), so there is no response to read.
        // Bail out rather than hit a NullReferenceException further down.
        Console.Out.WriteLine("Web Request Error " + we.Message);
        return;
    }

    byte[] buf = new byte[1024];
    bool done = false;
    bool isAscii = true;

    // Release the connection and the stream on every exit path.
    using (imp)
    using (System.IO.Stream body = imp.GetResponseStream())
    {
        int len = body.Read(buf, 0, buf.Length);

        // After the detector reports completion nothing else consumes the
        // data (isAscii is already false), so stop downloading at that point.
        while (len > 0 && !done)
        {
            // Check if the stream is only ASCII so far.
            if (isAscii)
            {
                isAscii = det.isAscii(buf, len);
            }

            // Run the detector only on non-ASCII data.
            if (!isAscii)
            {
                done = det.DoIt(buf, len, false);
            }

            len = body.Read(buf, 0, buf.Length);
        }
    }

    det.DataEnd();

    if (isAscii)
    {
        Console.Out.WriteLine("CHARSET = ASCII");
        found = true;
    }

    // Nothing definite was reported: print every probable charset.
    if (!found)
    {
        String[] prob = det.getProbableCharsets();
        for (int i = 0; i < prob.Length; i++)
        {
            Console.Out.WriteLine("Probable Charset = " + prob[i]);
        }
    }
}