/// <summary>Detects the possible code pages by reading a specified stream.</summary>
/// <remarks>
/// The stream is consumed from its current position in 1 KiB chunks until the detector
/// reports that it is finished or the stream is exhausted. Afterwards the stream is
/// rewound to its beginning via <c>Seek(0, SeekOrigin.Begin)</c>.
/// </remarks>
/// <param name="stream">The stream to analyse. It is seeked after reading, so it presumably has to be seekable.</param>
/// <returns>The detected code pages, which will have zero length if none was detected.</returns>
internal static int[] DetectCodePages(Stream stream)
{
    const int BufferSize = 1024;

    nsDetector detector = new nsDetector(nsPSMDetector.ALL);
    detector.Init(null); // no observer: results are pulled via getProbableCharsets() below

    byte[] buffer = new byte[BufferSize];
    bool finished = false;
    while (!finished)
    {
        int readLength = stream.Read(buffer, 0, buffer.Length);
        if (readLength == 0)
        {
            // End of stream reached before the detector made up its mind.
            break;
        }

        finished = detector.DoIt(buffer, readLength, false);
    }

    detector.Done();
    stream.Seek(0, SeekOrigin.Begin);

    string[] detectedEncodings = detector.getProbableCharsets();
    VerboseConsole.WriteLine(DetectedEncodingsToString(detectedEncodings));

    // "nomatch" is the marker this code treats as "nothing detected". The length check
    // keeps an empty result from throwing on the [0] access.
    if (detectedEncodings.Length == 0 || detectedEncodings[0] == "nomatch")
    {
        return new int[0];
    }

    return GetCodePages(detectedEncodings);
}
/// <summary>Name of the source encoding chosen by the most recent auto-detection in <see cref="TransEncoding"/>.</summary>
public static string detEncoding = string.Empty;

/// <summary>
/// Reads a text file in its source encoding and writes it out again in the destination encoding.
/// </summary>
/// <param name="srcFullName">Path of the file to read.</param>
/// <param name="dstFullName">Path of the file to write.</param>
/// <param name="dstEncoding">Name of the encoding to write with; passed to <c>Encoding.GetEncoding</c>.</param>
/// <param name="IsAutoDet">
/// When true, the source encoding is detected from the file content and
/// <paramref name="srcEncoding"/> is only used as the fallback when nothing is detected.
/// </param>
/// <param name="srcEncoding">
/// Name of the source encoding. It is always resolved up front, so an invalid name throws
/// even when auto-detection is enabled.
/// </param>
public void TransEncoding(string srcFullName, string dstFullName, string dstEncoding = "utf8", bool IsAutoDet = true, string srcEncoding = "gb2312")
{
    Encoding Edst = Encoding.GetEncoding(dstEncoding);
    Encoding Esrc = Encoding.GetEncoding(srcEncoding);

    if (IsAutoDet)
    {
        // --- Detect the source encoding ---

        // `found` lives outside this method, so a value left over from an earlier call
        // would make us skip the probable-charset lookup below. Start from a clean state.
        found = false;

        nsDetector det = new nsDetector();
        Notifier not = new Notifier();
        det.Init(not);

        byte[] buf = new byte[1024];
        bool done = false;
        bool isAscii = true;

        // `using` guarantees the file handle is released even if reading or detection throws.
        using (FileStream fs = File.OpenRead(srcFullName))
        {
            int len = fs.Read(buf, 0, buf.Length);
            while (len > 0)
            {
                // Check if the stream is only ASCII.
                if (isAscii)
                {
                    isAscii = det.isAscii(buf, len);
                }

                // Feed the detector once non-ASCII data has been seen and it is not done yet.
                if (!isAscii && !done)
                {
                    done = det.DoIt(buf, len, false);
                }

                len = fs.Read(buf, 0, buf.Length);
            }
        }

        det.DataEnd();

        if (isAscii)
        {
            found = true;
            detEncoding = "ASCII";
        }

        if (!found)
        {
            string[] prob = det.getProbableCharsets();

            // "nomatch" is presumably the detector's placeholder for "nothing detected";
            // it is not an encoding name, so handing it to Encoding.GetEncoding would throw.
            // Treat it like an empty result and fall back to the caller-supplied encoding.
            if (prob.Length > 0 && prob[0] != "nomatch")
            {
                detEncoding = prob[0];
            }
            else
            {
                detEncoding = srcEncoding;
            }
        }

        Esrc = Encoding.GetEncoding(detEncoding);
    }

    // --- Convert the encoding ---
    string strAll = File.ReadAllText(srcFullName, Esrc);
    File.WriteAllText(dstFullName, strAll, Edst);
}
/// <summary>
/// Detects the text encoding of a file by inspecting its content.
/// </summary>
/// <param name="filePath">Path of the file to inspect; it is opened read-only.</param>
/// <returns>
/// <c>Encoding.Unicode</c> or <c>Encoding.BigEndianUnicode</c> when the file starts with the
/// corresponding UTF-16 byte order mark, <c>Encoding.ASCII</c> when every byte is ASCII,
/// otherwise the encoding named by the detector. When the detector reports nothing usable,
/// <c>Encoding.Default</c> is returned.
/// </returns>
public static Encoding DetectEncoding(string filePath)
{
    // encodingFound lives outside this method; a value left over from a previous call
    // would make us skip the probable-charset lookup and reuse a stale encodingName.
    encodingFound = false;

    Encoding encoding = null;
    nsDetector det = new nsDetector(2); // NOTE(review): 2 is presumably the language hint - confirm
    Notifier not = new Notifier();
    det.Init(not);

    bool done = false;
    bool isAscii = true;
    byte[] buf = new byte[1024];

    // `using` guarantees the file handle is released even if reading or detection throws.
    using (FileStream fs = new FileStream(filePath, FileMode.Open, FileAccess.Read))
    {
        int len = fs.Read(buf, 0, buf.Length);

        // For some reason NCharDet can't detect Unicode, so UTF-16 is recognised
        // manually from its byte order mark.
        if (len >= 2 && buf[0] == 0xFF && buf[1] == 0xFE)
        {
            det.DataEnd();
            return Encoding.Unicode;
        }

        if (len >= 2 && buf[0] == 0xFE && buf[1] == 0xFF)
        {
            det.DataEnd();
            return Encoding.BigEndianUnicode;
        }

        while (len > 0)
        {
            // Check if the stream is only ASCII.
            if (isAscii)
            {
                isAscii = det.isAscii(buf, len);
            }

            // Feed the detector once non-ASCII data has been seen and it is not done yet.
            if (!isAscii && !done)
            {
                done = det.DoIt(buf, len, false);
            }

            len = fs.Read(buf, 0, buf.Length);
        }
    }

    det.DataEnd();

    if (isAscii)
    {
        encodingFound = true;
        encoding = Encoding.ASCII;
    }

    if (!encodingFound)
    {
        String[] prob = det.getProbableCharsets();

        // "nomatch" is presumably the detector's placeholder for "nothing detected";
        // it is not an encoding name, so Encoding.GetEncoding would throw on it.
        if (prob.Length == 0 || prob[0] == "nomatch")
        {
            return Encoding.Default;
        }

        encodingName = prob[0];
    }

    if (encoding == null)
    {
        // Either set just above, or (presumably) by the Notifier when it flagged a match.
        encoding = Encoding.GetEncoding(encodingName);
    }

    return encoding;
}
/// <summary>
/// Command-line entry point: downloads the given URL and prints the detected charset(s).
/// </summary>
/// <param name="argv">
/// <c>argv[0]</c> is the URL to fetch; the optional <c>argv[1]</c> is a numeric language hint
/// (see the usage text). Any other argument count prints the usage text and returns.
/// </param>
public static void Main(String[] argv)
{
    if (argv.Length != 1 && argv.Length != 2)
    {
        Console.Out.WriteLine("Usage: HtmlCharsetDetector <url> [<languageHint>]");
        Console.Out.WriteLine("");
        Console.Out.WriteLine("Where <url> is http://...");
        Console.Out.WriteLine("For optional <languageHint>. Use following...");
        Console.Out.WriteLine(" 1 => Japanese");
        Console.Out.WriteLine(" 2 => Chinese");
        Console.Out.WriteLine(" 3 => Simplified Chinese");
        Console.Out.WriteLine(" 4 => Traditional Chinese");
        Console.Out.WriteLine(" 5 => Korean");
        Console.Out.WriteLine(" 6 => Dont know (default)");
        return;
    }

    // Initialize the nsDetector with the language hint, or ALL when none was given.
    int lang = (argv.Length == 2) ? Convert.ToInt32(argv[1]) : nsPSMDetector.ALL;
    nsDetector det = new nsDetector(lang);

    // Set an observer. Its Notify() is called when a matching charset is found; the
    // Notifier class (not visible here) is expected to set `found` and report the charset.
    Notifier not = new Notifier();
    det.Init(not);

    Uri url = new Uri(argv[0]);
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url.AbsoluteUri);
    System.Net.HttpWebResponse imp;
    try
    {
        imp = (HttpWebResponse)req.GetResponse();
    }
    catch (System.Net.WebException we)
    {
        // Remote URL not found (404?) or another request failure. Without a response
        // there is nothing to analyse, so stop instead of dereferencing a null response.
        Console.Out.WriteLine("Web Request Error " + we.Message);
        return;
    }

    byte[] buf = new byte[1024];
    bool done = false;
    bool isAscii = true;

    // Dispose the response and its stream when done; fetch the stream once instead of per read.
    using (imp)
    using (System.IO.Stream body = imp.GetResponseStream())
    {
        int len = body.Read(buf, 0, buf.Length);
        while (len > 0)
        {
            // Check if the stream is only ASCII.
            if (isAscii)
            {
                isAscii = det.isAscii(buf, len);
            }

            // Feed the detector once non-ASCII data has been seen and it is not done yet.
            if (!isAscii && !done)
            {
                done = det.DoIt(buf, len, false);
            }

            len = body.Read(buf, 0, buf.Length);
        }
    }

    det.DataEnd();

    if (isAscii)
    {
        Console.Out.WriteLine("CHARSET = ASCII");
        found = true;
    }

    if (!found)
    {
        // No definite match was flagged: list every candidate the detector still considers.
        String[] prob = det.getProbableCharsets();
        for (int i = 0; i < prob.Length; i++)
        {
            Console.Out.WriteLine("Probable Charset = " + prob[i]);
        }
    }
}
/// <summary>
/// Command-line entry point: fetches a URL and reports which charset its content appears to use.
/// </summary>
/// <param name="argv">
/// One or two arguments: the URL to download and, optionally, a numeric language hint
/// (listed in the usage text). With any other argument count the usage text is printed.
/// </param>
public static void Main(String[] argv)
{
    if (argv.Length != 1 && argv.Length != 2)
    {
        Console.Out.WriteLine("Usage: HtmlCharsetDetector <url> [<languageHint>]");
        Console.Out.WriteLine("");
        Console.Out.WriteLine("Where <url> is http://...");
        Console.Out.WriteLine("For optional <languageHint>. Use following...");
        Console.Out.WriteLine(" 1 => Japanese");
        Console.Out.WriteLine(" 2 => Chinese");
        Console.Out.WriteLine(" 3 => Simplified Chinese");
        Console.Out.WriteLine(" 4 => Traditional Chinese");
        Console.Out.WriteLine(" 5 => Korean");
        Console.Out.WriteLine(" 6 => Dont know (default)");
        return;
    }

    // Initialize the nsDetector; without a hint every language is considered.
    int lang = (argv.Length == 2) ? Convert.ToInt32(argv[1]) : nsPSMDetector.ALL;
    nsDetector det = new nsDetector(lang);

    // Register the observer whose Notify() is invoked when a matching charset is found.
    // The Notifier class is not visible here; it is expected to set `found`.
    Notifier not = new Notifier();
    det.Init(not);

    Uri url = new Uri(argv[0]);
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url.AbsoluteUri);
    System.Net.HttpWebResponse imp;
    try
    {
        imp = (HttpWebResponse)req.GetResponse();
    }
    catch (System.Net.WebException we)
    {
        // The request failed (e.g. 404), so there is no response to read. Bail out here
        // rather than continuing with a null response.
        Console.Out.WriteLine("Web Request Error " + we.Message);
        return;
    }

    byte[] buf = new byte[1024];
    bool done = false;
    bool isAscii = true;

    // The response and its stream are released when the block exits, even on error.
    using (imp)
    using (System.IO.Stream content = imp.GetResponseStream())
    {
        for (int len = content.Read(buf, 0, buf.Length);
             len > 0;
             len = content.Read(buf, 0, buf.Length))
        {
            // Check if the stream is only ASCII.
            if (isAscii)
            {
                isAscii = det.isAscii(buf, len);
            }

            // Run the detector on non-ASCII data until it reports that it is done.
            if (!isAscii && !done)
            {
                done = det.DoIt(buf, len, false);
            }
        }
    }

    det.DataEnd();

    if (isAscii)
    {
        Console.Out.WriteLine("CHARSET = ASCII");
        found = true;
    }

    if (!found)
    {
        // Nothing was flagged as a definite match: print all remaining candidates.
        String[] prob = det.getProbableCharsets();
        for (int i = 0; i < prob.Length; i++)
        {
            Console.Out.WriteLine("Probable Charset = " + prob[i]);
        }
    }
}