/// <summary>
/// Feeds the content of a stream to a charset detector and maps the
/// charsets it reports to code pages.
/// </summary>
/// <remarks>
/// The stream is read in 1024-byte chunks until it is exhausted or the
/// detector reports that it is finished. Afterwards the stream is
/// repositioned to its beginning, so it must support seeking.
/// </remarks>
/// <param name="stream">The stream whose content is examined.</param>
/// <returns>
/// The detected code pages; a zero-length array if the detector reported
/// no match.
/// </returns>
internal static int[] DetectCodePages(Stream stream)
{
    const int BUFFERSIZE = 1024;

    nsDetector detector = new nsDetector(nsPSMDetector.ALL);
    detector.Init(null);

    byte[] chunk = new byte[BUFFERSIZE];
    int bytesRead;

    // Keep feeding chunks until the stream ends or the detector says it is done.
    while ((bytesRead = stream.Read(chunk, 0, chunk.Length)) != 0)
    {
        if (detector.DoIt(chunk, bytesRead, false))
        {
            break;
        }
    }

    detector.Done();

    // Rewind so the caller can read the stream from the start.
    stream.Seek(0, SeekOrigin.Begin);

    string[] charsets = detector.getProbableCharsets();
    Logger.Info("[FileInputOutput] Detected encodings: {0}", String.Join(", ", charsets));

    // "nomatch" in the first slot is treated as "nothing detected".
    return charsets[0] == "nomatch"
        ? new int[0]
        : GetCodePages(charsets);
}
/// <summary>
/// Guesses the charset of a file from its first 1024 bytes and returns the
/// charset name.
/// </summary>
/// <remarks>
/// The result is also stored in the static
/// <c>CharsetDetector.DetectedCharset</c>, which starts out as "GB2312" on
/// every call. It becomes "ASCII" when the sampled bytes are pure ASCII, and
/// is forced back to "GB2312" when the file text contains an HTML meta tag
/// declaring gb2312.
/// NOTE(review): the <c>Notifier</c> passed to the detector presumably
/// updates <c>DetectedCharset</c> when a charset is recognised - confirm.
/// </remarks>
/// <param name="filePath">Path of the file to inspect.</param>
/// <returns>The value of <c>CharsetDetector.DetectedCharset</c> after detection.</returns>
public static string DetectChineseCharset(string filePath)
{
    CharsetDetector.DetectedCharset = "GB2312";

    // NOTE(review): 3 is presumably a language-filter constant of the detector - confirm.
    nsDetector detector = new nsDetector(3);
    Notifier aObserver = new Notifier();
    detector.Init(aObserver);

    byte[] array = new byte[1024];
    int aLen;

    // Dispose the stream deterministically; it was previously left open,
    // which leaked the file handle until finalization.
    using (FileStream input = File.OpenRead(filePath))
    {
        aLen = input.Read(array, 0, array.Length);
    }

    bool flag = detector.isAscii(array, aLen);
    if (!flag)
    {
        detector.DoIt(array, aLen, false);
    }

    detector.DataEnd();

    if (flag)
    {
        CharsetDetector.DetectedCharset = "ASCII";
    }

    // An explicit gb2312 declaration in the document overrides the detector.
    if (File.ReadAllText(filePath).Contains("CONTENT=\"text/html; charset=gb2312\""))
    {
        CharsetDetector.DetectedCharset = "GB2312";
    }

    return CharsetDetector.DetectedCharset;
}
/// <summary>
/// Runs the charset detector over the buffered input and returns the
/// encoding it settled on, if that encoding is acceptable.
/// </summary>
/// <remarks>
/// NOTE(review): <c>returnValue</c> is not assigned here; presumably the
/// detector sets it through a callback on this object (passed to
/// <c>Init</c>) - confirm.
/// </remarks>
/// <returns>
/// <c>returnValue</c> when it is non-null, is not WINDOWS1252 and is an
/// ASCII superset; otherwise <c>null</c>.
/// </returns>
public Encoding sniff()
{
    nsDetector probe = new nsDetector(nsPSMDetector.ALL);
    probe.Init(this);
    probe.DoIt(source, length, false);
    probe.DataEnd();

    // Reject the result step by step, in the same order as the combined check.
    if (returnValue == null)
    {
        return null;
    }

    if (returnValue == Encoding.WINDOWS1252)
    {
        return null;
    }

    if (!returnValue.isAsciiSuperset())
    {
        return null;
    }

    return returnValue;
}
/// <summary>
/// Detects the text encoding of a file by streaming its content through a
/// charset detector.
/// </summary>
/// <remarks>
/// A file starting with the bytes FF FE is reported as
/// <see cref="Encoding.Unicode"/> without consulting the detector. A file
/// containing only ASCII bytes is reported as <see cref="Encoding.ASCII"/>.
/// Otherwise the encoding is looked up by charset name.
/// NOTE(review): <c>encodingFound</c> and <c>encodingName</c> are not
/// declared in this method; presumably they are static members that the
/// <c>Notifier</c> fills in - confirm, and confirm they are reset between
/// calls.
/// </remarks>
/// <param name="filePath">Path of the file to inspect.</param>
/// <returns>The detected encoding.</returns>
public static Encoding DetectEncoding(string filePath)
{
    // NOTE(review): 2 is presumably a language-filter constant of the detector - confirm.
    var det = new nsDetector(2);
    var not = new Notifier();
    det.Init(not);

    var startsWithUtf16LeBom = false;
    var isAscii = true;

    // "using" guarantees the file handle is released even if reading or
    // detection throws; the stream was previously closed by hand only on
    // the success paths.
    using (var fs = new FileStream(filePath, FileMode.Open, FileAccess.Read))
    {
        var buf = new byte[1024];
        int len = fs.Read(buf, 0, buf.Length);

        // For some reason NCharDet can't detect Unicode.
        // Manually detect Unicode here.
        if (len >= 2 && buf[0] == 0xFF && buf[1] == 0xFE)
        {
            startsWithUtf16LeBom = true;
        }
        else
        {
            var done = false;
            while (len > 0)
            {
                // Check if the stream is only ASCII.
                if (isAscii)
                {
                    isAscii = det.isAscii(buf, len);
                }

                // DoIt if non-ASCII and not done yet.
                if (!isAscii && !done)
                {
                    done = det.DoIt(buf, len, false);
                }

                len = fs.Read(buf, 0, buf.Length);
            }
        }
    }

    // As before, the detector is finalized after the file has been closed.
    det.DataEnd();

    if (startsWithUtf16LeBom)
    {
        return Encoding.Unicode;
    }

    Encoding encoding = null;

    if (isAscii)
    {
        encodingFound = true;
        encoding = Encoding.ASCII;
    }

    if (!encodingFound)
    {
        // Nothing was reported as a definite match: take the detector's first guess.
        string[] prob = det.getProbableCharsets();
        encodingName = prob[0];
    }

    if (encoding == null)
    {
        encoding = Encoding.GetEncoding(encodingName);
    }

    return encoding;
}
/// <summary>
/// Charset name chosen by the auto-detection step of <c>TransEncoding</c>
/// (original comment: "automatically detected file type").
/// </summary>
public static string detEncoding = string.Empty;

/// <summary>
/// Reads a text file in its source encoding and writes it to another file
/// in the destination encoding.
/// </summary>
/// <remarks>
/// With <paramref name="IsAutoDet"/> set, the source file is streamed
/// through a charset detector and the detected charset replaces
/// <paramref name="srcEncoding"/>; the chosen name is stored in the static
/// <c>detEncoding</c>.
/// NOTE(review): <c>found</c> is not declared in this method; presumably it
/// is a member that the <c>Notifier</c> sets (together with
/// <c>detEncoding</c>) when the detector reports a match - confirm.
/// </remarks>
/// <param name="srcFullName">Path of the file to read.</param>
/// <param name="dstFullName">Path of the file to write.</param>
/// <param name="dstEncoding">Name of the encoding used for the output file.</param>
/// <param name="IsAutoDet">Whether to detect the source encoding from the file content.</param>
/// <param name="srcEncoding">
/// Name of the source encoding; with auto-detection it is only the fallback
/// when the detector offers no usable charset. It is resolved up front in
/// either case, so it must always be a valid encoding name.
/// </param>
public void TransEncoding(string srcFullName, string dstFullName, string dstEncoding = "utf8", bool IsAutoDet = true, string srcEncoding = "gb2312")
{
    Encoding Edst = Encoding.GetEncoding(dstEncoding);
    Encoding Esrc = Encoding.GetEncoding(srcEncoding);

    if (IsAutoDet)
    {
        // --- Detect the source encoding ---
        nsDetector det = new nsDetector();
        Notifier not = new Notifier();
        det.Init(not);

        byte[] buf = new byte[1024];
        bool done = false;
        bool isAscii = true;

        // "using" releases the file handle even if reading or detection
        // throws; the stream was previously closed by hand only on success.
        using (FileStream fs = File.OpenRead(srcFullName))
        {
            int len = fs.Read(buf, 0, buf.Length);
            while (len > 0)
            {
                // Check if the stream is only ASCII.
                if (isAscii)
                {
                    isAscii = det.isAscii(buf, len);
                }

                // DoIt if non-ASCII and not done yet.
                if (!isAscii && !done)
                {
                    done = det.DoIt(buf, len, false);
                }

                len = fs.Read(buf, 0, buf.Length);
            }

            det.DataEnd();
        }

        if (isAscii)
        {
            found = true;
            detEncoding = "ASCII";
        }

        if (!found)
        {
            string[] prob = det.getProbableCharsets();

            // "nomatch" is presumably the detector's placeholder for "no
            // charset recognised" (confirm against the detector library).
            // It is never a valid encoding name, so use the caller's
            // fallback instead of letting Encoding.GetEncoding throw.
            if (prob.Length > 0 && prob[0] != "nomatch")
            {
                detEncoding = prob[0];
            }
            else
            {
                detEncoding = srcEncoding;
            }
        }

        Esrc = Encoding.GetEncoding(detEncoding);
    }

    // --- Convert the encoding ---
    string strAll = File.ReadAllText(srcFullName, Esrc);
    File.WriteAllText(dstFullName, strAll, Edst);
}