/// <summary>
/// Chooses the <see cref="Encoding"/> to use for a file from what
/// <c>TextEncodingDetect</c> reports about the file's bytes.
/// </summary>
/// <param name="pFilePath">Path of the file to inspect. The whole file is read into memory.</param>
/// <returns>
/// <see cref="Encoding.UTF8"/> when the detector reports UTF-8 (with or without a BOM);
/// <see cref="Encoding.Default"/> for every other detection result.
/// </returns>
public static Encoding DetectTextFileEncoding(string pFilePath)
{
    TextEncodingDetect detector = new TextEncodingDetect();

    byte[] fileBytes = File.ReadAllBytes(pFilePath);
    TextEncodingDetect.Encoding detected = detector.DetectEncoding(fileBytes, fileBytes.Length);

    bool isUtf8 = detected == TextEncodingDetect.Encoding.Utf8Bom
                  || detected == TextEncodingDetect.Encoding.Utf8Nobom;

    // Everything that is not UTF-8 falls back to the system default code page.
    return isUtf8 ? Encoding.UTF8 : Encoding.Default;
}
/// <summary>
/// Command-line entry point: reads the file named by the single argument and
/// prints the encoding that <c>TextEncodingDetect</c> reports for it.
/// </summary>
/// <param name="args">Command-line arguments; exactly one (the file name) is expected.</param>
/// <returns>1 when the argument count is wrong or the file cannot be read; 0 otherwise.</returns>
public static int Main(string[] args)
{
    Console.WriteLine();

    if (args.Length != 1)
    {
        Console.WriteLine("Usage: TextEncodingDetect.exe <filename>");
        return 1;
    }

    // Load the raw bytes of the file.
    byte[] fileBytes;
    try
    {
        fileBytes = File.ReadAllBytes(args[0]);
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
        return 1;
    }

    // Run the detector over the complete content.
    TextEncodingDetect detector = new TextEncodingDetect();
    TextEncodingDetect.Encoding detected = detector.DetectEncoding(fileBytes, fileBytes.Length);

    Console.Write("Encoding: ");

    // A value not listed below prints nothing after the "Encoding: " prefix.
    switch (detected)
    {
        case TextEncodingDetect.Encoding.None:
            Console.WriteLine("Binary");
            break;

        case TextEncodingDetect.Encoding.Ascii:
            Console.WriteLine("ASCII (chars in the 0-127 range)");
            break;

        case TextEncodingDetect.Encoding.Ansi:
            Console.WriteLine("ANSI (chars in the range 0-255 range)");
            break;

        case TextEncodingDetect.Encoding.Utf8Bom:
        case TextEncodingDetect.Encoding.Utf8Nobom:
            Console.WriteLine("UTF-8");
            break;

        case TextEncodingDetect.Encoding.Utf16LeBom:
        case TextEncodingDetect.Encoding.Utf16LeNoBom:
            Console.WriteLine("UTF-16 Little Endian");
            break;

        case TextEncodingDetect.Encoding.Utf16BeBom:
        case TextEncodingDetect.Encoding.Utf16BeNoBom:
            Console.WriteLine("UTF-16 Big Endian");
            break;
    }

    return 0;
}
/// <summary>
/// Click handler that scans every file in the selected module's <c>SQL</c> folder and
/// reports the first one the detector classifies as UTF-8 with a BOM.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button7_Click(object sender, EventArgs e)
{
    // A selected index of 0 or below is treated as "no module chosen".
    if (!(comboBoxEdit1.SelectedIndex > 0))
    {
        MessageBox.Show("請先選擇模組!");
        return;
    }

    string moduleName = comboBoxEdit1.Properties.Items[comboBoxEdit1.SelectedIndex].ToString();
    string sqlFolder = s_folderpath + "\\" + moduleName + "\\SQL\\";
    string[] sqlFiles = Directory.GetFiles(sqlFolder);

    foreach (string sqlFile in sqlFiles)
    {
        byte[] content = File.ReadAllBytes(sqlFile);
        TextEncodingDetect detector = new TextEncodingDetect();
        TextEncodingDetect.Encoding detected = detector.DetectEncoding(content, content.Length);

        if (detected != TextEncodingDetect.Encoding.UTF8_BOM)
        {
            continue;
        }

        // Stop at the first offending file and tell the user which one it is.
        MessageBox.Show(sqlFile + " 是UTF8格式,請轉成無BOM格式!");
        return;
    }

    MessageBox.Show("檢查完成,全部都是UTF8無BOM格式");
}
// Detects the encoding of a text file.
/*
 * Byte-order marks recognised:
 * UTF-8: EF BB BF
 * UTF-16 big-endian byte order: FE FF
 * UTF-16 little-endian byte order: FF FE
 * UTF-32 big-endian byte order: 00 00 FE FF
 * UTF-32 little-endian byte order: FF FE 00 00
 * */
/// <summary>
/// Determines a text file's encoding from its byte-order mark, falling back to
/// <c>TextEncodingDetect</c> to recognise UTF-8 without a BOM.
/// </summary>
/// <param name="strFilename">Path of the file to inspect. It is opened read-only with read/write sharing.</param>
/// <param name="default_encoding">Value returned when nothing is recognised or the file cannot be read.</param>
/// <returns>
/// The encoding implied by the BOM, <see cref="Encoding.UTF8"/> when the detector reports UTF-8,
/// otherwise <paramref name="default_encoding"/>.
/// </returns>
public static Encoding DetectTextFileEncoding(string strFilename, Encoding default_encoding)
{
    try
    {
        using (FileStream file = File.Open(
            strFilename,
            FileMode.Open,
            FileAccess.Read,
            FileShare.ReadWrite))
        {
            // Read the leading chunk once. Stream.Read may return fewer bytes than
            // requested, so keep reading until the buffer is full or the file ends.
            byte[] head = new byte[4096];
            int length = 0;
            while (length < head.Length)
            {
                int read = file.Read(head, length, head.Length - length);
                if (read <= 0)
                {
                    break;
                }
                length += read;
            }

            // The 4-byte marks must be tested before the 2-byte ones: the UTF-32 LE mark
            // (FF FE 00 00) begins with the UTF-16 LE mark (FF FE) and would otherwise
            // never be reached.
            if (length >= 4)
            {
                if (head[0] == 0xff && head[1] == 0xfe && head[2] == 0x00 && head[3] == 0x00)
                {
                    return Encoding.UTF32; // UTF-32 little-endian
                }

                if (head[0] == 0x00 && head[1] == 0x00 && head[2] == 0xfe && head[3] == 0xff)
                {
                    return new UTF32Encoding(true, true); // UTF-32 big-endian, with BOM
                }
            }

            if (length >= 3 && head[0] == 0xef && head[1] == 0xbb && head[2] == 0xbf)
            {
                return Encoding.UTF8;
            }

            if (length >= 2)
            {
                if (head[0] == 0xff && head[1] == 0xfe)
                {
                    return Encoding.Unicode; // UTF-16 little-endian
                }

                if (head[0] == 0xfe && head[1] == 0xff)
                {
                    return Encoding.BigEndianUnicode;
                }
            }

            // 2018/11/6
            // No BOM found: check whether the content is UTF-8 without a BOM.
            TextEncodingDetect detector = new TextEncodingDetect();
            TextEncodingDetect.Encoding encoding = detector.DetectEncoding(head, length);
            switch (encoding)
            {
                case TextEncodingDetect.Encoding.Utf8Bom:
                case TextEncodingDetect.Encoding.Utf8Nobom:
                    return Encoding.UTF8;
            }
        }
    }
    catch
    {
        // Deliberate best effort: any failure to open or read the file
        // results in the caller-supplied default being returned.
    }

    return default_encoding; // default
}
/// <summary>
/// Reads a file, detects its encoding with <c>TextEncodingDetect</c>, decodes it accordingly
/// and collects line-break counts into a <c>FileStateModel</c>.
/// </summary>
/// <param name="filename">Path of the file to analyse. The whole file is read into memory.</param>
/// <returns>
/// A model holding the detected encoding plus three counts: <c>Lines</c> from
/// <c>LineBreakCount(str)</c>, <c>CRLFs</c> from <c>LineBreakCount(str, "\r\n")</c>, and
/// <c>LFs</c> as their difference. All counts are 0 when the detector reports <c>None</c>.
/// </returns>
/// <exception cref="Exception">
/// Any exception raised by <see cref="File.ReadAllBytes(string)"/> is written to the console
/// and rethrown unchanged.
/// </exception>
public static FileStateModel GetFileState(string filename)
{
    // Read in the file in binary
    byte[] buffer;
    try
    {
        buffer = File.ReadAllBytes(filename);
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
        throw; // plain rethrow keeps the original stack trace ("throw ex;" would reset it)
    }

    // Detect encoding
    var textDetect = new TextEncodingDetect();
    TextEncodingDetect.Encoding encoding = textDetect.DetectEncoding(buffer, buffer.Length);

    // Decode with the matching encoding; the text stays empty for None
    // and for any value not handled below.
    string str = "";
    if (encoding == TextEncodingDetect.Encoding.Ascii)
    {
        str = Encoding.ASCII.GetString(buffer);
    }
    else if (encoding == TextEncodingDetect.Encoding.Ansi)
    {
        str = Encoding.Default.GetString(buffer);
    }
    else if (encoding == TextEncodingDetect.Encoding.Utf8Bom || encoding == TextEncodingDetect.Encoding.Utf8Nobom)
    {
        str = Encoding.UTF8.GetString(buffer);
    }
    else if (encoding == TextEncodingDetect.Encoding.Utf16LeBom || encoding == TextEncodingDetect.Encoding.Utf16LeNoBom)
    {
        str = Encoding.Unicode.GetString(buffer);
    }
    else if (encoding == TextEncodingDetect.Encoding.Utf16BeBom || encoding == TextEncodingDetect.Encoding.Utf16BeNoBom)
    {
        str = Encoding.BigEndianUnicode.GetString(buffer);
    }

    int All_Lines = 0;
    int CRLF_Count = 0;
    int LF_Count = 0;

    if (encoding != TextEncodingDetect.Encoding.None)
    {
        All_Lines = LineBreakCount(str);
        CRLF_Count = LineBreakCount(str, new[] { "\r\n" });
        // Whatever is not a CRLF is reported as LF.
        LF_Count = All_Lines - CRLF_Count;
    }

    return new FileStateModel()
    {
        Encoding = encoding,
        Lines = All_Lines,
        CRLFs = CRLF_Count,
        LFs = LF_Count
    };
}
/// <summary>
/// Decides whether the segment <c>buffer[offset .. offset + count)</c> looks like text.
/// </summary>
/// <remarks>
/// Stage 1 accepts anything starting with a UTF-8 or UTF-16 BOM. Stage 2 rejects data that
/// cannot be decoded with the default ANSI code page. Stage 3 asks <c>TextEncodingDetect</c>
/// and rejects data it classifies as binary or as any Unicode flavour (Unicode text with a BOM
/// has already been accepted in stage 1; Unicode text without a BOM is treated as binary).
/// </remarks>
/// <param name="buffer">Buffer containing the bytes to examine.</param>
/// <param name="offset">Index of the first byte of the segment.</param>
/// <param name="count">Number of bytes in the segment.</param>
/// <returns><c>true</c> if the segment is considered text; otherwise <c>false</c>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="offset"/> or <paramref name="count"/> is negative, or the segment does not fit in
/// <paramref name="buffer"/>.
/// </exception>
public static bool IsText(byte[] buffer, int offset, int count)
{
    if (buffer == null)
    {
        throw new ArgumentNullException(nameof(buffer));
    }
    if (offset < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(offset));
    }
    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count));
    }
    // Written as a subtraction so that offset + count cannot overflow.
    if (buffer.Length - offset < count)
    {
        throw new ArgumentOutOfRangeException(nameof(buffer));
    }

    // [Stage 1] Contains unicode BOM -> text
    // The guards compare against count (the segment length), not offset + count,
    // so a short segment at a non-zero offset is never read past its end.
    if (3 <= count &&
        buffer[offset] == Utf8Bom[0] && buffer[offset + 1] == Utf8Bom[1] && buffer[offset + 2] == Utf8Bom[2])
    {
        return true;
    }
    if (2 <= count)
    {
        if (buffer[offset] == Utf16LeBom[0] && buffer[offset + 1] == Utf16LeBom[1])
        {
            return true;
        }
        if (buffer[offset] == Utf16BeBom[0] && buffer[offset + 1] == Utf16BeBom[1])
        {
            return true;
        }
    }

    // [Stage 2] Check if a chunk can be decoded as system default ANSI locale.
    // Many multibyte encodings have 'unused area'. If a file contains one of these area, treat it as a binary.
    // Ex) EUC-KR's layout : https://en.wikipedia.org/wiki/CP949#/media/File:Unified_Hangul_Code.svg
    bool isText = true;
    Encoding ansiEnc = Encoding.GetEncoding(DefaultAnsi.CodePage, new EncoderExceptionFallback(), new DecoderExceptionFallback());
    try
    {
        // ReSharper disable once ReturnValueOfPureMethodIsNotUsed
        ansiEnc.GetChars(buffer, offset, count);
    }
    catch (DecoderFallbackException)
    {
        // Failure
        isText = false;
    }

    // [Stage 3]
    // Problem: Some encodings make use of 128-255 area, so every byte is valid. (e.g. Windows-1252 / CP437)
    // To counter this issue, if the data seems to be text, check again with AutoIt.Common.TextEncodingDetect
    if (isText)
    {
        TextEncodingDetect detect = new TextEncodingDetect();

        // The detector is handed a buffer whose segment starts at index 0.
        byte[] idxZeroBuffer;
        if (offset == 0)
        {
            idxZeroBuffer = buffer;
        }
        else
        {
            idxZeroBuffer = new byte[count];
            Array.Copy(buffer, offset, idxZeroBuffer, 0, count);
        }

        // Pass count rather than idxZeroBuffer.Length: when offset is 0 the original buffer
        // is reused and may be longer than the requested segment.
        switch (detect.DetectEncoding(idxZeroBuffer, count))
        {
            // Binary
            case TextEncodingDetect.Encoding.None:
            // PEBakery mandates unicode text to have BOM.
            // They must have been filtered out in stage 1.
            case TextEncodingDetect.Encoding.Utf16LeBom:
            case TextEncodingDetect.Encoding.Utf16BeBom:
            case TextEncodingDetect.Encoding.Utf8Bom:
            // Treat unicode text file without a BOM as a binary.
            case TextEncodingDetect.Encoding.Utf16LeNoBom:
            case TextEncodingDetect.Encoding.Utf16BeNoBom:
            case TextEncodingDetect.Encoding.Utf8NoBom:
                isText = false;
                break;
        }
    }

    return isText;
}