/// <summary>
/// Collects encoding information about a file: whether it starts with a byte
/// order mark, the character set guessed by the Ude charset detector, and the
/// encoding a <see cref="StreamReader"/> selects for it.
/// </summary>
/// <param name="sourceFile">Path of the file to inspect.</param>
/// <returns>A TextEncodingMetadata object</returns>
public static TextEncodingMetadata GetFileEncoding(string sourceFile)
{
    var metaData = new TextEncodingMetadata();

    // Step 1: byte order mark presence, taken from the raw leading bytes.
    EncodingMetaInfo metaInfo = DetectBom(sourceFile);
    metaData.HasBom = metaInfo.HasBom;

    // Step 2: statistical charset detection over the whole file content.
    using (FileStream fs = File.OpenRead(sourceFile))
    {
        var cdet = new Ude.CharsetDetector();
        cdet.Feed(fs);
        cdet.DataEnd();

        if (cdet.Charset != null)
        {
            metaData.CharacterSet = cdet.Charset;
            metaData.DetectionConfidence = cdet.Confidence;
        }
        else
        {
            // CharacterSet and DetectionConfidence keep whatever values
            // TextEncodingMetadata initialises them with.
            Console.WriteLine("Detection failed.");
        }
    }

    // Step 3: the encoding StreamReader picks for the file.
    using (var sr = new StreamReader(sourceFile))
    {
        // StreamReader only examines the byte order mark on its first read;
        // before that CurrentEncoding is always the UTF-8 default. Peek()
        // triggers the detection without consuming any characters.
        sr.Peek();
        metaData.FileEncoding = sr.CurrentEncoding;
    }

    return metaData;
}
/// <summary>
/// Detects the byte order mark (BOM) at the start of a file, if any.
/// </summary>
/// <param name="filePath">The file path.</param>
/// <returns>
/// An EncodingMetaInfo object. When a known BOM is found, HasBom is set to
/// true and TextEncoding to the matching encoding; when none is found neither
/// property is assigned by this method.
/// </returns>
public static EncodingMetaInfo DetectBom(string filePath)
{
    var metaInfo = new EncodingMetaInfo();

    // The longest BOM recognised below is four bytes.
    var buffer = new byte[4];
    int bytesRead = 0;

    // Open read-only so read-only files can be inspected too, and dispose
    // the stream even when reading throws.
    using (FileStream file = File.OpenRead(filePath))
    {
        // Read may return fewer bytes than requested; keep going until the
        // buffer is full or the end of the file is reached.
        int read;
        while (bytesRead < buffer.Length
               && (read = file.Read(buffer, bytesRead, buffer.Length - bytesRead)) > 0)
        {
            bytesRead += read;
        }
    }

    // Every comparison is guarded by bytesRead so that the zero padding of a
    // short file is never mistaken for BOM bytes.
    if (bytesRead >= 3 && buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf)
    {
        metaInfo.HasBom = true;
        metaInfo.TextEncoding = Encoding.UTF8;
    }
    else if (bytesRead >= 4 && buffer[0] == 0xff && buffer[1] == 0xfe && buffer[2] == 0 && buffer[3] == 0)
    {
        // Must be tested before UTF-16 LE: the UTF-32 LE mark starts with
        // the same two bytes (FF FE).
        metaInfo.HasBom = true;
        metaInfo.TextEncoding = Encoding.UTF32; // utf-32le
    }
    else if (bytesRead >= 2 && buffer[0] == 0xff && buffer[1] == 0xfe)
    {
        metaInfo.HasBom = true;
        metaInfo.TextEncoding = Encoding.Unicode; // utf-16le
    }
    else if (bytesRead >= 2 && buffer[0] == 0xfe && buffer[1] == 0xff)
    {
        metaInfo.HasBom = true;
        metaInfo.TextEncoding = Encoding.BigEndianUnicode; // utf-16be
    }
    else if (bytesRead >= 4 && buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff)
    {
        // 00 00 FE FF is the big-endian UTF-32 mark; Encoding.UTF32 is
        // little-endian, so build the big-endian variant explicitly
        // (arguments: bigEndian, byteOrderMark).
        metaInfo.HasBom = true;
        metaInfo.TextEncoding = new UTF32Encoding(true, true); // utf-32be
    }
    else if (bytesRead >= 3 && buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76)
    {
        metaInfo.HasBom = true;
        metaInfo.TextEncoding = Encoding.UTF7;
    }

    return metaInfo;
}