/// <summary>
/// Tries to detect the character encoding of the given byte sequence.
/// The following encodings are detected: UTF-8, UTF-16 (both big endian and little endian),
/// windows-1251, Cp866, ISO-8859-5, KOI8-R.
/// If the bytes do not look like UTF-8 or UTF-16, an attempt to detect a single-byte Russian encoding is made.
/// If the bytes look like binary content (i.e. have no statistical characteristics suitable for text),
/// null is returned.
/// To detect a single-byte encoding, the byte sequence is assumed to represent text in Russian;
/// otherwise the result cannot be predicted.
/// </summary>
/// <param name="data">The byte sequence to analyze. Must not be null.</param>
/// <param name="lengthToAnalyze">
/// The number of leading bytes to analyze. Zero, a negative value, or a value larger than the
/// sequence length means the whole sequence is analyzed.
/// </param>
/// <returns>
/// The detected encoding; <see cref="Encoding.ASCII"/> when no UTF-8, UTF-16 or single-byte Russian
/// encoding is recognized and no control bytes are present; or null when the content looks binary.
/// </returns>
/// <exception cref="System.ArgumentNullException"><paramref name="data"/> is null.</exception>
public static Encoding TryDetectTextEncoding(byte[] data, int lengthToAnalyze = 0)
{
    if (data == null)
    {
        // Fail with a descriptive exception instead of a NullReferenceException on data.Length.
        throw new System.ArgumentNullException("data");
    }

    // Normalize the window: anything outside (0, data.Length] means "analyze everything".
    if ((lengthToAnalyze <= 0) || (lengthToAnalyze > data.Length))
    {
        lengthToAnalyze = data.Length;
    }

    // 1. UTF-8: structurally valid UTF-8 that contains ASCII control characters is treated as binary.
    bool hasAsciiControlCharacters;
    if (IsUTF8EncodingImpl(data, lengthToAnalyze, out hasAsciiControlCharacters))
    {
        return hasAsciiControlCharacters ? null : Encoding.UTF8;
    }

    // 2. UTF-16: decided from byte statistics (and the BOM, when present).
    var byteFrequencies = new ByteFrequencies(data, lengthToAnalyze);
    var utf16Encoding = DetectUtf16Encoding(data, lengthToAnalyze, byteFrequencies);
    if (utf16Encoding != null)
    {
        return utf16Encoding;
    }

    // 3. Control bytes in content that is neither UTF-8 nor UTF-16 indicate binary data.
    if (HasControlBytes(byteFrequencies.Frequencies))
    {
        return null;
    }

    // 4. Single-byte Russian code pages; fall back to ASCII when none of them matches.
    var singleByteRussianEncoding = DetectSingleByteRussianEncoding(byteFrequencies.Frequencies, false);
    if (singleByteRussianEncoding != null)
    {
        return singleByteRussianEncoding;
    }

    return Encoding.ASCII;
}
/// <summary>
/// Decides whether the analyzed bytes look like UTF-16 text and, if so, which byte order is used.
/// The decision is based on how many bytes at even / odd positions fall into the first five
/// frequency slots (presumably byte values 0x00-0x04, i.e. the high bytes of common UTF-16
/// code units - confirm against <c>ByteFrequencies</c>), combined with a byte order mark check.
/// </summary>
/// <param name="data">The byte sequence being analyzed; its first two bytes are inspected for a BOM.</param>
/// <param name="lengthToAnalyze">
/// The number of bytes to analyze; only used to build <paramref name="byteFrequencies"/> when it is null.
/// </param>
/// <param name="byteFrequencies">
/// Precomputed byte statistics for <paramref name="data"/>, or null to compute them here.
/// </param>
/// <returns>
/// The little endian ("utf-16") or big endian ("utf-16BE") encoding, or null when the bytes
/// do not look like UTF-16.
/// </returns>
private static Encoding DetectUtf16Encoding(byte[] data, int lengthToAnalyze, [CanBeNull] ByteFrequencies byteFrequencies)
{
    byteFrequencies = byteFrequencies ?? new ByteFrequencies(data, lengthToAnalyze);

    // A UTF-16 code unit needs one byte at an even and one at an odd position. Without this guard
    // an empty position class satisfies "0 >= 0 * 0.9", so empty or one-byte input would be
    // reported as UTF-16.
    if ((byteFrequencies.EvenPosCount <= 0) || (byteFrequencies.OddPosCount <= 0))
    {
        return null;
    }

    var utf16BeHighBytesCount = byteFrequencies.EvenPosFrequencies.Take(5).Sum();
    var utf16LeHighBytesCount = byteFrequencies.OddPosFrequencies.Take(5).Sum();

    // A byte order is a candidate when at least 90% of the bytes on its "high byte" side
    // fall into the first five frequency slots.
    var utf16BeDetected = (utf16BeHighBytesCount >= byteFrequencies.EvenPosCount * 0.9);
    var utf16LeDetected = (utf16LeHighBytesCount >= byteFrequencies.OddPosCount * 0.9);

    var utf16BeBomDetected = false;
    var utf16LeBomDetected = false;
    if (byteFrequencies.Count > 2)
    {
        // Unicode little endian BOM: FF FE
        utf16LeBomDetected = data[0] == 0xFF && data[1] == 0xFE;
        // Unicode big endian BOM: FE FF
        utf16BeBomDetected = data[1] == 0xFF && data[0] == 0xFE;
    }

    // A BOM for one byte order vetoes the opposite byte order found by frequency analysis.
    utf16BeDetected &= !utf16LeBomDetected;
    utf16LeDetected &= !utf16BeBomDetected;

    if (utf16BeDetected && utf16LeDetected)
    {
        // Frequency analysis allows both byte orders.
        // Prefer the one with more matching high bytes; a tie goes to little endian.
        if (utf16BeHighBytesCount > utf16LeHighBytesCount)
        {
            utf16LeDetected = false;
        }
        else
        {
            utf16BeDetected = false;
        }
    }

    if (utf16LeDetected)
    {
        // Encoding utf-16, codepage 1200
        return Encoding.GetEncoding("utf-16");
    }

    if (utf16BeDetected)
    {
        // Encoding utf-16BE, codepage 1201
        return Encoding.GetEncoding("utf-16BE");
    }

    return null;
}