示例#1
0
        /// <summary>
        /// Tries to detect the character encoding of the given byte sequence.
        /// The following encodings are detected: UTF-8, UTF-16 (both big endian and little endian), windows-1251, Cp866, ISO-8859-5, KOI8-R.
        /// If the bytes do not look like UTF-8 or UTF-16, an attempt to detect a single-byte Russian encoding is performed.
        /// If the bytes look like binary content (i.e. they have no statistical characteristics suitable for text), null is returned.
        /// Single-byte detection assumes that the byte sequence represents text in Russian;
        /// for any other language the result cannot be predicted.
        /// </summary>
        /// <param name="data">The byte sequence to analyze. Must not be null.</param>
        /// <param name="lengthToAnalyze">
        /// The number of leading bytes to analyze. Zero, a negative value, or a value greater than
        /// the length of <paramref name="data"/> means the whole sequence is analyzed.
        /// </param>
        /// <returns>
        /// <see cref="Encoding.UTF8"/> when the UTF-8 check succeeds without reporting ASCII control characters;
        /// the UTF-16 encoding found by the UTF-16 check;
        /// the single-byte Russian encoding found by the single-byte check;
        /// <see cref="Encoding.ASCII"/> when none of the checks above produced an encoding;
        /// or null when the UTF-8 check reports ASCII control characters or the byte frequencies contain control bytes.
        /// </returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="data"/> is null.</exception>
        public static Encoding TryDetectTextEncoding(byte[] data, int lengthToAnalyze = 0)
        {
            // Fail with a descriptive exception instead of a NullReferenceException on data.Length.
            if (data == null)
            {
                throw new System.ArgumentNullException("data");
            }
            // Non-positive or too large lengths fall back to the whole array so that
            // the helpers below never receive a length outside [0, data.Length].
            if ((lengthToAnalyze <= 0) || (lengthToAnalyze > data.Length))
            {
                lengthToAnalyze = data.Length;
            }
            bool hasAsciiControlCharacters;

            // UTF-8 is checked first; a positive result is final, but control characters veto it.
            if (IsUTF8EncodingImpl(data, lengthToAnalyze, out hasAsciiControlCharacters))
            {
                return(hasAsciiControlCharacters ? null : Encoding.UTF8);
            }
            // The frequency table is built once and shared by the UTF-16 and single-byte checks.
            var byteFrequencies = new ByteFrequencies(data, lengthToAnalyze);
            var utf16Encoding   = DetectUtf16Encoding(data, lengthToAnalyze, byteFrequencies);

            if (utf16Encoding != null)
            {
                return(utf16Encoding);
            }
            // The control-byte check runs only after UTF-16 has been ruled out.
            if (HasControlBytes(byteFrequencies.Frequencies))
            {
                return(null);
            }
            var singleByteRussianEncoding = DetectSingleByteRussianEncoding(byteFrequencies.Frequencies, false);

            if (singleByteRussianEncoding != null)
            {
                return(singleByteRussianEncoding);
            }
            // Nothing more specific matched: report ASCII.
            return(Encoding.ASCII);
        }
示例#2
0
        /// <summary>
        /// Decides whether the analyzed bytes look like UTF-16 text and, if so, with which byte order.
        /// The decision combines per-position byte frequencies with a check of the first two bytes for a BOM.
        /// </summary>
        /// <param name="data">The byte sequence being analyzed; its first two bytes are inspected for a BOM.</param>
        /// <param name="lengthToAnalyze">The number of bytes to analyze; used only when the frequency table has to be built here.</param>
        /// <param name="byteFrequencies">A precomputed frequency table, or null to have one built from <paramref name="data"/>.</param>
        /// <returns>The encoding named "utf-16", the encoding named "utf-16BE", or null when neither byte order is detected.</returns>
        private static Encoding DetectUtf16Encoding(byte[] data, int lengthToAnalyze, [CanBeNull] ByteFrequencies byteFrequencies)
        {
            var stats = byteFrequencies ?? new ByteFrequencies(data, lengthToAnalyze);

            // Sum of the first five entries of each per-position table.
            // NOTE(review): presumably the tables are indexed by byte value, so these are the
            // counts of bytes 0x00-0x04 at even / odd offsets - confirm against ByteFrequencies.
            var highBytesAtEven = stats.EvenPosFrequencies.Take(5).Sum();
            var highBytesAtOdd  = stats.OddPosFrequencies.Take(5).Sum();

            // A byte order is a candidate when at least 90% of its high-byte positions hold such values.
            var looksBigEndian    = highBytesAtEven >= stats.EvenPosCount * 0.9;
            var looksLittleEndian = highBytesAtOdd >= stats.OddPosCount * 0.9;

            if (stats.Count > 2)
            {
                var first  = data[0];
                var second = data[1];

                // The presence of a BOM overrides the result of the frequency analysis:
                // a BOM of one byte order rules out the opposite byte order.
                if (first == 0xFF && second == 0xFE)
                {
                    // Little endian BOM
                    looksBigEndian = false;
                }
                if (second == 0xFF && first == 0xFE)
                {
                    // Big endian BOM
                    looksLittleEndian = false;
                }
            }

            // When the frequency analysis allows both byte orders, big endian wins only if it has
            // strictly more matching high bytes; on a tie little endian is preferred.
            var chooseBigEndian = looksBigEndian && (!looksLittleEndian || highBytesAtEven > highBytesAtOdd);

            if (chooseBigEndian)
            {
                // Encoding utf-16BE, codepage 1201
                return Encoding.GetEncoding("utf-16BE");
            }

            if (looksLittleEndian)
            {
                // Encoding utf-16, codepage 1200
                return Encoding.GetEncoding("utf-16");
            }

            return null;
        }