This class provides a simple finite-state automaton that scans the file looking for (1) valid UTF-8 byte patterns and (2) bytes >= 0x80 that are not part of a UTF-8 sequence. The method then guesses whether the file is UTF-8 or some local machine-default encoding. This works well for the various Latin encodings.
        /// <summary>
        /// Picks a text encoding for <paramref name="stream"/> and returns a
        /// block-reading delegate over it.
        /// </summary>
        /// <param name="stream">The input stream to read from.</param>
        /// <param name="fallbackCodePage">
        /// Used only when no byte-order mark is found: -1 selects the raw byte
        /// reader, -2 asks for the encoding to be guessed from the content,
        /// and any other value is passed to <c>Encoding.GetEncoding</c>.
        /// </param>
        /// <returns>
        /// The result of <c>Raw(stream)</c> for the raw case; otherwise the
        /// <c>Read</c> method of a <c>StreamReader</c> created over the stream
        /// with the selected encoding.
        /// </returns>
        public static BlockReader Get(Stream stream, int fallbackCodePage)
        {
            // Always probe for a BOM first; a non-zero result is the code page it denotes.
            int bomCodePage = Preamble(stream);
            bool hasBom = bomCodePage != 0;

            // The raw-bytes fallback only applies when there is no BOM.
            if (!hasBom && fallbackCodePage == -1)
                return Raw(stream);

            Encoding chosen;
            if (hasBom)
                chosen = Encoding.GetEncoding(bomCodePage);
            else if (fallbackCodePage == -2)
            {
                // "Guess" mode: inspect the content, then rewind to the start of
                // the stream so the reader created below begins at offset zero.
                int detected = new Guesser(stream).GuessCodePage();
                stream.Seek(0, SeekOrigin.Begin);
                switch (detected)
                {
                    case -1:        // 7-bit file
                        chosen = Encoding.ASCII;
                        break;
                    case 65001:     // UTF-8 code page
                        chosen = Encoding.UTF8;
                        break;
                    default:        // fall back to the machine default
                        chosen = Encoding.Default;
                        break;
                }
            }
            else
                chosen = Encoding.GetEncoding(fallbackCodePage);

            StreamReader textReader = new StreamReader(stream, chosen);
            return textReader.Read;
        }
Example 2
0
        /// <summary>
        /// Chooses how to decode <paramref name="stream"/> and hands back a
        /// block reader for it. A byte-order mark, when present, takes
        /// precedence over <paramref name="fallbackCodePage"/>.
        /// </summary>
        /// <param name="stream">The stream whose contents will be read.</param>
        /// <param name="fallbackCodePage">
        /// Selector consulted when no byte-order mark is found: -1 means read
        /// raw bytes, -2 means guess the encoding, and anything else is
        /// treated as a code page for <c>Encoding.GetEncoding</c>.
        /// </param>
        /// <returns>
        /// <c>Raw(stream)</c> for the raw selector; otherwise a delegate bound
        /// to the <c>Read</c> method of a new <c>StreamReader</c>.
        /// </returns>
        public static BlockReader Get(Stream stream, int fallbackCodePage)
        {
            const int rawBytes = -1;        // fallback selector: no decoding
            const int guessEncoding = -2;   // fallback selector: inspect the content
            const int utf8CodePage = 65001;

            Encoding selected;
            int bomCodePage = Preamble(stream);

            if (bomCodePage != 0)
            {
                // A valid BOM was found; it decides the encoding.
                selected = Encoding.GetEncoding(bomCodePage);
            }
            else
            {
                switch (fallbackCodePage)
                {
                    case rawBytes:
                    {
                        return Raw(stream);
                    }
                    case guessEncoding:
                    {
                        int sniffed = new Guesser(stream).GuessCodePage();
                        // Rewind so the StreamReader below starts from the beginning.
                        stream.Seek(0, SeekOrigin.Begin);
                        // -1 ==> 7-bit file; 65001 ==> UTF-8; otherwise machine default.
                        selected = sniffed == -1 ? Encoding.ASCII
                                 : sniffed == utf8CodePage ? Encoding.UTF8
                                 : Encoding.Default;
                        break;
                    }
                    default:
                    {
                        selected = Encoding.GetEncoding(fallbackCodePage);
                        break;
                    }
                }
            }

            StreamReader decodingReader = new StreamReader(stream, selected);
            return decodingReader.Read;
        }