/// <summary>
/// Chooses a UTF-8, UTF-16 or UTF-32 reader from the leading byte
/// <paramref name="c1"/> plus up to three further bytes taken from
/// <c>this.stream</c>, stores the choice in <c>this.reader</c> and yields
/// the first character of the input.
/// </summary>
/// <param name="c1">The leading byte value; any further bytes needed for
/// detection are read from <c>this.stream</c>.</param>
/// <returns>The first character: the result of the selected reader's
/// <c>ReadChar</c>, a byte already consumed during detection, 0xfffd when
/// a lone FF/FE is seen and <c>this.errorThrow</c> is false, or -2 when no
/// reader was selected (the original comment describes this as "use
/// default of UTF-8").</returns>
/// <exception cref="InvalidOperationException">The leading byte is FF or
/// FE, it is not followed by the matching byte-order-mark byte, and
/// <c>this.errorThrow</c> is true.</exception>
private int DetectUtf8Or16Or32(int c1)
        {
            int second, third, fourth;

            // Case 1: FF or FE, the possible start of a byte-order mark.
            //   FF FE 00 00 --> UTF-32LE
            //   FF FE ..    --> UTF-16LE
            //   FE FF       --> UTF-16BE
            if (c1 == 0xfe || c1 == 0xff)
            {
                second = this.stream.ReadByte();
                bool isBigEndian = c1 == 0xfe;
                int bomPartner;
                if (isBigEndian)
                {
                    bomPartner = 0xff;
                }
                else
                {
                    bomPartner = 0xfe;
                }

                if (second != bomPartner)
                {
                    // Not a byte-order mark: the input is taken as UTF-8,
                    // in which the FF/FE byte itself is invalid.
                    if (this.errorThrow)
                    {
                        throw new InvalidOperationException("Invalid Unicode stream");
                    }

                    var lenientUtf8 = new Utf8Reader(this.stream, this.errorThrow);
                    lenientUtf8.Unget(second);
                    this.reader = lenientUtf8;
                    return 0xfffd;
                }

                third = this.stream.ReadByte();
                fourth = this.stream.ReadByte();
                if (isBigEndian || third != 0 || fourth != 0)
                {
                    // A 16-bit BOM: the two bytes after it are content and
                    // are handed back to the UTF-16 reader.
                    var utf16AfterBom = new Utf16Reader(
                        this.stream,
                        isBigEndian,
                        this.errorThrow);
                    utf16AfterBom.Unget(third, fourth);
                    this.reader = utf16AfterBom;
                    return utf16AfterBom.ReadChar();
                }

                // FF FE 00 00
                this.reader = new Utf32Reader(this.stream, false, this.errorThrow);
                return this.reader.ReadChar();
            }

            // Case 2: leading zero byte while mode is 4.
            //   00 00 00 NZA --> UTF-32BE
            //   00 00 FE FF  --> UTF-32BE
            //   anything else is handled as UTF-8
            if (c1 == 0 && this.mode == 4)
            {
                second = this.stream.ReadByte();
                third = this.stream.ReadByte();
                fourth = this.stream.ReadByte();

                bool bomTail = third == 0xfe && fourth == 0xff;
                bool asciiTail = third == 0 && fourth >= 0x01 && fourth <= 0x7f;
                if (second != 0 || !(bomTail || asciiTail))
                {
                    var utf8AfterZero = new Utf8Reader(this.stream, this.errorThrow);
                    utf8AfterZero.UngetThree(second, third, fourth);
                    this.reader = utf8AfterZero;
                    return c1;
                }

                this.reader = new Utf32Reader(this.stream, true, this.errorThrow);
                if (third == 0)
                {
                    // The four bytes read so far already form the first character.
                    return fourth;
                }

                // The four bytes were a BOM; the first character comes next.
                return this.reader.ReadChar();
            }

            // Every remaining heuristic applies to mode 2 only.
            if (this.mode != 2)
            {
                return -2;
            }

            // Case 3: mode 2, leading nonzero ASCII byte.
            if (c1 >= 0x01 && c1 <= 0x7f)
            {
                second = this.stream.ReadByte();
                if (second != 0)
                {
                    // NZA NZ, so UTF-8
                    var utf8AfterAscii = new Utf8Reader(this.stream, this.errorThrow);
                    utf8AfterAscii.Unget(second);
                    this.reader = utf8AfterAscii;
                    return c1;
                }

                // NZA 00, so UTF-16LE or UTF-32LE
                third = this.stream.ReadByte();
                fourth = this.stream.ReadByte();
                if (third == 0 && fourth == 0)
                {
                    this.reader = new Utf32Reader(
                        this.stream,
                        false,
                        this.errorThrow);
                    return c1;
                }

                var utf16LittleEndian = new Utf16Reader(
                    this.stream,
                    false,
                    this.errorThrow);
                utf16LittleEndian.Unget(third, fourth);
                this.reader = utf16LittleEndian;
                return c1;
            }

            // Case 4: mode 2, leading zero byte.
            if (c1 == 0)
            {
                second = this.stream.ReadByte();
                if (second >= 0x01 && second <= 0x7f)
                {
                    // 00 NZA, so UTF-16BE
                    this.reader = new Utf16Reader(this.stream, true, this.errorThrow);
                    return second;
                }

                if (second != 0)
                {
                    // 00 followed by a non-ASCII value, so UTF-8
                    var utf8AfterZero = new Utf8Reader(this.stream, this.errorThrow);
                    utf8AfterZero.Unget(second);
                    this.reader = utf8AfterZero;
                    return c1;
                }

                // 00 00, so possibly UTF-32BE
                third = this.stream.ReadByte();
                fourth = this.stream.ReadByte();
                if (third == 0 && fourth >= 0x01 && fourth <= 0x7f)
                {
                    // 00 00 00 NZA
                    this.reader = new Utf32Reader(this.stream, true, this.errorThrow);
                    return fourth;
                }

                if (third == 0xfe && fourth == 0xff)
                {
                    // 00 00 FE FF
                    this.reader = new Utf32Reader(this.stream, true, this.errorThrow);
                    return this.reader.ReadChar();
                }

                // 00 00 followed by anything else
                var utf8Fallback = new Utf8Reader(this.stream, this.errorThrow);
                utf8Fallback.UngetThree(second, third, fourth);
                this.reader = utf8Fallback;
                return c1;
            }

            // No reader selected; use default of UTF-8
            return -2;
        }
Example #2
0
 /// <summary>
 /// Decides between UTF-8, UTF-16 and UTF-32 by looking at
 /// <paramref name="c1"/> and, where necessary, up to three more bytes
 /// read from <c>this.stream</c>. On a decision the matching reader is
 /// stored in <c>this.reader</c> and the first character is returned.
 /// </summary>
 /// <param name="c1">The leading byte value of the input.</param>
 /// <returns>The first character (from the chosen reader's
 /// <c>ReadChar</c> or from a byte consumed during detection), 0xfffd for
 /// an FF/FE byte that does not start a byte-order mark when
 /// <c>this.errorThrow</c> is false, or -2 when no reader was
 /// chosen.</returns>
 /// <exception cref="InvalidOperationException">An FF/FE byte does not
 /// start a byte-order mark and <c>this.errorThrow</c> is true.</exception>
 private int DetectUtf8Or16Or32(int c1) {
   int byte2, byte3, byte4;

   // Possible byte-order mark:
   //   FF FE 0 0 --> UTF-32LE
   //   FF FE ... --> UTF-16LE
   //   FE FF     --> UTF-16BE
   if (c1 == 0xfe || c1 == 0xff) {
     byte2 = this.stream.ReadByte();
     bool highByteFirst = c1 == 0xfe;
     int expectedSecond;
     if (highByteFirst) {
       expectedSecond = 0xff;
     } else {
       expectedSecond = 0xfe;
     }

     if (byte2 != expectedSecond) {
       // Assume UTF-8 here, so the 0xff or 0xfe is invalid
       if (this.errorThrow) {
         throw new InvalidOperationException("Invalid Unicode stream");
       }
       var replacementUtf8 = new Utf8Reader(this.stream, this.errorThrow);
       replacementUtf8.Unget(byte2);
       this.reader = replacementUtf8;
       return 0xfffd;
     }

     byte3 = this.stream.ReadByte();
     byte4 = this.stream.ReadByte();
     if (highByteFirst || byte3 != 0 || byte4 != 0) {
       // 16-bit BOM; the two bytes just read are given back to the reader
       var bomUtf16 = new Utf16Reader(
         this.stream,
         highByteFirst,
         this.errorThrow);
       bomUtf16.Unget(byte3, byte4);
       this.reader = bomUtf16;
       return bomUtf16.ReadChar();
     }

     // FF FE 0 0
     this.reader = new Utf32Reader(this.stream, false, this.errorThrow);
     return this.reader.ReadChar();
   }

   // Leading zero with mode 4:
   //   0 0 0 NZA  --> UTF-32BE
   //   0 0 FE FF  --> UTF-32BE
   //   otherwise  --> UTF-8
   if (c1 == 0 && this.mode == 4) {
     byte2 = this.stream.ReadByte();
     byte3 = this.stream.ReadByte();
     byte4 = this.stream.ReadByte();

     bool endsWithBom = byte3 == 0xfe && byte4 == 0xff;
     bool endsWithAscii = byte3 == 0 && byte4 >= 0x01 && byte4 <= 0x7f;
     if (byte2 != 0 || !(endsWithBom || endsWithAscii)) {
       var plainUtf8 = new Utf8Reader(this.stream, this.errorThrow);
       plainUtf8.UngetThree(byte2, byte3, byte4);
       this.reader = plainUtf8;
       return c1;
     }

     this.reader = new Utf32Reader(this.stream, true, this.errorThrow);
     if (byte3 == 0) {
       // The bytes consumed so far are the first character itself
       return byte4;
     }
     // The bytes consumed were a BOM; read the character after it
     return this.reader.ReadChar();
   }

   // The heuristics below are only used in mode 2
   if (this.mode != 2) {
     return -2;
   }

   if (c1 >= 0x01 && c1 <= 0x7f) {
     // Nonzero ASCII character
     byte2 = this.stream.ReadByte();
     if (byte2 != 0) {
       // NZA NZ, so UTF-8
       var asciiUtf8 = new Utf8Reader(this.stream, this.errorThrow);
       asciiUtf8.Unget(byte2);
       this.reader = asciiUtf8;
       return c1;
     }

     // NZA 0, so UTF-16LE or UTF-32LE
     byte3 = this.stream.ReadByte();
     byte4 = this.stream.ReadByte();
     if (byte3 == 0 && byte4 == 0) {
       this.reader = new Utf32Reader(
         this.stream,
         false,
         this.errorThrow);
       return c1;
     }

     var lowByteFirstUtf16 = new Utf16Reader(
       this.stream,
       false,
       this.errorThrow);
     lowByteFirstUtf16.Unget(byte3, byte4);
     this.reader = lowByteFirstUtf16;
     return c1;
   }

   if (c1 == 0) {
     // Zero
     byte2 = this.stream.ReadByte();
     if (byte2 >= 0x01 && byte2 <= 0x7f) {
       // 0 NZA, so UTF-16BE
       this.reader = new Utf16Reader(this.stream, true, this.errorThrow);
       return byte2;
     }

     if (byte2 != 0) {
       // 0 NonAscii, so UTF-8
       var zeroLedUtf8 = new Utf8Reader(this.stream, this.errorThrow);
       zeroLedUtf8.Unget(byte2);
       this.reader = zeroLedUtf8;
       return c1;
     }

     // 0 0, so maybe UTF-32BE
     byte3 = this.stream.ReadByte();
     byte4 = this.stream.ReadByte();
     if (byte3 == 0 && byte4 >= 0x01 && byte4 <= 0x7f) {
       // 0 0 0 NZA
       this.reader = new Utf32Reader(this.stream, true, this.errorThrow);
       return byte4;
     }

     if (byte3 == 0xfe && byte4 == 0xff) {
       // 0 0 FE FF
       this.reader = new Utf32Reader(this.stream, true, this.errorThrow);
       return this.reader.ReadChar();
     }

     // 0 0 ...
     var defaultUtf8 = new Utf8Reader(this.stream, this.errorThrow);
     defaultUtf8.UngetThree(byte2, byte3, byte4);
     this.reader = defaultUtf8;
     return c1;
   }

   // Use default of UTF-8
   return -2;
 }