/// <summary>Chooses a UTF-8, UTF-16 or UTF-32 reader for the underlying
/// stream by inspecting the first byte and, where needed, up to three
/// further bytes. When a decision is made, the chosen reader is stored in
/// <c>this.reader</c> and any look-ahead bytes that belong to later
/// characters are pushed back into that reader.</summary>
/// <param name='c1'>The first byte, already read from the stream by the
/// caller.</param>
/// <returns>The first character of the text: a value obtained from the
/// new reader's <c>ReadChar</c> (after a byte-order mark), the ASCII
/// byte that identified the encoding, <paramref name='c1'/> itself when
/// falling back to UTF-8, or 0xfffd when a stray 0xFE/0xFF byte is
/// replaced. Returns -2 when no reader was selected here, in which case
/// the original comment says UTF-8 is the default.</returns>
/// <exception cref='System.InvalidOperationException'>The first byte is
/// 0xFE or 0xFF, it is not followed by the matching byte-order-mark
/// byte, and <c>this.errorThrow</c> is true.</exception>
private int DetectUtf8Or16Or32(int c1) {
  int second, third, fourth;
  Utf8Reader utf8;
  Utf16Reader utf16;

  bool isBomLead = c1 == 0xfe || c1 == 0xff;
  if (isBomLead) {
    // Possible byte-order mark:
    //   FF FE 0 0 --> UTF-32LE
    //   FF FE ... --> UTF-16LE
    //   FE FF     --> UTF-16BE
    second = this.stream.ReadByte();
    bool bigEndian = c1 == 0xfe;
    int expectedSecond = bigEndian ? 0xff : 0xfe;
    if (second != expectedSecond) {
      // Not a byte-order mark; under UTF-8 a 0xFE/0xFF byte is invalid.
      if (this.errorThrow) {
        throw new InvalidOperationException("Invalid Unicode stream");
      }
      utf8 = new Utf8Reader(this.stream, this.errorThrow);
      utf8.Unget(second);
      this.reader = utf8;
      return(0xfffd);
    }
    third = this.stream.ReadByte();
    fourth = this.stream.ReadByte();
    if (bigEndian || third != 0 || fourth != 0) {
      // UTF-16: the two look-ahead bytes are part of the text.
      utf16 = new Utf16Reader(this.stream, bigEndian, this.errorThrow);
      utf16.Unget(third, fourth);
      this.reader = utf16;
      return(utf16.ReadChar());
    }
    this.reader = new Utf32Reader(this.stream, false, this.errorThrow);
    return(this.reader.ReadChar());
  }

  if (c1 == 0 && this.mode == 4) {
    // Relevant patterns when the mode is 4:
    //   0 0 0 NZA --> UTF-32BE (NZA = nonzero ASCII)
    //   0 0 FE FF --> UTF-32BE with byte-order mark
    // Anything else is treated as UTF-8.
    second = this.stream.ReadByte();
    third = this.stream.ReadByte();
    fourth = this.stream.ReadByte();
    bool bomTail = third == 0xfe && fourth == 0xff;
    bool asciiTail = third == 0 && fourth >= 0x01 && fourth <= 0x7f;
    if (second == 0 && (bomTail || asciiTail)) {
      this.reader = new Utf32Reader(this.stream, true, this.errorThrow);
      if (third == 0) {
        // The four bytes were the first character itself.
        return(fourth);
      }
      return(this.reader.ReadChar());
    }
    utf8 = new Utf8Reader(this.stream, this.errorThrow);
    utf8.UngetThree(second, third, fourth);
    this.reader = utf8;
    return(c1);
  }

  if (this.mode != 2) {
    // Use default of UTF-8
    return(-2);
  }

  if (c1 >= 0x01 && c1 <= 0x7f) {
    // Nonzero ASCII character
    second = this.stream.ReadByte();
    if (second != 0) {
      // NZA NZ, so UTF-8
      utf8 = new Utf8Reader(this.stream, this.errorThrow);
      utf8.Unget(second);
      this.reader = utf8;
      return(c1);
    }
    // NZA 0, so UTF-16LE or UTF-32LE
    third = this.stream.ReadByte();
    fourth = this.stream.ReadByte();
    if (third == 0 && fourth == 0) {
      this.reader = new Utf32Reader(this.stream, false, this.errorThrow);
      return(c1);
    }
    utf16 = new Utf16Reader(this.stream, false, this.errorThrow);
    utf16.Unget(third, fourth);
    this.reader = utf16;
    return(c1);
  }

  if (c1 != 0) {
    // Use default of UTF-8
    return(-2);
  }

  // First byte is zero
  second = this.stream.ReadByte();
  if (second >= 0x01 && second <= 0x7f) {
    // 0 NZA, so UTF-16BE
    utf16 = new Utf16Reader(this.stream, true, this.errorThrow);
    this.reader = utf16;
    return(second);
  }
  if (second != 0) {
    // 0 NonAscii, so UTF-8
    utf8 = new Utf8Reader(this.stream, this.errorThrow);
    utf8.Unget(second);
    this.reader = utf8;
    return(c1);
  }

  // 0 0, so maybe UTF-32BE
  third = this.stream.ReadByte();
  fourth = this.stream.ReadByte();
  if (third == 0 && fourth >= 0x01 && fourth <= 0x7f) {
    // 0 0 0 NZA
    this.reader = new Utf32Reader(this.stream, true, this.errorThrow);
    return(fourth);
  }
  if (third == 0xfe && fourth == 0xff) {
    // 0 0 FE FF
    this.reader = new Utf32Reader(this.stream, true, this.errorThrow);
    return(this.reader.ReadChar());
  }
  // 0 0 ...
  utf8 = new Utf8Reader(this.stream, this.errorThrow);
  utf8.UngetThree(second, third, fourth);
  this.reader = utf8;
  return(c1);
}
/// <summary>Detects whether the stream holds UTF-8, UTF-16 or UTF-32
/// text, using a leading byte-order mark or, depending on
/// <c>this.mode</c>, the pattern of zero and nonzero-ASCII bytes at the
/// start. On a successful detection <c>this.reader</c> is replaced with
/// the matching reader, and bytes read ahead that are not part of the
/// returned character are handed back to it.</summary>
/// <param name='c1'>First byte of the stream, previously read by the
/// caller.</param>
/// <returns>The first character of the stream (either taken directly
/// from the sniffed bytes or read through the newly installed reader),
/// 0xfffd if an unpaired 0xFE/0xFF byte was replaced, or -2 if this
/// method installed no reader (the code comments name UTF-8 as the
/// default in that case).</returns>
/// <exception cref='System.InvalidOperationException'>An 0xFE or 0xFF
/// first byte is not completed to a byte-order mark while
/// <c>this.errorThrow</c> is set.</exception>
private int DetectUtf8Or16Or32(int c1) {
  int b2, b3, b4;
  if (c1 == 0xff || c1 == 0xfe) {
    // Start of a possible byte-order mark
    // FF FE 0 0 --> UTF-32LE
    // FF FE ... --> UTF-16LE
    // FE FF --> UTF-16BE
    b2 = this.stream.ReadByte();
    bool isBigEndian = c1 == 0xfe;
    int bomPartner = isBigEndian ? 0xff : 0xfe;
    if (b2 == bomPartner) {
      b3 = this.stream.ReadByte();
      b4 = this.stream.ReadByte();
      bool isUtf32LittleEndian = !isBigEndian && b3 == 0 && b4 == 0;
      if (isUtf32LittleEndian) {
        this.reader = new Utf32Reader(
          this.stream,
          false,
          this.errorThrow);
        return this.reader.ReadChar();
      }
      var bomUtf16 = new Utf16Reader(
        this.stream,
        isBigEndian,
        this.errorThrow);
      bomUtf16.Unget(b3, b4);
      this.reader = bomUtf16;
      return bomUtf16.ReadChar();
    }
    // Assume UTF-8 here, so the 0xff or 0xfe is invalid
    if (this.errorThrow) {
      throw new InvalidOperationException("Invalid Unicode stream");
    }
    var bomUtf8 = new Utf8Reader(this.stream, this.errorThrow);
    bomUtf8.Unget(b2);
    this.reader = bomUtf8;
    return 0xfffd;
  }

  bool leadIsZero = c1 == 0;
  if (leadIsZero && this.mode == 4) {
    // Here, the relevant cases are:
    // 0 0 0 NZA --> UTF-32BE (if mode is 4)
    // 0 0 FE FF --> UTF-32BE
    // Anything else is treated as UTF-8
    b2 = this.stream.ReadByte();
    b3 = this.stream.ReadByte();
    b4 = this.stream.ReadByte();
    bool mode4Bom = b3 == 0xfe && b4 == 0xff;
    bool mode4Ascii = b3 == 0 && b4 >= 0x01 && b4 <= 0x7f;
    if (b2 == 0 && (mode4Bom || mode4Ascii)) {
      this.reader = new Utf32Reader(this.stream, true, this.errorThrow);
      // With b3 == 0 the four bytes already form the first character;
      // otherwise they were a byte-order mark and the next character is read.
      return b3 == 0 ? b4 : this.reader.ReadChar();
    }
    var mode4Utf8 = new Utf8Reader(this.stream, this.errorThrow);
    mode4Utf8.UngetThree(b2, b3, b4);
    this.reader = mode4Utf8;
    return c1;
  }

  if (this.mode == 2) {
    bool leadIsNonzeroAscii = c1 >= 0x01 && c1 <= 0x7f;
    if (leadIsNonzeroAscii) {
      b2 = this.stream.ReadByte();
      if (b2 != 0) {
        // NZA NZ, so UTF-8
        var asciiUtf8 = new Utf8Reader(this.stream, this.errorThrow);
        asciiUtf8.Unget(b2);
        this.reader = asciiUtf8;
        return c1;
      }
      // NZA 0, so UTF-16LE or UTF-32LE
      b3 = this.stream.ReadByte();
      b4 = this.stream.ReadByte();
      if (b3 == 0 && b4 == 0) {
        this.reader = new Utf32Reader(
          this.stream,
          false,
          this.errorThrow);
        return c1;
      }
      var asciiUtf16 = new Utf16Reader(
        this.stream,
        false,
        this.errorThrow);
      asciiUtf16.Unget(b3, b4);
      this.reader = asciiUtf16;
      return c1;
    } else if (leadIsZero) {
      b2 = this.stream.ReadByte();
      if (b2 >= 0x01 && b2 <= 0x7f) {
        // 0 NZA, so UTF-16BE
        this.reader = new Utf16Reader(this.stream, true, this.errorThrow);
        return b2;
      }
      if (b2 == 0) {
        // 0 0, so maybe UTF-32BE
        b3 = this.stream.ReadByte();
        b4 = this.stream.ReadByte();
        bool zeroAscii = b3 == 0 && b4 >= 0x01 && b4 <= 0x7f;
        bool zeroBom = b3 == 0xfe && b4 == 0xff;
        if (zeroAscii || zeroBom) {
          // 0 0 0 NZA yields the character directly;
          // 0 0 FE FF is a byte-order mark, so read the next character.
          this.reader = new Utf32Reader(this.stream, true, this.errorThrow);
          return b3 == 0 ? b4 : this.reader.ReadChar();
        }
        // 0 0 ...
        var zerosUtf8 = new Utf8Reader(this.stream, this.errorThrow);
        zerosUtf8.UngetThree(b2, b3, b4);
        this.reader = zerosUtf8;
        return c1;
      }
      // 0 NonAscii, so UTF-8
      var zeroUtf8 = new Utf8Reader(this.stream, this.errorThrow);
      zeroUtf8.Unget(b2);
      this.reader = zeroUtf8;
      return c1;
    }
  }

  // Use default of UTF-8
  return -2;
}