/// <summary>
/// Feeds the UTF-8 test resource through <c>Utf8CharScanner</c> one character at a
/// time and compares every character of at most two UTF-8 bytes (value and byte
/// span) against what the framework's <see cref="Encoding.UTF8"/> produces.
/// </summary>
public void TestUTF8()
{
    // Decode and re-encode the resource with the framework codec so the stream
    // handed to the scanner holds exactly the bytes of the decoded string.
    var utf8Text = Encoding.UTF8.GetString(Properties.Resources.UTF_8_test);
    var data = Encoding.UTF8.GetBytes(utf8Text);

    var utf8TestData = new MemoryStream();
    utf8TestData.Write(data, 0, data.Length);
    utf8TestData.Flush();
    utf8TestData.Seek(0, SeekOrigin.Begin);

    var utf8 = new Utf8CharScanner(utf8TestData);

    Console.Out.WriteLine("utf len" + Encoding.UTF8.GetMaxByteCount(1));
    Console.Out.WriteLine(utf8Text.Length);

    long offset = 0;
    for (int i = 0; i < utf8Text.Length; i++)
    {
        var expected = new CharLocation { Value = utf8Text[i] };
        var expectedBytes = Encoding.UTF8.GetBytes(new[] { expected.Value.Value });

        // Characters that encode to more than two bytes are consumed from the
        // scanner but not verified.
        // NOTE(review): offset is not advanced for these skipped characters, so the
        // expected byte spans below only line up with absolute stream positions
        // while no such character has been seen - confirm this is intended.
        if (expectedBytes.Length > 2)
        {
            utf8.Read();
            continue;
        }

        expected.ByteSpan.Start = offset;
        expected.ByteSpan.End = offset + expectedBytes.Length;
        offset += expectedBytes.Length;

        var actual = utf8.Read();

        Assert.AreEqual(expected.Value, actual.Value, "index: " + i);
        Assert.AreEqual(expected.ByteSpan.Start, actual.ByteSpan.Start, "index: " + i);

        // Up to 60 characters of context around the failing index. The length is
        // clamped so Substring cannot run past the end of the text.
        var windowStart = Math.Max(0, i - 30);
        var windowLength = Math.Min(60, utf8Text.Length - windowStart);
        string window = utf8Text.Substring(windowStart, windowLength);

        Assert.AreEqual(
            expected.ByteSpan.End,
            actual.ByteSpan.End,
            string.Format("index: {0}, expbyte: {1}, value {2}", i, expectedBytes.Length, window));
    }

    Console.Out.WriteLine(utf8Text);
}
/// <summary>
/// Discards the decoder's carry-over state: the pending character held in
/// <c>_surrogate</c> is replaced with <c>CharLocation.Empty</c> and
/// <c>_bufferOffset</c> is set back to zero. The underlying stream is not touched.
/// </summary>
public void Reset()
{
    _surrogate = CharLocation.Empty;
    _bufferOffset = 0;
}
/// <summary>
/// Reads the next character, decoding it from UTF-8, and reports the byte span
/// it was decoded from.
/// </summary>
/// <remarks>
/// A character pending in <c>_surrogate</c> is returned first, without reading any
/// bytes. Otherwise the lead byte comes from the stream, or from <c>_buffer[0]</c>
/// when <c>_bufferOffset</c> is non-zero. A lead byte that starts no accepted
/// sequence is reported through <c>ThrowInvalidByteException</c>.
/// </remarks>
/// <returns>
/// The decoded character with its byte span, or <c>CharLocation.Empty</c> when the
/// stream's <c>ReadByte</c> returns -1 (end of stream).
/// </returns>
public CharLocation Read()
{
    // Hand out a previously stored character before decoding anything new.
    var result = _surrogate;
    if (_surrogate.Value.HasValue)
    {
        _surrogate.Value = null;
        return result;
    }

    // NOTE: the index into the buffer is used when bytes remain from the
    //       last block read.
    var index = 0;

    // Fetch the lead byte.
    int leadByte;
    if (index == _bufferOffset)
    {
        leadByte = _stream.ReadByte();
    }
    else
    {
        leadByte = _buffer[index++] & 0x00FF;
    }

    if (leadByte == -1)
    {
        // -1 == EOF
        return CharLocation.Empty;
    }

    if (leadByte < 0x80)
    {
        // UTF-8:   [0xxx xxxx]
        // Unicode: [0000 0000] [0xxx xxxx]
        return new CharLocation
        {
            Value = (char)leadByte,
            ByteSpan = new Span
            {
                Start = GetCurrentPosition(index),
                End = GetCurrentPosition(index) + 1
            }
        };
    }

    var isTwoByteLead = (leadByte & 0xE0) == 0xC0 && (leadByte & 0x1E) != 0;
    if (isTwoByteLead)
    {
        // UTF-8:   [110y yyyy] [10xx xxxx]
        // Unicode: [0000 0yyy] [yyxx xxxx]
        var spanStart = GetCurrentPosition(index);
        var decoded = ReadTwoByteChar(index, leadByte);
        return new CharLocation
        {
            Value = decoded,
            ByteSpan = { Start = spanStart, End = GetCurrentPosition(index) + 1 }
        };
    }

    if ((leadByte & 0xF0) == 0xE0)
    {
        // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]
        // Unicode: [zzzz yyyy] [yyxx xxxx]
        var spanStart = GetCurrentPosition(index);
        var decoded = ReadThreeByteChar(index, leadByte);
        return new CharLocation
        {
            Value = decoded,
            ByteSpan = { Start = spanStart, End = GetCurrentPosition(index) + 1 }
        };
    }

    if ((leadByte & 0xF8) == 0xF0)
    {
        // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
        // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
        //          [1101 11yy] [yyxx xxxx] (low surrogate)
        //          * uuuuu = wwww + 1
        var spanStart = GetCurrentPosition(index);
        var decoded = ReadFourByteChar(index, leadByte);
        return new CharLocation
        {
            Value = decoded,
            ByteSpan = { Start = spanStart, End = GetCurrentPosition(index) + 1 }
        };
    }

    // No accepted sequence starts with this byte.
    ThrowInvalidByteException(1, 1, leadByte);
    return result;
}
/// <summary>
/// Reads a block of bytes from the underlying stream and decodes it as UTF-8
/// into a portion of <paramref name="buffer"/>.
/// </summary>
/// <remarks>
/// <para>
/// At most <c>_buffer.Length</c> bytes are requested per call. A low surrogate
/// saved by a previous call is emitted first and counts against
/// <paramref name="length"/>.
/// </para>
/// <para>
/// When a malformed or truncated sequence is found after at least one character
/// has been produced, the offending bytes are kept in <c>_buffer</c>
/// (<c>_bufferOffset</c> holds their count) and the characters decoded so far are
/// returned; the error is then raised by the next call through the
/// <c>Throw...Exception</c> helpers.
/// </para>
/// </remarks>
/// <param name="buffer">Destination buffer.</param>
/// <param name="offset">Offset in <paramref name="buffer"/> at which to start storing characters.</param>
/// <param name="length">Maximum number of characters to read.</param>
/// <returns>
/// The number of characters stored, or -1 if the end of the stream was reached
/// before any character could be produced.
/// </returns>
public int Read(char[] buffer, int offset, int length)
{
    var outputbufferIndex = offset;
    int count; // running number of characters this call will report
    int total; // number of valid bytes in _buffer that must be decoded

    if (_bufferOffset == 0)
    {
        // adjust length to read
        if (length > _buffer.Length)
        {
            length = _buffer.Length;
        }

        // handle surrogate
        if (_surrogate.Value.HasValue)
        {
            buffer[outputbufferIndex++] = _surrogate.Value.Value;
            _surrogate = CharLocation.Empty;
            length--;
        }

        // perform read operation
        total = _stream.Read(_buffer, 0, length);
        var pendingWritten = outputbufferIndex - offset;
        if (total <= 0)
        {
            // Stream.Read reports end of stream with 0, not -1. A surrogate that was
            // just emitted must still be reported to the caller.
            if (pendingWritten > 0)
            {
                return pendingWritten;
            }
            return length == 0 ? 0 : -1;
        }

        // The emitted surrogate counts as a character but is not a byte in _buffer,
        // so it must not extend the range of bytes decoded below.
        count = total + pendingWritten;
    }
    else
    {
        // skip read; last character was in error
        // NOTE: Having an offset value other than zero means that there was
        //       an error in the last character read. In this case, we have
        //       skipped the read so we don't consume any bytes past the
        //       error. By signalling the error on the next block read we
        //       allow the method to return the most valid characters that
        //       it can on the previous block read. -Ac
        count = _bufferOffset;
        total = count;
        _bufferOffset = 0;
    }

    // convert bytes to characters
    int inputBufferIndex;
    byte currentByte;

    // Fast path: copy the leading run of single-byte (ASCII) characters.
    // byte is unsigned in C#, so ASCII is detected by the high bit being clear.
    for (inputBufferIndex = 0; inputBufferIndex < total; inputBufferIndex++)
    {
        currentByte = _buffer[inputBufferIndex];
        if (currentByte < 0x80)
        {
            buffer[outputbufferIndex++] = (char)currentByte;
        }
        else
        {
            break;
        }
    }

    for (; inputBufferIndex < total; inputBufferIndex++)
    {
        currentByte = _buffer[inputBufferIndex];

        // UTF-8:   [0xxx xxxx]
        // Unicode: [0000 0000] [0xxx xxxx]
        if (currentByte < 0x80)
        {
            buffer[outputbufferIndex++] = (char)currentByte;
            continue;
        }

        // UTF-8:   [110y yyyy] [10xx xxxx]
        // Unicode: [0000 0yyy] [yyxx xxxx]
        var byteZero = currentByte & 0x0FF;
        if ((byteZero & 0xE0) == 0xC0 && (byteZero & 0x1E) != 0)
        {
            int byte1;
            if (++inputBufferIndex < total)
            {
                byte1 = _buffer[inputBufferIndex] & 0x00FF;
            }
            else
            {
                byte1 = _stream.ReadByte();
                if (byte1 == -1)
                {
                    if (outputbufferIndex > offset)
                    {
                        _buffer[0] = (byte)byteZero;
                        _bufferOffset = 1;
                        return outputbufferIndex - offset;
                    }
                    ThrowExpectedByteException(2, 2);
                }
                count++;
            }
            if ((byte1 & 0xC0) != 0x80)
            {
                if (outputbufferIndex > offset)
                {
                    _buffer[0] = (byte)byteZero;
                    _buffer[1] = (byte)byte1;
                    _bufferOffset = 2;
                    return outputbufferIndex - offset;
                }
                ThrowInvalidByteException(2, 2, byte1);
            }
            var c = ((byteZero << 6) & 0x07C0) | (byte1 & 0x003F);
            buffer[outputbufferIndex++] = (char)c;
            count -= 1; // two bytes produced one character
            continue;
        }

        // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]
        // Unicode: [zzzz yyyy] [yyxx xxxx]
        if ((byteZero & 0xF0) == 0xE0)
        {
            int byte1;
            if (++inputBufferIndex < total)
            {
                byte1 = _buffer[inputBufferIndex] & 0x00FF;
            }
            else
            {
                byte1 = _stream.ReadByte();
                if (byte1 == -1)
                {
                    if (outputbufferIndex > offset)
                    {
                        _buffer[0] = (byte)byteZero;
                        _bufferOffset = 1;
                        return outputbufferIndex - offset;
                    }
                    ThrowExpectedByteException(2, 3);
                }
                count++;
            }
            // Besides the continuation marker, reject encoded surrogates (ED A0..BF)
            // and overlong forms (E0 80..9F).
            if ((byte1 & 0xC0) != 0x80
                || (byteZero == 0xED && byte1 >= 0xA0)
                || ((byteZero & 0x0F) == 0 && (byte1 & 0x20) == 0))
            {
                if (outputbufferIndex > offset)
                {
                    _buffer[0] = (byte)byteZero;
                    _buffer[1] = (byte)byte1;
                    _bufferOffset = 2;
                    return outputbufferIndex - offset;
                }
                ThrowInvalidByteException(2, 3, byte1);
            }
            int byte2;
            if (++inputBufferIndex < total)
            {
                byte2 = _buffer[inputBufferIndex] & 0x00FF;
            }
            else
            {
                byte2 = _stream.ReadByte();
                if (byte2 == -1)
                {
                    if (outputbufferIndex > offset)
                    {
                        _buffer[0] = (byte)byteZero;
                        _buffer[1] = (byte)byte1;
                        _bufferOffset = 2;
                        return outputbufferIndex - offset;
                    }
                    ThrowExpectedByteException(3, 3);
                }
                count++;
            }
            if ((byte2 & 0xC0) != 0x80)
            {
                if (outputbufferIndex > offset)
                {
                    _buffer[0] = (byte)byteZero;
                    _buffer[1] = (byte)byte1;
                    _buffer[2] = (byte)byte2;
                    _bufferOffset = 3;
                    return outputbufferIndex - offset;
                }
                ThrowInvalidByteException(3, 3, byte2);
            }
            var c = ((byteZero << 12) & 0xF000) | ((byte1 << 6) & 0x0FC0) | (byte2 & 0x003F);
            buffer[outputbufferIndex++] = (char)c;
            count -= 2; // three bytes produced one character
            continue;
        }

        // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
        // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
        //          [1101 11yy] [yyxx xxxx] (low surrogate)
        //          * uuuuu = wwww + 1
        if ((byteZero & 0xF8) == 0xF0)
        {
            int byte1;
            if (++inputBufferIndex < total)
            {
                byte1 = _buffer[inputBufferIndex] & 0x00FF;
            }
            else
            {
                byte1 = _stream.ReadByte();
                if (byte1 == -1)
                {
                    if (outputbufferIndex > offset)
                    {
                        _buffer[0] = (byte)byteZero;
                        _bufferOffset = 1;
                        return outputbufferIndex - offset;
                    }
                    ThrowExpectedByteException(2, 4);
                }
                count++;
            }
            if ((byte1 & 0xC0) != 0x80
                || ((byte1 & 0x30) == 0 && (byteZero & 0x07) == 0))
            {
                if (outputbufferIndex > offset)
                {
                    _buffer[0] = (byte)byteZero;
                    _buffer[1] = (byte)byte1;
                    _bufferOffset = 2;
                    return outputbufferIndex - offset;
                }
                ThrowInvalidByteException(2, 4, byte1);
            }
            int byte2;
            if (++inputBufferIndex < total)
            {
                byte2 = _buffer[inputBufferIndex] & 0x00FF;
            }
            else
            {
                byte2 = _stream.ReadByte();
                if (byte2 == -1)
                {
                    if (outputbufferIndex > offset)
                    {
                        _buffer[0] = (byte)byteZero;
                        _buffer[1] = (byte)byte1;
                        _bufferOffset = 2;
                        return outputbufferIndex - offset;
                    }
                    ThrowExpectedByteException(3, 4);
                }
                count++;
            }
            if ((byte2 & 0xC0) != 0x80)
            {
                if (outputbufferIndex > offset)
                {
                    _buffer[0] = (byte)byteZero;
                    _buffer[1] = (byte)byte1;
                    _buffer[2] = (byte)byte2;
                    _bufferOffset = 3;
                    return outputbufferIndex - offset;
                }
                ThrowInvalidByteException(3, 4, byte2);
            }
            int byte3;
            if (++inputBufferIndex < total)
            {
                byte3 = _buffer[inputBufferIndex] & 0x00FF;
            }
            else
            {
                byte3 = _stream.ReadByte();
                if (byte3 == -1)
                {
                    if (outputbufferIndex > offset)
                    {
                        _buffer[0] = (byte)byteZero;
                        _buffer[1] = (byte)byte1;
                        _buffer[2] = (byte)byte2;
                        _bufferOffset = 3;
                        return outputbufferIndex - offset;
                    }
                    ThrowExpectedByteException(4, 4);
                }
                count++;
            }
            if ((byte3 & 0xC0) != 0x80)
            {
                if (outputbufferIndex > offset)
                {
                    _buffer[0] = (byte)byteZero;
                    _buffer[1] = (byte)byte1;
                    _buffer[2] = (byte)byte2;
                    _buffer[3] = (byte)byte3;
                    _bufferOffset = 4;
                    return outputbufferIndex - offset;
                }
                // Report the byte that actually failed validation (the fourth one).
                ThrowInvalidByteException(4, 4, byte3);
            }

            // decode bytes into surrogate characters
            var uuuuu = ((byteZero << 2) & 0x001C) | ((byte1 >> 4) & 0x0003);
            if (uuuuu > 0x10)
            {
                ThrowInvalidSurrogateException(uuuuu);
            }
            var wwww = uuuuu - 1;
            var zzzz = byte1 & 0x000F;
            var yyyyyy = byte2 & 0x003F;
            var xxxxxx = byte3 & 0x003F;
            var highSurrogate = 0xD800 | ((wwww << 6) & 0x03C0) | (zzzz << 2) | (yyyyyy >> 4);
            var lowSurrogate = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx;

            // set characters
            buffer[outputbufferIndex++] = (char)highSurrogate;
            if ((count -= 2) <= length)
            {
                buffer[outputbufferIndex++] = (char)lowSurrogate;
            }
            else
            {
                // reached the end of the char buffer; save low surrogate for the next read
                _surrogate.Value = (char)lowSurrogate;
                --count;
            }
            continue;
        }

        // error: the byte cannot start any accepted sequence
        if (outputbufferIndex > offset)
        {
            _buffer[0] = (byte)byteZero;
            _bufferOffset = 1;
            return outputbufferIndex - offset;
        }
        ThrowInvalidByteException(1, 1, byteZero);
    }

    return count;
}