/// <summary> /// A converter that processes a byte buffer containing a mix of UTF8 and Latin-1/Cp1252 chars. /// The result is a buffer where those chars have been converted to UTF-8; /// that means it contains only valid UTF-8 chars. /// <p> /// <em>Explanation of the processing:</em> First the encoding of the buffer is detected looking /// at the first four bytes (that works only if the buffer starts with an ASCII-char, /// like xmls '<'). UTF-16/32 flavours do not require further proccessing. /// <p> /// In the case, UTF-8 is detected, it assumes wrong UTF8 chars to be a sequence of /// Latin-1/Cp1252 encoded bytes and converts the chars to their corresponding UTF-8 byte /// sequence. /// <p> /// The 0x80..0x9F range is undefined in Latin-1, but is defined in Windows code /// page 1252. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are formally undefined /// by Windows 1252. These are in XML's RestrictedChar set, so we map them to a /// space. /// <p> /// The official Latin-1 characters in the range 0xA0..0xFF are converted into /// the Unicode Latin Supplement range U+00A0 - U+00FF. /// <p> /// <em>Example:</em> If an Euro-symbol (€) appears in the byte buffer (0xE2, 0x82, 0xAC), /// it will be left as is. But if only the first two bytes are appearing, /// followed by an ASCII char a (0xE2 - 0x82 - 0x41), it will be converted to /// 0xC3, 0xA2 (â) - 0xE2, 0x80, 0x9A (‚) - 0x41 (a). /// </summary> /// <param name="buffer"> a byte buffer contain </param> /// <returns> Returns a new buffer containing valid UTF-8 </returns> public static ByteBuffer Convert(ByteBuffer buffer) { if ("UTF-8".Equals(buffer.Encoding)) { // the buffer containing one UTF-8 char (up to 8 bytes) byte[] readAheadBuffer = new byte[8]; // the number of bytes read ahead. int readAhead = 0; // expected UTF8 bytesto come int expectedBytes = 0; // output buffer with estimated length ByteBuffer outp = new ByteBuffer(buffer.Length * 4 / 3); int state = STATE_START; for (int i = 0; i < buffer.Length; i++) { int b = buffer.ByteAt(i); switch (state) { default: goto case STATE_START; case STATE_START: if (b < 0x7F) { outp.Append((byte)b); } else if (b >= 0xC0) { // start of UTF8 sequence expectedBytes = -1; int test = b; for (; expectedBytes < 8 && (test & 0x80) == 0x80; test = test << 1) { expectedBytes++; } readAheadBuffer[readAhead++] = (byte)b; state = STATE_UTF8CHAR; } else // implicitly: b >= 0x80 && b < 0xC0 { // invalid UTF8 start char, assume to be Latin-1 byte[] utf8 = ConvertToUtf8((byte)b); outp.Append(utf8); } break; case STATE_UTF8CHAR: if (expectedBytes > 0 && (b & 0xC0) == 0x80) { // valid UTF8 char, add to readAheadBuffer readAheadBuffer[readAhead++] = (byte)b; expectedBytes--; if (expectedBytes == 0) { outp.Append(readAheadBuffer, 0, readAhead); readAhead = 0; state = STATE_START; } } else { // invalid UTF8 char: // 1. convert first of seq to UTF8 byte[] utf8 = ConvertToUtf8(readAheadBuffer[0]); outp.Append(utf8); // 2. continue processing at second byte of sequence i = i - readAhead; readAhead = 0; state = STATE_START; } break; } } // loop ends with "half" Utf8 char --> assume that the bytes are Latin-1 if (state == STATE_UTF8CHAR) { for (int j = 0; j < readAhead; j++) { byte b = readAheadBuffer[j]; byte[] utf8 = ConvertToUtf8(b); outp.Append(utf8); } } return(outp); } // Latin-1 fixing applies only to UTF-8 return(buffer); }
/// <summary> /// A converter that processes a byte buffer containing a mix of UTF8 and Latin-1/Cp1252 chars. /// The result is a buffer where those chars have been converted to UTF-8; /// that means it contains only valid UTF-8 chars. /// <p> /// <em>Explanation of the processing:</em> First the encoding of the buffer is detected looking /// at the first four bytes (that works only if the buffer starts with an ASCII-char, /// like xmls '<'). UTF-16/32 flavours do not require further proccessing. /// <p> /// In the case, UTF-8 is detected, it assumes wrong UTF8 chars to be a sequence of /// Latin-1/Cp1252 encoded bytes and converts the chars to their corresponding UTF-8 byte /// sequence. /// <p> /// The 0x80..0x9F range is undefined in Latin-1, but is defined in Windows code /// page 1252. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are formally undefined /// by Windows 1252. These are in XML's RestrictedChar set, so we map them to a /// space. /// <p> /// The official Latin-1 characters in the range 0xA0..0xFF are converted into /// the Unicode Latin Supplement range U+00A0 - U+00FF. /// <p> /// <em>Example:</em> If an Euro-symbol (€) appears in the byte buffer (0xE2, 0x82, 0xAC), /// it will be left as is. But if only the first two bytes are appearing, /// followed by an ASCII char a (0xE2 - 0x82 - 0x41), it will be converted to /// 0xC3, 0xA2 (â) - 0xE2, 0x80, 0x9A (‚) - 0x41 (a). /// </summary> /// <param name="buffer"> a byte buffer contain </param> /// <returns> Returns a new buffer containing valid UTF-8 </returns> public static ByteBuffer Convert(ByteBuffer buffer) { if ("UTF-8".Equals(buffer.Encoding)) { // the buffer containing one UTF-8 char (up to 8 bytes) byte[] readAheadBuffer = new byte[8]; // the number of bytes read ahead. int readAhead = 0; // expected UTF8 bytesto come int expectedBytes = 0; // output buffer with estimated length ByteBuffer outp = new ByteBuffer(buffer.Length*4/3); int state = STATE_START; for (int i = 0; i < buffer.Length; i++) { int b = buffer.ByteAt(i); switch (state) { default: goto case STATE_START; case STATE_START: if (b < 0x7F) { outp.Append((byte) b); } else if (b >= 0xC0) { // start of UTF8 sequence expectedBytes = -1; int test = b; for (; expectedBytes < 8 && (test & 0x80) == 0x80; test = test << 1) { expectedBytes++; } readAheadBuffer[readAhead++] = (byte) b; state = STATE_UTF8CHAR; } else // implicitly: b >= 0x80 && b < 0xC0 { // invalid UTF8 start char, assume to be Latin-1 byte[] utf8 = ConvertToUtf8((byte) b); outp.Append(utf8); } break; case STATE_UTF8CHAR: if (expectedBytes > 0 && (b & 0xC0) == 0x80) { // valid UTF8 char, add to readAheadBuffer readAheadBuffer[readAhead++] = (byte) b; expectedBytes--; if (expectedBytes == 0) { outp.Append(readAheadBuffer, 0, readAhead); readAhead = 0; state = STATE_START; } } else { // invalid UTF8 char: // 1. convert first of seq to UTF8 byte[] utf8 = ConvertToUtf8(readAheadBuffer[0]); outp.Append(utf8); // 2. continue processing at second byte of sequence i = i - readAhead; readAhead = 0; state = STATE_START; } break; } } // loop ends with "half" Utf8 char --> assume that the bytes are Latin-1 if (state == STATE_UTF8CHAR) { for (int j = 0; j < readAhead; j++) { byte b = readAheadBuffer[j]; byte[] utf8 = ConvertToUtf8(b); outp.Append(utf8); } } return outp; } // Latin-1 fixing applies only to UTF-8 return buffer; }