// Decodes byteCount UTF-8 bytes from bytes, starting at byteIndex, into chars starting
// at charIndex, and returns the number of chars written.
//
// When decoder is non-null, the state of a partially read sequence (bits, trailCount,
// isSurrogate, byteSequence) is loaded from it on entry and stored back on exit, so a
// sequence may be split across calls.
//
// Malformed input (bad lead byte, missing trailing byte, non-shortest form, or a 4-byte
// value above U+10FFFF) throws ArgumentException when isThrowException (a member declared
// outside this method) is true; otherwise the offending bytes produce no output.
//
// Throws:
//   ArgumentNullException       - bytes or chars is null.
//   ArgumentOutOfRangeException - byteIndex/byteCount/charIndex is out of range.
//   ArgumentException           - invalid byte sequence (only if isThrowException), or
//                                 chars is too small to hold the decoded output.
internal virtual int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex, UTF8Decoder decoder) {
    if (bytes == null || chars == null) {
        throw new ArgumentNullException(bytes == null ? "bytes" : "chars", "ArgumentNull_Array");
    }
    if (byteIndex < 0 || byteCount < 0) {
        throw new ArgumentOutOfRangeException((byteIndex < 0 ? "byteIndex" : "byteCount"), "ArgumentOutOfRange_NeedNonNegNum");
    }
    if (bytes.Length - byteIndex < byteCount) {
        throw new ArgumentOutOfRangeException("bytes", "ArgumentOutOfRange_IndexCountBuffer");
    }
    if (charIndex < 0 || charIndex > chars.Length) {
        throw new ArgumentOutOfRangeException("charIndex", "ArgumentOutOfRange_Index");
    }

    int bits = 0;               // Payload bits accumulated for the current sequence.
    int trailCount = 0;         // Trailing bytes still expected; negative while skipping an invalid sequence.
    bool isSurrogate = false;   // True while decoding a 4-byte sequence (emitted as a surrogate pair).

    // Indicates that the current chunk of bytes is a 3-byte or 4-byte UTF8 sequence.
    // This is used to detect non-shortest form.
    // It is reset to 0 when the 2nd byte of the UTF8 sequence is read, so that
    // we don't check for non-shortest form again.
    int byteSequence = 0;

    if (decoder != null) {
        bits = decoder.bits;
        trailCount = decoder.trailCount;
        isSurrogate = decoder.isSurrogate;
        byteSequence = decoder.byteSequence;
    }

    int byteEnd = byteIndex + byteCount;
    int charStart = charIndex;
    try {
        while (byteIndex < byteEnd) {
            byte b = bytes[byteIndex++];
            if (trailCount == 0) {
                //
                // We are not at a trailing byte.
                //
                if ((b & 0x80) == 0) {
                    // This is the ASCII case: 0vvvvvvv
                    chars[charIndex++] = (char)b;
                } else {
                    // Check if this is a valid starting byte by counting its leading 1 bits.
                    // temp is a byte, so the shifted-out bits are discarded.
                    byte temp = (byte)b;
                    while ((temp & 0x80) != 0) {
                        temp <<= 1;
                        trailCount++;
                    }
                    switch (trailCount) {
                        case 1:
                            // 10xxxxxx is a trailing byte, not a starting byte.
                            trailCount = 0;
                            break;
                        case 2:
                            // Make sure that bit 8 ~ bit 11 is not all zero (non-shortest form).
                            // 110XXXXx 10xxxxxx
                            if ((b & 0x1e) == 0) {
                                trailCount = 0;
                            }
                            break;
                        case 3:
                            byteSequence = 3;
                            break;
                        case 4:
                            //
                            // This is a surrogate unicode pair
                            //
                            byteSequence = 4;
                            break;
                        default:
                            // Five or more leading 1 bits is never a valid starting byte.
                            trailCount = 0;
                            break;
                    }
                    if (trailCount == 0) {
                        if (isThrowException) {
                            throw new ArgumentException("Argument_InvalidByteSequence");
                        }
                    } else {
                        isSurrogate = (trailCount == 4);
                        // temp was shifted left trailCount times; shifting back leaves only
                        // the payload bits of the starting byte.
                        bits = temp >> trailCount;
                        trailCount--;
                    }
                }
            } else {
                // We are expecting to see bytes like 10vvvvvv
                if ((b & 0xC0) != 0x80) {
                    // If not, this is NOT a valid sequence.
                    if (isThrowException) {
                        throw new ArgumentException("Argument_InvalidByteSequence");
                    }
                    // At this point, we are seeing an invalid trailing byte.
                    // However, this can be a valid starting byte for another UTF8 byte sequence (e.g.
                    // this character could be under 0x7f, or a valid leading byte like 110xxxxx).
                    // So let's put the current byte back, and try to see if this is a valid byte
                    // for another UTF8 byte sequence.
                    byteIndex--;
                    bits = 0;
                    trailCount = 0;
                    // The abandoned sequence may have been interrupted before its 2nd byte, leaving
                    // byteSequence at 3 or 4. A following 2-byte sequence never sets byteSequence,
                    // so a stale value would run the 3/4-byte range checks on it and could reject
                    // a valid character (e.g. F0 41 C4 90 would lose U+0110).
                    byteSequence = 0;
                } else {
                    switch (byteSequence) {
                        case 3:
                            // Check 3-byte sequence for non-shortest form.
                            // 1110XXXX 10Xxxxxx 10xxxxxx
                            if (bits == 0 && (b & 0x20) == 0) {
                                if (isThrowException) {
                                    throw new ArgumentException("Argument_InvalidByteSequence");
                                }
                                // Mark the sequence invalid; the decrement below keeps trailCount
                                // negative so the remaining trailing bytes are skipped.
                                trailCount = -1;
                            }
                            // Reset byteSequence to zero since we are done with non-shortest form check.
                            byteSequence = 0;
                            break;
                        case 4:
                            // Check 4-byte sequence for non-shortest form.
                            // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
                            if (bits == 0) {
                                if ((b & 0x30) == 0) {
                                    if (isThrowException) {
                                        throw new ArgumentException("Argument_InvalidByteSequence");
                                    }
                                    trailCount = -1;
                                }
                            } else if ((bits & 0x04) != 0) {
                                // Make sure that the resulting Unicode is within the valid surrogate range.
                                // The 4 byte code sequence can hold up to 21 bits, and the maximum valid code point range
                                // that Unicode (with surrogate) could represent are from U+000000 ~ U+10FFFF.
                                // Therefore, if the 21 bit (the most significant bit) is 1, we should verify that the 17 ~ 20
                                // bit are all zero.
                                // I.e., in 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx,
                                // XXXXX can only be 10000.
                                if ((bits & 0x03) != 0 || (b & 0x30) != 0) {
                                    if (isThrowException) {
                                        throw new ArgumentException("Argument_InvalidByteSequence");
                                    }
                                    trailCount = -1;
                                }
                            }
                            byteSequence = 0;
                            break;
                    }
                    if (--trailCount >= 0) {
                        bits = bits << 6 | (b & 0x3F);
                        if (trailCount == 0) {
                            if (!isSurrogate) {
                                chars[charIndex++] = (char)bits;
                            } else {
                                //
                                // bits >= 0x10000, use surrogate.
                                // 0xD7C0 is 0xD800 - (0x10000 >> 10), i.e. the high surrogate base
                                // with the 0x10000 offset already subtracted.
                                //
                                chars[charIndex++] = (char)(0xD7C0 + (bits >> 10));
                                chars[charIndex++] = (char)(CharacterInfo.LOW_SURROGATE_START + (bits & 0x3FF));
                            }
                        }
                    }
                }
            }
        }
    } catch (IndexOutOfRangeException) {
        // Writing past the end of chars is reported as a conversion overflow.
        throw new ArgumentException("Argument_ConversionOverflow");
    }

    if (decoder != null) {
        decoder.bits = bits;
        decoder.trailCount = trailCount;
        decoder.isSurrogate = isSurrogate;
        decoder.byteSequence = byteSequence;
    }
    return (charIndex - charStart);
}
// Returns the number of chars that decoding count UTF-8 bytes from bytes, starting at
// index, would produce. A completed 4-byte sequence counts as two chars (surrogate pair).
//
// When decoder is non-null, the state of a partially read sequence is loaded from it on
// entry. This method only reads that state; it does not write anything back to decoder.
//
// Malformed input throws ArgumentException when isThrowException (a member declared
// outside this method) is true; otherwise the offending bytes are not counted.
//
// Throws:
//   ArgumentNullException       - bytes is null.
//   ArgumentOutOfRangeException - index/count is negative or exceeds the buffer.
//   ArgumentException           - invalid byte sequence (only if isThrowException).
internal virtual int GetCharCount(byte[] bytes, int index, int count, UTF8Decoder decoder) {
    if (bytes == null) {
        throw new ArgumentNullException("bytes", "ArgumentNull_Array");
    }
    if (index < 0 || count < 0) {
        throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), "ArgumentOutOfRange_NeedNonNegNum");
    }
    if (bytes.Length - index < count) {
        throw new ArgumentOutOfRangeException("bytes", "ArgumentOutOfRange_IndexCountBuffer");
    }

    int charCount = 0;
    int trailCount = 0;         // Trailing bytes still expected; negative while skipping an invalid sequence.

    // Indicates that the current chunk of bytes is a 3-byte or 4-byte UTF8 sequence.
    // This is used to detect non-shortest form.
    // It is reset to 0 when the 2nd byte of the UTF8 sequence is read, so that
    // we don't check for non-shortest form again.
    int byteSequence = 0;
    bool isSurrogate = false;   // True while inside a 4-byte sequence.
    int bits = 0;               // Payload bits of the starting byte; only consulted on the 2nd byte.

    if (decoder != null) {
        trailCount = decoder.trailCount;
        isSurrogate = decoder.isSurrogate;
        byteSequence = decoder.byteSequence;
        bits = decoder.bits;
    }

    int end = index + count;
    while (index < end) {
        byte b = bytes[index++];
        if (trailCount == 0) {
            if ((b & 0x80) == 0) {
                // This is an ASCII.
                charCount++;
            } else {
                // Count the leading 1 bits of the starting byte. temp is a byte, so the
                // shifted-out bits are discarded.
                byte temp = b;
                while ((temp & 0x80) != 0) {
                    temp <<= 1;
                    trailCount++;
                }
                switch (trailCount) {
                    case 1:
                        // 10xxxxxx is a trailing byte, not a starting byte.
                        trailCount = 0;
                        break;
                    case 2:
                        // Make sure that bit 8 ~ bit 11 is not all zero (non-shortest form).
                        // 110XXXXx 10xxxxxx
                        if ((b & 0x1e) == 0) {
                            trailCount = 0;
                        }
                        break;
                    case 3:
                        byteSequence = 3;
                        break;
                    case 4:
                        isSurrogate = true;
                        byteSequence = 4;
                        break;
                    default:
                        // Five or more leading 1 bits is never a valid starting byte.
                        trailCount = 0;
                        break;
                }
                if (trailCount == 0) {
                    if (isThrowException) {
                        throw new ArgumentException("Argument_InvalidByteSequence");
                    }
                } else {
                    // temp was shifted left trailCount times; shifting back leaves only
                    // the payload bits of the starting byte.
                    bits = temp >> trailCount;
                    trailCount--;
                }
            }
        } else {
            // We are expecting to see trailing bytes like 10vvvvvv
            if ((b & 0xC0) != 0x80) {
                // If not, this is NOT a valid sequence.
                if (isThrowException) {
                    throw new ArgumentException("Argument_InvalidByteSequence");
                }
                // Put the byte back so it is re-examined as a possible starting byte.
                index--;
                trailCount = 0;
                isSurrogate = false;
                // The abandoned sequence may have been interrupted before its 2nd byte, leaving
                // byteSequence at 3 or 4. A following 2-byte sequence never sets byteSequence,
                // so a stale value would run the 3/4-byte range checks on it and could reject
                // a valid character (e.g. F0 41 C4 90 would not count U+0110).
                byteSequence = 0;
            } else {
                switch (byteSequence) {
                    case 3:
                        // Check 3-byte sequence for non-shortest form.
                        // 1110XXXX 10Xxxxxx 10xxxxxx
                        if (bits == 0 && (b & 0x20) == 0) {
                            if (isThrowException) {
                                throw new ArgumentException("Argument_InvalidByteSequence");
                            }
                            // Mark the sequence invalid; the decrement below keeps trailCount
                            // negative so the remaining trailing bytes are skipped.
                            trailCount = -1;
                        }
                        // We are done checking the non-shortest form, reset byteSequence to 0, so that we don't
                        // do the extra check for the remaining byte of the 3-byte chunk.
                        byteSequence = 0;
                        break;
                    case 4:
                        // Check 4-byte sequence for non-shortest form.
                        // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
                        if (bits == 0) {
                            if ((b & 0x30) == 0) {
                                if (isThrowException) {
                                    throw new ArgumentException("Argument_InvalidByteSequence");
                                }
                                trailCount = -1;
                            }
                        } else if ((bits & 0x04) != 0) {
                            // Make sure that the resulting Unicode is within the valid surrogate range.
                            // The 4 byte code sequence can hold up to 21 bits, and the maximum valid code point range
                            // that Unicode (with surrogate) could represent are from U+000000 ~ U+10FFFF.
                            // Therefore, if the 21 bit (the most significant bit) is 1, we should verify that the 17 ~ 20
                            // bit are all zero.
                            // I.e., in 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx,
                            // XXXXX can only be 10000.
                            if ((bits & 0x03) != 0 || (b & 0x30) != 0) {
                                if (isThrowException) {
                                    throw new ArgumentException("Argument_InvalidByteSequence");
                                }
                                trailCount = -1;
                            }
                        }
                        byteSequence = 0;
                        break;
                }
                if (--trailCount == 0) {
                    // Sequence complete: one char, or two for a surrogate pair.
                    charCount++;
                    if (isSurrogate) {
                        charCount++;
                        isSurrogate = false;
                    }
                }
            }
        }
    }
    return (charCount);
}
// Returns the number of chars that decoding count UTF-8 bytes from bytes, starting at
// index, would produce. A completed 4-byte sequence counts as two chars (surrogate pair).
//
// When decoder is non-null, the state of a partially read sequence is loaded from it on
// entry. This method only reads that state; it does not write anything back to decoder.
//
// Malformed input throws ArgumentException when isThrowException (a member declared
// outside this method) is true, formatting the message with the index of the offending
// byte; otherwise the offending bytes are not counted.
//
// Throws:
//   ArgumentNullException       - bytes is null.
//   ArgumentOutOfRangeException - index/count is negative or exceeds the buffer.
//   ArgumentException           - invalid byte sequence (only if isThrowException).
internal virtual int GetCharCount(byte[] bytes, int index, int count, UTF8Decoder decoder) {
    if (bytes == null) {
        throw new ArgumentNullException("bytes", Environment.GetResourceString("ArgumentNull_Array"));
    }
    if (index < 0 || count < 0) {
        throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    }
    if (bytes.Length - index < count) {
        throw new ArgumentOutOfRangeException("bytes", Environment.GetResourceString("ArgumentOutOfRange_IndexCountBuffer"));
    }

    int charCount = 0;
    int trailCount = 0;         // Trailing bytes still expected; negative while skipping an invalid sequence.

    // Indicates that the current chunk of bytes is a 3-byte or 4-byte UTF8 sequence.
    // This is used to detect non-shortest form.
    // It is reset to 0 when the 2nd byte of the UTF8 sequence is read, so that
    // we don't check for non-shortest form again.
    int byteSequence = 0;
    bool isSurrogate = false;   // True while inside a 4-byte sequence.
    int bits = 0;               // Payload bits of the starting byte; only consulted on the 2nd byte.

    if (decoder != null) {
        trailCount = decoder.trailCount;
        isSurrogate = decoder.isSurrogate;
        byteSequence = decoder.byteSequence;
        bits = decoder.bits;
    }

    int end = index + count;
    while (index < end) {
        byte b = bytes[index++];
        if (trailCount == 0) {
            if ((b & 0x80) == 0) {
                // This is an ASCII.
                charCount++;
            } else {
                // Count the leading 1 bits of the starting byte. temp is a byte, so the
                // shifted-out bits are discarded.
                byte temp = b;
                while ((temp & 0x80) != 0) {
                    temp <<= 1;
                    trailCount++;
                }
                switch (trailCount) {
                    case 1:
                        // 10xxxxxx is a trailing byte, not a starting byte.
                        trailCount = 0;
                        break;
                    case 2:
                        // Make sure that bit 8 ~ bit 11 is not all zero (non-shortest form).
                        // 110XXXXx 10xxxxxx
                        if ((b & 0x1e) == 0) {
                            trailCount = 0;
                        }
                        break;
                    case 3:
                        byteSequence = 3;
                        break;
                    case 4:
                        isSurrogate = true;
                        byteSequence = 4;
                        break;
                    default:
                        // Five or more leading 1 bits is never a valid starting byte.
                        trailCount = 0;
                        break;
                }
                if (trailCount == 0) {
                    if (isThrowException) {
                        throw new ArgumentException(String.Format(Environment.GetResourceString("Argument_InvalidByteSequence"), index-1));
                    }
                } else {
                    // temp was shifted left trailCount times; shifting back leaves only
                    // the payload bits of the starting byte.
                    bits = temp >> trailCount;
                    trailCount--;
                }
            }
        } else {
            // We are expecting to see trailing bytes like 10vvvvvv
            if ((b & 0xC0) != 0x80) {
                // If not, this is NOT a valid sequence.
                if (isThrowException) {
                    throw new ArgumentException(String.Format(Environment.GetResourceString("Argument_InvalidByteSequence"), index-1));
                }
                // Put the byte back so it is re-examined as a possible starting byte.
                index--;
                trailCount = 0;
                isSurrogate = false;
                // The abandoned sequence may have been interrupted before its 2nd byte, leaving
                // byteSequence at 3 or 4. A following 2-byte sequence never sets byteSequence,
                // so a stale value would run the 3/4-byte range checks on it and could reject
                // a valid character (e.g. F0 41 C4 90 would not count U+0110).
                byteSequence = 0;
            } else {
                switch (byteSequence) {
                    case 3:
                        // Check 3-byte sequence for non-shortest form.
                        // 1110XXXX 10Xxxxxx 10xxxxxx
                        if (bits == 0 && (b & 0x20) == 0) {
                            if (isThrowException) {
                                throw new ArgumentException(String.Format(Environment.GetResourceString("Argument_InvalidByteSequence"), index-1));
                            }
                            // Mark the sequence invalid; the decrement below keeps trailCount
                            // negative so the remaining trailing bytes are skipped.
                            trailCount = -1;
                        }
                        // We are done checking the non-shortest form, reset byteSequence to 0, so that we don't
                        // do the extra check for the remaining byte of the 3-byte chunk.
                        byteSequence = 0;
                        break;
                    case 4:
                        // Check 4-byte sequence for non-shortest form.
                        // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
                        if (bits == 0) {
                            if ((b & 0x30) == 0) {
                                if (isThrowException) {
                                    throw new ArgumentException(String.Format(Environment.GetResourceString("Argument_InvalidByteSequence"), index-1));
                                }
                                trailCount = -1;
                            }
                        } else if ((bits & 0x04) != 0) {
                            // Make sure that the resulting Unicode is within the valid surrogate range.
                            // The 4 byte code sequence can hold up to 21 bits, and the maximum valid code point range
                            // that Unicode (with surrogate) could represent are from U+000000 ~ U+10FFFF.
                            // Therefore, if the 21 bit (the most significant bit) is 1, we should verify that the 17 ~ 20
                            // bit are all zero.
                            // I.e., in 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx,
                            // XXXXX can only be 10000.
                            if ((bits & 0x03) != 0 || (b & 0x30) != 0) {
                                if (isThrowException) {
                                    throw new ArgumentException(String.Format(Environment.GetResourceString("Argument_InvalidByteSequence"), index-1));
                                }
                                trailCount = -1;
                            }
                        }
                        byteSequence = 0;
                        break;
                }
                if (--trailCount == 0) {
                    // Sequence complete: one char, or two for a surrogate pair.
                    charCount++;
                    if (isSurrogate) {
                        charCount++;
                        isSurrogate = false;
                    }
                }
            }
        }
    }
    return charCount;
}
// Decodes byteCount UTF-8 bytes from bytes, starting at byteIndex, into chars starting
// at charIndex, and returns the number of chars written.
//
// When decoder is non-null, the state of a partially read sequence (bits, trailCount,
// isSurrogate, byteSequence) is loaded from it on entry and stored back on exit, so a
// sequence may be split across calls.
//
// Malformed input (bad lead byte, missing trailing byte, non-shortest form, or a 4-byte
// value above U+10FFFF) throws ArgumentException when isThrowException (a member declared
// outside this method) is true, formatting the message with the index of the offending
// byte; otherwise the offending bytes produce no output.
//
// Throws:
//   ArgumentNullException       - bytes or chars is null.
//   ArgumentOutOfRangeException - byteIndex/byteCount/charIndex is out of range.
//   ArgumentException           - invalid byte sequence (only if isThrowException), or
//                                 chars is too small to hold the decoded output.
internal virtual int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex, UTF8Decoder decoder) {
    if (bytes == null || chars == null) {
        throw new ArgumentNullException(bytes == null ? "bytes" : "chars", Environment.GetResourceString("ArgumentNull_Array"));
    }
    if (byteIndex < 0 || byteCount < 0) {
        throw new ArgumentOutOfRangeException((byteIndex < 0 ? "byteIndex" : "byteCount"), Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    }
    if (bytes.Length - byteIndex < byteCount) {
        throw new ArgumentOutOfRangeException("bytes", Environment.GetResourceString("ArgumentOutOfRange_IndexCountBuffer"));
    }
    if (charIndex < 0 || charIndex > chars.Length) {
        throw new ArgumentOutOfRangeException("charIndex", Environment.GetResourceString("ArgumentOutOfRange_Index"));
    }

    int bits = 0;               // Payload bits accumulated for the current sequence.
    int trailCount = 0;         // Trailing bytes still expected; negative while skipping an invalid sequence.
    bool isSurrogate = false;   // True while decoding a 4-byte sequence (emitted as a surrogate pair).

    // Indicates that the current chunk of bytes is a 3-byte or 4-byte UTF8 sequence.
    // This is used to detect non-shortest form.
    // It is reset to 0 when the 2nd byte of the UTF8 sequence is read, so that
    // we don't check for non-shortest form again.
    int byteSequence = 0;

    if (decoder != null) {
        bits = decoder.bits;
        trailCount = decoder.trailCount;
        isSurrogate = decoder.isSurrogate;
        byteSequence = decoder.byteSequence;
    }

    int byteEnd = byteIndex + byteCount;
    int charStart = charIndex;
    try {
        while (byteIndex < byteEnd) {
            byte b = bytes[byteIndex++];
            if (trailCount == 0) {
                //
                // We are not at a trailing byte.
                //
                if ((b & 0x80) == 0) {
                    // This is the ASCII case: 0vvvvvvv
                    chars[charIndex++] = (char)b;
                } else {
                    // Check if this is a valid starting byte by counting its leading 1 bits.
                    // temp is a byte, so the shifted-out bits are discarded.
                    byte temp = (byte)b;
                    while ((temp & 0x80) != 0) {
                        temp <<= 1;
                        trailCount++;
                    }
                    switch (trailCount) {
                        case 1:
                            // 10xxxxxx is a trailing byte, not a starting byte.
                            trailCount = 0;
                            break;
                        case 2:
                            // Make sure that bit 8 ~ bit 11 is not all zero (non-shortest form).
                            // 110XXXXx 10xxxxxx
                            if ((b & 0x1e) == 0) {
                                trailCount = 0;
                            }
                            break;
                        case 3:
                            byteSequence = 3;
                            break;
                        case 4:
                            //
                            // This is a surrogate unicode pair
                            //
                            byteSequence = 4;
                            break;
                        default:
                            // Five or more leading 1 bits is never a valid starting byte.
                            trailCount = 0;
                            break;
                    }
                    if (trailCount == 0) {
                        if (isThrowException) {
                            throw new ArgumentException(String.Format(Environment.GetResourceString("Argument_InvalidByteSequence"), byteIndex-1));
                        }
                    } else {
                        isSurrogate = (trailCount == 4);
                        // temp was shifted left trailCount times; shifting back leaves only
                        // the payload bits of the starting byte.
                        bits = temp >> trailCount;
                        trailCount--;
                    }
                }
            } else {
                // We are expecting to see bytes like 10vvvvvv
                if ((b & 0xC0) != 0x80) {
                    // If not, this is NOT a valid sequence.
                    if (isThrowException) {
                        throw new ArgumentException(String.Format(Environment.GetResourceString("Argument_InvalidByteSequence"), byteIndex-1));
                    }
                    // At this point, we are seeing an invalid trailing byte.
                    // However, this can be a valid starting byte for another UTF8 byte sequence (e.g.
                    // this character could be under 0x7f, or a valid leading byte like 110xxxxx).
                    // So let's put the current byte back, and try to see if this is a valid byte
                    // for another UTF8 byte sequence.
                    byteIndex--;
                    bits = 0;
                    trailCount = 0;
                    // The abandoned sequence may have been interrupted before its 2nd byte, leaving
                    // byteSequence at 3 or 4. A following 2-byte sequence never sets byteSequence,
                    // so a stale value would run the 3/4-byte range checks on it and could reject
                    // a valid character (e.g. F0 41 C4 90 would lose U+0110).
                    byteSequence = 0;
                } else {
                    switch (byteSequence) {
                        case 3:
                            // Check 3-byte sequence for non-shortest form.
                            // 1110XXXX 10Xxxxxx 10xxxxxx
                            if (bits == 0 && (b & 0x20) == 0) {
                                if (isThrowException) {
                                    throw new ArgumentException(String.Format(Environment.GetResourceString("Argument_InvalidByteSequence"), byteIndex-1));
                                }
                                // Mark the sequence invalid; the decrement below keeps trailCount
                                // negative so the remaining trailing bytes are skipped.
                                trailCount = -1;
                            }
                            // Reset byteSequence to zero since we are done with non-shortest form check.
                            byteSequence = 0;
                            break;
                        case 4:
                            // Check 4-byte sequence for non-shortest form.
                            // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
                            if (bits == 0) {
                                if ((b & 0x30) == 0) {
                                    if (isThrowException) {
                                        throw new ArgumentException(String.Format(Environment.GetResourceString("Argument_InvalidByteSequence"), byteIndex-1));
                                    }
                                    trailCount = -1;
                                }
                            } else if ((bits & 0x04) != 0) {
                                // Make sure that the resulting Unicode is within the valid surrogate range.
                                // The 4 byte code sequence can hold up to 21 bits, and the maximum valid code point range
                                // that Unicode (with surrogate) could represent are from U+000000 ~ U+10FFFF.
                                // Therefore, if the 21 bit (the most significant bit) is 1, we should verify that the 17 ~ 20
                                // bit are all zero.
                                // I.e., in 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx,
                                // XXXXX can only be 10000.
                                if ((bits & 0x03) != 0 || (b & 0x30) != 0) {
                                    if (isThrowException) {
                                        throw new ArgumentException(String.Format(Environment.GetResourceString("Argument_InvalidByteSequence"), byteIndex-1));
                                    }
                                    trailCount = -1;
                                }
                            }
                            byteSequence = 0;
                            break;
                    }
                    if (--trailCount >= 0) {
                        bits = bits << 6 | (b & 0x3F);
                        if (trailCount == 0) {
                            if (!isSurrogate) {
                                chars[charIndex++] = (char)bits;
                            } else {
                                //
                                // bits >= 0x10000, use surrogate.
                                // 0xD7C0 is 0xD800 - (0x10000 >> 10), i.e. the high surrogate base
                                // with the 0x10000 offset already subtracted.
                                //
                                chars[charIndex++] = (char)(0xD7C0 + (bits >> 10));
                                chars[charIndex++] = (char)(CharacterInfo.LOW_SURROGATE_START + (bits & 0x3FF));
                            }
                        }
                    }
                }
            }
        }
    } catch (IndexOutOfRangeException) {
        // Writing past the end of chars is reported as a conversion overflow.
        throw new ArgumentException(Environment.GetResourceString("Argument_ConversionOverflow"));
    }

    if (decoder != null) {
        decoder.bits = bits;
        decoder.trailCount = trailCount;
        decoder.isSurrogate = isSurrogate;
        decoder.byteSequence = byteSequence;
    }
    return charIndex - charStart;
}
// Default constructor: creates a new UTF8Encoder and a new UTF8Decoder and stores them
// in the encoder and decoder members (declared outside this view).
public UTF8Encoding()
{
    encoder = new UTF8Encoder();
    decoder = new UTF8Decoder();
}