Example #1
0
        /// <summary>
        /// Decodes UTF-8 bytes from <paramref name="bytes"/> into UTF-16 chars stored in
        /// <paramref name="chars"/>, optionally resuming a sequence that was left incomplete
        /// by an earlier call.
        /// </summary>
        /// <param name="bytes">Buffer holding the UTF-8 input.</param>
        /// <param name="byteIndex">Index of the first byte to decode.</param>
        /// <param name="byteCount">Number of bytes to decode.</param>
        /// <param name="chars">Buffer that receives the decoded chars.</param>
        /// <param name="charIndex">Index in <paramref name="chars"/> at which writing starts.</param>
        /// <param name="decoder">
        /// Optional carrier of partial-sequence state (bits, trailCount, isSurrogate, byteSequence).
        /// When non-null the state is loaded before decoding and stored back after the loop
        /// finishes; it is NOT stored back when this method throws.
        /// </param>
        /// <returns>The number of chars written to <paramref name="chars"/>.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="bytes"/> or <paramref name="chars"/> is null.
        /// </exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// An index or count is negative, the byte range does not fit in <paramref name="bytes"/>,
        /// or <paramref name="charIndex"/> is greater than the length of <paramref name="chars"/>.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// An invalid byte sequence is met while isThrowException (a member declared outside this
        /// method) is true, or <paramref name="chars"/> is too small for the output.
        /// </exception>
        /// <remarks>
        /// When isThrowException is false, invalid sequences produce no output and decoding
        /// continues with the following bytes. Keep the validation rules in step with the matching
        /// GetCharCount routine so that counting and converting agree on what is emitted.
        /// </remarks>
        internal virtual int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex, UTF8Decoder decoder)
        {
            if (bytes == null || chars == null)
            {
                throw new ArgumentNullException(bytes == null ? "bytes" : "chars",
                                                "ArgumentNull_Array");
            }
            if (byteIndex < 0 || byteCount < 0)
            {
                throw new ArgumentOutOfRangeException((byteIndex < 0 ? "byteIndex" : "byteCount"),
                                                      "ArgumentOutOfRange_NeedNonNegNum");
            }
            // Written as a subtraction so that byteIndex + byteCount cannot overflow.
            if (bytes.Length - byteIndex < byteCount)
            {
                throw new ArgumentOutOfRangeException("bytes",
                                                      "ArgumentOutOfRange_IndexCountBuffer");
            }
            if (charIndex < 0 || charIndex > chars.Length)
            {
                throw new ArgumentOutOfRangeException("charIndex",
                                                      "ArgumentOutOfRange_Index");
            }

            // bits        : code point bits accumulated so far for the current sequence.
            // trailCount  : continuation bytes still expected; 0 means "expecting a lead/ASCII byte",
            //               a negative value means "current sequence was rejected, skip its tail".
            // isSurrogate : true while decoding a 4-byte sequence (output is a surrogate pair).
            int  bits        = 0;
            int  trailCount  = 0;
            bool isSurrogate = false;

            // Indicate the current chunk of bytes is a 3-byte or 4-byte UTF8 sequence.
            // This is used to detect non-shortest form.
            // It will be reset to 0 when the 2nd byte of the UTF8 sequence is read, so that
            // we don't check for non-shortest form again.
            int byteSequence = 0;

            if (decoder != null)
            {
                bits         = decoder.bits;
                trailCount   = decoder.trailCount;
                isSurrogate  = decoder.isSurrogate;
                byteSequence = decoder.byteSequence;
            }
            int byteEnd   = byteIndex + byteCount;
            int charStart = charIndex;

            try {
                while (byteIndex < byteEnd)
                {
                    byte b = bytes[byteIndex++];
                    if (trailCount == 0)
                    {
                        //
                        // We are not at a trailing byte.
                        //
                        if ((b & 0x80) == 0)
                        {
                            // This is the ASCII case.
                            //   1        7      0vvvvvvv
                            //
                            // Found an ASCII character.
                            //
                            chars[charIndex++] = (char)b;
                        }
                        else
                        {
                            // Check if this is a valid starting byte: count the leading 1 bits,
                            // which gives the total length of the sequence, and shift them out of temp.
                            byte temp = (byte)b;
                            while ((temp & 0x80) != 0)
                            {
                                temp <<= 1;
                                trailCount++;
                            }
                            switch (trailCount)
                            {
                            case 1:
                                // 10xxxxxx: a continuation byte where a lead byte was expected.
                                trailCount = 0;
                                break;

                            case 2:
                                // Make sure that bit 8 ~ bit 11 is not all zero.
                                // 110XXXXx 10xxxxxx
                                if ((b & 0x1e) == 0)
                                {
                                    trailCount = 0;
                                }
                                break;

                            case 3:
                                byteSequence = 3;
                                break;

                            case 4:
                                //
                                // This is a surrogate unicode pair
                                //
                                byteSequence = 4;
                                break;

                            default:
                                // Five or more leading 1 bits: not a valid lead byte.
                                trailCount = 0;
                                break;
                            }
                            if (trailCount == 0)
                            {
                                if (isThrowException)
                                {
                                    throw new ArgumentException("Argument_InvalidByteSequence");
                                }
                            }
                            else
                            {
                                isSurrogate = (trailCount == 4);
                                // temp has had the length marker shifted out; shifting back
                                // leaves only the payload bits of the lead byte.
                                bits        = temp >> trailCount;
                                trailCount--;
                            }
                        }
                    }
                    else
                    {
                        // We are expecting to see bytes like 10vvvvvv
                        if ((b & 0xC0) != 0x80)
                        {
                            // If not, this is NOT a valid sequence.
                            if (isThrowException)
                            {
                                throw new ArgumentException("Argument_InvalidByteSequence");
                            }
                            // At this point, we are seeing an invalid trailing byte.
                            // However, this can be a valid starting byte for another UTF8 byte sequence (e.g.
                            // this character could be under 0x7f, or a valid leading byte like 110xxxxx).
                            // So let's put the current byte back, and try to see if this is a valid byte
                            // for another UTF8 byte sequence.
                            byteIndex--;
                            bits       = 0;
                            trailCount = 0;
                            // The abandoned lead byte may have set byteSequence to 3 or 4 and a
                            // following 2-byte lead does not overwrite it. Without this reset the
                            // stale value applies the 3-/4-byte second-byte checks to that 2-byte
                            // sequence and can reject a well-formed character
                            // (e.g. F0 41 C4 B0 lost U+0130).
                            byteSequence = 0;
                        }
                        else
                        {
                            switch (byteSequence)
                            {
                            case 3:
                                // Check 3-byte sequence for non-shortest form.
                                // 1110XXXX 10Xxxxxx 10xxxxxx
                                if (bits == 0 && (b & 0x20) == 0)
                                {
                                    if (isThrowException)
                                    {
                                        throw new ArgumentException("Argument_InvalidByteSequence");
                                    }
                                    // Mark the sequence as rejected; see the decrement below.
                                    trailCount = -1;
                                }
                                // Reset byteSequence to zero since we are done with non-shortest form check.
                                byteSequence = 0;
                                break;

                            case 4:
                                // Check 4-byte sequence for non-shortest form.
                                // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
                                if (bits == 0)
                                {
                                    if ((b & 0x30) == 0)
                                    {
                                        if (isThrowException)
                                        {
                                            throw new ArgumentException("Argument_InvalidByteSequence");
                                        }
                                        trailCount = -1;
                                    }
                                }
                                else if ((bits & 0x04) != 0)
                                {
                                    // Make sure that the resulting Unicode is within the valid surrogate range.
                                    // The 4 byte code sequence can hold up to 21 bits, and the maximum valid code point range
                                    // that Unicode (with surrogate) could represent are from U+000000 ~ U+10FFFF.
                                    // Therefore, if the 21 bit (the most significant bit) is 1, we should verify that the 17 ~ 20
                                    // bit are all zero.
                                    // I.e., in 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx,
                                    // XXXXX can only be 10000.
                                    if ((bits & 0x03) != 0 || (b & 0x30) != 0)
                                    {
                                        if (isThrowException)
                                        {
                                            throw new ArgumentException("Argument_InvalidByteSequence");
                                        }
                                        trailCount = -1;
                                    }
                                }
                                byteSequence = 0;
                                break;
                            }
                            // A rejected sequence has trailCount == -1 here, so the decrement keeps it
                            // negative: its remaining continuation bytes are consumed without output
                            // until a non-continuation byte resets the state above.
                            if (--trailCount >= 0)
                            {
                                bits = bits << 6 | (b & 0x3F);
                                if (trailCount == 0)
                                {
                                    if (!isSurrogate)
                                    {
                                        chars[charIndex++] = (char)bits;
                                    }
                                    else
                                    {
                                        //
                                        // bits >= 0x10000, use surrogate.
                                        // 0xD7C0 equals 0xD800 - (0x10000 >> 10), so adding (bits >> 10)
                                        // subtracts the 0x10000 offset and adds the high-surrogate base at once.
                                        //
                                        chars[charIndex++] = (char)(0xD7C0 + (bits >> 10));
                                        chars[charIndex++] = (char)(CharacterInfo.LOW_SURROGATE_START + (bits & 0x3FF));
                                    }
                                }
                            }
                        }
                    }
                }
            }
            catch (IndexOutOfRangeException) {
                // Writing past the end of chars means the output buffer is too small.
                throw new ArgumentException("Argument_ConversionOverflow");
            }
            if (decoder != null)
            {
                decoder.bits         = bits;
                decoder.trailCount   = trailCount;
                decoder.isSurrogate  = isSurrogate;
                decoder.byteSequence = byteSequence;
            }
            return(charIndex - charStart);
        }
Example #2
0
        /// <summary>
        /// Counts the UTF-16 chars that decoding the given range of UTF-8 bytes produces,
        /// optionally starting from the partial-sequence state held by <paramref name="decoder"/>.
        /// </summary>
        /// <param name="bytes">Buffer holding the UTF-8 input.</param>
        /// <param name="index">Index of the first byte to examine.</param>
        /// <param name="count">Number of bytes to examine.</param>
        /// <param name="decoder">
        /// Optional carrier of partial-sequence state. It is only read here; this method never
        /// writes state back to it.
        /// </param>
        /// <returns>
        /// The number of chars; a complete 4-byte sequence counts as two (a surrogate pair).
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="index"/> or <paramref name="count"/> is negative, or the range does not
        /// fit in <paramref name="bytes"/>.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// An invalid byte sequence is met while isThrowException (a member declared outside this
        /// method) is true.
        /// </exception>
        /// <remarks>
        /// When isThrowException is false, invalid sequences are not counted. Keep the validation
        /// rules in step with the matching GetChars routine so that counting and converting agree.
        /// </remarks>
        internal virtual int GetCharCount(byte[] bytes, int index, int count, UTF8Decoder decoder)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException("bytes",
                                                "ArgumentNull_Array");
            }
            if (index < 0 || count < 0)
            {
                throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"),
                                                      "ArgumentOutOfRange_NeedNonNegNum");
            }
            // Written as a subtraction so that index + count cannot overflow.
            if (bytes.Length - index < count)
            {
                throw new ArgumentOutOfRangeException("bytes",
                                                      "ArgumentOutOfRange_IndexCountBuffer");
            }
            int charCount  = 0;
            // Continuation bytes still expected; 0 means "expecting a lead/ASCII byte",
            // a negative value means "current sequence was rejected, skip its tail".
            int trailCount = 0;

            // Indicate the current chunk of bytes is a 3-byte or 4-byte UTF8 sequence.
            // This is used to detect non-shortest form.
            // It will be reset to 0 when the 2nd byte of the UTF8 sequence is read, so that
            // we don't check for non-shortest form again.
            int  byteSequence = 0;
            bool isSurrogate  = false;
            // Payload bits of the lead byte; only needed for the second-byte checks below.
            int  bits         = 0;

            if (decoder != null)
            {
                trailCount   = decoder.trailCount;
                isSurrogate  = decoder.isSurrogate;
                byteSequence = decoder.byteSequence;
                bits         = decoder.bits;
            }

            int end = index + count;

            while (index < end)
            {
                byte b = bytes[index++];
                if (trailCount == 0)
                {
                    if ((b & 0x80) == 0)
                    {
                        // This is an ASCII.
                        charCount++;
                    }
                    else
                    {
                        // Count the leading 1 bits, which gives the total length of the
                        // sequence, and shift them out of temp.
                        byte temp = b;
                        while ((temp & 0x80) != 0)
                        {
                            temp <<= 1;
                            trailCount++;
                        }
                        switch (trailCount)
                        {
                        case 1:
                            // 10xxxxxx: a continuation byte where a lead byte was expected.
                            trailCount = 0;
                            break;

                        case 2:
                            // Make sure that bit 8 ~ bit 11 is not all zero.
                            // 110XXXXx 10xxxxxx
                            if ((b & 0x1e) == 0)
                            {
                                trailCount = 0;
                            }
                            break;

                        case 3:
                            byteSequence = 3;
                            break;

                        case 4:
                            isSurrogate  = true;
                            byteSequence = 4;
                            break;

                        default:
                            // Five or more leading 1 bits: not a valid lead byte.
                            trailCount = 0;
                            break;
                        }
                        if (trailCount == 0)
                        {
                            if (isThrowException)
                            {
                                throw new ArgumentException("Argument_InvalidByteSequence");
                            }
                        }
                        else
                        {
                            bits = temp >> trailCount;
                            trailCount--;
                        }
                    }
                }
                else
                {
                    // We are expecting to see trailing bytes like 10vvvvvv
                    if ((b & 0xC0) != 0x80)
                    {
                        // If not, this is NOT a valid sequence.
                        if (isThrowException)
                        {
                            throw new ArgumentException("Argument_InvalidByteSequence");
                        }
                        // Put the byte back so it is re-examined as a possible lead/ASCII byte.
                        index--;
                        trailCount  = 0;
                        isSurrogate = false;
                        // The abandoned lead byte may have set byteSequence to 3 or 4 and a
                        // following 2-byte lead does not overwrite it. Without this reset the
                        // stale value applies the 3-/4-byte second-byte checks to that 2-byte
                        // sequence and can leave a well-formed character uncounted
                        // (e.g. F0 41 C4 B0 counted 1 instead of 2).
                        byteSequence = 0;
                    }
                    else
                    {
                        switch (byteSequence)
                        {
                        case 3:
                            // Check 3-byte sequence for non-shortest form.
                            // 1110XXXX 10Xxxxxx 10xxxxxx
                            if (bits == 0 && (b & 0x20) == 0)
                            {
                                if (isThrowException)
                                {
                                    throw new ArgumentException("Argument_InvalidByteSequence");
                                }
                                // Mark the sequence as rejected; see the decrement below.
                                trailCount = -1;
                            }
                            // We are done checking the non-shortest form, reset byteSequence to 0, so that we don't
                            // do the extra check for the remaining byte of the 3-byte chunk.
                            byteSequence = 0;
                            break;

                        case 4:
                            // Check 4-byte sequence for non-shortest form.
                            // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
                            if (bits == 0)
                            {
                                if ((b & 0x30) == 0)
                                {
                                    if (isThrowException)
                                    {
                                        throw new ArgumentException("Argument_InvalidByteSequence");
                                    }
                                    trailCount = -1;
                                }
                            }
                            else if ((bits & 0x04) != 0)
                            {
                                // Make sure that the resulting Unicode is within the valid surrogate range.
                                // The 4 byte code sequence can hold up to 21 bits, and the maximum valid code point range
                                // that Unicode (with surrogate) could represent are from U+000000 ~ U+10FFFF.
                                // Therefore, if the 21 bit (the most significant bit) is 1, we should verify that the 17 ~ 20
                                // bit are all zero.
                                // I.e., in 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx,
                                // XXXXX can only be 10000.
                                if ((bits & 0x03) != 0 || (b & 0x30) != 0)
                                {
                                    if (isThrowException)
                                    {
                                        throw new ArgumentException("Argument_InvalidByteSequence");
                                    }
                                    trailCount = -1;
                                }
                            }
                            byteSequence = 0;
                            break;
                        }

                        // A rejected sequence has trailCount == -1 here, so the decrement keeps it
                        // negative and nothing is counted; its remaining continuation bytes are
                        // skipped until a non-continuation byte resets the state above.
                        if (--trailCount == 0)
                        {
                            charCount++;
                            if (isSurrogate)
                            {
                                // A 4-byte sequence decodes to a surrogate pair: two chars.
                                charCount++;
                                isSurrogate = false;
                            }
                        }
                    }
                }
            }
            return(charCount);
        }
Example #3
0
        /// <summary>
        /// Counts the UTF-16 chars that decoding the given range of UTF-8 bytes produces,
        /// optionally starting from the partial-sequence state held by <paramref name="decoder"/>.
        /// </summary>
        /// <param name="bytes">Buffer holding the UTF-8 input.</param>
        /// <param name="index">Index of the first byte to examine.</param>
        /// <param name="count">Number of bytes to examine.</param>
        /// <param name="decoder">
        /// Optional carrier of partial-sequence state. It is only read here; this method never
        /// writes state back to it.
        /// </param>
        /// <returns>
        /// The number of chars; a complete 4-byte sequence counts as two (a surrogate pair).
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="index"/> or <paramref name="count"/> is negative, or the range does not
        /// fit in <paramref name="bytes"/>.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// An invalid byte sequence is met while isThrowException (a member declared outside this
        /// method) is true; the message is formatted with the index of the offending byte.
        /// </exception>
        /// <remarks>
        /// When isThrowException is false, invalid sequences are not counted. Keep the validation
        /// rules in step with the matching GetChars routine so that counting and converting agree.
        /// </remarks>
        internal virtual int GetCharCount(byte[] bytes, int index, int count, UTF8Decoder decoder) {
            if (bytes == null) {
                throw new ArgumentNullException("bytes",
                    Environment.GetResourceString("ArgumentNull_Array"));
            }
            if (index < 0 || count < 0) {
                throw new ArgumentOutOfRangeException((index<0 ? "index" : "count"),
                    Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
            }
            // Written as a subtraction so that index + count cannot overflow.
            if (bytes.Length - index < count) {
                throw new ArgumentOutOfRangeException("bytes",
                    Environment.GetResourceString("ArgumentOutOfRange_IndexCountBuffer"));
            }
            int charCount = 0;
            // Continuation bytes still expected; 0 means "expecting a lead/ASCII byte",
            // a negative value means "current sequence was rejected, skip its tail".
            int trailCount = 0;

            // Indicate the current chunk of bytes is a 3-byte or 4-byte UTF8 sequence.
            // This is used to detect non-shortest form.
            // It will be reset to 0 when the 2nd byte of the UTF8 sequence is read, so that
            // we don't check for non-shortest form again.
            int byteSequence = 0;
            bool isSurrogate = false;
            // Payload bits of the lead byte; only needed for the second-byte checks below.
            int bits = 0;
            if (decoder != null) {
                trailCount = decoder.trailCount;
                isSurrogate = decoder.isSurrogate;
                byteSequence = decoder.byteSequence;
                bits = decoder.bits;
            }

            int end = index + count;
            while (index < end) {
                byte b = bytes[index++];
                if (trailCount == 0) {
                    if ((b & 0x80) == 0) {
                        // This is an ASCII.
                        charCount++;
                    } else {
                        // Count the leading 1 bits, which gives the total length of the
                        // sequence, and shift them out of temp.
                        byte temp = b;
                        while ((temp & 0x80) != 0) {
                            temp <<= 1;
                            trailCount++;
                        }
                        switch (trailCount) {
                            case 1:
                                // 10xxxxxx: a continuation byte where a lead byte was expected.
                                trailCount = 0;
                                break;
                            case 2:
                                // Make sure that bit 8 ~ bit 11 is not all zero.
                                // 110XXXXx 10xxxxxx
                                if ((b & 0x1e) == 0) {
                                    trailCount = 0;
                                }
                                break;
                            case 3:
                                byteSequence = 3;
                                break;
                            case 4:
                                isSurrogate = true;
                                byteSequence = 4;
                                break;
                            default:
                                // Five or more leading 1 bits: not a valid lead byte.
                                trailCount = 0;
                                break;
                        }
                        if (trailCount == 0) {
                            if (isThrowException) {
                                throw new ArgumentException(String.Format(Environment.GetResourceString("Argument_InvalidByteSequence"), index-1));
                            }
                        } else {
                            bits = temp >> trailCount;
                            trailCount--;
                        }
                    }
                } else {
                    // We are expecting to see trailing bytes like 10vvvvvv
                    if ((b & 0xC0) != 0x80) {
                        // If not, this is NOT a valid sequence.
                        if (isThrowException) {
                            throw new ArgumentException(String.Format(Environment.GetResourceString("Argument_InvalidByteSequence"), index-1));
                        }
                        // Put the byte back so it is re-examined as a possible lead/ASCII byte.
                        index--;
                        trailCount = 0;
                        isSurrogate = false;
                        // The abandoned lead byte may have set byteSequence to 3 or 4 and a
                        // following 2-byte lead does not overwrite it. Without this reset the
                        // stale value applies the 3-/4-byte second-byte checks to that 2-byte
                        // sequence and can leave a well-formed character uncounted
                        // (e.g. F0 41 C4 B0 counted 1 instead of 2).
                        byteSequence = 0;
                    } else {
                        switch (byteSequence) {
                            case 3:
                                // Check 3-byte sequence for non-shortest form.
                                // 1110XXXX 10Xxxxxx 10xxxxxx
                                if (bits == 0 && (b & 0x20) == 0) {
                                    if (isThrowException) {
                                        throw new ArgumentException(String.Format(Environment.GetResourceString("Argument_InvalidByteSequence"), index-1));
                                    }
                                    // Mark the sequence as rejected; see the decrement below.
                                    trailCount = -1;
                                }
                                // We are done checking the non-shortest form, reset byteSequence to 0, so that we don't
                                // do the extra check for the remaining byte of the 3-byte chunk.
                                byteSequence = 0;
                                break;
                            case 4:
                                // Check 4-byte sequence for non-shortest form.
                                // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
                                if (bits == 0) {
                                    if ((b & 0x30) == 0) {
                                        if (isThrowException) {
                                            throw new ArgumentException(String.Format(Environment.GetResourceString("Argument_InvalidByteSequence"), index-1));
                                        }
                                        trailCount = -1;
                                    }
                                } else if ((bits & 0x04) != 0) {
                                    // Make sure that the resulting Unicode is within the valid surrogate range.
                                    // The 4 byte code sequence can hold up to 21 bits, and the maximum valid code point range
                                    // that Unicode (with surrogate) could represent are from U+000000 ~ U+10FFFF.
                                    // Therefore, if the 21 bit (the most significant bit) is 1, we should verify that the 17 ~ 20
                                    // bit are all zero.
                                    // I.e., in 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx,
                                    // XXXXX can only be 10000.
                                    if ((bits & 0x03) != 0 || (b & 0x30) != 0) {
                                        if (isThrowException) {
                                            throw new ArgumentException(String.Format(Environment.GetResourceString("Argument_InvalidByteSequence"), index-1));
                                        }
                                        trailCount = -1;
                                    }
                                }
                                byteSequence = 0;
                                break;
                        }

                        // A rejected sequence has trailCount == -1 here, so the decrement keeps it
                        // negative and nothing is counted; its remaining continuation bytes are
                        // skipped until a non-continuation byte resets the state above.
                        if (--trailCount == 0) {
                            charCount++;
                            if (isSurrogate) {
                                // A 4-byte sequence decodes to a surrogate pair: two chars.
                                charCount++;
                                isSurrogate = false;
                            }
                        }
                    }
                }
            }
            return charCount;
        }
Example #4
0
 /// <summary>
 /// Decodes UTF-8 bytes from <paramref name="bytes"/> into UTF-16 chars stored in
 /// <paramref name="chars"/>, optionally resuming a sequence that was left incomplete
 /// by an earlier call.
 /// </summary>
 /// <param name="bytes">Buffer holding the UTF-8 input.</param>
 /// <param name="byteIndex">Index of the first byte to decode.</param>
 /// <param name="byteCount">Number of bytes to decode.</param>
 /// <param name="chars">Buffer that receives the decoded chars.</param>
 /// <param name="charIndex">Index in <paramref name="chars"/> at which writing starts.</param>
 /// <param name="decoder">
 /// Optional carrier of partial-sequence state (bits, trailCount, isSurrogate, byteSequence).
 /// When non-null the state is loaded before decoding and stored back after the loop
 /// finishes; it is NOT stored back when this method throws.
 /// </param>
 /// <returns>The number of chars written to <paramref name="chars"/>.</returns>
 /// <exception cref="ArgumentNullException">
 /// <paramref name="bytes"/> or <paramref name="chars"/> is null.
 /// </exception>
 /// <exception cref="ArgumentOutOfRangeException">
 /// An index or count is negative, the byte range does not fit in <paramref name="bytes"/>,
 /// or <paramref name="charIndex"/> is greater than the length of <paramref name="chars"/>.
 /// </exception>
 /// <exception cref="ArgumentException">
 /// An invalid byte sequence is met while isThrowException (a member declared outside this
 /// method) is true, or <paramref name="chars"/> is too small for the output.
 /// </exception>
 /// <remarks>
 /// When isThrowException is false, invalid sequences produce no output and decoding
 /// continues with the following bytes. Keep the validation rules in step with the matching
 /// GetCharCount routine so that counting and converting agree on what is emitted.
 /// </remarks>
 internal virtual int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex, UTF8Decoder decoder) {
     if (bytes == null || chars == null) {
         throw new ArgumentNullException(bytes == null ? "bytes" : "chars",
             Environment.GetResourceString("ArgumentNull_Array"));
     }
     if (byteIndex < 0 || byteCount < 0) {
         throw new ArgumentOutOfRangeException((byteIndex<0 ? "byteIndex" : "byteCount"),
             Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
     }
     // Written as a subtraction so that byteIndex + byteCount cannot overflow.
     if ( bytes.Length - byteIndex < byteCount)
     {
         throw new ArgumentOutOfRangeException("bytes",
             Environment.GetResourceString("ArgumentOutOfRange_IndexCountBuffer"));
     }
     if (charIndex < 0 || charIndex > chars.Length) {
         throw new ArgumentOutOfRangeException("charIndex",
             Environment.GetResourceString("ArgumentOutOfRange_Index"));
     }
     // bits        : code point bits accumulated so far for the current sequence.
     // trailCount  : continuation bytes still expected; 0 means "expecting a lead/ASCII byte",
     //               a negative value means "current sequence was rejected, skip its tail".
     // isSurrogate : true while decoding a 4-byte sequence (output is a surrogate pair).
     int bits = 0;
     int trailCount = 0;
     bool isSurrogate = false;

     // Indicate the current chunk of bytes is a 3-byte or 4-byte UTF8 sequence.
     // This is used to detect non-shortest form.
     // It will be reset to 0 when the 2nd byte of the UTF8 sequence is read, so that
     // we don't check for non-shortest form again.
     int byteSequence = 0;
     if (decoder != null) {
         bits = decoder.bits;
         trailCount = decoder.trailCount;
         isSurrogate = decoder.isSurrogate;
         byteSequence = decoder.byteSequence;
     }
     int byteEnd = byteIndex + byteCount;
     int charStart = charIndex;
     try {
         while (byteIndex < byteEnd) {
             byte b = bytes[byteIndex++];
             if (trailCount == 0) {
                 //
                 // We are not at a trailing byte.
                 //
                 if ((b & 0x80) == 0) {
                     // This is the ASCII case.
                     //   1        7      0vvvvvvv
                     //
                     // Found an ASCII character.
                     //
                     chars[charIndex++] = (char)b;
                 } else {
                     // Check if this is a valid starting byte: count the leading 1 bits,
                     // which gives the total length of the sequence, and shift them out of temp.
                     byte temp = (byte)b;
                     while ((temp & 0x80) != 0) {
                         temp <<= 1;
                         trailCount++;
                     }
                     switch (trailCount) {
                         case 1:
                             // 10xxxxxx: a continuation byte where a lead byte was expected.
                             trailCount = 0;
                             break;
                         case 2:
                             // Make sure that bit 8 ~ bit 11 is not all zero.
                             // 110XXXXx 10xxxxxx
                             if ((b & 0x1e) == 0) {
                                 trailCount = 0;
                             }
                             break;
                         case 3:
                             byteSequence = 3;
                             break;
                         case 4:
                             //
                             // This is a surrogate unicode pair
                             //
                             byteSequence = 4;
                             break;
                         default:
                             // Five or more leading 1 bits: not a valid lead byte.
                             trailCount = 0;
                             break;
                     }
                     if (trailCount == 0) {
                         if (isThrowException) {
                             throw new ArgumentException(String.Format(Environment.GetResourceString("Argument_InvalidByteSequence"), byteIndex-1));
                         }
                     } else {
                         isSurrogate = (trailCount == 4);
                         // temp has had the length marker shifted out; shifting back
                         // leaves only the payload bits of the lead byte.
                         bits = temp >> trailCount;
                         trailCount--;
                     }
                 }
             } else {
                 // We are expecting to see bytes like 10vvvvvv
                 if ((b & 0xC0) != 0x80) {
                     // If not, this is NOT a valid sequence.
                     if (isThrowException) {
                         throw new ArgumentException(String.Format(Environment.GetResourceString("Argument_InvalidByteSequence"), byteIndex-1));
                     }
                     // At this point, we are seeing an invalid trailing byte.
                     // However, this can be a valid starting byte for another UTF8 byte sequence (e.g.
                     // this character could be under 0x7f, or a valid leading byte like 110xxxxx).
                     // So let's put the current byte back, and try to see if this is a valid byte
                     // for another UTF8 byte sequence.
                     byteIndex--;
                     bits = 0;
                     trailCount = 0;
                     // The abandoned lead byte may have set byteSequence to 3 or 4 and a
                     // following 2-byte lead does not overwrite it. Without this reset the
                     // stale value applies the 3-/4-byte second-byte checks to that 2-byte
                     // sequence and can reject a well-formed character
                     // (e.g. F0 41 C4 B0 lost U+0130).
                     byteSequence = 0;
                 } else {
                     switch (byteSequence) {
                         case 3:
                             // Check 3-byte sequence for non-shortest form.
                             // 1110XXXX 10Xxxxxx 10xxxxxx
                             if (bits == 0 && (b & 0x20) == 0) {
                                 if (isThrowException) {
                                     throw new ArgumentException(String.Format(Environment.GetResourceString("Argument_InvalidByteSequence"), byteIndex-1));
                                 }
                                 // Mark the sequence as rejected; see the decrement below.
                                 trailCount = -1;
                             }
                             // Reset byteSequence to zero since we are done with non-shortest form check.
                             byteSequence = 0;
                             break;
                         case 4:
                             // Check 4-byte sequence for non-shortest form.
                             // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
                             if (bits == 0) {
                                 if ((b & 0x30) == 0) {
                                     if (isThrowException) {
                                         throw new ArgumentException(String.Format(Environment.GetResourceString("Argument_InvalidByteSequence"), byteIndex-1));
                                     }
                                     trailCount = -1;
                                 }
                             } else if ((bits & 0x04) != 0) {
                                 // Make sure that the resulting Unicode is within the valid surrogate range.
                                 // The 4 byte code sequence can hold up to 21 bits, and the maximum valid code point range
                                 // that Unicode (with surrogate) could represent are from U+000000 ~ U+10FFFF.
                                 // Therefore, if the 21 bit (the most significant bit) is 1, we should verify that the 17 ~ 20
                                 // bit are all zero.
                                 // I.e., in 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx,
                                 // XXXXX can only be 10000.
                                 if ((bits & 0x03) != 0 || (b & 0x30) != 0) {
                                     if (isThrowException) {
                                         throw new ArgumentException(String.Format(Environment.GetResourceString("Argument_InvalidByteSequence"), byteIndex-1));
                                     }
                                     trailCount = -1;
                                 }
                             }
                             byteSequence = 0;
                             break;
                     }
                     // A rejected sequence has trailCount == -1 here, so the decrement keeps it
                     // negative: its remaining continuation bytes are consumed without output
                     // until a non-continuation byte resets the state above.
                     if (--trailCount >= 0) {
                         bits = bits << 6 | (b & 0x3F);
                         if (trailCount == 0) {
                             if (!isSurrogate) {
                                 chars[charIndex++] = (char)bits;
                             }
                             else {
                                 //
                                 // bits >= 0x10000, use surrogate.
                                 // 0xD7C0 equals 0xD800 - (0x10000 >> 10), so adding (bits >> 10)
                                 // subtracts the 0x10000 offset and adds the high-surrogate base at once.
                                 //
                                 chars[charIndex++] = (char)(0xD7C0 + (bits >> 10));
                                 chars[charIndex++] = (char)(CharacterInfo.LOW_SURROGATE_START + (bits & 0x3FF));
                             }
                         }
                     }
                 }
             }
         }
     } catch (IndexOutOfRangeException) {
         // Writing past the end of chars means the output buffer is too small.
         throw new ArgumentException(Environment.GetResourceString("Argument_ConversionOverflow"));
     }
     if (decoder != null) {
         decoder.bits = bits;
         decoder.trailCount = trailCount;
         decoder.isSurrogate = isSurrogate;
         decoder.byteSequence = byteSequence;
     }
     return charIndex - charStart;
 }
Example #5
0
 /// <summary>
 /// Creates the encoding and gives it a fresh UTF8Encoder and a fresh UTF8Decoder,
 /// stored in the encoder and decoder members respectively.
 /// </summary>
 public UTF8Encoding() {
     // Encoder first, then decoder: same construction order as before.
     encoder = new UTF8Encoder();
     decoder = new UTF8Decoder();
 }