// Decodes the three-octet sequence whose lead octet is src[it] and stores
// the resulting value in code_point.
//
// src        - text being decoded; indexed octet by octet.
// it         - index of the lead octet. Passed by value, so the caller's
//              index is NOT advanced by this method.
// end        - one-past-the-last index that may be read.
// code_point - receives the decoded value. It may hold a partial value when
//              a non-OK status is returned.
//
// Returns NOT_ENOUGH_ROOM when it == end, any non-OK status reported by
// increase_safely, and UTF8_OK otherwise.
static utf_error get_sequence_3(ref string src, int it, int end, ref int code_point) {
    if (it == end) {
        return(utf_error.NOT_ENOUGH_ROOM);
    }

    // Lead octet.
    code_point = mask8(src[it]);

    utf_error ret1 = increase_safely(ref src, it, end);
    if (ret1 != utf_error.UTF8_OK) {
        return(ret1);
    }
    // increase_safely receives `it` by value, so it cannot move our index.
    // Advance the local copy ourselves; otherwise the lead octet would be
    // read again below instead of the first trail octet.
    // NOTE(review): assumes increase_safely validated the octet at it + 1 - confirm.
    ++it;

    // Lead bits go to positions 12..15 (masked to 16 bits); the first trail
    // octet is shifted by 6 and masked to 12 bits.
    code_point = ((code_point << 12) & 0xffff) + ((mask8(src[it]) << 6) & 0xfff);

    utf_error ret2 = increase_safely(ref src, it, end);
    if (ret2 != utf_error.UTF8_OK) {
        return(ret2);
    }
    // Same reasoning as above: step the local index onto the second trail octet.
    ++it;

    // Low six bits come from the last trail octet.
    code_point += (src[it]) & 0x3f;

    return(utf_error.UTF8_OK);
}
// Reports whether every sequence in `source` is accepted by validate_next.
//
// The string is walked from index 0; validate_next is called repeatedly and
// the walk stops at the first non-OK status. The result is true when the
// scan position has reached (or passed) the end of the string.
public static bool IsStringValidUTF8(string source) {
    int limit = source.Length;
    int position = 0;

    while (position != limit) {
        // The decoded value is not needed here; only the status matters.
        int ignoredCodePoint = 0;
        utf_error status = validate_next(ref source, ref position, limit, ref ignoredCodePoint);
        if (status != utf_error.UTF8_OK) {
            break;
        }
    }

    return(position >= limit);
}
// Decodes the two-octet sequence whose lead octet is src[it] and stores the
// resulting value in code_point.
//
// src        - text being decoded; indexed octet by octet.
// it         - index of the lead octet. Passed by value, so the caller's
//              index is NOT advanced by this method.
// end        - one-past-the-last index that may be read.
// code_point - receives the decoded value. It may hold only the masked lead
//              octet when a non-OK status is returned.
//
// Returns NOT_ENOUGH_ROOM when it == end, any non-OK status reported by
// increase_safely, and UTF8_OK otherwise.
static utf_error get_sequence_2(ref string src, int it, int end, ref int code_point) {
    if (it == end) {
        return(utf_error.NOT_ENOUGH_ROOM);
    }

    // Lead octet.
    code_point = mask8(src[it]);

    utf_error ret = increase_safely(ref src, it, end);
    if (ret != utf_error.UTF8_OK) {
        return(ret);
    }
    // increase_safely receives `it` by value, so it cannot move our index.
    // Advance the local copy ourselves; otherwise the low six bits below
    // would be taken from the lead octet instead of the trail octet.
    // NOTE(review): assumes increase_safely validated the octet at it + 1 - confirm.
    ++it;

    // Lead bits shifted by 6 and masked to 11 bits, plus the low six bits of
    // the trail octet.
    code_point = ((code_point << 6) & 0x7ff) + ((src[it]) & 0x3f);

    return(utf_error.UTF8_OK);
}
// Decodes and validates the single sequence that starts at src[it].
//
// src        - text being decoded; indexed octet by octet.
// it         - index of the lead octet. On success it is moved past the
//              whole sequence; on any failure it is left at its original value.
// end        - one-past-the-last index that may be read.
// code_point - receives the decoded value on success; untouched on failure.
//
// Returns UTF8_OK on success. Otherwise: NOT_ENOUGH_ROOM when it == end or
// the sequence would run past end, INVALID_LEAD when sequence_length reports
// 0, the status of the get_sequence_N helper when decoding fails,
// INVALID_CODE_POINT when is_code_point_valid rejects the value, and
// OVERLONG_SEQUENCE when is_overlong_sequence flags it.
static utf_error validate_next(ref string src, ref int it, int end, ref int code_point) {
    if (it == end) {
        return(utf_error.NOT_ENOUGH_ROOM);
    }

    // Save the original value of it so we can go back in case of failure.
    int original_it = it;

    int cp = 0;

    // Determine the sequence length based on the lead octet.
    int length = sequence_length(ref src, it);

    // Get trail octets and calculate the code point.
    utf_error err = utf_error.UTF8_OK;
    switch (length) {
        case 0:
            return(utf_error.INVALID_LEAD);
        case 1:
            err = get_sequence_1(ref src, it, end, ref cp);
            break;
        case 2:
            err = get_sequence_2(ref src, it, end, ref cp);
            break;
        case 3:
            err = get_sequence_3(ref src, it, end, ref cp);
            break;
        case 4:
            err = get_sequence_4(ref src, it, end, ref cp);
            break;
    }

    // Defensive: never report success for a sequence that would extend past
    // end, so that advancing by `length` below cannot overshoot the buffer.
    if (err == utf_error.UTF8_OK && end - it < length) {
        err = utf_error.NOT_ENOUGH_ROOM;
    }

    if (err == utf_error.UTF8_OK) {
        // Decoding succeeded. Now, security checks...
        if (!is_code_point_valid((uint)cp)) {
            err = utf_error.INVALID_CODE_POINT;
        } else if (is_overlong_sequence((uint)cp, length)) {
            err = utf_error.OVERLONG_SEQUENCE;
        } else {
            // Passed! Return here.
            code_point = cp;
            // The get_sequence_N helpers take `it` by value and therefore do
            // not advance it, so step over the entire sequence here. A plain
            // ++it would leave the index on a trail octet after any
            // multi-octet sequence.
            it += length;
            return(utf_error.UTF8_OK);
        }
    }

    // Failure branch - restore the original value of the iterator.
    it = original_it;
    return(err);
}