// Parses the JSON number that starts at buf + offset and appends it to the tape of pj.
//
// Parameters:
//   buf         - input bytes. The scan stops at the first byte that cannot continue the
//                 number, so the number must be followed by a terminating byte.
//   pj          - receives the value through WriteTapeInt64 or WriteTapeDouble.
//   offset      - index in buf of the first byte of the number (the '-' when found_minus is true).
//   found_minus - true when the byte at offset is a minus sign.
//
// Returns true when the byte that follows the number is structural or whitespace.
// Returns false on malformed input, on an exponent outside [-308, 308], or when the
// value overflows a double. Note that the terminator check runs after the value has
// been written to the tape.
//
// The value is stored as a double when either the decimal exponent or the written
// exponent is non-zero, and as a signed 64-bit integer otherwise. Mantissas with 19
// or more digits are delegated to parse_float / parse_large_integer because the fast
// path accumulates the mantissa in an int64.
public static bool parse_number(uint8_t *buf, ParsedJson pj, uint32_t offset, bool found_minus)
{
    bytechar *p = (bytechar *)(buf + offset);
    bool negative = false;
    if (found_minus)
    {
        ++p;
        negative = true;
        if (!is_integer(*p))
        {
            // a negative sign must be followed by a digit
            return false;
        }
    }

    bytechar *startdigits = p;
    int64_t i;
    if (*p == '0')
    {
        // a leading 0 cannot be followed by another digit
        ++p;
        if (is_not_structural_or_whitespace_or_exponent_or_decimal((uint8_t)(*p)))
        {
#if JSON_TEST_NUMBERS // for unit testing
            foundInvalidNumber(buf + offset);
#endif
            return false;
        }
        i = 0;
    }
    else
    {
        if (!(is_integer(*p)))
        {
            // must start with a digit
#if JSON_TEST_NUMBERS // for unit testing
            foundInvalidNumber(buf + offset);
#endif
            return false;
        }
        unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
        i = digit;
        p++;
        // the is_made_of_eight_digits_fast routine is unlikely to help here because
        // we rarely see large integer parts like 123456789
        while (is_integer(*p))
        {
            digit = (unsigned_bytechar)(*p - '0');
            // may wrap for 19+ digits; the digit count check below reroutes those inputs
            i = 10 * i + digit;
            ++p;
        }
    }

    int64_t exponent = 0;
    bool has_decimal_point = false;
    if ('.' == *p)
    {
        has_decimal_point = true;
        ++p;
        bytechar *firstafterperiod = p;
        if (is_integer(*p))
        {
            unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
            ++p;
            i = i * 10 + digit;
        }
        else
        {
            // a decimal point must be followed by at least one digit
#if JSON_TEST_NUMBERS // for unit testing
            foundInvalidNumber(buf + offset);
#endif
            return false;
        }
#if SWAR_NUMBER_PARSING
        // this helps if we have lots of decimals!
        // this turns out to be frequent enough.
        if (is_made_of_eight_digits_fast(p))
        {
            i = i * 100000000 + parse_eight_digits_unrolled(p);
            p += 8;
        }
#endif
        while (is_integer(*p))
        {
            unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
            ++p;
            // in rare cases this wraps, but such inputs are handed to parse_float below
            i = i * 10 + digit;
        }
        // every fractional digit shifts the decimal exponent down by one
        exponent = firstafterperiod - p;
    }

    // Number of mantissa digits. The decimal point is only subtracted when there is one;
    // otherwise a 19-digit mantissa such as 9999999999999999999e1 would be counted as 18
    // digits and take the fast path with a wrapped int64.
    int digitcount = (int)(p - startdigits) - (has_decimal_point ? 1 : 0);

    int64_t expnumber = 0; // exponential part
    if (('e' == *p) || ('E' == *p))
    {
        ++p;
        bool negexp = false;
        if ('-' == *p)
        {
            negexp = true;
            ++p;
        }
        else if ('+' == *p)
        {
            ++p;
        }
        if (!is_integer(*p))
        {
#if JSON_TEST_NUMBERS // for unit testing
            foundInvalidNumber(buf + offset);
#endif
            return false;
        }
        unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
        expnumber = digit;
        p++;
        while (is_integer(*p))
        {
            digit = (unsigned_bytechar)(*p - '0');
            // Stop accumulating once the exponent is far outside the supported range so
            // that a very long digit run cannot wrap the int64 back into [-308, 308].
            // Leading zeros (e.g. 1e0000001) still parse as before.
            if (expnumber < 1000000)
            {
                expnumber = 10 * expnumber + digit;
            }
            ++p;
        }
        exponent += (negexp ? -expnumber : expnumber);
    }

    i = negative ? -i : i;
    if ((exponent != 0) || (expnumber != 0))
    {
        if (digitcount >= 19)
        {
            // this is uncommon!!!
            // the int64 mantissa may have wrapped: start anew, going slowly
            return parse_float(buf, pj, offset, found_minus);
        }
        ///////////
        // We want 0.1e1 to be a float.
        //////////
        if (i == 0)
        {
            // keep the sign so that -0.0 is stored as negative zero
            double zero = 0.0;
            if (negative)
            {
                zero = -zero;
            }
            pj.WriteTapeDouble(zero);
#if JSON_TEST_NUMBERS // for unit testing
            foundFloat(zero, buf + offset);
#endif
        }
        else
        {
            if ((exponent > 308) || (exponent < -308))
            {
                // we refuse to parse this
#if JSON_TEST_NUMBERS // for unit testing
                foundInvalidNumber(buf + offset);
#endif
                return false;
            }
            double d = i; // i already carries the sign
            d *= power_of_ten[308 + exponent];
            if (double.IsInfinity(d))
            {
                // e.g. 2e308: the value does not fit in a double
#if JSON_TEST_NUMBERS // for unit testing
                foundInvalidNumber(buf + offset);
#endif
                return false;
            }
            pj.WriteTapeDouble(d);
#if JSON_TEST_NUMBERS // for unit testing
            foundFloat(d, buf + offset);
#endif
        }
    }
    else
    {
        if (digitcount >= 19)
        {
            // this is uncommon!!!
            // 19 or more digits may not fit in an int64: use the overflow-checked parser
            return parse_large_integer(buf, pj, offset, found_minus);
        }
        pj.WriteTapeInt64(i);
#if JSON_TEST_NUMBERS // for unit testing
        foundInteger(i, buf + offset);
#endif
    }
    return is_structural_or_whitespace((uint8_t)(*p)) != 0;
}
// Slow path of parse_number, used when the value is a float whose mantissa has too
// many digits for the int64 fast path. The mantissa is accumulated directly in a
// double from the start.
//
// Do not call this function directly: it skips some of the checks performed by
// parse_number (for example that the first character is a digit).
//
// Parameters:
//   buf         - input bytes; the number must be followed by a terminating byte.
//   pj          - receives the value through WriteTapeDouble.
//   offset      - index in buf of the first byte of the number (the '-' when found_minus is true).
//   found_minus - true when the byte at offset is a minus sign.
//
// Returns true when a double was written to the tape. Returns false when:
//   - a decimal point or exponent marker is not followed by a digit,
//   - the exponent has more than four digits or lies outside [-308, 308],
//   - the number is not followed by a structural or whitespace byte,
//   - the result is not a finite double.
//
// Note: a redesign could avoid this function entirely.
private static bool parse_float(uint8_t *buf, ParsedJson pj, uint32_t offset, bool found_minus)
{
    bytechar *p = (bytechar *)(buf + offset);
    bool negative = false;
    if (found_minus)
    {
        ++p;
        negative = true;
    }

    double i;
    if (*p == '0')
    {
        // 0 cannot be followed by another digit (already verified by parse_number)
        ++p;
        i = 0;
    }
    else
    {
        unsigned_bytechar digit = (unsigned_bytechar)(*p - (bytechar)'0');
        i = digit;
        p++;
        while (is_integer(*p))
        {
            digit = (unsigned_bytechar)(*p - '0');
            i = 10 * i + digit;
            ++p;
        }
    }

    if ('.' == *p)
    {
        ++p;
        // weight of the next fractional digit: 0.1, 0.01, ...
        double fractionalweight = 1;
        if (is_integer(*p))
        {
            unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
            ++p;
            fractionalweight *= 0.1;
            i = i + digit * fractionalweight;
        }
        else
        {
            // a decimal point must be followed by at least one digit
#if JSON_TEST_NUMBERS // for unit testing
            foundInvalidNumber(buf + offset);
#endif
            return false;
        }
        while (is_integer(*p))
        {
            unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
            ++p;
            fractionalweight *= 0.1;
            i = i + digit * fractionalweight;
        }
    }

    if (('e' == *p) || ('E' == *p))
    {
        ++p;
        bool negexp = false;
        if ('-' == *p)
        {
            negexp = true;
            ++p;
        }
        else if ('+' == *p)
        {
            ++p;
        }
        if (!is_integer(*p))
        {
#if JSON_TEST_NUMBERS // for unit testing
            foundInvalidNumber(buf + offset);
#endif
            return false;
        }
        unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
        int64_t expnumber = digit; // exponential part
        p++;
        // at most three more digits are accepted (four in total)
        if (is_integer(*p))
        {
            digit = (unsigned_bytechar)(*p - '0');
            expnumber = 10 * expnumber + digit;
            ++p;
        }
        if (is_integer(*p))
        {
            digit = (unsigned_bytechar)(*p - '0');
            expnumber = 10 * expnumber + digit;
            ++p;
        }
        if (is_integer(*p))
        {
            digit = (unsigned_bytechar)(*p - '0');
            expnumber = 10 * expnumber + digit;
            ++p;
        }
        if (is_integer(*p))
        {
            // a fifth exponent digit: we refuse to parse this
#if JSON_TEST_NUMBERS // for unit testing
            foundInvalidNumber(buf + offset);
#endif
            return false;
        }
        int exponent = (int)(negexp ? -expnumber : expnumber);
        if ((exponent > 308) || (exponent < -308))
        {
            // we refuse to parse this
#if JSON_TEST_NUMBERS // for unit testing
            foundInvalidNumber(buf + offset);
#endif
            return false;
        }
        i *= power_of_ten[308 + exponent];
    }

    if (is_not_structural_or_whitespace((byte)*p) != 0)
    {
        return false;
    }

    double d = negative ? -i : i;
    if (double.IsInfinity(d) || double.IsNaN(d))
    {
        // The mantissa or the scaled value overflowed the double range. JSON has no
        // representation for Infinity/NaN, so refuse the number instead of storing it.
#if JSON_TEST_NUMBERS // for unit testing
        foundInvalidNumber(buf + offset);
#endif
        return false;
    }
    pj.WriteTapeDouble(d);
#if JSON_TEST_NUMBERS // for unit testing
    foundFloat(d, buf + offset);
#endif
    return is_structural_or_whitespace((byte)(*p)) != 0;
}
// Overflow-checked integer path of parse_number, used when the digit run is long
// enough that the value might not fit in a signed 64-bit integer.
//
// Do not call this function directly: it skips some of the checks performed by
// parse_number (the first character is consumed as a digit without being tested).
//
// The magnitude is accumulated in an unsigned 64-bit integer with mul_overflow /
// add_overflow, then range-checked against the signed limits. On success the signed
// value is written through pj.WriteTapeInt64.
//
// Returns false on overflow. Otherwise returns whether the byte after the digits is
// structural or whitespace (the value has been written to the tape by then).
//
// This function will almost never be called!!!
static bool parse_large_integer(uint8_t *buf, ParsedJson pj, uint32_t offset, bool found_minus)
{
    bytechar *p = (bytechar *)(buf + offset);
    bool negative = found_minus;
    if (negative)
    {
        ++p; // step over the '-'
    }

    uint64_t magnitude = 0;
    if (*p == '0')
    {
        // 0 cannot be followed by another digit
        ++p;
    }
    else
    {
        magnitude = (unsigned_bytechar)(*p - '0');
        p++;
        // the is_made_of_eight_digits_fast routine is unlikely to help here because
        // we rarely see large integer parts like 123456789
        for (; is_integer(*p); ++p)
        {
            unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
            // multiply first; the addition is only attempted when the product fit
            if (mul_overflow(magnitude, 10, &magnitude) || add_overflow(magnitude, digit, &magnitude))
            {
#if JSON_TEST_NUMBERS // for unit testing
                foundInvalidNumber(buf + offset);
#endif
                return false; // overflow
            }
        }
    }

    // largest magnitude that still fits: 2^63 for negative values, 2^63 - 1 otherwise
    uint64_t limit = negative ? 0x8000000000000000UL : 0x7FFFFFFFFFFFFFFFUL;
    if (magnitude > limit)
    {
#if JSON_TEST_NUMBERS // for unit testing
        foundInvalidNumber(buf + offset);
#endif
        return false; // overflow
    }

    int64_t signed_answer;
    if (negative)
    {
        signed_answer = -(int64_t)magnitude;
    }
    else
    {
        signed_answer = (int64_t)magnitude;
    }
    pj.WriteTapeInt64(signed_answer);
#if JSON_TEST_NUMBERS // for unit testing
    foundInteger(signed_answer, buf + offset);
#endif
    return is_structural_or_whitespace((byte)(*p)) != 0;
}
#if SIMDJSON_ALLOWANYTHINGINROOT
// Validates a document whose root is a bare scalar (true, false, null or a number)
// and writes it to the tape of pj.
//
// The atom and number routines read past the end of the value, so they are run on a
// NUL-terminated copy of the input. The copy lives on the unmanaged heap rather than
// the stack because its size is the size of the whole document, and it is always
// released before returning.
//
//   c   - the structural character at buf[idx]: 't', 'f', 'n', '-' or a digit.
//   idx - index of that character in buf.
//
// Returns true when the scalar is valid and has been written to the tape.
private static bool parse_root_scalar(uint8_t *buf, size_t len, ParsedJson *pj, uint32_t idx, uint8_t c)
{
    System.IntPtr block = System.Runtime.InteropServices.Marshal.AllocHGlobal(
        new System.IntPtr((long)(len + SIMDJSON_PADDING)));
    try
    {
        bytechar *copy = (bytechar *)block.ToPointer();
        memcpy(copy, buf, len);
        copy[len] = (bytechar)'\0';
        switch (c)
        {
        case (uint8_t)'t':
            if (!is_valid_true_atom((uint8_t *)copy + idx))
            {
                return false;
            }
            break;
        case (uint8_t)'f':
            if (!is_valid_false_atom((uint8_t *)copy + idx))
            {
                return false;
            }
            break;
        case (uint8_t)'n':
            if (!is_valid_null_atom((uint8_t *)copy + idx))
            {
                return false;
            }
            break;
        case (uint8_t)'-':
            // parse_number writes the value to the tape itself
            return parse_number((uint8_t *)copy, pj, idx, true);
        default:
            // a digit
            return parse_number((uint8_t *)copy, pj, idx, false);
        }
        // atoms are recorded on the tape by their first character
        pj->write_tape(0, c);
        return true;
    }
    finally
    {
        System.Runtime.InteropServices.Marshal.FreeHGlobal(block);
    }
}
#endif // SIMDJSON_ALLOWANYTHINGINROOT

// Walks the structural characters listed in pj->structural_indexes and builds the
// tape in pj. The parser is a goto-driven state machine: object and array states are
// labels, and pj->ret_address[depth] records which state to resume ('o' object,
// 'a' array, 's' start) when a scope closes.
//
// Parameters:
//   buf - the input document.
//   len - number of bytes in buf; must not exceed pj->bytecapacity.
//   pj  - parse state; reset with init() on entry. structural_indexes and
//         n_structural_indexes are read from it, so they must already be filled in.
//
// Returns true and sets pj->isvalid when the document was parsed completely.
// Returns false when the capacity is insufficient, when the nesting exceeds
// pj->depthcapacity, or when the input is malformed.
public static bool unified_machine(uint8_t *buf, size_t len, ParsedJson *pj)
{
    uint32_t i = 0;     // index of the structural character (0,1,2,3...)
    uint32_t idx;       // location of the structural character in the input (buf)
    uint8_t c;          // the (structural) character we are looking at
    uint32_t depth = 0; // could have an arbitrary starting depth
    pj->init();
    if (pj->bytecapacity < len)
    {
        Debug.Write("insufficient capacity\n");
        return false;
    }

    // The original C++ used an UPDATE_CHAR macro to read the next structural
    // character; here it is expanded by hand wherever it is needed:
    //     idx = pj->structural_indexes[i++];
    //     c = buf[idx];

    pj->ret_address[depth] = (bytechar)'s';
    pj->containing_scope_offset[depth] = pj->get_current_loc();
    pj->write_tape(0, (byte)'r'); // r for root, 0 is going to get overwritten
    // the root is used, if nothing else, to capture the size of the tape
    depth++; // everything starts at depth = 1, depth = 0 is just for the root;
             // the root may contain an object, an array or something else.
    // NOTE(review): depth == pj->depthcapacity passes this test and is then used to
    // index ret_address / containing_scope_offset - confirm those arrays hold
    // depthcapacity + 1 entries.
    if (depth > pj->depthcapacity)
    {
        goto fail;
    }

    // UPDATE_CHAR
    idx = pj->structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
    case (uint8_t)'{':
        pj->containing_scope_offset[depth] = pj->get_current_loc();
        pj->ret_address[depth] = (bytechar)'s';
        depth++;
        if (depth > pj->depthcapacity)
        {
            goto fail;
        }
        pj->write_tape(0, c); // strangely, moving this to object_begin slows things down
        goto object_begin;
    case (uint8_t)'[':
        pj->containing_scope_offset[depth] = pj->get_current_loc();
        pj->ret_address[depth] = (bytechar)'s';
        depth++;
        if (depth > pj->depthcapacity)
        {
            goto fail;
        }
        pj->write_tape(0, c);
        goto array_begin;
    // A JSON text is a serialized value. Note that certain previous
    // specifications of JSON constrained a JSON text to be an object or an
    // array. Implementations that generate only objects or arrays where a
    // JSON text is called for will be interoperable in the sense that all
    // implementations will accept these as conforming JSON texts.
    // https://tools.ietf.org/html/rfc8259
#if SIMDJSON_ALLOWANYTHINGINROOT
    case (uint8_t)'"':
    {
        if (!parse_string(buf, len, pj, depth, idx))
        {
            goto fail;
        }
        break;
    }
    case (uint8_t)'t':
    case (uint8_t)'f':
    case (uint8_t)'n':
    case (uint8_t)'0':
    case (uint8_t)'1':
    case (uint8_t)'2':
    case (uint8_t)'3':
    case (uint8_t)'4':
    case (uint8_t)'5':
    case (uint8_t)'6':
    case (uint8_t)'7':
    case (uint8_t)'8':
    case (uint8_t)'9':
    case (uint8_t)'-':
    {
        // The input might not be NUL terminated, so a bare scalar is validated on a
        // terminated copy. This only applies to documents made solely of one scalar
        // and will almost never be called in practice.
        if (!parse_root_scalar(buf, len, pj, idx, c))
        {
            goto fail;
        }
        break;
    }
#endif // ALLOWANYTHINGINROOT
    default:
        goto fail;
    }

start_continue:
    // the root value is complete: exactly one structural index must remain
    if (i + 1 == pj->n_structural_indexes)
    {
        goto succeed;
    }
    else
    {
        goto fail;
    }

    ////////////////////////////// OBJECT STATES /////////////////////////////

object_begin:
    // UPDATE_CHAR
    idx = pj->structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
    case (uint8_t)'"':
    {
        if (!parse_string(buf, len, pj, depth, idx))
        {
            goto fail;
        }
        goto object_key_state;
    }
    case (uint8_t)'}':
        goto scope_end; // could also go to object_continue
    default:
        goto fail;
    }

object_key_state:
    // a key has just been parsed: a colon must follow
    // UPDATE_CHAR
    idx = pj->structural_indexes[i++];
    c = buf[idx];
    if (c != ':')
    {
        goto fail;
    }
    // UPDATE_CHAR
    idx = pj->structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
    case (uint8_t)'"':
    {
        if (!parse_string(buf, len, pj, depth, idx))
        {
            goto fail;
        }
        break;
    }
    case (uint8_t)'t':
        if (!is_valid_true_atom(buf + idx))
        {
            goto fail;
        }
        pj->write_tape(0, c);
        break;
    case (uint8_t)'f':
        if (!is_valid_false_atom(buf + idx))
        {
            goto fail;
        }
        pj->write_tape(0, c);
        break;
    case (uint8_t)'n':
        if (!is_valid_null_atom(buf + idx))
        {
            goto fail;
        }
        pj->write_tape(0, c);
        break;
    case (uint8_t)'0':
    case (uint8_t)'1':
    case (uint8_t)'2':
    case (uint8_t)'3':
    case (uint8_t)'4':
    case (uint8_t)'5':
    case (uint8_t)'6':
    case (uint8_t)'7':
    case (uint8_t)'8':
    case (uint8_t)'9':
    {
        if (!parse_number(buf, pj, idx, false))
        {
            goto fail;
        }
        break;
    }
    case (uint8_t)'-':
    {
        if (!parse_number(buf, pj, idx, true))
        {
            goto fail;
        }
        break;
    }
    case (uint8_t)'{':
    {
        pj->containing_scope_offset[depth] = pj->get_current_loc();
        pj->write_tape(0, c); // here the compiler knows what c is so this gets optimized
        // we have not yet encountered } so we need to come back for it
        pj->ret_address[depth] = (bytechar)'o';
        // we found an object inside an object, so we need to increment the depth
        depth++;
        if (depth > pj->depthcapacity)
        {
            goto fail;
        }
        goto object_begin;
    }
    case (uint8_t)'[':
    {
        pj->containing_scope_offset[depth] = pj->get_current_loc();
        pj->write_tape(0, c); // here the compiler knows what c is so this gets optimized
        // we have not yet encountered } so we need to come back for it
        pj->ret_address[depth] = (bytechar)'o';
        // we found an array inside an object, so we need to increment the depth
        depth++;
        if (depth > pj->depthcapacity)
        {
            goto fail;
        }
        goto array_begin;
    }
    default:
        goto fail;
    }

object_continue:
    // a value has just been parsed: expect ',' followed by a key, or '}'
    // UPDATE_CHAR
    idx = pj->structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
    case (uint8_t)',':
        // UPDATE_CHAR
        idx = pj->structural_indexes[i++];
        c = buf[idx];
        if (c != (uint8_t)'"')
        {
            goto fail;
        }
        else
        {
            if (!parse_string(buf, len, pj, depth, idx))
            {
                goto fail;
            }
            goto object_key_state;
        }
    case (uint8_t)'}':
        goto scope_end;
    default:
        goto fail;
    }

    ////////////////////////////// COMMON STATE /////////////////////////////

scope_end:
    // write our tape location to the header scope
    depth--;
    pj->write_tape(pj->containing_scope_offset[depth], c);
    pj->annotate_previousloc(pj->containing_scope_offset[depth], pj->get_current_loc());
    // resume the state that opened this scope
    if (pj->ret_address[depth] == (uint8_t)'a')
    {
        goto array_continue;
    }
    else if (pj->ret_address[depth] == (uint8_t)'o')
    {
        goto object_continue;
    }
    else
    {
        goto start_continue;
    }

    ////////////////////////////// ARRAY STATES /////////////////////////////

array_begin:
    // UPDATE_CHAR
    idx = pj->structural_indexes[i++];
    c = buf[idx];
    if (c == ']')
    {
        goto scope_end; // could also go to array_continue
    }

main_array_switch:
    // we call update char on all paths in, so we can peek at c on the
    // paths that can accept a close square brace (post-, and at start)
    switch (c)
    {
    case (uint8_t)'"':
    {
        if (!parse_string(buf, len, pj, depth, idx))
        {
            goto fail;
        }
        break;
    }
    case (uint8_t)'t':
        if (!is_valid_true_atom(buf + idx))
        {
            goto fail;
        }
        pj->write_tape(0, c);
        break;
    case (uint8_t)'f':
        if (!is_valid_false_atom(buf + idx))
        {
            goto fail;
        }
        pj->write_tape(0, c);
        break;
    case (uint8_t)'n':
        if (!is_valid_null_atom(buf + idx))
        {
            goto fail;
        }
        pj->write_tape(0, c);
        break; // falls out of the switch into array_continue
    case (uint8_t)'0':
    case (uint8_t)'1':
    case (uint8_t)'2':
    case (uint8_t)'3':
    case (uint8_t)'4':
    case (uint8_t)'5':
    case (uint8_t)'6':
    case (uint8_t)'7':
    case (uint8_t)'8':
    case (uint8_t)'9':
    {
        if (!parse_number(buf, pj, idx, false))
        {
            goto fail;
        }
        break; // falls out of the switch into array_continue
    }
    case (uint8_t)'-':
    {
        if (!parse_number(buf, pj, idx, true))
        {
            goto fail;
        }
        break; // falls out of the switch into array_continue
    }
    case (uint8_t)'{':
    {
        // we have not yet encountered ] so we need to come back for it
        pj->containing_scope_offset[depth] = pj->get_current_loc();
        pj->write_tape(0, c); // here the compiler knows what c is so this gets optimized
        pj->ret_address[depth] = (bytechar)'a';
        // we found an object inside an array, so we need to increment the depth
        depth++;
        if (depth > pj->depthcapacity)
        {
            goto fail;
        }
        goto object_begin;
    }
    case (uint8_t)'[':
    {
        // we have not yet encountered ] so we need to come back for it
        pj->containing_scope_offset[depth] = pj->get_current_loc();
        pj->write_tape(0, c); // here the compiler knows what c is so this gets optimized
        pj->ret_address[depth] = (bytechar)'a';
        // we found an array inside an array, so we need to increment the depth
        depth++;
        if (depth > pj->depthcapacity)
        {
            goto fail;
        }
        goto array_begin;
    }
    default:
        goto fail;
    }

array_continue:
    // an element has just been parsed: expect ',' followed by a value, or ']'
    // UPDATE_CHAR
    idx = pj->structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
    case (uint8_t)',':
        // UPDATE_CHAR
        idx = pj->structural_indexes[i++];
        c = buf[idx];
        goto main_array_switch;
    case (uint8_t)']':
        goto scope_end;
    default:
        goto fail;
    }

    ////////////////////////////// FINAL STATES /////////////////////////////

succeed:
    depth--;
    if (depth != 0)
    {
        throw new InvalidOperationException("internal bug");
    }
    if (pj->containing_scope_offset[depth] != 0)
    {
        throw new InvalidOperationException("internal bug");
    }
    // close the root scope: link the root header to the end of the tape
    pj->annotate_previousloc(pj->containing_scope_offset[depth], pj->get_current_loc());
    pj->write_tape(pj->containing_scope_offset[depth], (byte)'r'); // r is root
    pj->isvalid = true;
    return true;

fail:
    return false;
}