/// <summary>
/// Decodes the JSON string whose opening quote is at <c>buf[offset]</c> into the string
/// buffer of <paramref name="pj"/>, scanning the input 16 bytes at a time with SSE2
/// compares, and records a <c>'"'</c> tape entry for it once the closing quote is found.
/// </summary>
/// <param name="buf">Input bytes; <c>buf[offset]</c> is expected to be the opening quote.</param>
/// <param name="len">Not read by this method.</param>
/// <param name="pj">Receives the tape entry and the decoded bytes (via <c>current_string_buf_loc</c>).</param>
/// <param name="depth">Not read by this method.</param>
/// <param name="offset">Index in <paramref name="buf"/> of the opening quote.</param>
/// <returns>
/// <c>true</c> when the closing quote was reached; <c>false</c> on an unknown escape, on a
/// <c>\u</c> sequence rejected by <c>handle_unicode_codepoint</c>, or (only when
/// <c>CHECKUNESCAPED</c> is defined) on a byte &lt;= 0x1F inside the string.
/// </returns>
/// <remarks>
/// Every iteration loads and copies a full 16-byte vector regardless of where the string
/// ends. NOTE(review): this presumably relies on readable padding after the document and on
/// slack in the destination buffer - confirm against the allocation code.
/// </remarks>
public static bool parse_string_sse41(uint8_t *buf, size_t len, ParsedJson pj, uint32_t depth, uint32_t offset)
{
#if SIMDJSON_SKIPSTRINGPARSING // for performance analysis, it is sometimes useful to skip parsing
    pj.WriteTape(0, (uint8_t)'"'); // don't bother with the string parsing at all
    return(true);                  // always succeeds
#else
    uint8_t *src = &buf[offset + 1]; // we know that buf at offset is a "
    uint8_t *dst = pj.current_string_buf_loc;
#if JSON_TEST_STRINGS // for unit testing
    uint8_t *start_of_string = dst;
#endif
    Vector128<byte> slashVec = Vector128.Create((byte)'\\');
    Vector128<byte> quoteVec = Vector128.Create((byte)'"');
    Vector128<byte> unitsep = Vector128.Create((byte)0x1F);

    while (true)
    {
        Vector128<byte> v = Sse2.LoadVector128(src);
        // One bit per input byte: set where the byte is a backslash / a quote.
        uint32_t bs_bits = (uint32_t)Sse2.MoveMask(Sse2.CompareEqual(v, slashVec));
        uint32_t quote_bits = (uint32_t)Sse2.MoveMask(Sse2.CompareEqual(v, quoteVec));
        // All Unicode characters may be placed within the
        // quotation marks, except for the characters that MUST be escaped:
        // quotation mark, reverse solidus, and the control characters (U+0000
        // through U+001F).
        // https://tools.ietf.org/html/rfc8259
#if CHECKUNESCAPED
        // max(0x1F, v) == 0x1F exactly for the bytes that are <= 0x1F.
        Vector128<byte> unescaped_vec = Sse2.CompareEqual(Sse2.Max(unitsep, v), unitsep); // could do it with saturated subtraction
#endif // CHECKUNESCAPED
        uint32_t quote_dist = (uint32_t)trailingzeroes(quote_bits);
        uint32_t bs_dist = (uint32_t)trailingzeroes(bs_bits);
        // store to dest unconditionally - we can overwrite the bits we don't like
        // later
        memcpy(dst, src, (size_t)Vector128<byte>.Count);

        if (quote_dist < bs_dist)
        {
            // we encountered quotes first. Move dst to point to quotes and exit
            dst[quote_dist] = 0; // null terminate and get out
            // The tape payload is the offset of this string inside the string buffer.
            pj.WriteTape((size_t)pj.current_string_buf_loc - (size_t)pj.string_buf, (uint8_t)'"');
            pj.current_string_buf_loc = dst + quote_dist + 1; // the +1 is due to the 0 value
#if CHECKUNESCAPED
            // check that there is no unescaped char before the quote:
            // (quote_bits - 1) & ~quote_bits selects the bits below the lowest quote bit.
            uint32_t unescaped_bits = (uint32_t)Sse2.MoveMask(unescaped_vec);
            bool is_ok = ((quote_bits - 1) & (~quote_bits) & unescaped_bits) == 0;
#if JSON_TEST_STRINGS // for unit testing
            if (is_ok)
            {
                foundString(buf + offset, start_of_string, pj.current_string_buf_loc - 1);
            }
            else
            {
                foundBadString(buf + offset);
            }
#endif // JSON_TEST_STRINGS
            return(is_ok);
#else //CHECKUNESCAPED
#if JSON_TEST_STRINGS // for unit testing
            foundString(buf + offset, start_of_string, pj.current_string_buf_loc - 1);
#endif // JSON_TEST_STRINGS
            return(true);
#endif //CHECKUNESCAPED
        }
        else if (quote_dist > bs_dist)
        {
            uint8_t escape_char = src[bs_dist + 1];
#if CHECKUNESCAPED
            // we are going to need the unescaped_bits to check for unescaped chars
            // that appear before the backslash
            uint32_t unescaped_bits = (uint32_t)Sse2.MoveMask(unescaped_vec);
            if (((bs_bits - 1) & (~bs_bits) & unescaped_bits) != 0)
            {
#if JSON_TEST_STRINGS // for unit testing
                foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
                return(false);
            }
#endif //CHECKUNESCAPED
            // we encountered backslash first. Handle backslash
            if (escape_char == 'u')
            {
                // move src/dst up to the start; they will be further adjusted
                // within the unicode codepoint handling code.
                src += bs_dist;
                dst += bs_dist;
                if (!handle_unicode_codepoint(&src, &dst))
                {
#if JSON_TEST_STRINGS // for unit testing
                    foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
                    return(false);
                }
            }
            else
            {
                // simple 1:1 conversion. Will eat bs_dist+2 characters in input and
                // write bs_dist+1 characters to output
                // note this may reach beyond the part of the buffer we've actually
                // seen. I think this is ok
                uint8_t escape_result = escape(escape_char);
                if (escape_result == 0)
                {
#if JSON_TEST_STRINGS // for unit testing
                    foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
                    return(false); // bogus escape value is an error
                }
                dst[bs_dist] = escape_result;
                src += bs_dist + 2;
                dst += bs_dist + 1;
            }
        }
        else
        {
            // they are the same. Since they can't co-occur, it means we encountered
            // neither.
            src += Vector128<byte>.Count;
            dst += Vector128<byte>.Count;
#if CHECKUNESCAPED
            // check for unescaped chars anywhere in the 16 bytes just consumed
            if (Sse2.MoveMask(unescaped_vec) != 0)
            {
#if JSON_TEST_STRINGS // for unit testing
                foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
                return(false);
            }
#endif // CHECKUNESCAPED
        }
    }
    // can't be reached
    return(true);
#endif // SIMDJSON_SKIPSTRINGPARSING
}
/// <summary>
/// Goto-driven state machine that walks the structural characters listed in
/// <c>pj.structural_indexes</c>, validates the nesting of objects and arrays, and writes
/// one tape entry per value and per scope boundary.
/// </summary>
/// <param name="buf">Input bytes of the JSON document.</param>
/// <param name="len">Number of meaningful bytes in <paramref name="buf"/>.</param>
/// <param name="pj">Parser state: structural indexes in, tape and error code out.</param>
/// <returns>
/// The value stored in <c>pj.ErrorCode</c>: <c>SUCCESS</c>, <c>CAPACITY</c>,
/// <c>DEPTH_ERROR</c>, or an error chosen from the last structural character examined.
/// </returns>
internal static JsonParseError unified_machine(uint8_t *buf, size_t len, ParsedJson pj)
{
#if !ALLOW_SAME_PAGE_BUFFER_OVERRUN
    memset((uint8_t *)buf + len, 0, SIMDJSON_PADDING); // to please valgrind
#endif
    uint32_t i = 0;     // position of the next entry to read from pj.structural_indexes
    uint32_t idx;       // offset in buf of the structural character being examined
    uint8_t c = 0;      // the structural character itself (buf[idx])
    uint32_t depth = 0; // current nesting level; level 0 is reserved for the root

    pj.Init(); // sets isvalid to false
    if (pj.bytecapacity < len)
    {
        pj.ErrorCode = JsonParseError.CAPACITY;
        return pj.ErrorCode;
    }

    // ---------------------------- start state ----------------------------
    pj.ret_address[depth] = (bytechar)'s';
    pj.containing_scope_offset[depth] = pj.CurrentLoc;
    // 'r' marks the root; its 0 payload is overwritten at `succeed`.
    pj.WriteTape(0, (uint8_t)'r');
    depth++; // real content starts at depth 1
    if (depth >= pj.depthcapacity)
    {
        goto fail;
    }

    idx = pj.structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
        case (uint8_t)'{':
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.ret_address[depth] = (bytechar)'s';
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            pj.WriteTape(0, c); // strangely, moving this to object_begin slows things down
            goto object_begin;

        case (uint8_t)'[':
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.ret_address[depth] = (bytechar)'s';
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            goto array_begin;

        // RFC 8259 allows any serialized value as a JSON text, not only objects
        // and arrays; the cases below accept a bare scalar at the root.
        // https://tools.ietf.org/html/rfc8259
#if SIMDJSON_ALLOWANYTHINGINROOT
        case (uint8_t)'"':
        {
            if (!parse_string(buf, len, pj, depth, idx))
            {
                goto fail;
            }
            break;
        }

        case (uint8_t)'t':
        {
            // Document made solely of `true`: validate a private copy so that the
            // atom is guaranteed to be followed by a space. Rarely hit in practice.
            bytechar *padded = (bytechar *)(allocate<bytechar>(len + SIMDJSON_PADDING));
            if (padded == null)
            {
                goto fail;
            }
            memcpy(padded, buf, len);
            padded[len] = (bytechar)' ';
            bool atomOk = is_valid_true_atom((uint8_t *)(padded) + idx);
            free(padded);
            if (!atomOk)
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;
        }

        case (uint8_t)'f':
        {
            // Document made solely of `false`: same space-terminated copy as above.
            bytechar *padded = (bytechar *)(allocate<bytechar>(len + SIMDJSON_PADDING));
            if (padded == null)
            {
                goto fail;
            }
            memcpy(padded, buf, len);
            padded[len] = (bytechar)' ';
            bool atomOk = is_valid_false_atom((uint8_t *)(padded) + idx);
            free(padded);
            if (!atomOk)
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;
        }

        case (uint8_t)'n':
        {
            // Document made solely of `null`: same space-terminated copy as above.
            bytechar *padded = (bytechar *)(allocate<bytechar>(len + SIMDJSON_PADDING));
            if (padded == null)
            {
                goto fail;
            }
            memcpy(padded, buf, len);
            padded[len] = (bytechar)' ';
            bool atomOk = is_valid_null_atom((uint8_t *)(padded) + idx);
            free(padded);
            if (!atomOk)
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;
        }

        case (uint8_t)'0':
        case (uint8_t)'1':
        case (uint8_t)'2':
        case (uint8_t)'3':
        case (uint8_t)'4':
        case (uint8_t)'5':
        case (uint8_t)'6':
        case (uint8_t)'7':
        case (uint8_t)'8':
        case (uint8_t)'9':
        {
            // Document made of a sole number. The copy is terminated with a space
            // rather than a NUL because NULs must not be accepted in the middle of a
            // number (a space there would already have been caught in stage 1).
            bytechar *padded = (bytechar *)(allocate<bytechar>(len + SIMDJSON_PADDING));
            if (padded == null)
            {
                goto fail;
            }
            memcpy(padded, buf, len);
            padded[len] = (bytechar)' ';
            bool numberOk = parse_number((uint8_t *)(padded), pj, idx, false);
            free(padded);
            if (!numberOk)
            {
                goto fail;
            }
            break;
        }

        case (uint8_t)'-':
        {
            // Document made of a sole negative number.
            // NOTE(review): this copy is NUL-terminated whereas the digit case above
            // uses a space - confirm whether the difference is intentional.
            bytechar *padded = (bytechar *)(allocate<bytechar>(len + SIMDJSON_PADDING));
            if (padded == null)
            {
                goto fail;
            }
            memcpy(padded, buf, len);
            padded[len] = (bytechar)'\0';
            bool numberOk = parse_number((uint8_t *)(padded), pj, idx, true);
            free(padded);
            if (!numberOk)
            {
                goto fail;
            }
            break;
        }
#endif // ALLOWANYTHINGINROOT

        default:
            goto fail;
    }

start_continue:
    // the string might not be NULL terminated.
    // Success requires that exactly one structural index remains unread.
    if (i + 1 != pj.n_structural_indexes)
    {
        goto fail;
    }
    goto succeed;

    // ---------------------------- object states ----------------------------
object_begin:
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
        case (uint8_t)'"':
        {
            if (!parse_string(buf, len, pj, depth, idx))
            {
                goto fail;
            }
            goto object_key_state;
        }

        case (uint8_t)'}':
            goto scope_end; // could also go to object_continue

        default:
            goto fail;
    }

object_key_state:
    // A key has just been parsed: expect ':' and then a value.
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    if (c != ':')
    {
        goto fail;
    }
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
        case (uint8_t)'"':
        {
            if (!parse_string(buf, len, pj, depth, idx))
            {
                goto fail;
            }
            break;
        }

        case (uint8_t)'t':
            if (!is_valid_true_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;

        case (uint8_t)'f':
            if (!is_valid_false_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;

        case (uint8_t)'n':
            if (!is_valid_null_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;

        case (uint8_t)'0':
        case (uint8_t)'1':
        case (uint8_t)'2':
        case (uint8_t)'3':
        case (uint8_t)'4':
        case (uint8_t)'5':
        case (uint8_t)'6':
        case (uint8_t)'7':
        case (uint8_t)'8':
        case (uint8_t)'9':
        {
            if (!parse_number(buf, pj, idx, false))
            {
                goto fail;
            }
            break;
        }

        case (uint8_t)'-':
        {
            if (!parse_number(buf, pj, idx, true))
            {
                goto fail;
            }
            break;
        }

        case (uint8_t)'{':
        {
            // Nested object inside an object: remember where the scope opens and
            // that we must resume in object_continue ('o') once it closes.
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.WriteTape(0, c);
            pj.ret_address[depth] = (bytechar)'o';
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            goto object_begin;
        }

        case (uint8_t)'[':
        {
            // Nested array inside an object: resume in object_continue ('o').
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.WriteTape(0, c);
            pj.ret_address[depth] = (bytechar)'o';
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            goto array_begin;
        }

        default:
            goto fail;
    }

object_continue:
    // A member value is complete: expect ',' + next key, or the closing '}'.
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
        case (uint8_t)',':
            idx = pj.structural_indexes[i++];
            c = buf[idx];
            if (c != '"' || !parse_string(buf, len, pj, depth, idx))
            {
                goto fail;
            }
            goto object_key_state;

        case (uint8_t)'}':
            goto scope_end;

        default:
            goto fail;
    }

    // ---------------------------- common state ----------------------------
scope_end:
    // Close the current scope: the closing entry carries the tape location of the
    // opening entry, and the opening entry is annotated with the current location.
    depth--;
    pj.WriteTape(pj.containing_scope_offset[depth], c);
    pj.AnnotatePreviousLoc(pj.containing_scope_offset[depth], pj.CurrentLoc);
    var resumeState = pj.ret_address[depth];
    if (resumeState == 'o')
    {
        goto object_continue;
    }
    if (resumeState == 'a')
    {
        goto array_continue;
    }
    goto start_continue;

    // ---------------------------- array states ----------------------------
array_begin:
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    if (c == (uint8_t)']')
    {
        goto scope_end; // could also go to array_continue
    }

main_array_switch:
    // Every path into this label has already refreshed c, so the close bracket
    // can be peeked at on the paths that accept it (after ',' and at the start).
    switch (c)
    {
        case (uint8_t)'"':
        {
            if (!parse_string(buf, len, pj, depth, idx))
            {
                goto fail;
            }
            break;
        }

        case (uint8_t)'t':
            if (!is_valid_true_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;

        case (uint8_t)'f':
            if (!is_valid_false_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;

        case (uint8_t)'n':
            if (!is_valid_null_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break; // falls out of the switch into array_continue

        case (uint8_t)'0':
        case (uint8_t)'1':
        case (uint8_t)'2':
        case (uint8_t)'3':
        case (uint8_t)'4':
        case (uint8_t)'5':
        case (uint8_t)'6':
        case (uint8_t)'7':
        case (uint8_t)'8':
        case (uint8_t)'9':
        {
            if (!parse_number(buf, pj, idx, false))
            {
                goto fail;
            }
            break;
        }

        case (uint8_t)'-':
        {
            if (!parse_number(buf, pj, idx, true))
            {
                goto fail;
            }
            break;
        }

        case (uint8_t)'{':
        {
            // Nested object inside an array: resume in array_continue ('a').
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.WriteTape(0, c);
            pj.ret_address[depth] = (bytechar)'a';
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            goto object_begin;
        }

        case (uint8_t)'[':
        {
            // Nested array inside an array: resume in array_continue ('a').
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.WriteTape(0, c);
            pj.ret_address[depth] = (bytechar)'a';
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            goto array_begin;
        }

        default:
            goto fail;
    }

array_continue:
    // An element is complete: expect ',' + next element, or the closing ']'.
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
        case (uint8_t)',':
            idx = pj.structural_indexes[i++];
            c = buf[idx];
            goto main_array_switch;

        case (uint8_t)']':
            goto scope_end;

        default:
            goto fail;
    }

    // ---------------------------- final states ----------------------------
succeed:
    depth--;
    // Both conditions signal a broken invariant of the machine itself.
    if (depth != 0 || pj.containing_scope_offset[depth] != 0)
    {
        throw new InvalidOperationException("internal bug");
    }
    pj.AnnotatePreviousLoc(pj.containing_scope_offset[depth], pj.CurrentLoc);
    pj.WriteTape(pj.containing_scope_offset[depth], (uint8_t)'r'); // r is root
    pj.isvalid = true;
    pj.ErrorCode = JsonParseError.SUCCESS;
    return pj.ErrorCode;

fail:
    // pj.isvalid does not need to be reset here; the original notes that pj.init()
    // already did so pessimistically. Nothing on this path is performance
    // sensitive, so the failure is classified from the state we stopped in.
    if (depth >= pj.depthcapacity)
    {
        pj.ErrorCode = JsonParseError.DEPTH_ERROR;
    }
    else
    {
        switch (c)
        {
            case (uint8_t)'"':
                pj.ErrorCode = JsonParseError.STRING_ERROR;
                break;

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            case (uint8_t)'-':
                pj.ErrorCode = JsonParseError.NUMBER_ERROR;
                break;

            case (uint8_t)'t':
                pj.ErrorCode = JsonParseError.T_ATOM_ERROR;
                break;

            case (uint8_t)'n':
                pj.ErrorCode = JsonParseError.N_ATOM_ERROR;
                break;

            case (uint8_t)'f':
                pj.ErrorCode = JsonParseError.F_ATOM_ERROR;
                break;

            default:
                pj.ErrorCode = JsonParseError.TAPE_ERROR;
                break;
        }
    }
    return pj.ErrorCode;
}
/// <summary>
/// Goto-driven state machine that consumes the structural characters listed in
/// <c>pj.structural_indexes</c>, checks object/array nesting, and writes the tape.
/// </summary>
/// <param name="buf">Input bytes of the JSON document.</param>
/// <param name="len">Number of meaningful bytes in <paramref name="buf"/>.</param>
/// <param name="pj">Parser state: structural indexes in, tape and <c>isvalid</c> out.</param>
/// <returns>
/// <c>true</c> when the whole document was consumed and the tape was completed
/// (<c>pj.isvalid</c> is then set); <c>false</c> on insufficient capacity, excessive
/// nesting, an allocation failure, or any malformed input.
/// </returns>
/// <exception cref="InvalidOperationException">
/// Thrown at the end of a successful walk if the depth counter or the root scope
/// offset is not zero, which indicates a bug in the machine itself.
/// </exception>
internal static bool unified_machine(uint8_t *buf, size_t len, ParsedJson pj)
{
    uint32_t i = 0;     // index of the structural character (0,1,2,3...)
    uint32_t idx;       // location of the structural character in the input (buf)
    uint8_t c;          // the structural character we are looking at (buf[idx])
    uint32_t depth = 0; // could have an arbitrary starting depth

    pj.Init();
    if (pj.bytecapacity < len)
    {
        Debug.Write("insufficient capacity\n");
        return(false);
    }

    // "UPDATE_CHAR" (read the next structural character, updating idx, i and c)
    // is expanded by hand wherever it is needed:
    //     idx = pj.structural_indexes[i++];
    //     c = buf[idx];

    ////////////////////////////// START STATE /////////////////////////////
    pj.ret_address[depth] = (bytechar)'s';
    pj.containing_scope_offset[depth] = pj.CurrentLoc;
    pj.WriteTape(0, (byte)'r'); // r for root, 0 is going to get overwritten
    // the root is used, if nothing else, to capture the size of the tape
    depth++; // everything starts at depth = 1, depth = 0 is just for the root
    // depth is used below as an index into pj.ret_address and
    // pj.containing_scope_offset, so it must stay strictly below the capacity
    // (assumes those buffers hold depthcapacity entries - TODO confirm).
    if (depth >= pj.depthcapacity)
    {
        goto fail;
    }

    idx = pj.structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
        case (uint8_t)'{':
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.ret_address[depth] = (bytechar)'s';
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            pj.WriteTape(0, c); // strangely, moving this to object_begin slows things down
            goto object_begin;

        case (uint8_t)'[':
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.ret_address[depth] = (bytechar)'s';
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            goto array_begin;

        // A JSON text is a serialized value. Note that certain previous
        // specifications of JSON constrained a JSON text to be an object or an
        // array. Implementations that generate only objects or arrays where a
        // JSON text is called for will be interoperable in the sense that all
        // implementations will accept these as conforming JSON texts.
        // https://tools.ietf.org/html/rfc8259
#if SIMDJSON_ALLOWANYTHINGINROOT
        case (uint8_t)'"':
        {
            if (!parse_string(buf, len, pj, depth, idx))
            {
                goto fail;
            }
            break;
        }

        case (uint8_t)'t':
        {
            // we need to make a copy to make sure that the string is NULL terminated.
            // this only applies to the JSON document made solely of the true value.
            // this will almost never be called in practice
            bytechar *copy = allocate<bytechar>(len + SIMDJSON_PADDING);
            if (copy == null)
            {
                goto fail; // allocation failed: nothing to copy into
            }
            memcpy(copy, buf, len);
            copy[len] = (bytechar)'\0';
            if (!is_valid_true_atom((uint8_t *)copy + idx))
            {
                free(copy);
                goto fail;
            }
            free(copy);
            pj.WriteTape(0, c);
            break;
        }

        case (uint8_t)'f':
        {
            // we need to make a copy to make sure that the string is NULL terminated.
            // this only applies to the JSON document made solely of the false value.
            // this will almost never be called in practice
            bytechar *copy = allocate<bytechar>(len + SIMDJSON_PADDING);
            if (copy == null)
            {
                goto fail; // allocation failed: nothing to copy into
            }
            memcpy(copy, buf, len);
            copy[len] = (bytechar)'\0';
            if (!is_valid_false_atom((uint8_t *)copy + idx))
            {
                free(copy);
                goto fail;
            }
            free(copy);
            pj.WriteTape(0, c);
            break;
        }

        case (uint8_t)'n':
        {
            // we need to make a copy to make sure that the string is NULL terminated.
            // this only applies to the JSON document made solely of the null value.
            // this will almost never be called in practice
            bytechar *copy = allocate<bytechar>(len + SIMDJSON_PADDING);
            if (copy == null)
            {
                goto fail; // allocation failed: nothing to copy into
            }
            memcpy(copy, buf, len);
            copy[len] = (bytechar)'\0';
            if (!is_valid_null_atom((uint8_t *)copy + idx))
            {
                free(copy);
                goto fail;
            }
            free(copy);
            pj.WriteTape(0, c);
            break;
        }

        case (uint8_t)'0':
        case (uint8_t)'1':
        case (uint8_t)'2':
        case (uint8_t)'3':
        case (uint8_t)'4':
        case (uint8_t)'5':
        case (uint8_t)'6':
        case (uint8_t)'7':
        case (uint8_t)'8':
        case (uint8_t)'9':
        {
            // we need to make a copy to make sure that the string is NULL terminated.
            // this is done only for JSON documents made of a sole number
            // this will almost never be called in practice
            bytechar *copy = allocate<bytechar>(len + SIMDJSON_PADDING);
            if (copy == null)
            {
                goto fail; // allocation failed: nothing to copy into
            }
            memcpy(copy, buf, len);
            copy[len] = (bytechar)'\0';
            if (!parse_number((uint8_t *)copy, pj, idx, false))
            {
                free(copy);
                goto fail;
            }
            free(copy);
            break;
        }

        case (uint8_t)'-':
        {
            // we need to make a copy to make sure that the string is NULL terminated.
            // this is done only for JSON documents made of a sole number
            // this will almost never be called in practice
            bytechar *copy = allocate<bytechar>(len + SIMDJSON_PADDING);
            if (copy == null)
            {
                goto fail; // allocation failed: nothing to copy into
            }
            memcpy(copy, buf, len);
            copy[len] = (bytechar)'\0';
            if (!parse_number((uint8_t *)copy, pj, idx, true))
            {
                free(copy);
                goto fail;
            }
            free(copy);
            break;
        }
#endif // ALLOWANYTHINGINROOT

        default:
            goto fail;
    }

start_continue:
    // the string might not be NULL terminated.
    // Success requires that exactly one structural index remains unread.
    if (i + 1 == pj.n_structural_indexes)
    {
        goto succeed;
    }
    else
    {
        goto fail;
    }

    ////////////////////////////// OBJECT STATES /////////////////////////////
object_begin:
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
        case (uint8_t)'"':
        {
            if (!parse_string(buf, len, pj, depth, idx))
            {
                goto fail;
            }
            goto object_key_state;
        }

        case (uint8_t)'}':
            goto scope_end; // could also go to object_continue

        default:
            goto fail;
    }

object_key_state:
    // A key was just parsed: a ':' must follow, then the member value.
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    if (c != ':')
    {
        goto fail;
    }
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
        case (uint8_t)'"':
        {
            if (!parse_string(buf, len, pj, depth, idx))
            {
                goto fail;
            }
            break;
        }

        case (uint8_t)'t':
            if (!is_valid_true_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;

        case (uint8_t)'f':
            if (!is_valid_false_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;

        case (uint8_t)'n':
            if (!is_valid_null_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;

        case (uint8_t)'0':
        case (uint8_t)'1':
        case (uint8_t)'2':
        case (uint8_t)'3':
        case (uint8_t)'4':
        case (uint8_t)'5':
        case (uint8_t)'6':
        case (uint8_t)'7':
        case (uint8_t)'8':
        case (uint8_t)'9':
        {
            if (!parse_number(buf, pj, idx, false))
            {
                goto fail;
            }
            break;
        }

        case (uint8_t)'-':
        {
            if (!parse_number(buf, pj, idx, true))
            {
                goto fail;
            }
            break;
        }

        case (uint8_t)'{':
        {
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.WriteTape(0, c); // here the compilers knows what c is so this gets optimized
            // we have not yet encountered } so we need to come back for it
            pj.ret_address[depth] = (bytechar)'o';
            // we found an object inside an object, so we need to increment the depth
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            goto object_begin;
        }

        case (uint8_t)'[':
        {
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.WriteTape(0, c); // here the compilers knows what c is so this gets optimized
            // we have not yet encountered } so we need to come back for it
            pj.ret_address[depth] = (bytechar)'o';
            // we found an array inside an object, so we need to increment the depth
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            goto array_begin;
        }

        default:
            goto fail;
    }

object_continue:
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
        case (uint8_t)',':
            idx = pj.structural_indexes[i++];
            c = buf[idx];
            if (c != (uint8_t)'"')
            {
                goto fail;
            }
            else
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }
                goto object_key_state;
            }

        case (uint8_t)'}':
            goto scope_end;

        default:
            goto fail;
    }

    ////////////////////////////// COMMON STATE /////////////////////////////
scope_end:
    // write our tape location to the header scope
    depth--;
    pj.WriteTape(pj.containing_scope_offset[depth], c);
    pj.AnnotatePreviousLoc(pj.containing_scope_offset[depth], pj.CurrentLoc);
    // resume in whichever state opened the scope that has just been closed
    if (pj.ret_address[depth] == (uint8_t)'a')
    {
        goto array_continue;
    }
    else if (pj.ret_address[depth] == (uint8_t)'o')
    {
        goto object_continue;
    }
    else
    {
        goto start_continue;
    }

    ////////////////////////////// ARRAY STATES /////////////////////////////
array_begin:
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    if (c == ']')
    {
        goto scope_end; // could also go to array_continue
    }

main_array_switch:
    // we call update char on all paths in, so we can peek at c on the
    // on paths that can accept a close square brace (post-, and at start)
    switch (c)
    {
        case (uint8_t)'"':
        {
            if (!parse_string(buf, len, pj, depth, idx))
            {
                goto fail;
            }
            break;
        }

        case (uint8_t)'t':
            if (!is_valid_true_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;

        case (uint8_t)'f':
            if (!is_valid_false_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;

        case (uint8_t)'n':
            if (!is_valid_null_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break; // continues at array_continue

        case (uint8_t)'0':
        case (uint8_t)'1':
        case (uint8_t)'2':
        case (uint8_t)'3':
        case (uint8_t)'4':
        case (uint8_t)'5':
        case (uint8_t)'6':
        case (uint8_t)'7':
        case (uint8_t)'8':
        case (uint8_t)'9':
        {
            if (!parse_number(buf, pj, idx, false))
            {
                goto fail;
            }
            break; // continues at array_continue
        }

        case (uint8_t)'-':
        {
            if (!parse_number(buf, pj, idx, true))
            {
                goto fail;
            }
            break; // continues at array_continue
        }

        case (uint8_t)'{':
        {
            // we have not yet encountered ] so we need to come back for it
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.WriteTape(0, c); // here the compilers knows what c is so this gets optimized
            pj.ret_address[depth] = (bytechar)'a';
            // we found an object inside an array, so we need to increment the depth
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            goto object_begin;
        }

        case (uint8_t)'[':
        {
            // we have not yet encountered ] so we need to come back for it
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.WriteTape(0, c); // here the compilers knows what c is so this gets optimized
            pj.ret_address[depth] = (bytechar)'a';
            // we found an array inside an array, so we need to increment the depth
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            goto array_begin;
        }

        default:
            goto fail;
    }

array_continue:
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
        case (uint8_t)',':
            idx = pj.structural_indexes[i++];
            c = buf[idx];
            goto main_array_switch;

        case (uint8_t)']':
            goto scope_end;

        default:
            goto fail;
    }

    ////////////////////////////// FINAL STATES /////////////////////////////
succeed:
    depth--;
    if (depth != 0)
    {
        throw new InvalidOperationException("internal bug");
    }
    if (pj.containing_scope_offset[depth] != 0)
    {
        throw new InvalidOperationException("internal bug");
    }
    pj.AnnotatePreviousLoc(pj.containing_scope_offset[depth], pj.CurrentLoc);
    pj.WriteTape(pj.containing_scope_offset[depth], (byte)'r'); // r is root
    pj.isvalid = true;
    return(true);

fail:
    return(false);
}
/// <summary>
/// Decodes the JSON string whose opening quote is at <c>buf[offset]</c> into the string
/// buffer of <paramref name="pj"/>, stored as a 32-bit length, the decoded bytes and a
/// terminating zero, and writes a <c>'"'</c> tape entry holding the record's offset.
/// </summary>
/// <param name="buf">Input bytes; <c>buf[offset]</c> is expected to be the opening quote.</param>
/// <param name="len">Not read by this method.</param>
/// <param name="pj">Receives the tape entry and the decoded string record.</param>
/// <param name="depth">Not read by this method.</param>
/// <param name="offset">Index in <paramref name="buf"/> of the opening quote.</param>
/// <returns>
/// <c>true</c> once the closing quote has been reached; <c>false</c> when an escape has no
/// entry in <c>escape_map</c> or a <c>\u</c> sequence is rejected by
/// <c>handle_unicode_codepoint</c>.
/// </returns>
internal static bool parse_string(uint8_t *buf, size_t len, ParsedJson pj, uint32_t depth, uint32_t offset)
{
    // The tape entry is written up front; its payload is where this string's record
    // starts inside the string buffer.
    pj.WriteTape((ulong)(pj.current_string_buf_loc - pj.string_buf), (char1)'"');

    uint8_t *input = &buf[offset + 1]; // first byte after the opening quote
    // The first sizeof(uint32_t) bytes of the record are reserved for the length.
    uint8_t *output = pj.current_string_buf_loc + sizeof(uint32_t);
    uint8_t *payloadStart = output;

    for (;;)
    {
        parse_string_helper masks = find_bs_bits_and_quote_bits(input, output);

        // A quote bit below the lowest backslash bit: the string ends in this block.
        if (((masks.bs_bits - 1) & masks.quote_bits) != 0)
        {
            uint32_t quoteOffset = (uint32_t)trailingzeroes(masks.quote_bits);
            // Zero-terminate the decoded bytes; handy for consumers that expect
            // terminated strings, at a small cost.
            output[quoteOffset] = 0;
            uint32_t payloadLength = (uint32_t)((output - payloadStart) + quoteOffset);
            memcpy(pj.current_string_buf_loc, &payloadLength, sizeof(uint32_t));
            // No overflow check on the 32-bit length: per the original note it is
            // unneeded because documents of 4GB or more are refused.
            // Advance past the payload and its terminating zero.
            pj.current_string_buf_loc = output + quoteOffset + 1;
            return true;
        }

        // Neither a quote nor a backslash comes first (they cannot share a position,
        // so neither is present): move on to the next block.
        if (((masks.quote_bits - 1) & masks.bs_bits) == 0)
        {
            int stride = Avx2.IsSupported ? 32 : 16; // avx2 : sse42
            input += stride;
            output += stride;
            continue;
        }

        // A backslash comes first: find out where it is and what it escapes.
        uint32_t slashOffset = (uint32_t)trailingzeroes(masks.bs_bits);
        uint8_t escaped = input[slashOffset + 1];
        if (escaped == 'u')
        {
            // Position both cursors on the backslash; the codepoint handler adjusts
            // them further itself.
            input += slashOffset;
            output += slashOffset;
            if (!handle_unicode_codepoint(&input, &output))
            {
                return false;
            }
        }
        else
        {
            // Simple 1:1 conversion: consumes slashOffset + 2 input bytes and produces
            // slashOffset + 1 output bytes. This may reach beyond the part of the
            // buffer examined so far, which the original author considered acceptable.
            uint8_t decoded = escape_map[escaped]; // TODO: https://github.com/dotnet/coreclr/issues/25894
            if (decoded == 0u)
            {
                return false; // bogus escape value is an error
            }
            output[slashOffset] = decoded;
            input += slashOffset + 2;
            output += slashOffset + 1;
        }
    }
}