/// <summary>
/// Creates an iterator positioned on the first value below the root of <paramref name="parsedJson"/>.
/// </summary>
/// <remarks>
/// The first tape word is read and its top byte (bits 56 and up) is taken as the element type; it must
/// be 'r' (root). The low bits, masked with JSONVALUEMASK, give the tape length. When the tape holds
/// more than the root word, the iterator descends to depth 1 and records that scope.
/// </remarks>
/// <param name="parsedJson">The parsed document whose tape is iterated.</param>
/// <exception cref="InvalidOperationException">The first tape word is not a root ('r') marker.</exception>
public ParsedJsonIterator(ParsedJson parsedJson)
{
    pj = parsedJson;
    depth = 0;
    location = 0;
    tape_length = 0;

    // Validate the root word BEFORE allocating the scope stack. Previously the
    // buffer was allocated first, so the throw below left it orphaned (Dispose is
    // never reached for an object whose constructor throws).
    current_val = pj.tape[location++];
    current_type = (uint8_t)(current_val >> 56);
    if (current_type != 'r')
    {
        throw new InvalidOperationException("Json is invalid");
    }

    depthindex = allocate <scopeindex_t>(pj.depthcapacity);
    depthindex[0].start_of_scope = 0; // the root word lives at tape index 0
    depthindex[0].scope_type = current_type;

    tape_length = current_val & JSONVALUEMASK;
    if (location < tape_length)
    {
        // Peek (without advancing) at the first real value and open scope 1 on it.
        current_val = pj.tape[location];
        current_type = (uint8_t)(current_val >> 56);
        depth++;
        depthindex[depth].start_of_scope = location;
        depthindex[depth].scope_type = current_type;
    }
}
/// <summary>
/// Runs both parsing stages over <paramref name="jsonData"/> and stores the outcome in <paramref name="pj"/>.
/// </summary>
/// <param name="jsonData">Pointer to the first byte of the JSON text.</param>
/// <param name="length">Number of bytes of JSON text.</param>
/// <param name="pj">Destination document; its <c>bytecapacity</c> must be at least <paramref name="length"/>.</param>
/// <param name="reallocIfNeeded">
/// When true, the input is copied into a padded buffer if the page-boundary test below fires.
/// </param>
/// <returns>
/// <c>Capacity</c> when <paramref name="pj"/> is too small, <c>Memalloc</c> when the padded copy cannot
/// be allocated, otherwise <c>Success</c> or whatever stage 2 reports.
/// </returns>
internal static JsonParseError JsonParse(uint8_t *jsonData, size_t length, ParsedJson pj, bool reallocIfNeeded = true)
{
    if (pj.bytecapacity < length)
    {
        return(JsonParseError.Capacity);
    }

    bool reallocated = false;
    if (reallocIfNeeded)
    {
        // realloc is needed if the end of the memory crosses a page
        if ((size_t)(jsonData + length - 1) % (size_t)pagesize < SIMDJSON_PADDING)
        {
            uint8_t *tmpbuf = jsonData;
            jsonData = (uint8_t *)allocate_padded_buffer(length);
            if (jsonData == null)
            {
                return(JsonParseError.Memalloc);
            }
            memcpy(jsonData, tmpbuf, length);
            reallocated = true;
        }
    }

    var result = JsonParseError.Success;
    // NOTE(review): if stage 1 reports failure, result stays Success here; presumably callers
    // consult pj's own validity state in that case - confirm.
    if (stage1_find_marks.find_structural_bits(jsonData, length, pj))
    {
        result = stage2_build_tape.unified_machine(jsonData, length, pj);
    }

    if (reallocated)
    {
        aligned_free(jsonData);
    }

    // The original fell off the end of a non-void method; hand the outcome back to the caller.
    return(result);
}
/// <summary>
/// Dispatches string parsing to the AVX2 implementation.
/// </summary>
/// <returns>The result of <c>parse_string_avx2</c> when AVX2 is available.</returns>
/// <remarks>
/// Without AVX2 support <c>ThrowHelper.ThrowPNSE()</c> is invoked (presumably it throws; the trailing
/// <c>return false</c> only satisfies the compiler). An SSE4.1 fallback is not wired in here.
/// </remarks>
public static bool parse_string(uint8_t* buf, size_t len, ParsedJson pj, uint32_t depth, uint32_t offset)
{
    if (!Avx2.IsSupported)
    {
        ThrowHelper.ThrowPNSE();
        return false;
    }

    return parse_string_avx2(buf, len, pj, depth, offset);
}
/// <summary>
/// Releases the scope buffer and disposes the attached document, clearing both references.
/// Each step is skipped when its reference is already null, so a repeated call does nothing.
/// </summary>
public void Dispose()
{
    bool hasScopeBuffer = depthindex != null;
    if (hasScopeBuffer)
    {
        delete(depthindex);
        depthindex = null;
    }

    bool hasDocument = pj != null;
    if (hasDocument)
    {
        pj.Dispose();
        pj = null;
    }
}
/// <summary>
/// Allocates a <c>ParsedJson</c> sized for <paramref name="len"/> bytes and parses <paramref name="buf"/> into it.
/// </summary>
/// <param name="buf">Pointer to the JSON text.</param>
/// <param name="len">Length of the JSON text in bytes.</param>
/// <param name="reallocifneeded">Forwarded unchanged to <c>json_parse</c>.</param>
/// <returns>The document; the value returned by <c>json_parse</c> is not inspected here.</returns>
/// <exception cref="InvalidOperationException">Capacity allocation failed.</exception>
public static ParsedJson build_parsed_json(uint8_t *buf, size_t len, bool reallocifneeded = true)
{
    ParsedJson pj = new ParsedJson();

    if (!pj.allocateCapacity(len))
    {
        throw new InvalidOperationException("failure during memory allocation");
    }

    // The parse outcome is deliberately not examined in this method.
    bool parsed = json_parse(buf, len, &pj, reallocifneeded);
    return pj;
}
/// <summary>
/// Parses <paramref name="length"/> bytes of JSON starting at <paramref name="jsonData"/>.
/// </summary>
/// <param name="jsonData">Pointer to the JSON text.</param>
/// <param name="length">Byte count; converted to <c>ulong</c> before use.</param>
/// <param name="reallocIfNeeded">Forwarded unchanged to <c>JsonParse</c>.</param>
/// <returns>The document produced by <c>JsonParse</c>; its result value is not inspected here.</returns>
/// <exception cref="InvalidOperationException">Capacity allocation failed.</exception>
public static ParsedJson ParseJson(byte *jsonData, int length, bool reallocIfNeeded = true)
{
    var document = new ParsedJson();

    if (!document.AllocateCapacity((ulong)length))
    {
        throw new InvalidOperationException("failure during memory allocation");
    }

    JsonParse(jsonData, (ulong)length, document, reallocIfNeeded);
    return document;
}
/// <summary>
/// Parses <paramref name="length"/> bytes of JSON starting at <paramref name="jsonData"/>.
/// </summary>
/// <param name="jsonData">Pointer to the JSON text.</param>
/// <param name="length">Byte count of the JSON text.</param>
/// <param name="reallocIfNeeded">Forwarded unchanged to <c>JsonParse</c>.</param>
/// <returns>
/// The document. If capacity allocation fails, nothing is thrown: the document is returned with
/// <c>isvalid</c> false and <c>ErrorCode</c> set to <c>JsonParseError.CAPACITY</c>.
/// </returns>
public static ParsedJson ParseJson(byte *jsonData, ulong length, bool reallocIfNeeded = true)
{
    var document = new ParsedJson();

    if (!document.AllocateCapacity(length))
    {
        document.isvalid = false;
        document.ErrorCode = JsonParseError.CAPACITY;
        return document;
    }

    JsonParse(jsonData, length, document, reallocIfNeeded);
    return document;
}
/// <summary>
/// Runs stage 1 (structural indexing) and, if that succeeds, stage 2 (tape building) over the input.
/// </summary>
/// <param name="jsonData">Pointer to the first byte of the JSON text.</param>
/// <param name="length">Number of bytes of JSON text.</param>
/// <param name="pj">Destination document; its <c>bytecapacity</c> must be at least <paramref name="length"/>.</param>
/// <param name="reallocIfNeeded">
/// When true, the input is copied into a padded buffer if the page-boundary test below fires.
/// </param>
/// <returns>
/// True when both stages succeed. False when the padded copy cannot be allocated or either stage fails.
/// </returns>
/// <exception cref="InvalidOperationException"><paramref name="pj"/> is too small for the input.</exception>
internal static bool JsonParse(uint8_t *jsonData, size_t length, ParsedJson pj, bool reallocIfNeeded = true)
{
    if (pj.bytecapacity < length)
    {
        throw new InvalidOperationException("Your ParsedJson cannot support documents that big: " + length);
    }

    bool reallocated = false;
    if (reallocIfNeeded)
    {
        // realloc is needed if the end of the memory crosses a page
        if ((size_t)(jsonData + length - 1) % (size_t)pagesize < SIMDJSON_PADDING)
        {
            uint8_t *tmpbuf = jsonData;
            jsonData = (uint8_t *)allocate_padded_buffer(length);
            if (jsonData == null)
            {
                return(false);
            }
            memcpy(jsonData, tmpbuf, length);
            reallocated = true;
        }
    }

    bool isok = stage1_find_marks.find_structural_bits(jsonData, length, pj);
    if (isok)
    {
        isok = stage2_build_tape.unified_machine(jsonData, length, pj);
    }

    // Single release point for the temporary copy, whichever stage decided the outcome.
    if (reallocated)
    {
        free(jsonData);
    }

    // The original had no return on the success path; report the combined stage result.
    return(isok);
}
/// <summary>
/// Copies the JSON string that starts at <c>buf[offset]</c> into the document's string buffer,
/// resolving backslash escapes, 16 bytes (one <c>Vector128</c>) at a time.
/// </summary>
/// <param name="buf">Input bytes; <c>buf[offset]</c> is expected to be the opening quote.</param>
/// <param name="len">Not referenced by this implementation.</param>
/// <param name="pj">Document that receives the tape entry and the unescaped bytes.</param>
/// <param name="depth">Not referenced by this implementation.</param>
/// <param name="offset">Index of the opening quote within <paramref name="buf"/>.</param>
/// <returns>
/// True once the closing quote is reached; false on an escape that <c>escape</c> maps to 0, on a failed
/// <c>handle_unicode_codepoint</c>, or (only with CHECKUNESCAPED) on a byte &lt;= 0x1F inside the string.
/// </returns>
public static bool parse_string_sse41(uint8_t *buf, size_t len, ParsedJson pj, uint32_t depth, uint32_t offset)
{
#if SIMDJSON_SKIPSTRINGPARSING // for performance analysis, it is sometimes useful to skip parsing
    pj.write_tape(0, '"'); // don't bother with the string parsing at all
    return(true); // always succeeds
#else
    uint8_t *src = &buf[offset + 1]; // we know that buf at offset is a "
    uint8_t *dst = pj.current_string_buf_loc;
#if JSON_TEST_STRINGS // for unit testing
    uint8_t *const start_of_string = dst;
#endif
    // Broadcast constants used for the per-chunk comparisons.
    Vector128 <byte> slashVec = Vector128.Create((byte)'\\');
    Vector128 <byte> quoteVec = Vector128.Create((byte)'"');
    Vector128 <byte> unitsep = Vector128.Create((byte)0x1F);
    while (true)
    {
        // Load one chunk and build bitmasks of where backslashes and quotes occur in it.
        Vector128 <byte> v = Sse2.LoadVector128((src));
        uint32_t bs_bits = (uint32_t)Sse2.MoveMask(Sse2.CompareEqual(v, slashVec));
        uint32_t quote_bits = (uint32_t)Sse2.MoveMask(Sse2.CompareEqual(v, quoteVec));
        // All Unicode characters may be placed within the
        // quotation marks, except for the characters that MUST be escaped:
        // quotation mark, reverse solidus, and the control characters (U+0000
        // through U+001F).
        // https://tools.ietf.org/html/rfc8259
#if CHECKUNESCAPED
        // max(0x1F, b) == 0x1F holds exactly for bytes b <= 0x1F.
        Vector128 <byte> unescaped_vec = Sse2.CompareEqual(Sse2.Max(unitsep, v), unitsep); // could do it with saturated subtraction
#endif // CHECKUNESCAPED
        uint32_t quote_dist = (uint32_t)trailingzeroes(quote_bits);
        uint32_t bs_dist = (uint32_t)trailingzeroes(bs_bits);
        // store to dest unconditionally - we can overwrite the bits we don't like
        // later
        memcpy(dst, src, (size_t)Vector128 <byte> .Count);
        if (quote_dist < bs_dist)
        {
            // we encountered quotes first. Move dst to point to quotes and exit
            dst[quote_dist] = 0; // null terminate and get out
            // The tape payload is the string's byte offset from the start of the string buffer.
            pj.WriteTape((size_t)pj.current_string_buf_loc - (size_t)pj.string_buf, (uint8_t)'"');
            pj.current_string_buf_loc = dst + quote_dist + 1; // the +1 is due to the 0 value
#if CHECKUNESCAPED
            // check that there is no unescaped char before the quote
            uint32_t unescaped_bits = (uint32_t)Sse2.MoveMask(unescaped_vec);
            // (x - 1) & ~x selects the bits strictly below the lowest set bit of x.
            bool is_ok = ((quote_bits - 1) & (~quote_bits) & unescaped_bits) == 0;
#if JSON_TEST_STRINGS // for unit testing
            if (is_ok)
            {
                foundString(buf + offset, start_of_string, pj.current_string_buf_loc - 1);
            }
            else
            {
                foundBadString(buf + offset);
            }
#endif // JSON_TEST_STRINGS
            return(is_ok);
#else //CHECKUNESCAPED
#if JSON_TEST_STRINGS // for unit testing
            foundString(buf + offset, start_of_string, pj.current_string_buf_loc - 1);
#endif // JSON_TEST_STRINGS
            return(true);
#endif //CHECKUNESCAPED
        }
        else if (quote_dist > bs_dist)
        {
            // The byte right after the backslash selects the kind of escape.
            uint8_t escape_char = src[bs_dist + 1];
#if CHECKUNESCAPED
            // we are going to need the unescaped_bits to check for unescaped chars
            uint32_t unescaped_bits = (uint32_t)Sse2.MoveMask(unescaped_vec);
            if (((bs_bits - 1) & (~bs_bits) & unescaped_bits) != 0)
            {
#if JSON_TEST_STRINGS // for unit testing
                foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
                return(false);
            }
#endif //CHECKUNESCAPED
            // we encountered backslash first. Handle backslash
            if (escape_char == 'u')
            {
                // move src/dst up to the start; they will be further adjusted
                // within the unicode codepoint handling code.
                src += bs_dist;
                dst += bs_dist;
                if (!handle_unicode_codepoint(&src, &dst))
                {
#if JSON_TEST_STRINGS // for unit testing
                    foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
                    return(false);
                }
            }
            else
            {
                // simple 1:1 conversion. Will eat bs_dist+2 characters in input and
                // write bs_dist+1 characters to output
                // note this may reach beyond the part of the buffer we've actually
                // seen. I think this is ok
                uint8_t escape_result = escape(escape_char);
                if (escape_result == 0)
                {
#if JSON_TEST_STRINGS // for unit testing
                    foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
                    return(false); // bogus escape value is an error
                }
                dst[bs_dist] = escape_result;
                src += bs_dist + 2;
                dst += bs_dist + 1;
            }
        }
        else
        {
            // they are the same. Since they can't co-occur, it means we encountered
            // neither.
            src += Vector128 <byte> .Count;
            dst += Vector128 <byte> .Count;
#if CHECKUNESCAPED
            // check for unescaped chars
            if (Sse2.MoveMask(unescaped_vec) != 0)
            {
#if JSON_TEST_STRINGS // for unit testing
                foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
                return(false);
            }
#endif // CHECKUNESCAPED
        }
    }
    // can't be reached
    return(true);
#endif // SIMDJSON_SKIPSTRINGPARSING
}
// Slow-path integer parser, invoked by parse_number when the fast path may have
// overflowed. Every multiply/add is overflow-checked and the final magnitude is
// range-checked against int64 before it is written to the tape.
// Do not call this function directly: it skips checks that parse_number performs.
//
// This function will almost never be called!!!
//
static bool parse_large_integer(uint8_t *buf, ParsedJson pj, uint32_t offset, bool found_minus)
{
    char1 *cursor = (char1 *)(buf + offset);
    if (found_minus)
    {
        ++cursor; // step over the sign
    }

    uint64_t magnitude;
    if (*cursor == (uchar1)'0')
    {
        // 0 cannot be followed by an integer
        ++cursor;
        magnitude = 0;
    }
    else
    {
        uchar1 digit = (uchar1)(*cursor - (uchar1)'0');
        magnitude = digit;
        cursor++;
        // the is_made_of_eight_digits_fast routine is unlikely to help here because
        // we rarely see large integer parts like 123456789
        for (; is_integer(*cursor); ++cursor)
        {
            digit = (uchar1)(*cursor - (uchar1)'0');
            // multiply first, then add; either step overflowing rejects the number
            if (mul_overflow(magnitude, 10, &magnitude) || add_overflow(magnitude, digit, &magnitude))
            {
                return(false); // overflow
            }
        }
    }

    // A negative value may reach 2^63 in magnitude; a non-negative one must stay below it.
    bool outOfRange = found_minus
        ? magnitude > 0x8000000000000000
        : magnitude >= 0x8000000000000000;
    if (outOfRange)
    {
        return(false); // overflow
    }

    int64_t signed_answer;
    if (found_minus)
    {
        signed_answer = -(int64_t)magnitude;
    }
    else
    {
        signed_answer = (int64_t)magnitude;
    }
    pj.WriteTapeInt64(signed_answer);

    return(is_structural_or_whitespace((uchar1)(*cursor)) != 0);
}
// called by parse_number when we know that the output is a float,
// but where there might be some integer overflow. The trick here is to
// parse using floats from the start.
// Do not call this function directly as it skips some of the checks from
// parse_number
//
// This function will almost never be called!!!
//
// Note: a redesign could avoid this function entirely.
//
// Parameters: buf + offset is where the number text starts (including the '-'
// when found_minus is true). On success the value is written to pj through
// WriteTapeDouble.
// Returns false when a '.' is not followed by a digit, when the exponent has no
// digit, more than four digits, or falls outside [-308, 308], or when the
// character after the number is not structural/whitespace.
private static bool parse_float(uint8_t *buf, ParsedJson pj, uint32_t offset, bool found_minus)
{
    bytechar *p = (bytechar *)(buf + offset);
    bool negative = false;
    if (found_minus)
    {
        ++p;
        negative = true;
    }
    double i; // running value, accumulated directly in floating point
    if (*p == '0')
    { // 0 cannot be followed by an integer
        ++p;
        i = 0;
    }
    else
    {
        unsigned_bytechar digit = (unsigned_bytechar)(*p - (bytechar)'0');
        i = digit;
        p++;
        while (is_integer(*p))
        {
            digit = (unsigned_bytechar)(*p - '0');
            i = 10 * i + digit;
            ++p;
        }
    }
    if ('.' == *p)
    {
        ++p;
        // Weight of the next fractional digit; scaled by 0.1 for every digit consumed.
        double fractionalweight = 1;
        if (is_integer(*p))
        {
            unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
            ++p;
            fractionalweight *= 0.1;
            i = i + digit * fractionalweight;
        }
        else
        {
            // at least one digit is required after the decimal point
#if JSON_TEST_NUMBERS // for unit testing
            foundInvalidNumber(buf + offset);
#endif
            return(false);
        }
        while (is_integer(*p))
        {
            unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
            ++p;
            fractionalweight *= 0.1;
            i = i + digit * fractionalweight;
        }
    }
    if (('e' == *p) || ('E' == *p))
    {
        ++p;
        bool negexp = false;
        if ('-' == *p)
        {
            negexp = true;
            ++p;
        }
        else if ('+' == *p)
        {
            ++p;
        }
        if (!is_integer(*p))
        {
            // at least one exponent digit is required
#if JSON_TEST_NUMBERS // for unit testing
            foundInvalidNumber(buf + offset);
#endif
            return(false);
        }
        unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
        int64_t expnumber = digit; // exponential part
        p++;
        // Up to three further exponent digits are accepted (four in total).
        if (is_integer(*p))
        {
            digit = (unsigned_bytechar)(*p - '0');
            expnumber = 10 * expnumber + digit;
            ++p;
        }
        if (is_integer(*p))
        {
            digit = (unsigned_bytechar)(*p - '0');
            expnumber = 10 * expnumber + digit;
            ++p;
        }
        if (is_integer(*p))
        {
            digit = (unsigned_bytechar)(*p - '0');
            expnumber = 10 * expnumber + digit;
            ++p;
        }
        if (is_integer(*p))
        { // we refuse to parse this
#if JSON_TEST_NUMBERS // for unit testing
            foundInvalidNumber(buf + offset);
#endif
            return(false);
        }
        int exponent = (int)(negexp ? -expnumber : expnumber);
        if ((exponent > 308) || (exponent < -308))
        { // we refuse to parse this
#if JSON_TEST_NUMBERS // for unit testing
            foundInvalidNumber(buf + offset);
#endif
            return(false);
        }
        // power_of_ten is indexed with a +308 bias, so index 308 + exponent is used here.
        i *= power_of_ten[308 + exponent];
    }
    if (is_not_structural_or_whitespace((byte)*p) != 0)
    {
        return(false);
    }
    double d = negative ? -i : i;
    pj.WriteTapeDouble(d);
#if JSON_TEST_NUMBERS // for unit testing
    foundFloat(d, buf + offset);
#endif
    return(is_structural_or_whitespace((byte)(*p)) != 0);
}
/// <summary>
/// Parses the JSON number at <c>buf + offset</c> and writes it to <paramref name="pj"/> as an int64 or a
/// double. The fast path accumulates all digits in a 64-bit integer; when the digit count suggests that
/// integer may have overflowed, parsing restarts in <c>parse_float</c> or <c>parse_large_integer</c>.
/// </summary>
/// <param name="buf">Input bytes.</param>
/// <param name="pj">Document whose tape receives the value.</param>
/// <param name="offset">Index of the first character of the number (the '-' when present).</param>
/// <param name="found_minus">True when <c>buf[offset]</c> is a minus sign.</param>
/// <returns>
/// False on malformed input (no digit after '-', '.', or the exponent marker; more than three exponent
/// digits; a leading 0 followed by a disallowed character). Otherwise whether the character after the
/// number is structural/whitespace, or the result of the slow-path parser.
/// </returns>
internal static bool parse_number(uint8_t *buf, ParsedJson pj, uint32_t offset, bool found_minus)
{
    char1 *p = (char1 *)(buf + offset);
    bool negative = false;
    if (found_minus)
    {
        ++p;
        negative = true;
        if (!is_integer(*p))
        { // a negative sign must be followed by an integer
            return(false);
        }
    }
    char1 * startdigits = p;
    uint64_t i; // an unsigned int avoids signed overflows (which are bad)
    if (*p == (char1)'0')
    { // 0 cannot be followed by an integer
        ++p;
        if (is_not_structural_or_whitespace_or_exponent_or_decimal((uint8_t)(*p)))
        {
            return(false);
        }
        i = 0;
    }
    else
    {
        if (!(is_integer(*p)))
        { // must start with an integer
            return(false);
        }
        uchar1 digit = (uchar1)(*p - (uchar1)'0');
        i = digit;
        p++;
        // the is_made_of_eight_digits_fast routine is unlikely to help here because
        // we rarely see large integer parts like 123456789
        while (is_integer(*p))
        {
            digit = (uchar1)(*p - (uchar1)'0');
            // a multiplication by 10 is cheaper than an arbitrary integer multiplication
            i = 10 * i + digit; // might overflow, we will handle the overflow later
            ++p;
        }
    }
    int64_t exponent = 0;
    bool is_float = false;
    if ('.' == *p)
    {
        is_float = true;
        // At this point we know that we have a float
        // we continue with the fiction that we have an integer. If the
        // floating point number is representable as x * 10^z for some integer
        // z that fits in 53 bits, then we will be able to convert back the
        // the integer into a float in a lossless manner.
        ++p;
        char1 *firstafterperiod = p;
        if (is_integer(*p))
        {
            uchar1 digit = (uchar1)(*p - (uchar1)'0');
            ++p;
            i = i * 10 + digit; // might overflow + multiplication by 10 is likely cheaper than arbitrary mult.
            // we will handle the overflow later
        }
        else
        {
            return(false);
        }
#if SWAR_NUMBER_PARSING
        // this helps if we have lots of decimals!
        // this turns out to be frequent enough.
        if (is_made_of_eight_digits_fast(p))
        {
            i = i * 100000000 + parse_eight_digits_unrolled(p);
            p += 8;
        }
#endif
        while (is_integer(*p))
        {
            uchar1 digit = (uchar1)(*p - (uchar1)'0');
            ++p;
            i = i * 10 + digit; // in rare cases, this will overflow, but that's ok because we have parse_highprecision_float later.
        }
        // Negative count of fractional digits: the value is i * 10^exponent.
        exponent = firstafterperiod - p;
    }
    int digitcount = (int)(p - startdigits - 1); // used later to guard against overflows
    int64_t expnumber = 0; // exponential part
    if (((char1)'e' == *p) || ((char1)'E' == *p))
    {
        is_float = true;
        ++p;
        bool negexp = false;
        if ('-' == *p)
        {
            negexp = true;
            ++p;
        }
        else if ('+' == *p)
        {
            ++p;
        }
        if (!is_integer(*p))
        {
            return(false);
        }
        uchar1 digit = (uchar1)(*p - (uchar1)'0');
        expnumber = digit;
        p++;
        // Up to two further exponent digits are accepted (three in total).
        if (is_integer(*p))
        {
            digit = (uchar1)(*p - (uchar1)'0');
            expnumber = 10 * expnumber + digit;
            ++p;
        }
        if (is_integer(*p))
        {
            digit = (uchar1)(*p - (uchar1)'0');
            expnumber = 10 * expnumber + digit;
            ++p;
        }
        if (is_integer(*p))
        { // we refuse to parse this
            return(false);
        }
        exponent += (negexp ? -expnumber : expnumber);
    }
    if (is_float)
    {
        // power_of_ten is indexed with a +308 bias. A negative sum wraps to a huge unsigned
        // value, so the single "> 2 * 308" test below rejects both ends of the range.
        uint64_t powerindex = (uint64_t)(308 + exponent);
        if (/*unlikely*/ ((digitcount >= 19)))
        { // this is uncommon
            // It is possible that the integer had an overflow.
            // We have to handle the case where we have 0.0000somenumber.
            char1 *start = startdigits;
            while ((*start == (char1)'0') || (*start == (char1)'.'))
            {
                start++;
            }
            digitcount -= (int)(start - startdigits);
            if (digitcount >= 19)
            {
                // Ok, chances are good that we had an overflow!
                // this is almost never going to get called!!!
                // we start anew, going slowly!!!
                return(parse_float(buf, pj, offset, found_minus));
            }
        }
        if (/*unlikely*/ ((powerindex > 2 * 308)))
        { // this is uncommon!!!
            // this is almost never going to get called!!!
            // we start anew, going slowly!!!
            return(parse_float(buf, pj, offset, found_minus));
        }
        double factor = power_of_ten[powerindex];
        factor = negative ? -factor : factor; // the sign is folded into the scale factor
        double d = i * factor;
        pj.WriteTapeDouble(d);
    }
    else
    {
        if (/*unlikely*/ (digitcount >= 18))
        { // this is uncommon!!!
            // there is a good chance that we had an overflow, so we need
            // need to recover: we parse the whole thing again.
            return(parse_large_integer(buf, pj, offset, found_minus));
        }
        i = negative ? 0 - i : i; // unsigned negation; reinterpreted as signed on the next line
        pj.WriteTapeInt64((int64_t)i);
    }
    return(is_structural_or_whitespace((uint8_t)(*p)) != 0);
}
/// <summary>
/// Stage 1: scans the input in 64-byte blocks and records the positions of structural and
/// pseudo-structural characters in <c>pj.structural_indexes</c>.
/// </summary>
/// <param name="buf">Input bytes.</param>
/// <param name="len">Number of input bytes; must not exceed <c>pj.bytecapacity</c>.</param>
/// <param name="pj">Document that receives <c>structural_indexes</c> and <c>n_structural_indexes</c>.</param>
/// <returns>
/// CAPACITY when the input is larger than the document's capacity, UNCLOSED_STRING when the input ends
/// inside a quote pair, EMPTY when no structural index was found, UNEXPECTED_ERROR when the last index
/// lies beyond <paramref name="len"/>, UNESCAPED_CHARS when the error mask is non-zero, otherwise
/// SUCCESS (or the UTF-8 check result when SIMDJSON_UTF8VALIDATE is defined).
/// </returns>
internal static JsonParseError find_structural_bits(uint8_t *buf, size_t len, ParsedJson pj)
{
    if (len > pj.bytecapacity)
    {
        return(JsonParseError.CAPACITY);
    }
    uint32_t *base_ptr = pj.structural_indexes;
    uint32_t @base = 0; // number of indexes written so far
#if SIMDJSON_UTF8VALIDATE
    utf8_checking_state state;
#endif
    // we have padded the input out to 64 byte multiple with the remainder being
    // zeros

    // persistent state across loop
    // does the last iteration end with an odd-length sequence of backslashes?
    // either 0 or 1, but a 64-bit value
    uint64_t prev_iter_ends_odd_backslash = 0UL;
    // does the previous iteration end inside a double-quote pair?
    uint64_t prev_iter_inside_quote = 0UL; // either all zeros or all ones
    // does the previous iteration end on something that is a predecessor of a
    // pseudo-structural character - i.e. whitespace or a structural character
    // effectively the very first char is considered to follow "whitespace" for
    // the purposes of pseudo-structural character detection so we initialize to 1
    uint64_t prev_iter_ends_pseudo_pred = 1UL;
    // structurals are persistent state across loop as we flatten them on the
    // subsequent iteration into our array pointed to be base_ptr.
    // This is harmless on the first iteration as structurals==0
    // and is done for performance reasons; we can hide some of the latency of the
    // expensive carryless multiply in the previous step with this work
    uint64_t structurals = 0;
    // Full 64-byte blocks are handled by the loop; the final (possibly partial) block is
    // handled by the padded copy further down.
    size_t lenminus64 = len < 64 ? 0 : len - 64;
    size_t idx = 0;
    uint64_t error_mask = 0; // for unescaped characters within strings (ASCII code points < 0x20)
    for (; idx < lenminus64; idx += 64)
    {
        //__builtin_prefetch(buf + idx + 128);
        simd_input @in = fill_input(buf + idx);
#if SIMDJSON_UTF8VALIDATE
        check_utf8(in, state);
#endif
        // detect odd sequences of backslashes
        uint64_t odd_ends = find_odd_backslash_sequences( @in, ref prev_iter_ends_odd_backslash);
        // detect insides of quote pairs ("quote_mask") and also our quote_bits
        // themselves
        uint64_t quote_bits = 0;
        uint64_t quote_mask = find_quote_mask_and_bits( @in, odd_ends, ref prev_iter_inside_quote, ref quote_bits, ref error_mask);
        // take the previous iterations structural bits, not our current iteration,
        // and flatten
        flatten_bits(base_ptr, ref @base, (uint32_t)idx, structurals);
        uint64_t whitespace = 0;
        find_whitespace_and_structurals(@in, ref whitespace, ref structurals);
        // fixup structurals to reflect quotes and add pseudo-structural characters
        structurals = finalize_structurals(structurals, whitespace, quote_mask, quote_bits, ref prev_iter_ends_pseudo_pred);
    }
    ////////////////
    // we use a giant copy-paste which is ugly.
    // but otherwise the string needs to be properly padded or else we
    // risk invalidating the UTF-8 checks.
    ////////////
    if (idx < len)
    {
        // Copy the remaining bytes into a 64-byte scratch block pre-filled with spaces (0x20).
        uint8_t *tmpbuf = stackalloc uint8_t[64];
        memset(tmpbuf, 0x20, 64);
        memcpy(tmpbuf, buf + idx, len - idx);
        simd_input @in = fill_input(tmpbuf);
#if SIMDJSON_UTF8VALIDATE
        check_utf8 <T>(in, state);
#endif
        // detect odd sequences of backslashes
        uint64_t odd_ends = find_odd_backslash_sequences( @in, ref prev_iter_ends_odd_backslash);
        // detect insides of quote pairs ("quote_mask") and also our quote_bits
        // themselves
        uint64_t quote_bits = 0;
        uint64_t quote_mask = find_quote_mask_and_bits( @in, odd_ends, ref prev_iter_inside_quote, ref quote_bits, ref error_mask);
        // take the previous iterations structural bits, not our current iteration,
        // and flatten
        flatten_bits(base_ptr, ref @base, (uint)idx, structurals);
        uint64_t whitespace = 0;
        find_whitespace_and_structurals(@in, ref whitespace, ref structurals);
        // fixup structurals to reflect quotes and add pseudo-structural characters
        structurals = finalize_structurals(structurals, whitespace, quote_mask, quote_bits, ref prev_iter_ends_pseudo_pred);
        idx += 64;
    }
    // is last string quote closed?
    if (prev_iter_inside_quote != 0)
    {
        return(JsonParseError.UNCLOSED_STRING);
    }
    // finally, flatten out the remaining structurals from the last iteration
    flatten_bits(base_ptr, ref @base, (uint)idx, structurals);
    pj.n_structural_indexes = @base;
    // a valid JSON file cannot have zero structural indexes - we should have
    // found something
    if (pj.n_structural_indexes == 0u)
    {
        return(JsonParseError.EMPTY);
    }
    if (base_ptr[pj.n_structural_indexes - 1] > len)
    {
        return(JsonParseError.UNEXPECTED_ERROR);
    }
    if (len != base_ptr[pj.n_structural_indexes - 1])
    {
        // the string might not be NULL terminated, but we add a virtual NULL ending
        // character.
        base_ptr[pj.n_structural_indexes++] = (uint)len;
    }
    // make it safe to dereference one beyond this array
    base_ptr[pj.n_structural_indexes] = 0;
    if (error_mask != 0)
    {
        return(JsonParseError.UNESCAPED_CHARS);
    }
#if SIMDJSON_UTF8VALIDATE
    return(check_utf8_errors(state));
#else
    return(JsonParseError.SUCCESS);
#endif
}
/// <summary>
/// Convenience overload: reinterprets <paramref name="buf"/> as a <c>uint8_t</c> pointer and forwards
/// to the byte-pointer overload with the same <paramref name="len"/> and <paramref name="pj"/>.
/// </summary>
internal static JsonParseError find_structural_bits(char1 *buf, size_t len, ParsedJson pj)
{
    uint8_t *bytes = (uint8_t *)(buf);
    return find_structural_bits(bytes, len, pj);
}
// called by parse_number when we know that the output is a float,
// but where there might be some integer overflow. The trick here is to
// parse using floats from the start.
// Do not call this function directly as it skips some of the checks from
// parse_number
//
// This function will almost never be called!!!
//
// Note: a redesign could avoid this function entirely.
//
// Parameters: buf + offset is where the number text starts (including the '-'
// when found_minus is true). On success the value is written to pj through
// WriteTapeDouble.
// Returns false when a '.' is not followed by a digit, when the exponent has no
// digit or more than four digits, when a positive exponent exceeds 308, or when
// the character after the number is not structural/whitespace.
static bool parse_float(uint8_t *buf, ParsedJson pj, uint32_t offset, bool found_minus)
{
    char1 *p = (char1 *)(buf + offset);
    bool negative = false;
    if (found_minus)
    {
        ++p;
        negative = true;
    }
    /*long*/ double i; // running value, accumulated directly in floating point
    if (*p == '0')
    { // 0 cannot be followed by an integer
        ++p;
        i = 0;
    }
    else
    {
        uchar1 digit = (uchar1)(*p - (uchar1)'0');
        i = digit;
        p++;
        while (is_integer(*p))
        {
            digit = (uchar1)(*p - (uchar1)'0');
            i = 10 * i + digit;
            ++p;
        }
    }
    if ('.' == *p)
    {
        ++p;
        // Index into power_of_ten for the weight of the next fractional digit. The table is
        // used with a +308 bias elsewhere in this function, so 308 stands for 10^0 and each
        // decrement moves one decimal place down.
        int fractionalweight = 308;
        if (is_integer(*p))
        {
            uchar1 digit = (uchar1)(*p - (uchar1)'0');
            ++p;
            fractionalweight--;
            // once the index drops below 0 the digit contributes nothing
            i = i + digit * (fractionalweight >= 0 ? power_of_ten[fractionalweight] : 0);
        }
        else
        {
            // at least one digit is required after the decimal point
            return(false);
        }
        while (is_integer(*p))
        {
            uchar1 digit = (uchar1)(*p - (uchar1)'0');
            ++p;
            fractionalweight--;
            i = i + digit * (fractionalweight >= 0 ? power_of_ten[fractionalweight] : 0);
        }
    }
    if (('e' == *p) || ('E' == *p))
    {
        ++p;
        bool negexp = false;
        if ('-' == *p)
        {
            negexp = true;
            ++p;
        }
        else if ('+' == *p)
        {
            ++p;
        }
        if (!is_integer(*p))
        {
            return(false);
        }
        uchar1 digit = (uchar1)(*p - (uchar1)'0');
        int64_t expnumber = digit; // exponential part
        p++;
        // Up to three further exponent digits are accepted (four in total).
        if (is_integer(*p))
        {
            digit = (uchar1)(*p - (uchar1)'0');
            expnumber = 10 * expnumber + digit;
            ++p;
        }
        if (is_integer(*p))
        {
            digit = (uchar1)(*p - (uchar1)'0');
            expnumber = 10 * expnumber + digit;
            ++p;
        }
        if (is_integer(*p))
        {
            digit = (uchar1)(*p - (uchar1)'0');
            expnumber = 10 * expnumber + digit;
            ++p;
        }
        if (is_integer(*p))
        {
            // a fifth exponent digit: refuse to parse
            return(false);
        }
        if (/*unlikely*/ (expnumber > 308))
        { // C# needs unlikely!
            // this path is unlikely
            if (negexp)
            {
                // We either have zero or a subnormal.
                // We expect this to be uncommon so we go through a slow path.
                i = subnormal_power10(i, (int)-expnumber);
            }
            else
            {
                // We know for sure that we have a number that is too large,
                // we refuse to parse this
                return(false);
            }
        }
        else
        {
            int exponent = (int)(negexp ? -expnumber : expnumber);
            // we have that expnumber is [0,308] so that
            // exponent is [-308,308] so that
            // 308 + exponent is in [0, 2 * 308]
            i *= power_of_ten[308 + exponent];
        }
    }
    if (is_not_structural_or_whitespace((uint8_t)(*p)) != 0)
    {
        return(false);
    }
    double d = negative ? -i : i;
    pj.WriteTapeDouble(d);
    return(is_structural_or_whitespace((uint8_t)(*p)) != 0);
}
// Slow-path integer parser, invoked by parse_number when the fast path may have
// overflowed. Every multiply/add is overflow-checked and the final magnitude is
// range-checked against int64 before it is written to the tape.
// Do not call this function directly: it skips checks that parse_number performs.
//
// This function will almost never be called!!!
//
static bool parse_large_integer(uint8_t *buf, ParsedJson pj, uint32_t offset, bool found_minus)
{
    bytechar *cursor = (bytechar *)(buf + offset);
    if (found_minus)
    {
        ++cursor; // step over the sign
    }

    uint64_t magnitude;
    if (*cursor == '0')
    {
        // 0 cannot be followed by an integer
        ++cursor;
        magnitude = 0;
    }
    else
    {
        unsigned_bytechar digit = (unsigned_bytechar)(*cursor - '0');
        magnitude = digit;
        cursor++;
        // the is_made_of_eight_digits_fast routine is unlikely to help here because
        // we rarely see large integer parts like 123456789
        for (; is_integer(*cursor); ++cursor)
        {
            digit = (unsigned_bytechar)(*cursor - '0');
            // multiply first, then add; either step overflowing rejects the number
            if (mul_overflow(magnitude, 10, &magnitude) || add_overflow(magnitude, digit, &magnitude))
            {
#if JSON_TEST_NUMBERS // for unit testing
                foundInvalidNumber(buf + offset);
#endif
                return(false); // overflow
            }
        }
    }

    // A negative value may reach 2^63 in magnitude; a non-negative one must stay below it.
    bool outOfRange = found_minus
        ? magnitude > 0x8000000000000000
        : magnitude >= 0x8000000000000000;
    if (outOfRange)
    {
#if JSON_TEST_NUMBERS // for unit testing
        foundInvalidNumber(buf + offset);
#endif
        return(false); // overflow
    }

    int64_t signed_answer;
    if (found_minus)
    {
        signed_answer = -(int64_t)magnitude;
    }
    else
    {
        signed_answer = (int64_t)magnitude;
    }
    pj.WriteTapeInt64(signed_answer);
#if JSON_TEST_NUMBERS // for unit testing
    foundInteger(signed_answer, buf + offset);
#endif

    return(is_structural_or_whitespace((byte)(*cursor)) != 0);
}
/// <summary>
/// Parses the JSON number at <c>buf + offset</c> and writes it to <paramref name="pj"/> as an int64 or a
/// double. Digits are accumulated in a 64-bit integer; when the digit count suggests that integer may
/// have overflowed, parsing restarts in <c>parse_float</c> or <c>parse_large_integer</c>.
/// </summary>
/// <param name="buf">Input bytes.</param>
/// <param name="pj">Document whose tape receives the value.</param>
/// <param name="offset">Index of the first character of the number (the '-' when present).</param>
/// <param name="found_minus">True when <c>buf[offset]</c> is a minus sign.</param>
/// <returns>
/// False on malformed input (no digit after '-', '.', or the exponent marker; more than four exponent
/// digits; a non-zero value whose decimal exponent is outside [-308, 308]; a leading 0 followed by a
/// disallowed character). Otherwise whether the character after the number is structural/whitespace,
/// or the result of the slow-path parser.
/// </returns>
public static bool parse_number(uint8_t *buf, ParsedJson pj, uint32_t offset, bool found_minus)
{
    bytechar *p = (bytechar *)(buf + offset);
    bool negative = false;
    if (found_minus)
    {
        ++p;
        negative = true;
        if (!is_integer(*p))
        { // a negative sign must be followed by an integer
            return(false);
        }
    }
    bytechar *startdigits = p;
    int64_t i;
    if (*p == '0')
    { // 0 cannot be followed by an integer
        ++p;
        if (is_not_structural_or_whitespace_or_exponent_or_decimal((uint8_t)(*p)))
        {
#if JSON_TEST_NUMBERS // for unit testing
            foundInvalidNumber(buf + offset);
#endif
            return(false);
        }
        i = 0;
    }
    else
    {
        if (!(is_integer(*p)))
        { // must start with an integer
#if JSON_TEST_NUMBERS // for unit testing
            foundInvalidNumber(buf + offset);
#endif
            return(false);
        }
        unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
        i = digit;
        p++;
        // the is_made_of_eight_digits_fast routine is unlikely to help here because
        // we rarely see large integer parts like 123456789
        while (is_integer(*p))
        {
            digit = (unsigned_bytechar)(*p - '0');
            i = 10 * i + digit; // might overflow; the digitcount checks below re-route such inputs
            ++p;
        }
    }
    int64_t exponent = 0;
    if ('.' == *p)
    {
        ++p;
        bytechar *firstafterperiod = p;
        if (is_integer(*p))
        {
            unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
            ++p;
            i = i * 10 + digit;
        }
        else
        {
            // at least one digit is required after the decimal point
#if JSON_TEST_NUMBERS // for unit testing
            foundInvalidNumber(buf + offset);
#endif
            return(false);
        }
#if SWAR_NUMBER_PARSING
        // this helps if we have lots of decimals!
        // this turns out to be frequent enough.
        if (is_made_of_eight_digits_fast(p))
        {
            i = i * 100000000 + parse_eight_digits_unrolled(p);
            p += 8;
        }
#endif
        while (is_integer(*p))
        {
            unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
            ++p;
            i = i * 10 + digit; // in rare cases, this will overflow, but that's ok because we have parse_highprecision_float later.
        }
        // Negative count of fractional digits: the value is i * 10^exponent.
        exponent = firstafterperiod - p;
    }
    int digitcount = (int)(p - startdigits - 1);
    int64_t expnumber = 0; // exponential part
    if (('e' == *p) || ('E' == *p))
    {
        ++p;
        bool negexp = false;
        if ('-' == *p)
        {
            negexp = true;
            ++p;
        }
        else if ('+' == *p)
        {
            ++p;
        }
        if (!is_integer(*p))
        {
#if JSON_TEST_NUMBERS // for unit testing
            foundInvalidNumber(buf + offset);
#endif
            return(false);
        }
        unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
        expnumber = digit;
        p++;
        // Accept at most three more exponent digits (four in total). This step used to be an
        // unbounded `while`, which let expnumber wrap around silently on very long exponents
        // and made the bounded checks after it unreachable.
        if (is_integer(*p))
        {
            digit = (unsigned_bytechar)(*p - '0');
            expnumber = 10 * expnumber + digit;
            ++p;
        }
        if (is_integer(*p))
        {
            digit = (unsigned_bytechar)(*p - '0');
            expnumber = 10 * expnumber + digit;
            ++p;
        }
        if (is_integer(*p))
        {
            digit = (unsigned_bytechar)(*p - '0');
            expnumber = 10 * expnumber + digit;
            ++p;
        }
        if (is_integer(*p))
        { // we refuse to parse this
#if JSON_TEST_NUMBERS // for unit testing
            foundInvalidNumber(buf + offset);
#endif
            return(false);
        }
        exponent += (negexp ? -expnumber : expnumber);
    }
    i = negative ? -i : i; // the sign is applied to the integer, so no later negation is needed
    if ((exponent != 0) || (expnumber != 0))
    {
        if ((digitcount >= 19))
        { // this is uncommon!!!
            // this is almost never going to get called!!!
            // we start anew, going slowly!!!
            return(parse_float(buf, pj, offset, found_minus));
        }
        ///////////
        // We want 0.1e1 to be a float.
        //////////
        if (i == 0)
        {
            pj.WriteTapeDouble(0.0);
#if JSON_TEST_NUMBERS // for unit testing
            foundFloat(0.0, buf + offset);
#endif
        }
        else
        {
            if ((exponent > 308) || (exponent < -308))
            { // we refuse to parse this
#if JSON_TEST_NUMBERS // for unit testing
                foundInvalidNumber(buf + offset);
#endif
                return(false);
            }
            double d = i;
            // power_of_ten is indexed with a +308 bias.
            d *= power_of_ten[308 + exponent];
            pj.WriteTapeDouble(d);
#if JSON_TEST_NUMBERS // for unit testing
            foundFloat(d, buf + offset);
#endif
        }
    }
    else
    {
        if ((digitcount >= 18))
        { // this is uncommon!!!
            return(parse_large_integer(buf, pj, offset, found_minus));
        }
        pj.WriteTapeInt64(i);
#if JSON_TEST_NUMBERS // for unit testing
        foundInteger(i, buf + offset);
#endif
    }
    return(is_structural_or_whitespace((uint8_t)(*p)) != 0);
}
/// <summary>
/// Second parsing stage: walks the structural positions recorded in
/// <c>pj.structural_indexes</c>, validates the grammar with a goto-driven state
/// machine and emits tape entries through <c>pj.WriteTape</c>.
/// </summary>
/// <param name="buf">Pointer to the input bytes.</param>
/// <param name="len">Number of input bytes.</param>
/// <param name="pj">Receives the tape, <c>isvalid</c> and <c>ErrorCode</c>.</param>
/// <returns>
/// The value stored in <c>pj.ErrorCode</c>: <c>SUCCESS</c> when the document was
/// accepted, otherwise <c>CAPACITY</c>, <c>DEPTH_ERROR</c>, or an error picked from
/// the last structural character that was examined.
/// </returns>
/// <exception cref="InvalidOperationException">
/// Thrown from the success state when the scope bookkeeping is inconsistent.
/// </exception>
internal static JsonParseError unified_machine(uint8_t *buf, size_t len, ParsedJson pj)
{
#if !ALLOW_SAME_PAGE_BUFFER_OVERRUN
    memset((uint8_t *)buf + len, 0, SIMDJSON_PADDING); // to please valgrind
#endif
    uint32_t i = 0;     // position inside pj.structural_indexes (0,1,2,3...)
    uint32_t idx;       // offset in buf of the structural character being examined
    uint8_t c = 0;      // the structural character itself; refreshed together with idx
    uint32_t depth = 0; // current nesting level

    pj.Init(); // sets isvalid to false
    if (pj.bytecapacity < len)
    {
        pj.ErrorCode = JsonParseError.CAPACITY;
        return pj.ErrorCode;
    }

    ////////////////////////////// START STATE /////////////////////////////
    pj.ret_address[depth] = (bytechar)'s';
    pj.containing_scope_offset[depth] = pj.CurrentLoc;
    // 'r' marks the root; the 0 payload is overwritten in the success state,
    // where the root entry ends up recording the size of the tape.
    pj.WriteTape(0, (uint8_t)'r');
    // depth 0 is reserved for the root; the document's own value lives at depth 1.
    depth++;
    if (depth >= pj.depthcapacity)
    {
        goto fail;
    }

    // advance to the next structural character
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
        case (uint8_t)'{':
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.ret_address[depth] = (bytechar)'s';
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            pj.WriteTape(0, c); // strangely, moving this to object_begin slows things down
            goto object_begin;

        case (uint8_t)'[':
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.ret_address[depth] = (bytechar)'s';
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            goto array_begin;

        // RFC 8259 (https://tools.ietf.org/html/rfc8259): a JSON text is a serialized
        // value. Earlier specifications restricted a JSON text to an object or an
        // array; producers that stick to those remain interoperable. The scalar
        // root cases below are only compiled in when the symbol is defined.
#if SIMDJSON_ALLOWANYTHINGINROOT
        case (uint8_t)'"':
        {
            if (!parse_string(buf, len, pj, depth, idx))
            {
                goto fail;
            }
            break;
        }

        case (uint8_t)'t':
        {
            // A document consisting solely of `true`: validate against a scratch
            // copy that is terminated with a space. Almost never hit in practice.
            bytechar *copy = (bytechar *)(allocate <bytechar>(len + SIMDJSON_PADDING));
            if (copy == null)
            {
                goto fail;
            }
            memcpy(copy, buf, len);
            copy[len] = (bytechar)' ';
            bool atomOk = is_valid_true_atom((uint8_t *)(copy) + idx);
            free(copy);
            if (!atomOk)
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;
        }

        case (uint8_t)'f':
        {
            // A document consisting solely of `false`: same scratch-copy approach.
            bytechar *copy = (bytechar *)(allocate <bytechar>(len + SIMDJSON_PADDING));
            if (copy == null)
            {
                goto fail;
            }
            memcpy(copy, buf, len);
            copy[len] = (bytechar)' ';
            bool atomOk = is_valid_false_atom((uint8_t *)(copy) + idx);
            free(copy);
            if (!atomOk)
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;
        }

        case (uint8_t)'n':
        {
            // A document consisting solely of `null`: same scratch-copy approach.
            bytechar *copy = (bytechar *)(allocate <bytechar>(len + SIMDJSON_PADDING));
            if (copy == null)
            {
                goto fail;
            }
            memcpy(copy, buf, len);
            copy[len] = (bytechar)' ';
            bool atomOk = is_valid_null_atom((uint8_t *)(copy) + idx);
            free(copy);
            if (!atomOk)
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;
        }

        case (uint8_t)'0':
        case (uint8_t)'1':
        case (uint8_t)'2':
        case (uint8_t)'3':
        case (uint8_t)'4':
        case (uint8_t)'5':
        case (uint8_t)'6':
        case (uint8_t)'7':
        case (uint8_t)'8':
        case (uint8_t)'9':
        {
            // A document made of a sole number. The scratch copy is terminated with
            // a space because NULLs must not be allowed in the middle of a number
            // (a space in the middle of a number would be identified in stage 1).
            bytechar *copy = (bytechar *)(allocate <bytechar>(len + SIMDJSON_PADDING));
            if (copy == null)
            {
                goto fail;
            }
            memcpy(copy, buf, len);
            copy[len] = (bytechar)' ';
            bool numberOk = parse_number((uint8_t *)(copy), pj, idx, false);
            free(copy);
            if (!numberOk)
            {
                goto fail;
            }
            break;
        }

        case (uint8_t)'-':
        {
            // A document made of a sole negative number; here the scratch copy is
            // NULL terminated.
            bytechar *copy = (bytechar *)(allocate <bytechar>(len + SIMDJSON_PADDING));
            if (copy == null)
            {
                goto fail;
            }
            memcpy(copy, buf, len);
            copy[len] = (bytechar)'\0';
            bool numberOk = parse_number((uint8_t *)(copy), pj, idx, true);
            free(copy);
            if (!numberOk)
            {
                goto fail;
            }
            break;
        }
#endif // ALLOWANYTHINGINROOT

        default:
            goto fail;
    }

start_continue:
    // the string might not be NULL terminated.
    if (i + 1 != pj.n_structural_indexes)
    {
        goto fail;
    }
    goto succeed;

    ////////////////////////////// OBJECT STATES /////////////////////////////
object_begin:
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
        case (uint8_t)'"':
            if (!parse_string(buf, len, pj, depth, idx))
            {
                goto fail;
            }
            goto object_key_state;

        case (uint8_t)'}':
            goto scope_end; // could also go to object_continue

        default:
            goto fail;
    }

object_key_state:
    // a key must be followed by ':' ...
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    if (c != ':')
    {
        goto fail;
    }

    // ... and then by a value
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
        case (uint8_t)'"':
            if (!parse_string(buf, len, pj, depth, idx))
            {
                goto fail;
            }
            break;

        case (uint8_t)'t':
            if (!is_valid_true_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;

        case (uint8_t)'f':
            if (!is_valid_false_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;

        case (uint8_t)'n':
            if (!is_valid_null_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;

        case (uint8_t)'0':
        case (uint8_t)'1':
        case (uint8_t)'2':
        case (uint8_t)'3':
        case (uint8_t)'4':
        case (uint8_t)'5':
        case (uint8_t)'6':
        case (uint8_t)'7':
        case (uint8_t)'8':
        case (uint8_t)'9':
        case (uint8_t)'-':
            // the last argument tells parse_number whether a leading minus was seen
            if (!parse_number(buf, pj, idx, c == (uint8_t)'-'))
            {
                goto fail;
            }
            break;

        case (uint8_t)'{':
            // object inside an object: remember to come back to the object states
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.WriteTape(0, c);
            pj.ret_address[depth] = (bytechar)'o';
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            goto object_begin;

        case (uint8_t)'[':
            // array inside an object: remember to come back to the object states
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.WriteTape(0, c);
            pj.ret_address[depth] = (bytechar)'o';
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            goto array_begin;

        default:
            goto fail;
    }

object_continue:
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
        case (uint8_t)',':
            // after a comma only another key is acceptable
            idx = pj.structural_indexes[i++];
            c = buf[idx];
            if (c != '"')
            {
                goto fail;
            }
            if (!parse_string(buf, len, pj, depth, idx))
            {
                goto fail;
            }
            goto object_key_state;

        case (uint8_t)'}':
            goto scope_end;

        default:
            goto fail;
    }

    ////////////////////////////// COMMON STATE /////////////////////////////
scope_end:
    // close the scope: the closing entry gets the opening entry's tape location,
    // and the opening entry is annotated with the current tape location
    depth--;
    pj.WriteTape(pj.containing_scope_offset[depth], c);
    pj.AnnotatePreviousLoc(pj.containing_scope_offset[depth], pj.CurrentLoc);
    // resume in whichever state opened this scope
    if (pj.ret_address[depth] == 'a')
    {
        goto array_continue;
    }
    if (pj.ret_address[depth] == 'o')
    {
        goto object_continue;
    }
    goto start_continue;

    ////////////////////////////// ARRAY STATES /////////////////////////////
array_begin:
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    if (c == (uint8_t)']')
    {
        goto scope_end; // could also go to array_continue
    }

main_array_switch:
    // every path into this label has already refreshed idx and c, which is what
    // lets array_begin peek at c for an immediate ']'
    switch (c)
    {
        case (uint8_t)'"':
            if (!parse_string(buf, len, pj, depth, idx))
            {
                goto fail;
            }
            break;

        case (uint8_t)'t':
            if (!is_valid_true_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;

        case (uint8_t)'f':
            if (!is_valid_false_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;

        case (uint8_t)'n':
            if (!is_valid_null_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;

        case (uint8_t)'0':
        case (uint8_t)'1':
        case (uint8_t)'2':
        case (uint8_t)'3':
        case (uint8_t)'4':
        case (uint8_t)'5':
        case (uint8_t)'6':
        case (uint8_t)'7':
        case (uint8_t)'8':
        case (uint8_t)'9':
        case (uint8_t)'-':
            // the last argument tells parse_number whether a leading minus was seen
            if (!parse_number(buf, pj, idx, c == (uint8_t)'-'))
            {
                goto fail;
            }
            break;

        case (uint8_t)'{':
            // object inside an array: remember to come back to the array states
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.WriteTape(0, c);
            pj.ret_address[depth] = (bytechar)'a';
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            goto object_begin;

        case (uint8_t)'[':
            // array inside an array: remember to come back to the array states
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.WriteTape(0, c);
            pj.ret_address[depth] = (bytechar)'a';
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            goto array_begin;

        default:
            goto fail;
    }

array_continue:
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
        case (uint8_t)',':
            idx = pj.structural_indexes[i++];
            c = buf[idx];
            goto main_array_switch;

        case (uint8_t)']':
            goto scope_end;

        default:
            goto fail;
    }

    ////////////////////////////// FINAL STATES /////////////////////////////
succeed:
    depth--;
    // we must be back at the root, and the root scope must start at tape location 0
    if (depth != 0 || pj.containing_scope_offset[depth] != 0)
    {
        throw new InvalidOperationException("internal bug");
    }
    pj.AnnotatePreviousLoc(pj.containing_scope_offset[depth], pj.CurrentLoc);
    pj.WriteTape(pj.containing_scope_offset[depth], (uint8_t)'r'); // r is root
    pj.isvalid = true;
    pj.ErrorCode = JsonParseError.SUCCESS;
    return pj.ErrorCode;

fail:
    // pj.isvalid needs no reset here: pj.Init() already cleared it pessimistically.
    // This path is not performance sensitive; idx and c still describe where the
    // machine stopped, so more detailed diagnostics could be produced here.
    if (depth >= pj.depthcapacity)
    {
        pj.ErrorCode = JsonParseError.DEPTH_ERROR;
    }
    else
    {
        // classify the failure by the structural character that was being handled
        switch (c)
        {
            case (uint8_t)'"':
                pj.ErrorCode = JsonParseError.STRING_ERROR;
                break;

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            case (uint8_t)'-':
                pj.ErrorCode = JsonParseError.NUMBER_ERROR;
                break;

            case (uint8_t)'t':
                pj.ErrorCode = JsonParseError.T_ATOM_ERROR;
                break;

            case (uint8_t)'n':
                pj.ErrorCode = JsonParseError.N_ATOM_ERROR;
                break;

            case (uint8_t)'f':
                pj.ErrorCode = JsonParseError.F_ATOM_ERROR;
                break;

            default:
                pj.ErrorCode = JsonParseError.TAPE_ERROR;
                break;
        }
    }
    return pj.ErrorCode;
}
/// <summary>
/// Convenience overload for a <c>bytechar*</c> buffer: casts the pointer to
/// <c>uint8_t*</c> and forwards to the <c>uint8_t*</c> overload, returning its result.
/// </summary>
internal static JsonParseError unified_machine(bytechar *buf, size_t len, ParsedJson pj)
{
    uint8_t *bytes = (uint8_t *)(buf);
    return unified_machine(bytes, len, pj);
}
// Parse entry point taking the input pointer, its length and the target ParsedJson.
// Returns JsonParseError.CAPACITY when pj.bytecapacity is smaller than length.
// NOTE(review): that capacity check is the only statement in the body — there is no
// return on the remaining path and reallocIfNeeded is never read. The rest of the
// implementation appears to be missing; confirm against the full version.
internal static JsonParseError JsonParse(byte *jsonData, UInt64 length, ParsedJson pj, bool reallocIfNeeded = true) { if (pj.bytecapacity < length) return(JsonParseError.CAPACITY); }
/// <summary>
/// Second parsing stage: walks the structural positions recorded in
/// <c>pj.structural_indexes</c>, validates the grammar with a goto-driven state
/// machine and emits tape entries through <c>pj.WriteTape</c>.
/// </summary>
/// <param name="buf">Pointer to the input bytes.</param>
/// <param name="len">Number of input bytes.</param>
/// <param name="pj">Receives the tape; <c>pj.isvalid</c> is set to true on success.</param>
/// <returns>
/// <c>true</c> when the document was accepted; <c>false</c> on insufficient capacity,
/// when the nesting limit is reached, or on any grammar/value error.
/// </returns>
/// <exception cref="InvalidOperationException">
/// Thrown from the success state when the scope bookkeeping is inconsistent.
/// </exception>
internal static bool unified_machine(uint8_t *buf, size_t len, ParsedJson pj)
{
    uint32_t i = 0;     // index of the structural character (0,1,2,3...)
    uint32_t idx;       // location of the structural character in the input (buf)
    uint8_t c;          // the structural character we are looking at; refreshed with idx
    uint32_t depth = 0; // could have an arbitrary starting depth

    pj.Init();
    if (pj.bytecapacity < len)
    {
        Debug.Write("insufficient capacity\n");
        return(false);
    }

    // "UPDATE_CHAR" below refers to reading the next structural character:
    //     idx = pj.structural_indexes[i++]; c = buf[idx];
    // It is expanded by hand at every use.

    ////////////////////////////// START STATE /////////////////////////////
    pj.ret_address[depth] = (bytechar)'s';
    pj.containing_scope_offset[depth] = pj.CurrentLoc;
    pj.WriteTape(0, (byte)'r'); // r for root, 0 is going to get overwritten
    // the root is used, if nothing else, to capture the size of the tape
    depth++; // everything starts at depth = 1, depth = 0 is just for the root
    // depth is used as an index into pj.ret_address / pj.containing_scope_offset
    // (presumably sized by depthcapacity — confirm in ParsedJson), so it has to stay
    // strictly below depthcapacity; hence >= rather than >.
    if (depth >= pj.depthcapacity)
    {
        goto fail;
    }

    // UPDATE_CHAR
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
        case (uint8_t)'{':
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.ret_address[depth] = (bytechar)'s';
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            pj.WriteTape(0, c); // strangely, moving this to object_begin slows things down
            goto object_begin;

        case (uint8_t)'[':
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.ret_address[depth] = (bytechar)'s';
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            goto array_begin;

        // A JSON text is a serialized value. Note that certain previous
        // specifications of JSON constrained a JSON text to be an object or an
        // array. Implementations that generate only objects or arrays where a
        // JSON text is called for will be interoperable in the sense that all
        // implementations will accept these as conforming JSON texts.
        // https://tools.ietf.org/html/rfc8259
#if SIMDJSON_ALLOWANYTHINGINROOT
        case (uint8_t)'"':
        {
            if (!parse_string(buf, len, pj, depth, idx))
            {
                goto fail;
            }
            break;
        }

        case (uint8_t)'t':
        {
            // we need to make a copy to make sure that the string is NULL terminated.
            // this only applies to the JSON document made solely of the true value.
            // this will almost never be called in practice
            bytechar *copy = allocate <bytechar>(len + SIMDJSON_PADDING);
            if (copy == null)
            {
                goto fail; // allocation failed: nothing to copy into
            }
            memcpy(copy, buf, len);
            copy[len] = (bytechar)'\0';
            if (!is_valid_true_atom((uint8_t *)copy + idx))
            {
                free(copy);
                goto fail;
            }
            free(copy);
            pj.WriteTape(0, c);
            break;
        }

        case (uint8_t)'f':
        {
            // we need to make a copy to make sure that the string is NULL terminated.
            // this only applies to the JSON document made solely of the false value.
            // this will almost never be called in practice
            bytechar *copy = allocate <bytechar>(len + SIMDJSON_PADDING);
            if (copy == null)
            {
                goto fail; // allocation failed: nothing to copy into
            }
            memcpy(copy, buf, len);
            copy[len] = (bytechar)'\0';
            if (!is_valid_false_atom((uint8_t *)copy + idx))
            {
                free(copy);
                goto fail;
            }
            free(copy);
            pj.WriteTape(0, c);
            break;
        }

        case (uint8_t)'n':
        {
            // we need to make a copy to make sure that the string is NULL terminated.
            // this only applies to the JSON document made solely of the null value.
            // this will almost never be called in practice
            bytechar *copy = allocate <bytechar>(len + SIMDJSON_PADDING);
            if (copy == null)
            {
                goto fail; // allocation failed: nothing to copy into
            }
            memcpy(copy, buf, len);
            copy[len] = (bytechar)'\0';
            if (!is_valid_null_atom((uint8_t *)copy + idx))
            {
                free(copy);
                goto fail;
            }
            free(copy);
            pj.WriteTape(0, c);
            break;
        }

        case (uint8_t)'0':
        case (uint8_t)'1':
        case (uint8_t)'2':
        case (uint8_t)'3':
        case (uint8_t)'4':
        case (uint8_t)'5':
        case (uint8_t)'6':
        case (uint8_t)'7':
        case (uint8_t)'8':
        case (uint8_t)'9':
        {
            // we need to make a copy to make sure that the string is NULL terminated.
            // this is done only for JSON documents made of a sole number
            // this will almost never be called in practice
            bytechar *copy = allocate <bytechar>(len + SIMDJSON_PADDING);
            if (copy == null)
            {
                goto fail; // allocation failed: nothing to copy into
            }
            memcpy(copy, buf, len);
            copy[len] = (bytechar)'\0';
            if (!parse_number((uint8_t *)copy, pj, idx, false))
            {
                free(copy);
                goto fail;
            }
            free(copy);
            break;
        }

        case (uint8_t)'-':
        {
            // we need to make a copy to make sure that the string is NULL terminated.
            // this is done only for JSON documents made of a sole number
            // this will almost never be called in practice
            bytechar *copy = allocate <bytechar>(len + SIMDJSON_PADDING);
            if (copy == null)
            {
                goto fail; // allocation failed: nothing to copy into
            }
            memcpy(copy, buf, len);
            copy[len] = (bytechar)'\0';
            if (!parse_number((uint8_t *)copy, pj, idx, true))
            {
                free(copy);
                goto fail;
            }
            free(copy);
            break;
        }
#endif // ALLOWANYTHINGINROOT

        default:
            goto fail;
    }

start_continue:
    // the string might not be NULL terminated.
    if (i + 1 == pj.n_structural_indexes)
    {
        goto succeed;
    }
    else
    {
        goto fail;
    }

    ////////////////////////////// OBJECT STATES /////////////////////////////
object_begin:
    // UPDATE_CHAR
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
        case (uint8_t)'"':
        {
            if (!parse_string(buf, len, pj, depth, idx))
            {
                goto fail;
            }
            goto object_key_state;
        }

        case (uint8_t)'}':
            goto scope_end; // could also go to object_continue

        default:
            goto fail;
    }

object_key_state:
    // UPDATE_CHAR: a key must be followed by ':'
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    if (c != ':')
    {
        goto fail;
    }

    // UPDATE_CHAR: then by the value
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
        case (uint8_t)'"':
        {
            if (!parse_string(buf, len, pj, depth, idx))
            {
                goto fail;
            }
            break;
        }

        case (uint8_t)'t':
            if (!is_valid_true_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;

        case (uint8_t)'f':
            if (!is_valid_false_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;

        case (uint8_t)'n':
            if (!is_valid_null_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;

        case (uint8_t)'0':
        case (uint8_t)'1':
        case (uint8_t)'2':
        case (uint8_t)'3':
        case (uint8_t)'4':
        case (uint8_t)'5':
        case (uint8_t)'6':
        case (uint8_t)'7':
        case (uint8_t)'8':
        case (uint8_t)'9':
        {
            if (!parse_number(buf, pj, idx, false))
            {
                goto fail;
            }
            break;
        }

        case (uint8_t)'-':
        {
            if (!parse_number(buf, pj, idx, true))
            {
                goto fail;
            }
            break;
        }

        case (uint8_t)'{':
        {
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.WriteTape(0, c);
            // we have not yet encountered } so we need to come back for it
            pj.ret_address[depth] = (bytechar)'o';
            // we found an object inside an object, so we need to increment the depth
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            goto object_begin;
        }

        case (uint8_t)'[':
        {
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.WriteTape(0, c);
            // we have not yet encountered } so we need to come back for it
            pj.ret_address[depth] = (bytechar)'o';
            // we found an array inside an object, so we need to increment the depth
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            goto array_begin;
        }

        default:
            goto fail;
    }

object_continue:
    // UPDATE_CHAR
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
        case (uint8_t)',':
            // UPDATE_CHAR: after a comma only another key is acceptable
            idx = pj.structural_indexes[i++];
            c = buf[idx];
            if (c != (uint8_t)'"')
            {
                goto fail;
            }
            else
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }
                goto object_key_state;
            }

        case (uint8_t)'}':
            goto scope_end;

        default:
            goto fail;
    }

    ////////////////////////////// COMMON STATE /////////////////////////////
scope_end:
    // write our tape location to the header scope
    depth--;
    pj.WriteTape(pj.containing_scope_offset[depth], c);
    pj.AnnotatePreviousLoc(pj.containing_scope_offset[depth], pj.CurrentLoc);
    // resume in whichever state opened this scope
    if (pj.ret_address[depth] == (uint8_t)'a')
    {
        goto array_continue;
    }
    else if (pj.ret_address[depth] == (uint8_t)'o')
    {
        goto object_continue;
    }
    else
    {
        goto start_continue;
    }

    ////////////////////////////// ARRAY STATES /////////////////////////////
array_begin:
    // UPDATE_CHAR
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    if (c == ']')
    {
        goto scope_end; // could also go to array_continue
    }

main_array_switch:
    // we call update char on all paths in, so we can peek at c on the
    // paths that can accept a close square brace (post-, and at start)
    switch (c)
    {
        case (uint8_t)'"':
        {
            if (!parse_string(buf, len, pj, depth, idx))
            {
                goto fail;
            }
            break;
        }

        case (uint8_t)'t':
            if (!is_valid_true_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;

        case (uint8_t)'f':
            if (!is_valid_false_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;

        case (uint8_t)'n':
            if (!is_valid_null_atom(buf + idx))
            {
                goto fail;
            }
            pj.WriteTape(0, c);
            break;

        case (uint8_t)'0':
        case (uint8_t)'1':
        case (uint8_t)'2':
        case (uint8_t)'3':
        case (uint8_t)'4':
        case (uint8_t)'5':
        case (uint8_t)'6':
        case (uint8_t)'7':
        case (uint8_t)'8':
        case (uint8_t)'9':
        {
            if (!parse_number(buf, pj, idx, false))
            {
                goto fail;
            }
            break;
        }

        case (uint8_t)'-':
        {
            if (!parse_number(buf, pj, idx, true))
            {
                goto fail;
            }
            break;
        }

        case (uint8_t)'{':
        {
            // we have not yet encountered ] so we need to come back for it
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.WriteTape(0, c);
            pj.ret_address[depth] = (bytechar)'a';
            // we found an object inside an array, so we need to increment the depth
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            goto object_begin;
        }

        case (uint8_t)'[':
        {
            // we have not yet encountered ] so we need to come back for it
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.WriteTape(0, c);
            pj.ret_address[depth] = (bytechar)'a';
            // we found an array inside an array, so we need to increment the depth
            depth++;
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }
            goto array_begin;
        }

        default:
            goto fail;
    }

array_continue:
    // UPDATE_CHAR
    idx = pj.structural_indexes[i++];
    c = buf[idx];
    switch (c)
    {
        case (uint8_t)',':
            // UPDATE_CHAR
            idx = pj.structural_indexes[i++];
            c = buf[idx];
            goto main_array_switch;

        case (uint8_t)']':
            goto scope_end;

        default:
            goto fail;
    }

    ////////////////////////////// FINAL STATES /////////////////////////////
succeed:
    depth--;
    if (depth != 0)
    {
        throw new InvalidOperationException("internal bug");
    }
    if (pj.containing_scope_offset[depth] != 0)
    {
        throw new InvalidOperationException("internal bug");
    }
    pj.AnnotatePreviousLoc(pj.containing_scope_offset[depth], pj.CurrentLoc);
    pj.WriteTape(pj.containing_scope_offset[depth], (byte)'r'); // r is root
    pj.isvalid = true;
    return(true);

fail:
    return(false);
}
internal static bool find_structural_bits(uint8_t* buf, size_t len, ParsedJson pj) { if (len > pj.bytecapacity) { Console.WriteLine("Your ParsedJson object only supports documents up to " + pj.bytecapacity + " bytes but you are trying to process " + len + " bytes\n"); return false; } uint32_t* base_ptr = pj.structural_indexes; uint32_t @base = 0; #if SIMDJSON_UTF8VALIDATE // NOT TESTED YET! var has_error = Vector256<byte>.Zero; var previous = new avx_processed_utf_bytes(); previous.rawbytes = Vector256<byte>.Zero; previous.high_nibbles = Vector256<byte>.Zero; previous.carried_continuations = Vector256<byte>.Zero; var highbit = Vector256.Create((byte)0x80); #endif const uint64_t even_bits = 0x5555555555555555UL; const uint64_t odd_bits = ~even_bits; // for now, just work in 64-byte chunks // we have padded the input out to 64 byte multiple with the remainder being // zeros // persistent state across loop uint64_t prev_iter_ends_odd_backslash = 0UL; // either 0 or 1, but a 64-bit value uint64_t prev_iter_inside_quote = 0UL; // either all zeros or all ones // effectively the very first char is considered to follow "whitespace" for the // purposes of psuedo-structural character detection uint64_t prev_iter_ends_pseudo_pred = 1UL; size_t lenminus64 = len < 64 ? 
0 : len - 64; size_t idx = 0; uint64_t structurals = 0; // C#: assign static readonly fields to locals before the loop Vector256<byte> low_nibble_mask = s_low_nibble_mask; Vector256<byte> high_nibble_mask = s_high_nibble_mask; Vector256<byte> utf8ValidVec = s_utf8ValidVec; var structural_shufti_mask = Vector256.Create((byte)0x7); var whitespace_shufti_mask = Vector256.Create((byte)0x18); var slashVec = Vector256.Create((bytechar) '\\').AsByte(); var ffVec = Vector128.Create((byte) 0xFF).AsUInt64(); var doubleQuoteVec = Vector256.Create((byte)'"'); var zeroBVec = Vector256.Create((byte) 0); var vec7f = Vector256.Create((byte) 0x7f); for (; idx < lenminus64; idx += 64) { var input_lo = Avx.LoadVector256(buf + idx + 0); var input_hi = Avx.LoadVector256(buf + idx + 32); #if SIMDJSON_UTF8VALIDATE // NOT TESTED YET! if ((Avx.TestZ(Avx2.Or(input_lo, input_hi), highbit)) == true) { // it is ascii, we just check continuation has_error = Avx2.Or( Avx2.CompareGreaterThan(previous.carried_continuations.AsSByte(), utf8ValidVec, has_error); } else { // it is not ascii so we have to do heavy work previous = Utf8Validation.avxcheckUTF8Bytes(input_lo, ref previous, ref has_error); previous = Utf8Validation.avxcheckUTF8Bytes(input_hi, ref previous, ref has_error); } #endif //////////////////////////////////////////////////////////////////////////////////////////// // Step 1: detect odd sequences of backslashes //////////////////////////////////////////////////////////////////////////////////////////// /// uint64_t bs_bits = cmp_mask_against_input(input_lo, input_hi, slashVec); uint64_t start_edges = bs_bits & ~(bs_bits << 1); // flip lowest if we have an odd-length run at the end of the prior // iteration uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; uint64_t even_starts = start_edges & even_start_mask; uint64_t odd_starts = start_edges & ~even_start_mask; uint64_t even_carries = bs_bits + even_starts; uint64_t odd_carries; // must record the carry-out of our 
odd-carries out of bit 63; this // indicates whether the sense of any edge going to the next iteration // should be flipped bool iter_ends_odd_backslash = add_overflow(bs_bits, odd_starts, &odd_carries); odd_carries |= prev_iter_ends_odd_backslash; // push in bit zero as a potential end // if we had an odd-numbered run at the // end of the previous iteration prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1UL : 0x0UL; uint64_t even_carry_ends = even_carries & ~bs_bits; uint64_t odd_carry_ends = odd_carries & ~bs_bits; uint64_t even_start_odd_end = even_carry_ends & odd_bits; uint64_t odd_start_even_end = odd_carry_ends & even_bits; uint64_t odd_ends = even_start_odd_end | odd_start_even_end; //////////////////////////////////////////////////////////////////////////////////////////// // Step 2: detect insides of quote pairs //////////////////////////////////////////////////////////////////////////////////////////// uint64_t quote_bits = cmp_mask_against_input(input_lo, input_hi, doubleQuoteVec); quote_bits = quote_bits & ~odd_ends; uint64_t quote_mask = Sse2.X64.ConvertToUInt64(Pclmulqdq.CarrylessMultiply( Vector128.Create(quote_bits, 0UL /*C# reversed*/), ffVec, 0)); uint32_t cnt = (uint32_t) hamming(structurals); uint32_t next_base = @base + cnt; while (structurals != 0) { base_ptr[@base + 0] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals); structurals = structurals & (structurals - 1); base_ptr[@base + 1] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals); structurals = structurals & (structurals - 1); base_ptr[@base + 2] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals); structurals = structurals & (structurals - 1); base_ptr[@base + 3] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals); structurals = structurals & (structurals - 1); base_ptr[@base + 4] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals); structurals = structurals & (structurals - 1); base_ptr[@base + 5] = (uint32_t) idx - 
64 + (uint32_t) trailingzeroes(structurals); structurals = structurals & (structurals - 1); base_ptr[@base + 6] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals); structurals = structurals & (structurals - 1); base_ptr[@base + 7] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals); structurals = structurals & (structurals - 1); @base += 8; } @base = next_base; quote_mask ^= prev_iter_inside_quote; prev_iter_inside_quote = (uint64_t) ((int64_t) quote_mask >> 63); // right shift of a signed value expected to be well-defined and standard compliant as of C++20, John Regher from Utah U. says this is fine code var v_lo = Avx2.And( Avx2.Shuffle(low_nibble_mask, input_lo), Avx2.Shuffle(high_nibble_mask, Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(), vec7f))); var v_hi = Avx2.And( Avx2.Shuffle(low_nibble_mask, input_hi), Avx2.Shuffle(high_nibble_mask, Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(), vec7f))); var tmp_lo = Avx2.CompareEqual( Avx2.And(v_lo, structural_shufti_mask), zeroBVec); var tmp_hi = Avx2.CompareEqual( Avx2.And(v_hi, structural_shufti_mask), zeroBVec); uint64_t structural_res_0 = (uint32_t) Avx2.MoveMask(tmp_lo); uint64_t structural_res_1 = (uint64_t) Avx2.MoveMask(tmp_hi); structurals = ~(structural_res_0 | (structural_res_1 << 32)); var tmp_ws_lo = Avx2.CompareEqual( Avx2.And(v_lo, whitespace_shufti_mask), zeroBVec); var tmp_ws_hi = Avx2.CompareEqual( Avx2.And(v_hi, whitespace_shufti_mask), zeroBVec); uint64_t ws_res_0 = (uint32_t) Avx2.MoveMask(tmp_ws_lo); uint64_t ws_res_1 = (uint64_t) Avx2.MoveMask(tmp_ws_hi); uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32)); // mask off anything inside quotes structurals &= ~quote_mask; // add the real quote bits back into our bitmask as well, so we can // quickly traverse the strings we've spent all this trouble gathering structurals |= quote_bits; // Now, establish "pseudo-structural characters". 
These are non-whitespace // characters that are (a) outside quotes and (b) have a predecessor that's // either whitespace or a structural character. This means that subsequent // passes will get a chance to encounter the first character of every string // of non-whitespace and, if we're parsing an atom like true/false/null or a // number we can stop at the first whitespace or structural character // following it. // a qualified predecessor is something that can happen 1 position before an // psuedo-structural character uint64_t pseudo_pred = structurals | whitespace; uint64_t shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred; prev_iter_ends_pseudo_pred = pseudo_pred >> 63; uint64_t pseudo_structurals = shifted_pseudo_pred & (~whitespace) & (~quote_mask); structurals |= pseudo_structurals; // now, we've used our close quotes all we need to. So let's switch them off // they will be off in the quote mask and on in quote bits. structurals &= ~(quote_bits & ~quote_mask); //Console.WriteLine($"Iter: {idx}, satur: {structurals}"); //*(uint64_t *)(pj.structurals + idx / 8) = structurals; } //////////////// /// we use a giant copy-paste which is ugly. /// but otherwise the string needs to be properly padded or else we /// risk invalidating the UTF-8 checks. //////////// if (idx < len) { uint8_t* tmpbuf = stackalloc uint8_t[64]; memset(tmpbuf, 0x20, 64); memcpy(tmpbuf, buf + idx, len - idx); Vector256<byte> input_lo = Avx.LoadVector256(tmpbuf + 0); Vector256<byte> input_hi = Avx.LoadVector256(tmpbuf + 32); #if SIMDJSON_UTF8VALIDATE // NOT TESTED YET! 
var highbit = Vector256.Create((byte)0x80); if ((Avx.TestZ(Avx2.Or(input_lo, input_hi), highbit)) == true) { // it is ascii, we just check continuation has_error = Avx2.Or( Avx2.CompareGreaterThan(previous.carried_continuations.AsSByte(), utf8ValidVec).AsByte(), has_error); } else { // it is not ascii so we have to do heavy work previous = Utf8Validation.avxcheckUTF8Bytes(input_lo, ref previous, ref has_error); previous = Utf8Validation.avxcheckUTF8Bytes(input_hi, ref previous, ref has_error); } #endif //////////////////////////////////////////////////////////////////////////////////////////// // Step 1: detect odd sequences of backslashes //////////////////////////////////////////////////////////////////////////////////////////// uint64_t bs_bits = cmp_mask_against_input(input_lo, input_hi, slashVec); uint64_t start_edges = bs_bits & ~(bs_bits << 1); // flip lowest if we have an odd-length run at the end of the prior // iteration uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash; uint64_t even_starts = start_edges & even_start_mask; uint64_t odd_starts = start_edges & ~even_start_mask; uint64_t even_carries = bs_bits + even_starts; uint64_t odd_carries; // must record the carry-out of our odd-carries out of bit 63; this // indicates whether the sense of any edge going to the next iteration // should be flipped //bool iter_ends_odd_backslash = add_overflow(bs_bits, odd_starts, &odd_carries); odd_carries |= prev_iter_ends_odd_backslash; // push in bit zero as a potential end // if we had an odd-numbered run at the // end of the previous iteration //prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 
0x1ULL : 0x0ULL; uint64_t even_carry_ends = even_carries & ~bs_bits; uint64_t odd_carry_ends = odd_carries & ~bs_bits; uint64_t even_start_odd_end = even_carry_ends & odd_bits; uint64_t odd_start_even_end = odd_carry_ends & even_bits; uint64_t odd_ends = even_start_odd_end | odd_start_even_end; //////////////////////////////////////////////////////////////////////////////////////////// // Step 2: detect insides of quote pairs //////////////////////////////////////////////////////////////////////////////////////////// uint64_t quote_bits = cmp_mask_against_input(input_lo, input_hi, doubleQuoteVec); quote_bits = quote_bits & ~odd_ends; uint64_t quote_mask = (uint64_t)Sse2.X64.ConvertToInt64(Pclmulqdq.CarrylessMultiply( Vector128.Create(quote_bits, 0UL /*C# reversed*/), ffVec, 0).AsInt64()); quote_mask ^= prev_iter_inside_quote; //BUG? https://github.com/dotnet/coreclr/issues/22813 //quote_mask = 60; //prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63); // right shift of a signed value expected to be well-defined and standard compliant as of C++20 uint32_t cnt = (uint32_t)hamming(structurals); uint32_t next_base = @base + cnt; while (structurals != 0) { base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals); structurals = structurals & (structurals - 1); base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals); structurals = structurals & (structurals - 1); base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals); structurals = structurals & (structurals - 1); base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals); structurals = structurals & (structurals - 1); base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals); structurals = structurals & (structurals - 1); base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals); structurals = structurals & (structurals - 1); base_ptr[@base + 6] = (uint32_t)idx - 64 + 
(uint32_t)trailingzeroes(structurals); structurals = structurals & (structurals - 1); base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals); structurals = structurals & (structurals - 1); @base += 8; } @base = next_base; // How do we build up a user traversable data structure // first, do a 'shufti' to detect structural JSON characters // they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c // these go into the first 3 buckets of the comparison (1/2/4) // we are also interested in the four whitespace characters // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d // these go into the next 2 buckets of the comparison (8/16) var v_lo = Avx2.And( Avx2.Shuffle(low_nibble_mask, input_lo), Avx2.Shuffle(high_nibble_mask, Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(), vec7f))); var v_hi = Avx2.And( Avx2.Shuffle(low_nibble_mask, input_hi), Avx2.Shuffle(high_nibble_mask, Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(), vec7f))); var tmp_lo = Avx2.CompareEqual( Avx2.And(v_lo, structural_shufti_mask), zeroBVec); var tmp_hi = Avx2.CompareEqual( Avx2.And(v_hi, structural_shufti_mask), zeroBVec); uint64_t structural_res_0 = (uint32_t)Avx2.MoveMask(tmp_lo); uint64_t structural_res_1 = (uint64_t)Avx2.MoveMask(tmp_hi); structurals = ~(structural_res_0 | (structural_res_1 << 32)); // this additional mask and transfer is non-trivially expensive, // unfortunately var tmp_ws_lo = Avx2.CompareEqual( Avx2.And(v_lo, whitespace_shufti_mask), zeroBVec); var tmp_ws_hi = Avx2.CompareEqual( Avx2.And(v_hi, whitespace_shufti_mask), zeroBVec); uint64_t ws_res_0 = (uint32_t)Avx2.MoveMask(tmp_ws_lo); uint64_t ws_res_1 = (uint64_t)Avx2.MoveMask(tmp_ws_hi); uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32)); // mask off anything inside quotes structurals &= ~quote_mask; // add the real quote bits back into our bitmask as well, so we can // quickly traverse the strings we've spent all this trouble gathering structurals 
|= quote_bits; // Now, establish "pseudo-structural characters". These are non-whitespace // characters that are (a) outside quotes and (b) have a predecessor that's // either whitespace or a structural character. This means that subsequent // passes will get a chance to encounter the first character of every string // of non-whitespace and, if we're parsing an atom like true/false/null or a // number we can stop at the first whitespace or structural character // following it. // a qualified predecessor is something that can happen 1 position before an // psuedo-structural character uint64_t pseudo_pred = structurals | whitespace; uint64_t shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred; prev_iter_ends_pseudo_pred = pseudo_pred >> 63; uint64_t pseudo_structurals = shifted_pseudo_pred & (~whitespace) & (~quote_mask); structurals |= pseudo_structurals; // now, we've used our close quotes all we need to. So let's switch them off // they will be off in the quote mask and on in quote bits. 
structurals &= ~(quote_bits & ~quote_mask); //*(uint64_t *)(pj.structurals + idx / 8) = structurals; idx += 64; } uint32_t cnt2 = (uint32_t)hamming(structurals); uint32_t next_base2 = @base + cnt2; while (structurals != 0) { base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals); structurals = structurals & (structurals - 1); base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals); structurals = structurals & (structurals - 1); base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals); structurals = structurals & (structurals - 1); base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals); structurals = structurals & (structurals - 1); base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals); structurals = structurals & (structurals - 1); base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals); structurals = structurals & (structurals - 1); base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals); structurals = structurals & (structurals - 1); base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals); structurals = structurals & (structurals - 1); @base += 8; } @base = next_base2; pj.n_structural_indexes = @base; if (base_ptr[pj.n_structural_indexes - 1] > len) { throw new InvalidOperationException("Internal bug"); } if (len != base_ptr[pj.n_structural_indexes - 1]) { // the string might not be NULL terminated, but we add a virtual NULL ending character. base_ptr[pj.n_structural_indexes++] = (uint32_t)len; } base_ptr[pj.n_structural_indexes] = 0; // make it safe to dereference one beyond this array #if SIMDJSON_UTF8VALIDATE // NOT TESTED YET! return Avx.TestZ(has_error, has_error); #else return true; #endif }
/// <summary>
/// Unescapes the JSON string whose opening quote sits at <c>buf[offset]</c> into the
/// string buffer of <paramref name="pj"/> and records it on the tape.
/// </summary>
/// <remarks>
/// The record written at <c>pj.current_string_buf_loc</c> is a <c>uint32_t</c> length,
/// followed by the unescaped bytes, followed by a single 0 byte. On success
/// <c>pj.current_string_buf_loc</c> is moved just past that 0 byte.
/// NOTE(review): <c>find_bs_bits_and_quote_bits</c> is assumed to copy the scanned chunk
/// from the source pointer to the destination pointer as well as returning the masks;
/// this method never copies plain characters itself. Confirm against the helper.
/// </remarks>
/// <param name="buf">Start of the JSON input.</param>
/// <param name="len">Not referenced by this implementation.</param>
/// <param name="pj">Receives the tape entry and the string bytes.</param>
/// <param name="depth">Not referenced by this implementation.</param>
/// <param name="offset">Index in <paramref name="buf"/> of the opening quote.</param>
/// <returns>
/// <c>true</c> once the closing quote has been handled; <c>false</c> when an escape
/// character maps to 0 in <c>escape_map</c> or when <c>handle_unicode_codepoint</c> fails.
/// </returns>
internal static bool parse_string(uint8_t *buf, size_t len, ParsedJson pj, uint32_t depth, uint32_t offset)
{
    // Tape entry: distance of this string's record from the start of the string buffer, tagged '"'.
    pj.WriteTape((ulong)(pj.current_string_buf_loc - pj.string_buf), (char1)'"');

    // buf[offset] is known to be the opening quote, so reading starts one byte later.
    uint8_t *input = &buf[offset + 1];
    // The first sizeof(uint32_t) bytes of the record are reserved for the length prefix.
    uint8_t *output = pj.current_string_buf_loc + sizeof(uint32_t);
    uint8_t *payloadStart = output;

    for (;;)
    {
        parse_string_helper chunk = find_bs_bits_and_quote_bits(input, output);

        // (bs_bits - 1) has every bit below the lowest backslash bit set, so a non-zero
        // result means a quote appears before any backslash in this chunk.
        if (((chunk.bs_bits - 1) & chunk.quote_bits) != 0)
        {
            uint32_t quoteOffset = (uint32_t)trailingzeroes(chunk.quote_bits);

            // Terminate with a 0 byte; handy for consumers that expect NULL-terminated strings.
            output[quoteOffset] = 0;

            // No overflow check on the length: the original note says documents of 4GB or
            // more are refused, so a string cannot exceed the uint32_t range.
            uint32_t payloadLength = (uint32_t)((output - payloadStart) + quoteOffset);
            memcpy(pj.current_string_buf_loc, &payloadLength, sizeof(uint32_t));

            // Step past the payload and its terminating 0 byte.
            pj.current_string_buf_loc = output + quoteOffset + 1;
            return true;
        }

        // Symmetric test: is there a backslash before any quote? If not, the two masks
        // cannot co-occur, so the chunk held neither and we move on to the next one.
        if (((chunk.quote_bits - 1) & chunk.bs_bits) == 0)
        {
            int stride = Avx2.IsSupported ? 32 : 16; // avx2 : sse42
            input += stride;
            output += stride;
            continue;
        }

        // A backslash comes first: locate it and look at the character it escapes.
        uint32_t slashOffset = (uint32_t)trailingzeroes(chunk.bs_bits);
        uint8_t escapeCode = input[slashOffset + 1];

        if (escapeCode != 'u')
        {
            // Simple 1:1 escape: consumes slashOffset + 2 input bytes and produces
            // slashOffset + 1 output bytes. This may reach beyond the part of the buffer
            // inspected so far; the original author considered that acceptable.
            uint8_t replacement = escape_map[escapeCode]; // TODO: https://github.com/dotnet/coreclr/issues/25894
            if (replacement == 0u)
            {
                return false; // bogus escape value is an error
            }

            output[slashOffset] = replacement;
            input += slashOffset + 2;
            output += slashOffset + 1;
            continue;
        }

        // \u escape: position both pointers on the backslash; the code point handler
        // adjusts them further.
        input += slashOffset;
        output += slashOffset;
        if (!handle_unicode_codepoint(&input, &output))
        {
            return false;
        }
    }
}