/// <summary>
/// Unescapes the JSON string whose opening quote sits at <c>buf[offset]</c> and appends the
/// result, followed by a 0 terminator, to the string buffer of <paramref name="pj"/>.
/// The input is scanned one 128-bit vector (16 bytes) at a time.
/// </summary>
/// <param name="buf">Input bytes; <c>buf[offset]</c> is expected to be the opening quote.</param>
/// <param name="len">Length of the input; not used by this implementation.</param>
/// <param name="pj">
/// Receives the unescaped bytes at <c>current_string_buf_loc</c> and a tape entry tagged '"'
/// whose payload is the offset of the string inside <c>string_buf</c>.
/// </param>
/// <param name="depth">Current nesting depth; not used by this implementation.</param>
/// <param name="offset">Index of the opening quote inside <paramref name="buf"/>.</param>
/// <returns>
/// <c>true</c> when the closing quote was found; <c>false</c> when an escape sequence is
/// rejected by <c>escape</c> / <c>handle_unicode_codepoint</c> or, when CHECKUNESCAPED is
/// defined, when a byte in the range 0x00..0x1F appears unescaped inside the string.
/// </returns>
/// <remarks>
/// Every iteration loads and stores a full vector regardless of where the string ends, so
/// both buffers are read/written past the logical end of the string.
/// NOTE(review): this presumably relies on callers padding the input and string buffers — confirm.
/// </remarks>
public static bool parse_string_sse41(uint8_t *buf, size_t len, ParsedJson pj, uint32_t depth, uint32_t offset)
        {
#if SIMDJSON_SKIPSTRINGPARSING               // for performance analysis, it is sometimes useful to skip parsing
            // Emit a placeholder string entry and skip the parsing entirely.
            // Uses the same WriteTape(..., (uint8_t)...) form as every other tape write in this file.
            pj.WriteTape(0, (uint8_t)'"');
            return(true);                    // always succeeds
#else
            uint8_t *src = &buf[offset + 1]; // we know that buf at offset is a "
            uint8_t *dst = pj.current_string_buf_loc;
#if JSON_TEST_STRINGS                        // for unit testing
            uint8_t *const start_of_string = dst;
#endif

            // Broadcast constants used to classify each byte of the loaded vector.
            Vector128 <byte> slashVec = Vector128.Create((byte)'\\');
            Vector128 <byte> quoteVec = Vector128.Create((byte)'"');
            Vector128 <byte> unitsep  = Vector128.Create((byte)0x1F);

            while (true)
            {
                // One bit per input byte: set where the byte is a backslash / a quote.
                Vector128 <byte> v          = Sse2.LoadVector128((src));
                uint32_t         bs_bits    = (uint32_t)Sse2.MoveMask(Sse2.CompareEqual(v, slashVec));
                uint32_t         quote_bits = (uint32_t)Sse2.MoveMask(Sse2.CompareEqual(v, quoteVec));
                // All Unicode characters may be placed within the
                // quotation marks, except for the characters that MUST be escaped:
                // quotation mark, reverse solidus, and the control characters (U+0000
                // through U+001F).
                // https://tools.ietf.org/html/rfc8259
#if CHECKUNESCAPED
                // max(0x1F, b) == 0x1F holds exactly for the bytes b <= 0x1F, i.e. control characters.
                Vector128 <byte> unescaped_vec =
                    Sse2.CompareEqual(Sse2.Max(unitsep, v), unitsep); // could do it with saturated subtraction
#endif // CHECKUNESCAPED

                // Position of the first quote / first backslash within this vector.
                uint32_t quote_dist = (uint32_t)trailingzeroes(quote_bits);
                uint32_t bs_dist    = (uint32_t)trailingzeroes(bs_bits);
                // store to dest unconditionally - we can overwrite the bits we don't like
                // later
                memcpy(dst, src, (size_t)Vector128 <byte> .Count);

                if (quote_dist < bs_dist)
                {
                    // we encountered quotes first. Move dst to point to quotes and exit
                    dst[quote_dist] = 0; // null terminate and get out

                    // Tape payload is the offset of this string within the string buffer.
                    pj.WriteTape((size_t)pj.current_string_buf_loc - (size_t)pj.string_buf, (uint8_t)'"');

                    pj.current_string_buf_loc = dst + quote_dist + 1; // the +1 is due to the 0 value
#if CHECKUNESCAPED
                    // check that there is no unescaped char before the quote:
                    // (quote_bits - 1) & ~quote_bits selects the bits below the first quote.
                    uint32_t unescaped_bits = (uint32_t)Sse2.MoveMask(unescaped_vec);
                    bool     is_ok          = ((quote_bits - 1) & (~quote_bits) & unescaped_bits) == 0;
#if JSON_TEST_STRINGS // for unit testing
                    if (is_ok)
                    {
                        foundString(buf + offset, start_of_string, pj.current_string_buf_loc - 1);
                    }
                    else
                    {
                        foundBadString(buf + offset);
                    }
#endif // JSON_TEST_STRINGS
                    return(is_ok);
#else //CHECKUNESCAPED
#if JSON_TEST_STRINGS // for unit testing
                    foundString(buf + offset, start_of_string, pj.current_string_buf_loc - 1);
#endif // JSON_TEST_STRINGS
                    return(true);
#endif //CHECKUNESCAPED
                }
                else if (quote_dist > bs_dist)
                {
                    // The byte after the backslash; when the backslash is the last byte of the
                    // vector this reads one byte beyond what was loaded into v.
                    uint8_t escape_char = src[bs_dist + 1];
#if CHECKUNESCAPED
                    // we are going to need the unescaped_bits to check for unescaped chars
                    // that occur before the backslash
                    uint32_t unescaped_bits = (uint32_t)Sse2.MoveMask(unescaped_vec);
                    if (((bs_bits - 1) & (~bs_bits) & unescaped_bits) != 0)
                    {
#if JSON_TEST_STRINGS // for unit testing
                        foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
                        return(false);
                    }
#endif //CHECKUNESCAPED
                    // we encountered backslash first. Handle backslash
                    if (escape_char == 'u')
                    {
                        // move src/dst up to the start; they will be further adjusted
                        // within the unicode codepoint handling code.
                        src += bs_dist;
                        dst += bs_dist;
                        if (!handle_unicode_codepoint(&src, &dst))
                        {
#if JSON_TEST_STRINGS // for unit testing
                            foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
                            return(false);
                        }
                    }
                    else
                    {
                        // simple 1:1 conversion. Will eat bs_dist+2 characters in input and
                        // write bs_dist+1 characters to output
                        // note this may reach beyond the part of the buffer we've actually
                        // seen. I think this is ok
                        uint8_t escape_result = escape(escape_char);
                        if (escape_result == 0)
                        {
#if JSON_TEST_STRINGS // for unit testing
                            foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
                            return(false); // bogus escape value is an error
                        }

                        dst[bs_dist] = escape_result;
                        src         += bs_dist + 2;
                        dst         += bs_dist + 1;
                    }
                }
                else
                {
                    // they are the same. Since they can't co-occur, it means we encountered
                    // neither: the whole vector is plain string content, so advance past it.
                    src += Vector128 <byte> .Count;
                    dst += Vector128 <byte> .Count;
#if CHECKUNESCAPED
                    // check for unescaped chars anywhere in the vector just consumed
                    if (Sse2.MoveMask(unescaped_vec) != 0)
                    {
#if JSON_TEST_STRINGS // for unit testing
                        foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
                        return(false);
                    }
#endif // CHECKUNESCAPED
                }
            }

            // can't be reached
            return(true);
#endif // SIMDJSON_SKIPSTRINGPARSING
        }
Beispiel #2
0
        /// <summary>
        /// Walks the structural characters listed in <c>pj.structural_indexes</c> and, for each
        /// value encountered, writes an entry to the tape of <paramref name="pj"/>. The walk is
        /// implemented as a set of goto-labelled states (start, object, array, scope end).
        /// </summary>
        /// <param name="buf">Input bytes that the structural indexes point into.</param>
        /// <param name="len">Length of the input; compared against <c>pj.bytecapacity</c>.</param>
        /// <param name="pj">Holds the structural indexes on entry and receives the tape, validity flag and error code.</param>
        /// <returns>
        /// The value stored in <c>pj.ErrorCode</c>: SUCCESS, CAPACITY, DEPTH_ERROR, STRING_ERROR,
        /// NUMBER_ERROR, T_ATOM_ERROR, N_ATOM_ERROR, F_ATOM_ERROR or TAPE_ERROR.
        /// </returns>
        /// <remarks>
        /// <c>pj.ret_address[depth]</c> records which state to resume when the scope opened at that
        /// depth closes: 'a' resumes array_continue, 'o' resumes object_continue, anything else
        /// (written as 's' here) resumes start_continue.
        /// </remarks>
        internal static JsonParseError unified_machine(uint8_t *buf, size_t len, ParsedJson pj)
        {
#if !ALLOW_SAME_PAGE_BUFFER_OVERRUN
            // zero the SIMDJSON_PADDING bytes that follow the input
            memset((uint8_t *)buf + len, 0, SIMDJSON_PADDING); // to please valgrind
#endif
            uint32_t i = 0;                                    // index of the structural character (0,1,2,3...)
            uint32_t idx;                                      // location of the structural character in the input (buf)
            uint8_t  c = 0;                                    // the (structural) character we are looking at; refreshed by
            // each expanded UPDATE_CHAR (idx = structural_indexes[i++]; c = buf[idx])
            uint32_t depth = 0;                                // could have an arbitrary starting depth
            pj.Init();                                         // sets isvalid to false
            if (pj.bytecapacity < len)
            {
                // the input is larger than what pj reports it can hold
                pj.ErrorCode = JsonParseError.CAPACITY;
                return(pj.ErrorCode);
            }

            ////////////////////////////// START STATE /////////////////////////////
            pj.ret_address[depth]             = (bytechar)'s';
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.WriteTape(0, (uint8_t)'r'); // r for root, 0 is going to get overwritten
            // the root is used, if nothing else, to capture the size of the tape
            depth++;                       // everything starts at depth = 1, depth = 0 is just for the root, the root may contain an object, an array or something else.
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }

            idx = pj.structural_indexes[i++];
            c   = buf[idx]; //UPDATE_CHAR()
            switch (c)
            {
            case (uint8_t)'{':
                // remember where this scope starts on the tape and where to resume when it closes
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.ret_address[depth]             = (bytechar)'s';
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                pj.WriteTape(0, c);     // strangely, moving this to object_begin slows things down
                goto object_begin;

            case (uint8_t)'[':
                // remember where this scope starts on the tape and where to resume when it closes
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.ret_address[depth]             = (bytechar)'s';
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                goto array_begin;

                // A JSON text is a serialized value.  Note that certain previous
                // specifications of JSON constrained a JSON text to be an object or an
                // array.  Implementations that generate only objects or arrays where a
                // JSON text is called for will be interoperable in the sense that all
                // implementations will accept these as conforming JSON texts.
                // https://tools.ietf.org/html/rfc8259
#if SIMDJSON_ALLOWANYTHINGINROOT
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'t':
            {
                // we need to make a copy to make sure that the string is space terminated.
                // this only applies to the JSON document made solely of the true value.
                // this will almost never be called in practice
                bytechar *copy = (bytechar *)(allocate <bytechar>(len + SIMDJSON_PADDING));
                if (copy == null)
                {
                    goto fail;
                }

                memcpy(copy, buf, len);
                copy[len] = (bytechar)' ';
                if (!is_valid_true_atom((uint8_t *)(copy) + idx))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                pj.WriteTape(0, c);
                break;
            }

            case (uint8_t)'f':
            {
                // we need to make a copy to make sure that the string is space terminated.
                // this only applies to the JSON document made solely of the false value.
                // this will almost never be called in practice
                bytechar *copy = (bytechar *)(allocate <bytechar>(len + SIMDJSON_PADDING));
                if (copy == null)
                {
                    goto fail;
                }

                memcpy(copy, buf, len);
                copy[len] = (bytechar)' ';
                if (!is_valid_false_atom((uint8_t *)(copy) + idx))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                pj.WriteTape(0, c);
                break;
            }

            case (uint8_t)'n':
            {
                // we need to make a copy to make sure that the string is space terminated.
                // this only applies to the JSON document made solely of the null value.
                // this will almost never be called in practice
                bytechar *copy = (bytechar *)(allocate <bytechar>(len + SIMDJSON_PADDING));
                if (copy == null)
                {
                    goto fail;
                }

                memcpy(copy, buf, len);
                copy[len] = (bytechar)' ';
                if (!is_valid_null_atom((uint8_t *)(copy) + idx))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                pj.WriteTape(0, c);
                break;
            }

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            {
                // we need to make a copy to make sure that the string is space terminated.
                // this is done only for JSON documents made of a sole number
                // this will almost never be called in practice. We terminate with a space
                // because we do not want to allow NULLs in the middle of a number (whereas a
                // space in the middle of a number would be identified in stage 1).
                bytechar *copy = (bytechar *)(allocate <bytechar>(len + SIMDJSON_PADDING));
                if (copy == null)
                {
                    goto fail;
                }

                memcpy(copy, buf, len);
                copy[len] = (bytechar)' ';
                if (!parse_number((uint8_t *)(copy), pj, idx, false))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                break;
            }

            case (uint8_t)'-':
            {
                // we need to make a copy to make sure that the string is NULL terminated.
                // this is done only for JSON documents made of a sole number
                // this will almost never be called in practice
                // NOTE(review): the unsigned-number case above terminates the copy with a
                // space instead of NUL — confirm the difference is intended.
                bytechar *copy = (bytechar *)(allocate <bytechar>(len + SIMDJSON_PADDING));
                if (copy == null)
                {
                    goto fail;
                }

                memcpy(copy, buf, len);
                copy[len] = (bytechar)'\0';
                if (!parse_number((uint8_t *)(copy), pj, idx, true))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                break;
            }
#endif // ALLOWANYTHINGINROOT
            default:
                goto fail;
            }

start_continue:
            // Reached after the root value has been consumed: succeed only if exactly one
            // structural index remains unread.
            // the string might not be NULL terminated.
            if (i + 1 == pj.n_structural_indexes)
            {
                goto succeed;
            }
            else
            {
                goto fail;
            }
            ////////////////////////////// OBJECT STATES /////////////////////////////

object_begin:
            // Just after '{': expect either the first key or an immediate '}'.
            idx = pj.structural_indexes[i++];
            c   = buf[idx]; //UPDATE_CHAR()
            switch (c)
            {
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                goto object_key_state;
            }

            case (uint8_t)'}':
                goto scope_end;     // could also go to object_continue

            default:
                goto fail;
            }

object_key_state:
            // A key has just been parsed: require ':' and then parse the value.
            idx = pj.structural_indexes[i++];
            c   = buf[idx]; //UPDATE_CHAR()
            if (c != ':')
            {
                goto fail;
            }

            idx = pj.structural_indexes[i++];
            c   = buf[idx]; //UPDATE_CHAR()
            switch (c)
            {
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'t':
                if (!is_valid_true_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'f':
                if (!is_valid_false_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'n':
                if (!is_valid_null_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            {
                if (!parse_number(buf, pj, idx, false))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'-':
            {
                if (!parse_number(buf, pj, idx, true))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'{':
            {
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     // here the compiler knows what c is, so this gets optimized
                // we have not yet encountered } so we need to come back for it
                pj.ret_address[depth] = (bytechar)'o';
                // we found an object inside an object, so we need to increment the depth
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                goto object_begin;
            }

            case (uint8_t)'[':
            {
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     // here the compiler knows what c is, so this gets optimized
                // we have not yet encountered the enclosing object's } so we need to come back for it
                pj.ret_address[depth] = (bytechar)'o';
                // we found an array inside an object, so we need to increment the depth
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                goto array_begin;
            }

            default:
                goto fail;
            }

            // scalar values break out of the switch above and fall through to here
object_continue:
            // After a value inside an object: expect ',' followed by the next key, or '}'.
            idx = pj.structural_indexes[i++];
            c   = buf[idx]; //UPDATE_CHAR()
            switch (c)
            {
            case (uint8_t)',':
                idx = pj.structural_indexes[i++];
                c   = buf[idx];   //UPDATE_CHAR()
                if (c != '"')
                {
                    goto fail;
                }
                else
                {
                    if (!parse_string(buf, len, pj, depth, idx))
                    {
                        goto fail;
                    }

                    goto object_key_state;
                }

            case (uint8_t)'}':
                goto scope_end;

            default:
                goto fail;
            }

            ////////////////////////////// COMMON STATE /////////////////////////////

scope_end:
            // write our tape location to the header scope
            depth--;
            pj.WriteTape(pj.containing_scope_offset[depth], c);
            pj.AnnotatePreviousLoc(pj.containing_scope_offset[depth],
                                   pj.CurrentLoc);
            // resume the state recorded when this scope was opened
            if (pj.ret_address[depth] == 'a')
            {
                goto array_continue;
            }
            else if (pj.ret_address[depth] == 'o')
            {
                goto object_continue;
            }
            else
            {
                goto start_continue;
            }

            ////////////////////////////// ARRAY STATES /////////////////////////////
array_begin:
            // Just after '[': an immediate ']' closes the (empty) array.
            idx = pj.structural_indexes[i++];
            c   = buf[idx]; //UPDATE_CHAR()
            if (c == (uint8_t)']')
            {
                goto scope_end; // could also go to array_continue
            }

main_array_switch:
            // c has been updated on every path into this label, so we can dispatch on it
            // directly; paths that may accept a closing square brace (after a value, and
            // at array start) check for it before arriving here.
            switch (c)
            {
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'t':
                if (!is_valid_true_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'f':
                if (!is_valid_false_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'n':
                if (!is_valid_null_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;     // goto array_continue;

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            {
                if (!parse_number(buf, pj, idx, false))
                {
                    goto fail;
                }

                break;     // goto array_continue;
            }

            case (uint8_t)'-':
            {
                if (!parse_number(buf, pj, idx, true))
                {
                    goto fail;
                }

                break;     // goto array_continue;
            }

            case (uint8_t)'{':
            {
                // we have not yet encountered ] so we need to come back for it
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     // here the compiler knows what c is, so this gets optimized
                pj.ret_address[depth] = (bytechar)'a';
                // we found an object inside an array, so we need to increment the depth
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                goto object_begin;
            }

            case (uint8_t)'[':
            {
                // we have not yet encountered ] so we need to come back for it
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     // here the compiler knows what c is, so this gets optimized
                pj.ret_address[depth] = (bytechar)'a';
                // we found an array inside an array, so we need to increment the depth
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                goto array_begin;
            }

            default:
                goto fail;
            }

            // scalar values break out of the switch above and fall through to here
array_continue:
            // After a value inside an array: expect ',' followed by another value, or ']'.
            idx = pj.structural_indexes[i++];
            c   = buf[idx]; //UPDATE_CHAR()
            switch (c)
            {
            case (uint8_t)',':
                idx = pj.structural_indexes[i++];
                c   = buf[idx];   //UPDATE_CHAR()
                goto main_array_switch;

            case (uint8_t)']':
                goto scope_end;

            default:
                goto fail;
            }

            ////////////////////////////// FINAL STATES /////////////////////////////

succeed:
            // Back out of the depth-1 level opened at the start; anything other than
            // depth 0 here means the state machine itself is inconsistent.
            depth--;
            if (depth != 0)
            {
                throw new InvalidOperationException("internal bug");
                //abort();
            }

            // the root entry was written at the location recorded for depth 0, which must be 0
            if (pj.containing_scope_offset[depth] != 0)
            {
                throw new InvalidOperationException("internal bug");
                //abort();
            }

            pj.AnnotatePreviousLoc(pj.containing_scope_offset[depth], pj.CurrentLoc);
            pj.WriteTape(pj.containing_scope_offset[depth], (uint8_t)'r');  // r is root

            pj.isvalid   = true;
            pj.ErrorCode = JsonParseError.SUCCESS;
            return(pj.ErrorCode);

fail:
            // we do not need the next line because this is done by pj.init(), pessimistically.
            // pj.isvalid  = false;
            // At this point in the code, we have all the time in the world.
            // Note that we know exactly where we are in the document so we could,
            // without any overhead on the processing code, report a specific location.
            // We could even trigger special code paths to assess what happened carefully,
            // all without any added cost.
            if (depth >= pj.depthcapacity)
            {
                // the failure was caused by one of the depth checks above
                pj.ErrorCode = JsonParseError.DEPTH_ERROR;
                return(pj.ErrorCode);
            }

            // otherwise classify the error by the last structural character that was read
            switch (c)
            {
            case (uint8_t)'"':
                pj.ErrorCode = JsonParseError.STRING_ERROR;
                return(pj.ErrorCode);

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            case (uint8_t)'-':
                pj.ErrorCode = JsonParseError.NUMBER_ERROR;
                return(pj.ErrorCode);

            case (uint8_t)'t':
                pj.ErrorCode = JsonParseError.T_ATOM_ERROR;
                return(pj.ErrorCode);

            case (uint8_t)'n':
                pj.ErrorCode = JsonParseError.N_ATOM_ERROR;
                return(pj.ErrorCode);

            case (uint8_t)'f':
                pj.ErrorCode = JsonParseError.F_ATOM_ERROR;
                return(pj.ErrorCode);

            default:
                break;
            }

            // any other character: report a generic tape (structure) error
            pj.ErrorCode = JsonParseError.TAPE_ERROR;
            return(pj.ErrorCode);
        }
        internal static bool unified_machine(uint8_t *buf, size_t len, ParsedJson pj)
        {
            uint32_t i = 0;     // index of the structural character (0,1,2,3...)
            uint32_t idx;       // location of the structural character in the input (buf)
            uint8_t  c;         // used to track the (structural) character we are looking at, updated
            // by UPDATE_CHAR macro
            uint32_t depth = 0; // could have an arbitrary starting depth

            pj.Init();
            if (pj.bytecapacity < len)
            {
                Debug.Write("insufficient capacity\n");
                return(false);
            }

            // this macro reads the next structural character, updating idx, i and c.
            //C#: expanded directly everywhere
            //void UPDATE_CHAR()
            //{
            //    idx = pj.structural_indexes[i++];
            //    c = buf[idx];
            //}

            pj.ret_address[depth]             = (bytechar)'s';
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.WriteTape(0, (byte)'r'); // r for root, 0 is going to get overwritten
            // the root is used, if nothing else, to capture the size of the tape
            depth++;                    // everything starts at depth = 1, depth = 0 is just for the root, the root may contain an object, an array or something else.
            if (depth > pj.depthcapacity)
            {
                goto fail;
            }


            //UPDATE_CHAR():
            idx = pj.structural_indexes[i++];
            c   = buf[idx];

            switch (c)
            {
            case (uint8_t)'{':
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.ret_address[depth]             = (bytechar)'s';
                depth++;
                if (depth > pj.depthcapacity)
                {
                    goto fail;
                }

                pj.WriteTape(0, c);     // strangely, moving this to object_begin slows things down
                goto object_begin;

            case (uint8_t)'[':
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.ret_address[depth]             = (bytechar)'s';
                depth++;
                if (depth > pj.depthcapacity)
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                goto array_begin;

                // A JSON text is a serialized value.  Note that certain previous
                // specifications of JSON constrained a JSON text to be an object or an
                // array.  Implementations that generate only objects or arrays where a
                // JSON text is called for will be interoperable in the sense that all
                // implementations will accept these as conforming JSON texts.
                // https://tools.ietf.org/html/rfc8259
#if SIMDJSON_ALLOWANYTHINGINROOT
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'t':
            {
                // we need to make a copy to make sure that the string is NULL terminated.
                // this only applies to the JSON document made solely of the true value.
                // this will almost never be called in practice
                bytechar *copy = allocate <bytechar>(len + SIMDJSON_PADDING);
                memcpy(copy, buf, len);
                copy[len] = (bytechar)'\0';
                if (!is_valid_true_atom((uint8_t *)copy + idx))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                pj.WriteTape(0, c);
                break;
            }

            case (uint8_t)'f':
            {
                // we need to make a copy to make sure that the string is NULL terminated.
                // this only applies to the JSON document made solely of the false value.
                // this will almost never be called in practice
                bytechar *copy = allocate <bytechar>(len + SIMDJSON_PADDING);
                memcpy(copy, buf, len);
                copy[len] = (bytechar)'\0';
                if (!is_valid_false_atom((uint8_t *)copy + idx))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                pj.WriteTape(0, c);
                break;
            }

            case (uint8_t)'n':
            {
                // we need to make a copy to make sure that the string is NULL terminated.
                // this only applies to the JSON document made solely of the null value.
                // this will almost never be called in practice
                bytechar *copy = allocate <bytechar>(len + SIMDJSON_PADDING);
                memcpy(copy, buf, len);
                copy[len] = (bytechar)'\0';
                if (!is_valid_null_atom((uint8_t *)copy + idx))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                pj.WriteTape(0, c);
                break;
            }

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            {
                // we need to make a copy to make sure that the string is NULL terminated.
                // this is done only for JSON documents made of a sole number
                // this will almost never be called in practice
                bytechar *copy = allocate <bytechar>(len + SIMDJSON_PADDING);
                memcpy(copy, buf, len);
                copy[len] = (bytechar)'\0';
                if (!parse_number((uint8_t *)copy, pj, idx, false))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                break;
            }

            case (uint8_t)'-':
            {
                // we need to make a copy to make sure that the string is NULL terminated.
                // this is done only for JSON documents made of a sole number
                // this will almost never be called in practice
                bytechar *copy = allocate <bytechar>(len + SIMDJSON_PADDING);
                memcpy(copy, buf, len);
                copy[len] = (bytechar)'\0';
                if (!parse_number((uint8_t *)copy, pj, idx, true))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                break;
            }
#endif // ALLOWANYTHINGINROOT
            default:
                goto fail;
            }

start_continue:
            // the string might not be NULL terminated.
            if (i + 1 == pj.n_structural_indexes)
            {
                goto succeed;
            }
            else
            {
                goto fail;
            }
            ////////////////////////////// OBJECT STATES /////////////////////////////

object_begin:
            //UPDATE_CHAR():
            idx = pj.structural_indexes[i++];
            c   = buf[idx];
            switch (c)
            {
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                goto object_key_state;
            }

            case (uint8_t)'}':
                goto scope_end;     // could also go to object_continue

            default:
                goto fail;
            }

object_key_state:
            //UPDATE_CHAR():
            idx = pj.structural_indexes[i++];
            c   = buf[idx];
            if (c != ':')
            {
                goto fail;
            }

            //UPDATE_CHAR():
            idx = pj.structural_indexes[i++];
            c   = buf[idx];
            switch (c)
            {
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'t':
                if (!is_valid_true_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'f':
                if (!is_valid_false_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'n':
                if (!is_valid_null_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            {
                if (!parse_number(buf, pj, idx, false))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'-':
            {
                if (!parse_number(buf, pj, idx, true))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'{':
            {
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     // here the compilers knows what c is so this gets optimized
                // we have not yet encountered } so we need to come back for it
                pj.ret_address[depth] = (bytechar)'o';
                // we found an object inside an object, so we need to increment the depth
                depth++;
                if (depth > pj.depthcapacity)
                {
                    goto fail;
                }

                goto object_begin;
            }

            case (uint8_t)'[':
            {
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     // here the compilers knows what c is so this gets optimized
                // we have not yet encountered } so we need to come back for it
                pj.ret_address[depth] = (bytechar)'o';
                // we found an array inside an object, so we need to increment the depth
                depth++;
                if (depth > pj.depthcapacity)
                {
                    goto fail;
                }

                goto array_begin;
            }

            default:
                goto fail;
            }

object_continue:
            //UPDATE_CHAR():
            idx = pj.structural_indexes[i++];
            c   = buf[idx];
            switch (c)
            {
            case (uint8_t)',':
                //UPDATE_CHAR():
                idx = pj.structural_indexes[i++];
                c   = buf[idx];
                if (c != (uint8_t)'"')
                {
                    goto fail;
                }
                else
                {
                    if (!parse_string(buf, len, pj, depth, idx))
                    {
                        goto fail;
                    }

                    goto object_key_state;
                }

            case (uint8_t)'}':
                goto scope_end;

            default:
                goto fail;
            }

            ////////////////////////////// COMMON STATE /////////////////////////////

scope_end:
            // write our tape location to the header scope
            depth--;
            pj.WriteTape(pj.containing_scope_offset[depth], c);
            pj.AnnotatePreviousLoc(pj.containing_scope_offset[depth],
                                   pj.CurrentLoc);
            // goto saved_state
            if (pj.ret_address[depth] == (uint8_t)'a')
            {
                goto array_continue;
            }
            else if (pj.ret_address[depth] == (uint8_t)'o')
            {
                goto object_continue;
            }
            else
            {
                goto start_continue;
            }

            ////////////////////////////// ARRAY STATES /////////////////////////////
array_begin:
            //UPDATE_CHAR():
            idx = pj.structural_indexes[i++];
            c   = buf[idx];
            if (c == ']')
            {
                goto scope_end; // could also go to array_continue
            }

main_array_switch:
            // we call update char on all paths in, so we can peek at c on the
            // on paths that can accept a close square brace (post-, and at start)
            switch (c)
            {
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'t':
                if (!is_valid_true_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'f':
                if (!is_valid_false_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'n':
                if (!is_valid_null_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;     // goto array_continue;

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            {
                if (!parse_number(buf, pj, idx, false))
                {
                    goto fail;
                }

                break;     // goto array_continue;
            }

            case (uint8_t)'-':
            {
                if (!parse_number(buf, pj, idx, true))
                {
                    goto fail;
                }

                break;     // goto array_continue;
            }

            case (uint8_t)'{':
            {
                // we have not yet encountered ] so we need to come back for it
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     //  here the compilers knows what c is so this gets optimized
                pj.ret_address[depth] = (bytechar)'a';
                // we found an object inside an array, so we need to increment the depth
                depth++;
                if (depth > pj.depthcapacity)
                {
                    goto fail;
                }

                goto object_begin;
            }

            case (uint8_t)'[':
            {
                // we have not yet encountered ] so we need to come back for it
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     // here the compilers knows what c is so this gets optimized
                pj.ret_address[depth] = (bytechar)'a';
                // we found an array inside an array, so we need to increment the depth
                depth++;
                if (depth > pj.depthcapacity)
                {
                    goto fail;
                }

                goto array_begin;
            }

            default:
                goto fail;
            }

array_continue:
            //UPDATE_CHAR():
            idx = pj.structural_indexes[i++];
            c   = buf[idx];
            switch (c)
            {
            case (uint8_t)',':
                //UPDATE_CHAR():
                idx = pj.structural_indexes[i++];
                c   = buf[idx];
                goto main_array_switch;

            case (uint8_t)']':
                goto scope_end;

            default:
                goto fail;
            }


            ////////////////////////////// FINAL STATES /////////////////////////////

succeed:
            depth--;
            if (depth != 0)
            {
                throw new InvalidOperationException("internal bug");
            }

            if (pj.containing_scope_offset[depth] != 0)
            {
                throw new InvalidOperationException("internal bug");
            }

            pj.AnnotatePreviousLoc(pj.containing_scope_offset[depth], pj.CurrentLoc);
            pj.WriteTape(pj.containing_scope_offset[depth], (byte)'r');  // r is root
            pj.isvalid = true;
            return(true);



fail:
            return(false);
        }
Beispiel #4
0
        /// <summary>
        /// Decodes the JSON string whose opening quote is at <c>buf[offset]</c> into the
        /// string buffer referenced by <paramref name="pj"/>, after writing a tape entry
        /// that carries the string's starting offset inside that buffer.
        /// </summary>
        /// <remarks>
        /// Starting at <c>pj.current_string_buf_loc</c> this method stores a 4-byte length,
        /// then the decoded bytes, then one zero byte, and finally moves
        /// <c>pj.current_string_buf_loc</c> just past that zero byte.
        /// NOTE(review): <c>find_bs_bits_and_quote_bits</c> is assumed to also copy the
        /// scanned input block to the output cursor - confirm against its definition.
        /// </remarks>
        /// <param name="buf">Input bytes; <c>buf[offset]</c> is expected to be the opening quote.</param>
        /// <param name="len">Not referenced by this method.</param>
        /// <param name="pj">Receives the tape entry and the decoded string bytes.</param>
        /// <param name="depth">Not referenced by this method.</param>
        /// <param name="offset">Index of the opening quote inside <paramref name="buf"/>.</param>
        /// <returns>
        /// <c>true</c> once the closing quote has been processed; <c>false</c> when
        /// <c>handle_unicode_codepoint</c> fails or <c>escape_map</c> holds 0 for the
        /// character that follows a backslash.
        /// </returns>
        internal static bool parse_string(uint8_t *buf, size_t len, ParsedJson pj, uint32_t depth, uint32_t offset)
        {
            // The tape entry points at the slot where this string's length header will live.
            ulong tapeOffset = (ulong)(pj.current_string_buf_loc - pj.string_buf);
            pj.WriteTape(tapeOffset, (char1)'"');

            uint8_t *reader       = &buf[offset + 1];                             // first byte after the opening quote
            uint8_t *writer       = pj.current_string_buf_loc + sizeof(uint32_t); // leave room for the length header
            uint8_t *payloadStart = writer;

            while (true)
            {
                parse_string_helper scan = find_bs_bits_and_quote_bits(reader, writer);

                // Case 1: a quote shows up before any backslash in this block -> end of string.
                if (((scan.bs_bits - 1) & scan.quote_bits) != 0)
                {
                    uint32_t quotePos      = (uint32_t)trailingzeroes(scan.quote_bits);
                    uint32_t payloadLength = (uint32_t)((writer - payloadStart) + quotePos);

                    // Zero-terminate the decoded text; handy for consumers that expect it,
                    // at the price of one extra byte per string.
                    writer[quotePos] = 0;

                    // Store the length in the header slot. No overflow check is made here:
                    // the original notes that documents of 4GB or more are refused elsewhere.
                    memcpy(pj.current_string_buf_loc, &payloadLength, sizeof(uint32_t));

                    // The next string starts right after the terminating zero.
                    pj.current_string_buf_loc = writer + quotePos + 1;
                    return true;
                }

                // Case 2: the masks agree, and since a byte cannot be both a quote and a
                // backslash that means neither occurred. Step over the whole block.
                if (((scan.quote_bits - 1) & scan.bs_bits) == 0)
                {
                    int stride = Avx2.IsSupported ? 32 : 16; // avx2 : sse42
                    reader += stride;
                    writer += stride;
                    continue;
                }

                // Case 3: a backslash comes first; locate it and look at the escaped character.
                uint32_t slashPos   = (uint32_t)trailingzeroes(scan.bs_bits);
                uint8_t  escapeCode = reader[slashPos + 1];

                if (escapeCode != 'u')
                {
                    // Plain escape: consumes slashPos + 2 input bytes and yields slashPos + 1
                    // output bytes. This may read past the part of the buffer examined so far;
                    // the original author considered that acceptable.
                    uint8_t decoded = escape_map[escapeCode]; // TODO: https://github.com/dotnet/coreclr/issues/25894
                    if (decoded == 0u)
                    {
                        return false; // unknown escape character
                    }

                    writer[slashPos] = decoded;
                    reader          += slashPos + 2;
                    writer          += slashPos + 1;
                    continue;
                }

                // \u escape: park both cursors on the backslash; the codepoint handler
                // advances them further itself.
                reader += slashPos;
                writer += slashPos;
                if (!handle_unicode_codepoint(&reader, &writer))
                {
                    return false;
                }
            }
        }