/// <summary>
/// Creates an iterator over <paramref name="parsedJson"/> and positions it on the
/// tape word that follows the root word (when the tape has one).
/// </summary>
/// <param name="parsedJson">Parsed document whose tape is walked. Its first tape word must carry the 'r' type tag.</param>
/// <exception cref="InvalidOperationException">The first tape word is not tagged 'r'.</exception>
public ParsedJsonIterator(ParsedJson parsedJson)
        {
            pj          = parsedJson;
            depth       = 0;
            location    = 0;
            tape_length = 0;

            // The top 8 bits of a tape word carry its type tag.
            current_val  = pj.tape[location];
            current_type = (uint8_t)(current_val >> 56);

            // Validate the root word BEFORE allocating the scope stack: if the
            // constructor throws, the caller never receives an instance to
            // Dispose, so a buffer allocated earlier would be leaked.
            if (current_type != 'r')
            {
                throw new InvalidOperationException("Json is invalid");
            }

            depthindex = allocate <scopeindex_t>(pj.depthcapacity);

            // Scope 0 describes the root word itself.
            depthindex[0].start_of_scope = location;
            depthindex[0].scope_type     = current_type;
            location++;

            // The root word masked with JSONVALUEMASK is used as the tape length.
            tape_length = current_val & JSONVALUEMASK;
            if (location < tape_length)
            {
                // Step onto the first word after the root and open scope 1 for it.
                current_val  = pj.tape[location];
                current_type = (uint8_t)(current_val >> 56);
                depth++;
                depthindex[depth].start_of_scope = location;
                depthindex[depth].scope_type     = current_type;
            }
        }
Example #2
0
        /// <summary>
        /// Runs both parsing stages over <paramref name="jsonData"/> and stores the outcome in <paramref name="pj"/>.
        /// </summary>
        /// <param name="jsonData">Pointer to the JSON bytes.</param>
        /// <param name="length">Number of bytes at <paramref name="jsonData"/>.</param>
        /// <param name="pj">Destination document; its <c>bytecapacity</c> must be at least <paramref name="length"/>.</param>
        /// <param name="reallocIfNeeded">When true, the input may be copied into a padded buffer before parsing.</param>
        /// <returns>
        /// <c>Capacity</c> when <paramref name="pj"/> is too small, <c>Memalloc</c> when the padded copy
        /// cannot be allocated, otherwise the stage-2 result.
        /// </returns>
        internal static JsonParseError JsonParse(uint8_t *jsonData, size_t length, ParsedJson pj, bool reallocIfNeeded = true)
        {
            if (pj.bytecapacity < length)
            {
                return(JsonParseError.Capacity);
            }

            bool reallocated = false;

            if (reallocIfNeeded)
            {
                // realloc is needed if the end of the memory crosses a page
                if ((size_t)(jsonData + length - 1) % (size_t)pagesize < SIMDJSON_PADDING)
                {
                    uint8_t *tmpbuf = jsonData;
                    jsonData = (uint8_t *)allocate_padded_buffer(length);
                    if (jsonData == null)
                    {
                        return(JsonParseError.Memalloc);
                    }
                    memcpy(jsonData, tmpbuf, length);
                    reallocated = true;
                }
            }

            try
            {
                var result = JsonParseError.Success;

                // NOTE(review): when stage 1 returns false, result stays Success.
                // Confirm whether a dedicated error code should be reported here.
                if (stage1_find_marks.find_structural_bits(jsonData, length, pj))
                {
                    result = stage2_build_tape.unified_machine(jsonData, length, pj);
                }

                return(result);
            }
            finally
            {
                // Release the padded copy on every exit path, including exceptions.
                if (reallocated)
                {
                    aligned_free(jsonData);
                }
            }
        }
        /// <summary>
        /// Dispatches string parsing to the AVX2 implementation.
        /// </summary>
        /// <remarks>
        /// An SSE4.1 dispatch (parse_string_sse41) is currently disabled, so hardware
        /// without AVX2 is reported through <c>ThrowHelper.ThrowPNSE</c>.
        /// </remarks>
        /// <returns>The result of <c>parse_string_avx2</c>.</returns>
        public static bool parse_string(uint8_t* buf, size_t len, ParsedJson pj, uint32_t depth, uint32_t offset)
        {
            if (!Avx2.IsSupported)
            {
                // Presumably throws PlatformNotSupportedException - confirm in ThrowHelper.
                // The return below only satisfies the compiler's flow analysis.
                ThrowHelper.ThrowPNSE();
                return false;
            }

            return parse_string_avx2(buf, len, pj, depth, offset);
        }
Example #4
0
        /// <summary>
        /// Releases the scope buffer and disposes the wrapped ParsedJson.
        /// Both fields are cleared afterwards, so a repeated call does nothing.
        /// </summary>
        public void Dispose()
        {
            // Scope stack obtained through allocate<>; release it with delete().
            if (depthindex != null)
            {
                delete(depthindex);
            }
            depthindex = null;

            // The iterator also disposes the document it was constructed with.
            pj?.Dispose();
            pj = null;
        }
Example #5
0
        /// <summary>
        /// Allocates a ParsedJson sized for <paramref name="len"/> bytes and parses <paramref name="buf"/> into it.
        /// </summary>
        /// <param name="buf">Pointer to the JSON bytes.</param>
        /// <param name="len">Number of bytes at <paramref name="buf"/>.</param>
        /// <param name="reallocifneeded">Forwarded unchanged to <c>json_parse</c>.</param>
        /// <returns>The populated document. The result of <c>json_parse</c> is not inspected here.</returns>
        /// <exception cref="InvalidOperationException">Capacity allocation failed.</exception>
        public static ParsedJson build_parsed_json(uint8_t *buf, size_t len, bool reallocifneeded = true)
        {
            ParsedJson pj = new ParsedJson();

            bool allocated = pj.allocateCapacity(len);
            if (!allocated)
            {
                throw new InvalidOperationException("failure during memory allocation");
            }

            // The parse outcome is deliberately not checked, matching existing behavior.
            json_parse(buf, len, &pj, reallocifneeded);
            return pj;
        }
Example #6
0
        /// <summary>
        /// Parses <paramref name="length"/> bytes of JSON into a freshly allocated ParsedJson.
        /// </summary>
        /// <param name="jsonData">Pointer to the JSON bytes.</param>
        /// <param name="length">Byte count; it is cast to <c>ulong</c> before use.</param>
        /// <param name="reallocIfNeeded">Forwarded unchanged to <c>JsonParse</c>.</param>
        /// <returns>The populated document. The result of <c>JsonParse</c> is not inspected here.</returns>
        /// <exception cref="InvalidOperationException">Capacity allocation failed.</exception>
        public static ParsedJson ParseJson(byte *jsonData, int length, bool reallocIfNeeded = true)
        {
            var pj = new ParsedJson();

            bool allocated = pj.AllocateCapacity((ulong)length);
            if (!allocated)
            {
                throw new InvalidOperationException("failure during memory allocation");
            }

            JsonParse(jsonData, (ulong)length, pj, reallocIfNeeded);
            return pj;
        }
Example #7
0
        /// <summary>
        /// Parses <paramref name="length"/> bytes of JSON into a freshly allocated ParsedJson.
        /// </summary>
        /// <param name="jsonData">Pointer to the JSON bytes.</param>
        /// <param name="length">Number of bytes at <paramref name="jsonData"/>.</param>
        /// <param name="reallocIfNeeded">Forwarded unchanged to <c>JsonParse</c>.</param>
        /// <returns>
        /// The document. When capacity allocation fails it is returned with
        /// <c>isvalid</c> set to false and <c>ErrorCode</c> set to <c>CAPACITY</c> instead of throwing.
        /// </returns>
        public static ParsedJson ParseJson(byte *jsonData, ulong length, bool reallocIfNeeded = true)
        {
            var pj = new ParsedJson();

            bool allocated = pj.AllocateCapacity(length);
            if (!allocated)
            {
                // Report the failure on the document itself.
                pj.isvalid   = false;
                pj.ErrorCode = JsonParseError.CAPACITY;
                return pj;
            }

            JsonParse(jsonData, length, pj, reallocIfNeeded);
            return pj;
        }
Example #8
0
        /// <summary>
        /// Runs both parsing stages over <paramref name="jsonData"/> and stores the outcome in <paramref name="pj"/>.
        /// </summary>
        /// <param name="jsonData">Pointer to the JSON bytes.</param>
        /// <param name="length">Number of bytes at <paramref name="jsonData"/>.</param>
        /// <param name="pj">Destination document; its <c>bytecapacity</c> must be at least <paramref name="length"/>.</param>
        /// <param name="reallocIfNeeded">When true, the input may be copied into a padded buffer before parsing.</param>
        /// <returns>
        /// True when both stages succeed; false when either stage fails or the padded copy cannot be allocated.
        /// </returns>
        /// <exception cref="InvalidOperationException"><paramref name="pj"/> is too small for <paramref name="length"/> bytes.</exception>
        internal static bool JsonParse(uint8_t *jsonData, size_t length, ParsedJson pj, bool reallocIfNeeded = true)
        {
            if (pj.bytecapacity < length)
            {
                throw new InvalidOperationException("Your ParsedJson cannot support documents that big: " + length);
            }

            bool reallocated = false;

            if (reallocIfNeeded)
            {
                // realloc is needed if the end of the memory crosses a page
                if ((size_t)(jsonData + length - 1) % (size_t)pagesize < SIMDJSON_PADDING)
                {
                    uint8_t *tmpbuf = jsonData;
                    jsonData = (uint8_t *)allocate_padded_buffer(length);
                    if (jsonData == null)
                    {
                        return(false);
                    }
                    memcpy(jsonData, tmpbuf, length);
                    reallocated = true;
                }
            }

            try
            {
                bool isok = stage1_find_marks.find_structural_bits(jsonData, length, pj);

                // Stage 2 only runs when stage 1 succeeded.
                if (isok)
                {
                    isok = stage2_build_tape.unified_machine(jsonData, length, pj);
                }

                return(isok);
            }
            finally
            {
                // Single release point for the padded copy, covering success,
                // failure and exceptions thrown by either stage.
                if (reallocated)
                {
                    free(jsonData);
                }
            }
        }
        /// <summary>
        /// Copies the JSON string that starts at <c>buf[offset]</c> (the opening quote) into
        /// <c>pj</c>'s string buffer one 128-bit vector at a time, resolving backslash escapes,
        /// and records the string's offset within the string buffer on the tape.
        /// </summary>
        /// <param name="buf">Input bytes. NOTE(review): every iteration loads a full vector from the
        /// current position, so the input is presumably padded past its end - confirm with callers.</param>
        /// <param name="len">Not referenced in this body.</param>
        /// <param name="pj">Receives the unescaped bytes and the tape entry.</param>
        /// <param name="depth">Not referenced in this body.</param>
        /// <param name="offset">Index of the opening quote within <paramref name="buf"/>.</param>
        /// <returns>
        /// True when the closing quote is reached; false on a bogus escape, a failed \u sequence, or
        /// (only when CHECKUNESCAPED is defined) a byte &lt;= 0x1F inside the string.
        /// </returns>
        public static bool parse_string_sse41(uint8_t *buf, size_t len, ParsedJson pj, uint32_t depth, uint32_t offset)
        {
#if SIMDJSON_SKIPSTRINGPARSING               // for performance analysis, it is sometimes useful to skip parsing
            pj.write_tape(0, '"');           // don't bother with the string parsing at all
            return(true);                    // always succeeds
#else
            uint8_t *src = &buf[offset + 1]; // we know that buf at offset is a "
            uint8_t *dst = pj.current_string_buf_loc;
#if JSON_TEST_STRINGS                        // for unit testing
            uint8_t *const start_of_string = dst;
#endif

            // Broadcast constants: backslash, double quote, and 0x1F (highest control character).
            Vector128 <byte> slashVec = Vector128.Create((byte)'\\');
            Vector128 <byte> quoteVec = Vector128.Create((byte)'"');
            Vector128 <byte> unitsep  = Vector128.Create((byte)0x1F);

            while (true)
            {
                // One bit per input byte: set where the byte is a backslash / a quote.
                Vector128 <byte> v          = Sse2.LoadVector128((src));
                uint32_t         bs_bits    = (uint32_t)Sse2.MoveMask(Sse2.CompareEqual(v, slashVec));
                uint32_t         quote_bits = (uint32_t)Sse2.MoveMask(Sse2.CompareEqual(v, quoteVec));
                // All Unicode characters may be placed within the
                // quotation marks, except for the characters that MUST be escaped:
                // quotation mark, reverse solidus, and the control characters (U+0000
                // through U+001F).
                // https://tools.ietf.org/html/rfc8259
#if CHECKUNESCAPED
                // max(0x1F, b) == 0x1F exactly when b <= 0x1F, i.e. b is a control character.
                Vector128 <byte> unescaped_vec =
                    Sse2.CompareEqual(Sse2.Max(unitsep, v), unitsep); // could do it with saturated subtraction
#endif // CHECKUNESCAPED

                // Position of the first quote / first backslash within this vector.
                uint32_t quote_dist = (uint32_t)trailingzeroes(quote_bits);
                uint32_t bs_dist    = (uint32_t)trailingzeroes(bs_bits);
                // store to dest unconditionally - we can overwrite the bits we don't like
                // later
                memcpy(dst, src, (size_t)Vector128 <byte> .Count);

                if (quote_dist < bs_dist)
                {
                    // we encountered quotes first. Move dst to point to quotes and exit
                    dst[quote_dist] = 0; // null terminate and get out

                    // The tape stores where this string begins inside the string buffer.
                    pj.WriteTape((size_t)pj.current_string_buf_loc - (size_t)pj.string_buf, (uint8_t)'"');

                    pj.current_string_buf_loc = dst + quote_dist + 1; // the +1 is due to the 0 value
#if CHECKUNESCAPED
                    // check that there is no unescaped char before the quote
                    // ((quote_bits - 1) & ~quote_bits) selects the bits below the first quote.
                    uint32_t unescaped_bits = (uint32_t)Sse2.MoveMask(unescaped_vec);
                    bool     is_ok          = ((quote_bits - 1) & (~quote_bits) & unescaped_bits) == 0;
#if JSON_TEST_STRINGS // for unit testing
                    if (is_ok)
                    {
                        foundString(buf + offset, start_of_string, pj.current_string_buf_loc - 1);
                    }
                    else
                    {
                        foundBadString(buf + offset);
                    }
#endif // JSON_TEST_STRINGS
                    return(is_ok);
#else //CHECKUNESCAPED
#if JSON_TEST_STRINGS // for unit testing
                    foundString(buf + offset, start_of_string, pj.current_string_buf_loc - 1);
#endif // JSON_TEST_STRINGS
                    return(true);
#endif //CHECKUNESCAPED
                }
                else if (quote_dist > bs_dist)
                {
                    // The byte right after the backslash selects the escape.
                    uint8_t escape_char = src[bs_dist + 1];
#if CHECKUNESCAPED
                    // we are going to need the unescaped_bits to check for unescaped chars
                    uint32_t unescaped_bits = (uint32_t)Sse2.MoveMask(unescaped_vec);
                    if (((bs_bits - 1) & (~bs_bits) & unescaped_bits) != 0)
                    {
#if JSON_TEST_STRINGS // for unit testing
                        foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
                        return(false);
                    }
#endif //CHECKUNESCAPED
                    // we encountered backslash first. Handle backslash
                    if (escape_char == 'u')
                    {
                        // move src/dst up to the start; they will be further adjusted
                        // within the unicode codepoint handling code.
                        src += bs_dist;
                        dst += bs_dist;
                        if (!handle_unicode_codepoint(&src, &dst))
                        {
#if JSON_TEST_STRINGS // for unit testing
                            foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
                            return(false);
                        }
                    }
                    else
                    {
                        // simple 1:1 conversion. Will eat bs_dist+2 characters in input and
                        // write bs_dist+1 characters to output
                        // note this may reach beyond the part of the buffer we've actually
                        // seen. I think this is ok
                        uint8_t escape_result = escape(escape_char);
                        if (escape_result == 0)
                        {
#if JSON_TEST_STRINGS // for unit testing
                            foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
                            return(false); // bogus escape value is an error
                        }

                        dst[bs_dist] = escape_result;
                        src         += bs_dist + 2;
                        dst         += bs_dist + 1;
                    }
                }
                else
                {
                    // they are the same. Since they can't co-occur, it means we encountered
                    // neither.
                    src += Vector128 <byte> .Count;
                    dst += Vector128 <byte> .Count;
#if CHECKUNESCAPED
                    // check for unescaped chars
                    if (Sse2.MoveMask(unescaped_vec) != 0)
                    {
#if JSON_TEST_STRINGS // for unit testing
                        foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
                        return(false);
                    }
#endif // CHECKUNESCAPED
                }
            }

            // can't be reached
            return(true);
#endif // SIMDJSON_SKIPSTRINGPARSING
        }
Example #10
0
        // Slow-path integer parser used by parse_number when the value is known to be
        // an integer but the fast path may have overflowed. Every multiply and add is
        // overflow-checked here.
        // Do not call this directly: it relies on validation already done by
        // parse_number and does not repeat it.
        //
        // Expected to run very rarely.
        //
        static bool parse_large_integer(uint8_t *buf, ParsedJson pj, uint32_t offset, bool found_minus)
        {
            char1 *cursor = (char1 *)(buf + offset);

            // found_minus doubles as the sign flag; step over the '-' when present.
            if (found_minus)
            {
                ++cursor;
            }

            uint64_t magnitude;

            if (*cursor == (uchar1)'0')
            {
                // A leading zero stands alone; no further digits are consumed here.
                ++cursor;
                magnitude = 0;
            }
            else
            {
                // The first character is taken as a digit without re-validation.
                magnitude = (uchar1)(*cursor - (uchar1)'0');
                cursor++;

                for (; is_integer(*cursor); ++cursor)
                {
                    uchar1 digit = (uchar1)(*cursor - (uchar1)'0');
                    if (mul_overflow(magnitude, 10, &magnitude) || add_overflow(magnitude, digit, &magnitude))
                    {
                        return(false); // overflow
                    }
                }
            }

            // A negative value may reach 0x8000000000000000 in magnitude; a
            // non-negative value must stay strictly below it.
            uint64_t limit = found_minus ? 0x8000000000000000UL : 0x7FFFFFFFFFFFFFFFUL;
            if (magnitude > limit)
            {
                return(false); // overflow
            }

            int64_t signed_answer = (int64_t)magnitude;
            if (found_minus)
            {
                signed_answer = -signed_answer;
            }

            pj.WriteTapeInt64(signed_answer);
            return(is_structural_or_whitespace((uchar1)(*cursor)) != 0);
        }
Example #11
0
        // Slow-path float parser used by parse_number when the value is known to be a
        // float but the integer accumulator may have overflowed. The value is
        // accumulated in a double from the very first digit.
        // Do not call this directly: it relies on validation already done by
        // parse_number and does not repeat it.
        //
        // Expected to run very rarely.
        //
        // Note: a redesign could avoid this function entirely.
        //
        private static bool parse_float(uint8_t *buf, ParsedJson pj, uint32_t offset, bool found_minus)
        {
            bytechar *cursor = (bytechar *)(buf + offset);

            // found_minus doubles as the sign flag; step over the '-' when present.
            if (found_minus)
            {
                ++cursor;
            }

            double value;

            if (*cursor == '0')
            {
                // A leading zero stands alone; no further integer digits are consumed.
                ++cursor;
                value = 0;
            }
            else
            {
                // The first character is taken as a digit without re-validation.
                value = (unsigned_bytechar)(*cursor - (bytechar)'0');
                cursor++;
                for (; is_integer(*cursor); ++cursor)
                {
                    unsigned_bytechar digit = (unsigned_bytechar)(*cursor - '0');
                    value = 10 * value + digit;
                }
            }

            if ('.' == *cursor)
            {
                ++cursor;

                // At least one digit must follow the decimal point.
                if (!is_integer(*cursor))
                {
#if JSON_TEST_NUMBERS // for unit testing
                    foundInvalidNumber(buf + offset);
#endif
                    return(false);
                }

                // Each fractional digit is scaled by the next negative power of ten.
                double weight = 1;
                do
                {
                    unsigned_bytechar digit = (unsigned_bytechar)(*cursor - '0');
                    ++cursor;
                    weight *= 0.1;
                    value   = value + digit * weight;
                }
                while (is_integer(*cursor));
            }

            if (('e' == *cursor) || ('E' == *cursor))
            {
                ++cursor;

                // Optional exponent sign.
                bool negexp = ('-' == *cursor);
                if (negexp || ('+' == *cursor))
                {
                    ++cursor;
                }

                if (!is_integer(*cursor))
                {
#if JSON_TEST_NUMBERS // for unit testing
                    foundInvalidNumber(buf + offset);
#endif
                    return(false);
                }

                int64_t expnumber = (unsigned_bytechar)(*cursor - '0'); // exponential part
                cursor++;

                // Up to three further exponent digits are accepted.
                for (int extra = 0; (extra < 3) && is_integer(*cursor); ++extra)
                {
                    unsigned_bytechar digit = (unsigned_bytechar)(*cursor - '0');
                    expnumber = 10 * expnumber + digit;
                    ++cursor;
                }

                if (is_integer(*cursor))
                {
// a fifth exponent digit: we refuse to parse this
#if JSON_TEST_NUMBERS // for unit testing
                    foundInvalidNumber(buf + offset);
#endif
                    return(false);
                }

                int exponent = (int)(negexp ? -expnumber : expnumber);
                if ((exponent < -308) || (exponent > 308))
                {
// outside the power_of_ten table: we refuse to parse this
#if JSON_TEST_NUMBERS // for unit testing
                    foundInvalidNumber(buf + offset);
#endif
                    return(false);
                }

                // The table is indexed with a bias of 308.
                value *= power_of_ten[308 + exponent];
            }

            // The number must be followed by a structural character or whitespace.
            if (is_not_structural_or_whitespace((byte)*cursor) != 0)
            {
                return(false);
            }

            double d = found_minus ? -value : value;
            pj.WriteTapeDouble(d);
#if JSON_TEST_NUMBERS // for unit testing
            foundFloat(d, buf + offset);
#endif
            return(is_structural_or_whitespace((byte)(*cursor)) != 0);
        }
Example #12
0
        /// <summary>
        /// Parses the JSON number located at <c>buf + offset</c> and writes it to the tape of
        /// <paramref name="pj"/> as a 64-bit integer or as a double.
        /// </summary>
        /// <remarks>
        /// Digits (including fractional ones) are accumulated into an unsigned 64-bit integer without
        /// overflow checks. The digit count is then used to detect a possible overflow, in which case
        /// parsing restarts in <c>parse_float</c> or <c>parse_large_integer</c>.
        /// </remarks>
        /// <param name="buf">Input bytes.</param>
        /// <param name="pj">Receives the parsed value.</param>
        /// <param name="offset">Index within <paramref name="buf"/> where the number (or its '-' sign) starts.</param>
        /// <param name="found_minus">True when the byte at <paramref name="offset"/> is a '-' sign to be skipped.</param>
        /// <returns>
        /// False when the text is not an acceptable number; otherwise whether the character following
        /// the number is structural or whitespace (or the result of the slow-path parser when one is used).
        /// </returns>
        internal static bool parse_number(uint8_t *buf, ParsedJson pj, uint32_t offset, bool found_minus)
        {
            char1 *p        = (char1 *)(buf + offset);
            bool   negative = false;

            if (found_minus)
            {
                ++p;
                negative = true;
                if (!is_integer(*p))
                {
                    // a negative sign must be followed by an integer
                    return(false);
                }
            }

            char1 *  startdigits = p;
            uint64_t i; // an unsigned int avoids signed overflows (which are bad)

            if (*p == (char1)'0')
            {
                // 0 cannot be followed by an integer
                ++p;
                if (is_not_structural_or_whitespace_or_exponent_or_decimal((uint8_t)(*p)))
                {
                    return(false);
                }
                i = 0;
            }
            else
            {
                if (!(is_integer(*p)))
                {
                    // must start with an integer
                    return(false);
                }

                uchar1 digit = (uchar1)(*p - (uchar1)'0');
                i = digit;
                p++;
                // the is_made_of_eight_digits_fast routine is unlikely to help here because
                // we rarely see large integer parts like 123456789
                while (is_integer(*p))
                {
                    digit = (uchar1)(*p - (uchar1)'0');
                    // a multiplication by 10 is cheaper than an arbitrary integer multiplication
                    i = 10 * i + digit; // might overflow, we will handle the overflow later
                    ++p;
                }
            }

            int64_t exponent = 0;
            bool    is_float = false;

            if ('.' == *p)
            {
                is_float = true; // At this point we know that we have a float
                // we continue with the fiction that we have an integer. If the
                // floating point number is representable as x * 10^z for some integer
                // z that fits in 53 bits, then we will be able to convert back the
                // the integer into a float in a lossless manner.
                ++p;
                char1 *firstafterperiod = p;
                if (is_integer(*p))
                {
                    uchar1 digit = (uchar1)(*p - (uchar1)'0');
                    ++p;
                    i = i * 10 + digit; // might overflow + multiplication by 10 is likely cheaper than arbitrary mult.
                    // we will handle the overflow later
                }
                else
                {
                    // a decimal point must be followed by at least one digit
                    return(false);
                }
#if SWAR_NUMBER_PARSING
                // this helps if we have lots of decimals!
                // this turns out to be frequent enough.
                if (is_made_of_eight_digits_fast(p))
                {
                    i  = i * 100000000 + parse_eight_digits_unrolled(p);
                    p += 8;
                }
#endif
                while (is_integer(*p))
                {
                    uchar1 digit = (uchar1)(*p - (uchar1)'0');
                    ++p;
                    i = i * 10 + digit; // in rare cases, this will overflow, but that's ok because we fall back to parse_float later.
                }

                // negative count of fractional digits: i must be scaled by 10^exponent
                exponent = firstafterperiod - p;
            }

            int     digitcount = (int)(p - startdigits - 1); // used later to guard against overflows
            int64_t expnumber  = 0;                          // exponential part
            if (((char1)'e' == *p) || ((char1)'E' == *p))
            {
                is_float = true;
                ++p;
                bool negexp = false;
                if ('-' == *p)
                {
                    negexp = true;
                    ++p;
                }
                else if ('+' == *p)
                {
                    ++p;
                }

                if (!is_integer(*p))
                {
                    // the exponent needs at least one digit
                    return(false);
                }

                uchar1 digit = (uchar1)(*p - (uchar1)'0');
                expnumber = digit;
                p++;
                if (is_integer(*p))
                {
                    digit     = (uchar1)(*p - (uchar1)'0');
                    expnumber = 10 * expnumber + digit;
                    ++p;
                }

                if (is_integer(*p))
                {
                    digit     = (uchar1)(*p - (uchar1)'0');
                    expnumber = 10 * expnumber + digit;
                    ++p;
                }

                if (is_integer(*p))
                {
                    // a fourth exponent digit: we refuse to parse this
                    return(false);
                }

                // combine the explicit exponent with the shift from the fractional digits
                exponent += (negexp ? -expnumber : expnumber);
            }

            if (is_float)
            {
                // index into power_of_ten, biased by 308; a negative sum wraps to a
                // huge unsigned value and is caught by the range check below
                uint64_t powerindex = (uint64_t)(308 + exponent);
                if (/*unlikely*/ ((digitcount >= 19)))
                {
                    // this is uncommon
                    // It is possible that the integer had an overflow.
                    // We have to handle the case where we have 0.0000somenumber.
                    char1 *start = startdigits;
                    while ((*start == (char1)'0') || (*start == (char1)'.'))
                    {
                        start++;
                    }

                    digitcount -= (int)(start - startdigits);
                    if (digitcount >= 19)
                    {
                        // Ok, chances are good that we had an overflow!
                        // this is almost never going to get called!!!
                        // we start anew, going slowly!!!
                        return(parse_float(buf, pj, offset,
                                           found_minus));
                    }
                }

                if (/*unlikely*/ ((powerindex > 2 * 308)))
                {
                    // this is uncommon!!!
                    // this is almost never going to get called!!!
                    // we start anew, going slowly!!!
                    return(parse_float(buf, pj, offset,
                                       found_minus));
                }

                // the sign is folded into the scale factor
                double factor = power_of_ten[powerindex];
                factor = negative ? -factor : factor;
                double d = i * factor;
                pj.WriteTapeDouble(d);
            }
            else
            {
                if (/*unlikely*/ (digitcount >= 18))
                {
                    // this is uncommon!!!
                    // there is a good chance that we had an overflow, so we
                    // need to recover: we parse the whole thing again.
                    return(parse_large_integer(buf, pj, offset,
                                               found_minus));
                }

                // unsigned negation; the cast below reinterprets it as a signed value
                i = negative ? 0 - i : i;
                pj.WriteTapeInt64((int64_t)i);
            }

            return(is_structural_or_whitespace((uint8_t)(*p)) != 0);
        }
Example #13
0
        /// <summary>
        /// Stage 1: scans <paramref name="buf"/> in 64-byte blocks and records the positions of
        /// structural and pseudo-structural characters in <c>pj.structural_indexes</c>.
        /// </summary>
        /// <param name="buf">Input bytes.</param>
        /// <param name="len">Number of bytes at <paramref name="buf"/>.</param>
        /// <param name="pj">Receives the structural indexes and their count (<c>n_structural_indexes</c>).</param>
        /// <returns>
        /// <c>CAPACITY</c> when <paramref name="len"/> exceeds <c>pj.bytecapacity</c>;
        /// <c>UNCLOSED_STRING</c> when the input ends inside a quoted string;
        /// <c>EMPTY</c> when no structural index was found;
        /// <c>UNEXPECTED_ERROR</c> when the last index lies beyond <paramref name="len"/>;
        /// <c>UNESCAPED_CHARS</c> when the error mask is non-zero;
        /// otherwise <c>SUCCESS</c> (or the UTF-8 check result when SIMDJSON_UTF8VALIDATE is defined).
        /// </returns>
        internal static JsonParseError find_structural_bits(uint8_t *buf, size_t len, ParsedJson pj)
        {
            if (len > pj.bytecapacity)
            {
                return(JsonParseError.CAPACITY);
            }

            uint32_t *base_ptr = pj.structural_indexes;
            uint32_t  @base    = 0;

#if SIMDJSON_UTF8VALIDATE
            utf8_checking_state state;
#endif

            // we have padded the input out to 64 byte multiple with the remainder being
            // zeros

            // persistent state across loop
            // does the last iteration end with an odd-length sequence of backslashes?
            // either 0 or 1, but a 64-bit value
            uint64_t prev_iter_ends_odd_backslash = 0UL;
            // does the previous iteration end inside a double-quote pair?
            uint64_t prev_iter_inside_quote = 0UL; // either all zeros or all ones
            // does the previous iteration end on something that is a predecessor of a
            // pseudo-structural character - i.e. whitespace or a structural character
            // effectively the very first char is considered to follow "whitespace" for
            // the
            // purposes of pseudo-structural character detection so we initialize to 1
            uint64_t prev_iter_ends_pseudo_pred = 1UL;

            // structurals are persistent state across loop as we flatten them on the
            // subsequent iteration into our array pointed to by base_ptr.
            // This is harmless on the first iteration as structurals==0
            // and is done for performance reasons; we can hide some of the latency of the
            // expensive carryless multiply in the previous step with this work
            uint64_t structurals = 0;

            // The loop below stops before the final block (idx < len - 64), so the last
            // block - full or partial - always goes through the padded copy further down.
            size_t   lenminus64 = len < 64 ? 0 : len - 64;
            size_t   idx        = 0;
            uint64_t error_mask = 0; // for unescaped characters within strings (ASCII code points < 0x20)

            for (; idx < lenminus64; idx += 64)
            {
                //__builtin_prefetch(buf + idx + 128);
                simd_input @in = fill_input(buf + idx);
#if SIMDJSON_UTF8VALIDATE
                check_utf8(in, state);
#endif
                // detect odd sequences of backslashes
                uint64_t odd_ends = find_odd_backslash_sequences(
                    @in, ref prev_iter_ends_odd_backslash);

                // detect insides of quote pairs ("quote_mask") and also our quote_bits
                // themselves
                uint64_t quote_bits = 0;
                uint64_t quote_mask = find_quote_mask_and_bits(
                    @in, odd_ends, ref prev_iter_inside_quote, ref quote_bits, ref error_mask);

                // take the previous iterations structural bits, not our current iteration,
                // and flatten
                flatten_bits(base_ptr, ref @base, (uint32_t)idx, structurals);

                uint64_t whitespace = 0;
                find_whitespace_and_structurals(@in, ref whitespace, ref structurals);

                // fixup structurals to reflect quotes and add pseudo-structural characters
                structurals = finalize_structurals(structurals, whitespace, quote_mask,
                                                   quote_bits, ref prev_iter_ends_pseudo_pred);
            }

            ////////////////
            // we use a giant copy-paste which is ugly.
            // but otherwise the string needs to be properly padded or else we
            // risk invalidating the UTF-8 checks.
            ////////////
            if (idx < len)
            {
                // Copy the remaining bytes into a 64-byte stack buffer pre-filled with
                // spaces (0x20) so a full block can be processed.
                uint8_t *tmpbuf = stackalloc uint8_t[64];
                memset(tmpbuf, 0x20, 64);
                memcpy(tmpbuf, buf + idx, len - idx);
                simd_input @in = fill_input(tmpbuf);
#if SIMDJSON_UTF8VALIDATE
                check_utf8 <T>(in, state);
#endif
                // detect odd sequences of backslashes
                uint64_t odd_ends = find_odd_backslash_sequences(
                    @in, ref prev_iter_ends_odd_backslash);

                // detect insides of quote pairs ("quote_mask") and also our quote_bits
                // themselves
                uint64_t quote_bits = 0;
                uint64_t quote_mask = find_quote_mask_and_bits(
                    @in, odd_ends, ref prev_iter_inside_quote, ref quote_bits, ref error_mask);

                // take the previous iterations structural bits, not our current iteration,
                // and flatten
                flatten_bits(base_ptr, ref @base, (uint)idx, structurals);

                uint64_t whitespace = 0;
                find_whitespace_and_structurals(@in, ref whitespace, ref structurals);

                // fixup structurals to reflect quotes and add pseudo-structural characters
                structurals = finalize_structurals(structurals, whitespace, quote_mask,
                                                   quote_bits, ref prev_iter_ends_pseudo_pred);
                idx += 64;
            }

            // is last string quote closed?
            if (prev_iter_inside_quote != 0)
            {
                return(JsonParseError.UNCLOSED_STRING);
            }

            // finally, flatten out the remaining structurals from the last iteration
            flatten_bits(base_ptr, ref @base, (uint)idx, structurals);

            pj.n_structural_indexes = @base;
            // a valid JSON file cannot have zero structural indexes - we should have
            // found something
            if (pj.n_structural_indexes == 0u)
            {
                return(JsonParseError.EMPTY);
            }

            // the last recorded index must not point past the end of the input
            if (base_ptr[pj.n_structural_indexes - 1] > len)
            {
                return(JsonParseError.UNEXPECTED_ERROR);
            }

            if (len != base_ptr[pj.n_structural_indexes - 1])
            {
                // the string might not be NULL terminated, but we add a virtual NULL ending
                // character.
                base_ptr[pj.n_structural_indexes++] = (uint)len;
            }

            // make it safe to dereference one beyond this array
            base_ptr[pj.n_structural_indexes] = 0;
            if (error_mask != 0)
            {
                return(JsonParseError.UNESCAPED_CHARS);
            }
#if SIMDJSON_UTF8VALIDATE
            return(check_utf8_errors(state));
#else
            return(JsonParseError.SUCCESS);
#endif
        }
Example #14
0
 // Convenience overload for callers that hold a char1 buffer: the pointer is
 // reinterpreted as uint8_t* and the call is forwarded to the byte-pointer
 // overload, whose result is returned unchanged.
 internal static JsonParseError find_structural_bits(char1 *buf, size_t len, ParsedJson pj)
 {
     uint8_t *bytes = (uint8_t *)buf;
     return find_structural_bits(bytes, len, pj);
 }
Example #15
0
        // Slow-path floating-point parser. parse_number falls back to it when the
        // value is a float whose digits may not fit its 64-bit integer accumulator;
        // here the mantissa is accumulated in a double from the very first digit.
        // Do not call this function directly: it skips some of the checks done by
        // parse_number (for instance, the first character is assumed to be a digit).
        //
        //   buf         - input bytes. The scan keeps reading until it meets a byte
        //                 that cannot continue the number, so the buffer is
        //                 presumably terminated/padded by the caller.
        //   pj          - receives the value through WriteTapeDouble on success.
        //   offset      - index in buf of the first character of the number (the
        //                 '-' sign when found_minus is true).
        //   found_minus - true when the number starts with '-'.
        //
        // Returns false, without writing to the tape, when the text is rejected:
        // no digit after '.', no digit in the exponent, more than four exponent
        // digits, a positive exponent above 308, a result that is not a finite
        // double, or a following byte that is neither structural nor whitespace.
        //
        // This function will almost never be called!!!
        //
        // Note: a redesign could avoid this function entirely.
        //
        static bool parse_float(uint8_t *buf, ParsedJson pj, uint32_t offset, bool found_minus)
        {
            char1 *p        = (char1 *)(buf + offset);
            bool   negative = false;

            if (found_minus)
            {
                ++p;
                negative = true;
            }

            /*long*/
            double i;

            if (*p == '0')
            {
                // 0 cannot be followed by an integer
                ++p;
                i = 0;
            }
            else
            {
                // integer part, accumulated directly as a double
                uchar1 digit = (uchar1)(*p - (uchar1)'0');
                i = digit;
                p++;
                while (is_integer(*p))
                {
                    digit = (uchar1)(*p - (uchar1)'0');
                    i     = 10 * i + digit;
                    ++p;
                }
            }

            if ('.' == *p)
            {
                ++p;
                // power_of_ten is indexed by 308 + exponent (see the exponent
                // handling below), so index 307 is the weight of the first
                // fractional digit, 306 of the second, and so on.
                int fractionalweight = 308;
                if (is_integer(*p))
                {
                    uchar1 digit = (uchar1)(*p - (uchar1)'0');
                    ++p;

                    fractionalweight--;
                    i = i + digit * (fractionalweight >= 0 ? power_of_ten[fractionalweight] : 0);
                }
                else
                {
                    // a decimal point must be followed by at least one digit
                    return(false);
                }

                while (is_integer(*p))
                {
                    uchar1 digit = (uchar1)(*p - (uchar1)'0');
                    ++p;
                    fractionalweight--;
                    // digits beyond the table's range contribute nothing
                    i = i + digit * (fractionalweight >= 0 ? power_of_ten[fractionalweight] : 0);
                }
            }

            if (('e' == *p) || ('E' == *p))
            {
                ++p;
                bool negexp = false;
                if ('-' == *p)
                {
                    negexp = true;
                    ++p;
                }
                else if ('+' == *p)
                {
                    ++p;
                }

                if (!is_integer(*p))
                {
                    // the exponent needs at least one digit
                    return(false);
                }

                // At most four exponent digits are accepted; the reads below are
                // unrolled on purpose.
                uchar1  digit     = (uchar1)(*p - (uchar1)'0');
                int64_t expnumber = digit; // exponential part
                p++;
                if (is_integer(*p))
                {
                    digit     = (uchar1)(*p - (uchar1)'0');
                    expnumber = 10 * expnumber + digit;
                    ++p;
                }

                if (is_integer(*p))
                {
                    digit     = (uchar1)(*p - (uchar1)'0');
                    expnumber = 10 * expnumber + digit;
                    ++p;
                }

                if (is_integer(*p))
                {
                    digit     = (uchar1)(*p - (uchar1)'0');
                    expnumber = 10 * expnumber + digit;
                    ++p;
                }

                if (is_integer(*p))
                {
                    // a fifth exponent digit: we refuse to parse this
                    return(false);
                }

                if (/*unlikely*/ (expnumber > 308))
                {
                    // C# needs unlikely!
                    // this path is unlikely
                    if (negexp)
                    {
                        // We either have zero or a subnormal.
                        // We expect this to be uncommon so we go through a slow path.
                        i = subnormal_power10(i, (int)-expnumber);
                    }
                    else
                    {
                        // We know for sure that we have a number that is too large,
                        // we refuse to parse this
                        return(false);
                    }
                }
                else
                {
                    int exponent = (int)(negexp ? -expnumber : expnumber);
                    // we have that expnumber is [0,308] so that
                    // exponent is [-308,308] so that
                    // 308 + exponent is in [0, 2 * 308]
                    i *= power_of_ten[308 + exponent];
                }
            }

            if (is_not_structural_or_whitespace((uint8_t)(*p)) != 0)
            {
                return(false);
            }

            // The mantissa accumulation or the scaling above can overflow the
            // double range (e.g. a 20-digit mantissa followed by e308). Such a
            // value cannot be represented, so refuse it rather than writing
            // Infinity/NaN to the tape.
            if (double.IsInfinity(i) || double.IsNaN(i))
            {
                return(false);
            }

            double d = negative ? -i : i;

            pj.WriteTapeDouble(d);
            return(is_structural_or_whitespace((uint8_t)(*p)) != 0);
        }
Example #16
0
        // Overflow-checked integer parser. parse_number hands a number over to
        // this routine when it is an integer with enough digits that the value
        // might not fit in a signed 64-bit integer; every step is checked here.
        // Do not call this function directly as it skips some of the checks from
        // parse_number (the first character is assumed to be a digit).
        //
        // On success the value is written with pj.WriteTapeInt64 and the return
        // value tells whether the byte following the digits is structural or
        // whitespace. false is returned, without writing, on overflow.
        //
        // This function will almost never be called!!!
        //
        static bool parse_large_integer(uint8_t *buf, ParsedJson pj, uint32_t offset, bool found_minus)
        {
            // step over the sign, when there is one
            bytechar *cursor    = (bytechar *)(buf + offset) + (found_minus ? 1 : 0);
            uint64_t  magnitude = 0;

            if (*cursor == '0')
            {
                // 0 cannot be followed by an integer
                ++cursor;
            }
            else
            {
                magnitude = (unsigned_bytechar)(*cursor - '0');
                ++cursor;
                // the is_made_of_eight_digits_fast routine is unlikely to help here because
                // we rarely see large integer parts like 123456789
                for (; is_integer(*cursor); ++cursor)
                {
                    unsigned_bytechar digit = (unsigned_bytechar)(*cursor - '0');
                    // magnitude = magnitude * 10 + digit, bailing out as soon as
                    // either step overflows 64 bits (the addition is only
                    // attempted when the multiplication succeeded)
                    if (mul_overflow(magnitude, 10, &magnitude) || add_overflow(magnitude, digit, &magnitude))
                    {
#if JSON_TEST_NUMBERS // for unit testing
                        foundInvalidNumber(buf + offset);
#endif
                        return(false); // overflow
                    }
                }
            }

            // largest magnitude that still fits in a signed 64-bit integer:
            // 2^63 for negative values, 2^63 - 1 otherwise
            uint64_t limit = found_minus ? 0x8000000000000000UL : 0x7FFFFFFFFFFFFFFFUL;
            if (magnitude > limit)
            {
                // overflows!
#if JSON_TEST_NUMBERS // for unit testing
                foundInvalidNumber(buf + offset);
#endif
                return(false); // overflow
            }

            int64_t signed_answer = found_minus ? -(int64_t)magnitude : (int64_t)magnitude;
            pj.WriteTapeInt64(signed_answer);
#if JSON_TEST_NUMBERS // for unit testing
            foundInteger(signed_answer, buf + offset);
#endif
            return(is_structural_or_whitespace((byte)(*cursor)) != 0);
        }
Example #17
0
        // Parses the JSON number that starts at buf[offset] and appends it to pj's
        // tape: WriteTapeInt64 for a plain integer, WriteTapeDouble as soon as a
        // fraction or a non-zero exponent value is present.
        //
        //   buf         - input bytes. The scan keeps reading until it meets a byte
        //                 that cannot continue the number, so the buffer is
        //                 presumably terminated/padded by the caller.
        //   pj          - parsed document receiving the value.
        //   offset      - index in buf of the first character of the number (the
        //                 '-' sign when found_minus is true).
        //   found_minus - true when the number starts with '-'.
        //
        // The fast path accumulates all mantissa digits in a 64-bit integer. When
        // the mantissa has 19 digits or more that accumulator may have wrapped, so
        // the number is parsed again from scratch by parse_float or
        // parse_large_integer.
        //
        // Returns false when the text is not an acceptable number; otherwise
        // returns whether the byte following the number is structural or
        // whitespace.
        public static bool parse_number(uint8_t *buf, ParsedJson pj, uint32_t offset, bool found_minus)
        {
            bytechar *p        = (bytechar *)(buf + offset);
            bool      negative = false;

            if (found_minus)
            {
                ++p;
                negative = true;
                if (!is_integer(*p))
                {
                    // a negative sign must be followed by an integer

                    return(false);
                }
            }

            bytechar *startdigits = p;

            int64_t i;

            if (*p == '0')
            {
                // 0 cannot be followed by an integer
                ++p;
                if (is_not_structural_or_whitespace_or_exponent_or_decimal((uint8_t)(*p)))
                {
#if JSON_TEST_NUMBERS // for unit testing
                    foundInvalidNumber(buf + offset);
#endif
                    return(false);
                }

                i = 0;
            }
            else
            {
                if (!(is_integer(*p)))
                {
                    // must start with an integer
#if JSON_TEST_NUMBERS // for unit testing
                    foundInvalidNumber(buf + offset);
#endif
                    return(false);
                }

                unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
                i = digit;
                p++;
                // the is_made_of_eight_digits_fast routine is unlikely to help here because
                // we rarely see large integer parts like 123456789
                while (is_integer(*p))
                {
                    digit = (unsigned_bytechar)(*p - '0');
                    i     = 10 * i + digit; // might overflow; long inputs are re-parsed below
                    ++p;
                }
            }

            int64_t exponent   = 0;
            bool    has_period = false;

            if ('.' == *p)
            {
                ++p;
                has_period = true;
                bytechar *firstafterperiod = p;
                if (is_integer(*p))
                {
                    unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
                    ++p;
                    i = i * 10 + digit;
                }
                else
                {
                    // a decimal point must be followed by at least one digit
#if JSON_TEST_NUMBERS // for unit testing
                    foundInvalidNumber(buf + offset);
#endif
                    return(false);
                }
#if SWAR_NUMBER_PARSING
                // this helps if we have lots of decimals!
                // this turns out to be frequent enough.
                if (is_made_of_eight_digits_fast(p))
                {
                    i  = i * 100000000 + parse_eight_digits_unrolled(p);
                    p += 8;
                    // exponent -= 8;
                }
#endif
                while (is_integer(*p))
                {
                    unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
                    ++p;
                    i = i * 10 + digit; // in rare cases, this will overflow, but that's ok because we have parse_highprecision_float later.
                }

                // minus the number of fractional digits folded into i
                exponent = firstafterperiod - p;
            }

            // Number of mantissa digits (integer + fractional part). The '.' sits
            // between startdigits and p only when a fraction was present, so it is
            // subtracted only in that case; otherwise a 19-digit mantissa followed
            // by an exponent would be under-counted and its wrapped value used.
            int digitcount = (int)(p - startdigits) - (has_period ? 1 : 0);

            int64_t expnumber = 0; // exponential part
            if (('e' == *p) || ('E' == *p))
            {
                ++p;
                bool negexp = false;
                if ('-' == *p)
                {
                    negexp = true;
                    ++p;
                }
                else if ('+' == *p)
                {
                    ++p;
                }

                if (!is_integer(*p))
                {
#if JSON_TEST_NUMBERS // for unit testing
                    foundInvalidNumber(buf + offset);
#endif
                    return(false);
                }

                unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
                expnumber = digit;
                p++;
                while (is_integer(*p))
                {
                    // Saturate rather than wrap: once the exponent is far beyond
                    // anything representable, the remaining digits are consumed
                    // without being accumulated. A wrapped value could otherwise
                    // fall back into [-308, 308] and be accepted by mistake; a
                    // saturated one is rejected by the range check below.
                    if (expnumber < 0x100000000)
                    {
                        digit     = (unsigned_bytechar)(*p - '0');
                        expnumber = 10 * expnumber + digit;
                    }

                    ++p;
                }

                exponent += (negexp ? -expnumber : expnumber);
            }

            i = negative ? -i : i;
            if ((exponent != 0) || (expnumber != 0))
            {
                if ((digitcount >= 19))
                {
                    // this is uncommon!!!
                    // this is almost never going to get called!!!
                    // we start anew, going slowly!!!
                    return(parse_float(buf, pj, offset,
                                       found_minus));
                }

                ///////////
                // We want 0.1e1 to be a float.
                //////////
                if (i == 0)
                {
                    pj.WriteTapeDouble(0.0);
#if JSON_TEST_NUMBERS // for unit testing
                    foundFloat(0.0, buf + offset);
#endif
                }
                else
                {
                    if ((exponent > 308) || (exponent < -308))
                    {
                        // we refuse to parse this
#if JSON_TEST_NUMBERS // for unit testing
                        foundInvalidNumber(buf + offset);
#endif
                        return(false);
                    }

                    // i already carries the sign; power_of_ten is indexed by
                    // 308 + exponent
                    double d = i;
                    d *= power_of_ten[308 + exponent];
                    // d = negative ? -d : d;
                    pj.WriteTapeDouble(d);
#if JSON_TEST_NUMBERS // for unit testing
                    foundFloat(d, buf + offset);
#endif
                }
            }
            else
            {
                if ((digitcount >= 19))
                {
                    // this is uncommon!!!
                    // 19 digits or more may not fit in a signed 64-bit integer
                    return(parse_large_integer(buf, pj, offset,
                                               found_minus));
                }

                pj.WriteTapeInt64(i);
#if JSON_TEST_NUMBERS // for unit testing
                foundInteger(i, buf + offset);
#endif
            }

            return(is_structural_or_whitespace((uint8_t)(*p)) != 0);
        }
Example #18
0
        internal static JsonParseError unified_machine(uint8_t *buf, size_t len, ParsedJson pj)
        {
#if !ALLOW_SAME_PAGE_BUFFER_OVERRUN
            memset((uint8_t *)buf + len, 0, SIMDJSON_PADDING); // to please valgrind
#endif
            uint32_t i = 0;                                    // index of the structural character (0,1,2,3...)
            uint32_t idx;                                      // location of the structural character in the input (buf)
            uint8_t  c = 0;                                    // used to track the (structural) character we are looking at, updated
            // by UPDATE_CHAR macro
            uint32_t depth = 0;                                // could have an arbitrary starting depth
            pj.Init();                                         // sets isvalid to false
            if (pj.bytecapacity < len)
            {
                pj.ErrorCode = JsonParseError.CAPACITY;
                return(pj.ErrorCode);
            }

            ////////////////////////////// START STATE /////////////////////////////
            pj.ret_address[depth]             = (bytechar)'s';
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.WriteTape(0, (uint8_t)'r'); // r for root, 0 is going to get overwritten
            // the root is used, if nothing else, to capture the size of the tape
            depth++;                       // everything starts at depth = 1, depth = 0 is just for the root, the root may contain an object, an array or something else.
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }

            idx = pj.structural_indexes[i++];
            c   = buf[idx]; //UPDATE_CHAR()
            switch (c)
            {
            case (uint8_t)'{':
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.ret_address[depth]             = (bytechar)'s';
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                pj.WriteTape(0, c);     // strangely, moving this to object_begin slows things down
                goto object_begin;

            case (uint8_t)'[':
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.ret_address[depth]             = (bytechar)'s';
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                goto array_begin;

                // A JSON text is a serialized value.  Note that certain previous
                // specifications of JSON constrained a JSON text to be an object or an
                // array.  Implementations that generate only objects or arrays where a
                // JSON text is called for will be interoperable in the sense that all
                // implementations will accept these as conforming JSON texts.
                // https://tools.ietf.org/html/rfc8259
#if SIMDJSON_ALLOWANYTHINGINROOT
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'t':
            {
                // we need to make a copy to make sure that the string is space terminated.
                // this only applies to the JSON document made solely of the true value.
                // this will almost never be called in practice
                bytechar *copy = (bytechar *)(allocate <bytechar>(len + SIMDJSON_PADDING));
                if (copy == null)
                {
                    goto fail;
                }

                memcpy(copy, buf, len);
                copy[len] = (bytechar)' ';
                if (!is_valid_true_atom((uint8_t *)(copy) + idx))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                pj.WriteTape(0, c);
                break;
            }

            case (uint8_t)'f':
            {
                // we need to make a copy to make sure that the string is space terminated.
                // this only applies to the JSON document made solely of the false value.
                // this will almost never be called in practice
                bytechar *copy = (bytechar *)(allocate <bytechar>(len + SIMDJSON_PADDING));
                if (copy == null)
                {
                    goto fail;
                }

                memcpy(copy, buf, len);
                copy[len] = (bytechar)' ';
                if (!is_valid_false_atom((uint8_t *)(copy) + idx))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                pj.WriteTape(0, c);
                break;
            }

            case (uint8_t)'n':
            {
                // we need to make a copy to make sure that the string is space terminated.
                // this only applies to the JSON document made solely of the null value.
                // this will almost never be called in practice
                bytechar *copy = (bytechar *)(allocate <bytechar>(len + SIMDJSON_PADDING));
                if (copy == null)
                {
                    goto fail;
                }

                memcpy(copy, buf, len);
                copy[len] = (bytechar)' ';
                if (!is_valid_null_atom((uint8_t *)(copy) + idx))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                pj.WriteTape(0, c);
                break;
            }

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            {
                // we need to make a copy to make sure that the string is space terminated.
                // this is done only for JSON documents made of a sole number
                // this will almost never be called in practice. We terminate with a space
                // because we do not want to allow NULLs in the middle of a number (whereas a
                // space in the middle of a number would be identified in stage 1).
                bytechar *copy = (bytechar *)(allocate <bytechar>(len + SIMDJSON_PADDING));
                if (copy == null)
                {
                    goto fail;
                }

                memcpy(copy, buf, len);
                copy[len] = (bytechar)' ';
                if (!parse_number((uint8_t *)(copy), pj, idx, false))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                break;
            }

            case (uint8_t)'-':
            {
                // we need to make a copy to make sure that the string is NULL terminated.
                // this is done only for JSON documents made of a sole number
                // this will almost never be called in practice
                bytechar *copy = (bytechar *)(allocate <bytechar>(len + SIMDJSON_PADDING));
                if (copy == null)
                {
                    goto fail;
                }

                memcpy(copy, buf, len);
                copy[len] = (bytechar)'\0';
                if (!parse_number((uint8_t *)(copy), pj, idx, true))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                break;
            }
#endif // ALLOWANYTHINGINROOT
            default:
                goto fail;
            }

start_continue:
            // the string might not be NULL terminated.
            if (i + 1 == pj.n_structural_indexes)
            {
                goto succeed;
            }
            else
            {
                goto fail;
            }
            ////////////////////////////// OBJECT STATES /////////////////////////////

object_begin:
            idx = pj.structural_indexes[i++];
            c   = buf[idx]; //UPDATE_CHAR()
            switch (c)
            {
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                goto object_key_state;
            }

            case (uint8_t)'}':
                goto scope_end;     // could also go to object_continue

            default:
                goto fail;
            }

object_key_state:
            idx = pj.structural_indexes[i++];
            c   = buf[idx]; //UPDATE_CHAR()
            if (c != ':')
            {
                goto fail;
            }

            idx = pj.structural_indexes[i++];
            c   = buf[idx]; //UPDATE_CHAR()
            switch (c)
            {
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'t':
                if (!is_valid_true_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'f':
                if (!is_valid_false_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'n':
                if (!is_valid_null_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            {
                if (!parse_number(buf, pj, idx, false))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'-':
            {
                if (!parse_number(buf, pj, idx, true))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'{':
            {
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     // here the compilers knows what c is so this gets optimized
                // we have not yet encountered } so we need to come back for it
                pj.ret_address[depth] = (bytechar)'o';
                // we found an object inside an object, so we need to increment the depth
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                goto object_begin;
            }

            case (uint8_t)'[':
            {
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     // here the compilers knows what c is so this gets optimized
                // we have not yet encountered } so we need to come back for it
                pj.ret_address[depth] = (bytechar)'o';
                // we found an array inside an object, so we need to increment the depth
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                goto array_begin;
            }

            default:
                goto fail;
            }

object_continue:
            idx = pj.structural_indexes[i++];
            c   = buf[idx]; //UPDATE_CHAR()
            switch (c)
            {
            case (uint8_t)',':
                idx = pj.structural_indexes[i++];
                c   = buf[idx];   //UPDATE_CHAR()
                if (c != '"')
                {
                    goto fail;
                }
                else
                {
                    if (!parse_string(buf, len, pj, depth, idx))
                    {
                        goto fail;
                    }

                    goto object_key_state;
                }

            case (uint8_t)'}':
                goto scope_end;

            default:
                goto fail;
            }

            ////////////////////////////// COMMON STATE /////////////////////////////

scope_end:
            // write our tape location to the header scope
            depth--;
            pj.WriteTape(pj.containing_scope_offset[depth], c);
            pj.AnnotatePreviousLoc(pj.containing_scope_offset[depth],
                                   pj.CurrentLoc);
            if (pj.ret_address[depth] == 'a')
            {
                goto array_continue;
            }
            else if (pj.ret_address[depth] == 'o')
            {
                goto object_continue;
            }
            else
            {
                goto start_continue;
            }

            ////////////////////////////// ARRAY STATES /////////////////////////////
array_begin:
            idx = pj.structural_indexes[i++];
            c   = buf[idx]; //UPDATE_CHAR()
            if (c == (uint8_t)']')
            {
                goto scope_end; // could also go to array_continue
            }

main_array_switch:
            // we call update char on all paths in, so we can peek at c on the
            // on paths that can accept a close square brace (post-, and at start)
            switch (c)
            {
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'t':
                if (!is_valid_true_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'f':
                if (!is_valid_false_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'n':
                if (!is_valid_null_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;     // goto array_continue;

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            {
                if (!parse_number(buf, pj, idx, false))
                {
                    goto fail;
                }

                break;     // goto array_continue;
            }

            case (uint8_t)'-':
            {
                if (!parse_number(buf, pj, idx, true))
                {
                    goto fail;
                }

                break;     // goto array_continue;
            }

            case (uint8_t)'{':
            {
                // we have not yet encountered ] so we need to come back for it
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     //  here the compilers knows what c is so this gets optimized
                pj.ret_address[depth] = (bytechar)'a';
                // we found an object inside an array, so we need to increment the depth
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                goto object_begin;
            }

            case (uint8_t)'[':
            {
                // we have not yet encountered ] so we need to come back for it
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     // here the compilers knows what c is so this gets optimized
                pj.ret_address[depth] = (bytechar)'a';
                // we found an array inside an array, so we need to increment the depth
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                goto array_begin;
            }

            default:
                goto fail;
            }

array_continue:
            idx = pj.structural_indexes[i++];
            c   = buf[idx]; //UPDATE_CHAR()
            switch (c)
            {
            case (uint8_t)',':
                idx = pj.structural_indexes[i++];
                c   = buf[idx];   //UPDATE_CHAR()
                goto main_array_switch;

            case (uint8_t)']':
                goto scope_end;

            default:
                goto fail;
            }

            ////////////////////////////// FINAL STATES /////////////////////////////

succeed:
            depth--;
            if (depth != 0)
            {
                throw new InvalidOperationException("internal bug");
                //abort();
            }

            if (pj.containing_scope_offset[depth] != 0)
            {
                throw new InvalidOperationException("internal bug");
                //abort();
            }

            pj.AnnotatePreviousLoc(pj.containing_scope_offset[depth], pj.CurrentLoc);
            pj.WriteTape(pj.containing_scope_offset[depth], (uint8_t)'r');  // r is root

            pj.isvalid   = true;
            pj.ErrorCode = JsonParseError.SUCCESS;
            return(pj.ErrorCode);

fail:
            // we do not need the next line because this is done by pj.init(), pessimistically.
            // pj.isvalid  = false;
            // At this point in the code, we have all the time in the world.
            // Note that we know exactly where we are in the document so we could,
            // without any overhead on the processing code, report a specific location.
            // We could even trigger special code paths to assess what happened carefully,
            // all without any added cost.
            if (depth >= pj.depthcapacity)
            {
                pj.ErrorCode = JsonParseError.DEPTH_ERROR;
                return(pj.ErrorCode);
            }

            switch (c)
            {
            case (uint8_t)'"':
                pj.ErrorCode = JsonParseError.STRING_ERROR;
                return(pj.ErrorCode);

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            case (uint8_t)'-':
                pj.ErrorCode = JsonParseError.NUMBER_ERROR;
                return(pj.ErrorCode);

            case (uint8_t)'t':
                pj.ErrorCode = JsonParseError.T_ATOM_ERROR;
                return(pj.ErrorCode);

            case (uint8_t)'n':
                pj.ErrorCode = JsonParseError.N_ATOM_ERROR;
                return(pj.ErrorCode);

            case (uint8_t)'f':
                pj.ErrorCode = JsonParseError.F_ATOM_ERROR;
                return(pj.ErrorCode);

            default:
                break;
            }

            pj.ErrorCode = JsonParseError.TAPE_ERROR;
            return(pj.ErrorCode);
        }
Example #19
0
 /// <summary>
 /// Convenience overload: reinterprets a <c>bytechar</c> buffer as a
 /// <c>uint8_t</c> buffer and forwards to the <c>uint8_t*</c> overload.
 /// </summary>
 /// <param name="buf">Input buffer, passed through after a pointer cast.</param>
 /// <param name="len">Length argument, passed through unchanged.</param>
 /// <param name="pj">Parse state, passed through unchanged.</param>
 /// <returns>Whatever the forwarded call returns.</returns>
 internal static JsonParseError unified_machine(bytechar *buf, size_t len, ParsedJson pj)
 {
     uint8_t *bytes = (uint8_t *)(buf);
     return unified_machine(bytes, len, pj);
 }
Example #20
0
 /// <summary>
 /// Verifies that <paramref name="pj"/> has enough byte capacity for a
 /// document of <paramref name="length"/> bytes.
 /// </summary>
 /// <returns><c>JsonParseError.CAPACITY</c> when the document is too large.</returns>
 /// <remarks>
 /// NOTE(review): only the capacity guard is present in this excerpt; no value
 /// is returned when the guard does not fire, and <paramref name="jsonData"/>
 /// and <paramref name="reallocIfNeeded"/> are not used here. Confirm against
 /// the full implementation.
 /// </remarks>
 internal static JsonParseError JsonParse(byte *jsonData, UInt64 length, ParsedJson pj, bool reallocIfNeeded = true)
 {
     // Guard: the document must fit in the capacity reserved by pj.
     bool exceedsCapacity = length > pj.bytecapacity;
     if (exceedsCapacity)
     {
         return JsonParseError.CAPACITY;
     }
 }
        /// <summary>
        /// Goto-driven state machine that walks the structural characters listed in
        /// <c>pj.structural_indexes</c>, checks that they form a well-formed JSON
        /// document (objects, arrays, strings, numbers and the true/false/null atoms)
        /// and records each element through <c>pj.WriteTape</c>.
        /// </summary>
        /// <param name="buf">Input bytes; indexed by the values stored in <c>pj.structural_indexes</c>.</param>
        /// <param name="len">Length of the input; must not exceed <c>pj.bytecapacity</c>.</param>
        /// <param name="pj">Parse state. It is reset with <c>pj.Init()</c> and, on success, marked <c>isvalid</c>.</param>
        /// <returns>
        /// <c>true</c> when the whole document was accepted; <c>false</c> when the capacity is
        /// insufficient, the nesting depth reaches <c>pj.depthcapacity</c>, or the input is rejected.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// Thrown on the success path if the depth / scope bookkeeping is inconsistent ("internal bug").
        /// </exception>
        /// <remarks>
        /// <c>depth</c> is used as a raw index into <c>pj.ret_address</c> and
        /// <c>pj.containing_scope_offset</c>. Those buffers are assumed to hold
        /// <c>pj.depthcapacity</c> entries (TODO confirm against the allocation code), so
        /// every depth check rejects <c>depth == pj.depthcapacity</c> before that value can
        /// be used as an index.
        /// </remarks>
        internal static bool unified_machine(uint8_t *buf, size_t len, ParsedJson pj)
        {
            uint32_t i = 0;     // index of the structural character (0,1,2,3...)
            uint32_t idx;       // location of the structural character in the input (buf)
            uint8_t  c;         // used to track the (structural) character we are looking at, updated
            // by UPDATE_CHAR macro
            uint32_t depth = 0; // could have an arbitrary starting depth

            pj.Init();
            if (pj.bytecapacity < len)
            {
                Debug.Write("insufficient capacity\n");
                return(false);
            }

            // this macro reads the next structural character, updating idx, i and c.
            //C#: expanded directly everywhere
            //void UPDATE_CHAR()
            //{
            //    idx = pj.structural_indexes[i++];
            //    c = buf[idx];
            //}

            pj.ret_address[depth]             = (bytechar)'s';
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.WriteTape(0, (byte)'r'); // r for root, 0 is going to get overwritten
            // the root is used, if nothing else, to capture the size of the tape
            depth++;                    // everything starts at depth = 1, depth = 0 is just for the root, the root may contain an object, an array or something else.
            // depth becomes an index below, so depth == depthcapacity must be rejected here
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }


            //UPDATE_CHAR():
            idx = pj.structural_indexes[i++];
            c   = buf[idx];

            switch (c)
            {
            case (uint8_t)'{':
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.ret_address[depth]             = (bytechar)'s';
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                pj.WriteTape(0, c);     // strangely, moving this to object_begin slows things down
                goto object_begin;

            case (uint8_t)'[':
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.ret_address[depth]             = (bytechar)'s';
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                goto array_begin;

                // A JSON text is a serialized value.  Note that certain previous
                // specifications of JSON constrained a JSON text to be an object or an
                // array.  Implementations that generate only objects or arrays where a
                // JSON text is called for will be interoperable in the sense that all
                // implementations will accept these as conforming JSON texts.
                // https://tools.ietf.org/html/rfc8259
#if SIMDJSON_ALLOWANYTHINGINROOT
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'t':
            {
                // we need to make a copy to make sure that the string is NULL terminated.
                // this only applies to the JSON document made solely of the true value.
                // this will almost never be called in practice
                bytechar *copy = allocate <bytechar>(len + SIMDJSON_PADDING);
                memcpy(copy, buf, len);
                copy[len] = (bytechar)'\0';
                if (!is_valid_true_atom((uint8_t *)copy + idx))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                pj.WriteTape(0, c);
                break;
            }

            case (uint8_t)'f':
            {
                // we need to make a copy to make sure that the string is NULL terminated.
                // this only applies to the JSON document made solely of the false value.
                // this will almost never be called in practice
                bytechar *copy = allocate <bytechar>(len + SIMDJSON_PADDING);
                memcpy(copy, buf, len);
                copy[len] = (bytechar)'\0';
                if (!is_valid_false_atom((uint8_t *)copy + idx))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                pj.WriteTape(0, c);
                break;
            }

            case (uint8_t)'n':
            {
                // we need to make a copy to make sure that the string is NULL terminated.
                // this only applies to the JSON document made solely of the null value.
                // this will almost never be called in practice
                bytechar *copy = allocate <bytechar>(len + SIMDJSON_PADDING);
                memcpy(copy, buf, len);
                copy[len] = (bytechar)'\0';
                if (!is_valid_null_atom((uint8_t *)copy + idx))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                pj.WriteTape(0, c);
                break;
            }

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            {
                // we need to make a copy to make sure that the string is NULL terminated.
                // this is done only for JSON documents made of a sole number
                // this will almost never be called in practice
                bytechar *copy = allocate <bytechar>(len + SIMDJSON_PADDING);
                memcpy(copy, buf, len);
                copy[len] = (bytechar)'\0';
                if (!parse_number((uint8_t *)copy, pj, idx, false))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                break;
            }

            case (uint8_t)'-':
            {
                // we need to make a copy to make sure that the string is NULL terminated.
                // this is done only for JSON documents made of a sole number
                // this will almost never be called in practice
                bytechar *copy = allocate <bytechar>(len + SIMDJSON_PADDING);
                memcpy(copy, buf, len);
                copy[len] = (bytechar)'\0';
                if (!parse_number((uint8_t *)copy, pj, idx, true))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                break;
            }
#endif // ALLOWANYTHINGINROOT
            default:
                goto fail;
            }

start_continue:
            // the string might not be NULL terminated.
            // success requires that exactly one structural index is left unread
            if (i + 1 == pj.n_structural_indexes)
            {
                goto succeed;
            }
            else
            {
                goto fail;
            }
            ////////////////////////////// OBJECT STATES /////////////////////////////

object_begin:
            //UPDATE_CHAR():
            idx = pj.structural_indexes[i++];
            c   = buf[idx];
            switch (c)
            {
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                goto object_key_state;
            }

            case (uint8_t)'}':
                goto scope_end;     // could also go to object_continue

            default:
                goto fail;
            }

object_key_state:
            // a key must be followed by ':' and then by a value
            //UPDATE_CHAR():
            idx = pj.structural_indexes[i++];
            c   = buf[idx];
            if (c != ':')
            {
                goto fail;
            }

            //UPDATE_CHAR():
            idx = pj.structural_indexes[i++];
            c   = buf[idx];
            switch (c)
            {
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'t':
                if (!is_valid_true_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'f':
                if (!is_valid_false_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'n':
                if (!is_valid_null_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            {
                if (!parse_number(buf, pj, idx, false))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'-':
            {
                if (!parse_number(buf, pj, idx, true))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'{':
            {
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     // here the compilers knows what c is so this gets optimized
                // we have not yet encountered } so we need to come back for it
                pj.ret_address[depth] = (bytechar)'o';
                // we found an object inside an object, so we need to increment the depth
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                goto object_begin;
            }

            case (uint8_t)'[':
            {
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     // here the compilers knows what c is so this gets optimized
                // we have not yet encountered } so we need to come back for it
                pj.ret_address[depth] = (bytechar)'o';
                // we found an array inside an object, so we need to increment the depth
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                goto array_begin;
            }

            default:
                goto fail;
            }

object_continue:
            //UPDATE_CHAR():
            idx = pj.structural_indexes[i++];
            c   = buf[idx];
            switch (c)
            {
            case (uint8_t)',':
                // after a comma only another key (a string) is acceptable
                //UPDATE_CHAR():
                idx = pj.structural_indexes[i++];
                c   = buf[idx];
                if (c != (uint8_t)'"')
                {
                    goto fail;
                }
                else
                {
                    if (!parse_string(buf, len, pj, depth, idx))
                    {
                        goto fail;
                    }

                    goto object_key_state;
                }

            case (uint8_t)'}':
                goto scope_end;

            default:
                goto fail;
            }

            ////////////////////////////// COMMON STATE /////////////////////////////

scope_end:
            // write our tape location to the header scope
            depth--;
            pj.WriteTape(pj.containing_scope_offset[depth], c);
            pj.AnnotatePreviousLoc(pj.containing_scope_offset[depth],
                                   pj.CurrentLoc);
            // goto saved_state: 'a' = inside an array, 'o' = inside an object, anything else = root
            if (pj.ret_address[depth] == (uint8_t)'a')
            {
                goto array_continue;
            }
            else if (pj.ret_address[depth] == (uint8_t)'o')
            {
                goto object_continue;
            }
            else
            {
                goto start_continue;
            }

            ////////////////////////////// ARRAY STATES /////////////////////////////
array_begin:
            //UPDATE_CHAR():
            idx = pj.structural_indexes[i++];
            c   = buf[idx];
            if (c == ']')
            {
                goto scope_end; // could also go to array_continue
            }

main_array_switch:
            // we call update char on all paths in, so we can peek at c on the
            // on paths that can accept a close square brace (post-, and at start)
            switch (c)
            {
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'t':
                if (!is_valid_true_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'f':
                if (!is_valid_false_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'n':
                if (!is_valid_null_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;     // goto array_continue;

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            {
                if (!parse_number(buf, pj, idx, false))
                {
                    goto fail;
                }

                break;     // goto array_continue;
            }

            case (uint8_t)'-':
            {
                if (!parse_number(buf, pj, idx, true))
                {
                    goto fail;
                }

                break;     // goto array_continue;
            }

            case (uint8_t)'{':
            {
                // we have not yet encountered ] so we need to come back for it
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     //  here the compilers knows what c is so this gets optimized
                pj.ret_address[depth] = (bytechar)'a';
                // we found an object inside an array, so we need to increment the depth
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                goto object_begin;
            }

            case (uint8_t)'[':
            {
                // we have not yet encountered ] so we need to come back for it
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     // here the compilers knows what c is so this gets optimized
                pj.ret_address[depth] = (bytechar)'a';
                // we found an array inside an array, so we need to increment the depth
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                goto array_begin;
            }

            default:
                goto fail;
            }

array_continue:
            //UPDATE_CHAR():
            idx = pj.structural_indexes[i++];
            c   = buf[idx];
            switch (c)
            {
            case (uint8_t)',':
                //UPDATE_CHAR():
                idx = pj.structural_indexes[i++];
                c   = buf[idx];
                goto main_array_switch;

            case (uint8_t)']':
                goto scope_end;

            default:
                goto fail;
            }


            ////////////////////////////// FINAL STATES /////////////////////////////

succeed:
            // back at the root: depth must be 0 and the root scope must start at tape offset 0
            depth--;
            if (depth != 0)
            {
                throw new InvalidOperationException("internal bug");
            }

            if (pj.containing_scope_offset[depth] != 0)
            {
                throw new InvalidOperationException("internal bug");
            }

            pj.AnnotatePreviousLoc(pj.containing_scope_offset[depth], pj.CurrentLoc);
            pj.WriteTape(pj.containing_scope_offset[depth], (byte)'r');  // r is root
            pj.isvalid = true;
            return(true);



fail:
            return(false);
        }
Example #22
0
        internal static bool find_structural_bits(uint8_t* buf, size_t len, ParsedJson pj)
        {
            if (len > pj.bytecapacity)
            {
                Console.WriteLine("Your ParsedJson object only supports documents up to " + pj.bytecapacity +
                                  " bytes but you are trying to process " + len + " bytes\n");
                return false;
            }

            uint32_t* base_ptr = pj.structural_indexes;
            uint32_t @base = 0;
#if SIMDJSON_UTF8VALIDATE // NOT TESTED YET!
            var has_error = Vector256<byte>.Zero;
            var previous = new avx_processed_utf_bytes();
            previous.rawbytes = Vector256<byte>.Zero;
            previous.high_nibbles = Vector256<byte>.Zero;
            previous.carried_continuations = Vector256<byte>.Zero;
            var highbit = Vector256.Create((byte)0x80);
#endif

            const uint64_t even_bits = 0x5555555555555555UL;
            const uint64_t odd_bits = ~even_bits;

            // for now, just work in 64-byte chunks
            // we have padded the input out to 64 byte multiple with the remainder being
            // zeros

            // persistent state across loop
            uint64_t prev_iter_ends_odd_backslash = 0UL; // either 0 or 1, but a 64-bit value
            uint64_t prev_iter_inside_quote = 0UL; // either all zeros or all ones

            // effectively the very first char is considered to follow "whitespace" for the
            // purposes of psuedo-structural character detection
            uint64_t prev_iter_ends_pseudo_pred = 1UL;
            size_t lenminus64 = len < 64 ? 0 : len - 64;
            size_t idx = 0;
            uint64_t structurals = 0;

            // C#: assign static readonly fields to locals before the loop
            Vector256<byte> low_nibble_mask = s_low_nibble_mask;
            Vector256<byte> high_nibble_mask = s_high_nibble_mask;
            Vector256<byte> utf8ValidVec = s_utf8ValidVec;

            var structural_shufti_mask = Vector256.Create((byte)0x7);
            var whitespace_shufti_mask = Vector256.Create((byte)0x18);
            var slashVec = Vector256.Create((bytechar) '\\').AsByte();
            var ffVec = Vector128.Create((byte) 0xFF).AsUInt64();
            var doubleQuoteVec = Vector256.Create((byte)'"');
            var zeroBVec = Vector256.Create((byte) 0);
            var vec7f = Vector256.Create((byte) 0x7f);

            for (; idx < lenminus64; idx += 64)
            {
                var input_lo = Avx.LoadVector256(buf + idx + 0);
                var input_hi = Avx.LoadVector256(buf + idx + 32);
#if SIMDJSON_UTF8VALIDATE // NOT TESTED YET!
                if ((Avx.TestZ(Avx2.Or(input_lo, input_hi), highbit)) == true)
                {
                    // it is ascii, we just check continuation
                    has_error = Avx2.Or(
                        Avx2.CompareGreaterThan(previous.carried_continuations.AsSByte(), utf8ValidVec, has_error);

                }
                else
                {
                    // it is not ascii so we have to do heavy work
                    previous = Utf8Validation.avxcheckUTF8Bytes(input_lo, ref previous, ref has_error);
                    previous = Utf8Validation.avxcheckUTF8Bytes(input_hi, ref previous, ref has_error);
                }
#endif

                ////////////////////////////////////////////////////////////////////////////////////////////
                //     Step 1: detect odd sequences of backslashes
                ////////////////////////////////////////////////////////////////////////////////////////////
                /// 
                uint64_t bs_bits =
                    cmp_mask_against_input(input_lo, input_hi, slashVec);
                uint64_t start_edges = bs_bits & ~(bs_bits << 1);
                // flip lowest if we have an odd-length run at the end of the prior
                // iteration
                uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
                uint64_t even_starts = start_edges & even_start_mask;
                uint64_t odd_starts = start_edges & ~even_start_mask;
                uint64_t even_carries = bs_bits + even_starts;
                uint64_t odd_carries;
                // must record the carry-out of our odd-carries out of bit 63; this
                // indicates whether the sense of any edge going to the next iteration
                // should be flipped
                bool iter_ends_odd_backslash =
                    add_overflow(bs_bits, odd_starts, &odd_carries);

                odd_carries |=
                    prev_iter_ends_odd_backslash; // push in bit zero as a potential end
                // if we had an odd-numbered run at the
                // end of the previous iteration
                prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1UL : 0x0UL;
                uint64_t even_carry_ends = even_carries & ~bs_bits;
                uint64_t odd_carry_ends = odd_carries & ~bs_bits;
                uint64_t even_start_odd_end = even_carry_ends & odd_bits;
                uint64_t odd_start_even_end = odd_carry_ends & even_bits;
                uint64_t odd_ends = even_start_odd_end | odd_start_even_end;

                ////////////////////////////////////////////////////////////////////////////////////////////
                //     Step 2: detect insides of quote pairs
                ////////////////////////////////////////////////////////////////////////////////////////////

                uint64_t quote_bits =
                    cmp_mask_against_input(input_lo, input_hi, doubleQuoteVec);
                quote_bits = quote_bits & ~odd_ends;
                uint64_t quote_mask = Sse2.X64.ConvertToUInt64(Pclmulqdq.CarrylessMultiply(
                    Vector128.Create(quote_bits, 0UL /*C# reversed*/), ffVec, 0));

                uint32_t cnt = (uint32_t) hamming(structurals);
                uint32_t next_base = @base + cnt;
                while (structurals != 0)
                {
                    base_ptr[@base + 0] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 1] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 2] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 3] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 4] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 5] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 6] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 7] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    @base += 8;
                }

                @base = next_base;

                quote_mask ^= prev_iter_inside_quote;
                prev_iter_inside_quote =
                    (uint64_t) ((int64_t) quote_mask >>
                                63); // right shift of a signed value expected to be well-defined and standard compliant as of C++20, John Regher from Utah U. says this is fine code



                var v_lo = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_lo),
                    Avx2.Shuffle(high_nibble_mask,
                        Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(),
                            vec7f)));

                var v_hi = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_hi),
                    Avx2.Shuffle(high_nibble_mask,
                        Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(),
                            vec7f)));
                var tmp_lo = Avx2.CompareEqual(
                    Avx2.And(v_lo, structural_shufti_mask), zeroBVec);
                var tmp_hi = Avx2.CompareEqual(
                    Avx2.And(v_hi, structural_shufti_mask), zeroBVec);

                uint64_t structural_res_0 = (uint32_t) Avx2.MoveMask(tmp_lo);
                uint64_t structural_res_1 = (uint64_t) Avx2.MoveMask(tmp_hi);
                structurals = ~(structural_res_0 | (structural_res_1 << 32));

                var tmp_ws_lo = Avx2.CompareEqual(
                    Avx2.And(v_lo, whitespace_shufti_mask), zeroBVec);
                var tmp_ws_hi = Avx2.CompareEqual(
                    Avx2.And(v_hi, whitespace_shufti_mask), zeroBVec);

                uint64_t ws_res_0 = (uint32_t) Avx2.MoveMask(tmp_ws_lo);
                uint64_t ws_res_1 = (uint64_t) Avx2.MoveMask(tmp_ws_hi);
                uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));


                // mask off anything inside quotes
                structurals &= ~quote_mask;

                // add the real quote bits back into our bitmask as well, so we can
                // quickly traverse the strings we've spent all this trouble gathering
                structurals |= quote_bits;

                // Now, establish "pseudo-structural characters". These are non-whitespace
                // characters that are (a) outside quotes and (b) have a predecessor that's
                // either whitespace or a structural character. This means that subsequent
                // passes will get a chance to encounter the first character of every string
                // of non-whitespace and, if we're parsing an atom like true/false/null or a
                // number we can stop at the first whitespace or structural character
                // following it.

                // a qualified predecessor is something that can happen 1 position before an
                // psuedo-structural character
                uint64_t pseudo_pred = structurals | whitespace;
                uint64_t shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
                prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
                uint64_t pseudo_structurals =
                    shifted_pseudo_pred & (~whitespace) & (~quote_mask);
                structurals |= pseudo_structurals;

                // now, we've used our close quotes all we need to. So let's switch them off
                // they will be off in the quote mask and on in quote bits.
                structurals &= ~(quote_bits & ~quote_mask);

                //Console.WriteLine($"Iter: {idx}, satur: {structurals}");

                //*(uint64_t *)(pj.structurals + idx / 8) = structurals;
            }

            ////////////////
            /// we use a giant copy-paste which is ugly.
            /// but otherwise the string needs to be properly padded or else we
            /// risk invalidating the UTF-8 checks.
            ////////////
            if (idx < len)
            {
                uint8_t* tmpbuf = stackalloc uint8_t[64];
                memset(tmpbuf, 0x20, 64);
                memcpy(tmpbuf, buf + idx, len - idx);
                Vector256<byte> input_lo = Avx.LoadVector256(tmpbuf + 0);
                Vector256<byte> input_hi = Avx.LoadVector256(tmpbuf + 32);
#if SIMDJSON_UTF8VALIDATE // NOT TESTED YET!
                var highbit = Vector256.Create((byte)0x80);
                if ((Avx.TestZ(Avx2.Or(input_lo, input_hi), highbit)) == true)
                {
                    // it is ascii, we just check continuation
                    has_error = Avx2.Or(
                      Avx2.CompareGreaterThan(previous.carried_continuations.AsSByte(),
                                      utf8ValidVec).AsByte(), has_error);

                }
                else
                {
                    // it is not ascii so we have to do heavy work
                    previous = Utf8Validation.avxcheckUTF8Bytes(input_lo, ref previous, ref has_error);
                    previous = Utf8Validation.avxcheckUTF8Bytes(input_hi, ref previous, ref has_error);
                }
#endif
                ////////////////////////////////////////////////////////////////////////////////////////////
                //     Step 1: detect odd sequences of backslashes
                ////////////////////////////////////////////////////////////////////////////////////////////

                uint64_t bs_bits =
                    cmp_mask_against_input(input_lo, input_hi, slashVec);
                uint64_t start_edges = bs_bits & ~(bs_bits << 1);
                // flip lowest if we have an odd-length run at the end of the prior
                // iteration
                uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
                uint64_t even_starts = start_edges & even_start_mask;
                uint64_t odd_starts = start_edges & ~even_start_mask;
                uint64_t even_carries = bs_bits + even_starts;

                uint64_t odd_carries;
                // must record the carry-out of our odd-carries out of bit 63; this
                // indicates whether the sense of any edge going to the next iteration
                // should be flipped
                //bool iter_ends_odd_backslash =
                add_overflow(bs_bits, odd_starts, &odd_carries);

                odd_carries |=
                    prev_iter_ends_odd_backslash; // push in bit zero as a potential end
                // if we had an odd-numbered run at the
                // end of the previous iteration
                //prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
                uint64_t even_carry_ends = even_carries & ~bs_bits;
                uint64_t odd_carry_ends = odd_carries & ~bs_bits;
                uint64_t even_start_odd_end = even_carry_ends & odd_bits;
                uint64_t odd_start_even_end = odd_carry_ends & even_bits;
                uint64_t odd_ends = even_start_odd_end | odd_start_even_end;

                ////////////////////////////////////////////////////////////////////////////////////////////
                //     Step 2: detect insides of quote pairs
                ////////////////////////////////////////////////////////////////////////////////////////////

                uint64_t quote_bits =
                    cmp_mask_against_input(input_lo, input_hi, doubleQuoteVec);
                quote_bits = quote_bits & ~odd_ends;
                uint64_t quote_mask = (uint64_t)Sse2.X64.ConvertToInt64(Pclmulqdq.CarrylessMultiply(
                    Vector128.Create(quote_bits, 0UL /*C# reversed*/), ffVec, 0).AsInt64());
                quote_mask ^= prev_iter_inside_quote;

                //BUG? https://github.com/dotnet/coreclr/issues/22813
                //quote_mask = 60;
                //prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63); // right shift of a signed value expected to be well-defined and standard compliant as of C++20

                uint32_t cnt = (uint32_t)hamming(structurals);
                uint32_t next_base = @base + cnt;
                while (structurals != 0)
                {
                    base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    @base += 8;
                }
                @base = next_base;
                // How do we build up a user traversable data structure
                // first, do a 'shufti' to detect structural JSON characters
                // they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c
                // these go into the first 3 buckets of the comparison (1/2/4)

                // we are also interested in the four whitespace characters
                // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
                // these go into the next 2 buckets of the comparison (8/16)

                var v_lo = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_lo),
                    Avx2.Shuffle(high_nibble_mask,
                        Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(),
                            vec7f)));

                var v_hi = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_hi),
                    Avx2.Shuffle(high_nibble_mask,
                        Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(),
                            vec7f)));
                var tmp_lo = Avx2.CompareEqual(
                    Avx2.And(v_lo, structural_shufti_mask), zeroBVec);
                var tmp_hi = Avx2.CompareEqual(
                    Avx2.And(v_hi, structural_shufti_mask), zeroBVec);

                uint64_t structural_res_0 = (uint32_t)Avx2.MoveMask(tmp_lo);
                uint64_t structural_res_1 = (uint64_t)Avx2.MoveMask(tmp_hi);
                structurals = ~(structural_res_0 | (structural_res_1 << 32));

                // this additional mask and transfer is non-trivially expensive,
                // unfortunately
                var tmp_ws_lo = Avx2.CompareEqual(
                    Avx2.And(v_lo, whitespace_shufti_mask), zeroBVec);
                var tmp_ws_hi = Avx2.CompareEqual(
                    Avx2.And(v_hi, whitespace_shufti_mask), zeroBVec);

                uint64_t ws_res_0 = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
                uint64_t ws_res_1 = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
                uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));


                // mask off anything inside quotes
                structurals &= ~quote_mask;

                // add the real quote bits back into our bitmask as well, so we can
                // quickly traverse the strings we've spent all this trouble gathering
                structurals |= quote_bits;

                // Now, establish "pseudo-structural characters". These are non-whitespace
                // characters that are (a) outside quotes and (b) have a predecessor that's
                // either whitespace or a structural character. This means that subsequent
                // passes will get a chance to encounter the first character of every string
                // of non-whitespace and, if we're parsing an atom like true/false/null or a
                // number we can stop at the first whitespace or structural character
                // following it.

                // a qualified predecessor is something that can occur 1 position before a
                // pseudo-structural character
                uint64_t pseudo_pred = structurals | whitespace;
                uint64_t shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
                prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
                uint64_t pseudo_structurals =
                    shifted_pseudo_pred & (~whitespace) & (~quote_mask);
                structurals |= pseudo_structurals;

                // now, we've used our close quotes all we need to. So let's switch them off
                // they will be off in the quote mask and on in quote bits.
                structurals &= ~(quote_bits & ~quote_mask);
                //*(uint64_t *)(pj.structurals + idx / 8) = structurals;
                idx += 64;
            }
            uint32_t cnt2 = (uint32_t)hamming(structurals);
            uint32_t next_base2 = @base + cnt2;
            while (structurals != 0)
            {
                base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                @base += 8;
            }
            @base = next_base2;

            pj.n_structural_indexes = @base;
            if (base_ptr[pj.n_structural_indexes - 1] > len)
            {
                throw new InvalidOperationException("Internal bug");
            }
            if (len != base_ptr[pj.n_structural_indexes - 1])
            {
                // the string might not be NULL terminated, but we add a virtual NULL ending character. 
                base_ptr[pj.n_structural_indexes++] = (uint32_t)len;
            }
            base_ptr[pj.n_structural_indexes] = 0; // make it safe to dereference one beyond this array

#if SIMDJSON_UTF8VALIDATE // NOT TESTED YET!
            return Avx.TestZ(has_error, has_error);
#else
            return true;
#endif
        }
Example #23
0
        /// <summary>
        /// Decodes the JSON string whose opening quote is at <c>buf[offset]</c> into the string
        /// buffer owned by <paramref name="pj"/> and records it on the tape.
        /// </summary>
        /// <remarks>
        /// Layout written at <c>pj.current_string_buf_loc</c>: a <c>uint32_t</c> length prefix,
        /// then the unescaped bytes, then a single 0 terminator. On success
        /// <c>pj.current_string_buf_loc</c> is advanced to just past that terminator.
        /// </remarks>
        /// <param name="buf">Input bytes; <c>buf[offset]</c> is expected to be the opening quote.</param>
        /// <param name="len">Not used by this method.</param>
        /// <param name="pj">Receives the tape entry and the decoded string.</param>
        /// <param name="depth">Not used by this method.</param>
        /// <param name="offset">Index of the opening quote inside <paramref name="buf"/>.</param>
        /// <returns>
        /// <c>true</c> once the closing quote has been reached; <c>false</c> if an escape character
        /// has no entry in <c>escape_map</c> or <c>handle_unicode_codepoint</c> reports failure.
        /// </returns>
        internal static bool parse_string(uint8_t *buf, size_t len, ParsedJson pj, uint32_t depth, uint32_t offset)
        {
            // Tape entry: tag '"' with the offset of this string inside the string buffer.
            pj.WriteTape((ulong)(pj.current_string_buf_loc - pj.string_buf), (char1)'"');

            // Skip the opening quote on the input side and leave room for the
            // length prefix on the output side.
            uint8_t *input        = &buf[offset + 1];
            uint8_t *output       = pj.current_string_buf_loc + sizeof(uint32_t);
            uint8_t *payloadStart = output;

            for (;;)
            {
                // NOTE(review): the helper presumably also copies the scanned chunk from input
                // to output (nothing else here copies plain characters) - confirm in its definition.
                parse_string_helper chunk = find_bs_bits_and_quote_bits(input, output);

                // A quote bit lies below the lowest backslash bit (or there is no backslash
                // at all): the string ends inside this chunk.
                if (((chunk.bs_bits - 1) & chunk.quote_bits) != 0)
                {
                    uint32_t quotePos   = (uint32_t)trailingzeroes(chunk.quote_bits);
                    uint8_t *terminator = output + quotePos;

                    // Overwrite the position of the closing quote with a 0 terminator.
                    *terminator = 0;

                    // The length prefix counts the decoded bytes only, not the terminator.
                    // No overflow check: see the original note that documents >= 4GB are refused.
                    uint32_t payloadLength = (uint32_t)((output - payloadStart) + quotePos);
                    memcpy(pj.current_string_buf_loc, &payloadLength, sizeof(uint32_t));

                    // The next string starts right after the terminator.
                    pj.current_string_buf_loc = terminator + 1;
                    return(true);
                }

                // Neither a quote nor a backslash came first, i.e. the chunk contains neither:
                // step over the whole chunk (32 bytes with AVX2, 16 otherwise).
                if (((chunk.quote_bits - 1) & chunk.bs_bits) == 0)
                {
                    int stride = Avx2.IsSupported ? 32 : 16;
                    input  += stride;
                    output += stride;
                    continue;
                }

                // A backslash came first; look at the character that follows it.
                uint32_t slashPos = (uint32_t)trailingzeroes(chunk.bs_bits);
                uint8_t  escaped  = input[slashPos + 1];

                if (escaped == 'u')
                {
                    // Position both cursors on the backslash; the code point handler
                    // advances them further itself.
                    input  += slashPos;
                    output += slashPos;
                    if (!handle_unicode_codepoint(&input, &output))
                    {
                        return(false);
                    }

                    continue;
                }

                // Simple escape: slashPos + 2 input bytes become slashPos + 1 output bytes.
                // This may touch bytes beyond what has been scanned so far, which the
                // original author considered acceptable.
                uint8_t replacement = escape_map[escaped]; // TODO: https://github.com/dotnet/coreclr/issues/25894
                if (replacement == 0u)
                {
                    return(false); // unknown escape character
                }

                output[slashPos] = replacement;
                input  += slashPos + 2;
                output += slashPos + 1;
            }
        }