C# (CSharp) SimdJsonSharp ParsedJson Examples

Programming Language: C# (CSharp)

Namespace/Package Name: SimdJsonSharp

Class/Type: ParsedJson

Examples at hotexamples.com: 23

C# (CSharp) SimdJsonSharp ParsedJson - 23 examples found. These are the top rated real world C# (CSharp) examples of SimdJsonSharp.ParsedJson extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

WriteTape(4)

WriteTapeDouble(4)

WriteTapeInt64(4)

AllocateCapacity(2)

AnnotatePreviousLoc(2)

Init(2)

Dispose(1)

allocateCapacity(1)

write_tape(1)

Example #1

Show file

File: ParsedJsonIterator.cs Project: zsybupt/SimdJsonSharp

        /// <summary>
        /// Creates an iterator over <paramref name="parsedJson"/> and positions it on the
        /// first tape word after the root word, when the root's recorded length allows it.
        /// </summary>
        /// <param name="parsedJson">Parsed document; its <c>tape</c> and <c>depthcapacity</c> are read here.</param>
        /// <exception cref="InvalidOperationException">The first tape word is not tagged <c>'r'</c>.</exception>
        public ParsedJsonIterator(ParsedJson parsedJson)
        {
            pj          = parsedJson;
            depth       = 0;
            location    = 0;
            tape_length = 0;

            // The tag of a tape word is stored in its top byte.
            current_val  = pj.tape[location];
            current_type = (uint8_t)(current_val >> 56);

            // Validate the root before allocating, so a rejected tape never leaves a
            // depthindex buffer behind.
            // NOTE(review): assumes allocate<T>() hands out memory that must be released
            // explicitly (the iterator's Dispose calls delete on it) - confirm.
            if (current_type != 'r')
            {
                throw new InvalidOperationException("Json is invalid");
            }

            depthindex = allocate <scopeindex_t>(pj.depthcapacity);
            depthindex[0].start_of_scope = location;
            depthindex[0].scope_type     = current_type;
            location++;

            // The low bits of the root word (selected by JSONVALUEMASK) bound the iteration.
            tape_length = current_val & JSONVALUEMASK;
            if (location < tape_length)
            {
                // Step onto the first real element and open scope 1 for it.
                current_val  = pj.tape[location];
                current_type = (uint8_t)(current_val >> 56);
                depth++;
                depthindex[depth].start_of_scope = location;
                depthindex[depth].scope_type     = current_type;
            }
        }

Example #2

Show file

File: SimdJson.cs Project: billwillman/SimdJsonSharp

        /// <summary>
        /// Runs stage 1 (<c>find_structural_bits</c>) and, if it succeeds, stage 2
        /// (<c>unified_machine</c>) over <paramref name="jsonData"/>, filling <paramref name="pj"/>.
        /// </summary>
        /// <param name="jsonData">Pointer to the JSON bytes.</param>
        /// <param name="length">Number of bytes at <paramref name="jsonData"/>.</param>
        /// <param name="pj">Destination; its <c>bytecapacity</c> must be at least <paramref name="length"/>.</param>
        /// <param name="reallocIfNeeded">
        /// When true, the input is copied into a padded buffer if the page-boundary test below fires.
        /// </param>
        /// <returns>
        /// <c>Capacity</c> when <paramref name="pj"/> is too small, <c>Memalloc</c> when the padded
        /// copy cannot be allocated, otherwise the stage 2 result (or <c>Success</c>, see note below).
        /// </returns>
        internal static JsonParseError JsonParse(uint8_t *jsonData, size_t length, ParsedJson pj, bool reallocIfNeeded = true)
        {
            if (pj.bytecapacity < length)
            {
                return JsonParseError.Capacity;
            }

            bool reallocated = false;

            // realloc is needed if the end of the memory crosses a page
            if (reallocIfNeeded && (size_t)(jsonData + length - 1) % (size_t)pagesize < SIMDJSON_PADDING)
            {
                uint8_t *tmpbuf = jsonData;
                jsonData = (uint8_t *)allocate_padded_buffer(length);
                if (jsonData == null)
                {
                    return JsonParseError.Memalloc;
                }
                memcpy(jsonData, tmpbuf, length);
                reallocated = true;
            }

            try
            {
                var result = JsonParseError.Success;

                // NOTE(review): when stage 1 returns false, result stays Success. That looks
                // unintended, but no other error member is visible here - confirm and fix
                // against the JsonParseError declaration.
                if (stage1_find_marks.find_structural_bits(jsonData, length, pj))
                {
                    result = stage2_build_tape.unified_machine(jsonData, length, pj);
                }

                return result;
            }
            finally
            {
                // Release the padded copy on every exit path, including exceptions.
                if (reallocated)
                {
                    aligned_free(jsonData);
                }
            }
        }

Example #3

Show file

File: stringparsing.cs Project: zy850580380/SimdJsonSharp

        /// <summary>
        /// Dispatches string parsing to the AVX2 implementation; on hardware without AVX2
        /// it calls <c>ThrowHelper.ThrowPNSE()</c>.
        /// </summary>
        /// <returns>The result of <c>parse_string_avx2</c>; <c>false</c> only if the throw helper returns.</returns>
        public static bool parse_string(uint8_t* buf, size_t len, ParsedJson pj, uint32_t depth, uint32_t offset)
        {
            if (!Avx2.IsSupported)
            {
                // An SSE4.1 fallback (parse_string_sse41) is currently not wired in here.
                ThrowHelper.ThrowPNSE();
                return false;
            }

            return parse_string_avx2(buf, len, pj, depth, offset);
        }

Example #4

Show file

        /// <summary>
        /// Releases the scope buffer via <c>delete</c> and disposes the held <c>pj</c>,
        /// clearing both fields so a second call does nothing.
        /// </summary>
        public void Dispose()
        {
            // Scope buffer first, then the parsed document.
            if (depthindex != null)
            {
                delete(depthindex);
                depthindex = null;
            }

            pj?.Dispose();
            pj = null;
        }

Example #5

Show file

        /// <summary>
        /// Allocates a <c>ParsedJson</c> with capacity for <paramref name="len"/> bytes and runs
        /// <c>json_parse</c> on the buffer. The value returned by <c>json_parse</c> is not inspected.
        /// </summary>
        /// <exception cref="InvalidOperationException"><c>allocateCapacity</c> reported failure.</exception>
        public static ParsedJson build_parsed_json(uint8_t *buf, size_t len, bool reallocifneeded = true)
        {
            ParsedJson pj = new ParsedJson();

            if (!pj.allocateCapacity(len))
            {
                throw new InvalidOperationException("failure during memory allocation");
            }

            json_parse(buf, len, &pj, reallocifneeded);
            return pj;
        }

Example #6

Show file

File: SimdJson.cs Project: billwillman/SimdJsonSharp

        /// <summary>
        /// Allocates a <c>ParsedJson</c> sized for <paramref name="length"/> bytes and parses
        /// <paramref name="jsonData"/> into it. The value returned by <c>JsonParse</c> is not inspected.
        /// </summary>
        /// <exception cref="InvalidOperationException"><c>AllocateCapacity</c> reported failure.</exception>
        public static ParsedJson ParseJson(byte *jsonData, int length, bool reallocIfNeeded = true)
        {
            var   parsed    = new ParsedJson();
            ulong byteCount = (ulong)length;

            if (!parsed.AllocateCapacity(byteCount))
            {
                throw new InvalidOperationException("failure during memory allocation");
            }

            JsonParse(jsonData, byteCount, parsed, reallocIfNeeded);
            return parsed;
        }

Example #7

Show file

File: SimdJson.cs Project: zsybupt/SimdJsonSharp

        /// <summary>
        /// Allocates a <c>ParsedJson</c> sized for <paramref name="length"/> bytes and parses
        /// <paramref name="jsonData"/> into it. If the allocation fails, the returned object is
        /// marked invalid with <c>JsonParseError.CAPACITY</c> instead of throwing.
        /// </summary>
        public static ParsedJson ParseJson(byte *jsonData, ulong length, bool reallocIfNeeded = true)
        {
            var parsed = new ParsedJson();

            if (parsed.AllocateCapacity(length))
            {
                JsonParse(jsonData, length, parsed, reallocIfNeeded);
                return parsed;
            }

            // Allocation failed: report it through the object's own status fields.
            parsed.isvalid   = false;
            parsed.ErrorCode = JsonParseError.CAPACITY;
            return parsed;
        }

Example #8

Show file

File: SimdJson.cs Project: zy850580380/SimdJsonSharp

        /// <summary>
        /// Runs stage 1 (<c>find_structural_bits</c>) and then stage 2 (<c>unified_machine</c>)
        /// over <paramref name="jsonData"/>, filling <paramref name="pj"/>.
        /// </summary>
        /// <param name="jsonData">Pointer to the JSON bytes.</param>
        /// <param name="length">Number of bytes at <paramref name="jsonData"/>.</param>
        /// <param name="pj">Destination; its <c>bytecapacity</c> must be at least <paramref name="length"/>.</param>
        /// <param name="reallocIfNeeded">
        /// When true, the input is copied into a padded buffer if the page-boundary test below fires.
        /// </param>
        /// <returns>
        /// <c>true</c> when both stages succeed; <c>false</c> when either stage fails or the
        /// padded copy cannot be allocated.
        /// </returns>
        /// <exception cref="InvalidOperationException"><paramref name="pj"/> is too small for <paramref name="length"/>.</exception>
        internal static bool JsonParse(uint8_t *jsonData, size_t length, ParsedJson pj, bool reallocIfNeeded = true)
        {
            if (pj.bytecapacity < length)
            {
                throw new InvalidOperationException("Your ParsedJson cannot support documents that big: " + length);
            }

            bool reallocated = false;

            // realloc is needed if the end of the memory crosses a page
            if (reallocIfNeeded && (size_t)(jsonData + length - 1) % (size_t)pagesize < SIMDJSON_PADDING)
            {
                uint8_t *tmpbuf = jsonData;
                jsonData = (uint8_t *)allocate_padded_buffer(length);
                if (jsonData == null)
                {
                    return false;
                }
                memcpy(jsonData, tmpbuf, length);
                reallocated = true;
            }

            try
            {
                // Stage 2 only runs when stage 1 succeeded (short-circuit).
                return stage1_find_marks.find_structural_bits(jsonData, length, pj)
                       && stage2_build_tape.unified_machine(jsonData, length, pj);
            }
            finally
            {
                // Single release point for the padded copy, also reached when a stage throws.
                if (reallocated)
                {
                    free(jsonData);
                }
            }
        }

Example #9

Show file

File: stringparsing.Sse2.cs Project: zy850580380/SimdJsonSharp

        /// <summary>
        /// Copies the JSON string whose opening quote sits at <c>buf[offset]</c> into
        /// <c>pj</c>'s string buffer, one <c>Vector128&lt;byte&gt;</c> block at a time,
        /// resolving backslash escapes on the way, and records it on the tape with tag <c>'"'</c>.
        /// </summary>
        /// <param name="buf">Input bytes.</param>
        /// <param name="len">Not referenced in this body.</param>
        /// <param name="pj">Receives the decoded bytes (at <c>current_string_buf_loc</c>) and the tape entry.</param>
        /// <param name="depth">Not referenced in this body.</param>
        /// <param name="offset">Index of the opening quote inside <paramref name="buf"/>.</param>
        /// <returns>
        /// <c>true</c> once the closing quote has been reached; <c>false</c> on an invalid escape
        /// or bad \u sequence (and, when CHECKUNESCAPED is defined, on an unescaped byte &lt;= 0x1F).
        /// </returns>
        /// <remarks>
        /// NOTE(review): despite the name, only <c>Sse2</c> intrinsics are used in this body.
        /// Every iteration loads and stores a full vector regardless of where the string ends,
        /// so it presumably relies on padding after both the input and the string buffer - confirm.
        /// </remarks>
        public static bool parse_string_sse41(uint8_t *buf, size_t len, ParsedJson pj, uint32_t depth, uint32_t offset)
        {
#if SIMDJSON_SKIPSTRINGPARSING               // for performance analysis, it is sometimes useful to skip parsing
            pj.write_tape(0, '"');           // don't bother with the string parsing at all
            return(true);                    // always succeeds
#else
            uint8_t *src = &buf[offset + 1]; // we know that buf at offset is a "
            uint8_t *dst = pj.current_string_buf_loc;
#if JSON_TEST_STRINGS                        // for unit testing
            uint8_t *const start_of_string = dst;
#endif

            // Broadcast constants: backslash, double quote, and 0x1F (highest control character).
            Vector128 <byte> slashVec = Vector128.Create((byte)'\\');
            Vector128 <byte> quoteVec = Vector128.Create((byte)'"');
            Vector128 <byte> unitsep  = Vector128.Create((byte)0x1F);

            while (true)
            {
                // One bit per byte of the block: set where the byte is a backslash / a quote.
                Vector128 <byte> v          = Sse2.LoadVector128((src));
                uint32_t         bs_bits    = (uint32_t)Sse2.MoveMask(Sse2.CompareEqual(v, slashVec));
                uint32_t         quote_bits = (uint32_t)Sse2.MoveMask(Sse2.CompareEqual(v, quoteVec));
                // All Unicode characters may be placed within the
                // quotation marks, except for the characters that MUST be escaped:
                // quotation mark, reverse solidus, and the control characters (U+0000
                //through U+001F).
                // https://tools.ietf.org/html/rfc8259
#if CHECKUNESCAPED
                Vector128 <byte> unescaped_vec =
                    Sse2.CompareEqual(Sse2.Max(unitsep, v), unitsep); // could do it with saturated subtraction
#endif // CHECKUNESCAPED

                // Position of the first quote / first backslash in this block.
                uint32_t quote_dist = (uint32_t)trailingzeroes(quote_bits);
                uint32_t bs_dist    = (uint32_t)trailingzeroes(bs_bits);
                // store to dest unconditionally - we can overwrite the bits we don't like
                // later
                memcpy(dst, src, (size_t)Vector128 <byte> .Count);

                if (quote_dist < bs_dist)
                {
                    // we encountered quotes first. Move dst to point to quotes and exit
                    dst[quote_dist] = 0; // null terminate and get out

                    // The tape payload is the string's byte offset from the start of string_buf.
                    pj.WriteTape((size_t)pj.current_string_buf_loc - (size_t)pj.string_buf, (uint8_t)'"');

                    pj.current_string_buf_loc = dst + quote_dist + 1; // the +1 is due to the 0 value
#if CHECKUNESCAPED
                    // check that there is no unescaped char before the quote
                    uint32_t unescaped_bits = (uint32_t)Sse2.MoveMask(unescaped_vec);
                    bool     is_ok          = ((quote_bits - 1) & (~quote_bits) & unescaped_bits) == 0;
#if JSON_TEST_STRINGS // for unit testing
                    if (is_ok)
                    {
                        foundString(buf + offset, start_of_string, pj.current_string_buf_loc - 1);
                    }
                    else
                    {
                        foundBadString(buf + offset);
                    }
#endif // JSON_TEST_STRINGS
                    return(is_ok);
#else //CHECKUNESCAPED
#if JSON_TEST_STRINGS // for unit testing
                    foundString(buf + offset, start_of_string, pj.current_string_buf_loc - 1);
#endif // JSON_TEST_STRINGS
                    return(true);
#endif //CHECKUNESCAPED
                }
                else if (quote_dist > bs_dist)
                {
                    // The byte right after the backslash selects the kind of escape.
                    uint8_t escape_char = src[bs_dist + 1];
#if CHECKUNESCAPED
                    // we are going to need the unescaped_bits to check for unescaped chars
                    uint32_t unescaped_bits = (uint32_t)Sse2.MoveMask(unescaped_vec);
                    if (((bs_bits - 1) & (~bs_bits) & unescaped_bits) != 0)
                    {
#if JSON_TEST_STRINGS // for unit testing
                        foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
                        return(false);
                    }
#endif //CHECKUNESCAPED
                    // we encountered backslash first. Handle backslash
                    if (escape_char == 'u')
                    {
                        // move src/dst up to the start; they will be further adjusted
                        // within the unicode codepoint handling code.
                        src += bs_dist;
                        dst += bs_dist;
                        if (!handle_unicode_codepoint(&src, &dst))
                        {
#if JSON_TEST_STRINGS // for unit testing
                            foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
                            return(false);
                        }
                    }
                    else
                    {
                        // simple 1:1 conversion. Will eat bs_dist+2 characters in input and
                        // write bs_dist+1 characters to output
                        // note this may reach beyond the part of the buffer we've actually
                        // seen. I think this is ok
                        uint8_t escape_result = escape(escape_char);
                        if (escape_result == 0)
                        {
#if JSON_TEST_STRINGS // for unit testing
                            foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
                            return(false); // bogus escape value is an error
                        }

                        dst[bs_dist] = escape_result;
                        src         += bs_dist + 2;
                        dst         += bs_dist + 1;
                    }
                }
                else
                {
                    // they are the same. Since they can't co-occur, it means we encountered
                    // neither.
                    src += Vector128 <byte> .Count;
                    dst += Vector128 <byte> .Count;
#if CHECKUNESCAPED
                    // check for unescaped chars
                    if (Sse2.MoveMask(unescaped_vec) != 0)
                    {
#if JSON_TEST_STRINGS // for unit testing
                        foundBadString(buf + offset);
#endif // JSON_TEST_STRINGS
                        return(false);
                    }
#endif // CHECKUNESCAPED
                }
            }

            // can't be reached
            return(true);
#endif // SIMDJSON_SKIPSTRINGPARSING
        }

Example #10

Show file

File: numberparsing.cs Project: zsybupt/SimdJsonSharp

        // Slow path used by parse_number for integers whose fast accumulation may
        // have wrapped around. Re-reads the digits with explicit overflow checks,
        // writes the value to the tape as a signed 64-bit integer and reports
        // whether the character after the digits is structural or whitespace.
        //
        // Not meant to be called directly: it omits validation that parse_number
        // performs first (for example, that the first character is a digit).
        //
        // Expected to run very rarely.
        //
        static bool parse_large_integer(uint8_t *buf, ParsedJson pj, uint32_t offset, bool found_minus)
        {
            char1 *cursor     = (char1 *)(buf + offset);
            bool   isNegative = found_minus;

            if (isNegative)
            {
                ++cursor; // step over the sign
            }

            uint64_t magnitude;

            if (*cursor == (uchar1)'0')
            {
                // a leading zero stands alone; anything after it is judged by the final check
                ++cursor;
                magnitude = 0;
            }
            else
            {
                magnitude = (uchar1)(*cursor - (uchar1)'0');
                cursor++;
                while (is_integer(*cursor))
                {
                    uchar1 digit = (uchar1)(*cursor - (uchar1)'0');
                    // magnitude = magnitude * 10 + digit, bailing out as soon as either step overflows
                    if (mul_overflow(magnitude, 10, &magnitude) || add_overflow(magnitude, digit, &magnitude))
                    {
                        return false;
                    }
                    ++cursor;
                }
            }

            // Largest magnitude that still fits: 2^63 for negatives, 2^63 - 1 otherwise.
            uint64_t limit = isNegative ? 0x8000000000000000UL : 0x7FFFFFFFFFFFFFFFUL;
            if (magnitude > limit)
            {
                return false;
            }

            int64_t signed_answer = isNegative ? -(int64_t)magnitude : (int64_t)magnitude;

            pj.WriteTapeInt64(signed_answer);
            return is_structural_or_whitespace((uchar1)(*cursor)) != 0;
        }

Example #11

Show file

File: numberparsing.cs Project: randyammar/SimdJsonSharp

        // Slow path used by parse_number for floating-point literals where the
        // integer-based fast path could have overflowed: the value is accumulated
        // in a double from the very first digit.
        //
        // Not meant to be called directly: it omits validation that parse_number
        // performs first.
        //
        // Expected to run very rarely; a redesign could remove it altogether.
        //
        private static bool parse_float(uint8_t *buf, ParsedJson pj, uint32_t offset, bool found_minus)
        {
            bytechar *cursor   = (bytechar *)(buf + offset);
            bool      negative = found_minus;

            if (negative)
            {
                ++cursor; // step over the sign
            }

            double value;

            // ---- integer part ----
            if (*cursor == '0')
            {
                // a leading zero stands alone
                ++cursor;
                value = 0;
            }
            else
            {
                value = (unsigned_bytechar)(*cursor - (bytechar)'0');
                cursor++;
                while (is_integer(*cursor))
                {
                    unsigned_bytechar digit = (unsigned_bytechar)(*cursor - '0');
                    value = 10 * value + digit;
                    ++cursor;
                }
            }

            // ---- fractional part: at least one digit is mandatory after '.' ----
            if ('.' == *cursor)
            {
                ++cursor;
                if (!is_integer(*cursor))
                {
#if JSON_TEST_NUMBERS // for unit testing
                    foundInvalidNumber(buf + offset);
#endif
                    return false;
                }

                double fractionalweight = 1;
                do
                {
                    unsigned_bytechar digit = (unsigned_bytechar)(*cursor - '0');
                    ++cursor;
                    fractionalweight *= 0.1;
                    value = value + digit * fractionalweight;
                }
                while (is_integer(*cursor));
            }

            // ---- exponent: optional sign, then one to four digits ----
            if (('e' == *cursor) || ('E' == *cursor))
            {
                ++cursor;
                bool negexp = ('-' == *cursor);
                if (negexp || ('+' == *cursor))
                {
                    ++cursor;
                }

                if (!is_integer(*cursor))
                {
#if JSON_TEST_NUMBERS // for unit testing
                    foundInvalidNumber(buf + offset);
#endif
                    return false;
                }

                int64_t expnumber = (unsigned_bytechar)(*cursor - '0'); // exponential part
                cursor++;

                // up to three further digits are accepted
                for (int extra = 0; extra < 3 && is_integer(*cursor); extra++)
                {
                    unsigned_bytechar digit = (unsigned_bytechar)(*cursor - '0');
                    expnumber = 10 * expnumber + digit;
                    ++cursor;
                }

                if (is_integer(*cursor))
                {
                    // a fifth exponent digit: we refuse to parse this
#if JSON_TEST_NUMBERS // for unit testing
                    foundInvalidNumber(buf + offset);
#endif
                    return false;
                }

                int exponent = (int)(negexp ? -expnumber : expnumber);
                if ((exponent > 308) || (exponent < -308))
                {
                    // outside the power_of_ten table: we refuse to parse this
#if JSON_TEST_NUMBERS // for unit testing
                    foundInvalidNumber(buf + offset);
#endif
                    return false;
                }

                value *= power_of_ten[308 + exponent];
            }

            if (is_not_structural_or_whitespace((byte)*cursor) != 0)
            {
                return false;
            }

            double d = negative ? -value : value;
            pj.WriteTapeDouble(d);
#if JSON_TEST_NUMBERS // for unit testing
            foundFloat(d, buf + offset);
#endif
            return is_structural_or_whitespace((byte)(*cursor)) != 0;
        }

Example #12

Show file

File: numberparsing.cs Project: zsybupt/SimdJsonSharp

        /// <summary>
        /// Parses the JSON number starting at <c>buf + offset</c> and writes it to the tape of
        /// <paramref name="pj"/>: through <c>WriteTapeDouble</c> when a '.' or exponent is present,
        /// through <c>WriteTapeInt64</c> otherwise.
        /// </summary>
        /// <param name="buf">Input bytes.</param>
        /// <param name="pj">Receives the parsed value.</param>
        /// <param name="offset">Index of the first character of the number (the '-' when <paramref name="found_minus"/> is set).</param>
        /// <param name="found_minus">True when the caller has seen a leading minus sign at <paramref name="offset"/>.</param>
        /// <returns>
        /// <c>false</c> for malformed input; otherwise whether the character following the number
        /// is structural or whitespace. When the digit count suggests the fast path may have
        /// overflowed, the result of <c>parse_float</c> or <c>parse_large_integer</c> is returned instead.
        /// </returns>
        internal static bool parse_number(uint8_t *buf, ParsedJson pj, uint32_t offset, bool found_minus)
        {
            char1 *p        = (char1 *)(buf + offset);
            bool   negative = false;

            if (found_minus)
            {
                ++p;
                negative = true;
                if (!is_integer(*p))
                {
                    // a negative sign must be followed by an integer
                    return(false);
                }
            }

            char1 *  startdigits = p;
            uint64_t i; // an unsigned int avoids signed overflows (which are bad)

            if (*p == (char1)'0')
            {
                // 0 cannot be followed by an integer
                ++p;
                if (is_not_structural_or_whitespace_or_exponent_or_decimal((uint8_t)(*p)))
                {
                    return(false);
                }
                i = 0;
            }
            else
            {
                if (!(is_integer(*p)))
                {
                    // must start with an integer
                    return(false);
                }

                uchar1 digit = (uchar1)(*p - (uchar1)'0');
                i = digit;
                p++;
                // the is_made_of_eight_digits_fast routine is unlikely to help here because
                // we rarely see large integer parts like 123456789
                while (is_integer(*p))
                {
                    digit = (uchar1)(*p - (uchar1)'0');
                    // a multiplication by 10 is cheaper than an arbitrary integer multiplication
                    i = 10 * i + digit; // might overflow, we will handle the overflow later
                    ++p;
                }
            }

            int64_t exponent = 0;
            bool    is_float = false;

            if ('.' == *p)
            {
                is_float = true; // At this point we know that we have a float
                // we continue with the fiction that we have an integer. If the
                // floating point number is representable as x * 10^z for some integer
                // z that fits in 53 bits, then we will be able to convert the
                // integer back into a float in a lossless manner.
                ++p;
                char1 *firstafterperiod = p;
                if (is_integer(*p))
                {
                    uchar1 digit = (uchar1)(*p - (uchar1)'0');
                    ++p;
                    i = i * 10 + digit; // might overflow + multiplication by 10 is likely cheaper than arbitrary mult.
                    // we will handle the overflow later
                }
                else
                {
                    // a '.' must be followed by at least one digit
                    return(false);
                }
#if SWAR_NUMBER_PARSING
                // this helps if we have lots of decimals!
                // this turns out to be frequent enough.
                if (is_made_of_eight_digits_fast(p))
                {
                    i  = i * 100000000 + parse_eight_digits_unrolled(p);
                    p += 8;
                }
#endif
                while (is_integer(*p))
                {
                    uchar1 digit = (uchar1)(*p - (uchar1)'0');
                    ++p;
                    i = i * 10 + digit; // in rare cases, this will overflow, but that's ok because parse_float takes over later.
                }

                // minus the number of fractional digits consumed (zero or negative)
                exponent = firstafterperiod - p;
            }

            int     digitcount = (int)(p - startdigits - 1); // used later to guard against overflows
            int64_t expnumber  = 0;                          // exponential part
            if (((char1)'e' == *p) || ((char1)'E' == *p))
            {
                is_float = true;
                ++p;
                bool negexp = false;
                if ('-' == *p)
                {
                    negexp = true;
                    ++p;
                }
                else if ('+' == *p)
                {
                    ++p;
                }

                if (!is_integer(*p))
                {
                    // the exponent needs at least one digit
                    return(false);
                }

                uchar1 digit = (uchar1)(*p - (uchar1)'0');
                expnumber = digit;
                p++;
                // at most two further exponent digits are accepted on this path
                if (is_integer(*p))
                {
                    digit     = (uchar1)(*p - (uchar1)'0');
                    expnumber = 10 * expnumber + digit;
                    ++p;
                }

                if (is_integer(*p))
                {
                    digit     = (uchar1)(*p - (uchar1)'0');
                    expnumber = 10 * expnumber + digit;
                    ++p;
                }

                if (is_integer(*p))
                {
                    // we refuse to parse this
                    return(false);
                }

                exponent += (negexp ? -expnumber : expnumber);
            }

            if (is_float)
            {
                // index into power_of_ten; a negative sum wraps to a huge value in the
                // unsigned cast and is caught by the range test further down
                uint64_t powerindex = (uint64_t)(308 + exponent);
                if (/*unlikely*/ ((digitcount >= 19)))
                {
                    // this is uncommon
                    // It is possible that the integer had an overflow.
                    // We have to handle the case where we have 0.0000somenumber.
                    char1 *start = startdigits;
                    while ((*start == (char1)'0') || (*start == (char1)'.'))
                    {
                        start++;
                    }

                    digitcount -= (int)(start - startdigits);
                    if (digitcount >= 19)
                    {
                        // Ok, chances are good that we had an overflow!
                        // this is almost never going to get called!!!
                        // we start anew, going slowly!!!
                        return(parse_float(buf, pj, offset,
                                           found_minus));
                    }
                }

                if (/*unlikely*/ ((powerindex > 2 * 308)))
                {
                    // this is uncommon!!!
                    // this is almost never going to get called!!!
                    // we start anew, going slowly!!!
                    return(parse_float(buf, pj, offset,
                                       found_minus));
                }

                double factor = power_of_ten[powerindex];
                factor = negative ? -factor : factor;
                double d = i * factor;
                pj.WriteTapeDouble(d);
            }
            else
            {
                if (/*unlikely*/ (digitcount >= 18))
                {
                    // this is uncommon!!!
                    // there is a good chance that we had an overflow, so we
                    // need to recover: we parse the whole thing again.
                    return(parse_large_integer(buf, pj, offset,
                                               found_minus));
                }

                // negate in unsigned arithmetic, then reinterpret as signed
                i = negative ? 0 - i : i;
                pj.WriteTapeInt64((int64_t)i);
            }

            return(is_structural_or_whitespace((uint8_t)(*p)) != 0);
        }

Example #13

Show file

        internal static JsonParseError find_structural_bits(uint8_t *buf, size_t len, ParsedJson pj)
        {
            if (len > pj.bytecapacity)
            {
                return(JsonParseError.CAPACITY);
            }

            uint32_t *base_ptr = pj.structural_indexes;
            uint32_t  @base    = 0;

#if SIMDJSON_UTF8VALIDATE
            utf8_checking_state state;
#endif

            // we have padded the input out to 64 byte multiple with the remainder being
            // zeros

            // persistent state across loop
            // does the last iteration end with an odd-length sequence of backslashes?
            // either 0 or 1, but a 64-bit value
            uint64_t prev_iter_ends_odd_backslash = 0UL;
            // does the previous iteration end inside a double-quote pair?
            uint64_t prev_iter_inside_quote = 0UL; // either all zeros or all ones
            // does the previous iteration end on something that is a predecessor of a
            // pseudo-structural character - i.e. whitespace or a structural character
            // effectively the very first char is considered to follow "whitespace" for
            // the
            // purposes of pseudo-structural character detection so we initialize to 1
            uint64_t prev_iter_ends_pseudo_pred = 1UL;

            // structurals are persistent state across loop as we flatten them on the
            // subsequent iteration into our array pointed to be base_ptr.
            // This is harmless on the first iteration as structurals==0
            // and is done for performance reasons; we can hide some of the latency of the
            // expensive carryless multiply in the previous step with this work
            uint64_t structurals = 0;

            size_t   lenminus64 = len < 64 ? 0 : len - 64;
            size_t   idx        = 0;
            uint64_t error_mask = 0; // for unescaped characters within strings (ASCII code points < 0x20)

            for (; idx < lenminus64; idx += 64)
            {
                //__builtin_prefetch(buf + idx + 128);
                simd_input @in = fill_input(buf + idx);
#if SIMDJSON_UTF8VALIDATE
                check_utf8(in, state);
#endif
                // detect odd sequences of backslashes
                uint64_t odd_ends = find_odd_backslash_sequences(
                    @in, ref prev_iter_ends_odd_backslash);

                // detect insides of quote pairs ("quote_mask") and also our quote_bits
                // themselves
                uint64_t quote_bits = 0;
                uint64_t quote_mask = find_quote_mask_and_bits(
                    @in, odd_ends, ref prev_iter_inside_quote, ref quote_bits, ref error_mask);

                // take the previous iterations structural bits, not our current iteration,
                // and flatten
                flatten_bits(base_ptr, ref @base, (uint32_t)idx, structurals);

                uint64_t whitespace = 0;
                find_whitespace_and_structurals(@in, ref whitespace, ref structurals);

                // fixup structurals to reflect quotes and add pseudo-structural characters
                structurals = finalize_structurals(structurals, whitespace, quote_mask,
                                                   quote_bits, ref prev_iter_ends_pseudo_pred);
            }

            ////////////////
            // we use a giant copy-paste which is ugly.
            // but otherwise the string needs to be properly padded or else we
            // risk invalidating the UTF-8 checks.
            ////////////
            if (idx < len)
            {
                uint8_t *tmpbuf = stackalloc uint8_t[64];
                memset(tmpbuf, 0x20, 64);
                memcpy(tmpbuf, buf + idx, len - idx);
                simd_input @in = fill_input(tmpbuf);
#if SIMDJSON_UTF8VALIDATE
                check_utf8 <T>(in, state);
#endif
                // detect odd sequences of backslashes
                uint64_t odd_ends = find_odd_backslash_sequences(
                    @in, ref prev_iter_ends_odd_backslash);

                // detect insides of quote pairs ("quote_mask") and also our quote_bits
                // themselves
                uint64_t quote_bits = 0;
                uint64_t quote_mask = find_quote_mask_and_bits(
                    @in, odd_ends, ref prev_iter_inside_quote, ref quote_bits, ref error_mask);

                // take the previous iterations structural bits, not our current iteration,
                // and flatten
                flatten_bits(base_ptr, ref @base, (uint)idx, structurals);

                uint64_t whitespace = 0;
                find_whitespace_and_structurals(@in, ref whitespace, ref structurals);

                // fixup structurals to reflect quotes and add pseudo-strucural characters
                structurals = finalize_structurals(structurals, whitespace, quote_mask,
                                                   quote_bits, ref prev_iter_ends_pseudo_pred);
                idx += 64;
            }

            // is last string quote closed?
            if (prev_iter_inside_quote != 0)
            {
                return(JsonParseError.UNCLOSED_STRING);
            }

            // finally, flatten out the remaining structurals from the last iteration
            flatten_bits(base_ptr, ref @base, (uint)idx, structurals);

            pj.n_structural_indexes = @base;
            // a valid JSON file cannot have zero structural indexes - we should have
            // found something
            if (pj.n_structural_indexes == 0u)
            {
                return(JsonParseError.EMPTY);
            }

            if (base_ptr[pj.n_structural_indexes - 1] > len)
            {
                return(JsonParseError.UNEXPECTED_ERROR);
            }

            if (len != base_ptr[pj.n_structural_indexes - 1])
            {
                // the string might not be NULL terminated, but we add a virtual NULL ending
                // character.
                base_ptr[pj.n_structural_indexes++] = (uint)len;
            }

            // make it safe to dereference one beyond this array
            base_ptr[pj.n_structural_indexes] = 0;
            if (error_mask != 0)
            {
                return(JsonParseError.UNESCAPED_CHARS);
            }
#if SIMDJSON_UTF8VALIDATE
            return(check_utf8_errors(state));
#else
            return(JsonParseError.SUCCESS);
#endif
        }

Example #14

Show file

 // Overload accepting a char1* buffer: the pointer is reinterpreted as
 // uint8_t* and the call is forwarded to the uint8_t* overload unchanged.
 internal static JsonParseError find_structural_bits(char1 *buf, size_t len, ParsedJson pj)
 {
     uint8_t *bytes = (uint8_t *)(buf);

     return(find_structural_bits(bytes, len, pj));
 }

Example #15

Show file

File: numberparsing.cs Project: zsybupt/SimdJsonSharp

        // Slow-path float parser. Called by parse_number when we know that the
        // output is a float but where there might be some integer overflow. The
        // trick here is to accumulate the value in a double from the start.
        //
        // Do not call this function directly as it skips some of the checks from
        // parse_number (for instance, the first character after the optional
        // sign is not verified to be a digit).
        //
        //   buf         - start of the JSON input
        //   pj          - receives the parsed value through WriteTapeDouble
        //   offset      - index in buf of the first character of the number
        //                 (the '-' sign itself when found_minus is true)
        //   found_minus - true when the number starts with a '-' sign
        //
        // Returns false (writing nothing to the tape) when:
        //   - a decimal point is not followed by a digit,
        //   - the exponent has no digit or more than four digits,
        //   - the exponent is positive and larger than 308,
        //   - the number is not followed by a structural or whitespace character,
        //   - the resulting value is not a finite double.
        //
        // This function will almost never be called!!!
        //
        // Note: a redesign could avoid this function entirely.
        //
        static bool parse_float(uint8_t *buf, ParsedJson pj, uint32_t offset, bool found_minus)
        {
            char1 *p        = (char1 *)(buf + offset);
            bool   negative = false;

            if (found_minus)
            {
                ++p;
                negative = true;
            }

            // magnitude of the number; the sign is applied at the very end
            double i;

            if (*p == '0')
            {
                // 0 cannot be followed by an integer
                ++p;
                i = 0;
            }
            else
            {
                uchar1 digit = (uchar1)(*p - (uchar1)'0');
                i = digit;
                p++;
                while (is_integer(*p))
                {
                    digit = (uchar1)(*p - (uchar1)'0');
                    i     = 10 * i + digit;
                    ++p;
                }
            }

            if ('.' == *p)
            {
                ++p;
                if (!is_integer(*p))
                {
                    // a decimal point must be followed by at least one digit
                    return(false);
                }

                // power_of_ten is indexed as 308 + exponent (see the exponent
                // handling below), so the first fractional digit uses index 307.
                // Digits whose weight would fall below index 0 contribute nothing.
                int fractionalweight = 308;
                do
                {
                    uchar1 digit = (uchar1)(*p - (uchar1)'0');
                    ++p;
                    fractionalweight--;
                    i = i + digit * (fractionalweight >= 0 ? power_of_ten[fractionalweight] : 0);
                }
                while (is_integer(*p));
            }

            if (('e' == *p) || ('E' == *p))
            {
                ++p;
                bool negexp = false;
                if ('-' == *p)
                {
                    negexp = true;
                    ++p;
                }
                else if ('+' == *p)
                {
                    ++p;
                }

                if (!is_integer(*p))
                {
                    // the exponent needs at least one digit
                    return(false);
                }

                uchar1  digit     = (uchar1)(*p - (uchar1)'0');
                int64_t expnumber = digit; // exponential part
                p++;

                // accept up to three more exponent digits (four in total)
                for (int extra = 0; (extra < 3) && is_integer(*p); extra++)
                {
                    digit     = (uchar1)(*p - (uchar1)'0');
                    expnumber = 10 * expnumber + digit;
                    ++p;
                }

                if (is_integer(*p))
                {
                    // a fifth exponent digit: we refuse to parse this
                    return(false);
                }

                if (/*unlikely*/ (expnumber > 308))
                {
                    // C# needs unlikely!
                    // this path is unlikely
                    if (negexp)
                    {
                        // We either have zero or a subnormal.
                        // We expect this to be uncommon so we go through a slow path.
                        i = subnormal_power10(i, (int)-expnumber);
                    }
                    else
                    {
                        // We know for sure that we have a number that is too large,
                        // we refuse to parse this
                        return(false);
                    }
                }
                else
                {
                    int exponent = (int)(negexp ? -expnumber : expnumber);
                    // we have that expnumber is [0,308] so that
                    // exponent is [-308,308] so that
                    // 308 + exponent is in [0, 2 * 308]
                    i *= power_of_ten[308 + exponent];
                }
            }

            if (is_not_structural_or_whitespace((uint8_t)(*p)) != 0)
            {
                return(false);
            }

            // A long mantissa combined with a large (but <= 308) exponent can still
            // overflow the double. JSON has no way to represent infinity or NaN,
            // so refuse the number instead of writing a non-finite value to the tape.
            if (double.IsInfinity(i) || double.IsNaN(i))
            {
                return(false);
            }

            double d = negative ? -i : i;

            pj.WriteTapeDouble(d);
            return(is_structural_or_whitespace((uint8_t)(*p)) != 0);
        }

Example #16

Show file

File: numberparsing.cs Project: randyammar/SimdJsonSharp

        // Slow-path integer parser. Called by parse_number when we know that the
        // output is an integer but where there might be some integer overflow.
        // We want to catch overflows!
        //
        // Do not call this function directly as it skips some of the checks from
        // parse_number (for instance, the first character after the optional
        // sign is not verified to be a digit).
        //
        //   buf         - start of the JSON input
        //   pj          - receives the parsed value through WriteTapeInt64
        //   offset      - index in buf of the first character of the number
        //                 (the '-' sign itself when found_minus is true)
        //   found_minus - true when the number starts with a '-' sign
        //
        // Returns false (writing nothing to the tape) when the value does not fit
        // in a signed 64-bit integer. Otherwise writes the value and returns
        // whether the character following the digits is structural or whitespace.
        //
        // This function will almost never be called!!!
        //
        static bool parse_large_integer(uint8_t *buf, ParsedJson pj, uint32_t offset, bool found_minus)
        {
            bytechar *p        = (bytechar *)(buf + offset);
            bool      negative = false;

            if (found_minus)
            {
                ++p;
                negative = true;
            }

            // unsigned magnitude; the sign is applied once the range is validated
            uint64_t i;

            if (*p == '0')
            {
                // 0 cannot be followed by an integer
                ++p;
                i = 0;
            }
            else
            {
                unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
                i = digit;
                p++;
                // the is_made_of_eight_digits_fast routine is unlikely to help here because
                // we rarely see large integer parts like 123456789
                while (is_integer(*p))
                {
                    digit = (unsigned_bytechar)(*p - '0');
                    // i = 10 * i + digit, bailing out as soon as either step overflows
                    // (the addition is only attempted when the multiplication succeeded)
                    if (mul_overflow(i, 10, &i) || add_overflow(i, digit, &i))
                    {
#if JSON_TEST_NUMBERS // for unit testing
                        foundInvalidNumber(buf + offset);
#endif
                        return(false); // overflow
                    }

                    ++p;
                }
            }

            // The largest magnitude that fits in an int64: 2^63 for negative values
            // (long.MinValue), 2^63 - 1 for non-negative ones (long.MaxValue).
            uint64_t limit = negative ? 0x8000000000000000UL : 0x7FFFFFFFFFFFFFFFUL;
            if (i > limit)
            {
                // overflows!
#if JSON_TEST_NUMBERS // for unit testing
                foundInvalidNumber(buf + offset);
#endif
                return(false); // overflow
            }

            // For i == 2^63 (input -9223372036854775808) both the cast and the
            // negation wrap around to long.MinValue, which is the correct answer.
            // Make the wrap-around explicit so that a build with overflow checking
            // enabled does not throw OverflowException on this valid input.
            int64_t signed_answer = unchecked (negative ? -(int64_t)i : (int64_t)i);
            pj.WriteTapeInt64(signed_answer);
#if JSON_TEST_NUMBERS // for unit testing
            foundInteger(signed_answer, buf + offset);
#endif
            return(is_structural_or_whitespace((byte)(*p)) != 0);
        }

Example #17

Show file

File: numberparsing.cs Project: randyammar/SimdJsonSharp

        // Parses the JSON number located at buf + offset and writes it to the tape
        // of pj, either as an int64 (WriteTapeInt64) or as a double
        // (WriteTapeDouble).
        //
        //   buf         - start of the JSON input
        //   pj          - receives the parsed value
        //   offset      - index in buf of the first character of the number
        //                 (the '-' sign itself when found_minus is true)
        //   found_minus - true when the number starts with a '-' sign
        //
        // The digits of the integer and fractional parts are accumulated in a
        // single int64 together with a decimal exponent. Numbers with too many
        // digits for that to be safe are handed over to parse_float or
        // parse_large_integer, which start anew from buf + offset.
        //
        // Returns false when the number is malformed or when its decimal exponent
        // is outside [-308, 308]. Otherwise returns whether the character
        // following the number is structural or whitespace.
        public static bool parse_number(uint8_t *buf, ParsedJson pj, uint32_t offset, bool found_minus)
        {
            bytechar *p        = (bytechar *)(buf + offset);
            bool      negative = false;

            if (found_minus)
            {
                ++p;
                negative = true;
                if (!is_integer(*p))
                {
                    // a negative sign must be followed by an integer

                    return(false);
                }
            }

            bytechar *startdigits = p;

            int64_t i;

            if (*p == '0')
            {
                // 0 cannot be followed by an integer
                ++p;
                if (is_not_structural_or_whitespace_or_exponent_or_decimal((uint8_t)(*p)))
                {
#if JSON_TEST_NUMBERS // for unit testing
                    foundInvalidNumber(buf + offset);
#endif
                    return(false);
                }

                i = 0;
            }
            else
            {
                if (!(is_integer(*p)))
                {
                    // must start with an integer
#if JSON_TEST_NUMBERS // for unit testing
                    foundInvalidNumber(buf + offset);
#endif
                    return(false);
                }

                unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
                i = digit;
                p++;
                // the is_made_of_eight_digits_fast routine is unlikely to help here because
                // we rarely see large integer parts like 123456789
                while (is_integer(*p))
                {
                    digit = (unsigned_bytechar)(*p - '0');
                    i     = 10 * i + digit; // might overflow; such inputs are re-parsed by the slow paths below
                    ++p;
                }
            }

            // decimal exponent to apply to i: minus the number of fractional
            // digits, plus the explicit exponent if there is one
            int64_t exponent = 0;

            if ('.' == *p)
            {
                ++p;
                bytechar *firstafterperiod = p;
                if (is_integer(*p))
                {
                    unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
                    ++p;
                    i = i * 10 + digit;
                }
                else
                {
                    // a decimal point must be followed by at least one digit
#if JSON_TEST_NUMBERS // for unit testing
                    foundInvalidNumber(buf + offset);
#endif
                    return(false);
                }
#if SWAR_NUMBER_PARSING
                // this helps if we have lots of decimals!
                // this turns out to be frequent enough.
                if (is_made_of_eight_digits_fast(p))
                {
                    i  = i * 100000000 + parse_eight_digits_unrolled(p);
                    p += 8;
                    // exponent -= 8;
                }
#endif
                while (is_integer(*p))
                {
                    unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
                    ++p;
                    i = i * 10 + digit; // in rare cases, this will overflow, but that's ok because we have parse_float later.
                }

                exponent = firstafterperiod - p;
            }

            // number of characters consumed so far, minus one (this count includes
            // the decimal point when there is one)
            int digitcount = (int)(p - startdigits - 1);

            int64_t expnumber = 0; // exponential part
            if (('e' == *p) || ('E' == *p))
            {
                ++p;
                bool negexp = false;
                if ('-' == *p)
                {
                    negexp = true;
                    ++p;
                }
                else if ('+' == *p)
                {
                    ++p;
                }

                if (!is_integer(*p))
                {
                    // the exponent needs at least one digit
#if JSON_TEST_NUMBERS // for unit testing
                    foundInvalidNumber(buf + offset);
#endif
                    return(false);
                }

                unsigned_bytechar digit = (unsigned_bytechar)(*p - '0');
                expnumber = digit;
                p++;
                while (is_integer(*p))
                {
                    // Consume every exponent digit, but stop accumulating once the
                    // value is already far beyond anything usable. Otherwise a very
                    // long exponent would wrap around int64 and could land back
                    // inside [-308, 308], silently producing a wrong number. A
                    // saturated value is rejected by the range check below.
                    if (expnumber < 0x100000000)
                    {
                        digit     = (unsigned_bytechar)(*p - '0');
                        expnumber = 10 * expnumber + digit;
                    }

                    ++p;
                }

                exponent += (negexp ? -expnumber : expnumber);
            }

            i = negative ? -i : i;
            // NOTE(review): an explicit exponent of zero with no fractional part
            // (e.g. "1e0") has exponent == 0 and expnumber == 0 and is therefore
            // written as an integer.
            if ((exponent != 0) || (expnumber != 0))
            {
                if ((digitcount >= 19))
                {
                    // this is uncommon!!!
                    // this is almost never going to get called!!!
                    // we start anew, going slowly!!!
                    return(parse_float(buf, pj, offset,
                                       found_minus));
                }

                ///////////
                // We want 0.1e1 to be a float.
                //////////
                if (i == 0)
                {
                    // NOTE(review): the sign is not carried over here, so "-0.0"
                    // is written as 0.0.
                    pj.WriteTapeDouble(0.0);
#if JSON_TEST_NUMBERS // for unit testing
                    foundFloat(0.0, buf + offset);
#endif
                }
                else
                {
                    if ((exponent > 308) || (exponent < -308))
                    {
                        // we refuse to parse this
#if JSON_TEST_NUMBERS // for unit testing
                        foundInvalidNumber(buf + offset);
#endif
                        return(false);
                    }

                    double d = i;
                    // exponent is in [-308, 308] so 308 + exponent is in [0, 2 * 308]
                    d *= power_of_ten[308 + exponent];
                    // d = negative ? -d : d;
                    pj.WriteTapeDouble(d);
#if JSON_TEST_NUMBERS // for unit testing
                    foundFloat(d, buf + offset);
#endif
                }
            }
            else
            {
                if ((digitcount >= 18))
                {
                    // this is uncommon!!!
                    // 19 digits or more may not fit in an int64: go through the
                    // overflow-checking path
                    return(parse_large_integer(buf, pj, offset,
                                               found_minus));
                }

                pj.WriteTapeInt64(i);
#if JSON_TEST_NUMBERS // for unit testing
                foundInteger(i, buf + offset);
#endif
            }

            return(is_structural_or_whitespace((uint8_t)(*p)) != 0);
        }

Example #18

Show file

        internal static JsonParseError unified_machine(uint8_t *buf, size_t len, ParsedJson pj)
        {
#if !ALLOW_SAME_PAGE_BUFFER_OVERRUN
            memset((uint8_t *)buf + len, 0, SIMDJSON_PADDING); // to please valgrind
#endif
            uint32_t i = 0;                                    // index of the structural character (0,1,2,3...)
            uint32_t idx;                                      // location of the structural character in the input (buf)
            uint8_t  c = 0;                                    // used to track the (structural) character we are looking at, updated
            // by UPDATE_CHAR macro
            uint32_t depth = 0;                                // could have an arbitrary starting depth
            pj.Init();                                         // sets isvalid to false
            if (pj.bytecapacity < len)
            {
                pj.ErrorCode = JsonParseError.CAPACITY;
                return(pj.ErrorCode);
            }

            ////////////////////////////// START STATE /////////////////////////////
            pj.ret_address[depth]             = (bytechar)'s';
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.WriteTape(0, (uint8_t)'r'); // r for root, 0 is going to get overwritten
            // the root is used, if nothing else, to capture the size of the tape
            depth++;                       // everything starts at depth = 1, depth = 0 is just for the root, the root may contain an object, an array or something else.
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }

            idx = pj.structural_indexes[i++];
            c   = buf[idx]; //UPDATE_CHAR()
            switch (c)
            {
            case (uint8_t)'{':
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.ret_address[depth]             = (bytechar)'s';
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                pj.WriteTape(0, c);     // strangely, moving this to object_begin slows things down
                goto object_begin;

            case (uint8_t)'[':
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.ret_address[depth]             = (bytechar)'s';
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                goto array_begin;

                // A JSON text is a serialized value.  Note that certain previous
                // specifications of JSON constrained a JSON text to be an object or an
                // array.  Implementations that generate only objects or arrays where a
                // JSON text is called for will be interoperable in the sense that all
                // implementations will accept these as conforming JSON texts.
                // https://tools.ietf.org/html/rfc8259
#if SIMDJSON_ALLOWANYTHINGINROOT
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'t':
            {
                // we need to make a copy to make sure that the string is space terminated.
                // this only applies to the JSON document made solely of the true value.
                // this will almost never be called in practice
                bytechar *copy = (bytechar *)(allocate <bytechar>(len + SIMDJSON_PADDING));
                if (copy == null)
                {
                    goto fail;
                }

                memcpy(copy, buf, len);
                copy[len] = (bytechar)' ';
                if (!is_valid_true_atom((uint8_t *)(copy) + idx))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                pj.WriteTape(0, c);
                break;
            }

            case (uint8_t)'f':
            {
                // we need to make a copy to make sure that the string is space terminated.
                // this only applies to the JSON document made solely of the false value.
                // this will almost never be called in practice
                bytechar *copy = (bytechar *)(allocate <bytechar>(len + SIMDJSON_PADDING));
                if (copy == null)
                {
                    goto fail;
                }

                memcpy(copy, buf, len);
                copy[len] = (bytechar)' ';
                if (!is_valid_false_atom((uint8_t *)(copy) + idx))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                pj.WriteTape(0, c);
                break;
            }

            case (uint8_t)'n':
            {
                // we need to make a copy to make sure that the string is space terminated.
                // this only applies to the JSON document made solely of the null value.
                // this will almost never be called in practice
                bytechar *copy = (bytechar *)(allocate <bytechar>(len + SIMDJSON_PADDING));
                if (copy == null)
                {
                    goto fail;
                }

                memcpy(copy, buf, len);
                copy[len] = (bytechar)' ';
                if (!is_valid_null_atom((uint8_t *)(copy) + idx))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                pj.WriteTape(0, c);
                break;
            }

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            {
                // we need to make a copy to make sure that the string is space terminated.
                // this is done only for JSON documents made of a sole number
                // this will almost never be called in practice. We terminate with a space
                // because we do not want to allow NULLs in the middle of a number (whereas a
                // space in the middle of a number would be identified in stage 1).
                bytechar *copy = (bytechar *)(allocate <bytechar>(len + SIMDJSON_PADDING));
                if (copy == null)
                {
                    goto fail;
                }

                memcpy(copy, buf, len);
                copy[len] = (bytechar)' ';
                if (!parse_number((uint8_t *)(copy), pj, idx, false))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                break;
            }

            case (uint8_t)'-':
            {
                // we need to make a copy to make sure that the string is NULL terminated.
                // this is done only for JSON documents made of a sole number
                // this will almost never be called in practice
                bytechar *copy = (bytechar *)(allocate <bytechar>(len + SIMDJSON_PADDING));
                if (copy == null)
                {
                    goto fail;
                }

                memcpy(copy, buf, len);
                copy[len] = (bytechar)'\0';
                if (!parse_number((uint8_t *)(copy), pj, idx, true))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                break;
            }
#endif // ALLOWANYTHINGINROOT
            default:
                goto fail;
            }

start_continue:
            // the string might not be NULL terminated.
            if (i + 1 == pj.n_structural_indexes)
            {
                goto succeed;
            }
            else
            {
                goto fail;
            }
            ////////////////////////////// OBJECT STATES /////////////////////////////

object_begin:
            idx = pj.structural_indexes[i++];
            c   = buf[idx]; //UPDATE_CHAR()
            switch (c)
            {
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                goto object_key_state;
            }

            case (uint8_t)'}':
                goto scope_end;     // could also go to object_continue

            default:
                goto fail;
            }

object_key_state:
            idx = pj.structural_indexes[i++];
            c   = buf[idx]; //UPDATE_CHAR()
            if (c != ':')
            {
                goto fail;
            }

            idx = pj.structural_indexes[i++];
            c   = buf[idx]; //UPDATE_CHAR()
            switch (c)
            {
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'t':
                if (!is_valid_true_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'f':
                if (!is_valid_false_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'n':
                if (!is_valid_null_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            {
                if (!parse_number(buf, pj, idx, false))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'-':
            {
                if (!parse_number(buf, pj, idx, true))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'{':
            {
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     // here the compilers knows what c is so this gets optimized
                // we have not yet encountered } so we need to come back for it
                pj.ret_address[depth] = (bytechar)'o';
                // we found an object inside an object, so we need to increment the depth
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                goto object_begin;
            }

            case (uint8_t)'[':
            {
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     // here the compilers knows what c is so this gets optimized
                // we have not yet encountered } so we need to come back for it
                pj.ret_address[depth] = (bytechar)'o';
                // we found an array inside an object, so we need to increment the depth
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                goto array_begin;
            }

            default:
                goto fail;
            }

object_continue:
            idx = pj.structural_indexes[i++];
            c   = buf[idx]; //UPDATE_CHAR()
            switch (c)
            {
            case (uint8_t)',':
                idx = pj.structural_indexes[i++];
                c   = buf[idx];   //UPDATE_CHAR()
                if (c != '"')
                {
                    goto fail;
                }
                else
                {
                    if (!parse_string(buf, len, pj, depth, idx))
                    {
                        goto fail;
                    }

                    goto object_key_state;
                }

            case (uint8_t)'}':
                goto scope_end;

            default:
                goto fail;
            }

            ////////////////////////////// COMMON STATE /////////////////////////////

scope_end:
            // write our tape location to the header scope
            depth--;
            pj.WriteTape(pj.containing_scope_offset[depth], c);
            pj.AnnotatePreviousLoc(pj.containing_scope_offset[depth],
                                   pj.CurrentLoc);
            if (pj.ret_address[depth] == 'a')
            {
                goto array_continue;
            }
            else if (pj.ret_address[depth] == 'o')
            {
                goto object_continue;
            }
            else
            {
                goto start_continue;
            }

            ////////////////////////////// ARRAY STATES /////////////////////////////
array_begin:
            idx = pj.structural_indexes[i++];
            c   = buf[idx]; //UPDATE_CHAR()
            if (c == (uint8_t)']')
            {
                goto scope_end; // could also go to array_continue
            }

main_array_switch:
            // we call update char on all paths in, so we can peek at c on the
            // on paths that can accept a close square brace (post-, and at start)
            switch (c)
            {
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'t':
                if (!is_valid_true_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'f':
                if (!is_valid_false_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'n':
                if (!is_valid_null_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;     // goto array_continue;

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            {
                if (!parse_number(buf, pj, idx, false))
                {
                    goto fail;
                }

                break;     // goto array_continue;
            }

            case (uint8_t)'-':
            {
                if (!parse_number(buf, pj, idx, true))
                {
                    goto fail;
                }

                break;     // goto array_continue;
            }

            case (uint8_t)'{':
            {
                // we have not yet encountered ] so we need to come back for it
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     //  here the compilers knows what c is so this gets optimized
                pj.ret_address[depth] = (bytechar)'a';
                // we found an object inside an array, so we need to increment the depth
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                goto object_begin;
            }

            case (uint8_t)'[':
            {
                // we have not yet encountered ] so we need to come back for it
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     // here the compilers knows what c is so this gets optimized
                pj.ret_address[depth] = (bytechar)'a';
                // we found an array inside an array, so we need to increment the depth
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                goto array_begin;
            }

            default:
                goto fail;
            }

array_continue:
            idx = pj.structural_indexes[i++];
            c   = buf[idx]; //UPDATE_CHAR()
            switch (c)
            {
            case (uint8_t)',':
                idx = pj.structural_indexes[i++];
                c   = buf[idx];   //UPDATE_CHAR()
                goto main_array_switch;

            case (uint8_t)']':
                goto scope_end;

            default:
                goto fail;
            }

            ////////////////////////////// FINAL STATES /////////////////////////////

succeed:
            depth--;
            if (depth != 0)
            {
                throw new InvalidOperationException("internal bug");
                //abort();
            }

            if (pj.containing_scope_offset[depth] != 0)
            {
                throw new InvalidOperationException("internal bug");
                //abort();
            }

            pj.AnnotatePreviousLoc(pj.containing_scope_offset[depth], pj.CurrentLoc);
            pj.WriteTape(pj.containing_scope_offset[depth], (uint8_t)'r');  // r is root

            pj.isvalid   = true;
            pj.ErrorCode = JsonParseError.SUCCESS;
            return(pj.ErrorCode);

fail:
            // we do not need the next line because this is done by pj.init(), pessimistically.
            // pj.isvalid  = false;
            // At this point in the code, we have all the time in the world.
            // Note that we know exactly where we are in the document so we could,
            // without any overhead on the processing code, report a specific location.
            // We could even trigger special code paths to assess what happened carefully,
            // all without any added cost.
            if (depth >= pj.depthcapacity)
            {
                pj.ErrorCode = JsonParseError.DEPTH_ERROR;
                return(pj.ErrorCode);
            }

            switch (c)
            {
            case (uint8_t)'"':
                pj.ErrorCode = JsonParseError.STRING_ERROR;
                return(pj.ErrorCode);

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            case (uint8_t)'-':
                pj.ErrorCode = JsonParseError.NUMBER_ERROR;
                return(pj.ErrorCode);

            case (uint8_t)'t':
                pj.ErrorCode = JsonParseError.T_ATOM_ERROR;
                return(pj.ErrorCode);

            case (uint8_t)'n':
                pj.ErrorCode = JsonParseError.N_ATOM_ERROR;
                return(pj.ErrorCode);

            case (uint8_t)'f':
                pj.ErrorCode = JsonParseError.F_ATOM_ERROR;
                return(pj.ErrorCode);

            default:
                break;
            }

            pj.ErrorCode = JsonParseError.TAPE_ERROR;
            return(pj.ErrorCode);
        }

Example #19

Show file

 /// <summary>
 /// Convenience overload for callers holding a <c>bytechar*</c> buffer: the pointer is
 /// reinterpreted as <c>uint8_t*</c> and the call is forwarded, with <paramref name="len"/>
 /// and <paramref name="pj"/> unchanged, to the <c>uint8_t*</c> overload.
 /// </summary>
 /// <param name="buf">Pointer to the input bytes.</param>
 /// <param name="len">Length passed through to the forwarded call.</param>
 /// <param name="pj">Parse state passed through to the forwarded call.</param>
 /// <returns>Whatever the forwarded <c>unified_machine</c> call returns.</returns>
 internal static JsonParseError unified_machine(bytechar *buf, size_t len, ParsedJson pj)
 {
     uint8_t *bytes = (uint8_t *)buf;
     return unified_machine(bytes, len, pj);
 }

Example #20

Show file

File: SimdJson.cs Project: zsybupt/SimdJsonSharp

 // Parses `length` bytes starting at `jsonData` into `pj`.
 // NOTE(review): this listing looks truncated — only the capacity guard is visible and
 // `reallocIfNeeded` is not referenced in the part shown; confirm against the full source.
 internal static JsonParseError JsonParse(byte *jsonData, UInt64 length, ParsedJson pj, bool reallocIfNeeded = true)
 {
     // Reject input that is longer than the byte capacity reported by pj.
     if (pj.bytecapacity < length)
         return(JsonParseError.CAPACITY); }

Example #21

Show file

File: stage2_build_tape.cs Project: zy850580380/SimdJsonSharp

        /// <summary>
        /// Goto-driven state machine that walks the structural characters recorded in
        /// <c>pj.structural_indexes</c> and writes the corresponding tape entries into
        /// <paramref name="pj"/>.
        /// </summary>
        /// <param name="buf">Input bytes; indexed with the offsets read from <c>pj.structural_indexes</c>.</param>
        /// <param name="len">Input length; must not exceed <c>pj.bytecapacity</c>.</param>
        /// <param name="pj">Parse state. It is reset with <c>pj.Init()</c> before anything else happens.</param>
        /// <returns>
        /// <c>true</c> when the whole document was consumed and the root tape entry was finalized
        /// (<c>pj.isvalid</c> is set to <c>true</c> in that case); <c>false</c> when the capacity is
        /// insufficient, the nesting is too deep, or any token is rejected.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// Thrown from the success path if the final depth or root scope offset is not zero
        /// (reported as "internal bug").
        /// </exception>
        internal static bool unified_machine(uint8_t *buf, size_t len, ParsedJson pj)
        {
            uint32_t i = 0;     // index of the structural character (0,1,2,3...)
            uint32_t idx;       // location of the structural character in the input (buf)
            uint8_t  c;         // the (structural) character currently being examined; refreshed
            // by every expanded UPDATE_CHAR below
            uint32_t depth = 0; // could have an arbitrary starting depth

            pj.Init();
            if (pj.bytecapacity < len)
            {
                Debug.Write("insufficient capacity\n");
                return(false);
            }

            // UPDATE_CHAR reads the next structural character, updating idx, i and c.
            //C#: expanded directly everywhere
            //void UPDATE_CHAR()
            //{
            //    idx = pj.structural_indexes[i++];
            //    c = buf[idx];
            //}

            // Depth guard used throughout this method: after every depth++ the new depth is
            // later used as an index into pj.ret_address / pj.containing_scope_offset when a
            // nested scope opens. The guard is therefore `>=`, so that depth stays strictly
            // below pj.depthcapacity.
            // NOTE(review): assumes both arrays hold pj.depthcapacity entries — confirm against
            // the ParsedJson allocation code.

            pj.ret_address[depth]             = (bytechar)'s';
            pj.containing_scope_offset[depth] = pj.CurrentLoc;
            pj.WriteTape(0, (byte)'r'); // r for root, 0 is going to get overwritten
            // the root is used, if nothing else, to capture the size of the tape
            depth++;                    // everything starts at depth = 1, depth = 0 is just for the root, the root may contain an object, an array or something else.
            if (depth >= pj.depthcapacity)
            {
                goto fail;
            }

            //UPDATE_CHAR():
            idx = pj.structural_indexes[i++];
            c   = buf[idx];

            switch (c)
            {
            case (uint8_t)'{':
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.ret_address[depth]             = (bytechar)'s';
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                pj.WriteTape(0, c);     // strangely, moving this to object_begin slows things down
                goto object_begin;

            case (uint8_t)'[':
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.ret_address[depth]             = (bytechar)'s';
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                goto array_begin;

                // A JSON text is a serialized value.  Note that certain previous
                // specifications of JSON constrained a JSON text to be an object or an
                // array.  Implementations that generate only objects or arrays where a
                // JSON text is called for will be interoperable in the sense that all
                // implementations will accept these as conforming JSON texts.
                // https://tools.ietf.org/html/rfc8259
#if SIMDJSON_ALLOWANYTHINGINROOT
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'t':
            {
                // we need to make a copy to make sure that the string is NULL terminated.
                // this only applies to the JSON document made solely of the true value.
                // this will almost never be called in practice
                bytechar *copy = allocate <bytechar>(len + SIMDJSON_PADDING);
                memcpy(copy, buf, len);
                copy[len] = (bytechar)'\0';
                if (!is_valid_true_atom((uint8_t *)copy + idx))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                pj.WriteTape(0, c);
                break;
            }

            case (uint8_t)'f':
            {
                // we need to make a copy to make sure that the string is NULL terminated.
                // this only applies to the JSON document made solely of the false value.
                // this will almost never be called in practice
                bytechar *copy = allocate <bytechar>(len + SIMDJSON_PADDING);
                memcpy(copy, buf, len);
                copy[len] = (bytechar)'\0';
                if (!is_valid_false_atom((uint8_t *)copy + idx))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                pj.WriteTape(0, c);
                break;
            }

            case (uint8_t)'n':
            {
                // we need to make a copy to make sure that the string is NULL terminated.
                // this only applies to the JSON document made solely of the null value.
                // this will almost never be called in practice
                bytechar *copy = allocate <bytechar>(len + SIMDJSON_PADDING);
                memcpy(copy, buf, len);
                copy[len] = (bytechar)'\0';
                if (!is_valid_null_atom((uint8_t *)copy + idx))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                pj.WriteTape(0, c);
                break;
            }

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            {
                // we need to make a copy to make sure that the string is NULL terminated.
                // this is done only for JSON documents made of a sole number
                // this will almost never be called in practice
                bytechar *copy = allocate <bytechar>(len + SIMDJSON_PADDING);
                memcpy(copy, buf, len);
                copy[len] = (bytechar)'\0';
                if (!parse_number((uint8_t *)copy, pj, idx, false))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                break;
            }

            case (uint8_t)'-':
            {
                // we need to make a copy to make sure that the string is NULL terminated.
                // this is done only for JSON documents made of a sole number
                // this will almost never be called in practice
                bytechar *copy = allocate <bytechar>(len + SIMDJSON_PADDING);
                memcpy(copy, buf, len);
                copy[len] = (bytechar)'\0';
                if (!parse_number((uint8_t *)copy, pj, idx, true))
                {
                    free(copy);
                    goto fail;
                }

                free(copy);
                break;
            }
#endif // ALLOWANYTHINGINROOT
            default:
                goto fail;
            }

start_continue:
            // the string might not be NULL terminated.
            // Success requires that exactly one structural index remains unread.
            if (i + 1 == pj.n_structural_indexes)
            {
                goto succeed;
            }
            else
            {
                goto fail;
            }
            ////////////////////////////// OBJECT STATES /////////////////////////////

object_begin:
            //UPDATE_CHAR():
            idx = pj.structural_indexes[i++];
            c   = buf[idx];
            switch (c)
            {
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                goto object_key_state;
            }

            case (uint8_t)'}':
                goto scope_end;     // could also go to object_continue

            default:
                goto fail;
            }

object_key_state:
            // A key has just been parsed: a ':' must follow, then a value.
            //UPDATE_CHAR():
            idx = pj.structural_indexes[i++];
            c   = buf[idx];
            if (c != ':')
            {
                goto fail;
            }

            //UPDATE_CHAR():
            idx = pj.structural_indexes[i++];
            c   = buf[idx];
            switch (c)
            {
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'t':
                if (!is_valid_true_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'f':
                if (!is_valid_false_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'n':
                if (!is_valid_null_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            {
                if (!parse_number(buf, pj, idx, false))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'-':
            {
                if (!parse_number(buf, pj, idx, true))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'{':
            {
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     // here the compilers knows what c is so this gets optimized
                // we have not yet encountered } so we need to come back for it
                pj.ret_address[depth] = (bytechar)'o';
                // we found an object inside an object, so we need to increment the depth
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                goto object_begin;
            }

            case (uint8_t)'[':
            {
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     // here the compilers knows what c is so this gets optimized
                // we have not yet encountered } so we need to come back for it
                pj.ret_address[depth] = (bytechar)'o';
                // we found an array inside an object, so we need to increment the depth
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                goto array_begin;
            }

            default:
                goto fail;
            }

object_continue:
            //UPDATE_CHAR():
            idx = pj.structural_indexes[i++];
            c   = buf[idx];
            switch (c)
            {
            case (uint8_t)',':
                //UPDATE_CHAR():
                idx = pj.structural_indexes[i++];
                c   = buf[idx];
                if (c != (uint8_t)'"')
                {
                    goto fail;
                }
                else
                {
                    if (!parse_string(buf, len, pj, depth, idx))
                    {
                        goto fail;
                    }

                    goto object_key_state;
                }

            case (uint8_t)'}':
                goto scope_end;

            default:
                goto fail;
            }

            ////////////////////////////// COMMON STATE /////////////////////////////

scope_end:
            // write our tape location to the header scope
            depth--;
            pj.WriteTape(pj.containing_scope_offset[depth], c);
            pj.AnnotatePreviousLoc(pj.containing_scope_offset[depth],
                                   pj.CurrentLoc);
            // goto saved_state: 'a' resumes the enclosing array, 'o' the enclosing object,
            // anything else returns to the root-level check.
            if (pj.ret_address[depth] == (uint8_t)'a')
            {
                goto array_continue;
            }
            else if (pj.ret_address[depth] == (uint8_t)'o')
            {
                goto object_continue;
            }
            else
            {
                goto start_continue;
            }

            ////////////////////////////// ARRAY STATES /////////////////////////////
array_begin:
            //UPDATE_CHAR():
            idx = pj.structural_indexes[i++];
            c   = buf[idx];
            if (c == ']')
            {
                goto scope_end; // could also go to array_continue
            }

main_array_switch:
            // we call update char on all paths in, so we can peek at c on the
            // on paths that can accept a close square brace (post-, and at start)
            switch (c)
            {
            case (uint8_t)'"':
            {
                if (!parse_string(buf, len, pj, depth, idx))
                {
                    goto fail;
                }

                break;
            }

            case (uint8_t)'t':
                if (!is_valid_true_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'f':
                if (!is_valid_false_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;

            case (uint8_t)'n':
                if (!is_valid_null_atom(buf + idx))
                {
                    goto fail;
                }

                pj.WriteTape(0, c);
                break;     // goto array_continue;

            case (uint8_t)'0':
            case (uint8_t)'1':
            case (uint8_t)'2':
            case (uint8_t)'3':
            case (uint8_t)'4':
            case (uint8_t)'5':
            case (uint8_t)'6':
            case (uint8_t)'7':
            case (uint8_t)'8':
            case (uint8_t)'9':
            {
                if (!parse_number(buf, pj, idx, false))
                {
                    goto fail;
                }

                break;     // goto array_continue;
            }

            case (uint8_t)'-':
            {
                if (!parse_number(buf, pj, idx, true))
                {
                    goto fail;
                }

                break;     // goto array_continue;
            }

            case (uint8_t)'{':
            {
                // we have not yet encountered ] so we need to come back for it
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     //  here the compilers knows what c is so this gets optimized
                pj.ret_address[depth] = (bytechar)'a';
                // we found an object inside an array, so we need to increment the depth
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                goto object_begin;
            }

            case (uint8_t)'[':
            {
                // we have not yet encountered ] so we need to come back for it
                pj.containing_scope_offset[depth] = pj.CurrentLoc;
                pj.WriteTape(0, c);     // here the compilers knows what c is so this gets optimized
                pj.ret_address[depth] = (bytechar)'a';
                // we found an array inside an array, so we need to increment the depth
                depth++;
                if (depth >= pj.depthcapacity)
                {
                    goto fail;
                }

                goto array_begin;
            }

            default:
                goto fail;
            }

array_continue:
            //UPDATE_CHAR():
            idx = pj.structural_indexes[i++];
            c   = buf[idx];
            switch (c)
            {
            case (uint8_t)',':
                //UPDATE_CHAR():
                idx = pj.structural_indexes[i++];
                c   = buf[idx];
                goto main_array_switch;

            case (uint8_t)']':
                goto scope_end;

            default:
                goto fail;
            }

            ////////////////////////////// FINAL STATES /////////////////////////////

succeed:
            depth--;
            if (depth != 0)
            {
                throw new InvalidOperationException("internal bug");
            }

            if (pj.containing_scope_offset[depth] != 0)
            {
                throw new InvalidOperationException("internal bug");
            }

            // Finalize the root entry now that the final tape location is known.
            pj.AnnotatePreviousLoc(pj.containing_scope_offset[depth], pj.CurrentLoc);
            pj.WriteTape(pj.containing_scope_offset[depth], (byte)'r');  // r is root
            pj.isvalid = true;
            return(true);

fail:
            // pj.isvalid is not assigned on this path.
            return(false);
        }

Example #22

Show file

        internal static bool find_structural_bits(uint8_t* buf, size_t len, ParsedJson pj)
        {
            if (len > pj.bytecapacity)
            {
                Console.WriteLine("Your ParsedJson object only supports documents up to " + pj.bytecapacity +
                                  " bytes but you are trying to process " + len + " bytes\n");
                return false;
            }

            uint32_t* base_ptr = pj.structural_indexes;
            uint32_t @base = 0;
#if SIMDJSON_UTF8VALIDATE // NOT TESTED YET!
            var has_error = Vector256<byte>.Zero;
            var previous = new avx_processed_utf_bytes();
            previous.rawbytes = Vector256<byte>.Zero;
            previous.high_nibbles = Vector256<byte>.Zero;
            previous.carried_continuations = Vector256<byte>.Zero;
            var highbit = Vector256.Create((byte)0x80);
#endif

            const uint64_t even_bits = 0x5555555555555555UL;
            const uint64_t odd_bits = ~even_bits;

            // for now, just work in 64-byte chunks
            // we have padded the input out to 64 byte multiple with the remainder being
            // zeros

            // persistent state across loop
            uint64_t prev_iter_ends_odd_backslash = 0UL; // either 0 or 1, but a 64-bit value
            uint64_t prev_iter_inside_quote = 0UL; // either all zeros or all ones

            // effectively the very first char is considered to follow "whitespace" for the
            // purposes of psuedo-structural character detection
            uint64_t prev_iter_ends_pseudo_pred = 1UL;
            size_t lenminus64 = len < 64 ? 0 : len - 64;
            size_t idx = 0;
            uint64_t structurals = 0;

            // C#: assign static readonly fields to locals before the loop
            Vector256<byte> low_nibble_mask = s_low_nibble_mask;
            Vector256<byte> high_nibble_mask = s_high_nibble_mask;
            Vector256<byte> utf8ValidVec = s_utf8ValidVec;

            var structural_shufti_mask = Vector256.Create((byte)0x7);
            var whitespace_shufti_mask = Vector256.Create((byte)0x18);
            var slashVec = Vector256.Create((bytechar) '\\').AsByte();
            var ffVec = Vector128.Create((byte) 0xFF).AsUInt64();
            var doubleQuoteVec = Vector256.Create((byte)'"');
            var zeroBVec = Vector256.Create((byte) 0);
            var vec7f = Vector256.Create((byte) 0x7f);

            for (; idx < lenminus64; idx += 64)
            {
                var input_lo = Avx.LoadVector256(buf + idx + 0);
                var input_hi = Avx.LoadVector256(buf + idx + 32);
#if SIMDJSON_UTF8VALIDATE // NOT TESTED YET!
                if ((Avx.TestZ(Avx2.Or(input_lo, input_hi), highbit)) == true)
                {
                    // it is ascii, we just check continuation
                    has_error = Avx2.Or(
                        Avx2.CompareGreaterThan(previous.carried_continuations.AsSByte(), utf8ValidVec, has_error);

                }
                else
                {
                    // it is not ascii so we have to do heavy work
                    previous = Utf8Validation.avxcheckUTF8Bytes(input_lo, ref previous, ref has_error);
                    previous = Utf8Validation.avxcheckUTF8Bytes(input_hi, ref previous, ref has_error);
                }
#endif

                ////////////////////////////////////////////////////////////////////////////////////////////
                //     Step 1: detect odd sequences of backslashes
                ////////////////////////////////////////////////////////////////////////////////////////////
                /// 
                uint64_t bs_bits =
                    cmp_mask_against_input(input_lo, input_hi, slashVec);
                uint64_t start_edges = bs_bits & ~(bs_bits << 1);
                // flip lowest if we have an odd-length run at the end of the prior
                // iteration
                uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
                uint64_t even_starts = start_edges & even_start_mask;
                uint64_t odd_starts = start_edges & ~even_start_mask;
                uint64_t even_carries = bs_bits + even_starts;
                uint64_t odd_carries;
                // must record the carry-out of our odd-carries out of bit 63; this
                // indicates whether the sense of any edge going to the next iteration
                // should be flipped
                bool iter_ends_odd_backslash =
                    add_overflow(bs_bits, odd_starts, &odd_carries);

                odd_carries |=
                    prev_iter_ends_odd_backslash; // push in bit zero as a potential end
                // if we had an odd-numbered run at the
                // end of the previous iteration
                prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1UL : 0x0UL;
                uint64_t even_carry_ends = even_carries & ~bs_bits;
                uint64_t odd_carry_ends = odd_carries & ~bs_bits;
                uint64_t even_start_odd_end = even_carry_ends & odd_bits;
                uint64_t odd_start_even_end = odd_carry_ends & even_bits;
                uint64_t odd_ends = even_start_odd_end | odd_start_even_end;

                ////////////////////////////////////////////////////////////////////////////////////////////
                //     Step 2: detect insides of quote pairs
                ////////////////////////////////////////////////////////////////////////////////////////////

                uint64_t quote_bits =
                    cmp_mask_against_input(input_lo, input_hi, doubleQuoteVec);
                quote_bits = quote_bits & ~odd_ends;
                uint64_t quote_mask = Sse2.X64.ConvertToUInt64(Pclmulqdq.CarrylessMultiply(
                    Vector128.Create(quote_bits, 0UL /*C# reversed*/), ffVec, 0));

                uint32_t cnt = (uint32_t) hamming(structurals);
                uint32_t next_base = @base + cnt;
                while (structurals != 0)
                {
                    base_ptr[@base + 0] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 1] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 2] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 3] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 4] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 5] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 6] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 7] = (uint32_t) idx - 64 + (uint32_t) trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    @base += 8;
                }

                @base = next_base;

                quote_mask ^= prev_iter_inside_quote;
                prev_iter_inside_quote =
                    (uint64_t) ((int64_t) quote_mask >>
                                63); // right shift of a signed value expected to be well-defined and standard compliant as of C++20, John Regher from Utah U. says this is fine code



                var v_lo = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_lo),
                    Avx2.Shuffle(high_nibble_mask,
                        Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(),
                            vec7f)));

                var v_hi = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_hi),
                    Avx2.Shuffle(high_nibble_mask,
                        Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(),
                            vec7f)));
                var tmp_lo = Avx2.CompareEqual(
                    Avx2.And(v_lo, structural_shufti_mask), zeroBVec);
                var tmp_hi = Avx2.CompareEqual(
                    Avx2.And(v_hi, structural_shufti_mask), zeroBVec);

                uint64_t structural_res_0 = (uint32_t) Avx2.MoveMask(tmp_lo);
                uint64_t structural_res_1 = (uint64_t) Avx2.MoveMask(tmp_hi);
                structurals = ~(structural_res_0 | (structural_res_1 << 32));

                var tmp_ws_lo = Avx2.CompareEqual(
                    Avx2.And(v_lo, whitespace_shufti_mask), zeroBVec);
                var tmp_ws_hi = Avx2.CompareEqual(
                    Avx2.And(v_hi, whitespace_shufti_mask), zeroBVec);

                uint64_t ws_res_0 = (uint32_t) Avx2.MoveMask(tmp_ws_lo);
                uint64_t ws_res_1 = (uint64_t) Avx2.MoveMask(tmp_ws_hi);
                uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));


                // mask off anything inside quotes
                structurals &= ~quote_mask;

                // add the real quote bits back into our bitmask as well, so we can
                // quickly traverse the strings we've spent all this trouble gathering
                structurals |= quote_bits;

                // Now, establish "pseudo-structural characters". These are non-whitespace
                // characters that are (a) outside quotes and (b) have a predecessor that's
                // either whitespace or a structural character. This means that subsequent
                // passes will get a chance to encounter the first character of every string
                // of non-whitespace and, if we're parsing an atom like true/false/null or a
                // number we can stop at the first whitespace or structural character
                // following it.

                // a qualified predecessor is something that can happen 1 position before an
                // psuedo-structural character
                uint64_t pseudo_pred = structurals | whitespace;
                uint64_t shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
                prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
                uint64_t pseudo_structurals =
                    shifted_pseudo_pred & (~whitespace) & (~quote_mask);
                structurals |= pseudo_structurals;

                // now, we've used our close quotes all we need to. So let's switch them off
                // they will be off in the quote mask and on in quote bits.
                structurals &= ~(quote_bits & ~quote_mask);

                //Console.WriteLine($"Iter: {idx}, satur: {structurals}");

                //*(uint64_t *)(pj.structurals + idx / 8) = structurals;
            }

            ////////////////
            /// we use a giant copy-paste which is ugly.
            /// but otherwise the string needs to be properly padded or else we
            /// risk invalidating the UTF-8 checks.
            ////////////
            if (idx < len)
            {
                uint8_t* tmpbuf = stackalloc uint8_t[64];
                memset(tmpbuf, 0x20, 64);
                memcpy(tmpbuf, buf + idx, len - idx);
                Vector256<byte> input_lo = Avx.LoadVector256(tmpbuf + 0);
                Vector256<byte> input_hi = Avx.LoadVector256(tmpbuf + 32);
#if SIMDJSON_UTF8VALIDATE // NOT TESTED YET!
                var highbit = Vector256.Create((byte)0x80);
                if ((Avx.TestZ(Avx2.Or(input_lo, input_hi), highbit)) == true)
                {
                    // it is ascii, we just check continuation
                    has_error = Avx2.Or(
                      Avx2.CompareGreaterThan(previous.carried_continuations.AsSByte(),
                                      utf8ValidVec).AsByte(), has_error);

                }
                else
                {
                    // it is not ascii so we have to do heavy work
                    previous = Utf8Validation.avxcheckUTF8Bytes(input_lo, ref previous, ref has_error);
                    previous = Utf8Validation.avxcheckUTF8Bytes(input_hi, ref previous, ref has_error);
                }
#endif
                ////////////////////////////////////////////////////////////////////////////////////////////
                //     Step 1: detect odd sequences of backslashes
                ////////////////////////////////////////////////////////////////////////////////////////////

                uint64_t bs_bits =
                    cmp_mask_against_input(input_lo, input_hi, slashVec);
                uint64_t start_edges = bs_bits & ~(bs_bits << 1);
                // flip lowest if we have an odd-length run at the end of the prior
                // iteration
                uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
                uint64_t even_starts = start_edges & even_start_mask;
                uint64_t odd_starts = start_edges & ~even_start_mask;
                uint64_t even_carries = bs_bits + even_starts;

                uint64_t odd_carries;
                // must record the carry-out of our odd-carries out of bit 63; this
                // indicates whether the sense of any edge going to the next iteration
                // should be flipped
                //bool iter_ends_odd_backslash =
                add_overflow(bs_bits, odd_starts, &odd_carries);

                odd_carries |=
                    prev_iter_ends_odd_backslash; // push in bit zero as a potential end
                // if we had an odd-numbered run at the
                // end of the previous iteration
                //prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
                uint64_t even_carry_ends = even_carries & ~bs_bits;
                uint64_t odd_carry_ends = odd_carries & ~bs_bits;
                uint64_t even_start_odd_end = even_carry_ends & odd_bits;
                uint64_t odd_start_even_end = odd_carry_ends & even_bits;
                uint64_t odd_ends = even_start_odd_end | odd_start_even_end;

                ////////////////////////////////////////////////////////////////////////////////////////////
                //     Step 2: detect insides of quote pairs
                ////////////////////////////////////////////////////////////////////////////////////////////

                uint64_t quote_bits =
                    cmp_mask_against_input(input_lo, input_hi, doubleQuoteVec);
                quote_bits = quote_bits & ~odd_ends;
                uint64_t quote_mask = (uint64_t)Sse2.X64.ConvertToInt64(Pclmulqdq.CarrylessMultiply(
                    Vector128.Create(quote_bits, 0UL /*C# reversed*/), ffVec, 0).AsInt64());
                quote_mask ^= prev_iter_inside_quote;

                //BUG? https://github.com/dotnet/coreclr/issues/22813
                //quote_mask = 60;
                //prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63); // right shift of a signed value expected to be well-defined and standard compliant as of C++20

                uint32_t cnt = (uint32_t)hamming(structurals);
                uint32_t next_base = @base + cnt;
                while (structurals != 0)
                {
                    base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals = structurals & (structurals - 1);
                    @base += 8;
                }
                @base = next_base;
                // How do we build up a user traversable data structure
                // first, do a 'shufti' to detect structural JSON characters
                // they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c
                // these go into the first 3 buckets of the comparison (1/2/4)

                // we are also interested in the four whitespace characters
                // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
                // these go into the next 2 buckets of the comparison (8/16)

                var v_lo = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_lo),
                    Avx2.Shuffle(high_nibble_mask,
                        Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(),
                            vec7f)));

                var v_hi = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_hi),
                    Avx2.Shuffle(high_nibble_mask,
                        Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(),
                            vec7f)));
                var tmp_lo = Avx2.CompareEqual(
                    Avx2.And(v_lo, structural_shufti_mask), zeroBVec);
                var tmp_hi = Avx2.CompareEqual(
                    Avx2.And(v_hi, structural_shufti_mask), zeroBVec);

                uint64_t structural_res_0 = (uint32_t)Avx2.MoveMask(tmp_lo);
                uint64_t structural_res_1 = (uint64_t)Avx2.MoveMask(tmp_hi);
                structurals = ~(structural_res_0 | (structural_res_1 << 32));

                // this additional mask and transfer is non-trivially expensive,
                // unfortunately
                var tmp_ws_lo = Avx2.CompareEqual(
                    Avx2.And(v_lo, whitespace_shufti_mask), zeroBVec);
                var tmp_ws_hi = Avx2.CompareEqual(
                    Avx2.And(v_hi, whitespace_shufti_mask), zeroBVec);

                uint64_t ws_res_0 = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
                uint64_t ws_res_1 = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
                uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));


                // mask off anything inside quotes
                structurals &= ~quote_mask;

                // add the real quote bits back into our bitmask as well, so we can
                // quickly traverse the strings we've spent all this trouble gathering
                structurals |= quote_bits;

                // Now, establish "pseudo-structural characters". These are non-whitespace
                // characters that are (a) outside quotes and (b) have a predecessor that's
                // either whitespace or a structural character. This means that subsequent
                // passes will get a chance to encounter the first character of every string
                // of non-whitespace and, if we're parsing an atom like true/false/null or a
                // number we can stop at the first whitespace or structural character
                // following it.

                // a qualified predecessor is something that can happen 1 position before a
                // pseudo-structural character
                uint64_t pseudo_pred = structurals | whitespace;
                uint64_t shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
                prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
                uint64_t pseudo_structurals =
                    shifted_pseudo_pred & (~whitespace) & (~quote_mask);
                structurals |= pseudo_structurals;

                // now, we've used our close quotes all we need to. So let's switch them off
                // they will be off in the quote mask and on in quote bits.
                structurals &= ~(quote_bits & ~quote_mask);
                //*(uint64_t *)(pj.structurals + idx / 8) = structurals;
                idx += 64;
            }
            uint32_t cnt2 = (uint32_t)hamming(structurals);
            uint32_t next_base2 = @base + cnt2;
            while (structurals != 0)
            {
                base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals = structurals & (structurals - 1);
                @base += 8;
            }
            @base = next_base2;

            pj.n_structural_indexes = @base;
            if (base_ptr[pj.n_structural_indexes - 1] > len)
            {
                throw new InvalidOperationException("Internal bug");
            }
            if (len != base_ptr[pj.n_structural_indexes - 1])
            {
                // the string might not be NULL terminated, but we add a virtual NULL ending character. 
                base_ptr[pj.n_structural_indexes++] = (uint32_t)len;
            }
            base_ptr[pj.n_structural_indexes] = 0; // make it safe to dereference one beyond this array

#if SIMDJSON_UTF8VALIDATE // NOT TESTED YET!
            return Avx.TestZ(has_error, has_error);
#else
            return true;
#endif
        }

Example #23

Show file

        /// <summary>
        /// Decodes the JSON string whose opening quote sits at <paramref name="offset"/> in
        /// <paramref name="buf"/> into the string buffer of <paramref name="pj"/>, resolving
        /// backslash escapes on the way.
        /// </summary>
        /// <remarks>
        /// A tape entry tagged with '"' is written first; its payload is the distance of
        /// pj.current_string_buf_loc from pj.string_buf. The record stored in the string buffer
        /// is a 4-byte length, followed by the string bytes, followed by a single 0 byte.
        /// NOTE(review): the scanned chunk is presumably copied from source to destination by
        /// find_bs_bits_and_quote_bits - confirm against that helper.
        /// </remarks>
        /// <param name="buf">Input bytes; buf[offset] is expected to be the opening quote.</param>
        /// <param name="len">Not read by this method.</param>
        /// <param name="pj">Receives the tape entry and the decoded string record.</param>
        /// <param name="depth">Not read by this method.</param>
        /// <param name="offset">Index of the opening quote inside <paramref name="buf"/>.</param>
        /// <returns>
        /// true once the closing quote has been reached; false when a \u escape is rejected by
        /// handle_unicode_codepoint or when escape_map has no entry for the escaped character.
        /// </returns>
        internal static bool parse_string(uint8_t *buf, size_t len, ParsedJson pj, uint32_t depth, uint32_t offset)
        {
            // Tape entry: where this string's record starts within the string buffer.
            pj.WriteTape((ulong)(pj.current_string_buf_loc - pj.string_buf), (char1)'"');

            // Skip the opening quote on the input side and the length prefix on the output side.
            uint8_t *input   = buf + (offset + 1);
            uint8_t *output  = pj.current_string_buf_loc + sizeof(uint32_t);
            uint8_t *payload = output;

            for (;;)
            {
                parse_string_helper scan = find_bs_bits_and_quote_bits(input, output);

                // A quote shows up before any backslash in this chunk: the string ends here.
                if (((scan.bs_bits - 1) & scan.quote_bits) != 0)
                {
                    uint32_t quoteAt = (uint32_t)trailingzeroes(scan.quote_bits);

                    // Terminate with a 0 byte; cheap, and handy for consumers that expect it.
                    output[quoteAt] = 0;

                    // Store the length prefix in front of the payload. No overflow check is
                    // needed as long as documents of 4GB or more are refused.
                    uint32_t byteCount = (uint32_t)((output - payload) + quoteAt);
                    memcpy(pj.current_string_buf_loc, &byteCount, sizeof(uint32_t));

                    // The next record starts right after the 0 terminator.
                    pj.current_string_buf_loc = output + quoteAt + 1;
                    return true;
                }

                // Neither a quote nor a backslash in this chunk (they cannot co-occur on the
                // same bit): step over the whole chunk.
                if (((scan.quote_bits - 1) & scan.bs_bits) == 0)
                {
                    int stride = Avx2.IsSupported ? 32 /* avx2 */ : 16 /* sse42 */;
                    input  += stride;
                    output += stride;
                    continue;
                }

                // A backslash comes first: locate it and look at the character it escapes.
                uint32_t slashAt = (uint32_t)trailingzeroes(scan.bs_bits);
                uint8_t  escaped = input[slashAt + 1];

                if (escaped == 'u')
                {
                    // Park both cursors on the backslash; the codepoint handler advances them
                    // further itself.
                    input  += slashAt;
                    output += slashAt;
                    if (!handle_unicode_codepoint(&input, &output))
                    {
                        return false;
                    }

                    continue;
                }

                // Simple 1:1 escape: consumes slashAt + 2 input bytes and produces slashAt + 1
                // output bytes. This may reach past the part of the buffer already inspected.
                uint8_t decoded = escape_map[escaped]; // TODO: https://github.com/dotnet/coreclr/issues/25894
                if (decoded == 0u)
                {
                    return false; // unknown escape character
                }

                output[slashAt] = decoded;
                input  += slashAt + 2;
                output += slashAt + 1;
            }
        }