Example #1
0
        // Multiplies two unsigned 64-bit values and yields the full 128-bit product:
        // the low 64 bits are returned and the high 64 bits are stored in *productHi.
        static uint64_t umul128(uint64_t a, uint64_t b, uint64_t *productHi)
        {
            // Work on 32-bit halves; the narrowing casts help MSVC to avoid calls
            // to the __allmul library function.
            uint32_t a0 = (uint32_t)a;
            uint32_t a1 = (uint32_t)(a >> 32);
            uint32_t b0 = (uint32_t)b;
            uint32_t b1 = (uint32_t)(b >> 32);

            // The four 32x32 -> 64 partial products.
            uint64_t p00 = (uint64_t)a0 * b0;
            uint64_t p01 = (uint64_t)a0 * b1;
            uint64_t p10 = (uint64_t)a1 * b0;
            uint64_t p11 = (uint64_t)a1 * b1;

            // Fold the cross terms in one at a time: each sum adds a value below
            // 2^32 to a 32x32 product, so neither 64-bit addition can wrap.
            uint64_t crossA = p10 + (p00 >> 32);
            uint64_t crossB = p01 + (uint32_t)crossA;

            // The upper halves of both cross sums carry into the high word.
            *productHi = p11 + (crossA >> 32) + (crossB >> 32);

            // Low word: low half of the second cross sum above the low half of p00.
            return(((uint64_t)(uint32_t)crossB << 32) | (uint32_t)p00);
        }
Example #2
0
        // Fills the caller-supplied buffer through gen_rand_array, treating it as
        // size / 2 elements of type w128_t.
        // Asserted preconditions: idx equals SFMT_N32, size is even, and size is
        // at least SFMT_N64.
        public void sfmt_fill_array64(uint64_t *array, int size)
        {
            assert(idx == SFMT_N32);
            assert(size % 2 == 0);
            assert(size >= SFMT_N64);

            // Two 64-bit slots make up one 128-bit block.
            w128_t *blocks     = (w128_t *)array;
            int     blockCount = size / 2;

            gen_rand_array(blocks, blockCount);

            // Leave the index at SFMT_N32, the same value that was asserted on entry.
            idx = SFMT_N32;
        }
Example #3
0
        // If needed, allocate memory so that the object is able to process JSON
        // documents having up to len bytes and a nesting depth of up to maxdepth.
        //
        // Returns true when the existing buffers already cover the request or when
        // all buffers were (re)allocated successfully. Returns false when len or
        // maxdepth is zero, when len exceeds SIMDJSON_MAXSIZE_BYTES, or when any
        // allocation fails.
        public bool AllocateCapacity(size_t len, size_t maxdepth = DEFAULTMAXDEPTH)
        {
            if ((maxdepth == 0) || (len == 0))
            {
                return(false);
            }
            if (len > SIMDJSON_MAXSIZE_BYTES)
            {
                return(false);
            }
            // The current buffers can be reused only when BOTH the byte capacity and
            // the depth capacity already cover the request. (The depth comparison
            // used to be inverted, which reported success exactly when the depth
            // buffers were too small.)
            if ((len <= bytecapacity) && (maxdepth <= depthcapacity))
            {
                return(true);
            }
            Deallocate();
            isvalid              = false;
            bytecapacity         = 0; // will only set it to len after allocations are a success
            n_structural_indexes = 0;
            uint32_t max_structures = (uint32_t)(ROUNDUP_N(len, 64) + 2 + 7);

            structural_indexes = allocate <uint32_t>(max_structures);
            // a pathological input like "[[[[..." would generate len tape elements, so need a capacity of len + 1
            size_t localtapecapacity = ROUNDUP_N(len + 1, 64);
            // a document with only zero-length strings... could have len/3 string
            // and we would need len/3 * 5 bytes on the string buffer
            size_t localstringcapacity = ROUNDUP_N(5 * len / 3 + 32, 64);

            string_buf = allocate <uint8_t>(localstringcapacity);
            tape       = allocate <uint64_t>(localtapecapacity);
            containing_scope_offset = allocate <uint32_t>(maxdepth);
            ret_address             = allocate <char1>(maxdepth);
            if ((string_buf == null) || (tape == null) ||
                (containing_scope_offset == null) || (ret_address == null) || (structural_indexes == null))
            {
                // At least one allocation failed: release whatever was obtained ...
                delete(ret_address);
                delete(containing_scope_offset);
                delete(tape);
                delete(string_buf);
                delete(structural_indexes);
                // ... and clear the fields so that no freed pointer stays reachable
                // (a later cleanup would otherwise release the same memory again).
                ret_address             = null;
                containing_scope_offset = null;
                tape               = null;
                string_buf         = null;
                structural_indexes = null;
                return(false);
            }

            /*
             * // We do not need to initialize this content for parsing, though we could
             * // need to initialize it for safety.
             * memset(string_buf, 0 , localstringcapacity);
             * memset(structural_indexes, 0, max_structures * sizeof(uint32_t));
             * memset(tape, 0, localtapecapacity * sizeof(uint64_t));
             */
            // Publish the new capacities only now that every allocation succeeded.
            bytecapacity   = len;
            depthcapacity  = maxdepth;
            tapecapacity   = localtapecapacity;
            stringcapacity = localstringcapacity;
            return(true);
        }
        // Returns the next output as a 32-bit value: the 64-bit word at position
        // idx of the status array, narrowed to uint. When idx has reached
        // DSFMT_N64 the state is regenerated first and idx restarts at 0.
        public uint32_t dsfmt_genrand_uint32()
        {
            bool exhausted = idx >= DSFMT_N64;

            if (exhausted)
            {
                dsfmt_gen_rand_all();
                idx = 0;
            }

            fixed(w128_t *state = this.status)
            {
                // View the status array as a flat run of 64-bit words.
                uint64_t *words = &state[0].u64_0;
                uint64_t  raw   = words[idx];

                idx++;
                return((uint)raw);
            }
        }
        /**
         * This function certifies the period of the generator (the original note
         * gives it as 2^{SFMT_MEXP}-1) and patches the state when the check fails.
         *
         * The status element at index DSFMT_N is XORed with DSFMT_FIX1/DSFMT_FIX2,
         * masked with pcv and reduced to a single parity bit. If that bit is 1 the
         * state is left untouched; otherwise exactly one bit of that element is
         * flipped.
         */
        void period_certification()
        {
            fixed(w128_t *state = this.status)
            {
                uint64_t fixedLo = state[DSFMT_N].u64_0 ^ DSFMT_FIX1;
                uint64_t fixedHi = state[DSFMT_N].u64_1 ^ DSFMT_FIX2;

                // Fold all masked bits down into bit 0 (XOR parity).
                uint64_t parity = (fixedLo & pcv[0]) ^ (fixedHi & pcv[1]);

                for (int shift = 32; shift > 0; shift >>= 1)
                {
                    parity ^= parity >> shift;
                }

                /* check OK */
                if ((parity & 1) == 1)
                {
                    return;
                }

                /* check NG, and modification */
                if ((DSFMT_PCV2 & 1) == 1)
                {
                    // Bit 0 belongs to the mask, so flipping it changes the parity.
                    state[DSFMT_N].u64_1 ^= 1;
                }
                else
                {
#pragma warning disable 162
                    // Scan pcv[1] and then pcv[0], lowest bit first, and flip the
                    // first state bit that the mask selects.
                    for (int word = 1; word >= 0; word--)
                    {
                        uint64_t bit = 1;

                        for (int pos = 0; pos < 64; pos++, bit <<= 1)
                        {
                            if ((bit & pcv[word]) == 0)
                            {
                                continue;
                            }
                            (&state[DSFMT_N].u64_0)[word] ^= bit;
                            return;
                        }
                    }
#pragma warning restore
                }
            }
        }
Example #6
0
        // We need a 64x128-bit multiplication and a subsequent 128-bit shift.
        // Multiplication:
        //   The 64-bit factor is variable and passed in, the 128-bit factor comes
        //   from a lookup table. We know that the 64-bit factor only has 55
        //   significant bits (i.e., the 9 topmost bits are zeros). The 128-bit
        //   factor only has 124 significant bits (i.e., the 4 topmost bits are
        //   zeros).
        // Shift:
        //   In principle, the multiplication result requires 55 + 124 = 179 bits to
        //   represent. However, we then shift this value to the right by j, which is
        //   at least j >= 115, so the result is guaranteed to fit into 179 - 115 = 64
        //   bits. This means that we only need the topmost 64 significant bits of
        //   the 64x128-bit multiplication.
        //
        // There are several ways to do this:
        // 1. Best case: the compiler exposes a 128-bit type.
        //    We perform two 64x64-bit multiplications, add the higher 64 bits of the
        //    lower result to the higher result, and shift by j - 64 bits.
        //
        //    We explicitly cast from 64-bit to 128-bit, so the compiler can tell
        //    that these are only 64-bit inputs, and can map these to the best
        //    possible sequence of assembly instructions.
        //    x64 machines happen to have matching assembly instructions for
        //    64x64-bit multiplications and 128-bit shifts.
        //
        // 2. Second best case: the compiler exposes intrinsics for the x64 assembly
        //    instructions mentioned in 1.
        //
        // 3. We only have 64x64 bit instructions that return the lower 64 bits of
        //    the result, i.e., we have to use plain C.
        //    Our inputs are less than the full width, so we have three options:
        //    a. Ignore this fact and just implement the intrinsics manually.
        //    b. Split both into 31-bit pieces, which guarantees no internal overflow,
        //       but requires extra work upfront (unless we change the lookup table).
        //    c. Split only the first factor into 31-bit pieces, which also guarantees
        //       no internal overflow, but requires extra work since the intermediate
        //       results are not perfectly aligned.
        static uint64_t mulShift64(uint64_t m, uint64_t *mul, int32_t j)
        {
            // m is maximum 55 bits
            uint64_t upperHi;   // bits 128.. of the product
            uint64_t upperLo;   // bits 64..127 contributed by m * mul[1]
            uint64_t lowerHi;   // bits 64..127 contributed by m * mul[0]

            upperLo = umul128(m, mul[1], &upperHi);
            // Only the high word of m * mul[0] is needed; its low word is discarded.
            umul128(m, mul[0], &lowerHi);

            // Combine both contributions to bits 64..127; a wrapped sum means a
            // carry into the top word.
            uint64_t middle = lowerHi + upperLo;
            uint64_t top    = (middle < lowerHi) ? upperHi + 1 : upperHi;

            return(shiftright128(middle, top, (uint)j - 64));
        }
Example #7
0
        // If needed, allocate memory so that the object is able to process JSON
        // documents having up to len bytes and a nesting depth of up to maxdepth.
        //
        // Returns true when the existing buffers already cover the request or when
        // all buffers were (re)allocated successfully. Returns false when len or
        // maxdepth is zero (a debug message is written) or when any allocation fails.
        public bool AllocateCapacity(size_t len, size_t maxdepth = DEFAULTMAXDEPTH)
        {
            if ((maxdepth == 0) || (len == 0))
            {
                Debug.WriteLine("capacities must be non-zero ");
                return(false);
            }

            // len is non-zero from here on, so no further "len > 0" guard is needed.
            // The current buffers can be reused only when BOTH the byte capacity and
            // the depth capacity already cover the request. (The depth comparison
            // used to be inverted, which reported success exactly when the depth
            // buffers were too small.)
            if ((len <= bytecapacity) && (maxdepth <= depthcapacity))
            {
                return(true);
            }
            Deallocate();

            isvalid              = false;
            bytecapacity         = 0; // will only set it to len after allocations are a success
            n_structural_indexes = 0;
            uint32_t max_structures = (uint32_t)ROUNDUP_N(len, 64) + 2 + 7;

            structural_indexes = Utils.allocate <uint32_t>(max_structures);
            // NOTE(review): tape and string buffers are both sized to ROUNDUP_N(len, 64);
            // confirm this covers the worst-case output of the parser stages.
            size_t localtapecapacity   = ROUNDUP_N(len, 64);
            size_t localstringcapacity = ROUNDUP_N(len, 64);

            string_buf = Utils.allocate <uint8_t>(localstringcapacity);
            tape       = Utils.allocate <uint64_t>(localtapecapacity);
            containing_scope_offset = Utils.allocate <uint32_t>(maxdepth);
            ret_address             = Utils.allocate <bytechar>(maxdepth);
            if ((string_buf == null) || (tape == null) ||
                (containing_scope_offset == null) || (ret_address == null) || (structural_indexes == null))
            {
                // At least one allocation failed: release whatever was obtained.
                Deallocate();
                return(false);
            }

            // Publish the new capacities only now that every allocation succeeded.
            bytecapacity   = len;
            depthcapacity  = maxdepth;
            tapecapacity   = localtapecapacity;
            stringcapacity = localstringcapacity;
            return(true);
        }
Example #8
0
        // Resets the bookkeeping (validity flag and all capacities) and releases
        // every buffer that is currently held, clearing each pointer after it has
        // been freed.
        private void Deallocate()
        {
            // Bookkeeping: nothing is valid and no capacity is available any more.
            stringcapacity = 0;
            tapecapacity   = 0;
            depthcapacity  = 0;
            bytecapacity   = 0;
            isvalid        = false;

            // Buffers: free only what is held, then null the field.
            if (ret_address != null) { delete(ret_address); ret_address = null; }

            if (containing_scope_offset != null) { delete(containing_scope_offset); containing_scope_offset = null; }

            if (tape != null) { delete(tape); tape = null; }

            if (string_buf != null) { delete(string_buf); string_buf = null; }

            if (structural_indexes != null) { delete(structural_indexes); structural_indexes = null; }
        }
Example #9
0
        // Multiplies m by the three-word factor mul[0..2] (mul[0] is the least
        // significant word), shifts the product right by j bits and passes the
        // result to uint128_mod1e9 (presumably a reduction modulo 10^9 - confirm
        // against that helper).
        //
        // j is asserted to lie in [128, 180]. The trailing "// N" comments give the
        // bit offset of each partial value within the full product.
        static uint32_t mulShift_mod1e9(uint64_t m, uint64_t *mul, int32_t j)
        {
            uint64_t high0;                               // 64
            // The low word of m * mul[0] (offset 0) never reaches the result, so
            // only the high word is kept.
            umul128(m, mul[0], &high0);
            uint64_t high1;                               // 128
            uint64_t low1 = umul128(m, mul[1], &high1);   // 64
            uint64_t high2;                               // 192
            uint64_t low2   = umul128(m, mul[2], &high2); // 128
            uint64_t s0high = low1 + high0;               // 64
            uint32_t c1     = s0high < low1 ? 1U : 0;     // carry out of the offset-64 word
            uint64_t s1low  = low2 + high1 + c1;          // 128
            uint32_t c2     = s1low < low2 ? 1U : 0;      // high1 + c1 can't overflow, so compare against low2
            uint64_t s1high = high2 + c2;                 // 192

            assert(j >= 128);
            assert(j <= 180);
            uint32_t dist        = (uint32_t)(j - 128); // dist: [0, 52]
            uint64_t shiftedhigh = s1high >> (int)dist;
            uint64_t shiftedlow  = shiftright128(s1low, s1high, dist);

            return(uint128_mod1e9(shiftedhigh, shiftedlow));
        }
Example #10
0
        // This is faster if we don't have a 64x64->128-bit multiplication.
        //
        // Doubles m, multiplies it by the two-word factor mul (mul[0] is the low
        // word, mul[1] the high word) and derives three shifted values from the
        // resulting three-word product P = lo | mid << 64 | hi << 128:
        //   return value : upper words of P,        shifted by j - 64 - 1
        //   *vp          : upper words of P + mul,  shifted by j - 64 - 1
        //   *vm          : mmShift == 1: upper words of P - mul,     shifted by j - 64 - 1
        //                  otherwise   : upper words of 2 * P - mul, shifted by j - 64
        // Only the (mid, hi) words are passed to shiftright128; the lowest word is
        // used solely to detect carries and borrows.
        // NOTE(review): shiftright128 is defined elsewhere; presumably it shifts the
        // 128-bit value (hi:lo) right and returns the low 64 bits - confirm.
        static uint64_t mulShiftAll64(uint64_t m, uint64_t *mul, int32_t j,
                                      uint64_t *vp, uint64_t *vm, uint32_t mmShift)
        {
            m <<= 1;
            // m is maximum 55 bits
            uint64_t tmp;
            uint64_t lo = umul128(m, mul[0], &tmp);
            uint64_t hi;
            uint64_t mid = tmp + umul128(m, mul[1], &hi);

            if (mid < tmp)
            {
                ++hi;           // overflow into hi
            }
            // (lo2, mid2, hi2) = P + mul, with carries detected by unsigned wrap-around.
            uint64_t lo2  = lo + mul[0];
            uint64_t mid2 = mid + mul[1];

            if (lo2 < lo)
            {
                ++mid2;
            }
            uint64_t hi2 = hi;

            if (mid2 < mid)
            {
                ++hi2;
            }
            *vp = shiftright128(mid2, hi2, (uint32_t)(j - 64 - 1));

            if (mmShift == 1)
            {
                // (lo3, mid3, hi3) = P - mul, with borrows detected by unsigned wrap-around.
                uint64_t lo3  = lo - mul[0];
                uint64_t mid3 = mid - mul[1];
                if (lo3 > lo)
                {
                    --mid3;
                }
                uint64_t hi3 = hi;
                if (mid3 > mid)
                {
                    --hi3;
                }
                *vm = shiftright128(mid3, hi3, (uint32_t)(j - 64 - 1));
            }
            else
            {
                // (lo3, mid3, hi3) = 2 * P, carrying between words.
                uint64_t lo3  = lo + lo;
                uint64_t mid3 = mid + mid;
                if (lo3 < lo)
                {
                    ++mid3;
                }
                uint64_t hi3 = hi + hi;
                if (mid3 < mid)
                {
                    ++hi3;
                }
                // (lo4, mid4, hi4) = 2 * P - mul, borrowing between words.
                uint64_t lo4  = lo3 - mul[0];
                uint64_t mid4 = mid3 - mul[1];
                if (lo4 > lo3)
                {
                    --mid4;
                }
                uint64_t hi4 = hi3;
                if (mid4 > mid3)
                {
                    --hi4;
                }
                // The value was doubled above, so the shift distance is one bit
                // larger than in the other branch (j - 64 instead of j - 64 - 1).
                *vm = shiftright128(mid4, hi4, (uint32_t)(j - 64));
            }

            return(shiftright128(mid, hi, (uint32_t)(j - 64 - 1)));
        }