// Full 64x64 -> 128-bit unsigned multiplication built from four 32x32 -> 64-bit
// partial products.
//
// a, b       : the two 64-bit factors.
// productHi  : receives the upper 64 bits of the 128-bit product.
// returns    : the lower 64 bits of the 128-bit product.
static uint64_t umul128(uint64_t a, uint64_t b, uint64_t *productHi) {
    // Keeping the halves as 32-bit values and widening only at the multiply
    // helps MSVC to avoid calls to the __allmul library function.
    uint32_t a0 = (uint32_t)a;
    uint32_t a1 = (uint32_t)(a >> 32);
    uint32_t b0 = (uint32_t)b;
    uint32_t b1 = (uint32_t)(b >> 32);

    // Partial products: pXY = (half X of a) * (half Y of b).
    uint64_t p00 = (uint64_t)a0 * b0;
    uint64_t p01 = (uint64_t)a0 * b1;
    uint64_t p10 = (uint64_t)a1 * b0;
    uint64_t p11 = (uint64_t)a1 * b1;

    // Fold the two middle products in one at a time, carrying only 32 bits
    // forward on each step so that neither 64-bit sum can wrap.
    uint64_t crossA = p10 + (uint32_t)(p00 >> 32);
    uint64_t crossB = p01 + (uint32_t)crossA;

    *productHi = p11 + (uint32_t)(crossA >> 32) + (uint32_t)(crossB >> 32);
    return ((uint64_t)(uint32_t)crossB << 32) | (uint32_t)p00;
}
// Bulk entry point: passes `array` to gen_rand_array, reinterpreted as
// size / 2 blocks of type w128_t.
//
// Preconditions (checked with assert):
//   - idx == SFMT_N32 on entry,
//   - size is even,
//   - size >= SFMT_N64.
// idx is set to SFMT_N32 again before returning.
public void sfmt_fill_array64(uint64_t *array, int size)
{
    assert(idx == SFMT_N32);
    assert(size % 2 == 0);
    assert(size >= SFMT_N64);

    // Two 64-bit slots of the caller's buffer make up one w128_t block.
    w128_t *blocks = (w128_t *)array;
    int blockCount = size / 2;
    gen_rand_array(blocks, blockCount);

    idx = SFMT_N32;
}
// If needed, allocate memory so that the object is able to process JSON
// documents having up to len bytes and maxdepth "depth".
//
// len      : maximum document size in bytes; must be non-zero and at most
//            SIMDJSON_MAXSIZE_BYTES.
// maxdepth : maximum nesting depth; must be non-zero.
// returns  : true if the buffers held on return cover (len, maxdepth), false
//            on invalid arguments or allocation failure. After an allocation
//            failure all buffers are released and all capacities are zero.
public bool AllocateCapacity(size_t len, size_t maxdepth = DEFAULTMAXDEPTH)
{
    if ((maxdepth == 0) || (len == 0))
    {
        return false;
    }
    if (len > SIMDJSON_MAXSIZE_BYTES)
    {
        return false;
    }

    // The current buffers can be reused only when they are large enough in
    // BOTH dimensions. (The previous test, depthcapacity < maxdepth, reported
    // success exactly when the depth buffers were too small.)
    if ((len <= bytecapacity) && (maxdepth <= depthcapacity))
    {
        return true;
    }

    Deallocate();
    isvalid = false;
    bytecapacity = 0; // will only set it to len after allocations are a success
    n_structural_indexes = 0;

    uint32_t max_structures = (uint32_t)(ROUNDUP_N(len, 64) + 2 + 7);
    structural_indexes = allocate<uint32_t>(max_structures);

    // a pathological input like "[[[[..." would generate len tape elements,
    // so need a capacity of len + 1
    size_t localtapecapacity = ROUNDUP_N(len + 1, 64);

    // a document with only zero-length strings... could have len/3 string
    // and we would need len/3 * 5 bytes on the string buffer
    size_t localstringcapacity = ROUNDUP_N(5 * len / 3 + 32, 64);

    string_buf = allocate<uint8_t>(localstringcapacity);
    tape = allocate<uint64_t>(localtapecapacity);
    containing_scope_offset = allocate<uint32_t>(maxdepth);
    ret_address = allocate<char1>(maxdepth);

    if ((string_buf == null) || (tape == null) || (containing_scope_offset == null) ||
        (ret_address == null) || (structural_indexes == null))
    {
        // Release whatever was obtained AND clear the pointers. Deleting the
        // buffers while leaving the pointers set would make the next
        // Deallocate() free them a second time.
        Deallocate();
        return false;
    }

    // We do not need to initialize the buffer contents for parsing, though we
    // could need to initialize them for safety; they are deliberately left
    // uninitialized here.

    bytecapacity = len;
    depthcapacity = maxdepth;
    tapecapacity = localtapecapacity;
    stringcapacity = localstringcapacity;
    return true;
}
// Returns the next 64-bit word of the status array, truncated to 32 bits by
// the (uint) cast. When idx has reached DSFMT_N64 the state is first
// regenerated with dsfmt_gen_rand_all() and idx restarts at 0.
public uint32_t dsfmt_genrand_uint32()
{
    bool exhausted = idx >= DSFMT_N64;
    if (exhausted)
    {
        dsfmt_gen_rand_all();
        idx = 0;
    }

    fixed (w128_t *pinned = this.status)
    {
        // View the status blocks as a flat run of 64-bit words, starting at
        // the first field of the first block.
        uint64_t *words = &pinned[0].u64_0;
        uint64_t word = words[idx];
        idx++;
        return (uint)word;
    }
}
/**
 * Period certification of the generator state.
 *
 * Takes the two 64-bit words of status[DSFMT_N], XORs them with DSFMT_FIX1 /
 * DSFMT_FIX2, masks them with pcv[0] / pcv[1] and computes the parity of the
 * result. If the parity is 1 the state is left untouched. Otherwise exactly
 * one bit of status[DSFMT_N] is flipped: bit 0 of the second word when
 * DSFMT_PCV2 is odd, else the lowest set bit of pcv[1] (or, failing that, of
 * pcv[0]) in the corresponding word.
 */
void period_certification()
{
    fixed (w128_t *status = this.status)
    {
        w128_t *last = &status[DSFMT_N];

        uint64_t *tmp = stackalloc uint64_t[2];
        tmp[0] = last->u64_0 ^ DSFMT_FIX1;
        tmp[1] = last->u64_1 ^ DSFMT_FIX2;

        // Parity of the masked words: fold 64 bits down to bit 0.
        uint64_t inner = (tmp[0] & pcv[0]) ^ (tmp[1] & pcv[1]);
        int shift = 32;
        while (shift > 0)
        {
            inner ^= inner >> shift;
            shift >>= 1;
        }

        /* check OK */
        if ((inner & 1) == 1)
        {
            return;
        }

        /* check NG, and modification */
        if ((DSFMT_PCV2 & 1) == 1)
        {
            last->u64_1 ^= 1;
        }
        else
        {
#pragma warning disable 162
            // Scan pcv[1] first, then pcv[0], from the least significant bit up,
            // and flip the first bit position that is set in the mask.
            for (int word = 1; word >= 0; word--)
            {
                uint64_t work = 1;
                for (int bit = 0; bit < 64; bit++)
                {
                    if ((work & pcv[word]) != 0)
                    {
                        (&last->u64_0)[word] ^= work;
                        return;
                    }
                    work <<= 1;
                }
            }
#pragma warning restore
        }
    }
}
// We need a 64x128-bit multiplication and a subsequent 128-bit shift.
// Multiplication:
//   The 64-bit factor is variable and passed in, the 128-bit factor comes
//   from a lookup table. We know that the 64-bit factor only has 55
//   significant bits (i.e., the 9 topmost bits are zeros). The 128-bit
//   factor only has 124 significant bits (i.e., the 4 topmost bits are
//   zeros).
// Shift:
//   In principle, the multiplication result requires 55 + 124 = 179 bits to
//   represent. However, we then shift this value to the right by j, where
//   j >= 115, so the result is guaranteed to fit into 179 - 115 = 64
//   bits. This means that we only need the topmost 64 significant bits of
//   the 64x128-bit multiplication.
//
// There are several ways to do this:
// 1. Best case: the compiler exposes a 128-bit type.
//    We perform two 64x64-bit multiplications, add the higher 64 bits of the
//    lower result to the higher result, and shift by j - 64 bits.
//
//    We explicitly cast from 64-bit to 128-bit, so the compiler can tell
//    that these are only 64-bit inputs, and can map these to the best
//    possible sequence of assembly instructions.
//    x64 machines happen to have matching assembly instructions for
//    64x64-bit multiplications and 128-bit shifts.
//
// 2. Second best case: the compiler exposes intrinsics for the x64 assembly
//    instructions mentioned in 1.
//
// 3. We only have 64x64-bit instructions that return the lower 64 bits of
//    the result, i.e., we have to use plain C.
//    Our inputs are less than the full width, so we have three options:
//    a. Ignore this fact and just implement the intrinsics manually.
//    b. Split both into 31-bit pieces, which guarantees no internal overflow,
//       but requires extra work upfront (unless we change the lookup table).
//    c. Split only the first factor into 31-bit pieces, which also guarantees
//       no internal overflow, but requires extra work since the intermediate
//       results are not perfectly aligned.
// Multiplies m by the 128-bit value stored in mul[0] (low word) and mul[1]
// (high word), drops the lowest 64 bits of the product, and returns the
// remaining upper two words shifted right by j - 64 via shiftright128.
//
// m   : 64-bit factor.
// mul : pointer to the two 64-bit words of the 128-bit factor.
// j   : total right shift applied to the product.
static uint64_t mulShift64(uint64_t m, uint64_t *mul, int32_t j) {
    // m is maximum 55 bits
    uint64_t upperHi;                                   // bit offset 128
    uint64_t upperLo = umul128(m, mul[1], &upperHi);    // bit offset 64
    uint64_t lowerHi;                                   // bit offset 64
    umul128(m, mul[0], &lowerHi);                       // bit offset 0, not needed

    // Add the two words that share bit offset 64 and carry into the top word.
    uint64_t middle = lowerHi + upperLo;
    upperHi += (middle < lowerHi) ? 1u : 0u;

    return shiftright128(middle, upperHi, (uint32_t)j - 64);
}
// If needed, allocate memory so that the object is able to process JSON
// documents having up to len bytes and maxdepth "depth".
//
// len      : maximum document size in bytes; must be non-zero.
// maxdepth : maximum nesting depth; must be non-zero.
// returns  : true if the buffers held on return cover (len, maxdepth), false
//            on invalid arguments or allocation failure (in which case
//            Deallocate() has released everything).
public bool AllocateCapacity(size_t len, size_t maxdepth = DEFAULTMAXDEPTH)
{
    if ((maxdepth == 0) || (len == 0))
    {
        Debug.WriteLine("capacities must be non-zero ");
        return false;
    }

    // The current buffers can be reused only when they are large enough in
    // BOTH dimensions. (The previous test, depthcapacity < maxdepth, reported
    // success exactly when the depth buffers were too small.)
    if ((len <= bytecapacity) && (maxdepth <= depthcapacity))
    {
        return true;
    }

    Deallocate();
    isvalid = false;
    bytecapacity = 0; // will only set it to len after allocations are a success
    n_structural_indexes = 0;

    uint32_t max_structures = (uint32_t)ROUNDUP_N(len, 64) + 2 + 7;
    structural_indexes = Utils.allocate<uint32_t>(max_structures);

    // An input such as "[[[[..." can produce one tape entry per input byte,
    // so the tape needs room for len + 1 entries; rounding len alone leaves
    // it one short whenever len is already a multiple of 64.
    size_t localtapecapacity = ROUNDUP_N(len + 1, 64);

    // NOTE(review): string_buf is sized to the rounded input length only. If
    // the string writer stores per-string headers/terminators or copies in
    // wide chunks this may be too small - confirm against the writer.
    size_t localstringcapacity = ROUNDUP_N(len, 64);

    string_buf = Utils.allocate<uint8_t>(localstringcapacity);
    tape = Utils.allocate<uint64_t>(localtapecapacity);
    containing_scope_offset = Utils.allocate<uint32_t>(maxdepth);
    ret_address = Utils.allocate<bytechar>(maxdepth);

    if ((string_buf == null) || (tape == null) || (containing_scope_offset == null) ||
        (ret_address == null) || (structural_indexes == null))
    {
        Deallocate();
        return false;
    }

    bytecapacity = len;
    depthcapacity = maxdepth;
    tapecapacity = localtapecapacity;
    stringcapacity = localstringcapacity;
    return true;
}
// Marks the object invalid, zeroes every recorded capacity and releases each
// buffer that is currently held. Every pointer is reset to null after its
// buffer is deleted, so a buffer that is already null is skipped.
private void Deallocate()
{
    isvalid = false;

    // Forget all recorded sizes.
    stringcapacity = 0;
    tapecapacity = 0;
    depthcapacity = 0;
    bytecapacity = 0;

    // Per-depth scratch buffers.
    if (null != ret_address)
    {
        delete(ret_address);
        ret_address = null;
    }
    if (null != containing_scope_offset)
    {
        delete(containing_scope_offset);
        containing_scope_offset = null;
    }

    // Output buffers.
    if (null != tape)
    {
        delete(tape);
        tape = null;
    }
    if (null != string_buf)
    {
        delete(string_buf);
        string_buf = null;
    }

    // Structural index buffer.
    if (null != structural_indexes)
    {
        delete(structural_indexes);
        structural_indexes = null;
    }
}
// Multiplies m by the 192-bit value stored in mul[0..2] (least significant
// word first), shifts the product right by j bits and passes the resulting
// two words (high, low) to uint128_mod1e9, whose result is returned.
//
// m   : 64-bit factor.
// mul : pointer to the three 64-bit words of the 192-bit factor.
// j   : right shift; asserted to lie in [128, 180].
static uint32_t mulShift_mod1e9(uint64_t m, uint64_t *mul, int32_t j) {
    uint64_t high0;                                 // bit offset 64
    umul128(m, mul[0], &high0);                     // bit offset 0, not needed
    uint64_t high1;                                 // bit offset 128
    uint64_t low1 = umul128(m, mul[1], &high1);     // bit offset 64
    uint64_t high2;                                 // bit offset 192
    uint64_t low2 = umul128(m, mul[2], &high2);     // bit offset 128

    uint64_t s0high = low1 + high0;                 // bit offset 64
    uint32_t c1 = s0high < low1 ? 1U : 0;
    uint64_t s1low = low2 + high1 + c1;             // bit offset 128
    // high1 + c1 can't overflow, so compare against low2
    uint32_t c2 = s1low < low2 ? 1U : 0;
    uint64_t s1high = high2 + c2;                   // bit offset 192

    assert(j >= 128);
    assert(j <= 180);
    uint32_t dist = (uint32_t)(j - 128);            // dist: [0, 52]

    uint64_t shiftedhigh = s1high >> (int)dist;
    // dist may be 0. shiftright128 is defined elsewhere; a portable
    // implementation computes hi << (64 - dist), and a shift by the full
    // width is undefined in C and a no-op shift in C#. A zero shift of the
    // (s1high:s1low) pair is simply s1low, so handle that case here.
    uint64_t shiftedlow = (dist == 0) ? s1low : shiftright128(s1low, s1high, dist);
    return uint128_mod1e9(shiftedhigh, shiftedlow);
}
// This is faster if we don't have a 64x64->128-bit multiplication.
//
// Forms the 192-bit product P = (m << 1) * (mul[1]:mul[0]) once, then derives
// three results from its upper two words using shiftright128:
//   return : P            shifted by j - 64 - 1
//   *vp    : P + mul      shifted by j - 64 - 1
//   *vm    : P - mul      shifted by j - 64 - 1   when mmShift == 1
//            2 * P - mul  shifted by j - 64       otherwise
static uint64_t mulShiftAll64(uint64_t m, uint64_t *mul, int32_t j, uint64_t *vp, uint64_t *vm, uint32_t mmShift) {
    uint64_t doubled = m << 1;
    // m is maximum 55 bits
    uint64_t carryWord;
    uint64_t lo = umul128(doubled, mul[0], &carryWord);
    uint64_t hi;
    uint64_t mid = carryWord + umul128(doubled, mul[1], &hi);
    hi += (mid < carryWord) ? 1u : 0u;      // overflow into hi

    uint32_t dist = (uint32_t)(j - 64 - 1);

    // P + mul, with carries rippling lo -> mid -> hi.
    uint64_t plusLo = lo + mul[0];
    uint64_t plusMid = mid + mul[1] + ((plusLo < lo) ? 1u : 0u);
    uint64_t plusHi = hi + ((plusMid < mid) ? 1u : 0u);
    *vp = shiftright128(plusMid, plusHi, dist);

    if (mmShift == 1) {
        // P - mul, with borrows rippling lo -> mid -> hi.
        uint64_t minusLo = lo - mul[0];
        uint64_t minusMid = mid - mul[1] - ((minusLo > lo) ? 1u : 0u);
        uint64_t minusHi = hi - ((minusMid > mid) ? 1u : 0u);
        *vm = shiftright128(minusMid, minusHi, dist);
    } else {
        // 2 * P ...
        uint64_t dblLo = lo + lo;
        uint64_t dblMid = mid + mid + ((dblLo < lo) ? 1u : 0u);
        uint64_t dblHi = hi + hi + ((dblMid < mid) ? 1u : 0u);
        // ... minus mul.
        uint64_t minusLo = dblLo - mul[0];
        uint64_t minusMid = dblMid - mul[1] - ((minusLo > dblLo) ? 1u : 0u);
        uint64_t minusHi = dblHi - ((minusMid > dblMid) ? 1u : 0u);
        *vm = shiftright128(minusMid, minusHi, (uint32_t)(j - 64));
    }

    return shiftright128(mid, hi, dist);
}