/// <summary>
/// Adds the contents of <paramref name="v"/> into <paramref name="s"/> in place, starting at an
/// offset of <paramref name="shift"/> 64-bit lanes into <paramref name="s"/>.
/// </summary>
/// <param name="s">Accumulator array; its elements are overwritten with the lane-wise sums.</param>
/// <param name="v">
/// Addend array. Each element is reinterpreted as four 64-bit lanes (<c>AsUInt64</c>) before use.
/// An empty array leaves <paramref name="s"/> untouched.
/// </param>
/// <param name="shift">
/// Offset into <paramref name="s"/>, split into whole vectors (<c>shift / 4</c>) and a lane
/// remainder (<c>shift % 4</c>).
/// </param>
/// <exception cref="OverflowException">
/// Raised by the checked conversion when the lane remainder is negative.
/// </exception>
/// <exception cref="ArgumentException">The lane remainder is non-zero but not 1, 2 or 3.</exception>
/// <remarks>
/// Bounds are only verified in DEBUG builds; in other builds the caller must guarantee that
/// <paramref name="s"/> is long enough, because the writes go through raw pointers.
/// </remarks>
private static unsafe void Add(Vector256<UInt64>[] s, Vector256<UInt32>[] v, int shift) {
    int shift_sets = shift / Vector256<UInt64>.Count;
    int shift_rems = shift % Vector256<UInt64>.Count;

    if (shift_rems == 0) {
        // Vector-aligned offset: v[i] is added straight onto s[shift_sets + i].
#if DEBUG
        Debug<OverflowException>.Assert(checked(shift_sets + v.Length) <= s.Length);
#endif

        fixed (Vector256<UInt64>* ps = s) {
            fixed (Vector256<UInt32>* pv = v) {
                for (int i = 0, store_idx = shift_sets; i < v.Length; i++, store_idx++) {
                    ps[store_idx] = Avx2.Add(ps[store_idx], pv[i].AsUInt64());
                }
            }
        }

        return;
    }

    // Misaligned offset: every source vector straddles two destination vectors, so one
    // extra element of s is written at the end (hence the strict '<' below).
#if DEBUG
    Debug<OverflowException>.Assert(checked(shift_sets + v.Length) < s.Length);
#endif

    // Masks splitting a permuted vector into the part that stays in the current destination
    // element (ml) and the part carried into the next one (mh). The argument is
    // shift_rems * 2 — presumably a count of 32-bit lanes; confirm against Mask256.
    Vector256<UInt64> ml = Mask256.MSV(checked((uint)(shift_rems * 2))).AsUInt64();
    Vector256<UInt64> mh = Mask256.LSV(checked((uint)(shift_rems * 2))).AsUInt64();

    // Permute control chosen per remainder (presumably a lane rotation by shift_rems).
    byte mm_perm = shift_rems switch {
        1 => MM_PERM_CBAD,
        2 => MM_PERM_BADC,
        3 => MM_PERM_ADCB,
        _ => throw new ArgumentException(nameof(shift_rems))
    };

    // 'fixed' on a zero-length array yields a null pointer, so reading pv[0] below would
    // dereference null. Nothing needs to be added in that case; this matches the aligned
    // path, whose loop simply does not execute for an empty v.
    if (v.Length == 0) {
        return;
    }

    int store_idx = shift_sets;
    Vector256<UInt64> uh, ul, u;

    fixed (Vector256<UInt64>* ps = s) {
        fixed (Vector256<UInt32>* pv = v) {
            // First destination element only receives the ml part of v[0].
            u = Avx2.Permute4x64(pv[0].AsUInt64(), mm_perm);
            ul = Avx2.And(u, ml);
            ps[store_idx] = Avx2.Add(ps[store_idx], ul);
            store_idx++;

            // Each following element receives the carried mh part of the previous source
            // vector combined with the ml part of the current one.
            for (int i = 1; i < v.Length; i++) {
                uh = Avx2.And(u, mh);
                u = Avx2.Permute4x64(pv[i].AsUInt64(), mm_perm);
                ul = Avx2.And(u, ml);
                ps[store_idx] = Avx2.Add(ps[store_idx], Avx2.Or(uh, ul));
                store_idx++;
            }

            // Spill the carried part of the last source vector into s[shift_sets + v.Length].
            uh = Avx2.And(u, mh);
            ps[store_idx] = Avx2.Add(ps[store_idx], uh);
        }
    }
}