/// <summary>
/// Scenario: pins <c>_fld1</c>, <c>_fld2</c> and <c>_fld3</c>, loads each one through a
/// pointer with <c>Avx.LoadVector256</c>, blends them with <c>Avx2.BlendVariable</c>,
/// stores the result in the output table and validates it against the field values.
/// </summary>
public void RunClassFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load));

    fixed (Vector256<Int32>* firstPtr = &_fld1)
    fixed (Vector256<Int32>* secondPtr = &_fld2)
    fixed (Vector256<Int32>* thirdPtr = &_fld3)
    {
        // Operands are loaded in argument order: first, second, third (the selector).
        var first = Avx.LoadVector256((Int32*)firstPtr);
        var second = Avx.LoadVector256((Int32*)secondPtr);
        var third = Avx.LoadVector256((Int32*)thirdPtr);

        var blended = Avx2.BlendVariable(first, second, third);

        Unsafe.Write(_dataTable.outArrayPtr, blended);
        ValidateResult(_fld1, _fld2, _fld3, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Scenario: creates a fresh <c>SimpleBinaryOpTest__SignSByte</c>, pins its two fields,
/// loads them through pointers, applies <c>Avx2.Sign</c>, stores the result in this
/// instance's output table and validates it against the local instance's fields.
/// </summary>
public void RunClassLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

    var test = new SimpleBinaryOpTest__SignSByte();

    fixed (Vector256<SByte>* leftPtr = &test._fld1)
    fixed (Vector256<SByte>* rightPtr = &test._fld2)
    {
        var left = Avx.LoadVector256((SByte*)leftPtr);
        var right = Avx.LoadVector256((SByte*)rightPtr);

        var signed = Avx2.Sign(left, right);

        Unsafe.Write(_dataTable.outArrayPtr, signed);
        ValidateResult(test._fld1, test._fld2, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Multiplies two vectors byte by byte using 16-bit multiplies: one multiply produces the
/// products for the even byte positions, a second (on the operands shifted right by 8)
/// produces the products for the odd byte positions, and a byte blend merges the two.
/// </summary>
/// <param name="left">Left operand.</param>
/// <param name="right">Right operand.</param>
/// <returns>The blended per-byte products.</returns>
/// <exception cref="CPUFeatureCheckException">Thrown when AVX2 is not reported as supported.</exception>
internal static v256 mul_byte(v256 left, v256 right)
{
    if (Avx2.IsAvx2Supported)
    {
        // The low byte of each 16-bit product is the product of the even-indexed bytes.
        v256 evenProducts = Avx2.mm256_mullo_epi16(left, right);

        // Move the odd-indexed bytes down into the low half of every 16-bit lane.
        v256 leftOdd = Avx2.mm256_srli_epi16(left, 8);
        v256 rightOdd = Avx2.mm256_srli_epi16(right, 8);

        // Multiply them and shift the result back up into the odd byte position.
        v256 oddProducts = Avx2.mm256_slli_epi16(Avx2.mm256_mullo_epi16(leftOdd, rightOdd), 8);

        // Selector for the blend: takes the odd-position bytes from oddProducts.
        v256 oddByteSelector = new v256(0xFF00_FF00);

        return Avx2.mm256_blendv_epi8(evenProducts, oddProducts, oddByteSelector);
    }

    throw new CPUFeatureCheckException();
}
/// <summary>
/// Scenario: pins <c>_clsVar1</c>, <c>_clsVar2</c> and <c>_clsVar3</c>, loads each one
/// through a pointer with <c>Avx.LoadVector256</c>, blends them with
/// <c>Avx2.BlendVariable</c>, stores the result in the output table and validates it.
/// </summary>
public void RunClsVarScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario_Load));

    fixed (Vector256<Int32>* firstPtr = &_clsVar1)
    fixed (Vector256<Int32>* secondPtr = &_clsVar2)
    fixed (Vector256<Int32>* thirdPtr = &_clsVar3)
    {
        var first = Avx.LoadVector256((Int32*)firstPtr);
        var second = Avx.LoadVector256((Int32*)secondPtr);
        var third = Avx.LoadVector256((Int32*)thirdPtr);

        var blended = Avx2.BlendVariable(first, second, third);

        Unsafe.Write(_dataTable.outArrayPtr, blended);
        ValidateResult(_clsVar1, _clsVar2, _clsVar3, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Multiplies every vector in <paramref name="v"/> by the scalar <paramref name="n"/> and
/// splits each product vector into two masked halves.
/// </summary>
/// <param name="v">Input vectors; not modified.</param>
/// <param name="n">Scalar multiplier.</param>
/// <returns>
/// Two new arrays with the same length as <paramref name="v"/>: <c>hi</c> holds the
/// products after a <c>MM_PERM_CDAB</c> shuffle, <c>lo</c> holds the unshuffled products;
/// both are ANDed with <c>lower_mask</c>.
/// </returns>
private static unsafe (Vector256<UInt32>[] hi, Vector256<UInt32>[] lo) Mul(Vector256<UInt32>[] v, UInt32 n)
{
    var highParts = new Vector256<UInt32>[v.Length];
    var lowParts = new Vector256<UInt32>[v.Length];

    // Widen the broadcast scalar to 64-bit lanes, then view those lanes as 32-bit pairs.
    Vector256<UInt32> multiplier = Avx2.ConvertToVector256Int64(Vector128.Create(n)).AsUInt32();
    Vector256<UInt32> keep = lower_mask;

    fixed (Vector256<UInt32>* source = v, highOut = highParts, lowOut = lowParts)
    {
        int k = 0;
        while (k < v.Length)
        {
            Vector256<UInt32> product = Avx2.Multiply(source[k], multiplier).AsUInt32();
            Vector256<UInt32> swapped = Avx2.Shuffle(product, MM_PERM_CDAB);

            highOut[k] = Avx2.And(swapped, keep);
            lowOut[k] = Avx2.And(product, keep);
            k++;
        }
    }

    return (highParts, lowParts);
}
/// <summary>
/// Routine optimized for shuffling a buffer for a type size of 2 bytes.
/// For every group of 32 two-byte elements (64 source bytes) the first byte of each
/// element is written to <c>dest + j</c> and the second byte of each element is written to
/// <c>dest + total_elements + j</c>.
/// </summary>
/// <param name="dest">Destination buffer; receives two byte planes, each <paramref name="total_elements"/> apart.</param>
/// <param name="src">Source buffer of interleaved 2-byte elements.</param>
/// <param name="vectorizable_elements">Number of elements to process; consumed 32 at a time.</param>
/// <param name="total_elements">Distance in bytes between the two output planes.</param>
private static unsafe void shuffle2_avx2(byte* dest, byte* src, int vectorizable_elements, int total_elements)
{
    const int bytesoftype = 2;
    int vectorBytes = Vector256<byte>.Count;

    /* Create the shuffle mask.
     * NOTE: Vector256.Create takes its arguments from element 0 upwards (array order),
     * which is the reverse of the C 'set' intrinsics (most to least significant).
     * Within each 128-bit lane the mask gathers the even-indexed bytes into the low
     * 8 bytes and the odd-indexed bytes into the high 8 bytes. */
    var shmask = Vector256.Create((byte)
        0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e,
        0x01, 0x03, 0x05, 0x07, 0x09, 0x0b, 0x0d, 0x0f,
        0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e,
        0x01, 0x03, 0x05, 0x07, 0x09, 0x0b, 0x0d, 0x0f);

    for (int j = 0; j < vectorizable_elements; j += vectorBytes)
    {
        /* Fetch 32 elements (64 bytes) and split even/odd bytes inside each 128-bit lane. */
        byte* block = src + (j * bytesoftype);
        Vector256<byte> first = Avx2.Shuffle(Avx.LoadVector256(block), shmask);
        Vector256<byte> second = Avx2.Shuffle(Avx.LoadVector256(block + vectorBytes), shmask);

        /* first  -> [even, even, odd, odd]; second -> [odd, odd, even, even] (64-bit groups). */
        Vector256<int> firstGrouped = Avx2.Permute4x64(first.AsInt64(), 0xd8).AsInt32();
        Vector256<int> secondGrouped = Avx2.Permute4x64(second.AsInt64(), 0x8d).AsInt32();

        /* Low half of 'first' + high half of 'second' = all 32 even bytes, already in order. */
        Vector256<byte> evenBytes = Avx2.Blend(firstGrouped, secondGrouped, 0xf0).AsByte();

        /* The complementary blend holds the odd bytes with its two halves swapped. */
        Vector256<long> oddSwapped = Avx2.Blend(firstGrouped, secondGrouped, 0x0f).AsInt64();
        Vector256<byte> oddBytes = Avx2.Permute4x64(oddSwapped, 0x4e).AsByte();

        /* Store the result vectors */
        byte* dest_for_jth_element = dest + j;
        Avx.Store(dest_for_jth_element, evenBytes);
        Avx.Store(dest_for_jth_element + total_elements, oddBytes);
    }
}
/// <summary>
/// Reverses the bit order of every 32-bit integer in <paramref name="span"/> in place.
/// This is done in two steps: the byte order of each integer is reversed (vectorized with
/// AVX2 where available, scalar otherwise), and then the byte-span <c>ReverseBits</c>
/// overload is applied to the same memory.
/// </summary>
/// <param name="span">The integers to modify in place.</param>
public static unsafe void ReverseBits(this Span<int> span)
{
    var intsReversed = 0;

    if (Avx2.IsSupported)
    {
        // Same decomposition as the scalar byte swap:
        // ror(x & 0x00FF00FF, 8) | rol(x & 0xFF00FF00, 8).
        var evenByteMask = Vector256.Create(0x00FF00FF);
        var oddByteMask = Vector256.Create(unchecked((int)0xFF00FF00));

        fixed (int* ptr = span)
        {
            var vectorCount = span.Length / 8;
            for (int i = 0; i < vectorCount; i++)
            {
                var vector = Avx.LoadVector256(ptr + intsReversed);

                // Each half must be masked separately before rotating; rotating the
                // unmasked vector would only rotate the integer, not swap its bytes.
                var evenBytes = Avx2.And(vector, evenByteMask);
                var oddBytes = Avx2.And(vector, oddByteMask);

                var swapped = Avx2.Or(
                    Avx2.Or(
                        Avx2.ShiftRightLogical(evenBytes, 8),
                        Avx2.ShiftLeftLogical(evenBytes, 24)),
                    Avx2.Or(
                        Avx2.ShiftLeftLogical(oddBytes, 8),
                        Avx2.ShiftRightLogical(oddBytes, 24)));

                Avx.Store(ptr + intsReversed, swapped);
                intsReversed += 8;
            }
        }
    }

    // Scalar path for whatever the vector loop did not cover.
    for (int i = intsReversed; i < span.Length; i++)
    {
        span[i] = BinaryPrimitives.ReverseEndianness(span[i]);
    }

    // Hand the same memory, viewed as bytes, to the byte-span overload.
    fixed (void* ptr = span)
    {
        new Span<byte>(ptr, span.Length * 4).ReverseBits();
    }
}
/// <summary>
/// Broadcasts a left and a right byte value into 256-bit vectors, adds them with
/// <c>Avx2.Add</c> and asserts that all 32 elements equal the scalar sum. Both seed loops
/// currently run for a single value (0).
/// </summary>
public void Add1()
{
    for (var leftSeed = 0; leftSeed < 1; leftSeed++)
    {
        var leftVector = Vector256.Create((Byte)leftSeed);

        for (var rightSeed = 0; rightSeed < 1; rightSeed++)
        {
            var rightVector = Vector256.Create((Byte)rightSeed);
            var sum = Avx2.Add(leftVector, rightVector);

            for (var lane = 0; lane < 32; lane++)
            {
                Assert.AreEqual((Byte)(leftSeed + rightSeed), sum.GetElement(lane));
            }
        }
    }
}
/// <summary>
/// Scenario: creates a fresh <c>SimpleTernaryOpTest__BlendVariableInt16</c>, pins its three
/// fields, loads them through pointers, blends them with <c>Avx2.BlendVariable</c>, stores
/// the result in this instance's output table and validates it against the local
/// instance's fields.
/// </summary>
public void RunClassLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

    var test = new SimpleTernaryOpTest__BlendVariableInt16();

    fixed (Vector256<Int16>* firstPtr = &test._fld1)
    fixed (Vector256<Int16>* secondPtr = &test._fld2)
    fixed (Vector256<Int16>* thirdPtr = &test._fld3)
    {
        var first = Avx.LoadVector256((Int16*)firstPtr);
        var second = Avx.LoadVector256((Int16*)secondPtr);
        var third = Avx.LoadVector256((Int16*)thirdPtr);

        var blended = Avx2.BlendVariable(first, second, third);

        Unsafe.Write(_dataTable.outArrayPtr, blended);
        ValidateResult(test._fld1, test._fld2, test._fld3, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Horizontal-add exercise for 16-bit elements. The first pass runs
/// <c>Avx2.HorizontalAdd</c> on two fixed operands; the second pass builds a vector of
/// adjacent-pair sums by hand from the same operands. Neither result is compared or
/// asserted.
/// </summary>
public void 水平加算Int16()
{
    // Pass 1: hardware horizontal add (result intentionally unused).
    for (var outer = 0; outer < 1; outer++)
    {
        var first = Vector256.Create(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        var second = Vector256.Create(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

        for (var inner = 0; inner < 1; inner++)
        {
            var hardwareSums = Avx2.HorizontalAdd(first, second);
        }
    }

    // Pass 2: adjacent-pair sums computed element by element (result intentionally unused).
    // NOTE(review): the pairs are laid out as all of 'first' followed by all of 'second';
    // confirm whether that matches the lane order the instruction produces before
    // comparing the two results.
    for (var outer = 0; outer < 1; outer++)
    {
        var first = Vector256.Create(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        var second = Vector256.Create(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

        // Sum of the element at 'start' and its right-hand neighbour, truncated to a byte.
        Byte PairSum(Vector256<Int16> source, int start)
        {
            return (Byte)(source.GetElement(start) + source.GetElement(start + 1));
        }

        var manualSums = Vector256.Create(
            PairSum(first, 0), PairSum(first, 2), PairSum(first, 4), PairSum(first, 6),
            PairSum(first, 8), PairSum(first, 10), PairSum(first, 12), PairSum(first, 14),
            PairSum(second, 0), PairSum(second, 2), PairSum(second, 4), PairSum(second, 6),
            PairSum(second, 8), PairSum(second, 10), PairSum(second, 12), PairSum(second, 14)
        );
    }
}
/// <summary>
/// Benchmark: sums the <c>B</c> field of an array-of-structures using an AVX2 gather with
/// a single accumulator (no loop unrolling), then prints the total and the elapsed time.
/// </summary>
public void AOSからSIMD合計ループ1展開()
{
    var records = AOS作成();

    // Byte offset of field B inside ABC, and the byte size of one record.
    var fieldOffset = Marshal.OffsetOf<ABC>(nameof(ABC.B)).ToInt32();
    var stride = sizeof(ABC);

    // Byte offsets of field B in eight consecutive records (the gather uses scale 1).
    var index = Vector256.Create(
        stride * 0 + fieldOffset,
        stride * 1 + fieldOffset,
        stride * 2 + fieldOffset,
        stride * 3 + fieldOffset,
        stride * 4 + fieldOffset,
        stride * 5 + fieldOffset,
        stride * 6 + fieldOffset,
        stride * 7 + fieldOffset
    );

    // Number of vector-sized groups, rounded up.
    var groupCount = (レコード数 + Int32ベクタ長 - 1) / Int32ベクタ長;
    Vector256<Int32> accumulator = default;

    var watch = Stopwatch.StartNew();
    fixed (ABC* pAOS = records)
    {
        for (var repeat = 0; repeat < 繰り返し数; repeat++)
        {
            for (var group = 0; group < groupCount; group++)
            {
                var gathered = Avx2.GatherVector256((Int32*)&pAOS[group * Int32ベクタ長], index, 1);
                accumulator = Avx2.Add(accumulator, gathered);
            }
        }
    }
    watch.Stop();

    // Fold the per-lane partial sums into one scalar.
    var total = 0;
    for (var lane = 0; lane < Int32ベクタ長; lane++)
    {
        total += accumulator.GetElement(lane);
    }

    Console.WriteLine($"Sum={total}");
    Console.WriteLine($"for {繰り返し数} loops: {watch.ElapsedMilliseconds}ms");
    Console.WriteLine();
}
/// <summary>
/// Counts how many elements of <c>Array</c> are equal to <c>Item</c>. Full groups of eight
/// integers are compared with AVX2; the remaining elements are compared one by one.
/// </summary>
/// <returns>The number of matching elements.</returns>
public unsafe int Intrinsics()
{
    int lanes = 256 / 8 / 4;
    int* scratch = stackalloc int[lanes];

    // Build a vector with Item in every lane.
    for (int lane = 0; lane < lanes; lane++)
    {
        scratch[lane] = Item;
    }
    var needle = Avx2.LoadVector256(scratch);

    var counts = Vector256<int>.Zero;
    int i;
    var array = Array;

    fixed (int* data = array)
    {
        i = 0;
        while (i <= array.Length - lanes)
        {
            var chunk = Avx2.LoadVector256(data + i);
            // A matching lane compares to all ones (-1); subtracting it adds one to the count.
            var matches = Avx2.CompareEqual(chunk, needle);
            counts = Avx2.Subtract(counts, matches);
            i += lanes;
        }
    }

    // Fold the per-lane counters into a scalar, reusing the scratch buffer.
    int result = 0;
    Avx2.Store(scratch, counts);
    for (int lane = 0; lane < lanes; lane++)
    {
        result += scratch[lane];
    }

    // Remaining elements that did not fill a whole vector.
    while (i < array.Length)
    {
        if (array[i] == Item)
        {
            result++;
        }
        i++;
    }

    return result;
}
/// <summary>
/// ORs <paramref name="valueSpam"/> into <paramref name="thisSpam"/> in place, byte by byte.
/// Uses 256-bit AVX2 operations when available, otherwise 128-bit SSE2 operations when
/// available; any bytes left over are handled by a scalar loop.
/// </summary>
/// <param name="thisSpam">Destination span; receives <c>thisSpam[i] | valueSpam[i]</c>.</param>
/// <param name="valueSpam">Source span; must have the same length as <paramref name="thisSpam"/>.</param>
/// <exception cref="ArgumentException">The two spans differ in length.</exception>
public static void Or(this Span<byte> thisSpam, Span<byte> valueSpam)
{
    var length = thisSpam.Length;
    if (valueSpam.Length != length)
    {
        throw new ArgumentException("Both byte spans has to be same length.");
    }

    int offset = 0;

    fixed (byte* target = thisSpam)
    fixed (byte* source = valueSpam)
    {
        if (Avx2.IsSupported)
        {
            int step = Vector256<byte>.Count;
            while (offset + step <= length)
            {
                Vector256<byte> current = Avx2.LoadVector256(target + offset);
                Vector256<byte> incoming = Avx2.LoadVector256(source + offset);
                Avx2.Store(target + offset, Avx2.Or(current, incoming));
                offset += step;
            }
        }
        else if (Sse2.IsSupported)
        {
            int step = Vector128<byte>.Count;
            while (offset + step <= length)
            {
                Vector128<byte> current = Sse2.LoadVector128(target + offset);
                Vector128<byte> incoming = Sse2.LoadVector128(source + offset);
                Sse2.Store(target + offset, Sse2.Or(current, incoming));
                offset += step;
            }
        }
    }

    // Scalar tail (or the whole span when no vector path ran).
    while (offset < length)
    {
        thisSpam[offset] |= valueSpam[offset];
        offset++;
    }
}
/// <summary>
/// Rearranges the 32-bit elements of four vectors in place using three stages:
/// a per-vector permutation, two rounds of low/high unpacking across the vectors, and a
/// final per-vector permutation. The trailing comment on each line lists the element
/// indices held by the result of that line.
/// </summary>
private static void Shuffle(ref Vector256<uint> a, ref Vector256<uint> b, ref Vector256<uint> c, ref Vector256<uint> d)
{
    // Stage 1: permute each vector individually.
    a = Avx2.PermuteVar8x32(a, Permute7);                 //  3 19  9 25  4 20 14 30
    b = Avx2.PermuteVar8x32(b, Permute8);                 //  0 16 10 26  5 21 15 31
    c = Avx2.PermuteVar8x32(c, Permute9);                 //  1 17 11 27  6 22 12 28
    d = Avx2.PermuteVar8x32(d, Permute10);                //  2 18  8 24  7 23 13 29

    // Stage 2a: interleave a with b and c with d.
    Vector256<uint> lowAB = Avx2.UnpackLow(a, b);         //  3  0 19 16  4  5 20 21
    Vector256<uint> lowCD = Avx2.UnpackLow(c, d);         //  1  2 17 18  6  7 22 23
    Vector256<uint> highAB = Avx2.UnpackHigh(a, b);       //  9 10 25 26 14 15 30 31
    Vector256<uint> highCD = Avx2.UnpackHigh(c, d);       // 11  8 27 24 12 13 28 29

    // Stage 2b: interleave the intermediate vectors.
    a = Avx2.UnpackLow(lowAB, lowCD);                     //  3  1  0  2  4  6  5  7
    b = Avx2.UnpackLow(highAB, highCD);                   //  9 11 10  8 14 12 15 13
    c = Avx2.UnpackHigh(lowAB, lowCD);                    // 19 17 16 18 20 22 21 23
    d = Avx2.UnpackHigh(highAB, highCD);                  // 25 27 26 24 30 28 31 29

    // Stage 3: final per-vector permutation into ascending order.
    a = Avx2.PermuteVar8x32(a, Permute11);                //  0  1  2  3  4  5  6  7
    b = Avx2.PermuteVar8x32(b, Permute12);                //  8  9 10 11 12 13 14 15
    c = Avx2.PermuteVar8x32(c, Permute11);                // 16 17 18 19 20 21 22 23
    d = Avx2.PermuteVar8x32(d, Permute12);                // 24 25 26 27 28 29 30 31
}
/// <summary>
/// Builds 256-bit double vectors from separately broadcast lower and upper 128-bit halves,
/// adds a left and a right vector with <c>Avx2.Add</c>, and asserts that each of the four
/// elements equals the expected vector built from the scalar sums. Every seed loop
/// currently runs for a single value (0).
/// </summary>
public void Add2_Double()
{
    for (var leftLow = 0; leftLow < 1; leftLow++)
    {
        var leftLowHalf = Vector128.Create((Double)leftLow);

        for (var leftHigh = 0; leftHigh < 1; leftHigh++)
        {
            var leftHighHalf = Vector128.Create((Double)leftHigh);
            var left = Vector256.Create(leftLowHalf, leftHighHalf);

            for (var rightLow = 0; rightLow < 1; rightLow++)
            {
                var rightLowHalf = Vector128.Create((Double)rightLow);

                for (var rightHigh = 0; rightHigh < 1; rightHigh++)
                {
                    var rightHighHalf = Vector128.Create((Double)rightHigh);
                    var right = Vector256.Create(rightLowHalf, rightHighHalf);

                    var actual = Avx2.Add(left, right);

                    // Expected halves: scalar sum, via UInt64, broadcast as Double.
                    var expectedHighScalar = (Double)(UInt64)(leftHigh + rightHigh);
                    var expectedHighHalf = Vector128.Create(expectedHighScalar);
                    var expectedLowScalar = (Double)(UInt64)(leftLow + rightLow);
                    var expectedLowHalf = Vector128.Create(expectedLowScalar);
                    var expected = Vector256.Create(expectedLowHalf, expectedHighHalf).AsDouble();

                    for (var lane = 0; lane < 4; lane++)
                    {
                        Assert.AreEqual(expected.GetElement(lane), actual.GetElement(lane));
                    }
                }
            }
        }
    }
}
/// <summary>
/// Benchmark: writes <c>oldScreen[i] ^ newScreen[i]</c> into <c>difference[i]</c> for every
/// byte of <paramref name="difference"/>, 32 bytes at a time with AVX2 plus a scalar tail.
/// The whole pass is repeated <c>1048576 / cores</c> times on each of
/// <paramref name="cores"/> parallel workers; every worker processes the full buffers.
/// </summary>
/// <param name="oldScreen">First input; must be at least as long as <paramref name="difference"/>.</param>
/// <param name="newScreen">Second input; must be at least as long as <paramref name="difference"/>.</param>
/// <param name="difference">Output buffer; its length determines how many bytes are processed.</param>
/// <param name="cores">Number of parallel workers; also divides the repetition count.</param>
/// <exception cref="System.ArgumentNullException">Any of the arrays is null.</exception>
/// <exception cref="System.ArgumentException">An input array is shorter than <paramref name="difference"/>.</exception>
public unsafe void IntrinsicsAVX2(byte[] oldScreen, byte[] newScreen, byte[] difference, int cores)
{
    if (oldScreen == null) { throw new System.ArgumentNullException(nameof(oldScreen)); }
    if (newScreen == null) { throw new System.ArgumentNullException(nameof(newScreen)); }
    if (difference == null) { throw new System.ArgumentNullException(nameof(difference)); }

    int length = difference.Length;

    // The loops below read 'length' bytes from both inputs through raw pointers,
    // so shorter inputs would be read out of bounds.
    if (oldScreen.Length < length || newScreen.Length < length)
    {
        throw new System.ArgumentException("oldScreen and newScreen must be at least as long as difference.");
    }

    const int BytesPerVector = 32;
    int steps = length / BytesPerVector;
    int max = 1048576 / cores;

    Parallel.For(1, cores + 1, index =>
    {
        fixed (byte* pOld = oldScreen)
        fixed (byte* pNew = newScreen)
        fixed (byte* pDiff = difference)
        {
            for (int bufCnt = 0; bufCnt < max; bufCnt++)
            {
                long* ppOld = (long*)pOld;
                long* ppNew = (long*)pNew;
                long* ppDiff = (long*)pDiff;

                // Four longs = one 256-bit vector per step.
                for (int position = 0; position < steps; ppOld += 4, ppNew += 4, ppDiff += 4, position++)
                {
                    Avx2.Store(ppDiff, Avx2.Xor(Avx2.LoadVector256(ppOld), Avx2.LoadVector256(ppNew)));
                }

                // Bytes that do not fill a whole vector.
                for (int tail = steps * BytesPerVector; tail < length; tail++)
                {
                    pDiff[tail] = (byte)(pOld[tail] ^ pNew[tail]);
                }
            }
        }
    });
}
/// <summary>
/// Test entry point for <c>Avx2.GatherMaskVector256</c>. When AVX2 is supported it fills the
/// source tables, loads the index and mask vectors, and then, for each element/index type
/// combination, checks five things: a direct call with a constant scale, a call through
/// reflection, a constant invalid scale (must throw <c>ArgumentOutOfRangeException</c>),
/// a non-constant valid scale, and a non-constant invalid scale (must throw).
/// </summary>
/// <param name="args">Unused.</param>
/// <returns><c>Pass</c> when every check succeeds or AVX2 is not supported; otherwise <c>Fail</c>.</returns>
static unsafe int Main(string[] args)
{
    int testResult = Pass;

    // Prints the failure message followed by the gathered output values and marks the run as failed.
    void ReportFailure(string message, System.Collections.IEnumerable outValues)
    {
        Console.WriteLine(message);
        foreach (var item in outValues)
        {
            Console.Write(item + ", ");
        }
        Console.WriteLine();
        testResult = Fail;
    }

    if (Avx2.IsSupported)
    {
        // Scales are also kept in fields so they can be passed as non-constant arguments.
        Four = 4;
        Eight = 8;
        invalid = 15;

        for (int i = 0; i < N; i++)
        {
            floatSourceTable[i] = (float)i * 10.0f;
            doubleSourceTable[i] = (double)i * 10.0;
            intSourceTable[i] = i * 10;
            longSourceTable[i] = i * 10;
        }

        Vector256<int> indexi;
        Vector256<long> indexl;
        Vector128<int> indexi128;

        fixed (int* iptr = intIndexTable)
        fixed (long* lptr = longIndexTable)
        fixed (int* i128ptr = vector128intIndexTable)
        {
            indexi = Avx.LoadVector256(iptr);
            indexl = Avx.LoadVector256(lptr);
            indexi128 = Sse2.LoadVector128(i128ptr);
        }

        Vector256<int> maski;
        Vector256<uint> maskui;
        Vector256<long> maskl;
        Vector256<ulong> maskul;
        Vector256<float> maskf;
        Vector256<double> maskd;

        fixed (int* iptr = intMaskTable)
        fixed (long* lptr = longMaskTable)
        {
            maski = Avx.LoadVector256(iptr);
            maskl = Avx.LoadVector256(lptr);

            maskui = maski.AsUInt32();
            maskul = maskl.AsUInt64();
            maskf = maski.AsSingle();
            maskd = maskl.AsDouble();
        }

        Vector256<int> sourcei = Vector256<int>.Zero;
        Vector256<uint> sourceui = Vector256<uint>.Zero;
        Vector256<long> sourcel = Vector256<long>.Zero;
        Vector256<ulong> sourceul = Vector256<ulong>.Zero;
        Vector256<float> sourcef = Vector256<float>.Zero;
        Vector256<double> sourced = Vector256<double>.Zero;

        // public static unsafe Vector256<float> GatherMaskVector256(Vector256<float> source, float* baseAddress, Vector256<int> index, Vector256<float> mask, byte scale)
        using (TestTable<float, int> floatTable = new TestTable<float, int>(floatSourceTable, new float[8]))
        {
            var vf = Avx2.GatherMaskVector256(sourcef, (float*)(floatTable.inArrayPtr), indexi, maskf, 4);
            Unsafe.Write(floatTable.outArrayPtr, vf);
            if (!floatTable.CheckResult((x, y) => BitConverter.SingleToInt32Bits(x) == BitConverter.SingleToInt32Bits(y), intIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed on float:", floatTable.outArray);
            }

            vf = (Vector256<float>)typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256), new Type[] { typeof(Vector256<float>), typeof(float*), typeof(Vector256<int>), typeof(Vector256<float>), typeof(byte) }).
                Invoke(null, new object[] { sourcef, Pointer.Box(floatTable.inArrayPtr, typeof(float*)), indexi, maskf, (byte)4 });
            Unsafe.Write(floatTable.outArrayPtr, vf);
            if (!floatTable.CheckResult((x, y) => BitConverter.SingleToInt32Bits(x) == BitConverter.SingleToInt32Bits(y), intIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed with reflection on float:", floatTable.outArray);
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourcef, (float*)(floatTable.inArrayPtr), indexi, maskf, 3);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on float with invalid scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }

            vf = Avx2.GatherMaskVector256(sourcef, (float*)(floatTable.inArrayPtr), indexi, maskf, Four);
            Unsafe.Write(floatTable.outArrayPtr, vf);
            if (!floatTable.CheckResult((x, y) => BitConverter.SingleToInt32Bits(x) == BitConverter.SingleToInt32Bits(y), intIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed on float with non-const scale (IMM):", floatTable.outArray);
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourcef, (float*)(floatTable.inArrayPtr), indexi, maskf, invalid);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on float with invalid non-const scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }
        }

        // public static unsafe Vector256<double> GatherMaskVector256(Vector256<double> source, double* baseAddress, Vector128<int> index, Vector256<double> mask, byte scale)
        using (TestTable<double, int> doubletTable = new TestTable<double, int>(doubleSourceTable, new double[4]))
        {
            var vd = Avx2.GatherMaskVector256(sourced, (double*)(doubletTable.inArrayPtr), indexi128, maskd, 8);
            Unsafe.Write(doubletTable.outArrayPtr, vd);
            if (!doubletTable.CheckResult((x, y) => BitConverter.DoubleToInt64Bits(x) == BitConverter.DoubleToInt64Bits(y), vector128intIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed on double:", doubletTable.outArray);
            }

            vd = (Vector256<double>)typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256), new Type[] { typeof(Vector256<double>), typeof(double*), typeof(Vector128<int>), typeof(Vector256<double>), typeof(byte) }).
                Invoke(null, new object[] { sourced, Pointer.Box(doubletTable.inArrayPtr, typeof(double*)), indexi128, maskd, (byte)8 });
            Unsafe.Write(doubletTable.outArrayPtr, vd);
            if (!doubletTable.CheckResult((x, y) => BitConverter.DoubleToInt64Bits(x) == BitConverter.DoubleToInt64Bits(y), vector128intIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed with reflection on double:", doubletTable.outArray);
            }

            try
            {
                vd = Avx2.GatherMaskVector256(sourced, (double*)(doubletTable.inArrayPtr), indexi128, maskd, 3);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on double with invalid scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }

            vd = Avx2.GatherMaskVector256(sourced, (double*)(doubletTable.inArrayPtr), indexi128, maskd, Eight);
            Unsafe.Write(doubletTable.outArrayPtr, vd);
            if (!doubletTable.CheckResult((x, y) => BitConverter.DoubleToInt64Bits(x) == BitConverter.DoubleToInt64Bits(y), vector128intIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed on double with non-const scale (IMM):", doubletTable.outArray);
            }

            try
            {
                vd = Avx2.GatherMaskVector256(sourced, (double*)(doubletTable.inArrayPtr), indexi128, maskd, invalid);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on double with invalid non-const scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }
        }

        // public static unsafe Vector256<int> GatherMaskVector256(Vector256<int> source, int* baseAddress, Vector256<int> index, Vector256<int> mask, byte scale)
        using (TestTable<int, int> intTable = new TestTable<int, int>(intSourceTable, new int[8]))
        {
            var vf = Avx2.GatherMaskVector256(sourcei, (int*)(intTable.inArrayPtr), indexi, maski, 4);
            Unsafe.Write(intTable.outArrayPtr, vf);
            if (!intTable.CheckResult((x, y) => x == y, intIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed on int:", intTable.outArray);
            }

            vf = (Vector256<int>)typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256), new Type[] { typeof(Vector256<int>), typeof(int*), typeof(Vector256<int>), typeof(Vector256<int>), typeof(byte) }).
                Invoke(null, new object[] { sourcei, Pointer.Box(intTable.inArrayPtr, typeof(int*)), indexi, maski, (byte)4 });
            Unsafe.Write(intTable.outArrayPtr, vf);
            if (!intTable.CheckResult((x, y) => x == y, intIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed with reflection on int:", intTable.outArray);
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourcei, (int*)(intTable.inArrayPtr), indexi, maski, 3);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on int with invalid scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }

            vf = Avx2.GatherMaskVector256(sourcei, (int*)(intTable.inArrayPtr), indexi, maski, Four);
            Unsafe.Write(intTable.outArrayPtr, vf);
            if (!intTable.CheckResult((x, y) => x == y, intIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed on int with non-const scale (IMM):", intTable.outArray);
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourcei, (int*)(intTable.inArrayPtr), indexi, maski, invalid);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on int with invalid non-const scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }
        }

        // public static unsafe Vector256<uint> GatherMaskVector256(Vector256<uint> source, uint* baseAddress, Vector256<int> index, Vector256<uint> mask, byte scale)
        using (TestTable<int, int> intTable = new TestTable<int, int>(intSourceTable, new int[8]))
        {
            var vf = Avx2.GatherMaskVector256(sourceui, (uint*)(intTable.inArrayPtr), indexi, maskui, 4);
            Unsafe.Write(intTable.outArrayPtr, vf);
            if (!intTable.CheckResult((x, y) => x == y, intIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed on uint:", intTable.outArray);
            }

            vf = (Vector256<uint>)typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256), new Type[] { typeof(Vector256<uint>), typeof(uint*), typeof(Vector256<int>), typeof(Vector256<uint>), typeof(byte) }).
                Invoke(null, new object[] { sourceui, Pointer.Box(intTable.inArrayPtr, typeof(uint*)), indexi, maskui, (byte)4 });
            // Write the reflection result out so the check below validates this call
            // rather than the output left behind by the previous one.
            Unsafe.Write(intTable.outArrayPtr, vf);
            if (!intTable.CheckResult((x, y) => x == y, intIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed with reflection on uint:", intTable.outArray);
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourceui, (uint*)(intTable.inArrayPtr), indexi, maskui, 3);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on uint with invalid scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }

            vf = Avx2.GatherMaskVector256(sourceui, (uint*)(intTable.inArrayPtr), indexi, maskui, Four);
            Unsafe.Write(intTable.outArrayPtr, vf);
            if (!intTable.CheckResult((x, y) => x == y, intIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed on uint with non-const scale (IMM):", intTable.outArray);
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourceui, (uint*)(intTable.inArrayPtr), indexi, maskui, invalid);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on uint with invalid non-const scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }
        }

        // public static unsafe Vector256<long> GatherMaskVector256(Vector256<long> source, long* baseAddress, Vector128<int> index, Vector256<long> mask, byte scale)
        using (TestTable<long, int> longTable = new TestTable<long, int>(longSourceTable, new long[4]))
        {
            var vf = Avx2.GatherMaskVector256(sourcel, (long*)(longTable.inArrayPtr), indexi128, maskl, 8);
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, vector128intIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed on long:", longTable.outArray);
            }

            vf = (Vector256<long>)typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256), new Type[] { typeof(Vector256<long>), typeof(long*), typeof(Vector128<int>), typeof(Vector256<long>), typeof(byte) }).
                Invoke(null, new object[] { sourcel, Pointer.Box(longTable.inArrayPtr, typeof(long*)), indexi128, maskl, (byte)8 });
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, vector128intIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed with reflection on long:", longTable.outArray);
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourcel, (long*)(longTable.inArrayPtr), indexi128, maskl, 3);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on long with invalid scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }

            vf = Avx2.GatherMaskVector256(sourcel, (long*)(longTable.inArrayPtr), indexi128, maskl, Eight);
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, vector128intIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed on long with non-const scale (IMM):", longTable.outArray);
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourcel, (long*)(longTable.inArrayPtr), indexi128, maskl, invalid);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on long with invalid non-const scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }
        }

        // public static unsafe Vector256<ulong> GatherMaskVector256(Vector256<ulong> source, ulong* baseAddress, Vector128<int> index, Vector256<ulong> mask, byte scale)
        using (TestTable<long, int> longTable = new TestTable<long, int>(longSourceTable, new long[4]))
        {
            var vf = Avx2.GatherMaskVector256(sourceul, (ulong*)(longTable.inArrayPtr), indexi128, maskul, 8);
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, vector128intIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed on ulong:", longTable.outArray);
            }

            vf = (Vector256<ulong>)typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256), new Type[] { typeof(Vector256<ulong>), typeof(ulong*), typeof(Vector128<int>), typeof(Vector256<ulong>), typeof(byte) }).
                Invoke(null, new object[] { sourceul, Pointer.Box(longTable.inArrayPtr, typeof(ulong*)), indexi128, maskul, (byte)8 });
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, vector128intIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed with reflection on ulong:", longTable.outArray);
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourceul, (ulong*)(longTable.inArrayPtr), indexi128, maskul, 3);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on ulong with invalid scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }

            vf = Avx2.GatherMaskVector256(sourceul, (ulong*)(longTable.inArrayPtr), indexi128, maskul, Eight);
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, vector128intIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed on ulong with non-const scale (IMM):", longTable.outArray);
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourceul, (ulong*)(longTable.inArrayPtr), indexi128, maskul, invalid);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on ulong with invalid non-const scale (IMM)");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }
        }

        // public static unsafe Vector256<long> GatherMaskVector256(Vector256<long> source, long* baseAddress, Vector256<long> index, Vector256<long> mask, byte scale)
        using (TestTable<long, long> longTable = new TestTable<long, long>(longSourceTable, new long[4]))
        {
            var vf = Avx2.GatherMaskVector256(sourcel, (long*)(longTable.inArrayPtr), indexl, maskl, 8);
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, longIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed on long with Vector256 long index:", longTable.outArray);
            }

            vf = (Vector256<long>)typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256), new Type[] { typeof(Vector256<long>), typeof(long*), typeof(Vector256<long>), typeof(Vector256<long>), typeof(byte) }).
                Invoke(null, new object[] { sourcel, Pointer.Box(longTable.inArrayPtr, typeof(long*)), indexl, maskl, (byte)8 });
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, longIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed with reflection on long with Vector256 long index:", longTable.outArray);
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourcel, (long*)(longTable.inArrayPtr), indexl, maskl, 3);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on long with invalid scale (IMM) and Vector256 long index");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }

            vf = Avx2.GatherMaskVector256(sourcel, (long*)(longTable.inArrayPtr), indexl, maskl, Eight);
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, longIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed on long with non-const scale (IMM) and Vector256 long index:", longTable.outArray);
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourcel, (long*)(longTable.inArrayPtr), indexl, maskl, invalid);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on long with invalid non-const scale (IMM) and Vector256 long index");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }
        }

        // public static unsafe Vector256<ulong> GatherMaskVector256(Vector256<ulong> source, ulong* baseAddress, Vector256<long> index, Vector256<ulong> mask, byte scale)
        using (TestTable<long, long> longTable = new TestTable<long, long>(longSourceTable, new long[4]))
        {
            var vf = Avx2.GatherMaskVector256(sourceul, (ulong*)(longTable.inArrayPtr), indexl, maskul, 8);
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, longIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed on ulong with Vector256 long index:", longTable.outArray);
            }

            vf = (Vector256<ulong>)typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256), new Type[] { typeof(Vector256<ulong>), typeof(ulong*), typeof(Vector256<long>), typeof(Vector256<ulong>), typeof(byte) }).
                Invoke(null, new object[] { sourceul, Pointer.Box(longTable.inArrayPtr, typeof(ulong*)), indexl, maskul, (byte)8 });
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, longIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed with reflection on ulong with Vector256 long index:", longTable.outArray);
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourceul, (ulong*)(longTable.inArrayPtr), indexl, maskul, 3);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on ulong with invalid scale (IMM) and Vector256 long index");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }

            vf = Avx2.GatherMaskVector256(sourceul, (ulong*)(longTable.inArrayPtr), indexl, maskul, Eight);
            Unsafe.Write(longTable.outArrayPtr, vf);
            if (!longTable.CheckResult((x, y) => x == y, longIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed on ulong with non-const scale (IMM) and Vector256 long index:", longTable.outArray);
            }

            try
            {
                vf = Avx2.GatherMaskVector256(sourceul, (ulong*)(longTable.inArrayPtr), indexl, maskul, invalid);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on ulong with invalid non-const scale (IMM) and Vector256 long index");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }
        }

        // public static unsafe Vector256<double> GatherMaskVector256(Vector256<double> source, double* baseAddress, Vector256<long> index, Vector256<double> mask, byte scale)
        using (TestTable<double, long> doubletTable = new TestTable<double, long>(doubleSourceTable, new double[4]))
        {
            var vd = Avx2.GatherMaskVector256(sourced, (double*)(doubletTable.inArrayPtr), indexl, maskd, 8);
            Unsafe.Write(doubletTable.outArrayPtr, vd);
            if (!doubletTable.CheckResult((x, y) => BitConverter.DoubleToInt64Bits(x) == BitConverter.DoubleToInt64Bits(y), longIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed on double with Vector256 long index:", doubletTable.outArray);
            }

            vd = (Vector256<double>)typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256), new Type[] { typeof(Vector256<double>), typeof(double*), typeof(Vector256<long>), typeof(Vector256<double>), typeof(byte) }).
                Invoke(null, new object[] { sourced, Pointer.Box(doubletTable.inArrayPtr, typeof(double*)), indexl, maskd, (byte)8 });
            Unsafe.Write(doubletTable.outArrayPtr, vd);
            if (!doubletTable.CheckResult((x, y) => BitConverter.DoubleToInt64Bits(x) == BitConverter.DoubleToInt64Bits(y), longIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed with reflection on double with Vector256 long index:", doubletTable.outArray);
            }

            try
            {
                vd = Avx2.GatherMaskVector256(sourced, (double*)(doubletTable.inArrayPtr), indexl, maskd, 3);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on double with invalid scale (IMM) and Vector256 long index");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }

            vd = Avx2.GatherMaskVector256(sourced, (double*)(doubletTable.inArrayPtr), indexl, maskd, Eight);
            Unsafe.Write(doubletTable.outArrayPtr, vd);
            if (!doubletTable.CheckResult((x, y) => BitConverter.DoubleToInt64Bits(x) == BitConverter.DoubleToInt64Bits(y), longIndexTable))
            {
                ReportFailure("AVX2 GatherMaskVector256 failed on double with non-const scale (IMM) and Vector256 long index:", doubletTable.outArray);
            }

            try
            {
                vd = Avx2.GatherMaskVector256(sourced, (double*)(doubletTable.inArrayPtr), indexl, maskd, invalid);
                Console.WriteLine("AVX2 GatherMaskVector256 failed on double with invalid non-const scale (IMM) and Vector256 long index");
                testResult = Fail;
            }
            catch (System.ArgumentOutOfRangeException)
            {
                // success
            }
        }
    }

    return testResult;
}
/// <summary>
/// Runs <c>Avx2.ExtractVector128</c> on this instance's <c>_fld</c>, storing into the output
/// buffer owned by <paramref name="testClass"/>, and then asks <paramref name="testClass"/>
/// to validate what was stored.
/// </summary>
/// <param name="testClass">Test object that supplies the output buffer and performs the validation.</param>
public void RunStructFldScenario(ExtractStoreTest__ExtractVector128Byte1 testClass)
{
    // Immediate operand passed to ExtractVector128; selects which 128-bit lane is stored.
    const byte laneIndex = 1;

    Byte* destination = (Byte*)testClass._dataTable.outArrayPtr;
    Avx2.ExtractVector128(destination, _fld, laneIndex);

    testClass.ValidateResult(_fld, testClass._dataTable.outArrayPtr);
}
/// <summary>
/// Runs <c>Avx2.ExtractVector128</c> on the instance field <c>_fld</c>, storing into this
/// test's output buffer, and validates the stored result.
/// </summary>
public void RunClassFldScenario()
{
    // Immediate operand passed to ExtractVector128; selects which 128-bit lane is stored.
    const byte laneIndex = 1;

    Byte* destination = (Byte*)_dataTable.outArrayPtr;
    Avx2.ExtractVector128(destination, _fld, laneIndex);

    ValidateResult(_fld, _dataTable.outArrayPtr);
}
/// <summary>
/// Runs <c>Avx2.ExtractVector128</c> on the <c>_fld</c> of a freshly constructed local test
/// object, storing into this instance's output buffer. No validation is performed here.
/// </summary>
public void RunLclFldScenario()
{
    // Immediate operand passed to ExtractVector128; selects which 128-bit lane is stored.
    const byte laneIndex = 1;

    SimpleUnaryOpTest__ExtractVector128Byte1Store localTest = new SimpleUnaryOpTest__ExtractVector128Byte1Store();

    Byte* destination = (Byte*)_dataTable.outArrayPtr;
    Avx2.ExtractVector128(destination, localTest._fld, laneIndex);
}
/// <summary>
/// Reads a <c>Vector256&lt;Byte&gt;</c> from the input buffer with <c>Unsafe.Read</c> into a
/// local, then runs <c>Avx2.ExtractVector128</c> on it, storing into the output buffer.
/// No validation is performed here.
/// </summary>
public void RunLclVarScenario_UnsafeRead()
{
    // Immediate operand passed to ExtractVector128; selects which 128-bit lane is stored.
    const byte laneIndex = 1;

    Vector256<Byte> source = Unsafe.Read<Vector256<Byte>>(_dataTable.inArrayPtr);

    Byte* destination = (Byte*)_dataTable.outArrayPtr;
    Avx2.ExtractVector128(destination, source, laneIndex);
}
/// <summary>
/// Widens <paramref name="count"/> bytes starting at <paramref name="input"/> into 16-bit
/// characters at <paramref name="output"/>, checking every chunk with
/// <c>CheckBytesInAsciiRange</c> before it is written.
/// </summary>
/// <param name="input">First source byte; asserted non-null in debug builds.</param>
/// <param name="output">
/// Destination buffer; asserted non-null in debug builds. The method never checks its size,
/// so the caller must provide room for <paramref name="count"/> characters.
/// </param>
/// <param name="count">Number of bytes to convert.</param>
/// <returns>
/// <c>true</c> when all bytes passed the range check and were written; <c>false</c> as soon
/// as one chunk fails the check (chunks written before that point are left in place).
/// </returns>
public static unsafe bool TryGetAsciiString(byte* input, char* output, int count)
{
    Debug.Assert(input != null);
    Debug.Assert(output != null);

    byte* inputEnd = input + count;

    Debug.Assert((long)inputEnd >= Vector256<sbyte>.Count);

    // PERF: so the JIT can reuse the zero from a register
    Vector128<sbyte> zero128 = Vector128<sbyte>.Zero;

    if (Sse2.IsSupported)
    {
        // 32 bytes per iteration while a full 256-bit block is available.
        if (Avx2.IsSupported && input <= inputEnd - Vector256<sbyte>.Count)
        {
            Vector256<sbyte> zero256 = Vector256<sbyte>.Zero;

            do
            {
                var block = Avx.LoadVector256(input).AsSByte();
                if (!CheckBytesInAsciiRange(block, zero256))
                {
                    return false;
                }

                // Interleave each byte with a zero byte (per 128-bit lane).
                var interleavedLow = Avx2.UnpackLow(block, zero256);
                var interleavedHigh = Avx2.UnpackHigh(block, zero256);

                // Bring into the right order
                var firstHalf = Avx2.Permute2x128(interleavedLow, interleavedHigh, 0x20);
                var secondHalf = Avx2.Permute2x128(interleavedLow, interleavedHigh, 0x31);

                ushort* destination = (ushort*)output;
                Avx.Store(destination, firstHalf.AsUInt16());
                Avx.Store(destination + Vector256<ushort>.Count, secondHalf.AsUInt16());

                input += Vector256<sbyte>.Count;
                output += Vector256<sbyte>.Count;
            }
            while (input <= inputEnd - Vector256<sbyte>.Count);

            if (input == inputEnd)
            {
                return true;
            }
        }

        // 16 bytes per iteration while a full 128-bit block is available.
        if (input <= inputEnd - Vector128<sbyte>.Count)
        {
            do
            {
                var block = Sse2.LoadVector128(input).AsSByte();
                if (!CheckBytesInAsciiRange(block, zero128))
                {
                    return false;
                }

                var lowChars = Sse2.UnpackLow(block, zero128).AsUInt16();
                var highChars = Sse2.UnpackHigh(block, zero128).AsUInt16();

                ushort* destination = (ushort*)output;
                Sse2.Store(destination, lowChars);
                Sse2.Store(destination + Vector128<ushort>.Count, highChars);

                input += Vector128<sbyte>.Count;
                output += Vector128<sbyte>.Count;
            }
            while (input <= inputEnd - Vector128<sbyte>.Count);

            if (input == inputEnd)
            {
                return true;
            }
        }
    }
    else if (Vector.IsHardwareAccelerated)
    {
        // Portable Vector<T> path, used only when SSE2 is not available.
        while (input <= inputEnd - Vector<sbyte>.Count)
        {
            Vector<sbyte> block = Unsafe.AsRef<Vector<sbyte>>(input);
            if (!CheckBytesInAsciiRange(block))
            {
                return false;
            }

            Vector.Widen(
                block,
                out Unsafe.AsRef<Vector<short>>(output),
                out Unsafe.AsRef<Vector<short>>(output + Vector<short>.Count));

            input += Vector<sbyte>.Count;
            output += Vector<sbyte>.Count;
        }

        if (input == inputEnd)
        {
            return true;
        }
    }

    if (Environment.Is64BitProcess) // Use Intrinsic switch for branch elimination
    {
        // 64-bit: Loop longs by default
        while (input <= inputEnd - sizeof(long))
        {
            long eightBytes = *(long*)input;
            if (!CheckBytesInAsciiRange(eightBytes))
            {
                return false;
            }

            // BMI2 could be used, but this variant is faster on both Intel and AMD.
            if (Sse2.X64.IsSupported)
            {
                Vector128<sbyte> narrow = Sse2.X64.ConvertScalarToVector128Int64(eightBytes).AsSByte();
                Vector128<ulong> wide = Sse2.UnpackLow(narrow, zero128).AsUInt64();
                Sse2.Store((ulong*)output, wide);
            }
            else
            {
                // Scalar fallback: widen the eight bytes one at a time.
                for (int i = 0; i < sizeof(long); i++)
                {
                    output[i] = (char)input[i];
                }
            }

            input += sizeof(long);
            output += sizeof(long);
        }

        // At most one 4-byte chunk can remain after the 8-byte loop.
        if (input <= inputEnd - sizeof(int))
        {
            int fourBytes = *(int*)input;
            if (!CheckBytesInAsciiRange(fourBytes))
            {
                return false;
            }

            WidenFourAsciiBytesToUtf16AndWriteToBuffer(output, input, fourBytes, zero128);

            input += sizeof(int);
            output += sizeof(int);
        }
    }
    else
    {
        // 32-bit: Loop ints by default
        while (input <= inputEnd - sizeof(int))
        {
            int fourBytes = *(int*)input;
            if (!CheckBytesInAsciiRange(fourBytes))
            {
                return false;
            }

            WidenFourAsciiBytesToUtf16AndWriteToBuffer(output, input, fourBytes, zero128);

            input += sizeof(int);
            output += sizeof(int);
        }
    }

    // Remaining two-byte chunk, if any.
    if (input <= inputEnd - sizeof(short))
    {
        if (!CheckBytesInAsciiRange(*(short*)input))
        {
            return false;
        }

        output[0] = (char)input[0];
        output[1] = (char)input[1];

        input += sizeof(short);
        output += sizeof(short);
    }

    // Final single byte, if any.
    if (input < inputEnd)
    {
        if (!CheckBytesInAsciiRange(*(sbyte*)input))
        {
            return false;
        }

        output[0] = (char)input[0];
    }

    return true;
}
/// <summary>
/// Scans <paramref name="len"/> bytes at <paramref name="buf"/> in 64-byte chunks, builds a
/// 64-bit <c>structurals</c> mask per chunk, and writes the byte offset of every set bit into
/// <c>pj-&gt;structural_indexes</c>. On success <c>pj-&gt;n_structural_indexes</c> holds the
/// number of offsets written (including a trailing entry equal to <paramref name="len"/> when
/// the last offset found is not already <paramref name="len"/>), and the slot after the last
/// entry is set to 0.
/// </summary>
/// <param name="buf">Input bytes. Full 64-byte chunks are loaded directly from this pointer; a
/// final partial chunk is first copied into a space-padded 64-byte stack buffer.</param>
/// <param name="len">Number of input bytes; must not exceed <c>pj-&gt;bytecapacity</c>.</param>
/// <param name="pj">Receives the structural indexes. NOTE(review): the unrolled index writer
/// below may store up to 7 entries past the real count before the count is corrected, so
/// <c>structural_indexes</c> presumably needs that much slack - confirm against the allocator.</param>
/// <returns>
/// <c>false</c> when <paramref name="len"/> exceeds the capacity of <paramref name="pj"/>, or
/// when no structural index at all was produced (for example empty or whitespace-only input);
/// <c>true</c> otherwise.
/// </returns>
/// <exception cref="InvalidOperationException">The last recorded index is greater than <paramref name="len"/>.</exception>
public static bool find_structural_bits(uint8_t *buf, size_t len, ParsedJson *pj)
{
    if (len > pj->bytecapacity)
    {
        Console.WriteLine("Your ParsedJson object only supports documents up to " + pj->bytecapacity + " bytes but you are trying to process " + len + " bytes\n");
        return false;
    }

    uint32_t *base_ptr = pj->structural_indexes;
    uint32_t @base = 0;
    const uint64_t even_bits = 0x5555555555555555UL;
    const uint64_t odd_bits = ~even_bits;

    // for now, just work in 64-byte chunks
    // we have padded the input out to 64 byte multiple with the remainder being
    // zeros

    // persistent state across loop
    uint64_t prev_iter_ends_odd_backslash = 0UL; // either 0 or 1, but a 64-bit value
    uint64_t prev_iter_inside_quote = 0UL; // either all zeros or all ones

    // effectively the very first char is considered to follow "whitespace" for the
    // purposes of pseudo-structural character detection
    uint64_t prev_iter_ends_pseudo_pred = 1UL;
    size_t lenminus64 = len < 64 ? 0 : len - 64;
    size_t idx = 0;
    uint64_t structurals = 0;

    // C#: assign static readonly fields to locals before the loop
    Vector256<byte> low_nibble_mask = s_low_nibble_mask;
    Vector256<byte> high_nibble_mask = s_high_nibble_mask;

    var structural_shufti_mask = Vector256.Create((byte)0x7);
    var whitespace_shufti_mask = Vector256.Create((byte)0x18);
    var slashVec = Vector256.Create((bytechar)'\\').AsByte();
    var ffVec = Vector128.Create((byte)0xFF).AsUInt64();
    var doubleQuoteVec = Vector256.Create((byte)'"');
    var zeroBVec = Vector256.Create((byte)0);
    var vec7f = Vector256.Create((byte)0x7f);

    for (; idx < lenminus64; idx += 64)
    {
        var input_lo = Avx.LoadVector256(buf + idx + 0);
        var input_hi = Avx.LoadVector256(buf + idx + 32);

        ////////////////////////////////////////////////////////////////////////////////////////////
        //     Step 1: detect odd sequences of backslashes
        ////////////////////////////////////////////////////////////////////////////////////////////
        uint64_t bs_bits = cmp_mask_against_input(input_lo, input_hi, slashVec);
        uint64_t start_edges = bs_bits & ~(bs_bits << 1);
        // flip lowest if we have an odd-length run at the end of the prior
        // iteration
        uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
        uint64_t even_starts = start_edges & even_start_mask;
        uint64_t odd_starts = start_edges & ~even_start_mask;
        uint64_t even_carries = bs_bits + even_starts;

        uint64_t odd_carries;
        // must record the carry-out of our odd-carries out of bit 63; this
        // indicates whether the sense of any edge going to the next iteration
        // should be flipped
        bool iter_ends_odd_backslash = add_overflow(bs_bits, odd_starts, &odd_carries);

        odd_carries |= prev_iter_ends_odd_backslash; // push in bit zero as a potential end
                                                     // if we had an odd-numbered run at the
                                                     // end of the previous iteration
        prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1UL : 0x0UL;
        uint64_t even_carry_ends = even_carries & ~bs_bits;
        uint64_t odd_carry_ends = odd_carries & ~bs_bits;
        uint64_t even_start_odd_end = even_carry_ends & odd_bits;
        uint64_t odd_start_even_end = odd_carry_ends & even_bits;
        uint64_t odd_ends = even_start_odd_end | odd_start_even_end;

        ////////////////////////////////////////////////////////////////////////////////////////////
        //     Step 2: detect insides of quote pairs
        ////////////////////////////////////////////////////////////////////////////////////////////
        uint64_t quote_bits = cmp_mask_against_input(input_lo, input_hi, doubleQuoteVec);
        quote_bits = quote_bits & ~odd_ends;
        uint64_t quote_mask = Sse2.X64.ConvertToUInt64(Pclmulqdq.CarrylessMultiply(
            Vector128.Create(quote_bits, 0UL /*C# reversed*/), ffVec, 0));

        // Write out the indexes for the PREVIOUS chunk's structurals mask (hence "idx - 64").
        // The loop is unrolled 8x and may store entries past the real count; resetting @base
        // to next_base afterwards discards those extra entries.
        uint32_t cnt = (uint32_t)hamming(structurals);
        uint32_t next_base = @base + cnt;
        while (structurals != 0)
        {
            base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            @base += 8;
        }
        @base = next_base;

        quote_mask ^= prev_iter_inside_quote;
        prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63); // right shift of a signed value expected to be well-defined and standard compliant as of C++20, John Regher from Utah U. says this is fine code

        var v_lo = Avx2.And(
            Avx2.Shuffle(low_nibble_mask, input_lo),
            Avx2.Shuffle(high_nibble_mask,
                Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(), vec7f)));

        var v_hi = Avx2.And(
            Avx2.Shuffle(low_nibble_mask, input_hi),
            Avx2.Shuffle(high_nibble_mask,
                Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(), vec7f)));
        var tmp_lo = Avx2.CompareEqual(
            Avx2.And(v_lo, structural_shufti_mask), zeroBVec);
        var tmp_hi = Avx2.CompareEqual(
            Avx2.And(v_hi, structural_shufti_mask), zeroBVec);

        uint64_t structural_res_0 = (uint32_t)Avx2.MoveMask(tmp_lo);
        uint64_t structural_res_1 = (uint64_t)Avx2.MoveMask(tmp_hi);
        structurals = ~(structural_res_0 | (structural_res_1 << 32));

        var tmp_ws_lo = Avx2.CompareEqual(
            Avx2.And(v_lo, whitespace_shufti_mask), zeroBVec);
        var tmp_ws_hi = Avx2.CompareEqual(
            Avx2.And(v_hi, whitespace_shufti_mask), zeroBVec);

        uint64_t ws_res_0 = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
        uint64_t ws_res_1 = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
        uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));

        // mask off anything inside quotes
        structurals &= ~quote_mask;

        // add the real quote bits back into our bitmask as well, so we can
        // quickly traverse the strings we've spent all this trouble gathering
        structurals |= quote_bits;

        // Now, establish "pseudo-structural characters". These are non-whitespace
        // characters that are (a) outside quotes and (b) have a predecessor that's
        // either whitespace or a structural character. This means that subsequent
        // passes will get a chance to encounter the first character of every string
        // of non-whitespace and, if we're parsing an atom like true/false/null or a
        // number we can stop at the first whitespace or structural character
        // following it.

        // a qualified predecessor is something that can happen 1 position before a
        // pseudo-structural character
        uint64_t pseudo_pred = structurals | whitespace;
        uint64_t shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
        prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
        uint64_t pseudo_structurals =
            shifted_pseudo_pred & (~whitespace) & (~quote_mask);
        structurals |= pseudo_structurals;

        // now, we've used our close quotes all we need to. So let's switch them off
        // they will be off in the quote mask and on in quote bits.
        structurals &= ~(quote_bits & ~quote_mask);

        //Console.WriteLine($"Iter: {idx}, satur: {structurals}");

        //*(uint64_t *)(pj->structurals + idx / 8) = structurals;
    }

    ////////////////
    /// we use a giant copy-paste which is ugly.
    /// but otherwise the string needs to be properly padded or else we
    /// risk invalidating the UTF-8 checks.
    ////////////
    if (idx < len)
    {
        // Final (possibly partial) chunk: copy it into a 64-byte buffer pre-filled with
        // spaces (0x20) so the vector loads below never read past the caller's data.
        uint8_t *tmpbuf = stackalloc uint8_t[64];
        memset(tmpbuf, 0x20, 64);
        memcpy(tmpbuf, buf + idx, len - idx);
        Vector256<byte> input_lo = Avx.LoadVector256(tmpbuf + 0);
        Vector256<byte> input_hi = Avx.LoadVector256(tmpbuf + 32);

        ////////////////////////////////////////////////////////////////////////////////////////////
        //     Step 1: detect odd sequences of backslashes
        ////////////////////////////////////////////////////////////////////////////////////////////
        uint64_t bs_bits = cmp_mask_against_input(input_lo, input_hi, slashVec);
        uint64_t start_edges = bs_bits & ~(bs_bits << 1);
        // flip lowest if we have an odd-length run at the end of the prior
        // iteration
        uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
        uint64_t even_starts = start_edges & even_start_mask;
        uint64_t odd_starts = start_edges & ~even_start_mask;
        uint64_t even_carries = bs_bits + even_starts;

        uint64_t odd_carries;
        // must record the carry-out of our odd-carries out of bit 63; this
        // indicates whether the sense of any edge going to the next iteration
        // should be flipped
        // (last chunk: the carry-out itself is not needed, but the call still fills odd_carries)
        //bool iter_ends_odd_backslash =
        add_overflow(bs_bits, odd_starts, &odd_carries);

        odd_carries |= prev_iter_ends_odd_backslash; // push in bit zero as a potential end
                                                     // if we had an odd-numbered run at the
                                                     // end of the previous iteration
        //prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
        uint64_t even_carry_ends = even_carries & ~bs_bits;
        uint64_t odd_carry_ends = odd_carries & ~bs_bits;
        uint64_t even_start_odd_end = even_carry_ends & odd_bits;
        uint64_t odd_start_even_end = odd_carry_ends & even_bits;
        uint64_t odd_ends = even_start_odd_end | odd_start_even_end;

        ////////////////////////////////////////////////////////////////////////////////////////////
        //     Step 2: detect insides of quote pairs
        ////////////////////////////////////////////////////////////////////////////////////////////
        uint64_t quote_bits = cmp_mask_against_input(input_lo, input_hi, doubleQuoteVec);
        quote_bits = quote_bits & ~odd_ends;
        uint64_t quote_mask = (uint64_t)Sse2.X64.ConvertToInt64(Pclmulqdq.CarrylessMultiply(
            Vector128.Create(quote_bits, 0UL /*C# reversed*/), ffVec, 0).AsInt64());
        quote_mask ^= prev_iter_inside_quote;

        //BUG? https://github.com/dotnet/coreclr/issues/22813
        //quote_mask = 60;
        //prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63); // right shift of a signed value expected to be well-defined and standard compliant as of C++20

        // Write out the indexes for the previous chunk's structurals mask (hence "idx - 64").
        uint32_t cnt = (uint32_t)hamming(structurals);
        uint32_t next_base = @base + cnt;
        while (structurals != 0)
        {
            base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
            structurals = structurals & (structurals - 1);
            @base += 8;
        }
        @base = next_base;

        // How do we build up a user traversable data structure
        // first, do a 'shufti' to detect structural JSON characters
        // they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c
        // these go into the first 3 buckets of the comparison (1/2/4)

        // we are also interested in the four whitespace characters
        // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
        // these go into the next 2 buckets of the comparison (8/16)

        var v_lo = Avx2.And(
            Avx2.Shuffle(low_nibble_mask, input_lo),
            Avx2.Shuffle(high_nibble_mask,
                Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(), vec7f)));

        var v_hi = Avx2.And(
            Avx2.Shuffle(low_nibble_mask, input_hi),
            Avx2.Shuffle(high_nibble_mask,
                Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(), vec7f)));
        var tmp_lo = Avx2.CompareEqual(
            Avx2.And(v_lo, structural_shufti_mask), zeroBVec);
        var tmp_hi = Avx2.CompareEqual(
            Avx2.And(v_hi, structural_shufti_mask), zeroBVec);

        uint64_t structural_res_0 = (uint32_t)Avx2.MoveMask(tmp_lo);
        uint64_t structural_res_1 = (uint64_t)Avx2.MoveMask(tmp_hi);
        structurals = ~(structural_res_0 | (structural_res_1 << 32));

        // this additional mask and transfer is non-trivially expensive,
        // unfortunately
        var tmp_ws_lo = Avx2.CompareEqual(
            Avx2.And(v_lo, whitespace_shufti_mask), zeroBVec);
        var tmp_ws_hi = Avx2.CompareEqual(
            Avx2.And(v_hi, whitespace_shufti_mask), zeroBVec);

        uint64_t ws_res_0 = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
        uint64_t ws_res_1 = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
        uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));

        // mask off anything inside quotes
        structurals &= ~quote_mask;

        // add the real quote bits back into our bitmask as well, so we can
        // quickly traverse the strings we've spent all this trouble gathering
        structurals |= quote_bits;

        // Now, establish "pseudo-structural characters". These are non-whitespace
        // characters that are (a) outside quotes and (b) have a predecessor that's
        // either whitespace or a structural character. This means that subsequent
        // passes will get a chance to encounter the first character of every string
        // of non-whitespace and, if we're parsing an atom like true/false/null or a
        // number we can stop at the first whitespace or structural character
        // following it.

        // a qualified predecessor is something that can happen 1 position before a
        // pseudo-structural character
        uint64_t pseudo_pred = structurals | whitespace;
        uint64_t shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
        prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
        uint64_t pseudo_structurals =
            shifted_pseudo_pred & (~whitespace) & (~quote_mask);
        structurals |= pseudo_structurals;

        // now, we've used our close quotes all we need to. So let's switch them off
        // they will be off in the quote mask and on in quote bits.
        structurals &= ~(quote_bits & ~quote_mask);
        //*(uint64_t *)(pj->structurals + idx / 8) = structurals;
        idx += 64;
    }

    // Flush the structurals mask left over from the last chunk that was processed.
    uint32_t cnt2 = (uint32_t)hamming(structurals);
    uint32_t next_base2 = @base + cnt2;
    while (structurals != 0)
    {
        base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
        structurals = structurals & (structurals - 1);
        @base += 8;
    }
    @base = next_base2;

    pj->n_structural_indexes = @base;

    // Nothing was recorded (e.g. empty or whitespace-only input). The checks below read
    // base_ptr[n_structural_indexes - 1]; with an unsigned count of 0 that index wraps
    // around and reads far outside the array, so stop here instead.
    if (pj->n_structural_indexes == 0)
    {
        return false;
    }

    if (base_ptr[pj->n_structural_indexes - 1] > len)
    {
        throw new InvalidOperationException("Internal bug");
    }
    if (len != base_ptr[pj->n_structural_indexes - 1])
    {
        // the string might not be NULL terminated, but we add a virtual NULL ending character.
        base_ptr[pj->n_structural_indexes++] = (uint32_t)len;
    }
    base_ptr[pj->n_structural_indexes] = 0; // make it safe to dereference one beyond this array
    return true;
}
/// <summary>Returns the result of <c>Avx2.Xor</c> applied to the two operands.</summary>
/// <param name="lhs">Left operand.</param>
/// <param name="rhs">Right operand.</param>
public static i32 Xor(i32 lhs, i32 rhs)
{
    return Avx2.Xor(lhs, rhs);
}
/// <summary>Returns the result of <c>Avx2.Subtract</c>, i.e. <paramref name="lhs"/> minus <paramref name="rhs"/>.</summary>
/// <param name="lhs">Minuend.</param>
/// <param name="rhs">Subtrahend.</param>
public static i32 Sub(i32 lhs, i32 rhs)
{
    return Avx2.Subtract(lhs, rhs);
}
/// <summary>
/// Returns the result of <c>Avx2.ShiftRightArithmetic</c> applied to <paramref name="lhs"/>
/// with the shift count <paramref name="rhs"/>.
/// </summary>
/// <param name="lhs">Value to shift.</param>
/// <param name="rhs">Shift count passed straight through to the intrinsic.</param>
public static i32 RightShift(i32 lhs, byte rhs)
{
    return Avx2.ShiftRightArithmetic(lhs, rhs);
}
/// <summary>
/// Loads a 256-bit vector from the input buffer with <c>Avx.LoadAlignedVector256</c> into a
/// local, then runs <c>Avx2.ExtractVector128</c> on it, storing into the output buffer.
/// No validation is performed here.
/// </summary>
public void RunLclVarScenario_LoadAligned()
{
    // Immediate operand passed to ExtractVector128; selects which 128-bit lane is stored.
    const byte laneIndex = 1;

    Byte* sourceAddress = (Byte*)(_dataTable.inArrayPtr);
    var source = Avx.LoadAlignedVector256(sourceAddress);

    Byte* destination = (Byte*)_dataTable.outArrayPtr;
    Avx2.ExtractVector128(destination, source, laneIndex);
}
/// <summary>Negates <paramref name="lhs"/> by subtracting it from <c>i32.Zero</c> with <c>Avx2.Subtract</c>.</summary>
/// <param name="lhs">Value to negate.</param>
public static i32 Negate(i32 lhs)
{
    i32 zero = i32.Zero;
    return Avx2.Subtract(zero, lhs);
}
/// <summary>
/// Runs <c>Avx2.ExtractVector128</c> on the instance field <c>_fld</c>, storing into this
/// instance's output buffer. No validation is performed here.
/// </summary>
public void RunFldScenario()
{
    // Immediate operand passed to ExtractVector128; selects which 128-bit lane is stored.
    const byte laneIndex = 1;

    Byte* destination = (Byte*)_dataTable.outArrayPtr;
    Avx2.ExtractVector128(destination, _fld, laneIndex);
}
/// <summary>Returns the result of <c>Avx2.Or</c> applied to the two operands.</summary>
/// <param name="lhs">Left operand.</param>
/// <param name="rhs">Right operand.</param>
public static i32 Or(i32 lhs, i32 rhs)
{
    return Avx2.Or(lhs, rhs);
}