/// <summary>
/// Local-variable scenario: both operands are read with <c>Avx.LoadAlignedVector256</c> into
/// locals, added with <c>Avx2.Add</c>, and the sum is written to the output buffer before
/// the operands and output are handed to <c>ValidateResult</c>.
/// </summary>
public void RunLclVarScenario_LoadAligned()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_LoadAligned));

    var firstOperand = Avx.LoadAlignedVector256((Byte*)_dataTable.inArray1Ptr);
    var secondOperand = Avx.LoadAlignedVector256((Byte*)_dataTable.inArray2Ptr);
    var sum = Avx2.Add(firstOperand, secondOperand);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(firstOperand, secondOperand, _dataTable.outArrayPtr);
}
/// <summary>
/// Computes the Levenshtein edit distance between <paramref name="source"/> and
/// <paramref name="target"/>.
/// </summary>
/// <remarks>
/// The full cost matrix is allocated. Before a row is filled in, every entry of the row
/// above it is incremented by one in place, so the deletion and substitution costs can be
/// read directly and a matching character only has to undo that increment. The increment
/// uses AVX2 when the hardware supports it and a scalar loop otherwise.
/// </remarks>
/// <param name="source">Text to transform.</param>
/// <param name="target">Text to transform <paramref name="source"/> into.</param>
/// <returns>The minimum number of single-character insertions, deletions and substitutions.</returns>
public unsafe int CalculateDistance(ReadOnlySpan<char> source, ReadOnlySpan<char> target)
{
    var vectorWidth = Vector256<int>.Count;

    // Round the row width up to the next multiple of the vector width (always leaving at
    // least one spare slot) so whole-vector loads and stores never leave the row.
    var columns = target.Length + 1;
    columns += vectorWidth - (columns & (vectorWidth - 1));

    var costMatrix = new int[source.Length + 1][];
    for (var row = 0; row < costMatrix.Length; ++row)
    {
        costMatrix[row] = new int[columns];
    }

    for (var i = 1; i <= source.Length; ++i)
    {
        costMatrix[i][0] = i;
    }

    for (var i = 1; i <= target.Length; ++i)
    {
        costMatrix[0][i] = i;
    }

    var allOnesVector = Vector256.Create(1);

    for (var i = 1; i <= source.Length; ++i)
    {
        var previousRow = costMatrix[i - 1];
        var currentRow = costMatrix[i];

        // Add one to every entry of the previous row (padding included; it is never read back).
        if (Avx2.IsSupported)
        {
            fixed (int* prevRowPtr = previousRow)
            {
                for (int columnIndex = 0, last = target.Length + 1; columnIndex <= last; columnIndex += vectorWidth)
                {
                    var columnsCovered = Avx.LoadVector256(prevRowPtr + columnIndex);
                    var addedColumns = Avx2.Add(columnsCovered, allOnesVector);
                    Avx.Store(prevRowPtr + columnIndex, addedColumns);
                }
            }
        }
        else
        {
            for (var columnIndex = 0; columnIndex < previousRow.Length; ++columnIndex)
            {
                previousRow[columnIndex] += 1;
            }
        }

        for (var j = 1; j <= target.Length; ++j)
        {
            var insert = currentRow[j - 1] + 1;
            var delete = previousRow[j];      // already includes the +1 applied above
            var edit = previousRow[j - 1];    // already includes the +1 applied above

            if (source[i - 1] == target[j - 1])
            {
                // Matching characters cost nothing: undo the increment.
                edit -= 1;
            }

            currentRow[j] = Math.Min(Math.Min(insert, delete), edit);
        }
    }

    return costMatrix[source.Length][target.Length];
}
/// <summary>
/// Local-variable scenario: both operands are read with <c>Avx.LoadVector256</c> into locals,
/// added with <c>Avx2.Add</c>, and the sum is written to the output buffer before the
/// operands and output are handed to <c>ValidateResult</c>.
/// </summary>
public void RunLclVarScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_Load));

    var firstOperand = Avx.LoadVector256((UInt32*)_dataTable.inArray1Ptr);
    var secondOperand = Avx.LoadVector256((UInt32*)_dataTable.inArray2Ptr);
    var sum = Avx2.Add(firstOperand, secondOperand);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(firstOperand, secondOperand, _dataTable.outArrayPtr);
}
/// <summary>
/// Local-variable scenario: both operands are read with <c>Unsafe.Read</c> into locals,
/// added with <c>Avx2.Add</c>, and the sum is written to the output buffer before the
/// operands and output are handed to <c>ValidateResult</c>.
/// </summary>
public void RunLclVarScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

    Vector256<Byte> firstOperand = Unsafe.Read<Vector256<Byte>>(_dataTable.inArray1Ptr);
    Vector256<Byte> secondOperand = Unsafe.Read<Vector256<Byte>>(_dataTable.inArray2Ptr);
    Vector256<Byte> sum = Avx2.Add(firstOperand, secondOperand);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(firstOperand, secondOperand, _dataTable.outArrayPtr);
}
/// <summary>
/// For every vector slot of <paramref name="output"/>, stores <c>e / (e + 1)</c> where
/// <c>e = NN_utils.exp256_ps(input.xmm[i])</c>.
/// </summary>
/// <remarks>
/// NOTE(review): presumably <c>exp256_ps</c> is an element-wise e^x, which makes this the
/// logistic sigmoid; confirm against its definition. The loop is driven by the length of
/// <c>output.xmm</c>, so <c>input.xmm</c> must be at least that long.
/// </remarks>
/// <param name="input">Tensor whose <c>xmm</c> vectors are read.</param>
/// <param name="output">Tensor whose <c>xmm</c> vectors are overwritten.</param>
public static void Activation_Sigmoid(Tensor input, Tensor output)
{
    __m256 ones = Vector256.Create(1.0f);

    int slot = 0;
    while (slot < output.xmm.Length)
    {
        var exponential = NN_utils.exp256_ps(input.xmm[slot]);
        output.xmm[slot] = Avx2.Divide(exponential, Avx2.Add(exponential, ones));
        ++slot;
    }
}
/// <summary>
/// Sums an <c>int</c> array into a <c>long</c>, widening four elements at a time to 64-bit
/// lanes with AVX2 so the running sums cannot overflow 32 bits.
/// </summary>
/// <param name="a">Values to add up.</param>
/// <returns>The sum of all elements of <paramref name="a"/>.</returns>
private unsafe long MyTestAddIntLong(int[] a)
{
    // Per-lane running sums, starting at zero.
    var vz = Vector256<long>.Zero;
    int simdLength = Vector256<long>.Count;
    int i = 0;
    long total = 0;

    if (Avx2.IsSupported)
    {
        // Last index from which a full group of simdLength ints can still be read.
        // Stopping here keeps the load inside the array and leaves the remainder for
        // the scalar loop below.
        int lastVectorStart = a.Length - simdLength;

        fixed (int* ptrA = a)
        {
            for (; i <= lastVectorStart; i += simdLength)
            {
                // Load four ints and sign-extend them to four longs.
                // (Avx.LoadVector256 would do if the total were known to fit in an int.)
                var v2 = Avx2.ConvertToVector256Int64(ptrA + i);
                vz = Avx2.Add(vz, v2);
            }
        }

        // Spill the four lane sums to an array and add them up. With only four
        // elements LINQ's Sum is sufficient.
        var temp = new long[simdLength];
        fixed (long* ptrI = temp)
        {
            Avx.Store(ptrI, vz);
        }

        total = temp.Sum();
    }

    // Add the elements that did not fill a whole vector (or everything, without AVX2).
    for (; i < a.Length; i++)
    {
        total += a[i];
    }

    return total;
}
/// <summary>
/// Decodes 32 input bytes, each carrying six bits as <c>(value + 32)</c>, into 24 packed
/// output bytes (four input bytes become three output bytes, first input byte most significant).
/// </summary>
/// <remarks>
/// The result is written with two overlapping 16-byte stores, at <paramref name="dest"/> and
/// at <paramref name="dest"/> + 12, so 28 bytes of the destination are touched; the last
/// four of them are written as zero.
/// </remarks>
/// <param name="source">Pointer to at least 32 readable bytes.</param>
/// <param name="dest">Pointer to at least 28 writable bytes.</param>
public static unsafe void Decode32Bytes(byte* source, byte* dest)
{
    // Remove the +32 bias from every byte (wrapping byte arithmetic).
    Vector256<byte> encoded = Unsafe.As<byte, Vector256<byte>>(ref source[0]);
    Vector256<byte> unbiased = Avx2.Add(encoded, Vector256.Create((sbyte)-32).AsByte());

    // Within each 32-bit group, keep the low six bits of one byte at a time.
    Vector256<uint> first = Avx2.And(unbiased, Vector256.Create((uint)0x0000_003f).AsByte()).AsUInt32();
    Vector256<uint> second = Avx2.And(unbiased, Vector256.Create((uint)0x0000_3f00).AsByte()).AsUInt32();
    Vector256<uint> third = Avx2.And(unbiased, Vector256.Create((uint)0x003f_0000).AsByte()).AsUInt32();
    Vector256<uint> fourth = Avx2.And(unbiased, Vector256.Create((uint)0x3f00_0000).AsByte()).AsUInt32();

    // Slide each six-bit field to its place in a 24-bit value:
    //   00000000 aaaaaabb bbbbcccc ccdddddd
    first = Avx2.ShiftLeftLogical(first, 18);    // bits 0..5   -> bits 18..23
    second = Avx2.ShiftLeftLogical(second, 4);   // bits 8..13  -> bits 12..17
    third = Avx2.ShiftRightLogical(third, 10);   // bits 16..21 -> bits 6..11
    fourth = Avx2.ShiftRightLogical(fourth, 24); // bits 24..29 -> bits 0..5

    Vector256<byte> packed = Avx2.Or(
        Avx2.Or(first.AsByte(), second.AsByte()),
        Avx2.Or(third.AsByte(), fourth.AsByte()));

    // Each 32-bit group now holds its three payload bytes in reverse order plus one unused
    // byte. Within each 128-bit half, reverse every triple and close the gaps; the four
    // trailing slots of each half are zeroed (0x80).
    Vector256<byte> compaction = Vector256.Create(
        0x02, 0x01, 0x00,
        0x06, 0x05, 0x04,
        0x0a, 0x09, 0x08,
        0x0e, 0x0d, 0x0c,
        0x80, 0x80, 0x80, 0x80,
        0x12, 0x11, 0x10,
        0x16, 0x15, 0x14,
        0x1a, 0x19, 0x18,
        0x1e, 0x1d, 0x1c,
        0x80, 0x80, 0x80, 0x80);
    Vector256<byte> decoded = Avx2.Shuffle(packed, compaction);

    // 12 payload bytes per half; the second store overwrites the first half's zero padding.
    Sse2.Store(dest, decoded.GetLower());
    Sse2.Store(dest + 12, decoded.GetUpper());
}
/// <summary>
/// Folds 128 bytes of <paramref name="data"/> (two consecutive 64-byte stripes) into the two
/// 256-bit accumulator vectors.
/// </summary>
/// <remarks>
/// For each 32-byte half of a stripe: the data is XOR-ed with the secret, the result is
/// multiplied with a shuffled copy of itself (<c>ShuffleDataKey</c>), and that product plus a
/// shuffled copy of the raw data (<c>ShuffleDataSwap</c>) is added to the accumulator. The
/// second stripe reads the secret 8 bytes further on than the first.
/// </remarks>
/// <param name="accumulator">Accumulator whose <c>Data256.Data0</c>/<c>Data1</c> are updated.</param>
/// <param name="data">Pointer to at least 128 readable input bytes.</param>
/// <param name="secret">Pointer to the secret; bytes 0..71 are read.</param>
private static void Accumulate1024Avx2(ref Accumulator accumulator, byte* data, byte* secret)
{
    PrefetchNonTemporalNext(data);
    PrefetchNonTemporalNext(data + 0x40);

    // ---- first stripe, lower half ----
    var input0 = Avx2.LoadVector256(data + 0x00u).AsUInt64();
    var keyed0 = Avx2.Xor(input0, Avx2.LoadVector256(secret + 0x00u).AsUInt64());
    var product0 = Avx2.Multiply(keyed0.AsUInt32(), Avx2.Shuffle(keyed0.AsUInt32(), ShuffleDataKey));
    var swapped0 = Avx2.Shuffle(input0.AsUInt32(), ShuffleDataSwap).AsUInt64();

    // ---- first stripe, upper half ----
    var input1 = Avx2.LoadVector256(data + 0x20u).AsUInt64();
    var keyed1 = Avx2.Xor(input1, Avx2.LoadVector256(secret + 0x20u).AsUInt64());
    var product1 = Avx2.Multiply(keyed1.AsUInt32(), Avx2.Shuffle(keyed1.AsUInt32(), ShuffleDataKey));
    var swapped1 = Avx2.Shuffle(input1.AsUInt32(), ShuffleDataSwap).AsUInt64();

    var running0 = Avx2.Add(product0, Avx2.Add(accumulator.Data256.Data0, swapped0));
    var running1 = Avx2.Add(product1, Avx2.Add(accumulator.Data256.Data1, swapped1));

    // ---- second stripe, lower half (secret advanced by 8 bytes) ----
    var input2 = Avx2.LoadVector256(data + 0x40u).AsUInt64();
    var keyed2 = Avx2.Xor(input2, Avx2.LoadVector256(secret + 0x08u).AsUInt64());
    var product2 = Avx2.Multiply(keyed2.AsUInt32(), Avx2.Shuffle(keyed2.AsUInt32(), ShuffleDataKey));
    var swapped2 = Avx2.Shuffle(input2.AsUInt32(), ShuffleDataSwap).AsUInt64();

    // ---- second stripe, upper half ----
    var input3 = Avx2.LoadVector256(data + 0x60u).AsUInt64();
    var keyed3 = Avx2.Xor(input3, Avx2.LoadVector256(secret + 0x28u).AsUInt64());
    var product3 = Avx2.Multiply(keyed3.AsUInt32(), Avx2.Shuffle(keyed3.AsUInt32(), ShuffleDataKey));
    var swapped3 = Avx2.Shuffle(input3.AsUInt32(), ShuffleDataSwap).AsUInt64();

    accumulator.Data256.Data0 = Avx2.Add(product2, Avx2.Add(running0, swapped2));
    accumulator.Data256.Data1 = Avx2.Add(product3, Avx2.Add(running1, swapped3));
}
/// <summary>
/// Checks <c>Avx2.Add</c> on byte vectors: for every combination of lower/upper-half fill
/// values (each range currently holds only 0), the 256-bit sum must equal, element by
/// element, a vector built from the scalar sums replicated across every byte.
/// </summary>
public void Add2_Byte()
{
    // OR-s the value into all eight byte positions of a 64-bit word.
    ulong ReplicateIntoEveryByte(ulong value)
    {
        ulong pattern = 0;
        for (int shift = 0; shift < 64; shift += 8)
        {
            pattern |= value << shift;
        }

        return pattern;
    }

    for (int leftLow = 0; leftLow < 1; leftLow++)
    {
        var leftLowHalf = Vector128.Create((Byte)leftLow);

        for (int leftHigh = 0; leftHigh < 1; leftHigh++)
        {
            var left = Vector256.Create(leftLowHalf, Vector128.Create((Byte)leftHigh));

            for (int rightLow = 0; rightLow < 1; rightLow++)
            {
                var rightLowHalf = Vector128.Create((Byte)rightLow);

                for (int rightHigh = 0; rightHigh < 1; rightHigh++)
                {
                    var right = Vector256.Create(rightLowHalf, Vector128.Create((Byte)rightHigh));

                    var actual = Avx2.Add(left, right);

                    var expectedHighHalf = Vector128.Create(ReplicateIntoEveryByte((UInt64)(leftHigh + rightHigh)));
                    var expectedLowHalf = Vector128.Create(ReplicateIntoEveryByte((UInt64)(leftLow + rightLow)));
                    var expected = Vector256.Create(expectedLowHalf, expectedHighHalf).AsByte();

                    for (int element = 0; element < 32; element++)
                    {
                        Assert.AreEqual(expected.GetElement(element), actual.GetElement(element));
                    }
                }
            }
        }
    }
}
/// <summary>
/// Basic scenario: the two <c>Avx.LoadVector256</c> results are passed straight into
/// <c>Avx2.Add</c> (no operand locals); the sum is written to the output buffer and the
/// input and output pointers are handed to <c>ValidateResult</c>.
/// </summary>
public void RunBasicScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_Load));

    var sum = Avx2.Add(
        Avx.LoadVector256((Byte*)_dataTable.inArray1Ptr),
        Avx.LoadVector256((Byte*)_dataTable.inArray2Ptr));

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
}
/// <summary>
/// Vertical (element-wise) addition of two 16-element vectors, one counting up from 0 to 15
/// and one counting down from 15 to 0. The sum is computed and then discarded; both loops
/// currently run exactly once.
/// </summary>
public void 垂直加算Int16()
{
    for (var outer = 0; outer < 1; outer++)
    {
        var ascending = Vector256.Create(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        var descending = Vector256.Create(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

        for (var inner = 0; inner < 1; inner++)
        {
            _ = Avx2.Add(ascending, descending);
        }
    }
}
/// <summary>
/// Computes the four integer weights <c>(1-t)^3</c>, <c>3(1-t)^2 t</c>, <c>3(1-t) t^2</c> and
/// <c>t^3</c>, broadcasts each (truncated to a byte) across a vector, subtracts the matching
/// static vector <c>s_v256i_0</c>..<c>s_v256i_3</c> from it, and returns the byte-wise sum of
/// the four differences.
/// </summary>
/// <param name="t">Integer parameter the weights are derived from.</param>
/// <returns>The byte-wise sum of the four (weight - static vector) terms.</returns>
public static Vector256<byte> F1_v256i(int t)
{
    int complement = 1 - t;

    int weight0 = complement * complement * complement;
    int weight1 = 3 * complement * complement * t;
    int weight2 = 3 * complement * t * t;
    int weight3 = t * t * t;

    Vector256<byte> term0 = Avx2.Subtract(Avx.SetAllVector256((byte)weight0), s_v256i_0);
    Vector256<byte> term1 = Avx2.Subtract(Avx.SetAllVector256((byte)weight1), s_v256i_1);
    Vector256<byte> term2 = Avx2.Subtract(Avx.SetAllVector256((byte)weight2), s_v256i_2);
    Vector256<byte> term3 = Avx2.Subtract(Avx.SetAllVector256((byte)weight3), s_v256i_3);

    Vector256<byte> firstPair = Avx2.Add(term0, term1);
    Vector256<byte> secondPair = Avx2.Add(term2, term3);
    return Avx2.Add(firstPair, secondPair);
}
/// <summary>
/// Class-variable scenario: adds the two static operand fields with <c>Avx2.Add</c>, writes
/// the sum to the output buffer and hands the operands and output to <c>ValidateResult</c>.
/// </summary>
public void RunClsVarScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario));

    var sum = Avx2.Add(_clsVar1, _clsVar2);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
}
/// <summary>
/// Folds one stripe of <paramref name="data"/> into the 256-bit accumulator vectors.
/// </summary>
/// <remarks>
/// For each 32-byte chunk: the data is XOR-ed with the secret, the result is multiplied with
/// a shuffled copy of itself (<c>ShuffleDataKey</c>), and that product plus a shuffled copy
/// of the raw data (<c>ShuffleDataSwap</c>) is added to the accumulator. When
/// <c>UnrollCount</c> is at least 2 the two chunks are written out explicitly; otherwise a
/// loop walks <c>StripeLength</c> bytes in 32-byte steps.
/// </remarks>
/// <param name="accumulator">Accumulator whose <c>Data256</c> vectors are updated.</param>
/// <param name="data">Pointer to the input stripe.</param>
/// <param name="secret">Pointer to the secret bytes matching the stripe.</param>
private static void Accumulate512Avx2(ref Accumulator accumulator, byte* data, byte* secret)
{
    PrefetchNonTemporalNext(data);
    PrefetchNonTemporalNext(secret);

    if (UnrollCount < 2u)
    {
        // Rolled form: one 32-byte chunk per iteration.
        for (uint offset = 0u; offset < StripeLength; offset += 0x20u)
        {
            var input = Avx2.LoadVector256(data + offset).AsUInt64();
            var keyed = Avx2.Xor(input, Avx2.LoadVector256(secret + offset).AsUInt64());
            var product = Avx2.Multiply(keyed.AsUInt32(), Avx2.Shuffle(keyed.AsUInt32(), ShuffleDataKey));
            var swapped = Avx2.Shuffle(input.AsUInt32(), ShuffleDataSwap).AsUInt64();

            var previous = accumulator.Data256.AtOffset(offset);
            accumulator.Data256.AtOffset(offset) = Avx2.Add(product, Avx2.Add(previous, swapped));
        }

        return;
    }

    // Unrolled form: lower chunk.
    var input0 = Avx2.LoadVector256(data + 0x00u).AsUInt64();
    var keyed0 = Avx2.Xor(input0, Avx2.LoadVector256(secret + 0x00u).AsUInt64());
    var product0 = Avx2.Multiply(keyed0.AsUInt32(), Avx2.Shuffle(keyed0.AsUInt32(), ShuffleDataKey));
    var swapped0 = Avx2.Shuffle(input0.AsUInt32(), ShuffleDataSwap).AsUInt64();

    // Unrolled form: upper chunk.
    var input1 = Avx2.LoadVector256(data + 0x20u).AsUInt64();
    var keyed1 = Avx2.Xor(input1, Avx2.LoadVector256(secret + 0x20u).AsUInt64());
    var product1 = Avx2.Multiply(keyed1.AsUInt32(), Avx2.Shuffle(keyed1.AsUInt32(), ShuffleDataKey));
    var swapped1 = Avx2.Shuffle(input1.AsUInt32(), ShuffleDataSwap).AsUInt64();

    var updated0 = Avx2.Add(product0, Avx2.Add(accumulator.Data256.Data0, swapped0));
    var updated1 = Avx2.Add(product1, Avx2.Add(accumulator.Data256.Data1, swapped1));

    accumulator.Data256.Data0 = updated0;
    accumulator.Data256.Data1 = updated1;
}
// Vector addition, overflow edition: adds 250 to each of the first 32 bytes of the array
// using byte lanes, so any element above 5 wraps around. The result is not returned.
// Throws ArgumentNullException / ArgumentException instead of reading outside the array
// when fewer than 32 bytes are supplied.
private unsafe void Test3AddOverflow(byte[] aa)
{
    if (aa == null)
    {
        throw new System.ArgumentNullException(nameof(aa));
    }

    // Avx.LoadVector256 always reads a full 32-byte vector.
    if (aa.Length < Vector256<byte>.Count)
    {
        throw new System.ArgumentException("At least " + Vector256<byte>.Count + " bytes are required.", nameof(aa));
    }

    Vector256<byte> v = Vector256.Create((byte)250);
    Vector256<byte> total;
    fixed (byte* ptrA = aa)
    {
        Vector256<byte> tempV = Avx.LoadVector256(ptrA);
        total = Avx2.Add(v, tempV);
    }
}
// To avoid overflow, the first 8 bytes of the array are widened into an int vector before
// 250 is added to each lane. The result is not returned.
// Throws ArgumentNullException / ArgumentException instead of reading outside the array
// when fewer than 8 bytes are supplied.
private unsafe void Test5Add(byte[] aa)
{
    if (aa == null)
    {
        throw new System.ArgumentNullException(nameof(aa));
    }

    // ConvertToVector256Int32(byte*) reads one byte per int lane, i.e. 8 bytes.
    if (aa.Length < Vector256<int>.Count)
    {
        throw new System.ArgumentException("At least " + Vector256<int>.Count + " bytes are required.", nameof(aa));
    }

    Vector256<int> v = Vector256.Create((int)250);
    Vector256<int> total;
    fixed (byte* ptrA = aa)
    {
        Vector256<int> tempV = Avx2.ConvertToVector256Int32(ptrA);
        total = Avx2.Add(v, tempV);
    }
}
/// <summary>
/// Basic scenario: the two <c>Unsafe.Read</c> results are passed straight into
/// <c>Avx2.Add</c> (no operand locals); the sum is written to the output buffer and the
/// input and output pointers are handed to <c>ValidateResult</c>.
/// </summary>
public void RunBasicScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_UnsafeRead));

    Vector256<Byte> sum = Avx2.Add(
        Unsafe.Read<Vector256<Byte>>(_dataTable.inArray1Ptr),
        Unsafe.Read<Vector256<Byte>>(_dataTable.inArray2Ptr));

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
}
/// <summary>
/// Propagates lengths forward in two steps. First, the result of
/// <c>push_last_byte_of_a_to_b(previous_carries, initial_lengths)</c> minus 1 (saturating at 0)
/// is added to <paramref name="initial_lengths"/>. Then the result of
/// <c>push_last_2bytes_of_a_to_b(previous_carries, sum)</c> minus 2 (saturating at 0) is
/// added to that sum.
/// </summary>
/// <remarks>
/// NOTE(review): the two <c>push_last_*</c> helpers are defined elsewhere; from their names
/// they presumably shift the vector by one / two bytes while pulling in the trailing bytes
/// of the previous block; confirm against their definitions.
/// </remarks>
/// <param name="initial_lengths">Per-byte lengths for the current block.</param>
/// <param name="previous_carries">Carry vector produced for the previous block.</param>
/// <returns>The per-byte values after both propagation steps.</returns>
static Vector256<byte> avxcarryContinuations(Vector256<byte> initial_lengths, Vector256<byte> previous_carries)
{
    Vector256<byte> one = Vector256.Create((byte)1);
    var shiftedOnce = push_last_byte_of_a_to_b(previous_carries, initial_lengths);
    Vector256<byte> firstPass = Avx2.Add(initial_lengths, Avx2.SubtractSaturate(shiftedOnce, one));

    Vector256<byte> two = Vector256.Create((byte)2);
    var shiftedTwice = push_last_2bytes_of_a_to_b(previous_carries, firstPass);
    return Avx2.Add(firstPass, Avx2.SubtractSaturate(shiftedTwice, two));
}
// Array addition: widens the bytes of the array eight at a time into int lanes and
// accumulates them lane by lane. The per-lane totals are not returned.
// A trailing group shorter than eight bytes is zero-padded in a local before it is loaded,
// so no load ever reads outside the array.
private unsafe void Test6Add(byte[] aa)
{
    Vector256<int> total = Vector256<int>.Zero;
    int step = Vector256<int>.Count;

    fixed (byte* ptrA = aa)
    {
        int i = 0;

        // Whole groups: ConvertToVector256Int32(byte*) reads exactly `step` (8) bytes.
        for (; i <= aa.Length - step; i += step)
        {
            Vector256<int> tempV = Avx2.ConvertToVector256Int32(ptrA + i);
            total = Avx2.Add(total, tempV);
        }

        // Leftover bytes: copy them into a zeroed 8-byte local so the same load can be used.
        int remaining = aa.Length - i;
        if (remaining > 0)
        {
            ulong padded = 0;
            System.Buffer.MemoryCopy(ptrA + i, &padded, sizeof(ulong), remaining);
            total = Avx2.Add(total, Avx2.ConvertToVector256Int32((byte*)&padded));
        }
    }
}
/// <summary>
/// Struct-local-field scenario: creates a local <c>TestStruct</c>, loads its two fields
/// through their addresses with <c>Avx.LoadVector256</c>, adds them with <c>Avx2.Add</c>,
/// writes the sum to the output buffer and hands the fields and output to
/// <c>ValidateResult</c>.
/// </summary>
public void RunStructLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario_Load));

    var localStruct = TestStruct.Create();

    var sum = Avx2.Add(
        Avx.LoadVector256((Byte*)(&localStruct._fld1)),
        Avx.LoadVector256((Byte*)(&localStruct._fld2)));

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(localStruct._fld1, localStruct._fld2, _dataTable.outArrayPtr);
}
/// <summary>
/// Advances the field by one generation, 32 cells per iteration, then swaps the current and
/// next buffers and clears the first and last cell of the bordered rows.
/// </summary>
/// <remarks>
/// For every cell the eight neighbouring bytes are summed, the cell's own state is shifted
/// into bit 3 and OR-ed in, and the combined value indexes <c>v_lookup</c> to produce the
/// next state. NOTE(review): <c>v_lookup</c> is defined elsewhere; it presumably encodes
/// "alive with two neighbours, or exactly three neighbours"; confirm against its definition.
/// </remarks>
public override void Step()
{
    fixed (byte* fieldPtr = currentField, nextFieldPtr = nextField)
    {
        // The first and last two rows are skipped so every neighbour load stays in the buffer.
        int limit = currentField.Length - 2 * WIDTH;

        for (int offset = 2 * WIDTH; offset < limit; offset += 32)
        {
            byte* centre = fieldPtr + offset;
            byte* rowAbove = centre - WIDTH;
            byte* rowBelow = centre + WIDTH;

            // Byte-wise sums of the three neighbours above, the two beside and the three below.
            Vector256<byte> aboveCount = Avx2.Add(
                Avx2.Add(Avx.LoadVector256(rowAbove - 1), Avx.LoadVector256(rowAbove)),
                Avx.LoadVector256(rowAbove + 1));
            Vector256<byte> besideCount = Avx2.Add(
                Avx.LoadVector256(centre - 1),
                Avx.LoadVector256(centre + 1));
            Vector256<byte> belowCount = Avx2.Add(
                Avx2.Add(Avx.LoadVector256(rowBelow - 1), Avx.LoadVector256(rowBelow)),
                Avx.LoadVector256(rowBelow + 1));

            Vector256<byte> neighbourCount = Avx2.Add(Avx2.Add(aboveCount, besideCount), belowCount);

            // Move the cell's own state into bit 3 and merge it with the neighbour count.
            Vector256<byte> selfBit = Avx2.ShiftLeftLogical(Avx.LoadVector256(centre).AsUInt64(), (byte)3).AsByte();
            Vector256<byte> lookupIndex = Avx2.Or(neighbourCount, selfBit);

            Avx2.Store(nextFieldPtr + offset, Avx2.Shuffle(v_lookup, lookupIndex));
        }

        // The freshly written buffer becomes the current one.
        byte[] previousField = currentField;
        currentField = nextField;
        nextField = previousField;
    }

    for (int y = 1; y < HEIGHT - 1; y++)
    {
        int rowStart = WIDTH + y * WIDTH;
        currentField[rowStart] = 0;
        currentField[rowStart + WIDTH - 1] = 0;
    }
}
/// <summary>
/// Applies four add / xor / rotate-left steps (rotations of 16, 12, 8 and 7 bits) to the
/// four state vectors, lane by lane, updating all four in place.
/// </summary>
/// <param name="a">First state vector (updated).</param>
/// <param name="b">Second state vector (updated).</param>
/// <param name="c">Third state vector (updated).</param>
/// <param name="d">Fourth state vector (updated).</param>
private static void QuarterRound(ref Vector256<uint> a, ref Vector256<uint> b, ref Vector256<uint> c, ref Vector256<uint> d)
{
    Vector256<uint> va = a;
    Vector256<uint> vb = b;
    Vector256<uint> vc = c;
    Vector256<uint> vd = d;

    // a += b; d = rotl(a ^ d, 16)
    va = Avx2.Add(va, vb);
    vd = Avx2.Xor(va, vd).RotateLeftUInt32_16();

    // c += d; b = rotl(b ^ c, 12)
    vc = Avx2.Add(vc, vd);
    vb = Avx2.Xor(vb, vc).RotateLeftUInt32(12);

    // a += b; d = rotl(a ^ d, 8)
    va = Avx2.Add(va, vb);
    vd = Avx2.Xor(va, vd).RotateLeftUInt32_8();

    // c += d; b = rotl(b ^ c, 7)
    vc = Avx2.Add(vc, vd);
    vb = Avx2.Xor(vb, vc).RotateLeftUInt32(7);

    a = va;
    b = vb;
    c = vc;
    d = vd;
}
/// <summary>
/// Struct-field scenario: pins this struct's two operand fields, loads them with
/// <c>Avx.LoadVector256</c>, adds them with <c>Avx2.Add</c>, writes the sum to the output
/// buffer of <paramref name="testClass"/> and asks it to validate the result.
/// </summary>
/// <param name="testClass">Test instance that owns the output buffer and the validation routine.</param>
public void RunStructFldScenario_Load(SimpleBinaryOpTest__AddByte testClass)
{
    fixed (Vector256<Byte>* firstField = &_fld1)
    fixed (Vector256<Byte>* secondField = &_fld2)
    {
        Vector256<Byte> sum = Avx2.Add(
            Avx.LoadVector256((Byte*)firstField),
            Avx.LoadVector256((Byte*)secondField));

        Unsafe.Write(testClass._dataTable.outArrayPtr, sum);
        testClass.ValidateResult(_fld1, _fld2, testClass._dataTable.outArrayPtr);
    }
}
/// <summary>
/// Updates one vector-wide section of a diagonal of an edit-distance computation, for cost
/// elements of type <c>int</c> or <c>ushort</c>. Any other <typeparamref name="T"/> is a no-op.
/// </summary>
/// <remarks>
/// With <c>n = Vector256&lt;T&gt;.Count</c>: n source characters ending just before
/// <paramref name="rowIndex"/> are compared against n target characters starting at
/// <paramref name="columnIndex"/> - 1 taken in reverse order. The compare result
/// (all ones, i.e. -1, on a match) is added to the values read from diagonal 1, the minimum
/// with two neighbouring windows of diagonal 2 is taken, one is added, and the result is
/// stored back into diagonal 1 one element further on than where it was read.
/// </remarks>
/// <param name="refDiag1Ptr">Diagonal buffer that is read and then overwritten.</param>
/// <param name="refDiag2Ptr">Diagonal buffer that is only read.</param>
/// <param name="sourcePtr">Source characters.</param>
/// <param name="targetPtr">Target characters.</param>
/// <param name="rowIndex">Row position of the section; read but not modified here.</param>
/// <param name="columnIndex">Column position of the section.</param>
public static unsafe void CalculateDiagonalSection_Avx2<T>(void* refDiag1Ptr, void* refDiag2Ptr, char* sourcePtr, char* targetPtr, ref int rowIndex, int columnIndex) where T : struct
{
    int lanes = Vector256<T>.Count;

    if (typeof(T) == typeof(int))
    {
        int* diag1Window = (int*)refDiag1Ptr + rowIndex - lanes;
        int* diag2Window = (int*)refDiag2Ptr + rowIndex - lanes;

        // Widen the UTF-16 code units to 32-bit lanes.
        Vector256<int> sourceChars = Avx2.ConvertToVector256Int32((ushort*)sourcePtr + rowIndex - lanes);
        Vector256<int> targetChars = Avx2.ConvertToVector256Int32((ushort*)targetPtr + columnIndex - 1);

        // Reverse the target lanes: within each 128-bit half, then swap the halves.
        Vector256<int> halfReversed = Avx2.Shuffle(targetChars, 0x1b);
        Vector256<int> targetReversed = Avx2.Permute2x128(halfReversed, halfReversed, 1);

        Vector256<int> matchMask = Avx2.CompareEqual(sourceChars, targetReversed);
        Vector256<int> substitution = Avx2.Add(Avx.LoadDquVector256(diag1Window), matchMask);
        Vector256<int> deletion = Avx.LoadDquVector256(diag2Window + 1);
        Vector256<int> insertion = Avx.LoadDquVector256(diag2Window);

        Vector256<int> cheapest = Avx2.Min(Avx2.Min(insertion, deletion), substitution);
        Avx.Store(diag1Window + 1, Avx2.Add(cheapest, Vector256.Create(1)));
        return;
    }

    if (typeof(T) == typeof(ushort))
    {
        ushort* diag1Window = (ushort*)refDiag1Ptr + rowIndex - lanes;
        ushort* diag2Window = (ushort*)refDiag2Ptr + rowIndex - lanes;

        Vector256<ushort> sourceChars = Avx.LoadDquVector256((ushort*)sourcePtr + rowIndex - lanes);
        Vector256<ushort> targetChars = Avx.LoadDquVector256((ushort*)targetPtr + columnIndex - 1);

        // Reverse the target lanes: within each 128-bit half (byte shuffle), then swap the halves.
        Vector256<ushort> halfReversed = Avx2.Shuffle(targetChars.AsByte(), REVERSE_USHORT_AS_BYTE_256).AsUInt16();
        Vector256<ushort> targetReversed = Avx2.Permute2x128(halfReversed, halfReversed, 1);

        Vector256<ushort> matchMask = Avx2.CompareEqual(sourceChars, targetReversed);
        Vector256<ushort> substitution = Avx2.Add(Avx.LoadDquVector256(diag1Window), matchMask);
        Vector256<ushort> deletion = Avx.LoadDquVector256(diag2Window + 1);
        Vector256<ushort> insertion = Avx.LoadDquVector256(diag2Window);

        Vector256<ushort> cheapest = Avx2.Min(Avx2.Min(insertion, deletion), substitution);
        Avx.Store(diag1Window + 1, Avx2.Add(cheapest, Vector256.Create((ushort)1)));
    }
}
/// <summary>
/// Adds the vector returned by <c>Vector256One&lt;T&gt;()</c> to <paramref name="v1"/>, using
/// <c>Avx.Add</c> for <c>float</c>/<c>double</c> and <c>Avx2.Add</c> for the integer element types.
/// </summary>
/// <typeparam name="T">Element type; one of the ten primitive numeric types handled below.</typeparam>
/// <param name="v1">Vector to add to.</param>
/// <returns>The element-wise sum of <paramref name="v1"/> and <c>Vector256One&lt;T&gt;()</c>.</returns>
/// <exception cref="NotSupportedException"><typeparamref name="T"/> is not a handled element type.</exception>
public static Vector256<T> Vector256PlusOne<T>(Vector256<T> v1) where T : struct
{
    Vector256<T> one = Vector256One<T>();

    if (typeof(T) == typeof(float))
    {
        var sum = Avx.Add(Avx.StaticCast<T, float>(v1), Avx.StaticCast<T, float>(one));
        return Avx.StaticCast<float, T>(sum);
    }

    if (typeof(T) == typeof(double))
    {
        var sum = Avx.Add(Avx.StaticCast<T, double>(v1), Avx.StaticCast<T, double>(one));
        return Avx.StaticCast<double, T>(sum);
    }

    if (typeof(T) == typeof(byte))
    {
        var sum = Avx2.Add(Avx.StaticCast<T, byte>(v1), Avx.StaticCast<T, byte>(one));
        return Avx.StaticCast<byte, T>(sum);
    }

    if (typeof(T) == typeof(sbyte))
    {
        var sum = Avx2.Add(Avx.StaticCast<T, sbyte>(v1), Avx.StaticCast<T, sbyte>(one));
        return Avx.StaticCast<sbyte, T>(sum);
    }

    if (typeof(T) == typeof(short))
    {
        var sum = Avx2.Add(Avx.StaticCast<T, short>(v1), Avx.StaticCast<T, short>(one));
        return Avx.StaticCast<short, T>(sum);
    }

    if (typeof(T) == typeof(ushort))
    {
        var sum = Avx2.Add(Avx.StaticCast<T, ushort>(v1), Avx.StaticCast<T, ushort>(one));
        return Avx.StaticCast<ushort, T>(sum);
    }

    if (typeof(T) == typeof(int))
    {
        var sum = Avx2.Add(Avx.StaticCast<T, int>(v1), Avx.StaticCast<T, int>(one));
        return Avx.StaticCast<int, T>(sum);
    }

    if (typeof(T) == typeof(uint))
    {
        var sum = Avx2.Add(Avx.StaticCast<T, uint>(v1), Avx.StaticCast<T, uint>(one));
        return Avx.StaticCast<uint, T>(sum);
    }

    if (typeof(T) == typeof(long))
    {
        var sum = Avx2.Add(Avx.StaticCast<T, long>(v1), Avx.StaticCast<T, long>(one));
        return Avx.StaticCast<long, T>(sum);
    }

    if (typeof(T) == typeof(ulong))
    {
        var sum = Avx2.Add(Avx.StaticCast<T, ulong>(v1), Avx.StaticCast<T, ulong>(one));
        return Avx.StaticCast<ulong, T>(sum);
    }

    throw new NotSupportedException();
}
/// <summary>
/// Adds the original values back onto the four working vectors and then transposes them
/// within each 128-bit half, so each output vector holds one (A, B, C, D) quadruple per half.
/// </summary>
/// <remarks>
/// After the call, the lower half of <paramref name="x_A"/> holds element 0 of the four
/// sums and its upper half element 4; <paramref name="x_B"/> holds elements 1 and 5,
/// <paramref name="x_C"/> elements 2 and 6, <paramref name="x_D"/> elements 3 and 7. The
/// <c>t_*</c> vectors receive the intermediate 32-bit interleaves; the <c>orig_*</c> vectors
/// are only read.
/// </remarks>
private static void OneQuadUnpack(ref Vector256<uint> x_A, ref Vector256<uint> x_B, ref Vector256<uint> x_C, ref Vector256<uint> x_D, ref Vector256<uint> t_A, ref Vector256<uint> t_B, ref Vector256<uint> t_C, ref Vector256<uint> t_D, ref Vector256<uint> orig_A, ref Vector256<uint> orig_B, ref Vector256<uint> orig_C, ref Vector256<uint> orig_D)
{
    // Feed-forward: working state += original state.
    x_A = Avx2.Add(x_A, orig_A);
    x_B = Avx2.Add(x_B, orig_B);
    x_C = Avx2.Add(x_C, orig_C);
    x_D = Avx2.Add(x_D, orig_D);

    // Interleave 32-bit elements pairwise: (A,B) and (C,D), low and high parts.
    t_A = Avx2.UnpackLow(x_A, x_B);
    t_B = Avx2.UnpackLow(x_C, x_D);
    t_C = Avx2.UnpackHigh(x_A, x_B);
    t_D = Avx2.UnpackHigh(x_C, x_D);

    // Interleave the resulting 64-bit pairs to complete the per-half transpose.
    Vector256<ulong> lowAB = t_A.AsUInt64();
    Vector256<ulong> lowCD = t_B.AsUInt64();
    Vector256<ulong> highAB = t_C.AsUInt64();
    Vector256<ulong> highCD = t_D.AsUInt64();

    x_A = Avx2.UnpackLow(lowAB, lowCD).AsUInt32();
    x_B = Avx2.UnpackHigh(lowAB, lowCD).AsUInt32();
    x_C = Avx2.UnpackLow(highAB, highCD).AsUInt32();
    x_D = Avx2.UnpackHigh(highAB, highCD).AsUInt32();
}
/// <summary>
/// Accumulates eight successively permuted copies of the input onto itself: the first copy
/// has <paramref name="carry"/> placed in element 0, the second has 0 placed in element 0,
/// and six more permuted copies follow.
/// </summary>
/// <remarks>
/// NOTE(review): <c>RotateRight</c> is a permutation control defined elsewhere. Element 0 is
/// only forced to zero once, so a running-sum interpretation relies on that permutation never
/// moving a non-zero value back into element 0; confirm against its definition.
/// </remarks>
/// <param name="t">Vector to aggregate.</param>
/// <param name="carry">Value injected into element 0 of the first permuted copy.</param>
/// <returns><paramref name="t"/> plus the eight permuted copies.</returns>
private static Vector256<int> Aggregate(Vector256<int> t, int carry)
{
    var permutation = RotateRight;

    // First copy: permuted input with the carry placed in element 0.
    var shifted = Avx2.PermuteVar8x32(t, permutation).WithElement(0, carry);
    t = Avx2.Add(t, shifted);

    // Second copy: permuted again, with element 0 cleared.
    shifted = Avx2.PermuteVar8x32(shifted, permutation).WithElement(0, 0);
    t = Avx2.Add(t, shifted);

    // Six further copies, each permuted once more.
    for (int step = 0; step < 6; step++)
    {
        shifted = Avx2.PermuteVar8x32(shifted, permutation);
        t = Avx2.Add(t, shifted);
    }

    return t;
}
/// <summary>
/// Approximate element-wise natural logarithm. Stated accuracy: absolute error bounded by 1e-4.
/// </summary>
/// <remarks>
/// The raw exponent field scaled by ln 2 is combined with a degree-five polynomial (zero
/// constant term) evaluated on the mantissa re-biased into [1, 2). Lanes that compare less
/// than zero yield NaN. A lane equal to zero is not treated specially and yields a finite
/// value rather than negative infinity.
/// </remarks>
/// <param name="x">Input values.</param>
/// <returns>The approximate natural logarithm of each element.</returns>
public static Vector256<float> Log(Vector256<float> x)
{
    // Constants are isolated up-front; per the original author's BenchmarkDotNet
    // measurements this yields a nearly 10% speed-up.
    const float PositiveOffset = -89.970756366f;
    const float NegativeResult = float.NaN; // behavior of MathF.Log() on negative numbers
    const float Coefficient1 = 3.529304993f;
    const float Coefficient2 = -2.461222105f;
    const float Coefficient3 = 1.130626167f;
    const float Coefficient4 = -0.288739945f;
    const float Coefficient5 = 3.110401639e-2f;
    const float Ln2 = 0.6931471805f;
    const int MantissaMask = 0x7FFFFF;
    const int ExponentOfOne = 0x3F800000;

    Vector256<int> bits = x.As<float, int>();

    // Raw (biased) exponent field as a float.
    Vector256<float> exponent = Avx2.ConvertToVector256Single(Avx2.ShiftRightArithmetic(bits, 23));

    // offset = x < 0 ? NaN : -89.970756366f
    Vector256<float> negativeLanes = Avx.Compare(x, Vector256<float>.Zero, FloatComparisonMode.OrderedLessThanNonSignaling);
    Vector256<float> offset = Avx.BlendVariable(Vector256.Create(PositiveOffset), Vector256.Create(NegativeResult), negativeLanes);

    // Keep the mantissa bits and force the exponent of 1.0, giving m in [1, 2).
    Vector256<float> mantissa = Avx2.Or(Avx2.And(bits, Vector256.Create(MantissaMask)), Vector256.Create(ExponentOfOne)).As<int, float>();

    // Horner evaluation of
    //   m * (c1 + m * (c2 + m * (c3 + m * (c4 + m * c5))))
    Vector256<float> polynomial = Avx2.Multiply(mantissa, Vector256.Create(Coefficient5));
    polynomial = Avx2.Multiply(mantissa, Avx2.Add(Vector256.Create(Coefficient4), polynomial));
    polynomial = Avx2.Multiply(mantissa, Avx2.Add(Vector256.Create(Coefficient3), polynomial));
    polynomial = Avx2.Multiply(mantissa, Avx2.Add(Vector256.Create(Coefficient2), polynomial));
    polynomial = Avx2.Multiply(mantissa, Avx2.Add(Vector256.Create(Coefficient1), polynomial));

    // ... + (offset + ln2 * exponent)
    Vector256<float> exponentTerm = Avx.Add(offset, Avx2.Multiply(Vector256.Create(Ln2), exponent));
    return Avx2.Add(polynomial, exponentTerm);
}
/// <summary>
/// Class-field scenario: pins the two instance operand fields, loads them with
/// <c>Avx.LoadVector256</c>, adds them with <c>Avx2.Add</c>, writes the sum to the output
/// buffer and hands the fields and output to <c>ValidateResult</c>.
/// </summary>
public void RunClassFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load));

    fixed (Vector256<Byte>* firstField = &_fld1)
    fixed (Vector256<Byte>* secondField = &_fld2)
    {
        Vector256<Byte> sum = Avx2.Add(
            Avx.LoadVector256((Byte*)firstField),
            Avx.LoadVector256((Byte*)secondField));

        Unsafe.Write(_dataTable.outArrayPtr, sum);
        ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Class-variable scenario: pins the two static operand fields, loads them with
/// <c>Avx.LoadVector256</c>, adds them with <c>Avx2.Add</c>, writes the sum to the output
/// buffer and hands the operands and output to <c>ValidateResult</c>.
/// </summary>
public void RunClsVarScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario_Load));

    fixed (Vector256<UInt16>* firstStatic = &_clsVar1)
    fixed (Vector256<UInt16>* secondStatic = &_clsVar2)
    {
        Vector256<UInt16> sum = Avx2.Add(
            Avx.LoadVector256((UInt16*)firstStatic),
            Avx.LoadVector256((UInt16*)secondStatic));

        Unsafe.Write(_dataTable.outArrayPtr, sum);
        ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
    }
}