/// <summary>
/// Builds a scalar <c>ulong</c> vector holding <see cref="ulong.MaxValue"/> and checks that
/// <c>Sse41.X64.Extract</c> at index 0 hands the same value back.
/// </summary>
/// <returns>The outcome of <c>AreEqual</c> for the expected and extracted values.</returns>
private static bool TestSse41X64Extract_UInt64()
{
    const ulong expected = ulong.MaxValue;

    Vector128<ulong> vector = Vector128.CreateScalar(expected);
    ulong extracted = Sse41.X64.Extract(vector, 0);

    return AreEqual(expected, extracted);
}
/// <summary>
/// Converts a scalar <c>float</c> vector holding <c>(float)long.MaxValue</c> with
/// <c>Sse.X64.ConvertToInt64WithTruncation</c> and checks the result against <see cref="long.MinValue"/>.
/// </summary>
/// <returns>The outcome of <c>AreEqual</c> for <see cref="long.MinValue"/> and the converted value.</returns>
private static bool TestSseX64ConvertToInt64WithTruncation()
{
    float input = (float)long.MaxValue;
    Vector128<float> vector = Vector128.CreateScalar(input);

    long converted = Sse.X64.ConvertToInt64WithTruncation(vector);

    // The test expects long.MinValue here, not long.MaxValue.
    return AreEqual(long.MinValue, converted);
}
/// <summary>
/// Converts a scalar <c>double</c> vector holding <c>(double)long.MaxValue</c> with
/// <c>Sse2.X64.ConvertToInt64</c> and checks the result against <see cref="long.MinValue"/>.
/// </summary>
/// <returns>The outcome of <c>AreEqual</c> for <see cref="long.MinValue"/> and the converted value.</returns>
private static bool TestSse2X64ConvertToInt64_Vector128Double()
{
    double input = (double)long.MaxValue;
    Vector128<double> vector = Vector128.CreateScalar(input);

    long converted = Sse2.X64.ConvertToInt64(vector);

    // The test expects long.MinValue here, not long.MaxValue.
    return AreEqual(long.MinValue, converted);
}
/// <summary>
/// Builds a scalar <c>ulong</c> vector holding <see cref="ulong.MaxValue"/> and checks that
/// <c>Sse2.X64.ConvertToUInt64</c> returns the same value.
/// </summary>
/// <returns>The outcome of <c>AreEqual</c> for the expected and converted values.</returns>
private static bool TestSse2X64ConvertToUInt64()
{
    const ulong expected = ulong.MaxValue;

    Vector128<ulong> vector = Vector128.CreateScalar(expected);
    ulong converted = Sse2.X64.ConvertToUInt64(vector);

    return AreEqual(expected, converted);
}
/// <summary>
/// Narrows <paramref name="a"/> to a <c>ushort</c> via <c>Sse41.PackUnsignedSaturate</c>
/// and returns element 0 of the packed vector.
/// </summary>
/// <param name="a">The 32-bit value to narrow.</param>
/// <returns>Element 0 of the packed result.</returns>
private static ushort ToUshort(int a)
{
    Vector128<int> scalar = Vector128.CreateScalar(a);
    Vector128<ushort> packed = Sse41.PackUnsignedSaturate(scalar, scalar);
    return packed.GetElement(0);
}
/// <summary>
/// Updates one row of an edit-distance table in place. For every column the new cost is the
/// previous diagonal value when the characters match, otherwise one more than the minimum of
/// the diagonal, the value just written to the left and the old value in this column.
/// </summary>
/// <param name="previousRowPtr">Row buffer; read and overwritten for indices 0..targetLength-1.</param>
/// <param name="targetPtr">Characters compared against <paramref name="sourcePrevChar"/>.</param>
/// <param name="targetLength">Number of columns to process.</param>
/// <param name="sourcePrevChar">The character this row is computed for.</param>
/// <param name="lastInsertionCost">Initial "left" cost for column 0.</param>
/// <param name="lastSubstitutionCost">Initial "diagonal" cost for column 0.</param>
private static unsafe void CalculateRow(int *previousRowPtr, char *targetPtr, int targetLength, char sourcePrevChar, int lastInsertionCost, int lastSubstitutionCost)
{
    for (int column = 0; column < targetLength; column++)
    {
        int above = previousRowPtr[column];
        int cost = lastSubstitutionCost;

        if (targetPtr[column] != sourcePrevChar)
        {
            // The three-way minimum is taken with the SSE4.1 packed-min instruction on element 0.
            Vector128<int> leftOrAbove = Sse41.Min(
                Vector128.CreateScalar(lastInsertionCost),
                Vector128.CreateScalar(above));
            cost = Sse41.Min(Vector128.CreateScalar(cost), leftOrAbove).GetElement(0);
            cost++;
        }

        previousRowPtr[column] = cost;
        lastInsertionCost = cost;
        lastSubstitutionCost = above;
    }
}
/// <summary>
/// Updates one row of an edit-distance table in place, keeping the running costs in
/// <c>Vector128&lt;int&gt;</c> values (only element 0 is read back).
/// </summary>
/// <param name="previousRowPtr">Row buffer; read and overwritten for indices 0..targetLength-1.</param>
/// <param name="targetPtr">Characters compared against <paramref name="sourcePrevChar"/>.</param>
/// <param name="targetLength">Number of columns to process.</param>
/// <param name="sourcePrevChar">The character this row is computed for.</param>
/// <param name="lastInsertionCost">Initial "left" cost for column 0.</param>
/// <param name="lastSubstitutionCost">Initial "diagonal" cost for column 0.</param>
private static unsafe void CalculateRow(int *previousRowPtr, char *targetPtr, int targetLength, char sourcePrevChar, int lastInsertionCost, int lastSubstitutionCost)
{
    Vector128<int> increment = Vector128.CreateScalar(1);
    Vector128<int> diagonal = Vector128.CreateScalar(lastSubstitutionCost);
    Vector128<int> left = Vector128.CreateScalar(lastInsertionCost);

    for (int column = 0; column < targetLength; column++)
    {
        Vector128<int> above = Vector128.CreateScalar(previousRowPtr[column]);
        Vector128<int> current = diagonal;

        if (sourcePrevChar != targetPtr[column])
        {
            // Mismatch: 1 + min(diagonal, left, above).
            Vector128<int> cheapest = Sse41.Min(current, Sse41.Min(left, above));
            current = Sse2.Add(increment, cheapest);
        }

        previousRowPtr[column] = current.GetElement(0);
        left = current;
        diagonal = above;
    }
}
/// <summary>
/// Subtracts a scalar vector holding 42 from the zero vector with <c>Sse2.Subtract</c>
/// and returns element 0 of the difference.
/// </summary>
/// <returns>Element 0 of <c>Zero - CreateScalar(42)</c>.</returns>
private static int Test()
{
    // LLVM is able to fold constant vectors :p
    Vector128<int> minuend = Vector128<int>.Zero;
    Vector128<int> subtrahend = Vector128.CreateScalar(42);

    Vector128<int> difference = Sse2.Subtract(minuend, subtrahend);
    return difference.ToScalar();
}
/// <summary>
/// Updates one row of an edit-distance table in place. A matching character carries the
/// diagonal cost over; a mismatch costs one more than the minimum of the diagonal, the
/// value just written to the left and the old value in this column.
/// </summary>
/// <param name="previousRowPtr">Row buffer; read and overwritten for indices 0..targetLength-1.</param>
/// <param name="targetPtr">Characters compared against <paramref name="sourcePrevChar"/>.</param>
/// <param name="targetLength">Number of columns to process.</param>
/// <param name="sourcePrevChar">The character this row is computed for.</param>
/// <param name="lastInsertionCost">Initial "left" cost for column 0.</param>
/// <param name="lastSubstitutionCost">Initial "diagonal" cost for column 0.</param>
private static unsafe void CalculateRow(int *previousRowPtr, char *targetPtr, int targetLength, char sourcePrevChar, int lastInsertionCost, int lastSubstitutionCost)
{
    int column = 0;
    while (column < targetLength)
    {
        int diagonal = lastSubstitutionCost;
        int above = previousRowPtr[column];

        // The old value of this column becomes the diagonal for the next column.
        lastSubstitutionCost = above;

        if (sourcePrevChar == targetPtr[column])
        {
            lastInsertionCost = diagonal;
        }
        else
        {
            // Scalar three-way minimum done through the SSE4.1 packed-min instruction.
            Vector128<int> leftOrAbove = Sse41.Min(
                Vector128.CreateScalar(lastInsertionCost),
                Vector128.CreateScalar(above));
            int cheapest = Sse41.Min(Vector128.CreateScalar(diagonal), leftOrAbove).GetElement(0);
            cheapest++;
            lastInsertionCost = cheapest;
        }

        previousRowPtr[column] = lastInsertionCost;
        ++column;
    }
}
/// <summary>
/// Compares <c>ReferenceMultiplyAdd(b, b, b)</c> with element 0 of <c>Fma.MultiplyAdd</c> applied to three
/// vectors built from <paramref name="b"/> in three different ways
/// (<c>CreateScalarUnsafe</c>, <c>CreateScalar</c> and <c>Create</c>).
/// </summary>
/// <param name="a">Not used by this method.</param>
/// <param name="b">The value used for all three operands.</param>
static void TestExplicitFmaUsage6(ref Vector128 <float> a, float b)
{
    var expected = ReferenceMultiplyAdd(b, b, b);

    Vector128<float> first = Vector128.CreateScalarUnsafe(b);
    Vector128<float> second = Vector128.CreateScalar(b);
    Vector128<float> third = Vector128.Create(b);
    var actual = Fma.MultiplyAdd(first, second, third).ToScalar();

    CompareFloats(expected, actual);
}
/// <summary>
/// Compares <c>ReferenceMultiplyAdd(b, b, b)</c> with element 0 of <c>Fma.MultiplyAdd</c> applied to three
/// vectors built from <paramref name="b"/> in three different ways
/// (<c>CreateScalarUnsafe</c>, <c>CreateScalar</c> and <c>Create</c>).
/// </summary>
/// <param name="a">Not used by this method.</param>
/// <param name="b">The value used for all three operands.</param>
static void TestExplicitFmaUsage6(ref Vector128 <double> a, double b)
{
    var expected = ReferenceMultiplyAdd(b, b, b);

    Vector128<double> first = Vector128.CreateScalarUnsafe(b);
    Vector128<double> second = Vector128.CreateScalar(b);
    Vector128<double> third = Vector128.Create(b);
    var actual = Fma.MultiplyAdd(first, second, third).ToScalar();

    CompareDoubles(expected, actual);
}
/// <summary>
/// Subtracts <c>ShortCharA</c> from <paramref name="input"/>, checks the result against the
/// <c>ShortN15</c> mask, then combines shifted copies of the bits and byte-shuffles them
/// with <c>NShuffleMask</c> to produce <paramref name="value"/>.
/// </summary>
/// <param name="input">The packed 64-bit input.</param>
/// <param name="value">Element 0 (as <c>int</c>) of the shuffled vector; always assigned.</param>
/// <returns><c>true</c> when no bit selected by <c>ShortN15</c> is set after the subtraction.</returns>
private unsafe bool TryParseInt(long input, out int value)
{
    long offset = input - ShortCharA;
    bool isValid = (offset & ShortN15) == 0;

    // Combine the bits shifted left by 4 with the bits shifted right by 8.
    ulong bits = (ulong)offset;
    long combined = (long)((bits << 4) | (bits >> 8));

    Vector128<sbyte> lanes = Vector128.CreateScalar(combined).AsSByte();
    Vector128<int> shuffled = Ssse3.Shuffle(lanes, NShuffleMask).AsInt32();
    value = Sse41.Extract(shuffled, 0);

    return isValid;
}
/// <summary>
/// Basic scenario: creates a <c>Vector128&lt;SByte&gt;</c> from a generated scalar with
/// <c>Vector128.CreateScalar</c> and passes both to <c>ValidateResult</c>.
/// </summary>
public void RunBasicScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario));

    SByte scalar = TestLibrary.Generator.GetSByte();
    Vector128<SByte> vector = Vector128.CreateScalar(scalar);

    ValidateResult(vector, scalar);
}
/// <summary>
/// Writes 1, 2, 3, ... into <paramref name="previousRow"/>, producing each value with a
/// saturating vector add of one.
/// </summary>
/// <param name="previousRow">Destination buffer; indices 0..length-1 are written.</param>
/// <param name="length">Number of elements to write.</param>
private static unsafe void FillRowSSe(ushort *previousRow, int length)
{
    Vector128<ushort> step = Vector128.CreateScalar((ushort)1);
    Vector128<ushort> counter = step;

    int index = 0;
    while (index < length)
    {
        previousRow[index] = counter.GetElement(0);
        counter = Sse42.AddSaturate(counter, step);
        index++;
    }
}
/// <summary>Creates a matrix from the specified rotation around the z-axis.</summary>
/// <param name="rotationZ">A float representing the rotation around the z-axis for the matrix.</param>
/// <returns>A matrix that represents <paramref name="rotationZ" />.</returns>
public static Matrix4x4 CreateFromRotationZ(float rotationZ)
{
    var (sin, cos) = SinCos(rotationZ);

    // Vector passed as the first argument: cos and sin interleaved into the low lanes.
    var first = InterleaveLower(Vector128.CreateScalar(cos), Vector128.CreateScalar(sin));

    // Second argument: the YXZW swizzle of the first, with its x lane negated.
    var swizzled = CreateFromYXZW(first);
    var signs = Vector128.Create(-1.0f, 1.0f, 1.0f, 1.0f);
    var second = Multiply(swizzled, signs);

    return Create(first, second, UnitZ, UnitW);
}
/// <summary>Creates a matrix from the specified rotation around the y-axis.</summary>
/// <param name="rotationY">A float representing the rotation around the y-axis for the matrix.</param>
/// <returns>A matrix that represents <paramref name="rotationY" />.</returns>
public static Matrix4x4 CreateFromRotationY(float rotationY)
{
    var (sin, cos) = SinCos(rotationY);

    // Vector passed as the third argument, assembled from the sin and cos scalars.
    var third = CreateFromXWAD(Vector128.CreateScalar(sin), Vector128.CreateScalar(cos));

    // First argument: the ZYXW swizzle of the third, with its z lane negated.
    var swizzled = CreateFromZYXW(third);
    var signs = Vector128.Create(1.0f, 1.0f, -1.0f, 1.0f);
    var first = Multiply(swizzled, signs);

    return Create(first, UnitY, third, UnitW);
}
/// <summary>Creates a matrix from the specified rotation around the x-axis.</summary>
/// <param name="rotationX">A float representing the rotation around the x-axis for the matrix.</param>
/// <returns>A matrix that represents <paramref name="rotationX" />.</returns>
public static Matrix4x4 CreateFromRotationX(float rotationX)
{
    var (sin, cos) = SinCos(rotationX);

    // Vector passed as the second argument, assembled from the cos and sin scalars.
    var second = CreateFromWXAD(Vector128.CreateScalar(cos), Vector128.CreateScalar(sin));

    // Third argument: the XZYW swizzle of the second, with its y lane negated.
    var swizzled = CreateFromXZYW(second);
    var signs = Vector128.Create(1.0f, -1.0f, 1.0f, 1.0f);
    var third = Multiply(swizzled, signs);

    return Create(UnitX, second, third, UnitW);
}
/// <summary>
/// Runs <c>Utils.SplitFraction</c> on <c>Iterations</c> random non-negative doubles and checks
/// element 0 of both outputs against a scalar computation: the value truncated through
/// <c>int</c>, and the remainder after subtracting that truncated value.
/// </summary>
public void SplitFraction()
{
    var random = new Random();

    for (var iteration = 0; iteration < Iterations; iteration++)
    {
        // Random fraction plus a random whole part.
        var sample = random.NextDouble() + random.Next(0, Int32.MaxValue - 1);

        Utils.SplitFraction(Vector128.CreateScalar(sample), out var integral, out var fraction);

        var expectedIntegral = (double)(int)sample;
        var expectedFraction = sample - expectedIntegral;

        Assert.Equal(expectedIntegral, integral.ToScalar());
        Assert.Equal(expectedFraction, fraction.ToScalar());
    }
}
/// <summary>Returns the result of <c>Vector128.CreateScalar</c> for <paramref name="v"/>.</summary>
/// <param name="v">The scalar value passed to <c>Vector128.CreateScalar</c>.</param>
/// <returns>The created vector.</returns>
public static __m128 _mm_set_ss(float v)
{
    return Vector128.CreateScalar(v);
}
/// <summary>Returns the smaller of two <c>ushort</c> values, computed with <c>Sse41.Min</c>.</summary>
/// <param name="a">First value.</param>
/// <param name="b">Second value.</param>
/// <returns>Element 0 of the packed minimum.</returns>
private static ushort Min(ushort a, ushort b)
{
    Vector128<ushort> first = Vector128.CreateScalar(a);
    Vector128<ushort> second = Vector128.CreateScalar(b);

    Vector128<ushort> smaller = Sse41.Min(first, second);
    return smaller.GetElement(0);
}
/// <summary>
/// Computes an edit distance between a slice of <paramref name="sourceString"/> and a slice of
/// <paramref name="targetString"/> using a single pooled row of <c>ushort</c> costs.
/// Source rows are processed eight at a time with 128-bit vectors (one 16-bit lane per source
/// character); any remaining rows are handled one at a time by <c>CalculateRow</c>.
/// </summary>
/// <param name="sourceString">String the source slice is taken from.</param>
/// <param name="sourceLength">Length of the source slice.</param>
/// <param name="targetString">String the target slice is taken from.</param>
/// <param name="targetLength">Length of the target slice; also the length of the cost row.</param>
/// <param name="startIndex">Start offset applied to both strings.</param>
/// <returns>The last element of the cost row after all source rows have been processed.</returns>
/// <remarks>
/// NOTE(review): the final read is <c>previousRow[targetLength - 1]</c>, so this presumably expects
/// <paramref name="targetLength"/> &gt;= 1 — confirm against the caller. The vector path indexes
/// <c>targetPtr</c>/<c>previousRowPtr</c> for columns 0..6 without checking <paramref name="targetLength"/>.
/// </remarks>
private static unsafe int CalculateDistance(string sourceString, int sourceLength, string targetString, int targetLength, int startIndex)
{
    var arrayPool = ArrayPool <ushort> .Shared;
    var pooledArray = arrayPool.Rent(targetLength);
    Span <ushort> previousRow = pooledArray;
    ReadOnlySpan <char> source = sourceString.AsSpan().Slice(startIndex, sourceLength);
    ReadOnlySpan <char> target = targetString.AsSpan().Slice(startIndex, targetLength);

    //ArrayPool values are sometimes bigger than allocated, let's trim our span to exactly what we use
    previousRow = previousRow.Slice(0, targetLength);

    fixed(char *targetPtr = target)
    fixed(char *srcPtr = source)
    fixed(ushort *previousRowPtr = previousRow)
    {
        FillRow(previousRowPtr, targetLength);
        var rowIndex = 0;

        // Vector path: eight source rows per iteration.
        for (; rowIndex < sourceLength - 7; rowIndex += 8)
        {
            // todo max
            // rowIndex is narrowed to 16 bits with unsigned saturation.
            var temp = Vector128.Create(rowIndex);
            var diag = Sse42.PackUnsignedSaturate(temp, temp);
            var one = Vector128.Create((ushort)1);
            var left = Sse42.AddSaturate(diag, one);
            // Eight consecutive source characters, one per 16-bit lane.
            var sourceV = Sse42.LoadVector128((ushort *)(srcPtr + rowIndex));
            var targetV = Vector128 <ushort> .Zero;
            // Lane mask that starts at lane 0 and moves up one lane per warm-up iteration.
            var shift = Vector128.CreateScalar(ushort.MaxValue);

            // Warm-up: columns 0..6 (seven iterations) fill the vectors; nothing is stored yet.
            for (int columnIndex = 0; columnIndex < 7; columnIndex++)
            {
                // Shift in the next character
                targetV = Sse42.ShiftLeftLogical128BitLane(targetV, 2);
                targetV = Sse42.Insert(targetV, (ushort)targetPtr[columnIndex], 0);

                // Insert "(rowIndex + columnIndex + 1)" from the left
                var leftValue = Vector128.Create(rowIndex + columnIndex + 1);
                left = Sse42.Or(Sse42.And(shift, Sse42.PackUnsignedSaturate(leftValue, leftValue)), left);
                shift = Sse42.ShiftLeftLogical128BitLane(shift, 2);

                // compare source to target
                // alternative: compare equal and OR with one
                // add is 1 in lanes where the characters differ, 0 where they match.
                var match = Sse42.CompareEqual(sourceV, targetV);
                var add = Sse42.AndNot(match, one);
                var next = Sse42.AddSaturate(diag, add);

                // Create next diag which is current up
                var up = Sse42.ShiftLeftLogical128BitLane(left, 2);
                up = Sse42.Insert(up, (ushort)previousRowPtr[columnIndex], 0);
                var tmp = Sse42.AddSaturate(Sse42.Min(left, up), one);
                next = Sse42.Min(next, tmp);
                left = next;
                diag = up;
            }

            previousRowPtr[0] = Sse42.Extract(left, 7);
            var writePtr = previousRowPtr + 1;

            // Steady state: one finished value (lane 7) is written per column.
            // NOTE(review): this loop starts at column 8 while the warm-up loop ends at column 6,
            // so column 7 is never loaded into targetV/up — confirm whether that is intended.
            for (int columnIndex = 8; columnIndex < targetLength; columnIndex++)
            {
                // Shift in the next character
                targetV = Sse42.ShiftLeftLogical128BitLane(targetV, 2);
                targetV = Sse42.Insert(targetV, (ushort)targetPtr[columnIndex], 0);

                // compare source to target
                // alternative: compare equal and OR with one
                var match = Sse42.CompareEqual(sourceV, targetV);
                var add = Sse42.AndNot(match, one);
                var next = Sse42.AddSaturate(diag, add);

                // Create next diag which is current up
                var up = Sse42.ShiftLeftLogical128BitLane(left, 2);
                up = Sse42.Insert(up, (ushort)previousRowPtr[columnIndex], 0);
                var tmp = Sse42.AddSaturate(Sse42.Min(left, up), one);
                next = Sse42.Min(next, tmp);
                left = next;
                diag = up;

                // Store one value
                *writePtr = Sse42.Extract(next, 7);
                writePtr = writePtr + 1;
                // Store one value
                //previousRowPtr[columnIndex - 7] = Sse42.Extract(next, 7);
            }

            // Drain: the last seven columns; no more characters are read, values are only extracted.
            for (int i = targetLength - 7; i < previousRow.Length; i++)
            {
                // Shift in the next character
                targetV = Sse42.ShiftLeftLogical128BitLane(targetV, 2);

                // compare source to target
                // alternative: compare equal and OR with one
                var match = Sse42.CompareEqual(sourceV, targetV);
                var add = Sse42.AndNot(match, one);
                var next = Sse42.AddSaturate(diag, add);

                // Create next diag which is current up
                var up = Sse42.ShiftLeftLogical128BitLane(left, 2);
                var tmp = Sse42.AddSaturate(Sse42.Min(left, up), one);
                next = Sse42.Min(next, tmp);
                left = next;
                diag = up;

                // Store one value
                previousRowPtr[i] = Sse42.Extract(next, 7);
            }

#if DEBUG
            // Debug builds dump the row after every vector block.
            if (true)
            {
                Console.Write("prev values for row {0}:", rowIndex);
                for (int i = 0; i < targetLength; ++i) { Console.Write("{0} ", previousRow[i]); }
                Console.WriteLine();
            }
#endif
        }

        //Calculate Single Rows
        for (; rowIndex < sourceLength; rowIndex++)
        {
            var lastSubstitutionCost = rowIndex;
            var lastInsertionCost = rowIndex + 1;
            var sourcePrevChar = source[rowIndex];

#if DEBUG
            Console.Write("prev values for row {0}:", rowIndex);
            for (int i = 0; i < targetLength; ++i) { Console.Write("{0} ", previousRow[i]); }
            Console.WriteLine();
#endif
            CalculateRow(previousRowPtr, targetPtr, targetLength, sourcePrevChar, lastInsertionCost, lastSubstitutionCost);
        }
    }

    // The result is read before the pooled array is handed back.
    var result = previousRow[targetLength - 1];
    arrayPool.Return(pooledArray);
    return(result);
}
/// <summary>
/// Computes an edit distance between a slice of <paramref name="sourceString"/> and a slice of
/// <paramref name="targetString"/> using a single pooled row of <c>int</c> costs.
/// Source rows are processed four at a time with 128-bit vectors (one 32-bit lane per source
/// character); any remaining rows are handled one at a time by <c>CalculateRow</c>.
/// </summary>
/// <param name="sourceString">String the source slice is taken from.</param>
/// <param name="sourceLength">Length of the source slice.</param>
/// <param name="targetString">String the target slice is taken from.</param>
/// <param name="targetLength">Length of the target slice; also the length of the cost row.</param>
/// <param name="startIndex">Start offset applied to both strings.</param>
/// <returns>The last element of the cost row after all source rows have been processed.</returns>
/// <remarks>
/// NOTE(review): the final read is <c>previousRow[targetLength - 1]</c>, so this presumably expects
/// <paramref name="targetLength"/> &gt;= 1 — confirm against the caller. The vector path indexes
/// <c>targetPtr</c>/<c>previousRowPtr</c> for columns 0..3 without checking <paramref name="targetLength"/>.
/// </remarks>
private static unsafe int CalculateDistance(string sourceString, int sourceLength, string targetString, int targetLength, int startIndex)
{
    var arrayPool = ArrayPool <int> .Shared;
    var pooledArray = arrayPool.Rent(targetLength);
    Span <int> previousRow = pooledArray;
    ReadOnlySpan <char> source = sourceString.AsSpan().Slice(startIndex, sourceLength);
    ReadOnlySpan <char> target = targetString.AsSpan().Slice(startIndex, targetLength);

    //ArrayPool values are sometimes bigger than allocated, let's trim our span to exactly what we use
    previousRow = previousRow.Slice(0, targetLength);

    fixed(char *targetPtr = target)
    fixed(char *srcPtr = source)
    fixed(int *previousRowPtr = previousRow)
    {
        FillRow(previousRowPtr, targetLength);
        var rowIndex = 0;

        // Vector path: four source rows per iteration.
        for (; rowIndex < sourceLength - 3; rowIndex += 4)
        {
            var diag = Vector128.Create(rowIndex);
            var left = Vector128.Create(rowIndex + 1);
            // Four consecutive source characters widened to 32-bit lanes (read through a short*).
            var sourceV = Sse42.ConvertToVector128Int32((short *)(srcPtr + rowIndex));
            var targetV = Vector128 <int> .Zero;
            var one = Vector128.Create(1);

            // Warm-up: columns 0..3 (four iterations) fill the vectors; nothing is stored yet.
            // shift is a lane mask that starts at lane 0 and moves up one lane per iteration.
            var shift = Vector128.CreateScalar(-1);
            for (int columnIndex = 0; columnIndex < 4; columnIndex++)
            {
                // Shift in the next character
                targetV = Sse42.ShiftLeftLogical128BitLane(targetV, 4);
                targetV = Sse42.Insert(targetV, (short)targetPtr[columnIndex], 0);

                //left = Sse42.Insert(left, rowIndex + columnIndex + 1, (byte)columnIndex);
                var leftValue = Vector128.Create(rowIndex + columnIndex + 1);
                left = Sse42.Or(Sse42.And(shift, leftValue), left);
                shift = Sse42.ShiftLeftLogical128BitLane(shift, 4);

                // compare source to target
                // alternative: compare equal and OR with one
                // The int lanes are compared through their float reinterpretation; the mask is
                // all ones (-1 as int) where the lanes differ, so subtracting it adds 1 to diag.
                var match = Sse.CompareNotEqual(sourceV.AsSingle(), targetV.AsSingle());
                var next = Sse42.Subtract(diag, match.AsInt32());

                // Create next diag which is current up
                var up = Sse42.ShiftLeftLogical128BitLane(left, 4);
                up = Sse42.Insert(up, previousRowPtr[columnIndex], 0);
                var tmp = Sse42.Add(Sse42.Min(left, up), one);
                next = Sse42.Min(next, tmp);
                left = next;
                diag = up;
            }

            previousRowPtr[0] = Sse42.Extract(left, 3);

            // Steady state: one finished value (lane 3) is written per column, three columns behind.
            for (int columnIndex = 4; columnIndex < targetLength; columnIndex++)
            {
                // Shift in the next character
                targetV = Sse42.ShiftLeftLogical128BitLane(targetV, 4);
                targetV = Sse42.Insert(targetV, (short)targetPtr[columnIndex], 0);

                // compare source to target
                // alternative: compare equal and OR with one
                var match = Sse42.CompareNotEqual(sourceV.AsSingle(), targetV.AsSingle());
                var next = Sse42.Subtract(diag, match.AsInt32());

                // Create next diag which is current up
                var up = Sse42.ShiftLeftLogical128BitLane(left, 4);
                up = Sse42.Insert(up, previousRowPtr[columnIndex], 0);
                var tmp = Sse42.Add(Sse42.Min(left, up), one);
                next = Sse42.Min(next, tmp);
                left = next;
                diag = up;

                // Store one value
                previousRowPtr[columnIndex - 3] = Sse42.Extract(next, 3);
            }

            // Drain: the last three columns; no more characters are read, values are only extracted.
            for (int i = targetLength - 3; i < targetLength; i++)
            {
                // Shift in the next character
                targetV = Sse42.ShiftLeftLogical128BitLane(targetV, 4);

                // compare source to target
                // alternative: compare equal and OR with one
                var match = Sse.CompareNotEqual(sourceV.AsSingle(), targetV.AsSingle());
                var next = Sse42.Subtract(diag, match.AsInt32());

                // Create next diag which is current up
                var up = Sse42.ShiftLeftLogical128BitLane(left, 4);
                var tmp = Sse42.Add(Sse42.Min(left, up), one);
                next = Sse42.Min(next, tmp);
                left = next;
                diag = up;

                // Store one value
                previousRowPtr[i] = Sse42.Extract(next, 3);
            }

#if DEBUG
            // Debug builds dump the row after every vector block.
            if (true)
            {
                Console.Write("prev values for row {0}:", rowIndex);
                for (int i = 0; i < targetLength; ++i) { Console.Write("{0} ", previousRow[i]); }
                Console.WriteLine();
            }
#endif
        }

        //Calculate Single Rows
        for (; rowIndex < sourceLength; rowIndex++)
        {
            var lastSubstitutionCost = rowIndex;
            var lastInsertionCost = rowIndex + 1;
            var sourcePrevChar = source[rowIndex];

#if DEBUG
            Console.Write("prev values for row {0}:", rowIndex);
            for (int i = 0; i < targetLength; ++i) { Console.Write("{0} ", previousRow[i]); }
            Console.WriteLine();
#endif
            CalculateRow(previousRowPtr, targetPtr, targetLength, sourcePrevChar, lastInsertionCost, lastSubstitutionCost);
        }
    }

    // The result is read before the pooled array is handed back.
    var result = previousRow[targetLength - 1];
    arrayPool.Return(pooledArray);
    return(result);
}
/// <summary>Returns the smaller of two <c>int</c> values, computed with <c>Sse41.Min</c>.</summary>
/// <param name="a">First value.</param>
/// <param name="b">Second value.</param>
/// <returns>Element 0 of the packed minimum.</returns>
private static int Min(int a, int b)
{
    Vector128<int> first = Vector128.CreateScalar(a);
    Vector128<int> second = Vector128.CreateScalar(b);

    Vector128<int> smaller = Sse41.Min(first, second);
    return smaller.GetElement(0);
}
public void ResizeBicubic(FastBitmap rtnImage) { float scaleX = (float)this.width / rtnImage.width; float scaleY = (float)this.height / rtnImage.height; if (scaleX > 1 || scaleY > 1) { throw new Exception("拡大のみ対応"); } float[] tmpa = new float[rtnImage.width * 4 * this.height]; fixed(float *tmpp = tmpa) { float *tmp = tmpp; var _00mask = Vector128.Create(0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255); var _01mask = Vector128.Create(4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255); var _10mask = Vector128.Create(8, 255, 255, 255, 9, 255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255); var _11mask = Vector128.Create(12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255); var _vmask = Vector128.Create(0, 4, 8, 12, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255); var _1012 = Vector128.Create(-1, 0, 1, 2); var _0123i = Vector128.Create(0, 1, 2, 3); var _0000 = Vector128.Create(0, 0, 0, 0); var _0000f = Vector128.Create(0f, 0, 0, 0); var _255f = Vector128.Create(255f, 255, 255, 255); var _1111 = Vector128.Create(1, 1, 1, 1); var _1111f = Vector128.Create(1f, 1, 1, 1); var _4444f = Vector128.Create(4f, 4, 4, 4); var _4444 = Vector128.Create(4, 4, 4, 4); var _5555f = Vector128.Create(5f, 5, 5, 5); var _2222f = Vector128.Create(2f, 2, 2, 2); var _8888f = Vector128.Create(8f, 8, 8, 8); var _7f = Vector128.Create(0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff).AsSingle(); var _ff = Vector128.Create(-1, -1, -1, -1); var _stride = Vector128.Create(rtnImage.width * 4, rtnImage.width * 4, rtnImage.width * 4, rtnImage.width * 4); Parallel.For(0, this.height, (y) => { float py = (y * scaleY); float *tmpPos = tmp + y * rtnImage.width * 4; for (int x = 0; x < rtnImage.width; x++) { float px = (x * scaleX); int sx = (int)px; var _px = Vector128.CreateScalar(px); _px = Sse.Shuffle(_px, _px, 0); var _sx = Vector128.CreateScalar(sx); _sx = Sse2.Shuffle(_sx, 0); var _width = 
Vector128.CreateScalar(this.width); _width = Sse2.Shuffle(_width, 0); var _x2 = Sse2.Add(_sx, _1012); var _d = Sse.And(Sse.Subtract(_px, Sse2.ConvertToVector128Single(_x2)), _7f); var _d2 = Sse.Multiply(_d, _d); var _d3 = Sse.Multiply(_d2, _d); var w1 = Sse.Add(_1111f, Sse.Subtract(_d3, Sse.Multiply(_2222f, _d2))); var w2 = Sse.Subtract(Sse.Subtract(Sse.Add(_4444f, Sse.Multiply(_5555f, _d2)), Sse.Multiply(_d, _8888f)), _d3); var wb = Sse2.CompareGreaterThan(_d, _1111f); var _w = Sse41.BlendVariable(w1, w2, wb); var _xpb = Sse2.Or(Sse2.CompareLessThan(_x2, _0000), Sse41.MultiplyLow(Sse2.AndNot(Sse2.CompareLessThan(_x2, _width), _1111).AsInt32(), _ff)); var _xpp = Sse2.And(_sx, _xpb); var _xp = Sse41.BlendVariable(_x2, _xpp, _xpb); var p = Avx2.GatherVector128((uint *)(this._ptr + this._stride * y), _xp, 4).AsByte(); var _p0 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _00mask).AsInt32()); var _p1 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _01mask).AsInt32()); var _p2 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _10mask).AsInt32()); var _p3 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _11mask).AsInt32()); var _w0 = Sse.Shuffle(_w, _w, 0); var _w1 = Sse.Shuffle(_w, _w, 0b01010101); var _w2 = Sse.Shuffle(_w, _w, 0b10101010); var _w3 = Sse.Shuffle(_w, _w, 0b11111111); var rgbaf = Sse.Add(Sse.Add(Sse.Multiply(_p0, _w0), Sse.Multiply(_p1, _w1)), Sse.Add(Sse.Multiply(_p2, _w2), Sse.Multiply(_p3, _w3))); Sse2.Store(tmpPos + x * 4, rgbaf); } }); Parallel.For(0, rtnImage.height, (y) => { float py = (y * scaleY); int sy = (int)py; uint *store = stackalloc uint[4]; var _py = Vector128.CreateScalar(py); _py = Sse.Shuffle(_py, _py, 0); var _sy = Vector128.CreateScalar(sy); _sy = Sse2.Shuffle(_sy, 0); var _height = Vector128.CreateScalar(this.height); _height = Sse2.Shuffle(_height, 0); var _y2 = Sse2.Add(_sy, _1012); var _d = Sse.And(Sse.Subtract(_py, Sse2.ConvertToVector128Single(_y2)), _7f); var _d2 = Sse.Multiply(_d, _d); var _d3 = Sse.Multiply(_d2, _d); var 
w1 = Sse.Add(_1111f, Sse.Subtract(_d3, Sse.Multiply(_2222f, _d2))); var w2 = Sse.Subtract(Sse.Subtract(Sse.Add(_4444f, Sse.Multiply(_5555f, _d2)), Sse.Multiply(_d, _8888f)), _d3); var wb = Sse2.CompareGreaterThan(_d, _1111f); var _w = Sse41.BlendVariable(w1, w2, wb); var _ypb = Sse2.Or(Sse2.CompareLessThan(_y2, _0000), Sse41.MultiplyLow(Sse2.AndNot(Sse2.CompareLessThan(_y2, _height), _1111).AsInt32(), _ff)); var _ypp = Sse2.And(_sy, _ypb); var _yp = Sse41.BlendVariable(_y2, _ypp, _ypb); var _yps = Sse41.MultiplyLow(_yp, _stride); var _yp0 = Sse2.Add(Sse2.Shuffle(_yps, 0), _0123i); var _yp1 = Sse2.Add(Sse2.Shuffle(_yps, 0b01010101), _0123i); var _yp2 = Sse2.Add(Sse2.Shuffle(_yps, 0b10101010), _0123i); var _yp3 = Sse2.Add(Sse2.Shuffle(_yps, 0b11111111), _0123i); uint *rtn = (uint *)(rtnImage._ptr + rtnImage._stride * y); for (int x = 0; x < rtnImage.width; x++) { var _p0 = Avx2.GatherVector128((float *)(tmp), _yp0, 4); var _p1 = Avx2.GatherVector128((float *)(tmp), _yp1, 4); var _p2 = Avx2.GatherVector128((float *)(tmp), _yp2, 4); var _p3 = Avx2.GatherVector128((float *)(tmp), _yp3, 4); var _w0 = Sse.Shuffle(_w, _w, 0); var _w1 = Sse.Shuffle(_w, _w, 0b01010101); var _w2 = Sse.Shuffle(_w, _w, 0b10101010); var _w3 = Sse.Shuffle(_w, _w, 0b11111111); var rgbaf = Sse.Add(Sse.Add(Sse.Multiply(_p0, _w0), Sse.Multiply(_p1, _w1)), Sse.Add(Sse.Multiply(_p2, _w2), Sse.Multiply(_p3, _w3))); var _b0 = Sse.CompareLessThan(rgbaf, _0000f); rgbaf = Sse41.BlendVariable(rgbaf, _0000f, _b0); var _b1 = Sse.CompareGreaterThan(rgbaf, _255f); rgbaf = Sse41.BlendVariable(rgbaf, _255f, _b1); var rgbab = Sse2.ConvertToVector128Int32(rgbaf).AsByte(); var rgba = Ssse3.Shuffle(rgbab, _vmask).AsUInt32(); Sse2.Store(store, rgba); _yp0 = Sse2.Add(_yp0, _4444); _yp1 = Sse2.Add(_yp1, _4444); _yp2 = Sse2.Add(_yp2, _4444); _yp3 = Sse2.Add(_yp3, _4444); *rtn = *store; rtn++; } });
/// <summary>
/// Walk-through of a handful of <c>Vector64</c>/<c>Vector128</c>/<c>Vector256</c> factory methods and
/// AVX / FMA intrinsics. Every result is stored in a local and never used afterwards; the trailing
/// comments show the value each call produces so the code can be stepped through in a debugger.
/// In the comments, "NaN" in a comparison result stands for a lane with all bits set (true).
/// </summary>
public Intro()
{
    var middleVector = Vector128.Create(1.0f);                  // middleVector = <1,1,1,1>
    middleVector = Vector128.CreateScalar(-1.0f);               // middleVector = <-1,0,0,0>

    var floatBytes = Vector64.AsByte(Vector64.Create(1.0f, -1.0f)); // floatBytes = <0, 0, 128, 63, 0, 0, 128, 191>

    if (Avx.IsSupported)
    {
        var left = Vector256.Create(-2.5f);                      // <-2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5>
        var right = Vector256.Create(5.0f);                      // <5, 5, 5, 5, 5, 5, 5, 5>

        // Even elements are subtracted, odd elements are added.
        Vector256 <float> result = Avx.AddSubtract(left, right); // result = <-7.5, 2.5, -7.5, 2.5, -7.5, 2.5, -7.5, 2.5>

        left = Vector256.Create(-1.0f, -2.0f, -3.0f, -4.0f, -50.0f, -60.0f, -70.0f, -80.0f);
        right = Vector256.Create(0.0f, 2.0f, 3.0f, 4.0f, 50.0f, 60.0f, 70.0f, 80.0f);

        // Unpack and dot product work independently on each 128-bit half.
        result = Avx.UnpackHigh(left, right);                    // result = <-3, 3, -4, 4, -70, 70, -80, 80>
        result = Avx.UnpackLow(left, right);                     // result = <-1, 0, -2, 2, -50, 50, -60, 60>
        result = Avx.DotProduct(left, right, 0b1111_0001);       // result = <-29, 0, 0, 0, -17400, 0, 0, 0>

        // TestC looks at sign bits only: true when (~first & second) has no sign bit set.
        bool testResult = Avx.TestC(left, right);                // testResult = true
        testResult = Avx.TestC(right, left);                     // testResult = false

        Vector256 <float> result1 = Avx.Divide(left, right);
        var plusOne = Vector256.Create(1.0f);
        result = Avx.Compare(right, result1, FloatComparisonMode.OrderedGreaterThanNonSignaling);
        result = Avx.Compare(right, result1, FloatComparisonMode.UnorderedNotLessThanNonSignaling);

        left = Vector256.Create(0.0f, 3.0f, -3.0f, 4.0f, -50.0f, 60.0f, -70.0f, 80.0f);
        right = Vector256.Create(0.0f, 2.0f, 3.0f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f);
        // 0 / 0 in element 0 produces NaN.
        Vector256 <float> nanInFirstPosition = Avx.Divide(left, right);

        left = Vector256.Create(1.1f, 3.3333333f, -3.0f, 4.22f, -50.0f, 60.0f, -70.0f, 80.0f);
        // 1.1 / 0 in element 0 produces +Infinity.
        Vector256 <float> InfInFirstPosition = Avx.Divide(left, right);

        left = Vector256.Create(-1.1f, 3.0f, 1.0f / 3.0f, MathF.PI, -50.0f, 60.0f, -70.0f, 80.0f);
        right = Vector256.Create(0.0f, 2.0f, 3.1f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f);

        Vector256 <float> compareResult = Avx.Compare(left, right, FloatComparisonMode.OrderedGreaterThanNonSignaling); // compareResult = <0, NaN, 0, NaN, 0, NaN, 0, NaN>
        // BlendVariable takes the element from 'right' where the mask lane is set, otherwise from 'left'.
        Vector256 <float> mixed = Avx.BlendVariable(left, right, compareResult); // mixed = <-1.1, 2, 0.333..., 2, -50, -60, -70, -80>

        //left = Vector256.Create(-1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f);
        //right = Vector256.Create(1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f);
        Vector256 <float> other = right = Vector256.Create(0.0f, 2.0f, 3.0f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f);

        bool bRes = Avx.TestZ(plusOne, compareResult);
        bool bRes2 = Avx.TestC(plusOne, compareResult);
        bool allTrue = !Avx.TestZ(compareResult, compareResult);

        // nanInFirstPosition = <NaN, 1.5, -1, 2, -1, -1, -1, -1>; only element 3 equals 'right'.
        compareResult = Avx.Compare(nanInFirstPosition, right, FloatComparisonMode.OrderedEqualNonSignaling); // compareResult = <0, 0, 0, NaN, 0, 0, 0, 0>
        compareResult = Avx.Compare(nanInFirstPosition, right, FloatComparisonMode.UnorderedEqualNonSignaling);
        compareResult = Avx.Compare(InfInFirstPosition, right, FloatComparisonMode.UnorderedNotLessThanOrEqualNonSignaling);
        compareResult = Avx.Compare(InfInFirstPosition, right, FloatComparisonMode.OrderedGreaterThanNonSignaling);

        var left128 = Vector128.Create(1.0f, 2.0f, 3.0f, 4.0f);
        var right128 = Vector128.Create(2.0f, 3.0f, 4.0f, 5.0f);
        Vector128 <float> compResult128 = Sse.CompareGreaterThan(left128, right128); // compResult128 = <0, 0, 0, 0>

        int res = Avx.MoveMask(compareResult);

        if (Fma.IsSupported)
        {
            Vector256 <float> resultFma = Fma.MultiplyAdd(left, right, other); // = left * right + other for each element
            resultFma = Fma.MultiplyAddNegated(left, right, other);            // = -(left * right) + other for each element
            resultFma = Fma.MultiplySubtract(left, right, other);              // = left * right - other for each element
            Fma.MultiplyAddSubtract(left, right, other);                       // even elements (0, 2, ...) like MultiplySubtract, odd elements like MultiplyAdd
        }

        // NOTE(review): the sample values in the comments below were not written for the current
        // 'left'/'right'. The DotProduct values match the earlier
        // left = <-1, -2, -3, -4, -50, ...>, right = <0, 2, 3, 4, 50, ...> pair; the remaining ones
        // match left = <-2.5 x 8>, right = <5 x 8>. Actual results at this point differ.
        result = Avx.DotProduct(left, right, 0b1010_0001);       // result = <-20, 0, 0, 0, -10000, 0, 0, 0>
        result = Avx.Floor(left);                                // result = <-3, -3, -3, -3, -3, -3, -3, -3>
        result = Avx.Add(left, right);                           // result = <2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5>
        result = Avx.Ceiling(left);                              // result = <-2, -2, -2, -2, -2, -2, -2, -2>
        result = Avx.Multiply(left, right);                      // result = <-12.5, -12.5, -12.5, -12.5, -12.5, -12.5, -12.5, -12.5>
        result = Avx.HorizontalAdd(left, right);                 // result = <-5, -5, 10, 10, -5, -5, 10, 10>
        result = Avx.HorizontalSubtract(left, right);            // result = <0, 0, 0, 0, 0, 0, 0, 0>

        double[] someDoubles = new double[] { 1.0, 3.0, -2.5, 7.5, 10.8, 0.33333 };
        double[] someOtherDoubles = new double[] { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
        double[] someResult = new double[someDoubles.Length];
        float[] someFloats = new float[] { 1, 2, 3, 4, 10, 20, 30, 40, 0 };
        float[] someOtherFloats = new float[] { 1, 1, 1, 1, 1, 1, 1, 1 };

        unsafe
        {
            // Loads four doubles starting at index 1 and stores them to the start of someResult.
            fixed(double *ptr = &someDoubles[1])
            {
                fixed(double *ptr2 = &someResult[0])
                {
                    Vector256 <double> res2 = Avx.LoadVector256(ptr); // res2 = <3, -2.5, 7.5, 10.8>
                    Avx.Store(ptr2, res2);
                }
            }

            // Loads the first eight floats of each array; the dot product is not stored back.
            fixed(float *ptr = &someFloats[0])
            {
                fixed(float *ptr2 = &someOtherFloats[0])
                {
                    Vector256 <float> res2 = Avx.DotProduct(Avx.LoadVector256(ptr), Avx.LoadVector256(ptr2), 0b0001_0001);
                    //Avx.Store(ptr2, res2);
                }
            }
        }
    }
}
/// <summary>
/// Updates an Adler-32 checksum with the bytes in <paramref name="buffer"/>, processing whole
/// blocks of <c>BlockSize</c> bytes with SSE instructions and handing the remainder to
/// <c>HandleLeftOver</c>.
/// </summary>
/// <param name="adler">Current checksum; the low 16 bits are s1 and the high 16 bits are s2.</param>
/// <param name="buffer">The bytes to fold into the checksum.</param>
/// <returns>The updated checksum, <c>s1 | (s2 &lt;&lt; 16)</c>.</returns>
private static unsafe uint CalculateSse(uint adler, ReadOnlySpan <byte> buffer)
{
    // Shuffle controls used for the horizontal sums.
    const byte S2301 = 0b1011_0001; // A B C D -> B A D C
    const byte S1032 = 0b0100_1110; // A B C D -> C D A B

    uint s1 = adler & 0xFFFF;
    uint s2 = (adler >> 16) & 0xFFFF;

    // Split the input into whole blocks plus a remainder.
    uint length = (uint)buffer.Length;
    uint blocks = length / BlockSize;
    length -= blocks * BlockSize;

    fixed (byte* bufferPtr = &MemoryMarshal.GetReference(buffer))
    fixed (byte* tapPtr = &MemoryMarshal.GetReference(Tap1Tap2))
    {
        byte* cursor = bufferPtr;

        // Two 16-byte weight vectors (the equivalent of _mm_setr_epi8 on x86).
        Vector128<sbyte> weights1 = Sse2.LoadVector128((sbyte*)tapPtr);
        Vector128<sbyte> weights2 = Sse2.LoadVector128((sbyte*)(tapPtr + 0x10));
        Vector128<byte> allZero = Vector128<byte>.Zero;
        Vector128<short> allOnes = Vector128.Create((short)1);

        while (blocks > 0)
        {
            // The NMAX constraint: at most NMAX data bytes can be processed
            // before s2 must be reduced modulo BASE.
            uint n = NMAX / BlockSize;
            if (blocks < n)
            {
                n = blocks;
            }

            blocks -= n;

            Vector128<uint> prefixSum = Vector128.CreateScalar(s1 * n);
            Vector128<uint> sum2 = Vector128.CreateScalar(s2);
            Vector128<uint> sum1 = Vector128<uint>.Zero;

            do
            {
                // Load 32 input bytes.
                Vector128<byte> first16 = Sse3.LoadDquVector128(cursor);
                Vector128<byte> second16 = Sse3.LoadDquVector128(cursor + 0x10);

                // Add the byte sum of the previous blocks to the prefix sum.
                prefixSum = Sse2.Add(prefixSum, sum1);

                // Horizontal byte sums feed s1; weighted multiply-adds feed s2.
                Vector128<uint> byteSum1 = Sse2.SumAbsoluteDifferences(first16, allZero).AsUInt32();
                sum1 = Sse2.Add(sum1, byteSum1);
                Vector128<short> weighted1 = Ssse3.MultiplyAddAdjacent(first16, weights1);
                sum2 = Sse2.Add(sum2, Sse2.MultiplyAddAdjacent(weighted1, allOnes).AsUInt32());

                Vector128<uint> byteSum2 = Sse2.SumAbsoluteDifferences(second16, allZero).AsUInt32();
                sum1 = Sse2.Add(sum1, byteSum2);
                Vector128<short> weighted2 = Ssse3.MultiplyAddAdjacent(second16, weights2);
                sum2 = Sse2.Add(sum2, Sse2.MultiplyAddAdjacent(weighted2, allOnes).AsUInt32());

                cursor += BlockSize;
                n--;
            }
            while (n > 0);

            sum2 = Sse2.Add(sum2, Sse2.ShiftLeftLogical(prefixSum, 5));

            // Fold the 32-bit lanes and accumulate into s1 / s2.
            sum1 = Sse2.Add(sum1, Sse2.Shuffle(sum1, S1032));
            s1 += sum1.ToScalar();

            sum2 = Sse2.Add(sum2, Sse2.Shuffle(sum2, S2301));
            sum2 = Sse2.Add(sum2, Sse2.Shuffle(sum2, S1032));
            s2 = sum2.ToScalar();

            // Reduce.
            s1 %= BASE;
            s2 %= BASE;
        }

        if (length > 0)
        {
            HandleLeftOver(cursor, length, ref s1, ref s2);
        }

        return s1 | (s2 << 16);
    }
}
/// <summary>
/// Updates an Adler-32 checksum with the bytes in <paramref name="buffer"/>. Whole 32-byte blocks
/// are processed with SSE instructions; the remaining 0..31 bytes are added with scalar code.
/// </summary>
/// <param name="adler">Current checksum; the low 16 bits are s1 and the high 16 bits are s2.</param>
/// <param name="buffer">The bytes to fold into the checksum.</param>
/// <returns>The updated checksum, <c>s1 | (s2 &lt;&lt; 16)</c>.</returns>
private static unsafe uint CalculateSse(uint adler, ReadOnlySpan <byte> buffer)
{
    uint s1 = adler & 0xFFFF;
    uint s2 = (adler >> 16) & 0xFFFF;

    // Process the data in blocks.
    const int BLOCK_SIZE = 1 << 5;

    uint length = (uint)buffer.Length;
    uint blocks = length / BLOCK_SIZE;
    length -= blocks * BLOCK_SIZE; // bytes left over after the last whole block

    fixed (byte* bufferPtr = buffer)
    fixed (byte* tapPtr = Tap1Tap2)
    {
        var localBufferPtr = bufferPtr;

        // _mm_setr_epi8 on x86
        Vector128<sbyte> tap1 = Sse2.LoadVector128((sbyte*)tapPtr);
        Vector128<sbyte> tap2 = Sse2.LoadVector128((sbyte*)(tapPtr + 0x10));
        Vector128<byte> zero = Vector128<byte>.Zero;
        var ones = Vector128.Create((short)1);

        while (blocks > 0)
        {
            uint n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
            if (n > blocks)
            {
                n = blocks;
            }

            blocks -= n;

            // Process n blocks of data. At most NMAX data bytes can be
            // processed before s2 must be reduced modulo BASE.
            Vector128<uint> v_ps = Vector128.CreateScalar(s1 * n);
            Vector128<uint> v_s2 = Vector128.CreateScalar(s2);
            Vector128<uint> v_s1 = Vector128<uint>.Zero;

            do
            {
                // Load 32 input bytes.
                Vector128<byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr);
                Vector128<byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 0x10);

                // Add previous block byte sum to v_ps.
                v_ps = Sse2.Add(v_ps, v_s1);

                // Horizontally add the bytes for s1, multiply-adds the
                // bytes by [ 32, 31, 30, ... ] for s2.
                v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32());
                Vector128<short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones).AsUInt32());

                v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32());
                Vector128<short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones).AsUInt32());

                localBufferPtr += BLOCK_SIZE;
            }
            while (--n > 0);

            // The prefix sums are weighted by the block size (32 = 1 << 5).
            v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));

            // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
            const byte S2301 = 0b1011_0001; // A B C D -> B A D C
            const byte S1032 = 0b0100_1110; // A B C D -> C D A B

            // SumAbsoluteDifferences leaves its two partial sums in lanes 0 and 2,
            // so a single swap of the 64-bit halves is enough for v_s1.
            v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));
            s1 += v_s1.ToScalar();

            v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
            v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));
            s2 = v_s2.ToScalar();

            // Reduce.
            s1 %= BASE;
            s2 %= BASE;
        }

        if (length > 0)
        {
            // Scalar tail: one unrolled run of 16 bytes, then byte by byte.
            if (length >= 16)
            {
                s2 += s1 += localBufferPtr[0];
                s2 += s1 += localBufferPtr[1];
                s2 += s1 += localBufferPtr[2];
                s2 += s1 += localBufferPtr[3];
                s2 += s1 += localBufferPtr[4];
                s2 += s1 += localBufferPtr[5];
                s2 += s1 += localBufferPtr[6];
                s2 += s1 += localBufferPtr[7];
                s2 += s1 += localBufferPtr[8];
                s2 += s1 += localBufferPtr[9];
                s2 += s1 += localBufferPtr[10];
                s2 += s1 += localBufferPtr[11];
                s2 += s1 += localBufferPtr[12];
                s2 += s1 += localBufferPtr[13];
                s2 += s1 += localBufferPtr[14];
                s2 += s1 += localBufferPtr[15];

                localBufferPtr += 16;
                length -= 16;
            }

            while (length-- > 0)
            {
                s2 += s1 += *localBufferPtr++;
            }

            // s1 is reduced with a single conditional subtraction; s2 with a full modulo.
            if (s1 >= BASE)
            {
                s1 -= BASE;
            }

            s2 %= BASE;
        }

        return s1 | (s2 << 16);
    }
}
// Based on https://github.com/chromium/chromium/blob/master/third_party/zlib/adler32_simd.c #if !NETSTANDARD2_0 && !NETSTANDARD2_1 private static unsafe uint CalculateSse(uint adler, ReadOnlySpan <byte> buffer) { uint s1 = adler & 0xFFFF; uint s2 = (adler >> 16) & 0xFFFF; // Process the data in blocks. const int BLOCK_SIZE = 1 << 5; uint length = (uint)buffer.Length; uint blocks = length / BLOCK_SIZE; length -= blocks * BLOCK_SIZE; int index = 0; fixed(byte *bufferPtr = &buffer[0]) { index += (int)blocks * BLOCK_SIZE; var localBufferPtr = bufferPtr; // _mm_setr_epi8 on x86 var tap1 = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17); var tap2 = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); Vector128 <byte> zero = Vector128 <byte> .Zero; var ones = Vector128.Create((short)1); while (blocks > 0) { uint n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */ if (n > blocks) { n = blocks; } blocks -= n; // Process n blocks of data. At most NMAX data bytes can be // processed before s2 must be reduced modulo BASE. Vector128 <int> v_ps = Vector128.CreateScalar(s1 * n).AsInt32(); Vector128 <int> v_s2 = Vector128.CreateScalar(s2).AsInt32(); Vector128 <int> v_s1 = Vector128 <int> .Zero; do { // Load 32 input bytes. Vector128 <byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr); Vector128 <byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 16); // Add previous block byte sum to v_ps. v_ps = Sse2.Add(v_ps, v_s1); // Horizontally add the bytes for s1, multiply-adds the // bytes by [ 32, 31, 30, ... ] for s2. 
v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsInt32()); Vector128 <short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1); v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones)); v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsInt32()); Vector128 <short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2); v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones)); localBufferPtr += BLOCK_SIZE; }while (--n > 0); v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5)); // Sum epi32 ints v_s1(s2) and accumulate in s1(s2). const byte S2301 = 0b1011_0001; // A B C D -> B A D C const byte S1032 = 0b0100_1110; // A B C D -> C D A B v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S2301)); v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032)); s1 += (uint)v_s1.ToScalar(); v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301)); v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032)); s2 = (uint)v_s2.ToScalar(); // Reduce. s1 %= BASE; s2 %= BASE; } } ref byte bufferRef = ref MemoryMarshal.GetReference(buffer);