/// <summary>
/// Basic scenario using an aligned load: reads the input vector with
/// <c>Sse2.LoadAlignedVector128</c>, widens it with <c>Sse41.ConvertToVector128Int16</c>,
/// writes the result to the output buffer and validates it against the input.
/// </summary>
public void RunBasicScenario_LoadAligned()
{
    // Step 1: aligned load of the signed-byte input.
    Vector128<SByte> source = Sse2.LoadAlignedVector128((SByte*)(_dataTable.inArrayPtr));

    // Step 2: the operation under test.
    Vector128<short> widened = Sse41.ConvertToVector128Int16(source);

    // Step 3: publish the result and check it.
    Unsafe.Write(_dataTable.outArrayPtr, widened);
    ValidateResult(_dataTable.inArrayPtr, _dataTable.outArrayPtr);
}
/// <summary>
/// Class-variable scenario: feeds the <c>_clsVar</c> field directly to
/// <c>Sse41.ConvertToVector128Int16</c>, writes the result to the output buffer and
/// validates it against that same field.
/// </summary>
public void RunClsVarScenario()
{
    // The operation under test, taking its operand from the class-level field.
    Vector128<short> widened = Sse41.ConvertToVector128Int16(_clsVar);

    Unsafe.Write(_dataTable.outArrayPtr, widened);
    ValidateResult(_clsVar, _dataTable.outArrayPtr);
}
/// <summary>
/// Basic scenario using <c>Unsafe.Read</c>: reads the input vector from the input buffer,
/// widens it with <c>Sse41.ConvertToVector128Int16</c>, writes the result to the output
/// buffer and validates it against the input.
/// </summary>
public void RunBasicScenario_UnsafeRead()
{
    // Step 1: unaligned-safe read of the signed-byte input.
    Vector128<SByte> source = Unsafe.Read<Vector128<SByte>>(_dataTable.inArrayPtr);

    // Step 2: the operation under test.
    Vector128<short> widened = Sse41.ConvertToVector128Int16(source);

    // Step 3: publish the result and check it.
    Unsafe.Write(_dataTable.outArrayPtr, widened);
    ValidateResult(_dataTable.inArrayPtr, _dataTable.outArrayPtr);
}
/// <summary>
/// Basic scenario using the pointer overload: passes the input buffer to
/// <c>Sse41.ConvertToVector128Int16</c> as a <c>Byte*</c>, writes the result to the output
/// buffer and validates it against the input.
/// </summary>
public void RunBasicScenario_Ptr()
{
    // The pointer overload loads straight from memory, so no explicit vector load is needed.
    Byte* source = (Byte*)_dataTable.inArrayPtr;
    Vector128<short> widened = Sse41.ConvertToVector128Int16(source);

    Unsafe.Write(_dataTable.outArrayPtr, widened);
    ValidateResult(_dataTable.inArrayPtr, _dataTable.outArrayPtr);
}
/// <summary>
/// Scratch method that runs a series of widening conversions and multiply / fused
/// multiply-add intrinsics over the leading bytes of <paramref name="vs"/>. Every result is
/// held in a local only (presumably for inspection in a debugger); nothing is returned or
/// written back.
/// </summary>
/// <param name="vs">
/// Input bytes. The method reads bytes [0, 64); arrays shorter than that are ignored so the
/// vector loads never run past the end of the array.
/// </param>
private unsafe void TestAddSum(byte[] vs)
{
    // The two 256-bit loads below cover bytes [0, 32) and [32, 64). Without this guard a
    // shorter array is read out of bounds, and an empty array pins to a null pointer.
    int requiredBytes = 2 * Vector256<byte>.Count;
    if (vs.Length < requiredBytes)
    {
        return;
    }

    fixed (byte* p = vs)
    {
        var v = Avx.LoadVector256(p);
        var v2 = Avx.LoadVector256(p + 32);
        //Avx.MultipleSumAbsoluteDifferences;

        // byte -> int -> float, then squared.
        Vector256<int> i1 = Avx2.ConvertToVector256Int32(p);
        Vector256<float> f1 = Avx.ConvertToVector256Single(i1);
        Vector256<float> m1 = Avx.Multiply(f1, f1);

        // byte -> int -> double, squared through FMA with a zero addend.
        Vector128<int> i128 = Sse41.ConvertToVector128Int32(p);
        Vector256<double> d256 = Avx.ConvertToVector256Double(i128);
        var dZero = Vector256<double>.Zero;
        Vector256<double> ma1 = Fma.MultiplyAdd(d256, d256, dZero);

        // Same idea in single precision.
        var i256 = Avx2.ConvertToVector256Int32(p);
        var f256 = Avx.ConvertToVector256Single(i256);
        var fZero = Vector256<float>.Zero;
        var ma2 = Fma.MultiplyAdd(f256, f256, fZero);

        // Scalar multiply: only the lowest lane is multiplied.
        Vector128<float> s128 = Sse2.ConvertToVector128Single(i128);
        Vector128<float> ms = Sse.MultiplyScalar(s128, s128);

        // x86 / x64 SIMD instruction list (SSE to AVX2)
        //https://www.officedaytime.com/tips/simd.html
        // pmaddwd
        //https://www.officedaytime.com/tips/simdimg/si.php?f=pmaddwd
        Vector128<short> sh128 = Sse41.ConvertToVector128Int16(p);
        Vector128<int> vv3 = Avx.MultiplyAddAdjacent(sh128, sh128);

        var neko = 0;

        // Candidate APIs noted by the author:
        //Avx.MultiplyAddAdjacent;
        //Avx.MultiplyHigh;
        //Avx.MultiplyHighRoundScale;
        //Avx.MultiplyLow;
        //Avx.MultiplyScalar;
        //Fma.MultiplyAdd;
        //Fma.MultiplyAddNegated;
        //Fma.MultiplyAddNegatedScalar;
        //Fma.MultiplyAddScalar;
        //Fma.MultiplyAddSubtract;
        //Fma.MultiplySubtract;
        //Fma.MultiplySubtractAdd;
        //Fma.MultiplySubtractNegated;
        //Fma.MultiplySubtractNegatedScalar;
        //Fma.MultiplySubtractScalar;
    }
}
// Variant of the multi-threaded MultiplyAddAdjacent sum:
// the array is split into ranges small enough that the Vector128<int> accumulator
// cannot overflow.
// Intrinsics SSE2 MultiplyAddAdjacent
/// <summary>
/// Returns the sum of the squares of all bytes in <paramref name="vs"/>, computed in
/// parallel over fixed-size ranges with SSE4.1 widening and SSE2 <c>MultiplyAddAdjacent</c>.
/// </summary>
/// <param name="vs">Bytes to process. An empty array yields 0.</param>
/// <returns>The sum of <c>vs[i] * vs[i]</c> over the whole array.</returns>
private unsafe long Test26_Intrinsics_SSE2_MultiplyAddAdjacent_int_MT_Kai(byte[] vs)
{
    // Partitioner.Create rejects an empty range, and the sum over nothing is 0.
    if (vs.Length == 0)
    {
        return 0;
    }

    long total = 0;
    int simdLength = Vector128<short>.Count; //8

    // Largest range the Vector128<int> accumulator can take without overflowing:
    //   int.MaxValue / (byte.MaxValue * byte.MaxValue) * Vector128<int>.Count
    //   = 2147483647 / (255 * 255) * 4 = 33025 * 4 = 132100 (integer division).
    // Each 8-byte step adds two squared bytes to every lane, so a full range adds at most
    // 16512 * 2 * 65025 = 2147385600 per lane, which still fits in an int.
    int rangeSize = int.MaxValue / (byte.MaxValue * byte.MaxValue) * Vector128<int>.Count;

    Parallel.ForEach(Partitioner.Create(0, vs.Length, rangeSize), (range) =>
    {
        long subtotal = 0;

        // End of the part of this range that divides evenly into 8-byte SIMD steps.
        int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;

        Vector128<int> vTotal = Vector128<int>.Zero; // per-range accumulator
        fixed (byte* p = vs)
        {
            for (int i = range.Item1; i < lastIndex; i += simdLength)
            {
                // Zero-extend 8 bytes to shorts, then square and add adjacent pairs.
                Vector128<short> v = Sse41.ConvertToVector128Int16(p + i);
                Vector128<int> vv = Sse2.MultiplyAddAdjacent(v, v); // short * short + short * short
                vTotal = Sse2.Add(vTotal, vv);
            }
        }

        // Horizontal sum of the four accumulator lanes.
        int* pp = stackalloc int[Vector128<int>.Count];
        Sse2.Store(pp, vTotal);
        for (int i = 0; i < Vector128<int>.Count; i++)
        {
            subtotal += pp[i];
        }

        // Scalar tail for the elements that did not fill a whole SIMD step.
        for (int i = lastIndex; i < range.Item2; i++)
        {
            subtotal += vs[i] * vs[i];
        }

        System.Threading.Interlocked.Add(ref total, subtotal);
    });

    return total;
}
/// <summary>
/// SSE4.1 horizontal filter pass. For every row it produces four output bytes at a time:
/// four overlapping 8-byte source windows (starting at consecutive bytes) are widened to
/// 16 bits, combined with one 8-coefficient filter, rounded, packed and stored to
/// <paramref name="dst"/>.
/// </summary>
/// <param name="src">Source pixels; moved back by <c>SubpelTaps / 2 - 1</c> bytes before use.</param>
/// <param name="srcStride">Byte distance between consecutive source rows.</param>
/// <param name="dst">Destination pixels.</param>
/// <param name="dstStride">Byte distance between consecutive destination rows.</param>
/// <param name="xFilters">Filter table; the entry is selected by <c>x0Q4 &amp; SubpelMask</c>.</param>
/// <param name="x0Q4">Start position; the low bits select the filter, the bits above <c>SubpelBits</c> give the source column offset.</param>
/// <param name="w">Number of output columns. Columns are written in groups of four, so a row may be written up to the next multiple of four.</param>
/// <param name="h">Number of rows.</param>
private static unsafe void ConvolveHorizSse41(
    byte* src,
    int srcStride,
    byte* dst,
    int dstStride,
    Array8<short>[] xFilters,
    int x0Q4,
    int w,
    int h)
{
    Vector128<int> zero = Vector128<int>.Zero;
    Vector128<int> const64 = Vector128.Create(64);

    src -= SubpelTaps / 2 - 1;

    fixed (Array8<short>* xFilter = xFilters)
    {
        // Every row and column uses the same filter, so load its 8 coefficients once.
        short* kernelTaps = (short*)xFilter + (uint)(x0Q4 & SubpelMask) * 8;
        Vector128<short> vfilter = Sse2.LoadVector128(kernelTaps);

        // The integer part of the start position does not change from row to row.
        ulong srcOffset = (uint)x0Q4 >> SubpelBits;
        ulong rowCount = (uint)h;
        ulong colCount = (uint)w;

        for (ulong row = 0; row < rowCount; ++row)
        {
            for (ulong col = 0; col < colCount; col += 4)
            {
                byte* window = src + srcOffset + col;

                // Four windows, each shifted by one byte, one per output pixel.
                Vector128<short> pix0 = Sse41.ConvertToVector128Int16(window);
                Vector128<short> pix1 = Sse41.ConvertToVector128Int16(window + 1);
                Vector128<short> pix2 = Sse41.ConvertToVector128Int16(window + 2);
                Vector128<short> pix3 = Sse41.ConvertToVector128Int16(window + 3);

                Vector128<int> sum0123 = MultiplyAddAdjacent(pix0, pix1, pix2, pix3, vfilter, zero);
                Vector128<int> rounded = RoundShift(sum0123, const64);

                // The low 32 bits of the packed result hold the four output bytes.
                Sse.StoreScalar((float*)(dst + col), PackUnsignedSaturate(rounded, zero).AsSingle());
            }

            src += srcStride;
            dst += dstStride;
        }
    }
}
// Multi-threaded version of the single-threaded MultiplyAddAdjacent sum.
// Supports at most 1_056_831 elements (on an 8-thread CPU).
// Intrinsics SSE2 MultiplyAddAdjacent
/// <summary>
/// Returns the sum of the squares of all bytes in <paramref name="vs"/>, splitting the array
/// into one range per logical processor and summing each range with SSE4.1 widening and
/// SSE2 <c>MultiplyAddAdjacent</c>.
/// </summary>
/// <remarks>
/// Each range accumulates into a <c>Vector128&lt;int&gt;</c>, so a range that is too long can
/// overflow that accumulator; the original author puts the safe total at 1_056_831 elements
/// on an 8-thread CPU. That limit is unchanged here.
/// </remarks>
/// <param name="vs">Bytes to process. An empty array yields 0.</param>
/// <returns>The sum of <c>vs[i] * vs[i]</c> over the whole array.</returns>
private unsafe long Test16_Intrinsics_SSE2_MultiplyAddAdjacent_int_MT(byte[] vs)
{
    // Partitioner.Create rejects an empty range, and the sum over nothing is 0.
    if (vs.Length == 0)
    {
        return 0;
    }

    long total = 0;
    int simdLength = Vector128<short>.Count; //8

    // With fewer elements than processors the division yields 0, which Partitioner.Create
    // rejects as a range size; fall back to ranges of one element.
    int rangeSize = Math.Max(1, vs.Length / Environment.ProcessorCount);

    Parallel.ForEach(Partitioner.Create(0, vs.Length, rangeSize), (range) =>
    {
        long subtotal = 0;

        // End of the part of this range that divides evenly into 8-byte SIMD steps.
        int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;

        Vector128<int> vTotal = Vector128<int>.Zero;
        fixed (byte* p = vs)
        {
            for (int i = range.Item1; i < lastIndex; i += simdLength)
            {
                // Zero-extend 8 bytes to shorts, then square and add adjacent pairs.
                Vector128<short> v = Sse41.ConvertToVector128Int16(p + i);
                Vector128<int> vv = Sse2.MultiplyAddAdjacent(v, v); // short * short + short * short
                vTotal = Sse2.Add(vTotal, vv);
            }
        }

        // Horizontal sum of the four accumulator lanes.
        int* pp = stackalloc int[Vector128<int>.Count];
        Sse2.Store(pp, vTotal);
        for (int i = 0; i < Vector128<int>.Count; i++)
        {
            subtotal += pp[i];
        }

        // Scalar tail for the elements that did not fill a whole SIMD step.
        for (int i = lastIndex; i < range.Item2; i++)
        {
            subtotal += vs[i] * vs[i];
        }

        System.Threading.Interlocked.Add(ref total, subtotal);
    });

    return total;
}
/// <summary>
/// Reads a two-plane input surface and expands it into a new <see cref="Surface"/>.
/// Plane 0 (<c>Buffer0</c>) supplies one byte per pixel, written to the R channel. Plane 1
/// (<c>Buffer1</c>) supplies byte pairs written to G and B; each pair is shared by two
/// horizontally adjacent pixels and by two consecutive rows. Alpha is set to opaque.
/// </summary>
/// <param name="rm">Supplies the memory manager used for reading and the pool the output is allocated from.</param>
/// <param name="config">Slot surface configuration, forwarded to <c>ReadSurface</c>.</param>
/// <param name="offsets">Plane offsets, forwarded to <c>ReadSurface</c>.</param>
/// <returns>The converted surface, <c>input.Width</c> by <c>input.Height</c> pixels.</returns>
private unsafe static Surface ReadNv12(ResourceManager rm, ref SlotSurfaceConfig config, ref PlaneOffsets offsets)
{
    InputSurface input = ReadSurface(rm.Gmm, ref config, ref offsets, 1, 2);

    int width = input.Width;
    int height = input.Height;

    int yStride = GetPitch(width, 1);
    int uvStride = GetPitch(input.UvWidth, 2);

    Surface output = new Surface(rm.SurfacePool, width, height);

    if (Sse41.IsSupported)
    {
        // Per 4-byte group (Y, 0, c0, c1) -> (Y, c0, c1, 0): moves the zero byte into the alpha slot.
        Vector128<byte> shufMask = Vector128.Create(
            (byte)0, (byte)2, (byte)3, (byte)1,
            (byte)4, (byte)6, (byte)7, (byte)5,
            (byte)8, (byte)10, (byte)11, (byte)9,
            (byte)12, (byte)14, (byte)15, (byte)13);

        // 0xff in the fourth 16-bit element of each 64-bit half, i.e. the alpha channel.
        Vector128<short> alphaMask = Vector128.Create(0xffUL << 48).AsInt16();

        int yStrideGap = yStride - width;
        int widthTrunc = width & ~0xf;

        fixed (Pixel* dstPtr = output.Data)
        {
            Pixel* op = dstPtr;

            fixed (byte* src0Ptr = input.Buffer0, src1Ptr = input.Buffer1)
            {
                byte* i0p = src0Ptr;

                for (int y = 0; y < height; y++)
                {
                    // Two consecutive rows share one row of plane 1.
                    byte* i1p = src1Ptr + (y >> 1) * uvStride;

                    int x = 0;

                    // Main loop: 16 pixels per iteration.
                    for (; x < widthTrunc; x += 16, i0p += 16, i1p += 16)
                    {
                        // 16 plane-0 bytes, zero-extended to 16 bits.
                        Vector128<short> ya0 = Sse41.ConvertToVector128Int16(i0p);
                        Vector128<short> ya1 = Sse41.ConvertToVector128Int16(i0p + 8);

                        // 8 plane-1 byte pairs; each pair is duplicated so that two
                        // neighbouring pixels receive the same pair.
                        Vector128<byte> uv = Sse2.LoadVector128(i1p);

                        Vector128<short> uv0 = Sse2.UnpackLow(uv.AsInt16(), uv.AsInt16());
                        Vector128<short> uv1 = Sse2.UnpackHigh(uv.AsInt16(), uv.AsInt16());

                        // Interleave to 4 bytes per pixel: (Y, 0, c0, c1).
                        Vector128<short> rgba0 = Sse2.UnpackLow(ya0, uv0);
                        Vector128<short> rgba1 = Sse2.UnpackHigh(ya0, uv0);
                        Vector128<short> rgba2 = Sse2.UnpackLow(ya1, uv1);
                        Vector128<short> rgba3 = Sse2.UnpackHigh(ya1, uv1);

                        rgba0 = Ssse3.Shuffle(rgba0.AsByte(), shufMask).AsInt16();
                        rgba1 = Ssse3.Shuffle(rgba1.AsByte(), shufMask).AsInt16();
                        rgba2 = Ssse3.Shuffle(rgba2.AsByte(), shufMask).AsInt16();
                        rgba3 = Ssse3.Shuffle(rgba3.AsByte(), shufMask).AsInt16();

                        // Widen each channel to 16 bits: two pixels per vector.
                        // NOTE(review): HighToLow presumably moves the upper 8 bytes into the lower half - confirm.
                        Vector128<short> rgba16_0 = Sse41.ConvertToVector128Int16(rgba0.AsByte());
                        Vector128<short> rgba16_1 = Sse41.ConvertToVector128Int16(HighToLow(rgba0.AsByte()));
                        Vector128<short> rgba16_2 = Sse41.ConvertToVector128Int16(rgba1.AsByte());
                        Vector128<short> rgba16_3 = Sse41.ConvertToVector128Int16(HighToLow(rgba1.AsByte()));
                        Vector128<short> rgba16_4 = Sse41.ConvertToVector128Int16(rgba2.AsByte());
                        Vector128<short> rgba16_5 = Sse41.ConvertToVector128Int16(HighToLow(rgba2.AsByte()));
                        Vector128<short> rgba16_6 = Sse41.ConvertToVector128Int16(rgba3.AsByte());
                        Vector128<short> rgba16_7 = Sse41.ConvertToVector128Int16(HighToLow(rgba3.AsByte()));

                        rgba16_0 = Sse2.Or(rgba16_0, alphaMask);
                        rgba16_1 = Sse2.Or(rgba16_1, alphaMask);
                        rgba16_2 = Sse2.Or(rgba16_2, alphaMask);
                        rgba16_3 = Sse2.Or(rgba16_3, alphaMask);
                        rgba16_4 = Sse2.Or(rgba16_4, alphaMask);
                        rgba16_5 = Sse2.Or(rgba16_5, alphaMask);
                        rgba16_6 = Sse2.Or(rgba16_6, alphaMask);
                        rgba16_7 = Sse2.Or(rgba16_7, alphaMask);

                        // Scale every channel by 4 (shift left by 2 bits).
                        rgba16_0 = Sse2.ShiftLeftLogical(rgba16_0, 2);
                        rgba16_1 = Sse2.ShiftLeftLogical(rgba16_1, 2);
                        rgba16_2 = Sse2.ShiftLeftLogical(rgba16_2, 2);
                        rgba16_3 = Sse2.ShiftLeftLogical(rgba16_3, 2);
                        rgba16_4 = Sse2.ShiftLeftLogical(rgba16_4, 2);
                        rgba16_5 = Sse2.ShiftLeftLogical(rgba16_5, 2);
                        rgba16_6 = Sse2.ShiftLeftLogical(rgba16_6, 2);
                        rgba16_7 = Sse2.ShiftLeftLogical(rgba16_7, 2);

                        Sse2.Store((short*)(op + (uint)x + 0), rgba16_0);
                        Sse2.Store((short*)(op + (uint)x + 2), rgba16_1);
                        Sse2.Store((short*)(op + (uint)x + 4), rgba16_2);
                        Sse2.Store((short*)(op + (uint)x + 6), rgba16_3);
                        Sse2.Store((short*)(op + (uint)x + 8), rgba16_4);
                        Sse2.Store((short*)(op + (uint)x + 10), rgba16_5);
                        Sse2.Store((short*)(op + (uint)x + 12), rgba16_6);
                        Sse2.Store((short*)(op + (uint)x + 14), rgba16_7);
                    }

                    // Scalar tail for the last (width % 16) pixels of the row.
                    for (; x < width; x++)
                    {
                        Pixel* px = op + (uint)x;

                        px->R = Upsample(*i0p++);
                        px->G = Upsample(*i1p);
                        px->B = Upsample(*(i1p + 1));
                        px->A = 0x3ff;

                        // Pixels 2k and 2k+1 share one plane-1 pair, exactly as in the SIMD loop
                        // and the scalar fallback, so step to the next pair only after the odd
                        // pixel. (Stepping after the even pixel pairs the wrong pixels and can
                        // read one pair past the end of the row.)
                        i1p += (x & 1) * 2;
                    }

                    op += width;
                    i0p += yStrideGap;
                }
            }
        }
    }
    else
    {
        for (int y = 0; y < height; y++)
        {
            int uvBase = (y >> 1) * uvStride;

            for (int x = 0; x < width; x++)
            {
                output.SetR(x, y, Upsample(input.Buffer0[y * yStride + x]));

                // x & ~1: both pixels of a pair read the same plane-1 bytes.
                int uvOffs = uvBase + (x & ~1);

                output.SetG(x, y, Upsample(input.Buffer1[uvOffs]));
                output.SetB(x, y, Upsample(input.Buffer1[uvOffs + 1]));
                output.SetA(x, y, 0x3ff);
            }
        }
    }

    return output;
}
/// <summary>
/// Loads 8 bytes from <paramref name="address"/> and zero-extends each one to a 16-bit lane.
/// </summary>
/// <param name="address">Pointer to at least 8 readable bytes.</param>
/// <returns>A vector whose eight 16-bit elements hold the widened bytes, in order.</returns>
public static unsafe Vector128<short> xmm(byte* address)
    => Sse41.ConvertToVector128Int16(address);
/// <summary>
/// Zero-extends the low eight bytes of <paramref name="value"/> to eight 16-bit lanes.
/// Named after the C intrinsic it mirrors.
/// </summary>
/// <param name="value">Source vector; only its low 8 bytes are used.</param>
/// <returns>A vector whose eight 16-bit elements hold the widened bytes, in order.</returns>
public static Vector128<short> _mm_cvtepu8_epi16(Vector128<byte> value)
    => Sse41.ConvertToVector128Int16(value);
/// <summary>
/// Builds an 8-character string for <paramref name="value"/>: the 8 bytes produced by
/// <c>IntToUtf8_8</c> are zero-extended to 16-bit characters and written directly into a
/// freshly allocated string.
/// </summary>
/// <param name="value">The integer to format.</param>
/// <returns>The 8-character string.</returns>
public unsafe string IntToString(int value)
{
    var str = AsciiInterface.FastAllocateString(8);
    var vector = IntToUtf8_8(value);

    // View the string's character storage as one 128-bit vector (8 chars * 2 bytes).
    ref char firstChar = ref Unsafe.AsRef(in str.GetPinnableReference());
    ref Vector128<short> chars = ref Unsafe.As<char, Vector128<short>>(ref firstChar);

    // Widen the 8 bytes held in the local to 8 UTF-16 code units and store them in one go.
    Vector128<short> widened = Sse41.ConvertToVector128Int16((byte*)&vector);
    chars = widened;

    return str;
}