/// <summary>
/// Reads both operands into locals with <c>Unsafe.Read</c>, packs them with
/// <c>Sse41.PackUnsignedSaturate</c>, stores the result in the output buffer and validates it.
/// </summary>
public void RunLclVarScenario_UnsafeRead()
{
    Vector128<int> firstOperand = Unsafe.Read<Vector128<int>>(_dataTable.inArray1Ptr);
    Vector128<int> secondOperand = Unsafe.Read<Vector128<int>>(_dataTable.inArray2Ptr);

    Vector128<ushort> packed = Sse41.PackUnsignedSaturate(firstOperand, secondOperand);

    Unsafe.Write(_dataTable.outArrayPtr, packed);
    ValidateResult(firstOperand, secondOperand, _dataTable.outArrayPtr);
}
/// <summary>
/// Packs the two vector fields of a freshly constructed test-class instance held in a local,
/// stores the result in the output buffer and validates it.
/// </summary>
public void RunClassLclFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario));

    HorizontalBinaryOpTest__PackUnsignedSaturateUInt16 instance = new HorizontalBinaryOpTest__PackUnsignedSaturateUInt16();
    Vector128<ushort> packed = Sse41.PackUnsignedSaturate(instance._fld1, instance._fld2);

    Unsafe.Write(_dataTable.outArrayPtr, packed);
    ValidateResult(instance._fld1, instance._fld2, _dataTable.outArrayPtr);
}
/// <summary>
/// Packs the two vector fields of a <c>TestStruct</c> held in a local,
/// stores the result in the output buffer and validates it.
/// </summary>
public void RunStructLclFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario));

    var fixture = TestStruct.Create();
    Vector128<ushort> packed = Sse41.PackUnsignedSaturate(fixture._fld1, fixture._fld2);

    Unsafe.Write(_dataTable.outArrayPtr, packed);
    ValidateResult(fixture._fld1, fixture._fld2, _dataTable.outArrayPtr);
}
/// <summary>
/// Packs the two class-level vector variables, stores the result in the output buffer
/// and validates it against the same variables.
/// </summary>
public void RunClsVarScenario()
{
    Vector128<ushort> packed = Sse41.PackUnsignedSaturate(_clsVar1, _clsVar2);

    Unsafe.Write(_dataTable.outArrayPtr, packed);
    ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
}
/// <summary>
/// Loads both operands with <c>Sse2.LoadAlignedVector128</c> directly as call arguments,
/// packs them, stores the result in the output buffer and validates it against the input buffers.
/// </summary>
public void RunBasicScenario_LoadAligned()
{
    // The loads stay inline as arguments; no operand locals are introduced in this scenario.
    Vector128<ushort> packed = Sse41.PackUnsignedSaturate(
        Sse2.LoadAlignedVector128((int*)_dataTable.inArray1Ptr),
        Sse2.LoadAlignedVector128((int*)_dataTable.inArray2Ptr));

    Unsafe.Write(_dataTable.outArrayPtr, packed);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
}
/// <summary>
/// Reads both operands with <c>Unsafe.Read</c> directly as call arguments, packs them,
/// stores the result in the output buffer and validates it against the input buffers.
/// </summary>
public void RunBasicScenario_UnsafeRead()
{
    // The reads stay inline as arguments; no operand locals are introduced in this scenario.
    Vector128<ushort> packed = Sse41.PackUnsignedSaturate(
        Unsafe.Read<Vector128<int>>(_dataTable.inArray1Ptr),
        Unsafe.Read<Vector128<int>>(_dataTable.inArray2Ptr));

    Unsafe.Write(_dataTable.outArrayPtr, packed);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
}
/// <summary>
/// Loads both operands into locals with <c>Sse2.LoadAlignedVector128</c>, packs them,
/// stores the result in the output buffer and validates it against the locals.
/// </summary>
public void RunLclVarScenario_LoadAligned()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_LoadAligned));

    Vector128<int> firstOperand = Sse2.LoadAlignedVector128((int*)_dataTable.inArray1Ptr);
    Vector128<int> secondOperand = Sse2.LoadAlignedVector128((int*)_dataTable.inArray2Ptr);

    Vector128<ushort> packed = Sse41.PackUnsignedSaturate(firstOperand, secondOperand);

    Unsafe.Write(_dataTable.outArrayPtr, packed);
    ValidateResult(firstOperand, secondOperand, _dataTable.outArrayPtr);
}
/// <summary>
/// Reads both operands into locals with <c>Unsafe.Read</c>, packs them,
/// stores the result in the output buffer and validates it against the locals.
/// </summary>
public void RunLclVarScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

    Vector128<int> firstOperand = Unsafe.Read<Vector128<int>>(_dataTable.inArray1Ptr);
    Vector128<int> secondOperand = Unsafe.Read<Vector128<int>>(_dataTable.inArray2Ptr);

    Vector128<ushort> packed = Sse41.PackUnsignedSaturate(firstOperand, secondOperand);

    Unsafe.Write(_dataTable.outArrayPtr, packed);
    ValidateResult(firstOperand, secondOperand, _dataTable.outArrayPtr);
}
/// <summary>
/// Loads 16 bytes from <paramref name="chars"/>, subtracts the character '0' from each,
/// and folds neighbouring lanes together in three multiply-add steps
/// (using <c>mul_1_10</c>, <c>mul_1_100</c> and <c>mul_1_10000</c>, which are defined elsewhere).
/// </summary>
/// <param name="chars">Pointer to the bytes to parse; a full 128-bit vector is read from it.</param>
/// <returns>The lowest 32-bit lane of the final vector, covering the first eight digits.</returns>
private static uint32_t parse_eight_digits_unrolled(bytechar *chars)
{
    // All 16 lanes are processed although only eight digits are wanted, so half the work is wasted.
    Vector128<sbyte> zeroChar = Vector128.Create((bytechar)'0');
    Vector128<sbyte> digits = Sse2.Subtract(Sse2.LoadVector128(chars), zeroChar);

    Vector128<short> pairs = Ssse3.MultiplyAddAdjacent(digits.AsByte(), mul_1_10);
    Vector128<int> quads = Sse2.MultiplyAddAdjacent(pairs, mul_1_100);

    // Narrow the 32-bit partial results to 16 bits so they can go through one more multiply-add.
    Vector128<ushort> narrowedQuads = Sse41.PackUnsignedSaturate(quads, quads);
    Vector128<int> octets = Sse2.MultiplyAddAdjacent(narrowedQuads.AsInt16(), mul_1_10000);

    // Only the first lane (the sum for the first eight digits) is returned; the rest is dropped.
    return Sse2.ConvertToUInt32(octets.AsUInt32());
}
/// <summary>
/// Loads both operands with <c>Sse2.LoadVector128</c> directly as call arguments, packs them,
/// stores the result in the output buffer and validates it against the input buffers.
/// </summary>
public void RunBasicScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_Load));

    // The loads stay inline as arguments; no operand locals are introduced in this scenario.
    Vector128<ushort> packed = Sse41.PackUnsignedSaturate(
        Sse2.LoadVector128((int*)_dataTable.inArray1Ptr),
        Sse2.LoadVector128((int*)_dataTable.inArray2Ptr));

    Unsafe.Write(_dataTable.outArrayPtr, packed);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
}
/// <summary>
/// Packs the two class-level vector variables, stores the result in the output buffer
/// and validates it against the same variables.
/// </summary>
public void RunClsVarScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario));

    Vector128<ushort> packed = Sse41.PackUnsignedSaturate(_clsVar1, _clsVar2);

    Unsafe.Write(_dataTable.outArrayPtr, packed);
    ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
}
/// <summary>
/// Loads the two vector fields of a local <c>TestStruct</c> through their addresses with
/// <c>Sse2.LoadVector128</c>, packs them, stores the result in the output buffer and validates it.
/// </summary>
public void RunStructLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario_Load));

    var fixture = TestStruct.Create();

    // The fields are read through pointers to the local rather than by value.
    Vector128<ushort> packed = Sse41.PackUnsignedSaturate(
        Sse2.LoadVector128((int*)(&fixture._fld1)),
        Sse2.LoadVector128((int*)(&fixture._fld2)));

    Unsafe.Write(_dataTable.outArrayPtr, packed);
    ValidateResult(fixture._fld1, fixture._fld2, _dataTable.outArrayPtr);
}
/// <summary>
/// Pins this instance's two vector fields, loads them with <c>Sse2.LoadVector128</c>, packs them,
/// then writes to and validates through the supplied test class.
/// </summary>
/// <param name="testClass">Test instance that provides the output buffer and the validation routine.</param>
public void RunStructFldScenario_Load(HorizontalBinaryOpTest__PackUnsignedSaturateUInt16 testClass)
{
    // Both fields share a pointer type, so one fixed statement pins them together.
    fixed (Vector128<int>* pFirst = &_fld1, pSecond = &_fld2)
    {
        Vector128<ushort> packed = Sse41.PackUnsignedSaturate(
            Sse2.LoadVector128((int*)pFirst),
            Sse2.LoadVector128((int*)pSecond));

        Unsafe.Write(testClass._dataTable.outArrayPtr, packed);
        testClass.ValidateResult(_fld1, _fld2, testClass._dataTable.outArrayPtr);
    }
}
/// <summary>
/// Pins this instance's two vector fields, loads them with <c>Sse2.LoadVector128</c>, packs them,
/// stores the result in the output buffer and validates it against the fields.
/// </summary>
public void RunClassFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load));

    // Both fields share a pointer type, so one fixed statement pins them together.
    fixed (Vector128<int>* pFirst = &_fld1, pSecond = &_fld2)
    {
        Vector128<ushort> packed = Sse41.PackUnsignedSaturate(
            Sse2.LoadVector128((int*)pFirst),
            Sse2.LoadVector128((int*)pSecond));

        Unsafe.Write(_dataTable.outArrayPtr, packed);
        ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Creates a test-class instance in a local, pins its two vector fields, loads them with
/// <c>Sse2.LoadVector128</c>, packs them, stores the result in the output buffer and validates it.
/// </summary>
public void RunClassLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

    HorizontalBinaryOpTest__PackUnsignedSaturateUInt16 instance = new HorizontalBinaryOpTest__PackUnsignedSaturateUInt16();

    // Both fields share a pointer type, so one fixed statement pins them together.
    fixed (Vector128<int>* pFirst = &instance._fld1, pSecond = &instance._fld2)
    {
        Vector128<ushort> packed = Sse41.PackUnsignedSaturate(
            Sse2.LoadVector128((int*)pFirst),
            Sse2.LoadVector128((int*)pSecond));

        Unsafe.Write(_dataTable.outArrayPtr, packed);
        ValidateResult(instance._fld1, instance._fld2, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Expands a packed run of 3-bit palette indices into bytes by looking each index up in
/// <paramref name="rPal"/> and writing the selected palette byte to <paramref name="output"/>.
/// </summary>
/// <param name="output">Destination; the vector path writes one 16-byte vector at its start.</param>
/// <param name="rPal">Palette; the vector path reads its first 8 bytes, the scalar path indexes entries 0..7.</param>
/// <param name="rI">Packed indices, 3 bits each, lowest index in the least significant bits.</param>
private unsafe static void BCnDecodeTileAlpha(Span <byte> output, Span <byte> rPal, ulong rI)
{
    if (!Avx2.IsSupported)
    {
        // Scalar fallback: consume 3 bits of rI per output byte.
        int texelCount = BlockWidth * BlockHeight;

        for (int texel = 0; texel < texelCount; texel++)
        {
            output[texel] = rPal[(int)(rI & 7)];
            rI >>= 3;
        }

        return;
    }

    Span<Vector128<byte>> outputVectors = MemoryMarshal.Cast<byte, Vector128<byte>>(output);

    Vector128<uint> bitOffsets = Vector128.Create(0u, 3u, 6u, 9u);
    Vector128<uint> indexMask = Vector128.Create(7u);

    Vector128<byte> palette;

    fixed (byte* palettePtr = rPal)
    {
        palette = Sse2.LoadScalarVector128((ulong*)palettePtr).AsByte();
    }

    // The low 24 bits hold indices 0..7 and the next 24 bits hold indices 8..15.
    Vector128<uint> lowHalf = Vector128.Create((uint)rI);
    Vector128<uint> highHalf = Vector128.Create((uint)(rI >> 24));

    // Per-lane variable shifts line up indices 0..3 (and 8..11) at bit 0 of each lane.
    Vector128<uint> lowFirst = Avx2.ShiftRightLogicalVariable(lowHalf, bitOffsets);
    Vector128<uint> highFirst = Avx2.ShiftRightLogicalVariable(highHalf, bitOffsets);

    // A further 12-bit shift lines up indices 4..7 (and 12..15).
    Vector128<uint> lowSecond = Sse2.ShiftRightLogical(lowFirst, 12);
    Vector128<uint> highSecond = Sse2.ShiftRightLogical(highFirst, 12);

    lowFirst = Sse2.And(lowFirst, indexMask);
    highFirst = Sse2.And(highFirst, indexMask);
    lowSecond = Sse2.And(lowSecond, indexMask);
    highSecond = Sse2.And(highSecond, indexMask);

    // Narrow 32 -> 16 -> 8 bits so all sixteen indices end up in one byte vector, in order.
    Vector128<ushort> lowWords = Sse41.PackUnsignedSaturate(lowFirst.AsInt32(), lowSecond.AsInt32());
    Vector128<ushort> highWords = Sse41.PackUnsignedSaturate(highFirst.AsInt32(), highSecond.AsInt32());

    Vector128<byte> selectors = Sse2.PackUnsignedSaturate(lowWords.AsInt16(), highWords.AsInt16());

    // The byte shuffle performs all sixteen palette lookups at once.
    outputVectors[0] = Ssse3.Shuffle(palette, selectors);
}
/// <summary>
/// Broadcasts <paramref name="i"/> to every 32-bit lane and narrows the lanes to unsigned
/// 16-bit values with saturation.
/// </summary>
/// <param name="i">Value to broadcast and narrow.</param>
/// <returns>A vector whose eight 16-bit lanes all hold the saturated value.</returns>
static Vector128<ushort> ToUshortScalar(int i)
{
    Vector128<int> broadcast = Vector128.Create(i);

    // The same vector feeds both halves of the pack, so every output lane is identical.
    return Sse41.PackUnsignedSaturate(broadcast, broadcast);
}
/// <summary>
/// Narrows four 32-bit integers to bytes with unsigned saturation in two pack steps
/// (32 to 16 bits, then 16 to 8 bits), using <paramref name="zero"/> as the second operand of both.
/// </summary>
/// <param name="value">The 32-bit lanes to narrow.</param>
/// <param name="zero">Vector supplied as the upper half of each pack.</param>
/// <returns>A byte vector whose first four lanes come from <paramref name="value"/>.</returns>
private static Vector128<byte> PackUnsignedSaturate(Vector128<int> value, Vector128<int> zero)
{
    Vector128<short> words = Sse41.PackUnsignedSaturate(value, zero).AsInt16();
    Vector128<short> filler = zero.AsInt16();

    return Sse2.PackUnsignedSaturate(words, filler);
}
/// <summary>
/// Wrapper around <c>Sse41.PackUnsignedSaturate</c> under the name <c>_mm_packus_epi32</c>.
/// </summary>
/// <param name="left">Source of the lower four output lanes.</param>
/// <param name="right">Source of the upper four output lanes.</param>
/// <returns>Eight unsigned 16-bit lanes, each saturated from the matching 32-bit input lane.</returns>
public static Vector128<ushort> _mm_packus_epi32(Vector128<int> left, Vector128<int> right)
    => Sse41.PackUnsignedSaturate(left, right);
/// <summary>
/// Writes <paramref name="input"/> out as two planes: a luma plane built from each pixel's R component,
/// and an interleaved two-byte chroma plane built from the G and B components of every second pixel
/// on every second row. Each plane is staged in a pooled buffer and passed to <c>WriteBuffer</c>.
/// </summary>
/// <param name="rm">Supplies the buffer pool and is forwarded to <c>WriteBuffer</c>.</param>
/// <param name="input">Source surface.</param>
/// <param name="config">Output configuration (plane sizes stored minus one, block kind and block height).</param>
/// <param name="offsets">Destination offsets of the luma and chroma planes.</param>
private unsafe static void WriteNv12(ResourceManager rm, Surface input, ref OutputSurfaceConfig config, ref PlaneOffsets offsets)
{
    int gobBlocksInY = 1 << config.OutBlkHeight;

    bool outLinear = config.OutBlkKind == 0;

    int width = Math.Min(config.OutLumaWidth + 1, input.Width);
    int height = Math.Min(config.OutLumaHeight + 1, input.Height);
    int yStride = GetPitch(config.OutLumaWidth + 1, 1);

    int dstYIndex = rm.BufferPool.Rent((config.OutLumaHeight + 1) * yStride, out Span<byte> dstY);

    if (Sse41.IsSupported)
    {
        // Keeps only the lowest 16 bits of every 64-bit lane.
        Vector128<ushort> mask = Vector128.Create(0xffffUL).AsUInt16();

        int widthTrunc = width & ~0xf;
        int strideGap = yStride - width;

        fixed (Pixel* srcPtr = input.Data)
        {
            Pixel* ip = srcPtr;

            fixed (byte* dstPtr = dstY)
            {
                byte* op = dstPtr;

                for (int y = 0; y < height; y++, ip += input.Width)
                {
                    int x = 0;

                    // 16 pixels per iteration, read as eight 128-bit loads.
                    for (; x < widthTrunc; x += 16)
                    {
                        byte* baseOffset = (byte*)(ip + (ulong)(uint)x);

                        Vector128<ushort> pixelp1 = Sse2.LoadVector128((ushort*)baseOffset);
                        Vector128<ushort> pixelp2 = Sse2.LoadVector128((ushort*)(baseOffset + 0x10));
                        Vector128<ushort> pixelp3 = Sse2.LoadVector128((ushort*)(baseOffset + 0x20));
                        Vector128<ushort> pixelp4 = Sse2.LoadVector128((ushort*)(baseOffset + 0x30));
                        Vector128<ushort> pixelp5 = Sse2.LoadVector128((ushort*)(baseOffset + 0x40));
                        Vector128<ushort> pixelp6 = Sse2.LoadVector128((ushort*)(baseOffset + 0x50));
                        Vector128<ushort> pixelp7 = Sse2.LoadVector128((ushort*)(baseOffset + 0x60));
                        Vector128<ushort> pixelp8 = Sse2.LoadVector128((ushort*)(baseOffset + 0x70));

                        pixelp1 = Sse2.And(pixelp1, mask);
                        pixelp2 = Sse2.And(pixelp2, mask);
                        pixelp3 = Sse2.And(pixelp3, mask);
                        pixelp4 = Sse2.And(pixelp4, mask);
                        pixelp5 = Sse2.And(pixelp5, mask);
                        pixelp6 = Sse2.And(pixelp6, mask);
                        pixelp7 = Sse2.And(pixelp7, mask);
                        pixelp8 = Sse2.And(pixelp8, mask);

                        // Two rounds of 32 -> 16 bit packing gather the 16 masked values
                        // into two vectors of eight 16-bit lanes.
                        Vector128<ushort> pixelq1 = Sse41.PackUnsignedSaturate(pixelp1.AsInt32(), pixelp2.AsInt32());
                        Vector128<ushort> pixelq2 = Sse41.PackUnsignedSaturate(pixelp3.AsInt32(), pixelp4.AsInt32());
                        Vector128<ushort> pixelq3 = Sse41.PackUnsignedSaturate(pixelp5.AsInt32(), pixelp6.AsInt32());
                        Vector128<ushort> pixelq4 = Sse41.PackUnsignedSaturate(pixelp7.AsInt32(), pixelp8.AsInt32());

                        pixelq1 = Sse41.PackUnsignedSaturate(pixelq1.AsInt32(), pixelq2.AsInt32());
                        pixelq2 = Sse41.PackUnsignedSaturate(pixelq3.AsInt32(), pixelq4.AsInt32());

                        pixelq1 = Sse2.ShiftRightLogical(pixelq1, 2);
                        pixelq2 = Sse2.ShiftRightLogical(pixelq2, 2);

                        Vector128<byte> pixel = Sse2.PackUnsignedSaturate(pixelq1.AsInt16(), pixelq2.AsInt16());

                        Sse2.Store(op, pixel);

                        op += 0x10;
                    }

                    // Remaining pixels of the row, one at a time.
                    for (; x < width; x++)
                    {
                        Pixel* px = ip + (uint)x;

                        *op++ = Downsample(px->R);
                    }

                    op += strideGap;
                }
            }
        }
    }
    else
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                dstY[y * yStride + x] = Downsample(input.GetR(x, y));
            }
        }
    }

    WriteBuffer(
        rm,
        dstY,
        offsets.LumaOffset,
        outLinear,
        config.OutLumaWidth + 1,
        config.OutLumaHeight + 1,
        1,
        gobBlocksInY);

    rm.BufferPool.Return(dstYIndex);

    int uvWidth = Math.Min(config.OutChromaWidth + 1, (width + 1) >> 1);
    int uvHeight = Math.Min(config.OutChromaHeight + 1, (height + 1) >> 1);
    int uvStride = GetPitch(config.OutChromaWidth + 1, 2);

    int dstUvIndex = rm.BufferPool.Rent((config.OutChromaHeight + 1) * uvStride, out Span<byte> dstUv);

    if (Sse2.IsSupported)
    {
        int widthTrunc = uvWidth & ~7;
        int strideGap = uvStride - uvWidth * 2;

        fixed (Pixel* srcPtr = input.Data)
        {
            Pixel* ip = srcPtr;

            fixed (byte* dstPtr = dstUv)
            {
                byte* op = dstPtr;

                // Every second source row contributes one chroma row.
                for (int y = 0; y < uvHeight; y++, ip += input.Width * 2)
                {
                    int x = 0;

                    // 8 chroma samples per iteration; each sample reads 4 bytes at byte offset 2
                    // of every second pixel.
                    for (; x < widthTrunc; x += 8)
                    {
                        byte* baseOffset = (byte*)ip + (ulong)(uint)x * 16;

                        Vector128<uint> pixel1 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x02));
                        Vector128<uint> pixel2 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x12));
                        Vector128<uint> pixel3 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x22));
                        Vector128<uint> pixel4 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x32));
                        Vector128<uint> pixel5 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x42));
                        Vector128<uint> pixel6 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x52));
                        Vector128<uint> pixel7 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x62));
                        Vector128<uint> pixel8 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x72));

                        Vector128<uint> pixel12 = Sse2.UnpackLow(pixel1, pixel2);
                        Vector128<uint> pixel34 = Sse2.UnpackLow(pixel3, pixel4);
                        Vector128<uint> pixel56 = Sse2.UnpackLow(pixel5, pixel6);
                        Vector128<uint> pixel78 = Sse2.UnpackLow(pixel7, pixel8);

                        Vector128<ulong> pixel1234 = Sse2.UnpackLow(pixel12.AsUInt64(), pixel34.AsUInt64());
                        Vector128<ulong> pixel5678 = Sse2.UnpackLow(pixel56.AsUInt64(), pixel78.AsUInt64());

                        // Shift each 16-bit component on its own. A 64-bit lane shift would carry the
                        // low two bits of a component into the top bits of the component below it,
                        // and the saturating pack would then clamp that lane to 0 or 255.
                        Vector128<ushort> chroma1234 = Sse2.ShiftRightLogical(pixel1234.AsUInt16(), 2);
                        Vector128<ushort> chroma5678 = Sse2.ShiftRightLogical(pixel5678.AsUInt16(), 2);

                        Vector128<byte> pixel = Sse2.PackUnsignedSaturate(chroma1234.AsInt16(), chroma5678.AsInt16());

                        Sse2.Store(op, pixel);

                        op += 0x10;
                    }

                    // Remaining chroma samples of the row, one G/B pair at a time.
                    for (; x < uvWidth; x++)
                    {
                        Pixel* px = ip + (uint)(x << 1);

                        *op++ = Downsample(px->G);
                        *op++ = Downsample(px->B);
                    }

                    op += strideGap;
                }
            }
        }
    }
    else
    {
        for (int y = 0; y < uvHeight; y++)
        {
            for (int x = 0; x < uvWidth; x++)
            {
                int xx = x << 1;
                int yy = y << 1;

                int uvOffs = y * uvStride + xx;

                dstUv[uvOffs + 0] = Downsample(input.GetG(xx, yy));
                dstUv[uvOffs + 1] = Downsample(input.GetB(xx, yy));
            }
        }
    }

    WriteBuffer(
        rm,
        dstUv,
        offsets.ChromaUOffset,
        outLinear,
        config.OutChromaWidth + 1,
        config.OutChromaHeight + 1,
        2,
        gobBlocksInY);

    rm.BufferPool.Return(dstUvIndex);
}