示例#1
0
        /// <summary>
        /// Reads both input vectors into locals with <c>Unsafe.Read</c>, packs them with
        /// <c>Sse41.PackUnsignedSaturate</c>, writes the packed vector to the output
        /// buffer and validates it against the two locals.
        /// </summary>
        public void RunLclVarScenario_UnsafeRead()
        {
            // Operands are deliberately held in locals before the intrinsic call.
            var firstOp  = Unsafe.Read<Vector128<Int32>>(_dataTable.inArray1Ptr);
            var secondOp = Unsafe.Read<Vector128<Int32>>(_dataTable.inArray2Ptr);
            var packed   = Sse41.PackUnsignedSaturate(firstOp, secondOp);

            Unsafe.Write(_dataTable.outArrayPtr, packed);
            ValidateResult(firstOp, secondOp, _dataTable.outArrayPtr);
        }
        /// <summary>
        /// Creates a fresh test-class instance, packs its <c>_fld1</c> and <c>_fld2</c>
        /// fields with <c>Sse41.PackUnsignedSaturate</c>, writes the result to this
        /// instance's output buffer and validates it.
        /// </summary>
        public void RunClassLclFldScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario));

            // Operands come from the fields of a locally created class instance.
            var instance = new HorizontalBinaryOpTest__PackUnsignedSaturateUInt16();
            var packed   = Sse41.PackUnsignedSaturate(instance._fld1, instance._fld2);

            Unsafe.Write(_dataTable.outArrayPtr, packed);
            ValidateResult(instance._fld1, instance._fld2, _dataTable.outArrayPtr);
        }
        /// <summary>
        /// Obtains a <c>TestStruct</c> via <c>TestStruct.Create()</c>, packs its
        /// <c>_fld1</c> and <c>_fld2</c> fields with <c>Sse41.PackUnsignedSaturate</c>,
        /// writes the result to the output buffer and validates it.
        /// </summary>
        public void RunStructLclFldScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario));

            // Operands come from the fields of a local struct value.
            var localStruct = TestStruct.Create();
            var packed      = Sse41.PackUnsignedSaturate(localStruct._fld1, localStruct._fld2);

            Unsafe.Write(_dataTable.outArrayPtr, packed);
            ValidateResult(localStruct._fld1, localStruct._fld2, _dataTable.outArrayPtr);
        }
示例#4
0
        /// <summary>
        /// Packs <c>_clsVar1</c> and <c>_clsVar2</c> with
        /// <c>Sse41.PackUnsignedSaturate</c>, writes the result to the output buffer
        /// and validates it against the same two variables.
        /// </summary>
        public void RunClsVarScenario()
        {
            // Both operands are passed straight from the class variables.
            var packed = Sse41.PackUnsignedSaturate(_clsVar1, _clsVar2);

            Unsafe.Write(_dataTable.outArrayPtr, packed);
            ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
        }
示例#5
0
        /// <summary>
        /// Loads both inputs with <c>Sse2.LoadAlignedVector128</c> directly as call
        /// arguments, packs them with <c>Sse41.PackUnsignedSaturate</c>, writes the
        /// result to the output buffer and validates it against the input buffers.
        /// </summary>
        public void RunBasicScenario_LoadAligned()
        {
            // The loads stay inline as arguments; no intermediate locals are introduced.
            var packed = Sse41.PackUnsignedSaturate(
                Sse2.LoadAlignedVector128((Int32*)(_dataTable.inArray1Ptr)),
                Sse2.LoadAlignedVector128((Int32*)(_dataTable.inArray2Ptr)));

            Unsafe.Write(_dataTable.outArrayPtr, packed);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
        }
示例#6
0
        /// <summary>
        /// Reads both inputs with <c>Unsafe.Read</c> directly as call arguments, packs
        /// them with <c>Sse41.PackUnsignedSaturate</c>, writes the result to the output
        /// buffer and validates it against the input buffers.
        /// </summary>
        public void RunBasicScenario_UnsafeRead()
        {
            // The reads stay inline as arguments; no intermediate locals are introduced.
            var packed = Sse41.PackUnsignedSaturate(
                Unsafe.Read<Vector128<Int32>>(_dataTable.inArray1Ptr),
                Unsafe.Read<Vector128<Int32>>(_dataTable.inArray2Ptr));

            Unsafe.Write(_dataTable.outArrayPtr, packed);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
        }
        /// <summary>
        /// Loads both inputs into locals with <c>Sse2.LoadAlignedVector128</c>, packs
        /// the locals with <c>Sse41.PackUnsignedSaturate</c>, writes the result to the
        /// output buffer and validates it against the locals.
        /// </summary>
        public void RunLclVarScenario_LoadAligned()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_LoadAligned));

            // Operands are held in locals before the intrinsic call.
            var lhs    = Sse2.LoadAlignedVector128((Int32*)(_dataTable.inArray1Ptr));
            var rhs    = Sse2.LoadAlignedVector128((Int32*)(_dataTable.inArray2Ptr));
            var packed = Sse41.PackUnsignedSaturate(lhs, rhs);

            Unsafe.Write(_dataTable.outArrayPtr, packed);
            ValidateResult(lhs, rhs, _dataTable.outArrayPtr);
        }
        /// <summary>
        /// Reads both inputs into locals with <c>Unsafe.Read</c>, packs the locals with
        /// <c>Sse41.PackUnsignedSaturate</c>, writes the result to the output buffer
        /// and validates it against the locals.
        /// </summary>
        public void RunLclVarScenario_UnsafeRead()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

            // Operands are held in locals before the intrinsic call.
            var left   = Unsafe.Read<Vector128<Int32>>(_dataTable.inArray1Ptr);
            var right  = Unsafe.Read<Vector128<Int32>>(_dataTable.inArray2Ptr);
            var packed = Sse41.PackUnsignedSaturate(left, right);

            Unsafe.Write(_dataTable.outArrayPtr, packed);
            ValidateResult(left, right, _dataTable.outArrayPtr);
        }
示例#9
0
        /// <summary>
        /// Combines the ASCII characters at <paramref name="chars"/> into a single
        /// unsigned integer using a chain of multiply-add and pack intrinsics.
        /// </summary>
        /// <param name="chars">Pointer to the characters; a full 16-byte vector is loaded from it.</param>
        /// <returns>The low 32-bit lane of the final multiply-add result.</returns>
        private static uint32_t parse_eight_digits_unrolled(bytechar *chars)
        {
            // A whole 16-byte vector is loaded and processed, so twice as many lanes
            // as needed are computed; only the first eight digits reach the result.
            Vector128<sbyte> zeroChar = Vector128.Create((bytechar)'0');
            Vector128<sbyte> rawChars = Sse2.LoadVector128(chars);

            // Turn ASCII codes into digit values.
            Vector128<sbyte> digits = Sse2.Subtract(rawChars, zeroChar);

            // NOTE(review): mul_1_10 / mul_1_100 / mul_1_10000 are defined elsewhere;
            // presumably they hold the positional weights for each merge step - confirm.
            Vector128<short>  pairs    = Ssse3.MultiplyAddAdjacent(digits.AsByte(), mul_1_10);
            Vector128<int>    quads    = Sse2.MultiplyAddAdjacent(pairs, mul_1_100);
            Vector128<ushort> narrowed = Sse41.PackUnsignedSaturate(quads, quads);
            Vector128<int>    octets   = Sse2.MultiplyAddAdjacent(narrowed.AsInt16(), mul_1_10000);

            // Only the lowest lane (the first eight digits) is returned; the rest is dropped.
            return Sse2.ConvertToUInt32(octets.AsUInt32());
        }
        /// <summary>
        /// Loads both inputs with <c>Sse2.LoadVector128</c> directly as call arguments,
        /// packs them with <c>Sse41.PackUnsignedSaturate</c>, writes the result to the
        /// output buffer and validates it against the input buffers.
        /// </summary>
        public void RunBasicScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_Load));

            // The loads stay inline as arguments; no intermediate locals are introduced.
            var packed = Sse41.PackUnsignedSaturate(
                Sse2.LoadVector128((Int32*)(_dataTable.inArray1Ptr)),
                Sse2.LoadVector128((Int32*)(_dataTable.inArray2Ptr)));

            Unsafe.Write(_dataTable.outArrayPtr, packed);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
        }
        /// <summary>
        /// Packs <c>_clsVar1</c> and <c>_clsVar2</c> with
        /// <c>Sse41.PackUnsignedSaturate</c>, writes the result to the output buffer
        /// and validates it against the same two variables.
        /// </summary>
        public void RunClsVarScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario));

            // Both operands are passed straight from the class variables.
            var packed = Sse41.PackUnsignedSaturate(_clsVar1, _clsVar2);

            Unsafe.Write(_dataTable.outArrayPtr, packed);
            ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
        }
        /// <summary>
        /// Obtains a <c>TestStruct</c> via <c>TestStruct.Create()</c>, loads its
        /// <c>_fld1</c> and <c>_fld2</c> fields by address with
        /// <c>Sse2.LoadVector128</c>, packs them with
        /// <c>Sse41.PackUnsignedSaturate</c>, writes the result to the output buffer
        /// and validates it.
        /// </summary>
        public void RunStructLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario_Load));

            var localStruct = TestStruct.Create();

            // The field addresses are taken directly from the local struct.
            var packed = Sse41.PackUnsignedSaturate(
                Sse2.LoadVector128((Int32*)(&localStruct._fld1)),
                Sse2.LoadVector128((Int32*)(&localStruct._fld2)));

            Unsafe.Write(_dataTable.outArrayPtr, packed);
            ValidateResult(localStruct._fld1, localStruct._fld2, _dataTable.outArrayPtr);
        }
            /// <summary>
            /// Pins this instance's <c>_fld1</c> and <c>_fld2</c>, loads them with
            /// <c>Sse2.LoadVector128</c>, packs them with
            /// <c>Sse41.PackUnsignedSaturate</c>, writes the result to the output buffer
            /// of <paramref name="testClass"/> and has it validate the result.
            /// </summary>
            /// <param name="testClass">Test instance that supplies the output buffer and the validation routine.</param>
            public void RunStructFldScenario_Load(HorizontalBinaryOpTest__PackUnsignedSaturateUInt16 testClass)
            {
                fixed (Vector128<Int32>* firstPtr = &_fld1)
                fixed (Vector128<Int32>* secondPtr = &_fld2)
                {
                    var packed = Sse41.PackUnsignedSaturate(
                        Sse2.LoadVector128((Int32*)(firstPtr)),
                        Sse2.LoadVector128((Int32*)(secondPtr)));

                    Unsafe.Write(testClass._dataTable.outArrayPtr, packed);
                    testClass.ValidateResult(_fld1, _fld2, testClass._dataTable.outArrayPtr);
                }
            }
        /// <summary>
        /// Pins this instance's <c>_fld1</c> and <c>_fld2</c>, loads them with
        /// <c>Sse2.LoadVector128</c>, packs them with
        /// <c>Sse41.PackUnsignedSaturate</c>, writes the result to the output buffer
        /// and validates it.
        /// </summary>
        public void RunClassFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load));

            fixed (Vector128<Int32>* firstPtr = &_fld1)
            fixed (Vector128<Int32>* secondPtr = &_fld2)
            {
                var packed = Sse41.PackUnsignedSaturate(
                    Sse2.LoadVector128((Int32*)(firstPtr)),
                    Sse2.LoadVector128((Int32*)(secondPtr)));

                Unsafe.Write(_dataTable.outArrayPtr, packed);
                ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr);
            }
        }
        /// <summary>
        /// Creates a fresh test-class instance, pins its <c>_fld1</c> and <c>_fld2</c>,
        /// loads them with <c>Sse2.LoadVector128</c>, packs them with
        /// <c>Sse41.PackUnsignedSaturate</c>, writes the result to this instance's
        /// output buffer and validates it.
        /// </summary>
        public void RunClassLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

            var instance = new HorizontalBinaryOpTest__PackUnsignedSaturateUInt16();

            fixed (Vector128<Int32>* firstPtr = &instance._fld1)
            fixed (Vector128<Int32>* secondPtr = &instance._fld2)
            {
                var packed = Sse41.PackUnsignedSaturate(
                    Sse2.LoadVector128((Int32*)(firstPtr)),
                    Sse2.LoadVector128((Int32*)(secondPtr)));

                Unsafe.Write(_dataTable.outArrayPtr, packed);
                ValidateResult(instance._fld1, instance._fld2, _dataTable.outArrayPtr);
            }
        }
示例#16
0
        /// <summary>
        /// Expands a stream of 3-bit palette indices into bytes by looking each index
        /// up in <paramref name="rPal"/>.
        /// </summary>
        /// <param name="output">Destination; the AVX2 path writes its first 16 bytes in one store.</param>
        /// <param name="rPal">Palette; the AVX2 path reads its first 8 bytes.</param>
        /// <param name="rI">Packed 3-bit indices, lowest bits first.</param>
        private unsafe static void BCnDecodeTileAlpha(Span<byte> output, Span<byte> rPal, ulong rI)
        {
            if (!Avx2.IsSupported)
            {
                // Scalar fallback: peel one 3-bit index per output byte.
                for (int texel = 0; texel < BlockWidth * BlockHeight; texel++, rI >>= 3)
                {
                    output[texel] = rPal[(int)(rI & 7)];
                }

                return;
            }

            // Load the eight palette bytes into the low half of a vector.
            Vector128<byte> palette;

            fixed (byte* palettePtr = rPal)
            {
                palette = Sse2.LoadScalarVector128((ulong*)palettePtr).AsByte();
            }

            Vector128<uint> laneShifts = Vector128.Create(0u, 3u, 6u, 9u);
            Vector128<uint> indexMask  = Vector128.Create(7u);

            // Broadcast bits 0..31 and bits 24..55 of the index stream; each covers
            // eight 3-bit indices.
            Vector128<uint> lowerBits = Vector128.Create((uint)rI);
            Vector128<uint> upperBits = Vector128.Create((uint)(rI >> 24));

            // Per-lane shifts align indices 0-3 (resp. 8-11); a further shift by 12
            // aligns indices 4-7 (resp. 12-15).
            Vector128<uint> idx0to3   = Avx2.ShiftRightLogicalVariable(lowerBits, laneShifts);
            Vector128<uint> idx8to11  = Avx2.ShiftRightLogicalVariable(upperBits, laneShifts);
            Vector128<uint> idx4to7   = Sse2.ShiftRightLogical(idx0to3, 12);
            Vector128<uint> idx12to15 = Sse2.ShiftRightLogical(idx8to11, 12);

            idx0to3   = Sse2.And(idx0to3, indexMask);
            idx8to11  = Sse2.And(idx8to11, indexMask);
            idx4to7   = Sse2.And(idx4to7, indexMask);
            idx12to15 = Sse2.And(idx12to15, indexMask);

            // Narrow 32-bit lanes to 16-bit and then to 8-bit, keeping index order.
            Vector128<ushort> lowerWords = Sse41.PackUnsignedSaturate(idx0to3.AsInt32(), idx4to7.AsInt32());
            Vector128<ushort> upperWords = Sse41.PackUnsignedSaturate(idx8to11.AsInt32(), idx12to15.AsInt32());

            Vector128<byte> selectors = Sse2.PackUnsignedSaturate(lowerWords.AsInt16(), upperWords.AsInt16());

            // One byte shuffle performs all sixteen palette lookups.
            Span<Vector128<byte>> outputVectors = MemoryMarshal.Cast<byte, Vector128<byte>>(output);

            outputVectors[0] = Ssse3.Shuffle(palette, selectors);
        }
            // Broadcasts i to every 32-bit lane and narrows to 16-bit lanes with unsigned
            // saturation, so every lane of the result holds i clamped to [0, 65535].
            static Vector128<ushort> ToUshortScalar(int i)
            {
                Vector128<int> broadcast = Vector128.Create(i);

                return Sse41.PackUnsignedSaturate(broadcast, broadcast);
            }
示例#18
0
 /// <summary>
 /// Narrows four 32-bit integers to bytes with unsigned saturation: each lane of
 /// <paramref name="value"/> is clamped to [0, 255] and stored in the low four
 /// bytes of the result.
 /// </summary>
 /// <param name="value">The 32-bit lanes to narrow.</param>
 /// <param name="zero">Filler for the unused upper lanes; callers pass an all-zero vector.</param>
 /// <returns>The four clamped bytes followed by the narrowed filler lanes.</returns>
 private static Vector128<byte> PackUnsignedSaturate(Vector128<int> value, Vector128<int> zero)
 {
     // The first narrowing must saturate as SIGNED 16-bit. An unsigned 32->16 pack
     // produces values up to 65535, and anything above 32767 would then be read as a
     // negative int16 by the 16->8 pack and clamp to 0 instead of 255.
     Vector128<short> words = Sse2.PackSignedSaturate(value, zero);

     return Sse2.PackUnsignedSaturate(words, zero.AsInt16());
 }
示例#19
0
 /// <summary>
 /// Forwards to <c>Sse41.PackUnsignedSaturate</c> under the C intrinsic's name:
 /// narrows the 32-bit lanes of <paramref name="left"/> then
 /// <paramref name="right"/> to 16-bit lanes with unsigned saturation.
 /// </summary>
 /// <param name="left">Source of the low four result lanes.</param>
 /// <param name="right">Source of the high four result lanes.</param>
 /// <returns>The eight saturated 16-bit lanes.</returns>
 public static Vector128<ushort> _mm_packus_epi32(Vector128<int> left, Vector128<int> right)
     => Sse41.PackUnsignedSaturate(left, right);
示例#20
0
        /// <summary>
        /// Writes <paramref name="input"/> as two planes: a luma plane taken from the
        /// R channel of every pixel, and an interleaved chroma plane taken from the
        /// G and B channels of every second pixel on every second row. Each sample is
        /// reduced to a byte before being stored.
        /// </summary>
        /// <param name="rm">Supplies the buffer pool used for the temporary planes; also passed on to <c>WriteBuffer</c>.</param>
        /// <param name="input">Source surface.</param>
        /// <param name="config">Output configuration; the stored width/height values are used with one added to them.</param>
        /// <param name="offsets">Destination offsets for the luma and chroma planes.</param>
        /// <remarks>
        /// The vectorized paths address pixels as 8 bytes each with R at byte 0,
        /// G at byte 2 and B at byte 4, and reduce samples with a right shift by 2.
        /// NOTE(review): this presumes <c>Downsample</c> is the same 2-bit shift - confirm.
        /// </remarks>
        private unsafe static void WriteNv12(ResourceManager rm, Surface input, ref OutputSurfaceConfig config, ref PlaneOffsets offsets)
        {
            int gobBlocksInY = 1 << config.OutBlkHeight;

            bool outLinear = config.OutBlkKind == 0;

            int width   = Math.Min(config.OutLumaWidth + 1, input.Width);
            int height  = Math.Min(config.OutLumaHeight + 1, input.Height);
            int yStride = GetPitch(config.OutLumaWidth + 1, 1);

            int dstYIndex = rm.BufferPool.Rent((config.OutLumaHeight + 1) * yStride, out Span<byte> dstY);

            if (Sse41.IsSupported)
            {
                // Keeps only the low 16 bits (the R channel) of every 64-bit pixel.
                Vector128<ushort> mask = Vector128.Create(0xffffUL).AsUInt16();

                int widthTrunc = width & ~0xf;
                int strideGap  = yStride - width;

                fixed (Pixel* srcPtr = input.Data)
                {
                    Pixel* ip = srcPtr;

                    fixed (byte* dstPtr = dstY)
                    {
                        byte* op = dstPtr;

                        for (int y = 0; y < height; y++, ip += input.Width)
                        {
                            int x = 0;

                            // 16 pixels (eight 16-byte loads) per iteration.
                            for (; x < widthTrunc; x += 16)
                            {
                                byte* baseOffset = (byte*)(ip + (ulong)(uint)x);

                                Vector128<ushort> pixelp1 = Sse2.LoadVector128((ushort*)baseOffset);
                                Vector128<ushort> pixelp2 = Sse2.LoadVector128((ushort*)(baseOffset + 0x10));
                                Vector128<ushort> pixelp3 = Sse2.LoadVector128((ushort*)(baseOffset + 0x20));
                                Vector128<ushort> pixelp4 = Sse2.LoadVector128((ushort*)(baseOffset + 0x30));
                                Vector128<ushort> pixelp5 = Sse2.LoadVector128((ushort*)(baseOffset + 0x40));
                                Vector128<ushort> pixelp6 = Sse2.LoadVector128((ushort*)(baseOffset + 0x50));
                                Vector128<ushort> pixelp7 = Sse2.LoadVector128((ushort*)(baseOffset + 0x60));
                                Vector128<ushort> pixelp8 = Sse2.LoadVector128((ushort*)(baseOffset + 0x70));

                                pixelp1 = Sse2.And(pixelp1, mask);
                                pixelp2 = Sse2.And(pixelp2, mask);
                                pixelp3 = Sse2.And(pixelp3, mask);
                                pixelp4 = Sse2.And(pixelp4, mask);
                                pixelp5 = Sse2.And(pixelp5, mask);
                                pixelp6 = Sse2.And(pixelp6, mask);
                                pixelp7 = Sse2.And(pixelp7, mask);
                                pixelp8 = Sse2.And(pixelp8, mask);

                                // Two rounds of 32->16 packing squeeze out the zeroed lanes,
                                // leaving eight consecutive R samples per vector.
                                Vector128<ushort> pixelq1 = Sse41.PackUnsignedSaturate(pixelp1.AsInt32(), pixelp2.AsInt32());
                                Vector128<ushort> pixelq2 = Sse41.PackUnsignedSaturate(pixelp3.AsInt32(), pixelp4.AsInt32());
                                Vector128<ushort> pixelq3 = Sse41.PackUnsignedSaturate(pixelp5.AsInt32(), pixelp6.AsInt32());
                                Vector128<ushort> pixelq4 = Sse41.PackUnsignedSaturate(pixelp7.AsInt32(), pixelp8.AsInt32());

                                pixelq1 = Sse41.PackUnsignedSaturate(pixelq1.AsInt32(), pixelq2.AsInt32());
                                pixelq2 = Sse41.PackUnsignedSaturate(pixelq3.AsInt32(), pixelq4.AsInt32());

                                pixelq1 = Sse2.ShiftRightLogical(pixelq1, 2);
                                pixelq2 = Sse2.ShiftRightLogical(pixelq2, 2);

                                Vector128<byte> pixel = Sse2.PackUnsignedSaturate(pixelq1.AsInt16(), pixelq2.AsInt16());

                                Sse2.Store(op, pixel);

                                op += 0x10;
                            }

                            // Remaining pixels of the row, one at a time.
                            for (; x < width; x++)
                            {
                                Pixel* px = ip + (uint)x;

                                *op++ = Downsample(px->R);
                            }

                            op += strideGap;
                        }
                    }
                }
            }
            else
            {
                for (int y = 0; y < height; y++)
                {
                    for (int x = 0; x < width; x++)
                    {
                        dstY[y * yStride + x] = Downsample(input.GetR(x, y));
                    }
                }
            }

            WriteBuffer(
                rm,
                dstY,
                offsets.LumaOffset,
                outLinear,
                config.OutLumaWidth + 1,
                config.OutLumaHeight + 1,
                1,
                gobBlocksInY);

            rm.BufferPool.Return(dstYIndex);

            int uvWidth  = Math.Min(config.OutChromaWidth + 1, (width + 1) >> 1);
            int uvHeight = Math.Min(config.OutChromaHeight + 1, (height + 1) >> 1);
            int uvStride = GetPitch(config.OutChromaWidth + 1, 2);

            int dstUvIndex = rm.BufferPool.Rent((config.OutChromaHeight + 1) * uvStride, out Span<byte> dstUv);

            if (Sse2.IsSupported)
            {
                int widthTrunc = uvWidth & ~7;
                int strideGap  = uvStride - uvWidth * 2;

                fixed (Pixel* srcPtr = input.Data)
                {
                    Pixel* ip = srcPtr;

                    fixed (byte* dstPtr = dstUv)
                    {
                        byte* op = dstPtr;

                        // Every second source row contributes one chroma row.
                        for (int y = 0; y < uvHeight; y++, ip += input.Width * 2)
                        {
                            int x = 0;

                            // Eight chroma samples per iteration; each sample is read from
                            // every second pixel, i.e. 16 bytes apart.
                            for (; x < widthTrunc; x += 8)
                            {
                                byte* baseOffset = (byte*)ip + (ulong)(uint)x * 16;

                                // A 4-byte load at offset 2 picks up the G and B channels.
                                Vector128<uint> pixel1 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x02));
                                Vector128<uint> pixel2 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x12));
                                Vector128<uint> pixel3 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x22));
                                Vector128<uint> pixel4 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x32));
                                Vector128<uint> pixel5 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x42));
                                Vector128<uint> pixel6 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x52));
                                Vector128<uint> pixel7 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x62));
                                Vector128<uint> pixel8 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x72));

                                Vector128<uint> pixel12 = Sse2.UnpackLow(pixel1, pixel2);
                                Vector128<uint> pixel34 = Sse2.UnpackLow(pixel3, pixel4);
                                Vector128<uint> pixel56 = Sse2.UnpackLow(pixel5, pixel6);
                                Vector128<uint> pixel78 = Sse2.UnpackLow(pixel7, pixel8);

                                // Reinterpret as 16-bit lanes BEFORE shifting. Shifting the
                                // 64-bit lanes would move the low two bits of each sample into
                                // bits 14-15 of the neighbouring sample, which the saturating
                                // pack below would then turn into 0 or 255. This matches the
                                // 16-bit-lane shift used for luma above.
                                Vector128<ushort> pixel1234 = Sse2.UnpackLow(pixel12.AsUInt64(), pixel34.AsUInt64()).AsUInt16();
                                Vector128<ushort> pixel5678 = Sse2.UnpackLow(pixel56.AsUInt64(), pixel78.AsUInt64()).AsUInt16();

                                pixel1234 = Sse2.ShiftRightLogical(pixel1234, 2);
                                pixel5678 = Sse2.ShiftRightLogical(pixel5678, 2);

                                Vector128<byte> pixel = Sse2.PackUnsignedSaturate(pixel1234.AsInt16(), pixel5678.AsInt16());

                                Sse2.Store(op, pixel);

                                op += 0x10;
                            }

                            // Remaining samples of the row, one at a time.
                            for (; x < uvWidth; x++)
                            {
                                Pixel* px = ip + (uint)(x << 1);

                                *op++ = Downsample(px->G);
                                *op++ = Downsample(px->B);
                            }

                            op += strideGap;
                        }
                    }
                }
            }
            else
            {
                for (int y = 0; y < uvHeight; y++)
                {
                    for (int x = 0; x < uvWidth; x++)
                    {
                        int xx = x << 1;
                        int yy = y << 1;

                        int uvOffs = y * uvStride + xx;

                        dstUv[uvOffs + 0] = Downsample(input.GetG(xx, yy));
                        dstUv[uvOffs + 1] = Downsample(input.GetB(xx, yy));
                    }
                }
            }

            WriteBuffer(
                rm,
                dstUv,
                offsets.ChromaUOffset,
                outLinear,
                config.OutChromaWidth + 1,
                config.OutChromaHeight + 1, 2,
                gobBlocksInY);

            rm.BufferPool.Return(dstUvIndex);
        }