/// <summary>
        /// Basic scenario using an aligned load: reads a vector from the input table with
        /// <c>Sse2.LoadAlignedVector128</c>, widens it with <c>Sse41.ConvertToVector128Int16</c>,
        /// writes the result to the output table and validates input against output.
        /// </summary>
        public void RunBasicScenario_LoadAligned()
        {
            SByte* source = (SByte*)(_dataTable.inArrayPtr);
            var operand = Sse2.LoadAlignedVector128(source);
            var widened = Sse41.ConvertToVector128Int16(operand);

            Unsafe.Write(_dataTable.outArrayPtr, widened);
            ValidateResult(_dataTable.inArrayPtr, _dataTable.outArrayPtr);
        }
        /// <summary>
        /// Class-variable scenario: widens the vector held in <c>_clsVar</c> with
        /// <c>Sse41.ConvertToVector128Int16</c>, writes the result to the output table
        /// and validates it against <c>_clsVar</c>.
        /// </summary>
        public void RunClsVarScenario()
        {
            var widened = Sse41.ConvertToVector128Int16(_clsVar);
            Unsafe.Write(_dataTable.outArrayPtr, widened);

            ValidateResult(_clsVar, _dataTable.outArrayPtr);
        }
        /// <summary>
        /// Basic scenario using <c>Unsafe.Read</c>: reads a <c>Vector128&lt;SByte&gt;</c> from the
        /// input table, widens it with <c>Sse41.ConvertToVector128Int16</c>, writes the result
        /// to the output table and validates input against output.
        /// </summary>
        public void RunBasicScenario_UnsafeRead()
        {
            Vector128<SByte> operand = Unsafe.Read<Vector128<SByte>>(_dataTable.inArrayPtr);
            var widened = Sse41.ConvertToVector128Int16(operand);

            Unsafe.Write(_dataTable.outArrayPtr, widened);
            ValidateResult(_dataTable.inArrayPtr, _dataTable.outArrayPtr);
        }
        /// <summary>
        /// Basic scenario using the pointer overload: passes the input table address, viewed as
        /// <c>Byte*</c>, directly to <c>Sse41.ConvertToVector128Int16</c>, writes the result to
        /// the output table and validates input against output.
        /// </summary>
        public void RunBasicScenario_Ptr()
        {
            Byte* source = (Byte*)_dataTable.inArrayPtr;
            var widened = Sse41.ConvertToVector128Int16(source);

            Unsafe.Write(_dataTable.outArrayPtr, widened);
            ValidateResult(_dataTable.inArrayPtr, _dataTable.outArrayPtr);
        }
Exemplo n.º 5
0
        /// <summary>
        /// Exploratory routine that runs a series of widening-conversion and multiply
        /// intrinsics over the leading bytes of <paramref name="vs"/>. Nothing is returned
        /// or stored; the intermediate vectors only exist as locals.
        /// </summary>
        /// <param name="vs">Input bytes; must contain at least 64 elements.</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="vs"/> is null.</exception>
        /// <exception cref="System.ArgumentException"><paramref name="vs"/> has fewer than 64 elements.</exception>
        private unsafe void TestAddSum(byte[] vs)
        {
            if (vs is null)
            {
                throw new System.ArgumentNullException(nameof(vs));
            }

            // The two 256-bit loads below read bytes [0, 64) through a raw pointer, so a
            // shorter array would be read out of bounds (and an empty one pins to null).
            int requiredLength = 2 * Vector256<byte>.Count;
            if (vs.Length < requiredLength)
            {
                throw new System.ArgumentException(
                    "Input must contain at least 64 bytes.", nameof(vs));
            }

            fixed (byte* p = vs)
            {
                var v  = Avx.LoadVector256(p);
                var v2 = Avx.LoadVector256(p + 32);
                //Avx.MultipleSumAbsoluteDifferences;

                // byte -> int -> float, then square.
                Vector256<int>   i1 = Avx2.ConvertToVector256Int32(p);
                Vector256<float> f1 = Avx.ConvertToVector256Single(i1);
                Vector256<float> m1 = Avx.Multiply(f1, f1);

                // byte -> int -> double, then fused multiply-add with a zero addend (x * x + 0).
                Vector128<int>    i128 = Sse41.ConvertToVector128Int32(p);
                Vector256<double> d256 = Avx.ConvertToVector256Double(i128);
                var dZero = Vector256<double>.Zero;
                Vector256<double> ma1 = Fma.MultiplyAdd(d256, d256, dZero);

                // Same as above in single precision.
                var i256  = Avx2.ConvertToVector256Int32(p);
                var f256  = Avx.ConvertToVector256Single(i256);
                var fZero = Vector256<float>.Zero;
                var ma2   = Fma.MultiplyAdd(f256, f256, fZero);

                // Scalar multiply: only the lowest float lane is multiplied.
                Vector128<float> s128 = Sse2.ConvertToVector128Single(i128);
                Vector128<float> ms   = Sse.MultiplyScalar(s128, s128);

                // x86 / x64 SIMD instruction list (SSE through AVX2)
                // https://www.officedaytime.com/tips/simd.html
                // pmaddwd
                // https://www.officedaytime.com/tips/simdimg/si.php?f=pmaddwd

                // byte -> short, then multiply and add adjacent pairs (pmaddwd).
                Vector128<short> sh128 = Sse41.ConvertToVector128Int16(p);
                Vector128<int>   vv3   = Avx.MultiplyAddAdjacent(sh128, sh128);

                // NOTE(review): unused local, presumably a breakpoint anchor for
                // inspecting the vectors above in a debugger - confirm before removing.
                var neko = 0;
                //Avx.MultiplyAddAdjacent;
                //Avx.MultiplyHigh;
                //Avx.MultiplyHighRoundScale;
                //Avx.MultiplyLow;
                //Avx.MultiplyScalar;
                //Fma.MultiplyAdd;
                //Fma.MultiplyAddNegated;
                //Fma.MultiplyAddNegatedScalar;
                //Fma.MultiplyAddScalar;
                //Fma.MultiplyAddSubtract;
                //Fma.MultiplySubtract;
                //Fma.MultiplySubtractAdd;
                //Fma.MultiplySubtractNegated;
                //Fma.MultiplySubtractNegatedScalar;
                //Fma.MultiplySubtractScalar;
            }
        }
Exemplo n.º 6
0
        /// <summary>
        /// Computes the sum of the squares of every byte in <paramref name="vs"/> in parallel.
        /// The array is split into chunks small enough that the per-chunk
        /// <c>Vector128&lt;int&gt;</c> accumulator cannot overflow; each chunk is reduced with
        /// <c>Sse2.MultiplyAddAdjacent</c> and the chunk totals are combined into a 64-bit sum.
        /// </summary>
        /// <param name="vs">Input bytes.</param>
        /// <returns>The sum of <c>vs[i] * vs[i]</c> over all elements; 0 for an empty array.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="vs"/> is null.</exception>
        private unsafe long Test26_Intrinsics_SSE2_MultiplyAddAdjacent_int_MT_Kai(byte[] vs)
        {
            if (vs is null)
            {
                throw new System.ArgumentNullException(nameof(vs));
            }

            // Partitioner.Create rejects an empty range, so answer the trivial case directly.
            if (vs.Length == 0)
            {
                return 0;
            }

            long total = 0;
            int simdLength = Vector128<short>.Count; // 8

            // Largest chunk the int accumulator can absorb without overflowing:
            // every int lane receives one quarter of the chunk's squares, each at most 255 * 255.
            // In integer arithmetic: 2147483647 / 65025 = 33025, times 4 lanes = 132100.
            int rangeSize =
                int.MaxValue / (byte.MaxValue * byte.MaxValue) * Vector128<int>.Count;

            Parallel.ForEach(Partitioner.Create(0, vs.Length, rangeSize),
                             (range) =>
            {
                long subtotal = 0;

                // End of the part of this chunk that is a whole number of SIMD steps.
                int lastIndex =
                    range.Item2 - (range.Item2 - range.Item1) % simdLength;
                Vector128<int> vTotal = Vector128<int>.Zero; // per-chunk accumulator
                fixed (byte* p = vs)
                {
                    for (int i = range.Item1; i < lastIndex; i += simdLength)
                    {
                        // Widen 8 bytes to 8 shorts, then square and add adjacent pairs into 4 ints.
                        Vector128<short> v = Sse41.ConvertToVector128Int16(p + i);
                        Vector128<int> vv = Sse2.MultiplyAddAdjacent(v, v);
                        vTotal = Sse2.Add(vTotal, vv);
                    }
                }

                // Horizontal reduction of the four int lanes into the 64-bit subtotal.
                int* pp = stackalloc int[Vector128<int>.Count];
                Sse2.Store(pp, vTotal);
                for (int i = 0; i < Vector128<int>.Count; i++)
                {
                    subtotal += pp[i];
                }

                // Scalar tail for the elements that did not fill a whole SIMD step.
                for (int i = lastIndex; i < range.Item2; i++)
                {
                    subtotal += vs[i] * vs[i];
                }

                System.Threading.Interlocked.Add(ref total, subtotal);
            });

            return total;
        }
Exemplo n.º 7
0
        /// <summary>
        /// Horizontal filter pass using SSE4.1: for every row, processes the output four
        /// pixels at a time by widening four overlapping source windows to 16-bit, combining
        /// them with a single filter loaded from <paramref name="xFilters"/>, then rounding,
        /// packing and storing four bytes to <paramref name="dst"/>.
        /// </summary>
        /// <param name="src">Source pixels; rewound by <c>SubpelTaps / 2 - 1</c> before filtering.</param>
        /// <param name="srcStride">Distance in bytes between source rows.</param>
        /// <param name="dst">Destination pixels.</param>
        /// <param name="dstStride">Distance in bytes between destination rows.</param>
        /// <param name="xFilters">Filter table; the entry is selected by <c>x0Q4 &amp; SubpelMask</c>.</param>
        /// <param name="x0Q4">Fixed-point start position; its bits above <c>SubpelBits</c> give the source column offset.</param>
        /// <param name="w">Output width; columns are written in groups of four.</param>
        /// <param name="h">Output height in rows.</param>
        private static unsafe void ConvolveHorizSse41(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            Array8<short>[] xFilters,
            int x0Q4,
            int w,
            int h)
        {
            Vector128<int> zero = Vector128<int>.Zero;
            Vector128<int> roundingBias = Vector128.Create(64);

            src -= SubpelTaps / 2 - 1;

            // Neither the column offset nor the filter index changes from row to row.
            ulong srcOffset = (uint)x0Q4 >> SubpelBits;
            uint filterIndex = (uint)(x0Q4 & SubpelMask);
            ulong rowCount = (uint)h;
            ulong columnCount = (uint)w;

            fixed (Array8<short>* xFilter = xFilters)
            {
                Vector128<short> taps = Sse2.LoadVector128((short*)xFilter + filterIndex * 8);

                for (ulong y = 0; y < rowCount; ++y)
                {
                    byte* rowStart = src + srcOffset;

                    for (ulong x = 0; x < columnCount; x += 4)
                    {
                        // Four 8-byte windows, each starting one byte after the previous one.
                        byte* window = rowStart + x;
                        Vector128<short> window0 = Sse41.ConvertToVector128Int16(window);
                        Vector128<short> window1 = Sse41.ConvertToVector128Int16(window + 1);
                        Vector128<short> window2 = Sse41.ConvertToVector128Int16(window + 2);
                        Vector128<short> window3 = Sse41.ConvertToVector128Int16(window + 3);

                        Vector128<int> sums = MultiplyAddAdjacent(window0, window1, window2, window3, taps, zero);
                        var packed = PackUnsignedSaturate(RoundShift(sums, roundingBias), zero);

                        // A scalar float store writes exactly the low 32 bits, i.e. four output bytes.
                        Sse.StoreScalar((float*)(dst + x), packed.AsSingle());
                    }

                    src += srcStride;
                    dst += dstStride;
                }
            }
        }
Exemplo n.º 8
0
        /// <summary>
        /// Multi-threaded sum of the squares of every byte in <paramref name="vs"/>. The array
        /// is split into one chunk per processor (capped so that the per-chunk
        /// <c>Vector128&lt;int&gt;</c> accumulator cannot overflow); each chunk is reduced with
        /// <c>Sse2.MultiplyAddAdjacent</c> and the chunk totals are combined into a 64-bit sum.
        /// </summary>
        /// <param name="vs">Input bytes.</param>
        /// <returns>The sum of <c>vs[i] * vs[i]</c> over all elements; 0 for an empty array.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="vs"/> is null.</exception>
        private unsafe long Test16_Intrinsics_SSE2_MultiplyAddAdjacent_int_MT(byte[] vs)
        {
            if (vs is null)
            {
                throw new System.ArgumentNullException(nameof(vs));
            }

            // Partitioner.Create rejects an empty range, so answer the trivial case directly.
            if (vs.Length == 0)
            {
                return 0;
            }

            long total = 0;
            int simdLength = Vector128<short>.Count; // 8

            // Largest chunk the int accumulator can absorb without overflowing:
            // every int lane receives one quarter of the chunk's squares, each at most 255 * 255.
            int maxSafeRange =
                int.MaxValue / (byte.MaxValue * byte.MaxValue) * Vector128<int>.Count;

            // One chunk per processor, but never 0 (arrays shorter than the processor count
            // would otherwise make Partitioner.Create throw) and never above the safe maximum.
            int rangeSize = Math.Max(1, Math.Min(vs.Length / Environment.ProcessorCount, maxSafeRange));

            Parallel.ForEach(Partitioner.Create(0, vs.Length, rangeSize),
                             (range) =>
            {
                long subtotal = 0;

                // End of the part of this chunk that is a whole number of SIMD steps.
                int lastIndex =
                    range.Item2 - (range.Item2 - range.Item1) % simdLength;
                Vector128<int> vTotal = Vector128<int>.Zero;
                fixed (byte* p = vs)
                {
                    for (int i = range.Item1; i < lastIndex; i += simdLength)
                    {
                        // Widen 8 bytes to 8 shorts, then square and add adjacent pairs into 4 ints.
                        Vector128<short> v = Sse41.ConvertToVector128Int16(p + i);
                        Vector128<int> vv = Sse2.MultiplyAddAdjacent(v, v);
                        vTotal = Sse2.Add(vTotal, vv);
                    }
                }

                // Horizontal reduction of the four int lanes into the 64-bit subtotal.
                int* pp = stackalloc int[Vector128<int>.Count];
                Sse2.Store(pp, vTotal);
                for (int i = 0; i < Vector128<int>.Count; i++)
                {
                    subtotal += pp[i];
                }

                // Scalar tail for the elements that did not fill a whole SIMD step.
                for (int i = lastIndex; i < range.Item2; i++)
                {
                    subtotal += vs[i] * vs[i];
                }

                System.Threading.Interlocked.Add(ref total, subtotal);
            });

            return total;
        }
Exemplo n.º 9
0
        /// <summary>
        /// Reads a two-plane surface (a full-resolution plane in <c>Buffer0</c> and an
        /// interleaved, half-resolution two-channel plane in <c>Buffer1</c>) and expands it into
        /// a new <see cref="Surface"/> of four 16-bit channels per pixel. The first plane feeds
        /// R, the second plane feeds G and B, and A is set to 0x3ff. Uses an SSE4.1 path for
        /// groups of 16 pixels when available, with a scalar path for the rest.
        /// </summary>
        /// <param name="rm">Resource manager providing the memory manager and the surface pool.</param>
        /// <param name="config">Surface configuration forwarded to <c>ReadSurface</c>.</param>
        /// <param name="offsets">Plane offsets forwarded to <c>ReadSurface</c>.</param>
        /// <returns>The newly allocated, filled output surface.</returns>
        private unsafe static Surface ReadNv12(ResourceManager rm, ref SlotSurfaceConfig config, ref PlaneOffsets offsets)
        {
            InputSurface input = ReadSurface(rm.Gmm, ref config, ref offsets, 1, 2);

            int width  = input.Width;
            int height = input.Height;

            int yStride  = GetPitch(width, 1);
            int uvStride = GetPitch(input.UvWidth, 2);

            Surface output = new Surface(rm.SurfacePool, width, height);

            if (Sse41.IsSupported)
            {
                // After unpacking, each pixel is the byte quad (Y, 0, U, V); this mask
                // reorders every quad to (Y, U, V, 0) so the fourth channel is free for alpha.
                Vector128<byte> shufMask = Vector128.Create(
                    (byte)0, (byte)2, (byte)3, (byte)1,
                    (byte)4, (byte)6, (byte)7, (byte)5,
                    (byte)8, (byte)10, (byte)11, (byte)9,
                    (byte)12, (byte)14, (byte)15, (byte)13);

                // Alpha value for the fourth 16-bit channel of both pixels in a vector. It is
                // applied after the left shift so the SIMD path writes the same 0x3ff alpha
                // as the scalar paths (OR-ing 0xff before the shift produced 0x3fc).
                Vector128<short> alphaMask = Vector128.Create(0x3ffUL << 48).AsInt16();

                int yStrideGap = yStride - width;

                int widthTrunc = width & ~0xf;

                fixed (Pixel* dstPtr = output.Data)
                {
                    Pixel* op = dstPtr;

                    fixed (byte* src0Ptr = input.Buffer0, src1Ptr = input.Buffer1)
                    {
                        byte* i0p = src0Ptr;

                        for (int y = 0; y < height; y++)
                        {
                            // Two consecutive rows share one row of the second plane.
                            byte* uvRow = src1Ptr + (y >> 1) * uvStride;
                            byte* i1p = uvRow;

                            int x = 0;

                            for (; x < widthTrunc; x += 16, i0p += 16, i1p += 16)
                            {
                                Vector128<short> ya0 = Sse41.ConvertToVector128Int16(i0p);
                                Vector128<short> ya1 = Sse41.ConvertToVector128Int16(i0p + 8);

                                Vector128<byte> uv = Sse2.LoadVector128(i1p);

                                // Duplicate every (U, V) pair so two horizontal pixels share it.
                                Vector128<short> uv0 = Sse2.UnpackLow(uv.AsInt16(), uv.AsInt16());
                                Vector128<short> uv1 = Sse2.UnpackHigh(uv.AsInt16(), uv.AsInt16());

                                Vector128<short> rgba0 = Sse2.UnpackLow(ya0, uv0);
                                Vector128<short> rgba1 = Sse2.UnpackHigh(ya0, uv0);
                                Vector128<short> rgba2 = Sse2.UnpackLow(ya1, uv1);
                                Vector128<short> rgba3 = Sse2.UnpackHigh(ya1, uv1);

                                rgba0 = Ssse3.Shuffle(rgba0.AsByte(), shufMask).AsInt16();
                                rgba1 = Ssse3.Shuffle(rgba1.AsByte(), shufMask).AsInt16();
                                rgba2 = Ssse3.Shuffle(rgba2.AsByte(), shufMask).AsInt16();
                                rgba3 = Ssse3.Shuffle(rgba3.AsByte(), shufMask).AsInt16();

                                // Widen each 8-bit channel to 16 bits, two pixels per vector.
                                Vector128<short> rgba16_0 = Sse41.ConvertToVector128Int16(rgba0.AsByte());
                                Vector128<short> rgba16_1 = Sse41.ConvertToVector128Int16(HighToLow(rgba0.AsByte()));
                                Vector128<short> rgba16_2 = Sse41.ConvertToVector128Int16(rgba1.AsByte());
                                Vector128<short> rgba16_3 = Sse41.ConvertToVector128Int16(HighToLow(rgba1.AsByte()));
                                Vector128<short> rgba16_4 = Sse41.ConvertToVector128Int16(rgba2.AsByte());
                                Vector128<short> rgba16_5 = Sse41.ConvertToVector128Int16(HighToLow(rgba2.AsByte()));
                                Vector128<short> rgba16_6 = Sse41.ConvertToVector128Int16(rgba3.AsByte());
                                Vector128<short> rgba16_7 = Sse41.ConvertToVector128Int16(HighToLow(rgba3.AsByte()));

                                // Scale the colour channels by 4, then set alpha to 0x3ff.
                                rgba16_0 = Sse2.Or(Sse2.ShiftLeftLogical(rgba16_0, 2), alphaMask);
                                rgba16_1 = Sse2.Or(Sse2.ShiftLeftLogical(rgba16_1, 2), alphaMask);
                                rgba16_2 = Sse2.Or(Sse2.ShiftLeftLogical(rgba16_2, 2), alphaMask);
                                rgba16_3 = Sse2.Or(Sse2.ShiftLeftLogical(rgba16_3, 2), alphaMask);
                                rgba16_4 = Sse2.Or(Sse2.ShiftLeftLogical(rgba16_4, 2), alphaMask);
                                rgba16_5 = Sse2.Or(Sse2.ShiftLeftLogical(rgba16_5, 2), alphaMask);
                                rgba16_6 = Sse2.Or(Sse2.ShiftLeftLogical(rgba16_6, 2), alphaMask);
                                rgba16_7 = Sse2.Or(Sse2.ShiftLeftLogical(rgba16_7, 2), alphaMask);

                                Sse2.Store((short*)(op + (uint)x + 0), rgba16_0);
                                Sse2.Store((short*)(op + (uint)x + 2), rgba16_1);
                                Sse2.Store((short*)(op + (uint)x + 4), rgba16_2);
                                Sse2.Store((short*)(op + (uint)x + 6), rgba16_3);
                                Sse2.Store((short*)(op + (uint)x + 8), rgba16_4);
                                Sse2.Store((short*)(op + (uint)x + 10), rgba16_5);
                                Sse2.Store((short*)(op + (uint)x + 12), rgba16_6);
                                Sse2.Store((short*)(op + (uint)x + 14), rgba16_7);
                            }

                            // Scalar tail for the pixels past the last full group of 16.
                            // The second-plane pair is addressed with (x & ~1), exactly like the
                            // non-SSE fallback below, so pixels 2k and 2k + 1 share one pair.
                            // (Advancing the pointer by (x & 1) * 2 after x++ moved it after
                            // every even pixel, pairing pixels 2k + 1 and 2k + 2 instead.)
                            for (; x < width; x++)
                            {
                                Pixel* px = op + (uint)x;
                                byte* uvp = uvRow + (x & ~1);

                                px->R = Upsample(*i0p++);
                                px->G = Upsample(*uvp);
                                px->B = Upsample(*(uvp + 1));
                                px->A = 0x3ff;
                            }

                            op  += width;
                            i0p += yStrideGap;
                        }
                    }
                }
            }
            else
            {
                for (int y = 0; y < height; y++)
                {
                    int uvBase = (y >> 1) * uvStride;

                    for (int x = 0; x < width; x++)
                    {
                        output.SetR(x, y, Upsample(input.Buffer0[y * yStride + x]));

                        int uvOffs = uvBase + (x & ~1);

                        output.SetG(x, y, Upsample(input.Buffer1[uvOffs]));
                        output.SetB(x, y, Upsample(input.Buffer1[uvOffs + 1]));
                        output.SetA(x, y, 0x3ff);
                    }
                }
            }

            return output;
        }
Exemplo n.º 10
0
 /// <summary>
 /// Widens the bytes at <paramref name="address"/> to a vector of 16-bit integers
 /// via <c>Sse41.ConvertToVector128Int16</c>.
 /// </summary>
 /// <param name="address">Pointer to the source bytes.</param>
 /// <returns>The widened vector.</returns>
 public static unsafe Vector128<short> xmm(byte* address)
     => Sse41.ConvertToVector128Int16(address);
Exemplo n.º 11
0
 /// <summary>
 /// Thin wrapper over <c>Sse41.ConvertToVector128Int16</c> for a byte vector:
 /// widens elements of <paramref name="value"/> to 16-bit integers.
 /// </summary>
 /// <param name="value">Source byte vector.</param>
 /// <returns>The widened vector.</returns>
 public static Vector128<short> _mm_cvtepu8_epi16(Vector128<byte> value)
     => Sse41.ConvertToVector128Int16(value);
Exemplo n.º 12
0
        /// <summary>
        /// Builds an 8-character string for <paramref name="value"/>: allocates the string,
        /// obtains 8 bytes from <c>IntToUtf8_8</c>, widens them to eight 16-bit values and
        /// writes them over the string's character storage in a single vector store.
        /// </summary>
        /// <param name="value">The integer to format.</param>
        /// <returns>The freshly allocated 8-character string.</returns>
        public unsafe string IntToString(int value)
        {
            var text = AsciiInterface.FastAllocateString(8);
            var packed = IntToUtf8_8(value);

            // Reinterpret the first character of the string as the start of a 128-bit slot.
            ref char firstChar = ref Unsafe.AsRef(in text.GetPinnableReference());
            ref Vector128<short> destination = ref Unsafe.As<char, Vector128<short>>(ref firstChar);

            // Each byte becomes one 16-bit char; all eight are written at once.
            destination = Sse41.ConvertToVector128Int16((byte*)&packed);

            return text;
        }