Exemplo n.º 1
0
        // Multi-threaded sum of squares of a byte array.
        // Intrinsics FMA MultiplyAdd float
        //
        // The array is split into roughly one partition per processor. Each partition widens
        // 8 bytes at a time to float, accumulates f * f with FMA, and adds its subtotal to the
        // shared total with Interlocked.Add.
        //
        // vs      : input bytes.
        // returns : sum of vs[i] * vs[i]; 0 for an empty array.
        //
        // NOTE: the per-partition accumulator is a Vector256<float>, so a lane's running sum is
        // only exact while it stays within float's 24-bit significand; very large partitions
        // may therefore lose precision.
        private unsafe long Test13_Intrinsics_FMA_MultiplyAdd_float_MT(byte[] vs)
        {
            // Partitioner.Create rejects an empty range, so answer the empty case directly.
            if (vs.Length == 0)
            {
                return 0;
            }

            long total      = 0;
            int  simdLength = Vector256<int>.Count;
            // Size of one partition. The integer division yields 0 when the array has fewer
            // elements than there are processors, and Partitioner.Create throws for a
            // non-positive rangeSize, so clamp it to at least 1.
            int rangeSize = Math.Max(1, vs.Length / Environment.ProcessorCount);

            Parallel.ForEach(Partitioner.Create(0, vs.Length, rangeSize),
                             (range) =>
            {
                long subtotal = 0;
                // End of the part of this range that can be processed in whole vectors.
                int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;
                Vector256<float> ff = Vector256.Create(0f);
                fixed (byte* p = vs)
                {
                    for (int i = range.Item1; i < lastIndex; i += simdLength)
                    {
                        Vector256<int>   v = Avx2.ConvertToVector256Int32(p + i);
                        Vector256<float> f = Avx.ConvertToVector256Single(v);
                        ff = Fma.MultiplyAdd(f, f, ff);    // ff += f * f (float)
                    }
                }

                // Spill the lanes and add them up as integers.
                float* pp = stackalloc float[Vector256<float>.Count];
                Avx.Store(pp, ff);
                for (int i = 0; i < Vector256<float>.Count; i++)
                {
                    subtotal += (long)pp[i];
                }

                // Elements of this range that did not fill a whole vector.
                for (int i = lastIndex; i < range.Item2; i++)
                {
                    subtotal += vs[i] * vs[i];
                }
                System.Threading.Interlocked.Add(ref total, subtotal);
            });
            return total;
        }
Exemplo n.º 2
0
        // Converts the instance field _fld to a Vector256<float>, writes the converted vector
        // to the output buffer of _dataTable and hands both to ValidateResult.
        public void RunFldScenario()
        {
            Vector256<float> converted = Avx.ConvertToVector256Single(_fld);

            Unsafe.Write(_dataTable.outArrayPtr, converted);
            ValidateResult(_fld, _dataTable.outArrayPtr);
        }
Exemplo n.º 3
0
        // Computes, in parallel, result[i] = sqrt((x[i]-xx[i])^2 + (y[i]-yy[i])^2 + (z[i]-zz[i])^2)
        // for every index of x.
        //
        // x, y, z    : first set of byte coordinates.
        // xx, yy, zz : second set of byte coordinates, read at the same indices as x.
        // result     : receives one float per index of x.
        //
        // Each partition handles whole groups of 8 elements with AVX and finishes the remaining
        // elements with scalar code, so nothing is read or written beyond the partition.
        private unsafe void Test3_Vector256Float(byte[] x, byte[] y, byte[] z, byte[] xx, byte[] yy, byte[] zz, float[] result)
        {
            // Partitioner.Create rejects an empty range; there is nothing to compute anyway.
            if (x.Length == 0)
            {
                return;
            }

            Parallel.ForEach(Partitioner.Create(0, x.Length), range =>
            {
                int simdLength = Vector256<float>.Count;
                // End of the part of this range that can be processed in whole vectors.
                int lastIndex  = range.Item2 - (range.Item2 - range.Item1) % simdLength;

                fixed (byte* px = x, py = y, pz = z, pxx = xx, pyy = yy, pzz = zz)
                fixed (float* dp = result)
                {
                    // Stop at lastIndex: running up to range.Item2 would read 8 bytes and store
                    // 8 floats past the end of the arrays on the final, partial group.
                    for (int i = range.Item1; i < lastIndex; i += simdLength)
                    {
                        Vector256<float> vx = Avx.Subtract(
                            Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(px + i)),
                            Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(pxx + i)));
                        Vector256<float> vy = Avx.Subtract(
                            Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(py + i)),
                            Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(pyy + i)));
                        Vector256<float> vz = Avx.Subtract(
                            Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(pz + i)),
                            Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(pzz + i)));

                        Vector256<float> vm = Avx.Add(Avx.Multiply(vx, vx), Avx.Multiply(vy, vy));
                        vm = Avx.Sqrt(Avx.Add(vm, Avx.Multiply(vz, vz)));
                        Avx.Store(dp + i, vm);
                    }
                }

                // Scalar tail for the elements that did not fill a whole vector.
                for (int i = lastIndex; i < range.Item2; i++)
                {
                    float dx = x[i] - xx[i];
                    float dy = y[i] - yy[i];
                    float dz = z[i] - zz[i];
                    result[i] = System.MathF.Sqrt(dx * dx + dy * dy + dz * dz);
                }
            });
        }
Exemplo n.º 4
0
        // Sum of squares of a byte array using a single Vector256<float> FMA accumulator.
        //
        // The result is exact only up to 2064 elements when accumulating bytes in
        // Vector256<float>: the largest integer float represents without error is
        // 16777215 (24 bits) and a byte is at most 255, so 16777215 / 255 / 255 = 258.01176,
        // i.e. 258 additions per lane; times the vector count of 8 gives 258 * 8 = 2064.
        // Adding the at most 7 leftover elements that do not fill a vector gives 2064 + 7 = 2071.
        // FMA MultiplyAdd also works on Vector256<double>, which raises the limit but halves the
        // vector count and is therefore slower; splitting the array and staying in float looks
        // like the more efficient option.
        // Intrinsics FMA MultiplyAdd float
        private unsafe long Test3_Intrinsics_FMA_MultiplyAdd_float(byte[] vs)
        {
            int lanes     = Vector256<int>.Count;
            int vectorEnd = vs.Length - (vs.Length % lanes);
            Vector256<float> acc = Vector256.Create(0f);

            fixed (byte* src = vs)
            {
                int offset = 0;
                while (offset < vectorEnd)
                {
                    // Widen 8 bytes to int, convert to float, then acc += value * value.
                    Vector256<float> widened =
                        Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(src + offset));
                    acc = Fma.MultiplyAdd(widened, widened, acc);
                    offset += lanes;
                }
            }

            // Spill the accumulator and add its lanes as integers.
            float* laneSums = stackalloc float[Vector256<float>.Count];
            Avx.Store(laneSums, acc);

            long sum = 0;
            for (int lane = 0; lane < Vector256<float>.Count; lane++)
            {
                sum += (long)laneSums[lane];
            }

            // Leftover elements that did not fill a whole vector.
            for (int rest = vectorEnd; rest < vs.Length; rest++)
            {
                sum += vs[rest] * vs[rest];
            }
            return sum;
        }
Exemplo n.º 5
0
        // Variant that keeps the vector variables declared outside the loop.
        //
        // For each whole group of 8 elements, forms three channel differences from red, green
        // and blue, takes the square root of the sum of their squares and copies the 8 results
        // into vv. The elements that do not fill a whole vector are passed on to Amari.
        private unsafe void Test46_Intrinsics_V256float_Sqrt(byte[] red, byte[] green, byte[] blue, float[] vv)
        {
            int    lanes     = Vector256<float>.Count;
            int    vectorEnd = red.Length - (red.Length % lanes);
            float *scratch   = stackalloc float[lanes];
            Vector256<float> magnitude = Vector256<float>.Zero;
            Vector256<float> chR;
            Vector256<float> chG;
            Vector256<float> chB;

            fixed (byte* pR = red, pG = green, pB = blue)
            {
                int offset = 0;
                while (offset < vectorEnd)
                {
                    chR = Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(pR + offset));
                    chG = Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(pG + offset));
                    chB = Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(pB + offset));

                    chR = Avx.Subtract(chG, chR);
                    chG = Avx.Subtract(chB, chG);
                    // NOTE(review): chR already holds (green - red) here, so this is
                    // (green - red) - blue rather than red - blue; confirm that is intended.
                    chB = Avx.Subtract(chR, chB);

                    magnitude = Avx.Sqrt(
                        Avx.Add(
                            Avx.Add(Avx.Multiply(chR, chR), Avx.Multiply(chG, chG)),
                            Avx.Multiply(chB, chB)));

                    // Spill to the stack buffer, then copy element-wise into the managed array.
                    Avx.Store(scratch, magnitude);
                    for (int lane = 0; lane < lanes; lane++)
                    {
                        vv[offset + lane] = scratch[lane];
                    }

                    offset += lanes;
                }
            }

            Amari(vectorEnd, red.Length, red, green, blue, vv);
        }
Exemplo n.º 6
0
        // Reads a Vector256<Int32> from the input buffer into a local with Unsafe.Read,
        // converts it to Vector256<float>, writes the result to the output buffer and
        // passes the local operand plus the output pointer to ValidateResult.
        public void RunLclVarScenario_UnsafeRead()
        {
            Vector256<Int32> source    = Unsafe.Read<Vector256<Int32>>(_dataTable.inArrayPtr);
            Vector256<float> converted = Avx.ConvertToVector256Single(source);

            Unsafe.Write(_dataTable.outArrayPtr, converted);
            ValidateResult(source, _dataTable.outArrayPtr);
        }
Exemplo n.º 7
0
        // Loads a Vector256<Int32> from the input buffer into a local with an aligned load,
        // converts it to Vector256<float>, writes the result to the output buffer and
        // passes the local operand plus the output pointer to ValidateResult.
        public void RunLclVarScenario_LoadAligned()
        {
            Vector256<Int32> source    = Avx.LoadAlignedVector256((Int32*)(_dataTable.inArrayPtr));
            Vector256<float> converted = Avx.ConvertToVector256Single(source);

            Unsafe.Write(_dataTable.outArrayPtr, converted);
            ValidateResult(source, _dataTable.outArrayPtr);
        }
Exemplo n.º 8
0
        // Creates a fresh test object, converts its _fld field to Vector256<float>, writes the
        // result to this instance's output buffer and validates it against that field.
        public void RunLclFldScenario()
        {
            var localTest = new SimpleUnaryOpTest__ConvertToVector256SingleInt32();
            Vector256<float> converted = Avx.ConvertToVector256Single(localTest._fld);

            Unsafe.Write(_dataTable.outArrayPtr, converted);
            ValidateResult(localTest._fld, _dataTable.outArrayPtr);
        }
Exemplo n.º 9
0
        // Converts the Vector256<Int32> read directly from the input buffer (Unsafe.Read is used
        // as the operand expression, with no intermediate local) to Vector256<float>, writes the
        // result to the output buffer and validates input pointer against output pointer.
        public void RunBasicScenario_UnsafeRead()
        {
            Vector256<float> converted =
                Avx.ConvertToVector256Single(Unsafe.Read<Vector256<Int32>>(_dataTable.inArrayPtr));

            Unsafe.Write(_dataTable.outArrayPtr, converted);
            ValidateResult(_dataTable.inArrayPtr, _dataTable.outArrayPtr);
        }
Exemplo n.º 10
0
        // Converts the Vector256<Int32> obtained by an aligned load from the input buffer (the
        // load is used as the operand expression, with no intermediate local) to
        // Vector256<float>, writes the result to the output buffer and validates input pointer
        // against output pointer.
        public void RunBasicScenario_LoadAligned()
        {
            Vector256<float> converted =
                Avx.ConvertToVector256Single(Avx.LoadAlignedVector256((Int32*)(_dataTable.inArrayPtr)));

            Unsafe.Write(_dataTable.outArrayPtr, converted);
            ValidateResult(_dataTable.inArrayPtr, _dataTable.outArrayPtr);
        }
Exemplo n.º 11
0
        // Scratch method that exercises several widen / convert / multiply(-add) intrinsics on
        // the start of a byte array. Nothing it computes is returned or stored; every local is
        // left unused.
        //
        // vs : input bytes.
        // NOTE(review): the two LoadVector256 calls read 32 bytes at p and 32 bytes at p + 32,
        // so vs is presumably expected to hold at least 64 bytes - there is no length check.
        private unsafe void TestAddSum(byte[] vs)
        {
            fixed(byte *p = vs)
            {
                // Two raw 32-byte loads: bytes 0..31 and bytes 32..63.
                var v  = Avx.LoadVector256(p);
                var v2 = Avx.LoadVector256(p + 32);
                //Avx.MultipleSumAbsoluteDifferences;
                // First 8 bytes -> 8 ints -> 8 floats, then squared with a plain multiply.
                Vector256 <int>   i1 = Avx2.ConvertToVector256Int32(p);
                Vector256 <float> f1 = Avx.ConvertToVector256Single(i1);
                Vector256 <float> m1 = Avx.Multiply(f1, f1);

                // First 4 bytes -> 4 ints -> 4 doubles, squared via FMA with a zero addend.
                Vector128 <int>    i128 = Sse41.ConvertToVector128Int32(p);
                Vector256 <double> d256 = Avx.ConvertToVector256Double(i128);
                var dZero = Vector256 <double> .Zero;
                Vector256 <double> ma1 = Fma.MultiplyAdd(d256, d256, dZero);

                // Same as above in single precision: first 8 bytes, squared via FMA.
                var i256  = Avx2.ConvertToVector256Int32(p);
                var f256  = Avx.ConvertToVector256Single(i256);
                var fZero = Vector256 <float> .Zero;
                var ma2   = Fma.MultiplyAdd(f256, f256, fZero);

                // 128-bit float path; MultiplyScalar multiplies only the lowest element.
                Vector128 <float> s128 = Sse2.ConvertToVector128Single(i128);
                Vector128 <float> ms   = Sse.MultiplyScalar(s128, s128);

//                x86 / x64 SIMD instruction reference table (SSE to AVX2)
//https://www.officedaytime.com/tips/simd.html
                //                pmaddwd
                //https://www.officedaytime.com/tips/simdimg/si.php?f=pmaddwd

                // First 8 bytes -> 8 shorts, then multiply-add of adjacent pairs into 4 ints
                // (the pmaddwd instruction referenced above).
                Vector128 <short> sh128 = Sse41.ConvertToVector128Int16(p);
                Vector128 <int>   vv3   = Avx.MultiplyAddAdjacent(sh128, sh128);

                // Unused local; presumably a convenient line for a debugger breakpoint.
                var neko = 0;
                // Candidate intrinsics noted for later experiments:
                //Avx.MultiplyAddAdjacent;
                //Avx.MultiplyHigh;
                //Avx.MultiplyHighRoundScale;
                //Avx.MultiplyLow;
                //Avx.MultiplyScalar;
                //Fma.MultiplyAdd;
                //Fma.MultiplyAddNegated;
                //Fma.MultiplyAddNegatedScalar;
                //Fma.MultiplyAddScalar;
                //Fma.MultiplyAddSubtract;
                //Fma.MultiplySubtract;
                //Fma.MultiplySubtractAdd;
                //Fma.MultiplySubtractNegated;
                //Fma.MultiplySubtractNegatedScalar;
                //Fma.MultiplySubtractScalar;
            }
        }
Exemplo n.º 12
0
        // Multi-threaded sum of squares of a byte array, partitioned so that the
        // Vector256<float> accumulator of each partition never loses precision.
        // Intrinsics FMA MultiplyAdd float
        //
        // vs      : input bytes.
        // returns : sum of vs[i] * vs[i]; 0 for an empty array.
        private unsafe long Test23_Intrinsics_FMA_MultiplyAdd_float_MT_Kai(byte[] vs)
        {
            // Partitioner.Create throws when the range is empty, so answer that case directly.
            if (vs.Length == 0)
            {
                return 0;
            }

            long total      = 0;
            int  simdLength = Vector256<int>.Count;
            // Largest element count one Vector256<float> accumulator can take exactly = 2064;
            // this is used as the size of one partition.
            // float significand 24 bit (16777215) * 8 / (255 * 255) = 2064.0941
            int rangeSize = ((1 << 24) - 1)
                            * Vector256<float>.Count
                            / (byte.MaxValue * byte.MaxValue);

            Parallel.ForEach(Partitioner.Create(0, vs.Length, rangeSize),
                             (range) =>
            {
                long subtotal = 0;
                // End of the part of this range that can be processed in whole vectors.
                int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;
                Vector256<float> vTotal = Vector256.Create(0f);   // accumulator
                fixed (byte* p = vs)
                {
                    for (int i = range.Item1; i < lastIndex; i += simdLength)
                    {
                        Vector256<int>   v = Avx2.ConvertToVector256Int32(p + i);
                        Vector256<float> f = Avx.ConvertToVector256Single(v);
                        vTotal = Fma.MultiplyAdd(f, f, vTotal);    // vTotal += f * f (float)
                    }
                }

                // Spill the lanes and add them up as integers.
                float* pp = stackalloc float[Vector256<float>.Count];
                Avx.Store(pp, vTotal);
                for (int i = 0; i < Vector256<float>.Count; i++)
                {
                    subtotal += (long)pp[i];
                }

                // Elements of this range that did not fill a whole vector.
                for (int i = lastIndex; i < range.Item2; i++)
                {
                    subtotal += vs[i] * vs[i];
                }
                System.Threading.Interlocked.Add(ref total, subtotal);
            });
            return total;
        }
Exemplo n.º 13
0
        // Variance of a byte array computed as mean(x^2) - mean(x)^2, with the sum of squares
        // multiplied and added in float vectors.
        // With float accumulation, errors start to appear at around 100,000 elements.
        //
        // vs      : input bytes.
        // returns : (sum of squares / length) - average^2. An empty array divides 0 by 0.
        private unsafe double Test6Variance(byte[] vs)
        {
            int simdLength = Vector256<int>.Count;
            // Only whole vectors are handled by the SIMD loop; iterating up to vs.Length would
            // read past the end of the array whenever the length is not a multiple of 8.
            int lastIndex = vs.Length - (vs.Length % simdLength);
            var vTotal    = Vector256<float>.Zero;

            fixed (byte* p = vs)
            {
                for (int i = 0; i < lastIndex; i += simdLength)
                {
                    Vector256<int>   v       = Avx2.ConvertToVector256Int32(p + i);   // 8 bytes -> 8 ints
                    Vector256<float> asFloat = Avx.ConvertToVector256Single(v);
                    vTotal = Avx.Add(vTotal, Avx.Multiply(asFloat, asFloat));
                }
            }

            double total = 0;

            // Spill the accumulator and add its lanes.
            int    laneCount = Vector256<float>.Count;
            float* temp      = stackalloc float[laneCount];
            Avx.Store(temp, vTotal);
            for (int j = 0; j < laneCount; j++)
            {
                total += temp[j];
            }

            // Leftover elements: add their squares, like the vector loop does.
            for (int i = lastIndex; i < vs.Length; i++)
            {
                total += vs[i] * vs[i];
            }

            // Test2 is defined elsewhere; presumably it returns the plain sum of vs - confirm.
            double average = (double)Test2(vs) / vs.Length;

            return (total / vs.Length) - (average * average);
        }
Exemplo n.º 14
0
        // Multi-threaded variant of the channel-difference magnitude computation.
        //
        // The index range of red is split into roughly one partition per processor. Within a
        // partition, whole groups of 8 elements are processed with AVX and copied into vv; the
        // rest of the partition is passed on to Amari.
        //
        // red, green, blue : input channels, read at the same indices.
        // vv               : receives one float per index of red.
        private unsafe void Test47_Intrinsics_V256float_Sqrt_MT(byte[] red, byte[] green, byte[] blue, float[] vv)
        {
            // Partitioner.Create rejects an empty range; there is nothing to compute anyway.
            if (red.Length == 0)
            {
                return;
            }

            // The integer division yields 0 when the array has fewer elements than there are
            // processors, and Partitioner.Create throws for a non-positive rangeSize, so clamp
            // it to at least 1.
            int rangeSize  = Math.Max(1, red.Length / Environment.ProcessorCount);
            int simdLength = Vector256<float>.Count;

            Parallel.ForEach(Partitioner.Create(0, red.Length, rangeSize),
                             (range) =>
            {
                float* tp     = stackalloc float[simdLength];
                // End of the part of this range that can be processed in whole vectors.
                int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;
                Vector256<float> vm;
                Vector256<float> vr;
                Vector256<float> vg;
                Vector256<float> vb;
                fixed (byte* pR = red, pG = green, pB = blue)
                {
                    for (int i = range.Item1; i < lastIndex; i += simdLength)
                    {
                        vr = Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(pR + i));
                        vg = Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(pG + i));
                        vb = Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(pB + i));
                        vr = Avx.Subtract(vg, vr);
                        vg = Avx.Subtract(vb, vg);
                        // NOTE(review): vr already holds (green - red) here, so this is
                        // (green - red) - blue rather than red - blue; confirm that is intended.
                        vb = Avx.Subtract(vr, vb);
                        vm = Avx.Add(Avx.Multiply(vr, vr), Avx.Multiply(vg, vg));
                        vm = Avx.Add(vm, Avx.Multiply(vb, vb));
                        vm = Avx.Sqrt(vm);

                        // Spill to the stack buffer, then copy element-wise into the managed array.
                        Avx.Store(tp, vm);
                        for (int m = 0; m < simdLength; m++)
                        {
                            vv[i + m] = tp[m];
                        }
                    }
                    Amari(lastIndex, range.Item2, red, green, blue, vv);
                }
            });
        }
Exemplo n.º 15
0
        // Convert

        // Converts the i32 vector a to an f32 vector with Avx.ConvertToVector256Single.
        public static f32 Converti32_f32(i32 a) => Avx.ConvertToVector256Single(a);
Exemplo n.º 16
0
            // Converts one line of packed 3-byte groups into groups of 4 floats.
            //
            // ipstart : first input byte.
            // opstart : output buffer, written as float; it advances 4 floats per 3 input bytes.
            // cb      : number of input bytes to convert.
            //
            // The scalar loop maps each byte through LookupTables.Alpha and leaves the 4th float
            // of each group unwritten. The SIMD paths multiply by 1 / 255 instead of using the
            // table (assumes Alpha[v] matches v / 255 - TODO confirm) and do write the 4th lane.
            unsafe void IConversionProcessor.ConvertLine(byte *ipstart, byte *opstart, int cb)
            {
                fixed(float *atstart = &LookupTables.Alpha[0])
                {
                    byte * ip = ipstart, ipe = ipstart + cb;
                    float *op = (float *)opstart, at = atstart;

#if HWINTRINSICS
                    if (Avx2.IsSupported)
                    {
                        var vscale = Vector256.Create(1f / byte.MaxValue);
                        // Permutation indices; presumably spread two packed 3-byte groups over
                        // two 4-lane groups - see HWIntrinsics.PermuteMask3To3xChan.
                        var vmaskp = Avx.LoadVector256((int *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.PermuteMask3To3xChan)));

                        // Each iteration consumes 24 bytes, but the last 8-byte read starts at
                        // offset 18 and so touches 26 bytes; pull the end pointer back accordingly.
                        ipe -= Vector256 <byte> .Count * 3 / 4 + 2;                       // +2 accounts for the overrun on the last read
                        while (ip <= ipe)
                        {
                            // Four overlapping 8-byte reads at offsets 0, 6, 12 and 18, each
                            // widened to 8 ints.
                            var vi0 = Avx2.ConvertToVector256Int32(ip);
                            var vi1 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count * 3 / 4);
                            var vi2 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count * 6 / 4);
                            var vi3 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count * 9 / 4);
                            ip += Vector256 <byte> .Count * 3 / 4;

                            vi0 = Avx2.PermuteVar8x32(vi0, vmaskp);
                            vi1 = Avx2.PermuteVar8x32(vi1, vmaskp);
                            vi2 = Avx2.PermuteVar8x32(vi2, vmaskp);
                            vi3 = Avx2.PermuteVar8x32(vi3, vmaskp);

                            var vf0 = Avx.ConvertToVector256Single(vi0);
                            var vf1 = Avx.ConvertToVector256Single(vi1);
                            var vf2 = Avx.ConvertToVector256Single(vi2);
                            var vf3 = Avx.ConvertToVector256Single(vi3);

                            vf0 = Avx.Multiply(vf0, vscale);
                            vf1 = Avx.Multiply(vf1, vscale);
                            vf2 = Avx.Multiply(vf2, vscale);
                            vf3 = Avx.Multiply(vf3, vscale);

                            // 4 vectors of 8 floats = 32 floats out per 24 bytes in.
                            Avx.Store(op, vf0);
                            Avx.Store(op + Vector256 <float> .Count, vf1);
                            Avx.Store(op + Vector256 <float> .Count * 2, vf2);
                            Avx.Store(op + Vector256 <float> .Count * 3, vf3);
                            op += Vector256 <byte> .Count;
                        }
                        // Restore the real end pointer for the scalar tail below.
                        ipe += Vector256 <byte> .Count * 3 / 4 + 2;
                    }
                    else if (Sse41.IsSupported)
                    {
                        var vscale = Vector128.Create(1f / byte.MaxValue);

                        // Each iteration consumes 12 bytes, but the last 4-byte read starts at
                        // offset 9 and so touches 13 bytes.
                        ipe -= Vector128 <byte> .Count * 3 / 4 + 1;                       // +1 accounts for the overrun on the last read
                        while (ip <= ipe)
                        {
                            // Four overlapping 4-byte reads at offsets 0, 3, 6 and 9; the 4th
                            // lane of each holds the first byte of the following group.
                            var vi0 = Sse41.ConvertToVector128Int32(ip);
                            var vi1 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count * 3 / 4);
                            var vi2 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count * 6 / 4);
                            var vi3 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count * 9 / 4);
                            ip += Vector128 <byte> .Count * 3 / 4;

                            var vf0 = Sse2.ConvertToVector128Single(vi0);
                            var vf1 = Sse2.ConvertToVector128Single(vi1);
                            var vf2 = Sse2.ConvertToVector128Single(vi2);
                            var vf3 = Sse2.ConvertToVector128Single(vi3);

                            vf0 = Sse.Multiply(vf0, vscale);
                            vf1 = Sse.Multiply(vf1, vscale);
                            vf2 = Sse.Multiply(vf2, vscale);
                            vf3 = Sse.Multiply(vf3, vscale);

                            // 4 vectors of 4 floats = 16 floats out per 12 bytes in.
                            Sse.Store(op, vf0);
                            Sse.Store(op + Vector128 <float> .Count, vf1);
                            Sse.Store(op + Vector128 <float> .Count * 2, vf2);
                            Sse.Store(op + Vector128 <float> .Count * 3, vf3);
                            op += Vector128 <byte> .Count;
                        }
                        // Restore the real end pointer for the scalar tail below.
                        ipe += Vector128 <byte> .Count * 3 / 4 + 1;
                    }
#endif

                    // Scalar path / tail: one 3-byte group per iteration via the lookup table.
                    while (ip < ipe)
                    {
                        float o0 = at[(uint)ip[0]];
                        float o1 = at[(uint)ip[1]];
                        float o2 = at[(uint)ip[2]];
                        ip += 3;

                        op[0] = o0;
                        op[1] = o1;
                        op[2] = o2;
                        op   += 4;
                    }
                }
            }
Exemplo n.º 17
0
            // Converts one line of bytes to floats, one float per input byte.
            //
            // ipstart : first input byte.
            // opstart : output buffer, written as float.
            // cb      : number of input bytes to convert.
            //
            // The scalar loops map each byte through valueTable; the vector paths compute
            // (value - offset) * scale instead (assumes valueTable holds the same mapping -
            // TODO confirm).
            unsafe void IConversionProcessor.ConvertLine(byte *ipstart, byte *opstart, int cb)
            {
                fixed(float *atstart = &valueTable[0])
                {
                    byte * ip = ipstart, ipe = ipstart + cb;
                    float *op = (float *)opstart, at = atstart;

#if HWINTRINSICS
                    if (Avx2.IsSupported)
                    {
                        var vscal = Vector256.Create(scale);
                        // With FMA the offset is pre-multiplied by scale so that
                        // value * scale - offset * scale equals (value - offset) * scale.
                        var voffs = Fma.IsSupported ? Vector256.Create(offset * scale) : Vector256.Create(offset);

                        // 32 bytes in, 32 floats out per iteration.
                        ipe -= Vector256 <byte> .Count;
                        while (ip <= ipe)
                        {
                            var vi0 = Avx2.ConvertToVector256Int32(ip);
                            var vi1 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count);
                            var vi2 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count * 2);
                            var vi3 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count * 3);
                            ip += Vector256 <byte> .Count;

                            var vf0 = Avx.ConvertToVector256Single(vi0);
                            var vf1 = Avx.ConvertToVector256Single(vi1);
                            var vf2 = Avx.ConvertToVector256Single(vi2);
                            var vf3 = Avx.ConvertToVector256Single(vi3);

                            if (Fma.IsSupported)
                            {
                                vf0 = Fma.MultiplySubtract(vf0, vscal, voffs);
                                vf1 = Fma.MultiplySubtract(vf1, vscal, voffs);
                                vf2 = Fma.MultiplySubtract(vf2, vscal, voffs);
                                vf3 = Fma.MultiplySubtract(vf3, vscal, voffs);
                            }
                            else
                            {
                                vf0 = Avx.Multiply(Avx.Subtract(vf0, voffs), vscal);
                                vf1 = Avx.Multiply(Avx.Subtract(vf1, voffs), vscal);
                                vf2 = Avx.Multiply(Avx.Subtract(vf2, voffs), vscal);
                                vf3 = Avx.Multiply(Avx.Subtract(vf3, voffs), vscal);
                            }

                            Avx.Store(op, vf0);
                            Avx.Store(op + Vector256 <int> .Count, vf1);
                            Avx.Store(op + Vector256 <int> .Count * 2, vf2);
                            Avx.Store(op + Vector256 <int> .Count * 3, vf3);
                            op += Vector256 <byte> .Count;
                        }
                        // Restore the real end pointer for the scalar loops below.
                        ipe += Vector256 <byte> .Count;
                    }
                    else if (Sse41.IsSupported)
                    {
                        var vscal = Vector128.Create(scale);
                        var voffs = Vector128.Create(offset);

                        // 16 bytes in, 16 floats out per iteration.
                        ipe -= Vector128 <byte> .Count;
                        while (ip <= ipe)
                        {
                            var vi0 = Sse41.ConvertToVector128Int32(ip);
                            var vi1 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count);
                            var vi2 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count * 2);
                            var vi3 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count * 3);
                            ip += Vector128 <byte> .Count;

                            var vf0 = Sse2.ConvertToVector128Single(vi0);
                            var vf1 = Sse2.ConvertToVector128Single(vi1);
                            var vf2 = Sse2.ConvertToVector128Single(vi2);
                            var vf3 = Sse2.ConvertToVector128Single(vi3);

                            vf0 = Sse.Multiply(Sse.Subtract(vf0, voffs), vscal);
                            vf1 = Sse.Multiply(Sse.Subtract(vf1, voffs), vscal);
                            vf2 = Sse.Multiply(Sse.Subtract(vf2, voffs), vscal);
                            vf3 = Sse.Multiply(Sse.Subtract(vf3, voffs), vscal);

                            Sse.Store(op, vf0);
                            Sse.Store(op + Vector128 <int> .Count, vf1);
                            Sse.Store(op + Vector128 <int> .Count * 2, vf2);
                            Sse.Store(op + Vector128 <int> .Count * 3, vf3);
                            op += Vector128 <byte> .Count;
                        }
                        // Restore the real end pointer for the scalar loops below.
                        ipe += Vector128 <byte> .Count;
                    }
#elif VECTOR_CONVERT
                    var vscal = new VectorF(scale);
                    var voffs = new VectorF(offset);

                    // Portable Vector<T> path: one Vector<byte> in, four float vectors out.
                    ipe -= Vector <byte> .Count;
                    while (ip <= ipe)
                    {
                        // Widen byte -> ushort -> uint in two steps.
                        var vb = Unsafe.ReadUnaligned <Vector <byte> >(ip);
                        Vector.Widen(vb, out var vs0, out var vs1);
                        Vector.Widen(vs0, out var vi0, out var vi1);
                        Vector.Widen(vs1, out var vi2, out var vi3);
                        ip += Vector <byte> .Count;

                        var vf0 = Vector.ConvertToSingle(Vector.AsVectorInt32(vi0));
                        var vf1 = Vector.ConvertToSingle(Vector.AsVectorInt32(vi1));
                        var vf2 = Vector.ConvertToSingle(Vector.AsVectorInt32(vi2));
                        var vf3 = Vector.ConvertToSingle(Vector.AsVectorInt32(vi3));

                        vf0 = (vf0 - voffs) * vscal;
                        vf1 = (vf1 - voffs) * vscal;
                        vf2 = (vf2 - voffs) * vscal;
                        vf3 = (vf3 - voffs) * vscal;

                        Unsafe.WriteUnaligned(op, vf0);
                        Unsafe.WriteUnaligned(op + VectorF.Count, vf1);
                        Unsafe.WriteUnaligned(op + VectorF.Count * 2, vf2);
                        Unsafe.WriteUnaligned(op + VectorF.Count * 3, vf3);
                        op += Vector <byte> .Count;
                    }
                    ipe += Vector <byte> .Count;
#endif

                    // Scalar loop unrolled by 8, using the lookup table.
                    ipe -= 8;
                    while (ip <= ipe)
                    {
                        float o0 = at[(uint)ip[0]];
                        float o1 = at[(uint)ip[1]];
                        float o2 = at[(uint)ip[2]];
                        float o3 = at[(uint)ip[3]];
                        float o4 = at[(uint)ip[4]];
                        float o5 = at[(uint)ip[5]];
                        float o6 = at[(uint)ip[6]];
                        float o7 = at[(uint)ip[7]];
                        ip += 8;

                        op[0] = o0;
                        op[1] = o1;
                        op[2] = o2;
                        op[3] = o3;
                        op[4] = o4;
                        op[5] = o5;
                        op[6] = o6;
                        op[7] = o7;
                        op   += 8;
                    }
                    ipe += 8;

                    // Remaining 0..7 bytes, one at a time.
                    while (ip < ipe)
                    {
                        op[0] = at[(uint)ip[0]];
                        ip++;
                        op++;
                    }
                }
            }
Exemplo n.º 18
0
            // Maps a buffer of floats through a lookup table, in place, interpolating between
            // adjacent table entries.
            //
            // ipstart  : buffer to convert; asserted to be the same pointer as opstart.
            // opstart  : output pointer (must equal ipstart).
            // lutstart : lookup table. Indices i and i + 1 are read with i up to lutmax, so the
            //            table needs at least lutmax + 2 entries.
            // lutmax   : scale factor applied to each input and upper clamp of the table position.
            // cb       : size of the buffer in bytes.
            //
            // Each value is multiplied by lutmax, limited to [0, lutmax], truncated to an index i
            // and replaced by Lerp(lut[i], lut[i + 1], fraction). Lerp and Clamp are helpers
            // defined elsewhere; presumably linear interpolation and range clamping.
            unsafe public static void ConvertFloat(byte *ipstart, byte *opstart, float *lutstart, int lutmax, int cb)
            {
                Debug.Assert(ipstart == opstart);

                float *ip = (float *)ipstart, ipe = (float *)(ipstart + cb);
                float *lp = lutstart;

#if HWINTRINSICS
                if (Avx2.IsSupported)
                {
                    var vlmax = Vector256.Create((float)lutmax);
                    var vzero = Vector256 <float> .Zero;
                    var vione = Vector256.Create(1);

                    // 8 floats per iteration.
                    ipe -= Vector256 <float> .Count;
                    while (ip <= ipe)
                    {
                        // Scale to table position and limit it to [0, lutmax].
                        var vf = Avx.Multiply(vlmax, Avx.LoadVector256(ip));
                        vf = Avx.Min(Avx.Max(vzero, vf), vlmax);

                        // Integer index, and the same index back as float for the fraction.
                        var vi = Avx.ConvertToVector256Int32WithTruncation(vf);
                        var vp = Avx.ConvertToVector256Single(vi);

                        // Gather lut[i] and lut[i + 1] for all 8 lanes.
                        var vl = Avx2.GatherVector256(lp, vi, sizeof(float));
                        var vh = Avx2.GatherVector256(lp, Avx2.Add(vi, vione), sizeof(float));

                        vf = HWIntrinsics.Lerp(vl, vh, Avx.Subtract(vf, vp));

                        Avx.Store(ip, vf);
                        ip += Vector256 <float> .Count;
                    }
                    // Restore the real end pointer for the scalar tail.
                    ipe += Vector256 <float> .Count;

                    float fmin = vzero.ToScalar(), flmax = vlmax.ToScalar();
                    while (ip < ipe)
                    {
                        float f = (*ip * flmax).Clamp(fmin, flmax);
                        uint  i = (uint)f;

                        *ip++ = Lerp(lp[i], lp[i + 1], f - i);
                    }
                }
                else
#endif
                {
                    var vlmax = new Vector4(lutmax);
                    var vzero = Vector4.Zero;

                    // 4 floats per iteration: scale and clamp as a Vector4, then look up and
                    // interpolate each component separately.
                    ipe -= 4;
                    while (ip <= ipe)
                    {
                        var vf = (Unsafe.ReadUnaligned <Vector4>(ip) * vlmax).Clamp(vzero, vlmax);

                        float f0 = vf.X;
                        float f1 = vf.Y;
                        float f2 = vf.Z;
                        float f3 = vf.W;

                        uint i0 = (uint)f0;
                        uint i1 = (uint)f1;
                        uint i2 = (uint)f2;
                        uint i3 = (uint)f3;

                        ip[0] = Lerp(lp[i0], lp[i0 + 1], f0 - (int)i0);
                        ip[1] = Lerp(lp[i1], lp[i1 + 1], f1 - (int)i1);
                        ip[2] = Lerp(lp[i2], lp[i2 + 1], f2 - (int)i2);
                        ip[3] = Lerp(lp[i3], lp[i3 + 1], f3 - (int)i3);

                        ip += 4;
                    }
                    // Restore the real end pointer for the scalar tail.
                    ipe += 4;

                    float fmin = vzero.X, flmax = vlmax.X;
                    while (ip < ipe)
                    {
                        float f = (*ip * flmax).Clamp(fmin, flmax);
                        uint  i = (uint)f;

                        *ip++ = Lerp(lp[i], lp[i + 1], f - i);
                    }
                }
            }
Exemplo n.º 19
0
            /// <summary>
            /// Remaps a line of 4-float pixels in place through a lookup table with linear
            /// interpolation. The first three channels are divided by the fourth channel before
            /// the lookup and multiplied by it again afterwards.
            /// </summary>
            /// <param name="ipstart">Start of the pixel data; debug builds assert it equals <paramref name="opstart"/>.</param>
            /// <param name="opstart">Output pointer; not used beyond the assertion, results are written through <paramref name="ipstart"/>.</param>
            /// <param name="lutstart">Lookup table. Positions are clamped to <paramref name="lutmax"/> and the next entry is also read, so index <c>lutmax + 1</c> may be accessed.</param>
            /// <param name="lutmax">Scale applied to the normalized value to obtain a table position, and the upper clamp for that position.</param>
            /// <param name="cb">Length of the line in bytes.</param>
            unsafe public static void ConvertFloat3A(byte *ipstart, byte *opstart, float *lutstart, int lutmax, int cb)
            {
                Debug.Assert(ipstart == opstart);

                float* cur = (float*)ipstart;
                float* end = (float*)(ipstart + cb);
                float* lut = lutstart;

#if HWINTRINSICS
                if (Avx2.IsSupported)
                {
                    // The 128-bit gather mask is duplicated into both halves of a 256-bit register.
                    // NOTE(review): GatherMask3x presumably enables only the three colour lanes of each pixel - confirm.
                    var gatherMask = Avx.BroadcastVector128ToVector256((float*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.GatherMask3x)));
                    var lutTop     = Vector256.Create((float)lutmax);
                    var zeros      = Vector256<float>.Zero;
                    var onesF      = Vector256.Create(1f);
                    var onesI      = Vector256.Create(1);

                    // Eight floats per iteration; stop while a full vector still fits before 'end'.
                    for (float* vecEnd = end - Vector256<float>.Count; cur <= vecEnd; cur += Vector256<float>.Count)
                    {
                        var pix = Avx.Max(zeros, Avx.LoadVector256(cur));

                        // NOTE(review): ShuffleMaskAlpha presumably replicates each pixel's fourth lane across the pixel - confirm.
                        var alpha = Avx.Shuffle(pix, pix, HWIntrinsics.ShuffleMaskAlpha);

                        // Table position = value * (lutmax * approximate 1/alpha), capped at lutmax.
                        var unscale = Avx.Multiply(lutTop, Avx.Reciprocal(alpha));
                        var pos     = Avx.Min(Avx.Multiply(pix, unscale), lutTop);

                        // Integer part selects the table entry, fractional part is the blend weight.
                        var idx  = Avx.ConvertToVector256Int32WithTruncation(pos);
                        var frac = Avx.Subtract(pos, Avx.ConvertToVector256Single(idx));

                        // Lanes not selected by the mask keep the 1.0 taken from 'onesF'.
                        var lo = Avx2.GatherMaskVector256(onesF, lut, idx, gatherMask, sizeof(float));
                        var hi = Avx2.GatherMaskVector256(onesF, lut, Avx2.Add(idx, onesI), gatherMask, sizeof(float));

                        Avx.Store(cur, Avx.Multiply(HWIntrinsics.Lerp(lo, hi, frac), alpha));
                    }
                }
#endif
                {
                    // Scalar path: handles the whole line, or whatever the vector loop left over.
                    var   lutScale   = new Vector4(lutmax);
                    var   zero4      = Vector4.Zero;
                    float alphaFloor = new Vector4(1 / 1024f).X;

                    for (; cur < end; cur += 4)
                    {
                        var   pix   = Unsafe.ReadUnaligned<Vector4>(cur);
                        float alpha = pix.W;

                        if (alpha < alphaFloor)
                        {
                            // Fourth channel below 1/1024: all four floats of the pixel are zeroed.
                            Unsafe.WriteUnaligned(cur, zero4);
                            continue;
                        }

                        pix = (pix * lutScale / alpha).Clamp(zero4, lutScale);

                        float c0 = pix.X;
                        float c1 = pix.Y;
                        float c2 = pix.Z;

                        uint i0 = (uint)c0;
                        uint i1 = (uint)c1;
                        uint i2 = (uint)c2;

                        // Only the first three channels are rewritten; the fourth is left as it was.
                        cur[0] = Lerp(lut[i0], lut[i0 + 1], c0 - (int)i0) * alpha;
                        cur[1] = Lerp(lut[i1], lut[i1 + 1], c1 - (int)i1) * alpha;
                        cur[2] = Lerp(lut[i2], lut[i2 + 1], c2 - (int)i2) * alpha;
                    }
                }
            }
Exemplo n.º 20
0
            /// <summary>
            /// Converts a line of 4-byte pixels into 4-float pixels, multiplying the first three
            /// channels of every pixel by its fourth channel and storing the fourth channel unmultiplied.
            /// </summary>
            /// <param name="ipstart">Start of the byte input.</param>
            /// <param name="opstart">Start of the output, written as floats (one float per input byte).</param>
            /// <param name="cb">Number of input bytes to convert.</param>
            unsafe void IConversionProcessor.ConvertLine(byte *ipstart, byte *opstart, int cb)
            {
                fixed (float* alphaTable = &LookupTables.Alpha[0])
                {
                    byte*  src    = ipstart;
                    byte*  srcEnd = ipstart + cb;
                    float* dst    = (float*)opstart;

#if HWINTRINSICS
                    if (Avx2.IsSupported)
                    {
                        var inv255 = Vector256.Create(1f / byte.MaxValue);

                        // 32 input bytes (four vectors of eight widened values) per iteration.
                        for (byte* vecEnd = srcEnd - Vector256<byte>.Count; src <= vecEnd; src += Vector256<byte>.Count, dst += Vector256<byte>.Count)
                        {
                            // All loads are issued before any store, as in the unrolled original.
                            var i0 = Avx2.ConvertToVector256Int32(src);
                            var i1 = Avx2.ConvertToVector256Int32(src + Vector256<int>.Count);
                            var i2 = Avx2.ConvertToVector256Int32(src + Vector256<int>.Count * 2);
                            var i3 = Avx2.ConvertToVector256Int32(src + Vector256<int>.Count * 3);

                            // Widened bytes -> float, scaled by 1/255.
                            var f0 = Avx.Multiply(Avx.ConvertToVector256Single(i0), inv255);
                            var f1 = Avx.Multiply(Avx.ConvertToVector256Single(i1), inv255);
                            var f2 = Avx.Multiply(Avx.ConvertToVector256Single(i2), inv255);
                            var f3 = Avx.Multiply(Avx.ConvertToVector256Single(i3), inv255);

                            // NOTE(review): ShuffleMaskAlpha presumably replicates each pixel's fourth lane across the pixel - confirm.
                            var a0 = Avx.Shuffle(f0, f0, HWIntrinsics.ShuffleMaskAlpha);
                            var a1 = Avx.Shuffle(f1, f1, HWIntrinsics.ShuffleMaskAlpha);
                            var a2 = Avx.Shuffle(f2, f2, HWIntrinsics.ShuffleMaskAlpha);
                            var a3 = Avx.Shuffle(f3, f3, HWIntrinsics.ShuffleMaskAlpha);

                            // Multiply by alpha, then blend the alpha vector back in.
                            // NOTE(review): BlendMaskAlpha presumably selects only the fourth lane of each pixel - confirm.
                            f0 = Avx.Blend(Avx.Multiply(f0, a0), a0, HWIntrinsics.BlendMaskAlpha);
                            f1 = Avx.Blend(Avx.Multiply(f1, a1), a1, HWIntrinsics.BlendMaskAlpha);
                            f2 = Avx.Blend(Avx.Multiply(f2, a2), a2, HWIntrinsics.BlendMaskAlpha);
                            f3 = Avx.Blend(Avx.Multiply(f3, a3), a3, HWIntrinsics.BlendMaskAlpha);

                            Avx.Store(dst, f0);
                            Avx.Store(dst + Vector256<int>.Count, f1);
                            Avx.Store(dst + Vector256<int>.Count * 2, f2);
                            Avx.Store(dst + Vector256<int>.Count * 3, f3);
                        }
                    }
                    else if (Sse41.IsSupported)
                    {
                        var inv255 = Vector128.Create(1f / byte.MaxValue);

                        // 16 input bytes (four vectors of four widened values, one pixel each) per iteration.
                        for (byte* vecEnd = srcEnd - Vector128<byte>.Count; src <= vecEnd; src += Vector128<byte>.Count, dst += Vector128<byte>.Count)
                        {
                            // All loads are issued before any store, as in the unrolled original.
                            var i0 = Sse41.ConvertToVector128Int32(src);
                            var i1 = Sse41.ConvertToVector128Int32(src + Vector128<int>.Count);
                            var i2 = Sse41.ConvertToVector128Int32(src + Vector128<int>.Count * 2);
                            var i3 = Sse41.ConvertToVector128Int32(src + Vector128<int>.Count * 3);

                            // Widened bytes -> float, scaled by 1/255.
                            var f0 = Sse.Multiply(Sse2.ConvertToVector128Single(i0), inv255);
                            var f1 = Sse.Multiply(Sse2.ConvertToVector128Single(i1), inv255);
                            var f2 = Sse.Multiply(Sse2.ConvertToVector128Single(i2), inv255);
                            var f3 = Sse.Multiply(Sse2.ConvertToVector128Single(i3), inv255);

                            var a0 = Sse.Shuffle(f0, f0, HWIntrinsics.ShuffleMaskAlpha);
                            var a1 = Sse.Shuffle(f1, f1, HWIntrinsics.ShuffleMaskAlpha);
                            var a2 = Sse.Shuffle(f2, f2, HWIntrinsics.ShuffleMaskAlpha);
                            var a3 = Sse.Shuffle(f3, f3, HWIntrinsics.ShuffleMaskAlpha);

                            // Multiply by alpha, then blend the alpha vector back in.
                            f0 = Sse41.Blend(Sse.Multiply(f0, a0), a0, HWIntrinsics.BlendMaskAlpha);
                            f1 = Sse41.Blend(Sse.Multiply(f1, a1), a1, HWIntrinsics.BlendMaskAlpha);
                            f2 = Sse41.Blend(Sse.Multiply(f2, a2), a2, HWIntrinsics.BlendMaskAlpha);
                            f3 = Sse41.Blend(Sse.Multiply(f3, a3), a3, HWIntrinsics.BlendMaskAlpha);

                            Sse.Store(dst, f0);
                            Sse.Store(dst + Vector128<int>.Count, f1);
                            Sse.Store(dst + Vector128<int>.Count * 2, f2);
                            Sse.Store(dst + Vector128<int>.Count * 3, f3);
                        }
                    }
#endif

                    // Scalar path, one pixel at a time: the whole line, or the vector loops' leftover.
                    // NOTE(review): LookupTables.Alpha is presumably the byte -> [0, 1] float mapping
                    // matching the 1/255 scale used above - confirm.
                    for (; src < srcEnd; src += 4, dst += 4)
                    {
                        float c0    = alphaTable[(uint)src[0]];
                        float c1    = alphaTable[(uint)src[1]];
                        float c2    = alphaTable[(uint)src[2]];
                        float alpha = alphaTable[(uint)src[3]];

                        dst[0] = c0 * alpha;
                        dst[1] = c1 * alpha;
                        dst[2] = c2 * alpha;
                        dst[3] = alpha;
                    }
                }
            }