Exemplo n.º 1
0
        // Multithreaded SSE4.1 DotProduct test: returns the sum of the squares of every byte in vs.
        // The index space is split into ranges that Parallel.ForEach processes concurrently; each
        // range builds a private subtotal that is merged into the shared total with Interlocked.Add.
        //   vs      : input bytes (may be empty).
        //   returns : sum of vs[i] * vs[i] over the whole array.
        private unsafe long Test17_Intrinsics_SSE41_DotProduct_float_MT(byte[] vs)
        {
            // Partitioner.Create rejects an empty index range, so answer the empty case directly.
            if (vs.Length == 0)
            {
                return 0;
            }

            long total = 0;
            int simdLength = Vector128<int>.Count;
            // An array shorter than the processor count would give a range size of 0, which
            // Partitioner.Create rejects as well; clamp it to at least one element.
            int rangeSize = System.Math.Max(1, vs.Length / Environment.ProcessorCount);

            Parallel.ForEach(Partitioner.Create(0, vs.Length, rangeSize),
                             (range) =>
            {
                long subtotal = 0;
                // End of the part of this range that can be consumed four bytes at a time.
                int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;
                fixed (byte* p = vs)
                {
                    for (int i = range.Item1; i < lastIndex; i += simdLength)
                    {
                        // Widen four bytes to 32-bit integers, then convert them to float.
                        Vector128<int> v = Sse41.ConvertToVector128Int32(p + i);
                        Vector128<float> vv = Sse2.ConvertToVector128Single(v);
                        // Mask 0b1111_0001: multiply all four lanes (upper four bits set) and
                        // place the sum of the products in element 0 (lowest bit set).
                        Vector128<float> dp = Sse41.DotProduct(vv, vv, 0b11110001);
                        // Each dot product is at most 4 * 255 * 255, which a float holds exactly.
                        subtotal += (long)dp.GetElement(0);
                    }
                }

                // Scalar tail for the elements that do not fill a whole vector.
                for (int i = lastIndex; i < range.Item2; i++)
                {
                    subtotal += vs[i] * vs[i];
                }
                System.Threading.Interlocked.Add(ref total, subtotal);
            });
            return total;
        }
Exemplo n.º 2
0
        // For every index covered by whole 4-element vectors, computes
        //   sqrt((G - R)^2 + (B - G)^2 + (R - B)^2)
        // from the three channel arrays and writes it to vv[index]. The remaining indices
        // [vectorEnd, red.Length) are handed to Amari, which is defined elsewhere
        // (presumably the scalar remainder path - confirm against its definition).
        // green, blue and vv are indexed with red's length, so they are assumed to be at
        // least as long as red.
        private unsafe void Test44_Intrinsics_V128float_Sqrt(byte[] red, byte[] green, byte[] blue, float[] vv)
        {
            int lanes = Vector128<float>.Count;
            int vectorEnd = red.Length - (red.Length % lanes);
            float* scratch = stackalloc float[lanes];

            fixed (byte* pR = red, pG = green, pB = blue)
            {
                int offset = 0;
                while (offset < vectorEnd)
                {
                    // Widen four bytes of each channel to int and convert them to float.
                    Vector128<float> r = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(pR + offset));
                    Vector128<float> g = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(pG + offset));
                    Vector128<float> b = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(pB + offset));

                    Vector128<float> greenMinusRed = Sse.Subtract(g, r);
                    Vector128<float> blueMinusGreen = Sse.Subtract(b, g);
                    Vector128<float> redMinusBlue = Sse.Subtract(r, b);

                    Vector128<float> sumOfSquares = Sse.Add(
                        Sse.Add(Sse.Multiply(greenMinusRed, greenMinusRed), Sse.Multiply(blueMinusGreen, blueMinusGreen)),
                        Sse.Multiply(redMinusBlue, redMinusBlue));

                    // Spill the four roots to the stack buffer and copy them with bounds-checked writes.
                    Sse.Store(scratch, Sse.Sqrt(sumOfSquares));
                    for (int lane = 0; lane < lanes; lane++)
                    {
                        vv[offset + lane] = scratch[lane];
                    }

                    offset += lanes;
                }
            }

            Amari(vectorEnd, red.Length, red, green, blue, vv);
        }
Exemplo n.º 3
0
        // Sum of squares of the bytes in vs using FMA on double lanes.
        // Four bytes at a time are widened to int, converted to double and folded into a
        // four-lane accumulator with acc = f * f + acc. The lanes are then added up and the
        // elements that do not fill a whole group of four are handled by a scalar loop.
        private unsafe long Test4_Intrinsics_FMA_MultiplyAdd_double(byte[] vs)
        {
            int step = Vector128<int>.Count;
            int vectorEnd = vs.Length - (vs.Length % step);
            Vector256<double> accumulator = Vector256.Create(0d);

            fixed (byte* source = vs)
            {
                int offset = 0;
                while (offset < vectorEnd)
                {
                    Vector128<int> asInt = Sse41.ConvertToVector128Int32(source + offset);
                    Vector256<double> asDouble = Avx.ConvertToVector256Double(asInt);
                    accumulator = Fma.MultiplyAdd(asDouble, asDouble, accumulator);
                    offset += step;
                }
            }

            // Spill the accumulator so its lanes can be summed as integers.
            int laneCount = Vector256<double>.Count;
            double* laneSums = stackalloc double[laneCount];
            Avx.Store(laneSums, accumulator);

            long total = 0;
            for (int lane = 0; lane < laneCount; lane++)
            {
                total += (long)laneSums[lane];
            }

            // Scalar tail.
            for (int index = vectorEnd; index < vs.Length; index++)
            {
                total += vs[index] * vs[index];
            }

            return total;
        }
Exemplo n.º 4
0
        // Writes, for every index i, the Euclidean distance between the points
        // (x[i], y[i], z[i]) and (xx[i], yy[i], zz[i]) into result[i].
        // The index space [0, x.Length) is partitioned and processed in parallel. Inside each
        // range, whole groups of four elements use AVX and the leftover elements use scalar code.
        // All other arrays are indexed with x's length, so they are assumed to be at least as
        // long as x.
        private unsafe void Test2_Vector256Double(byte[] x, byte[] y, byte[] z, byte[] xx, byte[] yy, byte[] zz, double[] result)
        {
            // Partitioner.Create rejects an empty index range; there is nothing to compute anyway.
            if (x.Length == 0)
            {
                return;
            }

            Parallel.ForEach(Partitioner.Create(0, x.Length), range =>
            {
                int simdLength = Vector256<double>.Count;
                // End of the part of this range that consists of whole four-element groups.
                int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;
                Vector256<double> vx, vy, vz, vm;
                fixed (byte* px = x, py = y, pz = z, pxx = xx, pyy = yy, pzz = zz)
                {
                    fixed (double* dp = result)
                    {
                        // Stop at lastIndex: running to range.Item2 would load past the range and
                        // store up to three doubles past the end of result.
                        for (int i = range.Item1; i < lastIndex; i += simdLength)
                        {
                            // Per-axis differences.
                            vx = Avx.Subtract(
                                Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(px + i)),
                                Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(pxx + i)));
                            vy = Avx.Subtract(
                                Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(py + i)),
                                Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(pyy + i)));
                            vz = Avx.Subtract(
                                Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(pz + i)),
                                Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(pzz + i)));

                            // Square root of the sum of squares.
                            vm = Avx.Add(Avx.Multiply(vx, vx), Avx.Multiply(vy, vy));
                            vm = Avx.Sqrt(Avx.Add(vm, Avx.Multiply(vz, vz)));

                            // Write the four results to the output array.
                            Avx.Store(dp + i, vm);
                        }
                    }
                }

                // Scalar tail for the elements of this range that do not fill a whole vector.
                for (int i = lastIndex; i < range.Item2; i++)
                {
                    double dx = x[i] - xx[i];
                    double dy = y[i] - yy[i];
                    double dz = z[i] - zz[i];
                    result[i] = System.Math.Sqrt(dx * dx + dy * dy + dz * dz);
                }
            });
        }
        // Instance-field scenario: converts the field _fld with Sse41.ConvertToVector128Int32,
        // writes the resulting vector to the output buffer and passes the input field and the
        // output pointer to ValidateResult. _fld, _dataTable and ValidateResult are declared
        // elsewhere in the enclosing type.
        public void RunFldScenario()
        {
            var converted = Sse41.ConvertToVector128Int32(_fld);
            Unsafe.Write(_dataTable.outArrayPtr, converted);

            ValidateResult(_fld, _dataTable.outArrayPtr);
        }
Exemplo n.º 6
0
        // Multithreaded SSE4.1 DotProduct sum of squares that splits the array into ranges small
        // enough that the per-range float accumulator never loses integer precision.
        //   vs      : input bytes (may be empty).
        //   returns : sum of vs[i] * vs[i] over the whole array.
        private unsafe long Test28_Intrinsics_SSE41_DotProduct_float_MT_Kai(byte[] vs)
        {
            // Partitioner.Create rejects an empty index range, so answer the empty case directly.
            if (vs.Length == 0)
            {
                return 0;
            }

            long total = 0;
            int simdLength = Vector128<int>.Count * 4; // 16 bytes are consumed per loop iteration

            // Number of elements per range (partition size).
            // A float represents integers exactly up to 2^24 - 1 = 16777215 (24-bit significand).
            // 16777215 / (255 * 255) = 258 squares of the largest byte fit into one lane, and
            // there are four lanes, so 258 * 4 = 1032 elements per range. With this size a range
            // contains at most 64 full iterations, each adding one four-element dot product to
            // every lane: 64 * 4 * 255 * 255 = 16646400, which is still exactly representable.
            int rangeSize =
                ((1 << 24) - 1) / (byte.MaxValue * byte.MaxValue) * Vector128<float>.Count; // 1032

            Parallel.ForEach(
                Partitioner.Create(0, vs.Length, rangeSize),
                (range) =>
            {
                var vTotal = Vector128<float>.Zero;
                // End of the part of this range that can be consumed 16 bytes at a time.
                int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;
                fixed (byte* p = vs)
                {
                    for (int i = range.Item1; i < lastIndex; i += simdLength)
                    {
                        Vector128<int> v = Sse41.ConvertToVector128Int32(p + i);
                        var vv = Sse2.ConvertToVector128Single(v);
                        // Upper four mask bits: multiply all four lanes.
                        // Lower four mask bits: which element receives the sum (here element 0).
                        Vector128<float> dp = Sse41.DotProduct(vv, vv, 0b11110001);
                        vTotal = Sse.Add(vTotal, dp);

                        v = Sse41.ConvertToVector128Int32(p + i + 4);
                        vv = Sse2.ConvertToVector128Single(v);
                        dp = Sse41.DotProduct(vv, vv, 0b11110010); // result goes to element 1
                        vTotal = Sse.Add(vTotal, dp);

                        v = Sse41.ConvertToVector128Int32(p + i + 8);
                        vv = Sse2.ConvertToVector128Single(v);
                        dp = Sse41.DotProduct(vv, vv, 0b11110100); // result goes to element 2
                        vTotal = Sse.Add(vTotal, dp);

                        v = Sse41.ConvertToVector128Int32(p + i + 12);
                        vv = Sse2.ConvertToVector128Single(v);
                        dp = Sse41.DotProduct(vv, vv, 0b11111000); // result goes to element 3
                        vTotal = Sse.Add(vTotal, dp);
                    }
                }

                // Fold the four float lanes into an integer subtotal.
                long subtotal = 0;
                float* f = stackalloc float[Vector128<float>.Count];
                Sse.Store(f, vTotal);
                for (int i = 0; i < Vector128<float>.Count; i++)
                {
                    subtotal += (long)f[i];
                }

                // Scalar tail for the elements that do not fill a whole 16-byte step.
                for (int i = lastIndex; i < range.Item2; i++)
                {
                    subtotal += vs[i] * vs[i];
                }
                System.Threading.Interlocked.Add(ref total, subtotal);
            });
            return total;
        }
        // Local-variable scenario using an aligned load: reads a vector of Int16 from the input
        // buffer with Sse2.LoadAlignedVector128, converts it with Sse41.ConvertToVector128Int32,
        // writes the result to the output buffer and passes the loaded operand and the output
        // pointer to ValidateResult. _dataTable and ValidateResult are declared elsewhere.
        public void RunLclVarScenario_LoadAligned()
        {
            var operand = Sse2.LoadAlignedVector128((Int16*)(_dataTable.inArrayPtr));
            var converted = Sse41.ConvertToVector128Int32(operand);
            Unsafe.Write(_dataTable.outArrayPtr, converted);

            ValidateResult(operand, _dataTable.outArrayPtr);
        }
        // Local-variable scenario using Unsafe.Read: reads a Vector128<Int16> from the input
        // buffer, converts it with Sse41.ConvertToVector128Int32, writes the result to the output
        // buffer and passes the operand and the output pointer to ValidateResult.
        // _dataTable and ValidateResult are declared elsewhere.
        public void RunLclVarScenario_UnsafeRead()
        {
            var operand = Unsafe.Read<Vector128<Int16>>(_dataTable.inArrayPtr);
            var converted = Sse41.ConvertToVector128Int32(operand);
            Unsafe.Write(_dataTable.outArrayPtr, converted);

            ValidateResult(operand, _dataTable.outArrayPtr);
        }
        // Local-object field scenario: creates a fresh
        // SimpleUnaryOpTest__ConvertToVector128Int32Int16, converts its _fld field with
        // Sse41.ConvertToVector128Int32, writes the result to this instance's output buffer and
        // passes the new object's field and the output pointer to ValidateResult.
        public void RunLclFldScenario()
        {
            var instance = new SimpleUnaryOpTest__ConvertToVector128Int32Int16();
            var converted = Sse41.ConvertToVector128Int32(instance._fld);
            Unsafe.Write(_dataTable.outArrayPtr, converted);

            ValidateResult(instance._fld, _dataTable.outArrayPtr);
        }
        // Basic scenario using Unsafe.Read: reads a Vector128<Int16> from the input buffer,
        // converts it with Sse41.ConvertToVector128Int32, writes the result to the output buffer
        // and passes the input and output pointers to ValidateResult.
        // _dataTable and ValidateResult are declared elsewhere.
        public void RunBasicScenario_UnsafeRead()
        {
            var operand = Unsafe.Read<Vector128<Int16>>(_dataTable.inArrayPtr);
            var converted = Sse41.ConvertToVector128Int32(operand);
            Unsafe.Write(_dataTable.outArrayPtr, converted);

            ValidateResult(_dataTable.inArrayPtr, _dataTable.outArrayPtr);
        }
        // Basic scenario using the pointer overload: passes the input buffer, viewed as SByte*,
        // straight to Sse41.ConvertToVector128Int32, writes the result to the output buffer and
        // passes the input and output pointers to ValidateResult.
        // _dataTable and ValidateResult are declared elsewhere.
        public void RunBasicScenario_Ptr()
        {
            var converted = Sse41.ConvertToVector128Int32((SByte*)(_dataTable.inArrayPtr));
            Unsafe.Write(_dataTable.outArrayPtr, converted);

            ValidateResult(_dataTable.inArrayPtr, _dataTable.outArrayPtr);
        }
        // Basic scenario using an aligned load: reads a vector of Int16 from the input buffer
        // with Sse2.LoadAlignedVector128, converts it with Sse41.ConvertToVector128Int32, writes
        // the result to the output buffer and passes the input and output pointers to
        // ValidateResult. _dataTable and ValidateResult are declared elsewhere.
        public void RunBasicScenario_LoadAligned()
        {
            var operand = Sse2.LoadAlignedVector128((Int16*)(_dataTable.inArrayPtr));
            var converted = Sse41.ConvertToVector128Int32(operand);
            Unsafe.Write(_dataTable.outArrayPtr, converted);

            ValidateResult(_dataTable.inArrayPtr, _dataTable.outArrayPtr);
        }
Exemplo n.º 13
0
        // Computes a weighted-sum hash of buf[0..len): every byte buf[i] is multiplied by the
        // factor kMultFactors[len - 1 - i], the products are summed and the sum is masked with
        // (kBase - 1). Four bytes per iteration are handled with SSE; the remaining bytes
        // (fewer than four) use the scalar loop at the end.
        // kMultFactors, kMultFactorsPtr, kBase and the shuffle controls SO123, SOO2O, S23O1 and
        // S1O32 are declared elsewhere.
        // NOTE(review): the vector path accumulates in 32-bit lanes while the scalar tail
        // accumulates in 64 bits; presumably kBase is a power of two no larger than 2^32 so both
        // agree after masking - confirm against its definition.
        //   buf     : pointer to the input bytes.
        //   len     : number of bytes to hash.
        //   returns : the masked hash value.
        private unsafe ulong HashSse(byte* buf, int len)
        {
            ulong h = 0;
            Vector128<int> v_ps = Vector128<int>.Zero;
            bool useSse4 = Sse41.IsSupported;

            int i = 0;

            for (int j = len - i - 1; len - i >= 4; i += 4, j = len - i - 1)
            {
                // Load the four factors kMultFactorsPtr[j-3 .. j] and reorder them with SO123
                // (presumably so that lane k holds the factor for buf[i + k] - confirm).
                Vector128<int> c_v = Sse2.LoadVector128(&kMultFactorsPtr[j - 3]);
                c_v = Sse2.Shuffle(c_v, SO123);

                // Only four input bytes are consumed per iteration, so load exactly four.
                // A full 16-byte load here would read up to 12 bytes past buf + len on the
                // last iterations.
                Vector128<byte> q_v = Sse2.LoadScalarVector128((uint*)(buf + i)).AsByte();

                // Zero-extend the four bytes to four 32-bit lanes.
                Vector128<int> s_v;
                if (useSse4)
                {
                    s_v = Sse41.ConvertToVector128Int32(q_v);
                }
                else
                {
                    // SSE2 fallback: duplicate each byte until it fills a 32-bit lane, then
                    // shift right by 24 so only one copy remains in the low byte.
                    q_v = Sse2.UnpackLow(q_v, q_v);
                    s_v = Sse2.ShiftRightLogical(Sse2.UnpackLow(q_v.AsUInt16(), q_v.AsUInt16()).AsInt32(), 24);
                }

                // Accumulate factor * byte in each lane.
                if (useSse4)
                {
                    v_ps = Sse2.Add(v_ps, Sse41.MultiplyLow(c_v, s_v));
                }
                else
                {
                    // SSE2 fallback for the 32-bit multiply: Sse2.Multiply multiplies lanes 0 and 2
                    // into 64-bit results, so lanes 1 and 3 are handled by shifting both operands
                    // down by four bytes first; the products are then recombined with SOO2O.
                    Vector128<ulong> v_tmp1 = Sse2.Multiply(c_v.AsUInt32(), s_v.AsUInt32());
                    Vector128<ulong> v_tmp2 =
                        Sse2.Multiply(Sse2.ShiftRightLogical128BitLane(c_v.AsByte(), 4).AsUInt32(),
                                      Sse2.ShiftRightLogical128BitLane(s_v.AsByte(), 4).AsUInt32());
                    v_ps = Sse2.Add(v_ps, Sse2.UnpackLow(Sse2.Shuffle(v_tmp1.AsInt32(), SOO2O),
                                                         Sse2.Shuffle(v_tmp2.AsInt32(), SOO2O)));
                }
            }

            // Horizontal add of the four lanes; the total ends up in lane 0.
            v_ps = Sse2.Add(v_ps, Sse2.Shuffle(v_ps, S23O1));
            v_ps = Sse2.Add(v_ps, Sse2.Shuffle(v_ps, S1O32));
            h += Sse2.ConvertToUInt32(v_ps.AsUInt32());

            // Scalar tail for the last (len % 4) bytes.
            for (; i < len; i++)
            {
                int index = len - i - 1;
                ulong c = (uint)kMultFactors[index];
                h += c * buf[i];
            }

            return h & (kBase - 1);
        }
Exemplo n.º 14
0
 // Converts this colour to an RgbaColor32.
 // With SSE4.1 the four bytes of this value are reinterpreted as one uint, placed in the low
 // lane of a vector and widened to four 32-bit integers in a single instruction; otherwise
 // the RgbaColor32 is built from the R, G, B and A members.
 public RgbaColor32 GetColor32()
 {
     if (!Sse41.IsSupported)
     {
         return new RgbaColor32(R, G, B, A);
     }

     uint packed = Unsafe.As<RgbaColor8, uint>(ref this);
     Vector128<byte> channels = Vector128.CreateScalarUnsafe(packed).AsByte();
     return new RgbaColor32(Sse41.ConvertToVector128Int32(channels));
 }
Exemplo n.º 15
0
        // Benchmark body: takes the square root of every byte of vs, four at a time, building the
        // vectors with the conversion intrinsics (byte -> int -> double). The results are
        // discarded, and the last (vs.Length % 4) elements are not processed.
        // Original author's timing note: "4x speed" (the baseline is not shown here).
        private unsafe void Test6(byte[] vs)
        {
            int simdLength = Vector256<double>.Count;
            int lastIndex = vs.Length - (vs.Length % simdLength);

            fixed (byte* p = vs)
            {
                for (int i = 0; i < lastIndex; i += simdLength)
                {
                    // Advance through the array with p + i; converting from p alone would process
                    // the same first four bytes on every iteration.
                    _ = Avx.Sqrt(Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(p + i)));
                }
            }
        }
Exemplo n.º 16
0
        // Scratch method that applies several widening and multiply intrinsics to the start of
        // vs. Nothing is returned or stored; every result is only held in a local.
        // The second 32-byte load reads bytes 32..63, so vs must hold at least 64 bytes.
        private unsafe void TestAddSum(byte[] vs)
        {
            fixed (byte* src = vs)
            {
                // Two raw 32-byte loads.
                Vector256<byte> firstBlock = Avx.LoadVector256(src);
                Vector256<byte> secondBlock = Avx.LoadVector256(src + 32);

                // 8 bytes -> 8 ints -> 8 floats, then squared.
                Vector256<int> ints256 = Avx2.ConvertToVector256Int32(src);
                Vector256<float> floats256 = Avx.ConvertToVector256Single(ints256);
                Vector256<float> squared256 = Avx.Multiply(floats256, floats256);

                // 4 bytes -> 4 ints -> 4 doubles, squared via FMA with a zero addend.
                Vector128<int> ints128 = Sse41.ConvertToVector128Int32(src);
                Vector256<double> doubles256 = Avx.ConvertToVector256Double(ints128);
                Vector256<double> zeroDouble = Vector256<double>.Zero;
                Vector256<double> fmaDouble = Fma.MultiplyAdd(doubles256, doubles256, zeroDouble);

                // The same FMA squaring on 8 float lanes.
                Vector256<int> ints256Again = Avx2.ConvertToVector256Int32(src);
                Vector256<float> floats256Again = Avx.ConvertToVector256Single(ints256Again);
                Vector256<float> zeroFloat = Vector256<float>.Zero;
                Vector256<float> fmaFloat = Fma.MultiplyAdd(floats256Again, floats256Again, zeroFloat);

                // Scalar multiply: only the lowest float lane is multiplied.
                Vector128<float> floats128 = Sse2.ConvertToVector128Single(ints128);
                Vector128<float> scalarProduct = Sse.MultiplyScalar(floats128, floats128);

                // x86 / x64 SIMD instruction reference table (SSE - AVX2):
                // https://www.officedaytime.com/tips/simd.html
                // pmaddwd:
                // https://www.officedaytime.com/tips/simdimg/si.php?f=pmaddwd

                // 8 bytes -> 8 shorts, then multiply adjacent pairs and add them (pmaddwd).
                // Avx.MultiplyAddAdjacent resolves to the 128-bit overload inherited from Sse2.
                Vector128<short> shorts128 = Sse41.ConvertToVector128Int16(src);
                Vector128<int> pairSums = Avx.MultiplyAddAdjacent(shorts128, shorts128);

                // Unused local kept from the original (possibly a breakpoint anchor - confirm).
                int neko = 0;

                // Other candidates listed by the original author:
                // Avx.MultiplyAddAdjacent, Avx.MultiplyHigh, Avx.MultiplyHighRoundScale,
                // Avx.MultiplyLow, Avx.MultiplyScalar, Fma.MultiplyAdd, Fma.MultiplyAddNegated,
                // Fma.MultiplyAddNegatedScalar, Fma.MultiplyAddScalar, Fma.MultiplyAddSubtract,
                // Fma.MultiplySubtract, Fma.MultiplySubtractAdd, Fma.MultiplySubtractNegated,
                // Fma.MultiplySubtractNegatedScalar, Fma.MultiplySubtractScalar
            }
        }
Exemplo n.º 17
0
        // Computes one vector-wide section of a diagonal of a cost table from insert, delete and
        // substitution costs (this looks like one step of an edit-distance computation - confirm
        // with the caller). T selects the cell type: int (4 cells per call) or ushort (8 cells
        // per call). For any other T the method does nothing.
        //   refDiag1Ptr : diagonal buffer that supplies the substitution costs and receives the result.
        //   refDiag2Ptr : diagonal buffer that supplies the delete and insert costs.
        //   sourcePtr   : source characters, read at rowIndex - Vector128<T>.Count.
        //   targetPtr   : target characters, read at columnIndex - 1 and then reversed.
        //   rowIndex    : passed by reference but only read here.
        //   columnIndex : column position used to address targetPtr.
        public static unsafe void CalculateDiagonalSection_Sse41 <T>(void *refDiag1Ptr, void *refDiag2Ptr, char *sourcePtr, char *targetPtr, ref int rowIndex, int columnIndex) where T : struct
        {
            if (typeof(T) == typeof(int))
            {
                var diag1Ptr = (int *)refDiag1Ptr;
                var diag2Ptr = (int *)refDiag2Ptr;

                // Load four 16-bit characters from each string and zero-extend them to 32-bit lanes.
                var sourceVector = Sse41.ConvertToVector128Int32((ushort *)sourcePtr + rowIndex - Vector128 <T> .Count);
                var targetVector = Sse41.ConvertToVector128Int32((ushort *)targetPtr + columnIndex - 1);
                // 0x1b = 0b00_01_10_11 reverses the order of the four lanes.
                targetVector = Sse2.Shuffle(targetVector, 0x1b);
                // CompareEqual yields all ones (-1) in lanes where the characters match, 0 elsewhere.
                var substitutionCostAdjustment = Sse2.CompareEqual(sourceVector, targetVector);

                // Adding the -1 lanes cancels the +1 applied below, so a match costs nothing extra.
                var substitutionCost = Sse2.Add(
                    Sse3.LoadDquVector128(diag1Ptr + rowIndex - Vector128 <T> .Count),
                    substitutionCostAdjustment
                    );

                // The delete and insert costs are the same diagonal read one element apart.
                var deleteCost = Sse3.LoadDquVector128(diag2Ptr + rowIndex - (Vector128 <T> .Count - 1));
                var insertCost = Sse3.LoadDquVector128(diag2Ptr + rowIndex - Vector128 <T> .Count);

                // Cheapest of the three operations, plus one.
                var localCost = Sse41.Min(Sse41.Min(insertCost, deleteCost), substitutionCost);
                localCost = Sse2.Add(localCost, Vector128.Create(1));

                // Write the new costs back into the first diagonal buffer.
                Sse2.Store(diag1Ptr + rowIndex - (Vector128 <T> .Count - 1), localCost);
            }
            else if (typeof(T) == typeof(ushort))
            {
                var diag1Ptr = (ushort *)refDiag1Ptr;
                var diag2Ptr = (ushort *)refDiag2Ptr;

                // Same steps as the int branch, on eight 16-bit lanes; the characters are loaded
                // without widening.
                var sourceVector = Sse3.LoadDquVector128((ushort *)sourcePtr + rowIndex - Vector128 <T> .Count);
                var targetVector = Sse3.LoadDquVector128((ushort *)targetPtr + columnIndex - 1);
                // REVERSE_USHORT_AS_BYTE_128 is defined elsewhere; presumably a byte-shuffle mask
                // that reverses the eight 16-bit lanes - confirm against its definition.
                targetVector = Ssse3.Shuffle(targetVector.AsByte(), REVERSE_USHORT_AS_BYTE_128).AsUInt16();
                // All ones (0xFFFF) in lanes where the characters match, 0 elsewhere.
                var substitutionCostAdjustment = Sse2.CompareEqual(sourceVector, targetVector);

                // Adding 0xFFFF wraps around, i.e. subtracts one, which cancels the +1 applied below.
                var substitutionCost = Sse2.Add(
                    Sse3.LoadDquVector128(diag1Ptr + rowIndex - Vector128 <T> .Count),
                    substitutionCostAdjustment
                    );

                var deleteCost = Sse3.LoadDquVector128(diag2Ptr + rowIndex - (Vector128 <T> .Count - 1));
                var insertCost = Sse3.LoadDquVector128(diag2Ptr + rowIndex - Vector128 <T> .Count);

                // Cheapest of the three operations, plus one.
                var localCost = Sse41.Min(Sse41.Min(insertCost, deleteCost), substitutionCost);
                localCost = Sse2.Add(localCost, Vector128.Create((ushort)1));

                // Write the new costs back into the first diagonal buffer.
                Sse2.Store(diag1Ptr + rowIndex - (Vector128 <T> .Count - 1), localCost);
            }
        }
Exemplo n.º 18
0
 // Multithreaded benchmark body: takes the square root of the bytes of vs, four at a time
 // (byte -> int -> double -> Avx.Sqrt), over the index range [0, ELEMENT_COUNT) split across
 // threads. The results are discarded, and elements that do not fill a whole group of four
 // inside a range are not processed. ELEMENT_COUNT is declared elsewhere.
 // Original author's timing note: "12x speed; vector Sqrt really is fast".
 private unsafe void Test6_MT(byte[] vs)
 {
     Parallel.ForEach(Partitioner.Create(0, ELEMENT_COUNT), range =>
     {
         int simdLength = Vector256<double>.Count;
         // Never read past the array, even if ELEMENT_COUNT is larger than vs.Length.
         int end = System.Math.Min(range.Item2, vs.Length);
         int count = System.Math.Max(0, end - range.Item1);
         // End of the part of this range that consists of whole four-element groups.
         int lastIndex = range.Item1 + count - count % simdLength;
         fixed (byte* p = vs)
         {
             // Stop at lastIndex and advance with p + i: converting from p alone would process
             // the same first four bytes on every iteration, and running to range.Item2 would
             // read past the end of the range.
             for (int i = range.Item1; i < lastIndex; i += simdLength)
             {
                 _ = Avx.Sqrt(Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(p + i)));
             }
         }
     });
 }
Exemplo n.º 19
0
        // SSE4.1 DotProduct sum of squares that performs four dot products (16 bytes) per loop
        // iteration, each one writing its result to a different lane of a float accumulator.
        // A float only represents integers exactly up to 2^24 - 1, so the accumulator is flushed
        // into the 64-bit total before any lane can exceed that; otherwise large inputs would
        // return a rounded total.
        //   vs      : input bytes (may be empty).
        //   returns : sum of vs[i] * vs[i] over the whole array.
        private unsafe long Test8_Intrinsics_SSE41_DotProduct_float(byte[] vs)
        {
            long total = 0;
            int laneCount = Vector128<float>.Count;
            int simdLength = Vector128<int>.Count * 4;
            int lastIndex = vs.Length - (vs.Length % simdLength);

            // Each iteration adds at most 4 * 255 * 255 = 260100 to every lane, so
            // 16777215 / 260100 = 64 iterations are safe between flushes.
            int flushIterations = ((1 << 24) - 1) / (laneCount * byte.MaxValue * byte.MaxValue);
            int chunkLength = flushIterations * simdLength;

            float* f = stackalloc float[laneCount];

            fixed (byte* p = vs)
            {
                int i = 0;
                while (i < lastIndex)
                {
                    // Written as a comparison of differences so that i + chunkLength cannot overflow.
                    int chunkEnd = lastIndex - i > chunkLength ? i + chunkLength : lastIndex;
                    var vTotal = Vector128<float>.Zero;

                    for (; i < chunkEnd; i += simdLength)
                    {
                        Vector128<int> v = Sse41.ConvertToVector128Int32(p + i);
                        var vv = Sse2.ConvertToVector128Single(v);
                        // Upper four mask bits: multiply all four lanes.
                        // Lower four mask bits: which element receives the sum (here element 0).
                        Vector128<float> dp = Sse41.DotProduct(vv, vv, 0b11110001);
                        vTotal = Sse.Add(vTotal, dp);

                        v = Sse41.ConvertToVector128Int32(p + i + 4);
                        vv = Sse2.ConvertToVector128Single(v);
                        dp = Sse41.DotProduct(vv, vv, 0b11110010); // result goes to element 1
                        vTotal = Sse.Add(vTotal, dp);

                        v = Sse41.ConvertToVector128Int32(p + i + 8);
                        vv = Sse2.ConvertToVector128Single(v);
                        dp = Sse41.DotProduct(vv, vv, 0b11110100); // result goes to element 2
                        vTotal = Sse.Add(vTotal, dp);

                        v = Sse41.ConvertToVector128Int32(p + i + 12);
                        vv = Sse2.ConvertToVector128Single(v);
                        dp = Sse41.DotProduct(vv, vv, 0b11111000); // result goes to element 3
                        vTotal = Sse.Add(vTotal, dp);
                    }

                    // Flush the still-exact lane sums into the integer total.
                    Sse.Store(f, vTotal);
                    for (int lane = 0; lane < laneCount; lane++)
                    {
                        total += (long)f[lane];
                    }
                }
            }

            // Scalar tail for the elements that do not fill a whole 16-byte step.
            for (int i = lastIndex; i < vs.Length; i++)
            {
                total += vs[i] * vs[i];
            }
            return total;
        }
Exemplo n.º 20
0
        // x86/x64 SIMD instruction reference table (SSE - AVX2):
        // https://www.officedaytime.com/tips/simd.html
        // Arithmetic operations, dot product: DPPS
        //
        // Sum of squares using the SSE4.1 DotProduct intrinsic: four bytes at a time are widened
        // to float, their dot product with themselves is taken, and the scalar result is added
        // to an integer total. Leftover elements are handled by a scalar loop.
        private unsafe long Test7_Intrinsics_SSE41_DotProduct_float(byte[] vs)
        {
            int step = Vector128<int>.Count;
            int vectorEnd = vs.Length - (vs.Length % step);
            long sum = 0;

            fixed (byte* source = vs)
            {
                int offset = 0;
                while (offset < vectorEnd)
                {
                    Vector128<float> values = Sse2.ConvertToVector128Single(
                        Sse41.ConvertToVector128Int32(source + offset));
                    // Mask 0b1111_0001: multiply all four lanes (upper four bits) and put the
                    // sum of the products into element 0 (lowest bit).
                    Vector128<float> product = Sse41.DotProduct(values, values, 0b11110001);
                    sum += (long)product.ToScalar();
                    offset += step;
                }
            }

            // Scalar tail.
            for (int index = vectorEnd; index < vs.Length; index++)
            {
                sum += vs[index] * vs[index];
            }

            return sum;
        }
Exemplo n.º 21
0
 // Reads four signed bytes starting at address and sign-extends them to four 32-bit integers.
 public static unsafe Vector128<int> xmm__1(sbyte* address)
     => Sse41.ConvertToVector128Int32(address);
Exemplo n.º 22
0
            // Converts one line of pixels from 3 bytes per pixel to 4 floats per pixel.
            //   ipstart : first input byte.
            //   opstart : output buffer, written as float.
            //   cb      : number of input bytes to convert.
            // With HWINTRINSICS defined, the bulk of the line is converted by an AVX2 or SSE4.1
            // loop that scales each byte by 1/255; the remaining pixels (or the whole line when no
            // vector path is available) are converted through the LookupTables.Alpha table.
            // NOTE(review): the table path assumes LookupTables.Alpha maps a byte to the same
            // value as byte / 255f - confirm against its definition.
            unsafe void IConversionProcessor.ConvertLine(byte *ipstart, byte *opstart, int cb)
            {
                fixed(float *atstart = &LookupTables.Alpha[0])
                {
                    // ip/ipe delimit the input bytes; op is the float output cursor; at is the lookup table.
                    byte * ip = ipstart, ipe = ipstart + cb;
                    float *op = (float *)opstart, at = atstart;

#if HWINTRINSICS
                    if (Avx2.IsSupported)
                    {
                        var vscale = Vector256.Create(1f / byte.MaxValue);
                        // Permutation that spreads 3-byte pixels over 4-lane slots; the mask is
                        // defined elsewhere (HWIntrinsics.PermuteMask3To3xChan).
                        var vmaskp = Avx.LoadVector256((int *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.PermuteMask3To3xChan)));

                        // Each iteration consumes 24 input bytes (8 pixels), but the last 8-byte
                        // load starts at offset 18 and therefore touches 26 bytes; pull the end
                        // marker back so that load stays inside the line.
                        ipe -= Vector256 <byte> .Count * 3 / 4 + 2;                       // +2 accounts for the overrun on the last read
                        while (ip <= ipe)
                        {
                            // Four overlapping 8-byte loads at offsets 0, 6, 12 and 18 (two pixels
                            // each), widened to 32-bit integers.
                            var vi0 = Avx2.ConvertToVector256Int32(ip);
                            var vi1 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count * 3 / 4);
                            var vi2 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count * 6 / 4);
                            var vi3 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count * 9 / 4);
                            ip += Vector256 <byte> .Count * 3 / 4;

                            // Rearrange the lanes into the 4-slot-per-pixel layout.
                            vi0 = Avx2.PermuteVar8x32(vi0, vmaskp);
                            vi1 = Avx2.PermuteVar8x32(vi1, vmaskp);
                            vi2 = Avx2.PermuteVar8x32(vi2, vmaskp);
                            vi3 = Avx2.PermuteVar8x32(vi3, vmaskp);

                            var vf0 = Avx.ConvertToVector256Single(vi0);
                            var vf1 = Avx.ConvertToVector256Single(vi1);
                            var vf2 = Avx.ConvertToVector256Single(vi2);
                            var vf3 = Avx.ConvertToVector256Single(vi3);

                            // Scale from [0, 255] to [0, 1].
                            vf0 = Avx.Multiply(vf0, vscale);
                            vf1 = Avx.Multiply(vf1, vscale);
                            vf2 = Avx.Multiply(vf2, vscale);
                            vf3 = Avx.Multiply(vf3, vscale);

                            // 32 floats written = 8 output pixels of 4 floats each.
                            Avx.Store(op, vf0);
                            Avx.Store(op + Vector256 <float> .Count, vf1);
                            Avx.Store(op + Vector256 <float> .Count * 2, vf2);
                            Avx.Store(op + Vector256 <float> .Count * 3, vf3);
                            op += Vector256 <byte> .Count;
                        }
                        // Restore the real end marker for the scalar loop below.
                        ipe += Vector256 <byte> .Count * 3 / 4 + 2;
                    }
                    else if (Sse41.IsSupported)
                    {
                        var vscale = Vector128.Create(1f / byte.MaxValue);

                        // Each iteration consumes 12 input bytes (4 pixels), but the last 4-byte
                        // load starts at offset 9 and therefore touches 13 bytes.
                        ipe -= Vector128 <byte> .Count * 3 / 4 + 1;                       // +1 accounts for the overrun on the last read
                        while (ip <= ipe)
                        {
                            // Four overlapping 4-byte loads at offsets 0, 3, 6 and 9: one pixel
                            // plus the first byte of the next pixel in the fourth lane.
                            var vi0 = Sse41.ConvertToVector128Int32(ip);
                            var vi1 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count * 3 / 4);
                            var vi2 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count * 6 / 4);
                            var vi3 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count * 9 / 4);
                            ip += Vector128 <byte> .Count * 3 / 4;

                            var vf0 = Sse2.ConvertToVector128Single(vi0);
                            var vf1 = Sse2.ConvertToVector128Single(vi1);
                            var vf2 = Sse2.ConvertToVector128Single(vi2);
                            var vf3 = Sse2.ConvertToVector128Single(vi3);

                            // Scale from [0, 255] to [0, 1].
                            vf0 = Sse.Multiply(vf0, vscale);
                            vf1 = Sse.Multiply(vf1, vscale);
                            vf2 = Sse.Multiply(vf2, vscale);
                            vf3 = Sse.Multiply(vf3, vscale);

                            // 16 floats written = 4 output pixels of 4 floats each.
                            Sse.Store(op, vf0);
                            Sse.Store(op + Vector128 <float> .Count, vf1);
                            Sse.Store(op + Vector128 <float> .Count * 2, vf2);
                            Sse.Store(op + Vector128 <float> .Count * 3, vf3);
                            op += Vector128 <byte> .Count;
                        }
                        // Restore the real end marker for the scalar loop below.
                        ipe += Vector128 <byte> .Count * 3 / 4 + 1;
                    }
#endif

                    // Scalar path: one pixel per iteration through the lookup table. Only the
                    // first three floats of each output pixel are written; op[3] is left untouched.
                    while (ip < ipe)
                    {
                        float o0 = at[(uint)ip[0]];
                        float o1 = at[(uint)ip[1]];
                        float o2 = at[(uint)ip[2]];
                        ip += 3;

                        op[0] = o0;
                        op[1] = o1;
                        op[2] = o2;
                        op   += 4;
                    }
                }
            }
Exemplo n.º 23
0
            unsafe void IConversionProcessor.ConvertLine(byte *ipstart, byte *opstart, int cb)
            {
                fixed(float *atstart = &valueTable[0])
                {
                    byte * ip = ipstart, ipe = ipstart + cb;
                    float *op = (float *)opstart, at = atstart;

#if HWINTRINSICS
                    if (Avx2.IsSupported)
                    {
                        var vscal = Vector256.Create(scale);
                        var voffs = Fma.IsSupported ? Vector256.Create(offset * scale) : Vector256.Create(offset);

                        ipe -= Vector256 <byte> .Count;
                        while (ip <= ipe)
                        {
                            var vi0 = Avx2.ConvertToVector256Int32(ip);
                            var vi1 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count);
                            var vi2 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count * 2);
                            var vi3 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count * 3);
                            ip += Vector256 <byte> .Count;

                            var vf0 = Avx.ConvertToVector256Single(vi0);
                            var vf1 = Avx.ConvertToVector256Single(vi1);
                            var vf2 = Avx.ConvertToVector256Single(vi2);
                            var vf3 = Avx.ConvertToVector256Single(vi3);

                            if (Fma.IsSupported)
                            {
                                vf0 = Fma.MultiplySubtract(vf0, vscal, voffs);
                                vf1 = Fma.MultiplySubtract(vf1, vscal, voffs);
                                vf2 = Fma.MultiplySubtract(vf2, vscal, voffs);
                                vf3 = Fma.MultiplySubtract(vf3, vscal, voffs);
                            }
                            else
                            {
                                vf0 = Avx.Multiply(Avx.Subtract(vf0, voffs), vscal);
                                vf1 = Avx.Multiply(Avx.Subtract(vf1, voffs), vscal);
                                vf2 = Avx.Multiply(Avx.Subtract(vf2, voffs), vscal);
                                vf3 = Avx.Multiply(Avx.Subtract(vf3, voffs), vscal);
                            }

                            Avx.Store(op, vf0);
                            Avx.Store(op + Vector256 <int> .Count, vf1);
                            Avx.Store(op + Vector256 <int> .Count * 2, vf2);
                            Avx.Store(op + Vector256 <int> .Count * 3, vf3);
                            op += Vector256 <byte> .Count;
                        }
                        ipe += Vector256 <byte> .Count;
                    }
                    else if (Sse41.IsSupported)
                    {
                        var vscal = Vector128.Create(scale);
                        var voffs = Vector128.Create(offset);

                        ipe -= Vector128 <byte> .Count;
                        while (ip <= ipe)
                        {
                            var vi0 = Sse41.ConvertToVector128Int32(ip);
                            var vi1 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count);
                            var vi2 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count * 2);
                            var vi3 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count * 3);
                            ip += Vector128 <byte> .Count;

                            var vf0 = Sse2.ConvertToVector128Single(vi0);
                            var vf1 = Sse2.ConvertToVector128Single(vi1);
                            var vf2 = Sse2.ConvertToVector128Single(vi2);
                            var vf3 = Sse2.ConvertToVector128Single(vi3);

                            vf0 = Sse.Multiply(Sse.Subtract(vf0, voffs), vscal);
                            vf1 = Sse.Multiply(Sse.Subtract(vf1, voffs), vscal);
                            vf2 = Sse.Multiply(Sse.Subtract(vf2, voffs), vscal);
                            vf3 = Sse.Multiply(Sse.Subtract(vf3, voffs), vscal);

                            Sse.Store(op, vf0);
                            Sse.Store(op + Vector128 <int> .Count, vf1);
                            Sse.Store(op + Vector128 <int> .Count * 2, vf2);
                            Sse.Store(op + Vector128 <int> .Count * 3, vf3);
                            op += Vector128 <byte> .Count;
                        }
                        ipe += Vector128 <byte> .Count;
                    }
#elif VECTOR_CONVERT
                    var vscal = new VectorF(scale);
                    var voffs = new VectorF(offset);

                    ipe -= Vector <byte> .Count;
                    while (ip <= ipe)
                    {
                        var vb = Unsafe.ReadUnaligned <Vector <byte> >(ip);
                        Vector.Widen(vb, out var vs0, out var vs1);
                        Vector.Widen(vs0, out var vi0, out var vi1);
                        Vector.Widen(vs1, out var vi2, out var vi3);
                        ip += Vector <byte> .Count;

                        var vf0 = Vector.ConvertToSingle(Vector.AsVectorInt32(vi0));
                        var vf1 = Vector.ConvertToSingle(Vector.AsVectorInt32(vi1));
                        var vf2 = Vector.ConvertToSingle(Vector.AsVectorInt32(vi2));
                        var vf3 = Vector.ConvertToSingle(Vector.AsVectorInt32(vi3));

                        vf0 = (vf0 - voffs) * vscal;
                        vf1 = (vf1 - voffs) * vscal;
                        vf2 = (vf2 - voffs) * vscal;
                        vf3 = (vf3 - voffs) * vscal;

                        Unsafe.WriteUnaligned(op, vf0);
                        Unsafe.WriteUnaligned(op + VectorF.Count, vf1);
                        Unsafe.WriteUnaligned(op + VectorF.Count * 2, vf2);
                        Unsafe.WriteUnaligned(op + VectorF.Count * 3, vf3);
                        op += Vector <byte> .Count;
                    }
                    ipe += Vector <byte> .Count;
#endif

                    ipe -= 8;
                    while (ip <= ipe)
                    {
                        float o0 = at[(uint)ip[0]];
                        float o1 = at[(uint)ip[1]];
                        float o2 = at[(uint)ip[2]];
                        float o3 = at[(uint)ip[3]];
                        float o4 = at[(uint)ip[4]];
                        float o5 = at[(uint)ip[5]];
                        float o6 = at[(uint)ip[6]];
                        float o7 = at[(uint)ip[7]];
                        ip += 8;

                        op[0] = o0;
                        op[1] = o1;
                        op[2] = o2;
                        op[3] = o3;
                        op[4] = o4;
                        op[5] = o5;
                        op[6] = o6;
                        op[7] = o7;
                        op   += 8;
                    }
                    ipe += 8;

                    while (ip < ipe)
                    {
                        op[0] = at[(uint)ip[0]];
                        ip++;
                        op++;
                    }
                }
            }
Exemplo n.º 24
0
        /// <summary>
        /// Resamples 16-bit input samples into <paramref name="outputBuffer"/> by taking, for every
        /// output sample, the dot product of four consecutive input samples with four coefficients
        /// from the table returned by <c>GetDefaultParameter(ratio)</c>, rounded to the nearest integer.
        /// </summary>
        /// <param name="outputBuffer">Receives <paramref name="sampleCount"/> resampled values.</param>
        /// <param name="inputBuffer">Source samples; every output reads a window of four of them.</param>
        /// <param name="ratio">Amount added to <paramref name="fraction"/> per output sample.</param>
        /// <param name="fraction">Fractional read position, updated in place (its integer part is folded into the input index).</param>
        /// <param name="sampleCount">Number of output samples to produce.</param>
        /// <param name="needPitch">Not used by the current implementation (see the TODO below).</param>
        private unsafe static void ResampleDefaultQuality(Span<float> outputBuffer, ReadOnlySpan<short> inputBuffer, float ratio, ref float fraction, int sampleCount, bool needPitch)
        {
            ReadOnlySpan<float> parameters = GetDefaultParameter(ratio);

            int inputPosition = 0;
            int outIndex      = 0;

            // TODO: REV8 fast path (when needPitch == false the input index progression is constant + we need SIMD)

            if (Sse41.IsSupported)
            {
                // The vector loops emit four output samples per iteration.
                int vectorizedCount = sampleCount & ~3;

                fixed (short* pInput = inputBuffer)
                fixed (float* pOutput = outputBuffer, pParameters = parameters)
                {
                    if (ratio == 1f)
                    {
                        // The same four coefficients (table offset 0) are applied to every window.
                        Vector128<float> coefficients = Sse.LoadVector128(pParameters);

                        for (; outIndex < vectorizedCount; outIndex += 4)
                        {
                            short* window = pInput + (uint)outIndex;

                            // Four overlapping windows, each starting one sample later than the previous.
                            Vector128<float> weighted0 = Sse.Multiply(Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(window)), coefficients);
                            Vector128<float> weighted1 = Sse.Multiply(Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(window + 1)), coefficients);
                            Vector128<float> weighted2 = Sse.Multiply(Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(window + 2)), coefficients);
                            Vector128<float> weighted3 = Sse.Multiply(Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(window + 3)), coefficients);

                            // Two rounds of horizontal adds leave one dot product per lane.
                            Vector128<float> sums = Sse3.HorizontalAdd(
                                Sse3.HorizontalAdd(weighted0, weighted1),
                                Sse3.HorizontalAdd(weighted2, weighted3));

                            Sse.Store(pOutput + (uint)outIndex, Sse41.RoundToNearestInteger(sums));
                        }

                        inputPosition = outIndex;
                    }
                    else
                    {
                        for (; outIndex < vectorizedCount; outIndex += 4)
                        {
                            // Work out the coefficient offset and input offset of each of the four
                            // output samples while stepping the fractional position by ratio.
                            uint coefficientOffset0 = (uint)(fraction * 128) * 4;
                            uint inputOffset0       = (uint)inputPosition;

                            fraction += ratio;

                            uint coefficientOffset1 = ((uint)(fraction * 128) & 127) * 4;
                            uint inputOffset1       = (uint)inputPosition + (uint)fraction;

                            fraction += ratio;

                            uint coefficientOffset2 = ((uint)(fraction * 128) & 127) * 4;
                            uint inputOffset2       = (uint)inputPosition + (uint)fraction;

                            fraction += ratio;

                            uint coefficientOffset3 = ((uint)(fraction * 128) & 127) * 4;
                            uint inputOffset3       = (uint)inputPosition + (uint)fraction;

                            fraction      += ratio;
                            inputPosition += (int)fraction;

                            // Only keep lower part (safe as fraction isn't supposed to be negative)
                            fraction -= (int)fraction;

                            Vector128<float> weighted0 = Sse.Multiply(
                                Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(pInput + inputOffset0)),
                                Sse.LoadVector128(pParameters + coefficientOffset0));
                            Vector128<float> weighted1 = Sse.Multiply(
                                Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(pInput + inputOffset1)),
                                Sse.LoadVector128(pParameters + coefficientOffset1));
                            Vector128<float> weighted2 = Sse.Multiply(
                                Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(pInput + inputOffset2)),
                                Sse.LoadVector128(pParameters + coefficientOffset2));
                            Vector128<float> weighted3 = Sse.Multiply(
                                Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(pInput + inputOffset3)),
                                Sse.LoadVector128(pParameters + coefficientOffset3));

                            Vector128<float> sums = Sse3.HorizontalAdd(
                                Sse3.HorizontalAdd(weighted0, weighted1),
                                Sse3.HorizontalAdd(weighted2, weighted3));

                            Sse.Store(pOutput + (uint)outIndex, Sse41.RoundToNearestInteger(sums));
                        }
                    }
                }
            }

            // Scalar path: handles everything when SSE4.1 is unavailable, otherwise the last
            // (sampleCount % 4) samples.
            while (outIndex < sampleCount)
            {
                int coefficientOffset = (int)(fraction * 128) * 4;

                ReadOnlySpan<float> coefficients = parameters.Slice(coefficientOffset, 4);
                ReadOnlySpan<short> window       = inputBuffer.Slice(inputPosition, 4);

                outputBuffer[outIndex] = (float)Math.Round(window[0] * coefficients[0] +
                                                           window[1] * coefficients[1] +
                                                           window[2] * coefficients[2] +
                                                           window[3] * coefficients[3]);

                fraction      += ratio;
                inputPosition += (int)fraction;

                // Only keep lower part (safe as fraction isn't supposed to be negative)
                fraction -= (int)fraction;

                outIndex++;
            }
        }
Exemplo n.º 25
0
        // Maps one row of 4-byte pixels to single-byte output values by walking an octree,
        // carrying a per-channel quantization error into the next pixel and into the perr buffer.
        //
        //   pimage   - input row, cp pixels of 4 bytes each; byte 3 is compared against alphaThreshold.
        //   perr     - int error buffer, one 4-int vector per pixel. The vector at ep is read for the
        //              current pixel and the vector just before it (ep - 4) is written, so the buffer
        //              must have one writable vector before perr and be 16-byte aligned
        //              (LoadVector128/Store fault on unaligned addresses in non-VEX encoding).
        //   pout     - output row, one byte per pixel (taken from element 3 of the chosen node's sums).
        //   pilut    - lookup table with three 256-entry sections (offsets 0, 256, 512), one per channel,
        //              whose OR-ed values form the octree path, 3 bits per level.
        //   ptree    - base of the octree node array; a node is addressed as 8 ushort child indices
        //              followed by 4 32-bit sums.
        //   ppal     - not referenced in this method.
        //   nextFree - index of the next unused node in ptree; incremented when a node is allocated here.
        //   cp       - number of pixels in the row.
        //
        // NOTE(review): prnod starts out null and vppix starts out zero. If the very first pixel passes
        // the alpha test and its adjusted value equals the zero vector, the "same as previous pixel"
        // shortcut jumps to FoundExact and dereferences prnod while it is still null — confirm this
        // cannot happen (e.g. alphaThreshold is never 0).
        unsafe private void remapDitherSse2(byte *pimage, int *perr, byte *pout, uint *pilut, OctreeNode *ptree, uint *ppal, ref nuint nextFree, nint cp)
        {
            // Stand-in node used for pixels below the alpha threshold; only Sums[3] is set.
            // presumably Sums sits right after the 8 ushort children, matching the psums access below — verify against OctreeNode.
            var transnode = new OctreeNode();

            transnode.Sums[3] = byte.MaxValue;

            var vpmax = Vector128.Create((int)byte.MaxValue);
            var vprnd = Vector128.Create(7);
            var vzero = Vector128 <int> .Zero;

            nuint level = leafLevel;
            var   prnod = default(OctreeNode *);

            byte *ip = pimage, ipe = ip + cp * sizeof(uint);
            byte *op = pout;
            int * ep = perr;

            // vppix: previous adjusted pixel (used to skip the tree walk for runs of equal pixels).
            // vperr: error accumulated for the next-row buffer slot that is written one pixel late.
            // vnerr: quantization error of the previous pixel.
            var vppix = vzero;
            var vperr = vzero;
            var vnerr = vzero;

            do
            {
                Vector128 <int> vpix, vdiff;
                // Pixels below the alpha threshold bypass the tree: zero error, output comes from transnode.
                if ((byte)ip[3] < alphaThreshold)
                {
                    vppix = vzero;
                    vdiff = vzero;
                    prnod = &transnode;
                    goto FoundExact;
                }

                // Widen the 4 pixel bytes to 4 int lanes (zero-extended); the else branch does the same with SSE2 unpacks.
                if (Sse41.IsSupported)
                {
                    vpix = Sse41.ConvertToVector128Int32(ip);
                }
                else
                {
                    vpix = Sse2.UnpackLow(Sse2.UnpackLow(Sse2.LoadScalarVector128((int *)ip).AsByte(), vzero.AsByte()).AsInt16(), vzero.AsInt16()).AsInt32();
                }

                // verr = 7 (rounding) + stored error for this position + 7 * previous pixel's error
                // ((vnerr << 3) - vnerr); the pixel is adjusted by verr / 16 (arithmetic shift).
                var verr = Sse2.Add(Sse2.Add(vprnd, Sse2.LoadVector128(ep)), Sse2.Subtract(Sse2.ShiftLeftLogical(vnerr, 3), vnerr));
                vpix = Sse2.Add(vpix, Sse2.ShiftRightArithmetic(verr, 4));
                // Clamp to [0, 255] using 16-bit min/max on the int lanes.
                vpix = Sse2.Min(vpix.AsInt16(), vpmax.AsInt16()).AsInt32();
                vpix = Sse2.Max(vpix.AsInt16(), vzero.AsInt16()).AsInt32();

                // Same adjusted value as the previous pixel: reuse the previous node (prnod) with zero error.
                if (Sse2.MoveMask(Sse2.CompareEqual(vppix, vpix).AsByte()) == ushort.MaxValue)
                {
                    vdiff = vzero;
                    goto FoundExact;
                }

                vppix = vpix;
                // Build the tree path from the three LUT sections. ushort elements 2 and 4 are the low
                // halves of int lanes 1 and 2; lane 0 is read whole via ConvertToUInt32.
                nuint idx =
                    pilut[(nuint)Sse2.ConvertToUInt32(vppix.AsUInt32())] |
                    pilut[(nuint)Sse2.Extract(vppix.AsUInt16(), 2) + 256] |
                    pilut[(nuint)Sse2.Extract(vppix.AsUInt16(), 4) + 512];
                nuint next = idx & 7;

                // Descend from the top-level node selected by the low 3 bits, consuming 3 more bits per level.
                var pnode = ptree + next;
                for (nuint i = 0; i <= level; i++)
                {
                    idx >>= 3;
                    nuint child = idx & 7;

                    ushort *children = (ushort *)pnode;
                    next = children[child];
                    if (next == 0)
                    {
                        // No child on this path; sums are the 4 uints following the 8 child slots.
                        uint *sums = (uint *)(children + 8);

                        if (i < minLeafLevel)
                        {
                            // Above the minimum leaf level: allocate the missing node and link it in.
                            next            = nextFree++;
                            children[child] = (ushort)next;
                            pnode           = ptree + next;

                            if (i == minLeafLevel - 1)
                            {
                                // The new node sits at the minimum leaf level: initialise it from this pixel and stop.
                                initNode(pnode, vppix);
                                break;
                            }
                            else
                            {
                                // New interior node: tag sums[3] with 255. The branch below tests for the same
                                // tag, so it appears to mean "no output value of its own" — TODO confirm.
                                uint *csums = (uint *)((ushort *)pnode + 8);
                                csums[3] = byte.MaxValue;
                            }
                        }
                        else if ((byte)sums[3] == byte.MaxValue)
                        {
                            // Current node carries the 255 tag: look at the other child slots
                            // (child ^ j for j = 1..7) for a populated one.
                            for (nuint j = 1; j < 8; j++)
                            {
                                nuint sibling = children[child ^ j];
                                if (sibling != 0)
                                {
                                    var   snode = ptree + sibling;
                                    uint *ssums = (uint *)((ushort *)snode + 8);
                                    if ((byte)ssums[3] == byte.MaxValue)
                                    {
                                        // Sibling is tagged too: continue the descent through it.
                                        // NOTE(review): mask is built from the child slot and the sibling's
                                        // node index (not its slot number) — confirm this is intended.
                                        next = sibling;
                                        nuint mask = child ^ sibling;
                                        idx = (child & mask) | (idx & ~mask);
                                        break;
                                    }
                                    else
                                    {
                                        // Untagged sibling: use it as the result node.
                                        prnod = snode;
                                        goto Found;
                                    }
                                }
                            }
                        }
                        else
                        {
                            // Current node is untagged: it is the result.
                            break;
                        }
                    }

                    pnode = ptree + next;
                }

                prnod = pnode;

Found:
                // Error = adjusted pixel minus the chosen node's 4 sums.
                vdiff = Sse2.Subtract(vppix, Sse2.LoadVector128((int *)((ushort *)prnod + 8)));

FoundExact:
                int *psums = (int *)((ushort *)prnod + 8);

                ip += sizeof(uint);
                // The output byte is the low byte of the node's fourth sum.
                *op++ = (byte)psums[3];

                // Finalise the error slot one position behind ep: accumulated vperr + 2 * current error.
                Sse2.Store(ep - Vector128 <int> .Count, Sse2.Add(vperr, Sse2.Add(vdiff, vdiff)));
                ep += Vector128 <int> .Count;

                // Start the next slot's accumulator with 4 * current error + previous error.
                vperr = Sse2.Add(Sse2.ShiftLeftLogical(vdiff, 2), vnerr);
                vnerr = vdiff;
            } while (ip < ipe);

            // Flush the accumulator for the last pixel's slot.
            Sse2.Store(ep - Vector128 <int> .Count, vperr);
        }
Exemplo n.º 26
0
            /// <summary>
            /// Converts a line of 4-byte pixels to 4-float pixels in which the first three channels
            /// are multiplied by the fourth (alpha) channel and the alpha channel is stored as-is.
            /// </summary>
            /// <param name="ipstart">Start of the input bytes.</param>
            /// <param name="opstart">Start of the output buffer, written as floats (one float per input byte).</param>
            /// <param name="cb">Number of input bytes; consumed four at a time by the scalar tail.</param>
            unsafe void IConversionProcessor.ConvertLine(byte *ipstart, byte *opstart, int cb)
            {
                // Scalar path converts bytes through this table.
                // presumably it maps 0..255 onto 0..1 like the 1/255 scale used by the SIMD paths — confirm against LookupTables.
                fixed (float* alphaTable = &LookupTables.Alpha[0])
                {
                    byte*  src    = ipstart;
                    byte*  srcEnd = ipstart + cb;
                    float* dst    = (float*)opstart;

#if HWINTRINSICS
                    if (Avx2.IsSupported)
                    {
                        var unitScale = Vector256.Create(1f / byte.MaxValue);

                        // Last position from which a full 32-byte block can still be read.
                        byte* lastBlock = srcEnd - Vector256<byte>.Count;
                        while (src <= lastBlock)
                        {
                            // Load 4 x 8 bytes, widening each byte to an int lane.
                            var ints0 = Avx2.ConvertToVector256Int32(src);
                            var ints1 = Avx2.ConvertToVector256Int32(src + Vector256<int>.Count);
                            var ints2 = Avx2.ConvertToVector256Int32(src + Vector256<int>.Count * 2);
                            var ints3 = Avx2.ConvertToVector256Int32(src + Vector256<int>.Count * 3);
                            src += Vector256<byte>.Count;

                            // Convert to float and scale by 1/255.
                            var px0 = Avx.Multiply(Avx.ConvertToVector256Single(ints0), unitScale);
                            var px1 = Avx.Multiply(Avx.ConvertToVector256Single(ints1), unitScale);
                            var px2 = Avx.Multiply(Avx.ConvertToVector256Single(ints2), unitScale);
                            var px3 = Avx.Multiply(Avx.ConvertToVector256Single(ints3), unitScale);

                            // Rearranged copy of each vector used as the multiplier.
                            // presumably ShuffleMaskAlpha replicates each pixel's alpha lane — confirm in HWIntrinsics.
                            var alpha0 = Avx.Shuffle(px0, px0, HWIntrinsics.ShuffleMaskAlpha);
                            var alpha1 = Avx.Shuffle(px1, px1, HWIntrinsics.ShuffleMaskAlpha);
                            var alpha2 = Avx.Shuffle(px2, px2, HWIntrinsics.ShuffleMaskAlpha);
                            var alpha3 = Avx.Shuffle(px3, px3, HWIntrinsics.ShuffleMaskAlpha);

                            // Multiply, then take the BlendMaskAlpha lanes from the un-multiplied alpha vector.
                            px0 = Avx.Blend(Avx.Multiply(px0, alpha0), alpha0, HWIntrinsics.BlendMaskAlpha);
                            px1 = Avx.Blend(Avx.Multiply(px1, alpha1), alpha1, HWIntrinsics.BlendMaskAlpha);
                            px2 = Avx.Blend(Avx.Multiply(px2, alpha2), alpha2, HWIntrinsics.BlendMaskAlpha);
                            px3 = Avx.Blend(Avx.Multiply(px3, alpha3), alpha3, HWIntrinsics.BlendMaskAlpha);

                            Avx.Store(dst, px0);
                            Avx.Store(dst + Vector256<int>.Count, px1);
                            Avx.Store(dst + Vector256<int>.Count * 2, px2);
                            Avx.Store(dst + Vector256<int>.Count * 3, px3);
                            dst += Vector256<byte>.Count;
                        }
                    }
                    else if (Sse41.IsSupported)
                    {
                        var unitScale = Vector128.Create(1f / byte.MaxValue);

                        // Last position from which a full 16-byte block can still be read.
                        byte* lastBlock = srcEnd - Vector128<byte>.Count;
                        while (src <= lastBlock)
                        {
                            // Load 4 x 4 bytes, widening each byte to an int lane.
                            var ints0 = Sse41.ConvertToVector128Int32(src);
                            var ints1 = Sse41.ConvertToVector128Int32(src + Vector128<int>.Count);
                            var ints2 = Sse41.ConvertToVector128Int32(src + Vector128<int>.Count * 2);
                            var ints3 = Sse41.ConvertToVector128Int32(src + Vector128<int>.Count * 3);
                            src += Vector128<byte>.Count;

                            // Convert to float and scale by 1/255.
                            var px0 = Sse.Multiply(Sse2.ConvertToVector128Single(ints0), unitScale);
                            var px1 = Sse.Multiply(Sse2.ConvertToVector128Single(ints1), unitScale);
                            var px2 = Sse.Multiply(Sse2.ConvertToVector128Single(ints2), unitScale);
                            var px3 = Sse.Multiply(Sse2.ConvertToVector128Single(ints3), unitScale);

                            var alpha0 = Sse.Shuffle(px0, px0, HWIntrinsics.ShuffleMaskAlpha);
                            var alpha1 = Sse.Shuffle(px1, px1, HWIntrinsics.ShuffleMaskAlpha);
                            var alpha2 = Sse.Shuffle(px2, px2, HWIntrinsics.ShuffleMaskAlpha);
                            var alpha3 = Sse.Shuffle(px3, px3, HWIntrinsics.ShuffleMaskAlpha);

                            // Multiply, then take the BlendMaskAlpha lanes from the un-multiplied alpha vector.
                            px0 = Sse41.Blend(Sse.Multiply(px0, alpha0), alpha0, HWIntrinsics.BlendMaskAlpha);
                            px1 = Sse41.Blend(Sse.Multiply(px1, alpha1), alpha1, HWIntrinsics.BlendMaskAlpha);
                            px2 = Sse41.Blend(Sse.Multiply(px2, alpha2), alpha2, HWIntrinsics.BlendMaskAlpha);
                            px3 = Sse41.Blend(Sse.Multiply(px3, alpha3), alpha3, HWIntrinsics.BlendMaskAlpha);

                            Sse.Store(dst, px0);
                            Sse.Store(dst + Vector128<int>.Count, px1);
                            Sse.Store(dst + Vector128<int>.Count * 2, px2);
                            Sse.Store(dst + Vector128<int>.Count * 3, px3);
                            dst += Vector128<byte>.Count;
                        }
                    }
#endif

                    // Scalar tail (or the whole line without intrinsics): one 4-byte pixel per iteration.
                    for (; src < srcEnd; src += 4, dst += 4)
                    {
                        float c0    = alphaTable[(uint)src[0]];
                        float c1    = alphaTable[(uint)src[1]];
                        float c2    = alphaTable[(uint)src[2]];
                        float alpha = alphaTable[(uint)src[3]];

                        dst[0] = c0 * alpha;
                        dst[1] = c1 * alpha;
                        dst[2] = c2 * alpha;
                        dst[3] = alpha;
                    }
                }
            }
Exemplo n.º 27
0
 /// <summary>
 /// Widens the four lowest bytes of <paramref name="value"/> to four 32-bit integer lanes
 /// by forwarding to <see cref="Sse41.ConvertToVector128Int32(Vector128{byte})"/>.
 /// </summary>
 /// <param name="value">Vector whose low four bytes are converted.</param>
 /// <returns>The four converted 32-bit lanes.</returns>
 public static Vector128<int> _mm_cvtepu8_epi32(Vector128<byte> value)
     => Sse41.ConvertToVector128Int32(value);
Exemplo n.º 28
0
 /// <summary>
 /// Reads 16-bit unsigned values starting at <paramref name="address"/> and widens them to
 /// four 32-bit integer lanes by forwarding to <c>Sse41.ConvertToVector128Int32(ushort*)</c>.
 /// </summary>
 /// <param name="address">Pointer to the first 16-bit value to load.</param>
 /// <returns>The four converted 32-bit lanes.</returns>
 public static unsafe Vector128<int> xmm(ushort* address)
     => Sse41.ConvertToVector128Int32(address);
Exemplo n.º 29
0
 /// <summary>
 /// Widens the four lowest 16-bit unsigned lanes of <paramref name="value"/> to four 32-bit
 /// integer lanes by forwarding to <see cref="Sse41.ConvertToVector128Int32(Vector128{ushort})"/>.
 /// </summary>
 /// <param name="value">Vector whose low four 16-bit lanes are converted.</param>
 /// <returns>The four converted 32-bit lanes.</returns>
 public static Vector128<int> _mm_cvtepu16_epi32(Vector128<ushort> value)
     => Sse41.ConvertToVector128Int32(value);
Exemplo n.º 30
0
        /// <summary>
        /// Blends <paramref name="nextPicture"/> into <paramref name="currentPicture"/> in place.
        /// For every byte, the blend factor is <c>Attack</c> when the incoming value is greater than
        /// the stored one and <c>Decay</c> otherwise; the stored value becomes
        /// <c>stored * (1 - factor) + incoming * factor</c>, truncated to a byte.
        /// </summary>
        /// <param name="currentPicture">Working image; its data is overwritten with the blended result.</param>
        /// <param name="nextPicture">Incoming image; its data length determines how many bytes are processed.</param>
        public unsafe void Process(MutableByteImage currentPicture, MutableByteImage nextPicture)
        {
            // Locals so their addresses can be handed to the broadcast intrinsic.
            float attack    = Attack;
            float decay     = Decay;
            float maxFactor = 1;

            int length = nextPicture.Data.Length;

            fixed(byte *currentPicPtr = currentPicture.Data)
            fixed(byte *nextPicPtr    = nextPicture.Data)
            {
                byte *currentPxPtr = currentPicPtr;
                byte *nextPxPtr    = nextPicPtr;

                // Number of bytes handled by the vector loop; stays 0 when AVX is unavailable so
                // the scalar loop below processes everything instead of throwing.
                int vectorizedLength = 0;

                if (Avx.IsSupported)
                {
                    // Only whole groups of four bytes may go through the vector loop: each iteration
                    // reads and writes exactly four bytes, so running it over a partial group would
                    // touch memory past the end of both buffers.
                    vectorizedLength = length - (length % 4);

                    // Loop-invariant broadcasts.
                    var attackFactor = Avx.BroadcastScalarToVector128(&attack);
                    var decayFactor  = Avx.BroadcastScalarToVector128(&decay);
                    var one          = Avx.BroadcastScalarToVector128(&maxFactor);

                    for (int i = 0; i < vectorizedLength; i += 4)
                    {
                        // Widen four bytes of each image to floats (0..255 is exact in float).
                        var incoming = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(nextPxPtr));
                        var stored   = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(currentPxPtr));

                        // Comparing in the float domain yields an all-ones/all-zeros bit mask per lane.
                        // (Numerically converting an integer compare result would give -1.0f, whose bit
                        // pattern is not all ones and corrupts the factors when used as a mask.)
                        var isRising = Sse.CompareGreaterThan(incoming, stored);

                        // Select Attack where the incoming value is greater, Decay elsewhere.
                        var pixelFactor = Sse.Or(
                            Sse.And(isRising, attackFactor),
                            Sse.AndNot(isRising, decayFactor));

                        var result = Sse.Add(
                            Sse.Multiply(Sse.Subtract(one, pixelFactor), stored),
                            Sse.Multiply(pixelFactor, incoming));

                        // TODO improve Store
                        currentPxPtr[0] = (byte)Sse41.Extract(result, 0);
                        currentPxPtr[1] = (byte)Sse41.Extract(result, 1);
                        currentPxPtr[2] = (byte)Sse41.Extract(result, 2);
                        currentPxPtr[3] = (byte)Sse41.Extract(result, 3);

                        currentPxPtr += 4;
                        nextPxPtr    += 4;
                    }
                }

                // Scalar path for whatever the vector loop did not cover.
                for (int i = vectorizedLength; i < length; i++)
                {
                    var currentColor     = *nextPxPtr;
                    var workingDataColor = *currentPxPtr;

                    var newPixelFactor = workingDataColor < currentColor ? Attack : Decay;

                    var newPixelValue = (byte)((currentColor * newPixelFactor) + (workingDataColor * (1 - newPixelFactor)));

                    *currentPxPtr = newPixelValue;
                    currentPxPtr++;
                    nextPxPtr++;
                }
            }
        }