Example #1
    private static unsafe bool ProblemWithLoadLow_Sse()
    {
        var data = stackalloc float[2];

        data[0] = 1;
        data[1] = 2;
        JitUse(data);

        Vector128 <float> a = Vector128 <float> .Zero;
        Vector128 <float> b = Sse.LoadLow(a, data);
        Vector128 <float> c = Sse.LoadLow(a, data + 1);

        // Make sure we take into account the address operand.
        if (b.AsInt32().GetElement(0) == c.AsInt32().GetElement(0))
        {
            return(true);
        }

        // Make sure we take the heap state into account.
        b       = Sse.LoadLow(a, data);
        data[0] = 3;
        c       = Sse.LoadLow(a, data);
        if (b.AsInt32().GetElement(0) == c.AsInt32().GetElement(0))
        {
            return(true);
        }

        return(false);
    }
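
For context, Sse.LoadLow replaces the lower two floats of a vector with the two floats at the given address and preserves the upper two lanes, which is why the test above has to account for both the address operand and the heap state. A minimal standalone sketch of that behaviour (assumes an x86/x64 machine with SSE and the usual System.Runtime.Intrinsics usings in scope):

    // Sketch only: demonstrate what Sse.LoadLow produces.
    private static unsafe void LoadLowDemo()
    {
        float* data = stackalloc float[2];
        data[0] = 1;
        data[1] = 2;

        Vector128<float> upper = Vector128.Create(9f);      // <9, 9, 9, 9>
        Vector128<float> mixed = Sse.LoadLow(upper, data);   // <1, 2, 9, 9>

        Console.WriteLine(mixed);
    }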
Example #2
    public static Vector128 <short> DivideBy10(this Vector128 <short> dividend)
    {
        // Convert to two 32-bit integers
        Vector128 <int> a_hi = Sse2.ShiftRightArithmetic(dividend.AsInt32(), 16);
        Vector128 <int> a_lo = Sse2.ShiftLeftLogical(dividend.AsInt32(), 16);

        a_lo = Sse2.ShiftRightArithmetic(a_lo, 16);

        Vector128 <int> div10_hi;
        Vector128 <int> div10_lo;

        if (Avx2.IsSupported)
        {
            Vector256 <int> a      = Vector256.Create(a_lo, a_hi);
            Vector256 <int> s0     = Avx2.ShiftRightArithmetic(a, 15);
            Vector256 <int> factor = Vector256.Create(26215);
            Vector256 <int> mul    = Avx2.MultiplyLow(a, factor);
            Vector256 <int> s1     = Avx2.ShiftRightArithmetic(mul, 18);
            Vector256 <int> div10  = Avx2.Subtract(s1, s0);

            div10_hi = div10.GetUpper();
            div10_lo = div10.GetLower();
        }
        else
        {
            Vector128 <int> s0_hi = Sse2.ShiftRightArithmetic(a_hi, 15);
            Vector128 <int> s0_lo = Sse2.ShiftRightArithmetic(a_lo, 15);

            Vector128 <int> factor = Vector128.Create(26215);
            Vector128 <int> mul_hi = Sse41.MultiplyLow(a_hi, factor);
            Vector128 <int> mul_lo = Sse41.MultiplyLow(a_lo, factor);

            Vector128 <int> s1_hi = Sse2.ShiftRightArithmetic(mul_hi, 18);
            Vector128 <int> s1_lo = Sse2.ShiftRightArithmetic(mul_lo, 18);

            div10_hi = Sse2.Subtract(s1_hi, s0_hi);
            div10_lo = Sse2.Subtract(s1_lo, s0_lo);
        }

        //div10_hi = Sse2.ShiftLeftLogical(div10_hi, 16);
        div10_hi = Sse2.ShiftLeftLogical128BitLane(div10_hi, 2);
        return(Sse41.Blend(div10_lo.AsInt16(), div10_hi.AsInt16(), 0xAA));
    }
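
A hypothetical call site for the extension above (assuming it lives in an accessible static class and the System.Runtime.Intrinsics namespaces are imported); every 16-bit lane is divided by 10 with truncation toward zero:

    // Hypothetical usage of DivideBy10.
    public static void DivideBy10Demo()
    {
        Vector128<short> values   = Vector128.Create((short)123, 10, -5, 9999, 0, -120, 32767, -32768);
        Vector128<short> quotient = values.DivideBy10();

        // Lanes: 12, 1, 0, 999, 0, -12, 3276, -3276 (truncated toward zero).
        Console.WriteLine(quotient);
    }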
Example #3
        internal static unsafe long Extract64(Vector128 <sbyte> value)
        {
            if (Sse41.X64.IsSupported)
            {
                return(Sse41.X64.Extract(value.AsInt64(), 0));  // evaluated statically at JIT time
            }
            var v = value.AsInt32();

            return((long)((uint)Sse41.Extract(v, 0) | ((ulong)Sse41.Extract(v, 1) << 32)));
        }
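
A hypothetical round-trip check for Extract64 (assumes the required SSE4.1 support is present; both branches return the same bits):

        // Hypothetical usage: extract the low 64 bits of a vector.
        internal static void Extract64Demo()
        {
            Vector128<sbyte> v = Vector128.Create(0x0123456789ABCDEFL, 0L).AsSByte();

            long low64 = Extract64(v);
            Console.WriteLine(low64 == 0x0123456789ABCDEFL); // True
        }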
Example #4
        private unsafe ulong HashSse(byte *buf, int len)
        {
            ulong           h       = 0;
            Vector128 <int> v_ps    = Vector128 <int> .Zero;
            bool            useSse4 = Sse41.IsSupported;

            int i = 0;

            for (int j = len - i - 1; len - i >= 4; i += 4, j = len - i - 1)
            {
                Vector128 <int> c_v = Sse2.LoadVector128(&kMultFactorsPtr[j - 3]);
                c_v = Sse2.Shuffle(c_v, SO123);
                Vector128 <byte> q_v = Sse2.LoadVector128(buf + i);

                Vector128 <int> s_v;
                if (useSse4)
                {
                    s_v = Sse41.ConvertToVector128Int32(q_v);
                }
                else
                {
                    q_v = Sse2.UnpackLow(q_v, q_v);
                    s_v = Sse2.ShiftRightLogical(Sse2.UnpackLow(q_v.AsUInt16(), q_v.AsUInt16()).AsInt32(), 24);
                }

                if (useSse4)
                {
                    v_ps = Sse2.Add(v_ps, Sse41.MultiplyLow(c_v, s_v));
                }
                else
                {
                    Vector128 <ulong> v_tmp1 = Sse2.Multiply(c_v.AsUInt32(), s_v.AsUInt32());
                    Vector128 <ulong> v_tmp2 =
                        Sse2.Multiply(Sse2.ShiftRightLogical128BitLane(c_v.AsByte(), 4).AsUInt32(),
                                      Sse2.ShiftRightLogical128BitLane(s_v.AsByte(), 4).AsUInt32());
                    v_ps = Sse2.Add(v_ps, Sse2.UnpackLow(Sse2.Shuffle(v_tmp1.AsInt32(), SOO2O),
                                                         Sse2.Shuffle(v_tmp2.AsInt32(), SOO2O)));
                }
            }

            v_ps = Sse2.Add(v_ps, Sse2.Shuffle(v_ps, S23O1));
            v_ps = Sse2.Add(v_ps, Sse2.Shuffle(v_ps, S1O32));
            h   += Sse2.ConvertToUInt32(v_ps.AsUInt32());

            for (; i < len; i++)
            {
                int   index = len - i - 1;
                ulong c     = (uint)kMultFactors[index];
                h += c * buf[i];
            }

            return(h & (kBase - 1));
        }
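
The SSE2 fallback above widens bytes to 32-bit integers without Sse41.ConvertToVector128Int32; a standalone sketch of that unpack-and-shift trick (SSE2 only, names are illustrative):

        // Widen the low four bytes of q to four int32 lanes using only SSE2,
        // the same trick as the !useSse4 branch above.
        private static Vector128<int> WidenLowBytesToInt32(Vector128<byte> q)
        {
            // Duplicate each byte: b0 b0 b1 b1 b2 b2 b3 b3 ...
            Vector128<byte> dup = Sse2.UnpackLow(q, q);

            // Duplicate each 16-bit pair, then shift right by 24 so each
            // 32-bit lane keeps only the original byte value.
            Vector128<int> wide = Sse2.UnpackLow(dup.AsUInt16(), dup.AsUInt16()).AsInt32();
            return Sse2.ShiftRightLogical(wide, 24);
        }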
Example #5
        private static Vector128 <byte> KeyGenAssist(ref Vector128 <byte> tmp1, Vector128 <byte> tmp3, byte control)
        {
            var keyGened = Aes.KeygenAssist(tmp3, control);

            keyGened = Aes.Shuffle(keyGened.AsInt32(), 0x55).AsByte();
            tmp1     = Sse2.Xor(tmp1, Sse2.ShiftLeftLogical128BitLane(tmp1, 4));
            tmp1     = Sse2.Xor(tmp1, Sse2.ShiftLeftLogical128BitLane(tmp1, 4));
            tmp1     = Sse2.Xor(tmp1, Sse2.ShiftLeftLogical128BitLane(tmp1, 4));
            tmp1     = Sse2.Xor(tmp1, keyGened);
            keyGened = Sse2.Shuffle(tmp1.AsInt32(), 0xFF).AsByte();
            return(Sse2.Xor(Sse2.Xor(tmp3, Sse2.ShiftLeftLogical128BitLane(tmp3, 4)), keyGened));
        }
Example #6
        private static void Aes256Assist1(ref Vector128 <byte> t1, ref Vector128 <byte> t2)
        {
            Vector128 <byte> t4;

            t2 = Sse2.Shuffle(t2.AsInt32(), 0xff).AsByte();
            t4 = Sse2.ShiftLeftLogical128BitLane(t1, 0x04);
            t1 = Sse2.Xor(t1, t4);
            t4 = Sse2.ShiftLeftLogical128BitLane(t4, 0x04);
            t1 = Sse2.Xor(t1, t4);
            t4 = Sse2.ShiftLeftLogical128BitLane(t4, 0x04);
            t1 = Sse2.Xor(t1, t4);
            t1 = Sse2.Xor(t1, t2);
        }
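
A hedged sketch of how a helper like Aes256Assist1 is typically driven during AES-256 key expansion (the surrounding schedule code is an assumption, not part of the source above): t1 and t3 start as the two 128-bit halves of the key, and each even-numbered round key is produced by feeding Aes.KeygenAssist(t3, rcon) into the helper.

        // Sketch only: derive the next even AES-256 round key from t1/t3
        // (assumes AES-NI support; 0x01 is the first round constant).
        private static Vector128<byte> NextEvenRoundKey(ref Vector128<byte> t1, Vector128<byte> t3)
        {
            Vector128<byte> t2 = Aes.KeygenAssist(t3, 0x01);
            Aes256Assist1(ref t1, ref t2);
            return t1;
        }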
Example #7
        public static Vector128 <sbyte> CreateEscapingMask(
            Vector128 <sbyte> sourceValue,
            Vector128 <sbyte> bitMaskLookup,
            Vector128 <sbyte> bitPosLookup,
            Vector128 <sbyte> nibbleMaskSByte,
            Vector128 <sbyte> nullMaskSByte)
        {
            // To check if an input byte needs to be escaped or not, we use a bitmask-lookup.
            // Therefore we split the input byte into the low- and high-nibble, which will get
            // the row-/column-index in the bit-mask.
            // The bitmask-lookup looks like (here for example s_bitMaskLookupBasicLatin):
            //                                     high-nibble
            // low-nibble  0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
            //         0   1   1   0   0   0   0   1   0   1   1   1   1   1   1   1   1
            //         1   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         2   1   1   1   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         3   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         4   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         5   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         6   1   1   1   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         7   1   1   1   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         8   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         9   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         A   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         B   1   1   1   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         C   1   1   0   1   0   1   0   0   1   1   1   1   1   1   1   1
            //         D   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         E   1   1   0   1   0   0   0   0   1   1   1   1   1   1   1   1
            //         F   1   1   0   0   0   0   0   1   1   1   1   1   1   1   1   1
            //
            // where 1 denotes the need for escaping, while 0 means no escaping is needed.
            // For high-nibbles in the range 8..F every input needs to be escaped, so we
            // can omit them in the bit-mask, thus only high-nibbles in the range 0..7 need
            // to be considered, hence the entries in the bit-mask can be of type byte.
            //
            // In the bitmask-lookup for each row (= low-nibble) a bit-mask for the
            // high-nibbles (= columns) is created.

            Debug.Assert(Ssse3.IsSupported);

            Vector128 <sbyte> highNibbles = Sse2.And(Sse2.ShiftRightLogical(sourceValue.AsInt32(), 4).AsSByte(), nibbleMaskSByte);
            Vector128 <sbyte> lowNibbles  = Sse2.And(sourceValue, nibbleMaskSByte);

            Vector128 <sbyte> bitMask      = Ssse3.Shuffle(bitMaskLookup, lowNibbles);
            Vector128 <sbyte> bitPositions = Ssse3.Shuffle(bitPosLookup, highNibbles);

            Vector128 <sbyte> mask = Sse2.And(bitPositions, bitMask);

            mask = Sse2.CompareEqual(nullMaskSByte, Sse2.CompareEqual(nullMaskSByte, mask));
            return(mask);
        }
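
The core of the routine is the nibble split followed by an Ssse3.Shuffle table lookup; a standalone sketch of just that pattern (SSSE3 assumed; the table contents are supplied by the caller and left arbitrary here):

        // Sketch: look up one table byte per input byte, indexed by the low nibble.
        private static Vector128<sbyte> LookupByLowNibble(Vector128<sbyte> source, Vector128<sbyte> table)
        {
            Vector128<sbyte> nibbleMask = Vector128.Create((sbyte)0x0F);
            Vector128<sbyte> lowNibbles = Sse2.And(source, nibbleMask);

            // PSHUFB uses the low four bits of each lane as an index into 'table'.
            return Ssse3.Shuffle(table, lowNibbles);
        }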
Example #8
        public static bool AreEqual(Vector128 <float> left, Vector128 <float> right)
        {
            for (int i = 0; i < Vector128 <float> .Count; i++)
            {
                int l = left.AsInt32().GetElement(i);
                int r = right.AsInt32().GetElement(i);

                if (l != r)
                {
                    return(false);
                }
            }

            return(true);
        }
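
Because the comparison runs on the raw bit patterns rather than on floating-point equality, AreEqual treats an identical NaN as equal to itself and distinguishes +0.0 from -0.0; a small hypothetical demonstration:

        // Hypothetical usage: bitwise comparison semantics.
        public static void AreEqualDemo()
        {
            Vector128<float> nan = Vector128.Create(float.NaN);

            Console.WriteLine(AreEqual(nan, nan));                                    // True  (same bits)
            Console.WriteLine(AreEqual(Vector128.Create(0f), Vector128.Create(-0f))); // False (sign bit differs)
        }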
Example #9
 public static Vector128 <T> Vector128Add <T>(Vector128 <T> left, Vector128 <T> right) where T : struct
 {
     if (typeof(T) == typeof(byte))
     {
         return(Sse2.Add(left.AsByte(), right.AsByte()).As <byte, T>());
     }
     else if (typeof(T) == typeof(sbyte))
     {
         return(Sse2.Add(left.AsSByte(), right.AsSByte()).As <sbyte, T>());
     }
     else if (typeof(T) == typeof(short))
     {
         return(Sse2.Add(left.AsInt16(), right.AsInt16()).As <short, T>());
     }
     else if (typeof(T) == typeof(ushort))
     {
         return(Sse2.Add(left.AsUInt16(), right.AsUInt16()).As <ushort, T>());
     }
     else if (typeof(T) == typeof(int))
     {
         return(Sse2.Add(left.AsInt32(), right.AsInt32()).As <int, T>());
     }
     else if (typeof(T) == typeof(uint))
     {
         return(Sse2.Add(left.AsUInt32(), right.AsUInt32()).As <uint, T>());
     }
     else if (typeof(T) == typeof(long))
     {
         return(Sse2.Add(left.AsInt64(), right.AsInt64()).As <long, T>());
     }
     else if (typeof(T) == typeof(ulong))
     {
         return(Sse2.Add(left.AsUInt64(), right.AsUInt64()).As <ulong, T>());
     }
     else if (typeof(T) == typeof(float))
     {
         return(Sse.Add(left.AsSingle(), right.AsSingle()).As <float, T>());
     }
     else if (typeof(T) == typeof(double))
     {
         return(Sse2.Add(left.AsDouble(), right.AsDouble()).As <double, T>());
     }
     else
     {
         throw new NotSupportedException();
     }
 }
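
A hypothetical call site exercising the generic dispatch above with two different element types:

 // Hypothetical usage of Vector128Add<T>.
 public static void Vector128AddDemo()
 {
     Vector128<int> i = Vector128Add(Vector128.Create(1), Vector128.Create(2));
     Console.WriteLine(i);   // <3, 3, 3, 3>

     Vector128<double> d = Vector128Add(Vector128.Create(1.5), Vector128.Create(0.25));
     Console.WriteLine(d);   // <1.75, 1.75>
 }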
Example #10
        private unsafe static void BCnDecodeTileAlpha(Span <byte> output, Span <byte> rPal, ulong rI)
        {
            if (Avx2.IsSupported)
            {
                Span <Vector128 <byte> > outputAsVector128 = MemoryMarshal.Cast <byte, Vector128 <byte> >(output);

                Vector128 <uint> shifts = Vector128.Create(0u, 3u, 6u, 9u);
                Vector128 <uint> masks  = Vector128.Create(7u);

                Vector128 <byte> vClut;

                fixed(byte *pRPal = rPal)
                {
                    vClut = Sse2.LoadScalarVector128((ulong *)pRPal).AsByte();
                }

                Vector128 <uint> indices0  = Vector128.Create((uint)rI);
                Vector128 <uint> indices1  = Vector128.Create((uint)(rI >> 24));
                Vector128 <uint> indices00 = Avx2.ShiftRightLogicalVariable(indices0, shifts);
                Vector128 <uint> indices10 = Avx2.ShiftRightLogicalVariable(indices1, shifts);
                Vector128 <uint> indices01 = Sse2.ShiftRightLogical(indices00, 12);
                Vector128 <uint> indices11 = Sse2.ShiftRightLogical(indices10, 12);
                indices00 = Sse2.And(indices00, masks);
                indices10 = Sse2.And(indices10, masks);
                indices01 = Sse2.And(indices01, masks);
                indices11 = Sse2.And(indices11, masks);

                Vector128 <ushort> indicesW0 = Sse41.PackUnsignedSaturate(indices00.AsInt32(), indices01.AsInt32());
                Vector128 <ushort> indicesW1 = Sse41.PackUnsignedSaturate(indices10.AsInt32(), indices11.AsInt32());

                Vector128 <byte> indices = Sse2.PackUnsignedSaturate(indicesW0.AsInt16(), indicesW1.AsInt16());

                outputAsVector128[0] = Ssse3.Shuffle(vClut, indices);
            }
            else
            {
                for (int i = 0; i < BlockWidth * BlockHeight; i++, rI >>= 3)
                {
                    output[i] = rPal[(int)(rI & 7)];
                }
            }
        }
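
The AVX2 path extracts the 3-bit palette indices by broadcasting the packed bits and shifting every 32-bit lane by a different amount; a standalone sketch of that step (assumes AVX2; names are illustrative):

        // Sketch: pull four consecutive 3-bit indices out of packed index bits.
        private static Vector128<uint> ExtractFour3BitIndices(ulong packedIndices)
        {
            Vector128<uint> shifts = Vector128.Create(0u, 3u, 6u, 9u);
            Vector128<uint> mask   = Vector128.Create(7u);

            Vector128<uint> broadcast = Vector128.Create((uint)packedIndices);
            Vector128<uint> shifted   = Avx2.ShiftRightLogicalVariable(broadcast, shifts);

            // Lane n now holds bits [3n, 3n + 2] of the input, i.e. index n.
            return Sse2.And(shifted, mask);
        }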
Example #11
        public unsafe static Vector128 <float> Log2(Vector128 <float> value)
        {
            // split value into exponent and mantissa parts
            Vector128 <float> one = AvxExtensions.BroadcastScalarToVector128(MathV.One);

            Vector128 <int>   integerValue = value.AsInt32();
            Vector128 <float> exponent     = Avx.ConvertToVector128Single(Avx.Subtract(Avx.ShiftRightLogical(Avx.And(integerValue, MathV.FloatExponentMask128),
                                                                                                             MathV.FloatMantissaBits),
                                                                                       MathV.FloatMantissaZero128));
            Vector128 <float> mantissa = Avx.Or(Avx.And(integerValue, MathV.FloatMantissaMask128).AsSingle(), one);

            // evaluate mantissa polynomial
            Vector128 <float> beta1 = AvxExtensions.BroadcastScalarToVector128(MathV.Log2Beta1);
            Vector128 <float> beta2 = AvxExtensions.BroadcastScalarToVector128(MathV.Log2Beta2);
            Vector128 <float> beta3 = AvxExtensions.BroadcastScalarToVector128(MathV.Log2Beta3);
            Vector128 <float> beta4 = AvxExtensions.BroadcastScalarToVector128(MathV.Log2Beta4);
            Vector128 <float> beta5 = AvxExtensions.BroadcastScalarToVector128(MathV.Log2Beta5);

            Vector128 <float> x          = Avx.Subtract(mantissa, one);
            Vector128 <float> polynomial = Avx.Multiply(beta1, x);
            Vector128 <float> x2         = Avx.Multiply(x, x);

            polynomial = Avx.Add(polynomial, Avx.Multiply(beta2, x2));
            Vector128 <float> x3 = Avx.Multiply(x2, x);

            polynomial = Avx.Add(polynomial, Avx.Multiply(beta3, x3));
            Vector128 <float> x4 = Avx.Multiply(x3, x);

            polynomial = Avx.Add(polynomial, Avx.Multiply(beta4, x4));
            Vector128 <float> x5 = Avx.Multiply(x4, x);

            polynomial = Avx.Add(polynomial, Avx.Multiply(beta5, x5));

            // form logarithm
            return(Avx.Add(exponent, polynomial));
        }
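
The MathV constants are not shown; for binary32 inputs the split presumably relies on the standard layout of one sign bit, eight exponent bits biased by 127, and 23 mantissa bits. A scalar sketch of the same decomposition with those assumed constant values spelled out (sign, zero and subnormal handling omitted):

        // Scalar sketch of the exponent/mantissa split used above.
        // Assumed constants: exponent mask 0x7F800000, mantissa mask 0x007FFFFF,
        // mantissa width 23, exponent bias 127.
        private static void SplitFloat(float value, out float exponent, out float mantissa)
        {
            int bits = BitConverter.SingleToInt32Bits(value);

            exponent = ((bits & 0x7F800000) >> 23) - 127;                                // unbiased exponent
            mantissa = BitConverter.Int32BitsToSingle((bits & 0x007FFFFF) | 0x3F800000); // in [1.0, 2.0)
        }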
Example #12
    public static Vector128 <short> Divide(this Vector128 <short> dividend, Vector128 <short> divisor)
    {
        // Based on https://stackoverflow.com/a/51458507/347870

        // Convert to two 32-bit integers
        Vector128 <int> a_hi_epi32       = Sse2.ShiftRightArithmetic(dividend.AsInt32(), 16);
        Vector128 <int> a_lo_epi32_shift = Sse2.ShiftLeftLogical(dividend.AsInt32(), 16);
        Vector128 <int> a_lo_epi32       = Sse2.ShiftRightArithmetic(a_lo_epi32_shift, 16);

        Vector128 <int> b_hi_epi32       = Sse2.ShiftRightArithmetic(divisor.AsInt32(), 16);
        Vector128 <int> b_lo_epi32_shift = Sse2.ShiftLeftLogical(divisor.AsInt32(), 16);
        Vector128 <int> b_lo_epi32       = Sse2.ShiftRightArithmetic(b_lo_epi32_shift, 16);

        // Convert to 32-bit floats
        Vector128 <float> a_hi = Sse2.ConvertToVector128Single(a_hi_epi32);
        Vector128 <float> a_lo = Sse2.ConvertToVector128Single(a_lo_epi32);
        Vector128 <float> b_hi = Sse2.ConvertToVector128Single(b_hi_epi32);
        Vector128 <float> b_lo = Sse2.ConvertToVector128Single(b_lo_epi32);

        // Calculate the reciprocal
        Vector128 <float> b_hi_rcp = Sse.Reciprocal(b_hi);
        Vector128 <float> b_lo_rcp = Sse.Reciprocal(b_lo);

        // Calculate the inverse
        Vector128 <float> b_hi_inv_1;
        Vector128 <float> b_lo_inv_1;
        Vector128 <float> two = Vector128.Create(2.00000051757f);

        if (Fma.IsSupported)
        {
            b_hi_inv_1 = Fma.MultiplyAddNegated(b_hi_rcp, b_hi, two);
            b_lo_inv_1 = Fma.MultiplyAddNegated(b_lo_rcp, b_lo, two);
        }
        else
        {
            Vector128 <float> b_mul_hi = Sse.Multiply(b_hi_rcp, b_hi);
            Vector128 <float> b_mul_lo = Sse.Multiply(b_lo_rcp, b_lo);
            b_hi_inv_1 = Sse.Subtract(two, b_mul_hi);
            b_lo_inv_1 = Sse.Subtract(two, b_mul_lo);
        }

        // Compensate for the loss
        Vector128 <float> b_hi_rcp_1 = Sse.Multiply(b_hi_rcp, b_hi_inv_1);
        Vector128 <float> b_lo_rcp_1 = Sse.Multiply(b_lo_rcp, b_lo_inv_1);

        // Perform the division by multiplication
        Vector128 <float> hi = Sse.Multiply(a_hi, b_hi_rcp_1);
        Vector128 <float> lo = Sse.Multiply(a_lo, b_lo_rcp_1);

        // Convert back to integers
        Vector128 <int> hi_epi32 = Sse2.ConvertToVector128Int32WithTruncation(hi);
        Vector128 <int> lo_epi32 = Sse2.ConvertToVector128Int32WithTruncation(lo);

        // Zero-out the unnecessary parts
        Vector128 <int> hi_epi32_shift = Sse2.ShiftLeftLogical(hi_epi32, 16);

        // Blend the bits, and return
        if (Sse41.IsSupported)
        {
            return(Sse41.Blend(lo_epi32.AsInt16(), hi_epi32_shift.AsInt16(), 0xAA));
        }
        else
        {
            // Keep only the low 16 bits of each 32-bit lane so sign-extension bits do not leak into the Or below.
            Vector128 <int> lo_epi32_mask = Sse2.And(lo_epi32, Vector128.Create(0x0000FFFF));
            return(Sse2.Or(hi_epi32_shift, lo_epi32_mask).AsInt16());
        }
    }
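
A hypothetical call site for the Divide extension above; each 16-bit lane of the dividend is divided by the corresponding lane of the divisor, truncating toward zero:

    // Hypothetical usage of the element-wise short division.
    public static void DivideDemo()
    {
        Vector128<short> dividend = Vector128.Create((short)100, 81, -42, 7, 32000, -32000, 9, 1);
        Vector128<short> divisor  = Vector128.Create((short)3, 9, 5, -7, 100, 100, 2, 1);

        // Lanes: 33, 9, -8, -1, 320, -320, 4, 1.
        Console.WriteLine(dividend.Divide(divisor));
    }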
Example #13
        private unsafe static void WriteNv12(ResourceManager rm, Surface input, ref OutputSurfaceConfig config, ref PlaneOffsets offsets)
        {
            int gobBlocksInY = 1 << config.OutBlkHeight;

            bool outLinear = config.OutBlkKind == 0;

            int width   = Math.Min(config.OutLumaWidth + 1, input.Width);
            int height  = Math.Min(config.OutLumaHeight + 1, input.Height);
            int yStride = GetPitch(config.OutLumaWidth + 1, 1);

            int dstYIndex = rm.BufferPool.Rent((config.OutLumaHeight + 1) * yStride, out Span <byte> dstY);

            if (Sse41.IsSupported)
            {
                Vector128 <ushort> mask = Vector128.Create(0xffffUL).AsUInt16();

                int widthTrunc = width & ~0xf;
                int strideGap  = yStride - width;

                fixed(Pixel *srcPtr = input.Data)
                {
                    Pixel *ip = srcPtr;

                    fixed(byte *dstPtr = dstY)
                    {
                        byte *op = dstPtr;

                        for (int y = 0; y < height; y++, ip += input.Width)
                        {
                            int x = 0;

                            for (; x < widthTrunc; x += 16)
                            {
                                byte *baseOffset = (byte *)(ip + (ulong)(uint)x);

                                Vector128 <ushort> pixelp1 = Sse2.LoadVector128((ushort *)baseOffset);
                                Vector128 <ushort> pixelp2 = Sse2.LoadVector128((ushort *)(baseOffset + 0x10));
                                Vector128 <ushort> pixelp3 = Sse2.LoadVector128((ushort *)(baseOffset + 0x20));
                                Vector128 <ushort> pixelp4 = Sse2.LoadVector128((ushort *)(baseOffset + 0x30));
                                Vector128 <ushort> pixelp5 = Sse2.LoadVector128((ushort *)(baseOffset + 0x40));
                                Vector128 <ushort> pixelp6 = Sse2.LoadVector128((ushort *)(baseOffset + 0x50));
                                Vector128 <ushort> pixelp7 = Sse2.LoadVector128((ushort *)(baseOffset + 0x60));
                                Vector128 <ushort> pixelp8 = Sse2.LoadVector128((ushort *)(baseOffset + 0x70));

                                pixelp1 = Sse2.And(pixelp1, mask);
                                pixelp2 = Sse2.And(pixelp2, mask);
                                pixelp3 = Sse2.And(pixelp3, mask);
                                pixelp4 = Sse2.And(pixelp4, mask);
                                pixelp5 = Sse2.And(pixelp5, mask);
                                pixelp6 = Sse2.And(pixelp6, mask);
                                pixelp7 = Sse2.And(pixelp7, mask);
                                pixelp8 = Sse2.And(pixelp8, mask);

                                Vector128 <ushort> pixelq1 = Sse41.PackUnsignedSaturate(pixelp1.AsInt32(), pixelp2.AsInt32());
                                Vector128 <ushort> pixelq2 = Sse41.PackUnsignedSaturate(pixelp3.AsInt32(), pixelp4.AsInt32());
                                Vector128 <ushort> pixelq3 = Sse41.PackUnsignedSaturate(pixelp5.AsInt32(), pixelp6.AsInt32());
                                Vector128 <ushort> pixelq4 = Sse41.PackUnsignedSaturate(pixelp7.AsInt32(), pixelp8.AsInt32());

                                pixelq1 = Sse41.PackUnsignedSaturate(pixelq1.AsInt32(), pixelq2.AsInt32());
                                pixelq2 = Sse41.PackUnsignedSaturate(pixelq3.AsInt32(), pixelq4.AsInt32());

                                pixelq1 = Sse2.ShiftRightLogical(pixelq1, 2);
                                pixelq2 = Sse2.ShiftRightLogical(pixelq2, 2);

                                Vector128 <byte> pixel = Sse2.PackUnsignedSaturate(pixelq1.AsInt16(), pixelq2.AsInt16());

                                Sse2.Store(op, pixel);

                                op += 0x10;
                            }

                            for (; x < width; x++)
                            {
                                Pixel *px = ip + (uint)x;

                                *op++ = Downsample(px->R);
                            }

                            op += strideGap;
                        }
                    }
                }
            }
            else
            {
                for (int y = 0; y < height; y++)
                {
                    for (int x = 0; x < width; x++)
                    {
                        dstY[y * yStride + x] = Downsample(input.GetR(x, y));
                    }
                }
            }

            WriteBuffer(
                rm,
                dstY,
                offsets.LumaOffset,
                outLinear,
                config.OutLumaWidth + 1,
                config.OutLumaHeight + 1,
                1,
                gobBlocksInY);

            rm.BufferPool.Return(dstYIndex);

            int uvWidth  = Math.Min(config.OutChromaWidth + 1, (width + 1) >> 1);
            int uvHeight = Math.Min(config.OutChromaHeight + 1, (height + 1) >> 1);
            int uvStride = GetPitch(config.OutChromaWidth + 1, 2);

            int dstUvIndex = rm.BufferPool.Rent((config.OutChromaHeight + 1) * uvStride, out Span <byte> dstUv);

            if (Sse2.IsSupported)
            {
                int widthTrunc = uvWidth & ~7;
                int strideGap  = uvStride - uvWidth * 2;

                fixed(Pixel *srcPtr = input.Data)
                {
                    Pixel *ip = srcPtr;

                    fixed(byte *dstPtr = dstUv)
                    {
                        byte *op = dstPtr;

                        for (int y = 0; y < uvHeight; y++, ip += input.Width * 2)
                        {
                            int x = 0;

                            for (; x < widthTrunc; x += 8)
                            {
                                byte *baseOffset = (byte *)ip + (ulong)(uint)x * 16;

                                Vector128 <uint> pixel1 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x02));
                                Vector128 <uint> pixel2 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x12));
                                Vector128 <uint> pixel3 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x22));
                                Vector128 <uint> pixel4 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x32));
                                Vector128 <uint> pixel5 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x42));
                                Vector128 <uint> pixel6 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x52));
                                Vector128 <uint> pixel7 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x62));
                                Vector128 <uint> pixel8 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x72));

                                Vector128 <uint> pixel12 = Sse2.UnpackLow(pixel1, pixel2);
                                Vector128 <uint> pixel34 = Sse2.UnpackLow(pixel3, pixel4);
                                Vector128 <uint> pixel56 = Sse2.UnpackLow(pixel5, pixel6);
                                Vector128 <uint> pixel78 = Sse2.UnpackLow(pixel7, pixel8);

                                Vector128 <ulong> pixel1234 = Sse2.UnpackLow(pixel12.AsUInt64(), pixel34.AsUInt64());
                                Vector128 <ulong> pixel5678 = Sse2.UnpackLow(pixel56.AsUInt64(), pixel78.AsUInt64());

                                pixel1234 = Sse2.ShiftRightLogical(pixel1234, 2);
                                pixel5678 = Sse2.ShiftRightLogical(pixel5678, 2);

                                Vector128 <byte> pixel = Sse2.PackUnsignedSaturate(pixel1234.AsInt16(), pixel5678.AsInt16());

                                Sse2.Store(op, pixel);

                                op += 0x10;
                            }

                            for (; x < uvWidth; x++)
                            {
                                Pixel *px = ip + (uint)(x << 1);

                                *op++ = Downsample(px->G);
                                *op++ = Downsample(px->B);
                            }

                            op += strideGap;
                        }
                    }
                }
            }
            else
            {
                for (int y = 0; y < uvHeight; y++)
                {
                    for (int x = 0; x < uvWidth; x++)
                    {
                        int xx = x << 1;
                        int yy = y << 1;

                        int uvOffs = y * uvStride + xx;

                        dstUv[uvOffs + 0] = Downsample(input.GetG(xx, yy));
                        dstUv[uvOffs + 1] = Downsample(input.GetB(xx, yy));
                    }
                }
            }

            WriteBuffer(
                rm,
                dstUv,
                offsets.ChromaUOffset,
                outLinear,
                config.OutChromaWidth + 1,
                config.OutChromaHeight + 1, 2,
                gobBlocksInY);

            rm.BufferPool.Return(dstUvIndex);
        }
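
Downsample is not shown; given the ShiftRightLogical by 2 in both vector paths, it presumably converts a 10-bit channel value to an 8-bit byte. A sketch of that assumed scalar counterpart:

        // Assumed scalar counterpart of the vector ">> 2" above: 10-bit channel -> 8-bit byte.
        private static byte Downsample(ushort value)
        {
            return (byte)(value >> 2);
        }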
Example #14
        private static unsafe uint CalculateSse(uint crc, ReadOnlySpan <byte> buffer)
        {
            int chunksize = buffer.Length & ~ChunksizeMask;
            int length    = chunksize;

            fixed(byte *bufferPtr = buffer)
            fixed(ulong *k05PolyPtr = K05Poly)
            {
                byte *srcPtr = bufferPtr;

                // There's at least one block of 64.
                Vector128 <ulong> x1 = Sse2.LoadVector128((ulong *)(srcPtr + 0x00));
                Vector128 <ulong> x2 = Sse2.LoadVector128((ulong *)(srcPtr + 0x10));
                Vector128 <ulong> x3 = Sse2.LoadVector128((ulong *)(srcPtr + 0x20));
                Vector128 <ulong> x4 = Sse2.LoadVector128((ulong *)(srcPtr + 0x30));
                Vector128 <ulong> x5;

                x1 = Sse2.Xor(x1, Sse2.ConvertScalarToVector128UInt32(crc).AsUInt64());

                // k1, k2
                Vector128 <ulong> x0 = Sse2.LoadVector128(k05PolyPtr + 0x0);

                srcPtr += 64;
                length -= 64;

                // Parallel fold blocks of 64, if any.
                while (length >= 64)
                {
                    x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
                    Vector128 <ulong> x6 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00);
                    Vector128 <ulong> x7 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x00);
                    Vector128 <ulong> x8 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x00);

                    x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
                    x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x11);
                    x3 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x11);
                    x4 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x11);

                    Vector128 <ulong> y5 = Sse2.LoadVector128((ulong *)(srcPtr + 0x00));
                    Vector128 <ulong> y6 = Sse2.LoadVector128((ulong *)(srcPtr + 0x10));
                    Vector128 <ulong> y7 = Sse2.LoadVector128((ulong *)(srcPtr + 0x20));
                    Vector128 <ulong> y8 = Sse2.LoadVector128((ulong *)(srcPtr + 0x30));

                    x1 = Sse2.Xor(x1, x5);
                    x2 = Sse2.Xor(x2, x6);
                    x3 = Sse2.Xor(x3, x7);
                    x4 = Sse2.Xor(x4, x8);

                    x1 = Sse2.Xor(x1, y5);
                    x2 = Sse2.Xor(x2, y6);
                    x3 = Sse2.Xor(x3, y7);
                    x4 = Sse2.Xor(x4, y8);

                    srcPtr += 64;
                    length -= 64;
                }

                // Fold into 128-bits.
                // k3, k4
                x0 = Sse2.LoadVector128(k05PolyPtr + 0x2);

                x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
                x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
                x1 = Sse2.Xor(x1, x2);
                x1 = Sse2.Xor(x1, x5);

                x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
                x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
                x1 = Sse2.Xor(x1, x3);
                x1 = Sse2.Xor(x1, x5);

                x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
                x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
                x1 = Sse2.Xor(x1, x4);
                x1 = Sse2.Xor(x1, x5);

                // Single fold blocks of 16, if any.
                while (length >= 16)
                {
                    x2 = Sse2.LoadVector128((ulong *)srcPtr);

                    x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
                    x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
                    x1 = Sse2.Xor(x1, x2);
                    x1 = Sse2.Xor(x1, x5);

                    srcPtr += 16;
                    length -= 16;
                }

                // Fold 128 - bits to 64 - bits.
                x2 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x10);
                x3 = Vector128.Create(~0, 0, ~0, 0).AsUInt64(); // _mm_setr_epi32 on x86
                x1 = Sse2.ShiftRightLogical128BitLane(x1, 8);
                x1 = Sse2.Xor(x1, x2);

                // k5, k0
                x0 = Sse2.LoadScalarVector128(k05PolyPtr + 0x4);

                x2 = Sse2.ShiftRightLogical128BitLane(x1, 4);
                x1 = Sse2.And(x1, x3);
                x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
                x1 = Sse2.Xor(x1, x2);

                // Barret reduce to 32-bits.
                // polynomial
                x0 = Sse2.LoadVector128(k05PolyPtr + 0x6);

                x2 = Sse2.And(x1, x3);
                x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x10);
                x2 = Sse2.And(x2, x3);
                x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00);
                x1 = Sse2.Xor(x1, x2);

                crc = (uint)Sse41.Extract(x1.AsInt32(), 1);
                return(buffer.Length - chunksize == 0 ? crc : CalculateScalar(crc, buffer[chunksize..]));
            }
        }
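
A hedged sketch of a dispatching wrapper around CalculateSse (the wrapper itself and the 64-byte threshold are assumptions; CalculateScalar is the scalar fallback already referenced above):

        // Sketch only: take the PCLMULQDQ path when the hardware supports it
        // and at least one full 64-byte block is available.
        private static uint Calculate(uint crc, ReadOnlySpan<byte> buffer)
        {
            if (Pclmulqdq.IsSupported && Sse41.IsSupported && buffer.Length >= 64)
            {
                return CalculateSse(crc, buffer);
            }

            return CalculateScalar(crc, buffer);
        }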
Example #15
        public void Negate_NegateZero_Passes()
        {
            Vector128 <float> result = Vector.Negate4D(Vector128.Create(0f));

            Assert.True(result.AsInt32().Equals(Vector128.Create(int.MinValue)));
        }
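
Vector.Negate4D itself is not shown; the assertion only holds if it flips the IEEE sign bit, so that negating +0.0 yields -0.0, whose 32-bit pattern is int.MinValue. A minimal sketch of such an implementation, stated as an assumption about the tested method:

        // Assumed implementation sketch: negate all four lanes by XOR-ing the sign bit.
        private static Vector128<float> Negate4D(Vector128<float> value)
        {
            Vector128<float> signMask = Vector128.Create(int.MinValue).AsSingle();
            return Sse.Xor(value, signMask);
        }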
Example #16
        internal static void Step(ref ushort sum1, ref ushort sum2, byte[] buf, uint len)
        {
            uint s1     = sum1;
            uint s2     = sum2;
            int  bufPos = 0;

            /*
             * Process the data in blocks.
             */
            uint BLOCK_SIZE = 1 << 5;
            uint blocks     = len / BLOCK_SIZE;

            len -= blocks * BLOCK_SIZE;

            while (blocks != 0)
            {
                uint n = Adler32Context.NMAX / BLOCK_SIZE; /* The NMAX constraint. */

                if (n > blocks)
                {
                    n = blocks;
                }

                blocks -= n;

                Vector128 <byte> tap1 = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17).
                                        AsByte();

                Vector128 <byte>  tap2 = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1).AsByte();
                Vector128 <byte>  zero = Vector128.Create(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0).AsByte();
                Vector128 <short> ones = Vector128.Create(1, 1, 1, 1, 1, 1, 1, 1);

                /*
                 * Process n blocks of data. At most NMAX data bytes can be
                 * processed before s2 must be reduced modulo BASE.
                 */
                Vector128 <uint> v_ps = Vector128.Create(s1 * n, 0, 0, 0);
                Vector128 <uint> v_s2 = Vector128.Create(s2, 0, 0, 0);
                Vector128 <uint> v_s1 = Vector128.Create(0u, 0, 0, 0);

                do
                {
                    /*
                     * Load 32 input bytes.
                     */
                    Vector128 <uint> bytes1 = Vector128.Create(BitConverter.ToUInt32(buf, bufPos),
                                                               BitConverter.ToUInt32(buf, bufPos + 4),
                                                               BitConverter.ToUInt32(buf, bufPos + 8),
                                                               BitConverter.ToUInt32(buf, bufPos + 12));

                    bufPos += 16;

                    Vector128 <uint> bytes2 = Vector128.Create(BitConverter.ToUInt32(buf, bufPos),
                                                               BitConverter.ToUInt32(buf, bufPos + 4),
                                                               BitConverter.ToUInt32(buf, bufPos + 8),
                                                               BitConverter.ToUInt32(buf, bufPos + 12));

                    bufPos += 16;

                    /*
                     * Add previous block byte sum to v_ps.
                     */
                    v_ps = Sse2.Add(v_ps, v_s1);

                    /*
                     * Horizontally add the bytes for s1, multiply-adds the
                     * bytes by [ 32, 31, 30, ... ] for s2.
                     */
                    v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1.AsByte(), zero).AsUInt32());

                    Vector128 <short> mad1 =
                        System.Runtime.Intrinsics.X86.Ssse3.MultiplyAddAdjacent(bytes1.AsByte(), tap1.AsSByte());

                    v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1.AsInt16(), ones.AsInt16()).AsUInt32());
                    v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2.AsByte(), zero).AsUInt32());

                    Vector128 <short> mad2 =
                        System.Runtime.Intrinsics.X86.Ssse3.MultiplyAddAdjacent(bytes2.AsByte(), tap2.AsSByte());

                    v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2.AsInt16(), ones.AsInt16()).AsUInt32());
                } while(--n != 0);

                v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));

                /*
                 * Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
                 */
                v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, 177));
                v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, 78));
                s1  += (uint)Sse2.ConvertToInt32(v_s1.AsInt32());
                v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, 177));
                v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, 78));
                s2   = (uint)Sse2.ConvertToInt32(v_s2.AsInt32());

                /*
                 * Reduce.
                 */
                s1 %= Adler32Context.ADLER_MODULE;
                s2 %= Adler32Context.ADLER_MODULE;
            }

            /*
             * Handle leftover data.
             */
            if (len != 0)
            {
                if (len >= 16)
                {
                    s2  += s1 += buf[bufPos++];
                    s2  += s1 += buf[bufPos++];
                    s2  += s1 += buf[bufPos++];
                    s2  += s1 += buf[bufPos++];
                    s2  += s1 += buf[bufPos++];
                    s2  += s1 += buf[bufPos++];
                    s2  += s1 += buf[bufPos++];
                    s2  += s1 += buf[bufPos++];
                    s2  += s1 += buf[bufPos++];
                    s2  += s1 += buf[bufPos++];
                    s2  += s1 += buf[bufPos++];
                    s2  += s1 += buf[bufPos++];
                    s2  += s1 += buf[bufPos++];
                    s2  += s1 += buf[bufPos++];
                    s2  += s1 += buf[bufPos++];
                    s2  += s1 += buf[bufPos++];
                    len -= 16;
                }

                while (len-- != 0)
                {
                    s2 += s1 += buf[bufPos++];
                }

                if (s1 >= Adler32Context.ADLER_MODULE)
                {
                    s1 -= Adler32Context.ADLER_MODULE;
                }

                s2 %= Adler32Context.ADLER_MODULE;
            }

            /*
             * Return the recombined sums.
             */
            sum1 = (ushort)(s1 & 0xFFFF);
            sum2 = (ushort)(s2 & 0xFFFF);
        }
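
The Adler32Context constants are not shown; for standard Adler-32 they would be the modulus 65521 (ADLER_MODULE) and NMAX = 5552, the largest block length whose sums cannot overflow 32 bits. A hypothetical call site under that assumption:

        // Hypothetical usage (standard Adler-32 starts with s1 = 1, s2 = 0).
        internal static uint Adler32(byte[] data)
        {
            ushort sum1 = 1;
            ushort sum2 = 0;

            Step(ref sum1, ref sum2, data, (uint)data.Length);

            return ((uint)sum2 << 16) | sum1;
        }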