示例#1
0
    /// <summary>
    /// Adds <c>o0..o3</c> to <c>x0..x3</c>, transposes the resulting 4x4 matrix of 32-bit words,
    /// and XORs each transposed row with 16 bytes read from <paramref name="source"/>, writing the
    /// result to <paramref name="destination"/>.
    /// </summary>
    /// <remarks>
    /// The four 16-byte rows are read and written at byte offsets 0, 64, 128 and 192.
    /// On return <c>x0..x3</c> hold the transposed sums; <c>o0..o3</c> are only read.
    /// </remarks>
    public static unsafe void AddTransposeXor(
        ref Vector128<uint> x0, ref Vector128<uint> x1, ref Vector128<uint> x2, ref Vector128<uint> x3,
        ref Vector128<uint> o0, ref Vector128<uint> o1, ref Vector128<uint> o2, ref Vector128<uint> o3,
        byte* source, byte* destination)
    {
        const int RowStride = 64;

        // Lane-wise 32-bit addition, performed in place.
        x0 = Sse2.Add(x0, o0);
        x1 = Sse2.Add(x1, o1);
        x2 = Sse2.Add(x2, o2);
        x3 = Sse2.Add(x3, o3);

        // First transpose stage: interleave 32-bit words of row pairs (0,1) and (2,3).
        Vector128<ulong> low01 = Sse2.UnpackLow(x0, x1).AsUInt64();
        Vector128<ulong> low23 = Sse2.UnpackLow(x2, x3).AsUInt64();
        Vector128<ulong> high01 = Sse2.UnpackHigh(x0, x1).AsUInt64();
        Vector128<ulong> high23 = Sse2.UnpackHigh(x2, x3).AsUInt64();

        // Second transpose stage: interleave the 64-bit halves.
        x0 = Sse2.UnpackLow(low01, low23).AsUInt32();
        x1 = Sse2.UnpackHigh(low01, low23).AsUInt32();
        x2 = Sse2.UnpackLow(high01, high23).AsUInt32();
        x3 = Sse2.UnpackHigh(high01, high23).AsUInt32();

        // XOR every transposed row with the matching source row and store it.
        Sse2.Store(destination + 0 * RowStride, Sse2.Xor(Sse2.LoadVector128(source + 0 * RowStride), x0.AsByte()));
        Sse2.Store(destination + 1 * RowStride, Sse2.Xor(Sse2.LoadVector128(source + 1 * RowStride), x1.AsByte()));
        Sse2.Store(destination + 2 * RowStride, Sse2.Xor(Sse2.LoadVector128(source + 2 * RowStride), x2.AsByte()));
        Sse2.Store(destination + 3 * RowStride, Sse2.Xor(Sse2.LoadVector128(source + 3 * RowStride), x3.AsByte()));
    }
示例#2
0
        /// <summary>
        /// Adds the <c>orig*</c> rows to the <c>x_*</c> rows, transposes the 4x4 matrix of 32-bit
        /// words, XORs each transposed row with 16 bytes of <paramref name="m"/> and stores the
        /// result into <paramref name="c"/>.
        /// </summary>
        /// <remarks>
        /// Rows are read from and written to byte offsets 0, 64, 128 and 192.
        /// <c>x_A..x_D</c> are overwritten with the transposed sums.
        /// </remarks>
        private static unsafe void OneQuad(ref Vector128<uint> x_A, ref Vector128<uint> x_B, ref Vector128<uint> x_C, ref Vector128<uint> x_D, ref Vector128<uint> origA, ref Vector128<uint> origB, ref Vector128<uint> origC, ref Vector128<uint> origD, byte* m, byte* c)
        {
            // Lane-wise 32-bit addition, in place.
            x_A = Sse2.Add(x_A, origA);
            x_B = Sse2.Add(x_B, origB);
            x_C = Sse2.Add(x_C, origC);
            x_D = Sse2.Add(x_D, origD);

            // Interleave 32-bit words of rows (A,B) and (C,D).
            Vector128<ulong> lowAB = Sse2.UnpackLow(x_A, x_B).AsUInt64();
            Vector128<ulong> lowCD = Sse2.UnpackLow(x_C, x_D).AsUInt64();
            Vector128<ulong> highAB = Sse2.UnpackHigh(x_A, x_B).AsUInt64();
            Vector128<ulong> highCD = Sse2.UnpackHigh(x_C, x_D).AsUInt64();

            // Interleave 64-bit halves to finish the transpose.
            x_A = Sse2.UnpackLow(lowAB, lowCD).AsUInt32();
            x_B = Sse2.UnpackHigh(lowAB, lowCD).AsUInt32();
            x_C = Sse2.UnpackLow(highAB, highCD).AsUInt32();
            x_D = Sse2.UnpackHigh(highAB, highCD).AsUInt32();

            // XOR each row with the input and write it out, one row at a time.
            Vector128<byte> row = Sse2.Xor(x_A.AsByte(), Sse2.LoadVector128(m));
            Sse2.Store(c, row);

            row = Sse2.Xor(x_B.AsByte(), Sse2.LoadVector128(m + 64));
            Sse2.Store(c + 64, row);

            row = Sse2.Xor(x_C.AsByte(), Sse2.LoadVector128(m + 128));
            Sse2.Store(c + 128, row);

            row = Sse2.Xor(x_D.AsByte(), Sse2.LoadVector128(m + 192));
            Sse2.Store(c + 192, row);
        }
示例#3
0
        /// <summary>
        /// Computes the bitwise AND of two 128-bit vectors, using the SSE/SSE2 instruction that
        /// matches <typeparamref name="T"/> when available and a software routine otherwise.
        /// </summary>
        /// <param name="left">First operand.</param>
        /// <param name="right">Second operand.</param>
        /// <returns><paramref name="left"/> AND <paramref name="right"/>, reinterpreted as <c>Vector128&lt;T&gt;</c>.</returns>
        public static Vector128<T> And<T>(Vector128<T> left, Vector128<T> right) where T : struct
        {
            // float prefers the single-precision form of the instruction.
            if (typeof(T) == typeof(float) && Sse.IsSupported)
            {
                return Sse.And(left.AsSingle(), right.AsSingle()).As<float, T>();
            }

            // double prefers the double-precision form; without SSE2 it drops to the generic path below.
            if (typeof(T) == typeof(double) && Sse2.IsSupported)
            {
                return Sse2.And(left.AsDouble(), right.AsDouble()).As<double, T>();
            }

            // Generic path: integer AND under SSE2, then single-precision AND under SSE.
            if (Sse2.IsSupported)
            {
                return Sse2.And(left.AsByte(), right.AsByte()).As<byte, T>();
            }

            if (Sse.IsSupported)
            {
                return Sse.And(left.AsSingle(), right.AsSingle()).As<float, T>();
            }

            return SoftwareFallbacks.And_Software(left, right);
        }
        /// <summary>
        /// Scans <paramref name="text"/> for the first character for which escaping is required.
        /// </summary>
        /// <param name="text">Pointer to the first character to inspect.</param>
        /// <param name="textLength">Number of characters available at <paramref name="text"/>.</param>
        /// <returns>The zero-based index of the first character that needs escaping, or -1 if none does.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        public override unsafe int FindFirstCharacterToEncode(char* text, int textLength)
        {
            if (text == null)
            {
                throw new ArgumentNullException(nameof(text));
            }

            int position = 0;

#if NETCOREAPP
            if (Sse2.IsSupported)
            {
                // Vectorized pass: inspect eight UTF-16 code units per iteration.
                short* cursor = (short*)text;
                for (; position <= textLength - 8; position += 8, cursor += 8)
                {
                    Debug.Assert(cursor >= text && cursor <= (text + textLength - 8));

                    Vector128<short> chunk = Sse2.LoadVector128(cursor);
                    Vector128<short> escapeMask = Sse2Helper.CreateEscapingMask_DefaultJavaScriptEncoderBasicLatin(chunk);

                    // Two mask bits per character; zero means nothing in this chunk needs escaping,
                    // which lets us skip the comparatively expensive trailing-zero count.
                    int escapeBits = Sse2.MoveMask(escapeMask.AsByte());
                    if (escapeBits == 0)
                    {
                        continue;
                    }

                    Debug.Assert(escapeBits > 0 && escapeBits <= 65_535);
                    int firstSetBit = BitOperations.TrailingZeroCount(escapeBits);
                    Debug.Assert(firstSetBit % 2 == 0 && firstSetBit >= 0 && firstSetBit <= 16);

                    // Halve the bit index to convert it into a character offset within the chunk.
                    return position + (firstSetBit >> 1);
                }

                // Fewer than eight characters are left for the scalar loop.
                Debug.Assert(textLength - position < 8);
            }
#endif

            while (position < textLength)
            {
                Debug.Assert((text + position) <= (text + textLength));
                if (NeedsEscaping(*(text + position)))
                {
                    return position;
                }

                position++;
            }

            // Every character is allowed as-is.
            return -1;
        }
示例#5
0
    /// <summary>
    /// Determines whether two <c>Vector128&lt;int&gt;</c> values are equal.
    /// </summary>
    /// <param name="left">First operand, passed by reference (only read).</param>
    /// <param name="right">Second operand.</param>
    /// <returns><c>true</c> when the vectors compare equal; otherwise <c>false</c>.</returns>
    public static bool MyEquals(ref Vector128<int> left, Vector128<int> right)
    {
        if (Sse2.IsSupported)
        {
            Vector128<byte> result = MyCompareEqual(left.AsByte(), right.AsByte());

            // MoveMask yields one bit per byte lane; all 16 bits set means every byte matched.
            return Sse2.MoveMask(result) == 0b1111_1111_1111_1111;
        }

        // Without SSE2 the previous code reported "equal" unconditionally; compare for real instead.
        return left.Equals(right);
    }
示例#6
0
    /// <summary>
    /// Byte-shuffles <paramref name="initial"/> twice: once with a 16-byte mask read from
    /// <c>_shuffleMasks</c> starting at index <c>16 - n</c>, and once with the bitwise complement
    /// of that mask.
    /// </summary>
    /// <param name="initial">The 128-bit value to shuffle.</param>
    /// <param name="n">Selects the mask window; the window starts at <c>16 - n</c>.</param>
    /// <param name="outLeft">Receives the shuffle produced with the complemented mask.</param>
    /// <param name="outRight">Receives the shuffle produced with the table mask.</param>
    static void ShiftRight128(Vector128<ulong> initial, uint n, out Vector128<ulong> outLeft,
                              out Vector128<ulong> outRight)
    {
        uint start = 16 - n;

        // Gather the 16 consecutive table entries that form the shuffle control.
        Vector128<byte> tableMask = Vector128.Create(
            _shuffleMasks[start], _shuffleMasks[start + 1], _shuffleMasks[start + 2], _shuffleMasks[start + 3],
            _shuffleMasks[start + 4], _shuffleMasks[start + 5], _shuffleMasks[start + 6], _shuffleMasks[start + 7],
            _shuffleMasks[start + 8], _shuffleMasks[start + 9], _shuffleMasks[start + 10], _shuffleMasks[start + 11],
            _shuffleMasks[start + 12], _shuffleMasks[start + 13], _shuffleMasks[start + 14], _shuffleMasks[start + 15]);

        // Comparing zero with zero sets every bit; XOR with it flips every bit of the mask.
        Vector128<byte> allBitsSet = Sse2.CompareEqual(Vector128<byte>.Zero, Vector128<byte>.Zero);
        Vector128<byte> invertedMask = Sse2.Xor(tableMask, allBitsSet);

        Vector128<byte> bytes = initial.AsByte();
        outLeft = Ssse3.Shuffle(bytes, invertedMask).AsUInt64();
        outRight = Ssse3.Shuffle(bytes, tableMask).AsUInt64();
    }
示例#7
0
        /// <summary>
        /// Computes a weighted sum of the bytes in <paramref name="buf"/>, where byte <c>i</c> is
        /// multiplied by the factor at index <c>len - i - 1</c>, and masks the result with
        /// <c>kBase - 1</c>.
        /// </summary>
        /// <remarks>
        /// Bytes are processed four at a time with SSE2 (SSE4.1 when available); any remaining
        /// bytes are handled by a scalar loop. Vector lanes accumulate in 32 bits.
        /// NOTE(review): masking with <c>kBase - 1</c> presumably relies on <c>kBase</c> being a
        /// power of two — confirm against its declaration.
        /// </remarks>
        /// <param name="buf">Pointer to the input bytes; only <paramref name="len"/> bytes are read.</param>
        /// <param name="len">Number of bytes to hash.</param>
        /// <returns>The accumulated sum masked with <c>kBase - 1</c>.</returns>
        private unsafe ulong HashSse(byte* buf, int len)
        {
            ulong h = 0;
            Vector128<int> v_ps = Vector128<int>.Zero;
            bool useSse4 = Sse41.IsSupported;

            int i = 0;

            for (int j = len - i - 1; len - i >= 4; i += 4, j = len - i - 1)
            {
                // Four factors ending at index j, reordered with the SO123 shuffle control.
                Vector128<int> c_v = Sse2.LoadVector128(&kMultFactorsPtr[j - 3]);
                c_v = Sse2.Shuffle(c_v, SO123);

                // Load exactly the four bytes consumed by this iteration. A full 16-byte load
                // would read up to 12 bytes past buf + len near the end of the buffer.
                Vector128<byte> q_v = Sse2.LoadScalarVector128((uint*)(buf + i)).AsByte();

                if (useSse4)
                {
                    // Zero-extend the four bytes to 32-bit lanes and multiply lane-wise.
                    Vector128<int> s_v = Sse41.ConvertToVector128Int32(q_v);
                    v_ps = Sse2.Add(v_ps, Sse41.MultiplyLow(c_v, s_v));
                }
                else
                {
                    // SSE2-only widening: replicate each byte across a 32-bit lane, then shift
                    // the copy in the top byte down so each lane holds the zero-extended byte.
                    q_v = Sse2.UnpackLow(q_v, q_v);
                    Vector128<int> s_v =
                        Sse2.ShiftRightLogical(Sse2.UnpackLow(q_v.AsUInt16(), q_v.AsUInt16()).AsInt32(), 24);

                    // SSE2 has no 32-bit lane-wise low multiply: multiply even lanes, then odd
                    // lanes (shifted down by 4 bytes), and interleave the low halves of the products.
                    Vector128<ulong> v_tmp1 = Sse2.Multiply(c_v.AsUInt32(), s_v.AsUInt32());
                    Vector128<ulong> v_tmp2 =
                        Sse2.Multiply(Sse2.ShiftRightLogical128BitLane(c_v.AsByte(), 4).AsUInt32(),
                                      Sse2.ShiftRightLogical128BitLane(s_v.AsByte(), 4).AsUInt32());
                    v_ps = Sse2.Add(v_ps, Sse2.UnpackLow(Sse2.Shuffle(v_tmp1.AsInt32(), SOO2O),
                                                         Sse2.Shuffle(v_tmp2.AsInt32(), SOO2O)));
                }
            }

            // Fold the four lanes together (two shuffle+add steps) and take lane 0.
            v_ps = Sse2.Add(v_ps, Sse2.Shuffle(v_ps, S23O1));
            v_ps = Sse2.Add(v_ps, Sse2.Shuffle(v_ps, S1O32));
            h += Sse2.ConvertToUInt32(v_ps.AsUInt32());

            // Scalar tail for the last (len % 4) bytes.
            for (; i < len; i++)
            {
                int index = len - i - 1;
                ulong c = (uint)kMultFactors[index];
                h += c * buf[i];
            }

            return h & (kBase - 1);
        }
示例#8
0
        /// <summary>
        /// Converts ASCII digits at <paramref name="chars"/> to an unsigned integer using a chain
        /// of multiply-add steps with the <c>mul_1_10</c>, <c>mul_1_100</c> and <c>mul_1_10000</c>
        /// weight vectors.
        /// </summary>
        /// <remarks>
        /// A full 16-byte vector is loaded and processed, but only the lowest 32-bit lane of the
        /// final result — the value of the first eight digits — is returned.
        /// </remarks>
        /// <param name="chars">Pointer to the digit characters; 16 bytes are read from it.</param>
        /// <returns>The lowest 32-bit lane of the final multiply-add result.</returns>
        private static uint32_t parse_eight_digits_unrolled(bytechar* chars)
        {
            // Turn ASCII codes into digit values by subtracting '0' from every byte.
            Vector128<sbyte> zeroChar = Vector128.Create((bytechar)'0');
            Vector128<sbyte> digits = Sse2.Subtract(Sse2.LoadVector128(chars), zeroChar);

            // Combine adjacent lanes step by step: bytes -> 16-bit -> 32-bit.
            Vector128<short> pairs = Ssse3.MultiplyAddAdjacent(digits.AsByte(), mul_1_10);
            Vector128<int> quads = Sse2.MultiplyAddAdjacent(pairs, mul_1_100);

            // Narrow back to 16-bit lanes so one more multiply-add can merge the two halves.
            Vector128<ushort> narrowed = Sse41.PackUnsignedSaturate(quads, quads);
            Vector128<int> combined = Sse2.MultiplyAddAdjacent(narrowed.AsInt16(), mul_1_10000);

            // Lane 0 carries the value of the first eight digits; the other lanes are discarded.
            return Sse2.ConvertToUInt32(combined.AsUInt32());
        }
示例#9
0
 /// <summary>
 /// Adds two 128-bit vectors lane by lane, dispatching on <typeparamref name="T"/> to the
 /// matching SSE/SSE2 add instruction.
 /// </summary>
 /// <param name="left">First addend.</param>
 /// <param name="right">Second addend.</param>
 /// <returns>The lane-wise sum, reinterpreted as <c>Vector128&lt;T&gt;</c>.</returns>
 /// <exception cref="NotSupportedException">
 /// <typeparamref name="T"/> is not one of byte, sbyte, short, ushort, int, uint, long, ulong,
 /// float or double.
 /// </exception>
 public static Vector128<T> Vector128Add<T>(Vector128<T> left, Vector128<T> right) where T : struct
 {
     // 8-bit lanes
     if (typeof(T) == typeof(byte))
     {
         return Sse2.Add(left.AsByte(), right.AsByte()).As<byte, T>();
     }

     if (typeof(T) == typeof(sbyte))
     {
         return Sse2.Add(left.AsSByte(), right.AsSByte()).As<sbyte, T>();
     }

     // 16-bit lanes
     if (typeof(T) == typeof(short))
     {
         return Sse2.Add(left.AsInt16(), right.AsInt16()).As<short, T>();
     }

     if (typeof(T) == typeof(ushort))
     {
         return Sse2.Add(left.AsUInt16(), right.AsUInt16()).As<ushort, T>();
     }

     // 32-bit lanes
     if (typeof(T) == typeof(int))
     {
         return Sse2.Add(left.AsInt32(), right.AsInt32()).As<int, T>();
     }

     if (typeof(T) == typeof(uint))
     {
         return Sse2.Add(left.AsUInt32(), right.AsUInt32()).As<uint, T>();
     }

     // 64-bit lanes
     if (typeof(T) == typeof(long))
     {
         return Sse2.Add(left.AsInt64(), right.AsInt64()).As<long, T>();
     }

     if (typeof(T) == typeof(ulong))
     {
         return Sse2.Add(left.AsUInt64(), right.AsUInt64()).As<ulong, T>();
     }

     // Floating point
     if (typeof(T) == typeof(float))
     {
         return Sse.Add(left.AsSingle(), right.AsSingle()).As<float, T>();
     }

     if (typeof(T) == typeof(double))
     {
         return Sse2.Add(left.AsDouble(), right.AsDouble()).As<double, T>();
     }

     throw new NotSupportedException();
 }
示例#10
0
        /// <summary>
        /// Selects bits from <paramref name="ifTrue"/> where the corresponding bit of
        /// <paramref name="selector"/> is set and from <paramref name="ifFalse"/> where it is clear.
        /// </summary>
        /// <param name="selector">Bit mask choosing between the two inputs.</param>
        /// <param name="ifTrue">Source of bits where the mask bit is 1.</param>
        /// <param name="ifFalse">Source of bits where the mask bit is 0.</param>
        /// <returns>
        /// The blended vector; <c>default</c> if neither SSE nor AdvSimd is supported (a debug
        /// assertion guards against that case).
        /// </returns>
        public static Vector128 <float> ConditionalSelectBitwise(Vector128 <float> selector, Vector128 <float> ifTrue, Vector128 <float> ifFalse)
        {
            Debug.Assert(Sse.IsSupported || AdvSimd.IsSupported);

            if (Sse.IsSupported)
            {
                // (ifTrue & selector) | (~selector & ifFalse)
                return(Sse.Or(
                           Sse.And(ifTrue, selector),
                           Sse.AndNot(selector, ifFalse)
                           ));
            }
            else if (AdvSimd.IsSupported)
            {
                // Arm64 provides the bitwise select as a single operation on the raw bytes.
                return(AdvSimd.BitwiseSelect(selector.AsByte(), ifTrue.AsByte(), ifFalse.AsByte()).As <byte, float>());
            }

            // Reached only when no supported instruction set is available.
            return(default);
示例#11
0
        /// <summary>
        /// Chooses between <paramref name="left"/> and <paramref name="right"/> under control of
        /// <paramref name="selector"/>: <paramref name="right"/> is taken where the selector is set,
        /// <paramref name="left"/> elsewhere.
        /// </summary>
        /// <remarks>
        /// With SSE4.1 the variable-blend instruction matching <typeparamref name="T"/> is used
        /// (single, double, or byte granularity). Otherwise the result is composed as
        /// <c>Or(And(selector, right), AndNot(selector, left))</c>.
        /// </remarks>
        public static Vector128<T> Select<T, U>(Vector128<T> left, Vector128<T> right, Vector128<U> selector)
            where T : struct where U : struct
        {
            if (!Sse41.IsSupported)
            {
                // Bitwise composition from the And / AndNot / Or helpers.
                Vector128<T> fromRight = And(selector.As<U, T>(), right);
                Vector128<T> fromLeft = AndNot(selector.As<U, T>(), left);
                return Or(fromRight, fromLeft);
            }

            if (typeof(T) == typeof(float))
            {
                return Sse41.BlendVariable(left.AsSingle(), right.AsSingle(), selector.AsSingle()).As<float, T>();
            }

            if (typeof(T) == typeof(double))
            {
                return Sse41.BlendVariable(left.AsDouble(), right.AsDouble(), selector.AsDouble()).As<double, T>();
            }

            // All remaining element types blend at byte granularity.
            return Sse41.BlendVariable(left.AsByte(), right.AsByte(), selector.AsByte()).As<byte, T>();
        }
示例#12
0
 /// <summary>
 /// Rearranges the bytes of <paramref name="value"/> with the <c>Reverse32</c> shuffle control.
 /// </summary>
 /// <remarks>
 /// NOTE(review): judging by the names, <c>Reverse32</c> presumably reverses the byte order
 /// inside each 32-bit lane — confirm against its definition. Requires SSSE3; no fallback is provided.
 /// </remarks>
 public static Vector128<T> ReverseEndianness32<T>(this Vector128<T> value) where T : struct
 {
     Vector128<byte> raw = value.AsByte();
     Vector128<byte> shuffled = Ssse3.Shuffle(raw, Reverse32);
     return shuffled.As<byte, T>();
 }
示例#13
0
    /*public void ResizeBilinear2(FastBitmap rtnImage)
     * {
     *
     *   float scaleX = (float)this.width / rtnImage.width;
     *   float scaleY = (float)this.height / rtnImage.height;
     *   if (scaleX > 1 || scaleY > 1)
     *   {
     *       ResizeBilinear(rtnImage);
     *       return;
     *   }
     *
     *   byte[] tmp = new byte[4 * (this.height + 1) * (rtnImage.width)];
     *
     *   fixed (byte* p = tmp)
     *   {
     *       byte* tmpp = p;
     *
     *       Parallel.For(0, this.height, (y) =>
     *       {
     *           var _00mask = Vector128.Create(0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255);
     *           var _01mask = Vector128.Create(4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255);
     *           var _vmask = Vector128.Create(0, 4, 8, 12, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255);
     *
     *           uint* store = stackalloc uint[4];
     *
     *           uint* pos = (uint*)(this._ptr + (this._stride * y));
     *           uint* rtnPos = (uint*)(tmpp + (rtnImage._stride * y));
     *           for (int x = 0; x < rtnImage.width; x++)
     *           {
     *               float px = scaleX * x;
     *               int x0 = (int)px;
     *               int x1 = x0 + 1;
     *               float rx = px - x0;
     *
     *               var rxv = Vector128.Create(rx, rx, rx, rx);
     *
     *               var _ = Avx2.GatherVector128(pos, Vector128.Create(x0, x1, 0, 0), 4);
     *               var _b = Vector128.AsByte(_);
     *               var _00 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(_b, _00mask).AsInt32());
     *               var _01 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(_b, _01mask).AsInt32());
     *               var vf = Sse.Add(_00, Sse.Multiply(Sse.Subtract(_01, _00), rxv));
     *               var vb = Sse2.ConvertToVector128Int32WithTruncation(vf).AsByte();
     *               var v = Ssse3.Shuffle(vb, _vmask).AsUInt32();
     *               Sse2.Store(store, v);
     * rtnPos = *store;
     *               rtnPos++;
     *           }
     *       });
     *       Parallel.For(0, rtnImage.height, (y) =>
     *       // for (int y = 0; y < rtnImage.height; y++)
     *
     *       {
     *           float py = scaleY * y;
     *           int y0 = (int)py;
     *           int y1 = y0 + 1;
     *
     *           float ry = py - y0;
     *
     *           uint* pos = (uint*)(tmpp + rtnImage.width * 4 * y0);
     *           int offset = rtnImage.width;
     *           var ryv = Vector128.Create(ry, ry, ry, ry);
     *
     *           var _00mask = Vector128.Create(0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255);
     *           var _01mask = Vector128.Create(4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255);
     *           var _vmask = Vector128.Create(0, 4, 8, 12, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255);
     *           uint* rtnPos = (uint*)(rtnImage._ptr + (rtnImage._stride * y));
     *           uint* store = stackalloc uint[4];
     *
     *           for (int x = 0; x < rtnImage.width; x++)
     *           {
     *
     *               var _ = Avx2.GatherVector128(pos, Vector128.Create(0, offset, 0, 0), 4);
     *               var _b = Vector128.AsByte(_);
     *               var _00 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(_b, _00mask).AsInt32());
     *               var _01 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(_b, _01mask).AsInt32());
     *               var vf = Sse.Add(_00, Sse.Multiply(Sse.Subtract(_01, _00), ryv));
     *               var vb = Sse2.ConvertToVector128Int32WithTruncation(vf).AsByte();
     *               var v = Ssse3.Shuffle(vb, _vmask).AsUInt32();
     *               Sse2.Store(store, v);
     * rtnPos = *store;
     *               rtnPos++;
     *               pos++;
     *               //  *rtnPos = *pos;
     *               //  rtnPos++;
     *               //  pos++;
     *               //    byte* _00 = tmpp + (rtnImage.width * 4 * y0) + x * 4;
     *               //    byte* _10 = tmpp + (rtnImage.width * 4 * y1) + x * 4;
     *               //
     *               //    uint value = 0;
     *               //    ((byte*)(&value))[0] = (byte)(_00[0] + (_10[0] - _00[0]) * ry);
     *               //    ((byte*)(&value))[1] = (byte)(_00[1] + (_10[1] - _00[1]) * ry);
     *               //    ((byte*)(&value))[2] = (byte)(_00[2] + (_10[2] - _00[2]) * ry);
     *               //    ((byte*)(&value))[3] = (byte)(_00[3] + (_10[3] - _00[3]) * ry);
     *               //
     *               //    *(uint*)(rtnImage._ptr + (rtnImage._stride * y) + (x * 4)) = value;
     *
     *           }
     *       });
     *   }
     * }*/

    //int y0 = (int)py;
    //int y1 = y0 + 1;
    //
    //int x0 = (int)px;
    //int x1 = x0 + 1;
    //
    //float ry = py - y0;
    //float rx = px - x0;
    //
    //byte* _00 = this._ptr + (this._stride * y0) + (x0 * 4);
    //byte* _01 = this._ptr + (this._stride * y0) + (x1 * 4);
    //byte* _10 = this._ptr + (this._stride * y1) + (x0 * 4);
    //byte* _11 = this._ptr + (this._stride * y1) + (x1 * 4);
    //
    //
    //uint _y0u = 0;
    //uint _y1u = 0;
    //byte* _y0 = (byte*)&_y0u;
    //byte* _y1 = (byte*)&_y1u;
    //
    //_y0[0] = (byte)(_00[0] + (_10[0] - _00[0]) * ry);
    //_y0[1] = (byte)(_00[1] + (_10[1] - _00[1]) * ry);
    //_y0[2] = (byte)(_00[2] + (_10[2] - _00[2]) * ry);
    //_y0[3] = (byte)(_00[3] + (_10[3] - _00[3]) * ry);
    //
    //_y1[0] = (byte)(_01[0] + (_11[0] - _01[0]) * ry);
    //_y1[1] = (byte)(_01[1] + (_11[1] - _01[1]) * ry);
    //_y1[2] = (byte)(_01[2] + (_11[2] - _01[2]) * ry);
    //_y1[3] = (byte)(_01[3] + (_11[3] - _01[3]) * ry);
    //
    //uint value = 0;
    //((byte*)(&value))[0] = (byte)(_y0[0] + (_y1[0] - _y0[0]) * rx);
    //((byte*)(&value))[1] = (byte)(_y0[1] + (_y1[1] - _y0[1]) * rx);
    //((byte*)(&value))[2] = (byte)(_y0[2] + (_y1[2] - _y0[2]) * rx);
    //((byte*)(&value))[3] = (byte)(_y0[3] + (_y1[3] - _y0[3]) * rx);
    //
    //*(uint*)(rtnImage._ptr + (rtnImage._stride * y) + (x * 4)) = value;

    /// <summary>
    /// Resizes this bitmap into <paramref name="rtnImage"/> using bilinear interpolation.
    /// </summary>
    /// <remarks>
    /// Each destination pixel is produced from the 2x2 neighbourhood of 32-bit source pixels
    /// around the scaled coordinate. The four pixels are fetched with a single AVX2 gather, the
    /// channel bytes are widened to floats, blended vertically and then horizontally, truncated
    /// back to bytes and written as one 32-bit pixel. Destination rows are processed in parallel.
    /// Neighbour indices are clamped to the last source column/row so the gather never reads
    /// outside the source image.
    /// </remarks>
    /// <param name="rtnImage">Destination bitmap; its width and height define the output size.</param>
    public void ResizeBilinear(FastBitmap rtnImage)
    {
        float scaleX = (float)this.width / rtnImage.width;
        float scaleY = (float)this.height / rtnImage.height;

        // Highest valid source coordinates, used to clamp the sample positions.
        int maxX = this.width - 1;
        int maxY = this.height - 1;

        // Shuffle controls that spread the 4 channel bytes of gathered pixel k (k = 0..3)
        // into the low byte of four 32-bit lanes (255 zeroes the remaining bytes).
        var _00mask = Vector128.Create(0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255);
        var _01mask = Vector128.Create(4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255);
        var _10mask = Vector128.Create(8, 255, 255, 255, 9, 255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255);
        var _11mask = Vector128.Create(12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255);

        // Shuffle control that packs the low byte of each 32-bit lane back into one pixel.
        var _vmask = Vector128.Create(0, 4, 8, 12, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255);

        Parallel.For(0, rtnImage.height, (y) =>
        {
            float py = scaleY * y;
            int y0 = (int)py;
            if (y0 > maxY)
            {
                y0 = maxY;
            }

            // Clamp the lower neighbour: on the last source row reuse that row instead of reading past it.
            int y1 = y0 < maxY ? y0 + 1 : y0;
            float ry = py - y0;
            var ryv = Vector128.Create(ry, ry, ry, ry);

            uint* py0 = (uint*)(this._ptr + (this._stride * y0));
            uint* py1 = (uint*)(this._ptr + (this._stride * y1));

            // Distance between the two source rows, in pixels, so one gather can reach both.
            int dy = (int)(py1 - py0);

            uint* rtnPos = (uint*)(rtnImage._ptr + (rtnImage._stride * y));

            for (int x = 0; x < rtnImage.width; x++)
            {
                float px = scaleX * x;
                int x0 = (int)px;
                if (x0 > maxX)
                {
                    x0 = maxX;
                }

                // Clamp the right neighbour in the same way as the lower one.
                int x1 = x0 < maxX ? x0 + 1 : x0;

                float rx = px - x0;
                var rxv = Vector128.Create(rx, rx, rx, rx);

                // Gather order: top-left, top-right, bottom-left, bottom-right.
                var gathered = Avx2.GatherVector128(py0, Vector128.Create(x0, x1, x0 + dy, x1 + dy), 4);
                var gatheredBytes = Vector128.AsByte(gathered);

                var _00 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(gatheredBytes, _00mask).AsInt32());
                var _01 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(gatheredBytes, _01mask).AsInt32());
                var _10 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(gatheredBytes, _10mask).AsInt32());
                var _11 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(gatheredBytes, _11mask).AsInt32());

                // Vertical blend for the left and right columns, then horizontal blend.
                var _y0 = Sse.Add(_00, Sse.Multiply(Sse.Subtract(_10, _00), ryv));
                var _y1 = Sse.Add(_01, Sse.Multiply(Sse.Subtract(_11, _01), ryv));
                var vf = Sse.Add(_y0, Sse.Multiply(Sse.Subtract(_y1, _y0), rxv));

                // Truncate to integers and pack the four channel bytes into the lowest lane.
                var vb = Sse2.ConvertToVector128Int32WithTruncation(vf).AsByte();
                var v = Ssse3.Shuffle(vb, _vmask).AsUInt32();

                *rtnPos = Sse2.ConvertToUInt32(v);
                rtnPos++;
            }
        });
    }
示例#14
0
 /// <summary>
 /// Applies the 24-bit left rotation of 32-bit lanes to <paramref name="value"/>.
 /// </summary>
 /// <remarks>
 /// When SSSE3 is available the rotation is done as a single byte shuffle with the
 /// <c>Rot24</c> control; otherwise it delegates to <c>RotateLeftUInt32(24)</c>.
 /// </remarks>
 public static Vector128<T> RotateLeftUInt32_24<T>(this Vector128<T> value) where T : struct
 {
     if (Ssse3.IsSupported)
     {
         Vector128<byte> shuffled = Ssse3.Shuffle(value.AsByte(), Rot24);
         return shuffled.As<byte, T>();
     }

     return value.RotateLeftUInt32(24);
 }
示例#15
0
        /// <summary>
        /// Reads a two-plane input surface (one byte per pixel in <c>Buffer0</c>, interleaved
        /// byte pairs shared by two horizontal pixels and two rows in <c>Buffer1</c>) and expands
        /// it into a new <see cref="Surface"/> of 16-bit-per-channel pixels.
        /// </summary>
        /// <remarks>
        /// For each pixel, R comes from <c>Buffer0</c>, G and B from the pair in <c>Buffer1</c>,
        /// and A is set to opaque. With SSE4.1, 16 pixels are converted per iteration and the
        /// remainder of each row is handled by a scalar loop; without it a scalar fallback is used.
        /// </remarks>
        /// <param name="rm">Resource manager supplying guest memory access and the surface pool.</param>
        /// <param name="config">Slot surface configuration passed through to <c>ReadSurface</c>.</param>
        /// <param name="offsets">Plane offsets passed through to <c>ReadSurface</c>.</param>
        /// <returns>The converted surface.</returns>
        private unsafe static Surface ReadNv12(ResourceManager rm, ref SlotSurfaceConfig config, ref PlaneOffsets offsets)
        {
            InputSurface input = ReadSurface(rm.Gmm, ref config, ref offsets, 1, 2);

            int width = input.Width;
            int height = input.Height;

            int yStride = GetPitch(width, 1);
            int uvStride = GetPitch(input.UvWidth, 2);

            Surface output = new Surface(rm.SurfacePool, width, height);

            if (Sse41.IsSupported)
            {
                // Reorders each 4-byte group (b0, b1, b2, b3) into (b0, b2, b3, b1).
                Vector128<byte> shufMask = Vector128.Create(
                    (byte)0, (byte)2, (byte)3, (byte)1,
                    (byte)4, (byte)6, (byte)7, (byte)5,
                    (byte)8, (byte)10, (byte)11, (byte)9,
                    (byte)12, (byte)14, (byte)15, (byte)13);

                // Sets 0xff in the fourth 16-bit channel (alpha) of every 64-bit pixel.
                Vector128<short> alphaMask = Vector128.Create(0xffUL << 48).AsInt16();

                int yStrideGap = yStride - width;

                // Number of pixels per row that can be handled 16 at a time.
                int widthTrunc = width & ~0xf;

                fixed (Pixel* dstPtr = output.Data)
                {
                    Pixel* op = dstPtr;

                    fixed (byte* src0Ptr = input.Buffer0, src1Ptr = input.Buffer1)
                    {
                        byte* i0p = src0Ptr;

                        for (int y = 0; y < height; y++)
                        {
                            // Two consecutive rows share one row of the second plane.
                            byte* i1p = src1Ptr + (y >> 1) * uvStride;

                            int x = 0;

                            for (; x < widthTrunc; x += 16, i0p += 16, i1p += 16)
                            {
                                // 16 first-plane bytes widened to 16-bit lanes.
                                Vector128<short> ya0 = Sse41.ConvertToVector128Int16(i0p);
                                Vector128<short> ya1 = Sse41.ConvertToVector128Int16(i0p + 8);

                                // 8 second-plane pairs, each duplicated so that two pixels share one pair.
                                Vector128<byte> uv = Sse2.LoadVector128(i1p);

                                Vector128<short> uv0 = Sse2.UnpackLow(uv.AsInt16(), uv.AsInt16());
                                Vector128<short> uv1 = Sse2.UnpackHigh(uv.AsInt16(), uv.AsInt16());

                                // Interleave into 4-byte groups per pixel, four pixels per vector.
                                Vector128<short> rgba0 = Sse2.UnpackLow(ya0, uv0);
                                Vector128<short> rgba1 = Sse2.UnpackHigh(ya0, uv0);
                                Vector128<short> rgba2 = Sse2.UnpackLow(ya1, uv1);
                                Vector128<short> rgba3 = Sse2.UnpackHigh(ya1, uv1);

                                // Move the zero byte of each group to the last (alpha) position.
                                rgba0 = Ssse3.Shuffle(rgba0.AsByte(), shufMask).AsInt16();
                                rgba1 = Ssse3.Shuffle(rgba1.AsByte(), shufMask).AsInt16();
                                rgba2 = Ssse3.Shuffle(rgba2.AsByte(), shufMask).AsInt16();
                                rgba3 = Ssse3.Shuffle(rgba3.AsByte(), shufMask).AsInt16();

                                // Widen every channel byte to 16 bits: two pixels per vector.
                                Vector128<short> rgba16_0 = Sse41.ConvertToVector128Int16(rgba0.AsByte());
                                Vector128<short> rgba16_1 = Sse41.ConvertToVector128Int16(HighToLow(rgba0.AsByte()));
                                Vector128<short> rgba16_2 = Sse41.ConvertToVector128Int16(rgba1.AsByte());
                                Vector128<short> rgba16_3 = Sse41.ConvertToVector128Int16(HighToLow(rgba1.AsByte()));
                                Vector128<short> rgba16_4 = Sse41.ConvertToVector128Int16(rgba2.AsByte());
                                Vector128<short> rgba16_5 = Sse41.ConvertToVector128Int16(HighToLow(rgba2.AsByte()));
                                Vector128<short> rgba16_6 = Sse41.ConvertToVector128Int16(rgba3.AsByte());
                                Vector128<short> rgba16_7 = Sse41.ConvertToVector128Int16(HighToLow(rgba3.AsByte()));

                                // Fill in the alpha channel.
                                rgba16_0 = Sse2.Or(rgba16_0, alphaMask);
                                rgba16_1 = Sse2.Or(rgba16_1, alphaMask);
                                rgba16_2 = Sse2.Or(rgba16_2, alphaMask);
                                rgba16_3 = Sse2.Or(rgba16_3, alphaMask);
                                rgba16_4 = Sse2.Or(rgba16_4, alphaMask);
                                rgba16_5 = Sse2.Or(rgba16_5, alphaMask);
                                rgba16_6 = Sse2.Or(rgba16_6, alphaMask);
                                rgba16_7 = Sse2.Or(rgba16_7, alphaMask);

                                // Scale the 8-bit channel values up by two bits.
                                rgba16_0 = Sse2.ShiftLeftLogical(rgba16_0, 2);
                                rgba16_1 = Sse2.ShiftLeftLogical(rgba16_1, 2);
                                rgba16_2 = Sse2.ShiftLeftLogical(rgba16_2, 2);
                                rgba16_3 = Sse2.ShiftLeftLogical(rgba16_3, 2);
                                rgba16_4 = Sse2.ShiftLeftLogical(rgba16_4, 2);
                                rgba16_5 = Sse2.ShiftLeftLogical(rgba16_5, 2);
                                rgba16_6 = Sse2.ShiftLeftLogical(rgba16_6, 2);
                                rgba16_7 = Sse2.ShiftLeftLogical(rgba16_7, 2);

                                Sse2.Store((short*)(op + (uint)x + 0), rgba16_0);
                                Sse2.Store((short*)(op + (uint)x + 2), rgba16_1);
                                Sse2.Store((short*)(op + (uint)x + 4), rgba16_2);
                                Sse2.Store((short*)(op + (uint)x + 6), rgba16_3);
                                Sse2.Store((short*)(op + (uint)x + 8), rgba16_4);
                                Sse2.Store((short*)(op + (uint)x + 10), rgba16_5);
                                Sse2.Store((short*)(op + (uint)x + 12), rgba16_6);
                                Sse2.Store((short*)(op + (uint)x + 14), rgba16_7);
                            }

                            // Scalar tail. x is even on entry because widthTrunc is a multiple of 16.
                            for (; x < width; x++)
                            {
                                Pixel* px = op + (uint)x;

                                px->R = Upsample(*i0p++);
                                px->G = Upsample(*i1p);
                                px->B = Upsample(*(i1p + 1));
                                px->A = 0x3ff;

                                // Pixels 2k and 2k+1 share one pair (as in the fallback's x & ~1),
                                // so move to the next pair only after the odd pixel.
                                if ((x & 1) != 0)
                                {
                                    i1p += 2;
                                }
                            }

                            // i1p is recomputed from y at the top of every row; only the
                            // first-plane and output pointers carry over.
                            op += width;
                            i0p += yStrideGap;
                        }
                    }
                }
            }
            else
            {
                for (int y = 0; y < height; y++)
                {
                    int uvBase = (y >> 1) * uvStride;

                    for (int x = 0; x < width; x++)
                    {
                        output.SetR(x, y, Upsample(input.Buffer0[y * yStride + x]));

                        // Both pixels of a horizontal pair use the same second-plane bytes.
                        int uvOffs = uvBase + (x & ~1);

                        output.SetG(x, y, Upsample(input.Buffer1[uvOffs]));
                        output.SetB(x, y, Upsample(input.Buffer1[uvOffs + 1]));
                        output.SetA(x, y, 0x3ff);
                    }
                }
            }

            return output;
        }
    /// <summary>
    /// Multiplies the three low channel bytes of every 32-bit pixel in <paramref name="data"/> by
    /// that pixel's top (alpha) byte, in place, using SSE2 (and SSSE3 for the alpha broadcast
    /// when available). The alpha byte itself is preserved.
    /// </summary>
    /// <remarks>
    /// Each channel becomes <c>(channel * alpha + 0xFF) &gt;&gt; 8</c>. Four pixels are processed
    /// per iteration; leftover pixels are handed to <c>ProcessTextureScalar</c>.
    /// </remarks>
    /// <param name="data">Pixels to modify in place.</param>
    internal static unsafe void ProcessTextureSse2(Span<Color8> data)
    {
        uint lanes = (uint)Vector128<uint>.Count;

        lanes.AssertEqual((uint)(sizeof(Vector128<uint>) / sizeof(Color8)));

        // Loop-invariant constants.
        Vector128<uint> alphaBits = Vector128.Create(0xFF000000U);
        Vector128<uint> colorBits = Vector128.Create(0x00FFFFFFU);
        Vector128<ushort> rounding = Vector128.Create((ushort)0x00FFU);

        uint index = 0;

        fixed (Color8* pinned = data)
        {
            uint* pixels = (uint*)pinned;

            while (index + (lanes - 1U) < data.Length)
            {
                Vector128<uint> source = Sse2.LoadVector128(pixels + index);
                Vector128<uint> alpha = Sse2.And(source, alphaBits);

                // Widen the channel bytes to 16 bits: two pixels per vector.
                Vector128<ushort> lowPixels = Sse2.UnpackLow(source.AsByte(), Vector128<byte>.Zero).AsUInt16();
                Vector128<ushort> highPixels = Sse2.UnpackHigh(source.AsByte(), Vector128<byte>.Zero).AsUInt16();

                // Broadcast each pixel's alpha into all four of its 16-bit channel lanes.
                Vector128<uint> lowAlpha, highAlpha;
                if (Ssse3.IsSupported)
                {
                    // Bytes 6 and 14 hold the widened alpha of the first and second pixel.
                    Vector128<byte> broadcast = Vector128.Create(6, 0xFF, 6, 0xFF, 6, 0xFF, 6, 0xFF, 14, 0xFF, 14, 0xFF, 14, 0xFF, 14, 0xFF);

                    lowAlpha = Ssse3.Shuffle(lowPixels.AsByte(), broadcast).AsUInt32();
                    highAlpha = Ssse3.Shuffle(highPixels.AsByte(), broadcast).AsUInt32();
                }
                else
                {
                    // SSE2-only broadcast: smear the alpha lane downwards with shift-and-or,
                    // first within each 32-bit lane, then within each 64-bit lane.
                    lowAlpha = Sse2.UnpackLow(alpha.AsByte(), Vector128<byte>.Zero).AsUInt32();
                    highAlpha = Sse2.UnpackHigh(alpha.AsByte(), Vector128<byte>.Zero).AsUInt32();

                    lowAlpha = Sse2.Or(lowAlpha, Sse2.ShiftRightLogical(lowAlpha, 16));
                    highAlpha = Sse2.Or(highAlpha, Sse2.ShiftRightLogical(highAlpha, 16));

                    Vector128<ulong> lowWide = lowAlpha.AsUInt64();
                    Vector128<ulong> highWide = highAlpha.AsUInt64();
                    lowAlpha = Sse2.Or(lowWide, Sse2.ShiftRightLogical(lowWide, 32)).AsUInt32();
                    highAlpha = Sse2.Or(highWide, Sse2.ShiftRightLogical(highWide, 32)).AsUInt32();
                }

                // (channel * alpha + 0xFF) >> 8 for every 16-bit lane.
                Vector128<ushort> lowScaled = Sse2.ShiftRightLogical(
                    Sse2.Add(Sse2.MultiplyLow(lowPixels, lowAlpha.AsUInt16()), rounding), 8);
                Vector128<ushort> highScaled = Sse2.ShiftRightLogical(
                    Sse2.Add(Sse2.MultiplyLow(highPixels, highAlpha.AsUInt16()), rounding), 8);

                // Narrow back to bytes, then restore the original alpha byte.
                Vector128<uint> result = Sse2.PackUnsignedSaturate(lowScaled.AsInt16(), highScaled.AsInt16()).AsUInt32();
                result = Sse2.And(result, colorBits);
                result = Sse2.Or(result, alpha);

                Sse2.Store(pixels + index, result);

                index += lanes;
            }
        }

        // Unlikely, but handle leftover pixels when the length is not a multiple of 4.
        if (index < data.Length)
        {
            ProcessTextureScalar(data.SliceUnsafe(index));
        }
    }
示例#17
0
        /// <summary>
        /// Advances a pair of running Adler-32 sums over the first <paramref name="len"/> bytes of
        /// <paramref name="buf"/>, processing 32-byte blocks with SSE2/SSSE3 intrinsics and any
        /// remaining bytes one at a time.
        /// </summary>
        /// <param name="sum1">Running byte sum; read on entry and overwritten with the low 16 bits of the result.</param>
        /// <param name="sum2">Running sum of the byte sums; read on entry and overwritten with the low 16 bits of the result.</param>
        /// <param name="buf">Input bytes, consumed starting at index 0.</param>
        /// <param name="len">Number of bytes to consume.</param>
        internal static void Step(ref ushort sum1, ref ushort sum2, byte[] buf, uint len)
        {
            const uint BlockSize = 1 << 5;

            uint lowSum  = sum1;
            uint highSum = sum2;
            int  cursor  = 0;

            // Split the input into whole 32-byte blocks and a tail of fewer than 32 bytes.
            uint pendingBlocks = len / BlockSize;
            uint tail          = len - (pendingBlocks * BlockSize);

            // Largest number of blocks that may be accumulated between reductions (the NMAX constraint).
            uint blocksPerRound = Adler32Context.NMAX / BlockSize;

            // Per-position weights 32..17 and 16..1 used for the weighted sum, plus helper vectors.
            Vector128<sbyte> tapsHigh = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17).AsSByte();
            Vector128<sbyte> tapsLow  = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1).AsSByte();
            Vector128<byte>  zero     = Vector128<byte>.Zero;
            Vector128<short> ones     = Vector128.Create((short)1);

            while (pendingBlocks != 0)
            {
                uint n = blocksPerRound > pendingBlocks ? pendingBlocks : blocksPerRound;

                pendingBlocks -= n;

                // Accumulators for this round; only lane 0 carries the incoming scalar state.
                Vector128<uint> prefix       = Vector128.Create(lowSum * n, 0, 0, 0);
                Vector128<uint> weightedSums = Vector128.Create(highSum, 0, 0, 0);
                Vector128<uint> byteSums     = Vector128<uint>.Zero;

                do
                {
                    // Load the next 32 input bytes as two 16-byte vectors.
                    Vector128<byte> firstHalf = Vector128.Create(BitConverter.ToUInt32(buf, cursor),
                                                                 BitConverter.ToUInt32(buf, cursor + 4),
                                                                 BitConverter.ToUInt32(buf, cursor + 8),
                                                                 BitConverter.ToUInt32(buf, cursor + 12)).AsByte();

                    Vector128<byte> secondHalf = Vector128.Create(BitConverter.ToUInt32(buf, cursor + 16),
                                                                  BitConverter.ToUInt32(buf, cursor + 20),
                                                                  BitConverter.ToUInt32(buf, cursor + 24),
                                                                  BitConverter.ToUInt32(buf, cursor + 28)).AsByte();

                    cursor += 32;

                    // The byte sum of all earlier blocks feeds the prefix total before this block is added.
                    prefix = Sse2.Add(prefix, byteSums);

                    // Plain byte sum (via sum-of-absolute-differences against zero) and tap-weighted sum, first half.
                    byteSums = Sse2.Add(byteSums, Sse2.SumAbsoluteDifferences(firstHalf, zero).AsUInt32());

                    Vector128<short> weightedFirst =
                        System.Runtime.Intrinsics.X86.Ssse3.MultiplyAddAdjacent(firstHalf, tapsHigh);

                    weightedSums = Sse2.Add(weightedSums, Sse2.MultiplyAddAdjacent(weightedFirst, ones).AsUInt32());

                    // Same for the second half.
                    byteSums = Sse2.Add(byteSums, Sse2.SumAbsoluteDifferences(secondHalf, zero).AsUInt32());

                    Vector128<short> weightedSecond =
                        System.Runtime.Intrinsics.X86.Ssse3.MultiplyAddAdjacent(secondHalf, tapsLow);

                    weightedSums = Sse2.Add(weightedSums, Sse2.MultiplyAddAdjacent(weightedSecond, ones).AsUInt32());
                } while (--n != 0);

                // Each prefix unit counts once per byte of a block, i.e. 32 times.
                weightedSums = Sse2.Add(weightedSums, Sse2.ShiftLeftLogical(prefix, 5));

                // Horizontal add of the four 32-bit lanes: swap neighbours (0xB1), then swap halves (0x4E).
                byteSums     = Sse2.Add(byteSums, Sse2.Shuffle(byteSums, 0xB1));
                byteSums     = Sse2.Add(byteSums, Sse2.Shuffle(byteSums, 0x4E));
                lowSum      += (uint)Sse2.ConvertToInt32(byteSums.AsInt32());
                weightedSums = Sse2.Add(weightedSums, Sse2.Shuffle(weightedSums, 0xB1));
                weightedSums = Sse2.Add(weightedSums, Sse2.Shuffle(weightedSums, 0x4E));
                highSum      = (uint)Sse2.ConvertToInt32(weightedSums.AsInt32());

                // Reduce both sums before the next round.
                lowSum  %= Adler32Context.ADLER_MODULE;
                highSum %= Adler32Context.ADLER_MODULE;
            }

            // Fewer than 32 bytes are left: fold them in one at a time.
            if (tail != 0)
            {
                for (uint left = tail; left != 0; left--)
                {
                    lowSum  += buf[cursor++];
                    highSum += lowSum;
                }

                if (lowSum >= Adler32Context.ADLER_MODULE)
                {
                    lowSum -= Adler32Context.ADLER_MODULE;
                }

                highSum %= Adler32Context.ADLER_MODULE;
            }

            // Hand the recombined sums back to the caller.
            sum1 = (ushort)(lowSum & 0xFFFF);
            sum2 = (ushort)(highSum & 0xFFFF);
        }
示例#18
0
 /// <summary>
 /// Reinterprets <paramref name="value"/> as bytes and permutes them with the <c>Reverse32</c>
 /// shuffle mask using SSSE3.
 /// </summary>
 /// <remarks>
 /// NOTE(review): the mask is declared elsewhere; judging by the names it reverses the byte order
 /// inside each 32-bit element. Confirm against the mask definition. No <c>Ssse3.IsSupported</c>
 /// check is made here.
 /// </remarks>
 /// <param name="value">Vector of 32-bit elements to permute.</param>
 /// <returns>The permuted bytes.</returns>
 public static Vector128<byte> ReverseEndianness32(this Vector128<uint> value)
 {
     Vector128<byte> raw = value.AsByte();

     return Ssse3.Shuffle(raw, Reverse32);
 }
    /// <summary>
    /// Scales the three low bytes of every 32-bit pixel in <paramref name="data"/> by that pixel's
    /// top (alpha) byte, in place, handling four SSE2 registers (sixteen pixels) per loop iteration.
    /// </summary>
    /// <remarks>
    /// Each scaled byte is computed as <c>(value * alpha + 255) &gt;&gt; 8</c>; the alpha byte itself is
    /// written back unchanged. Pixels left over after the last full group of sixteen are handed to
    /// <c>ProcessTextureScalar</c>.
    /// </remarks>
    /// <param name="data">Pixel buffer that is read and overwritten.</param>
    internal static unsafe void ProcessTextureSse2Unrolled(Span<Color8> data)
    {
        // The size sanity check applies to a single register; checking it against the
        // four-register stride (as was done before) could never succeed.
        uint elementsPerRegister = (uint)Vector128<uint>.Count;

        elementsPerRegister.AssertEqual((uint)(sizeof(Vector128<uint>) / sizeof(Color8)));

        // Loop stride: four registers' worth of pixels.
        uint registerElements = elementsPerRegister * 4U;

        uint offset;

        fixed (Color8* dataPtr8 = data)
        {
            uint* dataPtr = (uint*)dataPtr8;

            for (offset = 0; offset + (registerElements - 1U) < data.Length; offset += registerElements)
            {
                Vector128<uint> rawColor0 = Sse2.LoadVector128(dataPtr + offset + 0x0);
                Vector128<uint> rawColor1 = Sse2.LoadVector128(dataPtr + offset + 0x4);
                Vector128<uint> rawColor2 = Sse2.LoadVector128(dataPtr + offset + 0x8);
                Vector128<uint> rawColor3 = Sse2.LoadVector128(dataPtr + offset + 0xC);

                // Alpha bytes kept in their original position; OR-ed back into the result at the end.
                Vector128<uint> alphaMask = Vector128.Create(0xFF000000U);
                Vector128<uint> alpha0    = Sse2.And(rawColor0, alphaMask);
                Vector128<uint> alpha1    = Sse2.And(rawColor1, alphaMask);
                Vector128<uint> alpha2    = Sse2.And(rawColor2, alphaMask);
                Vector128<uint> alpha3    = Sse2.And(rawColor3, alphaMask);

                // Zero-extend every byte to 16 bits: two pixels land in each half.
                Vector128<ushort> lo0 = Sse2.UnpackLow(rawColor0.AsByte(), Vector128<byte>.Zero).AsUInt16();
                Vector128<ushort> lo1 = Sse2.UnpackLow(rawColor1.AsByte(), Vector128<byte>.Zero).AsUInt16();
                Vector128<ushort> lo2 = Sse2.UnpackLow(rawColor2.AsByte(), Vector128<byte>.Zero).AsUInt16();
                Vector128<ushort> lo3 = Sse2.UnpackLow(rawColor3.AsByte(), Vector128<byte>.Zero).AsUInt16();
                Vector128<ushort> hi0 = Sse2.UnpackHigh(rawColor0.AsByte(), Vector128<byte>.Zero).AsUInt16();
                Vector128<ushort> hi1 = Sse2.UnpackHigh(rawColor1.AsByte(), Vector128<byte>.Zero).AsUInt16();
                Vector128<ushort> hi2 = Sse2.UnpackHigh(rawColor2.AsByte(), Vector128<byte>.Zero).AsUInt16();
                Vector128<ushort> hi3 = Sse2.UnpackHigh(rawColor3.AsByte(), Vector128<byte>.Zero).AsUInt16();

                // Build vectors holding each pixel's alpha in all four of its 16-bit lanes.
                Vector128<uint> alphaLo0, alphaHi0;
                Vector128<uint> alphaLo1, alphaHi1;
                Vector128<uint> alphaLo2, alphaHi2;
                Vector128<uint> alphaLo3, alphaHi3;
                if (Ssse3.IsSupported)
                {
                    // Bytes 6 and 14 are the widened alpha of the first and second pixel; 0xFF yields zero.
                    Vector128<byte> alphaShuffle = Vector128.Create(6, 0xFF, 6, 0xFF, 6, 0xFF, 6, 0xFF, 14, 0xFF, 14, 0xFF, 14, 0xFF, 14, 0xFF);

                    alphaLo0 = Ssse3.Shuffle(lo0.AsByte(), alphaShuffle).AsUInt32();
                    alphaLo1 = Ssse3.Shuffle(lo1.AsByte(), alphaShuffle).AsUInt32();
                    alphaLo2 = Ssse3.Shuffle(lo2.AsByte(), alphaShuffle).AsUInt32();
                    alphaLo3 = Ssse3.Shuffle(lo3.AsByte(), alphaShuffle).AsUInt32();
                    alphaHi0 = Ssse3.Shuffle(hi0.AsByte(), alphaShuffle).AsUInt32();
                    alphaHi1 = Ssse3.Shuffle(hi1.AsByte(), alphaShuffle).AsUInt32();
                    alphaHi2 = Ssse3.Shuffle(hi2.AsByte(), alphaShuffle).AsUInt32();
                    alphaHi3 = Ssse3.Shuffle(hi3.AsByte(), alphaShuffle).AsUInt32();
                }
                else
                {
                    // Without a byte shuffle, smear the alpha lane downwards: first over 16 bits, then over 32.
                    alphaLo0 = Sse2.UnpackLow(alpha0.AsByte(), Vector128<byte>.Zero).AsUInt32();
                    alphaLo1 = Sse2.UnpackLow(alpha1.AsByte(), Vector128<byte>.Zero).AsUInt32();
                    alphaLo2 = Sse2.UnpackLow(alpha2.AsByte(), Vector128<byte>.Zero).AsUInt32();
                    alphaLo3 = Sse2.UnpackLow(alpha3.AsByte(), Vector128<byte>.Zero).AsUInt32();
                    alphaHi0 = Sse2.UnpackHigh(alpha0.AsByte(), Vector128<byte>.Zero).AsUInt32();
                    alphaHi1 = Sse2.UnpackHigh(alpha1.AsByte(), Vector128<byte>.Zero).AsUInt32();
                    alphaHi2 = Sse2.UnpackHigh(alpha2.AsByte(), Vector128<byte>.Zero).AsUInt32();
                    alphaHi3 = Sse2.UnpackHigh(alpha3.AsByte(), Vector128<byte>.Zero).AsUInt32();

                    alphaLo0 = Sse2.Or(alphaLo0, Sse2.ShiftRightLogical(alphaLo0, 16));
                    alphaLo1 = Sse2.Or(alphaLo1, Sse2.ShiftRightLogical(alphaLo1, 16));
                    alphaLo2 = Sse2.Or(alphaLo2, Sse2.ShiftRightLogical(alphaLo2, 16));
                    alphaLo3 = Sse2.Or(alphaLo3, Sse2.ShiftRightLogical(alphaLo3, 16));
                    alphaHi0 = Sse2.Or(alphaHi0, Sse2.ShiftRightLogical(alphaHi0, 16));
                    alphaHi1 = Sse2.Or(alphaHi1, Sse2.ShiftRightLogical(alphaHi1, 16));
                    alphaHi2 = Sse2.Or(alphaHi2, Sse2.ShiftRightLogical(alphaHi2, 16));
                    alphaHi3 = Sse2.Or(alphaHi3, Sse2.ShiftRightLogical(alphaHi3, 16));

                    alphaLo0 = Sse2.Or(alphaLo0.AsUInt64(), Sse2.ShiftRightLogical(alphaLo0.AsUInt64(), 32)).AsUInt32();
                    alphaLo1 = Sse2.Or(alphaLo1.AsUInt64(), Sse2.ShiftRightLogical(alphaLo1.AsUInt64(), 32)).AsUInt32();
                    alphaLo2 = Sse2.Or(alphaLo2.AsUInt64(), Sse2.ShiftRightLogical(alphaLo2.AsUInt64(), 32)).AsUInt32();
                    alphaLo3 = Sse2.Or(alphaLo3.AsUInt64(), Sse2.ShiftRightLogical(alphaLo3.AsUInt64(), 32)).AsUInt32();
                    alphaHi0 = Sse2.Or(alphaHi0.AsUInt64(), Sse2.ShiftRightLogical(alphaHi0.AsUInt64(), 32)).AsUInt32();
                    alphaHi1 = Sse2.Or(alphaHi1.AsUInt64(), Sse2.ShiftRightLogical(alphaHi1.AsUInt64(), 32)).AsUInt32();
                    alphaHi2 = Sse2.Or(alphaHi2.AsUInt64(), Sse2.ShiftRightLogical(alphaHi2.AsUInt64(), 32)).AsUInt32();
                    alphaHi3 = Sse2.Or(alphaHi3.AsUInt64(), Sse2.ShiftRightLogical(alphaHi3.AsUInt64(), 32)).AsUInt32();
                }

                // value * alpha on every 16-bit lane (fits in 16 bits since both factors are <= 255).
                Vector128<ushort> prodLo0 = Sse2.MultiplyLow(lo0, alphaLo0.AsUInt16());
                Vector128<ushort> prodLo1 = Sse2.MultiplyLow(lo1, alphaLo1.AsUInt16());
                Vector128<ushort> prodLo2 = Sse2.MultiplyLow(lo2, alphaLo2.AsUInt16());
                Vector128<ushort> prodLo3 = Sse2.MultiplyLow(lo3, alphaLo3.AsUInt16());
                Vector128<ushort> prodHi0 = Sse2.MultiplyLow(hi0, alphaHi0.AsUInt16());
                Vector128<ushort> prodHi1 = Sse2.MultiplyLow(hi1, alphaHi1.AsUInt16());
                Vector128<ushort> prodHi2 = Sse2.MultiplyLow(hi2, alphaHi2.AsUInt16());
                Vector128<ushort> prodHi3 = Sse2.MultiplyLow(hi3, alphaHi3.AsUInt16());

                // + 255, then >> 8.
                Vector128<ushort> addend = Vector128.Create((ushort)0x00FFU);

                var sumLo0 = Sse2.Add(prodLo0, addend);
                var sumLo1 = Sse2.Add(prodLo1, addend);
                var sumLo2 = Sse2.Add(prodLo2, addend);
                var sumLo3 = Sse2.Add(prodLo3, addend);
                var sumHi0 = Sse2.Add(prodHi0, addend);
                var sumHi1 = Sse2.Add(prodHi1, addend);
                var sumHi2 = Sse2.Add(prodHi2, addend);
                var sumHi3 = Sse2.Add(prodHi3, addend);

                var shiftLo0 = Sse2.ShiftRightLogical(sumLo0, 8);
                var shiftLo1 = Sse2.ShiftRightLogical(sumLo1, 8);
                var shiftLo2 = Sse2.ShiftRightLogical(sumLo2, 8);
                var shiftLo3 = Sse2.ShiftRightLogical(sumLo3, 8);
                var shiftHi0 = Sse2.ShiftRightLogical(sumHi0, 8);
                var shiftHi1 = Sse2.ShiftRightLogical(sumHi1, 8);
                var shiftHi2 = Sse2.ShiftRightLogical(sumHi2, 8);
                var shiftHi3 = Sse2.ShiftRightLogical(sumHi3, 8);

                // Narrow back to bytes.
                var packed0 = Sse2.PackUnsignedSaturate(shiftLo0.AsInt16(), shiftHi0.AsInt16()).AsUInt32();
                var packed1 = Sse2.PackUnsignedSaturate(shiftLo1.AsInt16(), shiftHi1.AsInt16()).AsUInt32();
                var packed2 = Sse2.PackUnsignedSaturate(shiftLo2.AsInt16(), shiftHi2.AsInt16()).AsUInt32();
                var packed3 = Sse2.PackUnsignedSaturate(shiftLo3.AsInt16(), shiftHi3.AsInt16()).AsUInt32();

                // Drop the scaled alpha lane and restore the original alpha.
                var mask = Vector128.Create(0x00FFFFFFU);
                packed0 = Sse2.And(packed0, mask);
                packed1 = Sse2.And(packed1, mask);
                packed2 = Sse2.And(packed2, mask);
                packed3 = Sse2.And(packed3, mask);
                packed0 = Sse2.Or(packed0, alpha0);
                packed1 = Sse2.Or(packed1, alpha1);
                packed2 = Sse2.Or(packed2, alpha2);
                packed3 = Sse2.Or(packed3, alpha3);

                Sse2.Store(dataPtr + offset + 0x0, packed0);
                Sse2.Store(dataPtr + offset + 0x4, packed1);
                Sse2.Store(dataPtr + offset + 0x8, packed2);
                Sse2.Store(dataPtr + offset + 0xC, packed3);
            }
        }

        // Up to fifteen pixels can remain when the pixel count is not a multiple of sixteen;
        // finish those with the scalar routine.
        if (offset < data.Length)
        {
            ProcessTextureScalar(data.SliceUnsafe(offset));
        }
    }
示例#20
0
 /// <summary>
 /// Rotates each 32-bit element of <paramref name="value"/> left by 16 bits.
 /// </summary>
 /// <remarks>
 /// When SSSE3 is available the rotation is done with a single byte shuffle through the
 /// <c>Rot16_128</c> mask (declared elsewhere; presumably equivalent to the 16-bit rotate);
 /// otherwise it falls back to <c>RotateLeftUInt32(16)</c>.
 /// </remarks>
 /// <param name="value">Vector of 32-bit elements to rotate.</param>
 /// <returns>The rotated vector.</returns>
 public static Vector128<uint> RotateLeftUInt32_16(this Vector128<uint> value)
 {
     if (Ssse3.IsSupported)
     {
         return Ssse3.Shuffle(value.AsByte(), Rot16_128).AsUInt32();
     }

     return value.RotateLeftUInt32(16);
 }
示例#21
0
        /// <summary>
        /// Finds the first character of <paramref name="value"/> that has to be escaped.
        /// </summary>
        /// <param name="value">Text to scan.</param>
        /// <param name="encoder">
        /// Optional encoder. When it is non-null and <paramref name="value"/> is non-empty, the answer is
        /// whatever its <c>FindFirstCharacterToEncode</c> returns; otherwise the built-in rules are used.
        /// </param>
        /// <returns>
        /// On the built-in path, the zero-based index of the first character that needs escaping, or
        /// <c>-1</c> when none does.
        /// </returns>
        public static unsafe int NeedsEscaping(ReadOnlySpan<char> value, JavaScriptEncoder encoder)
        {
            fixed (char* chars = value)
            {
                // Some JavaScriptEncoder.FindFirstCharacterToEncode implementations may not accept null
                // pointers, so the encoder is only consulted for non-empty input; empty input falls
                // through to the default path and ends up returning -1.
                if (encoder != null && !value.IsEmpty)
                {
                    return encoder.FindFirstCharacterToEncode(chars, value.Length);
                }

                int position = 0;

#if BUILDING_INBOX_LIBRARY
                if (Sse2.IsSupported)
                {
                    short* cursor = (short*)chars;
                    while (position <= value.Length - 8)
                    {
                        Debug.Assert(cursor >= chars && cursor <= (chars + value.Length - 8));

                        // Test eight characters at once.
                        Vector128<short> eightChars = Sse2.LoadVector128(cursor);
                        Vector128<short> escapeMask = CreateEscapingMask(eightChars);

                        // One bit per byte of the mask; zero means nothing in this group needs escaping,
                        // which lets the comparatively expensive TrailingZeroCount be skipped.
                        int bits = Sse2.MoveMask(escapeMask.AsByte());
                        if (bits != 0)
                        {
                            // Two mask bits per character, so halve the bit position to get the
                            // character offset of the first hit within the group.
                            Debug.Assert(bits > 0 && bits <= 65_535);
                            int firstSetBit = BitOperations.TrailingZeroCount(bits);
                            Debug.Assert(firstSetBit % 2 == 0 && firstSetBit >= 0 && firstSetBit <= 16);
                            return position + (firstSetBit >> 1);
                        }

                        position += 8;
                        cursor   += 8;
                    }

                    // Fewer than eight characters are left for the scalar loop.
                    Debug.Assert(value.Length - position < 8);
                }
#endif

                while (position < value.Length)
                {
                    Debug.Assert((chars + position) <= (chars + value.Length));
                    if (NeedsEscaping(chars[position]))
                    {
                        return position;
                    }

                    position++;
                }

                // Every character is allowed as-is.
                return -1;
            }
        }
示例#22
0
 /// <summary>
 /// Reinterprets <paramref name="a"/> as bytes, permutes them with the <c>Reverse128</c> shuffle
 /// mask using SSSE3, and reinterprets the result as <typeparamref name="T"/> again.
 /// </summary>
 /// <remarks>
 /// NOTE(review): the mask is declared elsewhere; judging by the names it reverses the byte order
 /// of the whole 128-bit value. Confirm against the mask definition. No <c>Ssse3.IsSupported</c>
 /// check is made here.
 /// </remarks>
 /// <typeparam name="T">Element type of the vector.</typeparam>
 /// <param name="a">Vector to permute.</param>
 /// <returns>The permuted vector.</returns>
 public static Vector128<T> ReverseEndianness128<T>(this Vector128<T> a) where T : struct
 {
     Vector128<byte> shuffled = Ssse3.Shuffle(a.AsByte(), Reverse128);

     return shuffled.As<byte, T>();
 }
示例#23
0
        /// <summary>
        /// Finds the first position in <paramref name="utf8Text"/> at which encoding is required.
        /// </summary>
        /// <remarks>
        /// A position is reported when it holds an ASCII byte that <c>_allowedCharacters</c> rejects,
        /// a scalar value for which <c>WillEncode</c> returns true, or a sequence that
        /// <c>UnicodeHelpers.DecodeScalarValueFromUtf8</c> does not decode with
        /// <c>OperationStatus.Done</c>. When SSE2 or AdvSimd (Arm64) is available, the input is
        /// examined sixteen bytes at a time first.
        /// </remarks>
        /// <param name="utf8Text">UTF-8 bytes to scan.</param>
        /// <returns>The zero-based byte index of the first such position, or <c>-1</c> if there is none.</returns>
        public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte> utf8Text)
        {
            fixed (byte* bytes = utf8Text)
            {
                int position = 0;

#if NETCOREAPP
                if (Sse2.IsSupported || AdvSimd.Arm64.IsSupported)
                {
                    sbyte* cursor = (sbyte*)bytes;
                    while (position <= utf8Text.Length - 16)
                    {
                        Debug.Assert(cursor >= bytes && cursor <= (bytes + utf8Text.Length - 16));

                        // Load the next 16 bytes and check whether all of them are ASCII. Non-ASCII
                        // bytes are negative when viewed as signed bytes.
                        Vector128<sbyte> chunk;
                        bool hasNonAscii;

                        if (Sse2.IsSupported)
                        {
                            chunk       = Sse2.LoadVector128(cursor);
                            hasNonAscii = Sse2Helper.ContainsNonAsciiByte(chunk);
                        }
                        else
                        {
                            chunk       = AdvSimd.LoadVector128(cursor);
                            hasNonAscii = AdvSimdHelper.ContainsNonAsciiByte(chunk);
                        }

                        if (!hasNonAscii)
                        {
                            // Pure ASCII: ask the vector helper for the first byte that needs escaping.
                            int firstHit;

                            if (Sse2.IsSupported)
                            {
                                Vector128<sbyte> escapeMask = Sse2Helper.CreateEscapingMask_UnsafeRelaxedJavaScriptEncoder(chunk);
                                firstHit = Sse2Helper.GetIndexOfFirstNonAsciiByte(escapeMask.AsByte());
                            }
                            else
                            {
                                Vector128<sbyte> escapeMask = AdvSimdHelper.CreateEscapingMask_UnsafeRelaxedJavaScriptEncoder(chunk);
                                firstHit = AdvSimdHelper.GetIndexOfFirstNonAsciiByte(escapeMask.AsByte());
                            }

                            // A value of 16 or more means none of the 16 bytes needs escaping.
                            if (firstHit < 16)
                            {
                                return position + firstHit;
                            }

                            position += 16;
                            cursor   += 16;
                            continue;
                        }

                        // At least one of these 16 bytes is non-ASCII: walk them scalar by scalar.
                        int chunkEnd = position + 16;
                        Debug.Assert(chunkEnd <= utf8Text.Length);

                        while (position < chunkEnd)
                        {
                            Debug.Assert((bytes + position) <= (bytes + utf8Text.Length));

                            if (UnicodeUtility.IsAsciiCodePoint(bytes[position]))
                            {
                                if (!_allowedCharacters.IsUnicodeScalarAllowed(bytes[position]))
                                {
                                    return position;
                                }

                                position++;
                            }
                            else
                            {
                                OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(position), out uint scalar, out int consumed);

                                Debug.Assert(scalar <= int.MaxValue);
                                if (status != OperationStatus.Done || WillEncode((int)scalar))
                                {
                                    return position;
                                }

                                Debug.Assert(status == OperationStatus.Done);
                                position += consumed;
                            }
                        }

                        // A multi-byte scalar may have carried the position past the chunk boundary,
                        // so re-derive the load address from the position.
                        cursor = (sbyte*)bytes + position;
                    }

                    // Fewer than 16 bytes are left for the scalar loop.
                    Debug.Assert(utf8Text.Length - position < 16);
                }
#endif

                while (position < utf8Text.Length)
                {
                    Debug.Assert((bytes + position) <= (bytes + utf8Text.Length));

                    if (UnicodeUtility.IsAsciiCodePoint(bytes[position]))
                    {
                        if (!_allowedCharacters.IsUnicodeScalarAllowed(bytes[position]))
                        {
                            return position;
                        }

                        position++;
                    }
                    else
                    {
                        OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(position), out uint scalar, out int consumed);

                        Debug.Assert(scalar <= int.MaxValue);
                        if (status != OperationStatus.Done || WillEncode((int)scalar))
                        {
                            return position;
                        }

                        Debug.Assert(status == OperationStatus.Done);
                        position += consumed;
                    }
                }

                Debug.Assert(position == utf8Text.Length);

                // Every byte is allowed as-is.
                return -1;
            }
        }
示例#24
0
        /// <summary>
        /// XORs <paramref name="bytes"/> bytes read from <paramref name="m"/> with a ChaCha keystream
        /// (20 rounds) derived from the 16-word state at <paramref name="x"/>, writing the result to
        /// <paramref name="c"/>.
        /// </summary>
        /// <remarks>
        /// The input is consumed in four stages, widest first:
        /// 512-byte chunks (8 blocks at once, only when AVX2 is supported), 256-byte chunks (4 blocks at once),
        /// single 64-byte blocks, and finally a partial block of fewer than 64 bytes.
        /// Words 12 (low) and 13 (high) of the state are treated as one 64-bit block counter which is
        /// advanced and written back to <paramref name="x"/> after every full block; the trailing partial
        /// block does not advance the counter.
        /// NOTE(review): the 128-bit paths call Sse2/Ssse3 intrinsics without an IsSupported check, so the
        /// caller is presumably expected to have verified hardware support - confirm against call sites.
        /// </remarks>
        /// <param name="x">Pointer to the 16 x 32-bit state; words 12 and 13 are updated in place.</param>
        /// <param name="m">Pointer to the input bytes.</param>
        /// <param name="c">Pointer to the output buffer; receives <paramref name="bytes"/> bytes.</param>
        /// <param name="bytes">Number of bytes to process.</param>
        public static unsafe void ChaCha20(uint *x, byte *m, byte *c, ulong bytes)
        {
            if (Avx2.IsSupported && bytes >= 512)
            {
                // Each state word is broadcast to all 8 lanes so that lane k computes block k.
                // Words 12/13 (the counter) differ per lane and are built inside the loop.
                Vector256 <uint> x_0  = Vector256.Create(x[0]);
                Vector256 <uint> x_1  = Vector256.Create(x[1]);
                Vector256 <uint> x_2  = Vector256.Create(x[2]);
                Vector256 <uint> x_3  = Vector256.Create(x[3]);
                Vector256 <uint> x_4  = Vector256.Create(x[4]);
                Vector256 <uint> x_5  = Vector256.Create(x[5]);
                Vector256 <uint> x_6  = Vector256.Create(x[6]);
                Vector256 <uint> x_7  = Vector256.Create(x[7]);
                Vector256 <uint> x_8  = Vector256.Create(x[8]);
                Vector256 <uint> x_9  = Vector256.Create(x[9]);
                Vector256 <uint> x_10 = Vector256.Create(x[10]);
                Vector256 <uint> x_11 = Vector256.Create(x[11]);
                Vector256 <uint> x_12;
                Vector256 <uint> x_13;
                Vector256 <uint> x_14 = Vector256.Create(x[14]);
                Vector256 <uint> x_15 = Vector256.Create(x[15]);

                // Copies of the input state, added back in after the rounds.
                Vector256 <uint> orig0  = x_0;
                Vector256 <uint> orig1  = x_1;
                Vector256 <uint> orig2  = x_2;
                Vector256 <uint> orig3  = x_3;
                Vector256 <uint> orig4  = x_4;
                Vector256 <uint> orig5  = x_5;
                Vector256 <uint> orig6  = x_6;
                Vector256 <uint> orig7  = x_7;
                Vector256 <uint> orig8  = x_8;
                Vector256 <uint> orig9  = x_9;
                Vector256 <uint> orig10 = x_10;
                Vector256 <uint> orig11 = x_11;
                Vector256 <uint> orig12;
                Vector256 <uint> orig13;
                Vector256 <uint> orig14 = x_14;
                Vector256 <uint> orig15 = x_15;

                // Loop-invariant constants: per-lane 64-bit counter offsets (0..3 and 4..7) and the
                // lane permutation applied after the 32-bit unpacks below.
                Vector256 <uint> addv12  = Vector256.Create(0, 1, 2, 3).AsUInt32();
                Vector256 <uint> addv13  = Vector256.Create(4, 5, 6, 7).AsUInt32();
                Vector256 <uint> permute = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7).AsUInt32();

                while (bytes >= 512)
                {
                    Vector256 <uint> t12, t13;
                    x_0  = orig0;
                    x_1  = orig1;
                    x_2  = orig2;
                    x_3  = orig3;
                    x_4  = orig4;
                    x_5  = orig5;
                    x_6  = orig6;
                    x_7  = orig7;
                    x_8  = orig8;
                    x_9  = orig9;
                    x_10 = orig10;
                    x_11 = orig11;
                    x_14 = orig14;
                    x_15 = orig15;

                    // Build eight consecutive 64-bit counters (counter + 0..7) with 64-bit adds, then
                    // shuffle them apart into a vector of low words (x_12) and a vector of high words (x_13).
                    uint  in12 = x[12];
                    uint  in13 = x[13];
                    ulong in1213 = in12 | ((ulong)in13 << 32);
                    x_12 = x_13 = Avx2.BroadcastScalarToVector256(Sse2.X64.ConvertScalarToVector128UInt64(in1213)).AsUInt32();
                    t12  = Avx2.Add(addv12.AsUInt64(), x_12.AsUInt64()).AsUInt32();
                    t13  = Avx2.Add(addv13.AsUInt64(), x_13.AsUInt64()).AsUInt32();
                    x_12 = Avx2.UnpackLow(t12, t13);
                    x_13 = Avx2.UnpackHigh(t12, t13);
                    t12  = Avx2.UnpackLow(x_12, x_13);
                    t13  = Avx2.UnpackHigh(x_12, x_13);
                    x_12 = Avx2.PermuteVar8x32(t12, permute);
                    x_13 = Avx2.PermuteVar8x32(t13, permute);

                    orig12 = x_12;
                    orig13 = x_13;

                    // Eight blocks are produced per iteration; persist the advanced counter.
                    in1213 += 8;

                    x[12] = (uint)(in1213 & 0xFFFFFFFF);
                    x[13] = (uint)((in1213 >> 32) & 0xFFFFFFFF);

                    // 10 double rounds = 20 rounds: first call works on columns, second on diagonals.
                    for (int i = 0; i < 20; i += 2)
                    {
                        Vec256Round(ref x_0, ref x_4, ref x_8, ref x_12, ref x_1, ref x_5, ref x_9, ref x_13, ref x_2, ref x_6, ref x_10, ref x_14, ref x_3, ref x_7, ref x_11, ref x_15);
                        Vec256Round(ref x_0, ref x_5, ref x_10, ref x_15, ref x_1, ref x_6, ref x_11, ref x_12, ref x_2, ref x_7, ref x_8, ref x_13, ref x_3, ref x_4, ref x_9, ref x_14);
                    }

                    // The temporaries are passed by ref below, so they must be definitely assigned first.
                    Vector256 <uint> t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12, t_13, t_14, t_15;
                    t_0 = t_1 = t_2 = t_3 = t_4 = t_5 = t_6 = t_7 = t_8 = t_9 = t_10 = t_11 = t_12 = t_13 = t_14 = t_15 = Vector256.Create((uint)0);

                    // State words 0..7: each 32-byte store lands at a 64-byte stride, i.e. it fills the
                    // first 32 bytes of one 64-byte output block.
                    OneQuadUnpack(ref x_0, ref x_1, ref x_2, ref x_3, ref t_0, ref t_1, ref t_2, ref t_3, ref orig0, ref orig1, ref orig2, ref orig3);
                    OneQuadUnpack(ref x_4, ref x_5, ref x_6, ref x_7, ref t_4, ref t_5, ref t_6, ref t_7, ref orig4, ref orig5, ref orig6, ref orig7);
                    t_0 = Avx2.Permute2x128(x_0, x_4, 0x20);
                    t_4 = Avx2.Permute2x128(x_0, x_4, 0x31);
                    t_1 = Avx2.Permute2x128(x_1, x_5, 0x20);
                    t_5 = Avx2.Permute2x128(x_1, x_5, 0x31);
                    t_2 = Avx2.Permute2x128(x_2, x_6, 0x20);
                    t_6 = Avx2.Permute2x128(x_2, x_6, 0x31);
                    t_3 = Avx2.Permute2x128(x_3, x_7, 0x20);
                    t_7 = Avx2.Permute2x128(x_3, x_7, 0x31);
                    t_0 = Avx2.Xor(t_0, Avx.LoadVector256(m).AsUInt32());
                    t_1 = Avx2.Xor(t_1, Avx.LoadVector256(m + 64).AsUInt32());
                    t_2 = Avx2.Xor(t_2, Avx.LoadVector256(m + 128).AsUInt32());
                    t_3 = Avx2.Xor(t_3, Avx.LoadVector256(m + 192).AsUInt32());
                    t_4 = Avx2.Xor(t_4, Avx.LoadVector256(m + 256).AsUInt32());
                    t_5 = Avx2.Xor(t_5, Avx.LoadVector256(m + 320).AsUInt32());
                    t_6 = Avx2.Xor(t_6, Avx.LoadVector256(m + 384).AsUInt32());
                    t_7 = Avx2.Xor(t_7, Avx.LoadVector256(m + 448).AsUInt32());
                    Avx.Store(c, t_0.AsByte());
                    Avx.Store(c + 64, t_1.AsByte());
                    Avx.Store(c + 128, t_2.AsByte());
                    Avx.Store(c + 192, t_3.AsByte());
                    Avx.Store(c + 256, t_4.AsByte());
                    Avx.Store(c + 320, t_5.AsByte());
                    Avx.Store(c + 384, t_6.AsByte());
                    Avx.Store(c + 448, t_7.AsByte());

                    // Shift by 32 so the second pass fills the last 32 bytes of each 64-byte block.
                    m += 32;
                    c += 32;

                    // State words 8..15.
                    OneQuadUnpack(ref x_8, ref x_9, ref x_10, ref x_11, ref t_8, ref t_9, ref t_10, ref t_11, ref orig8, ref orig9, ref orig10, ref orig11);
                    OneQuadUnpack(ref x_12, ref x_13, ref x_14, ref x_15, ref t_12, ref t_13, ref t_14, ref t_15, ref orig12, ref orig13, ref orig14, ref orig15);
                    t_8  = Avx2.Permute2x128(x_8, x_12, 0x20);
                    t_12 = Avx2.Permute2x128(x_8, x_12, 0x31);
                    t_9  = Avx2.Permute2x128(x_9, x_13, 0x20);
                    t_13 = Avx2.Permute2x128(x_9, x_13, 0x31);
                    t_10 = Avx2.Permute2x128(x_10, x_14, 0x20);
                    t_14 = Avx2.Permute2x128(x_10, x_14, 0x31);
                    t_11 = Avx2.Permute2x128(x_11, x_15, 0x20);
                    t_15 = Avx2.Permute2x128(x_11, x_15, 0x31);
                    t_8  = Avx2.Xor(t_8, Avx.LoadVector256(m).AsUInt32());
                    t_9  = Avx2.Xor(t_9, Avx.LoadVector256(m + 64).AsUInt32());
                    t_10 = Avx2.Xor(t_10, Avx.LoadVector256(m + 128).AsUInt32());
                    t_11 = Avx2.Xor(t_11, Avx.LoadVector256(m + 192).AsUInt32());
                    t_12 = Avx2.Xor(t_12, Avx.LoadVector256(m + 256).AsUInt32());
                    t_13 = Avx2.Xor(t_13, Avx.LoadVector256(m + 320).AsUInt32());
                    t_14 = Avx2.Xor(t_14, Avx.LoadVector256(m + 384).AsUInt32());
                    t_15 = Avx2.Xor(t_15, Avx.LoadVector256(m + 448).AsUInt32());
                    Avx.Store(c, t_8.AsByte());
                    Avx.Store(c + 64, t_9.AsByte());
                    Avx.Store(c + 128, t_10.AsByte());
                    Avx.Store(c + 192, t_11.AsByte());
                    Avx.Store(c + 256, t_12.AsByte());
                    Avx.Store(c + 320, t_13.AsByte());
                    Avx.Store(c + 384, t_14.AsByte());
                    Avx.Store(c + 448, t_15.AsByte());

                    // Undo the half-block shift, then advance past the 512 bytes just processed.
                    m     -= 32;
                    c     -= 32;
                    bytes -= 512;
                    c     += 512;
                    m     += 512;
                }
            }
            if (bytes >= 256)
            {
                // Same scheme with 128-bit vectors: 4 lanes, one block per lane.
                Vector128 <uint> x_0  = Vector128.Create(x[0]);
                Vector128 <uint> x_1  = Vector128.Create(x[1]);
                Vector128 <uint> x_2  = Vector128.Create(x[2]);
                Vector128 <uint> x_3  = Vector128.Create(x[3]);
                Vector128 <uint> x_4  = Vector128.Create(x[4]);
                Vector128 <uint> x_5  = Vector128.Create(x[5]);
                Vector128 <uint> x_6  = Vector128.Create(x[6]);
                Vector128 <uint> x_7  = Vector128.Create(x[7]);
                Vector128 <uint> x_8  = Vector128.Create(x[8]);
                Vector128 <uint> x_9  = Vector128.Create(x[9]);
                Vector128 <uint> x_10 = Vector128.Create(x[10]);
                Vector128 <uint> x_11 = Vector128.Create(x[11]);
                Vector128 <uint> x_12;
                Vector128 <uint> x_13;
                Vector128 <uint> x_14   = Vector128.Create(x[14]);
                Vector128 <uint> x_15   = Vector128.Create(x[15]);
                Vector128 <uint> orig0  = x_0;
                Vector128 <uint> orig1  = x_1;
                Vector128 <uint> orig2  = x_2;
                Vector128 <uint> orig3  = x_3;
                Vector128 <uint> orig4  = x_4;
                Vector128 <uint> orig5  = x_5;
                Vector128 <uint> orig6  = x_6;
                Vector128 <uint> orig7  = x_7;
                Vector128 <uint> orig8  = x_8;
                Vector128 <uint> orig9  = x_9;
                Vector128 <uint> orig10 = x_10;
                Vector128 <uint> orig11 = x_11;
                Vector128 <uint> orig12;
                Vector128 <uint> orig13;
                Vector128 <uint> orig14 = x_14;
                Vector128 <uint> orig15 = x_15;
                Vector128 <uint> t12, t13;

                // Loop-invariant per-lane 64-bit counter offsets (0,1 and 2,3).
                Vector128 <uint> addv12 = Vector128.Create(0, 1).AsUInt32();
                Vector128 <uint> addv13 = Vector128.Create(2, 3).AsUInt32();

                while (bytes >= 256)
                {
                    x_0  = orig0;
                    x_1  = orig1;
                    x_2  = orig2;
                    x_3  = orig3;
                    x_4  = orig4;
                    x_5  = orig5;
                    x_6  = orig6;
                    x_7  = orig7;
                    x_8  = orig8;
                    x_9  = orig9;
                    x_10 = orig10;
                    x_11 = orig11;
                    x_14 = orig14;
                    x_15 = orig15;

                    uint  in12   = x[12];
                    uint  in13   = x[13];
                    ulong in1213 = in12 | ((ulong)in13) << 32;
                    t12 = Vector128.Create(in1213).AsUInt32();
                    t13 = Vector128.Create(in1213).AsUInt32();

                    // counter + {0,1} and counter + {2,3} as 64-bit values ...
                    x_12 = Sse2.Add(Vector128.AsUInt64 <uint>(addv12), Vector128.AsUInt64 <uint>(t12)).AsUInt32();
                    x_13 = Sse2.Add(Vector128.AsUInt64 <uint>(addv13), Vector128.AsUInt64 <uint>(t13)).AsUInt32();

                    // ... then two rounds of 32-bit interleaving leave the four low words in x_12
                    // and the four high words in x_13.
                    t12 = Sse2.UnpackLow(x_12, x_13);
                    t13 = Sse2.UnpackHigh(x_12, x_13);

                    x_12 = Sse2.UnpackLow(t12, t13);
                    x_13 = Sse2.UnpackHigh(t12, t13);

                    orig12 = x_12;
                    orig13 = x_13;

                    // Four blocks are produced per iteration; persist the advanced counter.
                    in1213 += 4;

                    x[12] = (uint)(in1213 & 0xFFFFFFFF);
                    x[13] = (uint)(in1213 >> 32 & 0xFFFFFFFF);

                    for (int i = 0; i < 20; i += 2)
                    {
                        // Column round.
                        Vec128QuarterRound(ref x_0, ref x_4, ref x_8, ref x_12);
                        Vec128QuarterRound(ref x_1, ref x_5, ref x_9, ref x_13);
                        Vec128QuarterRound(ref x_2, ref x_6, ref x_10, ref x_14);
                        Vec128QuarterRound(ref x_3, ref x_7, ref x_11, ref x_15);

                        // Diagonal round.
                        Vec128QuarterRound(ref x_0, ref x_5, ref x_10, ref x_15);
                        Vec128QuarterRound(ref x_1, ref x_6, ref x_11, ref x_12);
                        Vec128QuarterRound(ref x_2, ref x_7, ref x_8, ref x_13);
                        Vec128QuarterRound(ref x_3, ref x_4, ref x_9, ref x_14);
                    }

                    // Each OneQuad call handles four state words; the pointers advance by 16 bytes
                    // between calls and are rewound (-48) before stepping over the full 256 bytes.
                    OneQuad(ref x_0, ref x_1, ref x_2, ref x_3, ref orig0, ref orig1, ref orig2, ref orig3, m, c);
                    m += 16;
                    c += 16;
                    OneQuad(ref x_4, ref x_5, ref x_6, ref x_7, ref orig4, ref orig5, ref orig6, ref orig7, m, c);
                    m += 16;
                    c += 16;
                    OneQuad(ref x_8, ref x_9, ref x_10, ref x_11, ref orig8, ref orig9, ref orig10, ref orig11, m, c);
                    m += 16;
                    c += 16;
                    OneQuad(ref x_12, ref x_13, ref x_14, ref x_15, ref orig12, ref orig13, ref orig14, ref orig15, m, c);
                    m     -= 48;
                    c     -= 48;
                    bytes -= 256;
                    c     += 256;
                    m     += 256;
                }
            }
            while (bytes >= 64)
            {
                ComputeSingleBlockKeystream(x, out Vector128 <uint> k_0, out Vector128 <uint> k_1, out Vector128 <uint> k_2, out Vector128 <uint> k_3);

                // All four input loads happen before the first store so that overlapping
                // input/output buffers behave exactly as before.
                Vector128 <byte> o_0 = Sse2.Xor(k_0.AsByte(), Sse2.LoadVector128(m));
                Vector128 <byte> o_1 = Sse2.Xor(k_1.AsByte(), Sse2.LoadVector128(m + 16));
                Vector128 <byte> o_2 = Sse2.Xor(k_2.AsByte(), Sse2.LoadVector128(m + 32));
                Vector128 <byte> o_3 = Sse2.Xor(k_3.AsByte(), Sse2.LoadVector128(m + 48));
                Sse2.Store(c, o_0);
                Sse2.Store(c + 16, o_1);
                Sse2.Store(c + 32, o_2);
                Sse2.Store(c + 48, o_3);

                // Increment the 64-bit counter held in words 12 (low) and 13 (high), carrying on wrap.
                uint in12 = x[12];
                uint in13 = x[13];
                in12++;
                if (in12 == 0)
                {
                    in13++;
                }
                x[12] = in12;
                x[13] = in13;

                bytes -= 64;
                c     += 64;
                m     += 64;
            }
            if (bytes > 0)
            {
                // Fewer than 64 bytes remain: generate one full keystream block into a scratch
                // buffer and XOR only the bytes that are needed.
                ComputeSingleBlockKeystream(x, out Vector128 <uint> k_0, out Vector128 <uint> k_1, out Vector128 <uint> k_2, out Vector128 <uint> k_3);

                byte *partialblock = stackalloc byte[64];
                Sse2.Store(partialblock, k_0.AsByte());
                Sse2.Store(partialblock + 16, k_1.AsByte());
                Sse2.Store(partialblock + 32, k_2.AsByte());
                Sse2.Store(partialblock + 48, k_3.AsByte());

                for (ulong i = 0; i < bytes; i++)
                {
                    c[i] = (byte)(m[i] ^ partialblock[i]);
                }

                // Wipe the unused keystream from the stack buffer.
                for (int n = 0; n < 64 / sizeof(int); n++)
                {
                    ((int *)partialblock)[n] = 0;
                }
            }
        }

        /// <summary>
        /// Produces one 64-byte keystream block from the state at <paramref name="x"/>: loads the state as
        /// four rows, runs 10 double rounds on it and adds the original state back in.
        /// The state itself is not modified.
        /// </summary>
        /// <param name="x">Pointer to the 16 x 32-bit state.</param>
        /// <param name="k_0">Keystream bytes 0..15.</param>
        /// <param name="k_1">Keystream bytes 16..31.</param>
        /// <param name="k_2">Keystream bytes 32..47.</param>
        /// <param name="k_3">Keystream bytes 48..63.</param>
        private static unsafe void ComputeSingleBlockKeystream(uint *x, out Vector128 <uint> k_0, out Vector128 <uint> k_1, out Vector128 <uint> k_2, out Vector128 <uint> k_3)
        {
            Vector128 <uint> x_0 = Sse2.LoadVector128(x);
            Vector128 <uint> x_1 = Sse2.LoadVector128(x + 4);
            Vector128 <uint> x_2 = Sse2.LoadVector128(x + 8);
            Vector128 <uint> x_3 = Sse2.LoadVector128(x + 12);
            Vector128 <uint> t_1;

            for (int i = 0; i < 20; i += 2)
            {
                // First half: quarter-round on all four columns at once.
                x_0 = Sse2.Add(x_0, x_1);
                x_3 = Sse2.Xor(x_3, x_0);
                x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32();

                x_2 = Sse2.Add(x_2, x_3);
                x_1 = Sse2.Xor(x_1, x_2);

                // Rotate left by 12 (shift pair + xor).
                t_1 = x_1;
                x_1 = Sse2.ShiftLeftLogical(x_1, 12);
                t_1 = Sse2.ShiftRightLogical(t_1, 20);
                x_1 = Sse2.Xor(x_1, t_1);

                x_0 = Sse2.Add(x_0, x_1);
                x_3 = Sse2.Xor(x_3, x_0);
                // The lane shuffles of rows 0, 3 and 2 re-align the rows for the second half.
                x_0 = Sse2.Shuffle(x_0, 0x93);
                x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32();

                x_2 = Sse2.Add(x_2, x_3);
                x_3 = Sse2.Shuffle(x_3, 0x4e);
                x_1 = Sse2.Xor(x_1, x_2);
                x_2 = Sse2.Shuffle(x_2, 0x39);

                // Rotate left by 7.
                t_1 = x_1;
                x_1 = Sse2.ShiftLeftLogical(x_1, 7);
                t_1 = Sse2.ShiftRightLogical(t_1, 25);
                x_1 = Sse2.Xor(x_1, t_1);

                // Second half: same quarter-round on the re-aligned rows.
                x_0 = Sse2.Add(x_0, x_1);
                x_3 = Sse2.Xor(x_3, x_0);
                x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32();

                x_2 = Sse2.Add(x_2, x_3);
                x_1 = Sse2.Xor(x_1, x_2);

                t_1 = x_1;
                x_1 = Sse2.ShiftLeftLogical(x_1, 12);
                t_1 = Sse2.ShiftRightLogical(t_1, 20);
                x_1 = Sse2.Xor(x_1, t_1);

                x_0 = Sse2.Add(x_0, x_1);
                x_3 = Sse2.Xor(x_3, x_0);
                // Inverse lane shuffles restore the original row alignment.
                x_0 = Sse2.Shuffle(x_0, 0x39);
                x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32();

                x_2 = Sse2.Add(x_2, x_3);
                x_3 = Sse2.Shuffle(x_3, 0x4e);
                x_1 = Sse2.Xor(x_1, x_2);
                x_2 = Sse2.Shuffle(x_2, 0x93);

                t_1 = x_1;
                x_1 = Sse2.ShiftLeftLogical(x_1, 7);
                t_1 = Sse2.ShiftRightLogical(t_1, 25);
                x_1 = Sse2.Xor(x_1, t_1);
            }

            // Feed-forward: add the input state to the permuted state.
            k_0 = Sse2.Add(x_0, Sse2.LoadVector128(x));
            k_1 = Sse2.Add(x_1, Sse2.LoadVector128(x + 4));
            k_2 = Sse2.Add(x_2, Sse2.LoadVector128(x + 8));
            k_3 = Sse2.Add(x_3, Sse2.LoadVector128(x + 12));
        }
示例#25
0
        /// <summary>
        /// Locates the first position in a UTF-8 buffer that cannot be emitted verbatim.
        /// </summary>
        /// <param name="utf8Text">The UTF-8 bytes to scan.</param>
        /// <returns>
        /// The byte index at which scanning stopped because the data there is either ill-formed UTF-8 or
        /// a scalar value that must be encoded; -1 when the whole buffer can be copied as-is.
        /// </returns>
        public virtual unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan <byte> utf8Text)
        {
            if (!_isAsciiCacheInitialized)
            {
                InitializeAsciiCache();
            }

            int length = utf8Text.Length;

            fixed (byte* ptr = utf8Text)
            {
                int idx = 0;

#if NETCOREAPP
                // Vectorized pass: examine 16 bytes at a time while at least 16 bytes remain.
                if ((Sse2.IsSupported || AdvSimd.Arm64.IsSupported) && idx <= length - 16)
                {
                    // Read the lookup vectors once up front rather than on every iteration.
                    Vector128 <sbyte> asciiEscapeBitMask = _bitMaskLookupAsciiNeedsEscaping;
                    Vector128 <sbyte> bitPositions       = Ssse3Helper.s_bitPosLookup;
                    Vector128 <sbyte> nibbleMask         = Ssse3Helper.s_nibbleMaskSByte;
                    Vector128 <sbyte> nullMask           = Ssse3Helper.s_nullMaskSByte;

                    do
                    {
                        sbyte* chunk = (sbyte*)ptr + idx;
                        Debug.Assert(chunk >= ptr && chunk <= (ptr + length - 16));

                        // Load 16 bytes and test whether any of them lies outside the ASCII range
                        // (such bytes are negative when viewed as signed bytes).
                        Vector128 <sbyte> data;
                        bool hasNonAscii;

                        if (Sse2.IsSupported)
                        {
                            data        = Sse2.LoadVector128(chunk);
                            hasNonAscii = Sse2Helper.ContainsNonAsciiByte(data);
                        }
                        else if (AdvSimd.Arm64.IsSupported)
                        {
                            data        = AdvSimd.LoadVector128(chunk);
                            hasNonAscii = AdvSimdHelper.ContainsNonAsciiByte(data);
                        }
                        else
                        {
                            throw new PlatformNotSupportedException();
                        }

                        if (hasNonAscii)
                        {
                            // Mixed content: walk this 16-byte window one scalar at a time. A multi-byte
                            // sequence may carry idx slightly past the end of the window.
                            int windowEnd = idx + 16;
                            Debug.Assert(windowEnd <= length);

                            while (idx < windowEnd)
                            {
                                Debug.Assert((ptr + idx) <= (ptr + length));

                                byte current = ptr[idx];
                                if (UnicodeUtility.IsAsciiCodePoint(current))
                                {
                                    if (DoesAsciiNeedEncoding(current))
                                    {
                                        return idx;
                                    }
                                    idx++;
                                }
                                else
                                {
                                    OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint scalar, out int consumed);

                                    Debug.Assert(scalar <= int.MaxValue);
                                    if (status != OperationStatus.Done || WillEncode((int)scalar))
                                    {
                                        return idx;
                                    }

                                    Debug.Assert(status == OperationStatus.Done);
                                    idx += consumed;
                                }
                            }
                        }
                        else
                        {
                            // Pure ASCII window.
                            // TODO AdvSimd: optimization maybe achievable using VectorTableLookup and/or VectorTableLookupExtension
                            if (Ssse3.IsSupported)
                            {
                                Vector128 <sbyte> escapeMask = Ssse3Helper.CreateEscapingMask(data, asciiEscapeBitMask, bitPositions, nibbleMask, nullMask);
                                int firstHit = Sse2Helper.GetIndexOfFirstNonAsciiByte(escapeMask.AsByte());

                                // A value below 16 is the offset of the first byte that needs escaping.
                                if (firstHit < 16)
                                {
                                    return idx + firstHit;
                                }
                            }
                            else
                            {
                                // No SSSE3: test the 16 bytes individually, in order.
                                byte* p = (byte*)chunk;
                                for (int offset = 0; offset < 16; offset++)
                                {
                                    if (DoesAsciiNeedEncoding(p[offset]))
                                    {
                                        return idx + offset;
                                    }
                                }
                            }

                            idx += 16;
                        }
                    }
                    while (idx <= length - 16);

                    // Fewer than 16 bytes are left for the scalar loop below.
                    Debug.Assert(length - idx < 16);
                }
#endif

                // Scalar pass over whatever the vectorized pass did not cover.
                while (idx < length)
                {
                    Debug.Assert((ptr + idx) <= (ptr + length));

                    byte current = ptr[idx];
                    if (UnicodeUtility.IsAsciiCodePoint(current))
                    {
                        if (DoesAsciiNeedEncoding(current))
                        {
                            return idx;
                        }
                        idx++;
                    }
                    else
                    {
                        OperationStatus status = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint scalar, out int consumed);

                        Debug.Assert(scalar <= int.MaxValue);
                        if (status != OperationStatus.Done || WillEncode((int)scalar))
                        {
                            return idx;
                        }

                        Debug.Assert(status == OperationStatus.Done);
                        idx += consumed;
                    }
                }
                Debug.Assert(idx == length);

                // Every byte is allowed.
                return -1;
            }
        }
示例#26
0
        /// <summary>
        /// Locates the first byte of <paramref name="value"/> that has to be escaped.
        /// </summary>
        /// <param name="value">The bytes to scan.</param>
        /// <param name="encoder">
        /// Optional custom encoder. When it is non-null the whole scan is delegated to its
        /// <c>FindFirstCharacterToEncodeUtf8</c> method and that result is returned unchanged.
        /// </param>
        /// <returns>
        /// On the built-in path: the zero-based index of the first byte that needs escaping,
        /// or -1 when no byte does. With a custom encoder: whatever the encoder reports.
        /// </returns>
        public static unsafe int NeedsEscaping(ReadOnlySpan <byte> value, JavaScriptEncoder encoder)
        {
            fixed (byte* ptr = value)
            {
                // A caller-supplied encoder owns the decision entirely.
                if (encoder != null)
                {
                    return encoder.FindFirstCharacterToEncodeUtf8(value);
                }

                int idx = 0;

#if BUILDING_INBOX_LIBRARY
                if (Sse2.IsSupported)
                {
                    // Walk the input one full 16-byte vector at a time; the index and the
                    // read cursor advance in lock-step.
                    sbyte* cursor = (sbyte*)ptr;
                    for (; value.Length - 16 >= idx; idx += 16, cursor += 16)
                    {
                        Debug.Assert(cursor >= ptr && cursor <= (ptr + value.Length - 16));

                        Vector128<sbyte> chunk = Sse2.LoadVector128(cursor);

                        // Build the per-byte escaping mask and collapse it into one bit per byte.
                        Vector128<sbyte> escapeMask = CreateEscapingMask(chunk);
                        int maskBits = Sse2.MoveMask(escapeMask.AsByte());

                        // Zero means nothing in this chunk needs escaping. TrailingZeroCount is
                        // relatively expensive, so only pay for it when there is a hit.
                        if (maskBits == 0)
                        {
                            continue;
                        }

                        // The lowest set bit identifies the first offending byte within the chunk.
                        Debug.Assert(maskBits > 0 && maskBits <= 65_535);
                        int offsetInChunk = BitOperations.TrailingZeroCount(maskBits);
                        Debug.Assert(offsetInChunk >= 0 && offsetInChunk <= 16);
                        return idx + offsetInChunk;
                    }

                    // Fewer than 16 bytes are left for the scalar loop below.
                    Debug.Assert(value.Length - idx < 16);
                }
#endif

                // Scalar scan: the whole input, or just the tail left over by the vector loop.
                while (idx < value.Length)
                {
                    Debug.Assert((ptr + idx) <= (ptr + value.Length));
                    if (NeedsEscaping(ptr[idx]))
                    {
                        return idx;
                    }

                    idx++;
                }

                // Every byte is allowed as is.
                return -1;
            }
        }