コード例 #1
0
    /// <summary>
    /// In-place 4x4 transpose of 32-bit lanes spread across four 128-bit registers.
    /// Viewing x0..x3 as the rows [a0..a3], [b0..b3], [c0..c3], [d0..d3] of a 4x4
    /// uint32 matrix, on return x0..x3 hold its columns [a0 b0 c0 d0] .. [a3 b3 c3 d3].
    /// </summary>
    private static void Transpose(ref Vector128 <byte> x0, ref Vector128 <byte> x1, ref Vector128 <byte> x2, ref Vector128 <byte> x3)
    {
        // Capture the high 32-bit halves of x0/x1 first: x0 is overwritten on the
        // next line, but its high lanes are still needed for x2/x3 at the bottom.
        Vector128 <ulong> t0 = Sse2.UnpackHigh(x0.AsUInt32(), x1.AsUInt32()).AsUInt64();

        // x0 = [a0 b0 a1 b1]
        x0 = Sse2.UnpackLow(x0.AsUInt32(), x1.AsUInt32()).AsByte();

        // t1 = [c0 d0 c1 d1]
        Vector128 <ulong> t1 = Sse2.UnpackLow(x2.AsUInt32(), x3.AsUInt32()).AsUInt64();

        // x2 = [c2 d2 c3 d3]
        x2 = Sse2.UnpackHigh(x2.AsUInt32(), x3.AsUInt32()).AsByte();

        // Interleave 64-bit halves to finish: x1 = [a1 b1 c1 d1], x0 = [a0 b0 c0 d0].
        x1 = Sse2.UnpackHigh(x0.AsUInt64(), t1).AsByte();
        x0 = Sse2.UnpackLow(x0.AsUInt64(), t1).AsByte();

        // x3 = [a3 b3 c3 d3], x2 = [a2 b2 c2 d2].
        x3 = Sse2.UnpackHigh(t0, x2.AsUInt64()).AsByte();
        x2 = Sse2.UnpackLow(t0, x2.AsUInt64()).AsByte();
    }
コード例 #2
0
ファイル: GHashX86.cs プロジェクト: HMBSbige/CryptoBase
    /// <summary>
    /// GHASH update step: multiplies (running state XOR input block) by the hash
    /// key <c>_key</c> in GF(2^128) using PCLMULQDQ and stores the product back
    /// into <c>_buffer</c>. The structure (four carry-less products, a whole-product
    /// left shift by one bit, then reduction) matches the scheme in Intel's
    /// carry-less multiplication / GCM white paper for bit-reflected operands.
    /// </summary>
    /// <param name="x">Input block; the first 16 bytes are read.</param>
    private unsafe void GFMul(ReadOnlySpan <byte> x)
    {
        var a = _key.AsUInt64();
        Vector128 <ulong> b;

        fixed(byte *p = x)
        {
            var t = Sse2.LoadVector128(p);

            // GHASH is defined on bit-reflected values; flip the block's byte order.
            b = t.ReverseEndianness128().AsUInt64();
        }

        // Fold the new block into the running state before multiplying.
        b = Sse2.Xor(b.AsByte(), _buffer).AsUInt64();

        // Schoolbook 64x64 carry-less products: a.lo*b.lo, a.lo*b.hi, a.hi*b.lo, a.hi*b.hi.
        var tmp3 = Pclmulqdq.CarrylessMultiply(a, b, 0x00).AsUInt32();
        var tmp4 = Pclmulqdq.CarrylessMultiply(a, b, 0x10).AsUInt32();
        var tmp5 = Pclmulqdq.CarrylessMultiply(a, b, 0x01).AsUInt32();
        var tmp6 = Pclmulqdq.CarrylessMultiply(a, b, 0x11).AsUInt32();

        // Combine the two middle products and distribute them across the 256-bit
        // result: tmp3 becomes the low 128 bits, tmp6 the high 128 bits.
        tmp4 = Sse2.Xor(tmp4, tmp5);
        tmp5 = Sse2.ShiftLeftLogical128BitLane(tmp4, 8);
        tmp4 = Sse2.ShiftRightLogical128BitLane(tmp4, 8);
        tmp3 = Sse2.Xor(tmp3, tmp5);
        tmp6 = Sse2.Xor(tmp6, tmp4);

        // Shift the whole 256-bit product left by one bit (the bit-reflection
        // fixup), carrying across 32-bit lane boundaries via tmp7/tmp8/tmp9.
        var tmp7 = Sse2.ShiftRightLogical(tmp3, 31);
        var tmp8 = Sse2.ShiftRightLogical(tmp6, 31);

        tmp3 = Sse2.ShiftLeftLogical(tmp3, 1);
        tmp6 = Sse2.ShiftLeftLogical(tmp6, 1);
        var tmp9 = Sse2.ShiftRightLogical128BitLane(tmp7, 12);

        tmp8 = Sse2.ShiftLeftLogical128BitLane(tmp8, 4);
        tmp7 = Sse2.ShiftLeftLogical128BitLane(tmp7, 4);
        tmp3 = Sse2.Or(tmp3, tmp7);
        tmp6 = Sse2.Or(tmp6, tmp8);
        tmp6 = Sse2.Or(tmp6, tmp9);

        // Reduction of the low 128 bits modulo the GCM polynomial: left shifts by
        // 31/30/25 build the "multiply by x^7+x^2+x" terms ...
        tmp7 = Sse2.ShiftLeftLogical(tmp3, 31);
        tmp8 = Sse2.ShiftLeftLogical(tmp3, 30);
        tmp9 = Sse2.ShiftLeftLogical(tmp3, 25);
        tmp7 = Sse2.Xor(tmp7, tmp8);
        tmp7 = Sse2.Xor(tmp7, tmp9);
        tmp8 = Sse2.ShiftRightLogical128BitLane(tmp7, 4);
        tmp7 = Sse2.ShiftLeftLogical128BitLane(tmp7, 12);
        tmp3 = Sse2.Xor(tmp3, tmp7);

        // ... and the right shifts by 1/2/7 complete the fold, which is finally
        // XORed into the high half to produce the reduced 128-bit result.
        var tmp2 = Sse2.ShiftRightLogical(tmp3, 1);

        tmp4 = Sse2.ShiftRightLogical(tmp3, 2);
        tmp5 = Sse2.ShiftRightLogical(tmp3, 7);
        tmp2 = Sse2.Xor(tmp2, tmp4);
        tmp2 = Sse2.Xor(tmp2, tmp5);
        tmp2 = Sse2.Xor(tmp2, tmp8);
        tmp3 = Sse2.Xor(tmp3, tmp2);
        tmp6 = Sse2.Xor(tmp6, tmp3);

        _buffer = tmp6.AsByte();
    }
コード例 #3
0
    /// <summary>
    /// Builds a vector holding <paramref name="a"/> in lane 0 and
    /// <paramref name="b"/> in lane 2 (the low 32 bits of each 64-bit half).
    /// </summary>
    public static Vector128 <uint> CreateTwoUInt(uint a, uint b)
    {
        if (!Sse2.IsSupported)
        {
            return Vector128.Create(a, 0, b, 0);
        }

        // Put each scalar in lane 0 of its own vector, then interleave the low
        // 64-bit halves so a lands in lanes 0-1 and b in lanes 2-3.
        Vector128<ulong> low  = Vector128.CreateScalarUnsafe(a).AsUInt64();
        Vector128<ulong> high = Vector128.CreateScalarUnsafe(b).AsUInt64();

        return Sse2.UnpackLow(low, high).AsUInt32();
    }
コード例 #4
0
        /// <summary>
        /// Builds a 64-bit mask describing which bytes of <paramref name="value"/>
        /// have their most significant bit set (i.e. are non-ASCII). ARM64
        /// little-endian only; throws elsewhere.
        /// </summary>
        /// <param name="value">The 16 input bytes.</param>
        /// <param name="bitMask128">Per-byte bit weights that are OR-combined into the result.</param>
        /// <returns>The collapsed mask; zero when every byte is ASCII.</returns>
        /// <exception cref="PlatformNotSupportedException">AdvSimd.Arm64 unavailable or big-endian host.</exception>
        private static ulong GetNonAsciiBytes(Vector128 <byte> value, Vector128 <byte> bitMask128)
        {
            if (!AdvSimd.Arm64.IsSupported || !BitConverter.IsLittleEndian)
            {
                throw new PlatformNotSupportedException();
            }

            // Arithmetic shift by 7 replicates each byte's sign bit:
            // 0xFF for bytes >= 0x80, 0x00 otherwise.
            Vector128 <byte> mostSignificantBitIsSet = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7).AsByte();
            Vector128 <byte> extractedBits           = AdvSimd.And(mostSignificantBitIsSet, bitMask128);

            // Pairwise addition folds the 16 weighted bytes so the combined mask
            // fits in the first 64-bit lane.
            extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits);
            return(extractedBits.AsUInt64().ToScalar());
        }
コード例 #5
0
        /// <summary>
        /// Returns true when any byte of <paramref name="value"/> has its most
        /// significant bit set (i.e. is non-ASCII). ARM64 only; throws elsewhere.
        /// </summary>
        /// <exception cref="PlatformNotSupportedException">AdvSimd.Arm64 unavailable.</exception>
        public static bool ContainsNonAsciiByte(Vector128 <sbyte> value)
        {
            if (!AdvSimd.Arm64.IsSupported)
            {
                throw new PlatformNotSupportedException();
            }

            // most significant bit mask for a 64-bit byte vector
            const ulong MostSignficantBitMask = 0x8080808080808080;

            // A non-ASCII byte is negative as sbyte, so it survives a signed
            // pairwise minimum; one pass collapses the 16 bytes into the low
            // 64-bit lane, whose sign bits are then tested against the mask.
            value = AdvSimd.Arm64.MinPairwise(value, value);
            return((value.AsUInt64().ToScalar() & MostSignficantBitMask) != 0);
        }
コード例 #6
0
        /// <summary>
        /// Scratchpad shuffle step: loads the three 16-byte chunks adjacent to
        /// <paramref name="offset"/> (slots 0x10/0x20/0x30) and writes them back
        /// rotated one slot — chunk3+_b1 into 0x10, chunk1+_b into 0x20,
        /// chunk2+_a into 0x30 — using lane-wise 64-bit addition.
        /// NOTE(review): the rotation is presumed intentional (this matches the
        /// CryptoNight "variant two" shuffle by name) — confirm against the
        /// reference implementation before changing the store order.
        /// </summary>
        /// <param name="basePtr">Base of the scratchpad region.</param>
        /// <param name="offset">Current offset; XORed with 0x10/0x20/0x30 to pick neighbours.</param>
        private unsafe static void VariantTwoShuffleAdd(
            byte *basePtr,
            int offset,
            Vector128 <byte> _b1,
            Vector128 <byte> _b,
            Vector128 <byte> _a)
        {
            // All three loads happen before any store, so the rotation reads
            // the pre-update chunk values.
            Vector128 <ulong> chunk1 = Sse2.LoadVector128((ulong *)(basePtr + (offset ^ 0x10)));
            Vector128 <ulong> chunk2 = Sse2.LoadVector128((ulong *)(basePtr + (offset ^ 0x20)));
            Vector128 <ulong> chunk3 = Sse2.LoadVector128((ulong *)(basePtr + (offset ^ 0x30)));

            Sse2.Store((ulong *)(basePtr + (offset ^ 0x10)), Sse2.Add(chunk3, _b1.AsUInt64()));
            Sse2.Store((ulong *)(basePtr + (offset ^ 0x20)), Sse2.Add(chunk1, _b.AsUInt64()));
            Sse2.Store((ulong *)(basePtr + (offset ^ 0x30)), Sse2.Add(chunk2, _a.AsUInt64()));
        }
コード例 #7
0
 /// <summary>
 /// Adds two vectors lane-wise, dispatching to the SSE/SSE2 intrinsic that
 /// matches the element type <typeparamref name="T"/>.
 /// </summary>
 /// <exception cref="NotSupportedException">T is not a supported primitive element type.</exception>
 public static Vector128 <T> Vector128Add <T>(Vector128 <T> left, Vector128 <T> right) where T : struct
 {
     // Each typeof(T) comparison is a JIT-time constant, so only the matching
     // branch survives in the generated code; guard clauses keep the shape flat.
     if (typeof(T) == typeof(byte))
     {
         return Sse2.Add(left.AsByte(), right.AsByte()).As<byte, T>();
     }

     if (typeof(T) == typeof(sbyte))
     {
         return Sse2.Add(left.AsSByte(), right.AsSByte()).As<sbyte, T>();
     }

     if (typeof(T) == typeof(short))
     {
         return Sse2.Add(left.AsInt16(), right.AsInt16()).As<short, T>();
     }

     if (typeof(T) == typeof(ushort))
     {
         return Sse2.Add(left.AsUInt16(), right.AsUInt16()).As<ushort, T>();
     }

     if (typeof(T) == typeof(int))
     {
         return Sse2.Add(left.AsInt32(), right.AsInt32()).As<int, T>();
     }

     if (typeof(T) == typeof(uint))
     {
         return Sse2.Add(left.AsUInt32(), right.AsUInt32()).As<uint, T>();
     }

     if (typeof(T) == typeof(long))
     {
         return Sse2.Add(left.AsInt64(), right.AsInt64()).As<long, T>();
     }

     if (typeof(T) == typeof(ulong))
     {
         return Sse2.Add(left.AsUInt64(), right.AsUInt64()).As<ulong, T>();
     }

     // float goes through SSE (addps); double through SSE2 (addpd).
     if (typeof(T) == typeof(float))
     {
         return Sse.Add(left.AsSingle(), right.AsSingle()).As<float, T>();
     }

     if (typeof(T) == typeof(double))
     {
         return Sse2.Add(left.AsDouble(), right.AsDouble()).As<double, T>();
     }

     throw new NotSupportedException();
 }
コード例 #8
0
ファイル: clmul.cs プロジェクト: aaru-dps/Aaru.Checksums
        /// <summary>
        /// Folds each of the four 128-bit CRC accumulators forward by one 512-bit
        /// block using the PCLMULQDQ folding constants (zlib's k1/k2 pair packed
        /// into <c>xmmFold4</c>). Each accumulator is replaced by
        /// clmul(acc.hi, fold.lo) XOR clmul(acc.lo, fold.hi).
        /// </summary>
        static void Fold4(ref Vector128 <uint> xmmCRC0, ref Vector128 <uint> xmmCRC1, ref Vector128 <uint> xmmCRC2,
                          ref Vector128 <uint> xmmCRC3)
        {
            // Folding constants: 64-bit halves 0x1c6e41596 (lo) and 0x154442bd4 (hi).
            Vector128 <uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);

            // The original code repeated the same product-and-xor sequence four
            // times; the shared helper removes the duplication without changing
            // the per-accumulator instruction sequence.
            xmmCRC0 = FoldOne(xmmCRC0, xmmFold4);
            xmmCRC1 = FoldOne(xmmCRC1, xmmFold4);
            xmmCRC2 = FoldOne(xmmCRC2, xmmFold4);
            xmmCRC3 = FoldOne(xmmCRC3, xmmFold4);
        }

        /// <summary>
        /// Folds a single accumulator: XORs the two carry-less products
        /// (imm 0x01 = crc.hi * fold.lo, imm 0x10 = crc.lo * fold.hi).
        /// </summary>
        static Vector128 <uint> FoldOne(Vector128 <uint> crc, Vector128 <uint> fold)
        {
            Vector128 <uint> lo = Pclmulqdq.CarrylessMultiply(crc.AsUInt64(), fold.AsUInt64(), 0x01).AsUInt32();
            Vector128 <uint> hi = Pclmulqdq.CarrylessMultiply(crc.AsUInt64(), fold.AsUInt64(), 0x10).AsUInt32();

            // XOR in the float domain (Sse.Xor on single lanes), matching the
            // original code's use of xorps rather than pxor.
            return Sse.Xor(lo.AsSingle(), hi.AsSingle()).AsUInt32();
        }
コード例 #9
0
        /// <summary>
        /// Returns the index of the first byte in <paramref name="value"/> whose
        /// most significant bit is set (the first non-ASCII byte), or a value
        /// >= 16 when every byte is ASCII. ARM64 little-endian only.
        /// </summary>
        /// <exception cref="PlatformNotSupportedException">AdvSimd.Arm64 unavailable or big-endian host.</exception>
        public static int GetIndexOfFirstNonAsciiByte(Vector128 <byte> value)
        {
            if (!AdvSimd.Arm64.IsSupported || !BitConverter.IsLittleEndian)
            {
                throw new PlatformNotSupportedException();
            }

            // extractedBits[i] = (value[i] >> 7) & (1 << (12 * (i % 2)));
            // Arithmetic shift replicates each sign bit (0xFF / 0x00), then
            // s_bitmask (declared elsewhere in this type) assigns per-byte weights.
            Vector128 <byte> mostSignificantBitIsSet = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7).AsByte();
            Vector128 <byte> extractedBits           = AdvSimd.And(mostSignificantBitIsSet, s_bitmask);

            // collapse mask to lower bits
            extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits);
            ulong mask = extractedBits.AsUInt64().ToScalar();

            // calculate the index
            // Each source byte maps to 4 bits of the mask, so dividing the
            // trailing-zero count by 4 recovers the byte index.
            int index = BitOperations.TrailingZeroCount(mask) >> 2;

            Debug.Assert((mask != 0) ? index < 16 : index >= 16);
            return(index);
        }
コード例 #10
0
        /// <summary>
        /// Integer-math step (CryptoNight "variant two" by name): mixes the previous
        /// division and square-root results into b[0], then derives new ones from
        /// the two scratchpad words at <paramref name="ptr"/>.
        /// </summary>
        /// <param name="b">Destination word; b[0] is XOR-mixed in place.</param>
        /// <param name="ptr">Source pair: ptr[0] seeds the divisor/sqrt, ptr[1] is the dividend.</param>
        /// <param name="divisionResult">Previous packed quotient/remainder; replaced with the new one.</param>
        /// <param name="sqrtResult">Previous integer sqrt; replaced with the new one.</param>
        private unsafe static void VariantTwoIntegerMath(
            ulong *b,
            ulong *ptr,
            ref ulong divisionResult,
            ref ulong sqrtResult)
        {
            b[0] ^= divisionResult ^ (sqrtResult << 32);
            ulong dividend = ptr[1];
            // OR with 0x80000001 forces the divisor odd with the top bit set,
            // so it is never zero and the division below cannot fault.
            uint  divisor  = (uint)((ptr[0] + (uint)(sqrtResult << 1)) | 0x80000001UL);

            // Pack quotient (low 32 bits) and remainder (high 32 bits) into one word.
            divisionResult = ((uint)(dividend / divisor)) + ((ulong)(dividend % divisor) << 32);
            ulong sqrtInput = ptr[0] + divisionResult;

            // Integer sqrt via the double-precision unit: add a 2^1023 exponent
            // bias to the raw bits, take a scalar sqrt, then strip the bias and
            // the mantissa's low bits with the >> 19.
            Vector128 <ulong>  expDoubleBias = Vector128.Create(1023UL << 52, 0);
            Vector128 <double> x             = Sse2.Add(Sse2.X64.ConvertScalarToVector128UInt64(sqrtInput >> 12), expDoubleBias).AsDouble();

            x          = Sse2.SqrtScalar(Vector128.Create(0).AsDouble(), x);
            sqrtResult = Sse2.X64.ConvertToUInt64(Sse2.Subtract(x.AsUInt64(), expDoubleBias)) >> 19;

            // Correct the rounding error the floating-point sqrt may introduce
            // (VariantTwoSqrtFixup is defined elsewhere in this type).
            VariantTwoSqrtFixup(ref sqrtResult, sqrtInput);
        }
コード例 #11
0
ファイル: clmul.cs プロジェクト: aaru-dps/Aaru.Checksums
        /// <summary>
        /// CRC-32 over <paramref name="src"/> using PCLMULQDQ folding, structured
        /// like zlib's crc_fold: 64-byte blocks are folded into four 128-bit
        /// accumulators, which are then reduced 512 -> 128 -> 32 bits using the
        /// constants in <c>_crcK</c> (declared elsewhere in this type).
        /// NOTE(review): the main loop only consumes whole 64-byte blocks; any
        /// remainder of len below 64 is not read here — confirm callers handle tails.
        /// </summary>
        /// <param name="src">Input bytes.</param>
        /// <param name="len">Number of bytes to process.</param>
        /// <param name="initialCRC">Running CRC to continue from (XORed into the first block).</param>
        /// <returns>The bit-complemented CRC extracted from lane 2 of the final accumulator.</returns>
        internal static uint Step(byte[] src, long len, uint initialCRC)
        {
            Vector128 <uint> xmmT0, xmmT1, xmmT2;
            Vector128 <uint> xmmInitial = Sse2.ConvertScalarToVector128UInt32(initialCRC);
            // 0x9db42487 seeds the accumulator, matching zlib's crc_fold_init value.
            Vector128 <uint> xmmCRC0    = Sse2.ConvertScalarToVector128UInt32(0x9db42487);
            Vector128 <uint> xmmCRC1    = Vector128 <uint> .Zero;
            Vector128 <uint> xmmCRC2    = Vector128 <uint> .Zero;
            Vector128 <uint> xmmCRC3    = Vector128 <uint> .Zero;
            int bufPos = 0;

            bool first = true;

            /* fold 512 to 32 step variable declarations for ISO-C90 compat. */
            Vector128 <uint> xmmMask  = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000);
            Vector128 <uint> xmmMask2 = Vector128.Create(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);

            // Main loop: fold the accumulators forward and XOR in 64 fresh bytes
            // (four 16-byte lanes) per iteration.
            while ((len -= 64) >= 0)
            {
                xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
                                         BitConverter.ToUInt32(src, bufPos + 8),
                                         BitConverter.ToUInt32(src, bufPos + 12));

                bufPos += 16;

                xmmT1 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
                                         BitConverter.ToUInt32(src, bufPos + 8),
                                         BitConverter.ToUInt32(src, bufPos + 12));

                bufPos += 16;

                xmmT2 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
                                         BitConverter.ToUInt32(src, bufPos + 8),
                                         BitConverter.ToUInt32(src, bufPos + 12));

                bufPos += 16;

                Vector128 <uint> xmmT3 = Vector128.Create(BitConverter.ToUInt32(src, bufPos),
                                                          BitConverter.ToUInt32(src, bufPos + 4),
                                                          BitConverter.ToUInt32(src, bufPos + 8),
                                                          BitConverter.ToUInt32(src, bufPos + 12));

                bufPos += 16;

                // The caller's running CRC is folded into the very first block only.
                if (first)
                {
                    first = false;
                    xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
                }

                Fold4(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);

                xmmCRC0 = Sse2.Xor(xmmCRC0, xmmT0);
                xmmCRC1 = Sse2.Xor(xmmCRC1, xmmT1);
                xmmCRC2 = Sse2.Xor(xmmCRC2, xmmT2);
                xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT3);
            }

            /* fold 512 to 32 */

            /*
             * k1
             */
            // Cascade the four accumulators down into xmmCRC3 (512 -> 128 bits).
            Vector128 <uint> crcFold = Vector128.Create(_crcK[0], _crcK[1], _crcK[2], _crcK[3]);

            Vector128 <uint> xTmp0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), crcFold.AsUInt64(), 0x10).
                                     AsUInt32();

            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
            xmmCRC1 = Sse2.Xor(xmmCRC1, xTmp0);
            xmmCRC1 = Sse2.Xor(xmmCRC1, xmmCRC0);

            Vector128 <uint> xTmp1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), crcFold.AsUInt64(), 0x10).
                                     AsUInt32();

            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
            xmmCRC2 = Sse2.Xor(xmmCRC2, xTmp1);
            xmmCRC2 = Sse2.Xor(xmmCRC2, xmmCRC1);

            Vector128 <uint> xTmp2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), crcFold.AsUInt64(), 0x10).
                                     AsUInt32();

            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
            xmmCRC3 = Sse2.Xor(xmmCRC3, xTmp2);
            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);

            /*
             * k5
             */
            // Reduce 128 -> 64 bits.
            crcFold = Vector128.Create(_crcK[4], _crcK[5], _crcK[6], _crcK[7]);

            xmmCRC0 = xmmCRC3;
            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0).AsUInt32();
            xmmCRC0 = Sse2.ShiftRightLogical128BitLane(xmmCRC0, 8);
            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC0);

            xmmCRC0 = xmmCRC3;
            xmmCRC3 = Sse2.ShiftLeftLogical128BitLane(xmmCRC3, 4);
            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC0);
            xmmCRC3 = Sse2.And(xmmCRC3, xmmMask2);

            /*
             * k7
             */
            // Barrett-style final reduction 64 -> 32 bits.
            xmmCRC1 = xmmCRC3;
            xmmCRC2 = xmmCRC3;
            crcFold = Vector128.Create(_crcK[8], _crcK[9], _crcK[10], _crcK[11]);

            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0).AsUInt32();
            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);
            xmmCRC3 = Sse2.And(xmmCRC3, xmmMask);

            xmmCRC2 = xmmCRC3;
            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);
            xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC1);

            /*
             * could just as well write xmm_crc3[2], doing a movaps and truncating, but
             * no real advantage - it's a tiny bit slower per call, while no additional CPUs
             * would be supported by only requiring SSSE3 and CLMUL instead of SSE4.1 + CLMUL
             */
            return(~Sse41.Extract(xmmCRC3, 2));
        }
コード例 #12
0
 /// <summary>
 /// Generic shim: bitwise NOT is element-type agnostic, so reinterpret the
 /// lanes as ulong, delegate to the ulong overload, and cast back to T.
 /// </summary>
 public static Vector128 <T> Not_Software <T>(Vector128 <T> vector) where T : struct
 {
     Vector128 <ulong> flipped = Not_Software(vector.AsUInt64());
     return flipped.As<ulong, T>();
 }
コード例 #13
0
        /// <summary>
        /// Writes <paramref name="input"/> out as NV12: a downsampled luma (Y) plane
        /// followed by an interleaved, 2x2-subsampled chroma (UV) plane, each built in
        /// a rented buffer and handed to WriteBuffer. SSE fast paths are used when
        /// available, with scalar fallbacks producing the same bytes.
        /// NOTE(review): the vector paths assume a Pixel is four 16-bit channels
        /// (8 bytes; 16 luma pixels are covered by eight 16-byte loads) and that the
        /// shift-right-by-2 matches the scalar Downsample() — confirm against Pixel.
        /// </summary>
        private unsafe static void WriteNv12(ResourceManager rm, Surface input, ref OutputSurfaceConfig config, ref PlaneOffsets offsets)
        {
            int gobBlocksInY = 1 << config.OutBlkHeight;

            bool outLinear = config.OutBlkKind == 0;

            // Output plane dimensions are stored minus one in the config.
            int width   = Math.Min(config.OutLumaWidth + 1, input.Width);
            int height  = Math.Min(config.OutLumaHeight + 1, input.Height);
            int yStride = GetPitch(config.OutLumaWidth + 1, 1);

            int dstYIndex = rm.BufferPool.Rent((config.OutLumaHeight + 1) * yStride, out Span <byte> dstY);

            if (Sse41.IsSupported)
            {
                // 0x...ffff per 64-bit group: keeps only the first 16-bit channel
                // (lanes 0 and 4) of each pair of pixels per load.
                Vector128 <ushort> mask = Vector128.Create(0xffffUL).AsUInt16();

                int widthTrunc = width & ~0xf;
                int strideGap  = yStride - width;

                fixed(Pixel *srcPtr = input.Data)
                {
                    Pixel *ip = srcPtr;

                    fixed(byte *dstPtr = dstY)
                    {
                        byte *op = dstPtr;

                        for (int y = 0; y < height; y++, ip += input.Width)
                        {
                            int x = 0;

                            // Fast path: 16 pixels per iteration.
                            for (; x < widthTrunc; x += 16)
                            {
                                byte *baseOffset = (byte *)(ip + (ulong)(uint)x);

                                Vector128 <ushort> pixelp1 = Sse2.LoadVector128((ushort *)baseOffset);
                                Vector128 <ushort> pixelp2 = Sse2.LoadVector128((ushort *)(baseOffset + 0x10));
                                Vector128 <ushort> pixelp3 = Sse2.LoadVector128((ushort *)(baseOffset + 0x20));
                                Vector128 <ushort> pixelp4 = Sse2.LoadVector128((ushort *)(baseOffset + 0x30));
                                Vector128 <ushort> pixelp5 = Sse2.LoadVector128((ushort *)(baseOffset + 0x40));
                                Vector128 <ushort> pixelp6 = Sse2.LoadVector128((ushort *)(baseOffset + 0x50));
                                Vector128 <ushort> pixelp7 = Sse2.LoadVector128((ushort *)(baseOffset + 0x60));
                                Vector128 <ushort> pixelp8 = Sse2.LoadVector128((ushort *)(baseOffset + 0x70));

                                // Isolate the luma-source channel of each pixel.
                                pixelp1 = Sse2.And(pixelp1, mask);
                                pixelp2 = Sse2.And(pixelp2, mask);
                                pixelp3 = Sse2.And(pixelp3, mask);
                                pixelp4 = Sse2.And(pixelp4, mask);
                                pixelp5 = Sse2.And(pixelp5, mask);
                                pixelp6 = Sse2.And(pixelp6, mask);
                                pixelp7 = Sse2.And(pixelp7, mask);
                                pixelp8 = Sse2.And(pixelp8, mask);

                                // Two rounds of 32 -> 16 bit packing compact the
                                // 16 channel values into two vectors.
                                Vector128 <ushort> pixelq1 = Sse41.PackUnsignedSaturate(pixelp1.AsInt32(), pixelp2.AsInt32());
                                Vector128 <ushort> pixelq2 = Sse41.PackUnsignedSaturate(pixelp3.AsInt32(), pixelp4.AsInt32());
                                Vector128 <ushort> pixelq3 = Sse41.PackUnsignedSaturate(pixelp5.AsInt32(), pixelp6.AsInt32());
                                Vector128 <ushort> pixelq4 = Sse41.PackUnsignedSaturate(pixelp7.AsInt32(), pixelp8.AsInt32());

                                pixelq1 = Sse41.PackUnsignedSaturate(pixelq1.AsInt32(), pixelq2.AsInt32());
                                pixelq2 = Sse41.PackUnsignedSaturate(pixelq3.AsInt32(), pixelq4.AsInt32());

                                // Drop the two low bits, then saturate-pack to bytes
                                // (vector equivalent of the scalar Downsample below).
                                pixelq1 = Sse2.ShiftRightLogical(pixelq1, 2);
                                pixelq2 = Sse2.ShiftRightLogical(pixelq2, 2);

                                Vector128 <byte> pixel = Sse2.PackUnsignedSaturate(pixelq1.AsInt16(), pixelq2.AsInt16());

                                Sse2.Store(op, pixel);

                                op += 0x10;
                            }

                            // Scalar tail for the remaining (width % 16) pixels.
                            for (; x < width; x++)
                            {
                                Pixel *px = ip + (uint)x;

                                *op++ = Downsample(px->R);
                            }

                            op += strideGap;
                        }
                    }
                }
            }
            else
            {
                for (int y = 0; y < height; y++)
                {
                    for (int x = 0; x < width; x++)
                    {
                        dstY[y * yStride + x] = Downsample(input.GetR(x, y));
                    }
                }
            }

            WriteBuffer(
                rm,
                dstY,
                offsets.LumaOffset,
                outLinear,
                config.OutLumaWidth + 1,
                config.OutLumaHeight + 1,
                1,
                gobBlocksInY);

            rm.BufferPool.Return(dstYIndex);

            // Chroma plane: half resolution in both axes, two bytes (U, V) per sample.
            int uvWidth  = Math.Min(config.OutChromaWidth + 1, (width + 1) >> 1);
            int uvHeight = Math.Min(config.OutChromaHeight + 1, (height + 1) >> 1);
            int uvStride = GetPitch(config.OutChromaWidth + 1, 2);

            int dstUvIndex = rm.BufferPool.Rent((config.OutChromaHeight + 1) * uvStride, out Span <byte> dstUv);

            if (Sse2.IsSupported)
            {
                int widthTrunc = uvWidth & ~7;
                int strideGap  = uvStride - uvWidth * 2;

                fixed(Pixel *srcPtr = input.Data)
                {
                    Pixel *ip = srcPtr;

                    // ip advances two source rows per output row (vertical 2x subsample).
                    for (int y = 0; y < uvHeight; y++, ip += input.Width * 2)
                    // (loop body below; dstPtr pinned outside the row loop)
                    {
                    }
                }
            }
            else
            {
            }
        }
コード例 #14
0
ファイル: Ghash.cs プロジェクト: vcsjones/AesNi
 /// <summary>
 /// Byte-vector convenience overload: reinterprets the four hash-key powers and
 /// four input blocks as 64-bit lanes and forwards to the ulong-based Reduce4.
 /// </summary>
 public static Vector128 <byte> Reduce4(
     Vector128 <byte> h1, Vector128 <byte> h2, Vector128 <byte> h3, Vector128 <byte> h4,
     Vector128 <byte> x1, Vector128 <byte> x2, Vector128 <byte> x3, Vector128 <byte> x4)
 {
     Vector128 <ulong> reduced = Reduce4(
         h1.AsUInt64(), h2.AsUInt64(), h3.AsUInt64(), h4.AsUInt64(),
         x1.AsUInt64(), x2.AsUInt64(), x3.AsUInt64(), x4.AsUInt64());

     return reduced.AsByte();
 }
コード例 #15
0
        /// <summary>
        /// Vectorised Adler-32 style checksum loop (SSE2/SSSE3), following the same
        /// scheme as zlib's adler32_simd: 32-byte blocks are accumulated into vector
        /// sums for s1 (byte sum) and s2 (position-weighted sum), reducing modulo
        /// MOD64 every NMAX64 bytes; the sub-block tail is handled serially.
        /// BLOCK_SIZE / NMAX64 / MOD64 are declared elsewhere in this type.
        /// </summary>
        /// <param name="buffer">Input bytes.</param>
        /// <param name="s1">Running low-word sum.</param>
        /// <param name="s2">Running high-word sum.</param>
        /// <returns>s1 combined with s2 shifted into the upper 32 bits.</returns>
        internal unsafe static ulong GetSse(ReadOnlySpan <byte> buffer, ulong s1, ulong s2)
        {
            uint len = (uint)buffer.Length;

            uint blocks = len / BLOCK_SIZE;

            len = len - blocks * BLOCK_SIZE;

            // Positional weights 32..1 for the two 16-byte halves of each block.
            Vector128 <sbyte> tap1            = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
            Vector128 <sbyte> tap2            = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
            Vector128 <byte>  zero            = Vector128 <byte> .Zero;
            Vector128 <short> onesShort       = Vector128.Create(1, 1, 1, 1, 1, 1, 1, 1);
            Vector128 <int>   onesInt         = Vector128.Create(1, 1, 1, 1);
            Vector128 <byte>  shuffleMask2301 = Vector128.Create((byte)4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11);
            Vector128 <byte>  shuffleMask1032 = Vector128.Create((byte)8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
            // Shuffle with 255 indices zeroes those bytes, widening two 32-bit
            // partial sums into the two 64-bit lanes.
            Vector128 <byte>  shuffleMaskTrim = Vector128.Create(0, 1, 2, 3, 255, 255, 255, 255, 8, 9, 10, 11, 255, 255, 255, 255);
            // A B C D -> B A D C
            const int S2301 = 2 << 6 | 3 << 4 | 0 << 2 | 1;


            fixed(byte *bufPtr = &MemoryMarshal.GetReference(buffer))
            {
                var buf = bufPtr;

                while (blocks != 0)
                {
                    // Cap the run length so s2 cannot overflow before the modulo.
                    uint n = NMAX64 / BLOCK_SIZE;
                    if (n > blocks)
                    {
                        n = blocks;
                    }

                    blocks -= n;

                    // Process n blocks of data. At most NMAX data bytes can be
                    // processed before s2 must be reduced modulo BASE.
                    Vector128 <ulong> v_ps = Vector128.Create(0, s1 * n);
                    Vector128 <ulong> v_s2 = Vector128.Create(0, s2);
                    Vector128 <ulong> v_s1 = Vector128.Create(0ul, 0);

                    do
                    {
                        // Load 32 input bytes.
                        Vector128 <byte> bytes1 = Sse2.LoadVector128(&buf[0]);
                        Vector128 <byte> bytes2 = Sse2.LoadVector128(&buf[16]);


                        // Add previous block byte sum to v_ps.
                        v_ps = Sse2.Add(v_ps, v_s1);



                        // Horizontally add the bytes for s1, multiply-adds the
                        // bytes by [ 32, 31, 30, ... ] for s2.
                        Vector128 <ushort> sad1 = Sse2.SumAbsoluteDifferences(bytes1, zero);
                        v_s1 = Sse2.Add(v_s1, sad1.AsUInt64());
                        Vector128 <short> mad11 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                        Vector128 <int>   mad12 = Sse2.MultiplyAddAdjacent(mad11, onesShort);
                        // Swap-and-add pairs of 32-bit lanes, then trim to 64-bit lanes.
                        var mad121          = Sse2.Add(mad12, Sse2.Shuffle(mad12, S2301));
                        var madTrimmed1     = Ssse3.Shuffle(mad121.AsByte(), shuffleMaskTrim);
                        var madTimmed1ULong = madTrimmed1.AsUInt64();
                        v_s2 = Sse2.Add(v_s2, madTimmed1ULong);



                        Vector128 <ushort> sad2 = Sse2.SumAbsoluteDifferences(bytes2, zero);
                        v_s1 = Sse2.Add(v_s1, sad2.AsUInt64());
                        Vector128 <short> mad2  = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                        Vector128 <int>   mad22 = Sse2.MultiplyAddAdjacent(mad2, onesShort);
                        var mad221          = Sse2.Add(mad22, Sse2.Shuffle(mad22, S2301));
                        var madTrimmed2     = Ssse3.Shuffle(mad221.AsByte(), shuffleMaskTrim);
                        var madTimmed2ULong = madTrimmed2.AsUInt64();
                        v_s2 = Sse2.Add(v_s2, madTimmed2ULong);


                        buf += BLOCK_SIZE;

                        n--;
                    } while (n != 0);


                    // s2 gains 32 * (accumulated per-block byte sums) — hence the << 5.
                    var shifted = Sse2.ShiftLeftLogical(v_ps, 5);
                    v_s2 = Sse2.Add(v_s2, shifted);

                    s1 += v_s1.GetElement(0);
                    s1 += v_s1.GetElement(1);


                    // v_s2 already contains the previous s2 (seeded above), so
                    // plain assignment here is intentional.
                    s2  = v_s2.GetElement(0);
                    s2 += v_s2.GetElement(1);

                    s1 %= MOD64;
                    s2 %= MOD64;
                }

                // Serial tail: fewer than BLOCK_SIZE bytes remain.
                if (len > 0)
                {
                    if (len >= 16)
                    {
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        len -= 16;
                    }

                    while (len-- > 0)
                    {
                        s2 += (s1 += *buf++);
                    }
                    if (s1 >= MOD64)
                    {
                        s1 -= MOD64;
                    }

                    s2 %= MOD64;
                }

                return(s1 | (s2 << 32));
            }
        }
コード例 #16
0
ファイル: Ghash.cs プロジェクト: vcsjones/AesNi
 /// <summary>
 /// GF(2^128) multiply over byte vectors: reinterprets both operands as
 /// 64-bit lanes, forwards to the <c>Gfmul(Vector128&lt;ulong&gt;, Vector128&lt;ulong&gt;)</c>
 /// overload, and reinterprets the product back as bytes.
 /// </summary>
 public static Vector128 <byte> Gfmul(Vector128 <byte> a, Vector128 <byte> b)
 {
     Vector128 <ulong> product = Gfmul(a.AsUInt64(), b.AsUInt64());
     return product.AsByte();
 }
コード例 #17
0
ファイル: MeowHash.cs プロジェクト: tvandijck/meow_hash.NET
 /// <summary>
 /// Lane-wise 64-bit addition (PADDQ) of two 128-bit registers viewed as
 /// two ulong lanes each; result is reinterpreted back as bytes.
 /// Addition wraps modulo 2^64 per lane.
 /// </summary>
 private static Vector128 <byte> AddQ(Vector128 <byte> r1, Vector128 <byte> r2)
     => Sse2.Add(r1.AsUInt64(), r2.AsUInt64()).AsByte();
コード例 #18
0
ファイル: XXHash128.cs プロジェクト: shukenmg/Ryujinx
        /// <summary>
        /// XXH3 accumulate step: folds one input stripe into the accumulator
        /// <paramref name="acc"/> using the matching slice of <paramref name="secret"/>.
        /// Dispatches to AVX2 (256-bit), SSE2 (128-bit), or a scalar fallback.
        /// Per 64-bit accumulator lane the update is:
        ///   acc[i]   += mul32x32to64(lo32(data^secret), hi32(data^secret)) + data_from_swapped_lane
        /// which all three paths implement identically.
        /// NOTE(review): StripeLen and AccNb are defined elsewhere in this file;
        /// presumably StripeLen == 64 and AccNb == 8 per the XXH3 spec — confirm.
        /// </summary>
        private unsafe static void Xxh3Accumulate512(Span <ulong> acc, ReadOnlySpan <byte> input, ReadOnlySpan <byte> secret)
        {
            if (Avx2.IsSupported)
            {
                fixed(ulong *pAcc = acc)
                {
                    fixed(byte *pInput = input, pSecret = secret)
                    {
                        Vector256 <ulong> *xAcc    = (Vector256 <ulong> *)pAcc;
                        Vector256 <byte> * xInput  = (Vector256 <byte> *)pInput;
                        Vector256 <byte> * xSecret = (Vector256 <byte> *)pSecret;

                        for (ulong i = 0; i < StripeLen / 32; i++)
                        {
                            Vector256 <byte>  dataVec   = xInput[i];
                            Vector256 <byte>  keyVec    = xSecret[i];
                            Vector256 <byte>  dataKey   = Avx2.Xor(dataVec, keyVec);
                            // imm 0b00110001 swaps the two 32-bit halves of each 64-bit lane,
                            // so Multiply (PMULUDQ) computes lo32 * hi32 of each lane as a 64-bit product.
                            Vector256 <uint>  dataKeyLo = Avx2.Shuffle(dataKey.AsUInt32(), 0b00110001);
                            Vector256 <ulong> product   = Avx2.Multiply(dataKey.AsUInt32(), dataKeyLo);
                            // imm 0b01001110 swaps the 64-bit halves within each 128-bit lane
                            // (the vector analogue of the scalar path's acc[i ^ 1] += dataVal).
                            Vector256 <uint>  dataSwap  = Avx2.Shuffle(dataVec.AsUInt32(), 0b01001110);
                            Vector256 <ulong> sum       = Avx2.Add(xAcc[i], dataSwap.AsUInt64());
                            xAcc[i] = Avx2.Add(product, sum);
                        }
                    }
                }
            }
            else if (Sse2.IsSupported)
            {
                // Same lane math as the AVX2 path, two 64-bit lanes at a time.
                fixed(ulong *pAcc = acc)
                {
                    fixed(byte *pInput = input, pSecret = secret)
                    {
                        Vector128 <ulong> *xAcc    = (Vector128 <ulong> *)pAcc;
                        Vector128 <byte> * xInput  = (Vector128 <byte> *)pInput;
                        Vector128 <byte> * xSecret = (Vector128 <byte> *)pSecret;

                        for (ulong i = 0; i < StripeLen / 16; i++)
                        {
                            Vector128 <byte>  dataVec   = xInput[i];
                            Vector128 <byte>  keyVec    = xSecret[i];
                            Vector128 <byte>  dataKey   = Sse2.Xor(dataVec, keyVec);
                            Vector128 <uint>  dataKeyLo = Sse2.Shuffle(dataKey.AsUInt32(), 0b00110001);
                            Vector128 <ulong> product   = Sse2.Multiply(dataKey.AsUInt32(), dataKeyLo);
                            Vector128 <uint>  dataSwap  = Sse2.Shuffle(dataVec.AsUInt32(), 0b01001110);
                            Vector128 <ulong> sum       = Sse2.Add(xAcc[i], dataSwap.AsUInt64());
                            xAcc[i] = Sse2.Add(product, sum);
                        }
                    }
                }
            }
            else
            {
                // Scalar fallback: one 64-bit accumulator lane per iteration.
                for (int i = 0; i < AccNb; i++)
                {
                    ulong dataVal = BinaryPrimitives.ReadUInt64LittleEndian(input.Slice(i * sizeof(ulong)));
                    ulong dataKey = dataVal ^ BinaryPrimitives.ReadUInt64LittleEndian(secret.Slice(i * sizeof(ulong)));
                    // i ^ 1 adds the raw data into the partner lane (the lane-swap done
                    // by the dataSwap shuffle in the vector paths).
                    acc[i ^ 1] += dataVal;
                    acc[i]     += Mult32To64((uint)dataKey, dataKey >> 32);
                }
            }
        }
コード例 #19
0
        /// <summary>
        /// ChaCha20 stream cipher core: encrypts/decrypts <paramref name="bytes"/> bytes
        /// of message <paramref name="m"/> into <paramref name="c"/> by XOR with the
        /// keystream derived from the 16-word state <paramref name="x"/>.
        /// Processes 8 blocks (512 B) per iteration with AVX2, then 4 blocks (256 B)
        /// with SSE2, then single 64 B blocks, then a final partial block.
        /// The 64-bit block counter held in x[12] (low) / x[13] (high) is advanced
        /// in place. The round loops run 20 rounds (10 double-rounds).
        /// NOTE(review): rot16_128 / rot8_128 are shuffle masks defined elsewhere in
        /// this file — presumably the usual byte-rotate-by-16/8 PSHUFB constants.
        /// </summary>
        public static unsafe void ChaCha20(uint *x, byte *m, byte *c, ulong bytes)
        {
            // --- AVX2 path: 8 interleaved blocks, one state word per vector register ---
            if (Avx2.IsSupported && bytes >= 512)
            {
                Vector256 <uint> x_0  = Vector256.Create(x[0]);
                Vector256 <uint> x_1  = Vector256.Create(x[1]);
                Vector256 <uint> x_2  = Vector256.Create(x[2]);
                Vector256 <uint> x_3  = Vector256.Create(x[3]);
                Vector256 <uint> x_4  = Vector256.Create(x[4]);
                Vector256 <uint> x_5  = Vector256.Create(x[5]);
                Vector256 <uint> x_6  = Vector256.Create(x[6]);
                Vector256 <uint> x_7  = Vector256.Create(x[7]);
                Vector256 <uint> x_8  = Vector256.Create(x[8]);
                Vector256 <uint> x_9  = Vector256.Create(x[9]);
                Vector256 <uint> x_10 = Vector256.Create(x[10]);
                Vector256 <uint> x_11 = Vector256.Create(x[11]);
                Vector256 <uint> x_12;
                Vector256 <uint> x_13;
                Vector256 <uint> x_14 = Vector256.Create(x[14]);
                Vector256 <uint> x_15 = Vector256.Create(x[15]);

                Vector256 <uint> orig0  = x_0;
                Vector256 <uint> orig1  = x_1;
                Vector256 <uint> orig2  = x_2;
                Vector256 <uint> orig3  = x_3;
                Vector256 <uint> orig4  = x_4;
                Vector256 <uint> orig5  = x_5;
                Vector256 <uint> orig6  = x_6;
                Vector256 <uint> orig7  = x_7;
                Vector256 <uint> orig8  = x_8;
                Vector256 <uint> orig9  = x_9;
                Vector256 <uint> orig10 = x_10;
                Vector256 <uint> orig11 = x_11;
                Vector256 <uint> orig12;
                Vector256 <uint> orig13;
                Vector256 <uint> orig14 = x_14;
                Vector256 <uint> orig15 = x_15;

                while (bytes >= 512)
                {
                    Vector256 <uint> addv12 = Vector256.Create(0, 1, 2, 3).AsUInt32();
                    Vector256 <uint> addv13 = Vector256.Create(4, 5, 6, 7).AsUInt32();
                    Vector256 <uint> permute = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7).AsUInt32();
                    Vector256 <uint> t12, t13;
                    x_0  = orig0;
                    x_1  = orig1;
                    x_2  = orig2;
                    x_3  = orig3;
                    x_4  = orig4;
                    x_5  = orig5;
                    x_6  = orig6;
                    x_7  = orig7;
                    x_8  = orig8;
                    x_9  = orig9;
                    x_10 = orig10;
                    x_11 = orig11;
                    x_14 = orig14;
                    x_15 = orig15;
                    // Build per-lane counters: lane k of the 8 parallel blocks gets
                    // counter+k, split into its low (x_12) and high (x_13) 32-bit words.
                    uint  in12 = x[12];
                    uint  in13 = x[13];
                    ulong in1213 = in12 | ((ulong)in13 << 32);
                    x_12 = x_13 = Avx2.BroadcastScalarToVector256(Sse2.X64.ConvertScalarToVector128UInt64(in1213)).AsUInt32();
                    t12  = Avx2.Add(addv12.AsUInt64(), x_12.AsUInt64()).AsUInt32();
                    t13  = Avx2.Add(addv13.AsUInt64(), x_13.AsUInt64()).AsUInt32();
                    x_12 = Avx2.UnpackLow(t12, t13);
                    x_13 = Avx2.UnpackHigh(t12, t13);
                    t12  = Avx2.UnpackLow(x_12, x_13);
                    t13  = Avx2.UnpackHigh(x_12, x_13);
                    x_12 = Avx2.PermuteVar8x32(t12, permute);
                    x_13 = Avx2.PermuteVar8x32(t13, permute);

                    orig12 = x_12;
                    orig13 = x_13;

                    // Advance the 64-bit block counter past the 8 blocks just set up.
                    in1213 += 8;

                    x[12] = (uint)(in1213 & 0xFFFFFFFF);
                    x[13] = (uint)((in1213 >> 32) & 0xFFFFFFFF);
                    // 20 rounds = 10 double-rounds (column round + diagonal round).
                    for (int i = 0; i < 20; i += 2)
                    {
                        Vec256Round(ref x_0, ref x_4, ref x_8, ref x_12, ref x_1, ref x_5, ref x_9, ref x_13, ref x_2, ref x_6, ref x_10, ref x_14, ref x_3, ref x_7, ref x_11, ref x_15);
                        Vec256Round(ref x_0, ref x_5, ref x_10, ref x_15, ref x_1, ref x_6, ref x_11, ref x_12, ref x_2, ref x_7, ref x_8, ref x_13, ref x_3, ref x_4, ref x_9, ref x_14);
                    }

                    Vector256 <uint> t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12, t_13, t_14, t_15;
                    t_0 = t_1 = t_2 = t_3 = t_4 = t_5 = t_6 = t_7 = t_8 = t_9 = t_10 = t_11 = t_12 = t_13 = t_14 = t_15 = Vector256.Create((uint)0);
                    // Transpose the word-sliced registers back into contiguous 64-byte
                    // blocks, add the original state, then XOR with the message.
                    // ONEOCTO enter
                    OneQuadUnpack(ref x_0, ref x_1, ref x_2, ref x_3, ref t_0, ref t_1, ref t_2, ref t_3, ref orig0, ref orig1, ref orig2, ref orig3);
                    OneQuadUnpack(ref x_4, ref x_5, ref x_6, ref x_7, ref t_4, ref t_5, ref t_6, ref t_7, ref orig4, ref orig5, ref orig6, ref orig7);
                    t_0 = Avx2.Permute2x128(x_0, x_4, 0x20);
                    t_4 = Avx2.Permute2x128(x_0, x_4, 0x31);
                    t_1 = Avx2.Permute2x128(x_1, x_5, 0x20);
                    t_5 = Avx2.Permute2x128(x_1, x_5, 0x31);
                    t_2 = Avx2.Permute2x128(x_2, x_6, 0x20);
                    t_6 = Avx2.Permute2x128(x_2, x_6, 0x31);
                    t_3 = Avx2.Permute2x128(x_3, x_7, 0x20);
                    t_7 = Avx2.Permute2x128(x_3, x_7, 0x31);
                    t_0 = Avx2.Xor(t_0, Avx.LoadVector256(m).AsUInt32());
                    t_1 = Avx2.Xor(t_1, Avx.LoadVector256(m + 64).AsUInt32());
                    t_2 = Avx2.Xor(t_2, Avx.LoadVector256(m + 128).AsUInt32());
                    t_3 = Avx2.Xor(t_3, Avx.LoadVector256(m + 192).AsUInt32());
                    t_4 = Avx2.Xor(t_4, Avx.LoadVector256(m + 256).AsUInt32());
                    t_5 = Avx2.Xor(t_5, Avx.LoadVector256(m + 320).AsUInt32());
                    t_6 = Avx2.Xor(t_6, Avx.LoadVector256(m + 384).AsUInt32());
                    t_7 = Avx2.Xor(t_7, Avx.LoadVector256(m + 448).AsUInt32());
                    Avx.Store(c, t_0.AsByte());
                    Avx.Store(c + 64, t_1.AsByte());
                    Avx.Store(c + 128, t_2.AsByte());
                    Avx.Store(c + 192, t_3.AsByte());
                    Avx.Store(c + 256, t_4.AsByte());
                    Avx.Store(c + 320, t_5.AsByte());
                    Avx.Store(c + 384, t_6.AsByte());
                    Avx.Store(c + 448, t_7.AsByte());
                    // ONEOCTO exit

                    // Second half of each 64-byte block lives at a +32 byte offset,
                    // hence the temporary pointer bump (undone below).
                    m += 32;
                    c += 32;

                    // ONEOCTO enter
                    OneQuadUnpack(ref x_8, ref x_9, ref x_10, ref x_11, ref t_8, ref t_9, ref t_10, ref t_11, ref orig8, ref orig9, ref orig10, ref orig11);
                    OneQuadUnpack(ref x_12, ref x_13, ref x_14, ref x_15, ref t_12, ref t_13, ref t_14, ref t_15, ref orig12, ref orig13, ref orig14, ref orig15);
                    t_8  = Avx2.Permute2x128(x_8, x_12, 0x20);
                    t_12 = Avx2.Permute2x128(x_8, x_12, 0x31);
                    t_9  = Avx2.Permute2x128(x_9, x_13, 0x20);
                    t_13 = Avx2.Permute2x128(x_9, x_13, 0x31);
                    t_10 = Avx2.Permute2x128(x_10, x_14, 0x20);
                    t_14 = Avx2.Permute2x128(x_10, x_14, 0x31);
                    t_11 = Avx2.Permute2x128(x_11, x_15, 0x20);
                    t_15 = Avx2.Permute2x128(x_11, x_15, 0x31);
                    t_8  = Avx2.Xor(t_8, Avx.LoadVector256(m).AsUInt32());
                    t_9  = Avx2.Xor(t_9, Avx.LoadVector256(m + 64).AsUInt32());
                    t_10 = Avx2.Xor(t_10, Avx.LoadVector256(m + 128).AsUInt32());
                    t_11 = Avx2.Xor(t_11, Avx.LoadVector256(m + 192).AsUInt32());
                    t_12 = Avx2.Xor(t_12, Avx.LoadVector256(m + 256).AsUInt32());
                    t_13 = Avx2.Xor(t_13, Avx.LoadVector256(m + 320).AsUInt32());
                    t_14 = Avx2.Xor(t_14, Avx.LoadVector256(m + 384).AsUInt32());
                    t_15 = Avx2.Xor(t_15, Avx.LoadVector256(m + 448).AsUInt32());
                    Avx.Store(c, t_8.AsByte());
                    Avx.Store(c + 64, t_9.AsByte());
                    Avx.Store(c + 128, t_10.AsByte());
                    Avx.Store(c + 192, t_11.AsByte());
                    Avx.Store(c + 256, t_12.AsByte());
                    Avx.Store(c + 320, t_13.AsByte());
                    Avx.Store(c + 384, t_14.AsByte());
                    Avx.Store(c + 448, t_15.AsByte());
                    // ONEOCTO exit
                    m     -= 32;
                    c     -= 32;
                    bytes -= 512;
                    c     += 512;
                    m     += 512;
                }
            }
            // --- SSE2 path: 4 interleaved blocks (256 bytes) per iteration ---
            if (bytes >= 256)
            {
                Vector128 <uint> x_0  = Vector128.Create(x[0]);
                Vector128 <uint> x_1  = Vector128.Create(x[1]);
                Vector128 <uint> x_2  = Vector128.Create(x[2]);
                Vector128 <uint> x_3  = Vector128.Create(x[3]);
                Vector128 <uint> x_4  = Vector128.Create(x[4]);
                Vector128 <uint> x_5  = Vector128.Create(x[5]);
                Vector128 <uint> x_6  = Vector128.Create(x[6]);
                Vector128 <uint> x_7  = Vector128.Create(x[7]);
                Vector128 <uint> x_8  = Vector128.Create(x[8]);
                Vector128 <uint> x_9  = Vector128.Create(x[9]);
                Vector128 <uint> x_10 = Vector128.Create(x[10]);
                Vector128 <uint> x_11 = Vector128.Create(x[11]);
                Vector128 <uint> x_12;
                Vector128 <uint> x_13;
                Vector128 <uint> x_14   = Vector128.Create(x[14]);
                Vector128 <uint> x_15   = Vector128.Create(x[15]);
                Vector128 <uint> orig0  = x_0;
                Vector128 <uint> orig1  = x_1;
                Vector128 <uint> orig2  = x_2;
                Vector128 <uint> orig3  = x_3;
                Vector128 <uint> orig4  = x_4;
                Vector128 <uint> orig5  = x_5;
                Vector128 <uint> orig6  = x_6;
                Vector128 <uint> orig7  = x_7;
                Vector128 <uint> orig8  = x_8;
                Vector128 <uint> orig9  = x_9;
                Vector128 <uint> orig10 = x_10;
                Vector128 <uint> orig11 = x_11;
                Vector128 <uint> orig12;
                Vector128 <uint> orig13;
                Vector128 <uint> orig14 = x_14;
                Vector128 <uint> orig15 = x_15;
                Vector128 <uint> t12, t13;

                while (bytes >= 256)
                {
                    Vector128 <uint> addv12 = Vector128.Create(0, 1).AsUInt32();
                    Vector128 <uint> addv13 = Vector128.Create(2, 3).AsUInt32();

                    x_0  = orig0;
                    x_1  = orig1;
                    x_2  = orig2;
                    x_3  = orig3;
                    x_4  = orig4;
                    x_5  = orig5;
                    x_6  = orig6;
                    x_7  = orig7;
                    x_8  = orig8;
                    x_9  = orig9;
                    x_10 = orig10;
                    x_11 = orig11;
                    x_14 = orig14;
                    x_15 = orig15;

                    // Per-lane counters: lanes get counter+0..+3, split into low/high words.
                    uint  in12   = x[12];
                    uint  in13   = x[13];
                    ulong in1213 = in12 | ((ulong)in13) << 32;
                    t12 = Vector128.Create(in1213).AsUInt32();
                    t13 = Vector128.Create(in1213).AsUInt32();

                    x_12 = Sse2.Add(Vector128.AsUInt64 <uint>(addv12), Vector128.AsUInt64 <uint>(t12)).AsUInt32();
                    x_13 = Sse2.Add(Vector128.AsUInt64 <uint>(addv13), Vector128.AsUInt64 <uint>(t13)).AsUInt32();

                    t12 = Sse2.UnpackLow(x_12, x_13);
                    t13 = Sse2.UnpackHigh(x_12, x_13);

                    x_12 = Sse2.UnpackLow(t12, t13);
                    x_13 = Sse2.UnpackHigh(t12, t13);

                    orig12 = x_12;
                    orig13 = x_13;

                    // Advance the 64-bit block counter past the 4 blocks just set up.
                    in1213 += 4;

                    x[12] = (uint)(in1213 & 0xFFFFFFFF);
                    x[13] = (uint)(in1213 >> 32 & 0xFFFFFFFF);

                    // 20 rounds: four column quarter-rounds then four diagonal ones.
                    for (int i = 0; i < 20; i += 2)
                    {
                        Vec128QuarterRound(ref x_0, ref x_4, ref x_8, ref x_12);
                        Vec128QuarterRound(ref x_1, ref x_5, ref x_9, ref x_13);
                        Vec128QuarterRound(ref x_2, ref x_6, ref x_10, ref x_14);
                        Vec128QuarterRound(ref x_3, ref x_7, ref x_11, ref x_15);
                        Vec128QuarterRound(ref x_0, ref x_5, ref x_10, ref x_15);
                        Vec128QuarterRound(ref x_1, ref x_6, ref x_11, ref x_12);
                        Vec128QuarterRound(ref x_2, ref x_7, ref x_8, ref x_13);
                        Vec128QuarterRound(ref x_3, ref x_4, ref x_9, ref x_14);
                    }
                    // Each OneQuad transposes 4 word-registers back to block order,
                    // adds the original state, and XOR-stores 16 bytes per block.
                    OneQuad(ref x_0, ref x_1, ref x_2, ref x_3, ref orig0, ref orig1, ref orig2, ref orig3, m, c);
                    m += 16;
                    c += 16;
                    OneQuad(ref x_4, ref x_5, ref x_6, ref x_7, ref orig4, ref orig5, ref orig6, ref orig7, m, c);
                    m += 16;
                    c += 16;
                    OneQuad(ref x_8, ref x_9, ref x_10, ref x_11, ref orig8, ref orig9, ref orig10, ref orig11, m, c);
                    m += 16;
                    c += 16;
                    OneQuad(ref x_12, ref x_13, ref x_14, ref x_15, ref orig12, ref orig13, ref orig14, ref orig15, m, c);
                    m     -= 48;
                    c     -= 48;
                    bytes -= 256;
                    c     += 256;
                    m     += 256;
                }
            }
            // --- Single 64-byte blocks: state held as four 4-word rows ---
            while (bytes >= 64)
            {
                Vector128 <uint> x_0 = Sse2.LoadVector128(x);
                Vector128 <uint> x_1 = Sse2.LoadVector128(x + 4);
                Vector128 <uint> x_2 = Sse2.LoadVector128(x + 8);
                Vector128 <uint> x_3 = Sse2.LoadVector128(x + 12);
                Vector128 <uint> t_1;

                for (int i = 0; i < 20; i += 2)
                {
                    x_0 = Sse2.Add(x_0, x_1);
                    x_3 = Sse2.Xor(x_3, x_0);
                    x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32();

                    x_2 = Sse2.Add(x_2, x_3);
                    x_1 = Sse2.Xor(x_1, x_2);

                    // Rotate-left by 12: shift pair then OR via XOR of disjoint bits.
                    t_1 = x_1;
                    x_1 = Sse2.ShiftLeftLogical(x_1, 12);
                    t_1 = Sse2.ShiftRightLogical(t_1, 20);
                    x_1 = Sse2.Xor(x_1, t_1);

                    x_0 = Sse2.Add(x_0, x_1);
                    x_3 = Sse2.Xor(x_3, x_0);
                    x_0 = Sse2.Shuffle(x_0, 147);
                    x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32();

                    x_2 = Sse2.Add(x_2, x_3);
                    x_3 = Sse2.Shuffle(x_3, 78);
                    x_1 = Sse2.Xor(x_1, x_2);
                    x_2 = Sse2.Shuffle(x_2, 57);

                    // Rotate-left by 7.
                    t_1 = x_1;
                    x_1 = Sse2.ShiftLeftLogical(x_1, 7);
                    t_1 = Sse2.ShiftRightLogical(t_1, 25);
                    x_1 = Sse2.Xor(x_1, t_1);

                    x_0 = Sse2.Add(x_0, x_1);
                    x_3 = Sse2.Xor(x_3, x_0);
                    x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32();

                    x_2 = Sse2.Add(x_2, x_3);
                    x_1 = Sse2.Xor(x_1, x_2);

                    t_1 = x_1;
                    x_1 = Sse2.ShiftLeftLogical(x_1, 12);
                    t_1 = Sse2.ShiftRightLogical(t_1, 20);
                    x_1 = Sse2.Xor(x_1, t_1);

                    x_0 = Sse2.Add(x_0, x_1);
                    x_3 = Sse2.Xor(x_3, x_0);
                    x_0 = Sse2.Shuffle(x_0, 57);
                    x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32();

                    x_2 = Sse2.Add(x_2, x_3);
                    x_3 = Sse2.Shuffle(x_3, 78);
                    x_1 = Sse2.Xor(x_1, x_2);
                    x_2 = Sse2.Shuffle(x_2, 147);

                    t_1 = x_1;
                    x_1 = Sse2.ShiftLeftLogical(x_1, 7);
                    t_1 = Sse2.ShiftRightLogical(t_1, 25);
                    x_1 = Sse2.Xor(x_1, t_1);
                }
                x_0 = Sse2.Add(x_0, Sse2.LoadVector128(x));
                x_1 = Sse2.Add(x_1, Sse2.LoadVector128(x + 4));
                x_2 = Sse2.Add(x_2, Sse2.LoadVector128(x + 8));
                x_3 = Sse2.Add(x_3, Sse2.LoadVector128(x + 12));
                x_0 = Sse2.Xor(x_0.AsByte(), Sse2.LoadVector128(m)).AsUInt32();
                x_1 = Sse2.Xor(x_1.AsByte(), Sse2.LoadVector128(m + 16)).AsUInt32();
                x_2 = Sse2.Xor(x_2.AsByte(), Sse2.LoadVector128(m + 32)).AsUInt32();
                x_3 = Sse2.Xor(x_3.AsByte(), Sse2.LoadVector128(m + 48)).AsUInt32();
                Sse2.Store(c, x_0.AsByte());
                Sse2.Store(c + 16, x_1.AsByte());
                Sse2.Store(c + 32, x_2.AsByte());
                Sse2.Store(c + 48, x_3.AsByte());

                // Increment the 64-bit block counter with manual carry.
                uint in12 = x[12];
                uint in13 = x[13];
                in12++;
                if (in12 == 0)
                {
                    in13++;
                }
                x[12] = in12;
                x[13] = in13;

                bytes -= 64;
                c     += 64;
                m     += 64;
            }
            // --- Final partial block: keystream to a stack buffer, XOR, then wipe ---
            if (bytes > 0)
            {
                Vector128 <uint> x_0 = Sse2.LoadVector128(x);
                Vector128 <uint> x_1 = Sse2.LoadVector128(x + 4);
                Vector128 <uint> x_2 = Sse2.LoadVector128(x + 8);
                Vector128 <uint> x_3 = Sse2.LoadVector128(x + 12);
                Vector128 <uint> t_1;
                for (int i = 0; i < 20; i += 2)
                {
                    x_0 = Sse2.Add(x_0, x_1);
                    x_3 = Sse2.Xor(x_3, x_0);
                    x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32();

                    x_2 = Sse2.Add(x_2, x_3);
                    x_1 = Sse2.Xor(x_1, x_2);

                    t_1 = x_1;
                    x_1 = Sse2.ShiftLeftLogical(x_1, 12);
                    t_1 = Sse2.ShiftRightLogical(t_1, 20);
                    x_1 = Sse2.Xor(x_1, t_1);

                    x_0 = Sse2.Add(x_0, x_1);
                    x_3 = Sse2.Xor(x_3, x_0);
                    x_0 = Sse2.Shuffle(x_0, 0x93);
                    x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32();

                    x_2 = Sse2.Add(x_2, x_3);
                    x_3 = Sse2.Shuffle(x_3, 0x4e);
                    x_1 = Sse2.Xor(x_1, x_2);
                    x_2 = Sse2.Shuffle(x_2, 0x39);

                    t_1 = x_1;
                    x_1 = Sse2.ShiftLeftLogical(x_1, 7);
                    t_1 = Sse2.ShiftRightLogical(t_1, 25);
                    x_1 = Sse2.Xor(x_1, t_1);

                    x_0 = Sse2.Add(x_0, x_1);
                    x_3 = Sse2.Xor(x_3, x_0);
                    x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32();

                    x_2 = Sse2.Add(x_2, x_3);
                    x_1 = Sse2.Xor(x_1, x_2);

                    t_1 = x_1;
                    x_1 = Sse2.ShiftLeftLogical(x_1, 12);
                    t_1 = Sse2.ShiftRightLogical(t_1, 20);
                    x_1 = Sse2.Xor(x_1, t_1);

                    x_0 = Sse2.Add(x_0, x_1);
                    x_3 = Sse2.Xor(x_3, x_0);
                    x_0 = Sse2.Shuffle(x_0, 0x39);
                    x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32();

                    x_2 = Sse2.Add(x_2, x_3);
                    x_3 = Sse2.Shuffle(x_3, 0x4e);
                    x_1 = Sse2.Xor(x_1, x_2);
                    x_2 = Sse2.Shuffle(x_2, 0x93);

                    t_1 = x_1;
                    x_1 = Sse2.ShiftLeftLogical(x_1, 7);
                    t_1 = Sse2.ShiftRightLogical(t_1, 25);
                    x_1 = Sse2.Xor(x_1, t_1);
                }
                x_0 = Sse2.Add(x_0, Sse2.LoadVector128(x));
                x_1 = Sse2.Add(x_1, Sse2.LoadVector128(x + 4));
                x_2 = Sse2.Add(x_2, Sse2.LoadVector128(x + 8));
                x_3 = Sse2.Add(x_3, Sse2.LoadVector128(x + 12));
                byte *partialblock = stackalloc byte[64];
                Sse2.Store(partialblock, Vector128.AsByte(x_0));
                Sse2.Store(partialblock + 16, Vector128.AsByte(x_1));
                Sse2.Store(partialblock + 32, Vector128.AsByte(x_2));
                Sse2.Store(partialblock + 48, Vector128.AsByte(x_3));

                for (ulong i = 0; i < bytes; i++)
                {
                    c[i] = (byte)(m[i] ^ partialblock[i]);
                }
                // Zero the keystream scratch so it does not linger on the stack.
                for (int n = 0; n < 64 / sizeof(int); n++)
                {
                    ((int *)partialblock)[n] = 0;
                }
            }
        }
コード例 #20
0
 /// <summary>
 /// Generic software AND-NOT over 128-bit vectors: reinterprets both operands
 /// as 64-bit lanes, forwards to the <c>Vector128&lt;ulong&gt;</c> overload, and
 /// reinterprets the result back to <typeparamref name="T"/>.
 /// </summary>
 public static Vector128 <T> AndNot_Software <T>(Vector128 <T> left, Vector128 <T> right) where T : struct
 {
     Vector128 <ulong> result = AndNot_Software(left.AsUInt64(), right.AsUInt64());
     return result.As <ulong, T>();
 }
コード例 #21
0
        /// <summary>
        /// Narrows UTF-16 chars to ASCII bytes using SSE2 (with SSE4.1 fast paths),
        /// stopping at the first non-ASCII char. Returns the number of chars
        /// successfully narrowed and written to <paramref name="pAsciiBuffer"/>;
        /// the caller drains the remainder. Requires
        /// <paramref name="elementCount"/> &gt;= 2 vectors' worth of chars.
        /// Writes inside the main loop are aligned; the two lead-in writes and the
        /// trailing half-vector write are 8-byte scalar stores.
        /// </summary>
        private static unsafe nuint NarrowUtf16ToAscii_Sse2(char *pUtf16Buffer, byte *pAsciiBuffer, nuint elementCount)
        {
            // This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
            // will be elided by JIT once we determine which specific ISAs we support.

            // JIT turns the below into constants

            uint  SizeOfVector128          = (uint)Unsafe.SizeOf <Vector128 <byte> >();
            nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);

            // This method is written such that control generally flows top-to-bottom, avoiding
            // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
            // data, we jump out of the hot paths to targets at the end of the method.

            Debug.Assert(Sse2.IsSupported);
            Debug.Assert(BitConverter.IsLittleEndian);
            Debug.Assert(elementCount >= 2 * SizeOfVector128);

            Vector128 <short> asciiMaskForPTEST   = Vector128.Create(unchecked ((short)0xFF80)); // used for PTEST on supported hardware
            Vector128 <short> asciiMaskForPXOR    = Vector128.Create(unchecked ((short)0x8000)); // used for PXOR
            Vector128 <short> asciiMaskForPCMPGTW = Vector128.Create(unchecked ((short)0x807F)); // used for PCMPGTW

            // First, perform an unaligned read of the first part of the input buffer.

            Vector128 <short> utf16VectorFirst = Sse2.LoadVector128((short *)pUtf16Buffer); // unaligned load

            // If there's non-ASCII data in the first 8 elements of the vector, there's nothing we can do.
            // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.

            if (Sse41.IsSupported)
            {
                if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
                {
                    return(0);
                }
            }
            else
            {
                if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
                {
                    return(0);
                }
            }

            // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.

            Vector128 <byte> asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);

            Sse2.StoreScalar((ulong *)pAsciiBuffer, asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED

            nuint currentOffsetInElements = SizeOfVector128 / 2;             // we processed 8 elements so far

            // We're going to get the best performance when we have aligned writes, so we'll take the
            // hit of potentially unaligned reads in order to hit this sweet spot.

            // pAsciiBuffer points to the start of the destination buffer, immediately before where we wrote
            // the 8 bytes previously. If the 0x08 bit is set at the pinned address, then the 8 bytes we wrote
            // previously mean that the 0x08 bit is *not* set at address &pAsciiBuffer[SizeOfVector128 / 2]. In
            // that case we can immediately back up to the previous aligned boundary and start the main loop.
            // If the 0x08 bit is *not* set at the pinned address, then it means the 0x08 bit *is* set at
            // address &pAsciiBuffer[SizeOfVector128 / 2], and we should perform one more 8-byte write to bump
            // just past the next aligned boundary address.

            if (0u >= ((uint)pAsciiBuffer & (SizeOfVector128 / 2)))
            {
                // We need to perform one more partial vector write before we can get the alignment we want.

                utf16VectorFirst = Sse2.LoadVector128((short *)pUtf16Buffer + currentOffsetInElements); // unaligned load

                // See comments earlier in this method for information about how this works.
                if (Sse41.IsSupported)
                {
                    if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
                    {
                        goto Finish;
                    }
                }
                else
                {
                    if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
                    {
                        goto Finish;
                    }
                }

                // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy it to the destination.
                asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
                Sse2.StoreScalar((ulong *)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED
            }

            // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment
            // point, then use that as the base offset going forward.

            currentOffsetInElements = SizeOfVector128 - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector128);
            Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= SizeOfVector128, "We wrote at least 1 byte but no more than a whole vector.");

            Debug.Assert(currentOffsetInElements <= elementCount, "Shouldn't have overrun the destination buffer.");
            Debug.Assert(elementCount - currentOffsetInElements >= SizeOfVector128, "We should be able to run at least one whole vector.");

            nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector128;

            do
            {
                // In a loop, perform two unaligned reads, narrow to a single vector, then aligned write one vector.

                utf16VectorFirst = Sse2.LoadVector128((short *)pUtf16Buffer + currentOffsetInElements);                                                      // unaligned load
                Vector128 <short> utf16VectorSecond = Sse2.LoadVector128((short *)pUtf16Buffer + currentOffsetInElements + SizeOfVector128 / sizeof(short)); // unaligned load
                Vector128 <short> combinedVector    = Sse2.Or(utf16VectorFirst, utf16VectorSecond);

                // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
                if (Sse41.IsSupported)
                {
                    if (!Sse41.TestZ(combinedVector, asciiMaskForPTEST))
                    {
                        goto FoundNonAsciiDataInLoop;
                    }
                }
                else
                {
                    if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(combinedVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
                    {
                        goto FoundNonAsciiDataInLoop;
                    }
                }

                // Build up the UTF-8 vector and perform the store.

                asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorSecond);

                Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % SizeOfVector128 == 0, "Write should be aligned.");
                Sse2.StoreAligned(pAsciiBuffer + currentOffsetInElements, asciiVector); // aligned

                currentOffsetInElements += SizeOfVector128;
            } while (currentOffsetInElements <= finalOffsetWhereCanRunLoop);

Finish:

            // There might be some ASCII data left over. That's fine - we'll let our caller handle the final drain.
            return(currentOffsetInElements);

FoundNonAsciiDataInLoop:

            // Can we at least narrow the high vector?
            // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
            if (Sse41.IsSupported)
            {
                if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
                {
                    goto Finish; // found non-ASCII data
                }
            }
            else
            {
                if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
                {
                    goto Finish; // found non-ASCII data
                }
            }

            // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector.
            asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);

            Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");

            Sse2.StoreScalar((ulong *)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is aligned
            currentOffsetInElements += SizeOfVector128 / 2;

            goto Finish;
        }