// Transposes a 4x4 matrix of 32-bit lanes held in four 128-bit registers
// (the classic _MM_TRANSPOSE4 pattern, done on byte-typed vectors).
private static void Transpose(ref Vector128<byte> x0, ref Vector128<byte> x1, ref Vector128<byte> x2, ref Vector128<byte> x3)
{
    Vector128<ulong> t0 = Sse2.UnpackHigh(x0.AsUInt32(), x1.AsUInt32()).AsUInt64();
    x0 = Sse2.UnpackLow(x0.AsUInt32(), x1.AsUInt32()).AsByte();
    Vector128<ulong> t1 = Sse2.UnpackLow(x2.AsUInt32(), x3.AsUInt32()).AsUInt64();
    x2 = Sse2.UnpackHigh(x2.AsUInt32(), x3.AsUInt32()).AsByte();

    x1 = Sse2.UnpackHigh(x0.AsUInt64(), t1).AsByte();
    x0 = Sse2.UnpackLow(x0.AsUInt64(), t1).AsByte();
    x3 = Sse2.UnpackHigh(t0, x2.AsUInt64()).AsByte();
    x2 = Sse2.UnpackLow(t0, x2.AsUInt64()).AsByte();
}
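// A minimal usage sketch (demo method is hypothetical): rows of a 4x4 uint
// matrix become columns after the call.
static void TransposeDemo()
{
    var r0 = Vector128.Create(0u, 1, 2, 3).AsByte();
    var r1 = Vector128.Create(4u, 5, 6, 7).AsByte();
    var r2 = Vector128.Create(8u, 9, 10, 11).AsByte();
    var r3 = Vector128.Create(12u, 13, 14, 15).AsByte();

    Transpose(ref r0, ref r1, ref r2, ref r3);

    // r0 == (0, 4, 8, 12), r1 == (1, 5, 9, 13),
    // r2 == (2, 6, 10, 14), r3 == (3, 7, 11, 15)
}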
// GHASH-style multiply in GF(2^128): _buffer = (_buffer ^ x) * _key.
private unsafe void GFMul(ReadOnlySpan<byte> x)
{
    var a = _key.AsUInt64();
    Vector128<ulong> b;

    fixed (byte* p = x)
    {
        var t = Sse2.LoadVector128(p);
        b = t.ReverseEndianness128().AsUInt64();
    }

    b = Sse2.Xor(b.AsByte(), _buffer).AsUInt64();

    // 128x128 -> 256-bit carry-less multiply built from four 64x64 products.
    var tmp3 = Pclmulqdq.CarrylessMultiply(a, b, 0x00).AsUInt32();
    var tmp4 = Pclmulqdq.CarrylessMultiply(a, b, 0x10).AsUInt32();
    var tmp5 = Pclmulqdq.CarrylessMultiply(a, b, 0x01).AsUInt32();
    var tmp6 = Pclmulqdq.CarrylessMultiply(a, b, 0x11).AsUInt32();

    tmp4 = Sse2.Xor(tmp4, tmp5);
    tmp5 = Sse2.ShiftLeftLogical128BitLane(tmp4, 8);
    tmp4 = Sse2.ShiftRightLogical128BitLane(tmp4, 8);
    tmp3 = Sse2.Xor(tmp3, tmp5);
    tmp6 = Sse2.Xor(tmp6, tmp4);

    // Shift the 256-bit product left by one bit (bit-reflection adjustment).
    var tmp7 = Sse2.ShiftRightLogical(tmp3, 31);
    var tmp8 = Sse2.ShiftRightLogical(tmp6, 31);
    tmp3 = Sse2.ShiftLeftLogical(tmp3, 1);
    tmp6 = Sse2.ShiftLeftLogical(tmp6, 1);
    var tmp9 = Sse2.ShiftRightLogical128BitLane(tmp7, 12);
    tmp8 = Sse2.ShiftLeftLogical128BitLane(tmp8, 4);
    tmp7 = Sse2.ShiftLeftLogical128BitLane(tmp7, 4);
    tmp3 = Sse2.Or(tmp3, tmp7);
    tmp6 = Sse2.Or(tmp6, tmp8);
    tmp6 = Sse2.Or(tmp6, tmp9);

    // Reduce modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1.
    tmp7 = Sse2.ShiftLeftLogical(tmp3, 31);
    tmp8 = Sse2.ShiftLeftLogical(tmp3, 30);
    tmp9 = Sse2.ShiftLeftLogical(tmp3, 25);
    tmp7 = Sse2.Xor(tmp7, tmp8);
    tmp7 = Sse2.Xor(tmp7, tmp9);
    tmp8 = Sse2.ShiftRightLogical128BitLane(tmp7, 4);
    tmp7 = Sse2.ShiftLeftLogical128BitLane(tmp7, 12);
    tmp3 = Sse2.Xor(tmp3, tmp7);

    var tmp2 = Sse2.ShiftRightLogical(tmp3, 1);
    tmp4 = Sse2.ShiftRightLogical(tmp3, 2);
    tmp5 = Sse2.ShiftRightLogical(tmp3, 7);
    tmp2 = Sse2.Xor(tmp2, tmp4);
    tmp2 = Sse2.Xor(tmp2, tmp5);
    tmp2 = Sse2.Xor(tmp2, tmp8);
    tmp3 = Sse2.Xor(tmp3, tmp2);
    tmp6 = Sse2.Xor(tmp6, tmp3);

    _buffer = tmp6.AsByte();
}
// Builds the vector (a, 0, b, 0) without materializing all four elements.
public static Vector128<uint> CreateTwoUInt(uint a, uint b)
{
    if (Sse2.IsSupported)
    {
        // CreateScalarUnsafe formally leaves the upper lanes unspecified, but
        // on x86 it lowers to movd, which zero-extends, so the unpack below
        // produces (a, 0, b, 0) and matches the software fallback.
        Vector128<uint> t1 = Vector128.CreateScalarUnsafe(a);
        Vector128<uint> t2 = Vector128.CreateScalarUnsafe(b);
        return Sse2.UnpackLow(t1.AsUInt64(), t2.AsUInt64()).AsUInt32();
    }

    return Vector128.Create(a, 0, b, 0);
}
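// A minimal check sketch (demo method is hypothetical): both the SSE2 path
// and the fallback should agree on the layout.
static void CreateTwoUIntDemo()
{
    Vector128<uint> v = CreateTwoUInt(0x11111111u, 0x22222222u);

    Debug.Assert(v.GetElement(0) == 0x11111111u && v.GetElement(2) == 0x22222222u);
    Debug.Assert(v.GetElement(1) == 0 && v.GetElement(3) == 0);
}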
private static ulong GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bitMask128)
{
    if (!AdvSimd.Arm64.IsSupported || !BitConverter.IsLittleEndian)
    {
        throw new PlatformNotSupportedException();
    }

    // Broadcast each byte's most significant bit across the byte, keep only
    // the per-lane marker bits, then pairwise-add to collapse the 128-bit
    // mask into a 64-bit scalar.
    Vector128<byte> mostSignificantBitIsSet = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7).AsByte();
    Vector128<byte> extractedBits = AdvSimd.And(mostSignificantBitIsSet, bitMask128);
    extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits);

    return extractedBits.AsUInt64().ToScalar();
}
public static bool ContainsNonAsciiByte(Vector128<sbyte> value)
{
    if (!AdvSimd.Arm64.IsSupported)
    {
        throw new PlatformNotSupportedException();
    }

    // Most significant bit of each byte within a 64-bit lane.
    const ulong MostSignificantBitMask = 0x8080808080808080;

    // Signed min-pairwise folds 16 bytes into the low 8; any byte >= 0x80 is
    // negative as sbyte and therefore survives the min into the low half.
    value = AdvSimd.Arm64.MinPairwise(value, value);
    return (value.AsUInt64().ToScalar() & MostSignificantBitMask) != 0;
}
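// A small usage sketch (demo method is hypothetical; requires ARM64): pure
// ASCII yields false, a single high byte flips the result.
static void ContainsNonAsciiByteDemo()
{
    Vector128<sbyte> ascii = Vector128.Create((sbyte)'A');
    Vector128<sbyte> mixed = ascii.WithElement(9, unchecked((sbyte)0xC3));

    Debug.Assert(!ContainsNonAsciiByte(ascii));
    Debug.Assert(ContainsNonAsciiByte(mixed));
}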
private static unsafe void VariantTwoShuffleAdd(byte* basePtr, int offset, Vector128<byte> _b1, Vector128<byte> _b, Vector128<byte> _a)
{
    Vector128<ulong> chunk1 = Sse2.LoadVector128((ulong*)(basePtr + (offset ^ 0x10)));
    Vector128<ulong> chunk2 = Sse2.LoadVector128((ulong*)(basePtr + (offset ^ 0x20)));
    Vector128<ulong> chunk3 = Sse2.LoadVector128((ulong*)(basePtr + (offset ^ 0x30)));

    Sse2.Store((ulong*)(basePtr + (offset ^ 0x10)), Sse2.Add(chunk3, _b1.AsUInt64()));
    Sse2.Store((ulong*)(basePtr + (offset ^ 0x20)), Sse2.Add(chunk1, _b.AsUInt64()));
    Sse2.Store((ulong*)(basePtr + (offset ^ 0x30)), Sse2.Add(chunk2, _a.AsUInt64()));
}
public static Vector128<T> Vector128Add<T>(Vector128<T> left, Vector128<T> right) where T : struct
{
    if (typeof(T) == typeof(byte))
    {
        return Sse2.Add(left.AsByte(), right.AsByte()).As<byte, T>();
    }
    else if (typeof(T) == typeof(sbyte))
    {
        return Sse2.Add(left.AsSByte(), right.AsSByte()).As<sbyte, T>();
    }
    else if (typeof(T) == typeof(short))
    {
        return Sse2.Add(left.AsInt16(), right.AsInt16()).As<short, T>();
    }
    else if (typeof(T) == typeof(ushort))
    {
        return Sse2.Add(left.AsUInt16(), right.AsUInt16()).As<ushort, T>();
    }
    else if (typeof(T) == typeof(int))
    {
        return Sse2.Add(left.AsInt32(), right.AsInt32()).As<int, T>();
    }
    else if (typeof(T) == typeof(uint))
    {
        return Sse2.Add(left.AsUInt32(), right.AsUInt32()).As<uint, T>();
    }
    else if (typeof(T) == typeof(long))
    {
        return Sse2.Add(left.AsInt64(), right.AsInt64()).As<long, T>();
    }
    else if (typeof(T) == typeof(ulong))
    {
        return Sse2.Add(left.AsUInt64(), right.AsUInt64()).As<ulong, T>();
    }
    else if (typeof(T) == typeof(float))
    {
        return Sse.Add(left.AsSingle(), right.AsSingle()).As<float, T>();
    }
    else if (typeof(T) == typeof(double))
    {
        return Sse2.Add(left.AsDouble(), right.AsDouble()).As<double, T>();
    }
    else
    {
        throw new NotSupportedException();
    }
}
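// A usage sketch (demo method is hypothetical): the typeof checks are
// JIT-time constants, so each instantiation compiles down to a single
// element-typed add instruction.
static void Vector128AddDemo()
{
    Vector128<int> a = Vector128.Create(1, 2, 3, 4);
    Vector128<int> b = Vector128.Create(10, 20, 30, 40);
    Vector128<int> sum = Vector128Add(a, b); // (11, 22, 33, 44)
}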
// Folds the four running 128-bit CRC accumulators forward by 512 bits using
// the CLMUL folding constants.
static void Fold4(ref Vector128<uint> xmmCRC0, ref Vector128<uint> xmmCRC1, ref Vector128<uint> xmmCRC2, ref Vector128<uint> xmmCRC3)
{
    Vector128<uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);

    Vector128<uint> xTmp0 = xmmCRC0;
    Vector128<uint> xTmp1 = xmmCRC1;
    Vector128<uint> xTmp2 = xmmCRC2;
    Vector128<uint> xTmp3 = xmmCRC3;

    xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
    xTmp0 = Pclmulqdq.CarrylessMultiply(xTmp0.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
    Vector128<float> psCRC0 = xmmCRC0.AsSingle();
    Vector128<float> psT0 = xTmp0.AsSingle();
    Vector128<float> psRes0 = Sse.Xor(psCRC0, psT0);

    xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
    xTmp1 = Pclmulqdq.CarrylessMultiply(xTmp1.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
    Vector128<float> psCRC1 = xmmCRC1.AsSingle();
    Vector128<float> psT1 = xTmp1.AsSingle();
    Vector128<float> psRes1 = Sse.Xor(psCRC1, psT1);

    xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
    xTmp2 = Pclmulqdq.CarrylessMultiply(xTmp2.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
    Vector128<float> psCRC2 = xmmCRC2.AsSingle();
    Vector128<float> psT2 = xTmp2.AsSingle();
    Vector128<float> psRes2 = Sse.Xor(psCRC2, psT2);

    xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
    xTmp3 = Pclmulqdq.CarrylessMultiply(xTmp3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
    Vector128<float> psCRC3 = xmmCRC3.AsSingle();
    Vector128<float> psT3 = xTmp3.AsSingle();
    Vector128<float> psRes3 = Sse.Xor(psCRC3, psT3);

    xmmCRC0 = psRes0.AsUInt32();
    xmmCRC1 = psRes1.AsUInt32();
    xmmCRC2 = psRes2.AsUInt32();
    xmmCRC3 = psRes3.AsUInt32();
}
public static int GetIndexOfFirstNonAsciiByte(Vector128<byte> value)
{
    if (!AdvSimd.Arm64.IsSupported || !BitConverter.IsLittleEndian)
    {
        throw new PlatformNotSupportedException();
    }

    // extractedBits[i] = (value[i] >> 7) & (1 << (4 * (i % 2)));
    // even bytes map to bit 0, odd bytes to bit 4, so after the pairwise add
    // below, input byte i maps to bit 4*i of the scalar mask.
    Vector128<byte> mostSignificantBitIsSet = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7).AsByte();
    Vector128<byte> extractedBits = AdvSimd.And(mostSignificantBitIsSet, s_bitmask);

    // collapse the mask into the lower 64 bits
    extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits);
    ulong mask = extractedBits.AsUInt64().ToScalar();

    // calculate the index (4 mask bits per input byte)
    int index = BitOperations.TrailingZeroCount(mask) >> 2;
    Debug.Assert((mask != 0) ? index < 16 : index >= 16);
    return index;
}
private static unsafe void VariantTwoIntegerMath(ulong* b, ulong* ptr, ref ulong divisionResult, ref ulong sqrtResult)
{
    b[0] ^= divisionResult ^ (sqrtResult << 32);

    ulong dividend = ptr[1];
    uint divisor = (uint)((ptr[0] + (uint)(sqrtResult << 1)) | 0x80000001UL);
    divisionResult = ((uint)(dividend / divisor)) + ((ulong)(dividend % divisor) << 32);

    ulong sqrtInput = ptr[0] + divisionResult;

    // Integer square root via double-precision sqrt: bias the exponent, take
    // the scalar sqrt, strip the bias, then correct rounding in the fixup.
    Vector128<ulong> expDoubleBias = Vector128.Create(1023UL << 52, 0);
    Vector128<double> x = Sse2.Add(Sse2.X64.ConvertScalarToVector128UInt64(sqrtInput >> 12), expDoubleBias).AsDouble();
    x = Sse2.SqrtScalar(Vector128.Create(0).AsDouble(), x);
    sqrtResult = Sse2.X64.ConvertToUInt64(Sse2.Subtract(x.AsUInt64(), expDoubleBias)) >> 19;

    VariantTwoSqrtFixup(ref sqrtResult, sqrtInput);
}
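// VariantTwoSqrtFixup is not shown in this excerpt. For reference, a sketch of
// the rounding fixup used by the reference CryptoNight variant-2 code
// (assumed, not taken from this source): it compensates for the double sqrt
// rounding either up or down relative to the exact integer square root.
private static void VariantTwoSqrtFixup(ref ulong sqrtResult, ulong sqrtInput)
{
    ulong s = sqrtResult >> 1;
    ulong b = sqrtResult & 1;
    ulong r2 = s * (s + b) + (sqrtResult << 32);

    if (r2 + b > sqrtInput) sqrtResult--;               // sqrt rounded up
    if (r2 + (1UL << 32) + s < sqrtInput) sqrtResult++; // sqrt rounded down
}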
internal static uint Step(byte[] src, long len, uint initialCRC)
{
    Vector128<uint> xmmT0, xmmT1, xmmT2;
    Vector128<uint> xmmInitial = Sse2.ConvertScalarToVector128UInt32(initialCRC);
    Vector128<uint> xmmCRC0 = Sse2.ConvertScalarToVector128UInt32(0x9db42487);
    Vector128<uint> xmmCRC1 = Vector128<uint>.Zero;
    Vector128<uint> xmmCRC2 = Vector128<uint>.Zero;
    Vector128<uint> xmmCRC3 = Vector128<uint>.Zero;
    int bufPos = 0;
    bool first = true;

    /* Masks used by the final 512-to-32 fold (declared up front in the
     * original C implementation for ISO-C90 compatibility). */
    Vector128<uint> xmmMask = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000);
    Vector128<uint> xmmMask2 = Vector128.Create(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);

    while ((len -= 64) >= 0)
    {
        xmmT0 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
            BitConverter.ToUInt32(src, bufPos + 8), BitConverter.ToUInt32(src, bufPos + 12));
        bufPos += 16;
        xmmT1 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
            BitConverter.ToUInt32(src, bufPos + 8), BitConverter.ToUInt32(src, bufPos + 12));
        bufPos += 16;
        xmmT2 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
            BitConverter.ToUInt32(src, bufPos + 8), BitConverter.ToUInt32(src, bufPos + 12));
        bufPos += 16;
        Vector128<uint> xmmT3 = Vector128.Create(BitConverter.ToUInt32(src, bufPos), BitConverter.ToUInt32(src, bufPos + 4),
            BitConverter.ToUInt32(src, bufPos + 8), BitConverter.ToUInt32(src, bufPos + 12));
        bufPos += 16;

        if (first)
        {
            first = false;
            xmmT0 = Sse2.Xor(xmmT0, xmmInitial);
        }

        Fold4(ref xmmCRC0, ref xmmCRC1, ref xmmCRC2, ref xmmCRC3);

        xmmCRC0 = Sse2.Xor(xmmCRC0, xmmT0);
        xmmCRC1 = Sse2.Xor(xmmCRC1, xmmT1);
        xmmCRC2 = Sse2.Xor(xmmCRC2, xmmT2);
        xmmCRC3 = Sse2.Xor(xmmCRC3, xmmT3);
    }

    /* fold 512 to 32 */

    /* k1 */
    Vector128<uint> crcFold = Vector128.Create(_crcK[0], _crcK[1], _crcK[2], _crcK[3]);

    Vector128<uint> xTmp0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
    xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
    xmmCRC1 = Sse2.Xor(xmmCRC1, xTmp0);
    xmmCRC1 = Sse2.Xor(xmmCRC1, xmmCRC0);

    Vector128<uint> xTmp1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
    xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
    xmmCRC2 = Sse2.Xor(xmmCRC2, xTmp1);
    xmmCRC2 = Sse2.Xor(xmmCRC2, xmmCRC1);

    Vector128<uint> xTmp2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
    xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), crcFold.AsUInt64(), 0x01).AsUInt32();
    xmmCRC3 = Sse2.Xor(xmmCRC3, xTmp2);
    xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);

    /* k5 */
    crcFold = Vector128.Create(_crcK[4], _crcK[5], _crcK[6], _crcK[7]);

    xmmCRC0 = xmmCRC3;
    xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0).AsUInt32();
    xmmCRC0 = Sse2.ShiftRightLogical128BitLane(xmmCRC0, 8);
    xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC0);

    xmmCRC0 = xmmCRC3;
    xmmCRC3 = Sse2.ShiftLeftLogical128BitLane(xmmCRC3, 4);
    xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
    xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC0);
    xmmCRC3 = Sse2.And(xmmCRC3, xmmMask2);

    /* k7 */
    xmmCRC1 = xmmCRC3;
    xmmCRC2 = xmmCRC3;
    crcFold = Vector128.Create(_crcK[8], _crcK[9], _crcK[10], _crcK[11]);

    xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0).AsUInt32();
    xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);
    xmmCRC3 = Sse2.And(xmmCRC3, xmmMask);

    xmmCRC2 = xmmCRC3;
    xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), crcFold.AsUInt64(), 0x10).AsUInt32();
    xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC2);
    xmmCRC3 = Sse2.Xor(xmmCRC3, xmmCRC1);

    /*
     * Could just as well write xmm_crc3[2], doing a movaps and truncating, but
     * there is no real advantage - it's a tiny bit slower per call, while no
     * additional CPUs would be supported by only requiring SSSE3 and CLMUL
     * instead of SSE4.1 + CLMUL.
     */
    return ~Sse41.Extract(xmmCRC3, 2);
}
public static Vector128<T> Not_Software<T>(Vector128<T> vector) where T : struct =>
    Not_Software(vector.AsUInt64()).As<ulong, T>();
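// The ulong-typed core is not shown in this excerpt; a minimal sketch of what
// a software bitwise complement over the two 64-bit lanes presumably looks like:
public static Vector128<ulong> Not_Software(Vector128<ulong> vector) =>
    Vector128.Create(~vector.GetElement(0), ~vector.GetElement(1));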
private static unsafe void WriteNv12(ResourceManager rm, Surface input, ref OutputSurfaceConfig config, ref PlaneOffsets offsets)
{
    int gobBlocksInY = 1 << config.OutBlkHeight;
    bool outLinear = config.OutBlkKind == 0;

    int width = Math.Min(config.OutLumaWidth + 1, input.Width);
    int height = Math.Min(config.OutLumaHeight + 1, input.Height);
    int yStride = GetPitch(config.OutLumaWidth + 1, 1);

    int dstYIndex = rm.BufferPool.Rent((config.OutLumaHeight + 1) * yStride, out Span<byte> dstY);

    if (Sse41.IsSupported)
    {
        // Keeps only the low 16 bits of each 64-bit group, i.e. the R channel
        // of each pixel, which feeds the luma plane.
        Vector128<ushort> mask = Vector128.Create(0xffffUL).AsUInt16();

        int widthTrunc = width & ~0xf;
        int strideGap = yStride - width;

        fixed (Pixel* srcPtr = input.Data)
        {
            Pixel* ip = srcPtr;

            fixed (byte* dstPtr = dstY)
            {
                byte* op = dstPtr;

                for (int y = 0; y < height; y++, ip += input.Width)
                {
                    int x = 0;

                    // 16 pixels per iteration: 8 loads of 2 pixels each,
                    // packed down to 16 bytes of luma.
                    for (; x < widthTrunc; x += 16)
                    {
                        byte* baseOffset = (byte*)(ip + (ulong)(uint)x);

                        Vector128<ushort> pixelp1 = Sse2.LoadVector128((ushort*)baseOffset);
                        Vector128<ushort> pixelp2 = Sse2.LoadVector128((ushort*)(baseOffset + 0x10));
                        Vector128<ushort> pixelp3 = Sse2.LoadVector128((ushort*)(baseOffset + 0x20));
                        Vector128<ushort> pixelp4 = Sse2.LoadVector128((ushort*)(baseOffset + 0x30));
                        Vector128<ushort> pixelp5 = Sse2.LoadVector128((ushort*)(baseOffset + 0x40));
                        Vector128<ushort> pixelp6 = Sse2.LoadVector128((ushort*)(baseOffset + 0x50));
                        Vector128<ushort> pixelp7 = Sse2.LoadVector128((ushort*)(baseOffset + 0x60));
                        Vector128<ushort> pixelp8 = Sse2.LoadVector128((ushort*)(baseOffset + 0x70));

                        pixelp1 = Sse2.And(pixelp1, mask);
                        pixelp2 = Sse2.And(pixelp2, mask);
                        pixelp3 = Sse2.And(pixelp3, mask);
                        pixelp4 = Sse2.And(pixelp4, mask);
                        pixelp5 = Sse2.And(pixelp5, mask);
                        pixelp6 = Sse2.And(pixelp6, mask);
                        pixelp7 = Sse2.And(pixelp7, mask);
                        pixelp8 = Sse2.And(pixelp8, mask);

                        Vector128<ushort> pixelq1 = Sse41.PackUnsignedSaturate(pixelp1.AsInt32(), pixelp2.AsInt32());
                        Vector128<ushort> pixelq2 = Sse41.PackUnsignedSaturate(pixelp3.AsInt32(), pixelp4.AsInt32());
                        Vector128<ushort> pixelq3 = Sse41.PackUnsignedSaturate(pixelp5.AsInt32(), pixelp6.AsInt32());
                        Vector128<ushort> pixelq4 = Sse41.PackUnsignedSaturate(pixelp7.AsInt32(), pixelp8.AsInt32());

                        pixelq1 = Sse41.PackUnsignedSaturate(pixelq1.AsInt32(), pixelq2.AsInt32());
                        pixelq2 = Sse41.PackUnsignedSaturate(pixelq3.AsInt32(), pixelq4.AsInt32());

                        // 10-bit to 8-bit downsample.
                        pixelq1 = Sse2.ShiftRightLogical(pixelq1, 2);
                        pixelq2 = Sse2.ShiftRightLogical(pixelq2, 2);

                        Vector128<byte> pixel = Sse2.PackUnsignedSaturate(pixelq1.AsInt16(), pixelq2.AsInt16());

                        Sse2.Store(op, pixel);

                        op += 0x10;
                    }

                    for (; x < width; x++)
                    {
                        Pixel* px = ip + (uint)x;

                        *op++ = Downsample(px->R);
                    }

                    op += strideGap;
                }
            }
        }
    }
    else
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                dstY[y * yStride + x] = Downsample(input.GetR(x, y));
            }
        }
    }

    WriteBuffer(
        rm,
        dstY,
        offsets.LumaOffset,
        outLinear,
        config.OutLumaWidth + 1,
        config.OutLumaHeight + 1,
        1,
        gobBlocksInY);

    rm.BufferPool.Return(dstYIndex);

    int uvWidth = Math.Min(config.OutChromaWidth + 1, (width + 1) >> 1);
    int uvHeight = Math.Min(config.OutChromaHeight + 1, (height + 1) >> 1);
    int uvStride = GetPitch(config.OutChromaWidth + 1, 2);

    int dstUvIndex = rm.BufferPool.Rent((config.OutChromaHeight + 1) * uvStride, out Span<byte> dstUv);

    if (Sse2.IsSupported)
    {
        int widthTrunc = uvWidth & ~7;
        int strideGap = uvStride - uvWidth * 2;

        fixed (Pixel* srcPtr = input.Data)
        {
            Pixel* ip = srcPtr;

            fixed (byte* dstPtr = dstUv)
            {
                byte* op = dstPtr;

                // 4:2:0 chroma: skip every other source row and column; each
                // scalar load picks the G/B pair of one even pixel.
                for (int y = 0; y < uvHeight; y++, ip += input.Width * 2)
                {
                    int x = 0;

                    for (; x < widthTrunc; x += 8)
                    {
                        byte* baseOffset = (byte*)ip + (ulong)(uint)x * 16;

                        Vector128<uint> pixel1 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x02));
                        Vector128<uint> pixel2 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x12));
                        Vector128<uint> pixel3 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x22));
                        Vector128<uint> pixel4 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x32));
                        Vector128<uint> pixel5 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x42));
                        Vector128<uint> pixel6 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x52));
                        Vector128<uint> pixel7 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x62));
                        Vector128<uint> pixel8 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x72));

                        Vector128<uint> pixel12 = Sse2.UnpackLow(pixel1, pixel2);
                        Vector128<uint> pixel34 = Sse2.UnpackLow(pixel3, pixel4);
                        Vector128<uint> pixel56 = Sse2.UnpackLow(pixel5, pixel6);
                        Vector128<uint> pixel78 = Sse2.UnpackLow(pixel7, pixel8);

                        Vector128<ulong> pixel1234 = Sse2.UnpackLow(pixel12.AsUInt64(), pixel34.AsUInt64());
                        Vector128<ulong> pixel5678 = Sse2.UnpackLow(pixel56.AsUInt64(), pixel78.AsUInt64());

                        pixel1234 = Sse2.ShiftRightLogical(pixel1234, 2);
                        pixel5678 = Sse2.ShiftRightLogical(pixel5678, 2);

                        Vector128<byte> pixel = Sse2.PackUnsignedSaturate(pixel1234.AsInt16(), pixel5678.AsInt16());

                        Sse2.Store(op, pixel);

                        op += 0x10;
                    }

                    for (; x < uvWidth; x++)
                    {
                        Pixel* px = ip + (uint)(x << 1);

                        *op++ = Downsample(px->G);
                        *op++ = Downsample(px->B);
                    }

                    op += strideGap;
                }
            }
        }
    }
    else
    {
        for (int y = 0; y < uvHeight; y++)
        {
            for (int x = 0; x < uvWidth; x++)
            {
                int xx = x << 1;
                int yy = y << 1;

                int uvOffs = y * uvStride + xx;

                dstUv[uvOffs + 0] = Downsample(input.GetG(xx, yy));
                dstUv[uvOffs + 1] = Downsample(input.GetB(xx, yy));
            }
        }
    }

    WriteBuffer(
        rm,
        dstUv,
        offsets.ChromaUOffset,
        outLinear,
        config.OutChromaWidth + 1,
        config.OutChromaHeight + 1,
        2,
        gobBlocksInY);

    rm.BufferPool.Return(dstUvIndex);
}
public static Vector128<byte> Reduce4(
    Vector128<byte> h1, Vector128<byte> h2, Vector128<byte> h3, Vector128<byte> h4,
    Vector128<byte> x1, Vector128<byte> x2, Vector128<byte> x3, Vector128<byte> x4) =>
    Reduce4(
        h1.AsUInt64(), h2.AsUInt64(), h3.AsUInt64(), h4.AsUInt64(),
        x1.AsUInt64(), x2.AsUInt64(), x3.AsUInt64(), x4.AsUInt64()).AsByte();
internal static unsafe ulong GetSse(ReadOnlySpan<byte> buffer, ulong s1, ulong s2)
{
    uint len = (uint)buffer.Length;

    uint blocks = len / BLOCK_SIZE;
    len -= blocks * BLOCK_SIZE;

    Vector128<sbyte> tap1 = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
    Vector128<sbyte> tap2 = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    Vector128<byte> zero = Vector128<byte>.Zero;
    Vector128<short> onesShort = Vector128.Create(1, 1, 1, 1, 1, 1, 1, 1);
    Vector128<int> onesInt = Vector128.Create(1, 1, 1, 1);
    Vector128<byte> shuffleMask2301 = Vector128.Create((byte)4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11);
    Vector128<byte> shuffleMask1032 = Vector128.Create((byte)8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);
    Vector128<byte> shuffleMaskTrim = Vector128.Create(0, 1, 2, 3, 255, 255, 255, 255, 8, 9, 10, 11, 255, 255, 255, 255);

    // A B C D -> B A D C
    const int S2301 = 2 << 6 | 3 << 4 | 0 << 2 | 1;

    fixed (byte* bufPtr = &MemoryMarshal.GetReference(buffer))
    {
        var buf = bufPtr;

        while (blocks != 0)
        {
            uint n = NMAX64 / BLOCK_SIZE;
            if (n > blocks)
            {
                n = blocks;
            }

            blocks -= n;

            // Process n blocks of data. At most NMAX data bytes can be
            // processed before s2 must be reduced modulo BASE.
            Vector128<ulong> v_ps = Vector128.Create(0, s1 * n);
            Vector128<ulong> v_s2 = Vector128.Create(0, s2);
            Vector128<ulong> v_s1 = Vector128.Create(0ul, 0);

            do
            {
                // Load 32 input bytes.
                Vector128<byte> bytes1 = Sse2.LoadVector128(&buf[0]);
                Vector128<byte> bytes2 = Sse2.LoadVector128(&buf[16]);

                // Add the previous block byte sum to v_ps.
                v_ps = Sse2.Add(v_ps, v_s1);

                // Horizontally add the bytes for s1; multiply-add the bytes
                // by [ 32, 31, 30, ... ] for s2.
                Vector128<ushort> sad1 = Sse2.SumAbsoluteDifferences(bytes1, zero);
                v_s1 = Sse2.Add(v_s1, sad1.AsUInt64());
                Vector128<short> mad11 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                Vector128<int> mad12 = Sse2.MultiplyAddAdjacent(mad11, onesShort);
                var mad121 = Sse2.Add(mad12, Sse2.Shuffle(mad12, S2301));
                var madTrimmed1 = Ssse3.Shuffle(mad121.AsByte(), shuffleMaskTrim);
                var madTrimmed1ULong = madTrimmed1.AsUInt64();
                v_s2 = Sse2.Add(v_s2, madTrimmed1ULong);

                Vector128<ushort> sad2 = Sse2.SumAbsoluteDifferences(bytes2, zero);
                v_s1 = Sse2.Add(v_s1, sad2.AsUInt64());
                Vector128<short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                Vector128<int> mad22 = Sse2.MultiplyAddAdjacent(mad2, onesShort);
                var mad221 = Sse2.Add(mad22, Sse2.Shuffle(mad22, S2301));
                var madTrimmed2 = Ssse3.Shuffle(mad221.AsByte(), shuffleMaskTrim);
                var madTrimmed2ULong = madTrimmed2.AsUInt64();
                v_s2 = Sse2.Add(v_s2, madTrimmed2ULong);

                buf += BLOCK_SIZE;
                n--;
            }
            while (n != 0);

            var shifted = Sse2.ShiftLeftLogical(v_ps, 5);
            v_s2 = Sse2.Add(v_s2, shifted);

            s1 += v_s1.GetElement(0);
            s1 += v_s1.GetElement(1);
            s2 = v_s2.GetElement(0);
            s2 += v_s2.GetElement(1);

            s1 %= MOD64;
            s2 %= MOD64;
        }

        if (len > 0)
        {
            if (len >= 16)
            {
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                len -= 16;
            }

            while (len-- > 0)
            {
                s2 += (s1 += *buf++);
            }

            if (s1 >= MOD64)
            {
                s1 -= MOD64;
            }

            s2 %= MOD64;
        }

        return s1 | (s2 << 32);
    }
}
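// The constants referenced above are not shown in this excerpt. Based on the
// standard Adler-32 definition and the 32-byte inner loop, they are presumably
// along these lines (values assumed, not taken from this source):
private const uint BLOCK_SIZE = 32;  // bytes consumed per inner iteration
private const uint NMAX64 = 5552;    // max bytes before s2 can overflow 32 bits
private const ulong MOD64 = 65521;   // Adler-32 modulus, largest prime below 2^16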
public static Vector128<byte> Gfmul(Vector128<byte> a, Vector128<byte> b) =>
    Gfmul(a.AsUInt64(), b.AsUInt64()).AsByte();
// Adds lane-wise as two unsigned 64-bit quadwords (paddq), not byte-wise.
private static Vector128<byte> AddQ(Vector128<byte> r1, Vector128<byte> r2)
{
    return Sse2.Add(r1.AsUInt64(), r2.AsUInt64()).AsByte();
}
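// A quick sketch (demo method is hypothetical) of why AddQ differs from a
// byte-wise add: the addition carries across byte boundaries within each
// 64-bit lane.
static void AddQDemo()
{
    Vector128<byte> allOnes = Vector128.Create((byte)0xFF);      // both lanes 0xFFFF...FF
    Vector128<byte> one = Vector128.Create(1UL, 1UL).AsByte();   // +1 per 64-bit lane

    Vector128<byte> sum = AddQ(allOnes, one); // each 64-bit lane wraps to 0
}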
private static unsafe void Xxh3Accumulate512(Span<ulong> acc, ReadOnlySpan<byte> input, ReadOnlySpan<byte> secret)
{
    if (Avx2.IsSupported)
    {
        fixed (ulong* pAcc = acc)
        fixed (byte* pInput = input, pSecret = secret)
        {
            Vector256<ulong>* xAcc = (Vector256<ulong>*)pAcc;
            Vector256<byte>* xInput = (Vector256<byte>*)pInput;
            Vector256<byte>* xSecret = (Vector256<byte>*)pSecret;

            for (ulong i = 0; i < StripeLen / 32; i++)
            {
                Vector256<byte> dataVec = xInput[i];
                Vector256<byte> keyVec = xSecret[i];
                Vector256<byte> dataKey = Avx2.Xor(dataVec, keyVec);
                Vector256<uint> dataKeyLo = Avx2.Shuffle(dataKey.AsUInt32(), 0b00110001);
                Vector256<ulong> product = Avx2.Multiply(dataKey.AsUInt32(), dataKeyLo);
                Vector256<uint> dataSwap = Avx2.Shuffle(dataVec.AsUInt32(), 0b01001110);
                Vector256<ulong> sum = Avx2.Add(xAcc[i], dataSwap.AsUInt64());
                xAcc[i] = Avx2.Add(product, sum);
            }
        }
    }
    else if (Sse2.IsSupported)
    {
        fixed (ulong* pAcc = acc)
        fixed (byte* pInput = input, pSecret = secret)
        {
            Vector128<ulong>* xAcc = (Vector128<ulong>*)pAcc;
            Vector128<byte>* xInput = (Vector128<byte>*)pInput;
            Vector128<byte>* xSecret = (Vector128<byte>*)pSecret;

            for (ulong i = 0; i < StripeLen / 16; i++)
            {
                Vector128<byte> dataVec = xInput[i];
                Vector128<byte> keyVec = xSecret[i];
                Vector128<byte> dataKey = Sse2.Xor(dataVec, keyVec);
                Vector128<uint> dataKeyLo = Sse2.Shuffle(dataKey.AsUInt32(), 0b00110001);
                Vector128<ulong> product = Sse2.Multiply(dataKey.AsUInt32(), dataKeyLo);
                Vector128<uint> dataSwap = Sse2.Shuffle(dataVec.AsUInt32(), 0b01001110);
                Vector128<ulong> sum = Sse2.Add(xAcc[i], dataSwap.AsUInt64());
                xAcc[i] = Sse2.Add(product, sum);
            }
        }
    }
    else
    {
        for (int i = 0; i < AccNb; i++)
        {
            ulong dataVal = BinaryPrimitives.ReadUInt64LittleEndian(input.Slice(i * sizeof(ulong)));
            ulong dataKey = dataVal ^ BinaryPrimitives.ReadUInt64LittleEndian(secret.Slice(i * sizeof(ulong)));

            acc[i ^ 1] += dataVal;
            acc[i] += Mult32To64((uint)dataKey, dataKey >> 32);
        }
    }
}
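// Mult32To64 is not shown in this excerpt. Judging by the call site above, it
// presumably corresponds to XXH_mult32to64 from the xxHash reference: a full
// 32x32 -> 64-bit product of the low 32 bits of each operand (sketch, assumed):
private static ulong Mult32To64(uint a, ulong b) => a * (b & 0xFFFFFFFF);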
public static unsafe void ChaCha20(uint* x, byte* m, byte* c, ulong bytes)
{
    if (Avx2.IsSupported && bytes >= 512)
    {
        Vector256<uint> x_0 = Vector256.Create(x[0]);
        Vector256<uint> x_1 = Vector256.Create(x[1]);
        Vector256<uint> x_2 = Vector256.Create(x[2]);
        Vector256<uint> x_3 = Vector256.Create(x[3]);
        Vector256<uint> x_4 = Vector256.Create(x[4]);
        Vector256<uint> x_5 = Vector256.Create(x[5]);
        Vector256<uint> x_6 = Vector256.Create(x[6]);
        Vector256<uint> x_7 = Vector256.Create(x[7]);
        Vector256<uint> x_8 = Vector256.Create(x[8]);
        Vector256<uint> x_9 = Vector256.Create(x[9]);
        Vector256<uint> x_10 = Vector256.Create(x[10]);
        Vector256<uint> x_11 = Vector256.Create(x[11]);
        Vector256<uint> x_12;
        Vector256<uint> x_13;
        Vector256<uint> x_14 = Vector256.Create(x[14]);
        Vector256<uint> x_15 = Vector256.Create(x[15]);

        Vector256<uint> orig0 = x_0;
        Vector256<uint> orig1 = x_1;
        Vector256<uint> orig2 = x_2;
        Vector256<uint> orig3 = x_3;
        Vector256<uint> orig4 = x_4;
        Vector256<uint> orig5 = x_5;
        Vector256<uint> orig6 = x_6;
        Vector256<uint> orig7 = x_7;
        Vector256<uint> orig8 = x_8;
        Vector256<uint> orig9 = x_9;
        Vector256<uint> orig10 = x_10;
        Vector256<uint> orig11 = x_11;
        Vector256<uint> orig12;
        Vector256<uint> orig13;
        Vector256<uint> orig14 = x_14;
        Vector256<uint> orig15 = x_15;

        // Eight blocks per iteration; lanes hold consecutive counter values.
        while (bytes >= 512)
        {
            Vector256<uint> addv12 = Vector256.Create(0, 1, 2, 3).AsUInt32();
            Vector256<uint> addv13 = Vector256.Create(4, 5, 6, 7).AsUInt32();
            Vector256<uint> permute = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7).AsUInt32();
            Vector256<uint> t12, t13;

            x_0 = orig0;
            x_1 = orig1;
            x_2 = orig2;
            x_3 = orig3;
            x_4 = orig4;
            x_5 = orig5;
            x_6 = orig6;
            x_7 = orig7;
            x_8 = orig8;
            x_9 = orig9;
            x_10 = orig10;
            x_11 = orig11;
            x_14 = orig14;
            x_15 = orig15;

            uint in12 = x[12];
            uint in13 = x[13];
            ulong in1213 = in12 | ((ulong)in13 << 32);

            x_12 = x_13 = Avx2.BroadcastScalarToVector256(Sse2.X64.ConvertScalarToVector128UInt64(in1213)).AsUInt32();

            t12 = Avx2.Add(addv12.AsUInt64(), x_12.AsUInt64()).AsUInt32();
            t13 = Avx2.Add(addv13.AsUInt64(), x_13.AsUInt64()).AsUInt32();

            x_12 = Avx2.UnpackLow(t12, t13);
            x_13 = Avx2.UnpackHigh(t12, t13);

            t12 = Avx2.UnpackLow(x_12, x_13);
            t13 = Avx2.UnpackHigh(x_12, x_13);

            x_12 = Avx2.PermuteVar8x32(t12, permute);
            x_13 = Avx2.PermuteVar8x32(t13, permute);

            orig12 = x_12;
            orig13 = x_13;

            in1213 += 8;
            x[12] = (uint)(in1213 & 0xFFFFFFFF);
            x[13] = (uint)((in1213 >> 32) & 0xFFFFFFFF);

            for (int i = 0; i < 20; i += 2)
            {
                Vec256Round(ref x_0, ref x_4, ref x_8, ref x_12, ref x_1, ref x_5, ref x_9, ref x_13,
                    ref x_2, ref x_6, ref x_10, ref x_14, ref x_3, ref x_7, ref x_11, ref x_15);
                Vec256Round(ref x_0, ref x_5, ref x_10, ref x_15, ref x_1, ref x_6, ref x_11, ref x_12,
                    ref x_2, ref x_7, ref x_8, ref x_13, ref x_3, ref x_4, ref x_9, ref x_14);
            }

            Vector256<uint> t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12, t_13, t_14, t_15;
            t_0 = t_1 = t_2 = t_3 = t_4 = t_5 = t_6 = t_7 = t_8 = t_9 = t_10 = t_11 = t_12 = t_13 = t_14 = t_15 = Vector256.Create((uint)0);

            // ONEOCTO enter
            OneQuadUnpack(ref x_0, ref x_1, ref x_2, ref x_3, ref t_0, ref t_1, ref t_2, ref t_3, ref orig0, ref orig1, ref orig2, ref orig3);
            OneQuadUnpack(ref x_4, ref x_5, ref x_6, ref x_7, ref t_4, ref t_5, ref t_6, ref t_7, ref orig4, ref orig5, ref orig6, ref orig7);

            t_0 = Avx2.Permute2x128(x_0, x_4, 0x20);
            t_4 = Avx2.Permute2x128(x_0, x_4, 0x31);
            t_1 = Avx2.Permute2x128(x_1, x_5, 0x20);
            t_5 = Avx2.Permute2x128(x_1, x_5, 0x31);
            t_2 = Avx2.Permute2x128(x_2, x_6, 0x20);
            t_6 = Avx2.Permute2x128(x_2, x_6, 0x31);
            t_3 = Avx2.Permute2x128(x_3, x_7, 0x20);
            t_7 = Avx2.Permute2x128(x_3, x_7, 0x31);

            t_0 = Avx2.Xor(t_0, Avx.LoadVector256(m).AsUInt32());
            t_1 = Avx2.Xor(t_1, Avx.LoadVector256(m + 64).AsUInt32());
            t_2 = Avx2.Xor(t_2, Avx.LoadVector256(m + 128).AsUInt32());
            t_3 = Avx2.Xor(t_3, Avx.LoadVector256(m + 192).AsUInt32());
            t_4 = Avx2.Xor(t_4, Avx.LoadVector256(m + 256).AsUInt32());
            t_5 = Avx2.Xor(t_5, Avx.LoadVector256(m + 320).AsUInt32());
            t_6 = Avx2.Xor(t_6, Avx.LoadVector256(m + 384).AsUInt32());
            t_7 = Avx2.Xor(t_7, Avx.LoadVector256(m + 448).AsUInt32());

            Avx.Store(c, t_0.AsByte());
            Avx.Store(c + 64, t_1.AsByte());
            Avx.Store(c + 128, t_2.AsByte());
            Avx.Store(c + 192, t_3.AsByte());
            Avx.Store(c + 256, t_4.AsByte());
            Avx.Store(c + 320, t_5.AsByte());
            Avx.Store(c + 384, t_6.AsByte());
            Avx.Store(c + 448, t_7.AsByte());
            // ONEOCTO exit

            m += 32;
            c += 32;

            // ONEOCTO enter
            OneQuadUnpack(ref x_8, ref x_9, ref x_10, ref x_11, ref t_8, ref t_9, ref t_10, ref t_11, ref orig8, ref orig9, ref orig10, ref orig11);
            OneQuadUnpack(ref x_12, ref x_13, ref x_14, ref x_15, ref t_12, ref t_13, ref t_14, ref t_15, ref orig12, ref orig13, ref orig14, ref orig15);

            t_8 = Avx2.Permute2x128(x_8, x_12, 0x20);
            t_12 = Avx2.Permute2x128(x_8, x_12, 0x31);
            t_9 = Avx2.Permute2x128(x_9, x_13, 0x20);
            t_13 = Avx2.Permute2x128(x_9, x_13, 0x31);
            t_10 = Avx2.Permute2x128(x_10, x_14, 0x20);
            t_14 = Avx2.Permute2x128(x_10, x_14, 0x31);
            t_11 = Avx2.Permute2x128(x_11, x_15, 0x20);
            t_15 = Avx2.Permute2x128(x_11, x_15, 0x31);

            t_8 = Avx2.Xor(t_8, Avx.LoadVector256(m).AsUInt32());
            t_9 = Avx2.Xor(t_9, Avx.LoadVector256(m + 64).AsUInt32());
            t_10 = Avx2.Xor(t_10, Avx.LoadVector256(m + 128).AsUInt32());
            t_11 = Avx2.Xor(t_11, Avx.LoadVector256(m + 192).AsUInt32());
            t_12 = Avx2.Xor(t_12, Avx.LoadVector256(m + 256).AsUInt32());
            t_13 = Avx2.Xor(t_13, Avx.LoadVector256(m + 320).AsUInt32());
            t_14 = Avx2.Xor(t_14, Avx.LoadVector256(m + 384).AsUInt32());
            t_15 = Avx2.Xor(t_15, Avx.LoadVector256(m + 448).AsUInt32());

            Avx.Store(c, t_8.AsByte());
            Avx.Store(c + 64, t_9.AsByte());
            Avx.Store(c + 128, t_10.AsByte());
            Avx.Store(c + 192, t_11.AsByte());
            Avx.Store(c + 256, t_12.AsByte());
            Avx.Store(c + 320, t_13.AsByte());
            Avx.Store(c + 384, t_14.AsByte());
            Avx.Store(c + 448, t_15.AsByte());
            // ONEOCTO exit

            m -= 32;
            c -= 32;

            bytes -= 512;
            c += 512;
            m += 512;
        }
    }

    if (bytes >= 256)
    {
        Vector128<uint> x_0 = Vector128.Create(x[0]);
        Vector128<uint> x_1 = Vector128.Create(x[1]);
        Vector128<uint> x_2 = Vector128.Create(x[2]);
        Vector128<uint> x_3 = Vector128.Create(x[3]);
        Vector128<uint> x_4 = Vector128.Create(x[4]);
        Vector128<uint> x_5 = Vector128.Create(x[5]);
        Vector128<uint> x_6 = Vector128.Create(x[6]);
        Vector128<uint> x_7 = Vector128.Create(x[7]);
        Vector128<uint> x_8 = Vector128.Create(x[8]);
        Vector128<uint> x_9 = Vector128.Create(x[9]);
        Vector128<uint> x_10 = Vector128.Create(x[10]);
        Vector128<uint> x_11 = Vector128.Create(x[11]);
        Vector128<uint> x_12;
        Vector128<uint> x_13;
        Vector128<uint> x_14 = Vector128.Create(x[14]);
        Vector128<uint> x_15 = Vector128.Create(x[15]);

        Vector128<uint> orig0 = x_0;
        Vector128<uint> orig1 = x_1;
        Vector128<uint> orig2 = x_2;
        Vector128<uint> orig3 = x_3;
        Vector128<uint> orig4 = x_4;
        Vector128<uint> orig5 = x_5;
        Vector128<uint> orig6 = x_6;
        Vector128<uint> orig7 = x_7;
        Vector128<uint> orig8 = x_8;
        Vector128<uint> orig9 = x_9;
        Vector128<uint> orig10 = x_10;
        Vector128<uint> orig11 = x_11;
        Vector128<uint> orig12;
        Vector128<uint> orig13;
        Vector128<uint> orig14 = x_14;
        Vector128<uint> orig15 = x_15;
        Vector128<uint> t12, t13;

        // Four blocks per iteration.
        while (bytes >= 256)
        {
            Vector128<uint> addv12 = Vector128.Create(0, 1).AsUInt32();
            Vector128<uint> addv13 = Vector128.Create(2, 3).AsUInt32();

            x_0 = orig0;
            x_1 = orig1;
            x_2 = orig2;
            x_3 = orig3;
            x_4 = orig4;
            x_5 = orig5;
            x_6 = orig6;
            x_7 = orig7;
            x_8 = orig8;
            x_9 = orig9;
            x_10 = orig10;
            x_11 = orig11;
            x_14 = orig14;
            x_15 = orig15;

            uint in12 = x[12];
            uint in13 = x[13];
            ulong in1213 = in12 | ((ulong)in13 << 32);

            t12 = Vector128.Create(in1213).AsUInt32();
            t13 = Vector128.Create(in1213).AsUInt32();

            x_12 = Sse2.Add(addv12.AsUInt64(), t12.AsUInt64()).AsUInt32();
            x_13 = Sse2.Add(addv13.AsUInt64(), t13.AsUInt64()).AsUInt32();

            t12 = Sse2.UnpackLow(x_12, x_13);
            t13 = Sse2.UnpackHigh(x_12, x_13);

            x_12 = Sse2.UnpackLow(t12, t13);
            x_13 = Sse2.UnpackHigh(t12, t13);

            orig12 = x_12;
            orig13 = x_13;

            in1213 += 4;
            x[12] = (uint)(in1213 & 0xFFFFFFFF);
            x[13] = (uint)((in1213 >> 32) & 0xFFFFFFFF);

            for (int i = 0; i < 20; i += 2)
            {
                Vec128QuarterRound(ref x_0, ref x_4, ref x_8, ref x_12);
                Vec128QuarterRound(ref x_1, ref x_5, ref x_9, ref x_13);
                Vec128QuarterRound(ref x_2, ref x_6, ref x_10, ref x_14);
                Vec128QuarterRound(ref x_3, ref x_7, ref x_11, ref x_15);
                Vec128QuarterRound(ref x_0, ref x_5, ref x_10, ref x_15);
                Vec128QuarterRound(ref x_1, ref x_6, ref x_11, ref x_12);
                Vec128QuarterRound(ref x_2, ref x_7, ref x_8, ref x_13);
                Vec128QuarterRound(ref x_3, ref x_4, ref x_9, ref x_14);
            }

            OneQuad(ref x_0, ref x_1, ref x_2, ref x_3, ref orig0, ref orig1, ref orig2, ref orig3, m, c);
            m += 16;
            c += 16;
            OneQuad(ref x_4, ref x_5, ref x_6, ref x_7, ref orig4, ref orig5, ref orig6, ref orig7, m, c);
            m += 16;
            c += 16;
            OneQuad(ref x_8, ref x_9, ref x_10, ref x_11, ref orig8, ref orig9, ref orig10, ref orig11, m, c);
            m += 16;
            c += 16;
            OneQuad(ref x_12, ref x_13, ref x_14, ref x_15, ref orig12, ref orig13, ref orig14, ref orig15, m, c);
            m -= 48;
            c -= 48;

            bytes -= 256;
            c += 256;
            m += 256;
        }
    }

    // One block per iteration.
    while (bytes >= 64)
    {
        Vector128<uint> x_0 = Sse2.LoadVector128(x);
        Vector128<uint> x_1 = Sse2.LoadVector128(x + 4);
        Vector128<uint> x_2 = Sse2.LoadVector128(x + 8);
        Vector128<uint> x_3 = Sse2.LoadVector128(x + 12);
        Vector128<uint> t_1;

        for (int i = 0; i < 20; i += 2)
        {
            x_0 = Sse2.Add(x_0, x_1);
            x_3 = Sse2.Xor(x_3, x_0);
            x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32();
            x_2 = Sse2.Add(x_2, x_3);
            x_1 = Sse2.Xor(x_1, x_2);
            t_1 = x_1;
            x_1 = Sse2.ShiftLeftLogical(x_1, 12);
            t_1 = Sse2.ShiftRightLogical(t_1, 20);
            x_1 = Sse2.Xor(x_1, t_1);

            x_0 = Sse2.Add(x_0, x_1);
            x_3 = Sse2.Xor(x_3, x_0);
            x_0 = Sse2.Shuffle(x_0, 147);
            x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32();
            x_2 = Sse2.Add(x_2, x_3);
            x_3 = Sse2.Shuffle(x_3, 78);
            x_1 = Sse2.Xor(x_1, x_2);
            x_2 = Sse2.Shuffle(x_2, 57);
            t_1 = x_1;
            x_1 = Sse2.ShiftLeftLogical(x_1, 7);
            t_1 = Sse2.ShiftRightLogical(t_1, 25);
            x_1 = Sse2.Xor(x_1, t_1);

            x_0 = Sse2.Add(x_0, x_1);
            x_3 = Sse2.Xor(x_3, x_0);
            x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32();
            x_2 = Sse2.Add(x_2, x_3);
            x_1 = Sse2.Xor(x_1, x_2);
            t_1 = x_1;
            x_1 = Sse2.ShiftLeftLogical(x_1, 12);
            t_1 = Sse2.ShiftRightLogical(t_1, 20);
            x_1 = Sse2.Xor(x_1, t_1);

            x_0 = Sse2.Add(x_0, x_1);
            x_3 = Sse2.Xor(x_3, x_0);
            x_0 = Sse2.Shuffle(x_0, 57);
            x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32();
            x_2 = Sse2.Add(x_2, x_3);
            x_3 = Sse2.Shuffle(x_3, 78);
            x_1 = Sse2.Xor(x_1, x_2);
            x_2 = Sse2.Shuffle(x_2, 147);
            t_1 = x_1;
            x_1 = Sse2.ShiftLeftLogical(x_1, 7);
            t_1 = Sse2.ShiftRightLogical(t_1, 25);
            x_1 = Sse2.Xor(x_1, t_1);
        }

        x_0 = Sse2.Add(x_0, Sse2.LoadVector128(x));
        x_1 = Sse2.Add(x_1, Sse2.LoadVector128(x + 4));
        x_2 = Sse2.Add(x_2, Sse2.LoadVector128(x + 8));
        x_3 = Sse2.Add(x_3, Sse2.LoadVector128(x + 12));

        x_0 = Sse2.Xor(x_0.AsByte(), Sse2.LoadVector128(m)).AsUInt32();
        x_1 = Sse2.Xor(x_1.AsByte(), Sse2.LoadVector128(m + 16)).AsUInt32();
        x_2 = Sse2.Xor(x_2.AsByte(), Sse2.LoadVector128(m + 32)).AsUInt32();
        x_3 = Sse2.Xor(x_3.AsByte(), Sse2.LoadVector128(m + 48)).AsUInt32();

        Sse2.Store(c, x_0.AsByte());
        Sse2.Store(c + 16, x_1.AsByte());
        Sse2.Store(c + 32, x_2.AsByte());
        Sse2.Store(c + 48, x_3.AsByte());

        uint in12 = x[12];
        uint in13 = x[13];
        in12++;
        if (in12 == 0)
        {
            in13++;
        }
        x[12] = in12;
        x[13] = in13;

        bytes -= 64;
        c += 64;
        m += 64;
    }

    // Partial final block: generate a full keystream block on the stack and
    // xor only the remaining bytes.
    if (bytes > 0)
    {
        Vector128<uint> x_0 = Sse2.LoadVector128(x);
        Vector128<uint> x_1 = Sse2.LoadVector128(x + 4);
        Vector128<uint> x_2 = Sse2.LoadVector128(x + 8);
        Vector128<uint> x_3 = Sse2.LoadVector128(x + 12);
        Vector128<uint> t_1;

        for (int i = 0; i < 20; i += 2)
        {
            x_0 = Sse2.Add(x_0, x_1);
            x_3 = Sse2.Xor(x_3, x_0);
            x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32();
            x_2 = Sse2.Add(x_2, x_3);
            x_1 = Sse2.Xor(x_1, x_2);
            t_1 = x_1;
            x_1 = Sse2.ShiftLeftLogical(x_1, 12);
            t_1 = Sse2.ShiftRightLogical(t_1, 20);
            x_1 = Sse2.Xor(x_1, t_1);

            x_0 = Sse2.Add(x_0, x_1);
            x_3 = Sse2.Xor(x_3, x_0);
            x_0 = Sse2.Shuffle(x_0, 0x93);
            x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32();
            x_2 = Sse2.Add(x_2, x_3);
            x_3 = Sse2.Shuffle(x_3, 0x4e);
            x_1 = Sse2.Xor(x_1, x_2);
            x_2 = Sse2.Shuffle(x_2, 0x39);
            t_1 = x_1;
            x_1 = Sse2.ShiftLeftLogical(x_1, 7);
            t_1 = Sse2.ShiftRightLogical(t_1, 25);
            x_1 = Sse2.Xor(x_1, t_1);

            x_0 = Sse2.Add(x_0, x_1);
            x_3 = Sse2.Xor(x_3, x_0);
            x_3 = Ssse3.Shuffle(x_3.AsByte(), rot16_128).AsUInt32();
            x_2 = Sse2.Add(x_2, x_3);
            x_1 = Sse2.Xor(x_1, x_2);
            t_1 = x_1;
            x_1 = Sse2.ShiftLeftLogical(x_1, 12);
            t_1 = Sse2.ShiftRightLogical(t_1, 20);
            x_1 = Sse2.Xor(x_1, t_1);

            x_0 = Sse2.Add(x_0, x_1);
            x_3 = Sse2.Xor(x_3, x_0);
            x_0 = Sse2.Shuffle(x_0, 0x39);
            x_3 = Ssse3.Shuffle(x_3.AsByte(), rot8_128).AsUInt32();
            x_2 = Sse2.Add(x_2, x_3);
            x_3 = Sse2.Shuffle(x_3, 0x4e);
            x_1 = Sse2.Xor(x_1, x_2);
            x_2 = Sse2.Shuffle(x_2, 0x93);
            t_1 = x_1;
            x_1 = Sse2.ShiftLeftLogical(x_1, 7);
            t_1 = Sse2.ShiftRightLogical(t_1, 25);
            x_1 = Sse2.Xor(x_1, t_1);
        }

        x_0 = Sse2.Add(x_0, Sse2.LoadVector128(x));
        x_1 = Sse2.Add(x_1, Sse2.LoadVector128(x + 4));
        x_2 = Sse2.Add(x_2, Sse2.LoadVector128(x + 8));
        x_3 = Sse2.Add(x_3, Sse2.LoadVector128(x + 12));

        byte* partialblock = stackalloc byte[64];
        Sse2.Store(partialblock, x_0.AsByte());
        Sse2.Store(partialblock + 16, x_1.AsByte());
        Sse2.Store(partialblock + 32, x_2.AsByte());
        Sse2.Store(partialblock + 48, x_3.AsByte());

        for (ulong i = 0; i < bytes; i++)
        {
            c[i] = (byte)(m[i] ^ partialblock[i]);
        }

        // Wipe the keystream from the stack.
        for (int n = 0; n < 64 / sizeof(int); n++)
        {
            ((int*)partialblock)[n] = 0;
        }
    }
}
public static Vector128<T> AndNot_Software<T>(Vector128<T> left, Vector128<T> right) where T : struct =>
    AndNot_Software(left.AsUInt64(), right.AsUInt64()).As<ulong, T>();
private static unsafe nuint NarrowUtf16ToAscii_Sse2(char* pUtf16Buffer, byte* pAsciiBuffer, nuint elementCount)
{
    // This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
    // will be elided by JIT once we determine which specific ISAs we support.

    // JIT turns the below into constants
    uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
    nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);

    // This method is written such that control generally flows top-to-bottom, avoiding
    // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
    // data, we jump out of the hot paths to targets at the end of the method.

    Debug.Assert(Sse2.IsSupported);
    Debug.Assert(BitConverter.IsLittleEndian);
    Debug.Assert(elementCount >= 2 * SizeOfVector128);

    Vector128<short> asciiMaskForPTEST = Vector128.Create(unchecked((short)0xFF80));   // used for PTEST on supported hardware
    Vector128<short> asciiMaskForPXOR = Vector128.Create(unchecked((short)0x8000));    // used for PXOR
    Vector128<short> asciiMaskForPCMPGTW = Vector128.Create(unchecked((short)0x807F)); // used for PCMPGTW

    // First, perform an unaligned read of the first part of the input buffer.
    Vector128<short> utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer); // unaligned load

    // If there's non-ASCII data in the first 8 elements of the vector, there's nothing we can do.
    // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
    if (Sse41.IsSupported)
    {
        if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
        {
            return 0;
        }
    }
    else
    {
        if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
        {
            return 0;
        }
    }

    // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy them to the destination.
    Vector128<byte> asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
    Sse2.StoreScalar((ulong*)pAsciiBuffer, asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED

    nuint currentOffsetInElements = SizeOfVector128 / 2; // we processed 8 elements so far

    // We're going to get the best performance when we have aligned writes, so we'll take the
    // hit of potentially unaligned reads in order to hit this sweet spot.

    // pAsciiBuffer points to the start of the destination buffer, immediately before where we wrote
    // the 8 bytes previously. If the 0x08 bit is set at the pinned address, then the 8 bytes we wrote
    // previously mean that the 0x08 bit is *not* set at address &pAsciiBuffer[SizeOfVector128 / 2]. In
    // that case we can immediately back up to the previous aligned boundary and start the main loop.
    // If the 0x08 bit is *not* set at the pinned address, then it means the 0x08 bit *is* set at
    // address &pAsciiBuffer[SizeOfVector128 / 2], and we should perform one more 8-byte write to bump
    // just past the next aligned boundary address.
    if (0u >= ((uint)pAsciiBuffer & (SizeOfVector128 / 2)))
    {
        // We need to perform one more partial vector write before we can get the alignment we want.
        utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements); // unaligned load

        // See comments earlier in this method for information about how this works.
        if (Sse41.IsSupported)
        {
            if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
            {
                goto Finish;
            }
        }
        else
        {
            if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
            {
                goto Finish;
            }
        }

        // Turn the 8 ASCII chars we just read into 8 ASCII bytes, then copy them to the destination.
        asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);
        Sse2.StoreScalar((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is UNALIGNED
    }

    // Calculate how many elements we wrote in order to get pAsciiBuffer to its next alignment
    // point, then use that as the base offset going forward.
    currentOffsetInElements = SizeOfVector128 - ((nuint)pAsciiBuffer & MaskOfAllBitsInVector128);
    Debug.Assert(0 < currentOffsetInElements && currentOffsetInElements <= SizeOfVector128, "We wrote at least 1 byte but no more than a whole vector.");
    Debug.Assert(currentOffsetInElements <= elementCount, "Shouldn't have overrun the destination buffer.");
    Debug.Assert(elementCount - currentOffsetInElements >= SizeOfVector128, "We should be able to run at least one whole vector.");

    nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector128;
    do
    {
        // In a loop, perform two unaligned reads, narrow to a single vector, then aligned write one vector.
        utf16VectorFirst = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements); // unaligned load
        Vector128<short> utf16VectorSecond = Sse2.LoadVector128((short*)pUtf16Buffer + currentOffsetInElements + SizeOfVector128 / sizeof(short)); // unaligned load
        Vector128<short> combinedVector = Sse2.Or(utf16VectorFirst, utf16VectorSecond);

        // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
        if (Sse41.IsSupported)
        {
            if (!Sse41.TestZ(combinedVector, asciiMaskForPTEST))
            {
                goto FoundNonAsciiDataInLoop;
            }
        }
        else
        {
            if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(combinedVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
            {
                goto FoundNonAsciiDataInLoop;
            }
        }

        // Build up the UTF-8 vector and perform the store.
        asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorSecond);

        Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % SizeOfVector128 == 0, "Write should be aligned.");
        Sse2.StoreAligned(pAsciiBuffer + currentOffsetInElements, asciiVector); // aligned

        currentOffsetInElements += SizeOfVector128;
    } while (currentOffsetInElements <= finalOffsetWhereCanRunLoop);

Finish:

    // There might be some ASCII data left over. That's fine - we'll let our caller handle the final drain.
    return currentOffsetInElements;

FoundNonAsciiDataInLoop:

    // Can we at least narrow the high vector?
    // See comments in GetIndexOfFirstNonAsciiChar_Sse2 for information about how this works.
    if (Sse41.IsSupported)
    {
        if (!Sse41.TestZ(utf16VectorFirst, asciiMaskForPTEST))
        {
            goto Finish; // found non-ASCII data
        }
    }
    else
    {
        if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(utf16VectorFirst, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
        {
            goto Finish; // found non-ASCII data
        }
    }

    // First part was all ASCII, narrow and aligned write. Note we're only filling in the low half of the vector.
    asciiVector = Sse2.PackUnsignedSaturate(utf16VectorFirst, utf16VectorFirst);

    Debug.Assert(((nuint)pAsciiBuffer + currentOffsetInElements) % sizeof(ulong) == 0, "Destination should be ulong-aligned.");
    Sse2.StoreScalar((ulong*)(pAsciiBuffer + currentOffsetInElements), asciiVector.AsUInt64()); // ulong* calculated here is aligned

    currentOffsetInElements += SizeOfVector128 / 2;

    goto Finish;
}
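// A usage sketch (demo method is hypothetical, assuming the method above is
// accessible): narrow a UTF-16 string to ASCII bytes; the return value is the
// number of elements converted, and the caller drains any remainder.
static unsafe void NarrowDemo()
{
    string text = new string('a', 64); // all-ASCII input, at least two vectors long
    byte[] ascii = new byte[text.Length];

    fixed (char* pSrc = text)
    fixed (byte* pDst = ascii)
    {
        nuint converted = NarrowUtf16ToAscii_Sse2(pSrc, pDst, (nuint)text.Length);
        // converted may be less than text.Length if non-ASCII data was found.
    }
}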