// Regression test: verifies the JIT's value numbering of Sse.LoadLow accounts for
// both the address operand and the heap/stack state, so two loads are never CSE'd
// when either differs. Returns true when a miscompilation is detected.
// NOTE(review): relies on an enclosing unsafe context for the stackalloc pointer
// and on the JitUse helper defined elsewhere in this file — confirm.
private static bool ProblemWithLoadLow_Sse()
{
    var data = stackalloc float[2];
    data[0] = 1;
    data[1] = 2;
    JitUse(data); // keep the buffer address-exposed so the stores cannot be elided
    Vector128<float> a = Vector128<float>.Zero;
    Vector128<float> b = Sse.LoadLow(a, data);
    Vector128<float> c = Sse.LoadLow(a, data + 1);
    // Make sure we take into account the address operand.
    if (b.AsInt32().GetElement(0) == c.AsInt32().GetElement(0))
    {
        return (true);
    }
    // Make sure we take the heap state into account.
    b = Sse.LoadLow(a, data);
    data[0] = 3;
    c = Sse.LoadLow(a, data);
    if (b.AsInt32().GetElement(0) == c.AsInt32().GetElement(0))
    {
        return (true);
    }
    return (false);
}
// Divides each of the eight signed 16-bit lanes by the constant 10 using the
// fixed-point multiply trick: x / 10 == (x * 26215) >> 18, with a correction
// term (x >> 15, i.e. -1 for negative lanes) so the result truncates toward
// zero like C# integer division. The 16-bit lanes are sign-extended into two
// vectors of 32-bit lanes, divided, then re-interleaved with a blend.
public static Vector128<short> DivideBy10(this Vector128<short> dividend)
{
    // Convert to two 32-bit integers
    Vector128<int> a_hi = Sse2.ShiftRightArithmetic(dividend.AsInt32(), 16);  // odd lanes, sign-extended
    Vector128<int> a_lo = Sse2.ShiftLeftLogical(dividend.AsInt32(), 16);
    a_lo = Sse2.ShiftRightArithmetic(a_lo, 16);                               // even lanes, sign-extended
    Vector128<int> div10_hi;
    Vector128<int> div10_lo;
    if (Avx2.IsSupported)
    {
        // Process both halves at once in one 256-bit register.
        Vector256<int> a = Vector256.Create(a_lo, a_hi);
        Vector256<int> s0 = Avx2.ShiftRightArithmetic(a, 15);   // -1 for negative lanes, else 0
        Vector256<int> factor = Vector256.Create(26215);        // ceil(2^18 / 10)
        Vector256<int> mul = Avx2.MultiplyLow(a, factor);
        Vector256<int> s1 = Avx2.ShiftRightArithmetic(mul, 18);
        Vector256<int> div10 = Avx2.Subtract(s1, s0);
        div10_hi = div10.GetUpper();
        div10_lo = div10.GetLower();
    }
    else
    {
        Vector128<int> s0_hi = Sse2.ShiftRightArithmetic(a_hi, 15);
        Vector128<int> s0_lo = Sse2.ShiftRightArithmetic(a_lo, 15);
        Vector128<int> factor = Vector128.Create(26215);
        Vector128<int> mul_hi = Sse41.MultiplyLow(a_hi, factor);
        Vector128<int> mul_lo = Sse41.MultiplyLow(a_lo, factor);
        Vector128<int> s1_hi = Sse2.ShiftRightArithmetic(mul_hi, 18);
        Vector128<int> s1_lo = Sse2.ShiftRightArithmetic(mul_lo, 18);
        div10_hi = Sse2.Subtract(s1_hi, s0_hi);
        div10_lo = Sse2.Subtract(s1_lo, s0_lo);
    }
    //div10_hi = Sse2.ShiftLeftLogical(div10_hi, 16);
    // The whole-register 2-byte shift is equivalent to the per-lane shift above
    // for the odd 16-bit elements that the 0xAA blend mask selects.
    div10_hi = Sse2.ShiftLeftLogical128BitLane(div10_hi, 2);
    return (Sse41.Blend(div10_lo.AsInt16(), div10_hi.AsInt16(), 0xAA));
}
// Extracts the low 64 bits of the vector as a signed long.
// On 64-bit capable hardware this is a single PEXTRQ; otherwise the value is
// assembled from two 32-bit extractions. The IsSupported check is resolved
// statically at JIT time, so only one branch is ever emitted.
internal static unsafe long Extract64(Vector128<sbyte> value)
{
    if (Sse41.X64.IsSupported)
    {
        // Single-instruction path: element 0 of the Int64 view.
        return Sse41.X64.Extract(value.AsInt64(), 0);
    }

    // 32-bit fallback: combine the two low Int32 elements into one long.
    Vector128<int> asInts = value.AsInt32();
    uint lowHalf = (uint)Sse41.Extract(asInts, 0);
    ulong highHalf = (ulong)Sse41.Extract(asInts, 1) << 32;
    return (long)(lowHalf | highHalf);
}
// SSE-accelerated polynomial hash: h = sum(buf[i] * kMultFactors[len - 1 - i]),
// reduced by kBase at the end. Processes 4 input bytes per iteration against 4
// factors loaded in reverse order, then horizontally sums the vector
// accumulator and finishes remaining bytes with scalar code.
// NOTE(review): the final `& (kBase - 1)` is a modulo, so kBase must be a power
// of two; the vector loads read 16 bytes, so kMultFactorsPtr and buf must
// tolerate the over-read near the end — confirm against their definitions.
private unsafe ulong HashSse(byte *buf, int len)
{
    ulong h = 0;
    Vector128<int> v_ps = Vector128<int>.Zero; // four partial sums
    bool useSse4 = Sse41.IsSupported;
    int i = 0;
    // j is the factor index for buf[i]; factors are consumed high-to-low.
    for (int j = len - i - 1; len - i >= 4; i += 4, j = len - i - 1)
    {
        // Load factors [j-3 .. j] and reverse them so lane k pairs with buf[i+k].
        Vector128<int> c_v = Sse2.LoadVector128(&kMultFactorsPtr[j - 3]);
        c_v = Sse2.Shuffle(c_v, SO123);
        Vector128<byte> q_v = Sse2.LoadVector128(buf + i); // only the low 4 bytes are used
        Vector128<int> s_v;
        if (useSse4)
        {
            // PMOVZXBD: zero-extend the low 4 bytes to 4 ints.
            s_v = Sse41.ConvertToVector128Int32(q_v);
        }
        else
        {
            // SSE2 fallback: widen bytes to ints via two unpacks plus a shift.
            q_v = Sse2.UnpackLow(q_v, q_v);
            s_v = Sse2.ShiftRightLogical(Sse2.UnpackLow(q_v.AsUInt16(), q_v.AsUInt16()).AsInt32(), 24);
        }
        if (useSse4)
        {
            // PMULLD: 32x32 -> low-32 multiply, then accumulate.
            v_ps = Sse2.Add(v_ps, Sse41.MultiplyLow(c_v, s_v));
        }
        else
        {
            // SSE2 fallback for PMULLD: two widening PMULUDQ multiplies over the
            // even and odd lanes, then recombine the low 32 bits of each product.
            Vector128<ulong> v_tmp1 = Sse2.Multiply(c_v.AsUInt32(), s_v.AsUInt32());
            Vector128<ulong> v_tmp2 = Sse2.Multiply(Sse2.ShiftRightLogical128BitLane(c_v.AsByte(), 4).AsUInt32(), Sse2.ShiftRightLogical128BitLane(s_v.AsByte(), 4).AsUInt32()); ;
            v_ps = Sse2.Add(v_ps, Sse2.UnpackLow(Sse2.Shuffle(v_tmp1.AsInt32(), SOO2O), Sse2.Shuffle(v_tmp2.AsInt32(), SOO2O)));
        }
    }
    // Horizontal sum of the four partial sums, then take lane 0.
    v_ps = Sse2.Add(v_ps, Sse2.Shuffle(v_ps, S23O1));
    v_ps = Sse2.Add(v_ps, Sse2.Shuffle(v_ps, S1O32));
    h += Sse2.ConvertToUInt32(v_ps.AsUInt32());
    // Scalar tail for the remaining 0-3 bytes.
    for (; i < len; i++)
    {
        int index = len - i - 1;
        ulong c = (uint)kMultFactors[index];
        h += c * buf[i];
    }
    return (h & (kBase - 1));
}
// One step of AES key expansion: folds the AESKEYGENASSIST output for tmp3
// into the running key state tmp1 (updated in place) and returns the next
// derived value. The three xor-with-slide operations compute the prefix-xor
// of tmp1's four 32-bit words.
// NOTE(review): Aes.Shuffle resolves to the inherited Sse2.Shuffle — the Aes
// intrinsics class derives from Sse2.
private static Vector128<byte> KeyGenAssist(ref Vector128<byte> tmp1, Vector128<byte> tmp3, byte control)
{
    var keyGened = Aes.KeygenAssist(tmp3, control);
    // Broadcast 32-bit lane 1 (the SubWord/RotWord result for the low half).
    keyGened = Aes.Shuffle(keyGened.AsInt32(), 0x55).AsByte();
    // Prefix-xor of tmp1's words via three successive 4-byte slides.
    tmp1 = Sse2.Xor(tmp1, Sse2.ShiftLeftLogical128BitLane(tmp1, 4));
    tmp1 = Sse2.Xor(tmp1, Sse2.ShiftLeftLogical128BitLane(tmp1, 4));
    tmp1 = Sse2.Xor(tmp1, Sse2.ShiftLeftLogical128BitLane(tmp1, 4));
    tmp1 = Sse2.Xor(tmp1, keyGened);
    // Broadcast lane 3 of the updated key state.
    keyGened = Sse2.Shuffle(tmp1.AsInt32(), 0xFF).AsByte();
    return (Sse2.Xor(Sse2.Xor(tmp3, Sse2.ShiftLeftLogical128BitLane(tmp3, 4)), keyGened));
}
// One "assist" step of the AES-256 key schedule. t2 arrives as the raw
// AESKEYGENASSIST output: its 32-bit lane 3 is broadcast across the register,
// t1 is xored with three successive 4-byte left slides of itself (yielding the
// prefix-xor of its words), and finally combined with t2. Both refs are
// updated in place.
private static void Aes256Assist1(ref Vector128<byte> t1, ref Vector128<byte> t2)
{
    // Broadcast lane 3 of t2 to all four 32-bit lanes.
    t2 = Sse2.Shuffle(t2.AsInt32(), 0xff).AsByte();

    Vector128<byte> slid = Sse2.ShiftLeftLogical128BitLane(t1, 4);
    t1 = Sse2.Xor(t1, slid);
    slid = Sse2.ShiftLeftLogical128BitLane(slid, 4);
    t1 = Sse2.Xor(t1, slid);
    slid = Sse2.ShiftLeftLogical128BitLane(slid, 4);
    t1 = Sse2.Xor(t1, slid);

    t1 = Sse2.Xor(t1, t2);
}
// Builds a per-byte mask that is 0xFF for every input byte that needs escaping
// and 0x00 otherwise, using two PSHUFB table lookups (one per nibble).
public static Vector128<sbyte> CreateEscapingMask(
    Vector128<sbyte> sourceValue,
    Vector128<sbyte> bitMaskLookup,
    Vector128<sbyte> bitPosLookup,
    Vector128<sbyte> nibbleMaskSByte,
    Vector128<sbyte> nullMaskSByte)
{
    // To check if an input byte needs to be escaped or not, we use a bitmask-lookup.
    // Therefore we split the input byte into the low- and high-nibble, which will get
    // the row-/column-index in the bit-mask.
    // The bitmask-lookup looks like (here for example s_bitMaskLookupBasicLatin):
    //                        high-nibble
    // low-nibble  0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
    //         0   1   1   0   0   0   0   1   0   1   1   1   1   1   1   1   1
    //         1   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
    //         2   1   1   1   0   0   0   0   0   1   1   1   1   1   1   1   1
    //         3   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
    //         4   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
    //         5   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
    //         6   1   1   1   0   0   0   0   0   1   1   1   1   1   1   1   1
    //         7   1   1   1   0   0   0   0   0   1   1   1   1   1   1   1   1
    //         8   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
    //         9   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
    //         A   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
    //         B   1   1   1   0   0   0   0   0   1   1   1   1   1   1   1   1
    //         C   1   1   0   1   0   1   0   0   1   1   1   1   1   1   1   1
    //         D   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
    //         E   1   1   0   1   0   0   0   0   1   1   1   1   1   1   1   1
    //         F   1   1   0   0   0   0   0   1   1   1   1   1   1   1   1   1
    //
    // where 1 denotes the need for escaping, while 0 means no escaping needed.
    // For high-nibbles in the range 8..F every input needs to be escaped, so we
    // can omit them in the bit-mask, thus only high-nibbles in the range 0..7 need
    // to be considered, hence the entries in the bit-mask can be of type byte.
    //
    // In the bitmask-lookup for each row (= low-nibble) a bit-mask for the
    // high-nibbles (= columns) is created.
    Debug.Assert(Ssse3.IsSupported);
    // The 32-bit shift is fine here: bits shifted in from the neighboring byte
    // are discarded by the nibble mask.
    Vector128<sbyte> highNibbles = Sse2.And(Sse2.ShiftRightLogical(sourceValue.AsInt32(), 4).AsSByte(), nibbleMaskSByte);
    Vector128<sbyte> lowNibbles = Sse2.And(sourceValue, nibbleMaskSByte);
    Vector128<sbyte> bitMask = Ssse3.Shuffle(bitMaskLookup, lowNibbles);
    Vector128<sbyte> bitPositions = Ssse3.Shuffle(bitPosLookup, highNibbles);
    Vector128<sbyte> mask = Sse2.And(bitPositions, bitMask);
    // Normalize: any non-zero byte becomes 0xFF, zero stays 0x00
    // (double compare-equal against the all-zero vector).
    mask = Sse2.CompareEqual(nullMaskSByte, Sse2.CompareEqual(nullMaskSByte, mask));
    return (mask);
}
// Bitwise equality of two float vectors: lanes are compared by their raw
// 32-bit patterns, so NaN payloads are distinguished and +0.0f != -0.0f here.
public static bool AreEqual(Vector128<float> left, Vector128<float> right)
{
    Vector128<int> lhsBits = left.AsInt32();
    Vector128<int> rhsBits = right.AsInt32();

    for (int lane = 0; lane < Vector128<float>.Count; lane++)
    {
        if (lhsBits.GetElement(lane) != rhsBits.GetElement(lane))
        {
            return false;
        }
    }

    return true;
}
// Element-wise addition of two Vector128 values, dispatching on the element
// type. Each typeof(T) comparison is a JIT-time constant, so every
// instantiation compiles down to a single hardware add (PADDB/PADDW/PADDD/
// PADDQ/ADDPS/ADDPD). Integer lanes wrap on overflow. Throws
// NotSupportedException for element types outside the ten supported primitives.
public static Vector128<T> Vector128Add<T>(Vector128<T> left, Vector128<T> right) where T : struct
{
    if (typeof(T) == typeof(byte))
    {
        return Sse2.Add(left.AsByte(), right.AsByte()).As<byte, T>();
    }
    else if (typeof(T) == typeof(sbyte))
    {
        return Sse2.Add(left.AsSByte(), right.AsSByte()).As<sbyte, T>();
    }
    else if (typeof(T) == typeof(short))
    {
        return Sse2.Add(left.AsInt16(), right.AsInt16()).As<short, T>();
    }
    else if (typeof(T) == typeof(ushort))
    {
        return Sse2.Add(left.AsUInt16(), right.AsUInt16()).As<ushort, T>();
    }
    else if (typeof(T) == typeof(int))
    {
        return Sse2.Add(left.AsInt32(), right.AsInt32()).As<int, T>();
    }
    else if (typeof(T) == typeof(uint))
    {
        return Sse2.Add(left.AsUInt32(), right.AsUInt32()).As<uint, T>();
    }
    else if (typeof(T) == typeof(long))
    {
        return Sse2.Add(left.AsInt64(), right.AsInt64()).As<long, T>();
    }
    else if (typeof(T) == typeof(ulong))
    {
        return Sse2.Add(left.AsUInt64(), right.AsUInt64()).As<ulong, T>();
    }
    else if (typeof(T) == typeof(float))
    {
        return Sse.Add(left.AsSingle(), right.AsSingle()).As<float, T>();
    }
    else if (typeof(T) == typeof(double))
    {
        return Sse2.Add(left.AsDouble(), right.AsDouble()).As<double, T>();
    }
    else
    {
        throw new NotSupportedException();
    }
}
// Decodes BC-style 3-bit alpha indices: rI packs 16 three-bit palette indices
// (48 bits), rPal is the 8-entry decoded palette, and the 16 resulting bytes
// are written to output.
// NOTE(review): the Avx2.IsSupported guard also implies the Sse41/Ssse3
// support that the vector body relies on.
private unsafe static void BCnDecodeTileAlpha(Span<byte> output, Span<byte> rPal, ulong rI)
{
    if (Avx2.IsSupported)
    {
        Span<Vector128<byte>> outputAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(output);
        Vector128<uint> shifts = Vector128.Create(0u, 3u, 6u, 9u);
        Vector128<uint> masks = Vector128.Create(7u); // 3-bit index mask
        Vector128<byte> vClut;
        // Load the 8-byte palette into the low half of a vector for the PSHUFB lookup.
        fixed (byte *pRPal = rPal)
        {
            vClut = Sse2.LoadScalarVector128((ulong *)pRPal).AsByte();
        }
        // Split the 48 index bits into two 24-bit groups (indices 0-7 and 8-15).
        Vector128<uint> indices0 = Vector128.Create((uint)rI);
        Vector128<uint> indices1 = Vector128.Create((uint)(rI >> 24));
        // Variable shifts {0,3,6,9} extract indices 0-3 / 8-11; a further shift
        // by 12 reaches indices 4-7 / 12-15.
        Vector128<uint> indices00 = Avx2.ShiftRightLogicalVariable(indices0, shifts);
        Vector128<uint> indices10 = Avx2.ShiftRightLogicalVariable(indices1, shifts);
        Vector128<uint> indices01 = Sse2.ShiftRightLogical(indices00, 12);
        Vector128<uint> indices11 = Sse2.ShiftRightLogical(indices10, 12);
        indices00 = Sse2.And(indices00, masks);
        indices10 = Sse2.And(indices10, masks);
        indices01 = Sse2.And(indices01, masks);
        indices11 = Sse2.And(indices11, masks);
        // Narrow the 16 dword indices down to 16 bytes (values <= 7, so the
        // saturating packs never clip).
        Vector128<ushort> indicesW0 = Sse41.PackUnsignedSaturate(indices00.AsInt32(), indices01.AsInt32());
        Vector128<ushort> indicesW1 = Sse41.PackUnsignedSaturate(indices10.AsInt32(), indices11.AsInt32());
        Vector128<byte> indices = Sse2.PackUnsignedSaturate(indicesW0.AsInt16(), indicesW1.AsInt16());
        // Palette lookup for all 16 texels at once.
        outputAsVector128[0] = Ssse3.Shuffle(vClut, indices);
    }
    else
    {
        // Scalar fallback: peel 3 bits per texel.
        for (int i = 0; i < BlockWidth * BlockHeight; i++, rI >>= 3)
        {
            output[i] = rPal[(int)(rI & 7)];
        }
    }
}
// Approximates log2(value) per lane: each float is split into its exponent and
// mantissa, and a degree-5 polynomial in (mantissa - 1) approximates the
// fractional part (log2(m * 2^e) = e + log2(m)).
// NOTE(review): no special handling of zero, negatives, infinities or NaN —
// inputs are presumably known-positive finite floats; confirm at call sites.
public unsafe static Vector128<float> Log2(Vector128<float> value)
{
    // split value into exponent and mantissa parts
    Vector128<float> one = AvxExtensions.BroadcastScalarToVector128(MathV.One);
    Vector128<int> integerValue = value.AsInt32();
    // exponent = (float)(((bits & exponentMask) >> mantissaBits) - exponentBias)
    Vector128<float> exponent = Avx.ConvertToVector128Single(Avx.Subtract(Avx.ShiftRightLogical(Avx.And(integerValue, MathV.FloatExponentMask128), MathV.FloatMantissaBits), MathV.FloatMantissaZero128));
    // mantissa in [1, 2): keep the mantissa bits, OR in the exponent of 1.0f
    Vector128<float> mantissa = Avx.Or(Avx.And(integerValue, MathV.FloatMantissaMask128).AsSingle(), one);
    // evaluate mantissa polynomial
    Vector128<float> beta1 = AvxExtensions.BroadcastScalarToVector128(MathV.Log2Beta1);
    Vector128<float> beta2 = AvxExtensions.BroadcastScalarToVector128(MathV.Log2Beta2);
    Vector128<float> beta3 = AvxExtensions.BroadcastScalarToVector128(MathV.Log2Beta3);
    Vector128<float> beta4 = AvxExtensions.BroadcastScalarToVector128(MathV.Log2Beta4);
    Vector128<float> beta5 = AvxExtensions.BroadcastScalarToVector128(MathV.Log2Beta5);
    Vector128<float> x = Avx.Subtract(mantissa, one);
    Vector128<float> polynomial = Avx.Multiply(beta1, x);
    Vector128<float> x2 = Avx.Multiply(x, x);
    polynomial = Avx.Add(polynomial, Avx.Multiply(beta2, x2));
    Vector128<float> x3 = Avx.Multiply(x2, x);
    polynomial = Avx.Add(polynomial, Avx.Multiply(beta3, x3));
    Vector128<float> x4 = Avx.Multiply(x3, x);
    polynomial = Avx.Add(polynomial, Avx.Multiply(beta4, x4));
    Vector128<float> x5 = Avx.Multiply(x4, x);
    polynomial = Avx.Add(polynomial, Avx.Multiply(beta5, x5));
    // form logarithm
    return (Avx.Add(exponent, polynomial));
}
// Element-wise signed 16-bit division (dividend / divisor), truncated toward
// zero like C# integer division.
// Based on https://stackoverflow.com/a/51458507/347870
//
// Each 16-bit lane is sign-extended into a 32-bit lane (odd lanes in one
// vector, even lanes in another), converted to float, divided via a
// Newton-refined reciprocal (the 2.00000051757f constant makes truncation
// exact over the full 16-bit range), truncated back to int, and the two
// halves re-interleaved.
public static Vector128<short> Divide(this Vector128<short> dividend, Vector128<short> divisor)
{
    // Sign-extend odd (hi) and even (lo) 16-bit lanes into 32-bit lanes.
    Vector128<int> a_hi_epi32 = Sse2.ShiftRightArithmetic(dividend.AsInt32(), 16);
    Vector128<int> a_lo_epi32_shift = Sse2.ShiftLeftLogical(dividend.AsInt32(), 16);
    Vector128<int> a_lo_epi32 = Sse2.ShiftRightArithmetic(a_lo_epi32_shift, 16);
    Vector128<int> b_hi_epi32 = Sse2.ShiftRightArithmetic(divisor.AsInt32(), 16);
    Vector128<int> b_lo_epi32_shift = Sse2.ShiftLeftLogical(divisor.AsInt32(), 16);
    Vector128<int> b_lo_epi32 = Sse2.ShiftRightArithmetic(b_lo_epi32_shift, 16);

    // Convert to 32-bit floats.
    Vector128<float> a_hi = Sse2.ConvertToVector128Single(a_hi_epi32);
    Vector128<float> a_lo = Sse2.ConvertToVector128Single(a_lo_epi32);
    Vector128<float> b_hi = Sse2.ConvertToVector128Single(b_hi_epi32);
    Vector128<float> b_lo = Sse2.ConvertToVector128Single(b_lo_epi32);

    // Approximate reciprocal of the divisor (RCPPS, ~12 bits of precision).
    Vector128<float> b_hi_rcp = Sse.Reciprocal(b_hi);
    Vector128<float> b_lo_rcp = Sse.Reciprocal(b_lo);

    // One Newton-Raphson step: inv = 2 - b * rcp, with "2" nudged slightly
    // upward to guarantee correct truncation over the whole input domain.
    Vector128<float> b_hi_inv_1;
    Vector128<float> b_lo_inv_1;
    Vector128<float> two = Vector128.Create(2.00000051757f);
    if (Fma.IsSupported)
    {
        b_hi_inv_1 = Fma.MultiplyAddNegated(b_hi_rcp, b_hi, two);
        b_lo_inv_1 = Fma.MultiplyAddNegated(b_lo_rcp, b_lo, two);
    }
    else
    {
        Vector128<float> b_mul_hi = Sse.Multiply(b_hi_rcp, b_hi);
        Vector128<float> b_mul_lo = Sse.Multiply(b_lo_rcp, b_lo);
        b_hi_inv_1 = Sse.Subtract(two, b_mul_hi);
        b_lo_inv_1 = Sse.Subtract(two, b_mul_lo);
    }

    // Compensate for the loss: refined reciprocal = rcp * (2 - b * rcp).
    Vector128<float> b_hi_rcp_1 = Sse.Multiply(b_hi_rcp, b_hi_inv_1);
    Vector128<float> b_lo_rcp_1 = Sse.Multiply(b_lo_rcp, b_lo_inv_1);

    // Perform the division by multiplication.
    Vector128<float> hi = Sse.Multiply(a_hi, b_hi_rcp_1);
    Vector128<float> lo = Sse.Multiply(a_lo, b_lo_rcp_1);

    // Convert back to integers, truncating toward zero (CVTTPS2DQ).
    Vector128<int> hi_epi32 = Sse2.ConvertToVector128Int32WithTruncation(hi);
    Vector128<int> lo_epi32 = Sse2.ConvertToVector128Int32WithTruncation(lo);

    // Move the hi quotients into the upper 16 bits of each 32-bit lane.
    Vector128<int> hi_epi32_shift = Sse2.ShiftLeftLogical(hi_epi32, 16);

    // Blend the bits, and return.
    if (Sse41.IsSupported)
    {
        return Sse41.Blend(lo_epi32.AsInt16(), hi_epi32_shift.AsInt16(), 0xAA);
    }
    else
    {
        // BUG FIX: the mask must clear the upper 16 bits of each lo lane,
        // because negative lo quotients sign-extend into them and would
        // corrupt the hi quotients on Or. The previous
        // Vector128.Create((ushort)0xFFFF) produced an all-ones vector
        // (a no-op And); _mm_set1_epi32(0xFFFF) is what is needed here.
        Vector128<int> lo_epi32_mask = Sse2.And(lo_epi32, Vector128.Create(0x0000FFFF));
        return Sse2.Or(hi_epi32_shift, lo_epi32_mask).AsInt16();
    }
}
// Converts the input surface to NV12: a full-resolution 8-bit luma (Y) plane
// followed by a half-resolution interleaved UV plane, each written out through
// WriteBuffer with the configured pitch / block-linear layout.
// NOTE(review): the SIMD paths assume Pixel is 8 bytes with 16-bit channels at
// offsets 0 (R), 2 (G), 4 (B) — inferred from the load offsets; confirm
// against the Pixel definition. Downsample presumably drops the 2 LSBs of a
// 10-bit channel, matching the vector paths' >> 2.
private unsafe static void WriteNv12(ResourceManager rm, Surface input, ref OutputSurfaceConfig config, ref PlaneOffsets offsets)
{
    int gobBlocksInY = 1 << config.OutBlkHeight;
    bool outLinear = config.OutBlkKind == 0;
    int width = Math.Min(config.OutLumaWidth + 1, input.Width);
    int height = Math.Min(config.OutLumaHeight + 1, input.Height);
    int yStride = GetPitch(config.OutLumaWidth + 1, 1);
    int dstYIndex = rm.BufferPool.Rent((config.OutLumaHeight + 1) * yStride, out Span<byte> dstY);
    if (Sse41.IsSupported)
    {
        // Luma fast path: 16 pixels (128 bytes of Pixel data) per iteration.
        // mask keeps only the first 16-bit channel (R) of each 64-bit pixel.
        Vector128<ushort> mask = Vector128.Create(0xffffUL).AsUInt16();
        int widthTrunc = width & ~0xf;
        int strideGap = yStride - width;
        fixed (Pixel *srcPtr = input.Data)
        {
            Pixel *ip = srcPtr;
            fixed (byte *dstPtr = dstY)
            {
                byte *op = dstPtr;
                for (int y = 0; y < height; y++, ip += input.Width)
                {
                    int x = 0;
                    for (; x < widthTrunc; x += 16)
                    {
                        byte *baseOffset = (byte *)(ip + (ulong)(uint)x);
                        // Load 16 pixels (2 pixels per vector).
                        Vector128<ushort> pixelp1 = Sse2.LoadVector128((ushort *)baseOffset);
                        Vector128<ushort> pixelp2 = Sse2.LoadVector128((ushort *)(baseOffset + 0x10));
                        Vector128<ushort> pixelp3 = Sse2.LoadVector128((ushort *)(baseOffset + 0x20));
                        Vector128<ushort> pixelp4 = Sse2.LoadVector128((ushort *)(baseOffset + 0x30));
                        Vector128<ushort> pixelp5 = Sse2.LoadVector128((ushort *)(baseOffset + 0x40));
                        Vector128<ushort> pixelp6 = Sse2.LoadVector128((ushort *)(baseOffset + 0x50));
                        Vector128<ushort> pixelp7 = Sse2.LoadVector128((ushort *)(baseOffset + 0x60));
                        Vector128<ushort> pixelp8 = Sse2.LoadVector128((ushort *)(baseOffset + 0x70));
                        // Isolate the R (luma source) channel of each pixel.
                        pixelp1 = Sse2.And(pixelp1, mask);
                        pixelp2 = Sse2.And(pixelp2, mask);
                        pixelp3 = Sse2.And(pixelp3, mask);
                        pixelp4 = Sse2.And(pixelp4, mask);
                        pixelp5 = Sse2.And(pixelp5, mask);
                        pixelp6 = Sse2.And(pixelp6, mask);
                        pixelp7 = Sse2.And(pixelp7, mask);
                        pixelp8 = Sse2.And(pixelp8, mask);
                        // Two pack stages compact the 16 channel values into one vector.
                        Vector128<ushort> pixelq1 = Sse41.PackUnsignedSaturate(pixelp1.AsInt32(), pixelp2.AsInt32());
                        Vector128<ushort> pixelq2 = Sse41.PackUnsignedSaturate(pixelp3.AsInt32(), pixelp4.AsInt32());
                        Vector128<ushort> pixelq3 = Sse41.PackUnsignedSaturate(pixelp5.AsInt32(), pixelp6.AsInt32());
                        Vector128<ushort> pixelq4 = Sse41.PackUnsignedSaturate(pixelp7.AsInt32(), pixelp8.AsInt32());
                        pixelq1 = Sse41.PackUnsignedSaturate(pixelq1.AsInt32(), pixelq2.AsInt32());
                        pixelq2 = Sse41.PackUnsignedSaturate(pixelq3.AsInt32(), pixelq4.AsInt32());
                        // 10-bit -> 8-bit downsample, then narrow to bytes.
                        pixelq1 = Sse2.ShiftRightLogical(pixelq1, 2);
                        pixelq2 = Sse2.ShiftRightLogical(pixelq2, 2);
                        Vector128<byte> pixel = Sse2.PackUnsignedSaturate(pixelq1.AsInt16(), pixelq2.AsInt16());
                        Sse2.Store(op, pixel);
                        op += 0x10;
                    }
                    // Scalar tail for the last (width % 16) pixels of the row.
                    for (; x < width; x++)
                    {
                        Pixel *px = ip + (uint)x;
                        *op++ = Downsample(px->R);
                    }
                    op += strideGap;
                }
            }
        }
    }
    else
    {
        // Scalar fallback.
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                dstY[y * yStride + x] = Downsample(input.GetR(x, y));
            }
        }
    }
    WriteBuffer(
        rm,
        dstY,
        offsets.LumaOffset,
        outLinear,
        config.OutLumaWidth + 1,
        config.OutLumaHeight + 1,
        1,
        gobBlocksInY);
    rm.BufferPool.Return(dstYIndex);
    // Chroma plane: 2x2 subsampled, U and V bytes interleaved.
    int uvWidth = Math.Min(config.OutChromaWidth + 1, (width + 1) >> 1);
    int uvHeight = Math.Min(config.OutChromaHeight + 1, (height + 1) >> 1);
    int uvStride = GetPitch(config.OutChromaWidth + 1, 2);
    int dstUvIndex = rm.BufferPool.Rent((config.OutChromaHeight + 1) * uvStride, out Span<byte> dstUv);
    if (Sse2.IsSupported)
    {
        int widthTrunc = uvWidth & ~7;
        int strideGap = uvStride - uvWidth * 2;
        fixed (Pixel *srcPtr = input.Data)
        {
            Pixel *ip = srcPtr;
            // ip advances two source rows per output row (vertical subsampling).
            fixed (byte *dstPtr = dstUv)
            {
                byte *op = dstPtr;
                for (int y = 0; y < uvHeight; y++, ip += input.Width * 2)
                {
                    int x = 0;
                    for (; x < widthTrunc; x += 8)
                    {
                        // Every other pixel (x * 16 bytes); +0x02 selects the
                        // G,B channel pair of each sampled pixel.
                        byte *baseOffset = (byte *)ip + (ulong)(uint)x * 16;
                        Vector128<uint> pixel1 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x02));
                        Vector128<uint> pixel2 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x12));
                        Vector128<uint> pixel3 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x22));
                        Vector128<uint> pixel4 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x32));
                        Vector128<uint> pixel5 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x42));
                        Vector128<uint> pixel6 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x52));
                        Vector128<uint> pixel7 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x62));
                        Vector128<uint> pixel8 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x72));
                        // Gather the eight G,B pairs into two vectors.
                        Vector128<uint> pixel12 = Sse2.UnpackLow(pixel1, pixel2);
                        Vector128<uint> pixel34 = Sse2.UnpackLow(pixel3, pixel4);
                        Vector128<uint> pixel56 = Sse2.UnpackLow(pixel5, pixel6);
                        Vector128<uint> pixel78 = Sse2.UnpackLow(pixel7, pixel8);
                        Vector128<ulong> pixel1234 = Sse2.UnpackLow(pixel12.AsUInt64(), pixel34.AsUInt64());
                        Vector128<ulong> pixel5678 = Sse2.UnpackLow(pixel56.AsUInt64(), pixel78.AsUInt64());
                        // 10-bit -> 8-bit downsample.
                        // NOTE(review): this shifts 64-bit lanes, so the low 2 bits
                        // of each channel slide into bits 14-15 of its lower
                        // neighbor; presumably harmless for this pixel format
                        // (those bits are expected clear) — verify.
                        pixel1234 = Sse2.ShiftRightLogical(pixel1234, 2);
                        pixel5678 = Sse2.ShiftRightLogical(pixel5678, 2);
                        Vector128<byte> pixel = Sse2.PackUnsignedSaturate(pixel1234.AsInt16(), pixel5678.AsInt16());
                        Sse2.Store(op, pixel);
                        op += 0x10;
                    }
                    // Scalar tail: write interleaved U (G) and V (B) bytes.
                    for (; x < uvWidth; x++)
                    {
                        Pixel *px = ip + (uint)(x << 1);
                        *op++ = Downsample(px->G);
                        *op++ = Downsample(px->B);
                    }
                    op += strideGap;
                }
            }
        }
    }
    else
    {
        // Scalar fallback.
        for (int y = 0; y < uvHeight; y++)
        {
            for (int x = 0; x < uvWidth; x++)
            {
                int xx = x << 1;
                int yy = y << 1;
                int uvOffs = y * uvStride + xx;
                dstUv[uvOffs + 0] = Downsample(input.GetG(xx, yy));
                dstUv[uvOffs + 1] = Downsample(input.GetB(xx, yy));
            }
        }
    }
    WriteBuffer(
        rm,
        dstUv,
        offsets.ChromaUOffset,
        outLinear,
        config.OutChromaWidth + 1,
        config.OutChromaHeight + 1,
        2,
        gobBlocksInY);
    rm.BufferPool.Return(dstUvIndex);
}
// CRC-32 computed with carry-less multiplication (PCLMULQDQ), following
// Intel's "Fast CRC Computation Using PCLMULQDQ" approach: fold four 128-bit
// lanes in parallel over 64-byte blocks, reduce to one lane, fold remaining
// 16-byte blocks, then reduce 128 -> 64 -> 32 bits with a Barrett reduction.
// Only the chunk-aligned prefix is processed here; the remainder goes to
// CalculateScalar. K05Poly holds the folding constants (k1..k5, mu, poly).
// NOTE(review): assumes buffer.Length >= 64 (the initial block is loaded
// unconditionally) — the caller must guarantee this.
private static unsafe uint CalculateSse(uint crc, ReadOnlySpan<byte> buffer)
{
    int chunksize = buffer.Length & ~ChunksizeMask;
    int length = chunksize;
    fixed (byte *bufferPtr = buffer)
    fixed (ulong *k05PolyPtr = K05Poly)
    {
        byte *srcPtr = bufferPtr;
        // There's at least one block of 64.
        Vector128<ulong> x1 = Sse2.LoadVector128((ulong *)(srcPtr + 0x00));
        Vector128<ulong> x2 = Sse2.LoadVector128((ulong *)(srcPtr + 0x10));
        Vector128<ulong> x3 = Sse2.LoadVector128((ulong *)(srcPtr + 0x20));
        Vector128<ulong> x4 = Sse2.LoadVector128((ulong *)(srcPtr + 0x30));
        Vector128<ulong> x5;
        // Seed the first lane with the incoming CRC.
        x1 = Sse2.Xor(x1, Sse2.ConvertScalarToVector128UInt32(crc).AsUInt64());
        // k1, k2
        Vector128<ulong> x0 = Sse2.LoadVector128(k05PolyPtr + 0x0);
        srcPtr += 64;
        length -= 64;
        // Parallel fold blocks of 64, if any.
        while (length >= 64)
        {
            x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
            Vector128<ulong> x6 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00);
            Vector128<ulong> x7 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x00);
            Vector128<ulong> x8 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x00);
            x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
            x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x11);
            x3 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x11);
            x4 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x11);
            Vector128<ulong> y5 = Sse2.LoadVector128((ulong *)(srcPtr + 0x00));
            Vector128<ulong> y6 = Sse2.LoadVector128((ulong *)(srcPtr + 0x10));
            Vector128<ulong> y7 = Sse2.LoadVector128((ulong *)(srcPtr + 0x20));
            Vector128<ulong> y8 = Sse2.LoadVector128((ulong *)(srcPtr + 0x30));
            x1 = Sse2.Xor(x1, x5);
            x2 = Sse2.Xor(x2, x6);
            x3 = Sse2.Xor(x3, x7);
            x4 = Sse2.Xor(x4, x8);
            x1 = Sse2.Xor(x1, y5);
            x2 = Sse2.Xor(x2, y6);
            x3 = Sse2.Xor(x3, y7);
            x4 = Sse2.Xor(x4, y8);
            srcPtr += 64;
            length -= 64;
        }
        // Fold into 128-bits.
        // k3, k4
        x0 = Sse2.LoadVector128(k05PolyPtr + 0x2);
        x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
        x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
        x1 = Sse2.Xor(x1, x2);
        x1 = Sse2.Xor(x1, x5);
        x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
        x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
        x1 = Sse2.Xor(x1, x3);
        x1 = Sse2.Xor(x1, x5);
        x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
        x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
        x1 = Sse2.Xor(x1, x4);
        x1 = Sse2.Xor(x1, x5);
        // Single fold blocks of 16, if any.
        while (length >= 16)
        {
            x2 = Sse2.LoadVector128((ulong *)srcPtr);
            x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
            x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
            x1 = Sse2.Xor(x1, x2);
            x1 = Sse2.Xor(x1, x5);
            srcPtr += 16;
            length -= 16;
        }
        // Fold 128 - bits to 64 - bits.
        x2 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x10);
        x3 = Vector128.Create(~0, 0, ~0, 0).AsUInt64(); // _mm_setr_epi32 on x86
        x1 = Sse2.ShiftRightLogical128BitLane(x1, 8);
        x1 = Sse2.Xor(x1, x2);
        // k5, k0
        x0 = Sse2.LoadScalarVector128(k05PolyPtr + 0x4);
        x2 = Sse2.ShiftRightLogical128BitLane(x1, 4);
        x1 = Sse2.And(x1, x3);
        x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
        x1 = Sse2.Xor(x1, x2);
        // Barret reduce to 32-bits.
        // polynomial
        x0 = Sse2.LoadVector128(k05PolyPtr + 0x6);
        x2 = Sse2.And(x1, x3);
        x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x10);
        x2 = Sse2.And(x2, x3);
        x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00);
        x1 = Sse2.Xor(x1, x2);
        // Reduced CRC lives in 32-bit element 1.
        crc = (uint)Sse41.Extract(x1.AsInt32(), 1);
        // Hand the unaligned tail (if any) to the scalar implementation.
        return (buffer.Length - chunksize == 0
            ? crc
            : CalculateScalar(crc, buffer[chunksize..]));
// Negating +0.0f must flip only the sign bit, producing -0.0f (0x80000000) in
// every lane; checked via the integer view because 0.0f == -0.0f as floats.
public void Negate_NegateZero_Passes()
{
    Vector128<float> zero = Vector128.Create(0f);

    Vector128<float> result = Vector.Negate4D(zero);

    Assert.True(result.AsInt32().Equals(Vector128.Create(int.MinValue)));
}
// One Adler-32 update step over buf[0..len), maintaining the two 16-bit
// running sums in place. Uses the SSSE3 vectorization of adler32: 32-byte
// blocks, PSADBW to accumulate s1, and PMADDUBSW/PMADDWD against descending
// taps [32..1] for s2, reducing modulo ADLER_MODULE every NMAX bytes.
internal static void Step(ref ushort sum1, ref ushort sum2, byte[] buf, uint len)
{
    uint s1 = sum1;
    uint s2 = sum2;
    int bufPos = 0;

    /*
     * Process the data in blocks.
     */
    uint BLOCK_SIZE = 1 << 5;
    uint blocks = len / BLOCK_SIZE;
    len -= blocks * BLOCK_SIZE;
    while (blocks != 0)
    {
        uint n = Adler32Context.NMAX / BLOCK_SIZE; /* The NMAX constraint. */
        if (n > blocks)
        {
            n = blocks;
        }
        blocks -= n;
        // Descending weights: tap1 covers bytes 0-15, tap2 bytes 16-31.
        Vector128<byte> tap1 = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17).
            AsByte();
        Vector128<byte> tap2 = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1).AsByte();
        Vector128<byte> zero = Vector128.Create(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0).AsByte();
        Vector128<short> ones = Vector128.Create(1, 1, 1, 1, 1, 1, 1, 1);

        /*
         * Process n blocks of data. At most NMAX data bytes can be
         * processed before s2 must be reduced modulo BASE.
         */
        // v_ps starts at s1 * n because s1 is added into s2 once per block.
        Vector128<uint> v_ps = Vector128.Create(s1 * n, 0, 0, 0);
        Vector128<uint> v_s2 = Vector128.Create(s2, 0, 0, 0);
        Vector128<uint> v_s1 = Vector128.Create(0u, 0, 0, 0);
        do
        {
            /*
             * Load 32 input bytes.
             */
            Vector128<uint> bytes1 = Vector128.Create(BitConverter.ToUInt32(buf, bufPos),
                                                      BitConverter.ToUInt32(buf, bufPos + 4),
                                                      BitConverter.ToUInt32(buf, bufPos + 8),
                                                      BitConverter.ToUInt32(buf, bufPos + 12));
            bufPos += 16;
            Vector128<uint> bytes2 = Vector128.Create(BitConverter.ToUInt32(buf, bufPos),
                                                      BitConverter.ToUInt32(buf, bufPos + 4),
                                                      BitConverter.ToUInt32(buf, bufPos + 8),
                                                      BitConverter.ToUInt32(buf, bufPos + 12));
            bufPos += 16;

            /*
             * Add previous block byte sum to v_ps.
             */
            v_ps = Sse2.Add(v_ps, v_s1);

            /*
             * Horizontally add the bytes for s1, multiply-adds the
             * bytes by [ 32, 31, 30, ... ] for s2.
             */
            v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1.AsByte(), zero).AsUInt32());
            Vector128<short> mad1 = System.Runtime.Intrinsics.X86.Ssse3.MultiplyAddAdjacent(bytes1.AsByte(), tap1.AsSByte());
            v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1.AsInt16(), ones.AsInt16()).AsUInt32());
            v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2.AsByte(), zero).AsUInt32());
            Vector128<short> mad2 = System.Runtime.Intrinsics.X86.Ssse3.MultiplyAddAdjacent(bytes2.AsByte(), tap2.AsSByte());
            v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2.AsInt16(), ones.AsInt16()).AsUInt32());
        } while (--n != 0);
        // s2 gains 32 * (accumulated per-block s1 contributions).
        v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));

        /*
         * Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
         */
        // Shuffle constants 177/78 (0xB1/0x4E) perform the horizontal sum.
        // Note: s1 uses += (v_s1 started at zero) while s2 uses = (v_s2 was
        // seeded with the previous s2).
        v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, 177));
        v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, 78));
        s1 += (uint)Sse2.ConvertToInt32(v_s1.AsInt32());
        v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, 177));
        v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, 78));
        s2 = (uint)Sse2.ConvertToInt32(v_s2.AsInt32());

        /*
         * Reduce.
         */
        s1 %= Adler32Context.ADLER_MODULE;
        s2 %= Adler32Context.ADLER_MODULE;
    }

    /*
     * Handle leftover data (at most 31 bytes remain here).
     */
    if (len != 0)
    {
        if (len >= 16)
        {
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            s2 += s1 += buf[bufPos++];
            len -= 16;
        }
        while (len-- != 0)
        {
            s2 += s1 += buf[bufPos++];
        }
        if (s1 >= Adler32Context.ADLER_MODULE)
        {
            s1 -= Adler32Context.ADLER_MODULE;
        }
        s2 %= Adler32Context.ADLER_MODULE;
    }

    /*
     * Return the recombined sums.
     */
    sum1 = (ushort)(s1 & 0xFFFF);
    sum2 = (ushort)(s2 & 0xFFFF);
}