// Encodes the 8 bytes of <paramref name="value"/> as 16 output bytes, two per
// input byte, by splitting each byte into nibbles and offsetting both by
// ShortCharA. NOTE(review): the exact output alphabet depends on ShuffleMask,
// LowMask and ShortCharA, declared elsewhere — presumably a hex-like 'a'..'p'
// encoding; confirm against those constants.
private Vector128<byte> LongToUtf8_16(long value)
{
    Vector128<sbyte> v;
    if (Sse41.X64.IsSupported)
    {
        // 64-bit scalar insert is available: materialize the long directly.
        v = Vector128.CreateScalarUnsafe(value).AsSByte();
    }
    else
    {
        // 32-bit process: build the vector from the two 32-bit halves.
        var value0 = (int)value;
        var value1 = (int)((ulong)value >> 32);
        v = Sse41.Insert(Vector128.CreateScalarUnsafe(value0), value1, 1).AsSByte();
    }
    // Spread source bytes across 16-bit lanes per ShuffleMask, then per lane
    // compute (b >> 4) | ((b & LowMask) << 8) so both nibbles sit in separate
    // bytes, and add ShortCharA to both at once.
    var vector = Ssse3.Shuffle(v, ShuffleMask).AsInt16();
    return (Sse2.Add(Sse2.Or(Sse2.ShiftRightLogical(vector, 4), Sse2.ShiftLeftLogical(Sse2.And(vector, LowMask), 8)), ShortCharA).AsByte());
}
public void RunClassLclFldScenario_Load()
{
    // Exercise Ssse3.MultiplyHighRoundScale with both operands loaded from the
    // fields of a local class instance through pinned pointers.
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

    SimpleBinaryOpTest__MultiplyHighRoundScaleInt16 instance = new SimpleBinaryOpTest__MultiplyHighRoundScaleInt16();

    fixed (Vector128<Int16>* pLeft = &instance._fld1)
    fixed (Vector128<Int16>* pRight = &instance._fld2)
    {
        Vector128<Int16> left = Sse2.LoadVector128((Int16*)pLeft);
        Vector128<Int16> right = Sse2.LoadVector128((Int16*)pRight);
        Vector128<Int16> outcome = Ssse3.MultiplyHighRoundScale(left, right);

        Unsafe.Write(_dataTable.outArrayPtr, outcome);
        ValidateResult(instance._fld1, instance._fld2, _dataTable.outArrayPtr);
    }
}
// BLAKE2b-style diagonalization: rotates three of the four 256-bit state rows
// (each held as a low/high Vector128 pair) so the following mixing round
// operates on diagonals instead of columns. row1l/row1h are left untouched;
// b0 is used purely as scratch for the row3 half-swap.
private static void diagonalize(ref Vector128<ulong> row1l, ref Vector128<ulong> row2l, ref Vector128<ulong> row3l, ref Vector128<ulong> row4l, ref Vector128<ulong> row1h, ref Vector128<ulong> row2h, ref Vector128<ulong> row3h, ref Vector128<ulong> row4h, ref Vector128<ulong> b0)
{
    // Rotate row2 by one 64-bit word across the 256-bit pair: AlignRight by
    // 8 bytes splices adjacent halves together.
    var t0 = Ssse3.AlignRight(row2h.As<sbyte>(), row2l.As<sbyte>(), 8);
    var t1 = Ssse3.AlignRight(row2l.As<sbyte>(), row2h.As<sbyte>(), 8);
    row2l = t0.As<ulong>();
    row2h = t1.As<ulong>();
    // Rotate row3 by two words: simply swap its two halves.
    b0 = row3l;
    row3l = row3h;
    row3h = b0;
    // Rotate row4 by three words (one word in the opposite direction): same
    // splices as row2 but with the halves assigned crosswise.
    t0 = Ssse3.AlignRight(row4h.As<sbyte>(), row4l.As<sbyte>(), 8);
    t1 = Ssse3.AlignRight(row4l.As<sbyte>(), row4h.As<sbyte>(), 8);
    row4l = t1.As<ulong>();
    row4h = t0.As<ulong>();
}
public void RunClassLclFldScenario_Load()
{
    // Exercise Ssse3.HorizontalSubtractSaturate with both operands loaded from
    // the fields of a local class instance through pinned pointers.
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

    HorizontalBinaryOpTest__HorizontalSubtractSaturateInt16 instance = new HorizontalBinaryOpTest__HorizontalSubtractSaturateInt16();

    fixed (Vector128<Int16>* pLeft = &instance._fld1)
    fixed (Vector128<Int16>* pRight = &instance._fld2)
    {
        Vector128<Int16> left = Sse2.LoadVector128((Int16*)pLeft);
        Vector128<Int16> right = Sse2.LoadVector128((Int16*)pRight);
        Vector128<Int16> outcome = Ssse3.HorizontalSubtractSaturate(left, right);

        Unsafe.Write(_dataTable.outArrayPtr, outcome);
        ValidateResult(instance._fld1, instance._fld2, _dataTable.outArrayPtr);
    }
}
public void RunClassLclFldScenario_Load()
{
    // Exercise Ssse3.Sign on SByte operands loaded from the fields of a local
    // class instance through pinned pointers.
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

    SimpleBinaryOpTest__SignSByte instance = new SimpleBinaryOpTest__SignSByte();

    fixed (Vector128<SByte>* pLeft = &instance._fld1)
    fixed (Vector128<SByte>* pRight = &instance._fld2)
    {
        Vector128<SByte> left = Sse2.LoadVector128((SByte*)pLeft);
        Vector128<SByte> right = Sse2.LoadVector128((SByte*)pRight);
        Vector128<SByte> outcome = Ssse3.Sign(left, right);

        Unsafe.Write(_dataTable.outArrayPtr, outcome);
        ValidateResult(instance._fld1, instance._fld2, _dataTable.outArrayPtr);
    }
}
// Expands a single-channel line to three channels by replicating each source
// sample three times (gray -> RGB). With SSSE3 and T == byte, 16 input bytes
// are expanded to 48 output bytes per iteration using three PSHUFB replication
// masks; a scalar loop handles the remainder (or the whole line otherwise).
// NOTE(review): assumes the output buffer holds at least 3 * cb bytes —
// confirm at the call site.
unsafe void IConversionProcessor.ConvertLine(byte *ipstart, byte *opstart, nint cb)
{
    T* ip = (T*)ipstart, ipe = (T*)(ipstart + cb), op = (T*)opstart;
#if HWINTRINSICS
    if (typeof(T) == typeof(byte) && Ssse3.IsSupported && cb > Vector128<byte>.Count)
    {
        // Each 16-byte mask maps the 16 gray samples to one third of the
        // 48-byte replicated run.
        var mask = (ReadOnlySpan<byte>)(new byte[] { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15 });
        byte* pmask = (byte*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(mask));
        var vmask0 = Sse2.LoadVector128(pmask);
        var vmask1 = Sse2.LoadVector128(pmask + Vector128<byte>.Count);
        var vmask2 = Sse2.LoadVector128(pmask + Vector128<byte>.Count * 2);
        // Back off one vector so the final full 16-byte load never reads past
        // the end of the input line.
        ipe -= Vector128<byte>.Count;
        do
        {
            var v0 = Sse2.LoadVector128((byte*)ip);
            ip += Vector128<byte>.Count;
            Sse2.Store((byte*)op, Ssse3.Shuffle(v0, vmask0));
            Sse2.Store((byte*)op + Vector128<byte>.Count, Ssse3.Shuffle(v0, vmask1));
            Sse2.Store((byte*)op + Vector128<byte>.Count * 2, Ssse3.Shuffle(v0, vmask2));
            op += Vector128<byte>.Count * 3;
        } while (ip <= ipe);
        ipe += Vector128<byte>.Count;
    }
#endif
    // Scalar tail (or whole line when the vector path is unavailable).
    while (ip < ipe)
    {
        var i0 = *ip;
        op[0] = i0;
        op[1] = i0;
        op[2] = i0;
        ip++;
        op += 3;
    }
}
// DFA acceptance test using transition-function composition: each input byte b
// selects a 16-entry transition table _transitions[b], and PSHUFB applied as
// Shuffle(t, s) computes the composition "t after s" over states 0..15.
// Composition is associative, so seven bytes per iteration are combined as a
// balanced tree before folding into the running transition vector.
public bool Accepts(ReadOnlySpan<byte> input)
{
    // Identity permutation: state i maps to i.
    var transition = default(Vector128Impl).Id;
    int i = 0;
    for (; i + 6 < input.Length; i += 7)
    {
        var t1 = _transitions[input[i]];
        var t2 = _transitions[input[i + 1]];
        var t3 = _transitions[input[i + 2]];
        var t4 = _transitions[input[i + 3]];
        var t5 = _transitions[input[i + 4]];
        var t6 = _transitions[input[i + 5]];
        var t7 = _transitions[input[i + 6]];
        // Tree combine — NOTE the argument order: the SECOND operand of
        // Shuffle is applied first, so each pair reads right-to-left.
        var t01 = Ssse3.Shuffle(t1, transition);
        var t23 = Ssse3.Shuffle(t3, t2);
        var t45 = Ssse3.Shuffle(t5, t4);
        var t67 = Ssse3.Shuffle(t7, t6);
        var t0123 = Ssse3.Shuffle(t23, t01);
        var t4567 = Ssse3.Shuffle(t67, t45);
        transition = Ssse3.Shuffle(t4567, t0123);
    }
    // Fold any remaining bytes one at a time.
    for (; i < input.Length; ++i)
    {
        transition = Ssse3.Shuffle(_transitions[input[i]], transition);
    }
    // The lane at the start state holds the final state; accept if listed.
    var state = Sse41.Extract(transition, (byte)_start);
    bool found = false;
    for (int j = 0; j < _accept.Length; ++j)
    {
        found = found | (_accept[j] == state);
    }
    return (found);
}
// Splits a 128-bit value around an n-byte boundary using a sliding window into
// the _shuffleMasks table: outRight receives the bytes selected by maskA and
// outLeft the complementary bytes selected by maskB (maskA with every bit
// inverted — PSHUFB zeroes any lane whose index byte has its high bit set, so
// the two masks select disjoint parts of the input).
// NOTE(review): assumes 0 <= n <= 16 and that _shuffleMasks has at least
// maskPos + 16 entries — confirm against the table definition and callers.
static void ShiftRight128(Vector128<ulong> initial, uint n, out Vector128<ulong> outLeft, out Vector128<ulong> outRight)
{
    uint maskPos = 16 - n;
    Vector128<byte> maskA = Vector128.Create(_shuffleMasks[maskPos], _shuffleMasks[maskPos + 1], _shuffleMasks[maskPos + 2], _shuffleMasks[maskPos + 3], _shuffleMasks[maskPos + 4], _shuffleMasks[maskPos + 5], _shuffleMasks[maskPos + 6], _shuffleMasks[maskPos + 7], _shuffleMasks[maskPos + 8], _shuffleMasks[maskPos + 9], _shuffleMasks[maskPos + 10], _shuffleMasks[maskPos + 11], _shuffleMasks[maskPos + 12], _shuffleMasks[maskPos + 13], _shuffleMasks[maskPos + 14], _shuffleMasks[maskPos + 15]);
    // CompareEqual(0, 0) yields all-ones, so this inverts every bit of maskA.
    Vector128<byte> maskB = Sse2.Xor(maskA, Sse2.CompareEqual(Vector128<byte>.Zero, Vector128<byte>.Zero));
    outLeft = Ssse3.Shuffle(initial.AsByte(), maskB).AsUInt64();
    outRight = Ssse3.Shuffle(initial.AsByte(), maskA).AsUInt64();
}
// Returns the negative absolute value (-|x|) of each component of an int4.
public static int4 nabs(int4 x)
{
    if (Ssse3.IsSsse3Supported)
    {
        // 0 - |x| using PABSD followed by a subtraction from zero.
        v128 _nabs = Sse2.sub_epi32(default(v128), Ssse3.abs_epi32(*(v128 *)&x));
        return (*(int4 *)&_nabs);
    }
    else if (Sse2.IsSse2Supported)
    {
        // No PABSD: where x is already negative (0 > x) keep x, otherwise
        // select 0 - x.
        v128 _nabs = Mask.BlendV(Sse2.sub_epi32(default(v128), *(v128 *)&x), *(v128 *)&x, Sse2.cmpgt_epi32(default(v128), *(v128 *)&x));
        return (*(int4 *)&_nabs);
    }
    else
    {
        // Scalar fallback, component-wise.
        return (new int4(nabs(x.x), nabs(x.y), nabs(x.z), nabs(x.w)));
    }
}
// Decodes one BCn alpha tile: rI packs sixteen 3-bit palette indices
// (48 bits total) and rPal is the 8-entry alpha palette. The AVX2 path (AVX2
// implies SSSE3/SSE4.1) extracts all sixteen indices with variable shifts and
// resolves them in a single PSHUFB lookup; the scalar path walks the bits.
private unsafe static void BCnDecodeTileAlpha(Span<byte> output, Span<byte> rPal, ulong rI)
{
    if (Avx2.IsSupported)
    {
        Span<Vector128<byte>> outputAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(output);
        // Per-lane shift amounts select 3-bit fields at bit offsets 0/3/6/9;
        // the extra >> 12 below reaches offsets 12/15/18/21.
        Vector128<uint> shifts = Vector128.Create(0u, 3u, 6u, 9u);
        Vector128<uint> masks = Vector128.Create(7u);
        Vector128<byte> vClut;
        fixed (byte* pRPal = rPal)
        {
            // Load the 8 palette bytes into the low half of the lookup vector.
            vClut = Sse2.LoadScalarVector128((ulong*)pRPal).AsByte();
        }
        // First 24 bits cover indices 0..7; rI >> 24 covers indices 8..15.
        Vector128<uint> indices0 = Vector128.Create((uint)rI);
        Vector128<uint> indices1 = Vector128.Create((uint)(rI >> 24));
        Vector128<uint> indices00 = Avx2.ShiftRightLogicalVariable(indices0, shifts);
        Vector128<uint> indices10 = Avx2.ShiftRightLogicalVariable(indices1, shifts);
        Vector128<uint> indices01 = Sse2.ShiftRightLogical(indices00, 12);
        Vector128<uint> indices11 = Sse2.ShiftRightLogical(indices10, 12);
        indices00 = Sse2.And(indices00, masks);
        indices10 = Sse2.And(indices10, masks);
        indices01 = Sse2.And(indices01, masks);
        indices11 = Sse2.And(indices11, masks);
        // Narrow 32-bit -> 16-bit -> 8-bit so all 16 indices share one vector.
        Vector128<ushort> indicesW0 = Sse41.PackUnsignedSaturate(indices00.AsInt32(), indices01.AsInt32());
        Vector128<ushort> indicesW1 = Sse41.PackUnsignedSaturate(indices10.AsInt32(), indices11.AsInt32());
        Vector128<byte> indices = Sse2.PackUnsignedSaturate(indicesW0.AsInt16(), indicesW1.AsInt16());
        // One table lookup resolves every index against the palette.
        outputAsVector128[0] = Ssse3.Shuffle(vClut, indices);
    }
    else
    {
        // Scalar: consume three index bits per pixel.
        for (int i = 0; i < BlockWidth * BlockHeight; i++, rI >>= 3)
        {
            output[i] = rPal[(int)(rI & 7)];
        }
    }
}
public bool Accepts(ReadOnlySpan<byte> input)
{
    // Run the DFA: every input byte selects a 16-entry transition table which
    // is composed onto the accumulated transition vector via PSHUFB; the lane
    // at the start state then holds the final state.
    var composed = default(Vector128Impl).Id;
    for (int pos = 0; pos < input.Length; pos++)
    {
        composed = Ssse3.Shuffle(_transitions[input[pos]], composed);
    }

    var endState = Sse41.Extract(composed, (byte)_start);
    for (int idx = 0; idx < _accept.Length; idx++)
    {
        if (_accept[idx] == endState)
        {
            return true;
        }
    }
    return false;
}
// Returns 8 random bytes, one per component, each scaled into [min, max) for
// that component: (max - min) * rand16 keeps the scaled value in the high byte
// of each 16-bit product. The SSSE3 path extracts those high bytes (odd byte
// positions 1,3,...,15) with one shuffle; the fallback does the equivalent
// >> 8 and narrowing cast.
public byte8 NextByte8(byte8 min, byte8 max)
{
    Assert.IsNotSmaller(max.x0, min.x0);
    Assert.IsNotSmaller(max.x1, min.x1);
    Assert.IsNotSmaller(max.x2, min.x2);
    Assert.IsNotSmaller(max.x3, min.x3);
    Assert.IsNotSmaller(max.x4, min.x4);
    Assert.IsNotSmaller(max.x5, min.x5);
    Assert.IsNotSmaller(max.x6, min.x6);
    Assert.IsNotSmaller(max.x7, min.x7);
    if (Ssse3.IsSsse3Supported)
    {
        short8 temp = (short8)(max - min) * new short8(NextState(), NextState(), NextState(), NextState(), NextState(), NextState(), NextState(), NextState());
        // Pick the high byte of each 16-bit product (equivalent to >> 8).
        return (min + Ssse3.shuffle_epi8(temp, new byte8(1, 3, 5, 7, 9, 11, 13, 15)));
    }
    else
    {
        return (min + (byte8)(((short8)(max - min) * new short8(NextState(), NextState(), NextState(), NextState(), NextState(), NextState(), NextState(), NextState())) >> 8));
    }
}
private unsafe int SumVectorizedSse2(ReadOnlySpan<int> source)
{
    // Sums the span four lanes at a time with SSE2, reduces the accumulator
    // horizontally (PHADDD when SSSE3 is present, shuffle/add otherwise), and
    // finishes the last source.Length % 4 elements with scalar adds.
    int total;
    fixed (int* basePtr = source)
    {
        var accumulator = Vector128<int>.Zero;
        int vectorEnd = source.Length - source.Length % 4;
        int pos = 0;
        for (; pos < vectorEnd; pos += 4)
        {
            accumulator = Sse2.Add(accumulator, Sse2.LoadVector128(basePtr + pos));
        }

        if (Ssse3.IsSupported)
        {
            // Two horizontal adds fold all four lanes into lane 0.
            accumulator = Ssse3.HorizontalAdd(accumulator, accumulator);
            accumulator = Ssse3.HorizontalAdd(accumulator, accumulator);
        }
        else
        {
            // Swap 64-bit halves, add; then swap 32-bit pairs, add.
            accumulator = Sse2.Add(accumulator, Sse2.Shuffle(accumulator, 0x4E));
            accumulator = Sse2.Add(accumulator, Sse2.Shuffle(accumulator, 0xB1));
        }
        total = accumulator.ToScalar();

        for (; pos < source.Length; pos++)
        {
            total += basePtr[pos];
        }
    }
    return total;
}
private static void ReverseEndianess(uint *source, uint *dest, int len)
{
    // Byte-swaps len 32-bit words from source into dest. The SSSE3 path swaps
    // four words per PSHUFB; any remainder is handled by re-processing a final
    // overlapping vector at len - 4 (safe because dest already holds swapped
    // data there). Scalar fallback uses ReverseEndianness per word.
    int lanes = Vector128<uint>.Count;
    if (Ssse3.IsSupported && len >= lanes)
    {
        int pos;
        for (pos = 0; len - pos >= lanes; pos += lanes)
        {
            var chunk = Sse2.LoadVector128(source + pos);
            Sse2.Store(dest + pos, Ssse3.Shuffle(chunk.AsByte(), ReverseEndianess_32_128).AsUInt32());
        }
        if (pos < len)
        {
            // Overlapping tail: redo the last full vector ending at len.
            pos = len - lanes;
            var tail = Sse2.LoadVector128(source + pos);
            Sse2.Store(dest + pos, Ssse3.Shuffle(tail.AsByte(), ReverseEndianess_32_128).AsUInt32());
        }
        return;
    }
    for (int idx = 0; idx < len; idx++)
    {
        dest[idx] = BinaryPrimitives.ReverseEndianness(source[idx]);
    }
}
// Counts the even values in <paramref name="numbers"/>. The vector loop
// accumulates per-lane odd counts (value & 1) four ints at a time, reduces the
// accumulator horizontally, adds the scalar tail, and returns
// Length - oddCount. Safe for empty and short arrays (the vector loop simply
// does not run).
public static unsafe int CountEvenSIMD(int[] numbers)
{
    int counter = 0;
    int len = numbers.Length;
    fixed (int* num = numbers)
    {
        Vector128<int> vresult = Vector128<int>.Zero;
        Vector128<int> ones = Vector128.Create(1);
        int i = 0;
        int lastBlockIndex = len - (len % 4);
        while (i < lastBlockIndex)
        {
            var vec = Sse2.LoadVector128(num + i);
            // value & 1 is 1 for odd lanes, 0 for even lanes.
            var odds = Sse2.And(vec, ones);
            vresult = Sse2.Add(vresult, odds);
            i += 4;
        }
        // Fix: the horizontal add was previously issued without checking
        // Ssse3.IsSupported; guard it and fall back to the SSE2 shuffle/add
        // reduction used by the other vectorized sum routines in this file.
        if (Ssse3.IsSupported)
        {
            vresult = Ssse3.HorizontalAdd(vresult, vresult);
            vresult = Ssse3.HorizontalAdd(vresult, vresult);
        }
        else
        {
            vresult = Sse2.Add(vresult, Sse2.Shuffle(vresult, 0x4E));
            vresult = Sse2.Add(vresult, Sse2.Shuffle(vresult, 0xB1));
        }
        counter = vresult.ToScalar();
        while (i < len)
        {
            var odd = numbers[i] & 1;
            counter += odd;
            i += 1;
        }
    }
    return numbers.Length - counter;
}
static unsafe int SumVectorizedSse(ReadOnlySpan<int> source)
{
    // SSE2 sum of a span of ints: accumulate 4 lanes per iteration, fold the
    // lanes together (SSSE3 PHADDD if available, SSE2 shuffle/add otherwise),
    // then add any leftover elements one by one.
    int sum;
    fixed (int* data = source)
    {
        var lanes = Vector128<int>.Zero;
        int fullBlocks = source.Length - source.Length % 4;
        int cursor = 0;
        while (cursor < fullBlocks)
        {
            lanes = Sse2.Add(lanes, Sse2.LoadVector128(data + cursor));
            cursor += 4;
        }

        if (Ssse3.IsSupported)
        {
            // Fold 4 lanes -> 2 -> 1 with horizontal adds.
            lanes = Ssse3.HorizontalAdd(lanes, lanes);
            lanes = Ssse3.HorizontalAdd(lanes, lanes);
        }
        else
        {
            // 0x4E swaps the 64-bit halves; 0xB1 swaps adjacent 32-bit lanes.
            lanes = Sse2.Add(lanes, Sse2.Shuffle(lanes, 0x4E));
            lanes = Sse2.Add(lanes, Sse2.Shuffle(lanes, 0xB1));
        }
        sum = lanes.ToScalar();

        while (cursor < source.Length)
        {
            sum += data[cursor];
            cursor++;
        }
    }
    return sum;
}
// Per-byte leading-zero count for a byte3 (result in 0..8).
public static byte3 lzcnt(byte3 x)
{
    if (Ssse3.IsSsse3Supported)
    {
        // Nibble LUTs via PSHUFB: SHUFFLE_MASK_LO[i] is the lzcnt of a byte
        // whose high nibble is zero and low nibble is i (4 + lzcnt4(i));
        // SHUFFLE_MASK_HI[i] is lzcnt4(i), with 8 as the "high nibble zero"
        // sentinel so min_epu8 selects the low-nibble answer in that case.
        v128 NIBBLE_MASK = new v128(0x0F0F_0F0F);
        v128 SHUFFLE_MASK_LO = new v128(8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4);
        v128 SHUFFLE_MASK_HI = new v128(8, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
        return (Sse2.min_epu8(Ssse3.shuffle_epi8(SHUFFLE_MASK_LO, Sse2.and_si128(NIBBLE_MASK, x)), Ssse3.shuffle_epi8(SHUFFLE_MASK_HI, Sse2.and_si128(NIBBLE_MASK, Sse2.srli_epi16(x, 4)))));
    }
    else if (Sse2.IsSse2Supported)
    {
        // Branch-free binary search: at each step, if the shifted value is
        // zero the leading zeros are all in the examined half; otherwise
        // narrow into that half and reduce the running count n.
        byte3 y;
        byte3 n = 8;
        byte3 mask;
        y = x >> 4;
        mask = Sse2.cmpeq_epi8(y, default(v128));
        n = Mask.BlendV(n - 4, n, mask);
        x = Mask.BlendV(y, x, mask);
        y = x >> 2;
        mask = Sse2.cmpeq_epi8(y, default(v128));
        n = Mask.BlendV(n - 2, n, mask);
        x = Mask.BlendV(y, x, mask);
        y = x >> 1;
        mask = Sse2.cmpeq_epi8(y, default(v128));
        return (Mask.BlendV(n - 2, n - x, mask));
    }
    else
    {
        // Scalar fallback, component-wise.
        return (new byte3(lzcnt(x.x), lzcnt(x.y), lzcnt(x.z)));
    }
}
// Adler/Fletcher-style rolling checksum over buffer, continuing from the
// running sums s1 (byte sum) and s2 (sum of s1), returned packed as
// s1 | (s2 << 16). The SSE path processes 32-byte blocks: PSADBW accumulates
// the plain byte sum for s1, and PMADDUBSW against descending taps
// [32, 31, ...] plus the s1 * 32 correction term (v_ps << 5) produces s2.
// NOTE(review): BLOCK_SIZE, NMAX32 and MOD32 are declared elsewhere —
// presumably 32, the Adler NMAX bound, and the modulus; confirm there.
internal static unsafe uint GetSse(ReadOnlySpan<byte> buffer, uint s1, uint s2)
{
    uint len = (uint)buffer.Length;
    uint blocks = len / BLOCK_SIZE;
    len = len - blocks * BLOCK_SIZE;
    Vector128<sbyte> tap1 = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
    Vector128<sbyte> tap2 = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
    Vector128<byte> zero = Vector128<byte>.Zero;
    Vector128<short> ones = Vector128.Create(1, 1, 1, 1, 1, 1, 1, 1);
    fixed (byte* bufPtr = &MemoryMarshal.GetReference(buffer))
    {
        var buf = bufPtr;
        while (blocks != 0)
        {
            uint n = NMAX32 / BLOCK_SIZE;
            if (n > blocks)
            {
                n = blocks;
            }
            blocks -= n;
            // Process n blocks of data. At most NMAX data bytes can be
            // processed before s2 must be reduced modulo BASE.
            Vector128<uint> v_ps = Vector128.Create(0, 0, 0, s1 * n);
            Vector128<uint> v_s2 = Vector128.Create(0, 0, 0, s2);
            Vector128<uint> v_s1 = Vector128.Create(0u, 0, 0, 0);
            do
            {
                // Load 32 input bytes.
                Vector128<byte> bytes1 = Sse2.LoadVector128(&buf[0]);
                Vector128<byte> bytes2 = Sse2.LoadVector128(&buf[16]);
                // Add previous block byte sum to v_ps.
                v_ps = Sse2.Add(v_ps, v_s1);
                // Horizontally add the bytes for s1, multiply-adds the
                // bytes by [ 32, 31, 30, ... ] for s2.
                Vector128<ushort> sad1 = Sse2.SumAbsoluteDifferences(bytes1, zero);
                v_s1 = Sse2.Add(v_s1, sad1.AsUInt32());
                Vector128<short> mad11 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                Vector128<int> mad12 = Sse2.MultiplyAddAdjacent(mad11, ones);
                v_s2 = Sse2.Add(v_s2, mad12.AsUInt32());
                Vector128<ushort> sad2 = Sse2.SumAbsoluteDifferences(bytes2, zero);
                v_s1 = Sse2.Add(v_s1, sad2.AsUInt32());
                Vector128<short> mad21 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                Vector128<int> mad22 = Sse2.MultiplyAddAdjacent(mad21, ones);
                v_s2 = Sse2.Add(v_s2, mad22.AsUInt32());
                buf += BLOCK_SIZE;
                n--;
            } while (n != 0);
            // s2 += 32 * (accumulated previous s1 values).
            var shift = Sse2.ShiftLeftLogical(v_ps, 5);
            v_s2 = Sse2.Add(v_s2, shift);
            // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
            // A B C D -> B A D C
            const int S2301 = 2 << 6 | 3 << 4 | 0 << 2 | 1;
            // A B C D -> C D A B
            const int S1032 = 1 << 6 | 0 << 4 | 3 << 2 | 2;
            v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S2301));
            v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));
            s1 += Sse2.ConvertToUInt32(v_s1);
            v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
            v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));
            s2 = Sse2.ConvertToUInt32(v_s2);
            s1 %= MOD32;
            s2 %= MOD32;
        }
        // Scalar tail: classic unrolled s1/s2 update for the last < 32 bytes.
        if (len > 0)
        {
            if (len >= 16)
            {
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                s2 += (s1 += *buf++);
                len -= 16;
            }
            while (len-- > 0)
            {
                s2 += (s1 += *buf++);
            }
            if (s1 >= MOD32)
            {
                s1 -= MOD32;
            }
            s2 %= MOD32;
        }
        return (s1 | (s2 << 16));
    }
}
// Decodes block.Count frame-of-reference encoded ints into buffer: each value
// is base + slope * position plus a bitsPerAdjustment-bit packed adjustment
// read from reader.Buffer. Eight ints are decoded per iteration via the
// shuffle/multiply/shift trick, using precomputed per-bit-width masks from
// ShuffleMasks / MultiplyMasks (declared elsewhere).
// NOTE(review): uses the pre-3.0 Sse.StaticCast API; assumes block.Count is
// effectively processed in multiples of 4/8 and that reader.Buffer has
// readable slack beyond the packed data — confirm against the encoder.
public static unsafe void ReadVector(IntBlock block, BufferedReader reader, int[] buffer)
{
    // Build first unadjusted vector and per-vector increment
    Vector128<int> unadjusted = SetIncrement(block.Base, block.Slope);
    Vector128<int> increment = Set1(block.Slope * 4);
    if (block.BitsPerAdjustment == 0)
    {
        // If no adjustments, calculate in blocks and return
        fixed (int* resultPtr = buffer)
        {
            for (int i = 0; i < block.Count; i += 4)
            {
                Unsafe.WriteUnaligned(&resultPtr[i], unadjusted);
                unadjusted = Sse2.Add(unadjusted, increment);
            }
        }
        return;
    }
    fixed (byte* bufferPtr = reader.Buffer)
    fixed (int* resultPtr = buffer)
    fixed (sbyte* shuffleMaskPtr = ShuffleMasks)
    fixed (int* multiplyMaskPtr = MultiplyMasks)
    {
        byte bitsPerAdjustment = block.BitsPerAdjustment;
        int index = reader.Index;
        int count = block.Count;
        // Calculate bytes consumed for the first and second four ints decoded (different for odd bit lengths)
        byte bytesPerEight = bitsPerAdjustment;
        byte bytes1 = (byte)(bytesPerEight / 2);
        // Calculate how much to shift values (from top of each int to bottom)
        byte shiftRightBits = (byte)(32 - bitsPerAdjustment);
        // Get shuffle mask (to get correct bits) and multiply value (to shift to top of each int) for halves
        Vector128<sbyte> shuffle1 = Unsafe.ReadUnaligned<Vector128<sbyte>>(&shuffleMaskPtr[32 * bitsPerAdjustment]);
        Vector128<int> multiply1 = Unsafe.ReadUnaligned<Vector128<int>>(&multiplyMaskPtr[8 * bitsPerAdjustment]);
        Vector128<sbyte> shuffle2 = Unsafe.ReadUnaligned<Vector128<sbyte>>(&shuffleMaskPtr[32 * bitsPerAdjustment + 16]);
        Vector128<int> multiply2 = Unsafe.ReadUnaligned<Vector128<int>>(&multiplyMaskPtr[8 * bitsPerAdjustment + 4]);
        for (int i = 0; i < count; i += 8, index += bytesPerEight)
        {
            // Read source bytes
            Vector128<int> vector1 = Unsafe.ReadUnaligned<Vector128<int>>(&bufferPtr[index]);
            Vector128<int> vector2 = Unsafe.ReadUnaligned<Vector128<int>>(&bufferPtr[index + bytes1]);
            // Shuffle to get the right bytes in each integer
            vector1 = Sse.StaticCast<sbyte, int>(Ssse3.Shuffle(Sse.StaticCast<int, sbyte>(vector1), shuffle1));
            vector2 = Sse.StaticCast<sbyte, int>(Ssse3.Shuffle(Sse.StaticCast<int, sbyte>(vector2), shuffle2));
            // Multiply to shift each int so the desired bits are at the top
            vector1 = Sse41.MultiplyLow(vector1, multiply1);
            vector2 = Sse41.MultiplyLow(vector2, multiply2);
            // Shift the desired bits to the bottom and zero the top
            vector1 = Sse2.ShiftRightLogical(vector1, shiftRightBits);
            vector2 = Sse2.ShiftRightLogical(vector2, shiftRightBits);
            // Add the delta base value
            vector1 = Sse2.Add(vector1, unadjusted);
            unadjusted = Sse2.Add(unadjusted, increment);
            vector2 = Sse2.Add(vector2, unadjusted);
            unadjusted = Sse2.Add(unadjusted, increment);
            // Write the decoded integers
            Unsafe.WriteUnaligned(&resultPtr[i], vector1);
            Unsafe.WriteUnaligned(&resultPtr[i + 4], vector2);
        }
        reader.Index = index;
    }
}
/// <summary>
/// Absolute value of <paramref name="a"/> computed with the SSSE3 PABSD
/// instruction. NOTE(review): by x86 PABSD semantics, int.MinValue maps to
/// int.MinValue — confirm callers never rely on that case.
/// </summary>
private static int AbsSsse3(int a)
{
    // Place the scalar in lane 0, take per-lane absolute values, read lane 0.
    Vector128<uint> magnitude = Ssse3.Abs(Vector128.CreateScalarUnsafe(a));
    return (int)magnitude.ToScalar();
}
// Converts an NV12 input (planar 8-bit luma plus interleaved half-resolution
// chroma) into a Surface of 16-bit-per-channel pixels, upsampling 8-bit
// samples to 10-bit (<< 2) and forcing alpha to 0x3ff. The SSE4.1 path
// converts 16 pixels per iteration: widen luma, duplicate each UV pair for
// two pixels, interleave into Y/U/V tuples, reorder with shufMask, set alpha
// via alphaMask, shift, and store. Scalar paths handle the width remainder
// and the no-SSE case.
private unsafe static Surface ReadNv12(ResourceManager rm, ref SlotSurfaceConfig config, ref PlaneOffsets offsets)
{
    InputSurface input = ReadSurface(rm.Gmm, ref config, ref offsets, 1, 2);
    int width = input.Width;
    int height = input.Height;
    int yStride = GetPitch(width, 1);
    int uvStride = GetPitch(input.UvWidth, 2);
    Surface output = new Surface(rm.SurfacePool, width, height);
    if (Sse41.IsSupported)
    {
        // Reorders the interleaved Y/U/V bytes into the output channel order.
        Vector128<byte> shufMask = Vector128.Create(
            (byte)0, (byte)2, (byte)3, (byte)1,
            (byte)4, (byte)6, (byte)7, (byte)5,
            (byte)8, (byte)10, (byte)11, (byte)9,
            (byte)12, (byte)14, (byte)15, (byte)13);
        // 0xff in the top 16-bit lane of each 64-bit pixel: the alpha channel.
        Vector128<short> alphaMask = Vector128.Create(0xffUL << 48).AsInt16();
        int yStrideGap = yStride - width;
        int uvStrideGap = uvStride - input.UvWidth;
        int widthTrunc = width & ~0xf;
        fixed (Pixel* dstPtr = output.Data)
        {
            Pixel* op = dstPtr;
            fixed (byte* src0Ptr = input.Buffer0, src1Ptr = input.Buffer1)
            {
                byte* i0p = src0Ptr;
                for (int y = 0; y < height; y++)
                {
                    // Chroma rows are shared by two luma rows (y >> 1).
                    byte* i1p = src1Ptr + (y >> 1) * uvStride;
                    int x = 0;
                    for (; x < widthTrunc; x += 16, i0p += 16, i1p += 16)
                    {
                        // Widen 16 luma bytes to two short8 vectors.
                        Vector128<short> ya0 = Sse41.ConvertToVector128Int16(i0p);
                        Vector128<short> ya1 = Sse41.ConvertToVector128Int16(i0p + 8);
                        Vector128<byte> uv = Sse2.LoadVector128(i1p);
                        // Duplicate each 16-bit UV pair so it covers 2 pixels.
                        Vector128<short> uv0 = Sse2.UnpackLow(uv.AsInt16(), uv.AsInt16());
                        Vector128<short> uv1 = Sse2.UnpackHigh(uv.AsInt16(), uv.AsInt16());
                        // Interleave luma with its chroma pair.
                        Vector128<short> rgba0 = Sse2.UnpackLow(ya0, uv0);
                        Vector128<short> rgba1 = Sse2.UnpackHigh(ya0, uv0);
                        Vector128<short> rgba2 = Sse2.UnpackLow(ya1, uv1);
                        Vector128<short> rgba3 = Sse2.UnpackHigh(ya1, uv1);
                        rgba0 = Ssse3.Shuffle(rgba0.AsByte(), shufMask).AsInt16();
                        rgba1 = Ssse3.Shuffle(rgba1.AsByte(), shufMask).AsInt16();
                        rgba2 = Ssse3.Shuffle(rgba2.AsByte(), shufMask).AsInt16();
                        rgba3 = Ssse3.Shuffle(rgba3.AsByte(), shufMask).AsInt16();
                        // Widen the 8-bit channels to 16 bits per channel.
                        Vector128<short> rgba16_0 = Sse41.ConvertToVector128Int16(rgba0.AsByte());
                        Vector128<short> rgba16_1 = Sse41.ConvertToVector128Int16(HighToLow(rgba0.AsByte()));
                        Vector128<short> rgba16_2 = Sse41.ConvertToVector128Int16(rgba1.AsByte());
                        Vector128<short> rgba16_3 = Sse41.ConvertToVector128Int16(HighToLow(rgba1.AsByte()));
                        Vector128<short> rgba16_4 = Sse41.ConvertToVector128Int16(rgba2.AsByte());
                        Vector128<short> rgba16_5 = Sse41.ConvertToVector128Int16(HighToLow(rgba2.AsByte()));
                        Vector128<short> rgba16_6 = Sse41.ConvertToVector128Int16(rgba3.AsByte());
                        Vector128<short> rgba16_7 = Sse41.ConvertToVector128Int16(HighToLow(rgba3.AsByte()));
                        // Force alpha to 0xff before the << 2 (yields 0x3fc;
                        // the scalar path writes 0x3ff directly).
                        rgba16_0 = Sse2.Or(rgba16_0, alphaMask);
                        rgba16_1 = Sse2.Or(rgba16_1, alphaMask);
                        rgba16_2 = Sse2.Or(rgba16_2, alphaMask);
                        rgba16_3 = Sse2.Or(rgba16_3, alphaMask);
                        rgba16_4 = Sse2.Or(rgba16_4, alphaMask);
                        rgba16_5 = Sse2.Or(rgba16_5, alphaMask);
                        rgba16_6 = Sse2.Or(rgba16_6, alphaMask);
                        rgba16_7 = Sse2.Or(rgba16_7, alphaMask);
                        // Upsample 8-bit samples to 10-bit range.
                        rgba16_0 = Sse2.ShiftLeftLogical(rgba16_0, 2);
                        rgba16_1 = Sse2.ShiftLeftLogical(rgba16_1, 2);
                        rgba16_2 = Sse2.ShiftLeftLogical(rgba16_2, 2);
                        rgba16_3 = Sse2.ShiftLeftLogical(rgba16_3, 2);
                        rgba16_4 = Sse2.ShiftLeftLogical(rgba16_4, 2);
                        rgba16_5 = Sse2.ShiftLeftLogical(rgba16_5, 2);
                        rgba16_6 = Sse2.ShiftLeftLogical(rgba16_6, 2);
                        rgba16_7 = Sse2.ShiftLeftLogical(rgba16_7, 2);
                        Sse2.Store((short*)(op + (uint)x + 0), rgba16_0);
                        Sse2.Store((short*)(op + (uint)x + 2), rgba16_1);
                        Sse2.Store((short*)(op + (uint)x + 4), rgba16_2);
                        Sse2.Store((short*)(op + (uint)x + 6), rgba16_3);
                        Sse2.Store((short*)(op + (uint)x + 8), rgba16_4);
                        Sse2.Store((short*)(op + (uint)x + 10), rgba16_5);
                        Sse2.Store((short*)(op + (uint)x + 12), rgba16_6);
                        Sse2.Store((short*)(op + (uint)x + 14), rgba16_7);
                    }
                    // Scalar remainder: advance chroma every second pixel.
                    for (; x < width; x++, i1p += (x & 1) * 2)
                    {
                        Pixel* px = op + (uint)x;
                        px->R = Upsample(*i0p++);
                        px->G = Upsample(*i1p);
                        px->B = Upsample(*(i1p + 1));
                        px->A = 0x3ff;
                    }
                    op += width;
                    i0p += yStrideGap;
                    i1p += uvStrideGap;
                }
            }
        }
    }
    else
    {
        // Pure scalar conversion.
        for (int y = 0; y < height; y++)
        {
            int uvBase = (y >> 1) * uvStride;
            for (int x = 0; x < width; x++)
            {
                output.SetR(x, y, Upsample(input.Buffer0[y * yStride + x]));
                // Chroma is shared between horizontal pixel pairs (x & ~1).
                int uvOffs = uvBase + (x & ~1);
                output.SetG(x, y, Upsample(input.Buffer1[uvOffs]));
                output.SetB(x, y, Upsample(input.Buffer1[uvOffs + 1]));
                output.SetA(x, y, 0x3ff);
            }
        }
    }
    return (output);
}
// Converts the 16-bit-per-channel input Surface to packed 8-bit B,G,R,A bytes
// (downsampling with >> 2 in the SSSE3 path), then writes the buffer out with
// the configured block layout. The SSSE3 path packs 8 pixels per iteration:
// shift, saturating-pack 16->8 bits, and PSHUFB into BGRA byte order; scalar
// loops handle the remainder and the no-SSSE3 case via Downsample.
private unsafe static void WriteA8R8G8B8(ResourceManager rm, Surface input, ref OutputSurfaceConfig config, ref PlaneOffsets offsets)
{
    int width = input.Width;
    int height = input.Height;
    int stride = GetPitch(width, 4);
    int dstIndex = rm.BufferPool.Rent(height * stride, out Span<byte> dst);
    if (Ssse3.IsSupported)
    {
        // Reorders each packed 4-byte pixel from R,G,B,A to B,G,R,A.
        Vector128<byte> shuffleMask = Vector128.Create(
            (byte)2, (byte)1, (byte)0, (byte)3,
            (byte)6, (byte)5, (byte)4, (byte)7,
            (byte)10, (byte)9, (byte)8, (byte)11,
            (byte)14, (byte)13, (byte)12, (byte)15);
        int widthTrunc = width & ~7;
        int strideGap = stride - width * 4;
        fixed (Pixel* srcPtr = input.Data)
        {
            Pixel* ip = srcPtr;
            fixed (byte* dstPtr = dst)
            {
                byte* op = dstPtr;
                for (int y = 0; y < height; y++, ip += input.Width)
                {
                    int x = 0;
                    for (; x < widthTrunc; x += 8)
                    {
                        // Two pixels (4 shorts each) per 128-bit load.
                        Vector128<ushort> pixel12 = Sse2.LoadVector128((ushort*)(ip + (uint)x));
                        Vector128<ushort> pixel34 = Sse2.LoadVector128((ushort*)(ip + (uint)x + 2));
                        Vector128<ushort> pixel56 = Sse2.LoadVector128((ushort*)(ip + (uint)x + 4));
                        Vector128<ushort> pixel78 = Sse2.LoadVector128((ushort*)(ip + (uint)x + 6));
                        // 10-bit -> 8-bit range.
                        pixel12 = Sse2.ShiftRightLogical(pixel12, 2);
                        pixel34 = Sse2.ShiftRightLogical(pixel34, 2);
                        pixel56 = Sse2.ShiftRightLogical(pixel56, 2);
                        pixel78 = Sse2.ShiftRightLogical(pixel78, 2);
                        // Saturating narrow to bytes, then swizzle to BGRA.
                        Vector128<byte> pixel1234 = Sse2.PackUnsignedSaturate(pixel12.AsInt16(), pixel34.AsInt16());
                        Vector128<byte> pixel5678 = Sse2.PackUnsignedSaturate(pixel56.AsInt16(), pixel78.AsInt16());
                        pixel1234 = Ssse3.Shuffle(pixel1234, shuffleMask);
                        pixel5678 = Ssse3.Shuffle(pixel5678, shuffleMask);
                        Sse2.Store(op + 0x00, pixel1234);
                        Sse2.Store(op + 0x10, pixel5678);
                        op += 0x20;
                    }
                    // Scalar remainder for the last width % 8 pixels.
                    for (; x < width; x++)
                    {
                        Pixel* px = ip + (uint)x;
                        *(op + 0) = Downsample(px->B);
                        *(op + 1) = Downsample(px->G);
                        *(op + 2) = Downsample(px->R);
                        *(op + 3) = Downsample(px->A);
                        op += 4;
                    }
                    op += strideGap;
                }
            }
        }
    }
    else
    {
        // Pure scalar conversion.
        for (int y = 0; y < height; y++)
        {
            int baseOffs = y * stride;
            for (int x = 0; x < width; x++)
            {
                int offs = baseOffs + x * 4;
                dst[offs + 0] = Downsample(input.GetB(x, y));
                dst[offs + 1] = Downsample(input.GetG(x, y));
                dst[offs + 2] = Downsample(input.GetR(x, y));
                dst[offs + 3] = Downsample(input.GetA(x, y));
            }
        }
    }
    bool outLinear = config.OutBlkKind == 0;
    int gobBlocksInY = 1 << config.OutBlkHeight;
    WriteBuffer(rm, dst, offsets.LumaOffset, outLinear, width, height, 4, gobBlocksInY);
    rm.BufferPool.Return(dstIndex);
}
// JIT intrinsic test for Ssse3.AlignRight on sbyte vectors: verifies byte
// shifts of 27 and 5 into the 32-byte concatenation {vf2, vf1}, verifies that
// out-of-range counts (250, 228 — both >= 32) produce all zeros, and repeats
// the count-27 case through reflection to exercise the non-constant-operand
// (software fallback) path.
static unsafe int Main(string[] args)
{
    int testResult = Pass;
    if (Ssse3.IsSupported)
    {
        // inArray1 = 16..31, inArray2 = 0..15, so the concatenated source is
        // the byte sequence 0..31.
        using (TestTable<sbyte> sbyteTable = new TestTable<sbyte>(new sbyte[16] { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }, new sbyte[16] { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, new sbyte[16]))
        {
            var vf1 = Unsafe.Read<Vector128<sbyte>>(sbyteTable.inArray1Ptr);
            var vf2 = Unsafe.Read<Vector128<sbyte>>(sbyteTable.inArray2Ptr);
            // Shift by 27 bytes: expect 27..31 followed by zeros.
            var vf3 = Ssse3.AlignRight(vf1, vf2, 27);
            Unsafe.Write(sbyteTable.outArrayPtr, vf3);
            if (!sbyteTable.CheckResult((x, y, z) => (z[00] == 27) && (z[01] == 28) && (z[02] == 29) && (z[03] == 30) && (z[04] == 31) && (z[05] == 00) && (z[06] == 00) && (z[07] == 00) && (z[08] == 00) && (z[09] == 00) && (z[10] == 00) && (z[11] == 00) && (z[12] == 00) && (z[13] == 00) && (z[14] == 00) && (z[15] == 00)))
            {
                Console.WriteLine("SSE AlignRight failed on sbyte:");
                foreach (var item in sbyteTable.outArray)
                {
                    Console.Write(item + ", ");
                }
                Console.WriteLine();
                testResult = Fail;
            }
            // Shift by 5 bytes: expect 5..20.
            vf3 = Ssse3.AlignRight(vf1, vf2, 5);
            Unsafe.Write(sbyteTable.outArrayPtr, vf3);
            if (!sbyteTable.CheckResult((x, y, z) => (z[00] == 05) && (z[01] == 06) && (z[02] == 07) && (z[03] == 08) && (z[04] == 09) && (z[05] == 10) && (z[06] == 11) && (z[07] == 12) && (z[08] == 13) && (z[09] == 14) && (z[10] == 15) && (z[11] == 16) && (z[12] == 17) && (z[13] == 18) && (z[14] == 19) && (z[15] == 20)))
            {
                Console.WriteLine("SSE AlignRight failed on sbyte:");
                foreach (var item in sbyteTable.outArray)
                {
                    Console.Write(item + ", ");
                }
                Console.WriteLine();
                testResult = Fail;
            }
            // Shift count 250 (>= 32): the entire result must be zero.
            vf3 = Ssse3.AlignRight(vf1, vf2, 250);
            Unsafe.Write(sbyteTable.outArrayPtr, vf3);
            if (!sbyteTable.CheckResult((x, y, z) => (z[00] == 00) && (z[01] == 00) && (z[02] == 00) && (z[03] == 00) && (z[04] == 00) && (z[05] == 00) && (z[06] == 00) && (z[07] == 00) && (z[08] == 00) && (z[09] == 00) && (z[10] == 00) && (z[11] == 00) && (z[12] == 00) && (z[13] == 00) && (z[14] == 00) && (z[15] == 00)))
            {
                Console.WriteLine("SSE AlignRight failed on sbyte:");
                foreach (var item in sbyteTable.outArray)
                {
                    Console.Write(item + ", ");
                }
                Console.WriteLine();
                testResult = Fail;
            }
            // Shift count 228 (>= 32): also all zeros.
            vf3 = Ssse3.AlignRight(vf1, vf2, 228);
            Unsafe.Write(sbyteTable.outArrayPtr, vf3);
            if (!sbyteTable.CheckResult((x, y, z) => (z[00] == 00) && (z[01] == 00) && (z[02] == 00) && (z[03] == 00) && (z[04] == 00) && (z[05] == 00) && (z[06] == 00) && (z[07] == 00) && (z[08] == 00) && (z[09] == 00) && (z[10] == 00) && (z[11] == 00) && (z[12] == 00) && (z[13] == 00) && (z[14] == 00) && (z[15] == 00)))
            {
                Console.WriteLine("SSE AlignRight failed on sbyte:");
                foreach (var item in sbyteTable.outArray)
                {
                    Console.Write(item + ", ");
                }
                Console.WriteLine();
                testResult = Fail;
            }
            // Same as the count-27 case, invoked via reflection so the shift
            // count is not a JIT-time constant.
            vf3 = (Vector128<sbyte>) typeof(Ssse3).GetMethod(nameof(Ssse3.AlignRight), new Type[] { vf1.GetType(), vf2.GetType(), typeof(byte) }).Invoke(null, new object[] { vf1, vf2, (byte)(27) });
            Unsafe.Write(sbyteTable.outArrayPtr, vf3);
            if (!sbyteTable.CheckResult((x, y, z) => (z[00] == 27) && (z[01] == 28) && (z[02] == 29) && (z[03] == 30) && (z[04] == 31) && (z[05] == 00) && (z[06] == 00) && (z[07] == 00) && (z[08] == 00) && (z[09] == 00) && (z[10] == 00) && (z[11] == 00) && (z[12] == 00) && (z[13] == 00) && (z[14] == 00) && (z[15] == 00)))
            {
                Console.WriteLine("SSE AlignRight failed on sbyte:");
                foreach (var item in sbyteTable.outArray)
                {
                    Console.Write(item + ", ");
                }
                Console.WriteLine();
                testResult = Fail;
            }
        }
    }
    return (testResult);
}
// MessagePack serializer for float[]: writes nil for null, an array header,
// then each float as the Float32 marker byte followed by its 4 bytes in big
// endian (5 output bytes per element). The SSE4.2 path emits 3 floats (15
// bytes) per iteration: a PSHUFB byte-reverses each float into a gap layout
// (0x80 lanes produce zeros) and an OR fills the gaps with the Float32
// marker. The vector loop deliberately stops one group of 3 early so its
// 16-byte store never writes past the last scalar-written byte.
public unsafe void Serialize(ref MessagePackWriter writer, float[]?value, MessagePackSerializerOptions options)
{
    if (value == null)
    {
        writer.WriteNil();
        return;
    }
    var inputLength = value.Length;
    writer.WriteArrayHeader(inputLength);
    if (inputLength == 0)
    {
        return;
    }
    // output byte[] length can be calculated from input float[] length.
    var outputLength = inputLength * 5;
    var destination = writer.GetSpan(outputLength);
    fixed (byte* pDestination = &destination[0])
    {
        var outputIterator = pDestination;
        fixed (float* pSource = &value[0])
        {
            var inputEnd = pSource + inputLength;
            var inputIterator = (uint*)pSource;
            if (Sse42.IsSupported)
            {
                if (inputLength < 6)
                {
                    goto ProcessEach;
                }
                // Process 3 floats at once.
                // From 12 bytes to 15 bytes.
                var vectorConstant = Vector128.Create(MessagePackCode.Float32, 0, 0, 0, 0, MessagePackCode.Float32, 0, 0, 0, 0, MessagePackCode.Float32, 0, 0, 0, 0, 0);
                // 0x80 lanes become zero and are later OR-ed with the marker;
                // 3,2,1,0 etc. reverse each float's bytes to big endian.
                var vectorShuffle = Vector128.Create(0x80, 3, 2, 1, 0, 0x80, 7, 6, 5, 4, 0x80, 11, 10, 9, 8, 0x80);
                var vectorLoopLength = ((inputLength / 3) - 1) * 3;
                for (var vectorizedEnd = inputIterator + vectorLoopLength; inputIterator != vectorizedEnd; inputIterator += 3, outputIterator += 15)
                {
                    // new float[] { 1.0, -2.0, 3.5, } is byte[12] { 00, 00, 80, 3f, 00, 00, 00, c0, 00, 00, 60, 40 } in binary expression;
                    var current = Sse2.LoadVector128((byte*)inputIterator);
                    // Output binary should be byte[15] { ca, 3f, 80, 00, 00, ca, c0, 00, 00, 00, ca, 40, 60, 00, 00 };
                    Sse2.Store(outputIterator, Sse2.Or(Ssse3.Shuffle(current, vectorShuffle), vectorConstant));
                }
            }
        ProcessEach:
            while (inputIterator != inputEnd)
            {
                // Encode float as Big Endian
                *outputIterator++ = MessagePackCode.Float32;
                var current = *inputIterator++;
                *outputIterator++ = (byte)(current >> 24);
                *outputIterator++ = (byte)(current >> 16);
                *outputIterator++ = (byte)(current >> 8);
                *outputIterator++ = (byte)current;
            }
        }
    }
    writer.Advance(outputLength);
}
// PolyvalPowersTable updates the POLYVAL value in polyval to include length bytes
// of data from input, given the precomputed powers of the POLYVAL key in htbl.
// If the length is not divisible by 16, input is padded with zeros until it's a
// multiple of 16 bytes.
//
// Structure: each 8-block (128-byte) group is multiplied against the 8 key powers
// (highest-index data block * lowest power) and lazily aggregated into a 256-bit
// product (t = low half, xhi = high half); reduction by the field polynomial is
// interleaved with the multiplies and finished after the loop.
private static void PolyvalPowersTable(byte *polyval, byte *htbl, byte *input, int length)
{
    if (length == 0) { return; }
    // blocks = number of whole 16-byte blocks, remainder16 = bytes in the partial tail block.
    int blocks = Math.DivRem(length, 16, out int remainder16);
    // Bytes in whole blocks that do not fill a complete 128-byte (8-block) group.
    int remainder128 = length % 128 - remainder16;
    Vector128 <ulong> tmp0, tmp1, tmp2, tmp3, tmp4;
    // xhi holds the upper 128 bits of the accumulated 256-bit carry-less product.
    var xhi = Sse2.SetZeroVector128 <ulong>();
    // Reduction constant (0xc2000000... : 1) for the POLYVAL/GHASH field — see RFC 8452.
    var poly = Sse.StaticCast <uint, ulong>(Sse2.SetVector128(0xc2000000, 0, 0, 1));
    // Running accumulator, loaded from the caller's POLYVAL state.
    var t = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(polyval));
    if (remainder128 != 0)
    {
        // Handle the leading partial group (fewer than 8 whole blocks) first, pairing
        // block i with power htbl[(remainder128Blocks - i - 1)].
        int remainder128Blocks = remainder128 / 16;
        blocks -= remainder128Blocks;
        // Fold the current state into the first data block before multiplying.
        var data = Sse2.Xor(t, Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(input)));
        var h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[(remainder128Blocks - 1) * 16]));
        // Schoolbook 128x128 carry-less multiply: lo*lo, hi*hi, and the two cross terms.
        tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
        tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
        tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
        tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
        tmp2 = Sse2.Xor(tmp2, tmp3);
        for (int i = 1; i < remainder128Blocks; ++i)
        {
            // Accumulate the remaining blocks of the partial group into the same product.
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&input[i * 16]));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[(remainder128Blocks - i - 1) * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
        }
        // Split the middle term across the low (t) and high (xhi) 128-bit halves.
        tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
        tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
        xhi = Sse2.Xor(tmp3, tmp1);
        t = Sse2.Xor(tmp0, tmp2);
    }
    if (blocks != 0)
    {
        var fixedInput = input + remainder128;
        if (remainder128 == 0)
        {
            // First full 8-block group when there was no partial group: t holds the raw
            // (unmultiplied) state, so it is XORed straight into data block 0 below.
            var data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[7 * 16]));
            var h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[0 * 16]));
            tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[6 * 16]));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[1 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[5 * 16]));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[2 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[4 * 16]));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[3 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[3 * 16]));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[4 * 16]));
            // NOTE(review): tmp4 appears to be computed here only to mirror the scheduling
            // of the main loop below; it is never consumed in this branch — confirm intent.
            tmp4 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[2 * 16]));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[5 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[1 * 16]));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[6 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            // Block 0 carries the running state (Horner step) and the highest power.
            data = Sse2.Xor(t, Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[0 * 16])));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[7 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            // Recombine the cross term into low/high halves.
            tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
            tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
            xhi = Sse2.Xor(tmp3, tmp1);
            t = Sse2.Xor(tmp0, tmp2);
        }
        // Main loop over remaining full 8-block groups; the previous group's 256-bit
        // result (t, xhi) is reduced in two interleaved half-steps while the new
        // multiplies are issued, hiding the PCLMULQDQ latency.
        for (int i = remainder128 == 0 ? 8 : 0; i < blocks; i += 8)
        {
            var data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 7) * 16]));
            var h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[0 * 16]));
            tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 6) * 16]));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[1 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 5) * 16]));
            // First reduction half-step on t: multiply low half by poly and rotate.
            tmp4 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
            t = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[2 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            t = Sse2.Xor(t, tmp4);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 4) * 16]));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[3 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 3) * 16]));
            // Second reduction half-step on t.
            tmp4 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
            t = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[4 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            t = Sse2.Xor(t, tmp4);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 2) * 16]));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[5 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            // Fold in the deferred high half from the previous group; t is now fully reduced.
            t = Sse2.Xor(t, xhi);
            data = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[(i + 1) * 16]));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[6 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            // Horner step: fold reduced state into data block i against the highest power.
            data = Sse2.Xor(t, Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&fixedInput[i * 16])));
            h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(&htbl[7 * 16]));
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
            tmp0 = Sse2.Xor(tmp0, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
            tmp1 = Sse2.Xor(tmp1, tmp3);
            tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
            tmp2 = Sse2.Xor(tmp2, tmp3);
            tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
            tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
            xhi = Sse2.Xor(tmp3, tmp1);
            t = Sse2.Xor(tmp0, tmp2);
        }
    }
    if (blocks != 0 || remainder128 != 0)
    {
        // Final Montgomery-style reduction of the outstanding 256-bit value to 128 bits.
        tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
        t = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
        t = Sse2.Xor(tmp3, t);
        tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
        t = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
        t = Sse2.Xor(tmp3, t);
        t = Sse2.Xor(xhi, t);
    }
    if (remainder16 != 0)
    {
        // Zero-pad the trailing partial block to 16 bytes (stackalloc is zero-initialized
        // under default .locals init) and process it as one full block.
        byte *b = stackalloc byte[16];
        new Span <byte>(input + length - remainder16, remainder16).CopyTo(new Span <byte>(b, 16));
        var data = Sse2.Xor(t, Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(b)));
        var h = Sse.StaticCast <byte, ulong>(Sse2.LoadVector128(htbl));
        tmp2 = Pclmulqdq.CarrylessMultiply(data, h, 0x01);
        tmp0 = Pclmulqdq.CarrylessMultiply(data, h, 0x00);
        tmp1 = Pclmulqdq.CarrylessMultiply(data, h, 0x11);
        tmp3 = Pclmulqdq.CarrylessMultiply(data, h, 0x10);
        tmp2 = Sse2.Xor(tmp2, tmp3);
        tmp3 = Sse2.ShiftRightLogical128BitLane(tmp2, 8);
        tmp2 = Sse2.ShiftLeftLogical128BitLane(tmp2, 8);
        xhi = Sse2.Xor(tmp3, tmp1);
        t = Sse2.Xor(tmp0, tmp2);
        tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
        t = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
        t = Sse2.Xor(tmp3, t);
        tmp3 = Pclmulqdq.CarrylessMultiply(t, poly, 0x10);
        t = Sse.StaticCast <sbyte, ulong>(Ssse3.AlignRight(Sse.StaticCast <ulong, sbyte>(t), Sse.StaticCast <ulong, sbyte>(t), 8));
        t = Sse2.Xor(tmp3, t);
        t = Sse2.Xor(xhi, t);
    }
    // Write the updated POLYVAL state back to the caller.
    Sse2.Store(polyval, Sse.StaticCast <ulong, byte>(t));
}
/// <summary>
/// Serializes a <see cref="double"/> array as a MessagePack array of Float64 values.
/// Uses AVX2 (4 doubles/iteration) or SSSE3 (2 doubles/iteration) to byte-swap to
/// big endian in bulk; the type-code bytes are written with scalar stores.
/// </summary>
/// <param name="writer">Destination MessagePack writer; advanced by the exact output size.</param>
/// <param name="value">Array to serialize; <c>null</c> is written as Nil.</param>
/// <param name="options">Serializer options (unused by this specialized path).</param>
public unsafe void Serialize(ref MessagePackWriter writer, double[]?value, MessagePackSerializerOptions options)
{
    if (value == null) { writer.WriteNil(); return; }
    var inputLength = value.Length;
    writer.WriteArrayHeader(inputLength);
    if (inputLength == 0) { return; }
    // Each element is 1 code byte (0xcb) + 8 big-endian payload bytes.
    var outputLength = inputLength * 9;
    var destination = writer.GetSpan(outputLength);
    fixed(byte *pDestination = &destination[0])
    {
        var outputIterator = pDestination;
        fixed(double *pSource = &value[0])
        {
            var inputEnd = pSource + inputLength;
            // Doubles are reinterpreted as raw 64-bit words; only their bytes are moved.
            var inputIterator = (ulong *)pSource;
            if (Avx2.IsSupported)
            {
                const int ShiftCount = 2;
                const int Stride = 1 << ShiftCount; // 4 doubles (32 bytes) per iteration.
                // Too short to amortize the vector setup.
                if (inputLength < Stride << 1) { goto ProcessEach; }
                // Per-128-bit-lane shuffle that reverses each 8-byte group (LE -> BE).
                var vectorShuffle = Vector256.Create((byte)7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
                for (var vectorizedEnd = inputIterator + ((inputLength >> ShiftCount) << ShiftCount); inputIterator != vectorizedEnd; inputIterator += Stride)
                {
                    // Fetch 4 doubles.
                    var current = Avx.LoadVector256((byte *)inputIterator);
                    // Reorder Little Endian bytes to Big Endian.
                    var answer = Avx2.Shuffle(current, vectorShuffle).AsUInt64();
                    // Write 4 Big-Endian doubles, each preceded by the Float64 code byte.
                    *outputIterator++ = MessagePackCode.Float64;
                    *(ulong *)outputIterator = answer.GetElement(0);
                    outputIterator += 8;
                    *outputIterator++ = MessagePackCode.Float64;
                    *(ulong *)outputIterator = answer.GetElement(1);
                    outputIterator += 8;
                    *outputIterator++ = MessagePackCode.Float64;
                    *(ulong *)outputIterator = answer.GetElement(2);
                    outputIterator += 8;
                    *outputIterator++ = MessagePackCode.Float64;
                    *(ulong *)outputIterator = answer.GetElement(3);
                    outputIterator += 8;
                }
            }
            else if (Ssse3.IsSupported)
            {
                const int ShiftCount = 1;
                const int Stride = 1 << ShiftCount; // 2 doubles (16 bytes) per iteration.
                if (inputLength < Stride << 1) { goto ProcessEach; }
                // Shuffle that reverses each 8-byte half of the vector (LE -> BE).
                var vectorShuffle = Vector128.Create((byte)7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
                for (var vectorizedEnd = inputIterator + ((inputLength >> ShiftCount) << ShiftCount); inputIterator != vectorizedEnd; inputIterator += Stride)
                {
                    var current = Sse2.LoadVector128((byte *)inputIterator);
                    var answer = Ssse3.Shuffle(current, vectorShuffle).AsUInt64();
                    * outputIterator++ = MessagePackCode.Float64;
                    *(ulong *)outputIterator = answer.GetElement(0);
                    outputIterator += 8;
                    *outputIterator++ = MessagePackCode.Float64;
                    *(ulong *)outputIterator = answer.GetElement(1);
                    outputIterator += 8;
                }
            }
        ProcessEach:
            // Scalar tail: emit code byte + big-endian payload one element at a time.
            while (inputIterator != inputEnd)
            {
                * outputIterator++ = MessagePackCode.Float64;
                var current = *inputIterator++;
                * outputIterator++ = (byte)(current >> 56);
                * outputIterator++ = (byte)(current >> 48);
                * outputIterator++ = (byte)(current >> 40);
                * outputIterator++ = (byte)(current >> 32);
                * outputIterator++ = (byte)(current >> 24);
                * outputIterator++ = (byte)(current >> 16);
                * outputIterator++ = (byte)(current >> 8);
                * outputIterator++ = (byte)current;
            }
        }
    }
    writer.Advance(outputLength);
}
/// <summary>
/// Entry point for the AOT hardware-intrinsics expectation test. Verifies, for each
/// intrinsic class, that the runtime IsSupported result matches what the AOT compiler
/// was told to assume, and exercises one representative intrinsic per class.
/// Returns 100 on success, 1 on failure.
/// </summary>
static int Main()
{
    s_success = true;
    // We expect the AOT compiler generated HW intrinsics with the following characteristics:
    //
    // * TRUE  = IsSupported assumed to be true, no runtime check
    // * NULL  = IsSupported is a runtime check, code should be behind the check or bad things happen
    // * FALSE = IsSupported assumed to be false, no runtime check, PlatformNotSupportedException if used
    //
    // The test is compiled with multiple defines to test this.
#if BASELINE_INTRINSICS
    bool vectorsAccelerated = true;
    int byteVectorLength = 16;
    bool? Sse2AndBelow = true;
    bool? Sse3Group = null;
    bool? AesLzPcl = null;
    bool? Sse4142 = null;
    bool? PopCnt = null;
    bool? Avx12 = false;
    bool? FmaBmi12 = false;
    bool? Avxvnni = false;
#elif NON_VEX_INTRINSICS
    bool vectorsAccelerated = true;
    int byteVectorLength = 16;
    bool? Sse2AndBelow = true;
    bool? Sse3Group = true;
    bool? AesLzPcl = null;
    bool? Sse4142 = true;
    bool? PopCnt = null;
    bool? Avx12 = false;
    bool? FmaBmi12 = false;
    bool? Avxvnni = false;
#elif VEX_INTRINSICS
    bool vectorsAccelerated = true;
    int byteVectorLength = 32;
    bool? Sse2AndBelow = true;
    bool? Sse3Group = true;
    bool? AesLzPcl = null;
    bool? Sse4142 = true;
    bool? PopCnt = null;
    bool? Avx12 = true;
    bool? FmaBmi12 = null;
    bool? Avxvnni = null;
#else
#error Who dis?
#endif
    // Vector<T> acceleration and width must agree with the compiled-in expectation.
    if (vectorsAccelerated != Vector.IsHardwareAccelerated)
    {
        throw new Exception($"Vectors HW acceleration state unexpected - expected {vectorsAccelerated}, got {Vector.IsHardwareAccelerated}");
    }
    if (byteVectorLength != Vector <byte> .Count)
    {
        throw new Exception($"Unexpected vector length - expected {byteVectorLength}, got {Vector<byte>.Count}");
    }
    // Each Check pairs the expectation, a function pointer probe, the runtime
    // IsSupported value, and (where available) a trivial intrinsic invocation
    // whose result is known; null means the class has no testable member here.
    Check("Sse", Sse2AndBelow, &SseIsSupported, Sse.IsSupported, () => Sse.Subtract(Vector128 <float> .Zero, Vector128 <float> .Zero).Equals(Vector128 <float> .Zero));
    Check("Sse.X64", Sse2AndBelow, &SseX64IsSupported, Sse.X64.IsSupported, () => Sse.X64.ConvertToInt64WithTruncation(Vector128 <float> .Zero) == 0);
    Check("Sse2", Sse2AndBelow, &Sse2IsSupported, Sse2.IsSupported, () => Sse2.Extract(Vector128 <ushort> .Zero, 0) == 0);
    Check("Sse2.X64", Sse2AndBelow, &Sse2X64IsSupported, Sse2.X64.IsSupported, () => Sse2.X64.ConvertToInt64(Vector128 <double> .Zero) == 0);
    Check("Sse3", Sse3Group, &Sse3IsSupported, Sse3.IsSupported, () => Sse3.MoveHighAndDuplicate(Vector128 <float> .Zero).Equals(Vector128 <float> .Zero));
    Check("Sse3.X64", Sse3Group, &Sse3X64IsSupported, Sse3.X64.IsSupported, null);
    Check("Ssse3", Sse3Group, &Ssse3IsSupported, Ssse3.IsSupported, () => Ssse3.Abs(Vector128 <short> .Zero).Equals(Vector128 <ushort> .Zero));
    Check("Ssse3.X64", Sse3Group, &Ssse3X64IsSupported, Ssse3.X64.IsSupported, null);
    Check("Sse41", Sse4142, &Sse41IsSupported, Sse41.IsSupported, () => Sse41.Max(Vector128 <int> .Zero, Vector128 <int> .Zero).Equals(Vector128 <int> .Zero));
    Check("Sse41.X64", Sse4142, &Sse41X64IsSupported, Sse41.X64.IsSupported, () => Sse41.X64.Extract(Vector128 <long> .Zero, 0) == 0);
    Check("Sse42", Sse4142, &Sse42IsSupported, Sse42.IsSupported, () => Sse42.Crc32(0, 0) == 0);
    Check("Sse42.X64", Sse4142, &Sse42X64IsSupported, Sse42.X64.IsSupported, () => Sse42.X64.Crc32(0, 0) == 0);
    // AESKEYGENASSIST of the zero vector yields the S-box value of 0 (0x63 == 99) in every byte position touched.
    Check("Aes", AesLzPcl, &AesIsSupported, Aes.IsSupported, () => Aes.KeygenAssist(Vector128 <byte> .Zero, 0).Equals(Vector128.Create((byte)99)));
    Check("Aes.X64", AesLzPcl, &AesX64IsSupported, Aes.X64.IsSupported, null);
    Check("Avx", Avx12, &AvxIsSupported, Avx.IsSupported, () => Avx.Add(Vector256 <double> .Zero, Vector256 <double> .Zero).Equals(Vector256 <double> .Zero));
    Check("Avx.X64", Avx12, &AvxX64IsSupported, Avx.X64.IsSupported, null);
    Check("Avx2", Avx12, &Avx2IsSupported, Avx2.IsSupported, () => Avx2.Abs(Vector256 <int> .Zero).Equals(Vector256 <uint> .Zero));
    Check("Avx2.X64", Avx12, &Avx2X64IsSupported, Avx2.X64.IsSupported, null);
    Check("Bmi1", FmaBmi12, &Bmi1IsSupported, Bmi1.IsSupported, () => Bmi1.AndNot(0, 0) == 0);
    Check("Bmi1.X64", FmaBmi12, &Bmi1X64IsSupported, Bmi1.X64.IsSupported, () => Bmi1.X64.AndNot(0, 0) == 0);
    Check("Bmi2", FmaBmi12, &Bmi2IsSupported, Bmi2.IsSupported, () => Bmi2.MultiplyNoFlags(0, 0) == 0);
    Check("Bmi2.X64", FmaBmi12, &Bmi2X64IsSupported, Bmi2.X64.IsSupported, () => Bmi2.X64.MultiplyNoFlags(0, 0) == 0);
    Check("Fma", FmaBmi12, &FmaIsSupported, Fma.IsSupported, () => Fma.MultiplyAdd(Vector128 <float> .Zero, Vector128 <float> .Zero, Vector128 <float> .Zero).Equals(Vector128 <float> .Zero));
    Check("Fma.X64", FmaBmi12, &FmaX64IsSupported, Fma.X64.IsSupported, null);
    Check("Lzcnt", AesLzPcl, &LzcntIsSupported, Lzcnt.IsSupported, () => Lzcnt.LeadingZeroCount(0) == 32);
    Check("Lzcnt.X64", AesLzPcl, &LzcntX64IsSupported, Lzcnt.X64.IsSupported, () => Lzcnt.X64.LeadingZeroCount(0) == 64);
    Check("Pclmulqdq", AesLzPcl, &PclmulqdqIsSupported, Pclmulqdq.IsSupported, () => Pclmulqdq.CarrylessMultiply(Vector128 <long> .Zero, Vector128 <long> .Zero, 0).Equals(Vector128 <long> .Zero));
    Check("Pclmulqdq.X64", AesLzPcl, &PclmulqdqX64IsSupported, Pclmulqdq.X64.IsSupported, null);
    Check("Popcnt", PopCnt, &PopcntIsSupported, Popcnt.IsSupported, () => Popcnt.PopCount(0) == 0);
    Check("Popcnt.X64", PopCnt, &PopcntX64IsSupported, Popcnt.X64.IsSupported, () => Popcnt.X64.PopCount(0) == 0);
    Check("AvxVnni", Avxvnni, &AvxVnniIsSupported, AvxVnni.IsSupported, () => AvxVnni.MultiplyWideningAndAdd(Vector128 <int> .Zero, Vector128 <byte> .Zero, Vector128 <sbyte> .Zero).Equals(Vector128 <int> .Zero));
    Check("AvxVnni.X64", Avxvnni, &AvxVnniX64IsSupported, AvxVnni.X64.IsSupported, null);
    return(s_success ? 100 : 1);
}
/// <summary>
/// Finalizes a streaming Meow hash: absorbs the buffered residual bytes and the total
/// length into the eight 128-bit lanes, mixes the lanes down, and returns the 128-bit
/// hash. If <paramref name="store128"/> is non-empty, the full pre-fold lane state
/// (8 x 16 bytes) is also written to it.
/// </summary>
public static unsafe Vector128 <byte> End(ref State state, Span <byte> store128)
{
    long Len = state.TotalLengthInBytes;
    // xmm0-xmm7 are the hash accumulation lanes carried over from Absorb.
    Vector128 <byte> xmm0 = state.xmm0;
    Vector128 <byte> xmm1 = state.xmm1;
    Vector128 <byte> xmm2 = state.xmm2;
    Vector128 <byte> xmm3 = state.xmm3;
    Vector128 <byte> xmm4 = state.xmm4;
    Vector128 <byte> xmm5 = state.xmm5;
    Vector128 <byte> xmm6 = state.xmm6;
    Vector128 <byte> xmm7 = state.xmm7;
    // xmm8-xmm15 hold values to be appended (residual bytes and the length).
    Vector128 <byte> xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
    fixed(byte *rax = state.Buffer)
    {
        xmm9 = Vector128 <byte> .Zero;
        xmm11 = Vector128 <byte> .Zero;
        // Last points at the final (possibly partial) 16-byte chunk of the residual buffer.
        byte *Last = (byte *)rax + (Len & 0xf0);
        long Len8 = (Len & 0xf);
        if (Len8 > 0)
        {
            // Mask off bytes beyond the valid residual length so garbage never enters the hash.
            fixed(byte *MeowMaskLen = s_meowMaskLen)
            {
                xmm8 = Sse2.LoadVector128(&MeowMaskLen[0x10 - Len8]);
            }
            xmm9 = Sse2.LoadVector128(Last);
            xmm9 = Sse2.And(xmm9, xmm8);
        }
        if ((Len & 0x10) != 0)
        {
            // A full 16-byte chunk precedes the partial one; shift the partial into xmm11.
            xmm11 = xmm9;
            xmm9 = Sse2.LoadVector128(Last - 0x10);
        }
        // Build byte-rotated variants of the residual for the Meow mix inputs.
        xmm8 = xmm9;
        xmm10 = xmm9;
        xmm8 = Ssse3.AlignRight(xmm8, xmm11, 15);
        xmm10 = Ssse3.AlignRight(xmm10, xmm11, 1);
        // Encode the total length the same way (value plus its two rotations).
        xmm12 = Vector128 <byte> .Zero;
        xmm13 = Vector128 <byte> .Zero;
        xmm14 = Vector128 <byte> .Zero;
        xmm15 = Vector128.Create((ulong)Len, 0).AsByte();
        xmm12 = Ssse3.AlignRight(xmm12, xmm15, 15);
        xmm14 = Ssse3.AlignRight(xmm14, xmm15, 1);
#if MEOW_DUMP
        MEOW_DUMP_STATE("PostBlocks", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
        MEOW_DUMP_STATE("Residuals", xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
#endif
        // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty
        MEOW_MIX_REG(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, xmm8, xmm9, xmm10, xmm11);
        // NOTE(casey): Append the length, to avoid problems with our 32-byte padding
        MEOW_MIX_REG(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, xmm12, xmm13, xmm14, xmm15);
#if MEOW_DUMP
        MEOW_DUMP_STATE("PostAppend", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif
        //
        // NOTE(casey): Hash all full 32-byte blocks
        //
        long LaneCount = (Len >> 5) & 0x7;
        // Duff's-device-style unrolled dispatch: run exactly LaneCount (0..7) mixes.
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x00); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x20); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x40); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0x60); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0x80); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xa0); --LaneCount;
        if (LaneCount == 0) { goto MixDown; }
        MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0xc0); --LaneCount;
        //
        // NOTE(casey): Mix the eight lanes down to one 128-bit hash
        //
    MixDown:
#if MEOW_DUMP
        MEOW_DUMP_STATE("PostLanes", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif
        // Twelve shuffle rounds diffuse every lane into every other lane.
        MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
        MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
        MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
        MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);
        MEOW_SHUFFLE(ref xmm4, ref xmm5, xmm6, ref xmm0, ref xmm1, xmm2);
        MEOW_SHUFFLE(ref xmm5, ref xmm6, xmm7, ref xmm1, ref xmm2, xmm3);
        MEOW_SHUFFLE(ref xmm6, ref xmm7, xmm0, ref xmm2, ref xmm3, xmm4);
        MEOW_SHUFFLE(ref xmm7, ref xmm0, xmm1, ref xmm3, ref xmm4, xmm5);
        MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
        MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
        MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
        MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);
#if MEOW_DUMP
        MEOW_DUMP_STATE("PostMix", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif
        // NOTE(review): comparing a Span to null goes through the implicit array
        // conversion, so this effectively tests for a default/empty span — confirm
        // callers rely on that rather than Span.IsEmpty.
        if (store128 != null)
        {
            fixed(byte *store128Ptr = store128)
            {
                // Export the full 128-byte pre-fold state for callers that want it.
                Sse2.Store(store128Ptr + 0x00, xmm0);
                Sse2.Store(store128Ptr + 0x10, xmm1);
                Sse2.Store(store128Ptr + 0x20, xmm2);
                Sse2.Store(store128Ptr + 0x30, xmm3);
                Sse2.Store(store128Ptr + 0x40, xmm4);
                Sse2.Store(store128Ptr + 0x50, xmm5);
                Sse2.Store(store128Ptr + 0x60, xmm6);
                Sse2.Store(store128Ptr + 0x70, xmm7);
            }
        }
        // Fold the eight lanes down to one: pairwise 64-bit adds and xors.
        xmm0 = AddQ(xmm0, xmm2);
        xmm1 = AddQ(xmm1, xmm3);
        xmm4 = AddQ(xmm4, xmm6);
        xmm5 = AddQ(xmm5, xmm7);
        xmm0 = Sse2.Xor(xmm0, xmm1);
        xmm4 = Sse2.Xor(xmm4, xmm5);
        xmm0 = AddQ(xmm0, xmm4);
#if MEOW_DUMP
        MEOW_DUMP_STATE("PostFold", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif
        return(xmm0);
    }
}
/// <summary>
/// Computes an Adler-32 checksum over <paramref name="buffer"/>, continuing from the
/// running checksum <paramref name="adler"/>, using SSE2/SSSE3 intrinsics to process
/// 32 bytes per iteration. Requires <c>Ssse3.IsSupported</c>; the caller is expected
/// to have checked that before dispatching here.
/// </summary>
/// <param name="adler">Previous checksum value (s2 in the high 16 bits, s1 in the low 16 bits).</param>
/// <param name="buffer">Input bytes to fold into the checksum.</param>
/// <returns>The updated Adler-32 value, <c>s1 | (s2 &lt;&lt; 16)</c>.</returns>
private static unsafe uint CalculateSse(uint adler, ReadOnlySpan <byte> buffer)
{
    uint s1 = adler & 0xFFFF;
    uint s2 = (adler >> 16) & 0xFFFF;

    // Process the data in 32-byte blocks.
    const int BLOCK_SIZE = 1 << 5;

    uint length = (uint)buffer.Length;
    uint blocks = length / BLOCK_SIZE;
    length -= blocks * BLOCK_SIZE; // scalar tail length (< 32)

    fixed(byte *bufferPtr = buffer)
    fixed(byte *tapPtr = Tap1Tap2)
    {
        var localBufferPtr = bufferPtr;

        // Weights [32, 31, ..., 17] and [16, 15, ..., 1] (the _mm_setr_epi8 tables on x86),
        // used to fold byte positions into s2 in one multiply-add.
        Vector128 <sbyte> tap1 = Sse2.LoadVector128((sbyte *)tapPtr);
        Vector128 <sbyte> tap2 = Sse2.LoadVector128((sbyte *)(tapPtr + 0x10));
        Vector128 <byte> zero = Vector128 <byte> .Zero;
        var ones = Vector128.Create((short)1);

        while (blocks > 0)
        {
            // At most NMAX bytes may be accumulated before s1/s2 must be reduced
            // modulo BASE, or the 32-bit lanes would overflow.
            uint n = NMAX / BLOCK_SIZE;
            if (n > blocks)
            {
                n = blocks;
            }

            blocks -= n;

            // v_ps pre-loads s1 * n because each of the n blocks contributes the
            // incoming s1 once to s2; v_s1/v_s2 accumulate the per-block sums.
            Vector128 <uint> v_ps = Vector128.CreateScalar(s1 * n);
            Vector128 <uint> v_s2 = Vector128.CreateScalar(s2);
            Vector128 <uint> v_s1 = Vector128 <uint> .Zero;

            do
            {
                // Load 32 input bytes (unaligned-friendly LDDQU).
                Vector128 <byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr);
                Vector128 <byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 0x10);

                // Add previous block byte sum to v_ps.
                v_ps = Sse2.Add(v_ps, v_s1);

                // Horizontally add the bytes for s1; multiply-add the bytes by
                // [ 32, 31, 30, ... ] for s2.
                v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32());
                Vector128 <short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones).AsUInt32());
                v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32());
                Vector128 <short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones).AsUInt32());

                localBufferPtr += BLOCK_SIZE;
            }while (--n > 0);

            // s2 gains 32 * (sum of intermediate s1 values) == v_ps << 5.
            v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));

            // Sum epi32 lanes of v_s1 / v_s2 and accumulate into s1 / s2.
            const byte S2301 = 0b1011_0001; // A B C D -> B A D C
            const byte S1032 = 0b0100_1110; // A B C D -> C D A B

            // SAD leaves results in lanes 0 and 2 only, so one swap-and-add suffices for s1.
            v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));
            s1 += v_s1.ToScalar();

            v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
            v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));
            s2 = v_s2.ToScalar();

            // Reduce.
            s1 %= BASE;
            s2 %= BASE;
        }

        if (length > 0)
        {
            // Scalar tail: unrolled 16-byte chunk first, then byte-at-a-time.
            if (length >= 16)
            {
                s2 += s1 += localBufferPtr[0];
                s2 += s1 += localBufferPtr[1];
                s2 += s1 += localBufferPtr[2];
                s2 += s1 += localBufferPtr[3];
                s2 += s1 += localBufferPtr[4];
                s2 += s1 += localBufferPtr[5];
                s2 += s1 += localBufferPtr[6];
                s2 += s1 += localBufferPtr[7];
                s2 += s1 += localBufferPtr[8];
                s2 += s1 += localBufferPtr[9];
                s2 += s1 += localBufferPtr[10];
                s2 += s1 += localBufferPtr[11];
                s2 += s1 += localBufferPtr[12];
                s2 += s1 += localBufferPtr[13];
                s2 += s1 += localBufferPtr[14];
                s2 += s1 += localBufferPtr[15];

                localBufferPtr += 16;
                length -= 16;
            }

            while (length-- > 0)
            {
                s2 += s1 += *localBufferPtr++;
            }

            // s1 can exceed BASE by at most one multiple here, so a single
            // conditional subtraction replaces the modulo.
            if (s1 >= BASE)
            {
                s1 -= BASE;
            }

            s2 %= BASE;
        }

        return s1 | (s2 << 16);
    }
}
//
// NOTE(casey): Single block version
//
/// <summary>
/// Computes a 128-bit Meow hash of <paramref name="SourceInit"/> using SSE2/SSSE3/AES-NI
/// style mixing (via the MEOW_MIX / MEOW_SHUFFLE helpers declared elsewhere in this class).
/// </summary>
/// <param name="Seed128Init">128-byte seed; its eight 16-byte chunks initialize the hash lanes.</param>
/// <param name="SourceInit">Input bytes to hash.</param>
/// <returns>The final 128-bit hash value.</returns>
/// <remarks>
/// NOTE(review): depends on members not visible in this chunk — MEOW_MIX, MEOW_MIX_REG,
/// MEOW_SHUFFLE, AddQ, MEOW_PREFETCH, MEOW_PREFETCH_LIMIT, MEOW_PAGESIZE,
/// s_meowMaskLen, s_meowShiftAdjust — confirm against the containing class.
/// </remarks>
public static unsafe Vector128<byte> Hash(ReadOnlySpan<byte> Seed128Init, ReadOnlySpan<byte> SourceInit)
{
    Vector128<byte> xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;     // NOTE(casey): xmm0-xmm7 are the hash accumulation lanes
    Vector128<byte> xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; // NOTE(casey): xmm8-xmm15 hold values to be appended (residual, length)

    int Len = SourceInit.Length;

    fixed (byte* sourceInitPtr = SourceInit)
    fixed (byte* seedInitPtr = Seed128Init)
    {
        // Register-style pointer names mirror the original x64 assembly listing.
        byte* rax = sourceInitPtr;
        byte* rcx = seedInitPtr;

        //
        // NOTE(casey): Seed the eight hash registers
        //
        xmm0 = Sse2.LoadVector128(rcx + 0x00);
        xmm1 = Sse2.LoadVector128(rcx + 0x10);
        xmm2 = Sse2.LoadVector128(rcx + 0x20);
        xmm3 = Sse2.LoadVector128(rcx + 0x30);
        xmm4 = Sse2.LoadVector128(rcx + 0x40);
        xmm5 = Sse2.LoadVector128(rcx + 0x50);
        xmm6 = Sse2.LoadVector128(rcx + 0x60);
        xmm7 = Sse2.LoadVector128(rcx + 0x70);

        // MEOW_DUMP_STATE("Seed", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0);

        //
        // NOTE(casey): Hash all full 256-byte blocks
        //
        int BlockCount = (SourceInit.Length >> 8);
        if (BlockCount > MEOW_PREFETCH_LIMIT)
        {
            // NOTE(casey): For large input, modern Intel x64's can't hit full speed without prefetching, so we use this loop
            while (BlockCount-- > 0)
            {
                Sse.Prefetch0(rax + MEOW_PREFETCH + 0x00);
                Sse.Prefetch0(rax + MEOW_PREFETCH + 0x40);
                Sse.Prefetch0(rax + MEOW_PREFETCH + 0x80);
                Sse.Prefetch0(rax + MEOW_PREFETCH + 0xc0);

                MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0x00);
                MEOW_MIX(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, rax + 0x20);
                MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x40);
                MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x60);
                MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x80);
                MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0xa0);
                MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0xc0);
                MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xe0);

                rax += 0x100;
            }
        }
        else
        {
            // NOTE(casey): For small input, modern Intel x64's can't hit full speed _with_ prefetching (because of port pressure), so we use this loop.
            while (BlockCount-- > 0)
            {
                MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0x00);
                MEOW_MIX(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, rax + 0x20);
                MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x40);
                MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x60);
                MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x80);
                MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0xa0);
                MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0xc0);
                MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xe0);

                rax += 0x100;
            }
        }

#if MEOW_DUMP
        MEOW_DUMP_STATE("PostBlocks", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif
        //
        // NOTE(casey): Load any less-than-32-byte residual
        //
        xmm9 = Vector128<byte>.Zero;
        xmm11 = Vector128<byte>.Zero;

        //
        // TODO(casey): I need to put more thought into how the end-of-buffer stuff is actually working out here,
        // because I _think_ it may be possible to remove the first branch (on Len8) and let the mask zero out the
        // result, but it would take a little thought to make sure it couldn't read off the end of the buffer due
        // to the & 0xf on the align computation.
        //

        // NOTE(casey): First, we have to load the part that is _not_ 16-byte aligned
        byte* Last = (byte*)sourceInitPtr + (Len & ~0xf);
        int Len8 = (Len & 0xf);
        if (Len8 > 0)
        {
            // NOTE(casey): Load the mask early
            fixed (byte* MeowMaskLen = s_meowMaskLen)
            {
                xmm8 = Sse2.LoadVector128(&MeowMaskLen[0x10 - Len8]);
            }

            // NOTE(review): LastOk/Align keep the (possibly unaligned) 16-byte load
            // inside the input's memory page — presumably to avoid faulting past the
            // end of the buffer; confirm against the reference implementation.
            byte* LastOk = (byte*)((((ulong)(((byte*)sourceInitPtr) + Len - 1)) | (MEOW_PAGESIZE - 1)) - 16);
            int Align = (Last > LastOk) ? ((int)(ulong)Last) & 0xf : 0;

            fixed (byte* MeowShiftAdjust = s_meowShiftAdjust)
            {
                xmm10 = Sse2.LoadVector128(&MeowShiftAdjust[Align]);
            }
            xmm9 = Sse2.LoadVector128(Last - Align);
            xmm9 = Ssse3.Shuffle(xmm9, xmm10);

            // NOTE(jeffr): and off the extra bytes
            xmm9 = Sse2.And(xmm9, xmm8);
        }

        // NOTE(casey): Next, we have to load the part that _is_ 16-byte aligned
        if ((Len & 0x10) != 0)
        {
            xmm11 = xmm9;
            xmm9 = Sse2.LoadVector128(Last - 0x10);
        }

        //
        // NOTE(casey): Construct the residual and length ingests
        //
        xmm8 = xmm9;
        xmm10 = xmm9;
        xmm8 = Ssse3.AlignRight(xmm8, xmm11, 15);
        xmm10 = Ssse3.AlignRight(xmm10, xmm11, 1);

        // NOTE(casey): We have room for a 128-bit nonce and a 64-bit nonce here, but
        // the decision was made to leave them zero'd so as not to confuse people
        // about how to use them or what security implications they had.
        xmm12 = Vector128<byte>.Zero;
        xmm13 = Vector128<byte>.Zero;
        xmm14 = Vector128<byte>.Zero;
        xmm15 = Vector128.Create((ulong)Len, 0).AsByte();
        xmm12 = Ssse3.AlignRight(xmm12, xmm15, 15);
        xmm14 = Ssse3.AlignRight(xmm14, xmm15, 1);

#if MEOW_DUMP
        MEOW_DUMP_STATE("Residuals", xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
#endif

        // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty
        MEOW_MIX_REG(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, xmm8, xmm9, xmm10, xmm11);

        // NOTE(casey): Append the length, to avoid problems with our 32-byte padding
        MEOW_MIX_REG(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, xmm12, xmm13, xmm14, xmm15);

#if MEOW_DUMP
        MEOW_DUMP_STATE("PostAppend", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

        //
        // NOTE(casey): Hash all full 32-byte blocks
        //
        // (Unrolled with goto: each lane mixes one more 32-byte chunk until the
        // 0..7 remaining lane-count is exhausted.)
        int LaneCount = (Len >> 5) & 0x7;
        if (LaneCount == 0)
        {
            goto MixDown;
        }

        MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x00);
        --LaneCount;
        if (LaneCount == 0)
        {
            goto MixDown;
        }

        MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x20);
        --LaneCount;
        if (LaneCount == 0)
        {
            goto MixDown;
        }

        MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x40);
        --LaneCount;
        if (LaneCount == 0)
        {
            goto MixDown;
        }

        MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0x60);
        --LaneCount;
        if (LaneCount == 0)
        {
            goto MixDown;
        }

        MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0x80);
        --LaneCount;
        if (LaneCount == 0)
        {
            goto MixDown;
        }

        MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xa0);
        --LaneCount;
        if (LaneCount == 0)
        {
            goto MixDown;
        }

        MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0xc0);
        --LaneCount;

        //
        // NOTE(casey): Mix the eight lanes down to one 128-bit hash
        //
MixDown:
#if MEOW_DUMP
        MEOW_DUMP_STATE("PostLanes", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif
        MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
        MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
        MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
        MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);
        MEOW_SHUFFLE(ref xmm4, ref xmm5, xmm6, ref xmm0, ref xmm1, xmm2);
        MEOW_SHUFFLE(ref xmm5, ref xmm6, xmm7, ref xmm1, ref xmm2, xmm3);
        MEOW_SHUFFLE(ref xmm6, ref xmm7, xmm0, ref xmm2, ref xmm3, xmm4);
        MEOW_SHUFFLE(ref xmm7, ref xmm0, xmm1, ref xmm3, ref xmm4, xmm5);
        MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
        MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
        MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
        MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);

#if MEOW_DUMP
        MEOW_DUMP_STATE("PostMix", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

        // Fold the eight lanes down to a single 128-bit result.
        xmm0 = AddQ(xmm0, xmm2);
        xmm1 = AddQ(xmm1, xmm3);
        xmm4 = AddQ(xmm4, xmm6);
        xmm5 = AddQ(xmm5, xmm7);
        xmm0 = Sse2.Xor(xmm0, xmm1);
        xmm4 = Sse2.Xor(xmm4, xmm5);
        xmm0 = AddQ(xmm0, xmm4);

#if MEOW_DUMP
        MEOW_DUMP_STATE("PostFold", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif
        return (xmm0);
    }
}