/// <summary>
/// Multiplies the two 64-bit lanes of <paramref name="left"/> with the matching lanes of
/// <paramref name="right"/>, keeping the low 64 bits of each product.
/// </summary>
/// <param name="left">First operand, treated as two 64-bit lanes.</param>
/// <param name="right">Second operand, treated as two 64-bit lanes.</param>
/// <returns>Per lane: lo(left) * lo(right) + ((lo(left) * hi(right) + hi(left) * lo(right)) &lt;&lt; 32).</returns>
/// <exception cref="CPUFeatureCheckException">Thrown when neither SSE4.1 nor SSE2 is reported as supported.</exception>
internal static v128 mul_long(v128 left, v128 right)
{
    if (Sse4_1.IsSse41Supported)
    {
        // Full 64-bit product of the low 32-bit halves of every lane.
        v128 lowHalvesProduct = Sse2.mul_epu32(left, right);

        // Swap the 32-bit halves inside each lane of 'right' so that the 32-bit multiply
        // pairs lo(left) with hi(right) and hi(left) with lo(right).
        v128 rightHalvesSwapped = Sse2.shuffle_epi32(right, Sse.SHUFFLE(2, 3, 0, 1));
        v128 crossTerms = Sse4_1.mullo_epi32(left, rightHalvesSwapped);

        // Add the two cross terms of each lane; the sums end up in elements 0 and 1,
        // while elements 2 and 3 are zero (second hadd operand is all zeroes).
        v128 crossSums = Ssse3.hadd_epi32(crossTerms, default(v128));

        // Place each sum in the upper 32 bits of its lane and a zero in the lower 32 bits.
        v128 crossInUpperHalves = Sse2.shuffle_epi32(crossSums, Sse.SHUFFLE(1, 3, 0, 3));

        return Sse2.add_epi64(crossInUpperHalves, lowHalvesProduct);
    }

    if (Sse2.IsSse2Supported)
    {
        // Bring the upper 32 bits of each lane down so mul_epu32 can read them.
        v128 leftUpper = Sse2.srli_epi64(left, 32);
        v128 rightUpper = Sse2.srli_epi64(right, 32);

        v128 lowTimesLow = Sse2.mul_epu32(left, right);
        v128 upperTimesLow = Sse2.mul_epu32(leftUpper, right);
        v128 lowTimesUpper = Sse2.mul_epu32(left, rightUpper);

        // Only the low 32 bits of the cross sum survive the shift into the upper half.
        v128 crossSum = Sse2.add_epi64(upperTimesLow, lowTimesUpper);
        v128 crossShifted = Sse2.slli_epi64(crossSum, 32);

        return Sse2.add_epi64(crossShifted, lowTimesLow);
    }

    throw new CPUFeatureCheckException();
}
/// <summary>
/// Returns the product of all eight components of <paramref name="x"/>, accumulated in
/// 32-bit unsigned arithmetic (the 32-bit multiplies keep only the low 32 bits).
/// </summary>
/// <param name="x">The vector whose components are multiplied together.</param>
/// <returns>The product of the eight components as a <see cref="uint"/>.</returns>
public static uint cprod(ushort8 x)
{
    if (Avx2.IsAvx2Supported)
    {
        // Reverse the four 32-bit element pairs, widen both vectors to 32-bit elements and
        // multiply; the low four products together cover all eight inputs exactly once.
        uint8 widened = (uint8)x;
        uint8 widenedReversed = (uint8)(ushort8)Sse2.shuffle_epi32(x, Sse.SHUFFLE(0, 1, 2, 3));
        v128 quarter = Avx.mm256_castsi256_si128(widened * widenedReversed);

        // Fold four partial products into two by multiplying with the element-reversed copy.
        v128 quarterReversed = Sse2.shuffle_epi32(quarter, Sse.SHUFFLE(0, 1, 2, 3));
        v128 half = Sse4_1.mullo_epi32(quarter, quarterReversed);

        // Bring 32-bit element 1 down to element 0 and do the final multiply.
        v128 halfNeighbour = Sse2.shufflelo_epi16(half, Sse.SHUFFLE(0, 0, 3, 2));
        v128 total = Sse4_1.mullo_epi32(half, halfNeighbour);

        return total.UInt0;
    }

    // Without AVX2: multiply the widened halves and defer to the four-element overload.
    return cprod((uint4)x.v4_0 * (uint4)x.v4_4);
}
/// <summary>
/// Returns the product of all eight components of <paramref name="x"/>; every multiply
/// keeps only the low 32 bits of its result.
/// </summary>
/// <param name="x">The vector whose components are multiplied together.</param>
/// <returns>The product of the eight components as an <see cref="int"/>.</returns>
public static int cprod(int8 x)
{
    if (Avx2.IsAvx2Supported)
    {
        // Multiply the lower 128-bit half with the upper one: eight values become four.
        v128 lowerHalf = Avx.mm256_castsi256_si128(x);
        v128 upperHalf = Avx2.mm256_extracti128_si256(x, 1);
        v128 quarter = Sse4_1.mullo_epi32(lowerHalf, upperHalf);

        // Multiply with the element-reversed copy: four values become two.
        v128 quarterReversed = Sse2.shuffle_epi32(quarter, Sse.SHUFFLE(0, 1, 2, 3));
        v128 half = Sse4_1.mullo_epi32(quarter, quarterReversed);

        // Bring 32-bit element 1 down to element 0 and do the final multiply.
        v128 halfNeighbour = Sse2.shufflelo_epi16(half, Sse.SHUFFLE(0, 0, 3, 2));
        v128 total = Sse4_1.mullo_epi32(half, halfNeighbour);

        return total.SInt0;
    }

    // Without AVX2: multiply the two halves and defer to the four-element overload.
    return cprod(x.v4_0 * x.v4_4);
}
/// <summary>
/// Returns the product of all sixteen components of <paramref name="x"/>, accumulated in
/// 32-bit unsigned arithmetic (the 32-bit multiplies keep only the low 32 bits).
/// </summary>
/// <param name="x">The vector whose components are multiplied together.</param>
/// <returns>The product of the sixteen components as a <see cref="uint"/>.</returns>
public static uint cprod(ushort16 x)
{
    if (Avx2.IsAvx2Supported)
    {
        v128 lower = x.v8_0;

        // For each eight-element half: widen it and its 32-bit-element-reversed copy to
        // 32-bit elements and multiply them.
        uint8 lowerProducts = (uint8)(ushort8)lower
                            * (uint8)(ushort8)Sse2.shuffle_epi32(lower, Sse.SHUFFLE(0, 1, 2, 3));
        uint8 upperProducts = (uint8)x.v8_8
                            * (uint8)(ushort8)Sse2.shuffle_epi32(x.v8_8, Sse.SHUFFLE(0, 1, 2, 3));

        // Combine both halves; the low four products cover all sixteen inputs exactly once.
        v128 quarter = Avx.mm256_castsi256_si128(lowerProducts * upperProducts);

        // Multiply with the element-reversed copy: four values become two.
        v128 quarterReversed = Sse2.shuffle_epi32(quarter, Sse.SHUFFLE(0, 1, 2, 3));
        v128 half = Sse4_1.mullo_epi32(quarter, quarterReversed);

        // Bring 32-bit element 1 down to element 0 and do the final multiply.
        v128 halfNeighbour = Sse2.shufflelo_epi16(half, Sse.SHUFFLE(0, 0, 3, 2));
        v128 total = Sse4_1.mullo_epi32(half, halfNeighbour);

        return total.UInt0;
    }

    // Without AVX2: multiply the widened halves and defer to the eight-element overload.
    return cprod((uint8)x.v8_0 * (uint8)x.v8_8);
}
/// <summary>
/// Multiplies the four 32-bit elements of <paramref name="left"/> with the matching elements
/// of <paramref name="right"/>, keeping the low 32 bits of each product.
/// </summary>
/// <param name="left">First operand, treated as four 32-bit elements.</param>
/// <param name="right">Second operand, treated as four 32-bit elements.</param>
/// <returns>The four low 32-bit products, in element order.</returns>
/// <exception cref="CPUFeatureCheckException">Thrown when neither SSE4.1 nor SSE2 is reported as supported.</exception>
internal static v128 mul_int(v128 left, v128 right)
{
    if (Sse4_1.IsSse41Supported)
    {
        return Sse4_1.mullo_epi32(left, right);
    }

    if (Sse2.IsSse2Supported)
    {
        // mul_epu32 only multiplies elements 0 and 2, yielding two 64-bit products.
        v128 evenProducts = Sse2.mul_epu32(left, right);

        // Copy elements 1 and 3 into positions 0 and 2 so the same instruction covers them.
        v128 leftOdd = Sse2.shuffle_epi32(left, Sse.SHUFFLE(3, 3, 1, 1));
        v128 rightOdd = Sse2.shuffle_epi32(right, Sse.SHUFFLE(3, 3, 1, 1));
        v128 oddProducts = Sse2.mul_epu32(leftOdd, rightOdd);

        // Interleave so the low 64 bits of each vector hold two low 32-bit products
        // (elements 0,1 and elements 2,3 respectively), then join those low 64-bit parts.
        v128 firstPair = Sse2.unpacklo_epi32(evenProducts, oddProducts);
        v128 secondPair = Sse2.unpackhi_epi32(evenProducts, oddProducts);

        return Sse2.unpacklo_epi64(firstPair, secondPair);
    }

    throw new CPUFeatureCheckException();
}