/// <summary>
/// Builds, for each of the eight lanes, a 16-bit mask with <paramref name="numBits"/> consecutive bits set,
/// the lowest of which is at bit position <paramref name="index"/>.
/// </summary>
/// <param name="numBits">Per-lane number of bits to set; asserted to lie in [0, 16 - index].</param>
/// <param name="index">Per-lane position of the lowest set bit; asserted to lie in [0, 16]. Defaults to 0 in every lane.</param>
/// <returns>The per-lane bit masks.</returns>
public static ushort8 bitmask16(ushort8 numBits, ushort8 index = default(ushort8))
{
    Assert.IsBetween(index.x0, 0u, 16u);
    Assert.IsBetween(index.x1, 0u, 16u);
    Assert.IsBetween(index.x2, 0u, 16u);
    Assert.IsBetween(index.x3, 0u, 16u);
    Assert.IsBetween(index.x4, 0u, 16u);
    Assert.IsBetween(index.x5, 0u, 16u);
    Assert.IsBetween(index.x6, 0u, 16u);
    Assert.IsBetween(index.x7, 0u, 16u);
    Assert.IsBetween(numBits.x0, 0u, 16u - index.x0);
    Assert.IsBetween(numBits.x1, 0u, 16u - index.x1);
    Assert.IsBetween(numBits.x2, 0u, 16u - index.x2);
    Assert.IsBetween(numBits.x3, 0u, 16u - index.x3);
    Assert.IsBetween(numBits.x4, 0u, 16u - index.x4);
    Assert.IsBetween(numBits.x5, 0u, 16u - index.x5);
    Assert.IsBetween(numBits.x6, 0u, 16u - index.x6);
    Assert.IsBetween(numBits.x7, 0u, 16u - index.x7);

    // From here on 'index' no longer holds a bit position but the mask 0xFFFF << index,
    // i.e. every bit at or above the requested start position.
    // This must happen for BOTH code paths: the andnot() below only yields the wanted
    // bit range when it is fed this mask rather than the raw index.
    index = shl(ushort.MaxValue, index);

    if (Sse2.IsSse2Supported)
    {
        // Lanes asking for all 16 bits are forced to all-ones explicitly.
        v128 isMaxBitsMask = Sse2.cmpeq_epi16(numBits, new ushort8(16));

        // mask & ~(mask << numBits) keeps exactly 'numBits' bits starting at the start position.
        return isMaxBitsMask | andnot(index, shl(index, numBits));
    }
    else
    {
        // Same computation; the all-ones override is derived from the per-lane comparison result.
        return (ushort8)(-toint16(numBits == 16)) | andnot(index, shl(index, numBits));
    }
}
/// <summary>
/// Divides eight unsigned bytes lane-wise using bit-by-bit long division carried out in 16-bit lanes,
/// producing quotient and remainder at once.
/// </summary>
/// <param name="dividend">Per-lane dividends.</param>
/// <param name="divisor">Per-lane divisors; every lane is asserted to be non-zero.</param>
/// <param name="remainder">Receives the per-lane remainders.</param>
/// <returns>The per-lane quotients.</returns>
/// <exception cref="CPUFeatureCheckException">Thrown when SSE2 is not supported.</exception>
internal static byte8 vdivrem_byte_SSE_FALLBACK(byte8 dividend, byte8 divisor, out byte8 remainder)
{
    Assert.AreNotEqual(divisor.x0, 0);
    Assert.AreNotEqual(divisor.x1, 0);
    Assert.AreNotEqual(divisor.x2, 0);
    Assert.AreNotEqual(divisor.x3, 0);
    Assert.AreNotEqual(divisor.x4, 0);
    Assert.AreNotEqual(divisor.x5, 0);
    Assert.AreNotEqual(divisor.x6, 0);
    Assert.AreNotEqual(divisor.x7, 0);

    if (Sse2.IsSse2Supported)
    {
        ushort8 quotients = ushort8.zero;
        ushort8 remainders = ushort8.zero;

        // All work is done on the operands widened to 16-bit lanes.
        ushort8 divisorCast = divisor;
        ushort8 dividendCast = dividend;

        // Step for dividend bit 7: quotient and remainder are still zero, so no shift is needed yet.
        remainders |= (new ushort8(1) & (dividendCast >> 7));

        // min(divisor, remainder) == divisor  <=>  divisor <= remainder (unsigned),
        // i.e. the divisor fits into the running remainder.
        v128 subtractDivisorFromRemainder = Sse2.cmpeq_epi16(maxmath.min(divisorCast, remainders), divisorCast);

        remainders -= Mask.BlendV(default(v128), divisorCast, subtractDivisorFromRemainder);
        quotients |= new ushort8(1) & subtractDivisorFromRemainder;

        // Steps for dividend bits 6 down to 1.
        for (int i = 6; i > 0; i--)
        {
            quotients <<= 1;
            remainders <<= 1;

            remainders |= (new ushort8(1) & (dividendCast >> i));

            // Compare whole 16-bit lanes, exactly like the first and the last step do.
            subtractDivisorFromRemainder = Sse2.cmpeq_epi16(maxmath.min(divisorCast, remainders), divisorCast);

            remainders -= Mask.BlendV(default(v128), divisorCast, subtractDivisorFromRemainder);
            quotients |= new ushort8(1) & subtractDivisorFromRemainder;
        }

        // Step for dividend bit 0.
        remainders <<= 1;
        quotients <<= 1;

        remainders |= new ushort8(1) & dividendCast;

        subtractDivisorFromRemainder = Sse2.cmpeq_epi16(maxmath.min(divisorCast, remainders), divisorCast);

        remainders -= Mask.BlendV(default(v128), divisorCast, subtractDivisorFromRemainder);
        quotients |= new ushort8(1) & subtractDivisorFromRemainder;

        // Narrow both results back to bytes in one instruction:
        // the low 8 bytes hold the remainders, the high 8 bytes hold the quotients.
        byte16 temp = Sse2.packus_epi16(remainders, quotients);

        remainder = temp.v8_0;
        return temp.v8_8;
    }
    else
    {
        throw new CPUFeatureCheckException();
    }
}
/// <summary>
/// Counts the trailing zero bits of each 16-bit lane. The SIMD paths yield 16 for a lane that is zero.
/// </summary>
/// <param name="x">The lanes to inspect.</param>
/// <returns>Per-lane trailing zero counts.</returns>
public static ushort8 tzcnt(ushort8 x)
{
    if (Ssse3.IsSsse3Supported)
    {
        v128 lowNibbleBits = new v128(0x0F0F_0F0F);

        // Lookup tables indexed by a nibble value: trailing zero count of that nibble,
        // offset by 4 for the high nibble, and 16 for an empty nibble so it loses every min().
        v128 lowNibbleTable  = new v128(16, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0);
        v128 highNibbleTable = new v128(16, 4, 5, 4, 6, 4, 5, 4, 7, 4, 5, 4, 6, 4, 5, 4);

        v128 lowNibbles = Sse2.and_si128(lowNibbleBits, x);
        v128 highNibbles = Sse2.and_si128(lowNibbleBits, Sse2.srli_epi16(x, 4));

        // Trailing zero count of every byte (16 for a zero byte).
        v128 perByte = Sse2.min_epu8(Ssse3.shuffle_epi8(lowNibbleTable, lowNibbles),
                                     Ssse3.shuffle_epi8(highNibbleTable, highNibbles));

        // Move (count of the upper byte + 8) into the lower byte position of each 16-bit lane.
        v128 upperByteCount = Sse2.srli_epi16(Sse2.add_epi8(perByte, Sse2.set1_epi8(8)), 8);

        // Low byte: min(lower count, upper count + 8); high byte: min(anything, 0) == 0.
        return Sse2.min_epu8(perByte, upperByteCount);
    }
    else if (Sse2.IsSse2Supported)
    {
        v128 none = default(v128);

        // x & -x isolates the lowest set bit of every lane (0 when the lane is 0).
        v128 lowestBit = x & (ushort8)(-((short8)x));

        // Each term contributes its weight when the isolated bit is absent from the tested positions.
        ushort8 laneIsZero = Mask.BlendV(none, new ushort8(1), Sse2.cmpeq_epi16(lowestBit, none));
        ushort8 weight8    = Mask.BlendV(none, new ushort8(8), Sse2.cmpeq_epi16(lowestBit & (ushort8)0x00FF, none));
        ushort8 weight4    = Mask.BlendV(none, new ushort8(4), Sse2.cmpeq_epi16(lowestBit & (ushort8)0x0F0F, none));
        ushort8 weight2    = Mask.BlendV(none, new ushort8(2), Sse2.cmpeq_epi16(lowestBit & (ushort8)0x3333, none));
        ushort8 weight1    = Mask.BlendV(none, new ushort8(1), Sse2.cmpeq_epi16(lowestBit & (ushort8)0x5555, none));

        return (laneIsZero + weight8) + ((weight4 + weight2) + weight1);
    }
    else
    {
        // No SIMD available: defer to the scalar overload lane by lane.
        return new ushort8(tzcnt(x.x0),
                           tzcnt(x.x1),
                           tzcnt(x.x2),
                           tzcnt(x.x3),
                           tzcnt(x.x4),
                           tzcnt(x.x5),
                           tzcnt(x.x6),
                           tzcnt(x.x7));
    }
}
/// <summary>
/// Component-wise remainder of two <see cref="ushort3x4"/> matrices.
/// </summary>
/// <param name="left">The dividends.</param>
/// <param name="right">The divisors.</param>
/// <returns>A matrix holding <c>left % right</c> for each component.</returns>
public static ushort3x4 operator %(ushort3x4 left, ushort3x4 right)
{
    if (Avx2.IsAvx2Supported)
    {
        // Pack two 3-element columns into one 8-lane vector each:
        // lanes 0-3 come from the first column, lanes 4-7 from the second.
        ushort8 numerators01 = Sse2.unpacklo_epi64(left.c0, left.c1);
        ushort8 numerators23 = Sse2.unpacklo_epi64(left.c2, left.c3);
        ushort8 denominators01 = Sse2.unpacklo_epi64(right.c0, right.c1);
        ushort8 denominators23 = Sse2.unpacklo_epi64(right.c2, right.c3);

#if DEBUG
        // Lanes 3 and 7 are padding that never reaches the result (only v3_0 and v3_4 are read);
        // presumably they are set to 1 so debug-only divide-by-zero checks do not fire on them.
        denominators01.x3 = 1;
        denominators01.x7 = 1;
        denominators23.x3 = 1;
        denominators23.x7 = 1;
#endif

        ushort8 remainders01 = numerators01 % denominators01;
        ushort8 remainders23 = numerators23 % denominators23;

        return new ushort3x4(remainders01.v3_0, remainders01.v3_4, remainders23.v3_0, remainders23.v3_4);
    }
    else
    {
        // Column by column.
        return new ushort3x4(left.c0 % right.c0,
                             left.c1 % right.c1,
                             left.c2 % right.c2,
                             left.c3 % right.c3);
    }
}
/// <summary>
/// Divides every component of a <see cref="ushort2x4"/> matrix by a single scalar.
/// </summary>
/// <param name="left">The dividends.</param>
/// <param name="right">The divisor shared by all components.</param>
/// <returns>A matrix holding <c>left / right</c> for each component.</returns>
public static ushort2x4 operator /(ushort2x4 left, ushort right)
{
    // The packed paths are only taken when the divisor is not a compile-time constant;
    // constant divisors always go through the per-column division at the bottom.
    if (!Constant.IsConstantExpression(right))
    {
        if (Avx2.IsAvx2Supported)
        {
            // All four 2-element columns in one 8-lane division.
            ushort8 quotients = new ushort8(left.c0, left.c1, left.c2, left.c3) / right;

            return new ushort2x4(quotients.v2_0, quotients.v2_2, quotients.v2_4, quotients.v2_6);
        }
        else if (Sse2.IsSse2Supported)
        {
            // Two columns per 4-lane division.
            ushort4 broadcastDivisor = right;
            ushort4 firstHalf = new ushort4(left.c0, left.c1) / broadcastDivisor;
            ushort4 secondHalf = new ushort4(left.c2, left.c3) / broadcastDivisor;

            return new ushort2x4(firstHalf.xy, firstHalf.zw, secondHalf.xy, secondHalf.zw);
        }
    }

    return new ushort2x4(left.c0 / right,
                         left.c1 / right,
                         left.c2 / right,
                         left.c3 / right);
}
/// <summary>
/// Computes the per-lane least common multiple of the absolute values of <paramref name="x"/> and <paramref name="y"/>.
/// </summary>
/// <param name="x">First operand.</param>
/// <param name="y">Second operand.</param>
/// <returns><c>(|x| / gcd(|x|, |y|)) * |y|</c> for each lane.</returns>
public static ushort8 lcm(short8 x, short8 y)
{
    ushort8 magnitudeX = (ushort8)abs(x);
    ushort8 magnitudeY = (ushort8)abs(y);

    // NOTE(review): if gcd yields 0 for a lane (e.g. both inputs are 0) this divides by zero - confirm intended behaviour.
    ushort8 commonDivisor = gcd(magnitudeX, magnitudeY);

    // Divide before multiplying so the intermediate value stays as small as possible.
    ushort8 reducedX = magnitudeX / commonDivisor;

    return reducedX * magnitudeY;
}
/// <summary>
/// Computes the remainder of every component of a <see cref="ushort2x4"/> matrix divided by a single scalar.
/// </summary>
/// <param name="left">The dividends.</param>
/// <param name="right">The divisor shared by all components.</param>
/// <returns>A matrix holding <c>left % right</c> for each component.</returns>
public static ushort2x4 operator %(ushort2x4 left, ushort right)
{
    // The packed paths are only taken when the divisor is not a compile-time constant;
    // constant divisors always go through the per-column remainder at the bottom.
    if (!Constant.IsConstantExpression(right))
    {
        if (Avx2.IsAvx2Supported)
        {
            // All four 2-element columns in one 8-lane operation.
            ushort8 remainders = new ushort8(left.c0, left.c1, left.c2, left.c3) % right;

            return new ushort2x4(remainders.v2_0, remainders.v2_2, remainders.v2_4, remainders.v2_6);
        }
        else if (Sse2.IsSse2Supported)
        {
            // Two columns per 4-lane operation.
            ushort4 broadcastDivisor = right;
            ushort4 firstHalf = new ushort4(left.c0, left.c1) % broadcastDivisor;
            ushort4 secondHalf = new ushort4(left.c2, left.c3) % broadcastDivisor;

            return new ushort2x4(firstHalf.xy, firstHalf.zw, secondHalf.xy, secondHalf.zw);
        }
    }

    return new ushort2x4(left.c0 % right,
                         left.c1 % right,
                         left.c2 % right,
                         left.c3 % right);
}
/// <summary>
/// Finds the index of the first lane of <paramref name="v"/> that equals <paramref name="x"/>.
/// </summary>
/// <param name="v">The vector to search.</param>
/// <param name="x">The value to look for.</param>
/// <returns>The lowest matching lane index; the scalar path returns 16 when no lane matches.</returns>
public static int indexof(ushort16 v, ushort x)
{
    if (Avx2.IsAvx2Supported)
    {
        // One bit per byte, so every matching 16-bit lane contributes two adjacent bits.
        int matchBits = Avx2.mm256_movemask_epi8(Avx2.mm256_cmpeq_epi16(v, new ushort16(x)));

        // Halve the bit position to turn a byte index into a lane index.
        return math.tzcnt(matchBits) >> 1;
    }
    else if (Sse2.IsSse2Supported)
    {
        ushort8 needle = x;

        int lowerBits = Sse2.movemask_epi8(Sse2.cmpeq_epi16(v._v8_0, needle));
        int upperBits = Sse2.movemask_epi8(Sse2.cmpeq_epi16(v._v8_8, needle));

        // Stitch both halves into one 32-bit match mask, then proceed as in the AVX2 path.
        return math.tzcnt(lowerBits | (upperBits << 16)) >> 1;
    }
    else
    {
        for (int lane = 0; lane < 16; lane++)
        {
            if (v[lane] == x)
            {
                return lane;
            }
        }

        return 16;
    }
}
/// <summary>
/// Reverses the bit order within each 16-bit lane.
/// </summary>
/// <param name="x">The lanes to reverse.</param>
/// <returns>The bit-reversed lanes.</returns>
public static ushort8 reversebits(ushort8 x)
{
    // Swap progressively larger groups: single bits, bit pairs, nibbles and finally the two bytes.
    ushort8 bitsSwapped = ((x >> 1) & 0x5555) | ((x & 0x5555) << 1);
    ushort8 pairsSwapped = ((bitsSwapped >> 2) & 0x3333) | ((bitsSwapped & 0x3333) << 2);
    ushort8 nibblesSwapped = ((pairsSwapped >> 4) & 0x0F0F) | ((pairsSwapped & 0x0F0F) << 4);

    return (nibblesSwapped >> 8) | (nibblesSwapped << 8);
}
/// <summary>
/// Reduces each lane to its highest set bit, i.e. the largest power of two not greater than the lane value.
/// A zero lane stays zero.
/// </summary>
/// <param name="x">The input lanes.</param>
/// <returns>Per lane, the value with only its most significant set bit kept.</returns>
public static ushort8 floorpow2(ushort8 x)
{
    // Smear the highest set bit into every lower position.
    ushort8 smeared = x | (x >> 1);
    smeared |= smeared >> 2;
    smeared |= smeared >> 4;
    smeared |= smeared >> 8;

    // Clearing everything below the top bit leaves that bit alone.
    return smeared - (smeared >> 1);
}
/// <summary>
/// Rounds each lane up to a power of two using the decrement / bit-smear / increment technique.
/// </summary>
/// <param name="x">The input lanes.</param>
/// <returns>Per lane, <c>smear(x - 1) + 1</c>, where smear sets every bit below the highest set bit.</returns>
public static ushort8 ceilpow2(ushort8 x)
{
    // Subtract one first so that exact powers of two map to themselves.
    ushort8 smeared = x - 1;

    // Propagate the highest set bit into every lower position.
    smeared |= smeared >> 1;
    smeared |= smeared >> 2;
    smeared |= smeared >> 4;
    smeared |= smeared >> 8;

    return smeared + 1;
}
/// <summary>
/// Copies the eight components of <paramref name="v"/> into the proxy's members.
/// </summary>
/// <param name="v">The vector whose components are captured.</param>
public DebuggerProxy(ushort8 v)
{
    this.x0 = v.x0;
    this.x1 = v.x1;
    this.x2 = v.x2;
    this.x3 = v.x3;
    this.x4 = v.x4;
    this.x5 = v.x5;
    this.x6 = v.x6;
    this.x7 = v.x7;
}
/// <summary>
/// Computes <c>left &amp; ~right</c> for each lane.
/// </summary>
/// <param name="left">The value whose bits are kept.</param>
/// <param name="right">The value whose set bits are cleared from <paramref name="left"/>.</param>
/// <returns>The lane-wise result of <c>left &amp; ~right</c>.</returns>
public static ushort8 andnot(ushort8 left, ushort8 right)
{
    if (Sse2.IsSse2Supported)
    {
        // The intrinsic negates its FIRST operand, hence the swapped argument order.
        return Sse2.andnot_si128(right, left);
    }

    return left & ~right;
}
/// <summary>
/// Reinterprets the bits of a <see cref="ushort8"/> as a <see cref="half8"/> without converting the values.
/// </summary>
/// <param name="x">The vector whose bit pattern is reinterpreted.</param>
/// <returns>A <see cref="half8"/> with the same bit pattern as <paramref name="x"/>.</returns>
public static half8 ashalf(ushort8 x)
{
    if (Sse.IsSseSupported)
    {
        // Route through the raw 128-bit register type.
        v128 rawBits = (v128)x;

        return rawBits;
    }

    // Pointer reinterpretation of the local copy.
    return *(half8*)&x;
}
/// <summary>
/// Computes the per-lane average of two vectors, rounding halves up: <c>(x + y + 1) &gt;&gt; 1</c>.
/// </summary>
/// <param name="x">First operand.</param>
/// <param name="y">Second operand.</param>
/// <returns>The rounded-up per-lane averages.</returns>
public static ushort8 avg(ushort8 x, ushort8 y)
{
    if (Sse2.IsSse2Supported)
    {
        return Sse2.avg_epu16(x, y);
    }

    // The sums are formed in int, so they cannot overflow before the shift.
    ushort lane0 = (ushort)((x.x0 + y.x0 + 1) >> 1);
    ushort lane1 = (ushort)((x.x1 + y.x1 + 1) >> 1);
    ushort lane2 = (ushort)((x.x2 + y.x2 + 1) >> 1);
    ushort lane3 = (ushort)((x.x3 + y.x3 + 1) >> 1);
    ushort lane4 = (ushort)((x.x4 + y.x4 + 1) >> 1);
    ushort lane5 = (ushort)((x.x5 + y.x5 + 1) >> 1);
    ushort lane6 = (ushort)((x.x6 + y.x6 + 1) >> 1);
    ushort lane7 = (ushort)((x.x7 + y.x7 + 1) >> 1);

    return new ushort8(lane0, lane1, lane2, lane3, lane4, lane5, lane6, lane7);
}
/// <summary>
/// Subtracts <paramref name="b"/> from <paramref name="a"/> in the even lanes (0, 2, 4, 6)
/// and adds it in the odd lanes (1, 3, 5, 7).
/// </summary>
/// <param name="a">The left operand.</param>
/// <param name="b">The value that is alternately subtracted and added.</param>
/// <returns>The lane-wise alternating difference / sum.</returns>
public static ushort8 subadd(ushort8 a, ushort8 b)
{
    if (Ssse3.IsSsse3Supported)
    {
        // ushort.MaxValue reads as -1 for the signed intrinsic: those lanes of 'b' get negated,
        // lanes paired with 1 are passed through unchanged.
        ushort8 laneSigns = new ushort8(ushort.MaxValue, 1, ushort.MaxValue, 1, ushort.MaxValue, 1, ushort.MaxValue, 1);

        return a + Ssse3.sign_epi16(b, laneSigns);
    }
    else
    {
        // Odd lanes subtract the negated value, which amounts to an addition.
        bool8 isOddLane = new bool8(false, true, false, true, false, true, false, true);

        return a - select(b, (ushort8)(-(short8)b), isOddLane);
    }
}
/// <summary>
/// Gathers the low byte of each 16-bit lane of <paramref name="x"/> into the first eight bytes of the result.
/// </summary>
/// <param name="x">The 16-bit lanes to narrow.</param>
/// <returns>The shuffled register; bytes 0-7 hold the even-indexed source bytes.</returns>
/// <exception cref="CPUFeatureCheckException">Thrown when SSSE3 is not supported.</exception>
internal static v128 ShortToByte(ushort8 x)
{
    if (Ssse3.IsSsse3Supported)
    {
        // Byte indices of the low halves of the eight 16-bit lanes.
        v128 lowByteIndices = new byte8(0, 2, 4, 6, 8, 10, 12, 14);

        return Ssse3.shuffle_epi8(x, lowByteIndices);
    }
    else
    {
        throw new CPUFeatureCheckException();
    }
}
/// <summary>
/// Divides every lane of <paramref name="dividend"/> by the scalar <paramref name="divisor"/>,
/// returning the quotients and writing the remainders to <paramref name="remainder"/>.
/// </summary>
/// <param name="dividend">The per-lane dividends.</param>
/// <param name="divisor">The divisor shared by all lanes.</param>
/// <param name="remainder">Receives the per-lane remainders.</param>
/// <returns>The per-lane quotients.</returns>
public static ushort8 divrem(ushort8 dividend, ushort divisor, out ushort8 remainder)
{
    if (!Constant.IsConstantExpression(divisor))
    {
        // Runtime divisor: broadcast it and use the vector overload.
        return divrem(dividend, (ushort8)divisor, out remainder);
    }

    // Compile-time constant divisor: use the scalar-divisor operators for both results.
    remainder = dividend % divisor;

    return dividend / divisor;
}
/// <summary>
/// Tests each lane for being a power of two, i.e. non-zero with exactly one bit set.
/// </summary>
/// <param name="x">The lanes to test.</param>
/// <returns>Per-lane flags.</returns>
public static bool8 ispow2(ushort8 x)
{
    if (Sse2.IsSse2Supported)
    {
        v128 isNonZero = Operator.greater_mask_ushort(x, default(v128));

        // x & (x - 1) clears the lowest set bit; nothing is left when at most one bit was set.
        v128 hasAtMostOneBit = Sse2.cmpeq_epi16(default(v128), x & (x - 1));

        // Reduce the all-ones lane masks to 0 / 1 before narrowing to bytes.
        ushort8 zeroOrOne = new ushort8(1) & Sse2.and_si128(isNonZero, hasAtMostOneBit);

        return (v128)(byte8)zeroOrOne;
    }
    else
    {
        return new bool8(math.ispow2((uint)x.x0),
                         math.ispow2((uint)x.x1),
                         math.ispow2((uint)x.x2),
                         math.ispow2((uint)x.x3),
                         math.ispow2((uint)x.x4),
                         math.ispow2((uint)x.x5),
                         math.ispow2((uint)x.x6),
                         math.ispow2((uint)x.x7));
    }
}
/// <summary>
/// Component-wise division of two <see cref="ushort4x2"/> matrices.
/// </summary>
/// <param name="left">The dividends.</param>
/// <param name="right">The divisors.</param>
/// <returns>A matrix holding <c>left / right</c> for each component.</returns>
public static ushort4x2 operator /(ushort4x2 left, ushort4x2 right)
{
    if (Avx2.IsAvx2Supported)
    {
        // Both columns fit into one 8-lane division.
        ushort8 numerators = new ushort8(left.c0, left.c1);
        ushort8 denominators = new ushort8(right.c0, right.c1);
        ushort8 quotients = numerators / denominators;

        return new ushort4x2(quotients.v4_0, quotients.v4_4);
    }
    else
    {
        return new ushort4x2(left.c0 / right.c0,
                             left.c1 / right.c1);
    }
}
/// <summary>
/// Component-wise division of two <see cref="ushort4x3"/> matrices.
/// </summary>
/// <param name="left">The dividends.</param>
/// <param name="right">The divisors.</param>
/// <returns>A matrix holding <c>left / right</c> for each component.</returns>
public static ushort4x3 operator /(ushort4x3 left, ushort4x3 right)
{
    if (Avx2.IsAvx2Supported)
    {
        // The first two columns share one 8-lane division; the third column is divided on its own.
        ushort8 numerators01 = new ushort8(left.c0, left.c1);
        ushort8 denominators01 = new ushort8(right.c0, right.c1);
        ushort8 quotients01 = numerators01 / denominators01;

        return new ushort4x3(quotients01.v4_0, quotients01.v4_4, left.c2 / right.c2);
    }
    else
    {
        return new ushort4x3(left.c0 / right.c0,
                             left.c1 / right.c1,
                             left.c2 / right.c2);
    }
}
/// <summary>
/// Component-wise remainder of two <see cref="ushort4x2"/> matrices.
/// </summary>
/// <param name="left">The dividends.</param>
/// <param name="right">The divisors.</param>
/// <returns>A matrix holding <c>left % right</c> for each component.</returns>
public static ushort4x2 operator %(ushort4x2 left, ushort4x2 right)
{
    if (Avx2.IsAvx2Supported)
    {
        // Both columns fit into one 8-lane operation.
        ushort8 numerators = new ushort8(left.c0, left.c1);
        ushort8 denominators = new ushort8(right.c0, right.c1);
        ushort8 remainders = numerators % denominators;

        return new ushort4x2(remainders.v4_0, remainders.v4_4);
    }
    else
    {
        return new ushort4x2(left.c0 % right.c0,
                             left.c1 % right.c1);
    }
}
/// <summary>
/// Counts the set bits of each 16-bit lane.
/// </summary>
/// <param name="x">The lanes to inspect.</param>
/// <returns>Per-lane population counts.</returns>
public static ushort8 countbits(ushort8 x)
{
    if (Ssse3.IsSsse3Supported)
    {
        // Population count of each of the 16 bytes, viewed again as eight 16-bit lanes.
        ushort8 perByteCounts = (v128)countbits((byte16)(v128)x);

        // Sum the count of the lower byte and the count of the upper byte of each lane.
        ushort8 lowerByteCount = perByteCounts & 0x00FF;
        ushort8 upperByteCount = perByteCounts >> 8;

        return lowerByteCount + upperByteCount;
    }
    else
    {
        return new ushort8((ushort)math.countbits((uint)x.x0),
                           (ushort)math.countbits((uint)x.x1),
                           (ushort)math.countbits((uint)x.x2),
                           (ushort)math.countbits((uint)x.x3),
                           (ushort)math.countbits((uint)x.x4),
                           (ushort)math.countbits((uint)x.x5),
                           (ushort)math.countbits((uint)x.x6),
                           (ushort)math.countbits((uint)x.x7));
    }
}
/// <summary>
/// Converts each lane to a boolean after clamping it to the range [0, 1], so any non-zero lane becomes <c>true</c>.
/// </summary>
/// <param name="x">The lanes to convert.</param>
/// <returns>Per-lane booleans backed by the byte values 0 or 1.</returns>
public static bool8 toboolsafe(ushort8 x)
{
    // Both paths start from the same clamped, byte-narrowed value.
    byte8 zeroOrOne = (byte8)clamp(x, 0, 1);

    if (Sse2.IsSse2Supported)
    {
        return (v128)zeroOrOne;
    }
    else
    {
        // Reinterpret the eight bytes as eight bools.
        return *(bool8*)&zeroOrOne;
    }
}
/// <summary>
/// Component-wise remainder of two <see cref="ushort4x3"/> matrices.
/// </summary>
/// <param name="left">The dividends.</param>
/// <param name="right">The divisors.</param>
/// <returns>A matrix holding <c>left % right</c> for each component.</returns>
public static ushort4x3 operator %(ushort4x3 left, ushort4x3 right)
{
    if (Avx2.IsAvx2Supported)
    {
        // The first two columns share one 8-lane operation; the third column is handled on its own.
        ushort8 numerators01 = new ushort8(left.c0, left.c1);
        ushort8 denominators01 = new ushort8(right.c0, right.c1);
        ushort8 remainders01 = numerators01 % denominators01;

        return new ushort4x3(remainders01.v4_0, remainders01.v4_4, left.c2 % right.c2);
    }
    else
    {
        return new ushort4x3(left.c0 % right.c0,
                             left.c1 % right.c1,
                             left.c2 % right.c2);
    }
}
/// <summary>
/// Component-wise remainder of two <see cref="ushort4x4"/> matrices.
/// </summary>
/// <param name="left">The dividends.</param>
/// <param name="right">The divisors.</param>
/// <returns>A matrix holding <c>left % right</c> for each component.</returns>
public static ushort4x4 operator %(ushort4x4 left, ushort4x4 right)
{
    if (Avx2.IsAvx2Supported)
    {
        // Two columns per 8-lane operation.
        ushort8 numerators01 = new ushort8(left.c0, left.c1);
        ushort8 denominators01 = new ushort8(right.c0, right.c1);
        ushort8 remainders01 = numerators01 % denominators01;

        ushort8 numerators23 = new ushort8(left.c2, left.c3);
        ushort8 denominators23 = new ushort8(right.c2, right.c3);
        ushort8 remainders23 = numerators23 % denominators23;

        return new ushort4x4(remainders01.v4_0, remainders01.v4_4, remainders23.v4_0, remainders23.v4_4);
    }
    else
    {
        return new ushort4x4(left.c0 % right.c0,
                             left.c1 % right.c1,
                             left.c2 % right.c2,
                             left.c3 % right.c3);
    }
}
/// <summary>
/// Computes the remainder of every component of a <see cref="ushort4x2"/> matrix divided by a single scalar.
/// </summary>
/// <param name="left">The dividends.</param>
/// <param name="right">The divisor shared by all components.</param>
/// <returns>A matrix holding <c>left % right</c> for each component.</returns>
public static ushort4x2 operator %(ushort4x2 left, ushort right)
{
    // The packed path is only used with AVX2 and a divisor that is not a compile-time constant.
    if (Avx2.IsAvx2Supported && !Constant.IsConstantExpression(right))
    {
        ushort8 bothColumns = new ushort8(left.c0, left.c1);
        ushort8 remainders = bothColumns % right;

        return new ushort4x2(remainders.v4_0, remainders.v4_4);
    }

    return new ushort4x2(left.c0 % right,
                         left.c1 % right);
}
/// <summary>
/// Divides every component of a <see cref="ushort4x2"/> matrix by a single scalar.
/// </summary>
/// <param name="left">The dividends.</param>
/// <param name="right">The divisor shared by all components.</param>
/// <returns>A matrix holding <c>left / right</c> for each component.</returns>
public static ushort4x2 operator /(ushort4x2 left, ushort right)
{
    // The packed path is only used with AVX2 and a divisor that is not a compile-time constant.
    if (Avx2.IsAvx2Supported && !Constant.IsConstantExpression(right))
    {
        ushort8 bothColumns = new ushort8(left.c0, left.c1);
        ushort8 quotients = bothColumns / right;

        return new ushort4x2(quotients.v4_0, quotients.v4_4);
    }

    return new ushort4x2(left.c0 / right,
                         left.c1 / right);
}
/// <summary>
/// Multiplies all eight lanes of <paramref name="x"/> together, accumulating in 32-bit precision.
/// </summary>
/// <param name="x">The lanes to multiply.</param>
/// <returns>The product of all lanes as a 32-bit unsigned integer.</returns>
public static uint cprod(ushort8 x)
{
    if (Avx2.IsAvx2Supported)
    {
        // Reverse the four 32-bit groups so lane pairs (0,1)/(6,7) and (2,3)/(4,5) line up.
        v128 reversedGroups = Sse2.shuffle_epi32(x, Sse.SHUFFLE(0, 1, 2, 3));

        // Widen to 32 bits and multiply; the lower four products already cover all eight inputs.
        v128 partial = Avx.mm256_castsi256_si128((uint8)x * (uint8)(ushort8)reversedGroups);

        // Fold four partial products into two ...
        v128 reversedPartial = Sse2.shuffle_epi32(partial, Sse.SHUFFLE(0, 1, 2, 3));
        partial = Sse4_1.mullo_epi32(partial, reversedPartial);

        // ... and two into one: the 16-bit shuffle moves the second 32-bit element into position 0.
        v128 secondElement = Sse2.shufflelo_epi16(partial, Sse.SHUFFLE(0, 0, 3, 2));

        return Sse4_1.mullo_epi32(partial, secondElement).UInt0;
    }
    else
    {
        // Multiply the widened halves lane-wise, then reduce the four results.
        return cprod((uint4)x.v4_0 * (uint4)x.v4_4);
    }
}
/// <summary>
/// Produces a per-lane mask for the unsigned comparison <c>left &gt; right</c>.
/// </summary>
/// <param name="left">Left operands.</param>
/// <param name="right">Right operands.</param>
/// <returns>The result of the signed 16-bit greater-than comparison on the sign-flipped operands.</returns>
/// <exception cref="CPUFeatureCheckException">Thrown when SSE2 is not supported.</exception>
internal static v128 greater_mask_ushort(ushort8 left, ushort8 right)
{
    if (Sse2.IsSse2Supported)
    {
        // The comparison intrinsic is signed; flipping the top bit of both operands
        // maps unsigned ordering onto signed ordering.
        ushort8 signBit = 1 << 15;

        v128 biasedLeft = Sse2.xor_si128(left, signBit);
        v128 biasedRight = Sse2.xor_si128(right, signBit);

        return Sse2.cmpgt_epi16(biasedLeft, biasedRight);
    }
    else
    {
        throw new CPUFeatureCheckException();
    }
}