/// <summary>
/// Returns the lane-wise maximum of <paramref name="a"/> and <paramref name="b"/>,
/// treating each 32-bit lane as an unsigned integer.
/// </summary>
/// <param name="a">First vector operand.</param>
/// <param name="b">Second vector operand.</param>
/// <returns>The result of the SSE4.1 unsigned maximum, or of the SSE2 compare-and-blend emulation.</returns>
/// <exception cref="CPUFeatureCheckException">Thrown when neither SSE4.1 nor SSE2 is reported as supported.</exception>
internal static v128 max_uint(v128 a, v128 b)
{
    // Preferred path: a single native unsigned 32-bit maximum instruction.
    if (Sse4_1.IsSse41Supported)
    {
        return Sse4_1.max_epu32(a, b);
    }

    // SSE2 has no unsigned 32-bit max, so emulate it with a compare mask and a blend.
    if (Sse2.IsSse2Supported)
    {
        // NOTE(review): presumably all-ones in lanes where b > a (unsigned) - confirm against greater_mask_uint.
        v128 bIsGreater = greater_mask_uint(b, a);

        // NOTE(review): presumably BlendV picks the second operand where the mask is set - confirm against Mask.BlendV.
        return Mask.BlendV(a, b, bIsGreater);
    }

    throw new CPUFeatureCheckException();
}
/// <summary>
/// Computes the greatest common divisor of each pair of corresponding components
/// of <paramref name="x"/> and <paramref name="y"/>.
/// </summary>
/// <remarks>
/// When SSE2 is available all four lanes are processed together with a shift-and-subtract
/// (binary GCD style) loop; otherwise the scalar <c>gcd</c> overload is called once per component.
/// Lanes in which either input is zero are resolved before the loop and yield the other input
/// (zero when both are zero).
/// </remarks>
/// <param name="x">First vector of unsigned 32-bit operands.</param>
/// <param name="y">Second vector of unsigned 32-bit operands.</param>
/// <returns>A vector holding the per-component result.</returns>
public static uint4 gcd(uint4 x, uint4 y)
{
    if (!Sse2.IsSse2Supported)
    {
        // Scalar fallback: one gcd call per component.
        return new uint4(gcd(x.x, y.x), gcd(x.y, y.y), gcd(x.z, y.z), gcd(x.w, y.w));
    }

    v128 zero = default(v128);
    v128 lhs = *(v128*)&x;
    v128 rhs = *(v128*)&y;

    // Lanes with a zero operand never need the loop; their answer is fixed up front.
    v128 lhsIsZero = Sse2.cmpeq_epi32(lhs, zero);
    v128 rhsIsZero = Sse2.cmpeq_epi32(rhs, zero);
    v128 anyZero = Sse2.or_si128(lhsIsZero, rhsIsZero);

    // Take y where x == 0, then x where y == 0 (the second blend wins when both are zero).
    // NOTE(review): assumes Mask.BlendV picks its second operand where the mask is set - confirm.
    v128 zeroLaneResult = Mask.BlendV(zero, rhs, lhsIsZero);
    zeroLaneResult = Mask.BlendV(zeroLaneResult, lhs, rhsIsZero);

    // Per-lane "finished" mask, seeded with the lanes that were resolved above.
    v128 finished = anyZero;
    v128 oddGcd = zero;

    // Trailing-zero count of (x | y): the power of two re-applied to the result after the loop.
    int4 shift = math.tzcnt(x | y);

    // Strip the trailing zero bits from x once, before iterating.
    uint4 xStripped = shrl(x, math.tzcnt(x));
    lhs = *(v128*)&xStripped;

    do
    {
        // Strip the trailing zero bits from the current y in every lane.
        uint4 yStripped = shrl(*(uint4*)&rhs, math.tzcnt(*(uint4*)&rhs));
        rhs = *(v128*)&yStripped;

        // Order each lane so that lhs holds the smaller value and rhs the larger one.
        v128 previousLhs = lhs;
        if (Sse4_1.IsSse41Supported)
        {
            lhs = Sse4_1.min_epu32(lhs, rhs);
            rhs = Sse4_1.max_epu32(rhs, previousLhs);
        }
        else
        {
            v128 lhsIsGreater = Operator.greater_mask_uint(lhs, rhs);
            lhs = Mask.BlendV(lhs, rhs, lhsIsGreater);
            rhs = Mask.BlendV(rhs, previousLhs, lhsIsGreater);
        }

        rhs = Sse2.sub_epi32(rhs, lhs);

        // A lane finishes the first time its difference reaches zero; latch lhs for that lane
        // so later iterations cannot overwrite the stored value.
        v128 newlyFinished = Sse2.andnot_si128(finished, Sse2.cmpeq_epi32(rhs, zero));
        oddGcd = Mask.BlendV(oddGcd, lhs, newlyFinished);
        finished = Sse2.or_si128(finished, newlyFinished);
    }
    while (bitmask32(4 * sizeof(uint)) != Sse2.movemask_epi8(finished)); // loop until every byte of the mask is set

    // Re-apply the shift computed from (x | y), then overwrite the lanes that had a zero operand.
    uint4 shifted = shl(*(uint4*)&oddGcd, shift);
    v128 combined = *(v128*)&shifted;
    combined = Mask.BlendV(combined, zeroLaneResult, anyZero);

    return *(uint4*)&combined;
}