Example #1
0
 /// <summary>
 /// Returns the lane-wise maximum of <paramref name="a"/> and <paramref name="b"/>,
 /// using the unsigned 32-bit maximum instruction when SSE4.1 is available and an
 /// unsigned compare followed by a blend when only SSE2 is available.
 /// </summary>
 /// <param name="a">First operand.</param>
 /// <param name="b">Second operand.</param>
 /// <returns>A vector holding the larger value of each pair of lanes.</returns>
 /// <exception cref="CPUFeatureCheckException">
 /// Thrown when neither SSE4.1 nor SSE2 is reported as supported.
 /// </exception>
 internal static v128 max_uint(v128 a, v128 b)
 {
     // Preferred path: a single dedicated instruction.
     if (Sse4_1.IsSse41Supported)
     {
         return Sse4_1.max_epu32(a, b);
     }

     // SSE2 path: build a "b is greater than a" mask and blend with it.
     // NOTE(review): assumes Mask.BlendV picks its second argument where the
     // mask is set - confirm against the helper's definition.
     if (Sse2.IsSse2Supported)
     {
         v128 bIsGreater = greater_mask_uint(b, a);

         return Mask.BlendV(a, b, bIsGreater);
     }

     throw new CPUFeatureCheckException();
 }
Example #2
0
        /// <summary>
        /// Computes the greatest common divisor of each pair of corresponding
        /// components of <paramref name="x"/> and <paramref name="y"/>.
        /// </summary>
        /// <remarks>
        /// With SSE2 available, all four lanes run a binary GCD together: trailing
        /// zero bits are stripped, the lanes are ordered so the smaller value sits
        /// in x, and x is subtracted from y until y reaches zero. Lanes where either
        /// input is zero are flagged as finished up front and receive the other
        /// operand as their result. Without SSE2 the gcd overload taking individual
        /// components is called once per component.
        /// </remarks>
        /// <param name="x">First operand.</param>
        /// <param name="y">Second operand.</param>
        /// <returns>The per-component greatest common divisor.</returns>
        public static uint4 gcd(uint4 x, uint4 y)
        {
            if (!Sse2.IsSse2Supported)
            {
                // Fallback: handle each component separately.
                return(new uint4(gcd(x.x, y.x), gcd(x.y, y.y), gcd(x.z, y.z), gcd(x.w, y.w)));
            }

            v128 zero = default(v128);

            v128 vx = *(v128 *)&x;
            v128 vy = *(v128 *)&y;

            // Lanes with a zero operand skip the iteration entirely.
            v128 xZero   = Sse2.cmpeq_epi32(vx, zero);
            v128 yZero   = Sse2.cmpeq_epi32(vy, zero);
            v128 anyZero = Sse2.or_si128(xZero, yZero);

            // For those lanes the answer is the other operand.
            // NOTE(review): assumes Mask.BlendV picks its second argument where the
            // mask is set - confirm against the helper's definition.
            v128 zeroLaneResult = Mask.BlendV(zero, vy, xZero);
            zeroLaneResult      = Mask.BlendV(zeroLaneResult, vx, yZero);

            // Trailing-zero count of (x | y): the shift re-applied to the result after the loop.
            int4 commonShift = math.tzcnt(x | y);

            // Strip the trailing zero bits of x once, before iterating.
            uint4 oddX = shrl(x, math.tzcnt(x));
            vx = *(v128 *)&oddX;

            v128 acc      = zero;
            v128 finished = anyZero;

            for (;;)
            {
                // Strip the trailing zero bits of y on every pass.
                uint4 yLanes = *(uint4 *)&vy;
                uint4 oddY   = shrl(yLanes, math.tzcnt(yLanes));
                vy = *(v128 *)&oddY;

                // Order each lane so that lo <= hi (unsigned).
                v128 lo;
                v128 hi;

                if (Sse4_1.IsSse41Supported)
                {
                    lo = Sse4_1.min_epu32(vx, vy);
                    hi = Sse4_1.max_epu32(vy, vx);
                }
                else
                {
                    v128 xIsGreater = Operator.greater_mask_uint(vx, vy);

                    lo = Mask.BlendV(vx, vy, xIsGreater);
                    hi = Mask.BlendV(vy, vx, xIsGreater);
                }

                vx = lo;
                vy = Sse2.sub_epi32(hi, lo);

                // A lane finishes the first time its y reaches zero; capture x at
                // that moment and never overwrite it on later passes.
                v128 newlyFinished = Sse2.andnot_si128(finished, Sse2.cmpeq_epi32(vy, zero));
                acc      = Mask.BlendV(acc, vx, newlyFinished);
                finished = Sse2.or_si128(finished, newlyFinished);

                // Stop once every byte of the finished mask is set.
                if (bitmask32(4 * sizeof(uint)) == Sse2.movemask_epi8(finished))
                {
                    break;
                }
            }

            // Shift the accumulated result left by the common trailing-zero count.
            uint4 scaled    = shl(*(uint4 *)&acc, commonShift);
            v128  scaledVec = *(v128 *)&scaled;

            // Lanes that had a zero input take the precomputed answer instead.
            v128 combined = Mask.BlendV(scaledVec, zeroLaneResult, anyZero);

            return(*(uint4 *)&combined);
        }