Exemple #1
0
        public static byte16 intsqrt(byte16 x)
        {
            if (Avx2.IsAvx2Supported)
            {
                return(new byte16(intsqrt(x.v8_0), intsqrt(x.v8_8)));
            }
            else if (Sse2.IsSse2Supported)
            {
                v128 ZERO = default(v128);

                byte16 result = ZERO;
                byte16 mask   = new byte16(1 << 6);

                v128 doneMask = ZERO;


                v128 tempMask = Sse2.cmpeq_epi8(ZERO, ZERO);

                doneMask = Sse2.cmpeq_epi8(x, max(mask, x));
                tempMask = Mask.BlendV(tempMask, mask, doneMask);

                while (bitmask32(16 * sizeof(byte)) != Sse2.movemask_epi8(doneMask))
                {
                    mask >>= 2;

                    doneMask = Sse2.or_si128(doneMask, Sse2.cmpeq_epi8(x, max(mask, x)));

                    if (Sse4_1.IsSse41Supported)
                    {
                        tempMask = Mask.BlendV(tempMask, mask, Sse2.and_si128(tempMask, doneMask));
                    }
                    else
                    {
                        tempMask = Mask.BlendV(tempMask, mask, Sse2.and_si128(Sse2.cmpgt_epi8(default, tempMask), doneMask));
Exemple #2
0
        /// <summary>
        /// SSE2 fallback that computes the lane-wise quotient and remainder of two <c>byte8</c> vectors
        /// with binary shift-and-subtract long division. The operands are widened to 16-bit lanes and one
        /// dividend bit is consumed per step, most significant bit first.
        /// </summary>
        /// <param name="dividend">Numerators, one per lane.</param>
        /// <param name="divisor">Denominators, one per lane; every lane is asserted to be non-zero.</param>
        /// <param name="remainder">Receives the per-lane remainder of the division.</param>
        /// <returns>The per-lane quotient.</returns>
        /// <exception cref="CPUFeatureCheckException">Thrown when SSE2 is not supported.</exception>
        internal static byte8 vdivrem_byte_SSE_FALLBACK(byte8 dividend, byte8 divisor, out byte8 remainder)
        {
            Assert.AreNotEqual(divisor.x0, 0);
            Assert.AreNotEqual(divisor.x1, 0);
            Assert.AreNotEqual(divisor.x2, 0);
            Assert.AreNotEqual(divisor.x3, 0);
            Assert.AreNotEqual(divisor.x4, 0);
            Assert.AreNotEqual(divisor.x5, 0);
            Assert.AreNotEqual(divisor.x6, 0);
            Assert.AreNotEqual(divisor.x7, 0);

            if (Sse2.IsSse2Supported)
            {
                ushort8 quotients  = ushort8.zero;
                ushort8 remainders = ushort8.zero;

                ushort8 divisorCast  = divisor;
                ushort8 dividendCast = dividend;


                // First step (bit 7), peeled: both accumulators are still zero, so no shift is needed.
                remainders |= (new ushort8(1) & (dividendCast >> 7));

                // min(divisor, remainder) == divisor  <=>  divisor <= remainder (unsigned).
                v128 subtractDivisorFromRemainder = Sse2.cmpeq_epi16(maxmath.min(divisorCast, remainders), divisorCast);

                remainders -= Mask.BlendV(default(v128), divisorCast, subtractDivisorFromRemainder);
                quotients  |= new ushort8(1) & subtractDivisorFromRemainder;

                // Bits 6 down to 1.
                for (int i = 6; i > 0; i--)
                {
                    quotients  <<= 1;
                    remainders <<= 1;

                    remainders |= (new ushort8(1) & (dividendCast >> i));

                    // Compare at 16-bit granularity so the mask is uniform across each 16-bit lane,
                    // matching the peeled first and last steps.
                    subtractDivisorFromRemainder = Sse2.cmpeq_epi16(maxmath.min(divisorCast, remainders), divisorCast);

                    remainders -= Mask.BlendV(default(v128), divisorCast, subtractDivisorFromRemainder);
                    quotients  |= new ushort8(1) & subtractDivisorFromRemainder;
                }

                // Last step (bit 0), peeled: the dividend needs no shift.
                remainders <<= 1;
                quotients  <<= 1;

                remainders |= new ushort8(1) & dividendCast;

                subtractDivisorFromRemainder = Sse2.cmpeq_epi16(maxmath.min(divisorCast, remainders), divisorCast);

                remainders -= Mask.BlendV(default(v128), divisorCast, subtractDivisorFromRemainder);
                quotients  |= new ushort8(1) & subtractDivisorFromRemainder;


                // Narrow both results back to bytes in one pack: remainders land in the low
                // eight bytes, quotients in the high eight bytes.
                byte16 temp = Sse2.packus_epi16(remainders, quotients);
                remainder = temp.v8_0;
                return(temp.v8_8);
            }
            else
            {
                throw new CPUFeatureCheckException();
            }
        }
Exemple #3
0
        /// <summary>
        /// Builds a <c>float8</c> from a <c>float2</c> followed by two <c>float3</c> values,
        /// in the order x01, x234, x567.
        /// </summary>
        public float8(float2 x01, float3 x234, float3 x567)
        {
            if (Sse2.IsSse2Supported)
            {
                v128 raw234 = *(v128 *)&x234;

                // Lower half: both floats of x01 followed by the first two floats of x234.
                v128 lowerHalf = Sse2.unpacklo_pd(*(v128 *)&x01, raw234);

                // Third float of x234 moved down into slot 0.
                v128 element4 = Sse2.bsrli_si128(raw234, 2 * sizeof(float));
                // x567 moved up by one slot so it occupies slots 1..3.
                v128 upperHalf = Sse2.bslli_si128(*(v128 *)&x567, sizeof(float));

                if (Sse4_1.IsSse41Supported)
                {
                    upperHalf = Sse4_1.blend_ps(element4, upperHalf, 0b1110);
                }
                else
                {
                    // Byte mask equivalent of the blend_ps immediate above: keep slot 0, replace slots 1..3.
                    upperHalf = Mask.BlendV(element4, upperHalf, new v128(0, 0, 0, 0, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255), false);
                }

                this = new float8(*(float4 *)&lowerHalf, *(float4 *)&upperHalf);
            }
            else
            {
                this = new float8
                {
                    _v4_0 = new float4(x01, x234.xy),
                    _v4_4 = new float4(x234.z, x567)
                };
            }
        }
Exemple #4
0
        /// <summary>
        /// Builds a <c>byte8</c> from a <c>byte3</c>, a <c>byte2</c> and a <c>byte3</c>,
        /// in the order x012, x34, x567.
        /// </summary>
        public byte8(byte3 x012, byte2 x34, byte3 x567)
        {
            if (Sse2.IsSse2Supported)
            {
                // Place x567 directly behind the two bytes of x34 ...
                v128 tail = Sse2.bslli_si128(x567, 2 * sizeof(byte));

                if (Sse4_1.IsSse41Supported)
                {
                    tail = Sse4_1.blend_epi16(x34, tail, 0b0110);
                }
                else
                {
                    tail = Mask.BlendEpi16_SSE2(x34, tail, 0b0110);
                }

                // ... then move the five combined bytes up to byte positions 3..7.
                tail = Sse2.bslli_si128(tail, 3 * sizeof(byte));

                // Bytes 0..2 come from x012, bytes 3..7 from the combined tail.
                this = Mask.BlendV(x012, tail, new byte8(0, 0, 0, 255, 255, 255, 255, 255));
            }
            else
            {
                x0 = x012.x;
                x1 = x012.y;
                x2 = x012.z;

                x3 = x34.x;
                x4 = x34.y;

                x5 = x567.x;
                x6 = x567.y;
                x7 = x567.z;
            }
        }
Exemple #5
0
        /// <summary>
        /// Builds a <c>byte8</c> from a <c>byte2</c> followed by two <c>byte3</c> values,
        /// in the order x01, x234, x567.
        /// </summary>
        public byte8(byte2 x01, byte3 x234, byte3 x567)
        {
            if (Sse2.IsSse2Supported)
            {
                // Move each three-byte group to its final byte offset (2 and 5 respectively).
                v128 middle = Sse2.bslli_si128(x234, 2 * sizeof(byte));
                v128 upper  = Sse2.bslli_si128(x567, 5 * sizeof(byte));

                // Bytes 5..7 from x567, everything below from the shifted x234.
                upper = Mask.BlendV(middle, upper, new byte8(0, 0, 0, 0, 0, 255, 255, 255));

                // The first 16-bit word (bytes 0..1) comes from x01, words 1..3 from the merged rest.
                if (Sse4_1.IsSse41Supported)
                {
                    this = Sse4_1.blend_epi16(x01, upper, 0b1110);
                }
                else
                {
                    this = Mask.BlendEpi16_SSE2(x01, upper, 0b1110);
                }
            }
            else
            {
                x0 = x01.x;
                x1 = x01.y;

                x2 = x234.x;
                x3 = x234.y;
                x4 = x234.z;

                x5 = x567.x;
                x6 = x567.y;
                x7 = x567.z;
            }
        }
Exemple #6
0
        /// <summary>
        /// Counts the trailing zero bits in every 16-bit lane of <paramref name="x"/>.
        /// Both vector paths produce 16 for a lane that is zero.
        /// </summary>
        public static ushort8 tzcnt(ushort8 x)
        {
            if (Ssse3.IsSsse3Supported)
            {
                v128 LOW_NIBBLES = new v128(0x0F0F_0F0F);
                // Per-nibble lookup tables; entry 0 holds 16 so an empty nibble never wins the minimum.
                v128 LUT_LOW_NIBBLE  = new v128(16, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0);
                v128 LUT_HIGH_NIBBLE = new v128(16, 4, 5, 4, 6, 4, 5, 4, 7, 4, 5, 4, 6, 4, 5, 4);

                v128 lowNibbles  = Sse2.and_si128(LOW_NIBBLES, x);
                v128 highNibbles = Sse2.and_si128(LOW_NIBBLES, Sse2.srli_epi16(x, 4));

                // Trailing zero count of every individual byte (16 when the byte is zero).
                v128 perByte = Sse2.min_epu8(Ssse3.shuffle_epi8(LUT_LOW_NIBBLE, lowNibbles),
                                             Ssse3.shuffle_epi8(LUT_HIGH_NIBBLE, highNibbles));

                // Count of the upper byte plus 8, moved down next to the lower byte's count.
                v128 upperBytePlus8 = Sse2.srli_epi16(Sse2.add_epi8(perByte, Sse2.set1_epi8(8)), 8);

                return Sse2.min_epu8(perByte, upperBytePlus8);
            }

            if (Sse2.IsSse2Supported)
            {
                // Isolate the lowest set bit of every lane.
                v128 lowestSetBit = x & (ushort8)(-((short8)x));

                // Each term contributes its weight when the isolated bit is absent from the tested positions.
                ushort8 zeroTerm = Mask.BlendV(default(v128), new ushort8(1), Sse2.cmpeq_epi16(lowestSetBit, default(v128)));
                ushort8 weight8  = Mask.BlendV(default(v128), new ushort8(8), Sse2.cmpeq_epi16(lowestSetBit & (ushort8)0x00FF, default(v128)));
                ushort8 weight4  = Mask.BlendV(default(v128), new ushort8(4), Sse2.cmpeq_epi16(lowestSetBit & (ushort8)0x0F0F, default(v128)));
                ushort8 weight2  = Mask.BlendV(default(v128), new ushort8(2), Sse2.cmpeq_epi16(lowestSetBit & (ushort8)0x3333, default(v128)));
                ushort8 weight1  = Mask.BlendV(default(v128), new ushort8(1), Sse2.cmpeq_epi16(lowestSetBit & (ushort8)0x5555, default(v128)));

                return (zeroTerm + weight8) + ((weight4 + weight2) + weight1);
            }

            return new ushort8(tzcnt(x.x0), tzcnt(x.x1), tzcnt(x.x2), tzcnt(x.x3), tzcnt(x.x4), tzcnt(x.x5), tzcnt(x.x6), tzcnt(x.x7));
        }
Exemple #7
0
        /// <summary>
        /// Counts the trailing zero bits in every byte lane of <paramref name="x"/>.
        /// Both vector paths produce 8 for a lane that is zero.
        /// </summary>
        public static byte4 tzcnt(byte4 x)
        {
            if (Ssse3.IsSsse3Supported)
            {
                v128 LOW_NIBBLES = new v128(0x0F0F_0F0F);
                // Per-nibble lookup tables; entry 0 holds 8 so an empty nibble never wins the minimum.
                v128 LUT_LOW_NIBBLE  = new v128(8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0);
                v128 LUT_HIGH_NIBBLE = new v128(8, 4, 5, 4, 6, 4, 5, 4, 7, 4, 5, 4, 6, 4, 5, 4);

                v128 lowNibbles  = Sse2.and_si128(LOW_NIBBLES, x);
                v128 highNibbles = Sse2.and_si128(LOW_NIBBLES, Sse2.srli_epi16(x, 4));

                return Sse2.min_epu8(Ssse3.shuffle_epi8(LUT_LOW_NIBBLE, lowNibbles),
                                     Ssse3.shuffle_epi8(LUT_HIGH_NIBBLE, highNibbles));
            }

            if (Sse2.IsSse2Supported)
            {
                // Isolate the lowest set bit of every lane.
                v128 lowestSetBit = x & (byte4)(-(sbyte4)x);

                // Each term contributes its weight when the isolated bit is absent from the tested positions.
                byte4 zeroTerm = Mask.BlendV(default(v128), new byte4(1), Sse2.cmpeq_epi8(lowestSetBit, default(v128)));
                byte4 weight4  = Mask.BlendV(default(v128), new byte4(4), Sse2.cmpeq_epi8(lowestSetBit & (byte4)0x0F, default(v128)));
                byte4 weight2  = Mask.BlendV(default(v128), new byte4(2), Sse2.cmpeq_epi8(lowestSetBit & (byte4)0x33, default(v128)));
                byte4 weight1  = Mask.BlendV(default(v128), new byte4(1), Sse2.cmpeq_epi8(lowestSetBit & (byte4)0x55, default(v128)));

                return (zeroTerm + weight4) + (weight2 + weight1);
            }

            return new byte4(tzcnt(x.x), tzcnt(x.y), tzcnt(x.z), tzcnt(x.w));
        }
Exemple #8
0
        /// <summary>
        /// Builds a <c>byte8</c> from two <c>byte3</c> values followed by a <c>byte2</c>,
        /// in the order x012, x345, x67.
        /// </summary>
        public byte8(byte3 x012, byte3 x345, byte2 x67)
        {
            if (Sse2.IsSse2Supported)
            {
                // Move each trailing group to its final byte offset (3 and 6 respectively).
                v128 lowerSix = Sse2.bslli_si128(x345, 3 * sizeof(byte));
                v128 lastTwo  = Sse2.bslli_si128(x67, 6 * sizeof(byte));

                // Bytes 3..5 from x345, the remaining bytes from x012.
                lowerSix = Mask.BlendV(x012, lowerSix, new byte8(0, 0, 0, 255, 255, 255, 0, 0));

                // The fourth 16-bit word (bytes 6..7) is replaced by x67.
                if (Sse4_1.IsSse41Supported)
                {
                    this = Sse4_1.blend_epi16(lowerSix, lastTwo, 0b1000);
                }
                else
                {
                    this = Mask.BlendEpi16_SSE2(lowerSix, lastTwo, 0b1000);
                }
            }
            else
            {
                x0 = x012.x;
                x1 = x012.y;
                x2 = x012.z;

                x3 = x345.x;
                x4 = x345.y;
                x5 = x345.z;

                x6 = x67.x;
                x7 = x67.y;
            }
        }
Exemple #9
0
        /// <summary>
        /// Counts the trailing zero bits in every byte lane of <paramref name="x"/>.
        /// Both vector paths produce 8 for a lane that is zero.
        /// </summary>
        public static byte16 tzcnt(byte16 x)
        {
            if (Ssse3.IsSsse3Supported)
            {
                v128 LOW_NIBBLES = new v128(0x0F0F_0F0F);
                // Per-nibble lookup tables; entry 0 holds 8 so an empty nibble never wins the minimum.
                v128 LUT_LOW_NIBBLE  = new v128(8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0);
                v128 LUT_HIGH_NIBBLE = new v128(8, 4, 5, 4, 6, 4, 5, 4, 7, 4, 5, 4, 6, 4, 5, 4);

                v128 lowNibbles  = Sse2.and_si128(LOW_NIBBLES, x);
                v128 highNibbles = Sse2.and_si128(LOW_NIBBLES, Sse2.srli_epi16(x, 4));

                return Sse2.min_epu8(Ssse3.shuffle_epi8(LUT_LOW_NIBBLE, lowNibbles),
                                     Ssse3.shuffle_epi8(LUT_HIGH_NIBBLE, highNibbles));
            }

            if (Sse2.IsSse2Supported)
            {
                // Isolate the lowest set bit of every lane.
                v128 lowestSetBit = x & (byte16)(-(sbyte16)x);

                // Each term contributes its weight when the isolated bit is absent from the tested positions.
                byte16 zeroTerm = Mask.BlendV(default(v128), new byte16(1), Sse2.cmpeq_epi8(lowestSetBit, default(v128)));
                byte16 weight4  = Mask.BlendV(default(v128), new byte16(4), Sse2.cmpeq_epi8(lowestSetBit & (byte16)0x0F, default(v128)));
                byte16 weight2  = Mask.BlendV(default(v128), new byte16(2), Sse2.cmpeq_epi8(lowestSetBit & (byte16)0x33, default(v128)));
                byte16 weight1  = Mask.BlendV(default(v128), new byte16(1), Sse2.cmpeq_epi8(lowestSetBit & (byte16)0x55, default(v128)));

                return (zeroTerm + weight4) + (weight2 + weight1);
            }

            return new byte16(tzcnt(x.x0), tzcnt(x.x1), tzcnt(x.x2), tzcnt(x.x3), tzcnt(x.x4), tzcnt(x.x5), tzcnt(x.x6), tzcnt(x.x7), tzcnt(x.x8), tzcnt(x.x9), tzcnt(x.x10), tzcnt(x.x11), tzcnt(x.x12), tzcnt(x.x13), tzcnt(x.x14), tzcnt(x.x15));
        }
Exemple #10
0
        /// <summary>
        /// Lane-wise greatest common divisor of two <c>ushort4</c> vectors, computed with a binary
        /// (shift-and-subtract) GCD. A lane in which either input is zero yields the other input.
        /// </summary>
        public static ushort4 gcd(ushort4 x, ushort4 y)
        {
            if (Sse2.IsSse2Supported)
            {
                v128 ZERO = default(v128);

                v128 xIsZero      = Sse2.cmpeq_epi16(x, ZERO);
                v128 yIsZero      = Sse2.cmpeq_epi16(y, ZERO);
                v128 eitherIsZero = Sse2.or_si128(xIsZero, yIsZero);

                // Answer for lanes with a zero operand: y where x is zero, x where y is zero.
                v128 zeroCaseResult = Mask.BlendV(ZERO, y, xIsZero);
                zeroCaseResult      = Mask.BlendV(zeroCaseResult, x, yIsZero);

                v128 result   = ZERO;
                // Lanes with a zero operand are treated as finished from the start.
                v128 finished = eitherIsZero;

                // Power of two shared by both operands; shifted back in after the loop.
                ushort4 commonShift = tzcnt(x | y);

                x = shrl(x, tzcnt(x));

                do
                {
                    y = shrl(y, tzcnt(y));

                    // Order the pair so that x holds the smaller and y the larger value.
                    if (Sse4_1.IsSse41Supported)
                    {
                        v128 previousX = x;

                        x = Sse4_1.min_epu16(x, y);
                        y = Sse4_1.max_epu16(y, previousX);
                    }
                    else
                    {
                        v128 previousX = x;
                        v128 xIsLarger = Operator.greater_mask_ushort(x, y);

                        x = Mask.BlendV(x, y, xIsLarger);
                        y = Mask.BlendV(y, previousX, xIsLarger);
                    }

                    y -= x;

                    // Lanes whose difference just reached zero take their result from x.
                    v128 newlyFinished = Sse2.andnot_si128(finished, Sse2.cmpeq_epi16(y, ZERO));
                    result   = Mask.BlendV(result, x, newlyFinished);
                    finished = Sse2.or_si128(finished, newlyFinished);
                } while (-1 != finished.SLong0);

                result = shl(result, commonShift);
                result = Mask.BlendV(result, zeroCaseResult, eitherIsZero);

                return result;
            }

            return new ushort4((ushort)gcd((uint)x.x, (uint)y.x), (ushort)gcd((uint)x.y, (uint)y.y), (ushort)gcd((uint)x.z, (uint)y.z), (ushort)gcd((uint)x.w, (uint)y.w));
        }
Exemple #11
0
        /// <summary>
        /// Arithmetic right shift of byte lanes by <paramref name="n"/>, assembled from two 16-bit
        /// arithmetic shifts whose results are merged with a 0xFF00 mask per 16-bit lane.
        /// </summary>
        internal static v128 shra_byte(v128 x, int n)
        {
            v128 ODD_BYTE_SELECT = new v128(0xFF00_FF00);

            // Shifting the 16-bit lane directly gives the result for the upper (odd) byte.
            v128 oddBytes = shra_short(x, n);
            // The lower (even) byte is first moved into the upper half so the shift sees its sign bit,
            // then shifted back down by the extra 8 positions.
            v128 evenBytes = shra_short(shl_short(x, 8), n + 8);

            return Mask.BlendV(evenBytes, oddBytes, ODD_BYTE_SELECT);
        }
Exemple #12
0
        /// <summary>
        /// Builds a <c>float8</c> from two <c>float3</c> values followed by a <c>float2</c>,
        /// in the order x012, x345, x67.
        /// </summary>
        public float8(float3 x012, float3 x345, float2 x67)
        {
            if (Sse2.IsSse2Supported)
            {
                v128 raw345 = *(v128 *)&x345;

                // Upper half: the last two floats of x345 followed by both floats of x67.
                v128 upperHalf = Sse2.unpacklo_pd(Sse2.bsrli_si128(raw345, sizeof(float)), *(v128 *)&x67);

                // First float of x345 moved up into slot 3, ready to be merged behind x012.
                v128 fourthSlot = Sse2.bslli_si128(raw345, 3 * sizeof(float));
                v128 lowerHalf;

                if (Sse4_1.IsSse41Supported)
                {
                    lowerHalf = Sse4_1.blend_ps(*(v128 *)&x012, fourthSlot, 0b1000);
                }
                else
                {
                    // Byte mask equivalent of the blend_ps immediate above: replace only slot 3.
                    lowerHalf = Mask.BlendV(*(v128 *)&x012, fourthSlot, new v128(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255, 255, 255, 255), false);
                }

                this = new float8(*(float4 *)&lowerHalf, *(float4 *)&upperHalf);
            }
            else
            {
                this = new float8
                {
                    _v4_0 = new float4(x012, x345.x),
                    _v4_4 = new float4(x345.yz, x67)
                };
            }
        }
Exemple #13
0
        /// <summary>
        /// SSE2 fallback that computes the lane-wise remainder of two <c>byte16</c> vectors with binary
        /// shift-and-subtract long division, consuming one dividend bit per step, most significant bit first.
        /// </summary>
        /// <param name="dividend">Numerators, one per lane.</param>
        /// <param name="divisor">Denominators, one per lane; every lane is asserted to be non-zero.</param>
        /// <returns>The per-lane remainder.</returns>
        /// <exception cref="CPUFeatureCheckException">Thrown when SSE2 is not supported.</exception>
        internal static byte16 vrem_byte_SSE_FALLBACK(byte16 dividend, byte16 divisor)
        {
            Assert.AreNotEqual(divisor.x0, 0);
            Assert.AreNotEqual(divisor.x1, 0);
            Assert.AreNotEqual(divisor.x2, 0);
            Assert.AreNotEqual(divisor.x3, 0);
            Assert.AreNotEqual(divisor.x4, 0);
            Assert.AreNotEqual(divisor.x5, 0);
            Assert.AreNotEqual(divisor.x6, 0);
            Assert.AreNotEqual(divisor.x7, 0);
            Assert.AreNotEqual(divisor.x8, 0);
            Assert.AreNotEqual(divisor.x9, 0);
            Assert.AreNotEqual(divisor.x10, 0);
            Assert.AreNotEqual(divisor.x11, 0);
            Assert.AreNotEqual(divisor.x12, 0);
            Assert.AreNotEqual(divisor.x13, 0);
            Assert.AreNotEqual(divisor.x14, 0);
            Assert.AreNotEqual(divisor.x15, 0);

            if (Sse2.IsSse2Supported)
            {
                byte16 partial = byte16.zero;

                // First step (bit 7), peeled: the partial remainder is still zero, so no shift is needed.
                partial |= (new byte16(1) & (dividend >> 7));

                // min(divisor, partial) == divisor  <=>  divisor <= partial (unsigned).
                v128 divisorFits = Sse2.cmpeq_epi8(maxmath.min(divisor, partial), divisor);
                partial -= Mask.BlendV(default(v128), divisor, divisorFits);

                // Bits 6 down to 1.
                for (int bit = 6; bit > 0; bit--)
                {
                    partial <<= 1;
                    partial |= (new byte16(1) & (dividend >> bit));

                    divisorFits = Sse2.cmpeq_epi8(maxmath.min(divisor, partial), divisor);
                    partial -= Mask.BlendV(default(v128), divisor, divisorFits);
                }

                // Last step (bit 0), peeled: the dividend needs no shift.
                partial <<= 1;
                partial |= new byte16(1) & dividend;

                divisorFits = Sse2.cmpeq_epi8(maxmath.min(divisor, partial), divisor);
                partial -= Mask.BlendV(default(v128), divisor, divisorFits);

                return partial;
            }

            throw new CPUFeatureCheckException();
        }
        /// <summary>
        /// Raises every lane of <paramref name="x"/> to the matching exponent in <paramref name="n"/>
        /// using exponentiation by squaring. Lanes finish independently; the loop runs until the
        /// exponent of every lane has been shifted down to zero.
        /// </summary>
        public static sbyte16 intpow(sbyte16 x, byte16 n)
        {
            if (Sse2.IsSse2Supported)
            {
                v128    ZERO = default(v128);
                sbyte16 ONE  = new sbyte16(1);

                v128 finished = ZERO;
                v128 result   = ZERO;

                sbyte16 squaredBase = x;
                sbyte16 product     = ONE;

                while (true)
                {
                    // Multiply the running product by the current power wherever the lowest exponent bit is set.
                    v128 multiplied = product * squaredBase;
                    product = Mask.BlendV(product, multiplied, Sse2.cmpeq_epi8(ONE, Sse2.and_si128(ONE, n)));

                    n >>= 1;

                    // Latch the product of lanes whose exponent has just run out.
                    v128 exponentSpent = Sse2.cmpeq_epi8(ZERO, n);
                    result   = Mask.BlendV(result, product, Sse2.andnot_si128(finished, exponentSpent));
                    finished = exponentSpent;

                    if (bitmask32(16 * sizeof(sbyte)) == Sse2.movemask_epi8(finished))
                    {
                        return result;
                    }

                    squaredBase *= squaredBase;
                }
            }

            return new sbyte16((sbyte)intpow((int)x.x0, n.x0),
                               (sbyte)intpow((int)x.x1, n.x1),
                               (sbyte)intpow((int)x.x2, n.x2),
                               (sbyte)intpow((int)x.x3, n.x3),
                               (sbyte)intpow((int)x.x4, n.x4),
                               (sbyte)intpow((int)x.x5, n.x5),
                               (sbyte)intpow((int)x.x6, n.x6),
                               (sbyte)intpow((int)x.x7, n.x7),
                               (sbyte)intpow((int)x.x8, n.x8),
                               (sbyte)intpow((int)x.x9, n.x9),
                               (sbyte)intpow((int)x.x10, n.x10),
                               (sbyte)intpow((int)x.x11, n.x11),
                               (sbyte)intpow((int)x.x12, n.x12),
                               (sbyte)intpow((int)x.x13, n.x13),
                               (sbyte)intpow((int)x.x14, n.x14),
                               (sbyte)intpow((int)x.x15, n.x15));
        }
Exemple #15
0
        /// <summary>
        /// Lane-wise greatest common divisor of two <c>byte8</c> vectors, computed with a binary
        /// (shift-and-subtract) GCD. A lane in which either input is zero yields the other input.
        /// </summary>
        public static byte8 gcd(byte8 x, byte8 y)
        {
            if (Sse2.IsSse2Supported)
            {
                v128 ZERO = default(v128);

                v128 xIsZero      = Sse2.cmpeq_epi8(x, ZERO);
                v128 yIsZero      = Sse2.cmpeq_epi8(y, ZERO);
                v128 eitherIsZero = Sse2.or_si128(xIsZero, yIsZero);

                // Answer for lanes with a zero operand: y where x is zero, x where y is zero.
                v128 zeroCaseResult = Mask.BlendV(ZERO, y, xIsZero);
                zeroCaseResult      = Mask.BlendV(zeroCaseResult, x, yIsZero);

                v128 result   = ZERO;
                // Lanes with a zero operand are treated as finished from the start.
                v128 finished = eitherIsZero;

                // Power of two shared by both operands; shifted back in after the loop.
                byte8 commonShift = tzcnt(x | y);

                x = shrl(x, tzcnt(x));

                do
                {
                    y = shrl(y, tzcnt(y));

                    // Order the pair so that x holds the smaller and y the larger value.
                    v128 previousX = x;

                    x = Sse2.min_epu8(x, y);
                    y = Sse2.max_epu8(y, previousX);

                    y -= x;

                    // Lanes whose difference just reached zero take their result from x.
                    v128 newlyFinished = Sse2.andnot_si128(finished, Sse2.cmpeq_epi8(y, ZERO));
                    result   = Mask.BlendV(result, x, newlyFinished);
                    finished = Sse2.or_si128(finished, newlyFinished);
                } while (-1 != finished.SLong0);

                result = shl(result, commonShift);
                result = Mask.BlendV(result, zeroCaseResult, eitherIsZero);

                return result;
            }

            return new byte8((byte)gcd((uint)x.x0, (uint)y.x0),
                             (byte)gcd((uint)x.x1, (uint)y.x1),
                             (byte)gcd((uint)x.x2, (uint)y.x2),
                             (byte)gcd((uint)x.x3, (uint)y.x3),
                             (byte)gcd((uint)x.x4, (uint)y.x4),
                             (byte)gcd((uint)x.x5, (uint)y.x5),
                             (byte)gcd((uint)x.x6, (uint)y.x6),
                             (byte)gcd((uint)x.x7, (uint)y.x7));
        }
Exemple #16
0
 /// <summary>Returns the lane-wise minimum of two <c>ulong2</c> vectors.</summary>
 public static ulong2 min(ulong2 a, ulong2 b)
 {
     if (Sse2.IsSse2Supported)
     {
         // Take b in the lanes where a is the greater value, a everywhere else.
         v128 aIsGreater = Operator.greater_mask_ulong(a, b);

         return Mask.BlendV(a, b, aIsGreater);
     }

     return new ulong2(math.min(a.x, b.x), math.min(a.y, b.y));
 }
Exemple #17
0
 /// <summary>
 /// Returns the negated absolute value of every lane: negative lanes are kept as they are,
 /// all other lanes are negated.
 /// </summary>
 public static long2 nabs(long2 x)
 {
     if (Sse2.IsSse2Supported)
     {
         // Mask of the lanes where 0 > x; those keep x, the rest take -x.
         return Mask.BlendV(-x,
                            x,
                            Operator.greater_mask_long(default(v128), x));
     }

     return new long2(nabs(x.x), nabs(x.y));
 }
Exemple #18
0
 /// <summary>Returns the lane-wise maximum of two <c>long2</c> vectors.</summary>
 public static long2 max(long2 a, long2 b)
 {
     if (Sse2.IsSse2Supported)
     {
         // Take b in the lanes where b is the greater value, a everywhere else.
         return Mask.BlendV(a,
                            b,
                            Operator.greater_mask_long(b, a));
     }

     return new long2(math.max(a.x, b.x), math.max(a.y, b.y));
 }
Exemple #19
0
        /// <summary>
        /// Lane-wise greatest common divisor of two <c>ulong2</c> vectors, computed with a binary
        /// (shift-and-subtract) GCD. A lane in which either input is zero yields the other input.
        /// </summary>
        public static ulong2 gcd(ulong2 x, ulong2 y)
        {
            if (Sse2.IsSse2Supported)
            {
                v128 ZERO = default(v128);

                v128 xIsZero      = Operator.equals_mask_long(x, ZERO);
                v128 yIsZero      = Operator.equals_mask_long(y, ZERO);
                v128 eitherIsZero = Sse2.or_si128(xIsZero, yIsZero);

                // Answer for lanes with a zero operand: y where x is zero, x where y is zero.
                v128 zeroCaseResult = Mask.BlendV(ZERO, y, xIsZero);
                zeroCaseResult      = Mask.BlendV(zeroCaseResult, x, yIsZero);

                v128 result   = ZERO;
                // Lanes with a zero operand are treated as finished from the start.
                v128 finished = eitherIsZero;

                // Power of two shared by both operands; shifted back in after the loop.
                ulong2 commonShift = tzcnt(x | y);

                x = shrl(x, tzcnt(x));

                do
                {
                    y = shrl(y, tzcnt(y));

                    // Order the pair so that x holds the smaller and y the larger value.
                    v128 previousX = x;
                    v128 xIsLarger = Operator.greater_mask_ulong(x, y);

                    x = Mask.BlendV(x, y, xIsLarger);
                    y = Mask.BlendV(y, previousX, xIsLarger);

                    y -= x;

                    // Lanes whose difference just reached zero take their result from x.
                    v128 newlyFinished = Sse2.andnot_si128(finished, Operator.equals_mask_long(y, ZERO));
                    result   = Mask.BlendV(result, x, newlyFinished);
                    finished = Sse2.or_si128(finished, newlyFinished);
                } while (bitmask32(2 * sizeof(ulong)) != Sse2.movemask_epi8(finished));

                result = shl(result, commonShift);
                result = Mask.BlendV(result, zeroCaseResult, eitherIsZero);

                return result;
            }

            return new ulong2(gcd(x.x, y.x), gcd(x.y, y.y));
        }
Exemple #20
0
        /// <summary>
        /// Lane-wise greatest common divisor of two <c>byte4</c> vectors, computed with a binary
        /// (shift-and-subtract) GCD. A lane in which either input is zero yields the other input.
        /// </summary>
        public static byte4 gcd(byte4 x, byte4 y)
        {
            if (Sse2.IsSse2Supported)
            {
                v128 ZERO = default(v128);

                v128 xIsZero      = Sse2.cmpeq_epi8(x, ZERO);
                v128 yIsZero      = Sse2.cmpeq_epi8(y, ZERO);
                v128 eitherIsZero = Sse2.or_si128(xIsZero, yIsZero);

                // Answer for lanes with a zero operand: y where x is zero, x where y is zero.
                v128 zeroCaseResult = Mask.BlendV(ZERO, y, xIsZero);
                zeroCaseResult      = Mask.BlendV(zeroCaseResult, x, yIsZero);

                v128 result   = ZERO;
                // Lanes with a zero operand are treated as finished from the start.
                v128 finished = eitherIsZero;

                // Power of two shared by both operands; shifted back in after the loop.
                byte4 commonShift = tzcnt(x | y);

                x = shrl(x, tzcnt(x));

                do
                {
                    y = shrl(y, tzcnt(y));

                    // Order the pair so that x holds the smaller and y the larger value.
                    v128 previousX = x;

                    x = Sse2.min_epu8(x, y);
                    y = Sse2.max_epu8(y, previousX);

                    y -= x;

                    // Lanes whose difference just reached zero take their result from x.
                    v128 newlyFinished = Sse2.andnot_si128(finished, Sse2.cmpeq_epi8(y, ZERO));
                    result   = Mask.BlendV(result, x, newlyFinished);
                    finished = Sse2.or_si128(finished, newlyFinished);
                } while (-1 != finished.SInt0);

                result = shl(result, commonShift);
                result = Mask.BlendV(result, zeroCaseResult, eitherIsZero);

                return result;
            }

            return new byte4((byte)gcd((uint)x.x, (uint)y.x), (byte)gcd((uint)x.y, (uint)y.y), (byte)gcd((uint)x.z, (uint)y.z), (byte)gcd((uint)x.w, (uint)y.w));
        }
Exemple #21
0
 /// <summary>Returns the lane-wise maximum of two <c>sbyte8</c> vectors.</summary>
 public static sbyte8 max(sbyte8 a, sbyte8 b)
 {
     if (Sse4_1.IsSse41Supported)
     {
         return Sse4_1.max_epi8(a, b);
     }

     if (Sse2.IsSse2Supported)
     {
         // No signed byte max in SSE2: take b in the lanes where b > a, a everywhere else.
         v128 bIsGreater = Sse2.cmpgt_epi8(b, a);

         return Mask.BlendV(a, b, bIsGreater);
     }

     return new sbyte8((sbyte)math.max(a.x0, b.x0), (sbyte)math.max(a.x1, b.x1), (sbyte)math.max(a.x2, b.x2), (sbyte)math.max(a.x3, b.x3), (sbyte)math.max(a.x4, b.x4), (sbyte)math.max(a.x5, b.x5), (sbyte)math.max(a.x6, b.x6), (sbyte)math.max(a.x7, b.x7));
 }
Exemple #22
0
 /// <summary>Returns the lane-wise minimum of two <c>sbyte4</c> vectors.</summary>
 public static sbyte4 min(sbyte4 a, sbyte4 b)
 {
     if (Sse4_1.IsSse41Supported)
     {
         return Sse4_1.min_epi8(a, b);
     }

     if (Sse2.IsSse2Supported)
     {
         // No signed byte min in SSE2: take b in the lanes where a > b, a everywhere else.
         v128 aIsGreater = Sse2.cmpgt_epi8(a, b);

         return Mask.BlendV(a, b, aIsGreater);
     }

     return new sbyte4((sbyte)math.min(a.x, b.x), (sbyte)math.min(a.y, b.y), (sbyte)math.min(a.z, b.z), (sbyte)math.min(a.w, b.w));
 }
Exemple #23
0
 /// <summary>Returns the lane-wise minimum of two <c>ushort3</c> vectors.</summary>
 public static ushort3 min(ushort3 a, ushort3 b)
 {
     if (Sse4_1.IsSse41Supported)
     {
         return Sse4_1.min_epu16(a, b);
     }

     if (Sse2.IsSse2Supported)
     {
         // Take b in the lanes where a is the greater value, a everywhere else.
         v128 aIsGreater = Operator.greater_mask_ushort(a, b);

         return Mask.BlendV(a, b, aIsGreater);
     }

     return new ushort3((ushort)math.min((uint)a.x, (uint)b.x), (ushort)math.min((uint)a.y, (uint)b.y), (ushort)math.min((uint)a.z, (uint)b.z));
 }
 /// <summary>
 /// Lane-wise minimum of two registers compared as signed 32-bit integers.
 /// </summary>
 /// <exception cref="CPUFeatureCheckException">Thrown when neither SSE4.1 nor SSE2 is supported.</exception>
 internal static v128 min_int(v128 a, v128 b)
 {
     if (Sse4_1.IsSse41Supported)
     {
         return Sse4_1.min_epi32(a, b);
     }

     if (Sse2.IsSse2Supported)
     {
         // Take b in the lanes where a > b, a everywhere else.
         v128 aIsGreater = Sse2.cmpgt_epi32(a, b);

         return Mask.BlendV(a, b, aIsGreater);
     }

     throw new CPUFeatureCheckException();
 }
Exemple #25
0
 /// <summary>
 /// Returns the negated absolute value of every lane: negative lanes are kept as they are,
 /// all other lanes are negated.
 /// </summary>
 public static sbyte16 nabs(sbyte16 x)
 {
     if (Ssse3.IsSsse3Supported)
     {
         // 0 - |x|
         v128 magnitude = Ssse3.abs_epi8(x);

         return Sse2.sub_epi8(default(v128), magnitude);
     }

     if (Sse2.IsSse2Supported)
     {
         // Mask of the lanes where 0 > x; those keep x, the rest take -x.
         v128 isNegative = Sse2.cmpgt_epi8(default(v128), x);

         return Mask.BlendV(-x, x, isNegative);
     }

     return new sbyte16((sbyte)nabs((int)x.x0), (sbyte)nabs((int)x.x1), (sbyte)nabs((int)x.x2), (sbyte)nabs((int)x.x3), (sbyte)nabs((int)x.x4), (sbyte)nabs((int)x.x5), (sbyte)nabs((int)x.x6), (sbyte)nabs((int)x.x7), (sbyte)nabs((int)x.x8), (sbyte)nabs((int)x.x9), (sbyte)nabs((int)x.x10), (sbyte)nabs((int)x.x11), (sbyte)nabs((int)x.x12), (sbyte)nabs((int)x.x13), (sbyte)nabs((int)x.x14), (sbyte)nabs((int)x.x15));
 }
Exemple #26
0
 /// <summary>Returns the lane-wise maximum of two <c>ushort4</c> vectors.</summary>
 public static ushort4 max(ushort4 a, ushort4 b)
 {
     if (Sse4_1.IsSse41Supported)
     {
         return Sse4_1.max_epu16(a, b);
     }

     if (Sse2.IsSse2Supported)
     {
         // Take b in the lanes where b is the greater value, a everywhere else.
         v128 bIsGreater = Operator.greater_mask_ushort(b, a);

         return Mask.BlendV(a, b, bIsGreater);
     }

     return new ushort4((ushort)math.max((uint)a.x, (uint)b.x), (ushort)math.max((uint)a.y, (uint)b.y), (ushort)math.max((uint)a.z, (uint)b.z), (ushort)math.max((uint)a.w, (uint)b.w));
 }
Exemple #27
0
 /// <summary>Returns the lane-wise maximum of two <c>sbyte2</c> vectors.</summary>
 public static sbyte2 max(sbyte2 a, sbyte2 b)
 {
     if (Sse4_1.IsSse41Supported)
     {
         return Sse4_1.max_epi8(a, b);
     }

     if (Sse2.IsSse2Supported)
     {
         // No signed byte max in SSE2: take b in the lanes where b > a, a everywhere else.
         v128 bIsGreater = Sse2.cmpgt_epi8(b, a);

         return Mask.BlendV(a, b, bIsGreater);
     }

     return new sbyte2((sbyte)math.max(a.x, b.x), (sbyte)math.max(a.y, b.y));
 }
Exemple #28
0
 /// <summary>Returns the lane-wise maximum of two <c>ushort8</c> vectors.</summary>
 public static ushort8 max(ushort8 a, ushort8 b)
 {
     if (Sse4_1.IsSse41Supported)
     {
         return Sse4_1.max_epu16(a, b);
     }

     if (Sse2.IsSse2Supported)
     {
         // Take b in the lanes where b is the greater value, a everywhere else.
         v128 bIsGreater = Operator.greater_mask_ushort(b, a);

         return Mask.BlendV(a, b, bIsGreater);
     }

     return new ushort8((ushort)math.max((uint)a.x0, (uint)b.x0), (ushort)math.max((uint)a.x1, (uint)b.x1), (ushort)math.max((uint)a.x2, (uint)b.x2), (ushort)math.max((uint)a.x3, (uint)b.x3), (ushort)math.max((uint)a.x4, (uint)b.x4), (ushort)math.max((uint)a.x5, (uint)b.x5), (ushort)math.max((uint)a.x6, (uint)b.x6), (ushort)math.max((uint)a.x7, (uint)b.x7));
 }
 /// <summary>
 /// Lane-wise maximum of two registers compared as unsigned 32-bit integers.
 /// </summary>
 /// <exception cref="CPUFeatureCheckException">Thrown when neither SSE4.1 nor SSE2 is supported.</exception>
 internal static v128 max_uint(v128 a, v128 b)
 {
     if (Sse4_1.IsSse41Supported)
     {
         return Sse4_1.max_epu32(a, b);
     }

     if (Sse2.IsSse2Supported)
     {
         // Take b in the lanes where b is the greater value, a everywhere else.
         return Mask.BlendV(a,
                            b,
                            greater_mask_uint(b, a));
     }

     throw new CPUFeatureCheckException();
 }
Exemple #30
0
 /// <summary>Returns the lane-wise maximum of two <c>sbyte16</c> vectors.</summary>
 public static sbyte16 max(sbyte16 a, sbyte16 b)
 {
     if (Sse4_1.IsSse41Supported)
     {
         return Sse4_1.max_epi8(a, b);
     }

     if (Sse2.IsSse2Supported)
     {
         // No signed byte max in SSE2: take b in the lanes where b > a, a everywhere else.
         v128 bIsGreater = Sse2.cmpgt_epi8(b, a);

         return Mask.BlendV(a, b, bIsGreater);
     }

     return new sbyte16((sbyte)math.max(a.x0, b.x0), (sbyte)math.max(a.x1, b.x1), (sbyte)math.max(a.x2, b.x2), (sbyte)math.max(a.x3, b.x3), (sbyte)math.max(a.x4, b.x4), (sbyte)math.max(a.x5, b.x5), (sbyte)math.max(a.x6, b.x6), (sbyte)math.max(a.x7, b.x7), (sbyte)math.max(a.x8, b.x8), (sbyte)math.max(a.x9, b.x9), (sbyte)math.max(a.x10, b.x10), (sbyte)math.max(a.x11, b.x11), (sbyte)math.max(a.x12, b.x12), (sbyte)math.max(a.x13, b.x13), (sbyte)math.max(a.x14, b.x14), (sbyte)math.max(a.x15, b.x15));
 }