Example #1
0
        /// <summary>
        /// Builds an <see cref="int8"/> whose components are, in order,
        /// x01.xy, x234.xyz and x567.xyz.
        /// </summary>
        public int8(int2 x01, int3 x234, int3 x567)
        {
            if (Sse2.IsSse2Supported)
            {
                v128 raw234 = *(v128 *)&x234;

                // Low 64 bits of each source side by side: x0, x1, x2, x3.
                v128 lower = Sse2.unpacklo_epi64(*(v128 *)&x01, raw234);
                // Shift x234 down by two ints so that x4 sits in element 0.
                v128 x4InFront = Sse2.bsrli_si128(raw234, 2 * sizeof(int));
                // Shift x567 up by one int so that it occupies elements 1..3.
                v128 x567Raised = Sse2.bslli_si128(*(v128 *)&x567, sizeof(int));

                // Element 0 (16-bit lanes 0-1) is taken from x4InFront, everything else from x567Raised.
                v128 upper;
                if (Sse4_1.IsSse41Supported)
                {
                    upper = Sse4_1.blend_epi16(x4InFront, x567Raised, 0b1111_1100);
                }
                else
                {
                    upper = Mask.BlendEpi16_SSE2(x4InFront, x567Raised, 0b1111_1100);
                }

                this = new int8(*(int4 *)&lower, *(int4 *)&upper);
                return;
            }

            // Scalar fallback: assemble both halves component by component.
            this = new int8
            {
                _v4_0 = new int4(x01, x234.xy),
                _v4_4 = new int4(x234.z, x567)
            };
        }
Example #2
0
        /// <summary>
        /// Component-wise greatest common divisor of two <see cref="ushort4"/> vectors.
        /// The SSE2 path runs a shift-and-subtract (binary GCD style) loop on all lanes at once;
        /// the fallback calls the scalar <c>gcd</c> overload per component.
        /// </summary>
        /// <param name="x">First operand.</param>
        /// <param name="y">Second operand.</param>
        /// <returns>
        /// Per lane: the other operand when one operand is zero, otherwise the value the loop settled on,
        /// shifted left by <c>tzcnt(x | y)</c>.
        /// </returns>
        public static ushort4 gcd(ushort4 x, ushort4 y)
        {
            if (Sse2.IsSse2Supported)
            {
                v128 ZERO = default(v128);

                v128 result             = ZERO;
                v128 result_if_zero_any = ZERO;

                // Per-lane masks marking operands that are zero.
                v128 x_is_zero = Sse2.cmpeq_epi16(x, ZERO);
                v128 y_is_zero = Sse2.cmpeq_epi16(y, ZERO);
                v128 any_zero  = Sse2.or_si128(x_is_zero, y_is_zero);

                // Where x is zero the answer is y; where y is zero the answer is x.
                result_if_zero_any = Mask.BlendV(result_if_zero_any, y, x_is_zero);
                result_if_zero_any = Mask.BlendV(result_if_zero_any, x, y_is_zero);

                // Lanes with a zero operand are already answered, so they start out as "done".
                v128 doneMask = any_zero;

                // Presumably the number of trailing zero bits shared by x and y (the common power of two),
                // which is re-applied to the result after the loop.
                ushort4 shift = tzcnt(x | y);

                // Strip x's own trailing zeros before entering the loop.
                x = shrl(x, tzcnt(x));

                do
                {
                    y = shrl(y, tzcnt(y));

                    // Order each lane so that x holds the smaller and y the larger value.
                    if (Sse4_1.IsSse41Supported)
                    {
                        v128 tempX = x;

                        x = Sse4_1.min_epu16(x, y);
                        y = Sse4_1.max_epu16(y, tempX);
                    }
                    else
                    {
                        v128 tempX       = x;
                        v128 x_greater_y = Operator.greater_mask_ushort(x, y);

                        x = Mask.BlendV(x, y, x_greater_y);
                        y = Mask.BlendV(y, tempX, x_greater_y);
                    }

                    y -= x;

                    // Lanes whose y just reached zero (and were not done before) latch x as their result.
                    v128 loopCheck = Sse2.andnot_si128(doneMask, Sse2.cmpeq_epi16(y, ZERO));
                    result   = Mask.BlendV(result, x, loopCheck);
                    doneMask = Sse2.or_si128(doneMask, loopCheck);
                // Only the low 64 bits of doneMask are tested - presumably exactly the four 16-bit lanes of a ushort4.
                } while (-1 != doneMask.SLong0);

                result = shl(result, shift);

                // Lanes that had a zero operand take the precomputed answer instead of the loop result.
                result = Mask.BlendV(result, result_if_zero_any, any_zero);

                return(result);
            }
            else
            {
                return(new ushort4((ushort)gcd((uint)x.x, (uint)y.x), (ushort)gcd((uint)x.y, (uint)y.y), (ushort)gcd((uint)x.z, (uint)y.z), (ushort)gcd((uint)x.w, (uint)y.w)));
            }
        }
        /// <summary>
        /// Finds the smallest component of <paramref name="x"/> and reports both its value and its index.
        /// </summary>
        /// <param name="x">Vector to search.</param>
        /// <param name="min">Receives the smallest component value.</param>
        /// <returns>Index (0..3) of the first component equal to <paramref name="min"/>.</returns>
        public static int cminpos(ushort4 x, out ushort min)
        {
            if (Sse4_1.IsSse41Supported)
            {
                // Force the upper two 32-bit elements to all ones so that only the four
                // ushort4 lanes can be reported as the minimum.
                v128 padded      = Sse2.or_si128(x, new v128(0, 0, -1, -1));
                v128 minAndIndex = Sse4_1.minpos_epu16(padded);

                min = minAndIndex.UShort0;

                return minAndIndex.UShort1;
            }

            min = cmin(x);

            if (min == x.x)
            {
                return 0;
            }

            if (min == x.y)
            {
                return 1;
            }

            if (min == x.z)
            {
                return 2;
            }

            return 3;
        }
Example #4
0
        /// <summary>
        /// Builds an <see cref="int8"/> whose components are, in order,
        /// x012.xyz, x345.xyz and x67.xy.
        /// </summary>
        public int8(int3 x012, int3 x345, int2 x67)
        {
            if (Sse2.IsSse2Supported)
            {
                v128 raw345 = *(v128 *)&x345;

                // Drop x3 so that x4, x5 lead, then append the low 64 bits of x67: x4, x5, x6, x7.
                v128 x45InFront = Sse2.bsrli_si128(raw345, sizeof(int));
                v128 upper      = Sse2.unpacklo_epi64(x45InFront, *(v128 *)&x67);

                // Move x3 into element 3; the elements below it become zero.
                v128 x3OnTop = Sse2.bslli_si128(raw345, 3 * sizeof(int));

                // Elements 0..2 come from x012, element 3 (16-bit lanes 6-7) from x3OnTop.
                v128 lower;
                if (Sse4_1.IsSse41Supported)
                {
                    lower = Sse4_1.blend_epi16(*(v128 *)&x012, x3OnTop, 0b1100_0000);
                }
                else
                {
                    lower = Mask.BlendEpi16_SSE2(*(v128 *)&x012, x3OnTop, 0b1100_0000);
                }

                this = new int8(*(int4 *)&lower, *(int4 *)&upper);
                return;
            }

            // Scalar fallback: assemble both halves component by component.
            this = new int8
            {
                _v4_0 = new int4(x012, x345.x),
                _v4_4 = new int4(x345.yz, x67)
            };
        }
Example #5
0
        /// <summary>
        /// Builds a <see cref="byte8"/> whose components are, in order,
        /// x012.xyz, x34.xy and x567.xyz.
        /// </summary>
        public byte8(byte3 x012, byte2 x34, byte3 x567)
        {
            if (Sse2.IsSse2Supported)
            {
                // x567 moved up two bytes, leaving bytes 0-1 free for x34.
                v128 x567Raised = Sse2.bslli_si128(x567, 2 * sizeof(byte));

                // 16-bit lane 0 from x34, lanes 1-2 from x567Raised: x3..x7 end up in bytes 0..4.
                v128 x34567;
                if (Sse4_1.IsSse41Supported)
                {
                    x34567 = Sse4_1.blend_epi16(x34, x567Raised, 0b0110);
                }
                else
                {
                    x34567 = Mask.BlendEpi16_SSE2(x34, x567Raised, 0b0110);
                }

                // Move x3..x7 up three bytes, then fill bytes 0..2 from x012.
                v128 tail = Sse2.bslli_si128(x34567, 3 * sizeof(byte));

                this = Mask.BlendV(x012, tail, new byte8(0, 0, 0, 255, 255, 255, 255, 255));
                return;
            }

            // Scalar fallback
            x0 = x012.x;
            x1 = x012.y;
            x2 = x012.z;
            x3 = x34.x;
            x4 = x34.y;
            x5 = x567.x;
            x6 = x567.y;
            x7 = x567.z;
        }
Example #6
0
        /// <summary>
        /// Builds a <see cref="byte8"/> whose components are, in order,
        /// x01.xy, x234.xyz and x567.xyz.
        /// </summary>
        public byte8(byte2 x01, byte3 x234, byte3 x567)
        {
            if (Sse2.IsSse2Supported)
            {
                // Move each source to the byte offset of its first destination component.
                v128 x234Raised = Sse2.bslli_si128(x234, 2 * sizeof(byte));
                v128 x567Raised = Sse2.bslli_si128(x567, 5 * sizeof(byte));

                // Bytes 5..7 from x567Raised, the remaining bytes from x234Raised.
                v128 tail = Mask.BlendV(x234Raised, x567Raised, new byte8(0, 0, 0, 0, 0, 255, 255, 255));

                // 16-bit lane 0 (bytes 0-1) from x01, lanes 1..3 from the merged tail.
                if (Sse4_1.IsSse41Supported)
                {
                    this = Sse4_1.blend_epi16(x01, tail, 0b1110);
                }
                else
                {
                    this = Mask.BlendEpi16_SSE2(x01, tail, 0b1110);
                }

                return;
            }

            // Scalar fallback
            x0 = x01.x;
            x1 = x01.y;
            x2 = x234.x;
            x3 = x234.y;
            x4 = x234.z;
            x5 = x567.x;
            x6 = x567.y;
            x7 = x567.z;
        }
        /// <summary>
        /// Multiplies the two 64-bit lanes of <paramref name="left"/> and <paramref name="right"/>,
        /// keeping the low 64 bits of each product.
        /// </summary>
        /// <exception cref="CPUFeatureCheckException">Neither SSE4.1 nor SSE2 is available.</exception>
        internal static v128 mul_long(v128 left, v128 right)
        {
            if (Sse4_1.IsSse41Supported)
            {
                // Full 64-bit products of the low 32-bit halves of each lane.
                v128 lowTimesLow = Sse2.mul_epu32(left, right);

                // Swap the 32-bit halves inside each 64-bit lane of right so that the
                // element-wise multiply yields the two cross products (lo*hi and hi*lo).
                v128 rightSwapped  = Sse2.shuffle_epi32(right, Sse.SHUFFLE(2, 3, 0, 1));
                v128 crossProducts = Sse4_1.mullo_epi32(left, rightSwapped);
                // Sum each lane's pair of cross products; the sums land in elements 0 and 1.
                v128 crossSums     = Ssse3.hadd_epi32(crossProducts, default(v128));
                // Place each sum in the upper 32 bits of its 64-bit lane, zeros below.
                v128 crossShifted  = Sse2.shuffle_epi32(crossSums, Sse.SHUFFLE(1, 3, 0, 3));

                return Sse2.add_epi64(crossShifted, lowTimesLow);
            }

            if (Sse2.IsSse2Supported)
            {
                // Schoolbook multiplication on 32-bit halves.
                v128 lowTimesLow   = Sse2.mul_epu32(left, right);
                v128 leftHigh      = Sse2.srli_epi64(left, 32);

                v128 highTimesLow  = Sse2.mul_epu32(leftHigh, right);
                v128 rightHigh     = Sse2.srli_epi64(right, 32);
                v128 lowTimesHigh  = Sse2.mul_epu32(left, rightHigh);

                // The cross terms only contribute to the upper 32 bits of the result.
                v128 crossSum      = Sse2.add_epi64(highTimesLow, lowTimesHigh);
                v128 crossShifted  = Sse2.slli_epi64(crossSum, 32);

                return Sse2.add_epi64(crossShifted, lowTimesLow);
            }

            throw new CPUFeatureCheckException();
        }
Example #8
0
        /// <summary>
        /// Builds a <see cref="byte8"/> whose components are, in order,
        /// x012.xyz, x345.xyz and x67.xy.
        /// </summary>
        public byte8(byte3 x012, byte3 x345, byte2 x67)
        {
            if (Sse2.IsSse2Supported)
            {
                // Move each source to the byte offset of its first destination component.
                v128 x345Raised = Sse2.bslli_si128(x345, 3 * sizeof(byte));
                v128 x67Raised  = Sse2.bslli_si128(x67, 6 * sizeof(byte));

                // Bytes 3..5 from x345Raised, the remaining bytes from x012.
                v128 head = Mask.BlendV(x012, x345Raised, new byte8(0, 0, 0, 255, 255, 255, 0, 0));

                // 16-bit lane 3 (bytes 6-7) from x67Raised, lanes 0..2 from the merged head.
                if (Sse4_1.IsSse41Supported)
                {
                    this = Sse4_1.blend_epi16(head, x67Raised, 0b1000);
                }
                else
                {
                    this = Mask.BlendEpi16_SSE2(head, x67Raised, 0b1000);
                }

                return;
            }

            // Scalar fallback
            x0 = x012.x;
            x1 = x012.y;
            x2 = x012.z;
            x3 = x345.x;
            x4 = x345.y;
            x5 = x345.z;
            x6 = x67.x;
            x7 = x67.y;
        }
Example #9
0
        /// <summary>
        /// Builds a <see cref="float8"/> whose components are, in order,
        /// x01.xy, x234.xyz and x567.xyz.
        /// </summary>
        public float8(float2 x01, float3 x234, float3 x567)
        {
            if (Sse2.IsSse2Supported)
            {
                v128 raw234 = *(v128 *)&x234;

                // Low 64 bits of each source side by side: x0, x1, x2, x3.
                v128 lower = Sse2.unpacklo_pd(*(v128 *)&x01, raw234);
                // Shift x234 down by two floats so that x4 sits in element 0.
                v128 x4InFront = Sse2.bsrli_si128(raw234, 2 * sizeof(float));
                // Shift x567 up by one float so that it occupies elements 1..3.
                v128 x567Raised = Sse2.bslli_si128(*(v128 *)&x567, sizeof(float));

                // Element 0 is taken from x4InFront, elements 1..3 from x567Raised.
                v128 upper;
                if (Sse4_1.IsSse41Supported)
                {
                    upper = Sse4_1.blend_ps(x4InFront, x567Raised, 0b1110);
                }
                else
                {
                    // Byte mask equivalent of 0b1110: bytes 4..15 selected.
                    v128 selectUpperThree = new v128(0, 0, 0, 0, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255);

                    upper = Mask.BlendV(x4InFront, x567Raised, selectUpperThree, false);
                }

                this = new float8(*(float4 *)&lower, *(float4 *)&upper);
                return;
            }

            // Scalar fallback: assemble both halves component by component.
            this = new float8
            {
                _v4_0 = new float4(x01, x234.xy),
                _v4_4 = new float4(x234.z, x567)
            };
        }
Example #10
0
        /// <summary>
        /// Builds a <see cref="ushort8"/> whose components are, in order,
        /// x01.xy, x234.xyz and x567.xyz.
        /// </summary>
        public ushort8(ushort2 x01, ushort3 x234, ushort3 x567)
        {
            if (Sse2.IsSse2Supported)
            {
                // Move each source to the lane of its first destination component.
                v128 x234Raised = Sse2.bslli_si128(x234, 2 * sizeof(ushort));
                v128 x567Raised = Sse2.bslli_si128(x567, 5 * sizeof(ushort));

                // First merge lanes 5..7 (x567) over x234Raised, then lanes 2..7 of that over x01.
                if (Sse4_1.IsSse41Supported)
                {
                    v128 tail = Sse4_1.blend_epi16(x234Raised, x567Raised, 0b1110_0000);

                    this = Sse4_1.blend_epi16(x01, tail, 0b1111_1100);
                }
                else
                {
                    v128 tail = Mask.BlendEpi16_SSE2(x234Raised, x567Raised, 0b1110_0000);

                    this = Mask.BlendEpi16_SSE2(x01, tail, 0b1111_1100);
                }

                return;
            }

            // Scalar fallback
            x0 = x01.x;
            x1 = x01.y;
            x2 = x234.x;
            x3 = x234.y;
            x4 = x234.z;
            x5 = x567.x;
            x6 = x567.y;
            x7 = x567.z;
        }
Example #11
0
        /// <summary>
        /// Builds a <see cref="float8"/> whose components are, in order,
        /// x012.xyz, x345.xyz and x67.xy.
        /// </summary>
        public float8(float3 x012, float3 x345, float2 x67)
        {
            if (Sse2.IsSse2Supported)
            {
                v128 raw345 = *(v128 *)&x345;

                // Drop x3 so that x4, x5 lead, then append the low 64 bits of x67: x4, x5, x6, x7.
                v128 x45InFront = Sse2.bsrli_si128(raw345, sizeof(float));
                v128 upper      = Sse2.unpacklo_pd(x45InFront, *(v128 *)&x67);

                // Move x3 into element 3; the elements below it become zero.
                v128 x3OnTop = Sse2.bslli_si128(raw345, 3 * sizeof(float));

                // Elements 0..2 come from x012, element 3 from x3OnTop.
                v128 lower;
                if (Sse4_1.IsSse41Supported)
                {
                    lower = Sse4_1.blend_ps(*(v128 *)&x012, x3OnTop, 0b1000);
                }
                else
                {
                    // Byte mask equivalent of 0b1000: bytes 12..15 selected.
                    v128 selectTopElement = new v128(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255, 255, 255, 255);

                    lower = Mask.BlendV(*(v128 *)&x012, x3OnTop, selectTopElement, false);
                }

                this = new float8(*(float4 *)&lower, *(float4 *)&upper);
                return;
            }

            // Scalar fallback: assemble both halves component by component.
            this = new float8
            {
                _v4_0 = new float4(x012, x345.x),
                _v4_4 = new float4(x345.yz, x67)
            };
        }
Example #12
0
        /// <summary>
        /// Builds a <see cref="ushort8"/> whose components are, in order,
        /// x012.xyz, x34.xy and x567.xyz.
        /// </summary>
        public ushort8(ushort3 x012, ushort2 x34, ushort3 x567)
        {
            if (Sse2.IsSse2Supported)
            {
                // x567 moved up two lanes, leaving lanes 0-1 free for x34.
                v128 x567Raised = Sse2.bslli_si128(x567, 2 * sizeof(ushort));

                if (Sse4_1.IsSse41Supported)
                {
                    // Lanes 2..4 from x567Raised, the rest from x34: x3..x7 in lanes 0..4.
                    v128 x34567 = Sse4_1.blend_epi16(x34, x567Raised, 0b0001_1100);
                    // Move x3..x7 up three lanes to their final position.
                    v128 tail   = Sse2.bslli_si128(x34567, 3 * sizeof(ushort));

                    // Lanes 0..2 from x012, lanes 3..7 from the tail.
                    this = Sse4_1.blend_epi16(x012, tail, 0b1111_1000);
                }
                else
                {
                    // Same sequence using the SSE2 blend helper.
                    v128 x34567 = Mask.BlendEpi16_SSE2(x34, x567Raised, 0b0001_1100);
                    v128 tail   = Sse2.bslli_si128(x34567, 3 * sizeof(ushort));

                    this = Mask.BlendEpi16_SSE2(x012, tail, 0b1111_1000);
                }

                return;
            }

            // Scalar fallback
            x0 = x012.x;
            x1 = x012.y;
            x2 = x012.z;
            x3 = x34.x;
            x4 = x34.y;
            x5 = x567.x;
            x6 = x567.y;
            x7 = x567.z;
        }
Example #13
0
        /// <summary>
        /// Builds a <see cref="ushort8"/> whose components are, in order,
        /// x012.xyz, x345.xyz and x67.xy.
        /// </summary>
        public ushort8(ushort3 x012, ushort3 x345, ushort2 x67)
        {
            if (Sse2.IsSse2Supported)
            {
                // Move each source to the lane of its first destination component.
                v128 x345Raised = Sse2.bslli_si128(x345, 3 * sizeof(ushort));
                v128 x67Raised  = Sse2.bslli_si128(x67, 6 * sizeof(ushort));

                // First merge lanes 3..5 (x345) over x012, then lanes 6..7 (x67) over that.
                if (Sse4_1.IsSse41Supported)
                {
                    v128 head = Sse4_1.blend_epi16(x012, x345Raised, 0b0011_1000);

                    this = Sse4_1.blend_epi16(head, x67Raised, 0b1100_0000);
                }
                else
                {
                    v128 head = Mask.BlendEpi16_SSE2(x012, x345Raised, 0b0011_1000);

                    this = Mask.BlendEpi16_SSE2(head, x67Raised, 0b1100_0000);
                }

                return;
            }

            // Scalar fallback
            x0 = x012.x;
            x1 = x012.y;
            x2 = x012.z;
            x3 = x345.x;
            x4 = x345.y;
            x5 = x345.z;
            x6 = x67.x;
            x7 = x67.y;
        }
Example #14
0
 /// <summary>
 /// Selects between <paramref name="a"/> and <paramref name="b"/> under control of <paramref name="mask"/>:
 /// where the mask is set the result comes from <paramref name="b"/>, otherwise from <paramref name="a"/>.
 /// </summary>
 /// <param name="a">Value used where the mask is clear.</param>
 /// <param name="b">Value used where the mask is set.</param>
 /// <param name="mask">Selection mask.</param>
 /// <param name="integer">True to use the integer (byte) instructions, false for the single-precision float ones.</param>
 /// <exception cref="CPUFeatureCheckException">Neither SSE4.1 nor SSE2 is available.</exception>
 internal static v128 BlendV(v128 a, v128 b, v128 mask, bool integer = true)
 {
     if (Sse4_1.IsSse41Supported)
     {
         if (integer)
         {
             return Sse4_1.blendv_epi8(a, b, mask);
         }

         return Sse4_1.blendv_ps(a, b, mask);
     }

     if (Sse2.IsSse2Supported)
     {
         // UNSAFE - this fallback blends bit-by-bit, not byte-by-byte:
         // every mask bit selects exactly the matching bit of a or b.
         if (integer)
         {
             v128 bitsFromB = Sse2.and_si128(mask, b);
             v128 bitsFromA = Sse2.andnot_si128(mask, a);

             return Sse2.or_si128(bitsFromB, bitsFromA);
         }
         else
         {
             v128 bitsFromB = Sse.and_ps(mask, b);
             v128 bitsFromA = Sse.andnot_ps(mask, a);

             return Sse.or_ps(bitsFromB, bitsFromA);
         }
     }

     throw new CPUFeatureCheckException();
 }
Example #15
0
        /// <summary>
        /// Right-shifts each 64-bit lane of <paramref name="x"/> by <paramref name="n"/> bits.
        /// The result is stitched together from a 32-bit <c>shra_int</c> result (upper half of each lane)
        /// and a <c>shrl_long</c> result (lower half of each lane).
        /// </summary>
        /// <exception cref="CPUFeatureCheckException">Neither SSE4.1 nor SSE2 is available.</exception>
        internal static long2 shra_long(long2 x, int n)
        {
            v128 lowerHalves;
            v128 upperHalves;

            if (n > 32)
            {
                // Upper halves: shra_int by 31 (presumably leaves only copies of the sign bit).
                upperHalves = shra_int(x, 31);
                // Lower halves: the original upper halves shifted by the remaining n - 32 bits,
                // then moved down into the low 32 bits of each lane.
                lowerHalves = shra_int(x, n - 32);
                lowerHalves = shrl_long(lowerHalves, 32);
            }
            else
            {
                upperHalves = shra_int(x, n);
                lowerHalves = shrl_long(x, n);
            }

            // 16-bit lanes 2, 3, 6, 7 (the upper 32 bits of each 64-bit lane) come from upperHalves.
            if (Sse4_1.IsSse41Supported)
            {
                return Sse4_1.blend_epi16(lowerHalves, upperHalves, 0b1100_1100);
            }

            if (Sse2.IsSse2Supported)
            {
                return Mask.BlendEpi16_SSE2(lowerHalves, upperHalves, 0b1100_1100);
            }

            throw new CPUFeatureCheckException();
        }
Example #16
0
        /// <summary>
        /// Returns the vector's first 32 bits, reinterpreted as an <see cref="int"/>, as its hash code.
        /// </summary>
        public override int GetHashCode()
        {
            if (Sse4_1.IsSse41Supported)
            {
                return Sse4_1.extract_epi32(this, 0);
            }

            // Copy to a local so its address can be taken, then read the raw 32 bits.
            short2 copy = this;
            int*   bits = (int *)&copy;

            return *bits;
        }
Example #17
0
        /// <summary>
        /// Multiplies all eight components of <paramref name="x"/> together and returns the product as a
        /// <see cref="uint"/>.
        /// </summary>
        /// <param name="x">Vector whose components are multiplied.</param>
        /// <returns>Element 0 of the final reduction step (AVX2 path) or the result of the uint4 <c>cprod</c> overload.</returns>
        public static uint cprod(ushort8 x)
        {
            if (Avx2.IsAvx2Supported)
            {
                // Widen x and a copy of x with its four 32-bit elements (ushort pairs) in reverse order,
                // multiply them, and keep the low 128 bits: four partial products of two components each.
                v128 prod = Avx.mm256_castsi256_si128((uint8)x * (uint8)(ushort8)Sse2.shuffle_epi32(x, Sse.SHUFFLE(0, 1, 2, 3)));
                // Multiply by the element-reversed partial products: elements 0 and 1 now each hold a product of four components.
                prod = Sse4_1.mullo_epi32(prod, Sse2.shuffle_epi32(prod, Sse.SHUFFLE(0, 1, 2, 3)));

                // Bring 32-bit element 1 down to element 0 (16-bit lanes 2-3 -> 0-1) and multiply once more;
                // element 0 then holds the low 32 bits of the product of all eight components.
                return(Sse4_1.mullo_epi32(prod, Sse2.shufflelo_epi16(prod, Sse.SHUFFLE(0, 0, 3, 2))).UInt0);
            }
            else
            {
                // Multiply the two halves component-wise as uint4, then reduce with the uint4 overload.
                return(cprod((uint4)x.v4_0 * (uint4)x.v4_4));
            }
        }
Example #18
0
 /// <summary>
 /// Per-element minimum of two vectors of signed 32-bit integers.
 /// </summary>
 /// <exception cref="CPUFeatureCheckException">Neither SSE4.1 nor SSE2 is available.</exception>
 internal static v128 min_int(v128 a, v128 b)
 {
     if (Sse4_1.IsSse41Supported)
     {
         return Sse4_1.min_epi32(a, b);
     }

     if (Sse2.IsSse2Supported)
     {
         // Take b wherever a > b, otherwise keep a.
         v128 aIsGreater = Sse2.cmpgt_epi32(a, b);

         return Mask.BlendV(a, b, aIsGreater);
     }

     throw new CPUFeatureCheckException();
 }
Example #19
0
 /// <summary>
 /// Component-wise maximum of two <see cref="sbyte2"/> vectors.
 /// </summary>
 public static sbyte2 max(sbyte2 a, sbyte2 b)
 {
     if (Sse4_1.IsSse41Supported)
     {
         return Sse4_1.max_epi8(a, b);
     }

     if (Sse2.IsSse2Supported)
     {
         // Take b wherever b > a, otherwise keep a.
         v128 bIsGreater = Sse2.cmpgt_epi8(b, a);

         return Mask.BlendV(a, b, bIsGreater);
     }

     // Scalar fallback
     sbyte maxX = (sbyte)math.max(a.x, b.x);
     sbyte maxY = (sbyte)math.max(a.y, b.y);

     return new sbyte2(maxX, maxY);
 }
Example #20
0
 /// <summary>
 /// Component-wise "greater than or equal" comparison of two <see cref="ushort2"/> vectors.
 /// </summary>
 public static bool2 operator >=(ushort2 left, ushort2 right)
 {
     if (Sse4_1.IsSse41Supported)
     {
         // left >= right exactly when max(left, right) == left.
         v128 larger       = Sse4_1.max_epu16(left, right);
         v128 leftIsLarger = Sse2.cmpeq_epi16(larger, left);

         return TestIsTrue(leftIsLarger);
     }

     if (Sse2.IsSse2Supported)
     {
         // left >= right is the negation of right > left.
         v128 rightIsGreater = Operator.greater_mask_ushort(right, left);

         return TestIsFalse(rightIsGreater);
     }

     // Scalar fallback
     return new bool2(left.x >= right.x,
                      left.y >= right.y);
 }
Example #21
0
 /// <summary>
 /// Component-wise "greater than or equal" comparison of two <see cref="ushort8"/> vectors.
 /// </summary>
 public static bool8 operator >=(ushort8 left, ushort8 right)
 {
     if (Sse4_1.IsSse41Supported)
     {
         // left >= right exactly when max(left, right) == left.
         v128 larger       = Sse4_1.max_epu16(left, right);
         v128 leftIsLarger = Sse2.cmpeq_epi16(larger, left);

         return TestIsTrue(leftIsLarger);
     }

     if (Sse2.IsSse2Supported)
     {
         // left >= right is the negation of right > left.
         v128 rightIsGreater = Operator.greater_mask_ushort(right, left);

         return TestIsFalse(rightIsGreater);
     }

     // Scalar fallback
     return new bool8(left.x0 >= right.x0,
                      left.x1 >= right.x1,
                      left.x2 >= right.x2,
                      left.x3 >= right.x3,
                      left.x4 >= right.x4,
                      left.x5 >= right.x5,
                      left.x6 >= right.x6,
                      left.x7 >= right.x7);
 }
Example #22
0
 /// <summary>
 /// Component-wise "less than or equal" comparison of two <see cref="ushort8"/> vectors.
 /// </summary>
 public static bool8 operator <=(ushort8 left, ushort8 right)
 {
     if (Sse4_1.IsSse41Supported)
     {
         // left <= right exactly when min(left, right) == left.
         v128 smaller       = Sse4_1.min_epu16(left, right);
         v128 leftIsSmaller = Sse2.cmpeq_epi16(smaller, left);

         return TestIsTrue(leftIsSmaller);
     }

     if (Sse2.IsSse2Supported)
     {
         // left <= right is the negation of left > right.
         v128 leftIsGreater = Operator.greater_mask_ushort(left, right);

         return TestIsFalse(leftIsGreater);
     }

     // Scalar fallback
     return new bool8(left.x0 <= right.x0,
                      left.x1 <= right.x1,
                      left.x2 <= right.x2,
                      left.x3 <= right.x3,
                      left.x4 <= right.x4,
                      left.x5 <= right.x5,
                      left.x6 <= right.x6,
                      left.x7 <= right.x7);
 }
Example #23
0
 /// <summary>
 /// Zero-extends the four lowest 16-bit lanes of <paramref name="x"/> to 32-bit elements.
 /// </summary>
 /// <exception cref="CPUFeatureCheckException">Neither SSE4.1 nor SSE2 is available.</exception>
 internal static v128 UShortToInt(v128 x)
 {
     if (Sse4_1.IsSse41Supported)
     {
         return Sse4_1.cvtepu16_epi32(x);
     }

     if (Sse2.IsSse2Supported)
     {
         // Interleave each low 16-bit lane with a zero lane to form the upper half of each int.
         v128 zeroes = default(v128);

         return Sse2.unpacklo_epi16(x, zeroes);
     }

     throw new CPUFeatureCheckException();
 }
Example #24
0
 /// <summary>
 /// Sign-extends the two lowest 32-bit elements of <paramref name="x"/> to 64-bit lanes.
 /// </summary>
 /// <exception cref="CPUFeatureCheckException">Neither SSE4.1 nor SSE2 is available.</exception>
 internal static v128 IntToLong(v128 x)
 {
     if (Sse4_1.IsSse41Supported)
     {
         return Sse4_1.cvtepi32_epi64(x);
     }

     if (Sse2.IsSse2Supported)
     {
         // 0 > x yields all ones for negative elements and zero otherwise,
         // which is exactly the upper half needed for sign extension.
         v128 signFill = Sse2.cmpgt_epi32(default(v128), x);

         return Sse2.unpacklo_epi32(x, signFill);
     }

     throw new CPUFeatureCheckException();
 }
Example #25
0
 /// <summary>
 /// Sign-extends the eight lowest bytes of <paramref name="x"/> to 16-bit lanes.
 /// </summary>
 /// <exception cref="CPUFeatureCheckException">Neither SSE4.1 nor SSE2 is available.</exception>
 internal static v128 SByteToShort(v128 x)
 {
     if (Sse4_1.IsSse41Supported)
     {
         return Sse4_1.cvtepi8_epi16(x);
     }

     if (Sse2.IsSse2Supported)
     {
         // 0 > x yields all ones for negative bytes and zero otherwise,
         // which is exactly the upper byte needed for sign extension.
         v128 signFill = Sse2.cmpgt_epi8(default(v128), x);

         return Sse2.unpacklo_epi8(x, signFill);
     }

     throw new CPUFeatureCheckException();
 }
Example #26
0
 /// <summary>
 /// Component-wise maximum of two <see cref="ushort8"/> vectors.
 /// </summary>
 public static ushort8 max(ushort8 a, ushort8 b)
 {
     if (Sse4_1.IsSse41Supported)
     {
         return Sse4_1.max_epu16(a, b);
     }

     if (Sse2.IsSse2Supported)
     {
         // Take b wherever the mask for (b, a) is set - presumably where b > a - otherwise keep a.
         v128 bIsGreater = Operator.greater_mask_ushort(b, a);

         return Mask.BlendV(a, b, bIsGreater);
     }

     // Scalar fallback
     return new ushort8((ushort)math.max((uint)a.x0, (uint)b.x0),
                        (ushort)math.max((uint)a.x1, (uint)b.x1),
                        (ushort)math.max((uint)a.x2, (uint)b.x2),
                        (ushort)math.max((uint)a.x3, (uint)b.x3),
                        (ushort)math.max((uint)a.x4, (uint)b.x4),
                        (ushort)math.max((uint)a.x5, (uint)b.x5),
                        (ushort)math.max((uint)a.x6, (uint)b.x6),
                        (ushort)math.max((uint)a.x7, (uint)b.x7));
 }
Example #27
0
 /// <summary>
 /// Component-wise maximum of two <see cref="sbyte16"/> vectors.
 /// </summary>
 public static sbyte16 max(sbyte16 a, sbyte16 b)
 {
     if (Sse4_1.IsSse41Supported)
     {
         return Sse4_1.max_epi8(a, b);
     }

     if (Sse2.IsSse2Supported)
     {
         // Take b wherever b > a, otherwise keep a.
         v128 bIsGreater = Sse2.cmpgt_epi8(b, a);

         return Mask.BlendV(a, b, bIsGreater);
     }

     // Scalar fallback
     return new sbyte16((sbyte)math.max(a.x0, b.x0),
                        (sbyte)math.max(a.x1, b.x1),
                        (sbyte)math.max(a.x2, b.x2),
                        (sbyte)math.max(a.x3, b.x3),
                        (sbyte)math.max(a.x4, b.x4),
                        (sbyte)math.max(a.x5, b.x5),
                        (sbyte)math.max(a.x6, b.x6),
                        (sbyte)math.max(a.x7, b.x7),
                        (sbyte)math.max(a.x8, b.x8),
                        (sbyte)math.max(a.x9, b.x9),
                        (sbyte)math.max(a.x10, b.x10),
                        (sbyte)math.max(a.x11, b.x11),
                        (sbyte)math.max(a.x12, b.x12),
                        (sbyte)math.max(a.x13, b.x13),
                        (sbyte)math.max(a.x14, b.x14),
                        (sbyte)math.max(a.x15, b.x15));
 }
Example #28
0
 /// <summary>
 /// Component-wise maximum of two <see cref="ushort4"/> vectors.
 /// </summary>
 public static ushort4 max(ushort4 a, ushort4 b)
 {
     if (Sse4_1.IsSse41Supported)
     {
         return Sse4_1.max_epu16(a, b);
     }

     if (Sse2.IsSse2Supported)
     {
         // Take b wherever the mask for (b, a) is set - presumably where b > a - otherwise keep a.
         v128 bIsGreater = Operator.greater_mask_ushort(b, a);

         return Mask.BlendV(a, b, bIsGreater);
     }

     // Scalar fallback
     return new ushort4((ushort)math.max((uint)a.x, (uint)b.x),
                        (ushort)math.max((uint)a.y, (uint)b.y),
                        (ushort)math.max((uint)a.z, (uint)b.z),
                        (ushort)math.max((uint)a.w, (uint)b.w));
 }
Example #29
0
 /// <summary>
 /// Per-element maximum of two vectors of unsigned 32-bit integers.
 /// </summary>
 /// <exception cref="CPUFeatureCheckException">Neither SSE4.1 nor SSE2 is available.</exception>
 internal static v128 max_uint(v128 a, v128 b)
 {
     if (Sse4_1.IsSse41Supported)
     {
         return Sse4_1.max_epu32(a, b);
     }

     if (Sse2.IsSse2Supported)
     {
         // Take b wherever the mask for (b, a) is set - presumably where b > a - otherwise keep a.
         v128 bIsGreater = greater_mask_uint(b, a);

         return Mask.BlendV(a, b, bIsGreater);
     }

     throw new CPUFeatureCheckException();
 }
Example #30
0
 /// <summary>
 /// Component-wise maximum of two <see cref="sbyte8"/> vectors.
 /// </summary>
 public static sbyte8 max(sbyte8 a, sbyte8 b)
 {
     if (Sse4_1.IsSse41Supported)
     {
         return Sse4_1.max_epi8(a, b);
     }

     if (Sse2.IsSse2Supported)
     {
         // Take b wherever b > a, otherwise keep a.
         v128 bIsGreater = Sse2.cmpgt_epi8(b, a);

         return Mask.BlendV(a, b, bIsGreater);
     }

     // Scalar fallback
     return new sbyte8((sbyte)math.max(a.x0, b.x0),
                       (sbyte)math.max(a.x1, b.x1),
                       (sbyte)math.max(a.x2, b.x2),
                       (sbyte)math.max(a.x3, b.x3),
                       (sbyte)math.max(a.x4, b.x4),
                       (sbyte)math.max(a.x5, b.x5),
                       (sbyte)math.max(a.x6, b.x6),
                       (sbyte)math.max(a.x7, b.x7));
 }