/// <summary>
/// Transposes a 4x3 signed-byte matrix into a 3x4 one: result column <c>i</c> holds
/// component <c>i</c> of <c>v.c0</c>, <c>v.c1</c> and <c>v.c2</c> (in that order).
/// </summary>
/// <param name="v">The matrix to transpose.</param>
/// <returns>The transposed matrix.</returns>
public static sbyte3x4 transpose(sbyte4x3 v)
{
    if (Ssse3.IsSsse3Supported)
    {
        // Interleave c0/c1 bytewise, then interleave the resulting 16-bit pairs with c2:
        //   byte:   0    1    2    3  |  4    5    6    7  |  8    9   |  12   13
        //         c0.x c1.x c2.x c2.y | c0.y c1.y c2.z c2.w | c0.z c1.z | c0.w c1.w
        v128 interleaved = Sse2.unpacklo_epi16(Sse2.unpacklo_epi8(v.c0, v.c1), v.c2);

        // Only the first three control bytes of each shuffle matter for a 3-component column.
        v128 pickColumn1 = new v128(4, 5, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        v128 pickColumn2 = new v128(8, 9, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        v128 pickColumn3 = new v128(12, 13, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

        // Column 0 already sits in bytes 0..2 of the interleaved register.
        return new sbyte3x4(interleaved,
                            Ssse3.shuffle_epi8(interleaved, pickColumn1),
                            Ssse3.shuffle_epi8(interleaved, pickColumn2),
                            Ssse3.shuffle_epi8(interleaved, pickColumn3));
    }

    if (Sse2.IsSse2Supported)
    {
        // c0.x c1.x c0.y c1.y c0.z c1.z c0.w c1.w
        v128 pairs = Sse2.unpacklo_epi8(v.c0, v.c1);

        // Without a byte shuffle, shift the wanted (c0, c1) pair and the wanted c2 byte
        // down to the bottom of their registers and interleave them as 16-bit units.
        v128 column0 = Sse2.unpacklo_epi16(pairs, v.c2);
        v128 column1 = Sse2.unpacklo_epi16(Sse2.bsrli_si128(pairs, 2 * sizeof(sbyte)),
                                           Sse2.bsrli_si128(v.c2, 1 * sizeof(sbyte)));
        v128 column2 = Sse2.unpacklo_epi16(Sse2.bsrli_si128(pairs, 4 * sizeof(sbyte)),
                                           Sse2.bsrli_si128(v.c2, 2 * sizeof(sbyte)));
        v128 column3 = Sse2.unpacklo_epi16(Sse2.bsrli_si128(pairs, 6 * sizeof(sbyte)),
                                           Sse2.bsrli_si128(v.c2, 3 * sizeof(sbyte)));

        return new sbyte3x4(column0, column1, column2, column3);
    }

    // Scalar fallback: pass c0's components first, then c1's, then c2's.
    return new sbyte3x4(v.c0.x, v.c0.y, v.c0.z, v.c0.w,
                        v.c1.x, v.c1.y, v.c1.z, v.c1.w,
                        v.c2.x, v.c2.y, v.c2.z, v.c2.w);
}
/// <summary>
/// Transposes a 2x3 signed-byte matrix into a 3x2 one: result column 0 holds the
/// <c>x</c> components of <c>v.c0</c>, <c>v.c1</c>, <c>v.c2</c>, and column 1 their <c>y</c> components.
/// </summary>
/// <param name="v">The matrix to transpose.</param>
/// <returns>The transposed matrix.</returns>
public static sbyte3x2 transpose(sbyte2x3 v)
{
    if (Sse2.IsSse2Supported)
    {
        // c0.x c1.x c0.y c1.y ...
        v128 pairs = Sse2.unpacklo_epi8(v.c0, v.c1);

        if (Ssse3.IsSsse3Supported)
        {
            // c0.x c1.x c2.x c2.y | c0.y c1.y ...
            v128 interleaved = Sse2.unpacklo_epi16(pairs, v.c2);
            v128 pickColumn1 = new v128(4, 5, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

            return new sbyte3x2(interleaved, Ssse3.shuffle_epi8(interleaved, pickColumn1));
        }

        // SSE2 only: shift the second pair and the second c2 byte down, then interleave.
        v128 column0 = Sse2.unpacklo_epi16(pairs, v.c2);
        v128 column1 = Sse2.unpacklo_epi16(Sse2.bsrli_si128(pairs, 2 * sizeof(sbyte)),
                                           Sse2.bsrli_si128(v.c2, sizeof(sbyte)));

        return new sbyte3x2(column0, column1);
    }

    // Scalar fallback: pass c0's components first, then c1's, then c2's.
    return new sbyte3x2(v.c0.x, v.c0.y,
                        v.c1.x, v.c1.y,
                        v.c2.x, v.c2.y);
}
/// <summary>
/// Counts the trailing zero bits of each component of a <c>byte4</c>.
/// In the SIMD paths a component equal to zero yields 8; the scalar path defers to the
/// scalar <c>tzcnt</c> overload (presumably the same convention - confirm).
/// </summary>
/// <param name="x">The value whose components are examined.</param>
/// <returns>The per-component trailing zero count.</returns>
public static byte4 tzcnt(byte4 x)
{
    if (Ssse3.IsSsse3Supported)
    {
        v128 NIBBLE_MASK = new v128(0x0F0F_0F0F);
        // Lookup by low nibble: its trailing zero count, or 8 when the nibble is zero.
        v128 LOW_NIBBLE_TABLE = new v128(8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0);
        // Lookup by high nibble: trailing zero count plus 4, or 8 when the nibble is zero.
        v128 HIGH_NIBBLE_TABLE = new v128(8, 4, 5, 4, 6, 4, 5, 4, 7, 4, 5, 4, 6, 4, 5, 4);

        v128 lowNibbles = Sse2.and_si128(NIBBLE_MASK, x);
        v128 highNibbles = Sse2.and_si128(NIBBLE_MASK, Sse2.srli_epi16(x, 4));

        v128 countFromLow = Ssse3.shuffle_epi8(LOW_NIBBLE_TABLE, lowNibbles);
        v128 countFromHigh = Ssse3.shuffle_epi8(HIGH_NIBBLE_TABLE, highNibbles);

        // A non-zero low nibble always gives the smaller value, so min selects correctly.
        return Sse2.min_epu8(countFromLow, countFromHigh);
    }

    if (Sse2.IsSse2Supported)
    {
        // x & -x isolates the lowest set bit of every byte.
        v128 lowestBit = x & (byte4)(-(sbyte4)x);
        v128 zero = default(v128);

        // Each term contributes its weight when the isolated bit lies outside the tested mask.
        byte4 whenZero = Mask.BlendV(zero, new byte4(1), Sse2.cmpeq_epi8(lowestBit, zero));
        byte4 add4 = Mask.BlendV(zero, new byte4(4), Sse2.cmpeq_epi8(lowestBit & (byte4)0x0F, zero));
        byte4 add2 = Mask.BlendV(zero, new byte4(2), Sse2.cmpeq_epi8(lowestBit & (byte4)0x33, zero));
        byte4 add1 = Mask.BlendV(zero, new byte4(1), Sse2.cmpeq_epi8(lowestBit & (byte4)0x55, zero));

        return (whenZero + add4) + (add2 + add1);
    }

    return new byte4(tzcnt(x.x), tzcnt(x.y), tzcnt(x.z), tzcnt(x.w));
}
/// <summary>
/// Counts the trailing zero bits of each 16-bit component of a <c>ushort8</c>.
/// In the SIMD paths a component equal to zero yields 16; the scalar path defers to the
/// scalar <c>tzcnt</c> overload (presumably the same convention - confirm).
/// </summary>
/// <param name="x">The value whose components are examined.</param>
/// <returns>The per-component trailing zero count.</returns>
public static ushort8 tzcnt(ushort8 x)
{
    if (Ssse3.IsSsse3Supported)
    {
        v128 NIBBLE_MASK = new v128(0x0F0F_0F0F);
        // Per-nibble trailing zero counts; a zero nibble maps to 16 so it never wins the min.
        v128 LOW_NIBBLE_TABLE = new v128(16, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0);
        v128 HIGH_NIBBLE_TABLE = new v128(16, 4, 5, 4, 6, 4, 5, 4, 7, 4, 5, 4, 6, 4, 5, 4);

        v128 lowNibbles = Sse2.and_si128(NIBBLE_MASK, x);
        v128 highNibbles = Sse2.and_si128(NIBBLE_MASK, Sse2.srli_epi16(x, 4));

        // Trailing zero count of every individual byte (16 for a zero byte).
        v128 perByte = Sse2.min_epu8(Ssse3.shuffle_epi8(LOW_NIBBLE_TABLE, lowNibbles),
                                     Ssse3.shuffle_epi8(HIGH_NIBBLE_TABLE, highNibbles));

        // Move (high-byte count + 8) into the low byte of each 16-bit lane; the high byte
        // becomes 0, so the final min also clears the upper half of every lane.
        v128 highBytePlus8 = Sse2.srli_epi16(Sse2.add_epi8(perByte, Sse2.set1_epi8(8)), 8);

        return Sse2.min_epu8(perByte, highBytePlus8);
    }

    if (Sse2.IsSse2Supported)
    {
        // x & -x isolates the lowest set bit of every 16-bit lane.
        v128 lowestBit = x & (ushort8)(-((short8)x));
        v128 zero = default(v128);

        // Each term contributes its weight when the isolated bit lies outside the tested mask.
        ushort8 whenZero = Mask.BlendV(zero, new ushort8(1), Sse2.cmpeq_epi16(lowestBit, zero));
        ushort8 add8 = Mask.BlendV(zero, new ushort8(8), Sse2.cmpeq_epi16(lowestBit & (ushort8)0x00FF, zero));
        ushort8 add4 = Mask.BlendV(zero, new ushort8(4), Sse2.cmpeq_epi16(lowestBit & (ushort8)0x0F0F, zero));
        ushort8 add2 = Mask.BlendV(zero, new ushort8(2), Sse2.cmpeq_epi16(lowestBit & (ushort8)0x3333, zero));
        ushort8 add1 = Mask.BlendV(zero, new ushort8(1), Sse2.cmpeq_epi16(lowestBit & (ushort8)0x5555, zero));

        return (whenZero + add8) + ((add4 + add2) + add1);
    }

    return new ushort8(tzcnt(x.x0), tzcnt(x.x1), tzcnt(x.x2), tzcnt(x.x3),
                       tzcnt(x.x4), tzcnt(x.x5), tzcnt(x.x6), tzcnt(x.x7));
}
/// <summary>
/// Counts the trailing zero bits of each component of a <c>byte16</c>.
/// In the SIMD paths a component equal to zero yields 8; the scalar path defers to the
/// scalar <c>tzcnt</c> overload (presumably the same convention - confirm).
/// </summary>
/// <param name="x">The value whose components are examined.</param>
/// <returns>The per-component trailing zero count.</returns>
public static byte16 tzcnt(byte16 x)
{
    if (Ssse3.IsSsse3Supported)
    {
        v128 NIBBLE_MASK = new v128(0x0F0F_0F0F);
        // Lookup by low nibble: its trailing zero count, or 8 when the nibble is zero.
        v128 LOW_NIBBLE_TABLE = new v128(8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0);
        // Lookup by high nibble: trailing zero count plus 4, or 8 when the nibble is zero.
        v128 HIGH_NIBBLE_TABLE = new v128(8, 4, 5, 4, 6, 4, 5, 4, 7, 4, 5, 4, 6, 4, 5, 4);

        v128 lowNibbles = Sse2.and_si128(NIBBLE_MASK, x);
        v128 highNibbles = Sse2.and_si128(NIBBLE_MASK, Sse2.srli_epi16(x, 4));

        v128 countFromLow = Ssse3.shuffle_epi8(LOW_NIBBLE_TABLE, lowNibbles);
        v128 countFromHigh = Ssse3.shuffle_epi8(HIGH_NIBBLE_TABLE, highNibbles);

        // A non-zero low nibble always gives the smaller value, so min selects correctly.
        return Sse2.min_epu8(countFromLow, countFromHigh);
    }

    if (Sse2.IsSse2Supported)
    {
        // x & -x isolates the lowest set bit of every byte.
        v128 lowestBit = x & (byte16)(-(sbyte16)x);
        v128 zero = default(v128);

        // Each term contributes its weight when the isolated bit lies outside the tested mask.
        byte16 whenZero = Mask.BlendV(zero, new byte16(1), Sse2.cmpeq_epi8(lowestBit, zero));
        byte16 add4 = Mask.BlendV(zero, new byte16(4), Sse2.cmpeq_epi8(lowestBit & (byte16)0x0F, zero));
        byte16 add2 = Mask.BlendV(zero, new byte16(2), Sse2.cmpeq_epi8(lowestBit & (byte16)0x33, zero));
        byte16 add1 = Mask.BlendV(zero, new byte16(1), Sse2.cmpeq_epi8(lowestBit & (byte16)0x55, zero));

        return (whenZero + add4) + (add2 + add1);
    }

    return new byte16(tzcnt(x.x0), tzcnt(x.x1), tzcnt(x.x2), tzcnt(x.x3),
                      tzcnt(x.x4), tzcnt(x.x5), tzcnt(x.x6), tzcnt(x.x7),
                      tzcnt(x.x8), tzcnt(x.x9), tzcnt(x.x10), tzcnt(x.x11),
                      tzcnt(x.x12), tzcnt(x.x13), tzcnt(x.x14), tzcnt(x.x15));
}
/// <summary>
/// Tests whether all sixteen components of a <c>byte16</c> hold the same value.
/// </summary>
/// <param name="c">The vector to test.</param>
/// <returns><c>true</c> when every component compares equal to <c>c.x0</c>.</returns>
public static bool all_eq(byte16 c)
{
    if (Ssse3.IsSsse3Supported)
    {
        // An all-zero shuffle control replicates byte 0 into every position.
        byte16 firstEverywhere = (byte16)Ssse3.shuffle_epi8(c, default(v128));

        return firstEverywhere.Equals(c);
    }

    // Non-short-circuiting '&' keeps the comparison chain free of branches.
    bool match1To4 = (c.x0 == c.x1 & c.x0 == c.x2) & (c.x0 == c.x3 & c.x0 == c.x4);
    bool match5To8 = (c.x0 == c.x5 & c.x0 == c.x6) & (c.x0 == c.x7 & c.x0 == c.x8);
    bool match9To12 = (c.x0 == c.x9 & c.x0 == c.x10) & (c.x0 == c.x11 & c.x0 == c.x12);
    bool match13To15 = (c.x0 == c.x13 & c.x0 == c.x14) & c.x0 == c.x15;

    return (match1To4 & match5To8) & (match9To12 & match13To15);
}
/// <summary>
/// Tests whether all eight components of a <c>short8</c> hold the same value.
/// </summary>
/// <param name="c">The vector to test.</param>
/// <returns><c>true</c> when every component compares equal to <c>c.x0</c>.</returns>
public static bool all_eq(short8 c)
{
    if (Ssse3.IsSsse3Supported)
    {
        // Control bytes (0, 1) repeated replicate the first 16-bit component into every lane.
        v128 REPEAT_FIRST_WORD = new v128(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
        short8 firstEverywhere = (short8)Ssse3.shuffle_epi8(c, REPEAT_FIRST_WORD);

        return firstEverywhere.Equals(c);
    }

    // Non-short-circuiting '&' keeps the comparison chain free of branches.
    bool match1To4 = (c.x0 == c.x1 & c.x0 == c.x2) & (c.x0 == c.x3 & c.x0 == c.x4);
    bool match5To7 = (c.x0 == c.x5 & c.x0 == c.x6) & c.x0 == c.x7;

    return match1To4 & match5To7;
}
/// <summary>
/// Packs the low 16 bits of each of the four 32-bit lanes of <paramref name="x"/> into the
/// first eight bytes of the result, using a single byte shuffle.
/// </summary>
/// <param name="x">Register treated as four 32-bit lanes.</param>
/// <returns>The shuffled register; only its first eight bytes are selected by the visible control bytes.</returns>
/// <exception cref="CPUFeatureCheckException">Thrown when SSSE3 is not supported.</exception>
internal static v128 Int4ToShort4(v128 x)
{
    if (Ssse3.IsSsse3Supported)
    {
        // Bytes 0-1 of every 4-byte lane.
        v128 lowWordIndices = new byte8(0, 1, 4, 5, 8, 9, 12, 13);

        return Ssse3.shuffle_epi8(x, lowWordIndices);
    }

    throw new CPUFeatureCheckException();
}
/// <summary>
/// Packs the low 16 bits of each of the two 64-bit lanes of <paramref name="x"/> into the
/// first four bytes of the result, using a single byte shuffle.
/// </summary>
/// <param name="x">Register treated as two 64-bit lanes.</param>
/// <returns>The shuffled register; only its first four bytes are selected by the visible control bytes.</returns>
/// <exception cref="CPUFeatureCheckException">Thrown when SSSE3 is not supported.</exception>
internal static v128 Long2ToShort2(v128 x)
{
    if (Ssse3.IsSsse3Supported)
    {
        // Bytes 0-1 of every 8-byte lane.
        v128 lowWordIndices = new byte4(0, 1, 8, 9);

        return Ssse3.shuffle_epi8(x, lowWordIndices);
    }

    throw new CPUFeatureCheckException();
}
/// <summary>
/// Packs the lowest byte of each of the four 32-bit lanes of <paramref name="x"/> into the
/// first four bytes of the result, using a single byte shuffle.
/// </summary>
/// <param name="x">Register treated as four 32-bit lanes.</param>
/// <returns>The shuffled register; only its first four bytes are selected by the visible control bytes.</returns>
/// <exception cref="CPUFeatureCheckException">Thrown when SSSE3 is not supported.</exception>
internal static v128 Int4ToByte4(v128 x)
{
    if (Ssse3.IsSsse3Supported)
    {
        // Byte 0 of every 4-byte lane.
        v128 lowByteIndices = new byte4(0, 4, 8, 12);

        return Ssse3.shuffle_epi8(x, lowByteIndices);
    }

    throw new CPUFeatureCheckException();
}
/// <summary>
/// Packs the low byte of each 16-bit component of <paramref name="x"/> into the first
/// eight bytes of the result, using a single byte shuffle.
/// </summary>
/// <param name="x">The eight 16-bit components to narrow.</param>
/// <returns>The shuffled register; only its first eight bytes are selected by the visible control bytes.</returns>
/// <exception cref="CPUFeatureCheckException">Thrown when SSSE3 is not supported.</exception>
internal static v128 ShortToByte(short8 x)
{
    if (Ssse3.IsSsse3Supported)
    {
        // Every even byte, i.e. the low byte of each 16-bit component.
        v128 lowByteIndices = new byte8(0, 2, 4, 6, 8, 10, 12, 14);

        return Ssse3.shuffle_epi8(x, lowByteIndices);
    }

    throw new CPUFeatureCheckException();
}
/// <summary>
/// Produces eight bytes by multiplying each component of <paramref name="max"/> (widened to
/// 16 bits) with a fresh <c>NextState()</c> value and keeping the high byte of each product.
/// </summary>
/// <remarks>
/// <c>NextState()</c> is defined outside this method and is called eight times, in component
/// order. The resulting range depends on what it returns (presumably a byte, giving
/// values below <paramref name="max"/> - confirm).
/// </remarks>
/// <param name="max">Per-component scale factor.</param>
/// <returns>The high byte of each 16-bit product.</returns>
public byte8 NextByte8(byte8 max)
{
    short8 scaled = (short8)max * new short8(NextState(), NextState(), NextState(), NextState(),
                                             NextState(), NextState(), NextState(), NextState());

    if (Ssse3.IsSsse3Supported)
    {
        // Gather the odd bytes, i.e. the high byte of every 16-bit product.
        return Ssse3.shuffle_epi8(scaled, new byte8(1, 3, 5, 7, 9, 11, 13, 15));
    }

    return (byte8)(scaled >> 8);
}
/// <summary>
/// Produces four bytes by multiplying each component of <paramref name="max"/> (widened to
/// 16 bits) with a fresh <c>NextState()</c> value and keeping the high byte of each product.
/// </summary>
/// <remarks>
/// <c>NextState()</c> is defined outside this method and is called four times, in component
/// order. The resulting range depends on what it returns (presumably a byte, giving
/// values below <paramref name="max"/> - confirm).
/// </remarks>
/// <param name="max">Per-component scale factor.</param>
/// <returns>The high byte of each 16-bit product.</returns>
public byte4 NextByte4(byte4 max)
{
    short4 scaled = (short4)max * new short4(NextState(), NextState(), NextState(), NextState());

    if (Ssse3.IsSsse3Supported)
    {
        // Gather the odd bytes, i.e. the high byte of every 16-bit product.
        return Ssse3.shuffle_epi8(scaled, new byte4(1, 3, 5, 7));
    }

    return (byte4)(scaled >> 8);
}
/// <summary>
/// Produces three bytes by multiplying each component of <paramref name="max"/> (widened to
/// 16 bits) with a fresh <c>NextState()</c> value and keeping the high byte of each product.
/// </summary>
/// <remarks>
/// <c>NextState()</c> is defined outside this method and is called three times, in component
/// order. The resulting range depends on what it returns (presumably a byte, giving
/// values below <paramref name="max"/> - confirm).
/// </remarks>
/// <param name="max">Per-component scale factor.</param>
/// <returns>The high byte of each 16-bit product.</returns>
public byte3 NextByte3(byte3 max)
{
    short3 scaled = (short3)max * new short3(NextState(), NextState(), NextState());

    if (Ssse3.IsSsse3Supported)
    {
        // Gather bytes 1, 3 and 5 (the high bytes); the fourth control byte is padding.
        return Ssse3.shuffle_epi8(scaled, new byte4(1, 3, 5, 0));
    }

    return (byte3)(scaled >> 8);
}
/// <summary>
/// Produces two bytes by multiplying each component of <paramref name="max"/> (widened to
/// 16 bits) with a fresh <c>NextState()</c> value and keeping the high byte of each product.
/// </summary>
/// <remarks>
/// <c>NextState()</c> is defined outside this method and is called twice, in component
/// order. The resulting range depends on what it returns (presumably a byte, giving
/// values below <paramref name="max"/> - confirm).
/// </remarks>
/// <param name="max">Per-component scale factor.</param>
/// <returns>The high byte of each 16-bit product.</returns>
// NOTE(review): the other vector overloads visible here are named NextByteN (NextByte3,
// NextByte4, NextByte8); this two-component one is named NextByte - confirm that is intended.
public byte2 NextByte(byte2 max)
{
    short2 scaled = (short2)max * new short2(NextState(), NextState());

    if (Ssse3.IsSsse3Supported)
    {
        // Gather bytes 1 and 3 (the high bytes); the remaining control bytes are padding.
        return Ssse3.shuffle_epi8(scaled, new byte4(1, 3, 0, 0));
    }

    return (byte2)(scaled >> 8);
}
/// <summary>
/// Produces two bytes offset from <paramref name="min"/>: each component of
/// <c>max - min</c> (widened to 16 bits) is multiplied with a fresh <c>NextState()</c> value,
/// and the high byte of the product is added to the matching component of <paramref name="min"/>.
/// </summary>
/// <remarks>
/// <c>Assert.IsNotSmaller</c> is invoked on every (max, min) component pair before any state
/// is consumed. <c>NextState()</c> is defined outside this method; the resulting range depends
/// on what it returns (presumably a byte - confirm).
/// </remarks>
/// <param name="min">Per-component lower bound.</param>
/// <param name="max">Per-component upper bound.</param>
/// <returns><paramref name="min"/> plus the high byte of each scaled product.</returns>
public byte2 NextByte2(byte2 min, byte2 max)
{
    Assert.IsNotSmaller(max.x, min.x);
    Assert.IsNotSmaller(max.y, min.y);

    short2 scaled = (short2)(max - min) * new short2(NextState(), NextState());

    if (Ssse3.IsSsse3Supported)
    {
        // Gather bytes 1 and 3 (the high bytes); the remaining control bytes are padding.
        return min + Ssse3.shuffle_epi8(scaled, new byte4(1, 3, 0, 0));
    }

    return min + (byte2)(scaled >> 8);
}
/// <summary>
/// Counts the set bits of each component of a <c>byte16</c>.
/// </summary>
/// <param name="x">The value whose components are examined.</param>
/// <returns>The per-component population count.</returns>
public static byte16 countbits(byte16 x)
{
    if (Ssse3.IsSsse3Supported)
    {
        // Population count of every 4-bit value, indexed by that value.
        byte16 NIBBLE_POPCOUNT = new byte16(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
        byte16 LOW_NIBBLE = new byte16(0x0F);

        // Look up both nibbles of every byte and add the two partial counts.
        byte16 lowCount = Ssse3.shuffle_epi8(NIBBLE_POPCOUNT, x & LOW_NIBBLE);
        byte16 highCount = Ssse3.shuffle_epi8(NIBBLE_POPCOUNT, Sse2.srli_epi16(x, 4) & LOW_NIBBLE);

        return lowCount + highCount;
    }

    // Scalar fallback: widen every component and use the scalar population count.
    return new byte16((byte)math.countbits((uint)x.x0),
                      (byte)math.countbits((uint)x.x1),
                      (byte)math.countbits((uint)x.x2),
                      (byte)math.countbits((uint)x.x3),
                      (byte)math.countbits((uint)x.x4),
                      (byte)math.countbits((uint)x.x5),
                      (byte)math.countbits((uint)x.x6),
                      (byte)math.countbits((uint)x.x7),
                      (byte)math.countbits((uint)x.x8),
                      (byte)math.countbits((uint)x.x9),
                      (byte)math.countbits((uint)x.x10),
                      (byte)math.countbits((uint)x.x11),
                      (byte)math.countbits((uint)x.x12),
                      (byte)math.countbits((uint)x.x13),
                      (byte)math.countbits((uint)x.x14),
                      (byte)math.countbits((uint)x.x15));
}
/// <summary>
/// Produces three signed bytes offset from <paramref name="min"/>: each component of
/// <c>max - min</c> (converted to 16-bit unsigned) is multiplied with a fresh <c>NextState()</c>
/// value, and the high byte of the product is added to the matching component of
/// <paramref name="min"/>.
/// </summary>
/// <remarks>
/// <c>Assert.IsNotSmaller</c> is invoked on every (max, min) component pair before any state
/// is consumed. <c>NextState()</c> is defined outside this method; the resulting range depends
/// on what it returns (presumably a byte - confirm).
/// </remarks>
/// <param name="min">Per-component lower bound.</param>
/// <param name="max">Per-component upper bound.</param>
/// <returns><paramref name="min"/> plus the high byte of each scaled product.</returns>
public sbyte3 NextSByte3(sbyte3 min, sbyte3 max)
{
    Assert.IsNotSmaller(max.x, min.x);
    Assert.IsNotSmaller(max.y, min.y);
    Assert.IsNotSmaller(max.z, min.z);

    ushort3 scaled = (ushort3)(max - min) * new ushort3(NextState(), NextState(), NextState());

    if (Ssse3.IsSsse3Supported)
    {
        // Gather bytes 1, 3 and 5 (the high bytes); the fourth control byte is padding.
        return min + Ssse3.shuffle_epi8(scaled, new byte4(1, 3, 5, 0));
    }

    return min + (sbyte3)(scaled >> 8);
}
/// <summary>
/// Counts the leading zero bits of each 16-bit component of a <c>ushort8</c>.
/// In the SIMD paths a component equal to zero yields 16; the scalar path defers to the
/// scalar <c>lzcnt</c> overload (presumably the same convention - confirm).
/// </summary>
/// <param name="x">The value whose components are examined.</param>
/// <returns>The per-component leading zero count.</returns>
public static ushort8 lzcnt(ushort8 x)
{
    if (Ssse3.IsSsse3Supported)
    {
        v128 NIBBLE_MASK = new v128(0x0F0F_0F0F);
        // Leading zeros of a byte whose high nibble is zero, indexed by low nibble (16 for zero).
        v128 LOW_NIBBLE_TABLE = new v128(16, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4);
        // Leading zeros of a byte indexed by its high nibble (16 when that nibble is zero).
        v128 HIGH_NIBBLE_TABLE = new v128(16, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);

        v128 lowNibbles = Sse2.and_si128(NIBBLE_MASK, x);
        v128 highNibbles = Sse2.and_si128(NIBBLE_MASK, Sse2.srli_epi16(x, 4));

        // Leading zero count of every individual byte (16 for a zero byte).
        v128 perByte = Sse2.min_epu8(Ssse3.shuffle_epi8(LOW_NIBBLE_TABLE, lowNibbles),
                                     Ssse3.shuffle_epi8(HIGH_NIBBLE_TABLE, highNibbles));

        // set1_epi16(8) adds 8 to the low byte of each lane only: (low-byte count + 8).
        v128 lowBytePlus8 = Sse2.add_epi8(perByte, Sse2.set1_epi16(8));
        // High-byte count moved into the low byte; the high byte becomes 0, so the final
        // min also clears the upper half of every lane.
        v128 highByteCount = Sse2.srli_epi16(perByte, 8);

        return Sse2.min_epu8(lowBytePlus8, highByteCount);
    }

    if (Sse2.IsSse2Supported)
    {
        // Binary search: whenever the upper half of the remaining bits is non-zero,
        // keep that half and lower the running count by the number of bits skipped.
        v128 zero = default(v128);
        ushort8 count = 16;

        ushort8 upper = x >> 8;
        ushort8 upperIsZero = Sse2.cmpeq_epi16(upper, zero);
        count = Mask.BlendV(count - 8, count, upperIsZero);
        x = Mask.BlendV(upper, x, upperIsZero);

        upper = x >> 4;
        upperIsZero = Sse2.cmpeq_epi16(upper, zero);
        count = Mask.BlendV(count - 4, count, upperIsZero);
        x = Mask.BlendV(upper, x, upperIsZero);

        upper = x >> 2;
        upperIsZero = Sse2.cmpeq_epi16(upper, zero);
        count = Mask.BlendV(count - 2, count, upperIsZero);
        x = Mask.BlendV(upper, x, upperIsZero);

        // At most two bits remain: bit 1 set costs 2, otherwise subtract the last bit itself.
        upper = x >> 1;
        upperIsZero = Sse2.cmpeq_epi16(upper, zero);

        return Mask.BlendV(count - 2, count - x, upperIsZero);
    }

    return new ushort8(lzcnt(x.x0), lzcnt(x.x1), lzcnt(x.x2), lzcnt(x.x3),
                       lzcnt(x.x4), lzcnt(x.x5), lzcnt(x.x6), lzcnt(x.x7));
}
/// <summary>
/// Produces four bytes offset from <paramref name="min"/>: each component of
/// <c>max - min</c> (widened to 16 bits) is multiplied with a fresh <c>NextState()</c> value,
/// and the high byte of the product is added to the matching component of <paramref name="min"/>.
/// </summary>
/// <remarks>
/// <c>Assert.IsNotSmaller</c> is invoked on every (max, min) component pair before any state
/// is consumed. <c>NextState()</c> is defined outside this method; the resulting range depends
/// on what it returns (presumably a byte - confirm).
/// </remarks>
/// <param name="min">Per-component lower bound.</param>
/// <param name="max">Per-component upper bound.</param>
/// <returns><paramref name="min"/> plus the high byte of each scaled product.</returns>
public byte4 NextByte4(byte4 min, byte4 max)
{
    Assert.IsNotSmaller(max.x, min.x);
    Assert.IsNotSmaller(max.y, min.y);
    Assert.IsNotSmaller(max.z, min.z);
    Assert.IsNotSmaller(max.w, min.w);

    short4 scaled = (short4)(max - min) * new short4(NextState(), NextState(), NextState(), NextState());

    if (Ssse3.IsSsse3Supported)
    {
        // Gather the odd bytes, i.e. the high byte of every 16-bit product.
        return min + Ssse3.shuffle_epi8(scaled, new byte4(1, 3, 5, 7));
    }

    return min + (byte4)(scaled >> 8);
}
/// <summary>
/// Produces eight bytes offset from <paramref name="min"/>: each component of
/// <c>max - min</c> (widened to 16 bits) is multiplied with a fresh <c>NextState()</c> value,
/// and the high byte of the product is added to the matching component of <paramref name="min"/>.
/// </summary>
/// <remarks>
/// <c>Assert.IsNotSmaller</c> is invoked on every (max, min) component pair before any state
/// is consumed. <c>NextState()</c> is defined outside this method; the resulting range depends
/// on what it returns (presumably a byte - confirm).
/// </remarks>
/// <param name="min">Per-component lower bound.</param>
/// <param name="max">Per-component upper bound.</param>
/// <returns><paramref name="min"/> plus the high byte of each scaled product.</returns>
public byte8 NextByte8(byte8 min, byte8 max)
{
    Assert.IsNotSmaller(max.x0, min.x0);
    Assert.IsNotSmaller(max.x1, min.x1);
    Assert.IsNotSmaller(max.x2, min.x2);
    Assert.IsNotSmaller(max.x3, min.x3);
    Assert.IsNotSmaller(max.x4, min.x4);
    Assert.IsNotSmaller(max.x5, min.x5);
    Assert.IsNotSmaller(max.x6, min.x6);
    Assert.IsNotSmaller(max.x7, min.x7);

    short8 scaled = (short8)(max - min) * new short8(NextState(), NextState(), NextState(), NextState(),
                                                     NextState(), NextState(), NextState(), NextState());

    if (Ssse3.IsSsse3Supported)
    {
        // Gather the odd bytes, i.e. the high byte of every 16-bit product.
        return min + Ssse3.shuffle_epi8(scaled, new byte8(1, 3, 5, 7, 9, 11, 13, 15));
    }

    return min + (byte8)(scaled >> 8);
}
/// <summary>
/// Counts the leading zero bits of each component of a <c>byte3</c>.
/// In the SIMD paths a component equal to zero yields 8; the scalar path defers to the
/// scalar <c>lzcnt</c> overload (presumably the same convention - confirm).
/// </summary>
/// <param name="x">The value whose components are examined.</param>
/// <returns>The per-component leading zero count.</returns>
public static byte3 lzcnt(byte3 x)
{
    if (Ssse3.IsSsse3Supported)
    {
        v128 NIBBLE_MASK = new v128(0x0F0F_0F0F);
        // Leading zeros of a byte whose high nibble is zero, indexed by low nibble (8 for zero).
        v128 LOW_NIBBLE_TABLE = new v128(8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4);
        // Leading zeros of a byte indexed by its high nibble (8 when that nibble is zero).
        v128 HIGH_NIBBLE_TABLE = new v128(8, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);

        v128 lowNibbles = Sse2.and_si128(NIBBLE_MASK, x);
        v128 highNibbles = Sse2.and_si128(NIBBLE_MASK, Sse2.srli_epi16(x, 4));

        v128 countFromLow = Ssse3.shuffle_epi8(LOW_NIBBLE_TABLE, lowNibbles);
        v128 countFromHigh = Ssse3.shuffle_epi8(HIGH_NIBBLE_TABLE, highNibbles);

        // A non-zero high nibble always gives the smaller value, so min selects correctly.
        return Sse2.min_epu8(countFromLow, countFromHigh);
    }

    if (Sse2.IsSse2Supported)
    {
        // Binary search: whenever the upper half of the remaining bits is non-zero,
        // keep that half and lower the running count by the number of bits skipped.
        v128 zero = default(v128);
        byte3 count = 8;

        byte3 upper = x >> 4;
        byte3 upperIsZero = Sse2.cmpeq_epi8(upper, zero);
        count = Mask.BlendV(count - 4, count, upperIsZero);
        x = Mask.BlendV(upper, x, upperIsZero);

        upper = x >> 2;
        upperIsZero = Sse2.cmpeq_epi8(upper, zero);
        count = Mask.BlendV(count - 2, count, upperIsZero);
        x = Mask.BlendV(upper, x, upperIsZero);

        // At most two bits remain: bit 1 set costs 2, otherwise subtract the last bit itself.
        upper = x >> 1;
        upperIsZero = Sse2.cmpeq_epi8(upper, zero);

        return Mask.BlendV(count - 2, count - x, upperIsZero);
    }

    return new byte3(lzcnt(x.x), lzcnt(x.y), lzcnt(x.z));
}