Esempio n. 1
0
        /// <summary>
        /// Self-test: verifies that the _MM_SHUFFLE helper builds the same PSHUFD
        /// immediate as a hand-packed control byte. Returns 100 on pass, 0 on failure
        /// (trivially passes when SSE2 is unavailable).
        /// </summary>
        static int Main(string[] args)
        {
            bool pass = true;

            if (Sse2.IsSupported)
            {
                Vector128 <byte> source = Vector128.Create((byte)1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
                Vector128 <uint> lanes  = source.AsUInt32();

                // Shuffle once via the helper and once via the manually encoded
                // immediate 0b00_01_10_11; both orderings must agree exactly.
                Vector128 <uint> viaHelper    = Sse2.Shuffle(lanes, _MM_SHUFFLE(0, 1, 2, 3));
                Vector128 <uint> viaImmediate = Sse2.Shuffle(lanes, (byte)(0 << 6 | 1 << 4 | 2 << 2 | 3));

                pass = viaHelper.Equals(viaImmediate);
            }

            return(pass ? 100 : 0);
        }
Esempio n. 2
0
        /// <summary>
        /// Derives AES round key <paramref name="i"/> from round key i-1 using AESKEYGENASSIST.
        /// </summary>
        /// <param name="keys">Round-key schedule; slot <paramref name="i"/> is written, slot i-1 is read.</param>
        /// <param name="i">Index of the round key to produce.</param>
        /// <param name="rcon">Round constant for this expansion step.</param>
        private static void MakeRoundKey(Vector128 <byte>[] keys, int i, byte rcon)
        {
            Vector128 <byte> s = keys[i - 1];
            Vector128 <byte> t = keys[i - 1];

            // AESKEYGENASSIST applies SubWord/RotWord and folds in rcon.
            t = Aes.KeygenAssist(t, rcon);
            // Broadcast dword 3 (the assist result for the last word) to all four lanes.
            t = Sse2.Shuffle(t.AsUInt32(), 0xFF).AsByte();

            // XOR the previous key with byte-shifted copies of itself (by 4 then 8 bytes)
            // so each dword of s becomes the running XOR of all preceding dwords.
            s = Sse2.Xor(s, Sse2.ShiftLeftLogical128BitLane(s, 4));
            s = Sse2.Xor(s, Sse2.ShiftLeftLogical128BitLane(s, 8));

            keys[i] = Sse2.Xor(s, t);
        }
Esempio n. 3
0
        /// <summary>
        /// One step of a key-expansion style mix: prefix-XORs <paramref name="a"/> across
        /// its dwords, folds in a broadcast lane of <paramref name="b"/>, then propagates
        /// the new top dword of a into <paramref name="c"/>. Statement order is significant;
        /// all three vectors are updated in place.
        /// </summary>
        private static void KeyRound(ref Vector128 <byte> a, ref Vector128 <byte> b, ref Vector128 <byte> c)
        {
            var t = Sse2.ShiftLeftLogical128BitLane(a, 4);

            // Broadcast dword 1 of b to all four lanes.
            b = Sse2.Shuffle(b.AsUInt32(), 0b01_01_01_01).AsByte();
            // XOR a with itself shifted left by 4, 8 and 12 bytes: each dword of a
            // becomes the XOR of itself and all lower dwords (prefix XOR).
            a = Sse2.Xor(a, t);
            t = Sse2.ShiftLeftLogical128BitLane(t, 4);
            a = Sse2.Xor(a, t);
            t = Sse2.ShiftLeftLogical128BitLane(t, 4);
            a = Sse2.Xor(a, t);
            a = Sse2.Xor(a, b);
            // Broadcast the new top dword of a; it seeds the update of c below.
            b = Sse2.Shuffle(a.AsUInt32(), 0b11_11_11_11).AsByte();
            t = Sse2.ShiftLeftLogical128BitLane(c, 4);
            c = Sse2.Xor(c, t);
            c = Sse2.Xor(c, b);
        }
Esempio n. 4
0
 /// <summary>
 /// Adds two 128-bit vectors element-wise, dispatching on the element type
 /// <typeparamref name="T"/> to the matching Sse/Sse2 Add intrinsic.
 /// </summary>
 /// <param name="left">Left operand.</param>
 /// <param name="right">Right operand.</param>
 /// <returns>The element-wise sum, reinterpreted back as <typeparamref name="T"/>.</returns>
 /// <exception cref="NotSupportedException">Thrown when T is not a supported primitive element type.</exception>
 public static Vector128 <T> Vector128Add <T>(Vector128 <T> left, Vector128 <T> right) where T : struct
 {
     // The typeof(T) comparisons are resolved by the JIT per instantiation, so each
     // specialization compiles down to a single vector add. Guard-style returns
     // replace the original if/else chain; behavior is identical.
     if (typeof(T) == typeof(byte))
     {
         return Sse2.Add(left.AsByte(), right.AsByte()).As <byte, T>();
     }

     if (typeof(T) == typeof(sbyte))
     {
         return Sse2.Add(left.AsSByte(), right.AsSByte()).As <sbyte, T>();
     }

     if (typeof(T) == typeof(short))
     {
         return Sse2.Add(left.AsInt16(), right.AsInt16()).As <short, T>();
     }

     if (typeof(T) == typeof(ushort))
     {
         return Sse2.Add(left.AsUInt16(), right.AsUInt16()).As <ushort, T>();
     }

     if (typeof(T) == typeof(int))
     {
         return Sse2.Add(left.AsInt32(), right.AsInt32()).As <int, T>();
     }

     if (typeof(T) == typeof(uint))
     {
         return Sse2.Add(left.AsUInt32(), right.AsUInt32()).As <uint, T>();
     }

     if (typeof(T) == typeof(long))
     {
         return Sse2.Add(left.AsInt64(), right.AsInt64()).As <long, T>();
     }

     if (typeof(T) == typeof(ulong))
     {
         return Sse2.Add(left.AsUInt64(), right.AsUInt64()).As <ulong, T>();
     }

     if (typeof(T) == typeof(float))
     {
         // float addition lives in Sse, not Sse2.
         return Sse.Add(left.AsSingle(), right.AsSingle()).As <float, T>();
     }

     if (typeof(T) == typeof(double))
     {
         return Sse2.Add(left.AsDouble(), right.AsDouble()).As <double, T>();
     }

     throw new NotSupportedException();
 }
Esempio n. 5
0
        /// <summary>
        /// Folds four 128-bit CRC accumulators forward by 512 bits using carry-less
        /// multiplication. Each accumulator becomes clmul(acc.lo, K) XOR clmul(acc.hi, K'),
        /// using the two 64-bit fold constants packed in xmmFold4.
        /// </summary>
        static void Fold4(ref Vector128 <uint> xmmCRC0, ref Vector128 <uint> xmmCRC1, ref Vector128 <uint> xmmCRC2,
                          ref Vector128 <uint> xmmCRC3)
        {
            // Fold-by-4 constants (two 64-bit multipliers); presumably the CRC-32
            // zlib folding constants — TODO confirm against the polynomial used.
            Vector128 <uint> xmmFold4 = Vector128.Create(0xc6e41596, 0x00000001, 0x54442bd4, 0x00000001);

            Vector128 <uint> xTmp0 = xmmCRC0;
            Vector128 <uint> xTmp1 = xmmCRC1;
            Vector128 <uint> xTmp2 = xmmCRC2;
            Vector128 <uint> xTmp3 = xmmCRC3;

            // Selector 0x01: low qword of acc x low qword of constant;
            // selector 0x10: high qword of acc x high qword of constant.
            xmmCRC0 = Pclmulqdq.CarrylessMultiply(xmmCRC0.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
            xTmp0   = Pclmulqdq.CarrylessMultiply(xTmp0.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
            // XOR performed in the float domain (xorps); bit-identical to integer XOR,
            // likely chosen to avoid an execution-domain crossing — NOTE(review).
            Vector128 <float> psCRC0 = xmmCRC0.AsSingle();
            Vector128 <float> psT0   = xTmp0.AsSingle();
            Vector128 <float> psRes0 = Sse.Xor(psCRC0, psT0);

            xmmCRC1 = Pclmulqdq.CarrylessMultiply(xmmCRC1.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
            xTmp1   = Pclmulqdq.CarrylessMultiply(xTmp1.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
            Vector128 <float> psCRC1 = xmmCRC1.AsSingle();
            Vector128 <float> psT1   = xTmp1.AsSingle();
            Vector128 <float> psRes1 = Sse.Xor(psCRC1, psT1);

            xmmCRC2 = Pclmulqdq.CarrylessMultiply(xmmCRC2.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
            xTmp2   = Pclmulqdq.CarrylessMultiply(xTmp2.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
            Vector128 <float> psCRC2 = xmmCRC2.AsSingle();
            Vector128 <float> psT2   = xTmp2.AsSingle();
            Vector128 <float> psRes2 = Sse.Xor(psCRC2, psT2);

            xmmCRC3 = Pclmulqdq.CarrylessMultiply(xmmCRC3.AsUInt64(), xmmFold4.AsUInt64(), 0x01).AsUInt32();
            xTmp3   = Pclmulqdq.CarrylessMultiply(xTmp3.AsUInt64(), xmmFold4.AsUInt64(), 0x10).AsUInt32();
            Vector128 <float> psCRC3 = xmmCRC3.AsSingle();
            Vector128 <float> psT3   = xTmp3.AsSingle();
            Vector128 <float> psRes3 = Sse.Xor(psCRC3, psT3);

            // Write the folded values back to the ref parameters.
            xmmCRC0 = psRes0.AsUInt32();
            xmmCRC1 = psRes1.AsUInt32();
            xmmCRC2 = psRes2.AsUInt32();
            xmmCRC3 = psRes3.AsUInt32();
        }
Esempio n. 6
0
        /// <summary>
        /// SSE-accelerated Adler-32-style checksum update: folds <paramref name="buffer"/>
        /// into the running sums <paramref name="s1"/> (byte sum) and <paramref name="s2"/>
        /// (sum of s1 values) and returns the packed result s1 | (s2 &lt;&lt; 16).
        /// BLOCK_SIZE, NMAX32 and MOD32 are project constants; assumes BLOCK_SIZE == 32
        /// (two 16-byte loads per block) — TODO confirm.
        /// </summary>
        internal static unsafe uint GetSse(ReadOnlySpan <byte> buffer, uint s1, uint s2)
        {
            uint len = (uint)buffer.Length;

            uint blocks = len / BLOCK_SIZE;

            // len becomes the tail length left over after the whole blocks.
            len = len - blocks * BLOCK_SIZE;

            // Multipliers for s2: byte k of a 32-byte block contributes (32 - k) times.
            Vector128 <sbyte> tap1 = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
            Vector128 <sbyte> tap2 = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
            Vector128 <byte>  zero = Vector128 <byte> .Zero;
            Vector128 <short> ones = Vector128.Create(1, 1, 1, 1, 1, 1, 1, 1);

            fixed(byte *bufPtr = &MemoryMarshal.GetReference(buffer))
            {
                var buf = bufPtr;

                while (blocks != 0)
                {
                    uint n = NMAX32 / BLOCK_SIZE;
                    if (n > blocks)
                    {
                        n = blocks;
                    }

                    blocks -= n;

                    // Process n blocks of data. At most NMAX data bytes can be
                    // processed before s2 must be reduced modulo BASE.
                    // v_ps seeds the per-block s1 contribution (s1 is added n times into s2).
                    Vector128 <uint> v_ps = Vector128.Create(0, 0, 0, s1 * n);
                    Vector128 <uint> v_s2 = Vector128.Create(0, 0, 0, s2);
                    Vector128 <uint> v_s1 = Vector128.Create(0u, 0, 0, 0);

                    do
                    {
                        // Load 32 input bytes.
                        Vector128 <byte> bytes1 = Sse2.LoadVector128(&buf[0]);
                        Vector128 <byte> bytes2 = Sse2.LoadVector128(&buf[16]);


                        // Add previous block byte sum to v_ps.
                        v_ps = Sse2.Add(v_ps, v_s1);



                        // Horizontally add the bytes for s1, multiply-adds the
                        // bytes by [ 32, 31, 30, ... ] for s2.
                        Vector128 <ushort> sad1 = Sse2.SumAbsoluteDifferences(bytes1, zero);
                        v_s1 = Sse2.Add(v_s1, sad1.AsUInt32());
                        Vector128 <short> mad11 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                        Vector128 <int>   mad12 = Sse2.MultiplyAddAdjacent(mad11, ones);
                        v_s2 = Sse2.Add(v_s2, mad12.AsUInt32());


                        Vector128 <ushort> sad2 = Sse2.SumAbsoluteDifferences(bytes2, zero);
                        v_s1 = Sse2.Add(v_s1, sad2.AsUInt32());
                        Vector128 <short> mad21 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                        Vector128 <int>   mad22 = Sse2.MultiplyAddAdjacent(mad21, ones);
                        v_s2 = Sse2.Add(v_s2, mad22.AsUInt32());

                        buf += BLOCK_SIZE;

                        n--;
                    } while (n != 0);

                    // Each accumulated s1 in v_ps contributes 32x (one per block byte) to s2.
                    var shift = Sse2.ShiftLeftLogical(v_ps, 5);
                    v_s2 = Sse2.Add(v_s2, shift);


                    // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).

                    // A B C D -> B A D C
                    const int S2301 = 2 << 6 | 3 << 4 | 0 << 2 | 1;
                    // A B C D -> C D A B
                    const int S1032 = 1 << 6 | 0 << 4 | 3 << 2 | 2;

                    // Horizontal reduction via two shuffle+add steps, then read lane 0.
                    v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S2301));
                    v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));
                    s1  += Sse2.ConvertToUInt32(v_s1);
                    v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
                    v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));
                    // s2 is assigned (not added): the old s2 was already folded in via v_s2's seed.
                    s2   = Sse2.ConvertToUInt32(v_s2);

                    s1 %= MOD32;
                    s2 %= MOD32;
                }

                // Scalar tail: fewer than BLOCK_SIZE bytes remain.
                if (len > 0)
                {
                    if (len >= 16)
                    {
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        s2  += (s1 += *buf++);
                        len -= 16;
                    }

                    while (len-- > 0)
                    {
                        s2 += (s1 += *buf++);
                    }
                    // s1 grew by at most 15 * 255 + ... here, so a single conditional
                    // subtract suffices instead of a full modulo.
                    if (s1 >= MOD32)
                    {
                        s1 -= MOD32;
                    }

                    s2 %= MOD32;
                }

                return(s1 | (s2 << 16));
            }
        }
Esempio n. 7
0
        /// <summary>
        /// XXH3 accumulator scramble step: for each 64-bit lane, applies an xorshift by 47,
        /// XORs in the corresponding secret bytes, and multiplies by Prime32_1. Dispatches
        /// to AVX2, SSE2 or a scalar fallback; all three paths compute the same result.
        /// </summary>
        /// <param name="acc">Accumulator lanes, updated in place (AccNb entries).</param>
        /// <param name="secret">Secret key material, read in register-width chunks.</param>
        private unsafe static void Xxh3ScrambleAcc(Span <ulong> acc, ReadOnlySpan <byte> secret)
        {
            if (Avx2.IsSupported)
            {
                fixed(ulong *pAcc = acc)
                {
                    fixed(byte *pSecret = secret)
                    {
                        Vector256 <uint>   prime32 = Vector256.Create(Prime32_1);
                        Vector256 <ulong> *xAcc    = (Vector256 <ulong> *)pAcc;
                        Vector256 <byte> * xSecret = (Vector256 <byte> *)pSecret;

                        for (ulong i = 0; i < StripeLen / 32; i++)
                        {
                            // acc ^= acc >> 47 (xorshift), then acc ^= secret.
                            Vector256 <ulong> accVec  = xAcc[i];
                            Vector256 <ulong> shifted = Avx2.ShiftRightLogical(accVec, 47);
                            Vector256 <ulong> dataVec = Avx2.Xor(accVec, shifted);

                            Vector256 <byte> keyVec  = xSecret[i];
                            Vector256 <uint> dataKey = Avx2.Xor(dataVec.AsUInt32(), keyVec.AsUInt32());

                            // 64x32 multiply built from two 32x32->64 multiplies (pmuludq):
                            // the shuffle puts each qword's high dword into the even lanes.
                            Vector256 <uint>  dataKeyHi = Avx2.Shuffle(dataKey.AsUInt32(), 0b00110001);
                            Vector256 <ulong> prodLo    = Avx2.Multiply(dataKey, prime32);
                            Vector256 <ulong> prodHi    = Avx2.Multiply(dataKeyHi, prime32);

                            xAcc[i] = Avx2.Add(prodLo, Avx2.ShiftLeftLogical(prodHi, 32));
                        }
                    }
                }
            }
            else if (Sse2.IsSupported)
            {
                fixed(ulong *pAcc = acc)
                {
                    fixed(byte *pSecret = secret)
                    {
                        Vector128 <uint>   prime32 = Vector128.Create(Prime32_1);
                        Vector128 <ulong> *xAcc    = (Vector128 <ulong> *)pAcc;
                        Vector128 <byte> * xSecret = (Vector128 <byte> *)pSecret;

                        for (ulong i = 0; i < StripeLen / 16; i++)
                        {
                            // Same xorshift/xor/multiply as the AVX2 path, two lanes at a time.
                            Vector128 <ulong> accVec  = xAcc[i];
                            Vector128 <ulong> shifted = Sse2.ShiftRightLogical(accVec, 47);
                            Vector128 <ulong> dataVec = Sse2.Xor(accVec, shifted);

                            Vector128 <byte> keyVec  = xSecret[i];
                            Vector128 <uint> dataKey = Sse2.Xor(dataVec.AsUInt32(), keyVec.AsUInt32());

                            Vector128 <uint>  dataKeyHi = Sse2.Shuffle(dataKey.AsUInt32(), 0b00110001);
                            Vector128 <ulong> prodLo    = Sse2.Multiply(dataKey, prime32);
                            Vector128 <ulong> prodHi    = Sse2.Multiply(dataKeyHi, prime32);

                            xAcc[i] = Sse2.Add(prodLo, Sse2.ShiftLeftLogical(prodHi, 32));
                        }
                    }
                }
            }
            else
            {
                // Scalar fallback: identical math, one 64-bit lane at a time.
                for (int i = 0; i < AccNb; i++)
                {
                    ulong key64 = BinaryPrimitives.ReadUInt64LittleEndian(secret.Slice(i * sizeof(ulong)));
                    ulong acc64 = acc[i];
                    acc64  = XorShift64(acc64, 47);
                    acc64 ^= key64;
                    acc64 *= Prime32_1;
                    acc[i] = acc64;
                }
            }
        }
Esempio n. 8
0
        /// <summary>
        /// XXH3 512-bit accumulate step: for each 64-bit lane, adds the dword-swapped input
        /// plus the 32x32->64 product of the low and high halves of (input XOR secret) into
        /// the accumulator. Dispatches to AVX2, SSE2 or a scalar fallback.
        /// </summary>
        /// <param name="acc">Accumulator lanes, updated in place (AccNb entries).</param>
        /// <param name="input">One stripe of input data.</param>
        /// <param name="secret">Secret key material aligned with the stripe.</param>
        private unsafe static void Xxh3Accumulate512(Span <ulong> acc, ReadOnlySpan <byte> input, ReadOnlySpan <byte> secret)
        {
            if (Avx2.IsSupported)
            {
                fixed(ulong *pAcc = acc)
                {
                    fixed(byte *pInput = input, pSecret = secret)
                    {
                        Vector256 <ulong> *xAcc    = (Vector256 <ulong> *)pAcc;
                        Vector256 <byte> * xInput  = (Vector256 <byte> *)pInput;
                        Vector256 <byte> * xSecret = (Vector256 <byte> *)pSecret;

                        for (ulong i = 0; i < StripeLen / 32; i++)
                        {
                            Vector256 <byte>  dataVec   = xInput[i];
                            Vector256 <byte>  keyVec    = xSecret[i];
                            Vector256 <byte>  dataKey   = Avx2.Xor(dataVec, keyVec);
                            // NOTE(review): despite the name, control 0b00110001 moves each
                            // qword's HIGH dword into the even lanes that Multiply (pmuludq) reads.
                            Vector256 <uint>  dataKeyLo = Avx2.Shuffle(dataKey.AsUInt32(), 0b00110001);
                            // product = (uint)dataKey * (dataKey >> 32) per 64-bit lane.
                            Vector256 <ulong> product   = Avx2.Multiply(dataKey.AsUInt32(), dataKeyLo);
                            // Swap the dwords of the raw input and add it to the accumulator.
                            Vector256 <uint>  dataSwap  = Avx2.Shuffle(dataVec.AsUInt32(), 0b01001110);
                            Vector256 <ulong> sum       = Avx2.Add(xAcc[i], dataSwap.AsUInt64());
                            xAcc[i] = Avx2.Add(product, sum);
                        }
                    }
                }
            }
            else if (Sse2.IsSupported)
            {
                fixed(ulong *pAcc = acc)
                {
                    fixed(byte *pInput = input, pSecret = secret)
                    {
                        Vector128 <ulong> *xAcc    = (Vector128 <ulong> *)pAcc;
                        Vector128 <byte> * xInput  = (Vector128 <byte> *)pInput;
                        Vector128 <byte> * xSecret = (Vector128 <byte> *)pSecret;

                        for (ulong i = 0; i < StripeLen / 16; i++)
                        {
                            // Same recurrence as the AVX2 path, two lanes at a time.
                            Vector128 <byte>  dataVec   = xInput[i];
                            Vector128 <byte>  keyVec    = xSecret[i];
                            Vector128 <byte>  dataKey   = Sse2.Xor(dataVec, keyVec);
                            Vector128 <uint>  dataKeyLo = Sse2.Shuffle(dataKey.AsUInt32(), 0b00110001);
                            Vector128 <ulong> product   = Sse2.Multiply(dataKey.AsUInt32(), dataKeyLo);
                            Vector128 <uint>  dataSwap  = Sse2.Shuffle(dataVec.AsUInt32(), 0b01001110);
                            Vector128 <ulong> sum       = Sse2.Add(xAcc[i], dataSwap.AsUInt64());
                            xAcc[i] = Sse2.Add(product, sum);
                        }
                    }
                }
            }
            else
            {
                // Scalar fallback. acc[i ^ 1] += dataVal mirrors the vector paths'
                // dword swap (neighbouring-lane exchange at 64-bit granularity).
                for (int i = 0; i < AccNb; i++)
                {
                    ulong dataVal = BinaryPrimitives.ReadUInt64LittleEndian(input.Slice(i * sizeof(ulong)));
                    ulong dataKey = dataVal ^ BinaryPrimitives.ReadUInt64LittleEndian(secret.Slice(i * sizeof(ulong)));
                    acc[i ^ 1] += dataVal;
                    acc[i]     += Mult32To64((uint)dataKey, dataKey >> 32);
                }
            }
        }
Esempio n. 9
0
 /// <summary>
 /// Rotates each 64-bit lane of <paramref name="x"/> right by 32 bits — i.e. swaps
 /// the two 32-bit halves of every qword. The input is not modified.
 /// </summary>
 private static Vector128 <ulong> ror64_32(ref Vector128 <ulong> x)
 {
     // PSHUFD control 0b10_11_00_01 exchanges the dwords within each qword.
     Vector128 <uint> swapped = Sse2.Shuffle(x.AsUInt32(), 0b_10_11_00_01);
     return swapped.AsUInt64();
 }
Esempio n. 10
0
        /// <summary>
        /// Scans <paramref name="lengthInBytes"/> bytes at <paramref name="pData"/> and
        /// returns the index of the first byte that requires escaping per the instance's
        /// _allowedAsciiCodePoints bitmap, or lengthInBytes if none does. Uses SSSE3
        /// pshufb-based lookups over 16/8/4-byte chunks, then a scalar tail.
        /// </summary>
        private unsafe nuint GetIndexOfFirstByteToEncodeSsse3(byte *pData, nuint lengthInBytes)
        {
            Debug.Assert(Ssse3.IsSupported);
            Debug.Assert(BitConverter.IsLittleEndian);

            Vector128 <byte> vecZero           = Vector128 <byte> .Zero;
            Vector128 <byte> vec0x7            = Vector128.Create((byte)0x7);
            Vector128 <byte> vecPowersOfTwo    = Vector128.Create(1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0);
            Vector128 <byte> allowedCodePoints = _allowedAsciiCodePoints.AsVector;
            int pmovmskb;

            nuint i = 0;

            if (lengthInBytes >= 16)
            {
                // Round length down to a multiple of 16 for the wide loop.
                nuint lastLegalIterationFor16CharRead = lengthInBytes & unchecked ((nuint)(nint) ~0xF);

                do
                {
                    // Read 16 bytes at a time into a single 128-bit vector.

                    Vector128 <byte> packed = Sse2.LoadVector128(pData + i); // unaligned read

                    // Each element of the packed vector corresponds to a byte of untrusted source data. It will
                    // have the format [ ..., 0xYZ, ... ]. We use the low nibble of each byte to index into
                    // the 'allowedCodePoints' vector, and we use the high nibble of each byte to select a bit
                    // from the corresponding element in the 'allowedCodePoints' vector.
                    //
                    // Example: let packed := [ ..., 0x6D ('m'), ... ]
                    // The final 'result' vector will contain a non-zero value in the corresponding space iff the
                    // 0xD element in the 'allowedCodePoints' vector has its 1 << 0x6 bit set.
                    //
                    // We rely on the fact that the pshufb operation will turn each non-ASCII byte (high bit set)
                    // into 0x00 in the resulting 'shuffled' vector. That results in the corresponding element
                    // in the 'result' vector also being 0x00, meaning that escaping is required.

                    var allowedCodePointsShuffled = Ssse3.Shuffle(allowedCodePoints, packed);
                    var vecPowersOfTwoShuffled    = Ssse3.Shuffle(vecPowersOfTwo, Sse2.And(Sse2.ShiftRightLogical(packed.AsUInt32(), 4).AsByte(), vec0x7));
                    var result = Sse2.And(allowedCodePointsShuffled, vecPowersOfTwoShuffled);

                    // Now, each element of 'result' contains a non-zero value if the corresponding element in
                    // 'packed' is allowed; and it contains a zero value if the corresponding element in 'packed'
                    // is disallowed. We'll compare 'result' against an all-zero vector to normalize 0x00 -> 0xFF
                    // and (anything other than 0x00) -> 0x00. Then 'pmovmskb' will have its nth bit set iff
                    // the nth entry in 'packed' requires escaping. An all-zero pmovmskb means no escaping is required.

                    pmovmskb = Sse2.MoveMask(Sse2.CompareEqual(result, vecZero));
                    if ((pmovmskb & 0xFFFF) != 0)
                    {
                        goto MaskContainsDataWhichRequiresEscaping;
                    }
                } while ((i += 16) < lastLegalIterationFor16CharRead);
            }

            if ((lengthInBytes & 8) != 0)
            {
                // Read 8 bytes at a time into a single 128-bit vector.
                // Same logic as the 16-byte case, but we only care about the low byte of the final pmovmskb value.
                // Everything except the low byte of pmovmskb contains garbage and must be discarded.

                var packed = Sse2.LoadScalarVector128((/* unaligned */ ulong *)(pData + i)).AsByte();
                var allowedCodePointsShuffled = Ssse3.Shuffle(allowedCodePoints, packed);
                var vecPowersOfTwoShuffled    = Ssse3.Shuffle(vecPowersOfTwo, Sse2.And(Sse2.ShiftRightLogical(packed.AsUInt32(), 4).AsByte(), vec0x7));
                var result = Sse2.And(allowedCodePointsShuffled, vecPowersOfTwoShuffled);
                pmovmskb = Sse2.MoveMask(Sse2.CompareEqual(result, vecZero));
                if ((byte)pmovmskb != 0)
                {
                    goto MaskContainsDataWhichRequiresEscaping;
                }

                i += 8;
            }

            if ((lengthInBytes & 4) != 0)
            {
                // Read 4 bytes at a time into a single 128-bit vector.
                // Same logic as the 16-byte case, but we only care about the low nibble of the final pmovmskb value.
                // Everything except the low nibble of pmovmskb contains garbage and must be discarded.

                var packed = Sse2.LoadScalarVector128((/* unaligned */ uint *)(pData + i)).AsByte();
                var allowedCodePointsShuffled = Ssse3.Shuffle(allowedCodePoints, packed);
                var vecPowersOfTwoShuffled    = Ssse3.Shuffle(vecPowersOfTwo, Sse2.And(Sse2.ShiftRightLogical(packed.AsUInt32(), 4).AsByte(), vec0x7));
                var result = Sse2.And(allowedCodePointsShuffled, vecPowersOfTwoShuffled);
                pmovmskb = Sse2.MoveMask(Sse2.CompareEqual(result, vecZero));
                if ((pmovmskb & 0xF) != 0)
                {
                    goto MaskContainsDataWhichRequiresEscaping;
                }

                i += 4;
            }

            // Beyond this point, vectorization isn't worthwhile. Just do a normal loop.

            if ((lengthInBytes & 3) != 0)
            {
                Debug.Assert(lengthInBytes - i <= 3);

                do
                {
                    if (!_allowedAsciiCodePoints.IsAllowedAsciiCodePoint(pData[i]))
                    {
                        break;
                    }
                } while (++i != lengthInBytes);
            }

Return:

            return(i);

MaskContainsDataWhichRequiresEscaping:

            Debug.Assert(pmovmskb != 0);
            i += (uint)BitOperations.TrailingZeroCount(pmovmskb); // location of lowest set bit is where we must begin escaping
            goto Return;
        }
Esempio n. 11
0
 /// <summary>
 /// Generic software AND-NOT: reinterprets both operands as uint lanes, delegates to
 /// the uint overload of AndNot_Software, and reinterprets the result back to T.
 /// </summary>
 public static Vector128 <T> AndNot_Software <T>(Vector128 <T> left, Vector128 <T> right) where T : struct
 {
     Vector128 <uint> lhs = left.AsUInt32();
     Vector128 <uint> rhs = right.AsUInt32();
     Vector128 <uint> bits = AndNot_Software(lhs, rhs);
     return bits.As <uint, T>();
 }
Esempio n. 12
0
 /// <summary>
 /// Generic software bitwise NOT: reinterprets the operand as uint lanes, delegates to
 /// the uint overload of Not_Software, and reinterprets the result back to T.
 /// </summary>
 public static Vector128 <T> Not_Software <T>(Vector128 <T> vector) where T : struct
 {
     Vector128 <uint> inverted = Not_Software(vector.AsUInt32());
     return inverted.As <uint, T>();
 }
Esempio n. 13
0
        /// <summary>
        /// Figure 8. Code Sample -Performing GhashUsing an Aggregated Reduction Method
        /// Algorithm by Krzysztof Jankowski,  Pierre Laurent - Intel
        /// </summary>
        public static Vector128 <ulong> Reduce4(
            Vector128 <ulong> h1, Vector128 <ulong> h2, Vector128 <ulong> h3, Vector128 <ulong> h4,
            Vector128 <ulong> x1, Vector128 <ulong> x2, Vector128 <ulong> x3, Vector128 <ulong> x4)
        {
            Vector128 <ulong> h1x1Lo, h1x1Hi,
                              h2x2Lo, h2x2Hi,
                              h3x3Lo, h3x3Hi,
                              h4x4Lo, h4x4Hi,
                              lo, hi;
            Vector128 <ulong> tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;

            h1x1Lo = CarrylessMultiply(h1, x1, 0x00);
            h2x2Lo = CarrylessMultiply(h2, x2, 0x00);
            h3x3Lo = CarrylessMultiply(h3, x3, 0x00);
            h4x4Lo = CarrylessMultiply(h4, x4, 0x00);

            lo = Xor(h1x1Lo, h2x2Lo);
            lo = Xor(lo, h3x3Lo);
            lo = Xor(lo, h4x4Lo);

            h1x1Hi = CarrylessMultiply(h1, x1, 0x11);
            h2x2Hi = CarrylessMultiply(h2, x2, 0x11);
            h3x3Hi = CarrylessMultiply(h3, x3, 0x11);
            h4x4Hi = CarrylessMultiply(h4, x4, 0x11);

            hi = Xor(h1x1Hi, h2x2Hi);
            hi = Xor(hi, h3x3Hi);
            hi = Xor(hi, h4x4Hi);

            tmp0 = Shuffle(h1.AsUInt32(), 78).AsUInt64();
            tmp4 = Shuffle(x1.AsUInt32(), 78).AsUInt64();
            tmp0 = Xor(tmp0, h1);
            tmp4 = Xor(tmp4, x1);
            tmp1 = Shuffle(h2.AsUInt32(), 78).AsUInt64();
            tmp5 = Shuffle(x2.AsUInt32(), 78).AsUInt64();
            tmp1 = Xor(tmp1, h2);
            tmp5 = Xor(tmp5, x2);
            tmp2 = Shuffle(h3.AsUInt32(), 78).AsUInt64();
            tmp6 = Shuffle(x3.AsUInt32(), 78).AsUInt64();
            tmp2 = Xor(tmp2, h3);
            tmp6 = Xor(tmp6, x3);
            tmp3 = Shuffle(h4.AsUInt32(), 78).AsUInt64();
            tmp7 = Shuffle(x4.AsUInt32(), 78).AsUInt64();
            tmp3 = Xor(tmp3, h4);
            tmp7 = Xor(tmp7, x4);

            tmp0 = CarrylessMultiply(tmp0, tmp4, 0x00);
            tmp1 = CarrylessMultiply(tmp1, tmp5, 0x00);
            tmp2 = CarrylessMultiply(tmp2, tmp6, 0x00);
            tmp3 = CarrylessMultiply(tmp3, tmp7, 0x00);

            tmp0 = Xor(tmp0, lo);
            tmp0 = Xor(tmp0, hi);
            tmp0 = Xor(tmp1, tmp0);
            tmp0 = Xor(tmp2, tmp0);
            tmp0 = Xor(tmp3, tmp0);

            tmp4 = ShiftLeftLogical128BitLane(tmp0, 8);
            tmp0 = ShiftRightLogical128BitLane(tmp0, 8);

            lo = Xor(tmp4, lo);
            hi = Xor(tmp0, hi);

            tmp3 = lo;
            tmp6 = hi;

            tmp7 = ShiftRightLogical(tmp3.AsUInt32(), 31).AsUInt64();
            tmp8 = ShiftRightLogical(tmp6.AsUInt32(), 31).AsUInt64();
            tmp3 = ShiftLeftLogical(tmp3.AsUInt32(), 1).AsUInt64();
            tmp6 = ShiftLeftLogical(tmp6.AsUInt32(), 1).AsUInt64();

            tmp9 = ShiftRightLogical128BitLane(tmp7, 12);
            tmp8 = ShiftLeftLogical128BitLane(tmp8, 4);
            tmp7 = ShiftLeftLogical128BitLane(tmp7, 4);
            tmp3 = Or(tmp3, tmp7);
            tmp6 = Or(tmp6, tmp8);
            tmp6 = Or(tmp6, tmp9);

            tmp7 = ShiftLeftLogical(tmp3.AsUInt32(), 31).AsUInt64();
            tmp8 = ShiftLeftLogical(tmp3.AsUInt32(), 30).AsUInt64();
            tmp9 = ShiftLeftLogical(tmp3.AsUInt32(), 25).AsUInt64();

            tmp7 = Xor(tmp7, tmp8);
            tmp7 = Xor(tmp7, tmp9);
            tmp8 = ShiftRightLogical128BitLane(tmp7, 4);
            tmp7 = ShiftLeftLogical128BitLane(tmp7, 12);
            tmp3 = Xor(tmp3, tmp7);

            tmp2 = ShiftRightLogical(tmp3.AsUInt32(), 1).AsUInt64();
            tmp4 = ShiftRightLogical(tmp3.AsUInt32(), 2).AsUInt64();
            tmp5 = ShiftRightLogical(tmp3.AsUInt32(), 7).AsUInt64();
            tmp2 = Xor(tmp2, tmp4);
            tmp2 = Xor(tmp2, tmp5);
            tmp2 = Xor(tmp2, tmp8);
            tmp3 = Xor(tmp3, tmp2);
            tmp6 = Xor(tmp6, tmp3);

            return(tmp6);
        }
Esempio n. 14
0
 /// <summary>
 /// Rotates every 32-bit lane of <paramref name="value"/> left by <paramref name="offset"/> bits.
 /// </summary>
 /// <param name="value">Vector whose lanes are rotated (reinterpreted as uint lanes).</param>
 /// <param name="offset">Rotation amount in bits; expected in [0, 32].</param>
 public static Vector128 <T> RotateLeftUInt32 <T>(this Vector128 <T> value, byte offset) where T : struct
 {
     // rol(x, n) == (x << n) | (x >> (32 - n)). SSE2 shifts with a count >= 32
     // produce zero, so the endpoints offset == 0 and offset == 32 still combine
     // to the identity rotation.
     Vector128 <uint> lanes = value.AsUInt32();
     Vector128 <uint> upper = Sse2.ShiftLeftLogical(lanes, offset);
     Vector128 <uint> lower = Sse2.ShiftRightLogical(lanes, (byte)(32 - offset));
     return Sse2.Or(upper, lower).As <uint, T>();
 }