Exemplo n.º 1
0
        // Scenario: Sse2.And is fed directly from the _clsVar1 / _clsVar2 fields
        // (no explicit load intrinsic). The result is written to the output
        // buffer of _dataTable and then checked by ValidateResult.
        public void RunClsVarScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario));

            // Operands are deliberately passed inline so the call keeps the
            // same expression shape as the other scenarios in this file.
            var anded = Sse2.And(_clsVar1, _clsVar2);

            Unsafe.Write(_dataTable.outArrayPtr, anded);
            ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
        }
Exemplo n.º 2
0
        // Scenario: both operands of Sse2.And come from Sse2.LoadAlignedVector128
        // applied to the two input buffers of _dataTable (reinterpreted as UInt64*).
        // The result is written to the output buffer and validated against the inputs.
        public void RunBasicScenario_LoadAligned()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_LoadAligned));

            // NOTE(review): LoadAlignedVector128 requires suitably aligned
            // pointers; presumably _dataTable guarantees that - confirm.
            var anded = Sse2.And(
                Sse2.LoadAlignedVector128((UInt64*)(_dataTable.inArray1Ptr)),
                Sse2.LoadAlignedVector128((UInt64*)(_dataTable.inArray2Ptr)));

            Unsafe.Write(_dataTable.outArrayPtr, anded);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
        }
Exemplo n.º 3
0
        // Scenario: both operands of Sse2.And are produced by
        // Unsafe.Read<Vector128<UInt64>> on the two input buffers of _dataTable.
        // The result is written to the output buffer and validated against the inputs.
        public void RunBasicScenario_UnsafeRead()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_UnsafeRead));

            var anded = Sse2.And(
                Unsafe.Read<Vector128<UInt64>>(_dataTable.inArray1Ptr),
                Unsafe.Read<Vector128<UInt64>>(_dataTable.inArray2Ptr));

            Unsafe.Write(_dataTable.outArrayPtr, anded);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
        }
Exemplo n.º 4
0
        // Scenario: the operands are the _fld1 / _fld2 fields of a local TestStruct,
        // loaded through Sse2.LoadVector128 from the fields' addresses.
        // The result is written to the output buffer and validated against the fields.
        public void RunStructLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario_Load));

            var localStruct = TestStruct.Create();

            // The struct is a local, so its field addresses are taken directly.
            var anded = Sse2.And(
                Sse2.LoadVector128((Double*)(&localStruct._fld1)),
                Sse2.LoadVector128((Double*)(&localStruct._fld2)));

            Unsafe.Write(_dataTable.outArrayPtr, anded);
            ValidateResult(localStruct._fld1, localStruct._fld2, _dataTable.outArrayPtr);
        }
Exemplo n.º 5
0
        private Hit[] RayTraceAVXFaster(Ray ray)
        {
            Vector256 <double> dir      = (Vector256 <double>)ray.Direction;
            Vector256 <double> vert0    = (Vector256 <double>)Vert0.Position;
            Vector256 <double> edge0to1 = (Vector256 <double>)Edge0to1;
            Vector256 <double> edge0to2 = (Vector256 <double>)Edge0to2;

            Vector256 <double> offset = Avx.Subtract((Vector256 <double>)ray.Origin, vert0);
            Vector256 <double> side1  = SIMDHelpers.Cross(offset, edge0to1);
            Vector256 <double> side2  = SIMDHelpers.Cross(dir, edge0to2);

            // Prepare all dot products
            Vector256 <double> uvTemp    = Avx.Multiply(offset, side2);         // u
            Vector256 <double> temp      = Avx.Multiply(dir, side1);            // v
            Vector256 <double> edge2Temp = Avx.Multiply(edge0to2, side1);
            Vector256 <double> distTemp  = Avx.Multiply(edge0to1, side2);

            uvTemp    = Avx.HorizontalAdd(uvTemp, temp);
            edge2Temp = Avx.HorizontalAdd(edge2Temp, edge2Temp);
            distTemp  = Avx.HorizontalAdd(distTemp, distTemp);

            // Complete all dot products for SSE ops
            Vector128 <double> uvs   = SIMDHelpers.Add2(uvTemp);
            Vector128 <double> dist  = SIMDHelpers.Add2(edge2Temp);
            Vector128 <double> temp1 = SIMDHelpers.Add2(distTemp);
            Vector128 <double> temp2;

            // vec2 constants we'll be using later
            Vector128 <double> ones2   = SIMDHelpers.BroadcastScalar2(1D);
            Vector128 <double> zeroes2 = new Vector128 <double>();

            // Reciprocal of distance along edge0to1
            temp1 = Sse2.Divide(ones2, temp1);
            temp2 = Sse2.CompareOrdered(temp1, temp1);
            // Remove NaNs from the result, replaced with 0
            Vector128 <double> distZeroed = Sse2.And(temp1, temp2);

            uvs  = Sse2.Multiply(uvs, distZeroed);
            dist = Sse2.Multiply(dist, distZeroed);

            // compare uvs < 0 and > 1, dist < 0, jump out if any of those conditions are met
            temp1 = Sse2.CompareLessThan(uvs, zeroes2);
            temp2 = Mirror ? uvs : Sse3.HorizontalAdd(uvs, uvs);
            temp2 = Sse2.CompareGreaterThan(temp2, ones2);
            temp1 = Sse2.Or(temp1, temp2);
            temp2 = Sse2.CompareLessThan(dist, zeroes2);
            temp1 = Sse2.Or(temp1, temp2);

            if (!Avx.TestZ(temp1, temp1))
            {
                return(default);
Exemplo n.º 6
0
        public static Vector128<sbyte> CreateEscapingMask(
            Vector128<sbyte> sourceValue,
            Vector128<sbyte> bitMaskLookup,
            Vector128<sbyte> bitPosLookup,
            Vector128<sbyte> nibbleMaskSByte,
            Vector128<sbyte> nullMaskSByte)
        {
            // Whether an input byte must be escaped is decided with a bitmask lookup.
            // Each input byte is split into its low and high nibble; the low nibble
            // selects a row and the high nibble selects a column of the bit-mask.
            // The lookup looks like this (example: s_bitMaskLookupBasicLatin):
            //                                     high-nibble
            // low-nibble  0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
            //         0   1   1   0   0   0   0   1   0   1   1   1   1   1   1   1   1
            //         1   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         2   1   1   1   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         3   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         4   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         5   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         6   1   1   1   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         7   1   1   1   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         8   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         9   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         A   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         B   1   1   1   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         C   1   1   0   1   0   1   0   0   1   1   1   1   1   1   1   1
            //         D   1   1   0   0   0   0   0   0   1   1   1   1   1   1   1   1
            //         E   1   1   0   1   0   0   0   0   1   1   1   1   1   1   1   1
            //         F   1   1   0   0   0   0   0   1   1   1   1   1   1   1   1   1
            //
            // A 1 marks a byte that needs escaping, a 0 marks one that does not.
            // Every input with a high nibble in 8..F needs escaping, so those columns
            // are omitted from the bit-mask; only high nibbles 0..7 have to be
            // represented, which is why a table entry fits into a single byte.
            //
            // For each row (= low nibble) the table stores one bit-mask covering the
            // columns (= high nibbles).

            Debug.Assert(Ssse3.IsSupported);

            // There is no per-byte shift, so shift the data as 32-bit lanes and mask
            // away the bits that bled in from the neighbouring byte.
            // NOTE(review): nibbleMaskSByte is presumably 0x0F in every lane - confirm with callers.
            Vector128<sbyte> shiftedBy4 = Sse2.ShiftRightLogical(sourceValue.AsInt32(), 4).AsSByte();
            Vector128<sbyte> columnIndex = Sse2.And(shiftedBy4, nibbleMaskSByte);
            Vector128<sbyte> rowIndex = Sse2.And(sourceValue, nibbleMaskSByte);

            // Row bit-mask selected by the low nibble, column bit selected by the high nibble.
            Vector128<sbyte> rowBits = Ssse3.Shuffle(bitMaskLookup, rowIndex);
            Vector128<sbyte> columnBit = Ssse3.Shuffle(bitPosLookup, columnIndex);

            Vector128<sbyte> hit = Sse2.And(columnBit, rowBits);

            // Two chained equality tests against nullMaskSByte: the first yields
            // all-ones where 'hit' equals the null mask, the second inverts that, so
            // (with a zero null mask) lanes with a non-zero 'hit' end up all-ones.
            // NOTE(review): assumes nullMaskSByte is all zeros - confirm with callers.
            Vector128<sbyte> equalsNull = Sse2.CompareEqual(nullMaskSByte, hit);
            return Sse2.CompareEqual(nullMaskSByte, equalsNull);
        }
Exemplo n.º 7
0
        // Multiplies two vectors of 16 bytes lane by lane, keeping the low 8 bits
        // of every product (i.e. wrapping byte multiplication).
        //
        // SSE2 has no 8-bit multiply, so the data is reinterpreted as eight 16-bit
        // lanes and multiplied twice with MultiplyLow:
        //   * once as-is, keeping only the low byte of each 16-bit product, which
        //     is the product of the two even-indexed (low) bytes;
        //   * once with both inputs shifted right by 8, so the odd-indexed (high)
        //     bytes are multiplied; the result is shifted back into the high byte.
        // OR-ing both halves yields the 16 byte-wise products.
        //
        // The reinterpretation uses AsUInt16()/AsByte(); the single-type-argument
        // As<T>() form is not part of the shipped Vector128 API.
        public static Vector128<byte> op_Multiply(Vector128<byte> left, Vector128<byte> right)
        {
            Vector128<ushort> left16 = left.AsUInt16();
            Vector128<ushort> right16 = right.AsUInt16();
            Vector128<ushort> lowByteMask = Vector128.Create((ushort)0x00FF);

            // Low 8 bits of a product only depend on the low 8 bits of the factors,
            // so the high bytes still present in left16/right16 do not disturb it.
            Vector128<ushort> evenProducts = Sse2.And(lowByteMask, Sse2.MultiplyLow(left16, right16));

            Vector128<ushort> oddProducts =
                Sse2.ShiftLeftLogical(
                    Sse2.MultiplyLow(
                        Sse2.ShiftRightLogical(left16, 8),
                        Sse2.ShiftRightLogical(right16, 8)),
                    8);

            return Sse2.Or(evenProducts, oddProducts).AsByte();
        }
Exemplo n.º 8
0
            // Scenario: the operands are this instance's own _fld1 / _fld2 fields,
            // pinned with 'fixed' and loaded through Sse2.LoadVector128. The result
            // is written to the output buffer owned by 'testClass' and validated
            // through testClass.ValidateResult.
            public void RunStructFldScenario_Load(SimpleBinaryOpTest__AndDouble testClass)
            {
                fixed (Vector128<Double>* lhsPtr = &_fld1)
                fixed (Vector128<Double>* rhsPtr = &_fld2)
                {
                    var anded = Sse2.And(
                        Sse2.LoadVector128((Double*)(lhsPtr)),
                        Sse2.LoadVector128((Double*)(rhsPtr)));

                    Unsafe.Write(testClass._dataTable.outArrayPtr, anded);
                    testClass.ValidateResult(_fld1, _fld2, testClass._dataTable.outArrayPtr);
                }
            }
Exemplo n.º 9
0
        /*=========================================================================
        ** ANDs every bit of the current instance with the corresponding bit of
        ** value, in place, and returns a reference to the current instance.
        **
        ** Exceptions: ArgumentNullException if value == null.
        **             ArgumentException if value.Length != this.Length.
        **
        ** Notes: small arrays (up to three ints) are handled by a fall-through
        **        switch; larger ones use a 128-bit SSE2 loop when available,
        **        followed by a scalar loop for the remaining ints. _version is
        **        incremented on every successful call.
        ** =========================================================================*/
        public unsafe BitArray And(BitArray value)
        {
            if (value == null)
            {
                throw new ArgumentNullException(nameof(value));
            }
            if (Length != value.Length)
            {
                throw new ArgumentException(SR.Arg_ArrayLengthsDiffer);
            }

            // Work on snapshots of both backing arrays so the bounds established
            // below stay valid for the pointer-based loop.
            int[] thisArray  = m_array;
            int[] valueArray = value.m_array;

            // Only the bit lengths were compared above; nothing in this method
            // guarantees that the two backing arrays have the same element count.
            // The vectorised loop below is not bounds-checked, so never iterate
            // past the end of the shorter array.
            int count = thisArray.Length <= valueArray.Length ? thisArray.Length : valueArray.Length;

            switch (count)
            {
            case 3: thisArray[2] &= valueArray[2]; goto case 2;

            case 2: thisArray[1] &= valueArray[1]; goto case 1;

            case 1: thisArray[0] &= valueArray[0]; goto Done;

            case 0: goto Done;
            }

            int i = 0;

            if (Sse2.IsSupported)
            {
                fixed (int* leftPtr = thisArray)
                fixed (int* rightPtr = valueArray)
                {
                    // Process full vectors only; the loop stops while fewer than
                    // Vector128<int>.Count ints remain.
                    for (; i < count - (Vector128<int>.Count - 1); i += Vector128<int>.Count)
                    {
                        Vector128<int> leftVec  = Sse2.LoadVector128(leftPtr + i);
                        Vector128<int> rightVec = Sse2.LoadVector128(rightPtr + i);
                        Sse2.Store(leftPtr + i, Sse2.And(leftVec, rightVec));
                    }
                }
            }

            // Scalar tail (or the whole range when SSE2 is unavailable).
            for (; i < count; i++)
            {
                thisArray[i] &= valueArray[i];
            }

Done:
            _version++;
            return(this);
        }
Exemplo n.º 10
0
        // Combines the four input vectors into one with shifts, a mask and XORs:
        //   (c >>> Sr2 bytes) ^ a ^ (d << Sl1) ^ (a <<< Sl2 bytes) ^ ((b >> Sr1) & Sse2ParamMask.si)
        // where ">>>"/"<<<" are the whole-register byte shifts
        // (ShiftRightLogical128BitLane / ShiftLeftLogical128BitLane) and
        // ">>"/"<<" are the per-element logical shifts.
        // NOTE(review): this has the shape of an SFMT-style recursion step;
        // the shift constants and mask are defined elsewhere - confirm.
        private static Vector128<int> Recursion(Vector128<int> a, Vector128<int> b, Vector128<int> c,
                                                Vector128<int> d)
        {
            // Masked, element-wise right shift of b.
            Vector128<int> maskedB = Sse2.And(Sse2.ShiftRightLogical(b, Sr1), Sse2ParamMask.si);

            // XOR is associative and commutative, so the terms are folded into
            // one accumulator; the result is bit-identical for any order.
            Vector128<int> acc = Sse2.Xor(Sse2.ShiftRightLogical128BitLane(c, Sr2), a);
            acc = Sse2.Xor(acc, Sse2.ShiftLeftLogical(d, Sl1));
            acc = Sse2.Xor(acc, Sse2.ShiftLeftLogical128BitLane(a, Sl2));

            return Sse2.Xor(acc, maskedB);
        }
Exemplo n.º 11
0
        // Processes one line of 'cb' bytes in place (results are stored back to
        // pcurr), 16 bytes per iteration, using the matching bytes of pprev and
        // pnext as references. For each byte an averaged value is computed from
        // the current/previous/next samples and blended over the original under a
        // mask derived from threshold comparisons and from whether the current
        // sample is strictly above or strictly below both neighbours.
        //
        // NOTE(review): 'end' is computed as cb - Vector128<byte>.Count in unsigned
        // arithmetic, so cb smaller than one vector would wrap; presumably callers
        // guarantee cb >= 16 - confirm.
        // NOTE(review): HWIntrinsics.BlendVariable is a project helper; it is
        // assumed to select per byte between its first two arguments according to
        // the mask in the third - confirm its argument order.
        unsafe private static void denoiseLineSse2(byte *pcurr, byte *pprev, byte *pnext, int cb)
        {
            byte *ip = pcurr, pp = pprev, np = pnext;
            // 'end' is the offset of the last position where a full vector still fits.
            nuint cnt = 0, end = (nuint)cb - (nuint)Vector128 <byte> .Count;

            // 0x80 bias used below to turn unsigned bytes into signed ones for comparison.
            var voffset = Vector128.Create((byte)0x80);
            var vthresh = Vector128.Create(denoiseThreshold);

LoopTop:
            do
            {
                var vcurr = Sse2.LoadVector128(ip + cnt);
                var vprev = Sse2.LoadVector128(pp + cnt);
                var vnext = Sse2.LoadVector128(np + cnt);

                // |curr - prev| via two saturating subtractions (one of them is always 0),
                // then mask = all-ones where the difference is <= threshold.
                var vdiffp = Sse2.Or(Sse2.SubtractSaturate(vcurr, vprev), Sse2.SubtractSaturate(vprev, vcurr));
                var vmaskp = Sse2.CompareEqual(Sse2.Max(vdiffp, vthresh), vthresh);

                // Same for |curr - next|.
                var vdiffn = Sse2.Or(Sse2.SubtractSaturate(vcurr, vnext), Sse2.SubtractSaturate(vnext, vcurr));
                var vmaskn = Sse2.CompareEqual(Sse2.Max(vdiffn, vthresh), vthresh);

                var vavgp = Sse2.Average(vcurr, vprev);
                var vavgn = Sse2.Average(vcurr, vnext);

                // Combine the two neighbour averages according to which neighbours are
                // within the threshold; voutmsk marks bytes where at least one is.
                var voutval = Sse2.Average(HWIntrinsics.BlendVariable(vavgn, vavgp, vmaskp), HWIntrinsics.BlendVariable(vavgp, vavgn, vmaskn));
                var voutmsk = Sse2.Or(vmaskp, vmaskn);
                voutval = Sse2.Average(voutval, HWIntrinsics.BlendVariable(voutval, Sse2.Average(vprev, vnext), Sse2.And(vmaskp, vmaskn)));

                // SSE2 only has signed byte comparisons: XOR with 0x80 maps unsigned
                // order onto signed order.
                var vcurrs = Sse2.Xor(vcurr, voffset).AsSByte();
                var vprevs = Sse2.Xor(vprev, voffset).AsSByte();
                var vnexts = Sse2.Xor(vnext, voffset).AsSByte();

                // curr strictly greater than both neighbours / strictly less than both.
                var vsurlt = Sse2.And(Sse2.CompareGreaterThan(vcurrs, vprevs), Sse2.CompareGreaterThan(vcurrs, vnexts));
                var vsurgt = Sse2.And(Sse2.CompareGreaterThan(vprevs, vcurrs), Sse2.CompareGreaterThan(vnexts, vcurrs));

                // Only bytes that pass the threshold mask and are a local extreme
                // take part in the final blend with the original value.
                voutmsk = Sse2.And(voutmsk, Sse2.Or(vsurlt, vsurgt).AsByte());
                voutval = HWIntrinsics.BlendVariable(vcurr, voutval, voutmsk);

                Sse2.Store(ip + cnt, voutval);
                cnt += (nuint)Vector128 <byte> .Count;
            } while (cnt <= end);

            // Tail handling: if the line length is not a multiple of the vector
            // size, rewind to the last full-vector position and run the loop body
            // once more. That pass overlaps bytes already processed (and already
            // overwritten in pcurr) by the previous iteration.
            if (cnt < end + (nuint)Vector128 <byte> .Count)
            {
                cnt = end;
                goto LoopTop;
            }
        }
Exemplo n.º 12
0
        // Scenario: the operands are the _clsVar1 / _clsVar2 fields, pinned with
        // 'fixed' and loaded through Sse2.LoadVector128. The result is written to
        // the output buffer of _dataTable and validated against the fields.
        public void RunClsVarScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario_Load));

            fixed (Vector128<UInt32>* lhsPtr = &_clsVar1)
            fixed (Vector128<UInt32>* rhsPtr = &_clsVar2)
            {
                var anded = Sse2.And(
                    Sse2.LoadVector128((UInt32*)(lhsPtr)),
                    Sse2.LoadVector128((UInt32*)(rhsPtr)));

                Unsafe.Write(_dataTable.outArrayPtr, anded);
                ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
            }
        }
Exemplo n.º 13
0
        // One recursion step on 128-bit state words:
        //   lung' = shuffle(u, SSE2_SHUFF) ^ ((a << DSFMT_SL1, per 64-bit lane) ^ b)
        //   r     = (lung' >> DSFMT_SR, per 64-bit lane) ^ a ^ (lung' & sse2_param_mask.i128)
        //   u     = lung'
        // All inputs are read before r and u are written, so the step stays
        // correct even if the ref parameters alias each other.
        // NOTE(review): constant names suggest the dSFMT generator - confirm.
        static void do_recursion(ref FloatW128 r, ref FloatW128 a, ref FloatW128 b, ref FloatW128 u)
        {
            Vector128<int> aBits = a.si;
            Vector128<int> shuffledU = Sse2.Shuffle(u.si, SSE2_SHUFF);

            // 64-bit lane left shift of a, mixed with b.
            Vector128<int> mixed = Sse2.Xor(
                Sse2.ShiftLeftLogical(aBits.AsInt64(), DSFMT_SL1).AsInt32(),
                b.si);

            Vector128<int> nextU = Sse2.Xor(shuffledU, mixed);

            Vector128<int> shiftedDown = Sse2.ShiftRightLogical(nextU.AsUInt64(), DSFMT_SR).AsInt32();
            Vector128<int> masked = Sse2.And(nextU, sse2_param_mask.i128);

            r.si = Sse2.Xor(Sse2.Xor(shiftedDown, aBits), masked);
            u.si = nextU;
        }
Exemplo n.º 14
0
        // Scenario: the operands are this instance's _fld1 / _fld2 fields, pinned
        // with 'fixed' and loaded through Sse2.LoadVector128. The result is written
        // to the output buffer of _dataTable and validated against the fields.
        public void RunClassFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load));

            fixed (Vector128<Double>* lhsPtr = &_fld1)
            fixed (Vector128<Double>* rhsPtr = &_fld2)
            {
                var anded = Sse2.And(
                    Sse2.LoadVector128((Double*)(lhsPtr)),
                    Sse2.LoadVector128((Double*)(rhsPtr)));

                Unsafe.Write(_dataTable.outArrayPtr, anded);
                ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr);
            }
        }
Exemplo n.º 15
0
        // Copies up to LineBuffer.Length characters from 'input' into LineBuffer,
        // replacing every NUL character ('\0') on the way, and returns the number
        // of characters copied (the smaller of the two lengths).
        //
        // With SSE2 the copy runs eight UTF-16 code units at a time; the remaining
        // characters (or everything, without SSE2) are handled one by one.
        // NOTE(review): the vector path substitutes SpaceCharUShort while the scalar
        // path substitutes ' '; presumably these are the same character - confirm.
        private static unsafe int FillBuffer(ReadOnlySpan<char> input)
        {
            int count = Math.Min(LineBuffer.Length, input.Length);

            fixed (char* buffer = LineBuffer, pInput = input)
            {
                int pos = 0;

                if (Sse2.IsSupported)
                {
                    Vector128<ushort> spaces = Vector128.Create(SpaceCharUShort);
                    ushort* src = (ushort*)pInput;
                    ushort* dst = (ushort*)buffer;

                    // Runs only while a whole vector of input is still available.
                    for (; (count - pos) >= Vector128<ushort>.Count; pos += Vector128<ushort>.Count)
                    {
                        Vector128<ushort> chunk = Sse2.LoadVector128(src + pos);

                        // All-ones in every lane that holds a NUL.
                        Vector128<ushort> isNul = Sse2.CompareEqual(Vector128<ushort>.Zero, chunk);

                        if (Sse41.IsSupported)
                        {
                            chunk = Sse41.BlendVariable(chunk, spaces, isNul);
                        }
                        else
                        {
                            // The lanes being replaced are zero already, so OR-ing in
                            // the masked replacement is enough.
                            chunk = Sse2.Or(chunk, Sse2.And(isNul, spaces));
                        }

                        Sse2.Store(dst + pos, chunk);
                    }
                }

                for (; pos < count; pos++)
                {
                    char ch = pInput[pos];
                    buffer[pos] = ch == 0 ? ' ' : ch;
                }
            }

            return count;
        }
        // Formats 'value' as eight upper-case hexadecimal digits (most significant
        // digit first) and writes them as UTF-16 chars to the start of _buffer,
        // using SSSE3 byte shuffles as table lookups.
        //
        // No hardware-support check and no bounds check is performed here: the
        // final store writes 16 bytes (8 chars) unconditionally.
        // NOTE(review): callers presumably guarantee Ssse3.IsSupported and a
        // _buffer of at least 8 chars - confirm.
        public void UseSse3_Unsafe(uint value)
        {
            char[] buffer = _buffer;
            _ = buffer.Length; // elide future null checks
                               // _ = buffer[7]; // elide future bounds checks

            uint tupleNumber = value;

            // These must be explicitly typed as ReadOnlySpan<byte>
            // They then become a non-allocating mappings to the data section of the assembly.
            // This uses C# compiler's ability to refer to static data directly. For more information see https://vcsjones.dev/2019/02/01/csharp-readonly-span-bytes-static
            //
            // Shuffle mask: places source bytes 3, 2, 1, 0 (most significant first) at
            // byte offsets 2, 6, 10, 14; every other lane selects source byte 15,
            // which must be zero for the nibble extraction below to be correct.
            ReadOnlySpan<byte> shuffleMaskData = new byte[16]
            {
                0xF, 0xF, 3, 0xF,
                0xF, 0xF, 2, 0xF,
                0xF, 0xF, 1, 0xF,
                0xF, 0xF, 0, 0xF
            };

            ReadOnlySpan<byte> asciiUpperCaseData = new byte[16]
            {
                (byte)'0', (byte)'1', (byte)'2', (byte)'3',
                (byte)'4', (byte)'5', (byte)'6', (byte)'7',
                (byte)'8', (byte)'9', (byte)'A', (byte)'B',
                (byte)'C', (byte)'D', (byte)'E', (byte)'F'
            };

            // Load from data section memory into Vector128 registers
            var shuffleMask    = Unsafe.ReadUnaligned<Vector128<byte>>(ref MemoryMarshal.GetReference(shuffleMaskData));
            var asciiUpperCase = Unsafe.ReadUnaligned<Vector128<byte>>(ref MemoryMarshal.GetReference(asciiUpperCaseData));

            // CreateScalar (not CreateScalarUnsafe) is required here: the shuffle
            // mask reads byte 15 of this vector as its "zero" source, and only
            // CreateScalar guarantees that the upper elements are zeroed.
            var lowNibbles  = Ssse3.Shuffle(Vector128.CreateScalar(tupleNumber).AsByte(), shuffleMask);

            // Move each source byte two lanes down and shift it right by four bits so
            // its high nibble lands in the lane just before its low nibble.
            var highNibbles = Sse2.ShiftRightLogical(Sse2.ShiftRightLogical128BitLane(lowNibbles, 2).AsInt32(), 4).AsByte();
            var indices     = Sse2.And(Sse2.Or(lowNibbles, highNibbles), Vector128.Create((byte)0xF));
            // Lookup the hex values at the positions of the indices
            var hex = Ssse3.Shuffle(asciiUpperCase, indices);

            // The high bytes (0x00) of the chars have also been converted to ascii hex '0', so clear them out.
            hex = Sse2.And(hex, Vector128.Create((ushort)0xFF).AsByte());

            // This generates much more efficient asm than fixing the buffer and using
            // Sse2.Store((byte*)(p + i), chars.AsByte());
            Unsafe.WriteUnaligned(
                ref Unsafe.As<char, byte>(
                    ref MemoryMarshal.GetArrayDataReference(buffer)),
                hex);
        }
Exemplo n.º 17
0
        // Inspects the 32 bytes starting at src + i: both 16-byte halves are masked
        // with AlphaMask, narrowed with signed saturation from 32-bit lanes down to
        // bytes, and compared with All0x80. Returns true when at least one of the
        // resulting bytes differs from All0x80, false when all of them match.
        // NOTE(review): AlphaMask / All0x80 are defined elsewhere; presumably the
        // 32 bytes are eight 4-byte pixels and the mask isolates alpha - confirm.
        private static unsafe bool IsNoneOpaque32Bytes(byte* src, int i)
        {
            byte* block = src + i;

            Vector128<int> maskedLo = Sse2.And(Sse2.LoadVector128(block).AsByte(), AlphaMask).AsInt32();
            Vector128<int> maskedHi = Sse2.And(Sse2.LoadVector128(block + 16).AsByte(), AlphaMask).AsInt32();

            // int -> short -> sbyte, saturating at each step; the second pack feeds
            // the same vector twice, so both halves of the result are identical.
            Vector128<short> packed16 = Sse2.PackSignedSaturate(maskedLo, maskedHi).AsInt16();
            Vector128<byte> packed8 = Sse2.PackSignedSaturate(packed16, packed16).AsByte();

            int matchBits = Sse2.MoveMask(Sse2.CompareEqual(packed8, All0x80));

            // 0xFFFF means every byte compared equal.
            return matchBits != 0xFFFF;
        }
Exemplo n.º 18
0
        // Scenario: the operands are the _fld1 / _fld2 fields of a freshly created
        // SimpleBinaryOpTest__AndDouble held in a local, pinned with 'fixed' and
        // loaded through Sse2.LoadVector128. The result is written to this
        // instance's output buffer and validated against the local's fields.
        public void RunClassLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

            var localTest = new SimpleBinaryOpTest__AndDouble();

            fixed (Vector128<Double>* lhsPtr = &localTest._fld1)
            fixed (Vector128<Double>* rhsPtr = &localTest._fld2)
            {
                var anded = Sse2.And(
                    Sse2.LoadVector128((Double*)(lhsPtr)),
                    Sse2.LoadVector128((Double*)(rhsPtr)));

                Unsafe.Write(_dataTable.outArrayPtr, anded);
                ValidateResult(localTest._fld1, localTest._fld2, _dataTable.outArrayPtr);
            }
        }
Exemplo n.º 19
0
        // Bitwise select: every result bit is taken from 'ifTrue' where the
        // corresponding bit of 'selector' is 1 and from 'ifFalse' where it is 0.
        // Uses AdvSimd.BitwiseSelect on Arm and (ifTrue & selector) | (~selector & ifFalse)
        // on SSE2; throws PlatformNotSupportedException when neither is available.
        //
        // The original author's note cites the DirectX Math Library as the basis:
        // https://github.com/microsoft/DirectXMath/blob/master/Inc/DirectXMathVector.inl
        public static Vector128<double> ConditionalSelectBitwise(Vector128<double> selector, Vector128<double> ifTrue, Vector128<double> ifFalse)
        {
            if (AdvSimd.IsSupported)
            {
                return AdvSimd.BitwiseSelect(selector, ifTrue, ifFalse);
            }

            if (Sse2.IsSupported)
            {
                Vector128<double> fromTrue = Sse2.And(ifTrue, selector);
                // AndNot complements its first operand: ~selector & ifFalse.
                Vector128<double> fromFalse = Sse2.AndNot(selector, ifFalse);
                return Sse2.Or(fromTrue, fromFalse);
            }

            // Reached only when no supported instruction set exists; the explicit
            // throw keeps the rest of the method from being needed on such platforms.
            throw new PlatformNotSupportedException();
        }
Exemplo n.º 20
0
        // Expands a block of 3-bit palette indices packed in 'rI' into bytes:
        // output[i] = rPal[(rI >> (3 * i)) & 7] for each of the
        // BlockWidth * BlockHeight entries.
        //
        // With AVX2 all sixteen entries are produced at once by a byte shuffle of
        // the first eight palette bytes; otherwise a scalar loop is used.
        // NOTE(review): the vector path reads 8 bytes from rPal without a bounds
        // check and always produces 16 outputs, so it assumes rPal.Length >= 8 and
        // BlockWidth * BlockHeight == 16 - confirm.
        private unsafe static void BCnDecodeTileAlpha(Span<byte> output, Span<byte> rPal, ulong rI)
        {
            if (!Avx2.IsSupported)
            {
                for (int i = 0; i < BlockWidth * BlockHeight; i++, rI >>= 3)
                {
                    output[i] = rPal[(int)(rI & 7)];
                }

                return;
            }

            Vector128<uint> laneShifts = Vector128.Create(0u, 3u, 6u, 9u);
            Vector128<uint> threeBits  = Vector128.Create(7u);

            // Palette: eight bytes loaded into the low half of the register.
            Vector128<byte> palette;

            fixed (byte* pRPal = rPal)
            {
                palette = Sse2.LoadScalarVector128((ulong*)pRPal).AsByte();
            }

            // Low 32 bits carry indices 0..7 (bits 0..23), bits 24.. carry indices 8..15.
            Vector128<uint> firstHalf  = Avx2.ShiftRightLogicalVariable(Vector128.Create((uint)rI), laneShifts);
            Vector128<uint> secondHalf = Avx2.ShiftRightLogicalVariable(Vector128.Create((uint)(rI >> 24)), laneShifts);

            // Per half: lanes shifted by 0/3/6/9 give four indices, a further
            // shift by 12 gives the next four.
            Vector128<uint> idx0to3   = Sse2.And(firstHalf, threeBits);
            Vector128<uint> idx4to7   = Sse2.And(Sse2.ShiftRightLogical(firstHalf, 12), threeBits);
            Vector128<uint> idx8to11  = Sse2.And(secondHalf, threeBits);
            Vector128<uint> idx12to15 = Sse2.And(Sse2.ShiftRightLogical(secondHalf, 12), threeBits);

            // Narrow 32-bit -> 16-bit -> 8-bit so the sixteen indices sit in order.
            Vector128<ushort> narrowLo = Sse41.PackUnsignedSaturate(idx0to3.AsInt32(), idx4to7.AsInt32());
            Vector128<ushort> narrowHi = Sse41.PackUnsignedSaturate(idx8to11.AsInt32(), idx12to15.AsInt32());
            Vector128<byte> shuffleIndices = Sse2.PackUnsignedSaturate(narrowLo.AsInt16(), narrowHi.AsInt16());

            // Palette lookup for all sixteen entries in one shuffle.
            MemoryMarshal.Cast<byte, Vector128<byte>>(output)[0] = Ssse3.Shuffle(palette, shuffleIndices);
        }
Exemplo n.º 21
0
        // Upper-cases the ASCII letters 'a'..'z' of 'text' in place; every other
        // character (digits, punctuation, whitespace, non-ASCII) is left untouched.
        //
        // WARNING: this overwrites the characters of the string object itself.
        // Strings are expected to be immutable, so only pass strings that are not
        // shared, interned or used as dictionary keys.
        //
        // With SSE2 eight UTF-16 code units are processed per iteration; the
        // remainder (or everything, without SSE2) is handled one char at a time.
        // Throws NullReferenceException when 'text' is null (unchanged behaviour).
        public static unsafe void ToUpperASCIIInPlace_SIMD(string text)
        {
            // Bit 5 is the only difference between an ASCII lower-case letter and
            // its upper-case counterpart.
            const short caseBit = 0x20;

            int len = text.Length;

            fixed (char* pSource = text)
            {
                int i = 0;

                if (Sse2.IsSupported)
                {
                    Vector128<short> belowLowerA = Vector128.Create((short)('a' - 1));
                    Vector128<short> aboveLowerZ = Vector128.Create((short)('z' + 1));
                    Vector128<short> caseBits    = Vector128.Create(caseBit);

                    int lastBlockIndex = len - (len % Vector128<short>.Count);

                    while (i < lastBlockIndex)
                    {
                        short* block = (short*)(pSource + i);
                        Vector128<short> chars = Sse2.LoadVector128(block);

                        // Signed compares are fine here: code units >= 0x8000 become
                        // negative and therefore fail the "greater than" test.
                        Vector128<short> isLower = Sse2.And(
                            Sse2.CompareGreaterThan(chars, belowLowerA),
                            Sse2.CompareLessThan(chars, aboveLowerZ));

                        // Flip the case bit only in lanes holding 'a'..'z'.
                        chars = Sse2.Xor(chars, Sse2.And(isLower, caseBits));

                        Sse2.Store(block, chars);

                        i += Vector128<short>.Count;
                    }
                }

                while (i < len)
                {
                    char c = pSource[i];
                    if (c >= 'a' && c <= 'z')
                    {
                        pSource[i] = (char)(c - caseBit);
                    }

                    i += 1;
                }
            }
        }
Exemplo n.º 22
0
        // Returns how many elements of 'numbers' are even.
        //
        // The method actually counts odd elements (lowest bit set) and subtracts
        // that from the array length. When SSSE3 is available, four ints are
        // processed per iteration and the four per-lane counters are summed with
        // two horizontal adds; the remaining elements - or all of them when the
        // instruction set is unavailable - are counted by a scalar loop.
        // Throws NullReferenceException when 'numbers' is null (unchanged behaviour).
        public static unsafe int CountEvenSIMD(int[] numbers)
        {
            int len      = numbers.Length;
            int oddCount = 0;
            int i        = 0;

            // SSSE3 is needed for HorizontalAdd; guarding the whole vector path keeps
            // the method usable on hardware without it.
            if (Ssse3.IsSupported)
            {
                fixed (int* num = numbers)
                {
                    Vector128<int> laneCounts = Vector128<int>.Zero;
                    Vector128<int> ones       = Vector128.Create(1);

                    int lastBlockIndex = len - (len % Vector128<int>.Count);

                    for (; i < lastBlockIndex; i += Vector128<int>.Count)
                    {
                        // Lowest bit is 1 for odd values (also for negative ones in
                        // two's complement), so adding it counts the odd elements.
                        Vector128<int> lowBits = Sse2.And(Sse2.LoadVector128(num + i), ones);
                        laneCounts = Sse2.Add(laneCounts, lowBits);
                    }

                    // Two horizontal adds fold the four lane counters into lane 0.
                    laneCounts = Ssse3.HorizontalAdd(laneCounts, laneCounts);
                    laneCounts = Ssse3.HorizontalAdd(laneCounts, laneCounts);

                    oddCount = laneCounts.ToScalar();
                }
            }

            for (; i < len; i++)
            {
                oddCount += numbers[i] & 1;
            }

            return len - oddCount;
        }
Exemplo n.º 23
0
        public unsafe static void Run(World world)
        {
            if (!Sse.IsSupported || !Sse2.IsSupported)
            {
                throw new Exception("Your processor must support SSE and SSE2 to run this.");
            }

            var charCount = world.AllCharacters.Count;
            var chars     = stackalloc CharData[charCount];

            for (var i = 0; i < charCount; i++)
            {
                var characterActor      = world.AllCharacters[i];
                var allegianceComponent = characterActor.FindComponent <AllegianceComponent>();

                chars[i] = new CharData
                {
                    X          = characterActor.Position.X,
                    Y          = characterActor.Position.Y,
                    Z          = characterActor.Position.Z,
                    Allegiance = allegianceComponent.Allegiance
                };
            }

            var doorData = world.DoorData;

            var doorCount = doorData.Count;

            for (var d = 0; d < doorCount; d += 4)
            {
                var doorX  = Sse.LoadAlignedVector128(doorData.X.AlignedPointer + d);
                var doorY  = Sse.LoadAlignedVector128(doorData.Y.AlignedPointer + d);
                var doorZ  = Sse.LoadAlignedVector128(doorData.Z.AlignedPointer + d);
                var doorR2 = Sse.LoadAlignedVector128(doorData.RadiusSquared.AlignedPointer + d);
                var doorA  = Sse2.LoadAlignedVector128(doorData.Allegiance.AlignedPointer + d);

                var state = Vector128 <uint> .Zero;

                for (var cc = 0; cc < charCount; cc++)
                {
                    ref var c = ref chars[cc];

                    var charX = Vector128.Create(c.X);
                    var charY = Vector128.Create(c.Y);
                    var charZ = Vector128.Create(c.Z);
                    var charA = Vector128.Create(c.Allegiance);

                    var ddx  = Sse.Subtract(doorX, charX);
                    var ddy  = Sse.Subtract(doorY, charY);
                    var ddz  = Sse.Subtract(doorZ, charZ);
                    var dtx  = Sse.Multiply(ddx, ddx);
                    var dty  = Sse.Multiply(ddy, ddy);
                    var dtz  = Sse.Multiply(ddz, ddz);
                    var dst2 = Sse.Add(Sse.Add(dtx, dty), dtz);

                    var rmask = Sse.CompareLessThanOrEqual(dst2, doorR2);
                    var amask = Sse2.CompareEqual(charA, doorA);
                    var mask  = Sse2.And(rmask.AsUInt32(), amask);

                    state = Sse2.Or(mask, state);
                }

                Sse2.StoreAligned(doorData.ShouldBeOpen.AlignedPointer + d, state);
            }
Exemplo n.º 24
0
        public static unsafe Vector128 <byte> End(ref State state, Span <byte> store128)
        {
            long Len = state.TotalLengthInBytes;

            Vector128 <byte> xmm0 = state.xmm0;
            Vector128 <byte> xmm1 = state.xmm1;
            Vector128 <byte> xmm2 = state.xmm2;
            Vector128 <byte> xmm3 = state.xmm3;
            Vector128 <byte> xmm4 = state.xmm4;
            Vector128 <byte> xmm5 = state.xmm5;
            Vector128 <byte> xmm6 = state.xmm6;
            Vector128 <byte> xmm7 = state.xmm7;

            Vector128 <byte> xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;

            fixed(byte *rax = state.Buffer)
            {
                xmm9  = Vector128 <byte> .Zero;
                xmm11 = Vector128 <byte> .Zero;

                byte *Last = (byte *)rax + (Len & 0xf0);
                long  Len8 = (Len & 0xf);

                if (Len8 > 0)
                {
                    fixed(byte *MeowMaskLen = s_meowMaskLen)
                    {
                        xmm8 = Sse2.LoadVector128(&MeowMaskLen[0x10 - Len8]);
                    }

                    xmm9 = Sse2.LoadVector128(Last);
                    xmm9 = Sse2.And(xmm9, xmm8);
                }

                if ((Len & 0x10) != 0)
                {
                    xmm11 = xmm9;
                    xmm9  = Sse2.LoadVector128(Last - 0x10);
                }


                xmm8  = xmm9;
                xmm10 = xmm9;
                xmm8  = Ssse3.AlignRight(xmm8, xmm11, 15);
                xmm10 = Ssse3.AlignRight(xmm10, xmm11, 1);

                xmm12 = Vector128 <byte> .Zero;
                xmm13 = Vector128 <byte> .Zero;
                xmm14 = Vector128 <byte> .Zero;
                xmm15 = Vector128.Create((ulong)Len, 0).AsByte();
                xmm12 = Ssse3.AlignRight(xmm12, xmm15, 15);
                xmm14 = Ssse3.AlignRight(xmm14, xmm15, 1);

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostBlocks", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
                MEOW_DUMP_STATE("Residuals", xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
#endif

                // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty
                MEOW_MIX_REG(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, xmm8, xmm9, xmm10, xmm11);

                // NOTE(casey): Append the length, to avoid problems with our 32-byte padding
                MEOW_MIX_REG(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, xmm12, xmm13, xmm14, xmm15);

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostAppend", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                //
                // NOTE(casey): Hash all full 32-byte blocks
                //
                long LaneCount = (Len >> 5) & 0x7;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x00); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x20); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x40); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0x60); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0x80); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xa0); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0xc0); --LaneCount;

                //
                // NOTE(casey): Mix the eight lanes down to one 128-bit hash
                //

MixDown:

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostLanes", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
                MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
                MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
                MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);
                MEOW_SHUFFLE(ref xmm4, ref xmm5, xmm6, ref xmm0, ref xmm1, xmm2);
                MEOW_SHUFFLE(ref xmm5, ref xmm6, xmm7, ref xmm1, ref xmm2, xmm3);
                MEOW_SHUFFLE(ref xmm6, ref xmm7, xmm0, ref xmm2, ref xmm3, xmm4);
                MEOW_SHUFFLE(ref xmm7, ref xmm0, xmm1, ref xmm3, ref xmm4, xmm5);
                MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
                MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
                MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
                MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostMix", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                if (store128 != null)
                {
                    fixed(byte *store128Ptr = store128)
                    {
                        Sse2.Store(store128Ptr + 0x00, xmm0);
                        Sse2.Store(store128Ptr + 0x10, xmm1);
                        Sse2.Store(store128Ptr + 0x20, xmm2);
                        Sse2.Store(store128Ptr + 0x30, xmm3);
                        Sse2.Store(store128Ptr + 0x40, xmm4);
                        Sse2.Store(store128Ptr + 0x50, xmm5);
                        Sse2.Store(store128Ptr + 0x60, xmm6);
                        Sse2.Store(store128Ptr + 0x70, xmm7);
                    }
                }

                xmm0 = AddQ(xmm0, xmm2);
                xmm1 = AddQ(xmm1, xmm3);
                xmm4 = AddQ(xmm4, xmm6);
                xmm5 = AddQ(xmm5, xmm7);
                xmm0 = Sse2.Xor(xmm0, xmm1);
                xmm4 = Sse2.Xor(xmm4, xmm5);
                xmm0 = AddQ(xmm0, xmm4);

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostFold", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                return(xmm0);
            }
        }
Exemplo n.º 25
0
        //
        // NOTE(casey): Single block version
        //
        // Hashes all of SourceInit in one call and returns the folded 128-bit result.
        //
        // Seed128Init provides the initial contents of the eight 16-byte hash lanes, so it
        // must hold at least 128 bytes. A shorter seed is rejected with ArgumentException
        // rather than being read past its end by the raw pointer loads below.
        //
        // NOTE(review): the residual load may touch up to 15 bytes past the end of SourceInit
        // (it is only pulled back when it would otherwise cross a MEOW_PAGESIZE boundary); the
        // extra bytes are masked off before they are mixed in.
        //
        public static unsafe Vector128 <byte> Hash(ReadOnlySpan <byte> Seed128Init, ReadOnlySpan <byte> SourceInit)
        {
            // The seed loads read rcx + 0x00 .. rcx + 0x7f without any bounds check, and an
            // empty span would pin to a null pointer, so validate the length up front.
            if (Seed128Init.Length < 0x80)
            {
                throw new ArgumentException("The seed must contain at least 128 bytes.", nameof(Seed128Init));
            }

            Vector128 <byte> xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;       // NOTE(casey): xmm0-xmm7 are the hash accumulation lanes
            Vector128 <byte> xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; // NOTE(casey): xmm8-xmm15 hold values to be appended (residual, length)

            int Len = SourceInit.Length;

            fixed(byte *sourceInitPtr = SourceInit)
            fixed(byte *seedInitPtr = Seed128Init)
            {
                byte *rax = sourceInitPtr;
                byte *rcx = seedInitPtr;

                //
                // NOTE(casey): Seed the eight hash registers
                //

                xmm0 = Sse2.LoadVector128(rcx + 0x00);
                xmm1 = Sse2.LoadVector128(rcx + 0x10);
                xmm2 = Sse2.LoadVector128(rcx + 0x20);
                xmm3 = Sse2.LoadVector128(rcx + 0x30);

                xmm4 = Sse2.LoadVector128(rcx + 0x40);
                xmm5 = Sse2.LoadVector128(rcx + 0x50);
                xmm6 = Sse2.LoadVector128(rcx + 0x60);
                xmm7 = Sse2.LoadVector128(rcx + 0x70);

                // MEOW_DUMP_STATE("Seed", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0);

                //
                // NOTE(casey): Hash all full 256-byte blocks
                //

                int BlockCount = (SourceInit.Length >> 8);

                if (BlockCount > MEOW_PREFETCH_LIMIT)
                {
                    // NOTE(casey): For large input, modern Intel x64's can't hit full speed without prefetching, so we use this loop
                    while (BlockCount-- > 0)
                    {
                        Sse.Prefetch0(rax + MEOW_PREFETCH + 0x00);
                        Sse.Prefetch0(rax + MEOW_PREFETCH + 0x40);
                        Sse.Prefetch0(rax + MEOW_PREFETCH + 0x80);
                        Sse.Prefetch0(rax + MEOW_PREFETCH + 0xc0);

                        MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0x00);
                        MEOW_MIX(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, rax + 0x20);
                        MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x40);
                        MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x60);
                        MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x80);
                        MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0xa0);
                        MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0xc0);
                        MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xe0);

                        rax += 0x100;
                    }
                }
                else
                {
                    // NOTE(casey): For small input, modern Intel x64's can't hit full speed _with_ prefetching (because of port pressure), so we use this loop.
                    while (BlockCount-- > 0)
                    {
                        MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0x00);
                        MEOW_MIX(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, rax + 0x20);
                        MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x40);
                        MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x60);
                        MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x80);
                        MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0xa0);
                        MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0xc0);
                        MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xe0);

                        rax += 0x100;
                    }
                }

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostBlocks", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                //
                // NOTE(casey): Load any less-than-32-byte residual
                //

                xmm9  = Vector128 <byte> .Zero;
                xmm11 = Vector128 <byte> .Zero;

                //
                // TODO(casey): I need to put more thought into how the end-of-buffer stuff is actually working out here,
                // because I _think_ it may be possible to remove the first branch (on Len8) and let the mask zero out the
                // result, but it would take a little thought to make sure it couldn't read off the end of the buffer due
                // to the & 0xf on the align computation.
                //

                // NOTE(casey): First, we have to load the part that is _not_ 16-byte aligned
                byte *Last = (byte *)sourceInitPtr + (Len & ~0xf);
                int   Len8 = (Len & 0xf);
                if (Len8 > 0)
                {
                    // NOTE(casey): Load the mask early
                    fixed(byte *MeowMaskLen = s_meowMaskLen)
                    {
                        xmm8 = Sse2.LoadVector128(&MeowMaskLen[0x10 - Len8]);
                    }

                    // LastOk is the highest address from which a 16-byte load still ends inside the
                    // page that holds the final source byte; if Last lies beyond it, the load is
                    // moved back by Align bytes and the shuffle below shifts the data into place.
                    byte *LastOk = (byte *)((((ulong)(((byte *)sourceInitPtr) + Len - 1)) | (MEOW_PAGESIZE - 1)) - 16);
                    int   Align  = (Last > LastOk) ? ((int)(ulong)Last) & 0xf : 0;

                    fixed(byte *MeowShiftAdjust = s_meowShiftAdjust)
                    {
                        xmm10 = Sse2.LoadVector128(&MeowShiftAdjust[Align]);
                    }

                    xmm9 = Sse2.LoadVector128(Last - Align);
                    xmm9 = Ssse3.Shuffle(xmm9, xmm10);

                    // NOTE(jeffr): and off the extra bytes
                    xmm9 = Sse2.And(xmm9, xmm8);
                }

                // NOTE(casey): Next, we have to load the part that _is_ 16-byte aligned
                if ((Len & 0x10) != 0)
                {
                    xmm11 = xmm9;
                    xmm9  = Sse2.LoadVector128(Last - 0x10);
                }

                //
                // NOTE(casey): Construct the residual and length ingests
                //

                xmm8  = xmm9;
                xmm10 = xmm9;
                xmm8  = Ssse3.AlignRight(xmm8, xmm11, 15);
                xmm10 = Ssse3.AlignRight(xmm10, xmm11, 1);

                // NOTE(casey): We have room for a 128-bit nonce and a 64-bit nonce here, but
                // the decision was made to leave them zero'd so as not to confuse people
                // about how to use them or what security implications they had.
                xmm12 = Vector128 <byte> .Zero;
                xmm13 = Vector128 <byte> .Zero;
                xmm14 = Vector128 <byte> .Zero;
                xmm15 = Vector128.Create((ulong)Len, 0).AsByte();
                xmm12 = Ssse3.AlignRight(xmm12, xmm15, 15);
                xmm14 = Ssse3.AlignRight(xmm14, xmm15, 1);

#if MEOW_DUMP
                MEOW_DUMP_STATE("Residuals", xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);
#endif

                // NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty
                MEOW_MIX_REG(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, xmm8, xmm9, xmm10, xmm11);

                // NOTE(casey): Append the length, to avoid problems with our 32-byte padding
                MEOW_MIX_REG(ref xmm1, ref xmm5, ref xmm7, ref xmm2, ref xmm3, xmm12, xmm13, xmm14, xmm15);

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostAppend", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                //
                // NOTE(casey): Hash all full 32-byte blocks
                //
                // rax now points just past the last full 256-byte block, so these are the
                // (at most seven) whole 32-byte lanes that remain before the residual.
                int LaneCount = (Len >> 5) & 0x7;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm2, ref xmm6, ref xmm0, ref xmm3, ref xmm4, rax + 0x00); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm3, ref xmm7, ref xmm1, ref xmm4, ref xmm5, rax + 0x20); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm4, ref xmm0, ref xmm2, ref xmm5, ref xmm6, rax + 0x40); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm5, ref xmm1, ref xmm3, ref xmm6, ref xmm7, rax + 0x60); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm6, ref xmm2, ref xmm4, ref xmm7, ref xmm0, rax + 0x80); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm7, ref xmm3, ref xmm5, ref xmm0, ref xmm1, rax + 0xa0); --LaneCount;
                if (LaneCount == 0)
                {
                    goto MixDown;
                }
                MEOW_MIX(ref xmm0, ref xmm4, ref xmm6, ref xmm1, ref xmm2, rax + 0xc0); --LaneCount;

                //
                // NOTE(casey): Mix the eight lanes down to one 128-bit hash
                //

MixDown:

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostLanes", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
                MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
                MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
                MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);
                MEOW_SHUFFLE(ref xmm4, ref xmm5, xmm6, ref xmm0, ref xmm1, xmm2);
                MEOW_SHUFFLE(ref xmm5, ref xmm6, xmm7, ref xmm1, ref xmm2, xmm3);
                MEOW_SHUFFLE(ref xmm6, ref xmm7, xmm0, ref xmm2, ref xmm3, xmm4);
                MEOW_SHUFFLE(ref xmm7, ref xmm0, xmm1, ref xmm3, ref xmm4, xmm5);
                MEOW_SHUFFLE(ref xmm0, ref xmm1, xmm2, ref xmm4, ref xmm5, xmm6);
                MEOW_SHUFFLE(ref xmm1, ref xmm2, xmm3, ref xmm5, ref xmm6, xmm7);
                MEOW_SHUFFLE(ref xmm2, ref xmm3, xmm4, ref xmm6, ref xmm7, xmm0);
                MEOW_SHUFFLE(ref xmm3, ref xmm4, xmm5, ref xmm7, ref xmm0, xmm1);

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostMix", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                // Fold the eight lanes pairwise into xmm0, which becomes the returned hash.
                xmm0 = AddQ(xmm0, xmm2);
                xmm1 = AddQ(xmm1, xmm3);
                xmm4 = AddQ(xmm4, xmm6);
                xmm5 = AddQ(xmm5, xmm7);
                xmm0 = Sse2.Xor(xmm0, xmm1);
                xmm4 = Sse2.Xor(xmm4, xmm5);
                xmm0 = AddQ(xmm0, xmm4);

#if MEOW_DUMP
                MEOW_DUMP_STATE("PostFold", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);
#endif

                return(xmm0);
            }
        }
Exemplo n.º 26
0
    /// <summary>
    /// Divides each signed 16-bit lane of <paramref name="dividend"/> by the matching lane of
    /// <paramref name="divisor"/>, truncating the quotient toward zero.
    /// </summary>
    /// <remarks>
    /// The lanes are widened to 32-bit integers, converted to single-precision floats and
    /// multiplied by a refined reciprocal of the divisor instead of being divided directly.
    /// Lanes whose divisor is zero are not special-cased.
    /// </remarks>
    /// <param name="dividend">The eight numerators.</param>
    /// <param name="divisor">The eight denominators.</param>
    /// <returns>The eight truncated quotients.</returns>
    public static Vector128 <short> Divide(this Vector128 <short> dividend, Vector128 <short> divisor)
    {
        // Based on https://stackoverflow.com/a/51458507/347870

        // Convert to two 32-bit integers: the odd ("hi") lanes are sign-extended by an
        // arithmetic shift, the even ("lo") lanes by shifting up and back down again.
        Vector128 <int> a_hi_epi32       = Sse2.ShiftRightArithmetic(dividend.AsInt32(), 16);
        Vector128 <int> a_lo_epi32_shift = Sse2.ShiftLeftLogical(dividend.AsInt32(), 16);
        Vector128 <int> a_lo_epi32       = Sse2.ShiftRightArithmetic(a_lo_epi32_shift, 16);

        Vector128 <int> b_hi_epi32       = Sse2.ShiftRightArithmetic(divisor.AsInt32(), 16);
        Vector128 <int> b_lo_epi32_shift = Sse2.ShiftLeftLogical(divisor.AsInt32(), 16);
        Vector128 <int> b_lo_epi32       = Sse2.ShiftRightArithmetic(b_lo_epi32_shift, 16);

        // Convert to 32-bit floats
        Vector128 <float> a_hi = Sse2.ConvertToVector128Single(a_hi_epi32);
        Vector128 <float> a_lo = Sse2.ConvertToVector128Single(a_lo_epi32);
        Vector128 <float> b_hi = Sse2.ConvertToVector128Single(b_hi_epi32);
        Vector128 <float> b_lo = Sse2.ConvertToVector128Single(b_lo_epi32);

        // Calculate the (approximate) reciprocal
        Vector128 <float> b_hi_rcp = Sse.Reciprocal(b_hi);
        Vector128 <float> b_lo_rcp = Sse.Reciprocal(b_lo);

        // Calculate the correction factor (two - rcp * b) of one Newton-Raphson step.
        // NOTE(review): the constant is slightly above 2.0, presumably so that exact quotients
        // do not land just below the integer and get truncated down - see the linked answer.
        Vector128 <float> b_hi_inv_1;
        Vector128 <float> b_lo_inv_1;
        Vector128 <float> two = Vector128.Create(2.00000051757f);

        if (Fma.IsSupported)
        {
            b_hi_inv_1 = Fma.MultiplyAddNegated(b_hi_rcp, b_hi, two);
            b_lo_inv_1 = Fma.MultiplyAddNegated(b_lo_rcp, b_lo, two);
        }
        else
        {
            Vector128 <float> b_mul_hi = Sse.Multiply(b_hi_rcp, b_hi);
            Vector128 <float> b_mul_lo = Sse.Multiply(b_lo_rcp, b_lo);
            b_hi_inv_1 = Sse.Subtract(two, b_mul_hi);
            b_lo_inv_1 = Sse.Subtract(two, b_mul_lo);
        }

        // Compensate for the loss
        Vector128 <float> b_hi_rcp_1 = Sse.Multiply(b_hi_rcp, b_hi_inv_1);
        Vector128 <float> b_lo_rcp_1 = Sse.Multiply(b_lo_rcp, b_lo_inv_1);

        // Perform the division by multiplication
        Vector128 <float> hi = Sse.Multiply(a_hi, b_hi_rcp_1);
        Vector128 <float> lo = Sse.Multiply(a_lo, b_lo_rcp_1);

        // Convert back to integers
        Vector128 <int> hi_epi32 = Sse2.ConvertToVector128Int32WithTruncation(hi);
        Vector128 <int> lo_epi32 = Sse2.ConvertToVector128Int32WithTruncation(lo);

        // Move the hi quotients into the upper 16 bits of each 32-bit lane
        Vector128 <int> hi_epi32_shift = Sse2.ShiftLeftLogical(hi_epi32, 16);

        // Blend the bits, and return
        if (Sse41.IsSupported)
        {
            return(Sse41.Blend(lo_epi32.AsInt16(), hi_epi32_shift.AsInt16(), 0xAA));
        }
        else
        {
            // Keep only the low 16 bits of every 32-bit lane. The mask has to be built per
            // 32-bit lane: broadcasting 0xFFFF per 16-bit lane sets every bit, which would let
            // the sign extension of a negative lo quotient overwrite the hi quotient.
            Vector128 <int> lo_epi32_mask = Sse2.And(lo_epi32, Vector128.Create(0x0000FFFF));
            return(Sse2.Or(hi_epi32_shift, lo_epi32_mask).AsInt16());
        }
    }
Exemplo n.º 27
0
        // Fills `iterations` with per-pixel iteration counts of the z = z*z + c recurrence,
        // processing two horizontally adjacent pixels per step in one 128-bit vector of doubles.
        //
        // Rows are visited from startScanline in steps of `increment`; both loops stop as soon
        // as `cancel` is observed to be true. Each stored count is reduced modulo maxIterations.
        public static unsafe void ComputeDouble(
            uint[,] iterations,
            int startScanline, int increment,
            double offsetX, double offsetY,
            double zoom,
            uint maxIterations,
            ref bool cancel)
        {
            const int stride = 2;

            int rowCount    = iterations.GetLength(0);
            int columnCount = iterations.GetLength(1);

            Vector128 <double> iterationCap = Vector128.Create((double)maxIterations);
            Vector128 <double> escapeLimit  = Vector128.Create(4.0);
            Vector128 <double> unit         = Vector128.Create(1.0);
            Vector128 <double> doubling     = Vector128.Create(2.0);

            // Scratch space used to read the two lane counters back as scalars.
            double *laneCounts = stackalloc double[stride];

            for (int row = startScanline; row < rowCount && !cancel; row += increment)
            {
                for (int col = 0; col < columnCount && !cancel; col += stride)
                {
                    var first  = Impl.GetPointCoordinate(col + 0, row, columnCount, rowCount, offsetX, offsetY, zoom);
                    var second = Impl.GetPointCoordinate(col + 1, row, columnCount, rowCount, offsetX, offsetY, zoom);

                    Vector128 <double> cReal = Vector128.Create(first.X, second.X);
                    Vector128 <double> cImag = Vector128.Create(first.Y, second.Y);
                    Vector128 <double> zReal = cReal;
                    Vector128 <double> zImag = cImag;
                    Vector128 <double> count = Vector128.Create(0.0);

                    while (true)
                    {
                        Vector128 <double> realSq = Sse2.Multiply(zReal, zReal);
                        Vector128 <double> imagSq = Sse2.Multiply(zImag, zImag);

                        // A lane stays active while |z|^2 <= 4 and its counter has not passed the cap.
                        Vector128 <double> withinRadius = Sse2.CompareLessThanOrEqual(Sse2.Add(realSq, imagSq), escapeLimit);
                        Vector128 <double> underCap     = Sse2.CompareLessThanOrEqual(count, iterationCap);
                        Vector128 <double> active       = Sse2.And(withinRadius, underCap);

                        if (Sse2.MoveMask(active) != 0)
                        {
                            // At least one lane is still iterating: advance z and bump the
                            // counter of the active lanes only (mask AND 1.0 is 1.0 or 0.0).
                            Vector128 <double> crossTerm = Sse2.Multiply(zReal, zImag);
                            zImag = Sse2.Add(Sse2.Multiply(doubling, crossTerm), cImag);
                            zReal = Sse2.Add(Sse2.Subtract(realSq, imagSq), cReal);
                            count = Sse2.Add(count, Sse2.And(unit, active));
                            continue;
                        }

                        // Both lanes are finished: write the counters back, skipping the
                        // second lane when it falls outside the row.
                        Sse2.Store(laneCounts, count);

                        for (int lane = 0; lane < stride; lane++)
                        {
                            if (col + lane < columnCount)
                            {
                                iterations[row, col + lane] = (uint)laneCounts[lane] % maxIterations;
                            }
                        }

                        break;
                    }
                }
            }
        }
Exemplo n.º 28
0
        // Lowers leafLevel by one and folds the children of every node on the new leaf level
        // into that node: each child's four sums are added to the parent's (the last one masked
        // to its low 29 bits), the child is zeroed, and its index is appended to the list at
        // pfree. The parent's eight child slots are cleared and the list is terminated with 0.
        unsafe private void pruneTree(OctreeNode *ptree, ushort *pfree)
        {
#if HWINTRINSICS
            var sumsMask = Vector128.Create(0xffffffffu, 0xffffffffu, 0xffffffffu, 0x1fffffffu);
            var vzero    = Vector128 <uint> .Zero;
#endif

            ushort *freeCursor  = pfree;
            uint    targetLevel = --leafLevel;

            for (nuint index = 8; index < maxHistogramSize; index++)
            {
                var  parent      = ptree + index;
                uint parentLevel = OctreeNode.GetLevel(parent);
                if (parentLevel != targetLevel)
                {
                    continue;
                }

                // The node is viewed as eight ushort child indices followed by four uint sums.
                ushort *slots      = (ushort *)parent;
                uint *  parentSums = (uint *)(slots + 8);

#if HWINTRINSICS
                if (Sse2.IsSupported)
                {
                    var total = Sse2.LoadVector128(parentSums);

                    for (nuint slot = 0; slot < 8; slot++)
                    {
                        nuint childIndex = slots[slot];
                        if (childIndex == 0)
                        {
                            continue;
                        }

                        var   childNode = ptree + childIndex;
                        uint *childSums = (uint *)((ushort *)childNode + 8);

                        total = Sse2.Add(total, Sse2.And(sumsMask, Sse2.LoadVector128(childSums)));

                        // Two 16-byte stores clear the child's slots and its sums.
                        Sse2.Store((uint *)childNode, vzero);
                        Sse2.Store(childSums, vzero);
                        *freeCursor++ = (ushort)childIndex;
                    }

                    Sse2.Store((uint *)slots, vzero);
                    Sse2.Store(parentSums, total);
                    continue;
                }
#endif

                for (nuint slot = 0; slot < 8; slot++)
                {
                    nuint childIndex = slots[slot];
                    if (childIndex == 0)
                    {
                        continue;
                    }

                    var   childNode = ptree + childIndex;
                    uint *childSums = (uint *)((ushort *)childNode + 8);

                    for (int k = 0; k < 3; k++)
                    {
                        parentSums[k] += childSums[k];
                    }
                    parentSums[3] += childSums[3] & 0x1fffffff;

                    Unsafe.InitBlockUnaligned(childNode, 0, (uint)Unsafe.SizeOf <OctreeNode>());
                    *freeCursor++ = (ushort)childIndex;
                }

                Unsafe.InitBlockUnaligned(slots, 0, sizeof(ushort) * 8);
            }

            *freeCursor = 0;
        }
Exemplo n.º 29
0
        // Writes `input` out as two planes: a luma plane built from each pixel's R component and
        // an interleaved chroma plane built from the G and B components of every second pixel on
        // every second row. Each plane is assembled in a buffer rented from rm.BufferPool, handed
        // to WriteBuffer, and then returned to the pool.
        //
        // NOTE(review): the vector paths assume Pixel is four 16-bit components in R, G, B, A
        // order (8 bytes) and that Downsample is a right shift by 2 - both are inferred from the
        // load offsets and shifts used here; confirm against the Pixel/Downsample definitions.
        private unsafe static void WriteNv12(ResourceManager rm, Surface input, ref OutputSurfaceConfig config, ref PlaneOffsets offsets)
        {
            int gobBlocksInY = 1 << config.OutBlkHeight;

            bool outLinear = config.OutBlkKind == 0;

            int width   = Math.Min(config.OutLumaWidth + 1, input.Width);
            int height  = Math.Min(config.OutLumaHeight + 1, input.Height);
            int yStride = GetPitch(config.OutLumaWidth + 1, 1);

            int dstYIndex = rm.BufferPool.Rent((config.OutLumaHeight + 1) * yStride, out Span <byte> dstY);

            if (Sse41.IsSupported)
            {
                // Keeps only the lowest 16-bit component of every 8 bytes.
                Vector128 <ushort> mask = Vector128.Create(0xffffUL).AsUInt16();

                int widthTrunc = width & ~0xf;
                int strideGap  = yStride - width;

                fixed(Pixel *srcPtr = input.Data)
                {
                    Pixel *ip = srcPtr;

                    fixed(byte *dstPtr = dstY)
                    {
                        byte *op = dstPtr;

                        for (int y = 0; y < height; y++, ip += input.Width)
                        {
                            int x = 0;

                            // 16 pixels (eight 16-byte loads) per iteration.
                            for (; x < widthTrunc; x += 16)
                            {
                                byte *baseOffset = (byte *)(ip + (ulong)(uint)x);

                                Vector128 <ushort> pixelp1 = Sse2.LoadVector128((ushort *)baseOffset);
                                Vector128 <ushort> pixelp2 = Sse2.LoadVector128((ushort *)(baseOffset + 0x10));
                                Vector128 <ushort> pixelp3 = Sse2.LoadVector128((ushort *)(baseOffset + 0x20));
                                Vector128 <ushort> pixelp4 = Sse2.LoadVector128((ushort *)(baseOffset + 0x30));
                                Vector128 <ushort> pixelp5 = Sse2.LoadVector128((ushort *)(baseOffset + 0x40));
                                Vector128 <ushort> pixelp6 = Sse2.LoadVector128((ushort *)(baseOffset + 0x50));
                                Vector128 <ushort> pixelp7 = Sse2.LoadVector128((ushort *)(baseOffset + 0x60));
                                Vector128 <ushort> pixelp8 = Sse2.LoadVector128((ushort *)(baseOffset + 0x70));

                                pixelp1 = Sse2.And(pixelp1, mask);
                                pixelp2 = Sse2.And(pixelp2, mask);
                                pixelp3 = Sse2.And(pixelp3, mask);
                                pixelp4 = Sse2.And(pixelp4, mask);
                                pixelp5 = Sse2.And(pixelp5, mask);
                                pixelp6 = Sse2.And(pixelp6, mask);
                                pixelp7 = Sse2.And(pixelp7, mask);
                                pixelp8 = Sse2.And(pixelp8, mask);

                                // Two rounds of 32 -> 16 bit packing squeeze out the zeroed
                                // components, leaving 8 consecutive 16-bit values per vector.
                                Vector128 <ushort> pixelq1 = Sse41.PackUnsignedSaturate(pixelp1.AsInt32(), pixelp2.AsInt32());
                                Vector128 <ushort> pixelq2 = Sse41.PackUnsignedSaturate(pixelp3.AsInt32(), pixelp4.AsInt32());
                                Vector128 <ushort> pixelq3 = Sse41.PackUnsignedSaturate(pixelp5.AsInt32(), pixelp6.AsInt32());
                                Vector128 <ushort> pixelq4 = Sse41.PackUnsignedSaturate(pixelp7.AsInt32(), pixelp8.AsInt32());

                                pixelq1 = Sse41.PackUnsignedSaturate(pixelq1.AsInt32(), pixelq2.AsInt32());
                                pixelq2 = Sse41.PackUnsignedSaturate(pixelq3.AsInt32(), pixelq4.AsInt32());

                                pixelq1 = Sse2.ShiftRightLogical(pixelq1, 2);
                                pixelq2 = Sse2.ShiftRightLogical(pixelq2, 2);

                                Vector128 <byte> pixel = Sse2.PackUnsignedSaturate(pixelq1.AsInt16(), pixelq2.AsInt16());

                                Sse2.Store(op, pixel);

                                op += 0x10;
                            }

                            // Scalar tail for the last (width % 16) pixels of the row.
                            for (; x < width; x++)
                            {
                                Pixel *px = ip + (uint)x;

                                *op++ = Downsample(px->R);
                            }

                            op += strideGap;
                        }
                    }
                }
            }
            else
            {
                for (int y = 0; y < height; y++)
                {
                    for (int x = 0; x < width; x++)
                    {
                        dstY[y * yStride + x] = Downsample(input.GetR(x, y));
                    }
                }
            }

            WriteBuffer(
                rm,
                dstY,
                offsets.LumaOffset,
                outLinear,
                config.OutLumaWidth + 1,
                config.OutLumaHeight + 1,
                1,
                gobBlocksInY);

            rm.BufferPool.Return(dstYIndex);

            int uvWidth  = Math.Min(config.OutChromaWidth + 1, (width + 1) >> 1);
            int uvHeight = Math.Min(config.OutChromaHeight + 1, (height + 1) >> 1);
            int uvStride = GetPitch(config.OutChromaWidth + 1, 2);

            int dstUvIndex = rm.BufferPool.Rent((config.OutChromaHeight + 1) * uvStride, out Span <byte> dstUv);

            if (Sse2.IsSupported)
            {
                int widthTrunc = uvWidth & ~7;
                int strideGap  = uvStride - uvWidth * 2;

                fixed(Pixel *srcPtr = input.Data)
                {
                    Pixel *ip = srcPtr;

                    fixed(byte *dstPtr = dstUv)
                    {
                        byte *op = dstPtr;

                        // Every second source row contributes one chroma row.
                        for (int y = 0; y < uvHeight; y++, ip += input.Width * 2)
                        {
                            int x = 0;

                            // 8 chroma samples per iteration, taken from every second pixel
                            // (16 bytes apart); each 4-byte load at +2 picks up one G/B pair.
                            for (; x < widthTrunc; x += 8)
                            {
                                byte *baseOffset = (byte *)ip + (ulong)(uint)x * 16;

                                Vector128 <uint> pixel1 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x02));
                                Vector128 <uint> pixel2 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x12));
                                Vector128 <uint> pixel3 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x22));
                                Vector128 <uint> pixel4 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x32));
                                Vector128 <uint> pixel5 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x42));
                                Vector128 <uint> pixel6 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x52));
                                Vector128 <uint> pixel7 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x62));
                                Vector128 <uint> pixel8 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x72));

                                Vector128 <uint> pixel12 = Sse2.UnpackLow(pixel1, pixel2);
                                Vector128 <uint> pixel34 = Sse2.UnpackLow(pixel3, pixel4);
                                Vector128 <uint> pixel56 = Sse2.UnpackLow(pixel5, pixel6);
                                Vector128 <uint> pixel78 = Sse2.UnpackLow(pixel7, pixel8);

                                Vector128 <ulong> pixel1234 = Sse2.UnpackLow(pixel12.AsUInt64(), pixel34.AsUInt64());
                                Vector128 <ulong> pixel5678 = Sse2.UnpackLow(pixel56.AsUInt64(), pixel78.AsUInt64());

                                // Shift per 16-bit component. Shifting the 64-bit lanes instead
                                // would move the low two bits of each component into the top of
                                // its neighbour and corrupt the saturating pack below.
                                Vector128 <ushort> chroma1234 = Sse2.ShiftRightLogical(pixel1234.AsUInt16(), 2);
                                Vector128 <ushort> chroma5678 = Sse2.ShiftRightLogical(pixel5678.AsUInt16(), 2);

                                Vector128 <byte> pixel = Sse2.PackUnsignedSaturate(chroma1234.AsInt16(), chroma5678.AsInt16());

                                Sse2.Store(op, pixel);

                                op += 0x10;
                            }

                            // Scalar tail for the last (uvWidth % 8) chroma samples of the row.
                            for (; x < uvWidth; x++)
                            {
                                Pixel *px = ip + (uint)(x << 1);

                                *op++ = Downsample(px->G);
                                *op++ = Downsample(px->B);
                            }

                            op += strideGap;
                        }
                    }
                }
            }
            else
            {
                for (int y = 0; y < uvHeight; y++)
                {
                    for (int x = 0; x < uvWidth; x++)
                    {
                        int xx = x << 1;
                        int yy = y << 1;

                        int uvOffs = y * uvStride + xx;

                        dstUv[uvOffs + 0] = Downsample(input.GetG(xx, yy));
                        dstUv[uvOffs + 1] = Downsample(input.GetB(xx, yy));
                    }
                }
            }

            WriteBuffer(
                rm,
                dstUv,
                offsets.ChromaUOffset,
                outLinear,
                config.OutChromaWidth + 1,
                config.OutChromaHeight + 1, 2,
                gobBlocksInY);

            rm.BufferPool.Return(dstUvIndex);
        }
Exemplo n.º 30
0
        private static unsafe uint CalculateSse(uint crc, ReadOnlySpan <byte> buffer)
        {
            int chunksize = buffer.Length & ~ChunksizeMask;
            int length    = chunksize;

            fixed(byte *bufferPtr = buffer)
            fixed(ulong *k05PolyPtr = K05Poly)
            {
                byte *srcPtr = bufferPtr;

                // There's at least one block of 64.
                Vector128 <ulong> x1 = Sse2.LoadVector128((ulong *)(srcPtr + 0x00));
                Vector128 <ulong> x2 = Sse2.LoadVector128((ulong *)(srcPtr + 0x10));
                Vector128 <ulong> x3 = Sse2.LoadVector128((ulong *)(srcPtr + 0x20));
                Vector128 <ulong> x4 = Sse2.LoadVector128((ulong *)(srcPtr + 0x30));
                Vector128 <ulong> x5;

                x1 = Sse2.Xor(x1, Sse2.ConvertScalarToVector128UInt32(crc).AsUInt64());

                // k1, k2
                Vector128 <ulong> x0 = Sse2.LoadVector128(k05PolyPtr + 0x0);

                srcPtr += 64;
                length -= 64;

                // Parallel fold blocks of 64, if any.
                while (length >= 64)
                {
                    x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
                    Vector128 <ulong> x6 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00);
                    Vector128 <ulong> x7 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x00);
                    Vector128 <ulong> x8 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x00);

                    x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
                    x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x11);
                    x3 = Pclmulqdq.CarrylessMultiply(x3, x0, 0x11);
                    x4 = Pclmulqdq.CarrylessMultiply(x4, x0, 0x11);

                    Vector128 <ulong> y5 = Sse2.LoadVector128((ulong *)(srcPtr + 0x00));
                    Vector128 <ulong> y6 = Sse2.LoadVector128((ulong *)(srcPtr + 0x10));
                    Vector128 <ulong> y7 = Sse2.LoadVector128((ulong *)(srcPtr + 0x20));
                    Vector128 <ulong> y8 = Sse2.LoadVector128((ulong *)(srcPtr + 0x30));

                    x1 = Sse2.Xor(x1, x5);
                    x2 = Sse2.Xor(x2, x6);
                    x3 = Sse2.Xor(x3, x7);
                    x4 = Sse2.Xor(x4, x8);

                    x1 = Sse2.Xor(x1, y5);
                    x2 = Sse2.Xor(x2, y6);
                    x3 = Sse2.Xor(x3, y7);
                    x4 = Sse2.Xor(x4, y8);

                    srcPtr += 64;
                    length -= 64;
                }

                // Fold into 128-bits.
                // k3, k4
                x0 = Sse2.LoadVector128(k05PolyPtr + 0x2);

                x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
                x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
                x1 = Sse2.Xor(x1, x2);
                x1 = Sse2.Xor(x1, x5);

                x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
                x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
                x1 = Sse2.Xor(x1, x3);
                x1 = Sse2.Xor(x1, x5);

                x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
                x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
                x1 = Sse2.Xor(x1, x4);
                x1 = Sse2.Xor(x1, x5);

                // Single fold blocks of 16, if any.
                while (length >= 16)
                {
                    x2 = Sse2.LoadVector128((ulong *)srcPtr);

                    x5 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
                    x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x11);
                    x1 = Sse2.Xor(x1, x2);
                    x1 = Sse2.Xor(x1, x5);

                    srcPtr += 16;
                    length -= 16;
                }

                // Fold 128-bits to 64-bits.
                x2 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x10);
                x3 = Vector128.Create(~0, 0, ~0, 0).AsUInt64(); // _mm_setr_epi32 on x86
                x1 = Sse2.ShiftRightLogical128BitLane(x1, 8);
                x1 = Sse2.Xor(x1, x2);

                // k5, k0
                x0 = Sse2.LoadScalarVector128(k05PolyPtr + 0x4);

                x2 = Sse2.ShiftRightLogical128BitLane(x1, 4);
                x1 = Sse2.And(x1, x3);
                x1 = Pclmulqdq.CarrylessMultiply(x1, x0, 0x00);
                x1 = Sse2.Xor(x1, x2);

                // Barrett reduce to 32-bits.
                // polynomial
                x0 = Sse2.LoadVector128(k05PolyPtr + 0x6);

                x2 = Sse2.And(x1, x3);
                x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x10);
                x2 = Sse2.And(x2, x3);
                x2 = Pclmulqdq.CarrylessMultiply(x2, x0, 0x00);
                x1 = Sse2.Xor(x1, x2);

                crc = (uint)Sse41.Extract(x1.AsInt32(), 1);
                return(buffer.Length - chunksize == 0 ? crc : CalculateScalar(crc, buffer[chunksize..]));