예제 #1
0
파일: And.Byte.cs 프로젝트: jbohua/coreclr
        /// <summary>
        /// Basic scenario using explicit loads: both operands are read with
        /// <c>Avx.LoadVector256</c> from the data table's input pointers (reinterpreted as
        /// <c>Byte*</c>) and passed straight into <c>Avx2.And</c> without intermediate locals.
        /// </summary>
        public void RunBasicScenario_Load()
        {
            var result = Avx2.And(
                Avx.LoadVector256((Byte *)(_dataTable.inArray1Ptr)),
                Avx.LoadVector256((Byte *)(_dataTable.inArray2Ptr))
                );

            // Write the 256-bit result through the raw output pointer, then hand the same
            // input/output pointers to ValidateResult.
            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
        }
예제 #2
0
        /// <summary>
        /// Scenario where the operands of <c>Avx2.And</c> are the <c>_fld1</c>/<c>_fld2</c>
        /// fields of a local obtained from <c>TestStruct.Create()</c>.
        /// </summary>
        public void RunStructLclFldScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario));

            var test   = TestStruct.Create();
            var result = Avx2.And(test._fld1, test._fld2);

            // Store the result in the shared output buffer and validate it against the
            // same two fields that were used as operands.
            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(test._fld1, test._fld2, _dataTable.outArrayPtr);
        }
예제 #3
0
파일: And.Byte.cs 프로젝트: jbohua/coreclr
        /// <summary>
        /// Basic scenario using <c>Unsafe.Read</c>: both operands are read as
        /// <c>Vector256&lt;Byte&gt;</c> directly from the data table's input pointers and
        /// passed straight into <c>Avx2.And</c> without intermediate locals.
        /// </summary>
        public void RunBasicScenario_UnsafeRead()
        {
            var result = Avx2.And(
                Unsafe.Read <Vector256 <Byte> >(_dataTable.inArray1Ptr),
                Unsafe.Read <Vector256 <Byte> >(_dataTable.inArray2Ptr)
                );

            // Write the result through the raw output pointer, then hand the same
            // input/output pointers to ValidateResult.
            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
        }
예제 #4
0
        /// <summary>
        /// Scenario where the operands of <c>Avx2.And</c> are the <c>_fld1</c>/<c>_fld2</c>
        /// fields of a freshly constructed <c>SimpleBinaryOpTest__AndInt64</c> held in a local.
        /// </summary>
        public void RunClassLclFldScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario));

            var test   = new SimpleBinaryOpTest__AndInt64();
            var result = Avx2.And(test._fld1, test._fld2);

            // The result goes to this instance's output buffer (not the new instance's),
            // and is validated against the new instance's fields.
            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(test._fld1, test._fld2, _dataTable.outArrayPtr);
        }
예제 #5
0
        /// <summary>
        /// Local-variable scenario using aligned loads: each operand is first loaded into a
        /// local with <c>Avx.LoadAlignedVector256</c> and the locals are then passed to
        /// <c>Avx2.And</c>.
        /// </summary>
        public void RunLclVarScenario_LoadAligned()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_LoadAligned));

            // NOTE(review): the aligned load presumably requires the input pointers to be
            // suitably aligned; confirm against how _dataTable allocates its buffers.
            var left   = Avx.LoadAlignedVector256((UInt32 *)(_dataTable.inArray1Ptr));
            var right  = Avx.LoadAlignedVector256((UInt32 *)(_dataTable.inArray2Ptr));
            var result = Avx2.And(left, right);

            // Validate against the loaded locals rather than the raw pointers.
            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(left, right, _dataTable.outArrayPtr);
        }
예제 #6
0
        /// <summary>
        /// Local-variable scenario using <c>Unsafe.Read</c>: each operand is first read as a
        /// <c>Vector256&lt;Byte&gt;</c> into a local, and the locals are then passed to
        /// <c>Avx2.And</c>.
        /// </summary>
        public void RunLclVarScenario_UnsafeRead()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

            var op1    = Unsafe.Read <Vector256 <Byte> >(_dataTable.inArray1Ptr);
            var op2    = Unsafe.Read <Vector256 <Byte> >(_dataTable.inArray2Ptr);
            var result = Avx2.And(op1, op2);

            // Validate against the locals that were actually used as operands.
            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(op1, op2, _dataTable.outArrayPtr);
        }
예제 #7
0
        /// <summary>
        /// Local-variable scenario using <c>Unsafe.Read</c>: each operand is first read as a
        /// <c>Vector256&lt;UInt32&gt;</c> into a local, and the locals are then passed to
        /// <c>Avx2.And</c>.
        /// </summary>
        public void RunLclVarScenario_UnsafeRead()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

            var left   = Unsafe.Read <Vector256 <UInt32> >(_dataTable.inArray1Ptr);
            var right  = Unsafe.Read <Vector256 <UInt32> >(_dataTable.inArray2Ptr);
            var result = Avx2.And(left, right);

            // Validate against the locals that were actually used as operands.
            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(left, right, _dataTable.outArrayPtr);
        }
예제 #8
0
        /// <summary>
        /// Local-variable scenario using explicit loads: each operand is first loaded into a
        /// local with <c>Avx.LoadVector256</c> (input pointers reinterpreted as <c>Int32*</c>),
        /// and the locals are then passed to <c>Avx2.And</c>.
        /// </summary>
        public void RunLclVarScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_Load));

            var op1    = Avx.LoadVector256((Int32 *)(_dataTable.inArray1Ptr));
            var op2    = Avx.LoadVector256((Int32 *)(_dataTable.inArray2Ptr));
            var result = Avx2.And(op1, op2);

            // Validate against the locals that were actually used as operands.
            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(op1, op2, _dataTable.outArrayPtr);
        }
예제 #9
0
        /// <summary>
        /// Basic scenario using explicit loads: both operands are read with
        /// <c>Avx.LoadVector256</c> from the data table's input pointers (reinterpreted as
        /// <c>UInt32*</c>) and passed straight into <c>Avx2.And</c> without intermediate locals.
        /// </summary>
        public void RunBasicScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_Load));

            var result = Avx2.And(
                Avx.LoadVector256((UInt32 *)(_dataTable.inArray1Ptr)),
                Avx.LoadVector256((UInt32 *)(_dataTable.inArray2Ptr))
                );

            // Write the result through the raw output pointer, then hand the same
            // input/output pointers to ValidateResult.
            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
        }
예제 #10
0
        /// <summary>
        /// Basic scenario using <c>Unsafe.Read</c>: both operands are read as
        /// <c>Vector256&lt;UInt16&gt;</c> directly from the data table's input pointers and
        /// passed straight into <c>Avx2.And</c> without intermediate locals.
        /// </summary>
        public void RunBasicScenario_UnsafeRead()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_UnsafeRead));

            var result = Avx2.And(
                Unsafe.Read <Vector256 <UInt16> >(_dataTable.inArray1Ptr),
                Unsafe.Read <Vector256 <UInt16> >(_dataTable.inArray2Ptr)
                );

            // Write the result through the raw output pointer, then hand the same
            // input/output pointers to ValidateResult.
            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
        }
예제 #11
0
        /// <summary>
        /// Scenario where the operands of <c>Avx2.And</c> are the <c>_clsVar1</c> and
        /// <c>_clsVar2</c> members, passed directly without intermediate locals.
        /// </summary>
        public void RunClsVarScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario));

            var result = Avx2.And(
                _clsVar1,
                _clsVar2
                );

            // Store the result in the output buffer and validate it against the same members.
            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
        }
예제 #12
0
            /// <summary>
            /// Decodes 32 encoded bytes at <paramref name="source"/> into 24 bytes at
            /// <paramref name="dest"/>. Each input byte carries a 6-bit value offset by 32; every
            /// group of four such values is packed into three output bytes, first value in the
            /// most significant bits.
            /// </summary>
            /// <param name="source">Pointer to at least 32 readable bytes.</param>
            /// <param name="dest">
            /// Pointer to at least 28 writable bytes: two 16-byte stores are issued at offsets 0
            /// and 12, so the four bytes following the 24 decoded bytes are overwritten with zero.
            /// </param>
            public static unsafe void Decode32Bytes(byte *source, byte *dest)
            {
                // Adding 0xE0 to every byte wraps around, which is the same as subtracting 32.
                Vector256<byte> bias    = Vector256.Create((sbyte)-32).AsByte();
                Vector256<uint> sextets = Avx2.Add(Avx.LoadVector256(source), bias).AsUInt32();

                // Within each 32-bit lane, isolate the low six bits of every byte and move them
                // to their position inside a 24-bit group:
                //   byte 0 -> bits 18..23, byte 1 -> bits 12..17,
                //   byte 2 -> bits  6..11, byte 3 -> bits  0..5.
                Vector256<uint> first  = Avx2.ShiftLeftLogical(Avx2.And(sextets, Vector256.Create((uint)0x0000_003f)), 18);
                Vector256<uint> second = Avx2.ShiftLeftLogical(Avx2.And(sextets, Vector256.Create((uint)0x0000_3f00)), 4);
                Vector256<uint> third  = Avx2.ShiftRightLogical(Avx2.And(sextets, Vector256.Create((uint)0x003f_0000)), 10);
                Vector256<uint> fourth = Avx2.ShiftRightLogical(Avx2.And(sextets, Vector256.Create((uint)0x3f00_0000)), 24);

                // Every lane now holds 00000000 aaaaaabb bbbbcccc ccdddddd (most significant
                // byte first), i.e. the three decoded bytes in reversed memory order plus one
                // zero byte.
                Vector256<uint> packed = Avx2.Or(Avx2.Or(first, second), Avx2.Or(third, fourth));

                // Per 128-bit lane: reverse the three payload bytes of each 32-bit group and
                // compact the four groups into the first 12 bytes; 0x80 selectors produce zero.
                // Only the low four bits of a selector are used, so 0x12 behaves like 0x02.
                var compaction = Vector256.Create(
                    0x02, 0x01, 0x00, 0x06, 0x05, 0x04, 0x0a, 0x09, 0x08, 0x0e, 0x0d, 0x0c,
                    0x80, 0x80, 0x80, 0x80,
                    0x12, 0x11, 0x10, 0x16, 0x15, 0x14, 0x1a, 0x19, 0x18, 0x1e, 0x1d, 0x1c,
                    0x80, 0x80, 0x80, 0x80);

                Vector256<byte> decoded = Avx2.Shuffle(packed.AsByte(), compaction);

                // The second store overlaps the four zero bytes written by the first one.
                Sse2.Store(dest, decoded.GetLower());
                Sse2.Store(dest + 12, decoded.GetUpper());
            }
예제 #13
0
        /// <summary>
        /// Scenario where the operands are loaded with <c>Avx.LoadVector256</c> from the
        /// addresses of the <c>_fld1</c>/<c>_fld2</c> fields of a local obtained from
        /// <c>TestStruct.Create()</c>.
        /// </summary>
        public void RunStructLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario_Load));

            var test   = TestStruct.Create();
            // The field addresses are taken directly with '&' (no fixed statement) and
            // reinterpreted as UInt16* for the load.
            var result = Avx2.And(
                Avx.LoadVector256((UInt16 *)(&test._fld1)),
                Avx.LoadVector256((UInt16 *)(&test._fld2))
                );

            // Store the result in the output buffer and validate it against the same fields.
            Unsafe.Write(_dataTable.outArrayPtr, result);
            ValidateResult(test._fld1, test._fld2, _dataTable.outArrayPtr);
        }
예제 #14
0
        /// <summary>
        /// Multiplies two vectors of 32 bytes element-wise, keeping the low 8 bits of each
        /// product (wrap-around multiplication).
        /// </summary>
        /// <param name="left">Left operand.</param>
        /// <param name="right">Right operand.</param>
        /// <returns>A vector whose byte i is <c>(byte)(left[i] * right[i])</c>.</returns>
        public static Vector256<byte> op_Multiply(Vector256<byte> left, Vector256<byte> right)
        {
            // There is no 8-bit multiply, so the work is done on 16-bit lanes, each of which
            // contains one even-indexed byte (low half) and one odd-indexed byte (high half).
            // AsUInt16() is the shipped reinterpretation API; a single-type-argument As<T>()
            // is not available on Vector256<T>.
            Vector256<ushort> left16      = left.AsUInt16();
            Vector256<ushort> right16     = right.AsUInt16();
            Vector256<ushort> lowByteMask = Vector256.Create((ushort)0x00FF);

            // The low 8 bits of a 16-bit product depend only on the low 8 bits of the factors,
            // so masking the full-lane product yields the even-indexed byte products.
            Vector256<byte> evenProducts =
                Avx2.And(lowByteMask, Avx2.MultiplyLow(left16, right16)).AsByte();

            // Bring the odd-indexed bytes down, multiply them, and shift the low 8 bits of the
            // product back up into the high half of each lane.
            Vector256<byte> oddProducts =
                Avx2.ShiftLeftLogical(
                    Avx2.MultiplyLow(
                        Avx2.ShiftRightLogical(left16, 8),
                        Avx2.ShiftRightLogical(right16, 8)),
                    8).AsByte();

            return Avx2.Or(evenProducts, oddProducts);
        }
예제 #15
0
        /// <summary>
        /// Spreads the bits of selected input bytes so that every source bit is followed by a
        /// zero bit: each selected byte becomes a 16-bit value whose even bit positions hold
        /// the original bits.
        /// </summary>
        /// <param name="x">
        /// Source bytes. Within each 128-bit lane only bytes 0..3 and 8..11 are consumed.
        /// </param>
        /// <returns>The spread values, reinterpreted as four 64-bit lanes.</returns>
        private static Vector256<ulong> SpreadSIMD(Vector256<byte> x)
        {
            // x  = _mm256_shuffle_epi8(x, _mm256_set_epi8(
            // -1, 11, -1, 10, -1, 9, -1, 8,
            // -1, 3, -1, 2, -1, 1, -1, 0,
            // -1, 11, -1, 10, -1, 9, -1, 8,
            // -1, 3, -1, 2, -1, 1, -1, 0));
            // Note: Vector256.Create takes its arguments in the reverse order of _mm256_set_epi8.
            // After this shuffle every even byte holds a source byte and every odd byte is zero.
            x = Avx2.Shuffle(x.AsSByte(), Vector256.Create(
                                 0, -1, 1, -1, 2, -1, 3, -1,
                                 8, -1, 9, -1, 10, -1, 11, -1,
                                 0, -1, 1, -1, 2, -1, 3, -1,
                                 8, -1, 9, -1, 10, -1, 11, -1)).AsByte();

            // Lookup table (repeated per 128-bit lane): entry n is the 4-bit value n with a
            // zero bit inserted after every bit.
            Vector256<byte> lut = Vector256.Create(
                (byte)0b00000000, 0b00000001, 0b00000100, 0b00000101, 0b00010000, 0b00010001, 0b00010100, 0b00010101,
                0b01000000, 0b01000001, 0b01000100, 0b01000101, 0b01010000, 0b01010001, 0b01010100, 0b01010101,
                0b00000000, 0b00000001, 0b00000100, 0b00000101, 0b00010000, 0b00010001, 0b00010100, 0b00010101,
                0b01000000, 0b01000001, 0b01000100, 0b01000101, 0b01010000, 0b01010001, 0b01010100, 0b01010101
                );

            // __m256i lo = _mm256_and_si256(x, _mm256_set1_epi8(0xf));
            // lo = _mm256_shuffle_epi8(lut, lo);
            Vector256<byte> lo = Avx2.Shuffle(lut, Avx2.And(x, Vector256.Create((byte)0x0f)));

            // __m256i hi = _mm256_and_si256(x, _mm256_set1_epi8(0xf0));
            // hi = _mm256_shuffle_epi8(lut, _mm256_srli_epi64(hi, 4));
            Vector256<byte> hi = Avx2.And(x, Vector256.Create((byte)0xf0));
            hi = Avx2.Shuffle(lut, Avx2.ShiftRightLogical(hi.AsUInt64(), 4).AsByte());

            // x = _mm256_or_si256(lo, _mm256_slli_epi64(hi, 8));
            // The spread high nibble belongs in the (so far empty) odd byte ABOVE its low
            // nibble, so this must be a LEFT shift by one byte. A right shift would push it
            // into the previous pair and drop the contribution of the first byte entirely.
            x = Avx2.Or(lo, Avx2.ShiftLeftLogical(hi.AsUInt64(), 8).AsByte());

            return x.AsUInt64();
        }
예제 #16
0
            /// <summary>
            /// Scenario where the operands are loaded with <c>Avx.LoadVector256</c> from this
            /// instance's own <c>_fld1</c>/<c>_fld2</c> fields, pinned with <c>fixed</c>.
            /// </summary>
            /// <param name="testClass">
            /// Test object that supplies the output buffer (<c>_dataTable.outArrayPtr</c>) and
            /// the <c>ValidateResult</c> method.
            /// </param>
            public void RunStructFldScenario_Load(SimpleBinaryOpTest__AndUInt16 testClass)
            {
                fixed(Vector256 <UInt16> *pFld1 = &_fld1)
                fixed(Vector256 <UInt16> *pFld2 = &_fld2)
                {
                    var result = Avx2.And(
                        Avx.LoadVector256((UInt16 *)(pFld1)),
                        Avx.LoadVector256((UInt16 *)(pFld2))
                        );

                    // The result is written to, and validated by, the supplied test object.
                    Unsafe.Write(testClass._dataTable.outArrayPtr, result);
                    testClass.ValidateResult(_fld1, _fld2, testClass._dataTable.outArrayPtr);
                }
            }
예제 #17
0
        /// <summary>
        /// Approximates the natural logarithm of eight floats at once by splitting each value
        /// into exponent and mantissa and evaluating a degree-5 polynomial on the mantissa.
        /// The original author states an absolute error bounded by 1e-4.
        /// </summary>
        /// <param name="x">Input values.</param>
        /// <returns>
        /// The approximated logarithm per lane; lanes where <paramref name="x"/> compares less
        /// than zero yield NaN. Zero is not special-cased.
        /// </returns>
        public static Vector256<float> Log(Vector256<float> x)
        {
            // Constants are declared up-front; the original author measured roughly a 10%
            // speed-up from doing so (BenchmarkDotNet).
            const float offset       = -89.970756366f;
            const float negativeCase = float.NaN; // behavior of MathF.Log() on negative numbers
            const float c1           = 3.529304993f;
            const float c2           = -2.461222105f;
            const float c3           = 1.130626167f;
            const float c4           = -0.288739945f;
            const float c5           = 3.110401639e-2f;
            const float ln2          = 0.6931471805f;

            const int mantissaMask = 0x7FFFFF;
            const int oneBits      = 0x3F800000; // bit pattern of 1.0f

            Vector256<int> bits = x.As<float, int>();

            // Raw (still biased) exponent field, converted to float.
            Vector256<float> exponent =
                Avx2.ConvertToVector256Single(Avx2.ShiftRightArithmetic(bits, 23));

            // Additive constant: the offset for lanes that are not below zero, NaN otherwise.
            Vector256<float> isNegative =
                Avx.Compare(x, Vector256<float>.Zero, FloatComparisonMode.OrderedLessThanNonSignaling);
            Vector256<float> addend =
                Avx.BlendVariable(Vector256.Create(offset), Vector256.Create(negativeCase), isNegative);

            // Keep the mantissa bits and force the exponent of 1.0f, giving a value in [1, 2).
            Vector256<float> m =
                Avx2.Or(Avx2.And(bits, Vector256.Create(mantissaMask)), Vector256.Create(oneBits))
                    .As<int, float>();

            // Horner evaluation of
            //   m * (c1 + m * (c2 + m * (c3 + m * (c4 + m * c5))))
            // from the innermost term outwards.
            Vector256<float> poly = Avx2.Multiply(m, Vector256.Create(c5));
            poly = Avx2.Multiply(m, Avx2.Add(Vector256.Create(c4), poly));
            poly = Avx2.Multiply(m, Avx2.Add(Vector256.Create(c3), poly));
            poly = Avx2.Multiply(m, Avx2.Add(Vector256.Create(c2), poly));
            poly = Avx2.Multiply(m, Avx2.Add(Vector256.Create(c1), poly));

            // result = poly + (addend + ln2 * exponent)
            Vector256<float> exponentTerm = Avx.Add(addend, Avx2.Multiply(Vector256.Create(ln2), exponent));
            return Avx2.Add(poly, exponentTerm);
        }
예제 #18
0
        // Filters one line of bytes in place, 32 bytes at a time, using the corresponding lines
        // pprev and pnext. A byte is replaced only when it is strictly greater than, or strictly
        // less than, both of its counterparts in pprev/pnext AND it is within denoiseThreshold
        // of at least one of them; all other bytes are written back unchanged.
        //   pcurr - line that is read and overwritten
        //   pprev - first reference line (read only)
        //   pnext - second reference line (read only)
        //   cb    - number of bytes in each line
        // NOTE(review): 'end' is computed as cb - 32 in unsigned arithmetic, so this presumably
        // relies on callers passing cb >= Vector256<byte>.Count -- confirm.
        unsafe private static void denoiseLineAvx2(byte *pcurr, byte *pprev, byte *pnext, int cb)
        {
            byte *ip = pcurr, pp = pprev, np = pnext;
            // 'end' is the offset of the last full 32-byte vector that fits in the line.
            nuint cnt = 0, end = (nuint)cb - (nuint)Vector256 <byte> .Count;

            // 0x80 bias used below to turn unsigned byte comparisons into signed ones.
            var voffset = Vector256.Create((byte)0x80);
            var vthresh = Vector256.Create(denoiseThreshold);

LoopTop:
            do
            {
                var vcurr = Avx.LoadVector256(ip + cnt);
                var vprev = Avx.LoadVector256(pp + cnt);
                var vnext = Avx.LoadVector256(np + cnt);

                // |curr - prev| via two saturating subtractions (one of them is always zero);
                // max(diff, thresh) == thresh means diff <= thresh.
                var vdiffp = Avx2.Or(Avx2.SubtractSaturate(vcurr, vprev), Avx2.SubtractSaturate(vprev, vcurr));
                var vmaskp = Avx2.CompareEqual(Avx2.Max(vdiffp, vthresh), vthresh);

                // Same test against the next line.
                var vdiffn = Avx2.Or(Avx2.SubtractSaturate(vcurr, vnext), Avx2.SubtractSaturate(vnext, vcurr));
                var vmaskn = Avx2.CompareEqual(Avx2.Max(vdiffn, vthresh), vthresh);

                var vavgp = Avx2.Average(vcurr, vprev);
                var vavgn = Avx2.Average(vcurr, vnext);

                // Each blend prefers the average with the line that is within the threshold and
                // falls back to the other one; the two picks are averaged. Where both lines are
                // within the threshold, the value is averaged once more with avg(prev, next).
                var voutval = Avx2.Average(Avx2.BlendVariable(vavgn, vavgp, vmaskp), Avx2.BlendVariable(vavgp, vavgn, vmaskn));
                var voutmsk = Avx2.Or(vmaskp, vmaskn);
                voutval = Avx2.Average(voutval, Avx2.BlendVariable(voutval, Avx2.Average(vprev, vnext), Avx2.And(vmaskp, vmaskn)));

                // CompareGreaterThan is used on signed bytes; flipping the top bit maps the
                // unsigned order onto the signed order.
                var vcurrs = Avx2.Xor(vcurr, voffset).AsSByte();
                var vprevs = Avx2.Xor(vprev, voffset).AsSByte();
                var vnexts = Avx2.Xor(vnext, voffset).AsSByte();

                // vsurlt: both surrounding values are less than curr; vsurgt: both are greater.
                var vsurlt = Avx2.And(Avx2.CompareGreaterThan(vcurrs, vprevs), Avx2.CompareGreaterThan(vcurrs, vnexts));
                var vsurgt = Avx2.And(Avx2.CompareGreaterThan(vprevs, vcurrs), Avx2.CompareGreaterThan(vnexts, vcurrs));

                // Only replace bytes that are such an extreme and within threshold of a neighbor.
                voutmsk = Avx2.And(voutmsk, Avx2.Or(vsurlt, vsurgt).AsByte());
                voutval = Avx2.BlendVariable(vcurr, voutval, voutmsk);

                Avx.Store(ip + cnt, voutval);
                cnt += (nuint)Vector256 <byte> .Count;
            } while (cnt <= end);

            // cnt < cb here means a partial vector is left over. Rewind to the last full vector
            // and run the loop body once more; that pass overlaps bytes that were already
            // stored above and reads them back from ip.
            if (cnt < end + (nuint)Vector256 <byte> .Count)
            {
                cnt = end;
                goto LoopTop;
            }
        }
예제 #19
0
        /// <summary>
        /// Scenario where the operands are loaded with <c>Avx.LoadVector256</c> from this
        /// instance's <c>_fld1</c>/<c>_fld2</c> fields, pinned with <c>fixed</c>.
        /// </summary>
        public void RunClassFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load));

            fixed(Vector256 <UInt16> *pFld1 = &_fld1)
            fixed(Vector256 <UInt16> *pFld2 = &_fld2)
            {
                var result = Avx2.And(
                    Avx.LoadVector256((UInt16 *)(pFld1)),
                    Avx.LoadVector256((UInt16 *)(pFld2))
                    );

                // Store the result in the output buffer and validate it against the same fields.
                Unsafe.Write(_dataTable.outArrayPtr, result);
                ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr);
            }
        }
예제 #20
0
        /// <summary>
        /// Scenario where the operands are loaded with <c>Avx.LoadVector256</c> from the
        /// <c>_clsVar1</c>/<c>_clsVar2</c> members, pinned with <c>fixed</c>.
        /// </summary>
        public void RunClsVarScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario_Load));

            fixed(Vector256 <Int32> *pClsVar1 = &_clsVar1)
            fixed(Vector256 <Int32> *pClsVar2 = &_clsVar2)
            {
                var result = Avx2.And(
                    Avx.LoadVector256((Int32 *)(pClsVar1)),
                    Avx.LoadVector256((Int32 *)(pClsVar2))
                    );

                // Store the result in the output buffer and validate it against the same members.
                Unsafe.Write(_dataTable.outArrayPtr, result);
                ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
            }
        }
예제 #21
0
        /// <summary>
        /// Round-trip test: expands the 32 bits of a <c>uint</c> into 32 bytes holding 0 or 1
        /// (shuffle + mask + min), then packs those bytes back into a 32-bit mask with
        /// <c>MoveMask</c> and checks that the original value is recovered.
        /// </summary>
        public unsafe void Test_AVX_BitsToBytes()
        {
            uint             value = 0b1000_1001__1010_1011__1100_1101__1110_1111u;
            Vector256<byte> byteSelector, bitSelector, zero = Vector256<byte>.Zero, one, ff;

            // Output byte i takes source byte i / 8. The shuffle works per 128-bit lane, which
            // is fine here because the source is the same uint broadcast to every 32-bit lane.
            byte[] byteSelectorBytes = new byte[]
            {
                0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
                2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
            };
            // Output byte i keeps only bit i % 8 of the byte it received.
            byte[] bitSelectorBytes = new byte[]
            {
                0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
            };

            fixed (byte* ptr = byteSelectorBytes)
            {
                byteSelector = Avx2.LoadVector256(ptr);
            }

            fixed (byte* ptr = bitSelectorBytes)
            {
                bitSelector = Avx2.LoadVector256(ptr);
            }

            byte one_byte = 1;
            one = Avx2.BroadcastScalarToVector256(&one_byte);

            byte ff_byte = 0xff;
            ff = Avx2.BroadcastScalarToVector256(&ff_byte);

            // ***** load: bits -> bytes (0 or 1) **** //
            Vector256<uint> broadcast = Avx2.BroadcastScalarToVector256(&value);
            Vector256<byte> shuffled  = Avx2.Shuffle(broadcast.AsByte(), byteSelector);
            Vector256<byte> bitBytes  = Avx2.And(shuffled, bitSelector);

            // Any non-zero byte becomes exactly 1.
            bitBytes = Avx2.Min(bitBytes, one);

            // ***** store: bytes -> bits **** //
            // CompareEqual marks the zero bytes; AndNot with all-ones inverts that, so the top
            // bit of every non-zero byte is set for MoveMask.
            Vector256<byte> nonZero = Avx2.AndNot(Avx2.CompareEqual(bitBytes, zero), ff);

            uint packed = (uint)Avx2.MoveMask(nonZero);

            // Expected value first, actual value second.
            Assert.AreEqual(value, packed);
        }
예제 #22
0
        /// <summary>
        /// Expands the packed 2-bit selectors of one block into colors: the 32-bit
        /// little-endian word at byte offset 4 of <paramref name="input"/> holds the selectors
        /// (lowest bits first), and each selector picks one of the first four entries of
        /// <paramref name="clut"/>.
        /// </summary>
        /// <param name="clut">Color lookup table; entries 0..3 are used.</param>
        /// <param name="output">Destination, written as consecutive 32-bit colors.</param>
        /// <param name="input">Encoded block; bytes 4..7 are read.</param>
        private unsafe static void BCnDecodeTileRgb(Span<uint> clut, Span<byte> output, ReadOnlySpan<byte> input)
        {
            if (!Avx2.IsSupported)
            {
                // Scalar fallback: consume two selector bits per output color.
                Span<uint> colors   = MemoryMarshal.Cast<byte, uint>(output);
                uint       selectors = BinaryPrimitives.ReadUInt32LittleEndian(input.Slice(4));

                int pixel = 0;
                while (pixel < BlockWidth * BlockHeight)
                {
                    colors[pixel] = clut[(int)(selectors & 3)];
                    pixel++;
                    selectors >>= 2;
                }

                return;
            }

            // Only the lower four palette entries are loaded; the upper half of the vector is
            // left undefined, which is harmless because every selector is masked to 0..3.
            Vector256<uint> palette;
            fixed (uint* clutPtr = &clut[0])
            {
                palette = Sse2.LoadVector128(clutPtr).ToVector256Unsafe();
            }

            // Replicate the selector word into all eight lanes.
            Vector256<uint> selectorWord;
            fixed (byte* inputPtr = input)
            {
                selectorWord = Avx2.BroadcastScalarToVector256((uint*)(inputPtr + 4));
            }

            // Lane k shifts its copy so that the selector of pixel k (or pixel 8 + k) ends up
            // in the two lowest bits, which are then isolated.
            Vector256<uint> twoBits = Vector256.Create(3u);
            Vector256<uint> firstEight = Avx2.And(
                Avx2.ShiftRightLogicalVariable(selectorWord, Vector256.Create(0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u)),
                twoBits);
            Vector256<uint> lastEight = Avx2.And(
                Avx2.ShiftRightLogicalVariable(selectorWord, Vector256.Create(16u, 18u, 20u, 22u, 24u, 26u, 28u, 30u)),
                twoBits);

            // Look up eight colors at a time and store them as two 256-bit vectors.
            Span<Vector256<uint>> colorVectors = MemoryMarshal.Cast<byte, Vector256<uint>>(output);
            colorVectors[0] = Avx2.PermuteVar8x32(palette, firstEight);
            colorVectors[1] = Avx2.PermuteVar8x32(palette, lastEight);
        }
예제 #23
0
        // A byte following 0xED must not exceed 0x9F, and a byte following 0xF4 must not
        // exceed 0x8F. The following byte has to be a continuation byte anyway (top bit set),
        // so a signed greater-than comparison is sufficient for the upper-bound check.
        //   current_bytes      - the bytes being checked
        //   off1_current_bytes - for every position, the byte that precedes it
        //   has_error          - accumulator; lanes that violate a limit are OR-ed in
        static void avxcheckFirstContinuationMax(Vector256<byte> current_bytes,
                                                 Vector256<byte> off1_current_bytes,
                                                 ref Vector256<byte> has_error)
        {
            // Lanes whose preceding byte is one of the two restricted lead bytes.
            Vector256<byte> afterED = Avx2.CompareEqual(off1_current_bytes, Vector256.Create((byte)0xED));
            Vector256<byte> afterF4 = Avx2.CompareEqual(off1_current_bytes, Vector256.Create((byte)0xF4));

            Vector256<sbyte> current = current_bytes.AsSByte();

            // Lanes that exceed the respective limit (signed comparison, see above).
            Vector256<byte> above9F = Avx2.CompareGreaterThan(current, Vector256.Create((byte)0x9F).AsSByte()).AsByte();
            Vector256<byte> above8F = Avx2.CompareGreaterThan(current, Vector256.Create((byte)0x8F).AsSByte()).AsByte();

            // A violation needs both: the restricted lead byte and an out-of-range follower.
            Vector256<byte> violationED = Avx2.And(above9F, afterED);
            Vector256<byte> violationF4 = Avx2.And(above8F, afterF4);

            has_error = Avx2.Or(has_error, Avx2.Or(violationED, violationF4));
        }
예제 #24
0
        /// <summary>
        /// Scenario where the operands are loaded with <c>Avx.LoadVector256</c> from the
        /// <c>_fld1</c>/<c>_fld2</c> fields of a freshly constructed
        /// <c>SimpleBinaryOpTest__AndInt32</c> held in a local, pinned with <c>fixed</c>.
        /// </summary>
        public void RunClassLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

            var test = new SimpleBinaryOpTest__AndInt32();

            fixed(Vector256 <Int32> *pFld1 = &test._fld1)
            fixed(Vector256 <Int32> *pFld2 = &test._fld2)
            {
                var result = Avx2.And(
                    Avx.LoadVector256((Int32 *)(pFld1)),
                    Avx.LoadVector256((Int32 *)(pFld2))
                    );

                // The result goes to this instance's output buffer and is validated against
                // the new instance's fields.
                Unsafe.Write(_dataTable.outArrayPtr, result);
                ValidateResult(test._fld1, test._fld2, _dataTable.outArrayPtr);
            }
        }
예제 #25
0
        // Filters one line of bytes in place, 32 bytes at a time, using the corresponding lines
        // pprev and pnext. A byte is replaced only when it is strictly greater than, or strictly
        // less than, both of its counterparts in pprev/pnext AND it is within denoiseThreshold
        // of at least one of them; all other bytes are written back unchanged.
        //   pcurr - line that is read and overwritten
        //   pprev - first reference line (read only)
        //   pnext - second reference line (read only)
        //   cb    - number of bytes in each line
        // NOTE(review): the do/while body always runs at least once and touches 32 bytes, so
        // this presumably relies on callers passing cb >= Vector256<byte>.Count -- confirm.
        private static void denoiseLineAvx2(byte *pcurr, byte *pprev, byte *pnext, nint cb)
        {
            byte *ip = pcurr, pp = pprev, np = pnext;
            // 'end' is the offset of the last full 32-byte vector that fits in the line.
            nint  cnt = 0, end = cb - Vector256 <byte> .Count;

            var vthresh = Vector256.Create(denoiseThreshold);
            // Comparing a vector with itself yields all-ones; used below to invert masks.
            var vones   = Avx2.CompareEqual(vthresh, vthresh);

LoopTop:
            do
            {
                var vcurr = Avx.LoadVector256(ip + cnt);
                var vprev = Avx.LoadVector256(pp + cnt);
                var vnext = Avx.LoadVector256(np + cnt);

                // |curr - prev| via two saturating subtractions (one of them is always zero);
                // max(diff, thresh) == thresh means diff <= thresh.
                var vdiffp = Avx2.Or(Avx2.SubtractSaturate(vcurr, vprev), Avx2.SubtractSaturate(vprev, vcurr));
                var vmaskp = Avx2.CompareEqual(Avx2.Max(vdiffp, vthresh), vthresh);

                // Same test against the next line.
                var vdiffn = Avx2.Or(Avx2.SubtractSaturate(vcurr, vnext), Avx2.SubtractSaturate(vnext, vcurr));
                var vmaskn = Avx2.CompareEqual(Avx2.Max(vdiffn, vthresh), vthresh);

                var vavgp = Avx2.Average(vcurr, vprev);
                var vavgn = Avx2.Average(vcurr, vnext);

                // Each blend prefers the average with the line that is within the threshold and
                // falls back to the other one; the two picks are averaged. Where both lines are
                // within the threshold, the value is averaged once more with avg(prev, next).
                var voutval = Avx2.Average(Avx2.BlendVariable(vavgn, vavgp, vmaskp), Avx2.BlendVariable(vavgp, vavgn, vmaskn));
                var voutmsk = Avx2.Or(vmaskp, vmaskn);
                voutval = Avx2.Average(voutval, Avx2.BlendVariable(voutval, Avx2.Average(vprev, vnext), Avx2.And(vmaskp, vmaskn)));

                // min(max(prev, next), curr) == curr means curr <= max(prev, next); inverting
                // gives curr > both. Likewise the second line yields curr < both.
                var vsurlt = Avx2.Xor(Avx2.CompareEqual(Avx2.Min(Avx2.Max(vprev, vnext), vcurr), vcurr), vones);
                var vsurgt = Avx2.Xor(Avx2.CompareEqual(Avx2.Max(Avx2.Min(vprev, vnext), vcurr), vcurr), vones);
                // Only replace bytes that are such an extreme and within threshold of a neighbor.
                voutmsk = Avx2.And(voutmsk, Avx2.Or(vsurlt, vsurgt));
                voutval = Avx2.BlendVariable(vcurr, voutval, voutmsk);

                Avx.Store(ip + cnt, voutval);
                cnt += Vector256 <byte> .Count;
            } while (cnt <= end);

            // cnt < cb here means a partial vector is left over. Rewind to the last full vector
            // and run the loop body once more; that pass overlaps bytes that were already
            // stored above and reads them back from ip.
            if (cnt < end + Vector256 <byte> .Count)
            {
                cnt = end;
                goto LoopTop;
            }
        }
예제 #26
0
            /// <summary>
            /// Multiplies every vector of <paramref name="v"/> by the scalar
            /// <paramref name="n"/> and returns each product split into two arrays: the product
            /// masked with <c>lower_mask</c> (<c>lo</c>), and the product shuffled with
            /// <c>MM_PERM_CDAB</c> and then masked with <c>lower_mask</c> (<c>hi</c>).
            /// </summary>
            /// <param name="v">Input vectors.</param>
            /// <param name="n">Scalar multiplier.</param>
            /// <returns>Two new arrays of the same length as <paramref name="v"/>.</returns>
            private static unsafe (Vector256<UInt32>[] hi, Vector256<UInt32>[] lo) Mul(Vector256<UInt32>[] v, UInt32 n)
            {
                int count = v.Length;

                var highParts = new Vector256<UInt32>[count];
                var lowParts  = new Vector256<UInt32>[count];

                // Widen four copies of n to 64-bit lanes, then view them as 32-bit lanes again,
                // which is the operand layout consumed by Avx2.Multiply below.
                Vector256<UInt32> multiplier = Avx2.ConvertToVector256Int64(Vector128.Create(n)).AsUInt32();
                Vector256<UInt32> mask       = lower_mask;

                for (int index = 0; index < count; index++)
                {
                    Vector256<UInt32> product = Avx2.Multiply(v[index], multiplier).AsUInt32();

                    // NOTE(review): MM_PERM_CDAB presumably swaps the two halves of each 64-bit
                    // product so the upper word can be isolated with the same mask -- confirm.
                    highParts[index] = Avx2.And(Avx2.Shuffle(product, MM_PERM_CDAB), mask);
                    lowParts[index]  = Avx2.And(product, mask);
                }

                return (highParts, lowParts);
            }
예제 #27
0
        /// <summary>
        /// Reverses, in place, the order of all 32 bits of every <c>int</c> in
        /// <paramref name="span"/>. This is done in two steps: the byte order of each int is
        /// reversed (vectorized when AVX2 is available, scalar otherwise), then the bits inside
        /// each byte are reversed by the byte-span overload of <c>ReverseBits</c>.
        /// </summary>
        /// <param name="span">The values to transform in place.</param>
        public static unsafe void ReverseBits(this Span<int> span)
        {
            var intsReversed = 0;

            if (Avx2.IsSupported)
            {
                // Same decomposition as BinaryPrimitives.ReverseEndianness(uint):
                //   rotr(v & 0x00FF00FF, 8) | rotl(v & 0xFF00FF00, 8)
                // The two masks must be applied to the ORIGINAL value independently; chaining
                // them on one value always yields zero and degrades the swap to a plain rotate.
                Vector256<int> evenByteMask = Vector256.Create(0x00FF00FF);
                Vector256<int> oddByteMask  = Vector256.Create(unchecked((int)0xFF00FF00));

                fixed (int* ptr = span)
                {
                    var vectorCount = span.Length / 8;

                    for (int i = 0; i < vectorCount; i++)
                    {
                        var vector    = Avx.LoadVector256(ptr + intsReversed);
                        var evenBytes = Avx2.And(vector, evenByteMask);
                        var oddBytes  = Avx2.And(vector, oddByteMask);

                        vector =
                            Avx2.Or(
                                // rotate the even bytes right by 8
                                Avx2.Or(
                                    Avx2.ShiftRightLogical(evenBytes, 8),
                                    Avx2.ShiftLeftLogical(evenBytes, 24)
                                    ),
                                // rotate the odd bytes left by 8
                                Avx2.Or(
                                    Avx2.ShiftLeftLogical(oddBytes, 8),
                                    Avx2.ShiftRightLogical(oddBytes, 24)
                                    )
                                );

                        Avx.Store(ptr + intsReversed, vector);
                        intsReversed += 8;
                    }
                }
            }

            // Scalar path for the remainder (or for everything when AVX2 is unavailable).
            for (int i = intsReversed; i < span.Length; i++)
            {
                span[i] = BinaryPrimitives.ReverseEndianness(span[i]);
            }

            // With the bytes of each int swapped, reversing the bits within every byte
            // completes the full 32-bit reversal.
            fixed (void* ptr = span)
            {
                new Span<byte>(ptr, span.Length * 4).ReverseBits();
            }
        }
예제 #28
0
        /// <summary>
        /// Produces a buffer of the same length as <paramref name="argbBytes"/> in which, for
        /// every 4-byte pixel, the byte selected by <c>alphaOnlyVector128</c> is replicated
        /// into all four bytes of that pixel.
        /// </summary>
        /// <param name="argbBytes">Source pixel data, four bytes per pixel.</param>
        /// <returns>A new array with the transformed pixels.</returns>
        public static unsafe byte[] ArgbToAlpha32(byte[] argbBytes)
        {
            const int vectorSize = 16;

            var result    = new byte[argbBytes.Length];
            // NOTE(review): ToPtr() is defined elsewhere; it presumably yields a stable (pinned)
            // byte* for the array -- confirm.
            var resultPtr = result.ToPtr();
            var bytePtr   = argbBytes.ToPtr();

            // Keeps the masked byte of each pixel and smears it over the three bytes below it:
            // first by one byte (two copies), then by two bytes (four copies).
            // NOTE(review): this assumes alphaOnlyVector128 keeps only the highest byte of each
            // 32-bit pixel -- confirm against its definition.
            Vector128<uint> SpreadAlpha(Vector128<uint> pixels)
            {
                pixels = Avx2.And(pixels, alphaOnlyVector128);
                pixels = Avx2.Or(pixels, Avx2.ShiftRightLogical128BitLane(pixels, 1));
                pixels = Avx2.Or(pixels, Avx2.ShiftRightLogical128BitLane(pixels, 2));
                return pixels;
            }

            // Only whole 16-byte blocks go through the direct load/store path, so a length
            // that is not a multiple of 16 can no longer read or write past the arrays.
            var vectorizedLength = argbBytes.Length - (argbBytes.Length % vectorSize);

            for (var ptrIndex = 0; ptrIndex < vectorizedLength; ptrIndex += vectorSize)
            {
                var vector128 = Avx2.LoadVector128(bytePtr + ptrIndex).AsUInt32();

                Avx2.Store(resultPtr + ptrIndex, SpreadAlpha(vector128).AsByte());
            }

            // Remaining bytes (fewer than 16): run the same transform on a zero-padded scratch
            // block and copy back only the bytes that belong to the input.
            var tailLength = argbBytes.Length - vectorizedLength;
            if (tailLength > 0)
            {
                byte* scratch = stackalloc byte[vectorSize];
                for (var i = 0; i < vectorSize; i++)
                {
                    scratch[i] = i < tailLength ? argbBytes[vectorizedLength + i] : (byte)0;
                }

                Avx2.Store(scratch, SpreadAlpha(Avx2.LoadVector128(scratch).AsUInt32()).AsByte());

                for (var i = 0; i < tailLength; i++)
                {
                    result[vectorizedLength + i] = scratch[i];
                }
            }

            return result;
        }
예제 #29
0
        //[MethodImpl(MethodImplOptions.AggressiveInlining)]
        /// <summary>
        /// Computes an approximation of e^V for eight floats at once: the clamped input is
        /// split into an integer multiple n (derived via <c>cLOG2EF</c>) and a remainder, a
        /// polynomial is evaluated on the remainder, and the result is scaled by 2^n built
        /// directly from exponent bits.
        /// </summary>
        /// <param name="V">Input values.</param>
        /// <returns>The approximated exponential per lane.</returns>
        public static __m256 exp256_ps(__m256 V)
        {
            __m256 one = SET(1.0f);

            // Clamp the argument to [exp_lo, exp_hi].
            __m256 x = Avx2.Max(Avx2.Min(V, exp_hi), exp_lo);

            // n = floor(x * cLOG2EF + 0.5). The compare/and/subtract sequence subtracts 1
            // wherever the floored value compares greater than the unfloored one.
            // NOTE(review): cLOG2EF is presumably log2(e) -- confirm against its definition.
            __m256 scaled     = Avx2.Add(Avx2.Multiply(x, cLOG2EF), SET(0.5f));
            __m256 floored    = Avx2.Floor(scaled);
            var    correction = Avx2.And(
                Avx2.Compare(floored, scaled, FloatComparisonMode.OrderedGreaterThanSignaling),
                one);
            __m256 n = Avx2.Subtract(floored, correction);

            // Remove n times the two-part constant (cexp_C1, then cexp_C2) from x.
            __m256 partC1 = Avx2.Multiply(n, cexp_C1);
            __m256 partC2 = Avx2.Multiply(n, cexp_C2);
            x = Avx2.Subtract(x, partC1);
            x = Avx2.Subtract(x, partC2);

            __m256 xSquared = Avx2.Multiply(x, x);

            // Horner evaluation with fused multiply-adds, then poly * x^2 + x + 1.
            __m256 poly = Fma.MultiplyAdd(cexp_p0, x, cexp_p1);
            poly = Fma.MultiplyAdd(poly, x, cexp_p2);
            poly = Fma.MultiplyAdd(poly, x, cexp_p3);
            poly = Fma.MultiplyAdd(poly, x, cexp_p4);
            poly = Fma.MultiplyAdd(poly, x, cexp_p5);
            poly = Fma.MultiplyAdd(poly, xSquared, x);
            poly = Avx2.Add(poly, one);

            // Build 2^n: add the exponent bias 0x7f to n and shift it into the exponent field.
            var exponentBits = Avx2.Add(Avx2.ConvertToVector256Int32(n), Vector256.Create((int)0x7f));
            exponentBits = Avx2.ShiftLeftLogical(exponentBits, 23);
            __m256 powerOfTwo = Vector256.AsSingle(exponentBits);

            return Avx2.Multiply(poly, powerOfTwo);
        }
예제 #30
0
        public static void CollectColorBlueTransforms(Span <uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span <int> histo)
        {
#if SUPPORTS_RUNTIME_INTRINSICS
            if (Avx2.IsSupported && tileWidth >= 16)
            {
                const int     span   = 16;
                Span <ushort> values = stackalloc ushort[span];
                var           multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue));
                var           multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue));
                for (int y = 0; y < tileHeight; y++)
                {
                    Span <uint> srcSpan  = bgra.Slice(y * stride);
                    ref uint    inputRef = ref MemoryMarshal.GetReference(srcSpan);
                    for (nint x = 0; x <= tileWidth - span; x += span)
                    {
                        nint               input0Idx = x;
                        nint               input1Idx = x + (span / 2);
                        Vector256 <byte>   input0    = Unsafe.As <uint, Vector256 <uint> >(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
                        Vector256 <byte>   input1    = Unsafe.As <uint, Vector256 <uint> >(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
                        Vector256 <byte>   r0        = Avx2.Shuffle(input0, CollectColorBlueTransformsShuffleLowMask256);
                        Vector256 <byte>   r1        = Avx2.Shuffle(input1, CollectColorBlueTransformsShuffleHighMask256);
                        Vector256 <byte>   r         = Avx2.Or(r0, r1);
                        Vector256 <byte>   gb0       = Avx2.And(input0, CollectColorBlueTransformsGreenBlueMask256);
                        Vector256 <byte>   gb1       = Avx2.And(input1, CollectColorBlueTransformsGreenBlueMask256);
                        Vector256 <ushort> gb        = Avx2.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
                        Vector256 <byte>   g         = Avx2.And(gb.AsByte(), CollectColorBlueTransformsGreenMask256);
                        Vector256 <short>  a         = Avx2.MultiplyHigh(r.AsInt16(), multsr);
                        Vector256 <short>  b         = Avx2.MultiplyHigh(g.AsInt16(), multsg);
                        Vector256 <byte>   c         = Avx2.Subtract(gb.AsByte(), b.AsByte());
                        Vector256 <byte>   d         = Avx2.Subtract(c, a.AsByte());
                        Vector256 <byte>   e         = Avx2.And(d, CollectColorBlueTransformsBlueMask256);

                        ref ushort outputRef = ref MemoryMarshal.GetReference(values);
                        Unsafe.As <ushort, Vector256 <ushort> >(ref outputRef) = e.AsUInt16();

                        for (int i = 0; i < span; i++)
                        {
                            ++histo[values[i]];
                        }
                    }
                }