// Field scenario: the instance field _fld is passed directly to the intrinsic.
public void RunFldScenario()
        {
            var converted = Avx.ConvertToVector256Int32WithTruncation(_fld);

            // Store the converted vector in the output buffer, then validate it
            // against the same field that served as the operand.
            Unsafe.Write(_dataTable.outArrayPtr, converted);
            ValidateResult(_fld, _dataTable.outArrayPtr);
        }
        // Local-variable scenario: the operand is first read into a local with the
        // aligned-load intrinsic; that local is then converted and used for validation.
        public void RunLclVarScenario_LoadAligned()
        {
            var source    = Avx.LoadAlignedVector256((Single*)(_dataTable.inArrayPtr));
            var converted = Avx.ConvertToVector256Int32WithTruncation(source);

            // Store the converted vector, then validate it against the loaded operand.
            Unsafe.Write(_dataTable.outArrayPtr, converted);
            ValidateResult(source, _dataTable.outArrayPtr);
        }
        // Local-field scenario: the operand is the _fld member of a test object
        // that is created locally inside this method.
        public void RunLclFldScenario()
        {
            var instance  = new SimpleUnaryOpTest__ConvertToVector256Int32WithTruncationSingle();
            var converted = Avx.ConvertToVector256Int32WithTruncation(instance._fld);

            // The output still goes to this object's data table; validation uses the
            // local instance's field as the reference input.
            Unsafe.Write(_dataTable.outArrayPtr, converted);
            ValidateResult(instance._fld, _dataTable.outArrayPtr);
        }
        // Local-variable scenario: the operand is read into a local with Unsafe.Read;
        // that local is then converted and used for validation.
        public void RunLclVarScenario_UnsafeRead()
        {
            var source    = Unsafe.Read<Vector256<Single>>(_dataTable.inArrayPtr);
            var converted = Avx.ConvertToVector256Int32WithTruncation(source);

            // Store the converted vector, then validate it against the operand that was read.
            Unsafe.Write(_dataTable.outArrayPtr, converted);
            ValidateResult(source, _dataTable.outArrayPtr);
        }
        // Basic scenario: the operand is loaded with Avx.LoadVector256 directly inside
        // the intrinsic call, without going through an intermediate local.
        public void RunBasicScenario_Load()
        {
            var converted = Avx.ConvertToVector256Int32WithTruncation(Avx.LoadVector256((Single*)(_dataTable.inArrayPtr)));

            // Store the converted vector, then validate it against the raw input buffer.
            Unsafe.Write(_dataTable.outArrayPtr, converted);
            ValidateResult(_dataTable.inArrayPtr, _dataTable.outArrayPtr);
        }
        // Basic scenario: the operand is read with Unsafe.Read directly inside the
        // intrinsic call, without going through an intermediate local.
        public void RunBasicScenario_UnsafeRead()
        {
            var converted = Avx.ConvertToVector256Int32WithTruncation(Unsafe.Read<Vector256<Single>>(_dataTable.inArrayPtr));

            // Store the converted vector, then validate it against the raw input buffer.
            Unsafe.Write(_dataTable.outArrayPtr, converted);
            ValidateResult(_dataTable.inArrayPtr, _dataTable.outArrayPtr);
        }
            /// <summary>
            /// Remaps a buffer of 4-float pixels in place through a lookup table, using linear
            /// interpolation between adjacent table entries. The first three components of each
            /// pixel are divided by the fourth (W, presumably alpha) before the lookup and
            /// multiplied by it again afterwards.
            /// </summary>
            /// <param name="ipstart">Start of the buffer; it is read and overwritten in place.</param>
            /// <param name="opstart">Must equal <paramref name="ipstart"/> (checked by a debug assertion only).</param>
            /// <param name="lutstart">Start of the float lookup table. Positions are clamped to
            /// [0, <paramref name="lutmax"/>] and the entry after the position is also read, so
            /// entries up to index <c>lutmax + 1</c> may be accessed.</param>
            /// <param name="lutmax">Highest table position a value can be scaled to.</param>
            /// <param name="cb">Size of the buffer in bytes.</param>
            unsafe public static void ConvertFloat3A(byte *ipstart, byte *opstart, float *lutstart, int lutmax, int cb)
            {
                Debug.Assert(ipstart == opstart);

                float* px    = (float*)ipstart;
                float* pxEnd = (float*)(ipstart + cb);
                float* lut   = lutstart;

#if HWINTRINSICS
                if (Avx2.IsSupported)
                {
                    // Lanes selected by the mask are gathered from the table; all other lanes
                    // receive 1. Which lanes are selected is defined by HWIntrinsics.GatherMask3x
                    // (not shown here -- presumably the three color lanes of each pixel).
                    var gatherMask = Avx.BroadcastVector128ToVector256((float*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.GatherMask3x)));
                    var lutMax8    = Vector256.Create((float)lutmax);
                    var zero8      = Vector256<float>.Zero;
                    var one8f      = Vector256.Create(1f);
                    var one8i      = Vector256.Create(1);

                    // Process one full 256-bit vector per iteration; whatever does not fill a
                    // whole vector is left for the Vector4 loop below.
                    for (float* lastVec = pxEnd - Vector256<float>.Count; px <= lastVec; px += Vector256<float>.Count)
                    {
                        var color = Avx.Max(zero8, Avx.LoadVector256(px));
                        var alpha = Avx.Shuffle(color, color, HWIntrinsics.ShuffleMaskAlpha);

                        // Scale to table positions using the approximate reciprocal of alpha,
                        // then cap at the last table position.
                        var scale = Avx.Multiply(lutMax8, Avx.Reciprocal(alpha));
                        color = Avx.Min(Avx.Multiply(color, scale), lutMax8);

                        var index = Avx.ConvertToVector256Int32WithTruncation(color);
                        var whole = Avx.ConvertToVector256Single(index);

                        var lower = Avx2.GatherMaskVector256(one8f, lut, index, gatherMask, sizeof(float));
                        var upper = Avx2.GatherMaskVector256(one8f, lut, Avx2.Add(index, one8i), gatherMask, sizeof(float));

                        // Interpolate by the fractional part of the position, then re-apply alpha.
                        color = HWIntrinsics.Lerp(lower, upper, Avx.Subtract(color, whole));
                        color = Avx.Multiply(color, alpha);

                        Avx.Store(px, color);
                    }
                }
#endif
                {
                    var   lutMax4  = new Vector4(lutmax);
                    var   zero4    = Vector4.Zero;
                    float minAlpha = new Vector4(1 / 1024f).X;

                    // One pixel (four floats) per iteration.
                    for (; px < pxEnd; px += 4)
                    {
                        var   pixel = Unsafe.ReadUnaligned<Vector4>(px);
                        float alpha = pixel.W;

                        // Pixels whose W is below the threshold are zeroed entirely
                        // instead of being divided by a tiny value.
                        if (alpha < minAlpha)
                        {
                            Unsafe.WriteUnaligned(px, zero4);
                            continue;
                        }

                        pixel = (pixel * lutMax4 / alpha).Clamp(zero4, lutMax4);

                        float pos0 = pixel.X;
                        float pos1 = pixel.Y;
                        float pos2 = pixel.Z;

                        uint idx0 = (uint)pos0;
                        uint idx1 = (uint)pos1;
                        uint idx2 = (uint)pos2;

                        // Only the first three components are rewritten; the fourth is left as-is.
                        px[0] = Lerp(lut[idx0], lut[idx0 + 1], pos0 - (int)idx0) * alpha;
                        px[1] = Lerp(lut[idx1], lut[idx1 + 1], pos1 - (int)idx1) * alpha;
                        px[2] = Lerp(lut[idx2], lut[idx2 + 1], pos2 - (int)idx2) * alpha;
                    }
                }
            }
            /// <summary>
            /// Remaps every float in a buffer in place through a lookup table: each value is
            /// multiplied by <paramref name="lutmax"/>, clamped to [0, lutmax], and replaced by
            /// the linear interpolation between the two table entries surrounding that position.
            /// </summary>
            /// <param name="ipstart">Start of the buffer; it is read and overwritten in place.</param>
            /// <param name="opstart">Must equal <paramref name="ipstart"/> (checked by a debug assertion only).</param>
            /// <param name="lutstart">Start of the float lookup table. The entry after the clamped
            /// position is also read, so entries up to index <c>lutmax + 1</c> may be accessed.</param>
            /// <param name="lutmax">Highest table position a value can be scaled to.</param>
            /// <param name="cb">Size of the buffer in bytes.</param>
            unsafe public static void ConvertFloat(byte *ipstart, byte *opstart, float *lutstart, int lutmax, int cb)
            {
                Debug.Assert(ipstart == opstart);

                float* px    = (float*)ipstart;
                float* pxEnd = (float*)(ipstart + cb);
                float* lut   = lutstart;

#if HWINTRINSICS
                if (Avx2.IsSupported)
                {
                    var lutMax8 = Vector256.Create((float)lutmax);
                    var zero8   = Vector256<float>.Zero;
                    var one8i   = Vector256.Create(1);

                    // Full 256-bit vectors first.
                    for (float* lastVec = pxEnd - Vector256<float>.Count; px <= lastVec; px += Vector256<float>.Count)
                    {
                        var pos = Avx.Multiply(lutMax8, Avx.LoadVector256(px));
                        pos = Avx.Min(Avx.Max(zero8, pos), lutMax8);

                        var index = Avx.ConvertToVector256Int32WithTruncation(pos);
                        var whole = Avx.ConvertToVector256Single(index);

                        var lower = Avx2.GatherVector256(lut, index, sizeof(float));
                        var upper = Avx2.GatherVector256(lut, Avx2.Add(index, one8i), sizeof(float));

                        // Interpolate by the fractional part of the table position.
                        Avx.Store(px, HWIntrinsics.Lerp(lower, upper, Avx.Subtract(pos, whole)));
                    }

                    // Remaining values that did not fill a whole vector, one float at a time.
                    float lowBound  = zero8.ToScalar();
                    float highBound = lutMax8.ToScalar();
                    for (; px < pxEnd; px++)
                    {
                        float f = (*px * highBound).Clamp(lowBound, highBound);
                        uint  i = (uint)f;

                        *px = Lerp(lut[i], lut[i + 1], f - i);
                    }
                }
                else
#endif
                {
                    var lutMax4 = new Vector4(lutmax);
                    var zero4   = Vector4.Zero;

                    // Groups of four floats first.
                    for (float* lastQuad = pxEnd - 4; px <= lastQuad; px += 4)
                    {
                        var quad = (Unsafe.ReadUnaligned<Vector4>(px) * lutMax4).Clamp(zero4, lutMax4);

                        float pos0 = quad.X;
                        float pos1 = quad.Y;
                        float pos2 = quad.Z;
                        float pos3 = quad.W;

                        uint idx0 = (uint)pos0;
                        uint idx1 = (uint)pos1;
                        uint idx2 = (uint)pos2;
                        uint idx3 = (uint)pos3;

                        px[0] = Lerp(lut[idx0], lut[idx0 + 1], pos0 - (int)idx0);
                        px[1] = Lerp(lut[idx1], lut[idx1 + 1], pos1 - (int)idx1);
                        px[2] = Lerp(lut[idx2], lut[idx2 + 1], pos2 - (int)idx2);
                        px[3] = Lerp(lut[idx3], lut[idx3 + 1], pos3 - (int)idx3);
                    }

                    // Remaining values that did not fill a group of four, one float at a time.
                    float lowBound  = zero4.X;
                    float highBound = lutMax4.X;
                    for (; px < pxEnd; px++)
                    {
                        float f = (*px * highBound).Clamp(lowBound, highBound);
                        uint  i = (uint)f;

                        *px = Lerp(lut[i], lut[i + 1], f - i);
                    }
                }
            }