Exemplo n.º 1
0
 /// <summary>
 /// Asserts that <c>VectorUtilities.ElementwiseSelect</c>, given a selector whose lanes 0 and 2 have all bits set,
 /// returns (0, 5, 2, 7) for the inputs (0, 1, 2, 3) and (4, 5, 6, 7).
 /// </summary>
 public static void ElementwiseSelectTest()
 {
     var selector = Vector128.Create(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000).AsSingle();
     var first    = Vector128.Create(0.0f, 1.0f, 2.0f, 3.0f);
     var second   = Vector128.Create(4.0f, 5.0f, 6.0f, 7.0f);
     var expected = Vector128.Create(0.0f, 5.0f, 2.0f, 7.0f);

     Assert.That(() => VectorUtilities.ElementwiseSelect(selector, first, second), Is.EqualTo(expected));
 }
        /// <summary>
        /// Calls <c>Vector128.GetElement</c> and <c>Vector128.WithElement</c> for <see cref="SByte"/> through
        /// reflection and records a failure when a call does not behave as requested.
        /// </summary>
        /// <param name="imm">Element index handed to both methods.</param>
        /// <param name="expectedOutOfRangeException">
        /// <c>true</c> when both calls are expected to throw <see cref="ArgumentOutOfRangeException"/>;
        /// <c>false</c> when both are expected to complete normally.
        /// </param>
        public void RunReflectionScenario(int imm = 15, bool expectedOutOfRangeException = false)
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunReflectionScenario));

            SByte[] values = new SByte[ElementCount];

            for (int i = 0; i < ElementCount; i++)
            {
                values[i] = TestLibrary.Generator.GetSByte();
            }

            Vector128<SByte> value = Vector128.Create(values[0], values[1], values[2], values[3], values[4], values[5], values[6], values[7], values[8], values[9], values[10], values[11], values[12], values[13], values[14], values[15]);

            // Starts out "passed" only when no exception is expected; the catch block flips it
            // to true when the expected exception type is observed.
            bool succeeded = !expectedOutOfRangeException;

            try
            {
                object result = typeof(Vector128)
                                .GetMethod(nameof(Vector128.GetElement))
                                .MakeGenericMethod(typeof(SByte))
                                .Invoke(null, new object[] { value, imm });
                ValidateGetResult((SByte)(result), values);
            }
            catch (TargetInvocationException e)
            {
                // Reflection wraps the real exception; inspect the inner one.
                succeeded = expectedOutOfRangeException &&
                            e.InnerException is ArgumentOutOfRangeException;
            }

            if (!succeeded)
            {
                ReportFailure(nameof(Vector128.GetElement));
            }

            succeeded = !expectedOutOfRangeException;

            SByte insertedValue = TestLibrary.Generator.GetSByte();

            try
            {
                object result2 = typeof(Vector128)
                                 .GetMethod(nameof(Vector128.WithElement))
                                 .MakeGenericMethod(typeof(SByte))
                                 .Invoke(null, new object[] { value, imm, insertedValue });
                ValidateWithResult((Vector128<SByte>)(result2), values, insertedValue);
            }
            catch (TargetInvocationException e)
            {
                succeeded = expectedOutOfRangeException &&
                            e.InnerException is ArgumentOutOfRangeException;
            }

            if (!succeeded)
            {
                ReportFailure(nameof(Vector128.WithElement));
            }

            // Logs the failure and marks the whole test as failed. When no exception was expected,
            // the only way to get here is that the invocation threw, so say so instead of claiming
            // that an exception was missing.
            void ReportFailure(string member)
            {
                string outcome = expectedOutOfRangeException
                    ? "failed to throw ArgumentOutOfRangeException"
                    : "threw an unexpected exception";

                TestLibrary.TestFramework.LogInformation($"Vector128<SByte>.{member}({imm}): {nameof(RunReflectionScenario)} {outcome}.");
                TestLibrary.TestFramework.LogInformation(string.Empty);

                Succeeded = false;
            }
        }
        /// <summary>
        /// Calls the <c>GetElement</c> and <c>WithElement</c> instance methods of
        /// <c>Vector128&lt;UInt16&gt;</c> through reflection and records a failure when a call does
        /// not behave as requested.
        /// </summary>
        /// <param name="imm">Element index handed to both methods.</param>
        /// <param name="expectedOutOfRangeException">
        /// <c>true</c> when both calls are expected to throw <see cref="ArgumentOutOfRangeException"/>;
        /// <c>false</c> when both are expected to complete normally.
        /// </param>
        public void RunReflectionScenario(int imm = 0, bool expectedOutOfRangeException = false)
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunReflectionScenario));

            UInt16[] values = new UInt16[ElementCount];

            for (int i = 0; i < ElementCount; i++)
            {
                values[i] = TestLibrary.Generator.GetUInt16();
            }

            Vector128<UInt16> value = Vector128.Create(values[0], values[1], values[2], values[3], values[4], values[5], values[6], values[7]);

            // Starts out "passed" only when no exception is expected; the catch block flips it
            // to true when the expected exception type is observed.
            bool succeeded = !expectedOutOfRangeException;

            try
            {
                object result = typeof(Vector128<UInt16>)
                                .GetMethod(nameof(Vector128.GetElement), new Type[] { typeof(int) })
                                .Invoke(value, new object[] { imm });
                ValidateGetResult((UInt16)(result), values);
            }
            catch (TargetInvocationException e)
            {
                // Reflection wraps the real exception; inspect the inner one.
                succeeded = expectedOutOfRangeException &&
                            e.InnerException is ArgumentOutOfRangeException;
            }

            if (!succeeded)
            {
                ReportFailure(nameof(Vector128.GetElement));
            }

            succeeded = !expectedOutOfRangeException;

            UInt16 insertedValue = TestLibrary.Generator.GetUInt16();

            try
            {
                object result2 = typeof(Vector128<UInt16>)
                                 .GetMethod(nameof(Vector128.WithElement), new Type[] { typeof(int), typeof(UInt16) })
                                 .Invoke(value, new object[] { imm, insertedValue });
                ValidateWithResult((Vector128<UInt16>)(result2), values, insertedValue);
            }
            catch (TargetInvocationException e)
            {
                succeeded = expectedOutOfRangeException &&
                            e.InnerException is ArgumentOutOfRangeException;
            }

            if (!succeeded)
            {
                ReportFailure(nameof(Vector128.WithElement));
            }

            // Logs the failure and marks the whole test as failed. When no exception was expected,
            // the only way to get here is that the invocation threw, so say so instead of claiming
            // that an exception was missing.
            void ReportFailure(string member)
            {
                string outcome = expectedOutOfRangeException
                    ? "failed to throw ArgumentOutOfRangeException"
                    : "threw an unexpected exception";

                TestLibrary.TestFramework.LogInformation($"Vector128<UInt16>.{member}({imm}): {nameof(RunReflectionScenario)} {outcome}.");
                TestLibrary.TestFramework.LogInformation(string.Empty);

                Succeeded = false;
            }
        }
Exemplo n.º 4
0
        /// <summary>
        /// Mixes <paramref name="block"/> in place using a scratch table of
        /// <paramref name="iterations"/> snapshots (this looks like the scrypt ROMix step; confirm against callers).
        /// </summary>
        /// <param name="block">Buffer read as 32 * r little-endian 32-bit words and overwritten with the result.</param>
        /// <param name="r">Block size factor; the working state is r * 8 vectors of four words.</param>
        /// <param name="iterations">
        /// Number of snapshots to record and of lookups to perform. It is consumed two at a time and
        /// used as a mask (<c>iterations - 1</c>), so it is presumably an even power of two.
        /// </param>
        private static void Mix(byte[] block, int r, int iterations)
        {
            var table = new Vector128<uint>[iterations * r * 8];
            var words = new uint[r * 32];

            // Load the block, permuting the words inside every 16-word chunk (index * 5 mod 16).
            for (var chunk = 0; chunk < 2 * r; chunk++)
            {
                for (var lane = 0; lane < 16; lane++)
                {
                    words[chunk * 16 + lane] = Le32Dec(block, (chunk * 16 + lane * 5 % 16) * 4);
                }
            }

            var front = new Vector128<uint>[r * 8];
            var back  = new Vector128<uint>[r * 8];

            for (var n = 0; n < r * 8; ++n)
            {
                var at = n * 4;

                front[n] = Vector128.Create(words[at], words[at + 1], words[at + 2], words[at + 3]);
                back[n]  = Vector128.Create(0u);
            }

            // Phase 1: record every intermediate state, ping-ponging between the two buffers.
            for (var step = 0; step < iterations; step += 2)
            {
                var evenRow = step * r * 8;

                for (var n = 0; n < r * 8; ++n)
                {
                    table[evenRow + n] = front[n];
                }

                MixSalsa8Sse2(front, back, r);

                var oddRow = (step + 1) * r * 8;

                for (var n = 0; n < r * 8; ++n)
                {
                    table[oddRow + n] = back[n];
                }

                MixSalsa8Sse2(back, front, r);
            }

            // Phase 2: XOR in a state-dependent table row before each mixing round.
            for (var step = 0; step < iterations; step += 2)
            {
                var row     = Integerify(front, r) & (ulong)(iterations - 1);
                var rowBase = row * (ulong)r * 8;

                for (var n = 0; n < r * 8; ++n)
                {
                    front[n] = Sse2.Xor(front[n], table[rowBase + (ulong)n]);
                }

                MixSalsa8Sse2(front, back, r);

                row     = Integerify(back, r) & (ulong)(iterations - 1);
                rowBase = row * (ulong)r * 8;

                for (var n = 0; n < r * 8; ++n)
                {
                    back[n] = Sse2.Xor(back[n], table[rowBase + (ulong)n]);
                }

                MixSalsa8Sse2(back, front, r);
            }

            // Unpack the vectors back into scalar words.
            for (var n = 0; n < r * 8; ++n)
            {
                var vector = front[n];

                for (var lane = 0; lane < 4; ++lane)
                {
                    words[n * 4 + lane] = vector.GetElement(lane);
                }
            }

            // Store the words using the same permutation that was applied on load.
            for (var chunk = 0; chunk < 2 * r; chunk++)
            {
                for (var lane = 0; lane < 16; lane++)
                {
                    Le32Enc(words[chunk * 16 + lane], block, (chunk * 16 + lane * 5 % 16) * 4);
                }
            }
        }
Exemplo n.º 5
0
        /// <summary>
        /// Lowers the leaf level by one and folds every child of the nodes on the new leaf level into its parent.
        /// The indices of the released child nodes are written to <paramref name="pfree"/> as a zero-terminated list.
        /// </summary>
        /// <param name="ptree">Base of the node array; entries from index 8 up to <c>maxHistogramSize</c> are scanned.</param>
        /// <param name="pfree">
        /// Output buffer for the released node indices. It needs room for every released child plus the
        /// terminating zero; the size is not checked here.
        /// </param>
        private void pruneTree(OctreeNode *ptree, ushort *pfree)
        {
#if HWINTRINSICS
            // Keeps the first three 32-bit sums whole and only the low 29 bits of the fourth,
            // matching the "& 0x1fffffff" in the scalar path below.
            var sumsMask = Vector128.Create(0xffffffffu, 0xffffffffu, 0xffffffffu, 0x1fffffffu);
            var vzero    = Vector128 <uint> .Zero;
#endif

            ushort *pnext = pfree;
            // Side effect: the instance's leafLevel is decremented; nodes on the new level become leaves.
            uint    level = --leafLevel;

            for (nuint i = 8; i < maxHistogramSize; i++)
            {
                var  node = ptree + i;
                uint nl   = OctreeNode.GetLevel(node);
                if (nl == level)
                {
                    // Node layout as used here: eight ushort child indices followed by four uint sums.
                    ushort *children = (ushort *)node;
                    uint *  sums     = (uint *)(children + 8);

#if HWINTRINSICS
                    if (Sse2.IsSupported)
                    {
                        var vsums = Sse2.LoadVector128(sums);

                        for (nuint j = 0; j < 8; j++)
                        {
                            nuint child = children[j];
                            // A child index of zero means "no child".
                            if (child != 0)
                            {
                                var   cnode = ptree + child;
                                uint *csums = (uint *)((ushort *)cnode + 8);

                                // Accumulate the child's (masked) sums into the parent's running total.
                                var vcsum = Sse2.And(sumsMask, Sse2.LoadVector128(csums));
                                vsums = Sse2.Add(vsums, vcsum);

                                // Clear the child with two 16-byte stores (children + sums) and release its index.
                                // NOTE(review): this clears 32 bytes, the scalar path clears sizeof(OctreeNode) —
                                // presumably the same size; confirm against the struct definition.
                                Sse2.Store((uint *)cnode, vzero);
                                Sse2.Store(csums, vzero);
                                *pnext++ = (ushort)child;
                            }
                        }

                        // The parent no longer has children; write back the accumulated sums.
                        Sse2.Store((uint *)children, vzero);
                        Sse2.Store(sums, vsums);
                    }
                    else
#endif
                    {
                        for (nuint j = 0; j < 8; j++)
                        {
                            nuint child = children[j];
                            // A child index of zero means "no child".
                            if (child != 0)
                            {
                                var   cnode = ptree + child;
                                uint *csums = (uint *)((ushort *)cnode + 8);

                                // Only the low 29 bits of the child's fourth sum are added.
                                sums[0] += csums[0];
                                sums[1] += csums[1];
                                sums[2] += csums[2];
                                sums[3] += csums[3] & 0x1fffffff;

                                // Clear the whole child node and release its index.
                                Unsafe.InitBlockUnaligned(cnode, 0, (uint)sizeof(OctreeNode));
                                *pnext++ = (ushort)child;
                            }
                        }

                        // The parent no longer has children.
                        Unsafe.InitBlockUnaligned(children, 0, sizeof(ushort) * 8);
                    }
                }
            }

            // Terminate the list of released indices.
            *pnext = 0;
        }
Exemplo n.º 6
0
        /// <summary>
        /// Converts <paramref name="input"/> to 4 bytes per pixel, stored in B, G, R, A byte order,
        /// in a rented buffer and passes that buffer to <c>WriteBuffer</c> at the luma plane offset.
        /// </summary>
        /// <param name="rm">Supplies the buffer pool; also forwarded to <c>WriteBuffer</c>.</param>
        /// <param name="input">Source surface.</param>
        /// <param name="config">Output configuration; only <c>OutBlkKind</c> and <c>OutBlkHeight</c> are read.</param>
        /// <param name="offsets">Plane offsets; only <c>LumaOffset</c> is read.</param>
        private unsafe static void WriteA8R8G8B8(ResourceManager rm, Surface input, ref OutputSurfaceConfig config, ref PlaneOffsets offsets)
        {
            int width  = input.Width;
            int height = input.Height;
            int stride = GetPitch(width, 4);

            int dstIndex = rm.BufferPool.Rent(height * stride, out Span<byte> dst);

            if (!Ssse3.IsSupported)
            {
                // Scalar path: one pixel at a time through the surface accessors.
                for (int row = 0; row < height; row++)
                {
                    int rowStart = row * stride;

                    for (int col = 0; col < width; col++)
                    {
                        int at = rowStart + col * 4;

                        dst[at + 0] = Downsample(input.GetB(col, row));
                        dst[at + 1] = Downsample(input.GetG(col, row));
                        dst[at + 2] = Downsample(input.GetR(col, row));
                        dst[at + 3] = Downsample(input.GetA(col, row));
                    }
                }
            }
            else
            {
                // Swaps bytes 0 and 2 of every 4-byte group (R,G,B,A -> B,G,R,A).
                Vector128<byte> swapRedBlue = Vector128.Create(
                    (byte)2, (byte)1, (byte)0, (byte)3,
                    (byte)6, (byte)5, (byte)4, (byte)7,
                    (byte)10, (byte)9, (byte)8, (byte)11,
                    (byte)14, (byte)13, (byte)12, (byte)15);

                int vectorWidth = width & ~7;          // largest multiple of 8 pixels per row
                int rowPadding  = stride - width * 4;  // unused bytes at the end of each output row

                fixed (Pixel* srcBase = input.Data)
                fixed (byte* dstBase = dst)
                {
                    Pixel* rowPtr = srcBase;
                    byte*  outPtr = dstBase;

                    for (int row = 0; row < height; row++)
                    {
                        int col = 0;

                        // Eight pixels per iteration: four loads of two pixels each.
                        while (col < vectorWidth)
                        {
                            Pixel* src = rowPtr + (uint)col;

                            Vector128<ushort> pair0 = Sse2.ShiftRightLogical(Sse2.LoadVector128((ushort*)src), 2);
                            Vector128<ushort> pair1 = Sse2.ShiftRightLogical(Sse2.LoadVector128((ushort*)(src + 2)), 2);
                            Vector128<ushort> pair2 = Sse2.ShiftRightLogical(Sse2.LoadVector128((ushort*)(src + 4)), 2);
                            Vector128<ushort> pair3 = Sse2.ShiftRightLogical(Sse2.LoadVector128((ushort*)(src + 6)), 2);

                            Vector128<byte> firstFour = Sse2.PackUnsignedSaturate(pair0.AsInt16(), pair1.AsInt16());
                            Vector128<byte> lastFour  = Sse2.PackUnsignedSaturate(pair2.AsInt16(), pair3.AsInt16());

                            Sse2.Store(outPtr + 0x00, Ssse3.Shuffle(firstFour, swapRedBlue));
                            Sse2.Store(outPtr + 0x10, Ssse3.Shuffle(lastFour, swapRedBlue));

                            outPtr += 0x20;
                            col    += 8;
                        }

                        // Remaining pixels of the row, one at a time.
                        while (col < width)
                        {
                            Pixel* px = rowPtr + (uint)col;

                            outPtr[0] = Downsample(px->B);
                            outPtr[1] = Downsample(px->G);
                            outPtr[2] = Downsample(px->R);
                            outPtr[3] = Downsample(px->A);

                            outPtr += 4;
                            col++;
                        }

                        outPtr += rowPadding;
                        rowPtr += input.Width;
                    }
                }
            }

            bool outLinear    = config.OutBlkKind == 0;
            int  gobBlocksInY = 1 << config.OutBlkHeight;

            WriteBuffer(rm, dst, offsets.LumaOffset, outLinear, width, height, 4, gobBlocksInY);

            rm.BufferPool.Return(dstIndex);
        }
Exemplo n.º 7
0
        /// <summary>
        /// Parses up to 16 numbers from <paramref name="input"/> into the lanes of a 128-bit vector and
        /// repeatedly redistributes the largest lane over all lanes until a configuration repeats.
        /// Sets <c>PartA</c> to the number of steps taken and <c>PartB</c> to the distance between the
        /// two occurrences of the repeated configuration.
        /// </summary>
        /// <param name="input">
        /// ASCII text: any byte below '0' ends the current number, every other byte is treated as a
        /// decimal digit. A <c>null</c> input is ignored.
        /// </param>
        /// <remarks>
        /// Uses SSE/BMI1 instructions without checking <c>IsSupported</c>; callers are presumably
        /// expected to run on hardware that provides them.
        /// </remarks>
        protected override unsafe void ExecuteDay(byte[] input)
        {
            if (input == null)
            {
                return;
            }

            // borrowed liberally from https://github.com/Voltara/advent2017-fast/blob/master/src/day06.c
            var bytes  = stackalloc byte[Vector128<byte>.Count];
            var ulongs = (ulong *)bytes;

            var x   = Vector128<byte>.Zero;
            int n   = 0;
            var ctr = 0;

            for (int i = 0; i < input.Length && ctr < 16; i++)
            {
                if (input[i] < '0')
                {
                    x = x.WithElement(ctr++, (byte)n);
                    n = 0;
                }
                else
                {
                    n = n * 10 + (input[i] - '0');
                }
            }

            // A final number that is not followed by a delimiter (e.g. no trailing newline) would
            // otherwise be dropped. A pending value of zero needs no store: the lane is already zero.
            if (ctr < 16 && n != 0)
            {
                x = x.WithElement(ctr, (byte)n);
            }

            var map = new Dictionary<Vector128<byte>, int>(capacity: PERFORMANCE_NOTE)
            {
                [x] = 0,
            };

            ctr = 0;

            // Shuffle masks for a horizontal max: swap neighbouring bytes, then reverse within
            // 4-byte groups, then within 8-byte halves, then swap the two halves.
            var mask1 = Vector128.Create(0x0607040502030001ul, 0x0e0f0c0d0a0b0809ul).AsByte();
            var mask2 = Vector128.Create(0x0405060700010203ul, 0x0c0d0e0f08090a0bul).AsByte();
            var mask3 = Vector128.Create(0x0001020304050607ul, 0x08090a0b0c0d0e0ful).AsByte();
            var mask4 = Vector128.Create(0x08090a0b0c0d0e0ful, 0x0001020304050607ul).AsByte();

            while (true)
            {
                // get max byte
                var tmp = Avx2.Max(x, Avx2.Shuffle(x, mask1));
                tmp = Avx2.Max(tmp, Avx2.Shuffle(tmp, mask2));
                tmp = Avx2.Max(tmp, Avx2.Shuffle(tmp, mask3));
                tmp = Avx2.Max(tmp, Avx2.Shuffle(tmp, mask4));

                // every byte in tmp should be max value
                var max = Avx2.Extract(tmp, 0);

                // where is it in the original? (lowest lane wins on ties)
                var idx = (int)Bmi1.TrailingZeroCount((uint)
                                                      Avx2.MoveMask(Avx2.CompareEqual(x, tmp)));

                // subtract it from its original place
                // high is all ones when idx is in the upper eight lanes, zero otherwise
                var high  = (ulong)(long)-((idx & 0x08) >> 3);
                var shift = idx << 3;
                ulongs[0] = ((ulong)max << shift) & ~high;
                ulongs[1] = ((ulong)max << shift) & high;
                tmp       = Avx2.Subtract(x, Avx2.LoadVector128(bytes));

                // over 16? add 1 to all
                high      = (ulong)(long)-((max & 0x10) >> 4);
                ulongs[0] = high & 0x0101010101010101ul;
                ulongs[1] = high & 0x0101010101010101ul;
                tmp       = Avx2.Add(tmp, Avx2.LoadVector128(bytes));

                // spread remainder to all
                // bitmask however many we're adding
                max  &= 0x0f;
                shift = max << 3;
                var isLong   = (ulong)(long)-((max & 0x08) >> 3);
                var mask     = (0x1ul << shift) - 1;
                var lowMask  = isLong | mask;
                var highMask = isLong & mask;

                // rotate our start point
                var start = (idx + 1) & 0x0f;
                isLong = (ulong)(long)-((start & 0x08) >> 3);
                var tmpLow  = (~isLong & lowMask) | (isLong & highMask);
                var tmpHigh = (isLong & lowMask) | (~isLong & highMask);

                // doShift is all ones unless start is a multiple of 8; the 64-bit shifts below
                // rely on C# masking the shift count to its low six bits
                var doShift = (ulong)((-(start & 0x07)) >> 4);
                shift   = start << 3;
                lowMask =
                    ((tmpLow << shift | tmpHigh >> (128 - shift)) & doShift) |
                    (~doShift & tmpLow);
                highMask =
                    ((tmpHigh << shift | tmpLow >> (128 - shift)) & doShift) |
                    (~doShift & tmpHigh);

                // build our adders and add values
                ulongs[0] = 0x0101010101010101ul & lowMask;
                ulongs[1] = 0x0101010101010101ul & highMask;
                tmp       = Avx2.Add(tmp, Avx2.LoadVector128(bytes));

                x = tmp;

                ctr++;

                // single lookup instead of ContainsKey followed by the indexer
                if (map.TryGetValue(x, out int firstSeen))
                {
                    PartA = ctr.ToString();
                    PartB = (ctr - firstSeen).ToString();
                    return;
                }

                map[x] = ctr;
            }
        }
Exemplo n.º 8
0
 /// <summary>
 /// Asserts that <c>VectorUtilities.Min</c> of (-0, -1, -2, -3) and (-3, -2, -1, -0) equals (-3, -2, -2, -3).
 /// </summary>
 public static void MinTest()
 {
     var left     = Vector128.Create(-0.0f, -1.0f, -2.0f, -3.0f);
     var right    = Vector128.Create(-3.0f, -2.0f, -1.0f, -0.0f);
     var expected = Vector128.Create(-3.0f, -2.0f, -2.0f, -3.0f);

     Assert.That(() => VectorUtilities.Min(left, right), Is.EqualTo(expected));
 }
Exemplo n.º 9
0
 /// <summary>
 /// Asserts that <c>VectorUtilities.MultiplyAddNegated</c> called with (10, 10, 10, 10), (0, 1, 2, 3)
 /// and (4, 5, 6, 7) equals (10, 5, -2, -11).
 /// </summary>
 public static void MultiplyAddNegatedTest()
 {
     var first    = Vector128.Create(10.0f, 10.0f, 10.0f, 10.0f);
     var second   = Vector128.Create(0.0f, 1.0f, 2.0f, 3.0f);
     var third    = Vector128.Create(4.0f, 5.0f, 6.0f, 7.0f);
     var expected = Vector128.Create(10.0f, 5.0f, -2.0f, -11.0f);

     Assert.That(() => VectorUtilities.MultiplyAddNegated(first, second, third), Is.EqualTo(expected));
 }
Exemplo n.º 10
0
 /// <summary>
 /// Asserts that <c>VectorUtilities.LengthSquared</c> of (0, 1, 2, 3) equals 14 in every lane.
 /// </summary>
 public static void LengthSquaredTest()
 {
     var input    = Vector128.Create(0.0f, 1.0f, 2.0f, 3.0f);
     var expected = Vector128.Create(14.0f, 14.0f, 14.0f, 14.0f);

     Assert.That(() => VectorUtilities.LengthSquared(input), Is.EqualTo(expected));
 }
Exemplo n.º 11
0
 /// <summary>
 /// Asserts that <c>VectorUtilities.Max</c> of (0, 1, 2, 3) and (3, 2, 1, 0) equals (3, 2, 2, 3).
 /// </summary>
 public static void MaxTest()
 {
     var left     = Vector128.Create(0.0f, 1.0f, 2.0f, 3.0f);
     var right    = Vector128.Create(3.0f, 2.0f, 1.0f, 0.0f);
     var expected = Vector128.Create(3.0f, 2.0f, 2.0f, 3.0f);

     Assert.That(() => VectorUtilities.Max(left, right), Is.EqualTo(expected));
 }
Exemplo n.º 12
0
    /// <summary>
    /// Asserts that <c>VectorUtilities.CompareEqualAll</c> is true for two identical vectors and
    /// false when two of the four lanes differ.
    /// </summary>
    public static void CompareEqualAllTest()
    {
        var reference = Vector128.Create(1.0f, 2.0f, 3.0f, 4.0f);
        var identical = Vector128.Create(1.0f, 2.0f, 3.0f, 4.0f);
        var different = Vector128.Create(1.0f, -2.0f, 3.0f, -4.0f);

        Assert.That(() => VectorUtilities.CompareEqualAll(reference, identical), Is.True);

        Assert.That(() => VectorUtilities.CompareEqualAll(reference, different), Is.False);
    }
Exemplo n.º 13
0
 /// <summary>
 /// Asserts that <c>VectorUtilities.Length</c> of (0, 1, 2, 3) equals 3.7416575 in every lane.
 /// </summary>
 public static void LengthTest()
 {
     var input    = Vector128.Create(0.0f, 1.0f, 2.0f, 3.0f);
     var expected = Vector128.Create(3.7416575f, 3.7416575f, 3.7416575f, 3.7416575f);

     Assert.That(() => VectorUtilities.Length(input), Is.EqualTo(expected));
 }
Exemplo n.º 14
0
 /// <summary>
 /// Asserts that <c>VectorUtilities.InterleaveUpper</c> of (0, 1, 2, 3) and (4, 5, 6, 7) equals (2, 6, 3, 7).
 /// </summary>
 public static void InterleaveUpperTest()
 {
     var left     = Vector128.Create(0.0f, 1.0f, 2.0f, 3.0f);
     var right    = Vector128.Create(4.0f, 5.0f, 6.0f, 7.0f);
     var expected = Vector128.Create(2.0f, 6.0f, 3.0f, 7.0f);

     Assert.That(() => VectorUtilities.InterleaveUpper(left, right), Is.EqualTo(expected));
 }
Exemplo n.º 15
0
 /// <summary>
 /// Initializes the hash state to the four MD5 initial chaining values and resets the byte position to zero.
 /// </summary>
 /// <param name="ignored">Unused; it is only handed to <c>Miscellaneous.IgnoreParameter</c>.</param>
 private MD5HashCore(int ignored = default)
 {
     Miscellaneous.IgnoreParameter(ignored);

     _BytePosition = 0;
     _HashCode     = Vector128.Create(0x67452301u, 0xEFCDAB89u, 0x98BADCFEu, 0x10325476u);
 }
Exemplo n.º 16
0
 /// <summary>
 /// Asserts that <c>VectorUtilities.MultiplyByW</c> of (0, 1, 2, 3) and (4, 5, 6, 7) equals (0, 7, 14, 21).
 /// </summary>
 public static void MultiplyByWTest()
 {
     var left     = Vector128.Create(0.0f, 1.0f, 2.0f, 3.0f);
     var right    = Vector128.Create(4.0f, 5.0f, 6.0f, 7.0f);
     var expected = Vector128.Create(0.0f, 7.0f, 14.0f, 21.0f);

     Assert.That(() => VectorUtilities.MultiplyByW(left, right), Is.EqualTo(expected));
 }
Exemplo n.º 17
0
 /// <summary>
 /// Builds a 128-bit vector of four floats by forwarding to <c>Vector128.Create</c>.
 /// </summary>
 /// <param name="x">Value for element 0.</param>
 /// <param name="y">Value for element 1.</param>
 /// <param name="z">Value for element 2.</param>
 /// <param name="w">Value for element 3.</param>
 /// <returns>The vector (x, y, z, w).</returns>
 public static Vector128<float> Create(float x, float y, float z, float w)
 {
     return Vector128.Create(x, y, z, w);
 }
Exemplo n.º 18
0
 /// <summary>
 /// Asserts that <c>VectorUtilities.Normalize</c> of (0, 1, 2, 3) equals
 /// (0, 0.26726124, 0.5345225, 0.8017837).
 /// </summary>
 public static void NormalizeTest()
 {
     var input    = Vector128.Create(0.0f, 1.0f, 2.0f, 3.0f);
     var expected = Vector128.Create(0.0f, 0.26726124f, 0.5345225f, 0.8017837f);

     Assert.That(() => VectorUtilities.Normalize(input), Is.EqualTo(expected));
 }
Exemplo n.º 19
0
        /// <summary>
        /// Writes <paramref name="input"/> as two planes: a one-byte-per-pixel plane taken from the
        /// R component (sent to <c>offsets.LumaOffset</c>) and a two-bytes-per-sample plane taken from
        /// the G and B components of every second pixel of every second row (sent to
        /// <c>offsets.ChromaUOffset</c>).
        /// </summary>
        /// <param name="rm">Supplies the buffer pool; also forwarded to <c>WriteBuffer</c>.</param>
        /// <param name="input">Source surface.</param>
        /// <param name="config">Output configuration; the Out* sizes are stored as "size minus one".</param>
        /// <param name="offsets">Destination plane offsets.</param>
        /// <remarks>
        /// The vector paths assume 8 bytes per <c>Pixel</c> (four 16-bit components), as implied by the
        /// fixed byte offsets used for the loads — confirm against the <c>Pixel</c> definition.
        /// </remarks>
        private unsafe static void WriteNv12(ResourceManager rm, Surface input, ref OutputSurfaceConfig config, ref PlaneOffsets offsets)
        {
            int gobBlocksInY = 1 << config.OutBlkHeight;

            bool outLinear = config.OutBlkKind == 0;

            int width   = Math.Min(config.OutLumaWidth + 1, input.Width);
            int height  = Math.Min(config.OutLumaHeight + 1, input.Height);
            int yStride = GetPitch(config.OutLumaWidth + 1, 1);

            int dstYIndex = rm.BufferPool.Rent((config.OutLumaHeight + 1) * yStride, out Span<byte> dstY);

            if (Sse41.IsSupported)
            {
                // Keeps only the lowest 16 bits of every 64-bit lane, i.e. the first component of each pixel.
                Vector128<ushort> mask = Vector128.Create(0xffffUL).AsUInt16();

                int widthTrunc = width & ~0xf;
                int strideGap  = yStride - width;

                fixed (Pixel *srcPtr = input.Data)
                {
                    Pixel *ip = srcPtr;

                    fixed (byte *dstPtr = dstY)
                    {
                        byte *op = dstPtr;

                        for (int y = 0; y < height; y++, ip += input.Width)
                        {
                            int x = 0;

                            // Sixteen pixels per iteration: eight loads of two pixels each.
                            for (; x < widthTrunc; x += 16)
                            {
                                byte *baseOffset = (byte *)(ip + (ulong)(uint)x);

                                Vector128<ushort> pixelp1 = Sse2.LoadVector128((ushort *)baseOffset);
                                Vector128<ushort> pixelp2 = Sse2.LoadVector128((ushort *)(baseOffset + 0x10));
                                Vector128<ushort> pixelp3 = Sse2.LoadVector128((ushort *)(baseOffset + 0x20));
                                Vector128<ushort> pixelp4 = Sse2.LoadVector128((ushort *)(baseOffset + 0x30));
                                Vector128<ushort> pixelp5 = Sse2.LoadVector128((ushort *)(baseOffset + 0x40));
                                Vector128<ushort> pixelp6 = Sse2.LoadVector128((ushort *)(baseOffset + 0x50));
                                Vector128<ushort> pixelp7 = Sse2.LoadVector128((ushort *)(baseOffset + 0x60));
                                Vector128<ushort> pixelp8 = Sse2.LoadVector128((ushort *)(baseOffset + 0x70));

                                pixelp1 = Sse2.And(pixelp1, mask);
                                pixelp2 = Sse2.And(pixelp2, mask);
                                pixelp3 = Sse2.And(pixelp3, mask);
                                pixelp4 = Sse2.And(pixelp4, mask);
                                pixelp5 = Sse2.And(pixelp5, mask);
                                pixelp6 = Sse2.And(pixelp6, mask);
                                pixelp7 = Sse2.And(pixelp7, mask);
                                pixelp8 = Sse2.And(pixelp8, mask);

                                // Two rounds of 32->16 packing squeeze out the zeroed components,
                                // leaving sixteen consecutive 16-bit values in two vectors.
                                Vector128<ushort> pixelq1 = Sse41.PackUnsignedSaturate(pixelp1.AsInt32(), pixelp2.AsInt32());
                                Vector128<ushort> pixelq2 = Sse41.PackUnsignedSaturate(pixelp3.AsInt32(), pixelp4.AsInt32());
                                Vector128<ushort> pixelq3 = Sse41.PackUnsignedSaturate(pixelp5.AsInt32(), pixelp6.AsInt32());
                                Vector128<ushort> pixelq4 = Sse41.PackUnsignedSaturate(pixelp7.AsInt32(), pixelp8.AsInt32());

                                pixelq1 = Sse41.PackUnsignedSaturate(pixelq1.AsInt32(), pixelq2.AsInt32());
                                pixelq2 = Sse41.PackUnsignedSaturate(pixelq3.AsInt32(), pixelq4.AsInt32());

                                pixelq1 = Sse2.ShiftRightLogical(pixelq1, 2);
                                pixelq2 = Sse2.ShiftRightLogical(pixelq2, 2);

                                Vector128<byte> pixel = Sse2.PackUnsignedSaturate(pixelq1.AsInt16(), pixelq2.AsInt16());

                                Sse2.Store(op, pixel);

                                op += 0x10;
                            }

                            // Remaining pixels of the row, one at a time.
                            for (; x < width; x++)
                            {
                                Pixel *px = ip + (uint)x;

                                *op++ = Downsample(px->R);
                            }

                            op += strideGap;
                        }
                    }
                }
            }
            else
            {
                for (int y = 0; y < height; y++)
                {
                    for (int x = 0; x < width; x++)
                    {
                        dstY[y * yStride + x] = Downsample(input.GetR(x, y));
                    }
                }
            }

            WriteBuffer(
                rm,
                dstY,
                offsets.LumaOffset,
                outLinear,
                config.OutLumaWidth + 1,
                config.OutLumaHeight + 1,
                1,
                gobBlocksInY);

            rm.BufferPool.Return(dstYIndex);

            int uvWidth  = Math.Min(config.OutChromaWidth + 1, (width + 1) >> 1);
            int uvHeight = Math.Min(config.OutChromaHeight + 1, (height + 1) >> 1);
            int uvStride = GetPitch(config.OutChromaWidth + 1, 2);

            int dstUvIndex = rm.BufferPool.Rent((config.OutChromaHeight + 1) * uvStride, out Span<byte> dstUv);

            if (Sse2.IsSupported)
            {
                int widthTrunc = uvWidth & ~7;
                int strideGap  = uvStride - uvWidth * 2;

                fixed (Pixel *srcPtr = input.Data)
                {
                    Pixel *ip = srcPtr;

                    fixed (byte *dstPtr = dstUv)
                    {
                        byte *op = dstPtr;

                        // Every second source row.
                        for (int y = 0; y < uvHeight; y++, ip += input.Width * 2)
                        {
                            int x = 0;

                            // Eight chroma samples per iteration, taken from every second source pixel.
                            for (; x < widthTrunc; x += 8)
                            {
                                byte *baseOffset = (byte *)ip + (ulong)(uint)x * 16;

                                // Each 32-bit load at byte offset 2 picks up the second and third components.
                                Vector128<uint> pixel1 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x02));
                                Vector128<uint> pixel2 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x12));
                                Vector128<uint> pixel3 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x22));
                                Vector128<uint> pixel4 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x32));
                                Vector128<uint> pixel5 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x42));
                                Vector128<uint> pixel6 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x52));
                                Vector128<uint> pixel7 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x62));
                                Vector128<uint> pixel8 = Sse2.LoadScalarVector128((uint *)(baseOffset + 0x72));

                                Vector128<uint> pixel12 = Sse2.UnpackLow(pixel1, pixel2);
                                Vector128<uint> pixel34 = Sse2.UnpackLow(pixel3, pixel4);
                                Vector128<uint> pixel56 = Sse2.UnpackLow(pixel5, pixel6);
                                Vector128<uint> pixel78 = Sse2.UnpackLow(pixel7, pixel8);

                                Vector128<ushort> pixel1234 = Sse2.UnpackLow(pixel12.AsUInt64(), pixel34.AsUInt64()).AsUInt16();
                                Vector128<ushort> pixel5678 = Sse2.UnpackLow(pixel56.AsUInt64(), pixel78.AsUInt64()).AsUInt16();

                                // Shift each 16-bit component on its own. A 64-bit shift would move the low
                                // two bits of one component into the top bits of its neighbour, and the
                                // saturating pack below would then clamp that neighbour to 0 or 255.
                                pixel1234 = Sse2.ShiftRightLogical(pixel1234, 2);
                                pixel5678 = Sse2.ShiftRightLogical(pixel5678, 2);

                                Vector128<byte> pixel = Sse2.PackUnsignedSaturate(pixel1234.AsInt16(), pixel5678.AsInt16());

                                Sse2.Store(op, pixel);

                                op += 0x10;
                            }

                            // Remaining samples of the row, one at a time.
                            for (; x < uvWidth; x++)
                            {
                                Pixel *px = ip + (uint)(x << 1);

                                *op++ = Downsample(px->G);
                                *op++ = Downsample(px->B);
                            }

                            op += strideGap;
                        }
                    }
                }
            }
            else
            {
                for (int y = 0; y < uvHeight; y++)
                {
                    for (int x = 0; x < uvWidth; x++)
                    {
                        int xx = x << 1;
                        int yy = y << 1;

                        int uvOffs = y * uvStride + xx;

                        dstUv[uvOffs + 0] = Downsample(input.GetG(xx, yy));
                        dstUv[uvOffs + 1] = Downsample(input.GetB(xx, yy));
                    }
                }
            }

            WriteBuffer(
                rm,
                dstUv,
                offsets.ChromaUOffset,
                outLinear,
                config.OutChromaWidth + 1,
                config.OutChromaHeight + 1, 2,
                gobBlocksInY);

            rm.BufferPool.Return(dstUvIndex);
        }
Exemplo n.º 20
0
 /// <summary>
 /// Asserts that <c>VectorUtilities.QuaternionConjugate</c> of (1, 2, 3, 4) equals (-1, -2, -3, 4).
 /// </summary>
 public static void QuaternionConjugateTest()
 {
     var input    = Vector128.Create(1.0f, 2.0f, 3.0f, 4.0f);
     var expected = Vector128.Create(-1.0f, -2.0f, -3.0f, 4.0f);

     Assert.That(() => VectorUtilities.QuaternionConjugate(input), Is.EqualTo(expected));
 }
Exemplo n.º 21
0
 // Builds the result from the X, Y, Z and W members of the input, in that order, via Vector128.Create.
 static VectorF SoftwareFallback(Vector4 vector)
     => Vector128.Create(vector.X, vector.Y, vector.Z, vector.W);
Exemplo n.º 22
0
 /// <summary>
 /// Asserts that <c>VectorUtilities.Sqrt</c> of (0, 1, 2, 3) equals (0, 1, 1.4142135, 1.7320508).
 /// </summary>
 public static void SqrtTest()
 {
     var input    = Vector128.Create(0.0f, 1.0f, 2.0f, 3.0f);
     var expected = Vector128.Create(0.0f, 1.0f, 1.4142135f, 1.7320508f);

     Assert.That(() => VectorUtilities.Sqrt(input), Is.EqualTo(expected));
 }
Exemplo n.º 23
0
        /// <summary>
        /// Writes the ascending sequence <paramref name="startValue"/>, <paramref name="startValue"/> + 1, ...
        /// into <paramref name="length"/> consecutive ints starting at <paramref name="targetPtr"/>.
        /// </summary>
        /// <param name="targetPtr">Destination; the caller must guarantee room for <paramref name="length"/> ints.</param>
        /// <param name="startValue">Value stored in the first element.</param>
        /// <param name="length">Number of elements to write. Nothing is written when it is not positive.</param>
        /// <remarks>
        /// Without NETCOREAPP (or without SSE2, or for fewer than two 128-bit vectors of data) the whole
        /// range is filled by the scalar loops. Otherwise the scalar loops only write the first
        /// <c>UnalignedCountVector128(targetPtr)</c> elements (presumably the elements in front of the next
        /// 16-byte boundary - confirm against that helper) and the rest is written with AVX2 or SSE2 stores.
        /// </remarks>
        public static unsafe void SequentialFill(int *targetPtr, int startValue, int length)
        {
            var  value           = startValue;
            nint lengthToProcess = length;

#if NETCOREAPP
            // alignmentCount == length means "everything is handled by the scalar loops below".
            nint alignmentCount = length;
            if (Sse2.IsSupported && length >= Vector128<int>.Count * 2)
            {
                alignmentCount  = UnalignedCountVector128(targetPtr);
                lengthToProcess = alignmentCount;
            }
#endif

            // Scalar part, unrolled by 8.
            while (lengthToProcess >= 8)
            {
                lengthToProcess -= 8;
                *targetPtr++ = value++;
                *targetPtr++ = value++;
                *targetPtr++ = value++;
                *targetPtr++ = value++;
                *targetPtr++ = value++;
                *targetPtr++ = value++;
                *targetPtr++ = value++;
                *targetPtr++ = value++;
            }

            // Scalar part, one block of 4. The original tested "> 4", which sent a remainder of
            // exactly four elements through the single-element loop; the output is the same either way.
            if (lengthToProcess >= 4)
            {
                lengthToProcess -= 4;
                *targetPtr++ = value++;
                *targetPtr++ = value++;
                *targetPtr++ = value++;
                *targetPtr++ = value++;
            }

            // Scalar part, remaining 0..3 elements.
            while (lengthToProcess > 0)
            {
                lengthToProcess--;
                *targetPtr++ = value++;
            }

#if NETCOREAPP
            // Vector part: everything the scalar loops above did not cover.
            lengthToProcess = length - alignmentCount;
            if (lengthToProcess > 0)
            {
                if (Avx2.IsSupported)
                {
                    // lastVector256 always holds the next eight values to be written.
                    var shiftVector256 = Vector256.Create(Vector256<int>.Count);
                    var lastVector256  = Avx2.Add(Vector256.Create(value), VECTOR256_INT_ZERO_TO_SEVEN);

                    while (lengthToProcess >= Vector256<int>.Count)
                    {
                        Avx.Store(targetPtr, lastVector256);
                        lastVector256    = Avx2.Add(lastVector256, shiftVector256);
                        targetPtr       += Vector256<int>.Count;
                        lengthToProcess -= Vector256<int>.Count;
                    }

                    if (lengthToProcess >= Vector128<int>.Count)
                    {
                        // Four to seven elements remain: write the lower half and resume the
                        // scalar sequence at the first value of the upper half.
                        Sse2.Store(targetPtr, lastVector256.GetLower());
                        targetPtr       += Vector128<int>.Count;
                        lengthToProcess -= Vector128<int>.Count;
                        value            = lastVector256.GetElement(Vector128<int>.Count);
                    }
                    else
                    {
                        value = lastVector256.GetElement(0);
                    }
                }
                else if (Sse2.IsSupported)
                {
                    // lastVector128 always holds the next four values to be written.
                    var shiftVector128 = Vector128.Create(Vector128<int>.Count);
                    var lastVector128  = Sse2.Add(Vector128.Create(value), VECTOR128_INT_ZERO_TO_THREE);

                    while (lengthToProcess >= Vector128<int>.Count)
                    {
                        Sse2.Store(targetPtr, lastVector128);
                        lastVector128    = Sse2.Add(lastVector128, shiftVector128);
                        targetPtr       += Vector128<int>.Count;
                        lengthToProcess -= Vector128<int>.Count;
                    }

                    value = lastVector128.GetElement(0);
                }

                // Tail shorter than one 128-bit vector.
                while (lengthToProcess > 0)
                {
                    lengthToProcess--;
                    *targetPtr++ = value++;
                }
            }
#endif
        }
Exemplo n.º 24
0
 /// <summary>
 /// Checks that <c>VectorUtilities.Subtract</c> subtracts the second operand from the first lane by lane.
 /// </summary>
 public static void SubtractTest()
 {
     var minuend    = Vector128.Create(0.0f, 1.0f, 2.0f, 3.0f);
     var subtrahend = Vector128.Create(4.0f, 5.0f, 6.0f, 7.0f);
     var expected   = Vector128.Create(-4.0f, -4.0f, -4.0f, -4.0f);

     Assert.That(() => VectorUtilities.Subtract(minuend, subtrahend), Is.EqualTo(expected));
 }
Exemplo n.º 25
0
        /// <summary>
        /// Maps <paramref name="cp"/> 4-byte pixels from <paramref name="pimage"/> to one-byte values
        /// written to <paramref name="pout"/>, diffusing the per-pixel difference through
        /// <paramref name="perr"/> (one 4-int slot per pixel) and the running vectors kept in the loop.
        /// </summary>
        /// <param name="pimage">Source pixels, 4 bytes each; byte 3 is compared against <c>alphaThreshold</c>.</param>
        /// <param name="perr">Error buffer. The slot at the current position is read and the slot one pixel
        /// before it is written, so the first iteration writes in front of <paramref name="perr"/>
        /// (NOTE(review): presumably the caller passes a pointer offset into a padded buffer - confirm).</param>
        /// <param name="pout">Destination, one byte per pixel.</param>
        /// <param name="pilut">Lookup table read at offsets 0, 256 and 512 plus a channel value; the three
        /// entries are OR-ed into the tree path (presumably three 256-entry tables - confirm).</param>
        /// <param name="ptree">Base of the node array; nodes are addressed as <c>ptree + index</c>.</param>
        /// <param name="ppal">Not referenced in this method.</param>
        /// <param name="nextFree">Index of the next unused node; incremented when a node is created.</param>
        /// <param name="cp">Number of pixels to process. The loop is a do/while, so at least one pixel is always processed.</param>
        private void remapDitherSse2(byte *pimage, int *perr, byte *pout, uint *pilut, OctreeNode *ptree, uint *ppal, ref nuint nextFree, nint cp)
        {
            // Stand-in node for pixels below the alpha threshold; its Sums[3] is palEntries - 1.
            var transnode = new OctreeNode();

            transnode.Sums[3] = (uint)palEntries - 1;

            var vpmax = Vector128.Create((int)byte.MaxValue);
            var vprnd = Vector128.Create(7);
            var vzero = Vector128 <int> .Zero;

            nuint level = leafLevel;
            // Node chosen for the most recent pixel; reused when the next pixel is identical.
            var   prnod = default(OctreeNode *);

            byte *ip = pimage, ipe = ip + cp * sizeof(uint);
            byte *op = pout;
            int * ep = perr;

            // vppix: previous adjusted pixel. vperr: error accumulated for the slot behind ep.
            // vnerr: difference produced by the previous pixel.
            var vppix = vzero;
            var vperr = vzero;
            var vnerr = vzero;

            do
            {
                Vector128 <int> vpix, vdiff;
                if ((byte)ip[3] < alphaThreshold)
                {
                    // Below the alpha threshold: use the stand-in node and contribute no error.
                    vppix = vzero;
                    vdiff = vzero;
                    prnod = &transnode;
                    goto FoundExact;
                }

                // Widen the four bytes of the pixel to four 32-bit lanes.
                if (Sse41.IsSupported)
                {
                    vpix = Sse41.ConvertToVector128Int32(ip);
                }
                else
                {
                    vpix = Sse2.UnpackLow(Sse2.UnpackLow(Sse2.LoadScalarVector128((int *)ip).AsByte(), vzero.AsByte()).AsInt16(), vzero.AsInt16()).AsInt32();
                }

                // verr = 7 + stored error for this position + 7 * previous pixel's difference;
                // the pixel is adjusted by verr >> 4 and then clamped to 0..255.
                var verr = Sse2.Add(Sse2.Add(vprnd, Sse2.LoadVector128(ep)), Sse2.Subtract(Sse2.ShiftLeftLogical(vnerr, 3), vnerr));
                vpix = Sse2.Add(vpix, Sse2.ShiftRightArithmetic(verr, 4));
                vpix = Sse2.Min(vpix.AsInt16(), vpmax.AsInt16()).AsInt32();
                vpix = Sse2.Max(vpix.AsInt16(), vzero.AsInt16()).AsInt32();

                // All 16 bytes equal to the previous adjusted pixel: keep prnod and skip the tree walk.
                if (Sse2.MoveMask(Sse2.CompareEqual(vppix, vpix).AsByte()) == ushort.MaxValue)
                {
                    vdiff = vzero;
                    goto FoundExact;
                }

                vppix = vpix;
                // Combine the table entries for lanes 0, 1 and 2 into the path value; it is consumed
                // three bits at a time, lowest bits first.
                nuint idx =
                    pilut[(nuint)Sse2.ConvertToUInt32(vppix.AsUInt32())] |
                    pilut[(nuint)Sse2.Extract(vppix.AsUInt16(), 2) + 256] |
                    pilut[(nuint)Sse2.Extract(vppix.AsUInt16(), 4) + 512];
                nuint next = idx & 7;

                var pnode = ptree + next;
                for (nuint i = 0; i <= level; i++)
                {
                    idx >>= 3;
                    nuint child = idx & 7;

                    // A node is read as 8 ushort child indices followed by uint sums
                    // (NOTE(review): layout inferred from the casts here - confirm against OctreeNode).
                    ushort *children = (ushort *)pnode;
                    next = children[child];
                    if (next == 0)
                    {
                        uint *sums = (uint *)(children + 8);

                        if (i < minLeafLevel)
                        {
                            // Missing child above the minimum leaf level: create it.
                            next            = nextFree++;
                            children[child] = (ushort)next;
                            pnode           = ptree + next;

                            if (i == minLeafLevel - 1)
                            {
                                // The new node sits at the minimum leaf level: initialise it from
                                // the pixel and use it as the result.
                                initNode(pnode, vppix);
                                break;
                            }
                            else
                            {
                                // Tag the new node's sums[3] with byte.MaxValue
                                // (presumably marking it as a non-leaf - confirm).
                                uint *csums = (uint *)((ushort *)pnode + 8);
                                csums[3] = byte.MaxValue;
                            }
                        }
                        else if ((byte)sums[3] == byte.MaxValue)
                        {
                            // Current node carries the byte.MaxValue tag but lacks this child:
                            // probe the other seven child slots in child ^ j order.
                            for (nuint j = 1; j < 8; j++)
                            {
                                nuint sibling = children[child ^ j];
                                if (sibling != 0)
                                {
                                    var   snode = ptree + sibling;
                                    uint *ssums = (uint *)((ushort *)snode + 8);
                                    if ((byte)ssums[3] == byte.MaxValue)
                                    {
                                        // Sibling is tagged as well: continue the walk through it.
                                        next = sibling;
                                        nuint mask = child ^ sibling;
                                        idx = (child & mask) | (idx & ~mask);
                                        break;
                                    }
                                    else
                                    {
                                        // Untagged sibling: use it as the result node.
                                        prnod = snode;
                                        goto Found;
                                    }
                                }
                            }
                        }
                        else
                        {
                            // Current node is untagged: it is the result node.
                            break;
                        }
                    }

                    pnode = ptree + next;
                }

                prnod = pnode;

Found:
                // Difference between the adjusted pixel and the four ints stored after the node's child table.
                vdiff = Sse2.Subtract(vppix, Sse2.LoadVector128((int *)((ushort *)prnod + 8)));

FoundExact:
                int *psums = (int *)((ushort *)prnod + 8);

                ip += sizeof(uint);
                // The low byte of the node's fourth sum is the value emitted for this pixel.
                *op++ = (byte)psums[3];

                // Finish the slot one pixel behind ep with vperr + 2 * vdiff, then advance.
                Sse2.Store(ep - Vector128 <int> .Count, Sse2.Add(vperr, Sse2.Add(vdiff, vdiff)));
                ep += Vector128 <int> .Count;

                // Start the next slot with 4 * vdiff + previous difference.
                vperr = Sse2.Add(Sse2.ShiftLeftLogical(vdiff, 2), vnerr);
                vnerr = vdiff;
            } while (ip < ipe);

            // Flush the accumulated error for the last processed pixel's slot.
            Sse2.Store(ep - Vector128 <int> .Count, vperr);
        }
Exemplo n.º 26
0
 /// <summary>
 /// Checks that <c>VectorUtilities.CompareGreaterThanOrEqual</c> yields an all-ones lane mask
 /// when every left lane is greater than or equal to the matching right lane.
 /// </summary>
 public static void CompareGreaterThanOrEqualTest()
 {
     var left     = Vector128.Create(1.0f, 2.0f, 3.0f, 4.0f);
     var right    = Vector128.Create(1.0f, -2.0f, 3.0f, -4.0f);
     var expected = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);

     Assert.That(() => VectorUtilities.CompareGreaterThanOrEqual(left, right).AsUInt32(), Is.EqualTo(expected));
 }
Exemplo n.º 27
0
        /// <summary>
        /// Invokes every <c>As*</c> reinterpretation method of <c>Vector128&lt;UInt64&gt;</c> through
        /// reflection, each on a freshly generated broadcast value, and validates the result.
        /// </summary>
        public void RunReflectionScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunReflectionScenario));

            // Resolves the parameterless method called methodName on Vector128<UInt64> and
            // invokes it on the given vector.
            object InvokeConversion(string methodName, Vector128<UInt64> input)
            {
                return typeof(Vector128<UInt64>)
                       .GetMethod(methodName, new Type[] { })
                       .Invoke(input, new object[] { });
            }

            Vector128<UInt64> source;

            source = Vector128.Create(TestLibrary.Generator.GetUInt64());
            object asByte = InvokeConversion(nameof(Vector128<UInt64>.AsByte), source);
            ValidateResult((Vector128<byte>)(asByte), source);

            source = Vector128.Create(TestLibrary.Generator.GetUInt64());
            object asDouble = InvokeConversion(nameof(Vector128<UInt64>.AsDouble), source);
            ValidateResult((Vector128<double>)(asDouble), source);

            source = Vector128.Create(TestLibrary.Generator.GetUInt64());
            object asInt16 = InvokeConversion(nameof(Vector128<UInt64>.AsInt16), source);
            ValidateResult((Vector128<short>)(asInt16), source);

            source = Vector128.Create(TestLibrary.Generator.GetUInt64());
            object asInt32 = InvokeConversion(nameof(Vector128<UInt64>.AsInt32), source);
            ValidateResult((Vector128<int>)(asInt32), source);

            source = Vector128.Create(TestLibrary.Generator.GetUInt64());
            object asInt64 = InvokeConversion(nameof(Vector128<UInt64>.AsInt64), source);
            ValidateResult((Vector128<long>)(asInt64), source);

            source = Vector128.Create(TestLibrary.Generator.GetUInt64());
            object asSByte = InvokeConversion(nameof(Vector128<UInt64>.AsSByte), source);
            ValidateResult((Vector128<sbyte>)(asSByte), source);

            source = Vector128.Create(TestLibrary.Generator.GetUInt64());
            object asSingle = InvokeConversion(nameof(Vector128<UInt64>.AsSingle), source);
            ValidateResult((Vector128<float>)(asSingle), source);

            source = Vector128.Create(TestLibrary.Generator.GetUInt64());
            object asUInt16 = InvokeConversion(nameof(Vector128<UInt64>.AsUInt16), source);
            ValidateResult((Vector128<ushort>)(asUInt16), source);

            source = Vector128.Create(TestLibrary.Generator.GetUInt64());
            object asUInt32 = InvokeConversion(nameof(Vector128<UInt64>.AsUInt32), source);
            ValidateResult((Vector128<uint>)(asUInt32), source);

            source = Vector128.Create(TestLibrary.Generator.GetUInt64());
            object asUInt64 = InvokeConversion(nameof(Vector128<UInt64>.AsUInt64), source);
            ValidateResult((Vector128<ulong>)(asUInt64), source);
        }
Exemplo n.º 28
0
        /// <summary>
        /// Returns a new <c>VectorArg128</c> built from <c>_rgb</c> with <paramref name="f"/> added to every lane.
        /// </summary>
        public VectorArg128 Change(float f)
        {
            // Broadcast f to all four lanes, then add it to the stored vector.
            var shifted = Sse.Add(Vector128.Create(f), _rgb);

            return new VectorArg128(shifted);
        }
Exemplo n.º 29
0
        // Returns &inputBuffer[inputLength] if the input buffer is valid.
        /// <summary>
        /// Given an input buffer <paramref name="pInputBuffer"/> of char length <paramref name="inputLength"/>,
        /// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
        /// </summary>
        /// <remarks>
        /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
        /// </remarks>
        public static char *GetPointerToFirstInvalidChar(char *pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment)
        {
            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");

            // First, we'll handle the common case of all-ASCII. If this is able to
            // consume the entire buffer, we'll skip the remainder of this method's logic.

            int numAsciiCharsConsumedJustNow = (int)ASCIIUtility.GetIndexOfFirstNonAsciiChar(pInputBuffer, (uint)inputLength);

            Debug.Assert(0 <= numAsciiCharsConsumedJustNow && numAsciiCharsConsumedJustNow <= inputLength);

            pInputBuffer += (uint)numAsciiCharsConsumedJustNow;
            inputLength  -= numAsciiCharsConsumedJustNow;

            if (inputLength == 0)
            {
                utf8CodeUnitCountAdjustment = 0;
                scalarCountAdjustment       = 0;
                return(pInputBuffer);
            }

            // If we got here, it means we saw some non-ASCII data, so within our
            // vectorized code paths below we'll handle all non-surrogate UTF-16
            // code points branchlessly. We'll only branch if we see surrogates.
            //
            // We still optimistically assume the data is mostly ASCII. This means that the
            // number of UTF-8 code units and the number of scalars almost matches the number
            // of UTF-16 code units. As we go through the input and find non-ASCII
            // characters, we'll keep track of these "adjustment" fixups. To get the
            // total number of UTF-8 code units required to encode the input data, add
            // the UTF-8 code unit count adjustment to the number of UTF-16 code units
            // seen.  To get the total number of scalars present in the input data,
            // add the scalar count adjustment to the number of UTF-16 code units seen.

            long tempUtf8CodeUnitCountAdjustment = 0;
            int  tempScalarCountAdjustment       = 0;

            if ((AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || Sse2.IsSupported)
            {
                if (inputLength >= Vector128 <ushort> .Count)
                {
                    Vector128 <ushort> vector0080 = Vector128.Create((ushort)0x80);
                    Vector128 <ushort> vectorA800 = Vector128.Create((ushort)0xA800);
                    Vector128 <short>  vector8800 = Vector128.Create(unchecked ((short)0x8800));
                    Vector128 <ushort> vectorZero = Vector128 <ushort> .Zero;
                    do
                    {
                        Vector128 <ushort> utf16Data;
                        if (AdvSimd.Arm64.IsSupported)
                        {
                            utf16Data = AdvSimd.LoadVector128((ushort *)pInputBuffer); // unaligned
                        }
                        else
                        {
                            utf16Data = Sse2.LoadVector128((ushort *)pInputBuffer); // unaligned
                        }

                        Vector128 <ushort> charIsNonAscii;

                        if (AdvSimd.Arm64.IsSupported)
                        {
                            // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                            // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)
                            charIsNonAscii = AdvSimd.Min(utf16Data, vector0080);
                        }
                        else if (Sse41.IsSupported)
                        {
                            // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                            // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)
                            charIsNonAscii = Sse41.Min(utf16Data, vector0080);
                        }
                        else
                        {
                            // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                            // input was 0x0080 <= [value] <= 0x7FFF. The case where 0x8000 <= [value] will
                            // be handled in a few lines.

                            charIsNonAscii = Sse2.AndNot(Sse2.CompareGreaterThan(vector0080.AsInt16(), utf16Data.AsInt16()).AsUInt16(), vector0080);
                        }

#if DEBUG
                        // Quick check to ensure we didn't accidentally set the 0x8000 bit of any element.
                        uint debugMask;
                        if (AdvSimd.Arm64.IsSupported)
                        {
                            debugMask = GetNonAsciiBytes(charIsNonAscii.AsByte());
                        }
                        else
                        {
                            debugMask = (uint)Sse2.MoveMask(charIsNonAscii.AsByte());
                        }
                        Debug.Assert((debugMask & 0b_1010_1010_1010_1010) == 0, "Shouldn't have set the 0x8000 bit of any element in 'charIsNonAscii'.");
#endif // DEBUG

                        // Sets the 0x8080 bits of each element in 'charIsNonAscii' if the corresponding
                        // input was 0x0800 <= [value]. This also handles the missing range a few lines above.

                        Vector128 <ushort> charIsThreeByteUtf8Encoded;
                        uint mask;

                        if (AdvSimd.IsSupported)
                        {
                            charIsThreeByteUtf8Encoded = AdvSimd.Subtract(vectorZero, AdvSimd.ShiftRightLogical(utf16Data, 11));
                            mask = GetNonAsciiBytes(AdvSimd.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
                        }
                        else
                        {
                            charIsThreeByteUtf8Encoded = Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 11));
                            mask = (uint)Sse2.MoveMask(Sse2.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
                        }

                        // Each even bit of mask will be 1 only if the char was >= 0x0080,
                        // and each odd bit of mask will be 1 only if the char was >= 0x0800.
                        //
                        // Example for UTF-16 input "[ 0123 ] [ 1234 ] ...":
                        //
                        //            ,-- set if char[1] is >= 0x0800
                        //            |   ,-- set if char[0] is >= 0x0800
                        //            v   v
                        // mask = ... 1 1 0 1
                        //              ^   ^-- set if char[0] is non-ASCII
                        //              `-- set if char[1] is non-ASCII
                        //
                        // This means we can popcnt the number of set bits, and the result is the
                        // number of *additional* UTF-8 bytes that each UTF-16 code unit requires as
                        // it expands. This results in the wrong count for UTF-16 surrogate code
                        // units (we just counted that each individual code unit expands to 3 bytes,
                        // but in reality a well-formed UTF-16 surrogate pair expands to 4 bytes).
                        // We'll handle this in just a moment.
                        //
                        // For now, compute the popcnt but squirrel it away. We'll fold it in to the
                        // cumulative UTF-8 adjustment factor once we determine that there are no
                        // unpaired surrogates in our data. (Unpaired surrogates would invalidate
                        // our computed result and we'd have to throw it away.)

                        uint popcnt = (uint)BitOperations.PopCount(mask);

                        // Surrogates need to be special-cased for two reasons: (a) we need
                        // to account for the fact that we over-counted in the addition above;
                        // and (b) they require separate validation.
                        if (AdvSimd.Arm64.IsSupported)
                        {
                            utf16Data = AdvSimd.Add(utf16Data, vectorA800);
                            mask      = GetNonAsciiBytes(AdvSimd.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
                        }
                        else
                        {
                            utf16Data = Sse2.Add(utf16Data, vectorA800);
                            mask      = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
                        }

                        if (mask != 0)
                        {
                            // There's at least one UTF-16 surrogate code unit present.
                            // Since we performed a pmovmskb operation on the result of a 16-bit pcmpgtw,
                            // the resulting bits of 'mask' will occur in pairs:
                            // - 00 if the corresponding UTF-16 char was not a surrogate code unit;
                            // - 11 if the corresponding UTF-16 char was a surrogate code unit.
                            //
                            // A UTF-16 high/low surrogate code unit has the bit pattern [ 11011q## ######## ],
                            // where # is any bit; q = 0 represents a high surrogate, and q = 1 represents
                            // a low surrogate. Since we added 0xA800 in the vectorized operation above,
                            // our surrogate pairs will now have the bit pattern [ 10000q## ######## ].
                            // If we logical right-shift each word by 3, we'll end up with the bit pattern
                            // [ 00010000 q####### ], which means that we can immediately use pmovmskb to
                            // determine whether a given char was a high or a low surrogate.
                            //
                            // Therefore the resulting bits of 'mask2' will occur in pairs:
                            // - 00 if the corresponding UTF-16 char was a high surrogate code unit;
                            // - 01 if the corresponding UTF-16 char was a low surrogate code unit;
                            // - ## (garbage) if the corresponding UTF-16 char was not a surrogate code unit.
                            //   Since 'mask' already has 00 in these positions (since the corresponding char
                            //   wasn't a surrogate), "mask AND mask2 == 00" holds for these positions.

                            uint mask2;
                            if (AdvSimd.Arm64.IsSupported)
                            {
                                mask2 = GetNonAsciiBytes(AdvSimd.ShiftRightLogical(utf16Data, 3).AsByte());
                            }
                            else
                            {
                                mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte());
                            }

                            // 'lowSurrogatesMask' has its bits occur in pairs:
                            // - 01 if the corresponding char was a low surrogate char,
                            // - 00 if the corresponding char was a high surrogate char or not a surrogate at all.

                            uint lowSurrogatesMask = mask2 & mask;

                            // 'highSurrogatesMask' has its bits occur in pairs:
                            // - 01 if the corresponding char was a high surrogate char,
                            // - 00 if the corresponding char was a low surrogate char or not a surrogate at all.

                            uint highSurrogatesMask = (mask2 ^ 0b_0101_0101_0101_0101u /* flip all even-numbered bits 00 <-> 01 */) & mask;

                            Debug.Assert((highSurrogatesMask & lowSurrogatesMask) == 0,
                                         "A char cannot simultaneously be both a high and a low surrogate char.");

                            Debug.Assert(((highSurrogatesMask | lowSurrogatesMask) & 0b_1010_1010_1010_1010u) == 0,
                                         "Only even bits (no odd bits) of the masks should be set.");

                            // Now check that each high surrogate is followed by a low surrogate and that each
                            // low surrogate follows a high surrogate. We make an exception for the case where
                            // the final char of the vector is a high surrogate, since we can't perform validation
                            // on it until the next iteration of the loop when we hope to consume the matching
                            // low surrogate.

                            highSurrogatesMask <<= 2;
                            if ((ushort)highSurrogatesMask != lowSurrogatesMask)
                            {
                                goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                            }

                            if (highSurrogatesMask > ushort.MaxValue)
                            {
                                // There was a standalone high surrogate at the end of the vector.
                                // We'll adjust our counters so that we don't consider this char consumed.

                                highSurrogatesMask = (ushort)highSurrogatesMask; // don't allow stray high surrogate to be consumed by popcnt
                                popcnt            -= 2;                          // the '0xC000_0000' bits in the original mask are shifted out and discarded, so account for that here
                                pInputBuffer--;
                                inputLength++;
                            }

                            // If we're 64-bit, we can perform the zero-extension of the surrogate pairs count for
                            // free right now, saving the extension step a few lines below. If we're 32-bit, the
                            // convertion to nuint immediately below is a no-op, and we'll pay the cost of the real
                            // 64 -bit extension a few lines below.
                            nuint surrogatePairsCountNuint = (uint)BitOperations.PopCount(highSurrogatesMask);

                            // 2 UTF-16 chars become 1 Unicode scalar

                            tempScalarCountAdjustment -= (int)surrogatePairsCountNuint;

                            // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                            // it'd be encoded as 3 UTF-8 code units, so our earlier popcnt computation
                            // assumes that the pair is encoded as 6 UTF-8 code units. Since each
                            // pair is in reality only encoded as 4 UTF-8 code units, we need to
                            // perform this adjustment now.

                            if (IntPtr.Size == 8)
                            {
                                // Since we've already zero-extended surrogatePairsCountNuint, we can directly
                                // sub + sub. It's more efficient than shl + sub.
                                tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
                                tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
                            }
                            else
                            {
                                // Take the hit of the 64-bit extension now.
                                tempUtf8CodeUnitCountAdjustment -= 2 * (uint)surrogatePairsCountNuint;
                            }
                        }

                        tempUtf8CodeUnitCountAdjustment += popcnt;
                        pInputBuffer += Vector128 <ushort> .Count;
                        inputLength  -= Vector128 <ushort> .Count;
                    } while (inputLength >= Vector128 <ushort> .Count);
                }
            }
            else if (Vector.IsHardwareAccelerated)
            {
                if (inputLength >= Vector <ushort> .Count)
                {
                    Vector <ushort> vector0080 = new Vector <ushort>(0x0080);
                    Vector <ushort> vector0400 = new Vector <ushort>(0x0400);
                    Vector <ushort> vector0800 = new Vector <ushort>(0x0800);
                    Vector <ushort> vectorD800 = new Vector <ushort>(0xD800);

                    do
                    {
                        // The 'twoOrMoreUtf8Bytes' and 'threeOrMoreUtf8Bytes' vectors will contain
                        // elements whose values are 0xFFFF (-1 as signed word) iff the corresponding
                        // UTF-16 code unit was >= 0x0080 and >= 0x0800, respectively. By summing these
                        // vectors, each element of the sum will contain one of three values:
                        //
                        // 0x0000 ( 0) = original char was 0000..007F
                        // 0xFFFF (-1) = original char was 0080..07FF
                        // 0xFFFE (-2) = original char was 0800..FFFF
                        //
                        // We'll negate them to produce a value 0..2 for each element, then sum all the
                        // elements together to produce the number of *additional* UTF-8 code units
                        // required to represent this UTF-16 data. This is similar to the popcnt step
                        // performed by the SSE2 code path. This will overcount surrogates, but we'll
                        // handle that shortly.

                        Vector <ushort>  utf16Data            = Unsafe.ReadUnaligned <Vector <ushort> >(pInputBuffer);
                        Vector <ushort>  twoOrMoreUtf8Bytes   = Vector.GreaterThanOrEqual(utf16Data, vector0080);
                        Vector <ushort>  threeOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0800);
                        Vector <nuint_t> sumVector            = (Vector <nuint_t>)(Vector <ushort> .Zero - twoOrMoreUtf8Bytes - threeOrMoreUtf8Bytes);

                        // We'll try summing by a natural word (rather than a 16-bit word) at a time,
                        // which should halve the number of operations we must perform.

                        nuint popcnt = 0;
                        for (int i = 0; i < Vector <nuint_t> .Count; i++)
                        {
                            popcnt += (nuint)sumVector[i];
                        }

                        uint popcnt32 = (uint)popcnt;
                        if (IntPtr.Size == 8)
                        {
                            popcnt32 += (uint)(popcnt >> 32);
                        }

                        // As in the SSE4.1 paths, compute popcnt but don't fold it in until we
                        // know there aren't any unpaired surrogates in the input data.

                        popcnt32 = (ushort)popcnt32 + (popcnt32 >> 16);

                        // Now check for surrogates.

                        utf16Data -= vectorD800;
                        Vector <ushort> surrogateChars = Vector.LessThan(utf16Data, vector0800);
                        if (surrogateChars != Vector <ushort> .Zero)
                        {
                            // There's at least one surrogate (high or low) UTF-16 code unit in
                            // the vector. We'll build up additional vectors: 'highSurrogateChars'
                            // and 'lowSurrogateChars', where the elements are 0xFFFF iff the original
                            // UTF-16 code unit was a high or low surrogate, respectively.

                            Vector <ushort> highSurrogateChars = Vector.LessThan(utf16Data, vector0400);
                            Vector <ushort> lowSurrogateChars  = Vector.AndNot(surrogateChars, highSurrogateChars);

                            // We want to make sure that each high surrogate code unit is followed by
                            // a low surrogate code unit and each low surrogate code unit follows a
                            // high surrogate code unit. Since we don't have an equivalent of pmovmskb
                            // or palignr available to us, we'll do this as a loop. We won't look at
                            // the very last high surrogate char element since we don't yet know if
                            // the next vector read will have a low surrogate char element.

                            if (lowSurrogateChars[0] != 0)
                            {
                                goto Error; // error: start of buffer contains standalone low surrogate char
                            }

                            ushort surrogatePairsCount = 0;
                            for (int i = 0; i < Vector <ushort> .Count - 1; i++)
                            {
                                surrogatePairsCount -= highSurrogateChars[i]; // turns into +1 or +0
                                if (highSurrogateChars[i] != lowSurrogateChars[i + 1])
                                {
                                    goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                                }
                            }

                            if (highSurrogateChars[Vector <ushort> .Count - 1] != 0)
                            {
                                // There was a standalone high surrogate at the end of the vector.
                                // We'll adjust our counters so that we don't consider this char consumed.

                                pInputBuffer--;
                                inputLength++;
                                popcnt32 -= 2;
                            }

                            nint surrogatePairsCountNint = (nint)surrogatePairsCount; // zero-extend to native int size

                            // 2 UTF-16 chars become 1 Unicode scalar

                            tempScalarCountAdjustment -= (int)surrogatePairsCountNint;

                            // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                            // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only
                            // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total),
                            // so we'll adjust this now.

                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                        }

                        tempUtf8CodeUnitCountAdjustment += popcnt32;
                        pInputBuffer += Vector <ushort> .Count;
                        inputLength  -= Vector <ushort> .Count;
                    } while (inputLength >= Vector <ushort> .Count);
                }
            }

NonVectorizedLoop:

            // Vectorization isn't supported on our current platform, or the input was too small to benefit
            // from vectorization, or we saw invalid UTF-16 data in the vectorized code paths and need to
            // drain remaining valid chars before we report failure.

            for (; inputLength > 0; pInputBuffer++, inputLength--)
            {
                uint thisChar = pInputBuffer[0];
                if (thisChar <= 0x7F)
                {
                    continue;
                }

                // Bump adjustment by +1 for U+0080..U+07FF; by +2 for U+0800..U+FFFF.
                // This optimistically assumes no surrogates, which we'll handle shortly.

                tempUtf8CodeUnitCountAdjustment += (thisChar + 0x0001_F800u) >> 16;

                if (!UnicodeUtility.IsSurrogateCodePoint(thisChar))
                {
                    continue;
                }

                // Found a surrogate char. Back out the adjustment we made above, then
                // try to consume the entire surrogate pair all at once. We won't bother
                // trying to interpret the surrogate pair as a scalar value; we'll only
                // validate that its bit pattern matches what's expected for a surrogate pair.

                tempUtf8CodeUnitCountAdjustment -= 2;

                if (inputLength == 1)
                {
                    goto Error; // input buffer too small to read a surrogate pair
                }

                thisChar = Unsafe.ReadUnaligned <uint>(pInputBuffer);
                if (((thisChar - (BitConverter.IsLittleEndian ? 0xDC00_D800u : 0xD800_DC00u)) & 0xFC00_FC00u) != 0)
                {
                    goto Error; // not a well-formed surrogate pair
                }

                tempScalarCountAdjustment--;          // 2 UTF-16 code units -> 1 scalar
                tempUtf8CodeUnitCountAdjustment += 2; // 2 UTF-16 code units -> 4 UTF-8 code units

                pInputBuffer++;                       // consumed one extra char
                inputLength--;
            }

Error:

            // Also used for normal return.

            utf8CodeUnitCountAdjustment = tempUtf8CodeUnitCountAdjustment;
            scalarCountAdjustment       = tempScalarCountAdjustment;
            return(pInputBuffer);
        }
Exemplo n.º 30
0
 /// <summary>
 /// Checks that <c>VectorUtilities.DotProduct</c>, applied to (0, 1, 2, 3) and (4, 5, 6, 7),
 /// yields a vector whose four elements are all 38.
 /// </summary>
 public static void DotProductTest()
 {
     Vector128<float> left = Vector128.Create(0.0f, 1.0f, 2.0f, 3.0f);
     Vector128<float> right = Vector128.Create(4.0f, 5.0f, 6.0f, 7.0f);

     // 0*4 + 1*5 + 2*6 + 3*7 = 38; the test expects that value in every element.
     Vector128<float> expected = Vector128.Create(38.0f, 38.0f, 38.0f, 38.0f);

     // Pass a delegate so the call under test is evaluated by the assertion itself.
     Assert.That(() => VectorUtilities.DotProduct(left, right), Is.EqualTo(expected));
 }