C# (CSharp) Vector128.AsInt16 примеры использования

Язык программирования: C# (CSharp)

Класс/Тип: Vector128

Метод/Функция: AsInt16

Примеров на hotexamples.com: 15

C# (CSharp) Vector128.AsInt16 — найдено 15 примеров. Это лучшие примеры кода на C# (CSharp) с использованием Vector128.AsInt16, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество подборки.

Основные методы

Показать Скрыть

GetElement(30)

CreateScalarUnsafe(30)

Create(30)

CreateScalar(28)

AsByte(26)

AsUInt64(21)

As(18)

AsInt32(16)

AsInt16(15)

AsSByte(14)

AsUInt32(14)

Equals(12)

Element(10)

ConditionalSelect(8)

GreaterThanAny(7)

GreaterThanOrEqualAll(7)

EqualsAny(7)

GreaterThanAll(7)

GreaterThan(7)

GreaterThanOrEqualAny(7)

LessThanOrEqualAll(7)

LessThanAll(7)

LessThanAny(7)

Add(7)

LessThanOrEqualAny(7)

AsSingle(7)

AsInt64(7)

EqualsAll(7)

LessThan(6)

LessThanOrEqual(6)

GreaterThanOrEqual(6)

Max(6)

Min(6)

Multiply(6)

Narrow(6)

Negate(6)

Sqrt(6)

Abs(6)

Floor(6)

Divide(6)

AndNot(6)

AsUInt16(6)

BitwiseOr(6)

Subtract(6)

LoadUnsafe(5)

ConvertToUInt32(5)

ConvertToSingle(5)

ConvertToInt64(5)

ConvertToInt32(5)

ConvertToDouble(5)

Пример #1

Показать файл

Файл: BC7Encoder.cs Проект: Ryujinx/Ryujinx

        private static unsafe int GetEndPointSelectionErrorFast(ReadOnlySpan <uint> tile, int subsetCount, int partition, int w, int h, int maxError)
        {
            byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];

            Span <RgbaColor8> minColors = stackalloc RgbaColor8[subsetCount];
            Span <RgbaColor8> maxColors = stackalloc RgbaColor8[subsetCount];

            BC67Utils.GetMinMaxColors(partitionTable, tile, w, h, minColors, maxColors, subsetCount);

            Span <uint> endPoints0 = stackalloc uint[subsetCount];
            Span <uint> endPoints1 = stackalloc uint[subsetCount];

            SelectEndPointsFast(partitionTable, tile, w, h, subsetCount, minColors, maxColors, endPoints0, endPoints1, uint.MaxValue);

            Span <RgbaColor32> palette = stackalloc RgbaColor32[8];

            int errorSum = 0;

            for (int subset = 0; subset < subsetCount; subset++)
            {
                RgbaColor32 blockDir = maxColors[subset].GetColor32() - minColors[subset].GetColor32();
                int         sum      = blockDir.R + blockDir.G + blockDir.B + blockDir.A;
                if (sum != 0)
                {
                    blockDir = (blockDir << 6) / new RgbaColor32(sum);
                }

                uint c0 = endPoints0[subset];
                uint c1 = endPoints1[subset];

                int pBit0 = GetPBit(c0, 6, 0);
                int pBit1 = GetPBit(c1, 6, 0);

                c0 = BC67Utils.Quantize(RgbaColor8.FromUInt32(c0), 6, 0, pBit0).ToUInt32();
                c1 = BC67Utils.Quantize(RgbaColor8.FromUInt32(c1), 6, 0, pBit1).ToUInt32();

                if (Sse41.IsSupported)
                {
                    Vector128 <byte> c0Rep = Vector128.Create(c0).AsByte();
                    Vector128 <byte> c1Rep = Vector128.Create(c1).AsByte();

                    Vector128 <byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);

                    Vector128 <byte> rWeights;
                    Vector128 <byte> lWeights;

                    fixed(byte *pWeights = BC67Tables.Weights[1], pInvWeights = BC67Tables.InverseWeights[1])
                    {
                        rWeights = Sse2.LoadScalarVector128((ulong *)pWeights).AsByte();
                        lWeights = Sse2.LoadScalarVector128((ulong *)pInvWeights).AsByte();
                    }

                    Vector128 <byte> iWeights   = Sse2.UnpackLow(rWeights, lWeights);
                    Vector128 <byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
                    Vector128 <byte> iWeights23 = Sse2.UnpackHigh(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
                    Vector128 <byte> iWeights0  = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
                    Vector128 <byte> iWeights1  = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
                    Vector128 <byte> iWeights2  = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
                    Vector128 <byte> iWeights3  = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();

Пример #2

Показать файл

Файл: numberparsing.cs Проект: lhutyra/SimdJsonSharp

        /// <summary>
        /// Converts the ASCII decimal digits at <paramref name="chars"/> into a single unsigned value
        /// using SSE multiply-add reductions.
        /// </summary>
        /// <param name="chars">
        /// Pointer to the digit characters. A full 128-bit (16-byte) load is issued from this address,
        /// so 16 bytes must be readable even though only the first eight digits contribute to the result.
        /// </param>
        /// <returns>The value taken from the lowest 32-bit lane of the final reduction.</returns>
        /// <remarks>
        /// NOTE(review): mul_1_10, mul_1_100 and mul_1_10000 are declared outside this block; presumably
        /// they hold the positional weights for each reduction step - confirm against their definitions.
        /// </remarks>
        private static uint32_t parse_eight_digits_unrolled(bytechar *chars)
        {
            // The load brings in 16 characters, so twice as many lanes as needed are processed.
            Vector128<sbyte> rawChars    = Sse2.LoadVector128(chars);
            Vector128<sbyte> digitValues = Sse2.Subtract(rawChars, Vector128.Create((bytechar)'0'));

            // Three successive multiply-add steps merge adjacent lanes: bytes -> words -> dwords -> dwords.
            Vector128<short>  pairSums   = Ssse3.MultiplyAddAdjacent(digitValues.AsByte(), mul_1_10);
            Vector128<int>    quadSums   = Sse2.MultiplyAddAdjacent(pairSums, mul_1_100);
            Vector128<ushort> quadWords  = Sse41.PackUnsignedSaturate(quadSums, quadSums);
            Vector128<int>    octetSums  = Sse2.MultiplyAddAdjacent(quadWords.AsInt16(), mul_1_10000);

            // Only the lowest lane (the first eight digits) is returned; the other lanes are discarded.
            return Sse2.ConvertToUInt32(octetSums.AsUInt32());
        }

Пример #3

Показать файл

Файл: VectorHelpers.cs Проект: layomia/dotnet_runtime

 /// <summary>
 /// Adds two 128-bit vectors lane by lane, choosing the SSE/SSE2 <c>Add</c> overload that matches
 /// <typeparamref name="T"/>.
 /// </summary>
 /// <typeparam name="T">
 /// Element type. Handled types: byte, sbyte, short, ushort, int, uint, long, ulong, float, double.
 /// </typeparam>
 /// <param name="left">First operand.</param>
 /// <param name="right">Second operand.</param>
 /// <returns>The element-wise sum, reinterpreted back to <c>Vector128&lt;T&gt;</c>.</returns>
 /// <exception cref="NotSupportedException"><typeparamref name="T"/> is not one of the handled types.</exception>
 /// <remarks>
 /// No <c>IsSupported</c> check is made here; the intrinsics are called unconditionally.
 /// </remarks>
 public static Vector128 <T> Vector128Add <T>(Vector128 <T> left, Vector128 <T> right) where T : struct
 {
     // 8-bit lanes
     if (typeof(T) == typeof(byte))
     {
         Vector128<byte> sum = Sse2.Add(left.AsByte(), right.AsByte());
         return sum.As<byte, T>();
     }

     if (typeof(T) == typeof(sbyte))
     {
         Vector128<sbyte> sum = Sse2.Add(left.AsSByte(), right.AsSByte());
         return sum.As<sbyte, T>();
     }

     // 16-bit lanes
     if (typeof(T) == typeof(short))
     {
         Vector128<short> sum = Sse2.Add(left.AsInt16(), right.AsInt16());
         return sum.As<short, T>();
     }

     if (typeof(T) == typeof(ushort))
     {
         Vector128<ushort> sum = Sse2.Add(left.AsUInt16(), right.AsUInt16());
         return sum.As<ushort, T>();
     }

     // 32-bit integer lanes
     if (typeof(T) == typeof(int))
     {
         Vector128<int> sum = Sse2.Add(left.AsInt32(), right.AsInt32());
         return sum.As<int, T>();
     }

     if (typeof(T) == typeof(uint))
     {
         Vector128<uint> sum = Sse2.Add(left.AsUInt32(), right.AsUInt32());
         return sum.As<uint, T>();
     }

     // 64-bit integer lanes
     if (typeof(T) == typeof(long))
     {
         Vector128<long> sum = Sse2.Add(left.AsInt64(), right.AsInt64());
         return sum.As<long, T>();
     }

     if (typeof(T) == typeof(ulong))
     {
         Vector128<ulong> sum = Sse2.Add(left.AsUInt64(), right.AsUInt64());
         return sum.As<ulong, T>();
     }

     // Floating-point lanes: single precision goes through Sse, double precision through Sse2.
     if (typeof(T) == typeof(float))
     {
         Vector128<float> sum = Sse.Add(left.AsSingle(), right.AsSingle());
         return sum.As<float, T>();
     }

     if (typeof(T) == typeof(double))
     {
         Vector128<double> sum = Sse2.Add(left.AsDouble(), right.AsDouble());
         return sum.As<double, T>();
     }

     throw new NotSupportedException();
 }

Пример #4

Показать файл

Файл: BCnDecoder.cs Проект: Ryujinx/Ryujinx

        /// <summary>
        /// Expands a run of 3-bit palette indices packed in <paramref name="rI"/> into palette bytes
        /// written to <paramref name="output"/>.
        /// </summary>
        /// <param name="output">
        /// Destination bytes. The AVX2 path writes one 16-byte vector at offset 0; the scalar path writes
        /// <c>BlockWidth * BlockHeight</c> bytes.
        /// </param>
        /// <param name="rPal">
        /// Palette looked up by index (0..7). The AVX2 path loads 8 bytes from its start.
        /// </param>
        /// <param name="rI">Packed indices, 3 bits each, lowest bits first.</param>
        private unsafe static void BCnDecodeTileAlpha(Span <byte> output, Span <byte> rPal, ulong rI)
        {
            if (!Avx2.IsSupported)
            {
                // Scalar fallback: peel one 3-bit index per output byte.
                int   texelCount = BlockWidth * BlockHeight;
                ulong remaining  = rI;

                for (int texel = 0; texel < texelCount; texel++)
                {
                    output[texel] = rPal[(int)(remaining & 7)];
                    remaining >>= 3;
                }

                return;
            }

            Vector128<uint> laneShifts   = Vector128.Create(0u, 3u, 6u, 9u);
            Vector128<uint> threeBitMask = Vector128.Create(7u);

            // Load the 8 palette bytes into the low half of a vector to use as a shuffle table.
            Vector128<byte> paletteLut;

            fixed(byte *pRPal = rPal)
            {
                paletteLut = Sse2.LoadScalarVector128((ulong *)pRPal).AsByte();
            }

            // Bits 0..23 hold indices 0..7, bits 24..47 hold indices 8..15.
            Vector128<uint> lowerBits = Vector128.Create((uint)rI);
            Vector128<uint> upperBits = Vector128.Create((uint)(rI >> 24));

            // Per-lane shifts of 0/3/6/9 give indices 0..3; a further shift by 12 gives indices 4..7.
            Vector128<uint> lowerFirst  = Avx2.ShiftRightLogicalVariable(lowerBits, laneShifts);
            Vector128<uint> upperFirst  = Avx2.ShiftRightLogicalVariable(upperBits, laneShifts);
            Vector128<uint> lowerSecond = Sse2.ShiftRightLogical(lowerFirst, 12);
            Vector128<uint> upperSecond = Sse2.ShiftRightLogical(upperFirst, 12);

            lowerFirst  = Sse2.And(lowerFirst, threeBitMask);
            upperFirst  = Sse2.And(upperFirst, threeBitMask);
            lowerSecond = Sse2.And(lowerSecond, threeBitMask);
            upperSecond = Sse2.And(upperSecond, threeBitMask);

            // Narrow 32-bit -> 16-bit -> 8-bit so that all sixteen indices end up in one byte vector.
            Vector128<ushort> lowerWords = Sse41.PackUnsignedSaturate(lowerFirst.AsInt32(), lowerSecond.AsInt32());
            Vector128<ushort> upperWords = Sse41.PackUnsignedSaturate(upperFirst.AsInt32(), upperSecond.AsInt32());
            Vector128<byte>   selectors  = Sse2.PackUnsignedSaturate(lowerWords.AsInt16(), upperWords.AsInt16());

            // A byte shuffle performs all sixteen palette lookups at once.
            Span<Vector128<byte>> destination = MemoryMarshal.Cast<byte, Vector128<byte>>(output);
            destination[0] = Ssse3.Shuffle(paletteLut, selectors);
        }

Пример #5

Показать файл

Файл: Vp8Residual.cs Проект: br3aker/ImageSharp

        public void SetCoeffs(Span <short> coeffs)
        {
#if SUPPORTS_RUNTIME_INTRINSICS
            if (Sse2.IsSupported)
            {
                ref short        coeffsRef = ref MemoryMarshal.GetReference(coeffs);
                Vector128 <byte> c0        = Unsafe.As <short, Vector128 <byte> >(ref coeffsRef);
                Vector128 <byte> c1        = Unsafe.As <short, Vector128 <byte> >(ref Unsafe.Add(ref coeffsRef, 8));

                // Use SSE2 to compare 16 values with a single instruction.
                Vector128 <sbyte> m0 = Sse2.PackSignedSaturate(c0.AsInt16(), c1.AsInt16());
                Vector128 <sbyte> m1 = Sse2.CompareEqual(m0, Vector128 <sbyte> .Zero);

                // Get the comparison results as a bitmask into 16bits. Negate the mask to get
                // the position of entries that are not equal to zero. We don't need to mask
                // out least significant bits according to res->first, since coeffs[0] is 0
                // if res->first > 0.
                uint mask = 0x0000ffffu ^ (uint)Sse2.MoveMask(m1);

                // The position of the most significant non-zero bit indicates the position of
                // the last non-zero value.
                this.Last = mask != 0 ? Numerics.Log2(mask) : -1;
            }

Пример #6

Показать файл

    /// <summary>
    /// Divides each signed 16-bit lane of <paramref name="dividend"/> by the matching lane of
    /// <paramref name="divisor"/>, truncating the quotient toward zero.
    /// </summary>
    /// <param name="dividend">Eight signed 16-bit numerators.</param>
    /// <param name="divisor">Eight signed 16-bit denominators.</param>
    /// <returns>Eight signed 16-bit quotients.</returns>
    /// <remarks>
    /// The lanes are widened to 32-bit floats and multiplied by a refined approximate reciprocal, then
    /// converted back with truncation. SSE/SSE2 intrinsics are called unconditionally; FMA and SSE4.1
    /// are used only when reported as supported. Zero divisors are not special-cased.
    /// </remarks>
    public static Vector128 <short> Divide(this Vector128 <short> dividend, Vector128 <short> divisor)
    {
        // Based on https://stackoverflow.com/a/51458507/347870

        // View each pair of 16-bit lanes as one 32-bit lane, then sign-extend the odd ("hi") and
        // even ("lo") 16-bit element of every pair into its own 32-bit integer.
        Vector128<int> dividendPairs = dividend.AsInt32();
        Vector128<int> divisorPairs  = divisor.AsInt32();

        Vector128<int> aHiInt = Sse2.ShiftRightArithmetic(dividendPairs, 16);
        Vector128<int> aLoInt = Sse2.ShiftRightArithmetic(Sse2.ShiftLeftLogical(dividendPairs, 16), 16);
        Vector128<int> bHiInt = Sse2.ShiftRightArithmetic(divisorPairs, 16);
        Vector128<int> bLoInt = Sse2.ShiftRightArithmetic(Sse2.ShiftLeftLogical(divisorPairs, 16), 16);

        // Convert to 32-bit floats
        Vector128<float> aHi = Sse2.ConvertToVector128Single(aHiInt);
        Vector128<float> aLo = Sse2.ConvertToVector128Single(aLoInt);
        Vector128<float> bHi = Sse2.ConvertToVector128Single(bHiInt);
        Vector128<float> bLo = Sse2.ConvertToVector128Single(bLoInt);

        // Approximate reciprocals of the divisors
        Vector128<float> bHiRcp = Sse.Reciprocal(bHi);
        Vector128<float> bLoRcp = Sse.Reciprocal(bLo);

        // Correction factor (two - rcp * b) for one refinement step of the approximate reciprocal.
        // NOTE(review): the constant is slightly above 2; presumably this biases the refined reciprocal
        // upward so exact quotients do not truncate one too low - see the linked answer.
        Vector128<float> two = Vector128.Create(2.00000051757f);
        Vector128<float> bHiCorrection;
        Vector128<float> bLoCorrection;

        if (Fma.IsSupported)
        {
            bHiCorrection = Fma.MultiplyAddNegated(bHiRcp, bHi, two);
            bLoCorrection = Fma.MultiplyAddNegated(bLoRcp, bLo, two);
        }
        else
        {
            bHiCorrection = Sse.Subtract(two, Sse.Multiply(bHiRcp, bHi));
            bLoCorrection = Sse.Subtract(two, Sse.Multiply(bLoRcp, bLo));
        }

        // Refined reciprocals
        Vector128<float> bHiRcpRefined = Sse.Multiply(bHiRcp, bHiCorrection);
        Vector128<float> bLoRcpRefined = Sse.Multiply(bLoRcp, bLoCorrection);

        // Perform the division by multiplication
        Vector128<float> hiQuotient = Sse.Multiply(aHi, bHiRcpRefined);
        Vector128<float> loQuotient = Sse.Multiply(aLo, bLoRcpRefined);

        // Convert back to integers (truncating toward zero)
        Vector128<int> hiInt = Sse2.ConvertToVector128Int32WithTruncation(hiQuotient);
        Vector128<int> loInt = Sse2.ConvertToVector128Int32WithTruncation(loQuotient);

        // Move the "hi" quotients into the upper 16 bits of each 32-bit lane
        Vector128<int> hiShifted = Sse2.ShiftLeftLogical(hiInt, 16);

        // Merge: even 16-bit lanes come from the "lo" quotients, odd lanes from the "hi" quotients
        if (Sse41.IsSupported)
        {
            return Sse41.Blend(loInt.AsInt16(), hiShifted.AsInt16(), 0xAA);
        }

        // Keep only the low 16 bits of each "lo" quotient before OR-ing. The mask must be 0x0000FFFF
        // per 32-bit lane: a negative quotient is sign-extended, and its upper bits would otherwise
        // overwrite the neighbouring "hi" quotient.
        Vector128<int> lowWordMask = Vector128.Create(0x0000FFFF);
        Vector128<int> loMasked    = Sse2.And(loInt, lowWordMask);

        return Sse2.Or(hiShifted, loMasked).AsInt16();
    }

Пример #7

Показать файл

Файл: SurfaceReader.cs Проект: ChosenCode1/Ryujinx

        /// <summary>
        /// Reads a two-plane surface (one byte per pixel in the first plane, one two-byte sample per
        /// horizontal pixel pair and per two rows in the second plane) and converts it into a new
        /// <see cref="Surface"/>.
        /// </summary>
        /// <param name="rm">Resource manager supplying the memory manager and the surface pool.</param>
        /// <param name="config">Slot surface configuration passed through to <c>ReadSurface</c>.</param>
        /// <param name="offsets">Plane offsets passed through to <c>ReadSurface</c>.</param>
        /// <returns>
        /// A surface of the input's width and height in which R holds the first-plane byte, G and B hold
        /// the two second-plane bytes, and A is opaque.
        /// </returns>
        /// <remarks>
        /// With SSE4.1, 16 pixels are converted per iteration and the remaining pixels of the row are
        /// handled one at a time; otherwise a purely scalar loop is used.
        /// NOTE(review): the vector loop produces alpha 0x3fc (0xff shifted left by 2) while the scalar
        /// code writes 0x3ff - confirm whether the difference is intended.
        /// </remarks>
        private unsafe static Surface ReadNv12(ResourceManager rm, ref SlotSurfaceConfig config, ref PlaneOffsets offsets)
        {
            InputSurface input = ReadSurface(rm.Gmm, ref config, ref offsets, 1, 2);

            int width  = input.Width;
            int height = input.Height;

            int yStride  = GetPitch(width, 1);
            int uvStride = GetPitch(input.UvWidth, 2);

            Surface output = new Surface(rm.SurfacePool, width, height);

            if (Sse41.IsSupported)
            {
                // Reorders the bytes of every 4-byte group from (0, 1, 2, 3) to (0, 2, 3, 1).
                Vector128 <byte> shufMask = Vector128.Create(
                    (byte)0, (byte)2, (byte)3, (byte)1,
                    (byte)4, (byte)6, (byte)7, (byte)5,
                    (byte)8, (byte)10, (byte)11, (byte)9,
                    (byte)12, (byte)14, (byte)15, (byte)13);

                // 0x00ff in the fourth 16-bit element of every 64-bit group (the alpha channel).
                Vector128 <short> alphaMask = Vector128.Create(0xffUL << 48).AsInt16();

                // Bytes to skip at the end of each first-plane row to reach the next row.
                int yStrideGap = yStride - width;

                // Number of pixels per row that the 16-wide vector loop can cover.
                int widthTrunc = width & ~0xf;

                fixed(Pixel *dstPtr = output.Data)
                {
                    Pixel *op = dstPtr;

                    fixed(byte *src0Ptr = input.Buffer0, src1Ptr = input.Buffer1)
                    {
                        byte *i0p = src0Ptr;

                        for (int y = 0; y < height; y++)
                        {
                            // The second plane has one row per two output rows, so it is re-derived per row.
                            byte *i1p = src1Ptr + (y >> 1) * uvStride;

                            int x = 0;

                            for (; x < widthTrunc; x += 16, i0p += 16, i1p += 16)
                            {
                                // Zero-extend 2 x 8 first-plane bytes to 16-bit elements.
                                Vector128 <short> ya0 = Sse41.ConvertToVector128Int16(i0p);
                                Vector128 <short> ya1 = Sse41.ConvertToVector128Int16(i0p + 8);

                                Vector128 <byte> uv = Sse2.LoadVector128(i1p);

                                // Duplicate every two-byte second-plane sample so two adjacent pixels share it.
                                Vector128 <short> uv0 = Sse2.UnpackLow(uv.AsInt16(), uv.AsInt16());
                                Vector128 <short> uv1 = Sse2.UnpackHigh(uv.AsInt16(), uv.AsInt16());

                                // Interleave into 4-byte groups per pixel, then reorder bytes within each group.
                                Vector128 <short> rgba0 = Sse2.UnpackLow(ya0, uv0);
                                Vector128 <short> rgba1 = Sse2.UnpackHigh(ya0, uv0);
                                Vector128 <short> rgba2 = Sse2.UnpackLow(ya1, uv1);
                                Vector128 <short> rgba3 = Sse2.UnpackHigh(ya1, uv1);

                                rgba0 = Ssse3.Shuffle(rgba0.AsByte(), shufMask).AsInt16();
                                rgba1 = Ssse3.Shuffle(rgba1.AsByte(), shufMask).AsInt16();
                                rgba2 = Ssse3.Shuffle(rgba2.AsByte(), shufMask).AsInt16();
                                rgba3 = Ssse3.Shuffle(rgba3.AsByte(), shufMask).AsInt16();

                                // Widen every channel byte to 16 bits: two pixels per vector.
                                Vector128 <short> rgba16_0 = Sse41.ConvertToVector128Int16(rgba0.AsByte());
                                Vector128 <short> rgba16_1 = Sse41.ConvertToVector128Int16(HighToLow(rgba0.AsByte()));
                                Vector128 <short> rgba16_2 = Sse41.ConvertToVector128Int16(rgba1.AsByte());
                                Vector128 <short> rgba16_3 = Sse41.ConvertToVector128Int16(HighToLow(rgba1.AsByte()));
                                Vector128 <short> rgba16_4 = Sse41.ConvertToVector128Int16(rgba2.AsByte());
                                Vector128 <short> rgba16_5 = Sse41.ConvertToVector128Int16(HighToLow(rgba2.AsByte()));
                                Vector128 <short> rgba16_6 = Sse41.ConvertToVector128Int16(rgba3.AsByte());
                                Vector128 <short> rgba16_7 = Sse41.ConvertToVector128Int16(HighToLow(rgba3.AsByte()));

                                // Fill in the alpha channel.
                                rgba16_0 = Sse2.Or(rgba16_0, alphaMask);
                                rgba16_1 = Sse2.Or(rgba16_1, alphaMask);
                                rgba16_2 = Sse2.Or(rgba16_2, alphaMask);
                                rgba16_3 = Sse2.Or(rgba16_3, alphaMask);
                                rgba16_4 = Sse2.Or(rgba16_4, alphaMask);
                                rgba16_5 = Sse2.Or(rgba16_5, alphaMask);
                                rgba16_6 = Sse2.Or(rgba16_6, alphaMask);
                                rgba16_7 = Sse2.Or(rgba16_7, alphaMask);

                                // Scale every channel up by two bits.
                                rgba16_0 = Sse2.ShiftLeftLogical(rgba16_0, 2);
                                rgba16_1 = Sse2.ShiftLeftLogical(rgba16_1, 2);
                                rgba16_2 = Sse2.ShiftLeftLogical(rgba16_2, 2);
                                rgba16_3 = Sse2.ShiftLeftLogical(rgba16_3, 2);
                                rgba16_4 = Sse2.ShiftLeftLogical(rgba16_4, 2);
                                rgba16_5 = Sse2.ShiftLeftLogical(rgba16_5, 2);
                                rgba16_6 = Sse2.ShiftLeftLogical(rgba16_6, 2);
                                rgba16_7 = Sse2.ShiftLeftLogical(rgba16_7, 2);

                                // Each store writes two pixels.
                                Sse2.Store((short *)(op + (uint)x + 0), rgba16_0);
                                Sse2.Store((short *)(op + (uint)x + 2), rgba16_1);
                                Sse2.Store((short *)(op + (uint)x + 4), rgba16_2);
                                Sse2.Store((short *)(op + (uint)x + 6), rgba16_3);
                                Sse2.Store((short *)(op + (uint)x + 8), rgba16_4);
                                Sse2.Store((short *)(op + (uint)x + 10), rgba16_5);
                                Sse2.Store((short *)(op + (uint)x + 12), rgba16_6);
                                Sse2.Store((short *)(op + (uint)x + 14), rgba16_7);
                            }

                            // Remaining pixels of the row, one at a time.
                            for (; x < width; x++)
                            {
                                Pixel *px = op + (uint)x;

                                px->R = Upsample(*i0p++);
                                px->G = Upsample(*i1p);
                                px->B = Upsample(*(i1p + 1));
                                px->A = 0x3ff;

                                // Move to the next second-plane sample only after the odd pixel, so both
                                // pixels of a pair share one sample, as in the vector loop above and the
                                // scalar fallback below.
                                i1p += (x & 1) * 2;
                            }

                            op  += width;
                            i0p += yStrideGap;
                        }
                    }
                }
            }
            else
            {
                for (int y = 0; y < height; y++)
                {
                    int uvBase = (y >> 1) * uvStride;

                    for (int x = 0; x < width; x++)
                    {
                        output.SetR(x, y, Upsample(input.Buffer0[y * yStride + x]));

                        // Both pixels of a horizontal pair read the same two-byte sample.
                        int uvOffs = uvBase + (x & ~1);

                        output.SetG(x, y, Upsample(input.Buffer1[uvOffs]));
                        output.SetB(x, y, Upsample(input.Buffer1[uvOffs + 1]));
                        output.SetA(x, y, 0x3ff);
                    }
                }
            }

            return(output);
        }

Пример #8

Показать файл

        private static unsafe char *GetPointerToFirstInvalidChar(char *pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment)
        {
            // First, we'll handle the common case of all-ASCII. If this is able to
            // consume the entire buffer, we'll skip the remainder of this method's logic.

            int numAsciiCharsConsumedJustNow = (int)GetIndexOfFirstNonAsciiChar_Sse2(pInputBuffer, (uint)inputLength);

            pInputBuffer += (uint)numAsciiCharsConsumedJustNow;
            if (numAsciiCharsConsumedJustNow == inputLength)
            {
                utf8CodeUnitCountAdjustment = 0;
                scalarCountAdjustment       = 0;
                return(pInputBuffer);
            }

            // If we got here, it means we saw some non-ASCII data, so within our
            // vectorized code paths below we'll handle all non-surrogate UTF-16
            // code points branchlessly. We'll only branch if we see surrogates.
            //
            // We still optimistically assume the data is mostly ASCII. This means that the
            // number of UTF-8 code units and the number of scalars almost matches the number
            // of UTF-16 code units. As we go through the input and find non-ASCII
            // characters, we'll keep track of these "adjustment" fixups. To get the
            // total number of UTF-8 code units required to encode the input data, add
            // the UTF-8 code unit count adjustment to the number of UTF-16 code units
            // seen.  To get the total number of scalars present in the input data,
            // add the scalar count adjustment to the number of UTF-16 code units seen.

            long tempUtf8CodeUnitCountAdjustment = 0;
            int  tempScalarCountAdjustment       = 0;

            if (Sse41.IsSupported)
            {
                if (inputLength >= Vector128 <ushort> .Count)
                {
                    Vector128 <ushort> vector0080 = Vector128.Create((ushort)0x80);
                    Vector128 <ushort> vector0800 = Sse2.ShiftLeftLogical(vector0080, 4); // = 0x0800
                    Vector128 <ushort> vectorA800 = Vector128.Create((ushort)0xA800);
                    Vector128 <short>  vector8800 = Vector128.Create(unchecked ((short)0x8800));

                    do
                    {
                        Vector128 <ushort> utf16Data = Sse2.LoadVector128((ushort *)pInputBuffer);

                        uint mask = (uint)Sse2.MoveMask(
                            Sse2.Or(
                                Sse2.ShiftLeftLogical(Sse41.Min(utf16Data, vector0080), 8),
                                Sse2.ShiftRightLogical(Sse41.Min(utf16Data, vector0800), 4)).AsByte());

                        // Each odd bit of mask will be 1 only if the char was >= 0x0080,
                        // and each even bit of mask will be 1 only if the char was >= 0x0800.
                        //
                        // Example for UTF-16 input "[ 0123 ] [ 1234 ] ...":
                        //
                        //            ,-- set if char[1] is non-ASCII
                        //            |   ,-- set if char[0] is non-ASCII
                        //            v   v
                        // mask = ... 1 1 1 0
                        //              ^   ^-- set if char[0] is >= 0x800
                        //              `-- set if char[1] is >= 0x800
                        //
                        // This means we can popcnt the number of set bits, and the result is the
                        // number of *additional* UTF-8 bytes that each UTF-16 code unit requires as
                        // it expands. This results in the wrong count for UTF-16 surrogate code
                        // units (we just counted that each individual code unit expands to 3 bytes,
                        // but in reality a well-formed UTF-16 surrogate pair expands to 4 bytes).
                        // We'll handle this in just a moment.

                        tempUtf8CodeUnitCountAdjustment += (uint)BitOperations.PopCount(mask);

                        // Surrogates need to be special-cased for two reasons: (a) we need
                        // to account for the fact that we over-counted in the addition above;
                        // and (b) they require separate validation.

                        utf16Data = Sse2.Add(utf16Data, vectorA800);
                        mask      = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());

                        if (mask != 0)
                        {
                            // There's at least one UTF-16 surrogate code unit present.
                            // Since we performed a pmovmskb operation on the result of a 16-bit pcmpgtw,
                            // the resulting bits of 'mask' will occur in pairs:
                            // - 00 if the corresponding UTF-16 char was not a surrogate code unit;
                            // - 11 if the corresponding UTF-16 char was a surrogate code unit.
                            //
                            // A UTF-16 high/low surrogate code unit has the bit pattern [ 11011q## ######## ],
                            // where # is any bit; q = 0 represents a high surrogate, and q = 1 represents
                            // a low surrogate. Since we added 0xA800 in the vectorized operation above,
                            // our surrogate pairs will now have the bit pattern [ 10000q## ######## ].
                            // If we logical right-shift each word by 3, we'll end up with the bit pattern
                            // [ 00010000 q####### ], which means that we can immediately use pmovmskb to
                            // determine whether a given char was a high or a low surrogate.
                            //
                            // Therefore the resulting bits of 'mask2' will occur in pairs:
                            // - 00 if the corresponding UTF-16 char was a high surrogate code unit;
                            // - 01 if the corresponding UTF-16 char was a low surrogate code unit;
                            // - ## (garbage) if the corresponding UTF-16 char was not a surrogate code unit.

                            uint mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte());

                            uint lowSurrogatesMask  = mask2 & mask;             // 01 only if was a low surrogate char, else 00
                            uint highSurrogatesMask = (mask2 ^ mask) & 0x5555u; // 01 only if was a high surrogate char, else 00

                            // Now check that each high surrogate is followed by a low surrogate and that each
                            // low surrogate follows a high surrogate. We make an exception for the case where
                            // the final char of the vector is a high surrogate, since we can't perform validation
                            // on it until the next iteration of the loop when we hope to consume the matching
                            // low surrogate.

                            highSurrogatesMask <<= 2;
                            if ((ushort)highSurrogatesMask != lowSurrogatesMask)
                            {
                                goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                            }

                            if (highSurrogatesMask > ushort.MaxValue)
                            {
                                // There was a standalone high surrogate at the end of the vector.
                                // We'll adjust our counters so that we don't consider this char consumed.

                                highSurrogatesMask = (ushort)highSurrogatesMask; // don't allow stray high surrogate to be consumed by popcnt
                                pInputBuffer--;
                                inputLength++;
                            }

                            int surrogatePairsCount = BitOperations.PopCount(highSurrogatesMask);

                            // 2 UTF-16 chars become 1 Unicode scalar

                            tempScalarCountAdjustment -= surrogatePairsCount;

                            // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                            // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only
                            // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total),
                            // so we'll adjust this now.

                            nint surrogatePairsCountNint = (nint)(nuint)(uint)surrogatePairsCount; // zero-extend to native int size
                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                        }

                        pInputBuffer += Vector128 <ushort> .Count;
                        inputLength  -= Vector128 <ushort> .Count;
                    } while (inputLength >= Vector128 <ushort> .Count);
                }
            }
            else if (Vector.IsHardwareAccelerated)
            {
                if (inputLength >= Vector <ushort> .Count)
                {
                    Vector <ushort> vector0080 = new Vector <ushort>(0x0080);
                    Vector <ushort> vector0400 = new Vector <ushort>(0x0400);
                    Vector <ushort> vector0800 = new Vector <ushort>(0x0800);
                    Vector <ushort> vectorD800 = new Vector <ushort>(0xD800);

                    do
                    {
                        // The 'twoOrMoreUtf8Bytes' and 'threeOrMoreUtf8Bytes' vectors will contain
                        // elements whose values are 0xFFFF (-1 as signed word) iff the corresponding
                        // UTF-16 code unit was >= 0x0080 and >= 0x0800, respectively. By summing these
                        // vectors, each element of the sum will contain one of three values:
                        //
                        // 0x0000 ( 0) = original char was 0000..007F
                        // 0xFFFF (-1) = original char was 0080..07FF
                        // 0xFFFE (-2) = original char was 0800..FFFF
                        //
                        // We'll negate them to produce a value 0..2 for each element, then sum all the
                        // elements together to produce the number of *additional* UTF-8 code units
                        // required to represent this UTF-16 data. This is similar to the popcnt step
                        // performed by the SSE41 code path. This will overcount surrogates, but we'll
                        // handle that shortly.

                        Vector <ushort> utf16Data            = Unsafe.ReadUnaligned <Vector <ushort> >(pInputBuffer);
                        Vector <ushort> twoOrMoreUtf8Bytes   = Vector.GreaterThanOrEqual(utf16Data, vector0080);
                        Vector <ushort> threeOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0800);
                        Vector <nuint>  sumVector            = (Vector <nuint>)(-Vector.Add(twoOrMoreUtf8Bytes, threeOrMoreUtf8Bytes));

                        // We'll try summing by a natural word (rather than a 16-bit word) at a time,
                        // which should halve the number of operations we must perform.

                        nuint popcnt = 0;
                        for (int i = 0; i < Vector <nuint> .Count; i++)
                        {
                            popcnt += sumVector[i];
                        }

                        uint popcnt32 = (uint)popcnt;
                        if (sizeof(nuint) == sizeof(ulong))
                        {
                            popcnt32 += (uint)(popcnt >> 32);
                        }

                        tempUtf8CodeUnitCountAdjustment += (ushort)popcnt32;
                        tempUtf8CodeUnitCountAdjustment += popcnt32 >> 16;

                        // Now check for surrogates.

                        utf16Data -= vectorD800;
                        Vector <ushort> surrogateChars = Vector.LessThan(utf16Data, vector0800);
                        if (surrogateChars != Vector <ushort> .Zero)
                        {
                            // There's at least one surrogate (high or low) UTF-16 code unit in
                            // the vector. We'll build up additional vectors: 'highSurrogateChars'
                            // and 'lowSurrogateChars', where the elements are 0xFFFF iff the original
                            // UTF-16 code unit was a high or low surrogate, respectively.

                            Vector <ushort> highSurrogateChars = Vector.LessThan(utf16Data, vector0400);
                            Vector <ushort> lowSurrogateChars  = Vector.AndNot(surrogateChars, highSurrogateChars);

                            // We want to make sure that each high surrogate code unit is followed by
                            // a low surrogate code unit and each low surrogate code unit follows a
                            // high surrogate code unit. Since we don't have an equivalent of pmovmskb
                            // or palignr available to us, we'll do this as a loop. We won't look at
                            // the very last high surrogate char element since we don't yet know if
                            // the next vector read will have a low surrogate char element.

                            ushort surrogatePairsCount = 0;
                            for (int i = 0; i < Vector <ushort> .Count - 1; i++)
                            {
                                surrogatePairsCount -= highSurrogateChars[i];
                                if (highSurrogateChars[i] != lowSurrogateChars[i + 1])
                                {
                                    goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                                }
                            }

                            if (highSurrogateChars[Vector <ushort> .Count - 1] != 0)
                            {
                                // There was a standalone high surrogate at the end of the vector.
                                // We'll adjust our counters so that we don't consider this char consumed.

                                pInputBuffer--;
                                inputLength++;
                                tempUtf8CodeUnitCountAdjustment -= 2;
                                tempScalarCountAdjustment--;
                            }

                            nint surrogatePairsCountNint = (nint)surrogatePairsCount; // zero-extend to native int size

                            // 2 UTF-16 chars become 1 Unicode scalar

                            tempScalarCountAdjustment -= (int)surrogatePairsCountNint;

                            // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                            // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only
                            // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total),
                            // so we'll adjust this now.

                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                        }

                        pInputBuffer += Vector <ushort> .Count;
                        inputLength  -= Vector <ushort> .Count;
                    } while (inputLength >= Vector <ushort> .Count);
                }
            }

NonVectorizedLoop:

            // Vectorization isn't supported on our current platform, or the input was too small to benefit
            // from vectorization, or we saw invalid UTF-16 data in the vectorized code paths and need to
            // drain remaining valid chars before we report failure.

            for (; inputLength > 0; pInputBuffer++, inputLength--)
            {
                uint thisChar = pInputBuffer[0];
                if (thisChar <= 0x7F)
                {
                    continue;
                }

                // Bump adjustment by +1 for U+0080..U+07FF; by +2 for U+0800..U+FFFF.
                // This optimistically assumes no surrogates, which we'll handle shortly.

                tempUtf8CodeUnitCountAdjustment += (thisChar + 0x0001_F800u) >> 16;

                if (!IsSurrogateCodePoint(thisChar))
                {
                    continue;
                }

                // Found a surrogate char. Back out the adjustment we made above, then
                // try to consume the entire surrogate pair all at once. We won't bother
                // trying to interpret the surrogate pair as a scalar value; we'll only
                // validate that its bit pattern matches what's expected for a surrogate pair.

                tempUtf8CodeUnitCountAdjustment -= 2;

                if (inputLength == 1)
                {
                    goto Error; // input buffer too small to read a surrogate pair
                }

                thisChar = Unsafe.ReadUnaligned <uint>(pInputBuffer);
                if (((thisChar - (BitConverter.IsLittleEndian ? 0xDC00_D800u : 0xD800_DC00u)) & 0xFC00_FC00u) != 0)
                {
                    goto Error; // not a well-formed surrogate pair
                }

                tempScalarCountAdjustment--;          // 2 UTF-16 code units -> 1 scalar
                tempUtf8CodeUnitCountAdjustment += 2; // 2 UTF-16 code units -> 4 UTF-8 code units

                pInputBuffer++;                       // consumed one extra char
                inputLength--;
            }

Error:

            // Also used for normal return.

            utf8CodeUnitCountAdjustment = tempUtf8CodeUnitCountAdjustment;
            scalarCountAdjustment       = tempScalarCountAdjustment;
            return(pInputBuffer);
        }

Пример #9

Показать файл

        /// <summary>
        /// Converts <paramref name="input"/> into a tightly interleaved image of four bytes per pixel
        /// (written in R, G, B, A order) and passes the result to <c>WriteBuffer</c>.
        /// </summary>
        /// <param name="rm">Supplies the pooled scratch buffer; also forwarded to <c>WriteBuffer</c>.</param>
        /// <param name="input">Source surface; its full width and height are converted.</param>
        /// <param name="config">Output configuration; only the block kind and block height are read here.</param>
        /// <param name="offsets">Destination offsets; only <c>LumaOffset</c> is used.</param>
        /// <remarks>
        /// NOTE(review): the SSE2 path assumes <c>Pixel</c> is four consecutive 16-bit channels (8 bytes) and
        /// that <c>Downsample</c> is equivalent to a right shift by two — confirm against their definitions.
        /// </remarks>
        private unsafe static void WriteA8B8G8R8(ResourceManager rm, Surface input, ref OutputSurfaceConfig config, ref PlaneOffsets offsets)
        {
            int width = input.Width;
            int height = input.Height;
            int stride = GetPitch(width, 4);

            int rentedIndex = rm.BufferPool.Rent(height * stride, out Span<byte> output);

            if (!Sse2.IsSupported)
            {
                // Portable fallback: convert one channel at a time through the surface accessors.
                for (int row = 0; row < height; row++)
                {
                    int rowStart = row * stride;

                    for (int col = 0; col < width; col++)
                    {
                        int offs = rowStart + col * 4;

                        output[offs + 0] = Downsample(input.GetR(col, row));
                        output[offs + 1] = Downsample(input.GetG(col, row));
                        output[offs + 2] = Downsample(input.GetB(col, row));
                        output[offs + 3] = Downsample(input.GetA(col, row));
                    }
                }
            }
            else
            {
                // Eight pixels are handled per vector iteration; the remainder goes through the scalar tail.
                int vectorizedWidth = width & ~7;

                // Bytes between the end of a row's pixel data and the start of the next row.
                int rowPadding = stride - width * 4;

                fixed (Pixel* srcBase = input.Data)
                fixed (byte* dstBase = output)
                {
                    Pixel* srcRow = srcBase;
                    byte* outPtr = dstBase;

                    int row = 0;

                    while (row < height)
                    {
                        int col = 0;

                        while (col < vectorizedWidth)
                        {
                            Pixel* src = srcRow + (uint)col;

                            // Each 128-bit load covers two pixels; shift every 16-bit channel right by two.
                            Vector128<ushort> pair0 = Sse2.ShiftRightLogical(Sse2.LoadVector128((ushort*)src), 2);
                            Vector128<ushort> pair1 = Sse2.ShiftRightLogical(Sse2.LoadVector128((ushort*)(src + 2)), 2);
                            Vector128<ushort> pair2 = Sse2.ShiftRightLogical(Sse2.LoadVector128((ushort*)(src + 4)), 2);
                            Vector128<ushort> pair3 = Sse2.ShiftRightLogical(Sse2.LoadVector128((ushort*)(src + 6)), 2);

                            // Narrow 16-bit channels to bytes: four pixels (16 bytes) per store.
                            Sse2.Store(outPtr + 0x00, Sse2.PackUnsignedSaturate(pair0.AsInt16(), pair1.AsInt16()));
                            Sse2.Store(outPtr + 0x10, Sse2.PackUnsignedSaturate(pair2.AsInt16(), pair3.AsInt16()));

                            outPtr += 0x20;
                            col += 8;
                        }

                        for (; col < width; col++)
                        {
                            Pixel* px = srcRow + (uint)col;

                            outPtr[0] = Downsample(px->R);
                            outPtr[1] = Downsample(px->G);
                            outPtr[2] = Downsample(px->B);
                            outPtr[3] = Downsample(px->A);

                            outPtr += 4;
                        }

                        outPtr += rowPadding;

                        row++;
                        srcRow += input.Width;
                    }
                }
            }

            bool isLinear = config.OutBlkKind == 0;

            int gobBlocksInY = 1 << config.OutBlkHeight;

            WriteBuffer(rm, output, offsets.LumaOffset, isLinear, width, height, 4, gobBlocksInY);

            rm.BufferPool.Return(rentedIndex);
        }

Пример #10

Показать файл

        /// <summary>
        /// Writes <paramref name="input"/> out as two planes: a full-resolution luma plane taken from the
        /// R channel of every pixel, and a half-resolution chroma plane that interleaves the G and B
        /// channels of every second pixel on every second row.
        /// </summary>
        /// <param name="rm">Supplies the pooled scratch buffers; also forwarded to <c>WriteBuffer</c>.</param>
        /// <param name="input">Source surface.</param>
        /// <param name="config">
        /// Output configuration. One is added to each of the <c>OutLuma*</c>/<c>OutChroma*</c> width and
        /// height values before they are used; <c>OutBlkKind == 0</c> selects linear output.
        /// </param>
        /// <param name="offsets">Destination offsets; <c>LumaOffset</c> and <c>ChromaUOffset</c> are used.</param>
        /// <remarks>
        /// NOTE(review): the vectorized paths assume <c>Pixel</c> is four consecutive 16-bit channels in
        /// R, G, B, A order (8 bytes) and that <c>Downsample</c> is equivalent to a right shift by two —
        /// confirm both against their definitions.
        /// </remarks>
        private unsafe static void WriteNv12(ResourceManager rm, Surface input, ref OutputSurfaceConfig config, ref PlaneOffsets offsets)
        {
            int gobBlocksInY = 1 << config.OutBlkHeight;

            bool outLinear = config.OutBlkKind == 0;

            // The converted area is clipped to whichever is smaller: the configured output or the input.
            int width   = Math.Min(config.OutLumaWidth + 1, input.Width);
            int height  = Math.Min(config.OutLumaHeight + 1, input.Height);
            int yStride = GetPitch(config.OutLumaWidth + 1, 1);

            int dstYIndex = rm.BufferPool.Rent((config.OutLumaHeight + 1) * yStride, out Span<byte> dstY);

            if (Sse41.IsSupported)
            {
                // Keeps only the low 16 bits of every 64-bit lane, i.e. the first channel of each pixel.
                Vector128<ushort> mask = Vector128.Create(0xffffUL).AsUInt16();

                int widthTrunc = width & ~0xf;
                int strideGap  = yStride - width;

                fixed (Pixel* srcPtr = input.Data)
                {
                    Pixel* ip = srcPtr;

                    fixed (byte* dstPtr = dstY)
                    {
                        byte* op = dstPtr;

                        for (int y = 0; y < height; y++, ip += input.Width)
                        {
                            int x = 0;

                            // 16 pixels per iteration: eight 128-bit loads of two pixels each.
                            for (; x < widthTrunc; x += 16)
                            {
                                byte* baseOffset = (byte*)(ip + (ulong)(uint)x);

                                Vector128<ushort> pixelp1 = Sse2.LoadVector128((ushort*)baseOffset);
                                Vector128<ushort> pixelp2 = Sse2.LoadVector128((ushort*)(baseOffset + 0x10));
                                Vector128<ushort> pixelp3 = Sse2.LoadVector128((ushort*)(baseOffset + 0x20));
                                Vector128<ushort> pixelp4 = Sse2.LoadVector128((ushort*)(baseOffset + 0x30));
                                Vector128<ushort> pixelp5 = Sse2.LoadVector128((ushort*)(baseOffset + 0x40));
                                Vector128<ushort> pixelp6 = Sse2.LoadVector128((ushort*)(baseOffset + 0x50));
                                Vector128<ushort> pixelp7 = Sse2.LoadVector128((ushort*)(baseOffset + 0x60));
                                Vector128<ushort> pixelp8 = Sse2.LoadVector128((ushort*)(baseOffset + 0x70));

                                pixelp1 = Sse2.And(pixelp1, mask);
                                pixelp2 = Sse2.And(pixelp2, mask);
                                pixelp3 = Sse2.And(pixelp3, mask);
                                pixelp4 = Sse2.And(pixelp4, mask);
                                pixelp5 = Sse2.And(pixelp5, mask);
                                pixelp6 = Sse2.And(pixelp6, mask);
                                pixelp7 = Sse2.And(pixelp7, mask);
                                pixelp8 = Sse2.And(pixelp8, mask);

                                // Two rounds of 32->16 bit packing squeeze out the zeroed lanes, leaving
                                // eight consecutive 16-bit samples per vector.
                                Vector128<ushort> pixelq1 = Sse41.PackUnsignedSaturate(pixelp1.AsInt32(), pixelp2.AsInt32());
                                Vector128<ushort> pixelq2 = Sse41.PackUnsignedSaturate(pixelp3.AsInt32(), pixelp4.AsInt32());
                                Vector128<ushort> pixelq3 = Sse41.PackUnsignedSaturate(pixelp5.AsInt32(), pixelp6.AsInt32());
                                Vector128<ushort> pixelq4 = Sse41.PackUnsignedSaturate(pixelp7.AsInt32(), pixelp8.AsInt32());

                                pixelq1 = Sse41.PackUnsignedSaturate(pixelq1.AsInt32(), pixelq2.AsInt32());
                                pixelq2 = Sse41.PackUnsignedSaturate(pixelq3.AsInt32(), pixelq4.AsInt32());

                                pixelq1 = Sse2.ShiftRightLogical(pixelq1, 2);
                                pixelq2 = Sse2.ShiftRightLogical(pixelq2, 2);

                                Vector128<byte> pixel = Sse2.PackUnsignedSaturate(pixelq1.AsInt16(), pixelq2.AsInt16());

                                Sse2.Store(op, pixel);

                                op += 0x10;
                            }

                            // Scalar tail for the pixels that do not fill a whole vector iteration.
                            for (; x < width; x++)
                            {
                                Pixel* px = ip + (uint)x;

                                *op++ = Downsample(px->R);
                            }

                            op += strideGap;
                        }
                    }
                }
            }
            else
            {
                for (int y = 0; y < height; y++)
                {
                    for (int x = 0; x < width; x++)
                    {
                        dstY[y * yStride + x] = Downsample(input.GetR(x, y));
                    }
                }
            }

            WriteBuffer(
                rm,
                dstY,
                offsets.LumaOffset,
                outLinear,
                config.OutLumaWidth + 1,
                config.OutLumaHeight + 1,
                1,
                gobBlocksInY);

            rm.BufferPool.Return(dstYIndex);

            // Chroma is sampled from every second pixel of every second row (rounded up).
            int uvWidth  = Math.Min(config.OutChromaWidth + 1, (width + 1) >> 1);
            int uvHeight = Math.Min(config.OutChromaHeight + 1, (height + 1) >> 1);
            int uvStride = GetPitch(config.OutChromaWidth + 1, 2);

            int dstUvIndex = rm.BufferPool.Rent((config.OutChromaHeight + 1) * uvStride, out Span<byte> dstUv);

            if (Sse2.IsSupported)
            {
                int widthTrunc = uvWidth & ~7;
                int strideGap  = uvStride - uvWidth * 2;

                fixed (Pixel* srcPtr = input.Data)
                {
                    Pixel* ip = srcPtr;

                    fixed (byte* dstPtr = dstUv)
                    {
                        byte* op = dstPtr;

                        // Each output row consumes two input rows.
                        for (int y = 0; y < uvHeight; y++, ip += input.Width * 2)
                        {
                            int x = 0;

                            // 8 chroma samples per iteration, taken from source pixels 16 bytes apart.
                            for (; x < widthTrunc; x += 8)
                            {
                                byte* baseOffset = (byte*)ip + (ulong)(uint)x * 16;

                                // Each 32-bit load starts two bytes into the pixel, picking up its second
                                // and third 16-bit channels (written as G and B by the scalar path below).
                                Vector128<uint> pixel1 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x02));
                                Vector128<uint> pixel2 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x12));
                                Vector128<uint> pixel3 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x22));
                                Vector128<uint> pixel4 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x32));
                                Vector128<uint> pixel5 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x42));
                                Vector128<uint> pixel6 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x52));
                                Vector128<uint> pixel7 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x62));
                                Vector128<uint> pixel8 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x72));

                                Vector128<uint> pixel12 = Sse2.UnpackLow(pixel1, pixel2);
                                Vector128<uint> pixel34 = Sse2.UnpackLow(pixel3, pixel4);
                                Vector128<uint> pixel56 = Sse2.UnpackLow(pixel5, pixel6);
                                Vector128<uint> pixel78 = Sse2.UnpackLow(pixel7, pixel8);

                                // The 64-bit unpack is only used to gather the pairs; from here on the data is
                                // treated as eight independent 16-bit channels per vector.
                                Vector128<ushort> pixel1234 = Sse2.UnpackLow(pixel12.AsUInt64(), pixel34.AsUInt64()).AsUInt16();
                                Vector128<ushort> pixel5678 = Sse2.UnpackLow(pixel56.AsUInt64(), pixel78.AsUInt64()).AsUInt16();

                                // The shift must be per 16-bit lane. A 64-bit lane shift would carry the low two
                                // bits of each channel into the top bits of the channel below it, and the
                                // signed-saturating pack that follows would then clamp that channel to 255 or 0.
                                pixel1234 = Sse2.ShiftRightLogical(pixel1234, 2);
                                pixel5678 = Sse2.ShiftRightLogical(pixel5678, 2);

                                Vector128<byte> pixel = Sse2.PackUnsignedSaturate(pixel1234.AsInt16(), pixel5678.AsInt16());

                                Sse2.Store(op, pixel);

                                op += 0x10;
                            }

                            // Scalar tail for the samples that do not fill a whole vector iteration.
                            for (; x < uvWidth; x++)
                            {
                                Pixel* px = ip + (uint)(x << 1);

                                *op++ = Downsample(px->G);
                                *op++ = Downsample(px->B);
                            }

                            op += strideGap;
                        }
                    }
                }
            }
            else
            {
                for (int y = 0; y < uvHeight; y++)
                {
                    for (int x = 0; x < uvWidth; x++)
                    {
                        int xx = x << 1;
                        int yy = y << 1;

                        int uvOffs = y * uvStride + xx;

                        dstUv[uvOffs + 0] = Downsample(input.GetG(xx, yy));
                        dstUv[uvOffs + 1] = Downsample(input.GetB(xx, yy));
                    }
                }
            }

            WriteBuffer(
                rm,
                dstUv,
                offsets.ChromaUOffset,
                outLinear,
                config.OutChromaWidth + 1,
                config.OutChromaHeight + 1, 2,
                gobBlocksInY);

            rm.BufferPool.Return(dstUvIndex);
        }

Пример #11

Показать файл

 /// <summary>
 /// Narrows sixteen 16-bit elements to sixteen bytes: the elements of <paramref name="low"/> fill the
 /// lower eight bytes of the result and the elements of <paramref name="high"/> fill the upper eight.
 /// </summary>
 /// <remarks>
 /// The inputs are reinterpreted as signed 16-bit values before the saturating pack, so elements
 /// in 0x0100..0x7FFF clamp to 255 while elements of 0x8000 and above are seen as negative and clamp to 0.
 /// </remarks>
 public static Vector128<byte> Narrow(Vector128<ushort> low, Vector128<ushort> high)
 {
     Vector128<short> lowerHalf = low.AsInt16();
     Vector128<short> upperHalf = high.AsInt16();

     return Sse2.PackUnsignedSaturate(lowerHalf, upperHalf);
 }

Пример #12

Показать файл

Файл: Utf16Utility.Validation.cs Проект: humbatoa/runtime

        // Returns &inputBuffer[inputLength] if the input buffer is valid.
        /// <summary>
        /// Given an input buffer <paramref name="pInputBuffer"/> of char length <paramref name="inputLength"/>,
        /// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
        /// </summary>
        /// <remarks>
        /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
        /// </remarks>
        public static char *GetPointerToFirstInvalidChar(char *pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment)
        {
            Debug.Assert(inputLength >= 0, "Input length must not be negative.");
            Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");

            // First, we'll handle the common case of all-ASCII. If this is able to
            // consume the entire buffer, we'll skip the remainder of this method's logic.

            int numAsciiCharsConsumedJustNow = (int)ASCIIUtility.GetIndexOfFirstNonAsciiChar(pInputBuffer, (uint)inputLength);

            Debug.Assert(0 <= numAsciiCharsConsumedJustNow && numAsciiCharsConsumedJustNow <= inputLength);

            pInputBuffer += (uint)numAsciiCharsConsumedJustNow;
            inputLength  -= numAsciiCharsConsumedJustNow;

            if (inputLength == 0)
            {
                utf8CodeUnitCountAdjustment = 0;
                scalarCountAdjustment       = 0;
                return(pInputBuffer);
            }

            // If we got here, it means we saw some non-ASCII data, so within our
            // vectorized code paths below we'll handle all non-surrogate UTF-16
            // code points branchlessly. We'll only branch if we see surrogates.
            //
            // We still optimistically assume the data is mostly ASCII. This means that the
            // number of UTF-8 code units and the number of scalars almost matches the number
            // of UTF-16 code units. As we go through the input and find non-ASCII
            // characters, we'll keep track of these "adjustment" fixups. To get the
            // total number of UTF-8 code units required to encode the input data, add
            // the UTF-8 code unit count adjustment to the number of UTF-16 code units
            // seen.  To get the total number of scalars present in the input data,
            // add the scalar count adjustment to the number of UTF-16 code units seen.

            long tempUtf8CodeUnitCountAdjustment = 0;
            int  tempScalarCountAdjustment       = 0;

            if ((AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || Sse2.IsSupported)
            {
                if (inputLength >= Vector128 <ushort> .Count)
                {
                    Vector128 <ushort> vector0080 = Vector128.Create((ushort)0x80);
                    Vector128 <ushort> vectorA800 = Vector128.Create((ushort)0xA800);
                    Vector128 <short>  vector8800 = Vector128.Create(unchecked ((short)0x8800));
                    Vector128 <ushort> vectorZero = Vector128 <ushort> .Zero;
                    do
                    {
                        Vector128 <ushort> utf16Data;
                        if (AdvSimd.Arm64.IsSupported)
                        {
                            utf16Data = AdvSimd.LoadVector128((ushort *)pInputBuffer); // unaligned
                        }
                        else
                        {
                            utf16Data = Sse2.LoadVector128((ushort *)pInputBuffer); // unaligned
                        }

                        Vector128 <ushort> charIsNonAscii;

                        if (AdvSimd.Arm64.IsSupported)
                        {
                            // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                            // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)
                            charIsNonAscii = AdvSimd.Min(utf16Data, vector0080);
                        }
                        else if (Sse41.IsSupported)
                        {
                            // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                            // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)
                            charIsNonAscii = Sse41.Min(utf16Data, vector0080);
                        }
                        else
                        {
                            // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                            // input was 0x0080 <= [value] <= 0x7FFF. The case where 0x8000 <= [value] will
                            // be handled in a few lines.

                            charIsNonAscii = Sse2.AndNot(Sse2.CompareGreaterThan(vector0080.AsInt16(), utf16Data.AsInt16()).AsUInt16(), vector0080);
                        }

#if DEBUG
                        // Quick check to ensure we didn't accidentally set the 0x8000 bit of any element.
                        uint debugMask;
                        if (AdvSimd.Arm64.IsSupported)
                        {
                            debugMask = GetNonAsciiBytes(charIsNonAscii.AsByte());
                        }
                        else
                        {
                            debugMask = (uint)Sse2.MoveMask(charIsNonAscii.AsByte());
                        }
                        Debug.Assert((debugMask & 0b_1010_1010_1010_1010) == 0, "Shouldn't have set the 0x8000 bit of any element in 'charIsNonAscii'.");
#endif // DEBUG

                        // Sets the 0x8080 bits of each element in 'charIsNonAscii' if the corresponding
                        // input was 0x0800 <= [value]. This also handles the missing range a few lines above.

                        Vector128 <ushort> charIsThreeByteUtf8Encoded;
                        uint mask;

                        if (AdvSimd.IsSupported)
                        {
                            charIsThreeByteUtf8Encoded = AdvSimd.Subtract(vectorZero, AdvSimd.ShiftRightLogical(utf16Data, 11));
                            mask = GetNonAsciiBytes(AdvSimd.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
                        }
                        else
                        {
                            charIsThreeByteUtf8Encoded = Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 11));
                            mask = (uint)Sse2.MoveMask(Sse2.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
                        }

                        // Each even bit of mask will be 1 only if the char was >= 0x0080,
                        // and each odd bit of mask will be 1 only if the char was >= 0x0800.
                        //
                        // Example for UTF-16 input "[ 0123 ] [ 1234 ] ...":
                        //
                        //            ,-- set if char[1] is >= 0x0800
                        //            |   ,-- set if char[0] is >= 0x0800
                        //            v   v
                        // mask = ... 1 1 0 1
                        //              ^   ^-- set if char[0] is non-ASCII
                        //              `-- set if char[1] is non-ASCII
                        //
                        // This means we can popcnt the number of set bits, and the result is the
                        // number of *additional* UTF-8 bytes that each UTF-16 code unit requires as
                        // it expands. This results in the wrong count for UTF-16 surrogate code
                        // units (we just counted that each individual code unit expands to 3 bytes,
                        // but in reality a well-formed UTF-16 surrogate pair expands to 4 bytes).
                        // We'll handle this in just a moment.
                        //
                        // For now, compute the popcnt but squirrel it away. We'll fold it in to the
                        // cumulative UTF-8 adjustment factor once we determine that there are no
                        // unpaired surrogates in our data. (Unpaired surrogates would invalidate
                        // our computed result and we'd have to throw it away.)

                        uint popcnt = (uint)BitOperations.PopCount(mask);

                        // Surrogates need to be special-cased for two reasons: (a) we need
                        // to account for the fact that we over-counted in the addition above;
                        // and (b) they require separate validation.
                        if (AdvSimd.Arm64.IsSupported)
                        {
                            utf16Data = AdvSimd.Add(utf16Data, vectorA800);
                            mask      = GetNonAsciiBytes(AdvSimd.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
                        }
                        else
                        {
                            utf16Data = Sse2.Add(utf16Data, vectorA800);
                            mask      = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
                        }

                        if (mask != 0)
                        {
                            // There's at least one UTF-16 surrogate code unit present.
                            // Since we performed a pmovmskb operation on the result of a 16-bit pcmpgtw,
                            // the resulting bits of 'mask' will occur in pairs:
                            // - 00 if the corresponding UTF-16 char was not a surrogate code unit;
                            // - 11 if the corresponding UTF-16 char was a surrogate code unit.
                            //
                            // A UTF-16 high/low surrogate code unit has the bit pattern [ 11011q## ######## ],
                            // where # is any bit; q = 0 represents a high surrogate, and q = 1 represents
                            // a low surrogate. Since we added 0xA800 in the vectorized operation above,
                            // our surrogate pairs will now have the bit pattern [ 10000q## ######## ].
                            // If we logical right-shift each word by 3, we'll end up with the bit pattern
                            // [ 00010000 q####### ], which means that we can immediately use pmovmskb to
                            // determine whether a given char was a high or a low surrogate.
                            //
                            // Therefore the resulting bits of 'mask2' will occur in pairs:
                            // - 00 if the corresponding UTF-16 char was a high surrogate code unit;
                            // - 01 if the corresponding UTF-16 char was a low surrogate code unit;
                            // - ## (garbage) if the corresponding UTF-16 char was not a surrogate code unit.
                            //   Since 'mask' already has 00 in these positions (since the corresponding char
                            //   wasn't a surrogate), "mask AND mask2 == 00" holds for these positions.

                            uint mask2;
                            if (AdvSimd.Arm64.IsSupported)
                            {
                                mask2 = GetNonAsciiBytes(AdvSimd.ShiftRightLogical(utf16Data, 3).AsByte());
                            }
                            else
                            {
                                mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte());
                            }

                            // 'lowSurrogatesMask' has its bits occur in pairs:
                            // - 01 if the corresponding char was a low surrogate char,
                            // - 00 if the corresponding char was a high surrogate char or not a surrogate at all.

                            uint lowSurrogatesMask = mask2 & mask;

                            // 'highSurrogatesMask' has its bits occur in pairs:
                            // - 01 if the corresponding char was a high surrogate char,
                            // - 00 if the corresponding char was a low surrogate char or not a surrogate at all.

                            uint highSurrogatesMask = (mask2 ^ 0b_0101_0101_0101_0101u /* flip all even-numbered bits 00 <-> 01 */) & mask;

                            Debug.Assert((highSurrogatesMask & lowSurrogatesMask) == 0,
                                         "A char cannot simultaneously be both a high and a low surrogate char.");

                            Debug.Assert(((highSurrogatesMask | lowSurrogatesMask) & 0b_1010_1010_1010_1010u) == 0,
                                         "Only even bits (no odd bits) of the masks should be set.");

                            // Now check that each high surrogate is followed by a low surrogate and that each
                            // low surrogate follows a high surrogate. We make an exception for the case where
                            // the final char of the vector is a high surrogate, since we can't perform validation
                            // on it until the next iteration of the loop when we hope to consume the matching
                            // low surrogate.

                            highSurrogatesMask <<= 2;
                            if ((ushort)highSurrogatesMask != lowSurrogatesMask)
                            {
                                goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                            }

                            if (highSurrogatesMask > ushort.MaxValue)
                            {
                                // There was a standalone high surrogate at the end of the vector.
                                // We'll adjust our counters so that we don't consider this char consumed.

                                highSurrogatesMask = (ushort)highSurrogatesMask; // don't allow stray high surrogate to be consumed by popcnt
                                popcnt            -= 2;                          // the '0xC000_0000' bits in the original mask are shifted out and discarded, so account for that here
                                pInputBuffer--;
                                inputLength++;
                            }

                            // If we're 64-bit, we can perform the zero-extension of the surrogate pairs count for
                            // free right now, saving the extension step a few lines below. If we're 32-bit, the
                            // conversion to nuint immediately below is a no-op, and we'll pay the cost of the real
                            // 64-bit extension a few lines below.
                            nuint surrogatePairsCountNuint = (uint)BitOperations.PopCount(highSurrogatesMask);

                            // 2 UTF-16 chars become 1 Unicode scalar

                            tempScalarCountAdjustment -= (int)surrogatePairsCountNuint;

                            // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                            // it'd be encoded as 3 UTF-8 code units, so our earlier popcnt computation
                            // assumes that the pair is encoded as 6 UTF-8 code units. Since each
                            // pair is in reality only encoded as 4 UTF-8 code units, we need to
                            // perform this adjustment now.

                            if (IntPtr.Size == 8)
                            {
                                // Since we've already zero-extended surrogatePairsCountNuint, we can directly
                                // sub + sub. It's more efficient than shl + sub.
                                tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
                                tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
                            }
                            else
                            {
                                // Take the hit of the 64-bit extension now.
                                tempUtf8CodeUnitCountAdjustment -= 2 * (uint)surrogatePairsCountNuint;
                            }
                        }

                        tempUtf8CodeUnitCountAdjustment += popcnt;
                        pInputBuffer += Vector128 <ushort> .Count;
                        inputLength  -= Vector128 <ushort> .Count;
                    } while (inputLength >= Vector128 <ushort> .Count);
                }
            }
            else if (Vector.IsHardwareAccelerated)
            {
                if (inputLength >= Vector <ushort> .Count)
                {
                    Vector <ushort> vector0080 = new Vector <ushort>(0x0080);
                    Vector <ushort> vector0400 = new Vector <ushort>(0x0400);
                    Vector <ushort> vector0800 = new Vector <ushort>(0x0800);
                    Vector <ushort> vectorD800 = new Vector <ushort>(0xD800);

                    do
                    {
                        // The 'twoOrMoreUtf8Bytes' and 'threeOrMoreUtf8Bytes' vectors will contain
                        // elements whose values are 0xFFFF (-1 as signed word) iff the corresponding
                        // UTF-16 code unit was >= 0x0080 and >= 0x0800, respectively. By summing these
                        // vectors, each element of the sum will contain one of three values:
                        //
                        // 0x0000 ( 0) = original char was 0000..007F
                        // 0xFFFF (-1) = original char was 0080..07FF
                        // 0xFFFE (-2) = original char was 0800..FFFF
                        //
                        // We'll negate them to produce a value 0..2 for each element, then sum all the
                        // elements together to produce the number of *additional* UTF-8 code units
                        // required to represent this UTF-16 data. This is similar to the popcnt step
                        // performed by the SSE2 code path. This will overcount surrogates, but we'll
                        // handle that shortly.

                        Vector <ushort>  utf16Data            = Unsafe.ReadUnaligned <Vector <ushort> >(pInputBuffer);
                        Vector <ushort>  twoOrMoreUtf8Bytes   = Vector.GreaterThanOrEqual(utf16Data, vector0080);
                        Vector <ushort>  threeOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0800);
                        Vector <nuint_t> sumVector            = (Vector <nuint_t>)(Vector <ushort> .Zero - twoOrMoreUtf8Bytes - threeOrMoreUtf8Bytes);

                        // We'll try summing by a natural word (rather than a 16-bit word) at a time,
                        // which should halve the number of operations we must perform.

                        nuint popcnt = 0;
                        for (int i = 0; i < Vector <nuint_t> .Count; i++)
                        {
                            popcnt += (nuint)sumVector[i];
                        }

                        uint popcnt32 = (uint)popcnt;
                        if (IntPtr.Size == 8)
                        {
                            popcnt32 += (uint)(popcnt >> 32);
                        }

                        // As in the SSE4.1 paths, compute popcnt but don't fold it in until we
                        // know there aren't any unpaired surrogates in the input data.

                        popcnt32 = (ushort)popcnt32 + (popcnt32 >> 16);

                        // Now check for surrogates.

                        utf16Data -= vectorD800;
                        Vector <ushort> surrogateChars = Vector.LessThan(utf16Data, vector0800);
                        if (surrogateChars != Vector <ushort> .Zero)
                        {
                            // There's at least one surrogate (high or low) UTF-16 code unit in
                            // the vector. We'll build up additional vectors: 'highSurrogateChars'
                            // and 'lowSurrogateChars', where the elements are 0xFFFF iff the original
                            // UTF-16 code unit was a high or low surrogate, respectively.

                            Vector <ushort> highSurrogateChars = Vector.LessThan(utf16Data, vector0400);
                            Vector <ushort> lowSurrogateChars  = Vector.AndNot(surrogateChars, highSurrogateChars);

                            // We want to make sure that each high surrogate code unit is followed by
                            // a low surrogate code unit and each low surrogate code unit follows a
                            // high surrogate code unit. Since we don't have an equivalent of pmovmskb
                            // or palignr available to us, we'll do this as a loop. We won't look at
                            // the very last high surrogate char element since we don't yet know if
                            // the next vector read will have a low surrogate char element.

                            if (lowSurrogateChars[0] != 0)
                            {
                                goto Error; // error: start of buffer contains standalone low surrogate char
                            }

                            ushort surrogatePairsCount = 0;
                            for (int i = 0; i < Vector <ushort> .Count - 1; i++)
                            {
                                surrogatePairsCount -= highSurrogateChars[i]; // turns into +1 or +0
                                if (highSurrogateChars[i] != lowSurrogateChars[i + 1])
                                {
                                    goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                                }
                            }

                            if (highSurrogateChars[Vector <ushort> .Count - 1] != 0)
                            {
                                // There was a standalone high surrogate at the end of the vector.
                                // We'll adjust our counters so that we don't consider this char consumed.

                                pInputBuffer--;
                                inputLength++;
                                popcnt32 -= 2;
                            }

                            nint surrogatePairsCountNint = (nint)surrogatePairsCount; // zero-extend to native int size

                            // 2 UTF-16 chars become 1 Unicode scalar

                            tempScalarCountAdjustment -= (int)surrogatePairsCountNint;

                            // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                            // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only
                            // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total),
                            // so we'll adjust this now.

                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                            tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                        }

                        tempUtf8CodeUnitCountAdjustment += popcnt32;
                        pInputBuffer += Vector <ushort> .Count;
                        inputLength  -= Vector <ushort> .Count;
                    } while (inputLength >= Vector <ushort> .Count);
                }
            }

NonVectorizedLoop:

            // Vectorization isn't supported on our current platform, or the input was too small to benefit
            // from vectorization, or we saw invalid UTF-16 data in the vectorized code paths and need to
            // drain remaining valid chars before we report failure.

            for (; inputLength > 0; pInputBuffer++, inputLength--)
            {
                uint thisChar = pInputBuffer[0];
                if (thisChar <= 0x7F)
                {
                    continue;
                }

                // Bump adjustment by +1 for U+0080..U+07FF; by +2 for U+0800..U+FFFF.
                // This optimistically assumes no surrogates, which we'll handle shortly.

                tempUtf8CodeUnitCountAdjustment += (thisChar + 0x0001_F800u) >> 16;

                if (!UnicodeUtility.IsSurrogateCodePoint(thisChar))
                {
                    continue;
                }

                // Found a surrogate char. Back out the adjustment we made above, then
                // try to consume the entire surrogate pair all at once. We won't bother
                // trying to interpret the surrogate pair as a scalar value; we'll only
                // validate that its bit pattern matches what's expected for a surrogate pair.

                tempUtf8CodeUnitCountAdjustment -= 2;

                if (inputLength == 1)
                {
                    goto Error; // input buffer too small to read a surrogate pair
                }

                thisChar = Unsafe.ReadUnaligned <uint>(pInputBuffer);
                if (((thisChar - (BitConverter.IsLittleEndian ? 0xDC00_D800u : 0xD800_DC00u)) & 0xFC00_FC00u) != 0)
                {
                    goto Error; // not a well-formed surrogate pair
                }

                tempScalarCountAdjustment--;          // 2 UTF-16 code units -> 1 scalar
                tempUtf8CodeUnitCountAdjustment += 2; // 2 UTF-16 code units -> 4 UTF-8 code units

                pInputBuffer++;                       // consumed one extra char
                inputLength--;
            }

Error:

            // Also used for normal return.

            utf8CodeUnitCountAdjustment = tempUtf8CodeUnitCountAdjustment;
            scalarCountAdjustment       = tempScalarCountAdjustment;
            return(pInputBuffer);
        }

Пример #13

Показать файл

Файл: Convolve.cs Проект: ianuub/Ryujinxxx

 /// <summary>
 /// Narrows the four 32-bit lanes of <paramref name="value"/> down to bytes using two
 /// successive saturating pack instructions (SSE4.1 dword-to-word, then SSE2 word-to-byte).
 /// </summary>
 /// <param name="value">The 32-bit integers to narrow; they end up in the lowest four result bytes.</param>
 /// <param name="zero">
 /// Vector used as the second operand of both pack steps, so its lanes fill the remaining
 /// result positions. Expected to be all zeroes judging by its name; this is not checked here.
 /// </param>
 /// <returns>The packed byte vector.</returns>
 /// <remarks>
 /// The second pack treats its inputs as signed 16-bit values, so a lane that the first pack
 /// left at 0x8000 or above is seen as negative and comes out as 0 rather than 255.
 /// </remarks>
 private static Vector128<byte> PackUnsignedSaturate(Vector128<int> value, Vector128<int> zero)
 {
     // Step 1: int -> ushort with unsigned saturation; 'zero' supplies the upper four words.
     Vector128<ushort> narrowedToWords = Sse41.PackUnsignedSaturate(value, zero);

     // Step 2: reinterpret both operands as signed words (no data change) and pack to bytes.
     Vector128<short> lowerWords = narrowedToWords.AsInt16();
     Vector128<short> upperWords = zero.AsInt16();

     return Sse2.PackUnsignedSaturate(lowerWords, upperWords);
 }

Пример #14

Показать файл

        /// <summary>
        /// Scans UTF-16 input and reports the index of the first char that is not accepted by
        /// <c>_allowedAsciiCodePoints</c>.
        /// </summary>
        /// <param name="pData">Pointer to the first char to inspect.</param>
        /// <param name="lengthInChars">Number of chars to inspect starting at <paramref name="pData"/>.</param>
        /// <returns>
        /// The index of the first char requiring escaping, or <paramref name="lengthInChars"/>
        /// when every char was accepted.
        /// </returns>
        private unsafe nuint GetIndexOfFirstCharToEncodeSsse3(char *pData, nuint lengthInChars)
        {
            // The core bitmap test is the one described on GetIndexOfFirstByteToEncodeSsse3.
            // The only extra step here is narrowing 16-bit chars to bytes first. Signed vs.
            // unsigned saturation makes no difference for that narrowing: any out-of-range
            // (non-ASCII) WORD collapses to 0x00 or 0x7F..0xFF, and the encoder forbids all
            // of those values.

            Debug.Assert(Ssse3.IsSupported);
            Debug.Assert(BitConverter.IsLittleEndian);

            Vector128<byte> zeroVector       = Vector128<byte>.Zero;
            Vector128<byte> lowThreeBitsMask = Vector128.Create((byte)0x7);
            Vector128<byte> rowBitSelectors  = Vector128.Create(1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0);
            Vector128<byte> allowedBitmap    = _allowedAsciiCodePoints.AsVector;

            nuint i = 0;

            // Phase 1: 16 chars per iteration, loaded as two 128-bit vectors and packed into one.
            // The bound is lengthInChars rounded down to a multiple of 16 (zero for short inputs).
            nuint endOf16CharBlocks = lengthInChars & unchecked((nuint)(nint)~0xF);

            for (; i < endOf16CharBlocks; i += 16)
            {
                Vector128<short> firstEight  = Sse2.LoadVector128((/* unaligned */ short *)(pData + i));
                Vector128<short> secondEight = Sse2.LoadVector128((/* unaligned */ short *)(pData + 8 + i));

                int rejected16 = ComputeRejectedMask(
                    Sse2.PackUnsignedSaturate(firstEight, secondEight),
                    allowedBitmap, rowBitSelectors, lowThreeBitsMask, zeroVector);

                if ((rejected16 & 0xFFFF) != 0)
                {
                    return IndexOfLowestRejected(i, rejected16);
                }
            }

            // Phase 2: one group of 8 chars, packed into the low 8 bytes (the high 8 come from zero).
            if ((lengthInChars & 8) != 0)
            {
                Vector128<short> eightChars = Sse2.LoadVector128((/* unaligned */ short *)(pData + i));

                int rejected8 = ComputeRejectedMask(
                    Sse2.PackUnsignedSaturate(eightChars, zeroVector.AsInt16()),
                    allowedBitmap, rowBitSelectors, lowThreeBitsMask, zeroVector);

                if ((byte)rejected8 != 0)
                {
                    return IndexOfLowestRejected(i, rejected8);
                }

                i += 8;
            }

            // Phase 3: one group of 4 chars via a 64-bit scalar load, packed into the low 4 bytes.
            // Only the low nibble of the pmovmskb result is meaningful; the rest is garbage.
            if ((lengthInChars & 4) != 0)
            {
                Vector128<short> fourChars = Sse2.LoadScalarVector128((/* unaligned */ ulong *)(pData + i)).AsInt16();

                int rejected4 = ComputeRejectedMask(
                    Sse2.PackUnsignedSaturate(fourChars, zeroVector.AsInt16()),
                    allowedBitmap, rowBitSelectors, lowThreeBitsMask, zeroVector);

                if ((rejected4 & 0xF) != 0)
                {
                    return IndexOfLowestRejected(i, rejected4);
                }

                i += 4;
            }

            // Phase 4: at most three chars remain; vectorizing them isn't worthwhile.
            if ((lengthInChars & 3) != 0)
            {
                Debug.Assert(lengthInChars - i <= 3);

                while (_allowedAsciiCodePoints.IsAllowedAsciiCodePoint(pData[i]))
                {
                    if (++i == lengthInChars)
                    {
                        break;
                    }
                }
            }

            return i;

            // Runs the shuffle-based bitmap test on 16 packed bytes and returns the pmovmskb
            // result: bit k is set when the AND of the two shuffled vectors is zero in byte k,
            // i.e. when the char at position k has to be escaped.
            static int ComputeRejectedMask(Vector128<byte> packed, Vector128<byte> bitmap, Vector128<byte> selectors, Vector128<byte> nibbleMask, Vector128<byte> zero)
            {
                Vector128<byte> bitmapShuffled    = Ssse3.Shuffle(bitmap, packed);
                Vector128<byte> selectorIndices   = Sse2.And(Sse2.ShiftRightLogical(packed.AsUInt32(), 4).AsByte(), nibbleMask);
                Vector128<byte> selectorsShuffled = Ssse3.Shuffle(selectors, selectorIndices);
                Vector128<byte> allowedBits       = Sse2.And(bitmapShuffled, selectorsShuffled);
                return Sse2.MoveMask(Sse2.CompareEqual(allowedBits, zero));
            }

            // The position of the lowest set bit, relative to the group start, is where
            // escaping must begin.
            static nuint IndexOfLowestRejected(nuint groupStart, int mask)
            {
                Debug.Assert(mask != 0);
                return groupStart + (uint)BitOperations.TrailingZeroCount(mask);
            }
        }

Пример #15

Показать файл

        /// <summary>
        /// Advances a pair of running checksum sums over the first <paramref name="len"/> bytes of
        /// <paramref name="buf"/>. Whole 32-byte blocks are processed with SSE2/SSSE3 intrinsics;
        /// any remaining bytes are processed one at a time.
        /// </summary>
        /// <param name="sum1">Running byte sum; read on entry and overwritten with the updated value.</param>
        /// <param name="sum2">Running sum of the successive <paramref name="sum1"/> values; read on entry and overwritten.</param>
        /// <param name="buf">Input bytes, consumed starting at index 0.</param>
        /// <param name="len">Number of bytes of <paramref name="buf"/> to process.</param>
        /// <remarks>
        /// No hardware-support check is made here; presumably the caller verifies SSSE3 availability
        /// before calling. Both sums are reduced modulo <c>Adler32Context.ADLER_MODULE</c>.
        /// </remarks>
        internal static void Step(ref ushort sum1, ref ushort sum2, byte[] buf, uint len)
        {
            const uint BytesPerBlock = 1 << 5;

            uint lowSum  = sum1;
            uint highSum = sum2;
            int  offset  = 0;

            // Split the input into whole blocks (vector path) and a tail (scalar path).
            uint pendingBlocks = len / BytesPerBlock;
            uint tailLength    = len - pendingBlocks * BytesPerBlock;

            while (pendingBlocks != 0)
            {
                // At most NMAX input bytes may be accumulated before the sums must be
                // reduced modulo ADLER_MODULE, which caps the blocks handled per batch.
                uint batch = Adler32Context.NMAX / BytesPerBlock;

                if (batch > pendingBlocks)
                {
                    batch = pendingBlocks;
                }

                pendingBlocks -= batch;

                // Per-byte weights: 32..17 for the first 16 bytes of a block, 16..1 for the last 16.
                Vector128<sbyte> weightsFirstHalf  = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17).AsSByte();
                Vector128<sbyte> weightsSecondHalf = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1).AsSByte();
                Vector128<byte>  zeroBytes         = Vector128<byte>.Zero;
                Vector128<short> unitWords         = Vector128.Create(1, 1, 1, 1, 1, 1, 1, 1);

                // Accumulators: lowAcc collects byte sums, highAcc the weighted sums, and
                // prefixAcc the low-sum value carried into each block (scaled by 32 afterwards).
                Vector128<uint> prefixAcc = Vector128.Create(lowSum * batch, 0, 0, 0);
                Vector128<uint> highAcc   = Vector128.Create(highSum, 0, 0, 0);
                Vector128<uint> lowAcc    = Vector128<uint>.Zero;

                do
                {
                    // Load the 32 bytes of this block as two 16-byte vectors.
                    Vector128<byte> firstHalf = Vector128.Create(BitConverter.ToUInt32(buf, offset),
                                                                 BitConverter.ToUInt32(buf, offset + 4),
                                                                 BitConverter.ToUInt32(buf, offset + 8),
                                                                 BitConverter.ToUInt32(buf, offset + 12)).AsByte();

                    Vector128<byte> secondHalf = Vector128.Create(BitConverter.ToUInt32(buf, offset + 16),
                                                                  BitConverter.ToUInt32(buf, offset + 20),
                                                                  BitConverter.ToUInt32(buf, offset + 24),
                                                                  BitConverter.ToUInt32(buf, offset + 28)).AsByte();

                    offset += 32;

                    // Fold in the byte sum accumulated by the previous blocks.
                    prefixAcc = Sse2.Add(prefixAcc, lowAcc);

                    // First half: horizontal byte sum for the low sum, weighted
                    // multiply-add for the high sum.
                    lowAcc = Sse2.Add(lowAcc, Sse2.SumAbsoluteDifferences(firstHalf, zeroBytes).AsUInt32());

                    // The intrinsics class name is kept fully qualified exactly as written in the
                    // original code (presumably to avoid a clash with another 'Ssse3' in scope).
                    Vector128<short> weightedFirst =
                        System.Runtime.Intrinsics.X86.Ssse3.MultiplyAddAdjacent(firstHalf, weightsFirstHalf);

                    highAcc = Sse2.Add(highAcc, Sse2.MultiplyAddAdjacent(weightedFirst, unitWords).AsUInt32());

                    // Second half: same treatment with the lower weights.
                    lowAcc = Sse2.Add(lowAcc, Sse2.SumAbsoluteDifferences(secondHalf, zeroBytes).AsUInt32());

                    Vector128<short> weightedSecond =
                        System.Runtime.Intrinsics.X86.Ssse3.MultiplyAddAdjacent(secondHalf, weightsSecondHalf);

                    highAcc = Sse2.Add(highAcc, Sse2.MultiplyAddAdjacent(weightedSecond, unitWords).AsUInt32());
                } while (--batch != 0);

                // Each carried-in low sum counts once per byte of its block, hence the shift by 5 (x32).
                highAcc = Sse2.Add(highAcc, Sse2.ShiftLeftLogical(prefixAcc, 5));

                // Horizontally add the four 32-bit lanes of each accumulator and take lane 0.
                lowAcc   = Sse2.Add(lowAcc, Sse2.Shuffle(lowAcc, 177));
                lowAcc   = Sse2.Add(lowAcc, Sse2.Shuffle(lowAcc, 78));
                lowSum  += (uint)Sse2.ConvertToInt32(lowAcc.AsInt32());
                highAcc  = Sse2.Add(highAcc, Sse2.Shuffle(highAcc, 177));
                highAcc  = Sse2.Add(highAcc, Sse2.Shuffle(highAcc, 78));
                highSum  = (uint)Sse2.ConvertToInt32(highAcc.AsInt32());

                // Reduce both sums before the next batch.
                lowSum  %= Adler32Context.ADLER_MODULE;
                highSum %= Adler32Context.ADLER_MODULE;
            }

            // Tail: fewer than 32 bytes remain; process them one byte at a time.
            if (tailLength != 0)
            {
                for (; tailLength != 0; tailLength--)
                {
                    lowSum  += buf[offset++];
                    highSum += lowSum;
                }

                if (lowSum >= Adler32Context.ADLER_MODULE)
                {
                    lowSum -= Adler32Context.ADLER_MODULE;
                }

                highSum %= Adler32Context.ADLER_MODULE;
            }

            // Write the updated sums back to the caller.
            sum1 = (ushort)(lowSum & 0xFFFF);
            sum2 = (ushort)(highSum & 0xFFFF);
        }