private static unsafe int GetEndPointSelectionErrorFast(ReadOnlySpan <uint> tile, int subsetCount, int partition, int w, int h, int maxError) { byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition]; Span <RgbaColor8> minColors = stackalloc RgbaColor8[subsetCount]; Span <RgbaColor8> maxColors = stackalloc RgbaColor8[subsetCount]; BC67Utils.GetMinMaxColors(partitionTable, tile, w, h, minColors, maxColors, subsetCount); Span <uint> endPoints0 = stackalloc uint[subsetCount]; Span <uint> endPoints1 = stackalloc uint[subsetCount]; SelectEndPointsFast(partitionTable, tile, w, h, subsetCount, minColors, maxColors, endPoints0, endPoints1, uint.MaxValue); Span <RgbaColor32> palette = stackalloc RgbaColor32[8]; int errorSum = 0; for (int subset = 0; subset < subsetCount; subset++) { RgbaColor32 blockDir = maxColors[subset].GetColor32() - minColors[subset].GetColor32(); int sum = blockDir.R + blockDir.G + blockDir.B + blockDir.A; if (sum != 0) { blockDir = (blockDir << 6) / new RgbaColor32(sum); } uint c0 = endPoints0[subset]; uint c1 = endPoints1[subset]; int pBit0 = GetPBit(c0, 6, 0); int pBit1 = GetPBit(c1, 6, 0); c0 = BC67Utils.Quantize(RgbaColor8.FromUInt32(c0), 6, 0, pBit0).ToUInt32(); c1 = BC67Utils.Quantize(RgbaColor8.FromUInt32(c1), 6, 0, pBit1).ToUInt32(); if (Sse41.IsSupported) { Vector128 <byte> c0Rep = Vector128.Create(c0).AsByte(); Vector128 <byte> c1Rep = Vector128.Create(c1).AsByte(); Vector128 <byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep); Vector128 <byte> rWeights; Vector128 <byte> lWeights; fixed(byte *pWeights = BC67Tables.Weights[1], pInvWeights = BC67Tables.InverseWeights[1]) { rWeights = Sse2.LoadScalarVector128((ulong *)pWeights).AsByte(); lWeights = Sse2.LoadScalarVector128((ulong *)pInvWeights).AsByte(); } Vector128 <byte> iWeights = Sse2.UnpackLow(rWeights, lWeights); Vector128 <byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte(); Vector128 <byte> iWeights23 = Sse2.UnpackHigh(iWeights.AsInt16(), 
iWeights.AsInt16()).AsByte(); Vector128 <byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); Vector128 <byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); Vector128 <byte> iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); Vector128 <byte> iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
/// <summary>
/// Converts the eight ASCII decimal digits at <paramref name="chars"/> into their numeric value
/// using SIMD multiply-add steps.
/// </summary>
/// <param name="chars">
/// Pointer to the digit characters. A full 16-byte vector is loaded from this address, so the
/// work is done for sixteen bytes even though only the first eight contribute to the result.
/// </param>
/// <returns>The lowest 32-bit lane of the final multiply-add, i.e. the value of the first eight digits.</returns>
private static uint32_t parse_eight_digits_unrolled(bytechar *chars)
{
    // Turn the ASCII characters into digit values by subtracting '0' from every byte.
    Vector128<sbyte> asciiZero = Vector128.Create((bytechar)'0');
    Vector128<sbyte> rawChars = Sse2.LoadVector128(chars);
    Vector128<byte> digits = Sse2.Subtract(rawChars, asciiZero).AsByte();

    // Each step multiplies adjacent lanes by a weight vector and adds them pairwise.
    // NOTE(review): mul_1_10, mul_1_100 and mul_1_10000 are declared outside this block;
    // presumably they hold the 10/1, 100/1 and 10000/1 weights - confirm at their definition.
    Vector128<short> twoDigitGroups = Ssse3.MultiplyAddAdjacent(digits, mul_1_10);
    Vector128<int> fourDigitGroups = Sse2.MultiplyAddAdjacent(twoDigitGroups, mul_1_100);

    // Narrow the 32-bit sums back to 16 bits so one more multiply-add can combine them.
    Vector128<ushort> narrowed = Sse41.PackUnsignedSaturate(fourDigitGroups, fourDigitGroups);
    Vector128<int> eightDigitGroups = Sse2.MultiplyAddAdjacent(narrowed.AsInt16(), mul_1_10000);

    // Only lane 0 (the sum of the first 8 digits) is returned; the rest is dropped.
    return Sse2.ConvertToUInt32(eightDigitGroups.AsUInt32());
}
/// <summary>
/// Adds two 128-bit vectors element-wise, selecting the SSE/SSE2 add instruction that matches
/// the element type <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">Element type: one of the 8/16/32/64-bit integer types, float or double.</typeparam>
/// <param name="left">First operand.</param>
/// <param name="right">Second operand.</param>
/// <returns>The element-wise sum, reinterpreted back to <c>Vector128&lt;T&gt;</c>.</returns>
/// <exception cref="NotSupportedException"><typeparamref name="T"/> is not one of the handled element types.</exception>
public static Vector128<T> Vector128Add<T>(Vector128<T> left, Vector128<T> right) where T : struct
{
    // 8-bit lanes
    if (typeof(T) == typeof(byte))
    {
        Vector128<byte> sum = Sse2.Add(left.AsByte(), right.AsByte());
        return sum.As<byte, T>();
    }

    if (typeof(T) == typeof(sbyte))
    {
        Vector128<sbyte> sum = Sse2.Add(left.AsSByte(), right.AsSByte());
        return sum.As<sbyte, T>();
    }

    // 16-bit lanes
    if (typeof(T) == typeof(short))
    {
        Vector128<short> sum = Sse2.Add(left.AsInt16(), right.AsInt16());
        return sum.As<short, T>();
    }

    if (typeof(T) == typeof(ushort))
    {
        Vector128<ushort> sum = Sse2.Add(left.AsUInt16(), right.AsUInt16());
        return sum.As<ushort, T>();
    }

    // 32-bit lanes
    if (typeof(T) == typeof(int))
    {
        Vector128<int> sum = Sse2.Add(left.AsInt32(), right.AsInt32());
        return sum.As<int, T>();
    }

    if (typeof(T) == typeof(uint))
    {
        Vector128<uint> sum = Sse2.Add(left.AsUInt32(), right.AsUInt32());
        return sum.As<uint, T>();
    }

    // 64-bit lanes
    if (typeof(T) == typeof(long))
    {
        Vector128<long> sum = Sse2.Add(left.AsInt64(), right.AsInt64());
        return sum.As<long, T>();
    }

    if (typeof(T) == typeof(ulong))
    {
        Vector128<ulong> sum = Sse2.Add(left.AsUInt64(), right.AsUInt64());
        return sum.As<ulong, T>();
    }

    // Floating point lanes (single precision uses the SSE instruction, double uses SSE2)
    if (typeof(T) == typeof(float))
    {
        Vector128<float> sum = Sse.Add(left.AsSingle(), right.AsSingle());
        return sum.As<float, T>();
    }

    if (typeof(T) == typeof(double))
    {
        Vector128<double> sum = Sse2.Add(left.AsDouble(), right.AsDouble());
        return sum.As<double, T>();
    }

    throw new NotSupportedException();
}
/// <summary>
/// Expands the 3-bit palette indices packed in <paramref name="rI"/> into palette bytes,
/// writing one byte per texel to <paramref name="output"/>.
/// </summary>
/// <param name="output">Destination; the vector path writes its first 16 bytes in one store.</param>
/// <param name="rPal">Palette; the vector path loads its first 8 bytes as the lookup table.</param>
/// <param name="rI">Packed indices, 3 bits per texel, lowest bits first.</param>
private unsafe static void BCnDecodeTileAlpha(Span<byte> output, Span<byte> rPal, ulong rI)
{
    if (!Avx2.IsSupported)
    {
        // Scalar fallback: peel off three index bits per texel.
        for (int texel = 0; texel < BlockWidth * BlockHeight; texel++, rI >>= 3)
        {
            output[texel] = rPal[(int)(rI & 7)];
        }

        return;
    }

    Span<Vector128<byte>> outputVectors = MemoryMarshal.Cast<byte, Vector128<byte>>(output);

    Vector128<uint> laneShifts = Vector128.Create(0u, 3u, 6u, 9u);
    Vector128<uint> indexMask = Vector128.Create(7u);

    // The 8 palette entries become the lookup table for the byte shuffle below.
    Vector128<byte> lookupTable;

    fixed (byte* palettePtr = rPal)
    {
        lookupTable = Sse2.LoadScalarVector128((ulong*)palettePtr).AsByte();
    }

    // Index bits 0..23 feed texels 0-7, index bits 24..47 feed texels 8-15.
    Vector128<uint> lowerBits = Vector128.Create((uint)rI);
    Vector128<uint> upperBits = Vector128.Create((uint)(rI >> 24));

    // One texel index per 32-bit lane; shifting a further 12 bits yields the next four texels.
    Vector128<uint> texels0To3 = Avx2.ShiftRightLogicalVariable(lowerBits, laneShifts);
    Vector128<uint> texels8To11 = Avx2.ShiftRightLogicalVariable(upperBits, laneShifts);
    Vector128<uint> texels4To7 = Sse2.ShiftRightLogical(texels0To3, 12);
    Vector128<uint> texels12To15 = Sse2.ShiftRightLogical(texels8To11, 12);

    texels0To3 = Sse2.And(texels0To3, indexMask);
    texels8To11 = Sse2.And(texels8To11, indexMask);
    texels4To7 = Sse2.And(texels4To7, indexMask);
    texels12To15 = Sse2.And(texels12To15, indexMask);

    // Narrow 32 -> 16 -> 8 bits so all sixteen indices sit in a single byte vector.
    Vector128<ushort> texels0To7 = Sse41.PackUnsignedSaturate(texels0To3.AsInt32(), texels4To7.AsInt32());
    Vector128<ushort> texels8To15 = Sse41.PackUnsignedSaturate(texels8To11.AsInt32(), texels12To15.AsInt32());

    Vector128<byte> texelIndices = Sse2.PackUnsignedSaturate(texels0To7.AsInt16(), texels8To15.AsInt16());

    outputVectors[0] = Ssse3.Shuffle(lookupTable, texelIndices);
}
public void SetCoeffs(Span <short> coeffs) { #if SUPPORTS_RUNTIME_INTRINSICS if (Sse2.IsSupported) { ref short coeffsRef = ref MemoryMarshal.GetReference(coeffs); Vector128 <byte> c0 = Unsafe.As <short, Vector128 <byte> >(ref coeffsRef); Vector128 <byte> c1 = Unsafe.As <short, Vector128 <byte> >(ref Unsafe.Add(ref coeffsRef, 8)); // Use SSE2 to compare 16 values with a single instruction. Vector128 <sbyte> m0 = Sse2.PackSignedSaturate(c0.AsInt16(), c1.AsInt16()); Vector128 <sbyte> m1 = Sse2.CompareEqual(m0, Vector128 <sbyte> .Zero); // Get the comparison results as a bitmask into 16bits. Negate the mask to get // the position of entries that are not equal to zero. We don't need to mask // out least significant bits according to res->first, since coeffs[0] is 0 // if res->first > 0. uint mask = 0x0000ffffu ^ (uint)Sse2.MoveMask(m1); // The position of the most significant non-zero bit indicates the position of // the last non-zero value. this.Last = mask != 0 ? Numerics.Log2(mask) : -1; }
/// <summary>
/// Divides each signed 16-bit element of <paramref name="dividend"/> by the corresponding element of
/// <paramref name="divisor"/>. The quotient is computed in single precision via a refined reciprocal
/// and converted back to an integer with truncation.
/// </summary>
/// <param name="dividend">The eight numerators.</param>
/// <param name="divisor">The eight denominators.</param>
/// <returns>The eight 16-bit quotients.</returns>
public static Vector128<short> Divide(this Vector128<short> dividend, Vector128<short> divisor)
{
    // Based on https://stackoverflow.com/a/51458507/347870

    // Convert to two 32-bit integers: the odd 16-bit elements ("hi") and the even ones ("lo"),
    // both sign-extended to 32 bits.
    Vector128<int> a_hi_epi32 = Sse2.ShiftRightArithmetic(dividend.AsInt32(), 16);
    Vector128<int> a_lo_epi32_shift = Sse2.ShiftLeftLogical(dividend.AsInt32(), 16);
    Vector128<int> a_lo_epi32 = Sse2.ShiftRightArithmetic(a_lo_epi32_shift, 16);
    Vector128<int> b_hi_epi32 = Sse2.ShiftRightArithmetic(divisor.AsInt32(), 16);
    Vector128<int> b_lo_epi32_shift = Sse2.ShiftLeftLogical(divisor.AsInt32(), 16);
    Vector128<int> b_lo_epi32 = Sse2.ShiftRightArithmetic(b_lo_epi32_shift, 16);

    // Convert to 32-bit floats
    Vector128<float> a_hi = Sse2.ConvertToVector128Single(a_hi_epi32);
    Vector128<float> a_lo = Sse2.ConvertToVector128Single(a_lo_epi32);
    Vector128<float> b_hi = Sse2.ConvertToVector128Single(b_hi_epi32);
    Vector128<float> b_lo = Sse2.ConvertToVector128Single(b_lo_epi32);

    // Calculate the (approximate) reciprocal
    Vector128<float> b_hi_rcp = Sse.Reciprocal(b_hi);
    Vector128<float> b_lo_rcp = Sse.Reciprocal(b_lo);

    // One refinement step: rcp' = rcp * (two - rcp * b). The constant is slightly above 2.
    Vector128<float> b_hi_inv_1;
    Vector128<float> b_lo_inv_1;
    Vector128<float> two = Vector128.Create(2.00000051757f);
    if (Fma.IsSupported)
    {
        b_hi_inv_1 = Fma.MultiplyAddNegated(b_hi_rcp, b_hi, two);
        b_lo_inv_1 = Fma.MultiplyAddNegated(b_lo_rcp, b_lo, two);
    }
    else
    {
        Vector128<float> b_mul_hi = Sse.Multiply(b_hi_rcp, b_hi);
        Vector128<float> b_mul_lo = Sse.Multiply(b_lo_rcp, b_lo);
        b_hi_inv_1 = Sse.Subtract(two, b_mul_hi);
        b_lo_inv_1 = Sse.Subtract(two, b_mul_lo);
    }

    // Compensate for the loss
    Vector128<float> b_hi_rcp_1 = Sse.Multiply(b_hi_rcp, b_hi_inv_1);
    Vector128<float> b_lo_rcp_1 = Sse.Multiply(b_lo_rcp, b_lo_inv_1);

    // Perform the division by multiplication
    Vector128<float> hi = Sse.Multiply(a_hi, b_hi_rcp_1);
    Vector128<float> lo = Sse.Multiply(a_lo, b_lo_rcp_1);

    // Convert back to integers
    Vector128<int> hi_epi32 = Sse2.ConvertToVector128Int32WithTruncation(hi);
    Vector128<int> lo_epi32 = Sse2.ConvertToVector128Int32WithTruncation(lo);

    // Move the "hi" quotients into the upper 16 bits of each 32-bit lane
    Vector128<int> hi_epi32_shift = Sse2.ShiftLeftLogical(hi_epi32, 16);

    // Blend the bits, and return
    if (Sse41.IsSupported)
    {
        // 0xAA selects the odd 16-bit elements from hi_epi32_shift and the even ones from lo_epi32.
        return Sse41.Blend(lo_epi32.AsInt16(), hi_epi32_shift.AsInt16(), 0xAA);
    }
    else
    {
        // Keep only the low 16 bits of every "lo" quotient. The mask has to be 0x0000FFFF per
        // 32-bit lane: a negative quotient is sign-extended, and its upper 16 bits would otherwise
        // overwrite the "hi" quotient in the OR below.
        Vector128<int> lowHalfMask = Vector128.Create(0x0000FFFF);
        Vector128<int> lo_epi32_mask = Sse2.And(lo_epi32, lowHalfMask);
        return Sse2.Or(hi_epi32_shift, lo_epi32_mask).AsInt16();
    }
}
/// <summary>
/// Reads a two-plane surface (one byte per sample in the first plane, two interleaved bytes per
/// sample in the second) and converts it into a <see cref="Surface"/>.
/// </summary>
/// <remarks>
/// For every output pixel the first-plane sample goes to R and the two second-plane bytes go to
/// G and B. Two horizontally adjacent pixels and two consecutive rows share one second-plane
/// sample. Samples are widened from 8 bits by <c>Upsample</c> (scalar code) or a left shift by 2
/// (vector code).
/// </remarks>
/// <param name="rm">Resource manager supplying the memory manager and the surface pool.</param>
/// <param name="config">Slot surface configuration, forwarded to <c>ReadSurface</c>.</param>
/// <param name="offsets">Plane offsets, forwarded to <c>ReadSurface</c>.</param>
/// <returns>The converted surface, allocated from <c>rm.SurfacePool</c>.</returns>
private unsafe static Surface ReadNv12(ResourceManager rm, ref SlotSurfaceConfig config, ref PlaneOffsets offsets)
{
    InputSurface input = ReadSurface(rm.Gmm, ref config, ref offsets, 1, 2);

    int width = input.Width;
    int height = input.Height;

    int yStride = GetPitch(width, 1);
    int uvStride = GetPitch(input.UvWidth, 2);

    Surface output = new Surface(rm.SurfacePool, width, height);

    if (Sse41.IsSupported)
    {
        // Reorders every 4-byte group [b0, b1, b2, b3] into [b0, b2, b3, b1]. After the unpacks
        // below a group is [Y, 0, c0, c1], so this yields [Y, c0, c1, 0].
        Vector128<byte> shufMask = Vector128.Create(
            (byte)0, (byte)2, (byte)3, (byte)1,
            (byte)4, (byte)6, (byte)7, (byte)5,
            (byte)8, (byte)10, (byte)11, (byte)9,
            (byte)12, (byte)14, (byte)15, (byte)13);

        // Sets the 4th 16-bit channel (alpha) of each pixel to 0xff before the left shift.
        Vector128<short> alphaMask = Vector128.Create(0xffUL << 48).AsInt16();

        int yStrideGap = yStride - width;

        int widthTrunc = width & ~0xf;

        fixed (Pixel* dstPtr = output.Data)
        {
            Pixel* op = dstPtr;

            fixed (byte* src0Ptr = input.Buffer0, src1Ptr = input.Buffer1)
            {
                byte* i0p = src0Ptr;

                for (int y = 0; y < height; y++)
                {
                    // Two consecutive rows read the same second-plane row.
                    byte* i1p = src1Ptr + (y >> 1) * uvStride;

                    int x = 0;

                    // 16 pixels per iteration: 16 first-plane bytes and 16 second-plane bytes (8 pairs).
                    for (; x < widthTrunc; x += 16, i0p += 16, i1p += 16)
                    {
                        Vector128<short> ya0 = Sse41.ConvertToVector128Int16(i0p);
                        Vector128<short> ya1 = Sse41.ConvertToVector128Int16(i0p + 8);

                        Vector128<byte> uv = Sse2.LoadVector128(i1p);

                        // Duplicate every 2-byte pair so that two neighbouring pixels share it.
                        Vector128<short> uv0 = Sse2.UnpackLow(uv.AsInt16(), uv.AsInt16());
                        Vector128<short> uv1 = Sse2.UnpackHigh(uv.AsInt16(), uv.AsInt16());

                        Vector128<short> rgba0 = Sse2.UnpackLow(ya0, uv0);
                        Vector128<short> rgba1 = Sse2.UnpackHigh(ya0, uv0);
                        Vector128<short> rgba2 = Sse2.UnpackLow(ya1, uv1);
                        Vector128<short> rgba3 = Sse2.UnpackHigh(ya1, uv1);

                        rgba0 = Ssse3.Shuffle(rgba0.AsByte(), shufMask).AsInt16();
                        rgba1 = Ssse3.Shuffle(rgba1.AsByte(), shufMask).AsInt16();
                        rgba2 = Ssse3.Shuffle(rgba2.AsByte(), shufMask).AsInt16();
                        rgba3 = Ssse3.Shuffle(rgba3.AsByte(), shufMask).AsInt16();

                        // Widen every byte channel to 16 bits (two pixels per vector).
                        Vector128<short> rgba16_0 = Sse41.ConvertToVector128Int16(rgba0.AsByte());
                        Vector128<short> rgba16_1 = Sse41.ConvertToVector128Int16(HighToLow(rgba0.AsByte()));
                        Vector128<short> rgba16_2 = Sse41.ConvertToVector128Int16(rgba1.AsByte());
                        Vector128<short> rgba16_3 = Sse41.ConvertToVector128Int16(HighToLow(rgba1.AsByte()));
                        Vector128<short> rgba16_4 = Sse41.ConvertToVector128Int16(rgba2.AsByte());
                        Vector128<short> rgba16_5 = Sse41.ConvertToVector128Int16(HighToLow(rgba2.AsByte()));
                        Vector128<short> rgba16_6 = Sse41.ConvertToVector128Int16(rgba3.AsByte());
                        Vector128<short> rgba16_7 = Sse41.ConvertToVector128Int16(HighToLow(rgba3.AsByte()));

                        rgba16_0 = Sse2.Or(rgba16_0, alphaMask);
                        rgba16_1 = Sse2.Or(rgba16_1, alphaMask);
                        rgba16_2 = Sse2.Or(rgba16_2, alphaMask);
                        rgba16_3 = Sse2.Or(rgba16_3, alphaMask);
                        rgba16_4 = Sse2.Or(rgba16_4, alphaMask);
                        rgba16_5 = Sse2.Or(rgba16_5, alphaMask);
                        rgba16_6 = Sse2.Or(rgba16_6, alphaMask);
                        rgba16_7 = Sse2.Or(rgba16_7, alphaMask);

                        rgba16_0 = Sse2.ShiftLeftLogical(rgba16_0, 2);
                        rgba16_1 = Sse2.ShiftLeftLogical(rgba16_1, 2);
                        rgba16_2 = Sse2.ShiftLeftLogical(rgba16_2, 2);
                        rgba16_3 = Sse2.ShiftLeftLogical(rgba16_3, 2);
                        rgba16_4 = Sse2.ShiftLeftLogical(rgba16_4, 2);
                        rgba16_5 = Sse2.ShiftLeftLogical(rgba16_5, 2);
                        rgba16_6 = Sse2.ShiftLeftLogical(rgba16_6, 2);
                        rgba16_7 = Sse2.ShiftLeftLogical(rgba16_7, 2);

                        Sse2.Store((short*)(op + (uint)x + 0), rgba16_0);
                        Sse2.Store((short*)(op + (uint)x + 2), rgba16_1);
                        Sse2.Store((short*)(op + (uint)x + 4), rgba16_2);
                        Sse2.Store((short*)(op + (uint)x + 6), rgba16_3);
                        Sse2.Store((short*)(op + (uint)x + 8), rgba16_4);
                        Sse2.Store((short*)(op + (uint)x + 10), rgba16_5);
                        Sse2.Store((short*)(op + (uint)x + 12), rgba16_6);
                        Sse2.Store((short*)(op + (uint)x + 14), rgba16_7);
                    }

                    // Scalar tail for the last (width % 16) pixels. It always starts on an even x
                    // because widthTrunc is a multiple of 16.
                    for (; x < width; x++)
                    {
                        Pixel* px = op + (uint)x;

                        px->R = Upsample(*i0p++);
                        px->G = Upsample(*i1p);
                        px->B = Upsample(*(i1p + 1));
                        px->A = 0x3ff;

                        // Pixels 2k and 2k+1 share one second-plane pair, so move on to the next
                        // pair only after the odd pixel. This matches the vector loop above and
                        // the (x & ~1) indexing of the fallback below.
                        if ((x & 1) != 0)
                        {
                            i1p += 2;
                        }
                    }

                    op += width;
                    i0p += yStrideGap;
                }
            }
        }
    }
    else
    {
        for (int y = 0; y < height; y++)
        {
            int uvBase = (y >> 1) * uvStride;

            for (int x = 0; x < width; x++)
            {
                output.SetR(x, y, Upsample(input.Buffer0[y * yStride + x]));

                int uvOffs = uvBase + (x & ~1);

                output.SetG(x, y, Upsample(input.Buffer1[uvOffs]));
                output.SetB(x, y, Upsample(input.Buffer1[uvOffs + 1]));
                output.SetA(x, y, 0x3ff);
            }
        }
    }

    return output;
}
/// <summary>
/// Validates <paramref name="inputLength"/> UTF-16 code units starting at <paramref name="pInputBuffer"/>
/// and returns a pointer to the first code unit that is not part of well-formed UTF-16.
/// </summary>
/// <param name="pInputBuffer">Pointer to the first UTF-16 code unit to validate.</param>
/// <param name="inputLength">Number of UTF-16 code units available at <paramref name="pInputBuffer"/>.</param>
/// <param name="utf8CodeUnitCountAdjustment">
/// Receives the value to add to the number of validated UTF-16 code units to obtain the number of
/// UTF-8 code units needed to encode them.
/// </param>
/// <param name="scalarCountAdjustment">
/// Receives the value to add to the number of validated UTF-16 code units to obtain the number of
/// Unicode scalar values they represent.
/// </param>
/// <returns>
/// A pointer to the first invalid code unit, or to the end of the buffer when the whole input is well-formed.
/// </returns>
private static unsafe char* GetPointerToFirstInvalidChar(char* pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment)
{
    // First, we'll handle the common case of all-ASCII. If this is able to
    // consume the entire buffer, we'll skip the remainder of this method's logic.

    int numAsciiCharsConsumedJustNow = (int)GetIndexOfFirstNonAsciiChar_Sse2(pInputBuffer, (uint)inputLength);

    // From here on 'inputLength' always means "code units remaining at pInputBuffer", so it
    // has to shrink together with the pointer; otherwise the loops below would read past
    // the end of the buffer by the length of the ASCII prefix.

    pInputBuffer += (uint)numAsciiCharsConsumedJustNow;
    inputLength -= numAsciiCharsConsumedJustNow;

    if (inputLength == 0)
    {
        utf8CodeUnitCountAdjustment = 0;
        scalarCountAdjustment = 0;
        return pInputBuffer;
    }

    // If we got here, it means we saw some non-ASCII data, so within our
    // vectorized code paths below we'll handle all non-surrogate UTF-16
    // code points branchlessly. We'll only branch if we see surrogates.
    //
    // We still optimistically assume the data is mostly ASCII. This means that the
    // number of UTF-8 code units and the number of scalars almost matches the number
    // of UTF-16 code units. As we go through the input and find non-ASCII
    // characters, we'll keep track of these "adjustment" fixups. To get the
    // total number of UTF-8 code units required to encode the input data, add
    // the UTF-8 code unit count adjustment to the number of UTF-16 code units
    // seen. To get the total number of scalars present in the input data,
    // add the scalar count adjustment to the number of UTF-16 code units seen.

    long tempUtf8CodeUnitCountAdjustment = 0;
    int tempScalarCountAdjustment = 0;

    if (Sse41.IsSupported)
    {
        if (inputLength >= Vector128<ushort>.Count)
        {
            Vector128<ushort> vector0080 = Vector128.Create((ushort)0x80);
            Vector128<ushort> vector0800 = Sse2.ShiftLeftLogical(vector0080, 4); // = 0x0800
            Vector128<ushort> vectorA800 = Vector128.Create((ushort)0xA800);
            Vector128<short> vector8800 = Vector128.Create(unchecked((short)0x8800));

            do
            {
                Vector128<ushort> utf16Data = Sse2.LoadVector128((ushort*)pInputBuffer);

                uint mask = (uint)Sse2.MoveMask(
                    Sse2.Or(
                        Sse2.ShiftLeftLogical(Sse41.Min(utf16Data, vector0080), 8),
                        Sse2.ShiftRightLogical(Sse41.Min(utf16Data, vector0800), 4)).AsByte());

                // Each odd bit of mask will be 1 only if the char was >= 0x0080,
                // and each even bit of mask will be 1 only if the char was >= 0x0800.
                //
                // Example for UTF-16 input "[ 0123 ] [ 1234 ] ...":
                //
                //            ,-- set if char[1] is non-ASCII
                //            |   ,-- set if char[0] is non-ASCII
                //            v   v
                // mask = ... 1 1 1 0
                //              ^   ^-- set if char[0] is >= 0x800
                //              `-- set if char[1] is >= 0x800
                //
                // This means we can popcnt the number of set bits, and the result is the
                // number of *additional* UTF-8 bytes that each UTF-16 code unit requires as
                // it expands. This results in the wrong count for UTF-16 surrogate code
                // units (we just counted that each individual code unit expands to 3 bytes,
                // but in reality a well-formed UTF-16 surrogate pair expands to 4 bytes).
                // We'll handle this in just a moment.
                //
                // The count is kept in a local and only folded into the running total once
                // this vector has passed surrogate validation. If validation fails, the
                // non-vectorized loop re-processes these chars from the start of the vector
                // and must not find them already counted.

                long utf8AdjustmentForThisVector = BitOperations.PopCount(mask);

                // Surrogates need to be special-cased for two reasons: (a) we need
                // to account for the fact that we over-counted in the addition above;
                // and (b) they require separate validation.

                utf16Data = Sse2.Add(utf16Data, vectorA800);
                mask = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());

                if (mask != 0)
                {
                    // There's at least one UTF-16 surrogate code unit present.
                    // Since we performed a pmovmskb operation on the result of a 16-bit pcmpgtw,
                    // the resulting bits of 'mask' will occur in pairs:
                    // - 00 if the corresponding UTF-16 char was not a surrogate code unit;
                    // - 11 if the corresponding UTF-16 char was a surrogate code unit.
                    //
                    // A UTF-16 high/low surrogate code unit has the bit pattern [ 11011q## ######## ],
                    // where # is any bit; q = 0 represents a high surrogate, and q = 1 represents
                    // a low surrogate. Since we added 0xA800 in the vectorized operation above,
                    // our surrogate pairs will now have the bit pattern [ 10000q## ######## ].
                    // If we logical right-shift each word by 3, we'll end up with the bit pattern
                    // [ 00010000 q####### ], which means that we can immediately use pmovmskb to
                    // determine whether a given char was a high or a low surrogate.
                    //
                    // Therefore the resulting bits of 'mask2' will occur in pairs:
                    // - 00 if the corresponding UTF-16 char was a high surrogate code unit;
                    // - 01 if the corresponding UTF-16 char was a low surrogate code unit;
                    // - ## (garbage) if the corresponding UTF-16 char was not a surrogate code unit.

                    uint mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte());

                    // 01 only if was a low surrogate char, else 00
                    uint lowSurrogatesMask = mask2 & mask;

                    // 01 only if was a high surrogate char, else 00. The even bits are flipped
                    // first (00 <-> 01) and the result is then ANDed with 'mask', so the garbage
                    // bits of non-surrogate lanes are always cleared.
                    uint highSurrogatesMask = (mask2 ^ 0x5555u) & mask;

                    // Now check that each high surrogate is followed by a low surrogate and that each
                    // low surrogate follows a high surrogate. We make an exception for the case where
                    // the final char of the vector is a high surrogate, since we can't perform validation
                    // on it until the next iteration of the loop when we hope to consume the matching
                    // low surrogate.

                    highSurrogatesMask <<= 2;
                    if ((ushort)highSurrogatesMask != lowSurrogatesMask)
                    {
                        goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                    }

                    if (highSurrogatesMask > ushort.MaxValue)
                    {
                        // There was a standalone high surrogate at the end of the vector.
                        // We'll adjust our counters so that we don't consider this char consumed.
                        // It will be read (and counted) again, so its two extra UTF-8 bytes
                        // are removed from this vector's count.

                        highSurrogatesMask = (ushort)highSurrogatesMask; // don't allow stray high surrogate to be consumed by popcnt
                        utf8AdjustmentForThisVector -= 2;
                        pInputBuffer--;
                        inputLength++;
                    }

                    int surrogatePairsCount = BitOperations.PopCount(highSurrogatesMask);

                    // 2 UTF-16 chars become 1 Unicode scalar

                    tempScalarCountAdjustment -= surrogatePairsCount;

                    // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                    // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only
                    // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total),
                    // so we'll adjust this now.

                    nint surrogatePairsCountNint = (nint)(nuint)(uint)surrogatePairsCount; // zero-extend to native int size
                    utf8AdjustmentForThisVector -= surrogatePairsCountNint;
                    utf8AdjustmentForThisVector -= surrogatePairsCountNint;
                }

                tempUtf8CodeUnitCountAdjustment += utf8AdjustmentForThisVector;

                pInputBuffer += Vector128<ushort>.Count;
                inputLength -= Vector128<ushort>.Count;
            } while (inputLength >= Vector128<ushort>.Count);
        }
    }
    else if (Vector.IsHardwareAccelerated)
    {
        if (inputLength >= Vector<ushort>.Count)
        {
            Vector<ushort> vector0080 = new Vector<ushort>(0x0080);
            Vector<ushort> vector0400 = new Vector<ushort>(0x0400);
            Vector<ushort> vector0800 = new Vector<ushort>(0x0800);
            Vector<ushort> vectorD800 = new Vector<ushort>(0xD800);

            do
            {
                // The 'twoOrMoreUtf8Bytes' and 'threeOrMoreUtf8Bytes' vectors will contain
                // elements whose values are 0xFFFF (-1 as signed word) iff the corresponding
                // UTF-16 code unit was >= 0x0080 and >= 0x0800, respectively. By summing these
                // vectors, each element of the sum will contain one of three values:
                //
                // 0x0000 ( 0) = original char was 0000..007F
                // 0xFFFF (-1) = original char was 0080..07FF
                // 0xFFFE (-2) = original char was 0800..FFFF
                //
                // We'll negate them to produce a value 0..2 for each element, then sum all the
                // elements together to produce the number of *additional* UTF-8 code units
                // required to represent this UTF-16 data. This is similar to the popcnt step
                // performed by the SSE41 code path. This will overcount surrogates, but we'll
                // handle that shortly.

                Vector<ushort> utf16Data = Unsafe.ReadUnaligned<Vector<ushort>>(pInputBuffer);
                Vector<ushort> twoOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0080);
                Vector<ushort> threeOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0800);
                Vector<nuint> sumVector = (Vector<nuint>)(-Vector.Add(twoOrMoreUtf8Bytes, threeOrMoreUtf8Bytes));

                // We'll try summing by a natural word (rather than a 16-bit word) at a time,
                // which should halve the number of operations we must perform.

                nuint popcnt = 0;
                for (int i = 0; i < Vector<nuint>.Count; i++)
                {
                    popcnt += sumVector[i];
                }

                uint popcnt32 = (uint)popcnt;
                if (sizeof(nuint) == sizeof(ulong))
                {
                    popcnt32 += (uint)(popcnt >> 32);
                }

                // As in the SSE41 path, keep the count local until we know this vector has no
                // mismatched surrogates, so a fall-back to the non-vectorized loop doesn't
                // count these chars twice.

                long utf8AdjustmentForThisVector = (ushort)popcnt32;
                utf8AdjustmentForThisVector += popcnt32 >> 16;

                // Now check for surrogates.

                utf16Data -= vectorD800;
                Vector<ushort> surrogateChars = Vector.LessThan(utf16Data, vector0800);
                if (surrogateChars != Vector<ushort>.Zero)
                {
                    // There's at least one surrogate (high or low) UTF-16 code unit in
                    // the vector. We'll build up additional vectors: 'highSurrogateChars'
                    // and 'lowSurrogateChars', where the elements are 0xFFFF iff the original
                    // UTF-16 code unit was a high or low surrogate, respectively.

                    Vector<ushort> highSurrogateChars = Vector.LessThan(utf16Data, vector0400);
                    Vector<ushort> lowSurrogateChars = Vector.AndNot(surrogateChars, highSurrogateChars);

                    // We want to make sure that each high surrogate code unit is followed by
                    // a low surrogate code unit and each low surrogate code unit follows a
                    // high surrogate code unit. Since we don't have an equivalent of pmovmskb
                    // or palignr available to us, we'll do this as a loop. We won't look at
                    // the very last high surrogate char element since we don't yet know if
                    // the next vector read will have a low surrogate char element.

                    // A low surrogate in the first element can't have a preceding high surrogate:
                    // a high surrogate at the end of the previous vector is never consumed there,
                    // so it would be the first element of this vector instead. The loop below
                    // only inspects low surrogates at indices 1 and up, so check index 0 here.

                    if (lowSurrogateChars[0] != 0)
                    {
                        goto NonVectorizedLoop; // error: standalone low surrogate; break out of vectorized logic
                    }

                    ushort surrogatePairsCount = 0;
                    for (int i = 0; i < Vector<ushort>.Count - 1; i++)
                    {
                        surrogatePairsCount -= highSurrogateChars[i]; // subtracting 0xFFFF wraps around to +1
                        if (highSurrogateChars[i] != lowSurrogateChars[i + 1])
                        {
                            goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                        }
                    }

                    if (highSurrogateChars[Vector<ushort>.Count - 1] != 0)
                    {
                        // There was a standalone high surrogate at the end of the vector.
                        // We'll adjust our counters so that we don't consider this char consumed.
                        // Only the UTF-8 count needs undoing here; the scalar count is adjusted
                        // when the complete pair is seen at the start of the next read.

                        pInputBuffer--;
                        inputLength++;
                        utf8AdjustmentForThisVector -= 2;
                    }

                    nint surrogatePairsCountNint = (nint)surrogatePairsCount; // zero-extend to native int size

                    // 2 UTF-16 chars become 1 Unicode scalar

                    tempScalarCountAdjustment -= (int)surrogatePairsCountNint;

                    // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                    // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only
                    // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total),
                    // so we'll adjust this now.

                    utf8AdjustmentForThisVector -= surrogatePairsCountNint;
                    utf8AdjustmentForThisVector -= surrogatePairsCountNint;
                }

                tempUtf8CodeUnitCountAdjustment += utf8AdjustmentForThisVector;

                pInputBuffer += Vector<ushort>.Count;
                inputLength -= Vector<ushort>.Count;
            } while (inputLength >= Vector<ushort>.Count);
        }
    }

NonVectorizedLoop:

    // Vectorization isn't supported on our current platform, or the input was too small to benefit
    // from vectorization, or we saw invalid UTF-16 data in the vectorized code paths and need to
    // drain remaining valid chars before we report failure.

    for (; inputLength > 0; pInputBuffer++, inputLength--)
    {
        uint thisChar = pInputBuffer[0];
        if (thisChar <= 0x7F)
        {
            continue;
        }

        // Bump adjustment by +1 for U+0080..U+07FF; by +2 for U+0800..U+FFFF.
        // This optimistically assumes no surrogates, which we'll handle shortly.

        tempUtf8CodeUnitCountAdjustment += (thisChar + 0x0001_F800u) >> 16;

        if (!IsSurrogateCodePoint(thisChar))
        {
            continue;
        }

        // Found a surrogate char. Back out the adjustment we made above, then
        // try to consume the entire surrogate pair all at once. We won't bother
        // trying to interpret the surrogate pair as a scalar value; we'll only
        // validate that its bit pattern matches what's expected for a surrogate pair.

        tempUtf8CodeUnitCountAdjustment -= 2;

        if (inputLength == 1)
        {
            goto Error; // input buffer too small to read a surrogate pair
        }

        thisChar = Unsafe.ReadUnaligned<uint>(pInputBuffer);
        if (((thisChar - (BitConverter.IsLittleEndian ? 0xDC00_D800u : 0xD800_DC00u)) & 0xFC00_FC00u) != 0)
        {
            goto Error; // not a well-formed surrogate pair
        }

        tempScalarCountAdjustment--; // 2 UTF-16 code units -> 1 scalar
        tempUtf8CodeUnitCountAdjustment += 2; // 2 UTF-16 code units -> 4 UTF-8 code units

        pInputBuffer++; // consumed one extra char
        inputLength--;
    }

Error:

    // Also used for normal return.

    utf8CodeUnitCountAdjustment = tempUtf8CodeUnitCountAdjustment;
    scalarCountAdjustment = tempScalarCountAdjustment;
    return pInputBuffer;
}
/// <summary>
/// Converts <paramref name="input"/> to four bytes per pixel (R, G, B, A byte order in memory)
/// in a pooled buffer and hands that buffer to <c>WriteBuffer</c>.
/// </summary>
/// <param name="rm">Resource manager providing the buffer pool and used by <c>WriteBuffer</c>.</param>
/// <param name="input">Source surface.</param>
/// <param name="config">Output configuration; supplies the block kind and block height.</param>
/// <param name="offsets">Plane offsets; the luma offset is the write destination.</param>
private unsafe static void WriteA8B8G8R8(ResourceManager rm, Surface input, ref OutputSurfaceConfig config, ref PlaneOffsets offsets)
{
    int width = input.Width;
    int height = input.Height;
    int stride = GetPitch(width, 4);

    int rentedIndex = rm.BufferPool.Rent(height * stride, out Span<byte> dst);

    if (!Sse2.IsSupported)
    {
        // Scalar path: one channel at a time through the surface accessors.
        for (int row = 0; row < height; row++)
        {
            int rowStart = row * stride;

            for (int col = 0; col < width; col++)
            {
                int pixelStart = rowStart + col * 4;

                dst[pixelStart + 0] = Downsample(input.GetR(col, row));
                dst[pixelStart + 1] = Downsample(input.GetG(col, row));
                dst[pixelStart + 2] = Downsample(input.GetB(col, row));
                dst[pixelStart + 3] = Downsample(input.GetA(col, row));
            }
        }
    }
    else
    {
        // Pixels that can be handled eight at a time, and the padding bytes ending each row.
        int vectorizedWidth = width & ~7;
        int rowPadding = stride - width * 4;

        fixed (Pixel* srcBase = input.Data)
        fixed (byte* dstBase = dst)
        {
            Pixel* srcRow = srcBase;
            byte* dstCursor = dstBase;

            for (int row = 0; row < height; row++, srcRow += input.Width)
            {
                int col = 0;

                while (col < vectorizedWidth)
                {
                    // Each 128-bit load covers two source pixels (eight 16-bit channels).
                    Pixel* src = srcRow + (uint)col;

                    Vector128<ushort> pair0 = Sse2.LoadVector128((ushort*)src);
                    Vector128<ushort> pair1 = Sse2.LoadVector128((ushort*)(src + 2));
                    Vector128<ushort> pair2 = Sse2.LoadVector128((ushort*)(src + 4));
                    Vector128<ushort> pair3 = Sse2.LoadVector128((ushort*)(src + 6));

                    // Drop the two lowest bits of every channel, then narrow to bytes.
                    pair0 = Sse2.ShiftRightLogical(pair0, 2);
                    pair1 = Sse2.ShiftRightLogical(pair1, 2);
                    pair2 = Sse2.ShiftRightLogical(pair2, 2);
                    pair3 = Sse2.ShiftRightLogical(pair3, 2);

                    Vector128<byte> firstFour = Sse2.PackUnsignedSaturate(pair0.AsInt16(), pair1.AsInt16());
                    Vector128<byte> lastFour = Sse2.PackUnsignedSaturate(pair2.AsInt16(), pair3.AsInt16());

                    Sse2.Store(dstCursor, firstFour);
                    Sse2.Store(dstCursor + 0x10, lastFour);

                    dstCursor += 0x20;
                    col += 8;
                }

                // Remaining pixels of the row.
                while (col < width)
                {
                    Pixel* px = srcRow + (uint)col;

                    dstCursor[0] = Downsample(px->R);
                    dstCursor[1] = Downsample(px->G);
                    dstCursor[2] = Downsample(px->B);
                    dstCursor[3] = Downsample(px->A);

                    dstCursor += 4;
                    col++;
                }

                dstCursor += rowPadding;
            }
        }
    }

    bool isLinear = config.OutBlkKind == 0;
    int gobBlocksInY = 1 << config.OutBlkHeight;

    WriteBuffer(rm, dst, offsets.LumaOffset, isLinear, width, height, 4, gobBlocksInY);

    rm.BufferPool.Return(rentedIndex);
}
/// <summary>
/// Writes <paramref name="input"/> as two planes: a one-byte-per-pixel plane taken from the R
/// channel, and a half-resolution plane of interleaved byte pairs taken from the G and B channels
/// of every second pixel of every second row.
/// </summary>
/// <remarks>
/// Each plane is converted into a buffer rented from <c>rm.BufferPool</c>, passed to
/// <c>WriteBuffer</c> and then returned to the pool. Channels are narrowed to 8 bits by
/// <c>Downsample</c> (scalar code) or a right shift by 2 (vector code).
/// NOTE(review): the fixed byte offsets in the vector loops assume Pixel is four 16-bit channels
/// in R, G, B, A order (8 bytes) - confirm against the Pixel declaration.
/// </remarks>
/// <param name="rm">Resource manager providing the buffer pool and used by <c>WriteBuffer</c>.</param>
/// <param name="input">Source surface.</param>
/// <param name="config">Output configuration (plane dimensions minus one, block kind, block height).</param>
/// <param name="offsets">Plane offsets; <c>LumaOffset</c> and <c>ChromaUOffset</c> are the write destinations.</param>
private unsafe static void WriteNv12(ResourceManager rm, Surface input, ref OutputSurfaceConfig config, ref PlaneOffsets offsets)
{
    int gobBlocksInY = 1 << config.OutBlkHeight;

    bool outLinear = config.OutBlkKind == 0;

    int width = Math.Min(config.OutLumaWidth + 1, input.Width);
    int height = Math.Min(config.OutLumaHeight + 1, input.Height);
    int yStride = GetPitch(config.OutLumaWidth + 1, 1);

    int dstYIndex = rm.BufferPool.Rent((config.OutLumaHeight + 1) * yStride, out Span<byte> dstY);

    if (Sse41.IsSupported)
    {
        // Keeps only the lowest 16 bits of every 64-bit lane, i.e. the first channel of each pixel.
        Vector128<ushort> mask = Vector128.Create(0xffffUL).AsUInt16();

        int widthTrunc = width & ~0xf;
        int strideGap = yStride - width;

        fixed (Pixel* srcPtr = input.Data)
        {
            Pixel* ip = srcPtr;

            fixed (byte* dstPtr = dstY)
            {
                byte* op = dstPtr;

                for (int y = 0; y < height; y++, ip += input.Width)
                {
                    int x = 0;

                    // 16 pixels per iteration, two pixels per 128-bit load.
                    for (; x < widthTrunc; x += 16)
                    {
                        byte* baseOffset = (byte*)(ip + (ulong)(uint)x);

                        Vector128<ushort> pixelp1 = Sse2.LoadVector128((ushort*)baseOffset);
                        Vector128<ushort> pixelp2 = Sse2.LoadVector128((ushort*)(baseOffset + 0x10));
                        Vector128<ushort> pixelp3 = Sse2.LoadVector128((ushort*)(baseOffset + 0x20));
                        Vector128<ushort> pixelp4 = Sse2.LoadVector128((ushort*)(baseOffset + 0x30));
                        Vector128<ushort> pixelp5 = Sse2.LoadVector128((ushort*)(baseOffset + 0x40));
                        Vector128<ushort> pixelp6 = Sse2.LoadVector128((ushort*)(baseOffset + 0x50));
                        Vector128<ushort> pixelp7 = Sse2.LoadVector128((ushort*)(baseOffset + 0x60));
                        Vector128<ushort> pixelp8 = Sse2.LoadVector128((ushort*)(baseOffset + 0x70));

                        pixelp1 = Sse2.And(pixelp1, mask);
                        pixelp2 = Sse2.And(pixelp2, mask);
                        pixelp3 = Sse2.And(pixelp3, mask);
                        pixelp4 = Sse2.And(pixelp4, mask);
                        pixelp5 = Sse2.And(pixelp5, mask);
                        pixelp6 = Sse2.And(pixelp6, mask);
                        pixelp7 = Sse2.And(pixelp7, mask);
                        pixelp8 = Sse2.And(pixelp8, mask);

                        // Two rounds of 32 -> 16 bit packing squeeze out the zeroed channels,
                        // leaving eight consecutive 16-bit samples per vector.
                        Vector128<ushort> pixelq1 = Sse41.PackUnsignedSaturate(pixelp1.AsInt32(), pixelp2.AsInt32());
                        Vector128<ushort> pixelq2 = Sse41.PackUnsignedSaturate(pixelp3.AsInt32(), pixelp4.AsInt32());
                        Vector128<ushort> pixelq3 = Sse41.PackUnsignedSaturate(pixelp5.AsInt32(), pixelp6.AsInt32());
                        Vector128<ushort> pixelq4 = Sse41.PackUnsignedSaturate(pixelp7.AsInt32(), pixelp8.AsInt32());

                        pixelq1 = Sse41.PackUnsignedSaturate(pixelq1.AsInt32(), pixelq2.AsInt32());
                        pixelq2 = Sse41.PackUnsignedSaturate(pixelq3.AsInt32(), pixelq4.AsInt32());

                        pixelq1 = Sse2.ShiftRightLogical(pixelq1, 2);
                        pixelq2 = Sse2.ShiftRightLogical(pixelq2, 2);

                        Vector128<byte> pixel = Sse2.PackUnsignedSaturate(pixelq1.AsInt16(), pixelq2.AsInt16());

                        Sse2.Store(op, pixel);

                        op += 0x10;
                    }

                    for (; x < width; x++)
                    {
                        Pixel* px = ip + (uint)x;

                        *op++ = Downsample(px->R);
                    }

                    op += strideGap;
                }
            }
        }
    }
    else
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                dstY[y * yStride + x] = Downsample(input.GetR(x, y));
            }
        }
    }

    WriteBuffer(
        rm,
        dstY,
        offsets.LumaOffset,
        outLinear,
        config.OutLumaWidth + 1,
        config.OutLumaHeight + 1,
        1,
        gobBlocksInY);

    rm.BufferPool.Return(dstYIndex);

    int uvWidth = Math.Min(config.OutChromaWidth + 1, (width + 1) >> 1);
    int uvHeight = Math.Min(config.OutChromaHeight + 1, (height + 1) >> 1);
    int uvStride = GetPitch(config.OutChromaWidth + 1, 2);

    int dstUvIndex = rm.BufferPool.Rent((config.OutChromaHeight + 1) * uvStride, out Span<byte> dstUv);

    if (Sse2.IsSupported)
    {
        int widthTrunc = uvWidth & ~7;
        int strideGap = uvStride - uvWidth * 2;

        fixed (Pixel* srcPtr = input.Data)
        {
            Pixel* ip = srcPtr;

            fixed (byte* dstPtr = dstUv)
            {
                byte* op = dstPtr;

                // Every output row samples every second source row.
                for (int y = 0; y < uvHeight; y++, ip += input.Width * 2)
                {
                    int x = 0;

                    // 8 output samples per iteration, taken from every second source pixel.
                    for (; x < widthTrunc; x += 8)
                    {
                        byte* baseOffset = (byte*)ip + (ulong)(uint)x * 16;

                        // Offset 2 skips the first channel; each 32-bit load picks up the
                        // second and third channels (G and B) of one pixel.
                        Vector128<uint> pixel1 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x02));
                        Vector128<uint> pixel2 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x12));
                        Vector128<uint> pixel3 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x22));
                        Vector128<uint> pixel4 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x32));
                        Vector128<uint> pixel5 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x42));
                        Vector128<uint> pixel6 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x52));
                        Vector128<uint> pixel7 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x62));
                        Vector128<uint> pixel8 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x72));

                        Vector128<uint> pixel12 = Sse2.UnpackLow(pixel1, pixel2);
                        Vector128<uint> pixel34 = Sse2.UnpackLow(pixel3, pixel4);
                        Vector128<uint> pixel56 = Sse2.UnpackLow(pixel5, pixel6);
                        Vector128<uint> pixel78 = Sse2.UnpackLow(pixel7, pixel8);

                        Vector128<ulong> pixel1234 = Sse2.UnpackLow(pixel12.AsUInt64(), pixel34.AsUInt64());
                        Vector128<ulong> pixel5678 = Sse2.UnpackLow(pixel56.AsUInt64(), pixel78.AsUInt64());

                        // The shift has to be applied per 16-bit channel. Shifting whole 64-bit
                        // lanes would move the two low bits of each channel into the top bits of
                        // the channel below it, and the saturating pack would then turn that
                        // channel into 0 or 255.
                        Vector128<ushort> channels1234 = Sse2.ShiftRightLogical(pixel1234.AsUInt16(), 2);
                        Vector128<ushort> channels5678 = Sse2.ShiftRightLogical(pixel5678.AsUInt16(), 2);

                        Vector128<byte> pixel = Sse2.PackUnsignedSaturate(channels1234.AsInt16(), channels5678.AsInt16());

                        Sse2.Store(op, pixel);

                        op += 0x10;
                    }

                    for (; x < uvWidth; x++)
                    {
                        Pixel* px = ip + (uint)(x << 1);

                        *op++ = Downsample(px->G);
                        *op++ = Downsample(px->B);
                    }

                    op += strideGap;
                }
            }
        }
    }
    else
    {
        for (int y = 0; y < uvHeight; y++)
        {
            for (int x = 0; x < uvWidth; x++)
            {
                int xx = x << 1;
                int yy = y << 1;

                int uvOffs = y * uvStride + xx;

                dstUv[uvOffs + 0] = Downsample(input.GetG(xx, yy));
                dstUv[uvOffs + 1] = Downsample(input.GetB(xx, yy));
            }
        }
    }

    WriteBuffer(
        rm,
        dstUv,
        offsets.ChromaUOffset,
        outLinear,
        config.OutChromaWidth + 1,
        config.OutChromaHeight + 1,
        2,
        gobBlocksInY);

    rm.BufferPool.Return(dstUvIndex);
}
/// <summary>
/// Packs the eight 16-bit lanes of <paramref name="low"/> followed by the eight 16-bit lanes of
/// <paramref name="high"/> into a single vector of sixteen bytes using <c>Sse2.PackUnsignedSaturate</c>.
/// </summary>
/// <remarks>
/// The lanes are reinterpreted as signed 16-bit values before packing, so the saturation applied
/// is the one of the underlying SSE2 instruction (signed word to unsigned byte).
/// </remarks>
/// <param name="low">Source of the lower eight result bytes.</param>
/// <param name="high">Source of the upper eight result bytes.</param>
/// <returns>The packed byte vector.</returns>
public static Vector128<byte> Narrow(Vector128<ushort> low, Vector128<ushort> high)
{
    Vector128<short> lowerWords = low.AsInt16();
    Vector128<short> upperWords = high.AsInt16();

    return Sse2.PackUnsignedSaturate(lowerWords, upperWords);
}
// Returns &inputBuffer[inputLength] if the input buffer is valid.
/// <summary>
/// Given an input buffer <paramref name="pInputBuffer"/> of char length <paramref name="inputLength"/>,
/// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
/// </summary>
/// <param name="pInputBuffer">Pointer to the first UTF-16 code unit to validate; may be null only when <paramref name="inputLength"/> is zero.</param>
/// <param name="inputLength">Number of UTF-16 code units (chars) available at <paramref name="pInputBuffer"/>; must not be negative.</param>
/// <param name="utf8CodeUnitCountAdjustment">
/// Receives the value which, added to the number of UTF-16 code units seen before the returned pointer,
/// gives the number of UTF-8 code units required to encode that data.
/// </param>
/// <param name="scalarCountAdjustment">
/// Receives the value which, added to the number of UTF-16 code units seen before the returned pointer,
/// gives the number of scalars present in that data.
/// </param>
/// <returns>
/// A pointer to the first char that is not part of well-formed UTF-16 data, or a pointer to the end of the buffer.
/// </returns>
/// <remarks>
/// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
/// </remarks>
public static char *GetPointerToFirstInvalidChar(char *pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment)
{
    Debug.Assert(inputLength >= 0, "Input length must not be negative.");
    Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");

    // First, we'll handle the common case of all-ASCII. If this is able to
    // consume the entire buffer, we'll skip the remainder of this method's logic.

    int numAsciiCharsConsumedJustNow = (int)ASCIIUtility.GetIndexOfFirstNonAsciiChar(pInputBuffer, (uint)inputLength);
    Debug.Assert(0 <= numAsciiCharsConsumedJustNow && numAsciiCharsConsumedJustNow <= inputLength);

    pInputBuffer += (uint)numAsciiCharsConsumedJustNow;
    inputLength -= numAsciiCharsConsumedJustNow;

    if (inputLength == 0)
    {
        // Everything was ASCII: one UTF-8 code unit and one scalar per char, so no adjustment is needed.
        utf8CodeUnitCountAdjustment = 0;
        scalarCountAdjustment = 0;
        return(pInputBuffer);
    }

    // If we got here, it means we saw some non-ASCII data, so within our
    // vectorized code paths below we'll handle all non-surrogate UTF-16
    // code points branchlessly. We'll only branch if we see surrogates.
    //
    // We still optimistically assume the data is mostly ASCII. This means that the
    // number of UTF-8 code units and the number of scalars almost matches the number
    // of UTF-16 code units. As we go through the input and find non-ASCII
    // characters, we'll keep track of these "adjustment" fixups. To get the
    // total number of UTF-8 code units required to encode the input data, add
    // the UTF-8 code unit count adjustment to the number of UTF-16 code units
    // seen. To get the total number of scalars present in the input data,
    // add the scalar count adjustment to the number of UTF-16 code units seen.

    long tempUtf8CodeUnitCountAdjustment = 0;
    int tempScalarCountAdjustment = 0;

    if ((AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || Sse2.IsSupported)
    {
        if (inputLength >= Vector128 <ushort> .Count)
        {
            Vector128 <ushort> vector0080 = Vector128.Create((ushort)0x80);
            Vector128 <ushort> vectorA800 = Vector128.Create((ushort)0xA800);
            Vector128 <short> vector8800 = Vector128.Create(unchecked ((short)0x8800));
            Vector128 <ushort> vectorZero = Vector128 <ushort> .Zero;

            do
            {
                Vector128 <ushort> utf16Data;
                if (AdvSimd.Arm64.IsSupported)
                {
                    utf16Data = AdvSimd.LoadVector128((ushort *)pInputBuffer); // unaligned
                }
                else
                {
                    utf16Data = Sse2.LoadVector128((ushort *)pInputBuffer); // unaligned
                }

                Vector128 <ushort> charIsNonAscii;

                if (AdvSimd.Arm64.IsSupported)
                {
                    // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                    // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)
                    charIsNonAscii = AdvSimd.Min(utf16Data, vector0080);
                }
                else if (Sse41.IsSupported)
                {
                    // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                    // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)
                    charIsNonAscii = Sse41.Min(utf16Data, vector0080);
                }
                else
                {
                    // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                    // input was 0x0080 <= [value] <= 0x7FFF. The case where 0x8000 <= [value] will
                    // be handled in a few lines.
                    charIsNonAscii = Sse2.AndNot(Sse2.CompareGreaterThan(vector0080.AsInt16(), utf16Data.AsInt16()).AsUInt16(), vector0080);
                }

#if DEBUG
                // Quick check to ensure we didn't accidentally set the 0x8000 bit of any element.
                uint debugMask;
                if (AdvSimd.Arm64.IsSupported)
                {
                    debugMask = GetNonAsciiBytes(charIsNonAscii.AsByte());
                }
                else
                {
                    debugMask = (uint)Sse2.MoveMask(charIsNonAscii.AsByte());
                }
                Debug.Assert((debugMask & 0b_1010_1010_1010_1010) == 0, "Shouldn't have set the 0x8000 bit of any element in 'charIsNonAscii'.");
#endif // DEBUG

                // Sets the 0x8080 bits of each element in 'charIsNonAscii' if the corresponding
                // input was 0x0800 <= [value]. This also handles the missing range a few lines above.

                Vector128 <ushort> charIsThreeByteUtf8Encoded;
                uint mask;

                // NOTE(review): this branch tests AdvSimd.IsSupported while every other branch in this
                // loop tests AdvSimd.Arm64.IsSupported. Given the enclosing condition the two appear to
                // select the same path, but the inconsistency is worth confirming.
                if (AdvSimd.IsSupported)
                {
                    charIsThreeByteUtf8Encoded = AdvSimd.Subtract(vectorZero, AdvSimd.ShiftRightLogical(utf16Data, 11));
                    mask = GetNonAsciiBytes(AdvSimd.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
                }
                else
                {
                    charIsThreeByteUtf8Encoded = Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 11));
                    mask = (uint)Sse2.MoveMask(Sse2.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
                }

                // Each even bit of mask will be 1 only if the char was >= 0x0080,
                // and each odd bit of mask will be 1 only if the char was >= 0x0800.
                //
                // Example for UTF-16 input "[ 0123 ] [ 1234 ] ...":
                //
                //            ,-- set if char[1] is >= 0x0800
                //            |   ,-- set if char[0] is >= 0x0800
                //            v   v
                // mask = ... 1 1 0 1
                //              ^   ^-- set if char[0] is non-ASCII
                //              `-- set if char[1] is non-ASCII
                //
                // This means we can popcnt the number of set bits, and the result is the
                // number of *additional* UTF-8 bytes that each UTF-16 code unit requires as
                // it expands. This results in the wrong count for UTF-16 surrogate code
                // units (we just counted that each individual code unit expands to 3 bytes,
                // but in reality a well-formed UTF-16 surrogate pair expands to 4 bytes).
                // We'll handle this in just a moment.
                //
                // For now, compute the popcnt but squirrel it away. We'll fold it in to the
                // cumulative UTF-8 adjustment factor once we determine that there are no
                // unpaired surrogates in our data. (Unpaired surrogates would invalidate
                // our computed result and we'd have to throw it away.)

                uint popcnt = (uint)BitOperations.PopCount(mask);

                // Surrogates need to be special-cased for two reasons: (a) we need
                // to account for the fact that we over-counted in the addition above;
                // and (b) they require separate validation.
                if (AdvSimd.Arm64.IsSupported)
                {
                    utf16Data = AdvSimd.Add(utf16Data, vectorA800);
                    mask = GetNonAsciiBytes(AdvSimd.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
                }
                else
                {
                    utf16Data = Sse2.Add(utf16Data, vectorA800);
                    mask = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
                }

                if (mask != 0)
                {
                    // There's at least one UTF-16 surrogate code unit present.
                    // Since we performed a pmovmskb operation on the result of a 16-bit pcmpgtw,
                    // the resulting bits of 'mask' will occur in pairs:
                    // - 00 if the corresponding UTF-16 char was not a surrogate code unit;
                    // - 11 if the corresponding UTF-16 char was a surrogate code unit.
                    //
                    // A UTF-16 high/low surrogate code unit has the bit pattern [ 11011q## ######## ],
                    // where # is any bit; q = 0 represents a high surrogate, and q = 1 represents
                    // a low surrogate. Since we added 0xA800 in the vectorized operation above,
                    // our surrogate pairs will now have the bit pattern [ 10000q## ######## ].
                    // If we logical right-shift each word by 3, we'll end up with the bit pattern
                    // [ 00010000 q####### ], which means that we can immediately use pmovmskb to
                    // determine whether a given char was a high or a low surrogate.
                    //
                    // Therefore the resulting bits of 'mask2' will occur in pairs:
                    // - 00 if the corresponding UTF-16 char was a high surrogate code unit;
                    // - 01 if the corresponding UTF-16 char was a low surrogate code unit;
                    // - ## (garbage) if the corresponding UTF-16 char was not a surrogate code unit.
                    //   Since 'mask' already has 00 in these positions (since the corresponding char
                    //   wasn't a surrogate), "mask AND mask2 == 00" holds for these positions.

                    uint mask2;
                    if (AdvSimd.Arm64.IsSupported)
                    {
                        mask2 = GetNonAsciiBytes(AdvSimd.ShiftRightLogical(utf16Data, 3).AsByte());
                    }
                    else
                    {
                        mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte());
                    }

                    // 'lowSurrogatesMask' has its bits occur in pairs:
                    // - 01 if the corresponding char was a low surrogate char,
                    // - 00 if the corresponding char was a high surrogate char or not a surrogate at all.

                    uint lowSurrogatesMask = mask2 & mask;

                    // 'highSurrogatesMask' has its bits occur in pairs:
                    // - 01 if the corresponding char was a high surrogate char,
                    // - 00 if the corresponding char was a low surrogate char or not a surrogate at all.

                    uint highSurrogatesMask = (mask2 ^ 0b_0101_0101_0101_0101u /* flip all even-numbered bits 00 <-> 01 */) & mask;

                    Debug.Assert((highSurrogatesMask & lowSurrogatesMask) == 0, "A char cannot simultaneously be both a high and a low surrogate char.");

                    Debug.Assert(((highSurrogatesMask | lowSurrogatesMask) & 0b_1010_1010_1010_1010u) == 0, "Only even bits (no odd bits) of the masks should be set.");

                    // Now check that each high surrogate is followed by a low surrogate and that each
                    // low surrogate follows a high surrogate. We make an exception for the case where
                    // the final char of the vector is a high surrogate, since we can't perform validation
                    // on it until the next iteration of the loop when we hope to consume the matching
                    // low surrogate.

                    // Shifting by 2 bits (one char position) lines each high surrogate up with the char after it.
                    highSurrogatesMask <<= 2;
                    if ((ushort)highSurrogatesMask != lowSurrogatesMask)
                    {
                        goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                    }

                    if (highSurrogatesMask > ushort.MaxValue)
                    {
                        // There was a standalone high surrogate at the end of the vector.
                        // We'll adjust our counters so that we don't consider this char consumed.

                        highSurrogatesMask = (ushort)highSurrogatesMask; // don't allow stray high surrogate to be consumed by popcnt
                        popcnt -= 2; // the '0xC000_0000' bits in the original mask are shifted out and discarded, so account for that here
                        pInputBuffer--;
                        inputLength++;
                    }

                    // If we're 64-bit, we can perform the zero-extension of the surrogate pairs count for
                    // free right now, saving the extension step a few lines below. If we're 32-bit, the
                    // conversion to nuint immediately below is a no-op, and we'll pay the cost of the real
                    // 64 -bit extension a few lines below.
                    nuint surrogatePairsCountNuint = (uint)BitOperations.PopCount(highSurrogatesMask);

                    // 2 UTF-16 chars become 1 Unicode scalar

                    tempScalarCountAdjustment -= (int)surrogatePairsCountNuint;

                    // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                    // it'd be encoded as 3 UTF-8 code units, so our earlier popcnt computation
                    // assumes that the pair is encoded as 6 UTF-8 code units. Since each
                    // pair is in reality only encoded as 4 UTF-8 code units, we need to
                    // perform this adjustment now.

                    if (IntPtr.Size == 8)
                    {
                        // Since we've already zero-extended surrogatePairsCountNuint, we can directly
                        // sub + sub. It's more efficient than shl + sub.
                        tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
                        tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
                    }
                    else
                    {
                        // Take the hit of the 64-bit extension now.
                        tempUtf8CodeUnitCountAdjustment -= 2 * (uint)surrogatePairsCountNuint;
                    }
                }

                // No unpaired surrogates were found in this vector, so the popcnt computed earlier can be folded in.
                tempUtf8CodeUnitCountAdjustment += popcnt;
                pInputBuffer += Vector128 <ushort> .Count;
                inputLength -= Vector128 <ushort> .Count;
            } while (inputLength >= Vector128 <ushort> .Count);
        }
    }
    else if (Vector.IsHardwareAccelerated)
    {
        if (inputLength >= Vector <ushort> .Count)
        {
            Vector <ushort> vector0080 = new Vector <ushort>(0x0080);
            Vector <ushort> vector0400 = new Vector <ushort>(0x0400);
            Vector <ushort> vector0800 = new Vector <ushort>(0x0800);
            Vector <ushort> vectorD800 = new Vector <ushort>(0xD800);

            do
            {
                // The 'twoOrMoreUtf8Bytes' and 'threeOrMoreUtf8Bytes' vectors will contain
                // elements whose values are 0xFFFF (-1 as signed word) iff the corresponding
                // UTF-16 code unit was >= 0x0080 and >= 0x0800, respectively. By summing these
                // vectors, each element of the sum will contain one of three values:
                //
                // 0x0000 ( 0) = original char was 0000..007F
                // 0xFFFF (-1) = original char was 0080..07FF
                // 0xFFFE (-2) = original char was 0800..FFFF
                //
                // We'll negate them to produce a value 0..2 for each element, then sum all the
                // elements together to produce the number of *additional* UTF-8 code units
                // required to represent this UTF-16 data. This is similar to the popcnt step
                // performed by the SSE2 code path. This will overcount surrogates, but we'll
                // handle that shortly.

                Vector <ushort> utf16Data = Unsafe.ReadUnaligned <Vector <ushort> >(pInputBuffer);
                Vector <ushort> twoOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0080);
                Vector <ushort> threeOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0800);
                Vector <nuint_t> sumVector = (Vector <nuint_t>)(Vector <ushort> .Zero - twoOrMoreUtf8Bytes - threeOrMoreUtf8Bytes);

                // We'll try summing by a natural word (rather than a 16-bit word) at a time,
                // which should halve the number of operations we must perform.

                nuint popcnt = 0;
                for (int i = 0; i < Vector <nuint_t> .Count; i++)
                {
                    popcnt += (nuint)sumVector[i];
                }

                // Fold the native-word sum down: first 64 -> 32 bits (on 64-bit), then 32 -> 16 bits below.
                uint popcnt32 = (uint)popcnt;
                if (IntPtr.Size == 8)
                {
                    popcnt32 += (uint)(popcnt >> 32);
                }

                // As in the SSE4.1 paths, compute popcnt but don't fold it in until we
                // know there aren't any unpaired surrogates in the input data.

                popcnt32 = (ushort)popcnt32 + (popcnt32 >> 16);

                // Now check for surrogates.

                utf16Data -= vectorD800;
                Vector <ushort> surrogateChars = Vector.LessThan(utf16Data, vector0800);
                if (surrogateChars != Vector <ushort> .Zero)
                {
                    // There's at least one surrogate (high or low) UTF-16 code unit in
                    // the vector. We'll build up additional vectors: 'highSurrogateChars'
                    // and 'lowSurrogateChars', where the elements are 0xFFFF iff the original
                    // UTF-16 code unit was a high or low surrogate, respectively.

                    Vector <ushort> highSurrogateChars = Vector.LessThan(utf16Data, vector0400);
                    Vector <ushort> lowSurrogateChars = Vector.AndNot(surrogateChars, highSurrogateChars);

                    // We want to make sure that each high surrogate code unit is followed by
                    // a low surrogate code unit and each low surrogate code unit follows a
                    // high surrogate code unit. Since we don't have an equivalent of pmovmskb
                    // or palignr available to us, we'll do this as a loop. We won't look at
                    // the very last high surrogate char element since we don't yet know if
                    // the next vector read will have a low surrogate char element.

                    if (lowSurrogateChars[0] != 0)
                    {
                        goto Error; // error: start of buffer contains standalone low surrogate char
                    }

                    ushort surrogatePairsCount = 0;
                    for (int i = 0; i < Vector <ushort> .Count - 1; i++)
                    {
                        surrogatePairsCount -= highSurrogateChars[i]; // turns into +1 or +0
                        if (highSurrogateChars[i] != lowSurrogateChars[i + 1])
                        {
                            goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                        }
                    }

                    if (highSurrogateChars[Vector <ushort> .Count - 1] != 0)
                    {
                        // There was a standalone high surrogate at the end of the vector.
                        // We'll adjust our counters so that we don't consider this char consumed.

                        pInputBuffer--;
                        inputLength++;
                        popcnt32 -= 2;
                    }

                    nint surrogatePairsCountNint = (nint)surrogatePairsCount; // zero-extend to native int size

                    // 2 UTF-16 chars become 1 Unicode scalar

                    tempScalarCountAdjustment -= (int)surrogatePairsCountNint;

                    // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                    // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only
                    // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total),
                    // so we'll adjust this now.

                    tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                    tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                }

                tempUtf8CodeUnitCountAdjustment += popcnt32;
                pInputBuffer += Vector <ushort> .Count;
                inputLength -= Vector <ushort> .Count;
            } while (inputLength >= Vector <ushort> .Count);
        }
    }

NonVectorizedLoop:

    // Vectorization isn't supported on our current platform, or the input was too small to benefit
    // from vectorization, or we saw invalid UTF-16 data in the vectorized code paths and need to
    // drain remaining valid chars before we report failure.

    for (; inputLength > 0; pInputBuffer++, inputLength--)
    {
        uint thisChar = pInputBuffer[0];
        if (thisChar <= 0x7F)
        {
            continue;
        }

        // Bump adjustment by +1 for U+0080..U+07FF; by +2 for U+0800..U+FFFF.
        // This optimistically assumes no surrogates, which we'll handle shortly.

        tempUtf8CodeUnitCountAdjustment += (thisChar + 0x0001_F800u) >> 16;

        if (!UnicodeUtility.IsSurrogateCodePoint(thisChar))
        {
            continue;
        }

        // Found a surrogate char. Back out the adjustment we made above, then
        // try to consume the entire surrogate pair all at once. We won't bother
        // trying to interpret the surrogate pair as a scalar value; we'll only
        // validate that its bit pattern matches what's expected for a surrogate pair.

        tempUtf8CodeUnitCountAdjustment -= 2;

        if (inputLength == 1)
        {
            goto Error; // input buffer too small to read a surrogate pair
        }

        // Read both chars of the candidate pair as one 32-bit value; the expected constant depends on endianness.
        thisChar = Unsafe.ReadUnaligned <uint>(pInputBuffer);
        if (((thisChar - (BitConverter.IsLittleEndian ? 0xDC00_D800u : 0xD800_DC00u)) & 0xFC00_FC00u) != 0)
        {
            goto Error; // not a well-formed surrogate pair
        }

        tempScalarCountAdjustment--; // 2 UTF-16 code units -> 1 scalar
        tempUtf8CodeUnitCountAdjustment += 2; // 2 UTF-16 code units -> 4 UTF-8 code units

        pInputBuffer++; // consumed one extra char
        inputLength--;
    }

Error:

    // Also used for normal return.

    utf8CodeUnitCountAdjustment = tempUtf8CodeUnitCountAdjustment;
    scalarCountAdjustment = tempScalarCountAdjustment;
    return(pInputBuffer);
}
/// <summary>
/// Narrows the four 32-bit lanes of <paramref name="value"/> down to bytes in two saturating steps:
/// first 32-bit to 16-bit (<c>Sse41.PackUnsignedSaturate</c>), then 16-bit to 8-bit
/// (<c>Sse2.PackUnsignedSaturate</c>). <paramref name="zero"/> supplies the upper half of each pack.
/// </summary>
/// <param name="value">The 32-bit lanes to narrow; their results land in the four lowest bytes.</param>
/// <param name="zero">Vector used as the second operand of both pack operations.</param>
/// <returns>The packed byte vector.</returns>
private static Vector128<byte> PackUnsignedSaturate(Vector128<int> value, Vector128<int> zero)
{
    // Stage 1: 32-bit lanes -> 16-bit lanes.
    Vector128<ushort> words = Sse41.PackUnsignedSaturate(value, zero);

    // Stage 2: 16-bit lanes -> 8-bit lanes.
    return Sse2.PackUnsignedSaturate(words.AsInt16(), zero.AsInt16());
}
/// <summary>
/// Scans <paramref name="lengthInChars"/> chars starting at <paramref name="pData"/> and returns the
/// index of the first char that is not allowed by <c>_allowedAsciiCodePoints</c>, or
/// <paramref name="lengthInChars"/> if every char is allowed.
/// </summary>
/// <param name="pData">Pointer to the first char to inspect.</param>
/// <param name="lengthInChars">Number of chars available at <paramref name="pData"/>.</param>
/// <returns>Index of the first char requiring escaping, or <paramref name="lengthInChars"/>.</returns>
private unsafe nuint GetIndexOfFirstCharToEncodeSsse3(char *pData, nuint lengthInChars)
{
    // The central logic is described on GetIndexOfFirstByteToEncodeSsse3. The char variant
    // differs only in that WORDs are packed down to BYTEs first. Signed vs. unsigned saturation
    // makes no difference: an out-of-range (non-ASCII) WORD saturates to 0x00 or 0x7F..0xFF,
    // and the encoder forbids all of those values.

    Debug.Assert(Ssse3.IsSupported);
    Debug.Assert(BitConverter.IsLittleEndian);

    Vector128<byte> zeroVector = Vector128<byte>.Zero;
    Vector128<byte> lowThreeBits = Vector128.Create((byte)0x7);
    Vector128<byte> bitSelectors = Vector128.Create(1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0);
    Vector128<byte> allowedBitmap = _allowedAsciiCodePoints.AsVector;

    nuint index = 0;
    int escapeMask;

    if (lengthInChars >= 16)
    {
        nuint endOfFullBlocks = lengthInChars & unchecked((nuint)(nint)~0xF);

        do
        {
            // 16 chars: two unaligned 128-bit loads packed into one vector of 16 bytes.
            Vector128<byte> sixteen = Sse2.PackUnsignedSaturate(
                Sse2.LoadVector128((short *)(pData + index)),
                Sse2.LoadVector128((short *)(pData + 8 + index)));

            escapeMask = DisallowedMask(sixteen, allowedBitmap, bitSelectors, lowThreeBits, zeroVector);
            if ((escapeMask & 0xFFFF) != 0)
            {
                return FirstFlagged(index, escapeMask);
            }

            index += 16;
        } while (index < endOfFullBlocks);
    }

    if ((lengthInChars & 8) != 0)
    {
        // 8 chars: one unaligned 128-bit load packed into the low 8 bytes.
        Vector128<byte> eight = Sse2.PackUnsignedSaturate(
            Sse2.LoadVector128((short *)(pData + index)),
            zeroVector.AsInt16());

        escapeMask = DisallowedMask(eight, allowedBitmap, bitSelectors, lowThreeBits, zeroVector);
        if ((byte)escapeMask != 0)
        {
            return FirstFlagged(index, escapeMask);
        }

        index += 8;
    }

    if ((lengthInChars & 4) != 0)
    {
        // 4 chars: one 64-bit load packed into the low 4 bytes. Only the low nibble of the
        // mask is meaningful; everything above it is garbage and must be ignored.
        Vector128<byte> four = Sse2.PackUnsignedSaturate(
            Sse2.LoadScalarVector128((ulong *)(pData + index)).AsInt16(),
            zeroVector.AsInt16());

        escapeMask = DisallowedMask(four, allowedBitmap, bitSelectors, lowThreeBits, zeroVector);
        if ((escapeMask & 0xF) != 0)
        {
            return FirstFlagged(index, escapeMask);
        }

        index += 4;
    }

    // At most three chars remain; a scalar loop handles them.
    if ((lengthInChars & 3) != 0)
    {
        Debug.Assert(lengthInChars - index <= 3);

        while (_allowedAsciiCodePoints.IsAllowedAsciiCodePoint(pData[index]) && ++index != lengthInChars)
        {
        }
    }

    return index;

    // Produces a byte mask with a bit set for every lane of 'packed' whose bitmap lookup came back empty.
    static int DisallowedMask(Vector128<byte> packed, Vector128<byte> bitmap, Vector128<byte> selectors, Vector128<byte> threeBitMask, Vector128<byte> zero)
    {
        Vector128<byte> bitmapShuffled = Ssse3.Shuffle(bitmap, packed);
        Vector128<byte> selectorIndexes = Sse2.And(Sse2.ShiftRightLogical(packed.AsUInt32(), 4).AsByte(), threeBitMask);
        Vector128<byte> selectorsShuffled = Ssse3.Shuffle(selectors, selectorIndexes);
        Vector128<byte> hits = Sse2.And(bitmapShuffled, selectorsShuffled);

        return Sse2.MoveMask(Sse2.CompareEqual(hits, zero));
    }

    // The position of the lowest set bit in 'mask' is the offset of the first char to escape.
    static nuint FirstFlagged(nuint blockStart, int mask)
    {
        Debug.Assert(mask != 0);
        return blockStart + (uint)BitOperations.TrailingZeroCount(mask);
    }
}
/// <summary>
/// Updates the two running Adler-32 sums with <paramref name="len"/> bytes taken from the start of
/// <paramref name="buf"/>, processing 32-byte blocks with SSE2/SSSE3 intrinsics and any leftover
/// bytes with a scalar loop.
/// </summary>
/// <param name="sum1">First running sum; read on entry and overwritten with the updated value.</param>
/// <param name="sum2">Second running sum; read on entry and overwritten with the updated value.</param>
/// <param name="buf">Buffer holding the data to process, starting at index 0.</param>
/// <param name="len">Number of bytes of <paramref name="buf"/> to process.</param>
internal static void Step(ref ushort sum1, ref ushort sum2, byte[] buf, uint len)
{
    const uint BytesPerBlock = 1 << 5;

    uint lowSum = sum1;
    uint highSum = sum2;
    int offset = 0;

    // Split the input into whole 32-byte blocks plus a tail of fewer than 32 bytes.
    uint pendingBlocks = len / BytesPerBlock;
    len -= pendingBlocks * BytesPerBlock;

    // Per-byte weights [32..17] and [16..1] used for the second sum, plus helper constants.
    Vector128<byte> weightsFirstHalf = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17).AsByte();
    Vector128<byte> weightsSecondHalf = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1).AsByte();
    Vector128<byte> zeroBytes = Vector128<byte>.Zero;
    Vector128<short> wordOnes = Vector128.Create((short)1);

    while (pendingBlocks != 0)
    {
        // The NMAX constraint: at most NMAX data bytes can be processed before
        // the second sum must be reduced modulo BASE.
        uint batch = Adler32Context.NMAX / BytesPerBlock;
        batch = batch > pendingBlocks ? pendingBlocks : batch;
        pendingBlocks -= batch;

        Vector128<uint> prefixAcc = Vector128.Create(lowSum * batch, 0, 0, 0);
        Vector128<uint> highAcc = Vector128.Create(highSum, 0, 0, 0);
        Vector128<uint> lowAcc = Vector128.Create(0u, 0, 0, 0);

        do
        {
            // Load 32 input bytes as two 16-byte halves.
            Vector128<byte> firstHalf = Load16(buf, offset);
            offset += 16;
            Vector128<byte> secondHalf = Load16(buf, offset);
            offset += 16;

            // Add the byte sum of the previous blocks to the prefix accumulator.
            prefixAcc = Sse2.Add(prefixAcc, lowAcc);

            // Horizontally add the bytes for the first sum; multiply-add them
            // by [ 32, 31, 30, ... ] for the second sum.
            lowAcc = Sse2.Add(lowAcc, Sse2.SumAbsoluteDifferences(firstHalf, zeroBytes).AsUInt32());
            Vector128<short> weightedFirst = System.Runtime.Intrinsics.X86.Ssse3.MultiplyAddAdjacent(firstHalf, weightsFirstHalf.AsSByte());
            highAcc = Sse2.Add(highAcc, Sse2.MultiplyAddAdjacent(weightedFirst, wordOnes).AsUInt32());

            lowAcc = Sse2.Add(lowAcc, Sse2.SumAbsoluteDifferences(secondHalf, zeroBytes).AsUInt32());
            Vector128<short> weightedSecond = System.Runtime.Intrinsics.X86.Ssse3.MultiplyAddAdjacent(secondHalf, weightsSecondHalf.AsSByte());
            highAcc = Sse2.Add(highAcc, Sse2.MultiplyAddAdjacent(weightedSecond, wordOnes).AsUInt32());
        } while (--batch != 0);

        // The prefix accumulator counts whole blocks, so scale it by the block size (<< 5).
        highAcc = Sse2.Add(highAcc, Sse2.ShiftLeftLogical(prefixAcc, 5));

        // Collapse the 32-bit lanes of each accumulator and fold them into the scalar sums.
        lowSum += SumLanes(lowAcc);
        highSum = SumLanes(highAcc);

        // Reduce.
        lowSum %= Adler32Context.ADLER_MODULE;
        highSum %= Adler32Context.ADLER_MODULE;
    }

    // Handle leftover data (fewer than 32 bytes) one byte at a time.
    if (len != 0)
    {
        for (uint left = len; left != 0; left--)
        {
            lowSum += buf[offset++];
            highSum += lowSum;
        }

        if (lowSum >= Adler32Context.ADLER_MODULE)
        {
            lowSum -= Adler32Context.ADLER_MODULE;
        }

        highSum %= Adler32Context.ADLER_MODULE;
    }

    // Return the recombined sums.
    sum1 = (ushort)(lowSum & 0xFFFF);
    sum2 = (ushort)(highSum & 0xFFFF);

    // Reads 16 consecutive bytes starting at 'start' as four 32-bit values.
    static Vector128<byte> Load16(byte[] source, int start)
    {
        return Vector128.Create(
            BitConverter.ToUInt32(source, start),
            BitConverter.ToUInt32(source, start + 4),
            BitConverter.ToUInt32(source, start + 8),
            BitConverter.ToUInt32(source, start + 12)).AsByte();
    }

    // Adds the four 32-bit lanes together (shuffle controls 177 and 78 swap neighbours, then halves).
    static uint SumLanes(Vector128<uint> lanes)
    {
        lanes = Sse2.Add(lanes, Sse2.Shuffle(lanes, 177));
        lanes = Sse2.Add(lanes, Sse2.Shuffle(lanes, 78));
        return (uint)Sse2.ConvertToInt32(lanes.AsInt32());
    }
}