/// <summary>
/// Checks <c>VectorUtilities.ElementwiseSelect</c>: lanes whose selector bits are all set are
/// expected to come from the first value operand, the remaining lanes from the second.
/// </summary>
public static void ElementwiseSelectTest()
{
    // Lanes 0 and 2 have every bit set; lanes 1 and 3 have none.
    var selector = Vector128.Create(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000).AsSingle();
    var whenSet = Vector128.Create(0.0f, 1.0f, 2.0f, 3.0f);
    var whenClear = Vector128.Create(4.0f, 5.0f, 6.0f, 7.0f);
    var expected = Vector128.Create(0.0f, 5.0f, 2.0f, 7.0f);

    Assert.That(
        () => VectorUtilities.ElementwiseSelect(selector, whenSet, whenClear),
        Is.EqualTo(expected)
    );
}
/// <summary>
/// Calls <c>Vector128.GetElement</c> and <c>Vector128.WithElement</c> for <see cref="SByte"/>
/// through reflection and validates the results, or verifies that an
/// <see cref="ArgumentOutOfRangeException"/> is raised when one is expected.
/// </summary>
/// <param name="imm">Element index passed to both methods.</param>
/// <param name="expectedOutOfRangeException">
/// <c>true</c> when <paramref name="imm"/> is expected to be rejected with an
/// <see cref="ArgumentOutOfRangeException"/>.
/// </param>
public void RunReflectionScenario(int imm = 15, bool expectedOutOfRangeException = false)
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunReflectionScenario));

    SByte[] values = new SByte[ElementCount];

    for (int i = 0; i < ElementCount; i++)
    {
        values[i] = TestLibrary.Generator.GetSByte();
    }

    Vector128<SByte> value = Vector128.Create(
        values[0], values[1], values[2], values[3],
        values[4], values[5], values[6], values[7],
        values[8], values[9], values[10], values[11],
        values[12], values[13], values[14], values[15]);

    // A scenario fails either because the expected exception never surfaced, or because an
    // exception surfaced when none was expected; report which of the two actually happened.
    string failureReason = expectedOutOfRangeException
        ? "failed to throw ArgumentOutOfRangeException."
        : "threw an unexpected exception.";

    bool succeeded = !expectedOutOfRangeException;

    try
    {
        object result = typeof(Vector128)
            .GetMethod(nameof(Vector128.GetElement))
            .MakeGenericMethod(typeof(SByte))
            .Invoke(null, new object[] { value, imm });
        ValidateGetResult((SByte)(result), values);
    }
    catch (TargetInvocationException e)
    {
        // Reflection wraps the callee's exception; only the expected inner type counts as a pass.
        succeeded = expectedOutOfRangeException && e.InnerException is ArgumentOutOfRangeException;
    }

    if (!succeeded)
    {
        TestLibrary.TestFramework.LogInformation($"Vector128<SByte>.GetElement({imm}): {nameof(RunReflectionScenario)} {failureReason}");
        TestLibrary.TestFramework.LogInformation(string.Empty);

        Succeeded = false;
    }

    succeeded = !expectedOutOfRangeException;

    SByte insertedValue = TestLibrary.Generator.GetSByte();

    try
    {
        object result2 = typeof(Vector128)
            .GetMethod(nameof(Vector128.WithElement))
            .MakeGenericMethod(typeof(SByte))
            .Invoke(null, new object[] { value, imm, insertedValue });
        ValidateWithResult((Vector128<SByte>)(result2), values, insertedValue);
    }
    catch (TargetInvocationException e)
    {
        succeeded = expectedOutOfRangeException && e.InnerException is ArgumentOutOfRangeException;
    }

    if (!succeeded)
    {
        TestLibrary.TestFramework.LogInformation($"Vector128<SByte>.WithElement({imm}): {nameof(RunReflectionScenario)} {failureReason}");
        TestLibrary.TestFramework.LogInformation(string.Empty);

        Succeeded = false;
    }
}
/// <summary>
/// Calls the <c>GetElement</c> and <c>WithElement</c> members looked up on
/// <c>Vector128&lt;UInt16&gt;</c> through reflection and validates the results, or verifies
/// that an <see cref="ArgumentOutOfRangeException"/> is raised when one is expected.
/// </summary>
/// <param name="imm">Element index passed to both methods.</param>
/// <param name="expectedOutOfRangeException">
/// <c>true</c> when <paramref name="imm"/> is expected to be rejected with an
/// <see cref="ArgumentOutOfRangeException"/>.
/// </param>
public void RunReflectionScenario(int imm = 0, bool expectedOutOfRangeException = false)
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunReflectionScenario));

    UInt16[] values = new UInt16[ElementCount];

    for (int i = 0; i < ElementCount; i++)
    {
        values[i] = TestLibrary.Generator.GetUInt16();
    }

    Vector128<UInt16> value = Vector128.Create(
        values[0], values[1], values[2], values[3],
        values[4], values[5], values[6], values[7]);

    // A scenario fails either because the expected exception never surfaced, or because an
    // exception surfaced when none was expected; report which of the two actually happened.
    string failureReason = expectedOutOfRangeException
        ? "failed to throw ArgumentOutOfRangeException."
        : "threw an unexpected exception.";

    bool succeeded = !expectedOutOfRangeException;

    try
    {
        object result = typeof(Vector128<UInt16>)
            .GetMethod(nameof(Vector128.GetElement), new Type[] { typeof(int) })
            .Invoke(value, new object[] { imm });
        ValidateGetResult((UInt16)(result), values);
    }
    catch (TargetInvocationException e)
    {
        // Reflection wraps the callee's exception; only the expected inner type counts as a pass.
        succeeded = expectedOutOfRangeException && e.InnerException is ArgumentOutOfRangeException;
    }

    if (!succeeded)
    {
        TestLibrary.TestFramework.LogInformation($"Vector128<UInt16>.GetElement({imm}): {nameof(RunReflectionScenario)} {failureReason}");
        TestLibrary.TestFramework.LogInformation(string.Empty);

        Succeeded = false;
    }

    succeeded = !expectedOutOfRangeException;

    UInt16 insertedValue = TestLibrary.Generator.GetUInt16();

    try
    {
        object result2 = typeof(Vector128<UInt16>)
            .GetMethod(nameof(Vector128.WithElement), new Type[] { typeof(int), typeof(UInt16) })
            .Invoke(value, new object[] { imm, insertedValue });
        ValidateWithResult((Vector128<UInt16>)(result2), values, insertedValue);
    }
    catch (TargetInvocationException e)
    {
        succeeded = expectedOutOfRangeException && e.InnerException is ArgumentOutOfRangeException;
    }

    if (!succeeded)
    {
        TestLibrary.TestFramework.LogInformation($"Vector128<UInt16>.WithElement({imm}): {nameof(RunReflectionScenario)} {failureReason}");
        TestLibrary.TestFramework.LogInformation(string.Empty);

        Succeeded = false;
    }
}
/// <summary>
/// Mixes <paramref name="block"/> in place: the block is decoded into 32-bit words, packed
/// into 128-bit vectors, run through a fill pass and a lookup pass over a scratch table
/// (two <c>MixSalsa8Sse2</c> calls per step), and encoded back into the same byte positions.
/// </summary>
/// <param name="block">Buffer read and overwritten; <c>r * 32</c> little-endian words are touched.</param>
/// <param name="r">Block size factor; the working state holds <c>r * 8</c> vectors.</param>
/// <param name="iterations">
/// Number of table rows. Both passes advance two rows per step and the lookup masks with
/// <c>iterations - 1</c>, so this presumably must be an even power of two (confirm with callers).
/// </param>
private static void Mix(byte[] block, int r, int iterations)
{
    var table = new Vector128<uint>[iterations * r * 8];
    var words = new uint[r * 32];

    int vectorCount = r * 8;
    int chunkCount = 2 * r;

    // Decode the block. Within every 16-word chunk, word `lane` is taken from position
    // (lane * 5) % 16 of that chunk.
    for (var chunk = 0; chunk < chunkCount; chunk++)
    {
        int chunkBase = chunk * 16;
        for (var lane = 0; lane < 16; lane++)
        {
            int sourceWord = chunkBase + lane * 5 % 16;
            words[chunkBase + lane] = Le32Dec(block, sourceWord * 4);
        }
    }

    var front = new Vector128<uint>[vectorCount];
    var back = new Vector128<uint>[vectorCount];
    for (var n = 0; n < vectorCount; ++n)
    {
        int first = n * 4;
        front[n] = Vector128.Create(words[first], words[first + 1], words[first + 2], words[first + 3]);
        back[n] = Vector128.Create(0u);
    }

    // Fill pass: snapshot the state into two consecutive table rows per step.
    for (var row = 0; row < iterations; row += 2)
    {
        for (var n = 0; n < vectorCount; ++n)
        {
            table[row * vectorCount + n] = front[n];
        }

        MixSalsa8Sse2(front, back, r);

        for (var n = 0; n < vectorCount; ++n)
        {
            table[(row + 1) * vectorCount + n] = back[n];
        }

        MixSalsa8Sse2(back, front, r);
    }

    // Lookup pass: XOR in the table row selected by the current state, then mix again.
    ulong rowMask = (ulong)(iterations - 1);
    for (var step = 0; step < iterations; step += 2)
    {
        var selected = Integerify(front, r) & rowMask;
        for (var n = 0; n < vectorCount; ++n)
        {
            front[n] = Sse2.Xor(front[n], table[selected * (ulong)r * 8 + (ulong)n]);
        }

        MixSalsa8Sse2(front, back, r);

        selected = Integerify(back, r) & rowMask;
        for (var n = 0; n < vectorCount; ++n)
        {
            back[n] = Sse2.Xor(back[n], table[selected * (ulong)r * 8 + (ulong)n]);
        }

        MixSalsa8Sse2(back, front, r);
    }

    // Unpack the vectors back into words...
    for (var n = 0; n < vectorCount; ++n)
    {
        for (var element = 0; element < 4; ++element)
        {
            words[n * 4 + element] = front[n].GetElement(element);
        }
    }

    // ...and encode them using the same word permutation as the decode step.
    for (var chunk = 0; chunk < chunkCount; chunk++)
    {
        int chunkBase = chunk * 16;
        for (var lane = 0; lane < 16; lane++)
        {
            int targetWord = chunkBase + lane * 5 % 16;
            Le32Enc(words[chunkBase + lane], block, targetWord * 4);
        }
    }
}
// Collapses the deepest level of the octree by one: leafLevel is decremented, and every node
// sitting at the new leaf level absorbs the sums of its children. Each absorbed child is
// zeroed and its index is appended to the list starting at pfree; the list is terminated
// with a 0 entry.
//
// ptree: base of the node array; nodes are addressed by index (ptree + index).
// pfree: output list that receives the indices of the released child nodes.
//
// A node is accessed here as 8 ushort child indices followed, 8 ushorts in, by uint sums.
private void pruneTree(OctreeNode *ptree, ushort *pfree) {
#if HWINTRINSICS
    // Keeps the first three sums intact and only the low 29 bits of the fourth.
    var sumsMask = Vector128.Create(0xffffffffu, 0xffffffffu, 0xffffffffu, 0x1fffffffu);
    var vzero = Vector128 <uint> .Zero;
#endif
    ushort *pnext = pfree;
    // The pre-decrement means `level` is the NEW leaf level.
    uint level = --leafLevel;
    // The scan starts at index 8; NOTE(review): presumably indices below 8 are reserved/root
    // slots -- confirm against the tree construction code.
    for (nuint i = 8; i < maxHistogramSize; i++) {
        var node = ptree + i;
        uint nl = OctreeNode.GetLevel(node);
        if (nl == level) {
            ushort *children = (ushort *)node;
            uint * sums = (uint *)(children + 8);
#if HWINTRINSICS
            if (Sse2.IsSupported) {
                var vsums = Sse2.LoadVector128(sums);
                for (nuint j = 0; j < 8; j++) {
                    nuint child = children[j];
                    if (child != 0) {
                        var cnode = ptree + child;
                        uint *csums = (uint *)((ushort *)cnode + 8);
                        var vcsum = Sse2.And(sumsMask, Sse2.LoadVector128(csums));
                        vsums = Sse2.Add(vsums, vcsum);
                        // Two 16-byte stores clear the child's index table and its sums.
                        // NOTE(review): this clears 32 bytes, while the scalar branch clears
                        // sizeof(OctreeNode) -- assumes the two are equal; confirm.
                        Sse2.Store((uint *)cnode, vzero);
                        Sse2.Store(csums, vzero);
                        *pnext++ = (ushort)child;
                    }
                }
                // The node no longer has children; write back the accumulated sums.
                Sse2.Store((uint *)children, vzero);
                Sse2.Store(sums, vsums);
            } else
#endif
            {
                for (nuint j = 0; j < 8; j++) {
                    nuint child = children[j];
                    if (child != 0) {
                        var cnode = ptree + child;
                        uint *csums = (uint *)((ushort *)cnode + 8);
                        sums[0] += csums[0];
                        sums[1] += csums[1];
                        sums[2] += csums[2];
                        // Same 29-bit mask as sumsMask in the vectorized branch.
                        sums[3] += csums[3] & 0x1fffffff;
                        Unsafe.InitBlockUnaligned(cnode, 0, (uint)sizeof(OctreeNode));
                        *pnext++ = (ushort)child;
                    }
                }
                Unsafe.InitBlockUnaligned(children, 0, sizeof(ushort) * 8);
            }
        }
    }
    // Terminate the list of released node indices.
    *pnext = 0;
}
/// <summary>
/// Converts <paramref name="input"/> to 4 bytes per pixel, written in B, G, R, A byte order,
/// and hands the result to <c>WriteBuffer</c> at the luma plane offset.
/// </summary>
/// <param name="rm">Resource manager that provides the temporary buffer pool.</param>
/// <param name="input">Source surface.</param>
/// <param name="config">Output configuration (block kind and block height are read).</param>
/// <param name="offsets">Plane offsets; only <c>LumaOffset</c> is used.</param>
/// <remarks>
/// The rented pool buffer is returned in a <c>finally</c> block so that it is not leaked
/// when the conversion or <c>WriteBuffer</c> throws.
/// </remarks>
private unsafe static void WriteA8R8G8B8(ResourceManager rm, Surface input, ref OutputSurfaceConfig config, ref PlaneOffsets offsets)
{
    int width = input.Width;
    int height = input.Height;
    int stride = GetPitch(width, 4);

    int dstIndex = rm.BufferPool.Rent(height * stride, out Span<byte> dst);

    try
    {
        if (Ssse3.IsSupported)
        {
            // Swaps bytes 0 and 2 of every 4-byte group (R <-> B) and leaves bytes 1 and 3 in place.
            Vector128<byte> shuffleMask = Vector128.Create(
                (byte)2, (byte)1, (byte)0, (byte)3,
                (byte)6, (byte)5, (byte)4, (byte)7,
                (byte)10, (byte)9, (byte)8, (byte)11,
                (byte)14, (byte)13, (byte)12, (byte)15);

            // The vector loop handles 8 pixels at a time; the scalar tail does the rest.
            int widthTrunc = width & ~7;
            int strideGap = stride - width * 4;

            fixed (Pixel* srcPtr = input.Data)
            {
                Pixel* ip = srcPtr;

                fixed (byte* dstPtr = dst)
                {
                    byte* op = dstPtr;

                    for (int y = 0; y < height; y++, ip += input.Width)
                    {
                        int x = 0;

                        for (; x < widthTrunc; x += 8)
                        {
                            // Each 128-bit load covers two pixels (four 16-bit components each).
                            Vector128<ushort> pixel12 = Sse2.LoadVector128((ushort*)(ip + (uint)x));
                            Vector128<ushort> pixel34 = Sse2.LoadVector128((ushort*)(ip + (uint)x + 2));
                            Vector128<ushort> pixel56 = Sse2.LoadVector128((ushort*)(ip + (uint)x + 4));
                            Vector128<ushort> pixel78 = Sse2.LoadVector128((ushort*)(ip + (uint)x + 6));

                            // Drop the two low bits of every component before narrowing to bytes.
                            pixel12 = Sse2.ShiftRightLogical(pixel12, 2);
                            pixel34 = Sse2.ShiftRightLogical(pixel34, 2);
                            pixel56 = Sse2.ShiftRightLogical(pixel56, 2);
                            pixel78 = Sse2.ShiftRightLogical(pixel78, 2);

                            Vector128<byte> pixel1234 = Sse2.PackUnsignedSaturate(pixel12.AsInt16(), pixel34.AsInt16());
                            Vector128<byte> pixel5678 = Sse2.PackUnsignedSaturate(pixel56.AsInt16(), pixel78.AsInt16());

                            pixel1234 = Ssse3.Shuffle(pixel1234, shuffleMask);
                            pixel5678 = Ssse3.Shuffle(pixel5678, shuffleMask);

                            Sse2.Store(op + 0x00, pixel1234);
                            Sse2.Store(op + 0x10, pixel5678);

                            op += 0x20;
                        }

                        for (; x < width; x++)
                        {
                            Pixel* px = ip + (uint)x;

                            *(op + 0) = Downsample(px->B);
                            *(op + 1) = Downsample(px->G);
                            *(op + 2) = Downsample(px->R);
                            *(op + 3) = Downsample(px->A);

                            op += 4;
                        }

                        // Skip the padding between the end of the row and the next row start.
                        op += strideGap;
                    }
                }
            }
        }
        else
        {
            for (int y = 0; y < height; y++)
            {
                int baseOffs = y * stride;

                for (int x = 0; x < width; x++)
                {
                    int offs = baseOffs + x * 4;

                    dst[offs + 0] = Downsample(input.GetB(x, y));
                    dst[offs + 1] = Downsample(input.GetG(x, y));
                    dst[offs + 2] = Downsample(input.GetR(x, y));
                    dst[offs + 3] = Downsample(input.GetA(x, y));
                }
            }
        }

        bool outLinear = config.OutBlkKind == 0;
        int gobBlocksInY = 1 << config.OutBlkHeight;

        WriteBuffer(rm, dst, offsets.LumaOffset, outLinear, width, height, 4, gobBlocksInY);
    }
    finally
    {
        rm.BufferPool.Return(dstIndex);
    }
}
/// <summary>
/// Parses up to 16 byte-sized bank values from <paramref name="input"/> and repeatedly
/// redistributes the largest bank across all banks until a configuration repeats.
/// <c>PartA</c> receives the number of redistribution steps performed and <c>PartB</c> the
/// number of steps between the two occurrences of the repeated configuration.
/// </summary>
/// <param name="input">
/// ASCII digits separated by bytes below <c>'0'</c> (tab, newline, ...). A <c>null</c> input
/// is ignored.
/// </param>
/// <remarks>
/// The whole state lives in one 128-bit vector (one byte per bank). The loop relies on
/// SSE2/SSSE3/SSE4.1/BMI1 instructions without checking <c>IsSupported</c>, and on C#
/// masking 64-bit shift counts to their low 6 bits. The "add 1 to all" step only tests bit
/// 4 of the maximum, so it presumably assumes bank values stay below 32.
/// </remarks>
protected override unsafe void ExecuteDay(byte[] input)
{
    if (input == null)
    {
        return;
    }

    // borrowed liberally from https://github.com/Voltara/advent2017-fast/blob/master/src/day06.c
    var bytes = stackalloc byte[Vector128<byte>.Count];
    var ulongs = (ulong*)bytes;

    var x = Vector128<byte>.Zero;
    int n = 0;
    var ctr = 0;
    var numberPending = false;

    for (int i = 0; i < input.Length && ctr < 16; i++)
    {
        if (input[i] < '0')
        {
            // separator: commit the number accumulated so far
            x = x.WithElement(ctr++, (byte)n);
            n = 0;
            numberPending = false;
        }
        else
        {
            n = n * 10 + (input[i] - '0');
            numberPending = true;
        }
    }

    // Input without a trailing separator would otherwise silently lose its last number.
    if (numberPending && ctr < 16)
    {
        x = x.WithElement(ctr++, (byte)n);
    }

    var map = new Dictionary<Vector128<byte>, int>(capacity: PERFORMANCE_NOTE)
    {
        [x] = 0,
    };

    ctr = 0;

    // Byte permutations used for the horizontal max: swap neighbours, reverse each 4-byte
    // group, reverse each 8-byte half, reverse all 16 bytes.
    var mask1 = Vector128.Create(0x0607040502030001ul, 0x0e0f0c0d0a0b0809ul).AsByte();
    var mask2 = Vector128.Create(0x0405060700010203ul, 0x0c0d0e0f08090a0bul).AsByte();
    var mask3 = Vector128.Create(0x0001020304050607ul, 0x08090a0b0c0d0e0ful).AsByte();
    var mask4 = Vector128.Create(0x08090a0b0c0d0e0ful, 0x0001020304050607ul).AsByte();

    while (true)
    {
        // get max byte
        var tmp = Avx2.Max(x, Avx2.Shuffle(x, mask1));
        tmp = Avx2.Max(tmp, Avx2.Shuffle(tmp, mask2));
        tmp = Avx2.Max(tmp, Avx2.Shuffle(tmp, mask3));
        tmp = Avx2.Max(tmp, Avx2.Shuffle(tmp, mask4));

        // every byte in tmp should be max value
        var max = Avx2.Extract(tmp, 0);

        // where is it in the original?
        var idx = (int)Bmi1.TrailingZeroCount((uint)Avx2.MoveMask(Avx2.CompareEqual(x, tmp)));

        // subtract it from its original place
        // (high is all ones when the index falls in the upper 8 bytes, zero otherwise)
        var high = (ulong)(long)-((idx & 0x08) >> 3);
        var shift = idx << 3;
        ulongs[0] = ((ulong)max << shift) & ~high;
        ulongs[1] = ((ulong)max << shift) & high;
        tmp = Avx2.Subtract(x, Avx2.LoadVector128(bytes));

        // over 16? add 1 to all
        high = (ulong)(long)-((max & 0x10) >> 4);
        ulongs[0] = high & 0x0101010101010101ul;
        ulongs[1] = high & 0x0101010101010101ul;
        tmp = Avx2.Add(tmp, Avx2.LoadVector128(bytes));

        // spread remainder to all
        // bitmask however many we're adding
        max &= 0x0f;
        shift = max << 3;
        var isLong = (ulong)(long)-((max & 0x08) >> 3);
        var mask = (0x1ul << shift) - 1;
        var lowMask = isLong | mask;
        var highMask = isLong & mask;

        // rotate our start point
        var start = (idx + 1) & 0x0f;
        isLong = (ulong)(long)-((start & 0x08) >> 3);
        var tmpLow = (~isLong & lowMask) | (isLong & highMask);
        var tmpHigh = (isLong & lowMask) | (~isLong & highMask);
        // all ones unless the start is a multiple of 8, in which case no sub-word shift is needed
        var doShift = (ulong)((-(start & 0x07)) >> 4);
        shift = start << 3;
        lowMask = ((tmpLow << shift | tmpHigh >> (128 - shift)) & doShift) | (~doShift & tmpLow);
        highMask = ((tmpHigh << shift | tmpLow >> (128 - shift)) & doShift) | (~doShift & tmpHigh);

        // build our adders and add values
        ulongs[0] = 0x0101010101010101ul & lowMask;
        ulongs[1] = 0x0101010101010101ul & highMask;
        tmp = Avx2.Add(tmp, Avx2.LoadVector128(bytes));

        x = tmp;
        ctr++;

        // Single lookup instead of ContainsKey followed by the indexer.
        if (map.TryGetValue(x, out var firstSeen))
        {
            PartA = ctr.ToString();
            PartB = (ctr - firstSeen).ToString();
            return;
        }

        map[x] = ctr;
    }
}
/// <summary>
/// Checks that <c>VectorUtilities.Min</c> yields the smaller value of each lane pair.
/// </summary>
public static void MinTest()
{
    var left = Vector128.Create(-0.0f, -1.0f, -2.0f, -3.0f);
    var right = Vector128.Create(-3.0f, -2.0f, -1.0f, -0.0f);
    var expected = Vector128.Create(-3.0f, -2.0f, -2.0f, -3.0f);

    Assert.That(
        () => VectorUtilities.Min(left, right),
        Is.EqualTo(expected)
    );
}
/// <summary>
/// Checks <c>VectorUtilities.MultiplyAddNegated</c>: the expected lanes equal the first
/// operand minus the lane-wise product of the second and third operands.
/// </summary>
public static void MultiplyAddNegatedTest()
{
    var addend = Vector128.Create(10.0f, 10.0f, 10.0f, 10.0f);
    var left = Vector128.Create(0.0f, 1.0f, 2.0f, 3.0f);
    var right = Vector128.Create(4.0f, 5.0f, 6.0f, 7.0f);

    // 10 - 0*4, 10 - 1*5, 10 - 2*6, 10 - 3*7
    var expected = Vector128.Create(10.0f, 5.0f, -2.0f, -11.0f);

    Assert.That(
        () => VectorUtilities.MultiplyAddNegated(addend, left, right),
        Is.EqualTo(expected)
    );
}
/// <summary>
/// Checks that <c>VectorUtilities.LengthSquared</c> returns the sum of the squared lanes in
/// every lane of the result.
/// </summary>
public static void LengthSquaredTest()
{
    var vector = Vector128.Create(0.0f, 1.0f, 2.0f, 3.0f);

    // 0 + 1 + 4 + 9
    var expected = Vector128.Create(14.0f, 14.0f, 14.0f, 14.0f);

    Assert.That(
        () => VectorUtilities.LengthSquared(vector),
        Is.EqualTo(expected)
    );
}
/// <summary>
/// Checks that <c>VectorUtilities.Max</c> yields the larger value of each lane pair.
/// </summary>
public static void MaxTest()
{
    var left = Vector128.Create(0.0f, 1.0f, 2.0f, 3.0f);
    var right = Vector128.Create(3.0f, 2.0f, 1.0f, 0.0f);
    var expected = Vector128.Create(3.0f, 2.0f, 2.0f, 3.0f);

    Assert.That(
        () => VectorUtilities.Max(left, right),
        Is.EqualTo(expected)
    );
}
/// <summary>
/// Checks that <c>VectorUtilities.CompareEqualAll</c> is true for identical vectors and
/// false when some lanes differ.
/// </summary>
public static void CompareEqualAllTest()
{
    var reference = Vector128.Create(1.0f, 2.0f, 3.0f, 4.0f);
    var identical = Vector128.Create(1.0f, 2.0f, 3.0f, 4.0f);
    var partiallyDifferent = Vector128.Create(1.0f, -2.0f, 3.0f, -4.0f);

    Assert.That(
        () => VectorUtilities.CompareEqualAll(reference, identical),
        Is.True
    );

    Assert.That(
        () => VectorUtilities.CompareEqualAll(reference, partiallyDifferent),
        Is.False
    );
}
/// <summary>
/// Checks that <c>VectorUtilities.Length</c> returns the Euclidean length of the input in
/// every lane of the result.
/// </summary>
public static void LengthTest()
{
    var vector = Vector128.Create(0.0f, 1.0f, 2.0f, 3.0f);

    // sqrt(0 + 1 + 4 + 9)
    var expected = Vector128.Create(3.7416575f, 3.7416575f, 3.7416575f, 3.7416575f);

    Assert.That(
        () => VectorUtilities.Length(vector),
        Is.EqualTo(expected)
    );
}
/// <summary>
/// Checks that <c>VectorUtilities.InterleaveUpper</c> alternates the upper two lanes of the
/// first operand with the upper two lanes of the second.
/// </summary>
public static void InterleaveUpperTest()
{
    var left = Vector128.Create(0.0f, 1.0f, 2.0f, 3.0f);
    var right = Vector128.Create(4.0f, 5.0f, 6.0f, 7.0f);
    var expected = Vector128.Create(2.0f, 6.0f, 3.0f, 7.0f);

    Assert.That(
        () => VectorUtilities.InterleaveUpper(left, right),
        Is.EqualTo(expected)
    );
}
/// <summary>
/// Initializes the hash state with the four MD5 initial chaining words and resets the byte
/// position to zero.
/// </summary>
/// <param name="ignored">Unused; it is only forwarded to <c>Miscellaneous.IgnoreParameter</c>.</param>
private MD5HashCore(int ignored = default)
{
    // Initial values of the A, B, C and D words (RFC 1321, section 3.3).
    const uint InitialA = 0X67452301U;
    const uint InitialB = 0Xefcdab89U;
    const uint InitialC = 0X98badcfeU;
    const uint InitialD = 0X10325476U;

    Miscellaneous.IgnoreParameter(ignored);

    _HashCode = Vector128.Create(InitialA, InitialB, InitialC, InitialD);
    _BytePosition = 0;
}
/// <summary>
/// Checks <c>VectorUtilities.MultiplyByW</c>: every lane of the first operand is expected to
/// be scaled by the last lane of the second operand.
/// </summary>
public static void MultiplyByWTest()
{
    var left = Vector128.Create(0.0f, 1.0f, 2.0f, 3.0f);
    var right = Vector128.Create(4.0f, 5.0f, 6.0f, 7.0f);

    // each lane of `left` times 7
    var expected = Vector128.Create(0.0f, 7.0f, 14.0f, 21.0f);

    Assert.That(
        () => VectorUtilities.MultiplyByW(left, right),
        Is.EqualTo(expected)
    );
}
/// <summary>Builds a four-lane single-precision vector from its individual components.</summary>
/// <param name="x">Value of lane 0.</param>
/// <param name="y">Value of lane 1.</param>
/// <param name="z">Value of lane 2.</param>
/// <param name="w">Value of lane 3.</param>
/// <returns>The vector <c>(x, y, z, w)</c>.</returns>
public static Vector128<float> Create(float x, float y, float z, float w)
{
    Vector128<float> result = Vector128.Create(x, y, z, w);
    return result;
}
/// <summary>
/// Checks that <c>VectorUtilities.Normalize</c> divides every lane by the vector's length.
/// </summary>
public static void NormalizeTest()
{
    var vector = Vector128.Create(0.0f, 1.0f, 2.0f, 3.0f);

    // each lane divided by sqrt(14)
    var expected = Vector128.Create(0.0f, 0.26726124f, 0.5345225f, 0.8017837f);

    Assert.That(
        () => VectorUtilities.Normalize(vector),
        Is.EqualTo(expected)
    );
}
/// <summary>
/// Writes <paramref name="input"/> as two planes: a 1-byte-per-sample plane built from the
/// <c>R</c> component of every pixel, and a 2-bytes-per-sample plane built from the
/// <c>G</c> and <c>B</c> components of every second pixel on every second row. Each plane
/// is handed to <c>WriteBuffer</c> at its respective offset.
/// </summary>
/// <param name="rm">Resource manager that provides the temporary buffer pool.</param>
/// <param name="input">Source surface.</param>
/// <param name="config">Output configuration (plane dimensions, block kind and block height).</param>
/// <param name="offsets">Plane offsets; <c>LumaOffset</c> and <c>ChromaUOffset</c> are used.</param>
/// <remarks>
/// Each rented pool buffer is returned in a <c>finally</c> block so that it is not leaked
/// when the conversion or <c>WriteBuffer</c> throws.
/// </remarks>
private unsafe static void WriteNv12(ResourceManager rm, Surface input, ref OutputSurfaceConfig config, ref PlaneOffsets offsets)
{
    int gobBlocksInY = 1 << config.OutBlkHeight;

    bool outLinear = config.OutBlkKind == 0;

    int width = Math.Min(config.OutLumaWidth + 1, input.Width);
    int height = Math.Min(config.OutLumaHeight + 1, input.Height);
    int yStride = GetPitch(config.OutLumaWidth + 1, 1);

    int dstYIndex = rm.BufferPool.Rent((config.OutLumaHeight + 1) * yStride, out Span<byte> dstY);

    try
    {
        if (Sse41.IsSupported)
        {
            // Keeps only the lowest 16-bit component (R) of every 64-bit pixel.
            Vector128<ushort> mask = Vector128.Create(0xffffUL).AsUInt16();

            // The vector loop handles 16 pixels at a time; the scalar tail does the rest.
            int widthTrunc = width & ~0xf;
            int strideGap = yStride - width;

            fixed (Pixel* srcPtr = input.Data)
            {
                Pixel* ip = srcPtr;

                fixed (byte* dstPtr = dstY)
                {
                    byte* op = dstPtr;

                    for (int y = 0; y < height; y++, ip += input.Width)
                    {
                        int x = 0;

                        for (; x < widthTrunc; x += 16)
                        {
                            byte* baseOffset = (byte*)(ip + (ulong)(uint)x);

                            // Eight loads of two pixels each.
                            Vector128<ushort> pixelp1 = Sse2.LoadVector128((ushort*)baseOffset);
                            Vector128<ushort> pixelp2 = Sse2.LoadVector128((ushort*)(baseOffset + 0x10));
                            Vector128<ushort> pixelp3 = Sse2.LoadVector128((ushort*)(baseOffset + 0x20));
                            Vector128<ushort> pixelp4 = Sse2.LoadVector128((ushort*)(baseOffset + 0x30));
                            Vector128<ushort> pixelp5 = Sse2.LoadVector128((ushort*)(baseOffset + 0x40));
                            Vector128<ushort> pixelp6 = Sse2.LoadVector128((ushort*)(baseOffset + 0x50));
                            Vector128<ushort> pixelp7 = Sse2.LoadVector128((ushort*)(baseOffset + 0x60));
                            Vector128<ushort> pixelp8 = Sse2.LoadVector128((ushort*)(baseOffset + 0x70));

                            pixelp1 = Sse2.And(pixelp1, mask);
                            pixelp2 = Sse2.And(pixelp2, mask);
                            pixelp3 = Sse2.And(pixelp3, mask);
                            pixelp4 = Sse2.And(pixelp4, mask);
                            pixelp5 = Sse2.And(pixelp5, mask);
                            pixelp6 = Sse2.And(pixelp6, mask);
                            pixelp7 = Sse2.And(pixelp7, mask);
                            pixelp8 = Sse2.And(pixelp8, mask);

                            // Two rounds of 32 -> 16 bit packing squeeze out the zeroed components.
                            Vector128<ushort> pixelq1 = Sse41.PackUnsignedSaturate(pixelp1.AsInt32(), pixelp2.AsInt32());
                            Vector128<ushort> pixelq2 = Sse41.PackUnsignedSaturate(pixelp3.AsInt32(), pixelp4.AsInt32());
                            Vector128<ushort> pixelq3 = Sse41.PackUnsignedSaturate(pixelp5.AsInt32(), pixelp6.AsInt32());
                            Vector128<ushort> pixelq4 = Sse41.PackUnsignedSaturate(pixelp7.AsInt32(), pixelp8.AsInt32());

                            pixelq1 = Sse41.PackUnsignedSaturate(pixelq1.AsInt32(), pixelq2.AsInt32());
                            pixelq2 = Sse41.PackUnsignedSaturate(pixelq3.AsInt32(), pixelq4.AsInt32());

                            pixelq1 = Sse2.ShiftRightLogical(pixelq1, 2);
                            pixelq2 = Sse2.ShiftRightLogical(pixelq2, 2);

                            Vector128<byte> pixel = Sse2.PackUnsignedSaturate(pixelq1.AsInt16(), pixelq2.AsInt16());

                            Sse2.Store(op, pixel);

                            op += 0x10;
                        }

                        for (; x < width; x++)
                        {
                            Pixel* px = ip + (uint)x;

                            *op++ = Downsample(px->R);
                        }

                        op += strideGap;
                    }
                }
            }
        }
        else
        {
            for (int y = 0; y < height; y++)
            {
                for (int x = 0; x < width; x++)
                {
                    dstY[y * yStride + x] = Downsample(input.GetR(x, y));
                }
            }
        }

        WriteBuffer(
            rm,
            dstY,
            offsets.LumaOffset,
            outLinear,
            config.OutLumaWidth + 1,
            config.OutLumaHeight + 1,
            1,
            gobBlocksInY);
    }
    finally
    {
        rm.BufferPool.Return(dstYIndex);
    }

    int uvWidth = Math.Min(config.OutChromaWidth + 1, (width + 1) >> 1);
    int uvHeight = Math.Min(config.OutChromaHeight + 1, (height + 1) >> 1);
    int uvStride = GetPitch(config.OutChromaWidth + 1, 2);

    int dstUvIndex = rm.BufferPool.Rent((config.OutChromaHeight + 1) * uvStride, out Span<byte> dstUv);

    try
    {
        if (Sse2.IsSupported)
        {
            // The vector loop produces 8 two-byte samples at a time; the scalar tail does the rest.
            int widthTrunc = uvWidth & ~7;
            int strideGap = uvStride - uvWidth * 2;

            fixed (Pixel* srcPtr = input.Data)
            {
                Pixel* ip = srcPtr;

                fixed (byte* dstPtr = dstUv)
                {
                    byte* op = dstPtr;

                    for (int y = 0; y < uvHeight; y++, ip += input.Width * 2)
                    {
                        int x = 0;

                        for (; x < widthTrunc; x += 8)
                        {
                            // Every output sample reads every second pixel (16 bytes apart).
                            byte* baseOffset = (byte*)ip + (ulong)(uint)x * 16;

                            // Offset 2 skips the first 16-bit component, so each 32-bit load
                            // picks up the second and third components of the pixel.
                            Vector128<uint> pixel1 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x02));
                            Vector128<uint> pixel2 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x12));
                            Vector128<uint> pixel3 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x22));
                            Vector128<uint> pixel4 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x32));
                            Vector128<uint> pixel5 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x42));
                            Vector128<uint> pixel6 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x52));
                            Vector128<uint> pixel7 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x62));
                            Vector128<uint> pixel8 = Sse2.LoadScalarVector128((uint*)(baseOffset + 0x72));

                            Vector128<uint> pixel12 = Sse2.UnpackLow(pixel1, pixel2);
                            Vector128<uint> pixel34 = Sse2.UnpackLow(pixel3, pixel4);
                            Vector128<uint> pixel56 = Sse2.UnpackLow(pixel5, pixel6);
                            Vector128<uint> pixel78 = Sse2.UnpackLow(pixel7, pixel8);

                            Vector128<ushort> pixel1234 = Sse2.UnpackLow(pixel12.AsUInt64(), pixel34.AsUInt64()).AsUInt16();
                            Vector128<ushort> pixel5678 = Sse2.UnpackLow(pixel56.AsUInt64(), pixel78.AsUInt64()).AsUInt16();

                            // The shift must operate on 16-bit lanes. A 64-bit lane shift
                            // moves the low bits of each component into the top of its
                            // neighbour, which the signed-saturating pack below then turns
                            // into 0 or 255. The luma path and the scalar tail both work
                            // per 16-bit component.
                            pixel1234 = Sse2.ShiftRightLogical(pixel1234, 2);
                            pixel5678 = Sse2.ShiftRightLogical(pixel5678, 2);

                            Vector128<byte> pixel = Sse2.PackUnsignedSaturate(pixel1234.AsInt16(), pixel5678.AsInt16());

                            Sse2.Store(op, pixel);

                            op += 0x10;
                        }

                        for (; x < uvWidth; x++)
                        {
                            Pixel* px = ip + (uint)(x << 1);

                            *op++ = Downsample(px->G);
                            *op++ = Downsample(px->B);
                        }

                        op += strideGap;
                    }
                }
            }
        }
        else
        {
            for (int y = 0; y < uvHeight; y++)
            {
                for (int x = 0; x < uvWidth; x++)
                {
                    int xx = x << 1;
                    int yy = y << 1;

                    int uvOffs = y * uvStride + xx;

                    dstUv[uvOffs + 0] = Downsample(input.GetG(xx, yy));
                    dstUv[uvOffs + 1] = Downsample(input.GetB(xx, yy));
                }
            }
        }

        WriteBuffer(
            rm,
            dstUv,
            offsets.ChromaUOffset,
            outLinear,
            config.OutChromaWidth + 1,
            config.OutChromaHeight + 1,
            2,
            gobBlocksInY);
    }
    finally
    {
        rm.BufferPool.Return(dstUvIndex);
    }
}
/// <summary>
/// Checks that <c>VectorUtilities.QuaternionConjugate</c> negates the first three lanes and
/// leaves the last lane untouched.
/// </summary>
public static void QuaternionConjugateTest()
{
    var quaternion = Vector128.Create(1.0f, 2.0f, 3.0f, 4.0f);
    var expected = Vector128.Create(-1.0f, -2.0f, -3.0f, 4.0f);

    Assert.That(
        () => VectorUtilities.QuaternionConjugate(quaternion),
        Is.EqualTo(expected)
    );
}
// Copies the X, Y, Z and W components of the Vector4 into lanes 0..3 of a new vector.
static VectorF SoftwareFallback(Vector4 vector)
    => Vector128.Create(vector.X, vector.Y, vector.Z, vector.W);
/// <summary>
/// Checks that <c>VectorUtilities.Sqrt</c> takes the square root of every lane.
/// </summary>
public static void SqrtTest()
{
    var vector = Vector128.Create(0.0f, 1.0f, 2.0f, 3.0f);
    var expected = Vector128.Create(0.0f, 1.0f, 1.4142135f, 1.7320508f);

    Assert.That(
        () => VectorUtilities.Sqrt(vector),
        Is.EqualTo(expected)
    );
}
/// <summary>
/// Fills <paramref name="length"/> consecutive integers starting at
/// <paramref name="targetPtr"/> with <paramref name="startValue"/>,
/// <paramref name="startValue"/> + 1, <paramref name="startValue"/> + 2, and so on.
/// </summary>
/// <param name="targetPtr">Destination of the first element.</param>
/// <param name="startValue">Value written to the first element.</param>
/// <param name="length">Number of elements to write.</param>
/// <remarks>
/// On NETCOREAPP builds with SSE2 available and at least two 128-bit vectors of data, only
/// the leading elements reported by <c>UnalignedCountVector128</c> are written with scalar
/// stores; the remainder is written with AVX2 or SSE2 vector stores, followed by a scalar tail.
/// </remarks>
public static unsafe void SequentialFill(int* targetPtr, int startValue, int length)
{
    int next = startValue;
    nint remaining = length;

#if NETCOREAPP
    nint alignmentCount = length;
    if (Sse2.IsSupported && length >= Vector128<int>.Count * 2)
    {
        // Only the leading elements are handled by the scalar code below.
        alignmentCount = UnalignedCountVector128(targetPtr);
        remaining = alignmentCount;
    }
#endif

    // Scalar path, unrolled by eight.
    while (remaining >= 8)
    {
        remaining -= 8;

        *targetPtr++ = next++;
        *targetPtr++ = next++;
        *targetPtr++ = next++;
        *targetPtr++ = next++;
        *targetPtr++ = next++;
        *targetPtr++ = next++;
        *targetPtr++ = next++;
        *targetPtr++ = next++;
    }

    if (remaining > 4)
    {
        remaining -= 4;

        *targetPtr++ = next++;
        *targetPtr++ = next++;
        *targetPtr++ = next++;
        *targetPtr++ = next++;
    }

    for (; remaining > 0; remaining--)
    {
        *targetPtr++ = next++;
    }

#if NETCOREAPP
    remaining = length - alignmentCount;
    if (remaining > 0)
    {
        if (Avx2.IsSupported)
        {
            var step256 = Vector256.Create(Vector256<int>.Count);
            var current256 = Avx2.Add(Vector256.Create(next), VECTOR256_INT_ZERO_TO_SEVEN);

            while (remaining >= Vector256<int>.Count)
            {
                Avx.Store(targetPtr, current256);
                current256 = Avx2.Add(current256, step256);

                targetPtr += Vector256<int>.Count;
                remaining -= Vector256<int>.Count;
            }

            if (remaining >= Vector128<int>.Count)
            {
                // One more half-width store; the scalar tail resumes at the first value of
                // the upper half.
                Sse2.Store(targetPtr, current256.GetLower());

                targetPtr += Vector128<int>.Count;
                remaining -= Vector128<int>.Count;
                next = current256.GetElement(Vector128<int>.Count);
            }
            else
            {
                next = current256.GetElement(0);
            }
        }
        else if (Sse2.IsSupported)
        {
            var step128 = Vector128.Create(Vector128<int>.Count);
            var current128 = Sse2.Add(Vector128.Create(next), VECTOR128_INT_ZERO_TO_THREE);

            while (remaining >= Vector128<int>.Count)
            {
                Sse2.Store(targetPtr, current128);
                current128 = Sse2.Add(current128, step128);

                targetPtr += Vector128<int>.Count;
                remaining -= Vector128<int>.Count;
            }

            next = current128.GetElement(0);
        }

        // Scalar tail for whatever the vector stores did not cover.
        for (; remaining > 0; remaining--)
        {
            *targetPtr++ = next++;
        }
    }
#endif
}
/// <summary>
/// Checks that <c>VectorUtilities.Subtract</c> subtracts the second operand from the first
/// lane by lane.
/// </summary>
public static void SubtractTest()
{
    var left = Vector128.Create(0.0f, 1.0f, 2.0f, 3.0f);
    var right = Vector128.Create(4.0f, 5.0f, 6.0f, 7.0f);
    var expected = Vector128.Create(-4.0f, -4.0f, -4.0f, -4.0f);

    Assert.That(
        () => VectorUtilities.Subtract(left, right),
        Is.EqualTo(expected)
    );
}
// Maps a run of 4-byte pixels to one output byte each (read from the fourth sum of the tree
// node selected for the pixel) while carrying quantization error forward between pixels and
// into the perr row. NOTE(review): the weights look like an error-diffusion dither kernel --
// confirm against the caller.
//
// pimage:   input pixels, 4 bytes each; byte 3 is compared against alphaThreshold.
// perr:     error row; 4 ints are read per pixel and the slot 4 ints before the current
//           position is written, so the buffer presumably has padding in front -- confirm.
// pout:     output, one byte per pixel.
// pilut:    lookup table with three 256-entry sections (offsets 0, 256 and 512), one per
//           channel, OR-ed together to form the tree path bits.
// ptree:    base of the octree node array.
// ppal:     not referenced in this method.
// nextFree: index of the next unused node; advanced when new nodes are linked in.
// cp:       number of pixels to process.
private void remapDitherSse2(byte *pimage, int *perr, byte *pout, uint *pilut, OctreeNode *ptree, uint *ppal, ref nuint nextFree, nint cp) {
    // Stand-in node used for pixels below the alpha threshold; its output byte is palEntries - 1.
    var transnode = new OctreeNode();
    transnode.Sums[3] = (uint)palEntries - 1;
    var vpmax = Vector128.Create((int)byte.MaxValue);
    var vprnd = Vector128.Create(7);
    var vzero = Vector128 <int> .Zero;
    nuint level = leafLevel;
    var prnod = default(OctreeNode *);
    byte *ip = pimage, ipe = ip + cp * sizeof(uint);
    byte *op = pout;
    int * ep = perr;
    // vppix: last pixel value that was looked up; vperr / vnerr: error terms carried between pixels.
    var vppix = vzero;
    var vperr = vzero;
    var vnerr = vzero;
    do {
        Vector128 <int> vpix, vdiff;
        if ((byte)ip[3] < alphaThreshold) {
            vppix = vzero;
            vdiff = vzero;
            prnod = &transnode;
            goto FoundExact;
        }
        // Widen the 4 pixel bytes to 4 ints (SSE4.1 has a direct conversion; SSE2 unpacks with zero twice).
        if (Sse41.IsSupported) {
            vpix = Sse41.ConvertToVector128Int32(ip);
        } else {
            vpix = Sse2.UnpackLow(Sse2.UnpackLow(Sse2.LoadScalarVector128((int *)ip).AsByte(), vzero.AsByte()).AsInt16(), vzero.AsInt16()).AsInt32();
        }
        // verr = 7 + stored row error + 7 * previous pixel's difference; (verr >> 4) is added to the pixel.
        var verr = Sse2.Add(Sse2.Add(vprnd, Sse2.LoadVector128(ep)), Sse2.Subtract(Sse2.ShiftLeftLogical(vnerr, 3), vnerr));
        vpix = Sse2.Add(vpix, Sse2.ShiftRightArithmetic(verr, 4));
        // Clamp to [0, 255] using 16-bit min/max.
        vpix = Sse2.Min(vpix.AsInt16(), vpmax.AsInt16()).AsInt32();
        vpix = Sse2.Max(vpix.AsInt16(), vzero.AsInt16()).AsInt32();
        // Same value as the previously looked-up pixel: reuse prnod with a zero difference.
        if (Sse2.MoveMask(Sse2.CompareEqual(vppix, vpix).AsByte()) == ushort.MaxValue) {
            vdiff = vzero;
            goto FoundExact;
        }
        vppix = vpix;
        // Combine the per-channel table entries; idx is then consumed 3 bits per tree level.
        nuint idx = pilut[(nuint)Sse2.ConvertToUInt32(vppix.AsUInt32())] | pilut[(nuint)Sse2.Extract(vppix.AsUInt16(), 2) + 256] | pilut[(nuint)Sse2.Extract(vppix.AsUInt16(), 4) + 512];
        nuint next = idx & 7;
        var pnode = ptree + next;
        for (nuint i = 0; i <= level; i++) {
            idx >>= 3;
            nuint child = idx & 7;
            ushort *children = (ushort *)pnode;
            next = children[child];
            if (next == 0) {
                // No child on this path.
                uint *sums = (uint *)(children + 8);
                if (i < minLeafLevel) {
                    // Above the minimum leaf level: link in a fresh node from nextFree.
                    next = nextFree++;
                    children[child] = (ushort)next;
                    pnode = ptree + next;
                    if (i == minLeafLevel - 1) {
                        initNode(pnode, vppix);
                        break;
                    } else {
                        // New interior node: low byte of its fourth sum is set to byte.MaxValue.
                        uint *csums = (uint *)((ushort *)pnode + 8);
                        csums[3] = byte.MaxValue;
                    }
                } else if ((byte)sums[3] == byte.MaxValue) {
                    // Current node carries the byte.MaxValue marker: try the sibling slots
                    // (child ^ j) instead of the missing child.
                    for (nuint j = 1; j < 8; j++) {
                        nuint sibling = children[child ^ j];
                        if (sibling != 0) {
                            var snode = ptree + sibling;
                            uint *ssums = (uint *)((ushort *)snode + 8);
                            if ((byte)ssums[3] == byte.MaxValue) {
                                // Sibling carries the marker too: continue the descent through it.
                                next = sibling;
                                nuint mask = child ^ sibling;
                                idx = (child & mask) | (idx & ~mask);
                                break;
                            } else {
                                // Sibling without the marker: use it as the result node.
                                prnod = snode;
                                goto Found;
                            }
                        }
                    }
                } else {
                    break;
                }
            }
            pnode = ptree + next;
        }
        prnod = pnode;
        // Difference between the adjusted pixel and the sums stored in the selected node.
        Found: vdiff = Sse2.Subtract(vppix, Sse2.LoadVector128((int *)((ushort *)prnod + 8)));
        FoundExact: int *psums = (int *)((ushort *)prnod + 8);
        ip += sizeof(uint);
        // Output byte is the low byte of the node's fourth sum.
        *op++ = (byte)psums[3];
        // Previous slot of the error row receives vperr + 2 * vdiff.
        Sse2.Store(ep - Vector128 <int> .Count, Sse2.Add(vperr, Sse2.Add(vdiff, vdiff)));
        ep += Vector128 <int> .Count;
        // Carry 4 * vdiff + previous difference forward, and remember vdiff for the next pixel.
        vperr = Sse2.Add(Sse2.ShiftLeftLogical(vdiff, 2), vnerr);
        vnerr = vdiff;
    } while (ip < ipe);
    // Flush the last carried error term into the final slot.
    Sse2.Store(ep - Vector128 <int> .Count, vperr);
}
/// <summary>
/// Checks that <c>VectorUtilities.CompareGreaterThanOrEqual</c> sets every bit of a lane
/// whose left value is greater than or equal to the right value.
/// </summary>
public static void CompareGreaterThanOrEqualTest()
{
    var left = Vector128.Create(1.0f, 2.0f, 3.0f, 4.0f);
    var right = Vector128.Create(1.0f, -2.0f, 3.0f, -4.0f);

    // Every lane satisfies left >= right, so every lane is expected to be all ones.
    var expected = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);

    Assert.That(
        () => VectorUtilities.CompareGreaterThanOrEqual(left, right).AsUInt32(),
        Is.EqualTo(expected)
    );
}
/// <summary>
/// Looks up each parameterless <c>As*</c> member on <c>Vector128&lt;UInt64&gt;</c> through
/// reflection, invokes it on a freshly generated vector, and validates the returned vector
/// against the source value.
/// </summary>
public void RunReflectionScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunReflectionScenario));

    Type vectorType = typeof(Vector128<UInt64>);

    // Resolves the named parameterless member and invokes it on `source`.
    object InvokeConversion(string conversionName, Vector128<UInt64> source)
    {
        return vectorType
            .GetMethod(conversionName, new Type[0])
            .Invoke(source, new object[0]);
    }

    Vector128<UInt64> value = Vector128.Create(TestLibrary.Generator.GetUInt64());
    object byteResult = InvokeConversion(nameof(Vector128<UInt64>.AsByte), value);
    ValidateResult((Vector128<byte>)(byteResult), value);

    value = Vector128.Create(TestLibrary.Generator.GetUInt64());
    object doubleResult = InvokeConversion(nameof(Vector128<UInt64>.AsDouble), value);
    ValidateResult((Vector128<double>)(doubleResult), value);

    value = Vector128.Create(TestLibrary.Generator.GetUInt64());
    object shortResult = InvokeConversion(nameof(Vector128<UInt64>.AsInt16), value);
    ValidateResult((Vector128<short>)(shortResult), value);

    value = Vector128.Create(TestLibrary.Generator.GetUInt64());
    object intResult = InvokeConversion(nameof(Vector128<UInt64>.AsInt32), value);
    ValidateResult((Vector128<int>)(intResult), value);

    value = Vector128.Create(TestLibrary.Generator.GetUInt64());
    object longResult = InvokeConversion(nameof(Vector128<UInt64>.AsInt64), value);
    ValidateResult((Vector128<long>)(longResult), value);

    value = Vector128.Create(TestLibrary.Generator.GetUInt64());
    object sbyteResult = InvokeConversion(nameof(Vector128<UInt64>.AsSByte), value);
    ValidateResult((Vector128<sbyte>)(sbyteResult), value);

    value = Vector128.Create(TestLibrary.Generator.GetUInt64());
    object floatResult = InvokeConversion(nameof(Vector128<UInt64>.AsSingle), value);
    ValidateResult((Vector128<float>)(floatResult), value);

    value = Vector128.Create(TestLibrary.Generator.GetUInt64());
    object ushortResult = InvokeConversion(nameof(Vector128<UInt64>.AsUInt16), value);
    ValidateResult((Vector128<ushort>)(ushortResult), value);

    value = Vector128.Create(TestLibrary.Generator.GetUInt64());
    object uintResult = InvokeConversion(nameof(Vector128<UInt64>.AsUInt32), value);
    ValidateResult((Vector128<uint>)(uintResult), value);

    value = Vector128.Create(TestLibrary.Generator.GetUInt64());
    object ulongResult = InvokeConversion(nameof(Vector128<UInt64>.AsUInt64), value);
    ValidateResult((Vector128<ulong>)(ulongResult), value);
}
/// <summary>
/// Returns a new <c>VectorArg128</c> whose vector is <c>_rgb</c> with
/// <paramref name="f"/> added to every lane.
/// </summary>
/// <param name="f">Value broadcast to all lanes before the addition.</param>
public VectorArg128 Change(float f)
    => new VectorArg128(Sse.Add(Vector128.Create(f), _rgb));
/// <summary>
/// Given an input buffer <paramref name="pInputBuffer"/> of char length <paramref name="inputLength"/>,
/// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
/// </summary>
/// <param name="pInputBuffer">
/// Pointer to the first UTF-16 code unit to validate. May be null only when
/// <paramref name="inputLength"/> is zero (checked by a debug assertion).
/// </param>
/// <param name="inputLength">
/// Number of chars available at <paramref name="pInputBuffer"/>. Must not be negative
/// (checked by a debug assertion).
/// </param>
/// <param name="utf8CodeUnitCountAdjustment">
/// On return, the value to add to the number of UTF-16 code units seen in order to get the
/// number of UTF-8 code units required to encode that data.
/// </param>
/// <param name="scalarCountAdjustment">
/// On return, the value to add to the number of UTF-16 code units seen in order to get the
/// number of scalars present in that data.
/// </param>
/// <returns>
/// A pointer to the first invalid char, or a pointer to the end of the buffer
/// (<c>pInputBuffer + inputLength</c>) if the buffer is well-formed.
/// </returns>
/// <remarks>
/// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
/// </remarks>
public static char *GetPointerToFirstInvalidChar(char *pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment)
{
    Debug.Assert(inputLength >= 0, "Input length must not be negative.");
    Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");

    // First, we'll handle the common case of all-ASCII. If this is able to
    // consume the entire buffer, we'll skip the remainder of this method's logic.

    int numAsciiCharsConsumedJustNow = (int)ASCIIUtility.GetIndexOfFirstNonAsciiChar(pInputBuffer, (uint)inputLength);
    Debug.Assert(0 <= numAsciiCharsConsumedJustNow && numAsciiCharsConsumedJustNow <= inputLength);

    pInputBuffer += (uint)numAsciiCharsConsumedJustNow;
    inputLength -= numAsciiCharsConsumedJustNow;

    if (inputLength == 0)
    {
        // Entire buffer was ASCII: one UTF-8 code unit and one scalar per char, so no adjustments.
        utf8CodeUnitCountAdjustment = 0;
        scalarCountAdjustment = 0;
        return(pInputBuffer);
    }

    // If we got here, it means we saw some non-ASCII data, so within our
    // vectorized code paths below we'll handle all non-surrogate UTF-16
    // code points branchlessly. We'll only branch if we see surrogates.
    //
    // We still optimistically assume the data is mostly ASCII. This means that the
    // number of UTF-8 code units and the number of scalars almost matches the number
    // of UTF-16 code units. As we go through the input and find non-ASCII
    // characters, we'll keep track of these "adjustment" fixups. To get the
    // total number of UTF-8 code units required to encode the input data, add
    // the UTF-8 code unit count adjustment to the number of UTF-16 code units
    // seen. To get the total number of scalars present in the input data,
    // add the scalar count adjustment to the number of UTF-16 code units seen.

    long tempUtf8CodeUnitCountAdjustment = 0;
    int tempScalarCountAdjustment = 0;

    if ((AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || Sse2.IsSupported)
    {
        if (inputLength >= Vector128 <ushort> .Count)
        {
            Vector128 <ushort> vector0080 = Vector128.Create((ushort)0x80);
            Vector128 <ushort> vectorA800 = Vector128.Create((ushort)0xA800);
            Vector128 <short> vector8800 = Vector128.Create(unchecked ((short)0x8800));
            Vector128 <ushort> vectorZero = Vector128 <ushort> .Zero;

            do
            {
                Vector128 <ushort> utf16Data;
                if (AdvSimd.Arm64.IsSupported)
                {
                    utf16Data = AdvSimd.LoadVector128((ushort *)pInputBuffer); // unaligned
                }
                else
                {
                    utf16Data = Sse2.LoadVector128((ushort *)pInputBuffer); // unaligned
                }

                Vector128 <ushort> charIsNonAscii;
                if (AdvSimd.Arm64.IsSupported)
                {
                    // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                    // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)
                    charIsNonAscii = AdvSimd.Min(utf16Data, vector0080);
                }
                else if (Sse41.IsSupported)
                {
                    // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                    // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)
                    charIsNonAscii = Sse41.Min(utf16Data, vector0080);
                }
                else
                {
                    // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                    // input was 0x0080 <= [value] <= 0x7FFF. The case where 0x8000 <= [value] will
                    // be handled in a few lines.
                    charIsNonAscii = Sse2.AndNot(Sse2.CompareGreaterThan(vector0080.AsInt16(), utf16Data.AsInt16()).AsUInt16(), vector0080);
                }

#if DEBUG
                // Quick check to ensure we didn't accidentally set the 0x8000 bit of any element.
                uint debugMask;
                if (AdvSimd.Arm64.IsSupported)
                {
                    debugMask = GetNonAsciiBytes(charIsNonAscii.AsByte());
                }
                else
                {
                    debugMask = (uint)Sse2.MoveMask(charIsNonAscii.AsByte());
                }
                Debug.Assert((debugMask & 0b_1010_1010_1010_1010) == 0, "Shouldn't have set the 0x8000 bit of any element in 'charIsNonAscii'.");
#endif // DEBUG

                // Sets the 0x8080 bits of each element in 'charIsThreeByteUtf8Encoded' if the corresponding
                // input was 0x0800 <= [value]. This also handles the missing range a few lines above.

                Vector128 <ushort> charIsThreeByteUtf8Encoded;
                uint mask;

                // NOTE(review): this is the only check in this loop that tests AdvSimd.IsSupported rather than
                // AdvSimd.Arm64.IsSupported. The enclosing guard only admits Arm64 or SSE2 hardware, so the two
                // presumably select the same branch here - confirm before normalizing.
                if (AdvSimd.IsSupported)
                {
                    charIsThreeByteUtf8Encoded = AdvSimd.Subtract(vectorZero, AdvSimd.ShiftRightLogical(utf16Data, 11));
                    mask = GetNonAsciiBytes(AdvSimd.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
                }
                else
                {
                    charIsThreeByteUtf8Encoded = Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 11));
                    mask = (uint)Sse2.MoveMask(Sse2.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
                }

                // Each even bit of mask will be 1 only if the char was >= 0x0080,
                // and each odd bit of mask will be 1 only if the char was >= 0x0800.
                //
                // Example for UTF-16 input "[ 0123 ] [ 1234 ] ...":
                //
                //            ,-- set if char[1] is >= 0x0800
                //            |   ,-- set if char[0] is >= 0x0800
                //            v   v
                // mask = ... 1 1 0 1
                //              ^   ^-- set if char[0] is non-ASCII
                //              `-- set if char[1] is non-ASCII
                //
                // This means we can popcnt the number of set bits, and the result is the
                // number of *additional* UTF-8 bytes that each UTF-16 code unit requires as
                // it expands. This results in the wrong count for UTF-16 surrogate code
                // units (we just counted that each individual code unit expands to 3 bytes,
                // but in reality a well-formed UTF-16 surrogate pair expands to 4 bytes).
                // We'll handle this in just a moment.
                //
                // For now, compute the popcnt but squirrel it away. We'll fold it in to the
                // cumulative UTF-8 adjustment factor once we determine that there are no
                // unpaired surrogates in our data. (Unpaired surrogates would invalidate
                // our computed result and we'd have to throw it away.)

                uint popcnt = (uint)BitOperations.PopCount(mask);

                // Surrogates need to be special-cased for two reasons: (a) we need
                // to account for the fact that we over-counted in the addition above;
                // and (b) they require separate validation.
                if (AdvSimd.Arm64.IsSupported)
                {
                    utf16Data = AdvSimd.Add(utf16Data, vectorA800);
                    mask = GetNonAsciiBytes(AdvSimd.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
                }
                else
                {
                    utf16Data = Sse2.Add(utf16Data, vectorA800);
                    mask = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
                }

                if (mask != 0)
                {
                    // There's at least one UTF-16 surrogate code unit present.
                    // Since we performed a pmovmskb operation on the result of a 16-bit pcmpgtw,
                    // the resulting bits of 'mask' will occur in pairs:
                    // - 00 if the corresponding UTF-16 char was not a surrogate code unit;
                    // - 11 if the corresponding UTF-16 char was a surrogate code unit.
                    //
                    // A UTF-16 high/low surrogate code unit has the bit pattern [ 11011q## ######## ],
                    // where # is any bit; q = 0 represents a high surrogate, and q = 1 represents
                    // a low surrogate. Since we added 0xA800 in the vectorized operation above,
                    // our surrogate pairs will now have the bit pattern [ 10000q## ######## ].
                    // If we logical right-shift each word by 3, we'll end up with the bit pattern
                    // [ 00010000 q####### ], which means that we can immediately use pmovmskb to
                    // determine whether a given char was a high or a low surrogate.
                    //
                    // Therefore the resulting bits of 'mask2' will occur in pairs:
                    // - 00 if the corresponding UTF-16 char was a high surrogate code unit;
                    // - 01 if the corresponding UTF-16 char was a low surrogate code unit;
                    // - ## (garbage) if the corresponding UTF-16 char was not a surrogate code unit.
                    //   Since 'mask' already has 00 in these positions (since the corresponding char
                    //   wasn't a surrogate), "mask AND mask2 == 00" holds for these positions.

                    uint mask2;
                    if (AdvSimd.Arm64.IsSupported)
                    {
                        mask2 = GetNonAsciiBytes(AdvSimd.ShiftRightLogical(utf16Data, 3).AsByte());
                    }
                    else
                    {
                        mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte());
                    }

                    // 'lowSurrogatesMask' has its bits occur in pairs:
                    // - 01 if the corresponding char was a low surrogate char,
                    // - 00 if the corresponding char was a high surrogate char or not a surrogate at all.

                    uint lowSurrogatesMask = mask2 & mask;

                    // 'highSurrogatesMask' has its bits occur in pairs:
                    // - 01 if the corresponding char was a high surrogate char,
                    // - 00 if the corresponding char was a low surrogate char or not a surrogate at all.

                    uint highSurrogatesMask = (mask2 ^ 0b_0101_0101_0101_0101u /* flip all even-numbered bits 00 <-> 01 */) & mask;

                    Debug.Assert((highSurrogatesMask & lowSurrogatesMask) == 0, "A char cannot simultaneously be both a high and a low surrogate char.");
                    Debug.Assert(((highSurrogatesMask | lowSurrogatesMask) & 0b_1010_1010_1010_1010u) == 0, "Only even bits (no odd bits) of the masks should be set.");

                    // Now check that each high surrogate is followed by a low surrogate and that each
                    // low surrogate follows a high surrogate. We make an exception for the case where
                    // the final char of the vector is a high surrogate, since we can't perform validation
                    // on it until the next iteration of the loop when we hope to consume the matching
                    // low surrogate.

                    // Shifting by one char (2 mask bits) lines each high surrogate up with the char that follows it.
                    highSurrogatesMask <<= 2;
                    if ((ushort)highSurrogatesMask != lowSurrogatesMask)
                    {
                        goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                    }

                    if (highSurrogatesMask > ushort.MaxValue)
                    {
                        // There was a standalone high surrogate at the end of the vector.
                        // We'll adjust our counters so that we don't consider this char consumed.

                        highSurrogatesMask = (ushort)highSurrogatesMask; // don't allow stray high surrogate to be consumed by popcnt
                        popcnt -= 2; // the '0xC000_0000' bits in the original mask are shifted out and discarded, so account for that here
                        pInputBuffer--;
                        inputLength++;
                    }

                    // If we're 64-bit, we can perform the zero-extension of the surrogate pairs count for
                    // free right now, saving the extension step a few lines below. If we're 32-bit, the
                    // conversion to nuint immediately below is a no-op, and we'll pay the cost of the real
                    // 64-bit extension a few lines below.
                    nuint surrogatePairsCountNuint = (uint)BitOperations.PopCount(highSurrogatesMask);

                    // 2 UTF-16 chars become 1 Unicode scalar
                    tempScalarCountAdjustment -= (int)surrogatePairsCountNuint;

                    // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                    // it'd be encoded as 3 UTF-8 code units, so our earlier popcnt computation
                    // assumes that the pair is encoded as 6 UTF-8 code units. Since each
                    // pair is in reality only encoded as 4 UTF-8 code units, we need to
                    // perform this adjustment now.

                    if (IntPtr.Size == 8)
                    {
                        // Since we've already zero-extended surrogatePairsCountNuint, we can directly
                        // sub + sub. It's more efficient than shl + sub.
                        tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
                        tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
                    }
                    else
                    {
                        // Take the hit of the 64-bit extension now.
                        tempUtf8CodeUnitCountAdjustment -= 2 * (uint)surrogatePairsCountNuint;
                    }
                }

                // No unpaired surrogates were found in this vector, so its popcnt can now be folded in.
                tempUtf8CodeUnitCountAdjustment += popcnt;
                pInputBuffer += Vector128 <ushort> .Count;
                inputLength -= Vector128 <ushort> .Count;
            } while (inputLength >= Vector128 <ushort> .Count);
        }
    }
    else if (Vector.IsHardwareAccelerated)
    {
        if (inputLength >= Vector <ushort> .Count)
        {
            Vector <ushort> vector0080 = new Vector <ushort>(0x0080);
            Vector <ushort> vector0400 = new Vector <ushort>(0x0400);
            Vector <ushort> vector0800 = new Vector <ushort>(0x0800);
            Vector <ushort> vectorD800 = new Vector <ushort>(0xD800);

            do
            {
                // The 'twoOrMoreUtf8Bytes' and 'threeOrMoreUtf8Bytes' vectors will contain
                // elements whose values are 0xFFFF (-1 as signed word) iff the corresponding
                // UTF-16 code unit was >= 0x0080 and >= 0x0800, respectively. By summing these
                // vectors, each element of the sum will contain one of three values:
                //
                // 0x0000 ( 0) = original char was 0000..007F
                // 0xFFFF (-1) = original char was 0080..07FF
                // 0xFFFE (-2) = original char was 0800..FFFF
                //
                // We'll negate them to produce a value 0..2 for each element, then sum all the
                // elements together to produce the number of *additional* UTF-8 code units
                // required to represent this UTF-16 data. This is similar to the popcnt step
                // performed by the SSE2 code path. This will overcount surrogates, but we'll
                // handle that shortly.

                Vector <ushort> utf16Data = Unsafe.ReadUnaligned <Vector <ushort> >(pInputBuffer);
                Vector <ushort> twoOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0080);
                Vector <ushort> threeOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0800);
                Vector <nuint_t> sumVector = (Vector <nuint_t>)(Vector <ushort> .Zero - twoOrMoreUtf8Bytes - threeOrMoreUtf8Bytes);

                // We'll try summing by a natural word (rather than a 16-bit word) at a time,
                // which should halve the number of operations we must perform.

                nuint popcnt = 0;
                for (int i = 0; i < Vector <nuint_t> .Count; i++)
                {
                    popcnt += (nuint)sumVector[i];
                }

                // Fold the natural-word sum down: first the upper 32 bits into the lower 32 (64-bit only) ...
                uint popcnt32 = (uint)popcnt;
                if (IntPtr.Size == 8)
                {
                    popcnt32 += (uint)(popcnt >> 32);
                }

                // As in the Vector128 path above, compute popcnt but don't fold it in until we
                // know there aren't any unpaired surrogates in the input data.
                // ... then the upper 16 bits into the lower 16.
                popcnt32 = (ushort)popcnt32 + (popcnt32 >> 16);

                // Now check for surrogates.

                utf16Data -= vectorD800;
                Vector <ushort> surrogateChars = Vector.LessThan(utf16Data, vector0800);
                if (surrogateChars != Vector <ushort> .Zero)
                {
                    // There's at least one surrogate (high or low) UTF-16 code unit in
                    // the vector. We'll build up additional vectors: 'highSurrogateChars'
                    // and 'lowSurrogateChars', where the elements are 0xFFFF iff the original
                    // UTF-16 code unit was a high or low surrogate, respectively.

                    Vector <ushort> highSurrogateChars = Vector.LessThan(utf16Data, vector0400);
                    Vector <ushort> lowSurrogateChars = Vector.AndNot(surrogateChars, highSurrogateChars);

                    // We want to make sure that each high surrogate code unit is followed by
                    // a low surrogate code unit and each low surrogate code unit follows a
                    // high surrogate code unit. Since we don't have an equivalent of pmovmskb
                    // or palignr available to us, we'll do this as a loop. We won't look at
                    // the very last high surrogate char element since we don't yet know if
                    // the next vector read will have a low surrogate char element.

                    if (lowSurrogateChars[0] != 0)
                    {
                        // Nothing from this vector has been folded into the running adjustments yet,
                        // and pInputBuffer still points at the offending char, so we can return directly.
                        goto Error; // error: start of buffer contains standalone low surrogate char
                    }

                    ushort surrogatePairsCount = 0;
                    for (int i = 0; i < Vector <ushort> .Count - 1; i++)
                    {
                        surrogatePairsCount -= highSurrogateChars[i]; // turns into +1 or +0
                        if (highSurrogateChars[i] != lowSurrogateChars[i + 1])
                        {
                            goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                        }
                    }

                    if (highSurrogateChars[Vector <ushort> .Count - 1] != 0)
                    {
                        // There was a standalone high surrogate at the end of the vector.
                        // We'll adjust our counters so that we don't consider this char consumed.

                        pInputBuffer--;
                        inputLength++;
                        popcnt32 -= 2;
                    }

                    nint surrogatePairsCountNint = (nint)surrogatePairsCount; // zero-extend to native int size

                    // 2 UTF-16 chars become 1 Unicode scalar
                    tempScalarCountAdjustment -= (int)surrogatePairsCountNint;

                    // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                    // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only
                    // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total),
                    // so we'll adjust this now.

                    tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                    tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                }

                // No unpaired surrogates were found in this vector, so its count can now be folded in.
                tempUtf8CodeUnitCountAdjustment += popcnt32;
                pInputBuffer += Vector <ushort> .Count;
                inputLength -= Vector <ushort> .Count;
            } while (inputLength >= Vector <ushort> .Count);
        }
    }

NonVectorizedLoop:

    // Vectorization isn't supported on our current platform, or the input was too small to benefit
    // from vectorization, or we saw invalid UTF-16 data in the vectorized code paths and need to
    // drain remaining valid chars before we report failure.

    for (; inputLength > 0; pInputBuffer++, inputLength--)
    {
        uint thisChar = pInputBuffer[0];
        if (thisChar <= 0x7F)
        {
            continue;
        }

        // Bump adjustment by +1 for U+0080..U+07FF; by +2 for U+0800..U+FFFF.
        // This optimistically assumes no surrogates, which we'll handle shortly.

        tempUtf8CodeUnitCountAdjustment += (thisChar + 0x0001_F800u) >> 16;

        if (!UnicodeUtility.IsSurrogateCodePoint(thisChar))
        {
            continue;
        }

        // Found a surrogate char. Back out the adjustment we made above, then
        // try to consume the entire surrogate pair all at once. We won't bother
        // trying to interpret the surrogate pair as a scalar value; we'll only
        // validate that its bit pattern matches what's expected for a surrogate pair.

        tempUtf8CodeUnitCountAdjustment -= 2;

        if (inputLength == 1)
        {
            goto Error; // input buffer too small to read a surrogate pair
        }

        // Read this char and the next one as a single 32-bit value; the expected high/low
        // halves are swapped depending on endianness.
        thisChar = Unsafe.ReadUnaligned <uint>(pInputBuffer);
        if (((thisChar - (BitConverter.IsLittleEndian ? 0xDC00_D800u : 0xD800_DC00u)) & 0xFC00_FC00u) != 0)
        {
            goto Error; // not a well-formed surrogate pair
        }

        tempScalarCountAdjustment--; // 2 UTF-16 code units -> 1 scalar
        tempUtf8CodeUnitCountAdjustment += 2; // 2 UTF-16 code units -> 4 UTF-8 code units

        pInputBuffer++; // consumed one extra char
        inputLength--;
    }

Error:

    // Also used for normal return.

    utf8CodeUnitCountAdjustment = tempUtf8CodeUnitCountAdjustment;
    scalarCountAdjustment = tempScalarCountAdjustment;
    return(pInputBuffer);
}
/// <summary>
/// Asserts that <c>VectorUtilities.DotProduct</c> applied to (0, 1, 2, 3) and (4, 5, 6, 7)
/// yields a vector whose four elements are all 38.
/// </summary>
public static void DotProductTest()
{
    Vector128<float> left = Vector128.Create(0.0f, 1.0f, 2.0f, 3.0f);
    Vector128<float> right = Vector128.Create(4.0f, 5.0f, 6.0f, 7.0f);

    // 0*4 + 1*5 + 2*6 + 3*7 = 38, expected in every element.
    Vector128<float> expected = Vector128.Create(38.0f, 38.0f, 38.0f, 38.0f);

    Assert.That(() => VectorUtilities.DotProduct(left, right), Is.EqualTo(expected));
}