// Entry point of the AVX2 AddSaturate test: runs the intrinsic once for each of
// byte, sbyte, short and ushort, then compares every output lane against a scalar
// saturating addition. Returns Pass when AVX2 is unavailable or every lane
// matches, otherwise Fail.
static unsafe int Main(string[] args)
{
    // Nothing to exercise without AVX2 hardware support.
    if (!Avx2.IsSupported)
    {
        return Pass;
    }

    int outcome = Pass;

    using (TestTable<byte, byte, byte> byteTable = new TestTable<byte, byte, byte>(
        new byte[32] { 1, 5, 100, 0, 1, 5, 100, 0, 1, 5, 100, 0, 1, 5, 100, 0, 1, 5, 100, 0, 1, 5, 100, 0, 1, 5, 100, 0, 1, 5, 100, 0 },
        new byte[32] { 22, 1, 50, 0, 22, 1, 50, 0, 22, 1, 50, 0, 22, 1, 50, 0, 22, 1, 50, 0, 22, 1, 50, 0, 22, 1, 50, 0, 22, 1, 50, 0 },
        new byte[32]))
    using (TestTable<sbyte, sbyte, sbyte> sbyteTable = new TestTable<sbyte, sbyte, sbyte>(
        new sbyte[32] { 1, -5, 100, 0, 1, -5, 100, 0, 1, -5, 100, 0, 1, -5, 100, 0, 1, -5, 100, 0, 1, -5, 100, 0, 1, -5, 100, 0, 1, -5, 100, 0 },
        new sbyte[32] { 22, -1, -50, 0, 22, -1, -50, 0, 22, -1, -50, 0, 22, -1, -50, 0, 22, -1, -50, 0, 22, -1, -50, 0, 22, -1, -50, 0, 22, -1, -50, 0 },
        new sbyte[32]))
    using (TestTable<short, short, short> shortTable = new TestTable<short, short, short>(
        new short[16] { 1, -5, 100, 0, 1, -5, 100, 0, 1, -5, 100, 0, 1, -5, 100, 0 },
        new short[16] { 22, -1, -50, 0, 22, -1, -50, 0, 22, -1, -50, 0, 22, -1, -50, 0 },
        new short[16]))
    using (TestTable<ushort, ushort, ushort> ushortTable = new TestTable<ushort, ushort, ushort>(
        new ushort[16] { 1, 5, 100, 0, 1, 5, 100, 0, 1, 5, 100, 0, 1, 5, 100, 0 },
        new ushort[16] { 22, 1, 50, 0, 22, 1, 50, 0, 22, 1, 50, 0, 22, 1, 50, 0 },
        new ushort[16]))
    {
        // Phase 1: run the intrinsic for every element type and store the results.
        Vector256<byte> byteLeft = Unsafe.Read<Vector256<byte>>(byteTable.inArray1Ptr);
        Vector256<byte> byteRight = Unsafe.Read<Vector256<byte>>(byteTable.inArray2Ptr);
        Vector256<byte> byteSum = Avx2.AddSaturate(byteLeft, byteRight);
        Unsafe.Write(byteTable.outArrayPtr, byteSum);

        Vector256<sbyte> sbyteLeft = Unsafe.Read<Vector256<sbyte>>(sbyteTable.inArray1Ptr);
        Vector256<sbyte> sbyteRight = Unsafe.Read<Vector256<sbyte>>(sbyteTable.inArray2Ptr);
        Vector256<sbyte> sbyteSum = Avx2.AddSaturate(sbyteLeft, sbyteRight);
        Unsafe.Write(sbyteTable.outArrayPtr, sbyteSum);

        Vector256<short> shortLeft = Unsafe.Read<Vector256<short>>(shortTable.inArray1Ptr);
        Vector256<short> shortRight = Unsafe.Read<Vector256<short>>(shortTable.inArray2Ptr);
        Vector256<short> shortSum = Avx2.AddSaturate(shortLeft, shortRight);
        Unsafe.Write(shortTable.outArrayPtr, shortSum);

        Vector256<ushort> ushortLeft = Unsafe.Read<Vector256<ushort>>(ushortTable.inArray1Ptr);
        Vector256<ushort> ushortRight = Unsafe.Read<Vector256<ushort>>(ushortTable.inArray2Ptr);
        Vector256<ushort> ushortSum = Avx2.AddSaturate(ushortLeft, ushortRight);
        Unsafe.Write(ushortTable.outArrayPtr, ushortSum);

        // Phase 2: verify each lane against the sum clamped to the element type's
        // range. Each type stops at its first mismatch; the remaining types are
        // still checked.
        for (int lane = 0; lane < byteTable.outArray.Length; lane++)
        {
            int expected = Math.Min(Math.Max(byteTable.inArray1[lane] + byteTable.inArray2[lane], 0), byte.MaxValue);
            if (byteTable.outArray[lane] == (byte)expected)
            {
                continue;
            }

            Console.WriteLine("AVX2 AddSaturate failed on byte:");
            Console.WriteLine();
            outcome = Fail;
            break;
        }

        for (int lane = 0; lane < sbyteTable.outArray.Length; lane++)
        {
            int expected = Math.Min(Math.Max(sbyteTable.inArray1[lane] + sbyteTable.inArray2[lane], sbyte.MinValue), sbyte.MaxValue);
            if (sbyteTable.outArray[lane] == (sbyte)expected)
            {
                continue;
            }

            Console.WriteLine("AVX2 AddSaturate failed on sbyte:");
            Console.WriteLine();
            outcome = Fail;
            break;
        }

        for (int lane = 0; lane < shortTable.outArray.Length; lane++)
        {
            int expected = Math.Min(Math.Max(shortTable.inArray1[lane] + shortTable.inArray2[lane], short.MinValue), short.MaxValue);
            if (shortTable.outArray[lane] == (short)expected)
            {
                continue;
            }

            Console.WriteLine("AVX2 AddSaturate failed on short:");
            Console.WriteLine();
            outcome = Fail;
            break;
        }

        for (int lane = 0; lane < ushortTable.outArray.Length; lane++)
        {
            int expected = Math.Min(Math.Max(ushortTable.inArray1[lane] + ushortTable.inArray2[lane], 0), ushort.MaxValue);
            if (ushortTable.outArray[lane] == (ushort)expected)
            {
                continue;
            }

            Console.WriteLine("AVX2 AddSaturate failed on ushort:");
            Console.WriteLine();
            outcome = Fail;
            break;
        }
    }

    return outcome;
}
// Take input from buf and remove useless whitespace (whitespace outside of quoted
// strings). Input and output can be the same buffer. The result is null terminated.
//
// buf: first byte of the input text.
// len: number of input bytes to process.
// out: destination buffer; a terminating '\0' is written after the last kept byte.
//      NOTE(review): the 64-byte loop issues 16-byte stores as far as out + pop3, so
//      the destination presumably needs slack beyond the minified length - confirm
//      against the callers' padding guarantees.
//
// Returns the minified length in bytes (minus the null termination).
// Throws NotSupportedException when AVX2 is not available.
public static size_t Minify(uint8_t* buf, size_t len, uint8_t* @out)
{
    if (!Avx2.IsSupported)
    {
        throw new NotSupportedException("AVX2 is required for SimdJson");
    }

    //C#: load const vectors once (there is no `const _m256` in C#)
    Vector256<byte> lut_cntrl = s_lut_cntrl;
    Vector256<byte> low_nibble_mask = s_low_nibble_mask;
    Vector256<byte> high_nibble_mask = s_high_nibble_mask;

    fixed (byte* mask128_epi8 = s_mask128_epi8)
    {
        // Useful constant masks
        const uint64_t even_bits = 0x5555555555555555UL;
        const uint64_t odd_bits = ~even_bits;

        uint8_t* initout = @out;
        uint64_t prev_iter_ends_odd_backslash = 0UL; // either 0 or 1, but a 64-bit value
        uint64_t prev_iter_inside_quote = 0UL;       // either all zeros or all ones
        size_t idx = 0;

        if (len >= 64)
        {
            size_t avxlen = len - 63;
            for (; idx < avxlen; idx += 64)
            {
                Vector256<byte> input_lo = Avx.LoadVector256((buf + idx + 0));
                Vector256<byte> input_hi = Avx.LoadVector256((buf + idx + 32));

                // Find quotes that are escaped by an odd-length run of backslashes.
                uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi, Vector256.Create((byte)'\\'));
                uint64_t start_edges = bs_bits & ~(bs_bits << 1);
                uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
                uint64_t even_starts = start_edges & even_start_mask;
                uint64_t odd_starts = start_edges & ~even_start_mask;
                uint64_t even_carries = bs_bits + even_starts;
                uint64_t odd_carries;
                bool iter_ends_odd_backslash = add_overflow(bs_bits, odd_starts, &odd_carries);
                odd_carries |= prev_iter_ends_odd_backslash;
                prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1UL : 0x0UL;
                uint64_t even_carry_ends = even_carries & ~bs_bits;
                uint64_t odd_carry_ends = odd_carries & ~bs_bits;
                uint64_t even_start_odd_end = even_carry_ends & odd_bits;
                uint64_t odd_start_even_end = odd_carry_ends & even_bits;
                uint64_t odd_ends = even_start_odd_end | odd_start_even_end;

                // Keep only unescaped quotes, then turn them into an "inside a string" mask
                // with a carry-less multiply by all ones.
                uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi, Vector256.Create((byte)'"'));
                quote_bits = quote_bits & ~odd_ends;
                uint64_t quote_mask = Sse2.X64.ConvertToUInt64(Pclmulqdq.CarrylessMultiply(
                    Vector128.Create(quote_bits, 0UL), Vector128.Create((byte)0xFF).AsUInt64(), 0));
                quote_mask ^= prev_iter_inside_quote;
                // Arithmetic shift on the signed value replicates the top bit: all ones when
                // this chunk ends inside a quote, all zeros otherwise.
                prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63);

                // Classify bytes through the two nibble lookup tables; the compare yields
                // the bytes whose 0x18 bits are clear, which is inverted below to get whitespace.
                Vector256<byte> whitespace_shufti_mask = Vector256.Create((byte)0x18);
                Vector256<byte> v_lo = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_lo),
                    Avx2.Shuffle(high_nibble_mask,
                        Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(), Vector256.Create((byte)0x7f))));
                Vector256<byte> v_hi = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_hi),
                    Avx2.Shuffle(high_nibble_mask,
                        Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(), Vector256.Create((byte)0x7f))));
                Vector256<byte> tmp_ws_lo = Avx2.CompareEqual(
                    Avx2.And(v_lo, whitespace_shufti_mask), Vector256.Create((byte)0));
                Vector256<byte> tmp_ws_hi = Avx2.CompareEqual(
                    Avx2.And(v_hi, whitespace_shufti_mask), Vector256.Create((byte)0));

                uint64_t ws_res_0 = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
                uint64_t ws_res_1 = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
                uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));
                whitespace &= ~quote_mask; // whitespace inside strings must be kept

                // One 16-bit removal mask per 16-byte quarter, plus running counts of kept bytes.
                int mask1 = (int)(whitespace & 0xFFFF);
                int mask2 = (int)((whitespace >> 16) & 0xFFFF);
                int mask3 = (int)((whitespace >> 32) & 0xFFFF);
                int mask4 = (int)((whitespace >> 48) & 0xFFFF);
                int pop1 = hamming((~whitespace) & 0xFFFF);
                int pop2 = hamming((~whitespace) & (ulong)(0xFFFFFFFF));
                int pop3 = hamming((~whitespace) & (ulong)(0xFFFFFFFFFFFF));
                int pop4 = hamming((~whitespace));

                // Each table entry is 16 bytes (two ulongs), hence the "* 2" on the ulong pointer.
                var vmask1 = _mm256_loadu2_m128i((ulong*)mask128_epi8 + (mask2 & 0x7FFF) * 2,
                    (ulong*)mask128_epi8 + (mask1 & 0x7FFF) * 2);
                var vmask2 = _mm256_loadu2_m128i((ulong*)mask128_epi8 + (mask4 & 0x7FFF) * 2,
                    (ulong*)mask128_epi8 + (mask3 & 0x7FFF) * 2);
                var result1 = Avx2.Shuffle(input_lo, vmask1.AsByte());
                var result2 = Avx2.Shuffle(input_hi, vmask2.AsByte());
                _mm256_storeu2_m128i((@out + pop1), @out, result1);
                _mm256_storeu2_m128i((@out + pop3), (@out + pop2), result2);
                @out += pop4;
            }
        }

        // we finish off the job... copying and pasting the code is not ideal here,
        // but it gets the job done.
        if (idx < len)
        {
            // Work on a zero-padded 64-byte copy so the loads and stores stay in bounds.
            uint8_t* buffer = stackalloc uint8_t[64];
            memset(buffer, 0, 64);
            memcpy(buffer, buf + idx, len - idx);
            var input_lo = Avx.LoadVector256((buffer));
            var input_hi = Avx.LoadVector256((buffer + 32));

            uint64_t bs_bits = cmp_mask_against_input_mini(input_lo, input_hi, Vector256.Create((byte)'\\'));
            uint64_t start_edges = bs_bits & ~(bs_bits << 1);
            uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
            uint64_t even_starts = start_edges & even_start_mask;
            uint64_t odd_starts = start_edges & ~even_start_mask;
            uint64_t even_carries = bs_bits + even_starts;
            uint64_t odd_carries;
            // Only the sum is needed here: the overflow flag would feed the next
            // iteration, and there is none after the tail.
            add_overflow(bs_bits, odd_starts, &odd_carries);
            odd_carries |= prev_iter_ends_odd_backslash;
            uint64_t even_carry_ends = even_carries & ~bs_bits;
            uint64_t odd_carry_ends = odd_carries & ~bs_bits;
            uint64_t even_start_odd_end = even_carry_ends & odd_bits;
            uint64_t odd_start_even_end = odd_carry_ends & even_bits;
            uint64_t odd_ends = even_start_odd_end | odd_start_even_end;

            uint64_t quote_bits = cmp_mask_against_input_mini(input_lo, input_hi, Vector256.Create((byte)'"'));
            quote_bits = quote_bits & ~odd_ends;
            uint64_t quote_mask = Sse2.X64.ConvertToUInt64(Pclmulqdq.CarrylessMultiply(
                Vector128.Create(quote_bits, 0UL), Vector128.Create((byte)0xFF).AsUInt64(), 0));
            quote_mask ^= prev_iter_inside_quote;
            // prev_iter_inside_quote is not updated: we don't need it anymore

            Vector256<byte> mask_20 = Vector256.Create((byte)0x20); // c==32
            Vector256<byte> mask_70 = Vector256.Create((byte)0x70); // adding 0x70 does not check low 4-bits
                                                                    // but moves any value >= 16 above 128
            Vector256<byte> tmp_ws_lo = Avx2.Or(
                Avx2.CompareEqual(mask_20, input_lo),
                Avx2.Shuffle(lut_cntrl, Avx2.AddSaturate(mask_70, input_lo)));
            Vector256<byte> tmp_ws_hi = Avx2.Or(
                Avx2.CompareEqual(mask_20, input_hi),
                Avx2.Shuffle(lut_cntrl, Avx2.AddSaturate(mask_70, input_hi)));

            uint64_t ws_res_0 = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
            uint64_t ws_res_1 = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
            uint64_t whitespace = (ws_res_0 | (ws_res_1 << 32));
            whitespace &= ~quote_mask;

            if (len - idx < 64)
            {
                // Treat the padding past the end of the input as whitespace so it is dropped.
                whitespace |= ((0xFFFFFFFFFFFFFFFF) << (int)(len - idx));
            }

            int mask1 = (int)(whitespace & 0xFFFF);
            int mask2 = (int)((whitespace >> 16) & 0xFFFF);
            int mask3 = (int)((whitespace >> 32) & 0xFFFF);
            int mask4 = (int)((whitespace >> 48) & 0xFFFF);
            int pop1 = hamming((~whitespace) & 0xFFFF);
            int pop2 = hamming((~whitespace) & 0xFFFFFFFF);
            int pop3 = hamming((~whitespace) & 0xFFFFFFFFFFFF);
            int pop4 = hamming((~whitespace));

            var vmask1 = _mm256_loadu2_m128i((ulong*)mask128_epi8 + (mask2 & 0x7FFF) * 2,
                (ulong*)mask128_epi8 + (mask1 & 0x7FFF) * 2);
            var vmask2 = _mm256_loadu2_m128i((ulong*)mask128_epi8 + (mask4 & 0x7FFF) * 2,
                (ulong*)mask128_epi8 + (mask3 & 0x7FFF) * 2);
            var result1 = Avx2.Shuffle(input_lo, vmask1.AsByte());
            var result2 = Avx2.Shuffle(input_hi, vmask2.AsByte());

            // Compact inside the scratch buffer, then copy only the kept bytes out.
            _mm256_storeu2_m128i((buffer + pop1), buffer, result1);
            _mm256_storeu2_m128i((buffer + pop3), (buffer + pop2), result2);
            memcpy(@out, buffer, (size_t)pop4);
            @out += pop4;
        }

        *@out = (byte)'\0'; // NULL termination
        return ((size_t)@out - (size_t)initout);
    }
}
// Computes an edit distance between a slice of sourceString and a slice of
// targetString, both starting at startIndex, keeping a single row of costs in a
// pooled int array.
//
// Rows are processed VectorLength at a time with AVX2 while enough source rows
// remain and the target is long enough for the vector block; all remaining rows go
// through CalculateRow one at a time.
//
// Returns the last entry of the final cost row (previousRow[targetLength - 1]).
// NOTE(review): targetLength == 0 makes that index -1 and throws - presumably
// callers never pass an empty target; confirm.
private static unsafe int CalculateDistance(string sourceString, int sourceLength, string targetString, int targetLength, int startIndex)
{
    var arrayPool = ArrayPool<int>.Shared;
    var pooledArray = arrayPool.Rent(targetLength);

    // try/finally so the rented array goes back to the pool even when slicing or the
    // row computation throws.
    try
    {
        ReadOnlySpan<char> source = sourceString.AsSpan().Slice(startIndex, sourceLength);
        ReadOnlySpan<char> target = targetString.AsSpan().Slice(startIndex, targetLength);

        //ArrayPool values are sometimes bigger than allocated, let's trim our span to exactly what we use
        Span<int> previousRow = pooledArray.AsSpan(0, targetLength);

        fixed (char* targetPtr = target)
        fixed (char* srcPtr = source)
        fixed (int* previousRowPtr = previousRow)
        {
            FillRow(previousRowPtr, targetLength);

            var rowIndex = 0;
            const int VectorLength = 16;

            // The vector block below unconditionally reads the first VectorLength - 1
            // entries of the target and of previousRow, and its final loop starts
            // writing at targetLength - (VectorLength - 1). With a shorter target those
            // pointer accesses would be out of bounds, so such inputs (and machines
            // without AVX2) are handled entirely by the scalar rows further down.
            bool useVectorPath = Avx2.IsSupported && targetLength >= VectorLength - 1;

            for (; useVectorPath && rowIndex < sourceLength - VectorLength - 1; rowIndex += VectorLength)
            {
                // todo max
                // NOTE(review): ToVector256 only carries the packed value in the lower
                // 128 bits - confirm the upper lanes of diag are meant to start at zero.
                var temp = Vector128.Create(rowIndex);
                var diag = Sse42.PackUnsignedSaturate(temp, temp).ToVector256();
                var one = Vector256.Create((ushort)1);
                var left = Avx2.AddSaturate(diag, one);
                var sourceV = Avx2.LoadVector256((ushort*)(srcPtr + rowIndex));
                var targetV = Vector256<ushort>.Zero;
                var shift = Vector256.CreateScalar(ushort.MaxValue);

                // First VectorLength - 1 iterations fill the vector
                for (int columnIndex = 0; columnIndex < VectorLength - 1; columnIndex++)
                {
                    // Shift in the next character
                    targetV = ShiftLeft(targetV);
                    targetV = Avx2.Or(targetV, Vector256.CreateScalar((ushort)targetPtr[columnIndex]));

                    // Insert "(rowIndex + columnIndex + 1)" from the left
                    var leftValue = Vector256.Create(rowIndex + columnIndex + 1);
                    left = Avx2.Or(Avx2.And(shift, Avx2.PackUnsignedSaturate(leftValue, leftValue)), left);
                    shift = ShiftLeft(shift);

                    // compare source to target: cost 0 on match, 1 otherwise
                    var match = Avx2.CompareEqual(sourceV, targetV);
                    var add = Avx2.AndNot(match, one);
                    var next = Avx2.AddSaturate(diag, add);

                    // Create next diag which is current up
                    var up = ShiftLeft(left);
                    up = Avx2.Or(up, Vector256.CreateScalar((ushort)previousRowPtr[columnIndex]));

                    var tmp = Avx2.AddSaturate(Avx2.Min(left, up), one);
                    next = Avx2.Min(next, tmp);
                    left = next;
                    diag = up;
                }

                var writePtr = previousRowPtr;
                *writePtr = left.GetElement(VectorLength - 1);
                writePtr++;

                // NOTE(review): the fill loop stops at column VectorLength - 2 and this
                // loop starts at VectorLength, so column VectorLength - 1 of the target
                // is never shifted in - confirm this is intended.
                for (int columnIndex = VectorLength; columnIndex < targetLength; columnIndex++)
                {
                    // Shift in the next character
                    targetV = ShiftLeft(targetV);
                    targetV = Avx2.Or(targetV, Vector256.CreateScalar((ushort)targetPtr[columnIndex]));

                    // compare source to target: cost 0 on match, 1 otherwise
                    var match = Avx2.CompareEqual(sourceV, targetV);
                    var add = Avx2.AndNot(match, one);
                    var next = Avx2.AddSaturate(diag, add);

                    // Create next diag which is current up
                    var up = ShiftLeft(left);
                    up = Avx2.Or(up, Vector256.CreateScalar((ushort)previousRowPtr[columnIndex]));

                    var tmp = Avx2.AddSaturate(Avx2.Min(left, up), one);
                    next = Avx2.Min(next, tmp);
                    left = next;
                    diag = up;

                    // Store one value
                    *writePtr = next.GetElement(VectorLength - 1);
                    writePtr++;
                }

                // Finish with the last VectorLength - 1 items, don't read any more chars, just extract them
                for (int i = targetLength - (VectorLength - 1); i < previousRow.Length; i++)
                {
                    // Shift the target along without loading a new character
                    targetV = ShiftLeft(targetV);

                    // compare source to target: cost 0 on match, 1 otherwise
                    var match = Avx2.CompareEqual(sourceV, targetV);
                    var add = Avx2.AndNot(match, one);
                    var next = Avx2.AddSaturate(diag, add);

                    // Create next diag which is current up
                    var up = ShiftLeft(left);

                    var tmp = Avx2.AddSaturate(Avx2.Min(left, up), one);
                    next = Avx2.Min(next, tmp);
                    left = next;
                    diag = up;

                    // Store one value
                    previousRowPtr[i] = left.GetElement(VectorLength - 1);
                }

#if DEBUG
                Console.Write("prev values for row {0}:", rowIndex);
                for (int i = 0; i < targetLength; ++i)
                {
                    Console.Write("{0} ", previousRow[i]);
                }
                Console.WriteLine();
#endif
            }

            //Calculate Single Rows
            for (; rowIndex < sourceLength; rowIndex++)
            {
                var lastSubstitutionCost = rowIndex;
                var lastInsertionCost = rowIndex + 1;
                var sourcePrevChar = source[rowIndex];

#if DEBUG
                Console.Write("prev values for row {0}:", rowIndex);
                for (int i = 0; i < targetLength; ++i)
                {
                    Console.Write("{0} ", previousRow[i]);
                }
                Console.WriteLine();
#endif

                CalculateRow(previousRowPtr, targetPtr, targetLength, sourcePrevChar, lastInsertionCost, lastSubstitutionCost);
            }
        }

        return previousRow[targetLength - 1];
    }
    finally
    {
        arrayPool.Return(pooledArray);
    }
}