/// <summary>
/// Local-variable scenario: loads both SByte operand vectors from the data table's input buffers
/// with Sse2.LoadVector128 into locals, computes Sse41.Min on them, writes the result to the
/// data table's output buffer and validates it against the two locals.
/// </summary>
/// <remarks>
/// NOTE(review): unlike most sibling scenarios, this one does not call
/// TestLibrary.TestFramework.BeginScenario — confirm that is intentional.
/// </remarks>
public void RunLclVarScenario_Load() { var left = Sse2.LoadVector128((SByte *)(_dataTable.inArray1Ptr)); var right = Sse2.LoadVector128((SByte *)(_dataTable.inArray2Ptr)); var result = Sse41.Min(left, right); Unsafe.Write(_dataTable.outArrayPtr, result); ValidateResult(left, right, _dataTable.outArrayPtr); }
/// <summary>
/// Local-variable scenario: reads both Vector128 of SByte operands from the data table's input
/// buffers with Unsafe.Read into locals, computes Sse41.Min on them, writes the result to the
/// output buffer and validates it against the two locals.
/// </summary>
/// <remarks>
/// NOTE(review): unlike most sibling scenarios, this one does not call
/// TestLibrary.TestFramework.BeginScenario — confirm that is intentional.
/// </remarks>
public void RunLclVarScenario_UnsafeRead() { var left = Unsafe.Read <Vector128 <SByte> >(_dataTable.inArray1Ptr); var right = Unsafe.Read <Vector128 <SByte> >(_dataTable.inArray2Ptr); var result = Sse41.Min(left, right); Unsafe.Write(_dataTable.outArrayPtr, result); ValidateResult(left, right, _dataTable.outArrayPtr); }
/// <summary>
/// Class-field scenario: announces itself via BeginScenario, computes Sse41.Min directly on the
/// instance fields _fld1 and _fld2, writes the result to the data table's output buffer and
/// validates it against the same fields.
/// </summary>
public void RunClassFldScenario() { TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario)); var result = Sse41.Min(_fld1, _fld2); Unsafe.Write(_dataTable.outArrayPtr, result); ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr); }
/// <summary>
/// Returns the minimum of <paramref name="left"/> and <paramref name="right"/>.
/// </summary>
/// <remarks>
/// Dispatches to Sse41.Min when SSE4.1 is supported and to Min_Software otherwise.
/// </remarks>
public static Vector4UInt32 Min(Vector4UInt32Param1_3 left, Vector4UInt32Param1_3 right)
{
    // Take the software fallback first so the accelerated path is the fall-through.
    if (!Sse41.IsSupported)
    {
        return Min_Software(left, right);
    }

    return Sse41.Min(left, right);
}
/// <summary>
/// Class-local-field scenario: announces itself via BeginScenario, creates a fresh
/// SimpleBinaryOpTest__MinInt32 instance, computes Sse41.Min on that instance's _fld1 and _fld2,
/// writes the result to this instance's output buffer and validates it against those fields.
/// </summary>
public void RunClassLclFldScenario() { TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario)); var test = new SimpleBinaryOpTest__MinInt32(); var result = Sse41.Min(test._fld1, test._fld2); Unsafe.Write(_dataTable.outArrayPtr, result); ValidateResult(test._fld1, test._fld2, _dataTable.outArrayPtr); }
/// <summary>
/// Struct-local-field scenario: announces itself via BeginScenario, obtains a TestStruct from
/// TestStruct.Create(), computes Sse41.Min on its _fld1 and _fld2, writes the result to the
/// data table's output buffer and validates it against those fields.
/// </summary>
public void RunStructLclFldScenario() { TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario)); var test = TestStruct.Create(); var result = Sse41.Min(test._fld1, test._fld2); Unsafe.Write(_dataTable.outArrayPtr, result); ValidateResult(test._fld1, test._fld2, _dataTable.outArrayPtr); }
/// <summary>
/// Basic scenario: passes two Vector128 of UInt32 values, read inline with Unsafe.Read from the
/// data table's input buffers, straight to Sse41.Min; writes the result to the output buffer
/// and validates it against the raw input buffers.
/// </summary>
/// <remarks>
/// NOTE(review): unlike most sibling scenarios, this one does not call
/// TestLibrary.TestFramework.BeginScenario — confirm that is intentional.
/// </remarks>
public void RunBasicScenario_UnsafeRead() { var result = Sse41.Min( Unsafe.Read <Vector128 <UInt32> >(_dataTable.inArray1Ptr), Unsafe.Read <Vector128 <UInt32> >(_dataTable.inArray2Ptr) ); Unsafe.Write(_dataTable.outArrayPtr, result); ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr); }
/// <summary>
/// Restricts <paramref name="vector"/> to the range given by <paramref name="low"/> and
/// <paramref name="high"/>.
/// </summary>
/// <remarks>
/// With SSE4.1 available the result is Max(Min(vector, high), low); otherwise the work is
/// delegated to Clamp_Software.
/// </remarks>
public static Vector4Int32 Clamp(Vector4Int32Param1_3 vector, Vector4Int32Param1_3 low, Vector4Int32Param1_3 high)
{
    if (!Sse41.IsSupported)
    {
        return Clamp_Software(vector, low, high);
    }

    // Apply the upper bound first, then the lower bound.
    Vector4Int32 cappedAbove = Sse41.Min(vector, high);
    return Sse41.Max(cappedAbove, low);
}
/// <summary>
/// Basic scenario: passes two UInt32 vectors, loaded inline with Sse2.LoadAlignedVector128 from
/// the data table's input buffers, straight to Sse41.Min; writes the result to the output
/// buffer and validates it against the raw input buffers.
/// </summary>
/// <remarks>
/// NOTE(review): unlike most sibling scenarios, this one does not call
/// TestLibrary.TestFramework.BeginScenario — confirm that is intentional.
/// </remarks>
public void RunBasicScenario_LoadAligned() { var result = Sse41.Min( Sse2.LoadAlignedVector128((UInt32 *)(_dataTable.inArray1Ptr)), Sse2.LoadAlignedVector128((UInt32 *)(_dataTable.inArray2Ptr)) ); Unsafe.Write(_dataTable.outArrayPtr, result); ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr); }
/// <summary>
/// Class-variable scenario: computes Sse41.Min directly on _clsVar1 and _clsVar2, writes the
/// result to the data table's output buffer and validates it against the same two variables.
/// </summary>
/// <remarks>
/// NOTE(review): unlike most sibling scenarios, this one does not call
/// TestLibrary.TestFramework.BeginScenario — confirm that is intentional.
/// </remarks>
public void RunClsVarScenario() { var result = Sse41.Min( _clsVar1, _clsVar2 ); Unsafe.Write(_dataTable.outArrayPtr, result); ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr); }
/// <summary>
/// Local-variable scenario: announces itself via BeginScenario, loads both UInt16 operand
/// vectors with Sse2.LoadAlignedVector128 into locals, computes Sse41.Min on them, writes the
/// result to the output buffer and validates it against the two locals.
/// </summary>
public void RunLclVarScenario_LoadAligned() { TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_LoadAligned)); var left = Sse2.LoadAlignedVector128((UInt16 *)(_dataTable.inArray1Ptr)); var right = Sse2.LoadAlignedVector128((UInt16 *)(_dataTable.inArray2Ptr)); var result = Sse41.Min(left, right); Unsafe.Write(_dataTable.outArrayPtr, result); ValidateResult(left, right, _dataTable.outArrayPtr); }
/// <summary>
/// Local-variable scenario: announces itself via BeginScenario, reads both Vector128 of UInt16
/// operands with Unsafe.Read into locals, computes Sse41.Min on them, writes the result to the
/// output buffer and validates it against the two locals.
/// </summary>
public void RunLclVarScenario_UnsafeRead() { TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead)); var left = Unsafe.Read <Vector128 <UInt16> >(_dataTable.inArray1Ptr); var right = Unsafe.Read <Vector128 <UInt16> >(_dataTable.inArray2Ptr); var result = Sse41.Min(left, right); Unsafe.Write(_dataTable.outArrayPtr, result); ValidateResult(left, right, _dataTable.outArrayPtr); }
/// <summary>
/// Returns the minimum of <paramref name="a"/> and <paramref name="b"/>.
/// </summary>
/// <remarks>
/// Uses Sse41.Min when SSE4.1 is supported; otherwise selects between the operands with
/// Select_i32 driven by LessThan(a, b).
/// </remarks>
public static i32 Min_i32(i32 a, i32 b)
{
    if (!Sse41.IsSupported)
    {
        // Fallback: pick a where LessThan(a, b) holds, b elsewhere.
        return Select_i32(LessThan(a, b), a, b);
    }

    return Sse41.Min(a, b);
}
/// <summary>
/// Local-variable scenario: announces itself via BeginScenario, loads both Int32 operand vectors
/// with Sse2.LoadAlignedVector128 into the locals op1 and op2, computes Sse41.Min on them,
/// writes the result to the output buffer and validates it against the two locals.
/// </summary>
public void RunLclVarScenario_LoadAligned() { TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_LoadAligned)); var op1 = Sse2.LoadAlignedVector128((Int32 *)(_dataTable.inArray1Ptr)); var op2 = Sse2.LoadAlignedVector128((Int32 *)(_dataTable.inArray2Ptr)); var result = Sse41.Min(op1, op2); Unsafe.Write(_dataTable.outArrayPtr, result); ValidateResult(op1, op2, _dataTable.outArrayPtr); }
/// <summary>
/// Local-variable scenario: announces itself via BeginScenario, reads both Vector128 of Int32
/// operands with Unsafe.Read into the locals op1 and op2, computes Sse41.Min on them, writes
/// the result to the output buffer and validates it against the two locals.
/// </summary>
public void RunLclVarScenario_UnsafeRead() { TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead)); var op1 = Unsafe.Read <Vector128 <Int32> >(_dataTable.inArray1Ptr); var op2 = Unsafe.Read <Vector128 <Int32> >(_dataTable.inArray2Ptr); var result = Sse41.Min(op1, op2); Unsafe.Write(_dataTable.outArrayPtr, result); ValidateResult(op1, op2, _dataTable.outArrayPtr); }
/// <summary>
/// Basic scenario: announces itself via BeginScenario, passes two Int32 vectors, loaded inline
/// with Sse2.LoadVector128 from the data table's input buffers, straight to Sse41.Min; writes
/// the result to the output buffer and validates it against the raw input buffers.
/// </summary>
public void RunBasicScenario_Load() { TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_Load)); var result = Sse41.Min( Sse2.LoadVector128((Int32 *)(_dataTable.inArray1Ptr)), Sse2.LoadVector128((Int32 *)(_dataTable.inArray2Ptr)) ); Unsafe.Write(_dataTable.outArrayPtr, result); ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr); }
/// <summary>
/// Basic scenario: announces itself via BeginScenario, passes two Vector128 of SByte values,
/// read inline with Unsafe.Read from the data table's input buffers, straight to Sse41.Min;
/// writes the result to the output buffer and validates it against the raw input buffers.
/// </summary>
public void RunBasicScenario_UnsafeRead() { TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_UnsafeRead)); var result = Sse41.Min( Unsafe.Read <Vector128 <SByte> >(_dataTable.inArray1Ptr), Unsafe.Read <Vector128 <SByte> >(_dataTable.inArray2Ptr) ); Unsafe.Write(_dataTable.outArrayPtr, result); ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr); }
/// <summary>
/// Struct-local-field load scenario: announces itself via BeginScenario, obtains a TestStruct
/// from TestStruct.Create(), loads its _fld1 and _fld2 through their addresses with
/// Sse2.LoadVector128, computes Sse41.Min on the loaded vectors, writes the result to the
/// output buffer and validates it against the struct's fields.
/// </summary>
public void RunStructLclFldScenario_Load() { TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario_Load)); var test = TestStruct.Create(); var result = Sse41.Min( Sse2.LoadVector128((Int32 *)(&test._fld1)), Sse2.LoadVector128((Int32 *)(&test._fld2)) ); Unsafe.Write(_dataTable.outArrayPtr, result); ValidateResult(test._fld1, test._fld2, _dataTable.outArrayPtr); }
/// <summary>
/// Struct-field load scenario: pins this instance's _fld1 and _fld2, loads them through the
/// pinned pointers with Sse2.LoadVector128, computes Sse41.Min, writes the result to the output
/// buffer owned by <paramref name="testClass"/> and asks <paramref name="testClass"/> to
/// validate it against the fields.
/// </summary>
/// <param name="testClass">Test instance that supplies the output buffer and ValidateResult.</param>
public void RunStructFldScenario_Load(SimpleBinaryOpTest__MinInt32 testClass) { fixed(Vector128 <Int32> *pFld1 = &_fld1) fixed(Vector128 <Int32> *pFld2 = &_fld2) { var result = Sse41.Min( Sse2.LoadVector128((Int32 *)(pFld1)), Sse2.LoadVector128((Int32 *)(pFld2)) ); Unsafe.Write(testClass._dataTable.outArrayPtr, result); testClass.ValidateResult(_fld1, _fld2, testClass._dataTable.outArrayPtr); } }
/// <summary>
/// Processes one 128-bit-wide section of a cost diagonal (presumably part of an edit-distance
/// computation — confirm against the caller).
/// </summary>
/// <remarks>
/// For each lane the three candidate costs are: the value read from diagonal 1 plus the
/// equality mask of the source/target chars (substitution), and two neighbouring values read
/// from diagonal 2 (deletion and insertion). The minimum of the three plus one is stored back
/// into diagonal 1. Only <c>int</c> and <c>ushort</c> cost types are handled; any other
/// <typeparamref name="T"/> is a no-op.
/// </remarks>
/// <typeparam name="T">Cost element type; selects the int or ushort code path.</typeparam>
/// <param name="refDiag1Ptr">Diagonal that is read for substitution costs and written with the result.</param>
/// <param name="refDiag2Ptr">Diagonal that is read for deletion and insertion costs.</param>
/// <param name="sourcePtr">Source characters; read at <c>rowIndex - Vector128&lt;T&gt;.Count</c>.</param>
/// <param name="targetPtr">Target characters; read at <c>columnIndex - 1</c> and lane-reversed.</param>
/// <param name="rowIndex">Current row index (passed by reference; not modified here).</param>
/// <param name="columnIndex">Current column index.</param>
public static unsafe void CalculateDiagonalSection_Sse41<T>(void* refDiag1Ptr, void* refDiag2Ptr, char* sourcePtr, char* targetPtr, ref int rowIndex, int columnIndex) where T : struct
{
    ushort* sourceChars = (ushort*)sourcePtr;
    ushort* targetChars = (ushort*)targetPtr;

    if (typeof(T) == typeof(int))
    {
        int lanes = Vector128<T>.Count;
        int* diagonal1 = (int*)refDiag1Ptr;
        int* diagonal2 = (int*)refDiag2Ptr;

        // Widen the chars to 32-bit lanes; control 0x1b reverses the four target lanes.
        Vector128<int> sourceLanes = Sse41.ConvertToVector128Int32(sourceChars + rowIndex - lanes);
        Vector128<int> targetLanes = Sse2.Shuffle(Sse41.ConvertToVector128Int32(targetChars + columnIndex - 1), 0x1b);

        // Equal lanes compare to all-ones (-1), so adding the mask lowers the cost by one.
        Vector128<int> matchMask = Sse2.CompareEqual(sourceLanes, targetLanes);
        Vector128<int> viaSubstitution = Sse2.Add(Sse3.LoadDquVector128(diagonal1 + rowIndex - lanes), matchMask);
        Vector128<int> viaDeletion = Sse3.LoadDquVector128(diagonal2 + rowIndex - (lanes - 1));
        Vector128<int> viaInsertion = Sse3.LoadDquVector128(diagonal2 + rowIndex - lanes);

        Vector128<int> cheapest = Sse41.Min(Sse41.Min(viaInsertion, viaDeletion), viaSubstitution);
        Sse2.Store(diagonal1 + rowIndex - (lanes - 1), Sse2.Add(cheapest, Vector128.Create(1)));
    }
    else if (typeof(T) == typeof(ushort))
    {
        int lanes = Vector128<T>.Count;
        ushort* diagonal1 = (ushort*)refDiag1Ptr;
        ushort* diagonal2 = (ushort*)refDiag2Ptr;

        // Chars are already 16-bit lanes; the byte shuffle mask reverses the target lanes.
        Vector128<ushort> sourceLanes = Sse3.LoadDquVector128(sourceChars + rowIndex - lanes);
        Vector128<ushort> targetLanes = Ssse3.Shuffle(
            Sse3.LoadDquVector128(targetChars + columnIndex - 1).AsByte(),
            REVERSE_USHORT_AS_BYTE_128).AsUInt16();

        // Equal lanes compare to 0xFFFF, so adding the mask lowers the cost by one (mod 2^16).
        Vector128<ushort> matchMask = Sse2.CompareEqual(sourceLanes, targetLanes);
        Vector128<ushort> viaSubstitution = Sse2.Add(Sse3.LoadDquVector128(diagonal1 + rowIndex - lanes), matchMask);
        Vector128<ushort> viaDeletion = Sse3.LoadDquVector128(diagonal2 + rowIndex - (lanes - 1));
        Vector128<ushort> viaInsertion = Sse3.LoadDquVector128(diagonal2 + rowIndex - lanes);

        Vector128<ushort> cheapest = Sse41.Min(Sse41.Min(viaInsertion, viaDeletion), viaSubstitution);
        Sse2.Store(diagonal1 + rowIndex - (lanes - 1), Sse2.Add(cheapest, Vector128.Create((ushort)1)));
    }
}
/// <summary>
/// Returns the smallest element stored in <paramref name="matrix"/>.
/// </summary>
/// <remarks>
/// When SSE4.1 is supported the elements are reduced four at a time with Sse41.Min and the
/// remaining tail is handled by a scalar loop. Scalar comparisons use <see cref="Math.Min(int, int)"/>:
/// the previous subtraction-based branchless minimum returned the wrong operand whenever
/// <c>x - y</c> overflowed (for example int.MaxValue versus int.MinValue).
/// As before, indexing the first element throws for an empty matrix.
/// </remarks>
/// <param name="matrix">Matrix whose backing array is scanned.</param>
/// <returns>The minimum element value.</returns>
public static unsafe int Min(this Matrix<int> matrix)
{
    fixed (int* ptr = matrix.GetArray())
    {
        var span = new Span<int>(ptr, matrix.Length);
        int minScalar = span[0];
        int i = 0;

        if (Sse41.IsSupported)
        {
            // Seed every lane with the first element so the seed never affects the result.
            Vector128<int> min = Vector128.Create(minScalar);

            // Consume every complete vector, including one that ends exactly at the buffer end.
            while (i <= span.Length - Vector128<int>.Count)
            {
                min = Sse41.Min(Sse2.LoadVector128(ptr + i), min);
                i += Vector128<int>.Count;
            }

            // Horizontal reduction of the four lanes.
            for (int lane = 0; lane < Vector128<int>.Count; lane++)
            {
                minScalar = Math.Min(minScalar, min.GetElement(lane));
            }
        }

        // Scalar tail (or the whole buffer when SSE4.1 is unavailable).
        for (; i < span.Length; i++)
        {
            minScalar = Math.Min(minScalar, span[i]);
        }

        return minScalar;
    }
}
/// <summary>
/// Class-variable load scenario: announces itself via BeginScenario, pins _clsVar1 and _clsVar2,
/// loads them through the pinned pointers with Sse2.LoadVector128, computes Sse41.Min, writes
/// the result to the output buffer and validates it against the two variables.
/// </summary>
public void RunClsVarScenario_Load() { TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario_Load)); fixed(Vector128 <UInt16> *pClsVar1 = &_clsVar1) fixed(Vector128 <UInt16> *pClsVar2 = &_clsVar2) { var result = Sse41.Min( Sse2.LoadVector128((UInt16 *)(pClsVar1)), Sse2.LoadVector128((UInt16 *)(pClsVar2)) ); Unsafe.Write(_dataTable.outArrayPtr, result); ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr); } }
/// <summary>
/// Class-field load scenario: announces itself via BeginScenario, pins the instance fields
/// _fld1 and _fld2, loads them through the pinned pointers with Sse2.LoadVector128, computes
/// Sse41.Min, writes the result to the output buffer and validates it against the fields.
/// </summary>
public void RunClassFldScenario_Load() { TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load)); fixed(Vector128 <Int32> *pFld1 = &_fld1) fixed(Vector128 <Int32> *pFld2 = &_fld2) { var result = Sse41.Min( Sse2.LoadVector128((Int32 *)(pFld1)), Sse2.LoadVector128((Int32 *)(pFld2)) ); Unsafe.Write(_dataTable.outArrayPtr, result); ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr); } }
/// <summary>
/// Class-local-field load scenario: announces itself via BeginScenario, creates a fresh
/// SimpleBinaryOpTest__MinUInt16 instance, pins its _fld1 and _fld2, loads them through the
/// pinned pointers with Sse2.LoadVector128, computes Sse41.Min, writes the result to this
/// instance's output buffer and validates it against the new instance's fields.
/// </summary>
public void RunClassLclFldScenario_Load() { TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load)); var test = new SimpleBinaryOpTest__MinUInt16(); fixed(Vector128 <UInt16> *pFld1 = &test._fld1) fixed(Vector128 <UInt16> *pFld2 = &test._fld2) { var result = Sse41.Min( Sse2.LoadVector128((UInt16 *)(pFld1)), Sse2.LoadVector128((UInt16 *)(pFld2)) ); Unsafe.Write(_dataTable.outArrayPtr, result); ValidateResult(test._fld1, test._fld2, _dataTable.outArrayPtr); } }
// Reviewer overview of GetPointerToFirstInvalidChar (code below is unchanged):
//   1. ASCII prefix: ASCIIUtility.GetIndexOfFirstNonAsciiChar skips the leading ASCII run; if that
//      consumes the whole buffer, both out adjustments are set to 0 and the method returns.
//   2. Vectorized scan: a Vector128 loop runs when (AdvSimd.Arm64 is supported on little-endian)
//      or Sse2 is supported; otherwise a Vector<ushort> loop runs when Vector.IsHardwareAccelerated.
//      Both accumulate the extra UTF-8 code units per char and validate surrogate pairing,
//      jumping to NonVectorizedLoop on a mismatched pair. A trailing high surrogate is "un-consumed"
//      (pointer stepped back by one) so the next iteration can pair it.
//   3. NonVectorizedLoop: scalar drain of the remaining chars; an unpaired surrogate jumps to Error.
//   4. Error: also the normal exit; copies the temporary counters to the out parameters and
//      returns the current pointer.
// NOTE(review): the three-byte-mask step tests AdvSimd.IsSupported while every other ARM branch
// tests AdvSimd.Arm64.IsSupported — confirm the narrower check was not intended there.
// Returns &inputBuffer[inputLength] if the input buffer is valid. /// <summary> /// Given an input buffer <paramref name="pInputBuffer"/> of char length <paramref name="inputLength"/>, /// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>. /// </summary> /// <remarks> /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed. /// </remarks> public static char *GetPointerToFirstInvalidChar(char *pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment) { Debug.Assert(inputLength >= 0, "Input length must not be negative."); Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null."); // First, we'll handle the common case of all-ASCII. If this is able to // consume the entire buffer, we'll skip the remainder of this method's logic. int numAsciiCharsConsumedJustNow = (int)ASCIIUtility.GetIndexOfFirstNonAsciiChar(pInputBuffer, (uint)inputLength); Debug.Assert(0 <= numAsciiCharsConsumedJustNow && numAsciiCharsConsumedJustNow <= inputLength); pInputBuffer += (uint)numAsciiCharsConsumedJustNow; inputLength -= numAsciiCharsConsumedJustNow; if (inputLength == 0) { utf8CodeUnitCountAdjustment = 0; scalarCountAdjustment = 0; return(pInputBuffer); } // If we got here, it means we saw some non-ASCII data, so within our // vectorized code paths below we'll handle all non-surrogate UTF-16 // code points branchlessly. We'll only branch if we see surrogates. // // We still optimistically assume the data is mostly ASCII. This means that the // number of UTF-8 code units and the number of scalars almost matches the number // of UTF-16 code units. As we go through the input and find non-ASCII // characters, we'll keep track of these "adjustment" fixups. 
To get the // total number of UTF-8 code units required to encode the input data, add // the UTF-8 code unit count adjustment to the number of UTF-16 code units // seen. To get the total number of scalars present in the input data, // add the scalar count adjustment to the number of UTF-16 code units seen. long tempUtf8CodeUnitCountAdjustment = 0; int tempScalarCountAdjustment = 0; if ((AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || Sse2.IsSupported) { if (inputLength >= Vector128 <ushort> .Count) { Vector128 <ushort> vector0080 = Vector128.Create((ushort)0x80); Vector128 <ushort> vectorA800 = Vector128.Create((ushort)0xA800); Vector128 <short> vector8800 = Vector128.Create(unchecked ((short)0x8800)); Vector128 <ushort> vectorZero = Vector128 <ushort> .Zero; do { Vector128 <ushort> utf16Data; if (AdvSimd.Arm64.IsSupported) { utf16Data = AdvSimd.LoadVector128((ushort *)pInputBuffer); // unaligned } else { utf16Data = Sse2.LoadVector128((ushort *)pInputBuffer); // unaligned } Vector128 <ushort> charIsNonAscii; if (AdvSimd.Arm64.IsSupported) { // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.) charIsNonAscii = AdvSimd.Min(utf16Data, vector0080); } else if (Sse41.IsSupported) { // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.) charIsNonAscii = Sse41.Min(utf16Data, vector0080); } else { // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding // input was 0x0080 <= [value] <= 0x7FFF. The case where 0x8000 <= [value] will // be handled in a few lines. charIsNonAscii = Sse2.AndNot(Sse2.CompareGreaterThan(vector0080.AsInt16(), utf16Data.AsInt16()).AsUInt16(), vector0080); } #if DEBUG // Quick check to ensure we didn't accidentally set the 0x8000 bit of any element. 
uint debugMask; if (AdvSimd.Arm64.IsSupported) { debugMask = GetNonAsciiBytes(charIsNonAscii.AsByte()); } else { debugMask = (uint)Sse2.MoveMask(charIsNonAscii.AsByte()); } Debug.Assert((debugMask & 0b_1010_1010_1010_1010) == 0, "Shouldn't have set the 0x8000 bit of any element in 'charIsNonAscii'."); #endif // DEBUG // Sets the 0x8080 bits of each element in 'charIsNonAscii' if the corresponding // input was 0x0800 <= [value]. This also handles the missing range a few lines above. Vector128 <ushort> charIsThreeByteUtf8Encoded; uint mask; if (AdvSimd.IsSupported) { charIsThreeByteUtf8Encoded = AdvSimd.Subtract(vectorZero, AdvSimd.ShiftRightLogical(utf16Data, 11)); mask = GetNonAsciiBytes(AdvSimd.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte()); } else { charIsThreeByteUtf8Encoded = Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 11)); mask = (uint)Sse2.MoveMask(Sse2.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte()); } // Each even bit of mask will be 1 only if the char was >= 0x0080, // and each odd bit of mask will be 1 only if the char was >= 0x0800. // // Example for UTF-16 input "[ 0123 ] [ 1234 ] ...": // // ,-- set if char[1] is >= 0x0800 // | ,-- set if char[0] is >= 0x0800 // v v // mask = ... 1 1 0 1 // ^ ^-- set if char[0] is non-ASCII // `-- set if char[1] is non-ASCII // // This means we can popcnt the number of set bits, and the result is the // number of *additional* UTF-8 bytes that each UTF-16 code unit requires as // it expands. This results in the wrong count for UTF-16 surrogate code // units (we just counted that each individual code unit expands to 3 bytes, // but in reality a well-formed UTF-16 surrogate pair expands to 4 bytes). // We'll handle this in just a moment. // // For now, compute the popcnt but squirrel it away. We'll fold it in to the // cumulative UTF-8 adjustment factor once we determine that there are no // unpaired surrogates in our data. 
(Unpaired surrogates would invalidate // our computed result and we'd have to throw it away.) uint popcnt = (uint)BitOperations.PopCount(mask); // Surrogates need to be special-cased for two reasons: (a) we need // to account for the fact that we over-counted in the addition above; // and (b) they require separate validation. if (AdvSimd.Arm64.IsSupported) { utf16Data = AdvSimd.Add(utf16Data, vectorA800); mask = GetNonAsciiBytes(AdvSimd.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte()); } else { utf16Data = Sse2.Add(utf16Data, vectorA800); mask = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte()); } if (mask != 0) { // There's at least one UTF-16 surrogate code unit present. // Since we performed a pmovmskb operation on the result of a 16-bit pcmpgtw, // the resulting bits of 'mask' will occur in pairs: // - 00 if the corresponding UTF-16 char was not a surrogate code unit; // - 11 if the corresponding UTF-16 char was a surrogate code unit. // // A UTF-16 high/low surrogate code unit has the bit pattern [ 11011q## ######## ], // where # is any bit; q = 0 represents a high surrogate, and q = 1 represents // a low surrogate. Since we added 0xA800 in the vectorized operation above, // our surrogate pairs will now have the bit pattern [ 10000q## ######## ]. // If we logical right-shift each word by 3, we'll end up with the bit pattern // [ 00010000 q####### ], which means that we can immediately use pmovmskb to // determine whether a given char was a high or a low surrogate. // // Therefore the resulting bits of 'mask2' will occur in pairs: // - 00 if the corresponding UTF-16 char was a high surrogate code unit; // - 01 if the corresponding UTF-16 char was a low surrogate code unit; // - ## (garbage) if the corresponding UTF-16 char was not a surrogate code unit. // Since 'mask' already has 00 in these positions (since the corresponding char // wasn't a surrogate), "mask AND mask2 == 00" holds for these positions. 
uint mask2; if (AdvSimd.Arm64.IsSupported) { mask2 = GetNonAsciiBytes(AdvSimd.ShiftRightLogical(utf16Data, 3).AsByte()); } else { mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte()); } // 'lowSurrogatesMask' has its bits occur in pairs: // - 01 if the corresponding char was a low surrogate char, // - 00 if the corresponding char was a high surrogate char or not a surrogate at all. uint lowSurrogatesMask = mask2 & mask; // 'highSurrogatesMask' has its bits occur in pairs: // - 01 if the corresponding char was a high surrogate char, // - 00 if the corresponding char was a low surrogate char or not a surrogate at all. uint highSurrogatesMask = (mask2 ^ 0b_0101_0101_0101_0101u /* flip all even-numbered bits 00 <-> 01 */) & mask; Debug.Assert((highSurrogatesMask & lowSurrogatesMask) == 0, "A char cannot simultaneously be both a high and a low surrogate char."); Debug.Assert(((highSurrogatesMask | lowSurrogatesMask) & 0b_1010_1010_1010_1010u) == 0, "Only even bits (no odd bits) of the masks should be set."); // Now check that each high surrogate is followed by a low surrogate and that each // low surrogate follows a high surrogate. We make an exception for the case where // the final char of the vector is a high surrogate, since we can't perform validation // on it until the next iteration of the loop when we hope to consume the matching // low surrogate. highSurrogatesMask <<= 2; if ((ushort)highSurrogatesMask != lowSurrogatesMask) { goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic } if (highSurrogatesMask > ushort.MaxValue) { // There was a standalone high surrogate at the end of the vector. // We'll adjust our counters so that we don't consider this char consumed. 
highSurrogatesMask = (ushort)highSurrogatesMask; // don't allow stray high surrogate to be consumed by popcnt popcnt -= 2; // the '0xC000_0000' bits in the original mask are shifted out and discarded, so account for that here pInputBuffer--; inputLength++; } // If we're 64-bit, we can perform the zero-extension of the surrogate pairs count for // free right now, saving the extension step a few lines below. If we're 32-bit, the // conversion to nuint immediately below is a no-op, and we'll pay the cost of the real // 64 -bit extension a few lines below. nuint surrogatePairsCountNuint = (uint)BitOperations.PopCount(highSurrogatesMask); // 2 UTF-16 chars become 1 Unicode scalar tempScalarCountAdjustment -= (int)surrogatePairsCountNuint; // Since each surrogate code unit was >= 0x0800, we eagerly assumed // it'd be encoded as 3 UTF-8 code units, so our earlier popcnt computation // assumes that the pair is encoded as 6 UTF-8 code units. Since each // pair is in reality only encoded as 4 UTF-8 code units, we need to // perform this adjustment now. if (IntPtr.Size == 8) { // Since we've already zero-extended surrogatePairsCountNuint, we can directly // sub + sub. It's more efficient than shl + sub. tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint; tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint; } else { // Take the hit of the 64-bit extension now. 
tempUtf8CodeUnitCountAdjustment -= 2 * (uint)surrogatePairsCountNuint; } } tempUtf8CodeUnitCountAdjustment += popcnt; pInputBuffer += Vector128 <ushort> .Count; inputLength -= Vector128 <ushort> .Count; } while (inputLength >= Vector128 <ushort> .Count); } } else if (Vector.IsHardwareAccelerated) { if (inputLength >= Vector <ushort> .Count) { Vector <ushort> vector0080 = new Vector <ushort>(0x0080); Vector <ushort> vector0400 = new Vector <ushort>(0x0400); Vector <ushort> vector0800 = new Vector <ushort>(0x0800); Vector <ushort> vectorD800 = new Vector <ushort>(0xD800); do { // The 'twoOrMoreUtf8Bytes' and 'threeOrMoreUtf8Bytes' vectors will contain // elements whose values are 0xFFFF (-1 as signed word) iff the corresponding // UTF-16 code unit was >= 0x0080 and >= 0x0800, respectively. By summing these // vectors, each element of the sum will contain one of three values: // // 0x0000 ( 0) = original char was 0000..007F // 0xFFFF (-1) = original char was 0080..07FF // 0xFFFE (-2) = original char was 0800..FFFF // // We'll negate them to produce a value 0..2 for each element, then sum all the // elements together to produce the number of *additional* UTF-8 code units // required to represent this UTF-16 data. This is similar to the popcnt step // performed by the SSE2 code path. This will overcount surrogates, but we'll // handle that shortly. Vector <ushort> utf16Data = Unsafe.ReadUnaligned <Vector <ushort> >(pInputBuffer); Vector <ushort> twoOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0080); Vector <ushort> threeOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0800); Vector <nuint_t> sumVector = (Vector <nuint_t>)(Vector <ushort> .Zero - twoOrMoreUtf8Bytes - threeOrMoreUtf8Bytes); // We'll try summing by a natural word (rather than a 16-bit word) at a time, // which should halve the number of operations we must perform. 
nuint popcnt = 0; for (int i = 0; i < Vector <nuint_t> .Count; i++) { popcnt += (nuint)sumVector[i]; } uint popcnt32 = (uint)popcnt; if (IntPtr.Size == 8) { popcnt32 += (uint)(popcnt >> 32); } // As in the SSE4.1 paths, compute popcnt but don't fold it in until we // know there aren't any unpaired surrogates in the input data. popcnt32 = (ushort)popcnt32 + (popcnt32 >> 16); // Now check for surrogates. utf16Data -= vectorD800; Vector <ushort> surrogateChars = Vector.LessThan(utf16Data, vector0800); if (surrogateChars != Vector <ushort> .Zero) { // There's at least one surrogate (high or low) UTF-16 code unit in // the vector. We'll build up additional vectors: 'highSurrogateChars' // and 'lowSurrogateChars', where the elements are 0xFFFF iff the original // UTF-16 code unit was a high or low surrogate, respectively. Vector <ushort> highSurrogateChars = Vector.LessThan(utf16Data, vector0400); Vector <ushort> lowSurrogateChars = Vector.AndNot(surrogateChars, highSurrogateChars); // We want to make sure that each high surrogate code unit is followed by // a low surrogate code unit and each low surrogate code unit follows a // high surrogate code unit. Since we don't have an equivalent of pmovmskb // or palignr available to us, we'll do this as a loop. We won't look at // the very last high surrogate char element since we don't yet know if // the next vector read will have a low surrogate char element. if (lowSurrogateChars[0] != 0) { goto Error; // error: start of buffer contains standalone low surrogate char } ushort surrogatePairsCount = 0; for (int i = 0; i < Vector <ushort> .Count - 1; i++) { surrogatePairsCount -= highSurrogateChars[i]; // turns into +1 or +0 if (highSurrogateChars[i] != lowSurrogateChars[i + 1]) { goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic } } if (highSurrogateChars[Vector <ushort> .Count - 1] != 0) { // There was a standalone high surrogate at the end of the vector. 
// We'll adjust our counters so that we don't consider this char consumed. pInputBuffer--; inputLength++; popcnt32 -= 2; } nint surrogatePairsCountNint = (nint)surrogatePairsCount; // zero-extend to native int size // 2 UTF-16 chars become 1 Unicode scalar tempScalarCountAdjustment -= (int)surrogatePairsCountNint; // Since each surrogate code unit was >= 0x0800, we eagerly assumed // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total), // so we'll adjust this now. tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint; tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint; } tempUtf8CodeUnitCountAdjustment += popcnt32; pInputBuffer += Vector <ushort> .Count; inputLength -= Vector <ushort> .Count; } while (inputLength >= Vector <ushort> .Count); } } NonVectorizedLoop: // Vectorization isn't supported on our current platform, or the input was too small to benefit // from vectorization, or we saw invalid UTF-16 data in the vectorized code paths and need to // drain remaining valid chars before we report failure. for (; inputLength > 0; pInputBuffer++, inputLength--) { uint thisChar = pInputBuffer[0]; if (thisChar <= 0x7F) { continue; } // Bump adjustment by +1 for U+0080..U+07FF; by +2 for U+0800..U+FFFF. // This optimistically assumes no surrogates, which we'll handle shortly. tempUtf8CodeUnitCountAdjustment += (thisChar + 0x0001_F800u) >> 16; if (!UnicodeUtility.IsSurrogateCodePoint(thisChar)) { continue; } // Found a surrogate char. Back out the adjustment we made above, then // try to consume the entire surrogate pair all at once. We won't bother // trying to interpret the surrogate pair as a scalar value; we'll only // validate that its bit pattern matches what's expected for a surrogate pair. 
tempUtf8CodeUnitCountAdjustment -= 2; if (inputLength == 1) { goto Error; // input buffer too small to read a surrogate pair } thisChar = Unsafe.ReadUnaligned <uint>(pInputBuffer); if (((thisChar - (BitConverter.IsLittleEndian ? 0xDC00_D800u : 0xD800_DC00u)) & 0xFC00_FC00u) != 0) { goto Error; // not a well-formed surrogate pair } tempScalarCountAdjustment--; // 2 UTF-16 code units -> 1 scalar tempUtf8CodeUnitCountAdjustment += 2; // 2 UTF-16 code units -> 4 UTF-8 code units pInputBuffer++; // consumed one extra char inputLength--; } Error: // Also used for normal return. utf8CodeUnitCountAdjustment = tempUtf8CodeUnitCountAdjustment; scalarCountAdjustment = tempScalarCountAdjustment; return(pInputBuffer); }
/// <summary>
/// Thin wrapper that forwards to Sse41.Min for two vectors of unsigned 32-bit integers.
/// </summary>
/// <param name="left">First operand.</param>
/// <param name="right">Second operand.</param>
/// <returns>The value returned by Sse41.Min(left, right).</returns>
public static Vector128<uint> _mm_min_epu32(Vector128<uint> left, Vector128<uint> right)
    => Sse41.Min(left, right);
/// <summary>
/// Interpolates every sample of <paramref name="x"/> linearly within two lookup tables and
/// blends the two results: <c>z = (1 - weightB) * interp(A, x) + weightB * interp(B, x)</c>.
/// </summary>
/// <remarks>
/// Each table is treated as uniformly sampled between its min and max x value; indices are
/// clamped to the table bounds before the gather.
/// <para>
/// The output array has <c>outputVectorSize</c> elements. At most
/// <c>min(x.Length, outputVectorSize)</c> elements are computed; the final partial vector is
/// processed through a stack scratch buffer so that no memory outside <paramref name="x"/> or
/// the output array is touched when the element count is not a multiple of the vector width.
/// </para>
/// <para>
/// NOTE(review): besides AVX this method uses Avx2.GatherVector256 and Sse41.Min/Max without an
/// IsSupported check — presumably the caller verifies AVX2 support; confirm. A table with a
/// single element yields a zero step size (division by zero in the reciprocal), as before.
/// </para>
/// </remarks>
/// <param name="x">Input sample positions.</param>
/// <param name="A">First lookup table (must be non-empty).</param>
/// <param name="minXA">x value associated with A[0].</param>
/// <param name="maxXA">x value associated with the last element of A.</param>
/// <param name="B">Second lookup table (must be non-empty).</param>
/// <param name="minXB">x value associated with B[0].</param>
/// <param name="maxXB">x value associated with the last element of B.</param>
/// <param name="weightB">Blend weight of table B; table A receives <c>1 - weightB</c>.</param>
/// <returns>A new array of length <c>outputVectorSize</c> holding the blended values.</returns>
private static unsafe double[] BilinearInterpol_AVX(
    double[] x,
    double[] A, double minXA, double maxXA,
    double[] B, double minXB, double maxXB,
    double weightB)
{
    double[] z = new double[outputVectorSize];

    // Never read past x or write past z.
    int count = x.Length < z.Length ? x.Length : z.Length;
    if (count == 0)
    {
        return z;
    }

    int width = Vector256<double>.Count;

    // Scratch space used only for the final, partially filled vector.
    double* scratch = stackalloc double[Vector256<double>.Count];

    fixed (double* pX = &x[0], pA = &A[0], pB = &B[0], pZ = &z[0])
    {
        Vector256<double> vWeightB = Vector256.Create(weightB);
        Vector256<double> vWeightA = Vector256.Create(1 - weightB);

        Vector256<double> vMinXA = Vector256.Create(minXA);
        Vector256<double> vMaxXA = Vector256.Create(maxXA);
        Vector256<double> vMinXB = Vector256.Create(minXB);
        Vector256<double> vMaxXB = Vector256.Create(maxXB);

        double deltaA = (maxXA - minXA) / (double)(A.Length - 1);
        double deltaB = (maxXB - minXB) / (double)(B.Length - 1);
        Vector256<double> vDeltaA = Vector256.Create(deltaA);
        Vector256<double> vDeltaB = Vector256.Create(deltaB);

        double invDeltaA = 1.0 / deltaA;
        double invDeltaB = 1.0 / deltaB;
        Vector256<double> vInvDeltaA = Vector256.Create(invDeltaA);
        Vector256<double> vInvDeltaB = Vector256.Create(invDeltaB);

        Vector128<int> ALengthMinusOne = Vector128.Create(A.Length - 1);
        Vector128<int> BLengthMinusOne = Vector128.Create(B.Length - 1);
        Vector128<int> One = Vector128.Create(1);

        for (int i = 0; i < count; i += width)
        {
            int remaining = count - i;
            bool isPartial = remaining < width;

            Vector256<double> currentX;
            if (isPartial)
            {
                // Pad the unused lanes with the last valid sample; their results are discarded.
                for (int lane = 0; lane < width; lane++)
                {
                    scratch[lane] = pX[i + (lane < remaining ? lane : remaining - 1)];
                }

                currentX = Avx.LoadVector256(scratch);
            }
            else
            {
                currentX = Avx.LoadVector256(pX + i);
            }

            // Determine the largest a, such that A[i] = f(xA) and xA <= x[i].
            // This involves casting from double to int; here we use a Vector conversion.
            Vector256<double> aDouble = Avx.Multiply(Avx.Subtract(currentX, vMinXA), vInvDeltaA);
            Vector128<int> a = Avx.ConvertToVector128Int32WithTruncation(aDouble);
            a = Sse41.Min(Sse41.Max(a, Vector128<int>.Zero), ALengthMinusOne);
            Vector128<int> aPlusOne = Sse41.Min(Sse2.Add(a, One), ALengthMinusOne);

            // Now, get the reference input, xA, for our index a.
            // This involves casting from int to double.
            Vector256<double> xA = Avx.Add(Avx.Multiply(Avx.ConvertToVector256Double(a), vDeltaA), vMinXA);

            // Now, compute the lambda for our A reference point.
            Vector256<double> currentXNormA = Avx.Max(vMinXA, Avx.Min(currentX, vMaxXA));
            Vector256<double> lambdaA = Avx.Multiply(Avx.Subtract(currentXNormA, xA), vInvDeltaA);

            // Now, we need to load up our reference points using Vector Gather operations.
            Vector256<double> AVector = Avx2.GatherVector256(pA, a, 8);
            Vector256<double> AVectorPlusOne = Avx2.GatherVector256(pA, aPlusOne, 8);

            // Now, do all of the above for our B reference point.
            Vector256<double> bDouble = Avx.Multiply(Avx.Subtract(currentX, vMinXB), vInvDeltaB);
            Vector128<int> b = Avx.ConvertToVector128Int32WithTruncation(bDouble);
            b = Sse41.Min(Sse41.Max(b, Vector128<int>.Zero), BLengthMinusOne);
            Vector128<int> bPlusOne = Sse41.Min(Sse2.Add(b, One), BLengthMinusOne);

            Vector256<double> xB = Avx.Add(Avx.Multiply(Avx.ConvertToVector256Double(b), vDeltaB), vMinXB);
            Vector256<double> currentXNormB = Avx.Max(vMinXB, Avx.Min(currentX, vMaxXB));
            Vector256<double> lambdaB = Avx.Multiply(Avx.Subtract(currentXNormB, xB), vInvDeltaB);

            Vector256<double> BVector = Avx2.GatherVector256(pB, b, 8);
            Vector256<double> BVectorPlusOne = Avx2.GatherVector256(pB, bPlusOne, 8);

            Vector256<double> newZ = Avx.Add(
                Avx.Multiply(vWeightA, Avx.Add(AVector, Avx.Multiply(lambdaA, Avx.Subtract(AVectorPlusOne, AVector)))),
                Avx.Multiply(vWeightB, Avx.Add(BVector, Avx.Multiply(lambdaB, Avx.Subtract(BVectorPlusOne, BVector)))));

            if (isPartial)
            {
                // Copy out only the lanes that correspond to real input samples.
                Avx.Store(scratch, newZ);
                for (int lane = 0; lane < remaining; lane++)
                {
                    pZ[i + lane] = scratch[lane];
                }
            }
            else
            {
                Avx.Store(pZ + i, newZ);
            }
        }
    }

    return z;
}
/// <summary>
/// Scans a UTF-16 buffer and returns the index (in chars) of the first char whose value is
/// greater than 0x007F, or <paramref name="bufferLength"/> if no such char is found.
/// </summary>
/// <param name="pBuffer">Pointer to the first char to inspect.</param>
/// <param name="bufferLength">Number of chars (not bytes) available at <paramref name="pBuffer"/>.</param>
/// <returns>The number of leading ASCII chars in the buffer.</returns>
/// <remarks>
/// The caller is expected to have verified <c>Sse2.IsSupported</c> (asserted below in debug builds).
/// Inputs shorter than one 128-bit vector are handled entirely with scalar 64/32/16-bit reads.
/// Note that <c>bufferLength</c> changes meaning part-way through: it starts as a char count and
/// becomes a byte count once the first vector has been validated.
/// </remarks>
private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char *pBuffer, nuint bufferLength /* in chars */)
{
    // This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
    // will be elided by JIT once we determine which specific ISAs we support.

    // Quick check for empty inputs.
    if (bufferLength == 0)
    {
        return(0);
    }

    // JIT turns the below into constants
    uint SizeOfVector128InBytes = (uint)Unsafe.SizeOf <Vector128 <byte> >();
    uint SizeOfVector128InChars = SizeOfVector128InBytes / sizeof(char);

    Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
    Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");

    Vector128 <short> firstVector, secondVector;
    uint currentMask;
    char *pOriginalBuffer = pBuffer;

    if (bufferLength < SizeOfVector128InChars)
    {
        goto InputBufferLessThanOneVectorInLength; // can't vectorize; drain primitives instead
    }

    // This method is written such that control generally flows top-to-bottom, avoiding
    // jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
    // data, we jump out of the hot paths to targets at the end of the method.

    Vector128 <short> asciiMaskForPTEST = Vector128.Create(unchecked ((short)0xFF80)); // used for PTEST on supported hardware
    Vector128 <ushort> asciiMaskForPMINUW = Vector128.Create((ushort)0x0080); // used for PMINUW on supported hardware
    // The PXOR / PCMPGTW pair emulates an unsigned "> 0x007F" test with a signed compare:
    // XOR with 0x8000 flips the sign bit of each WORD, and 0x807F is 0x007F with the same bias applied.
    Vector128 <short> asciiMaskForPXOR = Vector128.Create(unchecked ((short)0x8000)); // used for PXOR
    Vector128 <short> asciiMaskForPCMPGTW = Vector128.Create(unchecked ((short)0x807F)); // used for PCMPGTW

    Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));

    // Read the first vector unaligned.
    firstVector = Sse2.LoadVector128((short *)pBuffer); // unaligned load

    if (Sse41.IsSupported)
    {
        // The SSE41-optimized code path works by forcing the 0x0080 bit in each WORD of the vector to be
        // set iff the WORD element has value >= 0x0080 (non-ASCII). Then we'll treat it as a BYTE vector
        // in order to extract the mask.
        currentMask = (uint)Sse2.MoveMask(Sse41.Min(firstVector.AsUInt16(), asciiMaskForPMINUW).AsByte());
    }
    else
    {
        // The SSE2-optimized code path works by forcing each WORD of the vector to be 0xFFFF iff the WORD
        // element has value >= 0x0080 (non-ASCII). Then we'll treat it as a BYTE vector in order to extract
        // the mask.
        currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
    }

    if (currentMask != 0)
    {
        goto FoundNonAsciiDataInCurrentMask;
    }

    // If we have less than 32 bytes to process, just go straight to the final unaligned
    // read. There's no need to mess with the loop logic in the middle of this method.

    // Adjust the remaining length to account for what we just read.
    // For the remainder of this code path, bufferLength will be in bytes, not chars.
    bufferLength <<= 1; // chars to bytes

    if (bufferLength < 2 * SizeOfVector128InBytes)
    {
        goto IncrementCurrentOffsetBeforeFinalUnalignedVectorRead;
    }

    // Now adjust the read pointer so that future reads are aligned.
    // This rounds up to the NEXT vector-size boundary, so an already-aligned pointer still advances
    // by a full vector; the chars skipped here were all covered by the unaligned read above.
    pBuffer = (char *)(((nuint)pBuffer + SizeOfVector128InBytes) & ~(nuint)(SizeOfVector128InBytes - 1));

#if DEBUG
    long numCharsRead = pBuffer - pOriginalBuffer;
    Debug.Assert(0 < numCharsRead && numCharsRead <= SizeOfVector128InChars, "We should've made forward progress of at least one char.");
    Debug.Assert((nuint)numCharsRead <= bufferLength, "We shouldn't have read past the end of the input buffer.");
#endif

    // Adjust remaining buffer length.
    // (Equivalent to subtracting the number of bytes the pointer just advanced.)
    bufferLength += (nuint)pOriginalBuffer;
    bufferLength -= (nuint)pBuffer;

    // The buffer is now properly aligned.
    // Read 2 vectors at a time if possible.
    if (bufferLength >= 2 * SizeOfVector128InBytes)
    {
        char *pFinalVectorReadPos = (char *)((nuint)pBuffer + bufferLength - 2 * SizeOfVector128InBytes);

        // After this point, we no longer need to update the bufferLength value.
        do
        {
            firstVector = Sse2.LoadAlignedVector128((short *)pBuffer);
            secondVector = Sse2.LoadAlignedVector128((short *)pBuffer + SizeOfVector128InChars);
            Vector128 <short> combinedVector = Sse2.Or(firstVector, secondVector);

            if (Sse41.IsSupported)
            {
                // If a non-ASCII bit is set in any WORD of the combined vector, we have seen non-ASCII data.
                // Jump to the non-ASCII handler to figure out which particular vector contained non-ASCII data.
                if (!Sse41.TestZ(combinedVector, asciiMaskForPTEST))
                {
                    goto FoundNonAsciiDataInFirstOrSecondVector;
                }
            }
            else
            {
                // See comment earlier in the method for an explanation of how the below logic works.
                if (Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(combinedVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte()) != 0)
                {
                    goto FoundNonAsciiDataInFirstOrSecondVector;
                }
            }

            pBuffer += 2 * SizeOfVector128InChars;
        } while (pBuffer <= pFinalVectorReadPos);
    }

    // We have somewhere between 0 and (2 * vector length) - 1 bytes remaining to read from.
    // Since the above loop doesn't update bufferLength, we can't rely on its absolute value.
    // But we _can_ rely on it to tell us how much remaining data must be drained by looking
    // at what bits of it are set. This works because had we updated it within the loop above,
    // we would've been adding 2 * SizeOfVector128 on each iteration, but we only care about
    // bits which are less significant than those that the addition would've acted on.

    // If there is fewer than one vector length remaining, skip the next aligned read.
    // Remember, at this point bufferLength is measured in bytes, not chars.
    if ((bufferLength & SizeOfVector128InBytes) == 0)
    {
        goto DoFinalUnalignedVectorRead;
    }

    // At least one full vector's worth of data remains, so we can safely read it.
    // Remember, at this point pBuffer is still aligned.
    firstVector = Sse2.LoadAlignedVector128((short *)pBuffer);

    if (Sse41.IsSupported)
    {
        // If a non-ASCII bit is set in any WORD of the vector, we have seen non-ASCII data.
        // Jump to the non-ASCII handler to locate the offending element.
        if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
        {
            goto FoundNonAsciiDataInFirstVector;
        }
    }
    else
    {
        // See comment earlier in the method for an explanation of how the below logic works.
        currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
        if (currentMask != 0)
        {
            goto FoundNonAsciiDataInCurrentMask;
        }
    }

IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:
    // Step past the vector that was just validated (also the jump target for short inputs whose
    // first unaligned vector was all-ASCII).
    pBuffer += SizeOfVector128InChars;

DoFinalUnalignedVectorRead:
    // The low bits of the byte count tell us whether a partial vector's worth of data is left over.
    if (((byte)bufferLength & (SizeOfVector128InBytes - 1)) != 0)
    {
        // Perform an unaligned read of the last vector.
        // We need to adjust the pointer because we're re-reading data.
        pBuffer = (char *)((byte *)pBuffer + (bufferLength & (SizeOfVector128InBytes - 1)) - SizeOfVector128InBytes);
        firstVector = Sse2.LoadVector128((short *)pBuffer); // unaligned load

        if (Sse41.IsSupported)
        {
            // If a non-ASCII bit is set in any WORD of the vector, we have seen non-ASCII data.
            // Jump to the non-ASCII handler to locate the offending element.
            if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
            {
                goto FoundNonAsciiDataInFirstVector;
            }
        }
        else
        {
            // See comment earlier in the method for an explanation of how the below logic works.
            currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
            if (currentMask != 0)
            {
                goto FoundNonAsciiDataInCurrentMask;
            }
        }

        pBuffer += SizeOfVector128InChars;
    }

Finish:
    // Every exit path funnels through here; the result is the pointer distance converted to chars.
    Debug.Assert(((nuint)pBuffer - (nuint)pOriginalBuffer) % 2 == 0, "Shouldn't have incremented any pointer by an odd byte count.");
    return(((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char)); // and we're done! (remember to adjust for char count)

FoundNonAsciiDataInFirstOrSecondVector:
    // We don't know if the first or the second vector contains non-ASCII data. Check the first
    // vector, and if that's all-ASCII then the second vector must be the culprit. Either way
    // we'll make sure the first vector local is the one that contains the non-ASCII data.

    // See comment earlier in the method for an explanation of how the below logic works.
    if (Sse41.IsSupported)
    {
        if (!Sse41.TestZ(firstVector, asciiMaskForPTEST))
        {
            goto FoundNonAsciiDataInFirstVector;
        }
    }
    else
    {
        currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
        if (currentMask != 0)
        {
            goto FoundNonAsciiDataInCurrentMask;
        }
    }

    // Wasn't the first vector; must be the second.
    pBuffer += SizeOfVector128InChars;
    firstVector = secondVector;

FoundNonAsciiDataInFirstVector:
    // See comment earlier in the method for an explanation of how the below logic works.
    if (Sse41.IsSupported)
    {
        currentMask = (uint)Sse2.MoveMask(Sse41.Min(firstVector.AsUInt16(), asciiMaskForPMINUW).AsByte());
    }
    else
    {
        currentMask = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(firstVector, asciiMaskForPXOR), asciiMaskForPCMPGTW).AsByte());
    }

FoundNonAsciiDataInCurrentMask:
    // The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte.
    // Tzcnt is the correct operation to count the number of zero bits quickly. If this instruction isn't
    // available, we'll fall back to a normal loop. (Even though the original vector used WORD elements,
    // masks work on BYTE elements, and we account for this in the final fixup.)
    Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
    pBuffer = (char *)((byte *)pBuffer + (uint)BitOperations.TrailingZeroCount(currentMask));

    goto Finish;

FoundNonAsciiDataInCurrentDWord:
    // Reached only from the scalar drain paths below, with currentDWord holding two chars of which
    // at least one is non-ASCII and pBuffer pointing at the first of them.
    uint currentDWord;
    Debug.Assert(!AllCharsInUInt32AreAscii(currentDWord), "Shouldn't be here unless we see non-ASCII data.");

    if (FirstCharInUInt32IsAscii(currentDWord))
    {
        pBuffer++; // skip past the ASCII char
    }

    goto Finish;

InputBufferLessThanOneVectorInLength:
    // These code paths get hit if the original input length was less than one vector in size.
    // We can't perform vectorized reads at this point, so we'll fall back to reading primitives
    // directly. Note that all of these reads are unaligned.

    // Reminder: If this code path is hit, bufferLength is still a char count, not a byte count.
    // We skipped the code path that multiplied the count by sizeof(char).
    Debug.Assert(bufferLength < SizeOfVector128InChars);

    // QWORD drain
    if ((bufferLength & 4) != 0)
    {
        if (Bmi1.X64.IsSupported)
        {
            // If we can use 64-bit tzcnt to count the number of leading ASCII chars, prefer it.
            ulong candidateUInt64 = Unsafe.ReadUnaligned <ulong>(pBuffer);
            if (!AllCharsInUInt64AreAscii(candidateUInt64))
            {
                // Clear the low 7 bits (the ASCII bits) of each char, then tzcnt.
                // Remember the / 8 at the end to convert bit count to byte count,
                // then the & ~1 at the end to treat a match in the high byte of
                // any char the same as a match in the low byte of that same char.
                candidateUInt64 &= 0xFF80FF80_FF80FF80ul;
                pBuffer = (char *)((byte *)pBuffer + ((nuint)(Bmi1.X64.TrailingZeroCount(candidateUInt64) / 8) & ~(nuint)1));
                goto Finish;
            }
        }
        else
        {
            // If we can't use 64-bit tzcnt, no worries. We'll just do 2x 32-bit reads instead.
            currentDWord = Unsafe.ReadUnaligned <uint>(pBuffer);
            uint nextDWord = Unsafe.ReadUnaligned <uint>(pBuffer + 4 / sizeof(char));

            if (!AllCharsInUInt32AreAscii(currentDWord | nextDWord))
            {
                // At least one of the values wasn't all-ASCII.
                // We need to figure out which one it was and stick it in the currentDWord local.
                if (AllCharsInUInt32AreAscii(currentDWord))
                {
                    currentDWord = nextDWord; // this one is the culprit
                    pBuffer += 4 / sizeof(char);
                }

                goto FoundNonAsciiDataInCurrentDWord;
            }
        }

        pBuffer += 4; // successfully consumed 4 ASCII chars
    }

    // DWORD drain
    if ((bufferLength & 2) != 0)
    {
        currentDWord = Unsafe.ReadUnaligned <uint>(pBuffer);

        if (!AllCharsInUInt32AreAscii(currentDWord))
        {
            goto FoundNonAsciiDataInCurrentDWord;
        }

        pBuffer += 2; // successfully consumed 2 ASCII chars
    }

    // WORD drain
    // This is the final drain; there's no need for a BYTE drain since our elemental type is 16-bit char.
    if ((bufferLength & 1) != 0)
    {
        if (*pBuffer <= 0x007F)
        {
            pBuffer++; // successfully consumed a single char
        }
    }

    goto Finish;
}
/// <summary>
/// Validates a UTF-16 buffer and returns a pointer to the first char that is not part of
/// well-formed UTF-16 (an unpaired surrogate), or to the end of the buffer if all data is valid.
/// </summary>
/// <param name="pInputBuffer">Pointer to the first UTF-16 code unit to validate.</param>
/// <param name="inputLength">Number of UTF-16 code units available at <paramref name="pInputBuffer"/>.</param>
/// <param name="utf8CodeUnitCountAdjustment">
/// Receives the value to add to the number of consumed UTF-16 code units to obtain the number of
/// UTF-8 code units needed to encode the validated prefix.
/// </param>
/// <param name="scalarCountAdjustment">
/// Receives the value to add to the number of consumed UTF-16 code units to obtain the number of
/// Unicode scalar values in the validated prefix (i.e. minus one per surrogate pair).
/// </param>
/// <returns>Pointer to the first invalid char, or one past the last char when the input is valid.</returns>
private static unsafe char* GetPointerToFirstInvalidChar(char* pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment)
{
    // First, handle the common case of all-ASCII. If this consumes the entire buffer we skip the
    // remainder of this method. The pre-scan is SSE2-only (it asserts that its caller checked),
    // so it is skipped on hardware without SSE2; the paths below handle ASCII correctly anyway.
    int numAsciiCharsConsumedJustNow = 0;
    if (Sse2.IsSupported)
    {
        numAsciiCharsConsumedJustNow = (int)GetIndexOfFirstNonAsciiChar_Sse2(pInputBuffer, (uint)inputLength);
    }

    // Advance the pointer AND shrink the remaining length together. Leaving the length untouched
    // would make every loop below read past the end of the buffer by the size of the ASCII prefix.
    pInputBuffer += (uint)numAsciiCharsConsumedJustNow;
    inputLength -= numAsciiCharsConsumedJustNow;

    if (inputLength == 0)
    {
        utf8CodeUnitCountAdjustment = 0;
        scalarCountAdjustment = 0;
        return pInputBuffer;
    }

    // If we got here, we saw some non-ASCII data, so within the vectorized code paths below we
    // handle all non-surrogate UTF-16 code points branchlessly and only branch on surrogates.
    //
    // We still optimistically assume the data is mostly ASCII, which means the number of UTF-8
    // code units and the number of scalars almost match the number of UTF-16 code units. As we
    // find non-ASCII chars we track "adjustment" fixups: add the UTF-8 adjustment to the UTF-16
    // code unit count to get the UTF-8 code unit count; add the scalar adjustment to the UTF-16
    // code unit count to get the scalar count.

    long tempUtf8CodeUnitCountAdjustment = 0;
    int tempScalarCountAdjustment = 0;

    if (Sse41.IsSupported)
    {
        if (inputLength >= Vector128<ushort>.Count)
        {
            Vector128<ushort> vector0080 = Vector128.Create((ushort)0x80);
            Vector128<ushort> vector0800 = Sse2.ShiftLeftLogical(vector0080, 4); // = 0x0800
            Vector128<ushort> vectorA800 = Vector128.Create((ushort)0xA800);
            Vector128<short> vector8800 = Vector128.Create(unchecked((short)0x8800));

            do
            {
                Vector128<ushort> utf16Data = Sse2.LoadVector128((ushort*)pInputBuffer);

                uint mask = (uint)Sse2.MoveMask(
                    Sse2.Or(
                        Sse2.ShiftLeftLogical(Sse41.Min(utf16Data, vector0080), 8),
                        Sse2.ShiftRightLogical(Sse41.Min(utf16Data, vector0800), 4)).AsByte());

                // Each odd bit of mask is 1 only if the char was >= 0x0080,
                // and each even bit of mask is 1 only if the char was >= 0x0800.
                //
                // Example for UTF-16 input "[ 0123 ] [ 1234 ] ...":
                //
                //            ,-- set if char[1] is non-ASCII
                //            |   ,-- set if char[0] is non-ASCII
                //            v   v
                // mask = ... 1 1 1 0
                //              ^   ^-- set if char[0] is >= 0x800
                //              `-- set if char[1] is >= 0x800
                //
                // popcnt of the mask is therefore the number of *additional* UTF-8 bytes the
                // UTF-16 code units in this vector need. Surrogate code units are over-counted
                // (3 bytes each instead of 4 per pair); that is corrected below.
                //
                // The count is kept in a local and only folded into the running total once the
                // vector has passed surrogate validation. If we bail out to the scalar loop, that
                // loop re-processes this whole vector, so adding it here would count it twice.
                uint popcnt = (uint)BitOperations.PopCount(mask);

                // Surrogates need special handling: (a) to undo the over-count above and
                // (b) because they require pairing validation. Adding 0xA800 maps
                // D800..DFFF to 8000..87FF, the only values below 0x8800 as signed words.
                utf16Data = Sse2.Add(utf16Data, vectorA800);
                mask = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());

                if (mask != 0)
                {
                    // There's at least one UTF-16 surrogate code unit present.
                    // Since we performed a pmovmskb on the result of a 16-bit compare,
                    // the bits of 'mask' occur in pairs:
                    // - 00 if the corresponding UTF-16 char was not a surrogate code unit;
                    // - 11 if the corresponding UTF-16 char was a surrogate code unit.
                    //
                    // A surrogate code unit has the bit pattern [ 11011q## ######## ], where
                    // q = 0 is a high surrogate and q = 1 a low surrogate. After adding 0xA800
                    // it is [ 10000q## ######## ]; shifting each word right by 3 yields
                    // [ 00010000 q####### ], so pmovmskb exposes q directly.
                    //
                    // Therefore the bits of 'mask2' occur in pairs:
                    // - 00 if the corresponding UTF-16 char was a high surrogate code unit;
                    // - 01 if the corresponding UTF-16 char was a low surrogate code unit;
                    // - ## (garbage) if the corresponding UTF-16 char was not a surrogate code unit.
                    uint mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte());

                    // Both results are ANDed with 'mask' so the garbage bits of non-surrogate
                    // chars can never be mistaken for a surrogate.
                    uint lowSurrogatesMask = mask2 & mask;              // 01 only if was a low surrogate char, else 00
                    uint highSurrogatesMask = (mask2 ^ 0x5555u) & mask; // 01 only if was a high surrogate char, else 00

                    // Check that each high surrogate is followed by a low surrogate and that each
                    // low surrogate follows a high surrogate. A high surrogate in the final lane
                    // is tolerated: it is validated on the next iteration together with its
                    // (hopefully) matching low surrogate.
                    highSurrogatesMask <<= 2;
                    if ((ushort)highSurrogatesMask != lowSurrogatesMask)
                    {
                        goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                    }

                    if (highSurrogatesMask > ushort.MaxValue)
                    {
                        // There was a standalone high surrogate at the end of the vector.
                        // Un-consume it: drop it from the pair count, back out the +2 it
                        // contributed to popcnt, and rewind so the next read starts on it.
                        highSurrogatesMask = (ushort)highSurrogatesMask;
                        popcnt -= 2;
                        pInputBuffer--;
                        inputLength++;
                    }

                    int surrogatePairsCount = BitOperations.PopCount(highSurrogatesMask);

                    // 2 UTF-16 chars become 1 Unicode scalar
                    tempScalarCountAdjustment -= surrogatePairsCount;

                    // Each surrogate code unit was counted as 3 UTF-8 code units (+2 each, +4
                    // per pair), but a pair encodes as 4 UTF-8 code units (+2 per pair).
                    popcnt -= 2 * (uint)surrogatePairsCount;
                }

                tempUtf8CodeUnitCountAdjustment += popcnt;

                pInputBuffer += Vector128<ushort>.Count;
                inputLength -= Vector128<ushort>.Count;
            } while (inputLength >= Vector128<ushort>.Count);
        }
    }
    else if (Vector.IsHardwareAccelerated)
    {
        if (inputLength >= Vector<ushort>.Count)
        {
            Vector<ushort> vector0080 = new Vector<ushort>(0x0080);
            Vector<ushort> vector0400 = new Vector<ushort>(0x0400);
            Vector<ushort> vector0800 = new Vector<ushort>(0x0800);
            Vector<ushort> vectorD800 = new Vector<ushort>(0xD800);

            do
            {
                // 'twoOrMoreUtf8Bytes' and 'threeOrMoreUtf8Bytes' contain 0xFFFF (-1 as a signed
                // word) iff the corresponding code unit was >= 0x0080 and >= 0x0800 respectively.
                // Their sum per element is one of:
                //
                // 0x0000 ( 0) = original char was 0000..007F
                // 0xFFFF (-1) = original char was 0080..07FF
                // 0xFFFE (-2) = original char was 0800..FFFF
                //
                // Negating gives 0..2 per element; summing all elements gives the number of
                // *additional* UTF-8 code units required. Surrogates are over-counted and fixed
                // up below.
                Vector<ushort> utf16Data = Unsafe.ReadUnaligned<Vector<ushort>>(pInputBuffer);
                Vector<ushort> twoOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0080);
                Vector<ushort> threeOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0800);
                Vector<nuint> sumVector = (Vector<nuint>)(-Vector.Add(twoOrMoreUtf8Bytes, threeOrMoreUtf8Bytes));

                // Sum a natural word (rather than a 16-bit word) at a time, then fold the packed
                // 16-bit partial sums together.
                nuint popcnt = 0;
                for (int i = 0; i < Vector<nuint>.Count; i++)
                {
                    popcnt += sumVector[i];
                }

                uint popcnt32 = (uint)popcnt;
                if (sizeof(nuint) == sizeof(ulong))
                {
                    popcnt32 += (uint)(popcnt >> 32);
                }

                // As in the SSE4.1 path, keep the count local until surrogate validation passes
                // so that a bail-out to the scalar loop cannot count this vector twice.
                popcnt32 = (ushort)popcnt32 + (popcnt32 >> 16);

                // Now check for surrogates. Subtracting 0xD800 maps D800..DFFF to 0000..07FF.
                utf16Data -= vectorD800;
                Vector<ushort> surrogateChars = Vector.LessThan(utf16Data, vector0800);

                if (surrogateChars != Vector<ushort>.Zero)
                {
                    // Build 'highSurrogateChars' and 'lowSurrogateChars', whose elements are
                    // 0xFFFF iff the original code unit was a high / low surrogate respectively.
                    Vector<ushort> highSurrogateChars = Vector.LessThan(utf16Data, vector0400);
                    Vector<ushort> lowSurrogateChars = Vector.AndNot(surrogateChars, highSurrogateChars);

                    // A low surrogate in the first element has no preceding high surrogate: had
                    // the previous vector ended in one, we would have rewound onto it. The pairing
                    // loop below never inspects element 0 as a low surrogate, so check it here.
                    if (lowSurrogateChars[0] != 0)
                    {
                        goto NonVectorizedLoop; // error: standalone low surrogate; let the scalar loop locate it
                    }

                    // Ensure each high surrogate is followed by a low surrogate and vice versa.
                    // The very last element is skipped as a high surrogate since its partner
                    // (if any) lives in the next vector.
                    ushort surrogatePairsCount = 0;
                    for (int i = 0; i < Vector<ushort>.Count - 1; i++)
                    {
                        surrogatePairsCount -= highSurrogateChars[i]; // subtracting 0xFFFF wraps to +1
                        if (highSurrogateChars[i] != lowSurrogateChars[i + 1])
                        {
                            goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                        }
                    }

                    if (highSurrogateChars[Vector<ushort>.Count - 1] != 0)
                    {
                        // There was a standalone high surrogate at the end of the vector.
                        // Un-consume it: rewind and back out the +2 it contributed to popcnt32.
                        // It was never included in surrogatePairsCount, so the scalar
                        // adjustment must NOT be touched here.
                        pInputBuffer--;
                        inputLength++;
                        popcnt32 -= 2;
                    }

                    // 2 UTF-16 chars become 1 Unicode scalar
                    tempScalarCountAdjustment -= surrogatePairsCount;

                    // Each surrogate code unit was counted as 3 UTF-8 code units (+2 each, +4
                    // per pair), but a pair encodes as 4 UTF-8 code units (+2 per pair).
                    popcnt32 -= 2u * surrogatePairsCount;
                }

                tempUtf8CodeUnitCountAdjustment += popcnt32;

                pInputBuffer += Vector<ushort>.Count;
                inputLength -= Vector<ushort>.Count;
            } while (inputLength >= Vector<ushort>.Count);
        }
    }

NonVectorizedLoop:

    // Vectorization isn't supported on our current platform, or the input was too small to benefit
    // from vectorization, or we saw invalid UTF-16 data in the vectorized code paths and need to
    // drain remaining valid chars before we report failure.

    for (; inputLength > 0; pInputBuffer++, inputLength--)
    {
        uint thisChar = pInputBuffer[0];
        if (thisChar <= 0x7F)
        {
            continue;
        }

        // Bump adjustment by +1 for U+0080..U+07FF; by +2 for U+0800..U+FFFF.
        // This optimistically assumes no surrogates, which we'll handle shortly.
        tempUtf8CodeUnitCountAdjustment += (thisChar + 0x0001_F800u) >> 16;

        if (!IsSurrogateCodePoint(thisChar))
        {
            continue;
        }

        // Found a surrogate char. Back out the adjustment we made above, then try to consume
        // the entire surrogate pair at once. We don't decode the pair; we only validate that
        // its bit pattern is [high surrogate][low surrogate].
        tempUtf8CodeUnitCountAdjustment -= 2;

        if (inputLength == 1)
        {
            goto Error; // input buffer too small to read a surrogate pair
        }

        thisChar = Unsafe.ReadUnaligned<uint>(pInputBuffer);
        if (((thisChar - (BitConverter.IsLittleEndian ? 0xDC00_D800u : 0xD800_DC00u)) & 0xFC00_FC00u) != 0)
        {
            goto Error; // not a well-formed surrogate pair
        }

        tempScalarCountAdjustment--;          // 2 UTF-16 code units -> 1 scalar
        tempUtf8CodeUnitCountAdjustment += 2; // 2 UTF-16 code units -> 4 UTF-8 code units

        pInputBuffer++; // consumed one extra char
        inputLength--;
    }

Error:

    // Also used for normal return.
    utf8CodeUnitCountAdjustment = tempUtf8CodeUnitCountAdjustment;
    scalarCountAdjustment = tempScalarCountAdjustment;
    return pInputBuffer;
}
/// <summary>
/// Element-wise minimum of two vectors of unsigned 16-bit integers; a thin wrapper over
/// <c>Sse41.Min</c> named after the corresponding C intrinsic.
/// </summary>
/// <param name="left">First operand.</param>
/// <param name="right">Second operand.</param>
/// <returns>A vector whose every element is the smaller of the corresponding input elements.</returns>
public static Vector128<ushort> _mm_min_epu16(Vector128<ushort> left, Vector128<ushort> right)
    => Sse41.Min(left, right);