/// <summary>
/// Scenario: the operands are fields of a local <c>TestStruct</c> and are read through
/// pointers with <c>Sse2.LoadVector128</c>. The <c>Sse2.AndNot</c> result is written to the
/// data table's output buffer and then validated against the two source fields.
/// </summary>
public void RunStructLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario_Load));

    // A fresh struct instance living in this method's frame supplies both operands.
    var localStruct = TestStruct.Create();

    // The loads stay nested inside the AndNot call, exactly as in the other *_Load scenarios.
    var andNotResult = Sse2.AndNot(
        Sse2.LoadVector128((Int32*)(&localStruct._fld1)),
        Sse2.LoadVector128((Int32*)(&localStruct._fld2))
    );

    Unsafe.Write(_dataTable.outArrayPtr, andNotResult);
    ValidateResult(localStruct._fld1, localStruct._fld2, _dataTable.outArrayPtr);
}
/// <summary>
/// Scenario: the operands are this instance's own <c>_fld1</c>/<c>_fld2</c>, pinned and read
/// through pointers with <c>Sse2.LoadVector128</c>. The <c>Sse2.AndNot</c> result is written
/// to the output buffer owned by <paramref name="testClass"/> and validated there.
/// </summary>
/// <param name="testClass">Test instance that provides the output buffer and the validator.</param>
public void RunStructFldScenario_Load(SimpleBinaryOpTest__AndNotUInt32 testClass)
{
    fixed (Vector128<UInt32>* pLeft = &_fld1)
    fixed (Vector128<UInt32>* pRight = &_fld2)
    {
        var andNotResult = Sse2.AndNot(
            Sse2.LoadVector128((UInt32*)(pLeft)),
            Sse2.LoadVector128((UInt32*)(pRight))
        );

        Unsafe.Write(testClass._dataTable.outArrayPtr, andNotResult);
        testClass.ValidateResult(_fld1, _fld2, testClass._dataTable.outArrayPtr);
    }
}
/// <summary>
/// Scenario: the operands are this instance's <c>_fld1</c>/<c>_fld2</c>, pinned and read
/// through pointers with <c>Sse2.LoadVector128</c>. The <c>Sse2.AndNot</c> result is written
/// to the data table's output buffer and then validated against the two fields.
/// </summary>
public void RunClassFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load));

    fixed (Vector128<UInt32>* pLeft = &_fld1)
    fixed (Vector128<UInt32>* pRight = &_fld2)
    {
        var andNotResult = Sse2.AndNot(
            Sse2.LoadVector128((UInt32*)(pLeft)),
            Sse2.LoadVector128((UInt32*)(pRight))
        );

        Unsafe.Write(_dataTable.outArrayPtr, andNotResult);
        ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Scenario: the operands are the class-level variables <c>_clsVar1</c>/<c>_clsVar2</c>,
/// pinned and read through pointers with <c>Sse2.LoadVector128</c>. The <c>Sse2.AndNot</c>
/// result is written to the data table's output buffer and then validated.
/// </summary>
public void RunClsVarScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario_Load));

    fixed (Vector128<Double>* pLeft = &_clsVar1)
    fixed (Vector128<Double>* pRight = &_clsVar2)
    {
        var andNotResult = Sse2.AndNot(
            Sse2.LoadVector128((Double*)(pLeft)),
            Sse2.LoadVector128((Double*)(pRight))
        );

        Unsafe.Write(_dataTable.outArrayPtr, andNotResult);
        ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Scenario: the operands are the fields of a newly constructed
/// <c>SimpleBinaryOpTest__AndNotDouble</c> held in a local, pinned and read through pointers
/// with <c>Sse2.LoadVector128</c>. The <c>Sse2.AndNot</c> result is written to this instance's
/// output buffer and validated against the local object's fields.
/// </summary>
public void RunClassLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

    var localInstance = new SimpleBinaryOpTest__AndNotDouble();

    fixed (Vector128<Double>* pLeft = &localInstance._fld1)
    fixed (Vector128<Double>* pRight = &localInstance._fld2)
    {
        var andNotResult = Sse2.AndNot(
            Sse2.LoadVector128((Double*)(pLeft)),
            Sse2.LoadVector128((Double*)(pRight))
        );

        Unsafe.Write(_dataTable.outArrayPtr, andNotResult);
        ValidateResult(localInstance._fld1, localInstance._fld2, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Bitwise select: for every bit position, takes the bit from <paramref name="ifTrue"/> where the
/// corresponding bit of <paramref name="selector"/> is 1 and from <paramref name="ifFalse"/> where it is 0.
/// </summary>
/// <param name="selector">Bit mask choosing between the two sources.</param>
/// <param name="ifTrue">Source of the bits where <paramref name="selector"/> is set.</param>
/// <param name="ifFalse">Source of the bits where <paramref name="selector"/> is clear.</param>
/// <returns><c>(ifTrue &amp; selector) | (~selector &amp; ifFalse)</c>.</returns>
/// <exception cref="PlatformNotSupportedException">Neither AdvSimd nor SSE2 is available.</exception>
public static Vector128<double> ConditionalSelectBitwise(Vector128<double> selector, Vector128<double> ifTrue, Vector128<double> ifFalse)
{
    // The and/andnot/or composition is credited in the original comment to the DirectX Math Library:
    // https://github.com/microsoft/DirectXMath/blob/master/Inc/DirectXMathVector.inl

    if (AdvSimd.IsSupported)
    {
        // Arm has a single instruction for this.
        return AdvSimd.BitwiseSelect(selector, ifTrue, ifFalse);
    }

    if (!Sse2.IsSupported)
    {
        throw new PlatformNotSupportedException();
    }

    Vector128<double> bitsFromTrue = Sse2.And(ifTrue, selector);
    Vector128<double> bitsFromFalse = Sse2.AndNot(selector, ifFalse); // ~selector & ifFalse
    return Sse2.Or(bitsFromTrue, bitsFromFalse);
}
// Returns &inputBuffer[inputLength] if the input buffer is valid.
/// <summary>
/// Given an input buffer <paramref name="pInputBuffer"/> of char length <paramref name="inputLength"/>,
/// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
/// </summary>
/// <param name="pInputBuffer">Pointer to the first UTF-16 code unit; may be null only when <paramref name="inputLength"/> is zero.</param>
/// <param name="inputLength">Number of UTF-16 code units in the buffer; must not be negative.</param>
/// <param name="utf8CodeUnitCountAdjustment">
/// On return, the value to add to the number of UTF-16 code units consumed (up to the returned pointer)
/// to obtain the number of UTF-8 code units needed to encode them.
/// </param>
/// <param name="scalarCountAdjustment">
/// On return, the value to add to the number of UTF-16 code units consumed (up to the returned pointer)
/// to obtain the number of Unicode scalar values they represent.
/// </param>
/// <returns>A pointer to the first invalid char, or to the end of the buffer if none was found.</returns>
/// <remarks>
/// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
/// </remarks>
public static char *GetPointerToFirstInvalidChar(char *pInputBuffer, int inputLength, out long utf8CodeUnitCountAdjustment, out int scalarCountAdjustment)
{
    Debug.Assert(inputLength >= 0, "Input length must not be negative.");
    Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");

    // First, we'll handle the common case of all-ASCII. If this is able to
    // consume the entire buffer, we'll skip the remainder of this method's logic.

    int numAsciiCharsConsumedJustNow = (int)ASCIIUtility.GetIndexOfFirstNonAsciiChar(pInputBuffer, (uint)inputLength);
    Debug.Assert(0 <= numAsciiCharsConsumedJustNow && numAsciiCharsConsumedJustNow <= inputLength);

    pInputBuffer += (uint)numAsciiCharsConsumedJustNow;
    inputLength -= numAsciiCharsConsumedJustNow;

    if (inputLength == 0)
    {
        // Entire buffer was ASCII: one UTF-8 code unit and one scalar per char, so no adjustments.
        utf8CodeUnitCountAdjustment = 0;
        scalarCountAdjustment = 0;
        return(pInputBuffer);
    }

    // If we got here, it means we saw some non-ASCII data, so within our
    // vectorized code paths below we'll handle all non-surrogate UTF-16
    // code points branchlessly. We'll only branch if we see surrogates.
    //
    // We still optimistically assume the data is mostly ASCII. This means that the
    // number of UTF-8 code units and the number of scalars almost matches the number
    // of UTF-16 code units. As we go through the input and find non-ASCII
    // characters, we'll keep track of these "adjustment" fixups. To get the
    // total number of UTF-8 code units required to encode the input data, add
    // the UTF-8 code unit count adjustment to the number of UTF-16 code units
    // seen. To get the total number of scalars present in the input data,
    // add the scalar count adjustment to the number of UTF-16 code units seen.

    long tempUtf8CodeUnitCountAdjustment = 0;
    int tempScalarCountAdjustment = 0;

    if ((AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) || Sse2.IsSupported)
    {
        if (inputLength >= Vector128 <ushort> .Count)
        {
            Vector128 <ushort> vector0080 = Vector128.Create((ushort)0x80);
            Vector128 <ushort> vectorA800 = Vector128.Create((ushort)0xA800);
            Vector128 <short> vector8800 = Vector128.Create(unchecked ((short)0x8800));
            Vector128 <ushort> vectorZero = Vector128 <ushort> .Zero;

            do
            {
                Vector128 <ushort> utf16Data;
                if (AdvSimd.Arm64.IsSupported)
                {
                    utf16Data = AdvSimd.LoadVector128((ushort *)pInputBuffer); // unaligned
                }
                else
                {
                    utf16Data = Sse2.LoadVector128((ushort *)pInputBuffer); // unaligned
                }

                Vector128 <ushort> charIsNonAscii;

                if (AdvSimd.Arm64.IsSupported)
                {
                    // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                    // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)
                    charIsNonAscii = AdvSimd.Min(utf16Data, vector0080);
                }
                else if (Sse41.IsSupported)
                {
                    // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                    // input was 0x0080 <= [value]. (i.e., [value] is non-ASCII.)
                    charIsNonAscii = Sse41.Min(utf16Data, vector0080);
                }
                else
                {
                    // Sets the 0x0080 bit of each element in 'charIsNonAscii' if the corresponding
                    // input was 0x0080 <= [value] <= 0x7FFF. The case where 0x8000 <= [value] will
                    // be handled in a few lines.
                    charIsNonAscii = Sse2.AndNot(Sse2.CompareGreaterThan(vector0080.AsInt16(), utf16Data.AsInt16()).AsUInt16(), vector0080);
                }

#if DEBUG
                // Quick check to ensure we didn't accidentally set the 0x8000 bit of any element.
                uint debugMask;
                if (AdvSimd.Arm64.IsSupported)
                {
                    debugMask = GetNonAsciiBytes(charIsNonAscii.AsByte());
                }
                else
                {
                    debugMask = (uint)Sse2.MoveMask(charIsNonAscii.AsByte());
                }
                Debug.Assert((debugMask & 0b_1010_1010_1010_1010) == 0, "Shouldn't have set the 0x8000 bit of any element in 'charIsNonAscii'.");
#endif // DEBUG

                // Sets the 0x8080 bits of each element in 'charIsNonAscii' if the corresponding
                // input was 0x0800 <= [value]. This also handles the missing range a few lines above.

                Vector128 <ushort> charIsThreeByteUtf8Encoded;
                uint mask;

                // NOTE(review): this check uses AdvSimd.IsSupported while the neighbouring checks use
                // AdvSimd.Arm64.IsSupported; the enclosing 'if' only admits Arm64 or SSE2, so confirm
                // the difference is intentional.
                if (AdvSimd.IsSupported)
                {
                    charIsThreeByteUtf8Encoded = AdvSimd.Subtract(vectorZero, AdvSimd.ShiftRightLogical(utf16Data, 11));
                    mask = GetNonAsciiBytes(AdvSimd.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
                }
                else
                {
                    charIsThreeByteUtf8Encoded = Sse2.Subtract(vectorZero, Sse2.ShiftRightLogical(utf16Data, 11));
                    mask = (uint)Sse2.MoveMask(Sse2.Or(charIsNonAscii, charIsThreeByteUtf8Encoded).AsByte());
                }

                // Each even bit of mask will be 1 only if the char was >= 0x0080,
                // and each odd bit of mask will be 1 only if the char was >= 0x0800.
                //
                // Example for UTF-16 input "[ 0123 ] [ 1234 ] ...":
                //
                //            ,-- set if char[1] is >= 0x0800
                //            |   ,-- set if char[0] is >= 0x0800
                //            v   v
                // mask = ... 1 1 0 1
                //              ^   ^-- set if char[0] is non-ASCII
                //              `-- set if char[1] is non-ASCII
                //
                // This means we can popcnt the number of set bits, and the result is the
                // number of *additional* UTF-8 bytes that each UTF-16 code unit requires as
                // it expands. This results in the wrong count for UTF-16 surrogate code
                // units (we just counted that each individual code unit expands to 3 bytes,
                // but in reality a well-formed UTF-16 surrogate pair expands to 4 bytes).
                // We'll handle this in just a moment.
                //
                // For now, compute the popcnt but squirrel it away. We'll fold it in to the
                // cumulative UTF-8 adjustment factor once we determine that there are no
                // unpaired surrogates in our data. (Unpaired surrogates would invalidate
                // our computed result and we'd have to throw it away.)

                uint popcnt = (uint)BitOperations.PopCount(mask);

                // Surrogates need to be special-cased for two reasons: (a) we need
                // to account for the fact that we over-counted in the addition above;
                // and (b) they require separate validation.

                // Adding 0xA800 maps the surrogate range D800..DFFF onto 8000..87FF, which are exactly
                // the values that compare (signed) less than 0x8800.
                if (AdvSimd.Arm64.IsSupported)
                {
                    utf16Data = AdvSimd.Add(utf16Data, vectorA800);
                    mask = GetNonAsciiBytes(AdvSimd.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
                }
                else
                {
                    utf16Data = Sse2.Add(utf16Data, vectorA800);
                    mask = (uint)Sse2.MoveMask(Sse2.CompareLessThan(utf16Data.AsInt16(), vector8800).AsByte());
                }

                if (mask != 0)
                {
                    // There's at least one UTF-16 surrogate code unit present.
                    // Since we performed a pmovmskb operation on the result of a 16-bit pcmpgtw,
                    // the resulting bits of 'mask' will occur in pairs:
                    // - 00 if the corresponding UTF-16 char was not a surrogate code unit;
                    // - 11 if the corresponding UTF-16 char was a surrogate code unit.
                    //
                    // A UTF-16 high/low surrogate code unit has the bit pattern [ 11011q## ######## ],
                    // where # is any bit; q = 0 represents a high surrogate, and q = 1 represents
                    // a low surrogate. Since we added 0xA800 in the vectorized operation above,
                    // our surrogate pairs will now have the bit pattern [ 10000q## ######## ].
                    // If we logical right-shift each word by 3, we'll end up with the bit pattern
                    // [ 00010000 q####### ], which means that we can immediately use pmovmskb to
                    // determine whether a given char was a high or a low surrogate.
                    //
                    // Therefore the resulting bits of 'mask2' will occur in pairs:
                    // - 00 if the corresponding UTF-16 char was a high surrogate code unit;
                    // - 01 if the corresponding UTF-16 char was a low surrogate code unit;
                    // - ## (garbage) if the corresponding UTF-16 char was not a surrogate code unit.
                    //   Since 'mask' already has 00 in these positions (since the corresponding char
                    //   wasn't a surrogate), "mask AND mask2 == 00" holds for these positions.

                    uint mask2;
                    if (AdvSimd.Arm64.IsSupported)
                    {
                        mask2 = GetNonAsciiBytes(AdvSimd.ShiftRightLogical(utf16Data, 3).AsByte());
                    }
                    else
                    {
                        mask2 = (uint)Sse2.MoveMask(Sse2.ShiftRightLogical(utf16Data, 3).AsByte());
                    }

                    // 'lowSurrogatesMask' has its bits occur in pairs:
                    // - 01 if the corresponding char was a low surrogate char,
                    // - 00 if the corresponding char was a high surrogate char or not a surrogate at all.

                    uint lowSurrogatesMask = mask2 & mask;

                    // 'highSurrogatesMask' has its bits occur in pairs:
                    // - 01 if the corresponding char was a high surrogate char,
                    // - 00 if the corresponding char was a low surrogate char or not a surrogate at all.

                    uint highSurrogatesMask = (mask2 ^ 0b_0101_0101_0101_0101u /* flip all even-numbered bits 00 <-> 01 */) & mask;

                    Debug.Assert((highSurrogatesMask & lowSurrogatesMask) == 0, "A char cannot simultaneously be both a high and a low surrogate char.");

                    Debug.Assert(((highSurrogatesMask | lowSurrogatesMask) & 0b_1010_1010_1010_1010u) == 0, "Only even bits (no odd bits) of the masks should be set.");

                    // Now check that each high surrogate is followed by a low surrogate and that each
                    // low surrogate follows a high surrogate. We make an exception for the case where
                    // the final char of the vector is a high surrogate, since we can't perform validation
                    // on it until the next iteration of the loop when we hope to consume the matching
                    // low surrogate.

                    // Shifting by 2 bits (one char position) lines each high surrogate up with the char after it.
                    highSurrogatesMask <<= 2;
                    if ((ushort)highSurrogatesMask != lowSurrogatesMask)
                    {
                        goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                    }

                    if (highSurrogatesMask > ushort.MaxValue)
                    {
                        // There was a standalone high surrogate at the end of the vector.
                        // We'll adjust our counters so that we don't consider this char consumed.

                        highSurrogatesMask = (ushort)highSurrogatesMask; // don't allow stray high surrogate to be consumed by popcnt
                        popcnt -= 2; // the '0xC000_0000' bits in the original mask are shifted out and discarded, so account for that here
                        pInputBuffer--;
                        inputLength++;
                    }

                    // If we're 64-bit, we can perform the zero-extension of the surrogate pairs count for
                    // free right now, saving the extension step a few lines below. If we're 32-bit, the
                    // conversion to nuint immediately below is a no-op, and we'll pay the cost of the real
                    // 64-bit extension a few lines below.
                    nuint surrogatePairsCountNuint = (uint)BitOperations.PopCount(highSurrogatesMask);

                    // 2 UTF-16 chars become 1 Unicode scalar

                    tempScalarCountAdjustment -= (int)surrogatePairsCountNuint;

                    // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                    // it'd be encoded as 3 UTF-8 code units, so our earlier popcnt computation
                    // assumes that the pair is encoded as 6 UTF-8 code units. Since each
                    // pair is in reality only encoded as 4 UTF-8 code units, we need to
                    // perform this adjustment now.

                    if (IntPtr.Size == 8)
                    {
                        // Since we've already zero-extended surrogatePairsCountNuint, we can directly
                        // sub + sub. It's more efficient than shl + sub.
                        tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
                        tempUtf8CodeUnitCountAdjustment -= (long)surrogatePairsCountNuint;
                    }
                    else
                    {
                        // Take the hit of the 64-bit extension now.
                        tempUtf8CodeUnitCountAdjustment -= 2 * (uint)surrogatePairsCountNuint;
                    }
                }

                // No unpaired surrogates in this vector: commit the deferred count and advance.
                tempUtf8CodeUnitCountAdjustment += popcnt;
                pInputBuffer += Vector128 <ushort> .Count;
                inputLength -= Vector128 <ushort> .Count;
            } while (inputLength >= Vector128 <ushort> .Count);
        }
    }
    else if (Vector.IsHardwareAccelerated)
    {
        if (inputLength >= Vector <ushort> .Count)
        {
            Vector <ushort> vector0080 = new Vector <ushort>(0x0080);
            Vector <ushort> vector0400 = new Vector <ushort>(0x0400);
            Vector <ushort> vector0800 = new Vector <ushort>(0x0800);
            Vector <ushort> vectorD800 = new Vector <ushort>(0xD800);

            do
            {
                // The 'twoOrMoreUtf8Bytes' and 'threeOrMoreUtf8Bytes' vectors will contain
                // elements whose values are 0xFFFF (-1 as signed word) iff the corresponding
                // UTF-16 code unit was >= 0x0080 and >= 0x0800, respectively. By summing these
                // vectors, each element of the sum will contain one of three values:
                //
                // 0x0000 ( 0) = original char was 0000..007F
                // 0xFFFF (-1) = original char was 0080..07FF
                // 0xFFFE (-2) = original char was 0800..FFFF
                //
                // We'll negate them to produce a value 0..2 for each element, then sum all the
                // elements together to produce the number of *additional* UTF-8 code units
                // required to represent this UTF-16 data. This is similar to the popcnt step
                // performed by the SSE2 code path. This will overcount surrogates, but we'll
                // handle that shortly.

                Vector <ushort> utf16Data = Unsafe.ReadUnaligned <Vector <ushort> >(pInputBuffer);
                Vector <ushort> twoOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0080);
                Vector <ushort> threeOrMoreUtf8Bytes = Vector.GreaterThanOrEqual(utf16Data, vector0800);
                Vector <nuint_t> sumVector = (Vector <nuint_t>)(Vector <ushort> .Zero - twoOrMoreUtf8Bytes - threeOrMoreUtf8Bytes);

                // We'll try summing by a natural word (rather than a 16-bit word) at a time,
                // which should halve the number of operations we must perform.

                nuint popcnt = 0;
                for (int i = 0; i < Vector <nuint_t> .Count; i++)
                {
                    popcnt += (nuint)sumVector[i];
                }

                // Fold the packed 16-bit partial sums together: 64 -> 32 bits here, 32 -> 16 bits below.
                uint popcnt32 = (uint)popcnt;
                if (IntPtr.Size == 8)
                {
                    popcnt32 += (uint)(popcnt >> 32);
                }

                // As in the SSE4.1 paths, compute popcnt but don't fold it in until we
                // know there aren't any unpaired surrogates in the input data.

                popcnt32 = (ushort)popcnt32 + (popcnt32 >> 16);

                // Now check for surrogates.

                utf16Data -= vectorD800;
                Vector <ushort> surrogateChars = Vector.LessThan(utf16Data, vector0800);
                if (surrogateChars != Vector <ushort> .Zero)
                {
                    // There's at least one surrogate (high or low) UTF-16 code unit in
                    // the vector. We'll build up additional vectors: 'highSurrogateChars'
                    // and 'lowSurrogateChars', where the elements are 0xFFFF iff the original
                    // UTF-16 code unit was a high or low surrogate, respectively.

                    Vector <ushort> highSurrogateChars = Vector.LessThan(utf16Data, vector0400);
                    Vector <ushort> lowSurrogateChars = Vector.AndNot(surrogateChars, highSurrogateChars);

                    // We want to make sure that each high surrogate code unit is followed by
                    // a low surrogate code unit and each low surrogate code unit follows a
                    // high surrogate code unit. Since we don't have an equivalent of pmovmskb
                    // or palignr available to us, we'll do this as a loop. We won't look at
                    // the very last high surrogate char element since we don't yet know if
                    // the next vector read will have a low surrogate char element.

                    if (lowSurrogateChars[0] != 0)
                    {
                        goto Error; // error: start of buffer contains standalone low surrogate char
                    }

                    ushort surrogatePairsCount = 0;
                    for (int i = 0; i < Vector <ushort> .Count - 1; i++)
                    {
                        surrogatePairsCount -= highSurrogateChars[i]; // turns into +1 or +0
                        if (highSurrogateChars[i] != lowSurrogateChars[i + 1])
                        {
                            goto NonVectorizedLoop; // error: mismatched surrogate pair; break out of vectorized logic
                        }
                    }

                    if (highSurrogateChars[Vector <ushort> .Count - 1] != 0)
                    {
                        // There was a standalone high surrogate at the end of the vector.
                        // We'll adjust our counters so that we don't consider this char consumed.

                        pInputBuffer--;
                        inputLength++;
                        popcnt32 -= 2;
                    }

                    nint surrogatePairsCountNint = (nint)surrogatePairsCount; // zero-extend to native int size

                    // 2 UTF-16 chars become 1 Unicode scalar

                    tempScalarCountAdjustment -= (int)surrogatePairsCountNint;

                    // Since each surrogate code unit was >= 0x0800, we eagerly assumed
                    // it'd be encoded as 3 UTF-8 code units. Each surrogate half is only
                    // encoded as 2 UTF-8 code units (for 4 UTF-8 code units total),
                    // so we'll adjust this now.

                    tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                    tempUtf8CodeUnitCountAdjustment -= surrogatePairsCountNint;
                }

                // No unpaired surrogates in this vector: commit the deferred count and advance.
                tempUtf8CodeUnitCountAdjustment += popcnt32;
                pInputBuffer += Vector <ushort> .Count;
                inputLength -= Vector <ushort> .Count;
            } while (inputLength >= Vector <ushort> .Count);
        }
    }

NonVectorizedLoop:

    // Vectorization isn't supported on our current platform, or the input was too small to benefit
    // from vectorization, or we saw invalid UTF-16 data in the vectorized code paths and need to
    // drain remaining valid chars before we report failure.

    for (; inputLength > 0; pInputBuffer++, inputLength--)
    {
        uint thisChar = pInputBuffer[0];
        if (thisChar <= 0x7F)
        {
            continue;
        }

        // Bump adjustment by +1 for U+0080..U+07FF; by +2 for U+0800..U+FFFF.
        // This optimistically assumes no surrogates, which we'll handle shortly.

        tempUtf8CodeUnitCountAdjustment += (thisChar + 0x0001_F800u) >> 16;

        if (!UnicodeUtility.IsSurrogateCodePoint(thisChar))
        {
            continue;
        }

        // Found a surrogate char. Back out the adjustment we made above, then
        // try to consume the entire surrogate pair all at once. We won't bother
        // trying to interpret the surrogate pair as a scalar value; we'll only
        // validate that its bit pattern matches what's expected for a surrogate pair.

        tempUtf8CodeUnitCountAdjustment -= 2;

        if (inputLength == 1)
        {
            goto Error; // input buffer too small to read a surrogate pair
        }

        // Read both code units as one 32-bit value; the constant's halves are swapped by endianness so
        // that the high-surrogate base (D800) lines up with the first char and DC00 with the second.
        thisChar = Unsafe.ReadUnaligned <uint>(pInputBuffer);
        if (((thisChar - (BitConverter.IsLittleEndian ? 0xDC00_D800u : 0xD800_DC00u)) & 0xFC00_FC00u) != 0)
        {
            goto Error; // not a well-formed surrogate pair
        }

        tempScalarCountAdjustment--; // 2 UTF-16 code units -> 1 scalar
        tempUtf8CodeUnitCountAdjustment += 2; // 2 UTF-16 code units -> 4 UTF-8 code units

        pInputBuffer++; // consumed one extra char
        inputLength--;
    }

Error:

    // Also used for normal return.

    utf8CodeUnitCountAdjustment = tempUtf8CodeUnitCountAdjustment;
    scalarCountAdjustment = tempScalarCountAdjustment;
    return(pInputBuffer);
}
/// <summary>
/// Writes <paramref name="value"/> as a MessagePack array of integers: nil for a null array,
/// otherwise an array header followed by every element. When SSE4.1 is available and the array has
/// at least eight elements, elements are encoded four at a time; any remaining elements are written
/// one by one with <c>writer.Write</c>.
/// </summary>
/// <param name="writer">Destination writer.</param>
/// <param name="value">Array to serialize; may be null.</param>
/// <param name="options">Not read by this method.</param>
public unsafe void Serialize(ref MessagePackWriter writer, int[]?value, MessagePackSerializerOptions options)
{
    if (value == null)
    {
        writer.WriteNil();
        return;
    }

    var inputLength = value.Length;
    writer.WriteArrayHeader(inputLength);
    if (inputLength == 0)
    {
        // Nothing more to write; also avoids taking the address of value[0] below.
        return;
    }

    fixed(int *pSource = &value[0])
    {
        var inputEnd = pSource + inputLength;
        var inputIterator = pSource;

        if (Sse41.IsSupported)
        {
            const int ShiftCount = 2;
            const int Stride = 1 << ShiftCount; // four ints per 128-bit vector

            // Fewer than two full vectors: go straight to the scalar loop.
            if (inputLength < Stride << 1)
            {
                goto ProcessEach;
            }

            {
                // Make InputIterator Aligned
                var offset = UnsafeMemoryAlignmentUtility.CalculateDifferenceAlign16(inputIterator);

                // When offset is times of 4, you can adjust memory address.
                // (offset is presumably a byte distance to the next 16-byte boundary; when it is a
                // multiple of 4 it is converted to an element count and those elements are written
                // one by one — TODO confirm against CalculateDifferenceAlign16.)
                if ((offset & 3) == 0)
                {
                    offset >>= 2;
                    inputLength -= offset;
                    var offsetEnd = inputIterator + offset;
                    while (inputIterator != offsetEnd)
                    {
                        writer.Write(*inputIterator++);
                    }
                }
            }

            fixed(byte *tablePointer = &ShuffleAndMaskTable[0])
            {
                // Per-index output byte counts are read as ints starting CountTableOffset bytes into the table.
                var countPointer = (int *)(tablePointer + CountTableOffset);
                fixed(byte *maskTablePointer = &SingleInstructionMultipleDataPrimitiveArrayFormatterHelper.StoreMaskTable[0])
                {
                    // Range boundaries, each broadcast to all four lanes. The "M1" values are
                    // (boundary - 1) so that a strict greater-than compare tests "value >= boundary".
                    var vectorShortMinValueM1 = Vector128.Create(short.MinValue - 1);
                    var vectorSByteMinValueM1 = Vector128.Create(sbyte.MinValue - 1);
                    var vectorMinFixNegIntM1 = Vector128.Create(MessagePackRange.MinFixNegativeInt - 1);
                    var vectorSByteMaxValue = Vector128.Create((int)sbyte.MaxValue);
                    var vectorByteMaxValue = Vector128.Create((int)byte.MaxValue);
                    var vectorUShortMaxValue = Vector128.Create((int)ushort.MaxValue);

                    // Multipliers that turn the per-lane compare sums into a base-7 pair index (see below).
                    var vectorM1M7 = Vector128.Create(-1, -7, -1, -7);

                    // Shuffle control selecting the low byte of each int (bytes 0, 4, 8, 12); 0x80 entries produce zero.
                    var vectorIn1Range = Vector128.Create(0, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);

                    for (var vectorizedEnd = inputIterator + ((inputLength >> ShiftCount) << ShiftCount); inputIterator != vectorizedEnd; inputIterator += Stride)
                    {
                        var current = Sse2.LoadVector128(inputIterator);
                        var isGreaterThanMinFixNegIntM1 = Sse2.CompareGreaterThan(current, vectorMinFixNegIntM1);
                        var isGreaterThanSByteMaxValue = Sse2.CompareGreaterThan(current, vectorSByteMaxValue);

                        // Fast path: all four values lie in [MinFixNegativeInt, sbyte.MaxValue], so each one
                        // is emitted as its own low byte; the four bytes are stored with a single 32-bit write.
                        if (Sse2.MoveMask(Sse2.AndNot(isGreaterThanSByteMaxValue, isGreaterThanMinFixNegIntM1).AsByte()) == 0xFFFF)
                        {
                            var answer = Ssse3.Shuffle(current.AsByte(), vectorIn1Range).AsUInt32();
                            var span = writer.GetSpan(Stride);
                            Unsafe.As <byte, uint>(ref span[0]) = answer.GetElement(0);
                            writer.Advance(Stride);
                            continue;
                        }

                        // Each compare yields -1 (true) or 0 per lane, so the sum of the six compares is a
                        // per-lane range class in [-6, 0].
                        var indexVector = Sse2.Add(isGreaterThanSByteMaxValue, isGreaterThanMinFixNegIntM1);
                        indexVector = Sse2.Add(indexVector, Sse2.CompareGreaterThan(current, vectorUShortMaxValue));
                        indexVector = Sse2.Add(indexVector, Sse2.CompareGreaterThan(current, vectorByteMaxValue));
                        indexVector = Sse2.Add(indexVector, Sse2.CompareGreaterThan(current, vectorShortMinValueM1));
                        indexVector = Sse2.Add(indexVector, Sse2.CompareGreaterThan(current, vectorSByteMinValueM1));

                        // Multiplying by (-1, -7, -1, -7) and adding adjacent lanes gives, for each pair of
                        // ints, index = class(first) + 7 * class(second), with each class in 0..6.
                        indexVector = Sse41.MultiplyLow(indexVector, vectorM1M7);
                        indexVector = Ssse3.HorizontalAdd(indexVector, indexVector);

                        var index0 = indexVector.GetElement(0); // table index for elements 0 and 1
                        var index1 = indexVector.GetElement(1); // table index for elements 2 and 3

                        var count0 = countPointer[index0];
                        var count1 = countPointer[index1];
                        var countTotal = count0 + count1;

                        var destination = writer.GetSpan(countTotal);
                        fixed(byte *pDestination = &destination[0])
                        {
                            var tmpDestination = pDestination;

                            // Each table entry is 32 bytes (index << 5): a 16-byte shuffle control followed by
                            // 16 constant bytes that are OR-ed into the shuffled data (presumably the
                            // MessagePack type-code bytes — confirm against ShuffleAndMaskTable).
                            var item0 = tablePointer + (index0 << 5);
                            var shuffle0 = Sse2.LoadVector128(item0);
                            var shuffled0 = Ssse3.Shuffle(current.AsByte(), shuffle0);
                            var constant0 = Sse2.LoadVector128(item0 + 16);
                            var answer0 = Sse2.Or(shuffled0, constant0);

                            // The store mask is looked up by byte count (16 bytes per entry); it presumably
                            // enables exactly count0 bytes — confirm against StoreMaskTable.
                            Sse2.MaskMove(answer0, Sse2.LoadVector128(maskTablePointer + (count0 << 4)), pDestination);
                            tmpDestination += count0;

                            // Shift the upper two ints down into the low 8 bytes and encode them the same way.
                            var shift1 = Sse2.ShiftRightLogical128BitLane(current, 8).AsByte();
                            var item1 = tablePointer + (index1 << 5);
                            var shuffle1 = Sse2.LoadVector128(item1);
                            var shuffled1 = Ssse3.Shuffle(shift1, shuffle1);
                            var constant1 = Sse2.LoadVector128(item1 + 16);
                            var answer1 = Sse2.Or(shuffled1, constant1);
                            Sse2.MaskMove(answer1, Sse2.LoadVector128(maskTablePointer + (count1 << 4)), tmpDestination);
                        }

                        writer.Advance(countTotal);
                    }
                }
            }
        }

    ProcessEach:
        // Scalar path: whatever the vectorized loop did not consume (or everything, if it was skipped).
        while (inputIterator != inputEnd)
        {
            writer.Write(*inputIterator++);
        }
    }
}
/// <summary>
/// Returns the number of UTF-16 code units needed to represent <paramref name="utf8Data"/>.
/// The input is assumed to be well-formed UTF-8; it is not validated here.
/// </summary>
/// <param name="utf8Data">Well-formed UTF-8 bytes.</param>
/// <returns>
/// <c>utf8ByteCount - numUtf8ContinuationBytes + numUtf8FourByteHeaders</c>, which for well-formed
/// input never exceeds the byte count and therefore fits in an <see cref="int"/>.
/// </returns>
/// <exception cref="NotImplementedException">SSE2 or POPCNT is not available.</exception>
public static unsafe int GetUtf16CharCountFromKnownWellFormedUtf8(ReadOnlySpan<byte> utf8Data)
{
    // Remember: the number of resulting UTF-16 chars will never be greater than the number
    // of UTF-8 bytes given well-formed input, so we can get away with casting the final
    // result to an 'int'.

    fixed (byte* pPinnedUtf8Data = &MemoryMarshal.GetReference(utf8Data))
    {
        if (Sse2.IsSupported && Popcnt.IsSupported)
        {
            // Optimizations via SSE2 & POPCNT are available - use them.

            Debug.Assert(BitConverter.IsLittleEndian, "SSE2 only supported on little-endian platforms.");
            Debug.Assert(sizeof(nint) == IntPtr.Size, "nint defined incorrectly.");
            Debug.Assert(sizeof(nuint) == IntPtr.Size, "nuint defined incorrectly.");

            byte* pBuffer = pPinnedUtf8Data;
            nuint bufferLength = (uint)utf8Data.Length;

            // Optimization: Can we stay in the all-ASCII code paths?

            nuint utf16CharCount = GetIndexOfFirstNonAsciiByte_Sse2(pBuffer, bufferLength);

            if (utf16CharCount != bufferLength)
            {
                // Found at least one non-ASCII byte, so fall down the slower (but still vectorized) code paths.
                // Given well-formed UTF-8 input, we can compute the number of resulting UTF-16 code units
                // using the following formula:
                //
                // utf16CharCount = utf8ByteCount - numUtf8ContinuationBytes + numUtf8FourByteHeaders

                utf16CharCount = bufferLength;

                // Signed compares are used throughout:
                //   continuation byte (0x80..0xBF)     <=>  (sbyte)b < (sbyte)0xC0
                //   four-byte header  (0xF0 and above) <=>  (sbyte)(b ^ 0x80) > 0x6F
                // A zeroed lane satisfies neither test, so masked-out bytes never affect the count.
                Vector128<sbyte> vecAllC0 = Vector128.Create(unchecked((sbyte)0xC0));
                Vector128<sbyte> vecAll80 = Vector128.Create(unchecked((sbyte)0x80));
                Vector128<sbyte> vecAll6F = Vector128.Create(unchecked((sbyte)0x6F));

                {
                    // Perform an aligned read of the first part of the buffer.
                    // We'll mask out any data at the start of the buffer we don't care about.
                    //
                    // For example, if (pBuffer MOD 16) = 2:
                    // [ AA BB CC DD ... ] <-- original vector
                    // [ 00 00 CC DD ... ] <-- after PANDN operation

                    // 'offset' is zero or negative: minus the number of bytes by which pBuffer is
                    // past the preceding 16-byte boundary.
                    nint offset = -((nint)pBuffer & (sizeof(Vector128<sbyte>) - 1));

                    // Lanes [0, -offset) precede the caller's data and must be zeroed, so the value
                    // broadcast for the "greater than lane index" compare is -offset (the misalignment).
                    // Broadcasting (offset + 15) instead would zero the wrong lanes - for an aligned
                    // buffer it would discard the first fifteen bytes of real data.
                    Vector128<sbyte> shouldBeMaskedOut = Sse2.CompareGreaterThan(Vector128.Create((byte)(-(int)offset)).AsSByte(), VectorOfElementIndices);
                    Vector128<sbyte> thisVector = Sse2.AndNot(shouldBeMaskedOut, Unsafe.Read<Vector128<sbyte>>(pBuffer + offset));

                    // If there's any data at the end of the buffer we don't care about, mask it out now.
                    // If this happens the 'bufferLength' value will be a lie, but it'll cause all of the
                    // branches later in the method to be skipped, so it's not a huge problem.

                    if (bufferLength < (nuint)offset + (uint)sizeof(Vector128<sbyte>))
                    {
                        // Valid lanes are [-offset, -offset + bufferLength); the lower bound was handled above.
                        Vector128<sbyte> shouldBeAllowed = Sse2.CompareLessThan(VectorOfElementIndices, Vector128.Create((byte)((int)bufferLength - (int)offset)).AsSByte());
                        thisVector = Sse2.And(shouldBeAllowed, thisVector);
                        bufferLength = (nuint)offset + (uint)sizeof(Vector128<sbyte>);
                    }

                    uint maskOfContinuationBytes = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(vecAllC0, thisVector));
                    uint countOfContinuationBytes = Popcnt.PopCount(maskOfContinuationBytes);
                    utf16CharCount -= countOfContinuationBytes;

                    uint maskOfFourByteHeaders = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(thisVector, vecAll80), vecAll6F));
                    uint countOfFourByteHeaders = Popcnt.PopCount(maskOfFourByteHeaders);
                    utf16CharCount += countOfFourByteHeaders;

                    // Account for the (16 + offset) real bytes just consumed and step to the next boundary.
                    bufferLength -= (nuint)offset;
                    bufferLength -= (uint)sizeof(Vector128<sbyte>);

                    pBuffer += offset;
                    pBuffer += (uint)sizeof(Vector128<sbyte>);
                }

                // At this point, pBuffer is guaranteed aligned.

                Debug.Assert((nuint)pBuffer % (uint)sizeof(Vector128<sbyte>) == 0, "pBuffer should have been aligned.");

                while (bufferLength >= (uint)sizeof(Vector128<sbyte>))
                {
                    Vector128<sbyte> thisVector = Sse2.LoadAlignedVector128((sbyte*)pBuffer);

                    uint maskOfContinuationBytes = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(vecAllC0, thisVector));
                    uint countOfContinuationBytes = Popcnt.PopCount(maskOfContinuationBytes);
                    utf16CharCount -= countOfContinuationBytes;

                    uint maskOfFourByteHeaders = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(thisVector, vecAll80), vecAll6F));
                    uint countOfFourByteHeaders = Popcnt.PopCount(maskOfFourByteHeaders);
                    utf16CharCount += countOfFourByteHeaders;

                    pBuffer += sizeof(Vector128<sbyte>);
                    bufferLength -= (uint)sizeof(Vector128<sbyte>);
                }

                if ((uint)bufferLength > 0)
                {
                    // There's still more data to be read.
                    // We need to mask out elements of the vector we don't care about.
                    // These elements will occur at the end of the vector.
                    //
                    // For example, if 14 bytes remain in the input stream:
                    // [ ... CC DD EE FF ] <-- original vector
                    // [ ... CC DD 00 00 ] <-- after PANDN operation

                    Vector128<sbyte> shouldBeMaskedOut = Sse2.CompareGreaterThan(VectorOfElementIndices, Vector128.Create((byte)((int)bufferLength - 1)).AsSByte());
                    Vector128<sbyte> thisVector = Sse2.AndNot(shouldBeMaskedOut, *(Vector128<sbyte>*)pBuffer);

                    uint maskOfContinuationBytes = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(vecAllC0, thisVector));
                    uint countOfContinuationBytes = Popcnt.PopCount(maskOfContinuationBytes);
                    utf16CharCount -= countOfContinuationBytes;

                    uint maskOfFourByteHeaders = (uint)Sse2.MoveMask(Sse2.CompareGreaterThan(Sse2.Xor(thisVector, vecAll80), vecAll6F));
                    uint countOfFourByteHeaders = Popcnt.PopCount(maskOfFourByteHeaders);
                    utf16CharCount += countOfFourByteHeaders;
                }
            }

            return (int)utf16CharCount;
        }
        else
        {
            // Cannot use SSE2 & POPCNT. Fall back to slower code paths.

            throw new NotImplementedException();
        }
    }
}
/// <summary>
/// Computes the bitwise AND-NOT of two byte spans: <c>o[i] = ~l[i] &amp; r[i]</c>.
/// Note that the complement is applied to the left operand only; this is not a NAND.
/// </summary>
/// <param name="l">Left operand; every byte is complemented before being combined.</param>
/// <param name="r">Right operand; ANDed with the complemented left operand.</param>
/// <param name="o">Destination span; must have the same length as both inputs.</param>
/// <exception cref="ArgumentException">
/// <paramref name="l"/> or <paramref name="r"/> differs in length from <paramref name="o"/>.
/// </exception>
public static void AndNot(this ReadOnlySpan<byte> l, ReadOnlySpan<byte> r, Span<byte> o)
{
    var s = o.Length;

    if (l.Length != s)
    {
        throw new ArgumentException("Left span size must be equal to output size.");
    }

    if (r.Length != s)
    {
        throw new ArgumentException("Right span size must be equal to output size.");
    }

    // Each stage below consumes as many whole chunks of its width as remain and then
    // slices all three spans forward, so narrower stages pick up where wider ones stop.

#if NETCOREAPP3_0
    if (Avx2.IsSupported)
    {
        // 32 bytes at a time; Avx2.AndNot(a, b) computes ~a & b.
        while (o.Length >= 32)
        {
            unsafe
            {
                fixed (byte* lp = l)
                fixed (byte* rp = r)
                fixed (byte* op = o)
                {
                    Avx.Store(op, Avx2.AndNot(Avx.LoadVector256(lp), Avx.LoadVector256(rp)));
                }
            }

            l = l.Slice(32);
            r = r.Slice(32);
            o = o.Slice(32);
        }
    }

    if (Sse2.IsSupported)
    {
        // 16 bytes at a time; Sse2.AndNot(a, b) computes ~a & b.
        while (o.Length >= 16)
        {
            unsafe
            {
                fixed (byte* lp = l)
                fixed (byte* rp = r)
                fixed (byte* op = o)
                {
                    Sse2.Store(op, Sse2.AndNot(Sse2.LoadVector128(lp), Sse2.LoadVector128(rp)));
                }
            }

            l = l.Slice(16);
            r = r.Slice(16);
            o = o.Slice(16);
        }
    }
#endif

    // 8 bytes at a time.
    while (o.Length >= sizeof(ulong))
    {
        var ll = MemoryMarshal.Cast<byte, ulong>(l);
        var rl = MemoryMarshal.Cast<byte, ulong>(r);
        var ol = MemoryMarshal.Cast<byte, ulong>(o);

        ol[0] = ~ll[0] & rl[0];

        l = l.Slice(sizeof(ulong));
        r = r.Slice(sizeof(ulong));
        o = o.Slice(sizeof(ulong));
    }

    // 4 bytes at a time.
    while (o.Length >= sizeof(uint))
    {
        var ll = MemoryMarshal.Cast<byte, uint>(l);
        var rl = MemoryMarshal.Cast<byte, uint>(r);
        var ol = MemoryMarshal.Cast<byte, uint>(o);

        ol[0] = ~ll[0] & rl[0];

        l = l.Slice(sizeof(uint));
        r = r.Slice(sizeof(uint));
        o = o.Slice(sizeof(uint));
    }

    // Finish the remaining (at most three) bytes. '~l[i]' is a negative int, but ANDing it with
    // r[i] (0..255) always lands in 0..255, so the narrowing cast is exact and - unlike an
    // intermediate cast of the negative value to uint - cannot overflow in a checked build.
    for (var i = 0; i < o.Length; i++)
    {
        o[i] = (byte)(~l[i] & r[i]);
    }
}
/// <summary>
/// Resamples this bitmap into <paramref name="rtnImage"/> in two passes using cubic weights over
/// four neighbouring samples: first horizontally into a temporary float buffer, then vertically
/// into the destination. Both passes are parallelised over rows with <c>Parallel.For</c> and use
/// SSE/SSSE3/SSE4.1/AVX2 intrinsics.
/// </summary>
/// <param name="rtnImage">Destination bitmap; its width and height define the output size.</param>
/// <exception cref="Exception">
/// Thrown when either scale factor (this size / destination size) is greater than 1, i.e. the
/// destination is smaller than this bitmap in some dimension. The message is Japanese for
/// "only enlargement is supported".
/// </exception>
public void ResizeBicubic(FastBitmap rtnImage) {
    // Source-to-destination ratios: a destination coordinate times the ratio gives the source coordinate.
    float scaleX = (float)this.width / rtnImage.width;
    float scaleY = (float)this.height / rtnImage.height;
    if (scaleX > 1 || scaleY > 1) {
        throw new Exception("拡大のみ対応");
    }
    // Intermediate buffer: for every source row, rtnImage.width pixels of 4 floats each.
    float[] tmpa = new float[rtnImage.width * 4 * this.height];
    fixed(float *tmpp = tmpa) {
        // Copy of the pinned pointer in an ordinary local; presumably needed because a fixed
        // local cannot be captured by the lambdas below.
        float *tmp = tmpp;
        // Byte-shuffle masks: each picks four consecutive bytes (one 32-bit pixel) out of a
        // 16-byte vector and puts each byte into the low byte of its own 32-bit lane.
        // Index 255 has the high bit set, which makes the byte shuffle write zero.
        var _00mask = Vector128.Create(0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255);
        var _01mask = Vector128.Create(4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255);
        var _10mask = Vector128.Create(8, 255, 255, 255, 9, 255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255);
        var _11mask = Vector128.Create(12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255);
        // Inverse packing: low byte of each 32-bit lane gathered into the first four bytes.
        var _vmask = Vector128.Create(0, 4, 8, 12, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255);
        // Offsets of the four taps relative to the truncated source coordinate.
        var _1012 = Vector128.Create(-1, 0, 1, 2);
        var _0123i = Vector128.Create(0, 1, 2, 3);
        // Broadcast constants used by the weight polynomials, range checks and clamping.
        var _0000 = Vector128.Create(0, 0, 0, 0);
        var _0000f = Vector128.Create(0f, 0, 0, 0);
        var _255f = Vector128.Create(255f, 255, 255, 255);
        var _1111 = Vector128.Create(1, 1, 1, 1);
        var _1111f = Vector128.Create(1f, 1, 1, 1);
        var _4444f = Vector128.Create(4f, 4, 4, 4);
        var _4444 = Vector128.Create(4, 4, 4, 4);
        var _5555f = Vector128.Create(5f, 5, 5, 5);
        var _2222f = Vector128.Create(2f, 2, 2, 2);
        var _8888f = Vector128.Create(8f, 8, 8, 8);
        // 0x7fffffff in every lane, viewed as float: ANDing with it clears the sign bit (absolute value).
        var _7f = Vector128.Create(0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff).AsSingle();
        // All bits set in every lane.
        var _ff = Vector128.Create(-1, -1, -1, -1);
        // Number of floats in one row of the intermediate buffer.
        var _stride = Vector128.Create(rtnImage.width * 4, rtnImage.width * 4, rtnImage.width * 4, rtnImage.width * 4);
        // Pass 1 (horizontal): for every source row, compute each destination column as 4 floats in tmp.
        Parallel.For(0, this.height, (y) => {
            // NOTE(review): py is computed but not used in this pass.
            float py = (y * scaleY);
            float *tmpPos = tmp + y * rtnImage.width * 4;
            for (int x = 0; x < rtnImage.width; x++) {
                // Source x position of this destination column and its truncated integer part.
                float px = (x * scaleX);
                int sx = (int)px;
                // Broadcast px, sx and the source width to all four lanes.
                var _px = Vector128.CreateScalar(px);
                _px = Sse.Shuffle(_px, _px, 0);
                var _sx = Vector128.CreateScalar(sx);
                _sx = Sse2.Shuffle(_sx, 0);
                var _width = Vector128.CreateScalar(this.width);
                _width = Sse2.Shuffle(_width, 0);
                // Tap columns sx-1, sx, sx+1, sx+2 and their absolute distance d from px.
                var _x2 = Sse2.Add(_sx, _1012);
                var _d = Sse.And(Sse.Subtract(_px, Sse2.ConvertToVector128Single(_x2)), _7f);
                var _d2 = Sse.Multiply(_d, _d);
                var _d3 = Sse.Multiply(_d2, _d);
                // w1 = 1 - 2*d^2 + d^3; w2 = 4 + 5*d^2 - 8*d - d^3; w2 is selected in lanes where d > 1.
                var w1 = Sse.Add(_1111f, Sse.Subtract(_d3, Sse.Multiply(_2222f, _d2)));
                var w2 = Sse.Subtract(Sse.Subtract(Sse.Add(_4444f, Sse.Multiply(_5555f, _d2)), Sse.Multiply(_d, _8888f)), _d3);
                var wb = Sse2.CompareGreaterThan(_d, _1111f);
                var _w = Sse41.BlendVariable(w1, w2, wb);
                // Lane mask (all ones) where the tap column is < 0 or not < width; those lanes use sx instead.
                var _xpb = Sse2.Or(Sse2.CompareLessThan(_x2, _0000), Sse41.MultiplyLow(Sse2.AndNot(Sse2.CompareLessThan(_x2, _width), _1111).AsInt32(), _ff));
                var _xpp = Sse2.And(_sx, _xpb);
                var _xp = Sse41.BlendVariable(_x2, _xpp, _xpb);
                // Gather the four 32-bit pixels of source row y at the tap columns.
                var p = Avx2.GatherVector128((uint *)(this._ptr + this._stride * y), _xp, 4).AsByte();
                // Widen each tap's four bytes to four floats.
                var _p0 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _00mask).AsInt32());
                var _p1 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _01mask).AsInt32());
                var _p2 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _10mask).AsInt32());
                var _p3 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _11mask).AsInt32());
                // Broadcast each tap's weight across all lanes.
                var _w0 = Sse.Shuffle(_w, _w, 0);
                var _w1 = Sse.Shuffle(_w, _w, 0b01010101);
                var _w2 = Sse.Shuffle(_w, _w, 0b10101010);
                var _w3 = Sse.Shuffle(_w, _w, 0b11111111);
                // Weighted sum of the four taps, stored unclamped as 4 floats.
                var rgbaf = Sse.Add(Sse.Add(Sse.Multiply(_p0, _w0), Sse.Multiply(_p1, _w1)), Sse.Add(Sse.Multiply(_p2, _w2), Sse.Multiply(_p3, _w3)));
                Sse2.Store(tmpPos + x * 4, rgbaf);
            }
        });
        // Pass 2 (vertical): for every destination row, blend four rows of tmp and write 32-bit pixels.
        Parallel.For(0, rtnImage.height, (y) => {
            // Source y position of this destination row and its truncated integer part.
            float py = (y * scaleY);
            int sy = (int)py;
            // 16-byte scratch area for the vector store; only its first uint is copied out.
            uint *store = stackalloc uint[4];
            var _py = Vector128.CreateScalar(py);
            _py = Sse.Shuffle(_py, _py, 0);
            var _sy = Vector128.CreateScalar(sy);
            _sy = Sse2.Shuffle(_sy, 0);
            var _height = Vector128.CreateScalar(this.height);
            _height = Sse2.Shuffle(_height, 0);
            // Tap rows sy-1, sy, sy+1, sy+2 and the same weight computation as in pass 1.
            var _y2 = Sse2.Add(_sy, _1012);
            var _d = Sse.And(Sse.Subtract(_py, Sse2.ConvertToVector128Single(_y2)), _7f);
            var _d2 = Sse.Multiply(_d, _d);
            var _d3 = Sse.Multiply(_d2, _d);
            var w1 = Sse.Add(_1111f, Sse.Subtract(_d3, Sse.Multiply(_2222f, _d2)));
            var w2 = Sse.Subtract(Sse.Subtract(Sse.Add(_4444f, Sse.Multiply(_5555f, _d2)), Sse.Multiply(_d, _8888f)), _d3);
            var wb = Sse2.CompareGreaterThan(_d, _1111f);
            var _w = Sse41.BlendVariable(w1, w2, wb);
            // Tap rows that are < 0 or not < height are replaced by sy.
            var _ypb = Sse2.Or(Sse2.CompareLessThan(_y2, _0000), Sse41.MultiplyLow(Sse2.AndNot(Sse2.CompareLessThan(_y2, _height), _1111).AsInt32(), _ff));
            var _ypp = Sse2.And(_sy, _ypb);
            var _yp = Sse41.BlendVariable(_y2, _ypp, _ypb);
            // Row index times floats-per-row gives the float offset of each tap row in tmp;
            // adding 0..3 addresses the four floats of the first pixel in that row.
            var _yps = Sse41.MultiplyLow(_yp, _stride);
            var _yp0 = Sse2.Add(Sse2.Shuffle(_yps, 0), _0123i);
            var _yp1 = Sse2.Add(Sse2.Shuffle(_yps, 0b01010101), _0123i);
            var _yp2 = Sse2.Add(Sse2.Shuffle(_yps, 0b10101010), _0123i);
            var _yp3 = Sse2.Add(Sse2.Shuffle(_yps, 0b11111111), _0123i);
            // Start of destination row y.
            uint *rtn = (uint *)(rtnImage._ptr + rtnImage._stride * y);
            for (int x = 0; x < rtnImage.width; x++) {
                // Four floats of the current pixel from each of the four tap rows.
                var _p0 = Avx2.GatherVector128((float *)(tmp), _yp0, 4);
                var _p1 = Avx2.GatherVector128((float *)(tmp), _yp1, 4);
                var _p2 = Avx2.GatherVector128((float *)(tmp), _yp2, 4);
                var _p3 = Avx2.GatherVector128((float *)(tmp), _yp3, 4);
                var _w0 = Sse.Shuffle(_w, _w, 0);
                var _w1 = Sse.Shuffle(_w, _w, 0b01010101);
                var _w2 = Sse.Shuffle(_w, _w, 0b10101010);
                var _w3 = Sse.Shuffle(_w, _w, 0b11111111);
                var rgbaf = Sse.Add(Sse.Add(Sse.Multiply(_p0, _w0), Sse.Multiply(_p1, _w1)), Sse.Add(Sse.Multiply(_p2, _w2), Sse.Multiply(_p3, _w3)));
                // Clamp every lane to [0, 255] before converting to integers.
                var _b0 = Sse.CompareLessThan(rgbaf, _0000f);
                rgbaf = Sse41.BlendVariable(rgbaf, _0000f, _b0);
                var _b1 = Sse.CompareGreaterThan(rgbaf, _255f);
                rgbaf = Sse41.BlendVariable(rgbaf, _255f, _b1);
                // Convert to 32-bit ints and pack the low byte of each lane into one 32-bit pixel.
                var rgbab = Sse2.ConvertToVector128Int32(rgbaf).AsByte();
                var rgba = Ssse3.Shuffle(rgbab, _vmask).AsUInt32();
                Sse2.Store(store, rgba);
                // Advance the gather indices by 4 floats, i.e. to the next pixel of each tap row.
                _yp0 = Sse2.Add(_yp0, _4444);
                _yp1 = Sse2.Add(_yp1, _4444);
                _yp2 = Sse2.Add(_yp2, _4444);
                _yp3 = Sse2.Add(_yp3, _4444);
                // Write the packed pixel and move to the next destination pixel.
                *rtn = *store;
                rtn++;
            }
        });
        // NOTE(review): the visible source ends here; the closing braces of the fixed block
        // and of the method are not part of this snippet.
/// <summary>
/// Encrypts the first 64 bytes of <paramref name="source"/> as four 16-byte groups processed
/// in parallel (one group per 32-bit lane) and writes 64 bytes to <paramref name="destination"/>.
/// </summary>
/// <remarks>
/// Runs 32 rounds. Each round XORs three state words with a round key, applies a byte
/// substitution built from nibble table lookups around <c>Aes.EncryptLast</c>, applies a
/// rotate-and-XOR linear transform, and rotates the four state registers.
/// NOTE(review): the round structure looks like SM4; confirm against the enclosing class.
/// </remarks>
/// <param name="rk">Round keys; entries 0..31 are read, one per round.</param>
/// <param name="source">Input bytes; the first 64 bytes are read (as sixteen 32-bit words).</param>
/// <param name="destination">Output buffer; must hold at least 64 bytes.</param>
/// <exception cref="ArgumentException"><paramref name="destination"/> is shorter than 64 bytes.</exception>
public static unsafe void Encrypt4(uint[] rk, ReadOnlySpan<byte> source, Span<byte> destination)
{
    // The result is written through a raw pointer (p .. p + 63) without bounds checks,
    // so a short destination must be rejected before any store can run past its end.
    // Reads from source and rk go through bounds-checked indexers and need no extra guard.
    if (destination.Length < 64)
    {
        throw new ArgumentException("Destination must be at least 64 bytes long.", nameof(destination));
    }

    var p32 = MemoryMarshal.Cast<byte, uint>(source);

    // Transposed load: tN holds word N of each of the four 16-byte groups, byte-swapped per word.
    var t3 = Vector128.Create(p32[3], p32[7], p32[11], p32[15]).ReverseEndianness32();
    var t2 = Vector128.Create(p32[2], p32[6], p32[10], p32[14]).ReverseEndianness32();
    var t1 = Vector128.Create(p32[1], p32[5], p32[9], p32[13]).ReverseEndianness32();
    var t0 = Vector128.Create(p32[0], p32[4], p32[8], p32[12]).ReverseEndianness32();

    for (var i = 0; i < 32; ++i)
    {
        var x = t1.Xor(t2).Xor(t3).Xor(Vector128.Create(rk[i]).AsByte());

        // Inner affine: low and high nibbles are looked up separately and XORed together.
        var y = Sse2.And(x, c0f);
        y = Ssse3.Shuffle(m1l, y);
        x = Sse2.ShiftRightLogical(x.AsUInt64(), 4).AsByte();
        x = Sse2.And(x, c0f);
        x = Ssse3.Shuffle(m1h, x).Xor(y);

        // Byte permutation applied before the AES step; presumably it cancels the row shift
        // that Aes.EncryptLast performs (verify against the definition of shr).
        x = Ssse3.Shuffle(x, shr);
        x = Aes.EncryptLast(x, c0f); // AES-NI

        // Outer affine: same nibble-lookup scheme; the low nibble is taken from ~x.
        y = Sse2.AndNot(x, c0f);
        y = Ssse3.Shuffle(m2l, y);
        x = Sse2.ShiftRightLogical(x.AsUInt64(), 4).AsByte();
        x = Sse2.And(x, c0f);
        x = Ssse3.Shuffle(m2h, x).Xor(y);

        // Four parallel linear transforms:
        // x ^ rol(x, 2) ^ rol(x, 10) ^ rol(x, 18) ^ rol(x, 24) in each 32-bit lane.
        y = x.Xor(x.RotateLeftUInt32_8()).Xor(x.RotateLeftUInt32_16());
        y = y.AsUInt32().RotateLeftUInt32(2).AsByte();
        x = x.Xor(y).Xor(x.RotateLeftUInt32_24());

        // Rotate registers.
        x = x.Xor(t0);
        t0 = t1;
        t1 = t2;
        t2 = t3;
        t3 = x;
    }

    // The state words are emitted in reverse order (t3 first), byte-swapped back.
    var a = t3.ReverseEndianness32().AsUInt32();
    var b = t2.ReverseEndianness32().AsUInt32();
    var c = t1.ReverseEndianness32().AsUInt32();
    var d = t0.ReverseEndianness32().AsUInt32();

    // 4x4 transpose of 32-bit words so each output vector holds one contiguous 16-byte group.
    var x0 = Sse2.UnpackLow(a, b);
    var x1 = Sse2.UnpackLow(c, d);
    var x2 = Sse2.UnpackHigh(a, b);
    var x3 = Sse2.UnpackHigh(c, d);

    t0 = Sse2.UnpackLow(x0.AsUInt64(), x1.AsUInt64()).AsByte();
    t1 = Sse2.UnpackHigh(x0.AsUInt64(), x1.AsUInt64()).AsByte();
    t2 = Sse2.UnpackLow(x2.AsUInt64(), x3.AsUInt64()).AsByte();
    t3 = Sse2.UnpackHigh(x2.AsUInt64(), x3.AsUInt64()).AsByte();

    fixed (byte* p = destination)
    {
        Sse2.Store(p, t0);
        Sse2.Store(p + 16, t1);
        Sse2.Store(p + 32, t2);
        Sse2.Store(p + 48, t3);
    }
}
/// <summary>
/// Test driver for <c>Sse2.AndNot</c>. For every supported element type it runs the intrinsic
/// over <c>testsCount</c> table entries, records each result in the table, and then validates
/// the recorded results against a scalar reference (<c>(~x) &amp; y</c>, or
/// <c>BinaryAndNot</c> for <c>double</c>).
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
/// <returns>
/// <c>Pass</c> when every table validates or when SSE2 is not supported (tests are skipped);
/// <c>Fail</c> when at least one table fails validation.
/// </returns>
static unsafe int Main(string[] args)
{
    int testResult = Pass;
    int testsCount = 21;
    string methodUnderTestName = nameof(Sse2.AndNot);

    if (Sse2.IsSupported)
    {
        using (var doubleTable = TestTableSse2<double>.Create(testsCount))
        using (var longTable = TestTableSse2<long>.Create(testsCount))
        using (var ulongTable = TestTableSse2<ulong>.Create(testsCount))
        using (var intTable = TestTableSse2<int>.Create(testsCount))
        using (var uintTable = TestTableSse2<uint>.Create(testsCount))
        using (var shortTable = TestTableSse2<short>.Create(testsCount))
        using (var ushortTable = TestTableSse2<ushort>.Create(testsCount))
        using (var sbyteTable = TestTableSse2<sbyte>.Create(testsCount))
        using (var byteTable = TestTableSse2<byte>.Create(testsCount))
        {
            // Phase 1: execute the intrinsic for every element type and record the outputs.
            for (int i = 0; i < testsCount; i++)
            {
                (Vector128<double>, Vector128<double>, Vector128<double>) value = doubleTable[i];
                var result = Sse2.AndNot(value.Item1, value.Item2);
                doubleTable.SetOutArray(result);
            }

            for (int i = 0; i < testsCount; i++)
            {
                (Vector128<long>, Vector128<long>, Vector128<long>) value = longTable[i];
                var result = Sse2.AndNot(value.Item1, value.Item2);
                longTable.SetOutArray(result);
            }

            for (int i = 0; i < testsCount; i++)
            {
                (Vector128<ulong>, Vector128<ulong>, Vector128<ulong>) value = ulongTable[i];
                var result = Sse2.AndNot(value.Item1, value.Item2);
                ulongTable.SetOutArray(result);
            }

            for (int i = 0; i < testsCount; i++)
            {
                (Vector128<int>, Vector128<int>, Vector128<int>) value = intTable[i];
                var result = Sse2.AndNot(value.Item1, value.Item2);
                intTable.SetOutArray(result);
            }

            for (int i = 0; i < testsCount; i++)
            {
                (Vector128<uint>, Vector128<uint>, Vector128<uint>) value = uintTable[i];
                var result = Sse2.AndNot(value.Item1, value.Item2);
                uintTable.SetOutArray(result);
            }

            for (int i = 0; i < testsCount; i++)
            {
                (Vector128<short>, Vector128<short>, Vector128<short>) value = shortTable[i];
                var result = Sse2.AndNot(value.Item1, value.Item2);
                shortTable.SetOutArray(result);
            }

            for (int i = 0; i < testsCount; i++)
            {
                (Vector128<ushort>, Vector128<ushort>, Vector128<ushort>) value = ushortTable[i];
                var result = Sse2.AndNot(value.Item1, value.Item2);
                ushortTable.SetOutArray(result);
            }

            for (int i = 0; i < testsCount; i++)
            {
                (Vector128<sbyte>, Vector128<sbyte>, Vector128<sbyte>) value = sbyteTable[i];
                var result = Sse2.AndNot(value.Item1, value.Item2);
                sbyteTable.SetOutArray(result);
            }

            for (int i = 0; i < testsCount; i++)
            {
                (Vector128<byte>, Vector128<byte>, Vector128<byte>) value = byteTable[i];
                var result = Sse2.AndNot(value.Item1, value.Item2);
                byteTable.SetOutArray(result);
            }

            // Phase 2: validate each table with the predicate for its own element type.
            // The string passed to PrintError is the predicate's source text, used for reporting.
            CheckMethod<double> checkDouble = (double x, double y, double z, ref double a) => (a = BinaryAndNot(x, y)) == z;
            if (!doubleTable.CheckResult(checkDouble))
            {
                PrintError(doubleTable, methodUnderTestName, "(double x, double y, double z, ref double a) => (a = BinaryAndNot(x, y)) == z", checkDouble);
                testResult = Fail;
            }

            CheckMethod<long> checkLong = (long x, long y, long z, ref long a) => (a = (~x) & y) == z;
            if (!longTable.CheckResult(checkLong))
            {
                PrintError(longTable, methodUnderTestName, "(long x, long y, long z, ref long a) => (a = (~x) & y) == z", checkLong);
                testResult = Fail;
            }

            // The ulong results must be checked on ulongTable with checkUlong, not on the long table.
            CheckMethod<ulong> checkUlong = (ulong x, ulong y, ulong z, ref ulong a) => (a = (~x) & y) == z;
            if (!ulongTable.CheckResult(checkUlong))
            {
                PrintError(ulongTable, methodUnderTestName, "(ulong x, ulong y, ulong z, ref ulong a) => (a = (~x) & y) == z", checkUlong);
                testResult = Fail;
            }

            CheckMethod<int> checkInt32 = (int x, int y, int z, ref int a) => (a = (~x) & y) == z;
            if (!intTable.CheckResult(checkInt32))
            {
                PrintError(intTable, methodUnderTestName, "(int x, int y, int z, ref int a) => (a = (~x) & y) == z", checkInt32);
                testResult = Fail;
            }

            CheckMethod<uint> checkUInt32 = (uint x, uint y, uint z, ref uint a) => (a = (~x) & y) == z;
            if (!uintTable.CheckResult(checkUInt32))
            {
                PrintError(uintTable, methodUnderTestName, "(uint x, uint y, uint z, ref uint a) => (a = (~x) & y) == z", checkUInt32);
                testResult = Fail;
            }

            CheckMethod<short> checkInt16 = (short x, short y, short z, ref short a) => (a = (short)((~x) & y)) == z;
            if (!shortTable.CheckResult(checkInt16))
            {
                PrintError(shortTable, methodUnderTestName, "(short x, short y, short z, ref short a) => (a = (short)((~x) & y)) == z", checkInt16);
                testResult = Fail;
            }

            CheckMethod<ushort> checkUInt16 = (ushort x, ushort y, ushort z, ref ushort a) => (a = (ushort)((~x) & y)) == z;
            if (!ushortTable.CheckResult(checkUInt16))
            {
                PrintError(ushortTable, methodUnderTestName, "(ushort x, ushort y, ushort z, ref ushort a) => (a = (ushort)((~x) & y)) == z", checkUInt16);
                testResult = Fail;
            }

            CheckMethod<sbyte> checkSByte = (sbyte x, sbyte y, sbyte z, ref sbyte a) => (a = (sbyte)((~x) & y)) == z;
            if (!sbyteTable.CheckResult(checkSByte))
            {
                PrintError(sbyteTable, methodUnderTestName, "(sbyte x, sbyte y, sbyte z, ref sbyte a) =>(a = (sbyte)((~x) & y)) == z", checkSByte);
                testResult = Fail;
            }

            CheckMethod<byte> checkByte = (byte x, byte y, byte z, ref byte a) => (a = (byte)((~x) & y)) == z;
            if (!byteTable.CheckResult(checkByte))
            {
                PrintError(byteTable, methodUnderTestName, "(byte x, byte y, byte z, ref byte a) => (a = (byte)((~x) & y)) == z", checkByte);
                testResult = Fail;
            }
        }
    }
    else
    {
        Console.WriteLine($"Sse2.IsSupported: {Sse2.IsSupported}, skipped tests of {typeof(Sse2)}.{methodUnderTestName}");
    }

    return testResult;
}
/// <summary>
/// Applies the complement of mask <paramref name="m"/> to <paramref name="a"/>.
/// </summary>
/// <param name="a">Value to be masked.</param>
/// <param name="m">Mask whose complement is ANDed with <paramref name="a"/>.</param>
/// <returns>The result of <c>Sse2.AndNot(m, a)</c>.</returns>
public static i32 NMask_i32(i32 a, m32 m) => Sse2.AndNot(m, a);
/// <summary>
/// Combines two masks as <paramref name="a"/> AND NOT <paramref name="b"/>.
/// </summary>
/// <param name="a">Operand that is kept as is.</param>
/// <param name="b">Operand that is complemented; note that it is passed first to the intrinsic.</param>
/// <returns>The result of <c>Sse2.AndNot(b, a)</c>.</returns>
public static m32 BitwiseAndNot_m32(m32 a, m32 b) => Sse2.AndNot(b, a);
/// <summary>
/// Computes <paramref name="a"/> AND NOT <paramref name="b"/>.
/// </summary>
/// <param name="a">Operand that is kept as is.</param>
/// <param name="b">Operand that is complemented; note that it is passed first to the intrinsic.</param>
/// <returns>The result of <c>Sse2.AndNot(b, a)</c>.</returns>
public static i32 BitwiseAndNot_i32(i32 a, i32 b) => Sse2.AndNot(b, a);