/// <summary>
/// Adds two 128-bit vectors element by element, selecting the hardware add that matches
/// the element type <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">
/// Element type. Handled types are <see cref="byte"/>, <see cref="sbyte"/>, <see cref="short"/>,
/// <see cref="ushort"/>, <see cref="int"/>, <see cref="uint"/>, <see cref="long"/>,
/// <see cref="ulong"/>, <see cref="float"/> and <see cref="double"/>.
/// </typeparam>
/// <param name="left">First operand.</param>
/// <param name="right">Second operand.</param>
/// <returns>The element-wise sum, reinterpreted back to <see cref="Vector128{T}"/>.</returns>
/// <exception cref="NotSupportedException"><typeparamref name="T"/> is not one of the handled element types.</exception>
public static Vector128<T> Vector128Add<T>(Vector128<T> left, Vector128<T> right)
    where T : struct
{
    // Each branch reinterprets both operands as the concrete element type, adds them,
    // and reinterprets the sum back to T. Only the branch matching T is ever taken.
    if (typeof(T) == typeof(byte))
    {
        Vector128<byte> sum = Sse2.Add(left.AsByte(), right.AsByte());
        return sum.As<byte, T>();
    }

    if (typeof(T) == typeof(sbyte))
    {
        Vector128<sbyte> sum = Sse2.Add(left.AsSByte(), right.AsSByte());
        return sum.As<sbyte, T>();
    }

    if (typeof(T) == typeof(short))
    {
        Vector128<short> sum = Sse2.Add(left.AsInt16(), right.AsInt16());
        return sum.As<short, T>();
    }

    if (typeof(T) == typeof(ushort))
    {
        Vector128<ushort> sum = Sse2.Add(left.AsUInt16(), right.AsUInt16());
        return sum.As<ushort, T>();
    }

    if (typeof(T) == typeof(int))
    {
        Vector128<int> sum = Sse2.Add(left.AsInt32(), right.AsInt32());
        return sum.As<int, T>();
    }

    if (typeof(T) == typeof(uint))
    {
        Vector128<uint> sum = Sse2.Add(left.AsUInt32(), right.AsUInt32());
        return sum.As<uint, T>();
    }

    if (typeof(T) == typeof(long))
    {
        Vector128<long> sum = Sse2.Add(left.AsInt64(), right.AsInt64());
        return sum.As<long, T>();
    }

    if (typeof(T) == typeof(ulong))
    {
        Vector128<ulong> sum = Sse2.Add(left.AsUInt64(), right.AsUInt64());
        return sum.As<ulong, T>();
    }

    if (typeof(T) == typeof(float))
    {
        // Single precision is the only case served by the SSE (not SSE2) add.
        Vector128<float> sum = Sse.Add(left.AsSingle(), right.AsSingle());
        return sum.As<float, T>();
    }

    if (typeof(T) == typeof(double))
    {
        Vector128<double> sum = Sse2.Add(left.AsDouble(), right.AsDouble());
        return sum.As<double, T>();
    }

    throw new NotSupportedException();
}
/// <summary>
/// Blends two 64-bit-lane vectors at 16-bit granularity by routing them through
/// <see cref="Sse41.Blend(Vector128{ushort}, Vector128{ushort}, byte)"/>.
/// </summary>
/// <param name="x">First source vector; read only, never written through the reference.</param>
/// <param name="y">Second source vector; read only, never written through the reference.</param>
/// <param name="m">
/// Blend control passed straight to the intrinsic: bit <c>i</c> picks 16-bit element <c>i</c>
/// from <paramref name="y"/> when set and from <paramref name="x"/> when clear, so a whole
/// 64-bit lane is taken from one source only when its four control bits agree.
/// </param>
/// <returns>The blended vector, viewed again as two 64-bit lanes.</returns>
private static Vector128<ulong> blend_ulong(ref Vector128<ulong> x, ref Vector128<ulong> y, byte m)
{
    // The blend is performed on the 16-bit views of the operands.
    Vector128<ushort> firstWords = x.AsUInt16();
    Vector128<ushort> secondWords = y.AsUInt16();

    Vector128<ushort> blended = Sse41.Blend(firstWords, secondWords, m);
    return blended.AsUInt64();
}
/// <summary>
/// Multiplies two byte vectors element by element, keeping the low 8 bits of every product.
/// SSE2 has no 8-bit multiply, so the work is done with two 16-bit multiplies: one for the
/// bytes in the upper half of each 16-bit element and one for the bytes in the lower half.
/// </summary>
/// <param name="left">First operand.</param>
/// <param name="right">Second operand.</param>
/// <returns>A vector whose byte <c>i</c> is <c>(byte)(left[i] * right[i])</c>.</returns>
private static Vector128<byte> MultiplyBytes(Vector128<byte> left, Vector128<byte> right)
{
    Vector128<ushort> leftWords = left.AsUInt16();
    Vector128<ushort> rightWords = right.AsUInt16();

    // Upper byte of each 16-bit element: move it down, multiply, then move the low
    // byte of the product back up into the upper position.
    Vector128<ushort> upperProducts = Sse2.MultiplyLow(
        Sse2.ShiftRightLogical(leftWords, 8),
        Sse2.ShiftRightLogical(rightWords, 8));
    Vector128<ushort> upperBytes = Sse2.ShiftLeftLogical(upperProducts, 8);

    // Lower byte of each 16-bit element: the low 8 bits of the 16-bit product depend only
    // on the low bytes of the operands; shifting left then right by 8 clears the rest.
    Vector128<ushort> fullProducts = Sse2.MultiplyLow(leftWords, rightWords);
    Vector128<ushort> lowerBytes = Sse2.ShiftRightLogical(Sse2.ShiftLeftLogical(fullProducts, 8), 8);

    return Sse2.Or(upperBytes, lowerBytes).AsByte();
}
/// <summary>
/// Rearranges the 16-bit elements of an 8x8 block into zig zag order, in place,
/// using SSSE3 byte shuffles plus single-element extract/insert moves.
/// </summary>
/// <param name="block">Block to reorder; all eight row vectors are overwritten with the result.</param>
public static unsafe void ApplyZigZagOrderingSsse3(ref Block8x8 block)
{
    DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");

    // Shuffle control vectors are read from SseShuffleMasks in consecutive 16-byte slots.
    fixed (byte* maskPtr = SseShuffleMasks)
    {
        // Snapshot every input row before anything is written back to the block.
        Vector128<byte> srcA = block.V0.AsByte();
        Vector128<byte> srcB = block.V1.AsByte();
        Vector128<byte> srcC = block.V2.AsByte();
        Vector128<byte> srcD = block.V3.AsByte();
        Vector128<byte> srcE = block.V4.AsByte();
        Vector128<byte> srcF = block.V5.AsByte();
        Vector128<byte> srcG = block.V6.AsByte();
        Vector128<byte> srcH = block.V7.AsByte();

        // 16-bit views of the rows that donate individual elements via Extract/Insert.
        Vector128<ushort> wordsB = srcB.AsUInt16();
        Vector128<ushort> wordsC = srcC.AsUInt16();
        Vector128<ushort> wordsD = srcD.AsUInt16();
        Vector128<ushort> wordsE = srcE.AsUInt16();
        Vector128<ushort> wordsF = srcF.AsUInt16();
        Vector128<ushort> wordsG = srcG.AsUInt16();

        // row0 - A0 A1 B0 C0 B1 A2 A3 B2
        Vector128<byte> out0 = Sse2.Or(
            Sse2.Or(
                Ssse3.Shuffle(srcA, Sse2.LoadVector128(maskPtr + (16 * 0))),
                Ssse3.Shuffle(srcB, Sse2.LoadVector128(maskPtr + (16 * 1)))),
            Ssse3.Shuffle(srcC, Sse2.LoadVector128(maskPtr + (16 * 2))));

        // row1 - C1 D0 E0 D1 C2 B3 A4 A5
        Vector128<ushort> out1 = Sse2.Or(
            Sse2.Or(
                Ssse3.Shuffle(srcA, Sse2.LoadVector128(maskPtr + (16 * 3))),
                Ssse3.Shuffle(srcC, Sse2.LoadVector128(maskPtr + (16 * 4)))),
            Ssse3.Shuffle(srcD, Sse2.LoadVector128(maskPtr + (16 * 5)))).AsUInt16();
        out1 = Sse2.Insert(out1, Sse2.Extract(wordsB, 3), 5);
        out1 = Sse2.Insert(out1, Sse2.Extract(wordsE, 0), 2);

        // row2 - shuffled parts of E and F, then B4, C3, D2 and G0 moved in one by one
        Vector128<ushort> out2 = Sse2.Or(
            Ssse3.Shuffle(srcE, Sse2.LoadVector128(maskPtr + (16 * 6))),
            Ssse3.Shuffle(srcF, Sse2.LoadVector128(maskPtr + (16 * 7)))).AsUInt16();
        out2 = Sse2.Insert(out2, Sse2.Extract(wordsB, 4), 0);
        out2 = Sse2.Insert(out2, Sse2.Extract(wordsC, 3), 1);
        out2 = Sse2.Insert(out2, Sse2.Extract(wordsD, 2), 2);
        out2 = Sse2.Insert(out2, Sse2.Extract(wordsG, 0), 5);

        // The control vector in slot 11 is applied to row D (for row3) and to row E (for row4).
        Vector128<byte> sharedMaskDE = Sse2.LoadVector128(maskPtr + (16 * 11));

        // row3 - built purely from shuffles of A, B, C and D
        Vector128<byte> out3 = Sse2.Or(
            Sse2.Or(
                Sse2.Or(
                    Ssse3.Shuffle(srcA, Sse2.LoadVector128(maskPtr + (16 * 8))),
                    Ssse3.Shuffle(srcB, Sse2.LoadVector128(maskPtr + (16 * 9)))),
                Ssse3.Shuffle(srcC, Sse2.LoadVector128(maskPtr + (16 * 10)))),
            Ssse3.Shuffle(srcD, sharedMaskDE));

        // row4 - built purely from shuffles of E, F, G and H
        Vector128<byte> out4 = Sse2.Or(
            Sse2.Or(
                Sse2.Or(
                    Ssse3.Shuffle(srcE, sharedMaskDE),
                    Ssse3.Shuffle(srcF, Sse2.LoadVector128(maskPtr + (16 * 12)))),
                Ssse3.Shuffle(srcG, Sse2.LoadVector128(maskPtr + (16 * 13)))),
            Ssse3.Shuffle(srcH, Sse2.LoadVector128(maskPtr + (16 * 14))));

        // row5 - shuffled parts of C and D, then B7, E5, F4 and G3 moved in one by one
        Vector128<ushort> out5 = Sse2.Or(
            Ssse3.Shuffle(srcC, Sse2.LoadVector128(maskPtr + (16 * 15))),
            Ssse3.Shuffle(srcD, Sse2.LoadVector128(maskPtr + (16 * 16)))).AsUInt16();
        out5 = Sse2.Insert(out5, Sse2.Extract(wordsB, 7), 2);
        out5 = Sse2.Insert(out5, Sse2.Extract(wordsE, 5), 5);
        out5 = Sse2.Insert(out5, Sse2.Extract(wordsF, 4), 6);
        out5 = Sse2.Insert(out5, Sse2.Extract(wordsG, 3), 7);

        // row6 - shuffled parts of E, F and H, then D7 and G4 moved in one by one
        Vector128<ushort> out6 = Sse2.Or(
            Sse2.Or(
                Ssse3.Shuffle(srcE, Sse2.LoadVector128(maskPtr + (16 * 17))),
                Ssse3.Shuffle(srcF, Sse2.LoadVector128(maskPtr + (16 * 18)))),
            Ssse3.Shuffle(srcH, Sse2.LoadVector128(maskPtr + (16 * 19)))).AsUInt16();
        out6 = Sse2.Insert(out6, Sse2.Extract(wordsD, 7), 5);
        out6 = Sse2.Insert(out6, Sse2.Extract(wordsG, 4), 2);

        // row7 - shuffled parts of G and H, then F7 moved in
        Vector128<ushort> out7 = Sse2.Or(
            Ssse3.Shuffle(srcG, Sse2.LoadVector128(maskPtr + (16 * 20))),
            Ssse3.Shuffle(srcH, Sse2.LoadVector128(maskPtr + (16 * 21)))).AsUInt16();
        out7 = Sse2.Insert(out7, Sse2.Extract(wordsF, 7), 4);

        // Store the reordered rows back, reinterpreted as signed 16-bit elements.
        block.V0 = out0.AsInt16();
        block.V1 = out1.AsInt16();
        block.V2 = out2.AsInt16();
        block.V3 = out3.AsInt16();
        block.V4 = out4.AsInt16();
        block.V5 = out5.AsInt16();
        block.V6 = out6.AsInt16();
        block.V7 = out7.AsInt16();
    }
}
/// <summary> /// Searches for an opening character from a registered parser in the specified string. /// </summary> /// <param name="text">The text.</param> /// <param name="start">The start.</param> /// <param name="end">The end.</param> /// <returns>Index position within the string of the first opening character found in the specified text; if not found, returns -1</returns> public int IndexOfOpeningCharacter(string text, int start, int end) { Debug.Assert(text is not null); Debug.Assert(start >= 0 && end >= 0); Debug.Assert(end - start + 1 >= 0); Debug.Assert(end - start + 1 <= text.Length); if (nonAsciiMap is null) { #if NETCOREAPP3_1_OR_GREATER if (Ssse3.IsSupported && BitConverter.IsLittleEndian) { // Based on http://0x80.pl/articles/simd-byte-lookup.html#universal-algorithm // Optimized for sets in the [1, 127] range int lengthMinusOne = end - start; int charsToProcessVectorized = lengthMinusOne & ~(2 * Vector128 <short> .Count - 1); int finalStart = start + charsToProcessVectorized; if (start < finalStart) { ref char textStartRef = ref Unsafe.Add(ref Unsafe.AsRef(in text.GetPinnableReference()), start); Vector128 <byte> bitmap = _asciiBitmap; do { // Load 32 bytes (16 chars) into two Vector128<short>s (chars) // Drop the high byte of each char // Pack the remaining bytes into a single Vector128<byte> Vector128 <byte> input = Sse2.PackUnsignedSaturate( Unsafe.ReadUnaligned <Vector128 <short> >(ref Unsafe.As <char, byte>(ref textStartRef)), Unsafe.ReadUnaligned <Vector128 <short> >(ref Unsafe.As <char, byte>(ref Unsafe.Add(ref textStartRef, Vector128 <short> .Count)))); // Extract the higher nibble of each character ((input >> 4) & 0xF) Vector128 <byte> higherNibbles = Sse2.And(Sse2.ShiftRightLogical(input.AsUInt16(), 4).AsByte(), Vector128.Create((byte)0xF)); // Lookup the matching higher nibble for each character based on the lower nibble // PSHUFB will set the result to 0 for any non-ASCII (> 127) character Vector128 <byte> bitsets = Ssse3.Shuffle(bitmap, 
input); // Calculate a bitmask (1 << (higherNibble % 8)) for each character Vector128 <byte> bitmask = Ssse3.Shuffle(Vector128.Create(0x8040201008040201).AsByte(), higherNibbles); // Check which characters are present in the set // We are relying on bitsets being zero for non-ASCII characters Vector128 <byte> result = Sse2.And(bitsets, bitmask); if (!result.Equals(Vector128 <byte> .Zero)) { int resultMask = ~Sse2.MoveMask(Sse2.CompareEqual(result, Vector128 <byte> .Zero)); return(start + BitOperations.TrailingZeroCount((uint)resultMask)); } start += 2 * Vector128 <short> .Count; textStartRef = ref Unsafe.Add(ref textStartRef, 2 * Vector128 <short> .Count); }while (start != finalStart); } } ref char textRef = ref Unsafe.AsRef(in text.GetPinnableReference());
/// <summary>
/// Rearranges the 16-bit elements of an 8x8 block, in place, into the transposed zig zag
/// order spelled out by the per-row comments below, using SSSE3 byte shuffles plus
/// single-element extract/insert moves.
/// </summary>
/// <param name="block">Block to reorder; all eight row vectors are overwritten with the result.</param>
public static unsafe void ApplyTransposingZigZagOrderingSsse3(ref Block8x8 block)
{
    DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");

    // Shuffle control vectors are read from SseShuffleMasks in consecutive 16-byte slots.
    fixed (byte* shuffleVectorsPtr = &MemoryMarshal.GetReference(SseShuffleMasks))
    {
        // Snapshot every input row before anything is written back to the block.
        Vector128<byte> srcA = block.V0.AsByte();
        Vector128<byte> srcB = block.V1.AsByte();
        Vector128<byte> srcC = block.V2.AsByte();
        Vector128<byte> srcD = block.V3.AsByte();
        Vector128<byte> srcE = block.V4.AsByte();
        Vector128<byte> srcF = block.V5.AsByte();
        Vector128<byte> srcG = block.V6.AsByte();
        Vector128<byte> srcH = block.V7.AsByte();

        // 16-bit views of the rows that donate individual elements via Extract/Insert.
        Vector128<ushort> wordsA = srcA.AsUInt16();
        Vector128<ushort> wordsC = srcC.AsUInt16();
        Vector128<ushort> wordsD = srcD.AsUInt16();
        Vector128<ushort> wordsE = srcE.AsUInt16();
        Vector128<ushort> wordsF = srcF.AsUInt16();
        Vector128<ushort> wordsH = srcH.AsUInt16();

        // Scratch accumulator for the OR-merged shuffle results of the row being built.
        Vector128<byte> merged;

        // row0 - A0 B0 A1 A2 B1 C0 D0 C1
        merged = Ssse3.Shuffle(srcA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 0)));
        merged = Sse2.Or(merged, Ssse3.Shuffle(srcB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 1))));
        merged = Sse2.Or(merged, Ssse3.Shuffle(srcC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 2))));
        Vector128<ushort> out0 = Sse2.Insert(merged.AsUInt16(), Sse2.Extract(wordsD, 0), 6);

        // row1 - B2 A3 A4 B3 C2 D1 E0 F0
        merged = Ssse3.Shuffle(srcA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 3)));
        merged = Sse2.Or(merged, Ssse3.Shuffle(srcB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 4))));
        Vector128<ushort> out1 = Sse2.Insert(merged.AsUInt16(), Sse2.Extract(wordsC, 2), 4);
        out1 = Sse2.Insert(out1, Sse2.Extract(wordsD, 1), 5);
        out1 = Sse2.Insert(out1, Sse2.Extract(wordsE, 0), 6);
        out1 = Sse2.Insert(out1, Sse2.Extract(wordsF, 0), 7);

        // row2 - E1 D2 C3 B4 A5 A6 B5 C4
        merged = Ssse3.Shuffle(srcA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 5)));
        merged = Sse2.Or(merged, Ssse3.Shuffle(srcB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 6))));
        merged = Sse2.Or(merged, Ssse3.Shuffle(srcC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 7))));
        Vector128<ushort> out2 = Sse2.Insert(merged.AsUInt16(), Sse2.Extract(wordsD, 2), 1);
        out2 = Sse2.Insert(out2, Sse2.Extract(wordsE, 1), 0);

        // row3 - D3 E2 F1 G0 H0 G1 F2 E3
        merged = Ssse3.Shuffle(srcE, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 8)));
        merged = Sse2.Or(merged, Ssse3.Shuffle(srcF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 9))));
        merged = Sse2.Or(merged, Ssse3.Shuffle(srcG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 10))));
        Vector128<ushort> out3 = Sse2.Insert(merged.AsUInt16(), Sse2.Extract(wordsD, 3), 0);
        out3 = Sse2.Insert(out3, Sse2.Extract(wordsH, 0), 4);

        // row4 - D4 C5 B6 A7 B7 C6 D5 E4
        merged = Ssse3.Shuffle(srcB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 11)));
        merged = Sse2.Or(merged, Ssse3.Shuffle(srcC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 12))));
        merged = Sse2.Or(merged, Ssse3.Shuffle(srcD, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 13))));
        Vector128<ushort> out4 = Sse2.Insert(merged.AsUInt16(), Sse2.Extract(wordsA, 7), 3);
        out4 = Sse2.Insert(out4, Sse2.Extract(wordsE, 4), 7);

        // row5 - F3 G2 H1 H2 G3 F4 E5 D6
        merged = Ssse3.Shuffle(srcF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 14)));
        merged = Sse2.Or(merged, Ssse3.Shuffle(srcG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 15))));
        merged = Sse2.Or(merged, Ssse3.Shuffle(srcH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 16))));
        Vector128<ushort> out5 = Sse2.Insert(merged.AsUInt16(), Sse2.Extract(wordsD, 6), 7);
        out5 = Sse2.Insert(out5, Sse2.Extract(wordsE, 5), 6);

        // row6 - C7 D7 E6 F5 G4 H3 H4 G5
        merged = Ssse3.Shuffle(srcG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 17)));
        merged = Sse2.Or(merged, Ssse3.Shuffle(srcH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 18))));
        Vector128<ushort> out6 = Sse2.Insert(merged.AsUInt16(), Sse2.Extract(wordsC, 7), 0);
        out6 = Sse2.Insert(out6, Sse2.Extract(wordsD, 7), 1);
        out6 = Sse2.Insert(out6, Sse2.Extract(wordsE, 6), 2);
        out6 = Sse2.Insert(out6, Sse2.Extract(wordsF, 5), 3);

        // row7 - F6 E7 F7 G6 H5 H6 G7 H7
        merged = Ssse3.Shuffle(srcF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 19)));
        merged = Sse2.Or(merged, Ssse3.Shuffle(srcG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 20))));
        merged = Sse2.Or(merged, Ssse3.Shuffle(srcH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 21))));
        Vector128<ushort> out7 = Sse2.Insert(merged.AsUInt16(), Sse2.Extract(wordsE, 7), 1);

        // Store the reordered rows back, reinterpreted as signed 16-bit elements.
        block.V0 = out0.AsInt16();
        block.V1 = out1.AsInt16();
        block.V2 = out2.AsInt16();
        block.V3 = out3.AsInt16();
        block.V4 = out4.AsInt16();
        block.V5 = out5.AsInt16();
        block.V6 = out6.AsInt16();
        block.V7 = out7.AsInt16();
    }
}