/// <summary>
/// Creates the planar color decoder that corresponds to the given color type.
/// </summary>
/// <param name="colorType">The color type that selects the decoder implementation.</param>
/// <param name="bitsPerSample">The bits per sample, forwarded to the generic RGB planar decoder.</param>
/// <param name="colorMap">The color map; asserted to be null for all RGB planar color types.</param>
/// <param name="referenceBlackAndWhite">Forwarded to the YCbCr planar decoder.</param>
/// <param name="ycbcrCoefficients">Forwarded to the YCbCr planar decoder.</param>
/// <param name="ycbcrSubSampling">Forwarded to the YCbCr planar decoder.</param>
/// <param name="byteOrder">The byte order; the 16/24/32 bit RGB decoders are told whether it is big endian.</param>
/// <returns>The planar color decoder for <paramref name="colorType"/>.</returns>
public static TiffBasePlanarColorDecoder<TPixel> CreatePlanar(
    TiffColorType colorType,
    TiffBitsPerSample bitsPerSample,
    ushort[] colorMap,
    Rational[] referenceBlackAndWhite,
    Rational[] ycbcrCoefficients,
    ushort[] ycbcrSubSampling,
    ByteOrder byteOrder)
{
    // YCbCr is the only supported planar type that does not assert on the color map.
    if (colorType == TiffColorType.YCbCrPlanar)
    {
        return new YCbCrPlanarTiffColor<TPixel>(referenceBlackAndWhite, ycbcrCoefficients, ycbcrSubSampling);
    }

    bool isBigEndian = byteOrder == ByteOrder.BigEndian;

    switch (colorType)
    {
        case TiffColorType.Rgb888Planar:
            DebugGuard.IsTrue(colorMap == null, "colorMap");
            return new RgbPlanarTiffColor<TPixel>(bitsPerSample);

        case TiffColorType.Rgb161616Planar:
            DebugGuard.IsTrue(colorMap == null, "colorMap");
            return new Rgb16PlanarTiffColor<TPixel>(isBigEndian);

        case TiffColorType.Rgb242424Planar:
            DebugGuard.IsTrue(colorMap == null, "colorMap");
            return new Rgb24PlanarTiffColor<TPixel>(isBigEndian);

        case TiffColorType.Rgb323232Planar:
            DebugGuard.IsTrue(colorMap == null, "colorMap");
            return new Rgb32PlanarTiffColor<TPixel>(isBigEndian);

        default:
            throw TiffThrowHelper.InvalidColorType(colorType.ToString());
    }
}
/// <summary>
/// Encodes the image strip by strip through the given compressor and then adds the strip tags.
/// </summary>
/// <param name="compressor">The compressor to encode with; its output position is sampled before and after every strip.</param>
/// <param name="rowsPerStrip">The number of rows in a strip (the last strip may contain fewer rows).</param>
public virtual void Write(TiffBaseCompressor compressor, int rowsPerStrip)
{
    DebugGuard.IsTrue(this.BytesPerRow == compressor.BytesPerRow, "bytes per row of the compressor does not match tiff color writer");

    // Ceiling division, so a partially filled trailing strip gets its own entry.
    int stripsCount = (this.Image.Height + rowsPerStrip - 1) / rowsPerStrip;
    var stripOffsets = new uint[stripsCount];
    var stripByteCounts = new uint[stripsCount];

    compressor.Initialize(rowsPerStrip);

    int stripIndex = 0;
    int y = 0;
    while (y < this.Image.Height)
    {
        long startOffset = compressor.Output.Position;
        int rowsInStrip = Math.Min(rowsPerStrip, this.Image.Height - y);

        this.EncodeStrip(y, rowsInStrip, compressor);

        // The strip size is the distance the output position moved while encoding.
        long writtenBytes = compressor.Output.Position - startOffset;
        stripOffsets[stripIndex] = (uint)startOffset;
        stripByteCounts[stripIndex] = (uint)writtenBytes;

        stripIndex++;
        y += rowsPerStrip;
    }

    DebugGuard.IsTrue(stripIndex == stripsCount, "stripIndex and stripsCount should match");
    this.AddStripTags(rowsPerStrip, stripOffsets, stripByteCounts);
}
/// <summary>
/// Initializes a new instance of the <see cref="TiffCcittCompressor" /> class.
/// </summary>
/// <param name="output">The output stream, forwarded to the base compressor.</param>
/// <param name="allocator">The memory allocator, forwarded to the base compressor.</param>
/// <param name="width">The image width, forwarded to the base compressor.</param>
/// <param name="bitsPerPixel">The bits per pixel; asserted to be exactly one.</param>
protected TiffCcittCompressor(Stream output, MemoryAllocator allocator, int width, int bitsPerPixel)
    : base(output, allocator, width, bitsPerPixel)
{
    DebugGuard.IsTrue(bitsPerPixel == 1, nameof(bitsPerPixel), "CCITT compression requires one bit per pixel");

    // Both write cursors start at zero.
    this.bitPosition = 0;
    this.bytePosition = 0;
}
/// <inheritdoc/>
public CieXyz Convert(LinearRgb input)
{
    DebugGuard.IsTrue(input.WorkingSpace.Equals(this.SourceWorkingSpace), nameof(input.WorkingSpace), "Input and source working spaces must be equal.");

    // Transform the linear RGB vector by the conversion matrix and wrap the result.
    return new CieXyz(Vector3.Transform(input.Vector, this.conversionMatrix));
}
/// <summary>
/// Creates the compressor for the requested TIFF compression method.
/// </summary>
/// <param name="method">The compression method that selects the compressor.</param>
/// <param name="output">The output stream, forwarded to the compressor.</param>
/// <param name="allocator">The memory allocator, forwarded to the compressor.</param>
/// <param name="width">The image width, forwarded to the compressor.</param>
/// <param name="bitsPerPixel">The bits per pixel, forwarded to the compressor.</param>
/// <param name="compressionLevel">The deflate compression level; only consumed by the deflate compressor.</param>
/// <param name="predictor">The predictor; only consumed by the deflate and lzw compressors.</param>
/// <returns>The compressor for <paramref name="method"/>.</returns>
public static TiffBaseCompressor Create(
    TiffCompression method,
    Stream output,
    MemoryAllocator allocator,
    int width,
    int bitsPerPixel,
    DeflateCompressionLevel compressionLevel,
    TiffPredictor predictor)
{
    // First pass: serve the methods that accept a predictor and reject unsupported methods
    // before any of the shared assertions below can run.
    switch (method)
    {
        case TiffCompression.Deflate:
            return new DeflateCompressor(output, allocator, width, bitsPerPixel, predictor, compressionLevel);

        case TiffCompression.Lzw:
            DebugGuard.IsTrue(compressionLevel == DeflateCompressionLevel.DefaultCompression, "No deflate compression level is expected to be set");
            return new LzwCompressor(output, allocator, width, bitsPerPixel, predictor);

        case TiffCompression.ItuTRecT43:
        case TiffCompression.ItuTRecT82:
        case TiffCompression.OldJpeg:
        case TiffCompression.OldDeflate:
        case TiffCompression.None:
        case TiffCompression.Jpeg:
        case TiffCompression.PackBits:
        case TiffCompression.CcittGroup3Fax:
        case TiffCompression.CcittGroup4Fax:
        case TiffCompression.Ccitt1D:
            break;

        default:
            throw TiffThrowHelper.NotSupportedCompressor(method.ToString());
    }

    // Every remaining method expects neither a deflate level nor a predictor.
    DebugGuard.IsTrue(compressionLevel == DeflateCompressionLevel.DefaultCompression, "No deflate compression level is expected to be set");
    DebugGuard.IsTrue(predictor == TiffPredictor.None, "Predictor should only be used with lzw or deflate compression");

    switch (method)
    {
        case TiffCompression.Jpeg:
            return new TiffJpegCompressor(output, allocator, width, bitsPerPixel);

        case TiffCompression.PackBits:
            return new PackBitsCompressor(output, allocator, width, bitsPerPixel);

        case TiffCompression.CcittGroup3Fax:
            return new T4BitCompressor(output, allocator, width, bitsPerPixel, false);

        case TiffCompression.CcittGroup4Fax:
            return new T6BitCompressor(output, allocator, width, bitsPerPixel);

        case TiffCompression.Ccitt1D:
            return new T4BitCompressor(output, allocator, width, bitsPerPixel, true);

        default:
            // None, plus the compression types that are not implemented in the encoder
            // (ItuTRecT43, ItuTRecT82, OldJpeg, OldDeflate), which default to no compression instead.
            return new NoCompressor(output, allocator, width, bitsPerPixel);
    }
}
/// <summary>
/// Verifies that <c>DebugGuard.IsTrue</c> throws an <see cref="ArgumentException"/> carrying the
/// given parameter name and message when the condition is false.
/// </summary>
public void IsTrue_IsFalse_ThrowsException()
{
    ArgumentException exception = Assert.Throws<ArgumentException>(() =>
    {
        DebugGuard.IsTrue(false, "myParamName", "myTestMessage");
    });

    Assert.Equal("myParamName", exception.ParamName);

    // Assert.Contains reports the actual message on failure, unlike Assert.True(bool).
    // Ordinal comparison matches the semantics of string.Contains(string).
    Assert.Contains("myTestMessage", exception.Message, StringComparison.Ordinal);
}
/// <summary>
/// Writes <paramref name="code"/> to table[end-step], table[end-2*step], ..., down to table[0].
/// Expects <paramref name="end"/> to be an integer multiple of <paramref name="step"/>.
/// </summary>
/// <param name="table">The table to write into.</param>
/// <param name="step">The distance between two written entries.</param>
/// <param name="end">The exclusive upper bound of the written range.</param>
/// <param name="code">The value to replicate.</param>
private static void ReplicateValue(Span<HuffmanCode> table, int step, int end, HuffmanCode code)
{
    DebugGuard.IsTrue(end % step == 0, nameof(end), "end must be a multiple of step");

    // The first write always happens; afterwards keep stepping down while the index is still positive.
    for (int index = end - step; ; index -= step)
    {
        table[index] = code;

        if (index <= 0)
        {
            break;
        }
    }
}
#pragma warning restore SA1310, SA1311, IDE1006 /// <summary> /// Apply floating point FDCT inplace using simd operations. /// </summary> /// <param name="block">Input block.</param> private static void FDCT8x8_Avx(ref Block8x8F block) { DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation."); // First pass - process columns FDCT8x8_1D_Avx(ref block); // Second pass - process rows block.TransposeInplace(); FDCT8x8_1D_Avx(ref block);
/// <summary>
/// Creates the decompressor for the requested TIFF compression type.
/// </summary>
/// <param name="configuration">The configuration; forwarded to the jpeg decompressor.</param>
/// <param name="method">The compression type that selects the decompressor.</param>
/// <param name="allocator">The memory allocator, forwarded to the decompressor.</param>
/// <param name="photometricInterpretation">The photometric interpretation; forwarded to the T4, T6, Huffman RLE and jpeg decompressors.</param>
/// <param name="width">The image width, forwarded to the decompressor.</param>
/// <param name="bitsPerPixel">The bits per pixel, forwarded to the decompressor.</param>
/// <param name="colorType">The color type; forwarded to the deflate and lzw decompressors.</param>
/// <param name="predictor">The predictor; only consumed by the deflate and lzw decompressors.</param>
/// <param name="faxOptions">The fax compression options; only consumed by the T4 decompressor.</param>
/// <param name="jpegTables">The jpeg tables; forwarded to the jpeg decompressor.</param>
/// <param name="fillOrder">The fill order; forwarded to the T4, T6 and Huffman RLE decompressors.</param>
/// <param name="byteOrder">The byte order; the deflate and lzw decompressors are told whether it is big endian.</param>
/// <returns>The decompressor for <paramref name="method"/>.</returns>
public static TiffBaseDecompressor Create(
    Configuration configuration,
    TiffDecoderCompressionType method,
    MemoryAllocator allocator,
    TiffPhotometricInterpretation photometricInterpretation,
    int width,
    int bitsPerPixel,
    TiffColorType colorType,
    TiffPredictor predictor,
    FaxCompressionOptions faxOptions,
    byte[] jpegTables,
    TiffFillOrder fillOrder,
    ByteOrder byteOrder)
{
    switch (method)
    {
        case TiffDecoderCompressionType.None:
            DebugGuard.IsTrue(predictor == TiffPredictor.None, "Predictor should only be used with lzw or deflate compression");
            DebugGuard.IsTrue(faxOptions == FaxCompressionOptions.None, "No fax compression options are expected");
            return new NoneTiffCompression(allocator, width, bitsPerPixel);

        case TiffDecoderCompressionType.PackBits:
            DebugGuard.IsTrue(predictor == TiffPredictor.None, "Predictor should only be used with lzw or deflate compression");
            DebugGuard.IsTrue(faxOptions == FaxCompressionOptions.None, "No fax compression options are expected");
            return new PackBitsTiffCompression(allocator, width, bitsPerPixel);

        case TiffDecoderCompressionType.Deflate:
            DebugGuard.IsTrue(faxOptions == FaxCompressionOptions.None, "No fax compression options are expected");
            return new DeflateTiffCompression(allocator, width, bitsPerPixel, colorType, predictor, byteOrder == ByteOrder.BigEndian);

        case TiffDecoderCompressionType.Lzw:
            DebugGuard.IsTrue(faxOptions == FaxCompressionOptions.None, "No fax compression options are expected");
            return new LzwTiffCompression(allocator, width, bitsPerPixel, colorType, predictor, byteOrder == ByteOrder.BigEndian);

        case TiffDecoderCompressionType.T4:
            DebugGuard.IsTrue(predictor == TiffPredictor.None, "Predictor should only be used with lzw or deflate compression");
            return new T4TiffCompression(allocator, fillOrder, width, bitsPerPixel, faxOptions, photometricInterpretation);

        case TiffDecoderCompressionType.T6:
            DebugGuard.IsTrue(predictor == TiffPredictor.None, "Predictor should only be used with lzw or deflate compression");
            return new T6TiffCompression(allocator, fillOrder, width, bitsPerPixel, photometricInterpretation);

        case TiffDecoderCompressionType.HuffmanRle:
            DebugGuard.IsTrue(predictor == TiffPredictor.None, "Predictor should only be used with lzw or deflate compression");
            return new ModifiedHuffmanTiffCompression(allocator, fillOrder, width, bitsPerPixel, photometricInterpretation);

        case TiffDecoderCompressionType.Jpeg:
            DebugGuard.IsTrue(predictor == TiffPredictor.None, "Predictor should only be used with lzw or deflate compression");
            return new JpegTiffCompression(configuration, allocator, width, bitsPerPixel, jpegTables, photometricInterpretation);

        default:
            // Report the actual compression value: nameof(method) would always yield the literal "method".
            throw TiffThrowHelper.NotSupportedDecompressor(method.ToString());
    }
}
/// <inheritdoc />
protected override void Dispose(bool disposing)
{
    DebugGuard.IsTrue(disposing, nameof(disposing), "Unmanaged buffers should not have finalizer!");

    // Atomically set the flag; a previous value of 1 means an earlier call already disposed the guard.
    bool alreadyDisposed = Interlocked.Exchange(ref this.disposed, 1) == 1;
    if (!alreadyDisposed)
    {
        this.lifetimeGuard.Dispose();
    }
}
/// <summary>
/// Replaces, row by row, every byte except the first of each row with its difference to the preceding byte.
/// </summary>
/// <param name="rows">The buffer holding consecutive rows; modified in place.</param>
/// <param name="width">The length of a single row in bytes.</param>
private static void ApplyHorizontalPrediction8Bit(Span<byte> rows, int width)
{
    DebugGuard.IsTrue(rows.Length % width == 0, "Values must be equals");

    int height = rows.Length / width;
    for (int y = 0; y < height; y++)
    {
        int rowStart = y * width;

        // Walk right to left so each difference is taken against the still unmodified neighbour.
        for (int i = rowStart + width - 1; i > rowStart; i--)
        {
            rows[i] -= rows[i - 1];
        }
    }
}
/// <inheritdoc/>
public override void CompressStrip(Span<byte> rows, int height)
{
    DebugGuard.IsTrue(rows.Length % height == 0, "Invalid height");
    DebugGuard.IsTrue(this.BytesPerRow == rows.Length / height, "The widths must match");

    // Scratch buffer that receives the packed form of one row at a time.
    Span<byte> packed = this.pixelData.GetSpan();

    for (int rowIndex = 0; rowIndex < height; rowIndex++)
    {
        Span<byte> source = rows.Slice(rowIndex * this.BytesPerRow, this.BytesPerRow);
        int packedLength = PackBitsWriter.PackBits(source, packed);

        // Only the bytes reported by PackBits are written to the output.
        this.Output.Write(packed.Slice(0, packedLength));
    }
}
/// <summary>
/// Apply 1D floating point FDCT inplace using AVX operations on 8x8 matrix.
/// </summary>
/// <remarks>
/// Requires Avx support. All eight vectors (V0..V7) of the block are read and then overwritten.
/// </remarks>
/// <param name="block">Input matrix; receives the transformed result.</param>
public static void FDCT8x8_Avx(ref Block8x8F block) {
// Asserts that the AVX instructions used below are available.
DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
// Sums (tmp0..tmp3) and differences (tmp7..tmp4) of the mirrored vector pairs V0/V7, V1/V6, V2/V5, V3/V4.
Vector256 <float> tmp0 = Avx.Add(block.V0, block.V7);
Vector256 <float> tmp7 = Avx.Subtract(block.V0, block.V7);
Vector256 <float> tmp1 = Avx.Add(block.V1, block.V6);
Vector256 <float> tmp6 = Avx.Subtract(block.V1, block.V6);
Vector256 <float> tmp2 = Avx.Add(block.V2, block.V5);
Vector256 <float> tmp5 = Avx.Subtract(block.V2, block.V5);
Vector256 <float> tmp3 = Avx.Add(block.V3, block.V4);
Vector256 <float> tmp4 = Avx.Subtract(block.V3, block.V4);
// Even part: built from the sums only, produces V0, V4, V2 and V6.
Vector256 <float> tmp10 = Avx.Add(tmp0, tmp3);
Vector256 <float> tmp13 = Avx.Subtract(tmp0, tmp3);
Vector256 <float> tmp11 = Avx.Add(tmp1, tmp2);
Vector256 <float> tmp12 = Avx.Subtract(tmp1, tmp2);
block.V0 = Avx.Add(tmp10, tmp11);
block.V4 = Avx.Subtract(tmp10, tmp11);
// NOTE(review): mm256_F_0_7071 is presumably a broadcast of ~0.7071 - confirm at its declaration.
Vector256 <float> z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071);
block.V2 = Avx.Add(tmp13, z1);
block.V6 = Avx.Subtract(tmp13, z1);
// Odd part: built from the differences only, produces V5, V3, V1 and V7.
// tmp10..tmp12 are reused here, so these statements must stay after the even part.
tmp10 = Avx.Add(tmp4, tmp5);
tmp11 = Avx.Add(tmp5, tmp6);
tmp12 = Avx.Add(tmp6, tmp7);
Vector256 <float> z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826);
// NOTE(review): MultiplyAdd presumably combines its three operands as a multiply-add - confirm operand order in SimdUtils.HwIntrinsics.
Vector256 <float> z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10);
Vector256 <float> z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_1_3065, tmp12);
Vector256 <float> z3 = Avx.Multiply(tmp11, mm256_F_0_7071);
Vector256 <float> z11 = Avx.Add(tmp7, z3);
Vector256 <float> z13 = Avx.Subtract(tmp7, z3);
block.V5 = Avx.Add(z13, z2);
block.V3 = Avx.Subtract(z13, z2);
block.V1 = Avx.Add(z11, z4);
block.V7 = Avx.Subtract(z11, z4);
}
/// <summary>
/// Compresses one strip with CCITT T6 and writes the result to the output stream.
/// </summary>
/// <param name="pixelsAsGray">The pixels as 8-bit gray array.</param>
/// <param name="height">The strip height.</param>
public override void CompressStrip(Span<byte> pixelsAsGray, int height)
{
    DebugGuard.IsTrue(pixelsAsGray.Length / height == this.Width, "Values must be equals");
    DebugGuard.IsTrue(pixelsAsGray.Length % height == 0, "Values must be equals");

    // Reset the scratch buffer and both write cursors before encoding the strip.
    this.compressedDataBuffer.Clear();
    Span<byte> encoded = this.compressedDataBuffer.GetSpan();
    this.bytePosition = 0;
    this.bitPosition = 0;

    this.CompressStrip(pixelsAsGray, height, encoded);

    // A partially filled trailing byte has to be written as well.
    int bytesToWrite = this.bytePosition;
    if (this.bitPosition != 0)
    {
        bytesToWrite++;
    }

    this.Output.Write(encoded.Slice(0, bytesToWrite));
}
/// <summary>
/// Replaces, row by row, every <see cref="Rgb24"/> pixel except the first of each row with the
/// per-channel difference to the preceding pixel.
/// </summary>
/// <param name="rows">The buffer holding consecutive rows; modified in place.</param>
/// <param name="width">The length of a single row in bytes.</param>
private static void ApplyHorizontalPrediction24Bit(Span<byte> rows, int width)
{
    DebugGuard.IsTrue(rows.Length % width == 0, "Values must be equals");

    int height = rows.Length / width;
    for (int y = 0; y < height; y++)
    {
        // Reinterpret the bytes of the current row as Rgb24 pixels.
        Span<Rgb24> pixels = MemoryMarshal.Cast<byte, Rgb24>(rows.Slice(y * width, width));

        // Walk right to left so each difference is taken against the still unmodified neighbour.
        for (int x = pixels.Length - 1; x >= 1; x--)
        {
            ref Rgb24 current = ref pixels[x];
            Rgb24 previous = pixels[x - 1];

            byte r = (byte)(current.R - previous.R);
            byte g = (byte)(current.G - previous.G);
            byte b = (byte)(current.B - previous.B);

            current.FromRgb24(new Rgb24(r, g, b));
        }
    }
}
/// <summary>
/// Exposes the given array as a span of <see cref="ulong"/> values.
/// A <c>Number[]</c> is copied into a newly allocated buffer; any other array is cast to <c>ulong[]</c>.
/// </summary>
/// <param name="array">The source array.</param>
/// <param name="span">Receives the values as a span.</param>
/// <returns>The owner of the allocated buffer, or null when no buffer was allocated.</returns>
private IMemoryOwner<ulong> ConvertNumbers(Array array, out Span<ulong> span)
{
    if (!(array is Number[] numbers))
    {
        // No copy needed: the array is used directly.
        DebugGuard.IsTrue(array is ulong[], $"Expected {nameof(UInt64)} array.");
        span = (ulong[])array;
        return null;
    }

    IMemoryOwner<ulong> owner = this.memoryAllocator.Allocate<ulong>(numbers.Length);
    span = owner.GetSpan();

    int index = 0;
    foreach (Number number in numbers)
    {
        // Each number is converted to uint first and then widened to ulong.
        span[index++] = (uint)number;
    }

    return owner;
}
/// <summary> /// SIMD optimized bulk implementation of <see cref="IPixel.PackFromVector4(Vector4)"/> /// that works only with `count` divisible by <see cref="Vector{UInt32}.Count"/>. /// </summary> /// <param name="sourceColors">The <see cref="Span{T}"/> to the source colors.</param> /// <param name="destVectors">The <see cref="Span{T}"/> to the dstination vectors.</param> /// <param name="count">The number of pixels to convert.</param> /// <remarks> /// Implementation adapted from: /// <see> /// <cref>http://stackoverflow.com/a/5362789</cref> /// </see> /// TODO: We can replace this implementation in the future using new Vector API-s: /// <see> /// <cref>https://github.com/dotnet/corefx/issues/15957</cref> /// </see> /// </remarks> internal static void ToVector4SimdAligned(ReadOnlySpan <Rgba32> sourceColors, Span <Vector4> destVectors, int count) { if (!Vector.IsHardwareAccelerated) { throw new InvalidOperationException( "Rgba32.PixelOperations.ToVector4SimdAligned() should not be called when Vector.IsHardwareAccelerated == false!"); } DebugGuard.IsTrue( count % Vector <uint> .Count == 0, nameof(count), "Argument 'count' should divisible by Vector<uint>.Count!"); var bVec = new Vector <float>(256.0f / 255.0f); var magicFloat = new Vector <float>(32768.0f); var magicInt = new Vector <uint>(1191182336); // reinterpreded value of 32768.0f var mask = new Vector <uint>(255); int unpackedRawCount = count * 4; ref uint sourceBase = ref Unsafe.As <Rgba32, uint>(ref MemoryMarshal.GetReference(sourceColors));
/// <summary> /// SIMD optimized bulk implementation of <see cref="IPixel.PackFromVector4(Vector4)"/> /// that works only with `count` divisible by <see cref="Vector{UInt32}.Count"/>. /// </summary> /// <param name="sourceColors">The <see cref="BufferSpan{T}"/> to the source colors.</param> /// <param name="destVectors">The <see cref="BufferSpan{T}"/> to the dstination vectors.</param> /// <param name="count">The number of pixels to convert.</param> /// <remarks> /// Implementation adapted from: /// <see> /// <cref>http://stackoverflow.com/a/5362789</cref> /// </see> /// TODO: We can replace this implementation in the future using new Vector API-s: /// <see> /// <cref>https://github.com/dotnet/corefx/issues/15957</cref> /// </see> /// </remarks> internal static unsafe void ToVector4SimdAligned(BufferSpan <Rgba32> sourceColors, BufferSpan <Vector4> destVectors, int count) { if (!Vector.IsHardwareAccelerated) { throw new InvalidOperationException( "Rgba32.BulkOperations.ToVector4SimdAligned() should not be called when Vector.IsHardwareAccelerated == false!"); } int vecSize = Vector <uint> .Count; DebugGuard.IsTrue( count % vecSize == 0, nameof(count), "Argument 'count' should divisible by Vector<uint>.Count!"); Vector <float> bVec = new Vector <float>(256.0f / 255.0f); Vector <float> magicFloat = new Vector <float>(32768.0f); Vector <uint> magicInt = new Vector <uint>(1191182336); // reinterpreded value of 32768.0f Vector <uint> mask = new Vector <uint>(255); int unpackedRawCount = count * 4; ref uint src = ref Unsafe.As <Rgba32, uint>(ref sourceColors.DangerousGetPinnableReference());
/// <summary>
/// Calculates the size (in bytes) for a pixel buffer using the determined color format.
/// </summary>
/// <param name="width">The width for the desired pixel buffer.</param>
/// <param name="height">The height for the desired pixel buffer.</param>
/// <param name="plane">The index of the plane (0, 1 or 2) for planar image configuration, or -1 for chunky.</param>
/// <returns>The size (in bytes) of the required pixel buffer.</returns>
private int CalculateStripBufferSize(int width, int height, int plane = -1)
{
    DebugGuard.MustBeLessThanOrEqualTo(plane, 3, nameof(plane));

    int bitsPerPixel = 0;

    if (this.PlanarConfiguration != TiffPlanarConfiguration.Chunky)
    {
        // Planar: every plane holds a single channel, so use that channel's bit depth.
        if (plane == 0)
        {
            bitsPerPixel = this.BitsPerSample.Channel0;
        }
        else if (plane == 1)
        {
            bitsPerPixel = this.BitsPerSample.Channel1;
        }
        else if (plane == 2)
        {
            bitsPerPixel = this.BitsPerSample.Channel2;
        }
        else
        {
            TiffThrowHelper.ThrowNotSupported("More then 3 color channels are not supported");
        }
    }
    else
    {
        // Chunky: all channels are interleaved, so the full pixel depth applies.
        DebugGuard.IsTrue(plane == -1, "Expected Chunky planar.");
        bitsPerPixel = this.BitsPerPixel;
    }

    // Rows are padded up to a whole number of bytes.
    int bytesPerRow = ((width * bitsPerPixel) + 7) / 8;
    return bytesPerRow * height;
}
/// <summary>
/// Calls <c>DebugGuard.IsTrue</c> with a true condition; the test passes when no exception escapes.
/// </summary>
public void IsTrue_IsTrue_ThrowsNoException()
    => DebugGuard.IsTrue(true, "myParamName", "myTestMessage");
/// <summary>
/// Compresses one strip with CCITT T4 and writes the result to the output stream.
/// </summary>
/// <param name="pixelsAsGray">The pixels as 8-bit gray array.</param>
/// <param name="height">The strip height.</param>
public override void CompressStrip(Span<byte> pixelsAsGray, int height)
{
    DebugGuard.IsTrue(pixelsAsGray.Length / height == this.Width, "Values must be equals");
    DebugGuard.IsTrue(pixelsAsGray.Length % height == 0, "Values must be equals");

    // Reset the scratch buffer and both write cursors before encoding the strip.
    this.compressedDataBuffer.Clear();
    Span<byte> encoded = this.compressedDataBuffer.GetSpan();
    this.bytePosition = 0;
    this.bitPosition = 0;

    if (!this.useModifiedHuffman)
    {
        // An EOL code is expected at the start of the data.
        this.WriteCode(12, 1, encoded);
    }

    for (int y = 0; y < height; y++)
    {
        Span<byte> row = pixelsAsGray.Slice(y * this.Width, this.Width);
        bool isWhiteRun = true;
        bool isStartOfRow = true;
        int x = 0;

        while (x < this.Width)
        {
            // Count the consecutive pixels, starting at x, that have the color of the current run
            // (255 counts as white, 0 counts as black).
            byte runColor = isWhiteRun ? (byte)255 : (byte)0;
            uint runLength = 0;
            for (int i = x; i < this.Width && row[i] == runColor; i++)
            {
                runLength++;
            }

            if (isStartOfRow && runLength == 0)
            {
                // No white pixels at this position: emit a zero length white run and switch to black.
                this.WriteCode(8, WhiteZeroRunTermCode, encoded);
                isWhiteRun = false;
                isStartOfRow = false;
                continue;
            }

            if (runLength > 63)
            {
                // Too long for a term code: emit the best fitting makeup code and stay in the
                // same run, so that the remainder is encoded by the next iteration.
                uint makeupLength = this.GetBestFittingMakeupRunLength(runLength);
                uint makeupCode = this.GetMakeupCode(makeupLength, out uint makeupCodeLength, isWhiteRun);
                this.WriteCode(makeupCodeLength, makeupCode, encoded);
                x += (int)makeupLength;

                // If we are at the end of the line with a makeup code, we need to write a final term code with a length of zero.
                if (x == this.Width)
                {
                    if (isWhiteRun)
                    {
                        this.WriteCode(8, WhiteZeroRunTermCode, encoded);
                    }
                    else
                    {
                        this.WriteCode(10, BlackZeroRunTermCode, encoded);
                    }
                }

                continue;
            }

            // The run fits into a term code: emit it and toggle the run color.
            uint termCode = this.GetTermCode(runLength, out uint termCodeLength, isWhiteRun);
            this.WriteCode(termCodeLength, termCode, encoded);
            x += (int)runLength;

            isStartOfRow = false;
            isWhiteRun = !isWhiteRun;
        }

        this.WriteEndOfLine(encoded);
    }

    // Write the compressed data to the stream; a partially filled trailing byte is included.
    int bytesToWrite = this.bytePosition;
    if (this.bitPosition != 0)
    {
        bytesToWrite++;
    }

    this.Output.Write(encoded.Slice(0, bytesToWrite));
}
/// <summary>
/// Applies zig zag ordering for given 8x8 matrix using SSSE3 cpu intrinsics.
/// </summary>
/// <remarks>
/// Requires Ssse3 support. Shuffle masks are loaded in 16 byte steps from <c>SseShuffleMasks</c>.
/// </remarks>
/// <param name="block">Input matrix; all eight rows are overwritten with the reordered values.</param>
public static unsafe void ApplyZigZagOrderingSsse3(ref Block8x8 block)
{
    DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");

    fixed (byte* maskPtr = SseShuffleMasks)
    {
        // Byte views of the eight input rows; the byte shuffles below operate on these.
        Vector128<byte> rowA = block.V0.AsByte();
        Vector128<byte> rowB = block.V1.AsByte();
        Vector128<byte> rowC = block.V2.AsByte();
        Vector128<byte> rowD = block.V3.AsByte();
        Vector128<byte> rowE = block.V4.AsByte();
        Vector128<byte> rowF = block.V5.AsByte();
        Vector128<byte> rowG = block.V6.AsByte();
        Vector128<byte> rowH = block.V7.AsByte();

        // Each output row is assembled by OR-ing shuffled input rows together;
        // single elements are patched in with Extract/Insert.

        // row0 - A0 A1 B0 C0 B1 A2 A3 B2
        Vector128<short> rowA0 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 0))).AsInt16();
        Vector128<short> rowB0 = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (16 * 1))).AsInt16();
        Vector128<short> row0 = Sse2.Or(rowA0, rowB0);
        Vector128<short> rowC0 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 2))).AsInt16();
        row0 = Sse2.Or(row0, rowC0);

        // row1 - C1 D0 E0 D1 C2 B3 A4 A5
        Vector128<short> rowA1 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 3))).AsInt16();
        Vector128<short> rowC1 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 4))).AsInt16();
        Vector128<short> row1 = Sse2.Or(rowA1, rowC1);
        Vector128<short> rowD1 = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (16 * 5))).AsInt16();
        row1 = Sse2.Or(row1, rowD1);
        row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 3), 5).AsInt16();
        row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 0), 2).AsInt16();

        // row2
        Vector128<short> rowE2 = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (16 * 6))).AsInt16();
        Vector128<short> rowF2 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 7))).AsInt16();
        Vector128<short> row2 = Sse2.Or(rowE2, rowF2);
        row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 4), 0).AsInt16();
        row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 3), 1).AsInt16();
        row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 2), 2).AsInt16();
        row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 0), 5).AsInt16();

        // row3
        Vector128<short> rowA3 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 8))).AsInt16();
        Vector128<short> rowB3 = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (16 * 9))).AsInt16();
        Vector128<short> row3 = Sse2.Or(rowA3, rowB3);
        Vector128<short> rowC3 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 10))).AsInt16();
        row3 = Sse2.Or(row3, rowC3);

        // This mask is shared by the rowD part of row3 and the rowE part of row4.
        Vector128<byte> shuffleRowD3EF = Sse2.LoadVector128(maskPtr + (16 * 11));
        Vector128<short> rowD3 = Ssse3.Shuffle(rowD, shuffleRowD3EF).AsInt16();
        row3 = Sse2.Or(row3, rowD3);

        // row4
        Vector128<short> rowE4 = Ssse3.Shuffle(rowE, shuffleRowD3EF).AsInt16();
        Vector128<short> rowF4 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 12))).AsInt16();
        Vector128<short> row4 = Sse2.Or(rowE4, rowF4);
        Vector128<short> rowG4 = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (16 * 13))).AsInt16();
        row4 = Sse2.Or(row4, rowG4);
        Vector128<short> rowH4 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 14))).AsInt16();
        row4 = Sse2.Or(row4, rowH4);

        // row5
        Vector128<short> rowC5 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 15))).AsInt16();
        Vector128<short> rowD5 = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16();
        Vector128<short> row5 = Sse2.Or(rowC5, rowD5);
        row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 7), 2).AsInt16();
        row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 5), 5).AsInt16();
        row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 4), 6).AsInt16();
        row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 3), 7).AsInt16();

        // row6
        Vector128<short> rowE6 = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (16 * 17))).AsInt16();
        Vector128<short> rowF6 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 18))).AsInt16();
        Vector128<short> row6 = Sse2.Or(rowE6, rowF6);
        Vector128<short> rowH6 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 19))).AsInt16();
        row6 = Sse2.Or(row6, rowH6);
        row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 7), 5).AsInt16();
        row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 4), 2).AsInt16();

        // row7
        Vector128<short> rowG7 = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (16 * 20))).AsInt16();
        Vector128<short> rowH7 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 21))).AsInt16();
        Vector128<short> row7 = Sse2.Or(rowG7, rowH7);
        row7 = Sse2.Insert(row7.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 7), 4).AsInt16();

        // Store only after every output row is computed, because the inputs were read from the same block.
        block.V0 = row0;
        block.V1 = row1;
        block.V2 = row2;
        block.V3 = row3;
        block.V4 = row4;
        block.V5 = row5;
        block.V6 = row6;
        block.V7 = row7;
    }
}
private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) { DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!"); ref Vector256 <float> aBase = ref a.V0;
/// <summary>
/// Applies zig zag ordering for given 8x8 matrix using AVX2 cpu intrinsics.
/// </summary>
/// <remarks>
/// Requires Avx2 support. Shuffle masks are loaded in 32 byte steps from <c>AvxShuffleMasks</c>.
/// </remarks>
/// <param name="block">Input matrix; all four row pairs (V01, V23, V45, V67) are overwritten with the reordered values.</param>
public static unsafe void ApplyZigZagOrderingAvx2(ref Block8x8 block) {
// Asserts that the AVX2 instructions used below are available.
DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
fixed(byte *shuffleVectorsPtr = AvxShuffleMasks) {
// Byte views of the four input row pairs.
Vector256 <byte> rowsAB = block.V01.AsByte();
Vector256 <byte> rowsCD = block.V23.AsByte();
Vector256 <byte> rowsEF = block.V45.AsByte();
Vector256 <byte> rowsGH = block.V67.AsByte();
// Each output row pair is assembled by permuting 32-bit elements of an input pair (PermuteVar8x32),
// shuffling the bytes of the result (Shuffle), and OR-ing the partial results together.
// Permute masks and intermediate permutes are reused across output pairs, as their names indicate.
// rows 0 1
Vector256 <int> rows_AB01_EF01_CD23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32();
Vector256 <byte> row01_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
row01_AB = Avx2.Shuffle(row01_AB, Avx.LoadVector256(shuffleVectorsPtr + (1 * 32))).AsByte();
Vector256 <int> rows_CD01_GH23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32();
Vector256 <byte> row01_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (3 * 32))).AsByte();
// The permuted EF pair is kept because rows 2 3 shuffle it again.
Vector256 <byte> row0123_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
Vector256 <byte> row01_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32))).AsByte();
Vector256 <byte> row01 = Avx2.Or(Avx2.Or(row01_AB, row01_CD), row01_EF);
// rows 2 3
Vector256 <int> rows_AB23_CD45_EF67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32();
// The permuted AB pair is kept because rows 4 5 shuffle it again.
Vector256 <byte> row2345_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
Vector256 <byte> row23_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32))).AsByte();
Vector256 <byte> row23_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte();
row23_CD = Avx2.Shuffle(row23_CD, Avx.LoadVector256(shuffleVectorsPtr + (7 * 32))).AsByte();
Vector256 <byte> row23_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32))).AsByte();
// The permuted GH pair is kept because rows 4 5 shuffle it again.
Vector256 <byte> row2345_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte();
Vector256 <byte> row23_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32)).AsByte());
Vector256 <byte> row23 = Avx2.Or(Avx2.Or(row23_AB, row23_CD), Avx2.Or(row23_EF, row23_GH));
// rows 4 5
Vector256 <byte> row45_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32)).AsByte());
// The permuted CD pair is kept because rows 6 7 shuffle it again.
Vector256 <byte> row4567_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
Vector256 <byte> row45_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)).AsByte());
Vector256 <int> rows_EF45_GH67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32();
Vector256 <byte> row45_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32)).AsByte());
Vector256 <byte> row45_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32)).AsByte());
Vector256 <byte> row45 = Avx2.Or(Avx2.Or(row45_AB, row45_CD), Avx2.Or(row45_EF, row45_GH));
// rows 6 7
Vector256 <byte> row67_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32)).AsByte());
Vector256 <byte> row67_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte();
row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)).AsByte());
Vector256 <byte> row67_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte();
row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32)).AsByte());
Vector256 <byte> row67 = Avx2.Or(Avx2.Or(row67_CD, row67_EF), row67_GH);
// Store only after every output pair is computed, because the inputs were read from the same block.
block.V01 = row01.AsInt16();
block.V23 = row23.AsInt16();
block.V45 = row45.AsInt16();
block.V67 = row67.AsInt16();
} }
/// <summary> /// SIMD convert using buffers of sizes divisable by 8. /// </summary> internal static void ConvertCore(ComponentValues values, Span <Vector4> result) { DebugGuard.IsTrue(result.Length % 8 == 0, nameof(result), "result.Length should be divisable by 8!"); ref Vector4Pair yBase =
/// <summary>
/// Builds the color decoder that corresponds to the requested TIFF color type.
/// </summary>
/// <param name="configuration">Forwarded to the decoders whose constructors accept a configuration.</param>
/// <param name="memoryAllocator">Forwarded to the YCbCr decoder.</param>
/// <param name="colorType">Selects which decoder implementation is constructed.</param>
/// <param name="bitsPerSample">Bits per sample; checked against the color type and forwarded to the generic decoders.</param>
/// <param name="colorMap">Palette data; guarded to be non-null for palette images and null for every other guarded type.</param>
/// <param name="referenceBlackAndWhite">Forwarded to the YCbCr decoder.</param>
/// <param name="ycbcrCoefficients">Forwarded to the YCbCr decoder.</param>
/// <param name="ycbcrSubSampling">Forwarded to the YCbCr decoder.</param>
/// <param name="byteOrder">Decoders for multi-byte samples are told whether this equals <c>ByteOrder.BigEndian</c>.</param>
/// <returns>A new decoder instance for <paramref name="colorType"/>.</returns>
/// <remarks>
/// A color type without a matching case throws the exception produced by
/// <c>TiffThrowHelper.InvalidColorType</c>.
/// </remarks>
public static TiffBaseColorDecoder<TPixel> Create(
    Configuration configuration,
    MemoryAllocator memoryAllocator,
    TiffColorType colorType,
    TiffBitsPerSample bitsPerSample,
    ushort[] colorMap,
    Rational[] referenceBlackAndWhite,
    Rational[] ycbcrCoefficients,
    ushort[] ycbcrSubSampling,
    ByteOrder byteOrder)
{
    bool isBigEndian = byteOrder == ByteOrder.BigEndian;

    // The same few sanity checks recur for almost every color type, so they are named once here.
    void ExpectNoColorMap() => DebugGuard.IsTrue(colorMap == null, "colorMap");

    void ExpectSingleChannel() => DebugGuard.IsTrue(bitsPerSample.Channels == 1, "bitsPerSample");

    void ExpectGray(int bits) => DebugGuard.IsTrue(
        bitsPerSample.Channels == 1 && bitsPerSample.Channel0 == bits,
        "bitsPerSample");

    void ExpectRgb(int bits) => DebugGuard.IsTrue(
        bitsPerSample.Channels == 3 && bitsPerSample.Channel2 == bits && bitsPerSample.Channel1 == bits && bitsPerSample.Channel0 == bits,
        "bitsPerSample");

    switch (colorType)
    {
        // Grayscale, zero is white.
        case TiffColorType.WhiteIsZero:
            ExpectSingleChannel();
            ExpectNoColorMap();
            return new WhiteIsZeroTiffColor<TPixel>(bitsPerSample);

        case TiffColorType.WhiteIsZero1:
            ExpectGray(1);
            ExpectNoColorMap();
            return new WhiteIsZero1TiffColor<TPixel>();

        case TiffColorType.WhiteIsZero4:
            ExpectGray(4);
            ExpectNoColorMap();
            return new WhiteIsZero4TiffColor<TPixel>();

        case TiffColorType.WhiteIsZero8:
            ExpectGray(8);
            ExpectNoColorMap();
            return new WhiteIsZero8TiffColor<TPixel>();

        case TiffColorType.WhiteIsZero16:
            ExpectGray(16);
            ExpectNoColorMap();
            return new WhiteIsZero16TiffColor<TPixel>(isBigEndian);

        case TiffColorType.WhiteIsZero24:
            ExpectGray(24);
            ExpectNoColorMap();
            return new WhiteIsZero24TiffColor<TPixel>(isBigEndian);

        case TiffColorType.WhiteIsZero32:
            ExpectGray(32);
            ExpectNoColorMap();
            return new WhiteIsZero32TiffColor<TPixel>(isBigEndian);

        case TiffColorType.WhiteIsZero32Float:
            ExpectGray(32);
            ExpectNoColorMap();
            return new WhiteIsZero32FloatTiffColor<TPixel>(isBigEndian);

        // Grayscale, zero is black.
        case TiffColorType.BlackIsZero:
            ExpectSingleChannel();
            ExpectNoColorMap();
            return new BlackIsZeroTiffColor<TPixel>(bitsPerSample);

        case TiffColorType.BlackIsZero1:
            ExpectGray(1);
            ExpectNoColorMap();
            return new BlackIsZero1TiffColor<TPixel>();

        case TiffColorType.BlackIsZero4:
            ExpectGray(4);
            ExpectNoColorMap();
            return new BlackIsZero4TiffColor<TPixel>();

        case TiffColorType.BlackIsZero8:
            ExpectGray(8);
            ExpectNoColorMap();
            return new BlackIsZero8TiffColor<TPixel>(configuration);

        case TiffColorType.BlackIsZero16:
            ExpectGray(16);
            ExpectNoColorMap();
            return new BlackIsZero16TiffColor<TPixel>(configuration, isBigEndian);

        case TiffColorType.BlackIsZero24:
            ExpectGray(24);
            ExpectNoColorMap();
            return new BlackIsZero24TiffColor<TPixel>(isBigEndian);

        case TiffColorType.BlackIsZero32:
            ExpectGray(32);
            ExpectNoColorMap();
            return new BlackIsZero32TiffColor<TPixel>(isBigEndian);

        case TiffColorType.BlackIsZero32Float:
            ExpectGray(32);
            ExpectNoColorMap();
            return new BlackIsZero32FloatTiffColor<TPixel>(isBigEndian);

        // RGB. The plain Rgb type carries no bits-per-sample check.
        case TiffColorType.Rgb:
            ExpectNoColorMap();
            return new RgbTiffColor<TPixel>(bitsPerSample);

        case TiffColorType.Rgb222:
            ExpectRgb(2);
            ExpectNoColorMap();
            return new RgbTiffColor<TPixel>(bitsPerSample);

        case TiffColorType.Rgb444:
            ExpectRgb(4);
            ExpectNoColorMap();
            return new Rgb444TiffColor<TPixel>();

        case TiffColorType.Rgb888:
            ExpectRgb(8);
            ExpectNoColorMap();
            return new Rgb888TiffColor<TPixel>(configuration);

        case TiffColorType.Rgb101010:
            ExpectRgb(10);
            ExpectNoColorMap();
            return new RgbTiffColor<TPixel>(bitsPerSample);

        case TiffColorType.Rgb121212:
            ExpectRgb(12);
            ExpectNoColorMap();
            return new RgbTiffColor<TPixel>(bitsPerSample);

        case TiffColorType.Rgb141414:
            ExpectRgb(14);
            ExpectNoColorMap();
            return new RgbTiffColor<TPixel>(bitsPerSample);

        case TiffColorType.Rgb161616:
            ExpectRgb(16);
            ExpectNoColorMap();
            return new Rgb161616TiffColor<TPixel>(configuration, isBigEndian: isBigEndian);

        case TiffColorType.Rgb242424:
            ExpectRgb(24);
            ExpectNoColorMap();
            return new Rgb242424TiffColor<TPixel>(isBigEndian: isBigEndian);

        case TiffColorType.Rgb323232:
            ExpectRgb(32);
            ExpectNoColorMap();
            return new Rgb323232TiffColor<TPixel>(isBigEndian: isBigEndian);

        case TiffColorType.RgbFloat323232:
            ExpectRgb(32);
            ExpectNoColorMap();
            return new RgbFloat323232TiffColor<TPixel>(isBigEndian: isBigEndian);

        // Palette images are the only ones that need the color map.
        case TiffColorType.PaletteColor:
            DebugGuard.NotNull(colorMap, "colorMap");
            return new PaletteTiffColor<TPixel>(bitsPerSample, colorMap);

        case TiffColorType.YCbCr:
            return new YCbCrTiffColor<TPixel>(memoryAllocator, referenceBlackAndWhite, ycbcrCoefficients, ycbcrSubSampling);

        default:
            throw TiffThrowHelper.InvalidColorType(colorType.ToString());
    }
}
/// <summary>
/// Rewrites an 8x8 block in place so that its rows hold the coefficients in the
/// transposing zig-zag layout, using SSSE3 byte shuffles plus SSE2 word inserts.
/// </summary>
/// <param name="block">The matrix to reorder; all eight rows are read first and then overwritten.</param>
public static unsafe void ApplyTransposingZigZagOrderingSsse3(ref Block8x8 block)
{
    DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");

    // Every entry of the shuffle table is one 128-bit vector.
    const int MaskSize = 16;

    fixed (byte* masks = &MemoryMarshal.GetReference(SseShuffleMasks))
    {
        // Capture the source rows (named A..H in the layout comments below) before anything is written back.
        Vector128<byte> srcA = block.V0.AsByte();
        Vector128<byte> srcB = block.V1.AsByte();
        Vector128<byte> srcC = block.V2.AsByte();
        Vector128<byte> srcD = block.V3.AsByte();
        Vector128<byte> srcE = block.V4.AsByte();
        Vector128<byte> srcF = block.V5.AsByte();
        Vector128<byte> srcG = block.V6.AsByte();
        Vector128<byte> srcH = block.V7.AsByte();

        // 16-bit views of the rows from which single elements are extracted.
        Vector128<ushort> wordsA = srcA.AsUInt16();
        Vector128<ushort> wordsC = srcC.AsUInt16();
        Vector128<ushort> wordsD = srcD.AsUInt16();
        Vector128<ushort> wordsE = srcE.AsUInt16();
        Vector128<ushort> wordsF = srcF.AsUInt16();
        Vector128<ushort> wordsH = srcH.AsUInt16();

        // row0 - A0 B0 A1 A2 B1 C0 D0 C1
        Vector128<ushort> zz0 = Sse2.Or(
            Sse2.Or(
                Ssse3.Shuffle(srcA, Sse2.LoadVector128(masks + (0 * MaskSize))),
                Ssse3.Shuffle(srcB, Sse2.LoadVector128(masks + (1 * MaskSize)))),
            Ssse3.Shuffle(srcC, Sse2.LoadVector128(masks + (2 * MaskSize)))).AsUInt16();
        zz0 = Sse2.Insert(zz0, Sse2.Extract(wordsD, 0), 6);

        // row1 - B2 A3 A4 B3 C2 D1 E0 F0
        Vector128<ushort> zz1 = Sse2.Or(
            Ssse3.Shuffle(srcA, Sse2.LoadVector128(masks + (3 * MaskSize))),
            Ssse3.Shuffle(srcB, Sse2.LoadVector128(masks + (4 * MaskSize)))).AsUInt16();
        zz1 = Sse2.Insert(zz1, Sse2.Extract(wordsC, 2), 4);
        zz1 = Sse2.Insert(zz1, Sse2.Extract(wordsD, 1), 5);
        zz1 = Sse2.Insert(zz1, Sse2.Extract(wordsE, 0), 6);
        zz1 = Sse2.Insert(zz1, Sse2.Extract(wordsF, 0), 7);

        // row2 - E1 D2 C3 B4 A5 A6 B5 C4
        Vector128<ushort> zz2 = Sse2.Or(
            Sse2.Or(
                Ssse3.Shuffle(srcA, Sse2.LoadVector128(masks + (5 * MaskSize))),
                Ssse3.Shuffle(srcB, Sse2.LoadVector128(masks + (6 * MaskSize)))),
            Ssse3.Shuffle(srcC, Sse2.LoadVector128(masks + (7 * MaskSize)))).AsUInt16();
        zz2 = Sse2.Insert(zz2, Sse2.Extract(wordsD, 2), 1);
        zz2 = Sse2.Insert(zz2, Sse2.Extract(wordsE, 1), 0);

        // row3 - D3 E2 F1 G0 H0 G1 F2 E3
        Vector128<ushort> zz3 = Sse2.Or(
            Sse2.Or(
                Ssse3.Shuffle(srcE, Sse2.LoadVector128(masks + (8 * MaskSize))),
                Ssse3.Shuffle(srcF, Sse2.LoadVector128(masks + (9 * MaskSize)))),
            Ssse3.Shuffle(srcG, Sse2.LoadVector128(masks + (10 * MaskSize)))).AsUInt16();
        zz3 = Sse2.Insert(zz3, Sse2.Extract(wordsD, 3), 0);
        zz3 = Sse2.Insert(zz3, Sse2.Extract(wordsH, 0), 4);

        // row4 - D4 C5 B6 A7 B7 C6 D5 E4
        Vector128<ushort> zz4 = Sse2.Or(
            Sse2.Or(
                Ssse3.Shuffle(srcB, Sse2.LoadVector128(masks + (11 * MaskSize))),
                Ssse3.Shuffle(srcC, Sse2.LoadVector128(masks + (12 * MaskSize)))),
            Ssse3.Shuffle(srcD, Sse2.LoadVector128(masks + (13 * MaskSize)))).AsUInt16();
        zz4 = Sse2.Insert(zz4, Sse2.Extract(wordsA, 7), 3);
        zz4 = Sse2.Insert(zz4, Sse2.Extract(wordsE, 4), 7);

        // row5 - F3 G2 H1 H2 G3 F4 E5 D6
        Vector128<ushort> zz5 = Sse2.Or(
            Sse2.Or(
                Ssse3.Shuffle(srcF, Sse2.LoadVector128(masks + (14 * MaskSize))),
                Ssse3.Shuffle(srcG, Sse2.LoadVector128(masks + (15 * MaskSize)))),
            Ssse3.Shuffle(srcH, Sse2.LoadVector128(masks + (16 * MaskSize)))).AsUInt16();
        zz5 = Sse2.Insert(zz5, Sse2.Extract(wordsD, 6), 7);
        zz5 = Sse2.Insert(zz5, Sse2.Extract(wordsE, 5), 6);

        // row6 - C7 D7 E6 F5 G4 H3 H4 G5
        Vector128<ushort> zz6 = Sse2.Or(
            Ssse3.Shuffle(srcG, Sse2.LoadVector128(masks + (17 * MaskSize))),
            Ssse3.Shuffle(srcH, Sse2.LoadVector128(masks + (18 * MaskSize)))).AsUInt16();
        zz6 = Sse2.Insert(zz6, Sse2.Extract(wordsC, 7), 0);
        zz6 = Sse2.Insert(zz6, Sse2.Extract(wordsD, 7), 1);
        zz6 = Sse2.Insert(zz6, Sse2.Extract(wordsE, 6), 2);
        zz6 = Sse2.Insert(zz6, Sse2.Extract(wordsF, 5), 3);

        // row7 - F6 E7 F7 G6 H5 H6 G7 H7
        Vector128<ushort> zz7 = Sse2.Or(
            Sse2.Or(
                Ssse3.Shuffle(srcF, Sse2.LoadVector128(masks + (19 * MaskSize))),
                Ssse3.Shuffle(srcG, Sse2.LoadVector128(masks + (20 * MaskSize)))),
            Ssse3.Shuffle(srcH, Sse2.LoadVector128(masks + (21 * MaskSize)))).AsUInt16();
        zz7 = Sse2.Insert(zz7, Sse2.Extract(wordsE, 7), 1);

        // Store the reordered rows only after every source row has been consumed.
        block.V0 = zz0.AsInt16();
        block.V1 = zz1.AsInt16();
        block.V2 = zz2.AsInt16();
        block.V3 = zz3.AsInt16();
        block.V4 = zz4.AsInt16();
        block.V5 = zz5.AsInt16();
        block.V6 = zz6.AsInt16();
        block.V7 = zz7.AsInt16();
    }
}
/// <summary>
/// Rewrites an 8x8 block in place so that its row pairs hold the coefficients in the
/// transposing zig-zag layout, using AVX2 32-bit permutes followed by byte shuffles.
/// </summary>
/// <param name="block">The matrix to reorder; all four row pairs are read first and then overwritten.</param>
public static unsafe void ApplyTransposingZigZagOrderingAvx2(ref Block8x8 block)
{
    DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");

    // Every entry of the mask table is one 256-bit vector.
    const int MaskSize = 32;

    fixed (byte* masks = &MemoryMarshal.GetReference(AvxShuffleMasks))
    {
        // Source row pairs (rows are named A..H in the layout comments), viewed as
        // 32-bit elements because that is what PermuteVar8x32 operates on.
        Vector256<int> pairAB = block.V01.AsInt32();
        Vector256<int> pairCD = block.V23.AsInt32();
        Vector256<int> pairEF = block.V45.AsInt32();
        Vector256<int> pairGH = block.V67.AsInt32();

        /* row01 - A0 B0 A1 A2 B1 C0 D0 C1 | B2 A3 A4 B3 C2 D1 E0 F0 */
        Vector256<int> permute01ForABCD = Avx.LoadVector256(masks + (0 * MaskSize)).AsInt32();
        Vector256<byte> out01FromAB = Avx2.Shuffle(
            Avx2.PermuteVar8x32(pairAB, permute01ForABCD).AsByte(),
            Avx.LoadVector256(masks + (1 * MaskSize)));
        Vector256<byte> out01FromCD = Avx2.Shuffle(
            Avx2.PermuteVar8x32(pairCD, permute01ForABCD).AsByte(),
            Avx.LoadVector256(masks + (2 * MaskSize)));

        // This permute control is applied to EF here and to CD again for rows 2-3.
        Vector256<int> permuteEF0123AndCD23 = Avx.LoadVector256(masks + (3 * MaskSize)).AsInt32();

        // The permuted EF pair feeds both rows 0-1 and rows 2-3.
        Vector256<byte> permutedEFFor0123 = Avx2.PermuteVar8x32(pairEF, permuteEF0123AndCD23).AsByte();
        Vector256<byte> out01FromEF = Avx2.Shuffle(permutedEFFor0123, Avx.LoadVector256(masks + (4 * MaskSize)));

        Vector256<byte> out01 = Avx2.Or(out01FromAB, Avx2.Or(out01FromCD, out01FromEF));

        /* row23 - E1 D2 C3 B4 A5 A6 B5 C4 | D3 E2 F1 G0 H0 G1 F2 E3 */
        Vector256<int> permute2345ForABGH = Avx.LoadVector256(masks + (5 * MaskSize)).AsInt32();

        // The permuted AB and GH pairs feed both rows 2-3 and rows 4-5.
        Vector256<byte> permutedABFor2345 = Avx2.PermuteVar8x32(pairAB, permute2345ForABGH).AsByte();
        Vector256<byte> out23FromAB = Avx2.Shuffle(permutedABFor2345, Avx.LoadVector256(masks + (6 * MaskSize)));
        Vector256<byte> out23FromCD = Avx2.Shuffle(
            Avx2.PermuteVar8x32(pairCD, permuteEF0123AndCD23).AsByte(),
            Avx.LoadVector256(masks + (7 * MaskSize)));
        Vector256<byte> out23FromEF = Avx2.Shuffle(permutedEFFor0123, Avx.LoadVector256(masks + (8 * MaskSize)));
        Vector256<byte> permutedGHFor2345 = Avx2.PermuteVar8x32(pairGH, permute2345ForABGH).AsByte();
        Vector256<byte> out23FromGH = Avx2.Shuffle(permutedGHFor2345, Avx.LoadVector256(masks + (9 * MaskSize)));

        Vector256<byte> out23 = Avx2.Or(Avx2.Or(out23FromAB, out23FromCD), Avx2.Or(out23FromEF, out23FromGH));

        /* row45 - D4 C5 B6 A7 B7 C6 D5 E4 | F3 G2 H1 H2 G3 F4 E5 D6 */
        Vector256<byte> out45FromAB = Avx2.Shuffle(permutedABFor2345, Avx.LoadVector256(masks + (10 * MaskSize)));
        Vector256<int> permuteCD4567AndEF45 = Avx.LoadVector256(masks + (11 * MaskSize)).AsInt32();

        // The permuted CD pair feeds both rows 4-5 and rows 6-7.
        Vector256<byte> permutedCDFor4567 = Avx2.PermuteVar8x32(pairCD, permuteCD4567AndEF45).AsByte();
        Vector256<byte> out45FromCD = Avx2.Shuffle(permutedCDFor4567, Avx.LoadVector256(masks + (12 * MaskSize)));
        Vector256<byte> out45FromEF = Avx2.Shuffle(
            Avx2.PermuteVar8x32(pairEF, permuteCD4567AndEF45).AsByte(),
            Avx.LoadVector256(masks + (13 * MaskSize)));
        Vector256<byte> out45FromGH = Avx2.Shuffle(permutedGHFor2345, Avx.LoadVector256(masks + (14 * MaskSize)));

        Vector256<byte> out45 = Avx2.Or(Avx2.Or(out45FromAB, out45FromCD), Avx2.Or(out45FromEF, out45FromGH));

        /* row67 - C7 D7 E6 F5 G4 H3 H4 G5 | F6 E7 F7 G6 H5 H6 G7 H7 */
        Vector256<byte> out67FromCD = Avx2.Shuffle(permutedCDFor4567, Avx.LoadVector256(masks + (15 * MaskSize)));
        Vector256<int> permute67ForEFGH = Avx.LoadVector256(masks + (16 * MaskSize)).AsInt32();
        Vector256<byte> out67FromEF = Avx2.Shuffle(
            Avx2.PermuteVar8x32(pairEF, permute67ForEFGH).AsByte(),
            Avx.LoadVector256(masks + (17 * MaskSize)));
        Vector256<byte> out67FromGH = Avx2.Shuffle(
            Avx2.PermuteVar8x32(pairGH, permute67ForEFGH).AsByte(),
            Avx.LoadVector256(masks + (18 * MaskSize)));

        Vector256<byte> out67 = Avx2.Or(out67FromCD, Avx2.Or(out67FromEF, out67FromGH));

        // Store the reordered row pairs only after every source pair has been consumed.
        block.V01 = out01.AsInt16();
        block.V23 = out23.AsInt16();
        block.V45 = out45.AsInt16();
        block.V67 = out67.AsInt16();
    }
}