/// <summary>
/// Stores the given specialized value in the first element of the view.
/// </summary>
/// <typeparam name="T">The element type.</typeparam>
/// <param name="data">The target view; only element 0 is written.</param>
/// <param name="value">The specialized value to store.</param>
internal static void SpecializedExplicitValueKernel<T>(
    ArrayView1D<T, Stride1D.Dense> data,
    SpecializedValue<T> value)
    where T : unmanaged, IEquatable<T>
{
    // Unwrap the specialized value through its implicit conversion to T.
    T unwrapped = value;
    data[0] = unwrapped;
}
/// <summary>
/// Creates a camera from an existing one, applying a turn followed by a translation.
/// </summary>
/// <param name="camera">The camera whose settings are copied.</param>
/// <param name="movement">Offset added to both the origin and the look-at point.</param>
/// <param name="turn">
/// Turn angles; <c>x</c> and <c>y</c> are each applied only when non-zero.
/// </param>
public Camera(Camera camera, Vec3 movement, Vec3 turn)
{
    // Settings that are carried over unchanged.
    width = camera.width;
    height = camera.height;
    maxBounces = camera.maxBounces;
    verticalFov = camera.verticalFov;
    up = camera.up;

    // Vector from the source camera's origin to its look-at point.
    var forward = camera.lookAt - camera.origin;
    Vector4 direction = forward;

    if (turn.y != 0)
    {
        // The rotated vector is added to the current direction (it does not replace it).
        var axisY = Vec3.cross(Vec3.cross(camera.up, forward), forward);
        direction += Vector4.Transform(
            direction,
            Matrix4x4.CreateFromAxisAngle(axisY, (float)turn.y));
    }

    if (turn.x != 0)
    {
        var axisX = Vec3.cross(camera.up, forward);
        direction += Vector4.Transform(
            direction,
            Matrix4x4.CreateFromAxisAngle(axisX, (float)turn.x));
    }

    // Re-anchor the look-at point one unit along the new direction, then translate.
    lookAt = camera.origin + Vec3.unitVector(direction);
    origin = camera.origin + movement;
    lookAt += movement;

    // Derived values.
    axis = OrthoNormalBasis.fromZY(Vec3.unitVector(lookAt - origin), up);
    aspectRatio = ((float)width / (float)height);
    cameraPlaneDist = 1.0f / XMath.Tan(camera.verticalFov * XMath.PI / 360.0f);
    reciprocalHeight = 1.0f / height;
    reciprocalWidth = 1.0f / width;
}
/// <summary>
/// Stores the given specialized value in the first element of the view.
/// The index parameter is not used.
/// </summary>
/// <typeparam name="T">The element type.</typeparam>
/// <param name="_">Unused index parameter.</param>
/// <param name="data">The target view; only element 0 is written.</param>
/// <param name="value">The specialized value to store.</param>
internal static void SpecializedImplicitValueKernel<T>(
    Index1 _,
    ArrayView<T> data,
    SpecializedValue<T> value)
    where T : unmanaged, IEquatable<T>
{
    // Unwrap the specialized value through its implicit conversion to T.
    T unwrapped = value;
    data[0] = unwrapped;
}
/// <summary>
/// Generic kernel showing that specialization also works with generic kernels:
/// writes the specialized value to the element at the global X index.
/// </summary>
/// <typeparam name="TValue">The element type.</typeparam>
/// <param name="view">The target view.</param>
/// <param name="specialized">The specialized value to store.</param>
static void SpecializedGenericKernel<TValue>(
    ArrayView<TValue> view,
    SpecializedValue<TValue> specialized)
    where TValue : unmanaged, IEquatable<TValue>
{
    view[Grid.GlobalIndex.X] = specialized;
}
/// <summary>
/// Squared-distance kernel in which the number of coordinates <paramref name="c"/>
/// is passed as a specialized value. Per the original note, this is the same as
/// CudaKernelOptimised2 except for that meta-constant, and results are written
/// as two-component values (<c>IlReal2</c>).
/// </summary>
/// <param name="mSquaredDistances">Output view, written through an <c>IlReal2</c> cast.</param>
/// <param name="mCoordinates">Input coordinates, read at <c>k * n + index</c>.</param>
/// <param name="c">Number of coordinates per point (specialized).</param>
/// <param name="n">Bound used for all range checks.</param>
private static void IlGpuKernelConstants(
    ArrayView2D<Real> mSquaredDistances,
    ArrayView<Real> mCoordinates,
    SpecializedValue<int> c,
    int n)
{
    // Dynamic shared memory is split into an I tile followed by a J tile.
    var sharedBuffer = SharedMemory.GetDynamic<Real>();
    var tileI = sharedBuffer.GetSubView(0, c * Group.DimX);
    var tileJ = sharedBuffer.GetSubView(c * Group.DimX);

    var baseI = Grid.IdxY * Group.DimX;
    var baseJ = Grid.IdxX * Group.DimX;

    // Stage the coordinates of both tiles, one coordinate index at a time.
    for (int dim = 0; dim != c; ++dim)
    {
        if (baseI + Group.IdxX < n)
        {
            tileI[dim * Group.DimX + Group.IdxX] = mCoordinates[dim * n + baseI + Group.IdxX];
        }

        if (baseJ + Group.IdxX < n)
        {
            tileJ[dim * Group.DimX + Group.IdxX] = mCoordinates[dim * n + baseJ + Group.IdxX];
        }
    }

    Group.Barrier();

    var firstRow = Group.IdxX / (Group.DimX / 2);
    var pairIdx = Group.IdxX % (Group.DimX / 2);

    // Threads whose pair of columns starts outside the valid range have nothing to write.
    if (!(baseJ + pairIdx * 2 < n))
    {
        return;
    }

    var tileJPairs = tileJ.Cast<IlReal2>();

    // Non-short-circuit '&' is kept on purpose to match the original loop condition.
    for (int row = firstRow; (row < Group.DimX) & (baseI + row < n); row += 2)
    {
        var sum = default(IlReal2);

        for (int dim = 0; dim != c; ++dim)
        {
            var valueI = tileI[dim * Group.DimX + row];
            var valueJ = tileJPairs[(dim * Group.DimX / 2) + pairIdx];
            var delta = new IlReal2(valueI - valueJ.X, valueI - valueJ.Y);
            sum += delta * delta;
        }

        var target = mSquaredDistances.Cast<IlReal2>();
        target[baseJ / 2 + pairIdx, baseI + row] = sum;
    }
}
/// <summary>
/// A simple kernel with a specialized kernel parameter: writes the specialized
/// value to the element at the global X index.
/// </summary>
/// <param name="view">The target view.</param>
/// <param name="specialized">
/// This parameter will be replaced by a 'constant' value during the first kernel call.
/// </param>
public static void SpecializedKernel(
    ArrayView<int> view,
    SpecializedValue<int> specialized)
{
    view[Grid.GlobalIndex.X] = specialized;
}
/// <summary>
/// Runs the three specialization scenarios on every available accelerator.
/// </summary>
static void Main()
{
    using (var context = new Context())
    {
        // For each available accelerator...
        foreach (var acceleratorId in Accelerator.Accelerators)
        {
            // Create default accelerator for the given accelerator id
            using (var accelerator = Accelerator.Create(context, acceleratorId))
            {
                Console.WriteLine($"Performing operations on {accelerator}");
                int groupSize = accelerator.MaxNumThreadsPerGroup;

                RunSimpleSpecializationScenario(accelerator, groupSize);
                RunCustomStructSpecializationScenario(accelerator, groupSize);
                RunGenericSpecializationScenario(accelerator, groupSize);
            }
        }
    }
}

// Scenario 1: simple version - launches SpecializedKernel with the constants 2, 23 and 42.
private static void RunSimpleSpecializationScenario(Accelerator accelerator, int groupSize)
{
    using (var buffer = accelerator.Allocate<int>(groupSize))
    {
        var kernel = accelerator.LoadStreamKernel<
            ArrayView<int>,
            SpecializedValue<int>>(SpecializedKernel);

        foreach (var constant in new[] { 2, 23, 42 })
        {
            kernel((1, groupSize), buffer.View, SpecializedValue.New(constant));
        }
    }
}

// Scenario 2: custom structure - launches SpecializedCustomStructKernel with two struct constants.
private static void RunCustomStructSpecializationScenario(Accelerator accelerator, int groupSize)
{
    using (var buffer = accelerator.Allocate<int>(groupSize))
    {
        var kernel = accelerator.LoadStreamKernel<
            ArrayView<int>,
            SpecializedValue<CustomStruct>>(SpecializedCustomStructKernel);

        var constants = new[]
        {
            new CustomStruct(1, 7),
            new CustomStruct(23, 42),
        };

        foreach (var constant in constants)
        {
            kernel((1, groupSize), buffer.View, SpecializedValue.New(constant));
        }
    }
}

// Scenario 3: generic kernel - launches SpecializedGenericKernel with the constants 23L and 42L.
private static void RunGenericSpecializationScenario(Accelerator accelerator, int groupSize)
{
    using (var buffer = accelerator.Allocate<long>(groupSize))
    {
        var kernel = accelerator.LoadStreamKernel<
            ArrayView<long>,
            SpecializedValue<long>>(SpecializedGenericKernel);

        foreach (var constant in new[] { 23L, 42L })
        {
            kernel((1, groupSize), buffer.View, SpecializedValue.New(constant));
        }
    }
}
/// <summary>
/// Squared-distance kernel; per the original note, the same as KernelConstants but
/// using both local and shared memory to increase the effective shared memory.
/// The I coordinates are staged via <c>SharedMemory.Allocate</c>; the J coordinates
/// are held in a per-thread <c>IlReal2</c> array.
/// </summary>
/// <param name="mSquaredDistances">Output view, written through an <c>IlReal2</c> cast.</param>
/// <param name="mCoordinates">Input coordinates.</param>
/// <param name="dimX">Tile width used for indexing (specialized).</param>
/// <param name="c">Number of coordinates per point (specialized).</param>
/// <param name="n">Bound used for all range checks.</param>
private static void IlGpuKernelLocalMemory(
    ArrayView2D<Real> mSquaredDistances,
    ArrayView<Real> mCoordinates,
    SpecializedValue<int> dimX,
    SpecializedValue<int> c,
    int n)
{
    var tileI = SharedMemory.Allocate<Real>(c * dimX);
    var localJ = new IlReal2[c.Value];

    var baseI = Grid.IdxY * dimX;
    var baseJ = Grid.IdxX * dimX;

    var firstRow = Group.IdxX / (dimX / 2);
    var pairIdx = Group.IdxX % (dimX / 2);

    // Only threads whose pair of J columns starts inside the valid range produce output.
    var hasWork = baseJ + pairIdx * 2 < n;

    for (int dim = 0; dim != c.Value; ++dim)
    {
        if (baseI + Group.IdxX < n)
        {
            tileI[dim * dimX + Group.IdxX] = mCoordinates[dim * n + baseI + Group.IdxX];
        }

        if (hasWork)
        {
            // Read two adjacent J coordinates at once through the IlReal2 view.
            var coordinatePairs = mCoordinates.Cast<IlReal2>();
            localJ[dim] = coordinatePairs[(dim * n + baseJ) / 2 + pairIdx];
        }
    }

    Group.Barrier();

    if (!hasWork)
    {
        return;
    }

    for (int row = firstRow; row < dimX && baseI + row < n; row += 2)
    {
        var sum = default(IlReal2);

        for (int dim = 0; dim != c.Value; ++dim)
        {
            var valueI = tileI[dim * dimX + row];
            var valueJ = localJ[dim];
            var delta = new IlReal2(valueI - valueJ.X, valueI - valueJ.Y);
            sum += delta * delta;
        }

        var target = mSquaredDistances.Cast<IlReal2>();
        target[baseJ / 2 + pairIdx, baseI + row] = sum;
    }
}
/// <summary>
/// Kernel that multiplies two UInt64 values (the specialized constant and the
/// thread index) and stores the high and low 64-bit halves as a UInt128 value.
/// </summary>
/// <param name="index">The thread index; also used as the second factor.</param>
/// <param name="buffer">The output view, written at <paramref name="index"/>.</param>
/// <param name="constant">The specialized first factor.</param>
public static void MultiplyUInt128Kernel(
    Index1D index,
    ArrayView<UInt128> buffer,
    SpecializedValue<ulong> constant)
{
    // NB: index.X must be converted to ulong so that %2 uses the correct PTX register type.
    ulong factor = (ulong)index.X;
    ulong fixedFactor = constant.Value;

    CudaAsm.Emit("mul.hi.u64 %0, %1, %2;", out ulong upperHalf, fixedFactor, factor);
    CudaAsm.Emit("mul.lo.u64 %0, %1, %2;", out ulong lowerHalf, fixedFactor, factor);

    buffer[index] = new UInt128(upperHalf, lowerHalf);
}
/// <summary>
/// Kernel specialized on a user-defined struct: stores the sum of the struct's
/// two values at the global X index.
/// </summary>
/// <remarks>
/// The specialization functionality supports user-defined types, as long as they
/// are value types and implement the IEquatable interface (and have useful
/// GetHashCode and Equals implementations).
/// </remarks>
/// <param name="view">The target view.</param>
/// <param name="specialized">The specialized custom structure.</param>
public static void SpecializedCustomStructKernel(
    ArrayView<int> view,
    SpecializedValue<CustomStruct> specialized)
{
    // A specialized value converts implicitly to its non-specialized value,
    // but not the other way around.
    CustomStruct constants = specialized;

    // Because the value is specialized, the additional optimization passes can
    // constant-propagate this sum into a store of a single constant value.
    var sum = constants.Value1 + constants.Value2;
    view[Grid.GlobalIndex.X] = sum;
}
/// <summary>
/// Creates a camera from its position, target, up vector, resolution and vertical field of view.
/// </summary>
/// <param name="origin">The camera position.</param>
/// <param name="lookAt">The point the camera looks at.</param>
/// <param name="up">The up vector.</param>
/// <param name="width">The image width; stored as a specialized value.</param>
/// <param name="height">The image height; stored as a specialized value.</param>
/// <param name="verticalFov">
/// The vertical field of view; multiplied by PI / 360 before taking the tangent.
/// </param>
public Camera(Vec3 origin, Vec3 lookAt, Vec3 up, int width, int height, float verticalFov)
{
    // Placement.
    this.origin = origin;
    this.lookAt = lookAt;
    this.up = up;
    axis = OrthoNormalBasis.fromZY(Vec3.unitVector(lookAt - origin), up);

    // Resolution and the values derived from it.
    this.width = new SpecializedValue<int>(width);
    this.height = new SpecializedValue<int>(height);
    aspectRatio = ((float)width / (float)height);
    reciprocalWidth = 1.0f / width;
    reciprocalHeight = 1.0f / height;

    // Field of view and the resulting distance to the camera plane.
    this.verticalFov = verticalFov;
    cameraPlaneDist = 1.0f / XMath.Tan(verticalFov * XMath.PI / 360.0f);
}
/// <summary>
/// Demonstrates the mul.hi.u64 and mul.lo.u64 inline PTX instructions by launching
/// <c>MultiplyUInt128Kernel</c> over a 1024-element buffer with <c>ulong.MaxValue</c>
/// as the specialized factor, then printing every result.
/// </summary>
/// <param name="accelerator">The Cuda accelerator to run on.</param>
static void MultiplyUInt128(CudaAccelerator accelerator)
{
    using var buffer = accelerator.Allocate1D<UInt128>(1024);

    var multiply = accelerator.LoadAutoGroupedStreamKernel<
        Index1D,
        ArrayView<UInt128>,
        SpecializedValue<ulong>>(MultiplyUInt128Kernel);

    var extent = (int)buffer.Length;
    var factor = SpecializedValue.New(ulong.MaxValue);
    multiply(extent, buffer.View, factor);

    var results = buffer.GetAsArray1D();
    int i = 0;
    while (i < results.Length)
    {
        Console.WriteLine($"[{i}] = {results[i]}");
        i++;
    }
}
/// <summary>
/// Allocates device buffers, launches the given kernel on a square grid of
/// 128-wide groups, reports the elapsed time and copies the result back.
/// </summary>
/// <param name="gpu">The Cuda accelerator to run on.</param>
/// <param name="mSquaredDistances">Host array receiving the n-by-n result.</param>
/// <param name="mCoordinates">Host coordinates uploaded to the device.</param>
/// <param name="c">Number of coordinates per point; passed as a specialized value.</param>
/// <param name="n">Number of points; also the extent of each output dimension.</param>
/// <param name="name">Label passed to the performance printout.</param>
/// <param name="kernelFunc">The kernel to launch.</param>
private static void IlGpuOptimisedImpl(
    CudaAccelerator gpu,
    Real[] mSquaredDistances,
    Real[] mCoordinates,
    int c,
    int n,
    string name,
    Action<ArrayView2D<Real>, ArrayView<Real>, SpecializedValue<int>, SpecializedValue<int>, int> kernelFunc)
{
    using var deviceDistances = gpu.Allocate<Real>(n, n);
    using var deviceCoordinates = gpu.Allocate(mCoordinates);

    // Timing starts after the allocations and stops before the copy back to the host.
    var stopwatch = Stopwatch.StartNew();

    const int blockSize = 128;
    var blocksPerSide = Util.DivUp(n, blockSize);
    var launchParameters = ((blocksPerSide, blocksPerSide, 1), (blockSize, 1, 1));

    gpu.Launch(
        kernelFunc,
        gpu.DefaultStream,
        launchParameters,
        deviceDistances.View,
        deviceCoordinates.View,
        SpecializedValue.New(blockSize),
        SpecializedValue.New(c),
        n);
    gpu.Synchronize();

    Util.PrintPerformance(stopwatch, name, n, c, n);

    deviceDistances.CopyTo(mSquaredDistances, (0, 0), 0, (n, n));
}
/// <summary>
/// The actual unique kernel implementation. Each group waits for its turn, flags
/// the elements of its tile that differ from their predecessor, and then its first
/// thread compacts the flagged elements to the front of the input view.
/// </summary>
/// <typeparam name="T">The element type.</typeparam>
/// <typeparam name="TComparisonOperation">The comparison operation.</typeparam>
/// <param name="input">The input view; compacted in place.</param>
/// <param name="output">The output view; element 0 stores the new length.</param>
/// <param name="sequentialGroupExecutor">
/// The sequential group executor used to run the groups one after another.
/// </param>
/// <param name="tileSize">The tile size (number of flags allocated per group).</param>
/// <param name="numIterationsPerGroup">
/// The number of iterations per group.
/// </param>
internal static void UniqueKernel<T, TComparisonOperation>(
    ArrayView<T> input,
    ArrayView<long> output,
    SequentialGroupExecutor sequentialGroupExecutor,
    SpecializedValue<int> tileSize,
    Index1D numIterationsPerGroup)
    where T : unmanaged
    where TComparisonOperation : struct, IComparisonOperation<T>
{
    TComparisonOperation comparer = default;
    var firstGrid = Grid.IdxX == 0;
    var tile = new TileInfo(input.IntLength, numIterationsPerGroup);

    // Sync groups and wait for the current one to become active
    sequentialGroupExecutor.Wait();

    var keepFlags = SharedMemory.Allocate<bool>(tileSize);
    var tileBase = Grid.ComputeGlobalIndex(Grid.IdxX, 0);

    for (int current = tile.StartIndex; current < tile.MaxLength; current += Group.DimX)
    {
        // True only for the first thread while it handles the first element of the tile.
        var atTileStart = Group.IsFirstThread && current == tile.StartIndex;

        if (atTileStart && firstGrid)
        {
            // The very first element has no predecessor and is always kept.
            keepFlags[0] = true;
        }
        else
        {
            // At the start of a later tile, compare against the last element kept
            // so far (output[0] - 1); otherwise against the direct predecessor.
            var previous = atTileStart ? output[0] - 1 : current - 1;
            var differs = comparer.Compare(input[current], input[previous]) != 0;
            keepFlags[current - tileBase] = differs;
        }
    }

    Group.Barrier();

    if (Group.IsFirstThread)
    {
        var writePos = firstGrid ? 0 : output[0];
        var flagCount =
            XMath.Min(tileBase + keepFlags.IntLength, tile.MaxLength) - tileBase;

        for (var slot = 0; slot < flagCount; slot++)
        {
            if (!keepFlags[slot])
            {
                continue;
            }

            input[writePos] = input[tileBase + slot];
            writePos++;
        }

        output[0] = writePos;
    }

    MemoryFence.DeviceLevel();
    Group.Barrier();
    sequentialGroupExecutor.Release();
}
/// <summary> /// Performs the first radix-sort pass. /// </summary> /// <typeparam name="T">The element type.</typeparam> /// <typeparam name="TOperation">The radix-sort operation.</typeparam> /// <typeparam name="TSpecialization">The specialization type.</typeparam> /// <param name="view">The input view to use.</param> /// <param name="counter">The global counter view.</param> /// <param name="groupSize">The number of threads in the group.</param> /// <param name="numGroups">The number of virtually launched groups.</param> /// <param name="paddedLength">The padded length of the input view.</param> /// <param name="shift">The bit shift to use.</param> internal static void RadixSortKernel1 <T, TOperation, TSpecialization>( ArrayView <T> view, ArrayView <int> counter, SpecializedValue <int> groupSize, int numGroups, int paddedLength, int shift) where T : unmanaged where TOperation : struct, IRadixSortOperation <T> where TSpecialization : struct, IRadixSortSpecialization { TSpecialization specialization = default; var scanMemory = SharedMemory.Allocate <int>( groupSize * specialization.UnrollFactor); int gridIdx = Grid.IdxX; for ( int i = Grid.GlobalIndex.X; i < paddedLength; i += GridExtensions.GridStrideLoopStride) { bool inRange = i < view.Length; // Read value from global memory TOperation operation = default; T value = operation.DefaultValue; if (inRange) { value = view[i]; } var bits = operation.ExtractRadixBits( value, shift, specialization.UnrollFactor - 1); for (int j = 0; j < specialization.UnrollFactor; ++j) { scanMemory[Group.IdxX + groupSize * j] = 0; } if (inRange) { scanMemory[Group.IdxX + groupSize * bits] = 1; } Group.Barrier(); for (int j = 0; j < specialization.UnrollFactor; ++j) { var address = Group.IdxX + groupSize * j; scanMemory[address] = GroupExtensions.ExclusiveScan <int, AddInt32>(scanMemory[address]); } Group.Barrier(); if (Group.IdxX == Group.DimX - 1) { // Write counters to global memory for (int j = 0; j < specialization.UnrollFactor; 
++j) { ref var newOffset = ref scanMemory[Group.IdxX + groupSize * j]; newOffset += Utilities.Select(inRange & j == bits, 1, 0); counter[j * numGroups + gridIdx] = newOffset; } } Group.Barrier(); var gridSize = gridIdx * Group.DimX; Index1 pos = gridSize + scanMemory[Group.IdxX + groupSize * bits] - Utilities.Select(inRange & Group.IdxX == Group.DimX - 1, 1, 0); for (int j = 1; j <= bits; ++j) { pos += scanMemory[groupSize * j - 1] + Utilities.Select(j - 1 == bits, 1, 0); } // Pre-sort the current value into the corresponding segment if (inRange) { view[pos] = value; } Group.Barrier(); gridIdx += Grid.DimX; }
/// <summary>
/// Applies one scalar operation, selected by <paramref name="operation"/>, to the
/// input element at <paramref name="index"/> and stores the result at the same
/// index of the output. If the value matches none of the handled operations,
/// nothing is written.
/// </summary>
/// <param name="index">The element index.</param>
/// <param name="OutPut">The output view.</param>
/// <param name="Input">The input view.</param>
/// <param name="Scalar">The scalar operand.</param>
/// <param name="operation">The specialized operation code, cast to <c>Operations</c>.</param>
static void ScalarConsecutiveOperationKernel(
    Index1 index,
    ArrayView<float> OutPut,
    ArrayView<float> Input,
    float Scalar,
    SpecializedValue<int> operation)
{
    var selected = (Operations)operation.Value;

    if (selected == Operations.multiplication)
    {
        OutPut[index] = Input[index] * Scalar;
    }
    else if (selected == Operations.addition)
    {
        OutPut[index] = Input[index] + Scalar;
    }
    else if (selected == Operations.subtraction)
    {
        OutPut[index] = Input[index] - Scalar;
    }
    else if (selected == Operations.flipSubtraction)
    {
        OutPut[index] = Scalar - Input[index];
    }
    else if (selected == Operations.division)
    {
        OutPut[index] = Input[index] / Scalar;
    }
    else if (selected == Operations.inverseDivision)
    {
        OutPut[index] = Scalar / Input[index];
    }
    else if (selected == Operations.power)
    {
        OutPut[index] = XMath.Pow(Input[index], Scalar);
    }
    else if (selected == Operations.powerFlipped)
    {
        OutPut[index] = XMath.Pow(Scalar, Input[index]);
    }
    else if (selected == Operations.squareOfDiffs)
    {
        OutPut[index] = XMath.Pow((Input[index] - Scalar), 2f);
    }
}