/// <summary>
/// Multiplies two dense matrices and returns the resultant matrix (using tiling).
/// </summary>
/// <param name="accelerator">The Accelerator to run the multiplication on</param>
/// <param name="a">A dense MxK matrix</param>
/// <param name="b">A dense KxN matrix</param>
/// <returns>A dense MxN matrix</returns>
/// <exception cref="ArgumentException">
/// Thrown when the column count of <paramref name="a"/> differs from the row count of <paramref name="b"/>.
/// </exception>
static float[,] MatrixMultiplyTiled(Accelerator accelerator, float[,] a, float[,] b)
{
    var m = a.GetLength(0);
    var ka = a.GetLength(1);
    var kb = b.GetLength(0);
    var n = b.GetLength(1);

    if (ka != kb)
    {
        // b has kb rows and n columns, so it is reported as "kb x n" (rows first),
        // in the same order used for a.
        throw new ArgumentException($"Cannot multiply {m}x{ka} matrix by {kb}x{n} matrix", nameof(b));
    }

    var kernel = accelerator.LoadStreamKernel<
        GroupedIndex2,
        ArrayView2D<float>,
        ArrayView2D<float>,
        ArrayView2D<float>>(MatrixMultiplyTiledKernel);

    // One thread per output element, grouped in TILE_SIZE x TILE_SIZE tiles;
    // the group count is rounded up so partial tiles at the edges are covered.
    var groupSize = new Index2(TILE_SIZE, TILE_SIZE);
    var numGroups = new Index2((m + TILE_SIZE - 1) / TILE_SIZE, (n + TILE_SIZE - 1) / TILE_SIZE);
    var launchDimension = new GroupedIndex2(numGroups, groupSize);

    using (var aBuffer = accelerator.Allocate<float>(m, ka))
    using (var bBuffer = accelerator.Allocate<float>(ka, n))
    using (var cBuffer = accelerator.Allocate<float>(m, n))
    {
        aBuffer.CopyFrom(a, Index2.Zero, Index2.Zero, aBuffer.Extent);
        bBuffer.CopyFrom(b, Index2.Zero, Index2.Zero, bBuffer.Extent);

        kernel(launchDimension, aBuffer, bBuffer, cBuffer);

        // Wait for the kernel to finish before reading the result back.
        accelerator.Synchronize();

        return cBuffer.GetAs2DArray();
    }
}
/// <summary>
/// Runs the grouped 2D entry-point kernel once per power-of-two group edge length,
/// from 1 up to the square root of the accelerator's maximum threads per group,
/// and verifies the output buffer against values computed on the host.
/// </summary>
/// <param name="length">Number of groups along each of the two grid dimensions.</param>
public void GroupedIndex2EntryPoint(int length)
{
    var maxGroupEdge = (int)Math.Sqrt(Accelerator.MaxNumThreadsPerGroup);

    for (int groupEdge = 1; groupEdge <= maxGroupEdge; groupEdge <<= 1)
    {
        var stride = new Index2(groupEdge, groupEdge);
        var gridDimension = new Index2(length, length);
        var extent = new GroupedIndex2(gridDimension, stride);

        using var buffer = Accelerator.Allocate<int>(extent.Size);
        buffer.MemSetToZero(Accelerator.DefaultStream);
        Execute(extent, buffer.View, stride, extent.GridIdx);

        // Build the host-side reference: every addressed cell stores its own linear index.
        var expected = new int[extent.Size];
        int gridCells = length * length;
        int groupCells = groupEdge * groupEdge;

        for (int gridLinear = 0; gridLinear < gridCells; ++gridLinear)
        {
            var gridIdx = Index2.ReconstructIndex(gridLinear, extent.GridIdx);
            var groupOrigin = gridIdx * stride;

            for (int groupLinear = 0; groupLinear < groupCells; ++groupLinear)
            {
                var groupIdx = Index2.ReconstructIndex(groupLinear, extent.GroupIdx);
                var linear = (groupOrigin + groupIdx).ComputeLinearIndex(extent.GridIdx);
                expected[linear] = linear;
            }
        }

        Verify(buffer, expected);
    }
}
/// <summary>
/// Kernel that computes a linear index from the thread's grid and group indices
/// and stores that index at the same position of the output view.
/// </summary>
/// <param name="index">The grouped (grid, group) index of the current thread.</param>
/// <param name="output">The view receiving one value per thread.</param>
/// <param name="stride">Multiplier applied to the grid index in each dimension.</param>
/// <param name="extent">Extent whose X component is used as the row pitch.</param>
internal static void GroupedIndex2EntryPointKernel(
    GroupedIndex2 index,
    ArrayView<int> output,
    Index2 stride,
    Index2 extent)
{
    var grid = index.GridIdx;
    var group = index.GroupIdx;

    var column = grid.X * stride.X + group.X;
    var row = grid.Y * stride.Y + group.Y;

    var linear = row * extent.X + column;
    output[linear] = linear;
}
/// <summary>
/// The tiled matrix multiplication kernel that runs on the accelerated device.
/// </summary>
/// <remarks>
/// Each thread accumulates one element of the output. The shared dimension is walked
/// in steps of TILE_SIZE; in every step the group cooperatively loads one tile of each
/// input into shared memory (zero-padded outside the matrix bounds) and then each thread
/// adds the partial dot product of its tile row and tile column.
/// </remarks>
/// <param name="index">Current matrix index</param>
/// <param name="aView">An input matrix of size MxK</param>
/// <param name="bView">An input matrix of size KxN</param>
/// <param name="cView">An output matrix of size MxN</param>
static void MatrixMultiplyTiledKernel(
    GroupedIndex2 index,
    ArrayView2D<float> aView,
    ArrayView2D<float> bView,
    ArrayView2D<float> cView)
{
    var global = index.ComputeGlobalIndex();
    var x = index.GroupIdx.X;
    var y = index.GroupIdx.Y;

    var aTile = SharedMemory.Allocate2D<float>(TILE_SIZE, TILE_SIZE);
    var bTile = SharedMemory.Allocate2D<float>(TILE_SIZE, TILE_SIZE);
    var sum = 0.0f;

    // The tile offset i advances along the dimension shared by both inputs: it is the
    // second index of aView (bounded by aView.Height) and the first index of bView
    // (bounded by bView.Width). The loop therefore has to cover aView.Height; bounding
    // it by aView.Width would drop part of the dot product whenever M is smaller than K.
    for (var i = 0; i < aView.Height; i += TILE_SIZE)
    {
        // Load one element of each tile, padding with zero outside the matrices so the
        // inner loop does not need bounds checks.
        if (global.X < aView.Width && y + i < aView.Height)
        {
            aTile[x, y] = aView[global.X, y + i];
        }
        else
        {
            aTile[x, y] = 0;
        }

        if (x + i < bView.Width && global.Y < bView.Height)
        {
            bTile[x, y] = bView[x + i, global.Y];
        }
        else
        {
            bTile[x, y] = 0;
        }

        // Both tiles must be fully written before any thread reads them.
        Group.Barrier();

        for (var k = 0; k < TILE_SIZE; k++)
        {
            sum += aTile[new Index2(x, k)] * bTile[new Index2(k, y)];
        }

        // All reads must finish before the next iteration overwrites the tiles.
        Group.Barrier();
    }

    // Threads of edge groups that fall outside the output matrix write nothing.
    if (global.X < cView.Width && global.Y < cView.Height)
    {
        cView[global] = sum;
    }
}
/// <summary>
/// Kernel that folds the points of one caster patch into the encoded horizon of a target point.
/// The grid index selects the target point (X = sample, Y = line); the group Y index selects
/// the caster line processed by the thread. Each thread loops over all samples of its caster line.
/// </summary>
/// <param name="index">Grouped index: grid X/Y pick the target sample/line, group Y picks the caster line.</param>
/// <param name="points">Caster points, three consecutive floats (x, y, z) per point.</param>
/// <param name="matrices">
/// Twelve floats per target point, read as four x coefficients, four y coefficients and four
/// z coefficients; the fourth coefficient of each set is added as a constant term.
/// </param>
/// <param name="horizon">
/// Encoded horizon values, <c>horizon_shared.Length</c> entries per target point.
/// Read at the start and written back at the end of the kernel.
/// </param>
/// <param name="test_array">Debug output; entries 0..11 are written only by the thread with all indices zero.</param>
/// <param name="horizon_shared">Group-shared working copy of the target point's horizon.</param>
static void ShadowKernel1(
    GroupedIndex2 index,
    ArrayView<float> points,
    ArrayView<float> matrices,
    ArrayView<int> horizon,
    ArrayView<float> test_array,
    [SharedMemory(1440)]
    ArrayView<int> horizon_shared)
{
    var target_line = index.GridIdx.Y;
    var target_sample = index.GridIdx.X;
    var caster_line = index.GroupIdx.Y;

    // The kernel only distinguishes threads by their group Y index, so it expects groups
    // that are one thread wide in X; the only X index such a group can have is 0.
    Debug.Assert(index.GroupIdx.X == 0);

    // Copy horizon for a[target_line,target_sample] into shared memory
    {
        var dim = Group.Dimension.Y;
        var len = horizon_shared.Length;
        var passes = (len + (dim - 1)) / dim;
        var offset = (target_line * TerrainPatch.DefaultSize + target_sample) * len;
        for (var pass = 0; pass < passes; pass++)
        {
            var ptr = pass * dim + caster_line;
            if (ptr < len)  // divergence
            {
                horizon_shared[ptr] = horizon[ptr + offset];
            }
        }
    }

    // The shared copy must be complete before any thread starts updating it.
    Group.Barrier();

    // Copy the matrix into registers
    var pos = (target_line * TerrainPatch.DefaultSize + target_sample) * 12;
    var row0x = matrices[pos++];
    var row1x = matrices[pos++];
    var row2x = matrices[pos++];
    var row3x = matrices[pos++];

    var row0y = matrices[pos++];
    var row1y = matrices[pos++];
    var row2y = matrices[pos++];
    var row3y = matrices[pos++];

    var row0z = matrices[pos++];
    var row1z = matrices[pos++];
    var row2z = matrices[pos++];
    var row3z = matrices[pos];

    for (var caster_sample = 0; caster_sample < TerrainPatch.DefaultSize; caster_sample++)
    {
        // Fetch the other point in local frame
        var points_offset = (caster_line * TerrainPatch.DefaultSize + caster_sample) * 3;
        var x_patch = points[points_offset];
        var y_patch = points[points_offset + 1];
        var z_patch = points[points_offset + 2];

        // Transform the point to the local frame
        var x = x_patch * row0x + y_patch * row1x + z_patch * row2x + row3x;
        var y = x_patch * row0y + y_patch * row1y + z_patch * row2y + row3y;
        var z = x_patch * row0z + y_patch * row1z + z_patch * row2z + row3z;

        // Adjust for solar array height (this is temporary, and I'm not sure we want this in the final version)
        z -= ObserverHeight;  // meters

        var azimuth = GPUMath.Atan2(y, x) + GPUMath.PI;  // [0,2 PI]
        var alen = GPUMath.Sqrt(x * x + y * y);
        var slope = z / alen;

        // Encode the slope as an integer so Atomic.Max can be used:
        // clamp to [-2, 2], scale to [-0.5, 0.5], then multiply by 1e6.
        var slopem = slope > 2f ? 2f : slope;
        slopem = slopem < -2f ? -2f : slopem;
        slopem = slopem / 4f;
        var slopei = (int)(slopem * 1000000);

        // Map the azimuth onto the bucket range [0, 1439], rounding to the nearest bucket.
        var horizon_index = (int)(0.5f + 1439 * (azimuth / (2f * GPUMath.PI)));

        // Several threads of the group may hit the same bucket, hence the atomic maximum.
        Atomic.Max(horizon_shared.GetVariableView(horizon_index), slopei);

        // Debug dump of the intermediate values for the very first thread and sample.
        if (caster_sample == 0 && caster_line == 0 && target_line == 0 && target_sample == 0)
        {
            test_array[0] = x_patch;
            test_array[1] = y_patch;
            test_array[2] = z_patch;
            test_array[3] = x;
            test_array[4] = y;
            test_array[5] = z;
            test_array[6] = slope;
            test_array[7] = slopem;
            test_array[8] = slopei;
            test_array[9] = row3x;
            test_array[10] = row3y;
            test_array[11] = row3z;
        }
    }

    // All updates to the shared horizon must land before it is copied back.
    Group.Barrier();

    // Copy the shared horizon back to a[target_line,target_sample]
    {
        var dim = Group.Dimension.Y;
        var len = horizon_shared.Length;
        var passes = (len + (dim - 1)) / dim;
        var offset = (target_line * TerrainPatch.DefaultSize + target_sample) * len;
        for (var pass = 0; pass < passes; pass++)
        {
            var ptr = pass * dim + caster_line;
            if (ptr < len)  // divergence
            {
                horizon[ptr + offset] = horizon_shared[ptr];
            }
        }
    }
}
/// <summary>
/// Update the horizons of a patch based on a list of shadow casters.
/// The horizons will be in slope, not angle, format
/// </summary>
/// <remarks>
/// Requires a CUDA accelerator; when none is found a message is written to the console
/// and the target is left untouched. Nothing is done when <paramref name="casters"/> is empty.
/// </remarks>
/// <param name="target">The patch whose horizon buffers are read, updated on the GPU and written back.</param>
/// <param name="casters">The patches whose points are tested against the target's horizons.</param>
public void UpdateHorizons(TerrainPatch target, List<TerrainPatch> casters)
{
    Debug.Assert(Terrain != null);
    if (casters.Count < 1)
    {
        return;
    }

    using (var context = new Context())
    {
        // AcceleratorId falls back to its default value when no CUDA device is listed,
        // so the type is checked again below.
        AcceleratorId aid = Accelerator.Accelerators.FirstOrDefault(id => id.AcceleratorType == AcceleratorType.Cuda);
        if (aid.AcceleratorType != AcceleratorType.Cuda)
        {
            Console.WriteLine(@"There is no CUDA accelerator present. Doing nothing.");
            return;
        }

        using (var accelerator = Accelerator.Create(context, aid))
        {
            target.FillPoints(Terrain);
            target.FillMatricesRelativeToPoint(Terrain, target.Points[0][0]);

            // Matrices
            var cpu_matrices_size = target.Height * target.Width * 12;
            var basePoint = target.Points[0][0];
            var cpu_matrices = MakeCPUMatrices(target);

            // Horizon (load from target)
            var cpu_horizon_size = target.Height * target.Width * Horizon.HorizonSamples;
            var cpu_horizon = new int[cpu_horizon_size];
            for (var line = 0; line < TerrainPatch.DefaultSize; line++)
            {
                for (var sample = 0; sample < TerrainPatch.DefaultSize; sample++)
                {
                    var offset = (line * TerrainPatch.DefaultSize + sample) * Horizon.HorizonSamples;
                    var buffer = target.Horizons[line][sample].Buffer;
                    for (var i = 0; i < Horizon.HorizonSamples; i++)
                    {
                        cpu_horizon[i + offset] = SlopeToEncoding(buffer[i]);
                    }
                }
            }

            // Caster points (sized from the first caster and reused for every caster)
            var cpu_caster_points_size = casters[0].Width * casters[0].Height * 3;
            var cpu_caster_points = new float[cpu_caster_points_size];

            // test array
            var cpu_test_array = new float[20];

            using (var gpu_matrices = accelerator.Allocate<float>(cpu_matrices_size))
            using (var gpu_horizon = accelerator.Allocate<int>(cpu_horizon_size))
            using (var gpu_caster_points = accelerator.Allocate<float>(cpu_caster_points_size))
            using (var gpu_test_array = accelerator.Allocate<float>(cpu_test_array.Length))
            {
                gpu_matrices.CopyFrom(cpu_matrices, 0, 0, cpu_matrices_size);
                gpu_horizon.CopyFrom(cpu_horizon, 0, 0, cpu_horizon_size);

                // One group per target point and one thread per caster line.
                // NOTE(review): 128 presumably equals TerrainPatch.DefaultSize, which the kernel
                // uses for its indexing - confirm before changing either value.
                var launchDimension = new GroupedIndex2(
                    new Index2(128, 128),
                    new Index2(1, 128));

                var kernel1 = accelerator.LoadSharedMemoryStreamKernel1<
                    GroupedIndex2,
                    ArrayView<float>,
                    ArrayView<float>,
                    ArrayView<int>,
                    ArrayView<float>,
                    ArrayView<int>>(ShadowKernel1);

                // The horizon stays on the device and accumulates across all casters.
                foreach (var caster in casters)
                {
                    caster.FillPoints(Terrain);
                    CopyPointsToCpuArray(caster, basePoint, cpu_caster_points);
                    gpu_caster_points.CopyFrom(cpu_caster_points, 0, 0, cpu_caster_points_size);

                    kernel1(launchDimension, gpu_caster_points, gpu_matrices, gpu_horizon, gpu_test_array);

                    // The host array is refilled on the next iteration, so wait for the kernel first.
                    accelerator.Synchronize();
                }

                // Copy out data
                gpu_horizon.CopyTo(cpu_horizon, 0, 0, cpu_horizon_size);
                gpu_test_array.CopyTo(cpu_test_array, 0, 0, cpu_test_array.Length);

                // Update the horizons
                for (var line = 0; line < TerrainPatch.DefaultSize; line++)
                {
                    for (var sample = 0; sample < TerrainPatch.DefaultSize; sample++)
                    {
                        var offset = (line * TerrainPatch.DefaultSize + sample) * Horizon.HorizonSamples;
                        var buffer = target.Horizons[line][sample].Buffer;
                        for (var i = 0; i < Horizon.HorizonSamples; i++)
                        {
                            buffer[i] = EncodingToSlope(cpu_horizon[i + offset]);
                        }
                    }
                }
            }
        }
    }
}
/// <summary>
/// Computes the global index that corresponds to the given grouped index
/// by combining its grid index and its group index.
/// </summary>
/// <param name="index">The grouped index (gridIdx, groupIdx).</param>
/// <returns>The computed global index.</returns>
public static Index2 ComputeGlobalIndex(GroupedIndex2 index) =>
    ComputeGlobalIndex(index.GridIdx, index.GroupIdx);