/// <summary>
/// Local side of a ping-pong latency measurement over two 4-byte shared-memory
/// cells ("A" and "B"). Each round stores 1 into A, spins until B reads 2 and
/// then resets B to 1. The first batch of rounds is an untimed warm-up; the
/// second batch is timed and the average round-trip delay is printed in
/// microseconds.
/// </summary>
public unsafe static void RunLocal()
{
    // Rounds per batch (warm-up batch and timed batch use the same count).
    const int roundsPerBatch = 50;
    // Second argument handed to FastSpinUntil; its unit is defined by that helper.
    const int spinArgument = 2000;

    var mem = new SharedMemory(_channel.GetSubChannel("mem"));
    int* ptr1 = (int*)mem.Allocate(_channel.GetSubChannel("A"), 4, 4).Pointer;
    int* ptr2 = (int*)mem.Allocate(_channel.GetSubChannel("B"), 4, 4).Pointer;
    Func<bool> replied = () => Volatile.Read(ref *ptr2) == 2;

    // Do not start before B holds 1.
    FastSpinUntil(() => Volatile.Read(ref *ptr2) == 1, spinArgument);

    // Warm-up batch (not timed).
    for (int i = 0; i < roundsPerBatch; ++i)
    {
        Volatile.Write(ref *ptr1, 1);
        FastSpinUntil(replied, spinArgument);
        Volatile.Write(ref *ptr2, 1);
    }

    // Timed batch.
    var clock = Stopwatch.StartNew();
    for (int i = 0; i < roundsPerBatch; ++i)
    {
        Volatile.Write(ref *ptr1, 1);
        FastSpinUntil(replied, spinArgument);
        Volatile.Write(ref *ptr2, 1);
    }

    // Stopwatch.ElapsedTicks is expressed in Stopwatch.Frequency units, which is
    // not necessarily the 100 ns TimeSpan tick. Elapsed.Ticks is always in
    // TimeSpan ticks, so it is the value that may be divided by
    // TimeSpan.TicksPerMillisecond.
    var time = clock.Elapsed.Ticks / (float)TimeSpan.TicksPerMillisecond * 1000;
    Console.WriteLine("Average round-trip delay: {0} us", time / roundsPerBatch);
}
/// <summary>
/// Computes an inclusive scan over the values contributed by all threads and
/// returns the shared-memory view that holds the scanned values.
/// </summary>
/// <typeparam name="T">The element type.</typeparam>
/// <typeparam name="TScanOperation">The scan operation to apply.</typeparam>
/// <typeparam name="TImpl">The implementation providing thread information and barriers.</typeparam>
/// <param name="value">The value contributed by the current thread.</param>
/// <returns>The view containing one scanned entry per participating thread.</returns>
private static ArrayView<T> InclusiveScanImplementation<T, TScanOperation, TImpl>(T value)
    where T : unmanaged
    where TScanOperation : struct, IScanReduceOperation<T>
    where TImpl : struct, IILFunctionImplementation
{
    TImpl implementation = default;

    // Every thread publishes its value in its own slot.
    var slots = SharedMemory.Allocate<T>(implementation.MaxNumThreads);
    Debug.Assert(
        implementation.ThreadDimension <= implementation.MaxNumThreads,
        "Invalid group/warp size");
    slots[implementation.ThreadIndex] = value;
    implementation.Barrier();

    // The first thread folds the slots from left to right, in place.
    if (implementation.IsFirstThread)
    {
        TScanOperation operation = default;
        int slot = 1;
        while (slot < implementation.ThreadDimension)
        {
            slots[slot] = operation.Apply(slots[slot - 1], slots[slot]);
            ++slot;
        }
    }
    implementation.Barrier();

    return slots;
}
/// <summary>
/// Allocates 200 blocks of 16 32-bit integers each, checks that every integer
/// reads as zero before it is written, fills each block with a value pattern
/// and finally reads the pattern back.
/// </summary>
public void RunTest()
{
    const int firstId = 100;
    const int intsPerBlock = 16;
    const int blockCount = 200;

    var memory = new SharedMemory(Channel.FromHash("Test"));
    var pointers = new List<IntPtr>();

    // Allocate all blocks up front.
    for (int block = 0; block < blockCount; ++block)
    {
        var allocation = memory.Allocate(firstId + block, 4 * intsPerBlock, 4);
        pointers.Add(allocation.Pointer);
    }

    // Each slot must read as zero before its first write.
    for (int block = 0; block < blockCount; ++block)
    {
        IntPtr basePointer = pointers[block];
        for (int slot = 0; slot < intsPerBlock; ++slot)
        {
            Assert.AreEqual(0, Marshal.ReadInt32(basePointer, slot * 4));
            Marshal.WriteInt32(basePointer, slot * 4, block + slot);
        }
    }

    // Every slot must return the value stored above.
    for (int block = 0; block < blockCount; ++block)
    {
        IntPtr basePointer = pointers[block];
        for (int slot = 0; slot < intsPerBlock; ++slot)
        {
            Assert.AreEqual(block + slot, Marshal.ReadInt32(basePointer, slot * 4));
        }
    }
}
/// <summary>
/// Remote side of the ping-pong exchange over the shared-memory cells "A" and
/// "B". It first stores 1 into B, then answers 100 requests: for each one it
/// spins until A reads 1, clears A to 0 and stores 2 into B.
/// </summary>
public unsafe static void RunRemote()
{
    var mem = new SharedMemory(_channel.GetSubChannel("mem"));
    int* request = (int*)mem.Allocate(_channel.GetSubChannel("A"), 4, 4).Pointer;
    int* reply = (int*)mem.Allocate(_channel.GetSubChannel("B"), 4, 4).Pointer;
    Func<bool> requestPending = () => Volatile.Read(ref *request) == 1;

    // Initial value of the reply cell.
    Volatile.Write(ref *reply, 1);

    for (int remaining = 100; remaining > 0; --remaining)
    {
        FastSpinUntil(requestPending, 2000);
        Volatile.Write(ref *request, 0);
        Volatile.Write(ref *reply, 2);
    }
}
/// <summary>
/// Squared-distance kernel variant that keeps one set of coordinates in a
/// shared-memory buffer and the other set in a per-thread array (the original
/// note describes this as using both local and shared memory to increase the
/// effective shared memory). Results are written two values at a time.
/// </summary>
/// <param name="mSquaredDistances">The output view receiving the accumulated squared differences.</param>
/// <param name="mCoordinates">The input coordinates, indexed as <c>k * n + point</c>.</param>
/// <param name="dimX">The specialized tile width.</param>
/// <param name="c">The specialized number of coordinate components per point.</param>
/// <param name="n">The number of points.</param>
private static void IlGpuKernelLocalMemory(
    ArrayView2D<Real> mSquaredDistances,
    ArrayView<Real> mCoordinates,
    SpecializedValue<int> dimX,
    SpecializedValue<int> c,
    int n)
{
    var tileRows = SharedMemory.Allocate<Real>(c * dimX);
    var columnPair = new IlReal2[c.Value];

    var lane = Group.IdxX;
    var halfTile = dimX / 2;
    var rowBase = Grid.IdxY * dimX;
    var columnBase = Grid.IdxX * dimX;
    var firstRow = lane / halfTile;
    var pairIndex = lane % halfTile;

    // A thread only produces output when its pair of columns starts inside the data.
    var ownsColumnPair = columnBase + pairIndex * 2 < n;
    var loadsRow = rowBase + lane < n;

    for (int k = 0; k != c.Value; ++k)
    {
        if (loadsRow)
        {
            tileRows[k * dimX + lane] = mCoordinates[k * n + rowBase + lane];
        }

        if (ownsColumnPair)
        {
            var pairedCoordinates = mCoordinates.Cast<IlReal2>();
            columnPair[k] = pairedCoordinates[(k * n + columnBase) / 2 + pairIndex];
        }
    }

    Group.Barrier();

    if (!ownsColumnPair)
    {
        return;
    }

    // Rows are visited with a stride of two, starting at firstRow.
    for (int row = firstRow; row < dimX && rowBase + row < n; row += 2)
    {
        var squared = default(IlReal2);
        for (int k = 0; k != c.Value; ++k)
        {
            var rowValue = tileRows[k * dimX + row];
            var pairValue = columnPair[k];
            var delta = new IlReal2(rowValue - pairValue.X, rowValue - pairValue.Y);
            squared += delta * delta;
        }

        var pairedDistances = mSquaredDistances.Cast<IlReal2>();
        pairedDistances[columnBase / 2 + pairIndex, rowBase + row] = squared;
    }
}
/// <summary>
/// Reduces the given value across all threads of the group and returns the
/// combined result to every thread.
/// </summary>
/// <typeparam name="T">The element type.</typeparam>
/// <typeparam name="TReduction">The reduction operation.</typeparam>
/// <param name="value">The value contributed by the current thread.</param>
/// <returns>The reduction of all contributed values.</returns>
public static T AllReduce<T, TReduction>(T value)
    where T : unmanaged
    where TReduction : IScanReduceOperation<T>
{
    // The atomic updates are spread over a fixed number of shared-memory
    // banks; the final combination below is written out for exactly this count.
    const int NumMemoryBanks = 4;

    var banks = SharedMemory.Allocate<T>(NumMemoryBanks);
    var warp = Warp.ComputeWarpIdx(Group.IdxX);
    var lane = Warp.LaneIdx;
    TReduction reduction = default;

    // The first warp resets every bank to the identity element.
    if (warp == 0)
    {
        int bank = lane;
        while (bank < NumMemoryBanks)
        {
            banks[bank] = reduction.Identity;
            bank += Warp.WarpSize;
        }
    }
    Group.Barrier();

    // Reduce inside each warp, then let lane 0 merge the warp result into a bank.
    var warpResult = PTXWarpExtensions.Reduce<T, TReduction>(value);
    if (lane == 0)
    {
        reduction.AtomicApply(ref banks[warp % NumMemoryBanks], warpResult);
    }
    Group.Barrier();

    // Combine the four banks, left to right.
    T combined = reduction.Apply(
        reduction.Apply(
            reduction.Apply(banks[0], banks[1]),
            banks[2]),
        banks[3]);
    Group.Barrier();

    return combined;
}
private static T ComputeScan <T, TScanOperation, TScanImplementation>( T value, out ArrayView <T> sharedMemory) where T : unmanaged where TScanOperation : struct, IScanReduceOperation <T> where TScanImplementation : struct, IScanImplementation <T, TScanOperation> { const int SharedMemoryLength = 32; sharedMemory = SharedMemory.Allocate <T>(SharedMemoryLength); int warpIdx = Warp.WarpIdx; TScanOperation scanOperation = default; // Initialize if (Group.DimX / Warp.WarpSize < SharedMemoryLength) { if (warpIdx < 1) { sharedMemory[Group.IdxX] = scanOperation.Identity; } Group.Barrier(); } TScanImplementation scanImplementation = default; var scannedValue = scanImplementation.Scan(value); if (Warp.IsLastLane) { sharedMemory[warpIdx] = scanImplementation.ScanRightBoundary( scannedValue, value); } Group.Barrier(); // Reduce results again in the first warp if (warpIdx < 1) { ref T sharedBoundary = ref sharedMemory[Group.IdxX]; sharedBoundary = PTXWarpExtensions.InclusiveScan <T, TScanOperation>( sharedBoundary); }
/// <summary>
/// Reduces the given value across all threads of a reduction segment and
/// returns the combined result.
/// </summary>
/// <typeparam name="T">The element type.</typeparam>
/// <typeparam name="TReduction">The reduction operation.</typeparam>
/// <typeparam name="TImpl">The implementation providing segment information and barriers.</typeparam>
/// <param name="value">The value contributed by the current thread.</param>
/// <returns>The content of the current segment's slot after all contributions.</returns>
public static T AllReduce<T, TReduction, TImpl>(T value)
    where T : unmanaged
    where TReduction : IScanReduceOperation<T>
    where TImpl : struct, IILFunctionImplementation
{
    TImpl implementation = default;
    var segments = SharedMemory.Allocate<T>(implementation.ReduceSegments);
    TReduction reduction = default;

    // The slot belonging to the current thread's segment.
    ref T slot = ref segments[implementation.ReduceSegmentIndex];

    // The first thread seeds the slot with the identity element.
    if (implementation.IsFirstThread)
    {
        slot = reduction.Identity;
    }
    implementation.Barrier();

    // Every thread merges its value into the slot.
    reduction.AtomicApply(ref slot, value);
    implementation.Barrier();

    return slot;
}
/// <summary>
/// The actual unique kernel implementation. Each group flags the elements of
/// its tile that differ from their predecessor, then its first thread copies
/// the flagged elements towards the front of <paramref name="input"/> and
/// stores the new length in <paramref name="output"/>.
/// </summary>
/// <typeparam name="T">The element type.</typeparam>
/// <typeparam name="TComparisonOperation">The comparison operation.</typeparam>
/// <param name="input">The input view.</param>
/// <param name="output">The output view to store the new length.</param>
/// <param name="sequentialGroupExecutor">
/// The sequential group executor to use.
/// </param>
/// <param name="tileSize">The tile size.</param>
/// <param name="numIterationsPerGroup">
/// The number of iterations per group.
/// </param>
internal static void UniqueKernel<T, TComparisonOperation>(
    ArrayView<T> input,
    ArrayView<long> output,
    SequentialGroupExecutor sequentialGroupExecutor,
    SpecializedValue<int> tileSize,
    Index1D numIterationsPerGroup)
    where T : unmanaged
    where TComparisonOperation : struct, IComparisonOperation<T>
{
    TComparisonOperation comparer = default;
    var tile = new TileInfo(input.IntLength, numIterationsPerGroup);
    var isLeadingGrid = Grid.IdxX == 0;

    // Groups are synchronized; block until the current one becomes active.
    sequentialGroupExecutor.Wait();

    var keepFlags = SharedMemory.Allocate<bool>(tileSize);
    var tileBase = Grid.ComputeGlobalIndex(Grid.IdxX, 0);

    for (int i = tile.StartIndex; i < tile.MaxLength; i += Group.DimX)
    {
        var isTileHead = Group.IsFirstThread && i == tile.StartIndex;
        if (isTileHead && isLeadingGrid)
        {
            // The very first element has no predecessor and is always kept.
            keepFlags[0] = true;
            continue;
        }

        // The head of a later tile is compared against the element at
        // output[0] - 1 instead of its direct neighbour.
        var previous = isTileHead ? output[0] - 1 : i - 1;
        keepFlags[i - tileBase] =
            comparer.Compare(input[i], input[previous]) != 0;
    }
    Group.Barrier();

    if (Group.IsFirstThread)
    {
        var writeIndex = isLeadingGrid ? 0 : output[0];
        var count =
            XMath.Min(tileBase + keepFlags.IntLength, tile.MaxLength) - tileBase;

        for (var i = 0; i < count; i++)
        {
            if (!keepFlags[i])
            {
                continue;
            }

            input[writeIndex] = input[tileBase + i];
            writeIndex++;
        }

        output[0] = writeIndex;
    }

    MemoryFence.DeviceLevel();
    Group.Barrier();
    sequentialGroupExecutor.Release();
}
/// <summary> /// Performs the first radix-sort pass. /// </summary> /// <typeparam name="T">The element type.</typeparam> /// <typeparam name="TOperation">The radix-sort operation.</typeparam> /// <typeparam name="TSpecialization">The specialization type.</typeparam> /// <param name="view">The input view to use.</param> /// <param name="counter">The global counter view.</param> /// <param name="groupSize">The number of threads in the group.</param> /// <param name="numGroups">The number of virtually launched groups.</param> /// <param name="paddedLength">The padded length of the input view.</param> /// <param name="shift">The bit shift to use.</param> internal static void RadixSortKernel1 <T, TOperation, TSpecialization>( ArrayView <T> view, ArrayView <int> counter, SpecializedValue <int> groupSize, int numGroups, int paddedLength, int shift) where T : unmanaged where TOperation : struct, IRadixSortOperation <T> where TSpecialization : struct, IRadixSortSpecialization { TSpecialization specialization = default; var scanMemory = SharedMemory.Allocate <int>( groupSize * specialization.UnrollFactor); int gridIdx = Grid.IdxX; for ( int i = Grid.GlobalIndex.X; i < paddedLength; i += GridExtensions.GridStrideLoopStride) { bool inRange = i < view.Length; // Read value from global memory TOperation operation = default; T value = operation.DefaultValue; if (inRange) { value = view[i]; } var bits = operation.ExtractRadixBits( value, shift, specialization.UnrollFactor - 1); for (int j = 0; j < specialization.UnrollFactor; ++j) { scanMemory[Group.IdxX + groupSize * j] = 0; } if (inRange) { scanMemory[Group.IdxX + groupSize * bits] = 1; } Group.Barrier(); for (int j = 0; j < specialization.UnrollFactor; ++j) { var address = Group.IdxX + groupSize * j; scanMemory[address] = GroupExtensions.ExclusiveScan <int, AddInt32>(scanMemory[address]); } Group.Barrier(); if (Group.IdxX == Group.DimX - 1) { // Write counters to global memory for (int j = 0; j < specialization.UnrollFactor; 
++j) { ref var newOffset = ref scanMemory[Group.IdxX + groupSize * j]; newOffset += Utilities.Select(inRange & j == bits, 1, 0); counter[j * numGroups + gridIdx] = newOffset; } } Group.Barrier(); var gridSize = gridIdx * Group.DimX; Index1 pos = gridSize + scanMemory[Group.IdxX + groupSize * bits] - Utilities.Select(inRange & Group.IdxX == Group.DimX - 1, 1, 0); for (int j = 1; j <= bits; ++j) { pos += scanMemory[groupSize * j - 1] + Utilities.Select(j - 1 == bits, 1, 0); } // Pre-sort the current value into the corresponding segment if (inRange) { view[pos] = value; } Group.Barrier(); gridIdx += Grid.DimX; }
/// <summary>
/// Test kernel that, for every sample of the grid's line, loads the 1440-entry
/// horizon of that sample into shared memory, raises entries with
/// <c>Atomic.Max</c> using slopes computed from the height data, and writes
/// the horizon back.
/// </summary>
/// <param name="index">The grouped index; the grid index selects the line, the group index the sample.</param>
/// <param name="heights">The height values of the patch.</param>
/// <param name="matrices">Twelve transform coefficients per (line, sample) cell.</param>
/// <param name="horizon">The horizon buffers, 1440 entries per (line, sample) cell; updated in place.</param>
/// <param name="a_line">The line offset added to the grid line.</param>
/// <param name="a_sample">The sample offset added to the group index.</param>
static void Test3Kernel(
    GroupedIndex index,
    ArrayView<short> heights,
    ArrayView<float> matrices,
    ArrayView<float> horizon,
    int a_line,
    int a_sample)
{
    const int PatchSize = 128;

    var thread = index.GroupIdx;
    var aline = index.GridIdx;
    ArrayView<float> sharedHorizon = SharedMemory.Allocate<float>(1440);

    // The shared buffer is copied cooperatively: every thread handles one
    // element per pass.
    var dim = Group.Dimension.X;
    var len = sharedHorizon.Length;
    var passes = (len + (dim - 1)) / dim;

    for (var asample = 0; asample < PatchSize; asample++)
    {
        var cell = aline * PatchSize + asample;
        var horizonOffset = cell * len;

        // Load the horizon of the current cell.
        for (var pass = 0; pass < passes; pass++)
        {
            var element = pass * dim + thread;
            if (element < len) // threads past the end skip this pass
            {
                sharedHorizon[element] = horizon[element + horizonOffset];
            }
        }
        Group.Barrier();

        // Fetch the twelve matrix coefficients of the current cell.
        var pos = cell * 12;
        var row0x = matrices[pos];
        var row1x = matrices[pos + 1];
        var row2x = matrices[pos + 2];
        var row3x = matrices[pos + 3];
        var row0y = matrices[pos + 4];
        var row1y = matrices[pos + 5];
        var row2y = matrices[pos + 6];
        var row3y = matrices[pos + 7];
        var row0z = matrices[pos + 8];
        var row1z = matrices[pos + 9];
        var row2z = matrices[pos + 10];
        var row3z = matrices[pos + 11];

        // NOTE(review): nothing in this loop body reads oline, so every
        // iteration repeats the same computation - confirm this is intended.
        for (var oline = 0; oline < PatchSize; oline++)
        {
            // The sample coordinate is the thread index.
            var relz = 0.5d * heights[aline * PatchSize + thread];
            var radius = MoonRadius + relz / 1000d;

            var line = a_line + aline;
            var sample = a_sample + thread;

            var mapX = (sample - S0) * Scale;
            var mapY = (L0 - line) * Scale;
            var P = Math.Sqrt(mapX * mapX + mapY * mapY);
            var C = 2d * Math.Atan2(P, 2 * MoonRadius);
            var latitude = Math.Asin(Math.Cos(C) * Math.Sin(LatP) + mapY * Math.Sin(C) * Math.Cos(LatP) / P);
            var longitude = LonP + Math.Atan2(mapX, mapY * LonFactor);

            // Degree values; not consumed by the code below.
            var latdeg = latitude * 180d / Math.PI;
            var londeg = longitude * 180d / Math.PI;

            // Point in the ME frame.
            var zMe = radius * Math.Sin(latitude);
            var c = radius * Math.Cos(latitude);
            var xMe = c * Math.Cos(longitude);
            var yMe = c * Math.Sin(longitude);

            // Transform the point to the local frame.
            var x = xMe * row0x + yMe * row1x + zMe * row2x + row3x;
            var y = xMe * row0y + yMe * row1y + zMe * row2y + row3y;
            var z = xMe * row0z + yMe * row1z + zMe * row2z + row3z;

            var azimuth = Math.Atan2(y, x) + Math.PI; // [0, 2 PI]
            var alen = Math.Sqrt(x * x + y * y);
            var slope = z / alen;
            var slopef = (float)slope;

            var horizonIndex = (int)(0.5d + 1439 * (azimuth / (2d * Math.PI)));
            Atomic.Max(ref sharedHorizon[horizonIndex], slopef);
        }
        Group.Barrier();

        // Store the updated horizon of the current cell.
        for (var pass = 0; pass < passes; pass++)
        {
            var element = pass * dim + thread;
            if (element < len) // threads past the end skip this pass
            {
                horizon[element + horizonOffset] = sharedHorizon[element];
            }
        }
    }
}
/// <summary>
/// Updates the horizon of one target cell with the slopes towards one line of
/// caster points. The target cell is selected by the grid index, the caster
/// line by the Y component of the group index. The target's 1440-entry
/// horizon is loaded into shared memory, raised with <c>Atomic.Max</c> for
/// every caster sample of the line and written back.
/// </summary>
/// <param name="index">The grouped index (grid = target cell, group Y = caster line).</param>
/// <param name="points">The caster points, three floats (x, y, z) per point.</param>
/// <param name="matrices">Twelve transform coefficients per target cell.</param>
/// <param name="horizon">The horizon buffers, 1440 integer slopes per target cell; updated in place.</param>
/// <param name="test_array">Receives diagnostic values for the very first target/caster combination.</param>
/// <param name="observer_height_in_km">The height that is subtracted from the transformed z coordinate.</param>
static void FarFieldKernel1(
    GroupedIndex2 index,
    ArrayView<float> points,
    ArrayView<float> matrices,
    ArrayView<int> horizon,
    ArrayView<float> test_array,
    float observer_height_in_km)
{
    var target_line = index.GridIdx.Y;
    var target_sample = index.GridIdx.X;
    var caster_line = index.GroupIdx.Y;

    // The kernel only distributes work over the Y dimension of the group, so
    // the group has to be exactly one thread wide in X. (The previous check
    // compared the thread index GroupIdx.X against 1, which cannot hold in a
    // one-wide group, where that index is 0.)
    Debug.Assert(Group.Dimension.X == 1);

    ArrayView<int> horizon_shared = SharedMemory.Allocate<int>(1440);

    // Copy the horizon of the target cell into shared memory, one element per
    // thread and pass.
    {
        var dim = Group.Dimension.Y;
        var len = horizon_shared.Length;
        var passes = (len + (dim - 1)) / dim;
        var offset = (target_line * TerrainPatch.DefaultSize + target_sample) * len;
        for (var pass = 0; pass < passes; pass++)
        {
            var ptr = pass * dim + caster_line;
            if (ptr < len) // divergence
            {
                horizon_shared[ptr] = horizon[ptr + offset];
            }
        }
    }

    Group.Barrier();

    // Fetch the twelve matrix coefficients of the target cell.
    var pos = (target_line * TerrainPatch.DefaultSize + target_sample) * 12;
    var row0x = matrices[pos++];
    var row1x = matrices[pos++];
    var row2x = matrices[pos++];
    var row3x = matrices[pos++];
    var row0y = matrices[pos++];
    var row1y = matrices[pos++];
    var row2y = matrices[pos++];
    var row3y = matrices[pos++];
    var row0z = matrices[pos++];
    var row1z = matrices[pos++];
    var row2z = matrices[pos++];
    var row3z = matrices[pos];

    for (var caster_sample = 0; caster_sample < TerrainPatch.DefaultSize; caster_sample++)
    {
        // Fetch the caster point.
        var points_offset = (caster_line * TerrainPatch.DefaultSize + caster_sample) * 3;
        var x_patch = points[points_offset];
        var y_patch = points[points_offset + 1];
        var z_patch = points[points_offset + 2];

        // Transform the point to the local frame.
        var x = x_patch * row0x + y_patch * row1x + z_patch * row2x + row3x;
        var y = x_patch * row0y + y_patch * row1y + z_patch * row2y + row3y;
        var z = x_patch * row0z + y_patch * row1z + z_patch * row2z + row3z;

        // Adjust for solar array height.
        z -= observer_height_in_km;

        var azimuth = Atan2(y, x) + XMath.PI; // [0, 2 PI]
        var alen = XMath.Sqrt(x * x + y * y);
        var slope = z / alen;

        // Clamp the slope to [-2, 2], scale it to [-0.5, 0.5] and convert it to
        // a fixed-point integer so that the integer Atomic.Max can be used.
        var slopem = slope > 2f ? 2f : slope;
        slopem = slopem < -2f ? -2f : slopem;
        slopem = slopem / 4f;
        var slopei = (int)(slopem * 1000000);

        var horizon_index = (int)(0.5f + 1439 * (azimuth / (2f * XMath.PI)));
        Atomic.Max(ref horizon_shared[horizon_index], slopei);

        // Diagnostics for the very first target/caster combination.
        if (caster_sample == 0 && caster_line == 0 && target_line == 0 && target_sample == 0)
        {
            test_array[0] = x_patch;
            test_array[1] = y_patch;
            test_array[2] = z_patch;
            test_array[3] = x;
            test_array[4] = y;
            test_array[5] = z;
            test_array[6] = slope;
            test_array[7] = slopem;
            test_array[8] = slopei;
            test_array[9] = row3x;
            test_array[10] = row3y;
            test_array[11] = row3z;
        }
    }

    Group.Barrier();

    // Write the updated horizon back.
    {
        var dim = Group.Dimension.Y;
        var len = horizon_shared.Length;
        var passes = (len + (dim - 1)) / dim;
        var offset = (target_line * TerrainPatch.DefaultSize + target_sample) * len;
        for (var pass = 0; pass < passes; pass++)
        {
            var ptr = pass * dim + caster_line;
            if (ptr < len) // divergence
            {
                horizon[ptr + offset] = horizon_shared[ptr];
            }
        }
    }
}