/// <summary>
/// Computes, for one tile of point pairs, the sum of squared coordinate differences,
/// staging the coordinates in dynamic shared memory and storing two adjacent results
/// per thread as a single <c>IlReal2</c>.
/// </summary>
/// <remarks>
/// Performs the same computation as <c>IlGpuKernelFloat2</c>, except that the number of
/// coordinates is supplied as a meta-constant (<c>SpecializedValue</c>) and each pair of
/// results is written through an <c>IlReal2</c> view of the output.
/// NOTE(review): the paired loads/stores and the <c>Group.DimX / 2</c> arithmetic appear to
/// require an even group size and an even <paramref name="n"/> - confirm against the launcher.
/// </remarks>
/// <param name="mSquaredDistances">Output view; element [j, i] receives the result for points j and i.</param>
/// <param name="mCoordinates">Input coordinates, read at index k * n + p for coordinate k of point p.</param>
/// <param name="c">Number of coordinates per point.</param>
/// <param name="n">Number of points.</param>
private static void IlGpuKernelConstants(
    ArrayView2D<Real> mSquaredDistances,
    ArrayView<Real> mCoordinates,
    SpecializedValue<int> c,
    int n)
{
    // The dynamic shared memory is split in two halves of c * Group.DimX elements:
    // the first holds the I points of this tile, the second the J points.
    var shared = SharedMemory.GetDynamic<Real>();
    var coordinatesI = shared.GetSubView(0, c * Group.DimX);
    var coordinatesJ = shared.GetSubView(c * Group.DimX);

    var bI = Grid.IdxY * Group.DimX;
    var bJ = Grid.IdxX * Group.DimX;

    // Every thread copies the coordinates of one I point and one J point (if they exist).
    for (int k = 0; k != c; ++k)
    {
        if (bI + Group.IdxX < n)
        {
            coordinatesI[k * Group.DimX + Group.IdxX] = mCoordinates[k * n + bI + Group.IdxX];
        }

        if (bJ + Group.IdxX < n)
        {
            coordinatesJ[k * Group.DimX + Group.IdxX] = mCoordinates[k * n + bJ + Group.IdxX];
        }
    }

    // All coordinates must be staged before any thread starts reading them.
    Group.Barrier();

    // The group is split in two halves: 'line' (0 or 1) selects the starting row and
    // 'tid' selects the pair of adjacent J points handled by this thread.
    var line = Group.IdxX / (Group.DimX / 2);
    var tid = Group.IdxX % (Group.DimX / 2);

    if (bJ + tid * 2 < n)
    {
        var coordinatesJ2 = coordinatesJ.Cast<IlReal2>();

        // The reinterpreted output view does not change between iterations, so build it once.
        var dst = mSquaredDistances.Cast<IlReal2>();

        // Each half of the group processes every second row.
        for (int i = line; i < Group.DimX && bI + i < n; i += 2)
        {
            var dist = default(IlReal2);

            for (int k = 0; k != c; ++k)
            {
                var coord1 = coordinatesI[k * Group.DimX + i];
                var coord2 = coordinatesJ2[(k * Group.DimX / 2) + tid];
                var diff = new IlReal2(coord1 - coord2.X, coord1 - coord2.Y);

                dist += diff * diff;
            }

            // One IlReal2 store covers the two adjacent J points.
            dst[bJ / 2 + tid, bI + i] = dist;
        }
    }
}
/// <summary>
/// Computes, for one tile of point pairs, the sum of squared coordinate differences,
/// staging the coordinates in dynamic shared memory. Each thread handles two adjacent
/// J points at once by reading their coordinates as a single <c>IlReal2</c>.
/// </summary>
/// <remarks>
/// NOTE(review): the <c>Group.DimX / 2</c> arithmetic and the <c>IlReal2</c> reinterpretation
/// of the shared memory appear to require an even group size - confirm against the launcher.
/// </remarks>
/// <param name="mSquaredDistances">Output view; element [j, i] receives the result for points j and i.</param>
/// <param name="mCoordinates">Input coordinates, read at index k * n + p for coordinate k of point p.</param>
/// <param name="c">Number of coordinates per point.</param>
/// <param name="n">Number of points.</param>
private static void IlGpuKernelFloat2(
    ArrayView2D<Real> mSquaredDistances,
    ArrayView<Real> mCoordinates,
    int c,
    int n)
{
    // The dynamic shared memory is split in two halves of c * Group.DimX elements:
    // the first holds the I points of this tile, the second the J points.
    var shared = SharedMemory.GetDynamic<Real>();
    var coordinatesI = shared.GetSubView(0, c * Group.DimX);
    var coordinatesJ = shared.GetSubView(c * Group.DimX);

    var bI = Grid.IdxY * Group.DimX;
    var bJ = Grid.IdxX * Group.DimX;

    // Every thread copies the coordinates of one I point and one J point (if they exist).
    for (int k = 0; k != c; ++k)
    {
        if (bI + Group.IdxX < n)
        {
            coordinatesI[k * Group.DimX + Group.IdxX] = mCoordinates[k * n + bI + Group.IdxX];
        }

        if (bJ + Group.IdxX < n)
        {
            coordinatesJ[k * Group.DimX + Group.IdxX] = mCoordinates[k * n + bJ + Group.IdxX];
        }
    }

    // All coordinates must be staged before any thread starts reading them.
    Group.Barrier();

    // The group is split in two halves: 'line' (0 or 1) selects the starting row and
    // 'tid' selects the pair of adjacent J points handled by this thread.
    var line = Group.IdxX / (Group.DimX / 2);
    var tid = Group.IdxX % (Group.DimX / 2);

    // First of the two J points handled by this thread.
    var j = bJ + 2 * tid;

    if (j < n)
    {
        var coordinatesJ2 = coordinatesJ.Cast<IlReal2>();

        // Only the first J point is known to exist; the second one is missing when it
        // would be point number n. Its shared memory slot was never populated, so the
        // Y component is meaningless in that case and must not be stored.
        var hasSecond = j + 1 < n;

        // Each half of the group processes every second row.
        for (int i = line; i < Group.DimX && bI + i < n; i += 2)
        {
            var dist = default(IlReal2);

            for (int k = 0; k != c; ++k)
            {
                var coord1 = coordinatesI[k * Group.DimX + i];
                var coord2 = coordinatesJ2[(k * Group.DimX / 2) + tid];
                var diff = new IlReal2(coord1 - coord2.X, coord1 - coord2.Y);

                dist += diff * diff;
            }

            mSquaredDistances[j, bI + i] = dist.X;

            if (hasSecond)
            {
                mSquaredDistances[j + 1, bI + i] = dist.Y;
            }
        }
    }
}
/// <summary>
/// Round-trips the calling thread's group index through dynamic shared memory, viewed as
/// <c>short</c> elements, and stores the value in <paramref name="view"/>.
/// </summary>
/// <param name="globalIndex">Index of the element of <paramref name="view"/> to write.</param>
/// <param name="view">Destination view.</param>
private static void DynamicSharedMemoryHelper(int globalIndex, ArrayView<short> view)
{
    var slot = Group.IdxX;

    // Request the dynamically allocated shared memory with a custom element type.
    var sharedShorts = SharedMemory.GetDynamic<short>();

    // Each thread stores its own index into its own slot ...
    sharedShorts[slot] = (short)slot;

    // ... and waits for the rest of the group before reading it back.
    Group.Barrier();

    view[globalIndex] = sharedShorts[slot];
}
/// <summary>
/// The kernel: round-trips each thread's group index through dynamic shared memory into
/// <paramref name="view1"/>, then calls a helper that does the same with a <c>short</c>
/// typed view into <paramref name="view2"/>.
/// </summary>
/// <param name="view1">Destination for the values read back through the <c>int</c> view.</param>
/// <param name="view2">Destination passed on to <c>DynamicSharedMemoryHelper</c>.</param>
public static void SharedMemKernel(ArrayView<int> view1, ArrayView<short> view2)
{
    var slot = Group.IdxX;
    var target = Grid.GlobalIndex.X;

    // A statically known amount of shared memory could be requested with
    // SharedMemory.Allocate<int>(1024) instead; this kernel uses the dynamic variant.
    var sharedInts = SharedMemory.GetDynamic<int>();

    // Write, synchronize the group, then read the value back.
    sharedInts[slot] = slot;
    Group.Barrier();
    view1[target] = sharedInts[slot];

    // The helper requests dynamic shared memory again, this time as short elements.
    // NOTE(review): if both dynamic views alias the same memory, the helper's writes may
    // race with the reads above in other threads of the group - confirm whether a barrier
    // is needed before this call.
    DynamicSharedMemoryHelper(target, view2);
}
/// <summary>
/// Computes, for one tile of point pairs, the sum of squared coordinate differences,
/// staging the coordinates of both point sets in dynamic shared memory first.
/// </summary>
/// <remarks>
/// The shared memory holds two vectors of c * blockDim.x elements each: one with the
/// coordinates of the tile's I points and one with those of its J points. Once both are
/// filled, the block computes the distances between all I points and all J points,
/// producing a square [B, B] tile of the result. This way the coordinates of the I and J
/// points are read from <paramref name="mCoordinates"/> only once per tile.
/// This optimisation works well if the number of coordinates is small enough; otherwise
/// the shared memory is too small and not enough blocks get scheduled per SM.
/// </remarks>
/// <param name="mSquaredDistances">Output view; element [j, i] receives the result for points j and i.</param>
/// <param name="mCoordinates">Input coordinates, read at index k * n + p for coordinate k of point p.</param>
/// <param name="c">Number of coordinates per point.</param>
/// <param name="n">Number of points.</param>
private static void IlGpuKernelSharedMemory(
    ArrayView2D<Real> mSquaredDistances,
    ArrayView<Real> mCoordinates,
    int c,
    int n)
{
    var blockSize = Group.DimX;
    var local = Group.IdxX;

    // First half of the scratch area: I points. Remainder: J points.
    var scratch = SharedMemory.GetDynamic<Real>();
    var tileI = scratch.GetSubView(0, c * blockSize);
    var tileJ = scratch.GetSubView(c * blockSize);

    var firstI = Grid.IdxY * blockSize;
    var firstJ = Grid.IdxX * blockSize;

    // The I and J point this thread is responsible for loading.
    var pointI = firstI + local;
    var pointJ = firstJ + local;
    var hasPointI = pointI < n;
    var hasPointJ = pointJ < n;

    for (int dim = 0; dim != c; ++dim)
    {
        var slot = dim * blockSize + local;

        if (hasPointI)
        {
            tileI[slot] = mCoordinates[dim * n + pointI];
        }

        if (hasPointJ)
        {
            tileJ[slot] = mCoordinates[dim * n + pointJ];
        }
    }

    // Both tiles must be complete before anyone reads from them.
    Group.Barrier();

    // Threads without a J point have nothing to compute.
    if (!hasPointJ)
    {
        return;
    }

    // This thread's J point against every I point of the tile.
    for (int row = 0; row < blockSize && firstI + row < n; ++row)
    {
        Real sum = 0;

        for (int dim = 0; dim != c; ++dim)
        {
            var delta = tileI[dim * blockSize + row] - tileJ[dim * blockSize + local];
            sum += delta * delta;
        }

        mSquaredDistances[pointJ, firstI + row] = sum;
    }
}