// ReSharper disable once SuggestBaseTypeForParameter private static void KernelSequentialReduceIdleThreads <T>(deviceptr <T> array, int length, T[] result, Func <T, T, T> op) { var shared = __shared__.ExternArray <T>(); var tid = threadIdx.x; var bid = blockIdx.x; var gid = 2 * blockDim.x * bid + tid; shared[tid] = (gid < length && gid + blockDim.x < length) ? op(array[gid], array[gid + blockDim.x]) : array[gid]; DeviceFunction.SyncThreads(); for (int s = blockDim.x / 2; s > 0; s >>= 1) { if (tid < s && gid + s < length) { shared[tid] = op(shared[tid], shared[tid + s]); } DeviceFunction.SyncThreads(); } if (tid == 0) { result[bid] = shared[0]; } }
// ReSharper disable once SuggestBaseTypeForParameter private static void KernelInterleavedAccess <T>(deviceptr <T> array, int length, T[] result, Func <T, T, T> op) { var shared = __shared__.ExternArray <T>(); var tid = threadIdx.x; var bid = blockIdx.x; var gid = blockDim.x * bid + tid; if (gid < length) { shared[tid] = array[gid]; } DeviceFunction.SyncThreads(); for (var s = 1; s < blockDim.x; s *= 2) { if (tid % (2 * s) == 0 && gid + s < length) { shared[tid] = op(shared[tid], shared[tid + s]); } DeviceFunction.SyncThreads(); } if (tid == 0) { result[bid] = shared[0]; } }
public void ComputeValue(deviceptr <double> results, deviceptr <double> points, int numSims) { // Determine thread ID var bid = blockIdx.x; var tid = blockIdx.x * blockDim.x + threadIdx.x; var step = gridDim.x * blockDim.x; // Shift the input/output pointers var pointx = points + tid; var pointy = pointx + numSims; var pointsInside = 0; for (var i = tid; i < numSims; i += step, pointx += step, pointy += step) { var x = pointx[0]; var y = pointy[0]; var l2norm2 = x * x + y * y; if (l2norm2 < 1.0) { pointsInside++; } } // Reduce within the block pointsInside = ReduceSum(pointsInside); // Store the result if (threadIdx.x == 0) { results[bid] = pointsInside; } }
private static void AleaKernel( deviceptr <Real> mSquaredDistances, deviceptr <Real> mCoordinates, int c, int n) { var j = blockIdx.x * blockDim.x + threadIdx.x; if (j < n) { for (int i = 0; i < n; ++i) { Real dist = 0; for (int k = 0; k != c; ++k) { var coord1 = mCoordinates[k * n + i]; var coord2 = mCoordinates[k * n + j]; var diff = coord1 - coord2; dist += diff * diff; } mSquaredDistances[i * n + j] = dist; } } }
// Link: https://mail.google.com/mail/u/0/#inbox/1598d0b3b2850009?projector=1 // I'm sure memory management is far from optimal! // Fixed Block and Thread! internal static T ComputeGpu5 <T>(T[] array, Func <T, T, T> op) { const int dimGrid = 256; const int blockDim = 256; var gpu = Gpu.Default; var inputLength = array.Length; var inputMemory = gpu.ArrayGetMemory(array, true, false); var inputDevPtr = new deviceptr <T>(inputMemory.Handle); var resultMemory = gpu.AllocateDevice <T>(dimGrid); var resultDevPtr = new deviceptr <T>(resultMemory.Handle); gpu.Launch(() => KernelSequentialReduceIdleThreadsWarpMultiple(inputDevPtr, inputLength, resultDevPtr, op), new LaunchParam(dimGrid, blockDim)); inputDevPtr = resultDevPtr; resultMemory = gpu.AllocateDevice <T>(dimGrid); resultDevPtr = new deviceptr <T>(resultMemory.Handle); gpu.Launch(() => KernelSequentialReduceIdleThreadsWarpMultiple(inputDevPtr, dimGrid, resultDevPtr, op), new LaunchParam(1, blockDim)); return(Gpu.CopyToHost(resultMemory)[0]); }
// Alea Parallel.For! internal static Image Render1(Bounds bounds) { bounds.AdjustAspectRatio(); var width = bounds.Width; var height = bounds.Height; var scale = (bounds.XMax - bounds.XMin) / width; var resultLength = ColorComponents * width * height; var resultMemory = Gpu.Default.AllocateDevice <byte>(resultLength); var resultDevPtr = new deviceptr <byte>(resultMemory.Handle); Gpu.Default.For(0, width * height, i => { var x = i % width; var y = i / width; var offset = ColorComponents * i; if (offset < resultLength) { // ReSharper disable once PossibleLossOfFraction var c = new Complex { Real = bounds.XMin + x * scale, Imaginary = bounds.YMin + y * scale, }; ComputeMandelbrotAtOffset(resultDevPtr, c, offset); } }); return(BitmapUtility.FromByteArray(Gpu.CopyToHost(resultMemory), width, height)); }
static void Filter_Symmetric_Kernel(int width, int height, deviceptr <float> inputImg, deviceptr <float> outputImg, deviceptr <float> filter, int filterSize) { var x = blockIdx.x * blockDim.x + threadIdx.x; var y = blockIdx.y * blockDim.y + threadIdx.y; if (x < 0 || x >= width || y < 0 || y >= height) { return; } int filterWidth = 2 * filterSize + 1; float sum = 0; int xx, yy; for (int i = -filterSize; i <= filterSize; i++) { for (int j = -filterSize; j <= filterSize; j++) { xx = x + j; yy = y + i; if (xx >= 0 && xx < width && yy >= 0 && yy < height) { sum += filter[(i + filterSize) * filterWidth + (j + filterSize)] * inputImg[yy * width + xx]; } } } outputImg[y * width + x] = sum; }
//[/GLdescription] //[LockPositions] void LockPos(Del f) { CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsResourceSetMapFlags(_resources[0], (uint) CUDAInterop.CUgraphicsMapResourceFlags_enum.CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY)); CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsResourceSetMapFlags(_resources[1], (uint) CUDAInterop.CUgraphicsMapResourceFlags_enum.CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD)); CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsMapResourcesEx(2u, _resources, IntPtr.Zero)); var bytes = IntPtr.Zero; var handle0 = IntPtr.Zero; var handle1 = IntPtr.Zero; unsafe { CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsResourceGetMappedPointer(&handle0, &bytes, _resources[0])); CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsResourceGetMappedPointer(&handle1, &bytes, _resources[1])); } var pos0 = new deviceptr<float4>(handle0); var pos1 = new deviceptr<float4>(handle1); try { f(pos0, pos1); } finally { CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsUnmapResourcesEx(2u, _resources, IntPtr.Zero)); } }
//[/GLdescription] //[LockPositions] void LockPos(Del f) { CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsResourceSetMapFlags(_resources[0], (uint)CUDAInterop.CUgraphicsMapResourceFlags_enum.CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY)); CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsResourceSetMapFlags(_resources[1], (uint)CUDAInterop.CUgraphicsMapResourceFlags_enum.CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD)); CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsMapResourcesEx(2u, _resources, IntPtr.Zero)); var bytes = IntPtr.Zero; var handle0 = IntPtr.Zero; var handle1 = IntPtr.Zero; unsafe { CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsResourceGetMappedPointer(&handle0, &bytes, _resources[0])); CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsResourceGetMappedPointer(&handle1, &bytes, _resources[1])); } var pos0 = new deviceptr <float4>(handle0); var pos1 = new deviceptr <float4>(handle1); try { f(pos0, pos1); } finally { CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsUnmapResourcesEx(2u, _resources, IntPtr.Zero)); } }
private AdaptiveLBP(Size size) { this.size = size; // initialize data structures to avoid reallocating with every call hist = new int[numSubuniformPatterns * numVarBins]; hist2 = new int[numSubuniformPatterns * numVarBins]; lbpImageGPU = worker.Malloc <short>(size.Width * size.Height); varImageGPU = worker.Malloc <short>(size.Width * size.Height); histGPU = worker.Malloc <int>(hist.Length); floatImageGPU = worker.Malloc <float>(size.Width * size.Height); // precompute the subuniform bin for each LBP pattern, and push it to the GPU subuniformBins = new short[(short)Math.Pow(2, numNeighbors)]; for (int i = 0; i < subuniformBins.Length; i++) { short bin = GetPatternNum(i); subuniformBins[i] = bin; } subuniformBinsGPU = worker.Malloc(subuniformBins); neighborCoordinateX = new float[numNeighbors]; neighborCoordinateY = new float[numNeighbors]; for (int i = 0; i < numNeighbors; i++) { float xx = (float)Math.Cos(2.0 * PI * (double)i / (double)numNeighbors); float yy = (float)Math.Sin(2.0 * PI * (double)i / (double)numNeighbors); neighborCoordinateX[i] = xx; neighborCoordinateY[i] = yy; } neighborCoordinateXGPU = worker.Malloc(neighborCoordinateX); neighborCoordinateYGPU = worker.Malloc(neighborCoordinateY); varBinsGPU = worker.Malloc(varBins); // initialize CUDA parameters var blockDims = new dim3(8, 8); var gridDims = new dim3(Common.divup(size.Width, blockDims.x), Common.divup(size.Height, blockDims.y)); lp = new LaunchParam(gridDims, blockDims); // create filters for (int i = 0; i < numScales; i++) { float[,] filter = LaplacianOfGaussian.Generate(i + 1); filters[i] = Utils.Flatten(filter); filtersGPU[i] = worker.Malloc(filters[i]); filterSizes[i] = (filter.GetLength(0) - 1) / 2; } // allocate space for scale space images deviceptr <float>[] tempPointers = new deviceptr <float> [numScales]; for (int i = 0; i < numScales; i++) { scaledImages[i] = worker.Malloc <float>(size.Width * size.Height); tempPointers[i] = scaledImages[i].Ptr; } scaledImagePointers = worker.Malloc(tempPointers); pixelScaleImage = worker.Malloc <short>(size.Width * size.Height); }
private void TraceKernel(int index, deviceptr <ColorRaw> image, int width) { var x = index % width; var y = index / width; var color = ColorRaw.FromRgb(0xed, 0x95, 0x64); var maxDepth = float.MinValue; for (var i = 0; i < _spheres.Length; i++) { // Todo: Get rid of 'n' until we have a way to properly shade! float n; var sphere = _spheres[i]; var depth = sphere.GetIntersection(x, y, out n); if (depth > maxDepth) { color = ColorRaw.FromRgb( (byte)(sphere.Color.R * n), (byte)(sphere.Color.G * n), (byte)(sphere.Color.B * n)); maxDepth = depth; } } image[index] = color; }
// Alea Parallel.For! internal static Image Render1(Bitmap image, ConvolutionFilter filter) { var gpu = Gpu.Default; var width = image.Width; var array = BitmapUtility.ToColorArray(image); var mFilter = filter.Filter; var mFactor = filter.Factor; var mOffset = filter.Offset; var inputMemory = gpu.ArrayGetMemory(array, true, false); var inputDevPtr = new deviceptr <ColorRaw>(inputMemory.Handle); var resultLength = array.Length; var resultMemory = Gpu.Default.AllocateDevice <ColorRaw>(resultLength); var resultDevPtr = new deviceptr <ColorRaw>(resultMemory.Handle); gpu.For(0, resultLength, i => { if (i < resultLength) { ComputeEdgeDetectFilter0AtOffsetNapron(inputDevPtr, resultDevPtr, resultLength, mFilter, mFactor, mOffset, i, width); } }); return(BitmapUtility.FromColorArray(Gpu.CopyToHost(resultMemory), image.Width, image.Height)); }
// Fixed Block Size! internal static Image Render3(Bitmap image, ConvolutionFilter filter) { var gpu = Gpu.Default; var width = image.Width; var array = BitmapUtility.ToColorArray(image); var mFilter = filter.Filter; var mFactor = filter.Factor; var mOffset = filter.Offset; var inputMemory = gpu.ArrayGetMemory(array, true, false); var inputDevPtr = new deviceptr <ColorRaw>(inputMemory.Handle); var resultLength = array.Length; var resultMemory = Gpu.Default.AllocateDevice <ColorRaw>(resultLength); var resultDevPtr = new deviceptr <ColorRaw>(resultMemory.Handle); var lp = new LaunchParam(256, 256); gpu.Launch(() => { var i = blockDim.x * blockIdx.x + threadIdx.x; while (i < resultLength) { ComputeEdgeDetectFilter0AtOffsetNapron(inputDevPtr, resultDevPtr, resultLength, mFilter, mFactor, mOffset, i, width); i += blockDim.x * gridDim.x; } }, lp); return(BitmapUtility.FromColorArray(Gpu.CopyToHost(resultMemory), image.Width, image.Height)); }
public void Upsweep(deviceptr <T> dValues, deviceptr <int> dRanges, deviceptr <T> dRangeTotals) { // Each block is processing a range. var range = blockIdx.x; var tid = threadIdx.x; var rangeX = dRanges[range]; var rangeY = dRanges[range + 1]; // Loop through all elements in the interval, adding up values. // There is no need to synchronize until we perform the multi-reduce. var reduced = _initFunc(); var index = rangeX + tid; while (index < rangeY) { reduced = _reductionOp.Invoke(reduced, _transform.Invoke(dValues[index])); index += _numThreads; } // Get the total. var total = _multiReduce.Invoke(tid, reduced); if (tid == 0) { dRangeTotals[range] = total; } }
public void Mult(int wA, int wB, int hC, deviceptr<double> A, deviceptr<double> B, deviceptr<double> C) { var block = new dim3(BlockSize, BlockSize); var grid = new dim3(wB/block.x, hC/block.y); var lp = new LaunchParam(grid, block); GPULaunch(Kernel, lp, wA, wB, A, B, C); }
// Custom! internal static Image Render2(Bounds bounds) { bounds.AdjustAspectRatio(); var width = bounds.Width; var height = bounds.Height; var scale = (bounds.XMax - bounds.XMin) / width; var resultLength = ColorComponents * width * height; var resultMemory = Gpu.Default.AllocateDevice <byte>(resultLength); var resultDevPtr = new deviceptr <byte>(resultMemory.Handle); var lp = ComputeLaunchParameters(bounds); Gpu.Default.Launch(() => { var i = blockDim.x * blockIdx.x + threadIdx.x; var x = i % width; var y = i / width; var offset = ColorComponents * i; if (offset < resultLength) { var c = new Complex { Real = bounds.XMin + x * scale, Imaginary = bounds.YMin + y * scale, }; ComputeMandelbrotAtOffset(resultDevPtr, c, offset); } }, lp); return(BitmapUtility.FromByteArray(Gpu.CopyToHost(resultMemory), width, height)); }
public void Upsweep(deviceptr <T> dValues1, deviceptr <T> dValues2, deviceptr <int> dRanges, deviceptr <T> dRangeTotals) { var block = blockIdx.x; var tid = threadIdx.x; var rangeX = dRanges[block]; var rangeY = dRanges[block + 1]; // Loop through all elements in the interval, adding up values. // There is no need to synchronize until we perform the multireduce. var sum = default(T); var index = rangeX + tid; while (index < rangeY) { sum = _add(sum, _mult(dValues1[index], dValues2[index])); index += _plan.NumThreads; } // Get the total. var total = _multiReduce(tid, sum); if (tid == 0) { dRangeTotals[block] = total; } }
public Tensor <T> Cast <T>() { var ptr = new deviceptr <T>(Ptr); var buffer = new Buffer <T>(Device, Memory, Layout, ptr); return(new Tensor <T>(buffer)); }
public Buffer(Device device, BufferMemory memory, Layout layout, deviceptr <T> ptr) { Device = device; Memory = memory; Layout = layout; Ptr = ptr; switch (Device.Type) { case DeviceType.Gpu: _hptr = null; var dptr = ptr; RawReader = i => dptr.LongGet(i); RawWriter = (i, value) => dptr.LongSet(i, value); break; case DeviceType.Cpu: var hptr = new HostPtrAccessor <T>(memory, ptr); _hptr = hptr; RawReader = hptr.Get; RawWriter = hptr.Set; break; default: throw new ArgumentOutOfRangeException(); } }
public void IntegrateBodies(deviceptr<float4> newPos, deviceptr<float4> oldPos, deviceptr<float4> vel, int numBodies, float deltaTime, float softeningSquared, float damping, int numTiles) { var index = threadIdx.x + blockIdx.x*_blockSize; if (index >= numBodies) return; var position = oldPos[index]; var accel = ComputeBodyAccel(softeningSquared, position, oldPos, numTiles); // acceleration = force \ mass // new velocity = old velocity + acceleration*deltaTime // note we factor out the body's mass from the equation, here and in bodyBodyInteraction // (because they cancel out). Thus here force = acceleration var velocity = vel[index]; velocity.x = velocity.x + accel.x*deltaTime; velocity.y = velocity.y + accel.y*deltaTime; velocity.z = velocity.z + accel.z*deltaTime; velocity.x = velocity.x*damping; velocity.y = velocity.y*damping; velocity.z = velocity.z*damping; // new position = old position + velocity*deltaTime position.x = position.x + velocity.x*deltaTime; position.y = position.y + velocity.y*deltaTime; position.z = position.z + velocity.z*deltaTime; // store new position and velocity newPos[index] = position; vel[index] = velocity; }
/// <summary> /// Checks whether or not the Cuda features are currently supported /// </summary> public static bool IsGpuAccelerationSupported() { try { // CUDA test Gpu gpu = Gpu.Default; if (gpu == null) { return(false); } if (!Dnn.IsAvailable) { return(false); // cuDNN } using (DeviceMemory <float> sample_gpu = gpu.AllocateDevice <float>(1024)) { deviceptr <float> ptr = sample_gpu.Ptr; void Kernel(int i) => ptr[i] = i; Alea.Parallel.GpuExtension.For(gpu, 0, 1024, Kernel); // JIT test float[] sample = Gpu.CopyToHost(sample_gpu); return(Enumerable.Range(0, 1024).Select <int, float>(i => i).ToArray().ContentEquals(sample)); } } catch { // Missing .dll or other errors return(false); } }
public void IntegrateBodies(deviceptr <float4> newPos, deviceptr <float4> oldPos, deviceptr <float4> vel, int numBodies, float deltaTime, float softeningSquared, float damping, int numTiles) { var index = threadIdx.x + blockIdx.x * _blockSize; if (index >= numBodies) { return; } var position = oldPos[index]; var accel = ComputeBodyAccel(softeningSquared, position, oldPos, numTiles); // acceleration = force \ mass // new velocity = old velocity + acceleration*deltaTime // note we factor out the body's mass from the equation, here and in bodyBodyInteraction // (because they cancel out). Thus here force = acceleration var velocity = vel[index]; velocity.x = velocity.x + accel.x * deltaTime; velocity.y = velocity.y + accel.y * deltaTime; velocity.z = velocity.z + accel.z * deltaTime; velocity.x = velocity.x * damping; velocity.y = velocity.y * damping; velocity.z = velocity.z * damping; // new position = old position + velocity*deltaTime position.x = position.x + velocity.x * deltaTime; position.y = position.y + velocity.y * deltaTime; position.z = position.z + velocity.z * deltaTime; // store new position and velocity newPos[index] = position; vel[index] = velocity; }
public void ComputeValue(deviceptr<double> results, deviceptr<double> points, int numSims) { // Determine thread ID var bid = blockIdx.x; var tid = blockIdx.x * blockDim.x + threadIdx.x; var step = gridDim.x * blockDim.x; // Shift the input/output pointers var pointx = points + tid; var pointy = pointx + numSims; var pointsInside = 0; for (var i = tid; i < numSims; i += step, pointx += step, pointy += step) { var x = pointx[0]; var y = pointy[0]; var l2norm2 = x * x + y * y; if (l2norm2 < 1.0) pointsInside++; } // Reduce within the block pointsInside = ReduceSum(pointsInside); // Store the result if (threadIdx.x == 0) results[bid] = pointsInside; }
public GpuSpaceBufferContext(uint buffer, DiscreteBounds bounds) { CUDAInterop.cuGLRegisterBufferObject(buffer); _buffer = buffer; _devicePointer = new deviceptr <VoxelFace>(GetDevicePointer()); _bounds = bounds; }
public void Apply(int numSystems, int n, deviceptr <double> dl, deviceptr <double> dd, deviceptr <double> du, deviceptr <double> db, deviceptr <double> dx) { var sharedSize = 9 * n * sizeof(double); var lp = new LaunchParam(numSystems, n, sharedSize); this.GPULaunch(this.Kernel, lp, n, dl, dd, du, db, dx); }
static void Truncate_Kernel(int width, deviceptr <float> input, deviceptr <byte> output) { var x = blockIdx.x * blockDim.x + threadIdx.x; var y = blockIdx.y * blockDim.y + threadIdx.y; var p = y * width + x; output[p] = (byte)input[p]; }
static void UpdateCorrectionFactor_Kernel(int width, deviceptr <float> meanImg, deviceptr <float> correctionFactor, float mean) { var x = blockIdx.x * blockDim.x + threadIdx.x; var y = blockIdx.y * blockDim.y + threadIdx.y; var p = y * width + x; correctionFactor[p] = mean / meanImg[p]; }
static void RunningMean_Kernel(int width, deviceptr <byte> image, deviceptr <float> meanImg, float numCalibrationSamples) { var x = blockIdx.x * blockDim.x + threadIdx.x; var y = blockIdx.y * blockDim.y + threadIdx.y; var p = y * width + x; meanImg[p] = meanImg[p] * numCalibrationSamples / (numCalibrationSamples + 1) + image[p] * 1.0f / (numCalibrationSamples + 1); }
private static void AleaKernelConstants( deviceptr <Real> mSquaredDistances, deviceptr <Real> mCoordinates, Constant <int> c, int n, int pitch) { // Same as CudaKernelOptimised2, but the number of coordinates is given as a meta-constant. // Also, we write the results as float2. var shared = DeviceFunction.AddressOfArray(__shared__.ExternArray <Real>()); var coordinatesI = shared.Ptr(0); var coordinatesJ = shared.Ptr(c.Value * blockDim.x); var bI = blockIdx.y * blockDim.x; var bJ = blockIdx.x * blockDim.x; for (int k = 0; k != c.Value; ++k) { if (bI + threadIdx.x < n) { coordinatesI[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bI + threadIdx.x]; } if (bJ + threadIdx.x < n) { coordinatesJ[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bJ + threadIdx.x]; } } DeviceFunction.SyncThreads(); var line = threadIdx.x / (blockDim.x / 2); var tid = threadIdx.x % (blockDim.x / 2); if (bJ + tid * 2 < n) { var coordinatesJ2 = coordinatesJ.Reinterpret <Real2>(); for (int i = line; i < blockDim.x && bI + i < n; i += 2) { var dist = default(Real2); for (int k = 0; k != c.Value; ++k) { var coord1 = coordinatesI[k * blockDim.x + i]; var coord2 = coordinatesJ2[(k * blockDim.x / 2) + tid]; var diff = new Real2(coord1 - coord2.x, coord1 - coord2.y); dist.x += diff.x * diff.x; dist.y += diff.y * diff.y; } var dst = mSquaredDistances.Ptr((bI + i) * pitch + bJ).Reinterpret <Real2>(); dst[tid] = dist; } } }
private static void AleaKernelFloat2( deviceptr <Real> mSquaredDistances, deviceptr <Real> mCoordinates, int c, int n, int pitch) { // Same as KernelSharedMemory, but one thread does two element in one by using float2 reads. var shared = DeviceFunction.AddressOfArray(__shared__.ExternArray <Real>()); var coordinatesI = shared.Ptr(0); var coordinatesJ = shared.Ptr(c * blockDim.x); var bI = blockIdx.y * blockDim.x; var bJ = blockIdx.x * blockDim.x; for (int k = 0; k != c; ++k) { if (bI + threadIdx.x < n) { coordinatesI[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bI + threadIdx.x]; } if (bJ + threadIdx.x < n) { coordinatesJ[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bJ + threadIdx.x]; } } DeviceFunction.SyncThreads(); var line = threadIdx.x / (blockDim.x / 2); var tid = threadIdx.x % (blockDim.x / 2); if (bJ + tid * 2 < n) { var coordinatesJ2 = coordinatesJ.Reinterpret <Real2>(); for (int i = line; i < blockDim.x && bI + i < n; i += 2) { Real dist0 = 0; Real dist1 = 0; for (int k = 0; k != c; ++k) { var coord1 = coordinatesI[k * blockDim.x + i]; var coord2 = coordinatesJ2[(k * blockDim.x / 2) + tid]; var diff = new Real2(coord1 - coord2.x, coord1 - coord2.y); dist0 += diff.x * diff.x; dist1 += diff.y * diff.y; } mSquaredDistances[(bI + i) * pitch + (bJ + 2 * tid + 0)] = dist0; mSquaredDistances[(bI + i) * pitch + (bJ + 2 * tid + 1)] = dist1; } } }
static void SquareKernel(deviceptr<double> outputs, deviceptr<double> inputs, int n) { var start = blockIdx.x * blockDim.x + threadIdx.x; var stride = gridDim.x * blockDim.x; for (var i = start; i < n; i += stride) { outputs[i] = inputs[i] * inputs[i]; } }
internal Image Trace(int width, int height) { var resultMemory = Gpu.Default.AllocateDevice <ColorRaw>(width * height); var resultDevPtr = new deviceptr <ColorRaw>(resultMemory.Handle); Gpu.Default.For(0, width * height, i => TraceKernel(i, resultDevPtr, width)); return(BitmapUtility.FromColorArray(Gpu.CopyToHost(resultMemory), width, height)); }
//[/transformKernel] //[transformGPUDevice] public void Apply(int n, deviceptr <T> x, deviceptr <T> y, deviceptr <T> z) { const int blockSize = 256; var numSm = this.GPUWorker.Device.Attributes.MULTIPROCESSOR_COUNT; var gridSize = Math.Min(16 * numSm, Common.divup(n, blockSize)); var lp = new LaunchParam(gridSize, blockSize); GPULaunch(Kernel, lp, n, x, y, z); }
Public MyKernel(deviceptr<int> Data) { var start = blockIdx.x * blockDim.x + threadIdx.x; int ind = threadIdx.x; for (int i=0;i<100;i++) { //Kernel Code here } }
public void ClassifyVoxel(deviceptr <int3> d_gridIdx, deviceptr <float3> d_voxelV, deviceptr <int> d_voxelVerts, deviceptr <int> d_voxelOccupied, deviceptr <float3> d_samplePts, int sampleLength) { int blockId = blockIdx.y * gridDim.x + blockIdx.x; //block在grid中的位置 int i = blockId * blockDim.x + threadIdx.x; //线程索引 // compute 3d index in the grid int3 gridPos = calcGridPos(i, constGridSize.Value); d_gridIdx[i] = gridPos; float3 p = new float3(); p.x = constBasePoint.Value.x + gridPos.x * constVoxelSize.Value.x; p.y = constBasePoint.Value.y + gridPos.y * constVoxelSize.Value.y; p.z = constBasePoint.Value.z + gridPos.z * constVoxelSize.Value.z; // compute all vertices d_voxelV[i * 8] = p; d_voxelV[i * 8 + 1] = CreateFloat3(constVoxelSize.Value.x + p.x, 0 + p.y, 0 + p.z); d_voxelV[i * 8 + 2] = CreateFloat3(constVoxelSize.Value.x + p.x, constVoxelSize.Value.y + p.y, 0 + p.z); d_voxelV[i * 8 + 3] = CreateFloat3(0 + p.x, constVoxelSize.Value.y + p.y, 0 + p.z); d_voxelV[i * 8 + 4] = CreateFloat3(0 + p.x, 0 + p.y, constVoxelSize.Value.z + p.z); d_voxelV[i * 8 + 5] = CreateFloat3(constVoxelSize.Value.x + p.x, 0 + p.y, constVoxelSize.Value.z + p.z); d_voxelV[i * 8 + 6] = CreateFloat3(constVoxelSize.Value.x + p.x, constVoxelSize.Value.y + p.y, constVoxelSize.Value.z + p.z); d_voxelV[i * 8 + 7] = CreateFloat3(0 + p.x, constVoxelSize.Value.y + p.y, constVoxelSize.Value.z + p.z); // compute cube value of each vertex float d0 = ComputeValue(d_samplePts, d_voxelV[i * 8], sampleLength); float d1 = ComputeValue(d_samplePts, d_voxelV[i * 8 + 1], sampleLength); float d2 = ComputeValue(d_samplePts, d_voxelV[i * 8 + 2], sampleLength); float d3 = ComputeValue(d_samplePts, d_voxelV[i * 8 + 3], sampleLength); float d4 = ComputeValue(d_samplePts, d_voxelV[i * 8 + 4], sampleLength); float d5 = ComputeValue(d_samplePts, d_voxelV[i * 8 + 5], sampleLength); float d6 = ComputeValue(d_samplePts, d_voxelV[i * 8 + 6], sampleLength); float d7 = ComputeValue(d_samplePts, d_voxelV[i * 8 + 7], sampleLength); // check their status int cubeindex; cubeindex = Compact(d0, constIsovalue.Value); cubeindex += Compact(d1, constIsovalue.Value) * 2; cubeindex += Compact(d2, constIsovalue.Value) * 4; cubeindex += Compact(d3, constIsovalue.Value) * 8; cubeindex += Compact(d4, constIsovalue.Value) * 16; cubeindex += Compact(d5, constIsovalue.Value) * 32; cubeindex += Compact(d6, constIsovalue.Value) * 64; cubeindex += Compact(d7, constIsovalue.Value) * 128; //find out the number of vertices in each voxel int numVerts = verticesTable[cubeindex]; d_voxelVerts[i] = numVerts; if (numVerts > 0) { d_voxelOccupied[i] = 1; } }
//[/StaticStartKernel] //[StaticPrepareAndLaunchKernel] public void IntegrateNbodySystem(deviceptr<float4> newPos, deviceptr<float4> oldPos, deviceptr<float4> vel, int numBodies, float deltaTime, float softeningSquared, float damping) { var numBlocks = Alea.CUDA.Utilities.Common.divup(numBodies, _blockSize); var numTiles = Alea.CUDA.Utilities.Common.divup(numBodies, _blockSize); var lp = new LaunchParam(numBlocks, _blockSize); GPULaunch(IntegrateBodies, lp, newPos, oldPos, vel, numBodies, deltaTime, softeningSquared, damping, numTiles); }
static void SimpleMultiplyKernel(deviceptr<float> a, deviceptr<float> b, deviceptr<float> c, int aRows, int bCols, int aCols_bRows) { var row = blockDim.y * blockIdx.y + threadIdx.y; var col = blockDim.x * blockIdx.x + threadIdx.x; if (row >= aRows || col >= bCols) return; var sum = 0.0f; for (var k = 0; k < aCols_bRows; ++k) { sum += a[row * aCols_bRows + k] * b[k * bCols + col]; } c[row * bCols + col] = sum; }
public void Kernel(int wA, int wB, deviceptr<double> A, deviceptr<double> B, deviceptr<double> C) { var blx = blockIdx.x; var bly = blockIdx.y; var tx = threadIdx.x; var ty = threadIdx.y; // offset to first element of the first sub-matrix of A processed by the block var aBegin = wA * BlockSize * bly; // index of the last sub-matrix of A processed by the block var aEnd = aBegin + wA - 1; // step size used to iterate through the sub-matrices of A var aStep = BlockSize; // offset to first element of the first sub-matrix of B processed by the block var bBegin = BlockSize * blx; // step size used to iterate through the sub-matrices of B var bStep = BlockSize * wB; // Csub is used to store the element of the block sub-matrix that is computed by the thread var Csub = 0.0; // loop over all the sub-matrices of A and B required to compute the block sub-matrix var a = aBegin; var b = bBegin; for (; a <= aEnd; a += aStep, b += bStep) { var As = __shared__.Array2D<double>(BlockSize, BlockSize); var Bs = __shared__.Array2D<double>(BlockSize, BlockSize); // load the matrices from device memory to shared memory; each thread loads one element of each matrix As[ty, tx] = A[a + wA * ty + tx]; Bs[ty, tx] = B[b + wB * ty + tx]; Intrinsic.__syncthreads(); // multiply the two matrices together; each thread computes one element of the block sub-matrix for (var k = 0; k < BlockSize; ++k) Csub += As[ty, k] * Bs[k, tx]; Intrinsic.__syncthreads(); } // write the block sub-matrix to device memory; each thread writes one element var c = wB * BlockSize * bly + BlockSize * blx; C[c + wB * ty + tx] = Csub; }
public void Kernel(deviceptr<float4> pos, float time) { var x = blockIdx.x*blockDim.x + threadIdx.x; var y = blockIdx.y*blockDim.y + threadIdx.y; var u = ((float) x)/((float) Width); var v = ((float) y)/((float) Height); u = u*2.0f - 1.0f; v = v*2.0f - 1.0f; const float freq = 4.0f; var w = LibDevice.__nv_sinf(u*freq + time)*LibDevice.__nv_cosf(v*freq + time)*0.5f; pos[y * Width + x] = new float4(u, w, v, LibDevice.__nv_uint_as_float(0xff00ff00)); }
unsafe public void Update(IntPtr vbRes, float time) { // 1. map resource to cuda space, means lock to cuda space var vbRes1 = vbRes; CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsMapResources(1, &vbRes1, IntPtr.Zero)); // 2. get memory pointer from mapped resource var vbPtr = IntPtr.Zero; var vbSize = IntPtr.Zero; CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsResourceGetMappedPointer(&vbPtr, &vbSize, vbRes1)); // 3. create device pointer, and run the kernel var pos = new deviceptr<float4>(vbPtr); GPULaunch(Kernel, LaunchParam, pos, time); // 4. unmap resource, means unlock, so that DirectX can then use it again CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsUnmapResources(1u, &vbRes1, IntPtr.Zero)); }
//[/DynamicAOTCompile] //[DynamicComputeBodyAccel] public float3 ComputeBodyAccel(float softeningSquared, float4 bodyPos, deviceptr<float4> positions, int numTiles) { var sharedPos = __shared__.ExternArray<float4>(); var acc = new float3(0.0f, 0.0f, 0.0f); for (var tile = 0; tile < numTiles; tile++) { sharedPos[threadIdx.x] = positions[tile*blockDim.x + threadIdx.x]; Intrinsic.__syncthreads(); // This is the "tile_calculation" function from the GPUG3 article. for (var counter = 0; counter < blockDim.x; counter++) { acc = Common.BodyBodyInteraction(softeningSquared, acc, bodyPos, sharedPos[counter]); } Intrinsic.__syncthreads(); } return (acc); }
void ISimulator.Integrate(deviceptr<float4> newPos, deviceptr<float4> oldPos, deviceptr<float4> vel, int numBodies, float deltaTime, float softeningSquared, float damping) { _worker.Gather(oldPos, _hpos, FSharpOption<int>.None, FSharpOption<int>.None); _worker.Gather(vel, _hvel, FSharpOption<int>.None, FSharpOption<int>.None); CpuIntegrator.IntegrateNbodySystem(_haccel, _hpos, _hvel, _numBodies, deltaTime, softeningSquared, damping); }
public void Apply(int numSystems, int n, deviceptr<double> dl, deviceptr<double> dd, deviceptr<double> du, deviceptr<double> db, deviceptr<double> dx) { var sharedSize = 9*n*sizeof (double); var lp = new LaunchParam(numSystems, n, sharedSize); this.GPULaunch(this.Kernel, lp, n, dl, dd, du, db, dx); }
public void Kernel(int n, deviceptr<double> dl, deviceptr<double> dd, deviceptr<double> du, deviceptr<double> db, deviceptr<double> dx) { var tid = threadIdx.x; var gid = blockIdx.x * n + tid; var shared = Intrinsic.__array_to_ptr(__shared__.ExternArray<double>()); var l = shared; var d = l + n; var u = d + n; var b = u + n; l[tid] = dl[gid]; d[tid] = dd[gid]; u[tid] = du[gid]; b[tid] = db[gid]; Intrinsic.__syncthreads(); Solve(n, l, d, u, b); dx[gid] = b[tid]; }
// core solver function // n the dimension of the tridiagonal system, must fit into one block // l lower diagonal // d diagonal // u upper diagonal // h right hand side and solution at exit public static void Solve(int n, deviceptr<double> l, deviceptr<double> d, deviceptr<double> u, deviceptr<double> h) { var rank = threadIdx.x; var ltemp = 0.0; var utemp = 0.0; var htemp = 0.0; var span = 1; while (span < n) { if (rank < n) { ltemp = (rank - span >= 0) ? (d[rank - span] != 0.0) ? -l[rank] / d[rank - span] : 0.0 : 0.0; utemp = (rank + span < n) ? (d[rank + span] != 0.0) ? -u[rank] / d[rank + span] : 0.0 : 0.0; htemp = h[rank]; } Intrinsic.__syncthreads(); if (rank < n) { if (rank - span >= 0) { d[rank] = d[rank] + ltemp * u[rank - span]; htemp = htemp + ltemp * h[rank - span]; ltemp = ltemp * l[rank - span]; } if (rank + span < n) { d[rank] = d[rank] + utemp * l[rank + span]; htemp = htemp + utemp * h[rank + span]; utemp = utemp * u[rank + span]; } } Intrinsic.__syncthreads(); if (rank < n) { l[rank] = ltemp; u[rank] = utemp; h[rank] = htemp; } Intrinsic.__syncthreads(); span *= 2; } if (rank < n) h[rank] = h[rank] / d[rank]; }
void ISimulator.Integrate(deviceptr<float4> newPos, deviceptr<float4> oldPos, deviceptr<float4> vel, int numBodies, float deltaTime, float softeningSquared, float damping) { IntegrateNbodySystem(newPos, oldPos, vel, numBodies, deltaTime, softeningSquared, damping); }