// Fixed Block Size!
/// <summary>
/// Runs <c>ComputeEdgeDetectFilter0AtOffsetNapron</c> over every pixel of
/// <paramref name="image"/> on the default GPU, using a fixed 256 x 256 launch
/// configuration in which each thread advances by <c>blockDim.x * gridDim.x</c>.
/// </summary>
/// <param name="image">Source bitmap; its width and height size the returned image.</param>
/// <param name="filter">Supplies the Filter, Factor and Offset values forwarded to the per-pixel routine.</param>
/// <returns>A new image built from the pixels copied back from the device.</returns>
internal static Image Render3(Bitmap image, ConvolutionFilter filter)
{
    var gpu = Gpu.Default;
    var width = image.Width;
    var array = BitmapUtility.ToColorArray(image);
    var mFilter = filter.Filter;
    var mFactor = filter.Factor;
    var mOffset = filter.Offset;

    var inputMemory = gpu.ArrayGetMemory(array, true, false);
    var inputDevPtr = new deviceptr<ColorRaw>(inputMemory.Handle);
    var resultLength = array.Length;

    // The result buffer is only needed until it has been copied back to the host,
    // so it is released deterministically instead of being left to the finalizer.
    using (var resultMemory = gpu.AllocateDevice<ColorRaw>(resultLength))
    {
        var resultDevPtr = new deviceptr<ColorRaw>(resultMemory.Handle);
        var lp = new LaunchParam(256, 256);

        gpu.Launch(() =>
        {
            var i = blockDim.x * blockIdx.x + threadIdx.x;
            while (i < resultLength)
            {
                ComputeEdgeDetectFilter0AtOffsetNapron(inputDevPtr, resultDevPtr, resultLength, mFilter, mFactor, mOffset, i, width);
                i += blockDim.x * gridDim.x;
            }
        }, lp);

        return BitmapUtility.FromColorArray(Gpu.CopyToHost(resultMemory), image.Width, image.Height);
    }
}
// Alea Parallel.For!
/// <summary>
/// Runs <c>ComputeEdgeDetectFilter0AtOffsetNapron</c> over every pixel of
/// <paramref name="image"/> on the default GPU via <c>gpu.For</c>.
/// </summary>
/// <param name="image">Source bitmap; its width and height size the returned image.</param>
/// <param name="filter">Supplies the Filter, Factor and Offset values forwarded to the per-pixel routine.</param>
/// <returns>A new image built from the pixels copied back from the device.</returns>
internal static Image Render1(Bitmap image, ConvolutionFilter filter)
{
    var gpu = Gpu.Default;
    var width = image.Width;
    var array = BitmapUtility.ToColorArray(image);
    var mFilter = filter.Filter;
    var mFactor = filter.Factor;
    var mOffset = filter.Offset;

    var inputMemory = gpu.ArrayGetMemory(array, true, false);
    var inputDevPtr = new deviceptr<ColorRaw>(inputMemory.Handle);
    var resultLength = array.Length;

    // Dispose the device-side result buffer as soon as it has been copied to the host.
    using (var resultMemory = gpu.AllocateDevice<ColorRaw>(resultLength))
    {
        var resultDevPtr = new deviceptr<ColorRaw>(resultMemory.Handle);

        gpu.For(0, resultLength, i =>
        {
            // Guard kept from the original implementation.
            if (i < resultLength)
            {
                ComputeEdgeDetectFilter0AtOffsetNapron(inputDevPtr, resultDevPtr, resultLength, mFilter, mFactor, mOffset, i, width);
            }
        });

        return BitmapUtility.FromColorArray(Gpu.CopyToHost(resultMemory), image.Width, image.Height);
    }
}
// Custom!
/// <summary>
/// Fills a byte image by calling <c>ComputeMandelbrotAtOffset</c> once per pixel on the
/// default GPU, using launch parameters obtained from <c>ComputeLaunchParameters</c>.
/// </summary>
/// <param name="bounds">
/// Region and pixel dimensions to render. <c>AdjustAspectRatio</c> is invoked on it
/// before its values are read.
/// </param>
/// <returns>An image of <c>bounds.Width</c> x <c>bounds.Height</c> pixels.</returns>
internal static Image Render2(Bounds bounds)
{
    bounds.AdjustAspectRatio();

    var width = bounds.Width;
    var height = bounds.Height;
    var scale = (bounds.XMax - bounds.XMin) / width;
    var resultLength = ColorComponents * width * height;

    // Dispose the device buffer once its contents have been copied to the host.
    using (var resultMemory = Gpu.Default.AllocateDevice<byte>(resultLength))
    {
        var resultDevPtr = new deviceptr<byte>(resultMemory.Handle);
        var lp = ComputeLaunchParameters(bounds);

        Gpu.Default.Launch(() =>
        {
            var i = blockDim.x * blockIdx.x + threadIdx.x;
            var x = i % width;
            var y = i / width;
            var offset = ColorComponents * i;

            // Threads whose pixel falls past the end of the buffer do nothing.
            if (offset < resultLength)
            {
                var c = new Complex
                {
                    Real = bounds.XMin + x * scale,
                    Imaginary = bounds.YMin + y * scale,
                };

                ComputeMandelbrotAtOffset(resultDevPtr, c, offset);
            }
        }, lp);

        return BitmapUtility.FromByteArray(Gpu.CopyToHost(resultMemory), width, height);
    }
}
// Link: https://mail.google.com/mail/u/0/#inbox/1598d0b3b2850009?projector=1
// Fixed Block and Thread!
/// <summary>
/// Reduces <paramref name="array"/> with <paramref name="op"/> in two kernel launches:
/// the first uses 256 blocks of 256 threads and writes into a 256-element buffer, the
/// second uses a single block of 256 threads to reduce that buffer.
/// </summary>
/// <typeparam name="T">Element type of the array.</typeparam>
/// <param name="array">Values to reduce.</param>
/// <param name="op">Binary operator handed to the reduction kernel.</param>
/// <returns>Element 0 of the buffer written by the second launch.</returns>
internal static T ComputeGpu5<T>(T[] array, Func<T, T, T> op)
{
    const int gridSize = 256;
    const int blockSize = 256;

    var gpu = Gpu.Default;
    var inputLength = array.Length;
    var inputMemory = gpu.ArrayGetMemory(array, true, false);
    var inputDevPtr = new deviceptr<T>(inputMemory.Handle);

    // Both scratch buffers are disposed when the reduction is finished. Previously the
    // first buffer's handle was overwritten and neither buffer was ever released.
    using (var partialMemory = gpu.AllocateDevice<T>(gridSize))
    using (var finalMemory = gpu.AllocateDevice<T>(gridSize))
    {
        // Each launch gets its own pointer variables so that no lambda captures a
        // local that is reassigned afterwards.
        var partialDevPtr = new deviceptr<T>(partialMemory.Handle);
        var finalDevPtr = new deviceptr<T>(finalMemory.Handle);

        // First launch: input array -> intermediate buffer.
        gpu.Launch(
            () => KernelSequentialReduceIdleThreadsWarpMultiple(inputDevPtr, inputLength, partialDevPtr, op),
            new LaunchParam(gridSize, blockSize));

        // Second launch: intermediate buffer -> final buffer, single block.
        gpu.Launch(
            () => KernelSequentialReduceIdleThreadsWarpMultiple(partialDevPtr, gridSize, finalDevPtr, op),
            new LaunchParam(1, blockSize));

        return Gpu.CopyToHost(finalMemory)[0];
    }
}
// Alea Parallel.For!
/// <summary>
/// Fills a byte image by calling <c>ComputeMandelbrotAtOffset</c> once per pixel on the
/// default GPU via <c>Gpu.Default.For</c>.
/// </summary>
/// <param name="bounds">
/// Region and pixel dimensions to render. <c>AdjustAspectRatio</c> is invoked on it
/// before its values are read.
/// </param>
/// <returns>An image of <c>bounds.Width</c> x <c>bounds.Height</c> pixels.</returns>
internal static Image Render1(Bounds bounds)
{
    bounds.AdjustAspectRatio();

    var width = bounds.Width;
    var height = bounds.Height;
    var scale = (bounds.XMax - bounds.XMin) / width;
    var resultLength = ColorComponents * width * height;

    // Dispose the device buffer once its contents have been copied to the host.
    using (var resultMemory = Gpu.Default.AllocateDevice<byte>(resultLength))
    {
        var resultDevPtr = new deviceptr<byte>(resultMemory.Handle);

        Gpu.Default.For(0, width * height, i =>
        {
            var x = i % width;
            var y = i / width;
            var offset = ColorComponents * i;

            if (offset < resultLength)
            {
                // ReSharper disable once PossibleLossOfFraction
                var c = new Complex
                {
                    Real = bounds.XMin + x * scale,
                    Imaginary = bounds.YMin + y * scale,
                };

                ComputeMandelbrotAtOffset(resultDevPtr, c, offset);
            }
        });

        return BitmapUtility.FromByteArray(Gpu.CopyToHost(resultMemory), width, height);
    }
}
/// <summary>
/// Launches <c>ComputeDistancesKernel</c> for <paramref name="query"/> against the
/// <c>subDatasetGpu</c> field and returns the contents of <c>distancesGpu</c> copied
/// back to the host.
/// </summary>
/// <param name="query">Item whose <c>Descriptor</c> is uploaded to the GPU for the launch.</param>
/// <returns>The host copy of <c>distancesGpu</c> after the kernel launch.</returns>
/// <remarks>
/// The GPU work runs on a task and is serialized through <c>gpuLock</c>; the caller
/// blocks on the task's result. A CPU path for the entries beyond <c>gpuDatasetSize</c>
/// existed here only as commented-out code and is not executed.
/// </remarks>
private float[] ComputeDistancesDatasetGPU(Item query)
{
    // gpu
    Task<float[]> task = Task<float[]>.Factory.StartNew(() =>
    {
        lock (gpuLock)
        {
            float[] queryGpu = Gpu.Default.Allocate(query.Descriptor);
            try
            {
                Gpu.Default.Launch<float[], float[][], float[], Func<float[], float[], float>>(
                    ComputeDistancesKernel,
                    launchParam,
                    queryGpu,
                    subDatasetGpu,
                    distancesGpu,
                    Item.GetDistanceSQR);

                return Gpu.CopyToHost(distancesGpu);
            }
            finally
            {
                // Release the per-query device buffer even when the launch or the
                // copy throws; distancesGpu is a shared field and is left allocated.
                Gpu.Free(queryGpu);
            }
        }
    });

    return task.Result;
}
/// <summary>
/// Converts <c>samplePoints</c> to <c>float3</c>, uploads them together with the
/// constant values, runs <c>ClassifyVoxel</c> for each of the <c>numVoxels</c> voxels
/// on the default GPU and stores the per-voxel result in <c>voxelVerts</c>.
/// </summary>
public void runClassifyVoxel()
{
    var gpu = Gpu.Default;
    samplePts = ConvertPointsToFloat3(samplePoints);

    int[] d_voxelVerts = null;
    float3[] d_samplePts = null;
    try
    {
        // allocate device memory
        d_voxelVerts = gpu.Allocate<int>(numVoxels);
        d_samplePts = gpu.Allocate<float3>(samplePts);

        // copy constant values
        float3 baseP = new float3((float)basePoint.X, (float)basePoint.Y, (float)basePoint.Z);
        gpu.Copy(isoValue, constIsovalue);
        gpu.Copy(fusion, constFusion);
        gpu.Copy(baseP, constBasePoint);
        gpu.Copy(voxelSize, constVoxelSize);
        gpu.Copy(gridSize, constGridSize);
        gpu.Copy(Tables.VertsTable, verticesTable);

        gpu.For(0, numVoxels, i => ClassifyVoxel(i, d_samplePts, d_voxelVerts));

        voxelVerts = Gpu.CopyToHost(d_voxelVerts);
    }
    finally
    {
        // Free whatever was allocated, including when a copy or the kernel throws.
        if (d_samplePts != null)
        {
            Gpu.Free(d_samplePts);
        }

        if (d_voxelVerts != null)
        {
            Gpu.Free(d_voxelVerts);
        }
    }
}
/// <summary>
/// Checks whether or not the Cuda features are currently supported
/// </summary>
/// <returns>
/// <c>true</c> when a default GPU exists, <c>Dnn.IsAvailable</c> is set and a small
/// probe kernel writes the expected values; <c>false</c> otherwise, including when any
/// exception is thrown along the way.
/// </returns>
public static bool IsGpuAccelerationSupported()
{
    const int probeSize = 1024;

    try
    {
        // CUDA test, then cuDNN
        Gpu device = Gpu.Default;
        if (device == null || !Dnn.IsAvailable)
        {
            return false;
        }

        using (DeviceMemory<float> probe = device.AllocateDevice<float>(probeSize))
        {
            deviceptr<float> probePtr = probe.Ptr;

            // Each index writes its own value into the probe buffer.
            void WriteIndex(int i) => probePtr[i] = i;

            Alea.Parallel.GpuExtension.For(device, 0, probeSize, WriteIndex); // JIT test

            float[] readBack = Gpu.CopyToHost(probe);
            float[] expected = Enumerable.Range(0, probeSize).Select<int, float>(i => i).ToArray();
            return expected.ContentEquals(readBack);
        }
    }
    catch
    {
        // Missing .dll or other errors
        return false;
    }
}
// reduce empty voxel and extract active voxels
/// <summary>
/// Collects the voxels whose <c>voxelVerts</c> entry is positive, then runs a GPU scan
/// over their vertex counts to fill <c>verts_scanIdx</c> (the start offset of each
/// active voxel's vertices) and <c>sumVerts</c> (the total vertex count).
/// </summary>
/// <exception cref="InvalidOperationException">
/// Thrown when no voxel is active. The message is unchanged from the previous
/// implementation, which threw a plain <see cref="Exception"/>.
/// </exception>
public void runExtractActiveVoxels()
{
    var gpu = Gpu.Default;

    // Gather the index and the vertex count of every active voxel in one pass.
    var activeIndices = new List<int>();
    var activeVertCounts = new List<int>();
    for (int i = 0; i < voxelVerts.Length; i++)
    {
        if (voxelVerts[i] > 0)
        {
            activeIndices.Add(i);
            activeVertCounts.Add(voxelVerts[i]);
        }
    }

    // the index of active voxel
    index_voxelActive = activeIndices.ToArray();
    num_voxelActive = index_voxelActive.Length;
    // the number of vertices in each active voxel
    verts_voxelActive = activeVertCounts.ToArray();
    // the number of all vertices
    sumVerts = 0;

    // Fail fast before touching the GPU. Previously this case was detected by catching
    // the out-of-range access on an empty array after the scan had already run.
    if (num_voxelActive == 0)
    {
        verts_scanIdx = new int[0];
        throw new InvalidOperationException("No eligible isosurface can be extracted, please change isovalue.");
    }

    // execute scan for finding out the indices of result vertices
    var op = new Func<int, int, int>((a, b) => a + b);
    Alea.Session session = new Alea.Session(gpu);

    int[] d_verts_voxelActive = null;
    int[] d_voxelVertsScan = null;
    int[] result_Scan;
    try
    {
        d_verts_voxelActive = gpu.Allocate<int>(verts_voxelActive);
        d_voxelVertsScan = gpu.Allocate<int>(num_voxelActive);

        GpuExtension.Scan<int>(session, d_voxelVertsScan, d_verts_voxelActive, 0, op, 0);

        result_Scan = Gpu.CopyToHost(d_voxelVertsScan);
    }
    finally
    {
        if (d_verts_voxelActive != null)
        {
            Gpu.Free(d_verts_voxelActive);
        }

        if (d_voxelVertsScan != null)
        {
            Gpu.Free(d_voxelVertsScan);
        }
    }

    // Shift the scan result right by one so that entry i holds the sum of all counts
    // before voxel i; entry 0 keeps its default value of 0.
    verts_scanIdx = new int[num_voxelActive];
    for (int i = 1; i < num_voxelActive; i++)
    {
        verts_scanIdx[i] = result_Scan[i - 1];
    }

    int last = num_voxelActive - 1;
    sumVerts = verts_scanIdx[last] + verts_voxelActive[last];
}
/// <summary>
/// Runs <c>TraceKernel</c> once per pixel on the default GPU and converts the resulting
/// colors into an image.
/// </summary>
/// <param name="width">Image width in pixels; also forwarded to the kernel.</param>
/// <param name="height">Image height in pixels.</param>
/// <returns>An image of <paramref name="width"/> x <paramref name="height"/> pixels.</returns>
internal Image Trace(int width, int height)
{
    // Dispose the device buffer once its contents have been copied to the host.
    using (var resultMemory = Gpu.Default.AllocateDevice<ColorRaw>(width * height))
    {
        var resultDevPtr = new deviceptr<ColorRaw>(resultMemory.Handle);

        Gpu.Default.For(0, width * height, i => TraceKernel(i, resultDevPtr, width));

        return BitmapUtility.FromColorArray(Gpu.CopyToHost(resultMemory), width, height);
    }
}
// Alea Parallel.For!
/// <summary>
/// Fills a byte image by calling <c>ComputeRippleAtOffset</c> once per pixel on the
/// default GPU via <c>Gpu.Default.For</c>.
/// </summary>
/// <param name="width">Image width in pixels.</param>
/// <param name="height">Image height in pixels.</param>
/// <returns>An image of <paramref name="width"/> x <paramref name="height"/> pixels.</returns>
internal static Image Render1(int width, int height)
{
    var resultLength = ColorComponents * width * height;

    // Dispose the device buffer once its contents have been copied to the host.
    using (var resultMemory = Gpu.Default.AllocateDevice<byte>(resultLength))
    {
        var resultDevPtr = new deviceptr<byte>(resultMemory.Handle);

        Gpu.Default.For(0, width * height, i =>
        {
            ComputeRippleAtOffset(resultDevPtr, i, width, height);
        });

        return BitmapUtility.FromByteArray(Gpu.CopyToHost(resultMemory), width, height);
    }
}
// Custom!
/// <summary>
/// Fills a byte image by calling <c>ComputeRippleAtOffset</c> once per pixel on the
/// default GPU, using launch parameters obtained from <c>ComputeLaunchParameters</c>.
/// </summary>
/// <param name="width">Image width in pixels.</param>
/// <param name="height">Image height in pixels.</param>
/// <returns>An image of <paramref name="width"/> x <paramref name="height"/> pixels.</returns>
internal static Image Render2(int width, int height)
{
    var pixelCount = width * height;
    var resultLength = ColorComponents * pixelCount;

    // Dispose the device buffer once its contents have been copied to the host.
    using (var resultMemory = Gpu.Default.AllocateDevice<byte>(resultLength))
    {
        var resultDevPtr = new deviceptr<byte>(resultMemory.Handle);
        var lp = ComputeLaunchParameters(width, height);

        Gpu.Default.Launch(() =>
        {
            var i = blockDim.x * blockIdx.x + threadIdx.x;

            // The launch may contain more threads than pixels; surplus threads must not
            // touch the buffer.
            // NOTE(review): ComputeRippleAtOffset may already guard this - confirm.
            if (i < pixelCount)
            {
                ComputeRippleAtOffset(resultDevPtr, i, width, height);
            }
        }, lp);

        return BitmapUtility.FromByteArray(Gpu.CopyToHost(resultMemory), width, height);
    }
}
/// <summary>
/// Verifies that <c>AllocateDevice(tensor, 5, 3)</c> on a 3 x 10 tensor yields device
/// memory whose host copy is the nine values 1..9 stored in columns 5 to 7 of the source.
/// </summary>
public void AllocateDeviceRows()
{
    // Only columns 5, 6 and 7 carry non-zero data.
    float[,] matrix =
    {
        { 0, 0, 0, 0, 0, 1, 2, 3, 0, 0 },
        { 0, 0, 0, 0, 0, 4, 5, 6, 0, 0 },
        { 0, 0, 0, 0, 0, 7, 8, 9, 0, 0 }
    };
    Tensor.From(matrix, out Tensor tensor);

    float[] expected = { 1, 2, 3, 4, 5, 6, 7, 8, 9 };

    Gpu gpu = Gpu.Default;
    using (DeviceMemory<float> window = gpu.AllocateDevice(tensor, 5, 3))
    {
        float[] actual = Gpu.CopyToHost(window);
        Assert.IsTrue(actual.ContentEquals(expected));
    }
}
/// <summary>
/// Uploads <paramref name="a"/>, <paramref name="b"/> and <paramref name="c"/> to the
/// default GPU, calls <c>cublasSgemm_v2</c> and writes the resulting contents of the C
/// buffer back into <paramref name="c"/>.
/// </summary>
/// <param name="ta">Non-zero selects the transposed operation for A.</param>
/// <param name="tb">Non-zero selects the transposed operation for B.</param>
/// <param name="m">Passed to cuBLAS as its second dimension argument.</param>
/// <param name="n">Passed to cuBLAS as its first dimension argument.</param>
/// <param name="k">Passed to cuBLAS as its third dimension argument.</param>
/// <param name="alpha">Scalar passed by pointer as cuBLAS alpha.</param>
/// <param name="a">Host data for A; not modified.</param>
/// <param name="lda">Leading dimension passed alongside A.</param>
/// <param name="b">Host data for B; not modified.</param>
/// <param name="ldb">Leading dimension passed alongside B.</param>
/// <param name="beta">Scalar passed by pointer as cuBLAS beta.</param>
/// <param name="c">Host data for C; overwritten in place with the result.</param>
/// <param name="ldc">Leading dimension passed alongside C.</param>
public static unsafe void gemm_ongpu(int ta, int tb, int m, int n, int k, float alpha, float[] a, int lda, float[] b, int ldb, float beta, float[] c, int ldc)
{
    using (var gpuA = Gpu.Default.AllocateDevice(a.ToArray()))
    using (var gpuB = Gpu.Default.AllocateDevice(b.ToArray()))
    using (var gpuC = Gpu.Default.AllocateDevice(c.ToArray()))
    {
        var handle = CudaUtils.blas_handle();

        // B is passed before A and n before m.
        // NOTE(review): presumably the usual way of driving column-major cuBLAS with
        // row-major data - confirm against the callers.
        var opB = tb != 0 ? cublasOperation_t.CUBLAS_OP_T : cublasOperation_t.CUBLAS_OP_N;
        var opA = ta != 0 ? cublasOperation_t.CUBLAS_OP_T : cublasOperation_t.CUBLAS_OP_N;

        CudaUtils.SafeCall(CuBlas.cublasSgemm_v2(
            handle,
            opB,
            opA,
            n, m, k,
            &alpha,
            (float *)gpuB.Handle, ldb,
            (float *)gpuA.Handle, lda,
            &beta,
            (float *)gpuC.Handle, ldc));

        // Copy the result INTO the caller's array. Assigning a new array to the
        // parameter (as before) only rebinds the local and discards the GEMM output.
        // A and B are inputs only, so they do not need to be read back.
        float[] result = Gpu.CopyToHost(gpuC);
        System.Array.Copy(result, c, c.Length);
    }
}
// I'm sure memory management is far from optimal!
// Helpers
/// <summary>
/// Reduces <paramref name="array"/> with <paramref name="op"/>. Arrays shorter than
/// <c>CpuThreashold</c> are aggregated on the CPU; otherwise <paramref name="kernel"/>
/// is launched repeatedly, each pass producing one value per block, until the remaining
/// length drops below the threshold and the rest is aggregated on the CPU.
/// </summary>
/// <typeparam name="T">Element type.</typeparam>
/// <param name="array">Values to reduce.</param>
/// <param name="op">Binary operator used by both the kernel and the CPU aggregation.</param>
/// <param name="kernel">Reduction kernel taking (input pointer, input length, output array, operator).</param>
/// <param name="launchParamsFactory">Produces launch parameters for a given input length; the grid's x dimension becomes the next pass's length.</param>
/// <returns>The aggregated value.</returns>
/// <remarks>
/// Intermediate device arrays created by <c>gpu.Allocate</c> are still not freed
/// explicitly; the original author noted that disposing the wrapped memory after a
/// launch did not work, so that behavior is left untouched.
/// </remarks>
private static T ReduceHelper<T>(T[] array, Func<T, T, T> op, Action<deviceptr<T>, int, T[], Func<T, T, T>> kernel, Func<int, LaunchParam> launchParamsFactory)
{
    if (array.Length < CpuThreashold)
    {
        return array.AsParallel().Aggregate(op);
    }

    var gpu = Gpu.Default;
    var arrayLength = array.Length;
    var arrayMemory = gpu.ArrayGetMemory(array, true, false);
    var arrayDevPtr = new deviceptr<T>(arrayMemory.Handle);

    while (true)
    {
        // I'm allowed to use the CPU.
        // Checked BEFORE allocating the next output buffer: previously a device array
        // was allocated on the final iteration and then abandoned.
        if (arrayLength < CpuThreashold)
        {
            using (var m = arrayMemory as Memory<T>)
            {
                return Gpu.CopyToHost(m).AsParallel().Aggregate(op);
            }
        }

        var launchParams = launchParamsFactory(arrayLength);
        var resultLength = launchParams.GridDim.x;
        var resultDevice = gpu.Allocate<T>(resultLength);

        // Per-iteration copies so the lambda does not capture variables that are
        // reassigned at the bottom of the loop.
        var currentDevPtr = arrayDevPtr;
        var currentLength = arrayLength;
        gpu.Launch(() => kernel(currentDevPtr, currentLength, resultDevice, op), launchParams);

        // The output of this pass becomes the input of the next one.
        arrayLength = resultLength;
        arrayMemory = gpu.ArrayGetMemory(resultDevice, true, false);
        arrayDevPtr = new deviceptr<T>(arrayMemory.Handle);
    }
}
/// <summary>
/// Sorts a copy of <paramref name="array"/> in ascending order on the default GPU using
/// odd-even transposition: phases alternate between comparing pairs that start at even
/// indices and pairs that start at odd indices, swapping out-of-order neighbours.
/// </summary>
/// <param name="array">Values to sort; the input array itself is not modified.</param>
/// <returns>A new array holding the sorted values.</returns>
/// <remarks>
/// Each phase is a separate <c>gpu.For</c> launch, so all exchanges of one phase are
/// finished before the next phase begins.
/// NOTE(review): this assumes consecutive <c>gpu.For</c> launches on the default GPU
/// execute in order - confirm against the Alea documentation.
/// The previous single-launch version performed both parities in the same step, called
/// <c>DeviceFunction.SyncThreads</c> inside a branch that the last thread never
/// entered, and ran only about n/2 steps, so it could not guarantee a sorted result.
/// One launch per phase costs n launches for n elements.
/// </remarks>
internal static int[] Compute1(int[] array) /*where T : IComparable<T>*/
{
    var inputLength = array.Length;

    // Nothing to compare; still hand back a fresh array like the GPU path does.
    if (inputLength < 2)
    {
        return (int[])array.Clone();
    }

    var gpu = Gpu.Default;
    var inputMemory = gpu.Allocate(array);
    try
    {
        var pairCount = inputLength / 2;

        // Odd-even transposition needs n phases to sort n elements.
        for (var phase = 0; phase < inputLength; phase++)
        {
            // 0 => pairs (0,1),(2,3),...   1 => pairs (1,2),(3,4),...
            var parity = phase % 2;

            gpu.For(0, pairCount, pair =>
            {
                var left = 2 * pair + parity;

                // Pairs within one phase are disjoint, so no two threads touch the
                // same element.
                if (left + 1 < inputLength && inputMemory[left] > inputMemory[left + 1])
                {
                    Exchange(inputMemory, left, left + 1);
                }
            });
        }

        return Gpu.CopyToHost(inputMemory);
    }
    finally
    {
        Gpu.Free(inputMemory);
    }
}
/// <summary>
/// Computes a 10240 x 10240 escape-time image on the GPU (up to 255 iterations of
/// z = z^2 + c per pixel), prints the elapsed time around the <c>gpu.For</c> call and
/// saves the result as <c>b.png</c>.
/// </summary>
private static void DoStuff()
{
    const float xs = -2.1F;
    const float ys = -1.3F;
    const int zoom = 1;
    const int width = 10240;
    const int height = 10240;
    const int blackpixel = (byte)0 | ((byte)0 << 8) | ((byte)0 << 16) | ((byte)255 << 24);
    const float dx = 2.6F / zoom / width;
    const float dy = 2.6F / zoom / height;

    var gpubits = gpu.Allocate<int>(width * height);
    try
    {
        Console.WriteLine("Started...");
        var watch = new Stopwatch();
        watch.Start();

        gpu.For(0, width * height, (index) =>
        {
            // The buffer is laid out row by row (the bitmap stride below is width * 4),
            // so the row length is width. The original divided by height, which only
            // worked because the image is square.
            var y = index / width;
            var x = index - (y * width);

            var cr = xs + (x * dx);
            var ci = ys + (y * dy);
            var zr = 0F;
            var zi = 0F;
            float zrSquare;
            float ziSquare;
            byte i = 0;

            while (i < 255)
            {
                zrSquare = zr * zr;
                ziSquare = zi * zi;
                zi = (2 * zi * zr) + ci;
                zr = zrSquare - ziSquare + cr;
                i++;
                if (zrSquare + ziSquare > 4)
                {
                    break;
                }
            }

            if (i == 255)
            {
                // Never escaped within the iteration budget.
                gpubits[index] = blackpixel;
            }
            else
            {
                // Pack three iteration-derived bytes plus an opaque top byte.
                gpubits[index] = (byte)(i * 25 % 256) | ((byte)(i * 3 % 256) << 8) | (((byte)(i % 256)) << 16) | ((byte)255 << 24);
            }
        });

        watch.Stop();
        // NOTE(review): whether gpu.For blocks until the kernel has finished is not
        // visible here, so this may not measure the full kernel time - confirm.
        Console.WriteLine($"Elapsed microseconds: {((double)watch.ElapsedTicks) / Stopwatch.Frequency * 1000000}");

        var bits = Gpu.CopyToHost(gpubits);

        // Pin the host array so the bitmap can wrap it without copying.
        var bitsHandle = GCHandle.Alloc(bits, GCHandleType.Pinned);
        try
        {
            using (var bitmap = new Bitmap(width, height, width * 4, PixelFormat.Format32bppPArgb, bitsHandle.AddrOfPinnedObject()))
            {
                bitmap.Save("b.png", ImageFormat.Png);
            }
        }
        finally
        {
            bitsHandle.Free();
        }
    }
    finally
    {
        Gpu.Free(gpubits);
    }
}
// extract isosurface points using GPU
/// <summary>
/// For every active voxel, evaluates the field at its eight corners, interpolates the
/// surface crossing points on the flagged edges and writes the triangle vertices into
/// the result buffer at the offsets given by <c>verts_scanIdx</c>. The result is stored
/// in <c>resultVerts</c> and returned converted to <c>Point3f</c>.
/// </summary>
/// <returns>The extracted vertices, three consecutive entries per triangle.</returns>
/// <remarks>
/// The kernel reads <c>constGridSize</c>, <c>constVoxelSize</c> and <c>constFusion</c>,
/// which this method does not upload; they are copied in <c>runClassifyVoxel</c>.
/// </remarks>
public List<Point3f> runExtractIsoSurfaceGPU()
{
    var gpu = Gpu.Default;

    float3[] d_pts = null;
    float3[] d_resultV = null;
    float[] d_cubeValues = null;
    float3[] d_samplePts = null;
    int[] d_verts_scanIdx = null;
    int[] d_index_voxelActive = null;

    try
    {
        // output arguments
        float3[] pts = new float3[12 * num_voxelActive];
        d_pts = gpu.Allocate<float3>(pts);
        d_resultV = gpu.Allocate<float3>(sumVerts);
        d_cubeValues = gpu.Allocate<float>(8 * num_voxelActive);

        // input arguments
        d_samplePts = gpu.Allocate<float3>(samplePts);
        d_verts_scanIdx = gpu.Allocate<int>(verts_scanIdx);
        d_index_voxelActive = gpu.Allocate<int>(index_voxelActive);

        // const values
        gpu.Copy(Vertices, constVertices);
        gpu.Copy(EdgeDirection, constEdgeDirection);
        gpu.Copy(EdgeConnection, constEdgeConnection);
        gpu.Copy(Tables.EdgeTable, edgeTable);
        gpu.Copy(Tables.TriangleTable_GPU, triangleTable);

        float3 baseP = new float3((float)basePoint.X, (float)basePoint.Y, (float)basePoint.Z);
        gpu.Copy(baseP, constBasePoint);
        gpu.Copy(isoValue, constIsovalue);
        gpu.Copy(scale, constScale);

        gpu.For(0, num_voxelActive, i =>
        {
            // compute the position in the grid
            int3 gridPos = calcGridPos(d_index_voxelActive[i], constGridSize.Value);
            float3 p = new float3();
            p.x = constBasePoint.Value.x + gridPos.x * constVoxelSize.Value.x;
            p.y = constBasePoint.Value.y + gridPos.y * constVoxelSize.Value.y;
            p.z = constBasePoint.Value.z + gridPos.z * constVoxelSize.Value.z;

            // output all eight corner vertices of the voxel
            float3 a0 = p;
            float3 a1 = CreateFloat3(constVoxelSize.Value.x + p.x, 0 + p.y, 0 + p.z);
            float3 a2 = CreateFloat3(constVoxelSize.Value.x + p.x, constVoxelSize.Value.y + p.y, 0 + p.z);
            float3 a3 = CreateFloat3(0 + p.x, constVoxelSize.Value.y + p.y, 0 + p.z);
            float3 a4 = CreateFloat3(0 + p.x, 0 + p.y, constVoxelSize.Value.z + p.z);
            float3 a5 = CreateFloat3(constVoxelSize.Value.x + p.x, 0 + p.y, constVoxelSize.Value.z + p.z);
            float3 a6 = CreateFloat3(constVoxelSize.Value.x + p.x, constVoxelSize.Value.y + p.y, constVoxelSize.Value.z + p.z);
            float3 a7 = CreateFloat3(0 + p.x, constVoxelSize.Value.y + p.y, constVoxelSize.Value.z + p.z);

            // Sum of the squared voxel edge lengths, scaled by the fusion constant.
            float distance = constVoxelSize.Value.x * constVoxelSize.Value.x + constVoxelSize.Value.y * constVoxelSize.Value.y + constVoxelSize.Value.z * constVoxelSize.Value.z;
            float radius = distance * constFusion.Value;

            // Compute cubeValues of 8 vertices
            d_cubeValues[i * 8] = ComputeValue(d_samplePts, a0, d_samplePts.Length, constFusion.Value, radius);
            d_cubeValues[i * 8 + 1] = ComputeValue(d_samplePts, a1, d_samplePts.Length, constFusion.Value, radius);
            d_cubeValues[i * 8 + 2] = ComputeValue(d_samplePts, a2, d_samplePts.Length, constFusion.Value, radius);
            d_cubeValues[i * 8 + 3] = ComputeValue(d_samplePts, a3, d_samplePts.Length, constFusion.Value, radius);
            d_cubeValues[i * 8 + 4] = ComputeValue(d_samplePts, a4, d_samplePts.Length, constFusion.Value, radius);
            d_cubeValues[i * 8 + 5] = ComputeValue(d_samplePts, a5, d_samplePts.Length, constFusion.Value, radius);
            d_cubeValues[i * 8 + 6] = ComputeValue(d_samplePts, a6, d_samplePts.Length, constFusion.Value, radius);
            d_cubeValues[i * 8 + 7] = ComputeValue(d_samplePts, a7, d_samplePts.Length, constFusion.Value, radius);

            // Check each vertex state: one bit per corner, combined into a table index.
            int flag = Compact(d_cubeValues[i * 8], constIsovalue.Value);
            flag += Compact(d_cubeValues[i * 8 + 1], constIsovalue.Value) * 2;
            flag += Compact(d_cubeValues[i * 8 + 2], constIsovalue.Value) * 4;
            flag += Compact(d_cubeValues[i * 8 + 3], constIsovalue.Value) * 8;
            flag += Compact(d_cubeValues[i * 8 + 4], constIsovalue.Value) * 16;
            flag += Compact(d_cubeValues[i * 8 + 5], constIsovalue.Value) * 32;
            flag += Compact(d_cubeValues[i * 8 + 6], constIsovalue.Value) * 64;
            flag += Compact(d_cubeValues[i * 8 + 7], constIsovalue.Value) * 128;

            // find out which edge intersects the isosurface
            int EdgeFlag = edgeTable[flag];

            // check whether this voxel is crossed by the isosurface
            for (int j = 0; j < 12; j++)
            {
                // check whether an edge have a point
                if ((EdgeFlag & (1 << j)) != 0)
                {
                    // compute t values from two end points on each edge
                    float Offset = GetOffset(d_cubeValues[i * 8 + constEdgeConnection[j * 2 + 0]], d_cubeValues[i * 8 + constEdgeConnection[j * 2 + 1]], constIsovalue.Value);
                    float3 pt = new float3();

                    // get positions
                    pt.x = constBasePoint.Value.x + (gridPos.x + constVertices[constEdgeConnection[j * 2 + 0] * 3 + 0] + Offset * constEdgeDirection[j * 3 + 0]) * constScale.Value;
                    pt.y = constBasePoint.Value.y + (gridPos.y + constVertices[constEdgeConnection[j * 2 + 0] * 3 + 1] + Offset * constEdgeDirection[j * 3 + 1]) * constScale.Value;
                    pt.z = constBasePoint.Value.z + (gridPos.z + constVertices[constEdgeConnection[j * 2 + 0] * 3 + 2] + Offset * constEdgeDirection[j * 3 + 2]) * constScale.Value;
                    d_pts[12 * i + j] = pt;
                }
            }

            int num = 0;

            // Find out points from each triangle (at most five per voxel; a negative
            // table entry ends the list).
            for (int Triangle = 0; Triangle < 5; Triangle++)
            {
                if (triangleTable[flag * 16 + 3 * Triangle] < 0)
                {
                    break;
                }

                for (int Corner = 0; Corner < 3; Corner++)
                {
                    int Vertex = triangleTable[flag * 16 + 3 * Triangle + Corner];
                    float3 pd = CreateFloat3(d_pts[12 * i + Vertex].x, d_pts[12 * i + Vertex].y, d_pts[12 * i + Vertex].z);
                    d_resultV[d_verts_scanIdx[i] + num] = pd;
                    num++;
                }
            }
        });

        resultVerts = Gpu.CopyToHost(d_resultV);
    }
    finally
    {
        // Release every device array, including d_cubeValues (which was previously
        // never freed), and do so even when an upload or the kernel throws.
        if (d_resultV != null)
        {
            Gpu.Free(d_resultV);
        }

        if (d_pts != null)
        {
            Gpu.Free(d_pts);
        }

        if (d_cubeValues != null)
        {
            Gpu.Free(d_cubeValues);
        }

        if (d_samplePts != null)
        {
            Gpu.Free(d_samplePts);
        }

        if (d_verts_scanIdx != null)
        {
            Gpu.Free(d_verts_scanIdx);
        }

        if (d_index_voxelActive != null)
        {
            Gpu.Free(d_index_voxelActive);
        }
    }

    return ConvertFloat3ToPoint3f(resultVerts);
}
/// <summary>
/// Copies the contents of <paramref name="gpuX"/> from the device into the existing
/// host array <paramref name="x"/>.
/// </summary>
/// <param name="x">Destination host array, filled in place. A null array is ignored.</param>
/// <param name="gpuX">Device memory to read.</param>
/// <remarks>
/// The previous implementation assigned the copied array to the parameter, which only
/// rebinds the local variable and leaves the caller's array untouched. If the lengths
/// differ, only the common prefix is copied.
/// </remarks>
private static void CopyValues(float[] x, DeviceMemory<float> gpuX)
{
    float[] host = Gpu.CopyToHost(gpuX);
    if (x == null)
    {
        return;
    }

    System.Array.Copy(host, x, System.Math.Min(host.Length, x.Length));
}