/// <summary>
/// Runs the <c>ClassifyVoxel</c> kernel once per voxel and stores the downloaded
/// per-voxel output in <c>voxelVerts</c>.
/// </summary>
/// <remarks>
/// The sample points are converted to <c>float3</c> and uploaded together with the constant
/// values (iso value, fusion, base point, voxel size, grid size and vertex table). The device
/// buffers are released in a <c>finally</c> block so they do not leak when an upload or the
/// kernel launch throws.
/// </remarks>
public void runClassifyVoxel()
{
    var gpu = Gpu.Default;
    samplePts = ConvertPointsToFloat3(samplePoints);

    // Declared before the try so the finally block frees exactly what was allocated.
    int[] d_voxelVerts = null;
    float3[] d_samplePts = null;
    try
    {
        // Allocate device memory: one output slot per voxel plus a copy of the sample points.
        d_voxelVerts = gpu.Allocate<int>(numVoxels);
        d_samplePts = gpu.Allocate<float3>(samplePts);

        // Copy constant values.
        float3 baseP = new float3((float)basePoint.X, (float)basePoint.Y, (float)basePoint.Z);
        gpu.Copy(isoValue, constIsovalue);
        gpu.Copy(fusion, constFusion);
        gpu.Copy(baseP, constBasePoint);
        gpu.Copy(voxelSize, constVoxelSize);
        gpu.Copy(gridSize, constGridSize);
        gpu.Copy(Tables.VertsTable, verticesTable);

        gpu.For(0, numVoxels, i => ClassifyVoxel(i, d_samplePts, d_voxelVerts));

        voxelVerts = Gpu.CopyToHost(d_voxelVerts);
    }
    finally
    {
        if (d_samplePts != null)
        {
            Gpu.Free(d_samplePts);
        }

        if (d_voxelVerts != null)
        {
            Gpu.Free(d_voxelVerts);
        }
    }
}
/// <summary>
/// Releases the four GPU buffers held by this instance.
/// </summary>
public void Dispose()
{
    // Same release order as before: the two working buffers, then their cache counterparts.
    var buffers = new System.Array[] { fromArr, toArr, fromCacheArr, toCacheArr };
    foreach (var buffer in buffers)
    {
        Gpu.Free(buffer);
    }
}
/// <summary>
/// Shades every pixel on the GPU, writes the clamped colour into <c>_pixelBufferImg</c>,
/// frees the four input buffers and finally pushes the pixel buffer into <c>Bitmap</c> via
/// the application dispatcher.
/// </summary>
/// <param name="color">Per-pixel colour, three floats per pixel; clamped in place to [0, 255].</param>
/// <param name="normal">Passed through to <c>ColorPass</c>.</param>
/// <param name="p3D">Passed through to <c>ColorPass</c>.</param>
/// <param name="tex">Passed through to <c>ColorPass</c>.</param>
/// <remarks>
/// All four arrays are handed to <c>Gpu.Free</c> before this method returns, also when the
/// kernel throws. When no application (and therefore no dispatcher) is available the bitmap
/// update is skipped; previously that case awaited <c>null</c> and raised a
/// <see cref="NullReferenceException"/> from this <c>async void</c> method.
/// </remarks>
private async void MergeBuffersGPU(float[] color, float[] normal, float[] p3D, float[] tex)
{
    _pixelBufferImg.Clear((byte)0);
    var specular = Settings.Specular;

    try
    {
        Gpu.Default.For(0, _imgHeight * _imgWidth, i =>
        {
            // Three consecutive components per pixel.
            var k = i * 3;
            ColorPass(_enviromentLight, color, normal, p3D, tex, specular, k);
            ArrayMath.Clamp(color, 0, 255, k, 3);
            _pixelBufferImg[k + 0] = (byte)(color[k + 0]);
            _pixelBufferImg[k + 1] = (byte)(color[k + 1]);
            _pixelBufferImg[k + 2] = (byte)(color[k + 2]);
        });
    }
    finally
    {
        Gpu.Free(color);
        Gpu.Free(normal);
        Gpu.Free(p3D);
        Gpu.Free(tex);
    }

    // `await App.Current?.Dispatcher.InvokeAsync(...)` awaits null when App.Current is null.
    var dispatcher = App.Current?.Dispatcher;
    if (dispatcher == null)
    {
        return;
    }

    await dispatcher.InvokeAsync(() =>
    {
        if (Bitmap == null)
        {
            return;
        }

        WriteToBitmap(Bitmap, _pixelBufferImg);
    });
}
/// <summary>
/// Computes the distances between <paramref name="query"/> and the GPU-resident sub-dataset
/// and returns them as a host array.
/// </summary>
/// <param name="query">Item whose <c>Descriptor</c> is uploaded and compared against <c>subDatasetGpu</c>.</param>
/// <returns>The content of <c>distancesGpu</c> copied to the host after the kernel ran.</returns>
/// <remarks>
/// The GPU work runs on a task and is serialized through <c>gpuLock</c>; the caller blocks on
/// the task result. The temporary query buffer is released in a <c>finally</c> block so it does
/// not leak when the launch or the download throws. <c>distancesGpu</c> is neither allocated nor
/// freed here.
/// </remarks>
private float[] ComputeDistancesDatasetGPU(Item query)
{
    // gpu
    Task<float[]> task = Task<float[]>.Factory.StartNew(() =>
    {
        lock (gpuLock)
        {
            float[] queryGpu = Gpu.Default.Allocate(query.Descriptor);
            //float[] distancesGpu = Gpu.Default.Allocate<float>(gpuDatasetSize);
            try
            {
                Gpu.Default.Launch<float[], float[][], float[], Func<float[], float[], float>>(
                    ComputeDistancesKernel,
                    launchParam,
                    queryGpu,
                    subDatasetGpu,
                    distancesGpu,
                    Item.GetDistanceSQR);

                float[] distancesGpuResultTask = Gpu.CopyToHost(distancesGpu);
                //Gpu.Free(distancesGpu);
                return distancesGpuResultTask;
            }
            finally
            {
                Gpu.Free(queryGpu);
            }
        }
    });

    // cpu
    //float[] distances = new float[Dataset.Length];
    //Parallel.For(gpuDatasetSize, distances.Length, index =>
    //{
    //    distances[index] = Item.GetDistanceSQR(query.Descriptor, Dataset[index].Descriptor);
    //});

    float[] distancesGpuResult = task.Result;
    //Array.Copy(distancesGpuResult, distances, distancesGpuResult.Length);
    return distancesGpuResult;
}
/// <summary>
/// Runs <c>TestSummGPU</c> ten times over two 32,000,000-element arrays of ones and prints the
/// elapsed time and the returned sum of every run.
/// </summary>
/// <returns>Always 0.</returns>
/// <remarks>
/// The timing uses <see cref="Stopwatch.ElapsedMilliseconds"/> (total elapsed milliseconds).
/// The previous <c>Elapsed.Milliseconds</c> is only the 0-999 millisecond component of the
/// elapsed time and wraps for runs longer than a second.
/// </remarks>
private static int TestGPU()
{
    var length = 32_000_000;
    var gpu = Gpu.Default;

    var a1 = Enumerable.Repeat(1, length).ToArray();
    var a2 = Enumerable.Repeat(1, length).ToArray();
    var r = new int[length];

    int[] arg1 = null;
    int[] arg2 = null;
    int[] result = null;
    try
    {
        arg1 = gpu.Allocate<int>(a1);
        arg2 = gpu.Allocate<int>(a2);
        result = gpu.Allocate<int>(r);

        for (var i = 0; i < 10; i++)
        {
            var sw = Stopwatch.StartNew();
            var s = TestSummGPU(gpu, length, arg1, arg2, result);
            sw.Stop();

            Console.WriteLine($"GPU1: {sw.ElapsedMilliseconds} Summ: {s}");
            Console.WriteLine();
        }
    }
    finally
    {
        // Release whatever was allocated, even if a run failed.
        foreach (var buffer in new Array[] { arg1, arg2, result })
        {
            if (buffer != null)
            {
                Gpu.Free(buffer);
            }
        }
    }

    return 0;
}
// Drops the empty voxels and extracts the active ones (voxelVerts[i] > 0).
// On return:
//   index_voxelActive - indices of the active voxels inside voxelVerts
//   num_voxelActive   - number of active voxels
//   verts_voxelActive - vertex count of every active voxel
//   verts_scanIdx     - exclusive prefix sum of verts_voxelActive (start offset per voxel)
//   sumVerts          - total number of vertices of all active voxels
// Throws an Exception when no voxel is active. That case is checked up front instead of being
// detected through an index exception after zero-length GPU buffers were already allocated.
public void runExtractActiveVoxels()
{
    var gpu = Gpu.Default;

    // Collect the indices of the active voxels.
    List<int> index_voxelActiveList = new List<int>();
    for (int i = 0; i < voxelVerts.Length; i++)
    {
        if (voxelVerts[i] > 0)
        {
            index_voxelActiveList.Add(i);
        }
    }

    // the index of active voxel
    index_voxelActive = index_voxelActiveList.ToArray();
    num_voxelActive = index_voxelActive.Length;

    // the number of vertices in each active voxel
    verts_voxelActive = new int[num_voxelActive];

    // the number of all vertices
    sumVerts = 0;

    if (num_voxelActive == 0)
    {
        // Leave the fields in the same state the old failure path produced.
        verts_scanIdx = new int[0];
        throw new Exception("No eligible isosurface can be extracted, please change isovalue.");
    }

    Parallel.For(0, num_voxelActive, i =>
    {
        verts_voxelActive[i] = voxelVerts[index_voxelActive[i]];
    });

    // Execute the scan used to find the indices of the result vertices.
    var op = new Func<int, int, int>((a, b) => { return (a + b); });
    // NOTE(review): the session is never released here; confirm whether Alea.Session needs disposing.
    Alea.Session session = new Alea.Session(gpu);

    int[] d_verts_voxelActive = null;
    int[] d_voxelVertsScan = null;
    try
    {
        d_verts_voxelActive = gpu.Allocate<int>(verts_voxelActive);
        d_voxelVertsScan = gpu.Allocate<int>(num_voxelActive);

        GpuExtension.Scan<int>(session, d_voxelVertsScan, d_verts_voxelActive, 0, op, 0);
        var result_Scan = Gpu.CopyToHost(d_voxelVertsScan);

        // Shift the scan result by one element so that entry i holds the sum of all
        // counts before voxel i; entry 0 keeps its default value 0.
        verts_scanIdx = new int[num_voxelActive];
        for (int i = 1; i < num_voxelActive; i++)
        {
            verts_scanIdx[i] = result_Scan[i - 1];
        }

        sumVerts = verts_scanIdx[num_voxelActive - 1] + verts_voxelActive[num_voxelActive - 1];
    }
    finally
    {
        foreach (var buffer in new Array[] { d_verts_voxelActive, d_voxelVertsScan })
        {
            if (buffer != null)
            {
                Gpu.Free(buffer);
            }
        }
    }
}
/// <summary>
/// Compares two equally long value arrays element by element on the GPU.
/// </summary>
/// <param name="source">Reference values.</param>
/// <param name="target">Values compared against <paramref name="source"/>.</param>
/// <param name="tolerance">Relative tolerance; a target value passes when it lies in
/// [(1 - tolerance) * source, (1 + tolerance) * source].</param>
/// <param name="ThreshholdTol">Cut-off: an element pair is only evaluated when both its source
/// and its target value are greater than this number.</param>
/// <returns>
/// Item1: number of evaluated elements whose target value is outside the tolerance band.
/// Item2: number of evaluated elements. Both are 0 for empty input.
/// </returns>
/// <exception cref="ArgumentNullException">An input array is null.</exception>
/// <exception cref="ArgumentException">The arrays differ in length.</exception>
/// <remarks>
/// NOTE(review): the previous version computed <c>source.Max() * ThreshholdTol</c>
/// ("MinDoseEvaluated") but never used it; the cut-off applied is <c>ThreshholdTol</c> itself.
/// Confirm that an absolute cut-off is intended. The unused maxima were removed.
/// </remarks>
public Tuple<int, int> CompareAbsoluteOpt(double[] source, double[] target, double tolerance, double ThreshholdTol)
{
    System.Diagnostics.Debug.WriteLine("starting an absolute comparison on GPU");

    if (source == null)
    {
        throw new ArgumentNullException("source");
    }

    if (target == null)
    {
        throw new ArgumentNullException("target");
    }

    if (source.Length != target.Length)
    {
        throw new ArgumentException("The source and target lengths need to match");
    }

    int dimension = source.Length;
    if (dimension == 0)
    {
        // Nothing to compare: nothing failed, nothing was counted.
        System.Diagnostics.Debug.WriteLine("finished an absolute comparison on GPU");
        return new Tuple<int, int>(0, 0);
    }

    double epsilon = ThreshholdTol;
    double zero = 0.0;
    double lowMultiplier = (1 - tolerance);
    double highMultiplier = (1 + tolerance);
    int failed = 0;
    int isCounted = 0;
    Gpu gpu = Gpu.Default;

    // Declared before the try so the finally block frees exactly what was allocated.
    double[] sourceOnGPU = null;
    double[] targetOnGPU = null;
    double[] isCountedArray = null;
    double[] sourceOnGPULow = null;
    double[] sourceOnGPUHigh = null;
    double[] isGTtol = null;
    try
    {
        sourceOnGPU = gpu.Allocate(source);
        targetOnGPU = gpu.Allocate(target);
        isCountedArray = gpu.Allocate<double>(dimension);
        sourceOnGPULow = gpu.Allocate<double>(dimension);
        sourceOnGPUHigh = gpu.Allocate<double>(dimension);
        isGTtol = gpu.Allocate<double>(dimension);

        // filter doses below threshold
        // TODO: should failure be -1?
        // After these four passes an element pair is non-zero only when both values exceed epsilon.
        gpu.For(0, dimension, i => sourceOnGPU[i] = (sourceOnGPU[i] > epsilon) ? sourceOnGPU[i] : zero);
        gpu.For(0, dimension, i => targetOnGPU[i] = (targetOnGPU[i] > epsilon) ? targetOnGPU[i] : zero);
        gpu.For(0, dimension, i => sourceOnGPU[i] = (targetOnGPU[i] > epsilon) ? sourceOnGPU[i] : zero);
        gpu.For(0, dimension, i => targetOnGPU[i] = (sourceOnGPU[i] > epsilon) ? targetOnGPU[i] : zero);

        // 1.0 for every element that survived the filter.
        gpu.For(0, dimension, i => isCountedArray[i] = (sourceOnGPU[i] > zero) ? 1.0 : zero);

        // Lower and upper bound of the tolerance band around each source value.
        gpu.For(0, dimension, i => sourceOnGPULow[i] = lowMultiplier * sourceOnGPU[i]);
        gpu.For(0, dimension, i => sourceOnGPUHigh[i] = highMultiplier * sourceOnGPU[i]);

        // Store 1 where the target value falls outside the tolerance band, 0 otherwise.
        gpu.For(0, isGTtol.Length, i => isGTtol[i] = (targetOnGPU[i] < sourceOnGPULow[i] || targetOnGPU[i] > sourceOnGPUHigh[i]) ? 1 : 0);

        isCounted = (int)gpu.Sum(isCountedArray);
        failed = (int)gpu.Sum(isGTtol);
    }
    finally
    {
        foreach (var buffer in new Array[] { sourceOnGPU, targetOnGPU, sourceOnGPULow, sourceOnGPUHigh, isCountedArray, isGTtol })
        {
            if (buffer != null)
            {
                Gpu.Free(buffer);
            }
        }
    }

    System.Diagnostics.Debug.WriteLine("finished an absolute comparison on GPU");
    //gpu.Dispose();
    return new Tuple<int, int>(failed, isCounted);
}
/// <summary>
/// Frees the buffer referenced by <c>m_DevInputSamples</c>, if there is one, and clears the field.
/// </summary>
private void DisposeDevInputSamples()
{
    if (m_DevInputSamples != null)
    {
        Gpu.Free(m_DevInputSamples);
        // Cleared so a second call does nothing.
        m_DevInputSamples = null;
    }
}
/// <summary>
/// Frees the buffer referenced by <c>m_DevOverdBs</c>, clears the field and resets
/// <c>m_SampleCount</c> to 0. Does nothing at all when the field is already null.
/// </summary>
private void DisposeGpuResources()
{
    if (m_DevOverdBs != null)
    {
        Gpu.Free(m_DevOverdBs);
        m_DevOverdBs = null;
        m_SampleCount = 0;
    }
}
/// <summary>
/// Fetches the input data, runs <c>Kernel</c> on the default GPU with explicitly managed
/// device buffers and copies the output back into a host array.
/// </summary>
public static void RunGpu()
{
    var count = GetData(out var x, out var y);
    var hostResult = new float[count];
    var device = Gpu.Default;

    // Device-side buffers: two inputs and one output, each `count` floats long.
    var deviceX = device.Allocate<float>(count);
    var deviceY = device.Allocate<float>(count);
    var deviceResult = device.Allocate<float>(count);

    // Host -> device.
    Gpu.Copy(x, deviceX);
    Gpu.Copy(y, deviceY);

    var launchParam = new LaunchParam(16, 256);
    device.Launch(Kernel, launchParam, deviceResult, deviceX, deviceY);

    // Device -> host.
    Gpu.Copy(deviceResult, hostResult);

    Gpu.Free(deviceX);
    Gpu.Free(deviceY);
    Gpu.Free(deviceResult);
}
/// <summary>
/// Computes the pairwise cosine distance matrix of <paramref name="dataset"/> on the GPU.
/// </summary>
/// <param name="gpu">Device used for allocation and execution.</param>
/// <param name="dataset">One vector per row.</param>
/// <returns>
/// A square matrix whose entry [i, j] is <c>max(0, 1 - dot(i, j) / sqrt(|i|^2 * |j|^2))</c>.
/// </returns>
/// <remarks>
/// The kernel now reads the explicitly uploaded copy (<c>gpuDataset</c>). Previously it read the
/// host array, so the uploaded copy was allocated and freed without ever being used. Both
/// device buffers are released even when the kernel or the download throws.
/// Row j is indexed up to the length of row i, as before.
/// </remarks>
private static double[,] CosineSimilarityGpu(Gpu gpu, double[][] dataset)
{
    int count = dataset.Length;
    int size = count * count;

    // Allocate directly on gpu.
    var gpuDataset = gpu.Allocate(dataset);
    var gpuDistances = gpu.Allocate<double>(count, count);
    try
    {
        gpu.For(0, size, index =>
        {
            // One thread per (i, j) pair of the result matrix.
            int i = index / count;
            int j = index % count;

            double dotProduct = 0;
            double magnitudeOne = 0;
            double magnitudeTwo = 0;
            for (int k = 0; k < gpuDataset[i].Length; k++)
            {
                dotProduct += (gpuDataset[i][k] * gpuDataset[j][k]);
                magnitudeOne += (gpuDataset[i][k] * gpuDataset[i][k]);
                magnitudeTwo += (gpuDataset[j][k] * gpuDataset[j][k]);
            }

            double distance = Math.Max(0, 1 - (dotProduct / Math.Sqrt(magnitudeOne * magnitudeTwo)));
            gpuDistances[i, j] = distance;
        });

        // Gpu -> Cpu.
        var result = new double[count, count];
        Gpu.Copy(gpuDistances, result);
        return result;
    }
    finally
    {
        // Release gpu memory.
        Gpu.Free(gpuDataset);
        Gpu.Free(gpuDistances);
    }
}
/// <summary>
/// Releases the GPU buffers <c>rneuronsArr</c>, <c>resultsArr</c> and <c>inputsArr</c>, in that order.
/// </summary>
protected virtual void Free()
{
    var buffers = new System.Array[] { rneuronsArr, resultsArr, inputsArr };
    foreach (var buffer in buffers)
    {
        Gpu.Free(buffer);
    }
}
/// <summary>
/// Releases the GPU buffers <c>GPUValues</c> and <c>GPUIndices</c>, in that order.
/// </summary>
public void Free()
{
    var buffers = new System.Array[] { GPUValues, GPUIndices };
    foreach (var buffer in buffers)
    {
        Gpu.Free(buffer);
    }
}
/// <summary>
/// Releases the GPU buffer referenced by <c>GPUArray</c>.
/// </summary>
public void Free()
{
    var buffer = GPUArray;
    Gpu.Free(buffer);
}
/// <summary>
/// Solves a batch of problems on the GPU, spreading them over several streams.
/// </summary>
/// <param name="problemsToSolve">Problems to process; the problem size is taken from the first element.</param>
/// <param name="streamCount">Requested number of streams; lowered to the number of non-empty partitions.</param>
/// <param name="asyncAction">Optional callback invoked on the host after all kernels were queued and
/// before the streams are synchronized.</param>
/// <param name="warpCount">Warps per block; clamped to MaxThreadsPerBlock / WarpSize of the device.
/// Cannot be more warps since more memory should be allocated.</param>
/// <returns>One result per problem, in input order; an empty array for empty input.</returns>
/// <exception cref="ArgumentNullException"><paramref name="problemsToSolve"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="streamCount"/> is less than 1
/// (previously a division by zero for 0).</exception>
/// <exception cref="Exception">A synchronizable problem reported a shortest word longer than (n - 1)^2.</exception>
/// <remarks>
/// Device buffers and streams are released in a <c>finally</c> block, so they no longer leak
/// when a copy, a launch or the result check throws.
/// </remarks>
public ComputationResult[] Compute(
    Problem[] problemsToSolve,
    int streamCount,
    Action asyncAction = null,
    int warpCount = 2)
{
#if (benchmark)
    var totalTiming = new Stopwatch();
    totalTiming.Start();
    var benchmarkTiming = new Stopwatch();
#endif
    if (problemsToSolve == null)
    {
        throw new ArgumentNullException(nameof(problemsToSolve));
    }

    if (streamCount < 1)
    {
        throw new ArgumentOutOfRangeException(nameof(streamCount), "At least one stream is required.");
    }

    if (problemsToSolve.Length == 0)
    {
        return new ComputationResult[0];
    }

    var gpu = Gpu.Default;
    var n = problemsToSolve[0].size;
    var power = 1 << n;
    var maximumPermissibleWordLength = (n - 1) * (n - 1);

    // in order to atomically add to a checking array (designed for queue consistency) one int is used by four threads,
    // so in a reeeeaally pessimistic case 255 is the maximum number of threads (everyone go to the same vertex)
    var maximumWarps = gpu.Device.Attributes.MaxThreadsPerBlock / gpu.Device.Attributes.WarpSize;
    if (warpCount > maximumWarps)
    {
        warpCount = maximumWarps;
    }

    // Split the input into consecutive chunks, one per stream; empty chunks are dropped.
    var problemsPerStream = (problemsToSolve.Length + streamCount - 1) / streamCount;
    var problemsPartitioned = Enumerable.Range(0, streamCount)
        .Select(i => problemsToSolve.Skip(problemsPerStream * i)
            .Take(problemsPerStream)
            .ToArray())
        .Where(partition => partition.Length > 0)
        .ToArray();
    streamCount = problemsPartitioned.Length;

    var streams = Enumerable.Range(0, streamCount)
        .Select(_ => gpu.CreateStream()).ToArray();

    // Declared before the try so the finally block frees exactly what was allocated.
    int[][] gpuA = null;
    int[][] gpuB = null;
    int[][] shortestSynchronizingWordLength = null;
    bool[][] isSynchronizable = null;
    ComputationResult[] results;
    try
    {
        gpuA = problemsPartitioned.Select(problems => gpu.Allocate<int>(problems.Length * n)).ToArray();
        gpuB = problemsPartitioned.Select(problems => gpu.Allocate<int>(problems.Length * n)).ToArray();
        shortestSynchronizingWordLength = problemsPartitioned.Select(problems => gpu.Allocate<int>(problems.Length)).ToArray();
        isSynchronizable = problemsPartitioned.Select(problems => gpu.Allocate<bool>(problems.Length)).ToArray();

        gpu.Copy(n, problemSize);

        var queueUpperBound = power / 2 + 1;
        var launchParameters = new LaunchParam(
            new dim3(1, 1, 1),
            new dim3(gpu.Device.Attributes.WarpSize * warpCount, 1, 1),
            warpCount * (
                sizeof(ushort) * queueUpperBound
                + sizeof(ushort) * n * 2
                + sizeof(byte) * power
            )
        );

        // Host-side destinations for the per-stream results.
        var gpuResultsIsSynchronizable = problemsPartitioned
            .Select(problems => new bool[problems.Length])
            .ToArray();
        var gpuResultsShortestSynchronizingWordLength = problemsPartitioned
            .Select(problems => new int[problems.Length])
            .ToArray();

        for (int stream = 0; stream < streamCount; stream++)
        {
            var problems = problemsPartitioned[stream];

            // Flatten both transition matrices of every problem into one array, n entries per problem.
            var matrixA = new int[problems.Length * n];
            var matrixB = new int[problems.Length * n];
            Parallel.For(0, problems.Length, problem =>
            {
                Array.ConstrainedCopy(problems[problem].stateTransitioningMatrixA, 0, matrixA, problem * n, n);
                Array.ConstrainedCopy(problems[problem].stateTransitioningMatrixB, 0, matrixB, problem * n, n);
            });

            streams[stream].Copy(matrixA, gpuA[stream]);
            streams[stream].Copy(matrixB, gpuB[stream]);

            streams[stream].Launch(
                Kernel,
                launchParameters,
                gpuA[stream],
                gpuB[stream],
                isSynchronizable[stream],
                shortestSynchronizingWordLength[stream]
            );
        }

        asyncAction?.Invoke();

        for (int stream = 0; stream < streamCount; stream++)
        {
#if (benchmark)
            benchmarkTiming.Start();
#endif
            streams[stream].Synchronize();
#if (benchmark)
            benchmarkTiming.Stop();
#endif
            streams[stream].Copy(isSynchronizable[stream], gpuResultsIsSynchronizable[stream]);
            streams[stream].Copy(shortestSynchronizingWordLength[stream], gpuResultsShortestSynchronizingWordLength[stream]);
        }

        gpu.Synchronize();

        results = Enumerable.Range(0, streamCount).SelectMany(i =>
            gpuResultsIsSynchronizable[i].Zip(gpuResultsShortestSynchronizingWordLength[i], (isSyncable, shortestWordLength) =>
                new ComputationResult()
                {
                    computationType = ComputationType.GPU,
                    size = n,
                    isSynchronizable = isSyncable,
                    shortestSynchronizingWordLength = shortestWordLength,
                    algorithmName = GetType().Name
                }
            ).ToArray()
        ).ToArray();
    }
    finally
    {
        foreach (var buffers in new Array[][] { gpuA, gpuB, shortestSynchronizingWordLength, isSynchronizable })
        {
            if (buffers == null)
            {
                continue;
            }

            foreach (var array in buffers)
            {
                if (array != null)
                {
                    Gpu.Free(array);
                }
            }
        }

        foreach (var stream in streams)
        {
            stream.Dispose();
        }
    }

    if (results.Any(result => result.isSynchronizable && result.shortestSynchronizingWordLength > maximumPermissibleWordLength))
    {
        throw new Exception("Cerny conjecture is false");
    }

#if (benchmark)
    results[0].benchmarkResult = new BenchmarkResult
    {
        benchmarkedTime = benchmarkTiming.Elapsed,
        totalTime = totalTiming.Elapsed
    };
#endif
    return results;
}
/// <summary>
/// Renders a 10240 x 10240 Mandelbrot image on the GPU, prints the kernel time and saves the
/// picture as <c>b.png</c>.
/// </summary>
/// <remarks>
/// The row index of a pixel is now derived from the image width (pixels per row). It used to be
/// derived from the height, which is only correct while the image is square. The bitmap, the
/// pinned host buffer and the device buffer are released even when rendering or saving fails.
/// </remarks>
private static void DoStuff()
{
    const float xs = -2.1F;
    const float ys = -1.3F;
    const int zoom = 1;
    const int width = 10240;
    const int height = 10240;
    // Opaque black: all colour bytes 0, top byte 255.
    const int blackpixel = (byte)0 | ((byte)0 << 8) | ((byte)0 << 16) | ((byte)255 << 24);

    var gpubits = gpu.Allocate<int>(width * height);
    try
    {
        Console.WriteLine("Started...");
        var watch = new Stopwatch();
        watch.Start();

        // Size of one pixel in the complex plane.
        const float dx = 2.6F / zoom / width;
        const float dy = 2.6F / zoom / height;

        gpu.For(0, width * height, (index) =>
        {
            // Row-major layout: `width` pixels per row.
            var y = index / width;
            var x = index - (y * width);

            var cr = xs + (x * dx);
            var ci = ys + (y * dy);
            var zr = 0F;
            var zi = 0F;
            float zrSquare;
            float ziSquare;

            // Iterate z = z^2 + c until it escapes (|z|^2 > 4) or 255 iterations were done.
            byte i = 0;
            while (i < 255)
            {
                zrSquare = zr * zr;
                ziSquare = zi * zi;
                zi = (2 * zi * zr) + ci;
                zr = zrSquare - ziSquare + cr;
                i++;
                if (zrSquare + ziSquare > 4)
                {
                    break;
                }
            }

            if (i == 255)
            {
                gpubits[index] = blackpixel;
            }
            else
            {
                // Colour derived from the iteration count, top byte 255.
                gpubits[index] = (byte)(i * 25 % 256) | ((byte)(i * 3 % 256) << 8) | (((byte)(i % 256)) << 16) | ((byte)255 << 24);
            }
        });

        watch.Stop();
        Console.WriteLine($"Elapsed microseconds: {((double)watch.ElapsedTicks) / Stopwatch.Frequency * 1000000}");

        var bits = Gpu.CopyToHost(gpubits);

        // The bitmap wraps the managed array directly, so it must stay pinned while the bitmap lives.
        var bitsHandle = GCHandle.Alloc(bits, GCHandleType.Pinned);
        try
        {
            using (var bitmap = new Bitmap(width, height, width * 4, PixelFormat.Format32bppPArgb, bitsHandle.AddrOfPinnedObject()))
            {
                bitmap.Save("b.png", ImageFormat.Png);
            }
        }
        finally
        {
            bitsHandle.Free();
        }
    }
    finally
    {
        Gpu.Free(gpubits);
    }
}
// Extracts the isosurface points using the GPU.
// For every active voxel the kernel evaluates the field at the eight cube corners, builds the
// corner flag, computes the intersection point on every edge selected by edgeTable[flag] and
// writes the triangle corners listed in triangleTable to the result buffer, starting at
// verts_scanIdx[i].
// The kernel also reads constGridSize, constVoxelSize and constFusion, which are not uploaded
// in this method.
// Returns the extracted vertices converted to Point3f; they are also stored in resultVerts.
// All device buffers, including d_cubeValues (which was never freed before), are released in a
// finally block.
public List<Point3f> runExtractIsoSurfaceGPU()
{
    var gpu = Gpu.Default;

    // Scratch for the (up to) 12 edge points of every active voxel.
    float3[] pts = new float3[12 * num_voxelActive];

    // Declared before the try so the finally block frees exactly what was allocated.
    // output arguments
    float3[] d_pts = null;
    float3[] d_resultV = null;
    float[] d_cubeValues = null;
    // input arguments
    float3[] d_samplePts = null;
    int[] d_verts_scanIdx = null;
    int[] d_index_voxelActive = null;
    try
    {
        d_pts = gpu.Allocate<float3>(pts);
        d_resultV = gpu.Allocate<float3>(sumVerts);
        d_cubeValues = gpu.Allocate<float>(8 * num_voxelActive);

        d_samplePts = gpu.Allocate<float3>(samplePts);
        d_verts_scanIdx = gpu.Allocate<int>(verts_scanIdx);
        d_index_voxelActive = gpu.Allocate<int>(index_voxelActive);

        // const values
        gpu.Copy(Vertices, constVertices);
        gpu.Copy(EdgeDirection, constEdgeDirection);
        gpu.Copy(EdgeConnection, constEdgeConnection);
        gpu.Copy(Tables.EdgeTable, edgeTable);
        gpu.Copy(Tables.TriangleTable_GPU, triangleTable);
        float3 baseP = new float3((float)basePoint.X, (float)basePoint.Y, (float)basePoint.Z);
        gpu.Copy(baseP, constBasePoint);
        gpu.Copy(isoValue, constIsovalue);
        gpu.Copy(scale, constScale);

        gpu.For(0, num_voxelActive, i =>
        {
            // Compute the position in the grid.
            int3 gridPos = calcGridPos(d_index_voxelActive[i], constGridSize.Value);
            float3 p = new float3();
            p.x = constBasePoint.Value.x + gridPos.x * constVoxelSize.Value.x;
            p.y = constBasePoint.Value.y + gridPos.y * constVoxelSize.Value.y;
            p.z = constBasePoint.Value.z + gridPos.z * constVoxelSize.Value.z;

            // Output all eight corner vertices of the voxel.
            float3 a0 = p;
            float3 a1 = CreateFloat3(constVoxelSize.Value.x + p.x, 0 + p.y, 0 + p.z);
            float3 a2 = CreateFloat3(constVoxelSize.Value.x + p.x, constVoxelSize.Value.y + p.y, 0 + p.z);
            float3 a3 = CreateFloat3(0 + p.x, constVoxelSize.Value.y + p.y, 0 + p.z);
            float3 a4 = CreateFloat3(0 + p.x, 0 + p.y, constVoxelSize.Value.z + p.z);
            float3 a5 = CreateFloat3(constVoxelSize.Value.x + p.x, 0 + p.y, constVoxelSize.Value.z + p.z);
            float3 a6 = CreateFloat3(constVoxelSize.Value.x + p.x, constVoxelSize.Value.y + p.y, constVoxelSize.Value.z + p.z);
            float3 a7 = CreateFloat3(0 + p.x, constVoxelSize.Value.y + p.y, constVoxelSize.Value.z + p.z);

            // Sum of the squared voxel edge lengths, scaled by the fusion value.
            float distance = constVoxelSize.Value.x * constVoxelSize.Value.x + constVoxelSize.Value.y * constVoxelSize.Value.y + constVoxelSize.Value.z * constVoxelSize.Value.z;
            float radius = distance * constFusion.Value;

            // Compute cubeValues of 8 vertices
            d_cubeValues[i * 8] = ComputeValue(d_samplePts, a0, d_samplePts.Length, constFusion.Value, radius);
            d_cubeValues[i * 8 + 1] = ComputeValue(d_samplePts, a1, d_samplePts.Length, constFusion.Value, radius);
            d_cubeValues[i * 8 + 2] = ComputeValue(d_samplePts, a2, d_samplePts.Length, constFusion.Value, radius);
            d_cubeValues[i * 8 + 3] = ComputeValue(d_samplePts, a3, d_samplePts.Length, constFusion.Value, radius);
            d_cubeValues[i * 8 + 4] = ComputeValue(d_samplePts, a4, d_samplePts.Length, constFusion.Value, radius);
            d_cubeValues[i * 8 + 5] = ComputeValue(d_samplePts, a5, d_samplePts.Length, constFusion.Value, radius);
            d_cubeValues[i * 8 + 6] = ComputeValue(d_samplePts, a6, d_samplePts.Length, constFusion.Value, radius);
            d_cubeValues[i * 8 + 7] = ComputeValue(d_samplePts, a7, d_samplePts.Length, constFusion.Value, radius);

            // Check each vertex state; corner k contributes bit k of the flag.
            int flag = Compact(d_cubeValues[i * 8], constIsovalue.Value);
            flag += Compact(d_cubeValues[i * 8 + 1], constIsovalue.Value) * 2;
            flag += Compact(d_cubeValues[i * 8 + 2], constIsovalue.Value) * 4;
            flag += Compact(d_cubeValues[i * 8 + 3], constIsovalue.Value) * 8;
            flag += Compact(d_cubeValues[i * 8 + 4], constIsovalue.Value) * 16;
            flag += Compact(d_cubeValues[i * 8 + 5], constIsovalue.Value) * 32;
            flag += Compact(d_cubeValues[i * 8 + 6], constIsovalue.Value) * 64;
            flag += Compact(d_cubeValues[i * 8 + 7], constIsovalue.Value) * 128;

            // find out which edge intersects the isosurface
            int EdgeFlag = edgeTable[flag];

            // check whether this voxel is crossed by the isosurface
            for (int j = 0; j < 12; j++)
            {
                // check whether an edge have a point
                if ((EdgeFlag & (1 << j)) != 0)
                {
                    // compute t values from two end points on each edge
                    float Offset = GetOffset(d_cubeValues[i * 8 + constEdgeConnection[j * 2 + 0]], d_cubeValues[i * 8 + constEdgeConnection[j * 2 + 1]], constIsovalue.Value);
                    float3 pt = new float3();

                    // get positions
                    pt.x = constBasePoint.Value.x + (gridPos.x + constVertices[constEdgeConnection[j * 2 + 0] * 3 + 0] + Offset * constEdgeDirection[j * 3 + 0]) * constScale.Value;
                    pt.y = constBasePoint.Value.y + (gridPos.y + constVertices[constEdgeConnection[j * 2 + 0] * 3 + 1] + Offset * constEdgeDirection[j * 3 + 1]) * constScale.Value;
                    pt.z = constBasePoint.Value.z + (gridPos.z + constVertices[constEdgeConnection[j * 2 + 0] * 3 + 2] + Offset * constEdgeDirection[j * 3 + 2]) * constScale.Value;
                    d_pts[12 * i + j] = pt;
                }
            }

            int num = 0;

            // Find out points from each triangle (at most five; a negative entry ends the list).
            for (int Triangle = 0; Triangle < 5; Triangle++)
            {
                if (triangleTable[flag * 16 + 3 * Triangle] < 0)
                {
                    break;
                }

                for (int Corner = 0; Corner < 3; Corner++)
                {
                    int Vertex = triangleTable[flag * 16 + 3 * Triangle + Corner];
                    float3 pd = CreateFloat3(d_pts[12 * i + Vertex].x, d_pts[12 * i + Vertex].y, d_pts[12 * i + Vertex].z);
                    d_resultV[d_verts_scanIdx[i] + num] = pd;
                    num++;
                }
            }
        });

        resultVerts = Gpu.CopyToHost(d_resultV);
    }
    finally
    {
        foreach (var buffer in new System.Array[] { d_resultV, d_pts, d_cubeValues, d_samplePts, d_verts_scanIdx, d_index_voxelActive })
        {
            if (buffer != null)
            {
                Gpu.Free(buffer);
            }
        }
    }

    return ConvertFloat3ToPoint3f(resultVerts);
}
/// <summary>
/// Runs the GPU kernel over a slice of <paramref name="problemsToSolve"/> to pre-fill the
/// <c>isSynchronizable</c> flag of every result, then hands the same slice to
/// <c>SlimCPUSkipper</c> and returns what its <c>Compute</c> returns.
/// </summary>
/// <param name="problemsToSolve">Source array of problems.</param>
/// <param name="problemsReadingIndex">Index of the first problem of the slice.</param>
/// <param name="computationResults">Destination array; when null, a new array of
/// <paramref name="problemCount"/> elements is used and <paramref name="resultsWritingIndex"/> is treated as 0.</param>
/// <param name="resultsWritingIndex">Index of the first result slot to write.</param>
/// <param name="problemCount">Number of problems in the slice; must be positive.</param>
/// <param name="streamCount">Requested number of streams; lowered so that every stream gets at least one problem.</param>
/// <param name="asyncAction">Optional callback invoked on the host after all kernels were queued.</param>
/// <returns>The value returned by <c>SlimCPUSkipper.Compute</c>.</returns>
/// <remarks>
/// Fixes over the previous version: the stream count is recomputed from the per-stream chunk
/// size, so trailing streams can no longer end up with a zero or negative problem count (for
/// example 5 problems on 4 streams); device buffers and streams are released in a
/// <c>finally</c> block; unused locals were removed.
/// NOTE(review): results are read right after <c>stream.Copy</c> without a further
/// synchronization in non-benchmark builds; confirm that the copy is complete at that point.
/// </remarks>
public int ComputeAction(
    Problem[] problemsToSolve,
    int problemsReadingIndex,
    ComputationResult[] computationResults,
    int resultsWritingIndex,
    int problemCount,
    int streamCount,
    Action asyncAction = null)
// cannot be more warps since more memory should be allocated
{
#if (benchmark)
    var totalTiming = new Stopwatch();
    totalTiming.Start();
    var benchmarkTiming = new Stopwatch();
#endif
    var gpu = Gpu.Default;
    var n = problemsToSolve[problemsReadingIndex].size;

    // in order to atomically add to a checking array (designed for queue consistency) one int is used by four threads,
    // so in a reeeeaally pessimistic case 255 is the maximum number of threads (everyone go to the same vertex)
    // -1 for the already discovered vertex
    // at least 2*n+1 threads i.e. 27
    // Equivalent to starting from MaxThreadsPerBlock and clamping to min(MaxThreadsPerBlock, 64).
    var threads = Math.Min(gpu.Device.Attributes.MaxThreadsPerBlock, 32 * 2);

    if (problemCount < streamCount)
    {
        streamCount = problemCount;
    }

    var problemsPerStream = (problemCount + streamCount - 1) / streamCount;
    // Use only as many streams as there are non-empty chunks of that size.
    streamCount = (problemCount + problemsPerStream - 1) / problemsPerStream;

    var streams = Enumerable.Range(0, streamCount)
        .Select(_ => gpu.CreateStream()).ToArray();

    var gpuResultsIsSynchronizable = new bool[streamCount][];
    var gpuAs = new int[streamCount][];
    var gpuBs = new int[streamCount][];
    var isSynchronizable = new bool[streamCount][];
    try
    {
        gpu.Copy(n, problemSize);

        var launchParameters = new LaunchParam(
            new dim3(1 << 10, 1, 1),
            new dim3(threads, 1, 1),
            2 * n * sizeof(ushort)
        );

        for (int stream = 0; stream < streamCount; stream++)
        {
            var offset = stream * problemsPerStream;
            var localProblemsCount = Math.Min(problemsPerStream, problemCount - offset);

            gpuAs[stream] = gpu.Allocate<int>(localProblemsCount * n);
            gpuBs[stream] = gpu.Allocate<int>(localProblemsCount * n);
            isSynchronizable[stream] = gpu.Allocate<bool>(localProblemsCount);

            // Flatten both transition matrices of every problem into one array, n entries per problem.
            var matrixA = new int[localProblemsCount * n];
            var matrixB = new int[localProblemsCount * n];
            Parallel.For(0, localProblemsCount, problem =>
            {
                Array.ConstrainedCopy(
                    problemsToSolve[problemsReadingIndex + offset + problem].stateTransitioningMatrixA,
                    0,
                    matrixA,
                    problem * n,
                    n
                );
                Array.ConstrainedCopy(
                    problemsToSolve[problemsReadingIndex + offset + problem].stateTransitioningMatrixB,
                    0,
                    matrixB,
                    problem * n,
                    n
                );
            });

            streams[stream].Copy(matrixA, gpuAs[stream]);
            streams[stream].Copy(matrixB, gpuBs[stream]);

            gpuResultsIsSynchronizable[stream] = new bool[localProblemsCount];

            streams[stream].Launch(
                Kernel,
                launchParameters,
                gpuAs[stream],
                gpuBs[stream],
                isSynchronizable[stream]
            );
        }

        asyncAction?.Invoke();

        for (int streamId = 0; streamId < streamCount; streamId++)
        {
            var stream = streams[streamId];
#if (benchmark)
            benchmarkTiming.Start();
#endif
            stream.Synchronize();
#if (benchmark)
            benchmarkTiming.Stop();
#endif
            stream.Copy(isSynchronizable[streamId], gpuResultsIsSynchronizable[streamId]);

            var offset = streamId * problemsPerStream;
            var localProblemsCount = Math.Min(problemsPerStream, problemCount - offset);

            if (computationResults == null)
            {
                resultsWritingIndex = 0;
                computationResults = new ComputationResult[resultsWritingIndex + problemCount];
            }

            for (int i = 0; i < localProblemsCount; i++)
            {
                computationResults[resultsWritingIndex + offset + i].isSynchronizable = gpuResultsIsSynchronizable[streamId][i];
            }
        }
#if (benchmark)
        gpu.Synchronize();
#endif
    }
    finally
    {
        // Slots of streams that were never reached are still null.
        foreach (var arrays in new Array[][] { gpuAs, gpuBs, isSynchronizable })
        {
            foreach (var array in arrays)
            {
                if (array != null)
                {
                    Gpu.Free(array);
                }
            }
        }

        foreach (var stream in streams)
        {
            stream.Dispose();
        }
    }

    var cpu = new SlimCPUSkipper();
    var result = cpu.Compute(problemsToSolve, problemsReadingIndex, computationResults, resultsWritingIndex, problemCount, cpu.GetBestParallelism());

#if (benchmark)
    computationResults[resultsWritingIndex].benchmarkResult = new BenchmarkResult
    {
        benchmarkedTime = benchmarkTiming.Elapsed,
        totalTime = totalTiming.Elapsed
    };
#endif
    return result;
}
/// <summary>
/// Solves a slice of <paramref name="problemsToSolve"/> on the GPU, spread over several
/// streams, and writes one <c>ComputationResult</c> per problem.
/// </summary>
/// <param name="problemsToSolve">Source array of problems.</param>
/// <param name="problemsReadingIndex">Index of the first problem of the slice.</param>
/// <param name="computationResults">Destination array; results are not stored when it is null.</param>
/// <param name="resultsWritingIndex">Index of the first result slot to write.</param>
/// <param name="problemCount">Number of problems in the slice; must be positive.</param>
/// <param name="streamCount">Requested number of streams; lowered so that every stream gets at least one problem.</param>
/// <param name="asyncAction">Optional callback invoked on the host after all kernels were queued.</param>
/// <returns>
/// -1, or the result index of a problem that is synchronizable with a shortest word longer
/// than (n - 1)^2. Streams are checked in parallel, so with several such problems any one of
/// their indices may be returned.
/// </returns>
/// <exception cref="Exception">The thread-count constraints cannot be satisfied for this problem size.</exception>
/// <remarks>
/// Fixes over the previous version: the stream count is recomputed from the per-stream chunk
/// size, so trailing streams can no longer end up with a zero or negative problem count; device
/// buffers and streams are released in a <c>finally</c> block; the statement terminator after
/// the result initializer sits inside the <c>if</c> block again.
/// NOTE(review): in non-benchmark builds the results are read after <c>stream.Copy</c> without
/// a further synchronization; confirm that the copy is complete at that point.
/// </remarks>
public int ComputeManyWithAction(
    Problem[] problemsToSolve,
    int problemsReadingIndex,
    ComputationResult[] computationResults,
    int resultsWritingIndex,
    int problemCount,
    int streamCount,
    Action asyncAction = null)
// cannot be more warps since more memory should be allocated
{
#if (benchmark)
    var totalTiming = new Stopwatch();
    totalTiming.Start();
    var benchmarkTiming = new Stopwatch();
#endif
    var CernyConjectureFailingIndex = -1;
    var gpu = Gpu.Default;
    var n = problemsToSolve[problemsReadingIndex].size;
    var power = 1 << n;
    const int bitSize = 6;
    var maximumPermissibleWordLength = (n - 1) * (n - 1);

    // in order to atomically add to a checking array (designed for queue consistency) one int is used by four threads,
    // so in a reeeeaally pessimistic case 255 is the maximum number of threads (everyone go to the same vertex)
    // -1 for the already discovered vertex, so its -2 in total for both reasons
    // at least 2*n+1 threads i.e. 27
    var threads = gpu.Device.Attributes.MaxThreadsPerBlock;
    var maximumThreads = Math.Min(
        gpu.Device.Attributes.MaxThreadsPerBlock,
        ((1 << bitSize) - 1) - 1
    );
    var minimumThreads = n + 1;
    if (threads > maximumThreads)
    {
        threads = maximumThreads;
    }

    if (threads < minimumThreads)
    {
        threads = minimumThreads;
    }

    if (threads > maximumThreads)
    {
        throw new Exception("Impossible to satisfy");
    }

    if (problemCount < streamCount)
    {
        streamCount = problemCount;
    }

    var problemsPerStream = (problemCount + streamCount - 1) / streamCount;
    // Use only as many streams as there are non-empty chunks of that size.
    streamCount = (problemCount + problemsPerStream - 1) / problemsPerStream;

    var streams = Enumerable.Range(0, streamCount)
        .Select(_ => gpu.CreateStream()).ToArray();

    var gpuResultsIsSynchronizable = new bool[streamCount][];
    var gpuResultsShortestSynchronizingWordLength = new int[streamCount][];
    var gpuAs = new int[streamCount][];
    var gpuBs = new int[streamCount][];
    var shortestSynchronizingWordLength = new int[streamCount][];
    var isSynchronizable = new bool[streamCount][];
    try
    {
        gpu.Copy(n, problemSize);

        var isDiscoveredComplexOffset = (power * sizeof(int) + 8 * sizeof(int) / bitSize - 1) / (8 * sizeof(int) / bitSize);

        var launchParameters = new LaunchParam(
            new dim3(1 << 9, 1, 1),
            new dim3(threads, 1, 1),
            sizeof(int) * 3
            + isDiscoveredComplexOffset + (((isDiscoveredComplexOffset % sizeof(int)) & 1) == 1 ? 1 : 0)
            + (power / 2 + 1) * sizeof(ushort) * 2
            + (n + 1) * sizeof(uint)
            + sizeof(bool)
        );

        for (int stream = 0; stream < streamCount; stream++)
        {
            var offset = stream * problemsPerStream;
            var localProblemsCount = Math.Min(problemsPerStream, problemCount - offset);

            gpuAs[stream] = gpu.Allocate<int>(localProblemsCount * n);
            gpuBs[stream] = gpu.Allocate<int>(localProblemsCount * n);
            shortestSynchronizingWordLength[stream] = gpu.Allocate<int>(localProblemsCount);
            isSynchronizable[stream] = gpu.Allocate<bool>(localProblemsCount);

            // Flatten both transition matrices of every problem into one array, n entries per problem.
            var matrixA = new int[localProblemsCount * n];
            var matrixB = new int[localProblemsCount * n];
            Parallel.For(0, localProblemsCount, problem =>
            {
                Array.ConstrainedCopy(
                    problemsToSolve[problemsReadingIndex + offset + problem].stateTransitioningMatrixA,
                    0,
                    matrixA,
                    problem * n,
                    n
                );
                Array.ConstrainedCopy(
                    problemsToSolve[problemsReadingIndex + offset + problem].stateTransitioningMatrixB,
                    0,
                    matrixB,
                    problem * n,
                    n
                );
            });

            streams[stream].Copy(matrixA, gpuAs[stream]);
            streams[stream].Copy(matrixB, gpuBs[stream]);

            gpuResultsIsSynchronizable[stream] = new bool[localProblemsCount];
            gpuResultsShortestSynchronizingWordLength[stream] = new int[localProblemsCount];

            streams[stream].Launch(
                Kernel,
                launchParameters,
                gpuAs[stream],
                gpuBs[stream],
                isSynchronizable[stream],
                shortestSynchronizingWordLength[stream]
            );
        }

        asyncAction?.Invoke();

        for (int streamId = 0; streamId < streamCount; streamId++)
        {
            var stream = streams[streamId];
#if (benchmark)
            benchmarkTiming.Start();
#endif
            stream.Synchronize();
#if (benchmark)
            benchmarkTiming.Stop();
#endif
            stream.Copy(isSynchronizable[streamId], gpuResultsIsSynchronizable[streamId]);
            stream.Copy(shortestSynchronizingWordLength[streamId], gpuResultsShortestSynchronizingWordLength[streamId]);
        }
#if (benchmark)
        gpu.Synchronize();
#endif

        Parallel.For(0, streamCount, stream =>
        {
            var offset = stream * problemsPerStream;
            var localProblemsCount = Math.Min(problemsPerStream, problemCount - offset);

            for (int i = 0; i < localProblemsCount; i++)
            {
                if (computationResults != null)
                {
                    computationResults[resultsWritingIndex + offset + i] = new ComputationResult()
                    {
                        computationType = ComputationType.GPU,
                        size = problemsToSolve[problemsReadingIndex].size,
                        isSynchronizable = gpuResultsIsSynchronizable[stream][i],
                        shortestSynchronizingWordLength = gpuResultsShortestSynchronizingWordLength[stream][i],
                        algorithmName = GetType().Name
                    };
                }

                if (gpuResultsIsSynchronizable[stream][i] && gpuResultsShortestSynchronizingWordLength[stream][i] > maximumPermissibleWordLength)
                {
                    // Only leaves this stream's loop; the other streams keep going.
                    CernyConjectureFailingIndex = resultsWritingIndex + offset + i;
                    break;
                }
            }
        });
    }
    finally
    {
        // Slots of streams that were never reached are still null.
        foreach (var arrays in new Array[][] { gpuAs, gpuBs, isSynchronizable, shortestSynchronizingWordLength })
        {
            foreach (var array in arrays)
            {
                if (array != null)
                {
                    Gpu.Free(array);
                }
            }
        }

        Gpu.FreeAllImplicitMemory();

        foreach (var stream in streams)
        {
            stream.Dispose();
        }
    }

#if (benchmark)
    computationResults[resultsWritingIndex].benchmarkResult = new BenchmarkResult
    {
        benchmarkedTime = benchmarkTiming.Elapsed,
        totalTime = totalTiming.Elapsed
    };
#endif
    return CernyConjectureFailingIndex;
}
/// <summary>
/// Two-phase solver: the GPU kernel marks the problems it can prove synchronizable, and every
/// remaining problem is handed to <c>SlimCPU</c>.
/// </summary>
/// <param name="problemsToSolve">Problems to process; the problem size is taken from the first element.</param>
/// <param name="streamCount">Requested number of streams; lowered to the number of non-empty partitions.</param>
/// <param name="asyncAction">Optional callback invoked on the host after all kernels were queued and
/// before the streams are synchronized.</param>
/// <param name="warpCount">Warps per block; clamped to MaxThreadsPerBlock / WarpSize of the device.
/// Cannot be more warps since more memory should be allocated.</param>
/// <returns>One result per problem, in input order; an empty array for empty input.</returns>
/// <exception cref="ArgumentNullException"><paramref name="problemsToSolve"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="streamCount"/> is less than 1
/// (previously a division by zero for 0).</exception>
/// <exception cref="Exception">A CPU result is synchronizable with a shortest word longer than (n - 1)^2.</exception>
/// <remarks>
/// Device buffers and streams are now released as soon as the GPU phase ends, in a
/// <c>finally</c> block. Previously they were held during the whole CPU phase and leaked when
/// the "Cerny conjecture is false" exception (or any other) was thrown.
/// </remarks>
public ComputationResult[] Compute(
    Problem[] problemsToSolve,
    int streamCount,
    Action asyncAction = null,
    int warpCount = 16)
{
#if (benchmark)
    var totalTiming = new Stopwatch();
    totalTiming.Start();
    var benchmarkTiming = new Stopwatch();
#endif
    if (problemsToSolve == null)
    {
        throw new ArgumentNullException(nameof(problemsToSolve));
    }

    if (streamCount < 1)
    {
        throw new ArgumentOutOfRangeException(nameof(streamCount), "At least one stream is required.");
    }

    if (problemsToSolve.Length == 0)
    {
        return new ComputationResult[0];
    }

    var gpu = Gpu.Default;
    var n = problemsToSolve[0].size;
    var maximumPermissibleWordLength = (n - 1) * (n - 1);

    var maximumWarps = gpu.Device.Attributes.MaxThreadsPerBlock / gpu.Device.Attributes.WarpSize;
    if (warpCount > maximumWarps)
    {
        warpCount = maximumWarps;
    }

    // Split the input into consecutive chunks, one per stream; empty chunks are dropped.
    var problemsPerStream = (problemsToSolve.Length + streamCount - 1) / streamCount;
    var problemsPartitioned = Enumerable.Range(0, streamCount)
        .Select(i => problemsToSolve.Skip(problemsPerStream * i)
            .Take(problemsPerStream)
            .ToArray())
        .Where(partition => partition.Length > 0)
        .ToArray();
    streamCount = problemsPartitioned.Length;

    // Host-side destinations for the per-stream GPU verdicts.
    var gpuResultsIsSynchronizable = problemsPartitioned
        .Select(problems => new bool[problems.Length])
        .ToArray();

    var streams = Enumerable.Range(0, streamCount)
        .Select(_ => gpu.CreateStream()).ToArray();

    // Declared before the try so the finally block frees exactly what was allocated.
    int[][] gpuA = null;
    int[][] gpuB = null;
    bool[][] isSynchronizable = null;
    try
    {
        gpuA = problemsPartitioned.Select(problems => gpu.Allocate<int>(problems.Length * n)).ToArray();
        gpuB = problemsPartitioned.Select(problems => gpu.Allocate<int>(problems.Length * n)).ToArray();
        isSynchronizable = problemsPartitioned.Select(problems => gpu.Allocate<bool>(problems.Length)).ToArray();

        gpu.Copy(n, problemSize);

        var launchParameters = new LaunchParam(
            new dim3(1, 1, 1),
            new dim3(gpu.Device.Attributes.WarpSize * warpCount, 1, 1),
            2 * n * sizeof(ushort)
        );

        for (int stream = 0; stream < streamCount; stream++)
        {
            var problems = problemsPartitioned[stream];

            // Flatten both transition matrices of every problem into one array, n entries per problem.
            var matrixA = new int[problems.Length * n];
            var matrixB = new int[problems.Length * n];
            Parallel.For(0, problems.Length, problem =>
            {
                Array.ConstrainedCopy(problems[problem].stateTransitioningMatrixA, 0, matrixA, problem * n, n);
                Array.ConstrainedCopy(problems[problem].stateTransitioningMatrixB, 0, matrixB, problem * n, n);
            });

            streams[stream].Copy(matrixA, gpuA[stream]);
            streams[stream].Copy(matrixB, gpuB[stream]);

            // TODO: change this entirely
            // warning, this might not compute the length of a shortest synchronizing word but it will verify the Cerny conjecture
            streams[stream].Launch(
                Kernel,
                launchParameters,
                gpuA[stream],
                gpuB[stream],
                isSynchronizable[stream]
            );
        }

        asyncAction?.Invoke();

        for (int stream = 0; stream < streamCount; stream++)
        {
#if (benchmark)
            benchmarkTiming.Start();
#endif
            streams[stream].Synchronize();
#if (benchmark)
            benchmarkTiming.Stop();
#endif
            streams[stream].Copy(isSynchronizable[stream], gpuResultsIsSynchronizable[stream]);
        }

        gpu.Synchronize();
    }
    finally
    {
        foreach (var buffers in new Array[][] { gpuA, gpuB, isSynchronizable })
        {
            if (buffers == null)
            {
                continue;
            }

            foreach (var array in buffers)
            {
                if (array != null)
                {
                    Gpu.Free(array);
                }
            }
        }

        foreach (var stream in streams)
        {
            stream.Dispose();
        }
    }

    // CPU phase: keep the GPU verdict where it is positive, queue everything else for SlimCPU.
    var results = new ComputationResult[problemsToSolve.Length];
    var slimCPU = new SlimCPU();
    var listOfCPUProblems = new List<Problem>();
    var cpuProblemIndex = new List<int>();
    int generalIndex = 0;
    for (int stream = 0; stream < streamCount; stream++)
    {
        for (int index = 0; index < gpuResultsIsSynchronizable[stream].Length; index++)
        {
            if (gpuResultsIsSynchronizable[stream][index])
            {
                results[generalIndex] = new ComputationResult
                {
                    isSynchronizable = true,
                    computationType = ComputationType.CPU_GPU_Combined,
                    size = n,
                    algorithmName = GetType().Name
                };
            }
            else
            {
                listOfCPUProblems.Add(problemsPartitioned[stream][index]);
                // Remember where the CPU result belongs in the overall result array.
                cpuProblemIndex.Add(generalIndex);
            }

            generalIndex++;
        }
    }

    var cpuResults = slimCPU.Compute(listOfCPUProblems.ToArray(), slimCPU.GetBestParallelism());

    for (int i = 0; i < listOfCPUProblems.Count; i++)
    {
        results[cpuProblemIndex[i]] = cpuResults[i];
    }

    if (cpuResults.Any(result => result.isSynchronizable && result.shortestSynchronizingWordLength > maximumPermissibleWordLength))
    {
        throw new Exception("Cerny conjecture is false");
    }

#if (benchmark)
    results[0].benchmarkResult = new BenchmarkResult
    {
        benchmarkedTime = benchmarkTiming.Elapsed,
        totalTime = totalTiming.Elapsed
    };
#endif
    return results;
}
/// <summary>
/// Calls <c>Free()</c> and then additionally releases the GPU buffer <c>rneuronsCacheArr</c>.
/// </summary>
protected virtual void FreeCache()
{
    Free();

    var cacheBuffer = rneuronsCacheArr;
    Gpu.Free(cacheBuffer);
}