public static void RunGpu(double[,] a, double[,] b, double[,] c)
{
    // total device memory (queried here but not otherwise used)
    ulong totalMemory = Gpu.Default.Device.TotalMemory;

    // compute the launch configuration from the input matrices
    var lp = LaunchParam(a, b, c);

    // launch the kernel; Alea's automatic memory management copies the arrays for us
    Gpu.Default.Launch(Kernel, lp, a, b, c);

    // release memory implicitly allocated by the automatic memory manager
    Gpu.FreeAllImplicitMemory();
}
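RunGpu relies on a device Kernel and a LaunchParam helper that are not shown in this example. Below is a minimal sketch of what they might look like, assuming a naive matrix multiplication and a 16x16 block size; both the kernel body and the block size are illustrative assumptions, not the original implementation.

// Sketch only: assumed Kernel and LaunchParam helpers for RunGpu (naive matrix multiplication).
// requires: using Alea; using Alea.CSharp;
private const int BlockSize = 16; // assumed block edge, one thread per output element

private static void Kernel(double[,] a, double[,] b, double[,] c)
{
    var row = blockIdx.y * blockDim.y + threadIdx.y;
    var col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= c.GetLength(0) || col >= c.GetLength(1)) return;

    // dot product of row `row` of a and column `col` of b
    var sum = 0.0;
    for (var k = 0; k < a.GetLength(1); k++)
    {
        sum += a[row, k] * b[k, col];
    }
    c[row, col] = sum;
}

private static LaunchParam LaunchParam(double[,] a, double[,] b, double[,] c)
{
    // enough blocks to cover the output matrix, rounded up
    var gridX = (c.GetLength(1) + BlockSize - 1) / BlockSize;
    var gridY = (c.GetLength(0) + BlockSize - 1) / BlockSize;
    return new LaunchParam(new dim3(gridX, gridY), new dim3(BlockSize, BlockSize));
}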
public int ComputeManyWithAction(
    Problem[] problemsToSolve,
    int problemsReadingIndex,
    ComputationResult[] computationResults,
    int resultsWritingIndex,
    int problemCount,
    int streamCount,
    Action asyncAction = null) // cannot use more warps, since that would require allocating more memory
{
#if (benchmark)
    var totalTiming = new Stopwatch();
    totalTiming.Start();
    var benchmarkTiming = new Stopwatch();
#endif
    var CernyConjectureFailingIndex = -1;
    var gpu = Gpu.Default;
    var n = problemsToSolve[problemsReadingIndex].size;
    var power = 1 << n;
    const int bitSize = 6;
    var maximumPermissibleWordLength = (n - 1) * (n - 1);

    // In order to add atomically to the checking array (designed for queue consistency), one int is shared by several threads,
    // so in a really pessimistic case (every thread goes to the same vertex) 255 is the maximum number of threads.
    // -1 for the already discovered vertex, so it's -2 in total for both reasons.
    // At least 2*n+1 threads, i.e. 27.
    var threads = gpu.Device.Attributes.MaxThreadsPerBlock;
    var maximumThreads = Math.Min(
        gpu.Device.Attributes.MaxThreadsPerBlock,
        ((1 << bitSize) - 1) - 1);
    var minimumThreads = n + 1;
    if (threads > maximumThreads)
    {
        threads = maximumThreads;
    }
    if (threads < minimumThreads)
    {
        threads = minimumThreads;
    }
    if (threads > maximumThreads)
    {
        throw new Exception("Impossible to satisfy");
    }

    // never use more streams than there are problems
    if (problemCount < streamCount)
    {
        streamCount = problemCount;
    }

    var problemsPerStream = (problemCount + streamCount - 1) / streamCount;
    var streams = Enumerable.Range(0, streamCount)
        .Select(_ => gpu.CreateStream())
        .ToArray();

    // copy the problem size to the GPU-side symbol
    gpu.Copy(n, problemSize);

    var isDiscoveredComplexOffset = (power * sizeof(int) + 8 * sizeof(int) / bitSize - 1)
        / (8 * sizeof(int) / bitSize);

    // shared memory layout: counters, discovery bitfield, BFS queue, per-letter data and a flag
    var launchParameters = new LaunchParam(
        new dim3(1 << 9, 1, 1),
        new dim3(threads, 1, 1),
        sizeof(int) * 3
            + isDiscoveredComplexOffset
            + (((isDiscoveredComplexOffset % sizeof(int)) & 1) == 1 ? 1 : 0)
            + (power / 2 + 1) * sizeof(ushort) * 2
            + (n + 1) * sizeof(uint)
            + sizeof(bool)
    );

    var gpuResultsIsSynchronizable = new bool[streamCount][];
    var gpuResultsShortestSynchronizingWordLength = new int[streamCount][];
    var gpuAs = new int[streamCount][];
    var gpuBs = new int[streamCount][];
    var shortestSynchronizingWordLength = new int[streamCount][];
    var isSynchronizable = new bool[streamCount][];

    for (int stream = 0; stream < streamCount; stream++)
    {
        var offset = stream * problemsPerStream;
        var localProblemsCount = Math.Min(problemsPerStream, problemCount - offset);

        gpuAs[stream] = gpu.Allocate<int>(localProblemsCount * n);
        gpuBs[stream] = gpu.Allocate<int>(localProblemsCount * n);
        shortestSynchronizingWordLength[stream] = gpu.Allocate<int>(localProblemsCount);
        isSynchronizable[stream] = gpu.Allocate<bool>(localProblemsCount);

        // pack the transition matrices of this stream's problems into contiguous buffers
        var matrixA = new int[localProblemsCount * n];
        var matrixB = new int[localProblemsCount * n];
        Parallel.For(0, localProblemsCount, problem =>
        {
            Array.ConstrainedCopy(
                problemsToSolve[problemsReadingIndex + offset + problem].stateTransitioningMatrixA,
                0,
                matrixA,
                problem * n,
                n);
            Array.ConstrainedCopy(
                problemsToSolve[problemsReadingIndex + offset + problem].stateTransitioningMatrixB,
                0,
                matrixB,
                problem * n,
                n);
        });

        streams[stream].Copy(matrixA, gpuAs[stream]);
        streams[stream].Copy(matrixB, gpuBs[stream]);

        gpuResultsIsSynchronizable[stream] = new bool[localProblemsCount];
        gpuResultsShortestSynchronizingWordLength[stream] = new int[localProblemsCount];

        streams[stream].Launch(
            Kernel,
            launchParameters,
            gpuAs[stream],
            gpuBs[stream],
            isSynchronizable[stream],
            shortestSynchronizingWordLength[stream]);
    }

    // let the caller do CPU work while the kernels run
    asyncAction?.Invoke();

    var streamId = 0;
    foreach (var stream in streams)
    {
#if (benchmark)
        benchmarkTiming.Start();
#endif
        stream.Synchronize();
#if (benchmark)
        benchmarkTiming.Stop();
#endif
        stream.Copy(isSynchronizable[streamId], gpuResultsIsSynchronizable[streamId]);
        stream.Copy(shortestSynchronizingWordLength[streamId], gpuResultsShortestSynchronizingWordLength[streamId]);
        streamId++;
    }

#if (benchmark)
    gpu.Synchronize();
#endif

    // collect the results and look for a counterexample to the Cerny conjecture
    Parallel.For(0, streamCount, stream =>
    {
        var offset = stream * problemsPerStream;
        var localProblemsCount = Math.Min(problemsPerStream, problemCount - offset);
        for (int i = 0; i < localProblemsCount; i++)
        {
            if (computationResults != null)
            {
                computationResults[resultsWritingIndex + offset + i] = new ComputationResult()
                {
                    computationType = ComputationType.GPU,
                    size = problemsToSolve[problemsReadingIndex].size,
                    isSynchronizable = gpuResultsIsSynchronizable[stream][i],
                    shortestSynchronizingWordLength = gpuResultsShortestSynchronizingWordLength[stream][i],
                    algorithmName = GetType().Name
                };
            }

            // a synchronizing word longer than (n-1)^2 would falsify the conjecture
            if (gpuResultsIsSynchronizable[stream][i]
                && gpuResultsShortestSynchronizingWordLength[stream][i] > maximumPermissibleWordLength)
            {
                CernyConjectureFailingIndex = resultsWritingIndex + offset + i;
                break;
            }
        }
    });

    // free explicit and implicit GPU allocations and dispose of the streams
    foreach (var arrays in new IEnumerable<Array>[] { gpuAs, gpuBs, isSynchronizable, shortestSynchronizingWordLength })
    {
        foreach (var array in arrays)
        {
            Gpu.Free(array);
        }
    }
    Gpu.FreeAllImplicitMemory();

    foreach (var stream in streams)
    {
        stream.Dispose();
    }

#if (benchmark)
    computationResults[resultsWritingIndex].benchmarkResult = new BenchmarkResult
    {
        benchmarkedTime = benchmarkTiming.Elapsed,
        totalTime = totalTiming.Elapsed
    };
#endif

    return CernyConjectureFailingIndex;
}
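For orientation, here is a hedged call-site sketch for ComputeManyWithAction. The parameter meanings follow the method above, but the GpuSolver class name and the GenerateRandomProblems helper are placeholders invented for illustration; they are not part of the original code.

// Usage sketch (assumed caller); GpuSolver stands in for the class that defines ComputeManyWithAction,
// and GenerateRandomProblems is a hypothetical helper returning a Problem[] of n-state automata.
var problems = GenerateRandomProblems(count: 1024, size: 13);
var results = new ComputationResult[problems.Length];

var solver = new GpuSolver();
var failingIndex = solver.ComputeManyWithAction(
    problems, 0, results, 0, problems.Length,
    streamCount: 4,
    asyncAction: () => Console.WriteLine("Kernels launched; CPU is free for other work"));

if (failingIndex >= 0)
    Console.WriteLine("Cerny conjecture counterexample candidate at index " + failingIndex);
else
    Console.WriteLine("All automata respect the (n-1)^2 bound");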
static void Main(string[] args)
{
    var gpu = Gpu.Default;

    // Clean up.
    GC.Collect();
    GC.WaitForPendingFinalizers();
    Gpu.FreeAllImplicitMemory(true);

    // Generate a random dataset.
    List<double[]> listDataset = new List<double[]>();
    int numSample = 300;
    int wordFreq = 8000;
    for (int i = 0; i < numSample; i++)
    {
        listDataset.Add(Enumerable.Range(1, wordFreq).Select(m => i + m + 1.1).ToArray());
    }
    double[][] dataset = listDataset.ToArray();

    Stopwatch watch = Stopwatch.StartNew();

    // Warm up: the method is JIT-compiled only once (about 1 s), so an empty call here
    // keeps compilation out of the timed runs below.
    CosineSimilarityGpu(gpu, new double[][] { new double[] { } });
    Console.WriteLine("JIT compilation: " + watch.Elapsed);

    // Measure Gpu.
    watch.Restart();
    var resultGpu = CosineSimilarityGpu(gpu, dataset);
    long durationGpu = watch.ElapsedMilliseconds;
    Console.WriteLine("Gpu: " + watch.Elapsed);

    // Measure Cpu.
    watch.Restart();
    var resultCpu = CosineSimilarityMultipleThread(dataset);
    long durationCpu = watch.ElapsedMilliseconds;
    Console.WriteLine("Cpu (" + Environment.ProcessorCount + " threads): " + watch.Elapsed);
    Console.WriteLine("Speed-up: " + ((double)durationCpu / durationGpu));

    // Verify that the GPU and CPU results agree within a small margin of error.
    for (int i = 0; i < resultCpu.GetLength(0); i++)
    {
        for (int j = 0; j < resultCpu.GetLength(1); j++)
        {
            double diff = Math.Abs(resultCpu[i, j] - resultGpu[i, j]);
            if (diff > 0.0000001)
            {
                throw new Exception("Results are not equal");
            }
        }
    }

    Console.WriteLine("Press any key...");
    Console.ReadLine();

    // Enumerate the available CUDA devices and print their properties.
    var devices = Device.Devices;
    var numGpus = devices.Length;
    foreach (var device in devices)
    {
        // print device information to standard output
        device.Print();
    }

    Console.WriteLine("Press any key to continue...");
    Console.ReadLine();
}
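The benchmark above calls a CPU reference, CosineSimilarityMultipleThread, that is not shown. Below is a plausible sketch under the assumption that it returns the full pairwise similarity matrix as a double[,] (which matches the GetLength checks in Main); the actual implementation may differ.

// Sketch of the assumed CPU reference: pairwise cosine similarity of all dataset rows.
// Assumes every row has the same length and a non-zero norm.
// requires: using System; using System.Threading.Tasks;
static double[,] CosineSimilarityMultipleThread(double[][] dataset)
{
    var n = dataset.Length;
    var result = new double[n, n];
    Parallel.For(0, n, i =>
    {
        for (var j = 0; j < n; j++)
        {
            double dot = 0.0, normI = 0.0, normJ = 0.0;
            for (var k = 0; k < dataset[i].Length; k++)
            {
                dot += dataset[i][k] * dataset[j][k];
                normI += dataset[i][k] * dataset[i][k];
                normJ += dataset[j][k] * dataset[j][k];
            }
            // cosine similarity of rows i and j
            result[i, j] = dot / (Math.Sqrt(normI) * Math.Sqrt(normJ));
        }
    });
    return result;
}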