/// <summary>
/// Splits <paramref name="problemsToSolve"/> between the CPU and the GPU and
/// returns the CPU results followed by the GPU results.
/// </summary>
/// <param name="problemsToSolve">Problems to solve; the leading share goes to the CPU, the rest to the GPU.</param>
/// <param name="degreeOfParallelism">Not read by this implementation; each solver is asked for its own best parallelism.</param>
/// <param name="cpuPart">Fraction of the problems handed to the CPU (the product with the problem count is rounded down).</param>
/// <returns>CPU results concatenated with GPU results, in that order.</returns>
public ComputationResult[] Compute(Problem[] problemsToSolve, int degreeOfParallelism, float cpuPart)
{
    int total = problemsToSolve.Count();
    int cpuShare = (int)Math.Floor(total * cpuPart);

    // Assigned from inside the callback passed to the GPU solver below.
    IEnumerable<ComputationResult> cpuPortion = null;

    var gpuSolver = new SlimGPUQueue();
    Problem[] gpuBatch = problemsToSolve
        .Skip(cpuShare)
        .Take(total - cpuShare)
        .ToArray();
    var gpuParallelism = gpuSolver.GetBestParallelism();

    var gpuPortion = gpuSolver.Compute(
        gpuBatch,
        gpuParallelism,
        () =>
        {
            // CPU share is solved inside the callback handed to the GPU solver.
            var cpuSolver = new SlimCPU();
            Problem[] cpuBatch = problemsToSolve.Take(cpuShare).ToArray();
            cpuPortion = cpuSolver.Compute(cpuBatch, cpuSolver.GetBestParallelism());
        });

    ComputationResult[] combined = cpuPortion.Concat(gpuPortion).ToArray();
    return combined;
}
/// <summary>
/// Runs <c>Kernel</c> over the problems on the default GPU, spread across several streams,
/// and passes every problem the kernel did not flag as synchronizable to <c>SlimCPU</c>.
/// </summary>
/// <param name="problemsToSolve">
/// Problems to solve. The automaton size is read from the first problem only
/// (assumes every problem has that same size — TODO confirm with callers).
/// </param>
/// <param name="streamCount">Upper bound on the number of GPU streams; empty partitions are dropped.</param>
/// <param name="asyncAction">
/// Optional callback invoked once after all kernels have been queued and before the
/// streams are synchronized. It is also invoked when there are no problems.
/// </param>
/// <param name="warpCount">
/// Warps per block; clamped to MaxThreadsPerBlock / WarpSize of the device.
/// Cannot be more warps since more memory should be allocated.
/// </param>
/// <returns>One result per input problem, in input order; empty for empty input.</returns>
/// <exception cref="ArgumentNullException"><paramref name="problemsToSolve"/> is null.</exception>
/// <exception cref="Exception">
/// A CPU result is synchronizable with a shortest word longer than (n - 1)^2.
/// </exception>
public ComputationResult[] Compute(
    Problem[] problemsToSolve,
    int streamCount,
    Action asyncAction = null,
    int warpCount = 16)
{
    if (problemsToSolve == null)
    {
        throw new ArgumentNullException(nameof(problemsToSolve));
    }

    if (problemsToSolve.Length == 0)
    {
        // Nothing to launch, but the caller's overlapped work must still run.
        asyncAction?.Invoke();
        return new ComputationResult[0];
    }

#if (benchmark)
    var totalTiming = new Stopwatch();
    totalTiming.Start();
    var benchmarkTiming = new Stopwatch();
#endif

    var gpu = Gpu.Default;
    var n = problemsToSolve[0].size;
    var maximumPermissibleWordLength = (n - 1) * (n - 1);

    var maximumWarps = gpu.Device.Attributes.MaxThreadsPerBlock / gpu.Device.Attributes.WarpSize;
    if (warpCount > maximumWarps)
    {
        warpCount = maximumWarps;
    }

    // Ceiling division so that every problem lands in some partition.
    var problemsPerStream = (problemsToSolve.Length + streamCount - 1) / streamCount;
    var problemsPartitioned = Enumerable.Range(0, streamCount)
        .Select(i => problemsToSolve
            .Skip(problemsPerStream * i)
            .Take(problemsPerStream)
            .ToArray())
        .Where(partition => partition.Length > 0)
        .ToArray();
    streamCount = problemsPartitioned.Length;

    var streams = Enumerable.Range(0, streamCount)
        .Select(_ => gpu.CreateStream())
        .ToArray();
    var gpuA = problemsPartitioned
        .Select(problems => gpu.Allocate<int>(problems.Length * n))
        .ToArray();
    var gpuB = problemsPartitioned
        .Select(problems => gpu.Allocate<int>(problems.Length * n))
        .ToArray();
    var isSynchronizable = problemsPartitioned
        .Select(problems => gpu.Allocate<bool>(problems.Length))
        .ToArray();

    var results = new ComputationResult[problemsToSolve.Length];

    try
    {
        gpu.Copy(n, problemSize);

        // One block of WarpSize * warpCount threads; the third argument is the byte size
        // of 2 * n ushorts (presumably dynamic shared memory — confirm against LaunchParam).
        var launchParameters = new LaunchParam(
            new dim3(1, 1, 1),
            new dim3(gpu.Device.Attributes.WarpSize * warpCount, 1, 1),
            2 * n * sizeof(ushort)
        );

        var gpuResultsIsSynchronizable = problemsPartitioned
            .Select(problems => new bool[problems.Length])
            .ToArray();

        for (int stream = 0; stream < streamCount; stream++)
        {
            var problems = problemsPartitioned[stream];

            // Flatten the per-problem transition arrays into one buffer per letter,
            // n entries per problem.
            var matrixA = new int[problems.Length * n];
            var matrixB = new int[problems.Length * n];
            Parallel.For(0, problems.Length, problem =>
            {
                Array.ConstrainedCopy(problems[problem].stateTransitioningMatrixA, 0, matrixA, problem * n, n);
                Array.ConstrainedCopy(problems[problem].stateTransitioningMatrixB, 0, matrixB, problem * n, n);
            });

            streams[stream].Copy(matrixA, gpuA[stream]);
            streams[stream].Copy(matrixB, gpuB[stream]);

            // TODO: change this entirely
            // warning, this might not compute the length of a shortest synchronizing word
            // but it will verify the Cerny conjecture
            streams[stream].Launch(
                Kernel,
                launchParameters,
                gpuA[stream],
                gpuB[stream],
                isSynchronizable[stream]
            );
        }

        asyncAction?.Invoke();

        for (int stream = 0; stream < streamCount; stream++)
        {
#if (benchmark)
            benchmarkTiming.Start();
#endif
            streams[stream].Synchronize();
#if (benchmark)
            benchmarkTiming.Stop();
#endif
            streams[stream].Copy(isSynchronizable[stream], gpuResultsIsSynchronizable[stream]);
        }

        gpu.Synchronize();

        var slimCPU = new SlimCPU();
        var listOfCPUProblems = new List<Problem>();
        var cpuProblemIndex = new List<int>();

        // Partitions are contiguous slices of the input, so walking them in order
        // reproduces the original problem indices.
        int generalIndex = 0;
        for (int stream = 0; stream < streamCount; stream++)
        {
            for (int index = 0; index < gpuResultsIsSynchronizable[stream].Length; index++)
            {
                if (gpuResultsIsSynchronizable[stream][index])
                {
                    results[generalIndex] = new ComputationResult
                    {
                        isSynchronizable = true,
                        computationType = ComputationType.CPU_GPU_Combined,
                        size = n,
                        algorithmName = GetType().Name
                    };
                }
                else
                {
                    // Not flagged by the kernel: defer to the CPU solver.
                    listOfCPUProblems.Add(problemsPartitioned[stream][index]);
                    cpuProblemIndex.Add(generalIndex);
                }

                generalIndex++;
            }
        }

        var cpuResults = slimCPU.Compute(listOfCPUProblems.ToArray(), slimCPU.GetBestParallelism());
        for (int i = 0; i < listOfCPUProblems.Count; i++)
        {
            results[cpuProblemIndex[i]] = cpuResults[i];
        }

        if (cpuResults.Any(result => result.isSynchronizable
            && result.shortestSynchronizingWordLength > maximumPermissibleWordLength))
        {
            throw new Exception("Cerny conjecture is false");
        }
    }
    finally
    {
        // Release device buffers and streams even when the work above throws,
        // including the conjecture check.
        foreach (var array in gpuA.AsEnumerable<Array>()
            .Concat(gpuB)
            .Concat(isSynchronizable))
        {
            Gpu.Free(array);
        }

        foreach (var stream in streams)
        {
            stream.Dispose();
        }
    }

#if (benchmark)
    results[0].benchmarkResult = new BenchmarkResult
    {
        benchmarkedTime = benchmarkTiming.Elapsed,
        totalTime = totalTiming.Elapsed
    };
#endif

    return results;
}