/// <summary>
        /// Solves a batch of problems by splitting it between the CPU solver and the GPU queue.
        /// The first <c>floor(count * cpuPart)</c> problems are handed to <c>SlimCPU</c> and the remaining
        /// ones to <c>SlimGPUQueue</c>; the CPU share is computed inside the callback passed to the GPU queue.
        /// </summary>
        /// <param name="problemsToSolve">Problems to solve.</param>
        /// <param name="degreeOfParallelism">
        /// Not used by this implementation; each solver is asked for its own best parallelism.
        /// </param>
        /// <param name="cpuPart">
        /// Fraction of the batch handled by the CPU solver. The resulting problem count is clamped to
        /// the range [0, number of problems].
        /// </param>
        /// <returns>The CPU results followed by the GPU results.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="problemsToSolve"/> is null.</exception>
        public ComputationResult[] Compute(Problem[] problemsToSolve, int degreeOfParallelism, float cpuPart)
        {
            if (problemsToSolve == null)
            {
                throw new ArgumentNullException(nameof(problemsToSolve));
            }

            var total = problemsToSolve.Length;
            if (total == 0)
            {
                return new ComputationResult[0];
            }

            // Clamp so that an out-of-range cpuPart cannot produce a negative or oversized split.
            var cpuProblems = (int)Math.Floor(total * cpuPart);
            cpuProblems = Math.Max(0, Math.Min(total, cpuProblems));

            var cpuShare = problemsToSolve.Take(cpuProblems).ToArray();
            var gpuShare = problemsToSolve.Skip(cpuProblems).ToArray();

            if (gpuShare.Length == 0)
            {
                // Nothing is left for the GPU: run the CPU solver directly instead of handing
                // the GPU queue an empty batch.
                var cpuOnly = new SlimCPU();
                return cpuOnly.Compute(cpuShare, cpuOnly.GetBestParallelism()).ToArray();
            }

            IEnumerable<ComputationResult> cpuResults = null;

            var gpu        = new SlimGPUQueue();
            var gpuResults = gpu.Compute(
                gpuShare,
                gpu.GetBestParallelism(),
                () =>
                {
                    // Executed by the GPU queue through its callback parameter.
                    var cpu    = new SlimCPU();
                    cpuResults = cpu.Compute(cpuShare, cpu.GetBestParallelism());
                });

            return cpuResults.Concat(gpuResults).ToArray();
        }
        /// <summary>
        /// Runs the GPU kernel over a batch of problems and then passes every problem the kernel did not
        /// flag as synchronizable to <c>SlimCPU</c>.
        /// </summary>
        /// <param name="problemsToSolve">
        /// Problems to solve. The <c>size</c> of the first problem is used for the whole batch.
        /// </param>
        /// <param name="streamCount">
        /// Maximum number of GPU streams; the batch is split into at most this many contiguous partitions.
        /// </param>
        /// <param name="asyncAction">
        /// Optional callback invoked after every partition has been launched and before the streams are
        /// synchronized. It is also invoked when the batch is empty.
        /// </param>
        /// <param name="warpCount">
        /// Warps per thread block, capped at MaxThreadsPerBlock / WarpSize of the device.
        /// </param>
        /// <returns>One result per input problem, in input order; an empty array for an empty batch.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="problemsToSolve"/> is null.</exception>
        /// <exception cref="Exception">
        /// A CPU result is synchronizable with a shortest synchronizing word longer than (n - 1)^2.
        /// </exception>
        public ComputationResult[] Compute(
            Problem[] problemsToSolve,
            int streamCount,
            Action asyncAction = null,
            int warpCount      = 16)
        // cannot be more warps since more memory should be allocated
        {
            if (problemsToSolve == null)
            {
                throw new ArgumentNullException(nameof(problemsToSolve));
            }

            if (problemsToSolve.Length == 0)
            {
                // Nothing to launch, but callers may rely on the callback being run.
                asyncAction?.Invoke();
                return new ComputationResult[0];
            }

#if (benchmark)
            var totalTiming = new Stopwatch();
            totalTiming.Start();
            var benchmarkTiming = new Stopwatch();
#endif
            var gpu = Gpu.Default;
            var n   = problemsToSolve[0].size;

            var maximumPermissibleWordLength = (n - 1) * (n - 1);

            var maximumWarps = gpu.Device.Attributes.MaxThreadsPerBlock / gpu.Device.Attributes.WarpSize;
            if (warpCount > maximumWarps)
            {
                warpCount = maximumWarps;
            }

            // Split the batch into contiguous partitions, one per stream; trailing empty partitions are dropped.
            var problemsPerStream   = (problemsToSolve.Length + streamCount - 1) / streamCount;
            var problemsPartitioned = Enumerable.Range(0, streamCount)
                                      .Select(i => problemsToSolve.Skip(problemsPerStream * i)
                                              .Take(problemsPerStream)
                                              .ToArray())
                                      .Where(partition => partition.Length > 0)
                                      .ToArray();
            streamCount = problemsPartitioned.Length;
            var streams = Enumerable.Range(0, streamCount)
                          .Select(_ => gpu.CreateStream()).ToArray();

            var gpuA             = problemsPartitioned.Select(partition => gpu.Allocate<int>(partition.Length * n)).ToArray();
            var gpuB             = problemsPartitioned.Select(partition => gpu.Allocate<int>(partition.Length * n)).ToArray();
            var isSynchronizable = problemsPartitioned.Select(partition => gpu.Allocate<bool>(partition.Length)).ToArray();

            ComputationResult[] results;
            try
            {
                gpu.Copy(n, problemSize);

                var launchParameters = new LaunchParam(
                    new dim3(1, 1, 1),
                    new dim3(gpu.Device.Attributes.WarpSize * warpCount, 1, 1),
                    2 * n * sizeof(ushort)
                    );
                var gpuResultsIsSynchronizable = problemsPartitioned
                                                 .Select(partition => new bool[partition.Length])
                                                 .ToArray();

                for (int stream = 0; stream < streamCount; stream++)
                {
                    var problems = problemsPartitioned[stream];

                    // Flatten the per-problem transition arrays into one buffer per matrix, n entries per problem.
                    var matrixA = new int[problems.Length * n];
                    var matrixB = new int[problems.Length * n];
                    Parallel.For(0, problems.Length, problem =>
                    {
                        Array.ConstrainedCopy(problems[problem].stateTransitioningMatrixA, 0, matrixA, problem * n, n);
                        Array.ConstrainedCopy(problems[problem].stateTransitioningMatrixB, 0, matrixB, problem * n, n);
                    });

                    streams[stream].Copy(matrixA, gpuA[stream]);
                    streams[stream].Copy(matrixB, gpuB[stream]);

                    // TODO: change this entirely
                    // warning, this might not compute the length of a shortest synchronizing word but it will verify the Cerny conjecture
                    streams[stream].Launch(
                        Kernel,
                        launchParameters,
                        gpuA[stream],
                        gpuB[stream],
                        isSynchronizable[stream]
                        );
                }

                asyncAction?.Invoke();

                for (int stream = 0; stream < streamCount; stream++)
                {
#if (benchmark)
                    benchmarkTiming.Start();
#endif
                    streams[stream].Synchronize();
#if (benchmark)
                    benchmarkTiming.Stop();
#endif
                    streams[stream].Copy(isSynchronizable[stream], gpuResultsIsSynchronizable[stream]);
                }

                gpu.Synchronize();

                results = new ComputationResult[problemsToSolve.Length];

                var slimCPU           = new SlimCPU();
                var listOfCPUProblems = new List<Problem>();
                var cpuProblemIndex   = new List<int>();
                int generalIndex      = 0;
                for (int stream = 0; stream < streamCount; stream++)
                {
                    for (int index = 0; index < gpuResultsIsSynchronizable[stream].Length; index++)
                    {
                        if (gpuResultsIsSynchronizable[stream][index])
                        {
                            results[generalIndex] = new ComputationResult
                            {
                                isSynchronizable = true,
                                computationType  = ComputationType.CPU_GPU_Combined,
                                size             = n,
                                algorithmName    = GetType().Name
                            };
                        }
                        else
                        {
                            // Not flagged by the kernel: remember its position and let the CPU solver decide.
                            listOfCPUProblems.Add(problemsPartitioned[stream][index]);
                            cpuProblemIndex.Add(generalIndex);
                        }
                        generalIndex++;
                    }
                }

                var cpuResults = slimCPU.Compute(listOfCPUProblems.ToArray(), slimCPU.GetBestParallelism());

                for (int i = 0; i < listOfCPUProblems.Count; i++)
                {
                    results[cpuProblemIndex[i]] = cpuResults[i];
                }

                if (cpuResults.Any(result => result.isSynchronizable && result.shortestSynchronizingWordLength > maximumPermissibleWordLength))
                {
                    throw new Exception("Cerny conjecture is false");
                }
            }
            finally
            {
                // Release device buffers and streams on every exit path, including the exception above.
                foreach (var array in gpuA.AsEnumerable<Array>()
                         .Concat(gpuB)
                         .Concat(isSynchronizable))
                {
                    Gpu.Free(array);
                }

                foreach (var stream in streams)
                {
                    stream.Dispose();
                }
            }

#if (benchmark)
            results[0].benchmarkResult = new BenchmarkResult
            {
                benchmarkedTime = benchmarkTiming.Elapsed,
                totalTime       = totalTiming.Elapsed
            };
#endif
            return results;
        }