/// <summary>
        /// Launches <c>Kernel</c> on the default GPU with the three matrices as arguments,
        /// using launch parameters obtained from <c>LaunchParam(a, b, c)</c>, and then
        /// releases all implicitly managed GPU memory.
        /// </summary>
        /// <param name="a">First matrix handed to the kernel.</param>
        /// <param name="b">Second matrix handed to the kernel.</param>
        /// <param name="c">Third matrix handed to the kernel (presumably the output; confirm against <c>Kernel</c>).</param>
        public static void RunGpu(double[,] a, double[,] b, double[,] c)
        {
            var lp = LaunchParam(a, b, c);

            try
            {
                Gpu.Default.Launch(Kernel, lp, a, b, c);
            }
            finally
            {
                // Release implicit memory even when the launch throws, so a failed run
                // does not leave buffers behind for the next call.
                Gpu.FreeAllImplicitMemory();
            }
        }
        /// <summary>
        /// Runs <c>Kernel</c> over a batch of problems, splitting the batch across several GPU
        /// streams, and optionally invokes a caller-supplied action after all launches have been
        /// issued and before the streams are synchronized.
        /// </summary>
        /// <param name="problemsToSolve">Array holding the problems. The size of the problem at
        /// <paramref name="problemsReadingIndex"/> is used for the whole batch.</param>
        /// <param name="problemsReadingIndex">Index of the first problem of the batch in <paramref name="problemsToSolve"/>.</param>
        /// <param name="computationResults">Destination for the results; may be <c>null</c>, in which case no results are stored.</param>
        /// <param name="resultsWritingIndex">Index in <paramref name="computationResults"/> where the first result is written.</param>
        /// <param name="problemCount">Number of problems in the batch. When it is zero or negative the method returns -1 immediately.</param>
        /// <param name="streamCount">Requested number of streams; reduced when fewer streams are enough to cover the batch.</param>
        /// <param name="asyncAction">Optional action invoked once all streams have been given their work.</param>
        /// <returns>
        /// The index (in <paramref name="computationResults"/> coordinates) of a problem reported as
        /// synchronizable with a shortest word longer than (n - 1)^2, or -1 when no such problem was seen.
        /// </returns>
        /// <exception cref="InvalidOperationException">The problem size needs more threads per block than the computed maximum.</exception>
        public int ComputeManyWithAction(
            Problem[] problemsToSolve,
            int problemsReadingIndex,
            ComputationResult[] computationResults,
            int resultsWritingIndex,
            int problemCount,
            int streamCount,
            Action asyncAction = null)
        {
            // Note from the original author (kept verbatim, context unclear):
            // cannot be more warps since more memory should be allocated

            // An empty batch previously ended in a division by zero when computing
            // problemsPerStream; there is nothing to check, so report "no failing index".
            if (problemCount <= 0)
            {
                return -1;
            }

#if (benchmark)
            var totalTiming = new Stopwatch();
            totalTiming.Start();
            var benchmarkTiming = new Stopwatch();
#endif
            var cernyConjectureFailingIndex = -1;
            var gpu = Gpu.Default;
            var n   = problemsToSolve[problemsReadingIndex].size;

            var       power   = 1 << n;
            const int bitSize = 6;
            var       maximumPermissibleWordLength = (n - 1) * (n - 1);

            // Thread count per block: capped at ((1 << bitSize) - 1) - 1 = 62 (or the device limit,
            // whichever is lower) and required to be at least n + 1.
            // NOTE(review): the original comment spoke of a maximum of 255 and a minimum of 2*n+1;
            // neither matches the values computed here - confirm against the kernel.
            var maximumThreads = Math.Min(
                gpu.Device.Attributes.MaxThreadsPerBlock,
                ((1 << bitSize) - 1) - 1
                );
            var minimumThreads = n + 1;
            var threads        = Math.Max(maximumThreads, minimumThreads);
            if (threads > maximumThreads)
            {
                throw new InvalidOperationException("Impossible to satisfy");
            }

            if (problemCount < streamCount)
            {
                streamCount = problemCount;
            }

            var problemsPerStream = (problemCount + streamCount - 1) / streamCount;

            // Rounding problemsPerStream up can leave trailing streams without any work
            // (e.g. 5 problems on 4 streams -> 2 per stream -> only 3 streams are needed).
            // Such a stream would get a negative local problem count and a negative-size
            // allocation, so only keep as many streams as actually receive problems.
            streamCount = (problemCount + problemsPerStream - 1) / problemsPerStream;

            var isDiscoveredComplexOffset = (power * sizeof(int) + 8 * sizeof(int) / bitSize - 1) / (8 * sizeof(int) / bitSize);

            // Third argument: presumably the dynamic shared-memory size in bytes - confirm
            // against the layout expected by Kernel.
            var launchParameters = new LaunchParam(
                new dim3(1 << 9, 1, 1),
                new dim3(threads, 1, 1),
                sizeof(int) * 3
                + isDiscoveredComplexOffset + (((isDiscoveredComplexOffset % sizeof(int)) & 1) == 1 ? 1 : 0)
                + (power / 2 + 1) * sizeof(ushort) * 2
                + (n + 1) * sizeof(uint)
                + sizeof(bool)
                );

            // Host-side result buffers, one per stream.
            var gpuResultsIsSynchronizable = new bool[streamCount][];
            var gpuResultsShortestSynchronizingWordLength = new int[streamCount][];

            // Buffers obtained from gpu.Allocate, one per stream; released in the finally block.
            var gpuAs = new int[streamCount][];
            var gpuBs = new int[streamCount][];
            var shortestSynchronizingWordLength = new int[streamCount][];
            var isSynchronizable = new bool[streamCount][];

            var streams = Enumerable.Range(0, streamCount)
                          .Select(_ => gpu.CreateStream()).ToArray();

            try
            {
                gpu.Copy(n, problemSize);

                for (int stream = 0; stream < streamCount; stream++)
                {
                    var offset             = stream * problemsPerStream;
                    var localProblemsCount = Math.Min(problemsPerStream, problemCount - offset);

                    gpuAs[stream] = gpu.Allocate <int>(localProblemsCount * n);
                    gpuBs[stream] = gpu.Allocate <int>(localProblemsCount * n);
                    shortestSynchronizingWordLength[stream] = gpu.Allocate <int>(localProblemsCount);
                    isSynchronizable[stream] = gpu.Allocate <bool>(localProblemsCount);

                    // Flatten this stream's transition matrices into two contiguous arrays,
                    // n entries per problem.
                    var matrixA = new int[localProblemsCount * n];
                    var matrixB = new int[localProblemsCount * n];
                    Parallel.For(0, localProblemsCount, problem =>
                    {
                        Array.ConstrainedCopy(
                            problemsToSolve[problemsReadingIndex + offset + problem].stateTransitioningMatrixA,
                            0,
                            matrixA,
                            problem * n,
                            n
                            );
                        Array.ConstrainedCopy(
                            problemsToSolve[problemsReadingIndex + offset + problem].stateTransitioningMatrixB,
                            0,
                            matrixB,
                            problem * n,
                            n
                            );
                    });

                    streams[stream].Copy(matrixA, gpuAs[stream]);
                    streams[stream].Copy(matrixB, gpuBs[stream]);

                    gpuResultsIsSynchronizable[stream] = new bool[localProblemsCount];
                    gpuResultsShortestSynchronizingWordLength[stream] = new int[localProblemsCount];

                    streams[stream].Launch(
                        Kernel,
                        launchParameters,
                        gpuAs[stream],
                        gpuBs[stream],
                        isSynchronizable[stream],
                        shortestSynchronizingWordLength[stream]
                        );
                }

                // Give the caller a chance to do its own work once every stream has been fed.
                asyncAction?.Invoke();

                var streamId = 0;
                foreach (var stream in streams)
                {
#if (benchmark)
                    benchmarkTiming.Start();
#endif
                    stream.Synchronize();
#if (benchmark)
                    benchmarkTiming.Stop();
#endif
                    stream.Copy(isSynchronizable[streamId], gpuResultsIsSynchronizable[streamId]);
                    stream.Copy(shortestSynchronizingWordLength[streamId], gpuResultsShortestSynchronizingWordLength[streamId]);

                    streamId++;
                }

#if (benchmark)
                gpu.Synchronize();
#endif

                // NOTE(review): several iterations may assign cernyConjectureFailingIndex
                // concurrently; when more than one stream reports a failure, which index is
                // returned is not deterministic.
                Parallel.For(0, streamCount, stream =>
                {
                    var offset             = stream * problemsPerStream;
                    var localProblemsCount = Math.Min(problemsPerStream, problemCount - offset);

                    for (int i = 0; i < localProblemsCount; i++)
                    {
                        if (computationResults != null)
                        {
                            computationResults[resultsWritingIndex + offset + i] = new ComputationResult()
                            {
                                computationType  = ComputationType.GPU,
                                size             = problemsToSolve[problemsReadingIndex].size,
                                isSynchronizable = gpuResultsIsSynchronizable[stream][i],
                                shortestSynchronizingWordLength = gpuResultsShortestSynchronizingWordLength[stream][i],
                                algorithmName = GetType().Name
                            };
                        }

                        if (gpuResultsIsSynchronizable[stream][i] && gpuResultsShortestSynchronizingWordLength[stream][i] > maximumPermissibleWordLength)
                        {
                            // Stop at the first offending problem of this stream; the remaining
                            // results of the stream are not stored.
                            cernyConjectureFailingIndex = resultsWritingIndex + offset + i;
                            break;
                        }
                    }
                });
            }
            finally
            {
                // Release buffers and streams even when something above threw; slots that were
                // never allocated are still null and are skipped.
                foreach (var arrays in new IEnumerable <Array>[] { gpuAs, gpuBs, isSynchronizable, shortestSynchronizingWordLength })
                {
                    foreach (var array in arrays)
                    {
                        if (array != null)
                        {
                            Gpu.Free(array);
                        }
                    }
                }
                Gpu.FreeAllImplicitMemory();

                foreach (var stream in streams)
                {
                    stream.Dispose();
                }
            }

#if (benchmark)
            // computationResults is optional (see the null check above), so guard the write.
            if (computationResults != null)
            {
                computationResults[resultsWritingIndex].benchmarkResult = new BenchmarkResult
                {
                    benchmarkedTime = benchmarkTiming.Elapsed,
                    totalTime       = totalTiming.Elapsed
                };
            }
#endif
            return cernyConjectureFailingIndex;
        }
        /// <summary>
        /// Benchmark entry point: builds a synthetic dataset, computes the cosine-similarity
        /// matrix with <c>CosineSimilarityGpu</c> and with <c>CosineSimilarityMultipleThread</c>,
        /// prints the timings and the speed-up, checks that both results agree within a small
        /// tolerance, and finally prints information about every available device.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            var gpu = Gpu.Default;

            // Clean up.
            GC.Collect();
            GC.WaitForPendingFinalizers();
            Gpu.FreeAllImplicitMemory(true);

            // Generate a deterministic synthetic dataset: row i holds i + m + 1.1 for m = 1..wordFreq.
            const int numSample = 300;
            const int wordFreq  = 8000;

            double[][] dataset = Enumerable.Range(0, numSample)
                                 .Select(i => Enumerable.Range(1, wordFreq).Select(m => i + m + 1.1).ToArray())
                                 .ToArray();

            Stopwatch watch = Stopwatch.StartNew();

            // Warm-up call on an empty input so that the JIT compilation cost
            // (about 1 sec. according to the original author) is not part of the measurement.
            CosineSimilarityGpu(gpu, new double[][] { new double[] { } });
            Console.WriteLine("JIT compilation: " + watch.Elapsed);

            // Measure Gpu.
            watch.Restart();
            var  resultGpu   = CosineSimilarityGpu(gpu, dataset);
            long durationGpu = watch.ElapsedMilliseconds;

            Console.WriteLine("Gpu: " + watch.Elapsed);

            // Measure Cpu.
            watch.Restart();
            var  resultCpu   = CosineSimilarityMultipleThread(dataset);
            long durationCpu = watch.ElapsedMilliseconds;

            Console.WriteLine("Cpu (" + Environment.ProcessorCount + " threads): " + watch.Elapsed);

            // Use a floating-point ratio: integer division truncated the speed-up and threw
            // DivideByZeroException whenever the GPU run finished in under one millisecond.
            string speedUp = durationGpu > 0
                ? ((double)durationCpu / durationGpu).ToString("F2")
                : "n/a";
            Console.WriteLine("Speed-up: " + speedUp);

            // Verify results.
            for (int i = 0; i < resultCpu.GetLength(0); i++)
            {
                for (int j = 0; j < resultCpu.GetLength(1); j++)
                {
                    double diff = Math.Abs(resultCpu[i, j] - resultGpu[i, j]);

                    // Margin of errors
                    if (diff > 0.0000001)
                    {
                        throw new InvalidOperationException("Results not equals");
                    }
                }
            }

            Console.WriteLine("Press any key...");
            Console.ReadLine();

            foreach (var device in Device.Devices)
            {
                // print device information to standard output
                device.Print();
            }

            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }