/// <summary>
        /// Transposes a matrix on the GPU.
        /// </summary>
        /// <param name="matrix">The matrix to transpose.</param>
        /// <returns>
        /// A new matrix built from the kernel output; the result buffer is allocated with the
        /// two dimensions of <paramref name="matrix"/> swapped.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="matrix"/> is null.</exception>
        public FastMatrix Transpose(FastMatrix matrix)
        {
            if (matrix == null)
            {
                throw new ArgumentNullException(nameof(matrix));
            }

            // Begin the upload before allocating the result buffer; WaitForCopy() below
            // blocks until it is done (presumably the copy is asynchronous - confirm
            // against FastMatrix.CopyToGPU).
            matrix.CopyToGPU();

            Accelerator accelerator = HardwareAcceleratorManager.GPUAccelerator;
            var kernel = GPUTransposeKernel;

            // The device buffer is only needed until the result is copied back to the host;
            // disposing it here releases the GPU memory even if the kernel launch throws.
            using (MemoryBuffer2D<double> resultBuffer =
                       accelerator.Allocate<double>(matrix.GetSize(1), matrix.GetSize(0)))
            {
                matrix.WaitForCopy();

                // One kernel instance per element of the (transposed) result.
                kernel(resultBuffer.Extent, matrix.buffer.View, resultBuffer.View);
                accelerator.Synchronize();

                var tempArray = resultBuffer.GetAs2DArray();

                // Kept from the original implementation to guarantee the copy back has
                // completed before the host array is consumed.
                accelerator.Synchronize();

                return new FastMatrix(tempArray);
            }
        }
        /// <summary>
        /// Subtracts two matrices on the GPU.
        /// </summary>
        /// <param name="one">The first matrix (minuend).</param>
        /// <param name="two">The second matrix (subtrahend).</param>
        /// <returns>The result of the subtraction (one - two).</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="one"/> or <paramref name="two"/> is null.
        /// </exception>
        /// <exception cref="BadDimensionException">
        /// The two matrices do not have the same size in both dimensions.
        /// </exception>
        public FastMatrix Subtract(FastMatrix one, FastMatrix two)
        {
            // Checked separately so the exception names the offending argument.
            if (one == null)
            {
                throw new ArgumentNullException(nameof(one));
            }
            if (two == null)
            {
                throw new ArgumentNullException(nameof(two));
            }
            if ((one.GetSize(0) != two.GetSize(0)) || (one.GetSize(1) != two.GetSize(1)))
            {
                throw new BadDimensionException(one.GetSize(0), one.GetSize(1), two.GetSize(0),
                                                two.GetSize(1));
            }

            // Start both uploads before allocating the result buffer; the WaitForCopy()
            // calls below block until they are done.
            one.CopyToGPU();
            two.CopyToGPU();

            // The device buffer is only needed until the result is copied back to the host;
            // disposing it here releases the GPU memory even if the kernel launch throws.
            using (MemoryBuffer2D<double> resultBuffer =
                       accelerator.Allocate<double>(one.GetSize(0), one.GetSize(1)))
            {
                one.WaitForCopy();
                two.WaitForCopy();

                GPUSubKernel(resultBuffer.Extent, one.buffer.View, two.buffer.View,
                             resultBuffer.View);

                accelerator.Synchronize();

                var tempArray = resultBuffer.GetAs2DArray();

                // Kept from the original implementation to guarantee the copy back has
                // completed before the host array is consumed.
                accelerator.Synchronize();

                return new FastMatrix(tempArray);
            }
        }
        /// <summary>
        /// Multiplies two matrices on the GPU.
        /// </summary>
        /// <remarks>
        /// Writes the elapsed time of each stage (copy, allocation, copy completion,
        /// execution, copy back) to the console.
        /// </remarks>
        /// <param name="one">The first (left) matrix.</param>
        /// <param name="two">The second (right) matrix.</param>
        /// <returns>
        /// The result of the multiplication; the result buffer is allocated with the first
        /// dimension of <paramref name="one"/> and the second dimension of
        /// <paramref name="two"/>.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="one"/> or <paramref name="two"/> is null.
        /// </exception>
        /// <exception cref="BadDimensionException">
        /// The second dimension of <paramref name="one"/> differs from the first dimension
        /// of <paramref name="two"/>.
        /// </exception>
        public FastMatrix Multiply(FastMatrix one, FastMatrix two)
        {
            // Checked separately so the exception names the offending argument.
            if (one == null)
            {
                throw new ArgumentNullException(nameof(one));
            }
            if (two == null)
            {
                throw new ArgumentNullException(nameof(two));
            }
            if (one.GetSize(1) != two.GetSize(0))
            {
                throw new BadDimensionException(one.GetSize(0), one.GetSize(1), two.GetSize(0),
                                                two.GetSize(1));
            }

            Stopwatch watch = Stopwatch.StartNew();

            // Start both uploads before allocating the result buffer; the WaitForCopy()
            // calls below block until they are done.
            one.CopyToGPU();
            two.CopyToGPU();
            Console.WriteLine($"Copy: {watch.ElapsedMilliseconds}ms");
            watch.Restart();

            // The device buffer is only needed until the result is copied back to the host;
            // disposing it here releases the GPU memory even if the kernel launch throws.
            using (MemoryBuffer2D<double> resultBuffer =
                       accelerator.Allocate<double>(one.GetSize(0), two.GetSize(1)))
            {
                Console.WriteLine($"Alloc: {watch.ElapsedMilliseconds}ms");
                watch.Restart();

                one.WaitForCopy();
                two.WaitForCopy();
                Console.WriteLine($"Finish copy: {watch.ElapsedMilliseconds}ms");
                watch.Restart();

                GPUMultKernel(resultBuffer.Extent, one.buffer.View, two.buffer.View,
                              resultBuffer.View);

                // Block until the kernel has finished so the timing covers execution.
                accelerator.Synchronize();
                Console.WriteLine($"Execute: {watch.ElapsedMilliseconds}ms");
                watch.Restart();

                var tempArray = resultBuffer.GetAs2DArray();

                // Kept from the original implementation to guarantee the copy back has
                // completed before it is timed and the host array is consumed.
                accelerator.Synchronize();
                Console.WriteLine($"Copy back: {watch.ElapsedMilliseconds}ms");

                return new FastMatrix(tempArray);
            }
        }
        /// <summary>
        /// Adds two matrices on the GPU.
        /// </summary>
        /// <remarks>
        /// Writes the elapsed time of each stage (copy, allocation, copy completion,
        /// execution, copy back) to the console.
        /// </remarks>
        /// <param name="one">The first matrix.</param>
        /// <param name="two">The second matrix.</param>
        /// <returns>The result of the addition.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="one"/> or <paramref name="two"/> is null.
        /// </exception>
        /// <exception cref="BadDimensionException">
        /// The two matrices do not have the same size in both dimensions.
        /// </exception>
        public FastMatrix Add(FastMatrix one, FastMatrix two)
        {
            // Checked separately so the exception names the offending argument.
            if (one == null)
            {
                throw new ArgumentNullException(nameof(one));
            }
            if (two == null)
            {
                throw new ArgumentNullException(nameof(two));
            }
            if ((one.GetSize(0) != two.GetSize(0)) || (one.GetSize(1) != two.GetSize(1)))
            {
                throw new BadDimensionException(one.GetSize(0), one.GetSize(1), two.GetSize(0),
                                                two.GetSize(1));
            }

            Stopwatch watch = Stopwatch.StartNew();

            // Start both uploads before allocating the result buffer; the WaitForCopy()
            // calls below block until they are done.
            one.CopyToGPU();
            two.CopyToGPU();
            Console.WriteLine($"Copy: {watch.ElapsedMilliseconds}ms");
            watch.Restart();

            // The device buffer is only needed until the result is copied back to the host;
            // disposing it here releases the GPU memory even if the kernel launch throws.
            using (MemoryBuffer2D<double> resultBuffer =
                       accelerator.Allocate<double>(one.GetSize(0), one.GetSize(1)))
            {
                Console.WriteLine($"Allocate: {watch.ElapsedMilliseconds}ms");
                watch.Restart();

                // NOTE(review): the original author marked this WaitForCopy() call on `one`
                // as currently not required and intended to replace it with a better
                // synchronisation scheme later; it is kept until that exists.
                one.WaitForCopy();
                two.WaitForCopy();
                Console.WriteLine($"Finish copy: {watch.ElapsedMilliseconds}ms");
                watch.Restart();

                GPUAddKernel(resultBuffer.Extent, one.buffer.View, two.buffer.View,
                             resultBuffer.View);

                // Block until the kernel has finished so the timing covers execution.
                accelerator.Synchronize();
                Console.WriteLine($"Execution: {watch.ElapsedMilliseconds}ms");
                watch.Restart();

                var tempArray = resultBuffer.GetAs2DArray();

                // Kept from the original implementation to guarantee the copy back has
                // completed before it is timed and the host array is consumed.
                accelerator.Synchronize();
                Console.WriteLine($"Copy back: {watch.ElapsedMilliseconds}ms");

                return new FastMatrix(tempArray);
            }
        }