Beispiel #1
0
        /// <summary>
        /// Minimal ManagedCuda round trip: loads "kernel.ptx", uploads a zero-filled
        /// float array of 10 elements, launches the kernel "_Z9addKernelPf" on it and
        /// copies the buffer back to the host.
        /// </summary>
        /// <remarks>
        /// The context and the device buffer are released deterministically, even when
        /// loading the module or launching the kernel throws.
        /// </remarks>
        public static void blaa()
        {
            int num = 10;

            // Create a new context; it is disposed when the block is left.
            using (CudaContext cntxt = new CudaContext())
            {
                // Load the module from the precompiled .ptx in the project output folder.
                CUmodule cumodule = cntxt.LoadModule("kernel.ptx");

                // "_Z9addKernelPf" is the mangled function name; it can be found in the *.ptx file.
                CudaKernel addWithCuda = new CudaKernel("_Z9addKernelPf", cumodule, cntxt);

                // Device array for the data, freed at the end of the using block.
                using (CudaDeviceVariable<float> vec1_device = new CudaDeviceVariable<float>(num))
                {
                    // Host array with the data (all zeros here).
                    float[] vec1 = new float[num];

                    // Copy data to the device.
                    vec1_device.CopyToDevice(vec1);

                    // Set grid and block dimensions.
                    // NOTE(review): this launches 8 * 512 threads for a 10-element buffer and the
                    // kernel receives no length argument - confirm the kernel cannot write past
                    // the end of the buffer.
                    addWithCuda.GridDimensions  = new dim3(8, 1, 1);
                    addWithCuda.BlockDimensions = new dim3(512, 1, 1);

                    // Run the kernel.
                    addWithCuda.Run(vec1_device.DevicePointer);

                    // Copy data back from the device.
                    vec1_device.CopyToHost(vec1);
                }
            }
        }
Beispiel #2
0
        //private CudaKernel kernel1;

        //public Class1()
        //{
        //    //int deviceID = 0;

        //    //CudaContext ctx = new CudaContext(deviceID);
        //    //CUmodule cumodule = ctx.LoadModulePTX(@"C:\work\Sobel\TestCuda\x64\Debug\kernel.ptx");
        //    //kernel1 = new CudaKernel("_Z9matrixSumPdS_iii", cumodule, ctx);
        //}

        /// <summary>
        /// Runs the "_Z9matrixSumPdS_iii" kernel over a stack of 2D matrices and returns
        /// the resulting 2D matrix.
        /// </summary>
        /// <param name="a">
        /// Array of 2D matrices. The X/Y sizes are taken from the first matrix, so the
        /// array must contain at least one element.
        /// </param>
        /// <returns>The kernel output, reshaped by <c>ToMultyArray</c> using the X dimension.</returns>
        public static double[,] TestMatrix(double[][,] a)
        {
            using (CudaContext ctx = new CudaContext(0))
            {
                CUmodule cumodule = ctx.LoadModule(@"C:\work\Sobel\TestCuda\x64\Debug\kernel.ptx");
                var      kernel   = new CudaKernel("_Z9matrixSumPdS_iii", cumodule, ctx);

                int dimZ = a.Length;
                int dimX = a[0].GetLength(0);
                int dimY = a[0].GetLength(1);

                // NOTE(review): the launch shape is hard-coded to 28 x 28 single-thread blocks
                // and does not follow dimX / dimY - confirm this matches the kernel's indexing.
                kernel.GridDimensions  = new dim3(28, 28, 1);
                kernel.BlockDimensions = new dim3(1, 1, 1);

                // Allocate the input in device memory (the assignment converts the flattened
                // host array into a device buffer) and an output buffer of dimX * dimY elements.
                // Both buffers are released when the using blocks are left, also on exceptions.
                using (CudaDeviceVariable <double> dA = a.ToLinearArray())
                using (CudaDeviceVariable <double> dC = new CudaDeviceVariable <double>(dimX * dimY))
                {
                    // Invoke kernel
                    kernel.Run(dA.DevicePointer, dC.DevicePointer, dimX, dimY, dimZ);

                    // Copy result from device memory to host memory
                    double[] c = dC;

                    return(ToMultyArray(c, dimX));
                }
            }
        }
Beispiel #3
0
        /// <summary>
        /// Creates a CUDA context, loads the PTX module from a fixed path and stores the
        /// "_Z6kerneliiPi" kernel in <c>addWithCuda</c>.
        /// </summary>
        public static void InitKernels()
        {
            // Alternative location used on another machine:
            // C:\Users\Michał\Documents\Visual Studio 2013\Projects\cuda\Projekt cuda\Projekt cuda\Debug\kernel.ptx
            const string ptxPath = @"D:\Grafika\cuda\Projekt cuda\Projekt cuda\Debug\kernel.ptx";

            var context = new CudaContext();
            var module  = context.LoadModule(ptxPath);

            addWithCuda = new CudaKernel("_Z6kerneliiPi", module, context);
        }
        /// <summary>
        /// Loads the particle simulation PTX module, creates the <c>updateParticles</c> and
        /// <c>generateParticles</c> kernels, uploads a table of random floats and registers
        /// the particle mesh buffers as CUDA/OpenGL interop resources.
        /// Logs an error and returns early when the PTX file does not exist.
        /// </summary>
        void InitKernels()
        {
            var path = @"..\..\..\CudaParticleSimulation\kernel.ptx";

            // Bail out before touching CUDA when the kernel file is missing.
            if (!System.IO.File.Exists(path))
            {
                Debug.Error(path + " doesnt exists");
                return;
            }

            var cntxt = new CudaContext();

            // Query the devices reported by cuGLGetDevices; the array holds at most 50 entries.
            // The return values of both native cuGL* calls below are not checked.
            uint deviceCount = 1;
            var  devices     = new CUdevice[50];

            OpenGLNativeMethods.CUDA3.cuGLGetDevices(ref deviceCount, devices, 50, CUGLDeviceList.All);

            // NOTE(review): 'context' is a local copy of cntxt.Context that is passed by ref;
            // whether the handle written by cuGLCtxCreate is ever seen by 'cntxt' is not
            // visible here - confirm.
            var context = cntxt.Context;

            OpenGLNativeMethods.CUDA3.cuGLCtxCreate(ref context, CUCtxFlags.BlockingSync, devices[0]);

            Debug.Info("Found " + deviceCount + " OpenGL devices associated with current context");


            CUmodule cumodule = cntxt.LoadModule(path);

            // Both kernels are launched with 256 blocks of 256 threads.
            updateParticles = new CudaKernel("updateParticles", cumodule, cntxt);
            updateParticles.BlockDimensions = new dim3(16 * 16, 1, 1);
            updateParticles.GridDimensions  = new dim3(16 * 16, 1, 1);


            generateParticles = new CudaKernel("generateParticles", cumodule, cntxt);
            generateParticles.BlockDimensions = updateParticles.BlockDimensions;
            generateParticles.GridDimensions  = updateParticles.GridDimensions;

            // Fill a host table with 1000 random floats and upload it to the
            // "randomFloats" constant variable of the module.
            var random       = new Random();
            var randomFloats = new float[1000];

            for (int i = 0; i < randomFloats.Length; i++)
            {
                randomFloats[i] = (float)random.NextDouble();
            }

            generateParticles.SetConstantVariable("randomFloats", randomFloats);

            // Re-register every buffer handle of the particle mesh as an interop resource
            // (see CudaGraphicsInteropResourceCollection).

            resources.Clear();
            foreach (var h in renderer.particleMesh.allBufferHandles)
            {
                var resoure = new CudaOpenGLBufferInteropResource(h, CUGraphicsRegisterFlags.None, CUGraphicsMapResourceFlags.None);
                resources.Add(resoure);
            }


            // Reset the random index to 0 and write that value to the device.
            randomIndex_D = 0;
            randomIndex_D.CopyToDevice(0);
        }
Beispiel #5
0
        /// <summary>
        /// Sets up <c>addWithCuda</c>: creates a context, loads the PTX module from a
        /// fixed path and looks up the kernel "_Z6kerneliiPi".
        /// </summary>
        public static void InitKernels()
        {
            // Previously used location:
            // C:\Users\Michał\Documents\Visual Studio 2013\Projects\cuda\Projekt cuda\Projekt cuda\Debug\kernel.ptx
            const string modulePath = @"D:\Grafika\cuda\Projekt cuda\Projekt cuda\Debug\kernel.ptx";

            var gpuContext   = new CudaContext();
            var loadedModule = gpuContext.LoadModule(modulePath);

            addWithCuda = new CudaKernel("_Z6kerneliiPi", loadedModule, gpuContext);
        }
Beispiel #6
0
        /// <summary>
        /// Loads the "kernel" function from the PTX module and configures its launch
        /// shape from <c>THREADS_PER_BLOCK</c> and <c>VECTOR_SIZE</c>.
        /// </summary>
        static void InitKernels()
        {
            var context = new CudaContext();
            var module  = context.LoadModule(@"..\..\..\TestManaged\Debug\kernel.ptx");

            // One extra block on top of the integer quotient.
            fillVectorWithCuda = new CudaKernel("kernel", module, context)
            {
                BlockDimensions = THREADS_PER_BLOCK,
                GridDimensions  = VECTOR_SIZE / THREADS_PER_BLOCK + 1,
            };
        }
Beispiel #7
0
        /// <summary>
        /// Loads the "simpleKernel" function from march3.ptx and uploads the edge and
        /// triangle lookup tables to the module's constant variables.
        /// </summary>
        /// <param name="args">Command line arguments (unused).</param>
        static void Main(string[] args)
        {
            // The context is only used inside this method, so release it deterministically
            // instead of leaving it to the finalizer.
            using (CudaContext cntxt = new CudaContext())
            {
                var cumodule = cntxt.LoadModule(@"..\..\..\Marcher\Debug\march3.ptx");
                var kernel   = new CudaKernel("simpleKernel", cumodule, cntxt);

                kernel.SetConstantVariable("d_edgeTable", Tables.EDGE_TABLE);
                kernel.SetConstantVariable("d_triTable", Tables.TRI_TABLE);
            }
        }
 /// <summary>
 /// Loads the given PTX file into the context and collects the function names
 /// announced by its "// .globl" lines.
 /// </summary>
 /// <param name="context">Context the module is loaded into.</param>
 /// <param name="file">Path of the PTX file; it is both loaded as a module and read as text.</param>
 public CudaModuleHelper(CudaContext context, string file)
 {
     const string marker = "// .globl";

     Context = context;
     Module  = context.LoadModule(file);
     PtxFile = file;

     // Keep only the marker lines and strip the marker plus surrounding whitespace.
     functionNames = (from line in File.ReadAllLines(file)
                      where line.Contains(marker)
                      select line.Replace(marker, "").Trim()).ToArray();
 }
Beispiel #9
0
        /// <summary>
        /// Resets the size settings, cleans up previous resources, creates a context on
        /// the device returned by <c>CudaContext.GetMaxGflopsDeviceId()</c> and loads the
        /// "_Z6kernel_" kernel.
        /// </summary>
        static void InitKernels()
        {
            // Original author's note: max thread number - 65534 x 256 = 16776704
            _matrixSize      = 256;
            _threadsPerBlock = 256;

            CleanUpResources();

            var deviceId = CudaContext.GetMaxGflopsDeviceId();
            _cnContext = new CudaContext(deviceId);

            var module = _cnContext.LoadModule(@"\Kernel\kernel.ptx");
            _multiplyTwoVectorWithCuda = new CudaKernel("_Z6kernel_", module, _cnContext);
        }
Beispiel #10
0
        /// <summary>
        /// Sets up the FFT plans for every level, records the largest sizes needed
        /// across all levels and loads the kernels from "kernel.ptx".
        /// </summary>
        /// <param name="aMaxWidth">Full-resolution width; divided by each level factor.</param>
        /// <param name="aMaxHeight">Full-resolution height; divided by each level factor.</param>
        /// <param name="aTileSizes">Tile size per level; its count drives both loops.</param>
        /// <param name="aMaxShifts">Maximum shift per level.</param>
        /// <param name="aLevels">Divisor per level; its count sizes the plan arrays.</param>
        /// <param name="ctx">Context used to load the kernel module.</param>
        public PatchTracker(int aMaxWidth, int aMaxHeight, List <int> aTileSizes, List <int> aMaxShifts, List <int> aLevels, CudaContext ctx)
        {
            int planCount = aLevels.Count;
            forward  = new CudaFFTPlanMany[planCount];
            backward = new CudaFFTPlanMany[planCount];

            // Create the FFT plans and remember the largest work buffer any of them reports.
            SizeT largestFftBuffer = 0;

            for (int level = 0; level < aTileSizes.Count; level++)
            {
                SizeT needed = InitFFT(level, aMaxWidth / aLevels[level], aMaxHeight / aLevels[level], aTileSizes[level], aMaxShifts[level]);
                largestFftBuffer = needed > largestFftBuffer ? needed : largestFftBuffer;
            }
            FTTBufferSize = largestFftBuffer;

            // Walk the levels again to find the maxima used for allocations.
            for (int level = 0; level < aTileSizes.Count; level++)
            {
                int levelWidth  = aMaxWidth / aLevels[level];
                int levelHeight = aMaxHeight / aLevels[level];

                // The Current* values read below are taken after these fields are set.
                currentWidth    = levelWidth;
                currentHeight   = levelHeight;
                currentTileSize = aTileSizes[level];
                currentMaxShift = aMaxShifts[level];

                int blocksX   = CurrentBlockCountX;
                int blocksY   = CurrentBlockCountY;
                int blockSize = CurrentBlockSize;
                int shiftSpan = 2 * currentMaxShift + 1;

                int shiftPixels = shiftSpan * shiftSpan * blocksX * blocksY;
                maxPixelsShiftImage = Math.Max(shiftPixels, maxPixelsShiftImage);

                int tilePixels = blockSize * blockSize * blocksX * blocksY;
                maxPixelsImage = Math.Max(tilePixels, maxPixelsImage);

                int halfWidth = blockSize / 2 + 1;
                int fftPixels = halfWidth * blockSize * blocksX * blocksY;
                maxPixelsFFT = Math.Max(fftPixels, maxPixelsFFT);

                maxWidth  = Math.Max(levelWidth, maxWidth);
                maxHeight = Math.Max(levelHeight, maxHeight);

                maxBlockCountX = Math.Max(maxBlockCountX, blocksX);
                maxBlockCountY = Math.Max(maxBlockCountY, blocksY);
            }

            // All kernel wrappers share one module.
            CUmodule kernelModule = ctx.LoadModule("kernel.ptx");

            conjKernel           = new conjugateComplexMulKernel(ctx, kernelModule);
            convertToTiles       = new convertToTilesOverlapKernel(ctx, kernelModule);
            convertToTilesBorder = new convertToTilesOverlapBorderKernel(ctx, kernelModule);
            squaredSumKernel     = new squaredSumKernel(ctx, kernelModule);
            boxFilterXKernel     = new boxFilterWithBorderXKernel(ctx, kernelModule);
            boxFilterYKernel     = new boxFilterWithBorderYKernel(ctx, kernelModule);
            normalizedCCKernel   = new normalizedCCKernel(ctx, kernelModule);
            findMinimumKernel    = new findMinimumKernel(ctx, kernelModule);
        }
Beispiel #11
0
        /// <summary>
        /// Creates a context on device 0, loads the module at <c>kernelPath</c> and
        /// prepares both Hilbert transform kernels with the same launch shape.
        /// </summary>
        static void InitKernels()
        {
            var gpu    = new CudaContext(0);
            var module = gpu.LoadModule(kernelPath);

            // Final transform kernel.
            var finalKernel = new CudaKernel("_Z23hilbert_final_transformP6float2S0_ii", module, gpu);
            finalKernel.BlockDimensions = MAX_THREADS_PER_BLOCK;
            finalKernel.GridDimensions  = NUM_RECORDS / MAX_THREADS_PER_BLOCK + 1;
            hilbertFinalTransformKernel = finalKernel;

            // Intermediate transform kernel.
            var intermediateKernel = new CudaKernel("_Z30hilbert_intermediate_transformP6float2ii", module, gpu);
            intermediateKernel.BlockDimensions = MAX_THREADS_PER_BLOCK;
            intermediateKernel.GridDimensions  = NUM_RECORDS / MAX_THREADS_PER_BLOCK + 1;
            hilbertIntermediateTransformKernel = intermediateKernel;
        }
Beispiel #12
0
        /// <summary>
        /// Takes the size from the image ROI, allocates the rotated tracking image,
        /// loads the kernels from "kernel.ptx" and creates one R2C and one C2R 2D FFT
        /// plan. <c>FFTBufferSize</c> is set to the larger of the two reported plan sizes.
        /// </summary>
        /// <param name="img">Image whose ROI width and height define the working size.</param>
        /// <param name="ctx">Context used to load the kernel module.</param>
        public PreAlignment(NPPImage_32fC1 img, CudaContext ctx)
        {
            width  = img.WidthRoi;
            height = img.HeightRoi;
            imgToTrackRotated = new NPPImage_32fC1(width, height);

            CUmodule kernelModule = ctx.LoadModule("kernel.ptx");

            int halfSpectrumWidth = width / 2 + 1;

            conjKernel          = new conjugateComplexMulKernel(ctx, kernelModule);
            fourierFilterKernel = new fourierFilterKernel(ctx, kernelModule);
            fftshiftKernel      = new fftshiftKernel(ctx, kernelModule);

            squaredSumKernel   = new squaredSumKernel(ctx, kernelModule);
            boxFilterXKernel   = new boxFilterWithBorderXKernel(ctx, kernelModule);
            boxFilterYKernel   = new boxFilterWithBorderYKernel(ctx, kernelModule);
            normalizedCCKernel = new normalizedCCKernel(ctx, kernelModule);
            findMinimumKernel  = new findMinimumKernel(ctx, kernelModule);

            // Plan layout: a single 2D transform of height x width.
            const int rank       = 2;
            const int batchCount = 1;
            const int realStride = 1;
            const int specStride = 1;

            int[] transformDims = { height, width };

            // The real-side layout is derived from the image pitch divided by 4.
            int[] realEmbed = { 1, imgToTrackRotated.Pitch / 4 };
            int[] specEmbed = { 1, halfSpectrumWidth };
            int   realDist  = height * imgToTrackRotated.Pitch / 4;
            int   specDist  = height * halfSpectrumWidth;

            cufftHandle forwardHandle  = cufftHandle.Create();
            cufftHandle backwardHandle = cufftHandle.Create();

            SizeT forwardSize  = new SizeT();
            SizeT backwardSize = new SizeT();

            forward  = new CudaFFTPlanMany(forwardHandle, rank, transformDims, batchCount, cufftType.R2C, realEmbed, realStride, realDist, specEmbed, specStride, specDist, ref forwardSize, false);
            backward = new CudaFFTPlanMany(backwardHandle, rank, transformDims, batchCount, cufftType.C2R, specEmbed, specStride, specDist, realEmbed, realStride, realDist, ref backwardSize, false);

            if (forwardSize > backwardSize)
            {
                FFTBufferSize = forwardSize;
            }
            else
            {
                FFTBufferSize = backwardSize;
            }
        }
        /// <summary>
        /// Loads the "proccess" kernel from kernel.cubin, sizes its launch for
        /// <c>DataGenerator.InputCount</c> items, allocates the device buffers and
        /// uploads the four input arrays.
        /// </summary>
        public override void Init()
        {
            const int threadsPerBlock = 256;

            int inputCount = DataGenerator.InputCount;

            var context = new CudaContext();
            var module  = context.LoadModule(@"kernel.cubin");

            myKernel = new CudaKernel("proccess", module, context);

            // Round the block count up so that every input item gets a thread.
            // Background reading:
            //   https://softwarehut.com/blog/general-purpose-computing-gpu-net-world-part-1/
            //   https://stackoverflow.com/questions/2392250/understanding-cuda-grid-dimensions-block-dimensions-and-threads-organization-s
            myKernel.GridDimensions  = (inputCount + threadsPerBlock - 1) / threadsPerBlock;
            myKernel.BlockDimensions = threadsPerBlock;

            // Device buffers for the inputs ...
            input1_dev = new CudaDeviceVariable <int>(DataGenerator.In1.Length);
            input2_dev = new CudaDeviceVariable <int>(DataGenerator.In2.Length);
            input3_dev = new CudaDeviceVariable <double>(DataGenerator.In3.Length);
            input4_dev = new CudaDeviceVariable <byte>(DataGenerator.In4_3_bytes.Length);

            // ... and for the outputs.
            result_dev     = new CudaDeviceVariable <byte>(resultsBytes.Length);
            resultCalc_dev = new CudaDeviceVariable <double>(calculatables.Length);

            // Upload the input data.
            input1_dev.CopyToDevice(DataGenerator.In1);
            input2_dev.CopyToDevice(DataGenerator.In2);
            input3_dev.CopyToDevice(DataGenerator.In3);
            input4_dev.CopyToDevice(DataGenerator.In4_3_bytes);
        }
Beispiel #14
0
 /// <summary>
 /// Remembers the context and loads the module found at <paramref name="path"/> into it.
 /// </summary>
 /// <param name="context">Context that the module is loaded into.</param>
 /// <param name="path">Path of the module file to load.</param>
 public KernelModule(CudaContext context, string path)
 {
     _module  = context.LoadModule(path);
     _context = context;
 }
        /// <summary>
        /// Compiles a small saxpy kernel at runtime, loads it from the generated PTX and
        /// then shows how several kernels can be loaded from one precompiled module.
        /// </summary>
        /// <remarks>
        /// The kernel is looked up under the name it is declared with in the source
        /// ("saxpy"). The compiler and the scratch device buffer are disposed when their
        /// scopes end.
        /// </remarks>
        public void Compile()
        {
            using (var ctx = new CudaContext())
            {
                // With a verbatim string (@) only double quotes have to be doubled up: no other escaping.
                string source = @"
                extern ""C"" __global__
                void saxpy(float a, float *x, float *y, float *out, size_t n)
                {
                    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
                    if (tid < n)
                    {
                        out[tid] = a * x[tid] + y[tid];
                    }
                }
                ";

                source += Environment.NewLine;

                var name = "Test";
                var headers = new string[0];
                var includeNames = new string[0];

                using (var compiler = new CudaRuntimeCompiler(source, name, headers, includeNames))
                {
                    // Other options that were tried: --ptxas-options=-v -keep
                    compiler.Compile(new string[] { "-G" });

                    //var ptxString = compiler.GetPTXAsString(); // for debugging

                    var ptx = compiler.GetPTX();

                    // The function is declared extern "C", so its name in the PTX is exactly "saxpy".
                    var kernel = ctx.LoadKernelPTX(ptx, "saxpy");

                    //One kernel per cu file:
                    //CudaKernel kernel = ctx.LoadKernel(@"path\to\kernel.ptx", "kernelname");
                    kernel.GridDimensions = new dim3(1, 1, 1);
                    kernel.BlockDimensions = new dim3(16, 16);

                    //kernel.Run()
                }

                // Scratch device buffer; it is not passed to any kernel here.
                using (var a = new CudaDeviceVariable<double>(100))
                {
                    //ManagedCuda.NPP.NPPsExtensions.NPPsExtensionMethods.Sqr()

                    //Multiple kernels per cu file:
                    // NOTE(review): the module path and kernel names below are placeholders.
                    CUmodule cumodule = ctx.LoadModule(@"path\to\kernel.ptx");
                    CudaKernel kernel1 = new CudaKernel("kernel1", cumodule, ctx)
                    {
                        GridDimensions = new dim3(1, 1, 1),
                        BlockDimensions = new dim3(16, 16),
                    };
                    CudaKernel kernel2 = new CudaKernel("kernel2", cumodule, ctx)
                    {
                        GridDimensions = new dim3(1, 1, 1),
                        BlockDimensions = new dim3(16, 16),
                    };
                }
            }
        }
Beispiel #16
0
 /// <summary>
 /// Creates a context, loads the PTX module from a fixed path and stores the
 /// "_Z6kerneliiPi" kernel in <c>addWithCuda</c>.
 /// </summary>
 static void InitKernels()
 {
     const string ptxPath = @"C:\Users\Niels\Documents\uni ting\P10\P10\programs\small programs\CUDA 1D MA in C Sharp\CUDA 1D MA in C Sharp\Debug\kernel.ptx";

     var context = new CudaContext();
     var module  = context.LoadModule(ptxPath);

     addWithCuda = new CudaKernel("_Z6kerneliiPi", module, context);
 }
Beispiel #17
0
        /// <summary>
        /// Loads the "kernel" function from a PTX file, builds a layered topology with
        /// matching activation and weight matrices, runs the kernel ten times and prints
        /// the elapsed stopwatch ticks together with the first result value.
        /// </summary>
        /// <param name="args">Command line arguments (unused).</param>
        static void Main(string[] args)
        {
            // NOTE: You need to change this location to match your own machine.
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine("NOTE: You must change the kernel location before running this project so it matches your own environment.");
            Console.ResetColor();
            System.Threading.Thread.Sleep(500);

            string path = @"X:\MachineLearning\CUDAGraph-2\CUDAGraph_Kernel\Debug\kernel.cu.ptx";
            CudaContext ctx = new CudaContext();
            CUmodule module = ctx.LoadModule(path);
            kernel = new CudaKernel("kernel", module, ctx);

            // Launch shape: THREADS_PER_BLOCK threads per block and one block more than
            // the integer quotient VECTOR_SIZE / THREADS_PER_BLOCK.
            kernel.BlockDimensions = THREADS_PER_BLOCK;
            kernel.GridDimensions = VECTOR_SIZE / THREADS_PER_BLOCK + 1;

            // Create the topology: one entry per layer.
            int[] topology = new int[] { 1, 200, 200, 100, 1 };

            // The matrices have one row per layer and are as wide as the widest layer.
            int height = topology.Length;
            int width = 0;

            for (int i = 0; i < topology.Length; i++)
                if (width < topology[i]) width = topology[i];

            // Launch!
            float[] res = new float[height * width];
            for (int i = 0; i < 10; i++)
            {
                float[] matrix = new float[height * width];
                float[] weights = new float[height * width];

                // Fixed seed: every iteration works on the same weights.
                Random rand = new Random(424242);
                for (int y = 0; y < height; y++)
                {
                    for (int x = 0; x < width; x++)
                    {
                        matrix[y * width + x] = (y == 0 && x < topology[y]) ? 1.0f : 0;
                        weights[y * width + x] = (x < topology[y]) ? (float)(rand.NextDouble() - rand.NextDouble()) : 0;
                    }
                }

                // Upload the inputs. The buffers are released at the end of every iteration;
                // without that, each pass would leave three device allocations behind.
                using (CudaDeviceVariable<int> cuda_topology = topology)
                using (CudaDeviceVariable<float> cuda_membank = matrix)
                using (CudaDeviceVariable<float> cuda_weights = weights)
                {
                    // The measured span covers the kernel run and the copy back to the host.
                    Stopwatch sw = new Stopwatch();
                    sw.Start();
                    kernel.Run(cuda_topology.DevicePointer, cuda_membank.DevicePointer, cuda_weights.DevicePointer, height, width);
                    cuda_membank.CopyToHost(res);
                    sw.Stop();

                    Console.ForegroundColor = ConsoleColor.Green;
                    Console.WriteLine("{0} ticks to compute -> {1}", sw.ElapsedTicks, res[0]);
                    Console.ResetColor();
                }
            }

            Console.ReadKey();
        }
        /// <summary>
        /// Compiles a small saxpy kernel at runtime, loads it from the generated PTX and
        /// then shows how several kernels can be loaded from one precompiled module.
        /// </summary>
        /// <remarks>
        /// The kernel is looked up under the name it is declared with in the source
        /// ("saxpy"). The compiler and the scratch device buffer are disposed when their
        /// scopes end.
        /// </remarks>
        public void Compile()
        {
            using (var ctx = new CudaContext())
            {
                // With a verbatim string (@) only double quotes have to be doubled up: no other escaping.
                string source = @"
                extern ""C"" __global__ 
                void saxpy(float a, float *x, float *y, float *out, size_t n)
                { 
	                size_t tid = blockIdx.x * blockDim.x + threadIdx.x; 
	                if (tid < n) 
	                { 
		                out[tid] = a * x[tid] + y[tid]; 
	                } 
                }
                ";

                source += Environment.NewLine;

                var name         = "Test";
                var headers      = new string[0];
                var includeNames = new string[0];

                using (var compiler = new CudaRuntimeCompiler(source, name, headers, includeNames))
                {
                    // Other options that were tried: --ptxas-options=-v -keep
                    compiler.Compile(new string[] { "-G" });

                    //var ptxString = compiler.GetPTXAsString(); // for debugging

                    var ptx = compiler.GetPTX();

                    // The function is declared extern "C", so its name in the PTX is exactly "saxpy".
                    var kernel = ctx.LoadKernelPTX(ptx, "saxpy");

                    //One kernel per cu file:
                    //CudaKernel kernel = ctx.LoadKernel(@"path\to\kernel.ptx", "kernelname");
                    kernel.GridDimensions  = new dim3(1, 1, 1);
                    kernel.BlockDimensions = new dim3(16, 16);

                    //kernel.Run()
                }

                // Scratch device buffer; it is not passed to any kernel here.
                using (var a = new CudaDeviceVariable <double>(100))
                {
                    //ManagedCuda.NPP.NPPsExtensions.NPPsExtensionMethods.Sqr()

                    //Multiple kernels per cu file:
                    // NOTE(review): the module path and kernel names below are placeholders.
                    CUmodule   cumodule = ctx.LoadModule(@"path\to\kernel.ptx");
                    CudaKernel kernel1  = new CudaKernel("kernel1", cumodule, ctx)
                    {
                        GridDimensions  = new dim3(1, 1, 1),
                        BlockDimensions = new dim3(16, 16),
                    };
                    CudaKernel kernel2 = new CudaKernel("kernel2", cumodule, ctx)
                    {
                        GridDimensions  = new dim3(1, 1, 1),
                        BlockDimensions = new dim3(16, 16),
                    };
                }
            }
        }
Beispiel #19
0
        /// <summary>
        /// Loads the "kernel" function from a PTX file, builds a layered topology with
        /// matching activation and weight matrices, runs the kernel ten times and prints
        /// the elapsed stopwatch ticks together with the first result value.
        /// </summary>
        /// <param name="args">Command line arguments (unused).</param>
        static void Main(string[] args)
        {
            // NOTE: You need to change this location to match your own machine.
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine("NOTE: You must change the kernel location before running this project so it matches your own environment.");
            Console.ResetColor();
            System.Threading.Thread.Sleep(500);

            string      path   = @"X:\MachineLearning\CUDAGraph-2\CUDAGraph_Kernel\Debug\kernel.cu.ptx";
            CudaContext ctx    = new CudaContext();
            CUmodule    module = ctx.LoadModule(path);

            kernel = new CudaKernel("kernel", module, ctx);

            // Launch shape: THREADS_PER_BLOCK threads per block and one block more than
            // the integer quotient VECTOR_SIZE / THREADS_PER_BLOCK.
            kernel.BlockDimensions = THREADS_PER_BLOCK;
            kernel.GridDimensions  = VECTOR_SIZE / THREADS_PER_BLOCK + 1;

            // Create the topology: one entry per layer.
            int[] topology = new int[] { 1, 200, 200, 100, 1 };

            // The matrices have one row per layer and are as wide as the widest layer.
            int height = topology.Length;
            int width  = 0;

            for (int i = 0; i < topology.Length; i++)
            {
                if (width < topology[i])
                {
                    width = topology[i];
                }
            }

            // Launch!
            float[] res = new float[height * width];
            for (int i = 0; i < 10; i++)
            {
                float[] matrix  = new float[height * width];
                float[] weights = new float[height * width];

                // Fixed seed: every iteration works on the same weights.
                Random rand = new Random(424242);
                for (int y = 0; y < height; y++)
                {
                    for (int x = 0; x < width; x++)
                    {
                        matrix[y * width + x]  = (y == 0 && x < topology[y]) ? 1.0f : 0;
                        weights[y * width + x] = (x < topology[y]) ? (float)(rand.NextDouble() - rand.NextDouble()) : 0;
                    }
                }

                // Upload the inputs. The buffers are released at the end of every iteration;
                // without that, each pass would leave three device allocations behind.
                using (CudaDeviceVariable <int>   cuda_topology = topology)
                using (CudaDeviceVariable <float> cuda_membank  = matrix)
                using (CudaDeviceVariable <float> cuda_weights  = weights)
                {
                    // The measured span covers the kernel run and the copy back to the host.
                    Stopwatch sw = new Stopwatch();
                    sw.Start();
                    kernel.Run(cuda_topology.DevicePointer, cuda_membank.DevicePointer, cuda_weights.DevicePointer, height, width);
                    cuda_membank.CopyToHost(res);
                    sw.Stop();

                    Console.ForegroundColor = ConsoleColor.Green;
                    Console.WriteLine("{0} ticks to compute -> {1}", sw.ElapsedTicks, res[0]);
                    Console.ResetColor();
                }
            }

            Console.ReadKey();
        }
        /// <summary>
        /// Fills a Resolution x Resolution complex array with a mean-free ramp, runs a
        /// batched forward C2C FFT on the device, post-processes the spectrum with the
        /// "cu_ArrayInversion" kernel and stores real and imaginary parts in
        /// <c>FFTData2D</c>.
        /// </summary>
        /// <remarks>
        /// The context, both device buffers and the FFT plan are released through
        /// <c>using</c> scopes, so they are also freed when any step throws.
        /// </remarks>
        public void cuFFTreconstruct()
        {
            using (CudaContext ctx = new CudaContext(0))
            {
                ManagedCuda.BasicTypes.CUmodule cumodule = ctx.LoadModule("kernel.ptx");
                CudaKernel cuKernel = new CudaKernel("cu_ArrayInversion", cumodule, ctx);

                float2[] fData  = new float2[Resolution * Resolution];
                float2[] result = new float2[Resolution * Resolution];
                FFTData2D = new float[Resolution, Resolution, 2];

                using (CudaDeviceVariable <float2> devData      = new CudaDeviceVariable <float2>(Resolution * Resolution))
                using (CudaDeviceVariable <float2> copy_devData = new CudaDeviceVariable <float2>(Resolution * Resolution))
                {
                    int    i, j;
                    double avrg = 0.0;

                    // Real part: the ramp i + 2 * j; imaginary part: zero. Sum up for the mean.
                    for (i = 0; i < Resolution; i++)
                    {
                        for (j = 0; j < Resolution; j++)
                        {
                            fData[i * Resolution + j].x = i + j * 2;
                            avrg += fData[i * Resolution + j].x;
                            fData[i * Resolution + j].y = 0.0f;
                        }
                    }

                    avrg = avrg / (double)(Resolution * Resolution);

                    // Subtract the mean from every real part.
                    for (i = 0; i < Resolution; i++)
                    {
                        for (j = 0; j < Resolution; j++)
                        {
                            fData[(i * Resolution + j)].x = fData[(i * Resolution + j)].x - (float)avrg;
                        }
                    }

                    devData.CopyToDevice(fData);

                    using (CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution, cufftType.C2C, Resolution))
                    {
                        // In-place forward transform of the uploaded data.
                        plan1D.Exec(devData.DevicePointer, TransformDirection.Forward);

                        // NOTE(review): Resolution / cuda_blockNum is an integer division; if
                        // Resolution is not a multiple of cuda_blockNum the grid is rounded
                        // down - confirm the kernel still covers every element.
                        cuKernel.GridDimensions  = new ManagedCuda.VectorTypes.dim3(Resolution / cuda_blockNum, Resolution, 1);
                        cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(cuda_blockNum, 1, 1);

                        cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution);

                        copy_devData.CopyToHost(result);
                    }

                    // Split the complex result into the real (0) and imaginary (1) planes.
                    for (i = 0; i < Resolution; i++)
                    {
                        for (j = 0; j < Resolution; j++)
                        {
                            FFTData2D[i, j, 0] = result[i * Resolution + j].x;
                            FFTData2D[i, j, 1] = result[i * Resolution + j].y;
                        }
                    }
                }

                // Stop the profiler before the context goes away, as the original did.
                CudaContext.ProfilerStop();
            }
        }