/// <summary>
/// Loads <c>kernel.ptx</c>, launches the <c>_Z9addKernelPf</c> kernel on a
/// ten-element float buffer and copies the buffer back to the host.
/// </summary>
/// <remarks>
/// The host copy of the result is local to this method and is discarded on return.
/// The context and the device buffer are released deterministically, including
/// when the load or the launch throws.
/// </remarks>
public static void blaa()
{
    int num = 10;

    // New context creation; everything allocated below lives inside this context.
    using (CudaContext cntxt = new CudaContext())
    {
        // Module loading from precompiled .ptx in the project output folder.
        CUmodule cumodule = cntxt.LoadModule("kernel.ptx");

        // _Z9addKernelPf - function name, can be found in the *.ptx file.
        CudaKernel addWithCuda = new CudaKernel("_Z9addKernelPf", cumodule, cntxt);

        // Device array for the data; disposed before the owning context.
        using (CudaDeviceVariable<float> vec1_device = new CudaDeviceVariable<float>(num))
        {
            // Host array with the data (all zeros) copied to the device.
            float[] vec1 = new float[num];
            vec1_device.CopyToDevice(vec1);

            // Grid and block dimensions.
            // NOTE(review): 8 x 512 threads are launched over a 10-element buffer and
            // no length is passed to the kernel - confirm the kernel bounds its writes.
            addWithCuda.GridDimensions = new dim3(8, 1, 1);
            addWithCuda.BlockDimensions = new dim3(512, 1, 1);

            // Run the kernel.
            addWithCuda.Run(vec1_device.DevicePointer);

            // Copy data back from the device.
            vec1_device.CopyToHost(vec1);
        }
    }
}
//private CudaKernel kernel1;
//public Class1()
//{
//    //int deviceID = 0;
//    //CudaContext ctx = new CudaContext(deviceID);
//    //CUmodule cumodule = ctx.LoadModulePTX(@"C:\work\Sobel\TestCuda\x64\Debug\kernel.ptx");
//    //kernel1 = new CudaKernel("_Z9matrixSumPdS_iii", cumodule, ctx);
//}

/// <summary>
/// Runs the <c>_Z9matrixSumPdS_iii</c> kernel over a stack of matrices and returns
/// the result as a two-dimensional array.
/// </summary>
/// <param name="a">
/// Stack of matrices. It must contain at least one element because the dimensions
/// are read from <c>a[0]</c>.
/// </param>
/// <returns>
/// The <c>dimX * dimY</c> values produced by the kernel, reshaped by
/// <c>ToMultyArray</c>.
/// </returns>
/// <remarks>
/// The two device buffers are disposed before the context that owns them, so no
/// device allocation outlives the context.
/// </remarks>
public static double[,] TestMatrix(double[][,] a)
{
    using (CudaContext ctx = new CudaContext(0))
    {
        CUmodule cumodule = ctx.LoadModule(@"C:\work\Sobel\TestCuda\x64\Debug\kernel.ptx");
        var kernel = new CudaKernel("_Z9matrixSumPdS_iii", cumodule, ctx);

        int dimZ = a.Length;
        int dimX = a[0].GetLength(0);
        int dimY = a[0].GetLength(1);

        // NOTE(review): the 28 x 28 grid is fixed and independent of dimX/dimY - confirm
        // against the kernel before using other matrix sizes.
        kernel.GridDimensions = new dim3(28, 28, 1);
        kernel.BlockDimensions = new dim3(1, 1, 1);
        //kernel.BlockDimensions = new dim3(dimX, dimY, 1);

        // Allocate vectors in device memory and copy the input from host to device.
        using (CudaDeviceVariable<double> dA = a.ToLinearArray())
        using (CudaDeviceVariable<double> dC = new CudaDeviceVariable<double>(dimX * dimY))
        {
            // Invoke kernel.
            kernel.Run(dA.DevicePointer, dC.DevicePointer, dimX, dimY, dimZ);

            // Copy result from device memory to host memory.
            double[] c = dC;

            return ToMultyArray(c, dimX);
        }
    }
}
/// <summary>
/// Creates a CUDA context, loads the precompiled PTX module and binds its
/// <c>_Z6kerneliiPi</c> entry point to <c>addWithCuda</c>.
/// </summary>
public static void InitKernels()
{
    // Module location previously used on another machine:
    // @"C:\Users\Michał\Documents\Visual Studio 2013\Projects\cuda\Projekt cuda\Projekt cuda\Debug\kernel.ptx"
    const string ptxPath = @"D:\Grafika\cuda\Projekt cuda\Projekt cuda\Debug\kernel.ptx";

    // The context is not disposed here; the kernel stored in addWithCuda was created against it.
    var context = new CudaContext();
    var module = context.LoadModule(ptxPath);

    addWithCuda = new CudaKernel("_Z6kerneliiPi", module, context);
}
/// <summary>
/// Loads the particle-simulation PTX module, configures the <c>updateParticles</c>
/// and <c>generateParticles</c> kernels, uploads a table of random floats and
/// registers the renderer's OpenGL buffers as CUDA interop resources.
/// </summary>
/// <remarks>
/// Logs an error and returns without doing anything when the PTX file is missing.
/// </remarks>
void InitKernels()
{
    const int maxGlDevices = 50;
    var ptxPath = @"..\..\..\CudaParticleSimulation\kernel.ptx";

    if (!System.IO.File.Exists(ptxPath))
    {
        Debug.Error(ptxPath + " doesnt exists");
        return;
    }

    var cudaContext = new CudaContext();

    // Query the devices associated with OpenGL and create a GL-enabled context on the first one.
    uint glDeviceCount = 1;
    var glDevices = new CUdevice[maxGlDevices];
    OpenGLNativeMethods.CUDA3.cuGLGetDevices(ref glDeviceCount, glDevices, maxGlDevices, CUGLDeviceList.All);

    var nativeContext = cudaContext.Context;
    OpenGLNativeMethods.CUDA3.cuGLCtxCreate(ref nativeContext, CUCtxFlags.BlockingSync, glDevices[0]);
    Debug.Info("Found " + glDeviceCount + " OpenGL devices associated with current context");

    CUmodule module = cudaContext.LoadModule(ptxPath);

    // Both kernels run with the same launch configuration: 256 blocks of 256 threads.
    updateParticles = new CudaKernel("updateParticles", module, cudaContext);
    updateParticles.BlockDimensions = new dim3(16 * 16, 1, 1);
    updateParticles.GridDimensions = new dim3(16 * 16, 1, 1);

    generateParticles = new CudaKernel("generateParticles", module, cudaContext);
    generateParticles.BlockDimensions = updateParticles.BlockDimensions;
    generateParticles.GridDimensions = updateParticles.GridDimensions;

    // Table of 1000 uniform random floats exposed to the kernel as "randomFloats".
    var rng = new Random();
    var noise = new float[1000];
    for (int slot = 0; slot < noise.Length; slot++)
    {
        noise[slot] = (float)rng.NextDouble();
    }

    generateParticles.SetConstantVariable("randomFloats", noise);

    // Rebuild the interop resource collection from the particle mesh buffer handles.
    resources.Clear();
    foreach (var bufferHandle in renderer.particleMesh.allBufferHandles)
    {
        var resource = new CudaOpenGLBufferInteropResource(
            bufferHandle,
            CUGraphicsRegisterFlags.None,
            CUGraphicsMapResourceFlags.None);
        resources.Add(resource);
    }

    // Reset the device-side random index to zero.
    randomIndex_D = 0;
    randomIndex_D.CopyToDevice(0);
}
/// <summary>
/// Creates a CUDA context, loads the <c>kernel</c> entry point into
/// <c>fillVectorWithCuda</c> and sets its launch configuration from
/// <c>THREADS_PER_BLOCK</c> and <c>VECTOR_SIZE</c>.
/// </summary>
static void InitKernels()
{
    var context = new CudaContext();
    var module = context.LoadModule(@"..\..\..\TestManaged\Debug\kernel.ptx");

    fillVectorWithCuda = new CudaKernel("kernel", module, context);

    // One more block than the integer quotient, so a partial last block is covered.
    fillVectorWithCuda.BlockDimensions = THREADS_PER_BLOCK;
    fillVectorWithCuda.GridDimensions = VECTOR_SIZE / THREADS_PER_BLOCK + 1;
}
/// <summary>
/// Loads the <c>simpleKernel</c> entry point from <c>march3.ptx</c> and uploads the
/// edge and triangle lookup tables to the module's constant variables.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // The context is only used inside this method, so release it deterministically.
    using (CudaContext cntxt = new CudaContext())
    {
        var cumodule = cntxt.LoadModule(@"..\..\..\Marcher\Debug\march3.ptx");
        var kernel = new CudaKernel("simpleKernel", cumodule, cntxt);

        kernel.SetConstantVariable("d_edgeTable", Tables.EDGE_TABLE);
        kernel.SetConstantVariable("d_triTable", Tables.TRI_TABLE);
    }
}
/// <summary>
/// Loads a PTX module into <paramref name="context"/> and collects the function
/// names announced by the file's <c>// .globl</c> lines.
/// </summary>
/// <param name="context">Context the module is loaded into.</param>
/// <param name="file">Path of the PTX file; it is also read as text.</param>
public CudaModuleHelper(CudaContext context, string file)
{
    const string globalMarker = "// .globl";

    Context = context;
    Module = context.LoadModule(file);
    PtxFile = file;

    // Every line carrying the marker contributes one name: the line with the marker removed, trimmed.
    functionNames = (from line in File.ReadAllLines(file)
                     where line.Contains(globalMarker)
                     select line.Replace(globalMarker, "").Trim()).ToArray();
}
/// <summary>
/// Resets the matrix and block sizes, releases previously held resources via
/// <c>CleanUpResources</c>, then creates a context on the device reported by
/// <c>CudaContext.GetMaxGflopsDeviceId</c> and loads the <c>_Z6kernel_</c> kernel.
/// </summary>
static void InitKernels()
{
    // max thread number - 65534x256=16776704
    _matrixSize = 256;
    _threadsPerBlock = 256;

    CleanUpResources();

    int deviceId = CudaContext.GetMaxGflopsDeviceId();
    _cnContext = new CudaContext(deviceId);

    var module = _cnContext.LoadModule(@"\Kernel\kernel.ptx");
    _multiplyTwoVectorWithCuda = new CudaKernel("_Z6kernel_", module, _cnContext);
}
/// <summary>
/// Initialises the FFT plans for every level, records the largest buffer sizes
/// needed across all levels and loads the kernels from <c>kernel.ptx</c>.
/// </summary>
/// <param name="aMaxWidth">Full-resolution width; divided by each level factor.</param>
/// <param name="aMaxHeight">Full-resolution height; divided by each level factor.</param>
/// <param name="aTileSizes">Tile size per level; its count drives both loops.</param>
/// <param name="aMaxShifts">Maximum shift per level.</param>
/// <param name="aLevels">Divisor per level; its count sizes the plan arrays.</param>
/// <param name="ctx">Context used to load the module and create the kernels.</param>
public PatchTracker(int aMaxWidth, int aMaxHeight, List<int> aTileSizes, List<int> aMaxShifts, List<int> aLevels, CudaContext ctx)
{
    forward = new CudaFFTPlanMany[aLevels.Count];
    backward = new CudaFFTPlanMany[aLevels.Count];

    // Allocate FFT plans and remember the largest size InitFFT reports.
    SizeT largestFftBuffer = 0;
    for (int level = 0; level < aTileSizes.Count; level++)
    {
        int levelWidth = aMaxWidth / aLevels[level];
        int levelHeight = aMaxHeight / aLevels[level];

        SizeT required = InitFFT(level, levelWidth, levelHeight, aTileSizes[level], aMaxShifts[level]);
        if (required > largestFftBuffer)
        {
            largestFftBuffer = required;
        }
    }

    FTTBufferSize = largestFftBuffer;

    // Find the maxima used for allocations. The Current* members are read after the
    // current* fields are set for the level in question.
    for (int level = 0; level < aTileSizes.Count; level++)
    {
        int levelWidth = aMaxWidth / aLevels[level];
        int levelHeight = aMaxHeight / aLevels[level];

        currentWidth = levelWidth;
        currentHeight = levelHeight;
        currentTileSize = aTileSizes[level];
        currentMaxShift = aMaxShifts[level];

        int shiftImagePixels = (2 * currentMaxShift + 1) * (2 * currentMaxShift + 1) * CurrentBlockCountX * CurrentBlockCountY;
        maxPixelsShiftImage = Math.Max(shiftImagePixels, maxPixelsShiftImage);

        int tiledImagePixels = CurrentBlockSize * CurrentBlockSize * CurrentBlockCountX * CurrentBlockCountY;
        maxPixelsImage = Math.Max(tiledImagePixels, maxPixelsImage);

        int halfSpectrumWidth = CurrentBlockSize / 2 + 1;
        int spectrumPixels = halfSpectrumWidth * CurrentBlockSize * CurrentBlockCountX * CurrentBlockCountY;
        maxPixelsFFT = Math.Max(spectrumPixels, maxPixelsFFT);

        maxWidth = Math.Max(levelWidth, maxWidth);
        maxHeight = Math.Max(levelHeight, maxHeight);
        maxBlockCountX = Math.Max(maxBlockCountX, CurrentBlockCountX);
        maxBlockCountY = Math.Max(maxBlockCountY, CurrentBlockCountY);
    }

    // All kernels come from the same module.
    CUmodule kernelModule = ctx.LoadModule("kernel.ptx");

    conjKernel = new conjugateComplexMulKernel(ctx, kernelModule);
    convertToTiles = new convertToTilesOverlapKernel(ctx, kernelModule);
    convertToTilesBorder = new convertToTilesOverlapBorderKernel(ctx, kernelModule);
    squaredSumKernel = new squaredSumKernel(ctx, kernelModule);
    boxFilterXKernel = new boxFilterWithBorderXKernel(ctx, kernelModule);
    boxFilterYKernel = new boxFilterWithBorderYKernel(ctx, kernelModule);
    normalizedCCKernel = new normalizedCCKernel(ctx, kernelModule);
    findMinimumKernel = new findMinimumKernel(ctx, kernelModule);
}
/// <summary>
/// Creates a context on device 0, loads the module at <c>kernelPath</c> and prepares
/// the two Hilbert-transform kernels with identical launch configurations.
/// </summary>
static void InitKernels()
{
    var ctx = new CudaContext(0);
    var module = ctx.LoadModule(kernelPath);

    // Each kernel is fully configured before it is published to its field.
    var finalTransform = new CudaKernel("_Z23hilbert_final_transformP6float2S0_ii", module, ctx);
    finalTransform.BlockDimensions = MAX_THREADS_PER_BLOCK;
    finalTransform.GridDimensions = NUM_RECORDS / MAX_THREADS_PER_BLOCK + 1;
    hilbertFinalTransformKernel = finalTransform;

    var intermediateTransform = new CudaKernel("_Z30hilbert_intermediate_transformP6float2ii", module, ctx);
    intermediateTransform.BlockDimensions = MAX_THREADS_PER_BLOCK;
    intermediateTransform.GridDimensions = NUM_RECORDS / MAX_THREADS_PER_BLOCK + 1;
    hilbertIntermediateTransformKernel = intermediateTransform;
}
/// <summary>
/// Sizes the working image from the ROI of <paramref name="img"/>, loads the kernels
/// from <c>kernel.ptx</c> and creates the forward (R2C) and backward (C2R) 2D FFT plans.
/// </summary>
/// <param name="img">Image whose ROI width and height define the working size.</param>
/// <param name="ctx">Context used to load the module and create the kernels.</param>
public PreAlignment(NPPImage_32fC1 img, CudaContext ctx)
{
    width = img.WidthRoi;
    height = img.HeightRoi;
    imgToTrackRotated = new NPPImage_32fC1(width, height);

    CUmodule kernelModule = ctx.LoadModule("kernel.ptx");
    int fftWidth = width / 2 + 1;

    conjKernel = new conjugateComplexMulKernel(ctx, kernelModule);
    fourierFilterKernel = new fourierFilterKernel(ctx, kernelModule);
    fftshiftKernel = new fftshiftKernel(ctx, kernelModule);
    squaredSumKernel = new squaredSumKernel(ctx, kernelModule);
    boxFilterXKernel = new boxFilterWithBorderXKernel(ctx, kernelModule);
    boxFilterYKernel = new boxFilterWithBorderYKernel(ctx, kernelModule);
    normalizedCCKernel = new normalizedCCKernel(ctx, kernelModule);
    findMinimumKernel = new findMinimumKernel(ctx, kernelModule);

    // Plan layout: a single 2D transform. The real side uses the image pitch divided
    // by 4 as its row length; the complex side uses fftWidth.
    const int rank = 2;
    const int batchCount = 1;
    const int realStride = 1;
    const int complexStride = 1;

    int[] extents = { height, width };
    int[] realEmbed = { 1, imgToTrackRotated.Pitch / 4 };
    int[] complexEmbed = { 1, fftWidth };
    int realDistance = height * imgToTrackRotated.Pitch / 4;
    int complexDistance = height * fftWidth;

    cufftHandle forwardHandle = cufftHandle.Create();
    cufftHandle backwardHandle = cufftHandle.Create();
    SizeT forwardSize = new SizeT();
    SizeT backwardSize = new SizeT();

    forward = new CudaFFTPlanMany(
        forwardHandle, rank, extents, batchCount, cufftType.R2C,
        realEmbed, realStride, realDistance,
        complexEmbed, complexStride, complexDistance,
        ref forwardSize, false);

    backward = new CudaFFTPlanMany(
        backwardHandle, rank, extents, batchCount, cufftType.C2R,
        complexEmbed, complexStride, complexDistance,
        realEmbed, realStride, realDistance,
        ref backwardSize, false);

    // Keep the larger of the two sizes reported by the plans.
    if (forwardSize > backwardSize)
    {
        FFTBufferSize = forwardSize;
    }
    else
    {
        FFTBufferSize = backwardSize;
    }
}
/// <summary>
/// Loads the <c>proccess</c> kernel from <c>kernel.cubin</c>, sets a one-dimensional
/// launch configuration that covers <c>DataGenerator.InputCount</c> items, and
/// allocates and uploads the device buffers.
/// </summary>
public override void Init()
{
    const int threadsPerBlock = 256;
    int inputCount = DataGenerator.InputCount;

    var context = new CudaContext();
    var module = context.LoadModule(@"kernel.cubin");
    myKernel = new CudaKernel("proccess", module, context);

    // Enough 256-thread blocks to cover inputCount (rounded up).
    // Background reading:
    // https://softwarehut.com/blog/general-purpose-computing-gpu-net-world-part-1/
    // https://stackoverflow.com/questions/2392250/understanding-cuda-grid-dimensions-block-dimensions-and-threads-organization-s
    myKernel.GridDimensions = (inputCount + threadsPerBlock - 1) / threadsPerBlock;
    myKernel.BlockDimensions = threadsPerBlock;

    // Allocate the input and output buffers on the device.
    input1_dev = new CudaDeviceVariable<int>(DataGenerator.In1.Length);
    input2_dev = new CudaDeviceVariable<int>(DataGenerator.In2.Length);
    input3_dev = new CudaDeviceVariable<double>(DataGenerator.In3.Length);
    input4_dev = new CudaDeviceVariable<byte>(DataGenerator.In4_3_bytes.Length);
    result_dev = new CudaDeviceVariable<byte>(resultsBytes.Length);
    resultCalc_dev = new CudaDeviceVariable<double>(calculatables.Length);

    // Upload the input data.
    input1_dev.CopyToDevice(DataGenerator.In1);
    input2_dev.CopyToDevice(DataGenerator.In2);
    input3_dev.CopyToDevice(DataGenerator.In3);
    input4_dev.CopyToDevice(DataGenerator.In4_3_bytes);
}
/// <summary>
/// Stores <paramref name="context"/> and loads the module found at
/// <paramref name="path"/> into it.
/// </summary>
/// <param name="context">Context that will own the loaded module.</param>
/// <param name="path">Path of the module file passed to <c>LoadModule</c>.</param>
public KernelModule(CudaContext context, string path)
{
    _context = context;
    _module = context.LoadModule(path);
}
/// <summary>
/// Compiles an embedded <c>saxpy</c> CUDA kernel at runtime, loads it from the
/// generated PTX, and then shows how several kernels are loaded from one
/// precompiled module.
/// </summary>
/// <remarks>
/// The kernel is looked up by the name the source actually exports (<c>saxpy</c>,
/// declared <c>extern "C"</c>). The runtime compiler is disposed once its PTX has
/// been loaded, and no unused device memory is allocated.
/// </remarks>
public void Compile()
{
    using (var ctx = new CudaContext())
    {
        // With a verbatim string (@) only double quotes have to be doubled; no other escaping.
        string source = @" extern ""C"" __global__ void saxpy(float a, float *x, float *y, float *out, size_t n) { size_t tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < n) { out[tid] = a * x[tid] + y[tid]; } } ";
        source += Environment.NewLine;

        var name = "Test";
        var headers = new string[0];
        var includeNames = new string[0];

        CudaKernel kernel;
        using (var compiler = new CudaRuntimeCompiler(source, name, headers, includeNames))
        {
            // "-G" generates device debug information.
            // Other useful options: --ptxas-options=-v -keep
            compiler.Compile(new string[] { "-G" });

            //var ptxString = compiler.GetPTXAsString(); // for debugging
            var ptx = compiler.GetPTX();

            // The entry point must match the function name in the source above.
            kernel = ctx.LoadKernelPTX(ptx, "saxpy");
        }

        // One kernel per cu file:
        //CudaKernel kernel = ctx.LoadKernel(@"path\to\kernel.ptx", "kernelname");
        kernel.GridDimensions = new dim3(1, 1, 1);
        kernel.BlockDimensions = new dim3(16, 16);
        //kernel.Run()

        // Multiple kernels per cu file.
        // NOTE(review): the path and kernel names below are placeholders - replace before running.
        CUmodule cumodule = ctx.LoadModule(@"path\to\kernel.ptx");
        CudaKernel kernel1 = new CudaKernel("kernel1", cumodule, ctx)
        {
            GridDimensions = new dim3(1, 1, 1),
            BlockDimensions = new dim3(16, 16),
        };
        CudaKernel kernel2 = new CudaKernel("kernel2", cumodule, ctx)
        {
            GridDimensions = new dim3(1, 1, 1),
            BlockDimensions = new dim3(16, 16),
        };
    }
}
/// <summary>
/// Creates a CUDA context, loads the precompiled PTX module and binds its
/// <c>_Z6kerneliiPi</c> entry point to <c>addWithCuda</c>.
/// </summary>
static void InitKernels()
{
    const string ptxPath = @"C:\Users\Niels\Documents\uni ting\P10\P10\programs\small programs\CUDA 1D MA in C Sharp\CUDA 1D MA in C Sharp\Debug\kernel.ptx";

    // The context is not disposed here; the kernel stored in addWithCuda was created against it.
    var context = new CudaContext();
    var module = context.LoadModule(ptxPath);

    addWithCuda = new CudaKernel("_Z6kerneliiPi", module, context);
}
/// <summary>
/// Loads the <c>kernel</c> entry point from a precompiled PTX file and runs it ten
/// times over a padded network topology, printing the elapsed ticks and the first
/// result value of each run.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
/// <remarks>
/// The device buffers created for each run are disposed at the end of that run, and
/// the context is disposed when the method exits.
/// </remarks>
static void Main(string[] args)
{
    // NOTE: You need to change this location to match your own machine.
    Console.ForegroundColor = ConsoleColor.Red;
    Console.WriteLine("NOTE: You must change the kernel location before running this project so it matches your own environment.");
    Console.ResetColor();
    System.Threading.Thread.Sleep(500);

    string path = @"X:\MachineLearning\CUDAGraph-2\CUDAGraph_Kernel\Debug\kernel.cu.ptx";

    using (CudaContext ctx = new CudaContext())
    {
        CUmodule module = ctx.LoadModule(path);
        kernel = new CudaKernel("kernel", module, ctx);

        // This tells the kernel to allocate a lot of threads for the Gpu.
        kernel.BlockDimensions = THREADS_PER_BLOCK;
        kernel.GridDimensions = VECTOR_SIZE / THREADS_PER_BLOCK + 1;

        // Create the topology. Every layer is padded to the width of the widest one.
        int[] topology = new int[] { 1, 200, 200, 100, 1 };
        int height = topology.Length;
        int width = 0;
        for (int i = 0; i < topology.Length; i++)
        {
            if (width < topology[i])
            {
                width = topology[i];
            }
        }

        // Launch!
        float[] res = new float[height * width];
        for (int i = 0; i < 10; i++)
        {
            float[] matrix = new float[height * width];
            float[] weights = new float[height * width];

            // Fixed seed: every run sees the same weights.
            Random rand = new Random(424242);
            for (int y = 0; y < height; y++)
            {
                for (int x = 0; x < width; x++)
                {
                    matrix[y * width + x] = (y == 0 && x < topology[y]) ? 1.0f : 0;
                    weights[y * width + x] = (x < topology[y]) ? (float)(rand.NextDouble() - rand.NextDouble()) : 0;
                }
            }

            // Load the kernel with some variables. Each assignment allocates a device
            // buffer, so release all three before the next run.
            using (CudaDeviceVariable<int> cuda_topology = topology)
            using (CudaDeviceVariable<float> cuda_membank = matrix)
            using (CudaDeviceVariable<float> cuda_weights = weights)
            {
                Stopwatch sw = new Stopwatch();
                sw.Start();
                kernel.Run(cuda_topology.DevicePointer, cuda_membank.DevicePointer, cuda_weights.DevicePointer, height, width);
                cuda_membank.CopyToHost(res);
                sw.Stop();

                Console.ForegroundColor = ConsoleColor.Green;
                Console.WriteLine("{0} ticks to compute -> {1}", sw.ElapsedTicks, res[0]);
                Console.ResetColor();
            }
        }

        Console.ReadKey();
    }
}
/// <summary>
/// Compiles an embedded <c>saxpy</c> CUDA kernel at runtime, loads it from the
/// generated PTX, and then shows how several kernels are loaded from one
/// precompiled module.
/// </summary>
/// <remarks>
/// The kernel is looked up by the name the source actually exports (<c>saxpy</c>,
/// declared <c>extern "C"</c>). The runtime compiler is disposed once its PTX has
/// been loaded, and no unused device memory is allocated.
/// </remarks>
public void Compile()
{
    using (var ctx = new CudaContext())
    {
        // With a verbatim string (@) only double quotes have to be doubled; no other escaping.
        string source = @" extern ""C"" __global__ void saxpy(float a, float *x, float *y, float *out, size_t n) { size_t tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < n) { out[tid] = a * x[tid] + y[tid]; } } ";
        source += Environment.NewLine;

        var name = "Test";
        var headers = new string[0];
        var includeNames = new string[0];

        CudaKernel kernel;
        using (var compiler = new CudaRuntimeCompiler(source, name, headers, includeNames))
        {
            // "-G" generates device debug information.
            // Other useful options: --ptxas-options=-v -keep
            compiler.Compile(new string[] { "-G" });

            //var ptxString = compiler.GetPTXAsString(); // for debugging
            var ptx = compiler.GetPTX();

            // The entry point must match the function name in the source above.
            kernel = ctx.LoadKernelPTX(ptx, "saxpy");
        }

        // One kernel per cu file:
        //CudaKernel kernel = ctx.LoadKernel(@"path\to\kernel.ptx", "kernelname");
        kernel.GridDimensions = new dim3(1, 1, 1);
        kernel.BlockDimensions = new dim3(16, 16);
        //kernel.Run()

        // Multiple kernels per cu file.
        // NOTE(review): the path and kernel names below are placeholders - replace before running.
        CUmodule cumodule = ctx.LoadModule(@"path\to\kernel.ptx");
        CudaKernel kernel1 = new CudaKernel("kernel1", cumodule, ctx)
        {
            GridDimensions = new dim3(1, 1, 1),
            BlockDimensions = new dim3(16, 16),
        };
        CudaKernel kernel2 = new CudaKernel("kernel2", cumodule, ctx)
        {
            GridDimensions = new dim3(1, 1, 1),
            BlockDimensions = new dim3(16, 16),
        };
    }
}
/// <summary>
/// Loads the <c>kernel</c> entry point from a precompiled PTX file and runs it ten
/// times over a padded network topology, printing the elapsed ticks and the first
/// result value of each run.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
/// <remarks>
/// The device buffers created for each run are disposed at the end of that run, and
/// the context is disposed when the method exits.
/// </remarks>
static void Main(string[] args)
{
    // NOTE: You need to change this location to match your own machine.
    Console.ForegroundColor = ConsoleColor.Red;
    Console.WriteLine("NOTE: You must change the kernel location before running this project so it matches your own environment.");
    Console.ResetColor();
    System.Threading.Thread.Sleep(500);

    string path = @"X:\MachineLearning\CUDAGraph-2\CUDAGraph_Kernel\Debug\kernel.cu.ptx";

    using (CudaContext ctx = new CudaContext())
    {
        CUmodule module = ctx.LoadModule(path);
        kernel = new CudaKernel("kernel", module, ctx);

        // This tells the kernel to allocate a lot of threads for the Gpu.
        kernel.BlockDimensions = THREADS_PER_BLOCK;
        kernel.GridDimensions = VECTOR_SIZE / THREADS_PER_BLOCK + 1;

        // Create the topology. Every layer is padded to the width of the widest one.
        int[] topology = new int[] { 1, 200, 200, 100, 1 };
        int height = topology.Length;
        int width = 0;
        for (int i = 0; i < topology.Length; i++)
        {
            if (width < topology[i])
            {
                width = topology[i];
            }
        }

        // Launch!
        float[] res = new float[height * width];
        for (int i = 0; i < 10; i++)
        {
            float[] matrix = new float[height * width];
            float[] weights = new float[height * width];

            // Fixed seed: every run sees the same weights.
            Random rand = new Random(424242);
            for (int y = 0; y < height; y++)
            {
                for (int x = 0; x < width; x++)
                {
                    matrix[y * width + x] = (y == 0 && x < topology[y]) ? 1.0f : 0;
                    weights[y * width + x] = (x < topology[y]) ? (float)(rand.NextDouble() - rand.NextDouble()) : 0;
                }
            }

            // Load the kernel with some variables. Each assignment allocates a device
            // buffer, so release all three before the next run.
            using (CudaDeviceVariable<int> cuda_topology = topology)
            using (CudaDeviceVariable<float> cuda_membank = matrix)
            using (CudaDeviceVariable<float> cuda_weights = weights)
            {
                Stopwatch sw = new Stopwatch();
                sw.Start();
                kernel.Run(cuda_topology.DevicePointer, cuda_membank.DevicePointer, cuda_weights.DevicePointer, height, width);
                cuda_membank.CopyToHost(res);
                sw.Stop();

                Console.ForegroundColor = ConsoleColor.Green;
                Console.WriteLine("{0} ticks to compute -> {1}", sw.ElapsedTicks, res[0]);
                Console.ResetColor();
            }
        }

        Console.ReadKey();
    }
}
/// <summary>
/// Builds a mean-free <c>Resolution x Resolution</c> test pattern, runs a batched 1D
/// complex FFT over it, passes the spectrum through the <c>cu_ArrayInversion</c>
/// kernel and stores the real and imaginary parts in <c>FFTData2D</c>.
/// </summary>
/// <remarks>
/// The context, both device buffers and the FFT plan are held in <c>using</c> blocks
/// so they are released even when a CUDA call throws.
/// </remarks>
public void cuFFTreconstruct()
{
    using (CudaContext ctx = new CudaContext(0))
    {
        ManagedCuda.BasicTypes.CUmodule cumodule = ctx.LoadModule("kernel.ptx");
        CudaKernel cuKernel = new CudaKernel("cu_ArrayInversion", cumodule, ctx);

        float2[] fData = new float2[Resolution * Resolution];
        float2[] result = new float2[Resolution * Resolution];
        FFTData2D = new float[Resolution, Resolution, 2];

        using (CudaDeviceVariable<float2> devData = new CudaDeviceVariable<float2>(Resolution * Resolution))
        using (CudaDeviceVariable<float2> copy_devData = new CudaDeviceVariable<float2>(Resolution * Resolution))
        {
            // Fill the real part with the pattern i + 2j and accumulate its sum.
            double avrg = 0.0;
            for (int i = 0; i < Resolution; i++)
            {
                for (int j = 0; j < Resolution; j++)
                {
                    fData[i * Resolution + j].x = i + j * 2;
                    avrg += fData[i * Resolution + j].x;
                    fData[i * Resolution + j].y = 0.0f;
                }
            }

            // Subtract the mean from every real part.
            avrg = avrg / (double)(Resolution * Resolution);
            for (int i = 0; i < Resolution; i++)
            {
                for (int j = 0; j < Resolution; j++)
                {
                    fData[(i * Resolution + j)].x = fData[(i * Resolution + j)].x - (float)avrg;
                }
            }

            devData.CopyToDevice(fData);

            // Resolution transforms of length Resolution, executed forward on devData.
            using (CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution, cufftType.C2C, Resolution))
            {
                plan1D.Exec(devData.DevicePointer, TransformDirection.Forward);

                // NOTE(review): the grid width is Resolution / cuda_blockNum with integer
                // division - confirm Resolution is a multiple of cuda_blockNum.
                cuKernel.GridDimensions = new ManagedCuda.VectorTypes.dim3(Resolution / cuda_blockNum, Resolution, 1);
                cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(cuda_blockNum, 1, 1);
                cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution);

                copy_devData.CopyToHost(result);
            }

            // Unpack the complex result into the [row, column, re/im] array.
            for (int i = 0; i < Resolution; i++)
            {
                for (int j = 0; j < Resolution; j++)
                {
                    FFTData2D[i, j, 0] = result[i * Resolution + j].x;
                    FFTData2D[i, j, 1] = result[i * Resolution + j].y;
                }
            }
        }

        // Stop profiling before the context is torn down.
        CudaContext.ProfilerStop();
    }
}