/// <summary>
/// Creates the OpenGL render control, the CUDA context that interoperates with it, the
/// kernels from "kernel.ptx", and the buffers, FFT plans, vertex buffer and texture used
/// afterwards; finishes by calling <c>reshape()</c> and <c>display()</c>.
/// </summary>
/// <exception cref="Exception">
/// The size OpenGL reports for the vertex buffer differs from the size that was requested.
/// </exception>
private void initGLAndCuda()
{
    // Render target control, hosted inside panel1.
    m_renderControl = new OpenTK.GLControl(GraphicsMode.Default, 1, 0, GraphicsContextFlags.Default);
    m_renderControl.Dock = DockStyle.Fill;
    m_renderControl.BackColor = Color.White;
    m_renderControl.BorderStyle = BorderStyle.FixedSingle;
    m_renderControl.KeyDown += m_renderControl_KeyDown;
    m_renderControl.MouseMove += m_renderControl_MouseMove;
    m_renderControl.MouseDown += m_renderControl_MouseDown;
    m_renderControl.SizeChanged += m_renderControl_SizeChanged;
    panel1.Controls.Add(m_renderControl);
    Console.WriteLine(" OpenGL device is Available");

    // CUDA context bound to the OpenGL context created above.
    int fastestDevice = CudaContext.GetMaxGflopsDeviceId();
    ctx = CudaContext.CreateOpenGLContext(fastestDevice, CUCtxFlags.BlockingSync);
    Console.WriteLine(
        "CUDA device [{0}] has {1} Multi-Processors",
        ctx.GetDeviceName(),
        ctx.GetDeviceInfo().MultiProcessorCount);

    // Kernels.
    CUmodule kernelModule = ctx.LoadModulePTX("kernel.ptx");
    addForces_k = new CudaKernel("addForces_k", kernelModule, ctx);
    advectVelocity_k = new CudaKernel("advectVelocity_k", kernelModule, ctx);
    diffuseProject_k = new CudaKernel("diffuseProject_k", kernelModule, ctx);
    updateVelocity_k = new CudaKernel("updateVelocity_k", kernelModule, ctx);
    advectParticles_k = new CudaKernel("advectParticles_OGL", kernelModule, ctx);

    // Host copy and pitched device copy of the field; the device copy starts as the
    // freshly allocated host array.
    hvfield = new cData[DS];
    dvfield = new CudaPitchedDeviceVariable<cData>(DIM, DIM);
    tPitch = dvfield.Pitch;
    dvfield.CopyToDevice(hvfield);

    vxfield = new CudaDeviceVariable<cData>(DS);
    vyfield = new CudaDeviceVariable<cData>(DS);

    // Particle array.
    particles = new cData[DS];
    initParticles(particles, DIM, DIM);

    // TODO: update kernels to use the new unpadded memory layout for perf
    // rather than the old FFTW-compatible layout
    planr2c = new CudaFFTPlan2D(DIM, DIM, cufftType.R2C, Compatibility.FFTWPadding);
    planc2r = new CudaFFTPlan2D(DIM, DIM, cufftType.C2R, Compatibility.FFTWPadding);

    // Vertex buffer holding the particles; verify OpenGL allocated what was asked for.
    var requestedBytes = cData.SizeOf * DS;
    GL.GenBuffers(1, out vbo);
    GL.BindBuffer(BufferTarget.ArrayBuffer, vbo);
    GL.BufferData<cData>(BufferTarget.ArrayBuffer, new IntPtr(requestedBytes), particles, BufferUsageHint.DynamicDraw);

    int reportedBytes;
    GL.GetBufferParameter(BufferTarget.ArrayBuffer, BufferParameterName.BufferSize, out reportedBytes);
    if (reportedBytes != requestedBytes)
    {
        throw new Exception("Sizes don't match.");
    }

    GL.BindBuffer(BufferTarget.ArrayBuffer, 0);

    // Register the vertex buffer with CUDA.
    cuda_vbo_resource = new CudaGraphicsInteropResourceCollection();
    cuda_vbo_resource.Add(new CudaOpenGLBufferInteropResource(vbo, CUGraphicsRegisterFlags.None));

    texref = new CudaTextureArray2D(
        advectVelocity_k,
        "texref",
        CUAddressMode.Wrap,
        CUFilterMode.Linear,
        0,
        CUArrayFormat.Float,
        DIM,
        DIM,
        CudaArray2DNumChannels.Two);

    stopwatch = new CudaStopWatch(CUEventFlags.Default);

    reshape();
    isInit = true;
    display();
}
/// <summary>
/// Creates a CUDA context on the given device, logs its compute capability, and loads the
/// collision, cohesive-zone ("cz") and element kernels, giving every kernel the same
/// one-dimensional block size.
/// </summary>
/// <param name="deviceID">Index of the CUDA device to create the context on (default 0).</param>
public GPU_Functionality(int deviceID = 0)
{
    ctx = new CudaContext(deviceID);
    version = ctx.GetDeviceComputeCapability();
    Trace.WriteLine($"cuda compute capability {version.Major}.{version.Minor}");

    // Same launch block shape for every kernel loaded below.
    dim3 threadsPerBlock = new dim3(block_size, 1, 1);

    // collision
    CUmodule collisionModule = ctx.LoadModulePTX("collision_kernels.ptx");
    kNarrowPhase = new CudaKernel("kNarrowPhase_new", collisionModule, ctx);
    kFindClosestFace = new CudaKernel("kFindClosestFace", collisionModule, ctx);
    kCollisionResponseForce = new CudaKernel("kCollisionResponseForce", collisionModule, ctx);
    kNarrowPhase.BlockDimensions = threadsPerBlock;
    kFindClosestFace.BlockDimensions = threadsPerBlock;
    kCollisionResponseForce.BlockDimensions = threadsPerBlock;

    // cz
    CUmodule czModule = ctx.LoadModulePTX("cz_kernels.ptx");
    kczCZForce = new CudaKernel("kczCZForce", czModule, ctx);
    kczCZForce.BlockDimensions = threadsPerBlock;

    // elem
    CUmodule elementModule = ctx.LoadModulePTX("elem_kernels.ptx");
    kelElementElasticityForce = new CudaKernel("kelElementElasticityForce", elementModule, ctx);
    kelElementElasticityForce.BlockDimensions = threadsPerBlock;
}
/// <summary>
/// Form load handler: makes the primary CUDA context current, loads the PTX modules,
/// creates the image-processing kernels and the denoise/demosaic helper, allocates the
/// cuRAND state buffer and the working tile, and selects the first ISO entry.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Form1_Load(object sender, EventArgs e)
{
    ctx = new PrimaryContext();
    ctx.SetCurrent();

    // PTX modules.
    modPRelu = ctx.LoadModulePTX("PRelu.ptx");
    modDeBayer = ctx.LoadModulePTX("DeBayer.ptx");
    modColor = ctx.LoadModulePTX("ImageColorProcessing.ptx");

    // Kernels from the de-Bayer module.
    createBayerKernel = new CreateBayerWithNoiseKernel(ctx, modDeBayer);
    deBayerGreenKernel = new DeBayerGreenKernel(modDeBayer, ctx);
    deBayerRedBlueKernel = new DeBayerRedBlueKernel(modDeBayer, ctx);
    setupCurandKernel = new SetupCurandKernel(ctx, modDeBayer);

    // Kernels from the colour-processing module.
    highlightRecoveryKernel = new HighlightRecoveryKernel(modColor, ctx);
    camToXYZKernel = new ConvertCamToXYZKernel(modColor, ctx);
    convertRGBTosRGBKernel = new ConvertRGBTosRGBKernel(modColor, ctx);

    // The constant variable is set for the entire module, so assigning it through one
    // kernel is sufficient.
    createBayerKernel.BayerPattern = new[]
    {
        BayerColor.Red, BayerColor.Green,
        BayerColor.Green, BayerColor.Blue
    };

    // If you do not have CUDNN, set the last parameter to false (use NPP instead).
    denoiseAndDemoisaic = new DenoiseAndDemoisaic(TileSize, ctx, modPRelu, true);

    // One cuRAND state per tile pixel; one state has the size of 48 bytes.
    var statesPerTile = TileSize * TileSize;
    CuRandStates = new CudaDeviceVariable<byte>(statesPerTile * 48);
    setupCurandKernel.RunSafe(CuRandStates, statesPerTile);

    tile = new NPPImage_32fC3(TileSize, TileSize);
    cmb_IsoValue.SelectedIndex = 0;
}
/// <summary>
/// Creates the cuBLAS handle and a CUDA context, loads the PTX module named by
/// <c>kernalFile</c>, and stores the truncated square root of the device's maximum
/// threads per block in <c>maxThreadPerBlockDim</c>.
/// </summary>
public GpuMathOperations()
{
    cuBlas = new CudaBlas();
    cudaContext = new CudaContext();
    cuModule = cudaContext.LoadModulePTX(kernalFile);

    var threadLimit = cudaContext.GetDeviceInfo().MaxThreadsPerBlock;
    maxThreadPerBlockDim = (int)Math.Sqrt(threadLimit);
}
/// <summary>
/// Creates the CUDA context, loads the PTX module from a fixed absolute path and looks up
/// the kernel by its mangled name.
/// </summary>
static void InitKernels()
{
    // NOTE(review): absolute developer-machine path; the module will not load elsewhere.
    const string ptxPath = @"C:\work\Sobel\CudaTest\x64\Debug\kernel.ptx";
    const string kernelEntryPoint = "_Z15matrixSumKernelPdPKdiii";

    cntxt = new CudaContext();
    CUmodule loadedModule = cntxt.LoadModulePTX(ptxPath);
    matrixSumCude = new CudaKernel(kernelEntryPoint, loadedModule, cntxt);
}
/// <summary>
/// Loads a module for every entry in <c>sourceCodes</c> and records the elapsed time
/// (between <c>tic()</c> and <c>toc()</c>) under the "JIT" sample.
/// </summary>
public void JITcompile()
{
    tic();

    foreach (var source in sourceCodes)
    {
        source.mod = source.ptx == null
            ? ctx.LoadModulePTX(source.cubin, null, null) // no jit, just upload
            : ctx.LoadModulePTX(source.ptx);              // jit
    }

    SampleCollector.Collect("JIT", toc());
}
/// <summary>
/// Reads the GrabCut GMM kernel image from the assembly's embedded resources, creates a
/// CUDA context on the device returned by <c>CudaContext.GetMaxGflopsDeviceId()</c>,
/// loads the module, and looks up every kernel by its mangled name.
/// </summary>
/// <exception cref="ArgumentException">The kernel image is not embedded in the assembly.</exception>
/// <exception cref="EndOfStreamException">The resource stream ended before all bytes were read.</exception>
public GrabCutGMM()
{
    // The 64-bit and 32-bit processes use different PTX images.
    string resName = IntPtr.Size == 8 ? "GrabCutGMM_x64.ptx" : "GrabCutGMM.ptx";
    string resource = "GrabCutNPP" + "." + resName;

    // Read the image before creating the context so a missing resource does not leave an
    // orphaned context behind; the using block releases the stream on every path.
    byte[] kernel;
    using (Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource))
    {
        if (stream == null)
        {
            throw new ArgumentException("Kernel not found in resources.");
        }

        kernel = new byte[stream.Length];
        int offset = 0;
        while (offset < kernel.Length)
        {
            int read = stream.Read(kernel, offset, kernel.Length - offset);
            if (read <= 0)
            {
                // Read returns 0 at end of stream; without this check the loop never ends.
                throw new EndOfStreamException("Kernel resource '" + resource + "' ended unexpectedly.");
            }

            offset += read;
        }
    }

    ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId(), false);
    CUmodule module = ctx.LoadModulePTX(kernel);

    GMMReductionKernelCreateGmmFlags = new CudaKernel("_Z18GMMReductionKernelILi4ELb1EEviPfiPK6uchar4iPhiiiPj", module, ctx);
    GMMReductionKernelNoCreateGmmFlags = new CudaKernel("_Z18GMMReductionKernelILi4ELb0EEviPfiPK6uchar4iPhiiiPj", module, ctx);
    GMMFinalizeKernelInvertSigma = new CudaKernel("_Z17GMMFinalizeKernelILi4ELb1EEvPfS0_ii", module, ctx);
    GMMFinalizeKernelNoInvertSigma = new CudaKernel("_Z17GMMFinalizeKernelILi4ELb0EEvPfS0_ii", module, ctx);
    GMMcommonTerm = new CudaKernel("_Z13GMMcommonTermiPfi", module, ctx);
    DataTermKernel = new CudaKernel("_Z14DataTermKernelPiiiPKfiPK6uchar4iPKhiii", module, ctx);
    GMMAssignKernel = new CudaKernel("_Z15GMMAssignKerneliPKfiPK6uchar4iPhiii", module, ctx);
    GMMFindSplit = new CudaKernel("_Z12GMMFindSplitP10GMMSplit_tiPfi", module, ctx);
    GMMDoSplit = new CudaKernel("_Z10GMMDoSplitPK10GMMSplit_tiPfiPK6uchar4iPhiii", module, ctx);
    MeanEdgeStrengthReductionKernel = new CudaKernel("_Z31MeanEdgeStrengthReductionKerneliiPf", module, ctx);
    MeanEdgeStrengthFinalKernel = new CudaKernel("_Z27MeanEdgeStrengthFinalKernelPfi", module, ctx);
    EdgeCuesKernel = new CudaKernel("_Z14EdgeCuesKernelfPKfPiS1_S1_S1_S1_S1_S1_S1_iiii", module, ctx);
    SegmentationChangedKernel = new CudaKernel("_Z25SegmentationChangedKernelPiPhS0_iii", module, ctx);
    downscaleKernel1 = new CudaKernel("_Z18downscaleKernelBoxI6uchar4EvPT_iiiPKS1_iii", module, ctx);
    downscaleKernel2 = new CudaKernel("_Z18downscaleKernelMaxIhEvPT_iiiPKS0_iii", module, ctx);
    upsampleAlphaKernel = new CudaKernel("_Z19upsampleAlphaKernelPhS_iiii", module, ctx);

    // Upload the index tables through both finalize kernels.
    GMMFinalizeKernelInvertSigma.SetConstantVariable("det_indices", det_indices);
    GMMFinalizeKernelInvertSigma.SetConstantVariable("inv_indices", inv_indices);
    GMMFinalizeKernelNoInvertSigma.SetConstantVariable("det_indices", det_indices);
    GMMFinalizeKernelNoInvertSigma.SetConstantVariable("inv_indices", inv_indices);
}
/// <summary>
/// Creates a Direct3D 11 interop CUDA context on the device returned by
/// <c>CudaContext.GetMaxGflopsDevice()</c>, loads "Kernels\kernel.ptx", and creates the
/// kernels and the prefix-scan helper from that module.
/// </summary>
private void InitializeCUDA()
{
    var fastestDevice = CudaContext.GetMaxGflopsDevice();
    context = new CudaContext(
        fastestDevice,
        graphicsDevice.ComPointer,
        CUCtxFlags.SchedAuto,
        CudaContext.DirectXVersion.D3D11);

    module = context.LoadModulePTX(@"Kernels\kernel.ptx");

    kernelPositionWeightNoiseCube = new CudaKernel("position_weight_noise_cube", module, context);
    kernelNormalAmbient = new CudaKernel("normal_ambient", module, context);
    kernelMarchingCubesCases = new CudaKernel("marching_cubes_cases", module, context);
    kernelMarchingCubesVertices = new CudaKernel("marching_cubes_vertices", module, context);
    kernelPositionWeightNoiseCubeWarp = new CudaKernel("position_weight_noise_cube_warp", module, context);
    kernelPositionWeightFormula = new CudaKernel("position_weight_formula", module, context);

    prefixScan = new CUDAPrefixScan(module, context);
}
/// <summary>
/// Loads "opticalFlow.ptx" into the given context, creates the optical-flow kernels and
/// allocates the working images, the mean/standard-deviation scratch buffer and the
/// 2x2 derivative filter coefficients.
/// </summary>
/// <param name="width">Width used for every working image.</param>
/// <param name="height">Height used for every working image.</param>
/// <param name="ctx">CUDA context the module is loaded into.</param>
public OpticalFlow(int width, int height, CudaContext ctx)
{
    CUmodule flowModule = ctx.LoadModulePTX("opticalFlow.ptx");

    warpingKernel = new WarpingKernel(ctx, flowModule);
    createFlowFieldFromTiles = new CreateFlowFieldFromTiles(ctx, flowModule);
    computeDerivativesKernel = new ComputeDerivativesKernel(ctx, flowModule);
    lukasKanade = new LukasKanadeKernel(ctx, flowModule);

    // Working images.
    d_tmp = new NPPImage_32fC1(width, height);
    d_Ix = new NPPImage_32fC1(width, height);
    d_Iy = new NPPImage_32fC1(width, height);
    d_Iz = new NPPImage_32fC1(width, height);
    d_flow = new NPPImage_32fC2(width, height);

    // Scratch buffer: three times the size NPP reports for d_tmp's mean/std-dev call.
    var scratchBytes = d_tmp.MeanStdDevGetBufferHostSize() * 3;
    buffer = new CudaDeviceVariable<byte>(scratchBytes);
    mean = new CudaDeviceVariable<double>(1);
    std = new CudaDeviceVariable<double>(1);

    // Four-tap filter coefficients.
    d_filterX = new[] { -0.25f, 0.25f, -0.25f, 0.25f };
    d_filterY = new[] { -0.25f, -0.25f, 0.25f, 0.25f };
    d_filterT = new[] { 0.25f, 0.25f, 0.25f, 0.25f };
}
/// <summary>
/// Stores the tracking configuration, builds the shift-pair and index tables, allocates
/// one shift image per pair plus all device buffers sized from the tile count, frame
/// count and shift count, loads "ShiftMinimizerKernels.ptx", wires up the device pointer
/// arrays via <c>setPointers</c>, and finally calls <c>Reset()</c>.
/// </summary>
/// <param name="aFrameCount">Number of frames.</param>
/// <param name="aMaxTileCountX">Tile count in X; width of every shift image.</param>
/// <param name="aMaxTileCountY">Tile count in Y; height of every shift image.</param>
/// <param name="aReferenceIndex">Value stored in <c>referenceIndex</c>.</param>
/// <param name="aStrategy">Value stored in <c>strategy</c>.</param>
/// <param name="aBlockSize">Requested block size; limited to <paramref name="aFrameCount"/> - 1.</param>
/// <param name="ctx">CUDA context the kernel module is loaded into.</param>
/// <exception cref="Exception">
/// The number of generated shift pairs differs from <c>GetShiftCount()</c>.
/// </exception>
public ShiftCollection(int aFrameCount, int aMaxTileCountX, int aMaxTileCountY, int aReferenceIndex, TrackingStrategy aStrategy, int aBlockSize, CudaContext ctx)
{
    strategy = aStrategy;
    referenceIndex = aReferenceIndex;
    frameCount = aFrameCount;
    blockSize = aBlockSize >= aFrameCount ? aFrameCount - 1 : aBlockSize;

    blas = new CudaBlas(PointerMode.Device, AtomicsMode.Allowed);
    one = 1.0f;
    zero = 0.0f;

    shiftPairs = new List<ShiftPair>();
    int shiftCount = GetShiftCount();
    FillShiftPairs();
    FillIndexTable();

    if (shiftPairs.Count != shiftCount)
    {
        throw new Exception("Ooups, something went wrong with my math...");
    }

    // One shift image per pair; remember each image's pitch and device pointer.
    shifts = new List<NPPImage_32fC2>(shiftCount);
    int[] hostPitches = new int[shiftCount];
    CUdeviceptr[] hostPointers = new CUdeviceptr[shiftCount];
    for (int pair = 0; pair < shiftCount; pair++)
    {
        NPPImage_32fC2 shiftImage = new NPPImage_32fC2(aMaxTileCountX, aMaxTileCountY);
        shifts.Add(shiftImage);
        hostPitches[pair] = shiftImage.Pitch;
        hostPointers[pair] = shiftImage.DevicePointer;
    }

    // Sizes shared by the allocations below (same left-to-right integer products as before).
    int tileCount = aMaxTileCountX * aMaxTileCountY;
    var framesMinusOne = frameCount - 1;

    shiftPitches = hostPitches;
    AllShifts_d = new CudaDeviceVariable<float2>(tileCount * shiftCount);
    shiftsOneToOne_d = new CudaDeviceVariable<float2>(tileCount * framesMinusOne);
    shifts_d = hostPointers;

    status = new CudaDeviceVariable<int>(tileCount);
    infoInverse = new CudaDeviceVariable<int>(tileCount);

    // Per-tile pointer arrays.
    shiftMatrixArray = new CudaDeviceVariable<CUdeviceptr>(tileCount);
    shiftMatrixSafeArray = new CudaDeviceVariable<CUdeviceptr>(tileCount);
    matrixSquareArray = new CudaDeviceVariable<CUdeviceptr>(tileCount);
    matrixInvertedArray = new CudaDeviceVariable<CUdeviceptr>(tileCount);
    solvedMatrixArray = new CudaDeviceVariable<CUdeviceptr>(tileCount);
    shiftOneToOneArray = new CudaDeviceVariable<CUdeviceptr>(tileCount);
    shiftMeasuredArray = new CudaDeviceVariable<CUdeviceptr>(tileCount);
    shiftOptimArray = new CudaDeviceVariable<CUdeviceptr>(tileCount);

    // Per-tile matrix and vector storage.
    shiftMatrices = new CudaDeviceVariable<float>(tileCount * shiftCount * framesMinusOne);
    shiftSafeMatrices = new CudaDeviceVariable<float>(tileCount * shiftCount * framesMinusOne);
    matricesSquared = new CudaDeviceVariable<float>(tileCount * framesMinusOne * framesMinusOne);
    matricesInverted = new CudaDeviceVariable<float>(tileCount * framesMinusOne * framesMinusOne);
    solvedMatrices = new CudaDeviceVariable<float>(tileCount * shiftCount * framesMinusOne);
    shiftsOneToOne = new CudaDeviceVariable<float2>(tileCount * framesMinusOne);
    pivotArray = new CudaDeviceVariable<int>(tileCount * framesMinusOne);
    shiftsMeasured = new CudaDeviceVariable<float2>(tileCount * shiftCount);
    shiftsOptim = new CudaDeviceVariable<float2>(tileCount * shiftCount);

    buffer = new CudaDeviceVariable<byte>(status.SumGetBufferSize());
    statusSum = new CudaDeviceVariable<int>(1);

    // Kernels.
    CUmodule minimizerModule = ctx.LoadModulePTX("ShiftMinimizerKernels.ptx");
    concatenateShifts = new concatenateShiftsKernel(ctx, minimizerModule);
    separateShifts = new separateShiftsKernel(ctx, minimizerModule);
    getOptimalShifts = new getOptimalShiftsKernel(ctx, minimizerModule);
    copyShiftMatrixKernel = new copyShiftMatrixKernel(ctx, minimizerModule);
    setPointers = new setPointersKernel(ctx, minimizerModule);
    checkForOutliers = new checkForOutliersKernel(ctx, minimizerModule);
    transposeShifts = new transposeShiftsKernel(ctx, minimizerModule);

    setPointers.RunSafe(
        shiftMatrixArray, shiftMatrixSafeArray, matrixSquareArray, matrixInvertedArray,
        solvedMatrixArray, shiftOneToOneArray, shiftMeasuredArray, shiftOptimArray,
        shiftMatrices, shiftSafeMatrices, matricesSquared, matricesInverted,
        solvedMatrices, shiftsOneToOne, shiftsMeasured, shiftsOptim,
        tileCount, frameCount, shiftCount);

    Reset();
}
/// <summary>
/// Entry point. Parses the command line, loads the CUDA modules and kernels, builds and
/// shuffles the list of patch files with their white-balance factors, defines the
/// three-layer convolutional network, and then either evaluates saved epoch files on the
/// held-out patches (<c>-t</c>) or trains the network, writing status, error and epoch
/// files as it goes.
/// </summary>
/// <param name="args">
/// <c>-d &lt;id&gt;</c> device, <c>-lr &lt;rate&gt;</c> learning rate, <c>-iso &lt;name&gt;</c> ISO folder suffix,
/// <c>-t</c> evaluate instead of train, <c>-w &lt;epoch&gt;</c> warm start, <c>-s</c> save preview images.
/// </param>
/// <exception cref="ArgumentException">An option that requires a value is the last argument.</exception>
/// <exception cref="InvalidDataException">A white-balance file is too short or has a malformed line.</exception>
static void Main(string[] args)
{
    // Read CL arguments. Each argument is matched against exactly one option, so the
    // value of one option is never re-interpreted as another flag.
    for (int i = 0; i < args.Length; i++)
    {
        switch (args[i])
        {
            case "-d":
                deviceID = int.Parse(NextArgument(args, ref i));
                break;
            case "-lr":
                learning_rate = double.Parse(NextArgument(args, ref i), System.Globalization.NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture);
                break;
            case "-iso":
                ISO = NextArgument(args, ref i);
                break;
            case "-t":
                crosscheck = true;
                break;
            case "-w":
                warmStart = int.Parse(NextArgument(args, ref i));
                Console.WriteLine("Start with epoch " + warmStart);
                break;
            case "-s":
                saveImages = true;
                break;
        }
    }

    Console.WriteLine("Using device ID: " + deviceID);
    Console.WriteLine("Learning rate: " + learning_rate);

    // Init Cuda stuff
    ctx = new PrimaryContext(deviceID);
    ctx.SetCurrent();
    Console.WriteLine("Context created");
    CUmodule modPatch = ctx.LoadModulePTX("PatchProcessing.ptx");
    Console.WriteLine("modPatch loaded");
    CUmodule modBorder = ctx.LoadModulePTX("BorderTreatment.ptx");
    Console.WriteLine("modBorder loaded");
    CUmodule modError = ctx.LoadModulePTX("ErrorComputation.ptx");
    Console.WriteLine("modError loaded");
    CUmodule modPRelu = ctx.LoadModulePTX("PRelu.ptx");
    Console.WriteLine("modPRelu loaded");
    CUmodule modDeBayer = ctx.LoadModulePTX("DeBayer.ptx");
    Console.WriteLine("all modules loaded");

    deBayerGreenKernel = new DeBayerGreenKernel(modDeBayer, ctx);
    deBayerRedBlueKernel = new DeBayerRedBlueKernel(modDeBayer, ctx);
    // Both deBayer kernels are loaded from the same module: setting the constant variable
    // for the bayer pattern on one of them is enough.
    deBayerGreenKernel.BayerPattern = new BayerColor[] { BayerColor.Red, BayerColor.Green, BayerColor.Green, BayerColor.Blue };
    prepareDataKernel = new PrepareDataKernel(modPatch, ctx);
    restoreImageKernel = new RestoreImageKernel(modPatch, ctx);
    Console.WriteLine("kernels loaded");

    int countOwn = 468083;
    int count5k = 33408;
    int totalCount = countOwn + count5k;
    string fileBase = @"/ssd/data/TrainingsDataNN/";

    // Build the file lists and read one white-balance triple per patch file. The readers
    // are released even when a file is missing or malformed.
    List<float3> WhiteBalanceFactors = new List<float3>();
    using (StreamReader sr1 = new StreamReader(new FileStream(fileBase + "FromOwnDataset/WhiteBalancesOwn.txt", FileMode.Open, FileAccess.Read)))
    using (StreamReader sr2 = new StreamReader(new FileStream(fileBase + "From5kDataset/WhiteBalances5k.txt", FileMode.Open, FileAccess.Read)))
    {
        for (int i = 0; i < countOwn; i++)
        {
            fileRawList.Add(fileBase + "FromOwnDataset/ISO" + ISO + "/img_" + i.ToString("0000000") + ".bin");
            fileTrouthList.Add(fileBase + "FromOwnDataset/GroundTruth/img_" + i.ToString("0000000") + ".bin");
            WhiteBalanceFactors.Add(ParseWhiteBalance(sr1.ReadLine(), "WhiteBalancesOwn.txt", i));
        }

        for (int i = 0; i < count5k; i++)
        {
            fileRawList.Add(fileBase + "From5kDataset/ISO" + ISO + "/img_" + i.ToString("0000000") + ".bin");
            fileTrouthList.Add(fileBase + "From5kDataset/GroundTruth/img_" + i.ToString("0000000") + ".bin");
            WhiteBalanceFactors.Add(ParseWhiteBalance(sr2.ReadLine(), "WhiteBalances5k.txt", i));
        }
    }

    baOriginal = new float3[totalCount][];
    baRAW = new float[totalCount][];

    // Random order for the image patches (fixed seed, so the split is reproducible).
    Random rand = new Random(0);
    for (int i = 0; i < totalCount - 1; i++)
    {
        int r = i + (rand.Next() % (totalCount - i));

        string temp = fileRawList[i];
        fileRawList[i] = fileRawList[r];
        fileRawList[r] = temp;

        temp = fileTrouthList[i];
        fileTrouthList[i] = fileTrouthList[r];
        fileTrouthList[r] = temp;

        float3 tempf = WhiteBalanceFactors[i];
        WhiteBalanceFactors[i] = WhiteBalanceFactors[r];
        WhiteBalanceFactors[r] = tempf;
    }

    Console.WriteLine("Initialization done!");

    int trainingSize = (int)(totalCount * 0.9f); // 4 patches per file
    int testSize = fileRawList.Count - trainingSize;

    CudaBlas blas = new CudaBlas(PointerMode.Host);
    CudaDNNContext cudnn = new CudaDNNContext();

    int patchSize = 31;
    int patchSize4 = 66; // Size of a 2x2 patch read from file
    int batch = 64;
    float normalization = 0.5f;

    // Define neural network:
    StartLayer start = new StartLayer(patchSize, patchSize, 3, batch);
    FinalLayer final = new FinalLayer(patchSize, patchSize, 3, batch, FinalLayer.Norm.Mix, ctx, modError);
    ConvolutionalLayer conv1 = new ConvolutionalLayer(patchSize, patchSize, 3, patchSize, patchSize, 64, batch, 9, 9, ConvolutionalLayer.Activation.PRelu, blas, cudnn, ctx, modBorder, modPRelu);
    ConvolutionalLayer conv2 = new ConvolutionalLayer(patchSize, patchSize, 64, patchSize, patchSize, 64, batch, 5, 5, ConvolutionalLayer.Activation.PRelu, blas, cudnn, ctx, modBorder, modPRelu);
    ConvolutionalLayer conv3 = new ConvolutionalLayer(patchSize, patchSize, 64, patchSize, patchSize, 3, batch, 5, 5, ConvolutionalLayer.Activation.None, blas, cudnn, ctx, modBorder, modPRelu);
    start.ConnectFollowingLayer(conv1);
    conv1.ConnectFollowingLayer(conv2);
    conv2.ConnectFollowingLayer(conv3);
    conv3.ConnectFollowingLayer(final);

    CudaDeviceVariable<float3> imgA = new CudaDeviceVariable<float3>(patchSize4 * patchSize4);
    CudaDeviceVariable<float3> imgB = new CudaDeviceVariable<float3>(patchSize4 * patchSize4);
    CudaDeviceVariable<float> rawd = new CudaDeviceVariable<float>(patchSize4 * patchSize4);
    CudaDeviceVariable<float> inputImgs = new CudaDeviceVariable<float>(patchSize * patchSize * 3 * batch);
    CudaDeviceVariable<float> groundTrouth = new CudaDeviceVariable<float>(patchSize * patchSize * 3 * batch);
    NPPImage_8uC3 imgU3a = new NPPImage_8uC3(patchSize, patchSize);
    NPPImage_8uC3 imgU3b = new NPPImage_8uC3(patchSize, patchSize);
    NPPImage_8uC3 imgU3c = new NPPImage_8uC3(patchSize, patchSize);
    Bitmap a = new Bitmap(patchSize, patchSize, PixelFormat.Format24bppRgb);
    Bitmap b = new Bitmap(patchSize, patchSize, PixelFormat.Format24bppRgb);
    Bitmap c = new Bitmap(patchSize, patchSize, PixelFormat.Format24bppRgb);

    Random randImageOutput = new Random(0);
    Random randForInit = new Random(0);
    start.InitRandomWeight(randForInit);
    conv1.SetActivation(0.1f);
    conv2.SetActivation(0.1f);

    int startEpoch = warmStart;

    // Common part of every output file name; neither value changes after parsing.
    string runTag = learning_rate.ToString(CultureInfo.InvariantCulture) + "_" + ISO;

    // Uploads one patch file (cached on the host after the first read), de-Bayers it and
    // writes it into position 'slot' of the current batch.
    Action<int, int, int> loadPatch = (fileIdx, cacheIdx, slot) =>
    {
        float3[] original;
        float[] raw;
        if (baRAW[cacheIdx] == null)
        {
            original = ReadRAWFloat3(fileTrouthList[fileIdx]);
            raw = ReadRAWFloat(fileRawList[fileIdx]);
            baOriginal[cacheIdx] = original;
            baRAW[cacheIdx] = raw;
        }
        else
        {
            original = baOriginal[cacheIdx];
            raw = baRAW[cacheIdx];
        }

        rawd.CopyToDevice(raw);
        imgA.CopyToDevice(original);
        deBayerGreenKernel.RunSafe(rawd, imgB, patchSize4, new float3(0, 0, 0), WhiteBalanceFactors[fileIdx]);
        deBayerRedBlueKernel.RunSafe(rawd, imgB, patchSize4, new float3(0, 0, 0), WhiteBalanceFactors[fileIdx]);
        prepareDataKernel.RunSafe(imgA, imgB, groundTrouth, inputImgs, slot, normalization, WhiteBalanceFactors[fileIdx]);
    };

    // Restore network in case of warm start:
    if (warmStart > 0)
    {
        using (FileStream fs = new FileStream("epoch_" + runTag + "_" + (warmStart - 1) + ".cnn", FileMode.Open, FileAccess.Read))
        {
            start.RestoreValues(fs);
        }
    }

    if (crosscheck)
    {
        // Validate results on validation data set.
        using (FileStream csvResult = new FileStream("results_" + runTag + ".csv", FileMode.Append, FileAccess.Write))
        using (StreamWriter sw = new StreamWriter(csvResult))
        {
            sw.WriteLine("L1;L2;Mix;Filename");
            for (int i = 0; i < 2000; i += 1)
            {
                string filename = "epoch_" + runTag + "_" + i + ".cnn";
                try
                {
                    // The using block closes the file even when restoring fails.
                    using (FileStream cnn = new FileStream(filename, FileMode.Open, FileAccess.Read))
                    {
                        start.RestoreValues(cnn);
                    }
                }
                catch (Exception)
                {
                    // Epochs without a readable file are skipped on purpose.
                    Console.WriteLine("Skipping: " + i);
                    continue;
                }

                double errorL1 = 0;
                double errorL2 = 0;
                double errorMix = 0;
                for (int iter = 0; iter < testSize / batch * 4; iter++)
                {
                    // Prepare batch; held-out files start at index trainingSize.
                    for (int ba = 0; ba < batch / 4; ba++)
                    {
                        int idx = iter * (batch / 4) + ba + trainingSize;
                        loadPatch(idx, idx - trainingSize, ba);
                    }

                    start.SetData(inputImgs);
                    final.SetGroundTrouth(groundTrouth);
                    float err = start.InferenceTraining(inputImgs);
                    errorMix += err;
                    errorL1 += final.GetError(FinalLayer.Norm.L1);
                    errorL2 += final.GetError(FinalLayer.Norm.L2);
                }

                double meanL1 = errorL1 / testSize * batch / 4;
                double meanL2 = errorL2 / testSize * batch / 4;
                double meanMix = errorMix / testSize * batch / 4;

                Console.WriteLine("Results for: " + filename);
                Console.WriteLine("Mean Error L1: " + meanL1);
                Console.WriteLine("Mean Error L2: " + meanL2);
                Console.WriteLine("Mean Error Mix: " + meanMix);

                // CSV uses ';' separators and a decimal comma.
                sw.Write(meanL1.ToString().Replace(".", ","));
                sw.Write(";");
                sw.Write(meanL2.ToString().Replace(".", ","));
                sw.Write(";");
                sw.Write(meanMix.ToString().Replace(".", ","));
                sw.Write(";");
                sw.WriteLine(filename);
                sw.Flush();
            }
        }
    }
    else
    {
        // Or train existing network:
        double error = 0;
        double errorEpoch = 0;
        for (int epoch = startEpoch; epoch < 2000; epoch++)
        {
            errorEpoch = 0;
            error = 0;
            for (int iter = 0; iter < trainingSize / batch * 4; iter++)
            {
                // Prepare batch for training:
                for (int ba = 0; ba < batch / 4; ba++)
                {
                    int idx = iter * (batch / 4) + ba;
                    loadPatch(idx, idx, ba);
                }

                // Running iteration number across epochs.
                int globalIter = epoch * trainingSize / batch * 4 + iter;

                start.SetData(inputImgs);
                final.SetGroundTrouth(groundTrouth);
                float err = start.InferenceTraining(inputImgs);
                final.BackPropagation(groundTrouth);
                start.UpdateWeights(GetLearningRate(globalIter));
                error += err;
                errorEpoch += err;

                if (globalIter % 1000 == 0 && iter != 0)
                {
                    using (FileStream status = new FileStream("status_" + runTag + ".csv", FileMode.Append, FileAccess.Write))
                    using (StreamWriter sw = new StreamWriter(status))
                    {
                        sw.WriteLine((error / 1000.0).ToString().Replace(".", ",") + ";" + GetLearningRate(globalIter).ToString().Replace(".", ","));
                    }

                    error = 0;
                }

                Console.WriteLine("Epoch: " + epoch + " Iteration: " + globalIter + ", Error: " + err);

                if (saveImages && iter == 0)
                {
                    // Save ground truth, input and result of one random batch entry.
                    int imgidx = randImageOutput.Next(batch);
                    float3 wb = WhiteBalanceFactors[iter * (batch / 4) + imgidx / 4];
                    restoreImageKernel.RunSafe(groundTrouth, imgU3a, imgidx, wb.x, wb.y, wb.z, normalization);
                    restoreImageKernel.RunSafe(inputImgs, imgU3b, imgidx, wb.x, wb.y, wb.z, normalization);
                    CudaDeviceVariable<float> res = final.GetResult();
                    restoreImageKernel.RunSafe(res, imgU3c, imgidx, wb.x, wb.y, wb.z, normalization);

                    imgU3a.CopyToHost(a);
                    imgU3b.CopyToHost(b);
                    imgU3c.CopyToHost(c);

                    a.Save("GroundTrouth_" + runTag + "_" + epoch + "_" + imgidx + ".png");
                    b.Save("Input_" + runTag + "_" + epoch + "_" + imgidx + ".png");
                    c.Save("Result_" + runTag + "_" + epoch + "_" + imgidx + ".png");
                }
            }

            errorEpoch /= trainingSize / batch * 4;

            using (FileStream errorFile = new FileStream("errorEpoch_" + runTag + ".csv", FileMode.Append, FileAccess.Write))
            using (StreamWriter sw2 = new StreamWriter(errorFile))
            {
                sw2.WriteLine(errorEpoch.ToString().Replace(".", ","));
            }

            using (FileStream epochFile = new FileStream("epoch_" + runTag + "_" + epoch + ".cnn", FileMode.Create, FileAccess.Write))
            {
                start.SaveValues(epochFile);
            }
        }
    }
}

/// <summary>Returns the value following the option at <paramref name="index"/> and advances the index.</summary>
/// <exception cref="ArgumentException">The option is the last argument.</exception>
private static string NextArgument(string[] args, ref int index)
{
    if (index + 1 >= args.Length)
    {
        throw new ArgumentException("Missing value for command line option " + args[index]);
    }

    index++;
    return args[index];
}

/// <summary>Parses tab-separated columns 1 to 3 of a white-balance line into a <c>float3</c>.</summary>
/// <exception cref="InvalidDataException">The line is missing or has fewer than four columns.</exception>
private static float3 ParseWhiteBalance(string line, string sourceName, int entry)
{
    if (line == null)
    {
        throw new InvalidDataException(sourceName + " ended before entry " + entry);
    }

    string[] values = line.Split('\t');
    if (values.Length < 4)
    {
        throw new InvalidDataException(sourceName + " entry " + entry + " has fewer than 4 columns");
    }

    return new float3(
        float.Parse(values[1], System.Globalization.NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture),
        float.Parse(values[2], System.Globalization.NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture),
        float.Parse(values[3], System.Globalization.NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture));
}
/// <summary>
/// Creates a Direct3D 9Ex device on the first adapter for which CUDA reports interop
/// devices, creates the CUDA context on it, sets the projection and lighting state and
/// loads the kernels from "kernel.ptx". Closes the form when no such adapter exists.
/// </summary>
private void InitializeD3D()
{
    // Create the D3D object.
    d3d = new Direct3DEx();

    PresentParameters pp = new PresentParameters();
    pp.BackBufferWidth = 512;
    pp.BackBufferHeight = 512;
    pp.BackBufferFormat = Format.Unknown;
    pp.BackBufferCount = 0;
    pp.Multisample = MultisampleType.None;
    pp.MultisampleQuality = 0;
    pp.SwapEffect = SwapEffect.Discard;
    pp.DeviceWindowHandle = panel1.Handle;
    pp.Windowed = true;
    pp.EnableAutoDepthStencil = false;
    pp.AutoDepthStencilFormat = Format.Unknown;
    pp.PresentationInterval = PresentInterval.Default;

    bDeviceFound = false;
    CUdevice[] cudaDevices = null;
    for (g_iAdapter = 0; g_iAdapter < d3d.AdapterCount; g_iAdapter++)
    {
        device = new DeviceEx(d3d, d3d.Adapters[g_iAdapter].Adapter, DeviceType.Hardware, panel1.Handle, CreateFlags.HardwareVertexProcessing | CreateFlags.Multithreaded, pp);
        try
        {
            cudaDevices = CudaContext.GetDirectXDevices(device.ComPointer, CUd3dXDeviceList.All, CudaContext.DirectXVersion.D3D9);
            bDeviceFound = cudaDevices.Length > 0;
            Console.WriteLine("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 and CUDA.");
            break;
        }
        catch (CudaException)
        {
            // No Cuda device found for this Direct3D9 device
            Console.WriteLine("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 but not CUDA.");

            // Release this adapter's device before the next iteration overwrites the
            // field; otherwise every rejected adapter leaks a D3D device.
            device.Dispose();
            device = null;
        }
    }

    // Make sure we have found a cuda-compatible D3D device to work on.
    if (!bDeviceFound)
    {
        Console.WriteLine("No CUDA-compatible Direct3D9 device available");
        if (device != null)
        {
            device.Dispose();
            device = null;
        }

        Close();
        return;
    }

    ctx = new CudaContext(cudaDevices[0], device.ComPointer, CUCtxFlags.BlockingSync, CudaContext.DirectXVersion.D3D9);

    // Set projection matrix
    SlimDX.Matrix matProj = SlimDX.Matrix.OrthoOffCenterLH(0, 1, 1, 0, 0, 1);
    device.SetTransform(TransformState.Projection, matProj);

    // Turn off D3D lighting, since we are providing our own vertex colors
    device.SetRenderState(RenderState.Lighting, false);

    // Load kernels
    CUmodule module = ctx.LoadModulePTX("kernel.ptx");
    addForces_k = new CudaKernel("addForces_k", module, ctx);
    advectVelocity_k = new CudaKernel("advectVelocity_k", module, ctx);
    diffuseProject_k = new CudaKernel("diffuseProject_k", module, ctx);
    updateVelocity_k = new CudaKernel("updateVelocity_k", module, ctx);
    advectParticles_k = new CudaKernel("advectParticles_k", module, ctx);
}
/// <summary>
/// Creates a Direct3D 9 device (on a hidden window) on the first adapter for which CUDA
/// reports interop devices, creates the CUDA context on it, loads the kernels from
/// "kernel.ptx" and hands the device's back buffer to <c>d3dimage</c>.
/// </summary>
/// <returns><c>true</c> when a CUDA-compatible device was set up; otherwise <c>false</c>.</returns>
private bool InitializeD3D()
{
    // Hidden window that owns the device.
    HwndSource hwnd = new HwndSource(0, 0, 0, 0, 0, "null", IntPtr.Zero);

    // Create the D3D object.
    d3d = new Direct3D();

    PresentParameters pp = new PresentParameters();
    pp.BackBufferWidth = 512;
    pp.BackBufferHeight = 512;
    pp.BackBufferFormat = Format.Unknown;
    pp.BackBufferCount = 0;
    pp.Multisample = MultisampleType.None;
    pp.MultisampleQuality = 0;
    pp.SwapEffect = SwapEffect.Discard;
    pp.DeviceWindowHandle = (IntPtr)0;
    pp.Windowed = true;
    pp.EnableAutoDepthStencil = false;
    pp.AutoDepthStencilFormat = Format.Unknown;
    pp.PresentationInterval = PresentInterval.Default;

    bDeviceFound = false;
    CUdevice[] cudaDevices = null;
    for (g_iAdapter = 0; g_iAdapter < d3d.AdapterCount; g_iAdapter++)
    {
        device = new Device(d3d, d3d.Adapters[g_iAdapter].Adapter, DeviceType.Hardware, hwnd.Handle, CreateFlags.HardwareVertexProcessing | CreateFlags.Multithreaded, pp);
        try
        {
            cudaDevices = CudaContext.GetDirectXDevices(device.ComPointer, CUd3dXDeviceList.All, CudaContext.DirectXVersion.D3D9);
            bDeviceFound = cudaDevices.Length > 0;
            infoLog.AppendText("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 and CUDA.\n");
            break;
        }
        catch (CudaException)
        {
            // No Cuda device found for this Direct3D9 device
            infoLog.AppendText("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 but not CUDA.\n");

            // Release this adapter's device before the next iteration overwrites the
            // field; otherwise every rejected adapter leaks a D3D device.
            device.Dispose();
            device = null;
        }
    }

    // Make sure we have found a cuda-compatible D3D device to work on.
    if (!bDeviceFound)
    {
        infoLog.AppendText("No CUDA-compatible Direct3D9 device available");
        if (device != null)
        {
            device.Dispose();
            device = null;
        }

        // Nothing uses the hidden window once the device is gone.
        hwnd.Dispose();
        return false;
    }

    ctx = new CudaContext(cudaDevices[0], device.ComPointer, CUCtxFlags.BlockingSync, CudaContext.DirectXVersion.D3D9);
    deviceName.Text = "Device name: " + ctx.GetDeviceName();

    // Set projection matrix
    SlimDX.Matrix matProj = SlimDX.Matrix.OrthoOffCenterLH(0, 1, 1, 0, 0, 1);
    device.SetTransform(TransformState.Projection, matProj);

    // Turn off D3D lighting, since we are providing our own vertex colors
    device.SetRenderState(RenderState.Lighting, false);

    // Load kernels
    CUmodule module = ctx.LoadModulePTX("kernel.ptx");
    addForces_k = new CudaKernel("addForces_k", module, ctx);
    advectVelocity_k = new CudaKernel("advectVelocity_k", module, ctx);
    diffuseProject_k = new CudaKernel("diffuseProject_k", module, ctx);
    updateVelocity_k = new CudaKernel("updateVelocity_k", module, ctx);
    advectParticles_k = new CudaKernel("advectParticles_k", module, ctx);

    // Hand the device's back buffer to the D3DImage.
    d3dimage.Lock();
    Surface surf = device.GetBackBuffer(0, 0);
    d3dimage.SetBackBuffer(D3DResourceType.IDirect3DSurface9, surf.ComPointer);
    d3dimage.Unlock();
    surf.Dispose();

    // Setup the "real" frame rate counter.
    // The cuda counter only measures cuda runtime, not the overhead to actually
    // show the result via DirectX and WPF.
    realLastTick = Environment.TickCount;
    return true;
}
/// <summary>
/// Creates the Direct3D 9Ex object and a hardware device, looks for an adapter whose
/// device CUDA can interoperate with, creates the CUDA context on it, loads the
/// kernels from "kernel.ptx" and hands the device back buffer to <c>d3dimage</c>.
/// </summary>
/// <returns>
/// <c>true</c> when initialization completed; <c>false</c> when no CUDA-compatible
/// Direct3D9 device was found (a message is appended to <c>infoLog</c> in that case).
/// </returns>
private bool InitializeD3D()
{
    // Window source whose handle is passed to the DeviceEx constructor below.
    HwndSource hwnd = new HwndSource(0, 0, 0, 0, 0, "null", IntPtr.Zero);

    // Create the D3D object.
    d3d = new Direct3DEx();

    PresentParameters pp = new PresentParameters();
    pp.BackBufferWidth = 512;
    pp.BackBufferHeight = 512;
    pp.BackBufferFormat = Format.Unknown;
    pp.BackBufferCount = 0;
    pp.Multisample = MultisampleType.None;
    pp.MultisampleQuality = 0;
    pp.SwapEffect = SwapEffect.Discard;
    pp.DeviceWindowHandle = (IntPtr)0;
    pp.Windowed = true;
    pp.EnableAutoDepthStencil = false;
    pp.AutoDepthStencilFormat = Format.Unknown;
    pp.PresentationInterval = PresentInterval.Default;

    bDeviceFound = false;
    CUdevice[] cudaDevices = null;

    for (g_iAdapter = 0; g_iAdapter < d3d.AdapterCount; g_iAdapter++)
    {
        device = new DeviceEx(d3d, d3d.Adapters[g_iAdapter].Adapter, DeviceType.Hardware, hwnd.Handle, CreateFlags.HardwareVertexProcessing | CreateFlags.Multithreaded, pp);
        try
        {
            cudaDevices = CudaContext.GetDirectXDevices(device.ComPointer, CUd3dXDeviceList.All, CudaContext.DirectXVersion.D3D9);
            bDeviceFound = cudaDevices.Length > 0;
            // NOTE(review): if GetDirectXDevices returns an empty array without throwing,
            // the loop still stops here and the method reports failure below - confirm
            // whether the remaining adapters should be tried in that case.
            infoLog.AppendText("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 and CUDA.\n");
            break;
        }
        catch (CudaException)
        {
            //No Cuda device found for this Direct3D9 device
            infoLog.AppendText("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 but not CUDA.\n");

            // The next iteration overwrites the 'device' field, so release the rejected
            // device now; otherwise it would never be disposed.
            device.Dispose();
            device = null;
        }
    }

    // we check to make sure we have found a cuda-compatible D3D device to work on
    if (!bDeviceFound)
    {
        infoLog.AppendText("No CUDA-compatible Direct3D9 device available");
        if (device != null)
        {
            device.Dispose();
        }
        return false;
    }

    ctx = new CudaContext(cudaDevices[0], device.ComPointer, CUCtxFlags.BlockingSync, CudaContext.DirectXVersion.D3D9);
    deviceName.Text = "Device name: " + ctx.GetDeviceName();

    // Set projection matrix
    SlimDX.Matrix matProj = SlimDX.Matrix.OrthoOffCenterLH(0, 1, 1, 0, 0, 1);
    device.SetTransform(TransformState.Projection, matProj);

    // Turn off D3D lighting, since we are providing our own vertex colors
    device.SetRenderState(RenderState.Lighting, false);

    //Load kernels
    CUmodule module = ctx.LoadModulePTX("kernel.ptx");
    addForces_k = new CudaKernel("addForces_k", module, ctx);
    advectVelocity_k = new CudaKernel("advectVelocity_k", module, ctx);
    diffuseProject_k = new CudaKernel("diffuseProject_k", module, ctx);
    updateVelocity_k = new CudaKernel("updateVelocity_k", module, ctx);
    advectParticles_k = new CudaKernel("advectParticles_k", module, ctx);

    // Hand the device back buffer to the D3DImage. The finally block guarantees the
    // image is unlocked and the surface wrapper released even if a call in between throws.
    d3dimage.Lock();
    Surface surf = null;
    try
    {
        surf = device.GetBackBuffer(0, 0);
        d3dimage.SetBackBuffer(D3DResourceType.IDirect3DSurface9, surf.ComPointer);
    }
    finally
    {
        d3dimage.Unlock();
        if (surf != null)
        {
            surf.Dispose();
        }
    }

    //Setup the "real" frame rate counter.
    //The cuda counter only measures cuda runtime, not the overhead to actually
    //show the result via DirectX and WPF.
    realLastTick = Environment.TickCount;
    return true;
}
/// <summary>
/// Creates the Direct3D 9Ex object and a hardware device bound to <c>panel1</c>, looks
/// for an adapter whose device CUDA can interoperate with, creates the CUDA context on
/// it and loads the kernels from "kernel.ptx".
/// When no CUDA-compatible device is found, a message is written to the console and
/// <c>Close()</c> is called before returning.
/// </summary>
private void InitializeD3D()
{
    // Create the D3D object.
    d3d = new Direct3DEx();

    PresentParameters pp = new PresentParameters();
    pp.BackBufferWidth = 512;
    pp.BackBufferHeight = 512;
    pp.BackBufferFormat = Format.Unknown;
    pp.BackBufferCount = 0;
    pp.Multisample = MultisampleType.None;
    pp.MultisampleQuality = 0;
    pp.SwapEffect = SwapEffect.Discard;
    pp.DeviceWindowHandle = panel1.Handle;
    pp.Windowed = true;
    pp.EnableAutoDepthStencil = false;
    pp.AutoDepthStencilFormat = Format.Unknown;
    pp.PresentationInterval = PresentInterval.Default;

    bDeviceFound = false;
    CUdevice[] cudaDevices = null;

    for (g_iAdapter = 0; g_iAdapter < d3d.AdapterCount; g_iAdapter++)
    {
        device = new DeviceEx(d3d, d3d.Adapters[g_iAdapter].Adapter, DeviceType.Hardware, panel1.Handle, CreateFlags.HardwareVertexProcessing | CreateFlags.Multithreaded, pp);
        try
        {
            cudaDevices = CudaContext.GetDirectXDevices(device.ComPointer, CUd3dXDeviceList.All, CudaContext.DirectXVersion.D3D9);
            bDeviceFound = cudaDevices.Length > 0;
            // NOTE(review): if GetDirectXDevices returns an empty array without throwing,
            // the loop still stops here and the method closes the form below - confirm
            // whether the remaining adapters should be tried in that case.
            Console.WriteLine("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 and CUDA.");
            break;
        }
        catch (CudaException)
        {
            //No Cuda device found for this Direct3D9 device
            Console.WriteLine("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 but not CUDA.");

            // The next iteration overwrites the 'device' field, so release the rejected
            // device now; otherwise it would never be disposed.
            device.Dispose();
            device = null;
        }
    }

    // we check to make sure we have found a cuda-compatible D3D device to work on
    if (!bDeviceFound)
    {
        Console.WriteLine("No CUDA-compatible Direct3D9 device available");
        if (device != null)
        {
            device.Dispose();
        }
        Close();
        return;
    }

    ctx = new CudaContext(cudaDevices[0], device.ComPointer, CUCtxFlags.BlockingSync, CudaContext.DirectXVersion.D3D9);

    // Set projection matrix
    SlimDX.Matrix matProj = SlimDX.Matrix.OrthoOffCenterLH(0, 1, 1, 0, 0, 1);
    device.SetTransform(TransformState.Projection, matProj);

    // Turn off D3D lighting, since we are providing our own vertex colors
    device.SetRenderState(RenderState.Lighting, false);

    //Load kernels
    CUmodule module = ctx.LoadModulePTX("kernel.ptx");
    addForces_k = new CudaKernel("addForces_k", module, ctx);
    advectVelocity_k = new CudaKernel("advectVelocity_k", module, ctx);
    diffuseProject_k = new CudaKernel("diffuseProject_k", module, ctx);
    updateVelocity_k = new CudaKernel("updateVelocity_k", module, ctx);
    advectParticles_k = new CudaKernel("advectParticles_k", module, ctx);
}
/// <summary>
/// Sets up the OpenGL render control inside <c>panel1</c>, creates the CUDA/OpenGL
/// context, loads the kernels from "kernel.ptx", allocates the velocity fields, the
/// particle array, the FFT plans and the shared vertex buffer, then performs the
/// first reshape and display.
/// </summary>
private void initGLAndCuda()
{
    // Render target control hosted by panel1.
    m_renderControl = new OpenTK.GLControl(GraphicsMode.Default, 1, 0, GraphicsContextFlags.Default);
    m_renderControl.Dock = DockStyle.Fill;
    m_renderControl.BackColor = Color.White;
    m_renderControl.BorderStyle = BorderStyle.FixedSingle;
    m_renderControl.KeyDown += m_renderControl_KeyDown;
    m_renderControl.MouseMove += m_renderControl_MouseMove;
    m_renderControl.MouseDown += m_renderControl_MouseDown;
    m_renderControl.SizeChanged += m_renderControl_SizeChanged;
    panel1.Controls.Add(m_renderControl);

    Console.WriteLine(" OpenGL device is Available");

    // CUDA context on the device reported by GetMaxGflopsDeviceId.
    int gpuOrdinal = CudaContext.GetMaxGflopsDeviceId();
    ctx = CudaContext.CreateOpenGLContext(gpuOrdinal, CUCtxFlags.BlockingSync);
    Console.WriteLine(
        "CUDA device [{0}] has {1} Multi-Processors",
        ctx.GetDeviceName(),
        ctx.GetDeviceInfo().MultiProcessorCount);

    // Kernels all come from the same PTX module.
    CUmodule kernelModule = ctx.LoadModulePTX("kernel.ptx");
    addForces_k = new CudaKernel("addForces_k", kernelModule, ctx);
    advectVelocity_k = new CudaKernel("advectVelocity_k", kernelModule, ctx);
    diffuseProject_k = new CudaKernel("diffuseProject_k", kernelModule, ctx);
    updateVelocity_k = new CudaKernel("updateVelocity_k", kernelModule, ctx);
    advectParticles_k = new CudaKernel("advectParticles_OGL", kernelModule, ctx);

    // Host velocity field and its pitched device copy; the pitch is kept in tPitch.
    hvfield = new cData[DS];
    dvfield = new CudaPitchedDeviceVariable<cData>(DIM, DIM);
    tPitch = dvfield.Pitch;
    dvfield.CopyToDevice(hvfield);

    vxfield = new CudaDeviceVariable<cData>(DS);
    vyfield = new CudaDeviceVariable<cData>(DS);

    // Particle array, filled by initParticles.
    particles = new cData[DS];
    initParticles(particles, DIM, DIM);

    // TODO: update kernels to use the new unpadded memory layout for perf
    // rather than the old FFTW-compatible layout
    planr2c = new CudaFFTPlan2D(DIM, DIM, cufftType.R2C, Compatibility.FFTWPadding);
    planc2r = new CudaFFTPlan2D(DIM, DIM, cufftType.C2R, Compatibility.FFTWPadding);

    // Upload the particles into a vertex buffer and verify the size GL reports for it.
    var requestedBytes = cData.SizeOf * DS;
    GL.GenBuffers(1, out vbo);
    GL.BindBuffer(BufferTarget.ArrayBuffer, vbo);
    GL.BufferData<cData>(BufferTarget.ArrayBuffer, new IntPtr(requestedBytes), particles, BufferUsageHint.DynamicDraw);

    int reportedBytes;
    GL.GetBufferParameter(BufferTarget.ArrayBuffer, BufferParameterName.BufferSize, out reportedBytes);
    if (reportedBytes != requestedBytes)
    {
        throw new Exception("Sizes don't match.");
    }
    GL.BindBuffer(BufferTarget.ArrayBuffer, 0);

    // Register the vertex buffer in the CUDA graphics interop collection.
    cuda_vbo_resource = new CudaGraphicsInteropResourceCollection();
    cuda_vbo_resource.Add(new CudaOpenGLBufferInteropResource(vbo, CUGraphicsRegisterFlags.None));

    // Texture reference "texref" bound to advectVelocity_k.
    texref = new CudaTextureArray2D(
        advectVelocity_k,
        "texref",
        CUAddressMode.Wrap,
        CUFilterMode.Linear,
        0,
        CUArrayFormat.Float,
        DIM,
        DIM,
        CudaArray2DNumChannels.Two);

    stopwatch = new CudaStopWatch(CUEventFlags.Default);

    reshape();
    isInit = true;
    display();
}
/// <summary>
/// Compiles <paramref name="method"/> and the given roots to LLVM IR, optionally wraps
/// the kernel in a thunk that supplies the thread ID argument, marks the resulting
/// function as a kernel in the "nvvm.annotations" metadata, lowers the module to PTX
/// and loads that PTX into <paramref name="context"/>.
/// </summary>
/// <param name="method">The method to compile as the kernel.</param>
/// <param name="memberRoots">Member roots passed to the content description.</param>
/// <param name="typeRoots">Type roots; type metadata is generated for each of them.</param>
/// <param name="threadIdParamIndex">
/// Index of the kernel's thread ID parameter; a negative value means no thunk is generated.
/// </param>
/// <param name="assembly">The assembly whose resolver supplies the type environment.</param>
/// <param name="context">The CUDA context used to query the compute capability and load the PTX.</param>
/// <returns>A <c>CudaModule</c> wrapping the loaded PTX module and the kernel's name.</returns>
private static async Task<CudaModule> CompileAsync(
    IMethod method,
    IEnumerable<ITypeMember> memberRoots,
    IEnumerable<IType> typeRoots,
    int threadIdParamIndex,
    ClrAssembly assembly,
    CudaContext context)
{
    // Work out which members have to be compiled.
    var contentDescription = await CreateContentDescriptionAsync(method, memberRoots, typeRoots, assembly);

    // Lower those members to LLVM IR; names are mangled with the Itanium scheme.
    var typeEnvironment = assembly.Resolver.TypeEnvironment;
    var nameMangler = new ItaniumMangler(typeEnvironment);
    var moduleBuilder = LlvmBackend.Compile(contentDescription, typeEnvironment);
    var llvmModule = moduleBuilder.Module;

    // Emit type metadata for every type root.
    foreach (var rootType in typeRoots)
    {
        moduleBuilder.Metadata.GetMetadata(rootType, moduleBuilder);
    }

    // Look up the compiled kernel function by its mangled name.
    var compiledName = nameMangler.Mangle(method, true);
    var compiledFunc = LLVM.GetNamedFunction(llvmModule, compiledName);

    // Unless a thunk is generated below, the compiled function itself is the entry point.
    var entryPointName = compiledName;
    var entryPoint = compiledFunc;

    if (threadIdParamIndex >= 0)
    {
        // A thread ID parameter is present: generate a thunk kernel that computes the
        // thread ID and forwards it, together with its own parameters, to the real kernel.
        const string thunkName = "kernel";
        var targetSignature = compiledFunc.TypeOf().GetElementType();

        // The thunk takes the target's parameters minus the thread ID one.
        var thunkParamTypes = new List<LLVMTypeRef>(targetSignature.GetParamTypes());
        if (threadIdParamIndex < thunkParamTypes.Count)
        {
            thunkParamTypes.RemoveAt(threadIdParamIndex);
        }

        var thunkSignature = LLVM.FunctionType(
            targetSignature.GetReturnType(),
            thunkParamTypes.ToArray(),
            targetSignature.IsFunctionVarArg);
        var thunk = LLVM.AddFunction(llvmModule, thunkName, thunkSignature);

        using (var irBuilder = new IRBuilder(moduleBuilder.Context))
        {
            irBuilder.PositionBuilderAtEnd(thunk.AppendBasicBlock("entry"));

            // Forward the thunk's parameters with the computed thread ID spliced in.
            var forwardedArgs = new List<LLVMValueRef>(thunk.GetParams());
            forwardedArgs.Insert(threadIdParamIndex, ComputeUniqueThreadId(irBuilder, llvmModule));

            var forwardCall = irBuilder.CreateCall(compiledFunc, forwardedArgs.ToArray(), "");
            if (forwardCall.TypeOf().TypeKind != LLVMTypeKind.LLVMVoidTypeKind)
            {
                irBuilder.CreateRet(forwardCall);
            }
            else
            {
                irBuilder.CreateRetVoid();
            }
        }

        entryPointName = thunkName;
        entryPoint = thunk;
    }

    // Mark the entry point as a kernel symbol.
    var kernelTag = MDString("kernel");
    var int32Type = LLVM.Int32TypeInContext(LLVM.GetModuleContext(llvmModule));
    var kernelFlag = LLVM.ConstInt(int32Type, 1, false);
    var annotation = LLVM.MDNode(new LLVMValueRef[] { entryPoint, kernelTag, kernelFlag });
    LLVM.AddNamedMetadataOperand(llvmModule, "nvvm.annotations", annotation);

    // Debugging aid: LLVM.DumpModule(llvmModule);

    // Lower the LLVM IR to PTX.
    LLVMTargetMachineRef machine;
    var ptx = CompileToPtx(llvmModule, context.GetDeviceComputeCapability(), out machine);

    // Debugging aid: Console.WriteLine(System.Text.Encoding.UTF8.GetString(ptx));

    // Load the PTX and wrap everything in a CudaModule.
    var loadedPtx = context.LoadModulePTX(ptx);
    return new CudaModule(assembly, moduleBuilder, machine, loadedPtx, entryPointName, context);
}