Ejemplo n.º 1
        /// <summary>
        /// Creates the OpenGL render control, creates the CUDA context through
        /// <c>CudaContext.CreateOpenGLContext</c>, and allocates the kernels, device
        /// buffers, FFT plans, vertex buffer, interop resource, texture and stopwatch
        /// used by the rest of the class. Sets <c>isInit</c> as its last step.
        /// </summary>
        /// <exception cref="Exception">
        /// Thrown when the size OpenGL reports for the vertex buffer differs from
        /// the size that was requested.
        /// </exception>
        private void initGLAndCuda()
        {
            //Create render target control
            m_renderControl = new OpenTK.GLControl(GraphicsMode.Default, 1, 0, GraphicsContextFlags.Default);
            m_renderControl.Dock = DockStyle.Fill;
            m_renderControl.BackColor = Color.White;
            m_renderControl.BorderStyle = BorderStyle.FixedSingle;
            m_renderControl.KeyDown += new KeyEventHandler(m_renderControl_KeyDown);
            m_renderControl.MouseMove += new MouseEventHandler(m_renderControl_MouseMove);
            m_renderControl.MouseDown += new MouseEventHandler(m_renderControl_MouseDown);
            m_renderControl.SizeChanged += new EventHandler(m_renderControl_SizeChanged);

            Console.WriteLine("   OpenGL device is Available");

            int deviceID = CudaContext.GetMaxGflopsDeviceId();

            ctx = CudaContext.CreateOpenGLContext(deviceID, CUCtxFlags.BlockingSync);

            // The device summary used to be formatted and then dropped; write it out
            // next to the OpenGL availability message.
            string deviceSummary = string.Format("CUDA device [{0}] has {1} Multi-Processors", ctx.GetDeviceName(), ctx.GetDeviceInfo().MultiProcessorCount);
            Console.WriteLine(deviceSummary);

            CUmodule module = ctx.LoadModulePTX("kernel.ptx");

            addForces_k = new CudaKernel("addForces_k", module, ctx);
            advectVelocity_k = new CudaKernel("advectVelocity_k", module, ctx);
            diffuseProject_k = new CudaKernel("diffuseProject_k", module, ctx);
            updateVelocity_k = new CudaKernel("updateVelocity_k", module, ctx);
            // The field is named advectParticles_k but the PTX entry point is "advectParticles_OGL".
            advectParticles_k = new CudaKernel("advectParticles_OGL", module, ctx);

            hvfield = new cData[DS];
            dvfield = new CudaPitchedDeviceVariable<cData>(DIM, DIM);
            tPitch = dvfield.Pitch;

            vxfield = new CudaDeviceVariable<cData>(DS);
            vyfield = new CudaDeviceVariable<cData>(DS);

            // Create particle array
            particles = new cData[DS];
            initParticles(particles, DIM, DIM);

            // TODO: update kernels to use the new unpadded memory layout for perf
            // rather than the old FFTW-compatible layout
            planr2c = new CudaFFTPlan2D(DIM, DIM, cufftType.R2C, Compatibility.FFTWPadding);
            planc2r = new CudaFFTPlan2D(DIM, DIM, cufftType.C2R, Compatibility.FFTWPadding);

            // Upload the particles into a vertex buffer and verify that the buffer
            // really has the requested size before registering it with CUDA.
            var expectedBytes = cData.SizeOf * DS;

            GL.GenBuffers(1, out vbo);
            GL.BindBuffer(BufferTarget.ArrayBuffer, vbo);
            GL.BufferData<cData>(BufferTarget.ArrayBuffer, new IntPtr(expectedBytes), particles, BufferUsageHint.DynamicDraw);
            int bsize;
            GL.GetBufferParameter(BufferTarget.ArrayBuffer, BufferParameterName.BufferSize, out bsize);

            if (bsize != expectedBytes)
            {
                throw new Exception("Sizes don't match.");
            }

            GL.BindBuffer(BufferTarget.ArrayBuffer, 0);

            cuda_vbo_resource = new CudaGraphicsInteropResourceCollection();
            cuda_vbo_resource.Add(new CudaOpenGLBufferInteropResource(vbo, CUGraphicsRegisterFlags.None));

            texref = new CudaTextureArray2D(advectVelocity_k, "texref", CUAddressMode.Wrap, CUFilterMode.Linear, 0, CUArrayFormat.Float, DIM, DIM, CudaArray2DNumChannels.Two);

            stopwatch = new CudaStopWatch(CUEventFlags.Default);

            isInit = true;
        }
Ejemplo n.º 2
        /// <summary>
        /// Creates a CUDA context on the given device, logs its compute capability,
        /// loads the collision, cz and elem PTX modules and creates one kernel object
        /// per entry point. Every kernel gets the same block dimensions
        /// (<c>block_size</c>, 1, 1).
        /// </summary>
        /// <param name="deviceID">Index of the CUDA device to create the context on.</param>
        public GPU_Functionality(int deviceID = 0)
        {
            ctx = new CudaContext(deviceID);
            version = ctx.GetDeviceComputeCapability();
            Trace.WriteLine($"cuda compute capability {version.Major}.{version.Minor}");

            // collision
            CUmodule collisionModule = ctx.LoadModulePTX("collision_kernels.ptx");

            kNarrowPhase = new CudaKernel("kNarrowPhase_new", collisionModule, ctx);
            kFindClosestFace = new CudaKernel("kFindClosestFace", collisionModule, ctx);
            kCollisionResponseForce = new CudaKernel("kCollisionResponseForce", collisionModule, ctx);

            // Shared block dimensions for all kernels created in this constructor.
            dim3 sharedBlock = new dim3(block_size, 1, 1);

            kNarrowPhase.BlockDimensions = sharedBlock;
            kFindClosestFace.BlockDimensions = sharedBlock;
            kCollisionResponseForce.BlockDimensions = sharedBlock;

            // cz
            CUmodule czModule = ctx.LoadModulePTX("cz_kernels.ptx");
            kczCZForce = new CudaKernel("kczCZForce", czModule, ctx)
            {
                BlockDimensions = sharedBlock
            };

            // elem
            CUmodule elemModule = ctx.LoadModulePTX("elem_kernels.ptx");
            kelElementElasticityForce = new CudaKernel("kelElementElasticityForce", elemModule, ctx)
            {
                BlockDimensions = sharedBlock
            };
        }
Ejemplo n.º 3
        /// <summary>
        /// Form load handler: creates the primary CUDA context, loads the PRelu,
        /// DeBayer and ImageColorProcessing PTX modules, creates the kernel wrappers,
        /// allocates the random-state buffer and the working tile, and selects the
        /// first ISO entry in the combo box.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        private void Form1_Load(object sender, EventArgs e)
        {
            ctx = new PrimaryContext();

            modPRelu = ctx.LoadModulePTX("PRelu.ptx");
            modDeBayer = ctx.LoadModulePTX("DeBayer.ptx");
            modColor = ctx.LoadModulePTX("ImageColorProcessing.ptx");

            // Note the argument order differs between wrappers: some take (ctx, module),
            // others (module, ctx).
            createBayerKernel = new CreateBayerWithNoiseKernel(ctx, modDeBayer);
            deBayerGreenKernel = new DeBayerGreenKernel(modDeBayer, ctx);
            deBayerRedBlueKernel = new DeBayerRedBlueKernel(modDeBayer, ctx);
            setupCurandKernel = new SetupCurandKernel(ctx, modDeBayer);
            highlightRecoveryKernel = new HighlightRecoveryKernel(modColor, ctx);
            camToXYZKernel = new ConvertCamToXYZKernel(modColor, ctx);
            convertRGBTosRGBKernel = new ConvertRGBTosRGBKernel(modColor, ctx);

            //constant variable is set for the entire module!
            createBayerKernel.BayerPattern = new[]
            {
                BayerColor.Red, BayerColor.Green,
                BayerColor.Green, BayerColor.Blue
            };

            //If you do not have CUDNN, set the last parameter to false (use NPP instead)
            denoiseAndDemoisaic = new DenoiseAndDemoisaic(TileSize, ctx, modPRelu, true);

            //one state has the size of 48 bytes
            CuRandStates = new CudaDeviceVariable<byte>(TileSize * TileSize * 48);
            setupCurandKernel.RunSafe(CuRandStates, TileSize * TileSize);

            tile = new NPPImage_32fC3(TileSize, TileSize);
            cmb_IsoValue.SelectedIndex = 0;
        }
Ejemplo n.º 4
 /// <summary>
 /// Creates the cuBLAS handle and a CUDA context, loads the PTX module named by
 /// <c>kernalFile</c>, and stores the integer square root of the device's
 /// maximum threads per block in <c>maxThreadPerBlockDim</c>.
 /// </summary>
 public GpuMathOperations()
 {
     cuBlas = new CudaBlas();
     cudaContext = new CudaContext();
     cuModule = cudaContext.LoadModulePTX(kernalFile);

     // Truncated square root of the per-block thread limit.
     var threadLimit = cudaContext.GetDeviceInfo().MaxThreadsPerBlock;
     maxThreadPerBlockDim = (int)Math.Sqrt(threadLimit);
 }
Ejemplo n.º 5
        /// <summary>
        /// Creates the CUDA context, loads the PTX module from its fixed location on
        /// disk and binds the matrix-sum kernel by its mangled entry-point name.
        /// </summary>
        static void InitKernels()
        {
            // NOTE(review): absolute, machine-specific path; the module only loads
            // where this exact directory exists.
            const string ptxPath = @"C:\work\Sobel\CudaTest\x64\Debug\kernel.ptx";
            const string entryPoint = "_Z15matrixSumKernelPdPKdiii";

            cntxt = new CudaContext();
            CUmodule kernelModule = cntxt.LoadModulePTX(ptxPath);

            matrixSumCude = new CudaKernel(entryPoint, kernelModule, cntxt);
        }
Ejemplo n.º 6
        /// <summary>
        /// Loads a CUDA module for every entry in <c>sourceCodes</c> and records the
        /// elapsed time under the "JIT" sample. Entries without PTX text are loaded
        /// from their cubin image; all others are loaded from PTX.
        /// </summary>
        public void JITcompile()
        {
            foreach (var sc in sourceCodes)
            {
                // Exactly one of the two loads must run per entry. Without the else
                // branch the cubin module would be replaced by a load of the null ptx.
                if (sc.ptx == null)
                {
                    sc.mod = ctx.LoadModulePTX(sc.cubin, null, null);  // no jit, just upload
                }
                else
                {
                    sc.mod = ctx.LoadModulePTX(sc.ptx); //jit
                }
            }

            SampleCollector.Collect("JIT", toc());
        }
Ejemplo n.º 7
        /// <summary>
        /// Creates a CUDA context on the device returned by
        /// <c>CudaContext.GetMaxGflopsDeviceId()</c>, reads the PTX image embedded in
        /// this assembly's resources, loads it as a module and creates a kernel object
        /// for every mangled entry point. Finally uploads the <c>det_indices</c> and
        /// <c>inv_indices</c> constants through both GMMFinalize kernels.
        /// </summary>
        /// <exception cref="ArgumentException">The PTX resource is not embedded in the assembly.</exception>
        /// <exception cref="EndOfStreamException">The resource stream ended before its reported length was read.</exception>
        public GrabCutGMM()
        {
            ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId(), false);

            //Load Kernel image from resources
            // 64-bit processes use the _x64 image, everything else the plain one.
            string resName = IntPtr.Size == 8 ? "GrabCutGMM_x64.ptx" : "GrabCutGMM.ptx";

            string resNamespace = "GrabCutNPP";
            string resource = resNamespace + "." + resName;

            byte[] kernel;
            using (Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource))
            {
                if (stream == null)
                {
                    throw new ArgumentException("Kernel not found in resources.");
                }

                kernel = new byte[stream.Length];

                // Stream.Read may return fewer bytes than requested, so loop until the
                // buffer is full. A return value of 0 means end of stream; stop there
                // instead of spinning forever.
                int offset = 0;
                while (offset < kernel.Length)
                {
                    int read = stream.Read(kernel, offset, kernel.Length - offset);
                    if (read == 0)
                    {
                        throw new EndOfStreamException("Kernel resource ended before all bytes were read.");
                    }

                    offset += read;
                }
            }

            CUmodule module = ctx.LoadModulePTX(kernel);

            GMMReductionKernelCreateGmmFlags   = new CudaKernel("_Z18GMMReductionKernelILi4ELb1EEviPfiPK6uchar4iPhiiiPj", module, ctx);
            GMMReductionKernelNoCreateGmmFlags = new CudaKernel("_Z18GMMReductionKernelILi4ELb0EEviPfiPK6uchar4iPhiiiPj", module, ctx);
            GMMFinalizeKernelInvertSigma       = new CudaKernel("_Z17GMMFinalizeKernelILi4ELb1EEvPfS0_ii", module, ctx);
            GMMFinalizeKernelNoInvertSigma     = new CudaKernel("_Z17GMMFinalizeKernelILi4ELb0EEvPfS0_ii", module, ctx);
            GMMcommonTerm   = new CudaKernel("_Z13GMMcommonTermiPfi", module, ctx);
            DataTermKernel  = new CudaKernel("_Z14DataTermKernelPiiiPKfiPK6uchar4iPKhiii", module, ctx);
            GMMAssignKernel = new CudaKernel("_Z15GMMAssignKerneliPKfiPK6uchar4iPhiii", module, ctx);
            GMMFindSplit    = new CudaKernel("_Z12GMMFindSplitP10GMMSplit_tiPfi", module, ctx);
            GMMDoSplit      = new CudaKernel("_Z10GMMDoSplitPK10GMMSplit_tiPfiPK6uchar4iPhiii", module, ctx);
            MeanEdgeStrengthReductionKernel = new CudaKernel("_Z31MeanEdgeStrengthReductionKerneliiPf", module, ctx);
            MeanEdgeStrengthFinalKernel     = new CudaKernel("_Z27MeanEdgeStrengthFinalKernelPfi", module, ctx);
            EdgeCuesKernel            = new CudaKernel("_Z14EdgeCuesKernelfPKfPiS1_S1_S1_S1_S1_S1_S1_iiii", module, ctx);
            SegmentationChangedKernel = new CudaKernel("_Z25SegmentationChangedKernelPiPhS0_iii", module, ctx);
            downscaleKernel1          = new CudaKernel("_Z18downscaleKernelBoxI6uchar4EvPT_iiiPKS1_iii", module, ctx);
            downscaleKernel2          = new CudaKernel("_Z18downscaleKernelMaxIhEvPT_iiiPKS0_iii", module, ctx);
            upsampleAlphaKernel       = new CudaKernel("_Z19upsampleAlphaKernelPhS_iiii", module, ctx);

            GMMFinalizeKernelInvertSigma.SetConstantVariable("det_indices", det_indices);
            GMMFinalizeKernelInvertSigma.SetConstantVariable("inv_indices", inv_indices);
            GMMFinalizeKernelNoInvertSigma.SetConstantVariable("det_indices", det_indices);
            GMMFinalizeKernelNoInvertSigma.SetConstantVariable("inv_indices", inv_indices);
        }
Ejemplo n.º 8
        /// <summary>
        /// Creates the CUDA context bound to the Direct3D 11 graphics device, loads
        /// <c>Kernels\kernel.ptx</c> and creates the kernel objects plus the prefix-scan
        /// helper from that module.
        /// </summary>
        private void InitializeCUDA()
        {
            context = new CudaContext(
                CudaContext.GetMaxGflopsDevice(),
                graphicsDevice.ComPointer,
                CUCtxFlags.SchedAuto,
                CudaContext.DirectXVersion.D3D11);

            module = context.LoadModulePTX(@"Kernels\kernel.ptx");

            // One kernel object per entry point of the module.
            kernelPositionWeightNoiseCube = new CudaKernel("position_weight_noise_cube", module, context);
            kernelNormalAmbient = new CudaKernel("normal_ambient", module, context);
            kernelMarchingCubesCases = new CudaKernel("marching_cubes_cases", module, context);
            kernelMarchingCubesVertices = new CudaKernel("marching_cubes_vertices", module, context);
            kernelPositionWeightNoiseCubeWarp = new CudaKernel("position_weight_noise_cube_warp", module, context);
            kernelPositionWeightFormula = new CudaKernel("position_weight_formula", module, context);

            prefixScan = new CUDAPrefixScan(module, context);
        }
Ejemplo n.º 9
        /// <summary>
        /// Loads <c>opticalFlow.ptx</c>, creates the kernel wrappers, and allocates
        /// the device images, the scratch buffer, the mean / standard-deviation
        /// outputs and the three 4-tap filter arrays.
        /// </summary>
        /// <param name="width">Width passed to every allocated NPP image.</param>
        /// <param name="height">Height passed to every allocated NPP image.</param>
        /// <param name="ctx">CUDA context used to load the module and create the kernels.</param>
        public OpticalFlow(int width, int height, CudaContext ctx)
        {
            CUmodule flowModule = ctx.LoadModulePTX("opticalFlow.ptx");

            warpingKernel = new WarpingKernel(ctx, flowModule);
            createFlowFieldFromTiles = new CreateFlowFieldFromTiles(ctx, flowModule);
            computeDerivativesKernel = new ComputeDerivativesKernel(ctx, flowModule);
            lukasKanade = new LukasKanadeKernel(ctx, flowModule);

            // Single-channel work images and the two-channel flow field.
            d_tmp = new NPPImage_32fC1(width, height);
            d_Ix = new NPPImage_32fC1(width, height);
            d_Iy = new NPPImage_32fC1(width, height);
            d_Iz = new NPPImage_32fC1(width, height);
            d_flow = new NPPImage_32fC2(width, height);

            // Scratch space: three times what d_tmp reports for MeanStdDev.
            var scratchSize = d_tmp.MeanStdDevGetBufferHostSize() * 3;
            buffer = new CudaDeviceVariable<byte>(scratchSize);
            mean = new CudaDeviceVariable<double>(1);
            std = new CudaDeviceVariable<double>(1);

            d_filterX = new[] { -0.25f, 0.25f, -0.25f, 0.25f };
            d_filterY = new[] { -0.25f, -0.25f, 0.25f, 0.25f };
            d_filterT = new[] { 0.25f, 0.25f, 0.25f, 0.25f };
        }
Ejemplo n.º 10
        /// <summary>
        /// Stores the tracking parameters, allocates one two-channel shift image per
        /// shift plus all device buffers sized from the tile and frame counts, loads
        /// <c>ShiftMinimizerKernels.ptx</c>, creates its kernel wrappers, and runs the
        /// <c>setPointers</c> kernel once over the allocated buffers.
        /// </summary>
        /// <param name="aFrameCount">Number of frames; stored in <c>frameCount</c>.</param>
        /// <param name="aMaxTileCountX">Tile count in X; used as image width and in all buffer sizes.</param>
        /// <param name="aMaxTileCountY">Tile count in Y; used as image height and in all buffer sizes.</param>
        /// <param name="aReferenceIndex">Stored in <c>referenceIndex</c>.</param>
        /// <param name="aStrategy">Stored in <c>strategy</c>.</param>
        /// <param name="aBlockSize">Requested block size; limited to <paramref name="aFrameCount"/> - 1.</param>
        /// <param name="ctx">CUDA context used to load the module and create the kernels.</param>
        /// <exception cref="Exception">The number of shift pairs differs from <c>GetShiftCount()</c>.</exception>
        public ShiftCollection(int aFrameCount, int aMaxTileCountX, int aMaxTileCountY, int aReferenceIndex, TrackingStrategy aStrategy, int aBlockSize, CudaContext ctx)
        {
            strategy       = aStrategy;
            referenceIndex = aReferenceIndex;
            frameCount     = aFrameCount;

            // Limit the block size to aFrameCount - 1. The two assignments must be
            // alternatives; run back to back, the second always overwrote the limit.
            blockSize = aBlockSize >= aFrameCount ? aFrameCount - 1 : aBlockSize;

            blas = new CudaBlas(PointerMode.Device, AtomicsMode.Allowed);
            one  = 1.0f;
            zero = 0.0f;

            shiftPairs = new List<ShiftPair>();
            int shiftCount = GetShiftCount();

            // NOTE(review): nothing visible here adds entries to shiftPairs, so this check
            // throws whenever shiftCount is non-zero. The code that fills the list
            // (presumably depending on strategy) appears to be missing - restore it
            // above this check.
            if (shiftPairs.Count != shiftCount)
            {
                throw new Exception("Ooups, something went wrong with my math...");
            }

            shifts = new List<NPPImage_32fC2>(shiftCount);

            int[]         shiftPitches_h = new int[shiftCount];
            CUdeviceptr[] ptrList        = new CUdeviceptr[shiftCount];
            for (int i = 0; i < shiftCount; i++)
            {
                NPPImage_32fC2 devVar = new NPPImage_32fC2(aMaxTileCountX, aMaxTileCountY);

                // Keep the image referenced: its pitch and device pointer are handed out
                // below, and the list was created with exactly this capacity.
                // NOTE(review): presumably the original stored it here - confirm.
                shifts.Add(devVar);

                shiftPitches_h[i] = devVar.Pitch;
                ptrList[i]        = devVar.DevicePointer;
            }

            // Number of tiles; every per-tile buffer below is a multiple of this.
            int tileCount = aMaxTileCountX * aMaxTileCountY;

            shiftPitches     = shiftPitches_h;
            AllShifts_d      = new CudaDeviceVariable<float2>(tileCount * shiftCount);
            shiftsOneToOne_d = new CudaDeviceVariable<float2>(tileCount * (frameCount - 1));
            shifts_d         = ptrList;

            status               = new CudaDeviceVariable<int>(tileCount);
            infoInverse          = new CudaDeviceVariable<int>(tileCount);
            shiftMatrixArray     = new CudaDeviceVariable<CUdeviceptr>(tileCount);
            shiftMatrixSafeArray = new CudaDeviceVariable<CUdeviceptr>(tileCount);
            matrixSquareArray    = new CudaDeviceVariable<CUdeviceptr>(tileCount);
            matrixInvertedArray  = new CudaDeviceVariable<CUdeviceptr>(tileCount);
            solvedMatrixArray    = new CudaDeviceVariable<CUdeviceptr>(tileCount);
            shiftOneToOneArray   = new CudaDeviceVariable<CUdeviceptr>(tileCount);
            shiftMeasuredArray   = new CudaDeviceVariable<CUdeviceptr>(tileCount);
            shiftOptimArray      = new CudaDeviceVariable<CUdeviceptr>(tileCount);
            shiftMatrices        = new CudaDeviceVariable<float>(tileCount * shiftCount * (frameCount - 1));
            shiftSafeMatrices    = new CudaDeviceVariable<float>(tileCount * shiftCount * (frameCount - 1));
            matricesSquared      = new CudaDeviceVariable<float>(tileCount * (frameCount - 1) * (frameCount - 1));
            matricesInverted     = new CudaDeviceVariable<float>(tileCount * (frameCount - 1) * (frameCount - 1));
            solvedMatrices       = new CudaDeviceVariable<float>(tileCount * shiftCount * (frameCount - 1));
            shiftsOneToOne       = new CudaDeviceVariable<float2>(tileCount * (frameCount - 1));
            pivotArray           = new CudaDeviceVariable<int>(tileCount * (frameCount - 1));
            shiftsMeasured       = new CudaDeviceVariable<float2>(tileCount * shiftCount);
            shiftsOptim          = new CudaDeviceVariable<float2>(tileCount * shiftCount);
            buffer               = new CudaDeviceVariable<byte>(status.SumGetBufferSize());
            statusSum            = new CudaDeviceVariable<int>(1);

            CUmodule mod = ctx.LoadModulePTX("ShiftMinimizerKernels.ptx");

            concatenateShifts     = new concatenateShiftsKernel(ctx, mod);
            separateShifts        = new separateShiftsKernel(ctx, mod);
            getOptimalShifts      = new getOptimalShiftsKernel(ctx, mod);
            copyShiftMatrixKernel = new copyShiftMatrixKernel(ctx, mod);
            setPointers           = new setPointersKernel(ctx, mod);
            checkForOutliers      = new checkForOutliersKernel(ctx, mod);
            transposeShifts       = new transposeShiftsKernel(ctx, mod);

            // Run once with the pointer arrays, the backing buffers and the counts.
            setPointers.RunSafe(shiftMatrixArray, shiftMatrixSafeArray, matrixSquareArray, matrixInvertedArray, solvedMatrixArray,
                                shiftOneToOneArray, shiftMeasuredArray, shiftOptimArray, shiftMatrices, shiftSafeMatrices, matricesSquared,
                                matricesInverted, solvedMatrices, shiftsOneToOne, shiftsMeasured, shiftsOptim, tileCount, frameCount, shiftCount);
        }

Ejemplo n.º 11
        /// <summary>
        /// Entry point of the training / validation tool. Parses the command line,
        /// creates the CUDA context, loads the PTX modules and kernels, builds the list
        /// of training files, defines the network layers, and then either validates
        /// stored networks (when <c>crosscheck</c> is set) or runs the training loop.
        /// </summary>
        /// <param name="args">
        /// Command-line switches read below: -d, -lr, -iso, -t, -w, -s.
        /// </param>
        // NOTE(review): this listing has lost its braces and at least some `else` / `try`
        // lines, so the statement nesting shown by the indentation is not enforced by
        // the code as written. The notes below mark the places where that matters.
        static void Main(string[] args)
            //Read CL arguments
            // Switches that take a value read it from the next argument via args[++i];
            // no bounds check is visible for a switch given as the last argument.
            for (int i = 0; i < args.Length; i++)
                if (args[i] == "-d")
                    deviceID = int.Parse(args[++i]);
                if (args[i] == "-lr")
                    learning_rate = double.Parse(args[++i], System.Globalization.NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture);
                if (args[i] == "-iso")
                    ISO = args[++i];
                if (args[i] == "-t")
                    crosscheck = true;
                if (args[i] == "-w")
                    warmStart = int.Parse(args[++i]);
                    Console.WriteLine("Start with epoch " + warmStart);
                if (args[i] == "-s")
                    saveImages = true;

            Console.WriteLine("Using device ID: " + deviceID);
            Console.WriteLine("Learning rate: " + learning_rate);

            //Init Cuda stuff
            ctx = new PrimaryContext(deviceID);
            Console.WriteLine("Context created");
            CUmodule modPatch = ctx.LoadModulePTX("PatchProcessing.ptx");

            Console.WriteLine("modPatch loaded");
            CUmodule modBorder = ctx.LoadModulePTX("BorderTreatment.ptx");

            Console.WriteLine("modBorder loaded");
            CUmodule modError = ctx.LoadModulePTX("ErrorComputation.ptx");

            Console.WriteLine("modError loaded");
            CUmodule modPRelu = ctx.LoadModulePTX("PRelu.ptx");

            Console.WriteLine("modPRelu loaded");
            CUmodule modDeBayer = ctx.LoadModulePTX("DeBayer.ptx");

            Console.WriteLine("all modules loaded");
            deBayerGreenKernel   = new DeBayerGreenKernel(modDeBayer, ctx);
            deBayerRedBlueKernel = new DeBayerRedBlueKernel(modDeBayer, ctx);
            //Both deBayer kernels are load from the same module: setting the constant variable for bayer pattern one is enough...
            deBayerGreenKernel.BayerPattern = new BayerColor[] { BayerColor.Red, BayerColor.Green, BayerColor.Green, BayerColor.Blue };

            prepareDataKernel  = new PrepareDataKernel(modPatch, ctx);
            restoreImageKernel = new RestoreImageKernel(modPatch, ctx);
            Console.WriteLine("kernels loaded");

            // Hard-coded number of files in each of the two datasets.
            int countOwn = 468083;
            int count5k  = 33408;

            string fileBase = @"/ssd/data/TrainingsDataNN/";

            // NOTE(review): the two FileStreams / StreamReaders are not closed or disposed
            // anywhere in the visible code.
            List <float3> WhiteBalanceFactors = new List <float3>();
            FileStream    fs1 = new FileStream(fileBase + "FromOwnDataset/WhiteBalancesOwn.txt", FileMode.Open, FileAccess.Read);
            FileStream    fs2 = new FileStream(fileBase + "From5kDataset/WhiteBalances5k.txt", FileMode.Open, FileAccess.Read);
            StreamReader  sr1 = new StreamReader(fs1);
            StreamReader  sr2 = new StreamReader(fs2);

            // One raw file, one ground-truth file and one tab-separated white-balance
            // line (columns 1..3) per image of the own dataset.
            // NOTE(review): `wb` is parsed but no visible statement adds it to
            // WhiteBalanceFactors, although that list is indexed later - a line such as
            // WhiteBalanceFactors.Add(wb) appears to be missing in both loops.
            for (int i = 0; i < countOwn; i++)
                fileRawList.Add(fileBase + "FromOwnDataset/ISO" + ISO + "/img_" + i.ToString("0000000") + ".bin");
                fileTrouthList.Add(fileBase + "FromOwnDataset/GroundTruth/img_" + i.ToString("0000000") + ".bin");

                string   line   = sr1.ReadLine();
                string[] values = line.Split('\t');
                float3   wb     = new float3(float.Parse(values[1], System.Globalization.NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture),
                                             float.Parse(values[2], System.Globalization.NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture),
                                             float.Parse(values[3], System.Globalization.NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture));

            // Same for the 5k dataset.
            for (int i = 0; i < count5k; i++)
                fileRawList.Add(fileBase + "From5kDataset/ISO" + ISO + "/img_" + i.ToString("0000000") + ".bin");
                fileTrouthList.Add(fileBase + "From5kDataset/GroundTruth/img_" + i.ToString("0000000") + ".bin");

                string   line   = sr2.ReadLine();
                string[] values = line.Split('\t');
                float3   wb     = new float3(float.Parse(values[1], System.Globalization.NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture),
                                             float.Parse(values[2], System.Globalization.NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture),
                                             float.Parse(values[3], System.Globalization.NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture));


            // Per-file caches, filled on first read inside the loops further down.
            baOriginal = new float3[countOwn + count5k][];
            baRAW      = new float[countOwn + count5k][];

            // Fixed seed: the shuffle below produces the same order on every run.
            Random rand = new Random(0);

            //random order for the image patches
            // Swaps entry i with a randomly chosen entry at index >= i and applies the
            // same swap to all three lists so they stay aligned.
            for (int i = 0; i < countOwn + count5k - 1; i++)
                int    r    = i + (rand.Next() % (countOwn + count5k - i));
                string temp = fileRawList[i];
                fileRawList[i] = fileRawList[r];
                fileRawList[r] = temp;

                temp = fileTrouthList[i];
                fileTrouthList[i] = fileTrouthList[r];
                fileTrouthList[r] = temp;

                float3 tempf = WhiteBalanceFactors[i];
                WhiteBalanceFactors[i] = WhiteBalanceFactors[r];
                WhiteBalanceFactors[r] = tempf;

            Console.WriteLine("Initialization done!");

            // First 90% of the shuffled files are used for training, the rest for testing.
            int trainingSize = (int)((countOwn + count5k) * 0.9f); //4 patches per file
            int testSize     = fileRawList.Count - trainingSize;

            CudaBlas       blas  = new CudaBlas(PointerMode.Host);
            CudaDNNContext cudnn = new CudaDNNContext();

            int   patchSize     = 31;
            int   patchSize4    = 66; //Size of an 2x2 patch read from file
            int   batch         = 64;
            float normalization = 0.5f;

            //define neural network:
            // NOTE(review): no statement that links the layers to each other is visible
            // here; only their construction is shown.
            StartLayer         start = new StartLayer(patchSize, patchSize, 3, batch);
            FinalLayer         final = new FinalLayer(patchSize, patchSize, 3, batch, FinalLayer.Norm.Mix, ctx, modError);
            ConvolutionalLayer conv1 = new ConvolutionalLayer(patchSize, patchSize, 3, patchSize, patchSize, 64, batch, 9, 9, ConvolutionalLayer.Activation.PRelu, blas, cudnn, ctx, modBorder, modPRelu);
            ConvolutionalLayer conv2 = new ConvolutionalLayer(patchSize, patchSize, 64, patchSize, patchSize, 64, batch, 5, 5, ConvolutionalLayer.Activation.PRelu, blas, cudnn, ctx, modBorder, modPRelu);
            ConvolutionalLayer conv3 = new ConvolutionalLayer(patchSize, patchSize, 64, patchSize, patchSize, 3, batch, 5, 5, ConvolutionalLayer.Activation.None, blas, cudnn, ctx, modBorder, modPRelu);


            CudaDeviceVariable <float3> imgA = new CudaDeviceVariable <float3>(patchSize4 * patchSize4);
            CudaDeviceVariable <float3> imgB = new CudaDeviceVariable <float3>(patchSize4 * patchSize4);
            CudaDeviceVariable <float>  rawd = new CudaDeviceVariable <float>(patchSize4 * patchSize4);

            CudaDeviceVariable <float> inputImgs    = new CudaDeviceVariable <float>(patchSize * patchSize * 3 * batch);
            CudaDeviceVariable <float> groundTrouth = new CudaDeviceVariable <float>(patchSize * patchSize * 3 * batch);
            NPPImage_8uC3 imgU3a = new NPPImage_8uC3(patchSize, patchSize);
            NPPImage_8uC3 imgU3b = new NPPImage_8uC3(patchSize, patchSize);
            NPPImage_8uC3 imgU3c = new NPPImage_8uC3(patchSize, patchSize);

            Bitmap a = new Bitmap(patchSize, patchSize, PixelFormat.Format24bppRgb);
            Bitmap b = new Bitmap(patchSize, patchSize, PixelFormat.Format24bppRgb);
            Bitmap c = new Bitmap(patchSize, patchSize, PixelFormat.Format24bppRgb);

            Random randImageOutput = new Random(0);
            Random randForInit     = new Random(0);


            int startEpoch = warmStart;

            FileStream fs;

            //restore network in case of warm start:
            // NOTE(review): the stream is opened but nothing visible reads from or closes it.
            if (warmStart > 0)
                fs = new FileStream("epoch_" + learning_rate.ToString(CultureInfo.InvariantCulture) + "_" + ISO + "_" + (warmStart - 1) + ".cnn", FileMode.Open, FileAccess.Read);

            //validate results on validation data set
            if (crosscheck)
                FileStream   csvResult = new FileStream("results_" + learning_rate.ToString(CultureInfo.InvariantCulture) + "_" + ISO + ".csv", FileMode.Append, FileAccess.Write);
                StreamWriter sw        = new StreamWriter(csvResult);

                // Tries every stored epoch file from 0 to 1999.
                for (int i = 0; i < 2000; i += 1)
                    string filename = "epoch_" + learning_rate.ToString(CultureInfo.InvariantCulture) + "_" + ISO + "_" + i + ".cnn";
                    // NOTE(review): a `catch` follows without a visible `try`; the opened
                    // stream `cnn` is also not used or closed in the visible code.
                        FileStream cnn = new FileStream(filename, FileMode.Open, FileAccess.Read);
                    catch (Exception)
                        Console.WriteLine("Skipping: " + i);

                    double errorL1  = 0;
                    double errorL2  = 0;
                    double errorMix = 0;
                    for (int iter = 0; iter < testSize / batch * 4; iter++)
                        //Prepare batch for training:
                        for (int ba = 0; ba < batch / 4; ba++)
                            // Test files start after the training files in the shuffled lists.
                            int idx = iter * (batch / 4) + ba + trainingSize;

                            float3[] original;
                            float[]  raw;
                            // Read from disk on first use and cache; otherwise take the cached copy.
                            // NOTE(review): the `else` between the two groups of assignments is
                            // missing in this listing.
                            if (baRAW[idx - trainingSize] == null)
                                original = ReadRAWFloat3(fileTrouthList[idx]);
                                raw      = ReadRAWFloat(fileRawList[idx]);
                                baOriginal[idx - trainingSize] = original;
                                baRAW[idx - trainingSize]      = raw;
                                original = baOriginal[idx - trainingSize];
                                raw      = baRAW[idx - trainingSize];


                            // NOTE(review): no visible statement copies `original` / `raw` into
                            // imgA / rawd before the kernels run.
                            deBayerGreenKernel.RunSafe(rawd, imgB, patchSize4, new float3(0, 0, 0), WhiteBalanceFactors[idx]);
                            deBayerRedBlueKernel.RunSafe(rawd, imgB, patchSize4, new float3(0, 0, 0), WhiteBalanceFactors[idx]);
                            prepareDataKernel.RunSafe(imgA, imgB, groundTrouth, inputImgs, ba, normalization, WhiteBalanceFactors[idx]);


                        float err = start.InferenceTraining(inputImgs);

                        errorMix += err;
                        errorL1  += final.GetError(FinalLayer.Norm.L1);
                        errorL2  += final.GetError(FinalLayer.Norm.L2);
                    Console.WriteLine("Results for: " + filename);
                    Console.WriteLine("Mean Error L1: " + errorL1 / testSize * batch / 4);
                    Console.WriteLine("Mean Error L2: " + errorL2 / testSize * batch / 4);
                    Console.WriteLine("Mean Error Mix: " + errorMix / testSize * batch / 4);
                    // Replace(".", ",") turns the decimal point into a comma in the CSV output.
                    // NOTE(review): ToString() is culture-dependent and no separator is written
                    // between the three values in the visible code.
                    sw.Write((errorL1 / testSize * batch / 4).ToString().Replace(".", ","));
                    sw.Write((errorL2 / testSize * batch / 4).ToString().Replace(".", ","));
                    sw.Write((errorMix / testSize * batch / 4).ToString().Replace(".", ","));
            //or train existing network:
            // NOTE(review): the comment above implies an `else` branch of `if (crosscheck)`
            // that is not present in this listing.
                double error      = 0;
                double errorEpoch = 0;
                for (int epoch = startEpoch; epoch < 2000; epoch++)
                    errorEpoch = 0;
                    error      = 0;

                    for (int iter = 0; iter < trainingSize / batch * 4; iter++)
                        //Prepare batch for training:
                        for (int ba = 0; ba < batch / 4; ba++)
                            int idx = iter * (batch / 4) + ba;

                            float3[] original;
                            float[]  raw;
                            // Same read-or-use-cache pattern as in the validation loop; the
                            // `else` is missing here as well.
                            if (baRAW[idx] == null)
                                original        = ReadRAWFloat3(fileTrouthList[idx]);
                                raw             = ReadRAWFloat(fileRawList[idx]);
                                baOriginal[idx] = original;
                                baRAW[idx]      = raw;
                                original = baOriginal[idx];
                                raw      = baRAW[idx];


                            deBayerGreenKernel.RunSafe(rawd, imgB, patchSize4, new float3(0, 0, 0), WhiteBalanceFactors[idx]);
                            deBayerRedBlueKernel.RunSafe(rawd, imgB, patchSize4, new float3(0, 0, 0), WhiteBalanceFactors[idx]);
                            prepareDataKernel.RunSafe(imgA, imgB, groundTrouth, inputImgs, ba, normalization, WhiteBalanceFactors[idx]);


                        float err = start.InferenceTraining(inputImgs);


                        // The argument is the global iteration number across all epochs.
                        start.UpdateWeights(GetLearningRate(epoch * (trainingSize) / batch * 4 + iter));//*0+951342

                        error      += err;
                        errorEpoch += err;
                        // Every 1000 global iterations: append the mean error of the last 1000
                        // iterations and the current learning rate to the status CSV.
                        // NOTE(review): the writer is not flushed or closed in the visible code.
                        if ((epoch * trainingSize / batch * 4 + iter) % 1000 == 0 && iter != 0)
                            FileStream   status = new FileStream("status_" + learning_rate.ToString(CultureInfo.InvariantCulture) + "_" + ISO + ".csv", FileMode.Append, FileAccess.Write);
                            StreamWriter sw     = new StreamWriter(status);

                            sw.WriteLine((error / 1000.0).ToString().Replace(".", ",") + ";" + GetLearningRate(epoch * trainingSize / batch * 4 + iter).ToString().Replace(".", ","));

                            error = 0;

                        //if ((epoch * trainingSize / batch * 4 + iter) % 10000 == 0)
                        //    fs = new FileStream("iter_" + learning_rate.ToString(CultureInfo.InvariantCulture) + "_" + ISO + "_" + (epoch * trainingSize / batch * 4 + iter) + ".cnn", FileMode.Create, FileAccess.Write);
                        //    start.SaveValues(fs);
                        //    fs.Close();
                        //    fs.Dispose();
                        //    Console.WriteLine("Network saved for iteration " + (epoch * trainingSize / batch * 4 + iter) + "!");

                        Console.WriteLine("Epoch: " + epoch + " Iteration: " + (epoch * trainingSize / batch * 4 + iter) + ", Error: " + err);

                        // On the first iteration of each epoch, save one randomly picked sample
                        // as ground truth / input / result PNGs.
                        // NOTE(review): nothing visible copies imgU3a/b/c into the bitmaps a/b/c
                        // before they are saved.
                        if (saveImages && iter == 0)//(epoch * trainingSize / batch * 4 + iter) % 10000 == 0 &&
                            for (int i = 0; i < 1; i++)
                                int    imgidx = randImageOutput.Next(batch);
                                float3 wb     = WhiteBalanceFactors[iter * (batch / 4) + imgidx / 4];
                                restoreImageKernel.RunSafe(groundTrouth, imgU3a, imgidx, wb.x, wb.y, wb.z, normalization);
                                restoreImageKernel.RunSafe(inputImgs, imgU3b, imgidx, wb.x, wb.y, wb.z, normalization);
                                CudaDeviceVariable <float> res = final.GetResult();
                                restoreImageKernel.RunSafe(res, imgU3c, imgidx, wb.x, wb.y, wb.z, normalization);


                                a.Save("GroundTrouth_" + learning_rate.ToString(CultureInfo.InvariantCulture) + "_" + ISO + "_" + epoch + "_" + imgidx + ".png");// * trainingSize / batch * 4 + iter
                                b.Save("Input_" + learning_rate.ToString(CultureInfo.InvariantCulture) + "_" + ISO + "_" + epoch + "_" + imgidx + ".png");
                                c.Save("Result_" + learning_rate.ToString(CultureInfo.InvariantCulture) + "_" + ISO + "_" + epoch + "_" + imgidx + ".png");
                    // Mean error per iteration of this epoch, appended to the epoch CSV.
                    errorEpoch /= trainingSize / batch * 4;
                    fs          = new FileStream("errorEpoch_" + learning_rate.ToString(CultureInfo.InvariantCulture) + "_" + ISO + ".csv", FileMode.Append, FileAccess.Write);
                    StreamWriter sw2 = new StreamWriter(fs);
                    sw2.WriteLine(errorEpoch.ToString().Replace(".", ","));

                    // NOTE(review): the epoch file is created but no visible statement writes the
                    // network into it or closes it; `fs` is reassigned while sw2 still wraps
                    // the previous stream.
                    fs = new FileStream("epoch_" + learning_rate.ToString(CultureInfo.InvariantCulture) + "_" + ISO + "_" + epoch + ".cnn", FileMode.Create, FileAccess.Write);
Ejemplo n.º 12
        /// <summary>
        /// Creates the Direct3DEx object, walks the display adapters creating a hardware
        /// device on each and querying CUDA for matching devices, then creates the CUDA
        /// context on the selected device and loads the kernels from "kernel.ptx".
        /// </summary>
        /// <remarks>
        /// NOTE(review): this listing has no braces, the <c>catch</c> below has no visible
        /// <c>try</c>, and the body of the "no device found" path is not shown. Some lines
        /// appear to be missing; confirm the exact control flow against the original source.
        /// </remarks>
        private void InitializeD3D()
            // Create the D3D object.
            d3d = new Direct3DEx();

            // Presentation parameters: a 512x512 windowed swap chain presented into panel1,
            // with no multisampling and no automatically created depth/stencil buffer.
            PresentParameters pp = new PresentParameters();

            pp.BackBufferWidth        = 512;
            pp.BackBufferHeight       = 512;
            pp.BackBufferFormat       = Format.Unknown;
            pp.BackBufferCount        = 0;
            pp.Multisample            = MultisampleType.None;
            pp.MultisampleQuality     = 0;
            pp.SwapEffect             = SwapEffect.Discard;
            pp.DeviceWindowHandle     = panel1.Handle;
            pp.Windowed               = true;
            pp.EnableAutoDepthStencil = false;
            pp.AutoDepthStencilFormat = Format.Unknown;
            pp.PresentationInterval   = PresentInterval.Default;

            bDeviceFound = false;
            CUdevice[] cudaDevices = null;
            // For each adapter: create a hardware D3D device on it, then ask
            // CudaContext.GetDirectXDevices for the CUDA devices that go with that D3D device.
            // g_iAdapter is a member, so the adapter index stays available after the loop.
            for (g_iAdapter = 0; g_iAdapter < d3d.AdapterCount; g_iAdapter++)
                device = new DeviceEx(d3d, d3d.Adapters[g_iAdapter].Adapter, DeviceType.Hardware, panel1.Handle, CreateFlags.HardwareVertexProcessing | CreateFlags.Multithreaded, pp);
                    cudaDevices  = CudaContext.GetDirectXDevices(device.ComPointer, CUd3dXDeviceList.All, CudaContext.DirectXVersion.D3D9);
                    // The adapter counts as usable when at least one CUDA device was returned.
                    bDeviceFound = cudaDevices.Length > 0;
                    Console.WriteLine("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter
                                      + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 and CUDA.");
                // NOTE(review): presumably the loop stops at the first usable adapter; no such
                // statement is visible in this listing, so verify.
                catch (CudaException)
                    //No Cuda device found for this Direct3D9 device
                    Console.WriteLine("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter
                                      + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 but not CUDA.");

            // we check to make sure we have found a cuda-compatible D3D device to work on
            if (!bDeviceFound)
                Console.WriteLine("No CUDA-compatible Direct3D9 device available");
                if (device != null)
                // NOTE(review): the statement guarded by this null check, and whatever leaves
                // the method on failure, are not visible here. Confirm they exist, because the
                // code below indexes cudaDevices[0] unconditionally.

            // Create the CUDA context on the first CUDA device reported for the D3D device.
            ctx = new CudaContext(cudaDevices[0], device.ComPointer, CUCtxFlags.BlockingSync, CudaContext.DirectXVersion.D3D9);

            // Set projection matrix
            SlimDX.Matrix matProj = SlimDX.Matrix.OrthoOffCenterLH(0, 1, 1, 0, 0, 1);
            device.SetTransform(TransformState.Projection, matProj);

            // Turn off D3D lighting, since we are providing our own vertex colors
            device.SetRenderState(RenderState.Lighting, false);

            //Load kernels
            CUmodule module = ctx.LoadModulePTX("kernel.ptx");

            // One CudaKernel per entry point; the string is the kernel's name inside the module.
            addForces_k       = new CudaKernel("addForces_k", module, ctx);
            advectVelocity_k  = new CudaKernel("advectVelocity_k", module, ctx);
            diffuseProject_k  = new CudaKernel("diffuseProject_k", module, ctx);
            updateVelocity_k  = new CudaKernel("updateVelocity_k", module, ctx);
            advectParticles_k = new CudaKernel("advectParticles_k", module, ctx);
Ejemplo n.º 15
        /// <summary>
        /// Creates the CUDA context for <c>graphicsDevice</c> (Direct3D 11), loads the PTX
        /// module from "Kernels\kernel.ptx", and creates the kernel objects and the
        /// prefix-scan helper from that module.
        /// </summary>
        /// <remarks>
        /// NOTE(review): braces are absent from this listing; the statements below are
        /// assumed to form the whole method body.
        /// </remarks>
        private void InitializeCUDA()
            // The context is created on the device returned by GetMaxGflopsDevice() and is
            // given the D3D11 device's COM pointer.
            context = new CudaContext(CudaContext.GetMaxGflopsDevice(), graphicsDevice.ComPointer, CUCtxFlags.SchedAuto, CudaContext.DirectXVersion.D3D11);

            module = context.LoadModulePTX(@"Kernels\kernel.ptx");

            // One CudaKernel per entry point; the string is the kernel's name inside the module.
            kernelPositionWeightNoiseCube = new CudaKernel("position_weight_noise_cube", module, context);
            kernelNormalAmbient = new CudaKernel("normal_ambient", module, context);
            kernelMarchingCubesCases = new CudaKernel("marching_cubes_cases", module, context);
            kernelMarchingCubesVertices = new CudaKernel("marching_cubes_vertices", module, context);
            kernelPositionWeightNoiseCubeWarp = new CudaKernel("position_weight_noise_cube_warp", module, context);
            kernelPositionWeightFormula = new CudaKernel("position_weight_formula", module, context);

            // The prefix-scan helper is built from the same module and context.
            prefixScan = new CUDAPrefixScan(module, context);
Ejemplo n.º 16
Ejemplo n.º 17
        /// <summary>
        /// Creates the Direct3DEx object, walks the display adapters creating a hardware
        /// device on each and querying CUDA for matching devices, then creates the CUDA
        /// context on the selected device and loads the kernels from "kernel.ptx".
        /// </summary>
        /// <remarks>
        /// NOTE(review): this listing has no braces, the <c>catch</c> below has no visible
        /// <c>try</c>, and the body of the "no device found" path is not shown. Some lines
        /// appear to be missing; confirm the exact control flow against the original source.
        /// </remarks>
        private void InitializeD3D()
            // Create the D3D object.
            d3d = new Direct3DEx();

            // Presentation parameters: a 512x512 windowed swap chain presented into panel1,
            // with no multisampling and no automatically created depth/stencil buffer.
            PresentParameters pp = new PresentParameters();
            pp.BackBufferWidth = 512;
            pp.BackBufferHeight = 512;
            pp.BackBufferFormat = Format.Unknown;
            pp.BackBufferCount = 0;
            pp.Multisample = MultisampleType.None;
            pp.MultisampleQuality = 0;
            pp.SwapEffect = SwapEffect.Discard;
            pp.DeviceWindowHandle = panel1.Handle;
            pp.Windowed = true;
            pp.EnableAutoDepthStencil = false;
            pp.AutoDepthStencilFormat = Format.Unknown;
            pp.PresentationInterval = PresentInterval.Default;

            bDeviceFound = false;
            CUdevice[] cudaDevices = null;
            // For each adapter: create a hardware D3D device on it, then ask
            // CudaContext.GetDirectXDevices for the CUDA devices that go with that D3D device.
            // g_iAdapter is a member, so the adapter index stays available after the loop.
            for (g_iAdapter = 0; g_iAdapter < d3d.AdapterCount; g_iAdapter++)
                device = new DeviceEx(d3d, d3d.Adapters[g_iAdapter].Adapter, DeviceType.Hardware, panel1.Handle, CreateFlags.HardwareVertexProcessing | CreateFlags.Multithreaded, pp);
                    cudaDevices = CudaContext.GetDirectXDevices(device.ComPointer, CUd3dXDeviceList.All, CudaContext.DirectXVersion.D3D9);
                    // The adapter counts as usable when at least one CUDA device was returned.
                    bDeviceFound = cudaDevices.Length > 0;
                    Console.WriteLine("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter
                        + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 and CUDA.");
                // NOTE(review): presumably the loop stops at the first usable adapter; no such
                // statement is visible in this listing, so verify.
                catch (CudaException)
                    //No Cuda device found for this Direct3D9 device
                    Console.WriteLine("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter
                        + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 but not CUDA.");

            // we check to make sure we have found a cuda-compatible D3D device to work on
            if (!bDeviceFound)
                Console.WriteLine("No CUDA-compatible Direct3D9 device available");
                if (device != null)
                // NOTE(review): the statement guarded by this null check, and whatever leaves
                // the method on failure, are not visible here. Confirm they exist, because the
                // code below indexes cudaDevices[0] unconditionally.

            // Create the CUDA context on the first CUDA device reported for the D3D device.
            ctx = new CudaContext(cudaDevices[0], device.ComPointer, CUCtxFlags.BlockingSync, CudaContext.DirectXVersion.D3D9);

            // Set projection matrix
            SlimDX.Matrix matProj = SlimDX.Matrix.OrthoOffCenterLH(0, 1, 1, 0, 0, 1);
            device.SetTransform(TransformState.Projection, matProj);

            // Turn off D3D lighting, since we are providing our own vertex colors
            device.SetRenderState(RenderState.Lighting, false);

            //Load kernels
            CUmodule module = ctx.LoadModulePTX("kernel.ptx");

            // One CudaKernel per entry point; the string is the kernel's name inside the module.
            addForces_k = new CudaKernel("addForces_k", module, ctx);
            advectVelocity_k = new CudaKernel("advectVelocity_k", module, ctx);
            diffuseProject_k = new CudaKernel("diffuseProject_k", module, ctx);
            updateVelocity_k = new CudaKernel("updateVelocity_k", module, ctx);
            advectParticles_k = new CudaKernel("advectParticles_k", module, ctx);
Ejemplo n.º 18
        /// <summary>
        /// Sets up everything the simulation needs before it can run: the OpenGL render
        /// control, the CUDA context created for OpenGL, the kernels from "kernel.ptx",
        /// the velocity-field and particle buffers, the two FFT plans, the OpenGL vertex
        /// buffer holding the particles and its CUDA interop registration, the "texref"
        /// texture and the stopwatch. Sets <c>isInit</c> to true at the end.
        /// </summary>
        /// <remarks>
        /// NOTE(review): braces are absent from this listing; the statements below are
        /// assumed to form the whole method body.
        /// </remarks>
        private void initGLAndCuda()
            //Create render target control
            m_renderControl              = new OpenTK.GLControl(GraphicsMode.Default, 1, 0, GraphicsContextFlags.Default);
            m_renderControl.Dock         = DockStyle.Fill;
            m_renderControl.BackColor    = Color.White;
            m_renderControl.BorderStyle  = BorderStyle.FixedSingle;
            // Route keyboard, mouse and resize events of the control to this class's handlers.
            m_renderControl.KeyDown     += new KeyEventHandler(m_renderControl_KeyDown);
            m_renderControl.MouseMove   += new MouseEventHandler(m_renderControl_MouseMove);
            m_renderControl.MouseDown   += new MouseEventHandler(m_renderControl_MouseDown);
            m_renderControl.SizeChanged += new EventHandler(m_renderControl_SizeChanged);

            Console.WriteLine("   OpenGL device is Available");

            int deviceID = CudaContext.GetMaxGflopsDeviceId();

            // Create the CUDA context through the OpenGL-specific factory on that device.
            ctx = CudaContext.CreateOpenGLContext(deviceID, CUCtxFlags.BlockingSync);
            // NOTE(review): this message is built but never used in the visible lines; a line
            // printing it may have been lost. Confirm.
            string console = string.Format("CUDA device [{0}] has {1} Multi-Processors", ctx.GetDeviceName(), ctx.GetDeviceInfo().MultiProcessorCount);


            CUmodule module = ctx.LoadModulePTX("kernel.ptx");

            // One CudaKernel per entry point; the string is the kernel's name inside the module.
            // Note that advectParticles_k is created from the entry named "advectParticles_OGL".
            addForces_k       = new CudaKernel("addForces_k", module, ctx);
            advectVelocity_k  = new CudaKernel("advectVelocity_k", module, ctx);
            diffuseProject_k  = new CudaKernel("diffuseProject_k", module, ctx);
            updateVelocity_k  = new CudaKernel("updateVelocity_k", module, ctx);
            advectParticles_k = new CudaKernel("advectParticles_OGL", module, ctx);

            // hvfield is a managed array of DS elements; dvfield is a DIM x DIM pitched
            // device variable whose pitch is remembered in tPitch.
            hvfield = new cData[DS];
            dvfield = new CudaPitchedDeviceVariable <cData>(DIM, DIM);
            tPitch  = dvfield.Pitch;


            // Two further device variables of DS elements each.
            vxfield = new CudaDeviceVariable <cData>(DS);
            vyfield = new CudaDeviceVariable <cData>(DS);

            // Create particle array
            particles = new cData[DS];
            initParticles(particles, DIM, DIM);

            // TODO: update kernels to use the new unpadded memory layout for perf
            // rather than the old FFTW-compatible layout
            // DIM x DIM plans for the real-to-complex and complex-to-real transforms.
            planr2c = new CudaFFTPlan2D(DIM, DIM, cufftType.R2C, Compatibility.FFTWPadding);
            planc2r = new CudaFFTPlan2D(DIM, DIM, cufftType.C2R, Compatibility.FFTWPadding);

            // Create the vertex buffer and upload the initial particle array into it.
            GL.GenBuffers(1, out vbo);
            GL.BindBuffer(BufferTarget.ArrayBuffer, vbo);
            GL.BufferData <cData>(BufferTarget.ArrayBuffer, new IntPtr(cData.SizeOf * DS), particles, BufferUsageHint.DynamicDraw);
            int bsize;

            // Read the buffer size back and fail if it differs from the size requested above.
            GL.GetBufferParameter(BufferTarget.ArrayBuffer, BufferParameterName.BufferSize, out bsize);

            if (bsize != DS * cData.SizeOf)
                throw new Exception("Sizes don't match.");

            // Unbind the array buffer.
            GL.BindBuffer(BufferTarget.ArrayBuffer, 0);

            // Wrap the vertex buffer in an OpenGL interop resource and keep it in a collection.
            cuda_vbo_resource = new CudaGraphicsInteropResourceCollection();
            cuda_vbo_resource.Add(new CudaOpenGLBufferInteropResource(vbo, CUGraphicsRegisterFlags.None));

            // DIM x DIM two-channel float texture array created for advectVelocity_k under the
            // name "texref", with wrap addressing and linear filtering.
            texref = new CudaTextureArray2D(advectVelocity_k, "texref", CUAddressMode.Wrap, CUFilterMode.Linear, 0, CUArrayFormat.Float, DIM, DIM, CudaArray2DNumChannels.Two);

            stopwatch = new CudaStopWatch(CUEventFlags.Default);

            // Record that initialisation has completed.
            isInit = true;
Ejemplo n.º 19
        /// <summary>
        /// Compiles <paramref name="method"/> and the given roots to LLVM IR, optionally
        /// wraps the kernel in a thunk that supplies a thread ID argument, compiles the
        /// module to PTX and loads that PTX into <paramref name="context"/>.
        /// </summary>
        /// <param name="method">The method whose mangled name identifies the kernel function.</param>
        /// <param name="memberRoots">Members passed to <c>CreateContentDescriptionAsync</c>.</param>
        /// <param name="typeRoots">Types passed to <c>CreateContentDescriptionAsync</c>; metadata is also requested for each of them.</param>
        /// <param name="threadIdParamIndex">
        /// When non-negative, a thunk named "kernel" is generated and the computed thread ID
        /// is inserted into the call's argument list at this index.
        /// </param>
        /// <param name="assembly">Supplies the type environment and is passed on to the resulting <c>CudaModule</c>.</param>
        /// <param name="context">CUDA context providing the compute capability and loading the PTX.</param>
        /// <returns>A <c>CudaModule</c> built from the loaded PTX module and the final kernel function name.</returns>
        /// <remarks>
        /// NOTE(review): this listing has no braces and several statements are truncated
        /// (the <c>LLVM.AddFunction(</c> call, the <c>LLVM.MDNode</c> fragment, and the
        /// bodies of two <c>if</c> statements). Confirm details against the original source.
        /// </remarks>
        private static async Task <CudaModule> CompileAsync(
            IMethod method,
            IEnumerable <ITypeMember> memberRoots,
            IEnumerable <IType> typeRoots,
            int threadIdParamIndex,
            ClrAssembly assembly,
            CudaContext context)
            // Figure out which members we need to compile.
            var desc = await CreateContentDescriptionAsync(method, memberRoots, typeRoots, assembly);

            // Compile those members to LLVM IR. Use an Itanium name mangling scheme.
            var mangler       = new ItaniumMangler(assembly.Resolver.TypeEnvironment);
            var moduleBuilder = LlvmBackend.Compile(desc, assembly.Resolver.TypeEnvironment);
            var module        = moduleBuilder.Module;

            // Generate type metadata for all type roots.
            foreach (var type in typeRoots)
                moduleBuilder.Metadata.GetMetadata(type, moduleBuilder);

            // Get the compiled kernel function.
            var kernelFuncName = mangler.Mangle(method, true);
            var kernelFunc     = LLVM.GetNamedFunction(module, kernelFuncName);

            if (threadIdParamIndex >= 0)
                // If we have a thread ID parameter, then we need to generate a thunk
                // kernel function that calls our actual kernel function. This thunk's
                // responsibility is to determine the thread ID of the kernel.
                var thunkKernelName = "kernel";
                var thunkTargetType = kernelFunc.TypeOf().GetElementType();
                // Start from a copy of the real kernel's parameter types.
                var thunkParamTypes = new List <LLVMTypeRef>(thunkTargetType.GetParamTypes());
                if (threadIdParamIndex < thunkParamTypes.Count)
                // NOTE(review): the body of this check is not visible; presumably it drops
                // the thread ID parameter from the thunk's signature. Verify.
                var thunkKernel = LLVM.AddFunction(
                // NOTE(review): the arguments of AddFunction are missing from this listing.

                using (var builder = new IRBuilder(moduleBuilder.Context))
                    // Forward the thunk's own parameters, with the computed thread ID
                    // inserted at threadIdParamIndex, to the real kernel.
                    var args = new List <LLVMValueRef>(thunkKernel.GetParams());
                    args.Insert(threadIdParamIndex, ComputeUniqueThreadId(builder, module));
                    var call = builder.CreateCall(kernelFunc, args.ToArray(), "");
                    if (call.TypeOf().TypeKind == LLVMTypeKind.LLVMVoidTypeKind)
                    // NOTE(review): the branches that emit the thunk's return are not visible.

                // From here on, the thunk is treated as the kernel.
                kernelFuncName = thunkKernelName;
                kernelFunc     = thunkKernel;

            // Mark the compiled kernel as a kernel symbol.
                // NOTE(review): only fragments of the metadata construction remain below;
                // the enclosing call and the other node operands are missing.
                LLVM.MDNode(new LLVMValueRef[]
                LLVM.ConstInt(LLVM.Int32TypeInContext(LLVM.GetModuleContext(module)), 1, false)

            // LLVM.DumpModule(module);

            // Compile that LLVM IR down to PTX.
            // The target machine is returned through 'machine' and handed to the CudaModule.
            LLVMTargetMachineRef machine;
            var ptx = CompileToPtx(module, context.GetDeviceComputeCapability(), out machine);

            // Console.WriteLine(System.Text.Encoding.UTF8.GetString(ptx));

            // Load the PTX kernel.
            return(new CudaModule(assembly, moduleBuilder, machine, context.LoadModulePTX(ptx), kernelFuncName, context));