Example #1
0
        /// <summary>
        /// Creates a new 1D texture from array memory
        /// </summary>
        /// <param name="kernel"></param>
        /// <param name="texName"></param>
        /// <param name="addressMode"></param>
        /// <param name="filterMode"></param>
        /// <param name="flags"></param>
        /// <param name="array"></param>
        public CudaTextureArray1D(CudaKernel kernel, string texName, CUAddressMode addressMode, CUFilterMode filterMode, CUTexRefSetFlags flags, CudaArray1D array)
        {
            _texref = new CUtexref();
            res     = DriverAPINativeMethods.ModuleManagement.cuModuleGetTexRef(ref _texref, kernel.CUModule, texName);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}, Texture name: {3}", DateTime.Now, "cuModuleGetTexRef", res, texName));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetAddressMode(_texref, 0, addressMode);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetAddressMode", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetFilterMode(_texref, filterMode);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetFilterMode", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetFlags(_texref, flags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetFlags", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetFormat(_texref, array.ArrayDescriptor.Format, (int)array.ArrayDescriptor.NumChannels);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetFormat", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            _filtermode  = filterMode;
            _flags       = flags;
            _addressMode = addressMode;
            _format      = array.ArrayDescriptor.Format;
            _size        = array.Width;
            _numChannels = (int)array.ArrayDescriptor.NumChannels;
            _name        = texName;
            _module      = kernel.CUModule;
            _cufunction  = kernel.CUFunction;

            _channelSize = CudaHelperMethods.GetChannelSize(array.ArrayDescriptor.Format);
            _dataSize    = array.Width * array.ArrayDescriptor.NumChannels * _channelSize;
            _array       = array;

            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetArray(_texref, _array.CUArray, CUTexRefSetArrayFlags.OverrideFormat);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetArray", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
            //res = DriverAPINativeMethods.ParameterManagement.cuParamSetTexRef(kernel.CUFunction, CUParameterTexRef.Default, _texref);
            //Debug.WriteLine("{0:G}, {1}: {2}", DateTime.Now, "cuParamSetTexRef", res);
            //if (res != CUResult.Success) throw new CudaException(res);
        }
Example #2
0
        /// <summary> see CUDA doc; </summary>
        public static void ModuleLoad(out CUmodule module, string fname)
        {
            IntPtr   _fname = Marshal.StringToHGlobalAnsi(fname);
            CUresult res    = my.cuModuleLoad(out module, _fname);

            Marshal.FreeHGlobal(_fname);
            TestResult(res);
        }
Example #3
0
        //private double ComputeObj(float[] w, float[] alpha, Problem<SparseVec> sub_prob, float[] diag)
        //{
        //    double v = 0, v1=0;
        //    int nSV = 0;
        //    for (int i = 0; i < w.Length; i++)
        //    {
        //        v += w[i] * w[i];
        //        v1 += 0.5*w[i] * w[i];
        //    }
        //    for (int i = 0; i < alpha.Length; i++)
        //    {
        //        sbyte y_i = (sbyte)sub_prob.Y[i];

        //        //original line
        //        //v += alpha[i] * (alpha[i] * diag[GETI(y_i, i)] - 2);
        //        v += alpha[i] * (alpha[i] * diag[y_i + 1] - 2);
        //        v1 += 0.5* alpha[i] * (alpha[i] * diag[y_i + 1] - 2);
        //        if (alpha[i] > 0) ++nSV;
        //    }

        //    v = v / 2;
        //  //  Debug.WriteLine("Objective value = {0}", v);
        //  //  Debug.WriteLine("nSV = {0}", nSV);

        //    return v;
        //}



        protected void InitCudaModule()
        {
            cuda          = new CUDA(0, true);
            cuModule      = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, cudaModuleName));
            cuFuncDotProd = cuda.GetModuleFunction(cudaProductKernelName);
            cuFuncSolver  = cuda.GetModuleFunction(cudaSolveL2SVM);
            cuFuncUpdateW = cuda.GetModuleFunction(cudaUpdateW);
        }
Example #4
0
        public uint GetModuleGlobalBytes(CUmodule mod, string globalName)
        {
            CUdeviceptr dptr  = new CUdeviceptr();
            SizeT       bytes = 0;

            this.LastError = CUDADriver.cuModuleGetGlobal(ref dptr, ref bytes, mod, globalName);
            return(bytes);
        }
Example #5
0
        /// <summary> see CUDA doc; </summary>
        public static void ModuleGetFunction(out CUfunction func, CUmodule module, string name)
        {
            IntPtr   _name = Marshal.StringToHGlobalAnsi(name);
            CUresult res   = my.cuModuleGetFunction(out func, module, _name);

            Marshal.FreeHGlobal(_name);
            TestResult(res);
        }
        void InitKernels()
        {
            var path = @"..\..\..\CudaParticleSimulation\kernel.ptx";

            if (!System.IO.File.Exists(path))
            {
                Debug.Error(path + " doesnt exists");
                return;
            }

            var cntxt = new CudaContext();

            uint deviceCount = 1;
            var  devices     = new CUdevice[50];

            OpenGLNativeMethods.CUDA3.cuGLGetDevices(ref deviceCount, devices, 50, CUGLDeviceList.All);

            var context = cntxt.Context;

            OpenGLNativeMethods.CUDA3.cuGLCtxCreate(ref context, CUCtxFlags.BlockingSync, devices[0]);

            Debug.Info("Found " + deviceCount + " OpenGL devices associated with current context");


            CUmodule cumodule = cntxt.LoadModule(path);

            updateParticles = new CudaKernel("updateParticles", cumodule, cntxt);
            updateParticles.BlockDimensions = new dim3(16 * 16, 1, 1);
            updateParticles.GridDimensions  = new dim3(16 * 16, 1, 1);


            generateParticles = new CudaKernel("generateParticles", cumodule, cntxt);
            generateParticles.BlockDimensions = updateParticles.BlockDimensions;
            generateParticles.GridDimensions  = updateParticles.GridDimensions;

            var random       = new Random();
            var randomFloats = new float[1000];

            for (int i = 0; i < randomFloats.Length; i++)
            {
                randomFloats[i] = (float)random.NextDouble();
            }

            generateParticles.SetConstantVariable("randomFloats", randomFloats);

            // CudaGraphicsInteropResourceCollection

            resources.Clear();
            foreach (var h in renderer.particleMesh.allBufferHandles)
            {
                var resoure = new CudaOpenGLBufferInteropResource(h, CUGraphicsRegisterFlags.None, CUGraphicsMapResourceFlags.None);
                resources.Add(resoure);
            }


            randomIndex_D = 0;
            randomIndex_D.CopyToDevice(0);
        }
        public GrabCutGMM()
        {
            ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId(), false);


            //Load Kernel image from resources
            string resName;

            if (IntPtr.Size == 8)
            {
                resName = "GrabCutGMM_x64.ptx";
            }
            else
            {
                resName = "GrabCutGMM.ptx";
            }

            string resNamespace = "GrabCutNPP";
            string resource     = resNamespace + "." + resName;
            Stream stream       = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);

            if (stream == null)
            {
                throw new ArgumentException("Kernel not found in resources.");
            }
            byte[] kernel = new byte[stream.Length];

            int bytesToRead = (int)stream.Length;

            while (bytesToRead > 0)
            {
                bytesToRead -= stream.Read(kernel, (int)stream.Position, bytesToRead);
            }

            CUmodule module = ctx.LoadModulePTX(kernel);

            GMMReductionKernelCreateGmmFlags   = new CudaKernel("_Z18GMMReductionKernelILi4ELb1EEviPfiPK6uchar4iPhiiiPj", module, ctx);
            GMMReductionKernelNoCreateGmmFlags = new CudaKernel("_Z18GMMReductionKernelILi4ELb0EEviPfiPK6uchar4iPhiiiPj", module, ctx);
            GMMFinalizeKernelInvertSigma       = new CudaKernel("_Z17GMMFinalizeKernelILi4ELb1EEvPfS0_ii", module, ctx);
            GMMFinalizeKernelNoInvertSigma     = new CudaKernel("_Z17GMMFinalizeKernelILi4ELb0EEvPfS0_ii", module, ctx);
            GMMcommonTerm   = new CudaKernel("_Z13GMMcommonTermiPfi", module, ctx);
            DataTermKernel  = new CudaKernel("_Z14DataTermKernelPiiiPKfiPK6uchar4iPKhiii", module, ctx);
            GMMAssignKernel = new CudaKernel("_Z15GMMAssignKerneliPKfiPK6uchar4iPhiii", module, ctx);
            GMMFindSplit    = new CudaKernel("_Z12GMMFindSplitP10GMMSplit_tiPfi", module, ctx);
            GMMDoSplit      = new CudaKernel("_Z10GMMDoSplitPK10GMMSplit_tiPfiPK6uchar4iPhiii", module, ctx);
            MeanEdgeStrengthReductionKernel = new CudaKernel("_Z31MeanEdgeStrengthReductionKerneliiPf", module, ctx);
            MeanEdgeStrengthFinalKernel     = new CudaKernel("_Z27MeanEdgeStrengthFinalKernelPfi", module, ctx);
            EdgeCuesKernel            = new CudaKernel("_Z14EdgeCuesKernelfPKfPiS1_S1_S1_S1_S1_S1_S1_iiii", module, ctx);
            SegmentationChangedKernel = new CudaKernel("_Z25SegmentationChangedKernelPiPhS0_iii", module, ctx);
            downscaleKernel1          = new CudaKernel("_Z18downscaleKernelBoxI6uchar4EvPT_iiiPKS1_iii", module, ctx);
            downscaleKernel2          = new CudaKernel("_Z18downscaleKernelMaxIhEvPT_iiiPKS0_iii", module, ctx);
            upsampleAlphaKernel       = new CudaKernel("_Z19upsampleAlphaKernelPhS_iiii", module, ctx);

            GMMFinalizeKernelInvertSigma.SetConstantVariable("det_indices", det_indices);
            GMMFinalizeKernelInvertSigma.SetConstantVariable("inv_indices", inv_indices);
            GMMFinalizeKernelNoInvertSigma.SetConstantVariable("det_indices", det_indices);
            GMMFinalizeKernelNoInvertSigma.SetConstantVariable("inv_indices", inv_indices);
        }
Example #8
0
        public override void Init()
        {
            cuda = new CUDA(0, true);

            var cuCtx = cuda.CreateContext(0, CUCtxFlags.MapHost);

            cuda.SetCurrentContext(cuCtx);

            cuModule = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, cudaModuleName));
            cuFunc   = cuda.GetModuleFunction(cudaEvaluatorKernelName);

            cuFuncSign = cuda.GetModuleFunction(cudaSignKernelName);

            //reserved memory based on dimension of support vector
            //svVector = new float[TrainedModel.SupportElements[0].Count];

            stream = cuda.CreateStream();
            //memSvSize = (uint)(TrainedModel.SupportElements[0].Count * sizeof(float));
            memSvSize = (uint)(TrainedModel.SupportElements[0].Dim * sizeof(float));

            //allocates memory for buffers
            svVecIntPtrs[0] = cuda.AllocateHost(memSvSize);
            svVecIntPtrs[1] = cuda.AllocateHost(memSvSize);
            mainVecPtr      = cuda.CopyHostToDeviceAsync(svVecIntPtrs[0], memSvSize, stream);

            cuSVTexRef = cuda.GetModuleTexture(cuModule, "svTexRef");
            cuda.SetTextureFlags(cuSVTexRef, 0);
            cuda.SetTextureAddress(cuSVTexRef, mainVecPtr, memSvSize);

            //todo: copy labels and alphas

            float[] svLabels = new float[TrainedModel.SupportElements.Length];
            float[] svAlphas = new float[TrainedModel.SupportElements.Length];


            Parallel.For(0, TrainedModel.SupportElementsIndexes.Length,
                         i => {
                int idx = TrainedModel.SupportElementsIndexes[i];

                svLabels[i] = TrainedModel.Y[i];
                //svLabels[i] = TrainningProblem.Labels[idx];
                svAlphas[i] = TrainedModel.Alpha[idx];
            });

            //for (int i = 0; i < TrainedModel.SupportElementsIndexes.Length; i++)
            //{
            //    int idx = TrainedModel.SupportElementsIndexes[i];
            //    svLabels[i]= TrainningProblem.Labels[idx];
            //    svAlphas[i] = TrainedModel.Alpha[idx];

            //}

            labelsPtr = cuda.CopyHostToDevice(svLabels);
            alphasPtr = cuda.CopyHostToDevice(svAlphas);

            IsInitialized = true;
        }
Example #9
0
        public static void InitKernels()
        {
            CudaContext cntxt = new CudaContext();

            //CUmodule cumodule = cntxt.LoadModule(@"C:\Users\Michał\Documents\Visual Studio 2013\Projects\cuda\Projekt cuda\Projekt cuda\Debug\kernel.ptx");
            CUmodule cumodule = cntxt.LoadModule(@"D:\Grafika\cuda\Projekt cuda\Projekt cuda\Debug\kernel.ptx");

            addWithCuda = new CudaKernel("_Z6kerneliiPi", cumodule, cntxt);
        }
Example #10
0
        public static void Initialize()
        {
            CUmodule module = _context.LoadModulePTX("Framework/Algorithms/Kernels/FlowMapUncertain.ptx");

            //__global__ void FlowMapStep(cudaTextureObject_t mapT0, float* mapT1, const int width, const int height, const int numMembers, const float particleDensity, /*float timeScale, */ float stepSize, float minDensity)
            _advectParticlesKernel = new CudaKernel("FlowMapStep", module, _context);
            _copyMapDataKernel     = new CudaKernel("FlowMapUpdate", module, _context);
            //_advectParticlesKernel = _context.LoadKernelPTX("Framework/Algorithms/Kernels/FlowMapUncertain.ptx", "FlowMapStep", new CUJITOption[] { }, null);
        }
Example #11
0
        public PatchTracker(int aMaxWidth, int aMaxHeight, List <int> aTileSizes, List <int> aMaxShifts, List <int> aLevels, CudaContext ctx)
        {
            forward  = new CudaFFTPlanMany[aLevels.Count];
            backward = new CudaFFTPlanMany[aLevels.Count];


            //Allocate FFT plans
            SizeT oldFFTSize = 0;

            for (int i = 0; i < aTileSizes.Count; i++)
            {
                SizeT memFFT = InitFFT(i, aMaxWidth / aLevels[i], aMaxHeight / aLevels[i], aTileSizes[i], aMaxShifts[i]);
                if (memFFT > oldFFTSize)
                {
                    oldFFTSize = memFFT;
                }
            }
            FTTBufferSize = oldFFTSize;

            //find maximum for allocations:
            for (int i = 0; i < aTileSizes.Count; i++)
            {
                currentWidth    = aMaxWidth / aLevels[i];
                currentHeight   = aMaxHeight / aLevels[i];
                currentTileSize = aTileSizes[i];
                currentMaxShift = aMaxShifts[i];

                int currentMaxPixelsShiftImage = (2 * currentMaxShift + 1) * (2 * currentMaxShift + 1) * CurrentBlockCountX * CurrentBlockCountY;
                maxPixelsShiftImage = Math.Max(currentMaxPixelsShiftImage, maxPixelsShiftImage);

                int tilePixels = CurrentBlockSize * CurrentBlockSize * CurrentBlockCountX * CurrentBlockCountY;
                maxPixelsImage = Math.Max(tilePixels, maxPixelsImage);
                int fftWidth  = CurrentBlockSize / 2 + 1;
                int fftPixels = fftWidth * CurrentBlockSize * CurrentBlockCountX * CurrentBlockCountY;
                maxPixelsFFT = Math.Max(fftPixels, maxPixelsFFT);

                maxWidth  = Math.Max(aMaxWidth / aLevels[i], maxWidth);
                maxHeight = Math.Max(aMaxHeight / aLevels[i], maxHeight);

                maxBlockCountX = Math.Max(maxBlockCountX, CurrentBlockCountX);
                maxBlockCountY = Math.Max(maxBlockCountY, CurrentBlockCountY);
            }

            CUmodule mod = ctx.LoadModule("kernel.ptx");

            conjKernel           = new conjugateComplexMulKernel(ctx, mod);
            convertToTiles       = new convertToTilesOverlapKernel(ctx, mod);
            convertToTilesBorder = new convertToTilesOverlapBorderKernel(ctx, mod);
            squaredSumKernel     = new squaredSumKernel(ctx, mod);
            boxFilterXKernel     = new boxFilterWithBorderXKernel(ctx, mod);
            boxFilterYKernel     = new boxFilterWithBorderYKernel(ctx, mod);
            normalizedCCKernel   = new normalizedCCKernel(ctx, mod);
            findMinimumKernel    = new findMinimumKernel(ctx, mod);
        }
        private void InitCudaModule()
        {
            string modluePath = Path.Combine(Environment.CurrentDirectory, cudaModuleName);

            if (!File.Exists(modluePath))
            {
                throw new ArgumentException("Failed access to cuda module" + modluePath);
            }

            cuModule    = cuda.LoadModule(modluePath);
            cuFuncDense = cuda.GetModuleFunction(funcName);
        }
 public AccumulateImagesSuperResKernel(CudaContext ctx, CUmodule module)
     : base("accumulateImagesSuperRes", module, ctx, BlockSizeX, BlockSizeY)
 {
     /*
      * accumulateImages(
      * unsigned short* __restrict__ dataIn,
      * float3 * __restrict__ imgOut,
      * float3 * __restrict__ totalWeights,
      * const float3 * __restrict__ certaintyMask,
      * const float3* __restrict__ kernelParam,
      * const float2* __restrict__ shifts,
      * float maxVal, int dimX, int dimY, int strideOut)
      */
 }
Example #14
0
        private void InitializeCUDA()
        {
            context = new CudaContext(CudaContext.GetMaxGflopsDevice(), graphicsDevice.ComPointer, CUCtxFlags.SchedAuto, CudaContext.DirectXVersion.D3D11);

            module = context.LoadModulePTX(@"Kernels\kernel.ptx");

            kernelPositionWeightNoiseCube     = new CudaKernel("position_weight_noise_cube", module, context);
            kernelNormalAmbient               = new CudaKernel("normal_ambient", module, context);
            kernelMarchingCubesCases          = new CudaKernel("marching_cubes_cases", module, context);
            kernelMarchingCubesVertices       = new CudaKernel("marching_cubes_vertices", module, context);
            kernelPositionWeightNoiseCubeWarp = new CudaKernel("position_weight_noise_cube_warp", module, context);
            kernelPositionWeightFormula       = new CudaKernel("position_weight_formula", module, context);

            prefixScan = new CUDAPrefixScan(module, context);
        }
Example #15
0
        public JittedModule(String ptx, CUmodule handle)
        {
            CudaDriver.Ensure();
            Ptx    = ptx.AssertNotNull();
            Handle = handle.AssertThat(h => h.IsNotNull);

            var match = Regex.Match(Ptx, @"\.entry\s*(?<entrypoint>\w*?)\s*\(");

            Functions = match.Unfoldi(m => m.NextMatch(), m => m.Success).Select(m =>
            {
                var name  = match.Result("${entrypoint}");
                var hfunc = nvcuda.cuModuleGetFunction(this, name);
                return(new JittedFunction(hfunc, name));
            }).ToReadOnly();
        }
Example #16
0
 private CudaModule(
     ClrAssembly sourceAssembly,
     ModuleBuilder intermediateModule,
     LLVMTargetMachineRef targetMachine,
     CUmodule compiledModule,
     string entryPointName,
     CudaContext context)
 {
     this.SourceAssembly     = sourceAssembly;
     this.IntermediateModule = intermediateModule;
     this.TargetMachine      = targetMachine;
     this.TargetData         = LLVM.CreateTargetDataLayout(TargetMachine);
     this.CompiledModule     = compiledModule;
     this.EntryPointName     = entryPointName;
     this.Context            = context;
 }
Example #17
0
        private void InitCuda()
        {
            cuda = new CUDA(0, true);

            var cuCtx = cuda.CreateContext(0, CUCtxFlags.MapHost);

            cuda.SetCurrentContext(cuCtx);



            cuModule = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, cudaModuleName));

            cuFuncEval = cuda.GetModuleFunction(cudaEvaluatorKernelName);


            cuFuncReduce = cuda.GetModuleFunction(cudaReduceKernelName);
        }
Example #18
0
        public PreAlignment(NPPImage_32fC1 img, CudaContext ctx)
        {
            width             = img.WidthRoi;
            height            = img.HeightRoi;
            imgToTrackRotated = new NPPImage_32fC1(width, height);

            CUmodule mod = ctx.LoadModule("kernel.ptx");

            int fftWidth = width / 2 + 1;

            conjKernel          = new conjugateComplexMulKernel(ctx, mod);
            fourierFilterKernel = new fourierFilterKernel(ctx, mod);
            fftshiftKernel      = new fftshiftKernel(ctx, mod);

            squaredSumKernel   = new squaredSumKernel(ctx, mod);
            boxFilterXKernel   = new boxFilterWithBorderXKernel(ctx, mod);
            boxFilterYKernel   = new boxFilterWithBorderYKernel(ctx, mod);
            normalizedCCKernel = new normalizedCCKernel(ctx, mod);
            findMinimumKernel  = new findMinimumKernel(ctx, mod);



            int n = 2;

            int[] dims    = new int[] { height, width };
            int   batches = 1;

            int[] inembed = new int[] { 1, imgToTrackRotated.Pitch / 4 };
            int[] onembed = new int[] { 1, fftWidth };
            int   idist   = height * imgToTrackRotated.Pitch / 4;
            int   odist   = height * fftWidth;
            int   istride = 1;
            int   ostride = 1;

            cufftHandle handleForward  = cufftHandle.Create();
            cufftHandle handleBackward = cufftHandle.Create();

            SizeT sizeForward  = new SizeT();
            SizeT sizeBackward = new SizeT();

            forward  = new CudaFFTPlanMany(handleForward, n, dims, batches, cufftType.R2C, inembed, istride, idist, onembed, ostride, odist, ref sizeForward, false);
            backward = new CudaFFTPlanMany(handleBackward, n, dims, batches, cufftType.C2R, onembed, ostride, odist, inembed, istride, idist, ref sizeBackward, false);

            FFTBufferSize = sizeForward > sizeBackward ? sizeForward : sizeBackward;
        }
Example #19
0
        protected void InitCudaModule()
        {
            int deviceNr = 0;

            cuda  = new CUDA(deviceNr, true);
            cuCtx = cuda.CreateContext(deviceNr, CUCtxFlags.MapHost);


            string modluePath = Path.Combine(Environment.CurrentDirectory, cudaModuleName);

            if (!File.Exists(modluePath))
            {
                throw new ArgumentException("Failed access to cuda module" + modluePath);
            }

            cuModule = cuda.LoadModule(modluePath);
            cuFunc   = cuda.GetModuleFunction(cudaProductKernelName);
        }
Example #20
0
        public HuForceDirectedLayout(int steps)
        {
#if !DEBUG
            try
            {
#endif
            CUDADriver.cuInit(0);

            dev = new CUdevice();
            CUDADriver.cuDeviceGet(ref dev, 0);

            ctx = new CUcontext();
            CUDADriver.cuCtxCreate(ref ctx, 0, dev);

            mod = new CUmodule();
            CUDADriver.cuModuleLoad(ref mod, "BarnesHut.cubin");

            prop = new CUDeviceProperties();
            CUDADriver.cuDeviceGetProperties(ref prop, dev);
            int version = 0;
            CUDADriver.cuDriverGetVersion(ref version);

            string caps = "";

            GASS.CUDA.CUDARuntime.cudaRuntimeGetVersion(ref version);


            caps += "\tClock rate        = " + prop.clockRate / 1000000 + " MHz\n";
            caps += "\tMemory size       = " + prop.totalConstantMemory / 1024 + " KB\n";
            caps += "\tThreads per block = " + prop.maxThreadsPerBlock + "\n";
            caps += "\tWarp size         = " + prop.SIMDWidth + "\n";
            caps += "\tCUDA version      = " + version + "\n";

            Logger.AddMessage(LogEntryType.Info, "Successfully initialized CUDA GPU computation\n" + caps);
#if !DEBUG
        }
        catch (Exception ex)
        {
            Logger.AddMessage(LogEntryType.Warning, "CUDA not available, falling back to CPU. Exception was: " + ex.Message);
            CUDAEnabled = false;
        }
#endif
        }
Example #21
0
        private void InitCudaModule()
        {
            cuda     = new CUDA(0, true);
            cuModule = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, cudaModuleName));

            cuFuncDotProd = cuda.GetModuleFunction(cudaProductKernelName);

            cuFuncGradFinalize = cuda.GetModuleFunction(cudaGradFinalizeName);

            cuFuncComputeBBstep = cuda.GetModuleFunction(cudaComputeBBStepName);

            cuFuncObjSquareW     = cuda.GetModuleFunction(cudaObjWName);
            cuFuncObjSquareAlpha = cuda.GetModuleFunction(cudaObjAlphaName);

            cuFuncUpdateW = cuda.GetModuleFunction(cudaUpdateW);

            cuFuncUpdateAlpha = cuda.GetModuleFunction(cudaUpdateAlphaName);

            cuFuncLinPart = cuda.GetModuleFunction(cudaLinPartName);
        }
Example #22
0
        public override void Init()
        {
            int         N        = DataGenerator.InputCount;
            CudaContext cntxt    = new CudaContext();
            CUmodule    cumodule = cntxt.LoadModule(@"kernel.cubin");

            myKernel = new CudaKernel("proccess", cumodule, cntxt);
            //myKernel.GridDimensions = (N + 255) / 256;
            //myKernel.BlockDimensions = Math.Min(N, 256);
            myKernel.GridDimensions  = (N + 255) / 256;
            myKernel.BlockDimensions = 256;

            // https://softwarehut.com/blog/general-purpose-computing-gpu-net-world-part-1/
            //https://stackoverflow.com/questions/2392250/understanding-cuda-grid-dimensions-block-dimensions-and-threads-organization-s
            //myKernel.GridDimensions = new dim3(1, 1, 1);
            //myKernel.BlockDimensions = new dim3(16, 16);

            // init input parameters
            input1_dev = new CudaDeviceVariable <int>(DataGenerator.In1.Length);
            input2_dev = new CudaDeviceVariable <int>(DataGenerator.In2.Length);
            input3_dev = new CudaDeviceVariable <double>(DataGenerator.In3.Length);
            input4_dev = new CudaDeviceVariable <byte>(DataGenerator.In4_3_bytes.Length);

            result_dev     = new CudaDeviceVariable <byte>(resultsBytes.Length);
            resultCalc_dev = new CudaDeviceVariable <double>(calculatables.Length);

            // copy input parameters
            input1_dev.CopyToDevice(DataGenerator.In1);
            input2_dev.CopyToDevice(DataGenerator.In2);
            input3_dev.CopyToDevice(DataGenerator.In3);
            input4_dev.CopyToDevice(DataGenerator.In4_3_bytes);

            // init output parameters
            //result_dev = new CudaDeviceVariable<bool>(results.Length);

            //myKernel.SetConstantVariable("width", DataGenerator.Width);
            //myKernel.SetConstantVariable("inputCount", N);
            //myKernel.SetConstantVariable("height", DataGenerator.Height);
        }
Example #23
0
        public static CUdeviceptr GetGlobalAddress(LLVMValueRef value, CUmodule module, out SizeT size)
        {
            if (value.IsAConstantExpr().Pointer != IntPtr.Zero && value.GetConstOpcode() == LLVMOpcode.LLVMBitCast)
            {
                return(GetGlobalAddress(value.GetOperand(0), module, out size));
            }

            var ptr = new CUdeviceptr();

            size = new SizeT();
            var result = ManagedCuda.DriverAPINativeMethods.ModuleManagement.cuModuleGetGlobal_v2(
                ref ptr, ref size, module, GetGlobalName(value));

            if (result == ManagedCuda.BasicTypes.CUResult.Success)
            {
                return(ptr);
            }
            else
            {
                throw new CudaException(result);
            }
        }
Example #24
0
        public OpticalFlow(int width, int height, CudaContext ctx)
        {
            CUmodule mod = ctx.LoadModulePTX("opticalFlow.ptx");

            warpingKernel            = new WarpingKernel(ctx, mod);
            createFlowFieldFromTiles = new CreateFlowFieldFromTiles(ctx, mod);
            computeDerivativesKernel = new ComputeDerivativesKernel(ctx, mod);
            lukasKanade = new LukasKanadeKernel(ctx, mod);

            d_tmp  = new NPPImage_32fC1(width, height);
            d_Ix   = new NPPImage_32fC1(width, height);
            d_Iy   = new NPPImage_32fC1(width, height);
            d_Iz   = new NPPImage_32fC1(width, height);
            d_flow = new NPPImage_32fC2(width, height);

            buffer = new CudaDeviceVariable <byte>(d_tmp.MeanStdDevGetBufferHostSize() * 3);
            mean   = new CudaDeviceVariable <double>(1);
            std    = new CudaDeviceVariable <double>(1);


            d_filterX = new float[] { -0.25f, 0.25f, -0.25f, 0.25f };
            d_filterY = new float[] { -0.25f, -0.25f, 0.25f, 0.25f };
            d_filterT = new float[] { 0.25f, 0.25f, 0.25f, 0.25f };
        }
Example #25
0
 public KernelModule(CudaContext context, string path)
 {
     _context = context;
     _module  = _context.LoadModule(path);
 }
Example #26
0
        public static void For(int number_of_threads, SimpleKernel simpleKernel)
        {
            if (Campy.Utils.Options.IsOn("import-only"))
            {
                JustImport(simpleKernel);
                return;
            }

            GCHandle handle1 = default(GCHandle);
            GCHandle handle2 = default(GCHandle);

            try
            {
                unsafe
                {
                    System.Reflection.MethodInfo method_info = simpleKernel.Method;
                    String kernel_assembly_file_name         = method_info.DeclaringType.Assembly.Location;
                    Mono.Cecil.ModuleDefinition md           = Campy.Meta.StickyReadMod.StickyReadModule(
                        kernel_assembly_file_name, new ReaderParameters {
                        ReadSymbols = true
                    });
                    MethodReference method_reference = md.ImportReference(method_info);

                    CUfunction ptr_to_kernel = default(CUfunction);
                    CUmodule   module        = default(CUmodule);

                    Campy.Utils.TimePhase.Time("compile     ", () =>
                    {
                        IntPtr image = Singleton._compiler.Compile(method_reference, simpleKernel.Target);
                        module       = Singleton._compiler.SetModule(method_reference, image);
                        Singleton._compiler.StoreJits(module);
                        ptr_to_kernel = Singleton._compiler.GetCudaFunction(method_reference, module);
                    });

                    RUNTIME.BclCheckHeap();

                    BUFFERS buffer = Singleton.Buffer;
                    IntPtr  kernel_target_object = IntPtr.Zero;

                    Campy.Utils.TimePhase.Time("deep copy ", () =>
                    {
                        int count = simpleKernel.Method.GetParameters().Length;
                        var bb    = Singleton._compiler.GetBasicBlock(method_reference);
                        if (bb.HasThis)
                        {
                            count++;
                        }
                        if (!(count == 1 || count == 2))
                        {
                            throw new Exception("Expecting at least one parameter for kernel.");
                        }

                        if (bb.HasThis)
                        {
                            kernel_target_object = buffer.AddDataStructure(simpleKernel.Target);
                        }
                    });

                    Campy.Utils.TimePhase.Time("kernel cctor set up", () =>
                    {
                        // For each cctor, run on GPU.
                        // Construct dependency graph of methods.
                        List <MethodReference> order_list = COMPILER.Singleton.ConstructCctorOrder();

                        // Finally, call cctors.
                        foreach (var bb in order_list)
                        {
                            if (Campy.Utils.Options.IsOn("trace-cctors"))
                            {
                                System.Console.WriteLine("Executing cctor "
                                                         + bb.FullName);
                            }
                            var cctor = Singleton._compiler.GetCudaFunction(bb, module);

                            var res = CUresult.CUDA_SUCCESS;
                            Campy.Utils.CudaHelpers.MakeLinearTiling(1,
                                                                     out Campy.Utils.CudaHelpers.dim3 tile_size, out Campy.Utils.CudaHelpers.dim3 tiles);

                            res = Cuda.cuLaunchKernel(
                                cctor,
                                tiles.x, tiles.y, tiles.z,             // grid has one block.
                                tile_size.x, tile_size.y, tile_size.z, // n threads.
                                0,                                     // no shared memory
                                default(CUstream),
                                (IntPtr)IntPtr.Zero,
                                (IntPtr)IntPtr.Zero
                                );

                            CudaHelpers.CheckCudaError(res);
                            res = Cuda.cuCtxSynchronize(); // Make sure it's copied back to host.
                            CudaHelpers.CheckCudaError(res);
                        }
                    });

                    if (Campy.Utils.Options.IsOn("trace-cctors"))
                    {
                        System.Console.WriteLine("Done with cctors");
                    }

                    Campy.Utils.TimePhase.Time("kernel call ", () =>
                    {
                        IntPtr[] parm1 = new IntPtr[1];
                        IntPtr[] parm2 = new IntPtr[1];

                        parm1[0] = kernel_target_object;
                        parm2[0] = buffer.New(BUFFERS.SizeOf(typeof(int)));

                        IntPtr[] x1     = parm1;
                        handle1         = GCHandle.Alloc(x1, GCHandleType.Pinned);
                        IntPtr pointer1 = handle1.AddrOfPinnedObject();

                        IntPtr[] x2     = parm2;
                        handle2         = GCHandle.Alloc(x2, GCHandleType.Pinned);
                        IntPtr pointer2 = handle2.AddrOfPinnedObject();

                        IntPtr[] kp = new IntPtr[] { pointer1, pointer2 };
                        var res     = CUresult.CUDA_SUCCESS;
                        fixed(IntPtr * kernelParams = kp)
                        {
                            Campy.Utils.CudaHelpers.MakeLinearTiling(number_of_threads,
                                                                     out Campy.Utils.CudaHelpers.dim3 tile_size, out Campy.Utils.CudaHelpers.dim3 tiles);

                            //MakeLinearTiling(1, out dim3 tile_size, out dim3 tiles);

                            res = Cuda.cuLaunchKernel(
                                ptr_to_kernel,
                                tiles.x, tiles.y, tiles.z,             // grid has one block.
                                tile_size.x, tile_size.y, tile_size.z, // n threads.
                                0,                                     // no shared memory
                                default(CUstream),
                                (IntPtr)kernelParams,
                                (IntPtr)IntPtr.Zero
                                );
                        }
        //public void SetConstantVariable(string name, CUdeviceptr value) { m_kernel.SetConstantVariable(name, value); }

        public MyCudaKernel(string kernelName, CUmodule module, CudaContext cuda, int GPU)
        {
            m_GPU       = GPU;
            m_kernel    = new CudaKernel(kernelName, module, cuda);
            MAX_THREADS = m_kernel.MaxThreadsPerBlock;
        }
Example #28
0
        /// <summary> see CUDA doc; </summary>
        public static void ModuleLoadData(out CUmodule module, IntPtr img)
        {
            CUresult res = my.cuModuleLoadData(out module, img);

            TestResult(res);
        }
Example #29
0
        /// <summary>
        /// Creates a new mipmapped texture from array memory. Allocates a new mipmapped array.
        /// </summary>
        /// <param name="kernel"></param>
        /// <param name="texName"></param>
        /// <param name="addressMode0"></param>
        /// <param name="addressMode1"></param>
        /// <param name="addressMode2"></param>
        /// <param name="filterMode"></param>
        /// <param name="flags"></param>
        /// <param name="descriptor"></param>
        /// <param name="numMipmapLevels"></param>
        /// <param name="maxAniso"></param>
        /// <param name="mipmapFilterMode"></param>
        /// <param name="mipmapLevelBias"></param>
        /// <param name="minMipmapLevelClamp"></param>
        /// <param name="maxMipmapLevelClamp"></param>
        public CudaTextureMipmappedArray(CudaKernel kernel, string texName, CUAddressMode addressMode0, CUAddressMode addressMode1, CUAddressMode addressMode2,
                                         CUFilterMode filterMode, CUTexRefSetFlags flags, CUDAArray3DDescriptor descriptor, uint numMipmapLevels,
                                         uint maxAniso, CUFilterMode mipmapFilterMode, float mipmapLevelBias, float minMipmapLevelClamp, float maxMipmapLevelClamp)
        {
            _maxAniso            = maxAniso;
            _mipmapFilterMode    = mipmapFilterMode;
            _mipmapLevelBias     = mipmapLevelBias;
            _minMipmapLevelClamp = minMipmapLevelClamp;
            _maxMipmapLevelClamp = maxMipmapLevelClamp;

            _texref = new CUtexref();
            res     = DriverAPINativeMethods.ModuleManagement.cuModuleGetTexRef(ref _texref, kernel.CUModule, texName);
            Debug.Write("");//Line(String.Format("{0:G}, {1}: {2}, Texture name: {3}", DateTime.Now, "cuModuleGetTexRef", res, texName));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetAddressMode(_texref, 0, addressMode0);
            Debug.Write("");//Line(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetAddressMode", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetAddressMode(_texref, 1, addressMode1);
            Debug.Write("");//Line(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetAddressMode", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetAddressMode(_texref, 2, addressMode2);
            Debug.Write("");//Line(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetAddressMode", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetFilterMode(_texref, filterMode);
            Debug.Write("");//Line(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetFilterMode", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetFlags(_texref, flags);
            Debug.Write("");//Line(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetFlags", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetFormat(_texref, descriptor.Format, (int)descriptor.NumChannels);
            Debug.Write("");//Line(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetFormat", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            _filtermode      = filterMode;
            _flags           = flags;
            _addressMode0    = addressMode0;
            _addressMode1    = addressMode1;
            _addressMode2    = addressMode2;
            _arrayDescriptor = descriptor;
            _name            = texName;
            _module          = kernel.CUModule;
            _cufunction      = kernel.CUFunction;

            _array = new CudaMipmappedArray(descriptor, numMipmapLevels);

            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetMipmappedArray(_texref, _array.CUMipmappedArray, CUTexRefSetArrayFlags.OverrideFormat);
            Debug.Write("");            //Line(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetMipmappedArray", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetMaxAnisotropy(_texref, maxAniso);
            Debug.Write("");            //Line(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetMaxAnisotropy", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetMipmapFilterMode(_texref, mipmapFilterMode);
            Debug.Write("");            //Line(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetMipmapFilterMode", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetMipmapLevelBias(_texref, mipmapLevelBias);
            Debug.Write("");            //Line(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetMipmapLevelBias", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetMipmapLevelClamp(_texref, minMipmapLevelClamp, maxMipmapLevelClamp);
            Debug.Write("");            //Line(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetMipmapLevelClamp", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        static void Main(string[] args)
        {
            //Read CL arguments
            for (int i = 0; i < args.Length; i++)
            {
                if (args[i] == "-d")
                {
                    deviceID = int.Parse(args[++i]);
                }
                if (args[i] == "-lr")
                {
                    learning_rate = double.Parse(args[++i], System.Globalization.NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture);
                }
                if (args[i] == "-iso")
                {
                    ISO = args[++i];
                }
                if (args[i] == "-t")
                {
                    crosscheck = true;
                }
                if (args[i] == "-w")
                {
                    warmStart = int.Parse(args[++i]);
                    Console.WriteLine("Start with epoch " + warmStart);
                }
                if (args[i] == "-s")
                {
                    saveImages = true;
                }
            }

            Console.WriteLine("Using device ID: " + deviceID);
            Console.WriteLine("Learning rate: " + learning_rate);

            //Init Cuda stuff
            ctx = new PrimaryContext(deviceID);
            ctx.SetCurrent();
            Console.WriteLine("Context created");
            CUmodule modPatch = ctx.LoadModulePTX("PatchProcessing.ptx");

            Console.WriteLine("modPatch loaded");
            CUmodule modBorder = ctx.LoadModulePTX("BorderTreatment.ptx");

            Console.WriteLine("modBorder loaded");
            CUmodule modError = ctx.LoadModulePTX("ErrorComputation.ptx");

            Console.WriteLine("modError loaded");
            CUmodule modPRelu = ctx.LoadModulePTX("PRelu.ptx");

            Console.WriteLine("modPRelu loaded");
            CUmodule modDeBayer = ctx.LoadModulePTX("DeBayer.ptx");

            Console.WriteLine("all modules loaded");
            deBayerGreenKernel   = new DeBayerGreenKernel(modDeBayer, ctx);
            deBayerRedBlueKernel = new DeBayerRedBlueKernel(modDeBayer, ctx);
            //Both deBayer kernels are load from the same module: setting the constant variable for bayer pattern one is enough...
            deBayerGreenKernel.BayerPattern = new BayerColor[] { BayerColor.Red, BayerColor.Green, BayerColor.Green, BayerColor.Blue };

            prepareDataKernel  = new PrepareDataKernel(modPatch, ctx);
            restoreImageKernel = new RestoreImageKernel(modPatch, ctx);
            Console.WriteLine("kernels loaded");


            int countOwn = 468083;
            int count5k  = 33408;


            string fileBase = @"/ssd/data/TrainingsDataNN/";



            List <float3> WhiteBalanceFactors = new List <float3>();
            FileStream    fs1 = new FileStream(fileBase + "FromOwnDataset/WhiteBalancesOwn.txt", FileMode.Open, FileAccess.Read);
            FileStream    fs2 = new FileStream(fileBase + "From5kDataset/WhiteBalances5k.txt", FileMode.Open, FileAccess.Read);
            StreamReader  sr1 = new StreamReader(fs1);
            StreamReader  sr2 = new StreamReader(fs2);

            for (int i = 0; i < countOwn; i++)
            {
                fileRawList.Add(fileBase + "FromOwnDataset/ISO" + ISO + "/img_" + i.ToString("0000000") + ".bin");
                fileTrouthList.Add(fileBase + "FromOwnDataset/GroundTruth/img_" + i.ToString("0000000") + ".bin");

                string   line   = sr1.ReadLine();
                string[] values = line.Split('\t');
                float3   wb     = new float3(float.Parse(values[1], System.Globalization.NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture),
                                             float.Parse(values[2], System.Globalization.NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture),
                                             float.Parse(values[3], System.Globalization.NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture));

                WhiteBalanceFactors.Add(wb);
            }
            for (int i = 0; i < count5k; i++)
            {
                fileRawList.Add(fileBase + "From5kDataset/ISO" + ISO + "/img_" + i.ToString("0000000") + ".bin");
                fileTrouthList.Add(fileBase + "From5kDataset/GroundTruth/img_" + i.ToString("0000000") + ".bin");

                string   line   = sr2.ReadLine();
                string[] values = line.Split('\t');
                float3   wb     = new float3(float.Parse(values[1], System.Globalization.NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture),
                                             float.Parse(values[2], System.Globalization.NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture),
                                             float.Parse(values[3], System.Globalization.NumberStyles.AllowDecimalPoint, CultureInfo.InvariantCulture));

                WhiteBalanceFactors.Add(wb);
            }
            sr2.Close();
            sr1.Close();

            baOriginal = new float3[countOwn + count5k][];
            baRAW      = new float[countOwn + count5k][];

            Random rand = new Random(0);

            //random order for the image patches
            for (int i = 0; i < countOwn + count5k - 1; i++)
            {
                int    r    = i + (rand.Next() % (countOwn + count5k - i));
                string temp = fileRawList[i];
                fileRawList[i] = fileRawList[r];
                fileRawList[r] = temp;

                temp = fileTrouthList[i];
                fileTrouthList[i] = fileTrouthList[r];
                fileTrouthList[r] = temp;

                float3 tempf = WhiteBalanceFactors[i];
                WhiteBalanceFactors[i] = WhiteBalanceFactors[r];
                WhiteBalanceFactors[r] = tempf;
            }


            Console.WriteLine("Initialization done!");

            int trainingSize = (int)((countOwn + count5k) * 0.9f); //4 patches per file
            int testSize     = fileRawList.Count - trainingSize;

            CudaBlas       blas  = new CudaBlas(PointerMode.Host);
            CudaDNNContext cudnn = new CudaDNNContext();

            int   patchSize     = 31;
            int   patchSize4    = 66; //Size of an 2x2 patch read from file
            int   batch         = 64;
            float normalization = 0.5f;

            //define neural network:
            StartLayer         start = new StartLayer(patchSize, patchSize, 3, batch);
            FinalLayer         final = new FinalLayer(patchSize, patchSize, 3, batch, FinalLayer.Norm.Mix, ctx, modError);
            ConvolutionalLayer conv1 = new ConvolutionalLayer(patchSize, patchSize, 3, patchSize, patchSize, 64, batch, 9, 9, ConvolutionalLayer.Activation.PRelu, blas, cudnn, ctx, modBorder, modPRelu);
            ConvolutionalLayer conv2 = new ConvolutionalLayer(patchSize, patchSize, 64, patchSize, patchSize, 64, batch, 5, 5, ConvolutionalLayer.Activation.PRelu, blas, cudnn, ctx, modBorder, modPRelu);
            ConvolutionalLayer conv3 = new ConvolutionalLayer(patchSize, patchSize, 64, patchSize, patchSize, 3, batch, 5, 5, ConvolutionalLayer.Activation.None, blas, cudnn, ctx, modBorder, modPRelu);

            start.ConnectFollowingLayer(conv1);
            conv1.ConnectFollowingLayer(conv2);
            conv2.ConnectFollowingLayer(conv3);
            conv3.ConnectFollowingLayer(final);

            CudaDeviceVariable <float3> imgA = new CudaDeviceVariable <float3>(patchSize4 * patchSize4);
            CudaDeviceVariable <float3> imgB = new CudaDeviceVariable <float3>(patchSize4 * patchSize4);
            CudaDeviceVariable <float>  rawd = new CudaDeviceVariable <float>(patchSize4 * patchSize4);

            CudaDeviceVariable <float> inputImgs    = new CudaDeviceVariable <float>(patchSize * patchSize * 3 * batch);
            CudaDeviceVariable <float> groundTrouth = new CudaDeviceVariable <float>(patchSize * patchSize * 3 * batch);
            NPPImage_8uC3 imgU3a = new NPPImage_8uC3(patchSize, patchSize);
            NPPImage_8uC3 imgU3b = new NPPImage_8uC3(patchSize, patchSize);
            NPPImage_8uC3 imgU3c = new NPPImage_8uC3(patchSize, patchSize);

            Bitmap a = new Bitmap(patchSize, patchSize, PixelFormat.Format24bppRgb);
            Bitmap b = new Bitmap(patchSize, patchSize, PixelFormat.Format24bppRgb);
            Bitmap c = new Bitmap(patchSize, patchSize, PixelFormat.Format24bppRgb);

            Random randImageOutput = new Random(0);
            Random randForInit     = new Random(0);

            start.InitRandomWeight(randForInit);
            conv1.SetActivation(0.1f);
            conv2.SetActivation(0.1f);

            int startEpoch = warmStart;

            FileStream fs;

            //restore network in case of warm start:
            if (warmStart > 0)
            {
                fs = new FileStream("epoch_" + learning_rate.ToString(CultureInfo.InvariantCulture) + "_" + ISO + "_" + (warmStart - 1) + ".cnn", FileMode.Open, FileAccess.Read);
                start.RestoreValues(fs);
                fs.Close();
                fs.Dispose();
            }

            //validate results on validation data set
            if (crosscheck)
            {
                FileStream   csvResult = new FileStream("results_" + learning_rate.ToString(CultureInfo.InvariantCulture) + "_" + ISO + ".csv", FileMode.Append, FileAccess.Write);
                StreamWriter sw        = new StreamWriter(csvResult);

                sw.WriteLine("L1;L2;Mix;Filename");
                for (int i = 0; i < 2000; i += 1)
                {
                    string filename = "epoch_" + learning_rate.ToString(CultureInfo.InvariantCulture) + "_" + ISO + "_" + i + ".cnn";
                    try
                    {
                        FileStream cnn = new FileStream(filename, FileMode.Open, FileAccess.Read);
                        start.RestoreValues(cnn);
                        cnn.Close();
                        cnn.Dispose();
                    }
                    catch (Exception)
                    {
                        Console.WriteLine("Skipping: " + i);
                        continue;
                    }

                    double errorL1  = 0;
                    double errorL2  = 0;
                    double errorMix = 0;
                    for (int iter = 0; iter < testSize / batch * 4; iter++)
                    {
                        //Prepare batch for training:
                        for (int ba = 0; ba < batch / 4; ba++)
                        {
                            int idx = iter * (batch / 4) + ba + trainingSize;

                            float3[] original;
                            float[]  raw;
                            if (baRAW[idx - trainingSize] == null)
                            {
                                original = ReadRAWFloat3(fileTrouthList[idx]);
                                raw      = ReadRAWFloat(fileRawList[idx]);
                                baOriginal[idx - trainingSize] = original;
                                baRAW[idx - trainingSize]      = raw;
                            }
                            else
                            {
                                original = baOriginal[idx - trainingSize];
                                raw      = baRAW[idx - trainingSize];
                            }

                            rawd.CopyToDevice(raw);
                            imgA.CopyToDevice(original);

                            deBayerGreenKernel.RunSafe(rawd, imgB, patchSize4, new float3(0, 0, 0), WhiteBalanceFactors[idx]);
                            deBayerRedBlueKernel.RunSafe(rawd, imgB, patchSize4, new float3(0, 0, 0), WhiteBalanceFactors[idx]);
                            prepareDataKernel.RunSafe(imgA, imgB, groundTrouth, inputImgs, ba, normalization, WhiteBalanceFactors[idx]);
                        }

                        start.SetData(inputImgs);
                        final.SetGroundTrouth(groundTrouth);

                        float err = start.InferenceTraining(inputImgs);

                        errorMix += err;
                        errorL1  += final.GetError(FinalLayer.Norm.L1);
                        errorL2  += final.GetError(FinalLayer.Norm.L2);
                    }
                    Console.WriteLine("Results for: " + filename);
                    Console.WriteLine("Mean Error L1: " + errorL1 / testSize * batch / 4);
                    Console.WriteLine("Mean Error L2: " + errorL2 / testSize * batch / 4);
                    Console.WriteLine("Mean Error Mix: " + errorMix / testSize * batch / 4);
                    sw.Write((errorL1 / testSize * batch / 4).ToString().Replace(".", ","));
                    sw.Write(";");
                    sw.Write((errorL2 / testSize * batch / 4).ToString().Replace(".", ","));
                    sw.Write(";");
                    sw.Write((errorMix / testSize * batch / 4).ToString().Replace(".", ","));
                    sw.Write(";");
                    sw.WriteLine(filename);
                    sw.Flush();
                }
                sw.Close();
                csvResult.Close();
                csvResult.Dispose();
            }
            //or train existing network:
            else
            {
                double error      = 0;
                double errorEpoch = 0;
                for (int epoch = startEpoch; epoch < 2000; epoch++)
                {
                    errorEpoch = 0;
                    error      = 0;

                    for (int iter = 0; iter < trainingSize / batch * 4; iter++)
                    {
                        //Prepare batch for training:
                        for (int ba = 0; ba < batch / 4; ba++)
                        {
                            int idx = iter * (batch / 4) + ba;

                            float3[] original;
                            float[]  raw;
                            if (baRAW[idx] == null)
                            {
                                original        = ReadRAWFloat3(fileTrouthList[idx]);
                                raw             = ReadRAWFloat(fileRawList[idx]);
                                baOriginal[idx] = original;
                                baRAW[idx]      = raw;
                            }
                            else
                            {
                                original = baOriginal[idx];
                                raw      = baRAW[idx];
                            }

                            rawd.CopyToDevice(raw);
                            imgA.CopyToDevice(original);

                            deBayerGreenKernel.RunSafe(rawd, imgB, patchSize4, new float3(0, 0, 0), WhiteBalanceFactors[idx]);
                            deBayerRedBlueKernel.RunSafe(rawd, imgB, patchSize4, new float3(0, 0, 0), WhiteBalanceFactors[idx]);
                            prepareDataKernel.RunSafe(imgA, imgB, groundTrouth, inputImgs, ba, normalization, WhiteBalanceFactors[idx]);
                        }

                        start.SetData(inputImgs);
                        final.SetGroundTrouth(groundTrouth);

                        float err = start.InferenceTraining(inputImgs);

                        final.BackPropagation(groundTrouth);

                        start.UpdateWeights(GetLearningRate(epoch * (trainingSize) / batch * 4 + iter));//*0+951342

                        error      += err;
                        errorEpoch += err;
                        if ((epoch * trainingSize / batch * 4 + iter) % 1000 == 0 && iter != 0)
                        {
                            FileStream   status = new FileStream("status_" + learning_rate.ToString(CultureInfo.InvariantCulture) + "_" + ISO + ".csv", FileMode.Append, FileAccess.Write);
                            StreamWriter sw     = new StreamWriter(status);

                            sw.WriteLine((error / 1000.0).ToString().Replace(".", ",") + ";" + GetLearningRate(epoch * trainingSize / batch * 4 + iter).ToString().Replace(".", ","));

                            sw.Close();
                            status.Close();
                            status.Dispose();
                            error = 0;
                        }

                        //if ((epoch * trainingSize / batch * 4 + iter) % 10000 == 0)
                        //{
                        //    fs = new FileStream("iter_" + learning_rate.ToString(CultureInfo.InvariantCulture) + "_" + ISO + "_" + (epoch * trainingSize / batch * 4 + iter) + ".cnn", FileMode.Create, FileAccess.Write);
                        //    start.SaveValues(fs);
                        //    fs.Close();
                        //    fs.Dispose();
                        //    Console.WriteLine("Network saved for iteration " + (epoch * trainingSize / batch * 4 + iter) + "!");
                        //}

                        Console.WriteLine("Epoch: " + epoch + " Iteration: " + (epoch * trainingSize / batch * 4 + iter) + ", Error: " + err);

                        if (saveImages && iter == 0)//(epoch * trainingSize / batch * 4 + iter) % 10000 == 0 &&
                        {
                            for (int i = 0; i < 1; i++)
                            {
                                int    imgidx = randImageOutput.Next(batch);
                                float3 wb     = WhiteBalanceFactors[iter * (batch / 4) + imgidx / 4];
                                restoreImageKernel.RunSafe(groundTrouth, imgU3a, imgidx, wb.x, wb.y, wb.z, normalization);
                                restoreImageKernel.RunSafe(inputImgs, imgU3b, imgidx, wb.x, wb.y, wb.z, normalization);
                                CudaDeviceVariable <float> res = final.GetResult();
                                restoreImageKernel.RunSafe(res, imgU3c, imgidx, wb.x, wb.y, wb.z, normalization);

                                imgU3a.CopyToHost(a);
                                imgU3b.CopyToHost(b);
                                imgU3c.CopyToHost(c);

                                a.Save("GroundTrouth_" + learning_rate.ToString(CultureInfo.InvariantCulture) + "_" + ISO + "_" + epoch + "_" + imgidx + ".png");// * trainingSize / batch * 4 + iter
                                b.Save("Input_" + learning_rate.ToString(CultureInfo.InvariantCulture) + "_" + ISO + "_" + epoch + "_" + imgidx + ".png");
                                c.Save("Result_" + learning_rate.ToString(CultureInfo.InvariantCulture) + "_" + ISO + "_" + epoch + "_" + imgidx + ".png");
                            }
                        }
                    }
                    errorEpoch /= trainingSize / batch * 4;
                    fs          = new FileStream("errorEpoch_" + learning_rate.ToString(CultureInfo.InvariantCulture) + "_" + ISO + ".csv", FileMode.Append, FileAccess.Write);
                    StreamWriter sw2 = new StreamWriter(fs);
                    sw2.WriteLine(errorEpoch.ToString().Replace(".", ","));
                    sw2.Close();
                    fs.Close();
                    fs.Dispose();

                    fs = new FileStream("epoch_" + learning_rate.ToString(CultureInfo.InvariantCulture) + "_" + ISO + "_" + epoch + ".cnn", FileMode.Create, FileAccess.Write);
                    start.SaveValues(fs);
                    fs.Close();
                    fs.Dispose();
                }
            }
        }