Example #1
0
        /// <summary>
        /// Runs one simulation step, draws the particle points and, every
        /// <c>fpsLimit</c> frames, writes a frame-rate figure into the window title.
        /// </summary>
        /// <param name="iter">
        /// Frame counter. Whenever it is a multiple of 1000 the R, G and B fields
        /// receive new random values.
        /// </param>
        private void display(Int64 iter)
        {
            stopwatch.Start();
            simulateFluids();

            // Draw the points from the vertex buffer onto a cleared colour buffer.
            GL.Clear(ClearBufferMask.ColorBufferBit);

            bool pickNewColour = (iter % 1000) == 0;
            if (pickNewColour)
            {
                R = (float)rnd.NextDouble();
                G = (float)rnd.NextDouble();
                B = (float)rnd.NextDouble();
            }

            // The two calls split the range [0, DS) at its midpoint and pass
            // literal colours (the R/G/B fields above are not used here).
            var midpoint = DS / 2;
            DrawPoints(1.0f, 0.0f, 0.0f, 0, midpoint);
            DrawPoints(0.0f, 1.0f, 0.0f, midpoint, DS);

            // Stop timing ahead of the buffer swap so refresh sync is not measured.
            stopwatch.Stop();
            m_renderControl.SwapBuffers();

            fpsCount++;
            if (fpsCount != fpsLimit)
            {
                return;
            }

            // The elapsed time is divided by 1000 before being inverted into a rate.
            float framesPerSecond = 1.0f / (stopwatch.GetElapsedTime() / 1000.0f);
            this.Text = string.Format(System.Globalization.CultureInfo.InvariantCulture, "Cuda/GL Stable Fluids ({0} x {1}): {2} fps", DIM, DIM, framesPerSecond);

            // Restart the frame count; the next title refresh happens after
            // roughly one measured-rate's worth of frames (at least one).
            fpsCount = 0;
            fpsLimit = (int)Math.Max(framesPerSecond, 1.0f);
        }
Example #2
0
    /// <summary>
    /// Loads the "VecAdd" kernel from the PTX file at <c>resName</c>, runs it once
    /// over <c>Count</c> elements, and logs the elapsed time together with the
    /// first and last element of the operands and the result.
    /// </summary>
    /// <remarks>
    /// Opening the file throws (for example <see cref="FileNotFoundException"/>)
    /// when <c>resName</c> cannot be opened; a reader's BaseStream is never null,
    /// so no separate null check is needed.
    /// </remarks>
    public void CUDA_AddFloatArrays()
    {
        // Dispose the reader (and with it the underlying file handle) as soon as
        // the kernel is loaded.
        // NOTE(review): assumes LoadKernelPTX consumes the stream before it
        // returns and does not keep a reference to it - confirm.
        using (var reader = new StreamReader(resName))
        {
            vectorAddKernel = ctx.LoadKernelPTX(reader.BaseStream, "VecAdd");
        }

        var threadsPerBlock = 1024;

        vectorAddKernel.BlockDimensions = threadsPerBlock;
        // Round up so that a partially filled last block is still launched.
        vectorAddKernel.GridDimensions  = (Count + threadsPerBlock - 1) / threadsPerBlock;

        CudaStopWatch w = new CudaStopWatch();

        try
        {
            w.Start();
            vectorAddKernel.Run(d_A.DevicePointer, d_B.DevicePointer, C.DevicePointer, Count);
            w.Stop();

            // Wait for the stop event before asking for the elapsed time.
            w.StopEvent.Synchronize();

            Debug.Log(w.GetElapsedTime() / 1000.0f);
        }
        finally
        {
            // The stopwatch is released on every path, including failures.
            w.Dispose();
        }

        Debug.Log($"{h_A[0]} + {h_B[0]} = {C[0]}");
        Debug.Log($"{h_A[Count-1]} + {h_B[Count-1]} = {C[Count-1]}");
    }
Example #3
0
        /// <summary>
        /// Runs one fluid simulation step, advects the particles inside the mapped
        /// Direct3D vertex buffer, renders them as point sprites and, every
        /// <c>fpsLimit</c> frames, writes a frame-rate figure into the window title.
        /// </summary>
        private void display()
        {
            stopwatch.Start();

            // Velocity step: advect, run both r2c plans, diffuse/project,
            // run both c2r plans, then update the velocity field.
            // NOTE(review): r2c / c2r are presumably forward and inverse FFT plans - confirm.
            advectVelocity(g_dvfield, g_vxfield, g_vyfield, DIM, RPADW, DIM, DT, g_tPitch);

            g_planr2c.Exec(g_vxfield.DevicePointer);
            g_planr2c.Exec(g_vyfield.DevicePointer);

            diffuseProject(g_vxfield, g_vyfield, CPADW, DIM, DT, VIS, g_tPitch);

            g_planc2r.Exec(g_vxfield.DevicePointer);
            g_planc2r.Exec(g_vyfield.DevicePointer);

            updateVelocity(g_dvfield, g_vxfield, g_vyfield, DIM, RPADW, DIM, g_tPitch);

            // The D3D9 vertex buffer is mapped into CUDA only while particles are advected.
            graphicsres.MapAllResources();
            g_mparticles = graphicsres[0].GetMappedPointer<vertex>();
            advectParticles(g_mparticles, g_dvfield, DIM, DIM, DT, g_tPitch);
            graphicsres.UnmapAllResources();

            // Render state: no depth writes, One/One blending, point sprites.
            device.Clear(ClearFlags.Target, new Color4(0.0f, 0, 0), 0.0f, 0);
            device.SetRenderState(RenderState.ZWriteEnable, false);
            device.SetRenderState(RenderState.AlphaBlendEnable, true);
            device.SetRenderState(RenderState.SourceBlend, Blend.One);
            device.SetRenderState(RenderState.DestinationBlend, Blend.One);
            device.SetRenderState(RenderState.PointSpriteEnable, true);

            const float pointSpriteSize = 16.0f;
            device.SetRenderState(RenderState.PointSize, pointSpriteSize);
            device.SetTexture(0, g_pTexture);

            bool sceneStarted = device.BeginScene().IsSuccess;
            if (sceneStarted)
            {
                // Draw all DS particles as a point list straight from the vertex buffer.
                int vertexStride = Marshal.SizeOf(typeof(vertex));
                device.SetStreamSource(0, g_pVB, 0, vertexStride);
                device.VertexFormat = VertexFormat.Position | VertexFormat.Diffuse;
                device.DrawPrimitives(PrimitiveType.PointList, 0, DS);
                device.EndScene();
            }

            // Timing ends before the frame is presented.
            stopwatch.Stop();

            device.Present();
            fpsCount++;

            if (fpsCount != fpsLimit)
            {
                return;
            }

            // The elapsed time is divided by 1000 before being inverted into a rate.
            float framesPerSecond = 1.0f / (stopwatch.GetElapsedTime() / 1000.0f);
            this.Text = string.Format(System.Globalization.CultureInfo.InvariantCulture, "CUDA/D3D9 Stable Fluids ({0} x {1}): {2} fps", DIM, DIM, framesPerSecond);

            fpsCount = 0;
            fpsLimit = (int)Math.Max(framesPerSecond, 1.0f);
        }
Example #4
0
        /// <summary>
        /// Runs one simulation step, draws the <c>DS</c> particle points from the
        /// vertex buffer object and, every <c>fpsLimit</c> frames, writes a
        /// frame-rate figure into the window title.
        /// </summary>
        private void display()
        {
            stopwatch.Start();
            simulateFluids();

            // --- Point rendering state: smooth, alpha-blended points, size 1 ---
            GL.Clear(ClearBufferMask.ColorBufferBit);
            GL.Color4(0, 1, 0, 0.5f);
            GL.PointSize(1);
            GL.Enable(EnableCap.PointSmooth);
            GL.Enable(EnableCap.Blend);
            GL.BlendFunc(BlendingFactorSrc.SrcAlpha, BlendingFactorDest.OneMinusSrcAlpha);
            GL.EnableClientState(ArrayCap.VertexArray);
            GL.Disable(EnableCap.DepthTest);
            GL.Disable(EnableCap.CullFace);

            // --- Draw: two floats per vertex, tightly packed, read from vbo ---
            GL.BindBuffer(BufferTarget.ArrayBuffer, vbo);
            GL.VertexPointer(2, VertexPointerType.Float, 0, 0);
            GL.DrawArrays(BeginMode.Points, 0, DS);
            GL.BindBuffer(BufferTarget.ArrayBuffer, 0);

            // --- Reset client state ---
            GL.DisableClientState(ArrayCap.VertexArray);
            GL.DisableClientState(ArrayCap.TextureCoordArray);
            GL.Disable(EnableCap.Texture2D);

            // Stop timing ahead of the buffer swap so refresh sync is not measured.
            stopwatch.Stop();
            m_renderControl.SwapBuffers();

            fpsCount++;
            if (fpsCount != fpsLimit)
            {
                return;
            }

            // The elapsed time is divided by 1000 before being inverted into a rate.
            float framesPerSecond = 1.0f / (stopwatch.GetElapsedTime() / 1000.0f);
            this.Text = string.Format(System.Globalization.CultureInfo.InvariantCulture, "Cuda/GL Stable Fluids ({0} x {1}): {2} fps", DIM, DIM, framesPerSecond);

            fpsCount = 0;
            fpsLimit = (int)Math.Max(framesPerSecond, 1.0f);
        }
Example #5
0
        /// <summary>
        /// Shifts a 2048 x 2048 float image once through a pitch-linear texture and
        /// once through an array texture, checks both results against a host-side
        /// reference, and prints bandwidth and texture fetch-rate figures.
        /// </summary>
        /// <param name="args">Command-line arguments; forwarded to the QA helpers and to findCudaDevice.</param>
        /// <exception cref="ArgumentException">The embedded PTX resource was not found.</exception>
        /// <exception cref="EndOfStreamException">The embedded PTX resource ended before its reported length was read.</exception>
        static void Main(string[] args)
        {
            const int nx = 2048;
            const int ny = 2048;

            // shifts applied to x and y data
            const int x_shift = 5;
            const int y_shift = 7;

            ShrQATest.shrQAStart(args);

            if ((nx % TILE_DIM != 0) || (ny % TILE_DIM != 0))
            {
                Console.Write("nx and ny must be multiples of TILE_DIM\n");
                ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_WAIVED);
            }

            // execution configuration parameters: one thread per element, TILE_DIM x TILE_DIM per block
            dim3 grid    = new dim3(nx / TILE_DIM, ny / TILE_DIM, 1);
            dim3 threads = new dim3(TILE_DIM, TILE_DIM, 1);

            // This will pick the best possible CUDA capable device.
            // The returned device id is not needed afterwards.
            findCudaDevice(args);

            // Load the kernel image from the embedded resources; the PTX file
            // name depends on the pointer size of the running process.
            string resName = (IntPtr.Size == 8)
                ? "simplePitchLinearTexture_x64.ptx"
                : "simplePitchLinearTexture.ptx";

            string resNamespace = "simplePitchLinearTexture";
            string resource     = resNamespace + "." + resName;

            byte[] kernels;

            // The resource stream is only needed until its bytes are in memory.
            using (Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource))
            {
                if (stream == null)
                {
                    throw new ArgumentException("Kernel not found in resources.");
                }

                kernels = new byte[stream.Length];

                int offset = 0;
                while (offset < kernels.Length)
                {
                    int bytesRead = stream.Read(kernels, offset, kernels.Length - offset);

                    // Read returns 0 at end of stream; without this check a short
                    // stream would keep the loop spinning forever.
                    if (bytesRead == 0)
                    {
                        throw new EndOfStreamException("Kernel resource ended before its reported length was read.");
                    }

                    offset += bytesRead;
                }
            }

            CudaKernel PLKernel    = ctx.LoadKernelPTX(kernels, "shiftPitchLinear");
            CudaKernel ArrayKernel = ctx.LoadKernelPTX(kernels, "shiftArray");

            CudaStopWatch stopwatch = new CudaStopWatch();

            // ----------------------------------
            // Host allocation and initialization
            // ----------------------------------

            float[] h_idata = new float[nx * ny];
            float[] h_odata = new float[nx * ny];
            float[] gold    = new float[nx * ny];

            for (int i = 0; i < nx * ny; ++i)
            {
                h_idata[i] = (float)i;
            }

            // ------------------------
            // Device memory allocation
            // ------------------------

            // Pitch linear input data
            CudaPitchedDeviceVariable<float> d_idataPL = new CudaPitchedDeviceVariable<float>(nx, ny);

            // Array input data
            CudaArray2D d_idataArray = new CudaArray2D(CUArrayFormat.Float, nx, ny, CudaArray2DNumChannels.One);

            // Pitch linear output data
            CudaPitchedDeviceVariable<float> d_odata = new CudaPitchedDeviceVariable<float>(nx, ny);

            // ------------------------
            // copy host data to device
            // ------------------------

            // Pitch linear
            d_idataPL.CopyToDevice(h_idata);

            // Array
            d_idataArray.CopyFromHostToThis<float>(h_idata);

            // ----------------------
            // Bind texture to memory
            // ----------------------

            // Both textures use wrap addressing, point filtering and normalized coordinates.
            CudaTextureLinearPitched2D<float> texRefPL = new CudaTextureLinearPitched2D<float>(PLKernel, "texRefPL", CUAddressMode.Wrap, CUFilterMode.Point, CUTexRefSetFlags.NormalizedCoordinates, CUArrayFormat.Float, d_idataPL);
            CudaTextureArray2D texRefArray = new CudaTextureArray2D(ArrayKernel, "texRefArray", CUAddressMode.Wrap, CUFilterMode.Point, CUTexRefSetFlags.NormalizedCoordinates, d_idataArray);

            // ---------------------
            // reference calculation
            // ---------------------

            // Host-side wrap-around shift: gold[j, i] = input[(j + y_shift) % ny, (i + x_shift) % nx]
            for (int j = 0; j < ny; j++)
            {
                int jshift = (j + y_shift) % ny;
                for (int i = 0; i < nx; i++)
                {
                    int ishift = (i + x_shift) % nx;
                    gold[j * nx + i] = h_idata[jshift * nx + ishift];
                }
            }

            bool success = true;

            // ----------------
            // shiftPitchLinear
            // ----------------

            ctx.ClearMemory(d_odata.DevicePointer, 0, d_odata.TotalSizeInBytes);
            PLKernel.BlockDimensions = threads;
            PLKernel.GridDimensions  = grid;
            stopwatch.Start();
            for (int i = 0; i < NUM_REPS; i++)
            {
                // The pitch is passed in elements, not bytes.
                PLKernel.Run(d_odata.DevicePointer, (int)(d_odata.Pitch / sizeof(float)), nx, ny, x_shift, y_shift);
            }
            stopwatch.Stop();
            stopwatch.StopEvent.Synchronize();
            float timePL = stopwatch.GetElapsedTime();

            // check results
            d_odata.CopyToHost(h_odata);

            if (!cutComparef(gold, h_odata))
            {
                Console.Write("*** shiftPitchLinear failed ***\n");
                success = false;
            }

            // ----------
            // shiftArray
            // ----------

            ctx.ClearMemory(d_odata.DevicePointer, 0, d_odata.TotalSizeInBytes);
            ArrayKernel.BlockDimensions = threads;
            ArrayKernel.GridDimensions  = grid;
            stopwatch.Start();
            for (int i = 0; i < NUM_REPS; i++)
            {
                ArrayKernel.Run(d_odata.DevicePointer, (int)(d_odata.Pitch / sizeof(float)), nx, ny, x_shift, y_shift);
            }

            stopwatch.Stop();
            stopwatch.StopEvent.Synchronize();
            float timeArray = stopwatch.GetElapsedTime();

            // check results
            d_odata.CopyToHost(h_odata);

            if (!cutComparef(gold, h_odata))
            {
                Console.Write("*** shiftArray failed ***\n");
                success = false;
            }

            // Factor 2 counts one read plus one write per element; the factor 1000
            // scales the per-repetition time before dividing.
            float bandwidthPL    = 2.0f * 1000.0f * nx * ny * sizeof(float) / (1e+9f) / (timePL / NUM_REPS);
            float bandwidthArray = 2.0f * 1000.0f * nx * ny * sizeof(float) / (1e+9f) / (timeArray / NUM_REPS);

            Console.Write("\nBandwidth (GB/s) for pitch linear: {0}; for array: {1}\n",
                          bandwidthPL, bandwidthArray);

            float fetchRatePL    = nx * ny / 1e+6f / (timePL / (1000.0f * NUM_REPS));
            float fetchRateArray = nx * ny / 1e+6f / (timeArray / (1000.0f * NUM_REPS));

            Console.Write("\nTexture fetch rate (Mpix/s) for pitch linear: {0}; for array: {1}\n\n",
                          fetchRatePL, fetchRateArray);


            // cleanup
            texRefPL.Dispose();
            texRefArray.Dispose();
            d_idataPL.Dispose();
            d_idataArray.Dispose();
            d_odata.Dispose();
            stopwatch.Dispose();
            ctx.Dispose();

            ShrQATest.shrQAFinishExit(args, success ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED);
        }
Example #6
0
        /// <summary>
        /// Computes a segmentation from the current trimap. Two passes run on the
        /// reduced-size image first, the resulting alpha is upsampled, and the
        /// full-size image is then iterated until the segmentation stops changing
        /// or <c>iteration</c> exceeds <c>MAX_ITERATIONS</c>.
        /// </summary>
        /// <remarks>
        /// Resets <c>iteration</c> and <c>current_alpha</c>, leaves the final alpha in
        /// <c>d_alpha[current_alpha]</c>, and stores the measured time in <c>runtime</c>.
        /// A message box is shown if the iteration limit is hit.
        /// </remarks>
        public void computeSegmentationFromTrimap()
        {
            CudaStopWatch stopwatch = new CudaStopWatch();

            try
            {
                stopwatch.Start();

                iteration     = 0;
                current_alpha = 0;

                // Solve Grabcut on lower resolution first. Reduces total computation time.
                createSmallTrimap();

                d_alpha[0].AsyncCopyToDevice(d_small_trimap[small_trimap_idx], new CUstream());

                // Two low-resolution passes. d_alpha is double-buffered: each graph cut
                // writes into the buffer that is not current, then the index flips.
                for (int i = 0; i < 2; ++i)
                {
                    grabCutGMM.GMMInitialize(gmms, d_gmm, d_scratch_mem, gmm_pitch, d_small_image, d_alpha[current_alpha], small_size.width, small_size.height);

                    grabCutGMM.GMMUpdate(gmms, d_gmm, d_scratch_mem, gmm_pitch, d_small_image, d_alpha[current_alpha], small_size.width, small_size.height);

                    grabCutGMM.EdgeCues(edge_strength, d_small_image, d_left_transposed, d_right_transposed, d_top, d_bottom, d_topleft, d_topright, d_bottomleft, d_bottomright, small_size.width, small_size.height, d_scratch_mem);

                    grabCutGMM.DataTerm(d_terminals, gmms, d_gmm, gmm_pitch, d_small_image, d_small_trimap[small_trimap_idx], small_size.width, small_size.height);

                    graphcut8Small.GraphCut(d_terminals, d_left_transposed, d_right_transposed, d_top, d_topleft, d_topright, d_bottom, d_bottomleft, d_bottomright, d_alpha[1 - current_alpha]);

                    current_alpha = 1 - current_alpha;
                }

                // Upsample the low-resolution alpha into the other buffer and make it current.
                grabCutGMM.UpsampleAlpha(d_alpha[1 - current_alpha], d_alpha[current_alpha], size.width, size.height, small_size.width, small_size.height);
                current_alpha = 1 - current_alpha;

                grabCutGMM.GMMInitialize(gmms, d_gmm, d_scratch_mem, gmm_pitch, d_image, d_alpha[current_alpha], size.width, size.height);
                grabCutGMM.GMMUpdate(gmms, d_gmm, d_scratch_mem, gmm_pitch, d_image, d_alpha[current_alpha], size.width, size.height);

                while (true)
                {
                    grabCutGMM.EdgeCues(edge_strength, d_image, d_left_transposed, d_right_transposed, d_top, d_bottom, d_topleft, d_topright, d_bottomleft, d_bottomright, size.width, size.height, d_scratch_mem);
                    grabCutGMM.DataTerm(d_terminals, gmms, d_gmm, gmm_pitch, d_image, d_trimap, size.width, size.height);

                    // Flip the buffer index before the cut (same effect as 1 - current_alpha for 0/1).
                    current_alpha = 1 ^ current_alpha;

                    graphcut8.GraphCut(d_terminals, d_left_transposed, d_right_transposed, d_top, d_topleft, d_topright, d_bottom, d_bottomleft, d_bottomright, d_alpha[current_alpha]);

                    // The first full-resolution pass is never tested for convergence.
                    if (iteration > 0)
                    {
                        bool changed = grabCutGMM.SegmentationChanged(d_scratch_mem, d_alpha[1 - current_alpha], d_alpha[current_alpha], size.width, size.height);

                        // Solution has converged
                        if (!changed)
                        {
                            break;
                        }
                    }

                    if (iteration > MAX_ITERATIONS)
                    {
                        // Does not converge, fallback to rect selection
                        System.Windows.Forms.MessageBox.Show("Warning: Color models did not converge after " + MAX_ITERATIONS + " iterations.");
                        break;
                    }

                    grabCutGMM.GMMInitialize(gmms, d_gmm, d_scratch_mem, gmm_pitch, d_image, d_alpha[current_alpha], size.width, size.height);
                    grabCutGMM.GMMUpdate(gmms, d_gmm, d_scratch_mem, gmm_pitch, d_image, d_alpha[current_alpha], size.width, size.height);

                    iteration++;
                }

                stopwatch.Stop();
                stopwatch.StopEvent.Synchronize();

                runtime = stopwatch.GetElapsedTime();
            }
            finally
            {
                // A new stopwatch is created on every call, so release it on every path.
                stopwatch.Dispose();
            }
        }
Example #7
0
        /// <summary>
        /// Click handler that processes the currently loaded <c>pef</c> raw file:
        /// uploads the raw data, debayers it, runs the denoising network, applies a
        /// colour matrix and a tone LUT to both the untouched and the denoised
        /// image, and shows the two results in <c>pictureBox2</c> / <c>pictureBox3</c>.
        /// Does nothing when no file is loaded.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void Btn_ProcessPEF_Click(object sender, EventArgs e)
        {
            if (pef == null)
            {
                return;
            }

            // The network file name is built from the learning rate and the folder
            // name that belongs to the selected ISO entry.
            denoiseAndDemoisaic.LoadNetwork("epoch_" + learningRate.ToString(CultureInfo.InvariantCulture) + "_" + noiseLevelsFolders[cmb_IsoValue.SelectedIndex] + "_1999.cnn");
            NPPImage_16uC1 rawTemp = new NPPImage_16uC1(pef.RawWidth, pef.RawHeight);

            // Upload the 16-bit raw data and convert it into imageBayer.
            rawTemp.CopyToDevice(pef.RawImage);
            rawTemp.Convert(imageBayer);

            // NOTE(review): BlackPoint and WhitePoint are read at indices 0, 1 and 3
            // (index 2 is skipped) - presumably a four-entry per-channel set with two
            // green values; confirm against the PEF reader.
            float  whiteLevelAll = pef.WhiteLevel.Value;
            float3 whitePoint    = new float3(whiteLevelAll, whiteLevelAll, whiteLevelAll);
            float3 blackPoint    = new float3(pef.BlackPoint.Value[0], pef.BlackPoint.Value[1], pef.BlackPoint.Value[3]);

            // whitePoint now holds the usable range above the black level.
            whitePoint -= blackPoint;
            float  scale   = pef.Scaling.Value;
            float3 scaling = new float3(pef.WhitePoint.Value[0] / scale, pef.WhitePoint.Value[1] / scale, pef.WhitePoint.Value[3] / scale);


            // Debayer into a zeroed image (green first, then red/blue), then
            // divide each channel by range * scaling.
            inputImage32f.Set(new float[] { 0, 0, 0 });
            deBayerGreenKernel.RunSafe(imageBayer, inputImage32f, blackPoint, scaling);
            deBayerRedBlueKernel.RunSafe(imageBayer, inputImage32f, blackPoint, scaling);
            inputImage32f.Div(new float[] { whitePoint.x *scaling.x, whitePoint.y *scaling.y, whitePoint.z *scaling.z });



            highlightRecoveryKernel.RunSafe(inputImage32f, new float3(scaling.x, scaling.y, scaling.z), 1);

            // Offset by -0.5 for the network input (presumably written to
            // noiseImage32f); the matching +0.5 is applied to the result below.
            inputImage32f.Sub(new float[] { 0.5f, 0.5f, 0.5f }, noiseImage32f);

            Console.WriteLine("Start denoising...");
            CudaStopWatch csw = new CudaStopWatch();

            // NOTE(review): the elapsed time is read right after Stop() without
            // synchronizing on the stop event - confirm RunImage has completed by then.
            csw.Start();
            denoiseAndDemoisaic.RunImage(noiseImage32f, resultImage32f);
            csw.Stop();
            Console.WriteLine("Needed time: " + csw.GetElapsedTime() + " [msec]");
            csw.Dispose();

            resultImage32f.Add(new float[] { 0.5f, 0.5f, 0.5f });

            ColorManagment cm = new ColorManagment();

            // The neutral point handed to the colour management is the
            // reciprocal of the per-channel scaling.
            float3  wp        = 1.0f / scaling;
            double3 wb        = new double3(wp.x, wp.y, wp.z);
            double2 neutralXY = cm.NeutralToXY(wb);

            cm.SetWhiteXY(neutralXY);
            ColorMatrix camToXYZ2 = cm.CameraToPCS;


            // Constant 3x3 matrices, named as D50->D65 and D65->sRGB conversions,
            // chained after the camera matrix.
            ColorMatrix d50Tod65  = new ColorMatrix(new double[] { 0.9555766, -0.0230393, 0.0631636, -0.0282895, 1.0099416, 0.0210077, 0.0122982, -0.0204830, 1.3299098 });
            ColorMatrix d65TosRGB = new ColorMatrix(new double[] { 3.2406, -1.5372, -0.4986, -0.9689, 1.8758, 0.0415, 0.0557, -0.2040, 1.0570 });
            ColorMatrix final     = d65TosRGB * d50Tod65 * camToXYZ2;

            // Flatten the combined matrix row by row into nine floats for the kernel.
            float[] matData = new float[9];
            for (int i = 0; i < 3; i++)
            {
                for (int j = 0; j < 3; j++)
                {
                    matData[j + i * 3] = (float)final[i, j];
                }
            }

            // From here on every step is applied to both the noisy and the denoised image.
            camToXYZKernel.RunSafe(inputImage32f, matData);
            camToXYZKernel.RunSafe(resultImage32f, matData);

            //This is a LUT that maps well to most of the JPEGs out of camera, but not always... found somewhere on internet, if I remember well from darktable?
            // x holds the 20 curve input levels, y the matching output values.
            float[] x = new float[] { 0,
                                      0.004754f,
                                      0.009529f,
                                      0.023713f,
                                      0.031866f,
                                      0.046734f,
                                      0.059989f,
                                      0.088415f,
                                      0.13661f,
                                      0.17448f,
                                      0.205192f,
                                      0.228896f,
                                      0.286411f,
                                      0.355314f,
                                      0.440014f,
                                      0.567096f,
                                      0.620597f,
                                      0.760355f,
                                      0.875139f,
                                      1 };
            float[] y = new float[] { 0,
                                      0.002208f,
                                      0.004214f,
                                      0.013508f,
                                      0.020352f,
                                      0.034063f,
                                      0.052413f,
                                      0.09603f,
                                      0.190629f,
                                      0.256484f,
                                      0.30743f,
                                      0.348447f,
                                      0.42868f,
                                      0.513527f,
                                      0.607651f,
                                      0.732791f,
                                      0.775968f,
                                      0.881828f,
                                      0.960682f,
                                      1 };

            // Implicit conversion creates device copies of both tables
            // (disposed at the end of this method).
            CudaDeviceVariable <float> d_x = x;
            CudaDeviceVariable <float> d_y = y;

            // The same curve is used for all three channels.
            inputImage32f.LUTCubic(new CudaDeviceVariable <float>[] { d_y, d_y, d_y }, new CudaDeviceVariable <float>[] { d_x, d_x, d_x });
            resultImage32f.LUTCubic(new CudaDeviceVariable <float>[] { d_y, d_y, d_y }, new CudaDeviceVariable <float>[] { d_x, d_x, d_x });

            convertRGBTosRGBKernel.RunSafe(inputImage32f);
            convertRGBTosRGBKernel.RunSafe(resultImage32f);

            inputImage32f.Convert(noisyImage8u, NppRoundMode.Near);
            resultImage32f.Convert(resultImage8u, NppRoundMode.Near);

            // The noisy image is copied into its bitmap with a 2-pixel margin...
            noisyImage8u.SetRoi(0, 0, bmpNoisy.Width - 4, bmpNoisy.Height - 4);
            noisyImage8u.CopyToHostRoi(bmpNoisy, new NppiRect(2, 2, bmpNoisy.Width - 4, bmpNoisy.Height - 4));

            // ...and the denoised image with an 8-pixel margin.
            resultImage8u.SetRoi(0, 0, bmpResult.Width - 16, bmpResult.Height - 16);
            resultImage8u.CopyToHostRoi(bmpResult, new NppiRect(8, 8, bmpResult.Width - 16, bmpResult.Height - 16));

            pictureBox2.Image = bmpNoisy;
            pictureBox3.Image = bmpResult;

            // Release the temporaries allocated by this call.
            rawTemp.Dispose();
            d_y.Dispose();
            d_x.Dispose();
        }
Example #8
0
        /// <summary>
        /// Click handler for a regular bitmap input: uploads <c>bmpInput</c>, builds
        /// a Bayer image from it tile by tile, debayers that image, runs the
        /// denoising network on it and shows the noisy and the denoised image in
        /// <c>pictureBox2</c> / <c>pictureBox3</c>. Does nothing when no bitmap is loaded.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void btn_Process_Click(object sender, EventArgs e)
        {
            if (bmpInput == null)
            {
                return;
            }

            // Network file name: learning rate plus the folder name of the selected ISO entry.
            string networkFile = "epoch_" + learningRate.ToString(CultureInfo.InvariantCulture) + "_" + noiseLevelsFolders[cmb_IsoValue.SelectedIndex] + "_1999.cnn";
            denoiseAndDemoisaic.LoadNetwork(networkFile);

            var bitmapRect = new NppiRect(0, 0, bmpInput.Width, bmpInput.Height);

            if (bmpInput.PixelFormat != PixelFormat.Format32bppArgb)
            {
                // Three-channel input: upload directly and apply the colour twist.
                inputImage8uC3.CopyToDeviceRoi(bmpInput, bitmapRect);
                inputImage8uC3.ColorTwist(twist);
            }
            else
            {
                inputImage8uC4.CopyToDeviceRoi(bmpInput, bitmapRect);

                // Convert C4 to C3 and BGR to RGB: source channel k goes through the
                // single-channel scratch image into destination channel 2 - k.
                for (int sourceChannel = 0; sourceChannel < 3; sourceChannel++)
                {
                    inputImage8uC4.Copy(inputImage8uC1, sourceChannel);
                    inputImage8uC1.Copy(inputImage8uC3, 2 - sourceChannel);
                }
            }

            // Float copy of the input, divided by 255 per channel.
            inputImage32f.Convert(inputImage32f: inputImage32f);
            inputImage32f.Div(new float[] { 255, 255, 255 });

            // Build the Bayer image tile by tile, then restore the full-image ROIs.
            NppiRect fullRoi = new NppiRect(0, 0, inputImage32f.WidthRoi, inputImage32f.HeightRoi);
            IEnumerable<Tiler.RoiInputOutput> tileRois = Tiler.GetROIs(fullRoi, TileSize, 0);

            foreach (var tileRoi in tileRois)
            {
                inputImage32f.SetRoi(tileRoi.inputROI);
                tile.ResetRoi();
                inputImage32f.Copy(tile);
                tile.SetRoi(tileRoi.outputROI);
                imageBayer.SetRoi(tileRoi.positionInFinalImage);
                createBayerKernel.RunSafe(CuRandStates, tile, imageBayer, noiseLevels[cmb_IsoValue.SelectedIndex], 0);
            }
            imageBayer.SetRoi(fullRoi);
            inputImage32f.SetRoi(fullRoi);

            // Debayer with a zero float3 and a (1, 1, 1) float3 as the two extra arguments.
            deBayerGreenKernel.RunSafe(imageBayer, inputImage32f, new float3(), new float3(1, 1, 1));
            deBayerRedBlueKernel.RunSafe(imageBayer, inputImage32f, new float3(), new float3(1, 1, 1));

            // 8-bit copy of the noisy image for display.
            inputImage32f.Mul(new float[] { 255, 255, 255 }, noiseImage32f);
            noiseImage32f.Convert(noisyImage8u, NppRoundMode.Near);
            noisyImage8u.ColorTwist(twist);
            noisyImage8u.CopyToHostRoi(bmpNoisy, new NppiRect(0, 0, bmpNoisy.Width, bmpNoisy.Height));

            // Offset by -0.5 for the network; the matching +0.5 follows the run.
            inputImage32f.Sub(new float[] { 0.5f, 0.5f, 0.5f });

            // NOTE(review): the elapsed time is read right after Stop() without
            // synchronizing on the stop event - confirm RunImage has completed by then.
            CudaStopWatch denoiseTimer = new CudaStopWatch();
            denoiseTimer.Start();
            denoiseAndDemoisaic.RunImage(inputImage32f, resultImage32f);
            denoiseTimer.Stop();

            Console.WriteLine("Needed time: " + denoiseTimer.GetElapsedTime() + " [msec]");
            denoiseTimer.Dispose();

            // Back to 8 bit; the result is copied into its bitmap with an 8-pixel margin.
            resultImage32f.Add(new float[] { 0.5f, 0.5f, 0.5f });
            resultImage32f.Mul(new float[] { 255, 255, 255 });
            resultImage32f.Convert(resultImage8u, NppRoundMode.Near);
            resultImage8u.ColorTwist(twist);
            resultImage8u.SetRoi(0, 0, bmpResult.Width - 16, bmpResult.Height - 16);
            resultImage8u.CopyToHostRoi(bmpResult, new NppiRect(8, 8, bmpResult.Width - 16, bmpResult.Height - 16));

            pictureBox2.Image = bmpNoisy;
            pictureBox3.Image = bmpResult;
        }
        /// <summary>
        /// Optimises the measured shifts of all tiles with batched cuBLAS calls:
        /// up to 10 rounds of solve-and-check-for-outliers, followed by a final
        /// product that writes the result back into the shift buffers.
        /// Prints per-tile status values each round and the total time at the end.
        /// </summary>
        /// <param name="tileCountX">Number of tiles in x; the batch size is tileCountX * tileCountY.</param>
        /// <param name="tileCountY">Number of tiles in y.</param>
        public void MinimizeCUBLAS(int tileCountX, int tileCountY)
        {
            int shiftCount = GetShiftCount();

            // Gather the per-shift buffers into one array and keep a copy as "measured".
            concatenateShifts.RunSafe(shifts_d, shiftPitches, AllShifts_d, shiftCount, tileCountX, tileCountY);
            shiftsMeasured.CopyToDevice(AllShifts_d);

            CudaStopWatch sw = new CudaStopWatch();

            try
            {
                sw.Start();

                int imageCount = frameCount;
                int tileCount  = tileCountX * tileCountY;
                int n1         = imageCount - 1;
                int m          = shiftCount;

                status.Memset(0);
                shiftMatrices.Memset(0);
                float[] shiftMatrix = CreateShiftMatrix();
                shiftMatrices.CopyToDevice(shiftMatrix, 0, 0, shiftMatrix.Length * sizeof(float));

                // Replicate the host-built matrix for every tile and keep an unmodified
                // copy, which is the one used for the final product after the loop.
                copyShiftMatrixKernel.RunSafe(shiftMatrices, tileCount, imageCount, shiftCount);
                shiftSafeMatrices.CopyToDevice(shiftMatrices);

                for (int i = 0; i < 10; i++)
                {
                    // NOTE(review): the GEMM / inverse / GEMM sequence below looks like a
                    // normal-equations least-squares solve per tile - confirm.
                    blas.GemmBatched(Operation.Transpose, Operation.NonTranspose, n1, n1, m, one, shiftMatrixArray, m, shiftMatrixArray, m, zero, matrixSquareArray, n1, tileCount);

                    if (n1 <= 32)
                    {
                        //MatinvBatchedS can only invert up to 32x32 matrices
                        blas.MatinvBatchedS(n1, matrixSquareArray, n1, matrixInvertedArray, n1, infoInverse, tileCount);
                    }
                    else
                    {
                        // Larger matrices: batched factorisation followed by batched inversion.
                        blas.GetrfBatchedS(n1, matrixSquareArray, n1, pivotArray, infoInverse, tileCount);
                        blas.GetriBatchedS(n1, matrixSquareArray, n1, pivotArray, matrixInvertedArray, n1, infoInverse, tileCount);
                    }

                    blas.GemmBatched(Operation.NonTranspose, Operation.Transpose, n1, m, n1, one, matrixInvertedArray, n1, shiftMatrixArray, m, zero, solvedMatrixArray, n1, tileCount);
                    blas.GemmBatched(Operation.NonTranspose, Operation.Transpose, n1, 2, m, one, solvedMatrixArray, n1, shiftMeasuredArray, 2, zero, shiftOneToOneArray, n1, tileCount);
                    blas.GemmBatched(Operation.NonTranspose, Operation.NonTranspose, m, 2, n1, one, shiftMatrixArray, m, shiftOneToOneArray, n1, zero, shiftOptimArray, m, tileCount);

                    checkForOutliers.RunSafe(shiftsMeasured, shiftsOptim, shiftMatrices, status, infoInverse, tileCount, imageCount, shiftCount);

                    status.Sum(statusSum, buffer, 0);

                    // Implicit conversion copies the per-tile status values to the host.
                    int[] stats = status;

                    // Print every tile whose status is non-negative.
                    for (int j = 0; j < tileCount; j++)
                    {
                        if (stats[j] >= 0)
                        {
                            Console.Write(j + ": " + stats[j] + "; ");
                        }
                    }
                    Console.WriteLine();

                    // Stop early once the status values sum to -tileCount.
                    int stat = statusSum;
                    if (stat == -tileCount)
                    {
                        break;
                    }
                }

                // Final product uses the unmodified matrix copy and overwrites the measured shifts.
                blas.GemmBatched(Operation.NonTranspose, Operation.NonTranspose, m, 2, n1, one, shiftMatrixSafeArray, m, shiftOneToOneArray, n1, zero, shiftMeasuredArray, m, tileCount);

                AllShifts_d.Memset(0);
                transposeShifts.RunSafe(AllShifts_d, shiftsMeasured, shiftsOneToOne, shiftsOneToOne_d, tileCount, imageCount, shiftCount);

                sw.Stop();

                // Wait for the stop event before asking for the elapsed time.
                sw.StopEvent.Synchronize();
                Console.WriteLine("Time for optimisation: " + sw.GetElapsedTime() + " msec.");
            }
            finally
            {
                // A new stopwatch is created on every call, so release it on every path.
                sw.Dispose();
            }

            // Scatter the combined array back into the per-shift buffers.
            separateShifts.RunSafe(AllShifts_d, shifts_d, shiftPitches, shiftCount, tileCountX, tileCountY);
        }