private void display(Int64 iter)
{
    stopwatch.Start();
    simulateFluids();

    // render points from vertex buffer
    GL.Clear(ClearBufferMask.ColorBufferBit);

    if (iter % 1000 == 0)
    {
        R = (float)rnd.NextDouble();
        G = (float)rnd.NextDouble();
        B = (float)rnd.NextDouble();
    }

    DrawPoints(1.0f, 0.0f, 0.0f, 0, DS / 2);
    DrawPoints(0.0f, 1.0f, 0.0f, DS / 2, DS);

    // Finish timing before swap buffers to avoid refresh sync
    stopwatch.Stop();
    m_renderControl.SwapBuffers();

    fpsCount++;
    if (fpsCount == fpsLimit)
    {
        float ifps = 1.0f / (stopwatch.GetElapsedTime() / 1000.0f);
        string fps = string.Format(System.Globalization.CultureInfo.InvariantCulture,
            "Cuda/GL Stable Fluids ({0} x {1}): {2} fps", DIM, DIM, ifps);
        this.Text = fps;
        fpsCount = 0;
        fpsLimit = (int)Math.Max(ifps, 1.0f);
    }
}
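// DrawPoints is referenced above but not shown in this listing. A minimal sketch of
// what it is assumed to do, modeled on the full OpenGL point-rendering path in the
// other display() variant further below; "vbo" is assumed to be the same shared
// vertex-buffer field used there, and the last-minus-first count is an assumption.
private void DrawPoints(float r, float g, float b, int first, int last)
{
    GL.Color4(r, g, b, 0.5f);                             // color for this half of the particles
    GL.EnableClientState(ArrayCap.VertexArray);
    GL.BindBuffer(BufferTarget.ArrayBuffer, vbo);
    GL.VertexPointer(2, VertexPointerType.Float, 0, 0);
    GL.DrawArrays(BeginMode.Points, first, last - first); // draw the sub-range [first, last)
    GL.BindBuffer(BufferTarget.ArrayBuffer, 0);
    GL.DisableClientState(ArrayCap.VertexArray);
}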
public void CUDA_AddFloatArrays()
{
    // Load Kernel image from resources
    Stream stream = new StreamReader(resName).BaseStream;
    if (stream == null)
    {
        throw new ArgumentException("Kernel not found in resources.");
    }

    vectorAddKernel = ctx.LoadKernelPTX(stream, "VecAdd");

    var threadsPerBlock = 1024;
    vectorAddKernel.BlockDimensions = threadsPerBlock;
    vectorAddKernel.GridDimensions = (Count + threadsPerBlock - 1) / threadsPerBlock;

    CudaStopWatch w = new CudaStopWatch();
    w.Start();
    vectorAddKernel.Run(d_A.DevicePointer, d_B.DevicePointer, C.DevicePointer, Count);
    w.Stop();
    Debug.Log(w.GetElapsedTime() / 1000.0f);

    Debug.Log($"{h_A[0]} + {h_B[0]} = {C[0]}");
    Debug.Log($"{h_A[Count - 1]} + {h_B[Count - 1]} = {C[Count - 1]}");

    // Copy result from device memory to host memory
    // h_C contains the result in host memory
    // h_C = d_C;
}
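// The method above assumes a context, two uploaded input vectors, and a result buffer
// that are not shown in this listing. A minimal sketch of that setup using the same
// ManagedCuda types and field names; Count, resName, and the sample values here are
// placeholders, not taken from the original code.
CudaContext ctx;
CudaKernel vectorAddKernel;
CudaDeviceVariable<float> d_A, d_B, C;
float[] h_A, h_B;
int Count = 1 << 20;
string resName = "vectorAdd.ptx"; // assumed path to the compiled VecAdd PTX

void AllocateAndUpload()
{
    ctx = new CudaContext(0);                  // device 0
    h_A = new float[Count];
    h_B = new float[Count];
    for (int i = 0; i < Count; i++) { h_A[i] = i; h_B[i] = 2 * i; }

    d_A = h_A;                                 // implicit host-to-device copy
    d_B = h_B;
    C = new CudaDeviceVariable<float>(Count);  // result buffer written by VecAdd

    // After the kernel has run, the commented-out line above would read back as:
    // float[] h_C = C;                        // implicit device-to-host copy
}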
private void display()
{
    stopwatch.Start();

    advectVelocity(g_dvfield, g_vxfield, g_vyfield, DIM, RPADW, DIM, DT, g_tPitch);

    {
        g_planr2c.Exec(g_vxfield.DevicePointer);
        g_planr2c.Exec(g_vyfield.DevicePointer);

        diffuseProject(g_vxfield, g_vyfield, CPADW, DIM, DT, VIS, g_tPitch);

        g_planc2r.Exec(g_vxfield.DevicePointer);
        g_planc2r.Exec(g_vyfield.DevicePointer);
    }

    updateVelocity(g_dvfield, g_vxfield, g_vyfield, DIM, RPADW, DIM, g_tPitch);

    // Map D3D9 vertex buffer to CUDA
    {
        graphicsres.MapAllResources();
        g_mparticles = graphicsres[0].GetMappedPointer<vertex>();
        advectParticles(g_mparticles, g_dvfield, DIM, DIM, DT, g_tPitch);
        graphicsres.UnmapAllResources();
    }

    device.Clear(ClearFlags.Target, new Color4(0.0f, 0, 0), 0.0f, 0);
    device.SetRenderState(RenderState.ZWriteEnable, false);
    device.SetRenderState(RenderState.AlphaBlendEnable, true);
    device.SetRenderState(RenderState.SourceBlend, Blend.One);
    device.SetRenderState(RenderState.DestinationBlend, Blend.One);
    device.SetRenderState(RenderState.PointSpriteEnable, true);
    float size = 16.0f;
    device.SetRenderState(RenderState.PointSize, size);
    device.SetTexture(0, g_pTexture);

    if (device.BeginScene().IsSuccess)
    {
        Result res;
        // Draw particles
        res = device.SetStreamSource(0, g_pVB, 0, Marshal.SizeOf(typeof(vertex)));
        device.VertexFormat = VertexFormat.Position | VertexFormat.Diffuse;
        res = device.DrawPrimitives(PrimitiveType.PointList, 0, DS);
        device.EndScene();
    }

    stopwatch.Stop();
    device.Present();

    fpsCount++;
    if (fpsCount == fpsLimit)
    {
        float ifps = 1.0f / (stopwatch.GetElapsedTime() / 1000.0f);
        string fps = string.Format(System.Globalization.CultureInfo.InvariantCulture,
            "CUDA/D3D9 Stable Fluids ({0} x {1}): {2} fps", DIM, DIM, ifps);
        this.Text = fps;
        fpsCount = 0;
        fpsLimit = (int)Math.Max(ifps, 1.0f);
    }
}
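// The "vertex" type used above (via GetMappedPointer<vertex>() and
// Marshal.SizeOf(typeof(vertex))) is not shown in this listing. A hypothetical layout,
// inferred from VertexFormat.Position | VertexFormat.Diffuse; the exact field packing
// is an assumption. StructLayout comes from System.Runtime.InteropServices, which the
// sample already references through Marshal.
[StructLayout(LayoutKind.Sequential)]
struct vertex
{
    public float x, y, z; // position
    public uint color;    // diffuse color, packed ARGB
}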
private void display()
{
    stopwatch.Start();
    simulateFluids();

    // render points from vertex buffer
    GL.Clear(ClearBufferMask.ColorBufferBit);
    GL.Color4(0, 1, 0, 0.5f);
    GL.PointSize(1);
    GL.Enable(EnableCap.PointSmooth);
    GL.Enable(EnableCap.Blend);
    GL.BlendFunc(BlendingFactorSrc.SrcAlpha, BlendingFactorDest.OneMinusSrcAlpha);
    GL.EnableClientState(ArrayCap.VertexArray);
    GL.Disable(EnableCap.DepthTest);
    GL.Disable(EnableCap.CullFace);
    GL.BindBuffer(BufferTarget.ArrayBuffer, vbo);
    GL.VertexPointer(2, VertexPointerType.Float, 0, 0);
    GL.DrawArrays(BeginMode.Points, 0, DS);
    GL.BindBuffer(BufferTarget.ArrayBuffer, 0);
    GL.DisableClientState(ArrayCap.VertexArray);
    GL.DisableClientState(ArrayCap.TextureCoordArray);
    GL.Disable(EnableCap.Texture2D);

    // Finish timing before swap buffers to avoid refresh sync
    stopwatch.Stop();
    m_renderControl.SwapBuffers();

    fpsCount++;
    if (fpsCount == fpsLimit)
    {
        float ifps = 1.0f / (stopwatch.GetElapsedTime() / 1000.0f);
        string fps = string.Format(System.Globalization.CultureInfo.InvariantCulture,
            "Cuda/GL Stable Fluids ({0} x {1}): {2} fps", DIM, DIM, ifps);
        this.Text = fps;
        fpsCount = 0;
        fpsLimit = (int)Math.Max(ifps, 1.0f);
    }
}
static void Main(string[] args)
{
    const int nx = 2048;
    const int ny = 2048;

    // shifts applied to x and y data
    const int x_shift = 5;
    const int y_shift = 7;

    ShrQATest.shrQAStart(args);

    if ((nx % TILE_DIM != 0) || (ny % TILE_DIM != 0))
    {
        Console.Write("nx and ny must be multiples of TILE_DIM\n");
        ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_WAIVED);
    }

    // execution configuration parameters
    dim3 grid = new dim3(nx / TILE_DIM, ny / TILE_DIM, 1);
    dim3 threads = new dim3(TILE_DIM, TILE_DIM, 1);

    // This will pick the best possible CUDA capable device
    int devID = findCudaDevice(args);

    // Load Kernel image from resources
    string resName;
    if (IntPtr.Size == 8)
    {
        resName = "simplePitchLinearTexture_x64.ptx";
    }
    else
    {
        resName = "simplePitchLinearTexture.ptx";
    }

    string resNamespace = "simplePitchLinearTexture";
    string resource = resNamespace + "." + resName;
    Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);
    if (stream == null)
    {
        throw new ArgumentException("Kernel not found in resources.");
    }

    byte[] kernels = new byte[stream.Length];

    int bytesToRead = (int)stream.Length;
    while (bytesToRead > 0)
    {
        bytesToRead -= stream.Read(kernels, (int)stream.Position, bytesToRead);
    }

    CudaKernel PLKernel = ctx.LoadKernelPTX(kernels, "shiftPitchLinear");
    CudaKernel ArrayKernel = ctx.LoadKernelPTX(kernels, "shiftArray");

    CudaStopWatch stopwatch = new CudaStopWatch();

    // ----------------------------------
    // Host allocation and initialization
    // ----------------------------------
    float[] h_idata = new float[nx * ny];
    float[] h_odata = new float[nx * ny];
    float[] gold = new float[nx * ny];

    for (int i = 0; i < nx * ny; ++i)
    {
        h_idata[i] = (float)i;
    }

    // ------------------------
    // Device memory allocation
    // ------------------------
    // Pitch linear input data
    CudaPitchedDeviceVariable<float> d_idataPL = new CudaPitchedDeviceVariable<float>(nx, ny);

    // Array input data
    CudaArray2D d_idataArray = new CudaArray2D(CUArrayFormat.Float, nx, ny, CudaArray2DNumChannels.One);

    // Pitch linear output data
    CudaPitchedDeviceVariable<float> d_odata = new CudaPitchedDeviceVariable<float>(nx, ny);

    // ------------------------
    // copy host data to device
    // ------------------------
    // Pitch linear
    d_idataPL.CopyToDevice(h_idata);

    // Array
    d_idataArray.CopyFromHostToThis<float>(h_idata);

    // ----------------------
    // Bind texture to memory
    // ----------------------
    // Pitch linear
    CudaTextureLinearPitched2D<float> texRefPL = new CudaTextureLinearPitched2D<float>(PLKernel, "texRefPL",
        CUAddressMode.Wrap, CUFilterMode.Point, CUTexRefSetFlags.NormalizedCoordinates, CUArrayFormat.Float, d_idataPL);
    CudaTextureArray2D texRefArray = new CudaTextureArray2D(ArrayKernel, "texRefArray",
        CUAddressMode.Wrap, CUFilterMode.Point, CUTexRefSetFlags.NormalizedCoordinates, d_idataArray);

    // ---------------------
    // reference calculation
    // ---------------------
    for (int j = 0; j < ny; j++)
    {
        int jshift = (j + y_shift) % ny;
        for (int i = 0; i < nx; i++)
        {
            int ishift = (i + x_shift) % nx;
            gold[j * nx + i] = h_idata[jshift * nx + ishift];
        }
    }

    // ----------------
    // shiftPitchLinear
    // ----------------
    ctx.ClearMemory(d_odata.DevicePointer, 0, d_odata.TotalSizeInBytes);
    PLKernel.BlockDimensions = threads;
    PLKernel.GridDimensions = grid;

    stopwatch.Start();
    for (int i = 0; i < NUM_REPS; i++)
    {
        PLKernel.Run(d_odata.DevicePointer, (int)(d_odata.Pitch / sizeof(float)), nx, ny, x_shift, y_shift);
    }
    stopwatch.Stop();
    stopwatch.StopEvent.Synchronize();
    float timePL = stopwatch.GetElapsedTime();

    // check results
    d_odata.CopyToHost(h_odata);
    bool res = cutComparef(gold, h_odata);
    bool success = true;

    if (res == false)
    {
        Console.Write("*** shiftPitchLinear failed ***\n");
        success = false;
    }

    // ----------
    // shiftArray
    // ----------
    ctx.ClearMemory(d_odata.DevicePointer, 0, d_odata.TotalSizeInBytes);
    ArrayKernel.BlockDimensions = threads;
    ArrayKernel.GridDimensions = grid;

    stopwatch.Start();
    for (int i = 0; i < NUM_REPS; i++)
    {
        ArrayKernel.Run(d_odata.DevicePointer, (int)(d_odata.Pitch / sizeof(float)), nx, ny, x_shift, y_shift);
    }
    stopwatch.Stop();
    stopwatch.StopEvent.Synchronize();
    float timeArray = stopwatch.GetElapsedTime();

    // check results
    d_odata.CopyToHost(h_odata);
    res = cutComparef(gold, h_odata);

    if (res == false)
    {
        Console.Write("*** shiftArray failed ***\n");
        success = false;
    }

    float bandwidthPL = 2.0f * 1000.0f * nx * ny * sizeof(float) / (1e+9f) / (timePL / NUM_REPS);
    float bandwidthArray = 2.0f * 1000.0f * nx * ny * sizeof(float) / (1e+9f) / (timeArray / NUM_REPS);

    Console.Write("\nBandwidth (GB/s) for pitch linear: {0}; for array: {1}\n", bandwidthPL, bandwidthArray);

    float fetchRatePL = nx * ny / 1e+6f / (timePL / (1000.0f * NUM_REPS));
    float fetchRateArray = nx * ny / 1e+6f / (timeArray / (1000.0f * NUM_REPS));

    Console.Write("\nTexture fetch rate (Mpix/s) for pitch linear: {0}; for array: {1}\n\n", fetchRatePL, fetchRateArray);

    // cleanup
    texRefPL.Dispose();
    texRefArray.Dispose();
    d_idataPL.Dispose();
    d_idataArray.Dispose();
    d_odata.Dispose();
    stopwatch.Dispose();
    ctx.Dispose();

    ShrQATest.shrQAFinishExit(args, (success == true) ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED);
}
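// cutComparef is used above for result verification but is not shown in this listing.
// A minimal sketch of what it is assumed to do: an element-wise comparison against the
// reference data with a small tolerance. The epsilon value is a placeholder.
static bool cutComparef(float[] reference, float[] data, float epsilon = 1e-6f)
{
    if (reference.Length != data.Length)
        return false;

    for (int i = 0; i < reference.Length; i++)
    {
        if (Math.Abs(reference[i] - data[i]) > epsilon)
            return false; // mismatch found
    }
    return true;
}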
public void computeSegmentationFromTrimap()
{
    CudaStopWatch stopwatch = new CudaStopWatch();
    stopwatch.Start();

    iteration = 0;
    current_alpha = 0;

    // Solve Grabcut on lower resolution first. Reduces total computation time.
    createSmallTrimap();

    d_alpha[0].AsyncCopyToDevice(d_small_trimap[small_trimap_idx], new CUstream());

    for (int i = 0; i < 2; ++i)
    {
        grabCutGMM.GMMInitialize(gmms, d_gmm, d_scratch_mem, gmm_pitch, d_small_image, d_alpha[current_alpha], small_size.width, small_size.height);
        grabCutGMM.GMMUpdate(gmms, d_gmm, d_scratch_mem, gmm_pitch, d_small_image, d_alpha[current_alpha], small_size.width, small_size.height);
        grabCutGMM.EdgeCues(edge_strength, d_small_image, d_left_transposed, d_right_transposed, d_top, d_bottom, d_topleft, d_topright, d_bottomleft, d_bottomright, small_size.width, small_size.height, d_scratch_mem);
        grabCutGMM.DataTerm(d_terminals, gmms, d_gmm, gmm_pitch, d_small_image, d_small_trimap[small_trimap_idx], small_size.width, small_size.height);
        graphcut8Small.GraphCut(d_terminals, d_left_transposed, d_right_transposed, d_top, d_topleft, d_topright, d_bottom, d_bottomleft, d_bottomright, d_alpha[1 - current_alpha]);

        current_alpha = 1 - current_alpha;
    }

    grabCutGMM.UpsampleAlpha(d_alpha[1 - current_alpha], d_alpha[current_alpha], size.width, size.height, small_size.width, small_size.height);
    current_alpha = 1 - current_alpha;

    grabCutGMM.GMMInitialize(gmms, d_gmm, d_scratch_mem, gmm_pitch, d_image, d_alpha[current_alpha], size.width, size.height);
    grabCutGMM.GMMUpdate(gmms, d_gmm, d_scratch_mem, gmm_pitch, d_image, d_alpha[current_alpha], size.width, size.height);

    while (true)
    {
        grabCutGMM.EdgeCues(edge_strength, d_image, d_left_transposed, d_right_transposed, d_top, d_bottom, d_topleft, d_topright, d_bottomleft, d_bottomright, size.width, size.height, d_scratch_mem);
        grabCutGMM.DataTerm(d_terminals, gmms, d_gmm, gmm_pitch, d_image, d_trimap, size.width, size.height);

        current_alpha = 1 ^ current_alpha;
        graphcut8.GraphCut(d_terminals, d_left_transposed, d_right_transposed, d_top, d_topleft, d_topright, d_bottom, d_bottomleft, d_bottomright, d_alpha[current_alpha]);

        if (iteration > 0)
        {
            bool changed = grabCutGMM.SegmentationChanged(d_scratch_mem, d_alpha[1 - current_alpha], d_alpha[current_alpha], size.width, size.height);

            // Solution has converged
            if (!changed)
            {
                break;
            }
        }

        if (iteration > MAX_ITERATIONS)
        {
            // Does not converge, fall back to rect selection
            System.Windows.Forms.MessageBox.Show("Warning: Color models did not converge after " + MAX_ITERATIONS + " iterations.");
            break;
        }

        grabCutGMM.GMMInitialize(gmms, d_gmm, d_scratch_mem, gmm_pitch, d_image, d_alpha[current_alpha], size.width, size.height);
        grabCutGMM.GMMUpdate(gmms, d_gmm, d_scratch_mem, gmm_pitch, d_image, d_alpha[current_alpha], size.width, size.height);
        iteration++;
    }

    stopwatch.Stop();
    stopwatch.StopEvent.Synchronize();
    runtime = stopwatch.GetElapsedTime();
}
private void Btn_ProcessPEF_Click(object sender, EventArgs e)
{
    if (pef == null)
    {
        return;
    }

    denoiseAndDemoisaic.LoadNetwork("epoch_" + learningRate.ToString(CultureInfo.InvariantCulture) + "_" + noiseLevelsFolders[cmb_IsoValue.SelectedIndex] + "_1999.cnn");

    NPPImage_16uC1 rawTemp = new NPPImage_16uC1(pef.RawWidth, pef.RawHeight);
    rawTemp.CopyToDevice(pef.RawImage);
    rawTemp.Convert(imageBayer);

    float whiteLevelAll = pef.WhiteLevel.Value;
    float3 whitePoint = new float3(whiteLevelAll, whiteLevelAll, whiteLevelAll);
    float3 blackPoint = new float3(pef.BlackPoint.Value[0], pef.BlackPoint.Value[1], pef.BlackPoint.Value[3]);
    whitePoint -= blackPoint;

    float scale = pef.Scaling.Value;
    float3 scaling = new float3(pef.WhitePoint.Value[0] / scale, pef.WhitePoint.Value[1] / scale, pef.WhitePoint.Value[3] / scale);

    inputImage32f.Set(new float[] { 0, 0, 0 });
    deBayerGreenKernel.RunSafe(imageBayer, inputImage32f, blackPoint, scaling);
    deBayerRedBlueKernel.RunSafe(imageBayer, inputImage32f, blackPoint, scaling);

    inputImage32f.Div(new float[] { whitePoint.x * scaling.x, whitePoint.y * scaling.y, whitePoint.z * scaling.z });
    highlightRecoveryKernel.RunSafe(inputImage32f, new float3(scaling.x, scaling.y, scaling.z), 1);

    inputImage32f.Sub(new float[] { 0.5f, 0.5f, 0.5f }, noiseImage32f);

    Console.WriteLine("Start denoising...");
    CudaStopWatch csw = new CudaStopWatch();
    csw.Start();
    denoiseAndDemoisaic.RunImage(noiseImage32f, resultImage32f);
    csw.Stop();
    Console.WriteLine("Needed time: " + csw.GetElapsedTime() + " [msec]");
    csw.Dispose();

    resultImage32f.Add(new float[] { 0.5f, 0.5f, 0.5f });

    ColorManagment cm = new ColorManagment();
    float3 wp = 1.0f / scaling;
    double3 wb = new double3(wp.x, wp.y, wp.z);
    double2 neutralXY = cm.NeutralToXY(wb);
    cm.SetWhiteXY(neutralXY);
    ColorMatrix camToXYZ2 = cm.CameraToPCS;

    ColorMatrix d50Tod65 = new ColorMatrix(new double[] { 0.9555766, -0.0230393, 0.0631636, -0.0282895, 1.0099416, 0.0210077, 0.0122982, -0.0204830, 1.3299098 });
    ColorMatrix d65TosRGB = new ColorMatrix(new double[] { 3.2406, -1.5372, -0.4986, -0.9689, 1.8758, 0.0415, 0.0557, -0.2040, 1.0570 });
    ColorMatrix final = d65TosRGB * d50Tod65 * camToXYZ2;

    float[] matData = new float[9];
    for (int i = 0; i < 3; i++)
    {
        for (int j = 0; j < 3; j++)
        {
            matData[j + i * 3] = (float)final[i, j];
        }
    }

    camToXYZKernel.RunSafe(inputImage32f, matData);
    camToXYZKernel.RunSafe(resultImage32f, matData);

    // This LUT maps well to most out-of-camera JPEGs, but not always; found somewhere on the internet, possibly from darktable.
    float[] x = new float[] { 0, 0.004754f, 0.009529f, 0.023713f, 0.031866f, 0.046734f, 0.059989f, 0.088415f, 0.13661f, 0.17448f, 0.205192f, 0.228896f, 0.286411f, 0.355314f, 0.440014f, 0.567096f, 0.620597f, 0.760355f, 0.875139f, 1 };
    float[] y = new float[] { 0, 0.002208f, 0.004214f, 0.013508f, 0.020352f, 0.034063f, 0.052413f, 0.09603f, 0.190629f, 0.256484f, 0.30743f, 0.348447f, 0.42868f, 0.513527f, 0.607651f, 0.732791f, 0.775968f, 0.881828f, 0.960682f, 1 };

    CudaDeviceVariable<float> d_x = x;
    CudaDeviceVariable<float> d_y = y;

    inputImage32f.LUTCubic(new CudaDeviceVariable<float>[] { d_y, d_y, d_y }, new CudaDeviceVariable<float>[] { d_x, d_x, d_x });
    resultImage32f.LUTCubic(new CudaDeviceVariable<float>[] { d_y, d_y, d_y }, new CudaDeviceVariable<float>[] { d_x, d_x, d_x });

    convertRGBTosRGBKernel.RunSafe(inputImage32f);
    convertRGBTosRGBKernel.RunSafe(resultImage32f);

    inputImage32f.Convert(noisyImage8u, NppRoundMode.Near);
    resultImage32f.Convert(resultImage8u, NppRoundMode.Near);

    noisyImage8u.SetRoi(0, 0, bmpNoisy.Width - 4, bmpNoisy.Height - 4);
    noisyImage8u.CopyToHostRoi(bmpNoisy, new NppiRect(2, 2, bmpNoisy.Width - 4, bmpNoisy.Height - 4));

    resultImage8u.SetRoi(0, 0, bmpResult.Width - 16, bmpResult.Height - 16);
    resultImage8u.CopyToHostRoi(bmpResult, new NppiRect(8, 8, bmpResult.Width - 16, bmpResult.Height - 16));

    pictureBox2.Image = bmpNoisy;
    pictureBox3.Image = bmpResult;

    rawTemp.Dispose();
    d_y.Dispose();
    d_x.Dispose();
}
private void btn_Process_Click(object sender, EventArgs e)
{
    if (bmpInput == null)
    {
        return;
    }

    denoiseAndDemoisaic.LoadNetwork("epoch_" + learningRate.ToString(CultureInfo.InvariantCulture) + "_" + noiseLevelsFolders[cmb_IsoValue.SelectedIndex] + "_1999.cnn");

    if (bmpInput.PixelFormat == PixelFormat.Format32bppArgb)
    {
        inputImage8uC4.CopyToDeviceRoi(bmpInput, new NppiRect(0, 0, bmpInput.Width, bmpInput.Height));
        // Convert C4 to C3 and BGR to RGB
        inputImage8uC4.Copy(inputImage8uC1, 0);
        inputImage8uC1.Copy(inputImage8uC3, 2);
        inputImage8uC4.Copy(inputImage8uC1, 1);
        inputImage8uC1.Copy(inputImage8uC3, 1);
        inputImage8uC4.Copy(inputImage8uC1, 2);
        inputImage8uC1.Copy(inputImage8uC3, 0);
    }
    else
    {
        inputImage8uC3.CopyToDeviceRoi(bmpInput, new NppiRect(0, 0, bmpInput.Width, bmpInput.Height));
        inputImage8uC3.ColorTwist(twist);
    }

    inputImage8uC3.Convert(inputImage32f);
    inputImage32f.Div(new float[] { 255, 255, 255 });

    NppiRect oldRoi = new NppiRect(0, 0, inputImage32f.WidthRoi, inputImage32f.HeightRoi);
    IEnumerable<Tiler.RoiInputOutput> rois = Tiler.GetROIs(oldRoi, TileSize, 0);

    foreach (var roi in rois)
    {
        inputImage32f.SetRoi(roi.inputROI);
        tile.ResetRoi();
        inputImage32f.Copy(tile);
        tile.SetRoi(roi.outputROI);
        imageBayer.SetRoi(roi.positionInFinalImage);
        createBayerKernel.RunSafe(CuRandStates, tile, imageBayer, noiseLevels[cmb_IsoValue.SelectedIndex], 0);
    }

    imageBayer.SetRoi(oldRoi);
    inputImage32f.SetRoi(oldRoi);

    deBayerGreenKernel.RunSafe(imageBayer, inputImage32f, new float3(), new float3(1, 1, 1));
    deBayerRedBlueKernel.RunSafe(imageBayer, inputImage32f, new float3(), new float3(1, 1, 1));

    inputImage32f.Mul(new float[] { 255, 255, 255 }, noiseImage32f);
    noiseImage32f.Convert(noisyImage8u, NppRoundMode.Near);
    noisyImage8u.ColorTwist(twist);
    noisyImage8u.CopyToHostRoi(bmpNoisy, new NppiRect(0, 0, bmpNoisy.Width, bmpNoisy.Height));

    inputImage32f.Sub(new float[] { 0.5f, 0.5f, 0.5f });

    CudaStopWatch csw = new CudaStopWatch();
    csw.Start();
    denoiseAndDemoisaic.RunImage(inputImage32f, resultImage32f);
    csw.Stop();
    Console.WriteLine("Needed time: " + csw.GetElapsedTime() + " [msec]");
    csw.Dispose();

    resultImage32f.Add(new float[] { 0.5f, 0.5f, 0.5f });
    resultImage32f.Mul(new float[] { 255, 255, 255 });
    resultImage32f.Convert(resultImage8u, NppRoundMode.Near);
    resultImage8u.ColorTwist(twist);
    resultImage8u.SetRoi(0, 0, bmpResult.Width - 16, bmpResult.Height - 16);
    resultImage8u.CopyToHostRoi(bmpResult, new NppiRect(8, 8, bmpResult.Width - 16, bmpResult.Height - 16));

    pictureBox2.Image = bmpNoisy;
    pictureBox3.Image = bmpResult;
}
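// Tiler.RoiInputOutput is consumed by the tiling loop above but not shown in this
// listing. A sketch of the shape implied by the fields accessed there (the field names
// come from that loop; everything else, including it being a nested class of Tiler,
// is an assumption).
public class RoiInputOutput
{
    public NppiRect inputROI;             // region read from the full-size source image
    public NppiRect outputROI;            // valid region inside the processed tile
    public NppiRect positionInFinalImage; // where the tile's output lands in the final image
}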
public void MinimizeCUBLAS(int tileCountX, int tileCountY)
{
    int shiftCount; // = shifts.Count;
    shiftCount = GetShiftCount();

    concatenateShifts.RunSafe(shifts_d, shiftPitches, AllShifts_d, shiftCount, tileCountX, tileCountY);
    shiftsMeasured.CopyToDevice(AllShifts_d);

    CudaStopWatch sw = new CudaStopWatch();
    sw.Start();

    int imageCount = frameCount;
    int tileCount = tileCountX * tileCountY;
    int n1 = imageCount - 1;
    int m = shiftCount;

    status.Memset(0);
    shiftMatrices.Memset(0);

    float[] shiftMatrix = CreateShiftMatrix();
    shiftMatrices.CopyToDevice(shiftMatrix, 0, 0, shiftMatrix.Length * sizeof(float));

    copyShiftMatrixKernel.RunSafe(shiftMatrices, tileCount, imageCount, shiftCount);
    shiftSafeMatrices.CopyToDevice(shiftMatrices);

    for (int i = 0; i < 10; i++)
    {
        blas.GemmBatched(Operation.Transpose, Operation.NonTranspose, n1, n1, m, one, shiftMatrixArray, m, shiftMatrixArray, m, zero, matrixSquareArray, n1, tileCount);
        //float[] mSqr = matricesSquared;

        if (n1 <= 32)
        {
            // MatinvBatchedS can only invert up to 32x32 matrices
            blas.MatinvBatchedS(n1, matrixSquareArray, n1, matrixInvertedArray, n1, infoInverse, tileCount);
        }
        else
        {
            blas.GetrfBatchedS(n1, matrixSquareArray, n1, pivotArray, infoInverse, tileCount);
            blas.GetriBatchedS(n1, matrixSquareArray, n1, pivotArray, matrixInvertedArray, n1, infoInverse, tileCount);
        }
        //int[] info = infoInverse;
        //mSqr = matricesInverted;

        blas.GemmBatched(Operation.NonTranspose, Operation.Transpose, n1, m, n1, one, matrixInvertedArray, n1, shiftMatrixArray, m, zero, solvedMatrixArray, n1, tileCount);
        blas.GemmBatched(Operation.NonTranspose, Operation.Transpose, n1, 2, m, one, solvedMatrixArray, n1, shiftMeasuredArray, 2, zero, shiftOneToOneArray, n1, tileCount);
        blas.GemmBatched(Operation.NonTranspose, Operation.NonTranspose, m, 2, n1, one, shiftMatrixArray, m, shiftOneToOneArray, n1, zero, shiftOptimArray, m, tileCount);

        checkForOutliers.RunSafe(shiftsMeasured, shiftsOptim, shiftMatrices, status, infoInverse, tileCount, imageCount, shiftCount);
        status.Sum(statusSum, buffer, 0);

        int[] stats = status;
        for (int j = 0; j < tileCount; j++)
        {
            if (stats[j] >= 0)
            {
                Console.Write(j + ": " + stats[j] + "; ");
            }
        }
        Console.WriteLine();

        int stat = statusSum;
        if (stat == -tileCount)
        {
            break;
        }
        //float2[] AllShifts_h = shiftsMeasured;
    }

    blas.GemmBatched(Operation.NonTranspose, Operation.NonTranspose, m, 2, n1, one, shiftMatrixSafeArray, m, shiftOneToOneArray, n1, zero, shiftMeasuredArray, m, tileCount);

    AllShifts_d.Memset(0);
    transposeShifts.RunSafe(AllShifts_d, shiftsMeasured, shiftsOneToOne, shiftsOneToOne_d, tileCount, imageCount, shiftCount);
    //shiftsMeasured.CopyToDevice(AllShifts_d);
    //float2[] AllShiftsFinal_h = shiftsMeasured;

    sw.Stop();
    Console.WriteLine("Time for optimisation: " + sw.GetElapsedTime() + " msec.");

    separateShifts.RunSafe(AllShifts_d, shifts_d, shiftPitches, shiftCount, tileCountX, tileCountY);
}