private void initGLAndCuda() { //Create render target control m_renderControl = new OpenTK.GLControl(GraphicsMode.Default, 1, 0, GraphicsContextFlags.Default); m_renderControl.Dock = DockStyle.Fill; m_renderControl.BackColor = Color.White; m_renderControl.BorderStyle = BorderStyle.FixedSingle; m_renderControl.KeyDown += new KeyEventHandler(m_renderControl_KeyDown); m_renderControl.MouseMove += new MouseEventHandler(m_renderControl_MouseMove); m_renderControl.MouseDown += new MouseEventHandler(m_renderControl_MouseDown); m_renderControl.SizeChanged += new EventHandler(m_renderControl_SizeChanged); panel1.Controls.Add(m_renderControl); Console.WriteLine(" OpenGL device is Available"); int deviceID = CudaContext.GetMaxGflopsDeviceId(); ctx = CudaContext.CreateOpenGLContext(deviceID, CUCtxFlags.BlockingSync); string console = string.Format("CUDA device [{0}] has {1} Multi-Processors", ctx.GetDeviceName(), ctx.GetDeviceInfo().MultiProcessorCount); Console.WriteLine(console); CUmodule module = ctx.LoadModulePTX("kernel.ptx"); addForces_k = new CudaKernel("addForces_k", module, ctx); advectVelocity_k = new CudaKernel("advectVelocity_k", module, ctx); diffuseProject_k = new CudaKernel("diffuseProject_k", module, ctx); updateVelocity_k = new CudaKernel("updateVelocity_k", module, ctx); advectParticles_k = new CudaKernel("advectParticles_OGL", module, ctx); hvfield = new cData[DS]; dvfield = new CudaPitchedDeviceVariable<cData>(DIM, DIM); tPitch = dvfield.Pitch; dvfield.CopyToDevice(hvfield); vxfield = new CudaDeviceVariable<cData>(DS); vyfield = new CudaDeviceVariable<cData>(DS); // Create particle array particles = new cData[DS]; initParticles(particles, DIM, DIM); // TODO: update kernels to use the new unpadded memory layout for perf // rather than the old FFTW-compatible layout planr2c = new CudaFFTPlan2D(DIM, DIM, cufftType.R2C, Compatibility.FFTWPadding); planc2r = new CudaFFTPlan2D(DIM, DIM, cufftType.C2R, Compatibility.FFTWPadding); GL.GenBuffers(1, out vbo); GL.BindBuffer(BufferTarget.ArrayBuffer, vbo); GL.BufferData<cData>(BufferTarget.ArrayBuffer, new IntPtr(cData.SizeOf * DS), particles, BufferUsageHint.DynamicDraw); int bsize; GL.GetBufferParameter(BufferTarget.ArrayBuffer, BufferParameterName.BufferSize, out bsize); if (bsize != DS * cData.SizeOf) throw new Exception("Sizes don't match."); GL.BindBuffer(BufferTarget.ArrayBuffer, 0); cuda_vbo_resource = new CudaGraphicsInteropResourceCollection(); cuda_vbo_resource.Add(new CudaOpenGLBufferInteropResource(vbo, CUGraphicsRegisterFlags.None)); texref = new CudaTextureArray2D(advectVelocity_k, "texref", CUAddressMode.Wrap, CUFilterMode.Linear, 0, CUArrayFormat.Float, DIM, DIM, CudaArray2DNumChannels.Two); stopwatch = new CudaStopWatch(CUEventFlags.Default); reshape(); isInit = true; display(); }
public void ApplyMatte(int mode, CudaPitchedDeviceVariable <uchar4> result, CudaPitchedDeviceVariable <uchar4> image, CudaPitchedDeviceVariable <byte> matte, int width, int height) { dim3 block = new dim3(32, 8, 1); dim3 grid = new dim3((width + 31) / 32, (height + 31) / 32, 1); switch (mode) { case 0: ApplyMatteKernelMode0.BlockDimensions = block; ApplyMatteKernelMode0.GridDimensions = grid; ApplyMatteKernelMode0.Run(result.DevicePointer, (int)result.Pitch / 4, image.DevicePointer, (int)image.Pitch / 4, matte.DevicePointer, (int)matte.Pitch, width, height); //ApplyMatteKernel<0><<<grid, block>>>(result, result_pitch/4, image, image_pitch/4, matte, matte_pitch, width, height); break; case 1: ApplyMatteKernelMode1.BlockDimensions = block; ApplyMatteKernelMode1.GridDimensions = grid; ApplyMatteKernelMode1.Run(result.DevicePointer, (int)result.Pitch / 4, image.DevicePointer, (int)image.Pitch / 4, matte.DevicePointer, (int)matte.Pitch, width, height); //ApplyMatteKernel<1><<<grid, block>>>(result, result_pitch/4, image, image_pitch/4, matte, matte_pitch, width, height); break; case 2: ApplyMatteKernelMode2.BlockDimensions = block; ApplyMatteKernelMode2.GridDimensions = grid; ApplyMatteKernelMode2.Run(result.DevicePointer, (int)result.Pitch / 4, image.DevicePointer, (int)image.Pitch / 4, matte.DevicePointer, (int)matte.Pitch, width, height); //ApplyMatteKernel<2><<<grid, block>>>(result, result_pitch/4, image, image_pitch/4, matte, matte_pitch, width, height); break; } }
/// <summary> /// Binds a linear address range to the texture reference. <para/> /// Any previous address or CUDA array state associated with the texture reference is superseded by this function. <para/> /// Any memory previously bound to the texture reference is unbound.<para/> /// Size my differ to the previous bound variable, but type must be the same. /// </summary> /// <param name="deviceVar">New device variable to bind this texture reference to.</param> public void Reset(CudaPitchedDeviceVariable <T> deviceVar) { if (disposed) { throw new ObjectDisposedException(this.ToString()); } _height = deviceVar.Height; _width = deviceVar.Width; _dataSize = deviceVar.TotalSizeInBytes; _devVar = deviceVar; CUDAArrayDescriptor arrayDescr = new CUDAArrayDescriptor(); arrayDescr.Format = _format; arrayDescr.Height = _height; arrayDescr.NumChannels = (uint)_numChannels; arrayDescr.Width = _width; res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetAddress2D_v2(_texref, ref arrayDescr, _devVar.DevicePointer, _devVar.Pitch); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetAddress2D", res)); if (res != CUResult.Success) { throw new CudaException(res); } }
private void addForces(CudaPitchedDeviceVariable <float2> v, int dx, int dy, int spx, int spy, float fx, float fy, int r, SizeT tPitch) { dim3 tids = new dim3((uint)(2 * r + 1), (uint)(2 * r + 1), 1); addForces_k.GridDimensions = new dim3(1); addForces_k.BlockDimensions = tids; addForces_k.Run(v.DevicePointer, dx, dy, spx, spy, fx, fy, r, tPitch); }
/// <summary> /// Graphcut of a flow network (32bit floating point edge capacities). The /// function computes the minimal cut (graphcut) of a 2D regular 4-connected /// graph. <para/> /// The inputs are the capacities of the horizontal (in transposed form), /// vertical and terminal (source and sink) edges. The capacities to source and /// sink /// are stored as capacity differences in the terminals array /// ( terminals(x) = source(x) - sink(x) ). The implementation assumes that the /// edge capacities /// for boundary edges that would connect to nodes outside the specified domain /// are set to 0 (for example left(0,*) == 0). If this is not fulfilled the /// computed labeling may be wrong!<para/> /// The computed binary labeling is encoded as unsigned 8bit values (0 and >0). /// </summary> /// <param name="Terminals">Pointer to differences of terminal edge capacities</param> /// <param name="LeftTransposed">Pointer to transposed left edge capacities</param> /// <param name="RightTransposed">Pointer to transposed right edge capacities</param> /// <param name="Top">Pointer to top edge capacities (top(*,0) must be 0)</param> /// <param name="Bottom">Pointer to bottom edge capacities (bottom(*,height-1)</param> /// <param name="Label">Pointer to destination label image </param> /// <returns></returns> public void GraphCut(CudaPitchedDeviceVariable <float> Terminals, CudaPitchedDeviceVariable <float> LeftTransposed, CudaPitchedDeviceVariable <float> RightTransposed, CudaPitchedDeviceVariable <float> Top, CudaPitchedDeviceVariable <float> Bottom, CudaPitchedDeviceVariable <byte> Label) { status = NPPNativeMethods.NPPi.ImageLabeling.nppiGraphcut_32f8u(Terminals.DevicePointer, LeftTransposed.DevicePointer, RightTransposed.DevicePointer, Top.DevicePointer, Bottom.DevicePointer, Terminals.Pitch, LeftTransposed.Pitch, _size, Label.DevicePointer, Label.Pitch, _state); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiGraphcut_32f8u", status)); NPPException.CheckNppStatus(status, this); }
private void advectParticles(CudaDeviceVariable <vertex> p, CudaPitchedDeviceVariable <float2> v, int dx, int dy, float dt, SizeT tPitch) { dim3 grid = new dim3((uint)((dx / TILEX) + (!(dx % TILEX != 0) ? 0 : 1)), (uint)((dy / TILEY) + (!(dy % TILEY != 0) ? 0 : 1)), 1); dim3 tids = new dim3(TIDSX, TIDSY, 1); advectParticles_k.GridDimensions = grid; advectParticles_k.BlockDimensions = tids; advectParticles_k.Run(p.DevicePointer, v.DevicePointer, dx, dy, dt, TILEY / TIDSY, tPitch); }
public void DownscaleTrimap(CudaPitchedDeviceVariable <byte> small_image, int small_width, int small_height, CudaPitchedDeviceVariable <byte> image, int width, int height) { dim3 grid = new dim3((width + 63) / 64, (height + 63) / 64, 1); dim3 block = new dim3(32, 8, 1); downscaleKernel2.BlockDimensions = block; downscaleKernel2.GridDimensions = grid; downscaleKernel2.Run(small_image.DevicePointer, (int)small_image.Pitch, small_width, small_height, image.DevicePointer, (int)image.Pitch, width, height); //downscaleKernel<<<grid, block>>>(small_image, small_pitch, small_width, small_height, image, pitch, width, height, maxfilter_functor()); }
public void DataTerm(CudaPitchedDeviceVariable <int> terminals, int gmmN, CudaDeviceVariable <float> gmm, int gmm_pitch, CudaPitchedDeviceVariable <uchar4> image, CudaPitchedDeviceVariable <byte> trimap, int width, int height) { dim3 block = new dim3(32, 8, 1); dim3 grid = new dim3((int)((width + block.x - 1) / block.x), (int)((height + block.y - 1) / block.y), 1); DataTermKernel.BlockDimensions = block; DataTermKernel.GridDimensions = grid; DataTermKernel.Run(terminals.DevicePointer, (int)terminals.Pitch / 4, gmmN, gmm.DevicePointer, (int)gmm_pitch / 4, image.DevicePointer, (int)image.Pitch / 4, trimap.DevicePointer, (int)trimap.Pitch, width, height); //DataTermKernel<<<grid, block>>>(terminals, terminal_pitch/4, gmmN, gmm, gmm_pitch/4, image, image_pitch/4, trimap, trimap_pitch, width, height); }
private void updateVelocity(CudaPitchedDeviceVariable <float2> v, CudaDeviceVariable <float2> vx, CudaDeviceVariable <float2> vy, int dx, int pdx, int dy, SizeT tPitch) { dim3 grid = new dim3((uint)((dx / TILEX) + (!(dx % TILEX != 0) ? 0 : 1)), (uint)((dy / TILEY) + (!(dy % TILEY != 0) ? 0 : 1)), 1); dim3 tids = new dim3(TIDSX, TIDSY, 1); updateVelocity_k.GridDimensions = grid; updateVelocity_k.BlockDimensions = tids; updateVelocity_k.Run(v.DevicePointer, vx.DevicePointer, vy.DevicePointer, dx, pdx, dy, TILEY / TIDSY, tPitch); }
/// <summary> /// 1 channel connected region contours image to generate contours geometry info list in host memory. /// <para/> /// Note that ALL input and output data for the function MUST be in device memory except where noted otherwise. /// Also nFirstContourID and nLastContourID allow only a portion of the contour geometry lists in the image to be output. /// <para/> /// Note that the geometry list for each contour will begin at pContoursGeometryListsHost[pContoursPixelStartingOffsetHost[nContourID] /// sizeof(NppiContourPixelGeometryInfo). /// <para/> /// Note that due to the nature of some imput images contour ID 0 can sometimes contain ALL contours in the image which /// can significantly increase the time taken to output the geometry lists. In these cases setting nFirstContourGeometryListID to >= 1 /// significantly speed up geometry list output performance and all individual contours will still be output. /// </summary> /// <param name="pMarkerLabelsInfoListDev">pointer to device memory buffer which contains the output returned by the corresponding nppiCompressedMarkerLabelsUFInfo_32u_C1R_Ctx call.</param> /// <param name="pMarkerLabelsInfoListHost">pointer to host memory buffer which will be output by this function with additional information added.</param> /// <param name="pContoursDirectionImageDev">Source-Image Pointer to output image in device memory containing per contour pixel direction info around each uniquely labeled connected pixel region returned by corresponding nppiCompressedMarkerLabelsUFInfo call. </param> /// <param name="pContoursPixelGeometryListsHost">pointer to host memory buffer allocated to be at least as big as size returned by corresponding nppiCompressedMarkerLabelsUFGetGeometryListsSize call. </param> /// <param name="pContoursPixelCountsListHost">host memory pointer to array of nMaxMarkerLabelID uintegers returned by previous call to nppiCompressedMarkerLabelsUFContoursPixelGeometryLists_C1R_Ctx. </param> /// <param name="pContoursPixelsFoundListHost">host memory pointer to array of nMaxMarkerLabelID uintegers returned by this call representing the number of contour pixels found during geometry list generation. </param> /// <param name="pContoursPixelsStartingOffsetHost">host memory pointer to array of uintegers returned by this call representing the starting offset index of each contour found during geometry list generation. </param> /// <param name="nMaxMarkerLabelID">the value of the maximum marker label ID returned by corresponding compress marker labels UF call. </param> /// <param name="nFirstContourGeometryListID">the ID of the first contour geometry list to output. </param> /// <param name="nLastContourGeometryListID">the ID of the last contour geometry list to output. </param> /// <param name="nppStreamCtx">NPP stream context.</param> public void CompressedMarkerLabelsUFContoursGenerateGeometryLists(CudaDeviceVariable <NppiCompressedMarkerLabelsInfo> pMarkerLabelsInfoListDev, NppiCompressedMarkerLabelsInfo[] pMarkerLabelsInfoListHost, CudaPitchedDeviceVariable <NppiContourPixelDirectionInfo> pContoursDirectionImageDev, NppiContourPixelGeometryInfo[] pContoursPixelGeometryListsHost, uint[] pContoursPixelCountsListHost, uint[] pContoursPixelsFoundListHost, uint[] pContoursPixelsStartingOffsetHost, uint nMaxMarkerLabelID, uint nFirstContourGeometryListID, uint nLastContourGeometryListID, NppStreamContext nppStreamCtx) { status = NPPNativeMethods_Ctx.NPPi.LabelMarkers.nppiCompressedMarkerLabelsUFContoursGenerateGeometryLists_C1R_Ctx(pMarkerLabelsInfoListDev.DevicePointer, pMarkerLabelsInfoListHost, pContoursDirectionImageDev.DevicePointer, pContoursDirectionImageDev.Pitch, pContoursPixelGeometryListsHost, pContoursPixelCountsListHost, pContoursPixelsFoundListHost, pContoursPixelsStartingOffsetHost, nMaxMarkerLabelID, nFirstContourGeometryListID, nLastContourGeometryListID, _sizeRoi, nppStreamCtx); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiCompressedMarkerLabelsUFContoursGenerateGeometryLists_C1R_Ctx", status)); NPPException.CheckNppStatus(status, this); }
public void convertRGBToRGBA(CudaPitchedDeviceVariable <uchar4> i4, CudaPitchedDeviceVariable <uchar3> i3, int width, int height) { dim3 block = new dim3(32, 8, 1); dim3 grid = new dim3((width + 31) / 32, (height + 31) / 32, 1); convertRGBToRGBAKernel.BlockDimensions = block; convertRGBToRGBAKernel.GridDimensions = grid; convertRGBToRGBAKernel.Run(i4.DevicePointer, (int)i4.Pitch, i3.DevicePointer, (int)i3.Pitch, width, height); //convertRGBToRGBAKernel<<<grid, block>>>(i4, i4_pitch, i3, i3_pitch, width, height); }
public GrabCut(CudaPitchedDeviceVariable <uchar4> image, CudaPitchedDeviceVariable <byte> trimap, int width, int height) { d_trimap = trimap; //The first one will also init the CUDA context! grabCutUtils = new GrabCutUtils(); grabCutGMM = new GrabCutGMM(); size.width = width; size.height = height; graphcut8 = new GraphCut8(size); gmms = 2 * COLOR_CLUSTER; edge_strength = EDGE_STRENGTH; blocks = ((width + 31) / 32) * ((height + 31) / 32); gmm_pitch = 11 * sizeof(float); //d_image = new CudaPitchedDeviceVariable<uchar4>(size.width, size.height); //d_image.CopyToDevice(image); d_image = image; // Doublebuffered alpha d_alpha = new CudaPitchedDeviceVariable <byte> [2]; d_alpha[0] = new CudaPitchedDeviceVariable <byte>(size.width, size.height, 4); d_alpha[1] = new CudaPitchedDeviceVariable <byte>(size.width, size.height, 4); // Graph d_terminals = new CudaPitchedDeviceVariable <int>(size.width, size.height); d_top = new CudaPitchedDeviceVariable <int>(size.width, size.height); d_topleft = new CudaPitchedDeviceVariable <int>(size.width, size.height); d_topright = new CudaPitchedDeviceVariable <int>(size.width, size.height); d_bottom = new CudaPitchedDeviceVariable <int>(size.width, size.height); d_bottomleft = new CudaPitchedDeviceVariable <int>(size.width, size.height); d_bottomright = new CudaPitchedDeviceVariable <int>(size.width, size.height); d_left_transposed = new CudaPitchedDeviceVariable <int>(size.height, size.width); d_right_transposed = new CudaPitchedDeviceVariable <int>(size.height, size.width); //int scratch_gc_size = 0; //nppiGraphcut8GetSize(size, &scratch_gc_size); int scratch_gmm_size = (int)(blocks * gmm_pitch * gmms + blocks * 4); d_scratch_mem = new CudaDeviceVariable <byte>(scratch_gmm_size); //CUDA_SAFE_CALL( cudaMalloc(&d_scratch_mem, MAX(scratch_gmm_size, scratch_gc_size)) ); //NPP_CHECK_NPP(nppiGraphcutInitAlloc(size, &pState, d_scratch_mem) ); d_gmm = new CudaDeviceVariable <float>(gmm_pitch * gmms); //CUDA_SAFE_CALL( cudaMalloc(&d_gmm, gmm_pitch * gmms) ); // Estimate color models on lower res input image first createSmallImage(Math.Max(width / 4, height / 4)); }
public void TrimapFromRect(CudaPitchedDeviceVariable <byte> alpha, NppiRect rect, int width, int height) { dim3 block = new dim3(32, 8, 1); dim3 grid = new dim3((int)((width + (block.x * 4) - 1) / (block.x * 4)), (height + 31) / 32, 1); //rect.y = height - 1 - (rect.y + rect.height - 1) ; // Flip horizontal (FreeImage inverts y axis) TrimapFromRectKernel.BlockDimensions = block; TrimapFromRectKernel.GridDimensions = grid; TrimapFromRectKernel.Run(alpha.DevicePointer, (int)alpha.Pitch, rect, width, height); //TrimapFromRectKernel<<<grid, block>>>(alpha, alpha_pitch, rect, width, height ); }
void advectVelocity(CudaPitchedDeviceVariable <cData> v, CudaDeviceVariable <cData> vx, CudaDeviceVariable <cData> vy, int dx, int pdx, int dy, float dt, SizeT tPitch) { dim3 grid = new dim3((uint)((dx / TILEX) + (!(dx % TILEX != 0) ? 0 : 1)), (uint)((dy / TILEY) + (!(dy % TILEY != 0) ? 0 : 1)), 1); dim3 tids = new dim3(TIDSX, TIDSY, 1); updateTexture(v, DIM * float2.SizeOf, DIM, tPitch); advectVelocity_k.GridDimensions = grid; advectVelocity_k.BlockDimensions = tids; advectVelocity_k.Run(v.DevicePointer, vx.DevicePointer, vy.DevicePointer, dx, pdx, dy, dt, TILEY / TIDSY); }
void advectParticles(uint vbo, CudaPitchedDeviceVariable <cData> v, int dx, int dy, float dt, SizeT tPitch) { dim3 grid = new dim3((uint)((dx / TILEX) + (!(dx % TILEX != 0) ? 0 : 1)), (uint)((dy / TILEY) + (!(dy % TILEY != 0) ? 0 : 1)), 1); dim3 tids = new dim3(TIDSX, TIDSY, 1); cuda_vbo_resource.MapAllResources(); CUdeviceptr p = cuda_vbo_resource[0].GetMappedPointer(); advectParticles_k.GridDimensions = grid; advectParticles_k.BlockDimensions = tids; advectParticles_k.Run(p, v.DevicePointer, dx, dy, dt, TILEY / TIDSY, tPitch); cuda_vbo_resource.UnmapAllResources(); }
public void UpsampleAlpha(CudaPitchedDeviceVariable <byte> alpha, CudaPitchedDeviceVariable <byte> small_alpha, int width, int height, int small_width, int small_height) { dim3 grid = new dim3((width + 127) / 128, (height + 31) / 32, 1); dim3 block = new dim3(32, 8, 1); int factor = width / small_width; int shift = 0; while (factor > (1 << shift)) { shift++; } upsampleAlphaKernel.BlockDimensions = block; upsampleAlphaKernel.GridDimensions = grid; upsampleAlphaKernel.Run(alpha.DevicePointer, small_alpha.DevicePointer, (int)alpha.Pitch, width, height, shift); //upsampleAlphaKernel<<<grid, block>>>(alpha, small_alpha, alpha_pitch, width, height, shift); }
public void EdgeCues(float alpha, CudaPitchedDeviceVariable <uchar4> image, CudaPitchedDeviceVariable <int> left_transposed, CudaPitchedDeviceVariable <int> right_transposed, CudaPitchedDeviceVariable <int> top, CudaPitchedDeviceVariable <int> bottom, CudaPitchedDeviceVariable <int> topleft, CudaPitchedDeviceVariable <int> topright, CudaPitchedDeviceVariable <int> bottomleft, CudaPitchedDeviceVariable <int> bottomright, int width, int height, CudaDeviceVariable <byte> scratch_mem) { if (texref == null) { texref = new CudaTextureLinearPitched2D <uchar4>(MeanEdgeStrengthReductionKernel, "imageTex", CUAddressMode.Clamp, CUFilterMode.Point, CUTexRefSetFlags.ReadAsInteger, CUArrayFormat.UnsignedInt8, image); } else { texref.Reset(image); } if (texref2 == null) { texref2 = new CudaTextureLinearPitched2D <uchar4>(EdgeCuesKernel, "imageTex", CUAddressMode.Clamp, CUFilterMode.Point, CUTexRefSetFlags.ReadAsInteger, CUArrayFormat.UnsignedInt8, image); } else { texref2.Reset(image); } dim3 grid = new dim3((width + 31) / 32, (height + 31) / 32, 1); dim3 block = new dim3(32, 4, 1); dim3 large_block = new dim3(32, 8, 1); MeanEdgeStrengthReductionKernel.BlockDimensions = large_block; MeanEdgeStrengthReductionKernel.GridDimensions = grid; MeanEdgeStrengthFinalKernel.BlockDimensions = block; MeanEdgeStrengthFinalKernel.GridDimensions = new dim3(1, 1, 1); EdgeCuesKernel.BlockDimensions = block; EdgeCuesKernel.GridDimensions = grid; MeanEdgeStrengthReductionKernel.Run(width, height, scratch_mem.DevicePointer); //MeanEdgeStrengthReductionKernel<<<grid, large_block>>>( width, height, scratch_mem); MeanEdgeStrengthFinalKernel.Run(scratch_mem.DevicePointer, grid.x * grid.y); //MeanEdgeStrengthFinalKernel<<<1,block>>>( scratch_mem, grid.x * grid.y); EdgeCuesKernel.Run(alpha, scratch_mem.DevicePointer, left_transposed.DevicePointer, right_transposed.DevicePointer, top.DevicePointer, bottom.DevicePointer, topleft.DevicePointer, topright.DevicePointer, bottomleft.DevicePointer, bottomright.DevicePointer, (int)top.Pitch / 4, (int)right_transposed.Pitch / 4, width, height); //EdgeCuesKernel<<<grid, block>>>( alpha , scratch_mem, left_transposed, right_transposed, top, bottom, topleft, topright, bottomleft, bottomright, pitch / 4, transposed_pitch/ 4, width, height ); }
/// <summary> /// Copy from this array to a pitched device variable /// </summary> /// <typeparam name="T">device variable base type</typeparam> /// <param name="aDeviceVariable">Destination</param> public void CopyFromThisToDevice <T>(CudaPitchedDeviceVariable <T> aDeviceVariable) where T : struct { CUDAMemCpy2D copyParams = new CUDAMemCpy2D(); copyParams.dstDevice = aDeviceVariable.DevicePointer; copyParams.dstMemoryType = CUMemoryType.Device; copyParams.dstPitch = aDeviceVariable.Pitch; copyParams.srcArray = _cuArray; copyParams.srcMemoryType = CUMemoryType.Array; copyParams.Height = aDeviceVariable.Height; copyParams.WidthInBytes = aDeviceVariable.WidthInBytes; res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D", res)); if (res != CUResult.Success) { throw new CudaException(res); } }
private void InitializeCUFFT() { g_hvfield = new float2[DS]; g_dvfield = new CudaPitchedDeviceVariable <float2>(DIM, DIM); g_tPitch = g_dvfield.Pitch; //Store pitch in g_tPitch to keep consistency to the C++ code g_dvfield.CopyToDevice(g_hvfield); // Temporary complex velocity field data g_vxfield = new CudaDeviceVariable <float2>(PDS); g_vyfield = new CudaDeviceVariable <float2>(PDS); texref = new CudaTextureArray2D(advectVelocity_k, "texref", CUAddressMode.Wrap, CUFilterMode.Linear, 0, CUArrayFormat.Float, DIM, DIM, CudaArray2DNumChannels.Two); g_particles = new float2[DS]; initParticles(g_particles, DIM, DIM); g_planr2c = new CudaFFTPlan2D(DIM, DIM, cufftType.R2C, Compatibility.PADDING); g_planc2r = new CudaFFTPlan2D(DIM, DIM, cufftType.C2R, Compatibility.PADDING); }
public bool SegmentationChanged(CudaDeviceVariable <byte> d_changed, CudaPitchedDeviceVariable <byte> alpha_old, CudaPitchedDeviceVariable <byte> alpha_new, int width, int height) { dim3 grid = new dim3((width + 31) / 32, (height + 31) / 32, 1); dim3 block = new dim3(32, 8, 1); CudaDeviceVariable <int> d_changedInt = new CudaDeviceVariable <int>(d_changed.DevicePointer, false); d_changedInt[0] = 0; SegmentationChangedKernel.BlockDimensions = block; SegmentationChangedKernel.GridDimensions = grid; SegmentationChangedKernel.Run(d_changedInt.DevicePointer, alpha_old.DevicePointer, alpha_new.DevicePointer, (int)alpha_old.Pitch, width, height); //SegmentationChangedKernel<<<grid, block>>>(d_changed, alpha_old, alpha_new, alpha_pitch, width, height); int h_changed = d_changedInt[0]; //error = cudaMemcpy(&h_changed, d_changed, 4, cudaMemcpyDeviceToHost); return(h_changed != 0); }
/// <summary> /// Creates a memset node and adds it to a graph<para/> /// Creates a new memset node and adds it to graph with /// dependencies specified via dependencies.<para/> /// It is possible for dependencies to be null, in which case the node will be placed /// at the root of the graph. Dependencies may not have any duplicate entries.<para/> /// The element size must be 1, 2, or 4 bytes.<para/> /// When the graph is launched, the node will perform the memset described by memsetParams. /// </summary> /// <param name="dependencies">can be null</param> /// <param name="deviceVariable">When the graph is launched, the node will perform the memset on deviceVariable.</param> /// <param name="value">Value to set</param> /// <param name="ctx">Cuda context used for the operation</param> /// <returns>A handle to the new node will be returned.</returns> public CUgraphNode AddMemsetNode <T>(CUgraphNode[] dependencies, CudaPitchedDeviceVariable <T> deviceVariable, uint value, CudaContext ctx) where T : struct { CUgraphNode node = new CUgraphNode(); SizeT numDependencies = 0; if (dependencies != null) { numDependencies = dependencies.Length; } CudaMemsetNodeParams memsetParams = CudaMemsetNodeParams.init <T>(deviceVariable, value); res = DriverAPINativeMethods.GraphManagment.cuGraphAddMemsetNode(ref node, _graph, dependencies, numDependencies, ref memsetParams, ctx.Context); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuGraphAddMemsetNode", res)); if (res != CUResult.Success) { throw new CudaException(res); } return(node); }
private void createSmallImage(int max_dim) { int[] temp_width = new int[2]; int[] temp_height = new int[2]; CudaPitchedDeviceVariable <uchar4>[] d_temp = new CudaPitchedDeviceVariable <uchar4> [2]; temp_width[0] = (int)Math.Ceiling(size.width * 0.5f); temp_height[0] = (int)Math.Ceiling(size.height * 0.5f); temp_width[1] = (int)Math.Ceiling(temp_width[0] * 0.5f); temp_height[1] = (int)Math.Ceiling(temp_height[0] * 0.5f); d_temp[0] = new CudaPitchedDeviceVariable <uchar4>(temp_width[0], temp_height[0]); d_temp[1] = new CudaPitchedDeviceVariable <uchar4>(temp_width[1], temp_height[1]); // Alloc also the small trimaps d_small_trimap = new CudaPitchedDeviceVariable <byte> [2]; d_small_trimap[0] = new CudaPitchedDeviceVariable <byte>(temp_width[0], temp_height[0], 4); d_small_trimap[1] = new CudaPitchedDeviceVariable <byte>(temp_width[1], temp_height[1], 4); grabCutGMM.Downscale(d_temp[0], temp_width[0], temp_height[0], d_image, size.width, size.height); int current = 0; while (temp_width[current] > max_dim || temp_height[current] > max_dim) { grabCutGMM.Downscale(d_temp[1 - current], temp_width[1 - current], temp_height[1 - current], d_temp[current], temp_width[current], temp_height[current]); current ^= 1; temp_width[1 - current] = (int)Math.Ceiling(temp_width[current] * 0.5f); temp_height[1 - current] = (int)Math.Ceiling(temp_height[current] * 0.5f); } d_small_image = d_temp[current]; small_size.width = temp_width[current]; small_size.height = temp_height[current]; graphcut8Small = new GraphCut8(small_size); d_temp[1 - current].Dispose(); }
/// <summary> /// 1 channel 32-bit uinteger connected region marker label renumbered from a previous call to nppiCompressMarkerLabelsUF or /// nppiCmpressMarkerLabelsUFBatch functions to eliminate label ID sparseness. /// </summary> /// <param name="nMaxMarkerLabelID">the value of the maximum marker label ID returned by corresponding compress marker labels UF call. </param> /// <param name="pMarkerLabelsInfoList">pointer to device memory buffer at least as large as value returned by the corresponding CompressedMarkerLabelsGetInfoListSize call.</param> /// <param name="pContoursImage">optional output image containing contours (boundaries) around each uniquely labeled connected pixel region, set to NULL if not needed. </param> /// <param name="pContoursDirectionImage">optional output image containing per contour pixel direction info around each uniquely labeled connected pixel region, set to NULL if not needed. </param> /// <param name="pContoursTotalsInfoHost">unique per call optional host memory pointer to NppiContourTotalsInfo structure in host memory, MUST be set if pContoursDirectionImage is set. </param> /// <param name="pContoursPixelCountsListDev">unique per call optional device memory pointer to array of nMaxMarkerLabelID uintegers in host memory, MUST be set if pContoursDirectionImage is set. </param> /// <param name="pContoursPixelCountsListHost">unique per call optional host memory pointer to array of nMaxMarkerLabelID uintegers in host memory, MUST be set if pContoursDirectionImage is set. </param> /// <param name="pContoursPixelStartingOffsetHost">unique per call optional host memory pointer to array of uintegers returned by this call representing the starting offset index of each contour found during geometry list generation </param> /// <param name="nppStreamCtx">NPP stream context.</param> public void CompressedMarkerLabelsUFInfo( uint nMaxMarkerLabelID, CudaDeviceVariable <NppiCompressedMarkerLabelsInfo> pMarkerLabelsInfoList, NPPImage_8uC1 pContoursImage, CudaPitchedDeviceVariable <NppiContourPixelDirectionInfo> pContoursDirectionImage, NppiContourTotalsInfo[] pContoursTotalsInfoHost, CudaDeviceVariable <uint> pContoursPixelCountsListDev, uint[] pContoursPixelCountsListHost, uint[] pContoursPixelStartingOffsetHost, NppStreamContext nppStreamCtx) { CUdeviceptr ptrMarkerLabelsInfoList = new CUdeviceptr(); CUdeviceptr ptrContoursImage = new CUdeviceptr(); CUdeviceptr ptrContoursDirectionImage = new CUdeviceptr(); CUdeviceptr ptrContoursPixelCountsListDev = new CUdeviceptr(); int pitchContoursImage = 0; int pitchContoursDirectionImage = 0; if (pMarkerLabelsInfoList != null) { ptrMarkerLabelsInfoList = pMarkerLabelsInfoList.DevicePointer; } if (pContoursImage != null) { ptrContoursImage = pContoursImage.DevicePointerRoi; pitchContoursImage = pContoursImage.Pitch; } if (pContoursDirectionImage != null) { ptrContoursDirectionImage = pContoursDirectionImage.DevicePointer; pitchContoursDirectionImage = pContoursDirectionImage.Pitch; } if (pContoursPixelCountsListDev != null) { ptrContoursPixelCountsListDev = pContoursPixelCountsListDev.DevicePointer; } status = NPPNativeMethods_Ctx.NPPi.LabelMarkers.nppiCompressedMarkerLabelsUFInfo_32u_C1R_Ctx(_devPtrRoi, _pitch, _sizeRoi, nMaxMarkerLabelID, ptrMarkerLabelsInfoList, ptrContoursImage, pitchContoursImage, ptrContoursDirectionImage, pitchContoursDirectionImage, pContoursTotalsInfoHost, ptrContoursPixelCountsListDev, pContoursPixelCountsListHost, pContoursPixelStartingOffsetHost, nppStreamCtx); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiCompressedMarkerLabelsUFInfo_32u_C1R_Ctx", status)); NPPException.CheckNppStatus(status, this); }
/// <summary> /// Asynchron copy device to host /// </summary> /// <param name="deviceVar"></param> /// <param name="stream"></param> public void AsyncCopyFromDevice(CudaPitchedDeviceVariable <T> deviceVar, CUstream stream) { AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream); }
/// <summary> /// Synchron copy device to host /// </summary> /// <param name="deviceVar"></param> public void SynchronCopyFromDevice(CudaPitchedDeviceVariable <T> deviceVar) { SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch); }
public static NPPImage_16uC4 ToNPPImage(this CudaPitchedDeviceVariable <ushort4> deviceVar) { return(new NPPImage_16uC4(deviceVar.DevicePointer, deviceVar.Width, deviceVar.Height, deviceVar.Pitch)); }
/// <summary> /// /// </summary> /// <param name="var"></param> public CudaResourceDesc(CudaPitchedDeviceVariable<VectorTypes.uint4> var) { resType = CUResourceType.Pitch2D; flags = 0; res = new CudaResourceDescUnion(); res.hArray = new CUarray(); res.hMipmappedArray = new CUmipmappedArray(); res.linear = new CudaResourceDescLinear(); res.pitch2D = new CudaResourceDescPitch2D(); res.pitch2D.devPtr = var.DevicePointer; res.pitch2D.format = CUArrayFormat.UnsignedInt32; res.pitch2D.height = var.Height; res.pitch2D.numChannels = 4; res.pitch2D.pitchInBytes = var.Pitch; res.pitch2D.width = var.Width; }
/// <summary> /// Creates a new NPPImage from allocated device ptr. /// </summary> /// <param name="devPtr">Already allocated device ptr.</param> public NPPImage_8uC4(CudaPitchedDeviceVariable<ManagedCuda.VectorTypes.uchar4> devPtr) :this(devPtr, false) { }
void updateVelocity(CudaPitchedDeviceVariable<cData> v, CudaDeviceVariable<cData> vx, CudaDeviceVariable<cData> vy, int dx, int pdx, int dy, SizeT tPitch) { dim3 grid = new dim3((uint)((dx / TILEX) + (!(dx % TILEX != 0) ? 0 : 1)), (uint)((dy / TILEY) + (!(dy % TILEY != 0) ? 0 : 1)), 1); dim3 tids = new dim3(TIDSX, TIDSY, 1); updateVelocity_k.GridDimensions = grid; updateVelocity_k.BlockDimensions = tids; updateVelocity_k.Run(v.DevicePointer, vx.DevicePointer, vy.DevicePointer, dx, pdx, dy, TILEY / TIDSY, tPitch); }
/// <summary> /// Graphcut of a flow network (32bit floating point edge capacities). The /// function computes the minimal cut (graphcut) of a 2D regular 8-connected /// graph. <para/> /// The inputs are the capacities of the horizontal (in transposed form), /// vertical and terminal (source and sink) edges. The capacities to source and /// sink /// are stored as capacity differences in the terminals array /// ( terminals(x) = source(x) - sink(x) ). The implementation assumes that the /// edge capacities /// for boundary edges that would connect to nodes outside the specified domain /// are set to 0 (for example left(0,*) == 0). If this is not fulfilled the /// computed labeling may be wrong!<para/> /// The computed binary labeling is encoded as unsigned 8bit values (0 and >0). /// </summary> /// <param name="Terminals">Pointer to differences of terminal edge capacities</param> /// <param name="LeftTransposed">Pointer to transposed left edge capacities</param> /// <param name="RightTransposed">Pointer to transposed right edge capacities</param> /// <param name="Top">Pointer to top edge capacities (top(*,0) must be 0)</param> /// <param name="TopLeft">Pointer to top left edge capacities (topleft(*,0) </param> /// <param name="TopRight">Pointer to top right edge capacities (topright(*,0)</param> /// <param name="Bottom">Pointer to bottom edge capacities (bottom(*,height-1)</param> /// <param name="BottomLeft">Pointer to bottom left edge capacities </param> /// <param name="BottomRight">Pointer to bottom right edge capacities </param> /// <param name="Label">Pointer to destination label image </param> /// <returns></returns> public void GraphCut(CudaPitchedDeviceVariable<float> Terminals, CudaPitchedDeviceVariable<float> LeftTransposed, CudaPitchedDeviceVariable<float> RightTransposed, CudaPitchedDeviceVariable<float> Top, CudaPitchedDeviceVariable<float> TopLeft, CudaPitchedDeviceVariable<float> TopRight, CudaPitchedDeviceVariable<float> Bottom, CudaPitchedDeviceVariable<float> BottomLeft, CudaPitchedDeviceVariable<float> BottomRight, CudaPitchedDeviceVariable<byte> Label) { status = NPPNativeMethods.NPPi.ImageLabeling.nppiGraphcut8_32f8u(Terminals.DevicePointer, LeftTransposed.DevicePointer, RightTransposed.DevicePointer, Top.DevicePointer, TopLeft.DevicePointer, TopRight.DevicePointer, Bottom.DevicePointer, BottomLeft.DevicePointer, BottomRight.DevicePointer, Terminals.Pitch, LeftTransposed.Pitch, _size, Label.DevicePointer, Label.Pitch, _state); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiGraphcut8_32f8u", status)); NPPException.CheckNppStatus(status, this); }
private void advectParticles(CudaDeviceVariable<vertex> p, CudaPitchedDeviceVariable<float2> v, int dx, int dy, float dt, SizeT tPitch) { dim3 grid = new dim3((uint)((dx / TILEX) + (!(dx % TILEX != 0) ? 0 : 1)), (uint)((dy / TILEY) + (!(dy % TILEY != 0) ? 0 : 1)), 1); dim3 tids = new dim3(TIDSX, TIDSY, 1); advectParticles_k.GridDimensions = grid; advectParticles_k.BlockDimensions = tids; advectParticles_k.Run(p.DevicePointer, v.DevicePointer, dx, dy, dt, TILEY / TIDSY, tPitch); }
private void InitializeCUFFT() { g_hvfield = new float2[DS]; g_dvfield = new CudaPitchedDeviceVariable<float2>(DIM, DIM); g_tPitch = g_dvfield.Pitch; //Store pitch in g_tPitch to keep consistency to the C++ code g_dvfield.CopyToDevice(g_hvfield); // Temporary complex velocity field data g_vxfield = new CudaDeviceVariable<float2>(PDS); g_vyfield = new CudaDeviceVariable<float2>(PDS); //create the texture reference explicitly texref = new CudaTextureArray2D(advectVelocity_k, "texref", CUAddressMode.Wrap, CUFilterMode.Linear, 0, CUArrayFormat.Float, DIM, DIM, CudaArray2DNumChannels.Two); g_particles = new float2[DS]; initParticles(g_particles, DIM, DIM); g_planr2c = new CudaFFTPlan2D(DIM, DIM, cufftType.R2C, Compatibility.FFTWPadding); g_planc2r = new CudaFFTPlan2D(DIM, DIM, cufftType.C2R, Compatibility.FFTWPadding); }
private void updateTexture(CudaPitchedDeviceVariable <float2> data, SizeT wib, SizeT h, SizeT pitch) { texref.Array.CopyFromDeviceToThis <float2>(data); }
/// <summary> /// One-channel 8-bit unsigned image SqrIntegral. /// Destination integral image is 32-bit signed int. /// Destination square integral image is 64-bit double floating point. /// </summary> /// <param name="dst">Destination-Image</param> /// <param name="sqr">Destination-Image</param> /// <param name="nVal">The value to add to pDst image pixels</param> /// <param name="nValSqr">The value to add to pSqr image pixels</param> public void Integral(NPPImage_32sC1 dst, CudaPitchedDeviceVariable<double> sqr, int nVal, double nValSqr) { status = NPPNativeMethods.NPPi.Integral.nppiSqrIntegral_8u32s64f_C1R(_devPtrRoi, _pitch, dst.DevicePointerRoi, dst.Pitch, sqr.DevicePointer, sqr.Pitch, _sizeRoi, nVal, nValSqr); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiSqrIntegral_8u32s64f_C1R", status)); NPPException.CheckNppStatus(status, this); }
void advectParticles(uint vbo, CudaPitchedDeviceVariable<cData> v, int dx, int dy, float dt, SizeT tPitch) { dim3 grid = new dim3((uint)((dx / TILEX) + (!(dx % TILEX != 0) ? 0 : 1)), (uint)((dy / TILEY) + (!(dy % TILEY != 0) ? 0 : 1)), 1); dim3 tids = new dim3(TIDSX, TIDSY, 1); cuda_vbo_resource.MapAllResources(); CUdeviceptr p = cuda_vbo_resource[0].GetMappedPointer(); advectParticles_k.GridDimensions = grid; advectParticles_k.BlockDimensions = tids; advectParticles_k.Run(p, v.DevicePointer, dx, dy, dt, TILEY / TIDSY, tPitch); cuda_vbo_resource.UnmapAllResources(); }
public static NPPImage_32fC2 ToNPPImage(this CudaPitchedDeviceVariable <float2> deviceVar) { return(new NPPImage_32fC2(deviceVar.DevicePointer, deviceVar.Width, deviceVar.Height, deviceVar.Pitch)); }
void advectVelocity(CudaPitchedDeviceVariable<cData> v, CudaDeviceVariable<cData> vx, CudaDeviceVariable<cData> vy, int dx, int pdx, int dy, float dt, SizeT tPitch) { dim3 grid = new dim3((uint)((dx / TILEX) + (!(dx % TILEX != 0) ? 0 : 1)), (uint)((dy / TILEY) + (!(dy % TILEY != 0) ? 0 : 1)), 1); dim3 tids = new dim3(TIDSX, TIDSY, 1); updateTexture(v, DIM * float2.SizeOf, DIM, tPitch); advectVelocity_k.GridDimensions = grid; advectVelocity_k.BlockDimensions = tids; advectVelocity_k.Run(v.DevicePointer, vx.DevicePointer, vy.DevicePointer, dx, pdx, dy, dt, TILEY / TIDSY); }
public static NPPImage_32fcC1 ToNPPImage(this CudaPitchedDeviceVariable <cuFloatComplex> deviceVar) { return(new NPPImage_32fcC1(deviceVar.DevicePointer, deviceVar.Width, deviceVar.Height, deviceVar.Pitch)); }
void updateTexture(CudaPitchedDeviceVariable<cData> data, SizeT wib, SizeT h, SizeT pitch) { texref.Array.CopyFromDeviceToThis<float2>(data); }
public static NPPImage_8uC1 ToNPPImage(this CudaPitchedDeviceVariable <byte> deviceVar) { return(new NPPImage_8uC1(deviceVar.DevicePointer, deviceVar.Width, deviceVar.Height, deviceVar.Pitch)); }
void addForces(CudaPitchedDeviceVariable<float2> v, int dx, int dy, int spx, int spy, float fx, float fy, int r, SizeT tPitch) { dim3 tids = new dim3((uint)(2 * r + 1), (uint)(2 * r + 1), 1); addForces_k.GridDimensions = new dim3(1); addForces_k.BlockDimensions = tids; addForces_k.Run(v.DevicePointer, dx, dy, spx, spy, fx, fy, r, tPitch); }
public static NPPImage_8sC3 ToNPPImage(this CudaPitchedDeviceVariable <char3> deviceVar) { return(new NPPImage_8sC3(deviceVar.DevicePointer, deviceVar.Width, deviceVar.Height, deviceVar.Pitch)); }
/// <summary> /// Creates a new NPPImage from allocated device ptr. /// </summary> /// <param name="devPtr">Already allocated device ptr.</param> /// <param name="isOwner">If TRUE, devPtr is freed when disposing</param> public NPPImage_8uC4(CudaPitchedDeviceVariable<ManagedCuda.VectorTypes.uchar4> devPtr, bool isOwner) { _devPtr = devPtr.DevicePointer; _devPtrRoi = _devPtr; _sizeOriginal.width = devPtr.Width; _sizeOriginal.height = devPtr.Height; _sizeRoi.width = devPtr.Width; _sizeRoi.height = devPtr.Height; _pitch = devPtr.Pitch; _channels = 4; _isOwner = isOwner; _typeSize = sizeof(byte); }
static void Main(string[] args) { const int nx = 2048; const int ny = 2048; // shifts applied to x and y data const int x_shift = 5; const int y_shift = 7; ShrQATest.shrQAStart(args); if ((nx%TILE_DIM != 0) || (ny%TILE_DIM != 0)) { Console.Write("nx and ny must be multiples of TILE_DIM\n"); ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_WAIVED); } // execution configuration parameters dim3 grid = new dim3(nx/TILE_DIM, ny/TILE_DIM, 1); dim3 threads = new dim3(TILE_DIM, TILE_DIM, 1); // This will pick the best possible CUDA capable device int devID = findCudaDevice(args); //Load Kernel image from resources string resName; if (IntPtr.Size == 8) resName = "simplePitchLinearTexture_x64.ptx"; else resName = "simplePitchLinearTexture.ptx"; string resNamespace = "simplePitchLinearTexture"; string resource = resNamespace + "." + resName; Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource); if (stream == null) throw new ArgumentException("Kernel not found in resources."); byte[] kernels = new byte[stream.Length]; int bytesToRead = (int)stream.Length; while (bytesToRead > 0) { bytesToRead -= stream.Read(kernels, (int)stream.Position, bytesToRead); } CudaKernel PLKernel = ctx.LoadKernelPTX(kernels, "shiftPitchLinear"); CudaKernel ArrayKernel = ctx.LoadKernelPTX(kernels, "shiftArray"); CudaStopWatch stopwatch = new CudaStopWatch(); // ---------------------------------- // Host allocation and initialization // ---------------------------------- float[] h_idata = new float[nx * ny]; float[] h_odata = new float[nx * ny]; float[] gold = new float[nx * ny]; for (int i = 0; i < nx * ny; ++i) h_idata[i] = (float)i; // ------------------------ // Device memory allocation // ------------------------ // Pitch linear input data CudaPitchedDeviceVariable<float> d_idataPL = new CudaPitchedDeviceVariable<float>(nx, ny); // Array input data CudaArray2D d_idataArray = new CudaArray2D(CUArrayFormat.Float, nx, ny, CudaArray2DNumChannels.One); // Pitch linear output data CudaPitchedDeviceVariable<float> d_odata = new CudaPitchedDeviceVariable<float>(nx, ny); // ------------------------ // copy host data to device // ------------------------ // Pitch linear d_idataPL.CopyToDevice(h_idata); // Array d_idataArray.CopyFromHostToThis<float>(h_idata); // ---------------------- // Bind texture to memory // ---------------------- // Pitch linear CudaTextureLinearPitched2D<float> texRefPL = new CudaTextureLinearPitched2D<float>(PLKernel, "texRefPL", CUAddressMode.Wrap, CUFilterMode.Point, CUTexRefSetFlags.NormalizedCoordinates, CUArrayFormat.Float, d_idataPL); CudaTextureArray2D texRefArray = new CudaTextureArray2D(ArrayKernel, "texRefArray", CUAddressMode.Wrap, CUFilterMode.Point, CUTexRefSetFlags.NormalizedCoordinates, d_idataArray); // --------------------- // reference calculation // --------------------- for (int j = 0; j < ny; j++) { int jshift = (j + y_shift) % ny; for (int i = 0; i < nx; i++) { int ishift = (i + x_shift) % nx; gold[j * nx + i] = h_idata[jshift * nx + ishift]; } } // ---------------- // shiftPitchLinear // ---------------- ctx.ClearMemory(d_odata.DevicePointer, 0, d_odata.TotalSizeInBytes); PLKernel.BlockDimensions = threads; PLKernel.GridDimensions = grid; stopwatch.Start(); for (int i=0; i < NUM_REPS; i++) { PLKernel.Run(d_odata.DevicePointer, (int)(d_odata.Pitch/sizeof(float)), nx, ny, x_shift, y_shift); } stopwatch.Stop(); stopwatch.StopEvent.Synchronize(); float timePL = stopwatch.GetElapsedTime(); // check results d_odata.CopyToHost(h_odata); bool res = cutComparef(gold, h_odata); bool success = true; if (res == false) { Console.Write("*** shiftPitchLinear failed ***\n"); success = false; } // ---------- // shiftArray // ---------- ctx.ClearMemory(d_odata.DevicePointer, 0, d_odata.TotalSizeInBytes); ArrayKernel.BlockDimensions = threads; ArrayKernel.GridDimensions = grid; stopwatch.Start(); for (int i=0; i < NUM_REPS; i++) { ArrayKernel.Run(d_odata.DevicePointer, (int)(d_odata.Pitch/sizeof(float)), nx, ny, x_shift, y_shift); } stopwatch.Stop(); stopwatch.StopEvent.Synchronize(); float timeArray = stopwatch.GetElapsedTime(); // check results d_odata.CopyToHost(h_odata); res = cutComparef(gold, h_odata); if (res == false) { Console.Write("*** shiftArray failed ***\n"); success = false; } float bandwidthPL = 2.0f*1000.0f*nx*ny*sizeof(float)/(1e+9f)/(timePL/NUM_REPS); float bandwidthArray = 2.0f*1000.0f*nx*ny*sizeof(float)/(1e+9f)/(timeArray/NUM_REPS); Console.Write("\nBandwidth (GB/s) for pitch linear: {0}; for array: {1}\n", bandwidthPL, bandwidthArray); float fetchRatePL = nx*ny/1e+6f/(timePL/(1000.0f*NUM_REPS)); float fetchRateArray = nx*ny/1e+6f/(timeArray/(1000.0f*NUM_REPS)); Console.Write("\nTexture fetch rate (Mpix/s) for pitch linear: {0}; for array: {1}\n\n", fetchRatePL, fetchRateArray); // cleanup texRefPL.Dispose(); texRefArray.Dispose(); d_idataPL.Dispose(); d_idataArray.Dispose(); d_odata.Dispose(); stopwatch.Dispose(); ctx.Dispose(); ShrQATest.shrQAFinishExit(args, (success == true) ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED); }