/// <summary>
/// Disposes the device variable and the three convolution workspaces (when present)
/// and clears each reference afterwards.
/// </summary>
public void Dispose()
{
    // Each member is disposed only when non-null, then its reference is reset to null.
    DeviceVar?.Dispose();
    DeviceVar = null;

    ConvWorkspace?.Dispose();
    ConvWorkspace = null;

    ConvBackWorkspace?.Dispose();
    ConvBackWorkspace = null;

    ConvBackKernelWorkspace?.Dispose();
    ConvBackKernelWorkspace = null;
}
/// <summary>
/// Disposes d_A, d_B, C and ctx, skipping any of them that is null.
/// </summary>
private void CleanupResources()
{
    // Free device memory
    d_A?.Dispose();
    d_B?.Dispose();
    // d_C?.Dispose();
    C?.Dispose();
    ctx?.Dispose();

    // Free host memory
    // We have a GC for that :-)
}
/// <summary>
/// image maximum relative error. User buffer is internally allocated and freed.
/// </summary>
/// <remarks>
/// The scratch buffer is released even when the call fails with an exception;
/// the NPP status is checked after the buffer has been freed.
/// </remarks>
/// <param name="src2">2nd source image</param>
/// <param name="pError">Pointer to the computed error.</param>
/// <param name="nppStreamCtx">NPP stream context.</param>
public void MaximumRelativeError(NPPImage_32fcC2 src2, CudaDeviceVariable<double> pError, NppStreamContext nppStreamCtx)
{
    int bufferSize = MaximumRelativeErrorGetBufferHostSize(nppStreamCtx);

    // 'using' guarantees the scratch buffer is freed on every exit path.
    using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
    {
        status = NPPNativeMethods_Ctx.NPPi.MaximumRelativeError.nppiMaximumRelativeError_32fc_C2R_Ctx(_devPtrRoi, _pitch, src2.DevicePointerRoi, src2.Pitch, _sizeRoi, pError.DevicePointer, buffer.DevicePointer, nppStreamCtx);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiMaximumRelativeError_32fc_C2R_Ctx", status));
    }

    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// image average relative error. User buffer is internally allocated and freed.
/// </summary>
/// <remarks>
/// The scratch buffer is released even when the call fails with an exception;
/// the NPP status is checked after the buffer has been freed.
/// </remarks>
/// <param name="src2">2nd source image</param>
/// <param name="pError">Pointer to the computed error.</param>
public void AverageRelativeError(NPPImage_16scC1 src2, CudaDeviceVariable<double> pError)
{
    int bufferSize = AverageRelativeErrorGetBufferHostSize();

    // 'using' guarantees the scratch buffer is freed on every exit path.
    using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
    {
        status = NPPNativeMethods.NPPi.AverageRelativeError.nppiAverageRelativeError_16sc_C1R(_devPtrRoi, _pitch, src2.DevicePointerRoi, src2.Pitch, _sizeRoi, pError.DevicePointer, buffer.DevicePointer);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiAverageRelativeError_16sc_C1R", status));
    }

    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// Four-channel 32-bit unsigned image DotProd. Buffer is internally allocated and freed.
/// </summary>
/// <remarks>
/// The scratch buffer is released even when the call fails with an exception;
/// the NPP status is checked after the buffer has been freed.
/// </remarks>
/// <param name="src2">2nd source image</param>
/// <param name="pDp">Pointer to the computed dot product of the two images. (4 * sizeof(double))</param>
/// <param name="nppStreamCtx">NPP stream context.</param>
public void DotProduct(NPPImage_32uC4 src2, CudaDeviceVariable<double> pDp, NppStreamContext nppStreamCtx)
{
    int bufferSize = DotProdGetBufferHostSize(nppStreamCtx);

    // 'using' guarantees the scratch buffer is freed on every exit path.
    using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
    {
        status = NPPNativeMethods_Ctx.NPPi.DotProd.nppiDotProd_32u64f_C4R_Ctx(_devPtrRoi, _pitch, src2.DevicePointerRoi, src2.Pitch, _sizeRoi, pDp.DevicePointer, buffer.DevicePointer, nppStreamCtx);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiDotProd_32u64f_C4R_Ctx", status));
    }

    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// Disposes the device-side position, personal-best, velocity and phi members,
/// then disposes <c>Ctx</c> last.
/// </summary>
public virtual void Dispose()
{
    // Device members first, in the established order.
    DevicePositions.Dispose();
    DevicePersonalBestValues.Dispose();
    DeviceVelocities.Dispose();
    DevicePersonalBests.Dispose();

    _phis1.Dispose();
    _phis2.Dispose();

    // Ctx is disposed after every other member.
    Ctx.Dispose();
}
/// <summary>
/// Disposes every element of <c>backward</c> and <c>forward</c>, followed by the
/// FFT buffer, the shift members and the image members held by this instance.
/// </summary>
public void FreeResources()
{
    foreach (var entry in backward)
    {
        entry.Dispose();
    }

    foreach (var entry in forward)
    {
        entry.Dispose();
    }

    FFTBuffer.Dispose();
    patchShift.Dispose();
    shiftImages.Dispose();
    squaredSumsOfTiles.Dispose();

    imgCrossCorrelation.Dispose();
    imgRefSortedTiles.Dispose();
    imgToTrackSortedTiles.Dispose();
    imgRefCplx.Dispose();
    imgToTrackCplx.Dispose();
}
/// <summary>
/// Disposes the value history and the canvas when they exist, then delegates to the base class.
/// </summary>
public override void Dispose()
{
    // Either member may be null; in that case its Dispose call is skipped.
    m_valuesHistory?.Dispose();
    m_canvas?.Dispose();

    base.Dispose();
}
/// <summary>
/// Sets the texture size from the bin constants and replaces the histogram device
/// buffer with a new zero-filled one of <c>BINS</c> entries.
/// </summary>
protected override void Reset()
{
    TextureHeight = BIN_PIXEL_HEIGHT;
    TextureWidth = BIN_PIXEL_WIDTH * BINS;

    // Dispose the previous buffer (if any) before allocating its replacement.
    m_d_HistogramData?.Dispose();

    m_d_HistogramData = new CudaDeviceVariable<int>(BINS);
    m_d_HistogramData.Memset(0);
}
/// <summary>
/// For IDisposable
/// </summary>
/// <param name="fDisposing">
/// When true, the device variable is disposed; when false and the object has not
/// been disposed yet, a debug warning is written instead.
/// </param>
protected virtual void Dispose(bool fDisposing)
{
    // Nothing happens on either path once the object is marked as disposed.
    if (disposed)
    {
        return;
    }

    if (fDisposing)
    {
        _devVar.Dispose();
        disposed = true;
        // the _texref reference is not destroyed explicitly, as it is done automatically when module is unloaded
    }
    else
    {
        Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
    }
}
/// <summary>
/// Disposes <c>_data</c> once, provided this instance is flagged to dispose it and
/// <paramref name="disposing"/> is true.
/// </summary>
/// <param name="disposing">Must be true for the data to be disposed.</param>
protected virtual void Dispose(bool disposing)
{
#if DEBUG
    // Debug aid: break into the debugger when the watched id is being disposed.
    if (_id == _badDispose)
    {
        Debugger.Break();
    }
#endif

    bool mustRelease = _shouldDispose && disposing && !_disposed;
    if (!mustRelease)
    {
        return;
    }

    _data.Dispose();
    _disposed = true;
}
/// <summary>
/// Re-allocates the history buffer as <c>m_plotAreaWidth * Count</c> zeroed floats.
/// Does nothing when there are no curves; logs an error and leaves the existing
/// buffer untouched when there are more curves than <c>nbCurvesMax</c>.
/// </summary>
private void updateHistoryBuffer()
{
    if (Count == 0)
    {
        return;
    }

    if (Count > nbCurvesMax)
    {
        MyLog.ERROR.WriteLine("Number of displayed curved is too high (" + Count + ", max " + nbCurvesMax + ")");
        return;
    }

    // Drop the previous buffer (if any) before allocating the new one.
    m_valuesHistory?.Dispose();

    // Allocate the history
    int historySize = m_plotAreaWidth * Count;
    m_valuesHistory = new CudaDeviceVariable<float>(historySize);
    m_valuesHistory.Memset(0);
}
/// <summary>
/// Creates a single-channel float <see cref="CudaArray3D"/> of the given size and fills it
/// with values produced by <c>randomDevice.GenerateUniform32</c>.
/// </summary>
/// <remarks>
/// The generator is re-seeded from <c>DateTime.Now.Ticks</c> on every call. The temporary
/// device buffer is always released, and the array is disposed if filling it fails.
/// </remarks>
/// <param name="width">Array width in elements.</param>
/// <param name="height">Array height in elements.</param>
/// <param name="depth">Array depth in elements.</param>
/// <returns>The newly created array; the caller is responsible for disposing it.</returns>
public CudaArray3D GenerateUniformArray(int width, int height, int depth)
{
    int count = width * height * depth;

    // 'using' frees the staging buffer on every exit path, including exceptions.
    using (CudaDeviceVariable<float> randomVariable = new CudaDeviceVariable<float>(count))
    {
        CudaArray3D randomArray = new CudaArray3D(CUArrayFormat.Float, width, height, depth, CudaArray3DNumChannels.One, CUDAArray3DFlags.None);
        try
        {
            randomDevice.SetPseudoRandomGeneratorSeed((ulong)DateTime.Now.Ticks);
            randomDevice.GenerateUniform32(randomVariable.DevicePointer, count);
            randomArray.CopyFromDeviceToThis(randomVariable.DevicePointer, sizeof(float));
        }
        catch
        {
            // The array never reaches the caller on failure, so release it here.
            randomArray.Dispose();
            throw;
        }

        return randomArray;
    }
}
/// <summary>
/// Creates a single-channel float <see cref="CudaArray3D"/> of the given size and fills it
/// with values produced by <c>randomDevice.GenerateUniform32</c>.
/// </summary>
/// <remarks>
/// The generator is re-seeded from <c>DateTime.Now.Ticks</c> on every call. The temporary
/// device buffer is always released, and the array is disposed if filling it fails.
/// </remarks>
/// <param name="width">Array width in elements.</param>
/// <param name="height">Array height in elements.</param>
/// <param name="depth">Array depth in elements.</param>
/// <returns>The newly created array; the caller is responsible for disposing it.</returns>
public CudaArray3D GenerateUniformArray(int width, int height, int depth)
{
    int count = width * height * depth;

    // 'using' frees the staging buffer on every exit path, including exceptions.
    using (CudaDeviceVariable<float> randomVariable = new CudaDeviceVariable<float>(count))
    {
        CudaArray3D randomArray = new CudaArray3D(CUArrayFormat.Float, width, height, depth, CudaArray3DNumChannels.One, CUDAArray3DFlags.None);
        try
        {
            randomDevice.SetPseudoRandomGeneratorSeed((ulong)DateTime.Now.Ticks);
            randomDevice.GenerateUniform32(randomVariable.DevicePointer, count);
            randomArray.CopyFromDeviceToThis(randomVariable.DevicePointer, sizeof(float));
        }
        catch
        {
            // The array never reaches the caller on failure, so release it here.
            randomArray.Dispose();
            throw;
        }

        return randomArray;
    }
}
/// <summary>
/// Uploads the three host arrays to the device and launches <c>_gpu</c> with one
/// single-thread block per particle.
/// </summary>
/// <remarks>
/// The temporary device copies are released even if an upload or the launch throws.
/// Nothing is copied back into <paramref name="xx"/>, <paramref name="yy"/> or <paramref name="zz"/>.
/// </remarks>
/// <param name="xx">Host array uploaded as the fourth kernel argument.</param>
/// <param name="yy">Host array uploaded as the fifth kernel argument.</param>
/// <param name="zz">Host array uploaded as the sixth kernel argument.</param>
/// <param name="cnt">Number of blocks in the launch grid.</param>
/// <param name="size">Passed through to the kernel as its last argument.</param>
public static void update_particles(float[] xx, float[] yy, float[] zz, int cnt, int size)
{
    // Each implicit conversion allocates a device buffer and copies the host array into it.
    using (CudaDeviceVariable<float> d_xx = xx)
    using (CudaDeviceVariable<float> d_yy = yy)
    using (CudaDeviceVariable<float> d_zz = zz)
    {
        _gpu.BlockDimensions = new dim3(1, 1, 1);
        _gpu.GridDimensions = new dim3(cnt, 1, 1);
        _gpu.Run(x.DevicePointer, y.DevicePointer, z.DevicePointer, d_xx.DevicePointer, d_yy.DevicePointer, d_zz.DevicePointer, size);
    }
}
/// <summary>
/// Disposes the temporary, derivative, flow, statistics and filter members of this instance.
/// </summary>
public void FreeDeviceMemory()
{
    d_tmp.Dispose();

    d_Ix.Dispose();
    d_Iy.Dispose();
    d_Iz.Dispose();
    //d_imageHalf.Dispose();

    d_flow.Dispose();
    buffer.Dispose();
    mean.Dispose();
    std.Dispose();

    d_filterX.Dispose();
    d_filterY.Dispose();
    d_filterT.Dispose();
}
/// <summary>
/// Disposes <c>_data</c> the first time it is called; later calls leave it untouched.
/// In DEBUG builds finalization of this instance is also suppressed.
/// </summary>
public void Destroy()
{
#if DEBUG
    // Debug aid: break into the debugger when the watched index is being destroyed.
    if (_index == _badDispose)
    {
        Debugger.Break();
    }
#endif

    bool alreadyReleased = _disposed;
    if (!alreadyReleased)
    {
        _data.Dispose();
        _disposed = true;
    }

#if DEBUG
    GC.SuppressFinalize(this);
#endif
}
/// <summary>
/// Runs an out-of-place n-by-n double-precision complex FFT over <paramref name="data"/>
/// and returns the result as a new host array.
/// </summary>
/// <remarks>
/// Both temporary device buffers are released before returning, including when an
/// exception is thrown. The output buffer was previously never disposed.
/// </remarks>
/// <param name="data">Host input copied to the device before the transform.</param>
/// <param name="n">Edge length of the transform; n * n elements are processed.</param>
/// <param name="direction">Transform direction passed to the plan.</param>
/// <returns>A new array of n * n transformed elements.</returns>
public cuDoubleComplex[] PerformFFT(cuDoubleComplex[] data, int n, TransformDirection direction)
{
    // NOTE(review): the plan is stored in a field and replaces any previous plan without
    // disposing it here - confirm who owns the plan's lifetime.
    f_plan = new CudaFFTPlan2D(n, n, cufftType.Z2Z);

    using (CudaDeviceVariable<cuDoubleComplex> d_signal = new CudaDeviceVariable<cuDoubleComplex>(n * n))
    using (CudaDeviceVariable<cuDoubleComplex> o_signal = new CudaDeviceVariable<cuDoubleComplex>(n * n))
    {
        d_signal.CopyToDevice(data);
        f_plan.Exec(d_signal.DevicePointer, o_signal.DevicePointer, direction);

        cuDoubleComplex[] result = new cuDoubleComplex[n * n];
        o_signal.CopyToHost(result);
        return result;
    }
}
/// <summary>
/// Clean up before closing: clears the running/initialised flags, disposes the CUDA
/// members and FFT plans, deletes the OpenGL buffer and finally disposes the context.
/// </summary>
private void Form1_FormClosing(object sender, FormClosingEventArgs e)
{
    isRunning = false;
    isInit = false;

    // CUDA members and FFT plans
    cuda_vbo_resource.Dispose();
    texref.Dispose();
    dvfield.Dispose();
    vxfield.Dispose();
    vyfield.Dispose();
    planc2r.Dispose();
    planr2c.Dispose();

    // Unbind and delete the OpenGL vertex buffer
    GL.BindBuffer(BufferTarget.ArrayBuffer, 0);
    GL.DeleteBuffers(1, ref vbo);

    stopwatch.Dispose();

    // The context is disposed last.
    ctx.Dispose();
}
/// <summary>
/// unregister this buffer object with CUDA and destroy buffer
/// </summary>
/// <remarks>
/// Returns immediately when neither CUDA handle is set. Otherwise exactly one handle is
/// disposed and cleared (the vertex source takes precedence over the vertex variable)
/// before the OpenGL buffer is deleted.
/// </remarks>
private void DeleteVertexVBO()
{
    if (m_cudaVertexSource == null && m_cudaVertexVar == null)
    {
        return;
    }

    if (m_cudaVertexSource != null)
    {
        m_cudaVertexSource.Dispose();
        m_cudaVertexSource = null;
    }
    else
    {
        // The guard above means the vertex variable is the one that is set here.
        m_cudaVertexVar.Dispose();
        m_cudaVertexVar = null;
    }

    GL.BindBuffer(BufferTarget.ArrayBuffer, 0);
    GL.DeleteBuffers(1, ref m_vertexVBO);
    m_vertexVBO = 0;
}
/// <summary>
/// Loads the "generateData" kernel and runs it to fill <c>Xopt</c> on the device and to
/// compute <c>Fopt</c>, which is copied back to the host.
/// </summary>
/// <remarks>
/// The one-element device buffer that receives the optimum value is released even if the
/// kernel launch or the device-to-host copy throws.
/// </remarks>
protected override void Init()
{
    var kernelFileName = KernelFile;
    var initKernel = Ctx.LoadKernel(kernelFileName, "generateData");

    // NOTE(review): a previously assigned Xopt is replaced without being disposed here - confirm ownership.
    Xopt = new CudaDeviceVariable<double>(DimensionsCount);

    int rseed = FunctionNumber + 10000 * InstanceNumber;
    double[] fopt_arr;

    using (var d_fopt = new CudaDeviceVariable<double>(1))
    {
        initKernel.Run(
            DimensionsCount,
            rseed,
            FunctionNumber,
            InstanceNumber,
            Xopt.DevicePointer,
            d_fopt.DevicePointer);

        // Implicit conversion copies the device value into a new host array.
        fopt_arr = d_fopt;
    }

    Fopt = fopt_arr[0];
}
/// <summary>
/// image CountInRange. Not affecting Alpha.
/// </summary>
/// <remarks>
/// The scratch buffer is released even when the call fails with an exception;
/// the NPP status is checked after the buffer has been freed.
/// </remarks>
/// <param name="pCounts">Pointer to the number of pixels that fall into the specified range. (3 * sizeof(int))</param>
/// <param name="nLowerBound">Fixed size array of the lower bound of the specified range, one per channel.</param>
/// <param name="nUpperBound">Fixed size array of the upper bound of the specified range, one per channel.</param>
public void CountInRangeA(CudaDeviceVariable<int> pCounts, byte[] nLowerBound, byte[] nUpperBound)
{
    int bufferSize = CountInRangeAGetBufferHostSize();

    // 'using' guarantees the scratch buffer is freed on every exit path.
    using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
    {
        status = NPPNativeMethods.NPPi.CountInRange.nppiCountInRange_8u_AC4R(_devPtrRoi, _pitch, _sizeRoi, pCounts.DevicePointer, nLowerBound, nUpperBound, buffer.DevicePointer);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiCountInRange_8u_AC4R", status));
    }

    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// 1 channel 8-bit unsigned image resize. This primitive matches the behavior of GraphicsMagick++.
/// </summary>
/// <remarks>
/// The scratch buffer is released even when the call fails with an exception;
/// the NPP status is checked after the buffer has been freed.
/// </remarks>
/// <param name="dst">Destination-Image</param>
/// <param name="nXFactor">Factor by which x dimension is changed.</param>
/// <param name="nYFactor">Factor by which y dimension is changed.</param>
/// <param name="eInterpolationMode">The type of eInterpolation to perform resampling. Currently only supports NPPI_INTER_LANCZOS3_Advanced.</param>
public void ResizeSqrPixelAdvanced(NPPImage_8uC1 dst, double nXFactor, double nYFactor, InterpolationMode eInterpolationMode)
{
    int bufferSize = ResizeAdvancedGetBufferHostSize(dst.SizeRoi, eInterpolationMode);

    // 'using' guarantees the scratch buffer is freed on every exit path.
    using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
    {
        NppiRect roiIn = new NppiRect(_pointRoi, _sizeRoi);
        NppiRect roiOut = new NppiRect(dst._pointRoi, dst._sizeRoi);

        status = NPPNativeMethods.NPPi.ResizeSqrPixel.nppiResizeSqrPixel_8u_C1R_Advanced(_devPtrRoi, _sizeOriginal, _pitch, roiIn, dst.DevicePointerRoi, dst.Pitch, roiOut, nXFactor, nYFactor, buffer.DevicePointer, eInterpolationMode);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiResizeSqrPixel_8u_C1R_Advanced", status));
    }

    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// image mean with 64-bit double precision result. Buffer is internally allocated and freed. Not affecting alpha.
/// </summary>
/// <remarks>
/// The scratch buffer is released even when the call fails with an exception;
/// the NPP status is checked after the buffer has been freed.
/// </remarks>
/// <param name="mean">Allocated device memory with size of at least 3 * sizeof(double)</param>
public void MeanA(CudaDeviceVariable<double> mean)
{
    int bufferSize = MeanGetBufferHostSizeA();

    // 'using' guarantees the scratch buffer is freed on every exit path.
    using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
    {
        status = NPPNativeMethods.NPPi.MeanNew.nppiMean_8u_AC4R(_devPtrRoi, _pitch, _sizeRoi, buffer.DevicePointer, mean.DevicePointer);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiMean_8u_AC4R", status));
    }

    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// CrossCorrSame_NormLevel. Buffer is internally allocated and freed.
/// </summary>
/// <remarks>
/// The scratch buffer is released even when the call fails with an exception;
/// the NPP status is checked after the buffer has been freed.
/// </remarks>
/// <param name="tpl">template image.</param>
/// <param name="dst">Destination image</param>
public void CrossCorrSame_NormLevel(NPPImage_8uC4 tpl, NPPImage_32fC4 dst)
{
    int bufferSize = SameNormLevelGetBufferHostSize();

    // 'using' guarantees the scratch buffer is freed on every exit path.
    using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
    {
        // NOTE(review): dst.DevicePointer is passed here while the template uses
        // DevicePointerRoi - confirm that ignoring a destination ROI is intended.
        status = NPPNativeMethods.NPPi.ImageProximity.nppiCrossCorrSame_NormLevel_8u32f_C4R(_devPtrRoi, _pitch, _sizeRoi, tpl.DevicePointerRoi, tpl.Pitch, tpl.SizeRoi, dst.DevicePointer, dst.Pitch, buffer.DevicePointer);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiCrossCorrSame_NormLevel_8u32f_C4R", status));
    }

    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// Histogram with bins determined by pLevels array. Buffer is internally allocated and freed. Alpha channel is ignored during the histograms computations.
/// </summary>
/// <remarks>
/// Only the first three entries of each array are used. The scratch buffer is released even
/// when the call fails with an exception; the NPP status is checked after it has been freed.
/// </remarks>
/// <param name="histogram">array that receives the computed histogram. The CudaDeviceVariable must be of size nLevels-1. Array size = 3</param>
/// <param name="pLevels">Array in device memory containing the level sizes of the bins. The CudaDeviceVariable must be of size nLevels. Array size = 3</param>
public void HistogramRangeA(CudaDeviceVariable<int>[] histogram, CudaDeviceVariable<int>[] pLevels)
{
    int[] size = new int[] { histogram[0].Size, histogram[1].Size, histogram[2].Size };
    CUdeviceptr[] devPtrs = new CUdeviceptr[] { histogram[0].DevicePointer, histogram[1].DevicePointer, histogram[2].DevicePointer };
    CUdeviceptr[] devLevels = new CUdeviceptr[] { pLevels[0].DevicePointer, pLevels[1].DevicePointer, pLevels[2].DevicePointer };

    int bufferSize = HistogramRangeGetBufferSizeA(size);

    // 'using' guarantees the scratch buffer is freed on every exit path.
    using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
    {
        status = NPPNativeMethods.NPPi.Histogram.nppiHistogramRange_8u_AC4R(_devPtrRoi, _pitch, _sizeRoi, devPtrs, devLevels, size, buffer.DevicePointer);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiHistogramRange_8u_AC4R", status));
    }

    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// image sum with 64-bit long long result. Buffer is internally allocated and freed.
/// </summary>
/// <remarks>
/// The scratch buffer is released even when the call fails with an exception;
/// the NPP status is checked after the buffer has been freed.
/// </remarks>
/// <param name="result">Allocated device memory with size of at least 4 * sizeof(long)</param>
public void Sum(CudaDeviceVariable<long> result)
{
    int bufferSize = SumLongGetBufferHostSize();

    // 'using' guarantees the scratch buffer is freed on every exit path.
    using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
    {
        status = NPPNativeMethods.NPPi.Sum.nppiSum_8u64s_C4R(_devPtrRoi, _pitch, _sizeRoi, buffer.DevicePointer, result.DevicePointer);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiSum_8u64s_C4R", status));
    }

    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// Histogram with bins determined by pLevels array. Buffer is internally allocated and freed.
/// </summary>
/// <remarks>
/// The scratch buffer is released even when the call fails with an exception;
/// the NPP status is checked after the buffer has been freed.
/// </remarks>
/// <param name="histogram">array that receives the computed histogram. The array must be of size nLevels-1.</param>
/// <param name="pLevels">Array in device memory containing the level sizes of the bins. The array must be of size nLevels</param>
public void HistogramRange(CudaDeviceVariable<int> histogram, CudaDeviceVariable<int> pLevels)
{
    int bufferSize = HistogramRangeGetBufferSize(histogram.Size);

    // 'using' guarantees the scratch buffer is freed on every exit path.
    using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
    {
        status = NPPNativeMethods.NPPi.Histogram.nppiHistogramRange_16u_C1R(_devPtrRoi, _pitch, _sizeRoi, histogram.DevicePointer, pLevels.DevicePointer, pLevels.Size, buffer.DevicePointer);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiHistogramRange_16u_C1R", status));
    }

    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// simpleCUFFT sample: convolves a random signal with a random filter kernel on the GPU
/// (forward FFT, pointwise multiply and scale, inverse FFT) and compares the result with
/// a host-side convolution.
/// </summary>
/// <remarks>
/// The context, both device buffers and the FFT plan are held in <c>using</c> blocks so they
/// are released in the original order (plan, filter, signal, context) even on failure.
/// </remarks>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    int SIGNAL_SIZE = 50;
    int FILTER_KERNEL_SIZE = 11;

    Console.WriteLine("[simpleCUFFT] is starting...");

    var assembly = Assembly.GetExecutingAssembly();
    var resourceName = "simpleCUFFT.simpleCUFFTKernel.ptx";

    bool bTestResult;

    using (CudaContext ctx = new CudaContext(0))
    {
        CudaKernel ComplexPointwiseMulAndScale;
        using (Stream stream = assembly.GetManifestResourceStream(resourceName))
        {
            // GetManifestResourceStream returns null when the resource is not embedded.
            if (stream == null)
            {
                throw new InvalidOperationException("Embedded resource not found: " + resourceName);
            }

            ComplexPointwiseMulAndScale = ctx.LoadKernelPTX(stream, "ComplexPointwiseMulAndScale");
        }

        // Allocate host memory for the signal
        // (cuFloatComplex is used for the complex multiplication in the reference host code)
        cuFloatComplex[] h_signal = new cuFloatComplex[SIGNAL_SIZE];
        Random rand = new Random(0);

        // Initialize the memory for the signal
        for (int i = 0; i < SIGNAL_SIZE; ++i)
        {
            h_signal[i].real = (float)rand.NextDouble();
            h_signal[i].imag = 0;
        }

        // Allocate host memory for the filter
        cuFloatComplex[] h_filter_kernel = new cuFloatComplex[FILTER_KERNEL_SIZE];

        // Initialize the memory for the filter
        for (int i = 0; i < FILTER_KERNEL_SIZE; ++i)
        {
            h_filter_kernel[i].real = (float)rand.NextDouble();
            h_filter_kernel[i].imag = 0;
        }

        // Pad signal and filter kernel
        cuFloatComplex[] h_padded_signal = null;
        cuFloatComplex[] h_padded_filter_kernel = null;
        int new_size = PadData(h_signal, ref h_padded_signal, SIGNAL_SIZE, h_filter_kernel, ref h_padded_filter_kernel, FILTER_KERNEL_SIZE);

        // Allocate device memory for signal
        using (CudaDeviceVariable<cuFloatComplex> d_signal = new CudaDeviceVariable<cuFloatComplex>(new_size))
        {
            // Copy host memory to device
            d_signal.CopyToDevice(h_padded_signal);

            // Allocate device memory for filter kernel
            using (CudaDeviceVariable<cuFloatComplex> d_filter_kernel = new CudaDeviceVariable<cuFloatComplex>(new_size))
            {
                // Copy host memory to device
                d_filter_kernel.CopyToDevice(h_padded_filter_kernel);

                // CUFFT plan simple API
                using (CudaFFTPlan1D plan = new CudaFFTPlan1D(new_size, cufftType.C2C, 1))
                {
                    // Transform signal and kernel
                    Console.WriteLine("Transforming signal cufftExecC2C");
                    plan.Exec(d_signal.DevicePointer, TransformDirection.Forward);
                    plan.Exec(d_filter_kernel.DevicePointer, TransformDirection.Forward);

                    // Multiply the coefficients together and normalize the result
                    Console.WriteLine("Launching ComplexPointwiseMulAndScale<<< >>>");
                    ComplexPointwiseMulAndScale.BlockDimensions = 256;
                    ComplexPointwiseMulAndScale.GridDimensions = 32;
                    ComplexPointwiseMulAndScale.Run(d_signal.DevicePointer, d_filter_kernel.DevicePointer, new_size, 1.0f / new_size);

                    // Transform signal back
                    Console.WriteLine("Transforming signal back cufftExecC2C");
                    plan.Exec(d_signal.DevicePointer, TransformDirection.Inverse);

                    // Copy device memory to host
                    cuFloatComplex[] h_convolved_signal = d_signal;

                    // Allocate host memory for the convolution result
                    cuFloatComplex[] h_convolved_signal_ref = new cuFloatComplex[SIGNAL_SIZE];

                    // Convolve on the host
                    Convolve(h_signal, SIGNAL_SIZE, h_filter_kernel, FILTER_KERNEL_SIZE, h_convolved_signal_ref);

                    // check result
                    bTestResult = sdkCompareL2fe(h_convolved_signal_ref, h_convolved_signal, 1e-5f);
                }
            }
        }
    }

    if (bTestResult)
    {
        Console.WriteLine("Test Passed");
    }
    else
    {
        Console.WriteLine("Test Failed");
    }
}
/// <summary>
/// free memory allocated on gpu
/// </summary>
public void Dispose() => gpuArray.Dispose();
/// <summary>
/// Runs the voxel generation kernels for a width x height x depth grid and, when any
/// vertices are produced, writes them into a newly created Direct3D vertex buffer stored
/// in <c>container.Geometry</c>.
/// </summary>
/// <param name="kernelPositionWeight">First kernel launched over the voxel grid; also the kernel the noise texture is created for.</param>
/// <param name="width">Grid width in voxels.</param>
/// <param name="height">Grid height in voxels.</param>
/// <param name="depth">Grid depth in voxels.</param>
private void Generate(CudaKernel kernelPositionWeight, int width, int height, int depth)
{
    int count = width * height * depth;
    // The "decremented" sizes are one smaller per axis (presumably the number of cells between voxels - TODO confirm).
    int widthD = width - 1;
    int heightD = height - 1;
    int depthD = depth - 1;
    int countDecremented = widthD * heightD * depthD;

    // 8x8x8 blocks; grid sizes are rounded up so every element is covered.
    dim3 blockDimensions = new dim3(8, 8, 8);
    dim3 gridDimensions = new dim3((int)Math.Ceiling(width / 8.0), (int)Math.Ceiling(height / 8.0), (int)Math.Ceiling(depth / 8.0));
    dim3 gridDimensionsDecremented = new dim3((int)Math.Ceiling(widthD / 8.0), (int)Math.Ceiling(heightD / 8.0), (int)Math.Ceiling(depthD / 8.0));

    // 16x16x16 uniform noise array, exposed to the kernel as the "noiseTexture" texture.
    CUDANoiseCube noiseCube = new CUDANoiseCube();
    CudaArray3D noiseArray = noiseCube.GenerateUniformArray(16, 16, 16);
    CudaTextureArray3D noiseTexture = new CudaTextureArray3D(kernelPositionWeight, "noiseTexture", CUAddressMode.Wrap, CUFilterMode.Linear, CUTexRefSetFlags.NormalizedCoordinates, noiseArray);

    CudaDeviceVariable<Voxel> voxelsDev = new CudaDeviceVariable<Voxel>(count);

    // The grid dimension is written into CudaKernel's private "_gridDim" field via reflection
    // rather than through a public property.
    kernelPositionWeight.BlockDimensions = blockDimensions;
    typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelPositionWeight, gridDimensions);
    kernelPositionWeight.Run(voxelsDev.DevicePointer, width, height, depth);

    kernelNormalAmbient.BlockDimensions = blockDimensions;
    typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelNormalAmbient, gridDimensions);
    kernelNormalAmbient.Run(voxelsDev.DevicePointer, width, height, depth, container.Settings.AmbientRayWidth, container.Settings.AmbientSamplesCount);

    // The count buffer is sized from NearestPowerOfTwo of each decremented axis
    // (presumably a requirement of the prefix scan - TODO confirm) and zeroed.
    int nearestW = NearestPowerOfTwo(widthD);
    int nearestH = NearestPowerOfTwo(heightD);
    int nearestD = NearestPowerOfTwo(depthD);
    int nearestCount = nearestW * nearestH * nearestD;
    CudaDeviceVariable<int> trisCountDevice = new CudaDeviceVariable<int>(nearestCount);
    trisCountDevice.Memset(0);
    CudaDeviceVariable<int> offsetsDev = new CudaDeviceVariable<int>(countDecremented);

    kernelMarchingCubesCases.BlockDimensions = blockDimensions;
    typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesCases, gridDimensionsDecremented);
    kernelMarchingCubesCases.Run(voxelsDev.DevicePointer, width, height, depth, offsetsDev.DevicePointer, trisCountDevice.DevicePointer, nearestW, nearestH, nearestD);

    CudaDeviceVariable<int> prefixSumsDev = prefixScan.PrefixSumArray(trisCountDevice, nearestCount);

    // Read back the last count and the last prefix sum; their total times three is the vertex count.
    int lastTrisCount = 0;
    trisCountDevice.CopyToHost(ref lastTrisCount, (nearestCount - 1) * sizeof(int));
    int lastPrefixSum = 0;
    prefixSumsDev.CopyToHost(ref lastPrefixSum, (nearestCount - 1) * sizeof(int));
    int totalVerticesCount = (lastTrisCount + lastPrefixSum) * 3;

    if (totalVerticesCount > 0)
    {
        // Replace any existing geometry with a vertex buffer sized for the new vertex count.
        if (container.Geometry != null) container.Geometry.Dispose();
        container.VertexCount = totalVerticesCount;
        container.Geometry = new Buffer(graphicsDevice, new BufferDescription() { BindFlags = BindFlags.VertexBuffer, CpuAccessFlags = CpuAccessFlags.None, OptionFlags = ResourceOptionFlags.None, SizeInBytes = Marshal.SizeOf(typeof(VoxelMeshVertex)) * totalVerticesCount, Usage = ResourceUsage.Default });

        // Register the D3D11 buffer with CUDA; the vertex kernel runs while it is mapped.
        CudaDirectXInteropResource directResource = new CudaDirectXInteropResource(container.Geometry.ComPointer, CUGraphicsRegisterFlags.None, CudaContext.DirectXVersion.D3D11, CUGraphicsMapResourceFlags.None);
        kernelMarchingCubesVertices.BlockDimensions = blockDimensions;
        typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesVertices, gridDimensionsDecremented);
        directResource.Map();
        kernelMarchingCubesVertices.Run(directResource.GetMappedPointer(), voxelsDev.DevicePointer, prefixSumsDev.DevicePointer, offsetsDev.DevicePointer, width, height, depth, nearestW, nearestH, nearestD);
        directResource.UnMap();
        directResource.Dispose();
    }
    else
    {
        container.VertexCount = 0;
        // NOTE(review): the disposed geometry reference is left assigned (not set to null) - confirm callers tolerate that.
        if (container.Geometry != null) container.Geometry.Dispose();
    }

    // Release all temporaries. NOTE(review): none of these run if an exception is thrown above.
    noiseCube.Dispose();
    prefixSumsDev.Dispose();
    trisCountDevice.Dispose();
    offsetsDev.Dispose();
    noiseArray.Dispose();
    noiseTexture.Dispose();
    voxelsDev.Dispose();
}
/// <summary>
/// Compute histogram and apply LUT to image.
/// Handles 1-, 3- and 4-channel images: computes the source histogram(s), derives
/// histogram-equalization lookup table(s) from the cumulative bin counts, applies them to
/// produce the destination image, displays it, and displays the histogram(s).
/// NPP and CUDA exceptions are written to the info text box and followed by CleanUp();
/// any other exception is rethrown.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
private void btn_calc_Click(object sender, EventArgs e)
{
    // Nothing to do without a loaded image or when NPP is not usable.
    if (_colorChannels < 1 || !_nppOK)
    {
        return;
    }
    try
    {
        int binCount = 255;
        int levelCount = binCount + 1;
        int[] levels;
        int[] bins;
        int[] lut = new int[levelCount];
        int totalSum = 0;
        float mutiplier = 0;
        int runningSum = 0;
        Bitmap res;
        switch (_colorChannels)
        {
            case 1:
                // The NPP library sets up a CUDA context, we can directly use it without access to it
                CudaDeviceVariable <int> bins_d = new CudaDeviceVariable <int>(binCount);
                levels = src_c1.EvenLevels(levelCount, 0, levelCount);
                // Even levels in Cuda 5.5 seems to be broken: set it manually
                for (int i = 0; i < levelCount; i++)
                {
                    levels[i] = i;
                }
                // Compute histogram from source image
                src_c1.HistogramEven(bins_d, 0, binCount + 1);
                // Copy data from device to host (implicit conversion to int[]):
                bins = bins_d;
                // draw histogram image
                hist_rb_src.Image = GetHistogramImage(bins, 0);
                // compute histogram equalization
                for (int i = 0; i < binCount; i++)
                {
                    totalSum += bins[i];
                }
                Debug.Assert(totalSum == src_c1.Width * src_c1.Height);
                // Avoid dividing by zero below.
                if (totalSum == 0)
                {
                    totalSum = 1;
                }
                mutiplier = 1.0f / (float)totalSum * 255.0f;
                // lut[i] is the rounded, scaled count of all bins before bin i.
                for (int i = 0; i < binCount; i++)
                {
                    lut[i] = (int)(runningSum * mutiplier + 0.5f);
                    runningSum += bins[i];
                }
                lut[binCount] = 255;
                // Apply this lut to src image and get result in dest image
                src_c1.LUT(dest_c1, lut, levels);
                // Create new bitmap in host memory for result image
                res = new Bitmap(src_c1.Width, src_c1.Height, PixelFormat.Format8bppIndexed);
                SetPalette(res);
                // Copy result from device to host
                dest_c1.CopyToHost(res);
                pictureBox_dest.Image = res;
                // Compute new histogram and show it
                // NOTE(review): the upper bound here is binCount, while the source histogram used binCount + 1 - confirm.
                dest_c1.HistogramEven(bins_d, 0, binCount);
                // NOTE(review): the result histogram is shown in hist_g_src, not in a *_dest control - confirm.
                hist_g_src.Image = GetHistogramImage(bins_d, 0);
                // Free temp memory
                bins_d.Dispose();
                break;
            case 3:
                // The NPP library sets up a CUDA context, we can directly use it without access to it
                CudaDeviceVariable <int>[] bins_ds = new CudaDeviceVariable <int> [3];
                bins_ds[0] = new CudaDeviceVariable <int>(binCount);
                bins_ds[1] = new CudaDeviceVariable <int>(binCount);
                bins_ds[2] = new CudaDeviceVariable <int>(binCount);
                levels = src_c3.EvenLevels(levelCount, 0, levelCount);
                // Even levels in Cuda 5.5 seems to be broken: set it manually
                for (int i = 0; i < levelCount; i++)
                {
                    levels[i] = i;
                }
                int[] ll = new int[] { 0, 0, 0 };
                int[] up = new int[] { binCount + 1, binCount + 1, binCount + 1 };
                // Compute histogram from source image
                src_c3.HistogramEven(bins_ds, ll, up);
                int[][] bins3 = new int[3][];
                int[][] luts = new int[3][];
                for (int c = 0; c < 3; c++)
                {
                    // Copy data from device to host:
                    bins3[c] = bins_ds[c];
                    luts[c] = new int[levelCount];
                }
                // draw histogram images
                hist_rb_src.Image = GetHistogramImage(bins3[2], bins3[1], bins3[0], 1);
                hist_g_src.Image = GetHistogramImage(bins3[1], bins3[0], bins3[2], 2);
                hist_b_src.Image = GetHistogramImage(bins3[0], bins3[1], bins3[2], 3);
                // compute histogram equalization, one lookup table per channel
                for (int c = 0; c < 3; c++)
                {
                    totalSum = 0;
                    runningSum = 0;
                    for (int i = 0; i < binCount; i++)
                    {
                        totalSum += bins3[c][i];
                    }
                    Debug.Assert(totalSum == src_c3.Width * src_c3.Height);
                    // Avoid dividing by zero below.
                    if (totalSum == 0)
                    {
                        totalSum = 1;
                    }
                    mutiplier = 1.0f / (float)totalSum * 255.0f;
                    for (int i = 0; i < binCount; i++)
                    {
                        luts[c][i] = (int)(runningSum * mutiplier + 0.5f);
                        runningSum += bins3[c][i];
                    }
                    luts[c][binCount] = 255;
                }
                // Apply this lut to src image and get result in dest image
                src_c3.Lut(dest_c3, luts[0], levels, luts[1], levels, luts[2], levels);
                res = new Bitmap(src_c3.Width, src_c3.Height, PixelFormat.Format24bppRgb);
                // Copy result from device to host
                dest_c3.CopyToHost(res);
                pictureBox_dest.Image = res;
                // Compute new histogram and show it
                dest_c3.HistogramEven(bins_ds, ll, up);
                bins3[0] = bins_ds[0];
                bins3[1] = bins_ds[1];
                bins3[2] = bins_ds[2];
                hist_rb_dest.Image = GetHistogramImage(bins3[2], bins3[1], bins3[0], 1); //r
                hist_g_dest.Image = GetHistogramImage(bins3[1], bins3[0], bins3[2], 2); //g
                hist_b_dest.Image = GetHistogramImage(bins3[0], bins3[1], bins3[2], 3); //b
                // Free temp memory
                bins_ds[0].Dispose();
                bins_ds[1].Dispose();
                bins_ds[2].Dispose();
                break;
            case 4:
                // The NPP library sets up a CUDA context, we can directly use it without access to it
                CudaDeviceVariable <int>[] bins_ds4 = new CudaDeviceVariable <int> [4];
                bins_ds4[0] = new CudaDeviceVariable <int>(binCount);
                bins_ds4[1] = new CudaDeviceVariable <int>(binCount);
                bins_ds4[2] = new CudaDeviceVariable <int>(binCount);
                bins_ds4[3] = new CudaDeviceVariable <int>(binCount);
                levels = src_c4.EvenLevels(levelCount, 0, levelCount);
                // Even levels in Cuda 5.5 seems to be broken: set it manually
                for (int i = 0; i < levelCount; i++)
                {
                    levels[i] = i;
                }
                int[] ll4 = new int[] { 0, 0, 0, 0 };
                int[] up4 = new int[] { binCount + 1, binCount + 1, binCount + 1, binCount + 1 };
                // Compute histogram from source image
                src_c4.HistogramEven(bins_ds4, ll4, up4);
                int[][] bins4 = new int[4][];
                int[][] luts4 = new int[4][];
                for (int c = 0; c < 4; c++)
                {
                    // Copy data from device to host:
                    bins4[c] = bins_ds4[c];
                    luts4[c] = new int[levelCount];
                }
                // draw histogram images
                hist_rb_src.Image = GetHistogramImage(bins4[2], bins4[1], bins4[0], 1);
                hist_g_src.Image = GetHistogramImage(bins4[1], bins4[0], bins4[2], 2);
                hist_b_src.Image = GetHistogramImage(bins4[0], bins4[1], bins4[2], 3);
                // compute histogram equalization; only the first three channels are equalized
                for (int c = 0; c < 3; c++)
                {
                    totalSum = 0;
                    runningSum = 0;
                    for (int i = 0; i < binCount; i++)
                    {
                        totalSum += bins4[c][i];
                    }
                    Debug.Assert(totalSum == src_c4.Width * src_c4.Height);
                    // Avoid dividing by zero below.
                    if (totalSum == 0)
                    {
                        totalSum = 1;
                    }
                    mutiplier = 1.0f / (float)totalSum * 255.0f;
                    for (int i = 0; i < binCount; i++)
                    {
                        luts4[c][i] = (int)(runningSum * mutiplier + 0.5f);
                        runningSum += bins4[c][i];
                    }
                    luts4[c][binCount] = 255;
                }
                // Apply this lut to src image and get result in dest image
                src_c4.LutA(dest_c4, luts4[0], levels, luts4[1], levels, luts4[2], levels);
                // Set alpha channel to 255
                dest_c4.Set(255, 3);
                res = new Bitmap(src_c4.Width, src_c4.Height, PixelFormat.Format32bppArgb);
                // Copy result from device to host
                dest_c4.CopyToHost(res);
                pictureBox_dest.Image = res;
                // Compute new histogram and show it
                dest_c4.HistogramEven(bins_ds4, ll4, up4);
                bins4[0] = bins_ds4[0];
                bins4[1] = bins_ds4[1];
                bins4[2] = bins_ds4[2];
                hist_rb_dest.Image = GetHistogramImage(bins4[2], bins4[1], bins4[0], 1); //r
                hist_g_dest.Image = GetHistogramImage(bins4[1], bins4[0], bins4[2], 2); //g
                hist_b_dest.Image = GetHistogramImage(bins4[0], bins4[1], bins4[2], 3); //b
                // Free temp memory
                bins_ds4[0].Dispose();
                bins_ds4[1].Dispose();
                bins_ds4[2].Dispose();
                bins_ds4[3].Dispose();
                break;
        }
    }
    catch (Exception ex)
    {
        // NPP and CUDA failures are reported in the info box and followed by CleanUp();
        // NOTE(review): the temporary bin buffers allocated above are not disposed on this path.
        if (ex is NPPException)
        {
            txt_info.AppendText("NPPException: " + ex.Message + "\n");
            CleanUp();
        }
        else if (ex is CudaException)
        {
            txt_info.AppendText("CudaException: " + ex.Message + "\n");
            CleanUp();
        }
        else
        {
            throw;
        }
    }
}
/// <summary>
/// Compute histogram and apply LUT to image: builds the histogram of the source image on the device,
/// derives a histogram-equalization LUT per color channel on the host, applies it to the
/// destination image and shows the result together with the source/destination histograms.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event arguments (not used).</param>
private void btn_calc_Click(object sender, EventArgs e)
{
    // Guard: requires a positive channel count and the _nppOK flag to be set
    if (_colorChannels < 1 || !_nppOK)
    {
        return;
    }

    try
    {
        int binCount = 255;
        int levelCount = binCount + 1;
        int[] levels;
        int[] bins;
        int[] lut = new int[levelCount];
        int totalSum = 0;
        float multiplier = 0;
        int runningSum = 0;
        Bitmap res;

        switch (_colorChannels)
        {
            case 1:
                //The NPP library sets up a CUDA context, we can directly use it without access to it
                CudaDeviceVariable<int> bins_d = new CudaDeviceVariable<int>(binCount);
                try
                {
                    levels = src_c1.EvenLevels(levelCount, 0, levelCount);
                    //Even levels in Cuda 5.5 seems to be broken: set it manually
                    for (int i = 0; i < levelCount; i++)
                    {
                        levels[i] = i;
                    }

                    //Compute histogram from source image
                    src_c1.HistogramEven(bins_d, 0, binCount + 1);
                    //Copy data from device to host (implicit conversion to int[]):
                    bins = bins_d;

                    //draw histogram image
                    hist_rb_src.Image = GetHistogramImage(bins, 0);

                    //compute histogram equalization
                    for (int i = 0; i < binCount; i++)
                    {
                        totalSum += bins[i];
                    }
                    Debug.Assert(totalSum == src_c1.Width * src_c1.Height);

                    //avoid a division by zero for an empty histogram
                    if (totalSum == 0)
                    {
                        totalSum = 1;
                    }

                    multiplier = 1.0f / (float)totalSum * 255.0f;

                    for (int i = 0; i < binCount; i++)
                    {
                        lut[i] = (int)(runningSum * multiplier + 0.5f);
                        runningSum += bins[i];
                    }
                    lut[binCount] = 255;

                    //Apply this lut to src image and get result in dest image
                    src_c1.LUT(dest_c1, lut, levels);

                    //Create new bitmap in host memory for result image
                    res = new Bitmap(src_c1.Width, src_c1.Height, PixelFormat.Format8bppIndexed);
                    SetPalette(res);

                    //Copy result from device to host
                    dest_c1.CopyToHost(res);
                    pictureBox_dest.Image = res;

                    //Compute new histogram and show it
                    // NOTE(review): the upper level here is binCount (the source histogram uses binCount + 1)
                    // and the result is drawn into hist_g_src rather than a *_dest box - confirm this is intended.
                    dest_c1.HistogramEven(bins_d, 0, binCount);
                    hist_g_src.Image = GetHistogramImage(bins_d, 0);
                }
                finally
                {
                    //Free temp memory, also when an NPP/CUDA call above throws
                    bins_d.Dispose();
                }
                break;

            case 3:
                //The NPP library sets up a CUDA context, we can directly use it without access to it
                CudaDeviceVariable<int>[] bins_ds = new CudaDeviceVariable<int>[3];
                try
                {
                    for (int c = 0; c < 3; c++)
                    {
                        bins_ds[c] = new CudaDeviceVariable<int>(binCount);
                    }

                    levels = src_c3.EvenLevels(levelCount, 0, levelCount);
                    //Even levels in Cuda 5.5 seems to be broken: set it manually
                    for (int i = 0; i < levelCount; i++)
                    {
                        levels[i] = i;
                    }

                    int[] ll = new int[] { 0, 0, 0 };
                    int[] up = new int[] { binCount + 1, binCount + 1, binCount + 1 };

                    //Compute histogram from source image
                    src_c3.HistogramEven(bins_ds, ll, up);

                    int[][] bins3 = new int[3][];
                    int[][] luts = new int[3][];
                    for (int c = 0; c < 3; c++)
                    {
                        //Copy data from device to host:
                        bins3[c] = bins_ds[c];
                        luts[c] = new int[levelCount];
                    }

                    //draw histogram images
                    hist_rb_src.Image = GetHistogramImage(bins3[2], bins3[1], bins3[0], 1);
                    hist_g_src.Image = GetHistogramImage(bins3[1], bins3[0], bins3[2], 2);
                    hist_b_src.Image = GetHistogramImage(bins3[0], bins3[1], bins3[2], 3);

                    //compute histogram equalization
                    for (int c = 0; c < 3; c++)
                    {
                        totalSum = 0;
                        runningSum = 0;
                        for (int i = 0; i < binCount; i++)
                        {
                            totalSum += bins3[c][i];
                        }
                        Debug.Assert(totalSum == src_c3.Width * src_c3.Height);

                        if (totalSum == 0)
                        {
                            totalSum = 1;
                        }

                        multiplier = 1.0f / (float)totalSum * 255.0f;

                        for (int i = 0; i < binCount; i++)
                        {
                            luts[c][i] = (int)(runningSum * multiplier + 0.5f);
                            runningSum += bins3[c][i];
                        }
                        luts[c][binCount] = 255;
                    }

                    //Apply this lut to src image and get result in dest image
                    src_c3.Lut(dest_c3, luts[0], levels, luts[1], levels, luts[2], levels);

                    res = new Bitmap(src_c3.Width, src_c3.Height, PixelFormat.Format24bppRgb);

                    //Copy result from device to host
                    dest_c3.CopyToHost(res);
                    pictureBox_dest.Image = res;

                    //Compute new histogram and show it
                    dest_c3.HistogramEven(bins_ds, ll, up);
                    bins3[0] = bins_ds[0];
                    bins3[1] = bins_ds[1];
                    bins3[2] = bins_ds[2];
                    hist_rb_dest.Image = GetHistogramImage(bins3[2], bins3[1], bins3[0], 1); //r
                    hist_g_dest.Image = GetHistogramImage(bins3[1], bins3[0], bins3[2], 2); //g
                    hist_b_dest.Image = GetHistogramImage(bins3[0], bins3[1], bins3[2], 3); //b
                }
                finally
                {
                    //Free temp memory; entries are null if an allocation was never reached
                    foreach (CudaDeviceVariable<int> bin in bins_ds)
                    {
                        bin?.Dispose();
                    }
                }
                break;

            case 4:
                //The NPP library sets up a CUDA context, we can directly use it without access to it
                CudaDeviceVariable<int>[] bins_ds4 = new CudaDeviceVariable<int>[4];
                try
                {
                    for (int c = 0; c < 4; c++)
                    {
                        bins_ds4[c] = new CudaDeviceVariable<int>(binCount);
                    }

                    levels = src_c4.EvenLevels(levelCount, 0, levelCount);
                    //Even levels in Cuda 5.5 seems to be broken: set it manually
                    for (int i = 0; i < levelCount; i++)
                    {
                        levels[i] = i;
                    }

                    int[] ll4 = new int[] { 0, 0, 0, 0 };
                    int[] up4 = new int[] { binCount + 1, binCount + 1, binCount + 1, binCount + 1 };

                    //Compute histogram from source image
                    src_c4.HistogramEven(bins_ds4, ll4, up4);

                    int[][] bins4 = new int[4][];
                    int[][] luts4 = new int[4][];
                    for (int c = 0; c < 4; c++)
                    {
                        //Copy data from device to host:
                        bins4[c] = bins_ds4[c];
                        luts4[c] = new int[levelCount];
                    }

                    //draw histogram images
                    hist_rb_src.Image = GetHistogramImage(bins4[2], bins4[1], bins4[0], 1);
                    hist_g_src.Image = GetHistogramImage(bins4[1], bins4[0], bins4[2], 2);
                    hist_b_src.Image = GetHistogramImage(bins4[0], bins4[1], bins4[2], 3);

                    //compute histogram equalization (first three channels only; the fourth is set below)
                    for (int c = 0; c < 3; c++)
                    {
                        totalSum = 0;
                        runningSum = 0;
                        for (int i = 0; i < binCount; i++)
                        {
                            totalSum += bins4[c][i];
                        }
                        Debug.Assert(totalSum == src_c4.Width * src_c4.Height);

                        if (totalSum == 0)
                        {
                            totalSum = 1;
                        }

                        multiplier = 1.0f / (float)totalSum * 255.0f;

                        for (int i = 0; i < binCount; i++)
                        {
                            luts4[c][i] = (int)(runningSum * multiplier + 0.5f);
                            runningSum += bins4[c][i];
                        }
                        luts4[c][binCount] = 255;
                    }

                    //Apply this lut to src image and get result in dest image
                    src_c4.LutA(dest_c4, luts4[0], levels, luts4[1], levels, luts4[2], levels);
                    //Set alpha channel to 255
                    dest_c4.Set(255, 3);

                    res = new Bitmap(src_c4.Width, src_c4.Height, PixelFormat.Format32bppArgb);

                    //Copy result from device to host
                    dest_c4.CopyToHost(res);
                    pictureBox_dest.Image = res;

                    //Compute new histogram and show it
                    dest_c4.HistogramEven(bins_ds4, ll4, up4);
                    bins4[0] = bins_ds4[0];
                    bins4[1] = bins_ds4[1];
                    bins4[2] = bins_ds4[2];
                    hist_rb_dest.Image = GetHistogramImage(bins4[2], bins4[1], bins4[0], 1); //r
                    hist_g_dest.Image = GetHistogramImage(bins4[1], bins4[0], bins4[2], 2); //g
                    hist_b_dest.Image = GetHistogramImage(bins4[0], bins4[1], bins4[2], 3); //b
                }
                finally
                {
                    //Free temp memory; entries are null if an allocation was never reached
                    foreach (CudaDeviceVariable<int> bin in bins_ds4)
                    {
                        bin?.Dispose();
                    }
                }
                break;
        }
    }
    catch (NPPException ex)
    {
        txt_info.AppendText("NPPException: " + ex.Message + "\n");
        CleanUp();
    }
    catch (CudaException ex)
    {
        txt_info.AppendText("CudaException: " + ex.Message + "\n");
        CleanUp();
    }
    // any other exception type propagates to the caller unchanged
}
/// <summary>
/// Vector addition sample: compiles "vectorAdd_kernel.cu" at runtime, runs the "vectorAdd" kernel
/// on two random input vectors and verifies the device result against the host sum.
/// </summary>
/// <param name="args">Command line arguments; forwarded to the runtime compiler as options.</param>
static void Main(string[] args)
{
    string filename = "vectorAdd_kernel.cu"; //we assume the file is in the same folder...
    string fileToCompile = File.ReadAllText(filename);

    byte[] ptx;
    // Dispose the compiler even if compilation throws
    using (CudaRuntimeCompiler rtc = new CudaRuntimeCompiler(fileToCompile, "vectorAdd_kernel"))
    {
        rtc.Compile(args);
        string log = rtc.GetLogAsString();
        Console.WriteLine(log);
        ptx = rtc.GetPTX();
    }

    using (CudaContext ctx = new CudaContext(0))
    {
        CudaKernel vectorAdd = ctx.LoadKernelPTX(ptx, "vectorAdd");

        // Print the vector length to be used
        int numElements = 50000;
        Console.WriteLine("[Vector addition of {0} elements]", numElements);

        // Allocate the host input vector A
        float[] h_A = new float[numElements];
        // Allocate the host input vector B
        float[] h_B = new float[numElements];
        // Allocate the host output vector C
        float[] h_C = new float[numElements];

        Random rand = new Random(0);

        // Initialize the host input vectors
        for (int i = 0; i < numElements; ++i)
        {
            h_A[i] = (float)rand.NextDouble();
            h_B[i] = (float)rand.NextDouble();
        }

        Console.WriteLine("Allocate and copy input data from the host memory to the CUDA device\n");

        // The using blocks release the device vectors on every exit path, including the
        // early return on a failed verification (which previously leaked them and the context).
        // Allocate the device input vectors A and B (copied from host) and the output vector C
        using (CudaDeviceVariable<float> d_A = h_A)
        using (CudaDeviceVariable<float> d_B = h_B)
        using (CudaDeviceVariable<float> d_C = new CudaDeviceVariable<float>(numElements))
        {
            // Launch the Vector Add CUDA Kernel
            int threadsPerBlock = 256;
            int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // round up
            Console.WriteLine("CUDA kernel launch with {0} blocks of {1} threads\n", blocksPerGrid, threadsPerBlock);
            vectorAdd.BlockDimensions = new dim3(threadsPerBlock, 1, 1);
            vectorAdd.GridDimensions = new dim3(blocksPerGrid, 1, 1);

            vectorAdd.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, numElements);

            // Copy the device result vector in device memory to the host result vector
            // in host memory.
            Console.WriteLine("Copy output data from the CUDA device to the host memory\n");
            d_C.CopyToHost(h_C);

            // Verify that the result vector is correct
            for (int i = 0; i < numElements; ++i)
            {
                if (Math.Abs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
                {
                    Console.WriteLine("Result verification failed at element {0}!\n", i);
                    return;
                }
            }

            Console.WriteLine("Test PASSED\n");
        }
    }

    Console.WriteLine("Done\n");
}
/// <summary>
/// Encodes a 24bpp bitmap as JPEG (4:2:0 chroma subsampling) using the NPP DCT/quantization and
/// Huffman routines and writes the result to <paramref name="aFilename"/>.
/// </summary>
/// <param name="aFilename">Path of the output file; it is created or overwritten.</param>
/// <param name="aQuality">Quality value passed to the quantization table constructors.</param>
/// <param name="aImage">Source bitmap; must be Format24bppRgb with width and height being multiples of 16.</param>
/// <exception cref="ArgumentException">The pixel format is not Format24bppRgb or the size is not a multiple of 16.</exception>
public static void SaveJpeg(string aFilename, int aQuality, Bitmap aImage)
{
    if (aImage.PixelFormat != System.Drawing.Imaging.PixelFormat.Format24bppRgb)
    {
        throw new ArgumentException("Only three channel color images are supported.");
    }

    if (aImage.Width % 16 != 0 || aImage.Height % 16 != 0)
    {
        throw new ArgumentException("The provided bitmap must have a height and width of a multiple of 16.");
    }

    // Everything that owns device memory is declared up front so that the finally block
    // can release whatever was allocated, no matter where an exception occurs.
    JPEGCompression compression = null;
    NPPImage_8uC3 src = null;
    NPPImage_8uC1 srcY = null;
    NPPImage_8uC1 srcCb = null;
    NPPImage_8uC1 srcCr = null;
    CudaDeviceVariable<byte>[] pdQuantizationTables = new CudaDeviceVariable<byte>[2];
    NPPImage_16sC1[] apdDCT = new NPPImage_16sC1[3];
    CudaDeviceVariable<byte> pdScan = null;
    CudaDeviceVariable<byte> pJpegEncoderTemp = null;

    try
    {
        compression = new JPEGCompression();
        src = new NPPImage_8uC3(aImage.Width, aImage.Height);
        srcY = new NPPImage_8uC1(aImage.Width, aImage.Height);
        srcCb = new NPPImage_8uC1(aImage.Width / 2, aImage.Height / 2);
        srcCr = new NPPImage_8uC1(aImage.Width / 2, aImage.Height / 2);
        src.CopyToDevice(aImage);

        //System.Drawing.Bitmap is ordered BGR not RGB
        //The NPP routine BGR to YCbCR outputs the values in clamped range, following the YCbCr standard.
        //But JPEG uses unclamped values ranging all from [0..255], thus use our own color matrix:
        float[,] BgrToYCbCr = new float[3, 4]
        {
            { 0.114f, 0.587f, 0.299f, 0 },
            { 0.5f, -0.33126f, -0.16874f, 128 },
            { -0.08131f, -0.41869f, 0.5f, 128 }
        };

        src.ColorTwist(BgrToYCbCr);

        //Reduce size of Cb and Cr channel (srcY doubles as full-size staging buffer, it ends up holding Y)
        src.Copy(srcY, 2);
        srcY.Resize(srcCr, 0.5, 0.5, InterpolationMode.SuperSampling);
        src.Copy(srcY, 1);
        srcY.Resize(srcCb, 0.5, 0.5, InterpolationMode.SuperSampling);
        src.Copy(srcY, 0);

        FrameHeader oFrameHeader = new FrameHeader();
        oFrameHeader.nComponents = 3;
        oFrameHeader.nHeight = (ushort)aImage.Height;
        oFrameHeader.nSamplePrecision = 8;
        oFrameHeader.nWidth = (ushort)aImage.Width;
        oFrameHeader.aComponentIdentifier = new byte[] { 1, 2, 3 };
        oFrameHeader.aSamplingFactors = new byte[] { 34, 17, 17 }; //Y channel is twice the size of Cb/Cr channel
        oFrameHeader.aQuantizationTableSelector = new byte[] { 0, 1, 1 };

        //Get quantization tables from JPEG standard with quality scaling
        QuantizationTable[] aQuantizationTables = new QuantizationTable[2];
        aQuantizationTables[0] = new QuantizationTable(QuantizationTable.QuantizationType.Luminance, aQuality);
        aQuantizationTables[1] = new QuantizationTable(QuantizationTable.QuantizationType.Chroma, aQuality);

        pdQuantizationTables[0] = aQuantizationTables[0].aTable;
        pdQuantizationTables[1] = aQuantizationTables[1].aTable;

        //Get Huffman tables from JPEG standard
        HuffmanTable[] aHuffmanTables = new HuffmanTable[4];
        aHuffmanTables[0] = new HuffmanTable(HuffmanTable.HuffmanType.LuminanceDC);
        aHuffmanTables[1] = new HuffmanTable(HuffmanTable.HuffmanType.ChromaDC);
        aHuffmanTables[2] = new HuffmanTable(HuffmanTable.HuffmanType.LuminanceAC);
        aHuffmanTables[3] = new HuffmanTable(HuffmanTable.HuffmanType.ChromaAC);

        //Set header
        ScanHeader oScanHeader = new ScanHeader();
        oScanHeader.nA = 0;
        oScanHeader.nComponents = 3;
        oScanHeader.nSe = 63;
        oScanHeader.nSs = 0;
        oScanHeader.aComponentSelector = new byte[] { 1, 2, 3 };
        oScanHeader.aHuffmanTablesSelector = new byte[] { 0, 17, 17 };

        NPPImage_8uC1[] apDstImage = new NPPImage_8uC1[3];
        NppiSize[] aDstSize = new NppiSize[3];
        aDstSize[0] = new NppiSize(srcY.Width, srcY.Height);
        aDstSize[1] = new NppiSize(srcCb.Width, srcCb.Height);
        aDstSize[2] = new NppiSize(srcCr.Width, srcCr.Height);

        // Compute channel sizes as stored in the output JPEG (8x8 blocks & MCU block layout)
        NppiSize oDstImageSize = new NppiSize();
        float frameWidth = (float)Math.Floor((float)oFrameHeader.nWidth);
        float frameHeight = (float)Math.Floor((float)oFrameHeader.nHeight);

        oDstImageSize.width = (int)Math.Max(1.0f, frameWidth);
        oDstImageSize.height = (int)Math.Max(1.0f, frameHeight);

        // The plane images are only referenced here; they are owned (and disposed) via srcY/srcCb/srcCr
        apDstImage[0] = srcY;
        apDstImage[1] = srcCb;
        apDstImage[2] = srcCr;

        int nMCUBlocksH = 0;
        int nMCUBlocksV = 0;

        // Compute channel sizes as stored in the JPEG (8x8 blocks & MCU block layout)
        for (int i = 0; i < oFrameHeader.nComponents; ++i)
        {
            nMCUBlocksV = Math.Max(nMCUBlocksV, oFrameHeader.aSamplingFactors[i] >> 4);
            nMCUBlocksH = Math.Max(nMCUBlocksH, oFrameHeader.aSamplingFactors[i] & 0x0f);
        }

        for (int i = 0; i < oFrameHeader.nComponents; ++i)
        {
            NppiSize oBlocks = new NppiSize();
            NppiSize oBlocksPerMCU = new NppiSize(oFrameHeader.aSamplingFactors[i] & 0x0f, oFrameHeader.aSamplingFactors[i] >> 4);

            // (x + 7) / 8 is an integer division: number of 8-pixel blocks, rounded up
            oBlocks.width = (int)Math.Ceiling((oFrameHeader.nWidth + 7) / 8 * (float)(oBlocksPerMCU.width) / nMCUBlocksH);
            oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width;

            oBlocks.height = (int)Math.Ceiling((oFrameHeader.nHeight + 7) / 8 * (float)(oBlocksPerMCU.height) / nMCUBlocksV);
            oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;

            // Allocate Memory
            apdDCT[i] = new NPPImage_16sC1(oBlocks.width * 64, oBlocks.height);
        }

        /***************************
        *
        *   Output
        *
        ***************************/

        // Forward DCT
        for (int i = 0; i < 3; ++i)
        {
            compression.DCTQuantFwd8x8LS(apDstImage[i], apdDCT[i], aDstSize[i], pdQuantizationTables[oFrameHeader.aQuantizationTableSelector[i]]);
        }

        // Huffman Encoding
        pdScan = new CudaDeviceVariable<byte>(BUFFER_SIZE);
        int nScanLength = 0;

        int nTempSize = JPEGCompression.EncodeHuffmanGetSize(aDstSize[0], 3);
        pJpegEncoderTemp = new CudaDeviceVariable<byte>(nTempSize);

        NppiEncodeHuffmanSpec[] apHuffmanDCTableEnc = new NppiEncodeHuffmanSpec[3];
        NppiEncodeHuffmanSpec[] apHuffmanACTableEnc = new NppiEncodeHuffmanSpec[3];

        // Track how many specs were actually allocated so exactly those are freed again
        int nDCSpecs = 0;
        int nACSpecs = 0;
        try
        {
            for (int i = 0; i < 3; ++i)
            {
                // High nibble of the selector picks the DC table, low nibble the AC table
                apHuffmanDCTableEnc[i] = JPEGCompression.EncodeHuffmanSpecInitAlloc(aHuffmanTables[(oScanHeader.aHuffmanTablesSelector[i] >> 4)].aCodes, NppiHuffmanTableType.nppiDCTable);
                nDCSpecs++;
                apHuffmanACTableEnc[i] = JPEGCompression.EncodeHuffmanSpecInitAlloc(aHuffmanTables[(oScanHeader.aHuffmanTablesSelector[i] & 0x0f) + 2].aCodes, NppiHuffmanTableType.nppiACTable);
                nACSpecs++;
            }

            JPEGCompression.EncodeHuffmanScan(apdDCT, 0, oScanHeader.nSs, oScanHeader.nSe, oScanHeader.nA >> 4, oScanHeader.nA & 0x0f, pdScan, ref nScanLength, apHuffmanDCTableEnc, apHuffmanACTableEnc, aDstSize, pJpegEncoderTemp);
        }
        finally
        {
            for (int i = 0; i < 3; ++i)
            {
                if (i < nDCSpecs)
                {
                    JPEGCompression.EncodeHuffmanSpecFree(apHuffmanDCTableEnc[i]);
                }
                if (i < nACSpecs)
                {
                    JPEGCompression.EncodeHuffmanSpecFree(apHuffmanACTableEnc[i]);
                }
            }
        }

        // Write JPEG to byte array, as in original sample code
        byte[] pDstOutput = new byte[BUFFER_SIZE];
        int pos = 0;

        oFrameHeader.nWidth = (ushort)oDstImageSize.width;
        oFrameHeader.nHeight = (ushort)oDstImageSize.height;

        writeMarker(0x0D8, pDstOutput, ref pos);
        writeJFIFTag(pDstOutput, ref pos);
        writeQuantizationTable(aQuantizationTables[0], pDstOutput, ref pos);
        writeQuantizationTable(aQuantizationTables[1], pDstOutput, ref pos);
        writeFrameHeader(oFrameHeader, pDstOutput, ref pos);
        writeHuffmanTable(aHuffmanTables[0], pDstOutput, ref pos);
        writeHuffmanTable(aHuffmanTables[1], pDstOutput, ref pos);
        writeHuffmanTable(aHuffmanTables[2], pDstOutput, ref pos);
        writeHuffmanTable(aHuffmanTables[3], pDstOutput, ref pos);
        writeScanHeader(oScanHeader, pDstOutput, ref pos);

        // Append the encoded scan data right behind the headers
        pdScan.CopyToHost(pDstOutput, 0, pos, nScanLength);

        pos += nScanLength;
        writeMarker(0x0D9, pDstOutput, ref pos);

        // using closes the stream even if the write fails
        using (FileStream fs = new FileStream(aFilename, FileMode.Create, FileAccess.Write))
        {
            fs.Write(pDstOutput, 0, pos);
        }
    }
    finally
    {
        //cleanup: entries are null when their allocation was never reached
        pJpegEncoderTemp?.Dispose();
        pdScan?.Dispose();
        apdDCT[2]?.Dispose();
        apdDCT[1]?.Dispose();
        apdDCT[0]?.Dispose();
        pdQuantizationTables[1]?.Dispose();
        pdQuantizationTables[0]?.Dispose();

        srcCr?.Dispose();
        srcCb?.Dispose();
        srcY?.Dispose();
        src?.Dispose();
        compression?.Dispose();
    }
}
/// <summary>
/// image NormRel_Inf. Buffer is internally allocated and freed.
/// </summary>
/// <param name="tpl">template image.</param>
/// <param name="pNormRel">Pointer to the computed relative error for the infinity norm of two images. (1 * sizeof(double))</param>
/// <param name="nCOI">channel of interest.</param>
/// <param name="pMask">Mask image.</param>
public void NormRel_Inf(NPPImage_16uC3 tpl, CudaDeviceVariable<double> pNormRel, int nCOI, NPPImage_8uC1 pMask)
{
    int bufferSize = NormRelInfMaskedGetBufferHostSize();

    // The using block frees the scratch buffer even if an exception occurs before the
    // status check (e.g. a null argument being dereferenced while building the call).
    using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
    {
        status = NPPNativeMethods.NPPi.NormRel.nppiNormRel_Inf_16u_C3CMR(_devPtrRoi, _pitch, tpl.DevicePointerRoi, tpl.Pitch, pMask.DevicePointerRoi, pMask.Pitch, _sizeRoi, nCOI, pNormRel.DevicePointer, buffer.DevicePointer);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiNormRel_Inf_16u_C3CMR", status));
    }

    // The status is evaluated only after the scratch buffer has been released
    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// image mean and standard deviation. Buffer is internally allocated and freed.
/// </summary>
/// <param name="coi">Channel of interest (0, 1 or 2)</param>
/// <param name="mean">Allocated device memory with size of at least 1 * sizeof(double)</param>
/// <param name="stdDev">Allocated device memory with size of at least 1 * sizeof(double)</param>
/// <param name="mask">mask</param>
public void MeanStdDev(int coi, CudaDeviceVariable<double> mean, CudaDeviceVariable<double> stdDev, NPPImage_8uC1 mask)
{
    int bufferSize = MeanStdDevMaskedGetBufferHostSize();

    // The using block frees the scratch buffer even if an exception occurs before the
    // status check (e.g. a null argument being dereferenced while building the call).
    using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
    {
        status = NPPNativeMethods.NPPi.MeanStdDevNew.nppiMean_StdDev_16u_C3CMR(_devPtrRoi, _pitch, mask.DevicePointerRoi, mask.Pitch, _sizeRoi, coi, buffer.DevicePointer, mean.DevicePointer, stdDev.DevicePointer);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiMean_StdDev_16u_C3CMR", status));
    }

    // The status is evaluated only after the scratch buffer has been released
    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// simpleCUBLAS sample: multiplies two random N x N matrices with cuBLAS (Gemm) and compares
/// the result against the host reference computed by simple_sgemm.
/// </summary>
/// <param name="args">Command line arguments (not used).</param>
static void Main(string[] args)
{
    int N = 275;
    float alpha = 1.0f;
    float beta = 0.0f;
    int n2 = N * N;
    float[] h_C;
    float[] h_C_ref;

    /* Initialize CUBLAS */
    Console.WriteLine("simpleCUBLAS test running.");

    // The using blocks release the cuBLAS handle and the device matrices on every exit path,
    // including the early return on a zero reference norm (which previously leaked them).
    using (CudaBlas handle = new CudaBlas())
    {
        /* Allocate host memory for the matrices */
        float[] h_A = new float[n2];
        float[] h_B = new float[n2];
        h_C_ref = new float[n2];

        Random rand = new Random(0);
        /* Fill the matrices with test data */
        for (int i = 0; i < n2; i++)
        {
            h_A[i] = (float)rand.NextDouble();
            h_B[i] = (float)rand.NextDouble();
        }

        /* Allocate device memory for the matrices */
        using (CudaDeviceVariable<float> d_A = new CudaDeviceVariable<float>(n2))
        using (CudaDeviceVariable<float> d_B = new CudaDeviceVariable<float>(n2))
        using (CudaDeviceVariable<float> d_C = new CudaDeviceVariable<float>(n2))
        {
            /* Initialize the device matrices with the host matrices */
            d_A.CopyToDevice(h_A);
            d_B.CopyToDevice(h_B);

            /* Performs operation using plain C code */
            simple_sgemm(N, alpha, h_A, h_B, beta, h_C_ref);

            /* Performs operation using cublas */
            handle.Gemm(Operation.NonTranspose, Operation.NonTranspose, N, N, N, alpha, d_A, N, d_B, N, beta, d_C, N);

            /* Read back the result from device memory (implicit conversion allocates the host array) */
            h_C = d_C;
        }
    }

    /* Check result against reference */
    float error_norm = 0;
    float ref_norm = 0;

    for (int i = 0; i < n2; ++i)
    {
        float diff = h_C_ref[i] - h_C[i];
        error_norm += diff * diff;
        ref_norm += h_C_ref[i] * h_C_ref[i];
    }

    error_norm = (float)Math.Sqrt((double)error_norm);
    ref_norm = (float)Math.Sqrt((double)ref_norm);

    // A (near) zero reference norm would make the relative error below meaningless
    if (Math.Abs(ref_norm) < 1e-7)
    {
        Console.WriteLine("!!!! reference norm is 0");
        return;
    }

    if (error_norm / ref_norm < 1e-6f)
    {
        Console.WriteLine("simpleCUBLAS test passed.");
        return;
    }
    else
    {
        Console.WriteLine("simpleCUBLAS test failed.");
        return;
    }
}
/// <summary>
/// simpleStreams sample: times a host-to-device copy and the init_array kernel, then compares
/// non-streamed execution against execution split across several CUDA streams and verifies the result.
/// </summary>
/// <param name="args">
/// Command line arguments. Recognized tokens (substring match): "help", "use_generic_memory",
/// and "sync_method" followed by a separate integer argument (0, 1, 2 or 4).
/// </param>
static void Main(string[] args)
{
    int cuda_device = 0;
    int nstreams = 4;               // number of streams for CUDA calls
    int nreps = 10;                 // number of times each experiment is repeated
    int n = 16 * 1024 * 1024;       // number of ints in the data set
    int nbytes = n * sizeof(int);   // number of data bytes
    dim3 threads, blocks;           // kernel launch configuration
    float elapsed_time, time_memcpy, time_kernel;   // timing variables
    float scale_factor = 1.0f;

    // allocate generic memory and pin it later instead of using cudaHostAlloc()
    // Untested in C#, so stick to cudaHostAlloc().
    bool bPinGenericMemory = false; // we want this to be the default behavior
    CUCtxFlags device_sync_method = CUCtxFlags.BlockingSync; // by default we use BlockingSync

    int niterations;    // number of iterations for the loop inside the kernel

    ShrQATest.shrQAStart(args);

    Console.WriteLine("[ simpleStreams ]");

    foreach (var item in args)
    {
        if (item.Contains("help"))
        {
            printHelp();
            ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_PASSED);
        }
    }

    bPinGenericMemory = false;
    foreach (var item in args)
    {
        if (item.Contains("use_generic_memory"))
        {
            bPinGenericMemory = true;
        }
    }

    for (int i = 0; i < args.Length; i++)
    {
        if (args[i].Contains("sync_method"))
        {
            int temp = -1;
            bool error = false;
            if (i < args.Length - 1)
            {
                // TryParse returns true on SUCCESS, so the error flag is its negation.
                // (The flag used to be assigned the raw result, which rejected every valid
                // value and let an unparsable one fall through as 0 = SchedAuto.)
                error = !int.TryParse(args[i + 1], out temp);
                if (!error)
                {
                    switch (temp)
                    {
                        case 0:
                            device_sync_method = CUCtxFlags.SchedAuto;
                            break;
                        case 1:
                            device_sync_method = CUCtxFlags.SchedSpin;
                            break;
                        case 2:
                            device_sync_method = CUCtxFlags.SchedYield;
                            break;
                        case 4:
                            device_sync_method = CUCtxFlags.BlockingSync;
                            break;
                        default:
                            error = true;
                            break;
                    }
                }
            }

            if (!error)
            {
                Console.Write("Specifying device_sync_method = {0}, setting reps to 100 to demonstrate steady state\n", sDeviceSyncMethod[(int)device_sync_method]);
                nreps = 100;
            }
            else
            {
                Console.Write("Invalid command line option sync_method=\"{0}\"\n", temp);
                ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED);
            }
        }
    }

    int num_devices = CudaContext.GetDeviceCount();
    if (0 == num_devices)
    {
        Console.Write("your system does not have a CUDA capable device, waiving test...\n");
        ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED);
    }
    cuda_device = CudaContext.GetMaxGflopsDeviceId();

    CudaDeviceProperties deviceProp = CudaContext.GetDeviceInfo(cuda_device);
    if ((1 == deviceProp.ComputeCapability.Major) && (deviceProp.ComputeCapability.Minor < 1))
    {
        Console.Write("{0} does not have Compute Capability 1.1 or newer. Reducing workload.\n", deviceProp.DeviceName);
    }

    if (deviceProp.ComputeCapability.Major >= 2)
    {
        niterations = 100;
    }
    else
    {
        if (deviceProp.ComputeCapability.Minor > 1)
        {
            niterations = 5;
        }
        else
        {
            niterations = 1; // reduced workload for compute capability 1.0 and 1.1
        }
    }

    // Check if GPU can map host memory (Generic Method), if not then we override bPinGenericMemory to be false
    // In .net we cannot allocate easily generic aligned memory, so <bPinGenericMemory> is always false in our case...
    if (bPinGenericMemory)
    {
        Console.Write("Device: <{0}> canMapHostMemory: {1}\n", deviceProp.DeviceName, deviceProp.CanMapHostMemory ? "Yes" : "No");
        if (deviceProp.CanMapHostMemory == false)
        {
            Console.Write("Using cudaMallocHost, CUDA device does not support mapping of generic host memory\n");
            bPinGenericMemory = false;
        }
    }

    // Anything that is less than 32 Cores will have scaled down workload
    scale_factor = Math.Max((32.0f / (ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * (float)deviceProp.MultiProcessorCount)), 1.0f);
    n = (int)Math.Round((float)n / scale_factor);

    Console.Write("> CUDA Capable: SM {0}.{1} hardware\n", deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor);
    Console.Write("> {0} Multiprocessor(s) x {1} (Cores/Multiprocessor) = {2} (Cores)\n",
        deviceProp.MultiProcessorCount,
        ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor),
        ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * deviceProp.MultiProcessorCount);

    Console.Write("> scale_factor = {0:0.0000}\n", 1.0f / scale_factor);
    Console.Write("> array_size   = {0}\n\n", n);

    // enable use of blocking sync, to reduce CPU usage
    Console.Write("> Using CPU/GPU Device Synchronization method ({0})\n", sDeviceSyncMethod[(int)device_sync_method]);

    CudaContext ctx;
    if (bPinGenericMemory)
    {
        ctx = new CudaContext(cuda_device, device_sync_method | CUCtxFlags.MapHost);
    }
    else
    {
        ctx = new CudaContext(cuda_device, device_sync_method);
    }

    //Load Kernel image from resources
    string resName;
    if (IntPtr.Size == 8)
    {
        resName = "simpleStreams_x64.ptx";
    }
    else
    {
        resName = "simpleStreams.ptx";
    }

    string resNamespace = "simpleStreams";
    string resource = resNamespace + "." + resName;
    Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);
    if (stream == null)
    {
        throw new ArgumentException("Kernel not found in resources.");
    }

    CudaKernel init_array = ctx.LoadKernelPTX(stream, "init_array");

    // allocate host memory
    int c = 5;                                              // value to which the array will be initialized
    int[] h_a = null;                                       // pointer to the array data in host memory
    CudaPageLockedHostMemory<int> hAligned_a = null;        // pointer to the array data in host memory (aligned to MEMORY_ALIGNMENT)
    //Note: In .net we have two separate arrays: One is in managed memory (h_a), the other one in unmanaged memory (hAligned_a).
    //In C++ hAligned_a would point somewhere inside the h_a array.
    AllocateHostMemory(bPinGenericMemory, ref h_a, ref hAligned_a, nbytes);

    Console.Write("\nStarting Test\n");

    // allocate device memory
    CudaDeviceVariable<int> d_c = c; //using new implicit cast to allocate memory and assign value
    CudaDeviceVariable<int> d_a = new CudaDeviceVariable<int>(nbytes / sizeof(int));

    CudaStream[] streams = new CudaStream[nstreams];
    for (int i = 0; i < nstreams; i++)
    {
        streams[i] = new CudaStream();
    }

    // create CUDA event handles
    // use blocking sync
    CudaEvent start_event, stop_event;
    CUEventFlags eventflags = ((device_sync_method == CUCtxFlags.BlockingSync) ? CUEventFlags.BlockingSync : CUEventFlags.Default);

    start_event = new CudaEvent(eventflags);
    stop_event = new CudaEvent(eventflags);

    // time memcopy from device
    start_event.Record();     // record in stream-0, to ensure that all previous CUDA calls have completed
    hAligned_a.AsyncCopyToDevice(d_a, streams[0].Stream);
    stop_event.Record();
    stop_event.Synchronize();   // block until the event is actually recorded
    time_memcpy = CudaEvent.ElapsedTime(start_event, stop_event);
    Console.Write("memcopy:\t{0:0.00}\n", time_memcpy);

    // time kernel
    threads = new dim3(512, 1);
    blocks = new dim3(n / (int)threads.x, 1);
    start_event.Record();
    init_array.BlockDimensions = threads;
    init_array.GridDimensions = blocks;
    init_array.RunAsync(streams[0].Stream, d_a.DevicePointer, d_c.DevicePointer, niterations);
    stop_event.Record();
    stop_event.Synchronize();
    time_kernel = CudaEvent.ElapsedTime(start_event, stop_event);
    Console.Write("kernel:\t\t{0:0.00}\n", time_kernel);

    //////////////////////////////////////////////////////////////////////
    // time non-streamed execution for reference
    threads = new dim3(512, 1);
    blocks = new dim3(n / (int)threads.x, 1);
    start_event.Record();
    for (int k = 0; k < nreps; k++)
    {
        init_array.BlockDimensions = threads;
        init_array.GridDimensions = blocks;
        init_array.Run(d_a.DevicePointer, d_c.DevicePointer, niterations);
        hAligned_a.SynchronCopyToHost(d_a);
    }
    stop_event.Record();
    stop_event.Synchronize();
    elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event);
    Console.Write("non-streamed:\t{0:0.00} ({1:00} expected)\n", elapsed_time / nreps, time_kernel + time_memcpy);

    //////////////////////////////////////////////////////////////////////
    // time execution with nstreams streams
    threads = new dim3(512, 1);
    blocks = new dim3(n / (int)(nstreams * threads.x), 1);
    byte[] memset = new byte[nbytes]; // set host memory bits to all 1s, for testing correctness
    for (int i = 0; i < nbytes; i++)
    {
        memset[i] = 255;
    }
    System.Runtime.InteropServices.Marshal.Copy(memset, 0, hAligned_a.PinnedHostPointer, nbytes);
    d_a.Memset(0); // set device memory to all 0s, for testing correctness

    start_event.Record();
    for (int k = 0; k < nreps; k++)
    {
        init_array.BlockDimensions = threads;
        init_array.GridDimensions = blocks;
        // asynchronously launch nstreams kernels, each operating on its own portion of data
        for (int i = 0; i < nstreams; i++)
        {
            init_array.RunAsync(streams[i].Stream, d_a.DevicePointer + i * n / nstreams * sizeof(int), d_c.DevicePointer, niterations);
        }

        // asynchronously launch nstreams memcopies.  Note that memcopy in stream x will only
        //   commence executing when all previous CUDA calls in stream x have completed
        for (int i = 0; i < nstreams; i++)
        {
            hAligned_a.AsyncCopyFromDevice(d_a, i * n / nstreams * sizeof(int), i * n / nstreams * sizeof(int), nbytes / nstreams, streams[i].Stream);
        }
    }
    stop_event.Record();
    stop_event.Synchronize();
    elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event);
    Console.Write("{0} streams:\t{1:0.00} ({2:0.00} expected with compute capability 1.1 or later)\n", nstreams, elapsed_time / nreps, time_kernel + time_memcpy / nstreams);

    // check whether the output is correct
    Console.Write("-------------------------------\n");
    //We can directly access data in hAligned_a using the [] operator, but copying
    //data first to h_a is faster.
    System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, nbytes / sizeof(int));

    bool bResults = correct_data(h_a, n, c * nreps * niterations);

    // release resources
    for (int i = 0; i < nstreams; i++)
    {
        streams[i].Dispose();
    }
    start_event.Dispose();
    stop_event.Dispose();

    hAligned_a.Dispose();
    d_a.Dispose();
    d_c.Dispose();
    CudaContext.ProfilerStop();
    ctx.Dispose();

    Console.ReadKey();
    ShrQATest.shrQAFinishExit(args, bResults ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED);
}
/// <summary>
/// simpleCUBLAS sample: fills two N x N matrices with pseudo-random values, multiplies them once on the
/// host via <c>simple_sgemm</c> and once on the device via <c>CudaBlas.Gemm</c>, and compares both results
/// using the relative L2 norm of the difference.
/// </summary>
/// <param name="args">Command line arguments (not used).</param>
static void Main(string[] args)
{
    int N = 275;
    int n2 = N * N;
    float alpha = 1.0f;
    float beta = 0.0f;
    float error_norm = 0;
    float ref_norm = 0;

    Console.WriteLine("simpleCUBLAS test running.");

    /* Allocate host memory for the matrices */
    float[] h_A = new float[n2];
    float[] h_B = new float[n2];
    float[] h_C_ref = new float[n2];

    /* Fill the matrices with test data (fixed seed, so every run uses the same values) */
    Random rand = new Random(0);
    for (int i = 0; i < n2; i++)
    {
        h_A[i] = (float)rand.NextDouble();
        h_B[i] = (float)rand.NextDouble();
    }

    /* Performs operation using plain C code */
    simple_sgemm(N, alpha, h_A, h_B, beta, h_C_ref);

    // The using blocks release the CUBLAS handle and all device buffers on every exit path,
    // including exceptions. The handle is created first and therefore disposed last.
    using (CudaBlas handle = new CudaBlas())
    using (CudaDeviceVariable<float> d_A = new CudaDeviceVariable<float>(n2))
    using (CudaDeviceVariable<float> d_B = new CudaDeviceVariable<float>(n2))
    using (CudaDeviceVariable<float> d_C = new CudaDeviceVariable<float>(n2))
    {
        /* Initialize the device matrices with the host matrices */
        d_A.CopyToDevice(h_A);
        d_B.CopyToDevice(h_B);

        /* Performs operation using cublas */
        handle.Gemm(Operation.NonTranspose, Operation.NonTranspose, N, N, N, alpha, d_A, N, d_B, N, beta, d_C, N);

        /* Read back the result from device memory (implicit conversion to a host array) */
        float[] h_C = d_C;

        /* Check result against reference */
        for (int i = 0; i < n2; ++i)
        {
            float diff = h_C_ref[i] - h_C[i];
            error_norm += diff * diff;
            ref_norm += h_C_ref[i] * h_C_ref[i];
        }
    }

    error_norm = (float)Math.Sqrt((double)error_norm);
    ref_norm = (float)Math.Sqrt((double)ref_norm);

    // A vanishing reference norm makes the relative error meaningless.
    if (Math.Abs(ref_norm) < 1e-7)
    {
        Console.WriteLine("!!!! reference norm is 0");
        return;
    }

    if (error_norm / ref_norm < 1e-6f)
    {
        Console.WriteLine("simpleCUBLAS test passed.");
    }
    else
    {
        Console.WriteLine("simpleCUBLAS test failed.");
    }
}
/// <summary>
/// Image pixel minimum and maximum values with their indices. Buffer is internally allocated and freed.
/// </summary>
/// <param name="coi">Channel of interest (0, 1 or 2)</param>
/// <param name="min">Allocated device memory with size of at least 1 * sizeof(ushort)</param>
/// <param name="max">Allocated device memory with size of at least 1 * sizeof(ushort)</param>
/// <param name="minIndex">Allocated device memory with size of at least 1 * sizeof(NppiPoint)</param>
/// <param name="maxIndex">Allocated device memory with size of at least 1 * sizeof(NppiPoint)</param>
/// <param name="mask">If the mask is filled with zeros, then all the returned values are zeros, i.e., pMinIndex = {0, 0}, pMaxIndex = {0, 0}, pMinValue = 0, pMaxValue = 0.</param>
public void MinMaxIndex(int coi, CudaDeviceVariable<ushort> min, CudaDeviceVariable<ushort> max, CudaDeviceVariable<NppiPoint> minIndex, CudaDeviceVariable<NppiPoint> maxIndex, NPPImage_8uC1 mask)
{
    int bufferSize = MinMaxIndexMaskedGetBufferHostSize();
    CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize);
    try
    {
        status = NPPNativeMethods.NPPi.MinMaxIndxNew.nppiMinMaxIndx_16u_C3CMR(_devPtrRoi, _pitch, mask.DevicePointerRoi, mask.Pitch, _sizeRoi, coi, min.DevicePointer, max.DevicePointer, minIndex.DevicePointer, maxIndex.DevicePointer, buffer.DevicePointer);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiMinMaxIndx_16u_C3CMR", status));
    }
    finally
    {
        // Free the scratch buffer even if an argument is null or the native call throws.
        buffer.Dispose();
    }
    // The status is checked only after the scratch buffer has been released.
    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// CrossCorrValid_NormLevel. Buffer is internally allocated and freed. Not affecting Alpha.
/// </summary>
/// <param name="tpl">template image.</param>
/// <param name="dst">Destination image</param>
/// <param name="nScaleFactor">Integer Result Scaling.</param>
public void CrossCorrValid_NormLevelA(NPPImage_8uC4 tpl, NPPImage_8uC4 dst, int nScaleFactor)
{
    int bufferSize = ValidNormLevelScaledAGetBufferHostSize();
    CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize);
    try
    {
        // NOTE(review): dst is passed via DevicePointer while this image and tpl are passed via their
        // ROI pointers - confirm that ignoring the destination ROI offset is intended.
        status = NPPNativeMethods.NPPi.ImageProximity.nppiCrossCorrValid_NormLevel_8u_AC4RSfs(_devPtrRoi, _pitch, _sizeRoi, tpl.DevicePointerRoi, tpl.Pitch, tpl.SizeRoi, dst.DevicePointer, dst.Pitch, nScaleFactor, buffer.DevicePointer);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiCrossCorrValid_NormLevel_8u_AC4RSfs", status));
    }
    finally
    {
        // Free the scratch buffer even if an argument is null or the native call throws.
        buffer.Dispose();
    }
    // The status is checked only after the scratch buffer has been released.
    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// image L2 norm. Buffer is internally allocated and freed.
/// </summary>
/// <param name="coi">Channel of interest (0, 1 or 2)</param>
/// <param name="norm">Allocated device memory with size of at least 1 * sizeof(double)</param>
/// <param name="mask">mask</param>
public void NormL2(int coi, CudaDeviceVariable<double> norm, NPPImage_8uC1 mask)
{
    int bufferSize = NormL2MaskedGetBufferHostSize();
    CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize);
    try
    {
        status = NPPNativeMethods.NPPi.NormL2.nppiNorm_L2_16u_C3CMR(_devPtrRoi, _pitch, mask.DevicePointerRoi, mask.Pitch, _sizeRoi, coi, norm.DevicePointer, buffer.DevicePointer);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiNorm_L2_16u_C3CMR", status));
    }
    finally
    {
        // Free the scratch buffer even if an argument is null or the native call throws.
        buffer.Dispose();
    }
    // The status is checked only after the scratch buffer has been released.
    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// Four-channel 32-bit signed image DotProd. Buffer is internally allocated and freed. Ignoring alpha channel.
/// </summary>
/// <param name="src2">2nd source image</param>
/// <param name="pDp">Pointer to the computed dot product of the two images. (3 * sizeof(double))</param>
public void ADotProduct(NPPImage_32sC4 src2, CudaDeviceVariable<double> pDp)
{
    int bufferSize = DotProdGetBufferHostSize();
    CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize);
    try
    {
        status = NPPNativeMethods.NPPi.DotProd.nppiDotProd_32s64f_AC4R(_devPtrRoi, _pitch, src2.DevicePointerRoi, src2.Pitch, _sizeRoi, pDp.DevicePointer, buffer.DevicePointer);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiDotProd_32s64f_AC4R", status));
    }
    finally
    {
        // Free the scratch buffer even if an argument is null or the native call throws.
        buffer.Dispose();
    }
    // The status is checked only after the scratch buffer has been released.
    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// Histogram with evenly distributed bins. Buffer is internally allocated and freed.
/// </summary>
/// <param name="histogram">Allocated device memory of size nLevels (4 Variables)</param>
/// <param name="nLowerLevel">Lower boundary of lowest level bin. E.g. 0 for [0..255]. Size = 4</param>
/// <param name="nUpperLevel">Upper boundary of highest level bin. E.g. 256 for [0..255]. Size = 4</param>
public void HistogramEven(CudaDeviceVariable<int>[] histogram, int[] nLowerLevel, int[] nUpperLevel)
{
    // One entry per channel; the level count handed to NPP is the histogram size plus one.
    int[] size = new int[] { histogram[0].Size + 1, histogram[1].Size + 1, histogram[2].Size + 1, histogram[3].Size + 1 };
    CUdeviceptr[] devPtrs = new CUdeviceptr[] { histogram[0].DevicePointer, histogram[1].DevicePointer, histogram[2].DevicePointer, histogram[3].DevicePointer };

    int bufferSize = HistogramEvenGetBufferSize(size);
    CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize);
    try
    {
        status = NPPNativeMethods.NPPi.Histogram.nppiHistogramEven_8u_C4R(_devPtrRoi, _pitch, _sizeRoi, devPtrs, size, nLowerLevel, nUpperLevel, buffer.DevicePointer);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiHistogramEven_8u_C4R", status));
    }
    finally
    {
        // Free the scratch buffer even if the native call throws.
        buffer.Dispose();
    }
    // The status is checked only after the scratch buffer has been released.
    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// Disposes the wrapped <c>_ptr</c> object.
/// </summary>
public void Dispose() => _ptr.Dispose();
/// <summary>
/// Image pixel minimum and maximum. Buffer is internally allocated and freed.
/// </summary>
/// <param name="min">Allocated device memory with size of at least 4 * sizeof(byte)</param>
/// <param name="max">Allocated device memory with size of at least 4 * sizeof(byte)</param>
public void MinMax(CudaDeviceVariable<byte> min, CudaDeviceVariable<byte> max)
{
    int bufferSize = MinMaxGetBufferHostSize();
    CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize);
    try
    {
        status = NPPNativeMethods.NPPi.MinMaxNew.nppiMinMax_8u_C4R(_devPtrRoi, _pitch, _sizeRoi, min.DevicePointer, max.DevicePointer, buffer.DevicePointer);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiMinMax_8u_C4R", status));
    }
    finally
    {
        // Free the scratch buffer even if an argument is null or the native call throws.
        buffer.Dispose();
    }
    // The status is checked only after the scratch buffer has been released.
    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// Advances the fluid simulation by one step, advects the particles inside the mapped D3D9 vertex
/// buffer, draws them as point sprites and, every <c>fpsLimit</c> frames, writes the measured frame
/// rate into the window title.
/// </summary>
private void display()
{
    stopwatch.Start();

    // Velocity update: advect, diffuse/project between forward and inverse FFT, then write back.
    advectVelocity(g_dvfield, g_vxfield, g_vyfield, DIM, RPADW, DIM, DT, g_tPitch);

    // Forward FFT
    g_planr2c.Exec(g_vxfield.DevicePointer);
    g_planr2c.Exec(g_vyfield.DevicePointer);

    diffuseProject(g_vxfield, g_vyfield, CPADW, DIM, DT, VIS, g_tPitch);

    // Inverse FFT
    g_planc2r.Exec(g_vxfield.DevicePointer);
    g_planc2r.Exec(g_vyfield.DevicePointer);

    updateVelocity(g_dvfield, g_vxfield, g_vyfield, DIM, RPADW, DIM, g_tPitch);

    // Map D3D9 vertex buffer to CUDA; the wrapper from the previous frame is disposed before a
    // new mapped pointer is fetched.
    graphicsres.MapAllResources();
    g_mparticles?.Dispose();
    g_mparticles = graphicsres[0].GetMappedPointer<vertex>();
    advectParticles(g_mparticles, g_dvfield, DIM, DIM, DT, g_tPitch);
    graphicsres.UnmapAllResources();

    // Render state for additive point sprites.
    device.Clear(ClearFlags.Target, new Color4(0.0f, 0, 0), 0.0f, 0);
    device.SetRenderState(RenderState.ZWriteEnable, false);
    device.SetRenderState(RenderState.AlphaBlendEnable, true);
    device.SetRenderState(RenderState.SourceBlend, Blend.One);
    device.SetRenderState(RenderState.DestinationBlend, Blend.One);
    device.SetRenderState(RenderState.PointSpriteEnable, true);
    float pointSize = 16.0f;
    device.SetRenderState(RenderState.PointSize, pointSize);
    device.SetTexture(0, g_pTexture);

    if (device.BeginScene().IsSuccess)
    {
        // Draw particles (the returned result codes are not inspected).
        device.SetStreamSource(0, g_pVB, 0, Marshal.SizeOf(typeof(vertex)));
        device.VertexFormat = VertexFormat.Position | VertexFormat.Diffuse;
        device.DrawPrimitives(PrimitiveType.PointList, 0, DS);
        device.EndScene();
    }

    stopwatch.Stop();
    device.Present();

    fpsCount++;
    if (fpsCount != fpsLimit)
    {
        return;
    }

    // Refresh the title and re-arm the counter so the next refresh happens after about ifps frames.
    float elapsedMs = stopwatch.GetElapsedTime();
    float ifps = 1.0f / (elapsedMs / 1000.0f);
    myWindow.Title = string.Format(System.Globalization.CultureInfo.InvariantCulture, "CUDA/D3D9 Stable Fluids ({0} x {1}): {2} fps", DIM, DIM, ifps);
    fpsCount = 0;
    fpsLimit = (int)Math.Max(ifps, 1.0f);
}
/// <summary>
/// Image pixel maximum. Buffer is internally allocated and freed. Not affecting alpha.
/// </summary>
/// <param name="max">Allocated device memory with size of at least 3 * sizeof(byte)</param>
/// <param name="indexX">Allocated device memory with size of at least 3 * sizeof(int)</param>
/// <param name="indexY">Allocated device memory with size of at least 3 * sizeof(int)</param>
public void MaxIndexA(CudaDeviceVariable<byte> max, CudaDeviceVariable<int> indexX, CudaDeviceVariable<int> indexY)
{
    int bufferSize = MaxIndexGetBufferHostSizeA();
    CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize);
    try
    {
        // For this entry point the scratch buffer comes before the result pointers.
        status = NPPNativeMethods.NPPi.MaxIdx.nppiMaxIndx_8u_AC4R(_devPtrRoi, _pitch, _sizeRoi, buffer.DevicePointer, max.DevicePointer, indexX.DevicePointer, indexY.DevicePointer);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiMaxIndx_8u_AC4R", status));
    }
    finally
    {
        // Free the scratch buffer even if an argument is null or the native call throws.
        buffer.Dispose();
    }
    // The status is checked only after the scratch buffer has been released.
    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// Window closing handler: stops the frame timer and disposes every CUDA and Direct3D object held
/// by this window that is currently set. The CUDA context is disposed last.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Cancel arguments (not modified).</param>
private void myWindow_Closing(object sender, System.ComponentModel.CancelEventArgs e)
{
    //Stop render loop before closing
    if (frameTimer != null)
    {
        frameTimer.Tick -= frameTimer_Tick;
        frameTimer.Stop();
    }

    //Cleanup: CUDA side objects
    graphicsres?.Dispose();
    g_mparticles?.Dispose();
    stopwatch?.Dispose();
    texref?.Dispose();
    g_dvfield?.Dispose();
    g_vxfield?.Dispose();
    g_vyfield?.Dispose();
    g_planc2r?.Dispose();
    g_planr2c?.Dispose();

    //Cleanup: Direct3D objects
    g_pVB?.Dispose();
    g_pTexture?.Dispose();
    device?.Dispose();
    d3d?.Dispose();

    //Cleanup: CUDA context
    ctx?.Dispose();
}
/// <summary>
/// image L1 norm. Buffer is internally allocated and freed.
/// </summary>
/// <param name="norm">Allocated device memory with size of at least 3 * sizeof(double)</param>
public void NormL1(CudaDeviceVariable<double> norm)
{
    int bufferSize = NormL1GetBufferHostSize();
    CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize);
    try
    {
        status = NPPNativeMethods.NPPi.NormL1.nppiNorm_L1_8u_AC4R(_devPtrRoi, _pitch, _sizeRoi, norm.DevicePointer, buffer.DevicePointer);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiNorm_L1_8u_AC4R", status));
    }
    finally
    {
        // Free the scratch buffer even if an argument is null or the native call throws.
        buffer.Dispose();
    }
    // The status is checked only after the scratch buffer has been released.
    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// Runs <c>baryKernel</c> over the given triangles and gathers, for every pixel the kernel flagged as
/// valid, the depth value and the per-key attribute values of each valid fragment.
/// </summary>
/// <param name="primitives">Triangles to process. Each vertex is indexed by attribute name; the entry
/// under <c>VertexShader.PositionName</c> is read as a <c>Vector4</c>.</param>
/// <param name="dataKeys">Attribute names to transfer. Values of type float, Vector2, Vector3 and Vector4
/// are packed; other value types are skipped. The attribute layout is derived from the first vertex of
/// the first triangle.</param>
/// <returns>The per-pixel result, which is also stored in <c>OutDataThreaded</c>.</returns>
public BarycentricReturnMultiple Execute(List<Triangle> primitives, Dictionary<string, object>.KeyCollection dataKeys)
{
    int triangleCount = primitives.Count;

    // There is nothing to launch for an empty triangle list (and primitives[0] below would be out
    // of range), so hand back an empty result instead of failing.
    if (triangleCount == 0)
    {
        OutDataThreaded = new BarycentricReturnMultiple(Width, Height);
        return OutDataThreaded;
    }

    // Screen-space XY of the three corners of every triangle.
    h_v0 = new float2[triangleCount];
    h_v1 = new float2[triangleCount];
    h_v2 = new float2[triangleCount];
    for (int i = 0; i < triangleCount; i++)
    {
        Vector4 p0 = (Vector4)primitives[i][0][VertexShader.PositionName];
        Vector4 p1 = (Vector4)primitives[i][1][VertexShader.PositionName];
        Vector4 p2 = (Vector4)primitives[i][2][VertexShader.PositionName];
        h_v0[i] = new float2(p0.X, p0.Y);
        h_v1[i] = new float2(p1.X, p1.Y);
        h_v2[i] = new float2(p2.X, p2.Y);
    }

    // Despite its name this counts floats per vertex, not bytes: slot 0 is the depth (position Z),
    // followed by the components of every transferred attribute.
    int dataByteSize = 1;
    foreach (var key in dataKeys)
    {
        if (key == VertexShader.PositionName)
        {
            continue;
        }
        switch (primitives[0][0][key])
        {
            case float _:
                dataByteSize += 1;
                break;
            case Vector2 _:
                dataByteSize += 2;
                break;
            case Vector3 _:
                dataByteSize += 3;
                break;
            case Vector4 _:
                dataByteSize += 4;
                break;
        }
    }

    // Pack the per-vertex data: one array per triangle corner, dataByteSize floats per triangle.
    float[] h_da = new float[dataByteSize * triangleCount];
    float[] h_db = new float[dataByteSize * triangleCount];
    float[] h_dc = new float[dataByteSize * triangleCount];
    for (int i = 0; i < triangleCount; i++)
    {
        h_da[i * dataByteSize] = ((Vector4)primitives[i][0][VertexShader.PositionName]).Z;
        h_db[i * dataByteSize] = ((Vector4)primitives[i][1][VertexShader.PositionName]).Z;
        h_dc[i * dataByteSize] = ((Vector4)primitives[i][2][VertexShader.PositionName]).Z;

        int currentIndex = i * dataByteSize + 1;
        foreach (var key in dataKeys)
        {
            if (key == VertexShader.PositionName)
            {
                continue;
            }
            switch (primitives[i][0][key])
            {
                case float _:
                {
                    h_da[currentIndex] = (float)primitives[i][0][key];
                    h_db[currentIndex] = (float)primitives[i][1][key];
                    h_dc[currentIndex] = (float)primitives[i][2][key];
                    currentIndex += 1;
                    break;
                }
                case Vector2 _:
                {
                    Vector2 v0 = (Vector2)primitives[i][0][key];
                    Vector2 v1 = (Vector2)primitives[i][1][key];
                    Vector2 v2 = (Vector2)primitives[i][2][key];
                    h_da[currentIndex] = v0.X;
                    h_da[currentIndex + 1] = v0.Y;
                    h_db[currentIndex] = v1.X;
                    h_db[currentIndex + 1] = v1.Y;
                    h_dc[currentIndex] = v2.X;
                    h_dc[currentIndex + 1] = v2.Y;
                    currentIndex += 2;
                    break;
                }
                case Vector3 _:
                {
                    Vector3 v0 = (Vector3)primitives[i][0][key];
                    Vector3 v1 = (Vector3)primitives[i][1][key];
                    Vector3 v2 = (Vector3)primitives[i][2][key];
                    h_da[currentIndex] = v0.X;
                    h_da[currentIndex + 1] = v0.Y;
                    h_da[currentIndex + 2] = v0.Z;
                    h_db[currentIndex] = v1.X;
                    h_db[currentIndex + 1] = v1.Y;
                    h_db[currentIndex + 2] = v1.Z;
                    h_dc[currentIndex] = v2.X;
                    h_dc[currentIndex + 1] = v2.Y;
                    h_dc[currentIndex + 2] = v2.Z;
                    currentIndex += 3;
                    break;
                }
                case Vector4 _:
                {
                    Vector4 v0 = (Vector4)primitives[i][0][key];
                    Vector4 v1 = (Vector4)primitives[i][1][key];
                    Vector4 v2 = (Vector4)primitives[i][2][key];
                    h_da[currentIndex] = v0.X;
                    h_da[currentIndex + 1] = v0.Y;
                    h_da[currentIndex + 2] = v0.Z;
                    h_da[currentIndex + 3] = v0.W;
                    h_db[currentIndex] = v1.X;
                    h_db[currentIndex + 1] = v1.Y;
                    h_db[currentIndex + 2] = v1.Z;
                    h_db[currentIndex + 3] = v1.W;
                    h_dc[currentIndex] = v2.X;
                    h_dc[currentIndex + 1] = v2.Y;
                    h_dc[currentIndex + 2] = v2.Z;
                    h_dc[currentIndex + 3] = v2.W;
                    currentIndex += 4;
                    break;
                }
            }
        }
    }

    // Only the per-pixel flag array is needed on the host before the launch: it is uploaded as
    // zeros. h_dOut and h_dOut_valid_fragment are assigned from the device results below, so
    // allocating them up front would only create garbage.
    h_dOut_valid_pixel = new int[Width * Height];

    CudaDeviceVariable<float2> dev_v0 = null;
    CudaDeviceVariable<float2> dev_v1 = null;
    CudaDeviceVariable<float2> dev_v2 = null;
    CudaDeviceVariable<float> dev_da = null;
    CudaDeviceVariable<float> dev_db = null;
    CudaDeviceVariable<float> dev_dc = null;
    CudaDeviceVariable<float> dev_dOut = null;
    CudaDeviceVariable<int> dev_dOut_valid = null;
    CudaDeviceVariable<int> dev_dOut_valid2 = null;
    try
    {
        // Implicit conversion operators: allocation of device memory and data copy is one operation.
        dev_v0 = h_v0;
        dev_v1 = h_v1;
        dev_v2 = h_v2;
        dev_da = h_da;
        dev_db = h_db;
        dev_dc = h_dc;
        dev_dOut = new CudaDeviceVariable<float>(Width * Height * dataByteSize * primitives.Count);
        dev_dOut_valid = new CudaDeviceVariable<int>(Width * Height * primitives.Count);
        dev_dOut_valid2 = h_dOut_valid_pixel;

        dim3 windowSize = new dim3(Width, Height);
        dim3 blockSize = new dim3(8, 8, 8);
        // One extra block per axis so sizes that are not a multiple of the block size are covered.
        dim3 gridSize = new dim3(windowSize.x / blockSize.x + 1, windowSize.y / blockSize.y + 1, ((uint)primitives.Count * (uint)dataByteSize) / blockSize.z + 1);

        baryKernel.BlockDimensions = blockSize;
        baryKernel.GridDimensions = gridSize;
        baryKernel.Run(dev_v0.DevicePointer, dev_v1.DevicePointer, dev_v2.DevicePointer, dataByteSize, primitives.Count, dev_da.DevicePointer, dev_db.DevicePointer, dev_dc.DevicePointer, dev_dOut.DevicePointer, dev_dOut_valid.DevicePointer, dev_dOut_valid2.DevicePointer, Width, Height);

        // Copy results from device memory to host memory (implicit conversion to host arrays).
        h_dOut = dev_dOut;
        h_dOut_valid_fragment = dev_dOut_valid;
        h_dOut_valid_pixel = dev_dOut_valid2;
    }
    finally
    {
        // Release all device buffers, including when an upload, the launch or a download throws.
        dev_v0?.Dispose();
        dev_v1?.Dispose();
        dev_v2?.Dispose();
        dev_da?.Dispose();
        dev_db?.Dispose();
        dev_dc?.Dispose();
        dev_dOut?.Dispose();
        dev_dOut_valid?.Dispose();
        dev_dOut_valid2?.Dispose();
    }

    OutDataThreaded = new BarycentricReturnMultiple(Width, Height);

    int dataRowSize = Width;
    int dataGridSize = dataRowSize * Height;
    int triangleBlockSize = dataGridSize * dataByteSize;

    // Worker i handles the columns i, i + ThreadCount, ... so each pixel is written by exactly one worker.
    Parallel.For(0, ThreadCount, (i) =>
    {
        for (int x = i; x < Width; x += ThreadCount)
        {
            for (int y = 0; y < Height; y++)
            {
                if (h_dOut_valid_pixel[x + y * dataRowSize] == 0)
                {
                    continue;
                }

                for (int z = 0; z < primitives.Count; z++)
                {
                    if (h_dOut_valid_fragment[x + y * dataRowSize + z * dataGridSize] == 0)
                    {
                        continue;
                    }

                    int dataBaseIndex = x + y * dataRowSize + z * triangleBlockSize;

                    // Per-pixel containers are created when the first valid fragment is found.
                    if (OutDataThreaded.Depths[x, y] == null)
                    {
                        OutDataThreaded.Depths[x, y] = new List<float>();
                        OutDataThreaded.FragmentData[x, y] = new Dictionary<string, IList>();
                        OutDataThreaded.FragmentCount[x, y] = 0;
                    }

                    OutDataThreaded.Depths[x, y].Add(h_dOut[dataBaseIndex]);
                    OutDataThreaded.FragmentCount[x, y]++;

                    // Data slot 0 is the depth and was stored above; attribute slots start at 1.
                    int currentDataPoint = 1;
                    foreach (var key in dataKeys)
                    {
                        if (key == VertexShader.PositionName)
                        {
                            continue;
                        }

                        if (!OutDataThreaded.FragmentData[x, y].ContainsKey(key))
                        {
                            OutDataThreaded.FragmentData[x, y].Add(key, new List<object>());
                        }

                        switch (primitives[z][0][key])
                        {
                            case float _:
                            {
                                OutDataThreaded.FragmentData[x, y][key].Add((float)h_dOut[dataBaseIndex + (currentDataPoint + 0) * dataGridSize]);
                                currentDataPoint += 1;
                                break;
                            }
                            case Vector2 _:
                            {
                                Vector2 vec2 = new Vector2(h_dOut[dataBaseIndex + (currentDataPoint + 0) * dataGridSize], h_dOut[dataBaseIndex + (currentDataPoint + 1) * dataGridSize]);
                                OutDataThreaded.FragmentData[x, y][key].Add(vec2);
                                currentDataPoint += 2;
                                break;
                            }
                            case Vector3 _:
                            {
                                Vector3 vec3 = new Vector3(h_dOut[dataBaseIndex + (currentDataPoint + 0) * dataGridSize], h_dOut[dataBaseIndex + (currentDataPoint + 1) * dataGridSize], h_dOut[dataBaseIndex + (currentDataPoint + 2) * dataGridSize]);
                                OutDataThreaded.FragmentData[x, y][key].Add(vec3);
                                currentDataPoint += 3;
                                break;
                            }
                            case Vector4 _:
                            {
                                Vector4 vec4 = new Vector4(h_dOut[dataBaseIndex + (currentDataPoint + 0) * dataGridSize], h_dOut[dataBaseIndex + (currentDataPoint + 1) * dataGridSize], h_dOut[dataBaseIndex + (currentDataPoint + 2) * dataGridSize], h_dOut[dataBaseIndex + (currentDataPoint + 3) * dataGridSize]);
                                OutDataThreaded.FragmentData[x, y][key].Add(vec4);
                                currentDataPoint += 4;
                                break;
                            }
                        }
                    }
                }
            }
        }
    });

    return OutDataThreaded;
}
/// <summary>
/// image QualityIndex. Buffer is internally allocated and freed. Not affecting Alpha.
/// </summary>
/// <param name="src2">2nd source image</param>
/// <param name="dst">Pointer to the quality index. (3 * sizeof(float))</param>
public void QualityIndexA(NPPImage_8uC4 src2, CudaDeviceVariable<float> dst)
{
    int bufferSize = QualityIndexAGetBufferHostSize();
    CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize);
    try
    {
        status = NPPNativeMethods.NPPi.QualityIndex.nppiQualityIndex_8u32f_AC4R(_devPtrRoi, _pitch, src2.DevicePointerRoi, src2.Pitch, _sizeRoi, dst.DevicePointer, buffer.DevicePointer);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiQualityIndex_8u32f_AC4R", status));
    }
    finally
    {
        // Free the scratch buffer even if an argument is null or the native call throws.
        buffer.Dispose();
    }
    // The status is checked only after the scratch buffer has been released.
    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// Form closing handler: clears <c>isRunning</c> and disposes every CUDA and Direct3D object held by
/// this form that is currently set. The CUDA context is disposed last.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Closing arguments (not modified).</param>
private void Form1_FormClosing(object sender, FormClosingEventArgs e)
{
    isRunning = false;

    //Cleanup: CUDA side objects
    graphicsres?.Dispose();
    g_mparticles?.Dispose();
    stopwatch?.Dispose();
    texref?.Dispose();
    g_dvfield?.Dispose();
    g_vxfield?.Dispose();
    g_vyfield?.Dispose();
    g_planc2r?.Dispose();
    g_planr2c?.Dispose();

    //Cleanup: Direct3D objects
    g_pVB?.Dispose();
    g_pTexture?.Dispose();
    device?.Dispose();
    d3d?.Dispose();

    //Cleanup: CUDA context
    ctx?.Dispose();
}
/// <summary>
/// image NormRel_L2. Buffer is internally allocated and freed. Not affecting Alpha.
/// </summary>
/// <param name="tpl">template image.</param>
/// <param name="pNormRel">Pointer to the computed relative error for the L2 norm of two images. (3 * sizeof(double))</param>
public void NormRel_L2A(NPPImage_8uC4 tpl, CudaDeviceVariable<double> pNormRel)
{
    int bufferSize = NormRelL2AGetBufferHostSize();
    CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize);
    try
    {
        status = NPPNativeMethods.NPPi.NormRel.nppiNormRel_L2_8u_AC4R(_devPtrRoi, _pitch, tpl.DevicePointerRoi, tpl.Pitch, _sizeRoi, pNormRel.DevicePointer, buffer.DevicePointer);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiNormRel_L2_8u_AC4R", status));
    }
    finally
    {
        // Free the scratch buffer even if an argument is null or the native call throws.
        buffer.Dispose();
    }
    // The status is checked only after the scratch buffer has been released.
    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// Histogram with evenly distributed bins. Buffer is internally allocated and freed.
/// </summary>
/// <param name="histogram">Allocated device memory of size nLevels</param>
/// <param name="nLowerLevel">Lower boundary of lowest level bin. E.g. 0 for [0..255]</param>
/// <param name="nUpperLevel">Upper boundary of highest level bin. E.g. 256 for [0..255]</param>
public void HistogramEven(CudaDeviceVariable<int> histogram, int nLowerLevel, int nUpperLevel)
{
    // The level count handed to NPP is the histogram size plus one.
    int nLevels = histogram.Size + 1;

    int bufferSize = HistogramEvenGetBufferSize(nLevels);
    CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize);
    try
    {
        status = NPPNativeMethods.NPPi.Histogram.nppiHistogramEven_16s_C1R(_devPtrRoi, _pitch, _sizeRoi, histogram.DevicePointer, nLevels, nLowerLevel, nUpperLevel, buffer.DevicePointer);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiHistogramEven_16s_C1R", status));
    }
    finally
    {
        // Free the scratch buffer even if the native call throws.
        buffer.Dispose();
    }
    // The status is checked only after the scratch buffer has been released.
    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// Result pixel value is the median of pixel values under the rectangular mask region, ignoring alpha channel.
/// Buffer is internally allocated and freed.
/// </summary>
/// <param name="dst">Destination-Image</param>
/// <param name="oMaskSize">Width and Height of the neighborhood region for the local Median operation.</param>
/// <param name="oAnchor">X and Y offsets of the kernel origin frame of reference relative to the source pixel.</param>
public void FilterMedianA(NPPImage_8uC4 dst, NppiSize oMaskSize, NppiPoint oAnchor)
{
    int bufferSize = FilterMedianGetBufferHostSizeA(oMaskSize);
    CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize);
    try
    {
        status = NPPNativeMethods.NPPi.ImageMedianFilter.nppiFilterMedian_8u_AC4R(_devPtrRoi, _pitch, dst.DevicePointerRoi, dst.Pitch, _sizeRoi, oMaskSize, oAnchor, buffer.DevicePointer);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiFilterMedian_8u_AC4R", status));
    }
    finally
    {
        // Free the scratch buffer even if an argument is null or the native call throws.
        buffer.Dispose();
    }
    // The status is checked only after the scratch buffer has been released.
    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// Disposes <c>Xopt</c> first and then forwards to the base class implementation.
/// </summary>
public override void Dispose()
{
    Xopt.Dispose();
    base.Dispose();
}
/// <summary>
/// image average relative error. User buffer is internally allocated and freed.
/// </summary>
/// <param name="src2">2nd source image</param>
/// <param name="pError">Pointer to the computed error.</param>
public void AverageRelativeError(NPPImage_32sC4 src2, CudaDeviceVariable<double> pError)
{
    int bufferSize = AverageRelativeErrorGetBufferHostSize();
    CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize);
    try
    {
        status = NPPNativeMethods.NPPi.AverageRelativeError.nppiAverageRelativeError_32s_C4R(_devPtrRoi, _pitch, src2.DevicePointerRoi, src2.Pitch, _sizeRoi, pError.DevicePointer, buffer.DevicePointer);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiAverageRelativeError_32s_C4R", status));
    }
    finally
    {
        // Free the scratch buffer even if an argument is null or the native call throws.
        buffer.Dispose();
    }
    // The status is checked only after the scratch buffer has been released.
    NPPException.CheckNppStatus(status, this);
}
/// <summary>
/// Process entry point of the CUDA solver.
/// Parses the command line, optionally connects to the master process, creates the CUDA
/// context and kernels, allocates device/host buffers and then loops: recover nonces for
/// queued solutions, run the trimming kernels for the next job, and hand the surviving
/// edges to a cycle finder on a background task.
/// </summary>
/// <param name="args">
/// <c>args[0]</c>: device id; <c>args[1]</c>: master port (when absent the solver runs in
/// TEST mode); <c>args[3]</c>: GPU count used to choose the context flags.
/// </param>
static void Main(string[] args)
{
    try
    {
        if (args.Length > 0)
        {
            deviceID = int.Parse(args[0]);
        }
    }
    catch (Exception)
    {
        Logger.Log(LogLevel.Error, "Device ID parse error");
    }

    try
    {
        if (args.Length > 1)
        {
            port = int.Parse(args[1]);
            Comms.ConnectToMaster(port);
        }
        else
        {
            // No port given: stand-alone test mode with console logging.
            TEST = true;
            Logger.CopyToConsole = true;
            CGraph.ShowCycles = true;
        }
    }
    catch (Exception)
    {
        Logger.Log(LogLevel.Error, "Master connection error");
    }

    try
    {
        if (args.Length > 3)
        {
            gpuCount = int.Parse(args[3]);
            fastCuda = gpuCount <= (Environment.ProcessorCount / 2);
            if (fastCuda)
            {
                Logger.Log(LogLevel.Info, "Using single GPU blocking mode");
            }
        }
    }
    catch
    {
        // Best effort: a bad GPU count leaves the current fastCuda setting untouched.
    }

    // Both TEST and master mode start from the same placeholder job.
    currentJob = nextJob = new Job()
    {
        jobID = 0,
        k0 = 0xf4956dc403730b01L,
        k1 = 0xe6d45de39c2a5a3eL,
        k2 = 0xcbf626a8afee35f6L,
        k3 = 0x4307b94b1a0c9980L,
        pre_pow = TestPrePow,
        timestamp = DateTime.Now
    };

    if (!TEST)
    {
        if (!Comms.IsConnected())
        {
            Console.WriteLine("Master connection failed, aborting");
            Logger.Log(LogLevel.Error, "No master connection, exitting!");
            return;
        }

        if (deviceID < 0)
        {
            // Negative device id: report the available devices to the master and exit.
            int devCnt = CudaContext.GetDeviceCount();
            GpuDevicesMessage gpum = new GpuDevicesMessage() { devices = new List<GpuDevice>(devCnt) };
            for (int i = 0; i < devCnt; i++)
            {
                string name = CudaContext.GetDeviceName(i);
                var info = CudaContext.GetDeviceInfo(i);
                gpum.devices.Add(new GpuDevice() { deviceID = i, name = name, memory = info.TotalGlobalMemory });
            }

            Comms.gpuMsg = gpum;
            Comms.SetEvent();
            Task.Delay(1000).Wait();
            Comms.Close();
            return;
        }
    }

    // Create the CUDA context and load all kernels from the embedded PTX resource.
    try
    {
        var assembly = Assembly.GetEntryAssembly();
        var resourceStream = assembly.GetManifestResourceStream("CudaSolver.kernel_x64.ptx");
        ctx = new CudaContext(deviceID, !fastCuda ? (CUCtxFlags.BlockingSync | CUCtxFlags.MapHost) : CUCtxFlags.MapHost);

        meanSeedA = ctx.LoadKernelPTX(resourceStream, "FluffySeed2A");
        meanSeedA.BlockDimensions = 128;
        meanSeedA.GridDimensions = 2048;
        meanSeedA.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

        meanSeedB = ctx.LoadKernelPTX(resourceStream, "FluffySeed2B");
        meanSeedB.BlockDimensions = 128;
        meanSeedB.GridDimensions = 2048;
        meanSeedB.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

        meanSeedB_4 = ctx.LoadKernelPTX(resourceStream, "FluffySeed2B");
        meanSeedB_4.BlockDimensions = 128;
        meanSeedB_4.GridDimensions = 1024;
        meanSeedB_4.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

        meanRound = ctx.LoadKernelPTX(resourceStream, "FluffyRound");
        meanRound.BlockDimensions = 512;
        meanRound.GridDimensions = 4096;
        meanRound.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

        meanRound_2 = ctx.LoadKernelPTX(resourceStream, "FluffyRound");
        meanRound_2.BlockDimensions = 512;
        meanRound_2.GridDimensions = 2048;
        meanRound_2.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

        meanRoundJoin = ctx.LoadKernelPTX(resourceStream, "FluffyRound_J");
        meanRoundJoin.BlockDimensions = 512;
        meanRoundJoin.GridDimensions = 4096;
        meanRoundJoin.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

        meanTail = ctx.LoadKernelPTX(resourceStream, "FluffyTail");
        meanTail.BlockDimensions = 1024;
        meanTail.GridDimensions = 4096;
        meanTail.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1;

        meanRecover = ctx.LoadKernelPTX(resourceStream, "FluffyRecovery");
        meanRecover.BlockDimensions = 256;
        meanRecover.GridDimensions = 2048;
        meanRecover.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1;
    }
    catch (Exception ex)
    {
        Logger.Log(LogLevel.Error, "Unable to create kernels: " + ex.Message);
        Task.Delay(500).Wait();
        Comms.Close();
        return;
    }

    // Device buffers and streams.
    try
    {
        d_buffer = new CudaDeviceVariable<ulong>(BUFFER_SIZE_U32);
        // d_bufferMid and d_bufferB are built from pointers into d_buffer, not from new allocations.
        d_bufferMid = new CudaDeviceVariable<ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_B * 8));
        d_bufferB = new CudaDeviceVariable<ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_A * 8));

        d_indexesA = new CudaDeviceVariable<uint>(INDEX_SIZE * 2);
        d_indexesB = new CudaDeviceVariable<uint>(INDEX_SIZE * 2);

        Array.Clear(h_indexesA, 0, h_indexesA.Length);
        // Clear each host array over its own length.
        Array.Clear(h_indexesB, 0, h_indexesB.Length);

        // NOTE(review): assigning a host array here presumably goes through an implicit
        // conversion that allocates a fresh device variable, orphaning the two variables
        // created just above - confirm and drop one of the two steps.
        d_indexesA = h_indexesA;
        d_indexesB = h_indexesB;

        streamPrimary = new CudaStream(CUStreamFlags.NonBlocking);
        streamSecondary = new CudaStream(CUStreamFlags.NonBlocking);
    }
    catch (Exception)
    {
        Task.Delay(200).Wait();
        Logger.Log(LogLevel.Error, $"Out of video memory! Only {ctx.GetFreeDeviceMemorySize()} free");
        Task.Delay(500).Wait();
        Comms.Close();
        return;
    }

    try
    {
        AllocateHostMemory(true, ref h_a, ref hAligned_a, 1024 * 1024 * 32);
    }
    catch (Exception)
    {
        Logger.Log(LogLevel.Error, "Unable to create pinned memory.");
        Task.Delay(500).Wait();
        Comms.Close();
        return;
    }

    int loopCnt = 0;

    while (!Comms.IsTerminated)
    {
        try
        {
            if (!TEST && (string.IsNullOrEmpty(Comms.nextJob.pre_pow) || Comms.nextJob.pre_pow == TestPrePow))
            {
                Logger.Log(LogLevel.Info, "Waiting for job....");
                Task.Delay(1000).Wait();
                continue;
            }

            if (!TEST && ((currentJob.pre_pow != Comms.nextJob.pre_pow) || (currentJob.origin != Comms.nextJob.origin)))
            {
                currentJob = Comms.nextJob;
                currentJob.timestamp = DateTime.Now;
            }

            if (!TEST && (currentJob.timestamp.AddMinutes(30) < DateTime.Now) && Comms.lastIncoming.AddMinutes(30) < DateTime.Now)
            {
                Logger.Log(LogLevel.Info, "Job too old...");
                Task.Delay(1000).Wait();
                continue;
            }

            // In TEST mode, request termination once the loop counter has passed 100.
            if (TEST && loopCnt++ > 100)
            {
                Comms.IsTerminated = true;
            }

            // Recover the nonces of every solution the cycle finders have queued so far.
            Solution s;
            while (graphSolutions.TryDequeue(out s))
            {
                meanRecover.SetConstantVariable<ulong>("recovery", s.GetUlongEdges());
                d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                meanRecover.RunAsync(streamPrimary.Stream, s.job.k0, s.job.k1, s.job.k2, s.job.k3, d_indexesB.DevicePointer);
                streamPrimary.Synchronize();
                s.nonces = new uint[40];
                d_indexesB.CopyToHost(s.nonces, 0, 0, 40 * 4);
                s.nonces = s.nonces.OrderBy(n => n).ToArray();
                lock (Comms.graphSolutionsOut)
                {
                    Comms.graphSolutionsOut.Enqueue(s);
                }
                Comms.SetEvent();
            }

            uint[] count;
            do
            {
                if (!TEST && ((currentJob.pre_pow != Comms.nextJob.pre_pow) || (currentJob.origin != Comms.nextJob.origin)))
                {
                    currentJob = Comms.nextJob;
                    currentJob.timestamp = DateTime.Now;
                }
                currentJob = currentJob.Next();

                // {5} is the device id and {4} the job id (previously {4} was used for both).
                Logger.Log(LogLevel.Debug, string.Format("GPU NV{5}:Trimming #{4}: {0} {1} {2} {3}", currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, currentJob.jobID, deviceID));

                timer.Restart();

                // Kernel pipeline, all queued on streamPrimary. The launch order, the
                // index-buffer resets in between and the pointer offsets are significant;
                // do not reorder.
                d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                d_indexesB.MemsetAsync(0, streamPrimary.Stream);

                meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer, d_indexesB.DevicePointer);

                meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 0);
                meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 1, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 16);
                meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 2, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 32);
                meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 3, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 48);

                d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                meanRound_2.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 2, d_bufferB.DevicePointer, d_indexesA.DevicePointer + (2048 * 4), d_indexesB.DevicePointer + (4096 * 4), DUCK_EDGES_A, DUCK_EDGES_B / 2);
                meanRound_2.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer - (BUFFER_SIZE_B * 8), d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_A, DUCK_EDGES_B / 2);

                d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                meanRoundJoin.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer - (BUFFER_SIZE_B * 8), d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2);

                d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2);
                d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2);
                d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 4);

                for (int i = 0; i < trimRounds; i++)
                {
                    d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                    meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4);
                    d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                    meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4);
                }

                d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                meanTail.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer);

                ctx.Synchronize();
                streamPrimary.Synchronize();

                // The first 8 bytes of d_indexesA are read back; count[0] is used as the edge count.
                count = new uint[2];
                d_indexesA.CopyToHost(count, 0, 0, 8);

                if (count[0] > 4194304)
                {
                    // Clamp an unexpectedly large count before using it as a copy size.
                    count[0] = 4194304;
                }

                hAligned_a.AsyncCopyFromDevice(d_buffer.DevicePointer, 0, 0, count[0] * 8, streamPrimary.Stream);
                streamPrimary.Synchronize();
                System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, ((int)count[0] * 8) / sizeof(int));

                timer.Stop();
                currentJob.solvedAt = DateTime.Now;
                currentJob.trimTime = timer.ElapsedMilliseconds;

                Logger.Log(LogLevel.Info, string.Format("GPU NV{2}: Trimmed in {0}ms to {1} edges, h {3}", timer.ElapsedMilliseconds, count[0], deviceID, currentJob.height));
            }
            while ((currentJob.height != Comms.nextJob.height) && (!Comms.IsTerminated) && (!TEST));

            if (TEST)
            {
                CGraph cg = FinderBag.GetFinder();
                if (cg == null)
                {
                    continue;
                }

                cg.SetEdges(h_a, (int)count[0]);
                cg.SetHeader(currentJob);

                Task.Factory.StartNew(() =>
                {
                    Stopwatch sw = new Stopwatch();
                    sw.Start();

                    if (count[0] < 200000)
                    {
                        try
                        {
                            if (findersInFlight++ < 3)
                            {
                                Stopwatch cycleTime = new Stopwatch();
                                cycleTime.Start();
                                cg.FindSolutions(graphSolutions);
                                cycleTime.Stop();
                                AdjustTrims(cycleTime.ElapsedMilliseconds);
                                if (graphSolutions.Count > 0)
                                {
                                    solutions++;
                                }
                            }
                            else
                            {
                                Logger.Log(LogLevel.Warning, "CPU overloaded!");
                            }
                        }
                        catch (Exception ex)
                        {
                            Logger.Log(LogLevel.Error, "Cycle finder error: " + ex.Message);
                        }
                        finally
                        {
                            findersInFlight--;
                            FinderBag.ReturnFinder(cg);
                        }
                    }
                    else
                    {
                        // Graph too large to search: the finder taken above must still be
                        // handed back, otherwise it is never returned to the bag.
                        FinderBag.ReturnFinder(cg);
                    }

                    sw.Stop();

                    if (++trims % 50 == 0)
                    {
                        Console.ForegroundColor = ConsoleColor.Green;
                        Console.WriteLine("SOLS: {0}/{1} - RATE: {2:F1}", solutions, trims, (float)trims / solutions);
                        Console.ResetColor();
                    }

                    Logger.Log(LogLevel.Info, string.Format("Finder completed in {0}ms on {1} edges with {2} solution(s) and {3} dupes", sw.ElapsedMilliseconds, count[0], graphSolutions.Count, cg.dupes));
                });
            }
            else
            {
                CGraph cg = FinderBag.GetFinder();
                if (cg == null)
                {
                    // Same handling as the TEST branch: no finder available, skip this graph.
                    continue;
                }

                cg.SetEdges(h_a, (int)count[0]);
                cg.SetHeader(currentJob);

                Task.Factory.StartNew(() =>
                {
                    if (count[0] < 200000)
                    {
                        try
                        {
                            if (findersInFlight++ < 3)
                            {
                                Stopwatch cycleTime = new Stopwatch();
                                cycleTime.Start();
                                cg.FindSolutions(graphSolutions);
                                cycleTime.Stop();
                                AdjustTrims(cycleTime.ElapsedMilliseconds);
                                if (graphSolutions.Count > 0)
                                {
                                    solutions++;
                                }
                            }
                            else
                            {
                                Logger.Log(LogLevel.Warning, "CPU overloaded!");
                            }
                        }
                        catch (Exception ex)
                        {
                            Logger.Log(LogLevel.Error, "Cycle finder crashed: " + ex.Message);
                        }
                        finally
                        {
                            findersInFlight--;
                            FinderBag.ReturnFinder(cg);
                        }
                    }
                    else
                    {
                        // Graph too large to search: the finder taken above must still be
                        // handed back, otherwise it is never returned to the bag.
                        FinderBag.ReturnFinder(cg);
                    }
                });
            }
        }
        catch (Exception ex)
        {
            Logger.Log(LogLevel.Error, "Critical error in main cuda loop " + ex.Message);
            Task.Delay(5000).Wait();
        }
    }

    // Clean up (best effort; failures during shutdown are deliberately ignored).
    try
    {
        Task.Delay(500).Wait();
        Comms.Close();

        d_buffer.Dispose();
        d_indexesA.Dispose();
        d_indexesB.Dispose();

        streamPrimary.Dispose();
        streamSecondary.Dispose();

        hAligned_a.Dispose();

        if (ctx != null)
        {
            ctx.Dispose();
        }
    }
    catch
    {
    }
}