Example #1
0
 /// <summary>
 /// Disposes the device variable and the three convolution workspaces (when present)
 /// and clears each reference afterwards.
 /// </summary>
 public void Dispose()
 {
     if (DeviceVar != null)
     {
         DeviceVar.Dispose();
     }
     DeviceVar = null;

     if (ConvWorkspace != null)
     {
         ConvWorkspace.Dispose();
     }
     ConvWorkspace = null;

     if (ConvBackWorkspace != null)
     {
         ConvBackWorkspace.Dispose();
     }
     ConvBackWorkspace = null;

     if (ConvBackKernelWorkspace != null)
     {
         ConvBackKernelWorkspace.Dispose();
     }
     ConvBackKernelWorkspace = null;
 }
Example #2
0
    /// <summary>
    /// Disposes d_A, d_B, C and ctx, skipping any that are null.
    /// </summary>
    private void CleanupResources()
    {
        // Free device memory; the null-conditional call already skips null references.
        d_A?.Dispose();
        d_B?.Dispose();
        // d_C is deliberately not disposed here (the call was commented out in the original).

        C?.Dispose();
        ctx?.Dispose();

        // Host memory is left to the garbage collector.
    }
        /// <summary>
        /// image maximum relative error. User buffer is internally allocated and freed.
        /// </summary>
        /// <param name="src2">2nd source image</param>
        /// <param name="pError">Pointer to the computed error.</param>
        /// <param name="nppStreamCtx">NPP stream context.</param>
        public void MaximumRelativeError(NPPImage_32fcC2 src2, CudaDeviceVariable <double> pError, NppStreamContext nppStreamCtx)
        {
            int bufferSize = MaximumRelativeErrorGetBufferHostSize(nppStreamCtx);
            CudaDeviceVariable <byte> buffer = new CudaDeviceVariable <byte>(bufferSize);

            try
            {
                status = NPPNativeMethods_Ctx.NPPi.MaximumRelativeError.nppiMaximumRelativeError_32fc_C2R_Ctx(_devPtrRoi, _pitch, src2.DevicePointerRoi, src2.Pitch, _sizeRoi, pError.DevicePointer, buffer.DevicePointer, nppStreamCtx);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiMaximumRelativeError_32fc_C2R_Ctx", status));
            }
            finally
            {
                // Release the scratch buffer even when argument access or the native call throws.
                buffer.Dispose();
            }

            // Status is checked only after the buffer is gone, so a failing status cannot leak it either.
            NPPException.CheckNppStatus(status, this);
        }
Example #4
0
        /// <summary>
        /// image average relative error. User buffer is internally allocated and freed.
        /// </summary>
        /// <param name="src2">2nd source image</param>
        /// <param name="pError">Pointer to the computed error.</param>
        public void AverageRelativeError(NPPImage_16scC1 src2, CudaDeviceVariable <double> pError)
        {
            int bufferSize = AverageRelativeErrorGetBufferHostSize();
            CudaDeviceVariable <byte> buffer = new CudaDeviceVariable <byte>(bufferSize);

            try
            {
                status = NPPNativeMethods.NPPi.AverageRelativeError.nppiAverageRelativeError_16sc_C1R(_devPtrRoi, _pitch, src2.DevicePointerRoi, src2.Pitch, _sizeRoi, pError.DevicePointer, buffer.DevicePointer);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiAverageRelativeError_16sc_C1R", status));
            }
            finally
            {
                // Release the scratch buffer even when argument access or the native call throws.
                buffer.Dispose();
            }

            // Status is checked only after the buffer has been released.
            NPPException.CheckNppStatus(status, this);
        }
        /// <summary>
        /// Four-channel 32-bit unsigned image DotProd. Buffer is internally allocated and freed.
        /// </summary>
        /// <param name="src2">2nd source image</param>
        /// <param name="pDp">Pointer to the computed dot product of the two images. (4 * sizeof(double))</param>
        /// <param name="nppStreamCtx">NPP stream context.</param>
        public void DotProduct(NPPImage_32uC4 src2, CudaDeviceVariable <double> pDp, NppStreamContext nppStreamCtx)
        {
            int bufferSize = DotProdGetBufferHostSize(nppStreamCtx);
            CudaDeviceVariable <byte> buffer = new CudaDeviceVariable <byte>(bufferSize);

            try
            {
                status = NPPNativeMethods_Ctx.NPPi.DotProd.nppiDotProd_32u64f_C4R_Ctx(_devPtrRoi, _pitch, src2.DevicePointerRoi, src2.Pitch, _sizeRoi, pDp.DevicePointer, buffer.DevicePointer, nppStreamCtx);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiDotProd_32u64f_C4R_Ctx", status));
            }
            finally
            {
                // Release the scratch buffer even when argument access or the native call throws.
                buffer.Dispose();
            }

            // Status is checked only after the buffer has been released.
            NPPException.CheckNppStatus(status, this);
        }
 /// <summary>
 /// Disposes the per-particle device buffers, the two phi buffers and finally the context.
 /// None of the members is null-checked.
 /// </summary>
 public virtual void Dispose()
 {
     // Particle state buffers
     DevicePositions.Dispose();
     DevicePersonalBestValues.Dispose();
     DeviceVelocities.Dispose();
     DevicePersonalBests.Dispose();

     // Phi buffers
     _phis1.Dispose();
     _phis2.Dispose();

     // The context is disposed last, after everything above.
     Ctx.Dispose();
 }
Example #7
0
 /// <summary>
 /// Disposes every element of <c>backward</c> and <c>forward</c>, then the FFT buffer,
 /// the shift buffers and the intermediate images held by this instance.
 /// </summary>
 public void FreeResources()
 {
     // Elements of both collections: backward first, then forward.
     foreach (var backwardEntry in backward)
     {
         backwardEntry.Dispose();
     }

     foreach (var forwardEntry in forward)
     {
         forwardEntry.Dispose();
     }

     // Work buffers
     FFTBuffer.Dispose();
     patchShift.Dispose();
     shiftImages.Dispose();
     squaredSumsOfTiles.Dispose();

     // Images
     imgCrossCorrelation.Dispose();
     imgRefSortedTiles.Dispose();
     imgToTrackSortedTiles.Dispose();
     imgRefCplx.Dispose();
     imgToTrackCplx.Dispose();
 }
 /// <summary>
 /// Disposes the value history and the canvas when they exist, then calls the base implementation.
 /// </summary>
 public override void Dispose()
 {
     m_valuesHistory?.Dispose();
     m_canvas?.Dispose();

     base.Dispose();
 }
        /// <summary>
        /// Sets the texture size from the bin constants and replaces the histogram
        /// device buffer with a new, zero-filled one of <c>BINS</c> entries.
        /// </summary>
        protected override void Reset()
        {
            TextureHeight = BIN_PIXEL_HEIGHT;
            TextureWidth  = BIN_PIXEL_WIDTH * BINS;

            // Dispose the previous buffer (if any) before allocating its replacement.
            m_d_HistogramData?.Dispose();

            m_d_HistogramData = new CudaDeviceVariable <int>(BINS);
            m_d_HistogramData.Memset(0);
        }
Example #10
0
 /// <summary>
 /// For IDisposable. Disposes the device variable on an explicit dispose; otherwise
 /// only writes a debug warning if the object was never disposed.
 /// </summary>
 /// <param name="fDisposing">true for an explicit dispose; false triggers only the not-disposed warning.</param>
 protected virtual void Dispose(bool fDisposing)
 {
     if (disposed)
     {
         // Already disposed: nothing to release and nothing to warn about.
         return;
     }

     if (fDisposing)
     {
         _devVar.Dispose();
         disposed = true;
         // the _texref reference is not destroyed explicitly, as it is done automatically when module is unloaded
     }
     else
     {
         Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
     }
 }
Example #11
0
        /// <summary>
        /// Disposes <c>_data</c> once, and only when this instance is flagged as owning it
        /// and the call is an explicit dispose.
        /// </summary>
        /// <param name="disposing">true for an explicit dispose; when false nothing is released.</param>
        protected virtual void Dispose(bool disposing)
        {
#if DEBUG
            // Debug aid: break into the debugger when the tracked id is being disposed.
            if (_id == _badDispose)
            {
                Debugger.Break();
            }
#endif
            if (!_shouldDispose || !disposing || _disposed)
            {
                return;
            }

            _data.Dispose();
            _disposed = true;
        }
        /// <summary>
        /// Replaces the history buffer with a new zero-filled one of
        /// <c>m_plotAreaWidth * Count</c> floats. Does nothing when there are no curves,
        /// and logs an error and leaves the buffer untouched when there are too many.
        /// </summary>
        private void updateHistoryBuffer()
        {
            // Nothing to display
            if (Count == 0)
            {
                return;
            }

            // Too many curves: report and keep the current buffer
            if (Count > nbCurvesMax)
            {
                MyLog.ERROR.WriteLine("Number of displayed curved is too high (" + Count + ", max " + nbCurvesMax + ")");
                return;
            }

            // Dispose the previous history before replacing it
            m_valuesHistory?.Dispose();

            // Allocate and clear the new history
            int historySize = m_plotAreaWidth * Count;
            CudaDeviceVariable <float> freshHistory = new CudaDeviceVariable <float>(historySize);

            m_valuesHistory = freshHistory;
            m_valuesHistory.Memset(0);
        }
Example #13
0
        /// <summary>
        /// Creates a single-channel float <see cref="CudaArray3D"/> of the given dimensions and
        /// fills it with values produced by <c>randomDevice.GenerateUniform32</c>.
        /// </summary>
        /// <param name="width">Array width.</param>
        /// <param name="height">Array height.</param>
        /// <param name="depth">Array depth.</param>
        /// <returns>The newly created array; the caller is responsible for disposing it.</returns>
        public CudaArray3D GenerateUniformArray(int width, int height, int depth)
        {
            int count = width * height * depth;

            CudaDeviceVariable<float> randomVariable = new CudaDeviceVariable<float>(count);
            CudaArray3D randomArray = null;

            try
            {
                randomArray = new CudaArray3D(CUArrayFormat.Float, width, height, depth, CudaArray3DNumChannels.One, CUDAArray3DFlags.None);

                // The generator is re-seeded from the current tick count on every call.
                randomDevice.SetPseudoRandomGeneratorSeed((ulong)DateTime.Now.Ticks);
                randomDevice.GenerateUniform32(randomVariable.DevicePointer, count);

                randomArray.CopyFromDeviceToThis(randomVariable.DevicePointer, sizeof(float));

                return randomArray;
            }
            catch
            {
                // The array never reaches the caller on failure, so release it here.
                if (randomArray != null)
                {
                    randomArray.Dispose();
                }
                throw;
            }
            finally
            {
                // The linear staging buffer is temporary on both the success and failure path.
                randomVariable.Dispose();
            }
        }
Example #14
0
        /// <summary>
        /// Creates a single-channel float <see cref="CudaArray3D"/> of the given dimensions and
        /// fills it with values produced by <c>randomDevice.GenerateUniform32</c>.
        /// </summary>
        /// <param name="width">Array width.</param>
        /// <param name="height">Array height.</param>
        /// <param name="depth">Array depth.</param>
        /// <returns>The newly created array; the caller is responsible for disposing it.</returns>
        public CudaArray3D GenerateUniformArray(int width, int height, int depth)
        {
            int count = width * height * depth;

            CudaDeviceVariable <float> randomVariable = new CudaDeviceVariable <float>(count);
            CudaArray3D randomArray = null;

            try
            {
                randomArray = new CudaArray3D(CUArrayFormat.Float, width, height, depth, CudaArray3DNumChannels.One, CUDAArray3DFlags.None);

                // The generator is re-seeded from the current tick count on every call.
                randomDevice.SetPseudoRandomGeneratorSeed((ulong)DateTime.Now.Ticks);
                randomDevice.GenerateUniform32(randomVariable.DevicePointer, count);

                randomArray.CopyFromDeviceToThis(randomVariable.DevicePointer, sizeof(float));

                return(randomArray);
            }
            catch
            {
                // The array never reaches the caller on failure, so release it here.
                if (randomArray != null)
                {
                    randomArray.Dispose();
                }
                throw;
            }
            finally
            {
                // The linear staging buffer is temporary on both the success and failure path.
                randomVariable.Dispose();
            }
        }
Example #15
0
        /// <summary>
        /// Converts the three host arrays to temporary device variables, launches <c>_gpu</c> with
        /// one single-thread block per particle, and releases the temporaries afterwards.
        /// </summary>
        /// <param name="xx">Host array passed to the kernel as its 4th argument.</param>
        /// <param name="yy">Host array passed to the kernel as its 5th argument.</param>
        /// <param name="zz">Host array passed to the kernel as its 6th argument.</param>
        /// <param name="cnt">Number of blocks in the launch grid (x dimension).</param>
        /// <param name="size">Value forwarded to the kernel as its last argument.</param>
        public static void update_particles(float[] xx, float[] yy, float[] zz, int cnt, int size)
        {
            CudaDeviceVariable <float> d_xx = null;
            CudaDeviceVariable <float> d_yy = null;
            CudaDeviceVariable <float> d_zz = null;

            try
            {
                // Assigning a float[] uses the array-to-device-variable conversion, as the original did.
                d_xx = xx;
                d_yy = yy;
                d_zz = zz;

                _gpu.BlockDimensions = new dim3(1, 1, 1);
                _gpu.GridDimensions  = new dim3(cnt, 1, 1);
                _gpu.Run(x.DevicePointer, y.DevicePointer, z.DevicePointer,
                         d_xx.DevicePointer, d_yy.DevicePointer, d_zz.DevicePointer,
                         size);
            }
            finally
            {
                // Release whatever was created, even if a later conversion or the launch failed.
                if (d_xx != null)
                {
                    d_xx.Dispose();
                }
                if (d_yy != null)
                {
                    d_yy.Dispose();
                }
                if (d_zz != null)
                {
                    d_zz.Dispose();
                }
            }
        }
Example #16
0
        /// <summary>
        /// Disposes the temporary, derivative, flow, statistics and filter buffers of this instance.
        /// None of the members is null-checked.
        /// </summary>
        public void FreeDeviceMemory()
        {
            // Temporary and derivative buffers
            d_tmp.Dispose();
            d_Ix.Dispose();
            d_Iy.Dispose();
            d_Iz.Dispose();
            // d_imageHalf is deliberately not disposed here (the call was commented out in the original).

            // Flow result, scratch buffer and statistics
            d_flow.Dispose();
            buffer.Dispose();
            mean.Dispose();
            std.Dispose();

            // Filters
            d_filterX.Dispose();
            d_filterY.Dispose();
            d_filterT.Dispose();
        }
Example #17
0
            /// <summary>
            /// Disposes <c>_data</c> the first time it is called; later calls release nothing.
            /// In DEBUG builds it also suppresses finalization of this instance.
            /// </summary>
            public void Destroy()
            {
#if DEBUG
                // Debug aid: break into the debugger when the tracked index is being destroyed.
                if (_index == _badDispose)
                {
                    Debugger.Break();
                }
#endif
                bool stillAlive = !_disposed;
                if (stillAlive)
                {
                    _data.Dispose();
                    _disposed = true;
                }
#if DEBUG
                GC.SuppressFinalize(this);
#endif
            }
Example #18
0
        /// <summary>
        /// Runs an n x n double-complex 2D FFT on <paramref name="data"/> and returns the result
        /// as a new host array of n * n elements.
        /// </summary>
        /// <param name="data">Host input copied to the device before the transform.</param>
        /// <param name="n">Edge length of the transform; the plan is created as n x n.</param>
        /// <param name="direction">Transform direction passed to the plan.</param>
        /// <returns>Host array holding the transformed signal.</returns>
        public cuDoubleComplex[] PerformFFT(cuDoubleComplex[] data, int n, TransformDirection direction)
        {
            // NOTE(review): a new plan replaces f_plan on every call and the previous plan is not
            // disposed here - confirm whether the owner releases old plans elsewhere.
            f_plan = new CudaFFTPlan2D(n, n, cufftType.Z2Z);

            CudaDeviceVariable <cuDoubleComplex> d_signal = null;
            CudaDeviceVariable <cuDoubleComplex> o_signal = null;

            try
            {
                d_signal = new CudaDeviceVariable <cuDoubleComplex>(n * n);
                o_signal = new CudaDeviceVariable <cuDoubleComplex>(n * n);

                d_signal.CopyToDevice(data);
                f_plan.Exec(d_signal.DevicePointer, o_signal.DevicePointer, direction);

                cuDoubleComplex[] result = new cuDoubleComplex[n * n];

                o_signal.CopyToHost(result);
                return(result);
            }
            finally
            {
                // The output buffer was never released before; both buffers are now
                // released on success and on failure.
                if (o_signal != null)
                {
                    o_signal.Dispose();
                }
                if (d_signal != null)
                {
                    d_signal.Dispose();
                }
            }
        }
Example #19
0
        /// <summary>
        /// Form-closing handler: clears the run/init flags and disposes the CUDA resources,
        /// the FFT plans, the OpenGL vertex buffer, the stopwatch and finally the context.
        /// </summary>
        private void Form1_FormClosing(object sender, FormClosingEventArgs e)
        {
            // Clear both state flags before anything is disposed.
            isRunning = false;
            isInit    = false;

            // CUDA-side objects
            cuda_vbo_resource.Dispose();
            texref.Dispose();
            dvfield.Dispose();
            vxfield.Dispose();
            vyfield.Dispose();

            // FFT plans
            planc2r.Dispose();
            planr2c.Dispose();

            // Unbind, then delete, the OpenGL vertex buffer
            GL.BindBuffer(BufferTarget.ArrayBuffer, 0);
            GL.DeleteBuffers(1, ref vbo);

            // The context is disposed last.
            stopwatch.Dispose();
            ctx.Dispose();
        }
Example #20
0
        /// <summary>
        /// Unregisters this buffer object with CUDA and destroys the OpenGL buffer.
        /// Does nothing when neither CUDA-side handle exists.
        /// </summary>
        private void DeleteVertexVBO()
        {
            // Only one of the two handles is released per call: the source takes precedence.
            // NOTE(review): if both were ever non-null, m_cudaVertexVar would be kept - confirm they are mutually exclusive.
            if (m_cudaVertexSource != null)
            {
                m_cudaVertexSource.Dispose();
                m_cudaVertexSource = null;
            }
            else if (m_cudaVertexVar != null)
            {
                m_cudaVertexVar.Dispose();
                m_cudaVertexVar = null;
            }
            else
            {
                // Neither handle exists: leave the GL buffer untouched.
                return;
            }

            GL.BindBuffer(BufferTarget.ArrayBuffer, 0);
            GL.DeleteBuffers(1, ref m_vertexVBO);

            m_vertexVBO = 0;
        }
Example #21
0
        /// <summary>
        /// Loads the "generateData" kernel from <c>KernelFile</c>, runs it to fill <c>Xopt</c>
        /// on the device, and stores the single value it writes to a temporary buffer in <c>Fopt</c>.
        /// </summary>
        protected override void Init()
        {
            var kernelFileName = KernelFile;
            var initKernel = Ctx.LoadKernel(kernelFileName, "generateData");
            Xopt = new CudaDeviceVariable<double>(DimensionsCount);

            int rseed = FunctionNumber + 10000 * InstanceNumber;

            double[] fopt_arr;
            var d_fopt = new CudaDeviceVariable<double>(1);

            try
            {
                initKernel.Run(
                    DimensionsCount,
                    rseed,
                    FunctionNumber,
                    InstanceNumber,
                    Xopt.DevicePointer,
                    d_fopt.DevicePointer);

                // Assigning to double[] uses the device-variable-to-array conversion, as the original did.
                fopt_arr = d_fopt;
            }
            finally
            {
                // Release the one-element temporary even if the launch or the conversion throws.
                d_fopt.Dispose();
            }

            Fopt = fopt_arr[0];
        }
Example #22
0
		/// <summary>
		/// image CountInRange. Not affecting Alpha.
		/// </summary>
		/// <param name="pCounts">Pointer to the number of pixels that fall into the specified range. (3 * sizeof(int))</param>
		/// <param name="nLowerBound">Fixed size array of the lower bound of the specified range, one per channel.</param>
		/// <param name="nUpperBound">Fixed size array of the upper bound of the specified range, one per channel.</param>
		public void CountInRangeA(CudaDeviceVariable<int> pCounts, byte[] nLowerBound, byte[] nUpperBound)
		{
			int bufferSize = CountInRangeAGetBufferHostSize();
			CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize);

			try
			{
				status = NPPNativeMethods.NPPi.CountInRange.nppiCountInRange_8u_AC4R(_devPtrRoi, _pitch, _sizeRoi, pCounts.DevicePointer, nLowerBound, nUpperBound, buffer.DevicePointer);
				Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiCountInRange_8u_AC4R", status));
			}
			finally
			{
				// Release the scratch buffer even when argument access or the native call throws.
				buffer.Dispose();
			}

			// Status is checked only after the buffer has been released.
			NPPException.CheckNppStatus(status, this);
		}
Example #23
0
		/// <summary>
		/// 1 channel 8-bit unsigned image resize. This primitive matches the behavior of GraphicsMagick++.
		/// </summary>
		/// <param name="dst">Destination-Image</param>
		/// <param name="nXFactor">Factor by which x dimension is changed.</param>
		/// <param name="nYFactor">Factor by which y dimension is changed.</param>
		/// <param name="eInterpolationMode">The type of eInterpolation to perform resampling. Currently only supports NPPI_INTER_LANCZOS3_Advanced.</param>
		public void ResizeSqrPixelAdvanced(NPPImage_8uC1 dst, double nXFactor, double nYFactor, InterpolationMode eInterpolationMode)
		{
			int bufferSize = ResizeAdvancedGetBufferHostSize(dst.SizeRoi, eInterpolationMode);
			CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize);

			try
			{
				// Source and destination rectangles are built from each image's ROI origin and size.
				NppiRect roiIn = new NppiRect(_pointRoi, _sizeRoi);
				NppiRect roiOut = new NppiRect(dst._pointRoi, dst._sizeRoi);
				status = NPPNativeMethods.NPPi.ResizeSqrPixel.nppiResizeSqrPixel_8u_C1R_Advanced(_devPtrRoi, _sizeOriginal, _pitch, roiIn, dst.DevicePointerRoi, dst.Pitch, roiOut, nXFactor, nYFactor, buffer.DevicePointer, eInterpolationMode);
				Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiResizeSqrPixel_8u_C1R_Advanced", status));
			}
			finally
			{
				// Release the scratch buffer even when the native call throws.
				buffer.Dispose();
			}

			// Status is checked only after the buffer has been released.
			NPPException.CheckNppStatus(status, this);
		}
Example #24
0
		/// <summary>
		/// image mean with 64-bit double precision result. Buffer is internally allocated and freed. Not affecting alpha.
		/// </summary>
		/// <param name="mean">Allocated device memory with size of at least 3 * sizeof(double)</param>
		public void MeanA(CudaDeviceVariable<double> mean)
		{
			int bufferSize = MeanGetBufferHostSizeA();
			CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize);

			try
			{
				status = NPPNativeMethods.NPPi.MeanNew.nppiMean_8u_AC4R(_devPtrRoi, _pitch, _sizeRoi, buffer.DevicePointer, mean.DevicePointer);
				Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiMean_8u_AC4R", status));
			}
			finally
			{
				// Release the scratch buffer even when argument access or the native call throws.
				buffer.Dispose();
			}

			// Status is checked only after the buffer has been released.
			NPPException.CheckNppStatus(status, this);
		}
Example #25
0
		/// <summary>
		/// CrossCorrSame_NormLevel. Buffer is internally allocated and freed.
		/// </summary>
		/// <param name="tpl">template image.</param>
		/// <param name="dst">Destination image</param>
		public void CrossCorrSame_NormLevel(NPPImage_8uC4 tpl, NPPImage_32fC4 dst)
		{
			int bufferSize = SameNormLevelGetBufferHostSize();
			CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize);

			try
			{
				// NOTE(review): the destination is passed as dst.DevicePointer while the template uses
				// DevicePointerRoi - confirm whether ignoring the destination ROI is intended.
				status = NPPNativeMethods.NPPi.ImageProximity.nppiCrossCorrSame_NormLevel_8u32f_C4R(_devPtrRoi, _pitch, _sizeRoi, tpl.DevicePointerRoi, tpl.Pitch, tpl.SizeRoi, dst.DevicePointer, dst.Pitch, buffer.DevicePointer);
				Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiCrossCorrSame_NormLevel_8u32f_C4R", status));
			}
			finally
			{
				// Release the scratch buffer even when argument access or the native call throws.
				buffer.Dispose();
			}

			// Status is checked only after the buffer has been released.
			NPPException.CheckNppStatus(status, this);
		}
Example #26
0
		/// <summary>
		/// Histogram with bins determined by pLevels array. Buffer is internally allocated and freed. Alpha channel is ignored during the histograms computations.
		/// </summary>
		/// <param name="histogram">array that receives the computed histogram. The CudaDeviceVariable must be of size nLevels-1. Array size = 3</param>
		/// <param name="pLevels">Array in device memory containing the level sizes of the bins. The CudaDeviceVariable must be of size nLevels. Array size = 3</param>
		public void HistogramRangeA(CudaDeviceVariable<int>[] histogram, CudaDeviceVariable<int>[] pLevels)
		{
			// Per-channel sizes and device pointers; only the first three entries of each array are used.
			// NOTE(review): `size` holds the histogram lengths (documented as nLevels-1) yet is also
			// passed to the native call next to the level pointers - confirm against the NPP documentation.
			int[] size = new int[] { histogram[0].Size, histogram[1].Size, histogram[2].Size };
			CUdeviceptr[] devPtrs = new CUdeviceptr[] { histogram[0].DevicePointer, histogram[1].DevicePointer, histogram[2].DevicePointer };
			CUdeviceptr[] devLevels = new CUdeviceptr[] { pLevels[0].DevicePointer, pLevels[1].DevicePointer, pLevels[2].DevicePointer };

			int bufferSize = HistogramRangeGetBufferSizeA(size);
			CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize);

			try
			{
				status = NPPNativeMethods.NPPi.Histogram.nppiHistogramRange_8u_AC4R(_devPtrRoi, _pitch, _sizeRoi, devPtrs, devLevels, size, buffer.DevicePointer);
				Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiHistogramRange_8u_AC4R", status));
			}
			finally
			{
				// Release the scratch buffer even when the native call throws.
				buffer.Dispose();
			}

			// Status is checked only after the buffer has been released.
			NPPException.CheckNppStatus(status, this);
		}
Example #27
0
		/// <summary>
		/// image sum with 64-bit long long result. Buffer is internally allocated and freed.
		/// </summary>
		/// <param name="result">Allocated device memory with size of at least 4 * sizeof(long)</param>
		public void Sum(CudaDeviceVariable<long> result)
		{
			int bufferSize = SumLongGetBufferHostSize();
			CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize);

			try
			{
				status = NPPNativeMethods.NPPi.Sum.nppiSum_8u64s_C4R(_devPtrRoi, _pitch, _sizeRoi, buffer.DevicePointer, result.DevicePointer);
				Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiSum_8u64s_C4R", status));
			}
			finally
			{
				// Release the scratch buffer even when argument access or the native call throws.
				buffer.Dispose();
			}

			// Status is checked only after the buffer has been released.
			NPPException.CheckNppStatus(status, this);
		}
Example #28
0
        /// <summary>
        /// Histogram with bins determined by pLevels array. Buffer is internally allocated and freed.
        /// </summary>
        /// <param name="histogram">array that receives the computed histogram. The array must be of size nLevels-1.</param>
        /// <param name="pLevels">Array in device memory containing the level sizes of the bins. The array must be of size nLevels</param>
        public void HistogramRange(CudaDeviceVariable<int> histogram, CudaDeviceVariable<int> pLevels)
        {
            int bufferSize = HistogramRangeGetBufferSize(histogram.Size);
            CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize);

            try
            {
                status = NPPNativeMethods.NPPi.Histogram.nppiHistogramRange_16u_C1R(_devPtrRoi, _pitch, _sizeRoi, histogram.DevicePointer, pLevels.DevicePointer, pLevels.Size, buffer.DevicePointer);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiHistogramRange_16u_C1R", status));
            }
            finally
            {
                // Release the scratch buffer even when argument access or the native call throws.
                buffer.Dispose();
            }

            // Status is checked only after the buffer has been released.
            NPPException.CheckNppStatus(status, this);
        }
Example #29
0
        /// <summary>
        /// simpleCUFFT sample: convolves a random signal with a random filter kernel on the GPU
        /// (forward FFT, point-wise multiply and scale, inverse FFT), repeats the convolution on
        /// the host, and prints whether the two results agree.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            int SIGNAL_SIZE = 50;
            int FILTER_KERNEL_SIZE = 11;

            Console.WriteLine("[simpleCUFFT] is starting...");

            var assembly = Assembly.GetExecutingAssembly();
            var resourceName = "simpleCUFFT.simpleCUFFTKernel.ptx";

            // Declared up front so the finally block can release whatever was created.
            CudaContext ctx = null;
            CudaDeviceVariable<cuFloatComplex> d_signal = null;
            CudaDeviceVariable<cuFloatComplex> d_filter_kernel = null;
            CudaFFTPlan1D plan = null;
            bool bTestResult;

            try
            {
                ctx = new CudaContext(0);

                CudaKernel ComplexPointwiseMulAndScale;
                using (Stream stream = assembly.GetManifestResourceStream(resourceName))
                {
                    // GetManifestResourceStream returns null for an unknown resource name.
                    if (stream == null)
                    {
                        throw new InvalidOperationException("Embedded resource '" + resourceName + "' was not found.");
                    }
                    ComplexPointwiseMulAndScale = ctx.LoadKernelPTX(stream, "ComplexPointwiseMulAndScale");
                }

                // Allocate host memory for the signal
                cuFloatComplex[] h_signal = new cuFloatComplex[SIGNAL_SIZE]; //we use cuFloatComplex for complex multiplication in reference host code...

                // Fixed seed keeps the generated data reproducible between runs.
                Random rand = new Random(0);
                // Initialize the memory for the signal
                for (int i = 0; i < SIGNAL_SIZE; ++i)
                {
                    h_signal[i].real = (float)rand.NextDouble();
                    h_signal[i].imag = 0;
                }

                // Allocate host memory for the filter
                cuFloatComplex[] h_filter_kernel = new cuFloatComplex[FILTER_KERNEL_SIZE];

                // Initialize the memory for the filter
                for (int i = 0; i < FILTER_KERNEL_SIZE; ++i)
                {
                    h_filter_kernel[i].real = (float)rand.NextDouble();
                    h_filter_kernel[i].imag = 0;
                }

                // Pad signal and filter kernel
                cuFloatComplex[] h_padded_signal = null;
                cuFloatComplex[] h_padded_filter_kernel = null;
                int new_size = PadData(h_signal, ref h_padded_signal, SIGNAL_SIZE,
                                       h_filter_kernel, ref h_padded_filter_kernel, FILTER_KERNEL_SIZE);

                // Allocate device memory for signal
                d_signal = new CudaDeviceVariable<cuFloatComplex>(new_size);
                // Copy host memory to device
                d_signal.CopyToDevice(h_padded_signal);

                // Allocate device memory for filter kernel
                d_filter_kernel = new CudaDeviceVariable<cuFloatComplex>(new_size);

                // Copy host memory to device
                d_filter_kernel.CopyToDevice(h_padded_filter_kernel);

                // CUFFT plan simple API
                plan = new CudaFFTPlan1D(new_size, cufftType.C2C, 1);

                // Transform signal and kernel
                Console.WriteLine("Transforming signal cufftExecC2C");
                plan.Exec(d_signal.DevicePointer, TransformDirection.Forward);
                plan.Exec(d_filter_kernel.DevicePointer, TransformDirection.Forward);

                // Multiply the coefficients together and normalize the result
                Console.WriteLine("Launching ComplexPointwiseMulAndScale<<< >>>");
                ComplexPointwiseMulAndScale.BlockDimensions = 256;
                ComplexPointwiseMulAndScale.GridDimensions = 32;
                ComplexPointwiseMulAndScale.Run(d_signal.DevicePointer, d_filter_kernel.DevicePointer, new_size, 1.0f / new_size);

                // Transform signal back
                Console.WriteLine("Transforming signal back cufftExecC2C");
                plan.Exec(d_signal.DevicePointer, TransformDirection.Inverse);

                // Copy device memory to host
                cuFloatComplex[] h_convolved_signal = d_signal;

                // Allocate host memory for the convolution result
                cuFloatComplex[] h_convolved_signal_ref = new cuFloatComplex[SIGNAL_SIZE];

                // Convolve on the host
                Convolve(h_signal, SIGNAL_SIZE,
                         h_filter_kernel, FILTER_KERNEL_SIZE,
                         h_convolved_signal_ref);

                // check result
                bTestResult = sdkCompareL2fe(h_convolved_signal_ref, h_convolved_signal, 1e-5f);
            }
            finally
            {
                // Same release order as before (plan, buffers, context), now also run on failure.
                if (plan != null)
                {
                    plan.Dispose();
                }
                if (d_filter_kernel != null)
                {
                    d_filter_kernel.Dispose();
                }
                if (d_signal != null)
                {
                    d_signal.Dispose();
                }
                if (ctx != null)
                {
                    ctx.Dispose();
                }
            }

            if (bTestResult)
            {
                Console.WriteLine("Test Passed");
            }
            else
            {
                Console.WriteLine("Test Failed");
            }
        }
Example #30
0
 /// <summary>
 /// Frees the memory allocated on the GPU by disposing <c>gpuArray</c>.
 /// </summary>
 public void Dispose() => gpuArray.Dispose();
Example #31
0
        /// <summary>
        /// Launches kernelPositionWeight, kernelNormalAmbient, kernelMarchingCubesCases and
        /// (when any vertices result) kernelMarchingCubesVertices in that order, then replaces
        /// <c>container.Geometry</c> with a vertex buffer sized for the computed vertex count.
        /// All temporary CUDA objects created here are disposed at the end of the method.
        /// </summary>
        /// <param name="kernelPositionWeight">Kernel run first over the voxel buffer; also the kernel the "noiseTexture" texture is bound to.</param>
        /// <param name="width">Voxel grid width.</param>
        /// <param name="height">Voxel grid height.</param>
        /// <param name="depth">Voxel grid depth.</param>
        private void Generate(CudaKernel kernelPositionWeight, int width, int height, int depth)
        {
            int count = width * height * depth;
            // The "decremented" grid is one smaller per axis than the voxel grid.
            int widthD = width - 1;
            int heightD = height - 1;
            int depthD = depth - 1;
            int countDecremented = widthD * heightD * depthD;

            // 8x8x8 threads per block; grid sizes are rounded up so the whole volume is covered.
            dim3 blockDimensions = new dim3(8, 8, 8);
            dim3 gridDimensions = new dim3((int)Math.Ceiling(width / 8.0), (int)Math.Ceiling(height / 8.0), (int)Math.Ceiling(depth / 8.0));
            dim3 gridDimensionsDecremented = new dim3((int)Math.Ceiling(widthD / 8.0), (int)Math.Ceiling(heightD / 8.0), (int)Math.Ceiling(depthD / 8.0));

            CUDANoiseCube noiseCube = new CUDANoiseCube();

            // 16x16x16 noise array bound to the kernel's "noiseTexture" with wrap addressing and linear filtering.
            CudaArray3D noiseArray = noiseCube.GenerateUniformArray(16, 16, 16);
            CudaTextureArray3D noiseTexture = new CudaTextureArray3D(kernelPositionWeight, "noiseTexture", CUAddressMode.Wrap, CUFilterMode.Linear, CUTexRefSetFlags.NormalizedCoordinates, noiseArray);

            CudaDeviceVariable<Voxel> voxelsDev = new CudaDeviceVariable<Voxel>(count);

            kernelPositionWeight.BlockDimensions = blockDimensions;
            // The grid size is written straight into CudaKernel's private "_gridDim" field via reflection.
            // NOTE(review): this depends on a private field name of the library - confirm why the public setter is bypassed.
            typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelPositionWeight, gridDimensions);

            kernelPositionWeight.Run(voxelsDev.DevicePointer, width, height, depth);

            kernelNormalAmbient.BlockDimensions = blockDimensions;
            typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelNormalAmbient, gridDimensions);

            kernelNormalAmbient.Run(voxelsDev.DevicePointer, width, height, depth, container.Settings.AmbientRayWidth, container.Settings.AmbientSamplesCount);

            // The triangle-count buffer is sized from NearestPowerOfTwo of each decremented dimension.
            int nearestW = NearestPowerOfTwo(widthD);
            int nearestH = NearestPowerOfTwo(heightD);
            int nearestD = NearestPowerOfTwo(depthD);
            int nearestCount = nearestW * nearestH * nearestD;

            CudaDeviceVariable<int> trisCountDevice = new CudaDeviceVariable<int>(nearestCount);
            trisCountDevice.Memset(0);
            CudaDeviceVariable<int> offsetsDev = new CudaDeviceVariable<int>(countDecremented);

            kernelMarchingCubesCases.BlockDimensions = blockDimensions;
            typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesCases, gridDimensionsDecremented);

            kernelMarchingCubesCases.Run(voxelsDev.DevicePointer, width, height, depth, offsetsDev.DevicePointer, trisCountDevice.DevicePointer, nearestW, nearestH, nearestD);

            CudaDeviceVariable<int> prefixSumsDev = prefixScan.PrefixSumArray(trisCountDevice, nearestCount);

            // Read one int from each buffer; the second argument is presumably the byte offset of the last element - TODO confirm.
            int lastTrisCount = 0;
            trisCountDevice.CopyToHost(ref lastTrisCount, (nearestCount - 1) * sizeof(int));

            int lastPrefixSum = 0;
            prefixSumsDev.CopyToHost(ref lastPrefixSum, (nearestCount - 1) * sizeof(int));

            // Vertex total = (last count + last prefix sum) * 3; presumably the prefix sum is exclusive - TODO confirm.
            int totalVerticesCount = (lastTrisCount + lastPrefixSum) * 3;

            if (totalVerticesCount > 0)
            {
                // Replace any previous geometry buffer.
                if (container.Geometry != null)
                    container.Geometry.Dispose();

                container.VertexCount = totalVerticesCount;

                container.Geometry = new Buffer(graphicsDevice, new BufferDescription()
                {
                    BindFlags = BindFlags.VertexBuffer,
                    CpuAccessFlags = CpuAccessFlags.None,
                    OptionFlags = ResourceOptionFlags.None,
                    SizeInBytes = Marshal.SizeOf(typeof(VoxelMeshVertex)) * totalVerticesCount,
                    Usage = ResourceUsage.Default
                });

                // Register the new D3D11 buffer with CUDA so the kernel can write into it.
                CudaDirectXInteropResource directResource = new CudaDirectXInteropResource(container.Geometry.ComPointer, CUGraphicsRegisterFlags.None, CudaContext.DirectXVersion.D3D11, CUGraphicsMapResourceFlags.None);
                
                kernelMarchingCubesVertices.BlockDimensions = blockDimensions;
                typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesVertices, gridDimensionsDecremented);

                // The resource is mapped only for the duration of the launch.
                directResource.Map();
                kernelMarchingCubesVertices.Run(directResource.GetMappedPointer(), voxelsDev.DevicePointer, prefixSumsDev.DevicePointer, offsetsDev.DevicePointer, width, height, depth, nearestW, nearestH, nearestD);
                directResource.UnMap();

                directResource.Dispose();
            }
            else
            {
                container.VertexCount = 0;

                // NOTE(review): Geometry is disposed but the reference is not cleared, so a later
                // call would dispose the same object again - confirm this is safe for Buffer.
                if (container.Geometry != null)
                    container.Geometry.Dispose();
            }

            // NOTE(review): none of these run if anything above throws - consider try/finally.
            noiseCube.Dispose();
            prefixSumsDev.Dispose();
            trisCountDevice.Dispose();
            offsetsDev.Dispose();
            noiseArray.Dispose();
            noiseTexture.Dispose();
            voxelsDev.Dispose();
        }
Example #32
0
        //Compute histogram and apply LUT to image
        private void btn_calc_Click(object sender, EventArgs e)
        {
            if (_colorChannels < 1 || !_nppOK)
            {
                return;
            }

            try
            {
                int binCount   = 255;
                int levelCount = binCount + 1;

                int[]  levels;
                int[]  bins;
                int[]  lut        = new int[levelCount];
                int    totalSum   = 0;
                float  mutiplier  = 0;
                int    runningSum = 0;
                Bitmap res;

                switch (_colorChannels)
                {
                case 1:
                    //The NPP library sets up a CUDA context, we can directly use it without access to it
                    CudaDeviceVariable <int> bins_d = new CudaDeviceVariable <int>(binCount);
                    levels = src_c1.EvenLevels(levelCount, 0, levelCount);
                    //Even levels in Cuda 5.5 seems to be broken: set it manually
                    for (int i = 0; i < levelCount; i++)
                    {
                        levels[i] = i;
                    }

                    //Compute histogram from source image
                    src_c1.HistogramEven(bins_d, 0, binCount + 1);
                    //Copy data from device to host:
                    bins = bins_d;

                    //draw histogram image
                    hist_rb_src.Image = GetHistogramImage(bins, 0);

                    //compute histogram equalization
                    for (int i = 0; i < binCount; i++)
                    {
                        totalSum += bins[i];
                    }
                    Debug.Assert(totalSum == src_c1.Width * src_c1.Height);

                    if (totalSum == 0)
                    {
                        totalSum = 1;
                    }

                    mutiplier = 1.0f / (float)totalSum * 255.0f;

                    for (int i = 0; i < binCount; i++)
                    {
                        lut[i]      = (int)(runningSum * mutiplier + 0.5f);
                        runningSum += bins[i];
                    }

                    lut[binCount] = 255;

                    //Aplly this lut to src image and get result in dest image
                    src_c1.LUT(dest_c1, lut, levels);

                    //Create new bitmap in host memory for result image
                    res = new Bitmap(src_c1.Width, src_c1.Height, PixelFormat.Format8bppIndexed);
                    SetPalette(res);

                    //Copy result from device to host
                    dest_c1.CopyToHost(res);

                    pictureBox_dest.Image = res;

                    //Compute new histogram and show it
                    dest_c1.HistogramEven(bins_d, 0, binCount);
                    hist_g_src.Image = GetHistogramImage(bins_d, 0);
                    //Free temp memory
                    bins_d.Dispose();
                    break;

                case 3:
                    //The NPP library sets up a CUDA context, we can directly use it without access to it
                    CudaDeviceVariable <int>[] bins_ds = new CudaDeviceVariable <int> [3];
                    bins_ds[0] = new CudaDeviceVariable <int>(binCount);
                    bins_ds[1] = new CudaDeviceVariable <int>(binCount);
                    bins_ds[2] = new CudaDeviceVariable <int>(binCount);
                    levels     = src_c3.EvenLevels(levelCount, 0, levelCount);
                    //Even levels in Cuda 5.5 seems to be broken: set it manually
                    for (int i = 0; i < levelCount; i++)
                    {
                        levels[i] = i;
                    }
                    int[] ll = new int[] { 0, 0, 0 };
                    int[] up = new int[] { binCount + 1, binCount + 1, binCount + 1 };

                    //Compute histogram from source image
                    src_c3.HistogramEven(bins_ds, ll, up);

                    int[][] bins3 = new int[3][];
                    int[][] luts  = new int[3][];
                    for (int c = 0; c < 3; c++)
                    {
                        //Copy data from device to host:
                        bins3[c] = bins_ds[c];
                        luts[c]  = new int[levelCount];
                    }

                    //draw histogram images
                    hist_rb_src.Image = GetHistogramImage(bins3[2], bins3[1], bins3[0], 1);
                    hist_g_src.Image  = GetHistogramImage(bins3[1], bins3[0], bins3[2], 2);
                    hist_b_src.Image  = GetHistogramImage(bins3[0], bins3[1], bins3[2], 3);

                    //compute histogram equalization
                    for (int c = 0; c < 3; c++)
                    {
                        totalSum   = 0;
                        runningSum = 0;
                        for (int i = 0; i < binCount; i++)
                        {
                            totalSum += bins3[c][i];
                        }
                        Debug.Assert(totalSum == src_c3.Width * src_c3.Height);

                        if (totalSum == 0)
                        {
                            totalSum = 1;
                        }

                        mutiplier = 1.0f / (float)totalSum * 255.0f;

                        for (int i = 0; i < binCount; i++)
                        {
                            luts[c][i]  = (int)(runningSum * mutiplier + 0.5f);
                            runningSum += bins3[c][i];
                        }
                        luts[c][binCount] = 255;
                    }
                    //Aplly this lut to src image and get result in dest image
                    src_c3.Lut(dest_c3, luts[0], levels, luts[1], levels, luts[2], levels);

                    res = new Bitmap(src_c3.Width, src_c3.Height, PixelFormat.Format24bppRgb);

                    //Copy result from device to host
                    dest_c3.CopyToHost(res);

                    pictureBox_dest.Image = res;

                    //Compute new histogram and show it
                    dest_c3.HistogramEven(bins_ds, ll, up);
                    bins3[0]           = bins_ds[0];
                    bins3[1]           = bins_ds[1];
                    bins3[2]           = bins_ds[2];
                    hist_rb_dest.Image = GetHistogramImage(bins3[2], bins3[1], bins3[0], 1);                           //r
                    hist_g_dest.Image  = GetHistogramImage(bins3[1], bins3[0], bins3[2], 2);                           //g
                    hist_b_dest.Image  = GetHistogramImage(bins3[0], bins3[1], bins3[2], 3);                           //b

                    //Free temp memory
                    bins_ds[0].Dispose();
                    bins_ds[1].Dispose();
                    bins_ds[2].Dispose();
                    break;

                case 4:
                    //The NPP library sets up a CUDA context, we can directly use it without access to it
                    CudaDeviceVariable <int>[] bins_ds4 = new CudaDeviceVariable <int> [4];
                    bins_ds4[0] = new CudaDeviceVariable <int>(binCount);
                    bins_ds4[1] = new CudaDeviceVariable <int>(binCount);
                    bins_ds4[2] = new CudaDeviceVariable <int>(binCount);
                    bins_ds4[3] = new CudaDeviceVariable <int>(binCount);
                    levels      = src_c4.EvenLevels(levelCount, 0, levelCount);
                    //Even levels in Cuda 5.5 seems to be broken: set it manually
                    for (int i = 0; i < levelCount; i++)
                    {
                        levels[i] = i;
                    }
                    int[] ll4 = new int[] { 0, 0, 0, 0 };
                    int[] up4 = new int[] { binCount + 1, binCount + 1, binCount + 1, binCount + 1 };

                    //Compute histogram from source image
                    src_c4.HistogramEven(bins_ds4, ll4, up4);

                    int[][] bins4 = new int[4][];
                    int[][] luts4 = new int[4][];
                    for (int c = 0; c < 4; c++)
                    {
                        //Copy data from device to host:
                        bins4[c] = bins_ds4[c];
                        luts4[c] = new int[levelCount];
                    }

                    //draw histogram images
                    hist_rb_src.Image = GetHistogramImage(bins4[2], bins4[1], bins4[0], 1);
                    hist_g_src.Image  = GetHistogramImage(bins4[1], bins4[0], bins4[2], 2);
                    hist_b_src.Image  = GetHistogramImage(bins4[0], bins4[1], bins4[2], 3);

                    //compute histogram equalization
                    for (int c = 0; c < 3; c++)
                    {
                        totalSum   = 0;
                        runningSum = 0;
                        for (int i = 0; i < binCount; i++)
                        {
                            totalSum += bins4[c][i];
                        }
                        Debug.Assert(totalSum == src_c4.Width * src_c4.Height);

                        if (totalSum == 0)
                        {
                            totalSum = 1;
                        }

                        mutiplier = 1.0f / (float)totalSum * 255.0f;

                        for (int i = 0; i < binCount; i++)
                        {
                            luts4[c][i] = (int)(runningSum * mutiplier + 0.5f);
                            runningSum += bins4[c][i];
                        }
                        luts4[c][binCount] = 255;
                    }

                    //Aplly this lut to src image and get result in dest image
                    src_c4.LutA(dest_c4, luts4[0], levels, luts4[1], levels, luts4[2], levels);

                    //Set alpha channel to 255
                    dest_c4.Set(255, 3);
                    res = new Bitmap(src_c4.Width, src_c4.Height, PixelFormat.Format32bppArgb);

                    //Copy result from device to host
                    dest_c4.CopyToHost(res);

                    pictureBox_dest.Image = res;

                    //Compute new histogram and show it
                    dest_c4.HistogramEven(bins_ds4, ll4, up4);
                    bins4[0]           = bins_ds4[0];
                    bins4[1]           = bins_ds4[1];
                    bins4[2]           = bins_ds4[2];
                    hist_rb_dest.Image = GetHistogramImage(bins4[2], bins4[1], bins4[0], 1);                           //r
                    hist_g_dest.Image  = GetHistogramImage(bins4[1], bins4[0], bins4[2], 2);                           //g
                    hist_b_dest.Image  = GetHistogramImage(bins4[0], bins4[1], bins4[2], 3);                           //b

                    //Free temp memory
                    bins_ds4[0].Dispose();
                    bins_ds4[1].Dispose();
                    bins_ds4[2].Dispose();
                    bins_ds4[3].Dispose();
                    break;
                }
            }
            catch (Exception ex)
            {
                if (ex is NPPException)
                {
                    txt_info.AppendText("NPPException: " + ex.Message + "\n");
                    CleanUp();
                }
                else if (ex is CudaException)
                {
                    txt_info.AppendText("CudaException: " + ex.Message + "\n");
                    CleanUp();
                }
                else
                {
                    throw;
                }
            }
        }
Example #33
0
        /// <summary>
        /// Click handler: computes the histogram of the source image with NPP, derives a
        /// histogram-equalization LUT per color channel, applies it to create the destination
        /// image and shows the resulting image and histograms in the form's controls.
        /// </summary>
        /// <remarks>
        /// Returns immediately when <c>_colorChannels</c> is less than 1 or <c>_nppOK</c> is false.
        /// Only channel counts 1, 3 and 4 are handled; any other value does nothing.
        /// An <c>NPPException</c> or <c>CudaException</c> is reported in <c>txt_info</c> and followed by
        /// <c>CleanUp()</c>; every other exception propagates to the caller.
        /// The temporary device histogram buffers are released on every path, including failures.
        /// </remarks>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event arguments (not used).</param>
        private void btn_calc_Click(object sender, EventArgs e)
        {
            if (_colorChannels < 1 || !_nppOK)
            {
                return;
            }

            try
            {
                int binCount = 255;
                int levelCount = binCount + 1;

                switch (_colorChannels)
                {
                    case 1:
                        //The NPP library sets up a CUDA context, we can directly use it without access to it.
                        //The using block frees the device buffer even when an NPP/CUDA call throws.
                        using (CudaDeviceVariable<int> bins_d = new CudaDeviceVariable<int>(binCount))
                        {
                            int[] levels = src_c1.EvenLevels(levelCount, 0, levelCount);
                            //Even levels in Cuda 5.5 seems to be broken: set it manually
                            for (int i = 0; i < levelCount; i++)
                            {
                                levels[i] = i;
                            }

                            //Compute histogram from source image
                            src_c1.HistogramEven(bins_d, 0, binCount + 1);
                            //Copy data from device to host (implicit conversion to int[]):
                            int[] bins = bins_d;

                            //draw histogram image
                            hist_rb_src.Image = GetHistogramImage(bins, 0);

                            //compute histogram equalization
                            int[] lut = ComputeEqualizationLut(bins, binCount, src_c1.Width * src_c1.Height);

                            //Apply this lut to src image and get result in dest image
                            src_c1.LUT(dest_c1, lut, levels);

                            //Create new bitmap in host memory for result image
                            Bitmap res = new Bitmap(src_c1.Width, src_c1.Height, PixelFormat.Format8bppIndexed);
                            SetPalette(res);

                            //Copy result from device to host
                            dest_c1.CopyToHost(res);

                            pictureBox_dest.Image = res;

                            //Compute new histogram and show it
                            //NOTE(review): the upper level here is binCount, whereas the source histogram
                            //above uses binCount + 1 - confirm whether this difference is intended.
                            dest_c1.HistogramEven(bins_d, 0, binCount);
                            //NOTE(review): the result histogram is shown in hist_g_src (the 3/4 channel
                            //paths use the hist_*_dest controls) - confirm this is the intended control.
                            hist_g_src.Image = GetHistogramImage(bins_d, 0);
                        }
                        break;

                    case 3:
                        //The NPP library sets up a CUDA context, we can directly use it without access to it
                        CudaDeviceVariable<int>[] bins_ds = new CudaDeviceVariable<int>[3];
                        try
                        {
                            for (int c = 0; c < bins_ds.Length; c++)
                            {
                                bins_ds[c] = new CudaDeviceVariable<int>(binCount);
                            }

                            int[] levels = src_c3.EvenLevels(levelCount, 0, levelCount);
                            //Even levels in Cuda 5.5 seems to be broken: set it manually
                            for (int i = 0; i < levelCount; i++)
                            {
                                levels[i] = i;
                            }
                            int[] ll = new int[] { 0, 0, 0 };
                            int[] up = new int[] { binCount + 1, binCount + 1, binCount + 1 };

                            //Compute histogram from source image
                            src_c3.HistogramEven(bins_ds, ll, up);

                            int[][] bins3 = new int[3][];
                            for (int c = 0; c < 3; c++)
                            {
                                //Copy data from device to host:
                                bins3[c] = bins_ds[c];
                            }

                            //draw histogram images
                            hist_rb_src.Image = GetHistogramImage(bins3[2], bins3[1], bins3[0], 1);
                            hist_g_src.Image = GetHistogramImage(bins3[1], bins3[0], bins3[2], 2);
                            hist_b_src.Image = GetHistogramImage(bins3[0], bins3[1], bins3[2], 3);

                            //compute histogram equalization, one LUT per channel
                            int[][] luts = new int[3][];
                            for (int c = 0; c < 3; c++)
                            {
                                luts[c] = ComputeEqualizationLut(bins3[c], binCount, src_c3.Width * src_c3.Height);
                            }

                            //Apply this lut to src image and get result in dest image
                            src_c3.Lut(dest_c3, luts[0], levels, luts[1], levels, luts[2], levels);

                            Bitmap res = new Bitmap(src_c3.Width, src_c3.Height, PixelFormat.Format24bppRgb);

                            //Copy result from device to host
                            dest_c3.CopyToHost(res);

                            pictureBox_dest.Image = res;

                            //Compute new histogram and show it
                            dest_c3.HistogramEven(bins_ds, ll, up);
                            bins3[0] = bins_ds[0];
                            bins3[1] = bins_ds[1];
                            bins3[2] = bins_ds[2];
                            hist_rb_dest.Image = GetHistogramImage(bins3[2], bins3[1], bins3[0], 1);//r
                            hist_g_dest.Image = GetHistogramImage(bins3[1], bins3[0], bins3[2], 2);//g
                            hist_b_dest.Image = GetHistogramImage(bins3[0], bins3[1], bins3[2], 3);//b
                        }
                        finally
                        {
                            //Free temp memory, also when an exception interrupted the computation
                            DisposeHistogramBuffers(bins_ds);
                        }
                        break;

                    case 4:
                        //The NPP library sets up a CUDA context, we can directly use it without access to it
                        CudaDeviceVariable<int>[] bins_ds4 = new CudaDeviceVariable<int>[4];
                        try
                        {
                            for (int c = 0; c < bins_ds4.Length; c++)
                            {
                                bins_ds4[c] = new CudaDeviceVariable<int>(binCount);
                            }

                            int[] levels = src_c4.EvenLevels(levelCount, 0, levelCount);
                            //Even levels in Cuda 5.5 seems to be broken: set it manually
                            for (int i = 0; i < levelCount; i++)
                            {
                                levels[i] = i;
                            }
                            int[] ll4 = new int[] { 0, 0, 0, 0 };
                            int[] up4 = new int[] { binCount + 1, binCount + 1, binCount + 1, binCount + 1 };

                            //Compute histogram from source image
                            src_c4.HistogramEven(bins_ds4, ll4, up4);

                            int[][] bins4 = new int[4][];
                            for (int c = 0; c < 4; c++)
                            {
                                //Copy data from device to host:
                                bins4[c] = bins_ds4[c];
                            }

                            //draw histogram images
                            hist_rb_src.Image = GetHistogramImage(bins4[2], bins4[1], bins4[0], 1);
                            hist_g_src.Image = GetHistogramImage(bins4[1], bins4[0], bins4[2], 2);
                            hist_b_src.Image = GetHistogramImage(bins4[0], bins4[1], bins4[2], 3);

                            //compute histogram equalization for the three color channels only;
                            //the fourth (alpha) channel gets no LUT and is set to 255 below
                            int[][] luts4 = new int[3][];
                            for (int c = 0; c < 3; c++)
                            {
                                luts4[c] = ComputeEqualizationLut(bins4[c], binCount, src_c4.Width * src_c4.Height);
                            }

                            //Apply this lut to src image and get result in dest image
                            src_c4.LutA(dest_c4, luts4[0], levels, luts4[1], levels, luts4[2], levels);

                            //Set alpha channel to 255
                            dest_c4.Set(255, 3);
                            Bitmap res = new Bitmap(src_c4.Width, src_c4.Height, PixelFormat.Format32bppArgb);

                            //Copy result from device to host
                            dest_c4.CopyToHost(res);

                            pictureBox_dest.Image = res;

                            //Compute new histogram and show it
                            dest_c4.HistogramEven(bins_ds4, ll4, up4);
                            bins4[0] = bins_ds4[0];
                            bins4[1] = bins_ds4[1];
                            bins4[2] = bins_ds4[2];
                            hist_rb_dest.Image = GetHistogramImage(bins4[2], bins4[1], bins4[0], 1);//r
                            hist_g_dest.Image = GetHistogramImage(bins4[1], bins4[0], bins4[2], 2);//g
                            hist_b_dest.Image = GetHistogramImage(bins4[0], bins4[1], bins4[2], 3);//b
                        }
                        finally
                        {
                            //Free temp memory, also when an exception interrupted the computation
                            DisposeHistogramBuffers(bins_ds4);
                        }
                        break;
                }
            }
            catch (Exception ex)
            {
                if (ex is NPPException)
                {
                    txt_info.AppendText("NPPException: " + ex.Message + "\n");
                    CleanUp();
                }
                else if (ex is CudaException)
                {
                    txt_info.AppendText("CudaException: " + ex.Message + "\n");
                    CleanUp();
                }
                else
                {
                    throw;
                }
            }
        }

        /// <summary>
        /// Builds a histogram-equalization look-up table from a histogram: entry <c>i</c> is the
        /// cumulative count of all bins below <c>i</c>, scaled to 0..255 and rounded.
        /// </summary>
        /// <param name="bins">Histogram counts; the first <paramref name="binCount"/> entries are read.</param>
        /// <param name="binCount">Number of histogram bins.</param>
        /// <param name="expectedPixelCount">Pixel count the histogram is expected to sum up to (checked in debug builds only).</param>
        /// <returns>LUT with <c>binCount + 1</c> entries; the last entry is always 255.</returns>
        private static int[] ComputeEqualizationLut(int[] bins, int binCount, int expectedPixelCount)
        {
            int totalSum = 0;
            for (int i = 0; i < binCount; i++)
            {
                totalSum += bins[i];
            }
            Debug.Assert(totalSum == expectedPixelCount);

            //Avoid a division by zero for an empty histogram
            if (totalSum == 0)
            {
                totalSum = 1;
            }

            float multiplier = 1.0f / (float)totalSum * 255.0f;

            int[] lut = new int[binCount + 1];
            int runningSum = 0;
            for (int i = 0; i < binCount; i++)
            {
                lut[i] = (int)(runningSum * multiplier + 0.5f);
                runningSum += bins[i];
            }
            lut[binCount] = 255;

            return lut;
        }

        /// <summary>
        /// Disposes every allocated entry of an array of device histogram buffers.
        /// Entries that are still null (allocation did not get that far) are skipped.
        /// </summary>
        /// <param name="buffers">Device buffers to release.</param>
        private static void DisposeHistogramBuffers(CudaDeviceVariable<int>[] buffers)
        {
            foreach (CudaDeviceVariable<int> buffer in buffers)
            {
                if (buffer != null)
                {
                    buffer.Dispose();
                }
            }
        }
Example #34
0
        /// <summary>
        /// Compiles <c>vectorAdd_kernel.cu</c> at run time, loads the resulting PTX, adds two random
        /// float vectors on the CUDA device and verifies the result against a host computation.
        /// </summary>
        /// <remarks>
        /// All CUDA resources (runtime compiler, context, device buffers) are held in <c>using</c>
        /// statements so they are released on every path, including the early return taken when
        /// the verification fails and when an exception is thrown.
        /// </remarks>
        /// <param name="args">Command line arguments; passed on unchanged to <c>CudaRuntimeCompiler.Compile</c>.</param>
        static void Main(string[] args)
        {
            string filename = "vectorAdd_kernel.cu"; //we assume the file is in the same folder...
            string fileToCompile = File.ReadAllText(filename);

            byte[] ptx;
            using (CudaRuntimeCompiler rtc = new CudaRuntimeCompiler(fileToCompile, "vectorAdd_kernel"))
            {
                rtc.Compile(args);

                Console.WriteLine(rtc.GetLogAsString());

                ptx = rtc.GetPTX();
            }

            using (CudaContext ctx = new CudaContext(0))
            {
                CudaKernel vectorAdd = ctx.LoadKernelPTX(ptx, "vectorAdd");

                // Print the vector length to be used
                int numElements = 50000;
                Console.WriteLine("[Vector addition of {0} elements]", numElements);

                // Allocate the host input vectors A and B and the host output vector C
                float[] h_A = new float[numElements];
                float[] h_B = new float[numElements];
                float[] h_C = new float[numElements];

                // Fixed seed: the input data is the same on every run
                Random rand = new Random(0);

                // Initialize the host input vectors
                for (int i = 0; i < numElements; ++i)
                {
                    h_A[i] = (float)rand.NextDouble();
                    h_B[i] = (float)rand.NextDouble();
                }

                Console.WriteLine("Allocate and copy input data from the host memory to the CUDA device\n");
                // d_A / d_B: allocate the device input vectors and copy the host data to the device
                // d_C: allocate the device output vector
                using (CudaDeviceVariable<float> d_A = h_A)
                using (CudaDeviceVariable<float> d_B = h_B)
                using (CudaDeviceVariable<float> d_C = new CudaDeviceVariable<float>(numElements))
                {
                    // Launch the Vector Add CUDA Kernel
                    int threadsPerBlock = 256;
                    // Round up so that every element is covered by a thread
                    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
                    Console.WriteLine("CUDA kernel launch with {0} blocks of {1} threads\n", blocksPerGrid, threadsPerBlock);
                    vectorAdd.BlockDimensions = new dim3(threadsPerBlock, 1, 1);
                    vectorAdd.GridDimensions = new dim3(blocksPerGrid, 1, 1);

                    vectorAdd.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, numElements);

                    // Copy the device result vector in device memory to the host result vector
                    // in host memory.
                    Console.WriteLine("Copy output data from the CUDA device to the host memory\n");
                    d_C.CopyToHost(h_C);

                    // Verify that the result vector is correct
                    for (int i = 0; i < numElements; ++i)
                    {
                        if (Math.Abs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
                        {
                            Console.WriteLine("Result verification failed at element {0}!\n", i);
                            // The enclosing using statements still free device memory and the context
                            return;
                        }
                    }

                    Console.WriteLine("Test PASSED\n");
                }
            }

            Console.WriteLine("Done\n");
        }
Example #35
0
        /// <summary>
        /// Compresses a 24 bit bitmap to JPEG on the GPU using NPP and writes the result to a file.
        /// The image is converted to YCbCr; the Cb and Cr channels are stored at half resolution
        /// in both directions.
        /// </summary>
        /// <remarks>
        /// All device resources, the Huffman encoder specs and the output stream are released on
        /// every path, including when one of the processing steps throws.
        /// </remarks>
        /// <param name="aFilename">Path of the output file; the file is created or overwritten.</param>
        /// <param name="aQuality">Quality value passed on to the quantization table construction.</param>
        /// <param name="aImage">Source image; must be <c>Format24bppRgb</c> with width and height being multiples of 16.</param>
        /// <exception cref="ArgumentNullException"><paramref name="aImage"/> is null.</exception>
        /// <exception cref="ArgumentException">The pixel format or the dimensions of <paramref name="aImage"/> are not supported.</exception>
        public static void SaveJpeg(string aFilename, int aQuality, Bitmap aImage)
        {
            if (aImage == null)
            {
                throw new ArgumentNullException("aImage");
            }

            if (aImage.PixelFormat != System.Drawing.Imaging.PixelFormat.Format24bppRgb)
            {
                throw new ArgumentException("Only three channel color images are supported.");
            }

            if (aImage.Width % 16 != 0 || aImage.Height % 16 != 0)
            {
                throw new ArgumentException("The provided bitmap must have a height and width of a multiple of 16.");
            }

            //The frame header stores the dimensions as 16 bit values; larger images would be silently truncated
            if (aImage.Width > ushort.MaxValue || aImage.Height > ushort.MaxValue)
            {
                throw new ArgumentException("The provided bitmap is too large: height and width must not exceed 65535 pixels.");
            }

            JPEGCompression compression = new JPEGCompression();

            //Everything below is released in the finally block; entries stay null until allocated
            NPPImage_8uC3 src = null;
            NPPImage_8uC1 srcY = null;
            NPPImage_8uC1 srcCb = null;
            NPPImage_8uC1 srcCr = null;
            CudaDeviceVariable<byte>[] pdQuantizationTables = new CudaDeviceVariable<byte>[2];
            NPPImage_16sC1[] apdDCT = new NPPImage_16sC1[3];
            CudaDeviceVariable<byte> pdScan = null;
            CudaDeviceVariable<byte> pJpegEncoderTemp = null;

            try
            {
                src = new NPPImage_8uC3(aImage.Width, aImage.Height);
                srcY = new NPPImage_8uC1(aImage.Width, aImage.Height);
                srcCb = new NPPImage_8uC1(aImage.Width / 2, aImage.Height / 2);
                srcCr = new NPPImage_8uC1(aImage.Width / 2, aImage.Height / 2);
                src.CopyToDevice(aImage);

                //System.Drawing.Bitmap is ordered BGR not RGB
                //The NPP routine BGR to YCbCR outputs the values in clamped range, following the YCbCr standard.
                //But JPEG uses unclamped values ranging all from [0..255], thus use our own color matrix:
                float[,] BgrToYCbCr = new float[3, 4]
                {{0.114f,     0.587f,    0.299f,   0},
                 {0.5f,      -0.33126f, -0.16874f, 128},
                 {-0.08131f, -0.41869f,  0.5f,     128}};

                src.ColorTwist(BgrToYCbCr);

                //Reduce size of Cb and Cr channel; srcY serves as full size staging buffer
                //and finally receives the Y channel itself
                src.Copy(srcY, 2);
                srcY.Resize(srcCr, 0.5, 0.5, InterpolationMode.SuperSampling);
                src.Copy(srcY, 1);
                srcY.Resize(srcCb, 0.5, 0.5, InterpolationMode.SuperSampling);
                src.Copy(srcY, 0);

                FrameHeader oFrameHeader = new FrameHeader();
                oFrameHeader.nComponents = 3;
                oFrameHeader.nHeight = (ushort)aImage.Height;
                oFrameHeader.nSamplePrecision = 8;
                oFrameHeader.nWidth = (ushort)aImage.Width;
                oFrameHeader.aComponentIdentifier = new byte[] { 1, 2, 3 };
                oFrameHeader.aSamplingFactors = new byte[] { 34, 17, 17 }; //Y channel is twice the size of Cb/Cr channel
                oFrameHeader.aQuantizationTableSelector = new byte[] { 0, 1, 1 };

                //Get quantization tables from JPEG standard with quality scaling
                QuantizationTable[] aQuantizationTables = new QuantizationTable[2];
                aQuantizationTables[0] = new QuantizationTable(QuantizationTable.QuantizationType.Luminance, aQuality);
                aQuantizationTables[1] = new QuantizationTable(QuantizationTable.QuantizationType.Chroma, aQuality);

                pdQuantizationTables[0] = aQuantizationTables[0].aTable;
                pdQuantizationTables[1] = aQuantizationTables[1].aTable;

                //Get Huffman tables from JPEG standard
                HuffmanTable[] aHuffmanTables = new HuffmanTable[4];
                aHuffmanTables[0] = new HuffmanTable(HuffmanTable.HuffmanType.LuminanceDC);
                aHuffmanTables[1] = new HuffmanTable(HuffmanTable.HuffmanType.ChromaDC);
                aHuffmanTables[2] = new HuffmanTable(HuffmanTable.HuffmanType.LuminanceAC);
                aHuffmanTables[3] = new HuffmanTable(HuffmanTable.HuffmanType.ChromaAC);

                //Set header
                ScanHeader oScanHeader = new ScanHeader();
                oScanHeader.nA = 0;
                oScanHeader.nComponents = 3;
                oScanHeader.nSe = 63;
                oScanHeader.nSs = 0;
                oScanHeader.aComponentSelector = new byte[] { 1, 2, 3 };
                oScanHeader.aHuffmanTablesSelector = new byte[] { 0, 17, 17 };

                NPPImage_8uC1[] apDstImage = new NPPImage_8uC1[3];
                NppiSize[] aDstSize = new NppiSize[3];
                aDstSize[0] = new NppiSize(srcY.Width, srcY.Height);
                aDstSize[1] = new NppiSize(srcCb.Width, srcCb.Height);
                aDstSize[2] = new NppiSize(srcCr.Width, srcCr.Height);

                // Frame size as written to the output JPEG (at least 1x1)
                NppiSize oDstImageSize = new NppiSize();
                oDstImageSize.width = Math.Max(1, (int)oFrameHeader.nWidth);
                oDstImageSize.height = Math.Max(1, (int)oFrameHeader.nHeight);

                apDstImage[0] = srcY;
                apDstImage[1] = srcCb;
                apDstImage[2] = srcCr;

                int nMCUBlocksH = 0;
                int nMCUBlocksV = 0;

                // Compute channel sizes as stored in the JPEG (8x8 blocks & MCU block layout):
                // each sampling factor byte holds one factor in its upper and one in its lower nibble
                for (int i = 0; i < oFrameHeader.nComponents; ++i)
                {
                    nMCUBlocksV = Math.Max(nMCUBlocksV, oFrameHeader.aSamplingFactors[i] >> 4);
                    nMCUBlocksH = Math.Max(nMCUBlocksH, oFrameHeader.aSamplingFactors[i] & 0x0f);
                }

                for (int i = 0; i < oFrameHeader.nComponents; ++i)
                {
                    NppiSize oBlocks = new NppiSize();
                    NppiSize oBlocksPerMCU = new NppiSize(oFrameHeader.aSamplingFactors[i] & 0x0f, oFrameHeader.aSamplingFactors[i] >> 4);

                    oBlocks.width = (int)Math.Ceiling((oFrameHeader.nWidth + 7) / 8 *
                                              (float)(oBlocksPerMCU.width) / nMCUBlocksH);
                    oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width;

                    oBlocks.height = (int)Math.Ceiling((oFrameHeader.nHeight + 7) / 8 *
                                               (float)(oBlocksPerMCU.height) / nMCUBlocksV);
                    oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;

                    // Allocate Memory: 64 coefficients per 8x8 block
                    apdDCT[i] = new NPPImage_16sC1(oBlocks.width * 64, oBlocks.height);
                }

                /***************************
                *
                *   Output
                *
                ***************************/

                // Forward DCT
                for (int i = 0; i < 3; ++i)
                {
                    compression.DCTQuantFwd8x8LS(apDstImage[i], apdDCT[i], aDstSize[i], pdQuantizationTables[oFrameHeader.aQuantizationTableSelector[i]]);
                }

                // Huffman Encoding
                pdScan = new CudaDeviceVariable<byte>(BUFFER_SIZE);
                int nScanLength = 0;

                int nTempSize = JPEGCompression.EncodeHuffmanGetSize(aDstSize[0], 3);
                pJpegEncoderTemp = new CudaDeviceVariable<byte>(nTempSize);

                NppiEncodeHuffmanSpec[] apHuffmanDCTableEnc = new NppiEncodeHuffmanSpec[3];
                NppiEncodeHuffmanSpec[] apHuffmanACTableEnc = new NppiEncodeHuffmanSpec[3];

                // Number of specs actually allocated, so that exactly those are freed again
                int nDCSpecsAllocated = 0;
                int nACSpecsAllocated = 0;

                try
                {
                    for (int i = 0; i < 3; ++i)
                    {
                        apHuffmanDCTableEnc[i] = JPEGCompression.EncodeHuffmanSpecInitAlloc(aHuffmanTables[(oScanHeader.aHuffmanTablesSelector[i] >> 4)].aCodes, NppiHuffmanTableType.nppiDCTable);
                        ++nDCSpecsAllocated;
                        apHuffmanACTableEnc[i] = JPEGCompression.EncodeHuffmanSpecInitAlloc(aHuffmanTables[(oScanHeader.aHuffmanTablesSelector[i] & 0x0f) + 2].aCodes, NppiHuffmanTableType.nppiACTable);
                        ++nACSpecsAllocated;
                    }

                    JPEGCompression.EncodeHuffmanScan(apdDCT, 0, oScanHeader.nSs, oScanHeader.nSe, oScanHeader.nA >> 4, oScanHeader.nA & 0x0f, pdScan, ref nScanLength, apHuffmanDCTableEnc, apHuffmanACTableEnc, aDstSize, pJpegEncoderTemp);
                }
                finally
                {
                    for (int i = 0; i < 3; ++i)
                    {
                        if (i < nDCSpecsAllocated)
                        {
                            JPEGCompression.EncodeHuffmanSpecFree(apHuffmanDCTableEnc[i]);
                        }
                        if (i < nACSpecsAllocated)
                        {
                            JPEGCompression.EncodeHuffmanSpecFree(apHuffmanACTableEnc[i]);
                        }
                    }
                }

                // Write JPEG to byte array, as in original sample code
                byte[] pDstOutput = new byte[BUFFER_SIZE];
                int pos = 0;

                oFrameHeader.nWidth = (ushort)oDstImageSize.width;
                oFrameHeader.nHeight = (ushort)oDstImageSize.height;

                writeMarker(0x0D8, pDstOutput, ref pos);
                writeJFIFTag(pDstOutput, ref pos);
                writeQuantizationTable(aQuantizationTables[0], pDstOutput, ref pos);
                writeQuantizationTable(aQuantizationTables[1], pDstOutput, ref pos);
                writeFrameHeader(oFrameHeader, pDstOutput, ref pos);
                writeHuffmanTable(aHuffmanTables[0], pDstOutput, ref pos);
                writeHuffmanTable(aHuffmanTables[1], pDstOutput, ref pos);
                writeHuffmanTable(aHuffmanTables[2], pDstOutput, ref pos);
                writeHuffmanTable(aHuffmanTables[3], pDstOutput, ref pos);
                writeScanHeader(oScanHeader, pDstOutput, ref pos);

                // Append the encoded scan data right behind the headers
                pdScan.CopyToHost(pDstOutput, 0, pos, nScanLength);

                pos += nScanLength;
                writeMarker(0x0D9, pDstOutput, ref pos);

                // The using statement closes the file even if writing fails
                using (FileStream fs = new FileStream(aFilename, FileMode.Create, FileAccess.Write))
                {
                    fs.Write(pDstOutput, 0, pos);
                }
            }
            finally
            {
                //cleanup in reverse order of allocation; skip whatever was never allocated
                if (pJpegEncoderTemp != null)
                {
                    pJpegEncoderTemp.Dispose();
                }
                if (pdScan != null)
                {
                    pdScan.Dispose();
                }
                for (int i = apdDCT.Length - 1; i >= 0; --i)
                {
                    if (apdDCT[i] != null)
                    {
                        apdDCT[i].Dispose();
                    }
                }
                for (int i = pdQuantizationTables.Length - 1; i >= 0; --i)
                {
                    if (pdQuantizationTables[i] != null)
                    {
                        pdQuantizationTables[i].Dispose();
                    }
                }

                if (srcCr != null)
                {
                    srcCr.Dispose();
                }
                if (srcCb != null)
                {
                    srcCb.Dispose();
                }
                if (srcY != null)
                {
                    srcY.Dispose();
                }
                if (src != null)
                {
                    src.Dispose();
                }
                compression.Dispose();
            }
        }
Example #36
0
		/// <summary>
		/// image NormRel_Inf. Buffer is internally allocated and freed.
		/// </summary>
		/// <param name="tpl">template image.</param>
		/// <param name="pNormRel">Pointer to the computed relative error for the infinity norm of two images. (1 * sizeof(double))</param>
		/// <param name="nCOI">channel of interest.</param>
		/// <param name="pMask">Mask image.</param>
		public void NormRel_Inf(NPPImage_16uC3 tpl, CudaDeviceVariable<double> pNormRel, int nCOI, NPPImage_8uC1 pMask)
		{
			int bufferSize = NormRelInfMaskedGetBufferHostSize();

			// The using block releases the scratch buffer even if dereferencing an
			// argument or the native call throws; previously it leaked in that case.
			using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
			{
				status = NPPNativeMethods.NPPi.NormRel.nppiNormRel_Inf_16u_C3CMR(_devPtrRoi, _pitch, tpl.DevicePointerRoi, tpl.Pitch, pMask.DevicePointerRoi, pMask.Pitch, _sizeRoi, nCOI, pNormRel.DevicePointer, buffer.DevicePointer);
				Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiNormRel_Inf_16u_C3CMR", status));
			}

			// The status is validated only after the scratch buffer has been released.
			NPPException.CheckNppStatus(status, this);
		}
Example #37
0
		/// <summary>
		/// image mean and standard deviation. Buffer is internally allocated and freed.
		/// </summary>
		/// <param name="coi">Channel of interest (0, 1 or 2)</param>
		/// <param name="mean">Allocated device memory with size of at least 1 * sizeof(double)</param>
		/// <param name="stdDev">Allocated device memory with size of at least 1 * sizeof(double)</param>
		/// <param name="mask">mask</param>
		public void MeanStdDev(int coi, CudaDeviceVariable<double> mean, CudaDeviceVariable<double> stdDev, NPPImage_8uC1 mask)
		{
			int bufferSize = MeanStdDevMaskedGetBufferHostSize();

			// The using block releases the scratch buffer even if dereferencing an
			// argument or the native call throws; previously it leaked in that case.
			using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
			{
				status = NPPNativeMethods.NPPi.MeanStdDevNew.nppiMean_StdDev_16u_C3CMR(_devPtrRoi, _pitch, mask.DevicePointerRoi, mask.Pitch, _sizeRoi, coi, buffer.DevicePointer, mean.DevicePointer, stdDev.DevicePointer);
				Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiMean_StdDev_16u_C3CMR", status));
			}

			// The status is validated only after the scratch buffer has been released.
			NPPException.CheckNppStatus(status, this);
		}
Example #38
0
        /// <summary>
        /// simpleCUBLAS sample: multiplies two N x N matrices filled with pseudo-random
        /// values through CUBLAS Gemm and compares the result against the host
        /// implementation simple_sgemm using a relative L2 error.
        /// </summary>
        /// <param name="args">Command line arguments (not used).</param>
        static void Main(string[] args)
        {
            int   N     = 275;
            int   n2    = N * N;
            float alpha = 1.0f;
            float beta  = 0.0f;

            float error_norm = 0;
            float ref_norm   = 0;

            /* Initialize CUBLAS */
            Console.WriteLine("simpleCUBLAS test running.");

            /* Allocate host memory for the matrices */
            float[] h_A     = new float[n2];
            float[] h_B     = new float[n2];
            float[] h_C_ref = new float[n2];

            /* Fill the matrices with test data (fixed seed => reproducible input) */
            Random rand = new Random(0);

            for (int i = 0; i < n2; i++)
            {
                h_A[i] = (float)rand.NextDouble();
                h_B[i] = (float)rand.NextDouble();
            }

            // The using blocks release the CUBLAS handle and all device buffers on every
            // exit path, including exceptions. Previously the early return on a zero
            // reference norm skipped the clean up entirely.
            using (CudaBlas handle = new CudaBlas())
            using (CudaDeviceVariable <float> d_A = new CudaDeviceVariable <float>(n2))
            using (CudaDeviceVariable <float> d_B = new CudaDeviceVariable <float>(n2))
            using (CudaDeviceVariable <float> d_C = new CudaDeviceVariable <float>(n2))
            {
                /* Initialize the device matrices with the host matrices */
                d_A.CopyToDevice(h_A);
                d_B.CopyToDevice(h_B);

                /* Performs operation using plain C code */
                simple_sgemm(N, alpha, h_A, h_B, beta, h_C_ref);

                /* Performs operation using cublas */
                handle.Gemm(Operation.NonTranspose, Operation.NonTranspose, N, N, N, alpha, d_A, N, d_B, N, beta, d_C, N);

                /* Read back the result from device memory (conversion to a host array) */
                float[] h_C = d_C;

                /* Check result against reference */
                for (int i = 0; i < n2; ++i)
                {
                    float diff = h_C_ref[i] - h_C[i];
                    error_norm += diff * diff;
                    ref_norm   += h_C_ref[i] * h_C_ref[i];
                }
            }

            error_norm = (float)Math.Sqrt((double)error_norm);
            ref_norm   = (float)Math.Sqrt((double)ref_norm);

            // A (near) zero reference norm would make the relative error meaningless.
            if (Math.Abs(ref_norm) < 1e-7)
            {
                Console.WriteLine("!!!! reference norm is 0");
                return;
            }

            if (error_norm / ref_norm < 1e-6f)
            {
                Console.WriteLine("simpleCUBLAS test passed.");
            }
            else
            {
                Console.WriteLine("simpleCUBLAS test failed.");
            }
        }
Example #39
0
        /// <summary>
        /// simpleStreams sample: times a host/device copy and the init_array kernel,
        /// then compares a non-streamed kernel+copy loop against the same work split
        /// over several CUDA streams, and finally validates the data copied back.
        /// </summary>
        /// <param name="args">
        /// Command line arguments. Recognized substrings: "help", "use_generic_memory"
        /// and "sync_method" followed by a numeric value (0, 1, 2 or 4).
        /// </param>
        static void Main(string[] args)
        {
            int   cuda_device = 0;
            int   nstreams = 4;                           // number of streams for CUDA calls
            int   nreps = 10;                             // number of times each experiment is repeated
            int   n = 16 * 1024 * 1024;                   // number of ints in the data set
            int   nbytes = n * sizeof(int);               // number of data bytes
            dim3  threads, blocks;                        // kernel launch configuration
            float elapsed_time, time_memcpy, time_kernel; // timing variables
            float scale_factor = 1.0f;

            // allocate generic memory and pin it later instead of using cudaHostAlloc()
            // Untested in C#, so stick to cudaHostAlloc().
            bool       bPinGenericMemory  = false;                   // we want this to be the default behavior
            CUCtxFlags device_sync_method = CUCtxFlags.BlockingSync; // by default we use BlockingSync

            int niterations;                                         // number of iterations for the loop inside the kernel

            ShrQATest.shrQAStart(args);

            Console.WriteLine("[ simpleStreams ]");

            foreach (var item in args)
            {
                if (item.Contains("help"))
                {
                    printHelp();
                    ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_PASSED);
                }
            }

            bPinGenericMemory = false;
            foreach (var item in args)
            {
                if (item.Contains("use_generic_memory"))
                {
                    bPinGenericMemory = true;
                }
            }

            for (int i = 0; i < args.Length; i++)
            {
                if (args[i].Contains("sync_method"))
                {
                    int    temp     = -1;
                    bool   error    = false;
                    string rawValue = string.Empty;
                    if (i < args.Length - 1)
                    {
                        rawValue = args[i + 1];

                        // int.TryParse returns true on SUCCESS, so the error flag is its negation.
                        // The previous code stored the result directly, which rejected every valid
                        // value and accepted unparsable text as 0.
                        error = !int.TryParse(rawValue, out temp);
                        if (!error)
                        {
                            switch (temp)
                            {
                            case 0:
                                device_sync_method = CUCtxFlags.SchedAuto;
                                break;

                            case 1:
                                device_sync_method = CUCtxFlags.SchedSpin;
                                break;

                            case 2:
                                device_sync_method = CUCtxFlags.SchedYield;
                                break;

                            case 4:
                                device_sync_method = CUCtxFlags.BlockingSync;
                                break;

                            default:
                                error = true;
                                break;
                            }
                        }
                    }
                    if (!error)
                    {
                        Console.Write("Specifying device_sync_method = {0}, setting reps to 100 to demonstrate steady state\n", sDeviceSyncMethod[(int)device_sync_method]);
                        nreps = 100;
                    }
                    else
                    {
                        // Report the raw argument text so that non-numeric input is shown as typed.
                        Console.Write("Invalid command line option sync_method=\"{0}\"\n", rawValue);
                        ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED);
                    }
                }
            }

            int num_devices = CudaContext.GetDeviceCount();

            if (0 == num_devices)
            {
                Console.Write("your system does not have a CUDA capable device, waiving test...\n");
                ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED);
            }
            cuda_device = CudaContext.GetMaxGflopsDeviceId();

            CudaDeviceProperties deviceProp = CudaContext.GetDeviceInfo(cuda_device);

            if ((1 == deviceProp.ComputeCapability.Major) && (deviceProp.ComputeCapability.Minor < 1))
            {
                Console.Write("{0} does not have Compute Capability 1.1 or newer. Reducing workload.\n", deviceProp.DeviceName);
            }

            if (deviceProp.ComputeCapability.Major >= 2)
            {
                niterations = 100;
            }
            else
            {
                if (deviceProp.ComputeCapability.Minor > 1)
                {
                    niterations = 5;
                }
                else
                {
                    niterations = 1;                     // reduced workload for compute capability 1.0 and 1.1
                }
            }

            // Check if GPU can map host memory (Generic Method), if not then we override bPinGenericMemory to be false
            // In .net we cannot allocate easily generic aligned memory, so <bPinGenericMemory> is always false in our case...
            if (bPinGenericMemory)
            {
                Console.Write("Device: <{0}> canMapHostMemory: {1}\n", deviceProp.DeviceName, deviceProp.CanMapHostMemory ? "Yes" : "No");
                if (deviceProp.CanMapHostMemory == false)
                {
                    Console.Write("Using cudaMallocHost, CUDA device does not support mapping of generic host memory\n");
                    bPinGenericMemory = false;
                }
            }

            // Anything that is less than 32 Cores will have scaled down workload
            scale_factor = Math.Max((32.0f / (ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * (float)deviceProp.MultiProcessorCount)), 1.0f);
            n            = (int)Math.Round((float)n / scale_factor);

            Console.Write("> CUDA Capable: SM {0}.{1} hardware\n", deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor);
            Console.Write("> {0} Multiprocessor(s) x {1} (Cores/Multiprocessor) = {2} (Cores)\n",
                          deviceProp.MultiProcessorCount,
                          ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor),
                          ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * deviceProp.MultiProcessorCount);

            Console.Write("> scale_factor = {0:0.0000}\n", 1.0f / scale_factor);
            Console.Write("> array_size   = {0}\n\n", n);

            // enable use of blocking sync, to reduce CPU usage
            Console.Write("> Using CPU/GPU Device Synchronization method ({0})\n", sDeviceSyncMethod[(int)device_sync_method]);

            CudaContext ctx;

            if (bPinGenericMemory)
            {
                ctx = new CudaContext(cuda_device, device_sync_method | CUCtxFlags.MapHost);
            }
            else
            {
                ctx = new CudaContext(cuda_device, device_sync_method);
            }

            //Load Kernel image from resources
            string resName;

            // 64-bit processes load the x64 build of the PTX resource.
            if (IntPtr.Size == 8)
            {
                resName = "simpleStreams_x64.ptx";
            }
            else
            {
                resName = "simpleStreams.ptx";
            }

            string resNamespace = "simpleStreams";
            string resource     = resNamespace + "." + resName;
            Stream stream       = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);

            if (stream == null)
            {
                throw new ArgumentException("Kernel not found in resources.");
            }

            CudaKernel init_array = ctx.LoadKernelPTX(stream, "init_array");


            // allocate host memory
            int c = 5;                                                          // value to which the array will be initialized

            int[] h_a = null;                                                   // pointer to the array data in host memory
            CudaPageLockedHostMemory <int> hAligned_a = null;                   // pointer to the array data in host memory (aligned to MEMORY_ALIGNMENT)

            //Note: In .net we have two separated arrays: One is in managed memory (h_a), the other one in unmanaged memory (hAligned_a).
            //In C++ hAligned_a would point somewhere inside the h_a array.
            AllocateHostMemory(bPinGenericMemory, ref h_a, ref hAligned_a, nbytes);

            Console.Write("\nStarting Test\n");

            // allocate device memory
            CudaDeviceVariable <int> d_c = c;            //using new implicit cast to allocate memory and assign value
            CudaDeviceVariable <int> d_a = new CudaDeviceVariable <int>(nbytes / sizeof(int));

            CudaStream[] streams = new CudaStream[nstreams];
            for (int i = 0; i < nstreams; i++)
            {
                streams[i] = new CudaStream();
            }

            // create CUDA event handles
            // use blocking sync
            CudaEvent    start_event, stop_event;
            CUEventFlags eventflags = ((device_sync_method == CUCtxFlags.BlockingSync) ? CUEventFlags.BlockingSync : CUEventFlags.Default);

            start_event = new CudaEvent(eventflags);
            stop_event  = new CudaEvent(eventflags);

            // time memcopy
            // NOTE(review): the original comment said "from device", but the call below is
            // AsyncCopyToDevice - confirm which direction is meant to be timed.
            start_event.Record();                 // record in stream-0, to ensure that all previous CUDA calls have completed
            hAligned_a.AsyncCopyToDevice(d_a, streams[0].Stream);
            stop_event.Record();
            stop_event.Synchronize();               // block until the event is actually recorded
            time_memcpy = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("memcopy:\t{0:0.00}\n", time_memcpy);

            // time kernel
            threads = new dim3(512, 1);
            blocks  = new dim3(n / (int)threads.x, 1);
            start_event.Record();
            init_array.BlockDimensions = threads;
            init_array.GridDimensions  = blocks;
            init_array.RunAsync(streams[0].Stream, d_a.DevicePointer, d_c.DevicePointer, niterations);
            stop_event.Record();
            stop_event.Synchronize();
            time_kernel = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("kernel:\t\t{0:0.00}\n", time_kernel);


            //////////////////////////////////////////////////////////////////////
            // time non-streamed execution for reference
            threads = new dim3(512, 1);
            blocks  = new dim3(n / (int)threads.x, 1);
            start_event.Record();
            for (int k = 0; k < nreps; k++)
            {
                init_array.BlockDimensions = threads;
                init_array.GridDimensions  = blocks;
                init_array.Run(d_a.DevicePointer, d_c.DevicePointer, niterations);
                hAligned_a.SynchronCopyToHost(d_a);
            }
            stop_event.Record();
            stop_event.Synchronize();
            elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("non-streamed:\t{0:0.00} ({1:00} expected)\n", elapsed_time / nreps, time_kernel + time_memcpy);

            //////////////////////////////////////////////////////////////////////
            // time execution with nstreams streams
            threads = new dim3(512, 1);
            blocks  = new dim3(n / (int)(nstreams * threads.x), 1);
            byte[] memset = new byte[nbytes];             // set host memory bits to all 1s, for testing correctness
            for (int i = 0; i < nbytes; i++)
            {
                memset[i] = 255;
            }
            System.Runtime.InteropServices.Marshal.Copy(memset, 0, hAligned_a.PinnedHostPointer, nbytes);
            d_a.Memset(0);             // set device memory to all 0s, for testing correctness

            start_event.Record();
            for (int k = 0; k < nreps; k++)
            {
                init_array.BlockDimensions = threads;
                init_array.GridDimensions  = blocks;
                // asynchronously launch nstreams kernels, each operating on its own portion of data
                for (int i = 0; i < nstreams; i++)
                {
                    init_array.RunAsync(streams[i].Stream, d_a.DevicePointer + i * n / nstreams * sizeof(int), d_c.DevicePointer, niterations);
                }

                // asynchronously launch nstreams memcopies.  Note that memcopy in stream x will only
                //   commence executing when all previous CUDA calls in stream x have completed
                for (int i = 0; i < nstreams; i++)
                {
                    hAligned_a.AsyncCopyFromDevice(d_a, i * n / nstreams * sizeof(int), i * n / nstreams * sizeof(int), nbytes / nstreams, streams[i].Stream);
                }
            }
            stop_event.Record();
            stop_event.Synchronize();
            elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("{0} streams:\t{1:0.00} ({2:0.00} expected with compute capability 1.1 or later)\n", nstreams, elapsed_time / nreps, time_kernel + time_memcpy / nstreams);

            // check whether the output is correct
            Console.Write("-------------------------------\n");
            //We can directly access data in hAligned_a using the [] operator, but copying
            //data first to h_a is faster.
            System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, nbytes / sizeof(int));

            bool bResults = correct_data(h_a, n, c * nreps * niterations);

            // release resources
            for (int i = 0; i < nstreams; i++)
            {
                streams[i].Dispose();
            }
            start_event.Dispose();
            stop_event.Dispose();

            hAligned_a.Dispose();
            d_a.Dispose();
            d_c.Dispose();
            CudaContext.ProfilerStop();
            ctx.Dispose();

            Console.ReadKey();
            ShrQATest.shrQAFinishExit(args, bResults ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED);
        }
Example #40
0
        /// <summary>
        /// simpleCUBLAS sample: multiplies two N x N matrices filled with pseudo-random
        /// values through CUBLAS Gemm and compares the result against the host
        /// implementation simple_sgemm using a relative L2 error.
        /// </summary>
        /// <param name="args">Command line arguments (not used).</param>
        static void Main(string[] args)
        {
            int N = 275;
            int n2 = N * N;
            float alpha = 1.0f;
            float beta = 0.0f;

            float error_norm = 0;
            float ref_norm = 0;

            /* Initialize CUBLAS */
            Console.WriteLine("simpleCUBLAS test running.");

            /* Allocate host memory for the matrices */
            float[] h_A = new float[n2];
            float[] h_B = new float[n2];
            float[] h_C_ref = new float[n2];

            /* Fill the matrices with test data (fixed seed => reproducible input) */
            Random rand = new Random(0);
            for (int i = 0; i < n2; i++)
            {
                h_A[i] = (float)rand.NextDouble();
                h_B[i] = (float)rand.NextDouble();
            }

            // The using blocks release the CUBLAS handle and all device buffers on every
            // exit path, including exceptions. Previously the early return on a zero
            // reference norm skipped the clean up entirely.
            using (CudaBlas handle = new CudaBlas())
            using (CudaDeviceVariable<float> d_A = new CudaDeviceVariable<float>(n2))
            using (CudaDeviceVariable<float> d_B = new CudaDeviceVariable<float>(n2))
            using (CudaDeviceVariable<float> d_C = new CudaDeviceVariable<float>(n2))
            {
                /* Initialize the device matrices with the host matrices */
                d_A.CopyToDevice(h_A);
                d_B.CopyToDevice(h_B);

                /* Performs operation using plain C code */
                simple_sgemm(N, alpha, h_A, h_B, beta, h_C_ref);

                /* Performs operation using cublas */
                handle.Gemm(Operation.NonTranspose, Operation.NonTranspose, N, N, N, alpha, d_A, N, d_B, N, beta, d_C, N);

                /* Read back the result from device memory (conversion to a host array) */
                float[] h_C = d_C;

                /* Check result against reference */
                for (int i = 0; i < n2; ++i)
                {
                    float diff = h_C_ref[i] - h_C[i];
                    error_norm += diff * diff;
                    ref_norm += h_C_ref[i] * h_C_ref[i];
                }
            }

            error_norm = (float)Math.Sqrt((double)error_norm);
            ref_norm = (float)Math.Sqrt((double)ref_norm);

            // A (near) zero reference norm would make the relative error meaningless.
            if (Math.Abs(ref_norm) < 1e-7)
            {
                Console.WriteLine("!!!! reference norm is 0");
                return;
            }

            if (error_norm / ref_norm < 1e-6f)
            {
                Console.WriteLine("simpleCUBLAS test passed.");
            }
            else
            {
                Console.WriteLine("simpleCUBLAS test failed.");
            }
        }
Example #41
0
		/// <summary>
		/// Image pixel minimum and maximum values with their indices. Buffer is internally allocated and freed.
		/// </summary>
		/// <param name="coi">Channel of interest (0, 1 or 2)</param>
		/// <param name="min">Allocated device memory with size of at least 1 * sizeof(ushort)</param>
		/// <param name="max">Allocated device memory with size of at least 1 * sizeof(ushort)</param>
		/// <param name="minIndex">Allocated device memory with size of at least 1 * sizeof(NppiPoint)</param>
		/// <param name="maxIndex">Allocated device memory with size of at least 1 * sizeof(NppiPoint)</param>
		/// <param name="mask">If the mask is filled with zeros, then all the returned values are zeros, i.e., pMinIndex = {0, 0}, pMaxIndex = {0, 0}, pMinValue = 0, pMaxValue = 0.</param>
		public void MinMaxIndex(int coi, CudaDeviceVariable<ushort> min, CudaDeviceVariable<ushort> max, CudaDeviceVariable<NppiPoint> minIndex, CudaDeviceVariable<NppiPoint> maxIndex, NPPImage_8uC1 mask)
		{
			int bufferSize = MinMaxIndexMaskedGetBufferHostSize();

			// The using block releases the scratch buffer even if dereferencing an
			// argument or the native call throws; previously it leaked in that case.
			using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
			{
				status = NPPNativeMethods.NPPi.MinMaxIndxNew.nppiMinMaxIndx_16u_C3CMR(_devPtrRoi, _pitch, mask.DevicePointerRoi, mask.Pitch, _sizeRoi, coi, min.DevicePointer, max.DevicePointer, minIndex.DevicePointer, maxIndex.DevicePointer, buffer.DevicePointer);
				Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiMinMaxIndx_16u_C3CMR", status));
			}

			// The status is validated only after the scratch buffer has been released.
			NPPException.CheckNppStatus(status, this);
		}
Example #42
0
		/// <summary>
		/// CrossCorrValid_NormLevel. Buffer is internally allocated and freed. Not affecting Alpha.
		/// </summary>
		/// <param name="tpl">template image.</param>
		/// <param name="dst">Destination image</param>
		/// <param name="nScaleFactor">Integer Result Scaling.</param>
		public void CrossCorrValid_NormLevelA(NPPImage_8uC4 tpl, NPPImage_8uC4 dst, int nScaleFactor)
		{
			int bufferSize = ValidNormLevelScaledAGetBufferHostSize();

			// The using block releases the scratch buffer even if dereferencing an
			// argument or the native call throws; previously it leaked in that case.
			using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
			{
				// NOTE(review): dst is addressed through DevicePointer while tpl uses
				// DevicePointerRoi - confirm that ignoring the destination ROI is intended.
				status = NPPNativeMethods.NPPi.ImageProximity.nppiCrossCorrValid_NormLevel_8u_AC4RSfs(_devPtrRoi, _pitch, _sizeRoi, tpl.DevicePointerRoi, tpl.Pitch, tpl.SizeRoi, dst.DevicePointer, dst.Pitch, nScaleFactor, buffer.DevicePointer);
				Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiCrossCorrValid_NormLevel_8u_AC4RSfs", status));
			}

			// The status is validated only after the scratch buffer has been released.
			NPPException.CheckNppStatus(status, this);
		}
Example #43
0
		/// <summary>
		/// image L2 norm. Buffer is internally allocated and freed.
		/// </summary>
		/// <param name="coi">Channel of interest (0, 1 or 2)</param>
		/// <param name="norm">Allocated device memory with size of at least 1 * sizeof(double)</param>
		/// <param name="mask">mask</param>
		public void NormL2(int coi, CudaDeviceVariable<double> norm, NPPImage_8uC1 mask)
		{
			int bufferSize = NormL2MaskedGetBufferHostSize();

			// The using block releases the scratch buffer even if dereferencing an
			// argument or the native call throws; previously it leaked in that case.
			using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
			{
				status = NPPNativeMethods.NPPi.NormL2.nppiNorm_L2_16u_C3CMR(_devPtrRoi, _pitch, mask.DevicePointerRoi, mask.Pitch, _sizeRoi, coi, norm.DevicePointer, buffer.DevicePointer);
				Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiNorm_L2_16u_C3CMR", status));
			}

			// The status is validated only after the scratch buffer has been released.
			NPPException.CheckNppStatus(status, this);
		}
Example #44
0
		/// <summary>
		/// Four-channel 32-bit signed image DotProd. Buffer is internally allocated and freed. Ignoring alpha channel.
		/// </summary>
		/// <param name="src2">2nd source image</param>
		/// <param name="pDp">Pointer to the computed dot product of the two images. (3 * sizeof(double))</param>
		public void ADotProduct(NPPImage_32sC4 src2, CudaDeviceVariable<double> pDp)
		{
			int bufferSize = DotProdGetBufferHostSize();

			// The using block releases the scratch buffer even if dereferencing an
			// argument or the native call throws; previously it leaked in that case.
			using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
			{
				status = NPPNativeMethods.NPPi.DotProd.nppiDotProd_32s64f_AC4R(_devPtrRoi, _pitch, src2.DevicePointerRoi, src2.Pitch, _sizeRoi, pDp.DevicePointer, buffer.DevicePointer);
				Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiDotProd_32s64f_AC4R", status));
			}

			// The status is validated only after the scratch buffer has been released.
			NPPException.CheckNppStatus(status, this);
		}
Example #45
0
		/// <summary>
		/// Histogram with evenly distributed bins. Buffer is internally allocated and freed.
		/// </summary>
		/// <param name="histogram">Allocated device memory, one variable per channel (4 Variables). The level count passed to NPP for each channel is the variable's Size + 1.</param>
		/// <param name="nLowerLevel">Lower boundary of lowest level bin. E.g. 0 for [0..255]. Size = 4</param>
		/// <param name="nUpperLevel">Upper boundary of highest level bin. E.g. 256 for [0..255]. Size = 4</param>
		public void HistogramEven(CudaDeviceVariable<int>[] histogram, int[] nLowerLevel, int[] nUpperLevel)
		{
			// Per-channel level counts (bins + 1) and device pointers, in channel order.
			int[] size = new int[] { histogram[0].Size + 1, histogram[1].Size + 1, histogram[2].Size + 1, histogram[3].Size + 1 };
			CUdeviceptr[] devPtrs = new CUdeviceptr[] { histogram[0].DevicePointer, histogram[1].DevicePointer, histogram[2].DevicePointer, histogram[3].DevicePointer };

			int bufferSize = HistogramEvenGetBufferSize(size);

			// The using block releases the scratch buffer even if the native call
			// throws; previously it leaked in that case.
			using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
			{
				status = NPPNativeMethods.NPPi.Histogram.nppiHistogramEven_8u_C4R(_devPtrRoi, _pitch, _sizeRoi, devPtrs, size, nLowerLevel, nUpperLevel, buffer.DevicePointer);
				Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiHistogramEven_8u_C4R", status));
			}

			// The status is validated only after the scratch buffer has been released.
			NPPException.CheckNppStatus(status, this);
		}
 /// <summary>
 /// Releases the wrapped resource by forwarding to the Dispose method of _ptr.
 /// </summary>
 public void Dispose() => _ptr.Dispose();
Example #47
0
		/// <summary>
		/// Image pixel minimum and maximum. Buffer is internally allocated and freed.
		/// </summary>
		/// <param name="min">Allocated device memory with size of at least 4 * sizeof(byte)</param>
		/// <param name="max">Allocated device memory with size of at least 4 * sizeof(byte)</param>
		public void MinMax(CudaDeviceVariable<byte> min, CudaDeviceVariable<byte> max)
		{
			int bufferSize = MinMaxGetBufferHostSize();

			// The using block releases the scratch buffer even if dereferencing an
			// argument or the native call throws; previously it leaked in that case.
			using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
			{
				status = NPPNativeMethods.NPPi.MinMaxNew.nppiMinMax_8u_C4R(_devPtrRoi, _pitch, _sizeRoi, min.DevicePointer, max.DevicePointer, buffer.DevicePointer);
				Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiMinMax_8u_C4R", status));
			}

			// The status is validated only after the scratch buffer has been released.
			NPPException.CheckNppStatus(status, this);
		}
        /// <summary>
        /// Runs one simulation step on the velocity fields, advects the particles through
        /// the mapped graphics resource, draws them as point sprites and periodically
        /// writes the measured frame rate into the window title.
        /// </summary>
        private void display()
        {
            stopwatch.Start();

            // Velocity update: advect, process in frequency space, then write back.
            advectVelocity(g_dvfield, g_vxfield, g_vyfield, DIM, RPADW, DIM, DT, g_tPitch);

            // Forward FFT
            g_planr2c.Exec(g_vxfield.DevicePointer);
            g_planr2c.Exec(g_vyfield.DevicePointer);

            diffuseProject(g_vxfield, g_vyfield, CPADW, DIM, DT, VIS, g_tPitch);

            // Inverse FFT
            g_planc2r.Exec(g_vxfield.DevicePointer);
            g_planc2r.Exec(g_vyfield.DevicePointer);

            updateVelocity(g_dvfield, g_vxfield, g_vyfield, DIM, RPADW, DIM, g_tPitch);

            // Map D3D9 vertex buffer to CUDA
            graphicsres.MapAllResources();

            // Drop the wrapper obtained in the previous frame before fetching a new one.
            if (g_mparticles != null)
            {
                g_mparticles.Dispose();
            }

            g_mparticles = graphicsres[0].GetMappedPointer <vertex>();
            advectParticles(g_mparticles, g_dvfield, DIM, DIM, DT, g_tPitch);
            graphicsres.UnmapAllResources();

            // Render state for additively blended point sprites.
            float pointSize = 16.0f;

            device.Clear(ClearFlags.Target, new Color4(0.0f, 0, 0), 0.0f, 0);
            device.SetRenderState(RenderState.ZWriteEnable, false);
            device.SetRenderState(RenderState.AlphaBlendEnable, true);
            device.SetRenderState(RenderState.SourceBlend, Blend.One);
            device.SetRenderState(RenderState.DestinationBlend, Blend.One);
            device.SetRenderState(RenderState.PointSpriteEnable, true);
            device.SetRenderState(RenderState.PointSize, pointSize);
            device.SetTexture(0, g_pTexture);

            bool sceneStarted = device.BeginScene().IsSuccess;

            if (sceneStarted)
            {
                //Draw particles
                device.SetStreamSource(0, g_pVB, 0, Marshal.SizeOf(typeof(vertex)));
                device.VertexFormat = VertexFormat.Position | VertexFormat.Diffuse;
                device.DrawPrimitives(PrimitiveType.PointList, 0, DS);
                device.EndScene();
            }
            stopwatch.Stop();

            device.Present();
            fpsCount++;

            // The title is refreshed only once the frame counter reaches the current limit.
            if (fpsCount != fpsLimit)
            {
                return;
            }

            float  elapsedMs       = stopwatch.GetElapsedTime();
            float  framesPerSecond = 1.0f / (elapsedMs / 1000.0f);
            string title           = string.Format(System.Globalization.CultureInfo.InvariantCulture,
                                                   "CUDA/D3D9 Stable Fluids ({0} x {1}): {2} fps", DIM, DIM, framesPerSecond);

            myWindow.Title = title;
            fpsCount       = 0;
            fpsLimit       = (int)Math.Max(framesPerSecond, 1.0f);
        }
Example #49
0
		/// <summary>
		/// Image pixel maximum. Buffer is internally allocated and freed. Not affecting alpha.
		/// </summary>
		/// <param name="max">Allocated device memory with size of at least 3 * sizeof(byte)</param>
		/// <param name="indexX">Allocated device memory with size of at least 3 * sizeof(int)</param>
		/// <param name="indexY">Allocated device memory with size of at least 3 * sizeof(int)</param>
		public void MaxIndexA(CudaDeviceVariable<byte> max, CudaDeviceVariable<int> indexX, CudaDeviceVariable<int> indexY)
		{
			int bufferSize = MaxIndexGetBufferHostSizeA();

			// The using block releases the temporary buffer even if evaluating the arguments
			// or the native call throws; previously it leaked in that case.
			using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
			{
				status = NPPNativeMethods.NPPi.MaxIdx.nppiMaxIndx_8u_AC4R(_devPtrRoi, _pitch, _sizeRoi, buffer.DevicePointer, max.DevicePointer, indexX.DevicePointer, indexY.DevicePointer);
				Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiMaxIndx_8u_AC4R", status));
			}

			// As before, the status is only checked once the buffer has been freed.
			NPPException.CheckNppStatus(status, this);
		}
        /// <summary>
        /// Window closing handler: stops the frame timer, then disposes every resource field
        /// that has been created.
        /// </summary>
        private void myWindow_Closing(object sender, System.ComponentModel.CancelEventArgs e)
        {
            // Stop render loop before closing
            if (frameTimer != null)
            {
                frameTimer.Tick -= frameTimer_Tick;
                frameTimer.Stop();
            }

            // Cleanup: each field is skipped when it is still null.
            graphicsres?.Dispose();
            g_mparticles?.Dispose();
            stopwatch?.Dispose();

            texref?.Dispose();
            g_dvfield?.Dispose();
            g_vxfield?.Dispose();
            g_vyfield?.Dispose();

            g_planc2r?.Dispose();
            g_planr2c?.Dispose();

            g_pVB?.Dispose();
            g_pTexture?.Dispose();

            device?.Dispose();
            d3d?.Dispose();

            ctx?.Dispose();
        }
Example #51
0
		/// <summary>
		/// image L1 norm. Buffer is internally allocated and freed.
		/// </summary>
		/// <param name="norm">Allocated device memory with size of at least 3 * sizeof(double)</param>
		public void NormL1(CudaDeviceVariable<double> norm)
		{
			// NOTE(review): this calls the AC4R variant although the method name carries no 'A'
			// suffix and the buffer size comes from NormL1GetBufferHostSize() - confirm the pairing.
			int bufferSize = NormL1GetBufferHostSize();

			// The using block releases the temporary buffer even if evaluating the arguments
			// or the native call throws; previously it leaked in that case.
			using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
			{
				status = NPPNativeMethods.NPPi.NormL1.nppiNorm_L1_8u_AC4R(_devPtrRoi, _pitch, _sizeRoi, norm.DevicePointer, buffer.DevicePointer);
				Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiNorm_L1_8u_AC4R", status));
			}

			// As before, the status is only checked once the buffer has been freed.
			NPPException.CheckNppStatus(status, this);
		}
Example #52
0
        /// <summary>
        /// Packs the vertex data of all triangles into flat host arrays, uploads them, runs
        /// <c>baryKernel</c> over a Width x Height window and collects the per-pixel results
        /// into a new <see cref="BarycentricReturnMultiple"/>.
        /// </summary>
        /// <param name="primitives">Triangles to process; indexed as primitives[triangle][vertex 0..2][attribute key].</param>
        /// <param name="dataKeys">Attribute keys to carry along. The position key is handled separately
        /// (X/Y go to the vertex arrays, Z to slot 0 of the data arrays).</param>
        /// <returns>The instance stored in <c>OutDataThreaded</c>, holding depths, fragment counts and
        /// fragment data lists for every pixel that the kernel flagged.</returns>
        public BarycentricReturnMultiple Execute(List <Triangle> primitives, Dictionary <string, object> .KeyCollection dataKeys)
        {
            // Screen-space X/Y of the three vertices of every triangle.
            h_v0 = new float2[primitives.Count];
            h_v1 = new float2[primitives.Count];
            h_v2 = new float2[primitives.Count];

            for (int i = 0; i < primitives.Count; i++)
            {
                h_v0[i] = new float2(((Vector4)primitives[i][0][VertexShader.PositionName]).X, ((Vector4)primitives[i][0][VertexShader.PositionName]).Y);
                h_v1[i] = new float2(((Vector4)primitives[i][1][VertexShader.PositionName]).X, ((Vector4)primitives[i][1][VertexShader.PositionName]).Y);
                h_v2[i] = new float2(((Vector4)primitives[i][2][VertexShader.PositionName]).X, ((Vector4)primitives[i][2][VertexShader.PositionName]).Y);
            }

            // Despite its name this is a count of floats per vertex, not bytes:
            // one slot for the position Z plus 1-4 slots per additional attribute.
            int dataByteSize = 1;

            foreach (var key in dataKeys)
            {
                if (key == VertexShader.PositionName)
                {
                    continue;
                }

                // The layout is derived from the first vertex of the first triangle only.
                // NOTE(review): this indexes primitives[0], so an empty list fails here whenever
                // dataKeys contains a non-position key. Values of any other type add no slots.
                switch (primitives[0][0][key])
                {
                case float _:
                {
                    dataByteSize += 1;
                    break;
                }

                case Vector2 _:
                {
                    dataByteSize += 2;
                    break;
                }

                case Vector3 _:
                {
                    dataByteSize += 3;
                    break;
                }

                case Vector4 _:
                {
                    dataByteSize += 4;
                    break;
                }
                }
            }

            // One flat array per triangle corner (a, b, c); triangle i occupies
            // [i * dataByteSize, (i + 1) * dataByteSize).
            float[] h_da = new float[dataByteSize * primitives.Count];
            float[] h_db = new float[dataByteSize * primitives.Count];
            float[] h_dc = new float[dataByteSize * primitives.Count];

            for (int i = 0; i < primitives.Count; i++)
            {
                // Slot 0: position Z of each corner.
                h_da[i * dataByteSize] = ((Vector4)primitives[i][0][VertexShader.PositionName]).Z;
                h_db[i * dataByteSize] = ((Vector4)primitives[i][1][VertexShader.PositionName]).Z;
                h_dc[i * dataByteSize] = ((Vector4)primitives[i][2][VertexShader.PositionName]).Z;

                int currentIndex = i * dataByteSize + 1;

                // Remaining slots: attribute components in dataKeys enumeration order.
                foreach (var key in dataKeys)
                {
                    if (key == VertexShader.PositionName)
                    {
                        continue;
                    }

                    switch (primitives[i][0][key])
                    {
                    case float _:
                    {
                        h_da[currentIndex] = (float)primitives[i][0][key];
                        h_db[currentIndex] = (float)primitives[i][1][key];
                        h_dc[currentIndex] = (float)primitives[i][2][key];
                        currentIndex      += 1;
                        break;
                    }

                    case Vector2 _:
                    {
                        Vector2 v0 = (Vector2)primitives[i][0][key];
                        Vector2 v1 = (Vector2)primitives[i][1][key];
                        Vector2 v2 = (Vector2)primitives[i][2][key];

                        h_da[currentIndex]     = v0.X;
                        h_da[currentIndex + 1] = v0.Y;
                        h_db[currentIndex]     = v1.X;
                        h_db[currentIndex + 1] = v1.Y;
                        h_dc[currentIndex]     = v2.X;
                        h_dc[currentIndex + 1] = v2.Y;

                        currentIndex += 2;
                        break;
                    }

                    case Vector3 _:
                    {
                        Vector3 v0 = (Vector3)primitives[i][0][key];
                        Vector3 v1 = (Vector3)primitives[i][1][key];
                        Vector3 v2 = (Vector3)primitives[i][2][key];

                        h_da[currentIndex]     = v0.X;
                        h_da[currentIndex + 1] = v0.Y;
                        h_da[currentIndex + 2] = v0.Z;
                        h_db[currentIndex]     = v1.X;
                        h_db[currentIndex + 1] = v1.Y;
                        h_db[currentIndex + 2] = v1.Z;
                        h_dc[currentIndex]     = v2.X;
                        h_dc[currentIndex + 1] = v2.Y;
                        h_dc[currentIndex + 2] = v2.Z;

                        currentIndex += 3;
                        break;
                    }

                    case Vector4 _:
                    {
                        Vector4 v0 = (Vector4)primitives[i][0][key];
                        Vector4 v1 = (Vector4)primitives[i][1][key];
                        Vector4 v2 = (Vector4)primitives[i][2][key];

                        h_da[currentIndex]     = v0.X;
                        h_da[currentIndex + 1] = v0.Y;
                        h_da[currentIndex + 2] = v0.Z;
                        h_da[currentIndex + 3] = v0.W;
                        h_db[currentIndex]     = v1.X;
                        h_db[currentIndex + 1] = v1.Y;
                        h_db[currentIndex + 2] = v1.Z;
                        h_db[currentIndex + 3] = v1.W;
                        h_dc[currentIndex]     = v2.X;
                        h_dc[currentIndex + 1] = v2.Y;
                        h_dc[currentIndex + 2] = v2.Z;
                        h_dc[currentIndex + 3] = v2.W;

                        currentIndex += 4;
                        break;
                    }
                    }
                }
            }


            // Host-side result arrays. h_dOut and h_dOut_valid_fragment are reassigned from the
            // device variables after the kernel run; h_dOut_valid_pixel is also uploaded below.
            h_dOut = new float[Width * Height * dataByteSize * primitives.Count];
            h_dOut_valid_fragment = new int[Width * Height * primitives.Count];
            h_dOut_valid_pixel    = new int[Width * Height];


            // Allocate vectors in device memory and copy vectors from host memory to device memory
            // Notice the new syntax with implicit conversion operators: Allocation of device memory and data copy is one operation.
            CudaDeviceVariable <float2> dev_v0 = h_v0;
            CudaDeviceVariable <float2> dev_v1 = h_v1;
            CudaDeviceVariable <float2> dev_v2 = h_v2;
            CudaDeviceVariable <float>  dev_da = h_da;
            CudaDeviceVariable <float>  dev_db = h_db;
            CudaDeviceVariable <float>  dev_dc = h_dc;

            CudaDeviceVariable <float> dev_dOut        = new CudaDeviceVariable <float>(Width * Height * dataByteSize * primitives.Count);
            CudaDeviceVariable <int>   dev_dOut_valid  = new CudaDeviceVariable <int>(Width * Height * primitives.Count);
            CudaDeviceVariable <int>   dev_dOut_valid2 = h_dOut_valid_pixel;


            // Launch configuration: 8x8x8 blocks; each grid dimension is the integer quotient plus
            // one, so the grid always covers the window in x/y and primitives.Count * dataByteSize in z.
            dim3 windowSize = new dim3(Width, Height);
            dim3 blockSize  = new dim3(8, 8, 8);
            dim3 gridSize   = new dim3(windowSize.x / blockSize.x + 1, windowSize.y / blockSize.y + 1, ((uint)primitives.Count * (uint)dataByteSize) / blockSize.z + 1);

            baryKernel.BlockDimensions = blockSize;
            baryKernel.GridDimensions  = gridSize;


            baryKernel.Run(dev_v0.DevicePointer,
                           dev_v1.DevicePointer,
                           dev_v2.DevicePointer,
                           dataByteSize,
                           primitives.Count,
                           dev_da.DevicePointer,
                           dev_db.DevicePointer,
                           dev_dc.DevicePointer,
                           dev_dOut.DevicePointer,
                           dev_dOut_valid.DevicePointer,
                           dev_dOut_valid2.DevicePointer,
                           Width,
                           Height);


            // Copy result from device memory to host memory
            // h_dOut, h_dOut_valid_fragment and h_dOut_valid_pixel receive the results
            h_dOut = dev_dOut;
            h_dOut_valid_fragment = dev_dOut_valid;
            h_dOut_valid_pixel    = dev_dOut_valid2;


            //Cleanup
            // NOTE(review): this is skipped if anything above throws, leaving the device variables undisposed.
            if (dev_v0 != null)
            {
                dev_v0.Dispose();
            }

            if (dev_v1 != null)
            {
                dev_v1.Dispose();
            }

            if (dev_v2 != null)
            {
                dev_v2.Dispose();
            }

            if (dev_da != null)
            {
                dev_da.Dispose();
            }

            if (dev_db != null)
            {
                dev_db.Dispose();
            }

            if (dev_dc != null)
            {
                dev_dc.Dispose();
            }

            if (dev_dOut != null)
            {
                dev_dOut.Dispose();
            }

            if (dev_dOut_valid != null)
            {
                dev_dOut_valid.Dispose();
            }

            if (dev_dOut_valid2 != null)
            {
                dev_dOut_valid2.Dispose();
            }

            OutDataThreaded = new BarycentricReturnMultiple(Width, Height);


            // Strides used to read the flat result arrays: one row, one Width x Height plane,
            // and one triangle (dataByteSize planes).
            int dataRowSize       = Width;
            int dataGridSize      = dataRowSize * Height;
            int triangleBlockSize = dataGridSize * dataByteSize;

            // Worker i handles columns x = i, i + ThreadCount, ..., so different workers
            // write to different [x, y] cells of the output.
            Parallel.For(0, ThreadCount, (i) =>
            {
                for (int x = i; x < Width; x += ThreadCount)
                {
                    for (int y = 0; y < Height; y++)
                    {
                        // Skip pixels that no fragment was flagged for.
                        if (h_dOut_valid_pixel[x + y * dataRowSize] == 0)
                        {
                            continue;
                        }

                        for (int z = 0; z < primitives.Count; z++)
                        {
                            // Skip triangles that did not produce a fragment at this pixel.
                            if (h_dOut_valid_fragment[x + y * dataRowSize + z * dataGridSize] == 0)
                            {
                                continue;
                            }

                            // Index of slot 0 (depth) for triangle z at this pixel; slot k sits
                            // k * dataGridSize further on.
                            int dataBaseIndex = x + y * dataRowSize + z * triangleBlockSize;

                            // Per-pixel containers are created lazily on the first fragment.
                            if (OutDataThreaded.Depths[x, y] == null)
                            {
                                OutDataThreaded.Depths[x, y]        = new List <float>();
                                OutDataThreaded.FragmentData[x, y]  = new Dictionary <string, IList>();
                                OutDataThreaded.FragmentCount[x, y] = 0;
                            }

                            OutDataThreaded.Depths[x, y].Add(h_dOut[dataBaseIndex]);
                            OutDataThreaded.FragmentCount[x, y]++;


                            // Slot 0 is the depth and was already handled above
                            int currentDataPoint = 1;
                            foreach (var key in dataKeys)
                            {
                                if (key == VertexShader.PositionName)
                                {
                                    continue;
                                }

                                if (!OutDataThreaded.FragmentData[x, y].ContainsKey(key))
                                {
                                    OutDataThreaded.FragmentData[x, y].Add(key, new List <object>());
                                }

                                // Rebuild the attribute value with the same type it had on the input vertex.
                                switch (primitives[z][0][key])
                                {
                                case float _:
                                    {
                                        OutDataThreaded.FragmentData[x, y][key].Add((float)h_dOut[dataBaseIndex + (currentDataPoint + 0) * dataGridSize]);
                                        currentDataPoint += 1;
                                        break;
                                    }

                                case Vector2 _:
                                    {
                                        Vector2 vec2 = new Vector2(h_dOut[dataBaseIndex + (currentDataPoint + 0) * dataGridSize],
                                                                   h_dOut[dataBaseIndex + (currentDataPoint + 1) * dataGridSize]);
                                        OutDataThreaded.FragmentData[x, y][key].Add(vec2);
                                        currentDataPoint += 2;
                                        break;
                                    }

                                case Vector3 _:
                                    {
                                        Vector3 vec3 = new Vector3(h_dOut[dataBaseIndex + (currentDataPoint + 0) * dataGridSize],
                                                                   h_dOut[dataBaseIndex + (currentDataPoint + 1) * dataGridSize],
                                                                   h_dOut[dataBaseIndex + (currentDataPoint + 2) * dataGridSize]);
                                        OutDataThreaded.FragmentData[x, y][key].Add(vec3);
                                        currentDataPoint += 3;
                                        break;
                                    }

                                case Vector4 _:
                                    {
                                        Vector4 vec4 = new Vector4(h_dOut[dataBaseIndex + (currentDataPoint + 0) * dataGridSize],
                                                                   h_dOut[dataBaseIndex + (currentDataPoint + 1) * dataGridSize],
                                                                   h_dOut[dataBaseIndex + (currentDataPoint + 2) * dataGridSize],
                                                                   h_dOut[dataBaseIndex + (currentDataPoint + 3) * dataGridSize]);
                                        OutDataThreaded.FragmentData[x, y][key].Add(vec4);
                                        currentDataPoint += 4;
                                        break;
                                    }
                                }
                            }
                        }
                    }
                }
            });


            return(OutDataThreaded);
        }
Example #53
0
		/// <summary>
		/// image QualityIndex. Buffer is internally allocated and freed. Not affecting Alpha.
		/// </summary>
		/// <param name="src2">2nd source image</param>
		/// <param name="dst">Pointer to the quality index. (3 * sizeof(float))</param>
		public void QualityIndexA(NPPImage_8uC4 src2, CudaDeviceVariable<float> dst)
		{
			int bufferSize = QualityIndexAGetBufferHostSize();

			// The using block releases the temporary buffer even if evaluating the arguments
			// (e.g. a null src2/dst) or the native call throws; previously it leaked in that case.
			using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
			{
				status = NPPNativeMethods.NPPi.QualityIndex.nppiQualityIndex_8u32f_AC4R(_devPtrRoi, _pitch, src2.DevicePointerRoi, src2.Pitch, _sizeRoi, dst.DevicePointer, buffer.DevicePointer);
				Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiQualityIndex_8u32f_AC4R", status));
			}

			// As before, the status is only checked once the buffer has been freed.
			NPPException.CheckNppStatus(status, this);
		}
Example #54
0
        /// <summary>
        /// Form closing handler: clears the <c>isRunning</c> flag, then disposes every resource
        /// field that has been created.
        /// </summary>
        private void Form1_FormClosing(object sender, FormClosingEventArgs e)
        {
            isRunning = false;

            // Cleanup: each field is skipped when it is still null.
            graphicsres?.Dispose();
            g_mparticles?.Dispose();
            stopwatch?.Dispose();

            texref?.Dispose();
            g_dvfield?.Dispose();
            g_vxfield?.Dispose();
            g_vyfield?.Dispose();

            g_planc2r?.Dispose();
            g_planr2c?.Dispose();

            g_pVB?.Dispose();
            g_pTexture?.Dispose();

            device?.Dispose();
            d3d?.Dispose();

            ctx?.Dispose();
        }
Example #55
0
		/// <summary>
		/// image NormRel_L2. Buffer is internally allocated and freed. Not affecting Alpha.
		/// </summary>
		/// <param name="tpl">template image.</param>
		/// <param name="pNormRel">Pointer to the computed relative error for the L2 norm of two images. (3 * sizeof(double))</param>
		public void NormRel_L2A(NPPImage_8uC4 tpl, CudaDeviceVariable<double> pNormRel)
		{
			int bufferSize = NormRelL2AGetBufferHostSize();

			// The using block releases the temporary buffer even if evaluating the arguments
			// (e.g. a null tpl/pNormRel) or the native call throws; previously it leaked in that case.
			using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
			{
				status = NPPNativeMethods.NPPi.NormRel.nppiNormRel_L2_8u_AC4R(_devPtrRoi, _pitch, tpl.DevicePointerRoi, tpl.Pitch, _sizeRoi, pNormRel.DevicePointer, buffer.DevicePointer);
				Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiNormRel_L2_8u_AC4R", status));
			}

			// As before, the status is only checked once the buffer has been freed.
			NPPException.CheckNppStatus(status, this);
		}
Example #56
0
        /// <summary>
        /// Histogram with evenly distributed bins. Buffer is internally allocated and freed.
        /// </summary>
        /// <param name="histogram">Allocated device memory receiving the histogram. The number of levels
        /// passed to NPP is <c>histogram.Size + 1</c>, i.e. the histogram holds nLevels - 1 entries.</param>
        /// <param name="nLowerLevel">Lower boundary of lowest level bin. E.g. 0 for [0..255]</param>
        /// <param name="nUpperLevel">Upper boundary of highest level bin. E.g. 256 for [0..255]</param>
        public void HistogramEven(CudaDeviceVariable<int> histogram, int nLowerLevel, int nUpperLevel)
        {
            int bufferSize = HistogramEvenGetBufferSize(histogram.Size + 1);

            // The using block releases the temporary buffer even if the native call throws;
            // previously it leaked in that case.
            using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
            {
                status = NPPNativeMethods.NPPi.Histogram.nppiHistogramEven_16s_C1R(_devPtrRoi, _pitch, _sizeRoi, histogram.DevicePointer, histogram.Size + 1, nLowerLevel, nUpperLevel, buffer.DevicePointer);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiHistogramEven_16s_C1R", status));
            }

            // As before, the status is only checked once the buffer has been freed.
            NPPException.CheckNppStatus(status, this);
        }
Example #57
0
		/// <summary>
		/// Result pixel value is the median of pixel values under the rectangular mask region, ignoring alpha channel.
		/// Buffer is internally allocated and freed.
		/// </summary>
		/// <param name="dst">Destination-Image</param>
		/// <param name="oMaskSize">Width and Height of the neighborhood region for the local Median operation.</param>
		/// <param name="oAnchor">X and Y offsets of the kernel origin frame of reference relative to the source pixel.</param>
		public void FilterMedianA(NPPImage_8uC4 dst, NppiSize oMaskSize, NppiPoint oAnchor)
		{
			int bufferSize = FilterMedianGetBufferHostSizeA(oMaskSize);

			// The using block releases the temporary buffer even if evaluating the arguments
			// (e.g. a null dst) or the native call throws; previously it leaked in that case.
			using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
			{
				status = NPPNativeMethods.NPPi.ImageMedianFilter.nppiFilterMedian_8u_AC4R(_devPtrRoi, _pitch, dst.DevicePointerRoi, dst.Pitch, _sizeRoi, oMaskSize, oAnchor, buffer.DevicePointer);
				Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiFilterMedian_8u_AC4R", status));
			}

			// As before, the status is only checked once the buffer has been freed.
			NPPException.CheckNppStatus(status, this);
		}
 /// <summary>
 /// Disposes <c>Xopt</c> (when set) and then the base class resources.
 /// </summary>
 public override void Dispose()
 {
     try
     {
         // Null-safe so that disposing an instance whose Xopt was never created does not throw.
         Xopt?.Dispose();
     }
     finally
     {
         // Always release the base resources, even if disposing Xopt fails.
         base.Dispose();
     }
 }
Example #59
0
		/// <summary>
		/// image average relative error. User buffer is internally allocated and freed.
		/// </summary>
		/// <param name="src2">2nd source image</param>
		/// <param name="pError">Pointer to the computed error.</param>
		public void AverageRelativeError(NPPImage_32sC4 src2, CudaDeviceVariable<double> pError)
		{
			int bufferSize = AverageRelativeErrorGetBufferHostSize();

			// The using block releases the temporary buffer even if evaluating the arguments
			// (e.g. a null src2/pError) or the native call throws; previously it leaked in that case.
			using (CudaDeviceVariable<byte> buffer = new CudaDeviceVariable<byte>(bufferSize))
			{
				status = NPPNativeMethods.NPPi.AverageRelativeError.nppiAverageRelativeError_32s_C4R(_devPtrRoi, _pitch, src2.DevicePointerRoi, src2.Pitch, _sizeRoi, pError.DevicePointer, buffer.DevicePointer);
				Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "nppiAverageRelativeError_32s_C4R", status));
			}

			// As before, the status is only checked once the buffer has been freed.
			NPPException.CheckNppStatus(status, this);
		}
        static void Main(string[] args)
        {
            try
            {
                if (args.Length > 0)
                {
                    deviceID = int.Parse(args[0]);
                }
            }
            catch (Exception ex)
            {
                Logger.Log(LogLevel.Error, "Device ID parse error");
            }

            try
            {
                if (args.Length > 1)
                {
                    port = int.Parse(args[1]);
                    Comms.ConnectToMaster(port);
                }
                else
                {
                    TEST = true;
                    Logger.CopyToConsole = true;
                    CGraph.ShowCycles    = true;
                }
            }
            catch (Exception ex)
            {
                Logger.Log(LogLevel.Error, "Master connection error");
            }

            try
            {
                if (args.Length > 3)
                {
                    gpuCount = int.Parse(args[3]);
                    fastCuda = gpuCount <= (Environment.ProcessorCount / 2);
                    if (fastCuda)
                    {
                        Logger.Log(LogLevel.Info, "Using single GPU blocking mode");
                    }
                }
            }
            catch
            {
            }

            if (TEST)
            {
                currentJob = nextJob = new Job()
                {
                    jobID     = 0,
                    k0        = 0xf4956dc403730b01L,
                    k1        = 0xe6d45de39c2a5a3eL,
                    k2        = 0xcbf626a8afee35f6L,
                    k3        = 0x4307b94b1a0c9980L,
                    pre_pow   = TestPrePow,
                    timestamp = DateTime.Now
                };
            }
            else
            {
                currentJob = nextJob = new Job()
                {
                    jobID     = 0,
                    k0        = 0xf4956dc403730b01L,
                    k1        = 0xe6d45de39c2a5a3eL,
                    k2        = 0xcbf626a8afee35f6L,
                    k3        = 0x4307b94b1a0c9980L,
                    pre_pow   = TestPrePow,
                    timestamp = DateTime.Now
                };

                if (!Comms.IsConnected())
                {
                    Console.WriteLine("Master connection failed, aborting");
                    Logger.Log(LogLevel.Error, "No master connection, exitting!");
                    return;
                }

                if (deviceID < 0)
                {
                    int devCnt             = CudaContext.GetDeviceCount();
                    GpuDevicesMessage gpum = new GpuDevicesMessage()
                    {
                        devices = new List <GpuDevice>(devCnt)
                    };
                    for (int i = 0; i < devCnt; i++)
                    {
                        string name = CudaContext.GetDeviceName(i);
                        var    info = CudaContext.GetDeviceInfo(i);
                        gpum.devices.Add(new GpuDevice()
                        {
                            deviceID = i, name = name, memory = info.TotalGlobalMemory
                        });
                    }
                    //Console.WriteLine(devCnt);
                    Comms.gpuMsg = gpum;
                    Comms.SetEvent();
                    //Console.WriteLine("event fired");
                    Task.Delay(1000).Wait();
                    //Console.WriteLine("closing");
                    Comms.Close();
                    return;
                }
            }


            try
            {
                var assembly       = Assembly.GetEntryAssembly();
                var resourceStream = assembly.GetManifestResourceStream("CudaSolver.kernel_x64.ptx");
                ctx = new CudaContext(deviceID, !fastCuda ? (CUCtxFlags.BlockingSync | CUCtxFlags.MapHost) : CUCtxFlags.MapHost);

                meanSeedA = ctx.LoadKernelPTX(resourceStream, "FluffySeed2A");
                meanSeedA.BlockDimensions = 128;
                meanSeedA.GridDimensions  = 2048;
                meanSeedA.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                meanSeedB = ctx.LoadKernelPTX(resourceStream, "FluffySeed2B");
                meanSeedB.BlockDimensions = 128;
                meanSeedB.GridDimensions  = 2048;
                meanSeedB.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                meanSeedB_4 = ctx.LoadKernelPTX(resourceStream, "FluffySeed2B");
                meanSeedB_4.BlockDimensions = 128;
                meanSeedB_4.GridDimensions  = 1024;
                meanSeedB_4.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                meanRound = ctx.LoadKernelPTX(resourceStream, "FluffyRound");
                meanRound.BlockDimensions = 512;
                meanRound.GridDimensions  = 4096;
                meanRound.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                meanRound_2 = ctx.LoadKernelPTX(resourceStream, "FluffyRound");
                meanRound_2.BlockDimensions = 512;
                meanRound_2.GridDimensions  = 2048;
                meanRound_2.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                meanRoundJoin = ctx.LoadKernelPTX(resourceStream, "FluffyRound_J");
                meanRoundJoin.BlockDimensions = 512;
                meanRoundJoin.GridDimensions  = 4096;
                meanRoundJoin.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                meanTail = ctx.LoadKernelPTX(resourceStream, "FluffyTail");
                meanTail.BlockDimensions = 1024;
                meanTail.GridDimensions  = 4096;
                meanTail.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1;

                meanRecover = ctx.LoadKernelPTX(resourceStream, "FluffyRecovery");
                meanRecover.BlockDimensions = 256;
                meanRecover.GridDimensions  = 2048;
                meanRecover.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1;
            }
            catch (Exception ex)
            {
                Logger.Log(LogLevel.Error, "Unable to create kernels: " + ex.Message);
                Task.Delay(500).Wait();
                Comms.Close();
                return;
            }

            try
            {
                d_buffer    = new CudaDeviceVariable <ulong>(BUFFER_SIZE_U32);
                d_bufferMid = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_B * 8));
                d_bufferB   = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_A * 8));

                d_indexesA = new CudaDeviceVariable <uint>(INDEX_SIZE * 2);
                d_indexesB = new CudaDeviceVariable <uint>(INDEX_SIZE * 2);

                Array.Clear(h_indexesA, 0, h_indexesA.Length);
                Array.Clear(h_indexesB, 0, h_indexesA.Length);

                d_indexesA = h_indexesA;
                d_indexesB = h_indexesB;

                streamPrimary   = new CudaStream(CUStreamFlags.NonBlocking);
                streamSecondary = new CudaStream(CUStreamFlags.NonBlocking);
            }
            catch (Exception ex)
            {
                Task.Delay(200).Wait();
                Logger.Log(LogLevel.Error, $"Out of video memory! Only {ctx.GetFreeDeviceMemorySize()} free");
                Task.Delay(500).Wait();
                Comms.Close();
                return;
            }

            try
            {
                AllocateHostMemory(true, ref h_a, ref hAligned_a, 1024 * 1024 * 32);
            }
            catch (Exception ex)
            {
                Logger.Log(LogLevel.Error, "Unable to create pinned memory.");
                Task.Delay(500).Wait();
                Comms.Close();
                return;
            }

            int loopCnt = 0;

            while (!Comms.IsTerminated)
            {
                try
                {
                    if (!TEST && (Comms.nextJob.pre_pow == null || Comms.nextJob.pre_pow == "" || Comms.nextJob.pre_pow == TestPrePow))
                    {
                        Logger.Log(LogLevel.Info, string.Format("Waiting for job...."));
                        Task.Delay(1000).Wait();
                        continue;
                    }

                    if (!TEST && ((currentJob.pre_pow != Comms.nextJob.pre_pow) || (currentJob.origin != Comms.nextJob.origin)))
                    {
                        currentJob           = Comms.nextJob;
                        currentJob.timestamp = DateTime.Now;
                    }

                    if (!TEST && (currentJob.timestamp.AddMinutes(30) < DateTime.Now) && Comms.lastIncoming.AddMinutes(30) < DateTime.Now)
                    {
                        Logger.Log(LogLevel.Info, string.Format("Job too old..."));
                        Task.Delay(1000).Wait();
                        continue;
                    }

                    // test runs only once
                    if (TEST && loopCnt++ > 100)
                    {
                        Comms.IsTerminated = true;
                    }

                    Solution s;
                    while (graphSolutions.TryDequeue(out s))
                    {
                        meanRecover.SetConstantVariable <ulong>("recovery", s.GetUlongEdges());
                        d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                        meanRecover.RunAsync(streamPrimary.Stream, s.job.k0, s.job.k1, s.job.k2, s.job.k3, d_indexesB.DevicePointer);
                        streamPrimary.Synchronize();
                        s.nonces = new uint[40];
                        d_indexesB.CopyToHost(s.nonces, 0, 0, 40 * 4);
                        s.nonces = s.nonces.OrderBy(n => n).ToArray();
                        lock (Comms.graphSolutionsOut)
                        {
                            Comms.graphSolutionsOut.Enqueue(s);
                        }
                        Comms.SetEvent();
                    }
                    uint[] count;
                    do
                    {
                        if (!TEST && ((currentJob.pre_pow != Comms.nextJob.pre_pow) || (currentJob.origin != Comms.nextJob.origin)))
                        {
                            currentJob           = Comms.nextJob;
                            currentJob.timestamp = DateTime.Now;
                        }
                        currentJob = currentJob.Next();

                        Logger.Log(LogLevel.Debug, string.Format("GPU NV{4}:Trimming #{4}: {0} {1} {2} {3}", currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, currentJob.jobID, deviceID));

                        timer.Restart();

                        d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                        d_indexesB.MemsetAsync(0, streamPrimary.Stream);

                        meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer, d_indexesB.DevicePointer);
                        meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 0);
                        meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 1, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 16);
                        meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 2, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 32);
                        meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 3, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 48);

                        d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                        meanRound_2.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 2, d_bufferB.DevicePointer, d_indexesA.DevicePointer + (2048 * 4), d_indexesB.DevicePointer + (4096 * 4), DUCK_EDGES_A, DUCK_EDGES_B / 2);
                        meanRound_2.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer - (BUFFER_SIZE_B * 8), d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_A, DUCK_EDGES_B / 2);
                        d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                        meanRoundJoin.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer - (BUFFER_SIZE_B * 8), d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2);

                        //d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                        //meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B, DUCK_EDGES_B / 2);
                        d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                        meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2);
                        d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                        meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2);
                        d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                        meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 4);

                        for (int i = 0; i < trimRounds; i++)
                        {
                            d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                            meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4);
                            d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                            meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4);
                        }

                        d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                        meanTail.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer);

                        ctx.Synchronize();
                        streamPrimary.Synchronize();

                        count = new uint[2];
                        d_indexesA.CopyToHost(count, 0, 0, 8);

                        if (count[0] > 4194304)
                        {
                            // trouble
                            count[0] = 4194304;
                            // log
                        }

                        hAligned_a.AsyncCopyFromDevice(d_buffer.DevicePointer, 0, 0, count[0] * 8, streamPrimary.Stream);
                        streamPrimary.Synchronize();
                        System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, ((int)count[0] * 8) / sizeof(int));

                        timer.Stop();
                        currentJob.solvedAt = DateTime.Now;
                        currentJob.trimTime = timer.ElapsedMilliseconds;

                        //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]);
                        Logger.Log(LogLevel.Info, string.Format("GPU NV{2}:     Trimmed in {0}ms to {1} edges, h {3}", timer.ElapsedMilliseconds, count[0], deviceID, currentJob.height));
                    }while((currentJob.height != Comms.nextJob.height) && (!Comms.IsTerminated) && (!TEST));

                    if (TEST)
                    {
                        //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]);

                        CGraph cg = FinderBag.GetFinder();
                        if (cg == null)
                        {
                            continue;
                        }

                        cg.SetEdges(h_a, (int)count[0]);
                        cg.SetHeader(currentJob);

                        //currentJob = currentJob.Next();

                        Task.Factory.StartNew(() =>
                        {
                            Stopwatch sw = new Stopwatch();
                            sw.Start();

                            if (count[0] < 200000)
                            {
                                try
                                {
                                    if (findersInFlight++ < 3)
                                    {
                                        Stopwatch cycleTime = new Stopwatch();
                                        cycleTime.Start();
                                        cg.FindSolutions(graphSolutions);
                                        cycleTime.Stop();
                                        AdjustTrims(cycleTime.ElapsedMilliseconds);
                                        if (graphSolutions.Count > 0)
                                        {
                                            solutions++;
                                        }
                                    }
                                    else
                                    {
                                        Logger.Log(LogLevel.Warning, "CPU overloaded!");
                                    }
                                }
                                catch (Exception ex)
                                {
                                    Logger.Log(LogLevel.Error, "Cycle finder error" + ex.Message);
                                }
                                finally
                                {
                                    findersInFlight--;
                                    FinderBag.ReturnFinder(cg);
                                }
                            }

                            sw.Stop();

                            if (++trims % 50 == 0)
                            {
                                Console.ForegroundColor = ConsoleColor.Green;
                                Console.WriteLine("SOLS: {0}/{1} - RATE: {2:F1}", solutions, trims, (float)trims / solutions);
                                Console.ResetColor();
                            }
                            //Console.WriteLine("Finder completed in {0}ms on {1} edges with {2} solution(s)", sw.ElapsedMilliseconds, count[0], graphSolutions.Count);
                            //Console.WriteLine("Duped edges: {0}", cg.dupes);
                            Logger.Log(LogLevel.Info, string.Format("Finder completed in {0}ms on {1} edges with {2} solution(s) and {3} dupes", sw.ElapsedMilliseconds, count[0], graphSolutions.Count, cg.dupes));
                        });

                        //h_indexesA = d_indexesA;
                        //h_indexesB = d_indexesB;

                        //var sumA = h_indexesA.Sum(e => e);
                        //var sumB = h_indexesB.Sum(e => e);

                        ;
                    }
                    else
                    {
                        CGraph cg = FinderBag.GetFinder();
                        cg.SetEdges(h_a, (int)count[0]);
                        cg.SetHeader(currentJob);

                        Task.Factory.StartNew(() =>
                        {
                            if (count[0] < 200000)
                            {
                                try
                                {
                                    if (findersInFlight++ < 3)
                                    {
                                        Stopwatch cycleTime = new Stopwatch();
                                        cycleTime.Start();
                                        cg.FindSolutions(graphSolutions);
                                        cycleTime.Stop();
                                        AdjustTrims(cycleTime.ElapsedMilliseconds);
                                        if (graphSolutions.Count > 0)
                                        {
                                            solutions++;
                                        }
                                    }
                                    else
                                    {
                                        Logger.Log(LogLevel.Warning, "CPU overloaded!");
                                    }
                                }
                                catch (Exception ex)
                                {
                                    Logger.Log(LogLevel.Error, "Cycle finder crashed: " + ex.Message);
                                }
                                finally
                                {
                                    findersInFlight--;
                                    FinderBag.ReturnFinder(cg);
                                }
                            }
                        });
                    }
                }
                catch (Exception ex)
                {
                    Logger.Log(LogLevel.Error, "Critical error in main cuda loop " + ex.Message);
                    Task.Delay(5000).Wait();
                }
            }

            // clean up
            try
            {
                Task.Delay(500).Wait();

                Comms.Close();

                d_buffer.Dispose();
                d_indexesA.Dispose();
                d_indexesB.Dispose();

                streamPrimary.Dispose();
                streamSecondary.Dispose();

                hAligned_a.Dispose();

                if (ctx != null)
                {
                    ctx.Dispose();
                }
            }
            catch { }
        }