Example #1
1
        private void initGLAndCuda()
        {
            //Create render target control
            m_renderControl = new OpenTK.GLControl(GraphicsMode.Default, 1, 0, GraphicsContextFlags.Default);
            m_renderControl.Dock = DockStyle.Fill;
            m_renderControl.BackColor = Color.White;
            m_renderControl.BorderStyle = BorderStyle.FixedSingle;
            m_renderControl.KeyDown += new KeyEventHandler(m_renderControl_KeyDown);
            m_renderControl.MouseMove += new MouseEventHandler(m_renderControl_MouseMove);
            m_renderControl.MouseDown += new MouseEventHandler(m_renderControl_MouseDown);
            m_renderControl.SizeChanged += new EventHandler(m_renderControl_SizeChanged);

            panel1.Controls.Add(m_renderControl);
            Console.WriteLine("   OpenGL device is Available");

            int deviceID = CudaContext.GetMaxGflopsDeviceId();

            ctx = CudaContext.CreateOpenGLContext(deviceID, CUCtxFlags.BlockingSync);
            string console = string.Format("CUDA device [{0}] has {1} Multi-Processors", ctx.GetDeviceName(), ctx.GetDeviceInfo().MultiProcessorCount);
            Console.WriteLine(console);

            CUmodule module = ctx.LoadModulePTX("kernel.ptx");

            addForces_k = new CudaKernel("addForces_k", module, ctx);
            advectVelocity_k = new CudaKernel("advectVelocity_k", module, ctx);
            diffuseProject_k = new CudaKernel("diffuseProject_k", module, ctx);
            updateVelocity_k = new CudaKernel("updateVelocity_k", module, ctx);
            advectParticles_k = new CudaKernel("advectParticles_OGL", module, ctx);

            hvfield = new cData[DS];
            dvfield = new CudaPitchedDeviceVariable<cData>(DIM, DIM);
            tPitch = dvfield.Pitch;

            dvfield.CopyToDevice(hvfield);

            vxfield = new CudaDeviceVariable<cData>(DS);
            vyfield = new CudaDeviceVariable<cData>(DS);

            // Create particle array
            particles = new cData[DS];
            initParticles(particles, DIM, DIM);

            // TODO: update kernels to use the new unpadded memory layout for perf
            // rather than the old FFTW-compatible layout
            planr2c = new CudaFFTPlan2D(DIM, DIM, cufftType.R2C, Compatibility.FFTWPadding);
            planc2r = new CudaFFTPlan2D(DIM, DIM, cufftType.C2R, Compatibility.FFTWPadding);

            GL.GenBuffers(1, out vbo);
            GL.BindBuffer(BufferTarget.ArrayBuffer, vbo);
            GL.BufferData<cData>(BufferTarget.ArrayBuffer, new IntPtr(cData.SizeOf * DS), particles, BufferUsageHint.DynamicDraw);
            int bsize;
            GL.GetBufferParameter(BufferTarget.ArrayBuffer, BufferParameterName.BufferSize, out bsize);

            if (bsize != DS * cData.SizeOf)
                throw new Exception("Sizes don't match.");

            GL.BindBuffer(BufferTarget.ArrayBuffer, 0);

            cuda_vbo_resource = new CudaGraphicsInteropResourceCollection();
            cuda_vbo_resource.Add(new CudaOpenGLBufferInteropResource(vbo, CUGraphicsRegisterFlags.None));

            texref = new CudaTextureArray2D(advectVelocity_k, "texref", CUAddressMode.Wrap, CUFilterMode.Linear, 0, CUArrayFormat.Float, DIM, DIM, CudaArray2DNumChannels.Two);

            stopwatch = new CudaStopWatch(CUEventFlags.Default);

            reshape();
            isInit = true;
            display();
        }
Example #2
0
    public DimensionReductionFitness(
        CudaContext context,
        IDimensionAccuracy accuracyFunc,
        int popSize,
        int genLength
        )
    {
        this.accuracyFunc = accuracyFunc;
        this.popSize      = popSize;
        this.context      = context;

        deviceVectorSizes = new CudaDeviceVariable <int>(popSize);


        fitnessKernel = context.LoadKernel(
            "kernels/dimensionsReductions.ptx",
            "fitnessFunction"
            );
        fitnessKernel.GridDimensions  = 1;
        fitnessKernel.BlockDimensions = popSize;
        Alpha = 0.7f;

        sizeAndIndecesKernel = context.LoadKernel("kernels/Common.ptx", "countVectorsIndeces");
        sizeAndIndecesKernel.SetConstantVariable("genLength", genLength);
        sizeAndIndecesKernel.GridDimensions  = 1;
        sizeAndIndecesKernel.BlockDimensions = popSize;
        populationIndeces = new CudaDeviceVariable <int>(genLength * popSize);
    }
Example #3
0
        internal CudaError LaunchKernelWithStreamBinding(
            CudaStream stream,
            CudaKernel kernel,
            RuntimeKernelConfig config,
            IntPtr args,
            IntPtr kernelArgs)
        {
            var binding = stream.BindScoped();

            var result = LaunchKernel(
                kernel.FunctionPtr,
                config.GridDim.X,
                config.GridDim.Y,
                config.GridDim.Z,
                config.GroupDim.X,
                config.GroupDim.Y,
                config.GroupDim.Z,
                config.SharedMemoryConfig.DynamicArraySize,
                stream.StreamPtr,
                args,
                kernelArgs);

            binding.Recover();
            return(result);
        }
        //Test CUDA kernel for complex multiplication
        public void test(int N)
        {
            CudaContext ctx    = new CudaContext();
            CudaKernel  kernel = ctx.LoadKernel("kernel.ptx", "ComplexMultCUDA");

            kernel.GridDimensions  = N;
            kernel.BlockDimensions = 1;
            double2[] a = new double2[N];
            double2[] b = new double2[N];
            double2[] c = new double2[N];
            for (int i = 0; i < N; i++)
            {
                a[i].x = 1;
                a[i].y = 3;
                b[i].x = 2;
                b[i].y = 2;
            }

            CudaDeviceVariable <double2> d_a = null;
            CudaDeviceVariable <double2> d_b = null;

            try
            {
                d_a = a;
                d_b = b;
            }
            catch (Exception e)
            {
                Console.WriteLine("{0} Exception caught.", e);
                return;
            }
            kernel.Run(d_a.DevicePointer, d_b.DevicePointer, N);
            c = d_b;
            Console.WriteLine("C.last()={0}+i{1}", c.Last().x, c.Last().y);
        }
            private nvrtcResult LoadKernel(string kernelSourceFile, out CudaKernel kernel)
            {
                nvrtcResult result;

                kernel = null;

                using (var compiler = new CudaRuntimeCompiler(File.ReadAllText(kernelSourceFile), Path.GetFileName(kernelSourceFile)))
                {
                    try
                    {
                        compiler.Compile(new string[0]);
                        result = nvrtcResult.Success;
                    }
                    catch (NVRTCException ex)
                    {
                        result = ex.NVRTCError;
                    }

                    var outputFileWithoutExt = Path.Combine(Path.GetDirectoryName(kernelSourceFile), Path.GetFileNameWithoutExtension(kernelSourceFile));
                    File.WriteAllText(outputFileWithoutExt + ".ptx.log", compiler.GetLogAsString());

                    if (result == nvrtcResult.Success)
                    {
                        var ptx = compiler.GetPTX();
                        kernel = _CudaContext.LoadKernelFatBin(ptx, "Run");
                        File.WriteAllBytes(outputFileWithoutExt + ".ptx", ptx);
                    }
                }
                return(result);
            }
Example #6
0
        public Layer(Int3 size, Layer prev, ref CudaContext ctx, int type)
        {
            this.ctx = ctx;

            this.type = type;
            this.size = size;

            data  = new float[size.Mul];
            bias  = new float[size.Mul];
            error = new float[size.Mul];

            generateWeights(size, prev.size, kernelType.fullyConnected);

            forward = ctx.LoadKernel("kernel.ptx", "Forward");
            forward.GridDimensions  = new dim3(size.x, size.y, size.z);
            forward.BlockDimensions = new dim3(prev.size.x, prev.size.y, prev.size.z);

            back = ctx.LoadKernel("kernel.ptx", "Backprop");
            back.GridDimensions  = new dim3(size.x, size.y, size.z);
            back.BlockDimensions = new dim3(prev.size.x, prev.size.y, prev.size.z);

            clear = ctx.LoadKernel("kernel.ptx", "Clear");
            clear.GridDimensions = new dim3(size.x, size.y, size.z);

            activate = ctx.LoadKernel("kernel.ptx", "Activate");
            activate.GridDimensions = new dim3(size.x, size.y, size.z);

            SoftmaxSigma = ctx.LoadKernel("kernel.ptx", "SoftmaxSigma");
            SoftmaxSigma.GridDimensions = new dim3(size.x, size.y, size.z);

            SoftmaxFinal = ctx.LoadKernel("kernel.ptx", "SoftmaxFinal");
            SoftmaxFinal.BlockDimensions = new dim3(size.x, size.y, size.z);

            SoftmaxVal = new float[] { 0 };
        }
Example #7
0
        public Layer(FileLayer fl, ref CudaContext ctx)
        {
            this.ctx = ctx;

            type = fl.type;
            size = fl.size;

            data  = new float[fl.size.Mul];
            bias  = new float[fl.size.Mul];
            error = new float[fl.size.Mul];

            forward = ctx.LoadKernel("kernel.ptx", "Forward");
            forward.GridDimensions  = new dim3(size.x, size.y, size.z);
            forward.BlockDimensions = new dim3(fl.prevSize.x, fl.prevSize.y, fl.prevSize.z);

            back = ctx.LoadKernel("kernel.ptx", "Backprop");
            back.GridDimensions  = new dim3(size.x, size.y, size.z);
            back.BlockDimensions = new dim3(fl.prevSize.x, fl.prevSize.y, fl.prevSize.z);

            clear = ctx.LoadKernel("kernel.ptx", "Clear");
            clear.GridDimensions = new dim3(size.x, size.y, size.z);

            activate = ctx.LoadKernel("kernel.ptx", "Activate");
            activate.GridDimensions = new dim3(size.x, size.y, size.z);
        }
Example #8
0
		/// <summary>
		/// Creates a new surface from array memory. Allocates new array.
		/// </summary>
		/// <param name="kernel"></param>
		/// <param name="surfName"></param>
		/// <param name="flags"></param>
		/// <param name="format"></param>
		/// <param name="width">In elements</param>
		/// <param name="height">In elements</param>
		/// <param name="depth">In elements</param>
		/// <param name="numChannels"></param>
		/// <param name="arrayFlags"></param>
		public CudaSurface(CudaKernel kernel, string surfName, CUSurfRefSetFlags flags, CUArrayFormat format, SizeT width, SizeT height, SizeT depth, CudaArray3DNumChannels numChannels, CUDAArray3DFlags arrayFlags)
		{
			_surfref = new CUsurfref();
			res = DriverAPINativeMethods.ModuleManagement.cuModuleGetSurfRef(ref _surfref, kernel.CUModule, surfName);
			Debug.WriteLine(String.Format("{0:G}, {1}: {2}, Surface name: {3}", DateTime.Now, "cuModuleGetSurfRef", res, surfName));
			if (res != CUResult.Success) throw new CudaException(res);

			_flags = flags;
			_format = format;
			_height = height;
			_width = width;
			_depth = depth;
			_numChannels = (int)numChannels;
			_name = surfName;
			_module = kernel.CUModule;
			_cufunction = kernel.CUFunction;

			_channelSize = CudaHelperMethods.GetChannelSize(format);
			_dataSize = height * width * depth * _numChannels * _channelSize;
			_array = new CudaArray3D(format, width, height, depth, numChannels, arrayFlags);

			res = DriverAPINativeMethods.SurfaceReferenceManagement.cuSurfRefSetArray(_surfref, _array.CUArray, flags);
			Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuSurfRefSetArray", res));
			if (res != CUResult.Success) throw new CudaException(res);
		}
Example #9
0
        //private CudaKernel kernel1;

        //public Class1()
        //{
        //    //int deviceID = 0;

        //    //CudaContext ctx = new CudaContext(deviceID);
        //    //CUmodule cumodule = ctx.LoadModulePTX(@"C:\work\Sobel\TestCuda\x64\Debug\kernel.ptx");
        //    //kernel1 = new CudaKernel("_Z9matrixSumPdS_iii", cumodule, ctx);
        //}

        public static double[,] TestMatrix(double[][,] a)
        {
            using (CudaContext ctx = new CudaContext(0))
            {
                CUmodule cumodule = ctx.LoadModule(@"C:\work\Sobel\TestCuda\x64\Debug\kernel.ptx");
                var      kernel   = new CudaKernel("_Z9matrixSumPdS_iii", cumodule, ctx);

                int dimZ = a.Length;
                int dimX = a[0].GetLength(0);
                int dimY = a[0].GetLength(1);

                kernel.GridDimensions  = new dim3(28, 28, 1);
                kernel.BlockDimensions = new dim3(1, 1, 1);
                //kernel.BlockDimensions = new dim3(dimX, dimY, 1);

                // Allocate vectors in device memory and copy vectors from host memory to device memory
                CudaDeviceVariable <double> dA = a.ToLinearArray();
                //CudaDeviceVariable<double> dB = ToLinearArray(b);
                CudaDeviceVariable <double> dC = new CudaDeviceVariable <double>(dimX * dimY);

                // Invoke kernel
                kernel.Run(dA.DevicePointer, dC.DevicePointer, dimX, dimY, dimZ);

                // Copy result from device memory to host memory
                double[] c = dC;

                //ctx.FreeMemory(dC.DevicePointer);
                //ctx.FreeMemory(dA.DevicePointer);
                //ctx.Dispose();

                return(ToMultyArray(c, dimX));
            }
        }
Example #10
0
        public static void blaa()
        {
            int num = 10;
            //NewContext creation
            CudaContext cntxt = new CudaContext();

            //Module loading from precompiled .ptx in a project output folder
            CUmodule cumodule = cntxt.LoadModule("kernel.ptx");

            //_Z9addKernelPf - function name, can be found in *.ptx file
            CudaKernel addWithCuda = new CudaKernel("_Z9addKernelPf", cumodule, cntxt);

            //Create device array for data
            CudaDeviceVariable <float> vec1_device = new CudaDeviceVariable <float>(num);

            //Create arrays with data
            float[] vec1 = new float[num];

            //Copy data to device
            vec1_device.CopyToDevice(vec1);

            //Set grid and block dimensions
            addWithCuda.GridDimensions  = new dim3(8, 1, 1);
            addWithCuda.BlockDimensions = new dim3(512, 1, 1);

            //Run the kernel
            addWithCuda.Run(
                vec1_device.DevicePointer);

            //Copy data from device
            vec1_device.CopyToHost(vec1);
        }
Example #11
0
 /// <summary>
 /// Creates a new mipmapped texture from array memory. Allocates a new mipmapped array.
 /// </summary>
 /// <param name="kernel"></param>
 /// <param name="texName"></param>
 /// <param name="addressModeForAllDimensions"></param>
 /// <param name="filterMode"></param>
 /// <param name="flags"></param>
 /// <param name="descriptor"></param>
 /// <param name="numMipmapLevels"></param>
 /// <param name="maxAniso"></param>
 /// <param name="mipmapFilterMode"></param>
 /// <param name="mipmapLevelBias"></param>
 /// <param name="minMipmapLevelClamp"></param>
 /// <param name="maxMipmapLevelClamp"></param>
 public CudaTextureMipmappedArray(CudaKernel kernel, string texName, CUAddressMode addressModeForAllDimensions,
                                  CUFilterMode filterMode, CUTexRefSetFlags flags, CUDAArray3DDescriptor descriptor, uint numMipmapLevels,
                                  uint maxAniso, CUFilterMode mipmapFilterMode, float mipmapLevelBias, float minMipmapLevelClamp, float maxMipmapLevelClamp)
     : this(kernel, texName, addressModeForAllDimensions, addressModeForAllDimensions, addressModeForAllDimensions, filterMode, flags, descriptor,
            numMipmapLevels, maxAniso, mipmapFilterMode, mipmapLevelBias, minMipmapLevelClamp, maxMipmapLevelClamp)
 {
 }
Example #12
0
        /// <summary>
        /// Creates a new surface from array memory.
        /// </summary>
        /// <param name="kernel"></param>
        /// <param name="surfName"></param>
        /// <param name="flags"></param>
        /// <param name="array"></param>
        public CudaSurface(CudaKernel kernel, string surfName, CUSurfRefSetFlags flags, CudaArray3D array)
        {
            _surfref = new CUsurfref();
            res      = DriverAPINativeMethods.ModuleManagement.cuModuleGetSurfRef(ref _surfref, kernel.CUModule, surfName);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}, Surface name: {3}", DateTime.Now, "cuModuleGetSurfRef", res, surfName));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            _flags       = flags;
            _format      = array.Array3DDescriptor.Format;
            _height      = array.Height;
            _width       = array.Width;
            _depth       = array.Depth;
            _numChannels = (int)array.Array3DDescriptor.NumChannels;
            _name        = surfName;
            _module      = kernel.CUModule;
            _cufunction  = kernel.CUFunction;
            _channelSize = CudaHelperMethods.GetChannelSize(array.Array3DDescriptor.Format);
            _dataSize    = array.Height * array.Width * array.Depth * array.Array3DDescriptor.NumChannels * _channelSize;
            _array       = array;

            res = DriverAPINativeMethods.SurfaceReferenceManagement.cuSurfRefSetArray(_surfref, _array.CUArray, flags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuSurfRefSetArray", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
            _isOwner = false;
        }
Example #13
0
 public CUDAPrefixScan(CUmodule module, CudaContext context)
 {
     this.context = context;
     kernelScanExclusiveShared  = new CudaKernel("scanExclusiveShared", module, context);
     kernelScanExclusiveShared2 = new CudaKernel("scanExclusiveShared2", module, context);
     kernelUniformUpdate        = new CudaKernel("uniformUpdate", module, context);
 }
Example #14
0
        public override void DqnStanfordEvaluation(Matrix predictedActionIndices, Matrix chosenActionIndices, Matrix currentRewards, Matrix matchPredictRewards, Matrix nonMatchPredictRewards,
                                                   bool copyInputsFromCpuToGpu = false, bool copyOutputsFromGpuToCpu = false)
        {
            this.VerifyDimentionalityOfMatrices(predictedActionIndices, chosenActionIndices, currentRewards);
            this.VerifyDimentionalityOfMatrices(currentRewards, matchPredictRewards, nonMatchPredictRewards);

            if (copyInputsFromCpuToGpu)
            {
                predictedActionIndices.CopyToCuda();
                chosenActionIndices.CopyToCuda();
                currentRewards.CopyToCuda();
            }

            CudaKernel kernel = InitializeGridsAndThreads("_Z21DqnStanfordEvaluationPfS_S_S_S_i", matchPredictRewards);

            kernel.Run(predictedActionIndices.DeviceData.DevicePointer, chosenActionIndices.DeviceData.DevicePointer, currentRewards.DeviceData.DevicePointer,
                       matchPredictRewards.DeviceData.DevicePointer, nonMatchPredictRewards.DeviceData.DevicePointer, matchPredictRewards.Row);


            if (copyOutputsFromGpuToCpu)
            {
                matchPredictRewards.CopyToCuda();
                nonMatchPredictRewards.CopyToCuda();
            }
        }
Example #15
0
        /// <summary>
        /// Creates a new surface from array memory. Allocates new array.
        /// </summary>
        /// <param name="kernel"></param>
        /// <param name="surfName"></param>
        /// <param name="flags"></param>
        /// <param name="format"></param>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="depth">In elements</param>
        /// <param name="numChannels"></param>
        /// <param name="arrayFlags"></param>
        public CudaSurface(CudaKernel kernel, string surfName, CUSurfRefSetFlags flags, CUArrayFormat format, SizeT width, SizeT height, SizeT depth, CudaArray3DNumChannels numChannels, CUDAArray3DFlags arrayFlags)
        {
            _surfref = new CUsurfref();
            res      = DriverAPINativeMethods.ModuleManagement.cuModuleGetSurfRef(ref _surfref, kernel.CUModule, surfName);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}, Surface name: {3}", DateTime.Now, "cuModuleGetSurfRef", res, surfName));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            _flags       = flags;
            _format      = format;
            _height      = height;
            _width       = width;
            _depth       = depth;
            _numChannels = (int)numChannels;
            _name        = surfName;
            _module      = kernel.CUModule;
            _cufunction  = kernel.CUFunction;

            _channelSize = CudaHelperMethods.GetChannelSize(format);
            _dataSize    = height * width * depth * _numChannels * _channelSize;
            _array       = new CudaArray3D(format, width, height, depth, numChannels, arrayFlags);

            res = DriverAPINativeMethods.SurfaceReferenceManagement.cuSurfRefSetArray(_surfref, _array.CUArray, flags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuSurfRefSetArray", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
            _isOwner = true;
        }
Example #16
0
        private nvrtcResult LoadKernel(string path, out CudaKernel kernel, out string log)
        {
            nvrtcResult result;

            using (var rtc = new CudaRuntimeCompiler(File.ReadAllText(path), Path.GetFileName(path)))
            {
                try
                {
                    rtc.Compile(new string[0]); // see http://docs.nvidia.com/cuda/nvrtc/index.html for usage and options
                    result = nvrtcResult.Success;
                }
                catch (NVRTCException ex)
                {
                    result = ex.NVRTCError;
                }

                log = rtc.GetLogAsString();

                if (result == nvrtcResult.Success)
                {
                    var ptx = rtc.GetPTX();
                    kernel = this._context.CudaContext.LoadKernelFatBin(ptx, "Run"); // hard-coded method name from the CUDA kernel
                }
                else
                {
                    kernel = null;
                }
            }

            return(result);
        }
Example #17
0
        public override void MatrixBellmanErrorAndDerivative(Matrix predictedQValues, Matrix maxQHatValues, Matrix chosenActionIndices, Matrix currentRewards, Matrix error, Matrix errorDerivative,
                                                             float discount, Matrix isLastEpisode, bool copyInputsFromCpuToGpu = false, bool copyOutputsFromGpuToCpu = false)
        {
            this.VerifyDimentionalityOfMatrices(predictedQValues, errorDerivative);
            this.VerifyColumnWithRowOfMatrices(predictedQValues, maxQHatValues);
            this.VerifyDimentionalityOfMatrices(maxQHatValues, chosenActionIndices, currentRewards);
            this.VerifyDimentionalityOfMatrices(currentRewards, error, isLastEpisode);

            if (copyInputsFromCpuToGpu)
            {
                predictedQValues.CopyToCuda();
                maxQHatValues.CopyToCuda();
            }

            CudaKernel kernel = InitializeGridsAndThreads("_Z26matrixBellmanErrorAndDerivPfS_S_S_S_S_fS_ii", errorDerivative);

            kernel.Run(predictedQValues.DeviceData.DevicePointer, maxQHatValues.DeviceData.DevicePointer, chosenActionIndices.DeviceData.DevicePointer, currentRewards.DeviceData.DevicePointer, error.DeviceData.DevicePointer,
                       errorDerivative.DeviceData.DevicePointer, discount, isLastEpisode.DeviceData.DevicePointer, errorDerivative.Row, errorDerivative.Column);

            if (copyOutputsFromGpuToCpu)
            {
                error.CopyFromCuda();
                errorDerivative.CopyFromCuda();
            }
        }
Example #18
0
        private void RunKernel(Volume <T> input, Volume <T> output, CudaKernel kernel, params object[] extraParameters)
        {
            if (!(input.Storage is IVolumeStorage <T> inputStorage))
            {
                throw new ArgumentException($"{nameof(input)} storage should be VolumeStorage", nameof(input));
            }

            if (!(output.Storage is IVolumeStorage <T> outputStorage))
            {
                throw new ArgumentException($"{nameof(output)} storage should be VolumeStorage", nameof(output));
            }

            inputStorage.CopyToDevice();
            outputStorage.CopyToDevice();

            var count      = (int)output.Shape.TotalLength;
            var parameters = new object[] { inputStorage.DeviceBuffer.DevicePointer, outputStorage.DeviceBuffer.DevicePointer };

            if (extraParameters != null)
            {
                parameters = parameters.Concat(extraParameters).ToArray();
            }

            this.RunKernel(kernel, count, parameters);
        }
        public static void Invoke(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string baseName, params object[] args)
        {
            ThrowIfAnyTensorInvalid(args);

            cudaContext.SetCurrent();

            CudaDeviceProperties deviceInfo = context.DeviceInfoForContext(cudaContext);

            IEnumerable <Tensor> allTensors = args.OfType <Tensor>();
            Tensor firstTensor       = allTensors.First();
            long   elementCount      = firstTensor.ElementCount();
            ApplySpecialization spec = new ApplySpecialization(allTensors.ToArray());

            ConvertTensorArgs.Convert(cudaContext, spec.Use32BitIndices, args);

            ManagedCuda.VectorTypes.dim3 block = ApplyUtils.GetApplyBlock();
            ManagedCuda.VectorTypes.dim3 grid  = ApplyUtils.GetApplyGrid(deviceInfo, elementCount);

            string     fullKernelName = PermutationGenerator.GetMangledName(baseName, spec);
            CudaKernel kernel         = context.KernelCache.Get(cudaContext, ptx, fullKernelName);

            kernel.GridDimensions  = grid;
            kernel.BlockDimensions = block;
            kernel.RunAsync(CUstream.NullStream, args);
        }
Example #20
0
        static void InitKernels()
        {
            cntxt = new CudaContext();
            CUmodule cumodule = cntxt.LoadModulePTX(@"C:\work\Sobel\CudaTest\x64\Debug\kernel.ptx");

            matrixSumCude = new CudaKernel("_Z15matrixSumKernelPdPKdiii", cumodule, cntxt);
        }
Example #21
0
    public VectorReductionAccuracy(CudaContext context, DeviceDataSet <int> teaching, DeviceDataSet <int> test, int popSize)
    {
        this.teaching = teaching;
        this.test     = test;
        this.popSize  = popSize;
        this.context  = context;

        calculatedNeabours = new CudaDeviceVariable <int>(teaching.length * test.length);
        deviceAccuracy     = new CudaDeviceVariable <float>(popSize);

        Profiler.Start("calculate neabours");
        Neabours.CalculateNeabours(context, teaching, test, calculatedNeabours, ThreadsPerBlock);
        Profiler.Stop("calculate neabours");


        accuracyKernel = context.LoadKernel("kernels/VectorReduction.ptx", "calculateAccuracy");
        dim3 gridDimension = new dim3()
        {
            x = (uint)(test.length / ThreadsPerBlock + 1),
            y = (uint)popSize,
            z = 1
        };

        accuracyKernel.GridDimensions  = gridDimension;
        accuracyKernel.BlockDimensions = ThreadsPerBlock;

        accuracyKernel.SetConstantVariable("testVectorsCount", test.length);
        accuracyKernel.SetConstantVariable("teachingVectorsCount", teaching.length);
        accuracyKernel.SetConstantVariable("attributeCount", teaching.attributeCount);
        accuracyKernel.SetConstantVariable("genLength", teaching.length);

        K           = 3;
        CountToPass = 2;
    }
Example #22
0
        public void CreateKernelObjects()
        {
            kernels.Clear();
            int funcNo = 0;

            foreach (var sc in sourceCodes)
            {
                for (int ki = 0; ki < (grm.kernelPerIndividual? sc.numberOfIndividuals:1); ki++)
                {
                    CudaKernel kernel = new CudaKernel("createdFunc" + (grm.kernelPerIndividual ? funcNo++ : 0).ToString(), sc.mod, ctx);

                    if (NUMTESTCASE > 256)
                    {   // ---  multi dim block if testcases > 256
                        kernel.GridDimensions  = NUMTESTCASE / 256;
                        kernel.BlockDimensions = 256;
                    }
                    else
                    {
                        kernel.GridDimensions  = 1;
                        kernel.BlockDimensions = NUMTESTCASE;
                    }

                    if (ki == 0)  //   <--   this is due to managedcuda not implementing setconstantvar as a module method!
                    {
                        grm.SetKernelParameters(kernel);
                    }

                    kernels.Add(kernel);
                }
            }
        }
        public CudaIntersectionDevice(RayEngineScene scene, NVContext ctx)
            : base(scene)
        {
            wallclock = new Stopwatch();
            this.todoRayBuffers = new ConcurrentQueue<Tuple<int, RayBuffer>>();
            this.doneRayBuffers = new List<ConcurrentQueue<RayBuffer>>() { { new ConcurrentQueue<RayBuffer>() } };
            this.started = false;
            if (ctx != null)
            {
                this.cudaContext = ctx;
            }
            else
            {
                this.cudaContext = new NVContext() { Context = new CudaContext(CudaContext.GetMaxGflopsDeviceId()) };
            }
            using (var sr = new StreamReader(@"G:\Git\RayDen\CudaMegaRay\x64\Release\kernel.cu.ptx"))
            {
                intersectKernel = cudaContext.Context.LoadKernelPTX(sr.BaseStream, "IntersectLBvh");
            }

            this.rays = new CudaDeviceVariable<RayData>(RayBuffer.RayBufferSize);
            this.hits = new CudaDeviceVariable<RayHit>(RayBuffer.RayBufferSize);
            verts = scene.Vertices.ToArray();
            //scene.Triangles.Select(i => i.GetInfo()).ToArray();

            var ti = scene.Triangles.Select(i => i.GetInfo()).ToArray();
            var da = new BvhDataAdapter(scene);
            var treeData = da.GetMpData();
            bvh = treeData;
            trianglesCount = ti.Length;
            tris = ti; 

            nodesCount = treeData.Length;
            Tracer.TraceLine("BVH Data Size {0:F3} MBytes", (treeData.Length * 32f) / (1024f * 1024f));
        }
Example #24
0
        public GPU_Functionality(int deviceID = 0)
        {
            ctx     = new CudaContext(deviceID);
            version = ctx.GetDeviceComputeCapability();
            Trace.WriteLine($"cuda compute capability {version.Major}.{version.Minor}");

            CUmodule collision_module = ctx.LoadModulePTX("collision_kernels.ptx");

            kNarrowPhase            = new CudaKernel("kNarrowPhase_new", collision_module, ctx);
            kFindClosestFace        = new CudaKernel("kFindClosestFace", collision_module, ctx);
            kCollisionResponseForce = new CudaKernel("kCollisionResponseForce", collision_module, ctx);
            dim3 block = new dim3(block_size, 1, 1);

            kNarrowPhase.BlockDimensions            = block;
            kFindClosestFace.BlockDimensions        = block;
            kCollisionResponseForce.BlockDimensions = block;

            // cz
            CUmodule module_cz_kernels = ctx.LoadModulePTX("cz_kernels.ptx");

            kczCZForce = new CudaKernel("kczCZForce", module_cz_kernels, ctx);
            kczCZForce.BlockDimensions = block;

            // elem
            CUmodule module_elem_kernels = ctx.LoadModulePTX("elem_kernels.ptx");

            kelElementElasticityForce = new CudaKernel("kelElementElasticityForce", module_elem_kernels, ctx);
            kelElementElasticityForce.BlockDimensions = block;
        }
Example #25
0
        internal ManagedCuda.NVRTC.nvrtcResult LoadKernel(out string log)
        {
            string path = "MyKernels.c";

            ManagedCuda.NVRTC.nvrtcResult result;
            using (var rtc = new ManagedCuda.NVRTC.CudaRuntimeCompiler(File.ReadAllText(path), Path.GetFileName(path)))
            {
                try
                {
                    rtc.Compile(new string[0]); // see http://docs.nvidia.com/cuda/nvrtc/index.html for usage and options
                    result = ManagedCuda.NVRTC.nvrtcResult.Success;
                } catch (ManagedCuda.NVRTC.NVRTCException ex)
                {
                    result = ex.NVRTCError;
                }
                log = rtc.GetLogAsString();

                if (result == ManagedCuda.NVRTC.nvrtcResult.Success)
                {
                    byte[] ptx = rtc.GetPTX();
                    multiply = ctx.LoadKernelFatBin(ptx, "Multiply"); // hard-coded method name from the CUDA kernel
                }
            }
            return(result);
        }
Example #26
0
        private void RunKernel(CudaKernel kernel, int count, IEnumerable <object> parameters)
        {
            // configure the dimensions; note, usually this is a lot more dynamic based
            // on input data, but we'll still go through the motions
            int threadsPerBlock, blockCount;

            if (count <= this._context.DefaultThreadsPerBlock) // a single block
            {
                blockCount      = 1;
                threadsPerBlock = RoundUp(count, this._context.WarpSize); // slight caveat here; if you are using "shuffle" operations, you
                // need to use entire "warp"s - otherwise the result is undefined
            }
            else if (count <= this._context.DefaultThreadsPerBlock * this._context.DefaultBlockCount)
            {
                // more than enough work to keep us busy; just use that
                threadsPerBlock = this._context.DefaultThreadsPerBlock;
                blockCount      = this._context.DefaultBlockCount;
            }
            else
            {
                // do the math to figure out how many blocks we need
                threadsPerBlock = this._context.DefaultThreadsPerBlock;
                blockCount      = (count + threadsPerBlock - 1) / threadsPerBlock;
            }

            // we're using 1-D math, but actually CUDA supports blocks and grids that span 3 dimensions
            kernel.BlockDimensions = new dim3(threadsPerBlock, 1, 1);
            kernel.GridDimensions  = new dim3(blockCount, 1, 1);

            // invoke the kernel
            var withCount = parameters.ToList();

            withCount.Insert(0, count);
            kernel.RunAsync(this._context.DefaultStream.Stream, withCount.ToArray());
        }
Example #27
0
        public CudaError LaunchKernelWithStreamBinding(
            CudaStream stream,
            CudaKernel kernel,
            int gridDimX,
            int gridDimY,
            int gridDimZ,
            int blockDimX,
            int blockDimY,
            int blockDimZ,
            int sharedMemSizeInBytes,
            IntPtr args,
            IntPtr kernelArgs)
        {
            var binding = stream.BindScoped();

            var result = LaunchKernel(
                kernel.FunctionPtr,
                gridDimX,
                gridDimY,
                gridDimZ,
                blockDimX,
                blockDimY,
                blockDimZ,
                sharedMemSizeInBytes,
                stream.StreamPtr,
                args,
                kernelArgs);

            binding.Recover();
            return(result);
        }
        public static void init(int maxCnt)
        {
            _gpuVelocity = KernelLoader.load_kernel("update_velocities");
            _gpuUpdate   = KernelLoader.load_kernel("update_particles");

            _dt = ISF.properties.dt;

            torus_d = new float[3] {
                ISF.properties.dx, ISF.properties.dy, ISF.properties.dz
            };
            torus_res = new int[3] {
                ISF.properties.resx, ISF.properties.resy, ISF.properties.resz
            };
            torus_size = new int[3] {
                ISF.properties.sizex, ISF.properties.sizey, ISF.properties.sizez
            };

            d_k1x = new CudaDeviceVariable <float>(maxCnt);
            d_k1y = new CudaDeviceVariable <float>(maxCnt);
            d_k1z = new CudaDeviceVariable <float>(maxCnt);

            d_k2x = new CudaDeviceVariable <float>(maxCnt);
            d_k2y = new CudaDeviceVariable <float>(maxCnt);
            d_k2z = new CudaDeviceVariable <float>(maxCnt);

            d_k3x = new CudaDeviceVariable <float>(maxCnt);
            d_k3y = new CudaDeviceVariable <float>(maxCnt);
            d_k3z = new CudaDeviceVariable <float>(maxCnt);

            d_k4x = new CudaDeviceVariable <float>(maxCnt);
            d_k4y = new CudaDeviceVariable <float>(maxCnt);
            d_k4z = new CudaDeviceVariable <float>(maxCnt);
        }
Example #29
0
        internal nvrtcResult LoadKernel(out string log)
        {
            nvrtcResult result;

            using (var rtc = new CudaRuntimeCompiler(File.ReadAllText(path), Path.GetFileName(path)))
            {
                try
                {
                    rtc.Compile(Array.Empty <string>());
                    result = nvrtcResult.Success;
                }
                catch (NVRTCException ex)
                {
                    result = ex.NVRTCError;
                }
                log = rtc.GetLogAsString();

                if (result == nvrtcResult.Success)
                {
                    byte[] ptx = rtc.GetPTX();
                    multiply = ctx.LoadKernelFatBin(ptx, methodName);
                }
            }
            return(result);
        }
Example #30
0
        // Testing managed CUDA call
        private static void RunCudaWithAKernel()
        {
            // C# Cuda code to call kernel

            int         N            = 50000;
            int         deviceID     = 0;
            CudaContext ctx          = new CudaContext(deviceID);
            CudaKernel  kernel       = ctx.LoadKernel("kernel_x64.ptx", "VecAdd");
            int         numOfThreads = 256;

            kernel.GridDimensions  = (N + numOfThreads - 1) / numOfThreads;
            kernel.BlockDimensions = numOfThreads;

            // allocate memory in host (not gpu)
            var h_A = InitWithData(N, numOfThreads * 4);
            var h_B = InitWithData(N, numOfThreads);

            // Allocate vectors in device memory and copy from host to device.
            CudaDeviceVariable <float> d_A = h_A;
            CudaDeviceVariable <float> d_B = h_B;
            CudaDeviceVariable <float> d_C = new CudaDeviceVariable <float>(N);

            //Invoke kernel
            kernel.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, N);

            Console.WriteLine("kernel has runeth");
            //Copy from memory of device to host.
            float[] h_C = d_C;
        }
        public CudaKernel GetOrCreateCudaKernel(string moduleName, string kernelName)
        {
            CudaKernel kernel;

            moduleName = "MasterInclude";
            if (!_kernels.TryGetValue(kernelName, out kernel))
            {
                CUmodule module;
                if (!_modules.TryGetValue(moduleName, out module))
                {
                    string fatbinName = "";
                    if (IntPtr.Size == 8)
                    {
                        fatbinName = moduleName + ".x64.fatbin";
                    }
                    else
                    {
                        fatbinName = moduleName + ".fatbin";
                    }

                    using (Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(fatbinName)) {
                        if (stream == null)
                        {
                            throw new Exception($"Fatbin embedded resource '{fatbinName}' could not be found");
                        }
                        module = Context.LoadModuleFatBin(stream);
                        _modules[moduleName] = module;
                    }
                }
                kernel = new CudaKernel(kernelName, module, Context);
                _kernels[kernelName] = kernel;
            }
            return(kernel);
        }
Example #32
0
        private void RunKernel(Volume <T> input, Volume <T> output, CudaKernel kernel)
        {
            if (!Equals(input.Shape, output.Shape))
            {
                throw new ArgumentException($"{nameof(input)} and {nameof(output)} should have the same shape.");
            }

            var inputStorage = input.Storage as IVolumeStorage <T>;

            if (inputStorage == null)
            {
                throw new ArgumentException($"{nameof(input)} storage should be VolumeStorage", nameof(input));
            }

            var outputStorage = output.Storage as IVolumeStorage <T>;

            if (outputStorage == null)
            {
                throw new ArgumentException($"{nameof(output)} storage should be VolumeStorage", nameof(output));
            }

            inputStorage.CopyToDevice();
            outputStorage.CopyToDevice();

            var count      = (int)input.Shape.TotalLength;
            var parameters = new object[] { inputStorage.DeviceBuffer.DevicePointer, outputStorage.DeviceBuffer.DevicePointer };

            RunKernel(kernel, count, parameters);
        }
Example #33
0
    public void CUDA_AddFloatArrays()
    {
        //Load Kernel image from resources
        Stream stream = new StreamReader(resName).BaseStream;

        if (stream == null)
        {
            throw new ArgumentException("Kernel not found in resources.");
        }

        vectorAddKernel = ctx.LoadKernelPTX(stream, "VecAdd");

        var threadsPerBlock = 1024;

        vectorAddKernel.BlockDimensions = threadsPerBlock;
        vectorAddKernel.GridDimensions  = (Count + threadsPerBlock - 1) / threadsPerBlock;

        CudaStopWatch w = new CudaStopWatch();

        w.Start();
        vectorAddKernel.Run(d_A.DevicePointer, d_B.DevicePointer, C.DevicePointer, Count);
        w.Stop();

        Debug.Log(w.GetElapsedTime() / 1000.0f);
        Debug.Log($"{h_A[0]} + {h_B[0]} = {C[0]}");
        Debug.Log($"{h_A[Count-1]} + {h_B[Count-1]} = {C[Count-1]}");

        // Copy result from device memory to host memory
        // h_C contains the result in host memory
        // h_C = d_C;
    }
        /// <summary>
		/// Creates a new mipmapped texture from array memory. Allocates a new mipmapped array. 
        /// </summary>
        /// <param name="kernel"></param>
        /// <param name="texName"></param>
        /// <param name="addressModeForAllDimensions"></param>
        /// <param name="filterMode"></param>
        /// <param name="flags"></param>
        /// <param name="descriptor"></param>
        /// <param name="numMipmapLevels"></param>
        /// <param name="maxAniso"></param>
        /// <param name="mipmapFilterMode"></param>
        /// <param name="mipmapLevelBias"></param>
        /// <param name="minMipmapLevelClamp"></param>
        /// <param name="maxMipmapLevelClamp"></param>
        public CudaTextureMipmappedArray(CudaKernel kernel, string texName, CUAddressMode addressModeForAllDimensions,
			CUFilterMode filterMode, CUTexRefSetFlags flags, CUDAArray3DDescriptor descriptor, uint numMipmapLevels,
			uint maxAniso, CUFilterMode mipmapFilterMode, float mipmapLevelBias, float minMipmapLevelClamp, float maxMipmapLevelClamp)
            : this(kernel, texName, addressModeForAllDimensions, addressModeForAllDimensions, addressModeForAllDimensions, filterMode, flags, descriptor,
			numMipmapLevels, maxAniso, mipmapFilterMode, mipmapLevelBias, minMipmapLevelClamp, maxMipmapLevelClamp)
        {

        }
        public CudaKernel GetKernel(string name, bool isStrongName = false)
        {
            if (!isStrongName)
                name = GetStrongName(name);

            var kernel = new CudaKernel(name, Module, Context);
            return kernel;
        }
Example #36
0
        static void InitKernels()
        {
            //max thread number - 65534x256=16776704
            _matrixSize = 256;
            _threadsPerBlock = 256;

            CleanUpResources();
            _cnContext = new CudaContext(CudaContext.GetMaxGflopsDeviceId());
            CUmodule cumodule = _cnContext.LoadModule(@"\Kernel\kernel.ptx");
            _multiplyTwoVectorWithCuda = new CudaKernel("_Z6kernel_", cumodule, _cnContext);
        }
        /// <summary>
        /// Creates a new 2D texture from array memory. Allocates a new 2D array.
        /// </summary>
        /// <param name="kernel"></param>
        /// <param name="texName"></param>
        /// <param name="addressMode0"></param>
        /// <param name="addressMode1"></param>
        /// <param name="filterMode"></param>
        /// <param name="flags"></param>
        /// <param name="format"></param>
        /// <param name="height">In elements</param>
        /// <param name="width">In elements</param>
        /// <param name="numChannels">1,2 or 4</param>
        public CudaTextureArray2D(CudaKernel kernel, string texName, CUAddressMode addressMode0, CUAddressMode addressMode1, CUFilterMode filterMode, CUTexRefSetFlags flags, CUArrayFormat format, SizeT width, SizeT height, CudaArray2DNumChannels numChannels)
        {
            _texref = new CUtexref();
            res = DriverAPINativeMethods.ModuleManagement.cuModuleGetTexRef(ref _texref, kernel.CUModule, texName);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}, Texture name: {3}", DateTime.Now, "cuModuleGetTexRef", res, texName));
            if (res != CUResult.Success) throw new CudaException(res);

            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetAddressMode(_texref, 0, addressMode0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetAddressMode", res));
            if (res != CUResult.Success) throw new CudaException(res);
            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetAddressMode(_texref, 1, addressMode1);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetAddressMode", res));
            if (res != CUResult.Success) throw new CudaException(res);
            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetFilterMode(_texref, filterMode);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetFilterMode", res));
            if (res != CUResult.Success) throw new CudaException(res);
            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetFlags(_texref, flags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetFormat(_texref, format, (int)numChannels);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetFormat", res));
            if (res != CUResult.Success) throw new CudaException(res);

            _filtermode = filterMode;
            _flags = flags;
            _addressMode0 = addressMode0;
            _addressMode1 = addressMode1;
            _format = format;
            _height = height;
            _width = width;
            _numChannels = (int)numChannels;
            _name = texName;
            _module = kernel.CUModule;
            _cufunction = kernel.CUFunction;

            _channelSize = CudaHelperMethods.GetChannelSize(format);
            _dataSize = height * width * _numChannels * _channelSize;
            _array = new CudaArray2D(format, width, height, numChannels);

            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetArray(_texref, _array.CUArray, CUTexRefSetArrayFlags.OverrideFormat);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetArray", res));
            if (res != CUResult.Success) throw new CudaException(res);
            //res = DriverAPINativeMethods.ParameterManagement.cuParamSetTexRef(kernel.CUFunction, CUParameterTexRef.Default, _texref);
            //Debug.WriteLine("{0:G}, {1}: {2}", DateTime.Now, "cuParamSetTexRef", res);
            //if (res != CUResult.Success) throw new CudaException(res);
        }
        public DadeCudaIntersectionDevice(RayEngineScene scene, NVContext ctx)
            : base(scene)
        {
            this.scene = scene;
            wallclock = new Stopwatch();
            this.todoRayBuffers = new InputRayBufferCollection();
            this.doneRayBuffers = new OutputRayBufferCollection();
            this.started = false;
            if (ctx != null)
            {
                this.cudaContext = ctx;
            }
            else
            {
                this.cudaContext = new NVContext() { Context = new CudaContext(CudaContext.GetMaxGflopsDeviceId()) };
            }
            using (var sr = new StreamReader(@"G:\Git\RayDen\CudaMegaRay\x64\Release\Intersection.cu.ptx"))
            {
                intersectKernel = cudaContext.Context.LoadKernelPTX(sr.BaseStream, "Intersect");
            }

            this.rays = new CudaDeviceVariable<RayData>(RayBuffer.RayBufferSize);
            this.hits = new CudaDeviceVariable<RayHit>(RayBuffer.RayBufferSize);
            verts = scene.Vertices.ToArray();
            tris=scene.Triangles.Select(i => i.GetInfo()).ToArray();

            if (GlobalConfiguration.Instance.UseSceneCaching && scene.Cache != null)
            {
                bvh = scene.Cache.BvhData;
                nodesCount = scene.Cache.BvhData.Length;
            }
            else
            {
                var da = new BvhDataAdapter(scene);
                var treeData = da.BuildData();
                bvh = treeData;
                nodesCount = treeData.Length;

            }

            Tracer.TraceLine("BVH Data Size {0:F3} MBytes", (nodesCount * 32f) / (1024f * 1024f));
        }
        /// <summary>
        /// Creates a new 2D texture from array memory. Allocates a new 2D array.
        /// </summary>
        /// <param name="kernel"></param>
        /// <param name="texName"></param>
        /// <param name="addressMode"></param>
        /// <param name="filterMode"></param>
        /// <param name="flags"></param>
        /// <param name="format"></param>
        /// <param name="height">In elements</param>
        /// <param name="width">In elements</param>
        /// <param name="numChannels">1,2 or 4</param>
        public CudaTextureArray2D(CudaKernel kernel, string texName, CUAddressMode addressMode, CUFilterMode filterMode, CUTexRefSetFlags flags, CUArrayFormat format, SizeT width, SizeT height, CudaArray2DNumChannels numChannels)
            : this(kernel, texName, addressMode, addressMode, filterMode, flags, format, width, height, numChannels)
        {

        }
Example #40
0
        private void Generate(CudaKernel kernelPositionWeight, int width, int height, int depth)
        {
            int count = width * height * depth;
            int widthD = width - 1;
            int heightD = height - 1;
            int depthD = depth - 1;
            int countDecremented = widthD * heightD * depthD;

            dim3 blockDimensions = new dim3(8, 8, 8);
            dim3 gridDimensions = new dim3((int)Math.Ceiling(width / 8.0), (int)Math.Ceiling(height / 8.0), (int)Math.Ceiling(depth / 8.0));
            dim3 gridDimensionsDecremented = new dim3((int)Math.Ceiling(widthD / 8.0), (int)Math.Ceiling(heightD / 8.0), (int)Math.Ceiling(depthD / 8.0));

            CUDANoiseCube noiseCube = new CUDANoiseCube();

            CudaArray3D noiseArray = noiseCube.GenerateUniformArray(16, 16, 16);
            CudaTextureArray3D noiseTexture = new CudaTextureArray3D(kernelPositionWeight, "noiseTexture", CUAddressMode.Wrap, CUFilterMode.Linear, CUTexRefSetFlags.NormalizedCoordinates, noiseArray);

            CudaDeviceVariable<Voxel> voxelsDev = new CudaDeviceVariable<Voxel>(count);

            kernelPositionWeight.BlockDimensions = blockDimensions;
            typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelPositionWeight, gridDimensions);

            kernelPositionWeight.Run(voxelsDev.DevicePointer, width, height, depth);

            kernelNormalAmbient.BlockDimensions = blockDimensions;
            typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelNormalAmbient, gridDimensions);

            kernelNormalAmbient.Run(voxelsDev.DevicePointer, width, height, depth, container.Settings.AmbientRayWidth, container.Settings.AmbientSamplesCount);

            int nearestW = NearestPowerOfTwo(widthD);
            int nearestH = NearestPowerOfTwo(heightD);
            int nearestD = NearestPowerOfTwo(depthD);
            int nearestCount = nearestW * nearestH * nearestD;

            CudaDeviceVariable<int> trisCountDevice = new CudaDeviceVariable<int>(nearestCount);
            trisCountDevice.Memset(0);
            CudaDeviceVariable<int> offsetsDev = new CudaDeviceVariable<int>(countDecremented);

            kernelMarchingCubesCases.BlockDimensions = blockDimensions;
            typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesCases, gridDimensionsDecremented);

            kernelMarchingCubesCases.Run(voxelsDev.DevicePointer, width, height, depth, offsetsDev.DevicePointer, trisCountDevice.DevicePointer, nearestW, nearestH, nearestD);

            CudaDeviceVariable<int> prefixSumsDev = prefixScan.PrefixSumArray(trisCountDevice, nearestCount);

            int lastTrisCount = 0;
            trisCountDevice.CopyToHost(ref lastTrisCount, (nearestCount - 1) * sizeof(int));

            int lastPrefixSum = 0;
            prefixSumsDev.CopyToHost(ref lastPrefixSum, (nearestCount - 1) * sizeof(int));

            int totalVerticesCount = (lastTrisCount + lastPrefixSum) * 3;

            if (totalVerticesCount > 0)
            {
                if (container.Geometry != null)
                    container.Geometry.Dispose();

                container.VertexCount = totalVerticesCount;

                container.Geometry = new Buffer(graphicsDevice, new BufferDescription()
                {
                    BindFlags = BindFlags.VertexBuffer,
                    CpuAccessFlags = CpuAccessFlags.None,
                    OptionFlags = ResourceOptionFlags.None,
                    SizeInBytes = Marshal.SizeOf(typeof(VoxelMeshVertex)) * totalVerticesCount,
                    Usage = ResourceUsage.Default
                });

                CudaDirectXInteropResource directResource = new CudaDirectXInteropResource(container.Geometry.ComPointer, CUGraphicsRegisterFlags.None, CudaContext.DirectXVersion.D3D11, CUGraphicsMapResourceFlags.None);
                
                kernelMarchingCubesVertices.BlockDimensions = blockDimensions;
                typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesVertices, gridDimensionsDecremented);

                directResource.Map();
                kernelMarchingCubesVertices.Run(directResource.GetMappedPointer(), voxelsDev.DevicePointer, prefixSumsDev.DevicePointer, offsetsDev.DevicePointer, width, height, depth, nearestW, nearestH, nearestD);
                directResource.UnMap();

                directResource.Dispose();
            }
            else
            {
                container.VertexCount = 0;

                if (container.Geometry != null)
                    container.Geometry.Dispose();
            }

            noiseCube.Dispose();
            prefixSumsDev.Dispose();
            trisCountDevice.Dispose();
            offsetsDev.Dispose();
            noiseArray.Dispose();
            noiseTexture.Dispose();
            voxelsDev.Dispose();
        }
        /// <summary>
		/// Creates a new mipmapped texture from array memory. Allocates a new mipmapped array.
        /// </summary>
        /// <param name="kernel"></param>
        /// <param name="texName"></param>
        /// <param name="addressMode0"></param>
        /// <param name="addressMode1"></param>
        /// <param name="addressMode2"></param>
        /// <param name="filterMode"></param>
        /// <param name="flags"></param>
        /// <param name="descriptor"></param>
        /// <param name="numMipmapLevels"></param>
        /// <param name="maxAniso"></param>
        /// <param name="mipmapFilterMode"></param>
        /// <param name="mipmapLevelBias"></param>
        /// <param name="minMipmapLevelClamp"></param>
        /// <param name="maxMipmapLevelClamp"></param>
        public CudaTextureMipmappedArray(CudaKernel kernel, string texName, CUAddressMode addressMode0, CUAddressMode addressMode1, CUAddressMode addressMode2,
			CUFilterMode filterMode, CUTexRefSetFlags flags, CUDAArray3DDescriptor descriptor, uint numMipmapLevels, 
			uint maxAniso, CUFilterMode mipmapFilterMode, float mipmapLevelBias, float minMipmapLevelClamp, float maxMipmapLevelClamp)
        {
			_maxAniso = maxAniso;
			_mipmapFilterMode = mipmapFilterMode;
			_mipmapLevelBias = mipmapLevelBias;
			_minMipmapLevelClamp = minMipmapLevelClamp;
			_maxMipmapLevelClamp = maxMipmapLevelClamp;

            _texref = new CUtexref();
            res = DriverAPINativeMethods.ModuleManagement.cuModuleGetTexRef(ref _texref, kernel.CUModule, texName);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}, Texture name: {3}", DateTime.Now, "cuModuleGetTexRef", res, texName));
            if (res != CUResult.Success) throw new CudaException(res);

            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetAddressMode(_texref, 0, addressMode0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetAddressMode", res));
            if (res != CUResult.Success) throw new CudaException(res);
            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetAddressMode(_texref, 1, addressMode1);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetAddressMode", res));
            if (res != CUResult.Success) throw new CudaException(res);
            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetAddressMode(_texref, 2, addressMode2);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetAddressMode", res));
            if (res != CUResult.Success) throw new CudaException(res);
            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetFilterMode(_texref, filterMode);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetFilterMode", res));
            if (res != CUResult.Success) throw new CudaException(res);
            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetFlags(_texref, flags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
			res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetFormat(_texref, descriptor.Format, (int)descriptor.NumChannels);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetFormat", res));
            if (res != CUResult.Success) throw new CudaException(res);

            _filtermode = filterMode;
            _flags = flags;
            _addressMode0 = addressMode0;
            _addressMode1 = addressMode1;
            _addressMode2 = addressMode2;
			_arrayDescriptor = descriptor;
            _name = texName;
            _module = kernel.CUModule;
            _cufunction = kernel.CUFunction;

            _array = new CudaMipmappedArray(descriptor, numMipmapLevels);

            res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetMipmappedArray(_texref, _array.CUMipmappedArray, CUTexRefSetArrayFlags.OverrideFormat);
			Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetMipmappedArray", res));
			if (res != CUResult.Success) throw new CudaException(res);
			res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetMaxAnisotropy(_texref, maxAniso);
			Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetMaxAnisotropy", res));
			if (res != CUResult.Success) throw new CudaException(res);
			res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetMipmapFilterMode(_texref, mipmapFilterMode);
			Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetMipmapFilterMode", res));
			if (res != CUResult.Success) throw new CudaException(res);
			res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetMipmapLevelBias(_texref, mipmapLevelBias);
			Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetMipmapLevelBias", res));
			if (res != CUResult.Success) throw new CudaException(res);
			res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetMipmapLevelClamp(_texref, minMipmapLevelClamp, maxMipmapLevelClamp);
			Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetMipmapLevelClamp", res));
			if (res != CUResult.Success) throw new CudaException(res);
        }
Example #42
0
        private void button1_Click(object sender, EventArgs e)
        {
            triangulation = cuda.LoadPTX("Triangulation", "PTX", "Triangulation");
            merge_vertical = cuda.LoadPTX("MergeVertical", "PTX", "merge");
            regionSplitH = cuda.LoadPTX("RegionSplit", "PTX", "splitRegionH");
            regionSplitV_Phase1 = cuda.LoadPTX("RegionSplit", "PTX", "splitRegionV_phase1");
            regionSplitV_Phase2 = cuda.LoadPTX("RegionSplit", "PTX", "splitRegionV_phase2");

            // add a random points  TODO: add external source (ex. file)
            CreateRandomPoints(1024 * 8, new FxVector2f(0, 0), new FxVector2f(5000, 5000));

            #region Set the max face/he/ve/boundary

            NumVertex = listAllVertex.Count;

            // select the spliting numbers
            // find the split points
            NumRegions = (int)Math.Ceiling((float)NumVertex / (float)maxVertexPerRegion);

            HorizontalRegions = (int)Math.Floor(Math.Sqrt(NumRegions));
            VerticalRegions = (int)Math.Floor((float)NumRegions / (float)HorizontalRegions);
            NumRegions = HorizontalRegions * VerticalRegions;

            // init the array sizes

            // max faces per thread
            maxFacesPerThread = maxVertexPerRegion * 5;
            maxFacesPerThread += maxFacesPerThread % 32;

            // max Half edge per thread
            maxHalfEdgePerThread = maxFacesPerThread * 5;
            maxHalfEdgePerThread += maxHalfEdgePerThread % 32;

            // max vertex per thread
            maxBoundaryNodesPerThread = maxVertexPerRegion * 5;
            maxBoundaryNodesPerThread += maxBoundaryNodesPerThread % 32;

            WriteLine("maxFacesPerThread:" + maxFacesPerThread.ToString());
            WriteLine("maxHalfEdgePerThread:" + maxHalfEdgePerThread.ToString());
            WriteLine("maxBoundaryNodesPerThread:" + maxBoundaryNodesPerThread.ToString());

            #endregion

            // init the array on cpu side
            threadInfo = new csThreadInfo[NumRegions];
            regionInfo = new RegionInfo[NumRegions];
            threadParam = new cbThreadParam();

            #region init the thread param

            // init the thread param
            threadParam.maxFacesPerThread = (uint)maxFacesPerThread;
            threadParam.maxHalfEdgePerThread = (uint)maxHalfEdgePerThread;
            threadParam.maxBoundaryNodesPerThread = (uint)maxBoundaryNodesPerThread;
            threadParam.RegionsNum = (uint)NumRegions;

            MV_threadParam.ThreadNumPerRow = (uint)(VerticalRegions-1);
            MV_threadParam.HorizontalThreadNum = (uint)(HorizontalRegions);
            MV_threadParam.ThreadNum = MV_threadParam.HorizontalThreadNum * MV_threadParam.ThreadNumPerRow;
            MV_threadParam.stackMaxSize = stackMaxSize;
            MV_threadParam.depth = 0;
            #endregion

            // copy the data to the hardware
            d_threadInfo = threadInfo;
            d_regionInfo = regionInfo;
            d_threadParam = threadParam;

            d_FaceList = new CudaDeviceVariable<csFace>(maxFacesPerThread * NumRegions);
            d_BoundaryList = new CudaDeviceVariable<csBoundaryNode>(maxBoundaryNodesPerThread * NumRegions);
            d_HalfEdgeList = new CudaDeviceVariable<csHalfEdge>(maxHalfEdgePerThread * NumRegions);
            d_Stack = new CudaDeviceVariable<csStack>(stackMaxSize * NumRegions);
            d_UintStack = new CudaDeviceVariable<uint>(2 * stackMaxSize * NumRegions);

            // Update the region info by sort the vertex
            // try to sort the list
            GPUSort = new BitonicSort<FxVector2f>(cuda);
        }
 private CudaKernel InitializeGridsAndThreads(string kernalName, Matrix a)
 {
     int maxThreads = Math.Min(this.maxThreadPerBlockDim, a.Row);
     dim3 threads = a.Column == 1 ? new dim3(maxThreads, 1) : new dim3(maxThreads, maxThreads);
     dim3 blocks = new dim3((a.Row + maxThreads - 1) / maxThreads, (a.Column + maxThreads - 1) / maxThreads);
     CudaKernel kernel = new CudaKernel(kernalName, this.cuModule, this.cudaContext)
     {
         GridDimensions = blocks,
         BlockDimensions = threads
     };
     return kernel;
 }
Example #44
0
        private void InitializeCUDA()
        {
            context = new CudaContext(CudaContext.GetMaxGflopsDevice(), graphicsDevice.ComPointer, CUCtxFlags.SchedAuto, CudaContext.DirectXVersion.D3D11);

            module = context.LoadModulePTX(@"Kernels\kernel.ptx");

            kernelPositionWeightNoiseCube = new CudaKernel("position_weight_noise_cube", module, context);
            kernelNormalAmbient = new CudaKernel("normal_ambient", module, context);
            kernelMarchingCubesCases = new CudaKernel("marching_cubes_cases", module, context);
            kernelMarchingCubesVertices = new CudaKernel("marching_cubes_vertices", module, context);
            kernelPositionWeightNoiseCubeWarp = new CudaKernel("position_weight_noise_cube_warp", module, context);
            kernelPositionWeightFormula = new CudaKernel("position_weight_formula", module, context);

            prefixScan = new CUDAPrefixScan(module, context);
        }
        protected void InitContext()
        {
            var size = ParticlesCount * DimensionsCount;

            var threadsNum = 32;
            var blocksNum = ParticlesCount / threadsNum;
            Ctx = new CudaContext(0);

            UpdateVelocity = Ctx.LoadKernel("update_velocity_kernel.ptx", "updateVelocityKernel");
            UpdateVelocity.GridDimensions = blocksNum;
            UpdateVelocity.BlockDimensions = threadsNum;

            Transpose = Ctx.LoadKernel(KernelFile, "transposeKernel");
            Transpose.GridDimensions = blocksNum;
            Transpose.BlockDimensions = threadsNum;

            HostPositions = Random.RandomVector(size, -5.0, 5.0);
            HostVelocities = Random.RandomVector(size, -2.0, 2.0);
            HostPersonalBests = (double[]) HostPositions.Clone();
            HostPersonalBestValues = Enumerable.Repeat(double.MaxValue,ParticlesCount).ToArray();

            HostNeighbors = new int[ParticlesCount * 2];

            for (var i = 0; i < ParticlesCount*2; i += 2)
            {
                int left, right;

                if (i == 0)
                    left = ParticlesCount - 1;
                else
                    left = i - 1;

                if (i == ParticlesCount - 1)
                    right = 0;
                else
                    right = i + 1;

                HostNeighbors[i] = left;
                HostNeighbors[i + 1] = right;
            }

            DevicePositions = HostPositions;
            DeviceVelocities = HostVelocities;
            DevicePersonalBests = HostPersonalBests;
            DevicePersonalBestValues = HostPersonalBestValues;
            DeviceNeighbors = HostNeighbors;

            Init();
        }
Example #46
0
		/// <summary>
		/// Creates a new surface from array memory.
		/// </summary>
		/// <param name="kernel"></param>
		/// <param name="surfName"></param>
		/// <param name="flags"></param>
		/// <param name="array"></param>
		public CudaSurface(CudaKernel kernel, string surfName, CUSurfRefSetFlags flags, CudaArray3D array)
		{
			_surfref = new CUsurfref();
			res = DriverAPINativeMethods.ModuleManagement.cuModuleGetSurfRef(ref _surfref, kernel.CUModule, surfName);
			Debug.WriteLine(String.Format("{0:G}, {1}: {2}, Surface name: {3}", DateTime.Now, "cuModuleGetSurfRef", res, surfName));
			if (res != CUResult.Success) throw new CudaException(res);

			_flags = flags;
			_format = array.Array3DDescriptor.Format;
			_height = array.Height;
			_width = array.Width;
			_depth = array.Depth;
			_numChannels = (int)array.Array3DDescriptor.NumChannels;
			_name = surfName;
			_module = kernel.CUModule;
			_cufunction = kernel.CUFunction;
			_channelSize = CudaHelperMethods.GetChannelSize(array.Array3DDescriptor.Format);
			_dataSize = array.Height * array.Width * array.Depth * array.Array3DDescriptor.NumChannels * _channelSize;
			_array = array;

			res = DriverAPINativeMethods.SurfaceReferenceManagement.cuSurfRefSetArray(_surfref, _array.CUArray, flags);
			Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuSurfRefSetArray", res));
			if (res != CUResult.Success) throw new CudaException(res);
		}
        /// <summary>
        /// Creates a new 2D texture from array memory
        /// </summary>
        /// <param name="kernel"></param>
        /// <param name="texName"></param>
        /// <param name="addressMode"></param>
        /// <param name="filterMode"></param>
        /// <param name="flags"></param>
        /// <param name="array"></param>
        public CudaTextureArray2D(CudaKernel kernel, string texName, CUAddressMode addressMode, CUFilterMode filterMode, CUTexRefSetFlags flags, CudaArray2D array)
            : this(kernel, texName, addressMode, addressMode, filterMode, flags, array)
        {

        }
Example #48
0
 static void InitKernels()
 {
     CudaContext cntxt = new CudaContext();
     CUmodule cumodule = cntxt.LoadModule(@"C:\Users\Niels\Documents\uni ting\P10\P10\programs\small programs\CUDA 1D MA in C Sharp\CUDA 1D MA in C Sharp\Debug\kernel.ptx");
     addWithCuda = new CudaKernel("_Z6kerneliiPi", cumodule, cntxt);
 }
Example #49
0
        static void Main(string[] args)
        {
            // NOTE: You need to change this location to match your own machine.
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine("NOTE: You must change the kernel location before running this project so it matches your own environment.");
            Console.ResetColor();
            System.Threading.Thread.Sleep(500);

            string path = @"X:\MachineLearning\CUDAGraph-2\CUDAGraph_Kernel\Debug\kernel.cu.ptx";
            CudaContext ctx = new CudaContext();
            CUmodule module = ctx.LoadModule(path);
            kernel = new CudaKernel("kernel", module, ctx);

            // This tells the kernel to allocate a lot of threads for the Gpu.
            kernel.BlockDimensions = THREADS_PER_BLOCK;
            kernel.GridDimensions = VECTOR_SIZE / THREADS_PER_BLOCK + 1; ;

            // Now let's load the kernel!
            // Create the topology.
            int[] topology = new int[] { 1, 200, 200, 100, 1 };

            int height = topology.Length;
            int width = 0;

            for (int i = 0; i < topology.Length; i++)
                if (width < topology[i]) width = topology[i];

            // Launch!
            float[] res = new float[height * width];
            for (int i = 0; i < 10; i++)
            {
                float[] matrix = new float[height * width];
                float[] weights = new float[height * width];
                Random rand = new Random(424242);
                for (int y = 0; y < height; y++)
                {
                    for (int x = 0; x < width; x++)
                    {
                        matrix[y * width + x] = (y == 0 && x < topology[y]) ? 1.0f : 0;
                        weights[y * width + x] = (x < topology[y]) ? (float)(rand.NextDouble() - rand.NextDouble()) : 0;
                    }
                }

                // Load the kernel with some variables.
                CudaDeviceVariable<int> cuda_topology = topology;
                CudaDeviceVariable<float> cuda_membank = matrix;
                CudaDeviceVariable<float> cuda_weights = weights;

                Stopwatch sw = new Stopwatch();
                sw.Start();
                kernel.Run(cuda_topology.DevicePointer, cuda_membank.DevicePointer, cuda_weights.DevicePointer, height, width);
                cuda_membank.CopyToHost(res);
                sw.Stop();

                Console.ForegroundColor = ConsoleColor.Green;
                Console.WriteLine("{0} ticks to compute -> {1}", sw.ElapsedTicks, res[0]);
                Console.ResetColor();

            }

            Console.ReadKey();
        }
Example #50
0
		/// <summary>
		/// Bind a CudaArray3D to a surface reference.
		/// </summary>
		/// <param name="kernel"></param>
		/// <param name="surfName"></param>
		/// <param name="flags"></param>
		/// <param name="array"></param>
		public static void BindArray(CudaKernel kernel, string surfName, CUSurfRefSetFlags flags, CudaArray3D array)
		{
			CUsurfref surfref = new CUsurfref();
			CUResult res = DriverAPINativeMethods.ModuleManagement.cuModuleGetSurfRef(ref surfref, kernel.CUModule, surfName);
			Debug.WriteLine(String.Format("{0:G}, {1}: {2}, Surface name: {3}", DateTime.Now, "cuModuleGetSurfRef", res, surfName));
			if (res != CUResult.Success) throw new CudaException(res);
			
			res = DriverAPINativeMethods.SurfaceReferenceManagement.cuSurfRefSetArray(surfref, array.CUArray, flags);
			Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuSurfRefSetArray", res));
			if (res != CUResult.Success) throw new CudaException(res);
		}
Example #51
0
        private void InitializeD3D()
        {
            // Create the D3D object.
            d3d = new Direct3DEx();

            PresentParameters pp = new PresentParameters();
            pp.BackBufferWidth = 512;
            pp.BackBufferHeight = 512;
            pp.BackBufferFormat = Format.Unknown;
            pp.BackBufferCount = 0;
            pp.Multisample = MultisampleType.None;
            pp.MultisampleQuality = 0;
            pp.SwapEffect = SwapEffect.Discard;
            pp.DeviceWindowHandle = panel1.Handle;
            pp.Windowed = true;
            pp.EnableAutoDepthStencil = false;
            pp.AutoDepthStencilFormat = Format.Unknown;
            pp.PresentationInterval = PresentInterval.Default;

            bDeviceFound = false;
            CUdevice[] cudaDevices = null;
            for (g_iAdapter = 0; g_iAdapter < d3d.AdapterCount; g_iAdapter++)
            {
                device = new DeviceEx(d3d, d3d.Adapters[g_iAdapter].Adapter, DeviceType.Hardware, panel1.Handle, CreateFlags.HardwareVertexProcessing | CreateFlags.Multithreaded, pp);
                try
                {
                    cudaDevices = CudaContext.GetDirectXDevices(device.ComPointer, CUd3dXDeviceList.All, CudaContext.DirectXVersion.D3D9);
                    bDeviceFound = cudaDevices.Length > 0;
                    Console.WriteLine("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter
                        + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 and CUDA.");
                    break;
                }
                catch (CudaException)
                {
                    //No Cuda device found for this Direct3D9 device
                    Console.WriteLine("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter
                        + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 but not CUDA.");
                }
            }

            // we check to make sure we have found a cuda-compatible D3D device to work on
            if (!bDeviceFound)
            {
                Console.WriteLine("No CUDA-compatible Direct3D9 device available");
                if (device != null)
                    device.Dispose();
                Close();
                return;
            }

            ctx = new CudaContext(cudaDevices[0], device.ComPointer, CUCtxFlags.BlockingSync, CudaContext.DirectXVersion.D3D9);

            // Set projection matrix
            SlimDX.Matrix matProj = SlimDX.Matrix.OrthoOffCenterLH(0, 1, 1, 0, 0, 1);
            device.SetTransform(TransformState.Projection, matProj);

            // Turn off D3D lighting, since we are providing our own vertex colors
            device.SetRenderState(RenderState.Lighting, false);

            //Load kernels
            CUmodule module = ctx.LoadModulePTX("kernel.ptx");

            addForces_k = new CudaKernel("addForces_k", module, ctx);
            advectVelocity_k = new CudaKernel("advectVelocity_k", module, ctx);
            diffuseProject_k = new CudaKernel("diffuseProject_k", module, ctx);
            updateVelocity_k = new CudaKernel("updateVelocity_k", module, ctx);
            advectParticles_k = new CudaKernel("advectParticles_k", module, ctx);
        }
Example #52
0
        private bool InitializeD3D()
        {
            HwndSource hwnd = new HwndSource(0, 0, 0, 0, 0, "null", IntPtr.Zero);
            // Create the D3D object.
            d3d = new Direct3DEx();

            PresentParameters pp = new PresentParameters();
            pp.BackBufferWidth = 512;
            pp.BackBufferHeight = 512;
            pp.BackBufferFormat = Format.Unknown;
            pp.BackBufferCount = 0;
            pp.Multisample = MultisampleType.None;
            pp.MultisampleQuality = 0;
            pp.SwapEffect = SwapEffect.Discard;
            pp.DeviceWindowHandle = (IntPtr)0;
            pp.Windowed = true;
            pp.EnableAutoDepthStencil = false;
            pp.AutoDepthStencilFormat = Format.Unknown;
            pp.PresentationInterval = PresentInterval.Default;

            bDeviceFound = false;
            CUdevice[] cudaDevices = null;
            for (g_iAdapter = 0; g_iAdapter < d3d.AdapterCount; g_iAdapter++)
            {
                device = new DeviceEx(d3d, d3d.Adapters[g_iAdapter].Adapter, DeviceType.Hardware, hwnd.Handle, CreateFlags.HardwareVertexProcessing | CreateFlags.Multithreaded, pp);
                try
                {
                    cudaDevices = CudaContext.GetDirectXDevices(device.ComPointer, CUd3dXDeviceList.All, CudaContext.DirectXVersion.D3D9);
                    bDeviceFound = cudaDevices.Length > 0;
                    infoLog.AppendText("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter
                        + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 and CUDA.\n");
                    break;
                }
                catch (CudaException)
                {
                    //No Cuda device found for this Direct3D9 device
                    infoLog.AppendText("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter
                        + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 but not CUDA.\n");
                }
            }

            // we check to make sure we have found a cuda-compatible D3D device to work on
            if (!bDeviceFound)
            {
                infoLog.AppendText("No CUDA-compatible Direct3D9 device available");
                if (device != null)
                    device.Dispose();
                return false;
            }

            ctx = new CudaContext(cudaDevices[0], device.ComPointer, CUCtxFlags.BlockingSync, CudaContext.DirectXVersion.D3D9);
            deviceName.Text = "Device name: " + ctx.GetDeviceName();

            // Set projection matrix
            SlimDX.Matrix matProj = SlimDX.Matrix.OrthoOffCenterLH(0, 1, 1, 0, 0, 1);
            device.SetTransform(TransformState.Projection, matProj);

            // Turn off D3D lighting, since we are providing our own vertex colors
            device.SetRenderState(RenderState.Lighting, false);

            //Load kernels
            CUmodule module = ctx.LoadModulePTX("kernel.ptx");

            addForces_k = new CudaKernel("addForces_k", module, ctx);
            advectVelocity_k = new CudaKernel("advectVelocity_k", module, ctx);
            diffuseProject_k = new CudaKernel("diffuseProject_k", module, ctx);
            updateVelocity_k = new CudaKernel("updateVelocity_k", module, ctx);
            advectParticles_k = new CudaKernel("advectParticles_k", module, ctx);

            d3dimage.Lock();
            Surface surf = device.GetBackBuffer(0, 0);
            d3dimage.SetBackBuffer(D3DResourceType.IDirect3DSurface9, surf.ComPointer);
            d3dimage.Unlock();
            surf.Dispose();

            //Setup the "real" frame rate counter.
            //The cuda counter only measures cuda runtime, not the overhead to actually
            //show the result via DirectX and WPF.
            realLastTick = Environment.TickCount;
            return true;
        }
        public void Compile()
        {
            using (var ctx = new CudaContext())
            {
                // with verbaim string @, we only have to double up double quotes: no other escaping
                string source = @"
                extern ""C"" __global__
                void saxpy(float a, float *x, float *y, float *out, size_t n)
                {
                    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
                    if (tid < n)
                    {
                        out[tid] = a * x[tid] + y[tid];
                    }
                }
                ";

                source += Environment.NewLine;

                var name = "Test";
                var headers = new string[0];
                var includeNames = new string[0];

                var compiler = new CudaRuntimeCompiler(source, name, headers, includeNames);

                //var compiler2 = new CudaRuntimeCompiler(source, name, headers, includeNames);
                // --ptxas-options=-v -keep
                compiler.Compile(new string[] { "-G" });

                //var ptxString = compiler.GetPTXAsString(); // for debugging

                var ptx = compiler.GetPTX();

                //compiler2.Compile(new string[] { });

                var kernel = ctx.LoadKernelPTX(ptx, "kernelName");

                //One kernel per cu file:
                //CudaKernel kernel = ctx.LoadKernel(@"path\to\kernel.ptx", "kernelname");
                kernel.GridDimensions = new dim3(1, 1, 1);
                kernel.BlockDimensions = new dim3(16, 16);

                //kernel.Run()

                var a = new CudaDeviceVariable<double>(100);
                //ManagedCuda.NPP.NPPsExtensions.NPPsExtensionMethods.Sqr()

                //Multiple kernels per cu file:
                CUmodule cumodule = ctx.LoadModule(@"path\to\kernel.ptx");
                CudaKernel kernel1 = new CudaKernel("kernel1", cumodule, ctx)
                {
                    GridDimensions = new dim3(1, 1, 1),
                    BlockDimensions = new dim3(16, 16),
                };
                CudaKernel kernel2 = new CudaKernel("kernel2", cumodule, ctx)
                {
                    GridDimensions = new dim3(1, 1, 1),
                    BlockDimensions = new dim3(16, 16),
                };

            }
        }
Example #54
0
		/// <summary>
		/// Create a new CudaArray3D and bind it to a surface reference.
		/// </summary>
		/// <param name="kernel"></param>
		/// <param name="surfName"></param>
		/// <param name="flags"></param>
		/// <param name="format"></param>
		/// <param name="width">In elements</param>
		/// <param name="height">In elements</param>
		/// <param name="depth">In elements</param>
		/// <param name="numChannels"></param>
		/// <param name="arrayFlags"></param>
		public static CudaArray3D BindArray(CudaKernel kernel, string surfName, CUSurfRefSetFlags flags, CUArrayFormat format, SizeT width, SizeT height, SizeT depth, CudaArray3DNumChannels numChannels, CUDAArray3DFlags arrayFlags)
		{
			CUsurfref surfref = new CUsurfref();
			CUResult res = DriverAPINativeMethods.ModuleManagement.cuModuleGetSurfRef(ref surfref, kernel.CUModule, surfName);
			Debug.WriteLine(String.Format("{0:G}, {1}: {2}, Surface name: {3}", DateTime.Now, "cuModuleGetSurfRef", res, surfName));
			if (res != CUResult.Success) throw new CudaException(res);

			CudaArray3D array = new CudaArray3D(format, width, height, depth, numChannels, arrayFlags);

			res = DriverAPINativeMethods.SurfaceReferenceManagement.cuSurfRefSetArray(surfref, array.CUArray, flags);
			Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuSurfRefSetArray", res));
			if (res != CUResult.Success) throw new CudaException(res);

			return array;
		}
Example #55
0
			/// <summary>
			/// 
			/// </summary>
			/// <param name="aKernel"></param>
			public cudaOccFuncAttributes(CudaKernel aKernel)
				: this(aKernel.MaxThreadsPerBlock, aKernel.Registers, aKernel.SharedMemory)
			{ 
			
			}
Example #56
0
		///////////////////////////////////////////////
		//    Occupancy calculation Functions        //
		///////////////////////////////////////////////

		/// <summary>
		/// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
		/// This is equivalent to the calculation done in the CUDA Occupancy Calculator
		/// spreadsheet
		/// </summary>
		/// <param name="properties"></param>
		/// <param name="kernel"></param>
		/// <param name="state"></param>
		/// <returns></returns>
		public static cudaOccResult cudaOccMaxActiveBlocksPerMultiprocessor(
			CudaDeviceProperties properties,
			CudaKernel kernel,
			cudaOccDeviceState state)
		{
			cudaOccDeviceProp props = new cudaOccDeviceProp(properties);
			cudaOccFuncAttributes attributes = new cudaOccFuncAttributes(kernel);

			return cudaOccMaxActiveBlocksPerMultiprocessor(props, attributes, (int)kernel.BlockDimensions.x * (int)kernel.BlockDimensions.y * (int)kernel.BlockDimensions.z, kernel.DynamicSharedMemory, state);			
		}
Example #57
0
		/// <summary>
		/// Determine the potential block size that allows maximum number of CTAs that can run on multiprocessor simultaneously 
		/// </summary>
		/// <param name="properties"></param>
		/// <param name="kernel"></param>
		/// <param name="state"></param>
		/// <param name="blockSizeToSMem">
		/// A function to convert from block size to dynamic shared memory size.<para/>
		/// e.g.:
		/// If no dynamic shared memory is used: x => 0<para/>
		/// If 4 bytes shared memory per thread is used: x = 4 * x</param>
		/// <returns>maxBlockSize</returns>
		public static int cudaOccMaxPotentialOccupancyBlockSize(
			CudaDeviceProperties properties,
			CudaKernel kernel,
			cudaOccDeviceState state,
			del_blockSizeToDynamicSMemSize blockSizeToSMem)
		{
			cudaOccDeviceProp props = new cudaOccDeviceProp(properties);
			cudaOccFuncAttributes attributes = new cudaOccFuncAttributes(kernel);
			return cudaOccMaxPotentialOccupancyBlockSize(props, attributes, state, blockSizeToSMem);
		}
Example #58
0
        static void Main(string[] args)
        {
            var assembly = Assembly.GetExecutingAssembly();
            var resourceName = "simpleOccupancy.simpleOccupancy.ptx";

            ctx = new CudaContext(0);
            string[] liste = assembly.GetManifestResourceNames();
            using (Stream stream = assembly.GetManifestResourceStream(resourceName))
            {
                kernel = ctx.LoadKernelPTX(stream, "square");
            }

            Console.WriteLine("starting Simple Occupancy");
            Console.WriteLine();

            Console.WriteLine("[ Manual configuration with {0} threads per block ]", manualBlockSize);

            int status = test(false);
            if (status != 0)
            {
                Console.WriteLine("Test failed");
                return;
            }

            Console.WriteLine();

            Console.WriteLine("[ Automatic, occupancy-based configuration ]");
            status = test(true);
            if (status != 0)
            {
                Console.WriteLine("Test failed");
                return;
            }

            Console.WriteLine();
            Console.WriteLine("Test PASSED");
        }
Example #59
0
 /// <summary>
 /// 
 /// </summary>
 /// <param name="aKernel"></param>
 public cudaOccFuncAttributes(CudaKernel aKernel)
     : this(aKernel.MaxThreadsPerBlock, aKernel.Registers, aKernel.SharedMemory, cudaOccPartitionedGCConfig.Off)
 {
 }