Exemple #1
1
        private void initGLAndCuda()
        {
            //Create render target control
            m_renderControl = new OpenTK.GLControl(GraphicsMode.Default, 1, 0, GraphicsContextFlags.Default);
            m_renderControl.Dock = DockStyle.Fill;
            m_renderControl.BackColor = Color.White;
            m_renderControl.BorderStyle = BorderStyle.FixedSingle;
            m_renderControl.KeyDown += new KeyEventHandler(m_renderControl_KeyDown);
            m_renderControl.MouseMove += new MouseEventHandler(m_renderControl_MouseMove);
            m_renderControl.MouseDown += new MouseEventHandler(m_renderControl_MouseDown);
            m_renderControl.SizeChanged += new EventHandler(m_renderControl_SizeChanged);

            panel1.Controls.Add(m_renderControl);
            Console.WriteLine("   OpenGL device is Available");

            int deviceID = CudaContext.GetMaxGflopsDeviceId();

            ctx = CudaContext.CreateOpenGLContext(deviceID, CUCtxFlags.BlockingSync);
            string console = string.Format("CUDA device [{0}] has {1} Multi-Processors", ctx.GetDeviceName(), ctx.GetDeviceInfo().MultiProcessorCount);
            Console.WriteLine(console);

            CUmodule module = ctx.LoadModulePTX("kernel.ptx");

            addForces_k = new CudaKernel("addForces_k", module, ctx);
            advectVelocity_k = new CudaKernel("advectVelocity_k", module, ctx);
            diffuseProject_k = new CudaKernel("diffuseProject_k", module, ctx);
            updateVelocity_k = new CudaKernel("updateVelocity_k", module, ctx);
            advectParticles_k = new CudaKernel("advectParticles_OGL", module, ctx);

            hvfield = new cData[DS];
            dvfield = new CudaPitchedDeviceVariable<cData>(DIM, DIM);
            tPitch = dvfield.Pitch;

            dvfield.CopyToDevice(hvfield);

            vxfield = new CudaDeviceVariable<cData>(DS);
            vyfield = new CudaDeviceVariable<cData>(DS);

            // Create particle array
            particles = new cData[DS];
            initParticles(particles, DIM, DIM);

            // TODO: update kernels to use the new unpadded memory layout for perf
            // rather than the old FFTW-compatible layout
            planr2c = new CudaFFTPlan2D(DIM, DIM, cufftType.R2C, Compatibility.FFTWPadding);
            planc2r = new CudaFFTPlan2D(DIM, DIM, cufftType.C2R, Compatibility.FFTWPadding);

            GL.GenBuffers(1, out vbo);
            GL.BindBuffer(BufferTarget.ArrayBuffer, vbo);
            GL.BufferData<cData>(BufferTarget.ArrayBuffer, new IntPtr(cData.SizeOf * DS), particles, BufferUsageHint.DynamicDraw);
            int bsize;
            GL.GetBufferParameter(BufferTarget.ArrayBuffer, BufferParameterName.BufferSize, out bsize);

            if (bsize != DS * cData.SizeOf)
                throw new Exception("Sizes don't match.");

            GL.BindBuffer(BufferTarget.ArrayBuffer, 0);

            cuda_vbo_resource = new CudaGraphicsInteropResourceCollection();
            cuda_vbo_resource.Add(new CudaOpenGLBufferInteropResource(vbo, CUGraphicsRegisterFlags.None));

            texref = new CudaTextureArray2D(advectVelocity_k, "texref", CUAddressMode.Wrap, CUFilterMode.Linear, 0, CUArrayFormat.Float, DIM, DIM, CudaArray2DNumChannels.Two);

            stopwatch = new CudaStopWatch(CUEventFlags.Default);

            reshape();
            isInit = true;
            display();
        }
Exemple #2
0
 public GpuMathOperations()
 {
     this.cuBlas               = new CudaBlas();
     this.cudaContext          = new CudaContext();
     this.cuModule             = cudaContext.LoadModulePTX(kernalFile);
     this.maxThreadPerBlockDim = (int)Math.Sqrt(this.cudaContext.GetDeviceInfo().MaxThreadsPerBlock);
 }
Exemple #3
0
        private void BuildSelfTriMask(TSCudaContext context, Tensor result, Tensor originalLengths, int paddedSeqLen, float value, float maskedValue)
        {
            CudaContext cudaContext = context.CudaContextForTensor(originalLengths);

            cudaContext.SetCurrent();

            int  ndim        = result.DimensionCount;
            long storageSize = TensorDimensionHelpers.GetStorageSize(result.Sizes, result.Strides);
            long cols        = result.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            long rows = storageSize / cols;


            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            CUdeviceptr resultPtr          = CudaHelpers.GetBufferStart(result);
            CUdeviceptr originalLengthsPtr = CudaHelpers.GetBufferStart(originalLengths);


            Invoke(context, cudaContext, "BuildSelfTriMask", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, originalLengthsPtr, rows, cols, paddedSeqLen, value, maskedValue);
        }
Exemple #4
0
        private void AddLayerNorm(TSCudaContext context, Tensor result, Tensor src1, Tensor src2, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            CudaContext cudaContext = context.CudaContextForTensor(src1);

            cudaContext.SetCurrent();

            int  ndim        = src1.DimensionCount;
            long storageSize = TensorDimensionHelpers.GetStorageSize(src1.Sizes, src1.Strides);
            long cols        = src1.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            long rows = storageSize / cols;


            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
            CUdeviceptr src1Ptr   = CudaHelpers.GetBufferStart(src1);
            CUdeviceptr src2Ptr   = CudaHelpers.GetBufferStart(src2);
            CUdeviceptr alphaPtr  = CudaHelpers.GetBufferStart(alpha);
            CUdeviceptr betaPtr   = CudaHelpers.GetBufferStart(beta);


            Invoke(context, cudaContext, "gAddLNormalization", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, src1Ptr, src2Ptr, alphaPtr, betaPtr, rows, cols, eps);
        }
Exemple #5
0
        private void AddLayerNormGrad(TSCudaContext context, Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            CudaContext cudaContext = context.CudaContextForTensor(inGrad);

            cudaContext.SetCurrent();
            int  ndim        = inGrad.DimensionCount;
            long storageSize = TensorDimensionHelpers.GetStorageSize(inGrad.Sizes, inGrad.Strides);
            long cols        = inGrad.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            long rows = storageSize / cols;

            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            CUdeviceptr out1GradPtr  = CudaHelpers.GetBufferStart(out1Grad);
            CUdeviceptr out2GradPtr  = CudaHelpers.GetBufferStart(out2Grad);
            CUdeviceptr alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
            CUdeviceptr betaGradPtr  = CudaHelpers.GetBufferStart(betaGrad);
            CUdeviceptr inGradPtr    = CudaHelpers.GetBufferStart(inGrad);
            CUdeviceptr yPtr         = CudaHelpers.GetBufferStart(y);
            CUdeviceptr x1Ptr        = CudaHelpers.GetBufferStart(x1);
            CUdeviceptr x2Ptr        = CudaHelpers.GetBufferStart(x2);
            CUdeviceptr alphaPtr     = CudaHelpers.GetBufferStart(alpha);
            CUdeviceptr betaPtr      = CudaHelpers.GetBufferStart(beta);


            Invoke(context, cudaContext, "gAddLayerNormalizationGrad", grid, threads, threads.x * sizeof(float) * 4, CUstream.NullStream, out1GradPtr, out2GradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, x1Ptr, x2Ptr, alphaPtr, betaPtr, rows, cols, eps);
        }
Exemple #6
0
        private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val, bool addGrad = true)
        {
            CudaContext cudaContext = context.CudaContextForTensor(grad);

            cudaContext.SetCurrent();

            int  ndim        = grad.DimensionCount;
            long storageSize = TensorDimensionHelpers.GetStorageSize(grad.Sizes, grad.Strides);
            long cols        = grad.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            long rows = storageSize / cols;

            int iAddGrad = addGrad ? 1 : 0;

            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            CUdeviceptr gradPtr = CudaHelpers.GetBufferStart(grad);
            CUdeviceptr adjPtr  = CudaHelpers.GetBufferStart(adj);
            CUdeviceptr valPtr  = CudaHelpers.GetBufferStart(val);

            Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, threads.x * sizeof(float), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols, iAddGrad);
        }
Exemple #7
0
        // Testing getting device information via managedCuda
        private static void GetInformationAboutDevice()
        {
            // Number of devices
            var deviceCount = CudaContext.GetDeviceCount();

            Console.WriteLine(deviceCount + " Devices");

            if (deviceCount <= 0)
            {
                throw new Exception("No cuda device detected");
            }

            // Pick device based on performance.
            var deviceByFlops = CudaContext.GetMaxGflopsDeviceId();

            Console.WriteLine("Unit {0} has the most Gflops", deviceByFlops);

            var deviceProperties = CudaContext.GetDeviceInfo(deviceByFlops);

            Console.WriteLine("And has the following properties: ");
            Console.WriteLine(deviceProperties.DeviceName);
            Console.WriteLine("Can execute concurrent kernels: " + deviceProperties.ConcurrentKernels);
            Console.WriteLine("Multi processor count: " + deviceProperties.MultiProcessorCount);
            Console.WriteLine("Clockrate (mhz): " + (int)deviceProperties.ClockRate / 1000.0);
            Console.WriteLine("Total global memory (MB): " + deviceProperties.TotalGlobalMemory / 1000000);
            Console.WriteLine("Is integrated: " + deviceProperties.Integrated);
            Console.WriteLine("Max block dimension: " + deviceProperties.MaxGridDim);
            Console.WriteLine("Max block dimension: " + deviceProperties.MaxBlockDim);
            Console.WriteLine("Max threads per block: " + deviceProperties.MaxThreadsPerBlock);
            Console.WriteLine("Max threads per multiprocessor: " + deviceProperties.MaxThreadsPerMultiProcessor);
            Console.WriteLine("Max shared mem block can use (b): " + deviceProperties.SharedMemoryPerBlock);
            Console.WriteLine("If device can do mem copy and kernel execution: " + deviceProperties.GpuOverlap);
            Console.WriteLine("can map memory adress space on host and device: " + deviceProperties.CanMapHostMemory);
        }
Exemple #8
0
        private void AddLayerNormGrad(TSCudaContext context, Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            CudaContext cudaContext = context.CudaContextForTensor(inGrad);

            cudaContext.SetCurrent();

            long rows = inGrad.Sizes[0];
            long cols = inGrad.Sizes[1];

            int  ndim     = inGrad.DimensionCount;
            long num_rows = 1;

            for (int dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= inGrad.Sizes[dim];
            }

            dim3 threads = new dim3((uint)Math.Min(512, num_rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            CUdeviceptr out1GradPtr  = CudaHelpers.GetBufferStart(out1Grad);
            CUdeviceptr out2GradPtr  = CudaHelpers.GetBufferStart(out2Grad);
            CUdeviceptr alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
            CUdeviceptr betaGradPtr  = CudaHelpers.GetBufferStart(betaGrad);
            CUdeviceptr inGradPtr    = CudaHelpers.GetBufferStart(inGrad);
            CUdeviceptr yPtr         = CudaHelpers.GetBufferStart(y);
            CUdeviceptr x1Ptr        = CudaHelpers.GetBufferStart(x1);
            CUdeviceptr x2Ptr        = CudaHelpers.GetBufferStart(x2);
            CUdeviceptr alphaPtr     = CudaHelpers.GetBufferStart(alpha);
            CUdeviceptr betaPtr      = CudaHelpers.GetBufferStart(beta);


            Invoke(context, cudaContext, "gAddLayerNormalizationGrad", grid, threads, threads.x * sizeof(float) * 4, CUstream.NullStream, out1GradPtr, out2GradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, x1Ptr, x2Ptr, alphaPtr, betaPtr, rows, cols, eps);
        }
Exemple #9
0
        //private CudaKernel kernel1;

        //public Class1()
        //{
        //    //int deviceID = 0;

        //    //CudaContext ctx = new CudaContext(deviceID);
        //    //CUmodule cumodule = ctx.LoadModulePTX(@"C:\work\Sobel\TestCuda\x64\Debug\kernel.ptx");
        //    //kernel1 = new CudaKernel("_Z9matrixSumPdS_iii", cumodule, ctx);
        //}

        public static double[,] TestMatrix(double[][,] a)
        {
            using (CudaContext ctx = new CudaContext(0))
            {
                CUmodule cumodule = ctx.LoadModule(@"C:\work\Sobel\TestCuda\x64\Debug\kernel.ptx");
                var      kernel   = new CudaKernel("_Z9matrixSumPdS_iii", cumodule, ctx);

                int dimZ = a.Length;
                int dimX = a[0].GetLength(0);
                int dimY = a[0].GetLength(1);

                kernel.GridDimensions  = new dim3(28, 28, 1);
                kernel.BlockDimensions = new dim3(1, 1, 1);
                //kernel.BlockDimensions = new dim3(dimX, dimY, 1);

                // Allocate vectors in device memory and copy vectors from host memory to device memory
                CudaDeviceVariable <double> dA = a.ToLinearArray();
                //CudaDeviceVariable<double> dB = ToLinearArray(b);
                CudaDeviceVariable <double> dC = new CudaDeviceVariable <double>(dimX * dimY);

                // Invoke kernel
                kernel.Run(dA.DevicePointer, dC.DevicePointer, dimX, dimY, dimZ);

                // Copy result from device memory to host memory
                double[] c = dC;

                //ctx.FreeMemory(dC.DevicePointer);
                //ctx.FreeMemory(dA.DevicePointer);
                //ctx.Dispose();

                return(ToMultyArray(c, dimX));
            }
        }
Exemple #10
0
        private void AddLayerNorm(TSCudaContext context, Tensor result, Tensor src1, Tensor src2, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            CudaContext cudaContext = context.CudaContextForTensor(src1);

            cudaContext.SetCurrent();

            long rows = src1.Sizes[0];
            long cols = src1.Sizes[1];

            int  ndim     = src1.DimensionCount;
            long num_rows = 1;

            for (int dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= src1.Sizes[dim];
            }

            dim3 threads = new dim3((uint)Math.Min(512, num_rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
            CUdeviceptr src1Ptr   = CudaHelpers.GetBufferStart(src1);
            CUdeviceptr src2Ptr   = CudaHelpers.GetBufferStart(src2);
            CUdeviceptr alphaPtr  = CudaHelpers.GetBufferStart(alpha);
            CUdeviceptr betaPtr   = CudaHelpers.GetBufferStart(beta);


            Invoke(context, cudaContext, "gAddLNormalization", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, src1Ptr, src2Ptr, alphaPtr, betaPtr, rows, cols, eps);
        }
Exemple #11
0
        private void Softmax(TSCudaContext context, Tensor result, Tensor src)
        {
            CudaContext cudaContext = context.CudaContextForTensor(src);

            cudaContext.SetCurrent();

            long rows = src.Sizes[0];
            long cols = src.Sizes[1];

            int  ndim     = src.DimensionCount;
            long num_rows = 1;

            for (int dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= src.Sizes[dim];
            }

            dim3 threads = new dim3((uint)Math.Min(512, num_rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
            CUdeviceptr srcPtr    = CudaHelpers.GetBufferStart(src);

            Invoke(context, cudaContext, "gSoftmax", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, srcPtr, rows, cols);
        }
Exemple #12
0
        private void RMSProp(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
        {
            CudaContext cudaContext = context.CudaContextForTensor(weight);

            cudaContext.SetCurrent();

            long rows = weight.Sizes[0];
            long cols = weight.Sizes[1];

            int  ndim     = weight.DimensionCount;
            long num_rows = 1;

            for (int dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= weight.Sizes[dim];
            }

            dim3 threads = new dim3((uint)Math.Min(512, num_rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            CUdeviceptr weightPtr   = CudaHelpers.GetBufferStart(weight);
            CUdeviceptr gradientPtr = CudaHelpers.GetBufferStart(gradient);
            CUdeviceptr cachePtr    = CudaHelpers.GetBufferStart(cache);

            Invoke(context, cudaContext, "RMSProp", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, cachePtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
        }
Exemple #13
0
 public CUDAPrefixScan(CUmodule module, CudaContext context)
 {
     this.context = context;
     kernelScanExclusiveShared  = new CudaKernel("scanExclusiveShared", module, context);
     kernelScanExclusiveShared2 = new CudaKernel("scanExclusiveShared2", module, context);
     kernelUniformUpdate        = new CudaKernel("uniformUpdate", module, context);
 }
Exemple #14
0
        public static void blaa()
        {
            int num = 10;
            //NewContext creation
            CudaContext cntxt = new CudaContext();

            //Module loading from precompiled .ptx in a project output folder
            CUmodule cumodule = cntxt.LoadModule("kernel.ptx");

            //_Z9addKernelPf - function name, can be found in *.ptx file
            CudaKernel addWithCuda = new CudaKernel("_Z9addKernelPf", cumodule, cntxt);

            //Create device array for data
            CudaDeviceVariable <float> vec1_device = new CudaDeviceVariable <float>(num);

            //Create arrays with data
            float[] vec1 = new float[num];

            //Copy data to device
            vec1_device.CopyToDevice(vec1);

            //Set grid and block dimensions
            addWithCuda.GridDimensions  = new dim3(8, 1, 1);
            addWithCuda.BlockDimensions = new dim3(512, 1, 1);

            //Run the kernel
            addWithCuda.Run(
                vec1_device.DevicePointer);

            //Copy data from device
            vec1_device.CopyToHost(vec1);
        }
Exemple #15
0
 public PtrToMemory(CudaContext context, IDeviceMemoryPtr rootBlock, CUdeviceptr ptr, SizeT size)
 {
     _context   = context;
     _ptr       = new CudaDeviceVariable <float>(ptr, size);
     _rootBlock = rootBlock;
     rootBlock.AddRef();
 }
Exemple #16
0
        public static float2[] calculateCudaFFT(float[] h_dataIn)
        {
            CudaContext cntxt = new CudaContext();
            //Caution: Array sizes matter! Based on CUFFFT-Documentation...
            int size_real    = h_dataIn.Length;
            int size_complex = (int)Math.Floor(size_real / 2.0) + 1;

            //Crating FFT Plan
            CudaFFTPlanMany fftPlan = new CudaFFTPlanMany(1, new int[] { size_real }, 1, cufftType.R2C);

            //Size of d_data must be padded for inplace R2C transforms: size_complex * 2 and not size_real
            CudaDeviceVariable <float> d_data = new CudaDeviceVariable <float>(size_complex * 2);

            //device allocation and host have different sizes, why the amount of data must be given explicitly for copying:
            d_data.CopyToDevice(h_dataIn, 0, 0, size_real * sizeof(float));

            //executa plan
            fftPlan.Exec(d_data.DevicePointer, TransformDirection.Forward);

            //Output to host, either as float2 or float, but array sizes must be right!
            float2[] h_dataOut  = new float2[size_complex];
            float[]  h_dataOut2 = new float[size_complex * 2];
            d_data.CopyToHost(h_dataOut);
            d_data.CopyToHost(h_dataOut2);
            fftPlan.Dispose();
            return(h_dataOut);
        }
Exemple #17
0
        public DenoiseAndDemoisaic(int tileSize, CudaContext ctx, CUmodule mod, bool UseCUDNN)
        {
            _tileSize = tileSize;
            start     = new StartLayer(tileSize, tileSize, 3, 1);
            final     = new FinalLayer(tileSize - 16, tileSize - 16, 3, 1, ctx, mod);

            if (UseCUDNN)
            {
                CudaDNNContext cuddn = new CudaDNNContext();
                conv1 = new ConvolutionalLayer(tileSize, tileSize, 3, tileSize - 8, tileSize - 8, 64, 1, 9, 9, ConvolutionalLayer.Activation.PRelu, cuddn, ctx, mod);
                conv2 = new ConvolutionalLayer(tileSize - 8, tileSize - 8, 64, tileSize - 12, tileSize - 12, 64, 1, 5, 5, ConvolutionalLayer.Activation.PRelu, cuddn, ctx, mod);
                conv3 = new ConvolutionalLayer(tileSize - 12, tileSize - 12, 64, tileSize - 16, tileSize - 16, 3, 1, 5, 5, ConvolutionalLayer.Activation.None, cuddn, ctx, mod);
                start.ConnectFollowingLayer(conv1);
                conv1.ConnectFollowingLayer(conv2);
                conv2.ConnectFollowingLayer(conv3);
                conv3.ConnectFollowingLayer(final);
            }
            else
            {
                conv1NPP = new ConvolutionalLayerNPP(tileSize, tileSize, 3, tileSize - 8, tileSize - 8, 64, 1, 9, 9, ConvolutionalLayerNPP.Activation.PRelu, ctx, mod);
                conv2NPP = new ConvolutionalLayerNPP(tileSize - 8, tileSize - 8, 64, tileSize - 12, tileSize - 12, 64, 1, 5, 5, ConvolutionalLayerNPP.Activation.PRelu, ctx, mod);
                conv3NPP = new ConvolutionalLayerNPP(tileSize - 12, tileSize - 12, 64, tileSize - 16, tileSize - 16, 3, 1, 5, 5, ConvolutionalLayerNPP.Activation.None, ctx, mod);
                start.ConnectFollowingLayer(conv1NPP);
                conv1NPP.ConnectFollowingLayer(conv2NPP);
                conv2NPP.ConnectFollowingLayer(conv3NPP);
                conv3NPP.ConnectFollowingLayer(final);
            }

            tileAsPlanes = new CudaDeviceVariable <float>(tileSize * tileSize * 3);
            tile         = new NPPImage_32fC3(tileSize, tileSize);
        }
Exemple #18
0
    public static void CalculateNeabours <T>
        (CudaContext context,
        DeviceDataSet <T> teaching,
        DeviceDataSet <T> test,
        CudaDeviceVariable <int> calculatedNeabours,
        int threadsPerBlock
        ) where T : struct
    {
        var kernel = context.LoadKernel("kernels/VectorReduction.ptx", "calculateNearestNeabours");

        kernel.GridDimensions  = test.length / threadsPerBlock + 1;
        kernel.BlockDimensions = threadsPerBlock;

        kernel.SetConstantVariable("testVectorsCount", test.length);
        kernel.SetConstantVariable("teachingVectorsCount", teaching.length);
        kernel.SetConstantVariable("attributeCount", teaching.attributeCount);

        using (var deviceDistanceMemory =
                   new CudaDeviceVariable <float>(teaching.length * test.length))
        {
            kernel.Run(
                teaching.vectors.DevicePointer,
                test.vectors.DevicePointer,
                deviceDistanceMemory.DevicePointer,
                calculatedNeabours.DevicePointer
                );
            Thrust.sort_by_key_multiple(deviceDistanceMemory, calculatedNeabours, teaching.length, test.length);
        }
    }
Exemple #19
0
    public VectorReductionAccuracy(CudaContext context, DeviceDataSet <int> teaching, DeviceDataSet <int> test, int popSize)
    {
        this.teaching = teaching;
        this.test     = test;
        this.popSize  = popSize;
        this.context  = context;

        calculatedNeabours = new CudaDeviceVariable <int>(teaching.length * test.length);
        deviceAccuracy     = new CudaDeviceVariable <float>(popSize);

        Profiler.Start("calculate neabours");
        Neabours.CalculateNeabours(context, teaching, test, calculatedNeabours, ThreadsPerBlock);
        Profiler.Stop("calculate neabours");


        accuracyKernel = context.LoadKernel("kernels/VectorReduction.ptx", "calculateAccuracy");
        dim3 gridDimension = new dim3()
        {
            x = (uint)(test.length / ThreadsPerBlock + 1),
            y = (uint)popSize,
            z = 1
        };

        accuracyKernel.GridDimensions  = gridDimension;
        accuracyKernel.BlockDimensions = ThreadsPerBlock;

        accuracyKernel.SetConstantVariable("testVectorsCount", test.length);
        accuracyKernel.SetConstantVariable("teachingVectorsCount", teaching.length);
        accuracyKernel.SetConstantVariable("attributeCount", teaching.attributeCount);
        accuracyKernel.SetConstantVariable("genLength", teaching.length);

        K           = 3;
        CountToPass = 2;
    }
Exemple #20
0
        private static dim3 GetContigReduceBlock(CudaContext cudaContext, long numSlices, long reductionSize)
        {
            // If the number of slices is low but the reduction dimension size
            // is high, then we should increase block size for greater parallelism.
            // Aim for at least 32 warps per SM (assume 15 SMs; don't bother
            // inquiring the real number for now).
            var smCount  = 15;
            var maxWarps = 4; // better occupancy if many blocks are around

            // For numSlices > smCount * 8, there are > 32 warps active per SM.
            if (numSlices < smCount * 8)
            {
                maxWarps = 8;
                if (numSlices < smCount * 4)
                {
                    maxWarps = 16;
                    if (numSlices < smCount * 2)
                    {
                        maxWarps = 32;
                    }
                }
            }

            // Scale up block size based on the reduction dimension size
            var warpsInReductionSize = ApplyUtils.CeilDiv(reductionSize, 32);
            var numWarps             =
                warpsInReductionSize > maxWarps ? maxWarps : (int)warpsInReductionSize;

            var targetSize = numWarps * 32;

            targetSize = Math.Min(targetSize, (int)cudaContext.GetDeviceInfo().MaxBlockDim.x);
            return(new dim3(targetSize));
        }
Exemple #21
0
        private int findGraphicsGPU(out string devName)
        {
            int    nGraphicsGPU = 0;
            int    deviceCount = 0;
            bool   bFoundGraphics = false;
            string firstGraphicsName = string.Empty, temp;

            devName = string.Empty;

            deviceCount = CudaContext.GetDeviceCount();

            // This function call returns 0 if there are no CUDA capable devices.
            if (deviceCount == 0)
            {
                Console.WriteLine("There are no device(s) supporting CUDA");
                return(0);
            }
            else
            {
                Console.WriteLine("> Found " + deviceCount + " CUDA Capable Device(s)");
            }

            for (int dev = 0; dev < deviceCount; dev++)
            {
                temp = CudaContext.GetDeviceName(dev);
                bool          bGraphics = !temp.Contains("Tesla");
                StringBuilder sb        = new StringBuilder();
                sb.Append("> ");
                if (bGraphics)
                {
                    sb.Append("Graphics");
                }
                else
                {
                    sb.Append("Compute");
                }

                sb.Append("\t\tGPU ").Append(dev).Append(": ").Append(CudaContext.GetDeviceName(dev));
                Console.WriteLine(sb.ToString());

                if (bGraphics)
                {
                    if (!bFoundGraphics)
                    {
                        firstGraphicsName = temp;
                    }
                    nGraphicsGPU++;
                }
            }

            if (nGraphicsGPU != 0)
            {
                devName = firstGraphicsName;
            }
            else
            {
                devName = "this hardware";
            }
            return(nGraphicsGPU);
        }
Exemple #22
0
        private void ReduceIndexOuterDim(TSCudaContext context, Tensor resultValues, Tensor resultIndices, Tensor src, int dimension, Tuple <float, float> init, string baseKernelName)
        {
            CudaContext cudaContext = context.CudaContextForTensor(src);

            int  ndim      = src.DimensionCount;
            long num_orows = 1;

            for (int dim = 0; dim < dimension; dim++)
            {
                num_orows *= src.Sizes[dim];
            }
            long row_size  = src.Sizes[dimension];
            long num_irows = 1;

            for (int dim = dimension + 1; dim < ndim; dim++)
            {
                num_irows *= src.Sizes[dim];
            }

            dim3 threads    = new dim3((uint)Math.Min(512, num_irows));
            int  maxGridDim = 1024;
            dim3 grid       = new dim3((uint)Math.Min(maxGridDim, num_orows), (uint)Math.Min(maxGridDim, ApplyUtils.CeilDiv(num_irows, threads.x)));

            CUdeviceptr resultValPtr = CudaHelpers.GetBufferStart(resultValues);
            CUdeviceptr resultIdxPtr = CudaHelpers.GetBufferStart(resultIndices);
            CUdeviceptr srcPtr       = CudaHelpers.GetBufferStart(src);

            string kernelName = "outer_index_" + baseKernelName;

            Invoke(context, cudaContext, kernelName, grid, threads, 0, CUstream.NullStream, resultValPtr, resultIdxPtr, srcPtr, num_orows, num_irows, row_size, init.Item1, init.Item2);
        }
Exemple #23
0
        /// <summary>
        /// Initializes a new instance of the <see cref="TSCudaContext"/> class.
        /// </summary>
        public TSCudaContext()
        {
            try
            {
                this.deviceCount = CudaContext.GetDeviceCount();
            }
            catch
            {
                // CudaContext.GetDeviceCount() throws if CUDA drivers are not installed
                this.deviceCount = 0;
            }

            this.devices = Enumerable.Repeat(0, deviceCount)
                           .Select(x => new DeviceState(x))
                           .ToArray();

            if (deviceCount > 0)
            {
                p2pAccess = EnablePeerAccess(devices.Select(x => x.CudaContext).ToArray(), devices[0].CudaContext);
            }
            else
            {
                p2pAccess = new bool[0, 0];
            }

            this.diskCache = new RuntimeCompiler.KernelDiskCache(Path.Combine(Environment.CurrentDirectory, CacheDir));
            this.compiler  = new RuntimeCompiler.CudaCompiler(diskCache);

            OpRegistry.RegisterAssembly(Assembly.GetExecutingAssembly());
        }
Exemple #24
0
        public CudaKernel Get(CudaContext context, byte[] ptx, string kernelName)
        {
            lock (locker)
            {
                try
                {
                    if (activeKernels.TryGetValue(Tuple.Create(context, ptx, kernelName), out CudaKernel value))
                    {
                        return(value);
                    }
                    else
                    {
                        value = context.LoadKernelPTX(ptx, kernelName);
                        activeKernels.Add(Tuple.Create(context, ptx, kernelName), value);
                        return(value);
                    }
                }
                catch (Exception err)
                {
                    Logger.WriteLine(Logger.Level.err, ConsoleColor.Red, $"Exception: '{err.Message}'");
                    Logger.WriteLine(Logger.Level.err, ConsoleColor.Red, $"Call stack: '{err.StackTrace}'");

                    throw err;
                }
            }
        }
Exemple #25
0
        static void InitKernels()
        {
            cntxt = new CudaContext();
            CUmodule cumodule = cntxt.LoadModulePTX(@"C:\work\Sobel\CudaTest\x64\Debug\kernel.ptx");

            matrixSumCude = new CudaKernel("_Z15matrixSumKernelPdPKdiii", cumodule, cntxt);
        }
Exemple #26
0
        // Testing managed CUDA call
        private static void RunCudaWithAKernel()
        {
            // C# Cuda code to call kernel

            int         N            = 50000;
            int         deviceID     = 0;
            CudaContext ctx          = new CudaContext(deviceID);
            CudaKernel  kernel       = ctx.LoadKernel("kernel_x64.ptx", "VecAdd");
            int         numOfThreads = 256;

            kernel.GridDimensions  = (N + numOfThreads - 1) / numOfThreads;
            kernel.BlockDimensions = numOfThreads;

            // allocate memory in host (not gpu)
            var h_A = InitWithData(N, numOfThreads * 4);
            var h_B = InitWithData(N, numOfThreads);

            // Allocate vectors in device memory and copy from host to device.
            CudaDeviceVariable <float> d_A = h_A;
            CudaDeviceVariable <float> d_B = h_B;
            CudaDeviceVariable <float> d_C = new CudaDeviceVariable <float>(N);

            //Invoke kernel
            kernel.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, N);

            Console.WriteLine("kernel has runeth");
            //Copy from memory of device to host.
            float[] h_C = d_C;
        }
        public static void Invoke(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string baseName, params object[] args)
        {
            ThrowIfAnyTensorInvalid(args);

            cudaContext.SetCurrent();

            CudaDeviceProperties deviceInfo = context.DeviceInfoForContext(cudaContext);

            IEnumerable <Tensor> allTensors = args.OfType <Tensor>();
            Tensor firstTensor       = allTensors.First();
            long   elementCount      = firstTensor.ElementCount();
            ApplySpecialization spec = new ApplySpecialization(allTensors.ToArray());

            ConvertTensorArgs.Convert(cudaContext, spec.Use32BitIndices, args);

            ManagedCuda.VectorTypes.dim3 block = ApplyUtils.GetApplyBlock();
            ManagedCuda.VectorTypes.dim3 grid  = ApplyUtils.GetApplyGrid(deviceInfo, elementCount);

            string     fullKernelName = PermutationGenerator.GetMangledName(baseName, spec);
            CudaKernel kernel         = context.KernelCache.Get(cudaContext, ptx, fullKernelName);

            kernel.GridDimensions  = grid;
            kernel.BlockDimensions = block;
            kernel.RunAsync(CUstream.NullStream, args);
        }
Exemple #28
0
    public void GenerateRandomNumbers()
    {
        CleanupResources();

        //Init Cuda context
        ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId());

        // Allocate input vectors h_A and h_B in host memory
        h_A = new float[Count];
        h_B = new float[Count];

        // Initialize input vectors
        RandomInit(h_A, Count);
        RandomInit(h_B, Count);

        // Allocate vectors in device memory and copy vectors from host memory to device memory
        // Notice the new syntax with implicit conversion operators: Allocation of device memory and data copy is one operation.
        d_A = h_A;
        d_B = h_B;
        //d_C = new CudaDeviceVariable<float>(Count);

        // Allocate Shared Memory. The GPU will write here
        // A = new CudaManagedMemory_float(Count, CUmemAttach_flags.Global);
        // B = new CudaManagedMemory_float(Count, CUmemAttach_flags.Global);
        C = new CudaManagedMemory_float(Count, CUmemAttach_flags.Global);
    }
        /// <summary>
        /// Invokes the specified kernels.
        /// </summary>
        /// <param name="kernels">The kernels.</param>
        /// <param name="context">The context.</param>
        /// <param name="cudaContext">The cuda context.</param>
        /// <param name="result">The result.</param>
        /// <param name="src">The source.</param>
        public static void Invoke(FillCopyKernels kernels, TSCudaContext context, CudaContext cudaContext, NDArray result, NDArray src)
        {
            var ptx          = kernels.GetPtx(context.Compiler);
            var elementCount = result.ElementCount();

            ApplyOpInvoke.Invoke(context, cudaContext, ptx, "copy", result, src, elementCount);
        }
Exemple #30
0
        // General GPU Device CUDA Initialization
        static int gpuDeviceInit(int devID)
        {
            int deviceCount = CudaContext.GetDeviceCount();

            if (deviceCount == 0)
            {
                Console.Write("gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
                Environment.Exit(-1);
            }
            if (devID < 0)
            {
                devID = 0;
            }
            if (devID > deviceCount - 1)
            {
                Console.Write("\n");
                Console.Write(">> {0} CUDA capable GPU device(s) detected. <<\n", deviceCount);
                Console.Write(">> gpuDeviceInit (-device={0}) is not a valid GPU device. <<\n", devID);
                Console.Write("\n");
                return(-devID);
            }


            if (CudaContext.GetDeviceComputeCapability(devID).Major < 1)
            {
                Console.Write("gpuDeviceInit(): GPU device does not support CUDA.\n");
                Environment.Exit(-1);
            }
            ctx = new CudaContext(devID);
            Console.Write("> gpuDeviceInit() CUDA device [{0}]: {1}\n", devID, ctx.GetDeviceName());
            return(devID);
        }
Exemple #31
0
        private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val, bool addGrad = true)
        {
            CudaContext cudaContext = context.CudaContextForTensor(grad);

            cudaContext.SetCurrent();

            long rows     = grad.Sizes[0];
            long cols     = grad.Sizes[1];
            int  iAddGrad = addGrad ? 1 : 0;

            int  ndim     = grad.DimensionCount;
            long num_rows = 1;

            for (int dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= grad.Sizes[dim];
            }

            dim3 threads = new dim3((uint)Math.Min(512, num_rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            CUdeviceptr gradPtr = CudaHelpers.GetBufferStart(grad);
            CUdeviceptr adjPtr  = CudaHelpers.GetBufferStart(adj);
            CUdeviceptr valPtr  = CudaHelpers.GetBufferStart(val);

            Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, threads.x * sizeof(float), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols, iAddGrad);
        }
Exemple #32
0
        public static void InitKernels()
        {
            CudaContext cntxt = new CudaContext();

            //CUmodule cumodule = cntxt.LoadModule(@"C:\Users\Michał\Documents\Visual Studio 2013\Projects\cuda\Projekt cuda\Projekt cuda\Debug\kernel.ptx");
            CUmodule cumodule = cntxt.LoadModule(@"D:\Grafika\cuda\Projekt cuda\Projekt cuda\Debug\kernel.ptx");
            addWithCuda = new CudaKernel("_Z6kerneliiPi", cumodule, cntxt);
        }
Exemple #33
0
        public FxCuda(Boolean initUtils=false)
        {
            //Init Cuda context
            ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId());

            // init utils
            if (initUtils)
                Utils = new CudaUtils(this);
        }
 public CudaModuleHelper(CudaContext context, string file)
 {
     Context = context;
     Module = context.LoadModule(file);
     PtxFile = file;
     functionNames = File.ReadAllLines(file)
         .Where(x => x.Contains("// .globl"))
         .Select(x => x.Replace("// .globl", "").Trim())
         .ToArray();
 }
Exemple #35
0
        static void InitKernels()
        {
            //max thread number - 65534x256=16776704
            _matrixSize = 256;
            _threadsPerBlock = 256;

            CleanUpResources();
            _cnContext = new CudaContext(CudaContext.GetMaxGflopsDeviceId());
            CUmodule cumodule = _cnContext.LoadModule(@"\Kernel\kernel.ptx");
            _multiplyTwoVectorWithCuda = new CudaKernel("_Z6kernel_", cumodule, _cnContext);
        }
        public void ContextWithStream()
        {
            using ( var cuda = new CudaContext() )
            using ( var stream = new CudaStream(CUStreamFlags.Default) )
            {
                using (var context = CudnnContext.Create(stream))
                {
                    Assert.True(context.IsInitialized);

                    var streamId = default (CUstream);
                    CudnnContext.Invoke(() => CudnnNativeMethods.cudnnGetStream(context.Handle, out streamId));

                    Assert.Equal(stream.Stream, streamId);
                }
            }
        }
        public uint[] Run()
        {
            var ptx = @"C:\Src\_Tree\SmallPrograms\Buddhabrot\Buddhabrot.Cuda70\x64\Release\Buddhabrot.ptx";

            var context = new CudaContext();
            var module = new CudaModuleHelper(context, ptx);

            var init = module.GetKernel("Init");
            var setSettings = module.GetKernel("SetSettings");
            var runBuddha = module.GetKernel("RunBuddha");

            var nBlocks = 4196;
            var nThreads = 256;

            var dSettings = context.AllocateMemoryFor(settings);
            context.CopyToDevice(dSettings, settings);

            var array = new uint[settings.Width * settings.Height];
            var dState = context.AllocateMemory(nThreads * nBlocks * SizeOfCurandState);
            var dArray = context.AllocateMemoryFor(array);
            context.CopyToDevice(dArray, array);

            init.Launch(nBlocks, nThreads, dState);
            setSettings.Launch(1, 1, dSettings);

            Console.WriteLine("Starting...");
            var sw = Stopwatch.StartNew();
            long i = 0;

            while (!IsStopping)
            {
                runBuddha.Launch(nBlocks, nThreads, dArray, dState);

                double count = (++i * nBlocks * nThreads);
                if (i % 5 == 0)
                {
                    Console.WriteLine("Generated {0:0.0} Million samples in {1:0.000} sec", count / 1000000.0, sw.ElapsedMilliseconds / 1000.0);
                }

                if (maxSamples.HasValue && count >= maxSamples)
                    break;
            }

            context.CopyToHost(array, dArray);
            return array;
        }
Exemple #38
0
 static void Test(byte[] ptxFile)
 {
     const int size = 16;
     var context = new CudaContext();
     var kernel = context.LoadKernelPTX(ptxFile, "kernel");
     var memory = context.AllocateMemory(4 * size);
     var gpuMemory = new CudaDeviceVariable<int>(memory);
     var cpuMemory = new int[size];
     for (var i = 0; i < size; i++)
         cpuMemory[i] = i - 2;
     gpuMemory.CopyToDevice(cpuMemory);
     kernel.BlockDimensions = 4;
     kernel.GridDimensions = 4;
     kernel.Run(memory);
     gpuMemory.CopyToHost(cpuMemory);
     for (var i = 0; i < size; i++)
         Console.WriteLine("{0} = {1}", i, cpuMemory[i]);
 }
        private void InitializeCUDA()
        {
            context = new CudaContext(CudaContext.GetMaxGflopsDevice(), graphicsDevice.ComPointer, CUCtxFlags.SchedAuto, CudaContext.DirectXVersion.D3D11);

            module = context.LoadModulePTX(@"Kernels\kernel.ptx");

            kernelPositionWeightNoiseCube = new CudaKernel("position_weight_noise_cube", module, context);
            kernelNormalAmbient = new CudaKernel("normal_ambient", module, context);
            kernelMarchingCubesCases = new CudaKernel("marching_cubes_cases", module, context);
            kernelMarchingCubesVertices = new CudaKernel("marching_cubes_vertices", module, context);
            kernelPositionWeightNoiseCubeWarp = new CudaKernel("position_weight_noise_cube_warp", module, context);
            kernelPositionWeightFormula = new CudaKernel("position_weight_formula", module, context);

            prefixScan = new CUDAPrefixScan(module, context);
        }
Exemple #40
0
        // General GPU Device CUDA Initialization
        static int gpuDeviceInit(int devID)
        {
            int deviceCount = CudaContext.GetDeviceCount();

            if (deviceCount == 0)
            {
                Console.Write("gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
                Environment.Exit(-1);
            }
            if (devID < 0)
                devID = 0;
            if (devID > deviceCount - 1)
            {
                Console.Write("\n");
                Console.Write(">> {0} CUDA capable GPU device(s) detected. <<\n", deviceCount);
                Console.Write(">> gpuDeviceInit (-device={0}) is not a valid GPU device. <<\n", devID);
                Console.Write("\n");
                return -devID;
            }

            if (CudaContext.GetDeviceComputeCapability(devID).Major < 1)
            {
                Console.Write("gpuDeviceInit(): GPU device does not support CUDA.\n");
                Environment.Exit(-1);
            }
            ctx = new CudaContext(devID);
            Console.Write("> gpuDeviceInit() CUDA device [{0}]: {1}\n", devID, ctx.GetDeviceName());
            return devID;
        }
Exemple #41
0
        static void Main(string[] args)
        {
            int cuda_device = 0;
            int nstreams = 4;               // number of streams for CUDA calls
            int nreps = 10;                 // number of times each experiment is repeated
            int n = 16 * 1024 * 1024;       // number of ints in the data set
            int nbytes = n * sizeof(int);   // number of data bytes
            dim3 threads, blocks;           // kernel launch configuration
            float elapsed_time, time_memcpy, time_kernel;   // timing variables
            float scale_factor = 1.0f;

            // allocate generic memory and pin it laster instead of using cudaHostAlloc()
            // Untested in C#, so stick to cudaHostAlloc().
            bool bPinGenericMemory = false; // we want this to be the default behavior
            CUCtxFlags device_sync_method = CUCtxFlags.BlockingSync; // by default we use BlockingSync

            int niterations;	// number of iterations for the loop inside the kernel

            ShrQATest.shrQAStart(args);

            Console.WriteLine("[ simpleStreams ]");

            foreach (var item in args)
            {
                if (item.Contains("help"))
                {
                    printHelp();
                    ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_PASSED);
                }
            }

            bPinGenericMemory = false;
            foreach (var item in args)
            {
                if (item.Contains("use_generic_memory"))
                {
                    bPinGenericMemory = true;
                }
            }

            for (int i = 0; i < args.Length; i++)
            {
                if (args[i].Contains("sync_method"))
                {
                    int temp = -1;
                    bool error = false;
                    if (i < args.Length - 1)
                    {
                        error = int.TryParse(args[i + 1], out temp);
                        switch (temp)
                        {
                            case 0:
                                device_sync_method = CUCtxFlags.SchedAuto;
                                break;
                            case 1:
                                device_sync_method = CUCtxFlags.SchedSpin;
                                break;
                            case 2:
                                device_sync_method = CUCtxFlags.SchedYield;
                                break;
                            case 4:
                                device_sync_method = CUCtxFlags.BlockingSync;
                                break;
                            default:
                                error = true;
                                break;
                        }
                    }
                    if (!error)
                    {
                        Console.Write("Specifying device_sync_method = {0}, setting reps to 100 to demonstrate steady state\n", sDeviceSyncMethod[(int)device_sync_method]);
                        nreps = 100;
                    }
                    else
                    {
                        Console.Write("Invalid command line option sync_method=\"{0}\"\n", temp);
                        ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED);
                    }
                }
            }

            int num_devices = CudaContext.GetDeviceCount();
            if(0==num_devices)
            {
                Console.Write("your system does not have a CUDA capable device, waiving test...\n");
                ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED);
            }
            cuda_device = CudaContext.GetMaxGflopsDeviceId();

            CudaDeviceProperties deviceProp = CudaContext.GetDeviceInfo(cuda_device);
            if ((1 == deviceProp.ComputeCapability.Major) && (deviceProp.ComputeCapability.Minor < 1))
            {
                Console.Write("{0} does not have Compute Capability 1.1 or newer. Reducing workload.\n", deviceProp.DeviceName);
            }

            if (deviceProp.ComputeCapability.Major >= 2)
            {
                niterations = 100;
            }
            else
            {
                if (deviceProp.ComputeCapability.Minor > 1)
                {
                    niterations = 5;
                }
                else
                {
                    niterations = 1; // reduced workload for compute capability 1.0 and 1.1
                }
            }

            // Check if GPU can map host memory (Generic Method), if not then we override bPinGenericMemory to be false
            // In .net we cannot allocate easily generic aligned memory, so <bPinGenericMemory> is always false in our case...
            if (bPinGenericMemory)
            {
                Console.Write("Device: <{0}> canMapHostMemory: {1}\n", deviceProp.DeviceName, deviceProp.CanMapHostMemory ? "Yes" : "No");
                if (deviceProp.CanMapHostMemory == false)
                {
                    Console.Write("Using cudaMallocHost, CUDA device does not support mapping of generic host memory\n");
                    bPinGenericMemory = false;
                }
            }

            // Anything that is less than 32 Cores will have scaled down workload
            scale_factor = Math.Max((32.0f / (ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * (float)deviceProp.MultiProcessorCount)), 1.0f);
            n = (int)Math.Round((float)n / scale_factor);

            Console.Write("> CUDA Capable: SM {0}.{1} hardware\n", deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor);
            Console.Write("> {0} Multiprocessor(s) x {1} (Cores/Multiprocessor) = {2} (Cores)\n",
                    deviceProp.MultiProcessorCount,
                    ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor),
                    ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * deviceProp.MultiProcessorCount);

            Console.Write("> scale_factor = {0:0.0000}\n", 1.0f / scale_factor);
            Console.Write("> array_size   = {0}\n\n", n);

            // enable use of blocking sync, to reduce CPU usage
            Console.Write("> Using CPU/GPU Device Synchronization method ({0})\n", sDeviceSyncMethod[(int)device_sync_method]);

            CudaContext ctx;
            if (bPinGenericMemory)
                ctx = new CudaContext(cuda_device, device_sync_method | CUCtxFlags.MapHost);
            else
                ctx = new CudaContext(cuda_device, device_sync_method);

            //Load Kernel image from resources
            string resName;
            if (IntPtr.Size == 8)
                resName = "simpleStreams_x64.ptx";
            else
                resName = "simpleStreams.ptx";

            string resNamespace = "simpleStreams";
            string resource = resNamespace + "." + resName;
            Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);
            if (stream == null) throw new ArgumentException("Kernel not found in resources.");

            CudaKernel init_array = ctx.LoadKernelPTX(stream, "init_array");

            // allocate host memory
            int c = 5;											// value to which the array will be initialized
            int[] h_a = null;									// pointer to the array data in host memory
            CudaPageLockedHostMemory<int> hAligned_a = null;	// pointer to the array data in host memory (aligned to MEMORY_ALIGNMENT)
            //Note: In .net we have two seperated arrays: One is in managed memory (h_a), the other one in unmanaged memory (hAligned_a).
            //In C++ hAligned_a would point somewhere inside the h_a array.
            AllocateHostMemory(bPinGenericMemory, ref h_a, ref hAligned_a, nbytes);

            Console.Write("\nStarting Test\n");

            // allocate device memory
            CudaDeviceVariable<int> d_c = c; //using new implicit cast to allocate memory and asign value
            CudaDeviceVariable<int> d_a = new CudaDeviceVariable<int>(nbytes / sizeof(int));

            CudaStream[] streams = new CudaStream[nstreams];
            for (int i = 0; i < nstreams; i++)
            {
                streams[i] = new CudaStream();
            }

            // create CUDA event handles
            // use blocking sync
            CudaEvent start_event, stop_event;
            CUEventFlags eventflags = ((device_sync_method == CUCtxFlags.BlockingSync) ? CUEventFlags.BlockingSync : CUEventFlags.Default);

            start_event = new CudaEvent(eventflags);
            stop_event = new CudaEvent(eventflags);

            // time memcopy from device
            start_event.Record();     // record in stream-0, to ensure that all previous CUDA calls have completed
            hAligned_a.AsyncCopyToDevice(d_a, streams[0].Stream);
            stop_event.Record();
            stop_event.Synchronize();   // block until the event is actually recorded
            time_memcpy = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("memcopy:\t{0:0.00}\n", time_memcpy);

            // time kernel
            threads = new dim3(512, 1);
            blocks = new dim3(n / (int)threads.x, 1);
            start_event.Record();
            init_array.BlockDimensions = threads;
            init_array.GridDimensions = blocks;
            init_array.RunAsync(streams[0].Stream, d_a.DevicePointer, d_c.DevicePointer, niterations);
            stop_event.Record();
            stop_event.Synchronize();
            time_kernel = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("kernel:\t\t{0:0.00}\n", time_kernel);

            //////////////////////////////////////////////////////////////////////
            // time non-streamed execution for reference
            threads = new dim3(512, 1);
            blocks = new dim3(n / (int)threads.x, 1);
            start_event.Record();
            for(int k = 0; k < nreps; k++)
            {
                init_array.BlockDimensions = threads;
                init_array.GridDimensions = blocks;
                init_array.Run(d_a.DevicePointer, d_c.DevicePointer, niterations);
                hAligned_a.SynchronCopyToHost(d_a);
            }
            stop_event.Record();
            stop_event.Synchronize();
            elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("non-streamed:\t{0:0.00} ({1:00} expected)\n", elapsed_time / nreps, time_kernel + time_memcpy);

            //////////////////////////////////////////////////////////////////////
            // time execution with nstreams streams
            threads = new dim3(512, 1);
            blocks = new dim3(n / (int)(nstreams * threads.x), 1);
            byte[] memset = new byte[nbytes]; // set host memory bits to all 1s, for testing correctness
            for (int i = 0; i < nbytes; i++)
            {
                memset[i] = 255;
            }
            System.Runtime.InteropServices.Marshal.Copy(memset, 0, hAligned_a.PinnedHostPointer, nbytes);
            d_a.Memset(0); // set device memory to all 0s, for testing correctness

            start_event.Record();
            for(int k = 0; k < nreps; k++)
            {
                init_array.BlockDimensions = threads;
                init_array.GridDimensions = blocks;
                // asynchronously launch nstreams kernels, each operating on its own portion of data
                for(int i = 0; i < nstreams; i++)
                    init_array.RunAsync(streams[i].Stream, d_a.DevicePointer + i * n / nstreams * sizeof(int), d_c.DevicePointer, niterations);

                // asynchronously launch nstreams memcopies.  Note that memcopy in stream x will only
                //   commence executing when all previous CUDA calls in stream x have completed
                for (int i = 0; i < nstreams; i++)
                    hAligned_a.AsyncCopyFromDevice(d_a, i * n / nstreams * sizeof(int), i * n / nstreams * sizeof(int), nbytes / nstreams, streams[i].Stream);
            }
            stop_event.Record();
            stop_event.Synchronize();
            elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("{0} streams:\t{1:0.00} ({2:0.00} expected with compute capability 1.1 or later)\n", nstreams, elapsed_time / nreps, time_kernel + time_memcpy / nstreams);

            // check whether the output is correct
            Console.Write("-------------------------------\n");
            //We can directly access data in hAligned_a using the [] operator, but copying
            //data first to h_a is faster.
            System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, nbytes / sizeof(int));

            bool bResults = correct_data(h_a, n, c*nreps*niterations);

            // release resources
            for(int i = 0; i < nstreams; i++) {
                streams[i].Dispose();
            }
            start_event.Dispose();
            stop_event.Dispose();

            hAligned_a.Dispose();
            d_a.Dispose();
            d_c.Dispose();
            CudaContext.ProfilerStop();
            ctx.Dispose();

            ShrQATest.shrQAFinishExit(args, bResults ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED);
        }
Exemple #42
0
        private void InitializeD3D()
        {
            // Create the D3D object.
            d3d = new Direct3DEx();

            PresentParameters pp = new PresentParameters();
            pp.BackBufferWidth = 512;
            pp.BackBufferHeight = 512;
            pp.BackBufferFormat = Format.Unknown;
            pp.BackBufferCount = 0;
            pp.Multisample = MultisampleType.None;
            pp.MultisampleQuality = 0;
            pp.SwapEffect = SwapEffect.Discard;
            pp.DeviceWindowHandle = panel1.Handle;
            pp.Windowed = true;
            pp.EnableAutoDepthStencil = false;
            pp.AutoDepthStencilFormat = Format.Unknown;
            pp.PresentationInterval = PresentInterval.Default;

            bDeviceFound = false;
            CUdevice[] cudaDevices = null;
            for (g_iAdapter = 0; g_iAdapter < d3d.AdapterCount; g_iAdapter++)
            {
                device = new DeviceEx(d3d, d3d.Adapters[g_iAdapter].Adapter, DeviceType.Hardware, panel1.Handle, CreateFlags.HardwareVertexProcessing | CreateFlags.Multithreaded, pp);
                try
                {
                    cudaDevices = CudaContext.GetDirectXDevices(device.ComPointer, CUd3dXDeviceList.All, CudaContext.DirectXVersion.D3D9);
                    bDeviceFound = cudaDevices.Length > 0;
                    Console.WriteLine("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter
                        + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 and CUDA.");
                    break;
                }
                catch (CudaException)
                {
                    //No Cuda device found for this Direct3D9 device
                    Console.WriteLine("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter
                        + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 but not CUDA.");
                }
            }

            // we check to make sure we have found a cuda-compatible D3D device to work on
            if (!bDeviceFound)
            {
                Console.WriteLine("No CUDA-compatible Direct3D9 device available");
                if (device != null)
                    device.Dispose();
                Close();
                return;
            }

            ctx = new CudaContext(cudaDevices[0], device.ComPointer, CUCtxFlags.BlockingSync, CudaContext.DirectXVersion.D3D9);

            // Set projection matrix
            SlimDX.Matrix matProj = SlimDX.Matrix.OrthoOffCenterLH(0, 1, 1, 0, 0, 1);
            device.SetTransform(TransformState.Projection, matProj);

            // Turn off D3D lighting, since we are providing our own vertex colors
            device.SetRenderState(RenderState.Lighting, false);

            //Load kernels
            CUmodule module = ctx.LoadModulePTX("kernel.ptx");

            addForces_k = new CudaKernel("addForces_k", module, ctx);
            advectVelocity_k = new CudaKernel("advectVelocity_k", module, ctx);
            diffuseProject_k = new CudaKernel("diffuseProject_k", module, ctx);
            updateVelocity_k = new CudaKernel("updateVelocity_k", module, ctx);
            advectParticles_k = new CudaKernel("advectParticles_k", module, ctx);
        }
Exemple #43
0
 public static void Initialize(CudaContext context, Device device)
 {
     _context = context;
     _device = device;
 }
 public GpuMathOperations()
 {
     this.cuBlas = new CudaBlas();
     this.cudaContext = new CudaContext();
     this.cuModule = cudaContext.LoadModulePTX(kernalFile);
     this.maxThreadPerBlockDim = (int)Math.Sqrt(this.cudaContext.GetDeviceInfo().MaxThreadsPerBlock);
 }
Exemple #45
0
        static void Main(string[] args)
        {
            int SIGNAL_SIZE = 50;
            int FILTER_KERNEL_SIZE = 11;

            Console.WriteLine("[simpleCUFFT] is starting...");

            var assembly = Assembly.GetExecutingAssembly();
            var resourceName = "simpleCUFFT.simpleCUFFTKernel.ptx";

            CudaContext ctx = new CudaContext(0);
            CudaKernel ComplexPointwiseMulAndScale;
            string[] liste = assembly.GetManifestResourceNames();
            using (Stream stream = assembly.GetManifestResourceStream(resourceName))
            {
                ComplexPointwiseMulAndScale = ctx.LoadKernelPTX(stream, "ComplexPointwiseMulAndScale");
            }

            // Allocate host memory for the signal
            cuFloatComplex[] h_signal = new cuFloatComplex[SIGNAL_SIZE]; //we use cuFloatComplex for complex multiplaction in reference host code...

            Random rand = new Random(0);
            // Initialize the memory for the signal
            for (int i = 0; i < SIGNAL_SIZE; ++i)
            {
                h_signal[i].real = (float)rand.NextDouble();
                h_signal[i].imag = 0;
            }

            // Allocate host memory for the filter
            cuFloatComplex[] h_filter_kernel = new cuFloatComplex[FILTER_KERNEL_SIZE];

            // Initialize the memory for the filter
            for (int i = 0; i < FILTER_KERNEL_SIZE; ++i)
            {
                h_filter_kernel[i].real = (float)rand.NextDouble();
                h_filter_kernel[i].imag = 0;
            }

            // Pad signal and filter kernel
            cuFloatComplex[] h_padded_signal = null;
            cuFloatComplex[] h_padded_filter_kernel = null;
            int new_size = PadData(h_signal, ref h_padded_signal, SIGNAL_SIZE,
                                   h_filter_kernel, ref h_padded_filter_kernel, FILTER_KERNEL_SIZE);
            int mem_size = (int)cuFloatComplex.SizeOf * new_size;

            // Allocate device memory for signal
            CudaDeviceVariable<cuFloatComplex> d_signal = new CudaDeviceVariable<cuFloatComplex>(new_size);
            // Copy host memory to device
            d_signal.CopyToDevice(h_padded_signal);

            // Allocate device memory for filter kernel
            CudaDeviceVariable<cuFloatComplex> d_filter_kernel = new CudaDeviceVariable<cuFloatComplex>(new_size);

            // Copy host memory to device
            d_filter_kernel.CopyToDevice(h_padded_filter_kernel);

            // CUFFT plan simple API
            CudaFFTPlan1D plan = new CudaFFTPlan1D(new_size, cufftType.C2C, 1);

            // Transform signal and kernel
            Console.WriteLine("Transforming signal cufftExecC2C");
            plan.Exec(d_signal.DevicePointer, TransformDirection.Forward);
            plan.Exec(d_filter_kernel.DevicePointer, TransformDirection.Forward);

            // Multiply the coefficients together and normalize the result
            Console.WriteLine("Launching ComplexPointwiseMulAndScale<<< >>>");
            ComplexPointwiseMulAndScale.BlockDimensions = 256;
            ComplexPointwiseMulAndScale.GridDimensions = 32;
            ComplexPointwiseMulAndScale.Run(d_signal.DevicePointer, d_filter_kernel.DevicePointer, new_size, 1.0f / new_size);

            // Transform signal back
            Console.WriteLine("Transforming signal back cufftExecC2C");
            plan.Exec(d_signal.DevicePointer, TransformDirection.Inverse);

            // Copy device memory to host
            cuFloatComplex[] h_convolved_signal = d_signal;

            // Allocate host memory for the convolution result
            cuFloatComplex[] h_convolved_signal_ref = new cuFloatComplex[SIGNAL_SIZE];

            // Convolve on the host
            Convolve(h_signal, SIGNAL_SIZE,
                     h_filter_kernel, FILTER_KERNEL_SIZE,
                     h_convolved_signal_ref);

            // check result
            bool bTestResult = sdkCompareL2fe(h_convolved_signal_ref, h_convolved_signal, 1e-5f);

            //Destroy CUFFT context
            plan.Dispose();

            // cleanup memory
            d_filter_kernel.Dispose();
            d_signal.Dispose();
            ctx.Dispose();

            if (bTestResult)
            {
                Console.WriteLine("Test Passed");
            }
            else
            {
                Console.WriteLine("Test Failed");
            }
        }
Exemple #46
0
        static void Main(string[] args)
        {
            var assembly = Assembly.GetExecutingAssembly();
            var resourceName = "simpleOccupancy.simpleOccupancy.ptx";

            ctx = new CudaContext(0);
            string[] liste = assembly.GetManifestResourceNames();
            using (Stream stream = assembly.GetManifestResourceStream(resourceName))
            {
                kernel = ctx.LoadKernelPTX(stream, "square");
            }

            Console.WriteLine("starting Simple Occupancy");
            Console.WriteLine();

            Console.WriteLine("[ Manual configuration with {0} threads per block ]", manualBlockSize);

            int status = test(false);
            if (status != 0)
            {
                Console.WriteLine("Test failed");
                return;
            }

            Console.WriteLine();

            Console.WriteLine("[ Automatic, occupancy-based configuration ]");
            status = test(true);
            if (status != 0)
            {
                Console.WriteLine("Test failed");
                return;
            }

            Console.WriteLine();
            Console.WriteLine("Test PASSED");
        }
        protected void InitContext()
        {
            var size = ParticlesCount * DimensionsCount;

            var threadsNum = 32;
            var blocksNum = ParticlesCount / threadsNum;
            Ctx = new CudaContext(0);

            UpdateVelocity = Ctx.LoadKernel("update_velocity_kernel.ptx", "updateVelocityKernel");
            UpdateVelocity.GridDimensions = blocksNum;
            UpdateVelocity.BlockDimensions = threadsNum;

            Transpose = Ctx.LoadKernel(KernelFile, "transposeKernel");
            Transpose.GridDimensions = blocksNum;
            Transpose.BlockDimensions = threadsNum;

            HostPositions = Random.RandomVector(size, -5.0, 5.0);
            HostVelocities = Random.RandomVector(size, -2.0, 2.0);
            HostPersonalBests = (double[]) HostPositions.Clone();
            HostPersonalBestValues = Enumerable.Repeat(double.MaxValue,ParticlesCount).ToArray();

            HostNeighbors = new int[ParticlesCount * 2];

            for (var i = 0; i < ParticlesCount*2; i += 2)
            {
                int left, right;

                if (i == 0)
                    left = ParticlesCount - 1;
                else
                    left = i - 1;

                if (i == ParticlesCount - 1)
                    right = 0;
                else
                    right = i + 1;

                HostNeighbors[i] = left;
                HostNeighbors[i + 1] = right;
            }

            DevicePositions = HostPositions;
            DeviceVelocities = HostVelocities;
            DevicePersonalBests = HostPersonalBests;
            DevicePersonalBestValues = HostPersonalBestValues;
            DeviceNeighbors = HostNeighbors;

            Init();
        }
Exemple #48
0
 protected void SetupCuda()
 {
     // Try to bind a CUDA context to the graphics card that WPF is working with.
     Adapter d3dAdapter = Device.Factory.GetAdapter(0);
     CUdevice[] cudaDevices = null;
     try
     {
         // Build a CUDA context from the first adapter in the used D3D11 device.
         cudaDevices = CudaContext.GetDirectXDevices(Device.ComPointer, CUd3dXDeviceList.All, CudaContext.DirectXVersion.D3D11);
         Debug.Assert(cudaDevices.Length > 0);
         Console.WriteLine("> Display Device #" + d3dAdapter
             + ": \"" + d3dAdapter.Description + "\" supports Direct3D11 and CUDA.\n");
     }
     catch (CudaException)
     {
         // No Cuda device found for this Direct3D11 device.
         Console.Write("> Display Device #" + d3dAdapter
             + ": \"" + d3dAdapter.Description + "\" supports Direct3D11 but not CUDA.\n");
     }
     ContextCuda = new CudaContext(cudaDevices[0], Device.ComPointer, CUCtxFlags.BlockingSync, CudaContext.DirectXVersion.D3D11);
     var info = ContextCuda.GetDeviceInfo();
     Console.WriteLine("Max. Nr. Threads: " + info.MaxBlockDim + ", Total: " + info.MaxThreadsPerBlock + "\nMax. Nr. Blocks: " + info.MaxGridDim + "\nMax. Bytes Shared Per Block: " + info.SharedMemoryPerBlock);
 }
Exemple #49
0
        static void Main(string[] args)
        {
            string filename = "vectorAdd_kernel.cu"; //we assume the file is in the same folder...
            string fileToCompile = File.ReadAllText(filename);

            CudaRuntimeCompiler rtc = new CudaRuntimeCompiler(fileToCompile, "vectorAdd_kernel");

            rtc.Compile(args);

            string log = rtc.GetLogAsString();

            Console.WriteLine(log);

            byte[] ptx = rtc.GetPTX();

            rtc.Dispose();

            CudaContext ctx = new CudaContext(0);

            CudaKernel vectorAdd = ctx.LoadKernelPTX(ptx, "vectorAdd");

            // Print the vector length to be used, and compute its size
            int numElements = 50000;
            SizeT size = numElements * sizeof(float);
            Console.WriteLine("[Vector addition of {0} elements]", numElements);

            // Allocate the host input vector A
            float[] h_A = new float[numElements];
            // Allocate the host input vector B
            float[] h_B = new float[numElements];
            // Allocate the host output vector C
            float[] h_C = new float[numElements];

            Random rand = new Random(0);

            // Initialize the host input vectors
            for (int i = 0; i < numElements; ++i)
            {
                h_A[i] = (float)rand.NextDouble();
                h_B[i] = (float)rand.NextDouble();
            }

            Console.WriteLine("Allocate and copy input data from the host memory to the CUDA device\n");
            // Allocate the device input vector A and copy to device
            CudaDeviceVariable<float> d_A = h_A;

            // Allocate the device input vector B and copy to device
            CudaDeviceVariable<float> d_B = h_B;

            // Allocate the device output vector C
            CudaDeviceVariable<float> d_C = new CudaDeviceVariable<float>(numElements);

            // Launch the Vector Add CUDA Kernel
            int threadsPerBlock = 256;
            int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
            Console.WriteLine("CUDA kernel launch with {0} blocks of {1} threads\n", blocksPerGrid, threadsPerBlock);
            vectorAdd.BlockDimensions = new dim3(threadsPerBlock,1, 1);
            vectorAdd.GridDimensions = new dim3(blocksPerGrid, 1, 1);

            vectorAdd.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, numElements);

            // Copy the device result vector in device memory to the host result vector
            // in host memory.
            Console.WriteLine("Copy output data from the CUDA device to the host memory\n");
            d_C.CopyToHost(h_C);

            // Verify that the result vector is correct
            for (int i = 0; i < numElements; ++i)
            {
                if (Math.Abs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
                {
                    Console.WriteLine("Result verification failed at element {0}!\n", i);
                    return;
                }
            }

            Console.WriteLine("Test PASSED\n");

            // Free device global memory
            d_A.Dispose();
            d_B.Dispose();
            d_C.Dispose();

            ctx.Dispose();
            Console.WriteLine("Done\n");
        }
Exemple #50
0
        static void Main(string[] args)
        {
            // NOTE: You need to change this location to match your own machine.
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine("NOTE: You must change the kernel location before running this project so it matches your own environment.");
            Console.ResetColor();
            System.Threading.Thread.Sleep(500);

            string path = @"X:\MachineLearning\CUDAGraph-2\CUDAGraph_Kernel\Debug\kernel.cu.ptx";
            CudaContext ctx = new CudaContext();
            CUmodule module = ctx.LoadModule(path);
            kernel = new CudaKernel("kernel", module, ctx);

            // This tells the kernel to allocate a lot of threads for the Gpu.
            kernel.BlockDimensions = THREADS_PER_BLOCK;
            kernel.GridDimensions = VECTOR_SIZE / THREADS_PER_BLOCK + 1; ;

            // Now let's load the kernel!
            // Create the topology.
            int[] topology = new int[] { 1, 200, 200, 100, 1 };

            int height = topology.Length;
            int width = 0;

            for (int i = 0; i < topology.Length; i++)
                if (width < topology[i]) width = topology[i];

            // Launch!
            float[] res = new float[height * width];
            for (int i = 0; i < 10; i++)
            {
                float[] matrix = new float[height * width];
                float[] weights = new float[height * width];
                Random rand = new Random(424242);
                for (int y = 0; y < height; y++)
                {
                    for (int x = 0; x < width; x++)
                    {
                        matrix[y * width + x] = (y == 0 && x < topology[y]) ? 1.0f : 0;
                        weights[y * width + x] = (x < topology[y]) ? (float)(rand.NextDouble() - rand.NextDouble()) : 0;
                    }
                }

                // Load the kernel with some variables.
                CudaDeviceVariable<int> cuda_topology = topology;
                CudaDeviceVariable<float> cuda_membank = matrix;
                CudaDeviceVariable<float> cuda_weights = weights;

                Stopwatch sw = new Stopwatch();
                sw.Start();
                kernel.Run(cuda_topology.DevicePointer, cuda_membank.DevicePointer, cuda_weights.DevicePointer, height, width);
                cuda_membank.CopyToHost(res);
                sw.Stop();

                Console.ForegroundColor = ConsoleColor.Green;
                Console.WriteLine("{0} ticks to compute -> {1}", sw.ElapsedTicks, res[0]);
                Console.ResetColor();

            }

            Console.ReadKey();
        }
Exemple #51
0
 public GpuContext()
 {
     this.ctx = new CudaContext();
     
 }
Exemple #52
0
 static void InitKernels()
 {
     CudaContext cntxt = new CudaContext();
     CUmodule cumodule = cntxt.LoadModule(@"C:\Users\Niels\Documents\uni ting\P10\P10\programs\small programs\CUDA 1D MA in C Sharp\CUDA 1D MA in C Sharp\Debug\kernel.ptx");
     addWithCuda = new CudaKernel("_Z6kerneliiPi", cumodule, cntxt);
 }
Exemple #53
0
 public NVContext()
 {
     Context = new CudaContext(true);
     
 }
        public void Compile()
        {
            using (var ctx = new CudaContext())
            {
                // with verbaim string @, we only have to double up double quotes: no other escaping
                string source = @"
                extern ""C"" __global__
                void saxpy(float a, float *x, float *y, float *out, size_t n)
                {
                    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
                    if (tid < n)
                    {
                        out[tid] = a * x[tid] + y[tid];
                    }
                }
                ";

                source += Environment.NewLine;

                var name = "Test";
                var headers = new string[0];
                var includeNames = new string[0];

                var compiler = new CudaRuntimeCompiler(source, name, headers, includeNames);

                //var compiler2 = new CudaRuntimeCompiler(source, name, headers, includeNames);
                // --ptxas-options=-v -keep
                compiler.Compile(new string[] { "-G" });

                //var ptxString = compiler.GetPTXAsString(); // for debugging

                var ptx = compiler.GetPTX();

                //compiler2.Compile(new string[] { });

                var kernel = ctx.LoadKernelPTX(ptx, "kernelName");

                //One kernel per cu file:
                //CudaKernel kernel = ctx.LoadKernel(@"path\to\kernel.ptx", "kernelname");
                kernel.GridDimensions = new dim3(1, 1, 1);
                kernel.BlockDimensions = new dim3(16, 16);

                //kernel.Run()

                var a = new CudaDeviceVariable<double>(100);
                //ManagedCuda.NPP.NPPsExtensions.NPPsExtensionMethods.Sqr()

                //Multiple kernels per cu file:
                CUmodule cumodule = ctx.LoadModule(@"path\to\kernel.ptx");
                CudaKernel kernel1 = new CudaKernel("kernel1", cumodule, ctx)
                {
                    GridDimensions = new dim3(1, 1, 1),
                    BlockDimensions = new dim3(16, 16),
                };
                CudaKernel kernel2 = new CudaKernel("kernel2", cumodule, ctx)
                {
                    GridDimensions = new dim3(1, 1, 1),
                    BlockDimensions = new dim3(16, 16),
                };

            }
        }
Exemple #55
0
        static void Main(string[] args)
        {
            ShrQATest.shrQAStart(args);

            Console.WriteLine("Vector Addition");
            int N = 50000;

            //Init Cuda context
            ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId());

            //Load Kernel image from resources
            string resName;
            if (IntPtr.Size == 8)
                resName = "vectorAdd_x64.ptx";
            else
                resName = "vectorAdd.ptx";

            string resNamespace = "vectorAdd";
            string resource = resNamespace + "." + resName;
            Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);
            if (stream == null) throw new ArgumentException("Kernel not found in resources.");

            CudaKernel vectorAddKernel = ctx.LoadKernelPTX(stream, "VecAdd");

            // Allocate input vectors h_A and h_B in host memory
            h_A = new float[N];
            h_B = new float[N];

            // Initialize input vectors
            RandomInit(h_A, N);
            RandomInit(h_B, N);

            // Allocate vectors in device memory and copy vectors from host memory to device memory
            // Notice the new syntax with implicit conversion operators: Allocation of device memory and data copy is one operation.
            d_A = h_A;
            d_B = h_B;
            d_C = new CudaDeviceVariable<float>(N);

            // Invoke kernel
            int threadsPerBlock = 256;
            vectorAddKernel.BlockDimensions = threadsPerBlock;
            vectorAddKernel.GridDimensions = (N + threadsPerBlock - 1) / threadsPerBlock;

            vectorAddKernel.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, N);

            // Copy result from device memory to host memory
            // h_C contains the result in host memory
            h_C = d_C;

            // Verify result
            int i;
            for (i = 0; i < N; ++i)
            {
                float sum = h_A[i] + h_B[i];
                if (Math.Abs(h_C[i] - sum) > 1e-5)
                    break;
            }

            CleanupResources();

            ShrQATest.shrQAFinishExit(args, i == N ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED);
        }
Exemple #56
0
        private bool InitializeD3D()
        {
            HwndSource hwnd = new HwndSource(0, 0, 0, 0, 0, "null", IntPtr.Zero);
            // Create the D3D object.
            d3d = new Direct3DEx();

            PresentParameters pp = new PresentParameters();
            pp.BackBufferWidth = 512;
            pp.BackBufferHeight = 512;
            pp.BackBufferFormat = Format.Unknown;
            pp.BackBufferCount = 0;
            pp.Multisample = MultisampleType.None;
            pp.MultisampleQuality = 0;
            pp.SwapEffect = SwapEffect.Discard;
            pp.DeviceWindowHandle = (IntPtr)0;
            pp.Windowed = true;
            pp.EnableAutoDepthStencil = false;
            pp.AutoDepthStencilFormat = Format.Unknown;
            pp.PresentationInterval = PresentInterval.Default;

            bDeviceFound = false;
            CUdevice[] cudaDevices = null;
            for (g_iAdapter = 0; g_iAdapter < d3d.AdapterCount; g_iAdapter++)
            {
                device = new DeviceEx(d3d, d3d.Adapters[g_iAdapter].Adapter, DeviceType.Hardware, hwnd.Handle, CreateFlags.HardwareVertexProcessing | CreateFlags.Multithreaded, pp);
                try
                {
                    cudaDevices = CudaContext.GetDirectXDevices(device.ComPointer, CUd3dXDeviceList.All, CudaContext.DirectXVersion.D3D9);
                    bDeviceFound = cudaDevices.Length > 0;
                    infoLog.AppendText("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter
                        + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 and CUDA.\n");
                    break;
                }
                catch (CudaException)
                {
                    //No Cuda device found for this Direct3D9 device
                    infoLog.AppendText("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter
                        + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 but not CUDA.\n");
                }
            }

            // we check to make sure we have found a cuda-compatible D3D device to work on
            if (!bDeviceFound)
            {
                infoLog.AppendText("No CUDA-compatible Direct3D9 device available");
                if (device != null)
                    device.Dispose();
                return false;
            }

            ctx = new CudaContext(cudaDevices[0], device.ComPointer, CUCtxFlags.BlockingSync, CudaContext.DirectXVersion.D3D9);
            deviceName.Text = "Device name: " + ctx.GetDeviceName();

            // Set projection matrix
            SlimDX.Matrix matProj = SlimDX.Matrix.OrthoOffCenterLH(0, 1, 1, 0, 0, 1);
            device.SetTransform(TransformState.Projection, matProj);

            // Turn off D3D lighting, since we are providing our own vertex colors
            device.SetRenderState(RenderState.Lighting, false);

            //Load kernels
            CUmodule module = ctx.LoadModulePTX("kernel.ptx");

            addForces_k = new CudaKernel("addForces_k", module, ctx);
            advectVelocity_k = new CudaKernel("advectVelocity_k", module, ctx);
            diffuseProject_k = new CudaKernel("diffuseProject_k", module, ctx);
            updateVelocity_k = new CudaKernel("updateVelocity_k", module, ctx);
            advectParticles_k = new CudaKernel("advectParticles_k", module, ctx);

            d3dimage.Lock();
            Surface surf = device.GetBackBuffer(0, 0);
            d3dimage.SetBackBuffer(D3DResourceType.IDirect3DSurface9, surf.ComPointer);
            d3dimage.Unlock();
            surf.Dispose();

            //Setup the "real" frame rate counter.
            //The cuda counter only measures cuda runtime, not the overhead to actually
            //show the result via DirectX and WPF.
            realLastTick = Environment.TickCount;
            return true;
        }
Exemple #57
0
        // Initialization code to find the best CUDA Device
        static int findCudaDevice(string[] args)
        {
            int devID = 0;
            // If the command-line has a device number specified, use it
            bool found = false;
            foreach (var item in args)
            {
                if (item.Contains("device="))
                {
                    found = true;
                    if (!int.TryParse(item, out devID))
                    {
                        Console.WriteLine("Invalid command line parameters");
                        Environment.Exit(-1);
                    }
                    if (devID < 0)
                    {
                        Console.WriteLine("Invalid command line parameters\n");
                        Environment.Exit(-1);
                    }
                    else
                    {
                        devID = gpuDeviceInit(devID);
                        if (devID < 0)
                        {
                            Console.WriteLine("exiting...\n");
                            ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED);
                            Environment.Exit(-1);
                        }
                    }
                }
            }

            if (!found)
            {
                // Otherwise pick the device with highest Gflops/s
                devID = CudaContext.GetMaxGflopsDeviceId();
                ctx = new CudaContext(devID, CUCtxFlags.SchedAuto);
                Console.Write("> Using CUDA device [{0}]: {1}\n", devID, ctx.GetDeviceName());
            }
            return devID;
        }