public void GMMUpdate(int gmm_N, CudaDeviceVariable <float> gmm, CudaDeviceVariable <byte> scratch_mem, int gmm_pitch, CudaPitchedDeviceVariable <uchar4> image, CudaPitchedDeviceVariable <byte> alpha, int width, int height)
        {
            dim3 grid  = new dim3((width + 31) / 32, (height + 31) / 32, 1);
            dim3 block = new dim3(32, 4, 1);

            GMMReductionKernelCreateGmmFlags.BlockDimensions   = block;
            GMMReductionKernelCreateGmmFlags.GridDimensions    = grid;
            GMMReductionKernelNoCreateGmmFlags.BlockDimensions = block;
            GMMReductionKernelNoCreateGmmFlags.GridDimensions  = grid;

            GMMReductionKernelCreateGmmFlags.Run((int)0, scratch_mem.DevicePointer + (grid.x * grid.y * 4), (int)gmm_pitch / 4, image.DevicePointer, (int)image.Pitch / 4, alpha.DevicePointer, (int)alpha.Pitch, width, height, scratch_mem.DevicePointer);

            //GMMReductionKernel<4, true><<<grid, block>>>(0, &scratch_mem[grid.x * grid.y], gmm_pitch/4, image, image_pitch/4, alpha, alpha_pitch, width, height, (unsigned int*) scratch_mem);
            for (int i = 1; i < gmm_N; ++i)
            {
                GMMReductionKernelNoCreateGmmFlags.Run(i, scratch_mem.DevicePointer + (grid.x * grid.y * 4), (int)gmm_pitch / 4, image.DevicePointer, (int)image.Pitch / 4, alpha.DevicePointer, (int)alpha.Pitch, width, height, scratch_mem.DevicePointer);

                //GMMReductionKernel<4, false><<<grid, block>>>(i, &scratch_mem[grid.x * grid.y], gmm_pitch/4, image, image_pitch/4, alpha, alpha_pitch, width, height, (unsigned int*) scratch_mem);
            }
            GMMFinalizeKernelInvertSigma.BlockDimensions = new dim3(32 * 4, 1, 1);
            GMMFinalizeKernelInvertSigma.GridDimensions  = new dim3(gmm_N, 1, 1);
            GMMFinalizeKernelInvertSigma.Run(gmm.DevicePointer, scratch_mem.DevicePointer + (grid.x * grid.y * 4), (int)gmm_pitch / 4, grid.x * grid.y);
            //GMMFinalizeKernel<4, true><<<gmm_N, 32*4>>>(gmm, &scratch_mem[grid.x * grid.y], gmm_pitch/4, grid.x * grid.y);

            block.x = 32; block.y = 2;
            GMMcommonTerm.BlockDimensions = block;
            GMMcommonTerm.GridDimensions  = new dim3(1, 1, 1);
            GMMcommonTerm.Run(gmm_N / 2, gmm.DevicePointer, (int)gmm_pitch / 4);
            //GMMcommonTerm<<<1, block>>>(gmm_N / 2, gmm, gmm_pitch/4);
        }
Beispiel #2
0
        static double[] SumMatrixManagedCuda(double[][,] matrix)
        {
            int Z = matrix.Length;
            int Y = matrix[0].GetLength(0);
            int X = matrix[0].GetLength(1);

            var result = new double[Y * X];
            var lm     = ToLinearArray(matrix);
            int N      = lm.Length;

            matrixSumCude.SetComputeSize((uint)X, (uint)Y);
            //matrixSumCude.BlockDimensions = 128;
            //matrixSumCude.GridDimensions = (N + 127) / 128;

            var da = cntxt.AllocateMemory(N * sizeof(double));
            var db = cntxt.AllocateMemory(result.Length * sizeof(double));

            cntxt.CopyToDevice(da, lm);
            cntxt.CopyToDevice(db, result);

            //CudaDeviceVariable<int> dA = a;
            //CudaDeviceVariable<int> dB = b;
            //CudaDeviceVariable<int> dC = new CudaDeviceVariable<int>(N);

            // Invoke kernel
            //kernel.Run(dA.DevicePointer, dC.DevicePointer, dimX, dimY, dimZ);
            matrixSumCude.Run(db, da, X, Y, Z);

            cntxt.CopyToHost <double>(result, db);

            return(result);
        }
        public static void update_particles(CudaDeviceVariable <float> d_vx, CudaDeviceVariable <float> d_vy, CudaDeviceVariable <float> d_vz, int cnt)
        {
            float d_dt = 0;

            _gpuVelocity.BlockDimensions = new dim3(1, 1, 1);
            _gpuVelocity.GridDimensions  = new dim3(cnt, 1, 1);

            _gpuVelocity.Run(x.DevicePointer, y.DevicePointer, z.DevicePointer,
                             d_k1x.DevicePointer, d_k1y.DevicePointer, d_k1z.DevicePointer,
                             d_dt,
                             d_vx.DevicePointer, d_vy.DevicePointer, d_vz.DevicePointer,
                             d_k1x.DevicePointer, d_k1y.DevicePointer, d_k1z.DevicePointer,
                             torus_size.DevicePointer, torus_res.DevicePointer, torus_d.DevicePointer);

            d_dt = _dt * (float)0.5;
            _gpuVelocity.Run(x.DevicePointer, y.DevicePointer, z.DevicePointer,
                             d_k1x.DevicePointer, d_k1y.DevicePointer, d_k1z.DevicePointer,
                             d_dt,
                             d_vx.DevicePointer, d_vy.DevicePointer, d_vz.DevicePointer,
                             d_k2x.DevicePointer, d_k2y.DevicePointer, d_k2z.DevicePointer,
                             torus_size.DevicePointer, torus_res.DevicePointer, torus_d.DevicePointer);

            _gpuVelocity.Run(x.DevicePointer, y.DevicePointer, z.DevicePointer,
                             d_k2x.DevicePointer, d_k2y.DevicePointer, d_k2z.DevicePointer,
                             d_dt,
                             d_vx.DevicePointer, d_vy.DevicePointer, d_vz.DevicePointer,
                             d_k3x.DevicePointer, d_k3y.DevicePointer, d_k3z.DevicePointer,
                             torus_size.DevicePointer, torus_res.DevicePointer, torus_d.DevicePointer);

            d_dt = _dt;
            _gpuVelocity.Run(x.DevicePointer, y.DevicePointer, z.DevicePointer,
                             d_k3x.DevicePointer, d_k3y.DevicePointer, d_k3z.DevicePointer,
                             d_dt,
                             d_vx.DevicePointer, d_vy.DevicePointer, d_vz.DevicePointer,
                             d_k4x.DevicePointer, d_k4y.DevicePointer, d_k4z.DevicePointer,
                             torus_size.DevicePointer, torus_res.DevicePointer, torus_d.DevicePointer);


            _gpuUpdate.BlockDimensions = new dim3(1, 1, 1);
            _gpuUpdate.GridDimensions  = new dim3(cnt * 3, 1, 1);

            _gpuUpdate.Run(x.DevicePointer, y.DevicePointer, z.DevicePointer,
                           d_k1x.DevicePointer, d_k1y.DevicePointer, d_k1z.DevicePointer,
                           d_k2x.DevicePointer, d_k2y.DevicePointer, d_k2z.DevicePointer,
                           d_k3x.DevicePointer, d_k3y.DevicePointer, d_k3z.DevicePointer,
                           d_k4x.DevicePointer, d_k4y.DevicePointer, d_k4z.DevicePointer,
                           d_dt);
        }
Beispiel #4
0
        public void ApplyMatte(int mode, CudaPitchedDeviceVariable <uchar4> result, CudaPitchedDeviceVariable <uchar4> image, CudaPitchedDeviceVariable <byte> matte, int width, int height)
        {
            dim3 block = new dim3(32, 8, 1);
            dim3 grid  = new dim3((width + 31) / 32, (height + 31) / 32, 1);

            switch (mode)
            {
            case 0:
                ApplyMatteKernelMode0.BlockDimensions = block;
                ApplyMatteKernelMode0.GridDimensions  = grid;
                ApplyMatteKernelMode0.Run(result.DevicePointer, (int)result.Pitch / 4, image.DevicePointer, (int)image.Pitch / 4, matte.DevicePointer, (int)matte.Pitch, width, height);

                //ApplyMatteKernel<0><<<grid, block>>>(result, result_pitch/4, image, image_pitch/4, matte, matte_pitch, width, height);
                break;

            case 1:
                ApplyMatteKernelMode1.BlockDimensions = block;
                ApplyMatteKernelMode1.GridDimensions  = grid;
                ApplyMatteKernelMode1.Run(result.DevicePointer, (int)result.Pitch / 4, image.DevicePointer, (int)image.Pitch / 4, matte.DevicePointer, (int)matte.Pitch, width, height);

                //ApplyMatteKernel<1><<<grid, block>>>(result, result_pitch/4, image, image_pitch/4, matte, matte_pitch, width, height);
                break;

            case 2:
                ApplyMatteKernelMode2.BlockDimensions = block;
                ApplyMatteKernelMode2.GridDimensions  = grid;
                ApplyMatteKernelMode2.Run(result.DevicePointer, (int)result.Pitch / 4, image.DevicePointer, (int)image.Pitch / 4, matte.DevicePointer, (int)matte.Pitch, width, height);

                //ApplyMatteKernel<2><<<grid, block>>>(result, result_pitch/4, image, image_pitch/4, matte, matte_pitch, width, height);
                break;
            }
        }
        //Test CUDA kernel for complex multiplication
        public void test(int N)
        {
            CudaContext ctx    = new CudaContext();
            CudaKernel  kernel = ctx.LoadKernel("kernel.ptx", "ComplexMultCUDA");

            kernel.GridDimensions  = N;
            kernel.BlockDimensions = 1;
            double2[] a = new double2[N];
            double2[] b = new double2[N];
            double2[] c = new double2[N];
            for (int i = 0; i < N; i++)
            {
                a[i].x = 1;
                a[i].y = 3;
                b[i].x = 2;
                b[i].y = 2;
            }

            CudaDeviceVariable <double2> d_a = null;
            CudaDeviceVariable <double2> d_b = null;

            try
            {
                d_a = a;
                d_b = b;
            }
            catch (Exception e)
            {
                Console.WriteLine("{0} Exception caught.", e);
                return;
            }
            kernel.Run(d_a.DevicePointer, d_b.DevicePointer, N);
            c = d_b;
            Console.WriteLine("C.last()={0}+i{1}", c.Last().x, c.Last().y);
        }
    public void CalculateFitness(CudaDeviceVariable <byte> population, CudaDeviceVariable <float> fitness)
    {
        Profiler.Start("Calculate accuracy");
        var deviceAccuracy = accuracyCalc.CalculateAccuracy(population);

        Profiler.Stop("Calculate accuracy");
        float[] asdf = deviceAccuracy;

        Profiler.Start("Calculate vectorSizes");
        countVectorsKernel.Calculate(population, vectorSizes);
        Profiler.Stop("Calculate vectorSizes");

        int[] v = vectorSizes;
        Profiler.Start("Avrage VectorSizes");
        float avrageVectorSize = Thrust.Avrage(vectorSizes);

        Profiler.Stop("Avrage VectorSizes");

        Profiler.Start("Avrage accuracy");
        float avrageAccuracy = Thrust.Avrage(deviceAccuracy);

        Profiler.Stop("Avrage accuracy");


        Profiler.Start("fittness kernel");
        fitnessKernel.Run(
            deviceAccuracy.DevicePointer,
            avrageAccuracy,
            vectorSizes.DevicePointer,
            avrageVectorSize,
            fitness.DevicePointer
            );
        Profiler.Stop("fittness kernel");
    }
        /// <summary>Runs the kernel in synchronous mode.</summary>
        /// <param name="args">MyMemoryBlock arguments are automatically converted to device pointers.</param>
        public void RunSync(params object[] args)
        {
            CheckExecutionSetup();
            ConvertMemoryBlocksToDevicePtrs(args);

            m_kernel.Run(args);
        }
Beispiel #8
0
    public void CreateNewPopulation()
    {
        var tmp = populationGens;

        populationGens  = populationGens2;
        populationGens2 = tmp;

        // var b = new FlattArray<byte>((byte[])populationGens, genLength).To2d();
        Profiler.Start("Calculate fitness");
        fitnessCalc.CalculateFitness(populationGens, deviceFitnes);
        Profiler.Stop("Calculate fitness");

        Profiler.Start("sqauance");
        Thrust.seaquance(fitnessIndeces);
        Profiler.Stop("sqauance");

        Profiler.Start("sorting fitness");
        Thrust.sort_by_key(deviceFitnes, fitnessIndeces);
        Profiler.Stop("sorting fitness");

        Profiler.Start("performing genetics");
        // var c = new FlattArray<byte>((byte[])populationGens, genLength).To2d();

        performGeneticAlgorythm.Run(
            populationGens.DevicePointer,
            populationGens2.DevicePointer,
            deviceFitnes.DevicePointer,
            fitnessIndeces.DevicePointer
            );
        //var a = new FlattArray<byte>((byte[])populationGens, genLength).To2d() ;

        Profiler.Stop("performing genetics");
    }
Beispiel #9
0
    public void CUDA_AddFloatArrays()
    {
        //Load Kernel image from resources
        Stream stream = new StreamReader(resName).BaseStream;

        if (stream == null)
        {
            throw new ArgumentException("Kernel not found in resources.");
        }

        vectorAddKernel = ctx.LoadKernelPTX(stream, "VecAdd");

        var threadsPerBlock = 1024;

        vectorAddKernel.BlockDimensions = threadsPerBlock;
        vectorAddKernel.GridDimensions  = (Count + threadsPerBlock - 1) / threadsPerBlock;

        CudaStopWatch w = new CudaStopWatch();

        w.Start();
        vectorAddKernel.Run(d_A.DevicePointer, d_B.DevicePointer, C.DevicePointer, Count);
        w.Stop();

        Debug.Log(w.GetElapsedTime() / 1000.0f);
        Debug.Log($"{h_A[0]} + {h_B[0]} = {C[0]}");
        Debug.Log($"{h_A[Count-1]} + {h_B[Count-1]} = {C[Count-1]}");

        // Copy result from device memory to host memory
        // h_C contains the result in host memory
        // h_C = d_C;
    }
Beispiel #10
0
        public CudaDeviceVariable <T> PrefixSumArray <T>(CudaDeviceVariable <T> input, int n) where T : struct
        {
            int arrayLength = n;
            int batchSize   = n / arrayLength;

            if (!IsPowerOfTwo(n))
            {
                throw new Exception("Input array length is not power of two.");
            }

            CudaDeviceVariable <T> output = new CudaDeviceVariable <T>(n);
            CudaDeviceVariable <T> buffer = new CudaDeviceVariable <T>(n);

            kernelScanExclusiveShared.BlockDimensions = threadBlockSize;
            kernelScanExclusiveShared.GridDimensions  = (batchSize * arrayLength) / (4 * threadBlockSize);
            kernelScanExclusiveShared.Run(output.DevicePointer, input.DevicePointer, 4 * threadBlockSize);

            kernelScanExclusiveShared2.BlockDimensions = threadBlockSize;
            kernelScanExclusiveShared2.GridDimensions  = (int)Math.Ceiling(((batchSize * arrayLength) / (4 * threadBlockSize)) / (double)threadBlockSize);
            kernelScanExclusiveShared2.Run(buffer.DevicePointer, output.DevicePointer, input.DevicePointer, (batchSize * arrayLength) / (4 * threadBlockSize), arrayLength / (4 * threadBlockSize));

            kernelUniformUpdate.BlockDimensions = threadBlockSize;
            kernelUniformUpdate.GridDimensions  = (batchSize * arrayLength) / (4 * threadBlockSize);
            kernelUniformUpdate.Run(output.DevicePointer, buffer.DevicePointer);

            buffer.Dispose();

            return(output);
        }
Beispiel #11
0
        // Testing managed CUDA call
        private static void RunCudaWithAKernel()
        {
            // C# Cuda code to call kernel

            int         N            = 50000;
            int         deviceID     = 0;
            CudaContext ctx          = new CudaContext(deviceID);
            CudaKernel  kernel       = ctx.LoadKernel("kernel_x64.ptx", "VecAdd");
            int         numOfThreads = 256;

            kernel.GridDimensions  = (N + numOfThreads - 1) / numOfThreads;
            kernel.BlockDimensions = numOfThreads;

            // allocate memory in host (not gpu)
            var h_A = InitWithData(N, numOfThreads * 4);
            var h_B = InitWithData(N, numOfThreads);

            // Allocate vectors in device memory and copy from host to device.
            CudaDeviceVariable <float> d_A = h_A;
            CudaDeviceVariable <float> d_B = h_B;
            CudaDeviceVariable <float> d_C = new CudaDeviceVariable <float>(N);

            //Invoke kernel
            kernel.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, N);

            Console.WriteLine("kernel has runeth");
            //Copy from memory of device to host.
            float[] h_C = d_C;
        }
Beispiel #12
0
        public override void MatrixBellmanErrorAndDerivative(Matrix predictedQValues, Matrix maxQHatValues, Matrix chosenActionIndices, Matrix currentRewards, Matrix error, Matrix errorDerivative,
                                                             float discount, Matrix isLastEpisode, bool copyInputsFromCpuToGpu = false, bool copyOutputsFromGpuToCpu = false)
        {
            this.VerifyDimentionalityOfMatrices(predictedQValues, errorDerivative);
            this.VerifyColumnWithRowOfMatrices(predictedQValues, maxQHatValues);
            this.VerifyDimentionalityOfMatrices(maxQHatValues, chosenActionIndices, currentRewards);
            this.VerifyDimentionalityOfMatrices(currentRewards, error, isLastEpisode);

            if (copyInputsFromCpuToGpu)
            {
                predictedQValues.CopyToCuda();
                maxQHatValues.CopyToCuda();
            }

            CudaKernel kernel = InitializeGridsAndThreads("_Z26matrixBellmanErrorAndDerivPfS_S_S_S_S_fS_ii", errorDerivative);

            kernel.Run(predictedQValues.DeviceData.DevicePointer, maxQHatValues.DeviceData.DevicePointer, chosenActionIndices.DeviceData.DevicePointer, currentRewards.DeviceData.DevicePointer, error.DeviceData.DevicePointer,
                       errorDerivative.DeviceData.DevicePointer, discount, isLastEpisode.DeviceData.DevicePointer, errorDerivative.Row, errorDerivative.Column);

            if (copyOutputsFromGpuToCpu)
            {
                error.CopyFromCuda();
                errorDerivative.CopyFromCuda();
            }
        }
Beispiel #13
0
        public override void DqnStanfordEvaluation(Matrix predictedActionIndices, Matrix chosenActionIndices, Matrix currentRewards, Matrix matchPredictRewards, Matrix nonMatchPredictRewards,
                                                   bool copyInputsFromCpuToGpu = false, bool copyOutputsFromGpuToCpu = false)
        {
            this.VerifyDimentionalityOfMatrices(predictedActionIndices, chosenActionIndices, currentRewards);
            this.VerifyDimentionalityOfMatrices(currentRewards, matchPredictRewards, nonMatchPredictRewards);

            if (copyInputsFromCpuToGpu)
            {
                predictedActionIndices.CopyToCuda();
                chosenActionIndices.CopyToCuda();
                currentRewards.CopyToCuda();
            }

            CudaKernel kernel = InitializeGridsAndThreads("_Z21DqnStanfordEvaluationPfS_S_S_S_i", matchPredictRewards);

            kernel.Run(predictedActionIndices.DeviceData.DevicePointer, chosenActionIndices.DeviceData.DevicePointer, currentRewards.DeviceData.DevicePointer,
                       matchPredictRewards.DeviceData.DevicePointer, nonMatchPredictRewards.DeviceData.DevicePointer, matchPredictRewards.Row);


            if (copyOutputsFromGpuToCpu)
            {
                matchPredictRewards.CopyToCuda();
                nonMatchPredictRewards.CopyToCuda();
            }
        }
Beispiel #14
0
        public static void blaa()
        {
            int num = 10;
            //NewContext creation
            CudaContext cntxt = new CudaContext();

            //Module loading from precompiled .ptx in a project output folder
            CUmodule cumodule = cntxt.LoadModule("kernel.ptx");

            //_Z9addKernelPf - function name, can be found in *.ptx file
            CudaKernel addWithCuda = new CudaKernel("_Z9addKernelPf", cumodule, cntxt);

            //Create device array for data
            CudaDeviceVariable <float> vec1_device = new CudaDeviceVariable <float>(num);

            //Create arrays with data
            float[] vec1 = new float[num];

            //Copy data to device
            vec1_device.CopyToDevice(vec1);

            //Set grid and block dimensions
            addWithCuda.GridDimensions  = new dim3(8, 1, 1);
            addWithCuda.BlockDimensions = new dim3(512, 1, 1);

            //Run the kernel
            addWithCuda.Run(
                vec1_device.DevicePointer);

            //Copy data from device
            vec1_device.CopyToHost(vec1);
        }
Beispiel #15
0
        public void Step(float stepSize)
        {
            // Load the next vector fields to device memory.
            LoadNextField();
            _cudaDxMapper.MapAllResources();
            CudaArray2D lastFlowMap = _cudaDxMapper[0].GetMappedArray2D(0, 0);

            // Advect from each member to each member. In each block, the same configuration is choosen.
            dim3 grid = new dim3((int)((float)_width / BLOCK_SIZE + 0.5f), (int)((float)_height / BLOCK_SIZE + 0.5f), _numMembers);

            // Advect a block in each member-member combination.
            dim3 threads = new dim3(BLOCK_SIZE, BLOCK_SIZE);

            _advectParticlesKernel.GridDimensions  = grid;
            _advectParticlesKernel.BlockDimensions = threads;
            // (float* mapT1, const int width, const int height, const int numMembers, /*float timeScale, */ float stepSize, float minDensity, float invalid)
            _advectParticlesKernel.Run(_pongFlowMap.DevicePointer, _width, _height, _numMembers, stepSize, 0.000001f, _texInvalidValue);



            // Swap the Texture2D handles.
            CudaSurfObject surf = new CudaSurfObject(lastFlowMap);

            grid.z = 1;
            _copyMapDataKernel.GridDimensions  = grid;
            _copyMapDataKernel.BlockDimensions = threads;
            _copyMapDataKernel.Run(surf.SurfObject, _pongFlowMap.DevicePointer, _width, _height);

            _cudaDxMapper.UnmapAllResources();
        }
Beispiel #16
0
        //private CudaKernel kernel1;

        //public Class1()
        //{
        //    //int deviceID = 0;

        //    //CudaContext ctx = new CudaContext(deviceID);
        //    //CUmodule cumodule = ctx.LoadModulePTX(@"C:\work\Sobel\TestCuda\x64\Debug\kernel.ptx");
        //    //kernel1 = new CudaKernel("_Z9matrixSumPdS_iii", cumodule, ctx);
        //}

        public static double[,] TestMatrix(double[][,] a)
        {
            using (CudaContext ctx = new CudaContext(0))
            {
                CUmodule cumodule = ctx.LoadModule(@"C:\work\Sobel\TestCuda\x64\Debug\kernel.ptx");
                var      kernel   = new CudaKernel("_Z9matrixSumPdS_iii", cumodule, ctx);

                int dimZ = a.Length;
                int dimX = a[0].GetLength(0);
                int dimY = a[0].GetLength(1);

                kernel.GridDimensions  = new dim3(28, 28, 1);
                kernel.BlockDimensions = new dim3(1, 1, 1);
                //kernel.BlockDimensions = new dim3(dimX, dimY, 1);

                // Allocate vectors in device memory and copy vectors from host memory to device memory
                CudaDeviceVariable <double> dA = a.ToLinearArray();
                //CudaDeviceVariable<double> dB = ToLinearArray(b);
                CudaDeviceVariable <double> dC = new CudaDeviceVariable <double>(dimX * dimY);

                // Invoke kernel
                kernel.Run(dA.DevicePointer, dC.DevicePointer, dimX, dimY, dimZ);

                // Copy result from device memory to host memory
                double[] c = dC;

                //ctx.FreeMemory(dC.DevicePointer);
                //ctx.FreeMemory(dA.DevicePointer);
                //ctx.Dispose();

                return(ToMultyArray(c, dimX));
            }
        }
Beispiel #17
0
    public float BaseAccuracy()
    {
        var  baseKernel    = context.LoadKernel("kernels/VectorReduction.ptx", "calculateAccuracy");
        dim3 gridDimension = new dim3()
        {
            x = (uint)(test.length / ThreadsPerBlock + 1),
            y = (uint)1,
            z = 1
        };

        baseKernel.GridDimensions  = gridDimension;
        baseKernel.BlockDimensions = ThreadsPerBlock;

        baseKernel.SetConstantVariable("testVectorsCount", test.length);
        baseKernel.SetConstantVariable("teachingVectorsCount", teaching.length);
        baseKernel.SetConstantVariable("attributeCount", teaching.attributeCount);
        baseKernel.SetConstantVariable("genLength", teaching.length);

        var BaseRMSEKernel = context.LoadKernel("kernels/VectorReduction.ptx", "RMSE");

        BaseRMSEKernel.GridDimensions  = 1;
        BaseRMSEKernel.BlockDimensions = 1;
        BaseRMSEKernel.SetConstantVariable("testVectorsCount", test.length);

        byte[] gen = new byte[teaching.length];
        for (int i = 0; i < gen.Length; i++)
        {
            gen[i] = 1;
        }

        using (CudaDeviceVariable <byte> deviceGen = gen)
            using (CudaDeviceVariable <float> baseAccuracy = new CudaDeviceVariable <float>(1))
            {
                accuracyKernel.Run(
                    test.classes.DevicePointer,
                    teaching.classes.DevicePointer,
                    deviceGen.DevicePointer,
                    calculatedNeabours.DevicePointer,
                    deviceAccuracy.DevicePointer
                    );

                BaseRMSEKernel.Run(baseAccuracy.DevicePointer);

                float[] host = baseAccuracy;
                return(host[0]);
            }
    }
        public List <float> hypotesis(List <double> x, List <double> h, int N)
        {
            //int N = 2000000;

            string      path   = Path.GetDirectoryName(mv.plugins[0].filename);
            CudaContext ctx    = new CudaContext();
            CudaKernel  kernel = ctx.LoadKernel(path + "\\kernel.ptx", "ComplexMultCUDA");

            kernel.GridDimensions  = (int)Math.Ceiling((double)(N + h.Count - 1) / 1024);
            kernel.BlockDimensions = 1024;

            double[] temp_y = new double[N + h.Count - 1];
            double[] temp_h = new double[N + h.Count - 1];
            double[] temp_x = new double[N + h.Count - 1];

            double2[] temp_x2 = new double2[N + h.Count - 1];


            h.ToArray().CopyTo(temp_h, 0);
            x.ToArray().CopyTo(temp_x, 0);

            CudaDeviceVariable <double>  d_x = null;
            CudaDeviceVariable <double2> d_X = new CudaDeviceVariable <double2>(N + h.Count - 1);

            CudaDeviceVariable <double>  d_h = new CudaDeviceVariable <double>(N + h.Count - 1);
            CudaDeviceVariable <double2> d_H = new CudaDeviceVariable <double2>(N + h.Count - 1);

            CudaDeviceVariable <double> d_y = new CudaDeviceVariable <double>(N + h.Count - 1);


            CudaFFTPlan1D planForward = new CudaFFTPlan1D(N + h.Count - 1, cufftType.D2Z, 1);
            CudaFFTPlan1D planInverse = new CudaFFTPlan1D(N + h.Count - 1, cufftType.Z2D, 1);

            try
            {
                d_h = temp_h;
                planForward.Exec(d_h.DevicePointer, d_H.DevicePointer, TransformDirection.Forward);
            }
            catch (Exception exp)
            {
                mainView.log(exp, "CUDA error: Impulse response FFT", this);
                return(null);
            }

            try
            {
                d_x = temp_x;
                planForward.Exec(d_x.DevicePointer, d_X.DevicePointer);
                kernel.Run(d_H.DevicePointer, d_X.DevicePointer, N + h.Count - 1);
                planInverse.Exec(d_X.DevicePointer, d_y.DevicePointer);
            }
            catch (Exception exp)
            {
                mainView.log(exp, "Cuda error: kernel run cuda error", this);
            }
            temp_y = d_y;

            return(Array.ConvertAll <double, float>(temp_y, d => (float)d).ToList().GetRange(500, x.Count));
        }
Beispiel #19
0
        private void addForces(CudaPitchedDeviceVariable <float2> v, int dx, int dy, int spx, int spy, float fx, float fy, int r, SizeT tPitch)
        {
            dim3 tids = new dim3((uint)(2 * r + 1), (uint)(2 * r + 1), 1);

            addForces_k.GridDimensions  = new dim3(1);
            addForces_k.BlockDimensions = tids;
            addForces_k.Run(v.DevicePointer, dx, dy, spx, spy, fx, fy, r, tPitch);
        }
        static void Main(string[] args)
        {
            InitKernels();
            int n = 0x1000000;

            float[] xx = new float[n];
            float[] yy = new float[n];
            float[] zz = new float[n];
            int[]   tt = new int[n];

            float L = 512;
            float Lx = L, Ly = L, Lz = L;
            CudaDeviceVariable <float> gpu_lengths  = new float[] { Lx, Ly, Lz };

            for (int i = 0; i < n; i++)
            {
                xx[i] = ((float)(i & 0x0000FF) / (float)0x000100 - 0.5F) * Lx;
                yy[i] = ((float)(i & 0x00FF00) / (float)0x010000 - 0.5F) * Ly;
                zz[i] = ((float)(i & 0xFF0000) / (float)0x1000000 - 0.5F) * Lz;
                tt[i] = 1;
            }
            SimulationMolecules simulationMolecules = new SimulationMolecules(xx, yy, zz, tt);

            for (int i = 0; i < 10; i++)
            {
                Console.WriteLine($"{i}  =  ({simulationMolecules.x[i]},{simulationMolecules.y[i]},{simulationMolecules.z[i]})");
            }
            int ii = 0x808080;

            Console.WriteLine($"{ii}  =  ({simulationMolecules.x[ii]},{simulationMolecules.y[ii]},{simulationMolecules.z[ii]})");

            Interactions interactions = new Interactions();

            interactions.SetLJParameters(1, 1, 1.5F, 1.5F);

            float[] cachedEnergies = new float[n / THREADS_PER_BLOCK + 1];
            CudaDeviceVariable <float> gpu_energies = cachedEnergies;

            energyOfExistingMolecule.Run(n,
                                         simulationMolecules.gpu_x.DevicePointer,
                                         simulationMolecules.gpu_y.DevicePointer,
                                         simulationMolecules.gpu_z.DevicePointer,
                                         simulationMolecules.gpu_types.DevicePointer,
                                         interactions.SigmaPointer(),
                                         interactions.EpsilonPointer(),
                                         gpu_lengths.DevicePointer,
                                         2.5F,
                                         0x808080,
                                         gpu_energies.DevicePointer
                                         );

            gpu_energies.CopyToHost(cachedEnergies);
            double e = cachedEnergies.Sum();

            Console.WriteLine($"Energy = {e}");
            Console.Read();
        }
        public void DownscaleTrimap(CudaPitchedDeviceVariable <byte> small_image, int small_width, int small_height, CudaPitchedDeviceVariable <byte> image, int width, int height)
        {
            dim3 grid  = new dim3((width + 63) / 64, (height + 63) / 64, 1);
            dim3 block = new dim3(32, 8, 1);

            downscaleKernel2.BlockDimensions = block;
            downscaleKernel2.GridDimensions  = grid;
            downscaleKernel2.Run(small_image.DevicePointer, (int)small_image.Pitch, small_width, small_height, image.DevicePointer, (int)image.Pitch, width, height);
            //downscaleKernel<<<grid, block>>>(small_image, small_pitch, small_width, small_height, image, pitch, width, height, maxfilter_functor());
        }
        private void diffuseProject(CudaDeviceVariable <float2> vx, CudaDeviceVariable <float2> vy, int dx, int dy, float dt, float visc, SizeT tPitch)
        {
            dim3 grid = new dim3((uint)((dx / TILEX) + (!(dx % TILEX != 0) ? 0 : 1)), (uint)((dy / TILEY) + (!(dy % TILEY != 0) ? 0 : 1)), 1);

            dim3 tids = new dim3(TIDSX, TIDSY, 1);

            diffuseProject_k.GridDimensions  = grid;
            diffuseProject_k.BlockDimensions = tids;
            diffuseProject_k.Run(vx.DevicePointer, vy.DevicePointer, dx, dy, dt, visc, TILEY / TIDSY);
        }
        public void DataTerm(CudaPitchedDeviceVariable <int> terminals, int gmmN, CudaDeviceVariable <float> gmm, int gmm_pitch, CudaPitchedDeviceVariable <uchar4> image, CudaPitchedDeviceVariable <byte> trimap, int width, int height)
        {
            dim3 block = new dim3(32, 8, 1);
            dim3 grid  = new dim3((int)((width + block.x - 1) / block.x), (int)((height + block.y - 1) / block.y), 1);

            DataTermKernel.BlockDimensions = block;
            DataTermKernel.GridDimensions  = grid;
            DataTermKernel.Run(terminals.DevicePointer, (int)terminals.Pitch / 4, gmmN, gmm.DevicePointer, (int)gmm_pitch / 4, image.DevicePointer, (int)image.Pitch / 4, trimap.DevicePointer, (int)trimap.Pitch, width, height);
            //DataTermKernel<<<grid, block>>>(terminals, terminal_pitch/4, gmmN, gmm, gmm_pitch/4, image, image_pitch/4, trimap, trimap_pitch, width, height);
        }
        public void GMMAssign(int gmmN, CudaDeviceVariable <float> gmm, int gmm_pitch, CudaPitchedDeviceVariable <uchar4> image, CudaPitchedDeviceVariable <byte> alpha, int width, int height)
        {
            dim3 block = new dim3(32, 16, 1);
            dim3 grid  = new dim3((int)((width + block.x - 1) / block.x), (int)((height + block.y - 1) / block.y), 1);

            GMMAssignKernel.BlockDimensions = block;
            GMMAssignKernel.GridDimensions  = grid;
            GMMAssignKernel.Run(gmmN, gmm.DevicePointer, (int)gmm_pitch / 4, image.DevicePointer, (int)image.Pitch / 4, alpha.DevicePointer, (int)alpha.Pitch, width, height);
            //GMMAssignKernel<<<grid, block>>>(gmmN, gmm, gmm_pitch/4, image, image_pitch/4, alpha, alpha_pitch, width, height);
        }
Beispiel #25
0
        private void advectParticles(CudaDeviceVariable <vertex> p, CudaPitchedDeviceVariable <float2> v, int dx, int dy, float dt, SizeT tPitch)
        {
            dim3 grid = new dim3((uint)((dx / TILEX) + (!(dx % TILEX != 0) ? 0 : 1)), (uint)((dy / TILEY) + (!(dy % TILEY != 0) ? 0 : 1)), 1);

            dim3 tids = new dim3(TIDSX, TIDSY, 1);

            advectParticles_k.GridDimensions  = grid;
            advectParticles_k.BlockDimensions = tids;
            advectParticles_k.Run(p.DevicePointer, v.DevicePointer, dx, dy, dt, TILEY / TIDSY, tPitch);
        }
Beispiel #26
0
        private void updateVelocity(CudaPitchedDeviceVariable <float2> v, CudaDeviceVariable <float2> vx, CudaDeviceVariable <float2> vy, int dx, int pdx, int dy, SizeT tPitch)
        {
            dim3 grid = new dim3((uint)((dx / TILEX) + (!(dx % TILEX != 0) ? 0 : 1)), (uint)((dy / TILEY) + (!(dy % TILEY != 0) ? 0 : 1)), 1);

            dim3 tids = new dim3(TIDSX, TIDSY, 1);

            updateVelocity_k.GridDimensions  = grid;
            updateVelocity_k.BlockDimensions = tids;
            updateVelocity_k.Run(v.DevicePointer, vx.DevicePointer, vy.DevicePointer, dx, pdx, dy, TILEY / TIDSY, tPitch);
        }
Beispiel #27
0
 private void BlurGpu(int iterations)
 {
     for (int i = 0; i < BlurIterations; i++)
     {
         kernel.Run(input.DevicePointer, output.DevicePointer, Width, Height);
     }
     // Copy result from device memory to host memory
     // h_C contains the result in host memory
     //float[] copyOutput = output;
     image = output;
 }
 public void Run(params object[] args)
 {
     for (int i = 0; i < args.Length; i++)
     {
         if (args[i] is MyAbstractMemoryBlock)
         {
             args[i] = (args[i] as MyAbstractMemoryBlock).GetDevicePtr(m_GPU);
         }
     }
     m_kernel.Run(args);
 }
Beispiel #29
0
        ////////////////////////////////////////////////////////////////////////////////
        // Occupancy-based launch configurator
        //
        // The launch configurator, cudaOccupancyMaxPotentialBlockSize and
        // cudaOccupancyMaxPotentialBlockSizeVariableSMem, suggests a block
        // size that achieves the best theoretical occupancy. It also returns
        // the minimum number of blocks needed to achieve the occupancy on the
        // whole device.
        //
        // This launch configurator is purely occupancy-based. It doesn't
        // translate directly to performance, but the suggestion should
        // nevertheless be a good starting point for further optimizations.
        //
        // This function configures the launch based on the "automatic"
        // argument, records the runtime, and reports occupancy and runtime.
        ////////////////////////////////////////////////////////////////////////////////
        static int launchConfig(CudaDeviceVariable <int> array, int arrayCount, bool automatic)
        {
            int   blockSize   = 0;
            int   minGridSize = 0;
            int   gridSize;
            SizeT dynamicSMemUsage = 0;


            float elapsedTime;

            double potentialOccupancy;

            CudaOccupancy.cudaOccDeviceState state = new CudaOccupancy.cudaOccDeviceState();
            state.cacheConfig = CudaOccupancy.cudaOccCacheConfig.PreferNone;

            if (automatic)
            {
                CudaOccupancy.cudaOccMaxPotentialOccupancyBlockSize(ref minGridSize, ref blockSize, new CudaOccupancy.cudaOccDeviceProp(0), new CudaOccupancy.cudaOccFuncAttributes(kernel), state, dynamicSMemUsage);

                Console.WriteLine("Suggested block size: {0}", blockSize);
                Console.WriteLine("Minimum grid size for maximum occupancy: {0}", minGridSize);
            }
            else
            {
                // This block size is too small. Given limited number of
                // active blocks per multiprocessor, the number of active
                // threads will be limited, and thus unable to achieve maximum
                // occupancy.
                //
                blockSize = manualBlockSize;
            }

            // Round up
            //
            gridSize = (arrayCount + blockSize - 1) / blockSize;

            // Launch and profile
            //
            kernel.GridDimensions  = gridSize;
            kernel.BlockDimensions = blockSize;
            elapsedTime            = kernel.Run(array.DevicePointer, arrayCount);

            // Calculate occupancy
            //
            potentialOccupancy = reportPotentialOccupancy(blockSize, dynamicSMemUsage);

            Console.WriteLine("Potential occupancy: {0}%", potentialOccupancy * 100);

            // Report elapsed time
            //
            Console.WriteLine("Elapsed time: {0}ms", elapsedTime * 100);

            return(0);
        }
Beispiel #30
0
        internal long ParallelFor(long value)
        {
            result.CopyToDevice(new long[] { 0 });

            parallelFor.Run(new object[]
                            { value, result.DevicePointer });

            long[] returned = new long[1];
            result.CopyToHost(returned);
            return(returned[0]);
        }
Beispiel #31
0
        static void Main(string[] args)
        {
            // NOTE: You need to change this location to match your own machine.
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine("NOTE: You must change the kernel location before running this project so it matches your own environment.");
            Console.ResetColor();
            System.Threading.Thread.Sleep(500);

            string path = @"X:\MachineLearning\CUDAGraph-2\CUDAGraph_Kernel\Debug\kernel.cu.ptx";
            CudaContext ctx = new CudaContext();
            CUmodule module = ctx.LoadModule(path);
            kernel = new CudaKernel("kernel", module, ctx);

            // This tells the kernel to allocate a lot of threads for the Gpu.
            kernel.BlockDimensions = THREADS_PER_BLOCK;
            kernel.GridDimensions = VECTOR_SIZE / THREADS_PER_BLOCK + 1; ;

            // Now let's load the kernel!
            // Create the topology.
            int[] topology = new int[] { 1, 200, 200, 100, 1 };

            int height = topology.Length;
            int width = 0;

            for (int i = 0; i < topology.Length; i++)
                if (width < topology[i]) width = topology[i];

            // Launch!
            float[] res = new float[height * width];
            for (int i = 0; i < 10; i++)
            {
                float[] matrix = new float[height * width];
                float[] weights = new float[height * width];
                Random rand = new Random(424242);
                for (int y = 0; y < height; y++)
                {
                    for (int x = 0; x < width; x++)
                    {
                        matrix[y * width + x] = (y == 0 && x < topology[y]) ? 1.0f : 0;
                        weights[y * width + x] = (x < topology[y]) ? (float)(rand.NextDouble() - rand.NextDouble()) : 0;
                    }
                }

                // Load the kernel with some variables.
                CudaDeviceVariable<int> cuda_topology = topology;
                CudaDeviceVariable<float> cuda_membank = matrix;
                CudaDeviceVariable<float> cuda_weights = weights;

                Stopwatch sw = new Stopwatch();
                sw.Start();
                kernel.Run(cuda_topology.DevicePointer, cuda_membank.DevicePointer, cuda_weights.DevicePointer, height, width);
                cuda_membank.CopyToHost(res);
                sw.Stop();

                Console.ForegroundColor = ConsoleColor.Green;
                Console.WriteLine("{0} ticks to compute -> {1}", sw.ElapsedTicks, res[0]);
                Console.ResetColor();

            }

            Console.ReadKey();
        }
Beispiel #32
0
        private void Generate(CudaKernel kernelPositionWeight, int width, int height, int depth)
        {
            int count = width * height * depth;
            int widthD = width - 1;
            int heightD = height - 1;
            int depthD = depth - 1;
            int countDecremented = widthD * heightD * depthD;

            dim3 blockDimensions = new dim3(8, 8, 8);
            dim3 gridDimensions = new dim3((int)Math.Ceiling(width / 8.0), (int)Math.Ceiling(height / 8.0), (int)Math.Ceiling(depth / 8.0));
            dim3 gridDimensionsDecremented = new dim3((int)Math.Ceiling(widthD / 8.0), (int)Math.Ceiling(heightD / 8.0), (int)Math.Ceiling(depthD / 8.0));

            CUDANoiseCube noiseCube = new CUDANoiseCube();

            CudaArray3D noiseArray = noiseCube.GenerateUniformArray(16, 16, 16);
            CudaTextureArray3D noiseTexture = new CudaTextureArray3D(kernelPositionWeight, "noiseTexture", CUAddressMode.Wrap, CUFilterMode.Linear, CUTexRefSetFlags.NormalizedCoordinates, noiseArray);

            CudaDeviceVariable<Voxel> voxelsDev = new CudaDeviceVariable<Voxel>(count);

            kernelPositionWeight.BlockDimensions = blockDimensions;
            typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelPositionWeight, gridDimensions);

            kernelPositionWeight.Run(voxelsDev.DevicePointer, width, height, depth);

            kernelNormalAmbient.BlockDimensions = blockDimensions;
            typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelNormalAmbient, gridDimensions);

            kernelNormalAmbient.Run(voxelsDev.DevicePointer, width, height, depth, container.Settings.AmbientRayWidth, container.Settings.AmbientSamplesCount);

            int nearestW = NearestPowerOfTwo(widthD);
            int nearestH = NearestPowerOfTwo(heightD);
            int nearestD = NearestPowerOfTwo(depthD);
            int nearestCount = nearestW * nearestH * nearestD;

            CudaDeviceVariable<int> trisCountDevice = new CudaDeviceVariable<int>(nearestCount);
            trisCountDevice.Memset(0);
            CudaDeviceVariable<int> offsetsDev = new CudaDeviceVariable<int>(countDecremented);

            kernelMarchingCubesCases.BlockDimensions = blockDimensions;
            typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesCases, gridDimensionsDecremented);

            kernelMarchingCubesCases.Run(voxelsDev.DevicePointer, width, height, depth, offsetsDev.DevicePointer, trisCountDevice.DevicePointer, nearestW, nearestH, nearestD);

            CudaDeviceVariable<int> prefixSumsDev = prefixScan.PrefixSumArray(trisCountDevice, nearestCount);

            int lastTrisCount = 0;
            trisCountDevice.CopyToHost(ref lastTrisCount, (nearestCount - 1) * sizeof(int));

            int lastPrefixSum = 0;
            prefixSumsDev.CopyToHost(ref lastPrefixSum, (nearestCount - 1) * sizeof(int));

            int totalVerticesCount = (lastTrisCount + lastPrefixSum) * 3;

            if (totalVerticesCount > 0)
            {
                if (container.Geometry != null)
                    container.Geometry.Dispose();

                container.VertexCount = totalVerticesCount;

                container.Geometry = new Buffer(graphicsDevice, new BufferDescription()
                {
                    BindFlags = BindFlags.VertexBuffer,
                    CpuAccessFlags = CpuAccessFlags.None,
                    OptionFlags = ResourceOptionFlags.None,
                    SizeInBytes = Marshal.SizeOf(typeof(VoxelMeshVertex)) * totalVerticesCount,
                    Usage = ResourceUsage.Default
                });

                CudaDirectXInteropResource directResource = new CudaDirectXInteropResource(container.Geometry.ComPointer, CUGraphicsRegisterFlags.None, CudaContext.DirectXVersion.D3D11, CUGraphicsMapResourceFlags.None);
                
                kernelMarchingCubesVertices.BlockDimensions = blockDimensions;
                typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesVertices, gridDimensionsDecremented);

                directResource.Map();
                kernelMarchingCubesVertices.Run(directResource.GetMappedPointer(), voxelsDev.DevicePointer, prefixSumsDev.DevicePointer, offsetsDev.DevicePointer, width, height, depth, nearestW, nearestH, nearestD);
                directResource.UnMap();

                directResource.Dispose();
            }
            else
            {
                container.VertexCount = 0;

                if (container.Geometry != null)
                    container.Geometry.Dispose();
            }

            noiseCube.Dispose();
            prefixSumsDev.Dispose();
            trisCountDevice.Dispose();
            offsetsDev.Dispose();
            noiseArray.Dispose();
            noiseTexture.Dispose();
            voxelsDev.Dispose();
        }