Example 1
        public static void Cuda(
            Real[] mSquaredDistances,
            Real[] mCoordinates,
            int c,
            int n)
        {
            var gpu = Gpu.Default;

            using (var cudaSquaredDistance = gpu.AllocateDevice(mSquaredDistances))
                using (var cudaCoordinates = gpu.AllocateDevice(mCoordinates))
                {
                    var timer = Stopwatch.StartNew();

                    const int blockSize = 128;

                    var gridSize = Util.DivUp(n * n, blockSize);
                    var lp       = new LaunchParam(gridSize, blockSize);

                    gpu.Launch(Kernel, lp, cudaSquaredDistance.Ptr, cudaCoordinates.Ptr, c, n);

                    gpu.Synchronize();
                    Util.PrintPerformance(timer, "SquaredDistance.Cuda", n, c, n);

                    Gpu.Copy(cudaSquaredDistance, mSquaredDistances);
                }
        }
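The `Kernel` launched in Example 1 is not part of this listing. As a rough sketch (an assumption, not the repository's actual kernel), a pairwise squared-distance kernel matching this launch could map one thread to one (i, j) entry of the n-by-n result, with the coordinates stored as c rows of n values:

        // Illustrative kernel only: one thread per (i, j) pair over n * n entries.
        // Assumes mCoordinates is laid out as c rows of n values (row-major).
        private static void Kernel(deviceptr<Real> squaredDistances, deviceptr<Real> coordinates, int c, int n)
        {
            var idx = blockIdx.x * blockDim.x + threadIdx.x;
            if (idx >= n * n) return;

            var i = idx / n;
            var j = idx % n;

            Real sum = 0;
            for (var k = 0; k < c; k++)
            {
                var diff = coordinates[k * n + i] - coordinates[k * n + j];
                sum += diff * diff;
            }
            squaredDistances[idx] = sum;
        }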
Example 2
 public void Mult(int wA, int wB, int hC, deviceptr<double> A, deviceptr<double> B, deviceptr<double> C)
 {
     var block = new dim3(BlockSize, BlockSize);
     var grid = new dim3(wB/block.x, hC/block.y);
     var lp = new LaunchParam(grid, block);
     GPULaunch(Kernel, lp, wA, wB, A, B, C);
 }
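The `Kernel` referenced by `Mult` is not shown here; the launch shape (a BlockSize-by-BlockSize block and a wB/BlockSize by hC/BlockSize grid) suggests one thread per element of C. A naive sketch under that assumption (the real kernel is presumably a tiled shared-memory version) could be:

 // Illustrative naive kernel: one thread computes one element of C.
 // Assumes wB and hC are exact multiples of BlockSize, matching the grid above.
 public void Kernel(int wA, int wB, deviceptr<double> A, deviceptr<double> B, deviceptr<double> C)
 {
     var col = blockIdx.x * blockDim.x + threadIdx.x; // column in B and C
     var row = blockIdx.y * blockDim.y + threadIdx.y; // row in A and C

     var sum = 0.0;
     for (var k = 0; k < wA; k++)
     {
         sum += A[row * wA + k] * B[k * wB + col];
     }
     C[row * wB + col] = sum;
 }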
Example 3
        private AdaptiveLBP(Size size)
        {
            this.size = size;

            // initialize data structures to avoid reallocating with every call
            hist          = new int[numSubuniformPatterns * numVarBins];
            hist2         = new int[numSubuniformPatterns * numVarBins];
            lbpImageGPU   = worker.Malloc <short>(size.Width * size.Height);
            varImageGPU   = worker.Malloc <short>(size.Width * size.Height);
            histGPU       = worker.Malloc <int>(hist.Length);
            floatImageGPU = worker.Malloc <float>(size.Width * size.Height);

            // precompute the subuniform bin for each LBP pattern, and push it to the GPU
            subuniformBins = new short[(short)Math.Pow(2, numNeighbors)];
            for (int i = 0; i < subuniformBins.Length; i++)
            {
                short bin = GetPatternNum(i);
                subuniformBins[i] = bin;
            }
            subuniformBinsGPU = worker.Malloc(subuniformBins);

            neighborCoordinateX = new float[numNeighbors];
            neighborCoordinateY = new float[numNeighbors];
            for (int i = 0; i < numNeighbors; i++)
            {
                float xx = (float)Math.Cos(2.0 * PI * (double)i / (double)numNeighbors);
                float yy = (float)Math.Sin(2.0 * PI * (double)i / (double)numNeighbors);
                neighborCoordinateX[i] = xx;
                neighborCoordinateY[i] = yy;
            }
            neighborCoordinateXGPU = worker.Malloc(neighborCoordinateX);
            neighborCoordinateYGPU = worker.Malloc(neighborCoordinateY);

            varBinsGPU = worker.Malloc(varBins);

            // initialize CUDA parameters
            var blockDims = new dim3(8, 8);
            var gridDims  = new dim3(Common.divup(size.Width, blockDims.x), Common.divup(size.Height, blockDims.y));

            lp = new LaunchParam(gridDims, blockDims);

            // create filters
            for (int i = 0; i < numScales; i++)
            {
                float[,] filter = LaplacianOfGaussian.Generate(i + 1);
                filters[i]      = Utils.Flatten(filter);
                filtersGPU[i]   = worker.Malloc(filters[i]);
                filterSizes[i]  = (filter.GetLength(0) - 1) / 2;
            }

            // allocate space for scale space images
            deviceptr <float>[] tempPointers = new deviceptr <float> [numScales];
            for (int i = 0; i < numScales; i++)
            {
                scaledImages[i] = worker.Malloc <float>(size.Width * size.Height);
                tempPointers[i] = scaledImages[i].Ptr;
            }
            scaledImagePointers = worker.Malloc(tempPointers);
            pixelScaleImage     = worker.Malloc <short>(size.Width * size.Height);
        }
        public static Matrix <FieldType> GpuMultiply <FieldType, GpuStructType>(Matrix <FieldType> left, Matrix <FieldType> right) where FieldType : Field <FieldType>, IGpuCompatibleField <FieldType, GpuStructType>, new() where GpuStructType : struct
        {
            if (left.Width != right.Height)
            {
                throw new InvalidOperationException("Matrices of incompatible sizes can't be multiplied.");
            }

            IGpuStructManager <FieldType, GpuStructType> gpuStructManager = new FieldType().GetDefaultGpuStructManager();

            GpuStructType[,] resultArr = new GpuStructType[left.Rows, right.Columns];
            GpuStructType[,] leftArr   = new GpuStructType[left.Rows, left.Columns];
            GpuStructType[,] rightArr  = new GpuStructType[right.Rows, right.Columns];

            resultArr.AssignAll(gpuStructManager.GetStructDefaultValue());
            leftArr.AssignAll(ind => gpuStructManager.ToStruct(left[ind[0], ind[1]]));
            rightArr.AssignAll(ind => gpuStructManager.ToStruct(right[ind[0], ind[1]]));


            Alea.Gpu gpu = Alea.Gpu.Default;

            int threadCount = left.Rows * right.Columns;
            int blockDimX   = gpu.Device.Attributes.MaxThreadsPerBlock;           // Threads per block
            int gridDimX    = (int)Math.Ceiling((double)threadCount / blockDimX); // Blocks per grid

            LaunchParam lp = new LaunchParam(gridDimX, blockDimX);

            gpu.Launch(multiplicationKernel, lp, leftArr, rightArr, resultArr, gpuStructManager.GetStructAddition(), gpuStructManager.GetStructMultiplication());

            FieldType[,] fieldResultArr = new FieldType[resultArr.GetLength(0), resultArr.GetLength(1)];
            fieldResultArr.AssignAll(ind => gpuStructManager.ToClass(resultArr[ind[0], ind[1]]));

            return(new Matrix <FieldType>(fieldResultArr));
        }
Example 5
        //[/GenericReduceSPUK]

        ///[genericReduceScalarProdUse]
        public T Apply(T[] values1, T[] values2)
        {
            var n         = values1.Length;
            var numSm     = GPUWorker.Device.Attributes.MULTIPROCESSOR_COUNT;
            var tup       = _plan.BlockRanges(numSm, n);
            var ranges    = tup.Item1;
            var numRanges = tup.Item2;
            var lpUpsweep = new LaunchParam(numRanges, _plan.NumThreads);
            var lpReduce  = new LaunchParam(1, _plan.NumThreadsReduction);

            using (var dRanges = GPUWorker.Malloc(ranges))
                using (var dRangeTotals = GPUWorker.Malloc <T>(numRanges))
                    using (var dValues1 = GPUWorker.Malloc(values1))
                        using (var dValues2 = GPUWorker.Malloc(values2))
                        {
                            // Launch range reduction kernel to calculate the totals per range.
                            GPUWorker.EvalAction(
                                () =>
                            {
                                GPULaunch(Upsweep, lpUpsweep, dValues1.Ptr, dValues2.Ptr, dRanges.Ptr, dRangeTotals.Ptr);
                                if (numRanges > 1)
                                {
                                    // Need to aggregate the block sums as well.
                                    GPULaunch(_reduce.ReduceRangeTotals, lpReduce, numRanges, dRangeTotals.Ptr);
                                }
                            });
                            return(dRangeTotals.Gather()[0]);
                        }
        }
Example 6
        //[/cuRANDComputeValue]

        //[cuRANDPiEstimator]
        public double RunEstimation(int numSims, int threadBlockSize)
        {
            // Aim to launch around ten or more times as many blocks as there
            // are multiprocessors on the target device.
            const int blocksPerSm = 10;
            var       numSMs      = GPUWorker.Device.Attributes.MULTIPROCESSOR_COUNT;

            // Determine how to divide the work between cores
            var block = new dim3(threadBlockSize);
            var grid  = new dim3((numSims + threadBlockSize - 1) / threadBlockSize);

            while (grid.x > 2 * blocksPerSm * numSMs)
            {
                grid.x >>= 1;
            }

            var n = 2 * numSims;

            using (var dPoints = GPUWorker.Malloc <double>(n))
                using (var dResults = GPUWorker.Malloc <double>(grid.x))
                {
                    // Generate random points in unit square
                    var curand = new CURAND(GPUWorker, CURANDInterop.curandRngType.CURAND_RNG_QUASI_SOBOL64);
                    curand.SetQuasiRandomGeneratorDimensions(2);
                    curand.SetGeneratorOrdering(CURANDInterop.curandOrdering.CURAND_ORDERING_QUASI_DEFAULT);
                    curand.GenerateUniformDouble(dPoints.Ptr, new IntPtr(n));

                    var lp = new LaunchParam(grid, block, block.x * sizeof(uint));
                    GPULaunch(ComputeValue, lp, dResults.Ptr, dPoints.Ptr, numSims);
                    var value = dResults.Gather().Sum();
                    return((value / numSims) * 4.0);
                }
        }
Example 7
        public static void Cuda(
            Real[] mIntraReturn,
            Real[] vClose,
            Real[] vIsAlive,
            Real[] vIsValidDay,
            int m,
            int n)
        {
            var gpu = Gpu.Default;

            using (var cudaIntraReturn = gpu.AllocateDevice(mIntraReturn))
                using (var cudaClose = gpu.AllocateDevice(vClose))
                    using (var cudaIsAlive = gpu.AllocateDevice(vIsAlive))
                        using (var cudaIsValidDay = gpu.AllocateDevice(vIsValidDay))
                        {
                            var timer = Stopwatch.StartNew();

                            var gridSizeX = Util.DivUp(n, 32);
                            var gridSizeY = Util.DivUp(m, 8);
                            var lp        = new LaunchParam(new dim3(gridSizeX, gridSizeY), new dim3(32, 8));

                            gpu.Launch(CudaKernel, lp, cudaIntraReturn.Ptr, cudaClose.Ptr, cudaIsAlive.Ptr, cudaIsValidDay.Ptr, m, n);

                            gpu.Synchronize();
                            Util.PrintPerformance(timer, "IntraReturn.Cuda", 5, m, n);

                            Gpu.Copy(cudaIntraReturn, mIntraReturn);
                        }
        }
Example 8
        //[/GenericScanDownsweepKernel]

        public T[] Apply(T[] input, bool inclusive)
        {
            var n         = input.Length;
            var numSm     = GPUWorker.Device.Attributes.MULTIPROCESSOR_COUNT;
            var tup       = Plan.BlockRanges(numSm, n);
            var ranges    = tup.Item1;
            var numRanges = tup.Item2;


            var lpUpsweep   = new LaunchParam(numRanges, Plan.NumThreads);
            var lpReduce    = new LaunchParam(1, Plan.NumThreadsReduction);
            var lpDownsweep = new LaunchParam(numRanges, Plan.NumThreads);
            var _inclusive  = inclusive ? 1 : 0;

            using (var dRanges = GPUWorker.Malloc(ranges))
                using (var dRangeTotals = GPUWorker.Malloc <T>(numRanges + 1))
                    using (var dInput = GPUWorker.Malloc(input))
                        using (var dOutput = GPUWorker.Malloc(input))
                        {
                            _reduceModule.Upsweep(lpUpsweep, dInput.Ptr, dRanges.Ptr, dRangeTotals.Ptr);
                            GPULaunch(ScanReduce, lpReduce, numRanges, dRangeTotals.Ptr);
                            GPULaunch(Downsweep, lpDownsweep, dInput.Ptr, dOutput.Ptr, dRangeTotals.Ptr, dRanges.Ptr, _inclusive);
                            return(dOutput.Gather());
                        }
        }
Example 9
        private static void AleaOptimisedImpl(
            Gpu gpu,
            Real[] mSquaredDistances,
            Real[] mCoordinates,
            int c,
            int n,
            string name,
            Action <deviceptr <Real>, deviceptr <Real>, Constant <int>, Constant <int>, int, int> kernel)
        {
            using var cudaSquaredDistance = gpu.AllocateDevice <Real>(n, n);
            using var cudaCoordinates     = gpu.AllocateDevice(mCoordinates);
            var timer = Stopwatch.StartNew();

            const int blockSize = 256;
            var       gridSize  = Util.DivUp(n, blockSize);
            var       lp        = new LaunchParam(new dim3(gridSize, gridSize, 1), new dim3(blockSize, 1, 1));
            var       pitch     = cudaSquaredDistance.PitchInElements.ToInt32();

            gpu.Launch(kernel, lp, cudaSquaredDistance.Ptr, cudaCoordinates.Ptr, Gpu.Constant(blockSize), Gpu.Constant(c), n, pitch);
            gpu.Synchronize();

            Util.PrintPerformance(timer, name, n, c, n);

            Gpu.Copy2D(cudaSquaredDistance, mSquaredDistances, n, n);
        }
Example 10
        public static void flatten_ongpu(float[] x, int spatial, int layers, int batch, int forward, float[] output)
        {
            int size = spatial * batch * layers;
            var lp   = new LaunchParam(CudaUtils.cuda_gridsize(size), new dim3(CudaUtils.BlockSize));

            Gpu.Default.Launch(flatten_kernel, lp, size, x, spatial, layers, batch, forward, output);
        }
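This example and several of the following ones build their grid from a `CudaUtils.cuda_gridsize` helper that is not included in the listing. In the darknet C code these ports mirror, the helper spreads the required blocks over the x and y grid dimensions so neither exceeds the 65535 limit; a C# sketch along those lines (an approximation, not the port's actual code) would be:

        // Sketch of a cuda_gridsize-style helper, modeled on darknet's version:
        // spread ceil(n / BlockSize) blocks over x and y to stay under 65535 per dimension.
        public static dim3 cuda_gridsize(int n)
        {
            var k = (n - 1) / BlockSize + 1; // total blocks needed
            var x = k;
            var y = 1;
            if (x > 65535)
            {
                x = (int)Math.Ceiling(Math.Sqrt(k));
                y = (n - 1) / (x * BlockSize) + 1;
            }
            return new dim3(x, y, 1);
        }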
Example 11
        public static void reorg_ongpu(float[] x, int w, int h, int c, int batch, int stride, int forward, float[] output)
        {
            int size = w * h * c * batch;
            var lp   = new LaunchParam(CudaUtils.cuda_gridsize(size), new dim3(CudaUtils.BlockSize));

            Gpu.Default.Launch(reorg_kernel, lp, size, x, w, h, c, batch, stride, forward, output);
        }
Example 12
        public static void normalize_gpu(float[] x, float[] mean, float[] variance, int batch, int filters, int spatial)
        {
            var n  = batch * filters * spatial;
            var lp = new LaunchParam(CudaUtils.cuda_gridsize(n), new dim3(CudaUtils.BlockSize));

            Gpu.Default.Launch(normalize_kernel, lp, n, x, mean, variance, batch, filters, spatial);
        }
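For reference, the `normalize_kernel` this port launches follows darknet's CUDA kernel closely; a sketch of a C# equivalent (assumed, not taken from the port itself) is:

        // Sketch of normalize_kernel, modeled on darknet: one thread per element,
        // indexed across a 2-D grid; each value is normalized by its filter's
        // mean and variance. The batch parameter is only part of the signature.
        private static void normalize_kernel(int n, float[] x, float[] mean, float[] variance, int batch, int filters, int spatial)
        {
            var index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
            if (index >= n) return;
            var f = (index / spatial) % filters;
            x[index] = (x[index] - mean[f]) / ((float)Math.Sqrt(variance[f]) + 0.000001f);
        }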
Example 13
        public static void fast_variance_delta_gpu(float[] x, float[] delta, float[] mean, float[] variance, int batch, int filters, int spatial, float[] varianceDelta)
        {
            var lp = new LaunchParam(CudaUtils.cuda_gridsize(filters), new dim3(CudaUtils.BlockSize));

            Gpu.Default.Launch(fast_variance_delta_kernel, lp, x, delta, mean, variance, batch, filters, spatial,
                               varianceDelta);
        }
Example 14
        // Fixed Block Size!
        internal static Image Render3(Bitmap image, ConvolutionFilter filter)
        {
            var gpu = Gpu.Default;

            var width = image.Width;
            var array = BitmapUtility.ToColorArray(image);

            var mFilter = filter.Filter;
            var mFactor = filter.Factor;
            var mOffset = filter.Offset;

            var inputMemory = gpu.ArrayGetMemory(array, true, false);
            var inputDevPtr = new deviceptr <ColorRaw>(inputMemory.Handle);

            var resultLength = array.Length;
            var resultMemory = Gpu.Default.AllocateDevice <ColorRaw>(resultLength);
            var resultDevPtr = new deviceptr <ColorRaw>(resultMemory.Handle);

            var lp = new LaunchParam(256, 256);

            gpu.Launch(() =>
            {
                var i = blockDim.x * blockIdx.x + threadIdx.x;

                while (i < resultLength)
                {
                    ComputeEdgeDetectFilter0AtOffsetNapron(inputDevPtr, resultDevPtr, resultLength, mFilter, mFactor, mOffset, i, width);
                    i += blockDim.x * gridDim.x;
                }
            }, lp);

            return(BitmapUtility.FromColorArray(Gpu.CopyToHost(resultMemory), image.Width, image.Height));
        }
Example 15
        private static void CudaOptimisedImpl <TInt>(
            Real[] mSquaredDistances,
            Real[] mCoordinates,
            int c,
            int n,
            string name,
            Action <deviceptr <float>, deviceptr <float>, TInt, int, int> kernel,
            Func <int, TInt> numCoordGetter)
        {
            var gpu = Gpu.Default;

            using (var cudaSquaredDistance = gpu.AllocateDevice <Real>(n, n))
                using (var cudaCoordinates = gpu.AllocateDevice(mCoordinates))
                {
                    var timer = Stopwatch.StartNew();

                    const int blockSize = 128;
                    var       gridSize  = Util.DivUp(n, blockSize);
                    var       lp        = new LaunchParam(new dim3(gridSize, gridSize, 1), new dim3(blockSize, 1, 1), 2 * c * blockSize * sizeof(Real));
                    var       pitch     = cudaSquaredDistance.PitchInElements.ToInt32();

                    gpu.Launch(kernel, lp, cudaSquaredDistance.Ptr, cudaCoordinates.Ptr, numCoordGetter(c), n, pitch);

                    gpu.Synchronize();
                    Util.PrintPerformance(timer, name, n, c, n);

                    Gpu.Copy2D(cudaSquaredDistance, mSquaredDistances, n, n);
                }
        }
Example 16
        public static void scale_bias_gpu(float[] output, float[] biases, int batch, int n, int size)
        {
            var dimGrid  = new dim3((size - 1) / CudaUtils.BlockSize + 1, n, batch);
            var dimBlock = new dim3(CudaUtils.BlockSize, 1, 1);
            var lp       = new LaunchParam(dimGrid, dimBlock);

            Gpu.Default.Launch(scale_bias_kernel, lp, output, biases, n, size);
        }
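The matching `scale_bias_kernel` is not shown; in darknet it uses the 3-D grid built above, with blockIdx.y selecting the filter and blockIdx.z the batch item. A sketch of the C# counterpart (an assumption based on the darknet kernel) is:

        // Sketch of scale_bias_kernel, modeled on darknet: the grid is
        // (blocks-per-filter, n filters, batch), so each thread scales one output value.
        private static void scale_bias_kernel(float[] output, float[] biases, int n, int size)
        {
            var offset = blockIdx.x * blockDim.x + threadIdx.x;
            var filter = blockIdx.y;
            var batch  = blockIdx.z;

            if (offset < size)
            {
                output[(batch * n + filter) * size + offset] *= biases[filter];
            }
        }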
Example 17
        public void Apply(int numSystems, int n, deviceptr <double> dl, deviceptr <double> dd, deviceptr <double> du,
                          deviceptr <double> db, deviceptr <double> dx)
        {
            var sharedSize = 9 * n * sizeof(double);
            var lp         = new LaunchParam(numSystems, n, sharedSize);

            this.GPULaunch(this.Kernel, lp, n, dl, dd, du, db, dx);
        }
Example 18
        public static void softmax_gpu(float[] input, int n, int offset, int groups, float temp, float[] output, int inputStart = 0, int outputStart = 0)
        {
            int inputs = n;
            int batch  = groups;
            var lp     = new LaunchParam(CudaUtils.cuda_gridsize(batch), new dim3(CudaUtils.BlockSize));

            Gpu.Default.Launch(softmax_kernel, lp, inputs, offset, batch, input, temp, output, inputStart, outputStart);
        }
        //[/transformKernel]

        //[transformGPUDevice]
        public void Apply(int n, deviceptr <T> x, deviceptr <T> y, deviceptr <T> z)
        {
            const int blockSize = 256;
            var       numSm     = this.GPUWorker.Device.Attributes.MULTIPROCESSOR_COUNT;
            var       gridSize  = Math.Min(16 * numSm, Common.divup(n, blockSize));
            var       lp        = new LaunchParam(gridSize, blockSize);

            GPULaunch(Kernel, lp, n, x, y, z);
        }
Example 20
        protected override RlmCacheDataArray LaunchKernel(long[] rneurons, double[][] inputs, double[] from, double[] to, bool[] rneuronsCache, double[] fromCache, double[] toCache, int lparam1, int lparam2)
        {
            var resultArr = new long[rneurons.Length];
            var lp        = new LaunchParam(lparam1, lparam2);

            gpu.Launch(RlmAleaGpu.KernelCache, lp, rneurons, inputs, resultArr, from, to, rneuronsCache, fromCache, toCache);

            return(FindBestSolutionAndBuildCache(rneurons, resultArr, inputs, rneuronsCache));
        }
Example 21
        protected override void LaunchKernel(long[] rneurons, double[][] inputs, double[] from, double[] to, int lparam1, int lparam2)
        {
            var resultArr = new long[rneurons.Length];
            var lp        = new LaunchParam(lparam1, lparam2);

            gpu.Launch(RlmAleaGpu.Kernel, lp, rneurons, inputs, resultArr, from, to);

            FindBestSolution(resultArr);
        }
Example 22
        public override void Forward(Executor executor)
        {
            var wh = executor.GetTensor(Wh);
            var wd = executor.GetTensor(Wd);
            var v  = executor.GetTensor(V);
            var h  = executor.GetTensor(EncoderHiddenStates).Reshape(SeqLength * Batch, -1);
            var d  = executor.GetTensor(DecoderHiddenState);

            var whh = Dot(h, wh);       // [n*b, EncoderHiddenSize] * [EncoderHiddenSize, AttentionDim] = [n*b, AttentionDim]
            var wdd = Dot(d, wd);       // [b, DecoderHiddenSize] * [DecoderHiddenSize, AttentionDim] = [b, AttentionDim]
            var whd = Tanh(whh + wdd);  // broadcasting to [n*b, AttentionDim]

            var u = Dot(whd, v);        // [n*b, AttentionDim] * [AttentionDim] = [n*b]

            var expu    = Exp(u.Reshape(SeqLength, Batch));
            var softmax = expu / ReduceSum(expu, true, 0);  // [n, b]

            executor.AssignTensor(Softmax, softmax);

            var ctx = executor.Context;

            if (ctx.Type == ContextType.Gpu && typeof(T) == typeof(float))
            {
                var stream         = ctx.ToGpuContext().Stream;
                var hPtr           = h.Buffer.Ptr.Reinterpret <float>();
                var softmaxPtr     = executor.GetTensor(Softmax).Buffer.Ptr.Reinterpret <float>();
                var attentionState = executor.GetTensor(AttentionState).Buffer.Ptr.Reinterpret <float>();

                var batchSize         = Batch;
                var seqLength         = SeqLength;
                var encoderHiddenSize = EncoderHiddenSize;

                // strides for hPtr: [n*b, b, 1]
                // TODO proper size
                var lp = new LaunchParam(new dim3(batchSize / 32, encoderHiddenSize / 32, 1), new dim3(32, 32));
                stream.Launch(() =>
                {
                    var batch  = blockIdx.x * blockDim.x + threadIdx.x;
                    var hidden = blockIdx.y * blockDim.y + threadIdx.y;
                    if (batch < batchSize && hidden < encoderHiddenSize)
                    {
                        var sum = 0.0f;
                        for (var i = 0; i < seqLength; ++i)
                        {
                            var alpha = softmaxPtr[i * batchSize + batch];
                            sum      += alpha * hPtr[i * seqLength * batchSize + batch * batchSize + hidden];
                        }
                        attentionState[batch * encoderHiddenSize + hidden] = sum;
                    }
                }, lp);
            }
            else
            {
                throw new NotImplementedException();
            }
        }
Example 23
        public static void RunGpuWithAutomaticMemoryManagement()
        {
            var n      = GetData(out var x, out var y);
            var result = new float[n];

            var gpu = Gpu.Default;
            var lp  = new LaunchParam(16, 256);

            gpu.Launch(Kernel, lp, result, x, y);
        }
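The `Kernel` used here is not listed. With Alea's automatic memory management the host arrays are passed directly and copied for you, so a minimal element-wise kernel consistent with this launch (an illustrative guess) could be a grid-stride add:

        // Illustrative kernel for the automatic-memory-management launch above:
        // a grid-stride loop so 16 blocks of 256 threads cover any array length.
        private static void Kernel(float[] result, float[] x, float[] y)
        {
            var start  = blockIdx.x * blockDim.x + threadIdx.x;
            var stride = gridDim.x * blockDim.x;
            for (var i = start; i < result.Length; i += stride)
            {
                result[i] = x[i] + y[i];
            }
        }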
        public static void TestSimpleMultiply()
        {
            for (var iter = 1; iter <= 3; ++iter)
            {
                Console.WriteLine("====> Test SimpleMultiply with Alea GPU C# AOT instance usage (#.{0}) <====", iter);

                var timer = Stopwatch.StartNew();
                var worker = Util.Worker;
                Console.WriteLine("GPU: {0}", worker.Device.Name);
                timer.Stop();
                Console.WriteLine("Step 1) Runtime setup                   {0} ms", timer.Elapsed.TotalMilliseconds);

                timer.Restart();
                using (var module = new InstanceUsageAOT(GPUModuleTarget.Worker(worker)))
                {
                    module.GPUForceLoad();
                    timer.Stop();
                    Console.WriteLine("Step 2+3) Compile and Load module       {0} ms", timer.Elapsed.TotalMilliseconds);

                    const int factor = 8;
                    var a = Util.RandomMatrix(100 * factor, 200 * factor);
                    var b = Util.RandomMatrix(200 * factor, 300 * factor);
                    var aRows = 100 * factor;
                    var bCols = 300 * factor;
                    var aCols_bRows = 200 * factor;
                    var gridDim = new dim3(Util.Divup(bCols, TileSize), Util.Divup(aRows, TileSize));
                    var blockDim = new dim3(TileSize, TileSize);
                    var lp = new LaunchParam(gridDim, blockDim);

                    using (var devA = worker.Malloc(a))
                    using (var devB = worker.Malloc(b))
                    using (var devC = worker.Malloc<float>(aRows * bCols))
                    {
                        timer.Restart();
                        module.GPULaunch(module.SimpleMultiplyKernel, lp, devA.Ptr, devB.Ptr, devC.Ptr, aRows, bCols, aCols_bRows);
                        worker.Synchronize();
                        timer.Stop();
                        Console.WriteLine("Kernel launch first time                {0} ms", timer.Elapsed.TotalMilliseconds);

                        const int repetitions = 50;
                        timer.Restart();
                        for (var i = 0; i < repetitions; ++i)
                        {
                            module.GPULaunch(module.SimpleMultiplyKernel, lp, devA.Ptr, devB.Ptr, devC.Ptr, aRows, bCols, aCols_bRows);
                        }
                        worker.Synchronize();
                        timer.Stop();
                        Console.WriteLine("Kernel launch average time              {0} ms", (timer.Elapsed.TotalMilliseconds / (float)repetitions));

                        var c = devC.Gather();
                        Util.VerifyResult(a, b, c, aRows, bCols, aCols_bRows);
                    }
                }
            }
        }
    public void Test()
    {
        const int n  = 32;
        var       lp = new LaunchParam(1, n);

        using (var outputs = GPUWorker.Malloc <int>(n))
        {
            this.MyGPULaunch(Kernel, lp, outputs.Ptr, 1, 3);
            Console.WriteLine("{0}", (outputs.Gather())[4]);
        }
    }
        //[/DynamicStartKernel]

        //[DynamicPrepareAndLaunchKernel]
        public void IntegrateNbodySystem(deviceptr<float4> newPos, deviceptr<float4> oldPos, 
                                         deviceptr<float4> vel, int numBodies, float deltaTime, 
                                         float softeningSquared, float damping, int blockSize)
        {
            var numBlocks = Alea.CUDA.Utilities.Common.divup(numBodies, blockSize);
            var numTiles = Alea.CUDA.Utilities.Common.divup(numBodies, blockSize);
            var sharedMemSize = blockSize * Operators.SizeOf<float4>();
            var lp = new LaunchParam(numBlocks, blockSize, sharedMemSize);
            GPULaunch(IntegrateBodies, lp, newPos, oldPos, vel, numBodies, deltaTime, 
                      softeningSquared, damping, numTiles);
        }
        //[/StaticStartKernel]

        //[StaticPrepareAndLaunchKernel]
        public void IntegrateNbodySystem(deviceptr <float4> newPos, deviceptr <float4> oldPos,
                                         deviceptr <float4> vel, int numBodies, float deltaTime,
                                         float softeningSquared, float damping)
        {
            var numBlocks = Alea.CUDA.Utilities.Common.divup(numBodies, _blockSize);
            var numTiles  = Alea.CUDA.Utilities.Common.divup(numBodies, _blockSize);
            var lp        = new LaunchParam(numBlocks, _blockSize);

            GPULaunch(IntegrateBodies, lp, newPos, oldPos, vel, numBodies, deltaTime, softeningSquared, damping,
                      numTiles);
        }
Example 28
        public override void Backward(Executor executor)
        {
            var ctx     = executor.Context;
            var indices = executor.GetTensor(Indices);
            var gradout = executor.GetGradient(Output);

            // GPU fast path for performance.
            if (ctx.Type == ContextType.Gpu && gradout.Layout.IsInnerChangeMostFullyPacked && indices.Layout.IsInnerChangeMostFullyPacked)
            {
                var embedDim   = EmbedDim;
                var batchSize  = (int)indices.Shape.Length;
                var threadSize = 256;

                // first set all to 0
                executor.AssignGradient(Weights, Fill(executor.GetTensor(Weights).Shape, ScalarOps.Conv <T>(0.0)));
                var dW = executor.GetGradient(Weights);

                // then use a single-block kernel to update it, because the batch size is usually
                // small while the embedding size is large
                var stream = ctx.ToGpuContext().Stream;
                var iPtr   = indices.Buffer.Ptr;

                // the following kernel runs in a single block, so no synchronization is needed;
                // it could be optimized further.

                if (typeof(T) == typeof(float))
                {
                    var dOPtr = gradout.Buffer.Ptr.Reinterpret <float>();
                    var dWPtr = dW.Buffer.Ptr.Reinterpret <float>();
                    var lp    = new LaunchParam(1, threadSize);
                    //Console.WriteLine($"{indices.Shape} {gradout.Shape} {dW.Shape}");
                    stream.Launch(() =>
                    {
                        for (var i = 0; i < batchSize; ++i)
                        {
                            var row = iPtr[i];

                            for (var k = threadIdx.x; k < embedDim; k += blockDim.x)
                            {
                                dWPtr[row * embedDim + k] += dOPtr[i * embedDim + k];
                            }
                        }
                    }, lp);

                    return;
                }

                throw new NotImplementedException();
            }
            else
            {
                executor.AssignGradient(Weights, TakeGrad(indices, gradout, EmbedSize));
            }
        }
Example 29
        private void mRunKernel( )
        {
            int kiLength   = 1000;
            var koArg1     = Enumerable.Range(0, kiLength).ToArray( );
            var koArg2     = Enumerable.Range(0, kiLength).ToArray( );
            var koExpected = koArg1.Zip(koArg2, (x, y) => x + y);
            var koResult   = new int[kiLength];
            var koLP       = new LaunchParam(4, 32); // Use 4 blocks with 32 threads per block

            this.voGpu.Launch(this.mKernel, koLP, koResult, koArg1, koArg2);
            Assert.That(koResult, Is.EqualTo(koExpected));
            MessageBox.Show("Result[600] = " + koResult[600].ToString( ));
        }
Example 30
        /// <summary>
        /// Private Constructor
        /// </summary>
        Camera()
        {
            try
            {
                // pre-allocate and initialize the variables and data structures for brightness correction
                imgGPU           = worker.Malloc <byte>(new byte[640 * 640]);
                meanImg          = worker.Malloc <float>(new float[640 * 640]);
                addReduce        = DeviceSumModuleF32.Default.Create(numPixels);
                correctionFactor = worker.Malloc <float>(640 * 640);
                float[] temp = new float[640 * 640];
                for (int i = 0; i < temp.Length; i++)
                {
                    temp[i] = 1;
                }
                correctionFactor.Scatter(temp);
                scalarOutput = worker.Malloc <float>(1);

                if (File.Exists("correctionFactor.dat"))
                {
                    FileStream stream = new FileStream("correctionFactor.dat", FileMode.Open);
                    byte[]     buffer = new byte[640 * 640 * 4];
                    stream.Read(buffer, 0, (int)Math.Min(buffer.Length, stream.Length));
                    for (int i = 0; i < 640 * 640; i++)
                    {
                        temp[i] = BitConverter.ToSingle(buffer, 4 * i);
                    }
                    stream.Close();

                    correctionFactor.Scatter(temp);
                }

                // initialize CUDA parameters
                var blockDims = new dim3(32, 32);
                var gridDims  = new dim3(Common.divup(640, blockDims.x), Common.divup(640, blockDims.y));
                lp = new LaunchParam(gridDims, blockDims);

                // set up the camera parameters and events
                provider = new IduleProviderCsCam(0);
                provider.Initialize();
                if (provider.IsConnected)
                {
                    provider.ImageTransaction += provider_ImageTransaction;
                    provider.Interrupt        += provider_Interrupt;
                    provider.Exception        += camera_Exception;
                    provider.WriteRegister(new NanEyeGSRegisterPayload(false, 0x05, true, 0, prescaler));
                    provider.WriteRegister(new NanEyeGSRegisterPayload(false, 0x06, true, 0, exposure));
                    ProcessingWrapper.pr[0].ReduceProcessing = true;
                }
            }
            catch (Exception ex) { OnError(ex.Message); }
        }
Example 31
        public static void col2im_ongpu(float[] dataCol,
                                        int channels, int height, int width,
                                        int ksize, int stride, int pad, float[] dataIm, int imStart = 0)
        {
            // One thread is launched per image element (channels * height * width);
            // each thread accumulates the column entries that map back to its pixel.
            int heightCol  = (height + 2 * pad - ksize) / stride + 1;
            int widthCol   = (width + 2 * pad - ksize) / stride + 1;
            int numKernels = channels * height * width;
            var lp         = new LaunchParam((numKernels + CudaUtils.BlockSize - 1) / CudaUtils.BlockSize, CudaUtils.BlockSize);

            Gpu.Default.Launch(col2im_gpu_kernel, lp, numKernels, dataCol, height, width, ksize, pad, stride,
                               heightCol, widthCol, dataIm, imStart);
        }
Example 32
        //[/parallelSquareKernel]

        //[parallelSquareLaunch]
        static double[] SquareGPU(double[] inputs)
        {
            var worker = Worker.Default;
            using (var dInputs = worker.Malloc(inputs))
            using (var dOutputs = worker.Malloc<double>(inputs.Length))
            {
                const int blockSize = 256;
                var numSm = worker.Device.Attributes.MULTIPROCESSOR_COUNT;
                var gridSize = Math.Min(16 * numSm, Common.divup(inputs.Length, blockSize));
                var lp = new LaunchParam(gridSize, blockSize);
                worker.Launch(SquareKernel, lp, dOutputs.Ptr, dInputs.Ptr, inputs.Length);
                return dOutputs.Gather();
            }
        }
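The `SquareKernel` that `SquareGPU` launches sits in the `[parallelSquareKernel]` region, which is not reproduced above. A grid-stride kernel consistent with this launch (a sketch of the usual Alea sample, not necessarily the exact source) looks like:

        // Sketch of SquareKernel: each thread squares elements in a grid-stride loop,
        // so the capped grid size still covers the whole input.
        static void SquareKernel(deviceptr<double> outputs, deviceptr<double> inputs, int n)
        {
            var start  = blockIdx.x * blockDim.x + threadIdx.x;
            var stride = gridDim.x * blockDim.x;
            for (var i = start; i < n; i += stride)
            {
                outputs[i] = inputs[i] * inputs[i];
            }
        }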
    public static void MyGPULaunch <T1, T2, T3>(
        this ILGPUModule module,
        Action <T1, T2, T3> kernelD, LaunchParam lp,
        T1 arg1, T2 arg2, T3 arg3)
    {
        // get the kernel object by method name
        var kernel = module.GPUEntities.GetKernel(kernelD.Method.Name).Kernel;
        // create parameter list (which is FSharpList)
        var parameterArray = new object[] { arg1, arg2, arg3 };
        var parameterList  = ListModule.OfArray(parameterArray);

        // use untyped LaunchRaw to launch the kernel
        kernel.LaunchRaw(lp, parameterList);
    }
Example 34
        private void mRunMatrixMultiplyGPU3( )
        {
            Stopwatch   koSW = new Stopwatch( );
            LaunchParam koLP;

            this.mClearMatrix(this.vfC);

            koSW.Start( );
            koLP = new LaunchParam(new dim3(2, 2), new dim3(25, 25));
            this.voGpu.Launch(MatrixMulCuda.MMultiplyKernel3, koLP, this.vfC, this.vfA, this.vfB);
            koSW.Stop( );
            MessageBox.Show("C[200][200] = " + this.vfC[200][200].ToString( ) + Environment.NewLine +
                            "Time Elapsed = " + koSW.ElapsedMilliseconds.ToString( ));
        }