示例#1
0
        // ReSharper disable once SuggestBaseTypeForParameter
        private static void KernelSequentialReduceIdleThreads <T>(deviceptr <T> array, int length, T[] result, Func <T, T, T> op)
        {
            var shared = __shared__.ExternArray <T>();

            var tid = threadIdx.x;
            var bid = blockIdx.x;
            var gid = 2 * blockDim.x * bid + tid;

            shared[tid] = (gid < length && gid + blockDim.x < length)
                ? op(array[gid], array[gid + blockDim.x])
                : array[gid];

            DeviceFunction.SyncThreads();

            for (int s = blockDim.x / 2; s > 0; s >>= 1)
            {
                if (tid < s && gid + s < length)
                {
                    shared[tid] = op(shared[tid], shared[tid + s]);
                }

                DeviceFunction.SyncThreads();
            }

            if (tid == 0)
            {
                result[bid] = shared[0];
            }
        }
示例#2
0
        // ReSharper disable once SuggestBaseTypeForParameter
        private static void KernelInterleavedAccess <T>(deviceptr <T> array, int length, T[] result, Func <T, T, T> op)
        {
            var shared = __shared__.ExternArray <T>();

            var tid = threadIdx.x;
            var bid = blockIdx.x;
            var gid = blockDim.x * bid + tid;

            if (gid < length)
            {
                shared[tid] = array[gid];
            }

            DeviceFunction.SyncThreads();

            for (var s = 1; s < blockDim.x; s *= 2)
            {
                if (tid % (2 * s) == 0 && gid + s < length)
                {
                    shared[tid] = op(shared[tid], shared[tid + s]);
                }

                DeviceFunction.SyncThreads();
            }

            if (tid == 0)
            {
                result[bid] = shared[0];
            }
        }
示例#3
0
        public void ComputeValue(deviceptr <double> results, deviceptr <double> points, int numSims)
        {
            // Determine thread ID
            var bid  = blockIdx.x;
            var tid  = blockIdx.x * blockDim.x + threadIdx.x;
            var step = gridDim.x * blockDim.x;

            // Shift the input/output pointers
            var pointx = points + tid;
            var pointy = pointx + numSims;

            var pointsInside = 0;

            for (var i = tid; i < numSims; i += step, pointx += step, pointy += step)
            {
                var x       = pointx[0];
                var y       = pointy[0];
                var l2norm2 = x * x + y * y;
                if (l2norm2 < 1.0)
                {
                    pointsInside++;
                }
            }

            // Reduce within the block
            pointsInside = ReduceSum(pointsInside);

            // Store the result
            if (threadIdx.x == 0)
            {
                results[bid] = pointsInside;
            }
        }
示例#4
0
        private static void AleaKernel(
            deviceptr <Real> mSquaredDistances,
            deviceptr <Real> mCoordinates,
            int c,
            int n)
        {
            var j = blockIdx.x * blockDim.x + threadIdx.x;

            if (j < n)
            {
                for (int i = 0; i < n; ++i)
                {
                    Real dist = 0;

                    for (int k = 0; k != c; ++k)
                    {
                        var coord1 = mCoordinates[k * n + i];
                        var coord2 = mCoordinates[k * n + j];
                        var diff   = coord1 - coord2;

                        dist += diff * diff;
                    }

                    mSquaredDistances[i * n + j] = dist;
                }
            }
        }
示例#5
0
        // Link: https://mail.google.com/mail/u/0/#inbox/1598d0b3b2850009?projector=1
        // I'm sure memory management is far from optimal!
        // Fixed Block and Thread!
        internal static T ComputeGpu5 <T>(T[] array, Func <T, T, T> op)
        {
            const int dimGrid  = 256;
            const int blockDim = 256;

            var gpu = Gpu.Default;

            var inputLength = array.Length;
            var inputMemory = gpu.ArrayGetMemory(array, true, false);
            var inputDevPtr = new deviceptr <T>(inputMemory.Handle);

            var resultMemory = gpu.AllocateDevice <T>(dimGrid);
            var resultDevPtr = new deviceptr <T>(resultMemory.Handle);

            gpu.Launch(() => KernelSequentialReduceIdleThreadsWarpMultiple(inputDevPtr, inputLength, resultDevPtr, op), new LaunchParam(dimGrid, blockDim));

            inputDevPtr = resultDevPtr;

            resultMemory = gpu.AllocateDevice <T>(dimGrid);
            resultDevPtr = new deviceptr <T>(resultMemory.Handle);

            gpu.Launch(() => KernelSequentialReduceIdleThreadsWarpMultiple(inputDevPtr, dimGrid, resultDevPtr, op), new LaunchParam(1, blockDim));

            return(Gpu.CopyToHost(resultMemory)[0]);
        }
示例#6
0
        // Alea Parallel.For!
        internal static Image Render1(Bounds bounds)
        {
            bounds.AdjustAspectRatio();

            var width  = bounds.Width;
            var height = bounds.Height;
            var scale  = (bounds.XMax - bounds.XMin) / width;

            var resultLength = ColorComponents * width * height;
            var resultMemory = Gpu.Default.AllocateDevice <byte>(resultLength);
            var resultDevPtr = new deviceptr <byte>(resultMemory.Handle);

            Gpu.Default.For(0, width * height, i =>
            {
                var x      = i % width;
                var y      = i / width;
                var offset = ColorComponents * i;

                if (offset < resultLength)
                {
                    // ReSharper disable once PossibleLossOfFraction
                    var c = new Complex
                    {
                        Real      = bounds.XMin + x * scale,
                        Imaginary = bounds.YMin + y * scale,
                    };

                    ComputeMandelbrotAtOffset(resultDevPtr, c, offset);
                }
            });

            return(BitmapUtility.FromByteArray(Gpu.CopyToHost(resultMemory), width, height));
        }
示例#7
0
        static void Filter_Symmetric_Kernel(int width, int height, deviceptr <float> inputImg, deviceptr <float> outputImg, deviceptr <float> filter, int filterSize)
        {
            var x = blockIdx.x * blockDim.x + threadIdx.x;
            var y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < 0 || x >= width || y < 0 || y >= height)
            {
                return;
            }

            int   filterWidth = 2 * filterSize + 1;
            float sum = 0;
            int   xx, yy;

            for (int i = -filterSize; i <= filterSize; i++)
            {
                for (int j = -filterSize; j <= filterSize; j++)
                {
                    xx = x + j;
                    yy = y + i;
                    if (xx >= 0 && xx < width && yy >= 0 && yy < height)
                    {
                        sum += filter[(i + filterSize) * filterWidth + (j + filterSize)] * inputImg[yy * width + xx];
                    }
                }
            }
            outputImg[y * width + x] = sum;
        }
        //[/GLdescription]

        //[LockPositions]
        void LockPos(Del f)
        {
            CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsResourceSetMapFlags(_resources[0],
                (uint) CUDAInterop.CUgraphicsMapResourceFlags_enum.CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY));
            CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsResourceSetMapFlags(_resources[1],
                (uint) CUDAInterop.CUgraphicsMapResourceFlags_enum.CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD));
            CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsMapResourcesEx(2u, _resources, IntPtr.Zero));

            var bytes = IntPtr.Zero;
            var handle0 = IntPtr.Zero;
            var handle1 = IntPtr.Zero;
            unsafe
            {
                CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsResourceGetMappedPointer(&handle0, &bytes, 
                                                                                      _resources[0]));
                CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsResourceGetMappedPointer(&handle1, &bytes, 
                                                                                      _resources[1]));
            }
            var pos0 = new deviceptr<float4>(handle0);
            var pos1 = new deviceptr<float4>(handle1);
            try
            {
                f(pos0, pos1);
            }
            finally
            {
                CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsUnmapResourcesEx(2u, _resources, IntPtr.Zero));
            }
        }
示例#9
0
        //[/GLdescription]

        //[LockPositions]
        void LockPos(Del f)
        {
            CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsResourceSetMapFlags(_resources[0],
                                                                             (uint)CUDAInterop.CUgraphicsMapResourceFlags_enum.CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY));
            CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsResourceSetMapFlags(_resources[1],
                                                                             (uint)CUDAInterop.CUgraphicsMapResourceFlags_enum.CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD));
            CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsMapResourcesEx(2u, _resources, IntPtr.Zero));

            var bytes   = IntPtr.Zero;
            var handle0 = IntPtr.Zero;
            var handle1 = IntPtr.Zero;

            unsafe
            {
                CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsResourceGetMappedPointer(&handle0, &bytes,
                                                                                      _resources[0]));
                CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsResourceGetMappedPointer(&handle1, &bytes,
                                                                                      _resources[1]));
            }
            var pos0 = new deviceptr <float4>(handle0);
            var pos1 = new deviceptr <float4>(handle1);

            try
            {
                f(pos0, pos1);
            }
            finally
            {
                CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsUnmapResourcesEx(2u, _resources, IntPtr.Zero));
            }
        }
示例#10
0
        private AdaptiveLBP(Size size)
        {
            this.size = size;

            // initialize data structures to avoid reallocating with every call
            hist          = new int[numSubuniformPatterns * numVarBins];
            hist2         = new int[numSubuniformPatterns * numVarBins];
            lbpImageGPU   = worker.Malloc <short>(size.Width * size.Height);
            varImageGPU   = worker.Malloc <short>(size.Width * size.Height);
            histGPU       = worker.Malloc <int>(hist.Length);
            floatImageGPU = worker.Malloc <float>(size.Width * size.Height);

            // precompute the subuniform bin for each LBP pattern, and push it to the GPU
            subuniformBins = new short[(short)Math.Pow(2, numNeighbors)];
            for (int i = 0; i < subuniformBins.Length; i++)
            {
                short bin = GetPatternNum(i);
                subuniformBins[i] = bin;
            }
            subuniformBinsGPU = worker.Malloc(subuniformBins);

            neighborCoordinateX = new float[numNeighbors];
            neighborCoordinateY = new float[numNeighbors];
            for (int i = 0; i < numNeighbors; i++)
            {
                float xx = (float)Math.Cos(2.0 * PI * (double)i / (double)numNeighbors);
                float yy = (float)Math.Sin(2.0 * PI * (double)i / (double)numNeighbors);
                neighborCoordinateX[i] = xx;
                neighborCoordinateY[i] = yy;
            }
            neighborCoordinateXGPU = worker.Malloc(neighborCoordinateX);
            neighborCoordinateYGPU = worker.Malloc(neighborCoordinateY);

            varBinsGPU = worker.Malloc(varBins);

            // initialize CUDA parameters
            var blockDims = new dim3(8, 8);
            var gridDims  = new dim3(Common.divup(size.Width, blockDims.x), Common.divup(size.Height, blockDims.y));

            lp = new LaunchParam(gridDims, blockDims);

            // create filters
            for (int i = 0; i < numScales; i++)
            {
                float[,] filter = LaplacianOfGaussian.Generate(i + 1);
                filters[i]      = Utils.Flatten(filter);
                filtersGPU[i]   = worker.Malloc(filters[i]);
                filterSizes[i]  = (filter.GetLength(0) - 1) / 2;
            }

            // allocate space for scale space images
            deviceptr <float>[] tempPointers = new deviceptr <float> [numScales];
            for (int i = 0; i < numScales; i++)
            {
                scaledImages[i] = worker.Malloc <float>(size.Width * size.Height);
                tempPointers[i] = scaledImages[i].Ptr;
            }
            scaledImagePointers = worker.Malloc(tempPointers);
            pixelScaleImage     = worker.Malloc <short>(size.Width * size.Height);
        }
示例#11
0
        private void TraceKernel(int index, deviceptr <ColorRaw> image, int width)
        {
            var x = index % width;
            var y = index / width;

            var color    = ColorRaw.FromRgb(0xed, 0x95, 0x64);
            var maxDepth = float.MinValue;

            for (var i = 0; i < _spheres.Length; i++)
            {
                // Todo: Get rid of 'n' until we have a way to properly shade!
                float n;
                var   sphere = _spheres[i];
                var   depth  = sphere.GetIntersection(x, y, out n);

                if (depth > maxDepth)
                {
                    color = ColorRaw.FromRgb(
                        (byte)(sphere.Color.R * n),
                        (byte)(sphere.Color.G * n),
                        (byte)(sphere.Color.B * n));

                    maxDepth = depth;
                }
            }

            image[index] = color;
        }
示例#12
0
        // Alea Parallel.For!
        internal static Image Render1(Bitmap image, ConvolutionFilter filter)
        {
            var gpu = Gpu.Default;

            var width = image.Width;
            var array = BitmapUtility.ToColorArray(image);

            var mFilter = filter.Filter;
            var mFactor = filter.Factor;
            var mOffset = filter.Offset;

            var inputMemory = gpu.ArrayGetMemory(array, true, false);
            var inputDevPtr = new deviceptr <ColorRaw>(inputMemory.Handle);

            var resultLength = array.Length;
            var resultMemory = Gpu.Default.AllocateDevice <ColorRaw>(resultLength);
            var resultDevPtr = new deviceptr <ColorRaw>(resultMemory.Handle);

            gpu.For(0, resultLength, i =>
            {
                if (i < resultLength)
                {
                    ComputeEdgeDetectFilter0AtOffsetNapron(inputDevPtr, resultDevPtr, resultLength, mFilter, mFactor, mOffset, i, width);
                }
            });

            return(BitmapUtility.FromColorArray(Gpu.CopyToHost(resultMemory), image.Width, image.Height));
        }
示例#13
0
        // Fixed Block Size!
        internal static Image Render3(Bitmap image, ConvolutionFilter filter)
        {
            var gpu = Gpu.Default;

            var width = image.Width;
            var array = BitmapUtility.ToColorArray(image);

            var mFilter = filter.Filter;
            var mFactor = filter.Factor;
            var mOffset = filter.Offset;

            var inputMemory = gpu.ArrayGetMemory(array, true, false);
            var inputDevPtr = new deviceptr <ColorRaw>(inputMemory.Handle);

            var resultLength = array.Length;
            var resultMemory = Gpu.Default.AllocateDevice <ColorRaw>(resultLength);
            var resultDevPtr = new deviceptr <ColorRaw>(resultMemory.Handle);

            var lp = new LaunchParam(256, 256);

            gpu.Launch(() =>
            {
                var i = blockDim.x * blockIdx.x + threadIdx.x;

                while (i < resultLength)
                {
                    ComputeEdgeDetectFilter0AtOffsetNapron(inputDevPtr, resultDevPtr, resultLength, mFilter, mFactor, mOffset, i, width);
                    i += blockDim.x * gridDim.x;
                }
            }, lp);

            return(BitmapUtility.FromColorArray(Gpu.CopyToHost(resultMemory), image.Width, image.Height));
        }
示例#14
0
        public void Upsweep(deviceptr <T> dValues, deviceptr <int> dRanges, deviceptr <T> dRangeTotals)
        {
            // Each block is processing a range.
            var range  = blockIdx.x;
            var tid    = threadIdx.x;
            var rangeX = dRanges[range];
            var rangeY = dRanges[range + 1];

            // Loop through all elements in the interval, adding up values.
            // There is no need to synchronize until we perform the multi-reduce.
            var reduced = _initFunc();
            var index   = rangeX + tid;

            while (index < rangeY)
            {
                reduced = _reductionOp.Invoke(reduced, _transform.Invoke(dValues[index]));
                index  += _numThreads;
            }

            // Get the total.
            var total = _multiReduce.Invoke(tid, reduced);

            if (tid == 0)
            {
                dRangeTotals[range] = total;
            }
        }
示例#15
0
 public void Mult(int wA, int wB, int hC, deviceptr<double> A, deviceptr<double> B, deviceptr<double> C)
 {
     var block = new dim3(BlockSize, BlockSize);
     var grid = new dim3(wB/block.x, hC/block.y);
     var lp = new LaunchParam(grid, block);
     GPULaunch(Kernel, lp, wA, wB, A, B, C);
 }
示例#16
0
        // Custom!
        internal static Image Render2(Bounds bounds)
        {
            bounds.AdjustAspectRatio();

            var width  = bounds.Width;
            var height = bounds.Height;
            var scale  = (bounds.XMax - bounds.XMin) / width;

            var resultLength = ColorComponents * width * height;
            var resultMemory = Gpu.Default.AllocateDevice <byte>(resultLength);
            var resultDevPtr = new deviceptr <byte>(resultMemory.Handle);

            var lp = ComputeLaunchParameters(bounds);

            Gpu.Default.Launch(() =>
            {
                var i      = blockDim.x * blockIdx.x + threadIdx.x;
                var x      = i % width;
                var y      = i / width;
                var offset = ColorComponents * i;

                if (offset < resultLength)
                {
                    var c = new Complex
                    {
                        Real      = bounds.XMin + x * scale,
                        Imaginary = bounds.YMin + y * scale,
                    };

                    ComputeMandelbrotAtOffset(resultDevPtr, c, offset);
                }
            }, lp);

            return(BitmapUtility.FromByteArray(Gpu.CopyToHost(resultMemory), width, height));
        }
示例#17
0
        public void Upsweep(deviceptr <T> dValues1, deviceptr <T> dValues2, deviceptr <int> dRanges, deviceptr <T> dRangeTotals)
        {
            var block  = blockIdx.x;
            var tid    = threadIdx.x;
            var rangeX = dRanges[block];
            var rangeY = dRanges[block + 1];

            // Loop through all elements in the interval, adding up values.
            // There is no need to synchronize until we perform the multireduce.
            var sum   = default(T);
            var index = rangeX + tid;

            while (index < rangeY)
            {
                sum    = _add(sum, _mult(dValues1[index], dValues2[index]));
                index += _plan.NumThreads;
            }

            // Get the total.
            var total = _multiReduce(tid, sum);

            if (tid == 0)
            {
                dRangeTotals[block] = total;
            }
        }
示例#18
0
        public Tensor <T> Cast <T>()
        {
            var ptr    = new deviceptr <T>(Ptr);
            var buffer = new Buffer <T>(Device, Memory, Layout, ptr);

            return(new Tensor <T>(buffer));
        }
示例#19
0
        public Buffer(Device device, BufferMemory memory, Layout layout, deviceptr <T> ptr)
        {
            Device = device;
            Memory = memory;
            Layout = layout;
            Ptr    = ptr;

            switch (Device.Type)
            {
            case DeviceType.Gpu:
                _hptr = null;
                var dptr = ptr;
                RawReader = i => dptr.LongGet(i);
                RawWriter = (i, value) => dptr.LongSet(i, value);
                break;

            case DeviceType.Cpu:
                var hptr = new HostPtrAccessor <T>(memory, ptr);
                _hptr     = hptr;
                RawReader = hptr.Get;
                RawWriter = hptr.Set;
                break;

            default:
                throw new ArgumentOutOfRangeException();
            }
        }
        public void IntegrateBodies(deviceptr<float4> newPos, deviceptr<float4> oldPos, deviceptr<float4> vel,
            int numBodies, float deltaTime, float softeningSquared, float damping, int numTiles)
        {
            var index = threadIdx.x + blockIdx.x*_blockSize;

            if (index >= numBodies) return;
            var position = oldPos[index];
            var accel = ComputeBodyAccel(softeningSquared, position, oldPos, numTiles);

            // acceleration = force \ mass
            // new velocity = old velocity + acceleration*deltaTime
            // note we factor out the body's mass from the equation, here and in bodyBodyInteraction 
            // (because they cancel out).  Thus here force = acceleration
            var velocity = vel[index];

            velocity.x = velocity.x + accel.x*deltaTime;
            velocity.y = velocity.y + accel.y*deltaTime;
            velocity.z = velocity.z + accel.z*deltaTime;

            velocity.x = velocity.x*damping;
            velocity.y = velocity.y*damping;
            velocity.z = velocity.z*damping;

            // new position = old position + velocity*deltaTime
            position.x = position.x + velocity.x*deltaTime;
            position.y = position.y + velocity.y*deltaTime;
            position.z = position.z + velocity.z*deltaTime;

            // store new position and velocity
            newPos[index] = position;
            vel[index] = velocity;
        }
示例#21
0
            /// <summary>
            /// Checks whether or not the Cuda features are currently supported
            /// </summary>
            public static bool IsGpuAccelerationSupported()
            {
                try
                {
                    // CUDA test
                    Gpu gpu = Gpu.Default;
                    if (gpu == null)
                    {
                        return(false);
                    }
                    if (!Dnn.IsAvailable)
                    {
                        return(false);                  // cuDNN
                    }
                    using (DeviceMemory <float> sample_gpu = gpu.AllocateDevice <float>(1024))
                    {
                        deviceptr <float> ptr = sample_gpu.Ptr;
                        void Kernel(int i) => ptr[i] = i;

                        Alea.Parallel.GpuExtension.For(gpu, 0, 1024, Kernel); // JIT test
                        float[] sample = Gpu.CopyToHost(sample_gpu);
                        return(Enumerable.Range(0, 1024).Select <int, float>(i => i).ToArray().ContentEquals(sample));
                    }
                }
                catch
                {
                    // Missing .dll or other errors
                    return(false);
                }
            }
        public void IntegrateBodies(deviceptr <float4> newPos, deviceptr <float4> oldPos, deviceptr <float4> vel,
                                    int numBodies, float deltaTime, float softeningSquared, float damping, int numTiles)
        {
            var index = threadIdx.x + blockIdx.x * _blockSize;

            if (index >= numBodies)
            {
                return;
            }
            var position = oldPos[index];
            var accel    = ComputeBodyAccel(softeningSquared, position, oldPos, numTiles);

            // acceleration = force \ mass
            // new velocity = old velocity + acceleration*deltaTime
            // note we factor out the body's mass from the equation, here and in bodyBodyInteraction
            // (because they cancel out).  Thus here force = acceleration
            var velocity = vel[index];

            velocity.x = velocity.x + accel.x * deltaTime;
            velocity.y = velocity.y + accel.y * deltaTime;
            velocity.z = velocity.z + accel.z * deltaTime;

            velocity.x = velocity.x * damping;
            velocity.y = velocity.y * damping;
            velocity.z = velocity.z * damping;

            // new position = old position + velocity*deltaTime
            position.x = position.x + velocity.x * deltaTime;
            position.y = position.y + velocity.y * deltaTime;
            position.z = position.z + velocity.z * deltaTime;

            // store new position and velocity
            newPos[index] = position;
            vel[index]    = velocity;
        }
示例#23
0
        public void ComputeValue(deviceptr<double> results, deviceptr<double> points, int numSims)
        {
            // Determine thread ID
            var bid = blockIdx.x;
            var tid = blockIdx.x * blockDim.x + threadIdx.x;
            var step = gridDim.x * blockDim.x;

            // Shift the input/output pointers
            var pointx = points + tid;
            var pointy = pointx + numSims;

            var pointsInside = 0;

            for (var i = tid; i < numSims; i += step, pointx += step, pointy += step)
            {
                var x = pointx[0];
                var y = pointy[0];
                var l2norm2 = x * x + y * y;
                if (l2norm2 < 1.0) pointsInside++;
            }

            // Reduce within the block
            pointsInside = ReduceSum(pointsInside);

            // Store the result
            if (threadIdx.x == 0) results[bid] = pointsInside;
        }
示例#24
0
        public GpuSpaceBufferContext(uint buffer, DiscreteBounds bounds)
        {
            CUDAInterop.cuGLRegisterBufferObject(buffer);

            _buffer        = buffer;
            _devicePointer = new deviceptr <VoxelFace>(GetDevicePointer());
            _bounds        = bounds;
        }
示例#25
0
        public void Apply(int numSystems, int n, deviceptr <double> dl, deviceptr <double> dd, deviceptr <double> du,
                          deviceptr <double> db, deviceptr <double> dx)
        {
            var sharedSize = 9 * n * sizeof(double);
            var lp         = new LaunchParam(numSystems, n, sharedSize);

            this.GPULaunch(this.Kernel, lp, n, dl, dd, du, db, dx);
        }
示例#26
0
        static void Truncate_Kernel(int width, deviceptr <float> input, deviceptr <byte> output)
        {
            var x = blockIdx.x * blockDim.x + threadIdx.x;
            var y = blockIdx.y * blockDim.y + threadIdx.y;
            var p = y * width + x;

            output[p] = (byte)input[p];
        }
示例#27
0
        static void UpdateCorrectionFactor_Kernel(int width, deviceptr <float> meanImg, deviceptr <float> correctionFactor, float mean)
        {
            var x = blockIdx.x * blockDim.x + threadIdx.x;
            var y = blockIdx.y * blockDim.y + threadIdx.y;
            var p = y * width + x;

            correctionFactor[p] = mean / meanImg[p];
        }
示例#28
0
        static void RunningMean_Kernel(int width, deviceptr <byte> image, deviceptr <float> meanImg, float numCalibrationSamples)
        {
            var x = blockIdx.x * blockDim.x + threadIdx.x;
            var y = blockIdx.y * blockDim.y + threadIdx.y;
            var p = y * width + x;

            meanImg[p] = meanImg[p] * numCalibrationSamples / (numCalibrationSamples + 1) + image[p] * 1.0f / (numCalibrationSamples + 1);
        }
示例#29
0
        private static void AleaKernelConstants(
            deviceptr <Real> mSquaredDistances,
            deviceptr <Real> mCoordinates,
            Constant <int> c,
            int n,
            int pitch)
        {
            // Same as CudaKernelOptimised2, but the number of coordinates is given as a meta-constant.
            // Also, we write the results as float2.

            var shared       = DeviceFunction.AddressOfArray(__shared__.ExternArray <Real>());
            var coordinatesI = shared.Ptr(0);
            var coordinatesJ = shared.Ptr(c.Value * blockDim.x);

            var bI = blockIdx.y * blockDim.x;
            var bJ = blockIdx.x * blockDim.x;

            for (int k = 0; k != c.Value; ++k)
            {
                if (bI + threadIdx.x < n)
                {
                    coordinatesI[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bI + threadIdx.x];
                }

                if (bJ + threadIdx.x < n)
                {
                    coordinatesJ[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bJ + threadIdx.x];
                }
            }

            DeviceFunction.SyncThreads();

            var line = threadIdx.x / (blockDim.x / 2);
            var tid  = threadIdx.x % (blockDim.x / 2);

            if (bJ + tid * 2 < n)
            {
                var coordinatesJ2 = coordinatesJ.Reinterpret <Real2>();

                for (int i = line; i < blockDim.x && bI + i < n; i += 2)
                {
                    var dist = default(Real2);

                    for (int k = 0; k != c.Value; ++k)
                    {
                        var coord1 = coordinatesI[k * blockDim.x + i];
                        var coord2 = coordinatesJ2[(k * blockDim.x / 2) + tid];
                        var diff   = new Real2(coord1 - coord2.x, coord1 - coord2.y);

                        dist.x += diff.x * diff.x;
                        dist.y += diff.y * diff.y;
                    }

                    var dst = mSquaredDistances.Ptr((bI + i) * pitch + bJ).Reinterpret <Real2>();
                    dst[tid] = dist;
                }
            }
        }
示例#30
0
        private static void AleaKernelFloat2(
            deviceptr <Real> mSquaredDistances,
            deviceptr <Real> mCoordinates,
            int c,
            int n,
            int pitch)
        {
            // Same as KernelSharedMemory, but one thread does two element in one by using float2 reads.

            var shared       = DeviceFunction.AddressOfArray(__shared__.ExternArray <Real>());
            var coordinatesI = shared.Ptr(0);
            var coordinatesJ = shared.Ptr(c * blockDim.x);

            var bI = blockIdx.y * blockDim.x;
            var bJ = blockIdx.x * blockDim.x;

            for (int k = 0; k != c; ++k)
            {
                if (bI + threadIdx.x < n)
                {
                    coordinatesI[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bI + threadIdx.x];
                }

                if (bJ + threadIdx.x < n)
                {
                    coordinatesJ[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bJ + threadIdx.x];
                }
            }

            DeviceFunction.SyncThreads();

            var line = threadIdx.x / (blockDim.x / 2);
            var tid  = threadIdx.x % (blockDim.x / 2);

            if (bJ + tid * 2 < n)
            {
                var coordinatesJ2 = coordinatesJ.Reinterpret <Real2>();

                for (int i = line; i < blockDim.x && bI + i < n; i += 2)
                {
                    Real dist0 = 0;
                    Real dist1 = 0;

                    for (int k = 0; k != c; ++k)
                    {
                        var coord1 = coordinatesI[k * blockDim.x + i];
                        var coord2 = coordinatesJ2[(k * blockDim.x / 2) + tid];
                        var diff   = new Real2(coord1 - coord2.x, coord1 - coord2.y);

                        dist0 += diff.x * diff.x;
                        dist1 += diff.y * diff.y;
                    }

                    mSquaredDistances[(bI + i) * pitch + (bJ + 2 * tid + 0)] = dist0;
                    mSquaredDistances[(bI + i) * pitch + (bJ + 2 * tid + 1)] = dist1;
                }
            }
        }
 static void SquareKernel(deviceptr<double> outputs, deviceptr<double> inputs, int n)
 {
     var start = blockIdx.x * blockDim.x + threadIdx.x;
     var stride = gridDim.x * blockDim.x;
     for (var i = start; i < n; i += stride)
     {
         outputs[i] = inputs[i] * inputs[i];
     }
 }
示例#32
0
        internal Image Trace(int width, int height)
        {
            var resultMemory = Gpu.Default.AllocateDevice <ColorRaw>(width * height);
            var resultDevPtr = new deviceptr <ColorRaw>(resultMemory.Handle);

            Gpu.Default.For(0, width * height, i => TraceKernel(i, resultDevPtr, width));

            return(BitmapUtility.FromColorArray(Gpu.CopyToHost(resultMemory), width, height));
        }
        //[/transformKernel]

        //[transformGPUDevice]
        public void Apply(int n, deviceptr <T> x, deviceptr <T> y, deviceptr <T> z)
        {
            const int blockSize = 256;
            var       numSm     = this.GPUWorker.Device.Attributes.MULTIPROCESSOR_COUNT;
            var       gridSize  = Math.Min(16 * numSm, Common.divup(n, blockSize));
            var       lp        = new LaunchParam(gridSize, blockSize);

            GPULaunch(Kernel, lp, n, x, y, z);
        }
示例#34
0
 Public MyKernel(deviceptr<int> Data)
 {
   var start = blockIdx.x * blockDim.x + threadIdx.x;
   int ind = threadIdx.x;
   for (int i=0;i<100;i++)
   {
     //Kernel Code here
   }
 }
        public void ClassifyVoxel(deviceptr <int3> d_gridIdx, deviceptr <float3> d_voxelV, deviceptr <int> d_voxelVerts,
                                  deviceptr <int> d_voxelOccupied, deviceptr <float3> d_samplePts, int sampleLength)
        {
            int blockId = blockIdx.y * gridDim.x + blockIdx.x; //block在grid中的位置
            int i       = blockId * blockDim.x + threadIdx.x;  //线程索引

            // compute 3d index in the grid
            int3 gridPos = calcGridPos(i, constGridSize.Value);

            d_gridIdx[i] = gridPos;
            float3 p = new float3();

            p.x = constBasePoint.Value.x + gridPos.x * constVoxelSize.Value.x;
            p.y = constBasePoint.Value.y + gridPos.y * constVoxelSize.Value.y;
            p.z = constBasePoint.Value.z + gridPos.z * constVoxelSize.Value.z;

            // compute all vertices
            d_voxelV[i * 8]     = p;
            d_voxelV[i * 8 + 1] = CreateFloat3(constVoxelSize.Value.x + p.x, 0 + p.y, 0 + p.z);
            d_voxelV[i * 8 + 2] = CreateFloat3(constVoxelSize.Value.x + p.x, constVoxelSize.Value.y + p.y, 0 + p.z);
            d_voxelV[i * 8 + 3] = CreateFloat3(0 + p.x, constVoxelSize.Value.y + p.y, 0 + p.z);
            d_voxelV[i * 8 + 4] = CreateFloat3(0 + p.x, 0 + p.y, constVoxelSize.Value.z + p.z);
            d_voxelV[i * 8 + 5] = CreateFloat3(constVoxelSize.Value.x + p.x, 0 + p.y, constVoxelSize.Value.z + p.z);
            d_voxelV[i * 8 + 6] = CreateFloat3(constVoxelSize.Value.x + p.x, constVoxelSize.Value.y + p.y, constVoxelSize.Value.z + p.z);
            d_voxelV[i * 8 + 7] = CreateFloat3(0 + p.x, constVoxelSize.Value.y + p.y, constVoxelSize.Value.z + p.z);

            // compute cube value of each vertex
            float d0 = ComputeValue(d_samplePts, d_voxelV[i * 8], sampleLength);
            float d1 = ComputeValue(d_samplePts, d_voxelV[i * 8 + 1], sampleLength);
            float d2 = ComputeValue(d_samplePts, d_voxelV[i * 8 + 2], sampleLength);
            float d3 = ComputeValue(d_samplePts, d_voxelV[i * 8 + 3], sampleLength);
            float d4 = ComputeValue(d_samplePts, d_voxelV[i * 8 + 4], sampleLength);
            float d5 = ComputeValue(d_samplePts, d_voxelV[i * 8 + 5], sampleLength);
            float d6 = ComputeValue(d_samplePts, d_voxelV[i * 8 + 6], sampleLength);
            float d7 = ComputeValue(d_samplePts, d_voxelV[i * 8 + 7], sampleLength);

            // check their status
            int cubeindex;

            cubeindex  = Compact(d0, constIsovalue.Value);
            cubeindex += Compact(d1, constIsovalue.Value) * 2;
            cubeindex += Compact(d2, constIsovalue.Value) * 4;
            cubeindex += Compact(d3, constIsovalue.Value) * 8;
            cubeindex += Compact(d4, constIsovalue.Value) * 16;
            cubeindex += Compact(d5, constIsovalue.Value) * 32;
            cubeindex += Compact(d6, constIsovalue.Value) * 64;
            cubeindex += Compact(d7, constIsovalue.Value) * 128;

            //find out the number of vertices in each voxel
            int numVerts = verticesTable[cubeindex];

            d_voxelVerts[i] = numVerts;
            if (numVerts > 0)
            {
                d_voxelOccupied[i] = 1;
            }
        }
        //[/StaticStartKernel]

        //[StaticPrepareAndLaunchKernel]
        public void IntegrateNbodySystem(deviceptr<float4> newPos, deviceptr<float4> oldPos, 
                                         deviceptr<float4> vel, int numBodies, float deltaTime, 
                                         float softeningSquared, float damping)
        {
            var numBlocks = Alea.CUDA.Utilities.Common.divup(numBodies, _blockSize);
            var numTiles = Alea.CUDA.Utilities.Common.divup(numBodies, _blockSize);
            var lp = new LaunchParam(numBlocks, _blockSize);
            GPULaunch(IntegrateBodies, lp, newPos, oldPos, vel, numBodies, deltaTime, softeningSquared, damping,
                numTiles);
        }
 static void SimpleMultiplyKernel(deviceptr<float> a, deviceptr<float> b, deviceptr<float> c,
     int aRows, int bCols, int aCols_bRows)
 {
     var row = blockDim.y * blockIdx.y + threadIdx.y;
     var col = blockDim.x * blockIdx.x + threadIdx.x;
     if (row >= aRows || col >= bCols) return;
     var sum = 0.0f;
     for (var k = 0; k < aCols_bRows; ++k)
     {
         sum += a[row * aCols_bRows + k] * b[k * bCols + col];
     }
     c[row * bCols + col] = sum;
 }
示例#38
0
        public void Kernel(int wA, int wB, deviceptr<double> A, deviceptr<double> B, deviceptr<double> C)
        {
            var blx = blockIdx.x;
            var bly = blockIdx.y;
            var tx = threadIdx.x;
            var ty = threadIdx.y;

            // offset to first element of the first sub-matrix of A processed by the block
            var aBegin = wA * BlockSize * bly;

            // index of the last sub-matrix of A processed by the block
            var aEnd = aBegin + wA - 1;

            // step size used to iterate through the sub-matrices of A
            var aStep = BlockSize;

            // offset to first element of the first sub-matrix of B processed by the block
            var bBegin = BlockSize * blx;

            // step size used to iterate through the sub-matrices of B
            var bStep = BlockSize * wB;

            // Csub is used to store the element of the block sub-matrix that is computed by the thread
            var Csub = 0.0;

            // loop over all the sub-matrices of A and B required to compute the block sub-matrix
            var a = aBegin;
            var b = bBegin;
            for (; a <= aEnd; a += aStep, b += bStep)
            {
                var As = __shared__.Array2D<double>(BlockSize, BlockSize);
                var Bs = __shared__.Array2D<double>(BlockSize, BlockSize);

                // load the matrices from device memory to shared memory; each thread loads one element of each matrix 
                As[ty, tx] = A[a + wA * ty + tx];
                Bs[ty, tx] = B[b + wB * ty + tx];

                Intrinsic.__syncthreads();

                // multiply the two matrices together; each thread computes one element of the block sub-matrix
                for (var k = 0; k < BlockSize; ++k)
                    Csub += As[ty, k] * Bs[k, tx];

                Intrinsic.__syncthreads();
            }

            // write the block sub-matrix to device memory; each thread writes one element
            var c = wB * BlockSize * bly + BlockSize * blx;
            C[c + wB * ty + tx] = Csub;
        }
示例#39
0
        public void Kernel(deviceptr<float4> pos, float time)
        {
            var x = blockIdx.x*blockDim.x + threadIdx.x;
            var y = blockIdx.y*blockDim.y + threadIdx.y;

            var u = ((float) x)/((float) Width);
            var v = ((float) y)/((float) Height);
            u = u*2.0f - 1.0f;
            v = v*2.0f - 1.0f;

            const float freq = 4.0f;
            var w = LibDevice.__nv_sinf(u*freq + time)*LibDevice.__nv_cosf(v*freq + time)*0.5f;

            pos[y * Width + x] = new float4(u, w, v, LibDevice.__nv_uint_as_float(0xff00ff00));
        }
示例#40
0
        unsafe public void Update(IntPtr vbRes, float time)
        {
            // 1. map resource to cuda space, means lock to cuda space
            var vbRes1 = vbRes;
            CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsMapResources(1, &vbRes1, IntPtr.Zero));

            // 2. get memory pointer from mapped resource
            var vbPtr = IntPtr.Zero;
            var vbSize = IntPtr.Zero;
            CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsResourceGetMappedPointer(&vbPtr, &vbSize, vbRes1));

            // 3. create device pointer, and run the kernel
            var pos = new deviceptr<float4>(vbPtr);
            GPULaunch(Kernel, LaunchParam, pos, time);

            // 4. unmap resource, means unlock, so that DirectX can then use it again
            CUDAInterop.cuSafeCall(CUDAInterop.cuGraphicsUnmapResources(1u, &vbRes1, IntPtr.Zero));
        }
        //[/DynamicAOTCompile]

        //[DynamicComputeBodyAccel]
        public float3 ComputeBodyAccel(float softeningSquared, float4 bodyPos, deviceptr<float4> positions, 
                                       int numTiles)
        {
            var sharedPos = __shared__.ExternArray<float4>();
            var acc = new float3(0.0f, 0.0f, 0.0f);

            for (var tile = 0; tile < numTiles; tile++)
            {
                sharedPos[threadIdx.x] = positions[tile*blockDim.x + threadIdx.x];

                Intrinsic.__syncthreads();

                // This is the "tile_calculation" function from the GPUG3 article.
                for (var counter = 0; counter < blockDim.x; counter++)
                {
                    acc = Common.BodyBodyInteraction(softeningSquared, acc, bodyPos, sharedPos[counter]);
                }
                Intrinsic.__syncthreads();
            }
            return (acc);
        }
示例#42
0
 void ISimulator.Integrate(deviceptr<float4> newPos, deviceptr<float4> oldPos, deviceptr<float4> vel, int numBodies, float deltaTime, float softeningSquared, float damping)
 {
     _worker.Gather(oldPos, _hpos, FSharpOption<int>.None, FSharpOption<int>.None);
     _worker.Gather(vel, _hvel, FSharpOption<int>.None, FSharpOption<int>.None);
     CpuIntegrator.IntegrateNbodySystem(_haccel, _hpos, _hvel, _numBodies, deltaTime, softeningSquared, damping);
 }
示例#43
0
 public void Apply(int numSystems, int n, deviceptr<double> dl, deviceptr<double> dd, deviceptr<double> du,
     deviceptr<double> db, deviceptr<double> dx)
 {
     var sharedSize = 9*n*sizeof (double);
     var lp = new LaunchParam(numSystems, n, sharedSize);
     this.GPULaunch(this.Kernel, lp, n, dl, dd, du, db, dx);
 }
示例#44
0
        public void Kernel(int n, deviceptr<double> dl, deviceptr<double> dd, deviceptr<double> du, deviceptr<double> db, deviceptr<double> dx)
        {
            var tid = threadIdx.x;
            var gid = blockIdx.x * n + tid;

            var shared = Intrinsic.__array_to_ptr(__shared__.ExternArray<double>());
            var l = shared;
            var d = l + n;
            var u = d + n;
            var b = u + n;

            l[tid] = dl[gid];
            d[tid] = dd[gid];
            u[tid] = du[gid];
            b[tid] = db[gid];

            Intrinsic.__syncthreads();

            Solve(n, l, d, u, b);

            dx[gid] = b[tid];
        }
示例#45
0
        // core solver function
        //     n  the dimension of the tridiagonal system, must fit into one block
        //     l  lower diagonal
        //     d  diagonal
        //     u  upper diagonal
        //     h  right hand side and solution at exit
        public static void Solve(int n, deviceptr<double> l, deviceptr<double> d, deviceptr<double> u, deviceptr<double> h)
        {
            var rank = threadIdx.x;

            var ltemp = 0.0;
            var utemp = 0.0;
            var htemp = 0.0;

            var span = 1;
            while (span < n)
            {
                if (rank < n)
                {
                    ltemp = (rank - span >= 0) ?
                        (d[rank - span] != 0.0) ? -l[rank] / d[rank - span] : 0.0
                        :
                        0.0;


                    utemp = (rank + span < n) ?
                        (d[rank + span] != 0.0) ? -u[rank] / d[rank + span] : 0.0
                        :
                        0.0;

                    htemp = h[rank];
                }

                Intrinsic.__syncthreads();

                if (rank < n)
                {
                    if (rank - span >= 0)
                    {
                        d[rank] = d[rank] + ltemp * u[rank - span];
                        htemp = htemp + ltemp * h[rank - span];
                        ltemp = ltemp * l[rank - span];
                    }
                    if (rank + span < n)
                    {
                        d[rank] = d[rank] + utemp * l[rank + span];
                        htemp = htemp + utemp * h[rank + span];
                        utemp = utemp * u[rank + span];
                    }
                }

                Intrinsic.__syncthreads();

                if (rank < n)
                {
                    l[rank] = ltemp;
                    u[rank] = utemp;
                    h[rank] = htemp;
                }

                Intrinsic.__syncthreads();

                span *= 2;
            }

            if (rank < n) h[rank] = h[rank] / d[rank];
        }
 void ISimulator.Integrate(deviceptr<float4> newPos, deviceptr<float4> oldPos, deviceptr<float4> vel,
                           int numBodies, float deltaTime, float softeningSquared, float damping)
 {
     IntegrateNbodySystem(newPos, oldPos, vel, numBodies, deltaTime, softeningSquared, damping);
 }