Example 1
        /// <summary>
        /// Performs the col2im operation: accumulates the columns of <paramref name="col"/> back into the image tensor <paramref name="im"/>.
        /// </summary>
        /// <param name="col">The column (unfolded) tensor to read from.</param>
        /// <param name="im">The image tensor to write into.</param>
        /// <param name="channels">The number of image channels.</param>
        /// <param name="height">The image height.</param>
        /// <param name="width">The image width.</param>
        /// <param name="patch_h">The patch (kernel) height.</param>
        /// <param name="patch_w">The patch (kernel) width.</param>
        /// <param name="pad_h">The padding along the height.</param>
        /// <param name="pad_w">The padding along the width.</param>
        /// <param name="stride_h">The stride along the height.</param>
        /// <param name="stride_w">The stride along the width.</param>
        /// <param name="dilation_h">The dilation along the height.</param>
        /// <param name="dilation_w">The dilation along the width.</param>
        public void Col2Im(Tensor col, Tensor im, int channels, int height, int width,
                           int patch_h, int patch_w, int pad_h,
                           int pad_w, int stride_h, int stride_w,
                           int dilation_h, int dilation_w)
        {
            var context     = CudaHelpers.TSContextForTensor(im);
            var cudaContext = context.CudaContextForTensor(im);


            int height_col = (height + 2 * pad_h - (dilation_h * (patch_h - 1) + 1))
                             / stride_h + 1;
            int width_col = (width + 2 * pad_w - (dilation_w * (patch_w - 1) + 1))
                            / stride_w + 1;
            int num_kernels = channels * height * width;

            var data_im  = CudaHelpers.GetBufferStart(im);
            var data_col = CudaHelpers.GetBufferStart(col);

            // From Torch source:
            // To avoid involving atomic operations, we will launch one kernel per
            // bottom dimension, and then in the kernel add up the top dimensions.

            Invoke(context, cudaContext, "col2im_kernel", new dim3(NNThreads.NumBlocks(num_kernels)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream,
                   num_kernels, data_col, height, width, channels, patch_h, patch_w, pad_h, pad_w, stride_h, stride_w,
                   dilation_h, dilation_w,
                   height_col, width_col, data_im);
        }
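The height_col / width_col values above follow the standard output-size formula for a dilated, strided, padded convolution window. A minimal standalone restatement of that computation (the helper name is illustrative, not part of this library):

        // Output spatial size for a dilated, strided, padded convolution window.
        // Mirrors the height_col / width_col expressions used by Col2Im and Im2Col.
        static int ConvOutputSize(int inputSize, int kernelSize, int pad, int stride, int dilation)
        {
            int effectiveKernel = dilation * (kernelSize - 1) + 1;
            return (inputSize + 2 * pad - effectiveKernel) / stride + 1;
        }

        // Example: a 28x28 input with a 3x3 kernel, pad 1, stride 1, dilation 1 keeps its size:
        // ConvOutputSize(28, 3, 1, 1, 1) == 28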
Example 2
        /// <summary>
        /// Performs the im2col operation: unfolds the patches of the image tensor <paramref name="im"/> into the columns of <paramref name="col"/>.
        /// </summary>
        /// <param name="im">The image tensor to read from.</param>
        /// <param name="col">The column (unfolded) tensor to write into.</param>
        /// <param name="channels">The number of image channels.</param>
        /// <param name="height">The image height.</param>
        /// <param name="width">The image width.</param>
        /// <param name="ksize_h">The kernel height.</param>
        /// <param name="ksize_w">The kernel width.</param>
        /// <param name="pad_h">The padding along the height.</param>
        /// <param name="pad_w">The padding along the width.</param>
        /// <param name="stride_h">The stride along the height.</param>
        /// <param name="stride_w">The stride along the width.</param>
        /// <param name="dilation_h">The dilation along the height.</param>
        /// <param name="dilation_w">The dilation along the width.</param>
        public void Im2Col(Tensor im, Tensor col, int channels,
                           int height, int width,
                           int ksize_h, int ksize_w, int pad_h,
                           int pad_w, int stride_h, int stride_w,
                           int dilation_h, int dilation_w)
        {
            var context     = CudaHelpers.TSContextForTensor(im);
            var cudaContext = context.CudaContextForTensor(im);

            // From Torch source:
            // We are going to launch channels * height_col * width_col kernels, each
            // kernel responsible for copying a single-channel grid.
            int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1))
                             / stride_h + 1;
            int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1))
                            / stride_w + 1;
            int num_kernels = channels * height_col * width_col;

            var data_im  = CudaHelpers.GetBufferStart(im);
            var data_col = CudaHelpers.GetBufferStart(col);

            Invoke(context, cudaContext, "im2col_kernel", new dim3(NNThreads.NumBlocks(num_kernels)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream,
                   num_kernels, data_im, height, width, ksize_h, ksize_w,
                   pad_h, pad_w, stride_h, stride_w,
                   dilation_h, dilation_w,
                   height_col, width_col, data_col);
        }
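For reference, the unfolding that the im2col_kernel performs can be written as plain nested loops on the CPU. This is only an illustrative sketch of the layout (one row per channel and kernel offset, one column per output position, zeros in the padded region), not code from this library:

        // CPU sketch of im2col for one image stored as [channels, height, width] in row-major order.
        static float[] Im2ColCpu(float[] im, int channels, int height, int width,
                                 int kH, int kW, int padH, int padW,
                                 int strideH, int strideW, int dilH, int dilW)
        {
            int heightCol = (height + 2 * padH - (dilH * (kH - 1) + 1)) / strideH + 1;
            int widthCol  = (width  + 2 * padW - (dilW * (kW - 1) + 1)) / strideW + 1;
            var col = new float[channels * kH * kW * heightCol * widthCol];

            for (int c = 0; c < channels; c++)
                for (int kh = 0; kh < kH; kh++)
                    for (int kw = 0; kw < kW; kw++)
                    {
                        int row = (c * kH + kh) * kW + kw;
                        for (int oh = 0; oh < heightCol; oh++)
                            for (int ow = 0; ow < widthCol; ow++)
                            {
                                int ih = oh * strideH - padH + kh * dilH;
                                int iw = ow * strideW - padW + kw * dilW;
                                bool inside = ih >= 0 && ih < height && iw >= 0 && iw < width;
                                col[(row * heightCol + oh) * widthCol + ow] =
                                    inside ? im[(c * height + ih) * width + iw] : 0f;
                            }
                    }

            return col;
        }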
Example 3
        /// <summary>
        /// Copies the contents of a GPU tensor to a CPU tensor.
        /// </summary>
        /// <param name="result">The destination CPU tensor.</param>
        /// <param name="src">The source GPU tensor.</param>
        /// <param name="totalElements">The total number of elements to copy.</param>
        /// <exception cref="CudaException">Thrown when the device-to-host copy fails.</exception>
        public void CopyGpuToCpu(Tensor result, Tensor src, long totalElements)
        {
            var context    = CudaHelpers.TSContextForTensor(src);
            var srcContext = context.CudaContextForTensor(src);

            using (var srcContig = Ops.AsContiguous(src))
                using (var resultContig = AsTypeCpu(result, src.ElementType, true))
                {
                    var resultContigPtr = ((Cpu.CpuStorage)resultContig.Storage).PtrAtElement(resultContig.StorageOffset);
                    var srcContigPtr    = ((CudaStorage)srcContig.Storage).DevicePtrAtElement(srcContig.StorageOffset);

                    var totalBytes = totalElements * srcContig.ElementType.Size();

                    // Use DriverAPINativeMethods directly here instead of CudaContext.CopyToHost, because CopyToHost only has an overload
                    // for specifying totalBytes as a uint, but we may exceed the range of a uint here.
                    var res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(resultContigPtr, srcContigPtr, totalBytes);
                    if (res != CUResult.Success)
                    {
                        throw new CudaException(res);
                    }

                    if (result.Storage != resultContig.Storage)
                    {
                        Ops.Copy(result, resultContig); // copy on CPU
                    }
                }
        }
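As the comment in the method notes, the point of calling cuMemcpyDtoH_v2 directly is that the byte count can exceed uint.MaxValue. A rough illustrative check:

        // ~1.2 billion float32 elements is about 4.8 GB, which no longer fits in a uint byte count.
        long totalElements = 1_200_000_000L;
        long totalBytes    = totalElements * sizeof(float);   // 4,800,000,000 bytes
        bool fitsInUInt    = totalBytes <= uint.MaxValue;      // false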
Example 4
        public void SpatialMaxPoolingBackward(Tensor input, Tensor gradOutput, Tensor gradInput, Tensor indices, ConvolutionDesc2d cd, bool ceilMode)
        {
            var context     = CudaHelpers.TSContextForTensor(gradOutput);
            var cudaContext = context.CudaContextForTensor(gradOutput);

            var dimw = 3;
            var dimh = 2;
            var dimc = 1;

            var nbatch  = input.Sizes[0];
            var nslices = input.Sizes[dimc];
            var iheight = input.Sizes[dimh];
            var iwidth  = input.Sizes[dimw];
            var owidth  = gradOutput.Sizes[dimw];
            var oheight = gradOutput.Sizes[dimh];

            using var gradOutputContig = Ops.AsContiguous(gradOutput);
            var gradOutputPtr = CudaHelpers.GetBufferStart(gradOutputContig);
            var indicesPtr    = CudaHelpers.GetBufferStart(indices);
            var gradInputPtr  = CudaHelpers.GetBufferStart(gradInput);

            var count = (int)input.ElementCount();

            this.Invoke(context, cudaContext, "MaxPoolBackward", new dim3(NNThreads.NumBlocks(count)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream,
                        count, gradOutputPtr, indicesPtr, nbatch, nslices, iheight, iwidth, oheight, owidth,
                        cd.kH, cd.kW, cd.dH, cd.dW, cd.padH, cd.padW, gradInputPtr);
        }
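Conceptually, MaxPoolBackward routes every output gradient back to the input position that won the max during the forward pass. A CPU sketch of that accumulation over flat arrays, assuming indices holds the flat input index of the winning element for each output position (an assumption about the layout, for illustration only):

        // CPU sketch: accumulate pooled gradients into gradInput at the stored argmax positions.
        static void MaxPoolBackwardCpu(float[] gradOutput, int[] indices, float[] gradInput)
        {
            Array.Clear(gradInput, 0, gradInput.Length);
            for (int o = 0; o < gradOutput.Length; o++)
            {
                gradInput[indices[o]] += gradOutput[o];
            }
        }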
Example 5
        public static Tensor Invoke(CudaCode kernels, string funcName, Tensor result, Tensor src)
        {
            try
            {
                TSCudaContext context     = CudaHelpers.TSContextForTensor(src);
                CudaContext   cudaContext = context.CudaContextForTensor(src);

                cudaContext.SetCurrent();

                Tensor writeTarget  = TensorResultBuilder.GetWriteTarget(result, src, false, src.Sizes);
                long   elementCount = writeTarget.ElementCount();

                byte[] ptx = kernels.GetPtx(context.Compiler);

                if (result == src)
                {
                    ApplyOpInvoke.Invoke(context, cudaContext, ptx, "t1_" + funcName, writeTarget, elementCount);
                }
                else
                {
                    ApplyOpInvoke.Invoke(context, cudaContext, ptx, "t2_" + funcName, writeTarget, src, elementCount);
                }

                return(writeTarget);
            }
            catch (Exception e)
            {
                Logger.WriteLine($"Error = '{e.Message}', Call stack = '{e.StackTrace}'");
                throw;
            }
        }
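The only dispatch here is the kernel-name prefix: when result and src are the same tensor the single-tensor in-place variant ("t1_") is selected, otherwise the two-tensor variant ("t2_"). A trivial restatement of that convention (illustrative helper, not part of the library):

        // "t1_" kernels operate on one tensor in place; "t2_" kernels read src and write result.
        static string ElementwiseKernelName(string funcName, bool inPlace)
        {
            return (inPlace ? "t1_" : "t2_") + funcName;
        }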
Example 6
        public Tensor Scatter(Tensor result, Tensor src, int dim, Tensor indices)
        {
            TSCudaContext context     = CudaHelpers.TSContextForTensor(src);
            CudaContext   cudaContext = context.CudaContextForTensor(src);

            if (result == null)
            {
                throw new ArgumentNullException(nameof(result));
            }

            if (result.DimensionCount != src.DimensionCount)
            {
                throw new InvalidOperationException("result and src must have same number of dimensions");
            }

            if (dim < 0 || dim >= result.DimensionCount)
            {
                throw new ArgumentOutOfRangeException(nameof(dim));
            }

            if (indices.DimensionCount != src.DimensionCount)
            {
                throw new InvalidOperationException("src and indices must have same number of dimensions");
            }

            if (!src.IsSameSizeAs(indices))
            {
                throw new InvalidOperationException("src and indices must be the same size");
            }

            if (!TensorResultBuilder.ArrayEqualExcept(src.Sizes, result.Sizes, dim))
            {
                throw new InvalidOperationException("result and src must be the same size except in dimension dim");
            }

            Tensor writeTarget = result;

            long nElement = indices.ElementCount();
            dim3 block    = ApplyUtils.GetApplyBlock();
            dim3 grid     = ApplyUtils.GetApplyGrid(context.DeviceInfoForContext(cudaContext), nElement);

            if (ApplyUtils.CanUse32BitIndexMath(writeTarget) &&
                ApplyUtils.CanUse32BitIndexMath(src) &&
                ApplyUtils.CanUse32BitIndexMath(indices))
            {
                int    dims       = indices.DimensionCount <= 3 ? indices.DimensionCount : -1;
                string kernelName = MakeKernelName(ScatterBaseName, true, dims);
                Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, true,
                       writeTarget, src, indices, dim, (int)nElement);
            }
            else
            {
                string kernelName = MakeKernelName(ScatterBaseName, false, -1);
                Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, false,
                       writeTarget, src, indices, dim, nElement);
            }

            return(writeTarget);
        }
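Scatter writes each element of src into result along the given dimension at the position named by the matching element of indices. A 2-D CPU sketch of those semantics (illustrative only):

        // CPU sketch of scatter along dim for 2-D arrays:
        //   dim == 0: result[indices[i, j], j] = src[i, j]
        //   dim == 1: result[i, indices[i, j]] = src[i, j]
        static void ScatterCpu(float[,] result, float[,] src, int dim, int[,] indices)
        {
            for (int i = 0; i < src.GetLength(0); i++)
                for (int j = 0; j < src.GetLength(1); j++)
                {
                    if (dim == 0) result[indices[i, j], j] = src[i, j];
                    else          result[i, indices[i, j]] = src[i, j];
                }
        }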
Example 7
        public Tensor BuildTriMask(Tensor result, float value, float maskedValue)
        {
            TSCudaContext context = CudaHelpers.TSContextForTensor(result);

            BuildTriMask(context, result, value, maskedValue);

            return(result);
        }
Example 8
        public Tensor SoftmaxGrad(Tensor grad, Tensor adj, Tensor val)
        {
            var context = CudaHelpers.TSContextForTensor(grad);

            SoftmaxGrad(context, grad, adj, val);

            return(grad);
        }
Example 9
        public Tensor SGD(Tensor weight, Tensor gradient, Tensor cache, Tensor lrw, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
        {
            var context = CudaHelpers.TSContextForTensor(weight);

            SGD(context, weight, gradient, cache, lrw, batchSize, step_size, clipval, regc, decay_rate, eps);

            return(weight);
        }
Example 10
        public Tensor SoftmaxGrad(Tensor grad, Tensor adj, Tensor val, bool addGrad = true)
        {
            TSCudaContext context = CudaHelpers.TSContextForTensor(grad);

            SoftmaxGrad(context, grad, adj, val, addGrad);

            return(grad);
        }
Example 11
        public Tensor Adam(Tensor weight, Tensor gradient, Tensor v, Tensor m, int batchSize, float step_size, float clipval, float regc, float decay_rate_v, float decay_rate_m, int iter, float eps)
        {
            TSCudaContext context = CudaHelpers.TSContextForTensor(weight);

            Adam(context, weight, gradient, v, m, batchSize, step_size, clipval, regc, decay_rate_v, decay_rate_m, iter, eps);

            return(weight);
        }
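The underlying kernel performs an Adam step; the wrapper also threads through batch-size scaling, gradient clipping and weight-decay parameters whose exact handling is kernel-specific. A minimal CPU sketch of the textbook Adam update only, ignoring those extras (illustrative, not the kernel's exact formula):

        // Textbook Adam step over flat arrays; m and v are the running first and second moments,
        // and iter is the 1-based step count used for bias correction.
        static void AdamStepCpu(float[] weight, float[] gradient, float[] v, float[] m,
                                float stepSize, float decayRateV, float decayRateM, int iter, float eps)
        {
            for (int i = 0; i < weight.Length; i++)
            {
                float g = gradient[i];
                m[i] = decayRateM * m[i] + (1 - decayRateM) * g;
                v[i] = decayRateV * v[i] + (1 - decayRateV) * g * g;
                float mHat = m[i] / (1f - (float)Math.Pow(decayRateM, iter));
                float vHat = v[i] / (1f - (float)Math.Pow(decayRateV, iter));
                weight[i] -= stepSize * mHat / ((float)Math.Sqrt(vHat) + eps);
            }
        }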
Example 12
        public void AddLayerNormGrad(Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            TSCudaContext context      = CudaHelpers.TSContextForTensor(inGrad);
            Tensor        writeTarget1 = TensorResultBuilder.GetWriteTarget(out1Grad, inGrad, false, inGrad.Sizes);
            Tensor        writeTarget2 = TensorResultBuilder.GetWriteTarget(out2Grad, inGrad, false, inGrad.Sizes);

            AddLayerNormGrad(context, writeTarget1, writeTarget2, alphaGrad, betaGrad, inGrad, y, x1, x2, alpha, beta, eps);
        }
Example 13
        public Tensor RMSProp(Tensor weight, Tensor gradient, Tensor cache, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
        {
            TSCudaContext context = CudaHelpers.TSContextForTensor(weight);

            RMSProp(context, weight, gradient, cache, batchSize, step_size, clipval, regc, decay_rate, eps);

            return(weight);
        }
Example 14
        public Tensor BuildPadSelfTriMask(Tensor originalLengths, int batchSize, int paddedLength)
        {
            TSCudaContext context     = CudaHelpers.TSContextForTensor(originalLengths);
            Tensor        writeTarget = TensorResultBuilder.GetWriteTarget(null, originalLengths.Allocator, DType.Float32, true, new long[] { batchSize, paddedLength, paddedLength });

            BuildPadSelfTriMask(context, writeTarget, originalLengths, batchSize);

            return(writeTarget);
        }
Example 15
        public Tensor LayerNorm(Tensor result, Tensor src, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            var context     = CudaHelpers.TSContextForTensor(src);
            var writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, src.Sizes);

            this.LayerNorm(context, writeTarget, src, alpha, beta, eps);

            return(writeTarget);
        }
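Layer normalization standardizes each row to zero mean and unit variance, then applies the learned scale alpha and shift beta. A CPU sketch for a single row (illustrative only, not the kernel code):

        // CPU sketch: normalize one row, then scale by alpha and shift by beta.
        static float[] LayerNormRow(float[] x, float[] alpha, float[] beta, float eps)
        {
            float mean = 0f;
            for (int i = 0; i < x.Length; i++) mean += x[i];
            mean /= x.Length;

            float variance = 0f;
            for (int i = 0; i < x.Length; i++) variance += (x[i] - mean) * (x[i] - mean);
            variance /= x.Length;

            float invStd = 1f / (float)Math.Sqrt(variance + eps);
            var y = new float[x.Length];
            for (int i = 0; i < x.Length; i++)
                y[i] = alpha[i] * (x[i] - mean) * invStd + beta[i];
            return y;
        }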
Example 16
        public Tensor SoftmaxMask(Tensor result, Tensor src, Tensor mask)
        {
            var context     = CudaHelpers.TSContextForTensor(src);
            var writeTarget = TensorResultBuilder.GetWriteTarget(result, src, true, src.Sizes);

            this.SoftmaxMask(context, writeTarget, src, mask);

            return(writeTarget);
        }
Example 17
        public Tensor BuildSelfTriMask(Tensor result, Tensor originalLengths, int paddedSeqLen, float value, float maskedValue)
        {
            TSCudaContext context     = CudaHelpers.TSContextForTensor(originalLengths);
            Tensor        writeTarget = TensorResultBuilder.GetWriteTarget(result, originalLengths, true, new long[] { originalLengths.Sizes[0], paddedSeqLen, paddedSeqLen });

            BuildSelfTriMask(context, writeTarget, originalLengths, paddedSeqLen, value, maskedValue);

            return(writeTarget);
        }
Example 18
        /// <summary>
        /// Invokes the fill kernel, setting every element of <paramref name="result"/> to <paramref name="value"/>.
        /// </summary>
        /// <param name="kernels">The precompiled fill/copy kernels.</param>
        /// <param name="result">The array to fill.</param>
        /// <param name="value">The value to fill with.</param>
        public static void Invoke(FillCopyKernels kernels, NDArray result, float value)
        {
            var context      = CudaHelpers.TSContextForTensor(result);
            var cudaContext  = context.CudaContextForTensor(result);
            var ptx          = kernels.GetPtx(context.Compiler);
            var elementCount = result.ElementCount();

            ApplyOpInvoke.Invoke(context, cudaContext, ptx, "fill", result, value, elementCount);
        }
Example 19
        public Tensor BuildSrcTgtMask(Tensor originalSrcLengths, Tensor originalTgtLengths, int batchSize, int paddedSrcLength, int paddedTgtLength)
        {
            var context     = CudaHelpers.TSContextForTensor(originalSrcLengths);
            var writeTarget = TensorResultBuilder.GetWriteTarget(null, originalSrcLengths.Allocator, DType.Float32, true, new long[] { batchSize, paddedTgtLength, paddedSrcLength });

            this.BuildSrcTgtMask(context, writeTarget, originalSrcLengths, originalTgtLengths, batchSize);

            return(writeTarget);
        }
Example 20
        public Tensor AddLayerNorm(Tensor result, Tensor src1, Tensor src2, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            TSCudaContext context     = CudaHelpers.TSContextForTensor(src1);
            Tensor        writeTarget = TensorResultBuilder.GetWriteTarget(result, src1, false, src1.Sizes);

            AddLayerNorm(context, writeTarget, src1, src2, alpha, beta, eps);

            return(writeTarget);
        }
Example 21
        public Tensor Softmax(Tensor result, Tensor src)
        {
            TSCudaContext context     = CudaHelpers.TSContextForTensor(src);
            Tensor        writeTarget = TensorResultBuilder.GetWriteTarget(result, src, true, src.Sizes);

            Softmax(context, writeTarget, src);

            return(writeTarget);
        }
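Softmax turns each row into a probability distribution; subtracting the row maximum before exponentiating keeps the arithmetic from overflowing. A CPU sketch for one row (illustrative only):

        // CPU sketch of a numerically stable softmax over one row.
        static float[] SoftmaxRow(float[] x)
        {
            float max = float.NegativeInfinity;
            for (int i = 0; i < x.Length; i++) max = Math.Max(max, x[i]);

            var y = new float[x.Length];
            float sum = 0f;
            for (int i = 0; i < x.Length; i++)
            {
                y[i] = (float)Math.Exp(x[i] - max);
                sum += y[i];
            }
            for (int i = 0; i < x.Length; i++) y[i] /= sum;
            return y;
        }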
Example 22
        public Tensor LayerNormGrad(Tensor outGrad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            TSCudaContext context     = CudaHelpers.TSContextForTensor(inGrad);
            Tensor        writeTarget = TensorResultBuilder.GetWriteTarget(outGrad, inGrad, false, inGrad.Sizes);

            LayerNormGrad(context, writeTarget, alphaGrad, betaGrad, inGrad, y, x, alpha, beta, eps);

            return(writeTarget);
        }
Example 23
        public Tensor BuildSrcTgtMask(Tensor result, Tensor srcOriginalLengths, Tensor tgtOriginalLengths, int srcPaddedSeqLen, int tgtPaddedSeqLen, float value, float maskedValue)
        {
            TSCudaContext context     = CudaHelpers.TSContextForTensor(srcOriginalLengths);
            Tensor        writeTarget = TensorResultBuilder.GetWriteTarget(result, srcOriginalLengths, true, new long[] { srcOriginalLengths.Sizes[0], tgtPaddedSeqLen, srcPaddedSeqLen });

            BuildSrcTgtMask(context, writeTarget, srcOriginalLengths, tgtOriginalLengths, tgtPaddedSeqLen, value, maskedValue);

            return(writeTarget);
        }
Example 24
        public Tensor IndexSelect(Tensor result, Tensor src, Tensor indice)
        {
            TSCudaContext context     = CudaHelpers.TSContextForTensor(src);
            Tensor        writeTarget = TensorResultBuilder.GetWriteTarget(result, src, true, new long[] { indice.Sizes[0], src.Sizes[1] });

            IndexSelect(context, writeTarget, src, indice);

            return(writeTarget);
        }
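The write target is shaped { indice.Sizes[0], src.Sizes[1] }, one output row per index, so the operation amounts to selecting rows of a 2-D tensor. A CPU sketch (illustrative only):

        // CPU sketch: result[i, :] = src[indices[i], :]
        static float[,] IndexSelectRows(float[,] src, int[] indices)
        {
            int cols = src.GetLength(1);
            var result = new float[indices.Length, cols];
            for (int i = 0; i < indices.Length; i++)
                for (int j = 0; j < cols; j++)
                    result[i, j] = src[indices[i], j];
            return result;
        }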
Example 25
        public Tensor Gather(Tensor result, Tensor src, int dim, Tensor indices)
        {
            var context     = CudaHelpers.TSContextForTensor(src);
            var cudaContext = context.CudaContextForTensor(src);

            if (result != null && result.DimensionCount != src.DimensionCount)
            {
                throw new InvalidOperationException("result and src must have same number of dimensions");
            }

            if (result != null && (dim < 0 || dim >= result.DimensionCount))
            {
                throw new ArgumentOutOfRangeException(nameof(dim));
            }

            if (indices.DimensionCount != src.DimensionCount)
            {
                throw new InvalidOperationException("src and indices must have same number of dimensions");
            }

            if (result != null && !result.IsSameSizeAs(indices))
            {
                throw new InvalidOperationException("result and indices must be the same size");
            }

            if (result != null && !TensorResultBuilder.ArrayEqualExcept(src.Sizes, result.Sizes, dim))
            {
                throw new InvalidOperationException("result and src must be the same size except in dimension dim");
            }

            var writeTarget = TensorResultBuilder.GetWriteTarget(result, indices.Allocator, src.ElementType, false, indices.Sizes);

            var nElement = indices.ElementCount();
            var block    = ApplyUtils.GetApplyBlock();
            var grid     = ApplyUtils.GetApplyGrid(context.DeviceInfoForContext(cudaContext), nElement);

            if (ApplyUtils.CanUse32BitIndexMath(writeTarget) &&
                ApplyUtils.CanUse32BitIndexMath(src) &&
                ApplyUtils.CanUse32BitIndexMath(indices))
            {
                var dims       = indices.DimensionCount <= 3 ? indices.DimensionCount : -1;
                var kernelName = MakeKernelName(GatherBaseName, true, dims);
                this.Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, true,
                            writeTarget, src, indices, dim, (int)nElement);
            }
            else
            {
                var kernelName = MakeKernelName(GatherBaseName, false, -1);
                this.Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, false,
                            writeTarget, src, indices, dim, nElement);
            }

            return(writeTarget);
        }
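Gather is the read-side counterpart of Scatter: the output takes the shape of indices, and each element is read from src along dim at the indexed position. A 2-D CPU sketch of those semantics (illustrative only):

        // CPU sketch of gather along dim for 2-D arrays:
        //   dim == 0: result[i, j] = src[indices[i, j], j]
        //   dim == 1: result[i, j] = src[i, indices[i, j]]
        static float[,] GatherCpu(float[,] src, int dim, int[,] indices)
        {
            var result = new float[indices.GetLength(0), indices.GetLength(1)];
            for (int i = 0; i < indices.GetLength(0); i++)
                for (int j = 0; j < indices.GetLength(1); j++)
                {
                    result[i, j] = dim == 0 ? src[indices[i, j], j] : src[i, indices[i, j]];
                }
            return result;
        }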
Example 26
        public Tensor UpdateCost(Tensor costs, Tensor weight, Tensor ids)
        {
            var context     = CudaHelpers.TSContextForTensor(weight);
            var writeTarget = TensorResultBuilder.GetWriteTarget(costs, weight, true, ids.Sizes);

            Ops.Fill(writeTarget, 0.0f);

            this.UpdateCost(context, weight, ids, writeTarget);

            return(writeTarget);
        }
Example 27
        public Tensor Scatter(Tensor result, Tensor src, int dim, Tensor indices)
        {
            try
            {
                TSCudaContext context     = CudaHelpers.TSContextForTensor(src);
                CudaContext   cudaContext = context.CudaContextForTensor(src);

                if (result == null)
                {
                    throw new ArgumentNullException(nameof(result));
                }

                if (result.DimensionCount != src.DimensionCount)
                {
                    throw new InvalidOperationException($"result and src must have same number of dimensions. result dim count = '{result.DimensionCount}', source dim count = '{src.DimensionCount}'");
                }

                if (dim < 0 || dim >= result.DimensionCount)
                {
                    throw new ArgumentOutOfRangeException(nameof(dim));
                }

                if (indices.DimensionCount != src.DimensionCount)
                {
                    throw new InvalidOperationException("src and indices must have same number of dimensions");
                }

                if (!src.IsSameSizeAs(indices))
                {
                    throw new InvalidOperationException("src and indices must be the same size");
                }

                if (!TensorResultBuilder.ArrayEqualExcept(src.Sizes, result.Sizes, dim))
                {
                    throw new InvalidOperationException("result and src must be the same size except in dimension dim");
                }

                Tensor writeTarget = result;

                long nElement = indices.ElementCount();
                dim3 block    = ApplyUtils.GetApplyBlock();
                dim3 grid     = ApplyUtils.GetApplyGrid(context.DeviceInfoForContext(cudaContext), nElement);

                Invoke(context, cudaContext, "scatter_kernel", grid, block, 0, CUstream.NullStream, false, writeTarget, src, indices, dim, nElement);

                return(writeTarget);
            }
            catch (Exception err)
            {
                Logger.WriteLine($"Error = '{err.Message}', Call stack = '{err.StackTrace}'");
                throw;
            }
        }
Example 28
        public void SpatialMaxPoolingForward(Tensor input, Tensor output, Tensor indices, ConvolutionDesc2d cd, bool ceilMode)
        {
            var context     = CudaHelpers.TSContextForTensor(input);
            var cudaContext = context.CudaContextForTensor(input);

            var iwidth      = input.Sizes[3];
            var iheight     = input.Sizes[2];
            var nInputPlane = input.Sizes[1];
            var batchSize   = input.Sizes[0];

            long owidth;
            long oheight;

            if (ceilMode)
            {
                // ReSharper disable once ArrangeRedundantParentheses
                oheight = (long)(Math.Ceiling((float)(iheight - cd.kH + 2 * cd.padH) / cd.dH)) + 1;
                // ReSharper disable once ArrangeRedundantParentheses
                owidth = (long)(Math.Ceiling((float)(iwidth - cd.kW + 2 * cd.padW) / cd.dW)) + 1;
            }
            else
            {
                // ReSharper disable once ArrangeRedundantParentheses
                oheight = (long)(Math.Floor((float)(iheight - cd.kH + 2 * cd.padH) / cd.dH)) + 1;
                // ReSharper disable once ArrangeRedundantParentheses
                owidth = (long)(Math.Floor((float)(iwidth - cd.kW + 2 * cd.padW) / cd.dW)) + 1;
            }

            if (cd.padW != 0 || cd.padH != 0)
            {
                // ensure that the last pooling starts inside the image
                if ((oheight - 1) * cd.dH >= iheight + cd.padH)
                {
                    --oheight;
                }

                if ((owidth - 1) * cd.dW >= iwidth + cd.padW)
                {
                    --owidth;
                }
            }

            using var inputContig = Ops.AsContiguous(input);
            var inputPtr   = CudaHelpers.GetBufferStart(inputContig);
            var outputPtr  = CudaHelpers.GetBufferStart(output);
            var indicesPtr = CudaHelpers.GetBufferStart(indices);

            var count = (int)output.ElementCount();

            this.Invoke(context, cudaContext, "MaxPoolForward", new dim3(NNThreads.NumBlocks(count)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream,
                        count, inputPtr, batchSize, nInputPlane, iheight, iwidth, oheight, owidth,
                        cd.kH, cd.kW, cd.dH, cd.dW, cd.padH, cd.padW, outputPtr, indicesPtr);
        }
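The output extent differs between ceil and floor mode and is then clamped so the last pooling window starts inside the padded image. A standalone restatement of that computation (the helper name is illustrative):

        // Mirrors the oheight / owidth computation used by SpatialMaxPoolingForward above.
        static long PoolOutputSize(long inputSize, int kernel, int pad, int stride, bool ceilMode)
        {
            double span = (double)(inputSize - kernel + 2 * pad) / stride;
            long output = (long)(ceilMode ? Math.Ceiling(span) : Math.Floor(span)) + 1;

            // When padding is used, make sure the last window starts inside the image.
            if (pad != 0 && (output - 1) * stride >= inputSize + pad)
            {
                --output;
            }
            return output;
        }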
Example 29
        public static void Invoke(FillCopyKernels kernels, Tensor result, float value)
        {
            TSCudaContext context     = CudaHelpers.TSContextForTensor(result);
            CudaContext   cudaContext = context.CudaContextForTensor(result);

            cudaContext.SetCurrent();

            byte[] ptx          = kernels.GetPtx(context.Compiler);
            long   elementCount = result.ElementCount();

            ApplyOpInvoke.Invoke(context, cudaContext, ptx, "fill", result, value, elementCount);
        }
Example 30
        public Tensor IndexSelectGrad(Tensor grad, Tensor adj, Tensor indice)
        {
            if (grad == null)
            {
                throw new ArgumentNullException(nameof(grad), "Tensor grad cannot be null.");
            }

            TSCudaContext context = CudaHelpers.TSContextForTensor(adj);

            IndexSelectGrad(context, grad, adj, indice);

            return(grad);
        }