Example #1
        public NDArray MeanAll(NDArray result, NDArray src)
        {
            if (src.DimensionCount == 0 || src.ElementCount() == 0)
            {
                throw new ArgumentException("src must be a non-empty tensor");
            }
            var writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, 1);

            SumAll(writeTarget, src);
            Div(writeTarget, writeTarget, src.ElementCount());
            return(writeTarget);
        }
Example #2
        public NDArray VarAll(NDArray result, NDArray src)
        {
            if (src.DimensionCount == 0 || src.ElementCount() == 0)
            {
                throw new ArgumentException("src must be a non-empty tensor");
            }

            var mean        = Ops.MeanAll(src);
            var writeTarget = ReduceAllOp.Invoke(cudaReduceAllKernels, 0.0f, ReduceInitType.GivenValue, "en_norm", result, src, mean);

            Div(writeTarget, writeTarget, src.ElementCount() - 1);
            return(writeTarget);
        }
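This is the standard two-pass sample variance: a mean pass followed by an accumulation of squared deviations, divided by N - 1 (Bessel's correction). Below is a minimal CPU sketch of the same arithmetic on a plain array, assuming the "en_norm" kernel accumulates squared deviations from the supplied mean; the class and method names are illustrative only.

        using System;
        using System.Linq;

        static class VarianceSketch
        {
            // Two-pass sample variance: mean first, then sum of squared deviations / (N - 1).
            public static float VarAll(float[] src)
            {
                if (src.Length == 0)
                {
                    throw new ArgumentException("src must be a non-empty tensor");
                }

                float mean     = src.Sum() / src.Length;                    // MeanAll
                float sumSqDev = src.Sum(x => (x - mean) * (x - mean));     // assumed "en_norm" reduction
                return sumSqDev / (src.Length - 1);                         // Bessel's correction
            }
        }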
Example #3
        /// <summary>
        /// Determines whether 32-bit index math can be used for the specified tensor.
        /// </summary>
        /// <param name="tensor">The tensor.</param>
        /// <returns><c>true</c> if 32-bit index math can be used for the specified tensor; otherwise, <c>false</c>.</returns>
        public static bool CanUse32BitIndexMath(NDArray tensor)
        {
            var elements = tensor.ElementCount();

            if (elements >= uint.MaxValue)
            {
                return(false);
            }

            long offset   = 0;
            long linearId = elements - 1;

            for (int i = tensor.DimensionCount - 1; i >= 0; --i)
            {
                var curDimIndex  = linearId % tensor.Shape[i];
                var curDimOffset = curDimIndex * tensor.Strides[i];
                offset   += curDimOffset;
                linearId /= tensor.Shape[i];
            }

            if (offset >= uint.MaxValue)
            {
                return(false);
            }

            return(true);
        }
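The loop above computes the storage offset of the last element, which is the largest offset any index can reach, from the tensor's shape and strides. A minimal sketch of the same check over plain shape/stride arrays; Fits32BitIndexing is a hypothetical helper, not part of the library:

        // Hypothetical standalone version of the check: both the element count and the
        // offset of the last element must fit in an unsigned 32-bit index.
        static bool Fits32BitIndexing(long[] shape, long[] strides)
        {
            long elements = 1;
            foreach (var s in shape) { elements *= s; }
            if (elements >= uint.MaxValue) { return false; }

            long offset   = 0;
            long linearId = elements - 1;
            for (int i = shape.Length - 1; i >= 0; --i)
            {
                offset   += (linearId % shape[i]) * strides[i];
                linearId /= shape[i];
            }
            return offset < uint.MaxValue;
        }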
Example #4
        /// <summary>
        /// Invokes the specified kernels.
        /// </summary>
        /// <param name="kernels">The kernels.</param>
        /// <param name="context">The context.</param>
        /// <param name="cudaContext">The cuda context.</param>
        /// <param name="result">The result.</param>
        /// <param name="src">The source.</param>
        public static void Invoke(FillCopyKernels kernels, TSCudaContext context, CudaContext cudaContext, NDArray result, NDArray src)
        {
            var ptx          = kernels.GetPtx(context.Compiler);
            var elementCount = result.ElementCount();

            ApplyOpInvoke.Invoke(context, cudaContext, ptx, "copy", result, src, elementCount);
        }
Example #5
        /// <summary>
        /// Computes the backward pass of spatial max pooling.
        /// </summary>
        /// <param name="input">The input tensor.</param>
        /// <param name="gradOutput">The gradient of the output.</param>
        /// <param name="gradInput">The gradient of the input, written by this method.</param>
        /// <param name="indices">The max-element indices recorded during the forward pass.</param>
        /// <param name="cd">The 2D pooling descriptor (kernel size, stride, padding).</param>
        /// <param name="ceilMode">if set to <c>true</c>, output sizes are computed with ceiling instead of floor.</param>
        public void SpatialMaxPoolingBackward(NDArray input, NDArray gradOutput, NDArray gradInput, NDArray indices, ConvolutionDesc2d cd, bool ceilMode)
        {
            var context     = CudaHelpers.TSContextForTensor(gradOutput);
            var cudaContext = context.CudaContextForTensor(gradOutput);

            var dimw = 3;
            var dimh = 2;
            var dimc = 1;

            var nbatch  = input.Shape[0];
            var nslices = input.Shape[dimc];
            var iheight = input.Shape[dimh];
            var iwidth  = input.Shape[dimw];
            var owidth  = gradOutput.Shape[dimw];
            var oheight = gradOutput.Shape[dimh];


            using (var gradOutputContig = Ops.AsContiguous(gradOutput))
            {
                var gradOutputPtr = CudaHelpers.GetBufferStart(gradOutputContig);
                var indicesPtr    = CudaHelpers.GetBufferStart(indices);
                var gradInputPtr  = CudaHelpers.GetBufferStart(gradInput);

                var count = (int)input.ElementCount();

                Invoke(context, cudaContext, "MaxPoolBackward", new dim3(NNThreads.NumBlocks(count)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream,
                       count, gradOutputPtr, indicesPtr, nbatch, nslices, iheight, iwidth, oheight, owidth,
                       cd.kH, cd.kW, cd.dH, cd.dW, cd.padH, cd.padW, gradInputPtr);
            }
        }
Example #6
 public void Copy(NDArray result, NDArray src)
 {
     if (result.ElementCount() != src.ElementCount())
     {
         throw new InvalidOperationException("Tensors must have equal numbers of elements");
     }
     NativeWrapper.Invoke(copy_func, result, src);
 }
Example #7
        /// <summary>
        /// Invokes the specified kernels.
        /// </summary>
        /// <param name="kernels">The kernels.</param>
        /// <param name="result">The result.</param>
        /// <param name="value">The value.</param>
        public static void Invoke(FillCopyKernels kernels, NDArray result, float value)
        {
            var context      = CudaHelpers.TSContextForTensor(result);
            var cudaContext  = context.CudaContextForTensor(result);
            var ptx          = kernels.GetPtx(context.Compiler);
            var elementCount = result.ElementCount();

            ApplyOpInvoke.Invoke(context, cudaContext, ptx, "fill", result, value, elementCount);
        }
Example #8
        public NDArray UpdateGradInput(NDArray input, NDArray target)
        {
            var norm = 2.0f / input.ElementCount();

            ((input.TVar() - target) * norm)
            .Evaluate(gradInput);

            return(gradInput);
        }
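This is the gradient of a mean-squared-error criterion: d/dx mean((x - t)^2) = 2 * (x - t) / N. A plain-array sketch of the same computation; MseGradInput is a hypothetical name used only for illustration.

        // Gradient of MSE with respect to the input: 2 * (input - target) / N.
        static float[] MseGradInput(float[] input, float[] target)
        {
            var norm = 2.0f / input.Length;
            var grad = new float[input.Length];
            for (int i = 0; i < input.Length; i++)
            {
                grad[i] = (input[i] - target[i]) * norm;
            }
            return grad;
        }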
Example #9
        /// <summary>
        /// Scatters values from <paramref name="src"/> into <paramref name="result"/> along dimension <paramref name="dim"/>, at the positions given by <paramref name="indices"/>.
        /// </summary>
        /// <param name="result">The result.</param>
        /// <param name="src">The source.</param>
        /// <param name="dim">The dim.</param>
        /// <param name="indices">The indices.</param>
        /// <returns>Tensor.</returns>
        /// <exception cref="ArgumentNullException">result</exception>
        /// <exception cref="InvalidOperationException">
        /// result and src must have same number of dimensions
        /// or
        /// src and indices must have same number of dimensions
        /// or
        /// src and indices must be the same size
        /// or
        /// result and src must be the same size except in dimension dim
        /// </exception>
        /// <exception cref="ArgumentOutOfRangeException">dim</exception>
        public NDArray Scatter(NDArray result, NDArray src, int dim, NDArray indices)
        {
            var context     = CudaHelpers.TSContextForTensor(src);
            var cudaContext = context.CudaContextForTensor(src);

            if (result == null)
            {
                throw new ArgumentNullException("result");
            }

            if (result.DimensionCount != src.DimensionCount)
            {
                throw new InvalidOperationException("result and src must have same number of dimensions");
            }
            if (dim < 0 || dim >= result.DimensionCount)
            {
                throw new ArgumentOutOfRangeException("dim");
            }
            if (indices.DimensionCount != src.DimensionCount)
            {
                throw new InvalidOperationException("src and indices must have same number of dimensions");
            }
            if (!src.IsSameSizeAs(indices))
            {
                throw new InvalidOperationException("src and indices must be the same size");
            }
            if (!TensorResultBuilder.ArrayEqualExcept(src.Shape, result.Shape, dim))
            {
                throw new InvalidOperationException("result and src must be the same size except in dimension dim");
            }

            var writeTarget = result;

            var nElement = indices.ElementCount();
            var block    = ApplyUtils.GetApplyBlock();
            var grid     = ApplyUtils.GetApplyGrid(context.DeviceInfoForContext(cudaContext), nElement);

            if (ApplyUtils.CanUse32BitIndexMath(writeTarget) &&
                ApplyUtils.CanUse32BitIndexMath(src) &&
                ApplyUtils.CanUse32BitIndexMath(indices))
            {
                var dims       = indices.DimensionCount <= 3 ? indices.DimensionCount : -1;
                var kernelName = MakeKernelName(ScatterBaseName, true, dims);
                Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, true,
                       writeTarget, src, indices, dim, (int)nElement);
            }
            else
            {
                var kernelName = MakeKernelName(ScatterBaseName, false, -1);
                Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, false,
                       writeTarget, src, indices, dim, (long)nElement);
            }

            return(writeTarget);
        }
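The validation above matches the usual Torch-style scatter contract; assuming the CUDA kernel implements the same semantics, for dim == 0 each element is written as result[indices[i, j], j] = src[i, j]. A 2-D CPU reference sketch (hypothetical helper, not the kernel itself):

        // Torch-style scatter for 2-D arrays and dim == 0 (assumed kernel semantics):
        // the value src[i, j] is written to row indices[i, j] of the result, same column.
        static void Scatter2DDim0(float[,] result, float[,] src, int[,] indices)
        {
            for (int i = 0; i < src.GetLength(0); i++)
            {
                for (int j = 0; j < src.GetLength(1); j++)
                {
                    result[indices[i, j], j] = src[i, j];
                }
            }
        }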
Example #10
        /// <summary>
        /// Computes the forward pass of spatial max pooling.
        /// </summary>
        /// <param name="input">The input tensor.</param>
        /// <param name="output">The output tensor.</param>
        /// <param name="indices">Receives the indices of the max elements, for use in the backward pass.</param>
        /// <param name="cd">The 2D pooling descriptor (kernel size, stride, padding).</param>
        /// <param name="ceilMode">if set to <c>true</c>, output sizes are computed with ceiling instead of floor.</param>
        public void SpatialMaxPoolingForward(NDArray input, NDArray output, NDArray indices, ConvolutionDesc2d cd, bool ceilMode)
        {
            var context     = CudaHelpers.TSContextForTensor(input);
            var cudaContext = context.CudaContextForTensor(input);

            var iwidth      = input.Shape[3];
            var iheight     = input.Shape[2];
            var nInputPlane = input.Shape[1];
            var batchSize   = input.Shape[0];

            long owidth;
            long oheight;

            if (ceilMode)
            {
                oheight = (long)(Math.Ceiling((float)(iheight - cd.kH + 2 * cd.padH) / cd.dH)) + 1;
                owidth  = (long)(Math.Ceiling((float)(iwidth - cd.kW + 2 * cd.padW) / cd.dW)) + 1;
            }
            else
            {
                oheight = (long)(Math.Floor((float)(iheight - cd.kH + 2 * cd.padH) / cd.dH)) + 1;
                owidth  = (long)(Math.Floor((float)(iwidth - cd.kW + 2 * cd.padW) / cd.dW)) + 1;
            }

            if (cd.padW != 0 || cd.padH != 0)
            {
                // ensure that the last pooling starts inside the image
                if ((oheight - 1) * cd.dH >= iheight + cd.padH)
                {
                    --oheight;
                }
                if ((owidth - 1) * cd.dW >= iwidth + cd.padW)
                {
                    --owidth;
                }
            }

            using (var inputContig = Ops.AsContiguous(input))
            {
                var inputPtr   = CudaHelpers.GetBufferStart(inputContig);
                var outputPtr  = CudaHelpers.GetBufferStart(output);
                var indicesPtr = CudaHelpers.GetBufferStart(indices);

                var count = (int)output.ElementCount();

                Invoke(context, cudaContext, "MaxPoolForward", new dim3(NNThreads.NumBlocks(count)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream,
                       count, inputPtr, batchSize, nInputPlane, iheight, iwidth, oheight, owidth,
                       cd.kH, cd.kW, cd.dH, cd.dW, cd.padH, cd.padW, outputPtr, indicesPtr);
            }
        }
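The output size follows the standard pooling formula oheight = floor((iheight - kH + 2 * padH) / dH) + 1, with ceiling instead of floor in ceil mode. For example, iheight = 7, kH = 3, padH = 1, dH = 2 gives floor(6 / 2) + 1 = 4. A small helper capturing just that calculation; PooledSize is a hypothetical name:

        // Output size of a pooled dimension: floor/ceil((inSize - k + 2 * pad) / stride) + 1.
        // Example: inSize = 7, k = 3, pad = 1, stride = 2, ceilMode = false  ->  4.
        static long PooledSize(long inSize, int k, int pad, int stride, bool ceilMode)
        {
            double span = (double)(inSize - k + 2 * pad) / stride;
            return (long)(ceilMode ? Math.Ceiling(span) : Math.Floor(span)) + 1;
        }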
Example #11
        public override void FlattenParams(NDArray parameters, NDArray gradParameters)
        {
            var weightSize = weights.ElementCount();
            var biasSize   = bias.ElementCount();

            weights.TVar().View(weightSize)
            .Evaluate(parameters.TVar().Narrow(0, 0, weightSize));

            bias.TVar().View(biasSize)
            .Evaluate(parameters.TVar().Narrow(0, weightSize, biasSize));

            gradWeights.TVar().View(weightSize)
            .Evaluate(gradParameters.TVar().Narrow(0, 0, weightSize));

            gradBias.TVar().View(biasSize)
            .Evaluate(gradParameters.TVar().Narrow(0, weightSize, biasSize));
        }
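The flattened layout is simply the weights followed by the bias, with gradParameters laid out identically. A plain-array sketch of that layout; FlattenInto is a hypothetical helper:

        // parameters = [ weights (weightSize elements) | bias (biasSize elements) ].
        static void FlattenInto(float[] weights, float[] bias, float[] parameters)
        {
            Array.Copy(weights, 0, parameters, 0, weights.Length);
            Array.Copy(bias, 0, parameters, weights.Length, bias.Length);
        }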
Example #12
        public void CopyGpuToCpu(
            [OpArgStorageType(typeof(Cpu.CpuStorage))] NDArray result,
            [OpArgStorageType(typeof(CudaStorage))] NDArray src)
        {
            var totalElements = result.ElementCount();

            if (totalElements != src.ElementCount())
            {
                throw new InvalidOperationException("Tensors must have equal numbers of elements");
            }

            if (src.DimensionCount == 0)
            {
                return;
            }

            copyOps.CopyGpuToCpu(result, src, totalElements);
        }
Example #13
        /// <summary>
        /// Invokes the specified reduce all kernels.
        /// </summary>
        /// <param name="reduceAllKernels">The reduce all kernels.</param>
        /// <param name="init">The initialize.</param>
        /// <param name="initType">Type of the initialize.</param>
        /// <param name="kernelName">Name of the kernel.</param>
        /// <param name="result">The result.</param>
        /// <param name="src">The source.</param>
        /// <param name="extraArg">The extra argument.</param>
        /// <returns>Tensor.</returns>
        /// <exception cref="InvalidOperationException">Thrown when the tensor has more than TSCudaContext.MaxDims dimensions.</exception>
        public static NDArray Invoke(CudaReduceAllKernels reduceAllKernels, float init, ReduceInitType initType, string kernelName, NDArray result, NDArray src, object extraArg = null)
        {
            var deviceId    = CudaHelpers.GetDeviceId(src);
            var context     = CudaHelpers.TSContextForTensor(src);
            var cudaContext = context.CudaContextForDevice(deviceId);

            if (src.DimensionCount > TSCudaContext.MaxDims)
            {
                throw new InvalidOperationException("Tensors with dimension count > " + TSCudaContext.MaxDims + " are not supported");
            }

            var writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, 1);

            if (src.DimensionCount == 0)
            {
                return(result);
            }

            var    totalElements      = src.ElementCount();
            var    config             = new ApplySpecialization(src);
            object totalElementsTyped = config.Use32BitIndices ? (uint)totalElements : (ulong)totalElements;
            object initValueTyped     = ReduceInitConverter.GetInitValue(init, initType, src.ElementType);

            dim3 grid;
            dim3 block;

            var ptx            = reduceAllKernels.GetPtx(context.Compiler);
            var fullKernelName = PermutationGenerator.GetMangledName(kernelName, config);

            var outputDevicePtr = CudaHelpers.GetBufferStart(writeTarget);

            if (isTwoPassReductionSize(totalElements))
            {
                getPass1ReduceBlockGrid(context, deviceId, totalElements, out grid, out block);
                uint smemSize = block.x * sizeof(float);

                var scratchSpace = context.ScratchSpaceForDevice(deviceId).buffer;

                if (extraArg == null)
                {
                    InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace);
                }
                else
                {
                    InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace, extraArg);
                }


                uint numPass1Blocks = grid.x;
                getPass2ReduceBlockGrid(context, deviceId, totalElements, out grid, out block);
                smemSize = block.x * sizeof(float);

                InvokeReduceAllPass2(context, cudaContext, ptx, "twoPassB_" + fullKernelName, grid, block, smemSize, config.Use32BitIndices, numPass1Blocks, initValueTyped, scratchSpace, outputDevicePtr);
            }
            else
            {
                getSinglePassReduceBlockGrid(totalElements, out grid, out block);
                uint smemSize = block.x * sizeof(float);

                if (extraArg == null)
                {
                    InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr);
                }
                else
                {
                    InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr, extraArg);
                }
            }

            return(writeTarget);
        }
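For large inputs the reduction runs in two passes, mirroring the common CUDA full-reduction pattern: pass A ("twoPassA_…") reduces chunks of the input into per-block partial results in scratch space, and pass B ("twoPassB_…") reduces those partials into the final value. A CPU sketch of that structure for a sum reduction; TwoPassSum and the block count are illustrative only:

        // Pass A: each "block" reduces its chunk into a partial result (the scratch space).
        // Pass B: the partial results are reduced into the final scalar.
        static float TwoPassSum(float[] src, int numBlocks)
        {
            var scratch = new float[numBlocks];
            int chunk   = (src.Length + numBlocks - 1) / numBlocks;

            for (int b = 0; b < numBlocks; b++)                                        // pass A
            {
                float acc = 0.0f;                                                      // init value
                for (int i = b * chunk; i < Math.Min((b + 1) * chunk, src.Length); i++)
                {
                    acc += src[i];
                }
                scratch[b] = acc;
            }

            float result = 0.0f;                                                       // pass B
            for (int b = 0; b < numBlocks; b++)
            {
                result += scratch[b];
            }
            return result;
        }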
Example #14
        /// <summary>
        /// Invokes the specified reduce kernels.
        /// </summary>
        /// <param name="reduceKernels">The reduce kernels.</param>
        /// <param name="kernelName">Name of the kernel.</param>
        /// <param name="init">The initialize.</param>
        /// <param name="initType">Type of the initialize.</param>
        /// <param name="result">The result.</param>
        /// <param name="src">The source.</param>
        /// <param name="dim">The dim.</param>
        /// <param name="extraArg">The extra argument.</param>
        /// <returns>Tensor.</returns>
        public static NDArray Invoke(CudaReduceKernels reduceKernels, string kernelName, float init, ReduceInitType initType, NDArray result, NDArray src, int dim, object extraArg = null)
        {
            if (src.DimensionCount == 0)
            {
                return(result);
            }

            var context     = CudaHelpers.TSContextForTensor(src);
            var cudaContext = context.CudaContextForTensor(src);

            var requiredOutputSize = (long[])src.Shape.Clone();

            requiredOutputSize[dim] = 1;
            var writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, requiredOutputSize);

            ThrowIfAnyTensorInvalid(writeTarget, src);

            var inElements      = src.ElementCount();
            var reductionSize   = src.Shape[dim];
            var reductionStride = src.Strides[dim];
            var outElements     = inElements / reductionSize;
            var contigReduction = reductionStride == 1;


            // We must make sure that when the tensor is passed to the kernel, src.Shape[dim] is set to 1.
            // This also applies when determining which tensor specializations to use (changing
            // the dimension size to 1 may make the tensor non-contiguous).
            var newSizes = (long[])src.Shape.Clone();

            newSizes[dim] = 1;
            var srcSlim = new NDArray(newSizes, src.Strides, src.Storage, src.StorageOffset);

            var    config               = new ApplySpecialization(writeTarget, srcSlim);
            object totalSlices          = config.Use32BitIndices ? (uint)outElements : (ulong)outElements;
            object reductionSizeTyped   = config.Use32BitIndices ? (uint)reductionSize : (ulong)reductionSize;
            object reductionStrideTyped = config.Use32BitIndices ? (uint)reductionStride : (ulong)reductionStride;
            object initValueTyped       = ReduceInitConverter.GetInitValue(init, initType, src.ElementType);

            var ptx = reduceKernels.GetPtx(context.Compiler);

            if (contigReduction)
            {
                var  block    = GetContigReduceBlock(cudaContext, outElements, reductionSize);
                var  grid     = GetContigReduceGrid(outElements);
                uint smemSize = (uint)src.ElementType.Size() * block.x;

                var fullName = "contig_" + PermutationGenerator.GetMangledName(kernelName, config);
                if (extraArg == null)
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped);
                }
                else
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped, extraArg);
                }
            }
            else
            {
                var  deviceProps = context.DeviceInfoForContext(cudaContext);
                var  block       = GetNonContigReduceBlock(deviceProps);
                var  grid        = GetNoncontigReduceGrid(deviceProps, outElements);
                uint smemSize    = 0;

                var fullName = "noncontig_" + PermutationGenerator.GetMangledName(kernelName, config);
                if (extraArg == null)
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped);
                }
                else
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped, extraArg);
                }
            }

            return(writeTarget);
        }
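Each of the outElements = inElements / reductionSize output values is produced by reducing one slice of reductionSize input elements; when reductionStride == 1 the slice is contiguous in memory, which is the "contig_" path. A CPU sketch of that contiguous case for a sum reduction; ReduceContig is a hypothetical name:

        // Contiguous-path sketch: the input is viewed as outElements slices of
        // reductionSize adjacent elements, and each slice is reduced to one output value.
        static float[] ReduceContig(float[] src, int reductionSize)
        {
            int outElements = src.Length / reductionSize;
            var result      = new float[outElements];
            for (int slice = 0; slice < outElements; slice++)
            {
                float acc = 0.0f;
                for (int i = 0; i < reductionSize; i++)
                {
                    acc += src[slice * reductionSize + i];
                }
                result[slice] = acc;
            }
            return result;
        }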
Example #15
        /// <summary>
        /// Selects slices of the source tensor along the given dimension, according to the given indices.
        /// </summary>
        /// <param name="result">The result.</param>
        /// <param name="src">The source.</param>
        /// <param name="dim">The dim.</param>
        /// <param name="indices">The indices.</param>
        /// <returns>Tensor.</returns>
        public NDArray IndexSelect(NDArray result, NDArray src, int dim, NDArray indices)
        {
            var context     = CudaHelpers.TSContextForTensor(src);
            var cudaContext = context.CudaContextForTensor(src);

            var requiredOutputSize = (long[])src.Shape.Clone();

            requiredOutputSize[dim] = 1;
            var writeTarget = TensorResultBuilder.GetWriteTarget(result, src, true, requiredOutputSize);


            // The `src` is partitioned into two parts:
            //  - the size of each slice we are indexing, which is the
            //    total size of the tensor ignoring dimension `dim`;
            //  - the number of indices we are choosing, which is the total size
            //    of the tensor `indices`.
            var numIndices       = indices.ElementCount();
            var dstTotalSize     = writeTarget.ElementCount();
            var srcSelectDimSize = src.Shape[dim];
            var sliceSize        = dstTotalSize / numIndices;

            var mpc             = context.DeviceInfoForContext(cudaContext).MultiProcessorCount;
            var smallIndexGrid  = new dim3((uint)Math.Min(ApplyUtils.CeilDiv(sliceSize, 128), (mpc * 8)));
            var smallIndexBlock = new dim3((uint)Math.Min(sliceSize, 128));

            var largeIndexGrid  = new dim3((uint)Math.Min(ApplyUtils.CeilDiv(dstTotalSize, 128), (mpc * 8)));
            var largeIndexBlock = new dim3((uint)Math.Min(dstTotalSize, 128));


            var newResultSize = (long[])writeTarget.Shape.Clone();

            newResultSize[dim] = 1;
            var resultFlat = new NDArray(newResultSize, writeTarget.Strides, writeTarget.Storage, writeTarget.StorageOffset);

            var newSrcSize = (long[])src.Shape.Clone();

            newSrcSize[dim] = 1;
            var srcFlat = new NDArray(newSrcSize, src.Strides, src.Storage, src.StorageOffset);


            if (ApplyUtils.CanUse32BitIndexMath(writeTarget) &&
                ApplyUtils.CanUse32BitIndexMath(src) &&
                ApplyUtils.CanUse32BitIndexMath(indices))
            {
                // Threshold for small kernel
                var    smallKernel = numIndices <= 16;
                string kernelName  = "";
                var    indContig   = indices.IsContiguous();

                if (writeTarget.DimensionCount == src.DimensionCount &&
                    writeTarget.DimensionCount <= 3 &&
                    indContig)
                {
                    kernelName = MakeKernelName(smallKernel, true, writeTarget.DimensionCount, src.DimensionCount, -2);
                }
                else
                {
                    kernelName = MakeKernelName(smallKernel, true, -1, -1, -1);
                }

                var grid  = smallKernel ? smallIndexGrid : largeIndexGrid;
                var block = smallKernel ? smallIndexBlock : largeIndexBlock;
                Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, true,
                       writeTarget, src, indices, dim, dim, sliceSize, srcSelectDimSize);
            }
            else
            {
                var kernelName = MakeKernelName(false, false, -1, -1, -1);

                Invoke(context, cudaContext, kernelName, largeIndexGrid, largeIndexBlock, 0, CUstream.NullStream, false,
                       writeTarget, src, indices, dim, dim, dstTotalSize, sliceSize, srcSelectDimSize);
            }



            return(writeTarget);
        }
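For reference, index-select gathers whole slices: assuming the usual Torch-style semantics, slice j of the result along dim is slice indices[j] of src. A 2-D CPU sketch for dim == 0 (hypothetical helper, not the CUDA kernel):

        // Index-select for 2-D arrays and dim == 0: row j of the result is row indices[j] of src.
        static float[,] IndexSelectDim0(float[,] src, int[] indices)
        {
            var cols   = src.GetLength(1);
            var result = new float[indices.Length, cols];
            for (int j = 0; j < indices.Length; j++)
            {
                for (int c = 0; c < cols; c++)
                {
                    result[j, c] = src[indices[j], c];
                }
            }
            return result;
        }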