/// <summary>
/// Computes the mean of all elements of src, writing a scalar result.
/// </summary>
public NDArray MeanAll(NDArray result, NDArray src)
{
    if (src.DimensionCount == 0 || src.ElementCount() == 0)
    {
        throw new ArgumentException("src must be a non-empty tensor");
    }

    var writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, 1);
    SumAll(writeTarget, src);
    Div(writeTarget, writeTarget, src.ElementCount());
    return writeTarget;
}
/// <summary>
/// Computes the sample variance (dividing by N - 1) of all elements of src, writing a scalar result.
/// </summary>
public NDArray VarAll(NDArray result, NDArray src)
{
    if (src.DimensionCount == 0 || src.ElementCount() == 0)
    {
        throw new ArgumentException("src must be a non-empty tensor");
    }

    var mean = Ops.MeanAll(src);
    var writeTarget = ReduceAllOp.Invoke(cudaReduceAllKernels, 0.0f, ReduceInitType.GivenValue, "en_norm", result, src, mean);
    Div(writeTarget, writeTarget, src.ElementCount() - 1);
    return writeTarget;
}
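// Reference sketch (illustrative only, not part of the library): MeanAll and
// VarAll above compute the arithmetic mean and the Bessel-corrected sample
// variance. Assuming a float[] stand-in for the tensor's elements, a serial
// equivalent looks like this:
private static float MeanRef(float[] xs)
{
    var sum = 0.0f;
    foreach (var x in xs) { sum += x; }  // SumAll step
    return sum / xs.Length;              // Div by element count
}

private static float VarRef(float[] xs)
{
    var mu = MeanRef(xs);
    var ss = 0.0f;
    foreach (var x in xs) { ss += (x - mu) * (x - mu); }  // squared-deviation reduction
    return ss / (xs.Length - 1);                          // divide by N - 1, as in VarAll
}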
/// <summary> /// Determines whether this instance [can use32 bit index math] the specified tensor. /// </summary> /// <param name="tensor">The tensor.</param> /// <returns><c>true</c> if this instance [can use32 bit index math] the specified tensor; otherwise, <c>false</c>.</returns> public static bool CanUse32BitIndexMath(NDArray tensor) { var elements = tensor.ElementCount(); if (elements >= uint.MaxValue) { return(false); } long offset = 0; long linearId = elements - 1; for (int i = tensor.DimensionCount - 1; i >= 0; --i) { var curDimIndex = linearId % tensor.Shape[i]; var curDimOffset = curDimIndex * tensor.Strides[i]; offset += curDimOffset; linearId /= tensor.Shape[i]; } if (offset >= uint.MaxValue) { return(false); } return(true); }
/// <summary> /// Invokes the specified kernels. /// </summary> /// <param name="kernels">The kernels.</param> /// <param name="context">The context.</param> /// <param name="cudaContext">The cuda context.</param> /// <param name="result">The result.</param> /// <param name="src">The source.</param> public static void Invoke(FillCopyKernels kernels, TSCudaContext context, CudaContext cudaContext, NDArray result, NDArray src) { var ptx = kernels.GetPtx(context.Compiler); var elementCount = result.ElementCount(); ApplyOpInvoke.Invoke(context, cudaContext, ptx, "copy", result, src, elementCount); }
/// <summary> /// Spatials the maximum pooling backward. /// </summary> /// <param name="input">The input.</param> /// <param name="gradOutput">The grad output.</param> /// <param name="gradInput">The grad input.</param> /// <param name="indices">The indices.</param> /// <param name="cd">The cd.</param> /// <param name="ceilMode">if set to <c>true</c> [ceil mode].</param> public void SpatialMaxPoolingBackward(NDArray input, NDArray gradOutput, NDArray gradInput, NDArray indices, ConvolutionDesc2d cd, bool ceilMode) { var context = CudaHelpers.TSContextForTensor(gradOutput); var cudaContext = context.CudaContextForTensor(gradOutput); var dimw = 3; var dimh = 2; var dimc = 1; var nbatch = input.Shape[0]; var nslices = input.Shape[dimc]; var iheight = input.Shape[dimh]; var iwidth = input.Shape[dimw]; var owidth = gradOutput.Shape[dimw]; var oheight = gradOutput.Shape[dimh]; using (var gradOutputContig = Ops.AsContiguous(gradOutput)) { var gradOutputPtr = CudaHelpers.GetBufferStart(gradOutputContig); var indicesPtr = CudaHelpers.GetBufferStart(indices); var gradInputPtr = CudaHelpers.GetBufferStart(gradInput); var count = (int)input.ElementCount(); Invoke(context, cudaContext, "MaxPoolBackward", new dim3(NNThreads.NumBlocks(count)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream, count, gradOutputPtr, indicesPtr, nbatch, nslices, iheight, iwidth, oheight, owidth, cd.kH, cd.kW, cd.dH, cd.dW, cd.padH, cd.padW, gradInputPtr); } }
public void Copy(NDArray result, NDArray src)
{
    if (result.ElementCount() != src.ElementCount())
    {
        throw new InvalidOperationException("Tensors must have equal numbers of elements");
    }

    NativeWrapper.Invoke(copy_func, result, src);
}
/// <summary> /// Invokes the specified kernels. /// </summary> /// <param name="kernels">The kernels.</param> /// <param name="result">The result.</param> /// <param name="value">The value.</param> public static void Invoke(FillCopyKernels kernels, NDArray result, float value) { var context = CudaHelpers.TSContextForTensor(result); var cudaContext = context.CudaContextForTensor(result); var ptx = kernels.GetPtx(context.Compiler); var elementCount = result.ElementCount(); ApplyOpInvoke.Invoke(context, cudaContext, ptx, "fill", result, value, elementCount); }
public NDArray UpdateGradInput(NDArray input, NDArray target)
{
    var norm = 2.0f / input.ElementCount();

    ((input.TVar() - target) * norm)
        .Evaluate(gradInput);

    return gradInput;
}
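// Derivation of the 2/N factor above: this is the gradient of the
// mean-squared-error criterion. With L = (1/N) * sum_i (input_i - target_i)^2,
// dL/dinput_i = (2/N) * (input_i - target_i), which is exactly
// (input - target) * norm.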
/// <summary> /// Scatters the specified result. /// </summary> /// <param name="result">The result.</param> /// <param name="src">The source.</param> /// <param name="dim">The dim.</param> /// <param name="indices">The indices.</param> /// <returns>Tensor.</returns> /// <exception cref="ArgumentNullException">result</exception> /// <exception cref="InvalidOperationException"> /// result and src must have same number of dimensions /// or /// src and indices must have same number of dimensions /// or /// src and indices must be the same size /// or /// result and src must be the same size except in dimension dim /// </exception> /// <exception cref="ArgumentOutOfRangeException">dim</exception> public NDArray Scatter(NDArray result, NDArray src, int dim, NDArray indices) { var context = CudaHelpers.TSContextForTensor(src); var cudaContext = context.CudaContextForTensor(src); if (result == null) { throw new ArgumentNullException("result"); } if (result.DimensionCount != src.DimensionCount) { throw new InvalidOperationException("result and src must have same number of dimensions"); } if (dim < 0 && dim >= result.DimensionCount) { throw new ArgumentOutOfRangeException("dim"); } if (indices.DimensionCount != src.DimensionCount) { throw new InvalidOperationException("src and indices must have same number of dimensions"); } if (!src.IsSameSizeAs(indices)) { throw new InvalidOperationException("src and indices must be the same size"); } if (!TensorResultBuilder.ArrayEqualExcept(src.Shape, result.Shape, dim)) { throw new InvalidOperationException("result and src must be the same size except in dimension dim"); } var writeTarget = result; var nElement = indices.ElementCount(); var block = ApplyUtils.GetApplyBlock(); var grid = ApplyUtils.GetApplyGrid(context.DeviceInfoForContext(cudaContext), nElement); if (ApplyUtils.CanUse32BitIndexMath(writeTarget) && ApplyUtils.CanUse32BitIndexMath(src) && ApplyUtils.CanUse32BitIndexMath(indices)) { var dims = indices.DimensionCount <= 3 ? indices.DimensionCount : -1; var kernelName = MakeKernelName(ScatterBaseName, true, dims); Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, true, writeTarget, src, indices, dim, (int)nElement); } else { var kernelName = MakeKernelName(ScatterBaseName, false, -1); Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, false, writeTarget, src, indices, dim, (long)nElement); } return(writeTarget); }
/// <summary> /// Spatials the maximum pooling forward. /// </summary> /// <param name="input">The input.</param> /// <param name="output">The output.</param> /// <param name="indices">The indices.</param> /// <param name="cd">The cd.</param> /// <param name="ceilMode">if set to <c>true</c> [ceil mode].</param> public void SpatialMaxPoolingForward(NDArray input, NDArray output, NDArray indices, ConvolutionDesc2d cd, bool ceilMode) { var context = CudaHelpers.TSContextForTensor(input); var cudaContext = context.CudaContextForTensor(input); var iwidth = input.Shape[3]; var iheight = input.Shape[2]; var nInputPlane = input.Shape[1]; var batchSize = input.Shape[0]; long owidth; long oheight; if (ceilMode) { oheight = (long)(Math.Ceiling((float)(iheight - cd.kH + 2 * cd.padH) / cd.dH)) + 1; owidth = (long)(Math.Ceiling((float)(iwidth - cd.kW + 2 * cd.padW) / cd.dW)) + 1; } else { oheight = (long)(Math.Floor((float)(iheight - cd.kH + 2 * cd.padH) / cd.dH)) + 1; owidth = (long)(Math.Floor((float)(iwidth - cd.kW + 2 * cd.padW) / cd.dW)) + 1; } if (cd.padW != 0 || cd.padH != 0) { // ensure that the last pooling starts inside the image if ((oheight - 1) * cd.dH >= iheight + cd.padH) { --oheight; } if ((owidth - 1) * cd.dW >= iwidth + cd.padW) { --owidth; } } using (var inputContig = Ops.AsContiguous(input)) { var inputPtr = CudaHelpers.GetBufferStart(inputContig); var outputPtr = CudaHelpers.GetBufferStart(output); var indicesPtr = CudaHelpers.GetBufferStart(indices); var count = (int)output.ElementCount(); Invoke(context, cudaContext, "MaxPoolForward", new dim3(NNThreads.NumBlocks(count)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream, count, inputPtr, batchSize, nInputPlane, iheight, iwidth, oheight, owidth, cd.kH, cd.kW, cd.dH, cd.dW, cd.padH, cd.padW, outputPtr, indicesPtr); } }
public override void FlattenParams(NDArray parameters, NDArray gradParameters)
{
    var weightSize = weights.ElementCount();
    var biasSize = bias.ElementCount();

    weights.TVar().View(weightSize)
        .Evaluate(parameters.TVar().Narrow(0, 0, weightSize));

    bias.TVar().View(biasSize)
        .Evaluate(parameters.TVar().Narrow(0, weightSize, biasSize));

    gradWeights.TVar().View(weightSize)
        .Evaluate(gradParameters.TVar().Narrow(0, 0, weightSize));

    gradBias.TVar().View(biasSize)
        .Evaluate(gradParameters.TVar().Narrow(0, weightSize, biasSize));
}
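// Layout note (illustrative): after FlattenParams, `parameters` is a flat
// vector holding all weights followed by all biases. For a hypothetical 3 x 2
// weight matrix and a 2-element bias, parameters[0..5] view the weights and
// parameters[6..7] view the biases; gradParameters mirrors the same layout.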
public void CopyGpuToCpu(
    [OpArgStorageType(typeof(Cpu.CpuStorage))] NDArray result,
    [OpArgStorageType(typeof(CudaStorage))] NDArray src)
{
    var totalElements = result.ElementCount();
    if (totalElements != src.ElementCount())
    {
        throw new InvalidOperationException("Tensors must have equal numbers of elements");
    }

    if (src.DimensionCount == 0)
    {
        return;
    }

    copyOps.CopyGpuToCpu(result, src, totalElements);
}
/// <summary> /// Invokes the specified reduce all kernels. /// </summary> /// <param name="reduceAllKernels">The reduce all kernels.</param> /// <param name="init">The initialize.</param> /// <param name="initType">Type of the initialize.</param> /// <param name="kernelName">Name of the kernel.</param> /// <param name="result">The result.</param> /// <param name="src">The source.</param> /// <param name="extraArg">The extra argument.</param> /// <returns>Tensor.</returns> /// <exception cref="InvalidOperationException">Tensors with dimension count > " + TSCudaContext.MaxDims + " are not supported</exception> public static NDArray Invoke(CudaReduceAllKernels reduceAllKernels, float init, ReduceInitType initType, string kernelName, NDArray result, NDArray src, object extraArg = null) { var deviceId = CudaHelpers.GetDeviceId(src); var context = CudaHelpers.TSContextForTensor(src); var cudaContext = context.CudaContextForDevice(deviceId); if (src.DimensionCount > TSCudaContext.MaxDims) { throw new InvalidOperationException("Tensors with dimension count > " + TSCudaContext.MaxDims + " are not supported"); } var writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, 1); if (src.DimensionCount == 0) { return(result); } var totalElements = src.ElementCount(); var config = new ApplySpecialization(src); object totalElementsTyped = config.Use32BitIndices ? (uint)totalElements : (ulong)totalElements; object initValueTyped = ReduceInitConverter.GetInitValue(init, initType, src.ElementType); dim3 grid; dim3 block; var ptx = reduceAllKernels.GetPtx(context.Compiler); var fullKernelName = PermutationGenerator.GetMangledName(kernelName, config); var outputDevicePtr = CudaHelpers.GetBufferStart(writeTarget); if (isTwoPassReductionSize(totalElements)) { getPass1ReduceBlockGrid(context, deviceId, totalElements, out grid, out block); uint smemSize = block.x * sizeof(float); var scratchSpace = context.ScratchSpaceForDevice(deviceId).buffer; if (extraArg == null) { InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace); } else { InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace, extraArg); } uint numPass1Blocks = grid.x; getPass2ReduceBlockGrid(context, deviceId, totalElements, out grid, out block); smemSize = block.x * sizeof(float); InvokeReduceAllPass2(context, cudaContext, ptx, "twoPassB_" + fullKernelName, grid, block, smemSize, config.Use32BitIndices, numPass1Blocks, initValueTyped, scratchSpace, outputDevicePtr); } else { getSinglePassReduceBlockGrid(totalElements, out grid, out block); uint smemSize = block.x * sizeof(float); if (extraArg == null) { InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr); } else { InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr, extraArg); } } return(writeTarget); }
/// <summary> /// Invokes the specified reduce kernels. /// </summary> /// <param name="reduceKernels">The reduce kernels.</param> /// <param name="kernelName">Name of the kernel.</param> /// <param name="init">The initialize.</param> /// <param name="initType">Type of the initialize.</param> /// <param name="result">The result.</param> /// <param name="src">The source.</param> /// <param name="dim">The dim.</param> /// <param name="extraArg">The extra argument.</param> /// <returns>Tensor.</returns> public static NDArray Invoke(CudaReduceKernels reduceKernels, string kernelName, float init, ReduceInitType initType, NDArray result, NDArray src, int dim, object extraArg = null) { if (src.DimensionCount == 0) { return(result); } var context = CudaHelpers.TSContextForTensor(src); var cudaContext = context.CudaContextForTensor(src); var requiredOutputSize = (long[])src.Shape.Clone(); requiredOutputSize[dim] = 1; var writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, requiredOutputSize); ThrowIfAnyTensorInvalid(writeTarget, src); var inElements = src.ElementCount(); var reductionSize = src.Shape[dim]; var reductionStride = src.Strides[dim]; var outElements = inElements / reductionSize; var contigReduction = reductionStride == 1; // We must make sure that when the tensor is passed to the kernel, src.Sizes[dim] is set to 1 // This includes for the purposes of determining which tensor specializations to use (changing // the dimension size to 1 may make the tensor non-contiguous var newSizes = (long[])src.Shape.Clone(); newSizes[dim] = 1; var srcSlim = new NDArray(newSizes, src.Strides, src.Storage, src.StorageOffset); var config = new ApplySpecialization(writeTarget, srcSlim); object totalSlices = config.Use32BitIndices ? (uint)outElements : (ulong)outElements; object reductionSizeTyped = config.Use32BitIndices ? (uint)reductionSize : (ulong)reductionSize; object reductionStrideTyped = config.Use32BitIndices ? (uint)reductionStride : (ulong)reductionStride; object initValueTyped = ReduceInitConverter.GetInitValue(init, initType, src.ElementType); var ptx = reduceKernels.GetPtx(context.Compiler); if (contigReduction) { var block = GetContigReduceBlock(cudaContext, outElements, reductionSize); var grid = GetContigReduceGrid(outElements); uint smemSize = (uint)src.ElementType.Size() * block.x; var fullName = "contig_" + PermutationGenerator.GetMangledName(kernelName, config); if (extraArg == null) { InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped); } else { InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped, extraArg); } } else { var deviceProps = context.DeviceInfoForContext(cudaContext); var block = GetNonContigReduceBlock(deviceProps); var grid = GetNoncontigReduceGrid(deviceProps, outElements); uint smemSize = 0; var fullName = "noncontig_" + PermutationGenerator.GetMangledName(kernelName, config); if (extraArg == null) { InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped); } else { InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped, extraArg); } } return(writeTarget); }
/// <summary> /// Indexes the select. /// </summary> /// <param name="result">The result.</param> /// <param name="src">The source.</param> /// <param name="dim">The dim.</param> /// <param name="indices">The indices.</param> /// <returns>Tensor.</returns> public NDArray IndexSelect(NDArray result, NDArray src, int dim, NDArray indices) { var context = CudaHelpers.TSContextForTensor(src); var cudaContext = context.CudaContextForTensor(src); var requiredOutputSize = (long[])src.Shape.Clone(); requiredOutputSize[dim] = 1; var writeTarget = TensorResultBuilder.GetWriteTarget(result, src, true, requiredOutputSize); // The `src` is partitioned into two parts: // -the size of each slice we are indexing, which is the // total size of the tensor ignoring dimension `dim`; // -the number of indices we are choosing, which is the total size // of the tensor `indices`. var numIndices = indices.ElementCount(); var dstTotalSize = writeTarget.ElementCount(); var srcSelectDimSize = src.Shape[dim]; var sliceSize = dstTotalSize / numIndices; var mpc = context.DeviceInfoForContext(cudaContext).MultiProcessorCount; var smallIndexGrid = new dim3((uint)Math.Min(ApplyUtils.CeilDiv(sliceSize, 128), (mpc * 8))); var smallIndexBlock = new dim3((uint)Math.Min(sliceSize, 128)); var largeIndexGrid = new dim3((uint)Math.Min(ApplyUtils.CeilDiv(dstTotalSize, 128), (mpc * 8))); var largeIndexBlock = new dim3((uint)Math.Min(dstTotalSize, 128)); var newResultSize = (long[])writeTarget.Shape.Clone(); newResultSize[dim] = 1; var resultFlat = new NDArray(newResultSize, writeTarget.Strides, writeTarget.Storage, writeTarget.StorageOffset); var newSrcSize = (long[])src.Shape.Clone(); newSrcSize[dim] = 1; var srcFlat = new NDArray(newSrcSize, src.Strides, src.Storage, src.StorageOffset); if (ApplyUtils.CanUse32BitIndexMath(writeTarget) && ApplyUtils.CanUse32BitIndexMath(src) && ApplyUtils.CanUse32BitIndexMath(indices)) { // Threshold for small kernel var smallKernel = numIndices <= 16; string kernelName = ""; var indContig = indices.IsContiguous(); if (writeTarget.DimensionCount == src.DimensionCount && writeTarget.DimensionCount <= 3 && indContig) { kernelName = MakeKernelName(smallKernel, true, writeTarget.DimensionCount, src.DimensionCount, -2); } else { kernelName = MakeKernelName(smallKernel, true, -1, -1, -1); } var grid = smallKernel ? smallIndexGrid : largeIndexGrid; var block = smallKernel ? smallIndexBlock : largeIndexBlock; Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, true, writeTarget, src, indices, dim, dim, sliceSize, srcSelectDimSize); } else { var kernelName = MakeKernelName(false, false, -1, -1, -1); Invoke(context, cudaContext, kernelName, largeIndexGrid, largeIndexBlock, 0, CUstream.NullStream, false, writeTarget, src, indices, dim, dim, dstTotalSize, sliceSize, srcSelectDimSize); } return(writeTarget); }