Esempio n. 1
0
        /// <summary>
        /// Run the Forward computation using the Engine CUDNN mode as specified in the LayerParameter.
        /// </summary>
        /// <remarks>
        /// Each group runs on its own cuDNN handle (m_rghCudnn[g]), and therefore its own
        /// stream, so the per-group convolutions may execute concurrently; the stream
        /// synchronization loops below join that work before continuing.
        /// </remarks>
        /// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>
        /// <param name="colTop">Specifies the collection of top (output) Blobs.</param>
        protected void forward_cudnn(BlobCollection <T> colBottom, BlobCollection <T> colTop)
        {
            long          hWeight = m_colBlobs[0].gpu_data;
            WorkspaceArgs wsArgs  = getWorkspace();

            for (int i = 0; i < colBottom.Count; i++)
            {
                long hBottomData = colBottom[i].gpu_data;
                long hTopData    = colTop[i].mutable_gpu_data;

                // Forward through cuDNN in parallel over groups.
                for (int g = 0; g < m_nGroup; g++)
                {
                    // Filters.
                    m_cuda.ConvolutionForward(m_rghCudnn[g],
                                              m_tOne,
                                              m_rghBottomDesc[i],
                                              hBottomData, m_nBottomOffset * g,
                                              m_hFilterDesc,
                                              hWeight, m_nWeightOffset * g,
                                              m_rghConvDesc[i],
                                              m_rgfwdAlgo[i],
                                              wsArgs.Data, (int)m_rglWorkspaceFwdOffsets[g], m_rglWorkspaceFwdSizes[i],
                                              m_tZero,
                                              m_rghTopDesc[i],
                                              hTopData, m_nTopOffset * g,
                                              false);
                }

                // Synchronize the work across groups, each of which went into its own stream.
                for (int g = 0; g < m_nGroup; g++)
                {
                    m_cuda.SynchronizeStream(m_rghStream[g]);
                }

                // Bias.
                if (m_bBiasTerm)
                {
                    // The bias handle is loop-invariant; read it once rather than
                    // re-querying it on every group iteration.
                    long hBiasData = m_colBlobs[1].gpu_data;

                    for (int g = 0; g < m_nGroup; g++)
                    {
                        m_cuda.AddTensor(m_rghCudnn[g],
                                         m_tOne,
                                         m_hBiasDesc,
                                         hBiasData, m_nBiasOffset * g,
                                         m_tOne,
                                         m_rghTopDesc[i],
                                         hTopData, m_nTopOffset * g);
                    }

                    // Synchronize the work across groups, each of which went into its own stream.
                    for (int g = 0; g < m_nGroup; g++)
                    {
                        m_cuda.SynchronizeStream(m_rghStream[g]);
                    }
                }
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Run the Forward computation with Engine.CUDNN.
        /// </summary>
        /// <remarks>
        /// The forward pass of deconvolution is computed with cuDNN's backward-data
        /// convolution routine (ConvolutionBackwardData), which applies the transpose
        /// of an ordinary convolution; the weight handle is the input-side operand here.
        /// Each group runs on its own cuDNN handle (and therefore its own stream).
        /// </remarks>
        /// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>
        /// <param name="colTop">Specifies the collection of top (output) Blobs.</param>
        protected void forward_cudnn(BlobCollection <T> colBottom, BlobCollection <T> colTop)
        {
            long          hWeight = m_colBlobs[0].gpu_data;
            WorkspaceArgs wsArgs  = getWorkspace();

            for (int i = 0; i < colBottom.Count; i++)
            {
                long hBottomData = colBottom[i].gpu_data;
                long hTopData    = colTop[i].mutable_gpu_data;

                // Forward through cuDNN in parallel over groups.
                for (int g = 0; g < m_nGroup; g++)
                {
                    // Filters: compute top = transposed-convolution(bottom, weights).
                    m_cuda.ConvolutionBackwardData(m_rghCudnn[g],
                                                   m_tOne,
                                                   m_hFilterDesc,
                                                   hWeight, m_nWeightOffset * g,
                                                   m_rghBottomDesc[i],
                                                   hBottomData, m_nBottomOffset * g,
                                                   m_rghConvDesc[i],
                                                   m_rgbwdDataAlgo[i],
                                                   wsArgs.Data, (int)m_rglWorkspaceBwdDataOffsets[g], m_rglWorkspaceBwdDataSizes[i],
                                                   m_tZero,
                                                   m_rghTopDesc[i],
                                                   hTopData, m_nTopOffset * g);

                    // Bias: top += bias, broadcast per the bias tensor descriptor.
                    if (m_bBiasTerm)
                    {
                        long hBiasData = m_colBlobs[1].gpu_data;

                        m_cuda.AddTensor(m_rghCudnn[g],
                                         m_tOne,
                                         m_hBiasDesc,
                                         hBiasData, m_nBiasOffset * g,
                                         m_tOne,
                                         m_rghTopDesc[i],
                                         hTopData, m_nTopOffset * g);
                    }
                }

                // Block the host until the work queued above completes.
                // NOTE(review): the original comment claimed this launches an empty
                // kernel into the default (null) stream, but the call is
                // SynchronizeThread; together with the per-stream and device
                // synchronizations below, this triple sync looks redundant - confirm
                // whether all three are actually required.
                m_cuda.SynchronizeThread();
            }

            for (int g = 0; g < m_nGroup; g++)
            {
                m_cuda.SynchronizeStream(m_rghStream[g]);
            }

            m_cuda.SynchronizeDevice();
        }
        /// <summary>
        /// Returns the WorkspaceArgs containing the workspace used by this Layer.
        /// </summary>
        /// <returns>The workspace provided by the base Layer when one exists;
        /// otherwise a new WorkspaceArgs wrapping this layer's own workspace
        /// allocation, marking this layer as the workspace owner.</returns>
        protected override WorkspaceArgs getWorkspace()
        {
            // Prefer a workspace shared via the base layer, when available.
            WorkspaceArgs argsShared = base.getWorkspace();
            if (argsShared != null)
                return argsShared;

            // No shared workspace exists - this layer owns (and must manage) its own.
            m_bWorkspaceOwner = true;
            return new common.WorkspaceArgs(m_hWorkspaceData, m_lWorkspaceSize);
        }
Esempio n. 4
0
        /// <summary>
        /// Event handler called when a layer requests a (larger) shared workspace;
        /// grows the GPU workspace allocation to the requested size.
        /// </summary>
        /// <param name="sender">Specifies the event sender.</param>
        /// <param name="e">Specifies the WorkspaceArgs carrying the requested size.</param>
        private void layer_OnSetWorkspace(object sender, WorkspaceArgs e)
        {
            // Only grow the workspace.  The original guard used '<', which forced a
            // needless free/re-allocate cycle whenever the requested size exactly
            // matched the current allocation; '<=' skips that case.
            if (e.Size <= m_lWorkspaceSize)
            {
                return;
            }

            m_lWorkspaceSize = e.Size;
            // NOTE(review): ghost memory is disabled around the allocation -
            // presumably so the workspace is backed by a real device allocation;
            // confirm against the CudaDnn ghost-memory documentation.
            m_cuda.DisableGhostMemory();

            // Release the previous allocation (if any) before growing.
            if (m_hWorkspaceData != 0)
            {
                m_cuda.FreeMemory(m_hWorkspaceData);
            }

            m_hWorkspaceData = m_cuda.AllocMemory((long)m_lWorkspaceSize);
            m_cuda.ResetGhostMemory();
        }
Esempio n. 5
0
        /// <summary>
        /// Run the Backward computation using Engine.CUDNN.
        /// </summary>
        /// <remarks>
        /// Deconvolution backward: the bias and filter gradients use the cuDNN backward
        /// routines, while the gradient w.r.t. the bottom data is computed with
        /// ConvolutionForward (the transpose relationship of deconvolution).  Each
        /// gradient kind runs on its own bank of per-group cuDNN handles
        /// (0*, 1*, 2* m_nGroup) so the three gradients can overlap in separate streams.
        /// </remarks>
        /// <param name="colTop">top output Blob vector (length 1).</param>
        /// <param name="rgbPropagateDown">see Layer::Backward</param>
        /// <param name="colBottom">bottom input Blob vector (length 1).</param>
        protected void backward_cudnn(BlobCollection <T> colTop, List <bool> rgbPropagateDown, BlobCollection <T> colBottom)
        {
            long          hWeight     = 0;
            long          hWeightDiff = 0;
            WorkspaceArgs wsArgs      = getWorkspace();

            if (m_rgbParamPropagateDown[0])
            {
                hWeight     = m_colBlobs[0].gpu_data;
                hWeightDiff = m_colBlobs[0].mutable_gpu_diff;
            }

            long hBiasDiff = 0;

            if (m_bBiasTerm && m_rgbParamPropagateDown[1])
            {
                hBiasDiff = m_colBlobs[1].mutable_gpu_diff;
            }

            for (int i = 0; i < colTop.Count; i++)
            {
                long hTopDiff = colTop[i].gpu_diff;

                // Backward through cuDNN in parallel over groups and gradients.
                for (int g = 0; g < m_nGroup; g++)
                {
                    // Gradient w.r.t. bias.
                    if (m_bBiasTerm && m_rgbParamPropagateDown[1])
                    {
                        m_cuda.ConvolutionBackwardBias(m_rghCudnn[0 * m_nGroup + g],
                                                       m_tOne,
                                                       m_rghTopDesc[i],
                                                       hTopDiff, m_nTopOffset * g,
                                                       m_tOne,
                                                       m_hBiasDesc,
                                                       hBiasDiff, m_nBiasOffset * g);
                    }

                    // Gradient w.r.t weights.
                    if (m_rgbParamPropagateDown[0])
                    {
                        long hBottomData = colBottom[i].gpu_data;

                        m_cuda.ConvolutionBackwardFilter(m_rghCudnn[1 * m_nGroup + g],
                                                         m_tOne,
                                                         m_rghTopDesc[i],
                                                         hTopDiff, m_nTopOffset * g,
                                                         m_rghBottomDesc[i],
                                                         hBottomData, m_nBottomOffset * g,
                                                         m_rghConvDesc[i],
                                                         m_rgbwdFilterAlgo[i],
                                                         wsArgs.Data, (int)m_rglWorkspaceBwdFilterOffsets[1 * m_nGroup + g],
                                                         m_rglWorkspaceBwdFilterSizes[i],
                                                         m_tOne,
                                                         m_hFilterDesc,
                                                         hWeightDiff, m_nWeightOffset * g);
                    }

                    // Gradient w.r.t. bottom data.
                    if (rgbPropagateDown[i])
                    {
                        // When weight gradients are disabled, hWeight was never set
                        // above; fetch the weight data handle here.  (BUG FIX: this
                        // previously assigned to hWeightDiff by mistake, leaving
                        // hWeight == 0 in the ConvolutionForward call below.)
                        if (hWeight == 0)
                        {
                            hWeight = m_colBlobs[0].gpu_data;
                        }

                        long hBottomDiff = colBottom[i].mutable_gpu_diff;

                        m_cuda.ConvolutionForward(m_rghCudnn[2 * m_nGroup + g],
                                                  m_tOne,
                                                  m_rghTopDesc[i],
                                                  hTopDiff, m_nTopOffset * g,
                                                  m_hFilterDesc,
                                                  hWeight, m_nWeightOffset * g,
                                                  m_rghConvDesc[i],
                                                  m_rgfwdAlgo[i],
                                                  wsArgs.Data, (int)m_rglWorkspaceFwdOffsets[2 * m_nGroup + g],
                                                  m_rglWorkspaceFwdSizes[i],
                                                  m_tZero,
                                                  m_rghBottomDesc[i],
                                                  hBottomDiff, m_nBottomOffset * g);
                    }
                }

                // Block the host until the per-group work queued above completes.
                m_cuda.SynchronizeThread();
            }

            for (int g = 0; g < m_nGroup; g++)
            {
                m_cuda.SynchronizeStream(m_rghStream[g]);
            }
        }
Esempio n. 6
0
        /// <summary>
        /// Reshape the bottom (input) and top (output) blobs.
        /// </summary>
        /// <remarks>
        /// cuDNN setup for the deconvolution layer: configures the per-bottom tensor
        /// and convolution descriptors, queries cuDNN for the algorithms and workspace
        /// sizes of each operation, sizes a single shared workspace large enough for
        /// all groups and streams, and computes per-group workspace offsets.
        /// </remarks>
        /// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>
        /// <param name="colTop">Specifies the collection of top (output) Blobs.</param>
        public override void Reshape(BlobCollection <T> colBottom, BlobCollection <T> colTop)
        {
            base.Reshape(colBottom, colTop);

            // Nothing further to do unless this layer actually runs on cuDNN.
            if (!m_param.convolution_param.useCudnn(m_nNumSpatialAxes))
            {
                return;
            }

            m_log.CHECK_EQ(2, m_nNumSpatialAxes, "cuDNN Deconvolution input must have 2 spatial axes (e.g., height and width).  Use 'engine: CAFFE' for general ND deconvolution.");

            // Per-group offsets into the flat bottom/top data.
            m_nBottomOffset = m_nBottomDim / m_nGroup;
            m_nTopOffset    = m_nTopDim / m_nGroup;

            int  nHeight    = colBottom[0].shape(m_nChannelAxis + 1);
            int  nWidth     = colBottom[0].shape(m_nChannelAxis + 2);
            int  nHeightOut = colTop[0].shape(m_nChannelAxis + 1);
            int  nWidthOut  = colTop[0].shape(m_nChannelAxis + 2);
            Size szPad      = size_at(m_blobPad);
            Size szStride   = size_at(m_blobStride);

            // Specify workspace limit for kernels directly until we have a
            // planning strategy and a rewrite of Caffe's GPU memory management.
            // NOTE(review): the limit is 'cudnn_workspace_limit * 8' (default noted
            // as 1024 * 1024 * 8) - presumably the parameter is expressed in 8-byte
            // units; confirm against the ConvolutionParameter documentation.
            long lWorkspaceLimitBytes = m_param.convolution_param.cudnn_workspace_limit * 8;

            for (int i = 0; i < colBottom.Count; i++)
            {
                // Descriptor strides account for the full (un-grouped) channel count
                // so each group indexes into its slice of the same data blob.
                m_cuda.SetTensorDesc(m_rghBottomDesc[i], m_nNum, m_nChannels / m_nGroup, nHeight, nWidth, m_nChannels * nHeight * nWidth, nHeight * nWidth, nWidth, 1);
                m_cuda.SetTensorDesc(m_rghTopDesc[i], m_nNum, m_nNumOutput / m_nGroup, nHeightOut, nWidthOut, m_nNumOutput * nHeightOut * nWidthOut, nHeightOut * nWidthOut, nWidthOut, 1);
                m_cuda.SetConvolutionDesc(m_rghConvDesc[i], szPad.Height, szPad.Width, szStride.Height, szStride.Width);

                // NOTE: The native Caffe team has found that CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM is
                // buggy (in deconvolution).  Thus, if this algo was chosen (by CuDnn), we attempt to use winograd
                // instead.  If winograd is not supported, or the workspace is larger than the threshold, we
                // use implicit_gemm instead.
                CONV_FWD_ALGO algoFwdPreferred = CONV_FWD_ALGO.ALGO_WINOGRAD;
                // Get the algorithms and workspace sizes needed.
                CONV_FWD_ALGO        algoFwd       = (CONV_FWD_ALGO)0;
                CONV_BWD_FILTER_ALGO algoBwdFilter = (CONV_BWD_FILTER_ALGO)0;
                CONV_BWD_DATA_ALGO   algoBwdData   = (CONV_BWD_DATA_ALGO)0;
                long lWsSizeFwd       = 0;
                long lWsSizeBwdFilter = 0;
                long lWsSizeBwdData   = 0;

                // Note the top/bottom descriptors are passed in swapped order relative
                // to convolution - for deconvolution the 'forward' direction is the
                // transpose of a convolution.
                m_cuda.GetConvolutionInfo(m_rghCudnn[0], m_rghTopDesc[i], m_hFilterDesc, m_rghConvDesc[i], m_rghBottomDesc[i], lWorkspaceLimitBytes, out algoFwd, out lWsSizeFwd, out algoBwdFilter, out lWsSizeBwdFilter, out algoBwdData, out lWsSizeBwdData, algoFwdPreferred);
                m_rgfwdAlgo[i]                  = algoFwd;
                m_rglWorkspaceFwdSizes[i]       = lWsSizeFwd;
                m_rgbwdFilterAlgo[i]            = algoBwdFilter;
                m_rglWorkspaceBwdFilterSizes[i] = lWsSizeBwdFilter;
                m_rgbwdDataAlgo[i]              = algoBwdData;
                m_rglWorkspaceBwdDataSizes[i]   = lWsSizeBwdData;
            }

            // reduce over all workspace sizes to get a maximum to allocate / reallocate
            long lTotalWsFwd       = 0;
            long lTotalWsBwdFilter = 0;
            long lTotalWsBwdData   = 0;

            for (int i = 0; i < colBottom.Count; i++)
            {
                lTotalWsFwd       = Math.Max(lTotalWsFwd, m_rglWorkspaceFwdSizes[i]);
                lTotalWsBwdFilter = Math.Max(lTotalWsBwdFilter, m_rglWorkspaceBwdFilterSizes[i]);
                lTotalWsBwdData   = Math.Max(lTotalWsBwdData, m_rglWorkspaceBwdDataSizes[i]);
            }

            // Get max over all operations.
            long lMaxWorkspace = Math.Max(lTotalWsFwd, Math.Max(lTotalWsBwdFilter, lTotalWsBwdData));

            // Ensure all groups have enough workspace.
            long lTotalMaxWorkspace = lMaxWorkspace * m_nGroup * CUDNN_STREAMS_PER_GROUP;

            // Initialize the workspace data.
            WorkspaceArgs wsArgs = getWorkspace();

            // This is the total amount of storage needed over all groups + streams.
            if (lTotalMaxWorkspace > wsArgs.Size)
            {
                setWorkspace(lTotalMaxWorkspace);
            }

            // if we succeed in the allocation, set the offsets for the workspaces.
            for (int g = 0; g < (m_nGroup * CUDNN_STREAMS_PER_GROUP); g++)
            {
                m_rglWorkspaceFwdOffsets[g]       = g * lTotalWsFwd;
                m_rglWorkspaceBwdFilterOffsets[g] = g * lTotalWsBwdFilter;
                m_rglWorkspaceBwdDataOffsets[g]   = g * lTotalWsBwdData;
            }

            // Tensor descriptor for bias.
            if (m_bBiasTerm)
            {
                m_cuda.SetTensorDesc(m_hBiasDesc, 1, m_nNumOutput / m_nGroup, 1, 1);
            }
        }
Esempio n. 7
0
        /// <summary>
        /// Run the Backward computation using the Engine CUDNN mode as specified in the LayerParameter.
        /// </summary>
        /// <remarks>
        /// Computes (as enabled) the bias gradient, the filter gradient, and the bottom
        /// data gradient.  Each gradient kind runs on its own bank of per-group cuDNN
        /// handles (0*, 1*, 2* m_nGroup), with a per-stream synchronization after each
        /// gradient pass.
        /// </remarks>
        /// <param name="colTop">top output Blob vector (length 1).</param>
        /// <param name="rgbPropagateDown">see Layer::Backward</param>
        /// <param name="colBottom">bottom input Blob vector (length 1).</param>
        protected void backward_cudnn(BlobCollection <T> colTop, List <bool> rgbPropagateDown, BlobCollection <T> colBottom)
        {
            WorkspaceArgs wsArgs = getWorkspace();

            // Gradient w.r.t. bias.
            if (m_bBiasTerm && m_rgbParamPropagateDown[1])
            {
                long hBiasDiff = m_colBlobs[1].mutable_gpu_diff;

                for (int i = 0; i < colTop.Count; i++)
                {
                    // The top diff is only read here, so use the read accessor.
                    // (Previously fetched via mutable_gpu_diff, which marks the GPU
                    // copy as the current head and can skip a needed sync.)
                    long hTopDiff = colTop[i].gpu_diff;

                    // Backward through cuDNN in parallel over groups and gradients.
                    for (int g = 0; g < m_nGroup; g++)
                    {
                        m_cuda.ConvolutionBackwardBias(m_rghCudnn[0 * m_nGroup + g],
                                                       m_tOne, m_rghTopDesc[i], hTopDiff, m_nTopOffset * g,
                                                       m_tOne, m_hBiasDesc, hBiasDiff, m_nBiasOffset * g,
                                                       false);
                    }
                    // Synchronize the work across groups, each of which went into its own stream.
                    for (int g = 0; g < m_nGroup; g++)
                    {
                        m_cuda.SynchronizeStream(m_rghStream[g]);
                    }
                }
            }

            // Gradient w.r.t weights.
            if (m_rgbParamPropagateDown[0])
            {
                long hWeightDiff = m_colBlobs[0].mutable_gpu_diff;

                for (int i = 0; i < colTop.Count; i++)
                {
                    // Top diff is read-only here as well - use the read accessor.
                    long hTopDiff    = colTop[i].gpu_diff;
                    long hBottomData = colBottom[i].gpu_data;

                    // Backward through cuDNN in parallel over groups and gradients.
                    for (int g = 0; g < m_nGroup; g++)
                    {
                        m_cuda.ConvolutionBackwardFilter(m_rghCudnn[1 * m_nGroup + g],
                                                         m_tOne,
                                                         m_rghBottomDesc[i], hBottomData, m_nBottomOffset * g,
                                                         m_rghTopDesc[i], hTopDiff, m_nTopOffset * g,
                                                         m_rghConvDesc[i],
                                                         m_rgbwdFilterAlgo[i],
                                                         wsArgs.Data, (int)m_rglWorkspaceBwdFilterOffsets[1 * m_nGroup + g],
                                                         m_rglWorkspaceBwdFilterSizes[i],
                                                         m_tOne,
                                                         m_hFilterDesc, hWeightDiff, m_nWeightOffset * g,
                                                         false);
                    }
                    // Synchronize the work across groups, each of which went into its own stream.
                    for (int g = 0; g < m_nGroup; g++)
                    {
                        m_cuda.SynchronizeStream(m_rghStream[g]);
                    }
                }
            }

            // Gradient w.r.t. bottom data.
            long hWeight = m_colBlobs[0].gpu_data;

            for (int i = 0; i < colTop.Count; i++)
            {
                if (rgbPropagateDown[i])
                {
                    // Top diff is read-only; bottom diff is written.
                    long hTopDiff    = colTop[i].gpu_diff;
                    long hBottomDiff = colBottom[i].mutable_gpu_diff;

                    // Backward through cuDNN in parallel over groups and gradients.
                    for (int g = 0; g < m_nGroup; g++)
                    {
                        m_cuda.ConvolutionBackwardData(m_rghCudnn[2 * m_nGroup + g],
                                                       m_tOne,
                                                       m_hFilterDesc, hWeight, m_nWeightOffset * g,
                                                       m_rghTopDesc[i], hTopDiff, m_nTopOffset * g,
                                                       m_rghConvDesc[i],
                                                       m_rgbwdDataAlgo[i],
                                                       wsArgs.Data, (int)m_rglWorkspaceBwdDataOffsets[2 * m_nGroup + g],
                                                       m_rglWorkspaceBwdDataSizes[i],
                                                       m_tZero,
                                                       m_rghBottomDesc[i], hBottomDiff, m_nBottomOffset * g,
                                                       false);
                    }
                    // Synchronize the work across groups, each of which went into its own stream.
                    for (int g = 0; g < m_nGroup; g++)
                    {
                        m_cuda.SynchronizeStream(m_rghStream[g]);
                    }
                }
            }
        }
Esempio n. 8
0
        /// <summary>
        /// Reshape the bottom (input) and top (output) blobs.
        /// </summary>
        /// <remarks>
        /// cuDNN setup for the convolution layer: configures the per-bottom tensor and
        /// convolution descriptors, queries cuDNN for the algorithms and workspace
        /// sizes of each operation, sizes a single shared workspace large enough for
        /// all groups and streams, and computes per-group workspace offsets.
        /// </remarks>
        /// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>
        /// <param name="colTop">Specifies the collection of top (output) Blobs.</param>
        public override void Reshape(BlobCollection <T> colBottom, BlobCollection <T> colTop)
        {
            base.Reshape(colBottom, colTop);

            // Nothing further to do unless this layer actually runs on cuDNN.
            if (!m_param.convolution_param.useCudnn(m_nNumSpatialAxes))
            {
                return;
            }

            m_log.CHECK_EQ(2, m_nNumSpatialAxes, "cuDNN Convolution input must have 2 spatial axes (e.g., height and width).  Use 'engine: CAFFE' for general ND convolution.");

            // Per-group offsets into the flat bottom/top data.
            m_nBottomOffset = m_nBottomDim / m_nGroup;
            m_nTopOffset    = m_nTopDim / m_nGroup;

            int  nHeight    = colBottom[0].shape(m_nChannelAxis + 1);
            int  nWidth     = colBottom[0].shape(m_nChannelAxis + 2);
            int  nHeightOut = colTop[0].shape(m_nChannelAxis + 1);
            int  nWidthOut  = colTop[0].shape(m_nChannelAxis + 2);
            Size szPad      = size_at(m_blobPad);
            Size szStride   = size_at(m_blobStride);

            // Specify workspace limit for kernels directly until we have a
            // planning strategy and a rewrite of Caffe's GPU memory management.
            // NOTE(review): the limit is 'cudnn_workspace_limit * 8' (default noted
            // as 1024 * 1024 * 8) - presumably the parameter is expressed in 8-byte
            // units; confirm against the ConvolutionParameter documentation.
            long lWorkspaceLimitBytes = m_param.convolution_param.cudnn_workspace_limit * 8;

            // BUG Work Around
            // With cuDNN 7.0.5 and above we are seeing memory overwrite errors (from CUDA)
            //  when using more than 1 group and the workspace.
            //  * also confirmed in cuDNN 7.1.4 and CUDA 9.2 on driver 397.64
            if (m_nGroup > 1)
            {
                lWorkspaceLimitBytes = 0; // sets option to NO_WORKSPACE for Bwd Filter and Data
            }
            for (int i = 0; i < colBottom.Count; i++)
            {
                // Descriptor strides account for the full (un-grouped) channel count
                // so each group indexes into its slice of the same data blob.
                m_cuda.SetTensorDesc(m_rghBottomDesc[i], m_nNum, m_nChannels / m_nGroup, nHeight, nWidth, m_nChannels * nHeight * nWidth, nHeight * nWidth, nWidth, 1);
                m_cuda.SetTensorDesc(m_rghTopDesc[i], m_nNum, m_nNumOutput / m_nGroup, nHeightOut, nWidthOut, m_nNumOutput * m_nOutSpatialDim, m_nOutSpatialDim, nWidthOut, 1);
                m_cuda.SetConvolutionDesc(m_rghConvDesc[i], szPad.Height, szPad.Width, szStride.Height, szStride.Width);

                // Get the algorithms and workspace sizes needed.
                CONV_FWD_ALGO        algoFwd       = (CONV_FWD_ALGO)0;
                CONV_BWD_FILTER_ALGO algoBwdFilter = (CONV_BWD_FILTER_ALGO)0;
                CONV_BWD_DATA_ALGO   algoBwdData   = (CONV_BWD_DATA_ALGO)0;
                long lWsSizeFwd       = 0;
                long lWsSizeBwdFilter = 0;
                long lWsSizeBwdData   = 0;

                m_cuda.GetConvolutionInfo(m_rghCudnn[0], m_rghBottomDesc[i], m_hFilterDesc, m_rghConvDesc[i], m_rghTopDesc[i], lWorkspaceLimitBytes, out algoFwd, out lWsSizeFwd, out algoBwdFilter, out lWsSizeBwdFilter, out algoBwdData, out lWsSizeBwdData);
                m_rgfwdAlgo[i]                  = algoFwd;
                m_rglWorkspaceFwdSizes[i]       = lWsSizeFwd;
                m_rgbwdFilterAlgo[i]            = algoBwdFilter;
                m_rglWorkspaceBwdFilterSizes[i] = lWsSizeBwdFilter;
                m_rgbwdDataAlgo[i]              = algoBwdData;
                m_rglWorkspaceBwdDataSizes[i]   = lWsSizeBwdData;
            }

            // reduce over all workspace sizes to get a maximum to allocate / reallocate
            long lTotalWsFwd       = 0;
            long lTotalWsBwdFilter = 0;
            long lTotalWsBwdData   = 0;

            for (int i = 0; i < colBottom.Count; i++)
            {
                lTotalWsFwd       = Math.Max(lTotalWsFwd, m_rglWorkspaceFwdSizes[i]);
                lTotalWsBwdFilter = Math.Max(lTotalWsBwdFilter, m_rglWorkspaceBwdFilterSizes[i]);
                lTotalWsBwdData   = Math.Max(lTotalWsBwdData, m_rglWorkspaceBwdDataSizes[i]);
            }

            // Get max over all operations.
            long lMaxWorkspace = Math.Max(lTotalWsFwd, Math.Max(lTotalWsBwdFilter, lTotalWsBwdData));

            // Ensure all groups have enough workspace.
            long lTotalMaxWorkspace = lMaxWorkspace * m_nGroup * CUDNN_STREAMS_PER_GROUP;

            // Initialize the workspace data.
            WorkspaceArgs wsArgs = getWorkspace();

            // This is the total amount of storage needed over all groups + streams.
            if (lTotalMaxWorkspace > wsArgs.Size)
            {
                setWorkspace(lTotalMaxWorkspace);
            }

            // if we succeed in the allocation, set the offsets for the workspaces.
            for (int g = 0; g < (m_nGroup * CUDNN_STREAMS_PER_GROUP); g++)
            {
                m_rglWorkspaceFwdOffsets[g]       = g * lTotalWsFwd;
                m_rglWorkspaceBwdFilterOffsets[g] = g * lTotalWsBwdFilter;
                m_rglWorkspaceBwdDataOffsets[g]   = g * lTotalWsBwdData;
            }

            // Tensor descriptor for bias.
            if (m_bBiasTerm)
            {
                m_cuda.SetTensorDesc(m_hBiasDesc, 1, m_nNumOutput / m_nGroup, 1, 1);
            }
        }
Esempio n. 9
0
        /// <summary>
        /// Reshape the bottom (input) and top (output) blobs.
        /// </summary>
        /// <remarks>
        /// cuDNN setup for the convolution layer (ulong workspace-size variant):
        /// configures the per-bottom tensor and convolution descriptors - passing
        /// m_bUseHalfSize through to each descriptor (half-sized, e.g. FP16, data -
        /// TODO confirm exact semantics in CudaDnn) - then queries cuDNN for the
        /// algorithms and workspace sizes, sizes a single shared workspace large
        /// enough for all groups and streams, and computes per-group workspace offsets.
        /// </remarks>
        /// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>
        /// <param name="colTop">Specifies the collection of top (output) Blobs.</param>
        public override void Reshape(BlobCollection <T> colBottom, BlobCollection <T> colTop)
        {
            base.Reshape(colBottom, colTop);

            // Nothing further to do unless this layer actually runs on cuDNN.
            if (!m_param.convolution_param.useCudnn(m_nNumSpatialAxes))
            {
                return;
            }

            m_log.CHECK_EQ(2, m_nNumSpatialAxes, "cuDNN Convolution input must have 2 spatial axes (e.g., height and width).  Use 'engine: CAFFE' for general ND convolution.");

            // Per-group offsets into the flat bottom/top data.
            m_nBottomOffset = m_nBottomDim / m_nGroup;
            m_nTopOffset    = m_nTopDim / m_nGroup;

            int  nHeight    = colBottom[0].shape(m_nChannelAxis + 1);
            int  nWidth     = colBottom[0].shape(m_nChannelAxis + 2);
            int  nHeightOut = colTop[0].shape(m_nChannelAxis + 1);
            int  nWidthOut  = colTop[0].shape(m_nChannelAxis + 2);
            Size szPad      = size_at(m_blobPad);
            Size szStride   = size_at(m_blobStride);

            ulong lWorkspaceLimitBytes = getWorkspaceLimitInBytes();

            for (int i = 0; i < colBottom.Count; i++)
            {
                // Descriptor strides account for the full (un-grouped) channel count
                // so each group indexes into its slice of the same data blob.
                m_cuda.SetTensorDesc(m_rghBottomDesc[i], m_nNum, m_nChannels / m_nGroup, nHeight, nWidth, m_nChannels * nHeight * nWidth, nHeight * nWidth, nWidth, 1, m_bUseHalfSize);
                m_cuda.SetTensorDesc(m_rghTopDesc[i], m_nNum, m_nNumOutput / m_nGroup, nHeightOut, nWidthOut, m_nNumOutput * m_nOutSpatialDim, m_nOutSpatialDim, nWidthOut, 1, m_bUseHalfSize);
                m_cuda.SetConvolutionDesc(m_rghConvDesc[i], szPad.Height, szPad.Width, szStride.Height, szStride.Width, m_bUseHalfSize);

                // Get the algorithms and workspace sizes needed.
                CONV_FWD_ALGO        algoFwd       = (CONV_FWD_ALGO)0;
                CONV_BWD_FILTER_ALGO algoBwdFilter = (CONV_BWD_FILTER_ALGO)0;
                CONV_BWD_DATA_ALGO   algoBwdData   = (CONV_BWD_DATA_ALGO)0;
                ulong lWsSizeFwd       = 0;
                ulong lWsSizeBwdFilter = 0;
                ulong lWsSizeBwdData   = 0;

                m_cuda.GetConvolutionInfo(m_rghCudnn[0], m_rghBottomDesc[i], m_hFilterDesc, m_rghConvDesc[i], m_rghTopDesc[i], lWorkspaceLimitBytes, out algoFwd, out lWsSizeFwd, out algoBwdFilter, out lWsSizeBwdFilter, out algoBwdData, out lWsSizeBwdData);
                m_rgfwdAlgo[i]                  = algoFwd;
                m_rglWorkspaceFwdSizes[i]       = lWsSizeFwd;
                m_rgbwdFilterAlgo[i]            = algoBwdFilter;
                m_rglWorkspaceBwdFilterSizes[i] = lWsSizeBwdFilter;
                m_rgbwdDataAlgo[i]              = algoBwdData;
                m_rglWorkspaceBwdDataSizes[i]   = lWsSizeBwdData;
            }

            // reduce over all workspace sizes to get a maximum to allocate / reallocate
            ulong lTotalWsFwd       = 0;
            ulong lTotalWsBwdFilter = 0;
            ulong lTotalWsBwdData   = 0;

            for (int i = 0; i < colBottom.Count; i++)
            {
                lTotalWsFwd       = Math.Max(lTotalWsFwd, m_rglWorkspaceFwdSizes[i]);
                lTotalWsBwdFilter = Math.Max(lTotalWsBwdFilter, m_rglWorkspaceBwdFilterSizes[i]);
                lTotalWsBwdData   = Math.Max(lTotalWsBwdData, m_rglWorkspaceBwdDataSizes[i]);
            }

            // Get max over all operations.
            ulong lMaxWorkspace = Math.Max(lTotalWsFwd, Math.Max(lTotalWsBwdFilter, lTotalWsBwdData));

            // Ensure all groups have enough workspace.
            ulong lTotalMaxWorkspace = (ulong)lMaxWorkspace * (ulong)m_nGroup * (ulong)CUDNN_STREAMS_PER_GROUP;

            // Initialize the workspace data.
            WorkspaceArgs wsArgs = getWorkspace();

            // This is the total amount of storage needed over all groups + streams.
            if (lTotalMaxWorkspace > wsArgs.Size)
            {
                setWorkspace(lTotalMaxWorkspace);
            }

            // if we succeed in the allocation, set the offsets for the workspaces.
            for (int g = 0; g < (m_nGroup * CUDNN_STREAMS_PER_GROUP); g++)
            {
                m_rglWorkspaceFwdOffsets[g]       = (ulong)g * lTotalWsFwd;
                m_rglWorkspaceBwdFilterOffsets[g] = (ulong)g * lTotalWsBwdFilter;
                m_rglWorkspaceBwdDataOffsets[g]   = (ulong)g * lTotalWsBwdData;
            }

            // Tensor descriptor for bias.
            if (m_bBiasTerm)
            {
                m_cuda.SetTensorDesc(m_hBiasDesc, 1, m_nNumOutput / m_nGroup, 1, 1, m_bUseHalfSize);
            }
        }
Esempio n. 10
0
 /// <summary>
 /// Event handler that reports this layer's workspace to a requesting layer.
 /// </summary>
 /// <param name="sender">Specifies the event sender.</param>
 /// <param name="e">Receives this layer's workspace data handle and size.</param>
 private void layer_OnGetWorkspace(object sender, WorkspaceArgs e)
 {
     // Hand back the current workspace allocation (size and GPU data handle).
     e.Size = m_lWorkspaceSize;
     e.Data = m_hWorkspaceData;
 }