Beispiel #1
0
        /// <summary>
        /// Reshape the bottom (input) and top (output) blobs and, when the cuDNN engine is used,
        /// refresh the cuDNN descriptors, algorithm selections and workspace layout.
        /// </summary>
        /// <remarks>
        /// The base class Reshape always runs first.  When <c>useCudnn</c> reports that cuDNN is not used,
        /// the method returns without touching any cuDNN state.
        /// </remarks>
        /// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>
        /// <param name="colTop">Specifies the collection of top (output) Blobs.</param>
        public override void Reshape(BlobCollection <T> colBottom, BlobCollection <T> colTop)
        {
            base.Reshape(colBottom, colTop);

            if (!m_param.convolution_param.useCudnn(m_nNumSpatialAxes))
            {
                return;
            }

            m_log.CHECK_EQ(2, m_nNumSpatialAxes, "cuDNN Convolution input must have 2 spatial axes (e.g., height and width).  Use 'engine: CAFFE' for general ND convolution.");

            // The bottom/top dims are split evenly across the groups.
            m_nBottomOffset = m_nBottomDim / m_nGroup;
            m_nTopOffset    = m_nTopDim / m_nGroup;

            // The spatial extents are read from the first bottom/top blob only and are
            // applied to every descriptor configured below.
            int  nHeight    = colBottom[0].shape(m_nChannelAxis + 1);
            int  nWidth     = colBottom[0].shape(m_nChannelAxis + 2);
            int  nHeightOut = colTop[0].shape(m_nChannelAxis + 1);
            int  nWidthOut  = colTop[0].shape(m_nChannelAxis + 2);
            Size szPad      = size_at(m_blobPad);
            Size szStride   = size_at(m_blobStride);

            // Specify workspace limit for kernels directly until we have a
            // planning strategy and a rewrite of Caffe's GPU memory management.
            // default = 1024 * 1024 * 8;
            //
            // Widen to 64 bits BEFORE scaling: if cudnn_workspace_limit is a 32-bit integer,
            // multiplying first would wrap for large limits and hand cuDNN a bogus
            // (possibly negative) byte count.
            long lWorkspaceLimitBytes = (long)m_param.convolution_param.cudnn_workspace_limit * 8;

            // BUG Work Around
            // With cuDNN 7.0.5 and above we are seeing memory overwrite errors (from CUDA)
            //  when using more than 1 group and the workspace.
            //  * also confirmed in cuDNN 7.1.4 and CUDA 9.2 on driver 397.64
            if (m_nGroup > 1)
            {
                lWorkspaceLimitBytes = 0; // sets option to NO_WORKSPACE for Bwd Filter and Data
            }

            // Largest workspace requested by any bottom/top pair, tracked per operation.
            long lTotalWsFwd       = 0;
            long lTotalWsBwdFilter = 0;
            long lTotalWsBwdData   = 0;

            for (int i = 0; i < colBottom.Count; i++)
            {
                m_cuda.SetTensorDesc(m_rghBottomDesc[i], m_nNum, m_nChannels / m_nGroup, nHeight, nWidth, m_nChannels * nHeight * nWidth, nHeight * nWidth, nWidth, 1);
                m_cuda.SetTensorDesc(m_rghTopDesc[i], m_nNum, m_nNumOutput / m_nGroup, nHeightOut, nWidthOut, m_nNumOutput * m_nOutSpatialDim, m_nOutSpatialDim, nWidthOut, 1);
                m_cuda.SetConvolutionDesc(m_rghConvDesc[i], szPad.Height, szPad.Width, szStride.Height, szStride.Width);

                // Get the algorithms and workspace sizes needed.
                CONV_FWD_ALGO        algoFwd;
                CONV_BWD_FILTER_ALGO algoBwdFilter;
                CONV_BWD_DATA_ALGO   algoBwdData;
                long lWsSizeFwd;
                long lWsSizeBwdFilter;
                long lWsSizeBwdData;

                m_cuda.GetConvolutionInfo(m_rghCudnn[0], m_rghBottomDesc[i], m_hFilterDesc, m_rghConvDesc[i], m_rghTopDesc[i], lWorkspaceLimitBytes, out algoFwd, out lWsSizeFwd, out algoBwdFilter, out lWsSizeBwdFilter, out algoBwdData, out lWsSizeBwdData);
                m_rgfwdAlgo[i]                  = algoFwd;
                m_rglWorkspaceFwdSizes[i]       = lWsSizeFwd;
                m_rgbwdFilterAlgo[i]            = algoBwdFilter;
                m_rglWorkspaceBwdFilterSizes[i] = lWsSizeBwdFilter;
                m_rgbwdDataAlgo[i]              = algoBwdData;
                m_rglWorkspaceBwdDataSizes[i]   = lWsSizeBwdData;

                // Reduce over all workspace sizes to get a maximum to allocate / reallocate.
                lTotalWsFwd       = Math.Max(lTotalWsFwd, m_rglWorkspaceFwdSizes[i]);
                lTotalWsBwdFilter = Math.Max(lTotalWsBwdFilter, m_rglWorkspaceBwdFilterSizes[i]);
                lTotalWsBwdData   = Math.Max(lTotalWsBwdData, m_rglWorkspaceBwdDataSizes[i]);
            }

            // Get max over all operations.
            long lMaxWorkspace = Math.Max(lTotalWsFwd, Math.Max(lTotalWsBwdFilter, lTotalWsBwdData));

            // Ensure all groups have enough workspace.
            long lTotalMaxWorkspace = lMaxWorkspace * m_nGroup * CUDNN_STREAMS_PER_GROUP;

            // Fetch the current workspace.
            WorkspaceArgs wsArgs = getWorkspace();

            // This is the total amount of storage needed over all groups + streams;
            // the workspace is only ever grown here, never shrunk.
            if (lTotalMaxWorkspace > wsArgs.Size)
            {
                setWorkspace(lTotalMaxWorkspace);
            }

            // Lay out one workspace slot per group/stream: slot g starts at
            // g times the per-operation maximum computed above.
            for (int g = 0; g < (m_nGroup * CUDNN_STREAMS_PER_GROUP); g++)
            {
                m_rglWorkspaceFwdOffsets[g]       = g * lTotalWsFwd;
                m_rglWorkspaceBwdFilterOffsets[g] = g * lTotalWsBwdFilter;
                m_rglWorkspaceBwdDataOffsets[g]   = g * lTotalWsBwdData;
            }

            // Tensor descriptor for bias.
            if (m_bBiasTerm)
            {
                m_cuda.SetTensorDesc(m_hBiasDesc, 1, m_nNumOutput / m_nGroup, 1, 1);
            }
        }
        /// <summary>
        /// Reshape the bottom (input) and top (output) blobs and, when the cuDNN engine is used,
        /// refresh the cuDNN descriptors, algorithm selections and workspace layout for deconvolution.
        /// </summary>
        /// <remarks>
        /// The base class Reshape always runs first.  When <c>useCudnn</c> reports that cuDNN is not used,
        /// the method returns without touching any cuDNN state.
        /// </remarks>
        /// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>
        /// <param name="colTop">Specifies the collection of top (output) Blobs.</param>
        public override void Reshape(BlobCollection <T> colBottom, BlobCollection <T> colTop)
        {
            base.Reshape(colBottom, colTop);

            if (!m_param.convolution_param.useCudnn(m_nNumSpatialAxes))
            {
                return;
            }

            m_log.CHECK_EQ(2, m_nNumSpatialAxes, "cuDNN Deconvolution input must have 2 spatial axes (e.g., height and width).  Use 'engine: CAFFE' for general ND deconvolution.");

            // The bottom/top dims are split evenly across the groups.
            m_nBottomOffset = m_nBottomDim / m_nGroup;
            m_nTopOffset    = m_nTopDim / m_nGroup;

            // The spatial extents are read from the first bottom/top blob only and are
            // applied to every descriptor configured below.
            int  nHeight    = colBottom[0].shape(m_nChannelAxis + 1);
            int  nWidth     = colBottom[0].shape(m_nChannelAxis + 2);
            int  nHeightOut = colTop[0].shape(m_nChannelAxis + 1);
            int  nWidthOut  = colTop[0].shape(m_nChannelAxis + 2);
            Size szPad      = size_at(m_blobPad);
            Size szStride   = size_at(m_blobStride);

            // Specify workspace limit for kernels directly until we have a
            // planning strategy and a rewrite of Caffe's GPU memory management.
            // default = 1024 * 1024 * 8;
            //
            // Widen to 64 bits BEFORE scaling: if cudnn_workspace_limit is a 32-bit integer,
            // multiplying first would wrap for large limits and hand cuDNN a bogus
            // (possibly negative) byte count.
            long lWorkspaceLimitBytes = (long)m_param.convolution_param.cudnn_workspace_limit * 8;

            // Largest workspace requested by any bottom/top pair, tracked per operation.
            long lTotalWsFwd       = 0;
            long lTotalWsBwdFilter = 0;
            long lTotalWsBwdData   = 0;

            for (int i = 0; i < colBottom.Count; i++)
            {
                m_cuda.SetTensorDesc(m_rghBottomDesc[i], m_nNum, m_nChannels / m_nGroup, nHeight, nWidth, m_nChannels * nHeight * nWidth, nHeight * nWidth, nWidth, 1);
                m_cuda.SetTensorDesc(m_rghTopDesc[i], m_nNum, m_nNumOutput / m_nGroup, nHeightOut, nWidthOut, m_nNumOutput * nHeightOut * nWidthOut, nHeightOut * nWidthOut, nWidthOut, 1);
                m_cuda.SetConvolutionDesc(m_rghConvDesc[i], szPad.Height, szPad.Width, szStride.Height, szStride.Width);

                // NOTE: The native Caffe team has found that CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM is
                // buggy (in deconvolution).  Thus, if this algo was chosen (by CuDnn), we attempt to use winograd
                // instead.  If winograd is not supported, or the workspace is larger than the threshold, we
                // use implicit_gemm instead.
                CONV_FWD_ALGO algoFwdPreferred = CONV_FWD_ALGO.ALGO_WINOGRAD;

                // Get the algorithms and workspace sizes needed.
                CONV_FWD_ALGO        algoFwd;
                CONV_BWD_FILTER_ALGO algoBwdFilter;
                CONV_BWD_DATA_ALGO   algoBwdData;
                long lWsSizeFwd;
                long lWsSizeBwdFilter;
                long lWsSizeBwdData;

                // The TOP descriptor is passed ahead of the filter and the BOTTOM descriptor after the
                // convolution descriptor - presumably because deconvolution runs the convolution in the
                // reverse direction; confirm against the GetConvolutionInfo signature.
                m_cuda.GetConvolutionInfo(m_rghCudnn[0], m_rghTopDesc[i], m_hFilterDesc, m_rghConvDesc[i], m_rghBottomDesc[i], lWorkspaceLimitBytes, out algoFwd, out lWsSizeFwd, out algoBwdFilter, out lWsSizeBwdFilter, out algoBwdData, out lWsSizeBwdData, algoFwdPreferred);
                m_rgfwdAlgo[i]                  = algoFwd;
                m_rglWorkspaceFwdSizes[i]       = lWsSizeFwd;
                m_rgbwdFilterAlgo[i]            = algoBwdFilter;
                m_rglWorkspaceBwdFilterSizes[i] = lWsSizeBwdFilter;
                m_rgbwdDataAlgo[i]              = algoBwdData;
                m_rglWorkspaceBwdDataSizes[i]   = lWsSizeBwdData;

                // Reduce over all workspace sizes to get a maximum to allocate / reallocate.
                lTotalWsFwd       = Math.Max(lTotalWsFwd, m_rglWorkspaceFwdSizes[i]);
                lTotalWsBwdFilter = Math.Max(lTotalWsBwdFilter, m_rglWorkspaceBwdFilterSizes[i]);
                lTotalWsBwdData   = Math.Max(lTotalWsBwdData, m_rglWorkspaceBwdDataSizes[i]);
            }

            // Get max over all operations.
            long lMaxWorkspace = Math.Max(lTotalWsFwd, Math.Max(lTotalWsBwdFilter, lTotalWsBwdData));

            // Ensure all groups have enough workspace.
            long lTotalMaxWorkspace = lMaxWorkspace * m_nGroup * CUDNN_STREAMS_PER_GROUP;

            // Fetch the current workspace.
            WorkspaceArgs wsArgs = getWorkspace();

            // This is the total amount of storage needed over all groups + streams;
            // the workspace is only ever grown here, never shrunk.
            if (lTotalMaxWorkspace > wsArgs.Size)
            {
                setWorkspace(lTotalMaxWorkspace);
            }

            // Lay out one workspace slot per group/stream: slot g starts at
            // g times the per-operation maximum computed above.
            for (int g = 0; g < (m_nGroup * CUDNN_STREAMS_PER_GROUP); g++)
            {
                m_rglWorkspaceFwdOffsets[g]       = g * lTotalWsFwd;
                m_rglWorkspaceBwdFilterOffsets[g] = g * lTotalWsBwdFilter;
                m_rglWorkspaceBwdDataOffsets[g]   = g * lTotalWsBwdData;
            }

            // Tensor descriptor for bias.
            if (m_bBiasTerm)
            {
                m_cuda.SetTensorDesc(m_hBiasDesc, 1, m_nNumOutput / m_nGroup, 1, 1);
            }
        }
Beispiel #3
0
        /// <summary>
        /// Reshapes the layer for the given bottom (input) and top (output) blobs and, when the cuDNN
        /// engine is used, rebuilds the cuDNN descriptors, re-queries the convolution algorithms and
        /// sizes the workspace accordingly.
        /// </summary>
        /// <remarks>
        /// The base class Reshape always runs first; nothing further happens when <c>useCudnn</c>
        /// reports that cuDNN is not used.
        /// </remarks>
        /// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>
        /// <param name="colTop">Specifies the collection of top (output) Blobs.</param>
        public override void Reshape(BlobCollection <T> colBottom, BlobCollection <T> colTop)
        {
            base.Reshape(colBottom, colTop);

            bool bCudnnActive = m_param.convolution_param.useCudnn(m_nNumSpatialAxes);
            if (!bCudnnActive)
            {
                return;
            }

            m_log.CHECK_EQ(2, m_nNumSpatialAxes, "cuDNN Convolution input must have 2 spatial axes (e.g., height and width).  Use 'engine: CAFFE' for general ND convolution.");

            // Split the bottom/top dims evenly across the groups.
            m_nBottomOffset = m_nBottomDim / m_nGroup;
            m_nTopOffset = m_nTopDim / m_nGroup;

            // Spatial extents come from the first bottom/top blob and are reused for every descriptor.
            int nInH = colBottom[0].shape(m_nChannelAxis + 1);
            int nInW = colBottom[0].shape(m_nChannelAxis + 2);
            int nOutH = colTop[0].shape(m_nChannelAxis + 1);
            int nOutW = colTop[0].shape(m_nChannelAxis + 2);
            Size szPadding = size_at(m_blobPad);
            Size szStep = size_at(m_blobStride);
            int nInPlane = nInH * nInW;

            ulong ulLimitBytes = getWorkspaceLimitInBytes();

            // Running maxima of the workspace each operation asks for, across all bottom/top pairs.
            ulong ulPeakFwd = 0;
            ulong ulPeakBwdFilter = 0;
            ulong ulPeakBwdData = 0;

            int nPairs = colBottom.Count;
            for (int idx = 0; idx < nPairs; idx++)
            {
                // Describe the per-group input and output tensors and the convolution itself.
                m_cuda.SetTensorDesc(m_rghBottomDesc[idx], m_nNum, m_nChannels / m_nGroup, nInH, nInW, m_nChannels * nInH * nInW, nInPlane, nInW, 1, m_bUseHalfSize);
                m_cuda.SetTensorDesc(m_rghTopDesc[idx], m_nNum, m_nNumOutput / m_nGroup, nOutH, nOutW, m_nNumOutput * m_nOutSpatialDim, m_nOutSpatialDim, nOutW, 1, m_bUseHalfSize);
                m_cuda.SetConvolutionDesc(m_rghConvDesc[idx], szPadding.Height, szPadding.Width, szStep.Height, szStep.Width, m_bUseHalfSize);

                // Ask for the algorithm choices and the workspace each one requires.
                CONV_FWD_ALGO fwdChoice;
                CONV_BWD_FILTER_ALGO bwdFilterChoice;
                CONV_BWD_DATA_ALGO bwdDataChoice;
                ulong ulNeedFwd;
                ulong ulNeedBwdFilter;
                ulong ulNeedBwdData;

                m_cuda.GetConvolutionInfo(m_rghCudnn[0], m_rghBottomDesc[idx], m_hFilterDesc, m_rghConvDesc[idx], m_rghTopDesc[idx], ulLimitBytes, out fwdChoice, out ulNeedFwd, out bwdFilterChoice, out ulNeedBwdFilter, out bwdDataChoice, out ulNeedBwdData);

                m_rgfwdAlgo[idx] = fwdChoice;
                m_rglWorkspaceFwdSizes[idx] = ulNeedFwd;
                m_rgbwdFilterAlgo[idx] = bwdFilterChoice;
                m_rglWorkspaceBwdFilterSizes[idx] = ulNeedBwdFilter;
                m_rgbwdDataAlgo[idx] = bwdDataChoice;
                m_rglWorkspaceBwdDataSizes[idx] = ulNeedBwdData;

                // Fold the stored sizes into the running maxima.
                ulPeakFwd = Math.Max(ulPeakFwd, m_rglWorkspaceFwdSizes[idx]);
                ulPeakBwdFilter = Math.Max(ulPeakBwdFilter, m_rglWorkspaceBwdFilterSizes[idx]);
                ulPeakBwdData = Math.Max(ulPeakBwdData, m_rglWorkspaceBwdDataSizes[idx]);
            }

            // Largest single requirement over the three operations ...
            ulong ulPeakAny = Math.Max(ulPeakFwd, Math.Max(ulPeakBwdFilter, ulPeakBwdData));

            // ... replicated for every group and stream.
            ulong ulRequired = (ulong)ulPeakAny * (ulong)m_nGroup * (ulong)CUDNN_STREAMS_PER_GROUP;

            // Grow the workspace only when the current one is too small.
            WorkspaceArgs wsCurrent = getWorkspace();
            if (ulRequired > wsCurrent.Size)
            {
                setWorkspace(ulRequired);
            }

            // One slot per group/stream; slot s starts at s times the per-operation maximum.
            int nSlots = m_nGroup * CUDNN_STREAMS_PER_GROUP;
            for (int s = 0; s < nSlots; s++)
            {
                m_rglWorkspaceFwdOffsets[s] = (ulong)s * ulPeakFwd;
                m_rglWorkspaceBwdFilterOffsets[s] = (ulong)s * ulPeakBwdFilter;
                m_rglWorkspaceBwdDataOffsets[s] = (ulong)s * ulPeakBwdData;
            }

            // The bias descriptor is only configured when the layer has a bias term.
            if (m_bBiasTerm)
            {
                m_cuda.SetTensorDesc(m_hBiasDesc, 1, m_nNumOutput / m_nGroup, 1, 1, m_bUseHalfSize);
            }
        }