/// <summary>
/// Reshape the bottom (input) and top (output) blobs.
/// </summary>
/// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>
/// <param name="colTop">Specifies the collection of top (output) Blobs.</param>
public override void Reshape(BlobCollection<T> colBottom, BlobCollection<T> colTop)
{
    base.Reshape(colBottom, colTop);

    if (!m_param.convolution_param.useCudnn(m_nNumSpatialAxes))
    {
        return;
    }

    m_log.CHECK_EQ(2, m_nNumSpatialAxes, "cuDNN Convolution input must have 2 spatial axes (e.g., height and width). Use 'engine: CAFFE' for general ND convolution.");

    m_nBottomOffset = m_nBottomDim / m_nGroup;
    m_nTopOffset = m_nTopDim / m_nGroup;

    int nHeight = colBottom[0].shape(m_nChannelAxis + 1);
    int nWidth = colBottom[0].shape(m_nChannelAxis + 2);
    int nHeightOut = colTop[0].shape(m_nChannelAxis + 1);
    int nWidthOut = colTop[0].shape(m_nChannelAxis + 2);

    Size szPad = size_at(m_blobPad);
    Size szStride = size_at(m_blobStride);

    // Specify workspace limit for kernels directly until we have a
    // planning strategy and a rewrite of Caffe's GPU memory management.
    // default = 1024 * 1024 * 8;
    long lWorkspaceLimitBytes = m_param.convolution_param.cudnn_workspace_limit * 8;

    // BUG Work Around
    // With cuDNN 7.0.5 and above we are seeing memory overwrite errors (from CUDA)
    // when using more than 1 group and the workspace.
    // * also confirmed in cuDNN 7.1.4 and CUDA 9.2 on driver 397.64
    if (m_nGroup > 1)
    {
        lWorkspaceLimitBytes = 0; // sets option to NO_WORKSPACE for Bwd Filter and Data
    }

    for (int i = 0; i < colBottom.Count; i++)
    {
        m_cuda.SetTensorDesc(m_rghBottomDesc[i], m_nNum, m_nChannels / m_nGroup, nHeight, nWidth, m_nChannels * nHeight * nWidth, nHeight * nWidth, nWidth, 1);
        m_cuda.SetTensorDesc(m_rghTopDesc[i], m_nNum, m_nNumOutput / m_nGroup, nHeightOut, nWidthOut, m_nNumOutput * m_nOutSpatialDim, m_nOutSpatialDim, nWidthOut, 1);
        m_cuda.SetConvolutionDesc(m_rghConvDesc[i], szPad.Height, szPad.Width, szStride.Height, szStride.Width);

        // Get the algorithms and workspace sizes needed.
        CONV_FWD_ALGO algoFwd = (CONV_FWD_ALGO)0;
        CONV_BWD_FILTER_ALGO algoBwdFilter = (CONV_BWD_FILTER_ALGO)0;
        CONV_BWD_DATA_ALGO algoBwdData = (CONV_BWD_DATA_ALGO)0;
        long lWsSizeFwd = 0;
        long lWsSizeBwdFilter = 0;
        long lWsSizeBwdData = 0;

        m_cuda.GetConvolutionInfo(m_rghCudnn[0], m_rghBottomDesc[i], m_hFilterDesc, m_rghConvDesc[i], m_rghTopDesc[i], lWorkspaceLimitBytes, out algoFwd, out lWsSizeFwd, out algoBwdFilter, out lWsSizeBwdFilter, out algoBwdData, out lWsSizeBwdData);

        m_rgfwdAlgo[i] = algoFwd;
        m_rglWorkspaceFwdSizes[i] = lWsSizeFwd;
        m_rgbwdFilterAlgo[i] = algoBwdFilter;
        m_rglWorkspaceBwdFilterSizes[i] = lWsSizeBwdFilter;
        m_rgbwdDataAlgo[i] = algoBwdData;
        m_rglWorkspaceBwdDataSizes[i] = lWsSizeBwdData;
    }

    // Reduce over all workspace sizes to get a maximum to allocate / reallocate.
    long lTotalWsFwd = 0;
    long lTotalWsBwdFilter = 0;
    long lTotalWsBwdData = 0;

    for (int i = 0; i < colBottom.Count; i++)
    {
        lTotalWsFwd = Math.Max(lTotalWsFwd, m_rglWorkspaceFwdSizes[i]);
        lTotalWsBwdFilter = Math.Max(lTotalWsBwdFilter, m_rglWorkspaceBwdFilterSizes[i]);
        lTotalWsBwdData = Math.Max(lTotalWsBwdData, m_rglWorkspaceBwdDataSizes[i]);
    }

    // Get the max over all operations.
    long lMaxWorkspace = Math.Max(lTotalWsFwd, Math.Max(lTotalWsBwdFilter, lTotalWsBwdData));

    // Ensure all groups have enough workspace.
    long lTotalMaxWorkspace = lMaxWorkspace * m_nGroup * CUDNN_STREAMS_PER_GROUP;

    // Initialize the workspace data.
    WorkspaceArgs wsArgs = getWorkspace();

    // This is the total amount of storage needed over all groups + streams.
    if (lTotalMaxWorkspace > wsArgs.Size)
    {
        setWorkspace(lTotalMaxWorkspace);
    }

    // If we succeed in the allocation, set the offsets for the workspaces.
    for (int g = 0; g < (m_nGroup * CUDNN_STREAMS_PER_GROUP); g++)
    {
        m_rglWorkspaceFwdOffsets[g] = g * lTotalWsFwd;
        m_rglWorkspaceBwdFilterOffsets[g] = g * lTotalWsBwdFilter;
        m_rglWorkspaceBwdDataOffsets[g] = g * lTotalWsBwdData;
    }

    // Tensor descriptor for bias.
    if (m_bBiasTerm)
    {
        m_cuda.SetTensorDesc(m_hBiasDesc, 1, m_nNumOutput / m_nGroup, 1, 1);
    }
}
/// <summary>
/// Reshape the bottom (input) and top (output) blobs.
/// </summary>
/// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>
/// <param name="colTop">Specifies the collection of top (output) Blobs.</param>
public override void Reshape(BlobCollection<T> colBottom, BlobCollection<T> colTop)
{
    base.Reshape(colBottom, colTop);

    if (!m_param.convolution_param.useCudnn(m_nNumSpatialAxes))
    {
        return;
    }

    m_log.CHECK_EQ(2, m_nNumSpatialAxes, "cuDNN Deconvolution input must have 2 spatial axes (e.g., height and width). Use 'engine: CAFFE' for general ND deconvolution.");

    m_nBottomOffset = m_nBottomDim / m_nGroup;
    m_nTopOffset = m_nTopDim / m_nGroup;

    int nHeight = colBottom[0].shape(m_nChannelAxis + 1);
    int nWidth = colBottom[0].shape(m_nChannelAxis + 2);
    int nHeightOut = colTop[0].shape(m_nChannelAxis + 1);
    int nWidthOut = colTop[0].shape(m_nChannelAxis + 2);

    Size szPad = size_at(m_blobPad);
    Size szStride = size_at(m_blobStride);

    // Specify workspace limit for kernels directly until we have a
    // planning strategy and a rewrite of Caffe's GPU memory management.
    // default = 1024 * 1024 * 8;
    long lWorkspaceLimitBytes = m_param.convolution_param.cudnn_workspace_limit * 8;

    for (int i = 0; i < colBottom.Count; i++)
    {
        m_cuda.SetTensorDesc(m_rghBottomDesc[i], m_nNum, m_nChannels / m_nGroup, nHeight, nWidth, m_nChannels * nHeight * nWidth, nHeight * nWidth, nWidth, 1);
        m_cuda.SetTensorDesc(m_rghTopDesc[i], m_nNum, m_nNumOutput / m_nGroup, nHeightOut, nWidthOut, m_nNumOutput * nHeightOut * nWidthOut, nHeightOut * nWidthOut, nWidthOut, 1);
        m_cuda.SetConvolutionDesc(m_rghConvDesc[i], szPad.Height, szPad.Width, szStride.Height, szStride.Width);

        // NOTE: The native Caffe team has found that CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM is
        // buggy (in deconvolution). Thus, if this algo was chosen (by cuDNN), we attempt to use winograd
        // instead. If winograd is not supported, or the workspace is larger than the threshold, we
        // use implicit_gemm instead.
        CONV_FWD_ALGO algoFwdPreferred = CONV_FWD_ALGO.ALGO_WINOGRAD;

        // Get the algorithms and workspace sizes needed.
        CONV_FWD_ALGO algoFwd = (CONV_FWD_ALGO)0;
        CONV_BWD_FILTER_ALGO algoBwdFilter = (CONV_BWD_FILTER_ALGO)0;
        CONV_BWD_DATA_ALGO algoBwdData = (CONV_BWD_DATA_ALGO)0;
        long lWsSizeFwd = 0;
        long lWsSizeBwdFilter = 0;
        long lWsSizeBwdData = 0;

        // Note: the top and bottom descriptors are passed in swapped order relative to the
        // convolution layer, for deconvolution runs the convolution in the reverse direction.
        m_cuda.GetConvolutionInfo(m_rghCudnn[0], m_rghTopDesc[i], m_hFilterDesc, m_rghConvDesc[i], m_rghBottomDesc[i], lWorkspaceLimitBytes, out algoFwd, out lWsSizeFwd, out algoBwdFilter, out lWsSizeBwdFilter, out algoBwdData, out lWsSizeBwdData, algoFwdPreferred);

        m_rgfwdAlgo[i] = algoFwd;
        m_rglWorkspaceFwdSizes[i] = lWsSizeFwd;
        m_rgbwdFilterAlgo[i] = algoBwdFilter;
        m_rglWorkspaceBwdFilterSizes[i] = lWsSizeBwdFilter;
        m_rgbwdDataAlgo[i] = algoBwdData;
        m_rglWorkspaceBwdDataSizes[i] = lWsSizeBwdData;
    }

    // Reduce over all workspace sizes to get a maximum to allocate / reallocate.
    long lTotalWsFwd = 0;
    long lTotalWsBwdFilter = 0;
    long lTotalWsBwdData = 0;

    for (int i = 0; i < colBottom.Count; i++)
    {
        lTotalWsFwd = Math.Max(lTotalWsFwd, m_rglWorkspaceFwdSizes[i]);
        lTotalWsBwdFilter = Math.Max(lTotalWsBwdFilter, m_rglWorkspaceBwdFilterSizes[i]);
        lTotalWsBwdData = Math.Max(lTotalWsBwdData, m_rglWorkspaceBwdDataSizes[i]);
    }

    // Get the max over all operations.
    long lMaxWorkspace = Math.Max(lTotalWsFwd, Math.Max(lTotalWsBwdFilter, lTotalWsBwdData));

    // Ensure all groups have enough workspace.
    long lTotalMaxWorkspace = lMaxWorkspace * m_nGroup * CUDNN_STREAMS_PER_GROUP;

    // Initialize the workspace data.
    WorkspaceArgs wsArgs = getWorkspace();

    // This is the total amount of storage needed over all groups + streams.
    if (lTotalMaxWorkspace > wsArgs.Size)
    {
        setWorkspace(lTotalMaxWorkspace);
    }

    // If we succeed in the allocation, set the offsets for the workspaces.
    for (int g = 0; g < (m_nGroup * CUDNN_STREAMS_PER_GROUP); g++)
    {
        m_rglWorkspaceFwdOffsets[g] = g * lTotalWsFwd;
        m_rglWorkspaceBwdFilterOffsets[g] = g * lTotalWsBwdFilter;
        m_rglWorkspaceBwdDataOffsets[g] = g * lTotalWsBwdData;
    }

    // Tensor descriptor for bias.
    if (m_bBiasTerm)
    {
        m_cuda.SetTensorDesc(m_hBiasDesc, 1, m_nNumOutput / m_nGroup, 1, 1);
    }
}
/// <summary>
/// Reshape the bottom (input) and top (output) blobs.
/// </summary>
/// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>
/// <param name="colTop">Specifies the collection of top (output) Blobs.</param>
public override void Reshape(BlobCollection<T> colBottom, BlobCollection<T> colTop)
{
    base.Reshape(colBottom, colTop);

    if (!m_param.convolution_param.useCudnn(m_nNumSpatialAxes))
    {
        return;
    }

    m_log.CHECK_EQ(2, m_nNumSpatialAxes, "cuDNN Convolution input must have 2 spatial axes (e.g., height and width). Use 'engine: CAFFE' for general ND convolution.");

    m_nBottomOffset = m_nBottomDim / m_nGroup;
    m_nTopOffset = m_nTopDim / m_nGroup;

    int nHeight = colBottom[0].shape(m_nChannelAxis + 1);
    int nWidth = colBottom[0].shape(m_nChannelAxis + 2);
    int nHeightOut = colTop[0].shape(m_nChannelAxis + 1);
    int nWidthOut = colTop[0].shape(m_nChannelAxis + 2);

    Size szPad = size_at(m_blobPad);
    Size szStride = size_at(m_blobStride);

    ulong lWorkspaceLimitBytes = getWorkspaceLimitInBytes();

    for (int i = 0; i < colBottom.Count; i++)
    {
        // m_bUseHalfSize selects half-precision (FP16) tensor and convolution descriptors.
        m_cuda.SetTensorDesc(m_rghBottomDesc[i], m_nNum, m_nChannels / m_nGroup, nHeight, nWidth, m_nChannels * nHeight * nWidth, nHeight * nWidth, nWidth, 1, m_bUseHalfSize);
        m_cuda.SetTensorDesc(m_rghTopDesc[i], m_nNum, m_nNumOutput / m_nGroup, nHeightOut, nWidthOut, m_nNumOutput * m_nOutSpatialDim, m_nOutSpatialDim, nWidthOut, 1, m_bUseHalfSize);
        m_cuda.SetConvolutionDesc(m_rghConvDesc[i], szPad.Height, szPad.Width, szStride.Height, szStride.Width, m_bUseHalfSize);

        // Get the algorithms and workspace sizes needed.
        CONV_FWD_ALGO algoFwd = (CONV_FWD_ALGO)0;
        CONV_BWD_FILTER_ALGO algoBwdFilter = (CONV_BWD_FILTER_ALGO)0;
        CONV_BWD_DATA_ALGO algoBwdData = (CONV_BWD_DATA_ALGO)0;
        ulong lWsSizeFwd = 0;
        ulong lWsSizeBwdFilter = 0;
        ulong lWsSizeBwdData = 0;

        m_cuda.GetConvolutionInfo(m_rghCudnn[0], m_rghBottomDesc[i], m_hFilterDesc, m_rghConvDesc[i], m_rghTopDesc[i], lWorkspaceLimitBytes, out algoFwd, out lWsSizeFwd, out algoBwdFilter, out lWsSizeBwdFilter, out algoBwdData, out lWsSizeBwdData);

        m_rgfwdAlgo[i] = algoFwd;
        m_rglWorkspaceFwdSizes[i] = lWsSizeFwd;
        m_rgbwdFilterAlgo[i] = algoBwdFilter;
        m_rglWorkspaceBwdFilterSizes[i] = lWsSizeBwdFilter;
        m_rgbwdDataAlgo[i] = algoBwdData;
        m_rglWorkspaceBwdDataSizes[i] = lWsSizeBwdData;
    }

    // Reduce over all workspace sizes to get a maximum to allocate / reallocate.
    ulong lTotalWsFwd = 0;
    ulong lTotalWsBwdFilter = 0;
    ulong lTotalWsBwdData = 0;

    for (int i = 0; i < colBottom.Count; i++)
    {
        lTotalWsFwd = Math.Max(lTotalWsFwd, m_rglWorkspaceFwdSizes[i]);
        lTotalWsBwdFilter = Math.Max(lTotalWsBwdFilter, m_rglWorkspaceBwdFilterSizes[i]);
        lTotalWsBwdData = Math.Max(lTotalWsBwdData, m_rglWorkspaceBwdDataSizes[i]);
    }

    // Get the max over all operations.
    ulong lMaxWorkspace = Math.Max(lTotalWsFwd, Math.Max(lTotalWsBwdFilter, lTotalWsBwdData));

    // Ensure all groups have enough workspace.
    ulong lTotalMaxWorkspace = lMaxWorkspace * (ulong)m_nGroup * (ulong)CUDNN_STREAMS_PER_GROUP;

    // Initialize the workspace data.
    WorkspaceArgs wsArgs = getWorkspace();

    // This is the total amount of storage needed over all groups + streams.
    if (lTotalMaxWorkspace > wsArgs.Size)
    {
        setWorkspace(lTotalMaxWorkspace);
    }

    // If we succeed in the allocation, set the offsets for the workspaces.
    for (int g = 0; g < (m_nGroup * CUDNN_STREAMS_PER_GROUP); g++)
    {
        m_rglWorkspaceFwdOffsets[g] = (ulong)g * lTotalWsFwd;
        m_rglWorkspaceBwdFilterOffsets[g] = (ulong)g * lTotalWsBwdFilter;
        m_rglWorkspaceBwdDataOffsets[g] = (ulong)g * lTotalWsBwdData;
    }

    // Tensor descriptor for bias.
    if (m_bBiasTerm)
    {
        m_cuda.SetTensorDesc(m_hBiasDesc, 1, m_nNumOutput / m_nGroup, 1, 1, m_bUseHalfSize);
    }
}