/// <summary>
/// Runs a max pooling pass over this tensor on the GPU.
/// </summary>
/// <param name="filterWidth">Width of the pooling window</param>
/// <param name="filterHeight">Height of the pooling window</param>
/// <param name="xStride">Horizontal step between pooling windows</param>
/// <param name="yStride">Vertical step between pooling windows</param>
/// <param name="saveIndices">True to also return a tensor built from the index buffer produced by the pooling call</param>
/// <returns>
/// The pooled tensor and, when <paramref name="saveIndices"/> is true, a second tensor of the
/// same dimensions wrapping the index buffer; otherwise the second item is null.
/// </returns>
public (I3DTensor Result, I3DTensor Indices) MaxPool(int filterWidth, int filterHeight, int xStride, int yStride, bool saveIndices)
{
    Debug.Assert(IsValid);

    // NOTE(review): the literal 1 after _depth is presumably the number of tensors in the batch - confirm against TensorMaxPool
    var pooled = _cuda.TensorMaxPool(
        Memory,
        _rows,
        _columns,
        _depth,
        1,
        filterWidth,
        filterHeight,
        xStride,
        yStride,
        saveIndices
    );

    // both outputs share the pooled row/column counts and this tensor's depth
    var result = new Gpu3DTensor(_cuda, pooled.Rows, pooled.Columns, _depth, pooled.Data, true);
    var indexTensor = saveIndices
        ? new Gpu3DTensor(_cuda, pooled.Rows, pooled.Columns, _depth, pooled.Indices, true)
        : null;

    return (result, indexTensor);
}
/// <summary>
/// Multiplies the transpose of each depth slice of this tensor with the corresponding block of
/// the supplied 4D tensor, using a single strided batched cuBLAS SGEMM call.
/// </summary>
/// <param name="tensor">
/// The right hand operand; must be a <c>Gpu4DTensor</c>. In debug builds its Count is asserted
/// to equal this tensor's Depth.
/// </param>
/// <returns>A new tensor with _columns rows, tensor.Depth columns and the same depth as this tensor</returns>
/// <exception cref="CudaBlasException">Thrown when cuBLAS reports a status other than Success</exception>
public I3DTensor TransposeThisAndMultiply(I4DTensor tensor)
{
    var other = (Gpu4DTensor)tensor;
#if DEBUG
    Debug.Assert(tensor.Count == Depth && IsValid && other.IsValid);
#endif
    var leftPtr = Memory.DevicePointer;
    var rightPtr = other.Memory.DevicePointer;

    // each block of the 4D tensor is passed as a (RowCount * ColumnCount) x Depth matrix
    int rightColumns = other.Depth;
    int rightRows = other.RowCount * other.ColumnCount;
    int rightBlockSize = rightColumns * rightRows;
    int outputBlockSize = _columns * rightColumns;

    float alpha = 1.0f;
    float beta = 0.0f;

    var output = new Gpu3DTensor(
        _cuda,
        _columns,
        rightColumns,
        _depth,
        _cuda.Allocate(outputBlockSize * _depth),
        true
    );

    // One strided call covers all _depth slices. An earlier version of this method used
    // cublasSgemmBatched, which needed a device pointer list for each of the three operands.
    var status = CudaBlasNativeMethods.cublasSgemmStridedBatched(
        _cuda.Blas.CublasHandle,
        Operation.Transpose,
        Operation.NonTranspose,
        _columns,           // m
        rightColumns,       // n
        rightRows,          // k
        ref alpha,
        leftPtr,
        _rows,              // lda
        _blockSize,         // stride between slices of this tensor
        rightPtr,
        rightRows,          // ldb
        rightBlockSize,     // stride between blocks of the 4D tensor
        ref beta,
        output.Memory.DevicePointer,
        _columns,           // ldc
        outputBlockSize,    // stride between output slices
        _depth              // batch count
    );

    if (status == CublasStatus.Success)
    {
        return output;
    }
    throw new CudaBlasException(status);
}
/// <summary>
/// Multiplies every depth slice of this tensor by the same matrix, using a single strided
/// batched cuBLAS SGEMM call.
/// </summary>
/// <param name="matrix">The right hand operand; must be a <c>GpuMatrix</c></param>
/// <returns>A new tensor with _rows rows, matrix.ColumnCount columns and the same depth as this tensor</returns>
/// <exception cref="CudaBlasException">Thrown when cuBLAS reports a status other than Success</exception>
public I3DTensor Multiply(IMatrix matrix)
{
    // guard against handing a stale device pointer to native code, as the sibling tensor operations do
    Debug.Assert(IsValid);

    var other = (GpuMatrix)matrix;
    var ptr = Memory.DevicePointer;
    int rowsA = _rows, columnsArowsB = _columns, columnsB = matrix.ColumnCount;
    float alpha = 1.0f, beta = 0.0f;
    var output = new Gpu3DTensor(_cuda, _rows, columnsB, _depth, _cuda.Allocate(_rows * columnsB * _depth), true);

    var status = CudaBlasNativeMethods.cublasSgemmStridedBatched(
        _cuda.Blas.CublasHandle,
        Operation.NonTranspose,
        Operation.NonTranspose,
        rowsA,              // m
        columnsB,           // n
        columnsArowsB,      // k
        ref alpha,
        ptr,
        rowsA,              // lda
        _blockSize,         // stride between slices of this tensor
        other.Memory.DevicePointer,
        columnsArowsB,      // ldb
        0,                  // zero stride: every batch entry reads the same matrix
        ref beta,
        output.Memory.DevicePointer,
        rowsA,              // ldc
        _rows * columnsB,   // stride between output slices
        _depth              // batch count
    );
    if (status != CublasStatus.Success)
    {
        throw new CudaBlasException(status);
    }

    return output;
}