/// <summary>
/// Adds <paramref name="vector"/> to this vector and returns the result as a
/// new <see cref="GpuVector"/>; neither operand is written to.
/// </summary>
/// <param name="vector">Right-hand operand; it is cast to <see cref="GpuVector"/>.</param>
/// <returns>A new vector wrapping the freshly allocated result buffer.</returns>
public IVector Add(IVector vector)
{
    Debug.Assert(IsValid && vector.IsValid);

    var rhs = (GpuVector)vector;
    Debug.Assert(rhs.Count == Count);

    // BLAS axpy computes y = alpha * x + y in place, so the result buffer is
    // first seeded with a copy of the right-hand operand.
    const float alpha = 1.0f;
    const int stride = 1;

    var sum = _cuda.Allocate(rhs._data.Size);
    sum.CopyToDevice(rhs._data);
    _cuda.Blas.Axpy(alpha, _data.DeviceVariable, stride, sum.DeviceVariable, stride);

    // NOTE(review): the trailing 'true' presumably marks the buffer as owned by
    // the new vector - confirm against the GpuVector constructor.
    return new GpuVector(_cuda, sum, true);
}
/// <summary>
/// Creates a <paramref name="rows"/> x <paramref name="columns"/> matrix whose
/// values come from <paramref name="init"/>, uploads them to the device and
/// registers the new instance with the provider.
/// </summary>
/// <param name="cuda">Provider used to allocate the buffer and register this instance.</param>
/// <param name="rows">Number of rows.</param>
/// <param name="columns">Number of columns.</param>
/// <param name="init">Callback invoked as <c>init(row, column)</c> for every cell.</param>
public GpuMatrix(CudaProvider cuda, int rows, int columns, Func<int, int, float> init)
{
    _cuda = cuda;
    _rows = rows;
    _columns = columns;

    var elementCount = rows * columns;
    var host = new float[elementCount];

    // The host buffer is filled column by column (column-major), so a running
    // offset visits exactly the slots that column * rows + row would address.
    var offset = 0;
    for (var column = 0; column < columns; column++)
    {
        for (var row = 0; row < rows; row++)
        {
            host[offset++] = init(row, column);
        }
    }

    _data = cuda.Allocate(elementCount);
    _data.CopyToDevice(host);
    cuda.Register(this);

#if DEBUG
    // Debug-only hook: break into the debugger when this instance's id matches
    // the allocation id being tracked.
    if (_id == _badAlloc)
    {
        Debugger.Break();
    }
#endif
}
/// <summary>
/// Converts this tensor into a matrix with <c>ColumnCount * RowCount</c> rows
/// and <c>Depth</c> columns, written into a newly allocated device buffer by
/// <c>TensorConvertToMatrix</c>.
/// </summary>
/// <returns>A new <see cref="GpuMatrix"/> wrapping the converted data.</returns>
public IMatrix ConvertToMatrix()
{
    Debug.Assert(IsValid);

    var matrixRows = ColumnCount * RowCount;
    var matrixColumns = Depth;
    var buffer = _cuda.Allocate(matrixRows * matrixColumns);

    // Single() requires the tensor info collection to hold exactly one entry.
    var source = _tensorInfo.Value.Single();
    _cuda.TensorConvertToMatrix(source, ColumnCount, RowCount, matrixRows, matrixColumns, buffer);

    return new GpuMatrix(_cuda, matrixRows, matrixColumns, buffer);
}
/// <summary>
/// Creates a vector of <paramref name="size"/> elements whose values come from
/// <paramref name="init"/>, uploads them to the device and registers the new
/// instance with the provider.
/// </summary>
/// <param name="cuda">Provider used to allocate the buffer and register this instance.</param>
/// <param name="size">Number of elements.</param>
/// <param name="init">Callback invoked as <c>init(index)</c> for every element, in ascending index order.</param>
public GpuVector(CudaProvider cuda, int size, Func<int, float> init)
{
    _cuda = cuda;

    // Build the values on the host first, then upload them in one copy.
    var host = new float[size];
    var index = 0;
    while (index < host.Length)
    {
        host[index] = init(index);
        index++;
    }

    _data = cuda.Allocate(size);
    _data.CopyToDevice(host);
    cuda.Register(this);

#if DEBUG
    // Debug-only hook: break into the debugger when this instance's id matches
    // the allocation id being tracked.
    if (_id == _badAlloc)
    {
        Debugger.Break();
    }
#endif
}
/// <summary>
/// Multiplies every depth slice of this tensor by <paramref name="matrix"/>
/// with a single strided-batched cuBLAS GEMM call and returns the products as
/// a new tensor of shape rows x matrix.ColumnCount x depth.
/// </summary>
/// <param name="matrix">
/// Right-hand operand; it is cast to <see cref="GpuMatrix"/>. Its row count
/// must equal this tensor's column count.
/// </param>
/// <returns>A new <see cref="Gpu3DTensor"/> holding one product per depth slice.</returns>
/// <exception cref="CudaBlasException">cuBLAS returned a status other than success.</exception>
public I3DTensor Multiply(IMatrix matrix)
{
    var other = (GpuMatrix)matrix;

    // cuBLAS works on raw device pointers and cannot detect a shape mismatch
    // itself: a wrong inner dimension would read outside the matrix buffer.
    // Check in debug builds, as the sibling Add methods do.
    System.Diagnostics.Debug.Assert(IsValid && matrix.IsValid);
    System.Diagnostics.Debug.Assert(matrix.RowCount == _columns);

    var ptr = _data.DevicePointer;
    int rowsA = _rows, columnsArowsB = _columns, columnsB = matrix.ColumnCount;

    // Passed by reference to the native call, so these cannot be constants.
    // alpha = 1 and beta = 0 give C = A * B with no accumulation into C.
    float alpha = 1.0f, beta = 0.0f;

    var output = new Gpu3DTensor(
        _cuda,
        _rows,
        columnsB,
        _depth,
        _cuda.Allocate(_rows * columnsB * _depth),
        true
    );

    var status = CudaBlasNativeMethods.cublasSgemmStridedBatched(
        _cuda.Blas.CublasHandle,
        Operation.NonTranspose,
        Operation.NonTranspose,
        rowsA,
        columnsB,
        columnsArowsB,
        ref alpha,
        ptr,
        rowsA,
        _blockSize,                 // stride between consecutive slices of this tensor
        other.Memory.DevicePointer,
        columnsArowsB,
        0,                          // stride 0: the same matrix is reused for every slice
        ref beta,
        output.Memory.DevicePointer,
        rowsA,
        _rows * columnsB,           // stride between consecutive output slices
        _depth                      // batch count: one GEMM per depth slice
    );

    if (status != CublasStatus.Success)
    {
        throw new CudaBlasException(status);
    }

    return output;
}
/// <summary>
/// Adds <paramref name="matrix"/> to this matrix and returns the result as a
/// new <see cref="GpuMatrix"/> with the same dimensions; neither operand is
/// written to.
/// </summary>
/// <param name="matrix">Right-hand operand; it is cast to <see cref="GpuMatrix"/>.</param>
/// <returns>A new matrix wrapping the freshly allocated result buffer.</returns>
public IMatrix Add(IMatrix matrix)
{
    Debug.Assert(IsValid && matrix.IsValid);

    var addend = (GpuMatrix)matrix;
    Debug.Assert(addend._rows == _rows && addend._columns == _columns);

    // Seed the result with a copy of the addend; BLAS axpy then accumulates
    // this matrix into it in place (y = alpha * x + y with alpha = 1).
    const float scale = 1.0f;
    const int increment = 1;

    var result = _cuda.Allocate(addend._data.Size);
    result.CopyToDevice(addend._data);
    _cuda.Blas.Axpy(scale, _data.DeviceVariable, increment, result.DeviceVariable, increment);

    return new GpuMatrix(_cuda, _rows, _columns, result);
}