public static void Copy(FloatResidentArray res, FloatResidentArray src, int N)
{
    Parallel.For(0, N, (i) => { res[i] = src[i]; });
}
public static void Saxpy(FloatResidentArray res, FloatResidentArray x, float alpha, FloatResidentArray y, int N)
{
    // res = x + alpha * y
    Parallel.For(0, N, (i) => { res[i] = x[i] + alpha * y[i]; });
}
static void Main(string[] args)
{
    // configure CUDA
    cudaDeviceProp prop;
    cuda.GetDeviceProperties(out prop, 0);
    const int BLOCK_DIM = 256;
    // 16 blocks per SM, BLOCK_DIM threads per block, BLOCK_DIM * sizeof(float) bytes of shared memory per block
    runner = HybRunner.Cuda().SetDistrib(16 * prop.multiProcessorCount, 1, BLOCK_DIM, 1, 1, BLOCK_DIM * sizeof(float));
    wrapper = runner.Wrap(new Program());

    int size = 1000000; // very slow convergence with no preconditioner
    SparseMatrix A = SparseMatrix.Laplacian_1D(size);
    FloatResidentArray B = new FloatResidentArray(size);
    FloatResidentArray X = new FloatResidentArray(size);
    int maxiter = 1000;
    float eps = 1.0e-09f;

    for (int i = 0; i < size; ++i)
    {
        B[i] = 1.0f; // right-hand side
        X[i] = 0.0f; // starting point
    }

    ConjugateGradient(X, A, B, maxiter, eps);
}
public static void ConjugateGradient(FloatResidentArray X, SparseMatrix A, FloatResidentArray B, int maxiter, float eps)
{
    int N = (int)B.Count;
    FloatResidentArray R = new FloatResidentArray(N);
    FloatResidentArray P = new FloatResidentArray(N);
    FloatResidentArray AP = new FloatResidentArray(N);
    A.RefreshDevice();
    X.RefreshDevice();
    B.RefreshDevice();

    wrapper.Fmsub(R, B, A, X, N);                 // R = B - A*X
    wrapper.Copy(P, R, N);                        // P = R
    int k = 0;
    while (k < maxiter)
    {
        wrapper.Multiply(AP, A, P, N);            // AP = A*P
        float r = ScalarProd(R, R, N);            // save <R|R>
        float alpha = r / ScalarProd(P, AP, N);   // alpha = <R|R> / <P|AP>
        wrapper.Saxpy(X, X, alpha, P, N);         // X = X + alpha*P
        wrapper.Saxpy(R, R, -alpha, AP, N);       // R = R - alpha*AP
        float rr = ScalarProd(R, R, N);
        if (rr < eps * eps)
        {
            break;
        }

        float beta = rr / r;                      // beta = <R_new|R_new> / <R_old|R_old>
        wrapper.Saxpy(P, R, beta, P, N);          // P = R + beta*P
        ++k;
    }

    X.RefreshHost();
}
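The Fmsub kernel invoked at the top of ConjugateGradient (computing R = B - A*X) is not reproduced in this listing. A minimal sketch in the same Parallel.For style as Multiply and Saxpy might look like the following; this is an assumption for illustration, not the actual implementation.

public static void Fmsub(FloatResidentArray res, FloatResidentArray b, SparseMatrix m, FloatResidentArray x, int N)
{
    Parallel.For(0, N, (i) =>
    {
        // row i of A*x, using the same CSR traversal as Multiply
        float tmp = 0.0F;
        for (int j = m.rows[i]; j < m.rows[i + 1]; ++j)
        {
            tmp += x[m.indices[j]] * m.data[j];
        }

        res[i] = b[i] - tmp; // res = b - A*x
    });
}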
public SparseMatrix(Dictionary<int, float>[] from)
{
    // build CSR storage: rows[i] holds the offset of row i's first non-zero,
    // rows[from.Length] holds the total number of non-zeros
    rows = new IntResidentArray(from.Length + 1);
    List<int> _indices = new List<int>();
    List<float> _data = new List<float>();
    int colCounter = 0;
    int i = 0;
    for (; i < from.Length; ++i)
    {
        rows[i] = colCounter;
        foreach (var kvp in from[i])
        {
            _data.Add(kvp.Value);
            _indices.Add(kvp.Key);
            colCounter += 1;
        }
    }

    rows[i] = colCounter;
    indices = new IntResidentArray(_indices.Count);
    for (i = 0; i < indices.Count; ++i)
    {
        indices[i] = _indices[i];
    }

    data = new FloatResidentArray(_data.Count);
    for (i = 0; i < data.Count; ++i)
    {
        data[i] = _data[i];
    }
}
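SparseMatrix.Laplacian_1D, called from Main above, is also not listed. Assuming the usual [-1, 2, -1] finite-difference stencil, it can be built on top of this dictionary-based constructor roughly as follows (a sketch, not the original code).

public static SparseMatrix Laplacian_1D(int N)
{
    var rows = new Dictionary<int, float>[N];
    for (int i = 0; i < N; ++i)
    {
        rows[i] = new Dictionary<int, float>();
        if (i > 0) rows[i][i - 1] = -1.0F;      // lower off-diagonal
        rows[i][i] = 2.0F;                      // main diagonal
        if (i < N - 1) rows[i][i + 1] = -1.0F;  // upper off-diagonal
    }

    return new SparseMatrix(rows);
}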
public static void Total(FloatResidentArray a, int N, float[] total)
{
    var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x);
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int cacheIndex = threadIdx.x;

    // grid-stride loop: each thread accumulates its own partial sum
    float sum = 0f;
    while (tid < N)
    {
        sum = sum + a[tid];
        tid += blockDim.x * gridDim.x;
    }

    cache[cacheIndex] = sum;
    CUDAIntrinsics.__syncthreads();

    // tree reduction in shared memory (assumes blockDim.x is a power of two)
    int i = blockDim.x / 2;
    while (i != 0)
    {
        if (cacheIndex < i)
        {
            cache[cacheIndex] = cache[cacheIndex] + cache[cacheIndex + i];
        }

        CUDAIntrinsics.__syncthreads();
        i >>= 1;
    }

    // one atomic add per block accumulates the block result into total[0]
    if (cacheIndex == 0)
    {
        AtomicExpr.apply(ref total[0], cache[0], (x, y) => x + y);
    }
}
public SquareProblem(int N, int iter)
{
    _N = N;
    _h = 1.0F / (float)_N;
    _invIter = 1.0F / (float)iter;
    _inner = new FloatResidentArray((N - 1) * (N - 1));
    _iter = iter;
}
public static void Multiply(FloatResidentArray res, SparseMatrix m, FloatResidentArray v, int N)
{
    Parallel.For(0, N, (i) =>
    {
        // CSR sparse matrix-vector product: one row of m per iteration
        int rowless = m.rows[i];
        int rowup = m.rows[i + 1];
        float tmp = 0.0F;
        for (int j = rowless; j < rowup; ++j)
        {
            tmp += v[m.indices[j]] * m.data[j];
        }

        res[i] = tmp;
    });
}
public static void ScalarProd(int N, FloatResidentArray a, FloatResidentArray b, float[] result)
{
    var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x);
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int cacheIndex = threadIdx.x;

    // grid-stride loop: per-thread partial dot product
    float tmp = 0.0F;
    while (tid < N)
    {
        tmp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    cache[cacheIndex] = tmp;
    CUDAIntrinsics.__syncthreads();

    // tree reduction in shared memory
    int i = blockDim.x / 2;
    while (i != 0)
    {
        if (cacheIndex < i)
        {
            cache[cacheIndex] += cache[cacheIndex + i];
        }

        CUDAIntrinsics.__syncthreads();
        i >>= 1;
    }

    if (cacheIndex == 0)
    {
        AtomicAdd(ref result[0], cache[0]);
    }
}
private static void ScalarProd(float[] result, FloatResidentArray r1, FloatResidentArray r2, int N)
{
    var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x);
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int cacheIndex = threadIdx.x;

    float tmp = 0.0F;
    while (tid < N)
    {
        tmp += r1[tid] * r2[tid];
        tid += blockDim.x * gridDim.x;
    }

    cache[cacheIndex] = tmp;
    CUDAIntrinsics.__syncthreads();

    int i = blockDim.x / 2;
    while (i != 0)
    {
        if (cacheIndex < i)
        {
            cache[cacheIndex] += cache[cacheIndex + i];
        }

        CUDAIntrinsics.__syncthreads();
        i >>= 1;
    }

    if (cacheIndex == 0)
    {
        AtomicExpr.apply(ref result[0], cache[0], (x, y) => x + y);
    }
}
static void Main(string[] args)
{
    const int N = 1024 * 1024 * 32;
    FloatResidentArray arr = new FloatResidentArray(N);
    float[] res = new float[1];
    for (int i = 0; i < N; ++i)
    {
        arr[i] = 1.0F;
    }

    arr.RefreshDevice();

    var runner = HybRunner.Cuda();
    cudaDeviceProp prop;
    cuda.GetDeviceProperties(out prop, 0);
    runner.SetDistrib(16 * prop.multiProcessorCount, 1, 128, 1, 1, 128 * sizeof(float));
    var wrapped = runner.Wrap(new Program());
    runner.saveAssembly();

    cuda.ERROR_CHECK((cudaError_t)(int)wrapped.Total(arr, N, res));
    cuda.ERROR_CHECK(cuda.DeviceSynchronize());
    Console.WriteLine(res[0]);
}
public static float ScalarProd(FloatResidentArray X, FloatResidentArray Y, int N)
{
    return inner_scalar_prod((float*)X.DevicePointer, (float*)Y.DevicePointer, N);
    //return ParallelEnumerable.Range(0, N).Sum(i => X[i] * Y[i]);
}