/// <summary>
/// Iteratively solves A*X = B with the conjugate gradient method, starting from the
/// values already stored in <paramref name="X"/> and overwriting them with the result.
/// </summary>
/// <param name="X">Initial guess on entry; refreshed to the device, updated in place, and refreshed back to the host before returning.</param>
/// <param name="A">System matrix; refreshed to the device before iterating.</param>
/// <param name="B">Right-hand side; its <c>Count</c> defines the problem size.</param>
/// <param name="maxiter">Upper bound on the number of iterations.</param>
/// <param name="eps">Tolerance: iteration stops once the squared residual norm drops below <c>eps * eps</c>.</param>
/// <remarks>
/// Operation semantics noted below (Fmsub, Saxpy, Multiply) follow the original inline
/// comments of this routine; the kernels themselves are defined elsewhere.
/// NOTE(review): R, P and AP are allocated here and never explicitly released; confirm
/// whether FloatResidentArray requires disposal.
/// </remarks>
public static void ConjugateGradient(FloatResidentArray X, SparseMatrix A, FloatResidentArray B, int maxiter, float eps)
{
    int N = (int)B.Count;
    FloatResidentArray R = new FloatResidentArray(N);   // residual
    FloatResidentArray P = new FloatResidentArray(N);   // search direction
    FloatResidentArray AP = new FloatResidentArray(N);  // A * P

    A.RefreshDevice();
    X.RefreshDevice();
    B.RefreshDevice();

    wrapper.Fmsub(R, B, A, X, N);                       // R = B - A*X
    wrapper.Copy(P, R, N);                              // P = R

    float tolerance = eps * eps;

    // <R|R> is carried from one iteration to the next instead of being recomputed:
    // R does not change between the end of an iteration and the start of the next one.
    float r = ScalarProd(R, R, N);

    // If the initial guess already satisfies the tolerance there is nothing to do.
    // Iterating anyway with R == 0 would evaluate alpha = 0 / 0 and write NaN into X.
    if (r >= tolerance)
    {
        int k = 0;
        while (k < maxiter)
        {
            wrapper.Multiply(AP, A, P, N);              // AP = A*P

            float pAp = ScalarProd(P, AP, N);           // <P|AP>
            // Exact-zero test on purpose: this only guards the division below.
            // A zero or NaN denominator means the iteration has broken down; stop
            // rather than poison X.
            if (pAp == 0.0F || float.IsNaN(pAp))
            {
                break;
            }

            float alpha = r / pAp;                      // alpha = <R|R> / <P|AP>
            wrapper.Saxpy(X, X, alpha, P, N);           // X = X + alpha*P
            wrapper.Saxpy(R, R, -alpha, AP, N);         // R = R - alpha*AP

            float rr = ScalarProd(R, R, N);             // new <R|R>
            if (rr < tolerance)
            {
                break;
            }

            float beta = rr / r;
            wrapper.Saxpy(P, R, beta, P, N);            // P = R + beta*P

            r = rr;
            ++k;
        }
    }

    X.RefreshHost();
}
/// <summary>
/// Sample entry point: fills a resident array with ones, invokes the wrapped
/// <c>Total</c> method on it through a CUDA <c>HybRunner</c>, and prints the first
/// element of the result buffer.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    const int ElementCount = 1024 * 1024 * 32;
    const int BlockWidth = 128;
    const int BlocksPerMultiprocessor = 16;

    // Host-side initialisation: every element is set to 1.
    FloatResidentArray input = new FloatResidentArray(ElementCount);
    for (int index = 0; index < ElementCount; ++index)
    {
        input[index] = 1.0F;
    }

    input.RefreshDevice();

    var runner = HybRunner.Cuda();

    // Size the launch from the properties of device 0.
    cudaDeviceProp deviceProperties;
    cuda.GetDeviceProperties(out deviceProperties, 0);

    // NOTE(review): the last argument is presumably the shared-memory size in bytes
    // (one float per thread of a block); confirm against HybRunner.SetDistrib.
    int gridWidth = BlocksPerMultiprocessor * deviceProperties.multiProcessorCount;
    int sharedBytes = BlockWidth * sizeof(float);
    runner.SetDistrib(gridWidth, 1, BlockWidth, 1, 1, sharedBytes);

    var wrapped = runner.Wrap(new Program());
    runner.saveAssembly();

    // Single-element buffer that receives the result.
    float[] result = new float[1];

    int status = (int)wrapped.Total(input, ElementCount, result);
    cuda.ERROR_CHECK((cudaError_t)status);
    cuda.ERROR_CHECK(cuda.DeviceSynchronize());

    Console.WriteLine(result[0]);
}
/// <summary>
/// Forwards <c>RefreshDevice</c> to each of the member arrays, in the order
/// <c>indices</c>, <c>data</c>, <c>rows</c>.
/// </summary>
/// <remarks>
/// NOTE(review): presumably this pushes the host-side contents of the matrix to the
/// GPU; confirm against the resident array implementation.
/// </remarks>
public void RefreshDevice()
{
    // Index array first,
    indices.RefreshDevice();

    // then the stored values,
    data.RefreshDevice();

    // and finally the row array.
    rows.RefreshDevice();
}