/// <summary>Copies the contents of the source matrix into the destination matrix</summary>
/// <param name="Src">Matrix whose values are read</param>
/// <param name="Dst">Matrix whose values are overwritten</param>
public static void CopyMatrix(floatMatrix Src, floatMatrix Dst)
{
    bool useDevice = CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL;

    if (!useDevice)
    {
        //Host path: plain element-by-element copy of the flat value arrays
        int idx = 0;
        while (idx < Src.Values.Length)
        {
            Dst.Values[idx] = Src.Values[idx];
            idx++;
        }
        return;
    }

    //Device path: one work-item per element of the source buffer
    CLCalc.Program.MemoryObject[] copyArgs = new CLCalc.Program.MemoryObject[] { Src.CLValues, Dst.CLValues };
    kernelCopyBuffer.Execute(copyArgs, Src.CLValues.OriginalVarLength);
}
/// <summary>Computes A*inv(H)*A' and stores result in ans</summary>
/// <param name="A">A matrix, mxn</param>
/// <param name="H">H matrix, nxn</param>
/// <param name="ans">Answer, mxm. Created if null</param>
/// <param name="temp">Temporary matrix for the operation, size nxm. Receives the result of H.LinearSolve(A)</param>
/// <param name="refine">Refine linear system solution?</param>
/// <returns>The same instance referenced by <paramref name="ans"/></returns>
/// <exception cref="Exception">Thrown when H is not nxn or ans is not mxm</exception>
public static floatSymPosDefMatrix ComputeAinvHTranspA(floatMatrix A, floatSymPosDefMatrix H, ref floatSymPosDefMatrix ans, ref floatMatrix temp, bool refine)
{
    int m = A.Rows;
    int n = A.Cols;

    if (ans == null) ans = new floatSymPosDefMatrix(m);
    if (H.getN != n) throw new Exception("Matrix sizes not compatible");
    if (ans.getN != m) throw new Exception("Answer size not compatible");

    //Solves the linear system with A as right hand side; result goes to temp
    H.LinearSolve(A, refine, ref temp);

    //Go on to multiplying A*temp
    if (CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL)
    {
        //One work-item per element of the packed lower triangle
        kernelComputeAinvHAt.Execute(new CLCalc.Program.MemoryObject[] { A.CLValues, A.CLDim, temp.CLValues, ans.CLValues }, (m * (m + 1)) >> 1);
    }
    else
    {
        for (int p = 0; p < m; p++)
        {
            int np = n * p;
            for (int q = 0; q <= p; q++)
            {
                int nq = n * q;
                float val = 0;
                for (int k = 0; k < n; k++)
                {
                    val += A.Values[k + np] * temp.Values[k + nq];
                }
                //Packed lower-triangular storage: element (p,q) lives at p(p+1)/2 + q
                ans.Values[((p * (1 + p)) >> 1) + q] = val;
            }
        }
    }

    //The values of ans were just rewritten on either path, so any previous
    //Cholesky factorization of ans no longer matches its contents
    ans.IsCholeskyFactorized = false;

    return ans;
}
/// <summary>Product of a symmetric positive definite matrix with a matrix transpose, Msym*At. ans is constructed when it is null</summary>
/// <param name="M">Symmetric positive definite matrix</param>
/// <param name="A">Matrix to multiply</param>
/// <param name="ans">Answer. If null, gets created with dimensions [A.Cols, M.getN]</param>
/// <returns>The same instance referenced by <paramref name="ans"/></returns>
public static floatMatrix SymPosDefMatrMatrMultiply(floatSymPosDefMatrix M, floatMatrix A, ref floatMatrix ans)
{
    if (ans == null) ans = new floatMatrix(new float[A.Cols, M.getN]);

    bool useDevice = CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL;
    if (!useDevice)
    {
        //Host path
        for (int col = 0; col < A.Cols; col++)
        {
            for (int row = 0; row < M.getN; row++)
            {
                float acc = 0;
                for (int inner = 0; inner < M.getN; inner++)
                {
                    acc += M[row, inner] * A.Values[inner + col * A.Rows];
                }
                ans.Values[row + col * A.Rows] = acc;
            }
        }
        return ans;
    }

    //Device path: upload M first if its device copy is out of date
    if (!M.IsMatrixInClMemoryUpdated) M.CLValues.WriteToDevice(M.Values);

    CLCalc.Program.MemoryObject[] kernelArgs = new CLCalc.Program.MemoryObject[] { M.CLValues, A.CLValues, ans.CLValues };
    int[] globalSize = new int[] { A.Rows, A.Cols };
    kernelSymMatrMatrMultiply.Execute(kernelArgs, globalSize);

    return ans;
}
/// <summary>Sums the components of a matrix. The OpenCL path uses a reduction kernel launched with an explicit local work size</summary>
/// <param name="CLv">Matrix whose components should be summed</param>
/// <returns>Sum of all elements of <paramref name="CLv"/></returns>
public static float SumMatrixElements(floatMatrix CLv)
{
    if (CLCalc.CLAcceleration != CLCalc.CLAccelerationType.UsingCL)
    {
        //Host path: accumulate in double and round once at the end
        double sum = 0;
        for (int i = 0; i < CLv.Values.Length; i++) sum += CLv.Values[i];
        return (float)sum;
    }

    /*
     The idea here is to create a reduction in which the access pattern to the vectors is coalesced.
     The first step is to reduce the number of non-summed items to a multiple of NWORKITEMS and then
     coalesce the access
     */
    int LOCALWORKSIZE = Math.Min(256, (int)CLCalc.Program.CommQueues[CLCalc.Program.DefaultCQ].Device.MaxWorkGroupSize);
    int NWORKITEMS = 16 * LOCALWORKSIZE;
    int n = CLv.Values.Length;

    float[] resps = new float[NWORKITEMS];

    //Device buffers for partial sums and element count are created once and kept on the matrix
    if (CLv.CLresps == null)
    {
        CLv.CLresps = new CLCalc.Program.Variable(resps);
        CLv.CLn = new CLCalc.Program.Variable(new int[1]);
    }

    CLv.CLn.WriteToDevice(new int[] { n });
    CLCalc.Program.Variable[] args = new CLCalc.Program.Variable[] { CLv.CLValues, CLv.CLresps, CLv.CLn };

    //Write n = k*NWORKITEMS + p. Preprocess to eliminate p`s and leave summation only to a multiple of NWORKITEMS
    int k = n / NWORKITEMS;
    int p = n - k * NWORKITEMS;

    //Clears partial responses
    kernelClear.Execute(args, NWORKITEMS);

    //Sums the p last elements into the p first elements.
    //When n is an exact multiple of NWORKITEMS there is no remainder; a launch with a
    //global size of zero is rejected by OpenCL 1.x/2.0 implementations, so skip it
    if (p > 0) kernelPreSum.Execute(args, p);

    //Use CLn to inform each work-item its workload. Each one will access and sum k numbers
    CLv.CLn.WriteToDevice(new int[] { k });
    kernelCoalLocalSum.Execute(args, new int[] { NWORKITEMS }, new int[] { LOCALWORKSIZE });

    CLv.CLresps.ReadFromDeviceTo(resps);

    //Serial part: add the first NWORKITEMS / LOCALWORKSIZE partial results
    float resp = 0;
    int imax = NWORKITEMS / LOCALWORKSIZE;
    for (int i = 0; i < imax; i++) resp += resps[i];

    return resp;
}
/// <summary>Computes M*(alpha*v) + beta*u. Allocates ans when it is null</summary>
/// <param name="M">Matrix</param>
/// <param name="v">Vector multiplied by M; length must equal M.Cols</param>
/// <param name="alpha">Scale applied to v</param>
/// <param name="u">Vector added to the product; length must equal M.Rows</param>
/// <param name="beta">Scale applied to u</param>
/// <param name="ans">Answer, length M.Rows. If null, gets created</param>
/// <returns>The same instance referenced by <paramref name="ans"/></returns>
public static floatVector MatrVecProdSumVec(floatMatrix M, floatVector v, float alpha, floatVector u, float beta, ref floatVector ans)
{
    if (ans != null && ans.Length != M.Rows) throw new Exception("ans length should match M rows");
    if (v.Length != M.Cols) throw new Exception("v length should match M cols");
    if (u.Length != M.Rows) throw new Exception("u length should match M rows");

    if (ans == null) ans = new floatVector(new float[M.Rows]);

    if (CLCalc.CLAcceleration != CLCalc.CLAccelerationType.UsingCL)
    {
        //Host path
        for (int row = 0; row < M.Rows; row++)
        {
            float acc = 0;
            for (int col = 0; col < M.Cols; col++)
            {
                acc += M[row, col] * v.Values[col] * alpha;
            }
            ans.Values[row] = acc + beta * u.Values[row];
        }
        return ans;
    }

    //Device path: each vector carries its own coefficient buffer
    v.CLCoef.WriteToDevice(new float[] { alpha });
    u.CLCoef.WriteToDevice(new float[] { beta });

    CLCalc.Program.MemoryObject[] kernelArgs = new CLCalc.Program.MemoryObject[]
    {
        M.CLValues, M.CLDim, v.CLValues, v.CLCoef, u.CLValues, u.CLCoef, ans.CLValues
    };
    kernelMatrVecProdSumVec.Execute(kernelArgs, M.Rows);

    return ans;
}
/// <summary>Computes transpose(A)*diag(W)*b*alpha</summary>
/// <param name="A">Original matrix</param>
/// <param name="W">Measurement weight vector; one weight per row of A</param>
/// <param name="b">Vector to multiply; one entry per row of A</param>
/// <param name="alpha">Multiplication constant</param>
/// <param name="ans">Answer, one entry per column of A. If null, gets created</param>
/// <returns>The same instance referenced by <paramref name="ans"/></returns>
public static floatVector MatrTraspVecMult(floatMatrix A, floatDiag W, floatVector b, float alpha, ref floatVector ans)
{
    int rowCount = A.Rows;
    int colCount = A.Cols;

    if (ans == null) ans = new floatVector(new float[colCount]);

    if (rowCount != W.Rows) throw new Exception("Incompatible A and W dimensions");
    if (rowCount != b.Length) throw new Exception("Incompatible A and b dimensions");
    if (colCount != ans.Length) throw new Exception("Incompatible A and ans dimensions");

    if (CLCalc.CLAcceleration != CLCalc.CLAccelerationType.UsingCL)
    {
        //Host path: accumulate each output entry in double, round once when storing
        for (int col = 0; col < colCount; col++)
        {
            double acc = 0;
            for (int row = 0; row < rowCount; row++)
            {
                acc += A[row, col] * b.Values[row] * W.Values[row] * alpha;
            }
            ans.Values[col] = (float)acc;
        }
        return ans;
    }

    //Device path: alpha travels in b's coefficient buffer
    b.CLCoef.WriteToDevice(new float[] { alpha });

    CLCalc.Program.MemoryObject[] kernelArgs = new CLCalc.Program.MemoryObject[]
    {
        A.CLValues, A.CLDim, b.CLValues, b.CLCoef, W.CLValues, ans.CLValues
    };
    kernelTranspMatrVecProdW.Execute(kernelArgs, colCount);

    return ans;
}
/// <summary>Computes transpose(A)*diag(W)*b, i.e. the scaled overload with a multiplication constant of 1</summary>
/// <param name="A">Original matrix</param>
/// <param name="W">Measurement weight vector</param>
/// <param name="b">Vector to multiply</param>
/// <param name="ans">Answer. If null, gets created</param>
/// <returns>The same instance referenced by <paramref name="ans"/></returns>
public static floatVector MatrTraspVecMult(floatMatrix A, floatDiag W, floatVector b, ref floatVector ans)
{
    //Neutral multiplier: forwards to the overload that takes alpha
    const float noScaling = 1.0f;
    return MatrTraspVecMult(A, W, b, noScaling, ref ans);
}
/// <summary>Solves system Ax = b and returns x, where b is a right hand side matrix</summary>
/// <param name="b">Right hand side matrix</param>
/// <param name="refine">Refine solution? Recommended: true</param>
/// <returns>Solution copied into a new two-dimensional array with the dimensions of the solved floatMatrix</returns>
public float[,] LinearSolve(float[,] b, bool refine)
{
    floatMatrix CLbb = new floatMatrix(b);
    floatMatrix resp = null;

    //Forward the caller's choice; previously a hard-coded true was passed and refine was ignored
    LinearSolve(CLbb, refine, ref resp);

    //When running on the device the host copy must be refreshed before it is read
    if (CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL)
    {
        resp.CLValues.ReadFromDeviceTo(resp.Values);
    }

    float[,] vResp = new float[resp.Rows, resp.Cols];
    for (int i = 0; i < resp.Rows; i++)
    {
        for (int j = 0; j < resp.Cols; j++)
        {
            vResp[i, j] = resp[i, j];
        }
    }

    return vResp;
}
/// <summary>Computes transpose(A)*diag(W)*A using OpenCL. Lambda is the regularization term passed to the kernel</summary>
/// <param name="A">Original matrix</param>
/// <param name="W">Measurement weight vector. Null means unweighted (unit weights), matching the non-OpenCL implementation</param>
/// <param name="lambda">Regularization term</param>
/// <param name="AtA">Answer. Created (or re-created) when null or when its device buffer length does not match A.Cols*(A.Cols+1)/2</param>
/// <returns>The same instance referenced by <paramref name="AtA"/></returns>
private static floatSymPosDefMatrix AuxLSAtACL(floatMatrix A, floatDiag W, floatVector lambda, ref floatSymPosDefMatrix AtA)
{
    //Number of elements of the packed triangular storage
    int packedLength = (A.Cols * (A.Cols + 1)) >> 1;

    if (AtA == null || AtA.CLValues.OriginalVarLength != packedLength)
    {
        AtA = new floatSymPosDefMatrix(new float[packedLength]);
    }

    //The public unweighted overload passes W = null. The kernel needs a weight buffer,
    //so substitute unit weights instead of dereferencing null
    if (W == null)
    {
        float[] unitWeights = new float[A.Rows];
        for (int i = 0; i < unitWeights.Length; i++) unitWeights[i] = 1;
        W = new floatDiag(unitWeights);
    }

    CLCalc.Program.Variable[] args = new CLCalc.Program.Variable[] { A.CLValues, A.CLDim, W.CLValues, AtA.CLValues, lambda.CLValues };
    kernelComputeAtWA.Execute(args, AtA.CLValues.OriginalVarLength);

    //Just modified values in CL memory, matrix is no longer Cholesky factorized
    AtA.IsCholeskyFactorized = false;

    return AtA;
}
/// <summary>Computes transpose(A)*A without measurement weights</summary>
/// <param name="A">Original matrix</param>
/// <param name="lambda">Regularization term</param>
/// <param name="AtA">Answer, A transpose times A</param>
/// <returns>Result of the weighted overload called with a null weight vector</returns>
public static floatSymPosDefMatrix MatrTranspMatrProd(floatMatrix A, floatVector lambda, ref floatSymPosDefMatrix AtA)
{
    //No weight vector: the weighted overload receives null
    floatDiag noWeights = null;
    return MatrTranspMatrProd(A, noWeights, lambda, ref AtA);
}
/// <summary>Computes the linear combination alpha*u+beta*v and puts the answer in ans. Creates ans if it is null</summary>
/// <param name="alpha">Scale applied to u</param>
/// <param name="u">First matrix</param>
/// <param name="beta">Scale applied to v</param>
/// <param name="v">Second matrix; same dimensions as u</param>
/// <param name="ans">Answer; same dimensions as u. If null, gets created</param>
/// <returns>The same instance referenced by <paramref name="ans"/></returns>
public static floatMatrix LinearCombination(float alpha, floatMatrix u, float beta, floatMatrix v, ref floatMatrix ans)
{
    if (ans == null) ans = new floatMatrix(new float[u.Rows, u.Cols]);

    bool operandsMatch = u.Rows == v.Rows && u.Cols == v.Cols;
    if (!operandsMatch) throw new Exception("Incompatible dimensions");

    bool answerMatches = ans.Rows == u.Rows && ans.Cols == u.Cols;
    if (!answerMatches) throw new Exception("Ans dimension should be equal to vectors dimension");

    if (CLCalc.CLAcceleration != CLCalc.CLAccelerationType.UsingCL)
    {
        //Host path
        for (int idx = 0; idx < u.Values.Length; idx++)
        {
            ans.Values[idx] = alpha * u.Values[idx] + beta * v.Values[idx];
        }
        return ans;
    }

    //Device path: each operand's coefficient is written to that operand's own CLCoef buffer.
    //NOTE(review): if u and v are the same instance the second write would replace alpha
    //with beta before the kernel runs - confirm callers never alias the operands
    u.CLCoef.WriteToDevice(new float[] { alpha });
    v.CLCoef.WriteToDevice(new float[] { beta });

    CLCalc.Program.MemoryObject[] kernelArgs = new CLCalc.Program.MemoryObject[]
    {
        u.CLCoef, v.CLCoef, u.CLValues, v.CLValues, ans.CLValues
    };
    kernelLinearComb.Execute(kernelArgs, u.Values.Length);

    return ans;
}
/// <summary>Solves system Ax = b' on the host using the stored Cholesky factor, one right hand side per row of bb</summary>
/// <param name="bb">b Matrix; each row is treated as one right hand side of length N</param>
/// <param name="resp">Answer. If null, gets created with dimensions [bb.Cols, bb.Rows]</param>
private void linsolveMatrix(floatMatrix bb, ref floatMatrix resp)
{
    //Work on a copy so the caller's right hand sides are left untouched
    float[] rhs = (float[])bb.Values.Clone();
    float[] fwd = new float[bb.Values.Length];

    if (resp == null) resp = new floatMatrix(new float[bb.Cols, bb.Rows]);

    for (int sys = 0; sys < bb.Rows; sys++)
    {
        //Start of this right hand side inside the flat arrays
        int shift = sys * N;

        //Forward substitution
        for (int row = 0; row < N; row++)
        {
            //cholDec is packed by rows of the triangle: element (r,c) is at r(r+1)/2 + c
            int diag = ((row * (row + 1)) >> 1) + row;
            fwd[row + shift] = rhs[row + shift] / cholDec[diag];

            for (int below = row + 1; below < N; below++)
            {
                rhs[below + shift] -= cholDec[((below * (below + 1)) >> 1) + row] * fwd[row + shift];
            }
        }

        //Backward substitution
        for (int row = N - 1; row >= 0; row--)
        {
            int rowStart = (row * (row + 1)) >> 1;
            resp.Values[row + shift] = fwd[row + shift] / cholDec[rowStart + row];

            for (int col = 0; col < row; col++)
            {
                fwd[col + shift] -= cholDec[rowStart + col] * resp.Values[row + shift];
            }
        }
    }
}
/// <summary>Forward/backward substitutes to solve a linear system with a matrix right hand side. Falls back to the host implementation when OpenCL is not in use</summary>
/// <param name="M">Right hand sides; M.Rows is the number of systems solved</param>
/// <param name="resp">Answer. If null, gets created with dimensions [M.Cols, M.Rows] (same rule as the host implementation)</param>
private void LinsolveCLMatrix(floatMatrix M, ref floatMatrix resp)
{
    //number of RHS as multiple of SUBMATRIXSIZE
    int nRHSMult = M.Rows / SUBMATRIXSIZE;
    int nRHSleftOver = M.Rows - SUBMATRIXSIZE * nRHSMult;

    if (CLCalc.CLAcceleration != CLCalc.CLAccelerationType.UsingCL)
    {
        linsolveMatrix(M, ref resp);
        return;
    }

    //The host fallback allocates a missing answer matrix; do the same here so the
    //final copy into resp.CLValues cannot dereference null
    if (resp == null) resp = new floatMatrix(new float[M.Cols, M.Rows]);

    //Copy elements to CLb. Work buffers are (re)created only when too small
    if (CLb == null || CLb.OriginalVarLength < M.Values.Length)
    {
        CLb = new CLCalc.Program.Variable(M.Values);
        CLy = new CLCalc.Program.Variable(M.Values);
    }
    kernelCopyBuffer.Execute(new CLCalc.Program.MemoryObject[] { M.CLValues, CLb }, M.Values.Length);

    int nEqs = M.Rows;
    CLCalc.Program.Variable[] args = new CLCalc.Program.Variable[] { CLcholDec, CLy, CLb, CLoffSet, CLn };
    int[] offset = new int[1];

    //Forward substitution, one diagonal block of up to SUBMATRIXSIZE rows at a time
    int i;
    for (i = 0; i < N; i += SUBMATRIXSIZE)
    {
        offset[0] = i;
        CLoffSet.WriteToDevice(offset);

        //Last block may be smaller than SUBMATRIXSIZE
        int size = Math.Min(SUBMATRIXSIZE, N - i);
        kernelFwdUpperBackSubs.Execute(args, new int[] { size, nEqs }, new int[] { size, 1 });

        //propagation to the rows below the current block
        if (i + SUBMATRIXSIZE < N)
        {
            if (nRHSMult > 0)
                kernelFwdPropag.Execute(args, new int[] { N - i - SUBMATRIXSIZE, nRHSMult * SUBMATRIXSIZE }, new int[] { 1, SUBMATRIXSIZE });

            //leftovers (not multiples of SUBMATRIXSIZE)
            if (nRHSleftOver > 0)
                kernelFwdPropag2.Execute(args, new int[] { N - i - SUBMATRIXSIZE, nRHSleftOver }, new int[] { 1, nRHSleftOver }, new int[] { 0, nRHSMult * SUBMATRIXSIZE });
        }
    }

    //Backward subst. Stores answer in CLb (CLb and CLy swap places in the argument list)
    args = new CLCalc.Program.Variable[] { CLcholDec, CLb, CLy, CLoffSet, CLn };

    //Backward substitution, full blocks counted from the last row upwards
    for (i = N - SUBMATRIXSIZE; i >= 0; i -= SUBMATRIXSIZE)
    {
        offset[0] = i;
        CLoffSet.WriteToDevice(offset);

        int size = SUBMATRIXSIZE;
        kernelBkLowerBackSubs.Execute(args, new int[] { size, nEqs }, new int[] { size, 1 });

        if (i > 0)
        {
            //Propagation using __local storage
            if (nRHSMult > 0)
                kernelBackPropag.Execute(args, new int[] { i, nRHSMult * SUBMATRIXSIZE }, new int[] { 1, SUBMATRIXSIZE });

            //leftovers (not multiples of SUBMATRIXSIZE)
            if (nRHSleftOver > 0)
                kernelBackPropag2.Execute(args, new int[] { i, nRHSleftOver }, new int[] { 1, nRHSleftOver }, new int[] { 0, nRHSMult * SUBMATRIXSIZE });
        }
    }

    //After the loop i is negative; SUBMATRIXSIZE + i is the number of top rows not yet
    //covered by a full block (zero when N is a multiple of SUBMATRIXSIZE)
    if (SUBMATRIXSIZE + i > 0)
    {
        offset[0] = 0;
        CLoffSet.WriteToDevice(offset);
        kernelBkLowerBackSubs.Execute(args, new int[] { SUBMATRIXSIZE + i, nEqs }, new int[] { SUBMATRIXSIZE + i, 1 });
    }

    kernelCopyBuffer.Execute(new CLCalc.Program.Variable[] { CLb, resp.CLValues }, resp.Values.Length);
}
/// <summary>Solves system A*invHAt = Mt and returns invHAt, solving the system per right hand side. Refine may considerably slow the method.</summary>
/// <param name="M">Right-hand-side of linear system; M.Cols must equal the dimension of this matrix</param>
/// <param name="refine">Refine solution? Recommended: true</param>
/// <param name="invHAt">Answer. If null, gets created with dimensions [N, M.Rows]</param>
/// <returns>The same instance referenced by <paramref name="invHAt"/></returns>
public floatMatrix LinearSolve(floatMatrix M, bool refine, ref floatMatrix invHAt)
{
    if (invHAt == null) invHAt = new floatMatrix(new float[this.N, M.Rows]);

    if (this.N != M.Cols) throw new Exception("Dimensions not compatible");

    bool answerShapeOk = invHAt.Rows == this.N && invHAt.Cols == M.Rows;
    if (!answerShapeOk) throw new Exception("Invalid matrix dimensions for invHAt");

    if (!this.IsCholeskyFactorized) ComputeCholesky();

    //The initial solve runs on the host, so bring the right hand side and the factor back first
    if (CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL)
    {
        M.CLValues.ReadFromDeviceTo(M.Values);
        this.CLcholDec.ReadFromDeviceTo(this.cholDec);
    }

    linsolveMatrix(M, ref invHAt);

    //Publish the host solution to the device copy
    if (CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL)
    {
        invHAt.CLValues.WriteToDevice(invHAt.Values);
    }

    //TO DO: OpenCL fwd/bksubs for the initial solve

    if (!refine) return invHAt;

    //Scratch matrices for the refinement are kept as members and rebuilt when the size changes
    bool scratchUsable = matResidues != null && matResidues.Values.Length == M.Values.Length;
    if (!scratchUsable)
    {
        matResidues = new floatMatrix(new float[M.Rows, M.Cols]);
        matMx = new floatMatrix(new float[M.Rows, M.Cols]);
        matDeltax = new floatMatrix(new float[M.Rows, M.Cols]);
        matResiduesAbs = new floatMatrix(new float[M.Rows, M.Cols]);
    }

    double meanResidue = 0;
    int pass = 0;
    while (pass < 8 && !double.IsNaN(meanResidue))
    {
        //Residue = this*invHAt - M
        BLAS.SymPosDefMatrMatrMultiply(this, invHAt, ref matMx);
        BLAS.LinearCombination(1, matMx, -1, M, ref matResidues);

        //Element-wise absolute value of the residues
        if (CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL)
        {
            kernelElemWiseAbs.Execute(new CLCalc.Program.MemoryObject[] { matResidues.CLValues, matResiduesAbs.CLValues }, M.Values.Length);
        }
        else
        {
            for (int idx = 0; idx < M.Values.Length; idx++)
            {
                matResiduesAbs.Values[idx] = Math.Abs(matResidues.Values[idx]);
            }
        }

        meanResidue = matResiduesAbs.Sum() / (double)N;

        //Small enough: make this the last pass. The correction below is still applied once more
        if (meanResidue < 1E-5) pass = 8;

        //Solve for the correction and subtract it from the current solution
        LinsolveCLMatrix(matResidues, ref matDeltax);

        if (CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL)
        {
            kernelInPlaceSubtract.Execute(new CLCalc.Program.MemoryObject[] { invHAt.CLValues, matDeltax.CLValues }, M.Values.Length);
        }
        else
        {
            for (int idx = 0; idx < M.Values.Length; idx++)
            {
                invHAt.Values[idx] -= matDeltax.Values[idx];
            }
        }

        pass++;
    }

    return invHAt;
}
/// <summary>Computes transpose(A)*diag(W)*A plus the regularization term on the diagonal, without OpenCL</summary>
/// <param name="A">Original matrix, mxn</param>
/// <param name="W">Measurement weight vector, one weight per row of A. Null means unweighted</param>
/// <param name="lambda">Regularization term, added to the diagonal</param>
/// <param name="AtA">Answer, nxn in packed triangular storage. Created (or re-created) when null or when its length is not n(n+1)/2</param>
/// <returns>The same instance referenced by <paramref name="AtA"/></returns>
private static floatSymPosDefMatrix AuxLeastSquaresAtAnoCL(floatMatrix A, floatDiag W, floatVector lambda, ref floatSymPosDefMatrix AtA)
{
    //A (mxn), AtA (nxn) positive semidef symmetric
    int m = A.Rows;
    int n = A.Cols;
    int packedLength = (n * (n + 1)) >> 1;

    //Same allocation rule as the OpenCL implementation: a wrongly sized answer is replaced
    //instead of being indexed out of range
    if (AtA == null || AtA.Values.Length != packedLength)
    {
        AtA = new floatSymPosDefMatrix(new float[packedLength]);
    }

    bool weighted = W != null;

    for (int i = 0; i < n; i++)
    {
        int rowStart = (i * (i + 1)) >> 1;
        for (int j = 0; j <= i; j++)
        {
            //Accumulate in double, round once when storing
            double val = 0;
            if (weighted)
            {
                for (int k = 0; k < m; k++)
                {
                    val += A[k, i] * A[k, j] * W.Values[k];
                }
            }
            else
            {
                for (int k = 0; k < m; k++)
                {
                    val += A[k, i] * A[k, j];
                }
            }
            AtA.Values[rowStart + j] = (float)val;
        }
    }

    //regularization term
    for (int i = 0; i < n; i++)
    {
        AtA.Values[((i * (i + 1)) >> 1) + i] += lambda.Values[i];
    }

    //Values were just rewritten, so a previous Cholesky factorization no longer applies
    //(the OpenCL implementation resets the flag the same way)
    AtA.IsCholeskyFactorized = false;

    return AtA;
}
/// <summary>Computes transpose(A)*A using weights W, dispatching to the OpenCL or the host implementation</summary>
/// <param name="A">Original matrix</param>
/// <param name="W">Measurement weight vector</param>
/// <param name="lambda">Regularization term</param>
/// <param name="AtA">Answer, A transpose times A</param>
/// <returns>Whatever the selected implementation returns for <paramref name="AtA"/></returns>
public static floatSymPosDefMatrix MatrTranspMatrProd(floatMatrix A, floatDiag W, floatVector lambda, ref floatSymPosDefMatrix AtA)
{
    bool useDevice = CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL;

    return useDevice
        ? AuxLSAtACL(A, W, lambda, ref AtA)
        : AuxLeastSquaresAtAnoCL(A, W, lambda, ref AtA);
}
/// <summary>Computes A*B' = A*transpose(B) and stores result in ans</summary>
/// <param name="A">Matrix A</param>
/// <param name="B">Matrix B; must have as many columns as A</param>
/// <param name="ans">Answer, dimensions [A.Rows, B.Rows]. If null, gets created.</param>
public static void MatrTranspMatrProd(floatMatrix A, floatMatrix B, ref floatMatrix ans)
{
    if (A.Cols != B.Cols) throw new Exception("Incompatible dimensions");

    if (ans == null) ans = new floatMatrix(new float[A.Rows, B.Rows]);

    bool answerShapeOk = ans.Rows == A.Rows && ans.Cols == B.Rows;
    if (!answerShapeOk) throw new Exception("Invalid ans dimensions");

    if (CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL)
    {
        //Device path: one work-item per element of the answer
        CLCalc.Program.MemoryObject[] kernelArgs = new CLCalc.Program.MemoryObject[] { A.CLValues, B.CLValues, ans.CLValues, A.CLDim };
        kernelRegularMatrTranspMatrProd.Execute(kernelArgs, new int[] { A.Rows, B.Rows });
        return;
    }

    //Host path: dot product of row rowA of A with row rowB of B
    int inner = A.Cols;
    int ansStride = B.Rows;

    for (int rowA = 0; rowA < A.Rows; rowA++)
    {
        for (int rowB = 0; rowB < B.Rows; rowB++)
        {
            int startA = inner * rowA;
            int startB = inner * rowB;

            float dot = 0.0f;
            for (int k = 0; k < inner; k++)
            {
                dot += A.Values[k + startA] * B.Values[k + startB];
            }

            ans.Values[rowB + ansStride * rowA] = dot;
        }
    }
}
/// <summary>Computes the diagonal-matrix times matrix-transpose product alpha*D*transpose(u)</summary>
/// <param name="D">Diagonal matrix; D.Rows must equal u.Cols</param>
/// <param name="u">Matrix whose transpose is multiplied</param>
/// <param name="alpha">Multiplication constant</param>
/// <param name="ans">Answer, dimensions [u.Cols, u.Rows]. If null, gets created</param>
/// <returns>The same instance referenced by <paramref name="ans"/></returns>
public static floatMatrix DiagTranspMatProd(floatDiag D, floatMatrix u, float alpha, ref floatMatrix ans)
{
    if (ans != null)
    {
        bool answerShapeOk = ans.Rows == u.Cols && ans.Cols == u.Rows;
        if (!answerShapeOk) throw new Exception("ans length should match transpose(u)");
    }

    if (u.Cols != D.Rows) throw new Exception("u Cols should match D dimension");

    if (ans == null) ans = new floatMatrix(new float[u.Cols, u.Rows]);

    if (CLCalc.CLAcceleration != CLCalc.CLAccelerationType.UsingCL)
    {
        //Host path
        int srcStride = u.Cols;
        int dstStride = u.Rows;

        for (int srcRow = 0; srcRow < u.Rows; srcRow++)
        {
            for (int d = 0; d < D.Rows; d++)
            {
                ans.Values[srcRow + dstStride * d] = alpha * D.Values[d] * u.Values[d + srcStride * srcRow];
            }
        }
        return ans;
    }

    //Device path: alpha travels in u's coefficient buffer
    u.CLCoef.WriteToDevice(new float[] { alpha });

    CLCalc.Program.MemoryObject[] kernelArgs = new CLCalc.Program.MemoryObject[] { D.CLValues, u.CLValues, u.CLCoef, ans.CLValues };
    kernelDiagTranspMatProd.Execute(kernelArgs, new int[] { u.Cols, u.Rows });

    return ans;
}
/// <summary>Computes nonlinear least squares using user functions to evaluate residues and their gradients</summary>
/// <param name="f">Function that computes residues [m] and their gradients [grad r1; grad r2] m x n (each gradient in one line) [i,j] = gradR[i,j]</param>
/// <param name="x">Initial guess</param>
/// <param name="m">Number of residue equations</param>
/// <param name="maxiter">Maximum number of iterations</param>
/// <param name="err">Adjustment error. Its incoming value is used as the previous error of the first iteration; on return it holds the last value computed by NormAtb</param>
/// <returns>The last accepted estimate of x</returns>
public static float[] NonLinearLS(ComputeResidueGrad f, float[] x, int m, int maxiter, ref double err)
{
    int n = x.Length;

    //Stopping tolerance on err and slope fraction used by the line search
    float eps = 5e-5f * 0.5f;
    float alpha = 0.002f;

    float[,] A = new float[m, n];
    float[] r = new float[m];

    floatMatrix CLA = new floatMatrix(A);
    floatVector CLr = new floatVector(r);

    //Regularization term is left at zero
    floatVector CLlambda = new floatVector(new float[CLA.Cols]);

    //Unit measurement weights
    float[] unitWeights = new float[CLA.Rows];
    for (int w = 0; w < unitWeights.Length; w++) unitWeights[w] = 1;
    floatDiag CLW = new floatDiag(unitWeights);

    float[] v = new float[CLA.Cols];
    floatVector CLv = new floatVector(v);

    double previousErr = 0;

    for (int i = 0; i < maxiter; i++)
    {
        //Computes residues and gradient
        f(x, ref r, ref A, true);
        CLA.SetValues(A);
        CLr.CLValues.WriteToDevice(r);

        previousErr = err;
        err = NormAtb(A, r, m, n);

        //if (previousErr == err) it means algorithm is not converging at all
        if (err < eps || previousErr == err || double.IsNaN(err))
        {
            //Forces the loop condition to fail on the next check
            i = maxiter;
            continue;
        }

        //Search direction: solve (A'A) v = A'r and flip the sign
        floatSymPosDefMatrix AtA = null;
        AtA = BLAS.MatrTranspMatrProd(CLA, CLlambda, ref AtA);
        CLv = BLAS.MatrTraspVecMult(CLA, CLW, CLr, ref CLv);
        if (CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL)
        {
            CLv.CLValues.ReadFromDeviceTo(CLv.Values);
        }

        v = AtA.LinearSolve(CLv.Values);
        for (int k = 0; k < v.Length; k++) v[k] = -v[k];

        //Line search
        //||r||^2
        float normRSquared = 0;
        for (int k = 0; k < r.Length; k++) normRSquared += r[k] * r[k];

        //2*transpose(r)*A*v
        float transpRAv = 0;
        for (int p = 0; p < m; p++)
        {
            float val = 0;
            for (int q = 0; q < n; q++) val += A[p, q] * v[q];
            transpRAv += r[p] * val;
        }
        transpRAv *= 2.0f;

        //Step starts at 2 and is halved before each trial, so the first trial uses t = 1.
        //Keeps halving while sum(ri(x+tv)^2) > ||r||^2 + alpha*2*transpose(r)*A*v*t
        float t = 2.0f;
        float lhs;
        float rhs;
        float[] newX = (float[])x.Clone();
        do
        {
            t *= 0.5f;

            //Update x
            for (int k = 0; k < x.Length; k++) newX[k] = x[k] + v[k] * t;

            //Update r (gradient not requested)
            f(newX, ref r, ref A, false);

            lhs = 0;
            for (int k = 0; k < m; k++) lhs += r[k] * r[k];

            rhs = normRSquared + alpha * transpRAv * t;
        }
        while (lhs > rhs);

        x = newX;
    }

    return x;
}