/// <summary>Copies the contents of <paramref name="Src"/> into <paramref name="Dst"/></summary>
 /// <param name="Src">Matrix to read from</param>
 /// <param name="Dst">Matrix that gets overwritten. Its size is not checked against Src here</param>
 public static void CopyMatrix(floatMatrix Src, floatMatrix Dst)
 {
     bool onDevice = CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL;
     if (!onDevice)
     {
         //Host path: element-by-element copy of the backing arrays
         int idx = 0;
         while (idx < Src.Values.Length)
         {
             Dst.Values[idx] = Src.Values[idx];
             idx++;
         }
         return;
     }

     //Device path: copy kernel launched over the length of the source buffer
     CLCalc.Program.MemoryObject[] buffers = new CLCalc.Program.MemoryObject[] { Src.CLValues, Dst.CLValues };
     kernelCopyBuffer.Execute(buffers, Src.CLValues.OriginalVarLength);
 }
            /// <summary>Computes A*inv(H)*A' and stores result in ans. ans gets constructed if it is null</summary>
            /// <param name="A">A matrix, mxn</param>
            /// <param name="H">H matrix, nxn</param>
            /// <param name="ans">Answer, mxm. Only the packed lower triangle (q &lt;= p) is written</param>
            /// <param name="temp">Temporary matrix for the operation, size nxm. Handed to H.LinearSolve as its output</param>
            /// <param name="refine">Refine linear system solution?</param>
            /// <returns>The same instance as <paramref name="ans"/></returns>
            /// <exception cref="Exception">H is not nxn or ans is not mxm</exception>
            public static floatSymPosDefMatrix ComputeAinvHTranspA(floatMatrix A, floatSymPosDefMatrix H, ref floatSymPosDefMatrix ans, ref floatMatrix temp, bool refine)
            {
                int m = A.Rows;
                int n = A.Cols;
                if (ans == null) ans = new floatSymPosDefMatrix(m);

                if (H.getN != n) throw new Exception("Matrix sizes not compatible");
                if (ans.getN != m) throw new Exception("Answer size not compatible");

                //Solve with H using the rows of A as right-hand sides; the solution lands in temp
                H.LinearSolve(A, refine, ref temp);

                //Go on to multiplying A*temp
                if (CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL)
                {
                    //One work-item per entry of the packed triangle, m*(m+1)/2 in total
                    kernelComputeAinvHAt.Execute(new CLCalc.Program.MemoryObject[] { A.CLValues, A.CLDim, temp.CLValues, ans.CLValues }, (m * (m + 1)) >> 1);
                }
                else
                {
                    for (int p = 0; p < m; p++)
                    {
                        int np = n * p;
                        for (int q = 0; q <= p; q++)
                        {
                            int nq = n * q;

                            float val = 0;
                            for (int k = 0; k < n; k++)
                            {
                                val += A.Values[k + np] * temp.Values[k + nq];
                            }

                            //Packed lower-triangular storage: entry (p, q) lives at p*(p+1)/2 + q
                            ans.Values[((p * (1 + p)) >> 1) + q] = val;
                        }
                    }
                }

                //The entries of ans were just overwritten on either path, so a Cholesky
                //factorization computed earlier for ans no longer matches its contents
                ans.IsCholeskyFactorized = false;

                return ans;
            }
            /// <summary>Product of a symmetric positive definite matrix with a matrix transpose, Msym*At. ans is created when it is null</summary>
            /// <param name="M">Symmetric positive definite matrix</param>
            /// <param name="A">Matrix whose data is multiplied by M. Each run of A.Rows consecutive entries of A.Values is treated as one vector</param>
            /// <param name="ans">Result. Allocated as [A.Cols, M.getN] when null</param>
            /// <returns>The same instance as <paramref name="ans"/></returns>
            public static floatMatrix SymPosDefMatrMatrMultiply(floatSymPosDefMatrix M, floatMatrix A, ref floatMatrix ans)
            {
                if (ans == null)
                {
                    ans = new floatMatrix(new float[A.Cols, M.getN]);
                }

                bool onDevice = CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL;
                if (onDevice)
                {
                    //Upload the host values of M first when M reports its device copy as not updated
                    if (!M.IsMatrixInClMemoryUpdated)
                    {
                        M.CLValues.WriteToDevice(M.Values);
                    }

                    CLCalc.Program.MemoryObject[] kernelArgs = new CLCalc.Program.MemoryObject[] { M.CLValues, A.CLValues, ans.CLValues };
                    int[] workSize = new int[] { A.Rows, A.Cols };
                    kernelSymMatrMatrMultiply.Execute(kernelArgs, workSize);
                    return ans;
                }

                //Host path
                for (int col = 0; col < A.Cols; col++)
                {
                    for (int row = 0; row < M.getN; row++)
                    {
                        float acc = 0;
                        for (int inner = 0; inner < M.getN; inner++)
                        {
                            acc += M[row, inner] * A.Values[inner + col * A.Rows];
                        }
                        ans.Values[row + col * A.Rows] = acc;
                    }
                }

                return ans;
            }
            /// <summary>Sums the components of a matrix. With OpenCL it uses __local memory and coalesced access; otherwise it accumulates on the host in double precision</summary>
            /// <param name="CLv">Matrix whose components should be summed</param>
            /// <returns>Sum of all entries of CLv.Values</returns>
            public static float SumMatrixElements(floatMatrix CLv)
            {
                float resp = 0;
                if (CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL)
                {
                    /*
                     The idea here is to create a reduction in which the access pattern to the vectors is coalesced.
                     The first step is to reduce the number of non-summed items to a multiple of NWORKITEMS and then coalesce the access
                     */

                    int LOCALWORKSIZE = Math.Min(256, (int)CLCalc.Program.CommQueues[CLCalc.Program.DefaultCQ].Device.MaxWorkGroupSize);
                    int NWORKITEMS = 16 * LOCALWORKSIZE;

                    int n = CLv.Values.Length;
                    float[] resps = new float[NWORKITEMS];

                    //Scratch device buffers are created once per matrix and reused on later calls
                    if (CLv.CLresps == null)
                    {
                        CLv.CLresps = new CLCalc.Program.Variable(resps);
                        CLv.CLn = new CLCalc.Program.Variable(new int[1]);
                    }

                    CLv.CLn.WriteToDevice(new int[] { n });
                    CLCalc.Program.Variable[] args = new CLCalc.Program.Variable[] { CLv.CLValues, CLv.CLresps, CLv.CLn };

                    //Write n = k*NWORKITEMS + p. Preprocess to eliminate p`s and leave summation only to a multiple of NWORKITEMS
                    int k = n / NWORKITEMS;
                    int p = n - k * NWORKITEMS;

                    //Clears partial responses
                    kernelClear.Execute(args, NWORKITEMS);

                    //Sums the p last elements into the p first elements.
                    //Skipped when n is an exact multiple of NWORKITEMS (p == 0): there is nothing to fold in,
                    //and a launch with a global work size of zero is an error in OpenCL versions before 2.1
                    if (p > 0)
                    {
                        kernelPreSum.Execute(args, p);
                    }

                    //Use CLn to inform each work-item its workload. Each one will access and sum k numbers
                    CLv.CLn.WriteToDevice(new int[] { k });

                    kernelCoalLocalSum.Execute(args, new int[] { NWORKITEMS }, new int[] { LOCALWORKSIZE });

                    CLv.CLresps.ReadFromDeviceTo(resps);

                    //Serial part: adds the first NWORKITEMS / LOCALWORKSIZE entries of resps
                    //(presumably one partial sum per work-group; confirm against the kernel source)
                    int imax = NWORKITEMS / LOCALWORKSIZE;
                    for (int i = 0; i < imax; i++) resp += resps[i];

                }
                else
                {
                    //Accumulate in double to limit rounding error, then narrow once
                    double sum = 0;
                    for (int i = 0; i < CLv.Values.Length; i++) sum += CLv.Values[i];
                    resp = (float)sum;
                }

                return resp;
            }
            /// <summary>Computes M*(alpha*v) + beta*u. ans is created when it is null</summary>
            /// <param name="M">Matrix</param>
            /// <param name="v">Vector multiplied by M; length must equal M.Cols</param>
            /// <param name="alpha">Scale applied to the M*v term</param>
            /// <param name="u">Vector added to the product; length must equal M.Rows</param>
            /// <param name="beta">Scale applied to u</param>
            /// <param name="ans">Result of length M.Rows</param>
            /// <returns>The same instance as <paramref name="ans"/></returns>
            /// <exception cref="Exception">A length does not match the dimensions of M</exception>
            public static floatVector MatrVecProdSumVec(floatMatrix M, floatVector v, float alpha, floatVector u, float beta, ref floatVector ans)
            {
                //All dimension checks run before ans is allocated
                bool ansMismatch = ans != null && ans.Length != M.Rows;
                if (ansMismatch)
                {
                    throw new Exception("ans length should match M rows");
                }
                if (v.Length != M.Cols)
                {
                    throw new Exception("v length should match M cols");
                }
                if (u.Length != M.Rows)
                {
                    throw new Exception("u length should match M rows");
                }

                if (ans == null)
                {
                    ans = new floatVector(new float[M.Rows]);
                }

                bool onDevice = CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL;
                if (!onDevice)
                {
                    //Host path
                    for (int row = 0; row < M.Rows; row++)
                    {
                        float acc = 0;
                        for (int col = 0; col < M.Cols; col++)
                        {
                            acc += M[row, col] * v.Values[col] * alpha;
                        }
                        ans.Values[row] = acc + beta * u.Values[row];
                    }

                    return ans;
                }

                //Device path: the scalars travel through the coefficient buffers of v and u
                v.CLCoef.WriteToDevice(new float[] { alpha });
                u.CLCoef.WriteToDevice(new float[] { beta });
                CLCalc.Program.MemoryObject[] kernelArgs = new CLCalc.Program.MemoryObject[] { M.CLValues, M.CLDim, v.CLValues, v.CLCoef, u.CLValues, u.CLCoef, ans.CLValues };
                kernelMatrVecProdSumVec.Execute(kernelArgs, M.Rows);

                return ans;
            }
            /// <summary>Computes transpose(A)*diag(W)*b*alpha</summary>
            /// <param name="A">Original matrix</param>
            /// <param name="W">Measurement weight vector; W.Rows must equal A.Rows</param>
            /// <param name="b">Vector to multiply; length must equal A.Rows</param>
            /// <param name="alpha">Multiplication constant</param>
            /// <param name="ans">Answer of length A.Cols. If null, gets created</param>
            /// <returns>The same instance as <paramref name="ans"/></returns>
            /// <exception cref="Exception">Dimensions of A, W, b or ans do not agree</exception>
            public static floatVector MatrTraspVecMult(floatMatrix A, floatDiag W, floatVector b, float alpha, ref floatVector ans)
            {
                int rowCount = A.Rows;
                int colCount = A.Cols;
                if (ans == null)
                {
                    ans = new floatVector(new float[A.Cols]);
                }

                if (A.Rows != W.Rows)
                {
                    throw new Exception("Incompatible A and W dimensions");
                }
                if (A.Rows != b.Length)
                {
                    throw new Exception("Incompatible A and b dimensions");
                }
                if (A.Cols != ans.Length)
                {
                    throw new Exception("Incompatible A and ans dimensions");
                }

                bool onDevice = CLCalc.CLAccelerationType.UsingCL == CLCalc.CLAcceleration;
                if (!onDevice)
                {
                    //Host path: the accumulator is double, each product term is evaluated in float
                    for (int col = 0; col < colCount; col++)
                    {
                        double acc = 0;
                        for (int row = 0; row < rowCount; row++)
                        {
                            acc += A[row, col] * b.Values[row] * W.Values[row] * alpha;
                        }
                        ans.Values[col] = (float)acc;
                    }

                    return ans;
                }

                //Device path: alpha travels through the coefficient buffer of b
                b.CLCoef.WriteToDevice(new float[] {alpha});
                CLCalc.Program.MemoryObject[] kernelArgs = new CLCalc.Program.MemoryObject[] { A.CLValues, A.CLDim, b.CLValues, b.CLCoef, W.CLValues, ans.CLValues };
                kernelTranspMatrVecProdW.Execute(kernelArgs, A.Cols);

                return ans;
            }
 /// <summary>Computes transpose(A)*diag(W)*b, i.e. the scaled overload with a multiplication constant of 1</summary>
 /// <param name="A">Original matrix</param>
 /// <param name="W">Measurement weight vector</param>
 /// <param name="b">Vector to multiply</param>
 /// <param name="ans">Answer. If null, gets created</param>
 /// <returns>The same instance as <paramref name="ans"/></returns>
 public static floatVector MatrTraspVecMult(floatMatrix A, floatDiag W, floatVector b, ref floatVector ans)
 {
     const float unitScale = 1.0f;
     floatVector product = MatrTraspVecMult(A, W, b, unitScale, ref ans);
     return product;
 }
            /// <summary>Solves system Ax = b and returns x, where b is a right hand side matrix</summary>
            /// <param name="b">Right hand side matrix</param>
            /// <param name="refine">Refine solution? Recommended: true</param>
            /// <returns>Solution, copied element by element out of the matrix produced by the floatMatrix overload of LinearSolve</returns>
            public float[,] LinearSolve(float[,] b, bool refine)
            {
                floatMatrix CLbb = new floatMatrix(b);
                floatMatrix resp = null;

                //Forward the caller's choice. 'true' used to be hard-coded here, so refine was silently ignored
                LinearSolve(CLbb, refine, ref resp);

                //With OpenCL the latest values live on the device; bring them back before reading resp
                if (CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL)
                {
                    resp.CLValues.ReadFromDeviceTo(resp.Values);
                }

                float[,] vResp = new float[resp.Rows, resp.Cols];
                for (int i = 0; i < resp.Rows; i++)
                {
                    for (int j = 0; j < resp.Cols; j++)
                    {
                        vResp[i, j] = resp[i, j];
                    }
                }

                return vResp;
            }
            /// <summary>Computes transpose(A)*diag(W)*A using OpenCL. Lambda is the regularization term handed to the kernel</summary>
            /// <param name="A">Original matrix</param>
            /// <param name="W">Measurement weight vector. May be null, in which case unit weights are used</param>
            /// <param name="lambda">Regularization term</param>
            /// <param name="AtA">Answer in packed storage of length Cols*(Cols+1)/2. (Re)allocated when null or of a different length</param>
            /// <returns>The same instance as <paramref name="AtA"/></returns>
            private static floatSymPosDefMatrix AuxLSAtACL(floatMatrix A, floatDiag W, floatVector lambda, ref floatSymPosDefMatrix AtA)
            {
                int packedLength = (A.Cols * (A.Cols + 1)) >> 1;
                if (AtA == null || AtA.CLValues.OriginalVarLength != packedLength)
                {
                    AtA = new floatSymPosDefMatrix(new float[packedLength]);
                }

                //The unweighted MatrTranspMatrProd overload passes W == null. The kernel always receives a
                //weight buffer, so substitute unit weights instead of dereferencing null
                if (W == null)
                {
                    float[] unitWeights = new float[A.Rows];
                    for (int i = 0; i < unitWeights.Length; i++) unitWeights[i] = 1;
                    W = new floatDiag(unitWeights);
                }

                CLCalc.Program.Variable[] args = new CLCalc.Program.Variable[] { A.CLValues, A.CLDim, W.CLValues, AtA.CLValues, lambda.CLValues };
                kernelComputeAtWA.Execute(args, AtA.CLValues.OriginalVarLength);

                //Just modified values in CL memory, matrix is no longer Cholesky factorized
                AtA.IsCholeskyFactorized = false;

                return AtA;
            }
 /// <summary>Computes transpose(A)*A by delegating to the weighted overload with no weight vector</summary>
 /// <param name="A">Original matrix</param>
 /// <param name="lambda">Regularization term</param>
 /// <param name="AtA">Answer, A transpose times A</param>
 /// <returns>Whatever the weighted overload returns</returns>
 public static floatSymPosDefMatrix MatrTranspMatrProd(floatMatrix A, floatVector lambda, ref floatSymPosDefMatrix AtA)
 {
     //A null weight vector selects the unweighted computation
     floatDiag noWeights = null;
     floatSymPosDefMatrix product = MatrTranspMatrProd(A, noWeights, lambda, ref AtA);
     return product;
 }
            /// <summary>Computes the linear combination alpha*u+beta*v and puts the answer in ans. ans is created when it is null</summary>
            /// <param name="alpha">Scale applied to u</param>
            /// <param name="u">First matrix</param>
            /// <param name="beta">Scale applied to v</param>
            /// <param name="v">Second matrix, same dimensions as u</param>
            /// <param name="ans">Result, same dimensions as u</param>
            /// <returns>The same instance as <paramref name="ans"/></returns>
            /// <exception cref="Exception">u, v and ans do not share the same dimensions</exception>
            public static floatMatrix LinearCombination(float alpha, floatMatrix u, float beta, floatMatrix v, ref floatMatrix ans)
            {
                if (ans == null)
                {
                    ans = new floatMatrix(new float[u.Rows, u.Cols]);
                }

                bool operandsAgree = u.Rows == v.Rows && u.Cols == v.Cols;
                if (!operandsAgree)
                {
                    throw new Exception("Incompatible dimensions");
                }

                bool answerAgrees = ans.Rows == u.Rows && ans.Cols == u.Cols;
                if (!answerAgrees)
                {
                    throw new Exception("Ans dimension should be equal to vectors dimension");
                }

                bool onDevice = CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL;
                if (!onDevice)
                {
                    //Host path: combine the backing arrays entry by entry
                    int idx = 0;
                    while (idx < u.Values.Length)
                    {
                        ans.Values[idx] = alpha * u.Values[idx] + beta * v.Values[idx];
                        idx++;
                    }

                    return ans;
                }

                //Device path: the scalars travel through the coefficient buffers of u and v
                u.CLCoef.WriteToDevice(new float[] { alpha });
                v.CLCoef.WriteToDevice(new float[] { beta });
                CLCalc.Program.MemoryObject[] kernelArgs = new CLCalc.Program.MemoryObject[] { u.CLCoef, v.CLCoef, u.CLValues, v.CLValues, ans.CLValues };
                kernelLinearComb.Execute(kernelArgs, u.Values.Length);

                return ans;
            }
            /// <summary>Solves system Ax = b' on the host for every row of bb, using the packed factor stored in cholDec</summary>
            /// <param name="bb">b Matrix. Each row (N consecutive values) is one right hand side. It is not modified; a copy is worked on</param>
            /// <param name="resp">Answer. Created as [bb.Cols, bb.Rows] when null</param>
            private void linsolveMatrix(floatMatrix bb, ref floatMatrix resp)
            {
                //Work on a copy: forward substitution overwrites the right hand side
                float[] b = (float[])bb.Values.Clone();
                float[] y = new float[bb.Values.Length];

                if (resp == null)
                {
                    resp = new floatMatrix(new float[bb.Cols, bb.Rows]);
                }

                for (int rhs = 0; rhs < bb.Rows; rhs++)
                {
                    //Start of this right hand side inside the flat arrays
                    int shift = rhs * N;

                    //Forward substitution
                    for (int row = 0; row < N; row++)
                    {
                        //Packed lower triangle: entry (r, c) lives at r*(r+1)/2 + c
                        int pivot = ((row * (row + 1)) >> 1) + row;
                        y[row + shift] = b[row + shift] / cholDec[pivot];

                        for (int below = row + 1; below < N; below++)
                        {
                            b[below + shift] -= cholDec[((below * (below + 1)) >> 1) + row] * y[row + shift];
                        }
                    }

                    //Backward substitution, walking the same factor as its transpose
                    for (int row = N - 1; row >= 0; row--)
                    {
                        int rowStart = (row * (row + 1)) >> 1;
                        resp.Values[row + shift] = y[row + shift] / cholDec[rowStart + row];

                        for (int above = 0; above < row; above++)
                        {
                            y[above + shift] -= cholDec[rowStart + above] * resp.Values[row + shift];
                        }
                    }
                }
            }
            /// <summary>Backsubstitutes to solve a linear system with a matrix right hand side, using the factor held in CLcholDec.
            /// Falls back to linsolveMatrix when OpenCL is not in use.</summary>
            /// <param name="M">Right hand sides; M.Rows is used as the number of systems to solve</param>
            /// <param name="resp">Receives the answer. On the OpenCL path it is dereferenced without a null check</param>
            private void LinsolveCLMatrix(floatMatrix M, ref floatMatrix resp)
            {
                //System.Diagnostics.Stopwatch sw = new System.Diagnostics.Stopwatch();
                //System.Diagnostics.Stopwatch sw1 = new System.Diagnostics.Stopwatch();
                //sw.Start();

                //number of RHS as multiple of SUBMATRIXSIZE
                int nRHSMult = M.Rows / SUBMATRIXSIZE;
                //right hand sides left over after the full groups of SUBMATRIXSIZE
                int nRHSleftOver = M.Rows - SUBMATRIXSIZE*nRHSMult;

                //Host fallback: same solve, done by linsolveMatrix
                if (CLCalc.CLAcceleration != CLCalc.CLAccelerationType.UsingCL)
                {
                    linsolveMatrix(M, ref resp);
                    return;
                }

                //Copy elements to CLb
                //Work buffers are (re)created only when missing or too small for M
                if (CLb == null || CLb.OriginalVarLength < M.Values.Length)
                {
                    CLb = new CLCalc.Program.Variable(M.Values);
                    CLy = new CLCalc.Program.Variable(M.Values);
                }

                //Device-side copy of the right hand sides into the work buffer
                kernelCopyBuffer.Execute(new CLCalc.Program.MemoryObject[] { M.CLValues, CLb }, M.Values.Length);
                int nEqs = M.Rows;

                //Forward pass argument order: CLy, then CLb
                CLCalc.Program.Variable[] args = new CLCalc.Program.Variable[] { CLcholDec, CLy, CLb, CLoffSet, CLn };
                int[] offset = new int[1];

                //DEBUG
                //float[] yDebug = new float[M.Values.Length];
                //float[] bDebug = new float[M.Values.Length];
                //this.CLcholDec.ReadFromDeviceTo(cholDec);

                //Forward substitution
                //Processed in diagonal blocks of at most SUBMATRIXSIZE rows, top to bottom
                int i;
                for (i = 0; i < N; i += SUBMATRIXSIZE)
                {
                    //Tell the kernels which row the current block starts at
                    offset[0] = i;
                    CLoffSet.WriteToDevice(offset);

                    //The last block is smaller when N is not a multiple of SUBMATRIXSIZE
                    int size = Math.Min(SUBMATRIXSIZE, N - i);
                    kernelFwdUpperBackSubs.Execute(args, new int[] { size, nEqs }, new int[] { size, 1 });

                    ////DEBUG
                    //CLy.ReadFromDeviceTo(yDebug);
                    //CLb.ReadFromDeviceTo(bDebug);

                    //sw1.Start();
                    //propagation
                    //Only needed while rows remain below the current block
                    if (i + SUBMATRIXSIZE < N)
                    {
                        if (nRHSMult > 0) kernelFwdPropag.Execute(args, new int[] { N - i - SUBMATRIXSIZE, nRHSMult * SUBMATRIXSIZE }, new int[] { 1, SUBMATRIXSIZE });
                        //leftover right hand sides; the extra array is presumably a global work offset that skips the ones already handled - TODO confirm
                        if (nRHSleftOver > 0)
                            kernelFwdPropag2.Execute(args, new int[] { N - i - SUBMATRIXSIZE, nRHSleftOver }, new int[] { 1, nRHSleftOver }, new int[] { 0, nRHSMult * SUBMATRIXSIZE });
                    }
                    //OpenCLTemplate.CLCalc.Program.CommQueues[OpenCLTemplate.CLCalc.Program.DefaultCQ].Finish();
                    //sw1.Stop();

                    ////DEBUG
                    //CLy.ReadFromDeviceTo(yDebug);
                    //CLb.ReadFromDeviceTo(bDebug);
                }

                //Backward subst. Stores answer in CLb
                //CLb and CLy trade places in the argument list relative to the forward pass
                args = new CLCalc.Program.Variable[] { CLcholDec, CLb, CLy, CLoffSet, CLn };
                //Backward substitution
                //Full blocks of SUBMATRIXSIZE rows, starting with the block that ends at row N - 1 and moving up
                for (i = N - SUBMATRIXSIZE; i >= 0; i -= SUBMATRIXSIZE)
                {
                    offset[0] = i;
                    CLoffSet.WriteToDevice(offset);

                    int size = SUBMATRIXSIZE;
                    kernelBkLowerBackSubs.Execute(args, new int[] { size, nEqs }, new int[] { size, 1 });

                    //Only needed while rows remain above the current block
                    if (i > 0)
                    {
                        //Propagation using __local storage
                        if (nRHSMult > 0) kernelBackPropag.Execute(args, new int[] { i, nRHSMult * SUBMATRIXSIZE }, new int[] { 1, SUBMATRIXSIZE });

                        //leftovers (not multiples of SUBMATRIXSIZE)
                        if (nRHSleftOver > 0)
                            kernelBackPropag2.Execute(args, new int[] { i, nRHSleftOver }, new int[] { 1, nRHSleftOver }, new int[] { 0, nRHSMult * SUBMATRIXSIZE });

                    }

                }
                //The loop exits with i < 0, and SUBMATRIXSIZE + i equals N modulo SUBMATRIXSIZE:
                //the number of leading rows not covered by a full block. They are solved here as one smaller block at offset 0
                if (SUBMATRIXSIZE + i > 0)
                {
                    offset[0] = 0; CLoffSet.WriteToDevice(offset);
                    kernelBkLowerBackSubs.Execute(args, new int[] { SUBMATRIXSIZE + i, nEqs }, new int[] { SUBMATRIXSIZE + i, 1 });
                }

                //NOTE(review): resp must already exist here; unlike linsolveMatrix, this path does not create it when null
                kernelCopyBuffer.Execute(new CLCalc.Program.Variable[] { CLb, resp.CLValues }, resp.Values.Length);

                //OpenCLTemplate.CLCalc.Program.CommQueues[OpenCLTemplate.CLCalc.Program.DefaultCQ].Finish();
                //sw.Stop();
            }
            /// <summary>Solves the system this*invHAt = transpose(M), taking every row of M as one right hand side. Refine may considerably slow the method.</summary>
            /// <param name="M">Right-hand-side of linear system; M.Cols must equal the dimension of this matrix</param>
            /// <param name="refine">Refine solution? Recommended: true</param>
            /// <param name="invHAt">Answer, dimensions [N, M.Rows]. Created when null</param>
            /// <returns>The same instance as <paramref name="invHAt"/></returns>
            /// <exception cref="Exception">M or invHAt do not match the dimension of this matrix</exception>
            public floatMatrix LinearSolve(floatMatrix M, bool refine, ref floatMatrix invHAt)
            {
                //Refinement stops after this many passes, or earlier once the residue is small enough
                const int MaxRefinementPasses = 8;
                const double ResidueTolerance = 1E-5;

                if (invHAt == null) invHAt = new floatMatrix(new float[this.N, M.Rows]);
                if (this.N != M.Cols) throw new Exception("Dimensions not compatible");
                if (invHAt.Rows != this.N || invHAt.Cols != M.Rows) throw new Exception("Invalid matrix dimensions for invHAt");

                if (!this.IsCholeskyFactorized) ComputeCholesky();

                bool usingCL = CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL;

                //The first solve always runs on the host. With OpenCL, the operands are read back
                //from the device first and the solution is uploaded afterwards
                if (usingCL)
                {
                    M.CLValues.ReadFromDeviceTo(M.Values);
                    this.CLcholDec.ReadFromDeviceTo(this.cholDec);
                }
                linsolveMatrix(M, ref invHAt);
                if (usingCL) invHAt.CLValues.WriteToDevice(invHAt.Values);

                //TO DO: OpenCL fwd/bksubs for the first solve, i.e. LinsolveCLMatrix(M, ref invHAt)

                if (!refine) return invHAt;

                //Work matrices are cached between calls and rebuilt only when the problem size changes
                if (matResidues == null || matResidues.Values.Length != M.Values.Length)
                {
                    matResidues = new floatMatrix(new float[M.Rows, M.Cols]);
                    matMx = new floatMatrix(new float[M.Rows, M.Cols]);
                    matDeltax = new floatMatrix(new float[M.Rows, M.Cols]);
                    matResiduesAbs = new floatMatrix(new float[M.Rows, M.Cols]);
                }

                for (int iter = 0; iter < MaxRefinementPasses; iter++)
                {
                    //Residue of the current solution: this*invHAt - M
                    BLAS.SymPosDefMatrMatrMultiply(this, invHAt, ref matMx);
                    BLAS.LinearCombination(1, matMx, -1, M, ref matResidues);

                    if (usingCL)
                    {
                        kernelElemWiseAbs.Execute(new CLCalc.Program.MemoryObject[] { matResidues.CLValues, matResiduesAbs.CLValues }, M.Values.Length);
                    }
                    else
                    {
                        for (int i = 0; i < M.Values.Length; i++) matResiduesAbs.Values[i] = Math.Abs(matResidues.Values[i]);
                    }

                    //Sum of absolute residues divided by N
                    double totalRes = matResiduesAbs.Sum() / (double)N;

                    //Stop BEFORE applying a correction when the solution has converged or the residue is NaN.
                    //The previous code had an unbraced 'if (...) iter = 8;' followed by an unconditional block,
                    //so one more correction was always applied, including one computed from NaN residues
                    if (double.IsNaN(totalRes) || totalRes < ResidueTolerance) break;

                    //Solve for the correction and subtract it from the current solution
                    LinsolveCLMatrix(matResidues, ref matDeltax);

                    if (usingCL)
                    {
                        kernelInPlaceSubtract.Execute(new CLCalc.Program.MemoryObject[] { invHAt.CLValues, matDeltax.CLValues }, M.Values.Length);
                    }
                    else
                    {
                        for (int i = 0; i < M.Values.Length; i++) invHAt.Values[i] -= matDeltax.Values[i];
                    }
                }

                return invHAt;
            }
            /// <summary>Computes transpose(A)*diag(W)*A on the host and adds lambda to the diagonal. With a null W the unweighted product transpose(A)*A is computed</summary>
            /// <param name="A">Original matrix</param>
            /// <param name="W">Measurement weight vector, or null for no weighting</param>
            /// <param name="lambda">Regularization term, added to the diagonal</param>
            /// <param name="AtA">Answer, A transpose times A, in packed lower-triangular storage. (Re)allocated when null or of a different dimension</param>
            /// <returns>The same instance as <paramref name="AtA"/></returns>
            private static floatSymPosDefMatrix AuxLeastSquaresAtAnoCL(floatMatrix A, floatDiag W, floatVector lambda, ref floatSymPosDefMatrix AtA)
            {
                //A (mxn), AtA (nxn) positive semidef symmetric
                int m = A.Rows;
                int n = A.Cols;

                //Like the OpenCL variant, replace an answer matrix of the wrong size instead of indexing past
                //its end or leaving unrelated trailing entries in it
                if (AtA == null || AtA.getN != n) AtA = new floatSymPosDefMatrix(new float[(n * (n + 1)) >> 1]);

                if (W != null)
                {
                    for (int i = 0; i < n; i++)
                    {
                        for (int j = 0; j <= i; j++)
                        {
                            //Accumulate in double; each product term is evaluated in float
                            double val = 0;
                            for (int k = 0; k < m; k++)
                            {
                                val += A[k, i] * A[k, j] * W.Values[k];
                            }

                            //Packed lower-triangular storage: entry (i, j) lives at i*(i+1)/2 + j
                            AtA.Values[((i * (i + 1)) >> 1) + j] = (float)val;
                        }
                    }
                }
                else
                {
                    for (int i = 0; i < n; i++)
                    {
                        for (int j = 0; j <= i; j++)
                        {
                            double val = 0;
                            for (int k = 0; k < m; k++)
                            {
                                val += A[k, i] * A[k, j];
                            }
                            AtA.Values[((i * (i + 1)) >> 1) + j] = (float)val;
                        }
                    }
                }

                //regularization term
                for (int i = 0; i < n; i++)
                {
                    AtA.Values[((i * (i + 1)) >> 1) + i] += lambda.Values[i];
                }

                //Values were just overwritten, so a factorization computed earlier for a reused AtA is stale.
                //The OpenCL variant resets the flag in the same way
                AtA.IsCholeskyFactorized = false;

                return AtA;
            }
 /// <summary>Computes transpose(A)*A using weights W, choosing the OpenCL or the host implementation from the current acceleration mode</summary>
 /// <param name="A">Original matrix</param>
 /// <param name="W">Measurement weight vector</param>
 /// <param name="lambda">Regularization term</param>
 /// <param name="AtA">Answer, A transpose times A</param>
 /// <returns>Result of the selected implementation</returns>
 public static floatSymPosDefMatrix MatrTranspMatrProd(floatMatrix A, floatDiag W, floatVector lambda, ref floatSymPosDefMatrix AtA)
 {
     bool onDevice = CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL;
     if (!onDevice)
     {
         return AuxLeastSquaresAtAnoCL(A, W, lambda, ref AtA);
     }

     return AuxLSAtACL(A, W, lambda, ref AtA);
 }
            /// <summary>Computes A*B' = A*transpose(B) and stores result in ans</summary>
            /// <param name="A">Matrix A</param>
            /// <param name="B">Matrix B; must have as many columns as A</param>
            /// <param name="ans">Answer, dimensions [A.Rows, B.Rows]. If null, gets created.</param>
            /// <exception cref="Exception">A and B differ in column count, or ans has the wrong dimensions</exception>
            public static void MatrTranspMatrProd(floatMatrix A, floatMatrix B, ref floatMatrix ans)
            {
                if (A.Cols != B.Cols)
                {
                    throw new Exception("Incompatible dimensions");
                }
                if (ans == null)
                {
                    ans = new floatMatrix(new float[A.Rows, B.Rows]);
                }

                bool answerFits = ans.Rows == A.Rows && ans.Cols == B.Rows;
                if (!answerFits)
                {
                    throw new Exception("Invalid ans dimensions");
                }

                bool onDevice = CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL;
                if (onDevice)
                {
                    CLCalc.Program.MemoryObject[] kernelArgs = new CLCalc.Program.MemoryObject[] { A.CLValues, B.CLValues, ans.CLValues, A.CLDim };
                    int[] workSize = new int[] { A.Rows, B.Rows };
                    kernelRegularMatrTranspMatrProd.Execute(kernelArgs, workSize);
                    return;
                }

                //Host path: entry (rowA, rowB) is the dot product of row rowA of A with row rowB of B
                int shared = A.Cols;
                int outCols = B.Rows;
                for (int rowA = 0; rowA < A.Rows; rowA++)
                {
                    int startA = shared * rowA;
                    for (int rowB = 0; rowB < B.Rows; rowB++)
                    {
                        int startB = shared * rowB;

                        float dot = 0.0f;
                        for (int idx = 0; idx < shared; idx++)
                        {
                            dot += A.Values[idx + startA] * B.Values[idx + startB];
                        }

                        ans.Values[rowB + outCols * rowA] = dot;
                    }
                }
            }
            /// <summary>Computes the diagonal-matrix times matrix-transpose product alpha*D*transpose(u)</summary>
            /// <param name="D">Diagonal matrix; D.Rows must equal u.Cols</param>
            /// <param name="u">Matrix whose transpose is scaled</param>
            /// <param name="alpha">Multiplication constant</param>
            /// <param name="ans">Answer, dimensions [u.Cols, u.Rows]. Created when null</param>
            /// <returns>The same instance as <paramref name="ans"/></returns>
            /// <exception cref="Exception">ans or D do not match the dimensions of u</exception>
            public static floatMatrix DiagTranspMatProd(floatDiag D, floatMatrix u, float alpha, ref floatMatrix ans)
            {
                //Both checks run before ans is allocated
                bool answerMismatch = ans != null && (ans.Rows != u.Cols || ans.Cols != u.Rows);
                if (answerMismatch)
                {
                    throw new Exception("ans length should match transpose(u)");
                }
                if (u.Cols != D.Rows)
                {
                    throw new Exception("u Cols should match D dimension");
                }

                if (ans == null)
                {
                    ans = new floatMatrix(new float[u.Cols, u.Rows]);
                }

                bool onDevice = CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL;
                if (!onDevice)
                {
                    //Host path: entry (d, r) of ans is alpha * D[d] * u(r, d)
                    int srcStride = u.Cols;
                    int dstStride = u.Rows;
                    for (int r = 0; r < u.Rows; r++)
                    {
                        for (int d = 0; d < D.Rows; d++)
                        {
                            ans.Values[r + dstStride * d] = alpha * D.Values[d] * u.Values[d + srcStride * r];
                        }
                    }

                    return ans;
                }

                //Device path: alpha travels through the coefficient buffer of u
                u.CLCoef.WriteToDevice(new float[] { alpha });
                CLCalc.Program.MemoryObject[] kernelArgs = new CLCalc.Program.MemoryObject[] { D.CLValues, u.CLValues, u.CLCoef, ans.CLValues };
                int[] workSize = new int[] { u.Cols, u.Rows };
                kernelDiagTranspMatProd.Execute(kernelArgs, workSize);

                return ans;
            }
            /// <summary>Computes nonlinear least squares using user functions to evaluate residues and their gradients</summary>
            /// <param name="f">Function that computes residues [m] and their gradients [grad r1; grad r2] m x n (each gradient in one line) [i,j] = gradR[i,j]</param>
            /// <param name="x">Initial guess. The array passed in is not modified; the result is returned</param>
            /// <param name="m">Number of residue equations</param>
            /// <param name="maxiter">Maximum number of iterations</param>
            /// <param name="err">Adjustment error. Its incoming value is compared with the first computed error; on return it holds the last value from NormAtb</param>
            /// <returns>Last accepted estimate of x</returns>
            public static float[] NonLinearLS(ComputeResidueGrad f, float[] x, int m, int maxiter, ref double err)
            {
                int n = x.Length;
                float eps = 5e-5f * 0.5f;

                //Sufficient-decrease factor of the backtracking line search
                float alpha = 0.002f;

                float[,] A = new float[m, n];
                float[] r = new float[m];

                floatMatrix CLA = new floatMatrix(A);
                floatVector CLr = new floatVector(r);

                //Regularization is left at zero
                floatVector CLlambda = new floatVector(new float[CLA.Cols]);

                //Unit weights: every residue counts the same
                float[] ww = new float[CLA.Rows];
                for (int i = 0; i < ww.Length; i++) ww[i] = 1;
                floatDiag CLW = new floatDiag(ww);

                float[] v = new float[CLA.Cols];
                floatVector CLv = new floatVector(v);

                double errAnt = 0;

                for (int i = 0; i < maxiter; i++)
                {
                    //Computes residues and gradient
                    f(x, ref r, ref A, true);
                    CLA.SetValues(A);
                    CLr.CLValues.WriteToDevice(r);

                    errAnt = err;
                    err = NormAtb(A, r, m, n);

                    //if (errAnt == err) it means algorithm is not converging at all
                    if (err < eps || errAnt == err || double.IsNaN(err)) i = maxiter;
                    else
                    {
                        //Normal equations matrix. The explicit unit weights are passed on purpose: the unweighted
                        //overload forwards a null weight vector, which the OpenCL implementation dereferences.
                        //With all weights equal to 1 the result is the same
                        floatSymPosDefMatrix AtA = null;
                        AtA = BLAS.MatrTranspMatrProd(CLA, CLW, CLlambda, ref AtA);

                        //Right hand side: transpose(A)*W*r
                        CLv = BLAS.MatrTraspVecMult(CLA, CLW, CLr, ref CLv);

                        if (CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL) CLv.CLValues.ReadFromDeviceTo(CLv.Values);
                        v = AtA.LinearSolve(CLv.Values);

                        //Step direction is the negated solution
                        for (int k = 0; k < v.Length; k++) v[k] = -v[k];

                        //Line search

                        //||r||²
                        float normRSquared = 0;
                        for (int k = 0; k < r.Length; k++) normRSquared += r[k] * r[k];

                        //2transpose(r)Av
                        float transpRAv = 0;
                        for (int p = 0; p < m; p++)
                        {
                            float val = 0;
                            for (int q = 0; q < n; q++) val += A[p, q] * v[q];
                            transpRAv += r[p] * val;
                        }
                        transpRAv *= 2.0f;

                        //t is halved before its first use, so the first trial step has t = 1
                        float t = 2.0f;
                        //iterates while sum(ri*(x+tv)^2)>||r||²+alpha*2*transpose(r)*A*v*t
                        //lhs > rhs initially, so at least one trial step is always evaluated
                        float lhs = 1;
                        float rhs = 0;

                        float[] newX = (float[])x.Clone();

                        while (lhs > rhs)
                        {
                            t *= 0.5f;

                            //Update x
                            for (int k = 0; k < x.Length; k++) newX[k] = x[k] + v[k] * t;

                            //Update r
                            f(newX, ref r, ref A, false);

                            lhs = 0;
                            for (int k = 0; k < m; k++) lhs += r[k] * r[k];
                            rhs = normRSquared + alpha * transpRAv * t;
                        }

                        x = newX;
                    }
                }

                return x;
            }