        /// <summary>Computes dot product of 2 vectors using their OpenCL images. Assumes data has been inserted to VectorData and WriteToDevice() has been called</summary>
        /// <param name="v1">First vector</param>
        /// <param name="v2">Second vector</param>
        /// <exception cref="Exception">Thrown when v1 and v2 have different lengths</exception>
        public float DotProduct(CLImgVector v1, CLImgVector v2)
            // NOTE(review): no braces, no else keyword, no declaration of dprod and no return
            // statement are visible in this listing, so the nesting of the statements below can
            // only be guessed from indentation - confirm against the complete file.
            if (v1.Length != v2.Length)
                throw new Exception("Incompatible lengths");

            // CLDotProd is invoked when OpenCL acceleration is in use.
            if (CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL)
                CLDotProd(v1, v2);

                // NOTE(review): resp is newly allocated and nothing visible here writes to it
                // before it is read - presumably a device read-back of dotProdSum belongs
                // between these two statements; confirm.
                float[] resp = new float[1];

                dprod[0] = resp[0];
                // Host-side accumulation of v1[i] * v2[i] over VectorData.
                // NOTE(review): presumably the fallback taken when OpenCL is not in use - confirm.
                for (int i = 0; i < v1.Length; i++)
                    dprod[0] += v1.VectorData[i] * v2.VectorData[i];

                // NOTE(review): self-assignment; has no effect as written.
                dprod[0] = dprod[0];

        /// <summary>Computes dot product of two vectors and stores result in
        /// dotProdSum</summary>
        /// <param name="v1">First vector; its CLVector is passed to the dot product kernel</param>
        /// <param name="v2">Second vector; its CLVector is passed to the dot product kernel</param>
        private void CLDotProd(CLImgVector v1, CLImgVector v2)
            // NOTE(review): no braces are visible in this listing; the while-loop body below is
            // inferred from indentation only - confirm against the complete file.

            // Host value (v1.Length >> 2) + 1.
            // NOTE(review): this local (vlenby4) is not used in the visible lines; the kernel
            // argument below is vLenBy4 (different casing, declared outside this block).
            // Presumably a write of this value to that device variable belongs here - confirm.
            int[] vlenby4 = new int[] { (v1.Length >> 2) + 1 };


            //Computes products and most sums
            CLCalc.Program.MemoryObject[] args = new CLCalc.Program.MemoryObject[] { v1.CLVector, v2.CLVector, dotProd, vLenBy4 };

            //kernelDotProduct.Execute(args, GLOBALWORKSIZE);
            // Global size is GLOBALWORKSIZE; local size is MaxWorkItemSizes[0] of the device indexed by DefaultCQ.
            kernelDotProduct.Execute(args, new int[] { GLOBALWORKSIZE }, new int[] { (int)CLCalc.CLDevices[CLCalc.Program.DefaultCQ].MaxWorkItemSizes[0] });

            //Sums what's left
            // The sum kernel is first launched with GLOBALWORKSIZE >> 3 work items; the count is halved after each launch.
            int i = GLOBALWORKSIZE >> 3;

            args = new CLCalc.Program.MemoryObject[] { dotProd };
            while (i > 0)
                kernelSum.Execute(args, i);
                i = (i >> 1);

            //Reads final value
            // A single work item is launched with dotProd and dotProdSum as arguments.
            args = new CLCalc.Program.MemoryObject[] { dotProd, dotProdSum };
            kernelGetDotSum.Execute(args, 1);
        /// <summary>Computes dot product of 2 vectors without OpenCL, in double precision</summary>
        /// <param name="v1">First vector; its Length determines how many elements are multiplied</param>
        /// <param name="v2">Second vector; only its first v1.Length elements are read</param>
        /// <returns>Sum of v1.VectorData[i] * v2.VectorData[i] for i from 0 to v1.Length - 1, accumulated as a double</returns>
        public double ExactDotProductNoCL(CLImgVector v1, CLImgVector v2)
        {
            double dProd = 0;

            // Both operands are widened before multiplying so that neither the product
            // nor the running sum is rounded to single precision.
            for (int i = 0; i < v1.Length; i++)
            {
                dProd += (double)v1.VectorData[i] * (double)v2.VectorData[i];
            }

            return dProd;
        }

        /// <summary>Computes dot product of 2 vectors without OpenCL</summary>
        /// <param name="v1">First vector; its Length determines how many elements are multiplied</param>
        /// <param name="v2">Second vector; only its first v1.Length elements are read</param>
        /// <returns>Sum of v1.VectorData[i] * v2.VectorData[i] for i from 0 to v1.Length - 1, accumulated in single precision</returns>
        public float DotProductNoCL(CLImgVector v1, CLImgVector v2)
        {
            float dProd = 0;

            for (int i = 0; i < v1.Length; i++)
            {
                dProd += v1.VectorData[i] * v2.VectorData[i];
            }

            return dProd;
        }

        /// <summary>Computes M*x and stores the result in y. Does not automatically read result from device memory</summary>
        /// <param name="M">Sparse matrix</param>
        /// <param name="x">Vector to be multiplied</param>
        /// <param name="y">Result</param>
        /// <exception cref="Exception">Thrown when the length of x or of y differs from M.MatrixDimension</exception>
        public void Multiply(CLImgSparseMatrix M, CLImgVector x, CLImgVector y)
            // NOTE(review): no braces and no else keyword are visible in this listing; the
            // branch structure below is inferred from indentation only - confirm.
            if (x.Length != M.MatrixDimension || y.Length != M.MatrixDimension)
                throw new Exception("M, x and y dimensions not compatible");

            if (CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL)
                // Uploads M.NonZeroElemsPerRow; the device variable is the kernel's last argument.
                CLNonZeroElemsPerRow.WriteToDevice(new int[] { M.NonZeroElemsPerRow });
                CLCalc.Program.MemoryObject[] args = new CLCalc.Program.MemoryObject[] { M.CLMatrixData, M.CLColumns, x.CLVector, y.CLVector, CLNonZeroElemsPerRow };

                //Ideally matrix dimension should be a multiple of 4, but OK if it's not
                // Work size is M.MatrixDimension / 4 rounded up.
                kernelSparseMatrixVecMult.Execute(args, 1 + ((M.MatrixDimension - 1) >> 2));
                // Host computation of M*x assigned to y.VectorData.
                // NOTE(review): presumably the branch taken when OpenCL is not in use - confirm.
                y.VectorData = MultiplyNoCL(M, x);
        /// <summary>Computes product of sparse matrix M and vector x</summary>
        /// <param name="M">Sparse matrix; row i occupies entries [i * NonZeroElemsPerRow, (i + 1) * NonZeroElemsPerRow) of MatrixData and Columns</param>
        /// <param name="x">Vector to be multiplied</param>
        /// <returns>New array of length x.Length holding M*x</returns>
        /// <exception cref="Exception">Thrown when x.Length differs from M.MatrixDimension</exception>
        public float[] MultiplyNoCL(CLImgSparseMatrix M, CLImgVector x)
        {
            if (x.Length != M.MatrixDimension)
                throw new Exception("M and x dimensions not compatible");

            float[] resp = new float[x.Length];
            int elemsPerRow = M.NonZeroElemsPerRow;

            for (int i = 0; i < M.MatrixDimension; i++)
            {
                int rowStart = elemsPerRow * i;

                for (int j = 0; j < elemsPerRow; j++)
                {
                    int idx = rowStart + j;

                    // Slots whose column index is negative do not contribute to the row sum
                    if (M.Columns[idx] >= 0)
                    {
                        resp[i] += M.MatrixData[idx] * x.VectorData[M.Columns[idx]];
                    }
                }
            }

            return resp;
        }

        /// <summary>Solves linear system Mx = b using conjugate gradient method</summary>
        /// <param name="M">Matrix M</param>
        /// <param name="b">Vector b</param>
        /// <param name="tol">Error tolerance; iteration stops once the squared residual norm r.r is no longer greater than this value</param>
        /// <returns>New array of length b.Length holding a copy of the solution vector x</returns>
        /// <exception cref="Exception">Thrown when b.Length differs from M.MatrixDimension</exception>
        public float[] LinSolveNoCL(CLImgSparseMatrix M, CLImgVector b, float tol)
        {
            if (b.Length != M.MatrixDimension)
                throw new Exception("M and x dimensions not compatible");

            int n = b.Length;

            // Work vectors are kept between calls and only reallocated when the size changes,
            // so x from a previous solve of the same size is used as the starting point.
            if (r == null || r.Length != n)
            {
                r  = new CLImgVector(n);
                p  = new CLImgVector(n);
                x  = new CLImgVector(n);
                Ap = new CLImgVector(n);
            }

            float alpha, beta, RDotROld, RDotR;

            // Initial residual r = b - M*x and initial search direction p = r
            Ap.VectorData = MultiplyNoCL(M, x);
            for (int i = 0; i < n; i++)
            {
                r.VectorData[i] = b.VectorData[i] - Ap.VectorData[i];
                p.VectorData[i] = r.VectorData[i];
            }

            int count = 0;

            RDotR = DotProductNoCL(r, r);
            while ((RDotR > tol) && (count < n * MAXITER))
            {
                RDotROld = RDotR;

                // Step length along p
                Ap.VectorData = MultiplyNoCL(M, p);
                alpha         = RDotROld / DotProductNoCL(Ap, p);

                for (int i = 0; i < n; i++)
                {
                    x.VectorData[i] += alpha * p.VectorData[i];
                    r.VectorData[i] -= alpha * Ap.VectorData[i];
                }

                RDotR = DotProductNoCL(r, r);
                beta  = RDotR / RDotROld;

                // New search direction
                for (int i = 0; i < n; i++)
                {
                    p.VectorData[i] = r.VectorData[i] + beta * p.VectorData[i];
                }

                // Without this increment the n * MAXITER cap in the loop condition can never trigger
                count++;
            }

            // Hand back a copy so callers do not alias the reusable work vector
            float[] resp = new float[n];
            for (int i = 0; i < n; i++)
            {
                resp[i] = x.VectorData[i];
            }

            return resp;
        }

        /// <summary>Computes dot product of 2 vectors without OpenCL</summary>
        /// <param name="v1">First vector; its Length determines how many elements are multiplied</param>
        /// <param name="v2">Second vector; only its first v1.Length elements are read</param>
        /// <returns>Sum of v1.VectorData[i] * v2.VectorData[i] for i from 0 to v1.Length - 1, accumulated in single precision</returns>
        public float DotProductNoCL(CLImgVector v1, CLImgVector v2)
        {
            float dProd = 0;

            for (int i = 0; i < v1.Length; i++)
            {
                dProd += v1.VectorData[i] * v2.VectorData[i];
            }

            return dProd;
        }
        /// <summary>Computes dot product of two vectors and stores result in
        /// dotProdSum</summary>
        /// <param name="v1">First vector; its CLVector is passed to the dot product kernel</param>
        /// <param name="v2">Second vector; its CLVector is passed to the dot product kernel</param>
        private void CLDotProd(CLImgVector v1, CLImgVector v2)
            // NOTE(review): no braces are visible in this listing; the while-loop body below is
            // inferred from indentation only - confirm against the complete file.

            // Host value (v1.Length >> 2) + 1.
            // NOTE(review): this local (vlenby4) is not used in the visible lines; the kernel
            // argument below is vLenBy4 (different casing, declared outside this block).
            // Presumably a write of this value to that device variable belongs here - confirm.
            int[] vlenby4 = new int[] { (v1.Length >> 2) + 1 };


            //Computes products and most sums
            CLCalc.Program.MemoryObject[] args = new CLCalc.Program.MemoryObject[] { v1.CLVector, v2.CLVector, dotProd, vLenBy4 };

            //kernelDotProduct.Execute(args, GLOBALWORKSIZE);
            // Global size is GLOBALWORKSIZE; local size is MaxWorkItemSizes[0] of the device indexed by DefaultCQ.
            kernelDotProduct.Execute(args, new int[] { GLOBALWORKSIZE }, new int[] { (int)CLCalc.CLDevices[CLCalc.Program.DefaultCQ].MaxWorkItemSizes[0] });

            //Sums what's left
            // The sum kernel is first launched with GLOBALWORKSIZE >> 3 work items; the count is halved after each launch.
            int i = GLOBALWORKSIZE >> 3;
            args = new CLCalc.Program.MemoryObject[] { dotProd };
            while (i > 0)
                kernelSum.Execute(args, i);
                i = (i >> 1);

            //Reads final value
            // A single work item is launched with dotProd and dotProdSum as arguments.
            args = new CLCalc.Program.MemoryObject[] { dotProd, dotProdSum };
            kernelGetDotSum.Execute(args, 1);
        /// <summary>Computes product of sparse matrix M and vector x</summary>
        /// <param name="M">Sparse matrix; row i occupies entries [i * NonZeroElemsPerRow, (i + 1) * NonZeroElemsPerRow) of MatrixData and Columns</param>
        /// <param name="x">Vector to be multiplied</param>
        /// <returns>New array of length x.Length holding M*x</returns>
        /// <exception cref="Exception">Thrown when x.Length differs from M.MatrixDimension</exception>
        public float[] MultiplyNoCL(CLImgSparseMatrix M, CLImgVector x)
        {
            if (x.Length != M.MatrixDimension) throw new Exception("M and x dimensions not compatible");

            float[] resp = new float[x.Length];
            int elemsPerRow = M.NonZeroElemsPerRow;

            for (int i = 0; i < M.MatrixDimension; i++)
            {
                int rowStart = elemsPerRow * i;

                for (int j = 0; j < elemsPerRow; j++)
                {
                    int idx = rowStart + j;

                    // Slots whose column index is negative do not contribute to the row sum
                    if (M.Columns[idx] >= 0)
                    {
                        resp[i] += M.MatrixData[idx] * x.VectorData[M.Columns[idx]];
                    }
                }
            }

            return resp;
        }
        /// <summary>Computes M*x and stores the result in y. Does not automatically read result from device memory</summary>
        /// <param name="M">Sparse matrix</param>
        /// <param name="x">Vector to be multiplied</param>
        /// <param name="y">Result</param>
        /// <exception cref="Exception">Thrown when the length of x or of y differs from M.MatrixDimension</exception>
        public void Multiply(CLImgSparseMatrix M, CLImgVector x, CLImgVector y)
            // NOTE(review): no braces and no else keyword are visible in this listing; the
            // branch structure below is inferred from indentation only - confirm.
            if (x.Length != M.MatrixDimension || y.Length != M.MatrixDimension) throw new Exception("M, x and y dimensions not compatible");

            if (CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL)
                // Uploads M.NonZeroElemsPerRow; the device variable is the kernel's last argument.
                CLNonZeroElemsPerRow.WriteToDevice(new int[] { M.NonZeroElemsPerRow });
                CLCalc.Program.MemoryObject[] args = new CLCalc.Program.MemoryObject[] { M.CLMatrixData, M.CLColumns, x.CLVector, y.CLVector, CLNonZeroElemsPerRow };

                //Ideally matrix dimension should be a multiple of 4, but OK if it's not
                // Work size is M.MatrixDimension / 4 rounded up.
                kernelSparseMatrixVecMult.Execute(args, 1 + ((M.MatrixDimension - 1) >> 2));
                // Host computation of M*x assigned to y.VectorData.
                // NOTE(review): presumably the branch taken when OpenCL is not in use - confirm.
                y.VectorData = MultiplyNoCL(M, x);
        /// <summary>Solves linear system Mx = b using conjugate gradient method</summary>
        /// <param name="M">Matrix M</param>
        /// <param name="b">Vector b</param>
        /// <param name="tol">Error tolerance; iteration stops once the squared residual norm r.r is no longer greater than this value</param>
        /// <returns>New array of length b.Length holding a copy of the solution vector x</returns>
        /// <exception cref="Exception">Thrown when b.Length differs from M.MatrixDimension</exception>
        public float[] LinSolveNoCL(CLImgSparseMatrix M, CLImgVector b, float tol)
        {
            if (b.Length != M.MatrixDimension) throw new Exception("M and x dimensions not compatible");

            int n = b.Length;

            // Work vectors are kept between calls and only reallocated when the size changes,
            // so x from a previous solve of the same size is used as the starting point.
            if (r == null || r.Length != n)
            {
                r = new CLImgVector(n);
                p = new CLImgVector(n);
                x = new CLImgVector(n);
                Ap = new CLImgVector(n);
            }

            float alpha, beta, RDotROld, RDotR;

            // Initial residual r = b - M*x and initial search direction p = r
            Ap.VectorData = MultiplyNoCL(M, x);
            for (int i = 0; i < n; i++)
            {
                r.VectorData[i] = b.VectorData[i] - Ap.VectorData[i];
                p.VectorData[i] = r.VectorData[i];
            }

            int count = 0;
            RDotR = DotProductNoCL(r, r);
            while ((RDotR > tol) && (count < n * MAXITER))
            {
                RDotROld = RDotR;

                // Step length along p
                Ap.VectorData = MultiplyNoCL(M, p);
                alpha = RDotROld / DotProductNoCL(Ap, p);

                for (int i = 0; i < n; i++)
                {
                    x.VectorData[i] += alpha * p.VectorData[i];
                    r.VectorData[i] -= alpha * Ap.VectorData[i];
                }

                RDotR = DotProductNoCL(r, r);
                beta = RDotR / RDotROld;

                // New search direction
                for (int i = 0; i < n; i++)
                {
                    p.VectorData[i] = r.VectorData[i] + beta * p.VectorData[i];
                }

                // Without this increment the n * MAXITER cap in the loop condition can never trigger
                count++;
            }

            // Hand back a copy so callers do not alias the reusable work vector
            float[] resp = new float[n];
            for (int i = 0; i < n; i++) resp[i] = x.VectorData[i];

            return resp;
        }
        /// <summary>Solves linear system Mx = b using conjugate gradient method. Doesn't try to improve the solution obtained.</summary>
        /// <param name="M">Matrix M</param>
        /// <param name="b">Vector b</param>
        /// <param name="tol">Error tolerance; compared against the dot product of r with itself</param>
        /// <param name="x">Initial guess; replaced by a new vector of length b.Length when null or of a different length</param>
        public void LinSolveCLStep(CLImgSparseMatrix M, CLImgVector b, float tol, ref CLImgVector x)
            // NOTE(review): no braces are visible in this listing; the extent of each if/while
            // body is inferred from indentation only - confirm against the complete file.
            int n = b.Length;
            // n / 4 rounded up; work size of every kernel launched directly in this method
            int nBy4 = 1 + ((n - 1) >> 2);

            // Creates the host scalar and its device variable on first use
            if (lambda == null)
                lambda = new float[1];
                CLlambda = new CLCalc.Program.Variable(lambda);

            // (Re)allocates the work vectors when r is missing or sized for a different n
            if (r == null || r.Length != n)
                r = new CLImgVector(n);
                p = new CLImgVector(n);
                //x = new CLImgVector(n);
                Ap = new CLImgVector(n);
                temp = new CLImgVector(n);
            if (temp == null) temp = new CLImgVector(n);

            if (x == null || x.Length != n) x = new CLImgVector(n);

            float alpha, beta, RDotROld, RDotR;

            // Ap = M * x
            Multiply(M, x, Ap);

            // NOTE(review): kernelInitRP is defined elsewhere; presumably it sets r = b - Ap and
            // p = r, mirroring the host loop in LinSolveNoCL - confirm.
            CLCalc.Program.MemoryObject[] args = new CLCalc.Program.MemoryObject[] { b.CLVector, Ap.CLVector, r.CLVector, p.CLVector };
            kernelInitRP.Execute(args, nBy4);

            int count = 0;

            RDotR = DotProduct(r, r);

            // The count<1 term lets the body run while count is still 0 even if RDotR <= tol.
            // NOTE(review): no increment of count is visible in this listing - confirm it exists.
            while (count<1 || ((RDotR > tol) && (count < n*MAXITER)))
                RDotROld = RDotR;

                //if ((count & 0x0080) == 0)
                //    Multiply(M, x, Ap);

                //    args = new CLCalc.Program.MemoryObject[] { b.CLVector, Ap.CLVector, r.CLVector, p.CLVector };
                //    kernelInitRP.Execute(args, nBy4);

                // Ap = M * p
                Multiply(M, p, Ap);

                alpha = RDotROld / DotProduct(Ap, p);

                //Update x
                // NOTE(review): kernel semantics are not visible here; presumably temp = x, then x = temp + alpha * p - confirm.
                kernelCopyToTemp.Execute(new CLCalc.Program.MemoryObject[] { x.CLVector, temp.CLVector }, nBy4);
                lambda[0] = alpha; CLlambda.WriteToDevice(lambda);
                kernelMultiplyAdd.Execute(new CLCalc.Program.MemoryObject[] { CLlambda, p.CLVector, temp.CLVector, x.CLVector }, nBy4);

                //Update r
                // NOTE(review): presumably temp = r, then r = temp - alpha * Ap - confirm.
                kernelCopyToTemp.Execute(new CLCalc.Program.MemoryObject[] { r.CLVector, temp.CLVector }, nBy4);
                lambda[0] = -alpha; CLlambda.WriteToDevice(lambda);
                kernelMultiplyAdd.Execute(new CLCalc.Program.MemoryObject[] { CLlambda, Ap.CLVector, temp.CLVector, r.CLVector }, nBy4);

                RDotR = DotProduct(r, r);
                beta = RDotR / RDotROld;

                //Update p
                // NOTE(review): presumably temp = p, then p = r + beta * temp - confirm.
                kernelCopyToTemp.Execute(new CLCalc.Program.MemoryObject[] { p.CLVector, temp.CLVector }, nBy4);
                lambda[0] = beta; CLlambda.WriteToDevice(lambda);
                kernelMultiplyAdd.Execute(new CLCalc.Program.MemoryObject[] { CLlambda, temp.CLVector, r.CLVector, p.CLVector }, nBy4);

        /// <summary>Solves linear system Mx = b using conjugate gradient method. Writes variables to Device memory. Improves solution if accuracy is low.</summary>
        /// <param name="M">Matrix M</param>
        /// <param name="b">Vector b; its VectorData is overwritten during the iterations and restored before returning</param>
        /// <param name="tol">Error tolerance; its absolute value is used</param>
        /// <returns>The Solution array, of length b.Length</returns>
        /// <exception cref="Exception">Thrown when b.Length differs from M.MatrixDimension</exception>
        public float[] LinSolveCL(CLImgSparseMatrix M, CLImgVector b, float tol)
            // NOTE(review): no braces or else keywords are visible in this listing; the extent of
            // each if/for/while body is inferred from indentation only - confirm.
            if (b.Length != M.MatrixDimension) throw new Exception("M and x dimensions not compatible");
            int n = b.Length;

            tol = Math.Abs(tol);

            //Writes M to device memory
            // NOTE(review): no statement follows the comment above in this listing - confirm.

            ////Preconditions M
            //float[] preC = JacobiPrecondition(M);
            ////Preconditions b using M
            //JacobiPrecondition(preC, b);

            //Backs up b data
            float[] bbkp = new float[n];
            for (int i = 0; i < n; i++) bbkp[i] = b.VectorData[i];

            //Residue variables
            // Large initial values, with resAnt above ResidueSumSquares, so the loop condition can hold on entry
            double ResidueSumSquares = 1E100;
            double resAnt = 1E200;

            double[] dblResidues = new double[n];

            float[] Solution = new float[n];

            // Continues while the residue is above tol, changed by at least tol, and decreased in the last pass
            while (ResidueSumSquares > tol && Math.Abs(resAnt - ResidueSumSquares) >= tol && resAnt > ResidueSumSquares)
                //Check if solution is not improving anymore
                resAnt = ResidueSumSquares;


                // x here is not a local or parameter of this method; it is declared outside this block
                LinSolveCLStep(M, b, tol, ref x);


                // NOTE(review): exact floating-point equality against tol * 2; which of the two
                // loops below belongs to this test (and whether an else separates them) is not
                // visible in this listing - confirm.
                if (ResidueSumSquares == tol * 2)
                    //Copies solution
                    for (int i = 0; i < n; i++)
                        Solution[i] = x.VectorData[i];
                    //Improves solution
                    for (int i = 0; i < n; i++)
                        Solution[i] -= x.VectorData[i];
                        x.VectorData[i] = Solution[i];

                //Compute residue sum of squares and improves solution
                // NOTE(review): ExactMultiply is defined elsewhere; presumably M*x in double precision - confirm.
                dblResidues = ExactMultiply(M, x);
                ResidueSumSquares = 0;
                for (int i = 0; i < n; i++)
                    //Computes residues
                    dblResidues[i] = dblResidues[i] - (double)bbkp[i];
                    ResidueSumSquares += dblResidues[i] * dblResidues[i];

                    // The residue is stored into b, which is what the next LinSolveCLStep call receives
                    b.VectorData[i] = (float)dblResidues[i];

            //Restores b data
            for (int i = 0; i < n; i++) b.VectorData[i] = bbkp[i];

            return Solution;
        /// <summary>Computes dot product of 2 vectors without OpenCL, in double precision</summary>
        /// <param name="v1">First vector; its Length determines how many elements are multiplied</param>
        /// <param name="v2">Second vector; only its first v1.Length elements are read</param>
        /// <returns>Sum of v1.VectorData[i] * v2.VectorData[i] for i from 0 to v1.Length - 1, accumulated as a double</returns>
        public double ExactDotProductNoCL(CLImgVector v1, CLImgVector v2)
        {
            double dProd = 0;

            // Both operands are widened before multiplying so that neither the product
            // nor the running sum is rounded to single precision.
            for (int i = 0; i < v1.Length; i++)
            {
                dProd += (double)v1.VectorData[i] * (double)v2.VectorData[i];
            }

            return dProd;
        }
        /// <summary>Solves linear system Mx = b using conjugate gradient method. Writes variables to Device memory. Improves solution if accuracy is low.</summary>
        /// <param name="M">Matrix M</param>
        /// <param name="b">Vector b; its VectorData is overwritten during the iterations and restored at the end</param>
        /// <param name="tol">Error tolerance; its absolute value is used</param>
        /// <exception cref="Exception">Thrown when b.Length differs from M.MatrixDimension</exception>
        public float[] LinSolveCL(CLImgSparseMatrix M, CLImgVector b, float tol)
            // NOTE(review): no braces, no else keywords and no return statement are visible in
            // this listing; the extent of each if/for/while body is inferred from indentation
            // only - confirm against the complete file.
            if (b.Length != M.MatrixDimension)
                throw new Exception("M and x dimensions not compatible");
            int n = b.Length;

            tol = Math.Abs(tol);

            //Writes M to device memory
            // NOTE(review): no statement follows the comment above in this listing - confirm.

            //Backs up b data
            float[] bbkp = new float[n];
            for (int i = 0; i < n; i++)
                bbkp[i] = b.VectorData[i];

            //Residue variables
            // Large initial values, with resAnt above ResidueSumSquares, so the loop condition can hold on entry
            double ResidueSumSquares = 1E100;
            double resAnt            = 1E200;

            double[] dblResidues = new double[n];

            float[] Solution = new float[n];

            // Continues while the residue is above tol, changed by at least tol, and decreased in the last pass
            while (ResidueSumSquares > tol && Math.Abs(resAnt - ResidueSumSquares) >= tol && resAnt > ResidueSumSquares)
                //Check if solution is not improving anymore
                resAnt = ResidueSumSquares;


                LinSolveCLStep(M, b, tol);


                // NOTE(review): exact floating-point equality against tol * 2; which of the two
                // loops below belongs to this test (and whether an else separates them) is not
                // visible in this listing - confirm.
                if (ResidueSumSquares == tol * 2)
                    //Copies solution
                    for (int i = 0; i < n; i++)
                        Solution[i] = x.VectorData[i];
                    //Improves solution
                    for (int i = 0; i < n; i++)
                        Solution[i]    -= x.VectorData[i];
                        x.VectorData[i] = Solution[i];

                //Compute residue sum of squares and improves solution
                // NOTE(review): ExactMultiply is defined elsewhere; presumably M*x in double precision - confirm.
                dblResidues       = ExactMultiply(M, x);
                ResidueSumSquares = 0;
                for (int i = 0; i < n; i++)
                    //Computes residues
                    dblResidues[i]     = dblResidues[i] - (double)bbkp[i];
                    ResidueSumSquares += dblResidues[i] * dblResidues[i];

                    // The residue is stored into b, which is what the next LinSolveCLStep call receives
                    b.VectorData[i] = (float)dblResidues[i];

            //Restores b data
            for (int i = 0; i < n; i++)
                b.VectorData[i] = bbkp[i];

        /// <summary>Solves linear system Mx = b using conjugate gradient method. Doesn't try to improve the solution obtained.</summary>
        /// <param name="M">Matrix M</param>
        /// <param name="b">Vector b</param>
        /// <param name="tol">Error tolerance; compared against the dot product of r with itself</param>
        public void LinSolveCLStep(CLImgSparseMatrix M, CLImgVector b, float tol)
            // NOTE(review): no braces are visible in this listing; the extent of each if/while
            // body is inferred from indentation only - confirm against the complete file.
            int n    = b.Length;
            // n / 4 rounded up; work size of every kernel launched directly in this method
            int nBy4 = 1 + ((n - 1) >> 2);

            // Creates the host scalar and its device variable on first use
            if (lambda == null)
                lambda   = new float[1];
                CLlambda = new CLCalc.Program.Variable(lambda);

            // (Re)allocates the work vectors, including x, when r is missing or sized for a different n
            if (r == null || r.Length != n)
                r    = new CLImgVector(n);
                p    = new CLImgVector(n);
                x    = new CLImgVector(n);
                Ap   = new CLImgVector(n);
                temp = new CLImgVector(n);
            if (temp == null)
                temp = new CLImgVector(n);

            float alpha, beta, RDotROld, RDotR;

            // Ap = M * x
            Multiply(M, x, Ap);

            // NOTE(review): kernelInitRP is defined elsewhere; presumably it sets r = b - Ap and
            // p = r, mirroring the host loop in LinSolveNoCL - confirm.
            CLCalc.Program.MemoryObject[] args = new CLCalc.Program.MemoryObject[] { b.CLVector, Ap.CLVector, r.CLVector, p.CLVector };
            kernelInitRP.Execute(args, nBy4);

            int count = 0;

            RDotR = DotProduct(r, r);

            // NOTE(review): no increment of count is visible in this listing - confirm it exists.
            while ((RDotR > tol) && (count < n * MAXITER))
                RDotROld = RDotR;

                //if ((count & 0x0080) == 0)
                //    Multiply(M, x, Ap);

                //    args = new CLCalc.Program.MemoryObject[] { b.CLVector, Ap.CLVector, r.CLVector, p.CLVector };
                //    kernelInitRP.Execute(args, nBy4);

                // Ap = M * p
                Multiply(M, p, Ap);

                alpha = RDotROld / DotProduct(Ap, p);

                //Update x
                // NOTE(review): kernel semantics are not visible here; presumably temp = x, then x = temp + alpha * p - confirm.
                kernelCopyToTemp.Execute(new CLCalc.Program.MemoryObject[] { x.CLVector, temp.CLVector }, nBy4);
                lambda[0] = alpha; CLlambda.WriteToDevice(lambda);
                kernelMultiplyAdd.Execute(new CLCalc.Program.MemoryObject[] { CLlambda, p.CLVector, temp.CLVector, x.CLVector }, nBy4);

                //Update r
                // NOTE(review): presumably temp = r, then r = temp - alpha * Ap - confirm.
                kernelCopyToTemp.Execute(new CLCalc.Program.MemoryObject[] { r.CLVector, temp.CLVector }, nBy4);
                lambda[0] = -alpha; CLlambda.WriteToDevice(lambda);
                kernelMultiplyAdd.Execute(new CLCalc.Program.MemoryObject[] { CLlambda, Ap.CLVector, temp.CLVector, r.CLVector }, nBy4);

                RDotR = DotProduct(r, r);
                beta  = RDotR / RDotROld;

                //Update p
                // NOTE(review): presumably temp = p, then p = r + beta * temp - confirm.
                kernelCopyToTemp.Execute(new CLCalc.Program.MemoryObject[] { p.CLVector, temp.CLVector }, nBy4);
                lambda[0] = beta; CLlambda.WriteToDevice(lambda);
                kernelMultiplyAdd.Execute(new CLCalc.Program.MemoryObject[] { CLlambda, temp.CLVector, r.CLVector, p.CLVector }, nBy4);

        /// <summary>Computes dot product of 2 vectors using their OpenCL images. Assumes data has been inserted to VectorData and WriteToDevice() has been called</summary>
        /// <param name="v1">First vector</param>
        /// <param name="v2">Second vector</param>
        /// <returns>The value held in dprod[0] when the method ends</returns>
        /// <exception cref="Exception">Thrown when v1 and v2 have different lengths</exception>
        public float DotProduct(CLImgVector v1, CLImgVector v2)
            // NOTE(review): no braces, no else keyword and no declaration of dprod are visible
            // in this listing, so the nesting of the statements below can only be guessed from
            // indentation - confirm against the complete file.
            if (v1.Length != v2.Length) throw new Exception("Incompatible lengths");

            // CLDotProd is invoked when OpenCL acceleration is in use.
            if (CLCalc.CLAcceleration == CLCalc.CLAccelerationType.UsingCL)
                CLDotProd(v1, v2);

                // NOTE(review): resp is newly allocated and nothing visible here writes to it
                // before it is read - presumably a device read-back of dotProdSum belongs
                // between these two statements; confirm.
                float[] resp = new float[1];

                dprod[0] = resp[0];
                // Host-side accumulation of v1[i] * v2[i] over VectorData.
                // NOTE(review): presumably the fallback taken when OpenCL is not in use - confirm.
                for (int i = 0; i < v1.Length; i++)
                    dprod[0] += v1.VectorData[i] * v2.VectorData[i];

                // NOTE(review): self-assignment; has no effect as written.
                dprod[0] = dprod[0];

            return dprod[0];