/// <summary>
/// Builds the design matrix for a multi-column regression and hands it to compute_coefficients.
/// When m_intercept is set, the columns of x are copied and a column of ones is appended
/// as the last column; otherwise the matrix is constructed directly from x.
/// </summary>
/// <param name="x">regressors, indexed as [observation, variable]</param>
/// <param name="y">response values, forwarded unchanged</param>
/// <param name="w">weights, forwarded unchanged</param>
private void compute_regression(double[,] x, double[] y, double[] w)
{
    int var_count = x.GetLength(1);
    int obs_count = x.GetLength(0);

    Linalg.Matrix design;

    if (!m_intercept)
    {
        // NOTE(review): meaning of the second constructor argument is not visible here - confirm
        design = new Linalg.Matrix(x, false);
    }
    else
    {
        design = new Linalg.Matrix(obs_count, var_count + 1);

        for (int row = 0; row < obs_count; row++)
        {
            for (int col = 0; col < var_count; col++)
            {
                design[row, col] = x[row, col];
            }

            // constant term goes into the extra, last column
            design[row, var_count] = 1.0;
        }
    }

    compute_coefficients(design, y, w);
}
/// <summary>
/// 1D constructor: regresses y on a single variable x, optionally with an intercept term.
/// </summary>
/// <param name="x">regressor values; must be non-null and have the same length as y</param>
/// <param name="y">response values; must be non-null</param>
/// <param name="w">optional weights (may be null); when supplied must have the same length as y</param>
/// <param name="intercept">when true a constant column is added and reported under the name "(intercept)"</param>
/// <exception cref="ArgumentException">x or y is null, or the array lengths do not match</exception>
public LinearRegression(double[] x, double[] y, double[] w, bool intercept)
{
    if (x == null || y == null || x.Length != y.Length)
    {
        throw new ArgumentException("LinearRegression: check input arguments");
    }

    bool use_weights = (w != null);

    if (use_weights && w.Length != y.Length)
    {
        throw new ArgumentException("LinearRegression: array of weights should have the same length");
    }

    m_intercept = intercept;

    //init variable names
    m_names = new string[1 + (m_intercept ? 1 : 0)];
    m_names[0] = "c1";
    if (m_intercept)
    {
        m_names[1] = "(intercept)";
    }

    Math.Linalg.Matrix A = null;
    int n = x.Length;

    if (m_intercept)
    {
        int m = 1;
        A = new Linalg.Matrix(n, m + 1);
        for (int i = 0; i < n; i++)
        {
            // each row takes its own observation; the previous code copied x[0] into every row,
            // which made the regressor column constant
            A[i, 0] = x[i];
            A[i, 1] = 1.0; //intercept
        }
    }
    else
    {
        A = new Linalg.Matrix(x, false);
    }

    compute_coefficients(A, y, w);
}
/// <summary>
/// Solves the least-squares problem for design matrix A and response y with a singular value
/// decomposition, then stores the coefficients (m_coeffs), their standard errors
/// (m_coeffs_stderr), the observation count (m_observations) and the fit statistics (m_summary).
/// </summary>
/// <param name="A">design matrix, one row per observation</param>
/// <param name="y">response values</param>
/// <param name="w">weights; in this method they are only passed to SumOfSquaredDev when an intercept is used</param>
private void compute_coefficients(Linalg.Matrix A, double[] y, double[] w)
{
    Linalg.Matrix rhs = new Linalg.Matrix(y, false);

    // A is decomposed directly when it has more rows than columns, otherwise its transpose is used
    bool use_transpose = !(A.Rows > A.Columns);
    Linalg.SvDecomposition svd = new Linalg.SvDecomposition(use_transpose ? A.Transpose() : A);
    Linalg.Matrix solution = svd.Solve(rhs, use_transpose);
    Linalg.Matrix covariance = svd.Cov();

    Linalg.Matrix fitted = A * solution;
    Linalg.Matrix resid = fitted - rhs;

    // with an intercept the sums are taken about the mean, without it about zero
    double mss;
    double tss;
    if (m_intercept)
    {
        mss = Math.Stats.SumOfSquaredDev(fitted.RowPackedData(), w);
        tss = Math.Stats.SumOfSquaredDev(y, w);
    }
    else
    {
        mss = Math.Stats.SumOfSquares(fitted.RowPackedData());
        tss = Math.Stats.SumOfSquares(y);
    }

    //without intercept residuals dont add up to zero
    double rss = Math.Stats.SumOfSquares(resid.RowPackedData());

    m_observations = A.Rows;
    m_coeffs = solution.RowPackedData();

    int obs = m_observations;
    int n_coeffs = m_coeffs.Length;
    int dof = obs - n_coeffs;
    int intercept_terms = 0;
    if (m_intercept)
    {
        intercept_terms = 1;
    }

    // residual standard error, then one standard error per coefficient
    m_coeffs_stderr = new double[n_coeffs];
    double sigma = System.Math.Sqrt(rss / dof);
    for (int k = 0; k < n_coeffs; k++)
    {
        m_coeffs_stderr[k] = sigma * System.Math.Sqrt(covariance[k, k]);
    }

    double r2 = mss / (mss + rss);
    double r2_adj = 1.0 - (obs - intercept_terms) * (1.0 - r2) / dof;
    double fvalue = (mss / (n_coeffs - intercept_terms)) / (rss / dof);
    double aic = obs + obs * System.Math.Log(2 * System.Math.PI) + obs * System.Math.Log(rss / obs) + 2 * (m_coeffs.Length + 1); // sum(log(w))

    //generate summary
    m_summary = new Dictionary<string, double>
    {
        { "n", m_observations },
        { "p", m_coeffs.Length },
        { "dof", dof },
        { "stderr", sigma },
        { "rss", rss },
        { "tss", tss },
        { "mss", mss },
        { "r2", r2 },
        { "r2_adj", r2_adj },
        { "fvalue", fvalue },
        { "aic", aic }
    };

    // per-coefficient entries: estimate, standard error, t-value and p-value
    for (int k = 0; k < n_coeffs; k++)
    {
        double estimate = m_coeffs[k];
        double estimate_se = m_coeffs_stderr[k];
        double tvalue = estimate / estimate_se;
        double pvalue = Math.Special.incbet(0.5 * dof, 0.5, dof / (dof + tvalue * tvalue));

        string name = m_names[k];
        m_summary[name] = estimate;
        m_summary[name + "_se"] = estimate_se;
        m_summary[name + "_tvalue"] = tvalue;
        m_summary[name + "_pvalue"] = pvalue;
    }

    m_summary["intercept"] = m_intercept ? 1.0 : 0.0;
}
List<double> m_gamma; //step sizes

/// <summary>
/// Least-angle regression (LARS) path for regressors x and response y.
/// Each iteration moves the candidate variable with the largest absolute correlation to the
/// current residual into the active set, steps along the equiangular direction of the active
/// variables, and records the coefficient vector (m_beta) and step size (m_gamma) of that step.
/// </summary>
/// <remarks>
/// NOTE(review): the original summary said the model "includes intercept", but no centering or
/// constant column is applied in this constructor - presumably callers pass centered data; confirm.
/// </remarks>
/// <param name="x">regressors, indexed as [observation, variable]</param>
/// <param name="y">response, one value per observation</param>
/// <exception cref="System.ArgumentException">x or y is null, or y does not have one value per row of x</exception>
public Lars(double[,] x, double[] y)
{
    if (x == null || y == null || x.GetLength(0) != y.Length)
    {
        throw new System.ArgumentException("Lars: x and y can't be null and must have the same number of observations");
    }

    bool lasso = false;

    int n = x.GetLength(0); //number of observations
    int p = x.GetLength(1); //number of variables
    int maxvars = System.Math.Min(n - 1, p); //maximum number of variables
    // NOTE(review): maxvars only bounds the iteration count below; the loop itself does not stop
    // once maxvars variables are active - confirm this is intended
    int maxit = 8 * maxvars;

    m_beta = new List<double[]>(); // coefficients after each step
    m_gamma = new List<double>();

    SortedSet<int> c_set = new SortedSet<int>(); //candidate set
    SortedSet<int> a_set = new SortedSet<int>(); //active set

    //initialize candidate set with all available variables
    for (int i = 0; i < p; i++)
    {
        c_set.Add(i);
    }

    //compute Gram matrix (gram = x' * x)
    Linalg.Matrix full_gram = new Linalg.Matrix(p, p);
    for (int i = 0; i < p; i++)
    {
        for (int j = 0; j < p; j++)
        {
            double sum = 0.0;
            for (int k = 0; k < n; k++)
            {
                sum += x[k, i] * x[k, j];
            }
            full_gram[i, j] = sum;
        }
    }

    double[] mu = new double[n]; //lars regression vector (current fit)
    double[] c = new double[p];  //correlations of each variable with the residual

    for (int it = 0; it < maxit; it++)
    {
        //compute correlations with the current residual y - mu
        for (int j = 0; j < p; j++)
        {
            double sum = 0.0;
            for (int i = 0; i < n; i++)
            {
                sum += x[i, j] * (y[i] - mu[i]);
            }
            c[j] = sum;
        }

        //find abs max corr from candidate set
        double max_abs_c = 0.0;
        int max_abs_c_index = -1;
        foreach (int i in c_set)
        {
            double abs_c = System.Math.Abs(c[i]);
            if (abs_c > max_abs_c)
            {
                max_abs_c = abs_c;
                max_abs_c_index = i;
            }
        }

        //exit if there is no correlation (this also covers an empty candidate set)
        if (max_abs_c < ACQ.Math.Const.epsilon)
        {
            break;
        }

        a_set.Add(max_abs_c_index);
        c_set.Remove(max_abs_c_index);

        int[] active_indices = a_set.ToArray();
        int vars = active_indices.Length;

        // signs of the active correlations, indexed by POSITION in the active set so that they
        // line up with the rows of the partial Gram matrix (indexing by variable index overran
        // the array whenever an active variable index was >= vars)
        double[] s = new double[vars];
        for (int k = 0; k < vars; k++)
        {
            s[k] = ACQ.Math.Utils.Sign(c[active_indices[k]]);
        }

        //compute partial Gram matrix, Gram = X(active_columns)' * X(active_columns)
        Linalg.Matrix gram = full_gram.Submatrix(active_indices, active_indices);
        Linalg.CholeskyDecomposition gram_chol = new Linalg.CholeskyDecomposition(gram);
        Linalg.Matrix inv_gram = gram_chol.Solve(s); // Gram^-1 * s

        //compute coefficients of equiangular vector:
        //  norm = s' * Gram^-1 * s,  scale = norm^(-1/2),  w = scale * Gram^-1 * s
        // w is expressed in terms of the unsigned columns of x, so the sign must stay inside
        // Gram^-1 * s; multiplying by s again would drop the direction of negative correlations
        double[] w = new double[vars];
        double norm = 0.0;
        for (int k = 0; k < vars; k++)
        {
            w[k] = inv_gram[k, 0];
            norm += s[k] * w[k];
        }

        double scale = 1.0 / System.Math.Sqrt(norm);
        for (int k = 0; k < vars; k++)
        {
            w[k] = scale * w[k];
        }

        //compute equiangular vector u = X(active_columns) * w
        double[] u = new double[n];
        for (int i = 0; i < n; i++)
        {
            double sum = 0.0;
            for (int j = 0; j < vars; j++)
            {
                sum += x[i, active_indices[j]] * w[j];
            }
            u[i] = sum;
        }

        // set gamma to the largest value (i.e. use regular least squares)
        double gamma = max_abs_c / scale;

        //correlation (angle) between equiangular vector and all remaining variables;
        //the smallest positive step at which a candidate ties with the active set wins
        foreach (int i in c_set)
        {
            double angle = 0.0;
            for (int j = 0; j < n; j++)
            {
                angle += x[j, i] * u[j];
            }

            double t1 = (max_abs_c - c[i]) / (scale - angle);
            double t2 = (max_abs_c + c[i]) / (scale + angle);

            if (t1 > 0)
            {
                gamma = System.Math.Min(t1, gamma);
            }
            if (t2 > 0)
            {
                gamma = System.Math.Min(t2, gamma);
            }
        }

        //LASSO code here (placeholder, not implemented)
        if (lasso)
        {
        }

        //update coefficients: start from the previous step and move gamma along w
        double[] beta = new double[p];
        if (m_beta.Count > 0)
        {
            double[] prev_beta = m_beta[m_beta.Count - 1];
            for (int k = 0; k < vars; k++)
            {
                int index = active_indices[k];
                beta[index] = prev_beta[index];
            }
        }
        for (int k = 0; k < vars; k++)
        {
            beta[active_indices[k]] += gamma * w[k];
        }

        m_beta.Add(beta);
        m_gamma.Add(gamma);

        //update lars vector
        for (int i = 0; i < n; i++)
        {
            mu[i] += gamma * u[i];
        }
    }
}
/// <summary>
/// 1D constructor: regresses y on a single variable x, optionally weighted and optionally
/// with an intercept term. Inputs are validated (null, length, NaN) before fitting.
/// </summary>
/// <param name="x">regressor values; non-null, same length as y, no NaN</param>
/// <param name="y">response values; non-null, no NaN</param>
/// <param name="w">optional weights (null for an unweighted fit); same length as y, no NaN</param>
/// <param name="intercept">when true a constant column is added and reported under the name "(intercept)"</param>
/// <exception cref="ArgumentException">an argument is null, has a mismatched length, or contains NaN</exception>
public LinearRegression(double[] x, double[] y, double[] w, bool intercept)
{
    bool xy_ok = x != null && y != null && x.Length == y.Length;
    if (!xy_ok)
    {
        throw new ArgumentException("LinearRegression: check input arguments (x and y can't be null and must have the same size)");
    }

    m_weighted = (w != null);
    if (m_weighted)
    {
        if (w.Length != y.Length)
        {
            throw new ArgumentException("LinearRegression: array of weights should have the same length as x and y");
        }
    }

    m_intercept = intercept;

    // variable names: the slope first, then the intercept when present
    if (m_intercept)
    {
        m_names = new string[] { "c1", "(intercept)" };
    }
    else
    {
        m_names = new string[] { "c1" };
    }

    int count = x.Length;

    // reject NaN anywhere in the inputs
    for (int k = 0; k < x.Length; k++)
    {
        bool bad_point = Double.IsNaN(x[k]) || Double.IsNaN(y[k]);
        if (bad_point)
        {
            throw new ArgumentException("LinearRegression: there should not be NaN values in x or y");
        }

        if (m_weighted && Double.IsNaN(w[k]))
        {
            throw new ArgumentException("LinearRegression: weights vector should not have NaN values");
        }
    }

    Linalg.Matrix design;
    if (!m_intercept)
    {
        design = new Linalg.Matrix(x);
    }
    else
    {
        // two columns: the regressor and a constant one for the intercept
        design = new Linalg.Matrix(count, 2);
        for (int row = 0; row < count; row++)
        {
            design[row, 0] = x[row];
            design[row, 1] = 1.0;
        }
    }

    compute_coefficients(design, y, w);
}
/// <summary>
/// Solves the (optionally weighted) least-squares problem for design matrix A and response y
/// with a singular value decomposition, then stores the coefficients (m_coeffs), their standard
/// errors (m_coeffs_stderr), the observation count (m_observations) and the fit statistics
/// (m_summary, including "aic" and "bic").
/// </summary>
/// <remarks>
/// When m_weighted is set, the rows of A are multiplied in place by sqrt(w[i]).
/// </remarks>
/// <param name="A">design matrix, one row per observation; modified in place for a weighted fit</param>
/// <param name="y">response values</param>
/// <param name="w">weights; read element-wise only when m_weighted is set, always forwarded to the Stats helpers</param>
private void compute_coefficients(Linalg.Matrix A, double[] y, double[] w)
{
    Linalg.Matrix rhs = new Linalg.Matrix(y, false);

    double[] root_w = null;
    if (m_weighted)
    {
        // scale every row of the system by sqrt(weight)
        // (original note: here we assume that specified weights are sigmas, i.e. not sigma square - TODO confirm)
        root_w = new double[rhs.Rows];
        for (int row = 0; row < rhs.Rows; row++)
        {
            root_w[row] = System.Math.Sqrt(w[row]);
            rhs[row, 0] = rhs[row, 0] * root_w[row];

            for (int col = 0; col < A.Columns; col++)
            {
                A[row, col] = A[row, col] * root_w[row];
            }
        }
    }

    // A is decomposed directly when it has more rows than columns, otherwise its transpose is used
    bool use_transpose = !(A.Rows > A.Columns);
    Linalg.SvDecomposition svd = new Linalg.SvDecomposition(use_transpose ? A.Transpose() : A);
    Linalg.Matrix solution = svd.Solve(rhs, use_transpose);
    Linalg.Matrix covariance = svd.Cov();

    Linalg.Matrix fitted = A * solution;
    Linalg.Matrix resid = fitted - rhs;

    double log_weight_total = 0;
    double weight_sum = fitted.Rows; // not read again in this method
    if (m_weighted)
    {
        // undo the row scaling so fitted values and residuals are on the scale of y
        for (int row = 0; row < fitted.Rows; row++)
        {
            fitted[row, 0] = fitted[row, 0] / root_w[row];
            resid[row, 0] = fitted[row, 0] - rhs[row, 0] / root_w[row];
            log_weight_total += System.Math.Log(w[row]);
        }

        weight_sum = Math.Stats.Utils.Sum(w);
    }

    double mss = Math.Stats.Utils.SumOfSquaredDev(fitted.RowPackedData(), w);
    //without intercept residuals dont add up to zero
    double rss = Math.Stats.Utils.SumOfSquares(resid.RowPackedData(), w);
    double tss = Math.Stats.Utils.SumOfSquaredDev(y, w);

    m_observations = A.Rows;
    m_coeffs = solution.RowPackedData();

    int obs = m_observations;
    int n_coeffs = m_coeffs.Length;
    int dof = obs - n_coeffs;
    int intercept_terms = 0;
    if (m_intercept)
    {
        intercept_terms = 1;
    }

    // residual standard error, then one standard error per coefficient
    m_coeffs_stderr = new double[n_coeffs];
    double sigma = System.Math.Sqrt(rss / dof);
    for (int k = 0; k < n_coeffs; k++)
    {
        m_coeffs_stderr[k] = sigma * System.Math.Sqrt(covariance[k, k]);
    }

    double r2 = mss / (mss + rss);
    double r2_adj = 1.0 - (obs - intercept_terms) * (1.0 - r2) / dof;
    double fvalue = (mss / (n_coeffs - intercept_terms)) / (rss / dof);

    // part shared by AIC and BIC; they differ only in the penalty per parameter
    double criterion_base = obs + obs * System.Math.Log(2 * System.Math.PI) + obs * System.Math.Log(rss / obs) - log_weight_total;
    double aic = criterion_base + 2 * (m_coeffs.Length + 1);
    double bic = criterion_base + System.Math.Log(m_observations) * (m_coeffs.Length + 1);

    //generate summary
    m_summary = new Dictionary<string, double>
    {
        { "n", m_observations },
        { "p", m_coeffs.Length },
        { "dof", dof },
        { "stderr", sigma },
        { "rss", rss },
        { "tss", tss },
        { "mss", mss },
        { "r2", r2 },
        { "r2_adj", r2_adj },
        { "fvalue", fvalue },
        { "aic", aic },
        { "bic", bic }
    };

    // per-coefficient entries: estimate, standard error, t-value and p-value
    for (int k = 0; k < n_coeffs; k++)
    {
        double estimate = m_coeffs[k];
        double estimate_se = m_coeffs_stderr[k];
        double tvalue = estimate / estimate_se;
        double pvalue = Math.Special.incbet(0.5 * dof, 0.5, dof / (dof + tvalue * tvalue));

        string name = m_names[k];
        m_summary[name] = estimate;
        m_summary[name + "_se"] = estimate_se;
        m_summary[name + "_tvalue"] = tvalue;
        m_summary[name + "_pvalue"] = pvalue;
    }

    m_summary["intercept"] = m_intercept ? 1.0 : 0.0;
}