/// <summary>
/// Backward elimination driven by p-values.
/// The strategy within each elimination step is to drop the variable with the largest p-value, refit the model, and reassess
/// the inclusion of all remaining variables. Elimination stops as soon as the largest p-value no longer exceeds
/// <paramref name="significance_level"/>, or when no candidate variable is left.
/// </summary>
/// <param name="solverFactory">Factory used (through FitModel / RefitModel) to create a solver for every model that is fitted</param>
/// <param name="records">The data records; the feature count of the full model is read from the first record</param>
/// <param name="significance_level">A variable is eliminated while the largest p-value is strictly greater than this threshold</param>
/// <param name="one_sided">Forwarded unchanged to CalcPValues</param>
/// <returns>The index list of the predictor variables to be included in the regression model (empty if every variable was eliminated)</returns>
public static List<int> EliminateByPValue(GlmSolverFactory solverFactory, List<RDataRecord> records, double significance_level = 0.05, bool one_sided = false)
{
    int full_model_feature_count = records[0].FeatureCount;

    // Feature ids are 1-based: 1 .. full_model_feature_count.
    List<int> candidate_features = new List<int>();
    for (int i = 1; i <= full_model_feature_count; ++i)
    {
        candidate_features.Add(i);
    }

    Glm solver = FitModel(solverFactory, records);

    // Guarding on Count prevents indexing into an empty candidate list, which previously
    // raised ArgumentOutOfRangeException once every feature had been eliminated.
    while (candidate_features.Count > 0)
    {
        double[] pValues = CalcPValues(solver.X, solver.Statistics.StandardErrors, records.Count, one_sided);

        double maxPValue;
        int featureIndexWithMaxPValue = SelectFeatureIndexWithMaxPValue(pValues, out maxPValue);

        // Written as a negated '>' so that a NaN p-value stops the elimination, exactly as the
        // original 'while (maxPValue > significance_level)' condition did.
        if (!(maxPValue > significance_level))
        {
            break;
        }

        // The returned index is a position within candidate_features (ids in the list are unique,
        // so removing by position is equivalent to removing the id stored at that position).
        candidate_features.RemoveAt(featureIndexWithMaxPValue);

        if (candidate_features.Count == 0)
        {
            // Nothing left to assess, so there is no reduced model worth refitting.
            break;
        }

        RefitModel(candidate_features, solverFactory, records, out solver);
    }

    return candidate_features;
}
/// <summary>
/// Fits a model to the supplied records: runs DataTransformer.DoFeaturesScaling on them,
/// asks the factory for a solver bound to those records, solves it and hands the solver back.
/// </summary>
/// <param name="solverFactory">Factory that creates the Glm solver for the records</param>
/// <param name="records">The records to fit</param>
/// <returns>The solver after Solve() has been invoked on it</returns>
public static Glm FitModel(GlmSolverFactory solverFactory, List<RDataRecord> records)
{
    // NOTE(review): DoFeaturesScaling presumably rescales the records in place - confirm against DataTransformer.
    new DataTransformer().DoFeaturesScaling(records);

    Glm fitted = solverFactory.CreateSolver(records);
    fitted.Solve();

    return fitted;
}
/// <summary>
/// Builds a reduced copy of the data that only carries the features listed in
/// <paramref name="candidate_features"/> and fits a model to that reduced copy.
/// </summary>
/// <param name="candidate_features">Ids of the features to keep; each id is used as an index into the source record's data array</param>
/// <param name="solverFactory">Factory forwarded to FitModel</param>
/// <param name="records">The source records to project</param>
/// <param name="solver">Receives the solver fitted on the reduced records</param>
/// <returns>The reduced records that the model was fitted on</returns>
public static List<RDataRecord> RefitModel(List<int> candidate_features, GlmSolverFactory solverFactory, List<RDataRecord> records, out Glm solver)
{
    List<RDataRecord> projected = new List<RDataRecord>();

    for (int r = 0; r < records.Count; ++r)
    {
        RDataRecord source = records[r];
        int keptCount = candidate_features.Count;
        RDataRecord target = new RDataRecord(keptCount);

        // Kept features are packed into slots 1..keptCount; slot 0 is left as the constructor set it.
        // NOTE(review): only the data array is copied here - the source record's YValue is not
        // transferred explicitly; confirm that RDataRecord does not need it to be set separately.
        int slot = 1;
        while (slot <= keptCount)
        {
            target.data[slot] = source.data[candidate_features[slot - 1]];
            ++slot;
        }

        projected.Add(target);
    }

    solver = FitModel(solverFactory, projected);
    return projected;
}
/// <summary>
/// This is an alternative to using p-values in the backward elimination: a model-selection score
/// (adjusted R^2 by default, or negated AIC / BIC) decides which variable to drop.
/// At each elimination step, we refit the model without each of the variables up for potential elimination.
/// If one of these smaller models has a higher score than our current model, we pick the smaller
/// model with the largest score. We continue in this way until removing variables does not increase the score.
/// </summary>
/// <param name="solverFactory">Factory used (through FitModel / RefitModel) to create a solver for every model that is fitted</param>
/// <param name="records">The data records; the feature count of the full model is read from the first record</param>
/// <param name="criteria">The score used to compare models</param>
/// <returns>The index list of the predictor variables to be included in the regression model</returns>
/// <exception cref="NotImplementedException">Thrown when <paramref name="criteria"/> is not AdjustedRSquare, AIC or BIC</exception>
public static List<int> BackwardEliminate(GlmSolverFactory solverFactory, List<RDataRecord> records, ModelSelectionCriteria criteria = ModelSelectionCriteria.AdjustedRSquare)
{
    int full_model_feature_count = records[0].FeatureCount;

    // Feature ids are 1-based: 1 .. full_model_feature_count.
    List<int> candidate_features = new List<int>();
    for (int i = 1; i <= full_model_feature_count; ++i)
    {
        candidate_features.Add(i);
    }

    // Outcomes are captured before any model is fitted.
    int n = records.Count;
    double[] outcomes = new double[n];
    for (int i = 0; i < n; ++i)
    {
        outcomes[i] = records[i].YValue;
    }

    Glm solver = FitModel(solverFactory, records);
    double fitness_score = ScoreModelForBackwardElimination(criteria, solver, records, outcomes, candidate_features.Count);

    bool improved = true;
    while (improved)
    {
        // Position (within candidate_features) of the feature whose removal gives the best score so far.
        int eliminatedFeatureIndex = -1;

        for (int i = 0; i < candidate_features.Count; ++i)
        {
            // Trial model: every current candidate except the one at position i.
            List<int> candidate_features_temp = new List<int>(candidate_features);
            candidate_features_temp.RemoveAt(i);

            List<RDataRecord> transformed_data_under_model = RefitModel(candidate_features_temp, solverFactory, records, out solver);
            double new_fitness_score = ScoreModelForBackwardElimination(criteria, solver, transformed_data_under_model, outcomes, candidate_features_temp.Count);

            if (fitness_score < new_fitness_score)
            {
                eliminatedFeatureIndex = i;
                fitness_score = new_fitness_score;
            }
        }

        if (eliminatedFeatureIndex == -1)
        {
            improved = false;
        }
        else
        {
            // The winner is a position, not a feature id, so it must be removed with RemoveAt.
            // List<int>.Remove(int) removes by value and would drop the wrong feature (or none at all).
            candidate_features.RemoveAt(eliminatedFeatureIndex);
        }
    }

    return candidate_features;
}

/// <summary>
/// Computes the "higher is better" score of a fitted model under the given selection criteria.
/// </summary>
private static double ScoreModelForBackwardElimination(ModelSelectionCriteria criteria, Glm solver, List<RDataRecord> data, double[] outcomes, int featureCount)
{
    int n = outcomes.Length;

    switch (criteria)
    {
        case ModelSelectionCriteria.AdjustedRSquare:
            return CalcAdjustedRSquare(solver.Statistics.Residuals, outcomes, featureCount, n);

        case ModelSelectionCriteria.AIC:
        {
            double L = GlmLikelihoodFunction.GetLikelihood(solver.DistributionFamily, data, solver.X);
            int k = solver.X.Length;
            return -CalcAIC(L, k, n); //negative sign as the lower the AIC, the better the fitted regression model
        }

        case ModelSelectionCriteria.BIC:
        {
            double L_hat = GlmLikelihoodFunction.GetLikelihood(solver.DistributionFamily, data, solver.X);
            int k = solver.X.Length;
            return -CalcBIC(L_hat, k, n); //negative sign as the lower the BIC, the better the fitted regression model
        }

        default:
            throw new NotImplementedException();
    }
}