Пример #1
0
        /// <summary>
        /// Backward elimination driven by p-values. Within each elimination step the variable with the largest
        /// p-value is dropped, the model is refitted on the remaining variables, and the inclusion of all
        /// remaining variables is reassessed. Elimination stops as soon as the largest p-value no longer
        /// exceeds <paramref name="significance_level"/>, or when no candidate variable is left.
        /// </summary>
        /// <param name="solverFactory">Factory used to create the solver for the full model and for every refit.</param>
        /// <param name="records">Data records to fit; the number of features is read from the first record.</param>
        /// <param name="significance_level">A variable is dropped while the largest p-value is greater than this threshold.</param>
        /// <param name="one_sided">Forwarded unchanged to <c>CalcPValues</c>.</param>
        /// <returns>The index list of the predictor variables to be included in the regression model (ids start at 1); may be empty.</returns>
        public static List <int> EliminateByPValue(GlmSolverFactory solverFactory, List <RDataRecord> records, double significance_level = 0.05, bool one_sided = false)
        {
            int full_model_feature_count = records[0].FeatureCount;

            // Feature ids are 1-based: 1 .. full_model_feature_count.
            List <int> candidate_features = new List <int>();

            for (int i = 1; i <= full_model_feature_count; ++i)
            {
                candidate_features.Add(i);
            }

            Glm solver = FitModel(solverFactory, records);

            // Stop once nothing is left to eliminate; indexing an empty candidate list would throw.
            while (candidate_features.Count > 0)
            {
                double[] pValues = CalcPValues(solver.X, solver.Statistics.StandardErrors, records.Count, one_sided);

                double maxPValue;
                // NOTE(review): assumes the returned index is a 0-based position aligned with candidate_features - confirm against SelectFeatureIndexWithMaxPValue.
                int featureIndexWithMaxPValue = SelectFeatureIndexWithMaxPValue(pValues, out maxPValue);

                // Written as a negated '>' so that a NaN p-value ends the elimination, exactly like a failed loop condition would.
                if (!(maxPValue > significance_level))
                {
                    break;
                }

                // Ids in the list are unique, so removing by position drops exactly the selected feature.
                candidate_features.RemoveAt(featureIndexWithMaxPValue);

                if (candidate_features.Count == 0)
                {
                    // Every variable was eliminated: there is no reduced model left to refit.
                    break;
                }

                RefitModel(candidate_features, solverFactory, records, out solver);
            }

            return(candidate_features);
        }
Пример #2
0
        /// <summary>
        /// Runs feature scaling over <paramref name="records"/>, creates a solver for them through
        /// <paramref name="solverFactory"/>, solves it and returns the solved solver.
        /// </summary>
        /// <param name="solverFactory">Factory whose <c>CreateSolver</c> builds the solver for the given records.</param>
        /// <param name="records">Records handed first to <c>DataTransformer.DoFeaturesScaling</c> and then to the solver.</param>
        /// <returns>The solver after <c>Solve()</c> has been called on it.</returns>
        public static Glm FitModel(GlmSolverFactory solverFactory, List <RDataRecord> records)
        {
            // NOTE(review): the caller's list is passed straight to the scaler; presumably it is scaled in place - confirm.
            new DataTransformer().DoFeaturesScaling(records);

            Glm fittedSolver = solverFactory.CreateSolver(records);
            fittedSolver.Solve();

            return fittedSolver;
        }
Пример #3
0
        /// <summary>
        /// This is an alternative to using p-values in the backward elimination: models are compared by a
        /// selection score (adjusted R^2 by default, or AIC / BIC).
        /// At each elimination step, the model is refitted without each of the variables up for potential elimination.
        /// If one of these smaller models has a better score than the current model, the smaller model with the
        /// best score is picked. This continues until removing a variable no longer improves the score.
        /// </summary>
        /// <param name="solverFactory">Factory used to create the solver for the full model and for every refit.</param>
        /// <param name="records">Data records to fit; the number of features is read from the first record.</param>
        /// <param name="criteria">Score used to compare models. AIC and BIC are negated internally so that a larger score is always the better model.</param>
        /// <returns>The index list of the predictor variables to be included in the regression model (ids start at 1)</returns>
        /// <exception cref="NotImplementedException">Thrown when <paramref name="criteria"/> is not AdjustedRSquare, AIC or BIC.</exception>
        public static List <int> BackwardEliminate(GlmSolverFactory solverFactory, List <RDataRecord> records, ModelSelectionCriteria criteria = ModelSelectionCriteria.AdjustedRSquare)
        {
            int full_model_feature_count = records[0].FeatureCount;

            // Feature ids are 1-based: 1 .. full_model_feature_count.
            List <int> candidate_features = new List <int>();

            for (int i = 1; i <= full_model_feature_count; ++i)
            {
                candidate_features.Add(i);
            }

            int n = records.Count;

            // Outcomes are captured before the first fit and reused for every adjusted R^2 computation.
            double[] outcomes = new double[n];
            for (int i = 0; i < n; ++i)
            {
                outcomes[i] = records[i].YValue;
            }

            Glm solver = FitModel(solverFactory, records);

            double fitness_score = ScoreFittedModel(criteria, solver, records, outcomes, candidate_features.Count);

            bool improved = true;

            while (improved)
            {
                // Position (not id) inside candidate_features of the feature whose removal scored best so far.
                int eliminatedFeaturePosition = -1;

                for (int i = 0; i < candidate_features.Count; ++i)
                {
                    // Current candidates minus the one at position i.
                    List <int> candidate_features_temp = new List <int>(candidate_features);
                    candidate_features_temp.RemoveAt(i);

                    List <RDataRecord> transformed_data_under_model = RefitModel(candidate_features_temp, solverFactory, records, out solver);

                    double new_fitness_score = ScoreFittedModel(criteria, solver, transformed_data_under_model, outcomes, candidate_features_temp.Count);

                    if (fitness_score < new_fitness_score)
                    {
                        eliminatedFeaturePosition = i;
                        fitness_score             = new_fitness_score;
                    }
                }

                if (eliminatedFeaturePosition == -1)
                {
                    improved = false;
                }
                else
                {
                    // The recorded value is a list position, so remove by position. Removing by value would
                    // drop the feature whose id happens to equal the position (or nothing for position 0).
                    candidate_features.RemoveAt(eliminatedFeaturePosition);
                }
            }

            return(candidate_features);
        }

        /// <summary>
        /// Scores a fitted model under the given criterion; a larger score always means a better model.
        /// </summary>
        /// <param name="criteria">Criterion to evaluate.</param>
        /// <param name="solver">Solver that has already been fitted on <paramref name="data"/>.</param>
        /// <param name="data">Records the solver was fitted on; used for the likelihood under AIC / BIC.</param>
        /// <param name="outcomes">Outcome values passed to the adjusted R^2 computation; its length is used as the sample size.</param>
        /// <param name="featureCount">Number of predictor variables in the fitted model, passed to the adjusted R^2 computation.</param>
        /// <returns>Adjusted R^2, or the negated AIC / BIC.</returns>
        /// <exception cref="NotImplementedException">Thrown when <paramref name="criteria"/> is not handled.</exception>
        private static double ScoreFittedModel(ModelSelectionCriteria criteria, Glm solver, List <RDataRecord> data, double[] outcomes, int featureCount)
        {
            int n = outcomes.Length;

            switch (criteria)
            {
                case ModelSelectionCriteria.AdjustedRSquare:
                {
                    return CalcAdjustedRSquare(solver.Statistics.Residuals, outcomes, featureCount, n);
                }

                case ModelSelectionCriteria.AIC:
                {
                    double L = GlmLikelihoodFunction.GetLikelihood(solver.DistributionFamily, data, solver.X);
                    int    k = solver.X.Length;
                    return -CalcAIC(L, k, n); //negative sign as the lower the AIC, the better the fitted regression model
                }

                case ModelSelectionCriteria.BIC:
                {
                    double L_hat = GlmLikelihoodFunction.GetLikelihood(solver.DistributionFamily, data, solver.X);
                    int    k     = solver.X.Length;
                    return -CalcBIC(L_hat, k, n); //negative sign as the lower the BIC, the better the fitted regression model
                }

                default:
                {
                    throw new NotImplementedException();
                }
            }
        }
Пример #4
0
        /// <summary>
        /// Builds a reduced copy of <paramref name="records"/> that keeps only the features listed in
        /// <paramref name="candidate_features"/>, fits a model on that copy via <c>FitModel</c>, and returns the copy.
        /// </summary>
        /// <param name="candidate_features">Feature ids to keep; each id is used as an index into <c>RDataRecord.data</c> of the source record.</param>
        /// <param name="solverFactory">Factory forwarded to <c>FitModel</c>.</param>
        /// <param name="records">Source records; they are read here but not written to.</param>
        /// <param name="solver">Receives the solver fitted on the reduced records.</param>
        /// <returns>The reduced records the model was fitted on.</returns>
        public static List <RDataRecord> RefitModel(List <int> candidate_features, GlmSolverFactory solverFactory, List <RDataRecord> records, out Glm solver)
        {
            List <RDataRecord> reducedRecords = new List <RDataRecord>();

            for (int r = 0; r < records.Count; ++r)
            {
                RDataRecord source  = records[r];
                RDataRecord reduced = new RDataRecord(candidate_features.Count);

                // Kept features are packed into slots 1..Count of the new record; slot 0 is not written here.
                // NOTE(review): only data[] entries are copied - YValue is never assigned on the reduced record
                // in this method; confirm RDataRecord carries the outcome some other way.
                int slot = 1;
                foreach (int featureId in candidate_features)
                {
                    reduced.data[slot] = source.data[featureId];
                    ++slot;
                }

                reducedRecords.Add(reduced);
            }

            solver = FitModel(solverFactory, reducedRecords);

            return reducedRecords;
        }