// Produces the replacement column for variableName by dispatching on the variable's type
// as reported by the dataset (double or string).
//   originalValues: receives a list copy of the variable's current values.
//   returns:        whatever the type-specific helper (GetReplacementValuesForDouble /
//                   GetReplacementValuesForString) produced.
//   throws:         NotSupportedException when the variable is neither double nor string.
private static IList GetReplacementValues(ModifiableDataset modifiableDataset,
  string variableName,
  IRegressionModel model,
  IEnumerable<int> rows,
  IEnumerable<double> targetValues,
  out IList originalValues,
  ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle,
  FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best) {
  // numeric variable: replacement strategy is selected by replacementMethod
  if (modifiableDataset.VariableHasType<double>(variableName)) {
    List<double> numericColumn = modifiableDataset.GetReadOnlyDoubleValues(variableName).ToList();
    originalValues = numericColumn;
    return GetReplacementValuesForDouble(modifiableDataset, rows, numericColumn, replacementMethod);
  }

  // factor (string) variable: replacement strategy is selected by factorReplacementMethod
  if (modifiableDataset.VariableHasType<string>(variableName)) {
    List<string> factorColumn = modifiableDataset.GetReadOnlyStringValues(variableName).ToList();
    originalValues = factorColumn;
    return GetReplacementValuesForString(model, modifiableDataset, variableName, rows, factorColumn, targetValues, factorReplacementMethod);
  }

  throw new NotSupportedException("Variable not supported");
}
// Calculates one impact value per input variable for the given model on the given rows.
//   estimatedValues: the model's estimates for rows; used together with the target values
//                    to obtain the baseline quality via CalculateQuality.
//   returns:         (variable name, impact) pairs for the union of the allowed input
//                    variables and the variables the model uses for prediction.
//   throws:          InvalidOperationException when the model uses a variable that the
//                    dataset does not contain.
public static IEnumerable<Tuple<string, double>> CalculateImpacts(
  IRegressionModel model,
  IRegressionProblemData problemData,
  IEnumerable<double> estimatedValues,
  IEnumerable<int> rows,
  ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle,
  FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best) {
  // guard for the case that a different dataset was loaded and model inputs are missing
  var unavailableInputs = model.VariablesUsedForPrediction.Except(problemData.Dataset.VariableNames);
  if (unavailableInputs.Any()) {
    string message = string.Format("Can not calculate variable impacts, because the model uses inputs missing in the dataset ({0})", string.Join(", ", unavailableInputs));
    throw new InvalidOperationException(message);
  }

  IEnumerable<double> targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
  var baselineQuality = CalculateQuality(targetValues, estimatedValues);

  var impactByVariable = new Dictionary<string, double>();
  var candidateVariables = new HashSet<string>(problemData.AllowedInputVariables.Union(model.VariablesUsedForPrediction));
  // the replacements are applied to a modifiable clone, not to problemData.Dataset itself
  var workingDataset = ((Dataset)problemData.Dataset.Clone()).ToModifiable();

  foreach (var candidate in candidateVariables) {
    impactByVariable[candidate] = CalculateImpact(candidate, model, problemData, workingDataset, rows, replacementMethod, factorReplacementMethod, targetValues, baselineQuality);
  }

  return from entry in impactByVariable
         select Tuple.Create(entry.Key, entry.Value);
}
// Builds a full-length replacement column (modifiableDataset.Rows entries) for a string variable.
//   Best:    tries every distinct value occurring in rows as a constant column and keeps the
//            one with the highest quality according to CalculateQualityForReplacement.
//   Mode:    constant column holding the most frequent value among rows.
//   Shuffle: the values of rows are shuffled among those rows; all other rows hold string.Empty.
//   throws:  ArgumentException for any other FactorReplacementMethodEnum value.
// Note: for Best the result stays null when no candidate's quality compares greater than
// negative infinity (e.g. rows is empty).
private static IList GetReplacementValuesForString(IRegressionModel model,
  ModifiableDataset modifiableDataset,
  string variableName,
  IEnumerable<int> rows,
  List<string> originalValues,
  IEnumerable<double> targetValues,
  FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Shuffle) {
  // created with the same constant seed (31415) on every call
  IRandom random = new FastRandom(31415);

  if (factorReplacementMethod == FactorReplacementMethodEnum.Best) {
    List<string> bestColumn = null;
    var bestQuality = double.NegativeInfinity;
    foreach (var candidate in modifiableDataset.GetStringValues(variableName, rows).Distinct()) {
      List<string> constantColumn = Enumerable.Repeat(candidate, modifiableDataset.Rows).ToList();
      // the quality could be handed back to the caller, but is recomputed there to keep the helpers uniform
      var candidateQuality = CalculateQualityForReplacement(model, modifiableDataset, variableName, originalValues, rows, constantColumn, targetValues);
      if (candidateQuality > bestQuality) {
        bestQuality = candidateQuality;
        bestColumn = constantColumn;
      }
    }
    return bestColumn;
  }

  if (factorReplacementMethod == FactorReplacementMethodEnum.Mode) {
    var selectedValues = rows.Select(r => originalValues[r]);
    var groupsByFrequency = selectedValues.GroupBy(v => v).OrderByDescending(g => g.Count());
    var mostFrequent = groupsByFrequency.First().Key;
    return Enumerable.Repeat(mostFrequent, modifiableDataset.Rows).ToList();
  }

  if (factorReplacementMethod == FactorReplacementMethodEnum.Shuffle) {
    // same empirical distribution as before, but the relation to the target is broken
    List<string> column = Enumerable.Repeat(string.Empty, modifiableDataset.Rows).ToList();
    // only the selected rows take part in the shuffle
    var permuted = rows.Select(r => originalValues[r]).Shuffle(random).ToList();
    int next = 0;
    foreach (var row in rows) {
      column[row] = permuted[next];
      next++;
    }
    return column;
  }

  throw new ArgumentException(string.Format("FactorReplacementMethod {0} cannot be handled.", factorReplacementMethod));
}
// Convenience overload for regression solutions: obtains the rows of the requested partition
// via GetPartitionRows, asks the solution for its estimates on those rows and forwards
// everything to the model-based CalculateImpacts overload.
public static IEnumerable<Tuple<string, double>> CalculateImpacts(
  IRegressionSolution solution,
  ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle,
  FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best,
  DataPartitionEnum dataPartition = DataPartitionEnum.Training) {
  IEnumerable<int> partitionRows = GetPartitionRows(dataPartition, solution.ProblemData);
  IEnumerable<double> estimates = solution.GetEstimatedValues(partitionRows);

  return CalculateImpacts(
    solution.Model,
    solution.ProblemData,
    estimates,
    partitionRows,
    replacementMethod,
    factorReplacementMethod);
}
// Convenience overload for classification solutions: obtains the rows of the requested
// partition via GetPartitionRows, asks the solution for its estimated class values on those
// rows and forwards everything, together with a clone of the model, to CalculateImpacts.
public static IEnumerable<Tuple<string, double>> CalculateImpacts(
  IClassificationSolution solution,
  ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle,
  FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best,
  DataPartitionEnum dataPartition = DataPartitionEnum.Training) {
  IEnumerable<int> partitionRows = GetPartitionRows(dataPartition, solution.ProblemData);
  IEnumerable<double> estimatedClasses = solution.GetEstimatedClassValues(partitionRows);

  // work on a clone of the model, because the thresholds of
  // IDiscriminantClassificationModels are updated during the calculation
  var clonedModel = (IClassificationModel)solution.Model.Clone();

  return CalculateImpacts(
    clonedModel,
    solution.ProblemData,
    estimatedClasses,
    partitionRows,
    replacementMethod,
    factorReplacementMethod);
}
// Evaluates the model after replacing the string variable with a column derived from its
// current values, then delegates to the overload that takes the replacement values directly.
//   Mode:    constant column holding the most frequent value among rows.
//   Shuffle: the values of rows are shuffled among those rows; all other rows hold string.Empty.
//   throws:  ArgumentException for any other FactorReplacementMethodEnum value.
private static IEnumerable<double> EvaluateModelWithReplacedVariable(
  IRegressionModel model,
  string variable,
  ModifiableDataset dataset,
  IEnumerable<int> rows,
  FactorReplacementMethodEnum replacement = FactorReplacementMethodEnum.Shuffle) {
  var currentValues = dataset.GetReadOnlyStringValues(variable).ToList();
  List<string> column;

  if (replacement == FactorReplacementMethodEnum.Mode) {
    var groupsByFrequency = rows.Select(r => currentValues[r])
                                .GroupBy(v => v)
                                .OrderByDescending(g => g.Count());
    var mostFrequent = groupsByFrequency.First().Key;
    column = Enumerable.Repeat(mostFrequent, dataset.Rows).ToList();
  } else if (replacement == FactorReplacementMethodEnum.Shuffle) {
    // same empirical distribution as before, but the relation to the target is broken
    IRandom generator = new FastRandom(31415);
    // start from a complete column for the dataset ...
    column = Enumerable.Repeat(string.Empty, dataset.Rows).ToList();
    // ... and fill only the selected rows with their shuffled values
    var permuted = rows.Select(r => currentValues[r]).Shuffle(generator).ToList();
    int next = 0;
    foreach (var row in rows) {
      column[row] = permuted[next];
      next++;
    }
  } else {
    throw new ArgumentException(string.Format("FactorReplacementMethod {0} cannot be handled.", replacement));
  }

  return EvaluateModelWithReplacedVariable(model, variable, dataset, rows, column);
}
// Calculates the impact of a single variable as (baseline quality - quality after replacing
// the variable's values).
//   variableName:  variable to assess; if the model does not use it the impact is 0.0.
//   targetValues:  optional; read from problemData.Dataset for rows when null.
//   quality:       optional baseline quality; when NaN (the default) it is computed from the
//                  model's estimates on modifiableDataset for rows.
//   returns:       quality - quality obtained with the replaced variable.
//   throws:        InvalidOperationException when the model uses the variable but the dataset
//                  does not contain it.
public static double CalculateImpact(string variableName,
  IRegressionModel model,
  IRegressionProblemData problemData,
  ModifiableDataset modifiableDataset,
  IEnumerable<int> rows,
  ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle,
  FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best,
  IEnumerable<double> targetValues = null,
  double quality = double.NaN) {
  if (!model.VariablesUsedForPrediction.Contains(variableName)) {
    return 0.0;
  }
  if (!problemData.Dataset.VariableNames.Contains(variableName)) {
    throw new InvalidOperationException(string.Format("Can not calculate variable impact, because the model uses inputs missing in the dataset ({0})", variableName));
  }

  if (targetValues == null) {
    targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
  }
  // NaN never compares equal to anything (including itself), so "quality == double.NaN" was
  // always false and the baseline was never computed; double.IsNaN is the correct test.
  if (double.IsNaN(quality)) {
    // argument order (target values first, estimates second) follows the CalculateQuality
    // call in CalculateImpacts
    quality = CalculateQuality(targetValues, model.GetEstimatedValues(modifiableDataset, rows));
  }

  IList originalValues;
  IList replacementValues = GetReplacementValues(modifiableDataset, variableName, model, rows, targetValues, out originalValues, replacementMethod, factorReplacementMethod);

  double newValue = CalculateQualityForReplacement(model, modifiableDataset, variableName, originalValues, rows, replacementValues, targetValues);
  return quality - newValue;
}
// Calculates variable impacts for a regression solution as the loss in R² (squared Pearson's R)
// caused by replacing each input variable.
//   data:    partition to evaluate. Training and Test take the baseline R² from the solution
//            (TrainingRSquared / TestRSquared); All computes it from all target and estimated values.
//   returns: (variable name, impact) pairs ordered by descending impact, where
//            impact = baseline R² - R² with the variable replaced.
//   throws:  ArgumentException for an unknown partition; InvalidOperationException when an
//            R² calculation reports an error.
public static IEnumerable<Tuple<string, double>> CalculateImpacts(
  IRegressionSolution solution,
  DataPartitionEnum data = DataPartitionEnum.Training,
  ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Median,
  FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best) {
  var problemData = solution.ProblemData;
  var dataset = problemData.Dataset;

  IEnumerable<int> rows;
  IEnumerable<double> targetValues;
  double baselineR2;
  OnlineCalculatorError error;

  if (data == DataPartitionEnum.All) {
    rows = solution.ProblemData.AllIndices;
    targetValues = problemData.TargetVariableValues.ToList();
    double baselineR = OnlinePearsonsRCalculator.Calculate(problemData.TargetVariableValues, solution.EstimatedValues, out error);
    if (error != OnlineCalculatorError.None) {
      throw new InvalidOperationException("Error during R² calculation.");
    }
    baselineR2 = baselineR * baselineR;
  } else if (data == DataPartitionEnum.Training) {
    rows = problemData.TrainingIndices;
    targetValues = problemData.TargetVariableTrainingValues.ToList();
    baselineR2 = solution.TrainingRSquared;
  } else if (data == DataPartitionEnum.Test) {
    rows = problemData.TestIndices;
    targetValues = problemData.TargetVariableTestValues.ToList();
    baselineR2 = solution.TestRSquared;
  } else {
    throw new ArgumentException(string.Format("DataPartition {0} cannot be handled.", data));
  }

  var impactByVariable = new Dictionary<string, double>();
  var modifiableDataset = ((Dataset)dataset).ToModifiable();
  var candidateNames = new HashSet<string>(problemData.AllowedInputVariables.Union(solution.Model.VariablesUsedForPrediction));
  // keep the dataset's variable order, restricted to the candidates
  var variablesToAssess = dataset.VariableNames.Where(name => candidateNames.Contains(name)).ToList();

  // numeric (double) variables
  foreach (var variable in variablesToAssess.Where(problemData.Dataset.VariableHasType<double>)) {
    var estimates = EvaluateModelWithReplacedVariable(solution.Model, variable, modifiableDataset, rows, replacementMethod);
    var pearsonR = OnlinePearsonsRCalculator.Calculate(targetValues, estimates, out error);
    if (error != OnlineCalculatorError.None) {
      throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
    }
    impactByVariable[variable] = baselineR2 - pearsonR * pearsonR;
  }

  // factor (string) variables
  foreach (var variable in variablesToAssess.Where(problemData.Dataset.VariableHasType<string>)) {
    if (factorReplacementMethod != FactorReplacementMethodEnum.Best) {
      // shuffle and mode are handled by the factor overload of EvaluateModelWithReplacedVariable
      var estimates = EvaluateModelWithReplacedVariable(solution.Model, variable, modifiableDataset, rows, factorReplacementMethod);
      var pearsonR = OnlinePearsonsRCalculator.Calculate(targetValues, estimates, out error);
      if (error != OnlineCalculatorError.None) {
        throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
      }
      impactByVariable[variable] = baselineR2 - pearsonR * pearsonR;
      continue;
    }

    // Best: use every distinct value found in rows as a constant column and keep the smallest impact
    var smallestImpact = double.PositiveInfinity;
    foreach (var candidate in problemData.Dataset.GetStringValues(variable, rows).Distinct()) {
      var estimates = EvaluateModelWithReplacedVariable(solution.Model, variable, modifiableDataset, rows, Enumerable.Repeat(candidate, dataset.Rows));
      var pearsonR = OnlinePearsonsRCalculator.Calculate(targetValues, estimates, out error);
      if (error != OnlineCalculatorError.None) {
        throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
      }
      var candidateImpact = baselineR2 - pearsonR * pearsonR;
      if (candidateImpact < smallestImpact) {
        smallestImpact = candidateImpact;
      }
    }
    impactByVariable[variable] = smallestImpact;
  }

  return from entry in impactByVariable
         orderby entry.Value descending
         select Tuple.Create(entry.Key, entry.Value);
}