/// <summary>
/// Builds the replacement column for <paramref name="variableName"/> and hands back a copy of
/// the current column through <paramref name="originalValues"/>.
/// Double variables are delegated to GetReplacementValuesForDouble, string variables to
/// GetReplacementValuesForString.
/// </summary>
/// <param name="originalValues">Receives a list copy of the variable's current values.</param>
/// <returns>The replacement column produced by the type-specific helper.</returns>
/// <exception cref="NotSupportedException">The variable is neither of type double nor string.</exception>
private static IList GetReplacementValues(ModifiableDataset modifiableDataset, string variableName, IRegressionModel model, IEnumerable<int> rows, IEnumerable<double> targetValues, out IList originalValues, ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle, FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best) {
  // numeric variable
  if (modifiableDataset.VariableHasType<double>(variableName)) {
    var numericColumn = modifiableDataset.GetReadOnlyDoubleValues(variableName).ToList();
    originalValues = numericColumn;
    return GetReplacementValuesForDouble(modifiableDataset, rows, numericColumn, replacementMethod);
  }

  // factor (string) variable
  if (modifiableDataset.VariableHasType<string>(variableName)) {
    var factorColumn = modifiableDataset.GetReadOnlyStringValues(variableName).ToList();
    originalValues = factorColumn;
    return GetReplacementValuesForString(model, modifiableDataset, variableName, rows, factorColumn, targetValues, factorReplacementMethod);
  }

  throw new NotSupportedException("Variable not supported");
}
/// <summary>
/// Cloning constructor. After the base constructor has run, every entry of variableValues is
/// replaced by a freshly created list holding the same values.
/// NOTE(review): presumably this gives the clone its own modifiable columns - confirm against the base class.
/// </summary>
/// <exception cref="ArgumentException">A variable has a type other than DateTime, double or string.</exception>
private ModifiableDataset(ModifiableDataset original, Cloner cloner)
  : base(original, cloner) {
  // iterate over a snapshot of the keys because the dictionary entries are reassigned in the loop
  foreach (var name in variableValues.Keys.ToList()) {
    var columnType = GetVariableType(name);

    if (columnType == typeof(DateTime)) {
      variableValues[name] = GetDateTimeValues(name).ToList();
      continue;
    }
    if (columnType == typeof(double)) {
      variableValues[name] = GetDoubleValues(name).ToList();
      continue;
    }
    if (columnType == typeof(string)) {
      variableValues[name] = GetStringValues(name).ToList();
      continue;
    }

    throw new ArgumentException("Unsupported type " + columnType + " for variable " + name);
  }
}
/// <summary>
/// Temporarily replaces the column <paramref name="variableName"/> with
/// <paramref name="replacementValues"/>, evaluates the classification model on
/// <paramref name="rows"/> and returns the quality computed by CalculateQuality against
/// <paramref name="targetValues"/>. The original column is put back before returning,
/// also when the evaluation throws.
/// </summary>
/// <remarks>
/// If the model is an IDiscriminantFunctionClassificationModel its parameters are recalculated
/// on the modified dataset before the class values are estimated.
/// </remarks>
private static double CalculateQualityForReplacement(IClassificationModel model, ModifiableDataset modifiableDataset, string variableName, IList originalValues, IEnumerable<int> rows, IList replacementValues, IEnumerable<double> targetValues) {
  modifiableDataset.ReplaceVariable(variableName, replacementValues);
  try {
    var discModel = model as IDiscriminantFunctionClassificationModel;
    if (discModel != null) {
      var problemData = new ClassificationProblemData(modifiableDataset, modifiableDataset.VariableNames, model.TargetVariable);
      discModel.RecalculateModelParameters(problemData, rows);
    }

    //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
    var estimates = model.GetEstimatedClassValues(modifiableDataset, rows).ToList();
    return CalculateQuality(targetValues, estimates);
  } finally {
    // always restore the original column so a failing model does not leave the dataset modified
    modifiableDataset.ReplaceVariable(variableName, originalValues);
  }
}
/// <summary>
/// Creates a complete replacement column (one entry per dataset row) for a string variable.
/// </summary>
/// <remarks>
/// Best: every distinct value occurring in <paramref name="rows"/> is tried as a constant column
/// and the one with the highest quality (CalculateQualityForReplacement) is kept.
/// Mode: constant column holding the most frequent value within <paramref name="rows"/>.
/// Shuffle: the values of the selected rows are permuted among those rows (fixed seed 31415);
/// all other rows are set to the empty string.
/// </remarks>
/// <returns>
/// The replacement column. For Best this is null when no candidate reaches a quality greater
/// than negative infinity (for example when <paramref name="rows"/> is empty or every quality is NaN).
/// </returns>
/// <exception cref="ArgumentException">The replacement method is not one of Best, Mode or Shuffle.</exception>
private static IList GetReplacementValuesForString(IRegressionModel model, ModifiableDataset modifiableDataset, string variableName, IEnumerable<int> rows, List<string> originalValues, IEnumerable<double> targetValues, FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Shuffle) {
  List<string> replacementValues = null;

  switch (factorReplacementMethod) {
    case FactorReplacementMethodEnum.Best:
      // try replacing with all possible values and find the best replacement value
      var bestQuality = double.NegativeInfinity;
      // materialize the candidates first: the loop body replaces this very column in the dataset,
      // so the values must not be read lazily while it is swapped in and out
      var candidates = modifiableDataset.GetStringValues(variableName, rows).Distinct().ToList();
      foreach (var repl in candidates) {
        List<string> curReplacementValues = Enumerable.Repeat(repl, modifiableDataset.Rows).ToList();
        //fholzing: this result could be used later on (theoretically), but is neglected for better readability/method consistency
        var curQuality = CalculateQualityForReplacement(model, modifiableDataset, variableName, originalValues, rows, curReplacementValues, targetValues);
        if (curQuality > bestQuality) {
          bestQuality = curQuality;
          replacementValues = curReplacementValues;
        }
      }
      break;

    case FactorReplacementMethodEnum.Mode:
      var mostCommonValue = rows.Select(r => originalValues[r])
        .GroupBy(v => v)
        .OrderByDescending(g => g.Count())
        .First().Key;
      replacementValues = Enumerable.Repeat(mostCommonValue, modifiableDataset.Rows).ToList();
      break;

    case FactorReplacementMethodEnum.Shuffle:
      // new var has same empirical distribution but the relation to y is broken
      IRandom random = new FastRandom(31415);
      // prepare a complete column for the dataset
      replacementValues = Enumerable.Repeat(string.Empty, modifiableDataset.Rows).ToList();
      // shuffle only the selected rows
      var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(random).ToList();
      int i = 0;
      // update column values
      foreach (var r in rows) {
        replacementValues[r] = shuffledValues[i++];
      }
      break;

    default:
      throw new ArgumentException(string.Format("FactorReplacementMethod {0} cannot be handled.", factorReplacementMethod));
  }

  return replacementValues;
}
/// <summary>
/// Creates a complete replacement column (one entry per dataset row) for a double variable.
/// </summary>
/// <remarks>
/// Median / Average: constant column holding the median / average of the selected rows.
/// Shuffle: the values of the selected rows are permuted among those rows; other rows are NaN.
/// Noise: selected rows receive normally distributed random values parameterized with the
/// average and standard deviation of the selected rows; other rows are NaN.
/// The random number generator is always seeded with 31415.
/// </remarks>
/// <exception cref="ArgumentException">The replacement method is not handled.</exception>
private static IList GetReplacementValuesForDouble(ModifiableDataset modifiableDataset, IEnumerable<int> rows, List<double> originalValues, ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle) {
  IRandom rng = new FastRandom(31415);

  if (replacementMethod == ReplacementMethodEnum.Median) {
    double median = rows.Select(r => originalValues[r]).Median();
    return Enumerable.Repeat(median, modifiableDataset.Rows).ToList();
  }

  if (replacementMethod == ReplacementMethodEnum.Average) {
    double mean = rows.Select(r => originalValues[r]).Average();
    return Enumerable.Repeat(mean, modifiableDataset.Rows).ToList();
  }

  if (replacementMethod == ReplacementMethodEnum.Shuffle) {
    // new var has same empirical distribution but the relation to y is broken
    // start from a full column of NaN and overwrite only the selected rows
    List<double> column = Enumerable.Repeat(double.NaN, modifiableDataset.Rows).ToList();
    var permuted = rows.Select(r => originalValues[r]).Shuffle(rng).ToList();
    int position = 0;
    foreach (var rowIndex in rows) {
      column[rowIndex] = permuted[position];
      position++;
    }
    return column;
  }

  if (replacementMethod == ReplacementMethodEnum.Noise) {
    var mu = rows.Select(r => originalValues[r]).Average();
    var sigma = rows.Select(r => originalValues[r]).StandardDeviation();
    // start from a full column of NaN and overwrite only the selected rows
    List<double> column = Enumerable.Repeat(double.NaN, modifiableDataset.Rows).ToList();
    foreach (var rowIndex in rows) {
      column[rowIndex] = NormalDistributedRandom.NextDouble(rng, mu, sigma);
    }
    return column;
  }

  throw new ArgumentException(string.Format("ReplacementMethod {0} cannot be handled.", replacementMethod));
}
/// <summary>
/// Evaluates <paramref name="model"/> on <paramref name="rows"/> while the string variable
/// <paramref name="variable"/> is temporarily replaced by <paramref name="replacementValues"/>.
/// The original column is restored afterwards, also when the model evaluation throws.
/// </summary>
/// <returns>The materialized estimated values for the selected rows.</returns>
private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable, ModifiableDataset dataset, IEnumerable<int> rows, IEnumerable<string> replacementValues) {
  var originalValues = dataset.GetReadOnlyStringValues(variable).ToList();
  dataset.ReplaceVariable(variable, replacementValues.ToList());
  try {
    //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
    return model.GetEstimatedValues(dataset, rows).ToList();
  } finally {
    // always restore the original column so a failing model does not leave the dataset modified
    dataset.ReplaceVariable(variable, originalValues);
  }
}
/// <summary>
/// Temporarily replaces the column <paramref name="variableName"/> with
/// <paramref name="replacementValues"/>, evaluates the regression model on
/// <paramref name="rows"/> and returns the quality computed by CalculateQuality against
/// <paramref name="targetValues"/>. The original column is put back before returning,
/// also when the evaluation throws.
/// </summary>
private static double CalculateQualityForReplacement(IRegressionModel model, ModifiableDataset modifiableDataset, string variableName, IList originalValues, IEnumerable<int> rows, IList replacementValues, IEnumerable<double> targetValues) {
  modifiableDataset.ReplaceVariable(variableName, replacementValues);
  try {
    //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
    var estimates = model.GetEstimatedValues(modifiableDataset, rows).ToList();
    return CalculateQuality(targetValues, estimates);
  } finally {
    // always restore the original column so a failing model does not leave the dataset modified
    modifiableDataset.ReplaceVariable(variableName, originalValues);
  }
}
/// <summary>
/// Builds a replacement column for the string variable <paramref name="variable"/> according to
/// <paramref name="replacement"/> and returns the model estimates obtained with that column
/// (delegates to the overload taking explicit replacement values).
/// </summary>
/// <remarks>
/// Mode: constant column holding the most frequent value within <paramref name="rows"/>.
/// Shuffle: values of the selected rows are permuted among those rows (seed 31415); all other
/// rows are set to the empty string.
/// </remarks>
/// <exception cref="ArgumentException">The replacement method is neither Mode nor Shuffle.</exception>
private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable, ModifiableDataset dataset, IEnumerable<int> rows, FactorReplacementMethodEnum replacement = FactorReplacementMethodEnum.Shuffle) {
  var currentColumn = dataset.GetReadOnlyStringValues(variable).ToList();
  List<string> column;

  switch (replacement) {
    case FactorReplacementMethodEnum.Mode: {
        var groupsByFrequency = rows.Select(r => currentColumn[r])
          .GroupBy(v => v)
          .OrderByDescending(g => g.Count());
        var mode = groupsByFrequency.First().Key;
        column = Enumerable.Repeat(mode, dataset.Rows).ToList();
        break;
      }
    case FactorReplacementMethodEnum.Shuffle: {
        // new var has same empirical distribution but the relation to y is broken
        IRandom rng = new FastRandom(31415);
        // full column of empty strings; only the selected rows are overwritten below
        column = Enumerable.Repeat(string.Empty, dataset.Rows).ToList();
        var permuted = rows.Select(r => currentColumn[r]).Shuffle(rng).ToList();
        int position = 0;
        foreach (var rowIndex in rows) {
          column[rowIndex] = permuted[position];
          position++;
        }
        break;
      }
    default:
      throw new ArgumentException(string.Format("FactorReplacementMethod {0} cannot be handled.", replacement));
  }

  return EvaluateModelWithReplacedVariable(model, variable, dataset, rows, column);
}
/// <summary>
/// Calculates the impact of a single variable as the difference between the model's quality on
/// the unmodified data and its quality after the variable's column has been replaced.
/// </summary>
/// <param name="variableName">Variable whose impact is calculated.</param>
/// <param name="model">Regression model to evaluate.</param>
/// <param name="problemData">Problem data; used to check that the variable exists and to read the target values when none are supplied.</param>
/// <param name="modifiableDataset">Dataset in which the variable is temporarily replaced.</param>
/// <param name="rows">Rows on which the model is evaluated.</param>
/// <param name="replacementMethod">Replacement strategy for double variables.</param>
/// <param name="factorReplacementMethod">Replacement strategy for string variables.</param>
/// <param name="targetValues">Target values for <paramref name="rows"/>; read from the problem data when null.</param>
/// <param name="quality">Quality of the model on the unmodified data; calculated here when NaN.</param>
/// <returns>
/// 0.0 if the model does not use the variable for prediction, otherwise
/// (quality on original data) - (quality with replaced variable).
/// </returns>
/// <exception cref="InvalidOperationException">The model uses the variable but the dataset of the problem data does not contain it.</exception>
public static double CalculateImpact(string variableName, IRegressionModel model, IRegressionProblemData problemData, ModifiableDataset modifiableDataset, IEnumerable<int> rows, ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle, FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best, IEnumerable<double> targetValues = null, double quality = double.NaN) {
  if (!model.VariablesUsedForPrediction.Contains(variableName)) {
    return 0.0;
  }
  if (!problemData.Dataset.VariableNames.Contains(variableName)) {
    throw new InvalidOperationException(string.Format("Can not calculate variable impact, because the model uses inputs missing in the dataset ({0})", variableName));
  }

  if (targetValues == null) {
    targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
  }

  // NaN never compares equal to anything (including itself), so IsNaN must be used to detect
  // the "not supplied" default value
  if (double.IsNaN(quality)) {
    // same argument order as in CalculateQualityForReplacement so that both terms of the
    // difference below are computed in the same way
    quality = CalculateQuality(targetValues, model.GetEstimatedValues(modifiableDataset, rows));
  }

  IList originalValues = null;
  IList replacementValues = GetReplacementValues(modifiableDataset, variableName, model, rows, targetValues, out originalValues, replacementMethod, factorReplacementMethod);

  double newValue = CalculateQualityForReplacement(model, modifiableDataset, variableName, originalValues, rows, replacementValues, targetValues);
  double impact = quality - newValue;

  return impact;
}
/// <summary>
/// Builds a replacement column for the double variable <paramref name="variable"/> according to
/// <paramref name="replacement"/> and returns the model estimates obtained with that column
/// (delegates to the overload taking explicit replacement values).
/// </summary>
/// <remarks>
/// Median / Average: constant column holding the median / average of the selected rows.
/// Shuffle: values of the selected rows are permuted among those rows; other rows are NaN.
/// Noise: selected rows receive normally distributed random values parameterized with the
/// average and standard deviation of the selected rows; other rows are NaN.
/// Shuffle and Noise use a random number generator seeded with 31415.
/// </remarks>
/// <exception cref="ArgumentException">The replacement method is not handled.</exception>
private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable, ModifiableDataset dataset, IEnumerable<int> rows, ReplacementMethodEnum replacement = ReplacementMethodEnum.Median) {
  var currentColumn = dataset.GetReadOnlyDoubleValues(variable).ToList();
  List<double> column;

  switch (replacement) {
    case ReplacementMethodEnum.Median: {
        double median = rows.Select(r => currentColumn[r]).Median();
        column = Enumerable.Repeat(median, dataset.Rows).ToList();
        break;
      }
    case ReplacementMethodEnum.Average: {
        double mean = rows.Select(r => currentColumn[r]).Average();
        column = Enumerable.Repeat(mean, dataset.Rows).ToList();
        break;
      }
    case ReplacementMethodEnum.Shuffle: {
        // new var has same empirical distribution but the relation to y is broken
        IRandom rng = new FastRandom(31415);
        // full column of NaN; only the selected rows are overwritten below
        column = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
        var permuted = rows.Select(r => currentColumn[r]).Shuffle(rng).ToList();
        int position = 0;
        foreach (var rowIndex in rows) {
          column[rowIndex] = permuted[position];
          position++;
        }
        break;
      }
    case ReplacementMethodEnum.Noise: {
        var mu = rows.Select(r => currentColumn[r]).Average();
        var sigma = rows.Select(r => currentColumn[r]).StandardDeviation();
        IRandom rng = new FastRandom(31415);
        // full column of NaN; only the selected rows are overwritten below
        column = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
        foreach (var rowIndex in rows) {
          column[rowIndex] = NormalDistributedRandom.NextDouble(rng, mu, sigma);
        }
        break;
      }
    default:
      throw new ArgumentException(string.Format("ReplacementMethod {0} cannot be handled.", replacement));
  }

  return EvaluateModelWithReplacedVariable(model, variable, dataset, rows, column);
}
/// <summary>
/// Cloning constructor. After the base constructor has run, the variable names of
/// <paramref name="original"/> are copied into a new list and its values are duplicated
/// through CloneValues.
/// </summary>
private ModifiableDataset(ModifiableDataset original, Cloner cloner)
  : base(original, cloner) {
  // new list instance holding the same names as the original
  var copiedNames = new List<string>(original.variableNames);
  variableNames = copiedNames;

  var copiedValues = CloneValues(original.variableValues);
  variableValues = copiedValues;
}
/// <summary>
/// Evaluates <paramref name="model"/> on <paramref name="rows"/> while the double variable
/// <paramref name="variable"/> is temporarily replaced according to <paramref name="replacement"/>.
/// The original column is restored afterwards, also when the model evaluation throws.
/// </summary>
/// <remarks>
/// Every replacement method produces a complete column with one entry per dataset row, so that
/// the entry at index r belongs to row r.
/// Median / Average: constant column holding the median / average of the selected rows.
/// Shuffle: values of the selected rows are permuted among those rows; other rows are NaN.
/// Noise: selected rows receive normally distributed random values parameterized with the
/// average and standard deviation of the selected rows; other rows are NaN.
/// Shuffle and Noise use a random number generator seeded with 31415.
/// </remarks>
/// <returns>The materialized estimated values for the selected rows.</returns>
/// <exception cref="ArgumentException">The replacement method is not handled.</exception>
private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable, ModifiableDataset dataset, IEnumerable<int> rows, ReplacementMethodEnum replacement = ReplacementMethodEnum.Median) {
  var originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList();
  double replacementValue;
  List<double> replacementValues;
  IRandom rand;

  switch (replacement) {
    case ReplacementMethodEnum.Median:
      replacementValue = rows.Select(r => originalValues[r]).Median();
      replacementValues = Enumerable.Repeat(replacementValue, dataset.Rows).ToList();
      break;
    case ReplacementMethodEnum.Average:
      replacementValue = rows.Select(r => originalValues[r]).Average();
      replacementValues = Enumerable.Repeat(replacementValue, dataset.Rows).ToList();
      break;
    case ReplacementMethodEnum.Shuffle:
      // new var has same empirical distribution but the relation to y is broken
      rand = new FastRandom(31415);
      // prepare a complete column for the dataset; a list containing only the selected rows
      // would not be aligned with the row indices used by the model
      replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
      // shuffle only the selected rows
      var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(rand).ToList();
      int i = 0;
      // update column values
      foreach (var r in rows) {
        replacementValues[r] = shuffledValues[i++];
      }
      break;
    case ReplacementMethodEnum.Noise:
      var avg = rows.Select(r => originalValues[r]).Average();
      var stdDev = rows.Select(r => originalValues[r]).StandardDeviation();
      rand = new FastRandom(31415);
      // prepare a complete column for the dataset
      replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
      // update column values
      foreach (var r in rows) {
        replacementValues[r] = NormalDistributedRandom.NextDouble(rand, avg, stdDev);
      }
      break;

    default:
      throw new ArgumentException(string.Format("ReplacementMethod {0} cannot be handled.", replacement));
  }

  dataset.ReplaceVariable(variable, replacementValues);
  try {
    //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
    return model.GetEstimatedValues(dataset, rows).ToList();
  } finally {
    // always restore the original column so a failing model does not leave the dataset modified
    dataset.ReplaceVariable(variable, originalValues);
  }
}