/// <summary>
/// Cloning constructor: initializes this instance from <paramref name="original"/>.
/// </summary>
/// <param name="original">The instance whose state is copied.</param>
/// <param name="cloner">Cloner handed to the base constructor and used to clone the wrapped model.</param>
protected DiscriminantFunctionClassificationModel(DiscriminantFunctionClassificationModel original, Cloner cloner)
  : base(original, cloner) {
  // the wrapped model is cloned through the cloner
  this.model = cloner.Clone(original.model);

  // the arrays are duplicated so the clone does not share them with the original
  this.classValues = (double[])original.classValues.Clone();
  this.thresholds = (double[])original.thresholds.Clone();
}
/// <summary>
/// Cloning constructor: initializes this instance from <paramref name="original"/>.
/// </summary>
/// <param name="original">The instance whose state is copied.</param>
/// <param name="cloner">Cloner handed to the base constructor and used to clone the wrapped model.</param>
protected DampenedModel(DampenedModel original, Cloner cloner)
  : base(original, cloner) {
  // the wrapped model is cloned, the remaining values are copied as-is
  this.Model = cloner.Clone(original.Model);
  this.Min = original.Min;
  this.Max = original.Max;
  this.Dampening = original.Dampening;
}
/// <summary>
/// Calculates the impact of every input variable (the allowed input variables of
/// <paramref name="problemData"/> united with the variables used by <paramref name="model"/>).
/// </summary>
/// <param name="model">The regression model to analyze.</param>
/// <param name="problemData">Problem data providing the dataset, the target variable and the allowed inputs.</param>
/// <param name="estimatedValues">Estimates of the unmodified model, used to calculate the reference quality.</param>
/// <param name="rows">Row indices the impacts are calculated on.</param>
/// <param name="replacementMethod">Replacement method forwarded to <c>CalculateImpact</c>.</param>
/// <param name="factorReplacementMethod">Factor replacement method forwarded to <c>CalculateImpact</c>.</param>
/// <returns>One (variable name, impact) tuple per input variable.</returns>
/// <exception cref="InvalidOperationException">The model uses variables that are missing in the dataset.</exception>
public static IEnumerable<Tuple<string, double>> CalculateImpacts(
  IRegressionModel model,
  IRegressionProblemData problemData,
  IEnumerable<double> estimatedValues,
  IEnumerable<int> rows,
  ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle,
  FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best) {
  //fholzing: guard in case a different dataset is loaded, otherwise this check is negligible
  // materialized once: the list is inspected and then formatted into the message
  var missingVariables = model.VariablesUsedForPrediction.Except(problemData.Dataset.VariableNames).ToList();
  if (missingVariables.Count > 0) {
    throw new InvalidOperationException(string.Format("Can not calculate variable impacts, because the model uses inputs missing in the dataset ({0})", string.Join(", ", missingVariables)));
  }

  // rows and target values are reused for every input variable below;
  // materializing them avoids re-running the deferred sequences once per variable
  var selectedRows = rows.ToList();
  var targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, selectedRows).ToList();
  var originalQuality = CalculateQuality(targetValues, estimatedValues);

  var impacts = new Dictionary<string, double>();
  var inputvariables = new HashSet<string>(problemData.AllowedInputVariables.Union(model.VariablesUsedForPrediction));
  // a modifiable copy is used so that the dataset of the problem data is never altered
  var modifiableDataset = ((Dataset)(problemData.Dataset).Clone()).ToModifiable();

  foreach (var inputVariable in inputvariables) {
    impacts[inputVariable] = CalculateImpact(inputVariable, model, problemData, modifiableDataset, selectedRows, replacementMethod, factorReplacementMethod, targetValues, originalQuality);
  }

  return impacts.Select(i => Tuple.Create(i.Key, i.Value));
}
/// <summary>
/// Checks whether <paramref name="model"/> can be used with <paramref name="problemData"/>:
/// the target variables must match and the model must report the dataset as compatible.
/// </summary>
/// <param name="model">The model to check.</param>
/// <param name="problemData">The problem data to check against.</param>
/// <param name="errorMessage">Empty when compatible; otherwise the collected reasons, one per line.</param>
/// <returns><c>true</c> when both checks pass; otherwise <c>false</c>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="model"/> or <paramref name="problemData"/> is null.</exception>
public static bool IsProblemDataCompatible(IRegressionModel model, IRegressionProblemData problemData, out string errorMessage) {
  if (model == null) {
    throw new ArgumentNullException("model", "The provided model is null.");
  }
  if (problemData == null) {
    throw new ArgumentNullException("problemData", "The provided problemData is null.");
  }

  errorMessage = string.Empty;

  bool targetMatches = model.TargetVariable == problemData.TargetVariable;
  if (!targetMatches) {
    errorMessage = string.Format("The target variable of the model {0} does not match the target variable of the problemData {1}.", model.TargetVariable, problemData.TargetVariable);
  }

  string evaluationErrorMessage;
  bool datasetCompatible = model.IsDatasetCompatible(problemData.Dataset, out evaluationErrorMessage);
  if (!datasetCompatible && !string.IsNullOrEmpty(evaluationErrorMessage)) {
    // keep the two reasons apart instead of gluing them into one sentence
    if (errorMessage.Length > 0) {
      errorMessage += Environment.NewLine;
    }
    errorMessage += evaluationErrorMessage;
  }

  // the result follows the checks themselves, so an incompatible dataset
  // reported without a message text is still treated as incompatible
  return targetMatches && datasetCompatible;
}
/// <summary>
/// Reads the current column of <paramref name="variableName"/> and produces the replacement column for it,
/// dispatching on the variable type (double or string).
/// </summary>
/// <param name="modifiableDataset">Dataset the column is read from.</param>
/// <param name="variableName">Name of the variable to replace.</param>
/// <param name="model">Model forwarded to the string replacement.</param>
/// <param name="rows">Row indices forwarded to the replacement helpers.</param>
/// <param name="targetValues">Target values forwarded to the string replacement.</param>
/// <param name="originalValues">Receives a copy of the current column values.</param>
/// <param name="replacementMethod">Replacement method used for double variables.</param>
/// <param name="factorReplacementMethod">Replacement method used for string variables.</param>
/// <returns>The replacement column as returned by the type specific helper.</returns>
/// <exception cref="NotSupportedException">The variable is neither of type double nor string.</exception>
private static IList GetReplacementValues(ModifiableDataset modifiableDataset, string variableName, IRegressionModel model, IEnumerable<int> rows, IEnumerable<double> targetValues, out IList originalValues, ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle, FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best) {
  if (modifiableDataset.VariableHasType<double>(variableName)) {
    var doubleColumn = modifiableDataset.GetReadOnlyDoubleValues(variableName).ToList();
    originalValues = doubleColumn;
    return GetReplacementValuesForDouble(modifiableDataset, rows, doubleColumn, replacementMethod);
  }

  if (modifiableDataset.VariableHasType<string>(variableName)) {
    var stringColumn = modifiableDataset.GetReadOnlyStringValues(variableName).ToList();
    originalValues = stringColumn;
    return GetReplacementValuesForString(model, modifiableDataset, variableName, rows, stringColumn, targetValues, factorReplacementMethod);
  }

  throw new NotSupportedException("Variable not supported");
}
/// <summary>
/// Creates a dampened model around <paramref name="model"/>.
/// </summary>
/// <param name="model">The wrapped model; its target variable is passed to the base constructor.</param>
/// <param name="pd">Problem data; the minimum and maximum of its target training values become <c>Min</c> and <c>Max</c>.</param>
/// <param name="dampening">Value stored in <c>Dampening</c>.</param>
protected DampenedModel(IRegressionModel model, IRegressionProblemData pd, double dampening)
  : base(model.TargetVariable) {
  this.Model = model;

  // range of the target variable on the training values
  this.Min = pd.TargetVariableTrainingValues.Min();
  this.Max = pd.TargetVariableTrainingValues.Max();

  this.Dampening = dampening;
}
/// <summary>
/// Sets the weight stored for <paramref name="model"/> and raises the change notification.
/// </summary>
/// <param name="model">The model whose weight is set; looked up in <c>models</c> via <c>IndexOf</c>.</param>
/// <param name="weight">The new weight.</param>
public void SetModelWeight(IRegressionModel model, double weight) {
  // models and modelWeights are indexed in parallel
  int position = models.IndexOf(model);
  modelWeights[position] = weight;

  OnChanged();
}
/// <summary>
/// Builds the replacement column for a string (factor) variable.
/// </summary>
/// <param name="model">Model used to rate candidate values for <c>Best</c>.</param>
/// <param name="modifiableDataset">Dataset the variable belongs to; temporarily modified for <c>Best</c>.</param>
/// <param name="variableName">Name of the factor variable.</param>
/// <param name="rows">Row indices taken into account.</param>
/// <param name="originalValues">The complete original column of the variable.</param>
/// <param name="targetValues">Target values used to rate candidate values for <c>Best</c>.</param>
/// <param name="factorReplacementMethod">The replacement strategy.</param>
/// <returns>
/// A column with <c>modifiableDataset.Rows</c> entries. For <c>Best</c> the result is null
/// when no candidate value exists or no candidate quality exceeds negative infinity.
/// </returns>
/// <exception cref="ArgumentException">The replacement method is not handled.</exception>
private static IList GetReplacementValuesForString(IRegressionModel model, ModifiableDataset modifiableDataset, string variableName, IEnumerable<int> rows, List<string> originalValues, IEnumerable<double> targetValues, FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Shuffle) {
  List<string> replacementValues = null;

  switch (factorReplacementMethod) {
    case FactorReplacementMethodEnum.Best:
      // try replacing with all possible values and find the best replacement value
      var bestQuality = double.NegativeInfinity;
      // materialized up front: the loop body temporarily replaces the very column these values are read from
      var candidates = modifiableDataset.GetStringValues(variableName, rows).Distinct().ToList();
      foreach (var repl in candidates) {
        List<string> curReplacementValues = Enumerable.Repeat(repl, modifiableDataset.Rows).ToList();
        //fholzing: this result could be used later on (theoretically), but is neglected for better readability/method consistency
        var curQuality = CalculateQualityForReplacement(model, modifiableDataset, variableName, originalValues, rows, curReplacementValues, targetValues);

        if (curQuality > bestQuality) {
          bestQuality = curQuality;
          replacementValues = curReplacementValues;
        }
      }
      break;

    case FactorReplacementMethodEnum.Mode:
      var mostCommonValue = rows.Select(r => originalValues[r])
        .GroupBy(v => v)
        .OrderByDescending(g => g.Count())
        .First().Key;
      replacementValues = Enumerable.Repeat(mostCommonValue, modifiableDataset.Rows).ToList();
      break;

    case FactorReplacementMethodEnum.Shuffle:
      // new var has same empirical distribution but the relation to y is broken
      // the generator is only needed here, so it is created here
      IRandom random = new FastRandom(31415);
      // prepare a complete column for the dataset
      replacementValues = Enumerable.Repeat(string.Empty, modifiableDataset.Rows).ToList();
      // shuffle only the selected rows
      var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(random).ToList();
      int i = 0;
      // update column values
      foreach (var r in rows) {
        replacementValues[r] = shuffledValues[i++];
      }
      break;

    default:
      throw new ArgumentException(string.Format("FactorReplacementMethod {0} cannot be handled.", factorReplacementMethod));
  }

  return replacementValues;
}
/// <summary>
/// Adds <paramref name="model"/> with a weight of 1.0.
/// </summary>
/// <param name="model">The model to add; its target variable is adopted when none is set yet.</param>
public void Add(IRegressionModel model) {
  bool targetUnset = string.IsNullOrEmpty(TargetVariable);
  if (targetUnset) {
    TargetVariable = model.TargetVariable;
  }

  Add(model, 1.0);
}
/// <summary>
/// Reads the <c>coef_</c> and <c>intercept_</c> attributes of the model's <c>SkLearnModel</c>
/// and wraps each of them in a <c>Tensor</c>.
/// </summary>
/// <param name="model">The model whose <c>SkLearnModel</c> is queried.</param>
/// <returns>An output whose <c>Item1</c> holds the coefficients and whose <c>Item2</c> holds the intercept.</returns>
public static Output<Tensor, Tensor> Coefficients(IRegressionModel model) {
  var coefTensor = new Tensor(model.SkLearnModel.GetAttr("coef_"));
  var interceptTensor = new Tensor(model.SkLearnModel.GetAttr("intercept_"));

  var result = new Output<Tensor, Tensor> {
    Item1 = coefTensor,
    Item2 = interceptTensor
  };
  return result;
}
/// <summary>
/// Creates a classification model around a regression model and a threshold calculator.
/// </summary>
/// <param name="model">The wrapped regression model.</param>
/// <param name="thresholdCalculator">The threshold calculator stored with the model.</param>
public DiscriminantFunctionClassificationModel(IRegressionModel model, IDiscriminantFunctionThresholdCalculator thresholdCalculator)
  : base() {
  name = ItemName;
  description = ItemDescription;

  this.model = model;
  this.thresholdCalculator = thresholdCalculator;

  // class values and thresholds start out empty
  classValues = new double[0];
  thresholds = new double[0];
}
/// <summary>
/// Adds <paramref name="model"/> together with its <paramref name="weight"/> and raises the change notification.
/// </summary>
/// <param name="model">The model to add; its target variable is adopted when none is set yet.</param>
/// <param name="weight">The weight stored alongside the model.</param>
public void Add(IRegressionModel model, double weight) {
  bool targetUnset = string.IsNullOrEmpty(TargetVariable);
  if (targetUnset) {
    TargetVariable = model.TargetVariable;
  }

  // models and modelWeights are kept in parallel
  models.Add(model);
  modelWeights.Add(weight);

  OnChanged();
}
/// <summary>
/// Builds a replacement column for the double variable <paramref name="variable"/> according to
/// <paramref name="replacement"/> and evaluates the model with it via the overload taking replacement values.
/// </summary>
/// <param name="model">The model to evaluate.</param>
/// <param name="variable">Name of the variable to replace.</param>
/// <param name="dataset">Dataset the variable belongs to.</param>
/// <param name="rows">Row indices used for the statistics and the evaluation.</param>
/// <param name="replacement">The replacement strategy.</param>
/// <returns>The result of the overload taking the replacement values.</returns>
/// <exception cref="ArgumentException">The replacement method is not handled.</exception>
private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable, ModifiableDataset dataset, IEnumerable<int> rows, ReplacementMethodEnum replacement = ReplacementMethodEnum.Median) {
  var originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList();
  // deferred view on the original column, restricted to the selected rows
  var selectedValues = rows.Select(r => originalValues[r]);

  List<double> replacementValues;
  IRandom rand;

  switch (replacement) {
    case ReplacementMethodEnum.Median:
      replacementValues = Enumerable.Repeat(selectedValues.Median(), dataset.Rows).ToList();
      break;

    case ReplacementMethodEnum.Average:
      replacementValues = Enumerable.Repeat(selectedValues.Average(), dataset.Rows).ToList();
      break;

    case ReplacementMethodEnum.Shuffle:
      // new var has same empirical distribution but the relation to y is broken
      rand = new FastRandom(31415);
      // prepare a complete column for the dataset
      replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
      // shuffle only the selected rows
      var shuffledValues = selectedValues.Shuffle(rand).ToList();
      int next = 0;
      // update column values
      foreach (var row in rows) {
        replacementValues[row] = shuffledValues[next];
        next++;
      }
      break;

    case ReplacementMethodEnum.Noise:
      var avg = selectedValues.Average();
      var stdDev = selectedValues.StandardDeviation();
      rand = new FastRandom(31415);
      // prepare a complete column for the dataset
      replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
      // update column values
      foreach (var row in rows) {
        replacementValues[row] = NormalDistributedRandom.NextDouble(rand, avg, stdDev);
      }
      break;

    default:
      throw new ArgumentException(string.Format("ReplacementMethod {0} cannot be handled.", replacement));
  }

  return EvaluateModelWithReplacedVariable(model, variable, dataset, rows, replacementValues);
}
/// <summary>
/// Temporarily replaces the string variable <paramref name="variable"/> with
/// <paramref name="replacementValues"/>, evaluates the model and restores the original column.
/// </summary>
/// <param name="model">The model to evaluate.</param>
/// <param name="variable">Name of the variable to replace.</param>
/// <param name="dataset">Dataset that is modified for the duration of the evaluation.</param>
/// <param name="rows">Row indices to evaluate.</param>
/// <param name="replacementValues">The column values used during the evaluation.</param>
/// <returns>The materialized estimates for <paramref name="rows"/>.</returns>
private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable, ModifiableDataset dataset, IEnumerable<int> rows, IEnumerable<string> replacementValues) {
  var originalValues = dataset.GetReadOnlyStringValues(variable).ToList();
  dataset.ReplaceVariable(variable, replacementValues.ToList());
  try {
    //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
    return model.GetEstimatedValues(dataset, rows).ToList();
  } finally {
    // restore the column even when the evaluation throws, so the dataset is never left modified
    dataset.ReplaceVariable(variable, originalValues);
  }
}
/// <summary>
/// Removes <paramref name="model"/> and its weight and raises the change notification.
/// The target variable is reset to an empty string once no model is left.
/// </summary>
/// <param name="model">The model to remove; looked up in <c>models</c> via <c>IndexOf</c>.</param>
public void Remove(IRegressionModel model) {
  // models and modelWeights are indexed in parallel
  int position = models.IndexOf(model);
  models.RemoveAt(position);
  modelWeights.RemoveAt(position);

  bool anyLeft = models.Any();
  if (!anyLeft) {
    TargetVariable = string.Empty;
  }

  OnChanged();
}
/// <summary>
/// Creates the solution and registers one result entry per quality measure,
/// each for the training and the test partition.
/// </summary>
/// <param name="model">The regression model, passed to the base constructor.</param>
/// <param name="problemData">The problem data, passed to the base constructor.</param>
protected RegressionSolutionBase(IRegressionModel model, IRegressionProblemData problemData)
  : base(model, problemData) {
  // mean squared error
  Add(new Result(TrainingMeanSquaredErrorResultName, TrainingMeanSquaredErrorResultDescription, new DoubleValue()));
  Add(new Result(TestMeanSquaredErrorResultName, TestMeanSquaredErrorResultDescription, new DoubleValue()));

  // mean absolute error
  Add(new Result(TrainingMeanAbsoluteErrorResultName, TrainingMeanAbsoluteErrorResultDescription, new DoubleValue()));
  Add(new Result(TestMeanAbsoluteErrorResultName, TestMeanAbsoluteErrorResultDescription, new DoubleValue()));

  // squared correlation
  Add(new Result(TrainingSquaredCorrelationResultName, TrainingSquaredCorrelationResultDescription, new DoubleValue()));
  Add(new Result(TestSquaredCorrelationResultName, TestSquaredCorrelationResultDescription, new DoubleValue()));

  // relative error (stored as percent value)
  Add(new Result(TrainingRelativeErrorResultName, TrainingRelativeErrorResultDescription, new PercentValue()));
  Add(new Result(TestRelativeErrorResultName, TestRelativeErrorResultDescription, new PercentValue()));

  // normalized mean squared error
  Add(new Result(TrainingNormalizedMeanSquaredErrorResultName, TrainingNormalizedMeanSquaredErrorResultDescription, new DoubleValue()));
  Add(new Result(TestNormalizedMeanSquaredErrorResultName, TestNormalizedMeanSquaredErrorResultDescription, new DoubleValue()));

  // root mean squared error
  Add(new Result(TrainingRootMeanSquaredErrorResultName, TrainingRootMeanSquaredErrorResultDescription, new DoubleValue()));
  Add(new Result(TestRootMeanSquaredErrorResultName, TestRootMeanSquaredErrorResultDescription, new DoubleValue()));
}
/// <summary>
/// Temporarily replaces <paramref name="variableName"/> with <paramref name="replacementValues"/>,
/// calculates the quality of the model estimates and restores the original column.
/// </summary>
/// <param name="model">The model to evaluate.</param>
/// <param name="modifiableDataset">Dataset that is modified for the duration of the evaluation.</param>
/// <param name="variableName">Name of the variable to replace.</param>
/// <param name="originalValues">The column written back after the evaluation.</param>
/// <param name="rows">Row indices to evaluate.</param>
/// <param name="replacementValues">The column values used during the evaluation.</param>
/// <param name="targetValues">Target values the estimates are compared with.</param>
/// <returns>The value returned by <c>CalculateQuality</c> for the estimates on the modified dataset.</returns>
private static double CalculateQualityForReplacement(
  IRegressionModel model,
  ModifiableDataset modifiableDataset,
  string variableName,
  IList originalValues,
  IEnumerable<int> rows,
  IList replacementValues,
  IEnumerable<double> targetValues) {
  modifiableDataset.ReplaceVariable(variableName, replacementValues);
  try {
    //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
    var estimates = model.GetEstimatedValues(modifiableDataset, rows).ToList();
    return CalculateQuality(targetValues, estimates);
  } finally {
    // restore the column even when evaluation or quality calculation throws
    modifiableDataset.ReplaceVariable(variableName, originalValues);
  }
}
/// <summary>
/// Temporarily replaces the double variable <paramref name="variable"/> according to
/// <paramref name="replacement"/>, evaluates the model and restores the original column.
/// </summary>
/// <param name="model">The model to evaluate.</param>
/// <param name="variable">Name of the variable to replace.</param>
/// <param name="dataset">Dataset that is modified for the duration of the evaluation.</param>
/// <param name="rows">Row indices used for the statistics and the evaluation.</param>
/// <param name="replacement">The replacement strategy.</param>
/// <returns>The materialized estimates for <paramref name="rows"/>.</returns>
/// <exception cref="ArgumentException">The replacement method is not handled.</exception>
private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable, ModifiableDataset dataset, IEnumerable<int> rows, ReplacementMethodEnum replacement = ReplacementMethodEnum.Median) {
  var originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList();
  double replacementValue;
  List<double> replacementValues;
  IRandom rand;

  switch (replacement) {
    case ReplacementMethodEnum.Median:
      replacementValue = rows.Select(r => originalValues[r]).Median();
      replacementValues = Enumerable.Repeat(replacementValue, dataset.Rows).ToList();
      break;

    case ReplacementMethodEnum.Average:
      replacementValue = rows.Select(r => originalValues[r]).Average();
      replacementValues = Enumerable.Repeat(replacementValue, dataset.Rows).ToList();
      break;

    case ReplacementMethodEnum.Shuffle:
      // new var has same empirical distribution but the relation to y is broken
      rand = new FastRandom(31415);
      // prepare a complete column for the dataset: the replacement must cover every
      // dataset row and be addressed by row index, not by position within rows
      replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
      // shuffle only the selected rows
      var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(rand).ToList();
      int i = 0;
      // update column values
      foreach (var r in rows) {
        replacementValues[r] = shuffledValues[i++];
      }
      break;

    case ReplacementMethodEnum.Noise:
      var avg = rows.Select(r => originalValues[r]).Average();
      var stdDev = rows.Select(r => originalValues[r]).StandardDeviation();
      rand = new FastRandom(31415);
      // prepare a complete column for the dataset (see Shuffle)
      replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
      // update column values
      foreach (var r in rows) {
        replacementValues[r] = NormalDistributedRandom.NextDouble(rand, avg, stdDev);
      }
      break;

    default:
      throw new ArgumentException(string.Format("ReplacementMethod {0} cannot be handled.", replacement));
  }

  dataset.ReplaceVariable(variable, replacementValues);
  try {
    //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
    return model.GetEstimatedValues(dataset, rows).ToList();
  } finally {
    // restore the column even when the evaluation throws
    dataset.ReplaceVariable(variable, originalValues);
  }
}
/// <summary>
/// Calculates the impact of every variable in <paramref name="originalVariableOrdering"/> on the
/// selected data partition while reporting progress.
/// </summary>
/// <param name="originalVariableOrdering">Variable names in the order the impacts are reported.</param>
/// <param name="model">The regression model to analyze.</param>
/// <param name="problemData">Problem data providing the dataset and the target variable.</param>
/// <param name="estimatedValues">Not used by this method; the estimates are taken from <c>Content.GetEstimatedValues</c>.</param>
/// <param name="dataPartition">The partition whose rows are used.</param>
/// <param name="replMethod">Replacement method forwarded to <c>CalculateImpact</c>.</param>
/// <param name="factorReplMethod">Factor replacement method forwarded to <c>CalculateImpact</c>.</param>
/// <param name="token">Token that aborts the calculation when cancellation is requested.</param>
/// <param name="progress">Progress object updated with value and message per variable.</param>
/// <returns>One (variable name, impact) tuple per variable, or null when the calculation was cancelled.</returns>
private List<Tuple<string, double>> CalculateVariableImpacts(List<string> originalVariableOrdering, IRegressionModel model, IRegressionProblemData problemData, IEnumerable<double> estimatedValues, RegressionSolutionVariableImpactsCalculator.DataPartitionEnum dataPartition, RegressionSolutionVariableImpactsCalculator.ReplacementMethodEnum replMethod, RegressionSolutionVariableImpactsCalculator.FactorReplacementMethodEnum factorReplMethod, CancellationToken token, IProgress progress) {
  List<Tuple<string, double>> impacts = new List<Tuple<string, double>>();
  int count = originalVariableOrdering.Count;
  int i = 0;
  // a modifiable copy is used so that the dataset of the problem data is never altered
  var modifiableDataset = ((Dataset)(problemData.Dataset).Clone()).ToModifiable();
  IEnumerable<int> rows = RegressionSolutionVariableImpactsCalculator.GetPartitionRows(dataPartition, problemData);

  //Calculate original quality-values (via calculator, default is R²)
  IEnumerable<double> targetValuesPartition = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
  IEnumerable<double> estimatedValuesPartition = Content.GetEstimatedValues(rows);
  var originalCalculatorValue = RegressionSolutionVariableImpactsCalculator.CalculateQuality(targetValuesPartition, estimatedValuesPartition);

  foreach (var variableName in originalVariableOrdering) {
    // honor the token handed in by the caller as well as the token source held by this instance
    if (token.IsCancellationRequested || cancellationToken.Token.IsCancellationRequested) {
      return null;
    }

    progress.ProgressValue = (double)++i / count;
    progress.Message = string.Format("Calculating impact for variable {0} ({1} of {2})", variableName, i, count);

    double impact = 0;
    //If the variable isn't used for prediction, it has zero impact.
    if (model.VariablesUsedForPrediction.Contains(variableName)) {
      impact = RegressionSolutionVariableImpactsCalculator.CalculateImpact(variableName, model, problemData, modifiableDataset, rows, replMethod, factorReplMethod, targetValuesPartition, originalCalculatorValue);
    }
    impacts.Add(new Tuple<string, double>(variableName, impact));
  }

  return impacts;
}
/// <summary>
/// Cloning constructor: initializes this instance from <paramref name="original"/>.
/// Collections that are null in the original are left unassigned here.
/// </summary>
/// <param name="original">The instance whose state is copied.</param>
/// <param name="cloner">Cloner handed to the base constructor and used to clone the rule model.</param>
protected RegressionRuleModel(RegressionRuleModel original, Cloner cloner)
  : base(original, cloner) {
  // split description: each array is copied, never shared
  if (original.SplitAttributes != null) {
    this.SplitAttributes = original.SplitAttributes.ToArray();
  }
  if (original.SplitValues != null) {
    this.SplitValues = original.SplitValues.ToArray();
  }
  if (original.Comparisons != null) {
    this.Comparisons = original.Comparisons.ToArray();
  }

  this.RuleModel = cloner.Clone(original.RuleModel);

  if (original.variables != null) {
    this.variables = original.variables.ToList();
  }
}
/// <summary>
/// Builds a replacement column for the string (factor) variable <paramref name="variable"/> according to
/// <paramref name="replacement"/> and evaluates the model with it via the overload taking replacement values.
/// </summary>
/// <param name="model">The model to evaluate.</param>
/// <param name="variable">Name of the factor variable to replace.</param>
/// <param name="dataset">Dataset the variable belongs to.</param>
/// <param name="rows">Row indices used for the statistics and the evaluation.</param>
/// <param name="replacement">The replacement strategy; only <c>Mode</c> and <c>Shuffle</c> are handled here.</param>
/// <returns>The result of the overload taking the replacement values.</returns>
/// <exception cref="ArgumentException">The replacement method is not handled.</exception>
private static IEnumerable<double> EvaluateModelWithReplacedVariable(
  IRegressionModel model, string variable, ModifiableDataset dataset,
  IEnumerable<int> rows,
  FactorReplacementMethodEnum replacement = FactorReplacementMethodEnum.Shuffle) {
  var originalValues = dataset.GetReadOnlyStringValues(variable).ToList();
  // deferred view on the original column, restricted to the selected rows
  var selectedValues = rows.Select(r => originalValues[r]);

  List<string> replacementValues;
  IRandom rand;

  switch (replacement) {
    case FactorReplacementMethodEnum.Mode:
      var largestGroup = selectedValues
        .GroupBy(v => v)
        .OrderByDescending(g => g.Count())
        .First();
      replacementValues = Enumerable.Repeat(largestGroup.Key, dataset.Rows).ToList();
      break;

    case FactorReplacementMethodEnum.Shuffle:
      // new var has same empirical distribution but the relation to y is broken
      rand = new FastRandom(31415);
      // prepare a complete column for the dataset
      replacementValues = Enumerable.Repeat(string.Empty, dataset.Rows).ToList();
      // shuffle only the selected rows
      var shuffledValues = selectedValues.Shuffle(rand).ToList();
      int next = 0;
      // update column values
      foreach (var row in rows) {
        replacementValues[row] = shuffledValues[next];
        next++;
      }
      break;

    default:
      throw new ArgumentException(string.Format("FactorReplacementMethod {0} cannot be handled.", replacement));
  }

  return EvaluateModelWithReplacedVariable(model, variable, dataset, rows, replacementValues);
}
/// <summary>
/// Calculates the impact of a single variable as the difference between the reference quality
/// and the quality obtained after replacing the variable.
/// </summary>
/// <param name="variableName">Name of the variable to analyze.</param>
/// <param name="model">The regression model to analyze.</param>
/// <param name="problemData">Problem data providing the dataset and the target variable.</param>
/// <param name="modifiableDataset">Dataset that is temporarily modified during the calculation.</param>
/// <param name="rows">Row indices the impact is calculated on.</param>
/// <param name="replacementMethod">Replacement method for double variables.</param>
/// <param name="factorReplacementMethod">Replacement method for string variables.</param>
/// <param name="targetValues">Target values for <paramref name="rows"/>; read from the problem data when null.</param>
/// <param name="quality">Reference quality of the unmodified model; calculated here when NaN.</param>
/// <returns>0.0 when the model does not use the variable; otherwise reference quality minus replacement quality.</returns>
/// <exception cref="InvalidOperationException">The model uses the variable but the dataset does not contain it.</exception>
public static double CalculateImpact(string variableName, IRegressionModel model, IRegressionProblemData problemData, ModifiableDataset modifiableDataset, IEnumerable<int> rows, ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle, FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best, IEnumerable<double> targetValues = null, double quality = double.NaN) {
  if (!model.VariablesUsedForPrediction.Contains(variableName)) {
    return 0.0;
  }
  if (!problemData.Dataset.VariableNames.Contains(variableName)) {
    throw new InvalidOperationException(string.Format("Can not calculate variable impact, because the model uses inputs missing in the dataset ({0})", variableName));
  }

  if (targetValues == null) {
    targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
  }
  // NaN never compares equal to anything (not even itself), so the check must use double.IsNaN
  if (double.IsNaN(quality)) {
    // argument order (targets, estimates) matches the other CalculateQuality call sites
    quality = CalculateQuality(targetValues, model.GetEstimatedValues(modifiableDataset, rows));
  }

  IList originalValues;
  IList replacementValues = GetReplacementValues(modifiableDataset, variableName, model, rows, targetValues, out originalValues, replacementMethod, factorReplacementMethod);

  double newValue = CalculateQualityForReplacement(model, modifiableDataset, variableName, originalValues, rows, replacementValues, targetValues);
  return quality - newValue;
}
/// <summary>
/// Appends <paramref name="model"/> to <c>models</c>.
/// </summary>
/// <param name="model">The model to add.</param>
public void Add(IRegressionModel model) {
  this.models.Add(model);
}
/// <summary>
/// Returns the weight stored for <paramref name="model"/>.
/// </summary>
/// <param name="model">The model to look up; its position in <c>models</c> selects the weight.</param>
/// <returns>The entry of <c>modelWeights</c> at the position of the model.</returns>
public double GetModelWeight(IRegressionModel model) {
  // models and modelWeights are indexed in parallel
  int position = models.IndexOf(model);
  double weight = modelWeights[position];
  return weight;
}
/// <summary>
/// Removes <paramref name="model"/> from <c>models</c>.
/// </summary>
/// <param name="model">The model to remove.</param>
public void Remove(IRegressionModel model) {
  this.models.Remove(model);
}
/// <summary>
/// Decides whether <paramref name="currentRow"/> counts as a test row for <paramref name="model"/>.
/// </summary>
/// <param name="currentRow">The row index to check.</param>
/// <param name="model">The model whose test partition is consulted.</param>
/// <returns>
/// <c>true</c> when no test partitions exist, when none is registered for the model,
/// or when the row lies in the half-open range [Start, End) of the model's partition.
/// </returns>
private bool RowIsTestForModel(int currentRow, IRegressionModel model) {
  if (testPartitions == null) {
    return true;
  }
  if (!testPartitions.ContainsKey(model)) {
    return true;
  }

  var partition = testPartitions[model];
  return partition.Start <= currentRow && currentRow < partition.End;
}
/// <summary>
/// Creates a solution from a model and problem data; all work is done by the base constructor.
/// </summary>
/// <param name="model">The regression model, passed to the base constructor.</param>
/// <param name="problemData">The problem data, passed to the base constructor.</param>
public GradientBoostedTreesSolution(IRegressionModel model, IRegressionProblemData problemData)
  : base(model, problemData) {
}
/// <summary>
/// Returns the weight stored for <paramref name="model"/>.
/// </summary>
/// <param name="model">The model to look up; its position in <c>models</c> selects the weight.</param>
/// <returns>The entry of <c>modelWeights</c> at the position of the model.</returns>
public double GetModelWeight(IRegressionModel model) {
  // the weight lives at the same position as the model
  return modelWeights[models.IndexOf(model)];
}
/// <summary>
/// Decides whether <paramref name="currentRow"/> counts as a test row for <paramref name="model"/>.
/// </summary>
/// <param name="currentRow">The row index to check.</param>
/// <param name="model">The model whose test partition is consulted.</param>
/// <returns>
/// <c>true</c> when no test partitions exist, when none is registered for the model,
/// or when the row lies in the half-open range [Start, End) of the model's partition.
/// </returns>
private bool RowIsTestForModel(int currentRow, IRegressionModel model) {
  bool hasPartition = testPartitions != null && testPartitions.ContainsKey(model);
  if (!hasPartition) {
    // without a registered partition every row is treated as a test row
    return true;
  }

  bool atOrAfterStart = testPartitions[model].Start <= currentRow;
  return atOrAfterStart && currentRow < testPartitions[model].End;
}
/// <summary>
/// Runs <paramref name="alg"/> with the given seed, waits until it signals stop or an exception,
/// and extracts a regression model from its results.
/// </summary>
/// <param name="alg">The algorithm to prepare and start.</param>
/// <param name="seed">Seed applied via <c>SetSeed</c>.</param>
/// <param name="regressionAlgorithmResultName">Result name preferred when the algorithm produced more than one regression solution.</param>
/// <param name="model">Receives the model of the selected solution, or null when none was accepted.</param>
/// <param name="run">Receives the last run of the algorithm.</param>
/// <returns><c>true</c> when a model was extracted; otherwise <c>false</c>.</returns>
/// <exception cref="AggregateException">Wraps the exception reported by the algorithm.</exception>
private static bool TryExecute(IAlgorithm alg, int seed, string regressionAlgorithmResultName, out IRegressionModel model, out IRun run) {
  model = null;
  SetSeed(alg, seed);

  using (var finished = new AutoResetEvent(false)) {
    Exception failure = null;
    EventHandler<EventArgs<Exception>> onException = (sender, args) => {
      failure = args.Value;
      finished.Set();
    };
    EventHandler onStopped = (sender, args) => finished.Set();

    alg.ExceptionOccurred += onException;
    alg.Stopped += onStopped;
    try {
      alg.Prepare();
      alg.Start();
      // released by either handler above
      finished.WaitOne();

      if (failure != null) {
        throw new AggregateException(failure);
      }

      run = alg.Runs.Last();
      alg.Runs.Clear();

      var solutions = alg.Results.Select(r => r.Value).OfType<IRegressionSolution>();
      if (!solutions.Any()) {
        return false;
      }

      var chosen = solutions.First();
      // more than one solution => use regressionAlgorithmResult
      if (solutions.Skip(1).Any() && alg.Results.ContainsKey(regressionAlgorithmResultName)) {
        chosen = (IRegressionSolution)alg.Results[regressionAlgorithmResultName].Value;
      }

      var symbolic = chosen as SymbolicRegressionSolution;
      // only accept symb reg solutions that do not hit the estimation limits
      // NaN evaluations would not be critical but are problematic if we want to combine all symbolic models into a single symbolic model
      bool accepted = symbolic == null
        || (symbolic.TrainingLowerEstimationLimitHits == 0
            && symbolic.TrainingUpperEstimationLimitHits == 0
            && symbolic.TestLowerEstimationLimitHits == 0
            && symbolic.TestUpperEstimationLimitHits == 0
            && symbolic.TrainingNaNEvaluations == 0
            && symbolic.TestNaNEvaluations == 0);
      if (accepted) {
        model = chosen.Model;
      }
    } finally {
      // always detach the handlers so the algorithm does not keep references to them
      alg.ExceptionOccurred -= onException;
      alg.Stopped -= onStopped;
    }
  }

  return model != null;
}
/// <summary>
/// Appends <paramref name="m"/> to <c>models</c> and <paramref name="weight"/> to <c>weights</c>.
/// </summary>
/// <param name="m">The model to add.</param>
/// <param name="weight">The weight stored alongside the model.</param>
internal void AddModel(IRegressionModel m, double weight) {
  // both lists grow together, one entry each
  this.models.Add(m);
  this.weights.Add(weight);
}
/// <summary>
/// Runs <paramref name="alg"/> with the given seed, waits until it signals stop or an exception,
/// and extracts a regression model from its results.
/// </summary>
/// <param name="alg">The algorithm to prepare and start.</param>
/// <param name="seed">Seed applied via <c>SetSeed</c>.</param>
/// <param name="regressionAlgorithmResultName">Result name preferred when the algorithm produced more than one regression solution.</param>
/// <param name="model">Receives the model of the selected solution, or null when none was accepted.</param>
/// <param name="run">Receives the last run of the algorithm.</param>
/// <returns><c>true</c> when a model was extracted; otherwise <c>false</c>.</returns>
/// <exception cref="AggregateException">Wraps the exception reported by the algorithm.</exception>
private static bool TryExecute(IAlgorithm alg, int seed, string regressionAlgorithmResultName, out IRegressionModel model, out IRun run) {
  model = null;
  SetSeed(alg, seed);

  using (var waitHandle = new AutoResetEvent(false)) {
    Exception reported = null;
    EventHandler<EventArgs<Exception>> exceptionHandler = (sender, args) => {
      reported = args.Value;
      waitHandle.Set();
    };
    EventHandler stoppedHandler = (sender, args) => waitHandle.Set();

    alg.ExceptionOccurred += exceptionHandler;
    alg.Stopped += stoppedHandler;
    try {
      alg.Prepare();
      alg.Start();
      waitHandle.WaitOne();

      if (reported != null) {
        throw new AggregateException(reported);
      }

      run = alg.Runs.Last();
      alg.Runs.Clear();

      var regressionSolutions = alg.Results.Select(r => r.Value).OfType<IRegressionSolution>();
      if (!regressionSolutions.Any()) {
        return false;
      }

      var solution = regressionSolutions.First();
      bool severalSolutions = regressionSolutions.Skip(1).Any();
      if (severalSolutions) {
        // more than one solution => use regressionAlgorithmResult
        if (alg.Results.ContainsKey(regressionAlgorithmResultName)) {
          solution = (IRegressionSolution)alg.Results[regressionAlgorithmResultName].Value;
        }
      }

      var symbRegSolution = solution as SymbolicRegressionSolution;
      if (symbRegSolution == null) {
        // not a symbolic regression solution: accepted without further checks
        model = solution.Model;
      } else {
        // only accept symb reg solutions that do not hit the estimation limits
        bool noLimitHits = symbRegSolution.TrainingLowerEstimationLimitHits == 0
          && symbRegSolution.TrainingUpperEstimationLimitHits == 0
          && symbRegSolution.TestLowerEstimationLimitHits == 0
          && symbRegSolution.TestUpperEstimationLimitHits == 0;
        // NaN evaluations would not be critical but are problematic if we want to combine all symbolic models into a single symbolic model
        if (noLimitHits
            && symbRegSolution.TrainingNaNEvaluations == 0
            && symbRegSolution.TestNaNEvaluations == 0) {
          model = solution.Model;
        }
      }
    } finally {
      alg.ExceptionOccurred -= exceptionHandler;
      alg.Stopped -= stoppedHandler;
    }
  }

  return model != null;
}
/// <summary>
/// Adds <paramref name="model"/> with a weight of 1.0.
/// </summary>
/// <param name="model">The model to add; its target variable is adopted when none is set yet.</param>
public void Add(IRegressionModel model) {
  // the first model added determines the target variable
  if (string.IsNullOrEmpty(TargetVariable)) {
    TargetVariable = model.TargetVariable;
  }

  const double defaultWeight = 1.0;
  Add(model, defaultWeight);
}
/// <summary>
/// Creates a regression solution, sets up the evaluation cache and calculates the regression results.
/// </summary>
/// <param name="model">The regression model, passed to the base constructor.</param>
/// <param name="problemData">The problem data; the row count of its dataset is the initial cache capacity.</param>
public RegressionSolution(IRegressionModel model, IRegressionProblemData problemData)
  : base(model, problemData) {
  int initialCapacity = problemData.Dataset.Rows;
  evaluationCache = new Dictionary<int, double>(initialCapacity);

  CalculateRegressionResults();
}
/// <summary>
/// Removes <paramref name="model"/> and its weight and raises the change notification.
/// The target variable is reset to an empty string once no model is left.
/// </summary>
/// <param name="model">The model to remove; looked up in <c>models</c> via <c>IndexOf</c>.</param>
public void Remove(IRegressionModel model) {
  int slot = models.IndexOf(model);

  // remove the model and the weight stored at the same position
  models.RemoveAt(slot);
  modelWeights.RemoveAt(slot);

  if (!models.Any()) {
    TargetVariable = string.Empty;
  }

  OnChanged();
}
/// <summary>
/// Adds <paramref name="model"/> together with its <paramref name="weight"/> and raises the change notification.
/// </summary>
/// <param name="model">The model to add; its target variable is adopted when none is set yet.</param>
/// <param name="weight">The weight stored alongside the model.</param>
public void Add(IRegressionModel model, double weight) {
  // the first model added determines the target variable
  if (string.IsNullOrEmpty(TargetVariable)) {
    TargetVariable = model.TargetVariable;
  }

  models.Add(model);
  modelWeights.Add(weight);

  OnChanged();
}
/// <summary>
/// Creates an instance that holds the given regression model.
/// </summary>
/// <param name="regressionModel">The model stored in <c>RegressionModel</c>.</param>
public Regression(IRegressionModel regressionModel) {
  this.RegressionModel = regressionModel;
}
/// <summary>
/// Assigns <paramref name="model"/> to <c>Model</c>.
/// </summary>
/// <param name="model">The model to store.</param>
internal void SetLeafModel(IRegressionModel model) {
  this.Model = model;
}
/// <summary>
/// Creates a regression solution, sets up the evaluation cache and calculates the regression results.
/// </summary>
/// <param name="model">The regression model, passed to the base constructor.</param>
/// <param name="problemData">The problem data; the row count of its dataset is the initial cache capacity.</param>
public RegressionSolution(IRegressionModel model, IRegressionProblemData problemData)
  : base(model, problemData) {
  // cache sized to the number of dataset rows
  var rowCount = problemData.Dataset.Rows;
  this.evaluationCache = new Dictionary<int, double>(rowCount);

  CalculateRegressionResults();
}
/// <summary>
/// Temporarily replaces the double variable <paramref name="variable"/> according to
/// <paramref name="replacement"/>, evaluates the model and restores the original column.
/// </summary>
/// <param name="model">The model to evaluate.</param>
/// <param name="variable">Name of the variable to replace.</param>
/// <param name="dataset">Dataset that is modified for the duration of the evaluation.</param>
/// <param name="rows">Row indices used for the statistics and the evaluation.</param>
/// <param name="replacement">The replacement strategy.</param>
/// <returns>The materialized estimates for <paramref name="rows"/>.</returns>
/// <exception cref="ArgumentException">The replacement method is not handled.</exception>
private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable, ModifiableDataset dataset, IEnumerable<int> rows, ReplacementMethodEnum replacement = ReplacementMethodEnum.Median) {
  var originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList();
  double replacementValue;
  List<double> replacementValues;
  IRandom rand;

  switch (replacement) {
    case ReplacementMethodEnum.Median:
      replacementValue = rows.Select(r => originalValues[r]).Median();
      replacementValues = Enumerable.Repeat(replacementValue, dataset.Rows).ToList();
      break;

    case ReplacementMethodEnum.Average:
      replacementValue = rows.Select(r => originalValues[r]).Average();
      replacementValues = Enumerable.Repeat(replacementValue, dataset.Rows).ToList();
      break;

    case ReplacementMethodEnum.Shuffle:
      // new var has same empirical distribution but the relation to y is broken
      rand = new FastRandom(31415);
      // the replacement column must span all dataset rows and be written at the
      // row indices themselves; rows that are not selected stay NaN
      replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
      var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(rand).ToList();
      int i = 0;
      foreach (var r in rows) {
        replacementValues[r] = shuffledValues[i++];
      }
      break;

    case ReplacementMethodEnum.Noise:
      var avg = rows.Select(r => originalValues[r]).Average();
      var stdDev = rows.Select(r => originalValues[r]).StandardDeviation();
      rand = new FastRandom(31415);
      // full-length column, filled only at the selected row indices (see Shuffle)
      replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
      foreach (var r in rows) {
        replacementValues[r] = NormalDistributedRandom.NextDouble(rand, avg, stdDev);
      }
      break;

    default:
      throw new ArgumentException(string.Format("ReplacementMethod {0} cannot be handled.", replacement));
  }

  dataset.ReplaceVariable(variable, replacementValues);
  try {
    //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
    return model.GetEstimatedValues(dataset, rows).ToList();
  } finally {
    // restore the column even when the evaluation throws
    dataset.ReplaceVariable(variable, originalValues);
  }
}