/// <summary>
/// Calculates one impact value for every allowed input variable of the solution's problem data.
/// The impact of a variable is the R² of the unmodified solution on the selected partition minus the
/// squared Pearson correlation obtained after the variable's values have been replaced.
/// </summary>
/// <param name="solution">Solution that supplies the model, the problem data and the reference R² values.</param>
/// <param name="data">Partition (all, training or test rows) on which the impacts are calculated.</param>
/// <param name="replacement">Replacement strategy forwarded to <c>EvaluateModelWithReplacedVariable</c>.</param>
/// <returns>Lazily evaluated (variable name, impact) tuples, ordered by descending impact.</returns>
/// <exception cref="InvalidOperationException">The correlation calculator reported an error.</exception>
/// <exception cref="ArgumentException"><paramref name="data"/> is not one of the handled partitions.</exception>
public static IEnumerable<Tuple<string, double>> CalculateImpacts(IRegressionSolution solution,
                                                                  DataPartitionEnum data            = DataPartitionEnum.Training,
                                                                  ReplacementMethodEnum replacement = ReplacementMethodEnum.Median)
        {
            var problemData   = solution.ProblemData;
            var sourceDataset = problemData.Dataset;

            IEnumerable<int>      partitionRows;
            IEnumerable<double>   partitionTargets;
            double                referenceR2;
            OnlineCalculatorError calcError;

            if (data == DataPartitionEnum.All)
            {
                partitionRows    = problemData.AllIndices;
                partitionTargets = problemData.TargetVariableValues.ToList();

                // for the complete dataset the reference R² is derived here from Pearson's R
                var pearsonR = OnlinePearsonsRCalculator.Calculate(problemData.TargetVariableValues, solution.EstimatedValues, out calcError);
                if (calcError != OnlineCalculatorError.None)
                {
                    throw new InvalidOperationException("Error during R² calculation.");
                }
                referenceR2 = pearsonR * pearsonR;
            }
            else if (data == DataPartitionEnum.Training)
            {
                partitionRows    = problemData.TrainingIndices;
                partitionTargets = problemData.TargetVariableTrainingValues.ToList();
                referenceR2      = solution.TrainingRSquared;
            }
            else if (data == DataPartitionEnum.Test)
            {
                partitionRows    = problemData.TestIndices;
                partitionTargets = problemData.TargetVariableTestValues.ToList();
                referenceR2      = solution.TestRSquared;
            }
            else
            {
                throw new ArgumentException(string.Format("DataPartition {0} cannot be handled.", data));
            }

            var impactByVariable = new Dictionary<string, double>();
            var workingDataset   = ((Dataset)sourceDataset).ToModifiable();

            foreach (var variableName in problemData.AllowedInputVariables)
            {
                var replacedEstimates = EvaluateModelWithReplacedVariable(solution.Model, variableName, workingDataset, partitionRows, replacement);
                var replacedR         = OnlinePearsonsRCalculator.Calculate(partitionTargets, replacedEstimates, out calcError);
                if (calcError != OnlineCalculatorError.None)
                {
                    throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
                }

                // impact = loss of squared correlation caused by the replacement
                impactByVariable[variableName] = referenceR2 - replacedR * replacedR;
            }

            return from entry in impactByVariable
                   orderby entry.Value descending
                   select Tuple.Create(entry.Key, entry.Value);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Copies the current column of <paramref name="variableName"/> into <paramref name="originalValues"/> and
        /// obtains a replacement column for it, dispatching on whether the variable holds doubles or strings.
        /// </summary>
        /// <param name="modifiableDataset">Dataset the variable is read from.</param>
        /// <param name="variableName">Name of the variable to replace.</param>
        /// <param name="model">Model forwarded to the string replacement helper.</param>
        /// <param name="rows">Row indices forwarded to the replacement helpers.</param>
        /// <param name="targetValues">Target values forwarded to the string replacement helper.</param>
        /// <param name="originalValues">Receives a list copy of the variable's current values.</param>
        /// <param name="replacementMethod">Strategy used for double variables.</param>
        /// <param name="factorReplacementMethod">Strategy used for string variables.</param>
        /// <returns>The replacement values produced by the type-specific helper.</returns>
        /// <exception cref="NotSupportedException">The variable is neither of type double nor of type string.</exception>
        private static IList GetReplacementValues(ModifiableDataset modifiableDataset,
                                                  string variableName,
                                                  IRegressionModel model,
                                                  IEnumerable<int> rows,
                                                  IEnumerable<double> targetValues,
                                                  out IList originalValues,
                                                  ReplacementMethodEnum replacementMethod             = ReplacementMethodEnum.Shuffle,
                                                  FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best)
        {
            if (modifiableDataset.VariableHasType<double>(variableName))
            {
                List<double> numericColumn = modifiableDataset.GetReadOnlyDoubleValues(variableName).ToList();
                originalValues = numericColumn;
                return GetReplacementValuesForDouble(modifiableDataset, rows, numericColumn, replacementMethod);
            }

            if (modifiableDataset.VariableHasType<string>(variableName))
            {
                List<string> factorColumn = modifiableDataset.GetReadOnlyStringValues(variableName).ToList();
                originalValues = factorColumn;
                return GetReplacementValuesForString(model, modifiableDataset, variableName, rows, factorColumn, targetValues, factorReplacementMethod);
            }

            throw new NotSupportedException("Variable not supported");
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Calculates an impact value for every variable that is either an allowed input of
        /// <paramref name="problemData"/> or used by <paramref name="model"/> for prediction.
        /// </summary>
        /// <param name="model">Model whose variable impacts are calculated.</param>
        /// <param name="problemData">Problem data providing the dataset, the target variable and the allowed inputs.</param>
        /// <param name="estimatedValues">Estimates used, together with the target values, for the reference quality.</param>
        /// <param name="rows">Row indices the calculation is restricted to.</param>
        /// <param name="replacementMethod">Replacement strategy forwarded to <c>CalculateImpact</c>.</param>
        /// <param name="factorReplacementMethod">Factor replacement strategy forwarded to <c>CalculateImpact</c>.</param>
        /// <returns>Lazily projected (variable name, impact) tuples; the impacts themselves are computed eagerly.</returns>
        /// <exception cref="InvalidOperationException">The model uses variables that the dataset does not contain.</exception>
        public static IEnumerable<Tuple<string, double>> CalculateImpacts(
            IRegressionModel model,
            IRegressionProblemData problemData,
            IEnumerable<double> estimatedValues,
            IEnumerable<int> rows,
            ReplacementMethodEnum replacementMethod             = ReplacementMethodEnum.Shuffle,
            FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best)
        {
            // fholzing: guards against a different dataset having been loaded; otherwise this check is negligible
            var absentVariables = model.VariablesUsedForPrediction.Except(problemData.Dataset.VariableNames);
            if (absentVariables.Any())
            {
                var message = string.Format("Can not calculate variable impacts, because the model uses inputs missing in the dataset ({0})", string.Join(", ", absentVariables));
                throw new InvalidOperationException(message);
            }

            IEnumerable<double> targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
            var baselineQuality = CalculateQuality(targetValues, estimatedValues);

            var candidateVariables = new HashSet<string>(problemData.AllowedInputVariables.Union(model.VariablesUsedForPrediction));
            // replacements are applied to a modifiable clone of the problem data's dataset
            var workingDataset = ((Dataset)(problemData.Dataset).Clone()).ToModifiable();

            var impactByVariable = candidateVariables.ToDictionary(
                name => name,
                name => CalculateImpact(name, model, problemData, workingDataset, rows, replacementMethod, factorReplacementMethod, targetValues, baselineQuality));

            return from entry in impactByVariable
                   select Tuple.Create(entry.Key, entry.Value);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Convenience overload: resolves the rows of <paramref name="dataPartition"/>, fetches the solution's
        /// estimates for those rows and delegates to the model-based <c>CalculateImpacts</c> overload.
        /// </summary>
        /// <param name="solution">Solution providing the model, the problem data and the estimated values.</param>
        /// <param name="replacementMethod">Replacement strategy passed through unchanged.</param>
        /// <param name="factorReplacementMethod">Factor replacement strategy passed through unchanged.</param>
        /// <param name="dataPartition">Partition whose rows are obtained via <c>GetPartitionRows</c>.</param>
        /// <returns>The result of the delegated <c>CalculateImpacts</c> call.</returns>
        public static IEnumerable<Tuple<string, double>> CalculateImpacts(
            IRegressionSolution solution,
            ReplacementMethodEnum replacementMethod             = ReplacementMethodEnum.Shuffle,
            FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best,
            DataPartitionEnum dataPartition = DataPartitionEnum.Training)
        {
            var problemData = solution.ProblemData;

            IEnumerable<int>    partitionRows      = GetPartitionRows(dataPartition, problemData);
            IEnumerable<double> partitionEstimates = solution.GetEstimatedValues(partitionRows);

            return CalculateImpacts(solution.Model,
                                    problemData,
                                    partitionEstimates,
                                    partitionRows,
                                    replacementMethod,
                                    factorReplacementMethod);
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Builds a replacement column (one entry per dataset row) for a double variable.
        /// Median and Average fill the whole column with the statistic of the selected rows; Shuffle and Noise
        /// fill the column with NaN and overwrite only the selected rows.
        /// </summary>
        /// <param name="modifiableDataset">Dataset whose row count determines the column length.</param>
        /// <param name="rows">Row indices whose values drive the replacement.</param>
        /// <param name="originalValues">Current values of the variable, indexed by row.</param>
        /// <param name="replacementMethod">Replacement strategy.</param>
        /// <returns>The replacement column.</returns>
        /// <exception cref="ArgumentException"><paramref name="replacementMethod"/> is not handled.</exception>
        private static IList GetReplacementValuesForDouble(ModifiableDataset modifiableDataset,
                                                           IEnumerable<int> rows,
                                                           List<double> originalValues,
                                                           ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle)
        {
            // fixed seed, so repeated calls draw the same random sequence
            IRandom rng = new FastRandom(31415);
            // values of the selected rows; deferred, i.e. re-evaluated on every enumeration
            var selectedValues = rows.Select(r => originalValues[r]);

            if (replacementMethod == ReplacementMethodEnum.Median)
            {
                return Enumerable.Repeat(selectedValues.Median(), modifiableDataset.Rows).ToList();
            }

            if (replacementMethod == ReplacementMethodEnum.Average)
            {
                return Enumerable.Repeat(selectedValues.Average(), modifiableDataset.Rows).ToList();
            }

            if (replacementMethod == ReplacementMethodEnum.Shuffle)
            {
                // same empirical distribution as the original variable, but the relation to the target is broken
                var shuffledColumn = Enumerable.Repeat(double.NaN, modifiableDataset.Rows).ToList();
                var permuted       = selectedValues.Shuffle(rng).ToList();
                var next           = 0;
                foreach (var row in rows)
                {
                    shuffledColumn[row] = permuted[next];
                    next++;
                }
                return shuffledColumn;
            }

            if (replacementMethod == ReplacementMethodEnum.Noise)
            {
                var mean   = selectedValues.Average();
                var spread = selectedValues.StandardDeviation();

                var noiseColumn = Enumerable.Repeat(double.NaN, modifiableDataset.Rows).ToList();
                foreach (var row in rows)
                {
                    noiseColumn[row] = NormalDistributedRandom.NextDouble(rng, mean, spread);
                }
                return noiseColumn;
            }

            throw new ArgumentException(string.Format("ReplacementMethod {0} cannot be handled.", replacementMethod));
        }
        /// <summary>
        /// Convenience overload for classification solutions: resolves the rows of <paramref name="dataPartition"/>,
        /// fetches the estimated class values for those rows and delegates to the model-based
        /// <c>CalculateImpacts</c> overload using a clone of the solution's model.
        /// </summary>
        /// <param name="solution">Solution providing the model, the problem data and the estimated class values.</param>
        /// <param name="replacementMethod">Replacement strategy passed through unchanged.</param>
        /// <param name="factorReplacementMethod">Factor replacement strategy passed through unchanged.</param>
        /// <param name="dataPartition">Partition whose rows are obtained via <c>GetPartitionRows</c>.</param>
        /// <returns>The result of the delegated <c>CalculateImpacts</c> call.</returns>
        public static IEnumerable<Tuple<string, double>> CalculateImpacts(
            IClassificationSolution solution,
            ReplacementMethodEnum replacementMethod             = ReplacementMethodEnum.Shuffle,
            FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best,
            DataPartitionEnum dataPartition = DataPartitionEnum.Training)
        {
            var problemData = solution.ProblemData;

            IEnumerable<int>    partitionRows   = GetPartitionRows(dataPartition, problemData);
            IEnumerable<double> partitionLabels = solution.GetEstimatedClassValues(partitionRows);

            // mkommend: the model has to be cloned, because the thresholds of IDiscriminantClassificationModels are updated
            var modelCopy = (IClassificationModel)solution.Model.Clone();

            return CalculateImpacts(modelCopy,
                                    problemData,
                                    partitionLabels,
                                    partitionRows,
                                    replacementMethod,
                                    factorReplacementMethod);
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Calculates the impact of a single variable as the reference quality minus the quality obtained after
        /// the variable's values have been replaced.
        /// </summary>
        /// <param name="variableName">Variable whose impact is calculated.</param>
        /// <param name="model">Model that is evaluated.</param>
        /// <param name="problemData">Problem data providing the dataset and the target variable.</param>
        /// <param name="modifiableDataset">Dataset the replacement is applied to.</param>
        /// <param name="rows">Row indices the calculation is restricted to.</param>
        /// <param name="replacementMethod">Replacement strategy for double variables.</param>
        /// <param name="factorReplacementMethod">Replacement strategy for string variables.</param>
        /// <param name="targetValues">Target values for <paramref name="rows"/>; read from the problem data's dataset when null.</param>
        /// <param name="quality">Reference quality; calculated from the model's estimates when NaN.</param>
        /// <returns>0.0 if the model does not use the variable, otherwise reference quality minus replaced quality.</returns>
        /// <exception cref="InvalidOperationException">The model uses the variable but the dataset does not contain it.</exception>
        public static double CalculateImpact(string variableName,
                                             IRegressionModel model,
                                             IRegressionProblemData problemData,
                                             ModifiableDataset modifiableDataset,
                                             IEnumerable <int> rows,
                                             ReplacementMethodEnum replacementMethod             = ReplacementMethodEnum.Shuffle,
                                             FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best,
                                             IEnumerable <double> targetValues = null,
                                             double quality = double.NaN)
        {
            // a variable the model never reads cannot influence the quality
            if (!model.VariablesUsedForPrediction.Contains(variableName))
            {
                return(0.0);
            }
            if (!problemData.Dataset.VariableNames.Contains(variableName))
            {
                throw new InvalidOperationException(string.Format("Can not calculate variable impact, because the model uses inputs missing in the dataset ({0})", variableName));
            }

            if (targetValues == null)
            {
                targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
            }

            // NaN never compares equal to anything (not even to itself), so "quality == double.NaN" was always
            // false and the default reference quality was never calculated; double.IsNaN is the correct test.
            if (double.IsNaN(quality))
            {
                // argument order (targets first, estimates second) follows the CalculateQuality call in CalculateImpacts
                // NOTE(review): CalculateQuality is defined elsewhere - confirm its parameter order
                quality = CalculateQuality(targetValues, model.GetEstimatedValues(modifiableDataset, rows));
            }

            IList originalValues;
            IList replacementValues = GetReplacementValues(modifiableDataset, variableName, model, rows, targetValues, out originalValues, replacementMethod, factorReplacementMethod);

            double newValue = CalculateQualityForReplacement(model, modifiableDataset, variableName, originalValues, rows, replacementValues, targetValues);
            double impact   = quality - newValue;

            return(impact);
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Builds a replacement column (one entry per dataset row) for the double variable
        /// <paramref name="variable"/> according to <paramref name="replacement"/> and delegates the evaluation to
        /// the overload that accepts the replacement values directly.
        /// </summary>
        /// <param name="model">Model that is evaluated.</param>
        /// <param name="variable">Name of the double variable to replace.</param>
        /// <param name="dataset">Dataset the variable is read from and that is passed on to the overload.</param>
        /// <param name="rows">Row indices whose values drive the replacement.</param>
        /// <param name="replacement">Replacement strategy.</param>
        /// <returns>The result of the delegated overload.</returns>
        /// <exception cref="ArgumentException"><paramref name="replacement"/> is not handled.</exception>
        private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable, ModifiableDataset dataset, IEnumerable<int> rows, ReplacementMethodEnum replacement = ReplacementMethodEnum.Median)
        {
            var          currentColumn = dataset.GetReadOnlyDoubleValues(variable).ToList();
            List<double> column;

            switch (replacement)
            {
            case ReplacementMethodEnum.Median:
            case ReplacementMethodEnum.Average:
                {
                    // both strategies fill the whole column with a single statistic of the selected rows
                    var selected = rows.Select(r => currentColumn[r]);
                    var constant = replacement == ReplacementMethodEnum.Median ? selected.Median() : selected.Average();
                    column = Enumerable.Repeat(constant, dataset.Rows).ToList();
                    break;
                }

            case ReplacementMethodEnum.Shuffle:
                {
                    // same empirical distribution as the original variable, but the relation to the target is broken
                    IRandom rng = new FastRandom(31415);
                    // NaN everywhere, then only the selected rows receive shuffled values
                    column = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
                    var permuted = rows.Select(r => currentColumn[r]).Shuffle(rng).ToList();
                    var next     = 0;
                    foreach (var row in rows)
                    {
                        column[row] = permuted[next];
                        next++;
                    }
                    break;
                }

            case ReplacementMethodEnum.Noise:
                {
                    var     mean   = rows.Select(r => currentColumn[r]).Average();
                    var     spread = rows.Select(r => currentColumn[r]).StandardDeviation();
                    IRandom rng    = new FastRandom(31415);
                    // NaN everywhere, then only the selected rows receive normally distributed noise
                    column = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
                    foreach (var row in rows)
                    {
                        column[row] = NormalDistributedRandom.NextDouble(rng, mean, spread);
                    }
                    break;
                }

            default:
                throw new ArgumentException(string.Format("ReplacementMethod {0} cannot be handled.", replacement));
            }

            return EvaluateModelWithReplacedVariable(model, variable, dataset, rows, column);
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Calculates one impact value for every variable that is an allowed input or used by the solution's model,
        /// handling double variables first and string (factor) variables afterwards.
        /// The impact is the reference R² on the selected partition minus the squared Pearson correlation obtained
        /// after the variable has been replaced.
        /// </summary>
        /// <param name="solution">Solution that supplies the model, the problem data and the reference R² values.</param>
        /// <param name="data">Partition (all, training or test rows) on which the impacts are calculated.</param>
        /// <param name="replacementMethod">Replacement strategy for double variables.</param>
        /// <param name="factorReplacementMethod">
        /// Replacement strategy for string variables; <c>Best</c> tries every distinct value found in the selected
        /// rows and keeps the smallest resulting impact.
        /// </param>
        /// <returns>Lazily evaluated (variable name, impact) tuples, ordered by descending impact.</returns>
        /// <exception cref="InvalidOperationException">The correlation calculator reported an error.</exception>
        /// <exception cref="ArgumentException"><paramref name="data"/> is not one of the handled partitions.</exception>
        public static IEnumerable<Tuple<string, double>> CalculateImpacts(
            IRegressionSolution solution,
            DataPartitionEnum data = DataPartitionEnum.Training,
            ReplacementMethodEnum replacementMethod             = ReplacementMethodEnum.Median,
            FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best)
        {
            var problemData = solution.ProblemData;
            var dataset     = problemData.Dataset;

            IEnumerable<int>      rows;
            IEnumerable<double>   targetValues;
            double                baselineR2;
            OnlineCalculatorError calcError;

            if (data == DataPartitionEnum.All)
            {
                rows         = problemData.AllIndices;
                targetValues = problemData.TargetVariableValues.ToList();

                // for the complete dataset the reference R² is derived here from Pearson's R
                var pearsonR = OnlinePearsonsRCalculator.Calculate(problemData.TargetVariableValues, solution.EstimatedValues, out calcError);
                if (calcError != OnlineCalculatorError.None)
                {
                    throw new InvalidOperationException("Error during R² calculation.");
                }
                baselineR2 = pearsonR * pearsonR;
            }
            else if (data == DataPartitionEnum.Training)
            {
                rows         = problemData.TrainingIndices;
                targetValues = problemData.TargetVariableTrainingValues.ToList();
                baselineR2   = solution.TrainingRSquared;
            }
            else if (data == DataPartitionEnum.Test)
            {
                rows         = problemData.TestIndices;
                targetValues = problemData.TargetVariableTestValues.ToList();
                baselineR2   = solution.TestRSquared;
            }
            else
            {
                throw new ArgumentException(string.Format("DataPartition {0} cannot be handled.", data));
            }

            // loss of squared correlation for a set of estimates produced with a replaced variable
            Func<IEnumerable<double>, double> impactOf = replacedEstimates =>
            {
                OnlineCalculatorError replacedError;
                var replacedR = OnlinePearsonsRCalculator.Calculate(targetValues, replacedEstimates, out replacedError);
                if (replacedError != OnlineCalculatorError.None)
                {
                    throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
                }
                return baselineR2 - replacedR * replacedR;
            };

            var impacts           = new Dictionary<string, double>();
            var modifiableDataset = ((Dataset)dataset).ToModifiable();

            // allowed inputs plus everything the model reads, listed in the dataset's variable order
            var relevantNames    = new HashSet<string>(problemData.AllowedInputVariables.Union(solution.Model.VariablesUsedForPrediction));
            var orderedVariables = dataset.VariableNames.Where(v => relevantNames.Contains(v)).ToList();

            // double variables
            foreach (var numericVariable in orderedVariables.Where(v => dataset.VariableHasType<double>(v)))
            {
                var estimates = EvaluateModelWithReplacedVariable(solution.Model, numericVariable, modifiableDataset, rows, replacementMethod);
                impacts[numericVariable] = impactOf(estimates);
            }

            // string (factor) variables
            foreach (var factorVariable in orderedVariables.Where(v => dataset.VariableHasType<string>(v)))
            {
                if (factorReplacementMethod != FactorReplacementMethodEnum.Best)
                {
                    // every other factor replacement method is delegated as a whole
                    var estimates = EvaluateModelWithReplacedVariable(solution.Model, factorVariable, modifiableDataset, rows,
                                                                      factorReplacementMethod);
                    impacts[factorVariable] = impactOf(estimates);
                    continue;
                }

                // Best: try each distinct value of the selected rows as a constant replacement, keep the smallest impact
                var smallestImpact = double.PositiveInfinity;
                foreach (var candidate in problemData.Dataset.GetStringValues(factorVariable, rows).Distinct())
                {
                    var estimates = EvaluateModelWithReplacedVariable(solution.Model, factorVariable, modifiableDataset, rows,
                                                                      Enumerable.Repeat(candidate, dataset.Rows));
                    var candidateImpact = impactOf(estimates);
                    if (candidateImpact < smallestImpact)
                    {
                        smallestImpact = candidateImpact;
                    }
                }
                impacts[factorVariable] = smallestImpact;
            }

            return from entry in impacts
                   orderby entry.Value descending
                   select Tuple.Create(entry.Key, entry.Value);
        }
        /// <summary>
        /// Temporarily replaces the column of the double variable <paramref name="variable"/> in
        /// <paramref name="dataset"/>, evaluates <paramref name="model"/> on <paramref name="rows"/> and puts the
        /// original column back afterwards.
        /// </summary>
        /// <param name="model">Model that is evaluated.</param>
        /// <param name="variable">Name of the double variable to replace.</param>
        /// <param name="dataset">Dataset that is modified for the duration of the evaluation.</param>
        /// <param name="rows">Row indices that drive the replacement and on which the model is evaluated.</param>
        /// <param name="replacement">Replacement strategy.</param>
        /// <returns>The materialized model estimates for <paramref name="rows"/>.</returns>
        /// <exception cref="ArgumentException"><paramref name="replacement"/> is not handled.</exception>
        private static IEnumerable <double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable, ModifiableDataset dataset, IEnumerable <int> rows, ReplacementMethodEnum replacement = ReplacementMethodEnum.Median)
        {
            var           originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList();
            double        replacementValue;
            List <double> replacementValues;
            IRandom       rand;

            switch (replacement)
            {
            case ReplacementMethodEnum.Median:
                replacementValue  = rows.Select(r => originalValues[r]).Median();
                replacementValues = Enumerable.Repeat(replacementValue, dataset.Rows).ToList();
                break;

            case ReplacementMethodEnum.Average:
                replacementValue  = rows.Select(r => originalValues[r]).Average();
                replacementValues = Enumerable.Repeat(replacementValue, dataset.Rows).ToList();
                break;

            case ReplacementMethodEnum.Shuffle:
                // new var has same empirical distribution but the relation to y is broken
                rand = new FastRandom(31415);
                // The column handed to ReplaceVariable must have one entry per dataset row (as in the Median and
                // Average cases) and the model reads it at the indices in rows. A list with one entry per selected
                // row would be too short and misaligned whenever rows is not 0..Rows-1.
                replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
                // shuffle only the selected rows
                var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(rand).ToList();
                int i = 0;
                // write the shuffled values back to the positions of the selected rows
                foreach (var r in rows)
                {
                    replacementValues[r] = shuffledValues[i++];
                }
                break;

            case ReplacementMethodEnum.Noise:
                var avg    = rows.Select(r => originalValues[r]).Average();
                var stdDev = rows.Select(r => originalValues[r]).StandardDeviation();
                rand = new FastRandom(31415);
                // full-length column for the same reason as in the Shuffle case
                replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
                foreach (var r in rows)
                {
                    replacementValues[r] = NormalDistributedRandom.NextDouble(rand, avg, stdDev);
                }
                break;

            default:
                throw new ArgumentException(string.Format("ReplacementMethod {0} cannot be handled.", replacement));
            }

            dataset.ReplaceVariable(variable, replacementValues);
            try
            {
                //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
                return(model.GetEstimatedValues(dataset, rows).ToList());
            }
            finally
            {
                // restore the original column even if the model evaluation throws, so the dataset is not left modified
                dataset.ReplaceVariable(variable, originalValues);
            }
        }
    /// <summary>
    /// Calculates one impact value for every allowed input variable of the solution's problem data.
    /// The impact of a variable is the R² of the unmodified solution on the selected partition minus the
    /// squared Pearson correlation obtained after the variable's values have been replaced.
    /// </summary>
    /// <param name="solution">Solution that supplies the model, the problem data and the reference R² values.</param>
    /// <param name="data">Partition (all, training or test rows) on which the impacts are calculated.</param>
    /// <param name="replacement">Replacement strategy forwarded to <c>EvaluateModelWithReplacedVariable</c>.</param>
    /// <returns>Lazily evaluated (variable name, impact) tuples, ordered by descending impact.</returns>
    /// <exception cref="InvalidOperationException">The correlation calculator reported an error.</exception>
    /// <exception cref="ArgumentException"><paramref name="data"/> is not one of the handled partitions.</exception>
    public static IEnumerable<Tuple<string, double>> CalculateImpacts(IRegressionSolution solution,
      DataPartitionEnum data = DataPartitionEnum.Training,
      ReplacementMethodEnum replacement = ReplacementMethodEnum.Median) {

      var problemData = solution.ProblemData;
      var sourceDataset = problemData.Dataset;

      IEnumerable<int> partitionRows;
      IEnumerable<double> partitionTargets;
      double referenceR2;
      OnlineCalculatorError calcError;

      if (data == DataPartitionEnum.All) {
        partitionRows = problemData.AllIndices;
        partitionTargets = problemData.TargetVariableValues.ToList();
        // for the complete dataset the reference R² is derived here from Pearson's R
        var pearsonR = OnlinePearsonsRCalculator.Calculate(problemData.TargetVariableValues, solution.EstimatedValues, out calcError);
        if (calcError != OnlineCalculatorError.None) {
          throw new InvalidOperationException("Error during R² calculation.");
        }
        referenceR2 = pearsonR * pearsonR;
      } else if (data == DataPartitionEnum.Training) {
        partitionRows = problemData.TrainingIndices;
        partitionTargets = problemData.TargetVariableTrainingValues.ToList();
        referenceR2 = solution.TrainingRSquared;
      } else if (data == DataPartitionEnum.Test) {
        partitionRows = problemData.TestIndices;
        partitionTargets = problemData.TargetVariableTestValues.ToList();
        referenceR2 = solution.TestRSquared;
      } else {
        throw new ArgumentException(string.Format("DataPartition {0} cannot be handled.", data));
      }

      var impactByVariable = new Dictionary<string, double>();
      var workingDataset = ((Dataset)sourceDataset).ToModifiable();

      foreach (var variableName in problemData.AllowedInputVariables) {
        var replacedEstimates = EvaluateModelWithReplacedVariable(solution.Model, variableName, workingDataset, partitionRows, replacement);
        var replacedR = OnlinePearsonsRCalculator.Calculate(partitionTargets, replacedEstimates, out calcError);
        if (calcError != OnlineCalculatorError.None) {
          throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
        }

        // impact = loss of squared correlation caused by the replacement
        impactByVariable[variableName] = referenceR2 - replacedR * replacedR;
      }

      return from entry in impactByVariable
             orderby entry.Value descending
             select Tuple.Create(entry.Key, entry.Value);
    }
    /// <summary>
    /// Temporarily replaces the column of the double variable <paramref name="variable"/> in
    /// <paramref name="dataset"/>, evaluates <paramref name="model"/> on <paramref name="rows"/> and puts the
    /// original column back afterwards.
    /// </summary>
    /// <param name="model">Model that is evaluated.</param>
    /// <param name="variable">Name of the double variable to replace.</param>
    /// <param name="dataset">Dataset that is modified for the duration of the evaluation.</param>
    /// <param name="rows">Row indices that drive the replacement and on which the model is evaluated.</param>
    /// <param name="replacement">Replacement strategy.</param>
    /// <returns>The materialized model estimates for <paramref name="rows"/>.</returns>
    /// <exception cref="ArgumentException"><paramref name="replacement"/> is not handled.</exception>
    private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable, ModifiableDataset dataset, IEnumerable<int> rows, ReplacementMethodEnum replacement = ReplacementMethodEnum.Median) {
      var originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList();
      double replacementValue;
      List<double> replacementValues;
      IRandom rand;

      switch (replacement) {
        case ReplacementMethodEnum.Median:
          replacementValue = rows.Select(r => originalValues[r]).Median();
          replacementValues = Enumerable.Repeat(replacementValue, dataset.Rows).ToList();
          break;
        case ReplacementMethodEnum.Average:
          replacementValue = rows.Select(r => originalValues[r]).Average();
          replacementValues = Enumerable.Repeat(replacementValue, dataset.Rows).ToList();
          break;
        case ReplacementMethodEnum.Shuffle:
          // new var has same empirical distribution but the relation to y is broken
          rand = new FastRandom(31415);
          // The column handed to ReplaceVariable must have one entry per dataset row (as in the Median and
          // Average cases) and the model reads it at the indices in rows. A list with one entry per selected
          // row would be too short and misaligned whenever rows is not 0..Rows-1.
          replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
          // shuffle only the selected rows
          var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(rand).ToList();
          int i = 0;
          // write the shuffled values back to the positions of the selected rows
          foreach (var r in rows) {
            replacementValues[r] = shuffledValues[i++];
          }
          break;
        case ReplacementMethodEnum.Noise:
          var avg = rows.Select(r => originalValues[r]).Average();
          var stdDev = rows.Select(r => originalValues[r]).StandardDeviation();
          rand = new FastRandom(31415);
          // full-length column for the same reason as in the Shuffle case
          replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
          foreach (var r in rows) {
            replacementValues[r] = NormalDistributedRandom.NextDouble(rand, avg, stdDev);
          }
          break;

        default:
          throw new ArgumentException(string.Format("ReplacementMethod {0} cannot be handled.", replacement));
      }

      dataset.ReplaceVariable(variable, replacementValues);
      try {
        //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
        return model.GetEstimatedValues(dataset, rows).ToList();
      } finally {
        // restore the original column even if the model evaluation throws, so the dataset is not left modified
        dataset.ReplaceVariable(variable, originalValues);
      }
    }