Exemple #1
0
        /// <summary>
        /// Builds the replacement column for <paramref name="variableName"/> and returns a copy of the
        /// variable's current values through <paramref name="originalValues"/>.
        /// </summary>
        /// <param name="modifiableDataset">Dataset the variable is read from.</param>
        /// <param name="variableName">Name of the variable to build replacement values for.</param>
        /// <param name="model">Model forwarded to the string-variable helper.</param>
        /// <param name="rows">Row indices forwarded to the type-specific helper.</param>
        /// <param name="targetValues">Target values forwarded to the string-variable helper.</param>
        /// <param name="originalValues">Receives a list copy of the variable's current values.</param>
        /// <param name="replacementMethod">Replacement strategy used for double variables.</param>
        /// <param name="factorReplacementMethod">Replacement strategy used for string variables.</param>
        /// <returns>The list produced by the helper matching the variable's type.</returns>
        /// <exception cref="NotSupportedException">The variable is neither of type double nor of type string.</exception>
        private static IList GetReplacementValues(ModifiableDataset modifiableDataset,
                                                  string variableName,
                                                  IRegressionModel model,
                                                  IEnumerable<int> rows,
                                                  IEnumerable<double> targetValues,
                                                  out IList originalValues,
                                                  ReplacementMethodEnum replacementMethod             = ReplacementMethodEnum.Shuffle,
                                                  FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best)
        {
            // Numeric variable: copy the column, then delegate to the double helper.
            if (modifiableDataset.VariableHasType<double>(variableName))
            {
                List<double> doubleColumn = modifiableDataset.GetReadOnlyDoubleValues(variableName).ToList();
                originalValues = doubleColumn;
                return GetReplacementValuesForDouble(modifiableDataset, rows, doubleColumn, replacementMethod);
            }

            // Factor (string) variable: copy the column, then delegate to the string helper.
            if (modifiableDataset.VariableHasType<string>(variableName))
            {
                List<string> stringColumn = modifiableDataset.GetReadOnlyStringValues(variableName).ToList();
                originalValues = stringColumn;
                return GetReplacementValuesForString(model, modifiableDataset, variableName, rows, stringColumn, targetValues, factorReplacementMethod);
            }

            throw new NotSupportedException("Variable not supported");
        }
Exemple #2
0
        /// <summary>
        /// Cloning constructor: after the base constructor has run, every entry of
        /// <c>variableValues</c> is replaced by a freshly created list holding the same values.
        /// </summary>
        /// <param name="original">Instance being cloned; forwarded to the base constructor.</param>
        /// <param name="cloner">Cloner forwarded to the base constructor.</param>
        /// <exception cref="ArgumentException">A variable has a type other than DateTime, double or string.</exception>
        private ModifiableDataset(ModifiableDataset original, Cloner cloner) : base(original, cloner)
        {
            // Iterate over a snapshot of the keys because the loop body writes to the dictionary.
            foreach (var name in variableValues.Keys.ToList())
            {
                var columnType = GetVariableType(name);

                if (columnType == typeof(double))
                {
                    variableValues[name] = GetDoubleValues(name).ToList();
                    continue;
                }

                if (columnType == typeof(string))
                {
                    variableValues[name] = GetStringValues(name).ToList();
                    continue;
                }

                if (columnType == typeof(DateTime))
                {
                    variableValues[name] = GetDateTimeValues(name).ToList();
                    continue;
                }

                throw new ArgumentException("Unsupported type " + columnType + " for variable " + name);
            }
        }
        /// <summary>
        /// Temporarily replaces <paramref name="variableName"/> in <paramref name="modifiableDataset"/>
        /// with <paramref name="replacementValues"/>, evaluates the classification model on
        /// <paramref name="rows"/> and returns the quality of the resulting estimates.
        /// </summary>
        /// <param name="model">Classification model to evaluate.</param>
        /// <param name="modifiableDataset">Dataset that is modified for the duration of the call.</param>
        /// <param name="variableName">Variable whose column is swapped.</param>
        /// <param name="originalValues">Values written back to the variable before returning.</param>
        /// <param name="rows">Row indices to evaluate.</param>
        /// <param name="replacementValues">Values put in place of the variable during evaluation.</param>
        /// <param name="targetValues">Target values passed to <c>CalculateQuality</c>.</param>
        /// <returns>The value returned by <c>CalculateQuality</c> for the modified dataset.</returns>
        private static double CalculateQualityForReplacement(
            IClassificationModel model,
            ModifiableDataset modifiableDataset,
            string variableName,
            IList originalValues,
            IEnumerable<int> rows,
            IList replacementValues,
            IEnumerable<double> targetValues)
        {
            modifiableDataset.ReplaceVariable(variableName, replacementValues);

            try
            {
                var discModel = model as IDiscriminantFunctionClassificationModel;

                if (discModel != null)
                {
                    // NOTE(review): the parameters recalculated here are not reverted afterwards - confirm this is intended.
                    var problemData = new ClassificationProblemData(modifiableDataset, modifiableDataset.VariableNames, model.TargetVariable);
                    discModel.RecalculateModelParameters(problemData, rows);
                }

                //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
                var estimates = model.GetEstimatedClassValues(modifiableDataset, rows).ToList();

                return CalculateQuality(targetValues, estimates);
            }
            finally
            {
                // Write the original column back even when evaluation throws, so the dataset is never left modified.
                modifiableDataset.ReplaceVariable(variableName, originalValues);
            }
        }
Exemple #4
0
        /// <summary>
        /// Creates the replacement column for a string (factor) variable according to
        /// <paramref name="factorReplacementMethod"/>.
        /// </summary>
        /// <param name="model">Model used to rate candidate values in the Best strategy.</param>
        /// <param name="modifiableDataset">Dataset providing the row count and, for Best, the candidate values.</param>
        /// <param name="variableName">Name of the string variable.</param>
        /// <param name="rows">Row indices the replacement is based on.</param>
        /// <param name="originalValues">Current values of the variable, indexed by row.</param>
        /// <param name="targetValues">Target values used to rate candidates in the Best strategy.</param>
        /// <param name="factorReplacementMethod">Strategy to apply.</param>
        /// <returns>
        /// A list with one entry per dataset row. For the Best strategy the result is null when no
        /// candidate yields a quality greater than negative infinity.
        /// </returns>
        /// <exception cref="ArgumentException">The strategy is not one of Best, Mode or Shuffle.</exception>
        private static IList GetReplacementValuesForString(IRegressionModel model,
                                                           ModifiableDataset modifiableDataset,
                                                           string variableName,
                                                           IEnumerable<int> rows,
                                                           List<string> originalValues,
                                                           IEnumerable<double> targetValues,
                                                           FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Shuffle)
        {
            IRandom rng = new FastRandom(31415);

            if (factorReplacementMethod == FactorReplacementMethodEnum.Best)
            {
                // Try every distinct value as a constant column and keep the one with the highest quality.
                List<string> bestColumn = null;
                var highestQuality = double.NegativeInfinity;
                var candidates = modifiableDataset.GetStringValues(variableName, rows).Distinct();

                foreach (var candidate in candidates)
                {
                    List<string> constantColumn = Enumerable.Repeat(candidate, modifiableDataset.Rows).ToList();
                    //fholzing: this result could be used later on (theoretically), but is neglected for better readability/method consistency
                    var candidateQuality = CalculateQualityForReplacement(model, modifiableDataset, variableName, originalValues, rows, constantColumn, targetValues);

                    if (candidateQuality > highestQuality)
                    {
                        highestQuality = candidateQuality;
                        bestColumn     = constantColumn;
                    }
                }

                return bestColumn;
            }

            if (factorReplacementMethod == FactorReplacementMethodEnum.Mode)
            {
                // Constant column filled with the most frequent value among the selected rows.
                var groupsByFrequency = rows.Select(r => originalValues[r])
                                        .GroupBy(v => v)
                                        .OrderByDescending(g => g.Count());
                var mostFrequent = groupsByFrequency.First().Key;

                return Enumerable.Repeat(mostFrequent, modifiableDataset.Rows).ToList();
            }

            if (factorReplacementMethod == FactorReplacementMethodEnum.Shuffle)
            {
                // Same empirical distribution as before, but the relation to the target is broken.
                // Start from a complete column and overwrite only the selected rows.
                List<string> column   = Enumerable.Repeat(string.Empty, modifiableDataset.Rows).ToList();
                var          shuffled = rows.Select(r => originalValues[r]).Shuffle(rng).ToList();
                var          position = 0;

                foreach (var row in rows)
                {
                    column[row] = shuffled[position];
                    position++;
                }

                return column;
            }

            throw new ArgumentException(string.Format("FactorReplacementMethod {0} cannot be handled.", factorReplacementMethod));
        }
Exemple #5
0
        /// <summary>
        /// Creates the replacement column for a double variable according to
        /// <paramref name="replacementMethod"/>.
        /// </summary>
        /// <param name="modifiableDataset">Dataset providing the number of rows of the resulting column.</param>
        /// <param name="rows">Row indices the replacement is based on.</param>
        /// <param name="originalValues">Current values of the variable, indexed by row.</param>
        /// <param name="replacementMethod">Strategy to apply.</param>
        /// <returns>
        /// A list with one entry per dataset row. For Shuffle and Noise, rows that are not contained in
        /// <paramref name="rows"/> hold <see cref="double.NaN"/>.
        /// </returns>
        /// <exception cref="ArgumentException">The strategy is not one of Median, Average, Shuffle or Noise.</exception>
        private static IList GetReplacementValuesForDouble(ModifiableDataset modifiableDataset,
                                                           IEnumerable<int> rows,
                                                           List<double> originalValues,
                                                           ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle)
        {
            IRandom rng = new FastRandom(31415);

            if (replacementMethod == ReplacementMethodEnum.Median)
            {
                // Constant column holding the median of the selected rows.
                double median = rows.Select(r => originalValues[r]).Median();
                return Enumerable.Repeat(median, modifiableDataset.Rows).ToList();
            }

            if (replacementMethod == ReplacementMethodEnum.Average)
            {
                // Constant column holding the mean of the selected rows.
                double mean = rows.Select(r => originalValues[r]).Average();
                return Enumerable.Repeat(mean, modifiableDataset.Rows).ToList();
            }

            if (replacementMethod == ReplacementMethodEnum.Shuffle)
            {
                // Same empirical distribution as before, but the relation to the target is broken.
                // Start from a complete NaN column and overwrite only the selected rows.
                List<double> column   = Enumerable.Repeat(double.NaN, modifiableDataset.Rows).ToList();
                var          shuffled = rows.Select(r => originalValues[r]).Shuffle(rng).ToList();
                var          position = 0;

                foreach (var row in rows)
                {
                    column[row] = shuffled[position];
                    position++;
                }

                return column;
            }

            if (replacementMethod == ReplacementMethodEnum.Noise)
            {
                // Normally distributed noise using mean and standard deviation of the selected rows.
                var mean      = rows.Select(r => originalValues[r]).Average();
                var deviation = rows.Select(r => originalValues[r]).StandardDeviation();
                List<double> column = Enumerable.Repeat(double.NaN, modifiableDataset.Rows).ToList();

                foreach (var row in rows)
                {
                    column[row] = NormalDistributedRandom.NextDouble(rng, mean, deviation);
                }

                return column;
            }

            throw new ArgumentException(string.Format("ReplacementMethod {0} cannot be handled.", replacementMethod));
        }
Exemple #6
0
        /// <summary>
        /// Evaluates <paramref name="model"/> on <paramref name="rows"/> while the string variable
        /// <paramref name="variable"/> is temporarily replaced by <paramref name="replacementValues"/>.
        /// </summary>
        /// <param name="model">Regression model to evaluate.</param>
        /// <param name="variable">Name of the string variable to replace.</param>
        /// <param name="dataset">Dataset that is modified for the duration of the call.</param>
        /// <param name="rows">Row indices to evaluate.</param>
        /// <param name="replacementValues">Values put in place of the variable during evaluation.</param>
        /// <returns>The materialized model estimates for the modified dataset.</returns>
        private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable,
                                                                             ModifiableDataset dataset, IEnumerable<int> rows, IEnumerable<string> replacementValues)
        {
            // Copy of the current column so it can be written back afterwards.
            var originalValues = dataset.GetReadOnlyStringValues(variable).ToList();

            dataset.ReplaceVariable(variable, replacementValues.ToList());

            try
            {
                //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
                return model.GetEstimatedValues(dataset, rows).ToList();
            }
            finally
            {
                // Write the original column back even when the model evaluation throws.
                dataset.ReplaceVariable(variable, originalValues);
            }
        }
Exemple #7
0
        /// <summary>
        /// Temporarily replaces <paramref name="variableName"/> in <paramref name="modifiableDataset"/>
        /// with <paramref name="replacementValues"/>, evaluates the regression model on
        /// <paramref name="rows"/> and returns the quality of the resulting estimates.
        /// </summary>
        /// <param name="model">Regression model to evaluate.</param>
        /// <param name="modifiableDataset">Dataset that is modified for the duration of the call.</param>
        /// <param name="variableName">Variable whose column is swapped.</param>
        /// <param name="originalValues">Values written back to the variable before returning.</param>
        /// <param name="rows">Row indices to evaluate.</param>
        /// <param name="replacementValues">Values put in place of the variable during evaluation.</param>
        /// <param name="targetValues">Target values passed to <c>CalculateQuality</c>.</param>
        /// <returns>The value returned by <c>CalculateQuality</c> for the modified dataset.</returns>
        private static double CalculateQualityForReplacement(
            IRegressionModel model,
            ModifiableDataset modifiableDataset,
            string variableName,
            IList originalValues,
            IEnumerable<int> rows,
            IList replacementValues,
            IEnumerable<double> targetValues)
        {
            modifiableDataset.ReplaceVariable(variableName, replacementValues);

            try
            {
                //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
                var estimates = model.GetEstimatedValues(modifiableDataset, rows).ToList();

                return CalculateQuality(targetValues, estimates);
            }
            finally
            {
                // Write the original column back even when evaluation throws, so the dataset is never left modified.
                modifiableDataset.ReplaceVariable(variableName, originalValues);
            }
        }
Exemple #8
0
        /// <summary>
        /// Builds a replacement column for the string variable <paramref name="variable"/> using the
        /// given strategy and evaluates the model with that column in place.
        /// </summary>
        /// <param name="model">Regression model to evaluate.</param>
        /// <param name="variable">Name of the string variable to replace.</param>
        /// <param name="dataset">Dataset providing the current values and the row count.</param>
        /// <param name="rows">Row indices the replacement is based on and that are evaluated.</param>
        /// <param name="replacement">Strategy to apply (Mode or Shuffle).</param>
        /// <returns>The result of the overload that takes explicit replacement values.</returns>
        /// <exception cref="ArgumentException">The strategy is neither Mode nor Shuffle.</exception>
        private static IEnumerable<double> EvaluateModelWithReplacedVariable(
            IRegressionModel model, string variable, ModifiableDataset dataset,
            IEnumerable<int> rows,
            FactorReplacementMethodEnum replacement = FactorReplacementMethodEnum.Shuffle)
        {
            var currentValues = dataset.GetReadOnlyStringValues(variable).ToList();
            List<string> column;

            if (replacement == FactorReplacementMethodEnum.Mode)
            {
                // Constant column filled with the most frequent value among the selected rows.
                var groupsByFrequency = rows.Select(r => currentValues[r])
                                        .GroupBy(v => v)
                                        .OrderByDescending(g => g.Count());
                var mostFrequent = groupsByFrequency.First().Key;
                column = Enumerable.Repeat(mostFrequent, dataset.Rows).ToList();
            }
            else if (replacement == FactorReplacementMethodEnum.Shuffle)
            {
                // Same empirical distribution as before, but the relation to the target is broken.
                IRandom rng = new FastRandom(31415);
                // Start from a complete column and overwrite only the selected rows.
                column = Enumerable.Repeat(string.Empty, dataset.Rows).ToList();
                var shuffled = rows.Select(r => currentValues[r]).Shuffle(rng).ToList();
                var position = 0;

                foreach (var row in rows)
                {
                    column[row] = shuffled[position];
                    position++;
                }
            }
            else
            {
                throw new ArgumentException(string.Format("FactorReplacementMethod {0} cannot be handled.", replacement));
            }

            return EvaluateModelWithReplacedVariable(model, variable, dataset, rows, column);
        }
Exemple #9
0
        /// <summary>
        /// Calculates the impact of a single variable as the difference between the model quality on the
        /// unmodified data and the quality obtained after the variable has been replaced.
        /// </summary>
        /// <param name="variableName">Variable whose impact is calculated.</param>
        /// <param name="model">Regression model to evaluate.</param>
        /// <param name="problemData">Problem data providing the dataset and the target variable.</param>
        /// <param name="modifiableDataset">Dataset in which the variable is temporarily replaced.</param>
        /// <param name="rows">Row indices to evaluate.</param>
        /// <param name="replacementMethod">Replacement strategy for double variables.</param>
        /// <param name="factorReplacementMethod">Replacement strategy for string variables.</param>
        /// <param name="targetValues">Target values for <paramref name="rows"/>; read from the problem data when null.</param>
        /// <param name="quality">Quality on the unmodified data; calculated here when NaN.</param>
        /// <returns>
        /// 0.0 when the model does not use the variable; otherwise the baseline quality minus the quality
        /// with the replaced variable.
        /// </returns>
        /// <exception cref="InvalidOperationException">The model uses the variable but the dataset does not contain it.</exception>
        public static double CalculateImpact(string variableName,
                                             IRegressionModel model,
                                             IRegressionProblemData problemData,
                                             ModifiableDataset modifiableDataset,
                                             IEnumerable<int> rows,
                                             ReplacementMethodEnum replacementMethod             = ReplacementMethodEnum.Shuffle,
                                             FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best,
                                             IEnumerable<double> targetValues = null,
                                             double quality = double.NaN)
        {
            if (!model.VariablesUsedForPrediction.Contains(variableName))
            {
                return 0.0;
            }
            if (!problemData.Dataset.VariableNames.Contains(variableName))
            {
                throw new InvalidOperationException(string.Format("Can not calculate variable impact, because the model uses inputs missing in the dataset ({0})", variableName));
            }

            if (targetValues == null)
            {
                targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
            }

            // NaN never compares equal to anything (including itself), so "quality == double.NaN" is
            // always false; double.IsNaN is required to detect the "not supplied" default.
            if (double.IsNaN(quality))
            {
                // Argument order (targets first, estimates second) matches the other CalculateQuality call sites.
                quality = CalculateQuality(targetValues, model.GetEstimatedValues(modifiableDataset, rows));
            }

            IList originalValues;
            IList replacementValues = GetReplacementValues(modifiableDataset, variableName, model, rows, targetValues, out originalValues, replacementMethod, factorReplacementMethod);

            double replacedQuality = CalculateQualityForReplacement(model, modifiableDataset, variableName, originalValues, rows, replacementValues, targetValues);

            return quality - replacedQuality;
        }
Exemple #10
0
        /// <summary>
        /// Builds a replacement column for the double variable <paramref name="variable"/> using the
        /// given strategy and evaluates the model with that column in place.
        /// </summary>
        /// <param name="model">Regression model to evaluate.</param>
        /// <param name="variable">Name of the double variable to replace.</param>
        /// <param name="dataset">Dataset providing the current values and the row count.</param>
        /// <param name="rows">Row indices the replacement is based on and that are evaluated.</param>
        /// <param name="replacement">Strategy to apply.</param>
        /// <returns>The result of the overload that takes explicit replacement values.</returns>
        /// <exception cref="ArgumentException">The strategy is not one of Median, Average, Shuffle or Noise.</exception>
        private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable, ModifiableDataset dataset, IEnumerable<int> rows, ReplacementMethodEnum replacement = ReplacementMethodEnum.Median)
        {
            var currentValues = dataset.GetReadOnlyDoubleValues(variable).ToList();
            List<double> column;

            if (replacement == ReplacementMethodEnum.Median)
            {
                // Constant column holding the median of the selected rows.
                double median = rows.Select(r => currentValues[r]).Median();
                column = Enumerable.Repeat(median, dataset.Rows).ToList();
            }
            else if (replacement == ReplacementMethodEnum.Average)
            {
                // Constant column holding the mean of the selected rows.
                double mean = rows.Select(r => currentValues[r]).Average();
                column = Enumerable.Repeat(mean, dataset.Rows).ToList();
            }
            else if (replacement == ReplacementMethodEnum.Shuffle)
            {
                // Same empirical distribution as before, but the relation to the target is broken.
                IRandom rng = new FastRandom(31415);
                // Start from a complete NaN column and overwrite only the selected rows.
                column = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
                var shuffled = rows.Select(r => currentValues[r]).Shuffle(rng).ToList();
                var position = 0;

                foreach (var row in rows)
                {
                    column[row] = shuffled[position];
                    position++;
                }
            }
            else if (replacement == ReplacementMethodEnum.Noise)
            {
                // Normally distributed noise using mean and standard deviation of the selected rows.
                var mean      = rows.Select(r => currentValues[r]).Average();
                var deviation = rows.Select(r => currentValues[r]).StandardDeviation();
                IRandom rng   = new FastRandom(31415);
                column = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();

                foreach (var row in rows)
                {
                    column[row] = NormalDistributedRandom.NextDouble(rng, mean, deviation);
                }
            }
            else
            {
                throw new ArgumentException(string.Format("ReplacementMethod {0} cannot be handled.", replacement));
            }

            return EvaluateModelWithReplacedVariable(model, variable, dataset, rows, column);
        }
 /// <summary>
 /// Cloning constructor: copies the variable names into a new list and obtains the variable
 /// values from <c>CloneValues</c> applied to the original's values.
 /// </summary>
 /// <param name="original">Instance being cloned; also forwarded to the base constructor.</param>
 /// <param name="cloner">Cloner forwarded to the base constructor.</param>
 private ModifiableDataset(ModifiableDataset original, Cloner cloner) : base(original, cloner)
 {
     // A new list, so the name collection is not shared with the original instance.
     this.variableNames = new List<string>(original.variableNames);

     var clonedColumns = CloneValues(original.variableValues);
     this.variableValues = clonedColumns;
 }
        /// <summary>
        /// Evaluates <paramref name="model"/> on <paramref name="rows"/> while the double variable
        /// <paramref name="variable"/> is temporarily replaced according to <paramref name="replacement"/>.
        /// </summary>
        /// <param name="model">Regression model to evaluate.</param>
        /// <param name="variable">Name of the double variable to replace.</param>
        /// <param name="dataset">Dataset that is modified for the duration of the call.</param>
        /// <param name="rows">Row indices the replacement is based on and that are evaluated.</param>
        /// <param name="replacement">Strategy to apply.</param>
        /// <returns>The materialized model estimates for the modified dataset.</returns>
        /// <exception cref="ArgumentException">The strategy is not one of Median, Average, Shuffle or Noise.</exception>
        private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable, ModifiableDataset dataset, IEnumerable<int> rows, ReplacementMethodEnum replacement = ReplacementMethodEnum.Median)
        {
            var           originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList();
            double        replacementValue;
            List<double>  replacementValues;
            IRandom       rand;

            switch (replacement)
            {
            case ReplacementMethodEnum.Median:
                replacementValue  = rows.Select(r => originalValues[r]).Median();
                replacementValues = Enumerable.Repeat(replacementValue, dataset.Rows).ToList();
                break;

            case ReplacementMethodEnum.Average:
                replacementValue  = rows.Select(r => originalValues[r]).Average();
                replacementValues = Enumerable.Repeat(replacementValue, dataset.Rows).ToList();
                break;

            case ReplacementMethodEnum.Shuffle:
                // new var has same empirical distribution but the relation to y is broken
                rand = new FastRandom(31415);
                // Build a complete column (one entry per dataset row, like Median/Average) instead of
                // one entry per selected row, so that the values stay addressable by row index when
                // rows is only a subset of the dataset.
                replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
                // shuffle only the selected rows
                var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(rand).ToList();
                int i = 0;
                foreach (var r in rows)
                {
                    replacementValues[r] = shuffledValues[i++];
                }
                break;

            case ReplacementMethodEnum.Noise:
                var avg    = rows.Select(r => originalValues[r]).Average();
                var stdDev = rows.Select(r => originalValues[r]).StandardDeviation();
                rand = new FastRandom(31415);
                // Complete column for the same reason as in the Shuffle case; only selected rows get noise.
                replacementValues = Enumerable.Repeat(double.NaN, dataset.Rows).ToList();
                foreach (var r in rows)
                {
                    replacementValues[r] = NormalDistributedRandom.NextDouble(rand, avg, stdDev);
                }
                break;

            default:
                throw new ArgumentException(string.Format("ReplacementMethod {0} cannot be handled.", replacement));
            }

            dataset.ReplaceVariable(variable, replacementValues);

            try
            {
                //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
                return model.GetEstimatedValues(dataset, rows).ToList();
            }
            finally
            {
                // Write the original column back even when the model evaluation throws.
                dataset.ReplaceVariable(variable, originalValues);
            }
        }