/// <summary>
/// Copy constructor: builds a new model from <paramref name="original"/>.
/// The <c>splits</c> and <c>classes</c> arrays are duplicated via <c>Clone()</c>, so the new
/// instance does not share those array instances with the original.
/// </summary>
/// <param name="original">The model whose state is copied.</param>
/// <param name="cloner">Passed through unchanged to the base-class copy constructor.</param>
protected OneRClassificationModel(OneRClassificationModel original, Cloner cloner)
     : base(original, cloner)
 {
     // Duplicate the arrays up front; the fields are only assigned once both copies exist.
     var splitsCopy  = (double[])original.splits.Clone();
     var classesCopy = (double[])original.classes.Clone();

     variable = (string)original.variable;
     splits   = splitsCopy;
     classes  = classesCopy;
 }
 /// <summary>
 /// Copy constructor: builds a new model from <paramref name="original"/>.
 /// The <c>splits</c> and <c>classes</c> arrays are duplicated via <c>Clone()</c>; the variable
 /// name and the class used for missing values are copied directly.
 /// </summary>
 /// <param name="original">The model whose state is copied.</param>
 /// <param name="cloner">Passed through unchanged to the base-class copy constructor.</param>
 private OneRClassificationModel(OneRClassificationModel original, Cloner cloner)
     : base(original, cloner)
 {
     // Duplicate the arrays up front so the new instance never shares them with the original.
     var splitsCopy  = (double[])original.splits.Clone();
     var classesCopy = (double[])original.classes.Clone();

     variable           = (string)original.variable;
     splits             = splitsCopy;
     classes            = classesCopy;
     missingValuesClass = original.missingValuesClass;
 }
 /// <summary>
 /// Copy constructor: builds a new model from <paramref name="original"/>, duplicating the
 /// <c>splits</c> and <c>classes</c> arrays via <c>Clone()</c> instead of sharing them.
 /// </summary>
 /// <param name="original">The model whose state is copied.</param>
 /// <param name="cloner">Passed through unchanged to the base-class copy constructor.</param>
 protected OneRClassificationModel(OneRClassificationModel original, Cloner cloner)
   : base(original, cloner) {
   // Duplicate the arrays first, then assign all fields together.
   var splitsCopy = (double[])original.splits.Clone();
   var classesCopy = (double[])original.classes.Clone();

   variable = (string)original.variable;
   splits = splitsCopy;
   classes = classesCopy;
 }
 /// <summary>
 /// Creates a solution from a model and problem data, then immediately calls
 /// <c>RecalculateResults()</c>.
 /// </summary>
 /// <param name="model">The model, forwarded to the base constructor.</param>
 /// <param name="problemData">The problem data, forwarded to the base constructor.</param>
 public OneRClassificationSolution(OneRClassificationModel model, IClassificationProblemData problemData)
     : base(model, problemData)
 {
     // Runs after the base constructor has received the model and problem data.
     this.RecalculateResults();
 }
 /// <summary>
 /// Creates a solution from a model and problem data, then immediately calls
 /// <c>RecalculateResults()</c>.
 /// </summary>
 /// <param name="model">The model, forwarded to the base constructor.</param>
 /// <param name="problemData">The problem data, forwarded to the base constructor.</param>
 public OneRClassificationSolution(OneRClassificationModel model, IClassificationProblemData problemData)
   : base(model, problemData) {
   // Runs after the base constructor has received the model and problem data.
   this.RecalculateResults();
 }
示例#6
0
        /// <summary>
        /// Examines every allowed input variable of type <c>double</c>, derives a list of
        /// threshold splits for it from the training rows, and returns a model built from the
        /// variable whose splits classify the most training samples correctly.
        /// </summary>
        /// <param name="problemData">Supplies the dataset, target variable, training indices, allowed input variables and class values.</param>
        /// <param name="minBucketSize">
        /// Minimum count the most frequent class of a bucket must reach before the bucket may be
        /// closed; buckets below this count are merged with the next distinct input value.
        /// </param>
        /// <returns>
        /// The model for the best variable, or <c>null</c> when there is no allowed double variable
        /// or no variable produced a usable split list (e.g. every variable consists solely of
        /// missing values in the training rows).
        /// </returns>
        private static OneRClassificationModel FindBestDoubleVariableModel(IClassificationProblemData problemData, int minBucketSize = 6)
        {
            var          bestClassified         = 0;
            List<Split>  bestSplits             = null;
            string       bestVariable           = string.Empty;
            double       bestMissingValuesClass = double.NaN;

            // Materialize once: both sequences are consumed repeatedly below.
            var classValues           = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices).ToList();
            var allowedInputVariables = problemData.AllowedInputVariables.Where(problemData.Dataset.VariableHasType<double>).ToList();

            if (allowedInputVariables.Count == 0)
            {
                return null;
            }

            foreach (var variable in allowedInputVariables)
            {
                var inputValues = problemData.Dataset.GetDoubleValues(variable, problemData.TrainingIndices);

                // Sort once and cache the result; the sorted samples are scanned several times.
                var samples = inputValues.Zip(classValues, (i, v) => new Sample(i, v))
                                         .OrderBy(s => s.inputValue)
                                         .ToList();
                var presentSamples = samples.Where(s => !double.IsNaN(s.inputValue)).ToList();

                // A variable without a single non-missing value yields no buckets to split on.
                if (presentSamples.Count == 0)
                {
                    continue;
                }

                // Most frequent class among the samples whose input value is missing (NaN).
                // When there are no such samples this is the default pair (key 0, count 0).
                var missingValuesDistribution = samples
                                                .Where(s => double.IsNaN(s.inputValue))
                                                .GroupBy(s => s.classValue)
                                                .ToDictionary(g => g.Key, g => g.Count())
                                                .MaxItems(g => g.Value)
                                                .FirstOrDefault();

                // Calculate class distributions for all distinct input values. thresholds[k] is the
                // midpoint between distinct value k and k+1; the last threshold is +infinity.
                var    classDistributions = new List<Dictionary<double, int>>();
                var    thresholds         = new List<double>();
                double lastValue          = double.NaN;
                foreach (var sample in presentSamples)
                {
                    if (sample.inputValue > lastValue || double.IsNaN(lastValue))
                    {
                        if (!double.IsNaN(lastValue))
                        {
                            thresholds.Add((lastValue + sample.inputValue) / 2);
                        }
                        lastValue = sample.inputValue;

                        // Start a new distribution with a zero count for every known class.
                        var freshDistribution = new Dictionary<double, int>();
                        foreach (var classValue in problemData.ClassValues)
                        {
                            freshDistribution[classValue] = 0;
                        }
                        classDistributions.Add(freshDistribution);
                    }
                    classDistributions[classDistributions.Count - 1][sample.classValue]++;
                }
                thresholds.Add(double.PositiveInfinity);

                var distribution = classDistributions[0];
                var threshold    = thresholds[0];
                var splits       = new List<Split>();

                for (int i = 1; i < classDistributions.Count; i++)
                {
                    // Count of the most frequent class in the bucket accumulated so far.
                    var majorityCount = distribution.Max(d => d.Value);

                    // Join buckets if the current one is still too small, or if the next distinct
                    // value has the current bucket's majority class among its own majority classes.
                    if (majorityCount < minBucketSize ||
                        classDistributions[i].MaxItems(d => d.Value).Select(d => d.Key).Contains(
                            distribution.MaxItems(d => d.Value).Select(d => d.Key).First()))
                    {
                        foreach (var classCount in classDistributions[i])
                        {
                            distribution[classCount.Key] += classCount.Value;
                        }
                        threshold = thresholds[i];
                    }
                    else
                    {
                        splits.Add(new Split(threshold, distribution.MaxItems(d => d.Value).Select(d => d.Key).First()));
                        distribution = classDistributions[i];
                        threshold    = thresholds[i];
                    }
                }
                // The last bucket is always open-ended.
                splits.Add(new Split(double.PositiveInfinity, distribution.MaxItems(d => d.Value).Select(d => d.Key).First()));

                // Count the training samples the split list classifies correctly. Samples are
                // sorted, so the split index only ever moves forward.
                int correctClassified = 0;
                int splitIndex        = 0;
                foreach (var sample in presentSamples)
                {
                    // The index bound keeps an input value of +infinity inside the last split
                    // instead of running past the end of the list.
                    while (splitIndex < splits.Count - 1 && sample.inputValue >= splits[splitIndex].thresholdValue)
                    {
                        splitIndex++;
                    }
                    correctClassified += sample.classValue.IsAlmost(splits[splitIndex].classValue) ? 1 : 0;
                }
                // Missing-value samples of the majority missing-value class count as correct.
                correctClassified += missingValuesDistribution.Value;

                if (correctClassified > bestClassified)
                {
                    bestClassified         = correctClassified;
                    bestSplits             = splits;
                    bestVariable           = variable;
                    bestMissingValuesClass = missingValuesDistribution.Value == 0 ? double.NaN : missingValuesDistribution.Key;
                }
            }

            // No variable produced a candidate: report "no model" the same way as above.
            if (bestSplits == null)
            {
                return null;
            }

            // Remove neighboring splits with the same class value; the later split (with the
            // higher threshold) is kept so the covered range stays the same.
            for (int i = 0; i < bestSplits.Count - 1; i++)
            {
                if (bestSplits[i].classValue.IsAlmost(bestSplits[i + 1].classValue))
                {
                    bestSplits.RemoveAt(i);
                    i--; // re-examine the element that moved into position i
                }
            }

            return new OneRClassificationModel(problemData.TargetVariable, bestVariable,
                                               bestSplits.Select(s => s.thresholdValue).ToArray(),
                                               bestSplits.Select(s => s.classValue).ToArray(), bestMissingValuesClass);
        }