protected OneRClassificationSolution(OneRClassificationSolution original, Cloner cloner) : base(original, cloner)
 {
 }
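The copy constructor above follows HeuristicLab's deep-cloning convention: it is normally invoked only through a Clone override that passes the shared Cloner along. Below is a minimal sketch of that pairing; the base class and remaining members are elided or simplified and may not match the real sources exactly (Cloner and IDeepCloneable come from HeuristicLab.Common).

// minimal sketch of the deep-clone pattern this constructor belongs to (simplified)
public class OneRClassificationSolution : ClassificationSolution {
  protected OneRClassificationSolution(OneRClassificationSolution original, Cloner cloner)
    : base(original, cloner) { }

  // Clone delegates to the copy constructor through the Cloner, which tracks objects
  // that were already copied so shared references stay shared in the clone.
  public override IDeepCloneable Clone(Cloner cloner) {
    return new OneRClassificationSolution(this, cloner);
  }
}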
Example #4
        public static IClassificationSolution CreateOneRSolution(IClassificationProblemData problemData, int minBucketSize = 6)
        {
            var          bestClassified         = 0;
            List <Split> bestSplits             = null;
            string       bestVariable           = string.Empty;
            double       bestMissingValuesClass = double.NaN;
            var          classValues            = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices);

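            // build and score a one-variable rule for each allowed input variable; the
            // variable whose rule classifies the most training samples correctly wins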
            foreach (var variable in problemData.AllowedInputVariables)
            {
                var inputValues = problemData.Dataset.GetDoubleValues(variable, problemData.TrainingIndices);
                var samples     = inputValues.Zip(classValues, (i, v) => new Sample(i, v)).OrderBy(s => s.inputValue);

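                // majority class among training samples whose input value is missing (NaN);
                // it becomes the fallback class of the final model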
                var missingValuesDistribution = samples
                    .Where(s => double.IsNaN(s.inputValue))
                    .GroupBy(s => s.classValue)
                    .ToDictionary(s => s.Key, s => s.Count())
                    .MaxItems(s => s.Value)
                    .FirstOrDefault();

                //calculate class distributions for all distinct inputValues
                List <Dictionary <double, int> > classDistributions = new List <Dictionary <double, int> >();
                List <double> thresholds = new List <double>();
                double        lastValue  = double.NaN;
                foreach (var sample in samples.Where(s => !double.IsNaN(s.inputValue)))
                {
                    if (sample.inputValue > lastValue || double.IsNaN(lastValue))
                    {
                        if (!double.IsNaN(lastValue))
                        {
                            thresholds.Add((lastValue + sample.inputValue) / 2);
                        }
                        lastValue = sample.inputValue;
                        classDistributions.Add(new Dictionary <double, int>());
                        foreach (var classValue in problemData.ClassValues)
                        {
                            classDistributions[classDistributions.Count - 1][classValue] = 0;
                        }
                    }
                    classDistributions[classDistributions.Count - 1][sample.classValue]++;
                }
                thresholds.Add(double.PositiveInfinity);

                var distribution = classDistributions[0];
                var threshold    = thresholds[0];
                var splits       = new List <Split>();

                for (int i = 1; i < classDistributions.Count; i++)
                {
                    var samplesInSplit = distribution.Max(d => d.Value);
                    //join splits if the current split has too few samples or the next distribution has the same majority class value as the current split
                    if (samplesInSplit < minBucketSize ||
                        classDistributions[i].MaxItems(d => d.Value).Select(d => d.Key).Contains(
                            distribution.MaxItems(d => d.Value).Select(d => d.Key).First()))
                    {
                        foreach (var classValue in classDistributions[i])
                        {
                            distribution[classValue.Key] += classValue.Value;
                        }
                        threshold = thresholds[i];
                    }
                    else
                    {
                        splits.Add(new Split(threshold, distribution.MaxItems(d => d.Value).Select(d => d.Key).First()));
                        distribution = classDistributions[i];
                        threshold    = thresholds[i];
                    }
                }
                splits.Add(new Split(double.PositiveInfinity, distribution.MaxItems(d => d.Value).Select(d => d.Key).First()));

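                // score the candidate rule: count correctly classified non-missing training
                // samples, then credit the majority class of the missing-value samples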
                int correctClassified = 0;
                int splitIndex        = 0;
                foreach (var sample in samples.Where(s => !double.IsNaN(s.inputValue)))
                {
                    while (sample.inputValue >= splits[splitIndex].thresholdValue)
                    {
                        splitIndex++;
                    }
                    correctClassified += sample.classValue == splits[splitIndex].classValue ? 1 : 0;
                }
                correctClassified += missingValuesDistribution.Value;

                if (correctClassified > bestClassified)
                {
                    bestClassified         = correctClassified;
                    bestSplits             = splits;
                    bestVariable           = variable;
                    bestMissingValuesClass = missingValuesDistribution.Value == 0 ? double.NaN : missingValuesDistribution.Key;
                }
            }

            //remove neighboring splits with the same class value
            for (int i = 0; i < bestSplits.Count - 1; i++)
            {
                if (bestSplits[i].classValue == bestSplits[i + 1].classValue)
                {
                    bestSplits.Remove(bestSplits[i]);
                    i--;
                }
            }

            var model    = new OneRClassificationModel(bestVariable, bestSplits.Select(s => s.thresholdValue).ToArray(), bestSplits.Select(s => s.classValue).ToArray(), bestMissingValuesClass);
            var solution = new OneRClassificationSolution(model, (IClassificationProblemData)problemData.Clone());

            return solution;
        }
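A brief usage sketch, not part of the original example: LoadProblemData is a hypothetical helper standing in for whatever produces a populated IClassificationProblemData, and the static method is called unqualified here, so in real code it would be prefixed with its declaring class.

// minimal usage sketch (illustrative names): LoadProblemData() is a hypothetical helper
// that returns a populated IClassificationProblemData, e.g. imported from a CSV file
IClassificationProblemData problemData = LoadProblemData();

// build the single-variable rule on the training partition; larger minBucketSize values
// merge more neighboring value ranges into one split and yield a coarser, more robust rule
IClassificationSolution solution = CreateOneRSolution(problemData, minBucketSize: 6);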