Exemplo n.º 1
0
        /// <summary>
        /// Searches for the best split using a brute force approach on all unique threshold values.
        /// The implementation assumes that the features and targets have been sorted
        /// together using the features as sort criteria.
        /// </summary>
        /// <param name="impurityCalculator">Calculator that is set to <paramref name="parentInterval"/> and then
        /// moved to every candidate split index to obtain the child weights and impurities.</param>
        /// <param name="feature">Feature values; read inside <paramref name="parentInterval"/>.</param>
        /// <param name="targets">Target values belonging to <paramref name="feature"/>.
        /// Not read directly by this method.</param>
        /// <param name="parentInterval">Index range (FromInclusive up to, but excluding, ToExclusive)
        /// that is searched for a split.</param>
        /// <param name="parentImpurity">Impurity of the parent, handed to the calculator when the
        /// impurity improvement of a candidate is computed.</param>
        /// <returns>The candidate with the largest impurity improvement. When no candidate yields an
        /// improvement above 0.0 the result carries split index -1 and 0.0 for all other values.</returns>
        public SplitResult FindBestSplit(IImpurityCalculator impurityCalculator, double[] feature, double[] targets,
                                         Interval1D parentInterval, double parentImpurity)
        {
            var bestSplitIndex          = -1;
            var bestThreshold           = 0.0;
            var bestImpurityImprovement = 0.0;
            var bestImpurityLeft        = 0.0;
            var bestImpurityRight       = 0.0;

            int firstIndex = parentInterval.FromInclusive;
            var prevValue  = feature[firstIndex];

            impurityCalculator.UpdateInterval(parentInterval);

            for (int j = firstIndex + 1; j < parentInterval.ToExclusive; j++)
            {
                var currentValue = feature[j];

                // A split can only be placed between two different feature values.
                if (prevValue != currentValue)
                {
                    var currentSplit = j;
                    var leftSize     = (double)(currentSplit - parentInterval.FromInclusive);
                    var rightSize    = (double)(parentInterval.ToExclusive - currentSplit);

                    if (Math.Min(leftSize, rightSize) >= m_minimumSplitSize)
                    {
                        impurityCalculator.UpdateIndex(currentSplit);

                        var leafTooLight = (impurityCalculator.WeightedLeft < m_minimumLeafWeight) ||
                                           (impurityCalculator.WeightedRight < m_minimumLeafWeight);

                        // Rejected candidates are skipped with a nested condition instead of `continue`:
                        // jumping to the next iteration would bypass the prevValue update at the end of
                        // the loop body, and the stale value would then allow a "split" between two
                        // identical feature values with a threshold that does not match the index.
                        if (!leafTooLight)
                        {
                            var improvement = impurityCalculator.ImpurityImprovement(parentImpurity);

                            if (improvement > bestImpurityImprovement)
                            {
                                var childImpurities = impurityCalculator.ChildImpurities(); // could be avoided

                                bestImpurityImprovement = improvement;
                                // Threshold is the midpoint between the two neighbouring feature values.
                                bestThreshold           = (currentValue + prevValue) * 0.5;
                                bestSplitIndex          = currentSplit;
                                bestImpurityLeft        = childImpurities.Left;
                                bestImpurityRight       = childImpurities.Right;
                            }
                        }
                    }
                }

                // Must run on every iteration so the next comparison is against the direct neighbour.
                prevValue = currentValue;
            }

            return(new SplitResult(bestSplitIndex, bestThreshold,
                                   bestImpurityImprovement, bestImpurityLeft, bestImpurityRight));
        }
        /// <summary>
        /// Evaluates a single split at a threshold obtained from RandomThreshold, which is called with
        /// the smallest and largest feature value found in the parent interval.
        /// The split is placed at the first index of the interval whose feature value is greater than
        /// the threshold and which leaves at least m_minimumSplitSize samples on both sides.
        /// NOTE(review): taking the first matching index presumably relies on the feature values being
        /// sorted ascending within the interval - confirm against the callers.
        /// </summary>
        /// <param name="impurityCalculator">Calculator that is set to <paramref name="parentInterval"/> and
        /// moved to the chosen split index to obtain the impurity improvement and child impurities.</param>
        /// <param name="feature">Feature values; read inside <paramref name="parentInterval"/>.</param>
        /// <param name="targets">Target values belonging to <paramref name="feature"/>.
        /// Not read directly by this method.</param>
        /// <param name="parentInterval">Index range (FromInclusive up to, but excluding, ToExclusive)
        /// that is searched for a split.</param>
        /// <param name="parentImpurity">Impurity of the parent, handed to the calculator when the
        /// impurity improvement is computed.</param>
        /// <returns>SplitResult.Initial() when all feature values in the interval are equal. Otherwise a
        /// result holding the chosen threshold; when no index qualified, the split index is -1 and the
        /// improvement and child impurities are 0.0.</returns>
        public SplitResult FindBestSplit(IImpurityCalculator impurityCalculator, double[] feature, double[] targets, Interval1D parentInterval, double parentImpurity)
        {
            var min = double.MaxValue;
            var max = double.MinValue;

            for (int i = parentInterval.FromInclusive; i < parentInterval.ToExclusive; i++)
            {
                var value = feature[i];

                if (value < min)
                {
                    min = value;
                }

                // Checked independently of the minimum (no `else`): a value that lowers the minimum
                // can also be the maximum, e.g. the first element or every element of a constant
                // feature. With `else if` max stayed at double.MinValue in those cases and the
                // constant-feature check below was bypassed.
                if (value > max)
                {
                    max = value;
                }
            }

            // A constant feature offers nothing to split on.
            if (min == max)
            {
                return(SplitResult.Initial());
            }

            var threshold = RandomThreshold(min, max);

            // No value is greater than max, so a threshold equal to max could never produce a split.
            if (threshold == max)
            {
                threshold = min;
            }

            var splitIndex          = -1;
            var impurityImprovement = 0.0;
            var impurityLeft        = 0.0;
            var impurityRight       = 0.0;

            for (int i = parentInterval.FromInclusive; i < parentInterval.ToExclusive; i++)
            {
                var leftSize  = (double)(i - parentInterval.FromInclusive);
                var rightSize = (double)(parentInterval.ToExclusive - i);

                var currentFeature = feature[i];

                if (currentFeature > threshold && Math.Min(leftSize, rightSize) >= m_minimumSplitSize)
                {
                    splitIndex = i;

                    impurityCalculator.UpdateInterval(parentInterval);
                    impurityCalculator.UpdateIndex(i);
                    impurityImprovement = impurityCalculator.ImpurityImprovement(parentImpurity);

                    var childImpurities = impurityCalculator.ChildImpurities();
                    impurityLeft  = childImpurities.Left;
                    impurityRight = childImpurities.Right;

                    // Only the first qualifying index is evaluated.
                    break;
                }
            }

            return(new SplitResult(splitIndex, threshold, impurityImprovement,
                                   impurityLeft, impurityRight));
        }