/// <summary>
        /// Initializes a new <see cref="BestFirstTreeBuilder"/>: validates the arguments, stores the
        /// settings and creates the random generator from <paramref name="seed"/>.
        /// </summary>
        /// <param name="maximumTreeDepth">The maximal tree depth before a leaf is generated. Must be larger than 0</param>
        /// <param name="maximumLeafCount">The maximal allowed leaf nodes in the tree. Must be larger than 1</param>
        /// <param name="featuresPrSplit">The number of features to be selected between at each split.
        /// 0 means use all available features. Must not be negative</param>
        /// <param name="minimumInformationGain">The minimum improvement in information gain before a split is made.
        /// Must be larger than 0</param>
        /// <param name="seed">Seed for feature selection if number of features pr split is not equal
        /// to the total amount of features in observations. The features will be selected at random for each split</param>
        /// <param name="splitSearcher">The type of searcher used for finding the best features splits when learning the tree</param>
        /// <param name="impurityCalculator">Impurity calculator used to decide which split is optimal</param>
        /// <exception cref="ArgumentException">A numeric argument is outside its allowed range
        /// (or <paramref name="minimumInformationGain"/> is NaN)</exception>
        /// <exception cref="ArgumentNullException"><paramref name="splitSearcher"/> or
        /// <paramref name="impurityCalculator"/> is null</exception>
        public BestFirstTreeBuilder(int maximumTreeDepth,
                                    int maximumLeafCount,
                                    int featuresPrSplit,
                                    double minimumInformationGain,
                                    int seed,
                                    ISplitSearcher splitSearcher,
                                    IImpurityCalculator impurityCalculator)
        {
            if (maximumTreeDepth <= 0)
            {
                throw new ArgumentException("maximum tree depth must be larger than 0", nameof(maximumTreeDepth));
            }
            if (maximumLeafCount <= 1)
            {
                throw new ArgumentException("maximum leaf count must be larger than 1", nameof(maximumLeafCount));
            }
            // Written as !(x > 0) rather than x <= 0 so that double.NaN is rejected as well:
            // every ordered comparison with NaN is false, so "NaN <= 0" would let it through.
            if (!(minimumInformationGain > 0))
            {
                throw new ArgumentException("minimum information gain must be larger than 0", nameof(minimumInformationGain));
            }
            if (featuresPrSplit < 0)
            {
                throw new ArgumentException("features pr split must be at least 0", nameof(featuresPrSplit));
            }

            // ArgumentNullException takes the parameter name as its single string argument and still
            // derives from ArgumentException, so existing catch blocks keep working.
            m_splitSearcher      = splitSearcher ?? throw new ArgumentNullException(nameof(splitSearcher));
            m_impurityCalculator = impurityCalculator ?? throw new ArgumentNullException(nameof(impurityCalculator));

            m_maximumTreeDepth       = maximumTreeDepth;
            m_maximumLeafCount       = maximumLeafCount;
            m_featuresPrSplit        = featuresPrSplit;
            m_minimumInformationGain = minimumInformationGain;

            m_random = new Random(seed);
        }
        /// <summary>
        /// Initializes a new <see cref="DepthFirstTreeBuilder"/>: validates the arguments, stores the
        /// settings and creates the random generator from <paramref name="seed"/>.
        /// </summary>
        /// <param name="maximumTreeDepth">The maximal tree depth before a leaf is generated. Must be larger than 0</param>
        /// <param name="featuresPrSplit">The number of features to be selected between at each split.
        /// 0 means use all available features. Must not be negative</param>
        /// <param name="minimumInformationGain">The minimum improvement in information gain before a split is made.
        /// Must be larger than 0</param>
        /// <param name="seed">Seed for feature selection if number of features pr split is not equal
        /// to the total amount of features in observations. The features will be selected at random for each split</param>
        /// <param name="splitSearcher">The type of searcher used for finding the best features splits when learning the tree</param>
        /// <param name="impurityCalculator">Impurity calculator used to decide which split is optimal</param>
        /// <exception cref="ArgumentNullException"><paramref name="splitSearcher"/> or
        /// <paramref name="impurityCalculator"/> is null</exception>
        /// <exception cref="ArgumentException">A numeric argument is outside its allowed range
        /// (or <paramref name="minimumInformationGain"/> is NaN)</exception>
        public DepthFirstTreeBuilder(int maximumTreeDepth, int featuresPrSplit, double minimumInformationGain, int seed,
                                     ISplitSearcher splitSearcher, IImpurityCalculator impurityCalculator)
        {
            // The order of the checks is kept as it was, so the first reported problem is unchanged
            // when several arguments are invalid at once.
            if (splitSearcher == null)
            {
                // ArgumentNullException derives from ArgumentException; its single string
                // argument is the parameter name, not the message.
                throw new ArgumentNullException(nameof(splitSearcher));
            }
            if (maximumTreeDepth <= 0)
            {
                throw new ArgumentException("maximum tree depth must be larger than 0", nameof(maximumTreeDepth));
            }
            // Written as !(x > 0) rather than x <= 0 so that double.NaN is rejected as well:
            // every ordered comparison with NaN is false, so "NaN <= 0" would let it through.
            if (!(minimumInformationGain > 0))
            {
                throw new ArgumentException("minimum information gain must be larger than 0", nameof(minimumInformationGain));
            }
            if (featuresPrSplit < 0)
            {
                throw new ArgumentException("features pr split must be at least 0", nameof(featuresPrSplit));
            }
            if (impurityCalculator == null)
            {
                throw new ArgumentNullException(nameof(impurityCalculator));
            }

            m_maximumTreeDepth       = maximumTreeDepth;
            m_featuresPrSplit        = featuresPrSplit;
            m_splitSearcher          = splitSearcher;
            m_impurityCalculator     = impurityCalculator;
            m_minimumInformationGain = minimumInformationGain;

            m_random = new Random(seed);
        }
Esempio n. 3
0
        /// <summary>
        /// Searches for the best split using a brute force approach on all unique threshold values.
        /// The implementation assumes that the features and targets have been sorted
        /// together using the features as sort criteria
        /// </summary>
        /// <param name="impurityCalculator">Calculator that is set to <paramref name="parentInterval"/> and then
        /// moved to each candidate split index; it supplies the left/right weights, the impurity improvement
        /// and the child impurities</param>
        /// <param name="feature">Feature values; read in index order across <paramref name="parentInterval"/></param>
        /// <param name="targets">Not read by this implementation; kept so the signature is unchanged</param>
        /// <param name="parentInterval">Index range [FromInclusive, ToExclusive) to search for a split</param>
        /// <param name="parentImpurity">Impurity of the parent, passed to the calculator when computing the improvement</param>
        /// <returns>The split with the largest impurity improvement above 0. If no candidate qualifies the
        /// result has split index -1 and all other values 0</returns>
        public SplitResult FindBestSplit(IImpurityCalculator impurityCalculator, double[] feature, double[] targets,
                                         Interval1D parentInterval, double parentImpurity)
        {
            var bestSplitIndex          = -1;
            var bestThreshold           = 0.0;
            var bestImpurityImprovement = 0.0;
            var bestImpurityLeft        = 0.0;
            var bestImpurityRight       = 0.0;

            var firstIndex = parentInterval.FromInclusive;
            var prevValue  = feature[firstIndex];

            impurityCalculator.UpdateInterval(parentInterval);

            for (int j = firstIndex + 1; j < parentInterval.ToExclusive; j++)
            {
                var currentValue = feature[j];

                // A split is only possible between two different neighbouring feature values.
                if (prevValue != currentValue)
                {
                    var leftSize  = (double)(j - parentInterval.FromInclusive);
                    var rightSize = (double)(parentInterval.ToExclusive - j);

                    if (Math.Min(leftSize, rightSize) >= m_minimumSplitSize)
                    {
                        impurityCalculator.UpdateIndex(j);

                        if ((impurityCalculator.WeightedLeft < m_minimumLeafWeight) ||
                            (impurityCalculator.WeightedRight < m_minimumLeafWeight))
                        {
                            // prevValue must advance before skipping this candidate. Otherwise the next
                            // iteration compares against a stale value, which can place a split between
                            // two identical feature values and compute the threshold from a value that
                            // is not the immediate neighbour.
                            prevValue = currentValue;
                            continue;
                        }

                        var improvement = impurityCalculator.ImpurityImprovement(parentImpurity);

                        if (improvement > bestImpurityImprovement)
                        {
                            var childImpurities = impurityCalculator.ChildImpurities(); // could be avoided

                            bestImpurityImprovement = improvement;
                            // Threshold is the midpoint between the two neighbouring values.
                            bestThreshold           = (currentValue + prevValue) * 0.5;
                            bestSplitIndex          = j;
                            bestImpurityLeft        = childImpurities.Left;
                            bestImpurityRight       = childImpurities.Right;
                        }
                    }
                }

                prevValue = currentValue;
            }

            return new SplitResult(bestSplitIndex, bestThreshold,
                                   bestImpurityImprovement, bestImpurityLeft, bestImpurityRight);
        }
        /// <summary>
        /// Finds the smallest and largest feature value inside the parent interval, obtains a threshold
        /// from RandomThreshold(min, max) and splits at the first index whose feature value is larger
        /// than that threshold, provided both sides satisfy the minimum split size.
        /// NOTE(review): stopping at the first value above the threshold presumably relies on the
        /// features being sorted ascending within the interval - confirm against the caller.
        /// </summary>
        /// <param name="impurityCalculator">Calculator that is set to <paramref name="parentInterval"/> and the
        /// chosen split index; it supplies the impurity improvement and the child impurities</param>
        /// <param name="feature">Feature values; read across <paramref name="parentInterval"/></param>
        /// <param name="targets">Not read by this implementation</param>
        /// <param name="parentInterval">Index range [FromInclusive, ToExclusive) to search for a split</param>
        /// <param name="parentImpurity">Impurity of the parent, passed to the calculator when computing the improvement</param>
        /// <returns>SplitResult.Initial() when the interval holds no two different feature values. Otherwise a
        /// result carrying the threshold and, if an index qualified, the split index with its improvement and
        /// child impurities; if none qualified the split index is -1 and the impurity values are 0</returns>
        public SplitResult FindBestSplit(IImpurityCalculator impurityCalculator, double[] feature, double[] targets, Interval1D parentInterval, double parentImpurity)
        {
            var min = double.MaxValue;
            var max = double.MinValue;

            for (int i = parentInterval.FromInclusive; i < parentInterval.ToExclusive; i++)
            {
                var value = feature[i];

                // The two checks are independent on purpose. With "else if" an element that lowers
                // min never updates max, so a single-element interval (or one whose first element
                // is the largest) would leave max at double.MinValue.
                if (value < min)
                {
                    min = value;
                }
                if (value > max)
                {
                    max = value;
                }
            }

            // min == max: all values are identical. min > max: the interval was empty and the
            // sentinels were never replaced. Neither case has a range to draw a threshold from.
            if (min >= max)
            {
                return SplitResult.Initial();
            }

            var threshold = RandomThreshold(min, max);

            // No value is strictly larger than max, so a threshold equal to max could never
            // produce a split; fall back to min instead.
            if (threshold == max)
            {
                threshold = min;
            }

            var splitIndex          = -1;
            var impurityImprovement = 0.0;
            var impurityLeft        = 0.0;
            var impurityRight       = 0.0;

            for (int i = parentInterval.FromInclusive; i < parentInterval.ToExclusive; i++)
            {
                var leftSize  = (double)(i - parentInterval.FromInclusive);
                var rightSize = (double)(parentInterval.ToExclusive - i);

                if (feature[i] > threshold && Math.Min(leftSize, rightSize) >= m_minimumSplitSize)
                {
                    splitIndex = i;

                    impurityCalculator.UpdateInterval(parentInterval);
                    impurityCalculator.UpdateIndex(i);
                    impurityImprovement = impurityCalculator.ImpurityImprovement(parentImpurity);

                    var childImpurities = impurityCalculator.ChildImpurities();
                    impurityLeft  = childImpurities.Left;
                    impurityRight = childImpurities.Right;

                    // Only the first qualifying index is used.
                    break;
                }
            }

            return new SplitResult(splitIndex, threshold, impurityImprovement,
                                   impurityLeft, impurityRight);
        }