/// <summary>
/// Initializes a new <c>BestFirstTreeBuilder</c>, validating every setting before any of them is stored.
/// </summary>
/// <param name="maximumTreeDepth">Upper bound on the depth of the tree; a leaf is generated once it is reached. Must be larger than 0</param>
/// <param name="maximumLeafCount">Upper bound on the number of leaf nodes in the tree. Must be larger than 1</param>
/// <param name="featuresPrSplit">How many features are considered at each split.
/// 0 means that all available features are used. Must not be negative</param>
/// <param name="minimumInformationGain">Smallest improvement in information gain required before a split is made. Must be larger than 0</param>
/// <param name="seed">Seed for the random generator used for feature selection when the number of features pr split
/// differs from the total number of features in the observations. Features are then picked at random for each split</param>
/// <param name="splitSearcher">The searcher used for finding the best feature splits when learning the tree</param>
/// <param name="impurityCalculator">The impurity calculator used to decide which split is optimal</param>
/// <exception cref="ArgumentException">Thrown when a numeric setting is outside its valid range,
/// or when <paramref name="splitSearcher"/> or <paramref name="impurityCalculator"/> is null</exception>
public BestFirstTreeBuilder(int maximumTreeDepth, int maximumLeafCount, int featuresPrSplit,
    double minimumInformationGain, int seed, ISplitSearcher splitSearcher, IImpurityCalculator impurityCalculator)
{
    // Range checks on the numeric settings come first, followed by the null checks on the collaborators.
    if (maximumTreeDepth < 1)
    {
        throw new ArgumentException("maximum tree depth must be larger than 0");
    }

    if (maximumLeafCount < 2)
    {
        throw new ArgumentException("maximum leaf count must be larger than 1");
    }

    if (minimumInformationGain <= 0)
    {
        throw new ArgumentException("minimum information gain must be larger than 0");
    }

    if (featuresPrSplit < 0)
    {
        throw new ArgumentException("features pr split must be at least 0");
    }

    if (splitSearcher == null)
    {
        throw new ArgumentException(nameof(splitSearcher));
    }

    if (impurityCalculator == null)
    {
        throw new ArgumentException(nameof(impurityCalculator));
    }

    // Everything is valid: store the configuration.
    m_random = new Random(seed);
    m_minimumInformationGain = minimumInformationGain;
    m_featuresPrSplit = featuresPrSplit;
    m_maximumLeafCount = maximumLeafCount;
    m_maximumTreeDepth = maximumTreeDepth;
    m_impurityCalculator = impurityCalculator;
    m_splitSearcher = splitSearcher;
}
/// <summary>
/// Initializes a new <c>DepthFirstTreeBuilder</c> from the supplied settings, rejecting invalid ones.
/// </summary>
/// <param name="maximumTreeDepth">Upper bound on the depth of the tree; a leaf is generated once it is reached. Must be larger than 0</param>
/// <param name="featuresPrSplit">How many features are considered at each split.
/// 0 means that all available features are used. Must not be negative</param>
/// <param name="minimumInformationGain">Smallest improvement in information gain required before a split is made. Must be larger than 0</param>
/// <param name="seed">Seed for the random generator used for feature selection when the number of features pr split
/// differs from the total number of features in the observations. Features are then picked at random for each split</param>
/// <param name="splitSearcher">The searcher used for finding the best feature splits when learning the tree</param>
/// <param name="impurityCalculator">The impurity calculator used to decide which split is optimal</param>
/// <exception cref="ArgumentException">Thrown when <paramref name="splitSearcher"/> or
/// <paramref name="impurityCalculator"/> is null, or when a numeric setting is outside its valid range</exception>
public DepthFirstTreeBuilder(int maximumTreeDepth, int featuresPrSplit, double minimumInformationGain,
    int seed, ISplitSearcher splitSearcher, IImpurityCalculator impurityCalculator)
{
    // The split searcher is checked before anything else.
    m_splitSearcher = splitSearcher ?? throw new ArgumentException("splitSearcher");

    if (maximumTreeDepth < 1)
    {
        throw new ArgumentException("maximum tree depth must be larger than 0");
    }

    if (minimumInformationGain <= 0)
    {
        throw new ArgumentException("minimum information gain must be larger than 0");
    }

    if (featuresPrSplit < 0)
    {
        throw new ArgumentException("features pr split must be at least 0");
    }

    // The impurity calculator is checked last, after the numeric settings.
    m_impurityCalculator = impurityCalculator ?? throw new ArgumentException("impurityCalculator");

    m_random = new Random(seed);
    m_minimumInformationGain = minimumInformationGain;
    m_featuresPrSplit = featuresPrSplit;
    m_maximumTreeDepth = maximumTreeDepth;
}
/// <summary>
/// Searches for the best split using a brute force approach on all unique threshold values.
/// The implementation assumes that the features and targets have been sorted
/// together using the features as sort criteria
/// </summary>
/// <param name="impurityCalculator">Calculator that is first updated with the parent interval and then with
/// each candidate split index; it supplies the child weights, the impurity improvement and the child impurities</param>
/// <param name="feature">The feature values; only the indices inside <paramref name="parentInterval"/> are read</param>
/// <param name="targets">The targets belonging to the feature values. Not read by this method;
/// presumably the impurity calculator already holds them - confirm against the caller</param>
/// <param name="parentInterval">The half-open interval [FromInclusive, ToExclusive) to search for a split in</param>
/// <param name="parentImpurity">Impurity of the parent interval, passed on to the impurity calculator
/// when the improvement of a candidate split is calculated</param>
/// <returns>The split with the largest impurity improvement. If no candidate improves on 0.0,
/// the result has split index -1 and all other values set to 0.0</returns>
public SplitResult FindBestSplit(IImpurityCalculator impurityCalculator, double[] feature, double[] targets,
    Interval1D parentInterval, double parentImpurity)
{
    var bestSplitIndex = -1;
    var bestThreshold = 0.0;
    var bestImpurityImprovement = 0.0;
    var bestImpurityLeft = 0.0;
    var bestImpurityRight = 0.0;

    var from = parentInterval.FromInclusive;
    var to = parentInterval.ToExclusive;
    var prevValue = feature[from];

    impurityCalculator.UpdateInterval(parentInterval);

    for (int j = from + 1; j < to; j++)
    {
        var currentValue = feature[j];

        // A split is only possible between two different feature values.
        if (prevValue != currentValue)
        {
            var leftSize = (double)(j - from);
            var rightSize = (double)(to - j);

            if (Math.Min(leftSize, rightSize) >= m_minimumSplitSize)
            {
                impurityCalculator.UpdateIndex(j);

                var leafTooLight = impurityCalculator.WeightedLeft < m_minimumLeafWeight
                    || impurityCalculator.WeightedRight < m_minimumLeafWeight;

                // A candidate with a too light child is rejected by skipping the evaluation below.
                // It must not skip the prevValue update at the end of the loop body: with a stale
                // prevValue the next index would be treated as a value boundary even when
                // feature[j - 1] == feature[j], and the threshold would be calculated from the
                // stale value, so it would no longer match the split index.
                if (!leafTooLight)
                {
                    var improvement = impurityCalculator.ImpurityImprovement(parentImpurity);

                    if (improvement > bestImpurityImprovement)
                    {
                        var childImpurities = impurityCalculator.ChildImpurities(); // could be avoided

                        bestImpurityImprovement = improvement;
                        // The threshold is placed halfway between the two neighbouring values.
                        bestThreshold = (currentValue + prevValue) * 0.5;
                        bestSplitIndex = j;
                        bestImpurityLeft = childImpurities.Left;
                        bestImpurityRight = childImpurities.Right;
                    }
                }
            }
        }

        // Always advance, whether or not the candidate at j was evaluated.
        prevValue = currentValue;
    }

    return new SplitResult(bestSplitIndex, bestThreshold,
        bestImpurityImprovement, bestImpurityLeft, bestImpurityRight);
}
/// <summary>
/// Searches for a split at a randomly drawn threshold between the smallest and the largest
/// feature value inside the parent interval. The split is placed at the first index whose
/// feature value is larger than the threshold and which leaves at least the minimum split size on both sides.
/// </summary>
/// <param name="impurityCalculator">Calculator that is updated with the parent interval and the chosen split index;
/// it supplies the impurity improvement and the child impurities of the split</param>
/// <param name="feature">The feature values; only the indices inside <paramref name="parentInterval"/> are read</param>
/// <param name="targets">The targets belonging to the feature values. Not read by this method</param>
/// <param name="parentInterval">The half-open interval [FromInclusive, ToExclusive) to search for a split in</param>
/// <param name="parentImpurity">Impurity of the parent interval, passed on to the impurity calculator
/// when the improvement of the split is calculated</param>
/// <returns><c>SplitResult.Initial()</c> when all feature values in the interval are equal. Otherwise a result
/// holding the drawn threshold; its split index is -1 and its impurity values are 0.0 when no index qualified</returns>
public SplitResult FindBestSplit(IImpurityCalculator impurityCalculator, double[] feature, double[] targets,
    Interval1D parentInterval, double parentImpurity)
{
    var from = parentInterval.FromInclusive;
    var to = parentInterval.ToExclusive;

    var min = double.MaxValue;
    var max = double.MinValue;

    for (int i = from; i < to; i++)
    {
        var value = feature[i];

        // The two checks must be independent. With "else if", a value that lowers min is never
        // considered for max, so max stays at double.MinValue for a single-element interval or
        // when the first value of the interval is also its largest.
        if (value < min)
        {
            min = value;
        }

        if (value > max)
        {
            max = value;
        }
    }

    // A constant feature cannot be split.
    if (min == max)
    {
        return SplitResult.Initial();
    }

    var threshold = RandomThreshold(min, max);

    // No value is larger than max, so a threshold equal to max could never produce a split.
    if (threshold == max)
    {
        threshold = min;
    }

    var splitIndex = -1;
    var impurityImprovement = 0.0;
    var impurityLeft = 0.0;
    var impurityRight = 0.0;

    for (int i = from; i < to; i++)
    {
        var leftSize = (double)(i - from);
        var rightSize = (double)(to - i);

        // Take the first index above the threshold that satisfies the minimum split size.
        if (feature[i] > threshold && Math.Min(leftSize, rightSize) >= m_minimumSplitSize)
        {
            splitIndex = i;

            impurityCalculator.UpdateInterval(parentInterval);
            impurityCalculator.UpdateIndex(i);
            impurityImprovement = impurityCalculator.ImpurityImprovement(parentImpurity);

            var childImpurities = impurityCalculator.ChildImpurities();
            impurityLeft = childImpurities.Left;
            impurityRight = childImpurities.Right;

            break;
        }
    }

    return new SplitResult(splitIndex, threshold,
        impurityImprovement, impurityLeft, impurityRight);
}