/// <summary>
/// Builds the tree node for the given data: either a leaf, or a parent node whose
/// children are created from the subsets of the best split found.
/// </summary>
/// <param name="dataFrame">Data to build the node from.</param>
/// <param name="dependentFeatureName">Name of the column holding the outcome to predict.</param>
/// <param name="additionalParams">Builder settings; consulted for the depth limit and the pruning switch.</param>
/// <param name="alreadyUsedAttributesInfo">Forwarded to the split selector and to the child construction.</param>
/// <param name="treeDepth">Depth of the node being built; children are requested with this value plus one.</param>
/// <param name="isFirstSplit">When true, the children of this node are added in parallel.</param>
/// <returns>A leaf when no further split is made, otherwise the concrete parent node.</returns>
protected virtual IDecisionTreeNode BuildDecisionNode(
    IDataFrame dataFrame,
    string dependentFeatureName,
    IDecisionTreeModelBuilderParams additionalParams,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo,
    int treeDepth,
    bool isFirstSplit = false)
{
    // A single distinct outcome value leaves nothing to separate.
    var distinctOutcomes = dataFrame
        .GetColumnVector<object>(dependentFeatureName)
        .DataItems
        .Distinct()
        .Count();
    if (distinctOutcomes == 1)
    {
        return BuildLeaf(dataFrame, dependentFeatureName);
    }

    if (MaximalTreeDepthHasBeenReached(additionalParams, treeDepth))
    {
        return BuildLeaf(dataFrame, dependentFeatureName);
    }

    // TODO: later on add additional params indicating which features were already used
    ISplittingResult chosenSplit = BestSplitSelector.SelectBestSplit(
        dataFrame,
        dependentFeatureName,
        SplitQualityChecker,
        alreadyUsedAttributesInfo);
    if (SplitIsEmpty(chosenSplit))
    {
        return BuildLeaf(dataFrame, dependentFeatureName);
    }

    // Optional pre-pruning: a split that fails the significance check becomes a leaf.
    if (additionalParams.UsePrunningHeuristicDuringTreeBuild
        && this.StatisticalSignificanceChecker != null
        && !StatisticalSignificanceChecker.IsSplitStatisticallySignificant(
            dataFrame,
            chosenSplit,
            dependentFeatureName))
    {
        return BuildLeaf(dataFrame, dependentFeatureName);
    }

    var childNodes = new ConcurrentDictionary<IDecisionTreeLink, IDecisionTreeNode>();
    var childDepth = treeDepth + 1;

    if (!isFirstSplit)
    {
        foreach (var subset in chosenSplit.SplittedDataSets)
        {
            this.AddChildFromSplit(dependentFeatureName, additionalParams, subset, childNodes, alreadyUsedAttributesInfo, childDepth);
        }
    }
    else
    {
        // Only the first split fans out in parallel; every other call takes the sequential loop above.
        Parallel.ForEach(
            chosenSplit.SplittedDataSets,
            subset =>
            {
                this.AddChildFromSplit(dependentFeatureName, additionalParams, subset, childNodes, alreadyUsedAttributesInfo, childDepth);
            });
    }

    return BuildConcreteDecisionTreeNode(chosenSplit, childNodes);
}
/// <summary>
/// Scans one numeric feature for the binary split threshold with the highest quality.
/// </summary>
/// <param name="baseData">Data frame to split.</param>
/// <param name="dependentFeatureName">Name of the column holding the outcome to predict.</param>
/// <param name="numericFeatureToProcess">Name of the numeric column whose thresholds are evaluated.</param>
/// <param name="splitQualityChecker">Scores every candidate split.</param>
/// <param name="binaryNumericDataSplitter">Performs the actual split of <paramref name="baseData"/> at a threshold.</param>
/// <param name="initialEntropy">Entropy value handed unchanged to the quality checker.</param>
/// <returns>
/// The best split and its quality. When no candidate threshold exists (including an empty
/// column) the split is <c>null</c> and the quality is <see cref="double.NegativeInfinity"/>.
/// </returns>
public Tuple<ISplittingResult, double> FindBestSplitPoint(
    IDataFrame baseData,
    string dependentFeatureName,
    string numericFeatureToProcess,
    ICategoricalSplitQualityChecker splitQualityChecker,
    IBinaryNumericDataSplitter binaryNumericDataSplitter,
    double initialEntropy)
{
    ISplittingResult bestSplit = null;
    double bestSplitQuality = double.NegativeInfinity;
    var totalRowsCount = baseData.RowCount;

    // Pair every feature value with its row index and outcome, then sort by value
    // (row index breaks ties so the order is deterministic).
    var sortedRowData = baseData.GetNumericColumnVector(numericFeatureToProcess)
        .AsParallel()
        .Select((val, rowIdx) => new
        {
            RowIdx = rowIdx,
            FeatureValue = val,
            DependentFeatureValue = baseData[rowIdx, dependentFeatureName].FeatureValue
        })
        .OrderBy(elem => elem.FeatureValue)
        .ThenBy(elem => elem.RowIdx)
        .ToList();

    // An empty column has no thresholds; without this guard sortedRowData[0] would throw.
    if (sortedRowData.Count == 0)
    {
        return new Tuple<ISplittingResult, double>(bestSplit, bestSplitQuality);
    }

    var previousClass = sortedRowData[0].DependentFeatureValue;
    var previousFeatureVal = sortedRowData[0].FeatureValue;

    foreach (var rowData in sortedRowData)
    {
        var currentClass = rowData.DependentFeatureValue;
        var currentFeatureVal = rowData.FeatureValue;

        // A threshold is only tried where both the outcome and the feature value changed.
        if (!currentClass.Equals(previousClass) && !currentFeatureVal.Equals(previousFeatureVal))
        {
            var halfWay = (previousFeatureVal + currentFeatureVal) / 2.0;
            var splitParams = new BinarySplittingParams(numericFeatureToProcess, halfWay, dependentFeatureName);
            var splitResult = binaryNumericDataSplitter.SplitData(baseData, splitParams);
            var quality = splitQualityChecker.CalculateSplitQuality(
                initialEntropy,
                totalRowsCount,
                splitResult,
                dependentFeatureName);

            // ">=" means that among equally good thresholds the last one wins.
            if (quality >= bestSplitQuality)
            {
                bestSplitQuality = quality;
                bestSplit = new BinarySplittingResult(true, numericFeatureToProcess, splitResult, halfWay);
            }

            // The remembered class moves only when a threshold was evaluated,
            // while the remembered feature value moves on every row.
            previousClass = currentClass;
        }

        previousFeatureVal = currentFeatureVal;
    }

    return new Tuple<ISplittingResult, double>(bestSplit, bestSplitQuality);
}
/// <summary>
/// Runs a chi-squared test on a split: compares the outcome counts observed in every
/// subset with the counts expected from the outcome proportions of the whole data frame.
/// </summary>
/// <param name="initialDataFrame">Data frame before the split; supplies the expected outcome proportions.</param>
/// <param name="splittingResults">The split to test. Numeric splits are always reported as significant.</param>
/// <param name="dependentFeatureName">Name of the column holding the outcome to predict.</param>
/// <returns>
/// <c>true</c> for numeric splits, or when the upper-tail probability of the statistic is
/// below <c>significanceLevel</c>; otherwise <c>false</c> (also when the degrees of freedom
/// are not a valid distribution parameter).
/// </returns>
public bool IsSplitStatisticallySignificant(
    IDataFrame initialDataFrame,
    ISplittingResult splittingResults,
    string dependentFeatureName)
{
    // Numeric splits are not tested, so skip the grouping work below entirely.
    if (splittingResults.IsSplitNumeric)
    {
        return true;
    }

    var totalRows = (double)initialDataFrame.RowCount;

    // Share of every outcome value in the unsplit data.
    var expectedProportions = initialDataFrame
        .GetColumnVector(dependentFeatureName)
        .Values
        .GroupBy(elem => elem)
        .ToDictionary(grp => grp.Key, grp => grp.Count() / totalRows);

    // The statistic below is a Pearson sum over an (outcomes x subsets) contingency table,
    // whose degrees of freedom are the PRODUCT (rows - 1) * (columns - 1), not the sum.
    var degreesOfFreedom =
        (expectedProportions.Count - 1) * (splittingResults.SplittedDataSets.Count - 1);

    var chisquareStatisticSum = 0.0;
    foreach (var splittingResult in splittingResults.SplittedDataSets)
    {
        var splitSize = splittingResult.SplittedDataFrame.RowCount;
        var observedCounts = splittingResult.SplittedDataFrame
            .GetColumnVector(dependentFeatureName)
            .Values
            .GroupBy(elem => elem)
            .ToDictionary(grp => grp.Key, grp => grp.Count());

        foreach (var expectedProportion in expectedProportions)
        {
            var expectedCount = expectedProportion.Value * splitSize;

            // TryGetValue leaves the count at 0 when the outcome is absent from this subset.
            int actualCount;
            observedCounts.TryGetValue(expectedProportion.Key, out actualCount);

            chisquareStatisticSum += Math.Pow(actualCount - expectedCount, 2) / expectedCount;
        }
    }

    if (!ChiSquared.IsValidParameterSet(degreesOfFreedom))
    {
        return false;
    }

    // Upper-tail probability of observing a statistic at least this large.
    var statisticValue = 1 - ChiSquared.CDF(degreesOfFreedom, chisquareStatisticSum);
    return statisticValue < significanceLevel;
}
/// <summary>
/// Runs a chi-squared test on a split: compares the outcome counts observed in every
/// subset with the counts expected from the outcome proportions of the whole data frame.
/// </summary>
/// <param name="initialDataFrame">Data frame before the split; supplies the expected outcome proportions.</param>
/// <param name="splittingResults">The split to test. Numeric splits are always reported as significant.</param>
/// <param name="dependentFeatureName">Name of the column holding the outcome to predict.</param>
/// <returns>
/// <c>true</c> for numeric splits, or when the upper-tail probability of the statistic is
/// below <c>significanceLevel</c>; otherwise <c>false</c> (also when the degrees of freedom
/// are not a valid distribution parameter).
/// </returns>
public bool IsSplitStatisticallySignificant(
    IDataFrame initialDataFrame,
    ISplittingResult splittingResults,
    string dependentFeatureName)
{
    // Numeric splits are not tested, so skip the grouping work below entirely.
    if (splittingResults.IsSplitNumeric)
    {
        return true;
    }

    var totalRows = (double)initialDataFrame.RowCount;

    // Share of every outcome value in the unsplit data.
    var expectedProportions = initialDataFrame
        .GetColumnVector(dependentFeatureName)
        .Values
        .GroupBy(elem => elem)
        .ToDictionary(grp => grp.Key, grp => grp.Count() / totalRows);

    // The statistic below is a Pearson sum over an (outcomes x subsets) contingency table,
    // whose degrees of freedom are the PRODUCT (rows - 1) * (columns - 1), not the sum.
    var degreesOfFreedom =
        (expectedProportions.Count - 1) * (splittingResults.SplittedDataSets.Count - 1);

    var chisquareStatisticSum = 0.0;
    foreach (var splittingResult in splittingResults.SplittedDataSets)
    {
        var splitSize = splittingResult.SplittedDataFrame.RowCount;
        var observedCounts = splittingResult.SplittedDataFrame
            .GetColumnVector(dependentFeatureName)
            .Values
            .GroupBy(elem => elem)
            .ToDictionary(grp => grp.Key, grp => grp.Count());

        foreach (var expectedProportion in expectedProportions)
        {
            var expectedCount = expectedProportion.Value * splitSize;

            // TryGetValue leaves the count at 0 when the outcome is absent from this subset.
            int actualCount;
            observedCounts.TryGetValue(expectedProportion.Key, out actualCount);

            chisquareStatisticSum += Math.Pow(actualCount - expectedCount, 2) / expectedCount;
        }
    }

    if (!ChiSquared.IsValidParameterSet(degreesOfFreedom))
    {
        return false;
    }

    // Upper-tail probability of observing a statistic at least this large.
    var statisticValue = 1 - ChiSquared.CDF(degreesOfFreedom, chisquareStatisticSum);
    return statisticValue < significanceLevel;
}
/// <summary>
/// Creates the binary parent node for a split. Only binary splitting results are accepted.
/// </summary>
/// <param name="splittingResult">Split to represent; must implement <c>IBinarySplittingResult</c>.</param>
/// <param name="children">Child nodes keyed by the link that leads to them.</param>
/// <returns>A new <c>BinaryDecisionTreeParentNode</c>.</returns>
/// <exception cref="ArgumentException">
/// <paramref name="splittingResult"/> is null or is not a binary splitting result.
/// </exception>
protected override IDecisionTreeNode BuildConcreteDecisionTreeNode(
    ISplittingResult splittingResult,
    ConcurrentDictionary<IDecisionTreeLink, IDecisionTreeNode> children)
{
    var binarySplit = splittingResult as IBinarySplittingResult;
    if (binarySplit != null)
    {
        return new BinaryDecisionTreeParentNode(
            false,
            splittingResult.SplittingFeatureName,
            children,
            binarySplit.SplittingValue,
            binarySplit.IsSplitNumeric);
    }

    throw new ArgumentException("Invalid split results passed to binary decision tree builder");
}
/// <summary>
/// Evaluates every column except the dependent one and returns the split with the
/// strictly highest quality.
/// </summary>
/// <param name="baseData">Data frame whose columns are candidate split attributes.</param>
/// <param name="dependentFeatureName">Name of the outcome column; excluded from the candidates.</param>
/// <param name="splitQualityChecker">Supplies the initial entropy and scores the candidate splits.</param>
/// <param name="alreadyUsedAttributesInfo">Forwarded to the categorical split evaluation only.</param>
/// <returns>The best split found, or <c>null</c> when no candidate scored above negative infinity.</returns>
public ISplittingResult SelectBestSplit(
    IDataFrame baseData,
    string dependentFeatureName,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    ISplittingResult winner = null;
    double winnerQuality = double.NegativeInfinity;
    double initialEntropy = splitQualityChecker.GetInitialEntropy(baseData, dependentFeatureName);
    var candidateAttributes = baseData.ColumnNames.Except(new[] { dependentFeatureName });

    foreach (var candidate in candidateAttributes)
    {
        if (!baseData.GetColumnType(candidate).TypeIsNumeric())
        {
            var categoricalOutcome = EvaluateCategoricalSplit(
                baseData,
                dependentFeatureName,
                candidate,
                winnerQuality,
                initialEntropy,
                splitQualityChecker,
                alreadyUsedAttributesInfo);

            if (categoricalOutcome.Item3 > winnerQuality)
            {
                winner = BuildBestSplitObject(categoricalOutcome.Item2, categoricalOutcome.Item1);
                winnerQuality = categoricalOutcome.Item3;
            }

            continue;
        }

        // TODO: add checking for the already used attributes
        var numericOutcome = BinaryNumericBestSplitingPointSelector.FindBestSplitPoint(
            baseData,
            dependentFeatureName,
            candidate,
            splitQualityChecker,
            BinaryNumericDataSplitter,
            initialEntropy);

        // Strict comparison: on a tie the earlier candidate is kept.
        if (numericOutcome.Item2 > winnerQuality)
        {
            winnerQuality = numericOutcome.Item2;
            winner = numericOutcome.Item1;
        }
    }

    return winner;
}
/// <summary>
/// Creates the binary parent node for a split. Only binary splitting results are accepted.
/// </summary>
/// <param name="splittingResult">Split to represent; must implement <c>IBinarySplittingResult</c>.</param>
/// <param name="children">Child nodes keyed by the link that leads to them.</param>
/// <returns>A new <c>BinaryDecisionTreeParentNode</c>.</returns>
/// <exception cref="ArgumentException">
/// <paramref name="splittingResult"/> is null or is not a binary splitting result.
/// </exception>
protected override IDecisionTreeNode BuildConcreteDecisionTreeNode(
    ISplittingResult splittingResult,
    ConcurrentDictionary<IDecisionTreeLink, IDecisionTreeNode> children)
{
    var binarySplit = splittingResult as IBinarySplittingResult;
    if (binarySplit != null)
    {
        return new BinaryDecisionTreeParentNode(
            false,
            splittingResult.SplittingFeatureName,
            children,
            binarySplit.SplittingValue,
            binarySplit.IsSplitNumeric);
    }

    throw new ArgumentException("Invalid split results passed to binary decision tree builder");
}
/// <summary>
/// Creates the parent node for a split: a binary parent node when the split is a binary
/// splitting result, a plain decision tree parent node otherwise.
/// </summary>
/// <param name="splittingResult">Split to represent.</param>
/// <param name="children">Child nodes keyed by the link that leads to them.</param>
/// <returns>The newly created parent node.</returns>
protected override IDecisionTreeNode BuildConcreteDecisionTreeNode(
    ISplittingResult splittingResult,
    ConcurrentDictionary<IDecisionTreeLink, IDecisionTreeNode> children)
{
    var binarySplit = splittingResult as IBinarySplittingResult;
    if (binarySplit == null)
    {
        return new DecisionTreeParentNode(
            false,
            splittingResult.SplittingFeatureName,
            children);
    }

    return new BinaryDecisionTreeParentNode(
        false,
        splittingResult.SplittingFeatureName,
        children,
        binarySplit.SplittingValue,
        binarySplit.IsSplitNumeric);
}
/// <summary>
/// Creates the parent node for a split: a binary parent node when the split is a binary
/// splitting result, a plain decision tree parent node otherwise.
/// </summary>
/// <param name="splittingResult">Split to represent.</param>
/// <param name="children">Child nodes keyed by the link that leads to them.</param>
/// <returns>The newly created parent node.</returns>
protected override IDecisionTreeNode BuildConcreteDecisionTreeNode(
    ISplittingResult splittingResult,
    ConcurrentDictionary<IDecisionTreeLink, IDecisionTreeNode> children)
{
    var binarySplit = splittingResult as IBinarySplittingResult;
    if (binarySplit == null)
    {
        return new DecisionTreeParentNode(
            false,
            splittingResult.SplittingFeatureName,
            children);
    }

    return new BinaryDecisionTreeParentNode(
        false,
        splittingResult.SplittingFeatureName,
        children,
        binarySplit.SplittingValue,
        binarySplit.IsSplitNumeric);
}
/// <summary>
/// Tells whether a split is unusable: it is missing altogether, or at least one of its
/// subsets is null, has no data frame, or has a data frame with zero rows.
/// </summary>
/// <param name="splitResult">Split to inspect; may be null.</param>
/// <returns><c>true</c> when the split is null or contains an empty subset.</returns>
private static bool SplitIsEmpty(ISplittingResult splitResult)
{
    if (splitResult == null)
    {
        return true;
    }

    return splitResult.SplittedDataSets.Any(subset =>
    {
        var frame = subset?.SplittedDataFrame;
        return frame == null || frame.RowCount == 0;
    });
}
/// <summary>
/// Creates the tree node that represents a split. Abstract: each derived builder
/// decides which concrete node type to return.
/// </summary>
/// <param name="splittingResult">The split the node is created for.</param>
/// <param name="children">
/// Nodes keyed by decision tree link (presumably the child subtrees of the new node;
/// confirm against the implementations).
/// </param>
/// <returns>The node created for the split.</returns>
protected abstract IDecisionTreeNode BuildConcreteDecisionTreeNode( ISplittingResult splittingResult, ConcurrentDictionary<IDecisionTreeLink, IDecisionTreeNode> children);
/// <summary>
/// Creates the tree node that represents a split. Abstract: each derived builder
/// decides which concrete node type to return.
/// </summary>
/// <param name="splittingResult">The split the node is created for.</param>
/// <param name="children">
/// Nodes keyed by decision tree link (presumably the child subtrees of the new node;
/// confirm against the implementations).
/// </param>
/// <returns>The node created for the split.</returns>
protected abstract IDecisionTreeNode BuildConcreteDecisionTreeNode( ISplittingResult splittingResult, ConcurrentDictionary <IDecisionTreeLink, IDecisionTreeNode> children);
/// <summary>
/// Tells whether a split is unusable: it is missing altogether, or at least one of its
/// subsets is null, has no data frame, or has a data frame with zero rows.
/// </summary>
/// <param name="splitResult">Split to inspect; may be null.</param>
/// <returns><c>true</c> when the split is null or contains an empty subset.</returns>
private static bool SplitIsEmpty(ISplittingResult splitResult)
{
    if (splitResult == null)
    {
        return true;
    }

    // "Some subset is empty" expressed as "not every subset holds at least one row".
    var everySubsetHasRows = splitResult.SplittedDataSets.All(
        subset => subset?.SplittedDataFrame != null && subset.SplittedDataFrame.RowCount != 0);

    return !everySubsetHasRows;
}