/// <summary>
/// Evaluates a multi-way categorical split of <paramref name="dataToSplit"/> on
/// <paramref name="splittingFeatureName"/>.
/// </summary>
/// <param name="dataToSplit">Data frame to split.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <param name="splittingFeatureName">Name of the categorical column to split on.</param>
/// <param name="bestSplitQualitySoFar">Not read by this implementation.</param>
/// <param name="initialEntropy">Entropy value forwarded to the quality checker.</param>
/// <param name="splitQualityChecker">Computes the quality of the produced split.</param>
/// <param name="alreadyUsedAttributesInfo">Consulted to skip attributes that were already used.</param>
/// <returns>
/// A tuple of (split subsets, splitting params, split quality). When the attribute was already used,
/// or the split yields exactly one subset, the subsets list is empty and the quality is
/// <see cref="double.NegativeInfinity"/>.
/// </returns>
protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
    IDataFrame dataToSplit,
    string dependentFeatureName,
    string splittingFeatureName,
    double bestSplitQualitySoFar,
    double initialEntropy,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    bool attributeIsSpent = alreadyUsedAttributesInfo.WasAttributeAlreadyUsed(splittingFeatureName);
    if (attributeIsSpent)
    {
        return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
            new List<ISplittedData>(),
            new SplittingParams(splittingFeatureName, dependentFeatureName),
            double.NegativeInfinity);
    }

    var rowsInFrame = dataToSplit.RowCount;
    var multiWayParams = new SplittingParams(splittingFeatureName, dependentFeatureName);
    var subsets = CategoricalDataSplitter.SplitData(dataToSplit, multiWayParams);

    // A single subset means the feature does not separate the data at all.
    if (subsets.Count == 1)
    {
        return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
            new List<ISplittedData>(),
            multiWayParams,
            double.NegativeInfinity);
    }

    var quality = splitQualityChecker.CalculateSplitQuality(
        initialEntropy,
        rowsInFrame,
        subsets,
        dependentFeatureName);

    return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(subsets, multiWayParams, quality);
}
/// <summary>
/// Evaluates a multi-way categorical split of <paramref name="dataToSplit"/> on
/// <paramref name="splittingFeatureName"/>.
/// </summary>
/// <param name="dataToSplit">Data frame to split.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <param name="splittingFeatureName">Name of the categorical column to split on.</param>
/// <param name="bestSplitQualitySoFar">Not read by this implementation.</param>
/// <param name="initialEntropy">Entropy value forwarded to the quality checker.</param>
/// <param name="splitQualityChecker">Computes the quality of the produced split.</param>
/// <param name="alreadyUsedAttributesInfo">Consulted to skip attributes that were already used.</param>
/// <returns>
/// A tuple of (split subsets, splitting params, split quality). When the attribute was already used,
/// or the split yields exactly one subset, the subsets list is empty and the quality is
/// <see cref="double.NegativeInfinity"/>.
/// </returns>
protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
    IDataFrame dataToSplit,
    string dependentFeatureName,
    string splittingFeatureName,
    double bestSplitQualitySoFar,
    double initialEntropy,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    bool attributeIsSpent = alreadyUsedAttributesInfo.WasAttributeAlreadyUsed(splittingFeatureName);
    if (attributeIsSpent)
    {
        return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
            new List<ISplittedData>(),
            new SplittingParams(splittingFeatureName, dependentFeatureName),
            double.NegativeInfinity);
    }

    var rowsInFrame = dataToSplit.RowCount;
    var multiWayParams = new SplittingParams(splittingFeatureName, dependentFeatureName);
    var subsets = CategoricalDataSplitter.SplitData(dataToSplit, multiWayParams);

    // A single subset means the feature does not separate the data at all.
    if (subsets.Count == 1)
    {
        return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
            new List<ISplittedData>(),
            multiWayParams,
            double.NegativeInfinity);
    }

    var quality = splitQualityChecker.CalculateSplitQuality(
        initialEntropy,
        rowsInFrame,
        subsets,
        dependentFeatureName);

    return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(subsets, multiWayParams, quality);
}
// TODO: AAA make it nicer - maybe encapsulate the Tuple in a dedicated DTO.
/// <summary>
/// Evaluates splitting <paramref name="dataToSplit"/> on the categorical feature
/// <paramref name="splittingFeatureName"/>.
/// </summary>
/// <param name="dataToSplit">Data frame to split.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <param name="splittingFeatureName">Name of the categorical column to split on.</param>
/// <param name="bestSplitQualitySoFar">Best split quality found by the caller so far.</param>
/// <param name="initialEntropy">Entropy of the data before splitting.</param>
/// <param name="splitQualityChecker">Checker used to score candidate splits.</param>
/// <param name="alredyUsedAttributesInfo">Information about attributes already used for splitting.</param>
/// <returns>
/// A tuple whose items are, in order: the split subsets, the splitting params and the split quality.
/// </returns>
protected abstract Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
    IDataFrame dataToSplit,
    string dependentFeatureName,
    string splittingFeatureName,
    double bestSplitQualitySoFar,
    double initialEntropy,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alredyUsedAttributesInfo);
/// <summary>
/// Builds the tree node for <paramref name="dataFrame"/>: a leaf when no further split is made,
/// otherwise a decision node whose children are produced through <c>AddChildFromSplit</c>.
/// </summary>
/// <param name="dataFrame">Data reaching this node.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <param name="additionalParams">Builder parameters; used for the depth-limit check and the pruning flag.</param>
/// <param name="alreadyUsedAttributesInfo">Forwarded to the best split selector and to every child build.</param>
/// <param name="treeDepth">Depth of this node; children are built with <c>treeDepth + 1</c>.</param>
/// <param name="isFirstSplit">When true the children are built with <c>Parallel.ForEach</c>, otherwise sequentially.</param>
/// <returns>
/// A leaf when the dependent column holds exactly one distinct value, the maximal depth was reached,
/// the selected split is empty, or the pruning heuristic rejects the split; otherwise the node
/// returned by <c>BuildConcreteDecisionTreeNode</c>.
/// </returns>
protected virtual IDecisionTreeNode BuildDecisionNode(
    IDataFrame dataFrame,
    string dependentFeatureName,
    IDecisionTreeModelBuilderParams additionalParams,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo,
    int treeDepth,
    bool isFirstSplit = false)
{
    var outcomeValues = dataFrame.GetColumnVector<object>(dependentFeatureName).DataItems;
    bool singleOutcome = outcomeValues.Distinct().Count() == 1;
    if (singleOutcome || MaximalTreeDepthHasBeenReached(additionalParams, treeDepth))
    {
        return BuildLeaf(dataFrame, dependentFeatureName);
    }

    // TODO: later on add additional params indicating which features were already used
    ISplittingResult chosenSplit = BestSplitSelector.SelectBestSplit(
        dataFrame,
        dependentFeatureName,
        SplitQualityChecker,
        alreadyUsedAttributesInfo);
    if (SplitIsEmpty(chosenSplit))
    {
        return BuildLeaf(dataFrame, dependentFeatureName);
    }

    // The significance check only runs when it is both requested and a checker is available.
    bool pruningEnabled = additionalParams.UsePrunningHeuristicDuringTreeBuild
        && this.StatisticalSignificanceChecker != null;
    if (pruningEnabled
        && !StatisticalSignificanceChecker.IsSplitStatisticallySignificant(dataFrame, chosenSplit, dependentFeatureName))
    {
        return BuildLeaf(dataFrame, dependentFeatureName);
    }

    var childNodes = new ConcurrentDictionary<IDecisionTreeLink, IDecisionTreeNode>();
    int childDepth = treeDepth + 1;
    if (!isFirstSplit)
    {
        foreach (var subset in chosenSplit.SplittedDataSets)
        {
            this.AddChildFromSplit(dependentFeatureName, additionalParams, subset, childNodes, alreadyUsedAttributesInfo, childDepth);
        }
    }
    else
    {
        // NOTE(review): the same alreadyUsedAttributesInfo instance is handed to every parallel
        // worker - confirm it is safe for concurrent use.
        Parallel.ForEach(
            chosenSplit.SplittedDataSets,
            subset =>
            {
                this.AddChildFromSplit(dependentFeatureName, additionalParams, subset, childNodes, alreadyUsedAttributesInfo, childDepth);
            });
    }

    return BuildConcreteDecisionTreeNode(chosenSplit, childNodes);
}
/// <summary>
/// Adapter overload: verifies that <paramref name="splitQualityChecker"/> is an
/// <c>INumericalSplitQualityChecker</c> and forwards to the overload that accepts that type.
/// </summary>
/// <param name="baseData">Data frame to search for the best split.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <param name="splitQualityChecker">Quality checker; must implement <c>INumericalSplitQualityChecker</c>.</param>
/// <param name="alreadyUsedAttributesInfo">Forwarded unchanged to the numerical overload.</param>
/// <returns>The result of the numerical overload.</returns>
/// <exception cref="ArgumentException">
/// <paramref name="splitQualityChecker"/> is not an <c>INumericalSplitQualityChecker</c> (this includes null).
/// </exception>
public ISplittingResult SelectBestSplit(
    IDataFrame baseData,
    string dependentFeatureName,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    var numericalChecker = splitQualityChecker as INumericalSplitQualityChecker;
    if (numericalChecker == null)
    {
        throw new ArgumentException("Invalid split quality checker for numerical outcome");
    }

    return SelectBestSplit(baseData, dependentFeatureName, numericalChecker, alreadyUsedAttributesInfo);
}
/// <summary>
/// Adapter overload: verifies that <paramref name="splitQualityChecker"/> is an
/// <c>INumericalSplitQualityChecker</c> and forwards to the overload that accepts that type.
/// </summary>
/// <param name="baseData">Data frame to search for the best split.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <param name="splitQualityChecker">Quality checker; must implement <c>INumericalSplitQualityChecker</c>.</param>
/// <param name="alreadyUsedAttributesInfo">Forwarded unchanged to the numerical overload.</param>
/// <returns>The result of the numerical overload.</returns>
/// <exception cref="ArgumentException">
/// <paramref name="splitQualityChecker"/> is not an <c>INumericalSplitQualityChecker</c> (this includes null).
/// </exception>
public ISplittingResult SelectBestSplit(
    IDataFrame baseData,
    string dependentFeatureName,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    var numericalChecker = splitQualityChecker as INumericalSplitQualityChecker;
    if (numericalChecker == null)
    {
        throw new ArgumentException("Invalid split quality checker for numerical outcome");
    }

    return SelectBestSplit(baseData, dependentFeatureName, numericalChecker, alreadyUsedAttributesInfo);
}
/// <summary>
/// Evaluates binary ("equals value" style) splits of <paramref name="dataToSplit"/> on
/// <paramref name="splittingFeatureName"/>, trying each distinct value of that column that was not
/// already used, and keeps the one with the highest quality.
/// </summary>
/// <param name="dataToSplit">Data frame to split.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <param name="splittingFeatureName">Name of the categorical column to split on.</param>
/// <param name="bestSplitQualitySoFar">Not read by this implementation.</param>
/// <param name="initialEntropy">Entropy value forwarded to the quality checker.</param>
/// <param name="splitQualityChecker">Computes the quality of each candidate split.</param>
/// <param name="alredyUsedAttributesInfo">Consulted to skip (feature, value) pairs that were already used.</param>
/// <returns>
/// A tuple of (split subsets, splitting params, split quality) for the best candidate. If any candidate
/// yields exactly one subset, the method returns immediately with an empty list and
/// <see cref="double.NegativeInfinity"/>. If no candidate was evaluated, the subsets and params are
/// null and the quality is <see cref="double.NegativeInfinity"/>.
/// </returns>
protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
    IDataFrame dataToSplit,
    string dependentFeatureName,
    string splittingFeatureName,
    double bestSplitQualitySoFar,
    double initialEntropy,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alredyUsedAttributesInfo)
{
    var rowsInFrame = dataToSplit.RowCount;

    double topQuality = double.NegativeInfinity;
    IBinarySplittingParams topParams = null;
    IList<ISplittedData> topSubsets = null;

    foreach (var candidateValue in dataToSplit.GetColumnVector(splittingFeatureName).Distinct())
    {
        if (alredyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(splittingFeatureName, candidateValue))
        {
            continue;
        }

        var candidateParams = new BinarySplittingParams(splittingFeatureName, candidateValue, dependentFeatureName);
        var candidateSubsets = CategoricalDataSplitter.SplitData(dataToSplit, candidateParams);

        // A single subset means this value does not separate the data; give up on the attribute.
        if (candidateSubsets.Count == 1)
        {
            return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
                new List<ISplittedData>(),
                candidateParams,
                double.NegativeInfinity);
        }

        var candidateQuality = splitQualityChecker.CalculateSplitQuality(
            initialEntropy,
            rowsInFrame,
            candidateSubsets,
            dependentFeatureName);
        if (candidateQuality > topQuality)
        {
            topQuality = candidateQuality;
            topSubsets = candidateSubsets;
            topParams = candidateParams;
        }
    }

    return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(topSubsets, topParams, topQuality);
}
/// <summary>
/// Picks the best split of <paramref name="baseData"/> across all columns except
/// <paramref name="dependentFeatureName"/>. Numeric columns are evaluated through
/// <c>BinaryNumericBestSplitingPointSelector</c>, all other columns through
/// <c>EvaluateCategoricalSplit</c>.
/// </summary>
/// <param name="baseData">Data frame to search for the best split.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <param name="splitQualityChecker">Supplies the initial entropy and scores candidate splits.</param>
/// <param name="alreadyUsedAttributesInfo">Forwarded to the categorical evaluation only.</param>
/// <returns>The split with the strictly highest quality, or null when no candidate beat negative infinity.</returns>
public ISplittingResult SelectBestSplit(
    IDataFrame baseData,
    string dependentFeatureName,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    double topQuality = double.NegativeInfinity;
    ISplittingResult winner = null;
    double initialEntropy = splitQualityChecker.GetInitialEntropy(baseData, dependentFeatureName);

    var candidateColumns = baseData.ColumnNames.Except(new[] { dependentFeatureName });
    foreach (var column in candidateColumns)
    {
        if (!baseData.GetColumnType(column).TypeIsNumeric())
        {
            var categoricalCandidate = EvaluateCategoricalSplit(
                baseData,
                dependentFeatureName,
                column,
                topQuality,
                initialEntropy,
                splitQualityChecker,
                alreadyUsedAttributesInfo);
            if (categoricalCandidate.Item3 > topQuality)
            {
                winner = BuildBestSplitObject(categoricalCandidate.Item2, categoricalCandidate.Item1);
                topQuality = categoricalCandidate.Item3;
            }

            continue;
        }

        // TODO: add checking for the already used attributes
        var numericCandidate = BinaryNumericBestSplitingPointSelector.FindBestSplitPoint(
            baseData,
            dependentFeatureName,
            column,
            splitQualityChecker,
            BinaryNumericDataSplitter,
            initialEntropy);
        if (numericCandidate.Item2 > topQuality)
        {
            topQuality = numericCandidate.Item2;
            winner = numericCandidate.Item1;
        }
    }

    return winner;
}
/// <summary>
/// Picks the best split of <paramref name="baseData"/> across all columns except
/// <paramref name="dependentFeatureName"/>. Numeric columns are evaluated through
/// <c>BinaryNumericBestSplitingPointSelector</c>, all other columns through
/// <c>EvaluateCategoricalSplit</c>.
/// </summary>
/// <param name="baseData">Data frame to search for the best split.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <param name="splitQualityChecker">Supplies the initial entropy and scores candidate splits.</param>
/// <param name="alreadyUsedAttributesInfo">Forwarded to the categorical evaluation only.</param>
/// <returns>The split with the strictly highest quality, or null when no candidate beat negative infinity.</returns>
public ISplittingResult SelectBestSplit(
    IDataFrame baseData,
    string dependentFeatureName,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    double topQuality = double.NegativeInfinity;
    ISplittingResult winner = null;
    double initialEntropy = splitQualityChecker.GetInitialEntropy(baseData, dependentFeatureName);

    var candidateColumns = baseData.ColumnNames.Except(new[] { dependentFeatureName });
    foreach (var column in candidateColumns)
    {
        if (!baseData.GetColumnType(column).TypeIsNumeric())
        {
            var categoricalCandidate = EvaluateCategoricalSplit(
                baseData,
                dependentFeatureName,
                column,
                topQuality,
                initialEntropy,
                splitQualityChecker,
                alreadyUsedAttributesInfo);
            if (categoricalCandidate.Item3 > topQuality)
            {
                winner = BuildBestSplitObject(categoricalCandidate.Item2, categoricalCandidate.Item1);
                topQuality = categoricalCandidate.Item3;
            }

            continue;
        }

        // TODO: add checking for the already used attributes
        var numericCandidate = BinaryNumericBestSplitingPointSelector.FindBestSplitPoint(
            baseData,
            dependentFeatureName,
            column,
            splitQualityChecker,
            BinaryNumericDataSplitter,
            initialEntropy);
        if (numericCandidate.Item2 > topQuality)
        {
            topQuality = numericCandidate.Item2;
            winner = numericCandidate.Item1;
        }
    }

    return winner;
}
/// <summary>
/// Evaluates binary ("equals value" style) splits of <paramref name="dataToSplit"/> on
/// <paramref name="splittingFeatureName"/>, trying each distinct value of that column that was not
/// already used, and keeps the one with the highest quality.
/// </summary>
/// <param name="dataToSplit">Data frame to split.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <param name="splittingFeatureName">Name of the categorical column to split on.</param>
/// <param name="bestSplitQualitySoFar">Not read by this implementation.</param>
/// <param name="initialEntropy">Entropy value forwarded to the quality checker.</param>
/// <param name="splitQualityChecker">Computes the quality of each candidate split.</param>
/// <param name="alredyUsedAttributesInfo">Consulted to skip (feature, value) pairs that were already used.</param>
/// <returns>
/// A tuple of (split subsets, splitting params, split quality) for the best candidate. If any candidate
/// yields exactly one subset, the method returns immediately with an empty list and
/// <see cref="double.NegativeInfinity"/>. If no candidate was evaluated, the subsets and params are
/// null and the quality is <see cref="double.NegativeInfinity"/>.
/// </returns>
protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
    IDataFrame dataToSplit,
    string dependentFeatureName,
    string splittingFeatureName,
    double bestSplitQualitySoFar,
    double initialEntropy,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alredyUsedAttributesInfo)
{
    var rowsInFrame = dataToSplit.RowCount;

    double topQuality = double.NegativeInfinity;
    IBinarySplittingParams topParams = null;
    IList<ISplittedData> topSubsets = null;

    foreach (var candidateValue in dataToSplit.GetColumnVector(splittingFeatureName).Distinct())
    {
        if (alredyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(splittingFeatureName, candidateValue))
        {
            continue;
        }

        var candidateParams = new BinarySplittingParams(splittingFeatureName, candidateValue, dependentFeatureName);
        var candidateSubsets = CategoricalDataSplitter.SplitData(dataToSplit, candidateParams);

        // A single subset means this value does not separate the data; give up on the attribute.
        if (candidateSubsets.Count == 1)
        {
            return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
                new List<ISplittedData>(),
                candidateParams,
                double.NegativeInfinity);
        }

        var candidateQuality = splitQualityChecker.CalculateSplitQuality(
            initialEntropy,
            rowsInFrame,
            candidateSubsets,
            dependentFeatureName);
        if (candidateQuality > topQuality)
        {
            topQuality = candidateQuality;
            topSubsets = candidateSubsets;
            topParams = candidateParams;
        }
    }

    return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(topSubsets, topParams, topQuality);
}
/// <summary>
/// Builds the subtree for one split subset and registers it in <paramref name="children"/> under
/// the subset's split link.
/// </summary>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <param name="additionalParams">Builder parameters forwarded to <c>BuildDecisionNode</c>.</param>
/// <param name="splitData">Subset whose data frame becomes the child's data and whose link keys the child.</param>
/// <param name="children">Dictionary receiving the built child; an existing entry for the same link is left untouched.</param>
/// <param name="alreadyUsedAttributesInfo">Forwarded to <c>BuildDecisionNode</c>.</param>
/// <param name="treeDepth">Depth passed to <c>BuildDecisionNode</c> for the child.</param>
protected virtual void AddChildFromSplit(
    string dependentFeatureName,
    IDecisionTreeModelBuilderParams additionalParams,
    ISplittedData splitData,
    ConcurrentDictionary<IDecisionTreeLink, IDecisionTreeNode> children,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo,
    int treeDepth)
{
    var childFrame = splitData.SplittedDataFrame;
    var childNode = BuildDecisionNode(
        childFrame,
        dependentFeatureName,
        additionalParams,
        alreadyUsedAttributesInfo,
        treeDepth);

    // TryAdd result is intentionally not inspected, as in the original flow.
    children.TryAdd(splitData.SplitLink, childNode);
}
/// <summary>
/// Records the feature named by <paramref name="splittingParams"/> as already used.
/// </summary>
/// <param name="splittingParams">Params of the split that was just applied; only <c>SplitOnFeature</c> is read.</param>
/// <param name="alreadyUsedAttributesInfo">Registry that receives the used feature name.</param>
protected override void UpdateAlreadyUsedAttributes(
    ISplittingParams splittingParams,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    var usedFeature = splittingParams.SplitOnFeature;
    alreadyUsedAttributesInfo.AddAlreadyUsedAttribute(usedFeature);
}
/// <summary>
/// Records the (feature, value) pair of a binary split as already used.
/// </summary>
/// <param name="splittingParams">
/// Params of the split that was just applied; must implement <c>IBinarySplittingParams</c>.
/// </param>
/// <param name="alreadyUsedAttributesInfo">Registry that receives the used feature name and split value.</param>
/// <exception cref="System.ArgumentException">
/// <paramref name="splittingParams"/> is null or does not implement <c>IBinarySplittingParams</c>.
/// </exception>
protected override void UpdateAlreadyUsedAttributes(
    ISplittingParams splittingParams,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    var binarySplittingParams = splittingParams as IBinarySplittingParams;
    if (binarySplittingParams == null)
    {
        // Fail with a descriptive error instead of a NullReferenceException on the line below.
        throw new System.ArgumentException(
            "Splitting params must implement IBinarySplittingParams to record the used split value.",
            "splittingParams");
    }

    alreadyUsedAttributesInfo.AddAlreadyUsedAttribute(
        splittingParams.SplitOnFeature,
        binarySplittingParams.SplitOnValue);
}
/// <summary>
/// Records in <paramref name="alreadyUsedAttributesInfo"/> what the split described by
/// <paramref name="splittingParams"/> has used. What exactly is recorded is up to the implementation.
/// </summary>
/// <param name="splittingParams">Params of the split to record.</param>
/// <param name="alreadyUsedAttributesInfo">Registry of already used attributes to update.</param>
protected abstract void UpdateAlreadyUsedAttributes( ISplittingParams splittingParams, IAlredyUsedAttributesInfo alreadyUsedAttributesInfo);
/// <summary>
/// Builds the tree node for <paramref name="dataFrame"/>: a leaf when no further split is made,
/// otherwise a decision node whose children are produced through <c>AddChildFromSplit</c>.
/// </summary>
/// <param name="dataFrame">Data reaching this node.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <param name="additionalParams">Builder parameters; used for the depth-limit check and the pruning flag.</param>
/// <param name="alreadyUsedAttributesInfo">Forwarded to the best split selector and to every child build.</param>
/// <param name="treeDepth">Depth of this node; children are built with <c>treeDepth + 1</c>.</param>
/// <param name="isFirstSplit">When true the children are built with <c>Parallel.ForEach</c>, otherwise sequentially.</param>
/// <returns>
/// A leaf when the dependent column holds exactly one distinct value, the maximal depth was reached,
/// the selected split is empty, or the pruning heuristic rejects the split; otherwise the node
/// returned by <c>BuildConcreteDecisionTreeNode</c>.
/// </returns>
protected virtual IDecisionTreeNode BuildDecisionNode(
    IDataFrame dataFrame,
    string dependentFeatureName,
    IDecisionTreeModelBuilderParams additionalParams,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo,
    int treeDepth,
    bool isFirstSplit = false)
{
    var outcomeValues = dataFrame.GetColumnVector<object>(dependentFeatureName).DataItems;
    bool singleOutcome = outcomeValues.Distinct().Count() == 1;
    if (singleOutcome || MaximalTreeDepthHasBeenReached(additionalParams, treeDepth))
    {
        return BuildLeaf(dataFrame, dependentFeatureName);
    }

    // TODO: later on add additional params indicating which features were already used
    ISplittingResult chosenSplit = BestSplitSelector.SelectBestSplit(
        dataFrame,
        dependentFeatureName,
        SplitQualityChecker,
        alreadyUsedAttributesInfo);
    if (SplitIsEmpty(chosenSplit))
    {
        return BuildLeaf(dataFrame, dependentFeatureName);
    }

    // The significance check only runs when it is both requested and a checker is available.
    bool pruningEnabled = additionalParams.UsePrunningHeuristicDuringTreeBuild
        && this.StatisticalSignificanceChecker != null;
    if (pruningEnabled
        && !StatisticalSignificanceChecker.IsSplitStatisticallySignificant(dataFrame, chosenSplit, dependentFeatureName))
    {
        return BuildLeaf(dataFrame, dependentFeatureName);
    }

    var childNodes = new ConcurrentDictionary<IDecisionTreeLink, IDecisionTreeNode>();
    int childDepth = treeDepth + 1;
    if (!isFirstSplit)
    {
        foreach (var subset in chosenSplit.SplittedDataSets)
        {
            this.AddChildFromSplit(dependentFeatureName, additionalParams, subset, childNodes, alreadyUsedAttributesInfo, childDepth);
        }
    }
    else
    {
        // NOTE(review): the same alreadyUsedAttributesInfo instance is handed to every parallel
        // worker - confirm it is safe for concurrent use.
        Parallel.ForEach(
            chosenSplit.SplittedDataSets,
            subset =>
            {
                this.AddChildFromSplit(dependentFeatureName, additionalParams, subset, childNodes, alreadyUsedAttributesInfo, childDepth);
            });
    }

    return BuildConcreteDecisionTreeNode(chosenSplit, childNodes);
}
// TODO: AAA make it nicer - maybe encapsulate the Tuple in a dedicated DTO.
/// <summary>
/// Evaluates splitting <paramref name="dataToSplit"/> on the categorical feature
/// <paramref name="splittingFeatureName"/>.
/// </summary>
/// <param name="dataToSplit">Data frame to split.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <param name="splittingFeatureName">Name of the categorical column to split on.</param>
/// <param name="bestSplitQualitySoFar">Best split quality found by the caller so far.</param>
/// <param name="initialEntropy">Entropy of the data before splitting.</param>
/// <param name="splitQualityChecker">Checker used to score candidate splits.</param>
/// <param name="alredyUsedAttributesInfo">Information about attributes already used for splitting.</param>
/// <returns>
/// A tuple whose items are, in order: the split subsets, the splitting params and the split quality.
/// </returns>
protected abstract Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
    IDataFrame dataToSplit,
    string dependentFeatureName,
    string splittingFeatureName,
    double bestSplitQualitySoFar,
    double initialEntropy,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alredyUsedAttributesInfo);
/// <summary>
/// Searches every column except <paramref name="dependentFeatureName"/> for the binary numeric split
/// point with the highest quality and splits <paramref name="baseData"/> on it.
/// </summary>
/// <remarks>
/// For each feature the rows are ordered by feature value; every boundary between two consecutive
/// distinct values is a candidate, with the split point placed at the midpoint of the two values.
/// Candidates already recorded in <paramref name="alreadyUsedAttributesInfo"/> are skipped.
/// </remarks>
/// <param name="baseData">Data frame to search; feature columns are read as numeric vectors.</param>
/// <param name="dependentFeatureName">Name of the dependent column; its values are cast to double.</param>
/// <param name="splitQualityChecker">Supplies the initial entropy and scores candidate splits.</param>
/// <param name="alreadyUsedAttributesInfo">Consulted to skip (feature, split point) pairs already used.</param>
/// <returns>The splitting result for the best candidate, or null when no candidate was found.</returns>
public ISplittingResult SelectBestSplit(
    IDataFrame baseData,
    string dependentFeatureName,
    INumericalSplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    var bestSplitQuality = double.NegativeInfinity;
    var initialEntropy = splitQualityChecker.GetInitialEntropy(baseData, dependentFeatureName);
    Tuple<string, double> bestSplit = null;

    var featureColumns = baseData.ColumnNames.Except(new[] { dependentFeatureName });
    foreach (var feature in featureColumns)
    {
        // Pair each feature value with the dependent value of the same row, then order by feature value.
        var dataOrderedByFeature = baseData.GetNumericColumnVector(feature)
            .Select((rowVal, idx) => new Tuple<double, double>(
                rowVal,
                (double)baseData[idx, dependentFeatureName].FeatureValue))
            .OrderBy(tpl => tpl.Item1)
            .ToList();

        // Fewer than two rows leave no boundary to split on (and an empty list has no first element).
        if (dataOrderedByFeature.Count < 2)
        {
            continue;
        }

        var dependentFeatureValuesOrdered = dataOrderedByFeature.Select(elem => elem.Item2).ToList();
        var previousFeatureValue = dataOrderedByFeature[0].Item1;

        // Start at 1 and include the last row so that the boundary between the two largest
        // values is also considered as a candidate.
        for (int i = 1; i < dataOrderedByFeature.Count; i++)
        {
            var currentFeatureValue = dataOrderedByFeature[i].Item1;
            if (currentFeatureValue != previousFeatureValue)
            {
                var splitPoint = (currentFeatureValue + previousFeatureValue) / 2.0;
                if (!alreadyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(feature, splitPoint))
                {
                    // Rows [0, i) fall below the split point, rows [i, Count) above it.
                    var dependentValsBelow = dependentFeatureValuesOrdered.Take(i).ToList();
                    var dependentValsAbove = dependentFeatureValuesOrdered.Skip(i).ToList();
                    var splitQuality = splitQualityChecker.CalculateSplitQuality(
                        initialEntropy,
                        baseData.RowCount,
                        new[] { dependentValsBelow, dependentValsAbove });
                    if (splitQuality > bestSplitQuality)
                    {
                        bestSplitQuality = splitQuality;
                        bestSplit = new Tuple<string, double>(feature, splitPoint);
                    }
                }
            }

            previousFeatureValue = currentFeatureValue;
        }
    }

    if (bestSplit == null)
    {
        return null;
    }

    var splittedData = binaryNumericDataSplitter.SplitData(
        baseData,
        new BinarySplittingParams(bestSplit.Item1, bestSplit.Item2, dependentFeatureName));
    return new BinarySplittingResult(true, bestSplit.Item1, splittedData, bestSplit.Item2);
}
/// <summary>
/// Searches every column except <paramref name="dependentFeatureName"/> for the binary numeric split
/// point with the highest quality and splits <paramref name="baseData"/> on it.
/// </summary>
/// <remarks>
/// For each feature the rows are ordered by feature value; every boundary between two consecutive
/// distinct values is a candidate, with the split point placed at the midpoint of the two values.
/// Candidates already recorded in <paramref name="alreadyUsedAttributesInfo"/> are skipped.
/// </remarks>
/// <param name="baseData">Data frame to search; feature columns are read as numeric vectors.</param>
/// <param name="dependentFeatureName">Name of the dependent column; its values are cast to double.</param>
/// <param name="splitQualityChecker">Supplies the initial entropy and scores candidate splits.</param>
/// <param name="alreadyUsedAttributesInfo">Consulted to skip (feature, split point) pairs already used.</param>
/// <returns>The splitting result for the best candidate, or null when no candidate was found.</returns>
public ISplittingResult SelectBestSplit(
    IDataFrame baseData,
    string dependentFeatureName,
    INumericalSplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    var bestSplitQuality = double.NegativeInfinity;
    var initialEntropy = splitQualityChecker.GetInitialEntropy(baseData, dependentFeatureName);
    Tuple<string, double> bestSplit = null;

    var featureColumns = baseData.ColumnNames.Except(new[] { dependentFeatureName });
    foreach (var feature in featureColumns)
    {
        // Pair each feature value with the dependent value of the same row, then order by feature value.
        var dataOrderedByFeature = baseData.GetNumericColumnVector(feature)
            .Select((rowVal, idx) => new Tuple<double, double>(
                rowVal,
                (double)baseData[idx, dependentFeatureName].FeatureValue))
            .OrderBy(tpl => tpl.Item1)
            .ToList();

        // Fewer than two rows leave no boundary to split on (and an empty list has no first element).
        if (dataOrderedByFeature.Count < 2)
        {
            continue;
        }

        var dependentFeatureValuesOrdered = dataOrderedByFeature.Select(elem => elem.Item2).ToList();
        var previousFeatureValue = dataOrderedByFeature[0].Item1;

        // Start at 1 and include the last row so that the boundary between the two largest
        // values is also considered as a candidate.
        for (int i = 1; i < dataOrderedByFeature.Count; i++)
        {
            var currentFeatureValue = dataOrderedByFeature[i].Item1;
            if (currentFeatureValue != previousFeatureValue)
            {
                var splitPoint = (currentFeatureValue + previousFeatureValue) / 2.0;
                if (!alreadyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(feature, splitPoint))
                {
                    // Rows [0, i) fall below the split point, rows [i, Count) above it.
                    var dependentValsBelow = dependentFeatureValuesOrdered.Take(i).ToList();
                    var dependentValsAbove = dependentFeatureValuesOrdered.Skip(i).ToList();
                    var splitQuality = splitQualityChecker.CalculateSplitQuality(
                        initialEntropy,
                        baseData.RowCount,
                        new[] { dependentValsBelow, dependentValsAbove });
                    if (splitQuality > bestSplitQuality)
                    {
                        bestSplitQuality = splitQuality;
                        bestSplit = new Tuple<string, double>(feature, splitPoint);
                    }
                }
            }

            previousFeatureValue = currentFeatureValue;
        }
    }

    if (bestSplit == null)
    {
        return null;
    }

    var splittedData = binaryNumericDataSplitter.SplitData(
        baseData,
        new BinarySplittingParams(bestSplit.Item1, bestSplit.Item2, dependentFeatureName));
    return new BinarySplittingResult(true, bestSplit.Item1, splittedData, bestSplit.Item2);
}
/// <summary>
/// Builds the subtree for one split subset and registers it in <paramref name="children"/> under
/// the subset's split link.
/// </summary>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <param name="additionalParams">Builder parameters forwarded to <c>BuildDecisionNode</c>.</param>
/// <param name="splitData">Subset whose data frame becomes the child's data and whose link keys the child.</param>
/// <param name="children">Dictionary receiving the built child; an existing entry for the same link is left untouched.</param>
/// <param name="alreadyUsedAttributesInfo">Forwarded to <c>BuildDecisionNode</c>.</param>
/// <param name="treeDepth">Depth passed to <c>BuildDecisionNode</c> for the child.</param>
protected virtual void AddChildFromSplit(
    string dependentFeatureName,
    IDecisionTreeModelBuilderParams additionalParams,
    ISplittedData splitData,
    ConcurrentDictionary<IDecisionTreeLink, IDecisionTreeNode> children,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo,
    int treeDepth)
{
    var childFrame = splitData.SplittedDataFrame;
    var childNode = BuildDecisionNode(
        childFrame,
        dependentFeatureName,
        additionalParams,
        alreadyUsedAttributesInfo,
        treeDepth);

    // TryAdd result is intentionally not inspected, as in the original flow.
    children.TryAdd(splitData.SplitLink, childNode);
}