/// <summary>
/// Evaluates a split of <paramref name="dataToSplit"/> on the categorical feature
/// <paramref name="splittingFeatureName"/>.
/// </summary>
/// <param name="dataToSplit">Data frame handed to <c>CategoricalDataSplitter</c>.</param>
/// <param name="dependentFeatureName">Name of the dependent feature.</param>
/// <param name="splittingFeatureName">Name of the feature to split on.</param>
/// <param name="bestSplitQualitySoFar">Not used by this implementation.</param>
/// <param name="initialEntropy">Value forwarded to the split quality checker.</param>
/// <param name="splitQualityChecker">Computes the quality of the produced split.</param>
/// <param name="alreadyUsedAttributesInfo">Queried to see whether the feature was already used.</param>
/// <returns>
/// A tuple of (split data, splitting parameters, split quality). When the feature was already used,
/// or the split yields exactly one partition, the data list is empty and the quality is
/// <see cref="double.NegativeInfinity"/>.
/// </returns>
protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
    IDataFrame dataToSplit,
    string dependentFeatureName,
    string splittingFeatureName,
    double bestSplitQualitySoFar,
    double initialEntropy,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    // An attribute that was already used is reported as the worst possible candidate.
    if (alreadyUsedAttributesInfo.WasAttributeAlreadyUsed(splittingFeatureName))
    {
        return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
            new List<ISplittedData>(),
            new SplittingParams(splittingFeatureName, dependentFeatureName),
            double.NegativeInfinity);
    }

    var rowCount = dataToSplit.RowCount;
    var parameters = new SplittingParams(splittingFeatureName, dependentFeatureName);
    var partitions = CategoricalDataSplitter.SplitData(dataToSplit, parameters);

    if (partitions.Count != 1)
    {
        var quality = splitQualityChecker.CalculateSplitQuality(
            initialEntropy,
            rowCount,
            partitions,
            dependentFeatureName);

        return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(partitions, parameters, quality);
    }

    // Exactly one partition: the split separated nothing, so it is reported as unusable.
    return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
        new List<ISplittedData>(),
        parameters,
        double.NegativeInfinity);
}
/// <summary>
/// Scores a multi-value split on the categorical feature <paramref name="splittingFeatureName"/>.
/// </summary>
/// <param name="dataToSplit">Data frame passed to <c>CategoricalDataSplitter.SplitData</c>.</param>
/// <param name="dependentFeatureName">Name of the dependent feature.</param>
/// <param name="splittingFeatureName">Name of the feature to split on.</param>
/// <param name="bestSplitQualitySoFar">Ignored by this override.</param>
/// <param name="initialEntropy">Value passed through to <c>CalculateSplitQuality</c>.</param>
/// <param name="splitQualityChecker">Checker that rates the resulting partitions.</param>
/// <param name="alreadyUsedAttributesInfo">Tells whether the feature has been used before.</param>
/// <returns>
/// (partitions, parameters, quality). The partitions list is empty and the quality is
/// <see cref="double.NegativeInfinity"/> if the feature was already used or if the split
/// produced exactly one partition.
/// </returns>
protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
    IDataFrame dataToSplit,
    string dependentFeatureName,
    string splittingFeatureName,
    double bestSplitQualitySoFar,
    double initialEntropy,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    bool featureAlreadyUsed = alreadyUsedAttributesInfo.WasAttributeAlreadyUsed(splittingFeatureName);
    if (featureAlreadyUsed)
    {
        // Already-used features never win: empty data and the lowest possible quality.
        return new Tuple<IList<ISplittedData>, ISplittingParams, double>(
            new List<ISplittedData>(),
            new SplittingParams(splittingFeatureName, dependentFeatureName),
            double.NegativeInfinity);
    }

    var rowsBeforeSplit = dataToSplit.RowCount;
    var featureSplitParams = new SplittingParams(splittingFeatureName, dependentFeatureName);
    var groups = CategoricalDataSplitter.SplitData(dataToSplit, featureSplitParams);

    bool splitIsDegenerate = groups.Count == 1;
    if (splitIsDegenerate)
    {
        // A single group means nothing was separated.
        return new Tuple<IList<ISplittedData>, ISplittingParams, double>(
            new List<ISplittedData>(),
            featureSplitParams,
            double.NegativeInfinity);
    }

    var groupsQuality = splitQualityChecker.CalculateSplitQuality(
        initialEntropy,
        rowsBeforeSplit,
        groups,
        dependentFeatureName);

    return new Tuple<IList<ISplittedData>, ISplittingParams, double>(groups, featureSplitParams, groupsQuality);
}
/// <summary>
/// Creates the builder by forwarding every collaborator unchanged to the base class constructor.
/// </summary>
/// <param name="splitQualityChecker">Forwarded to the base constructor.</param>
/// <param name="bestSplitSelector">Forwarded to the base constructor.</param>
/// <param name="leafBuilder">Forwarded to the base constructor.</param>
/// <param name="statisticalSignificanceChecker">Optional; forwarded to the base constructor, <c>null</c> by default.</param>
public MultiSplitDecisionTreeModelBuilder(
    ISplitQualityChecker splitQualityChecker,
    IBestSplitSelector bestSplitSelector,
    ILeafBuilder leafBuilder,
    IStatisticalSignificanceChecker statisticalSignificanceChecker = null)
    : base(
        splitQualityChecker,
        bestSplitSelector,
        leafBuilder,
        statisticalSignificanceChecker)
{
    // Nothing to initialise here; all state is handled by the base constructor.
}
// TODO: AAA make it nicer - maybe encapsulate the Tuple in some DTO.
/// <summary>
/// Evaluates a split of <paramref name="dataToSplit"/> on a categorical feature.
/// Implemented by derived classes.
/// </summary>
/// <param name="dataToSplit">Data to be split.</param>
/// <param name="dependentFeatureName">Name of the dependent feature.</param>
/// <param name="splittingFeatureName">Name of the categorical feature to split on.</param>
/// <param name="bestSplitQualitySoFar">Best split quality known to the caller at the time of the call.</param>
/// <param name="initialEntropy">Initial entropy value supplied by the caller.</param>
/// <param name="splitQualityChecker">Split quality checker supplied by the caller.</param>
/// <param name="alredyUsedAttributesInfo">Information about attributes that were already used.</param>
/// <returns>A tuple of (split data, splitting parameters, split quality).</returns>
protected abstract Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
    IDataFrame dataToSplit,
    string dependentFeatureName,
    string splittingFeatureName,
    double bestSplitQualitySoFar,
    double initialEntropy,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alredyUsedAttributesInfo);
/// <summary>
/// Stores the supplied collaborators in the corresponding members of this builder.
/// </summary>
/// <param name="splitQualityChecker">Assigned to <c>SplitQualityChecker</c>.</param>
/// <param name="bestSplitSelector">Assigned to <c>BestSplitSelector</c>.</param>
/// <param name="leafBuilder">Assigned to <c>LeafBuilder</c>.</param>
/// <param name="statisticalSignificanceChecker">
/// Optional; assigned to <c>StatisticalSignificanceChecker</c>, <c>null</c> by default.
/// </param>
protected BaseDecisionTreeModelBuilder(
    ISplitQualityChecker splitQualityChecker,
    IBestSplitSelector bestSplitSelector,
    ILeafBuilder leafBuilder,
    IStatisticalSignificanceChecker statisticalSignificanceChecker = null)
{
    this.SplitQualityChecker = splitQualityChecker;
    this.BestSplitSelector = bestSplitSelector;
    this.LeafBuilder = leafBuilder;
    this.StatisticalSignificanceChecker = statisticalSignificanceChecker;
}
/// <summary>
/// Adapter overload: checks that <paramref name="splitQualityChecker"/> is an
/// <c>INumericalSplitQualityChecker</c> and forwards to the overload that accepts that type.
/// </summary>
/// <param name="baseData">Data forwarded to the typed overload.</param>
/// <param name="dependentFeatureName">Name of the dependent feature.</param>
/// <param name="splitQualityChecker">Checker that must implement <c>INumericalSplitQualityChecker</c>.</param>
/// <param name="alreadyUsedAttributesInfo">Forwarded to the typed overload.</param>
/// <returns>The result of the typed overload.</returns>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="splitQualityChecker"/> is <c>null</c> or is not an
/// <c>INumericalSplitQualityChecker</c>.
/// </exception>
public ISplittingResult SelectBestSplit(
    IDataFrame baseData,
    string dependentFeatureName,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    // 'as' yields null both for a null argument and for a checker of the wrong kind.
    var numericalChecker = splitQualityChecker as INumericalSplitQualityChecker;
    if (numericalChecker == null)
    {
        throw new ArgumentException("Invalid split quality checker for numerical outcome");
    }

    return SelectBestSplit(
        baseData,
        dependentFeatureName,
        numericalChecker,
        alreadyUsedAttributesInfo);
}
/// <summary>
/// Builds the splitters, numeric split finders, split quality checkers and best-split selectors
/// used by the tests in this class.
/// </summary>
public BestSplitSelectorsTests()
{
    // Numeric split finders and the multi-value splitter have no dependencies on each other.
    binaryNumericBestSplitPointSelector = new ClassBreakpointsNumericSplitFinder();
    dynamicProgrammingBestNumericSplitFinder = new DynamicProgrammingNumericSplitFinder();
    multiValueCategoricalDataSplitter = new MultiValueDiscreteDataSplitter();

    // The same entropy instance is passed for both constructor arguments of each calculator.
    ICategoricalImpurityMeasure<string> entropyMeasure = new ShannonEntropy<string>();
    categoricalBinarySplitQualityChecker = new InformationGainCalculator<string>(entropyMeasure, entropyMeasure);
    categoricalMultiValueSplitQualityChecker = new InformationGainCalculator<string>(entropyMeasure, entropyMeasure);

    // NOTE(review): binaryNumericDataSplitter is read below but never assigned in this constructor;
    // presumably it is initialised at its declaration - confirm.
    IBinaryDataSplitter discreteBinarySplitter = new BinaryDiscreteDataSplitter();
    binaryBestSplitSelector = new BinarySplitSelectorForCategoricalOutcome(
        discreteBinarySplitter,
        binaryNumericDataSplitter,
        binaryNumericBestSplitPointSelector);
    multiValueBestSplitSelector = new MultiValueSplitSelectorForCategoricalOutcome(
        multiValueCategoricalDataSplitter,
        binaryNumericDataSplitter,
        binaryNumericBestSplitPointSelector);
}
/// <summary>
/// Validates that <paramref name="splitQualityChecker"/> implements
/// <c>INumericalSplitQualityChecker</c>, then delegates to the overload taking that type.
/// </summary>
/// <param name="baseData">Data passed through to the typed overload.</param>
/// <param name="dependentFeatureName">Name of the dependent feature.</param>
/// <param name="splitQualityChecker">Must be an <c>INumericalSplitQualityChecker</c>.</param>
/// <param name="alreadyUsedAttributesInfo">Passed through to the typed overload.</param>
/// <returns>Whatever the typed overload returns.</returns>
/// <exception cref="ArgumentException">
/// Thrown when the checker is <c>null</c> or does not implement <c>INumericalSplitQualityChecker</c>.
/// </exception>
public ISplittingResult SelectBestSplit(
    IDataFrame baseData,
    string dependentFeatureName,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    var typedChecker = splitQualityChecker as INumericalSplitQualityChecker;

    // A null result covers both a null argument and a checker of another kind.
    if (typedChecker != null)
    {
        return SelectBestSplit(baseData, dependentFeatureName, typedChecker, alreadyUsedAttributesInfo);
    }

    throw new ArgumentException("Invalid split quality checker for numerical outcome");
}
/// <summary>
/// Test fixture setup: creates the data splitters, numeric split finders, quality checkers and
/// split selectors that the tests in this class use.
/// </summary>
public BestSplitSelectorsTests()
{
    ICategoricalImpurityMeasure<string> impurity = new ShannonEntropy<string>();
    IBinaryDataSplitter binarySplitter = new BinaryDiscreteDataSplitter();

    // Finder used by both selectors created below.
    binaryNumericBestSplitPointSelector = new ClassBreakpointsNumericSplitFinder();

    // NOTE(review): binaryNumericDataSplitter is not assigned anywhere in this constructor;
    // presumably it has a field initialiser - confirm.
    binaryBestSplitSelector = new BinarySplitSelectorForCategoricalOutcome(
        binarySplitter,
        binaryNumericDataSplitter,
        binaryNumericBestSplitPointSelector);

    // Both checkers are information-gain calculators over the same impurity measure.
    categoricalBinarySplitQualityChecker = new InformationGainCalculator<string>(impurity, impurity);
    categoricalMultiValueSplitQualityChecker = new InformationGainCalculator<string>(impurity, impurity);

    multiValueCategoricalDataSplitter = new MultiValueDiscreteDataSplitter();
    multiValueBestSplitSelector = new MultiValueSplitSelectorForCategoricalOutcome(
        multiValueCategoricalDataSplitter,
        binaryNumericDataSplitter,
        binaryNumericBestSplitPointSelector);

    dynamicProgrammingBestNumericSplitFinder = new DynamicProgrammingNumericSplitFinder();
}
/// <summary>
/// Tries every column of <paramref name="baseData"/> except the dependent feature as a split
/// candidate and returns the split with the strictly highest quality.
/// </summary>
/// <param name="baseData">Data whose columns are evaluated.</param>
/// <param name="dependentFeatureName">Name of the dependent feature; excluded from the candidates.</param>
/// <param name="splitQualityChecker">Supplies the initial entropy and rates the candidate splits.</param>
/// <param name="alreadyUsedAttributesInfo">Passed to the categorical evaluation only.</param>
/// <returns>
/// The best split found, or <c>null</c> when no candidate has a quality greater than negative infinity.
/// </returns>
public ISplittingResult SelectBestSplit(
    IDataFrame baseData,
    string dependentFeatureName,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    ISplittingResult winner = null;
    double winnerQuality = double.NegativeInfinity;
    double initialEntropy = splitQualityChecker.GetInitialEntropy(baseData, dependentFeatureName);
    var candidateColumns = baseData.ColumnNames.Except(new[] { dependentFeatureName });

    foreach (var column in candidateColumns)
    {
        if (!baseData.GetColumnType(column).TypeIsNumeric())
        {
            // Categorical column: delegate to the subclass-specific evaluation.
            var categoricalCandidate = EvaluateCategoricalSplit(
                baseData,
                dependentFeatureName,
                column,
                winnerQuality,
                initialEntropy,
                splitQualityChecker,
                alreadyUsedAttributesInfo);

            if (categoricalCandidate.Item3 > winnerQuality)
            {
                winner = BuildBestSplitObject(categoricalCandidate.Item2, categoricalCandidate.Item1);
                winnerQuality = categoricalCandidate.Item3;
            }

            continue;
        }

        // TODO: add checking for the already used attributes
        var numericCandidate = BinaryNumericBestSplitingPointSelector.FindBestSplitPoint(
            baseData,
            dependentFeatureName,
            column,
            splitQualityChecker,
            BinaryNumericDataSplitter,
            initialEntropy);

        if (numericCandidate.Item2 > winnerQuality)
        {
            winnerQuality = numericCandidate.Item2;
            winner = numericCandidate.Item1;
        }
    }

    return winner;
}
/// <summary>
/// Evaluates binary splits of <paramref name="dataToSplit"/> on each distinct value of
/// <paramref name="splittingFeatureName"/> and keeps the one with the strictly highest quality.
/// </summary>
/// <param name="dataToSplit">Data to split.</param>
/// <param name="dependentFeatureName">Name of the dependent feature.</param>
/// <param name="splittingFeatureName">Name of the feature whose values are tried.</param>
/// <param name="bestSplitQualitySoFar">Not used by this implementation.</param>
/// <param name="initialEntropy">Value forwarded to the split quality checker.</param>
/// <param name="splitQualityChecker">Rates each candidate split.</param>
/// <param name="alredyUsedAttributesInfo">Used to skip feature/value pairs that were already used.</param>
/// <returns>
/// (split data, splitting parameters, quality) of the best candidate. If no candidate was rated,
/// the first two items are <c>null</c> and the quality is <see cref="double.NegativeInfinity"/>.
/// If any candidate split yields exactly one partition, an empty list with that candidate's
/// parameters and negative-infinity quality is returned immediately.
/// </returns>
protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
    IDataFrame dataToSplit,
    string dependentFeatureName,
    string splittingFeatureName,
    double bestSplitQualitySoFar,
    double initialEntropy,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alredyUsedAttributesInfo)
{
    var rowCount = dataToSplit.RowCount;
    var candidateValues = dataToSplit.GetColumnVector(splittingFeatureName).Distinct();

    IList<ISplittedData> bestPartitions = null;
    IBinarySplittingParams bestParams = null;
    double bestQuality = double.NegativeInfinity;

    foreach (var candidate in candidateValues)
    {
        if (alredyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(splittingFeatureName, candidate))
        {
            continue;
        }

        var candidateParams = new BinarySplittingParams(splittingFeatureName, candidate, dependentFeatureName);
        var partitions = CategoricalDataSplitter.SplitData(dataToSplit, candidateParams);

        if (partitions.Count == 1)
        {
            // A single partition ends the evaluation of the whole feature;
            // remaining values are not examined.
            return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
                new List<ISplittedData>(),
                candidateParams,
                double.NegativeInfinity);
        }

        var quality = splitQualityChecker.CalculateSplitQuality(
            initialEntropy,
            rowCount,
            partitions,
            dependentFeatureName);

        if (quality > bestQuality)
        {
            bestQuality = quality;
            bestPartitions = partitions;
            bestParams = candidateParams;
        }
    }

    return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
        bestPartitions,
        bestParams,
        bestQuality);
}
/// <summary>
/// Evaluates each column of <paramref name="baseData"/> other than the dependent feature and
/// returns the candidate split whose quality is strictly the highest.
/// </summary>
/// <param name="baseData">Data whose columns are candidate split attributes.</param>
/// <param name="dependentFeatureName">Name of the dependent feature; never used as a split attribute.</param>
/// <param name="splitQualityChecker">Provides the initial entropy and the split quality measure.</param>
/// <param name="alreadyUsedAttributesInfo">Forwarded to the categorical evaluation only.</param>
/// <returns>The best split, or <c>null</c> if no candidate scored above negative infinity.</returns>
public ISplittingResult SelectBestSplit(
    IDataFrame baseData,
    string dependentFeatureName,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    ISplittingResult chosenSplit = null;
    double chosenQuality = double.NegativeInfinity;
    double initialEntropy = splitQualityChecker.GetInitialEntropy(baseData, dependentFeatureName);

    foreach (var attribute in baseData.ColumnNames.Except(new[] { dependentFeatureName }))
    {
        bool isNumericAttribute = baseData.GetColumnType(attribute).TypeIsNumeric();

        if (isNumericAttribute)
        {
            // TODO: add checking for the already used attributes
            var numericResult = BinaryNumericBestSplitingPointSelector.FindBestSplitPoint(
                baseData,
                dependentFeatureName,
                attribute,
                splitQualityChecker,
                BinaryNumericDataSplitter,
                initialEntropy);

            if (numericResult.Item2 > chosenQuality)
            {
                chosenQuality = numericResult.Item2;
                chosenSplit = numericResult.Item1;
            }
        }
        else
        {
            var categoricalResult = EvaluateCategoricalSplit(
                baseData,
                dependentFeatureName,
                attribute,
                chosenQuality,
                initialEntropy,
                splitQualityChecker,
                alreadyUsedAttributesInfo);

            if (categoricalResult.Item3 > chosenQuality)
            {
                chosenSplit = BuildBestSplitObject(categoricalResult.Item2, categoricalResult.Item1);
                chosenQuality = categoricalResult.Item3;
            }
        }
    }

    return chosenSplit;
}
/// <summary>
/// Adapter overload: requires <paramref name="splitQualityChecker"/> to implement
/// <c>ICategoricalSplitQualityChecker</c> and forwards to the overload that accepts that type.
/// </summary>
/// <param name="baseData">Data forwarded to the typed overload.</param>
/// <param name="dependentFeatureName">Name of the dependent feature.</param>
/// <param name="numericFeatureToProcess">Name of the numeric feature to find a split point for.</param>
/// <param name="splitQualityChecker">Checker that must implement <c>ICategoricalSplitQualityChecker</c>.</param>
/// <param name="binaryNumericDataSplitter">Forwarded to the typed overload.</param>
/// <param name="initialEntropy">Forwarded to the typed overload.</param>
/// <returns>The result of the typed overload.</returns>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="splitQualityChecker"/> is <c>null</c> or does not implement
/// <c>ICategoricalSplitQualityChecker</c>.
/// </exception>
public Tuple<ISplittingResult, double> FindBestSplitPoint(
    IDataFrame baseData,
    string dependentFeatureName,
    string numericFeatureToProcess,
    ISplitQualityChecker splitQualityChecker,
    IBinaryNumericDataSplitter binaryNumericDataSplitter,
    double initialEntropy)
{
    // Previously the unchecked 'as' result was forwarded directly, so a checker of the wrong
    // kind became a silent null. Fail fast with a descriptive error instead.
    var categoricalSplitQualityChecker = splitQualityChecker as ICategoricalSplitQualityChecker;
    if (categoricalSplitQualityChecker == null)
    {
        throw new ArgumentException(
            "Split quality checker must implement ICategoricalSplitQualityChecker",
            "splitQualityChecker");
    }

    return FindBestSplitPoint(
        baseData,
        dependentFeatureName,
        numericFeatureToProcess,
        categoricalSplitQualityChecker,
        binaryNumericDataSplitter,
        initialEntropy);
}
/// <summary>
/// Validates that <paramref name="splitQualityChecker"/> is an
/// <c>ICategoricalSplitQualityChecker</c> and delegates to the overload taking that type.
/// </summary>
/// <param name="baseData">Data passed through to the typed overload.</param>
/// <param name="dependentFeatureName">Name of the dependent feature.</param>
/// <param name="numericFeatureToProcess">Name of the numeric feature being processed.</param>
/// <param name="splitQualityChecker">Must implement <c>ICategoricalSplitQualityChecker</c>.</param>
/// <param name="binaryNumericDataSplitter">Passed through to the typed overload.</param>
/// <param name="initialEntropy">Passed through to the typed overload.</param>
/// <returns>Whatever the typed overload returns.</returns>
/// <exception cref="ArgumentException">
/// Thrown when the checker is <c>null</c> or is not an <c>ICategoricalSplitQualityChecker</c>.
/// </exception>
public Tuple<ISplittingResult, double> FindBestSplitPoint(
    IDataFrame baseData,
    string dependentFeatureName,
    string numericFeatureToProcess,
    ISplitQualityChecker splitQualityChecker,
    IBinaryNumericDataSplitter binaryNumericDataSplitter,
    double initialEntropy)
{
    var categoricalSplitQualityChecker = splitQualityChecker as ICategoricalSplitQualityChecker;

    // The 'as' cast yields null for a null or mismatched checker; the original forwarded that
    // null unchecked. Reject it here so the caller gets a clear argument error.
    if (categoricalSplitQualityChecker == null)
    {
        throw new ArgumentException(
            "Split quality checker must implement ICategoricalSplitQualityChecker",
            "splitQualityChecker");
    }

    return FindBestSplitPoint(
        baseData,
        dependentFeatureName,
        numericFeatureToProcess,
        categoricalSplitQualityChecker,
        binaryNumericDataSplitter,
        initialEntropy);
}
/// <summary>
/// Tries a binary split for every distinct value of <paramref name="splittingFeatureName"/>
/// that has not been used yet and reports the candidate with the strictly highest quality.
/// </summary>
/// <param name="dataToSplit">Data to split.</param>
/// <param name="dependentFeatureName">Name of the dependent feature.</param>
/// <param name="splittingFeatureName">Name of the feature whose distinct values are tried.</param>
/// <param name="bestSplitQualitySoFar">Ignored by this override.</param>
/// <param name="initialEntropy">Value passed through to <c>CalculateSplitQuality</c>.</param>
/// <param name="splitQualityChecker">Rates each candidate split.</param>
/// <param name="alredyUsedAttributesInfo">Tells which feature/value pairs were already used.</param>
/// <returns>
/// (split data, splitting parameters, quality) for the best candidate; data and parameters are
/// <c>null</c> with negative-infinity quality when nothing was rated. A candidate whose split
/// yields exactly one partition causes an immediate return of an empty list, that candidate's
/// parameters and negative-infinity quality.
/// </returns>
protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
    IDataFrame dataToSplit,
    string dependentFeatureName,
    string splittingFeatureName,
    double bestSplitQualitySoFar,
    double initialEntropy,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alredyUsedAttributesInfo)
{
    var totalRows = dataToSplit.RowCount;
    var distinctValues = dataToSplit.GetColumnVector(splittingFeatureName).Distinct();

    double topQuality = double.NegativeInfinity;
    IBinarySplittingParams topParams = null;
    IList<ISplittedData> topData = null;

    foreach (var value in distinctValues)
    {
        bool valueAlreadyUsed =
            alredyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(splittingFeatureName, value);
        if (valueAlreadyUsed)
        {
            continue;
        }

        var valueParams = new BinarySplittingParams(splittingFeatureName, value, dependentFeatureName);
        var valueData = CategoricalDataSplitter.SplitData(dataToSplit, valueParams);

        // One partition only: give up on this feature without examining further values.
        if (valueData.Count == 1)
        {
            return new Tuple<IList<ISplittedData>, ISplittingParams, double>(
                new List<ISplittedData>(),
                valueParams,
                double.NegativeInfinity);
        }

        var valueQuality = splitQualityChecker.CalculateSplitQuality(
            initialEntropy,
            totalRows,
            valueData,
            dependentFeatureName);

        if (valueQuality > topQuality)
        {
            topQuality = valueQuality;
            topData = valueData;
            topParams = valueParams;
        }
    }

    return new Tuple<IList<ISplittedData>, ISplittingParams, double>(topData, topParams, topQuality);
}
/// <summary>
/// Creates a decision tree model builder, substituting a default for every collaborator that
/// was not supplied.
/// </summary>
/// <param name="binary">
/// When <c>true</c> a <c>BinaryDecisionTreeModelBuilder</c> is created, otherwise a
/// <c>MultiSplitDecisionTreeModelBuilder</c>.
/// </param>
/// <param name="splitQualityChecker">Optional; defaults to an <c>InformationGainRatioCalculator</c> built from <c>shannonEntropy</c>.</param>
/// <param name="bestSplitSelector">
/// Optional. In binary mode a selector that is not an <c>IBinaryBestSplitSelector</c> is replaced
/// by the default binary selector.
/// </param>
/// <param name="leafBuilder">Optional; defaults to a <c>CategoricalDecisionTreeLeafBuilder</c>.</param>
/// <param name="statisticalSignificanceChecker">Optional; passed through as is.</param>
/// <returns>The configured model builder.</returns>
private IDecisionTreeModelBuilder BuildCustomModelBuilder(
    bool binary = false,
    ISplitQualityChecker splitQualityChecker = null,
    IBestSplitSelector bestSplitSelector = null,
    ILeafBuilder leafBuilder = null,
    IStatisticalSignificanceChecker statisticalSignificanceChecker = null)
{
    // The quality checker default is the same for both builder kinds.
    ISplitQualityChecker qualityChecker = splitQualityChecker;
    if (qualityChecker == null)
    {
        qualityChecker = new InformationGainRatioCalculator<string>(
            shannonEntropy,
            shannonEntropy as ICategoricalImpurityMeasure<string>);
    }

    if (!binary)
    {
        IBestSplitSelector multiSelector = bestSplitSelector;
        if (multiSelector == null)
        {
            multiSelector = new MultiValueSplitSelectorForCategoricalOutcome(
                new MultiValueDiscreteDataSplitter(),
                new BinaryNumericDataSplitter(),
                new DynamicProgrammingNumericSplitFinder());
        }

        return new MultiSplitDecisionTreeModelBuilder(
            qualityChecker,
            multiSelector,
            leafBuilder ?? new CategoricalDecisionTreeLeafBuilder(),
            statisticalSignificanceChecker);
    }

    // 'as' also yields null for a non-binary selector, which then falls back to the default.
    IBinaryBestSplitSelector binarySelector = bestSplitSelector as IBinaryBestSplitSelector;
    if (binarySelector == null)
    {
        binarySelector = new BinarySplitSelectorForCategoricalOutcome(
            new BinaryDiscreteDataSplitter(),
            new BinaryNumericDataSplitter(),
            new ClassBreakpointsNumericSplitFinder());
    }

    return new BinaryDecisionTreeModelBuilder(
        qualityChecker,
        binarySelector,
        leafBuilder ?? new CategoricalDecisionTreeLeafBuilder(),
        statisticalSignificanceChecker);
}
/// <summary>
/// Test helper that builds either a binary or a multi-split decision tree model builder,
/// filling in defaults for any collaborator left as <c>null</c>.
/// </summary>
/// <param name="binary">Selects <c>BinaryDecisionTreeModelBuilder</c> when <c>true</c>, <c>MultiSplitDecisionTreeModelBuilder</c> otherwise.</param>
/// <param name="splitQualityChecker">Optional; the default is an <c>InformationGainRatioCalculator</c> over <c>shannonEntropy</c>.</param>
/// <param name="bestSplitSelector">
/// Optional. For the binary builder, a value that does not implement
/// <c>IBinaryBestSplitSelector</c> is treated like <c>null</c>.
/// </param>
/// <param name="leafBuilder">Optional; the default is a <c>CategoricalDecisionTreeLeafBuilder</c>.</param>
/// <param name="statisticalSignificanceChecker">Optional; forwarded unchanged.</param>
/// <returns>The created model builder.</returns>
private IDecisionTreeModelBuilder BuildCustomModelBuilder(
    bool binary = false,
    ISplitQualityChecker splitQualityChecker = null,
    IBestSplitSelector bestSplitSelector = null,
    ILeafBuilder leafBuilder = null,
    IStatisticalSignificanceChecker statisticalSignificanceChecker = null)
{
    ISplitQualityChecker effectiveChecker = splitQualityChecker != null
        ? splitQualityChecker
        : new InformationGainRatioCalculator<string>(
            shannonEntropy,
            shannonEntropy as ICategoricalImpurityMeasure<string>);

    if (binary)
    {
        // A selector of the wrong kind becomes null here and is replaced by the default.
        IBinaryBestSplitSelector effectiveBinarySelector = bestSplitSelector as IBinaryBestSplitSelector;
        if (effectiveBinarySelector == null)
        {
            effectiveBinarySelector = new BinarySplitSelectorForCategoricalOutcome(
                new BinaryDiscreteDataSplitter(),
                new BinaryNumericDataSplitter(),
                new ClassBreakpointsNumericSplitFinder());
        }

        ILeafBuilder binaryLeafBuilder = leafBuilder ?? new CategoricalDecisionTreeLeafBuilder();

        return new BinaryDecisionTreeModelBuilder(
            effectiveChecker,
            effectiveBinarySelector,
            binaryLeafBuilder,
            statisticalSignificanceChecker);
    }

    IBestSplitSelector effectiveSelector = bestSplitSelector;
    if (effectiveSelector == null)
    {
        effectiveSelector = new MultiValueSplitSelectorForCategoricalOutcome(
            new MultiValueDiscreteDataSplitter(),
            new BinaryNumericDataSplitter(),
            new DynamicProgrammingNumericSplitFinder());
    }

    ILeafBuilder multiLeafBuilder = leafBuilder ?? new CategoricalDecisionTreeLeafBuilder();

    return new MultiSplitDecisionTreeModelBuilder(
        effectiveChecker,
        effectiveSelector,
        multiLeafBuilder,
        statisticalSignificanceChecker);
}
// TODO: AAA make it nicer - maybe encapsulate the Tuple in some DTO.
/// <summary>
/// Evaluates splitting <paramref name="dataToSplit"/> on a categorical feature.
/// Derived classes provide the implementation.
/// </summary>
/// <param name="dataToSplit">Data to be split.</param>
/// <param name="dependentFeatureName">Name of the dependent feature.</param>
/// <param name="splittingFeatureName">Name of the categorical feature to split on.</param>
/// <param name="bestSplitQualitySoFar">Best split quality the caller has seen so far.</param>
/// <param name="initialEntropy">Initial entropy value provided by the caller.</param>
/// <param name="splitQualityChecker">Split quality checker provided by the caller.</param>
/// <param name="alredyUsedAttributesInfo">Information about attributes that were already used.</param>
/// <returns>A tuple of (split data, splitting parameters, split quality).</returns>
protected abstract Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
    IDataFrame dataToSplit,
    string dependentFeatureName,
    string splittingFeatureName,
    double bestSplitQualitySoFar,
    double initialEntropy,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alredyUsedAttributesInfo);