/// <summary>
        /// Evaluates one multi-way split of <paramref name="dataToSplit"/> on the categorical feature
        /// <paramref name="splittingFeatureName"/>.
        /// </summary>
        /// <param name="dataToSplit">Data frame to partition.</param>
        /// <param name="dependentFeatureName">Name of the outcome feature.</param>
        /// <param name="splittingFeatureName">Name of the feature to split on.</param>
        /// <param name="bestSplitQualitySoFar">Not read by this implementation.</param>
        /// <param name="initialEntropy">Value forwarded to the split quality checker.</param>
        /// <param name="splitQualityChecker">Computes the quality of the produced partitions.</param>
        /// <param name="alreadyUsedAttributesInfo">Tells whether the feature was already used.</param>
        /// <returns>
        /// A tuple of (partitions, splitting parameters, quality). If the feature was already used, or the
        /// splitter produced exactly one partition, the partition list is empty and the quality is
        /// negative infinity.
        /// </returns>
        protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
            IDataFrame dataToSplit,
            string dependentFeatureName,
            string splittingFeatureName,
            double bestSplitQualitySoFar,
            double initialEntropy,
            ISplitQualityChecker splitQualityChecker,
            IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
        {
            if (alreadyUsedAttributesInfo.WasAttributeAlreadyUsed(splittingFeatureName))
            {
                // An already used attribute is reported with the lowest possible quality.
                return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
                    new List<ISplittedData>(),
                    new SplittingParams(splittingFeatureName, dependentFeatureName),
                    double.NegativeInfinity);
            }

            var rowCount = dataToSplit.RowCount;
            var parameters = new SplittingParams(splittingFeatureName, dependentFeatureName);
            var partitions = CategoricalDataSplitter.SplitData(dataToSplit, parameters);

            if (partitions.Count != 1)
            {
                var quality = splitQualityChecker.CalculateSplitQuality(
                    initialEntropy,
                    rowCount,
                    partitions,
                    dependentFeatureName);
                return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(partitions, parameters, quality);
            }

            // Exactly one partition: the split separates nothing, so it is rejected.
            return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
                new List<ISplittedData>(),
                parameters,
                double.NegativeInfinity);
        }
示例#2
0
        /// <summary>
        /// Evaluates one multi-way split of <paramref name="dataToSplit"/> on the categorical feature
        /// <paramref name="splittingFeatureName"/>.
        /// </summary>
        /// <param name="dataToSplit">Data frame to partition.</param>
        /// <param name="dependentFeatureName">Name of the outcome feature.</param>
        /// <param name="splittingFeatureName">Name of the feature to split on.</param>
        /// <param name="bestSplitQualitySoFar">Not read by this implementation.</param>
        /// <param name="initialEntropy">Value forwarded to the split quality checker.</param>
        /// <param name="splitQualityChecker">Computes the quality of the produced partitions.</param>
        /// <param name="alreadyUsedAttributesInfo">Tells whether the feature was already used.</param>
        /// <returns>
        /// A tuple of (partitions, splitting parameters, quality). If the feature was already used, or the
        /// splitter produced exactly one partition, the partition list is empty and the quality is
        /// negative infinity.
        /// </returns>
        protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
            IDataFrame dataToSplit,
            string dependentFeatureName,
            string splittingFeatureName,
            double bestSplitQualitySoFar,
            double initialEntropy,
            ISplitQualityChecker splitQualityChecker,
            IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
        {
            if (alreadyUsedAttributesInfo.WasAttributeAlreadyUsed(splittingFeatureName))
            {
                // An already used attribute is reported with the lowest possible quality.
                return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
                    new List<ISplittedData>(),
                    new SplittingParams(splittingFeatureName, dependentFeatureName),
                    double.NegativeInfinity);
            }

            var rowCount = dataToSplit.RowCount;
            var parameters = new SplittingParams(splittingFeatureName, dependentFeatureName);
            var partitions = CategoricalDataSplitter.SplitData(dataToSplit, parameters);

            if (partitions.Count != 1)
            {
                var quality = splitQualityChecker.CalculateSplitQuality(
                    initialEntropy,
                    rowCount,
                    partitions,
                    dependentFeatureName);
                return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(partitions, parameters, quality);
            }

            // Exactly one partition: the split separates nothing, so it is rejected.
            return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
                new List<ISplittedData>(),
                parameters,
                double.NegativeInfinity);
        }
 /// <summary>
 /// Creates a multi-split decision tree model builder. The constructor has no logic of its own:
 /// every argument is forwarded unchanged to the base class constructor.
 /// </summary>
 /// <param name="splitQualityChecker">Split quality checker passed to the base constructor.</param>
 /// <param name="bestSplitSelector">Best split selector passed to the base constructor.</param>
 /// <param name="leafBuilder">Leaf builder passed to the base constructor.</param>
 /// <param name="statisticalSignificanceChecker">Optional significance checker; defaults to <c>null</c>.</param>
 public MultiSplitDecisionTreeModelBuilder(
     ISplitQualityChecker splitQualityChecker,
     IBestSplitSelector bestSplitSelector,
     ILeafBuilder leafBuilder,
     IStatisticalSignificanceChecker statisticalSignificanceChecker = null)
     : base(splitQualityChecker, bestSplitSelector, leafBuilder, statisticalSignificanceChecker)
 {
 }
 /// <summary>
 /// Creates a multi-split decision tree model builder. The constructor has no logic of its own:
 /// every argument is forwarded unchanged to the base class constructor.
 /// </summary>
 /// <param name="splitQualityChecker">Split quality checker passed to the base constructor.</param>
 /// <param name="bestSplitSelector">Best split selector passed to the base constructor.</param>
 /// <param name="leafBuilder">Leaf builder passed to the base constructor.</param>
 /// <param name="statisticalSignificanceChecker">Optional significance checker; defaults to <c>null</c>.</param>
 public MultiSplitDecisionTreeModelBuilder(
     ISplitQualityChecker splitQualityChecker,
     IBestSplitSelector bestSplitSelector,
     ILeafBuilder leafBuilder,
     IStatisticalSignificanceChecker statisticalSignificanceChecker = null)
     : base(splitQualityChecker, bestSplitSelector, leafBuilder, statisticalSignificanceChecker)
 {
 }
示例#5
0
 //TODO: AAA make it nicer - maybe encapsulate Tuple in some dto
 /// <summary>
 /// Evaluates splitting <paramref name="dataToSplit"/> on the categorical feature
 /// <paramref name="splittingFeatureName"/>. Derived classes supply the implementation.
 /// </summary>
 /// <param name="dataToSplit">Data frame to be split.</param>
 /// <param name="dependentFeatureName">Name of the outcome feature.</param>
 /// <param name="splittingFeatureName">Name of the categorical feature to split on.</param>
 /// <param name="bestSplitQualitySoFar">Best quality found by the caller so far.</param>
 /// <param name="initialEntropy">Entropy value computed by the caller before splitting.</param>
 /// <param name="splitQualityChecker">Checker used to score candidate splits.</param>
 /// <param name="alredyUsedAttributesInfo">Information about attributes that were already used.</param>
 /// <returns>
 /// A tuple whose items are, in order: the split data parts, the splitting parameters and a quality score.
 /// NOTE(review): implementations appear to use negative infinity as the "no usable split" score, and may
 /// leave the first two items empty or null in that case - confirm against the overrides.
 /// </returns>
 protected abstract Tuple <IList <ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
     IDataFrame dataToSplit,
     string dependentFeatureName,
     string splittingFeatureName,
     double bestSplitQualitySoFar,
     double initialEntropy,
     ISplitQualityChecker splitQualityChecker,
     IAlredyUsedAttributesInfo alredyUsedAttributesInfo);
 /// <summary>
 /// Stores the collaborators used by the decision tree model builder.
 /// No validation is performed; arguments are stored exactly as given.
 /// </summary>
 /// <param name="splitQualityChecker">Stored in <c>SplitQualityChecker</c>.</param>
 /// <param name="bestSplitSelector">Stored in <c>BestSplitSelector</c>.</param>
 /// <param name="leafBuilder">Stored in <c>LeafBuilder</c>.</param>
 /// <param name="statisticalSignificanceChecker">
 /// Stored in <c>StatisticalSignificanceChecker</c>; optional, defaults to <c>null</c>.
 /// </param>
 protected BaseDecisionTreeModelBuilder(
     ISplitQualityChecker splitQualityChecker,
     IBestSplitSelector bestSplitSelector,
     ILeafBuilder leafBuilder,
     IStatisticalSignificanceChecker statisticalSignificanceChecker = null)
 {
     SplitQualityChecker = splitQualityChecker;
     BestSplitSelector = bestSplitSelector;
     LeafBuilder = leafBuilder;
     StatisticalSignificanceChecker = statisticalSignificanceChecker;
 }
 /// <summary>
 /// Stores the collaborators used by the decision tree model builder.
 /// No validation is performed; arguments are stored exactly as given.
 /// </summary>
 /// <param name="splitQualityChecker">Stored in <c>SplitQualityChecker</c>.</param>
 /// <param name="bestSplitSelector">Stored in <c>BestSplitSelector</c>.</param>
 /// <param name="leafBuilder">Stored in <c>LeafBuilder</c>.</param>
 /// <param name="statisticalSignificanceChecker">
 /// Stored in <c>StatisticalSignificanceChecker</c>; optional, defaults to <c>null</c>.
 /// </param>
 protected BaseDecisionTreeModelBuilder(
     ISplitQualityChecker splitQualityChecker,
     IBestSplitSelector bestSplitSelector,
     ILeafBuilder leafBuilder,
     IStatisticalSignificanceChecker statisticalSignificanceChecker = null)
 {
     SplitQualityChecker = splitQualityChecker;
     BestSplitSelector = bestSplitSelector;
     LeafBuilder = leafBuilder;
     StatisticalSignificanceChecker = statisticalSignificanceChecker;
 }
 /// <summary>
 /// Entry point taking a general split quality checker: verifies that it is a numerical checker and
 /// delegates to the overload that accepts <c>INumericalSplitQualityChecker</c>.
 /// </summary>
 /// <param name="baseData">Data frame to find the split for.</param>
 /// <param name="dependentFeatureName">Name of the outcome feature.</param>
 /// <param name="splitQualityChecker">Must implement <c>INumericalSplitQualityChecker</c>.</param>
 /// <param name="alreadyUsedAttributesInfo">Information about attributes that were already used.</param>
 /// <returns>The result of the strongly typed overload.</returns>
 /// <exception cref="ArgumentException">
 /// <paramref name="splitQualityChecker"/> is null or is not a numerical split quality checker.
 /// </exception>
 public ISplittingResult SelectBestSplit(
     IDataFrame baseData,
     string dependentFeatureName,
     ISplitQualityChecker splitQualityChecker,
     IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
 {
     // A single conversion covers both the type test and the cast; null means "wrong type or null input".
     var numericalChecker = splitQualityChecker as INumericalSplitQualityChecker;
     if (numericalChecker == null)
     {
         throw new ArgumentException("Invalid split quality checker for numerical outcome");
     }

     return SelectBestSplit(baseData, dependentFeatureName, numericalChecker, alreadyUsedAttributesInfo);
 }
 /// <summary>
 /// Builds the split selectors, data splitters and split quality checkers shared by the tests.
 /// </summary>
 public BestSplitSelectorsTests()
 {
     ICategoricalImpurityMeasure<string> entropyMeasure = new ShannonEntropy<string>();
     IBinaryDataSplitter discreteBinarySplitter = new BinaryDiscreteDataSplitter();

     // NOTE(review): binaryNumericDataSplitter is read below but not assigned in this constructor;
     // presumably it is initialised at its declaration - confirm.
     binaryNumericBestSplitPointSelector = new ClassBreakpointsNumericSplitFinder();
     binaryBestSplitSelector = new BinarySplitSelectorForCategoricalOutcome(
         discreteBinarySplitter,
         binaryNumericDataSplitter,
         binaryNumericBestSplitPointSelector);

     // Both checkers receive the same impurity measure for both constructor arguments.
     categoricalBinarySplitQualityChecker = new InformationGainCalculator<string>(entropyMeasure, entropyMeasure);
     categoricalMultiValueSplitQualityChecker = new InformationGainCalculator<string>(entropyMeasure, entropyMeasure);

     multiValueCategoricalDataSplitter = new MultiValueDiscreteDataSplitter();
     multiValueBestSplitSelector = new MultiValueSplitSelectorForCategoricalOutcome(
         multiValueCategoricalDataSplitter,
         binaryNumericDataSplitter,
         binaryNumericBestSplitPointSelector);

     dynamicProgrammingBestNumericSplitFinder = new DynamicProgrammingNumericSplitFinder();
 }
 /// <summary>
 /// Entry point taking a general split quality checker: verifies that it is a numerical checker and
 /// delegates to the overload that accepts <c>INumericalSplitQualityChecker</c>.
 /// </summary>
 /// <param name="baseData">Data frame to find the split for.</param>
 /// <param name="dependentFeatureName">Name of the outcome feature.</param>
 /// <param name="splitQualityChecker">Must implement <c>INumericalSplitQualityChecker</c>.</param>
 /// <param name="alreadyUsedAttributesInfo">Information about attributes that were already used.</param>
 /// <returns>The result of the strongly typed overload.</returns>
 /// <exception cref="ArgumentException">
 /// <paramref name="splitQualityChecker"/> is null or is not a numerical split quality checker.
 /// </exception>
 public ISplittingResult SelectBestSplit(
     IDataFrame baseData,
     string dependentFeatureName,
     ISplitQualityChecker splitQualityChecker,
     IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
 {
     // A single conversion covers both the type test and the cast; null means "wrong type or null input".
     var numericalChecker = splitQualityChecker as INumericalSplitQualityChecker;
     if (numericalChecker == null)
     {
         throw new ArgumentException("Invalid split quality checker for numerical outcome");
     }

     return SelectBestSplit(baseData, dependentFeatureName, numericalChecker, alreadyUsedAttributesInfo);
 }
示例#11
0
        /// <summary>
        /// Builds the split selectors, data splitters and split quality checkers shared by the tests.
        /// </summary>
        public BestSplitSelectorsTests()
        {
            ICategoricalImpurityMeasure<string> entropyMeasure = new ShannonEntropy<string>();
            IBinaryDataSplitter discreteBinarySplitter = new BinaryDiscreteDataSplitter();

            // NOTE(review): binaryNumericDataSplitter is read below but not assigned in this constructor;
            // presumably it is initialised at its declaration - confirm.
            binaryNumericBestSplitPointSelector = new ClassBreakpointsNumericSplitFinder();
            binaryBestSplitSelector = new BinarySplitSelectorForCategoricalOutcome(
                discreteBinarySplitter,
                binaryNumericDataSplitter,
                binaryNumericBestSplitPointSelector);

            // Both checkers receive the same impurity measure for both constructor arguments.
            categoricalBinarySplitQualityChecker = new InformationGainCalculator<string>(entropyMeasure, entropyMeasure);
            categoricalMultiValueSplitQualityChecker = new InformationGainCalculator<string>(entropyMeasure, entropyMeasure);

            multiValueCategoricalDataSplitter = new MultiValueDiscreteDataSplitter();
            multiValueBestSplitSelector = new MultiValueSplitSelectorForCategoricalOutcome(
                multiValueCategoricalDataSplitter,
                binaryNumericDataSplitter,
                binaryNumericBestSplitPointSelector);

            dynamicProgrammingBestNumericSplitFinder = new DynamicProgrammingNumericSplitFinder();
        }
示例#12
0
        /// <summary>
        /// Examines every column of <paramref name="baseData"/> except the dependent feature and returns
        /// the candidate split with the highest quality.
        /// </summary>
        /// <param name="baseData">Data frame whose columns are candidate splitting features.</param>
        /// <param name="dependentFeatureName">Name of the outcome feature; excluded from the candidates.</param>
        /// <param name="splitQualityChecker">Provides the initial entropy and scores candidate splits.</param>
        /// <param name="alreadyUsedAttributesInfo">Passed to the categorical split evaluation only.</param>
        /// <returns>
        /// The best split found, or <c>null</c> when no candidate scored above negative infinity.
        /// </returns>
        public ISplittingResult SelectBestSplit(
            IDataFrame baseData,
            string dependentFeatureName,
            ISplitQualityChecker splitQualityChecker,
            IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
        {
            ISplittingResult winner = null;
            double winnerQuality = double.NegativeInfinity;
            double initialEntropy = splitQualityChecker.GetInitialEntropy(baseData, dependentFeatureName);

            var candidateFeatures = baseData.ColumnNames.Except(new[] { dependentFeatureName });
            foreach (var featureName in candidateFeatures)
            {
                if (!baseData.GetColumnType(featureName).TypeIsNumeric())
                {
                    // Categorical feature: evaluation is delegated to the derived class.
                    var categoricalCandidate = EvaluateCategoricalSplit(
                        baseData,
                        dependentFeatureName,
                        featureName,
                        winnerQuality,
                        initialEntropy,
                        splitQualityChecker,
                        alreadyUsedAttributesInfo);
                    if (categoricalCandidate.Item3 > winnerQuality)
                    {
                        winner = BuildBestSplitObject(categoricalCandidate.Item2, categoricalCandidate.Item1);
                        winnerQuality = categoricalCandidate.Item3;
                    }

                    continue;
                }

                // Numeric feature.
                // TODO: add checking for the already used attributes
                var numericCandidate = BinaryNumericBestSplitingPointSelector.FindBestSplitPoint(
                    baseData,
                    dependentFeatureName,
                    featureName,
                    splitQualityChecker,
                    BinaryNumericDataSplitter,
                    initialEntropy);
                if (numericCandidate.Item2 > winnerQuality)
                {
                    winnerQuality = numericCandidate.Item2;
                    winner = numericCandidate.Item1;
                }
            }

            return winner;
        }
        /// <summary>
        /// Tries a binary split for each distinct value of <paramref name="splittingFeatureName"/> that was
        /// not already used, and reports the one with the highest quality.
        /// </summary>
        /// <param name="dataToSplit">Data frame to partition.</param>
        /// <param name="dependentFeatureName">Name of the outcome feature.</param>
        /// <param name="splittingFeatureName">Name of the categorical feature to split on.</param>
        /// <param name="bestSplitQualitySoFar">Not read by this implementation.</param>
        /// <param name="initialEntropy">Value forwarded to the split quality checker.</param>
        /// <param name="splitQualityChecker">Scores each candidate binary split.</param>
        /// <param name="alredyUsedAttributesInfo">Tells which feature/value pairs were already used.</param>
        /// <returns>
        /// A tuple of (partitions, splitting parameters, quality) for the best candidate. The first two items
        /// are <c>null</c> and the quality is negative infinity when no candidate scored above negative
        /// infinity. If any candidate yields exactly one partition, evaluation stops at once and an empty
        /// partition list with negative infinity quality is returned, regardless of earlier candidates.
        /// </returns>
        protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
            IDataFrame dataToSplit,
            string dependentFeatureName,
            string splittingFeatureName,
            double bestSplitQualitySoFar,
            double initialEntropy,
            ISplitQualityChecker splitQualityChecker,
            IAlredyUsedAttributesInfo alredyUsedAttributesInfo)
        {
            var rowCount = dataToSplit.RowCount;
            var distinctValues = dataToSplit.GetColumnVector(splittingFeatureName).Distinct();

            double topQuality = double.NegativeInfinity;
            IBinarySplittingParams topParams = null;
            IList<ISplittedData> topPartitions = null;

            foreach (var candidateValue in distinctValues)
            {
                if (alredyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(splittingFeatureName, candidateValue))
                {
                    continue;
                }

                var candidateParams = new BinarySplittingParams(splittingFeatureName, candidateValue, dependentFeatureName);
                var candidatePartitions = CategoricalDataSplitter.SplitData(dataToSplit, candidateParams);
                if (candidatePartitions.Count == 1)
                {
                    // A single partition aborts the whole evaluation, not just this candidate value.
                    return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
                        new List<ISplittedData>(),
                        candidateParams,
                        double.NegativeInfinity);
                }

                var candidateQuality = splitQualityChecker.CalculateSplitQuality(
                    initialEntropy,
                    rowCount,
                    candidatePartitions,
                    dependentFeatureName);
                if (candidateQuality > topQuality)
                {
                    topQuality = candidateQuality;
                    topPartitions = candidatePartitions;
                    topParams = candidateParams;
                }
            }

            return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
                topPartitions,
                topParams,
                topQuality);
        }
        /// <summary>
        /// Examines every column of <paramref name="baseData"/> except the dependent feature and returns
        /// the candidate split with the highest quality.
        /// </summary>
        /// <param name="baseData">Data frame whose columns are candidate splitting features.</param>
        /// <param name="dependentFeatureName">Name of the outcome feature; excluded from the candidates.</param>
        /// <param name="splitQualityChecker">Provides the initial entropy and scores candidate splits.</param>
        /// <param name="alreadyUsedAttributesInfo">Passed to the categorical split evaluation only.</param>
        /// <returns>
        /// The best split found, or <c>null</c> when no candidate scored above negative infinity.
        /// </returns>
        public ISplittingResult SelectBestSplit(
            IDataFrame baseData,
            string dependentFeatureName,
            ISplitQualityChecker splitQualityChecker,
            IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
        {
            ISplittingResult winner = null;
            double winnerQuality = double.NegativeInfinity;
            double initialEntropy = splitQualityChecker.GetInitialEntropy(baseData, dependentFeatureName);

            var candidateFeatures = baseData.ColumnNames.Except(new[] { dependentFeatureName });
            foreach (var featureName in candidateFeatures)
            {
                if (!baseData.GetColumnType(featureName).TypeIsNumeric())
                {
                    // Categorical feature: evaluation is delegated to the derived class.
                    var categoricalCandidate = EvaluateCategoricalSplit(
                        baseData,
                        dependentFeatureName,
                        featureName,
                        winnerQuality,
                        initialEntropy,
                        splitQualityChecker,
                        alreadyUsedAttributesInfo);
                    if (categoricalCandidate.Item3 > winnerQuality)
                    {
                        winner = BuildBestSplitObject(categoricalCandidate.Item2, categoricalCandidate.Item1);
                        winnerQuality = categoricalCandidate.Item3;
                    }

                    continue;
                }

                // Numeric feature.
                // TODO: add checking for the already used attributes
                var numericCandidate = BinaryNumericBestSplitingPointSelector.FindBestSplitPoint(
                    baseData,
                    dependentFeatureName,
                    featureName,
                    splitQualityChecker,
                    BinaryNumericDataSplitter,
                    initialEntropy);
                if (numericCandidate.Item2 > winnerQuality)
                {
                    winnerQuality = numericCandidate.Item2;
                    winner = numericCandidate.Item1;
                }
            }

            return winner;
        }
 /// <summary>
 /// Entry point taking a general split quality checker: verifies that it is a categorical checker and
 /// delegates to the overload that accepts <c>ICategoricalSplitQualityChecker</c>.
 /// </summary>
 /// <param name="baseData">Data frame containing the numeric feature.</param>
 /// <param name="dependentFeatureName">Name of the outcome feature.</param>
 /// <param name="numericFeatureToProcess">Name of the numeric feature to find a split point for.</param>
 /// <param name="splitQualityChecker">Must implement <c>ICategoricalSplitQualityChecker</c>.</param>
 /// <param name="binaryNumericDataSplitter">Splitter forwarded to the typed overload.</param>
 /// <param name="initialEntropy">Entropy value forwarded to the typed overload.</param>
 /// <returns>The result of the strongly typed overload.</returns>
 /// <exception cref="ArgumentException">
 /// <paramref name="splitQualityChecker"/> is null or is not a categorical split quality checker.
 /// </exception>
 public Tuple<ISplittingResult, double> FindBestSplitPoint(
     IDataFrame baseData,
     string dependentFeatureName,
     string numericFeatureToProcess,
     ISplitQualityChecker splitQualityChecker,
     IBinaryNumericDataSplitter binaryNumericDataSplitter,
     double initialEntropy)
 {
     // Fail fast on an incompatible checker instead of silently forwarding null to the typed overload.
     var categoricalChecker = splitQualityChecker as ICategoricalSplitQualityChecker;
     if (categoricalChecker == null)
     {
         throw new ArgumentException("Invalid split quality checker for categorical outcome");
     }

     return FindBestSplitPoint(
         baseData,
         dependentFeatureName,
         numericFeatureToProcess,
         categoricalChecker,
         binaryNumericDataSplitter,
         initialEntropy);
 }
 /// <summary>
 /// Entry point taking a general split quality checker: verifies that it is a categorical checker and
 /// delegates to the overload that accepts <c>ICategoricalSplitQualityChecker</c>.
 /// </summary>
 /// <param name="baseData">Data frame containing the numeric feature.</param>
 /// <param name="dependentFeatureName">Name of the outcome feature.</param>
 /// <param name="numericFeatureToProcess">Name of the numeric feature to find a split point for.</param>
 /// <param name="splitQualityChecker">Must implement <c>ICategoricalSplitQualityChecker</c>.</param>
 /// <param name="binaryNumericDataSplitter">Splitter forwarded to the typed overload.</param>
 /// <param name="initialEntropy">Entropy value forwarded to the typed overload.</param>
 /// <returns>The result of the strongly typed overload.</returns>
 /// <exception cref="ArgumentException">
 /// <paramref name="splitQualityChecker"/> is null or is not a categorical split quality checker.
 /// </exception>
 public Tuple<ISplittingResult, double> FindBestSplitPoint(
     IDataFrame baseData,
     string dependentFeatureName,
     string numericFeatureToProcess,
     ISplitQualityChecker splitQualityChecker,
     IBinaryNumericDataSplitter binaryNumericDataSplitter,
     double initialEntropy)
 {
     // Fail fast on an incompatible checker instead of silently forwarding null to the typed overload.
     var categoricalChecker = splitQualityChecker as ICategoricalSplitQualityChecker;
     if (categoricalChecker == null)
     {
         throw new ArgumentException("Invalid split quality checker for categorical outcome");
     }

     return FindBestSplitPoint(
         baseData,
         dependentFeatureName,
         numericFeatureToProcess,
         categoricalChecker,
         binaryNumericDataSplitter,
         initialEntropy);
 }
        /// <summary>
        /// Tries a binary split for each distinct value of <paramref name="splittingFeatureName"/> that was
        /// not already used, and reports the one with the highest quality.
        /// </summary>
        /// <param name="dataToSplit">Data frame to partition.</param>
        /// <param name="dependentFeatureName">Name of the outcome feature.</param>
        /// <param name="splittingFeatureName">Name of the categorical feature to split on.</param>
        /// <param name="bestSplitQualitySoFar">Not read by this implementation.</param>
        /// <param name="initialEntropy">Value forwarded to the split quality checker.</param>
        /// <param name="splitQualityChecker">Scores each candidate binary split.</param>
        /// <param name="alredyUsedAttributesInfo">Tells which feature/value pairs were already used.</param>
        /// <returns>
        /// A tuple of (partitions, splitting parameters, quality) for the best candidate. The first two items
        /// are <c>null</c> and the quality is negative infinity when no candidate scored above negative
        /// infinity. If any candidate yields exactly one partition, evaluation stops at once and an empty
        /// partition list with negative infinity quality is returned, regardless of earlier candidates.
        /// </returns>
        protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
            IDataFrame dataToSplit,
            string dependentFeatureName,
            string splittingFeatureName,
            double bestSplitQualitySoFar,
            double initialEntropy,
            ISplitQualityChecker splitQualityChecker,
            IAlredyUsedAttributesInfo alredyUsedAttributesInfo)
        {
            var rowCount = dataToSplit.RowCount;
            var distinctValues = dataToSplit.GetColumnVector(splittingFeatureName).Distinct();

            double topQuality = double.NegativeInfinity;
            IBinarySplittingParams topParams = null;
            IList<ISplittedData> topPartitions = null;

            foreach (var candidateValue in distinctValues)
            {
                if (alredyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(splittingFeatureName, candidateValue))
                {
                    continue;
                }

                var candidateParams = new BinarySplittingParams(splittingFeatureName, candidateValue, dependentFeatureName);
                var candidatePartitions = CategoricalDataSplitter.SplitData(dataToSplit, candidateParams);
                if (candidatePartitions.Count == 1)
                {
                    // A single partition aborts the whole evaluation, not just this candidate value.
                    return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
                        new List<ISplittedData>(),
                        candidateParams,
                        double.NegativeInfinity);
                }

                var candidateQuality = splitQualityChecker.CalculateSplitQuality(
                    initialEntropy,
                    rowCount,
                    candidatePartitions,
                    dependentFeatureName);
                if (candidateQuality > topQuality)
                {
                    topQuality = candidateQuality;
                    topPartitions = candidatePartitions;
                    topParams = candidateParams;
                }
            }

            return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
                topPartitions,
                topParams,
                topQuality);
        }
示例#18
0
        /// <summary>
        /// Creates a binary or multi-split decision tree model builder, substituting a default for every
        /// collaborator that is not supplied.
        /// </summary>
        /// <param name="binary">
        /// <c>true</c> builds a <c>BinaryDecisionTreeModelBuilder</c>; otherwise a
        /// <c>MultiSplitDecisionTreeModelBuilder</c> is built.
        /// </param>
        /// <param name="splitQualityChecker">Defaults to an information gain ratio calculator.</param>
        /// <param name="bestSplitSelector">
        /// Defaults to a selector matching the builder kind. In binary mode a selector that does not
        /// implement <c>IBinaryBestSplitSelector</c> is replaced by the default as well.
        /// </param>
        /// <param name="leafBuilder">Defaults to a categorical decision tree leaf builder.</param>
        /// <param name="statisticalSignificanceChecker">Passed through unchanged; may be <c>null</c>.</param>
        /// <returns>The configured model builder.</returns>
        private IDecisionTreeModelBuilder BuildCustomModelBuilder(
            bool binary = false,
            ISplitQualityChecker splitQualityChecker = null,
            IBestSplitSelector bestSplitSelector = null,
            ILeafBuilder leafBuilder = null,
            IStatisticalSignificanceChecker statisticalSignificanceChecker = null)
        {
            // The quality checker default is the same for both builder kinds.
            var qualityChecker = splitQualityChecker
                ?? new InformationGainRatioCalculator<string>(shannonEntropy, shannonEntropy as ICategoricalImpurityMeasure<string>);

            if (!binary)
            {
                var multiValueSelector = bestSplitSelector
                    ?? new MultiValueSplitSelectorForCategoricalOutcome(
                        new MultiValueDiscreteDataSplitter(),
                        new BinaryNumericDataSplitter(),
                        new DynamicProgrammingNumericSplitFinder());
                var multiSplitLeafBuilder = leafBuilder ?? new CategoricalDecisionTreeLeafBuilder();

                return new MultiSplitDecisionTreeModelBuilder(
                    qualityChecker,
                    multiValueSelector,
                    multiSplitLeafBuilder,
                    statisticalSignificanceChecker);
            }

            var binarySelector = (bestSplitSelector as IBinaryBestSplitSelector)
                ?? new BinarySplitSelectorForCategoricalOutcome(
                    new BinaryDiscreteDataSplitter(),
                    new BinaryNumericDataSplitter(),
                    new ClassBreakpointsNumericSplitFinder());
            var binaryLeafBuilder = leafBuilder ?? new CategoricalDecisionTreeLeafBuilder();

            return new BinaryDecisionTreeModelBuilder(
                qualityChecker,
                binarySelector,
                binaryLeafBuilder,
                statisticalSignificanceChecker);
        }
        /// <summary>
        /// Creates a binary or multi-split decision tree model builder, substituting a default for every
        /// collaborator that is not supplied.
        /// </summary>
        /// <param name="binary">
        /// <c>true</c> builds a <c>BinaryDecisionTreeModelBuilder</c>; otherwise a
        /// <c>MultiSplitDecisionTreeModelBuilder</c> is built.
        /// </param>
        /// <param name="splitQualityChecker">Defaults to an information gain ratio calculator.</param>
        /// <param name="bestSplitSelector">
        /// Defaults to a selector matching the builder kind. In binary mode a selector that does not
        /// implement <c>IBinaryBestSplitSelector</c> is replaced by the default as well.
        /// </param>
        /// <param name="leafBuilder">Defaults to a categorical decision tree leaf builder.</param>
        /// <param name="statisticalSignificanceChecker">Passed through unchanged; may be <c>null</c>.</param>
        /// <returns>The configured model builder.</returns>
        private IDecisionTreeModelBuilder BuildCustomModelBuilder(
            bool binary = false,
            ISplitQualityChecker splitQualityChecker = null,
            IBestSplitSelector bestSplitSelector = null,
            ILeafBuilder leafBuilder = null,
            IStatisticalSignificanceChecker statisticalSignificanceChecker = null)
        {
            // The quality checker default is the same for both builder kinds.
            var qualityChecker = splitQualityChecker
                ?? new InformationGainRatioCalculator<string>(shannonEntropy, shannonEntropy as ICategoricalImpurityMeasure<string>);

            if (!binary)
            {
                var multiValueSelector = bestSplitSelector
                    ?? new MultiValueSplitSelectorForCategoricalOutcome(
                        new MultiValueDiscreteDataSplitter(),
                        new BinaryNumericDataSplitter(),
                        new DynamicProgrammingNumericSplitFinder());
                var multiSplitLeafBuilder = leafBuilder ?? new CategoricalDecisionTreeLeafBuilder();

                return new MultiSplitDecisionTreeModelBuilder(
                    qualityChecker,
                    multiValueSelector,
                    multiSplitLeafBuilder,
                    statisticalSignificanceChecker);
            }

            var binarySelector = (bestSplitSelector as IBinaryBestSplitSelector)
                ?? new BinarySplitSelectorForCategoricalOutcome(
                    new BinaryDiscreteDataSplitter(),
                    new BinaryNumericDataSplitter(),
                    new ClassBreakpointsNumericSplitFinder());
            var binaryLeafBuilder = leafBuilder ?? new CategoricalDecisionTreeLeafBuilder();

            return new BinaryDecisionTreeModelBuilder(
                qualityChecker,
                binarySelector,
                binaryLeafBuilder,
                statisticalSignificanceChecker);
        }
 //TODO: AAA make it nicer - maybe encapsulate Tuple in some dto
 /// <summary>
 /// Evaluates splitting <paramref name="dataToSplit"/> on the categorical feature
 /// <paramref name="splittingFeatureName"/>. Derived classes supply the implementation.
 /// </summary>
 /// <param name="dataToSplit">Data frame to be split.</param>
 /// <param name="dependentFeatureName">Name of the outcome feature.</param>
 /// <param name="splittingFeatureName">Name of the categorical feature to split on.</param>
 /// <param name="bestSplitQualitySoFar">Best quality found by the caller so far.</param>
 /// <param name="initialEntropy">Entropy value computed by the caller before splitting.</param>
 /// <param name="splitQualityChecker">Checker used to score candidate splits.</param>
 /// <param name="alredyUsedAttributesInfo">Information about attributes that were already used.</param>
 /// <returns>
 /// A tuple whose items are, in order: the split data parts, the splitting parameters and a quality score.
 /// NOTE(review): implementations appear to use negative infinity as the "no usable split" score, and may
 /// leave the first two items empty or null in that case - confirm against the overrides.
 /// </returns>
 protected abstract Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
     IDataFrame dataToSplit, 
     string dependentFeatureName, 
     string splittingFeatureName, 
     double bestSplitQualitySoFar,
     double initialEntropy,
     ISplitQualityChecker splitQualityChecker,
     IAlredyUsedAttributesInfo alredyUsedAttributesInfo);