/// <summary>
        /// Builds the tree node for <paramref name="dataFrame"/>: either a leaf, or a parent node
        /// whose children are added from every subset of the selected split.
        /// </summary>
        /// <param name="dataFrame">Data the node is built from.</param>
        /// <param name="dependentFeatureName">Name of the column holding the dependent feature.</param>
        /// <param name="additionalParams">Builder parameters; consulted for the depth limit and the pruning-heuristic flag.</param>
        /// <param name="alreadyUsedAttributesInfo">Forwarded to the split selector and to child construction.</param>
        /// <param name="treeDepth">Depth of the node being built; children are added with <c>treeDepth + 1</c>.</param>
        /// <param name="isFirstSplit">When true the children are added with <c>Parallel.ForEach</c>, otherwise sequentially.</param>
        /// <returns>A leaf produced by <c>BuildLeaf</c> or a parent produced by <c>BuildConcreteDecisionTreeNode</c>.</returns>
        protected virtual IDecisionTreeNode BuildDecisionNode(
            IDataFrame dataFrame,
            string dependentFeatureName,
            IDecisionTreeModelBuilderParams additionalParams,
            IAlredyUsedAttributesInfo alreadyUsedAttributesInfo,
            int treeDepth,
            bool isFirstSplit = false)
        {
            // Stop splitting when the dependent column holds exactly one distinct value
            // or when the depth limit has been reached.
            var distinctDependentValues = dataFrame
                .GetColumnVector<object>(dependentFeatureName)
                .DataItems
                .Distinct()
                .Count();
            if (distinctDependentValues == 1 || MaximalTreeDepthHasBeenReached(additionalParams, treeDepth))
            {
                return BuildLeaf(dataFrame, dependentFeatureName);
            }

            // TODO: later on add additional params indicating which features were already used
            ISplittingResult chosenSplit = BestSplitSelector.SelectBestSplit(
                dataFrame,
                dependentFeatureName,
                SplitQualityChecker,
                alreadyUsedAttributesInfo);

            if (SplitIsEmpty(chosenSplit))
            {
                return BuildLeaf(dataFrame, dependentFeatureName);
            }

            // Optional pre-pruning: a split that is not statistically significant becomes a leaf.
            var significanceTestEnabled = additionalParams.UsePrunningHeuristicDuringTreeBuild
                                          && this.StatisticalSignificanceChecker != null;
            if (significanceTestEnabled
                && !StatisticalSignificanceChecker.IsSplitStatisticallySignificant(
                    dataFrame,
                    chosenSplit,
                    dependentFeatureName))
            {
                return BuildLeaf(dataFrame, dependentFeatureName);
            }

            var childNodes = new ConcurrentDictionary<IDecisionTreeLink, IDecisionTreeNode>();
            var childDepth = treeDepth + 1;

            if (!isFirstSplit)
            {
                foreach (var subset in chosenSplit.SplittedDataSets)
                {
                    this.AddChildFromSplit(dependentFeatureName, additionalParams, subset, childNodes, alreadyUsedAttributesInfo, childDepth);
                }
            }
            else
            {
                // Only the first split fans out in parallel.
                Parallel.ForEach(
                    chosenSplit.SplittedDataSets,
                    subset =>
                    {
                        this.AddChildFromSplit(dependentFeatureName, additionalParams, subset, childNodes, alreadyUsedAttributesInfo, childDepth);
                    });
            }

            return BuildConcreteDecisionTreeNode(chosenSplit, childNodes);
        }
        /// <summary>
        /// Searches for the best binary split threshold on a single numeric feature.
        /// </summary>
        /// <remarks>
        /// Rows are ordered by feature value (ties broken by row index). While walking the ordered rows,
        /// a threshold halfway between the previous row's value and the current row's value is evaluated
        /// whenever the current row differs both from the reference dependent value (the first row's, or
        /// that of the last evaluated candidate) and from the previous row's feature value.
        /// When two candidates have equal quality, the later one is kept.
        /// </remarks>
        /// <param name="baseData">Data frame to split.</param>
        /// <param name="dependentFeatureName">Name of the column holding the dependent feature.</param>
        /// <param name="numericFeatureToProcess">Name of the numeric column whose threshold is searched.</param>
        /// <param name="splitQualityChecker">Scores each candidate split.</param>
        /// <param name="binaryNumericDataSplitter">Performs the actual split for each candidate threshold.</param>
        /// <param name="initialEntropy">Entropy before splitting; forwarded to the quality checker.</param>
        /// <returns>
        /// The best split and its quality. If no candidate was evaluated (including when the data frame
        /// has no rows) the split is <c>null</c> and the quality is <see cref="double.NegativeInfinity"/>.
        /// </returns>
        public Tuple <ISplittingResult, double> FindBestSplitPoint(
            IDataFrame baseData,
            string dependentFeatureName,
            string numericFeatureToProcess,
            ICategoricalSplitQualityChecker splitQualityChecker,
            IBinaryNumericDataSplitter binaryNumericDataSplitter,
            double initialEntropy)
        {
            ISplittingResult bestSplit        = null;
            double           bestSplitQuality = double.NegativeInfinity;
            var totalRowsCount = baseData.RowCount;
            var sortedRowData  =
                baseData.GetNumericColumnVector(numericFeatureToProcess)
                .AsParallel()
                .Select((val, rowIdx) =>
                        new
                        {
                            RowIdx                = rowIdx,
                            FeatureValue          = val,
                            DependentFeatureValue = baseData[rowIdx, dependentFeatureName].FeatureValue
                        })
                .OrderBy(elem => elem.FeatureValue).ThenBy(elem => elem.RowIdx)
                .ToList();

            // With no rows there is no candidate threshold, and reading sortedRowData[0] below
            // would throw ArgumentOutOfRangeException - report "no split" instead.
            if (sortedRowData.Count == 0)
            {
                return new Tuple<ISplittingResult, double>(bestSplit, bestSplitQuality);
            }

            var previousClass      = sortedRowData[0].DependentFeatureValue;
            var previousFeatureVal = sortedRowData[0].FeatureValue;

            foreach (var rowData in sortedRowData)
            {
                var currentClass      = rowData.DependentFeatureValue;
                var currentFeatureVal = rowData.FeatureValue;
                if (!currentClass.Equals(previousClass) && !currentFeatureVal.Equals(previousFeatureVal))
                {
                    var halfWay     = (previousFeatureVal + currentFeatureVal) / 2.0;
                    var splitParams = new BinarySplittingParams(numericFeatureToProcess, halfWay, dependentFeatureName);
                    var splitResult = binaryNumericDataSplitter.SplitData(baseData, splitParams);
                    var quality     = splitQualityChecker.CalculateSplitQuality(
                        initialEntropy,
                        totalRowsCount,
                        splitResult,
                        dependentFeatureName);

                    // ">=" means a later candidate replaces an earlier one of equal quality.
                    if (quality >= bestSplitQuality)
                    {
                        bestSplitQuality = quality;
                        bestSplit        = new BinarySplittingResult(true, numericFeatureToProcess, splitResult, halfWay);
                    }

                    // The reference class only moves forward once a candidate has been evaluated.
                    previousClass = currentClass;
                }

                previousFeatureVal = currentFeatureVal;
            }

            return new Tuple<ISplittingResult, double>(bestSplit, bestSplitQuality);
        }
        /// <summary>
        /// Decides whether a split is statistically significant using a chi-squared statistic that
        /// compares, for every subset of the split, the observed count of each dependent value with
        /// the count expected from that value's relative frequency in the unsplit data.
        /// </summary>
        /// <param name="initialDataFrame">Data before the split; supplies the overall dependent-value frequencies.</param>
        /// <param name="splittingResults">Split under test.</param>
        /// <param name="dependentFeatureName">Name of the column holding the dependent feature.</param>
        /// <returns>
        /// <c>true</c> for every numeric split (these are not tested) and for splits whose p-value is below
        /// <c>significanceLevel</c>; <c>false</c> otherwise, including when <c>ChiSquared.IsValidParameterSet</c>
        /// rejects the degrees of freedom.
        /// </returns>
        public bool IsSplitStatisticallySignificant(
            IDataFrame initialDataFrame,
            ISplittingResult splittingResults,
            string dependentFeatureName)
        {
            // Numeric splits are accepted unconditionally, so skip all counting work for them.
            if (splittingResults.IsSplitNumeric)
            {
                return(true);
            }

            // Relative frequency of each dependent value in the unsplit data.
            var uniqueDependentValuesCounts = initialDataFrame
                                              .GetColumnVector(dependentFeatureName)
                                              .Values
                                              .GroupBy(elem => elem)
                                              .ToDictionary(grp => grp.Key, grp => grp.Count() / (double)initialDataFrame.RowCount);
            var chisquareStatisticSum = 0.0;

            // Expected counts are derived from the pooled frequencies, i.e. this is a test of independence
            // on a (dependent values) x (subsets) table, which has (r - 1) * (c - 1) degrees of freedom.
            var degreesOfFreedom = (uniqueDependentValuesCounts.Keys.Count - 1)
                                   * (splittingResults.SplittedDataSets.Count - 1);

            foreach (var splittingResult in splittingResults.SplittedDataSets)
            {
                var splitSize = splittingResult.SplittedDataFrame.RowCount;
                var actualDependentFeatureValues =
                    splittingResult.SplittedDataFrame.GetColumnVector(dependentFeatureName)
                    .Values.GroupBy(elem => elem)
                    .ToDictionary(grp => grp.Key, grp => grp.Count());
                foreach (var uniqueDependentValueCount in uniqueDependentValuesCounts)
                {
                    var expectedCount = uniqueDependentValueCount.Value * splitSize;

                    // A value absent from this subset leaves actualCount at its default of 0.
                    int actualCount;
                    actualDependentFeatureValues.TryGetValue(uniqueDependentValueCount.Key, out actualCount);

                    chisquareStatisticSum += Math.Pow(actualCount - expectedCount, 2) / expectedCount;
                }
            }

            if (!ChiSquared.IsValidParameterSet(degreesOfFreedom))
            {
                return(false);
            }

            // Upper-tail probability of the statistic under the chi-squared distribution.
            var statisticValue = 1 - ChiSquared.CDF(degreesOfFreedom, chisquareStatisticSum);
            return(statisticValue < significanceLevel);
        }
        /// <summary>
        /// Decides whether a split is statistically significant using a chi-squared statistic that
        /// compares, for every subset of the split, the observed count of each dependent value with
        /// the count expected from that value's relative frequency in the unsplit data.
        /// </summary>
        /// <param name="initialDataFrame">Data before the split; supplies the overall dependent-value frequencies.</param>
        /// <param name="splittingResults">Split under test.</param>
        /// <param name="dependentFeatureName">Name of the column holding the dependent feature.</param>
        /// <returns>
        /// <c>true</c> for every numeric split (these are not tested) and for splits whose p-value is below
        /// <c>significanceLevel</c>; <c>false</c> otherwise, including when <c>ChiSquared.IsValidParameterSet</c>
        /// rejects the degrees of freedom.
        /// </returns>
        public bool IsSplitStatisticallySignificant(
            IDataFrame initialDataFrame,
            ISplittingResult splittingResults,
            string dependentFeatureName)
        {
            // Numeric splits are accepted unconditionally, so skip all counting work for them.
            if (splittingResults.IsSplitNumeric)
            {
                return true;
            }

            // Relative frequency of each dependent value in the unsplit data.
            var uniqueDependentValuesCounts = initialDataFrame
                .GetColumnVector(dependentFeatureName)
                .Values
                .GroupBy(elem => elem)
                .ToDictionary(grp => grp.Key, grp => grp.Count() / (double)initialDataFrame.RowCount);

            // Expected counts are derived from the pooled frequencies, i.e. this is a test of independence
            // on a (dependent values) x (subsets) table, which has (r - 1) * (c - 1) degrees of freedom.
            var degreesOfFreedom = (uniqueDependentValuesCounts.Keys.Count - 1)
                                   * (splittingResults.SplittedDataSets.Count - 1);

            var chisquareStatisticSum = 0.0;
            foreach (var splittingResult in splittingResults.SplittedDataSets)
            {
                var splitSize = splittingResult.SplittedDataFrame.RowCount;
                var actualDependentFeatureValues =
                    splittingResult.SplittedDataFrame.GetColumnVector(dependentFeatureName)
                        .Values.GroupBy(elem => elem)
                        .ToDictionary(grp => grp.Key, grp => grp.Count());
                foreach (var uniqueDependentValueCount in uniqueDependentValuesCounts)
                {
                    var expectedCount = uniqueDependentValueCount.Value * splitSize;

                    // A value absent from this subset leaves actualCount at its default of 0.
                    int actualCount;
                    actualDependentFeatureValues.TryGetValue(uniqueDependentValueCount.Key, out actualCount);

                    chisquareStatisticSum += Math.Pow(actualCount - expectedCount, 2) / expectedCount;
                }
            }

            if (!ChiSquared.IsValidParameterSet(degreesOfFreedom))
            {
                return false;
            }

            // Upper-tail probability of the statistic under the chi-squared distribution.
            var statisticValue = 1 - ChiSquared.CDF(degreesOfFreedom, chisquareStatisticSum);
            return statisticValue < significanceLevel;
        }
        /// <summary>
        /// Creates a binary parent node from a binary split result.
        /// </summary>
        /// <param name="splittingResult">Split result; must implement <c>IBinarySplittingResult</c>.</param>
        /// <param name="children">Child nodes keyed by the link that leads to them.</param>
        /// <returns>A <c>BinaryDecisionTreeParentNode</c> carrying the split's feature name, value and numeric flag.</returns>
        /// <exception cref="ArgumentException"><paramref name="splittingResult"/> is not a binary split result.</exception>
        protected override IDecisionTreeNode BuildConcreteDecisionTreeNode(ISplittingResult splittingResult, ConcurrentDictionary<IDecisionTreeLink, IDecisionTreeNode> children)
        {
            // Guard first: anything that is not a binary split (including null) is rejected.
            if (!(splittingResult is IBinarySplittingResult))
            {
                throw new ArgumentException("Invalid split results passed to binary decision tree builder");
            }

            var binarySplit = (IBinarySplittingResult)splittingResult;
            var splittingFeatureName = splittingResult.SplittingFeatureName;

            return new BinaryDecisionTreeParentNode(
                false,
                splittingFeatureName,
                children,
                binarySplit.SplittingValue,
                binarySplit.IsSplitNumeric);
        }
Exemplo n.º 6
0
        /// <summary>
        /// Evaluates every column except the dependent one and returns the split with the highest quality.
        /// </summary>
        /// <param name="baseData">Data frame to split.</param>
        /// <param name="dependentFeatureName">Name of the column holding the dependent feature; never used as a split attribute.</param>
        /// <param name="splitQualityChecker">Supplies the initial entropy and scores candidate splits.</param>
        /// <param name="alreadyUsedAttributesInfo">Forwarded to the categorical split evaluation.</param>
        /// <returns>The best split found, or <c>null</c> when no candidate scored above negative infinity.</returns>
        public ISplittingResult SelectBestSplit(
            IDataFrame baseData,
            string dependentFeatureName,
            ISplitQualityChecker splitQualityChecker,
            IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
        {
            ISplittingResult winningSplit = null;
            double winningQuality = double.NegativeInfinity;
            double startingEntropy = splitQualityChecker.GetInitialEntropy(baseData, dependentFeatureName);
            var candidateAttributes = baseData.ColumnNames.Except(new[] { dependentFeatureName });

            foreach (var candidate in candidateAttributes)
            {
                if (!baseData.GetColumnType(candidate).TypeIsNumeric())
                {
                    // Categorical attribute: Item1/Item2 feed the split object, Item3 is the quality.
                    var categoricalOutcome = EvaluateCategoricalSplit(
                        baseData,
                        dependentFeatureName,
                        candidate,
                        winningQuality,
                        startingEntropy,
                        splitQualityChecker,
                        alreadyUsedAttributesInfo);
                    if (categoricalOutcome.Item3 > winningQuality)
                    {
                        winningSplit = BuildBestSplitObject(categoricalOutcome.Item2, categoricalOutcome.Item1);
                        winningQuality = categoricalOutcome.Item3;
                    }

                    continue;
                }

                // Numeric attribute: Item1 is the split, Item2 its quality.
                // TODO: add checking for the already used attributes
                var numericOutcome = BinaryNumericBestSplitingPointSelector.FindBestSplitPoint(
                    baseData,
                    dependentFeatureName,
                    candidate,
                    splitQualityChecker,
                    BinaryNumericDataSplitter,
                    startingEntropy);
                if (numericOutcome.Item2 > winningQuality)
                {
                    winningQuality = numericOutcome.Item2;
                    winningSplit = numericOutcome.Item1;
                }
            }

            return winningSplit;
        }
Exemplo n.º 7
0
        /// <summary>
        /// Creates a binary parent node from a binary split result.
        /// </summary>
        /// <param name="splittingResult">Split result; must implement <c>IBinarySplittingResult</c>.</param>
        /// <param name="children">Child nodes keyed by the link that leads to them.</param>
        /// <returns>A <c>BinaryDecisionTreeParentNode</c> carrying the split's feature name, value and numeric flag.</returns>
        /// <exception cref="ArgumentException"><paramref name="splittingResult"/> is not a binary split result.</exception>
        protected override IDecisionTreeNode BuildConcreteDecisionTreeNode(ISplittingResult splittingResult, ConcurrentDictionary <IDecisionTreeLink, IDecisionTreeNode> children)
        {
            var asBinarySplit = splittingResult as IBinarySplittingResult;

            if (asBinarySplit != null)
            {
                return new BinaryDecisionTreeParentNode(
                    false,
                    splittingResult.SplittingFeatureName,
                    children,
                    asBinarySplit.SplittingValue,
                    asBinarySplit.IsSplitNumeric);
            }

            // Reached only when the cast failed (wrong type or null input).
            throw new ArgumentException("Invalid split results passed to binary decision tree builder");
        }
 /// <summary>
 /// Creates the parent node matching the kind of split: a binary parent node for binary split
 /// results, a plain parent node for everything else.
 /// </summary>
 /// <param name="splittingResult">Split the node represents.</param>
 /// <param name="children">Child nodes keyed by the link that leads to them.</param>
 /// <returns>A <c>BinaryDecisionTreeParentNode</c> or a <c>DecisionTreeParentNode</c>.</returns>
 protected override IDecisionTreeNode BuildConcreteDecisionTreeNode(ISplittingResult splittingResult, ConcurrentDictionary <IDecisionTreeLink, IDecisionTreeNode> children)
 {
     var binarySplit = splittingResult as IBinarySplittingResult;

     // Not a binary split: fall back to the generic parent node.
     if (binarySplit == null)
     {
         return new DecisionTreeParentNode(
             false,
             splittingResult.SplittingFeatureName,
             children);
     }

     return new BinaryDecisionTreeParentNode(
         false,
         splittingResult.SplittingFeatureName,
         children,
         binarySplit.SplittingValue,
         binarySplit.IsSplitNumeric);
 }
 /// <summary>
 /// Creates the parent node matching the kind of split: a binary parent node for binary split
 /// results, a plain parent node for everything else.
 /// </summary>
 /// <param name="splittingResult">Split the node represents.</param>
 /// <param name="children">Child nodes keyed by the link that leads to them.</param>
 /// <returns>A <c>BinaryDecisionTreeParentNode</c> or a <c>DecisionTreeParentNode</c>.</returns>
 protected override IDecisionTreeNode BuildConcreteDecisionTreeNode(ISplittingResult splittingResult, ConcurrentDictionary<IDecisionTreeLink, IDecisionTreeNode> children)
 {
     IDecisionTreeNode parentNode;
     var binaryView = splittingResult as IBinarySplittingResult;

     if (binaryView != null)
     {
         // Binary splits additionally carry the splitting value and the numeric flag.
         parentNode = new BinaryDecisionTreeParentNode(
             false,
             splittingResult.SplittingFeatureName,
             children,
             binaryView.SplittingValue,
             binaryView.IsSplitNumeric);
     }
     else
     {
         parentNode = new DecisionTreeParentNode(
             false,
             splittingResult.SplittingFeatureName,
             children);
     }

     return parentNode;
 }
 /// <summary>
 /// Tells whether a split result is unusable: it is <c>null</c>, or at least one of its subsets
 /// is <c>null</c>, has no data frame, or has a data frame with zero rows.
 /// </summary>
 /// <param name="splitResult">Split result to inspect; may be <c>null</c>.</param>
 /// <returns><c>true</c> when the split is unusable, otherwise <c>false</c>.</returns>
 private static bool SplitIsEmpty(ISplittingResult splitResult)
 {
     if (splitResult == null)
     {
         return true;
     }

     return splitResult.SplittedDataSets.Any(subset =>
     {
         var frame = subset?.SplittedDataFrame;
         return frame == null || frame.RowCount == 0;
     });
 }
 /// <summary>
 /// Creates the concrete parent node for a split; each derived builder supplies its own node type.
 /// </summary>
 /// <param name="splittingResult">Split the node represents.</param>
 /// <param name="children">Child nodes keyed by the link that leads to them.</param>
 /// <returns>The node created for the split.</returns>
 protected abstract IDecisionTreeNode BuildConcreteDecisionTreeNode(
     ISplittingResult splittingResult, 
     ConcurrentDictionary<IDecisionTreeLink, IDecisionTreeNode> children);
 /// <summary>
 /// Creates the concrete parent node for a split; each derived builder supplies its own node type.
 /// </summary>
 /// <param name="splittingResult">Split the node represents.</param>
 /// <param name="children">Child nodes keyed by the link that leads to them.</param>
 /// <returns>The node created for the split.</returns>
 protected abstract IDecisionTreeNode BuildConcreteDecisionTreeNode(
     ISplittingResult splittingResult,
     ConcurrentDictionary <IDecisionTreeLink, IDecisionTreeNode> children);
 /// <summary>
 /// Tells whether a split result is unusable: it is <c>null</c>, or at least one of its subsets
 /// is <c>null</c>, has no data frame, or has a data frame with zero rows.
 /// </summary>
 /// <param name="splitResult">Split result to inspect; may be <c>null</c>.</param>
 /// <returns><c>true</c> when the split is unusable, otherwise <c>false</c>.</returns>
 private static bool SplitIsEmpty(ISplittingResult splitResult)
 {
     if (splitResult == null)
     {
         return true;
     }

     // The split is usable only if every subset has a data frame with at least one row.
     var everySubsetHasRows = splitResult.SplittedDataSets.All(
         subset => subset?.SplittedDataFrame != null && subset.SplittedDataFrame.RowCount != 0);
     return !everySubsetHasRows;
 }