Ejemplo n.º 1
0
        /// <summary>
        /// Checks that requesting the column at index 0 as <c>object</c> values yields a
        /// vector equal to the expected one (three string items, named "C1").
        /// </summary>
        public void Test_GetColumnVector_ByIndex_ObjectVarsion()
        {
            // Given: the items and the name the first column is expected to carry
            var firstColumnItems = new object[] { "a1.1", "a2.1", "a3.1" };
            var expected = new DataVector<object>(firstColumnItems, "C1");

            // When: the column is fetched by its positional index
            var actual = _subject.GetColumnVector<object>(0);

            // Then
            Assert.AreEqual(expected, actual);
        }
        /// <summary>
        /// Builds the tree node for <paramref name="dataFrame"/>: a leaf when no further
        /// split should be made, otherwise a decision node with one child per split subset.
        /// </summary>
        /// <param name="dataFrame">Data reaching this node.</param>
        /// <param name="dependentFeatureName">Name of the column holding the predicted value.</param>
        /// <param name="additionalParams">Builder settings; the pruning-heuristic flag is read here and the object is forwarded to the depth check and to child construction.</param>
        /// <param name="alreadyUsedAttributesInfo">Forwarded to split selection and to child construction.</param>
        /// <param name="treeDepth">Depth of this node; children are built with <c>treeDepth + 1</c>.</param>
        /// <param name="isFirstSplit">When true the children are added via <c>Parallel.ForEach</c>, otherwise in a plain loop.</param>
        /// <returns>
        /// A leaf if the dependent column has exactly one distinct value, the maximal depth is
        /// reached, the selected split is empty, or the (optional) significance check rejects
        /// the split; otherwise the node produced by <c>BuildConcreteDecisionTreeNode</c>.
        /// </returns>
        protected virtual IDecisionTreeNode BuildDecisionNode(
            IDataFrame dataFrame,
            string dependentFeatureName,
            IDecisionTreeModelBuilderParams additionalParams,
            IAlredyUsedAttributesInfo alreadyUsedAttributesInfo,
            int treeDepth,
            bool isFirstSplit = false)
        {
            var hasSingleOutcome =
                dataFrame.GetColumnVector<object>(dependentFeatureName).DataItems.Distinct().Count() == 1;
            if (hasSingleOutcome || MaximalTreeDepthHasBeenReached(additionalParams, treeDepth))
            {
                return BuildLeaf(dataFrame, dependentFeatureName);
            }

            // TODO: later on add additional params indicating which features were already used
            ISplittingResult bestSplit = BestSplitSelector.SelectBestSplit(
                dataFrame,
                dependentFeatureName,
                SplitQualityChecker,
                alreadyUsedAttributesInfo);

            if (SplitIsEmpty(bestSplit))
            {
                return BuildLeaf(dataFrame, dependentFeatureName);
            }

            // The significance check only runs when it is both requested and available.
            var significanceCheckEnabled =
                additionalParams.UsePrunningHeuristicDuringTreeBuild && this.StatisticalSignificanceChecker != null;
            if (significanceCheckEnabled
                && !StatisticalSignificanceChecker.IsSplitStatisticallySignificant(dataFrame, bestSplit, dependentFeatureName))
            {
                return BuildLeaf(dataFrame, dependentFeatureName);
            }

            var childNodes = new ConcurrentDictionary<IDecisionTreeLink, IDecisionTreeNode>();
            var childDepth = treeDepth + 1;

            if (!isFirstSplit)
            {
                foreach (var subset in bestSplit.SplittedDataSets)
                {
                    this.AddChildFromSplit(dependentFeatureName, additionalParams, subset, childNodes, alreadyUsedAttributesInfo, childDepth);
                }
            }
            else
            {
                Parallel.ForEach(
                    bestSplit.SplittedDataSets,
                    subset =>
                    {
                        this.AddChildFromSplit(dependentFeatureName, additionalParams, subset, childNodes, alreadyUsedAttributesInfo, childDepth);
                    });
            }

            return BuildConcreteDecisionTreeNode(bestSplit, childNodes);
        }
        /// <summary>
        /// Separates a data frame into the pieces used for training: the matrix of all
        /// columns except the dependent one, the dependent column's values, and the names
        /// of the columns that went into the matrix.
        /// </summary>
        /// <param name="dataFrame">Source data.</param>
        /// <param name="dependentFeatureName">Name of the column excluded from the matrix and returned as the expected outcomes.</param>
        /// <returns>A tuple of (training matrix, expected outcomes, data column names).</returns>
        protected virtual Tuple<Matrix<double>, IList<TPredictionResult>, IList<string>> PrepareTrainingData(
            IDataFrame dataFrame,
            string dependentFeatureName)
        {
            // Every column but the dependent one is treated as a data column.
            var featureColumns = (from col in dataFrame.ColumnNames
                                  where col != dependentFeatureName
                                  select col).ToList();
            var featureMatrix = dataFrame.GetSubsetByColumns(featureColumns).GetAsMatrix();
            IDataVector<TPredictionResult> outcomes = dataFrame.GetColumnVector<TPredictionResult>(dependentFeatureName);

            return new Tuple<Matrix<double>, IList<TPredictionResult>, IList<string>>(
                featureMatrix,
                outcomes,
                featureColumns);
        }
        /// <summary>
        /// Decides whether a split is statistically significant using a Pearson chi-squared
        /// statistic: for every split subset and every dependent value, the observed count
        /// is compared with the count expected from the value's relative frequency in
        /// <paramref name="initialDataFrame"/>.
        /// </summary>
        /// <param name="initialDataFrame">Data before splitting; supplies the overall frequency of each dependent value.</param>
        /// <param name="splittingResults">The split to test; its subsets supply the observed counts.</param>
        /// <param name="dependentFeatureName">Name of the column holding the dependent value.</param>
        /// <returns>
        /// <c>true</c> for numeric splits (they are not tested), or when the upper-tail
        /// probability of the statistic is below <c>significanceLevel</c>; <c>false</c>
        /// otherwise, including when the degrees of freedom are rejected by
        /// <c>ChiSquared.IsValidParameterSet</c>.
        /// </returns>
        public bool IsSplitStatisticallySignificant(
            IDataFrame initialDataFrame,
            ISplittingResult splittingResults,
            string dependentFeatureName)
        {
            // Numeric splits are accepted untested, so bail out before grouping any data.
            if (splittingResults.IsSplitNumeric)
            {
                return true;
            }

            // Relative frequency of every dependent value in the unsplit data.
            var expectedFrequencies = initialDataFrame
                .GetColumnVector(dependentFeatureName)
                .Values
                .GroupBy(elem => elem)
                .ToDictionary(grp => grp.Key, grp => grp.Count() / (double)initialDataFrame.RowCount);

            // The statistic below is summed over an r x c table (dependent values x subsets),
            // so the reference distribution has (r - 1) * (c - 1) degrees of freedom.
            var degreesOfFreedom = (expectedFrequencies.Count - 1)
                                   * (splittingResults.SplittedDataSets.Count - 1);

            var chisquareStatisticSum = 0.0;
            foreach (var splittingResult in splittingResults.SplittedDataSets)
            {
                var splitSize = splittingResult.SplittedDataFrame.RowCount;
                var observedCounts =
                    splittingResult.SplittedDataFrame.GetColumnVector(dependentFeatureName)
                    .Values.GroupBy(elem => elem)
                    .ToDictionary(grp => grp.Key, grp => grp.Count());

                foreach (var expectedFrequency in expectedFrequencies)
                {
                    var expectedCount = expectedFrequency.Value * splitSize;

                    // A value absent from this subset leaves actualCount at its default of 0.
                    int actualCount;
                    observedCounts.TryGetValue(expectedFrequency.Key, out actualCount);

                    chisquareStatisticSum += Math.Pow(actualCount - expectedCount, 2) / expectedCount;
                }
            }

            if (!ChiSquared.IsValidParameterSet(degreesOfFreedom))
            {
                return false;
            }

            // Upper-tail probability of observing a statistic at least this large.
            var statisticValue = 1 - ChiSquared.CDF(degreesOfFreedom, chisquareStatisticSum);
            return statisticValue < significanceLevel;
        }
 /// <summary>
 /// Builds a leaf labelled with the value that occurs most often in the dependent
 /// column of <paramref name="finalData"/>.
 /// </summary>
 /// <param name="finalData">Data that ended up in this leaf.</param>
 /// <param name="dependentFeatureName">Name of the column holding the dependent value.</param>
 /// <returns>A <c>DecisionTreeLeaf</c> for the dependent feature and its most frequent value.</returns>
 public IDecisionTreeLeaf BuildLeaf(IDataFrame finalData, string dependentFeatureName)
 {
     var occurrences = new Dictionary<object, int>();
     foreach (var value in finalData.GetColumnVector(dependentFeatureName))
     {
         // A value not seen before leaves seenSoFar at 0.
         int seenSoFar;
         occurrences.TryGetValue(value, out seenSoFar);
         occurrences[value] = seenSoFar + 1;
     }

     // Ascending stable sort: the last entry carries the highest count.
     var mostFrequent = occurrences.OrderBy(entry => entry.Value).Last();
     return new DecisionTreeLeaf(dependentFeatureName, mostFrequent.Key);
 }
        /// <summary>
        /// Tests a split for statistical significance with a Pearson chi-squared statistic.
        /// Observed counts of each dependent value in every split subset are compared with
        /// the counts expected from that value's relative frequency in the unsplit data.
        /// </summary>
        /// <param name="initialDataFrame">Data before splitting; source of the expected frequencies.</param>
        /// <param name="splittingResults">Split under test; source of the observed counts.</param>
        /// <param name="dependentFeatureName">Name of the column holding the dependent value.</param>
        /// <returns>
        /// <c>true</c> when the split is numeric (no test is performed) or when the
        /// upper-tail probability of the statistic is below <c>significanceLevel</c>;
        /// otherwise <c>false</c>.
        /// </returns>
        public bool IsSplitStatisticallySignificant(
            IDataFrame initialDataFrame,
            ISplittingResult splittingResults,
            string dependentFeatureName)
        {
            // Checked first so that no grouping work is done for splits that are never tested.
            if (splittingResults.IsSplitNumeric)
            {
                return true;
            }

            // Share of each dependent value in the data before the split.
            var classFrequencies = initialDataFrame
                .GetColumnVector(dependentFeatureName)
                .Values
                .GroupBy(elem => elem)
                .ToDictionary(grp => grp.Key, grp => grp.Count() / (double)initialDataFrame.RowCount);

            // An r x c contingency table (dependent values x subsets) has
            // (r - 1) * (c - 1) degrees of freedom, not their sum.
            var degreesOfFreedom = (classFrequencies.Count - 1)
                                   * (splittingResults.SplittedDataSets.Count - 1);

            var chisquareStatisticSum = 0.0;
            foreach (var splittingResult in splittingResults.SplittedDataSets)
            {
                var splitSize = splittingResult.SplittedDataFrame.RowCount;
                var countsInSubset =
                    splittingResult.SplittedDataFrame.GetColumnVector(dependentFeatureName)
                        .Values.GroupBy(elem => elem)
                        .ToDictionary(grp => grp.Key, grp => grp.Count());

                foreach (var classFrequency in classFrequencies)
                {
                    var expectedCount = classFrequency.Value * splitSize;

                    // Stays 0 when the value does not occur in this subset.
                    int actualCount;
                    countsInSubset.TryGetValue(classFrequency.Key, out actualCount);

                    chisquareStatisticSum += Math.Pow(actualCount - expectedCount, 2) / expectedCount;
                }
            }

            if (!ChiSquared.IsValidParameterSet(degreesOfFreedom))
            {
                return false;
            }

            // Probability of a statistic at least this large under the reference distribution.
            var statisticValue = 1 - ChiSquared.CDF(degreesOfFreedom, chisquareStatisticSum);
            return statisticValue < significanceLevel;
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Creates a leaf whose value is the most frequent entry of the dependent column
        /// in <paramref name="finalData"/>.
        /// </summary>
        /// <param name="finalData">Data assigned to the leaf.</param>
        /// <param name="dependentFeatureName">Name of the column holding the dependent value.</param>
        /// <returns>A <c>DecisionTreeLeaf</c> built from the feature name and the winning value.</returns>
        public IDecisionTreeLeaf BuildLeaf(IDataFrame finalData, string dependentFeatureName)
        {
            var tally = new Dictionary<object, int>();

            foreach (var item in finalData.GetColumnVector(dependentFeatureName))
            {
                // previous is 0 for a value encountered for the first time.
                int previous;
                tally.TryGetValue(item, out previous);
                tally[item] = previous + 1;
            }

            // After a stable ascending sort the entry with the highest count is last.
            var winner = tally.OrderBy(pair => pair.Value).Last();
            return new DecisionTreeLeaf(dependentFeatureName, winner.Key);
        }
        /// <summary>
        /// Tries a binary split on every distinct value of <paramref name="splittingFeatureName"/>
        /// that has not been used yet and keeps the one with the highest quality.
        /// </summary>
        /// <param name="dataToSplit">Data to split.</param>
        /// <param name="dependentFeatureName">Name of the column holding the dependent value.</param>
        /// <param name="splittingFeatureName">Categorical column whose values are tried as split points.</param>
        /// <param name="bestSplitQualitySoFar">Not read by this implementation.</param>
        /// <param name="initialEntropy">Forwarded to the quality checker.</param>
        /// <param name="splitQualityChecker">Scores each candidate split.</param>
        /// <param name="alredyUsedAttributesInfo">Tells which feature/value pairs must be skipped.</param>
        /// <returns>
        /// (split data, split params, quality) of the best candidate. If some candidate produces a
        /// single subset, the method returns at once with an empty list and negative infinity.
        /// If no candidate is evaluated, data and params are <c>null</c> and quality is negative infinity.
        /// </returns>
        protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
            IDataFrame dataToSplit,
            string dependentFeatureName,
            string splittingFeatureName,
            double bestSplitQualitySoFar,
            double initialEntropy,
            ISplitQualityChecker splitQualityChecker,
            IAlredyUsedAttributesInfo alredyUsedAttributesInfo)
        {
            var rowCount = dataToSplit.RowCount;
            var candidateValues = dataToSplit.GetColumnVector(splittingFeatureName).Distinct();

            double winningQuality = double.NegativeInfinity;
            IBinarySplittingParams winningParams = null;
            IList<ISplittedData> winningSplit = null;

            foreach (var candidateValue in candidateValues)
            {
                if (alredyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(splittingFeatureName, candidateValue))
                {
                    continue;
                }

                var candidateParams = new BinarySplittingParams(splittingFeatureName, candidateValue, dependentFeatureName);
                var candidateSplit = CategoricalDataSplitter.SplitData(dataToSplit, candidateParams);

                // A single subset means the data could not be divided; give up on this feature.
                if (candidateSplit.Count == 1)
                {
                    return new Tuple<IList<ISplittedData>, ISplittingParams, double>(
                        new List<ISplittedData>(),
                        candidateParams,
                        double.NegativeInfinity);
                }

                var candidateQuality = splitQualityChecker.CalculateSplitQuality(
                    initialEntropy,
                    rowCount,
                    candidateSplit,
                    dependentFeatureName);
                if (candidateQuality > winningQuality)
                {
                    winningQuality = candidateQuality;
                    winningSplit = candidateSplit;
                    winningParams = candidateParams;
                }
            }

            return new Tuple<IList<ISplittedData>, ISplittingParams, double>(
                winningSplit,
                winningParams,
                winningQuality);
        }
        /// <summary>
        /// Evaluates one binary split per distinct, not-yet-used value of
        /// <paramref name="splittingFeatureName"/> and reports the highest-quality one.
        /// </summary>
        /// <param name="dataToSplit">Data to split.</param>
        /// <param name="dependentFeatureName">Name of the column holding the dependent value.</param>
        /// <param name="splittingFeatureName">Categorical column providing the candidate split values.</param>
        /// <param name="bestSplitQualitySoFar">Unused in this method body.</param>
        /// <param name="initialEntropy">Passed to the quality checker unchanged.</param>
        /// <param name="splitQualityChecker">Computes the quality of each candidate split.</param>
        /// <param name="alredyUsedAttributesInfo">Consulted to skip feature/value pairs already used.</param>
        /// <returns>
        /// A tuple of (split data, split params, quality) for the best candidate. When a candidate
        /// yields exactly one subset the method returns immediately with an empty list and
        /// negative infinity. When nothing was evaluated the first two items are <c>null</c>.
        /// </returns>
        protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
            IDataFrame dataToSplit,
            string dependentFeatureName,
            string splittingFeatureName,
            double bestSplitQualitySoFar,
            double initialEntropy,
            ISplitQualityChecker splitQualityChecker,
            IAlredyUsedAttributesInfo alredyUsedAttributesInfo)
        {
            var allRows = dataToSplit.RowCount;
            var distinctValues = dataToSplit.GetColumnVector(splittingFeatureName).Distinct();

            double topQuality = double.NegativeInfinity;
            IBinarySplittingParams topParams = null;
            IList<ISplittedData> topSplit = null;

            foreach (var value in distinctValues)
            {
                var alreadyUsed = alredyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(splittingFeatureName, value);
                if (alreadyUsed)
                {
                    continue;
                }

                var paramsForValue = new BinarySplittingParams(splittingFeatureName, value, dependentFeatureName);
                var splitForValue = CategoricalDataSplitter.SplitData(dataToSplit, paramsForValue);

                // Exactly one subset: report "no usable split" straight away.
                if (splitForValue.Count == 1)
                {
                    return new Tuple<IList<ISplittedData>, ISplittingParams, double>(
                        new List<ISplittedData>(),
                        paramsForValue,
                        double.NegativeInfinity);
                }

                var qualityForValue = splitQualityChecker.CalculateSplitQuality(
                    initialEntropy,
                    allRows,
                    splitForValue,
                    dependentFeatureName);
                if (qualityForValue > topQuality)
                {
                    topQuality = qualityForValue;
                    topSplit = splitForValue;
                    topParams = paramsForValue;
                }
            }

            return new Tuple<IList<ISplittedData>, ISplittingParams, double>(topSplit, topParams, topQuality);
        }
 /// <summary>
 /// Splits <paramref name="dataToSplit"/> into one subset per distinct value of the
 /// feature named by <c>splttingParams.SplitOnFeature</c>.
 /// </summary>
 /// <param name="dataToSplit">Data to partition.</param>
 /// <param name="splttingParams">Supplies the name of the feature to split on.</param>
 /// <returns>
 /// One <c>SplittedData</c> per distinct feature value, each pairing the subset with a
 /// <c>DecisionLink</c> built from its instance percentage, its row count and the value.
 /// </returns>
 public IList<ISplittedData> SplitData(IDataFrame dataToSplit, ISplittingParams splttingParams)
 {
     var featureName = splttingParams.SplitOnFeature;
     var allRows = dataToSplit.RowCount;
     var subsets = new List<ISplittedData>();

     // TODO: AAA embarrassingly parallel - test it for performance
     foreach (var featureValue in dataToSplit.GetColumnVector(featureName).Distinct())
     {
         var subset = dataToSplit.GetSubsetByQuery(BuildQuery(featureName, featureValue));
         var subsetRows = subset.RowCount;
         var percentage = CalcInstancesPercentage(allRows, subsetRows);
         subsets.Add(new SplittedData(new DecisionLink(percentage, subsetRows, featureValue), subset));
     }

     return subsets;
 }
Ejemplo n.º 11
0
        /// <summary>
        /// Partitions <paramref name="dataToSplit"/> by the distinct values of the feature
        /// given in <c>splttingParams.SplitOnFeature</c>.
        /// </summary>
        /// <param name="dataToSplit">Data to partition.</param>
        /// <param name="splttingParams">Supplies the name of the feature to split on.</param>
        /// <returns>
        /// A list with one entry per distinct feature value; each entry holds the matching
        /// subset and a <c>DecisionLink</c> carrying the instance percentage, the subset's
        /// row count and the value itself.
        /// </returns>
        public IList<ISplittedData> SplitData(IDataFrame dataToSplit, ISplittingParams splttingParams)
        {
            var splitOn = splttingParams.SplitOnFeature;
            var totalRows = dataToSplit.RowCount;
            var result = new List<ISplittedData>();

            // TODO: AAA embarrassingly parallel - test it for performance
            foreach (var distinctValue in dataToSplit.GetColumnVector(splitOn).Distinct())
            {
                var matchingRows = dataToSplit.GetSubsetByQuery(BuildQuery(splitOn, distinctValue));
                var matchingCount = matchingRows.RowCount;
                var share = CalcInstancesPercentage(totalRows, matchingCount);
                var link = new DecisionLink(share, matchingCount, distinctValue);
                result.Add(new SplittedData(link, matchingRows));
            }

            return result;
        }
Ejemplo n.º 12
0
 /// <summary>
 /// Returns the impurity of the dependent column of <paramref name="baseData"/> as
 /// computed by <c>ImpuryMeasure</c>.
 /// </summary>
 /// <param name="baseData">Data whose dependent column is measured.</param>
 /// <param name="dependentFeatureName">Name of the dependent column.</param>
 /// <returns>The value produced by <c>ImpuryMeasure.ImpurityValue</c> for that column.</returns>
 public double GetInitialEntropy(IDataFrame baseData, string dependentFeatureName)
 {
     var dependentValues = baseData.GetColumnVector<TDecisionType>(dependentFeatureName);
     return this.ImpuryMeasure.ImpurityValue(dependentValues);
 }
Ejemplo n.º 13
0
        /// <summary>
        /// Finds the best binary split point for a numeric feature. Rows are taken in the
        /// order produced by <c>OrderColumn</c>; every position where the dependent value
        /// changes is a candidate breakpoint, and each candidate is scored from the counts
        /// of dependent values on either side of it.
        /// </summary>
        /// <param name="baseData">Data to split.</param>
        /// <param name="dependentFeatureName">Name of the column holding the dependent value.</param>
        /// <param name="numericFeatureToProcess">Numeric feature to split on.</param>
        /// <param name="splitQualityChecker">Scores a candidate from the per-side counts.</param>
        /// <param name="binaryNumericDataSplitter">Performs the actual split once the split value is chosen.</param>
        /// <param name="initialEntropy">Forwarded to the quality checker.</param>
        /// <returns>The splitting result for the chosen split value together with its quality.</returns>
        public Tuple<ISplittingResult, double> FindBestSplitPoint(
            IDataFrame baseData,
            string dependentFeatureName,
            string numericFeatureToProcess,
            ICategoricalSplitQualityChecker splitQualityChecker,
            IBinaryNumericDataSplitter binaryNumericDataSplitter,
            double initialEntropy)
        {
            var classValues = baseData.GetColumnVector(dependentFeatureName).Values.Distinct().ToList();
            var sortedRows = OrderColumn(baseData, dependentFeatureName, numericFeatureToProcess);
            var runningCounts = new List<Vector<double>>();
            var candidateBreakpoints = new List<int>();
            var previousClassValue = sortedRows.First().DependentVal;

            // runningCounts[i] holds, per dependent value, how many of rows 0..i carry it.
            for (int rowIdx = 0; rowIdx < sortedRows.Count; rowIdx++)
            {
                var classValue = sortedRows[rowIdx].DependentVal;
                var classPosition = classValues.IndexOf(classValue);

                var countsUpToRow = DenseVector.OfArray(new double[classValues.Count]);
                if (rowIdx > 0)
                {
                    runningCounts[rowIdx - 1].CopyTo(countsUpToRow);
                }
                countsUpToRow[classPosition]++;

                // The dependent value changed: the previous row closes a run.
                if (!classValue.Equals(previousClassValue))
                {
                    previousClassValue = classValue;
                    candidateBreakpoints.Add(rowIdx - 1);
                }

                runningCounts.Add(countsUpToRow);
            }

            var bestQuality = double.NegativeInfinity;
            int bestBreakpoint = -1;
            var totalCounts = runningCounts.Last();

            foreach (var candidate in candidateBreakpoints)
            {
                var countsBelow = runningCounts[candidate];
                var countsAbove = totalCounts.Subtract(countsBelow);
                var quality = splitQualityChecker.CalculateSplitQuality(
                    initialEntropy,
                    baseData.RowCount,
                    new List<IList<int>>
                    {
                        VectorToIntArray(countsBelow),
                        VectorToIntArray(countsAbove)
                    });
                if (quality > bestQuality)
                {
                    bestQuality = quality;
                    bestBreakpoint = candidate;
                }
            }

            // NOTE(review): when no breakpoint was found bestBreakpoint is still -1 and the
            // second lookup below indexes at -1 - presumably callers never pass data with a
            // single dependent value; confirm.
            var splitValue = CalculateSplitPoint(
                sortedRows[bestBreakpoint + 1].FeatureVal,
                sortedRows[bestBreakpoint].FeatureVal);
            var splitData = binaryNumericDataSplitter.SplitData(
                baseData,
                new BinarySplittingParams(numericFeatureToProcess, splitValue, dependentFeatureName));
            var result = new BinarySplittingResult(true, numericFeatureToProcess, splitData, splitValue);

            return new Tuple<ISplittingResult, double>(result, bestQuality);
        }
 /// <summary>
 /// Tells whether the dependent column is non-numeric and contains exactly one distinct value.
 /// </summary>
 /// <param name="dataFrame">Data to inspect.</param>
 /// <param name="dependentFeatureName">Name of the dependent column.</param>
 /// <returns><c>true</c> when both conditions hold; <c>false</c> otherwise.</returns>
 protected static bool ShouldStopRecusrsiveBuilding(IDataFrame dataFrame, string dependentFeatureName)
 {
     // Numeric dependent columns never trigger the stop condition.
     if (dataFrame.GetColumnType(dependentFeatureName).IsNumericType())
     {
         return false;
     }

     var distinctValues = dataFrame.GetColumnVector<object>(dependentFeatureName).DataItems.Distinct();
     return distinctValues.Count() == 1;
 }
        /// <summary>
        /// Produces the node for <paramref name="dataFrame"/>. A leaf is returned when the
        /// data should not be split further; otherwise the best split is selected and a
        /// child is added for each of its subsets.
        /// </summary>
        /// <param name="dataFrame">Data reaching this node.</param>
        /// <param name="dependentFeatureName">Name of the column holding the predicted value.</param>
        /// <param name="additionalParams">Builder settings: consulted for the pruning-heuristic flag, and handed to the depth check and child construction.</param>
        /// <param name="alreadyUsedAttributesInfo">Handed to split selection and child construction.</param>
        /// <param name="treeDepth">Depth of this node; children receive <c>treeDepth + 1</c>.</param>
        /// <param name="isFirstSplit">Selects <c>Parallel.ForEach</c> (true) or a sequential loop (false) for adding children.</param>
        /// <returns>
        /// A leaf when the dependent column has exactly one distinct value, the maximal depth
        /// has been reached, the split is empty or the enabled significance check fails;
        /// otherwise the result of <c>BuildConcreteDecisionTreeNode</c>.
        /// </returns>
        protected virtual IDecisionTreeNode BuildDecisionNode(
            IDataFrame dataFrame,
            string dependentFeatureName,
            IDecisionTreeModelBuilderParams additionalParams,
            IAlredyUsedAttributesInfo alreadyUsedAttributesInfo,
            int treeDepth,
            bool isFirstSplit = false)
        {
            var outcomeIsUniform =
                dataFrame.GetColumnVector<object>(dependentFeatureName).DataItems.Distinct().Count() == 1;
            if (outcomeIsUniform || MaximalTreeDepthHasBeenReached(additionalParams, treeDepth))
            {
                return BuildLeaf(dataFrame, dependentFeatureName);
            }

            // TODO: later on add additional params indicating which features were already used
            ISplittingResult chosenSplit = BestSplitSelector.SelectBestSplit(
                dataFrame,
                dependentFeatureName,
                SplitQualityChecker,
                alreadyUsedAttributesInfo);
            if (SplitIsEmpty(chosenSplit))
            {
                return BuildLeaf(dataFrame, dependentFeatureName);
            }

            // Pruning during the build needs both the flag and a checker instance.
            var shouldCheckSignificance =
                additionalParams.UsePrunningHeuristicDuringTreeBuild && this.StatisticalSignificanceChecker != null;
            if (shouldCheckSignificance
                && !StatisticalSignificanceChecker.IsSplitStatisticallySignificant(dataFrame, chosenSplit, dependentFeatureName))
            {
                return BuildLeaf(dataFrame, dependentFeatureName);
            }

            var nextDepth = treeDepth + 1;
            var childrenByLink = new ConcurrentDictionary<IDecisionTreeLink, IDecisionTreeNode>();

            if (!isFirstSplit)
            {
                foreach (var part in chosenSplit.SplittedDataSets)
                {
                    this.AddChildFromSplit(dependentFeatureName, additionalParams, part, childrenByLink, alreadyUsedAttributesInfo, nextDepth);
                }
            }
            else
            {
                Parallel.ForEach(
                    chosenSplit.SplittedDataSets,
                    part =>
                    {
                        this.AddChildFromSplit(dependentFeatureName, additionalParams, part, childrenByLink, alreadyUsedAttributesInfo, nextDepth);
                    });
            }

            return BuildConcreteDecisionTreeNode(chosenSplit, childrenByLink);
        }
 /// <summary>
 /// Reports whether the dependent column is of a non-numeric type and holds exactly
 /// one distinct value.
 /// </summary>
 /// <param name="dataFrame">Data to inspect.</param>
 /// <param name="dependentFeatureName">Name of the dependent column.</param>
 /// <returns><c>true</c> only when the column is non-numeric and has a single distinct value.</returns>
 protected static bool ShouldStopRecusrsiveBuilding(IDataFrame dataFrame, string dependentFeatureName)
 {
     var dependentIsNumeric = dataFrame.GetColumnType(dependentFeatureName).IsNumericType();
     if (dependentIsNumeric)
     {
         return false;
     }

     // Non-numeric column: stop once only one distinct value remains.
     return dataFrame.GetColumnVector<object>(dependentFeatureName).DataItems.Distinct().Count() == 1;
 }
        /// <summary>
        /// Selects the split value for a numeric feature. The rows returned by
        /// <c>OrderColumn</c> are scanned once to build cumulative counts of each dependent
        /// value and to record the positions where the dependent value changes; each such
        /// position is then scored and the best one determines the split value.
        /// </summary>
        /// <param name="baseData">Data to split.</param>
        /// <param name="dependentFeatureName">Name of the column holding the dependent value.</param>
        /// <param name="numericFeatureToProcess">Numeric feature to split on.</param>
        /// <param name="splitQualityChecker">Computes the quality of a candidate from the counts on both sides.</param>
        /// <param name="binaryNumericDataSplitter">Splits the data once the split value is known.</param>
        /// <param name="initialEntropy">Passed to the quality checker unchanged.</param>
        /// <returns>A tuple of the splitting result and the quality of the chosen breakpoint.</returns>
        public Tuple<ISplittingResult, double> FindBestSplitPoint(
            IDataFrame baseData,
            string dependentFeatureName,
            string numericFeatureToProcess,
            ICategoricalSplitQualityChecker splitQualityChecker,
            IBinaryNumericDataSplitter binaryNumericDataSplitter,
            double initialEntropy)
        {
            var distinctOutcomes = baseData.GetColumnVector(dependentFeatureName).Values.Distinct().ToList();
            var orderedRows = OrderColumn(baseData, dependentFeatureName, numericFeatureToProcess);
            var cumulativeCounts = new List<Vector<double>>();
            var changePoints = new List<int>();
            var lastOutcome = orderedRows.First().DependentVal;

            for (int position = 0; position < orderedRows.Count; position++)
            {
                var outcome = orderedRows[position].DependentVal;
                var outcomeSlot = distinctOutcomes.IndexOf(outcome);

                // Start from zeros, then carry over the counts accumulated so far.
                var countsHere = DenseVector.OfArray(new double[distinctOutcomes.Count]);
                if (position > 0)
                {
                    cumulativeCounts[position - 1].CopyTo(countsHere);
                }
                countsHere[outcomeSlot]++;

                // A change of dependent value marks the previous row as a candidate breakpoint.
                if (!outcome.Equals(lastOutcome))
                {
                    lastOutcome = outcome;
                    changePoints.Add(position - 1);
                }

                cumulativeCounts.Add(countsHere);
            }

            var highestQuality = double.NegativeInfinity;
            int chosenChangePoint = -1;
            var overallCounts = cumulativeCounts.Last();

            foreach (var changePoint in changePoints)
            {
                var lowerCounts = cumulativeCounts[changePoint];
                var upperCounts = overallCounts.Subtract(lowerCounts);
                var quality = splitQualityChecker.CalculateSplitQuality(
                    initialEntropy,
                    baseData.RowCount,
                    new List<IList<int>>
                    {
                        VectorToIntArray(lowerCounts),
                        VectorToIntArray(upperCounts)
                    });
                if (quality > highestQuality)
                {
                    highestQuality = quality;
                    chosenChangePoint = changePoint;
                }
            }

            // NOTE(review): chosenChangePoint remains -1 when changePoints is empty, which makes
            // the second index below -1 - presumably guarded by callers; verify.
            var splitVal = CalculateSplitPoint(
                orderedRows[chosenChangePoint + 1].FeatureVal,
                orderedRows[chosenChangePoint].FeatureVal);
            var partitions = binaryNumericDataSplitter.SplitData(
                baseData,
                new BinarySplittingParams(numericFeatureToProcess, splitVal, dependentFeatureName));
            var splittingResult = new BinarySplittingResult(true, numericFeatureToProcess, partitions, splitVal);

            return new Tuple<ISplittingResult, double>(splittingResult, highestQuality);
        }