// Verifies that asking the subject for column 0 as an object vector yields a
// vector equal to one built from { "a1.1", "a2.1", "a3.1" } and "C1".
public void Test_GetColumnVector_ByIndex_ObjectVarsion()
{
    // Given
    var expected = new DataVector<object>(new object[] { "a1.1", "a2.1", "a3.1" }, "C1");

    // When
    var actual = _subject.GetColumnVector<object>(0);

    // Then
    Assert.AreEqual(expected, actual);
}
/// <summary>
/// Builds the tree node for <paramref name="dataFrame"/>.
/// A leaf is returned when the dependent column holds exactly one distinct value,
/// when <c>MaximalTreeDepthHasBeenReached</c> reports true, when the selected split
/// is empty, or when pruning is enabled and the significance checker rejects the split.
/// Otherwise one child is added per split data set through <c>AddChildFromSplit</c>.
/// </summary>
/// <param name="dataFrame">Data the node is built from.</param>
/// <param name="dependentFeatureName">Name of the column holding the outcome.</param>
/// <param name="additionalParams">Builder parameters (pruning switch, depth limit input).</param>
/// <param name="alreadyUsedAttributesInfo">Passed through to split selection and child building.</param>
/// <param name="treeDepth">Depth of this node; children are built with <c>treeDepth + 1</c>.</param>
/// <param name="isFirstSplit">When true the children are added with <c>Parallel.ForEach</c>, otherwise sequentially.</param>
/// <returns>A leaf or the node produced by <c>BuildConcreteDecisionTreeNode</c>.</returns>
protected virtual IDecisionTreeNode BuildDecisionNode(
    IDataFrame dataFrame,
    string dependentFeatureName,
    IDecisionTreeModelBuilderParams additionalParams,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo,
    int treeDepth,
    bool isFirstSplit = false)
{
    // Only one distinct outcome left: nothing to split on.
    var distinctOutcomes = dataFrame.GetColumnVector<object>(dependentFeatureName).DataItems.Distinct().Count();
    if (distinctOutcomes == 1)
    {
        return BuildLeaf(dataFrame, dependentFeatureName);
    }

    if (MaximalTreeDepthHasBeenReached(additionalParams, treeDepth))
    {
        return BuildLeaf(dataFrame, dependentFeatureName);
    }

    // TODO: later on add additional params indicating which features were already used
    ISplittingResult bestSplit = BestSplitSelector.SelectBestSplit(
        dataFrame,
        dependentFeatureName,
        SplitQualityChecker,
        alreadyUsedAttributesInfo);

    if (SplitIsEmpty(bestSplit))
    {
        return BuildLeaf(dataFrame, dependentFeatureName);
    }

    // The significance check runs only when requested AND a checker is available.
    var pruningEnabled = additionalParams.UsePrunningHeuristicDuringTreeBuild
                         && this.StatisticalSignificanceChecker != null;
    if (pruningEnabled
        && !StatisticalSignificanceChecker.IsSplitStatisticallySignificant(dataFrame, bestSplit, dependentFeatureName))
    {
        return BuildLeaf(dataFrame, dependentFeatureName);
    }

    var childNodes = new ConcurrentDictionary<IDecisionTreeLink, IDecisionTreeNode>();
    var childDepth = treeDepth + 1;

    if (!isFirstSplit)
    {
        foreach (var part in bestSplit.SplittedDataSets)
        {
            this.AddChildFromSplit(dependentFeatureName, additionalParams, part, childNodes, alreadyUsedAttributesInfo, childDepth);
        }
    }
    else
    {
        Parallel.ForEach(
            bestSplit.SplittedDataSets,
            part => this.AddChildFromSplit(dependentFeatureName, additionalParams, part, childNodes, alreadyUsedAttributesInfo, childDepth));
    }

    return BuildConcreteDecisionTreeNode(bestSplit, childNodes);
}
/// <summary>
/// Splits <paramref name="dataFrame"/> into the pieces used for training:
/// the matrix of every column except the dependent one, the dependent column
/// itself, and the names of the columns that went into the matrix.
/// </summary>
/// <param name="dataFrame">Source data.</param>
/// <param name="dependentFeatureName">Name of the column excluded from the matrix and returned as outcomes.</param>
/// <returns>Tuple of (feature matrix, expected outcomes, feature column names).</returns>
protected virtual Tuple<Matrix<double>, IList<TPredictionResult>, IList<string>> PrepareTrainingData(
    IDataFrame dataFrame,
    string dependentFeatureName)
{
    var featureColumns =
        (from col in dataFrame.ColumnNames
         where col != dependentFeatureName
         select col).ToList();

    var featureMatrix = dataFrame
        .GetSubsetByColumns(featureColumns)
        .GetAsMatrix();

    IDataVector<TPredictionResult> outcomes =
        dataFrame.GetColumnVector<TPredictionResult>(dependentFeatureName);

    return new Tuple<Matrix<double>, IList<TPredictionResult>, IList<string>>(
        featureMatrix,
        outcomes,
        featureColumns);
}
/// <summary>
/// Decides whether a split is statistically significant using a chi-squared
/// statistic: for every split data set the observed count of each dependent
/// value is compared with the count expected from that value's share in
/// <paramref name="initialDataFrame"/>.
/// </summary>
/// <param name="initialDataFrame">Data before splitting; supplies the reference distribution.</param>
/// <param name="splittingResults">The split to evaluate.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <returns>
/// <c>true</c> for numeric splits (they are never tested) and for splits whose
/// <c>1 - CDF</c> value is below <c>significanceLevel</c>; <c>false</c> otherwise,
/// including when the degrees of freedom are not a valid chi-squared parameter.
/// </returns>
public bool IsSplitStatisticallySignificant(
    IDataFrame initialDataFrame,
    ISplittingResult splittingResults,
    string dependentFeatureName)
{
    // Numeric splits are always accepted, so skip all counting work for them.
    if (splittingResults.IsSplitNumeric)
    {
        return true;
    }

    var totalRows = (double)initialDataFrame.RowCount;
    var expectedShares = initialDataFrame
        .GetColumnVector(dependentFeatureName)
        .Values
        .GroupBy(elem => elem)
        .ToDictionary(grp => grp.Key, grp => grp.Count() / totalRows);

    // NOTE(review): the classical Pearson test of independence uses
    // (values - 1) * (splits - 1) degrees of freedom; this method has always
    // added the two terms. Kept unchanged here - confirm the intent.
    var degreesOfFreedom = (expectedShares.Count - 1)
                           + (splittingResults.SplittedDataSets.Count - 1);

    var chisquareStatisticSum = 0.0;
    foreach (var split in splittingResults.SplittedDataSets)
    {
        var splitSize = split.SplittedDataFrame.RowCount;
        if (splitSize == 0)
        {
            // Expected and observed counts are both zero for an empty split;
            // 0 / 0 would be NaN and poison the whole statistic.
            continue;
        }

        var observedCounts = split.SplittedDataFrame
            .GetColumnVector(dependentFeatureName)
            .Values
            .GroupBy(elem => elem)
            .ToDictionary(grp => grp.Key, grp => grp.Count());

        foreach (var expectedShare in expectedShares)
        {
            var expectedCount = expectedShare.Value * splitSize;

            // TryGetValue leaves the count at 0 when the value is absent from this split.
            int observedCount;
            observedCounts.TryGetValue(expectedShare.Key, out observedCount);

            chisquareStatisticSum += Math.Pow(observedCount - expectedCount, 2) / expectedCount;
        }
    }

    if (!ChiSquared.IsValidParameterSet(degreesOfFreedom))
    {
        return false;
    }

    var statisticValue = 1 - ChiSquared.CDF(degreesOfFreedom, chisquareStatisticSum);
    return statisticValue < significanceLevel;
}
/// <summary>
/// Builds a leaf that predicts the most frequent value of the dependent column
/// in <paramref name="finalData"/>.
/// </summary>
/// <param name="finalData">Data that ended up in this leaf.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <returns>A <c>DecisionTreeLeaf</c> carrying the majority value.</returns>
/// <exception cref="System.InvalidOperationException">The dependent column has no values.</exception>
public IDecisionTreeLeaf BuildLeaf(IDataFrame finalData, string dependentFeatureName)
{
    var counts = new Dictionary<object, int>();
    foreach (var val in finalData.GetColumnVector(dependentFeatureName))
    {
        // Single lookup: 'seen' stays 0 when the value has not been counted yet.
        int seen;
        counts.TryGetValue(val, out seen);
        counts[val] = seen + 1;
    }

    if (counts.Count == 0)
    {
        // Same exception type the previous First() call raised, with a clearer message.
        throw new System.InvalidOperationException(
            "Cannot build a leaf: column '" + dependentFeatureName + "' contains no values.");
    }

    // Linear scan instead of sorting the whole dictionary. Using '>=' keeps the
    // previous tie-break (stable sort + Reverse + First picked the last
    // enumerated entry among equal counts).
    object majorityValue = null;
    var majorityCount = 0;
    foreach (var entry in counts)
    {
        if (entry.Value >= majorityCount)
        {
            majorityCount = entry.Value;
            majorityValue = entry.Key;
        }
    }

    return new DecisionTreeLeaf(dependentFeatureName, majorityValue);
}
/// <summary>
/// Chi-squared significance test for a split: the observed count of each
/// dependent value in every split data set is compared with the count expected
/// from that value's share in <paramref name="initialDataFrame"/>.
/// </summary>
/// <param name="initialDataFrame">Data before splitting; supplies the reference distribution.</param>
/// <param name="splittingResults">The split to evaluate.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <returns>
/// <c>true</c> when the split is numeric (numeric splits are not tested) or when
/// <c>1 - CDF</c> is below <c>significanceLevel</c>; <c>false</c> otherwise,
/// including when the degrees of freedom are rejected by <c>ChiSquared.IsValidParameterSet</c>.
/// </returns>
public bool IsSplitStatisticallySignificant(
    IDataFrame initialDataFrame,
    ISplittingResult splittingResults,
    string dependentFeatureName)
{
    // Numeric splits short-circuit to "significant"; do it before any grouping work.
    if (splittingResults.IsSplitNumeric)
    {
        return true;
    }

    var totalRows = (double)initialDataFrame.RowCount;
    var expectedShares = initialDataFrame
        .GetColumnVector(dependentFeatureName)
        .Values
        .GroupBy(elem => elem)
        .ToDictionary(grp => grp.Key, grp => grp.Count() / totalRows);

    // NOTE(review): a Pearson independence test normally uses
    // (values - 1) * (splits - 1) degrees of freedom, whereas the two terms are
    // added here. Left as it was - confirm whether this is intentional.
    var degreesOfFreedom = (expectedShares.Count - 1)
                           + (splittingResults.SplittedDataSets.Count - 1);

    var chisquareStatisticSum = 0.0;
    foreach (var split in splittingResults.SplittedDataSets)
    {
        var splitSize = split.SplittedDataFrame.RowCount;
        if (splitSize == 0)
        {
            // An empty split has zero expected and zero observed counts;
            // dividing 0 by 0 would turn the statistic into NaN.
            continue;
        }

        var observedCounts = split.SplittedDataFrame
            .GetColumnVector(dependentFeatureName)
            .Values
            .GroupBy(elem => elem)
            .ToDictionary(grp => grp.Key, grp => grp.Count());

        foreach (var expectedShare in expectedShares)
        {
            var expectedCount = expectedShare.Value * splitSize;

            // Absent values keep the default count of 0.
            int observedCount;
            observedCounts.TryGetValue(expectedShare.Key, out observedCount);

            chisquareStatisticSum += Math.Pow(observedCount - expectedCount, 2) / expectedCount;
        }
    }

    if (!ChiSquared.IsValidParameterSet(degreesOfFreedom))
    {
        return false;
    }

    var statisticValue = 1 - ChiSquared.CDF(degreesOfFreedom, chisquareStatisticSum);
    return statisticValue < significanceLevel;
}
/// <summary>
/// Creates a leaf whose decision is the most frequent value found in the
/// dependent column of <paramref name="finalData"/>.
/// </summary>
/// <param name="finalData">Data that ended up in this leaf.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <returns>A <c>DecisionTreeLeaf</c> carrying the majority value.</returns>
/// <exception cref="System.InvalidOperationException">The dependent column has no values.</exception>
public IDecisionTreeLeaf BuildLeaf(IDataFrame finalData, string dependentFeatureName)
{
    var counts = new Dictionary<object, int>();
    var finalValues = finalData.GetColumnVector(dependentFeatureName);
    foreach (var val in finalValues)
    {
        // One dictionary lookup per value; 'current' is 0 for a value not seen before.
        int current;
        counts.TryGetValue(val, out current);
        counts[val] = current + 1;
    }

    if (counts.Count == 0)
    {
        // Exception type matches what First() on an empty sequence used to throw.
        throw new System.InvalidOperationException(
            "Cannot build a leaf: column '" + dependentFeatureName + "' contains no values.");
    }

    // Find the maximum in one pass rather than sorting. '>=' reproduces the old
    // tie-break, where the last enumerated entry among equal counts won.
    object winner = null;
    var winnerCount = 0;
    foreach (var pair in counts)
    {
        if (pair.Value >= winnerCount)
        {
            winnerCount = pair.Value;
            winner = pair.Key;
        }
    }

    return new DecisionTreeLeaf(dependentFeatureName, winner);
}
/// <summary>
/// Tries a binary split of <paramref name="dataToSplit"/> on every distinct value
/// of <paramref name="splittingFeatureName"/> that has not been used yet and keeps
/// the candidate with the highest quality.
/// </summary>
/// <param name="dataToSplit">Data to split.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <param name="splittingFeatureName">Feature whose values are tried as split points.</param>
/// <param name="bestSplitQualitySoFar">Not read by this implementation.</param>
/// <param name="initialEntropy">Forwarded to the quality checker.</param>
/// <param name="splitQualityChecker">Scores each candidate split.</param>
/// <param name="alredyUsedAttributesInfo">Tells which feature/value pairs must be skipped.</param>
/// <returns>
/// (split data, split params, quality) of the best candidate. If any candidate
/// produces exactly one data set, an empty list with negative infinity quality is
/// returned immediately. If no candidate was evaluated, the data and params are null.
/// </returns>
protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
    IDataFrame dataToSplit,
    string dependentFeatureName,
    string splittingFeatureName,
    double bestSplitQualitySoFar,
    double initialEntropy,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alredyUsedAttributesInfo)
{
    var rowsTotal = dataToSplit.RowCount;

    IList<ISplittedData> winningSplit = null;
    IBinarySplittingParams winningParams = null;
    double winningQuality = double.NegativeInfinity;

    foreach (var candidateValue in dataToSplit.GetColumnVector(splittingFeatureName).Distinct())
    {
        if (alredyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(splittingFeatureName, candidateValue))
        {
            continue;
        }

        var candidateParams = new BinarySplittingParams(splittingFeatureName, candidateValue, dependentFeatureName);
        var candidateSplit = CategoricalDataSplitter.SplitData(dataToSplit, candidateParams);

        // A split that yields a single data set aborts the whole evaluation.
        if (candidateSplit.Count == 1)
        {
            return new Tuple<IList<ISplittedData>, ISplittingParams, double>(
                new List<ISplittedData>(),
                candidateParams,
                double.NegativeInfinity);
        }

        var candidateQuality = splitQualityChecker.CalculateSplitQuality(
            initialEntropy,
            rowsTotal,
            candidateSplit,
            dependentFeatureName);

        if (candidateQuality > winningQuality)
        {
            winningQuality = candidateQuality;
            winningSplit = candidateSplit;
            winningParams = candidateParams;
        }
    }

    return new Tuple<IList<ISplittedData>, ISplittingParams, double>(
        winningSplit,
        winningParams,
        winningQuality);
}
/// <summary>
/// Evaluates one binary split per distinct, not-yet-used value of
/// <paramref name="splittingFeatureName"/> and returns the best-scoring one.
/// </summary>
/// <param name="dataToSplit">Data to split.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <param name="splittingFeatureName">Feature whose values are tried as split points.</param>
/// <param name="bestSplitQualitySoFar">Not read by this implementation.</param>
/// <param name="initialEntropy">Forwarded to the quality checker.</param>
/// <param name="splitQualityChecker">Scores each candidate split.</param>
/// <param name="alredyUsedAttributesInfo">Tells which feature/value pairs must be skipped.</param>
/// <returns>
/// (split data, split params, quality) of the best candidate; an empty list with
/// negative infinity quality as soon as a candidate yields exactly one data set;
/// null data and params when every value was skipped.
/// </returns>
protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
    IDataFrame dataToSplit,
    string dependentFeatureName,
    string splittingFeatureName,
    double bestSplitQualitySoFar,
    double initialEntropy,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alredyUsedAttributesInfo)
{
    var rowCount = dataToSplit.RowCount;
    var distinctValues = dataToSplit.GetColumnVector(splittingFeatureName).Distinct();

    double topQuality = double.NegativeInfinity;
    IBinarySplittingParams topParams = null;
    IList<ISplittedData> topData = null;

    foreach (var value in distinctValues)
    {
        var alreadyUsed = alredyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(splittingFeatureName, value);
        if (alreadyUsed)
        {
            continue;
        }

        var trialParams = new BinarySplittingParams(splittingFeatureName, value, dependentFeatureName);
        var trialData = CategoricalDataSplitter.SplitData(dataToSplit, trialParams);

        if (trialData.Count == 1)
        {
            // Degenerate split: give up on this feature right away.
            var nothing = new List<ISplittedData>();
            return new Tuple<IList<ISplittedData>, ISplittingParams, double>(
                nothing, trialParams, double.NegativeInfinity);
        }

        var trialQuality = splitQualityChecker.CalculateSplitQuality(
            initialEntropy, rowCount, trialData, dependentFeatureName);

        if (trialQuality > topQuality)
        {
            topQuality = trialQuality;
            topData = trialData;
            topParams = trialParams;
        }
    }

    return new Tuple<IList<ISplittedData>, ISplittingParams, double>(topData, topParams, topQuality);
}
/// <summary>
/// Splits <paramref name="dataToSplit"/> into one subset per distinct value of the
/// feature named by <c>splttingParams.SplitOnFeature</c>.
/// </summary>
/// <param name="dataToSplit">Data to partition.</param>
/// <param name="splttingParams">Supplies the name of the feature to split on.</param>
/// <returns>
/// One <c>SplittedData</c> per distinct value, each pairing the queried subset with a
/// <c>DecisionLink</c> built from the instance percentage, the subset row count and the value.
/// </returns>
public IList<ISplittedData> SplitData(IDataFrame dataToSplit, ISplittingParams splttingParams)
{
    var featureName = splttingParams.SplitOnFeature;
    var allRows = dataToSplit.RowCount;

    //TODO: AAA embarrassingly parallel - test it for performance
    return dataToSplit
        .GetColumnVector(featureName)
        .Distinct()
        .Select(value =>
        {
            var subset = dataToSplit.GetSubsetByQuery(BuildQuery(featureName, value));
            var subsetRows = subset.RowCount;
            var link = new DecisionLink(
                CalcInstancesPercentage(allRows, subsetRows),
                subsetRows,
                value);
            ISplittedData piece = new SplittedData(link, subset);
            return piece;
        })
        .ToList();
}
/// <summary>
/// Partitions <paramref name="dataToSplit"/> by the distinct values of the feature
/// named in <c>splttingParams.SplitOnFeature</c>, producing one entry per value.
/// </summary>
/// <param name="dataToSplit">Data to partition.</param>
/// <param name="splttingParams">Supplies the name of the feature to split on.</param>
/// <returns>
/// List of <c>SplittedData</c>; each holds the subset selected by the query for a value
/// and a <c>DecisionLink</c> built from the instance percentage, subset row count and value.
/// </returns>
public IList<ISplittedData> SplitData(IDataFrame dataToSplit, ISplittingParams splttingParams)
{
    var feature = splttingParams.SplitOnFeature;
    var rowsBeforeSplit = dataToSplit.RowCount;
    var parts = new List<ISplittedData>();

    //TODO: AAA embarrassingly parallel - test it for performance
    foreach (var value in dataToSplit.GetColumnVector(feature).Distinct())
    {
        var part = dataToSplit.GetSubsetByQuery(BuildQuery(feature, value));
        var partRows = part.RowCount;
        var share = CalcInstancesPercentage(rowsBeforeSplit, partRows);

        parts.Add(new SplittedData(new DecisionLink(share, partRows, value), part));
    }

    return parts;
}
/// <summary>
/// Returns the impurity of the dependent column of <paramref name="baseData"/>,
/// as computed by <c>ImpuryMeasure.ImpurityValue</c>.
/// </summary>
/// <param name="baseData">Data whose dependent column is measured.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <returns>The value reported by the configured impurity measure.</returns>
public double GetInitialEntropy(IDataFrame baseData, string dependentFeatureName)
{
    var outcomes = baseData.GetColumnVector<TDecisionType>(dependentFeatureName);
    return this.ImpuryMeasure.ImpurityValue(outcomes);
}
/// <summary>
/// Searches for the best binary split of <paramref name="baseData"/> on
/// <paramref name="numericFeatureToProcess"/>. Rows come from <c>OrderColumn</c>
/// (presumably sorted by the numeric feature - helper not shown here); every
/// position where the dependent value changes is a candidate break point, and the
/// candidate with the highest quality wins.
/// </summary>
/// <param name="baseData">Data to split.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <param name="numericFeatureToProcess">Numeric feature to split on.</param>
/// <param name="splitQualityChecker">Scores a candidate from the per-value counts on both sides.</param>
/// <param name="binaryNumericDataSplitter">Performs the actual split at the chosen value.</param>
/// <param name="initialEntropy">Forwarded to the quality checker.</param>
/// <returns>The split result together with the quality of the chosen break point.</returns>
public Tuple<ISplittingResult, double> FindBestSplitPoint(
    IDataFrame baseData,
    string dependentFeatureName,
    string numericFeatureToProcess,
    ICategoricalSplitQualityChecker splitQualityChecker,
    IBinaryNumericDataSplitter binaryNumericDataSplitter,
    double initialEntropy)
{
    var classLabels = baseData.GetColumnVector(dependentFeatureName).Values.Distinct().ToList();
    var orderedRows = OrderColumn(baseData, dependentFeatureName, numericFeatureToProcess);

    // runningCounts[i] holds, per class label, how many rows 0..i carry that label.
    var runningCounts = new List<Vector<double>>();
    var candidateBreaks = new List<int>();
    var previousLabel = orderedRows.First().DependentVal;

    for (var rowIdx = 0; rowIdx < orderedRows.Count; rowIdx++)
    {
        var label = orderedRows[rowIdx].DependentVal;
        var labelSlot = classLabels.IndexOf(label);

        // Start from zeros, then carry over the counts accumulated so far.
        var countsSoFar = DenseVector.OfArray(new double[classLabels.Count]);
        if (rowIdx > 0)
        {
            runningCounts[rowIdx - 1].CopyTo(countsSoFar);
        }

        countsSoFar[labelSlot] += 1;

        // A label change means the previous row closes a run: remember it as a break point.
        if (!label.Equals(previousLabel))
        {
            previousLabel = label;
            candidateBreaks.Add(rowIdx - 1);
        }

        runningCounts.Add(countsSoFar);
    }

    var bestQuality = double.NegativeInfinity;
    var bestBreak = -1;
    foreach (var candidate in candidateBreaks)
    {
        var below = runningCounts[candidate];
        var above = runningCounts[runningCounts.Count - 1].Subtract(below);

        var quality = splitQualityChecker.CalculateSplitQuality(
            initialEntropy,
            baseData.RowCount,
            new List<IList<int>> { VectorToIntArray(below), VectorToIntArray(above) });

        if (quality > bestQuality)
        {
            bestQuality = quality;
            bestBreak = candidate;
        }
    }

    // NOTE(review): when no break point was recorded bestBreak is still -1 and the
    // second lookup below indexes position -1 - presumably callers never pass such data; confirm.
    var splitVal = CalculateSplitPoint(
        orderedRows[bestBreak + 1].FeatureVal,
        orderedRows[bestBreak].FeatureVal);

    var partitions = binaryNumericDataSplitter.SplitData(
        baseData,
        new BinarySplittingParams(numericFeatureToProcess, splitVal, dependentFeatureName));

    var outcome = new BinarySplittingResult(true, numericFeatureToProcess, partitions, splitVal);
    return new Tuple<ISplittingResult, double>(outcome, bestQuality);
}
/// <summary>
/// Tells whether building should stop: the dependent column is not of a numeric
/// type and contains exactly one distinct value.
/// </summary>
/// <param name="dataFrame">Data to inspect.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <returns><c>true</c> when both conditions hold, otherwise <c>false</c>.</returns>
protected static bool ShouldStopRecusrsiveBuilding(IDataFrame dataFrame, string dependentFeatureName)
{
    // Numeric dependent columns never trigger the stop; the values are not even read.
    if (dataFrame.GetColumnType(dependentFeatureName).IsNumericType())
    {
        return false;
    }

    var distinctOutcomes = dataFrame
        .GetColumnVector<object>(dependentFeatureName)
        .DataItems
        .Distinct()
        .Count();

    return distinctOutcomes == 1;
}
/// <summary>
/// Produces the node for <paramref name="dataFrame"/>: a leaf when the dependent
/// column has exactly one distinct value, the depth limit check fires, the best
/// split is empty, or pruning is on and the split is not significant; otherwise a
/// node whose children are added by <c>AddChildFromSplit</c>, one per split data set.
/// </summary>
/// <param name="dataFrame">Data the node is built from.</param>
/// <param name="dependentFeatureName">Name of the column holding the outcome.</param>
/// <param name="additionalParams">Builder parameters (pruning switch, depth limit input).</param>
/// <param name="alreadyUsedAttributesInfo">Passed through to split selection and child building.</param>
/// <param name="treeDepth">Depth of this node; children receive <c>treeDepth + 1</c>.</param>
/// <param name="isFirstSplit">True selects <c>Parallel.ForEach</c> for adding children; false a plain loop.</param>
/// <returns>A leaf or the node produced by <c>BuildConcreteDecisionTreeNode</c>.</returns>
protected virtual IDecisionTreeNode BuildDecisionNode(
    IDataFrame dataFrame,
    string dependentFeatureName,
    IDecisionTreeModelBuilderParams additionalParams,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo,
    int treeDepth,
    bool isFirstSplit = false)
{
    var hasSingleOutcome =
        dataFrame.GetColumnVector<object>(dependentFeatureName).DataItems.Distinct().Count() == 1;

    // The depth check is evaluated only when more than one outcome remains.
    if (hasSingleOutcome || MaximalTreeDepthHasBeenReached(additionalParams, treeDepth))
    {
        return BuildLeaf(dataFrame, dependentFeatureName);
    }

    // TODO: later on add additional params indicating which features were already used
    ISplittingResult chosenSplit = BestSplitSelector.SelectBestSplit(
        dataFrame,
        dependentFeatureName,
        SplitQualityChecker,
        alreadyUsedAttributesInfo);

    if (SplitIsEmpty(chosenSplit))
    {
        return BuildLeaf(dataFrame, dependentFeatureName);
    }

    if (additionalParams.UsePrunningHeuristicDuringTreeBuild && this.StatisticalSignificanceChecker != null)
    {
        var significant = StatisticalSignificanceChecker.IsSplitStatisticallySignificant(
            dataFrame,
            chosenSplit,
            dependentFeatureName);

        if (!significant)
        {
            return BuildLeaf(dataFrame, dependentFeatureName);
        }
    }

    var descendants = new ConcurrentDictionary<IDecisionTreeLink, IDecisionTreeNode>();
    var depthBelow = treeDepth + 1;

    if (isFirstSplit)
    {
        Parallel.ForEach(
            chosenSplit.SplittedDataSets,
            branch =>
            {
                this.AddChildFromSplit(dependentFeatureName, additionalParams, branch, descendants, alreadyUsedAttributesInfo, depthBelow);
            });

        return BuildConcreteDecisionTreeNode(chosenSplit, descendants);
    }

    foreach (var branch in chosenSplit.SplittedDataSets)
    {
        this.AddChildFromSplit(dependentFeatureName, additionalParams, branch, descendants, alreadyUsedAttributesInfo, depthBelow);
    }

    return BuildConcreteDecisionTreeNode(chosenSplit, descendants);
}
/// <summary>
/// Returns <c>true</c> when the dependent column is non-numeric and holds exactly
/// one distinct value.
/// </summary>
/// <param name="dataFrame">Data to inspect.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <returns><c>true</c> when both conditions hold, otherwise <c>false</c>.</returns>
protected static bool ShouldStopRecusrsiveBuilding(IDataFrame dataFrame, string dependentFeatureName)
{
    var dependentIsNumeric = dataFrame.GetColumnType(dependentFeatureName).IsNumericType();
    if (dependentIsNumeric)
    {
        // Column values are only inspected for non-numeric dependent features.
        return false;
    }

    var outcomes = dataFrame.GetColumnVector<object>(dependentFeatureName).DataItems;
    return outcomes.Distinct().Count() == 1;
}
/// <summary>
/// Picks the best binary split of <paramref name="baseData"/> on
/// <paramref name="numericFeatureToProcess"/>. Rows are taken from <c>OrderColumn</c>
/// (presumably ordered by the numeric feature - helper not shown here); each change
/// of the dependent value marks a candidate break point, and the candidate with the
/// highest reported quality is turned into the split value.
/// </summary>
/// <param name="baseData">Data to split.</param>
/// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
/// <param name="numericFeatureToProcess">Numeric feature to split on.</param>
/// <param name="splitQualityChecker">Scores a candidate from the per-value counts on both sides.</param>
/// <param name="binaryNumericDataSplitter">Performs the actual split at the chosen value.</param>
/// <param name="initialEntropy">Forwarded to the quality checker.</param>
/// <returns>The split result together with the quality of the chosen break point.</returns>
public Tuple<ISplittingResult, double> FindBestSplitPoint(
    IDataFrame baseData,
    string dependentFeatureName,
    string numericFeatureToProcess,
    ICategoricalSplitQualityChecker splitQualityChecker,
    IBinaryNumericDataSplitter binaryNumericDataSplitter,
    double initialEntropy)
{
    var outcomeValues = baseData.GetColumnVector(dependentFeatureName).Values.Distinct().ToList();
    var sortedEntries = OrderColumn(baseData, dependentFeatureName, numericFeatureToProcess);

    // cumulative[i] = per-outcome counts over entries 0..i.
    var cumulative = new List<Vector<double>>();
    var breaks = new List<int>();
    var lastOutcome = sortedEntries.First().DependentVal;

    for (var i = 0; i < sortedEntries.Count; i++)
    {
        var outcome = sortedEntries[i].DependentVal;
        var outcomeIdx = outcomeValues.IndexOf(outcome);

        // Fresh zero vector, overwritten with the previous cumulative counts when there are any.
        var tally = DenseVector.OfArray(new double[outcomeValues.Count]);
        if (i != 0)
        {
            cumulative[i - 1].CopyTo(tally);
        }

        tally[outcomeIdx] = tally[outcomeIdx] + 1;

        // The entry before an outcome change is recorded as a possible break point.
        if (!outcome.Equals(lastOutcome))
        {
            lastOutcome = outcome;
            breaks.Add(i - 1);
        }

        cumulative.Add(tally);
    }

    var topQuality = double.NegativeInfinity;
    var topBreak = -1;
    foreach (var breakIdx in breaks)
    {
        var lowerCounts = cumulative[breakIdx];
        var upperCounts = cumulative.Last().Subtract(lowerCounts);
        var sideCounts = new List<IList<int>>
        {
            VectorToIntArray(lowerCounts),
            VectorToIntArray(upperCounts)
        };

        var quality = splitQualityChecker.CalculateSplitQuality(initialEntropy, baseData.RowCount, sideCounts);
        if (quality > topQuality)
        {
            topQuality = quality;
            topBreak = breakIdx;
        }
    }

    // NOTE(review): with no recorded break point topBreak remains -1, so the second
    // lookup below uses index -1 - presumably such input never reaches here; confirm.
    var splitVal = CalculateSplitPoint(
        sortedEntries[topBreak + 1].FeatureVal,
        sortedEntries[topBreak].FeatureVal);

    var splitParams = new BinarySplittingParams(numericFeatureToProcess, splitVal, dependentFeatureName);
    var split = binaryNumericDataSplitter.SplitData(baseData, splitParams);
    var splitResult = new BinarySplittingResult(true, numericFeatureToProcess, split, splitVal);

    return Tuple.Create<ISplittingResult, double>(splitResult, topQuality);
}