/// <summary>
/// Tries a binary split of <paramref name="dataToSplit"/> for every distinct value found in the
/// <paramref name="splittingFeatureName"/> column and keeps the one with the highest quality.
/// </summary>
/// <param name="dataToSplit">Data frame that is being partitioned.</param>
/// <param name="dependentFeatureName">Name of the dependent (target) feature.</param>
/// <param name="splittingFeatureName">Name of the categorical feature whose values are tried as split values.</param>
/// <param name="bestSplitQualitySoFar">Not read by this implementation.</param>
/// <param name="initialEntropy">Value forwarded unchanged to the quality checker.</param>
/// <param name="splitQualityChecker">Scores each candidate split.</param>
/// <param name="alredyUsedAttributesInfo">Feature/value pairs reported as already used are skipped.</param>
/// <returns>
/// The partitions, splitting parameters and quality of the best candidate. When a candidate split yields
/// exactly one partition the search stops immediately and an empty partition list with
/// <see cref="double.NegativeInfinity"/> quality is returned. When no candidate was scored, the partitions
/// and parameters are <c>null</c> and the quality is <see cref="double.NegativeInfinity"/>.
/// </returns>
protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
    IDataFrame dataToSplit,
    string dependentFeatureName,
    string splittingFeatureName,
    double bestSplitQualitySoFar,
    double initialEntropy,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alredyUsedAttributesInfo)
{
    var rowsInFrame = dataToSplit.RowCount;
    var candidateValues = dataToSplit.GetColumnVector(splittingFeatureName).Distinct();

    var topQuality = double.NegativeInfinity;
    IBinarySplittingParams topParams = null;
    IList<ISplittedData> topPartitions = null;

    foreach (var candidate in candidateValues)
    {
        // Skip values that were already consumed for this feature.
        if (alredyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(splittingFeatureName, candidate))
        {
            continue;
        }

        var candidateParams = new BinarySplittingParams(splittingFeatureName, candidate, dependentFeatureName);
        var partitions = CategoricalDataSplitter.SplitData(dataToSplit, candidateParams);

        // A single partition means the split separated nothing: give up on this feature.
        if (partitions.Count == 1)
        {
            return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
                new List<ISplittedData>(),
                candidateParams,
                double.NegativeInfinity);
        }

        var quality = splitQualityChecker.CalculateSplitQuality(
            initialEntropy,
            rowsInFrame,
            partitions,
            dependentFeatureName);

        if (quality > topQuality)
        {
            topQuality = quality;
            topPartitions = partitions;
            topParams = candidateParams;
        }
    }

    return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
        topPartitions,
        topParams,
        topQuality);
}
/// <summary>
/// Tries a binary split of <paramref name="dataToSplit"/> for every distinct value found in the
/// <paramref name="splittingFeatureName"/> column and keeps the one with the highest quality.
/// </summary>
/// <param name="dataToSplit">Data frame that is being partitioned.</param>
/// <param name="dependentFeatureName">Name of the dependent (target) feature.</param>
/// <param name="splittingFeatureName">Name of the categorical feature whose values are tried as split values.</param>
/// <param name="bestSplitQualitySoFar">Not read by this implementation.</param>
/// <param name="initialEntropy">Value forwarded unchanged to the quality checker.</param>
/// <param name="splitQualityChecker">Scores each candidate split.</param>
/// <param name="alredyUsedAttributesInfo">Feature/value pairs reported as already used are skipped.</param>
/// <returns>
/// The partitions, splitting parameters and quality of the best candidate. When a candidate split yields
/// exactly one partition the search stops immediately and an empty partition list with
/// <see cref="double.NegativeInfinity"/> quality is returned. When no candidate was scored, the partitions
/// and parameters are <c>null</c> and the quality is <see cref="double.NegativeInfinity"/>.
/// </returns>
protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
    IDataFrame dataToSplit,
    string dependentFeatureName,
    string splittingFeatureName,
    double bestSplitQualitySoFar,
    double initialEntropy,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alredyUsedAttributesInfo)
{
    var rowsInFrame = dataToSplit.RowCount;
    var candidateValues = dataToSplit.GetColumnVector(splittingFeatureName).Distinct();

    var topQuality = double.NegativeInfinity;
    IBinarySplittingParams topParams = null;
    IList<ISplittedData> topPartitions = null;

    foreach (var candidate in candidateValues)
    {
        // Skip values that were already consumed for this feature.
        if (alredyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(splittingFeatureName, candidate))
        {
            continue;
        }

        var candidateParams = new BinarySplittingParams(splittingFeatureName, candidate, dependentFeatureName);
        var partitions = CategoricalDataSplitter.SplitData(dataToSplit, candidateParams);

        // A single partition means the split separated nothing: give up on this feature.
        if (partitions.Count == 1)
        {
            return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
                new List<ISplittedData>(),
                candidateParams,
                double.NegativeInfinity);
        }

        var quality = splitQualityChecker.CalculateSplitQuality(
            initialEntropy,
            rowsInFrame,
            partitions,
            dependentFeatureName);

        if (quality > topQuality)
        {
            topQuality = quality;
            topPartitions = partitions;
            topParams = candidateParams;
        }
    }

    return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
        topPartitions,
        topParams,
        topQuality);
}
/// <summary>
/// Scans every column except <paramref name="dependentFeatureName"/> for the numeric threshold whose
/// binary split scores highest, then splits <paramref name="baseData"/> on that threshold.
/// </summary>
/// <param name="baseData">Data frame to split; feature columns are read as numeric vectors.</param>
/// <param name="dependentFeatureName">Name of the dependent feature; its values are cast to <c>double</c>.</param>
/// <param name="splitQualityChecker">Supplies the initial entropy and scores each candidate split.</param>
/// <param name="alreadyUsedAttributesInfo">Feature/threshold pairs reported as already used are skipped.</param>
/// <returns>
/// A <see cref="BinarySplittingResult"/> for the best feature and threshold, or <c>null</c> when no
/// candidate threshold was scored.
/// </returns>
public ISplittingResult SelectBestSplit(
    IDataFrame baseData,
    string dependentFeatureName,
    INumericalSplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    var bestSplitQuality = double.NegativeInfinity;
    var initialEntropy = splitQualityChecker.GetInitialEntropy(baseData, dependentFeatureName);
    Tuple<string, double> bestSplit = null;

    var featureColumns = baseData.ColumnNames.Except(new[] { dependentFeatureName });
    foreach (var feature in featureColumns)
    {
        // Pair each feature value with the dependent value of the same row, then sort by feature value.
        var dataOrderedByFeature = baseData.GetNumericColumnVector(feature)
            .Select((rowVal, idx) => new Tuple<double, double>(
                rowVal,
                (double)baseData[idx, dependentFeatureName].FeatureValue))
            .OrderBy(tpl => tpl.Item1)
            .ToList();

        // Fewer than two rows cannot be separated by any threshold.
        if (dataOrderedByFeature.Count < 2)
        {
            continue;
        }

        var dependentFeatureValuesOrdered = dataOrderedByFeature.Select(elem => elem.Item2).ToList();

        // Visit every adjacent pair, INCLUDING the last row, so that the boundary between the two
        // largest distinct values is evaluated as well.
        for (int i = 1; i < dataOrderedByFeature.Count; i++)
        {
            var previousFeatureValue = dataOrderedByFeature[i - 1].Item1;
            var currentFeatureValue = dataOrderedByFeature[i].Item1;

            // A threshold only exists where the sorted feature value changes.
            if (currentFeatureValue == previousFeatureValue)
            {
                continue;
            }

            var splitPoint = (currentFeatureValue + previousFeatureValue) / 2.0;
            if (alreadyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(feature, splitPoint))
            {
                continue;
            }

            // Rows [0, i) fall below the threshold, rows [i, Count) fall above it.
            var dependentValsBelow = dependentFeatureValuesOrdered.Take(i).ToList();
            var dependentValsAbove = dependentFeatureValuesOrdered.Skip(i).ToList();
            var splitQuality = splitQualityChecker.CalculateSplitQuality(
                initialEntropy,
                baseData.RowCount,
                new[] { dependentValsBelow, dependentValsAbove });

            if (splitQuality > bestSplitQuality)
            {
                bestSplitQuality = splitQuality;
                bestSplit = new Tuple<string, double>(feature, splitPoint);
            }
        }
    }

    if (bestSplit == null)
    {
        return null;
    }

    var splittedData = binaryNumericDataSplitter.SplitData(
        baseData,
        new BinarySplittingParams(bestSplit.Item1, bestSplit.Item2, dependentFeatureName));

    return new BinarySplittingResult(true, bestSplit.Item1, splittedData, bestSplit.Item2);
}
/// <summary>
/// Scans every column except <paramref name="dependentFeatureName"/> for the numeric threshold whose
/// binary split scores highest, then splits <paramref name="baseData"/> on that threshold.
/// </summary>
/// <param name="baseData">Data frame to split; feature columns are read as numeric vectors.</param>
/// <param name="dependentFeatureName">Name of the dependent feature; its values are cast to <c>double</c>.</param>
/// <param name="splitQualityChecker">Supplies the initial entropy and scores each candidate split.</param>
/// <param name="alreadyUsedAttributesInfo">Feature/threshold pairs reported as already used are skipped.</param>
/// <returns>
/// A <see cref="BinarySplittingResult"/> for the best feature and threshold, or <c>null</c> when no
/// candidate threshold was scored.
/// </returns>
public ISplittingResult SelectBestSplit(
    IDataFrame baseData,
    string dependentFeatureName,
    INumericalSplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    var bestSplitQuality = double.NegativeInfinity;
    var initialEntropy = splitQualityChecker.GetInitialEntropy(baseData, dependentFeatureName);
    Tuple<string, double> bestSplit = null;

    var featureColumns = baseData.ColumnNames.Except(new[] { dependentFeatureName });
    foreach (var feature in featureColumns)
    {
        // Pair each feature value with the dependent value of the same row, then sort by feature value.
        var dataOrderedByFeature = baseData.GetNumericColumnVector(feature)
            .Select((rowVal, idx) => new Tuple<double, double>(
                rowVal,
                (double)baseData[idx, dependentFeatureName].FeatureValue))
            .OrderBy(tpl => tpl.Item1)
            .ToList();

        // Fewer than two rows cannot be separated by any threshold.
        if (dataOrderedByFeature.Count < 2)
        {
            continue;
        }

        var dependentFeatureValuesOrdered = dataOrderedByFeature.Select(elem => elem.Item2).ToList();

        // Visit every adjacent pair, INCLUDING the last row, so that the boundary between the two
        // largest distinct values is evaluated as well.
        for (int i = 1; i < dataOrderedByFeature.Count; i++)
        {
            var previousFeatureValue = dataOrderedByFeature[i - 1].Item1;
            var currentFeatureValue = dataOrderedByFeature[i].Item1;

            // A threshold only exists where the sorted feature value changes.
            if (currentFeatureValue == previousFeatureValue)
            {
                continue;
            }

            var splitPoint = (currentFeatureValue + previousFeatureValue) / 2.0;
            if (alreadyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(feature, splitPoint))
            {
                continue;
            }

            // Rows [0, i) fall below the threshold, rows [i, Count) fall above it.
            var dependentValsBelow = dependentFeatureValuesOrdered.Take(i).ToList();
            var dependentValsAbove = dependentFeatureValuesOrdered.Skip(i).ToList();
            var splitQuality = splitQualityChecker.CalculateSplitQuality(
                initialEntropy,
                baseData.RowCount,
                new[] { dependentValsBelow, dependentValsAbove });

            if (splitQuality > bestSplitQuality)
            {
                bestSplitQuality = splitQuality;
                bestSplit = new Tuple<string, double>(feature, splitPoint);
            }
        }
    }

    if (bestSplit == null)
    {
        return null;
    }

    var splittedData = binaryNumericDataSplitter.SplitData(
        baseData,
        new BinarySplittingParams(bestSplit.Item1, bestSplit.Item2, dependentFeatureName));

    return new BinarySplittingResult(true, bestSplit.Item1, splittedData, bestSplit.Item2);
}