/// <summary>
/// Scans every column of <paramref name="baseData"/> except
/// <paramref name="dependentFeatureName"/> for the binary numeric split that
/// <paramref name="splitQualityChecker"/> scores highest, then splits the data on it.
/// </summary>
/// <param name="baseData">Data frame holding the candidate feature columns and the dependent column.</param>
/// <param name="dependentFeatureName">Name of the dependent column; it is excluded from the candidate features.</param>
/// <param name="splitQualityChecker">Supplies the initial entropy and the quality score of each candidate split.</param>
/// <param name="alreadyUsedAttributesInfo">Queried to skip (feature, split point) pairs that were already used.</param>
/// <returns>
/// A <see cref="BinarySplittingResult"/> describing the best-scoring split, or <c>null</c> when no
/// candidate split was accepted (for example every feature column is constant, every candidate
/// was already used, or the frame has no rows).
/// </returns>
public ISplittingResult SelectBestSplit(
    IDataFrame baseData,
    string dependentFeatureName,
    INumericalSplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    var bestSplitQuality = double.NegativeInfinity;
    var initialEntropy = splitQualityChecker.GetInitialEntropy(baseData, dependentFeatureName);
    Tuple<string, double> bestSplit = null;

    var featureColumns = baseData.ColumnNames.Except(new[] { dependentFeatureName });
    foreach (var feature in featureColumns)
    {
        // Pair each feature value with the dependent value of the same row, then sort by the feature.
        var dataOrderedByFeature = baseData.GetNumericColumnVector(feature)
            .Select((rowVal, idx) => new Tuple<double, double>(
                rowVal,
                (double)baseData[idx, dependentFeatureName].FeatureValue))
            .OrderBy(tpl => tpl.Item1)
            .ToList();
        var dependentFeatureValuesOrdered = dataOrderedByFeature.Select(elem => elem.Item2).ToList();

        // Walk every adjacent pair (i - 1, i), including the last one. The previous bound of
        // Count - 1 never looked at the final element, so the boundary between the two largest
        // values was silently skipped. Starting at 1 also makes an empty column a no-op
        // instead of a First() failure.
        for (int i = 1; i < dataOrderedByFeature.Count; i++)
        {
            var previousFeatureValue = dataOrderedByFeature[i - 1].Item1;
            var currentFeatureValue = dataOrderedByFeature[i].Item1;

            // A split is only possible where the sorted feature value actually changes.
            if (currentFeatureValue == previousFeatureValue)
            {
                continue;
            }

            var splitPoint = (currentFeatureValue + previousFeatureValue) / 2.0;
            if (alreadyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(feature, splitPoint))
            {
                continue;
            }

            // Rows 0..i-1 lie below the split point, rows i..end lie above it.
            var dependentValsBelow = dependentFeatureValuesOrdered.Take(i).ToList();
            var dependentValsAbove = dependentFeatureValuesOrdered.Skip(i).ToList();
            var splitQuality = splitQualityChecker.CalculateSplitQuality(
                initialEntropy,
                baseData.RowCount,
                new[] { dependentValsBelow, dependentValsAbove });

            // Strict comparison keeps the first candidate encountered on ties.
            if (splitQuality > bestSplitQuality)
            {
                bestSplitQuality = splitQuality;
                bestSplit = new Tuple<string, double>(feature, splitPoint);
            }
        }
    }

    if (bestSplit == null)
    {
        return null;
    }

    var splittedData = binaryNumericDataSplitter.SplitData(
        baseData,
        new BinarySplittingParams(bestSplit.Item1, bestSplit.Item2, dependentFeatureName));
    return new BinarySplittingResult(true, bestSplit.Item1, splittedData, bestSplit.Item2);
}
/// <summary>
/// Scans every column of <paramref name="baseData"/> except
/// <paramref name="dependentFeatureName"/> for the binary numeric split that
/// <paramref name="splitQualityChecker"/> scores highest, then splits the data on it.
/// </summary>
/// <param name="baseData">Data frame holding the candidate feature columns and the dependent column.</param>
/// <param name="dependentFeatureName">Name of the dependent column; it is excluded from the candidate features.</param>
/// <param name="splitQualityChecker">Supplies the initial entropy and the quality score of each candidate split.</param>
/// <param name="alreadyUsedAttributesInfo">Queried to skip (feature, split point) pairs that were already used.</param>
/// <returns>
/// A <see cref="BinarySplittingResult"/> describing the best-scoring split, or <c>null</c> when no
/// candidate split was accepted (for example every feature column is constant, every candidate
/// was already used, or the frame has no rows).
/// </returns>
public ISplittingResult SelectBestSplit(
    IDataFrame baseData,
    string dependentFeatureName,
    INumericalSplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    var bestSplitQuality = double.NegativeInfinity;
    var initialEntropy = splitQualityChecker.GetInitialEntropy(baseData, dependentFeatureName);
    Tuple<string, double> bestSplit = null;

    var featureColumns = baseData.ColumnNames.Except(new[] { dependentFeatureName });
    foreach (var feature in featureColumns)
    {
        // Pair each feature value with the dependent value of the same row, then sort by the feature.
        var dataOrderedByFeature = baseData.GetNumericColumnVector(feature)
            .Select((rowVal, idx) => new Tuple<double, double>(
                rowVal,
                (double)baseData[idx, dependentFeatureName].FeatureValue))
            .OrderBy(tpl => tpl.Item1)
            .ToList();
        var dependentFeatureValuesOrdered = dataOrderedByFeature.Select(elem => elem.Item2).ToList();

        // Walk every adjacent pair (i - 1, i), including the last one. The previous bound of
        // Count - 1 never looked at the final element, so the boundary between the two largest
        // values was silently skipped. Starting at 1 also makes an empty column a no-op
        // instead of a First() failure.
        for (int i = 1; i < dataOrderedByFeature.Count; i++)
        {
            var previousFeatureValue = dataOrderedByFeature[i - 1].Item1;
            var currentFeatureValue = dataOrderedByFeature[i].Item1;

            // A split is only possible where the sorted feature value actually changes.
            if (currentFeatureValue == previousFeatureValue)
            {
                continue;
            }

            var splitPoint = (currentFeatureValue + previousFeatureValue) / 2.0;
            if (alreadyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(feature, splitPoint))
            {
                continue;
            }

            // Rows 0..i-1 lie below the split point, rows i..end lie above it.
            var dependentValsBelow = dependentFeatureValuesOrdered.Take(i).ToList();
            var dependentValsAbove = dependentFeatureValuesOrdered.Skip(i).ToList();
            var splitQuality = splitQualityChecker.CalculateSplitQuality(
                initialEntropy,
                baseData.RowCount,
                new[] { dependentValsBelow, dependentValsAbove });

            // Strict comparison keeps the first candidate encountered on ties.
            if (splitQuality > bestSplitQuality)
            {
                bestSplitQuality = splitQuality;
                bestSplit = new Tuple<string, double>(feature, splitPoint);
            }
        }
    }

    if (bestSplit == null)
    {
        return null;
    }

    var splittedData = binaryNumericDataSplitter.SplitData(
        baseData,
        new BinarySplittingParams(bestSplit.Item1, bestSplit.Item2, dependentFeatureName));
    return new BinarySplittingResult(true, bestSplit.Item1, splittedData, bestSplit.Item2);
}