/// <summary>
/// Looks for the best binary split threshold on a numeric feature.
/// </summary>
/// <remarks>
/// Rows are sorted by the numeric feature value (ties broken by row index) and then scanned in order.
/// A candidate threshold - the midpoint between the previous and the current feature value - is
/// evaluated only when both the dependent value differs from the last remembered one and the feature
/// value differs from the previous row's value. When several candidates reach the same quality, the
/// one encountered last wins because the comparison is <c>&gt;=</c>.
/// </remarks>
/// <param name="baseData">Data frame that is scanned for candidates and passed to the splitter.</param>
/// <param name="dependentFeatureName">Name of the dependent (class) column.</param>
/// <param name="numericFeatureToProcess">Name of the numeric column to find a threshold for.</param>
/// <param name="splitQualityChecker">Scores each candidate split.</param>
/// <param name="binaryNumericDataSplitter">Splits <paramref name="baseData"/> for a candidate threshold.</param>
/// <param name="initialEntropy">Value forwarded unchanged to the quality checker.</param>
/// <returns>
/// The best split and its quality. When no candidate was evaluated (including an empty data frame)
/// the split is <c>null</c> and the quality is <see cref="double.NegativeInfinity"/>.
/// </returns>
public Tuple<ISplittingResult, double> FindBestSplitPoint(
    IDataFrame baseData,
    string dependentFeatureName,
    string numericFeatureToProcess,
    ICategoricalSplitQualityChecker splitQualityChecker,
    IBinaryNumericDataSplitter binaryNumericDataSplitter,
    double initialEntropy)
{
    ISplittingResult bestSplit = null;
    double bestSplitQuality = double.NegativeInfinity;
    var totalRowsCount = baseData.RowCount;

    // NOTE(review): the indexed Select runs on an unordered parallel query; confirm that rowIdx
    // reliably matches the position in the source vector (AsOrdered() would make that explicit).
    var sortedRowData = baseData.GetNumericColumnVector(numericFeatureToProcess)
        .AsParallel()
        .Select((val, rowIdx) => new
        {
            RowIdx = rowIdx,
            FeatureValue = val,
            DependentFeatureValue = baseData[rowIdx, dependentFeatureName].FeatureValue
        })
        .OrderBy(elem => elem.FeatureValue).ThenBy(elem => elem.RowIdx)
        .ToList();

    // An empty frame offers no candidate thresholds; report "no split found" instead of
    // failing with an out-of-range access on sortedRowData[0] below.
    if (sortedRowData.Count == 0)
    {
        return new Tuple<ISplittingResult, double>(bestSplit, bestSplitQuality);
    }

    var previousClass = sortedRowData[0].DependentFeatureValue;
    var previousFeatureVal = sortedRowData[0].FeatureValue;

    foreach (var rowData in sortedRowData)
    {
        var currentClass = rowData.DependentFeatureValue;
        var currentFeatureVal = rowData.FeatureValue;

        if (!currentClass.Equals(previousClass) && !currentFeatureVal.Equals(previousFeatureVal))
        {
            var halfWay = (previousFeatureVal + currentFeatureVal) / 2.0;
            var splitParams = new BinarySplittingParams(numericFeatureToProcess, halfWay, dependentFeatureName);
            var splitResult = binaryNumericDataSplitter.SplitData(baseData, splitParams);
            var quality = splitQualityChecker.CalculateSplitQuality(
                initialEntropy,
                totalRowsCount,
                splitResult,
                dependentFeatureName);

            if (quality >= bestSplitQuality)
            {
                bestSplitQuality = quality;
                bestSplit = new BinarySplittingResult(true, numericFeatureToProcess, splitResult, halfWay);
            }

            // The remembered class only advances after a candidate has been evaluated.
            previousClass = currentClass;
        }

        // The remembered feature value advances on every row.
        previousFeatureVal = currentFeatureVal;
    }

    return new Tuple<ISplittingResult, double>(bestSplit, bestSplitQuality);
}
/// <summary>
/// Finds the highest-quality binary split threshold for a numeric feature.
/// </summary>
/// <remarks>
/// The rows are ordered by feature value, then by row index, and visited sequentially. A midpoint
/// between the previous and current feature value is tried as a threshold whenever the dependent
/// value differs from the last remembered one and the feature value differs from the previous row's.
/// Because candidates are accepted with <c>&gt;=</c>, a later candidate replaces an earlier one of
/// equal quality.
/// </remarks>
/// <param name="baseData">Data frame to scan; also the frame handed to the splitter.</param>
/// <param name="dependentFeatureName">Name of the dependent (class) column.</param>
/// <param name="numericFeatureToProcess">Name of the numeric column being thresholded.</param>
/// <param name="splitQualityChecker">Computes the quality of each candidate split.</param>
/// <param name="binaryNumericDataSplitter">Performs the split for a candidate threshold.</param>
/// <param name="initialEntropy">Passed through to the quality checker.</param>
/// <returns>
/// A tuple of the best split and its quality; <c>(null, double.NegativeInfinity)</c> when no
/// candidate threshold was evaluated, which includes the case of a data frame with no rows.
/// </returns>
public Tuple<ISplittingResult, double> FindBestSplitPoint(
    IDataFrame baseData,
    string dependentFeatureName,
    string numericFeatureToProcess,
    ICategoricalSplitQualityChecker splitQualityChecker,
    IBinaryNumericDataSplitter binaryNumericDataSplitter,
    double initialEntropy)
{
    ISplittingResult bestSplit = null;
    double bestSplitQuality = double.NegativeInfinity;
    var totalRowsCount = baseData.RowCount;

    // NOTE(review): index-based Select over an unordered PLINQ query - verify that rowIdx always
    // equals the element's position in the column vector (consider AsOrdered()).
    var sortedRowData = baseData.GetNumericColumnVector(numericFeatureToProcess)
        .AsParallel()
        .Select((val, rowIdx) => new
        {
            RowIdx = rowIdx,
            FeatureValue = val,
            DependentFeatureValue = baseData[rowIdx, dependentFeatureName].FeatureValue
        })
        .OrderBy(elem => elem.FeatureValue).ThenBy(elem => elem.RowIdx)
        .ToList();

    // Without rows there is nothing to split on; indexing element 0 would throw, so return the
    // same "nothing found" result the loop yields when it evaluates no candidate.
    if (sortedRowData.Count == 0)
    {
        return new Tuple<ISplittingResult, double>(bestSplit, bestSplitQuality);
    }

    var previousClass = sortedRowData[0].DependentFeatureValue;
    var previousFeatureVal = sortedRowData[0].FeatureValue;

    foreach (var rowData in sortedRowData)
    {
        var currentClass = rowData.DependentFeatureValue;
        var currentFeatureVal = rowData.FeatureValue;

        if (!currentClass.Equals(previousClass) && !currentFeatureVal.Equals(previousFeatureVal))
        {
            // Candidate threshold halfway between the two neighbouring feature values.
            var halfWay = (previousFeatureVal + currentFeatureVal) / 2.0;
            var splitParams = new BinarySplittingParams(numericFeatureToProcess, halfWay, dependentFeatureName);
            var splitResult = binaryNumericDataSplitter.SplitData(baseData, splitParams);
            var quality = splitQualityChecker.CalculateSplitQuality(
                initialEntropy,
                totalRowsCount,
                splitResult,
                dependentFeatureName);

            if (quality >= bestSplitQuality)
            {
                bestSplitQuality = quality;
                bestSplit = new BinarySplittingResult(true, numericFeatureToProcess, splitResult, halfWay);
            }

            // Only updated here, i.e. after a candidate threshold was actually evaluated.
            previousClass = currentClass;
        }

        previousFeatureVal = currentFeatureVal;
    }

    return new Tuple<ISplittingResult, double>(bestSplit, bestSplitQuality);
}
/// <summary>
/// Tries a binary split on every distinct value of a categorical feature and returns the one with
/// the highest quality.
/// </summary>
/// <remarks>
/// Values that <paramref name="alredyUsedAttributesInfo"/> reports as already used are skipped.
/// If a split on any examined value yields exactly one subset, the method returns immediately with
/// an empty subset list, that value's split parameters and <see cref="double.NegativeInfinity"/>.
/// <paramref name="bestSplitQualitySoFar"/> is not read by this implementation.
/// </remarks>
/// <returns>
/// The best subsets, their split parameters and the quality; subsets and parameters are <c>null</c>
/// and the quality is <see cref="double.NegativeInfinity"/> when no value produced a better score.
/// </returns>
protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
    IDataFrame dataToSplit,
    string dependentFeatureName,
    string splittingFeatureName,
    double bestSplitQualitySoFar,
    double initialEntropy,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alredyUsedAttributesInfo)
{
    var rowCount = dataToSplit.RowCount;

    IList<ISplittedData> winningSubsets = null;
    IBinarySplittingParams winningParams = null;
    double winningQuality = double.NegativeInfinity;

    foreach (var candidateValue in dataToSplit.GetColumnVector(splittingFeatureName).Distinct())
    {
        // Skip values this attribute has already been split on.
        if (alredyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(splittingFeatureName, candidateValue))
        {
            continue;
        }

        var candidateParams = new BinarySplittingParams(splittingFeatureName, candidateValue, dependentFeatureName);
        var subsets = CategoricalDataSplitter.SplitData(dataToSplit, candidateParams);

        // A single resulting subset aborts the whole evaluation with a "no usable split" result.
        if (subsets.Count == 1)
        {
            return new Tuple<IList<ISplittedData>, ISplittingParams, double>(
                new List<ISplittedData>(),
                candidateParams,
                double.NegativeInfinity);
        }

        var candidateQuality = splitQualityChecker.CalculateSplitQuality(
            initialEntropy,
            rowCount,
            subsets,
            dependentFeatureName);

        if (candidateQuality > winningQuality)
        {
            winningQuality = candidateQuality;
            winningSubsets = subsets;
            winningParams = candidateParams;
        }
    }

    return new Tuple<IList<ISplittedData>, ISplittingParams, double>(
        winningSubsets,
        winningParams,
        winningQuality);
}
/// <summary>
/// Scores a binary split for each not-yet-used distinct value of the given categorical feature and
/// returns the best-scoring split.
/// </summary>
/// <remarks>
/// As soon as splitting on some value produces exactly one subset, evaluation stops and the result
/// is an empty subset list paired with that value's parameters and
/// <see cref="double.NegativeInfinity"/>. The <paramref name="bestSplitQualitySoFar"/> argument is
/// not used in the body.
/// </remarks>
/// <returns>
/// Tuple of (subsets, split parameters, quality) for the best split found, or
/// (<c>null</c>, <c>null</c>, <see cref="double.NegativeInfinity"/>) when nothing scored higher.
/// </returns>
protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
    IDataFrame dataToSplit,
    string dependentFeatureName,
    string splittingFeatureName,
    double bestSplitQualitySoFar,
    double initialEntropy,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alredyUsedAttributesInfo)
{
    var totalRows = dataToSplit.RowCount;

    // Lazily filtered, so the "already used" check still runs per value right before it is processed.
    var unusedValues = dataToSplit.GetColumnVector(splittingFeatureName)
        .Distinct()
        .Where(value => !alredyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(splittingFeatureName, value));

    // Running best result; starts out as "nothing found".
    var best = new Tuple<IList<ISplittedData>, ISplittingParams, double>(
        null,
        null,
        double.NegativeInfinity);

    foreach (var value in unusedValues)
    {
        var paramsForValue = new BinarySplittingParams(splittingFeatureName, value, dependentFeatureName);
        var parts = CategoricalDataSplitter.SplitData(dataToSplit, paramsForValue);

        if (parts.Count == 1)
        {
            // Only one subset came back: give up on this feature entirely.
            return new Tuple<IList<ISplittedData>, ISplittingParams, double>(
                new List<ISplittedData>(),
                paramsForValue,
                double.NegativeInfinity);
        }

        var quality = splitQualityChecker.CalculateSplitQuality(
            initialEntropy,
            totalRows,
            parts,
            dependentFeatureName);

        if (quality > best.Item3)
        {
            best = new Tuple<IList<ISplittedData>, ISplittingParams, double>(parts, paramsForValue, quality);
        }
    }

    return best;
}
/// <summary>
/// Splits the small numbers-only frame on "Col1" with the value 5 and checks that the first result
/// holds rows 1 and 2 while the last result holds row 0.
/// </summary>
public void PerformBinaryNumericDataSplit()
{
    // Given
    var frame = TestDataBuilder.BuildSmallDataFrameNumbersOnly();
    var expectedFirstPart = frame.GetSubsetByRows(new[] { 1, 2 });
    var expectedLastPart = frame.GetSubsetByRows(new[] { 0 });
    var criterion = new BinarySplittingParams("Col1", 5, null);

    // When
    var parts = BinaryNumericDataSplitter.SplitData(frame, criterion);
    var firstPart = parts.First().SplittedDataFrame;
    var lastPart = parts.Last().SplittedDataFrame;

    // Then
    Assert.IsTrue(expectedFirstPart.ContentEquals(firstPart));
    Assert.IsTrue(lastPart.ContentEquals(expectedLastPart));
}
/// <summary>
/// Splits the categorical weather data on "Outlook" = "Rainy" (dependent feature "Play") and checks
/// that the first result holds rows 3, 4, 5, 9, 13 and the last result holds the remaining rows.
/// </summary>
public void PerformBinaryDiscreteDataSplit()
{
    // Given
    var weatherData = TestDataBuilder.ReadWeatherDataWithCategoricalAttributes();
    var expectedMatching = weatherData.GetSubsetByRows(new[] { 3, 4, 5, 9, 13 });
    var expectedRemaining = weatherData.GetSubsetByRows(new[] { 0, 1, 2, 6, 7, 8, 10, 11, 12 });
    var rainyOutlook = new BinarySplittingParams("Outlook", "Rainy", "Play");

    // When
    var outcome = BinaryDiscreteDataSplitter.SplitData(weatherData, rainyOutlook);
    var matching = outcome.First().SplittedDataFrame;
    var remaining = outcome.Last().SplittedDataFrame;

    // Then
    Assert.IsTrue(expectedMatching.ContentEquals(matching));
    Assert.IsTrue(remaining.ContentEquals(expectedRemaining));
}