/// <summary>
/// Searches for the binary threshold on a numeric feature that gives the highest split quality.
/// </summary>
/// <remarks>
/// Rows are ordered by the numeric feature value (ties broken by row index). A candidate threshold is
/// evaluated only at rows where both the dependent value and the numeric value differ from the tracked
/// previous ones; the candidate is the midpoint between the previous row's numeric value and the current one.
/// </remarks>
/// <param name="baseData">Data frame that holds the numeric feature and the dependent feature.</param>
/// <param name="dependentFeatureName">Name of the dependent (class) column.</param>
/// <param name="numericFeatureToProcess">Name of the numeric column to find a threshold for.</param>
/// <param name="splitQualityChecker">Scores each candidate split.</param>
/// <param name="binaryNumericDataSplitter">Splits <paramref name="baseData"/> at a candidate threshold.</param>
/// <param name="initialEntropy">Entropy value forwarded unchanged to <paramref name="splitQualityChecker"/>.</param>
/// <returns>
/// The best split and its quality. When no candidate threshold exists (including an empty data frame),
/// the split is <c>null</c> and the quality is <see cref="double.NegativeInfinity"/>.
/// </returns>
public Tuple<ISplittingResult, double> FindBestSplitPoint(
    IDataFrame baseData,
    string dependentFeatureName,
    string numericFeatureToProcess,
    ICategoricalSplitQualityChecker splitQualityChecker,
    IBinaryNumericDataSplitter binaryNumericDataSplitter,
    double initialEntropy)
{
    ISplittingResult bestSplit = null;
    double bestSplitQuality = double.NegativeInfinity;
    var totalRowsCount = baseData.RowCount;

    // Pair every numeric value with its row index and dependent value, then sort by the numeric value.
    // The secondary sort on RowIdx keeps the order deterministic for equal feature values.
    var sortedRowData = baseData.GetNumericColumnVector(numericFeatureToProcess)
        .AsParallel()
        .Select((val, rowIdx) => new
        {
            RowIdx = rowIdx,
            FeatureValue = val,
            DependentFeatureValue = baseData[rowIdx, dependentFeatureName].FeatureValue
        })
        .OrderBy(elem => elem.FeatureValue).ThenBy(elem => elem.RowIdx)
        .ToList();

    // An empty frame has no candidate thresholds; report "no split" instead of
    // throwing ArgumentOutOfRangeException on sortedRowData[0] below.
    if (sortedRowData.Count == 0)
    {
        return new Tuple<ISplittingResult, double>(bestSplit, bestSplitQuality);
    }

    var previousClass = sortedRowData[0].DependentFeatureValue;
    var previousFeatureVal = sortedRowData[0].FeatureValue;

    foreach (var rowData in sortedRowData)
    {
        var currentClass = rowData.DependentFeatureValue;
        var currentFeatureVal = rowData.FeatureValue;

        if (!currentClass.Equals(previousClass) && !currentFeatureVal.Equals(previousFeatureVal))
        {
            var halfWay = (previousFeatureVal + currentFeatureVal) / 2.0;
            var splitParams = new BinarySplittingParams(numericFeatureToProcess, halfWay, dependentFeatureName);
            var splitResult = binaryNumericDataSplitter.SplitData(baseData, splitParams);
            var quality = splitQualityChecker.CalculateSplitQuality(
                initialEntropy,
                totalRowsCount,
                splitResult,
                dependentFeatureName);

            // ">=" means that among equally good candidates the last one (largest threshold) wins.
            if (quality >= bestSplitQuality)
            {
                bestSplitQuality = quality;
                bestSplit = new BinarySplittingResult(true, numericFeatureToProcess, splitResult, halfWay);
            }

            // The tracked class only advances when a candidate was actually evaluated.
            previousClass = currentClass;
        }

        // The tracked feature value always advances to the row just visited.
        previousFeatureVal = currentFeatureVal;
    }

    return new Tuple<ISplittingResult, double>(bestSplit, bestSplitQuality);
}
/// <summary>
/// Finds the best binary split threshold for a numeric feature by scanning the rows in ascending
/// order of that feature and scoring midpoints between neighbouring values.
/// </summary>
/// <param name="baseData">Data frame containing the numeric feature and the dependent feature.</param>
/// <param name="dependentFeatureName">Name of the dependent (class) column.</param>
/// <param name="numericFeatureToProcess">Name of the numeric column being split.</param>
/// <param name="splitQualityChecker">Computes the quality of each candidate split.</param>
/// <param name="binaryNumericDataSplitter">Produces the actual data split for a candidate threshold.</param>
/// <param name="initialEntropy">Entropy value passed through to <paramref name="splitQualityChecker"/>.</param>
/// <returns>
/// A tuple of the best split and its quality. If no candidate threshold was evaluated (for example the
/// data frame is empty, or no row differs in both class and feature value from the tracked previous
/// ones), the split is <c>null</c> and the quality is <see cref="double.NegativeInfinity"/>.
/// </returns>
public Tuple<ISplittingResult, double> FindBestSplitPoint(
    IDataFrame baseData,
    string dependentFeatureName,
    string numericFeatureToProcess,
    ICategoricalSplitQualityChecker splitQualityChecker,
    IBinaryNumericDataSplitter binaryNumericDataSplitter,
    double initialEntropy)
{
    ISplittingResult bestSplit = null;
    double bestSplitQuality = double.NegativeInfinity;
    var totalRowsCount = baseData.RowCount;

    // Rows sorted by feature value; RowIdx is the tie-breaker so equal values keep a stable order.
    var sortedRowData = baseData.GetNumericColumnVector(numericFeatureToProcess)
        .AsParallel()
        .Select((val, rowIdx) => new
        {
            RowIdx = rowIdx,
            FeatureValue = val,
            DependentFeatureValue = baseData[rowIdx, dependentFeatureName].FeatureValue
        })
        .OrderBy(elem => elem.FeatureValue).ThenBy(elem => elem.RowIdx)
        .ToList();

    // Guard: with no rows, indexing sortedRowData[0] would throw. Return the same
    // "no split found" result the scan produces when no candidate is evaluated.
    if (sortedRowData.Count == 0)
    {
        return new Tuple<ISplittingResult, double>(bestSplit, bestSplitQuality);
    }

    var previousClass = sortedRowData[0].DependentFeatureValue;
    var previousFeatureVal = sortedRowData[0].FeatureValue;

    foreach (var rowData in sortedRowData)
    {
        var currentClass = rowData.DependentFeatureValue;
        var currentFeatureVal = rowData.FeatureValue;

        // A threshold is only worth scoring where both the class and the feature value change.
        if (!currentClass.Equals(previousClass) && !currentFeatureVal.Equals(previousFeatureVal))
        {
            // Candidate threshold: midpoint between the previous row's value and this row's value.
            var halfWay = (previousFeatureVal + currentFeatureVal) / 2.0;
            var splitParams = new BinarySplittingParams(numericFeatureToProcess, halfWay, dependentFeatureName);
            var splitResult = binaryNumericDataSplitter.SplitData(baseData, splitParams);
            var quality = splitQualityChecker.CalculateSplitQuality(
                initialEntropy,
                totalRowsCount,
                splitResult,
                dependentFeatureName);

            // Ties are resolved in favour of the later candidate because of ">=".
            if (quality >= bestSplitQuality)
            {
                bestSplitQuality = quality;
                bestSplit = new BinarySplittingResult(true, numericFeatureToProcess, splitResult, halfWay);
            }

            previousClass = currentClass;
        }

        previousFeatureVal = currentFeatureVal;
    }

    return new Tuple<ISplittingResult, double>(bestSplit, bestSplitQuality);
}
/// <summary>
/// Finds the best binary split threshold for a numeric feature using cumulative counts of the
/// dependent values over the rows ordered by that feature.
/// </summary>
/// <remarks>
/// For every position in the ordered rows a vector of running counts (one slot per distinct dependent
/// value) is kept. Each position where the dependent value changes is a breakpoint; a breakpoint is scored
/// from the counts up to it and the counts after it, without splitting the data. Only the winning
/// breakpoint is turned into a real split through <paramref name="binaryNumericDataSplitter"/>.
/// </remarks>
/// <param name="baseData">Data frame containing the numeric feature and the dependent feature.</param>
/// <param name="dependentFeatureName">Name of the dependent (class) column.</param>
/// <param name="numericFeatureToProcess">Name of the numeric column being split.</param>
/// <param name="splitQualityChecker">Scores a candidate split from the per-side class counts.</param>
/// <param name="binaryNumericDataSplitter">Splits <paramref name="baseData"/> at the chosen threshold.</param>
/// <param name="initialEntropy">Entropy value passed through to <paramref name="splitQualityChecker"/>.</param>
/// <returns>
/// The best split and its quality. When there is nothing to split on (no rows, no change of the dependent
/// value, or no breakpoint scoring above <see cref="double.NegativeInfinity"/>), the split is <c>null</c>
/// and the quality is <see cref="double.NegativeInfinity"/>.
/// </returns>
public Tuple<ISplittingResult, double> FindBestSplitPoint(
    IDataFrame baseData,
    string dependentFeatureName,
    string numericFeatureToProcess,
    ICategoricalSplitQualityChecker splitQualityChecker,
    IBinaryNumericDataSplitter binaryNumericDataSplitter,
    double initialEntropy)
{
    var uniqueDependentValues = baseData.GetColumnVector(dependentFeatureName).Values.Distinct().ToList();
    var dependentValuesSortedByNumericFeature = OrderColumn(baseData, dependentFeatureName, numericFeatureToProcess);

    // No rows: First() below would throw, and there is nothing to split anyway.
    if (dependentValuesSortedByNumericFeature.Count == 0)
    {
        return new Tuple<ISplittingResult, double>(null, double.NegativeInfinity);
    }

    // dependentValsCounts[i] holds, per distinct dependent value, how many of the elements 0..i carry it.
    var dependentValsCounts = new List<Vector<double>>();
    // Index of the last element before each change of the dependent value.
    var breakPoints = new List<int>();
    var lastKnownDependentValue = dependentValuesSortedByNumericFeature.First().DependentVal;

    for (int elemIdx = 0; elemIdx < dependentValuesSortedByNumericFeature.Count; elemIdx++)
    {
        var currentElem = dependentValuesSortedByNumericFeature[elemIdx];
        var currentDependentValue = currentElem.DependentVal;
        var indexOfCurrentDependentValue = uniqueDependentValues.IndexOf(currentDependentValue);

        // Start from the previous running totals (or zeros for the first element), then count this element.
        var dependentValsCountAllocation =
            DenseVector.OfArray(Enumerable.Repeat(0.0, uniqueDependentValues.Count).ToArray());
        if (elemIdx != 0)
        {
            dependentValsCounts[elemIdx - 1].CopyTo(dependentValsCountAllocation);
        }

        dependentValsCountAllocation[indexOfCurrentDependentValue]++;

        if (!currentDependentValue.Equals(lastKnownDependentValue))
        {
            lastKnownDependentValue = currentDependentValue;
            breakPoints.Add(elemIdx - 1);
        }

        dependentValsCounts.Add(dependentValsCountAllocation);
    }

    var bestSplitQualitySoFar = double.NegativeInfinity;
    int bestBreakpointSoFar = -1;
    var totalCounts = dependentValsCounts.Last();

    foreach (var breakpointIdx in breakPoints)
    {
        var dependentValsCountUpToBreakpoint = dependentValsCounts[breakpointIdx];
        var dependentValsCountAboveBreakpoint = totalCounts.Subtract(dependentValsCountUpToBreakpoint);
        var splitQuality = splitQualityChecker.CalculateSplitQuality(
            initialEntropy,
            baseData.RowCount,
            new List<IList<int>>
            {
                VectorToIntArray(dependentValsCountUpToBreakpoint),
                VectorToIntArray(dependentValsCountAboveBreakpoint)
            });

        // Strict ">" keeps the earliest breakpoint among equally good ones.
        if (splitQuality > bestSplitQualitySoFar)
        {
            bestSplitQualitySoFar = splitQuality;
            bestBreakpointSoFar = breakpointIdx;
        }
    }

    // No breakpoint was selected (single class, or no quality beat the initial value):
    // indexing with -1 below would throw, so report "no split found" instead.
    if (bestBreakpointSoFar < 0)
    {
        return new Tuple<ISplittingResult, double>(null, double.NegativeInfinity);
    }

    // The threshold is derived from the feature values on both sides of the winning breakpoint.
    var splitVal = CalculateSplitPoint(
        dependentValuesSortedByNumericFeature[bestBreakpointSoFar + 1].FeatureVal,
        dependentValuesSortedByNumericFeature[bestBreakpointSoFar].FeatureVal);
    var split = binaryNumericDataSplitter.SplitData(
        baseData,
        new BinarySplittingParams(numericFeatureToProcess, splitVal, dependentFeatureName));
    var splitResult = new BinarySplittingResult(true, numericFeatureToProcess, split, splitVal);

    return new Tuple<ISplittingResult, double>(splitResult, bestSplitQualitySoFar);
}
/// <summary>
/// Determines the best binary split threshold for a numeric feature from running class counts taken
/// over the rows ordered by that feature.
/// </summary>
/// <param name="baseData">Data frame that holds the numeric feature and the dependent feature.</param>
/// <param name="dependentFeatureName">Name of the dependent (class) column.</param>
/// <param name="numericFeatureToProcess">Name of the numeric column to find a threshold for.</param>
/// <param name="splitQualityChecker">Scores a candidate split given the class counts on each side.</param>
/// <param name="binaryNumericDataSplitter">Performs the data split at the selected threshold.</param>
/// <param name="initialEntropy">Entropy value forwarded unchanged to <paramref name="splitQualityChecker"/>.</param>
/// <returns>
/// A tuple of the best split and its quality. If the ordered data is empty, the dependent value never
/// changes, or no breakpoint scores above <see cref="double.NegativeInfinity"/>, the split is
/// <c>null</c> and the quality is <see cref="double.NegativeInfinity"/>.
/// </returns>
public Tuple<ISplittingResult, double> FindBestSplitPoint(
    IDataFrame baseData,
    string dependentFeatureName,
    string numericFeatureToProcess,
    ICategoricalSplitQualityChecker splitQualityChecker,
    IBinaryNumericDataSplitter binaryNumericDataSplitter,
    double initialEntropy)
{
    var uniqueDependentValues = baseData.GetColumnVector(dependentFeatureName).Values.Distinct().ToList();
    var dependentValuesSortedByNumericFeature = OrderColumn(baseData, dependentFeatureName, numericFeatureToProcess);

    // Guard against empty input; First() would otherwise throw InvalidOperationException.
    if (dependentValuesSortedByNumericFeature.Count == 0)
    {
        return new Tuple<ISplittingResult, double>(null, double.NegativeInfinity);
    }

    // One running-count vector per ordered element: entry i counts elements 0..i by dependent value.
    var dependentValsCounts = new List<Vector<double>>();
    // Positions (index of the element just before) at which the dependent value changes.
    var breakPoints = new List<int>();
    var lastKnownDependentValue = dependentValuesSortedByNumericFeature.First().DependentVal;

    for (int elemIdx = 0; elemIdx < dependentValuesSortedByNumericFeature.Count; elemIdx++)
    {
        var currentElem = dependentValuesSortedByNumericFeature[elemIdx];
        var currentDependentValue = currentElem.DependentVal;
        var indexOfCurrentDependentValue = uniqueDependentValues.IndexOf(currentDependentValue);

        // Fresh zero vector, seeded with the previous totals for every element but the first.
        var dependentValsCountAllocation =
            DenseVector.OfArray(Enumerable.Repeat(0.0, uniqueDependentValues.Count).ToArray());
        if (elemIdx != 0)
        {
            dependentValsCounts[elemIdx - 1].CopyTo(dependentValsCountAllocation);
        }

        dependentValsCountAllocation[indexOfCurrentDependentValue]++;

        if (!currentDependentValue.Equals(lastKnownDependentValue))
        {
            lastKnownDependentValue = currentDependentValue;
            breakPoints.Add(elemIdx - 1);
        }

        dependentValsCounts.Add(dependentValsCountAllocation);
    }

    var bestSplitQualitySoFar = double.NegativeInfinity;
    int bestBreakpointSoFar = -1;
    var totalCounts = dependentValsCounts.Last();

    foreach (var breakpointIdx in breakPoints)
    {
        // Counts on the lower side come straight from the running totals; the upper side is the remainder.
        var dependentValsCountUpToBreakpoint = dependentValsCounts[breakpointIdx];
        var dependentValsCountAboveBreakpoint = totalCounts.Subtract(dependentValsCountUpToBreakpoint);
        var splitQuality = splitQualityChecker.CalculateSplitQuality(
            initialEntropy,
            baseData.RowCount,
            new List<IList<int>>
            {
                VectorToIntArray(dependentValsCountUpToBreakpoint),
                VectorToIntArray(dependentValsCountAboveBreakpoint)
            });

        if (splitQuality > bestSplitQualitySoFar)
        {
            bestSplitQualitySoFar = splitQuality;
            bestBreakpointSoFar = breakpointIdx;
        }
    }

    // Without a selected breakpoint the index is still -1 and the lookups below would throw
    // ArgumentOutOfRangeException; return the "no split found" result instead.
    if (bestBreakpointSoFar < 0)
    {
        return new Tuple<ISplittingResult, double>(null, double.NegativeInfinity);
    }

    var splitVal = CalculateSplitPoint(
        dependentValuesSortedByNumericFeature[bestBreakpointSoFar + 1].FeatureVal,
        dependentValuesSortedByNumericFeature[bestBreakpointSoFar].FeatureVal);
    // Only the winning threshold is materialised into an actual data split.
    var split = binaryNumericDataSplitter.SplitData(
        baseData,
        new BinarySplittingParams(numericFeatureToProcess, splitVal, dependentFeatureName));
    var splitResult = new BinarySplittingResult(true, numericFeatureToProcess, split, splitVal);

    return new Tuple<ISplittingResult, double>(splitResult, bestSplitQualitySoFar);
}