public void Test_GetNumericColumnVector_ByIndex()
{
    // Given: the column at index 1 of the fixture frame holds 1, 3, 5.
    var expected = Vector<double>.Build.Dense(new double[] { 1, 3, 5 });

    // When: the column is fetched by its positional index.
    var actual = _subject.GetNumericColumnVector(1);

    // Then: the returned vector equals the expected one element-wise.
    Assert.IsTrue(expected.Equals(actual));
}
/// <summary>
/// Pairs each value of a numeric feature column with the dependent-feature value
/// of the same row, returning the pairs sorted ascending by the feature value.
/// </summary>
private static List<NumericFeatureData> OrderColumn(IDataFrame baseData, string dependentFeatureName, string numericFeatureToProcess)
{
    var column = baseData.GetNumericColumnVector(numericFeatureToProcess);
    var paired = column.Select(
        (value, row) => new NumericFeatureData(value, baseData[row, dependentFeatureName].FeatureValue));
    return paired.OrderBy(item => item.FeatureVal).ToList();
}
/// <summary>
/// Scans the sorted values of <paramref name="numericFeatureToProcess"/> for the
/// midpoint threshold with the highest split quality. Returns the best split
/// (null when no class boundary exists) together with its quality score.
/// </summary>
public Tuple <ISplittingResult, double> FindBestSplitPoint(
    IDataFrame baseData,
    string dependentFeatureName,
    string numericFeatureToProcess,
    ICategoricalSplitQualityChecker splitQualityChecker,
    IBinaryNumericDataSplitter binaryNumericDataSplitter,
    double initialEntropy)
{
    ISplittingResult bestSplit = null;
    double bestSplitQuality = double.NegativeInfinity;
    var totalRowsCount = baseData.RowCount;

    // Attach each feature value to its row's dependent value, then sort by the
    // feature value; the row index tie-breaker keeps the ordering deterministic.
    var sortedRowData = baseData.GetNumericColumnVector(numericFeatureToProcess)
        .AsParallel()
        .Select((val, rowIdx) => new
        {
            RowIdx = rowIdx,
            FeatureValue = val,
            DependentFeatureValue = baseData[rowIdx, dependentFeatureName].FeatureValue
        })
        .OrderBy(elem => elem.FeatureValue).ThenBy(elem => elem.RowIdx)
        .ToList();

    var previousClass = sortedRowData[0].DependentFeatureValue;
    var previousFeatureVal = sortedRowData[0].FeatureValue;

    foreach (var rowData in sortedRowData)
    {
        var currentClass = rowData.DependentFeatureValue;
        var currentFeatureVal = rowData.FeatureValue;

        // A candidate threshold exists only where both the class label and the
        // feature value change between consecutive sorted rows.
        if (!currentClass.Equals(previousClass) && !currentFeatureVal.Equals(previousFeatureVal))
        {
            // Threshold halfway between the two distinct consecutive feature values.
            var halfWay = (previousFeatureVal + currentFeatureVal) / 2.0;
            var splitParams = new BinarySplittingParams(numericFeatureToProcess, halfWay, dependentFeatureName);
            var splitResult = binaryNumericDataSplitter.SplitData(baseData, splitParams);
            var quality = splitQualityChecker.CalculateSplitQuality(
                initialEntropy, totalRowsCount, splitResult, dependentFeatureName);

            // '>=' means the latest of equally-good splits wins.
            if (quality >= bestSplitQuality)
            {
                bestSplitQuality = quality;
                bestSplit = new BinarySplittingResult(true, numericFeatureToProcess, splitResult, halfWay);
            }

            // NOTE(review): previousClass only advances when a boundary was
            // evaluated, unlike previousFeatureVal below — confirm this
            // asymmetry is intentional.
            previousClass = currentClass;
        }
        previousFeatureVal = currentFeatureVal;
    }

    return (new Tuple <ISplittingResult, double>(bestSplit, bestSplitQuality));
}
/// <summary>
/// Builds a regularized linear regression model from a data frame: all columns
/// except the dependent one form the design matrix (with intercept), the
/// dependent column is the target vector.
/// </summary>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="additionalParams"/> is not an <see cref="ILinearRegressionParams"/>.
/// </exception>
public IPredictionModel BuildModel(IDataFrame dataFrame, string dependentFeatureName, IModelBuilderParams additionalParams)
{
    // Single pattern match replaces the original 'is' check followed by a
    // redundant 'as' cast (two type tests for one conversion).
    if (!(additionalParams is ILinearRegressionParams linearRegressionParams))
    {
        throw new ArgumentException("Invalid parameters passed to Regularized Linear Regression model builder!");
    }

    var featureColumns = dataFrame.ColumnNames.Except(new[] { dependentFeatureName }).ToList();
    var matrixX = dataFrame.GetSubsetByColumns(featureColumns).GetAsMatrixWithIntercept();
    var vectorY = dataFrame.GetNumericColumnVector(dependentFeatureName);
    return BuildModel(matrixX, vectorY, linearRegressionParams);
}
/// <summary>
/// Builds a linear regression model via gradient descent: all columns except the
/// dependent one form the design matrix (with intercept), the dependent column
/// is the target vector.
/// </summary>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="additionalParams"/> is not an <see cref="ILinearRegressionParams"/>.
/// </exception>
public IPredictionModel BuildModel(IDataFrame dataFrame, string dependentFeatureName, IModelBuilderParams additionalParams)
{
    // Single pattern match replaces the original 'is' check + redundant 'as'
    // cast; also fixes the "Desccent" typo in the error message.
    if (!(additionalParams is ILinearRegressionParams linearRegressionParams))
    {
        throw new ArgumentException("Invalid parameters passed to Gradient Descent model builder!");
    }

    var featureColumns = dataFrame.ColumnNames.Except(new[] { dependentFeatureName }).ToList();
    var matrixX = dataFrame.GetSubsetByColumns(featureColumns).GetAsMatrixWithIntercept();
    var vectorY = dataFrame.GetNumericColumnVector(dependentFeatureName);
    return BuildModel(matrixX, vectorY, linearRegressionParams);
}
/// <summary>
/// Scans the sorted values of the numeric feature for the midpoint threshold with
/// the highest split quality; returns the best split (null when no class boundary
/// exists) together with its quality score.
/// </summary>
public Tuple<ISplittingResult, double> FindBestSplitPoint(
    IDataFrame baseData,
    string dependentFeatureName,
    string numericFeatureToProcess,
    ICategoricalSplitQualityChecker splitQualityChecker,
    IBinaryNumericDataSplitter binaryNumericDataSplitter,
    double initialEntropy)
{
    ISplittingResult winner = null;
    var winnerQuality = double.NegativeInfinity;
    var rowCount = baseData.RowCount;

    // Pair each feature value with its row's dependent (class) value, then sort
    // by feature value; the row-index tie-breaker keeps the order deterministic.
    var ordered = baseData.GetNumericColumnVector(numericFeatureToProcess)
        .AsParallel()
        .Select((featureValue, idx) => new
        {
            RowIdx = idx,
            FeatureValue = featureValue,
            DependentFeatureValue = baseData[idx, dependentFeatureName].FeatureValue
        })
        .OrderBy(row => row.FeatureValue)
        .ThenBy(row => row.RowIdx)
        .ToList();

    var lastClass = ordered[0].DependentFeatureValue;
    var lastFeatureValue = ordered[0].FeatureValue;

    foreach (var row in ordered)
    {
        var thisClass = row.DependentFeatureValue;
        var thisFeatureValue = row.FeatureValue;

        // Candidate thresholds sit halfway between consecutive distinct feature
        // values whose class labels also differ.
        if (!thisClass.Equals(lastClass) && !thisFeatureValue.Equals(lastFeatureValue))
        {
            var threshold = (lastFeatureValue + thisFeatureValue) / 2.0;
            var candidateParams = new BinarySplittingParams(numericFeatureToProcess, threshold, dependentFeatureName);
            var candidateSplit = binaryNumericDataSplitter.SplitData(baseData, candidateParams);
            var candidateQuality = splitQualityChecker.CalculateSplitQuality(
                initialEntropy, rowCount, candidateSplit, dependentFeatureName);

            // '>=' keeps the latest of equally-good splits.
            if (candidateQuality >= winnerQuality)
            {
                winnerQuality = candidateQuality;
                winner = new BinarySplittingResult(true, numericFeatureToProcess, candidateSplit, threshold);
            }

            // The class marker advances only when a boundary was evaluated.
            lastClass = thisClass;
        }

        lastFeatureValue = thisFeatureValue;
    }

    return new Tuple<ISplittingResult, double>(winner, winnerQuality);
}
/// <summary>
/// Builds a regression leaf: fits linear weights on the remaining features
/// (direct solve, falling back to the injected iterative builder on failure)
/// and stores them with the target mean.
/// </summary>
public IDecisionTreeLeaf BuildLeaf(IDataFrame finalData, string dependentFeatureName)
{
    var vectorY = finalData.GetNumericColumnVector(dependentFeatureName);
    var featureNames = finalData.ColumnNames.Except(new[] { dependentFeatureName }).ToList();

    // Build the column subset once — the original created it twice and left
    // one copy in an unused local.
    var matrixX = finalData.GetSubsetByColumns(featureNames).GetAsMatrixWithIntercept();

    Vector<double> fittedWeights;
    try
    {
        fittedWeights = MultipleRegression.DirectMethod(matrixX, vectorY);
    }
    catch (Exception)
    {
        // The direct (normal-equations) solve throws on singular or
        // ill-conditioned matrices; fall back to the iterative model builder.
        fittedWeights = regressionModelBuilder.BuildModel(matrixX, vectorY, regressionParams).Weights;
    }

    return new RegressionAndModelLeaf(dependentFeatureName, fittedWeights, vectorY.Mean());
}
/// <summary>
/// Builds a regression leaf: fits linear weights on the remaining features
/// (direct solve, falling back to the injected iterative builder on failure)
/// and stores them with the target mean.
/// </summary>
public IDecisionTreeLeaf BuildLeaf(IDataFrame finalData, string dependentFeatureName)
{
    var vectorY = finalData.GetNumericColumnVector(dependentFeatureName);
    var featureNames = finalData.ColumnNames.Except(new[] { dependentFeatureName }).ToList();

    // Build the column subset once — the original created it twice and left
    // one copy in an unused local.
    var matrixX = finalData.GetSubsetByColumns(featureNames).GetAsMatrixWithIntercept();

    Vector<double> fittedWeights;
    try
    {
        fittedWeights = MultipleRegression.DirectMethod(matrixX, vectorY);
    }
    catch (Exception)
    {
        // The direct (normal-equations) solve throws on singular or
        // ill-conditioned matrices; fall back to the iterative model builder.
        fittedWeights = regressionModelBuilder.BuildModel(matrixX, vectorY, regressionParams).Weights;
    }

    return new RegressionAndModelLeaf(dependentFeatureName, fittedWeights, vectorY.Mean());
}
/// <summary>
/// Initial impurity for a numeric split: the variance of the dependent column.
/// </summary>
public double GetInitialEntropy(IDataFrame baseData, string dependentFeatureName) =>
    baseData.GetNumericColumnVector(dependentFeatureName).Variance();
/// <summary>
/// Scans every feature column (except the dependent one) for the midpoint
/// threshold that maximizes split quality relative to the initial entropy,
/// and returns the resulting binary split, or null when no threshold is found.
/// </summary>
public ISplittingResult SelectBestSplit(
    IDataFrame baseData,
    string dependentFeatureName,
    INumericalSplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    var bestSplitQuality = double.NegativeInfinity;
    var initialEntropy = splitQualityChecker.GetInitialEntropy(baseData, dependentFeatureName);

    // Best (feature name, threshold) found so far; null means "no split".
    Tuple <string, double> bestSplit = null;

    // NOTE(review): disabled guard kept from an earlier revision — confirm
    // whether it should be restored or removed.
    /*
     * if (baseData.RowCount <= baseData.ColumnsCount)
     * {
     *     return null;
     * }
     */

    var featureColumns = baseData.ColumnNames.Except(new[] { dependentFeatureName });
    foreach (var feature in featureColumns)
    {
        // (feature value, dependent value, original row index), ascending by feature value.
        var dataOrderedByFeature = baseData.GetNumericColumnVector(feature)
            .Select((rowVal, idx) => new Tuple <double, double, int>(rowVal, (double)baseData[idx, dependentFeatureName].FeatureValue, idx))
            .OrderBy(tpl => tpl.Item1)
            .ToList();
        var dependentFeatureValuesOrdered = dataOrderedByFeature.Select(elem => elem.Item2).ToList();
        var previousFeatureValue = dataOrderedByFeature.First().Item1;

        // NOTE(review): the loop stops one element short of the end, so the last
        // boundary is never evaluated — confirm this is intentional.
        for (int i = 0; i < (dataOrderedByFeature.Count - 1); i++)
        {
            var dataPoint = dataOrderedByFeature[i];
            var currentFeatureValue = dataPoint.Item1;
            if (currentFeatureValue != previousFeatureValue)
            {
                // Candidate threshold: midpoint between consecutive distinct values.
                var splitPoint = (currentFeatureValue + previousFeatureValue) / 2.0;
                if (!alreadyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(feature, splitPoint))
                {
                    // Rows 0..i-1 fall below the threshold, rows i.. fall above it.
                    var dependentValsBelow = dependentFeatureValuesOrdered.Take(i).ToList();
                    var dependentValsAbove = dependentFeatureValuesOrdered.Skip(i).ToList();
                    var splitQuality = splitQualityChecker.CalculateSplitQuality(
                        initialEntropy,
                        baseData.RowCount,
                        new[] { dependentValsBelow, dependentValsAbove });
                    if (splitQuality > bestSplitQuality)
                    {
                        bestSplitQuality = splitQuality;
                        bestSplit = new Tuple <string, double>(feature, splitPoint);
                    }
                }
            }
            previousFeatureValue = currentFeatureValue;
        }
    }

    if (bestSplit == null)
    {
        return (null);
    }

    // Materialize the winning binary split on the full data.
    var splittedData = binaryNumericDataSplitter.SplitData(
        baseData,
        new BinarySplittingParams(bestSplit.Item1, bestSplit.Item2, dependentFeatureName));
    return (new BinarySplittingResult(true, bestSplit.Item1, splittedData, bestSplit.Item2));
}
/// <summary>
/// Initial impurity for a numeric split: the variance of the dependent column.
/// </summary>
public double GetInitialEntropy(IDataFrame baseData, string dependentFeatureName)
{
    var target = baseData.GetNumericColumnVector(dependentFeatureName);
    return target.Variance();
}
/// <summary>
/// Pairs each value of a numeric feature column with the dependent-feature value
/// of the same row and returns the pairs sorted ascending by the feature value.
/// </summary>
private static List<NumericFeatureData> OrderColumn(IDataFrame baseData, string dependentFeatureName, string numericFeatureToProcess)
{
    var featureColumn = baseData.GetNumericColumnVector(numericFeatureToProcess);
    var rows = featureColumn
        .Select((featureValue, rowIndex) =>
            new NumericFeatureData(featureValue, baseData[rowIndex, dependentFeatureName].FeatureValue))
        .OrderBy(row => row.FeatureVal);
    return rows.ToList();
}
/// <summary>
/// Scans every feature column (except the dependent one) for the midpoint
/// threshold that maximizes split quality relative to the initial entropy,
/// and returns the resulting binary split, or null when no threshold is found.
/// </summary>
public ISplittingResult SelectBestSplit(
    IDataFrame baseData,
    string dependentFeatureName,
    INumericalSplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
{
    var bestSplitQuality = double.NegativeInfinity;
    var initialEntropy = splitQualityChecker.GetInitialEntropy(baseData, dependentFeatureName);

    // Best (feature name, threshold) found so far; null means "no split".
    Tuple<string, double> bestSplit = null;

    // NOTE(review): disabled guard kept from an earlier revision — confirm
    // whether it should be restored or removed.
    /*
    if (baseData.RowCount <= baseData.ColumnsCount)
    {
        return null;
    }
    */

    var featureColumns = baseData.ColumnNames.Except(new[] { dependentFeatureName });
    foreach (var feature in featureColumns)
    {
        // (feature value, dependent value, original row index), ascending by feature value.
        var dataOrderedByFeature = baseData.GetNumericColumnVector(feature)
            .Select((rowVal, idx) => new Tuple<double, double, int>(rowVal, (double)baseData[idx, dependentFeatureName].FeatureValue, idx))
            .OrderBy(tpl => tpl.Item1)
            .ToList();
        var dependentFeatureValuesOrdered = dataOrderedByFeature.Select(elem => elem.Item2).ToList();
        var previousFeatureValue = dataOrderedByFeature.First().Item1;

        // NOTE(review): the loop stops one element short of the end, so the last
        // boundary is never evaluated — confirm this is intentional.
        for (int i = 0; i < (dataOrderedByFeature.Count -1); i++)
        {
            var dataPoint = dataOrderedByFeature[i];
            var currentFeatureValue = dataPoint.Item1;
            if (currentFeatureValue != previousFeatureValue)
            {
                // Candidate threshold: midpoint between consecutive distinct values.
                var splitPoint = (currentFeatureValue + previousFeatureValue) / 2.0;
                if (!alreadyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(feature, splitPoint))
                {
                    // Rows 0..i-1 fall below the threshold, rows i.. fall above it.
                    var dependentValsBelow = dependentFeatureValuesOrdered.Take(i).ToList();
                    var dependentValsAbove = dependentFeatureValuesOrdered.Skip(i).ToList();
                    var splitQuality = splitQualityChecker.CalculateSplitQuality(
                        initialEntropy,
                        baseData.RowCount,
                        new[] { dependentValsBelow, dependentValsAbove });
                    if (splitQuality > bestSplitQuality)
                    {
                        bestSplitQuality = splitQuality;
                        bestSplit = new Tuple<string, double>(feature, splitPoint);
                    }
                }
            }
            previousFeatureValue = currentFeatureValue;
        }
    }

    if (bestSplit == null)
    {
        return null;
    }

    // Materialize the winning binary split on the full data.
    var splittedData = binaryNumericDataSplitter.SplitData(
        baseData,
        new BinarySplittingParams(bestSplit.Item1, bestSplit.Item2, dependentFeatureName));
    return new BinarySplittingResult(true, bestSplit.Item1, splittedData, bestSplit.Item2);
}