public void Test_GetNumericColumnVector_ByIndex()
        {
            // Given: the values this test expects in the column at index 1 of the fixture
            var expected = Vector<double>.Build.Dense(new[] { 1.0, 3.0, 5.0 });

            // When: the column is requested by its numeric index
            var actual = _subject.GetNumericColumnVector(1);

            // Then: the returned vector must equal the expected one
            var vectorsMatch = expected.Equals(actual);
            Assert.IsTrue(vectorsMatch);
        }
Example #2
0
 /// <summary>
 /// Pairs every value of the <paramref name="numericFeatureToProcess"/> column with the
 /// <paramref name="dependentFeatureName"/> value found in the same row and returns the pairs
 /// sorted ascending by the numeric feature value.
 /// </summary>
 /// <param name="baseData">Data frame the two columns are read from.</param>
 /// <param name="dependentFeatureName">Column supplying the second component of each pair.</param>
 /// <param name="numericFeatureToProcess">Numeric column supplying the sort key.</param>
 /// <returns>A new list of pairs ordered by <c>FeatureVal</c>.</returns>
 private static List<NumericFeatureData> OrderColumn(IDataFrame baseData, string dependentFeatureName, string numericFeatureToProcess)
 {
     var featureColumn = baseData.GetNumericColumnVector(numericFeatureToProcess);

     // The position of a value in the column doubles as the row index into the data frame.
     var pairedRows = featureColumn.Select((featureVal, rowIndex) =>
     {
         var dependentValue = baseData[rowIndex, dependentFeatureName].FeatureValue;
         return new NumericFeatureData(featureVal, dependentValue);
     });

     return pairedRows.OrderBy(row => row.FeatureVal).ToList();
 }
        /// <summary>
        /// Finds the best binary split of <paramref name="baseData"/> on a single numeric feature.
        /// Rows are visited in ascending order of the feature value (ties broken by row index). A
        /// candidate threshold halfway between two consecutive feature values is evaluated whenever
        /// the row's dependent value differs from the one remembered at the last evaluated candidate
        /// (initially the first row's) and its feature value differs from the preceding row's.
        /// </summary>
        /// <param name="baseData">Data frame to split.</param>
        /// <param name="dependentFeatureName">Column holding the dependent values.</param>
        /// <param name="numericFeatureToProcess">Numeric column whose values supply candidate thresholds.</param>
        /// <param name="splitQualityChecker">Scores each candidate split.</param>
        /// <param name="binaryNumericDataSplitter">Performs the split for a candidate threshold.</param>
        /// <param name="initialEntropy">Passed through unchanged to the quality checker.</param>
        /// <returns>
        /// The best split together with its quality. When no candidate was evaluated (including an
        /// empty column) the split is <c>null</c> and the quality is <c>double.NegativeInfinity</c>.
        /// </returns>
        public Tuple <ISplittingResult, double> FindBestSplitPoint(
            IDataFrame baseData,
            string dependentFeatureName,
            string numericFeatureToProcess,
            ICategoricalSplitQualityChecker splitQualityChecker,
            IBinaryNumericDataSplitter binaryNumericDataSplitter,
            double initialEntropy)
        {
            ISplittingResult bestSplit        = null;
            double           bestSplitQuality = double.NegativeInfinity;
            var totalRowsCount = baseData.RowCount;

            // NOTE(review): the projection below runs under PLINQ and reads baseData from several
            // threads - confirm that the data frame indexer is safe for concurrent reads.
            var sortedRowData =
                baseData.GetNumericColumnVector(numericFeatureToProcess)
                .AsParallel()
                .Select((val, rowIdx) =>
                        new
            {
                RowIdx                = rowIdx,
                FeatureValue          = val,
                DependentFeatureValue = baseData[rowIdx, dependentFeatureName].FeatureValue
            })
                .OrderBy(elem => elem.FeatureValue).ThenBy(elem => elem.RowIdx)
                .ToList();

            // An empty column has no candidate thresholds. Report "no split" instead of letting
            // the [0] accesses below throw ArgumentOutOfRangeException.
            if (sortedRowData.Count == 0)
            {
                return(new Tuple <ISplittingResult, double>(bestSplit, bestSplitQuality));
            }

            var previousClass      = sortedRowData[0].DependentFeatureValue;
            var previousFeatureVal = sortedRowData[0].FeatureValue;

            foreach (var rowData in sortedRowData)
            {
                var currentClass      = rowData.DependentFeatureValue;
                var currentFeatureVal = rowData.FeatureValue;
                if (!currentClass.Equals(previousClass) && !currentFeatureVal.Equals(previousFeatureVal))
                {
                    var halfWay     = (previousFeatureVal + currentFeatureVal) / 2.0;
                    var splitParams = new BinarySplittingParams(numericFeatureToProcess, halfWay, dependentFeatureName);
                    var splitResult = binaryNumericDataSplitter.SplitData(baseData, splitParams);
                    var quality     = splitQualityChecker.CalculateSplitQuality(
                        initialEntropy,
                        totalRowsCount,
                        splitResult,
                        dependentFeatureName);

                    // ">=" means that among equally good candidates the last one visited wins.
                    if (quality >= bestSplitQuality)
                    {
                        bestSplitQuality = quality;
                        bestSplit        = new BinarySplittingResult(true, numericFeatureToProcess, splitResult, halfWay);
                    }

                    // The remembered class only advances when a candidate was actually evaluated.
                    previousClass = currentClass;
                }

                // The remembered feature value advances on every row.
                previousFeatureVal = currentFeatureVal;
            }

            return(new Tuple <ISplittingResult, double>(bestSplit, bestSplitQuality));
        }
 /// <summary>
 /// Builds a prediction model from a data frame: every column except
 /// <paramref name="dependentFeatureName"/> goes into the predictor matrix (obtained through
 /// <c>GetAsMatrixWithIntercept</c>), the dependent column becomes the target vector, and both
 /// are handed to the matrix/vector <c>BuildModel</c> overload.
 /// </summary>
 /// <param name="dataFrame">Source data.</param>
 /// <param name="dependentFeatureName">Name of the target column.</param>
 /// <param name="additionalParams">Must implement <c>ILinearRegressionParams</c>.</param>
 /// <returns>The model produced by the matrix/vector overload.</returns>
 /// <exception cref="ArgumentException">
 /// <paramref name="additionalParams"/> is null or not an <c>ILinearRegressionParams</c>.
 /// </exception>
 public IPredictionModel BuildModel(IDataFrame dataFrame, string dependentFeatureName, IModelBuilderParams additionalParams)
 {
     // Reject unsupported parameter objects before touching the data frame.
     var linearRegressionParams = additionalParams as ILinearRegressionParams;
     if (linearRegressionParams == null)
     {
         throw new ArgumentException("Invalid parameters passed to Regularized Linear Regression model builder!");
     }

     var predictorColumns = dataFrame.ColumnNames.Except(new[] { dependentFeatureName }).ToList();
     var matrixX = dataFrame.GetSubsetByColumns(predictorColumns).GetAsMatrixWithIntercept();
     var vectorY = dataFrame.GetNumericColumnVector(dependentFeatureName);

     return BuildModel(matrixX, vectorY, linearRegressionParams);
 }
Example #5
0
        /// <summary>
        /// Data-frame entry point of the model builder. Separates the dependent column from the
        /// remaining columns, converts the latter to a matrix via <c>GetAsMatrixWithIntercept</c>
        /// and delegates to the matrix/vector <c>BuildModel</c> overload.
        /// </summary>
        /// <param name="dataFrame">Source data.</param>
        /// <param name="dependentFeatureName">Name of the target column.</param>
        /// <param name="additionalParams">Must implement <c>ILinearRegressionParams</c>.</param>
        /// <returns>The model produced by the matrix/vector overload.</returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="additionalParams"/> is null or not an <c>ILinearRegressionParams</c>.
        /// </exception>
        public IPredictionModel BuildModel(IDataFrame dataFrame, string dependentFeatureName, IModelBuilderParams additionalParams)
        {
            var regressionSettings = additionalParams as ILinearRegressionParams;
            if (regressionSettings == null)
            {
                // Parameter validation happens before any data is read.
                throw new ArgumentException("Invalid parameters passed to Gradient Desccent model builder!");
            }

            var featureColumnNames = dataFrame.ColumnNames.Except(new[] { dependentFeatureName }).ToList();
            var featureFrame = dataFrame.GetSubsetByColumns(featureColumnNames);
            var matrixX = featureFrame.GetAsMatrixWithIntercept();
            var vectorY = dataFrame.GetNumericColumnVector(dependentFeatureName);

            return BuildModel(matrixX, vectorY, regressionSettings);
        }
        /// <summary>
        /// Searches one numeric feature for the binary split with the highest quality.
        /// The rows are sorted by feature value (then by row index) and walked once. A threshold
        /// halfway between the preceding and the current feature value is scored whenever the
        /// current dependent value differs from the one remembered at the last scored threshold
        /// (initially the first row's) and the feature value differs from the preceding row's.
        /// </summary>
        /// <param name="baseData">Data frame to split.</param>
        /// <param name="dependentFeatureName">Column holding the dependent values.</param>
        /// <param name="numericFeatureToProcess">Numeric column whose values supply candidate thresholds.</param>
        /// <param name="splitQualityChecker">Scores each candidate split.</param>
        /// <param name="binaryNumericDataSplitter">Performs the split for a candidate threshold.</param>
        /// <param name="initialEntropy">Passed through unchanged to the quality checker.</param>
        /// <returns>
        /// The best split and its quality; <c>null</c> and <c>double.NegativeInfinity</c> when no
        /// threshold was scored, which includes the case of an empty column.
        /// </returns>
        public Tuple<ISplittingResult, double> FindBestSplitPoint(
            IDataFrame baseData,
            string dependentFeatureName,
            string numericFeatureToProcess,
            ICategoricalSplitQualityChecker splitQualityChecker,
            IBinaryNumericDataSplitter binaryNumericDataSplitter,
            double initialEntropy)
        {
            ISplittingResult bestSplit = null;
            double bestSplitQuality = double.NegativeInfinity;
            var totalRowsCount = baseData.RowCount;

            // NOTE(review): this projection executes under PLINQ, so baseData is read from
            // multiple threads - verify that its indexer tolerates concurrent reads.
            var sortedRowData =
                        baseData.GetNumericColumnVector(numericFeatureToProcess)
                            .AsParallel()
                            .Select((val, rowIdx) =>
                            new
                            {
                                RowIdx = rowIdx,
                                FeatureValue = val,
                                DependentFeatureValue = baseData[rowIdx, dependentFeatureName].FeatureValue
                            })
                            .OrderBy(elem => elem.FeatureValue).ThenBy(elem => elem.RowIdx)
                            .ToList();

            // Without rows there is nothing to split; previously the [0] accesses below threw
            // ArgumentOutOfRangeException for an empty column.
            if (sortedRowData.Count == 0)
            {
                return new Tuple<ISplittingResult, double>(bestSplit, bestSplitQuality);
            }

            var previousClass = sortedRowData[0].DependentFeatureValue;
            var previousFeatureVal = sortedRowData[0].FeatureValue;
            foreach (var rowData in sortedRowData)
            {
                var currentClass = rowData.DependentFeatureValue;
                var currentFeatureVal = rowData.FeatureValue;
                if (!currentClass.Equals(previousClass) && !currentFeatureVal.Equals(previousFeatureVal))
                {
                    var halfWay = (previousFeatureVal + currentFeatureVal) / 2.0;
                    var splitParams = new BinarySplittingParams(numericFeatureToProcess, halfWay, dependentFeatureName);
                    var splitResult = binaryNumericDataSplitter.SplitData(baseData, splitParams);
                    var quality = splitQualityChecker.CalculateSplitQuality(
                        initialEntropy,
                        totalRowsCount,
                        splitResult,
                        dependentFeatureName);

                    // Ties are resolved in favour of the later threshold because of ">=".
                    if (quality >= bestSplitQuality)
                    {
                        bestSplitQuality = quality;
                        bestSplit = new BinarySplittingResult(true, numericFeatureToProcess, splitResult, halfWay);
                    }

                    // Only a scored threshold moves the remembered class forward.
                    previousClass = currentClass;
                }

                // The remembered feature value always tracks the row just visited.
                previousFeatureVal = currentFeatureVal;
            }

            return new Tuple<ISplittingResult, double>(bestSplit, bestSplitQuality);
        }
        /// <summary>
        /// Builds a leaf holding regression weights fitted on <paramref name="finalData"/> and
        /// the mean of the dependent column.
        /// </summary>
        /// <param name="finalData">Rows that ended up in this leaf.</param>
        /// <param name="dependentFeatureName">Name of the target column.</param>
        /// <returns>
        /// A <c>RegressionAndModelLeaf</c> built from the dependent feature name, the fitted
        /// weights and the mean of the dependent values.
        /// </returns>
        public IDecisionTreeLeaf BuildLeaf(IDataFrame finalData, string dependentFeatureName)
        {
            var vectorY      = finalData.GetNumericColumnVector(dependentFeatureName);
            var featureNames = finalData.ColumnNames.Except(new[] { dependentFeatureName }).ToList();

            // The predictor subset is computed exactly once; the previous version built it twice
            // and discarded the first copy in an unused local.
            var matrixX = finalData.GetSubsetByColumns(featureNames).GetAsMatrixWithIntercept();
            Vector <double> fittedWeights;

            try
            {
                fittedWeights = MultipleRegression.DirectMethod(matrixX, vectorY);
            }
            catch (Exception)
            {
                // Deliberate best-effort fallback: if the direct solve fails for any reason, the
                // fit is delegated to the configured regression model builder.
                fittedWeights = regressionModelBuilder.BuildModel(matrixX, vectorY, regressionParams).Weights;
            }

            return(new RegressionAndModelLeaf(dependentFeatureName, fittedWeights, vectorY.Mean()));
        }
        /// <summary>
        /// Creates the leaf for <paramref name="finalData"/>: fits regression weights of the
        /// dependent column on all other columns and stores them with the dependent column's mean.
        /// </summary>
        /// <param name="finalData">Rows that ended up in this leaf.</param>
        /// <param name="dependentFeatureName">Name of the target column.</param>
        /// <returns>
        /// A <c>RegressionAndModelLeaf</c> built from the dependent feature name, the fitted
        /// weights and the mean of the dependent values.
        /// </returns>
        public IDecisionTreeLeaf BuildLeaf(IDataFrame finalData, string dependentFeatureName)
        {
            var vectorY = finalData.GetNumericColumnVector(dependentFeatureName);
            var featureNames = finalData.ColumnNames.Except(new[] { dependentFeatureName }).ToList();

            // Single subset computation; the earlier duplicate call only fed an unused local.
            var matrixX = finalData.GetSubsetByColumns(featureNames).GetAsMatrixWithIntercept();
            Vector<double> fittedWeights;

            try
            {
                fittedWeights = MultipleRegression.DirectMethod(matrixX, vectorY);
            }
            catch (Exception)
            {
                // Intentional fallback: when the direct method throws, let the configured
                // regression model builder produce the weights instead.
                fittedWeights = regressionModelBuilder.BuildModel(matrixX, vectorY, regressionParams).Weights;
            }

            return new RegressionAndModelLeaf(dependentFeatureName, fittedWeights, vectorY.Mean());
        }
 /// <summary>
 /// Returns the variance of the <paramref name="dependentFeatureName"/> column, which this
 /// checker reports as the initial entropy of <paramref name="baseData"/>.
 /// </summary>
 /// <param name="baseData">Data frame containing the dependent column.</param>
 /// <param name="dependentFeatureName">Name of the dependent column.</param>
 /// <returns>Result of <c>Variance()</c> over the dependent column's values.</returns>
 public double GetInitialEntropy(IDataFrame baseData, string dependentFeatureName)
 {
     var dependentValues = baseData.GetNumericColumnVector(dependentFeatureName);
     return dependentValues.Variance();
 }
        /// <summary>
        /// Searches every column other than <paramref name="dependentFeatureName"/> for the
        /// binary split point with the highest quality and splits <paramref name="baseData"/> there.
        /// For each feature the rows are sorted by feature value; a candidate threshold is the
        /// midpoint of two consecutive, different feature values.
        /// </summary>
        /// <param name="baseData">Data frame to split.</param>
        /// <param name="dependentFeatureName">Name of the dependent column.</param>
        /// <param name="splitQualityChecker">Supplies the initial entropy and scores each candidate.</param>
        /// <param name="alreadyUsedAttributesInfo">Candidates it reports as already used are skipped.</param>
        /// <returns>
        /// A <c>BinarySplittingResult</c> for the best candidate, or <c>null</c> when no candidate
        /// scored strictly above <c>double.NegativeInfinity</c> (for example when there are no rows).
        /// </returns>
        public ISplittingResult SelectBestSplit(
            IDataFrame baseData,
            string dependentFeatureName,
            INumericalSplitQualityChecker splitQualityChecker,
            IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
        {
            var bestSplitQuality             = double.NegativeInfinity;
            var initialEntropy               = splitQualityChecker.GetInitialEntropy(baseData, dependentFeatureName);
            Tuple <string, double> bestSplit = null;

            // NOTE: an early "return null when RowCount <= ColumnsCount" guard was commented out
            // in the original code; it remains disabled here.
            var featureColumns = baseData.ColumnNames.Except(new[] { dependentFeatureName });

            foreach (var feature in featureColumns)
            {
                // Pair each feature value with the dependent value of the same row and sort by the
                // feature value (OrderBy is stable, so ties keep their row order).
                var dataOrderedByFeature =
                    baseData.GetNumericColumnVector(feature)
                    .Select((rowVal, idx) => new Tuple <double, double>(rowVal, (double)baseData[idx, dependentFeatureName].FeatureValue))
                    .OrderBy(tpl => tpl.Item1)
                    .ToList();
                var dependentFeatureValuesOrdered = dataOrderedByFeature.Select(elem => elem.Item2).ToList();

                // Every candidate lies between row i-1 and row i, so start at 1 and run through
                // the LAST row. The former bound (Count - 1) never tested the boundary in front of
                // the last row, and First() threw on an empty column; an empty or single-row
                // column now simply yields no candidates.
                for (int i = 1; i < dataOrderedByFeature.Count; i++)
                {
                    var previousFeatureValue = dataOrderedByFeature[i - 1].Item1;
                    var currentFeatureValue  = dataOrderedByFeature[i].Item1;
                    if (currentFeatureValue != previousFeatureValue)
                    {
                        var splitPoint = (currentFeatureValue + previousFeatureValue) / 2.0;
                        if (!alreadyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(feature, splitPoint))
                        {
                            // Rows [0, i) fall below the threshold, rows [i, Count) above it.
                            var dependentValsBelow = dependentFeatureValuesOrdered.Take(i).ToList();
                            var dependentValsAbove = dependentFeatureValuesOrdered.Skip(i).ToList();
                            var splitQuality       = splitQualityChecker.CalculateSplitQuality(
                                initialEntropy,
                                baseData.RowCount,
                                new[] { dependentValsBelow, dependentValsAbove });

                            // Strict ">" keeps the first of several equally good candidates.
                            if (splitQuality > bestSplitQuality)
                            {
                                bestSplitQuality = splitQuality;
                                bestSplit        = new Tuple <string, double>(feature, splitPoint);
                            }
                        }
                    }
                }
            }

            if (bestSplit == null)
            {
                return(null);
            }

            // Only the winning candidate is materialised into an actual data split.
            var splittedData = binaryNumericDataSplitter.SplitData(
                baseData,
                new BinarySplittingParams(bestSplit.Item1, bestSplit.Item2, dependentFeatureName));

            return(new BinarySplittingResult(true, bestSplit.Item1, splittedData, bestSplit.Item2));
        }
 /// <summary>
 /// Computes the starting impurity measure for <paramref name="baseData"/>, defined here as
 /// the variance of the values in the <paramref name="dependentFeatureName"/> column.
 /// </summary>
 /// <param name="baseData">Data frame containing the dependent column.</param>
 /// <param name="dependentFeatureName">Name of the dependent column.</param>
 /// <returns>Result of <c>Variance()</c> over the dependent column's values.</returns>
 public double GetInitialEntropy(IDataFrame baseData, string dependentFeatureName)
 {
     var targetColumn = baseData.GetNumericColumnVector(dependentFeatureName);
     var targetVariance = targetColumn.Variance();
     return targetVariance;
 }
 /// <summary>
 /// Produces the rows of <paramref name="baseData"/> as (numeric feature value, dependent
 /// value) pairs, sorted ascending by the numeric feature value.
 /// </summary>
 /// <param name="baseData">Data frame the two columns are read from.</param>
 /// <param name="dependentFeatureName">Column supplying the second component of each pair.</param>
 /// <param name="numericFeatureToProcess">Numeric column supplying the sort key.</param>
 /// <returns>A new list of pairs ordered by <c>FeatureVal</c>.</returns>
 private static List<NumericFeatureData> OrderColumn(IDataFrame baseData, string dependentFeatureName, string numericFeatureToProcess)
 {
     var numericValues = baseData.GetNumericColumnVector(numericFeatureToProcess);

     // A value's position in the column is used as the row index when looking up the
     // dependent value.
     var unsortedPairs = numericValues.Select((featureVal, rowIndex) =>
     {
         var dependentCell = baseData[rowIndex, dependentFeatureName];
         return new NumericFeatureData(featureVal, dependentCell.FeatureValue);
     });

     var sortedPairs = unsortedPairs.OrderBy(pair => pair.FeatureVal);
     return sortedPairs.ToList();
 }
        /// <summary>
        /// Evaluates, for every column except <paramref name="dependentFeatureName"/>, the
        /// midpoints between consecutive distinct feature values as binary split thresholds and
        /// returns the split of <paramref name="baseData"/> at the best-scoring threshold.
        /// </summary>
        /// <param name="baseData">Data frame to split.</param>
        /// <param name="dependentFeatureName">Name of the dependent column.</param>
        /// <param name="splitQualityChecker">Supplies the initial entropy and scores each candidate.</param>
        /// <param name="alreadyUsedAttributesInfo">Candidates it reports as already used are skipped.</param>
        /// <returns>
        /// A <c>BinarySplittingResult</c> for the best candidate, or <c>null</c> when no candidate
        /// scored strictly above <c>double.NegativeInfinity</c> (for example when there are no rows).
        /// </returns>
        public ISplittingResult SelectBestSplit(
            IDataFrame baseData,
            string dependentFeatureName,
            INumericalSplitQualityChecker splitQualityChecker,
            IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
        {
            var bestSplitQuality = double.NegativeInfinity;
            var initialEntropy = splitQualityChecker.GetInitialEntropy(baseData, dependentFeatureName);
            Tuple<string, double> bestSplit = null;

            // NOTE: the original carried a commented-out guard that returned null when
            // RowCount <= ColumnsCount; it stays disabled.
            var featureColumns = baseData.ColumnNames.Except(new[] { dependentFeatureName });
            foreach (var feature in featureColumns)
            {
                // (feature value, dependent value) per row, sorted by feature value. OrderBy is
                // stable, so rows with equal feature values keep their original order.
                var dataOrderedByFeature =
                    baseData.GetNumericColumnVector(feature)
                        .Select((rowVal, idx) => new Tuple<double, double>(rowVal, (double)baseData[idx, dependentFeatureName].FeatureValue))
                        .OrderBy(tpl => tpl.Item1)
                        .ToList();
                var dependentFeatureValuesOrdered = dataOrderedByFeature.Select(elem => elem.Item2).ToList();

                // Candidate i separates rows [0, i) from rows [i, Count). The loop covers every
                // i from 1 to Count - 1 inclusive: the old "i < Count - 1" bound skipped the
                // boundary before the last row, and First() threw for an empty column.
                for (int i = 1; i < dataOrderedByFeature.Count; i++)
                {
                    var previousFeatureValue = dataOrderedByFeature[i - 1].Item1;
                    var currentFeatureValue = dataOrderedByFeature[i].Item1;
                    if (currentFeatureValue != previousFeatureValue)
                    {
                        var splitPoint = (currentFeatureValue + previousFeatureValue) / 2.0;
                        if (!alreadyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(feature, splitPoint))
                        {
                            var dependentValsBelow = dependentFeatureValuesOrdered.Take(i).ToList();
                            var dependentValsAbove = dependentFeatureValuesOrdered.Skip(i).ToList();
                            var splitQuality = splitQualityChecker.CalculateSplitQuality(
                                initialEntropy,
                                baseData.RowCount,
                                new[] { dependentValsBelow, dependentValsAbove });

                            // With strict ">" the earliest of equally good candidates is kept.
                            if (splitQuality > bestSplitQuality)
                            {
                                bestSplitQuality = splitQuality;
                                bestSplit = new Tuple<string, double>(feature, splitPoint);
                            }
                        }
                    }
                }
            }

            if (bestSplit == null)
            {
                return null;
            }

            // The data is physically split only once, for the winning feature and threshold.
            var splittedData = binaryNumericDataSplitter.SplitData(
                baseData,
                new BinarySplittingParams(bestSplit.Item1, bestSplit.Item2, dependentFeatureName));

            return new BinarySplittingResult(true, bestSplit.Item1, splittedData, bestSplit.Item2);
        }