Example #1
0
 /// <summary>
 /// Creates the selector by handing all three collaborators, unchanged and in
 /// the same order, to the base-class constructor.
 /// </summary>
 /// <param name="categoricalSplitter">Passed as the first base-constructor argument.</param>
 /// <param name="binarySplitter">Passed as the second base-constructor argument.</param>
 /// <param name="binaryNumericBestSplitPointSelector">Passed as the third base-constructor argument.</param>
 public MultiValueSplitSelectorForCategoricalOutcome(
     IDataSplitter categoricalSplitter,
     IBinaryNumericDataSplitter binarySplitter,
     IBinaryNumericAttributeSplitPointSelector binaryNumericBestSplitPointSelector)
     : base(
         categoricalSplitter,
         binarySplitter,
         binaryNumericBestSplitPointSelector)
 {
     // No additional state is set up in this constructor.
 }
 /// <summary>
 /// Creates the selector by handing all three collaborators, unchanged and in
 /// the same order, to the base-class constructor.
 /// </summary>
 /// <param name="binaryDataSplitter">Passed as the first base-constructor argument.</param>
 /// <param name="binaryNumericDataSplitter">Passed as the second base-constructor argument.</param>
 /// <param name="binaryNumericBestSplitPointSelector">Passed as the third base-constructor argument.</param>
 public BinarySplitSelectorForCategoricalOutcome(
     IBinaryDataSplitter binaryDataSplitter,
     IBinaryNumericDataSplitter binaryNumericDataSplitter,
     IBinaryNumericAttributeSplitPointSelector binaryNumericBestSplitPointSelector)
     : base(
         binaryDataSplitter,
         binaryNumericDataSplitter,
         binaryNumericBestSplitPointSelector)
 {
     // No additional state is set up in this constructor.
 }
 /// <summary>
 /// Initializes the multi-value selector; every argument is forwarded as-is to
 /// the base constructor and nothing else is done here.
 /// </summary>
 /// <param name="categoricalSplitter">First argument forwarded to the base constructor.</param>
 /// <param name="binarySplitter">Second argument forwarded to the base constructor.</param>
 /// <param name="binaryNumericBestSplitPointSelector">Third argument forwarded to the base constructor.</param>
 public MultiValueSplitSelectorForCategoricalOutcome(
     IDataSplitter categoricalSplitter,
     IBinaryNumericDataSplitter binarySplitter,
     IBinaryNumericAttributeSplitPointSelector binaryNumericBestSplitPointSelector)
     : base(
         categoricalSplitter,
         binarySplitter,
         binaryNumericBestSplitPointSelector)
 {
     // Intentionally empty: construction is delegated entirely to the base constructor call above.
 }
 /// <summary>
 /// Initializes the binary selector; every argument is forwarded as-is to the
 /// base constructor and nothing else is done here.
 /// </summary>
 /// <param name="binaryDataSplitter">First argument forwarded to the base constructor.</param>
 /// <param name="binaryNumericDataSplitter">Second argument forwarded to the base constructor.</param>
 /// <param name="binaryNumericBestSplitPointSelector">Third argument forwarded to the base constructor.</param>
 public BinarySplitSelectorForCategoricalOutcome(
     IBinaryDataSplitter binaryDataSplitter,
     IBinaryNumericDataSplitter binaryNumericDataSplitter,
     IBinaryNumericAttributeSplitPointSelector binaryNumericBestSplitPointSelector)
     : base(
         binaryDataSplitter,
         binaryNumericDataSplitter,
         binaryNumericBestSplitPointSelector)
 {
     // Intentionally empty: construction is delegated entirely to the base constructor call above.
 }
Example #5
0
 /// <summary>
 /// Stores the three collaborators on the instance for later use.
 /// </summary>
 /// <param name="binarySplitter">Assigned to <c>CategoricalDataSplitter</c>.</param>
 /// <param name="binaryNumericSplitter">Assigned to <c>BinaryNumericDataSplitter</c>.</param>
 /// <param name="binaryNumericBestSplitPointSelector">Assigned to <c>BinaryNumericBestSplitingPointSelector</c>.</param>
 protected BaseSplitSelectorForCategoricalOutcome(
     IDataSplitter binarySplitter,
     IBinaryNumericDataSplitter binaryNumericSplitter,
     IBinaryNumericAttributeSplitPointSelector binaryNumericBestSplitPointSelector)
 {
     // Arguments are stored as given; no validation happens here.
     this.CategoricalDataSplitter = binarySplitter;
     this.BinaryNumericDataSplitter = binaryNumericSplitter;
     this.BinaryNumericBestSplitingPointSelector = binaryNumericBestSplitPointSelector;
 }
 /// <summary>
 /// Captures the supplied splitters and split-point selector in the
 /// corresponding members of this instance.
 /// </summary>
 /// <param name="binarySplitter">Value stored in <c>CategoricalDataSplitter</c>.</param>
 /// <param name="binaryNumericSplitter">Value stored in <c>BinaryNumericDataSplitter</c>.</param>
 /// <param name="binaryNumericBestSplitPointSelector">Value stored in <c>BinaryNumericBestSplitingPointSelector</c>.</param>
 protected BaseSplitSelectorForCategoricalOutcome(
     IDataSplitter binarySplitter,
     IBinaryNumericDataSplitter binaryNumericSplitter,
     IBinaryNumericAttributeSplitPointSelector binaryNumericBestSplitPointSelector)
 {
     // Plain assignments only; null arguments are accepted as-is.
     this.CategoricalDataSplitter = binarySplitter;
     this.BinaryNumericDataSplitter = binaryNumericSplitter;
     this.BinaryNumericBestSplitingPointSelector = binaryNumericBestSplitPointSelector;
 }
        /// <summary>
        /// Searches for the best binary split of <paramref name="baseData"/> on a numeric feature.
        /// Rows are ordered by the numeric feature value (ties broken by row index) and a candidate
        /// split half-way between two consecutive rows is evaluated whenever the dependent value
        /// differs from the one remembered at the last evaluated candidate AND the numeric value
        /// differs from the immediately preceding row.
        /// </summary>
        /// <param name="baseData">Data frame to split.</param>
        /// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
        /// <param name="numericFeatureToProcess">Name of the numeric column to split on.</param>
        /// <param name="splitQualityChecker">Scores every candidate split.</param>
        /// <param name="binaryNumericDataSplitter">Performs the actual split for every candidate.</param>
        /// <param name="initialEntropy">Passed through unchanged to the quality checker.</param>
        /// <returns>
        /// The best split and its quality. When the frame has no rows or no candidate split is
        /// found, the first item is <c>null</c> and the second is <c>double.NegativeInfinity</c>.
        /// </returns>
        public Tuple <ISplittingResult, double> FindBestSplitPoint(
            IDataFrame baseData,
            string dependentFeatureName,
            string numericFeatureToProcess,
            ICategoricalSplitQualityChecker splitQualityChecker,
            IBinaryNumericDataSplitter binaryNumericDataSplitter,
            double initialEntropy)
        {
            ISplittingResult bestSplit        = null;
            double           bestSplitQuality = double.NegativeInfinity;
            var totalRowsCount = baseData.RowCount;

            // NOTE(review): the projection reads baseData[rowIdx, ...] from PLINQ worker threads;
            // confirm the data frame indexer is safe for concurrent reads.
            var sortedRowData =
                baseData.GetNumericColumnVector(numericFeatureToProcess)
                .AsParallel()
                .Select((val, rowIdx) =>
                        new
                {
                    RowIdx                = rowIdx,
                    FeatureValue          = val,
                    DependentFeatureValue = baseData[rowIdx, dependentFeatureName].FeatureValue
                })
                .OrderBy(elem => elem.FeatureValue).ThenBy(elem => elem.RowIdx)
                .ToList();

            // An empty column has no first row to seed the scan with; report "no split"
            // exactly like the case where no candidate is found, instead of throwing on [0].
            if (sortedRowData.Count == 0)
            {
                return(new Tuple <ISplittingResult, double>(bestSplit, bestSplitQuality));
            }

            var previousClass      = sortedRowData[0].DependentFeatureValue;
            var previousFeatureVal = sortedRowData[0].FeatureValue;

            foreach (var rowData in sortedRowData)
            {
                var currentClass      = rowData.DependentFeatureValue;
                var currentFeatureVal = rowData.FeatureValue;
                if (!currentClass.Equals(previousClass) && !currentFeatureVal.Equals(previousFeatureVal))
                {
                    // Candidate threshold: midpoint between the two neighbouring feature values.
                    var halfWay     = (previousFeatureVal + currentFeatureVal) / 2.0;
                    var splitParams = new BinarySplittingParams(numericFeatureToProcess, halfWay, dependentFeatureName);
                    var splitResult = binaryNumericDataSplitter.SplitData(baseData, splitParams);
                    var quality     = splitQualityChecker.CalculateSplitQuality(
                        initialEntropy,
                        totalRowsCount,
                        splitResult,
                        dependentFeatureName);

                    // ">=" means that among equally good candidates the last one wins.
                    if (quality >= bestSplitQuality)
                    {
                        bestSplitQuality = quality;
                        bestSplit        = new BinarySplittingResult(true, numericFeatureToProcess, splitResult, halfWay);
                    }

                    // The remembered class only advances when a candidate was actually evaluated.
                    previousClass = currentClass;
                }

                previousFeatureVal = currentFeatureVal;
            }

            return(new Tuple <ISplittingResult, double>(bestSplit, bestSplitQuality));
        }
        /// <summary>
        /// Finds the best binary split of <paramref name="baseData"/> on one numeric feature.
        /// The rows are sorted by the feature value (row index breaks ties); a split at the midpoint
        /// between two consecutive rows is scored whenever the dependent value differs from the one
        /// remembered at the previously scored candidate and the feature value differs from the
        /// preceding row.
        /// </summary>
        /// <param name="baseData">Data frame to split.</param>
        /// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
        /// <param name="numericFeatureToProcess">Name of the numeric column to split on.</param>
        /// <param name="splitQualityChecker">Scores every candidate split.</param>
        /// <param name="binaryNumericDataSplitter">Performs the actual split for every candidate.</param>
        /// <param name="initialEntropy">Passed through unchanged to the quality checker.</param>
        /// <returns>
        /// The best split with its quality, or (<c>null</c>, <c>double.NegativeInfinity</c>) when the
        /// frame has no rows or no candidate split exists.
        /// </returns>
        public Tuple<ISplittingResult, double> FindBestSplitPoint(
            IDataFrame baseData,
            string dependentFeatureName,
            string numericFeatureToProcess,
            ICategoricalSplitQualityChecker splitQualityChecker,
            IBinaryNumericDataSplitter binaryNumericDataSplitter,
            double initialEntropy)
        {
            ISplittingResult bestSplit = null;
            double bestSplitQuality = double.NegativeInfinity;
            var totalRowsCount = baseData.RowCount;

            // NOTE(review): baseData[rowIdx, ...] is evaluated on PLINQ worker threads;
            // confirm concurrent reads of the data frame are safe.
            var sortedRowData =
                        baseData.GetNumericColumnVector(numericFeatureToProcess)
                            .AsParallel()
                            .Select((val, rowIdx) =>
                            new
                            {
                                RowIdx = rowIdx,
                                FeatureValue = val,
                                DependentFeatureValue = baseData[rowIdx, dependentFeatureName].FeatureValue
                            })
                            .OrderBy(elem => elem.FeatureValue).ThenBy(elem => elem.RowIdx)
                            .ToList();

            // Without rows there is no element [0] to start from; return the same
            // "no split" result that is produced when no candidate is found.
            if (sortedRowData.Count == 0)
            {
                return new Tuple<ISplittingResult, double>(bestSplit, bestSplitQuality);
            }

            var previousClass = sortedRowData[0].DependentFeatureValue;
            var previousFeatureVal = sortedRowData[0].FeatureValue;
            foreach (var rowData in sortedRowData)
            {
                var currentClass = rowData.DependentFeatureValue;
                var currentFeatureVal = rowData.FeatureValue;
                if (!currentClass.Equals(previousClass) && !currentFeatureVal.Equals(previousFeatureVal))
                {
                    // Threshold candidate lies half-way between the neighbouring feature values.
                    var halfWay = (previousFeatureVal + currentFeatureVal) / 2.0;
                    var splitParams = new BinarySplittingParams(numericFeatureToProcess, halfWay, dependentFeatureName);
                    var splitResult = binaryNumericDataSplitter.SplitData(baseData, splitParams);
                    var quality = splitQualityChecker.CalculateSplitQuality(
                        initialEntropy,
                        totalRowsCount,
                        splitResult,
                        dependentFeatureName);

                    // Ties are resolved in favour of the later candidate because of ">=".
                    if (quality >= bestSplitQuality)
                    {
                        bestSplitQuality = quality;
                        bestSplit = new BinarySplittingResult(true, numericFeatureToProcess, splitResult, halfWay);
                    }

                    // Only updated after a candidate has been scored.
                    previousClass = currentClass;
                }

                previousFeatureVal = currentFeatureVal;
            }

            return new Tuple<ISplittingResult, double>(bestSplit, bestSplitQuality);
        }
 /// <summary>
 /// Overload taking the general <see cref="ISplitQualityChecker"/>. The checker must also
 /// implement <see cref="ICategoricalSplitQualityChecker"/>; the call is then delegated to the
 /// overload that accepts that interface, with all other arguments passed through unchanged.
 /// </summary>
 /// <param name="baseData">Data frame to split.</param>
 /// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
 /// <param name="numericFeatureToProcess">Name of the numeric column to split on.</param>
 /// <param name="splitQualityChecker">Quality checker; must be an <see cref="ICategoricalSplitQualityChecker"/>.</param>
 /// <param name="binaryNumericDataSplitter">Splitter forwarded to the categorical overload.</param>
 /// <param name="initialEntropy">Value forwarded to the categorical overload.</param>
 /// <returns>Whatever the categorical overload returns.</returns>
 /// <exception cref="ArgumentNullException"><paramref name="splitQualityChecker"/> is null.</exception>
 /// <exception cref="ArgumentException">
 /// <paramref name="splitQualityChecker"/> does not implement <see cref="ICategoricalSplitQualityChecker"/>.
 /// </exception>
 public Tuple <ISplittingResult, double> FindBestSplitPoint(
     IDataFrame baseData,
     string dependentFeatureName,
     string numericFeatureToProcess,
     ISplitQualityChecker splitQualityChecker,
     IBinaryNumericDataSplitter binaryNumericDataSplitter,
     double initialEntropy)
 {
     if (splitQualityChecker == null)
     {
         throw new ArgumentNullException("splitQualityChecker");
     }

     // Fail fast on an incompatible checker: an unchecked "as" would hand null to the
     // categorical overload and only surface later as a NullReferenceException.
     var categoricalChecker = splitQualityChecker as ICategoricalSplitQualityChecker;
     if (categoricalChecker == null)
     {
         throw new ArgumentException(
                   "The split quality checker must implement ICategoricalSplitQualityChecker.",
                   "splitQualityChecker");
     }

     return(FindBestSplitPoint(
                baseData,
                dependentFeatureName,
                numericFeatureToProcess,
                categoricalChecker,
                binaryNumericDataSplitter,
                initialEntropy));
 }
 /// <summary>
 /// Adapter overload for callers holding a plain <see cref="ISplitQualityChecker"/>. It verifies
 /// that the checker is an <see cref="ICategoricalSplitQualityChecker"/> and then delegates to
 /// the overload accepting that interface; the remaining arguments are forwarded untouched.
 /// </summary>
 /// <param name="baseData">Data frame to split.</param>
 /// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
 /// <param name="numericFeatureToProcess">Name of the numeric column to split on.</param>
 /// <param name="splitQualityChecker">Quality checker; must be an <see cref="ICategoricalSplitQualityChecker"/>.</param>
 /// <param name="binaryNumericDataSplitter">Splitter forwarded to the categorical overload.</param>
 /// <param name="initialEntropy">Value forwarded to the categorical overload.</param>
 /// <returns>The result of the categorical overload.</returns>
 /// <exception cref="ArgumentNullException"><paramref name="splitQualityChecker"/> is null.</exception>
 /// <exception cref="ArgumentException">
 /// <paramref name="splitQualityChecker"/> does not implement <see cref="ICategoricalSplitQualityChecker"/>.
 /// </exception>
 public Tuple<ISplittingResult, double> FindBestSplitPoint(
     IDataFrame baseData,
     string dependentFeatureName,
     string numericFeatureToProcess,
     ISplitQualityChecker splitQualityChecker,
     IBinaryNumericDataSplitter binaryNumericDataSplitter,
     double initialEntropy)
 {
     if (splitQualityChecker == null)
     {
         throw new ArgumentNullException("splitQualityChecker");
     }

     // The "as" result is checked here so that a wrong checker type is reported at the
     // call boundary rather than as a NullReferenceException inside the scan.
     var categoricalChecker = splitQualityChecker as ICategoricalSplitQualityChecker;
     if (categoricalChecker == null)
     {
         throw new ArgumentException(
             "The split quality checker must implement ICategoricalSplitQualityChecker.",
             "splitQualityChecker");
     }

     return FindBestSplitPoint(
         baseData,
         dependentFeatureName,
         numericFeatureToProcess,
         categoricalChecker,
         binaryNumericDataSplitter,
         initialEntropy);
 }
 /// <summary>
 /// Creates the selector and keeps the given splitter in <c>binaryNumericDataSplitter</c>.
 /// </summary>
 /// <param name="binaryNumericSplitter">Splitter to store; it is not validated here.</param>
 public BestSplitSelectorForNumericValues(
     IBinaryNumericDataSplitter binaryNumericSplitter)
 {
     this.binaryNumericDataSplitter = binaryNumericSplitter;
 }
        /// <summary>
        /// Finds the best binary split on a numeric feature using running per-class counts.
        /// The rows returned by <c>OrderColumn</c> are walked once, building for every position a
        /// vector of cumulative counts per distinct dependent value and recording a breakpoint at
        /// every position after which the dependent value changes. Each breakpoint is then scored
        /// from the counts up to and above it, and only the winning breakpoint is materialised
        /// through <paramref name="binaryNumericDataSplitter"/>.
        /// </summary>
        /// <param name="baseData">Data frame to split.</param>
        /// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
        /// <param name="numericFeatureToProcess">Name of the numeric column to split on.</param>
        /// <param name="splitQualityChecker">Scores a breakpoint from the two per-class count lists.</param>
        /// <param name="binaryNumericDataSplitter">Performs the actual split at the chosen value.</param>
        /// <param name="initialEntropy">Passed through unchanged to the quality checker.</param>
        /// <returns>
        /// The best split and its quality. When there are no rows, no breakpoint, or no breakpoint
        /// scores above <c>double.NegativeInfinity</c>, the first item is <c>null</c> and the second
        /// is <c>double.NegativeInfinity</c>.
        /// </returns>
        public Tuple<ISplittingResult, double> FindBestSplitPoint(
            IDataFrame baseData,
            string dependentFeatureName,
            string numericFeatureToProcess,
            ICategoricalSplitQualityChecker splitQualityChecker,
            IBinaryNumericDataSplitter binaryNumericDataSplitter,
            double initialEntropy)
        {
            var uniqueDependentValues = baseData.GetColumnVector(dependentFeatureName).Values.Distinct().ToList();

            // NOTE(review): presumably ordered by the numeric feature value - confirm in OrderColumn.
            var dependentValuesSortedByNumericFeature = OrderColumn(baseData, dependentFeatureName, numericFeatureToProcess);

            // No rows: there is nothing to seed the scan with, so report "no split"
            // instead of letting First() throw.
            if (dependentValuesSortedByNumericFeature.Count == 0)
            {
                return new Tuple<ISplittingResult, double>(null, double.NegativeInfinity);
            }

            var dependentValsCounts = new List<Vector<double>>();
            var breakPoints = new List<int>();
            var lastKnowDependentValue = dependentValuesSortedByNumericFeature.First().DependentVal;
            for (int elemIdx = 0; elemIdx < dependentValuesSortedByNumericFeature.Count; elemIdx++)
            {
                var currentElem = dependentValuesSortedByNumericFeature[elemIdx];
                var currentDependentValue = currentElem.DependentVal;
                var indexOfCurrentDependentValue = uniqueDependentValues.IndexOf(currentDependentValue);

                // Running totals: start from the previous row's counts and add the current row.
                var dependentValsCountAllocation = DenseVector.OfArray(new double[uniqueDependentValues.Count]);
                if (elemIdx != 0)
                {
                    dependentValsCounts[elemIdx - 1].CopyTo(dependentValsCountAllocation);
                }
                dependentValsCountAllocation[indexOfCurrentDependentValue]++;

                // The dependent value changed: the previous row is the last one of the lower part.
                if (!currentDependentValue.Equals(lastKnowDependentValue))
                {
                    lastKnowDependentValue = currentDependentValue;
                    breakPoints.Add(elemIdx - 1);
                }

                dependentValsCounts.Add(dependentValsCountAllocation);
            }

            var bestSplitQualitySoFar = double.NegativeInfinity;
            int bestBreakpointSoFar = -1;
            foreach (var breakpointIdx in breakPoints)
            {
                var dependentValsCountUpToBreakpoint = dependentValsCounts[breakpointIdx];

                // Counts above the breakpoint = grand totals (last vector) minus counts up to it.
                var dependentValsCountAboveBreakpoint =
                    dependentValsCounts.Last().Subtract(dependentValsCountUpToBreakpoint);
                var splitEntropy = splitQualityChecker.CalculateSplitQuality(
                    initialEntropy,
                    baseData.RowCount,
                    new List<IList<int>>
                        {
                            VectorToIntArray(dependentValsCountUpToBreakpoint), VectorToIntArray(dependentValsCountAboveBreakpoint)
                        });

                // Strict ">" keeps the first of several equally good breakpoints.
                if (splitEntropy > bestSplitQualitySoFar)
                {
                    bestSplitQualitySoFar = splitEntropy;
                    bestBreakpointSoFar = breakpointIdx;
                }
            }

            // No breakpoint was selected (single class, or no score above -infinity): indexing
            // with -1 below would throw, so return the "no split" result instead.
            if (bestBreakpointSoFar < 0)
            {
                return new Tuple<ISplittingResult, double>(null, bestSplitQualitySoFar);
            }

            var splitVal = CalculateSplitPoint(
                dependentValuesSortedByNumericFeature[bestBreakpointSoFar + 1].FeatureVal,
                dependentValuesSortedByNumericFeature[bestBreakpointSoFar].FeatureVal);
            var split = binaryNumericDataSplitter.SplitData(
                baseData,
                new BinarySplittingParams(numericFeatureToProcess, splitVal, dependentFeatureName));
            var splitResult = new BinarySplittingResult(true, numericFeatureToProcess, split, splitVal);
            return new Tuple<ISplittingResult, double>(splitResult, bestSplitQualitySoFar);
        }
Example #13
0
        /// <summary>
        /// Determines the best binary split on a numeric feature from cumulative class counts.
        /// While walking the rows returned by <c>OrderColumn</c>, a vector of running counts per
        /// distinct dependent value is kept for every position, and every position after which the
        /// dependent value changes is remembered as a breakpoint. Breakpoints are scored from the
        /// counts up to and above them; the data is split only once, at the winning breakpoint.
        /// </summary>
        /// <param name="baseData">Data frame to split.</param>
        /// <param name="dependentFeatureName">Name of the dependent (outcome) column.</param>
        /// <param name="numericFeatureToProcess">Name of the numeric column to split on.</param>
        /// <param name="splitQualityChecker">Scores a breakpoint from the two per-class count lists.</param>
        /// <param name="binaryNumericDataSplitter">Performs the actual split at the chosen value.</param>
        /// <param name="initialEntropy">Passed through unchanged to the quality checker.</param>
        /// <returns>
        /// The best split with its quality, or (<c>null</c>, <c>double.NegativeInfinity</c>) when there
        /// are no rows, no breakpoint, or no breakpoint scoring above negative infinity.
        /// </returns>
        public Tuple <ISplittingResult, double> FindBestSplitPoint(
            IDataFrame baseData,
            string dependentFeatureName,
            string numericFeatureToProcess,
            ICategoricalSplitQualityChecker splitQualityChecker,
            IBinaryNumericDataSplitter binaryNumericDataSplitter,
            double initialEntropy)
        {
            var uniqueDependentValues = baseData.GetColumnVector(dependentFeatureName).Values.Distinct().ToList();

            // NOTE(review): presumably ordered by the numeric feature value - confirm in OrderColumn.
            var dependentValuesSortedByNumericFeature = OrderColumn(baseData, dependentFeatureName, numericFeatureToProcess);

            // With no rows First() would throw; return the "no split" result instead.
            if (dependentValuesSortedByNumericFeature.Count == 0)
            {
                return(new Tuple <ISplittingResult, double>(null, double.NegativeInfinity));
            }

            var dependentValsCounts    = new List <Vector <double> >();
            var breakPoints            = new List <int>();
            var lastKnowDependentValue = dependentValuesSortedByNumericFeature.First().DependentVal;

            for (int elemIdx = 0; elemIdx < dependentValuesSortedByNumericFeature.Count; elemIdx++)
            {
                var currentElem                  = dependentValuesSortedByNumericFeature[elemIdx];
                var currentDependentValue        = currentElem.DependentVal;
                var indexOfCurrentDependentValue = uniqueDependentValues.IndexOf(currentDependentValue);

                // Running totals: copy the previous row's counts, then count the current row.
                var dependentValsCountAllocation = DenseVector.OfArray(new double[uniqueDependentValues.Count]);
                if (elemIdx != 0)
                {
                    dependentValsCounts[elemIdx - 1].CopyTo(dependentValsCountAllocation);
                }
                dependentValsCountAllocation[indexOfCurrentDependentValue]++;

                // A change of the dependent value marks the previous row as a breakpoint.
                if (!currentDependentValue.Equals(lastKnowDependentValue))
                {
                    lastKnowDependentValue = currentDependentValue;
                    breakPoints.Add(elemIdx - 1);
                }

                dependentValsCounts.Add(dependentValsCountAllocation);
            }

            var bestSplitQualitySoFar = double.NegativeInfinity;
            int bestBreakpointSoFar   = -1;

            foreach (var breakpointIdx in breakPoints)
            {
                var dependentValsCountUpToBreakpoint  = dependentValsCounts[breakpointIdx];

                // Upper part = grand totals (last vector) minus the counts up to the breakpoint.
                var dependentValsCountAboveBreakpoint =
                    dependentValsCounts.Last().Subtract(dependentValsCountUpToBreakpoint);
                var splitEntropy = splitQualityChecker.CalculateSplitQuality(
                    initialEntropy,
                    baseData.RowCount,
                    new List <IList <int> >
                {
                    VectorToIntArray(dependentValsCountUpToBreakpoint), VectorToIntArray(dependentValsCountAboveBreakpoint)
                });

                // Strict ">" means the earliest of equally good breakpoints is kept.
                if (splitEntropy > bestSplitQualitySoFar)
                {
                    bestSplitQualitySoFar = splitEntropy;
                    bestBreakpointSoFar   = breakpointIdx;
                }
            }

            // Nothing was selected (one class only, or no score above -infinity): using -1 as
            // an index below would throw, so report "no split".
            if (bestBreakpointSoFar < 0)
            {
                return(new Tuple <ISplittingResult, double>(null, bestSplitQualitySoFar));
            }

            var splitVal = CalculateSplitPoint(
                dependentValuesSortedByNumericFeature[bestBreakpointSoFar + 1].FeatureVal,
                dependentValuesSortedByNumericFeature[bestBreakpointSoFar].FeatureVal);
            var split = binaryNumericDataSplitter.SplitData(
                baseData,
                new BinarySplittingParams(numericFeatureToProcess, splitVal, dependentFeatureName));
            var splitResult = new BinarySplittingResult(true, numericFeatureToProcess, split, splitVal);

            return(new Tuple <ISplittingResult, double>(splitResult, bestSplitQualitySoFar));
        }
 /// <summary>
 /// Initializes the selector, remembering the supplied splitter in
 /// <c>binaryNumericDataSplitter</c>.
 /// </summary>
 /// <param name="binaryNumericSplitter">Splitter to remember; stored without any checks.</param>
 public BestSplitSelectorForNumericValues(
     IBinaryNumericDataSplitter binaryNumericSplitter)
 {
     this.binaryNumericDataSplitter = binaryNumericSplitter;
 }