/// <summary>
/// Searches for the best binary split threshold on a single numeric feature.
/// </summary>
/// <remarks>
/// Rows are sorted by the numeric feature value (ties broken by row index). A candidate threshold is
/// the midpoint between the previous and the current feature value; it is evaluated only when the
/// current feature value differs from the previous row's value and the current class differs from the
/// remembered class (the first row's class, or the class at the most recently evaluated candidate).
/// Each candidate is materialised with <paramref name="binaryNumericDataSplitter"/> and scored with
/// <paramref name="splitQualityChecker"/>; when two candidates score equally the later one wins.
/// </remarks>
/// <param name="baseData">Data frame holding both the numeric feature and the dependent feature.</param>
/// <param name="dependentFeatureName">Name of the column holding the class values.</param>
/// <param name="numericFeatureToProcess">Name of the numeric column to find a threshold for.</param>
/// <param name="splitQualityChecker">Scores a candidate split.</param>
/// <param name="binaryNumericDataSplitter">Splits <paramref name="baseData"/> at a candidate threshold.</param>
/// <param name="initialEntropy">Value forwarded unchanged to the quality checker.</param>
/// <returns>
/// The best split and its quality. When no candidate was evaluated (including an empty column) the
/// split is <c>null</c> and the quality is <see cref="double.NegativeInfinity"/>.
/// </returns>
public Tuple <ISplittingResult, double> FindBestSplitPoint(
            IDataFrame baseData,
            string dependentFeatureName,
            string numericFeatureToProcess,
            ICategoricalSplitQualityChecker splitQualityChecker,
            IBinaryNumericDataSplitter binaryNumericDataSplitter,
            double initialEntropy)
        {
            ISplittingResult bestSplit        = null;
            double           bestSplitQuality = double.NegativeInfinity;
            var totalRowsCount = baseData.RowCount;

            // NOTE(review): the indexed Select runs on an unordered parallel query and reads baseData
            // concurrently - confirm rowIdx maps to the source position and that the indexer is safe
            // for concurrent reads.
            var sortedRowData  =
                baseData.GetNumericColumnVector(numericFeatureToProcess)
                .AsParallel()
                .Select((val, rowIdx) =>
                        new
                {
                    RowIdx                = rowIdx,
                    FeatureValue          = val,
                    DependentFeatureValue = baseData[rowIdx, dependentFeatureName].FeatureValue
                })
                .OrderBy(elem => elem.FeatureValue).ThenBy(elem => elem.RowIdx)
                .ToList();

            // An empty column offers no split candidates; indexing element 0 below would throw.
            if (sortedRowData.Count == 0)
            {
                return(new Tuple <ISplittingResult, double>(bestSplit, bestSplitQuality));
            }

            var previousClass      = sortedRowData[0].DependentFeatureValue;
            var previousFeatureVal = sortedRowData[0].FeatureValue;

            foreach (var rowData in sortedRowData)
            {
                var currentClass      = rowData.DependentFeatureValue;
                var currentFeatureVal = rowData.FeatureValue;
                if (!currentClass.Equals(previousClass) && !currentFeatureVal.Equals(previousFeatureVal))
                {
                    var halfWay     = (previousFeatureVal + currentFeatureVal) / 2.0;
                    var splitParams = new BinarySplittingParams(numericFeatureToProcess, halfWay, dependentFeatureName);
                    var splitResult = binaryNumericDataSplitter.SplitData(baseData, splitParams);
                    var quality     = splitQualityChecker.CalculateSplitQuality(
                        initialEntropy,
                        totalRowsCount,
                        splitResult,
                        dependentFeatureName);

                    // ">=" lets a later candidate replace an earlier one with the same quality.
                    if (quality >= bestSplitQuality)
                    {
                        bestSplitQuality = quality;
                        bestSplit        = new BinarySplittingResult(true, numericFeatureToProcess, splitResult, halfWay);
                    }

                    // The remembered class only advances once a candidate has been evaluated.
                    previousClass = currentClass;
                }

                previousFeatureVal = currentFeatureVal;
            }

            return(new Tuple <ISplittingResult, double>(bestSplit, bestSplitQuality));
        }
        /// <summary>
        /// Finds the threshold on one numeric feature that yields the highest-quality binary split.
        /// </summary>
        /// <remarks>
        /// The rows are ordered by feature value and then by row index. While walking the ordered rows,
        /// a midpoint between the previous and current feature value is tried whenever the feature value
        /// changed and the class differs from the remembered class; the remembered class starts as the
        /// first row's class and is updated only after a midpoint has been tried. Candidates are split with
        /// <paramref name="binaryNumericDataSplitter"/> and scored with <paramref name="splitQualityChecker"/>.
        /// A later candidate replaces an earlier one of equal quality.
        /// </remarks>
        /// <param name="baseData">Data frame containing the feature column and the dependent column.</param>
        /// <param name="dependentFeatureName">Name of the dependent (class) column.</param>
        /// <param name="numericFeatureToProcess">Name of the numeric column being examined.</param>
        /// <param name="splitQualityChecker">Computes the quality of a candidate split.</param>
        /// <param name="binaryNumericDataSplitter">Produces the two-way split for a candidate threshold.</param>
        /// <param name="initialEntropy">Passed through to the quality checker.</param>
        /// <returns>
        /// A tuple of the best split and its quality; <c>(null, double.NegativeInfinity)</c> when no
        /// candidate was tried, which includes the case of a column without rows.
        /// </returns>
        public Tuple<ISplittingResult, double> FindBestSplitPoint(
            IDataFrame baseData,
            string dependentFeatureName,
            string numericFeatureToProcess,
            ICategoricalSplitQualityChecker splitQualityChecker,
            IBinaryNumericDataSplitter binaryNumericDataSplitter,
            double initialEntropy)
        {
            ISplittingResult bestSplit = null;
            double bestSplitQuality = double.NegativeInfinity;
            var totalRowsCount = baseData.RowCount;

            // NOTE(review): indexed Select on an unordered PLINQ query, with concurrent reads of
            // baseData - confirm rowIdx equals the source position and the indexer tolerates this.
            var sortedRowData =
                        baseData.GetNumericColumnVector(numericFeatureToProcess)
                            .AsParallel()
                            .Select((val, rowIdx) =>
                            new
                            {
                                RowIdx = rowIdx,
                                FeatureValue = val,
                                DependentFeatureValue = baseData[rowIdx, dependentFeatureName].FeatureValue
                            })
                            .OrderBy(elem => elem.FeatureValue).ThenBy(elem => elem.RowIdx)
                            .ToList();

            // Without rows there is nothing to split, and sortedRowData[0] would throw.
            if (sortedRowData.Count == 0)
            {
                return new Tuple<ISplittingResult, double>(bestSplit, bestSplitQuality);
            }

            var previousClass = sortedRowData[0].DependentFeatureValue;
            var previousFeatureVal = sortedRowData[0].FeatureValue;
            foreach (var rowData in sortedRowData)
            {
                var currentClass = rowData.DependentFeatureValue;
                var currentFeatureVal = rowData.FeatureValue;
                if (!currentClass.Equals(previousClass) && !currentFeatureVal.Equals(previousFeatureVal))
                {
                    var halfWay = (previousFeatureVal + currentFeatureVal) / 2.0;
                    var splitParams = new BinarySplittingParams(numericFeatureToProcess, halfWay, dependentFeatureName);
                    var splitResult = binaryNumericDataSplitter.SplitData(baseData, splitParams);
                    var quality = splitQualityChecker.CalculateSplitQuality(
                        initialEntropy,
                        totalRowsCount,
                        splitResult,
                        dependentFeatureName);

                    // Ties are resolved in favour of the later threshold (">=").
                    if (quality >= bestSplitQuality)
                    {
                        bestSplitQuality = quality;
                        bestSplit = new BinarySplittingResult(true, numericFeatureToProcess, splitResult, halfWay);
                    }

                    // Only move the remembered class forward after a candidate was scored.
                    previousClass = currentClass;
                }

                previousFeatureVal = currentFeatureVal;
            }

            return new Tuple<ISplittingResult, double>(bestSplit, bestSplitQuality);
        }
        /// <summary>
        /// Finds the feature and threshold with the highest split quality and splits the data on them.
        /// </summary>
        /// <remarks>
        /// Every column except <paramref name="dependentFeatureName"/> is sorted by value. The midpoint of
        /// each pair of neighbouring, distinct values is a candidate threshold unless
        /// <paramref name="alreadyUsedAttributesInfo"/> reports that feature/threshold pair as already used.
        /// A candidate is scored by handing the dependent values below and above it to
        /// <paramref name="splitQualityChecker"/>. The final split is produced by the
        /// <c>binaryNumericDataSplitter</c> member, which is declared outside this method.
        /// </remarks>
        /// <param name="baseData">Data frame holding the features and the dependent column.</param>
        /// <param name="dependentFeatureName">Name of the dependent column; its values are cast to <c>double</c>.</param>
        /// <param name="splitQualityChecker">Supplies the initial entropy and scores candidate splits.</param>
        /// <param name="alreadyUsedAttributesInfo">Tells which feature/threshold pairs must be skipped.</param>
        /// <returns>
        /// The split for the best-scoring candidate, or <c>null</c> when no candidate was scored higher than
        /// <see cref="double.NegativeInfinity"/>.
        /// </returns>
        public ISplittingResult SelectBestSplit(
            IDataFrame baseData,
            string dependentFeatureName,
            INumericalSplitQualityChecker splitQualityChecker,
            IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
        {
            var bestSplitQuality             = double.NegativeInfinity;
            var initialEntropy               = splitQualityChecker.GetInitialEntropy(baseData, dependentFeatureName);
            Tuple <string, double> bestSplit = null;

            var featureColumns = baseData.ColumnNames.Except(new[] { dependentFeatureName });

            foreach (var feature in featureColumns)
            {
                // Tuple layout: (feature value, dependent value, original row index).
                var dataOrderedByFeature =
                    baseData.GetNumericColumnVector(feature)
                    .Select((rowVal, idx) => new Tuple <double, double, int>(rowVal, (double)baseData[idx, dependentFeatureName].FeatureValue, idx))
                    .OrderBy(tpl => tpl.Item1)
                    .ToList();
                var dependentFeatureValuesOrdered = dataOrderedByFeature.Select(elem => elem.Item2).ToList();

                // Row 0 has no predecessor, so candidates start at index 1. The loop runs through the
                // LAST row as well; stopping one row early would never evaluate the threshold between
                // the two largest distinct values. An empty column simply yields no iterations.
                for (int i = 1; i < dataOrderedByFeature.Count; i++)
                {
                    var previousFeatureValue = dataOrderedByFeature[i - 1].Item1;
                    var currentFeatureValue  = dataOrderedByFeature[i].Item1;
                    if (currentFeatureValue != previousFeatureValue)
                    {
                        var splitPoint = (currentFeatureValue + previousFeatureValue) / 2.0;
                        if (!alreadyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(feature, splitPoint))
                        {
                            // Rows [0, i) lie below the threshold, rows [i, Count) above it.
                            var dependentValsBelow = dependentFeatureValuesOrdered.Take(i).ToList();
                            var dependentValsAbove = dependentFeatureValuesOrdered.Skip(i).ToList();
                            var splitQuality       = splitQualityChecker.CalculateSplitQuality(
                                initialEntropy,
                                baseData.RowCount,
                                new[] { dependentValsBelow, dependentValsAbove });

                            // Strict ">" keeps the first candidate among equally good ones.
                            if (splitQuality > bestSplitQuality)
                            {
                                bestSplitQuality = splitQuality;
                                bestSplit        = new Tuple <string, double>(feature, splitPoint);
                            }
                        }
                    }
                }
            }

            if (bestSplit == null)
            {
                return(null);
            }

            var splittedData = binaryNumericDataSplitter.SplitData(
                baseData,
                new BinarySplittingParams(bestSplit.Item1, bestSplit.Item2, dependentFeatureName));

            return(new BinarySplittingResult(true, bestSplit.Item1, splittedData, bestSplit.Item2));
        }
        /// <summary>
        /// Finds the best binary split threshold on a numeric feature using cumulative class counts.
        /// </summary>
        /// <remarks>
        /// The rows returned by <c>OrderColumn</c> are walked once. For every row a vector of cumulative
        /// per-class counts (rows 0..current) is stored, and the index of the row preceding each change of
        /// the dependent value is recorded as a breakpoint. Each breakpoint is then scored from the counts
        /// up to it and the counts after it (total minus prefix), so the data frame is split only once,
        /// for the winning breakpoint.
        /// </remarks>
        /// <param name="baseData">Data frame holding the numeric feature and the dependent feature.</param>
        /// <param name="dependentFeatureName">Name of the dependent (class) column.</param>
        /// <param name="numericFeatureToProcess">Name of the numeric column to find a threshold for.</param>
        /// <param name="splitQualityChecker">Scores a split described by two class-count lists.</param>
        /// <param name="binaryNumericDataSplitter">Splits <paramref name="baseData"/> at the chosen threshold.</param>
        /// <param name="initialEntropy">Passed through to the quality checker.</param>
        /// <returns>
        /// The best split and its quality. When there are no rows, no change of the dependent value, or
        /// no breakpoint scoring above <see cref="double.NegativeInfinity"/>, the split is <c>null</c> and
        /// the quality is <see cref="double.NegativeInfinity"/>.
        /// </returns>
        public Tuple<ISplittingResult, double> FindBestSplitPoint(
            IDataFrame baseData,
            string dependentFeatureName,
            string numericFeatureToProcess,
            ICategoricalSplitQualityChecker splitQualityChecker,
            IBinaryNumericDataSplitter binaryNumericDataSplitter,
            double initialEntropy)
        {
            var uniqueDependentValues = baseData.GetColumnVector(dependentFeatureName).Values.Distinct().ToList();
            var dependentValuesSortedByNumericFeature = OrderColumn(baseData, dependentFeatureName, numericFeatureToProcess);

            // No rows: nothing to split, and First() below would throw.
            if (dependentValuesSortedByNumericFeature.Count == 0)
            {
                return new Tuple<ISplittingResult, double>(null, double.NegativeInfinity);
            }

            var dependentValsCounts = new List<Vector<double>>();
            var breakPoints = new List<int>();
            var lastKnowDependentValue = dependentValuesSortedByNumericFeature.First().DependentVal;
            for (int elemIdx = 0; elemIdx < dependentValuesSortedByNumericFeature.Count; elemIdx++)
            {
                var currentElem = dependentValuesSortedByNumericFeature[elemIdx];
                var currentDependentValue = currentElem.DependentVal;
                var indexOfCurrentDependentValue = uniqueDependentValues.IndexOf(currentDependentValue);

                // Start from the previous row's cumulative counts, then count the current row.
                var dependentValsCountAllocation = DenseVector.OfArray(Enumerable.Repeat(0.0, uniqueDependentValues.Count).ToArray());
                if (elemIdx != 0)
                {
                    dependentValsCounts[elemIdx - 1].CopyTo(dependentValsCountAllocation);
                }
                dependentValsCountAllocation[indexOfCurrentDependentValue]++;

                // NOTE(review): a breakpoint is recorded on every change of the dependent value, even when
                // the two neighbouring feature values are equal - confirm that CalculateSplitPoint and
                // SplitData yield the partition that was scored in that case.
                if (!currentDependentValue.Equals(lastKnowDependentValue))
                {
                    lastKnowDependentValue = currentDependentValue;
                    breakPoints.Add(elemIdx - 1);
                }

                dependentValsCounts.Add(dependentValsCountAllocation);
            }

            var bestSplitQualitySoFar = double.NegativeInfinity;
            int bestBreakpointSoFar = -1;
            foreach (var breakpointIdx in breakPoints)
            {
                var dependentValsCountUpToBreakpoint = dependentValsCounts[breakpointIdx];

                // Counts above the breakpoint = totals (last cumulative vector) minus the prefix.
                var dependentValsCountAboveBreakpoint =
                    dependentValsCounts.Last().Subtract(dependentValsCountUpToBreakpoint);
                var splitEntropy = splitQualityChecker.CalculateSplitQuality(
                    initialEntropy,
                    baseData.RowCount,
                    new List<IList<int>>
                        {
                            VectorToIntArray(dependentValsCountUpToBreakpoint), VectorToIntArray(dependentValsCountAboveBreakpoint)
                        });
                if (splitEntropy > bestSplitQualitySoFar)
                {
                    bestSplitQualitySoFar = splitEntropy;
                    bestBreakpointSoFar = breakpointIdx;
                }
            }

            // No breakpoint was selected (e.g. all rows share one dependent value); indexing with -1
            // below would throw, so report "no split" instead.
            if (bestBreakpointSoFar < 0)
            {
                return new Tuple<ISplittingResult, double>(null, bestSplitQualitySoFar);
            }

            var splitVal = CalculateSplitPoint(
                dependentValuesSortedByNumericFeature[bestBreakpointSoFar + 1].FeatureVal,
                dependentValuesSortedByNumericFeature[bestBreakpointSoFar].FeatureVal);
            var split = binaryNumericDataSplitter.SplitData(
                baseData,
                new BinarySplittingParams(numericFeatureToProcess, splitVal, dependentFeatureName));
            var splitResult = new BinarySplittingResult(true, numericFeatureToProcess, split, splitVal);
            return new Tuple<ISplittingResult, double>(splitResult, bestSplitQualitySoFar);
        }
Example #5
0
        /// <summary>
        /// Determines the best binary split threshold for a numeric feature from running class counts.
        /// </summary>
        /// <remarks>
        /// The rows produced by <c>OrderColumn</c> are scanned once, storing for each row a vector with the
        /// cumulative count of every dependent value seen so far. The index just before each change of the
        /// dependent value becomes a breakpoint. Breakpoints are scored from the prefix counts and the
        /// remainder (total minus prefix); only the winning breakpoint is turned into an actual data split.
        /// </remarks>
        /// <param name="baseData">Data frame containing the feature column and the dependent column.</param>
        /// <param name="dependentFeatureName">Name of the dependent (class) column.</param>
        /// <param name="numericFeatureToProcess">Name of the numeric column being examined.</param>
        /// <param name="splitQualityChecker">Scores a split given two lists of class counts.</param>
        /// <param name="binaryNumericDataSplitter">Performs the split at the chosen threshold.</param>
        /// <param name="initialEntropy">Forwarded unchanged to the quality checker.</param>
        /// <returns>
        /// The best split with its quality, or <c>(null, double.NegativeInfinity)</c> when there are no rows,
        /// the dependent value never changes, or no breakpoint scores above negative infinity.
        /// </returns>
        public Tuple <ISplittingResult, double> FindBestSplitPoint(
            IDataFrame baseData,
            string dependentFeatureName,
            string numericFeatureToProcess,
            ICategoricalSplitQualityChecker splitQualityChecker,
            IBinaryNumericDataSplitter binaryNumericDataSplitter,
            double initialEntropy)
        {
            var uniqueDependentValues = baseData.GetColumnVector(dependentFeatureName).Values.Distinct().ToList();
            var dependentValuesSortedByNumericFeature = OrderColumn(baseData, dependentFeatureName, numericFeatureToProcess);

            // Guard the First() call below: an empty column has no split.
            if (dependentValuesSortedByNumericFeature.Count == 0)
            {
                return(new Tuple <ISplittingResult, double>(null, double.NegativeInfinity));
            }

            var dependentValsCounts    = new List <Vector <double> >();
            var breakPoints            = new List <int>();
            var lastKnowDependentValue = dependentValuesSortedByNumericFeature.First().DependentVal;

            for (int elemIdx = 0; elemIdx < dependentValuesSortedByNumericFeature.Count; elemIdx++)
            {
                var currentElem                  = dependentValuesSortedByNumericFeature[elemIdx];
                var currentDependentValue        = currentElem.DependentVal;
                var indexOfCurrentDependentValue = uniqueDependentValues.IndexOf(currentDependentValue);

                // Seed with the previous cumulative counts, then add the current row's class.
                var dependentValsCountAllocation = DenseVector.OfArray(Enumerable.Repeat(0.0, uniqueDependentValues.Count).ToArray());
                if (elemIdx != 0)
                {
                    dependentValsCounts[elemIdx - 1].CopyTo(dependentValsCountAllocation);
                }
                dependentValsCountAllocation[indexOfCurrentDependentValue]++;

                // NOTE(review): breakpoints are added on any change of the dependent value, including
                // between rows with equal feature values - verify that CalculateSplitPoint/SplitData
                // reproduce the scored partition in that situation.
                if (!currentDependentValue.Equals(lastKnowDependentValue))
                {
                    lastKnowDependentValue = currentDependentValue;
                    breakPoints.Add(elemIdx - 1);
                }

                dependentValsCounts.Add(dependentValsCountAllocation);
            }

            var bestSplitQualitySoFar = double.NegativeInfinity;
            int bestBreakpointSoFar   = -1;

            foreach (var breakpointIdx in breakPoints)
            {
                var dependentValsCountUpToBreakpoint  = dependentValsCounts[breakpointIdx];

                // The last cumulative vector holds the totals; subtracting the prefix gives the rest.
                var dependentValsCountAboveBreakpoint =
                    dependentValsCounts.Last().Subtract(dependentValsCountUpToBreakpoint);
                var splitEntropy = splitQualityChecker.CalculateSplitQuality(
                    initialEntropy,
                    baseData.RowCount,
                    new List <IList <int> >
                {
                    VectorToIntArray(dependentValsCountUpToBreakpoint), VectorToIntArray(dependentValsCountAboveBreakpoint)
                });
                if (splitEntropy > bestSplitQualitySoFar)
                {
                    bestSplitQualitySoFar = splitEntropy;
                    bestBreakpointSoFar   = breakpointIdx;
                }
            }

            // Nothing was selected (for instance a single dependent value throughout): the lookups
            // below would index with -1 and throw, so return the "no split" result.
            if (bestBreakpointSoFar < 0)
            {
                return(new Tuple <ISplittingResult, double>(null, bestSplitQualitySoFar));
            }

            var splitVal = CalculateSplitPoint(
                dependentValuesSortedByNumericFeature[bestBreakpointSoFar + 1].FeatureVal,
                dependentValuesSortedByNumericFeature[bestBreakpointSoFar].FeatureVal);
            var split = binaryNumericDataSplitter.SplitData(
                baseData,
                new BinarySplittingParams(numericFeatureToProcess, splitVal, dependentFeatureName));
            var splitResult = new BinarySplittingResult(true, numericFeatureToProcess, split, splitVal);

            return(new Tuple <ISplittingResult, double>(splitResult, bestSplitQualitySoFar));
        }