public Tuple <ISplittingResult, double> FindBestSplitPoint(
            IDataFrame baseData,
            string dependentFeatureName,
            string numericFeatureToProcess,
            ICategoricalSplitQualityChecker splitQualityChecker,
            IBinaryNumericDataSplitter binaryNumericDataSplitter,
            double initialEntropy)
        {
            ISplittingResult bestSplit        = null;
            double           bestSplitQuality = double.NegativeInfinity;
            var totalRowsCount = baseData.RowCount;
            var sortedRowData  =
                baseData.GetNumericColumnVector(numericFeatureToProcess)
                .AsParallel()
                .Select((val, rowIdx) =>
                        new
            {
                RowIdx                = rowIdx,
                FeatureValue          = val,
                DependentFeatureValue = baseData[rowIdx, dependentFeatureName].FeatureValue
            })
                .OrderBy(elem => elem.FeatureValue).ThenBy(elem => elem.RowIdx)
                .ToList();
            var previousClass      = sortedRowData[0].DependentFeatureValue;
            var previousFeatureVal = sortedRowData[0].FeatureValue;

            foreach (var rowData in sortedRowData)
            {
                var currentClass      = rowData.DependentFeatureValue;
                var currentFeatureVal = rowData.FeatureValue;
                if (!currentClass.Equals(previousClass) && !currentFeatureVal.Equals(previousFeatureVal))
                {
                    var halfWay     = (previousFeatureVal + currentFeatureVal) / 2.0;
                    var splitParams = new BinarySplittingParams(numericFeatureToProcess, halfWay, dependentFeatureName);
                    var splitResult = binaryNumericDataSplitter.SplitData(baseData, splitParams);
                    var quality     = splitQualityChecker.CalculateSplitQuality(
                        initialEntropy,
                        totalRowsCount,
                        splitResult,
                        dependentFeatureName);
                    if (quality >= bestSplitQuality)
                    {
                        bestSplitQuality = quality;
                        bestSplit        = new BinarySplittingResult(true, numericFeatureToProcess, splitResult, halfWay);
                    }

                    previousClass = currentClass;
                }

                previousFeatureVal = currentFeatureVal;
            }

            return(new Tuple <ISplittingResult, double>(bestSplit, bestSplitQuality));
        }
        public Tuple<ISplittingResult, double> FindBestSplitPoint(
            IDataFrame baseData,
            string dependentFeatureName,
            string numericFeatureToProcess,
            ICategoricalSplitQualityChecker splitQualityChecker,
            IBinaryNumericDataSplitter binaryNumericDataSplitter,
            double initialEntropy)
        {
            ISplittingResult bestSplit = null;
            double bestSplitQuality = double.NegativeInfinity;
            var totalRowsCount = baseData.RowCount;
            var sortedRowData =
                        baseData.GetNumericColumnVector(numericFeatureToProcess)
                            .AsParallel()
                            .Select((val, rowIdx) =>
                            new
                            {
                                RowIdx = rowIdx,
                                FeatureValue = val,
                                DependentFeatureValue = baseData[rowIdx, dependentFeatureName].FeatureValue
                            })
                            .OrderBy(elem => elem.FeatureValue).ThenBy(elem => elem.RowIdx)
                            .ToList();
            var previousClass = sortedRowData[0].DependentFeatureValue;
            var previousFeatureVal = sortedRowData[0].FeatureValue;
            foreach (var rowData in sortedRowData)
            {
                var currentClass = rowData.DependentFeatureValue;
                var currentFeatureVal = rowData.FeatureValue;
                if (!currentClass.Equals(previousClass) && !currentFeatureVal.Equals(previousFeatureVal))
                {
                    var halfWay = (previousFeatureVal + currentFeatureVal) / 2.0;
                    var splitParams = new BinarySplittingParams(numericFeatureToProcess, halfWay, dependentFeatureName);
                    var splitResult = binaryNumericDataSplitter.SplitData(baseData, splitParams);
                    var quality = splitQualityChecker.CalculateSplitQuality(
                        initialEntropy,
                        totalRowsCount,
                        splitResult,
                        dependentFeatureName);
                    if (quality >= bestSplitQuality)
                    {
                        bestSplitQuality = quality;
                        bestSplit = new BinarySplittingResult(true, numericFeatureToProcess, splitResult, halfWay);
                    }

                    previousClass = currentClass;
                }

                previousFeatureVal = currentFeatureVal;
            }

            return new Tuple<ISplittingResult, double>(bestSplit, bestSplitQuality);
        }
        protected override Tuple <IList <ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
            IDataFrame dataToSplit,
            string dependentFeatureName,
            string splittingFeatureName,
            double bestSplitQualitySoFar,
            double initialEntropy,
            ISplitQualityChecker splitQualityChecker,
            IAlredyUsedAttributesInfo alredyUsedAttributesInfo)
        {
            var    totalRowsCount          = dataToSplit.RowCount;
            var    uniqueFeatureValues     = dataToSplit.GetColumnVector(splittingFeatureName).Distinct();
            double locallyBestSplitQuality = double.NegativeInfinity;
            IBinarySplittingParams localBestSplitParams = null;
            IList <ISplittedData>  locallyBestSplitData = null;

            foreach (var featureValue in uniqueFeatureValues)
            {
                if (!alredyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(splittingFeatureName, featureValue))
                {
                    var binarySplitParams = new BinarySplittingParams(splittingFeatureName, featureValue, dependentFeatureName);
                    var splittedData      = CategoricalDataSplitter.SplitData(dataToSplit, binarySplitParams);
                    if (splittedData.Count == 1)
                    {
                        return(new Tuple <IList <ISplittedData>, ISplittingParams, double>(
                                   new List <ISplittedData>(),
                                   binarySplitParams,
                                   double.NegativeInfinity));
                    }

                    var splitQuality = splitQualityChecker.CalculateSplitQuality(
                        initialEntropy,
                        totalRowsCount,
                        splittedData,
                        dependentFeatureName);
                    if (splitQuality > locallyBestSplitQuality)
                    {
                        locallyBestSplitQuality = splitQuality;
                        locallyBestSplitData    = splittedData;
                        localBestSplitParams    = binarySplitParams;
                    }
                }
            }

            return(new Tuple <IList <ISplittedData>, ISplittingParams, double>(
                       locallyBestSplitData,
                       localBestSplitParams,
                       locallyBestSplitQuality));
        }
        protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
            IDataFrame dataToSplit,
            string dependentFeatureName,
            string splittingFeatureName,
            double bestSplitQualitySoFar,
            double initialEntropy,
            ISplitQualityChecker splitQualityChecker,
            IAlredyUsedAttributesInfo alredyUsedAttributesInfo)
        {
            var totalRowsCount = dataToSplit.RowCount;
            var uniqueFeatureValues = dataToSplit.GetColumnVector(splittingFeatureName).Distinct();
            double locallyBestSplitQuality = double.NegativeInfinity;
            IBinarySplittingParams localBestSplitParams = null;
            IList<ISplittedData> locallyBestSplitData = null;
            foreach (var featureValue in uniqueFeatureValues)
            {
                if (!alredyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(splittingFeatureName, featureValue))
                {
                    var binarySplitParams = new BinarySplittingParams(splittingFeatureName, featureValue, dependentFeatureName);
                    var splittedData = CategoricalDataSplitter.SplitData(dataToSplit, binarySplitParams);
                    if (splittedData.Count == 1)
                    {
                        return new Tuple<IList<ISplittedData>, ISplittingParams, double>(
                            new List<ISplittedData>(),
                            binarySplitParams,
                            double.NegativeInfinity);
                    }

                    var splitQuality = splitQualityChecker.CalculateSplitQuality(
                        initialEntropy,
                        totalRowsCount,
                        splittedData,
                        dependentFeatureName);
                    if (splitQuality > locallyBestSplitQuality)
                    {
                        locallyBestSplitQuality = splitQuality;
                        locallyBestSplitData = splittedData;
                        localBestSplitParams = binarySplitParams;
                    }
                }
            }

            return new Tuple<IList<ISplittedData>, ISplittingParams, double>(
                locallyBestSplitData,
                localBestSplitParams,
                locallyBestSplitQuality);
        }
예제 #5
0
        public void PerformBinaryNumericDataSplit()
        {
            // Given
            var testData             = TestDataBuilder.BuildSmallDataFrameNumbersOnly();
            var expectedPositiveData = testData.GetSubsetByRows(new[] { 1, 2 });
            var expectedNegativeData = testData.GetSubsetByRows(new[] { 0 });
            var splitCriterion       = new BinarySplittingParams("Col1", 5, null);

            // When
            var splitResults       = BinaryNumericDataSplitter.SplitData(testData, splitCriterion);
            var actualPositiveData = splitResults.First().SplittedDataFrame;
            var actualNegativeData = splitResults.Last().SplittedDataFrame;

            // Then
            Assert.IsTrue(expectedPositiveData.ContentEquals(actualPositiveData));
            Assert.IsTrue(actualNegativeData.ContentEquals(expectedNegativeData));
        }
예제 #6
0
        public void PerformBinaryDiscreteDataSplit()
        {
            // Given
            var testData             = TestDataBuilder.ReadWeatherDataWithCategoricalAttributes();
            var expectedPositiveData = testData.GetSubsetByRows(new[] { 3, 4, 5, 9, 13 });
            var expectedNegativeData = testData.GetSubsetByRows(new[] { 0, 1, 2, 6, 7, 8, 10, 11, 12 });


            var splitCriteria = new BinarySplittingParams("Outlook", "Rainy", "Play");

            // When
            var splitResults       = BinaryDiscreteDataSplitter.SplitData(testData, splitCriteria);
            var actualPositiveData = splitResults.First().SplittedDataFrame;
            var actualNegativeData = splitResults.Last().SplittedDataFrame;

            // Then
            Assert.IsTrue(expectedPositiveData.ContentEquals(actualPositiveData));
            Assert.IsTrue(actualNegativeData.ContentEquals(expectedNegativeData));
        }