/// <summary>
        /// Evaluates a split of <paramref name="dataToSplit"/> on the categorical feature
        /// <paramref name="splittingFeatureName"/> and returns the produced partitions, the
        /// splitting parameters and the quality of the split.
        /// </summary>
        /// <param name="dataToSplit">Data frame to partition; its <c>RowCount</c> is passed to the quality checker.</param>
        /// <param name="dependentFeatureName">Name of the dependent feature, forwarded to the splitting parameters and the quality checker.</param>
        /// <param name="splittingFeatureName">Name of the feature the data is split on.</param>
        /// <param name="bestSplitQualitySoFar">Not read by this implementation.</param>
        /// <param name="initialEntropy">Value forwarded unchanged to <paramref name="splitQualityChecker"/>.</param>
        /// <param name="splitQualityChecker">Computes the quality of the produced split.</param>
        /// <param name="alreadyUsedAttributesInfo">Queried to find out whether the splitting feature was already used.</param>
        /// <returns>
        /// A tuple of (partitions, splitting parameters, quality). When the splitting feature was
        /// already used, or the split yields exactly one partition, the partition list is empty
        /// and the quality is <see cref="double.NegativeInfinity"/>.
        /// </returns>
        protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
            IDataFrame dataToSplit,
            string dependentFeatureName,
            string splittingFeatureName,
            double bestSplitQualitySoFar,
            double initialEntropy,
            ISplitQualityChecker splitQualityChecker,
            IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
        {
            // A feature that has already been used is rejected with the lowest possible quality.
            if (alreadyUsedAttributesInfo.WasAttributeAlreadyUsed(splittingFeatureName))
            {
                var noPartitions = new List<ISplittedData>();
                var rejectedParams = new SplittingParams(splittingFeatureName, dependentFeatureName);
                return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
                    noPartitions,
                    rejectedParams,
                    double.NegativeInfinity);
            }

            var rowTotal = dataToSplit.RowCount;
            var parameters = new SplittingParams(splittingFeatureName, dependentFeatureName);
            var partitions = CategoricalDataSplitter.SplitData(dataToSplit, parameters);

            // A single partition means nothing was separated, so the split is rejected as well.
            // NOTE(review): an empty result (Count == 0) is not filtered here - confirm the
            // splitter never produces one.
            if (partitions.Count == 1)
            {
                return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(
                    new List<ISplittedData>(),
                    parameters,
                    double.NegativeInfinity);
            }

            var quality = splitQualityChecker.CalculateSplitQuality(
                initialEntropy,
                rowTotal,
                partitions,
                dependentFeatureName);

            return Tuple.Create<IList<ISplittedData>, ISplittingParams, double>(partitions, parameters, quality);
        }
        // Example #2
        // 0
        /// <summary>
        /// Splits <paramref name="dataToSplit"/> on the categorical feature
        /// <paramref name="splittingFeatureName"/> and reports the partitions, the splitting
        /// parameters and the split quality.
        /// </summary>
        /// <param name="dataToSplit">Data frame to partition; its <c>RowCount</c> is handed to the quality checker.</param>
        /// <param name="dependentFeatureName">Name of the dependent feature, passed to the splitting parameters and the quality checker.</param>
        /// <param name="splittingFeatureName">Name of the feature used for splitting.</param>
        /// <param name="bestSplitQualitySoFar">Not read by this implementation.</param>
        /// <param name="initialEntropy">Value passed through to <paramref name="splitQualityChecker"/>.</param>
        /// <param name="splitQualityChecker">Calculates the quality of the produced split.</param>
        /// <param name="alreadyUsedAttributesInfo">Tells whether the splitting feature was already used.</param>
        /// <returns>
        /// A tuple of (partitions, splitting parameters, quality). If the feature was already used,
        /// or the split produced exactly one partition, the partition list is empty and the
        /// quality is <see cref="double.NegativeInfinity"/>.
        /// </returns>
        protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
            IDataFrame dataToSplit,
            string dependentFeatureName,
            string splittingFeatureName,
            double bestSplitQualitySoFar,
            double initialEntropy,
            ISplitQualityChecker splitQualityChecker,
            IAlredyUsedAttributesInfo alreadyUsedAttributesInfo)
        {
            if (!alreadyUsedAttributesInfo.WasAttributeAlreadyUsed(splittingFeatureName))
            {
                var rowTotal = dataToSplit.RowCount;
                var parameters = new SplittingParams(splittingFeatureName, dependentFeatureName);
                var partitions = CategoricalDataSplitter.SplitData(dataToSplit, parameters);

                if (partitions.Count != 1)
                {
                    var quality = splitQualityChecker.CalculateSplitQuality(
                        initialEntropy,
                        rowTotal,
                        partitions,
                        dependentFeatureName);

                    return new Tuple<IList<ISplittedData>, ISplittingParams, double>(
                        partitions,
                        parameters,
                        quality);
                }

                // Exactly one partition: the data was not separated, so report the worst quality.
                // NOTE(review): an empty result (Count == 0) takes the branch above - confirm the
                // splitter never produces one.
                return new Tuple<IList<ISplittedData>, ISplittingParams, double>(
                    new List<ISplittedData>(),
                    parameters,
                    double.NegativeInfinity);
            }

            // The feature was already used: report an empty split with the worst quality.
            return new Tuple<IList<ISplittedData>, ISplittingParams, double>(
                new List<ISplittedData>(),
                new SplittingParams(splittingFeatureName, dependentFeatureName),
                double.NegativeInfinity);
        }
        /// <summary>
        /// Checks that splitting the categorical weather data on "Outlook" (dependent feature
        /// "Play") yields one group per expected outlook value, each with the expected number
        /// of rows both in its data frame and in its split link.
        /// </summary>
        public void PerformMultiValueDiscreteDataSplit()
        {
            // Given
            var weatherData = TestDataBuilder.ReadWeatherDataWithCategoricalAttributes();
            var parameters = new SplittingParams("Outlook", "Play");
            var rowsPerOutlook = new Dictionary<object, int>
            {
                { "Sunny", 5 },
                { "Overcast", 4 },
                { "Rainy", 5 }
            };

            // When
            var groups = MultiValueDiscreteDataSplitter.SplitData(weatherData, parameters);

            // Then
            Assert.AreEqual(rowsPerOutlook.Count, groups.Count);
            foreach (var group in groups)
            {
                // The link's test result is the key identifying which outlook value this group holds.
                var link = group.SplitLink;
                var expectedRows = rowsPerOutlook[link.TestResult];

                Assert.AreEqual(expectedRows, group.SplittedDataFrame.RowCount);
                Assert.AreEqual(expectedRows, link.InstancesCount);
            }
        }