/// <summary>
/// Partitions a data frame into subsets, one for every query built from the
/// feature/value pair carried by the binary splitting parameters.
/// </summary>
/// <param name="dataToSplit">Data frame whose rows are distributed among the subsets.</param>
/// <param name="splttingParams">Provides the feature and value passed to <c>BuildQueries</c>.</param>
/// <returns>
/// One entry per query, in query enumeration order, each pairing the link produced by
/// <c>GetSubsetLink</c> with the subset data frame selected by that query.
/// </returns>
public IList<ISplittedData> SplitData(IDataFrame dataToSplit, IBinarySplittingParams splttingParams)
{
    var subsetQueries = BuildQueries(splttingParams.SplitOnFeature, splttingParams.SplitOnValue);
    var partitions = new List<ISplittedData>();

    // The total row count is converted to double once, before any subset is extracted,
    // and handed unchanged to GetSubsetLink for every subset.
    var overallRowCount = (double)dataToSplit.RowCount;

    foreach (var labelledQuery in subsetQueries)
    {
        // Value holds the query used to select rows; Key is forwarded to GetSubsetLink.
        var subset = dataToSplit.GetSubsetByQuery(labelledQuery.Value);
        var subsetLink = GetSubsetLink(subset, overallRowCount, labelledQuery.Key);

        partitions.Add(new SplittedData(subsetLink, subset));
    }

    return partitions;
}
/// <summary>
/// Runs every query derived from the splitting parameters against the given data frame
/// and collects the resulting subsets together with their links.
/// </summary>
/// <param name="dataToSplit">Source data frame to be divided.</param>
/// <param name="splttingParams">Supplies <c>SplitOnFeature</c> and <c>SplitOnValue</c> for query construction.</param>
/// <returns>A list with one <see cref="ISplittedData"/> per query, in the order the queries are enumerated.</returns>
public IList<ISplittedData> SplitData(IDataFrame dataToSplit, IBinarySplittingParams splttingParams)
{
    var queriesByLabel = BuildQueries(
        splttingParams.SplitOnFeature,
        splttingParams.SplitOnValue);

    var collected = new List<ISplittedData>();

    // Row count of the unsplit frame, as a double; passed to GetSubsetLink for each subset.
    double parentRowCount = (double)dataToSplit.RowCount;

    foreach (var entry in queriesByLabel)
    {
        var matchingRows = dataToSplit.GetSubsetByQuery(entry.Value);

        // The link is computed before the SplittedData wrapper is constructed.
        collected.Add(
            new SplittedData(
                GetSubsetLink(matchingRows, parentRowCount, entry.Key),
                matchingRows));
    }

    return collected;
}
/// <summary>
/// Tries a binary split on each distinct value of <paramref name="splittingFeatureName"/>
/// that has not already been used, and returns the split with the highest quality.
/// </summary>
/// <param name="dataToSplit">Data frame to split.</param>
/// <param name="dependentFeatureName">Name of the dependent feature, forwarded to the splitting parameters and the quality checker.</param>
/// <param name="splittingFeatureName">Name of the feature whose distinct values are tried as split points.</param>
/// <param name="bestSplitQualitySoFar">Not read by this implementation.</param>
/// <param name="initialEntropy">Forwarded to the quality checker.</param>
/// <param name="splitQualityChecker">Computes the quality of each candidate split.</param>
/// <param name="alredyUsedAttributesInfo">Used to skip feature/value pairs that were already used.</param>
/// <returns>
/// A tuple of (split data, splitting parameters, split quality) for the best candidate.
/// If any candidate split yields exactly one subset, the method returns immediately with an
/// empty list, that candidate's parameters and <see cref="double.NegativeInfinity"/>.
/// If no candidate is evaluated, or none has quality greater than negative infinity,
/// the first two items are <c>null</c> and the quality is <see cref="double.NegativeInfinity"/>.
/// </returns>
protected override Tuple<IList<ISplittedData>, ISplittingParams, double> EvaluateCategoricalSplit(
    IDataFrame dataToSplit,
    string dependentFeatureName,
    string splittingFeatureName,
    double bestSplitQualitySoFar,
    double initialEntropy,
    ISplitQualityChecker splitQualityChecker,
    IAlredyUsedAttributesInfo alredyUsedAttributesInfo)
{
    var rowCount = dataToSplit.RowCount;
    var candidateValues = dataToSplit.GetColumnVector(splittingFeatureName).Distinct();

    var winningQuality = double.NegativeInfinity;
    IBinarySplittingParams winningParams = null;
    IList<ISplittedData> winningPartitions = null;

    foreach (var candidate in candidateValues)
    {
        // Skip values that were already used for this feature.
        if (alredyUsedAttributesInfo.WasAttributeAlreadyUsedWithValue(splittingFeatureName, candidate))
        {
            continue;
        }

        var candidateParams = new BinarySplittingParams(splittingFeatureName, candidate, dependentFeatureName);
        var partitions = CategoricalDataSplitter.SplitData(dataToSplit, candidateParams);

        // A split that produced a single subset ends the whole evaluation right away,
        // without looking at the remaining candidate values.
        if (partitions.Count == 1)
        {
            return new Tuple<IList<ISplittedData>, ISplittingParams, double>(
                new List<ISplittedData>(),
                candidateParams,
                double.NegativeInfinity);
        }

        var quality = splitQualityChecker.CalculateSplitQuality(
            initialEntropy,
            rowCount,
            partitions,
            dependentFeatureName);

        // Strict comparison: on a tie the earlier candidate is kept.
        if (quality > winningQuality)
        {
            winningQuality = quality;
            winningPartitions = partitions;
            winningParams = candidateParams;
        }
    }

    return new Tuple<IList<ISplittedData>, ISplittingParams, double>(
        winningPartitions,
        winningParams,
        winningQuality);
}