Exemplo n.º 1
0
        public void Build(IReadOnlyList <int> trainingRows, IReadOnlyList <int> pruningRows, IScope statescope, ResultCollection results, CancellationToken cancellationToken)
        {
            var regressionTreeParams = (RegressionTreeParameters)statescope.Variables[DecisionTreeRegression.RegressionTreeParameterVariableName].Value;

            variables = regressionTreeParams.AllowedInputVariables.ToList();

            //build tree and select node with maximum coverage
            var tree = RegressionNodeTreeModel.CreateTreeModel(regressionTreeParams.TargetVariable, regressionTreeParams);

            tree.BuildModel(trainingRows, pruningRows, statescope, results, cancellationToken);
            var nodeModel = tree.Root.EnumerateNodes().Where(x => x.IsLeaf).MaxItems(x => x.NumSamples).First();

            var satts = new List <string>();
            var svals = new List <double>();
            var reops = new List <Comparison>();

            //extract splits
            for (var temp = nodeModel; temp.Parent != null; temp = temp.Parent)
            {
                satts.Add(temp.Parent.SplitAttribute);
                svals.Add(temp.Parent.SplitValue);
                reops.Add(temp.Parent.Left == temp ? Comparison.LessEqual : Comparison.Greater);
            }
            Comparisons     = reops.ToArray();
            SplitAttributes = satts.ToArray();
            SplitValues     = svals.ToArray();
            int np;

            RuleModel = regressionTreeParams.LeafModel.BuildModel(trainingRows.Union(pruningRows).Where(r => Covers(regressionTreeParams.Data, r)).ToArray(), regressionTreeParams, cancellationToken, out np);
        }
Exemplo n.º 2
0
        private static IScope InitializeScope(IRandom random, IRegressionProblemData problemData, IPruning pruning, int minLeafSize, ILeafModel leafModel, ISplitter splitter, bool generateRules, bool useHoldout, double holdoutSize)
        {
            var stateScope = new Scope("RegressionTreeStateScope");

            //reduce RegressionProblemData to AllowedInput & Target column wise and to TrainingSet row wise
            var doubleVars = new HashSet <string>(problemData.Dataset.DoubleVariables);
            var vars       = problemData.AllowedInputVariables.Concat(new[] { problemData.TargetVariable }).ToArray();

            if (vars.Any(v => !doubleVars.Contains(v)))
            {
                throw new NotSupportedException("Decision tree regression supports only double valued input or output features.");
            }
            var doubles = vars.Select(v => problemData.Dataset.GetDoubleValues(v, problemData.TrainingIndices).ToArray()).ToArray();

            if (doubles.Any(v => v.Any(x => double.IsNaN(x) || double.IsInfinity(x))))
            {
                throw new NotSupportedException("Decision tree regression does not support NaN or infinity values in the input dataset.");
            }
            var trainingData = new Dataset(vars, doubles);
            var pd           = new RegressionProblemData(trainingData, problemData.AllowedInputVariables, problemData.TargetVariable);

            pd.TrainingPartition.End   = pd.TestPartition.Start = pd.TestPartition.End = pd.Dataset.Rows;
            pd.TrainingPartition.Start = 0;

            //store regression tree parameters
            var regressionTreeParams = new RegressionTreeParameters(pruning, minLeafSize, leafModel, pd, random, splitter);

            stateScope.Variables.Add(new Variable(RegressionTreeParameterVariableName, regressionTreeParams));

            //initialize tree operators
            pruning.Initialize(stateScope);
            splitter.Initialize(stateScope);
            leafModel.Initialize(stateScope);

            //store unbuilt model
            IItem model;

            if (generateRules)
            {
                model = RegressionRuleSetModel.CreateRuleModel(problemData.TargetVariable, regressionTreeParams);
                RegressionRuleSetModel.Initialize(stateScope);
            }
            else
            {
                model = RegressionNodeTreeModel.CreateTreeModel(problemData.TargetVariable, regressionTreeParams);
            }
            stateScope.Variables.Add(new Variable(ModelVariableName, model));

            //store training & pruning indices
            IReadOnlyList <int> trainingSet, pruningSet;

            GeneratePruningSet(pd.TrainingIndices.ToArray(), random, useHoldout, holdoutSize, out trainingSet, out pruningSet);
            stateScope.Variables.Add(new Variable(TrainingSetVariableName, new IntArray(trainingSet.ToArray())));
            stateScope.Variables.Add(new Variable(PruningSetVariableName, new IntArray(pruningSet.ToArray())));

            return(stateScope);
        }