/// <summary>
/// Builds a Gaussian process leaf model by running the sub-algorithm several times
/// and keeping the solution with the best (lowest) negative log pseudo-likelihood.
/// </summary>
/// <param name="pd">Problem data describing the training instances for this leaf.</param>
/// <param name="random">Source of seeds for the sub-algorithm runs.</param>
/// <param name="cancellationToken">Token used to cancel the sub-algorithm.</param>
/// <param name="numberOfParameters">Out: number of model parameters (rows + 1 + mean/covariance parameters).</param>
/// <exception cref="ArgumentException">Thrown when too few rows are available or no model could be created.</exception>
public override IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int numberOfParameters) {
  if (pd.Dataset.Rows < MinLeafSize(pd)) {
    throw new ArgumentException("The number of training instances is too small to create a Gaussian process model");
  }
  Regression.Problem = new RegressionProblem { ProblemData = pd };

  var bestScore = double.MaxValue;
  GaussianProcessRegressionSolution best = null;
  for (var i = 0; i < Tries; i++) {
    var res = RegressionTreeUtilities.RunSubAlgorithm(Regression, random.Next(), cancellationToken);
    var candidate = res.Select(x => x.Value).OfType<GaussianProcessRegressionSolution>().FirstOrDefault();
    var score = ((DoubleValue)res["Negative log pseudo-likelihood (LOO-CV)"].Value).Value;
    // accept only candidates that improve the LOO-CV score and have a finite training fit
    if (candidate == null || score >= bestScore || double.IsNaN(candidate.TrainingRSquared)) continue;
    bestScore = score;
    best = candidate;
  }
  Regression.Runs.Clear();
  if (best == null) {
    throw new ArgumentException("Could not create Gaussian process model");
  }

  // Hoisted: the original enumerated AllowedInputVariables twice via Count().
  var inputCount = pd.AllowedInputVariables.Count();
  numberOfParameters = pd.Dataset.Rows + 1
                       + Regression.CovarianceFunction.GetNumberOfParameters(inputCount)
                       + Regression.MeanFunction.GetNumberOfParameters(inputCount);
  return best.Model;
}
/// <summary>
/// Resets the internal node/rows queues and refills them with the tree's leaf nodes
/// (breadth-first) together with the training rows that reach each leaf.
/// </summary>
public void FillLeafs(RegressionNodeTreeModel tree, IReadOnlyList<int> trainingRows, IDataset data) {
  nodeQueue.Clear();
  trainingRowsQueue.Clear();

  // breadth-first traversal carrying each node together with the rows routed to it
  var pending = new Queue<Tuple<RegressionNodeModel, IReadOnlyList<int>>>();
  pending.Enqueue(Tuple.Create(tree.Root, trainingRows));

  while (pending.Count > 0) {
    var current = pending.Dequeue();
    var node = current.Item1;
    var rows = current.Item2;

    if (node.IsLeaf) {
      nodeQueue.Enqueue(node);
      trainingRowsQueue.Enqueue(rows);
      continue;
    }

    IReadOnlyList<int> leftRows, rightRows;
    RegressionTreeUtilities.SplitRows(rows, data, node.SplitAttribute, node.SplitValue, out leftRows, out rightRows);
    pending.Enqueue(Tuple.Create(node.Left, leftRows));
    pending.Enqueue(Tuple.Create(node.Right, rightRows));
  }
}
/// <summary>
/// Recursively mirrors a regression tree into a symbolic-expression display tree.
/// Leaves get a fresh id (via <paramref name="nextId"/>) and are recorded, together with
/// their training/test rows, in the supplied dictionaries; inner nodes are labeled with
/// their split condition (and pruning strength, when available).
/// </summary>
private static SymbolicExpressionTreeNode MirrorTree(RegressionNodeModel regressionNode, IDictionary<int, RegressionNodeModel> dict, IDictionary<int, IReadOnlyList<int>> trainingLeafRows, IDictionary<int, IReadOnlyList<int>> testLeafRows, IntValue nextId, IDataset data, IReadOnlyList<int> trainingRows, IReadOnlyList<int> testRows) {
  if (regressionNode.IsLeaf) {
    var id = nextId.Value++;
    dict.Add(id, regressionNode);
    trainingLeafRows.Add(id, trainingRows);
    testLeafRows.Add(id, testRows);
    // label shows the model id and the training/test row counts reaching this leaf
    var label = "Model " + id + "\n(" + trainingRows.Count + "/" + testRows.Count + ")";
    return new SymbolicExpressionTreeNode(new TextSymbol(label));
  }

  var caption = regressionNode.SplitAttribute + " <= " + regressionNode.SplitValue.ToString("0.###");
  if (!double.IsNaN(regressionNode.PruningStrength)) {
    caption += "\npf = " + regressionNode.PruningStrength.ToString("0.###");
  }
  var inner = new SymbolicExpressionTreeNode(new TextSymbol(caption));

  IReadOnlyList<int> leftTrainingRows, rightTrainingRows;
  IReadOnlyList<int> leftTestRows, rightTestRows;
  RegressionTreeUtilities.SplitRows(trainingRows, data, regressionNode.SplitAttribute, regressionNode.SplitValue, out leftTrainingRows, out rightTrainingRows);
  RegressionTreeUtilities.SplitRows(testRows, data, regressionNode.SplitAttribute, regressionNode.SplitValue, out leftTestRows, out rightTestRows);

  inner.AddSubtree(MirrorTree(regressionNode.Left, dict, trainingLeafRows, testLeafRows, nextId, data, leftTrainingRows, leftTestRows));
  inner.AddSubtree(MirrorTree(regressionNode.Right, dict, trainingLeafRows, testLeafRows, nextId, data, rightTrainingRows, rightTestRows));
  return inner;
}
/// <summary>
/// Creates a reduced problem-data instance containing only the given rows, with the
/// training rows placed first and the test rows directly after, and the partitions
/// set accordingly.
/// </summary>
private static IRegressionProblemData Subselect(IRegressionProblemData data, IReadOnlyList<int> training, IReadOnlyList<int> test) {
  // training rows first, then test rows — partition offsets below rely on this order
  var selectedRows = training.Concat(test).ToList();
  var reducedDataset = RegressionTreeUtilities.ReduceDataset(data.Dataset, selectedRows, data.AllowedInputVariables.ToList(), data.TargetVariable);

  var problemData = new RegressionProblemData(reducedDataset, data.AllowedInputVariables, data.TargetVariable);
  problemData.TrainingPartition.Start = 0;
  problemData.TrainingPartition.End = training.Count;
  problemData.TestPartition.Start = training.Count;
  problemData.TestPartition.End = training.Count + test.Count;
  return problemData;
}
/// <summary>
/// Builds a linear leaf model by running the sub-algorithm once and returning the
/// model of the first regression solution it produces.
/// </summary>
/// <exception cref="ArgumentException">Thrown when too few rows are available or the algorithm yields no solution.</exception>
public override IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int noParameters) {
  if (pd.Dataset.Rows < MinLeafSize(pd)) {
    throw new ArgumentException("The number of training instances is too small to create a linear model");
  }

  // one parameter per training row plus an intercept
  noParameters = pd.Dataset.Rows + 1;
  Regression.Problem = new RegressionProblem { ProblemData = pd };

  var results = RegressionTreeUtilities.RunSubAlgorithm(Regression, random.Next(), cancellationToken);
  var solution = results.Select(r => r.Value).OfType<IRegressionSolution>().FirstOrDefault();
  if (solution == null) {
    throw new ArgumentException("No RegressionSolution was provided by the algorithm");
  }
  return solution.Model;
}
/// <summary>
/// Builds a leaf model on the given subset of rows. The reduced dataset is treated
/// entirely as training data (empty test partition). Optionally dampens the model.
/// </summary>
/// <param name="rows">Row indices (into <paramref name="parameters"/>.Data) to train on.</param>
/// <param name="parameters">Shared tree-building parameters (data, variables, RNG, …).</param>
/// <param name="cancellation">Cancellation token, checked after the model is built.</param>
/// <param name="numberOfParameters">Out: parameter count reported by <see cref="Build"/>.</param>
public IRegressionModel BuildModel(IReadOnlyList<int> rows, RegressionTreeParameters parameters, CancellationToken cancellation, out int numberOfParameters) {
  // Hoisted: the original materialized AllowedInputVariables.ToArray() twice.
  var inputVariables = parameters.AllowedInputVariables.ToArray();
  var reducedData = RegressionTreeUtilities.ReduceDataset(parameters.Data, rows, inputVariables, parameters.TargetVariable);
  var pd = new RegressionProblemData(reducedData, inputVariables, parameters.TargetVariable);
  // all rows are training data; test partition is empty (start == end == row count)
  pd.TrainingPartition.Start = 0;
  pd.TrainingPartition.End = pd.TestPartition.Start = pd.TestPartition.End = reducedData.Rows;

  int numP;
  var model = Build(pd, parameters.Random, cancellation, out numP);
  if (UseDampening && Dampening > 0.0) {
    model = DampenedModel.DampenModel(model, pd, Dampening);
  }
  numberOfParameters = numP;
  cancellation.ThrowIfCancellationRequested();
  return model;
}
/// <summary>
/// Grows the tree by repeatedly splitting queued nodes until no node can be split
/// further. Resumable: the pending nodes/rows live in the SplittingState stored in
/// <paramref name="stateScope"/>; on first entry (Code &lt;= 0) the root is enqueued.
/// </summary>
public void Split(RegressionNodeTreeModel tree, IReadOnlyList<int> trainingRows, IScope stateScope, CancellationToken cancellationToken) {
  var regressionTreeParams = (RegressionTreeParameters)stateScope.Variables[DecisionTreeRegression.RegressionTreeParameterVariableName].Value;
  var splittingState = (SplittingState)stateScope.Variables[SplittingStateVariableName].Value;
  var attributes = regressionTreeParams.AllowedInputVariables.ToArray();
  var targetVariable = regressionTreeParams.TargetVariable;

  // seed the work queues on the first invocation only
  if (splittingState.Code <= 0) {
    splittingState.nodeQueue.Enqueue(tree.Root);
    splittingState.trainingRowsQueue.Enqueue(trainingRows);
    splittingState.Code = 1;
  }

  while (splittingState.nodeQueue.Count > 0) {
    var node = splittingState.nodeQueue.Dequeue();
    var rows = splittingState.trainingRowsQueue.Dequeue();

    var reducedData = RegressionTreeUtilities.ReduceDataset(regressionTreeParams.Data, rows, attributes, targetVariable);
    string splitAttribute;
    double splitValue;
    // node stays a leaf when no admissible split exists
    if (!DecideSplit(new RegressionProblemData(reducedData, attributes, targetVariable), regressionTreeParams.MinLeafSize, out splitAttribute, out splitValue)) {
      continue;
    }

    IReadOnlyList<int> leftRows, rightRows;
    RegressionTreeUtilities.SplitRows(rows, regressionTreeParams.Data, splitAttribute, splitValue, out leftRows, out rightRows);
    node.Split(regressionTreeParams, splitAttribute, splitValue, rows.Count);
    splittingState.nodeQueue.Enqueue(node.Left);
    splittingState.nodeQueue.Enqueue(node.Right);
    splittingState.trainingRowsQueue.Enqueue(leftRows);
    splittingState.trainingRowsQueue.Enqueue(rightRows);

    cancellationToken.ThrowIfCancellationRequested();
  }
}