/// <summary>
/// Creates a symbolic regression solution for a fixed model structure by optimizing
/// its numeric constants on the training partition.
/// </summary>
/// <param name="problemData">The regression problem (dataset, target, partitions).</param>
/// <param name="modelStructure">The model structure in infix notation; its constants are tuned.</param>
/// <param name="maxIterations">Maximum number of constant-optimization iterations.</param>
/// <returns>A scaled symbolic regression solution over a clone of the problem data.</returns>
/// <exception cref="ArgumentException">Thrown when the parsed structure contains symbols the constant optimizer cannot handle.</exception>
public static ISymbolicRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData, string modelStructure, int maxIterations) {
  var parser = new InfixExpressionParser();
  var tree = parser.Parse(modelStructure);
  // (removed an unused SymbolicDataAnalysisExpressionTreeSimplifier local that was created but never used)

  if (!SymbolicRegressionConstantOptimizationEvaluator.CanOptimizeConstants(tree)) {
    throw new ArgumentException("The optimizer does not support the specified model structure.");
  }

  var interpreter = new SymbolicDataAnalysisExpressionTreeLinearInterpreter();
  // Tune the constants in-place on the training rows; variable weights stay fixed
  // and linear scaling is applied afterwards via Scale() instead.
  SymbolicRegressionConstantOptimizationEvaluator.OptimizeConstants(interpreter, tree, problemData, problemData.TrainingIndices,
    applyLinearScaling: false, maxIterations: maxIterations,
    updateVariableWeights: false, updateConstantsInTree: true);

  var scaledModel = new SymbolicRegressionModel(problemData.TargetVariable, tree, (ISymbolicDataAnalysisExpressionTreeInterpreter)interpreter.Clone());
  scaledModel.Scale(problemData);
  SymbolicRegressionSolution solution = new SymbolicRegressionSolution(scaledModel, (IRegressionProblemData)problemData.Clone());
  solution.Model.Name = "Regression Model";
  solution.Name = "Regression Solution";
  return solution;
}
// Initializes the mutable state for one gradient-boosted-trees run: caches the
// training/test rows and target values, and seeds the ensemble with the best
// constant model found by a line search of the loss function.
public GbmState(IRegressionProblemData problemData, ILossFunction lossFunction, uint randSeed, int maxSize, double r, double m, double nu) {
  // boosting configuration (maxSize limits tree size; nu, r, m are the
  // boosting rate/fraction parameters used by the tree builder)
  this.maxSize = maxSize; this.nu = nu; this.r = r; this.m = m;
  this.randSeed = randSeed;
  random = new MersenneTwister(randSeed);
  this.problemData = problemData;
  this.trainingRows = problemData.TrainingIndices.ToArray();
  this.testRows = problemData.TestIndices.ToArray();
  this.lossFunction = lossFunction;

  int nRows = trainingRows.Length;
  // cache target values for the training rows
  y = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, trainingRows).ToArray();

  treeBuilder = new RegressionTreeBuilder(problemData, random);

  activeIdx = Enumerable.Range(0, nRows).ToArray();

  // determine the best constant start value via a line search from 0
  var zeros = Enumerable.Repeat(0.0, nRows).ToArray();
  double f0 = lossFunction.LineSearch(y, zeros, activeIdx, 0, nRows - 1); // initial constant value (mean for squared errors)
  pred = Enumerable.Repeat(f0, nRows).ToArray();
  predTest = Enumerable.Repeat(f0, testRows.Length).ToArray();
  pseudoRes = new double[nRows];

  models = new List<IRegressionModel>();
  weights = new List<double>();
  // add constant model as the first ensemble member (weight 1.0)
  models.Add(new ConstantModel(f0, problemData.TargetVariable));
  weights.Add(1.0);
}
// Convenience constructor: each model is assigned its own clone of the
// problem's full training and test partitions.
// NOTE(review): 'models' is enumerated three times here (once per argument);
// callers passing a lazy sequence should materialize it first — confirm all
// call sites pass collections.
public RegressionEnsembleSolution(IEnumerable <IRegressionModel> models, IRegressionProblemData problemData)
  : this(models, problemData,
         models.Select(m => (IntRange)problemData.TrainingPartition.Clone()),
         models.Select(m => (IntRange)problemData.TestPartition.Clone())
    ) { }
/// <summary>
/// Wraps an inner regression model and records the training target range
/// [Min, Max] together with the dampening factor.
/// </summary>
protected DampenedModel(IRegressionModel model, IRegressionProblemData pd, double dampening)
  : base(model.TargetVariable) {
  var trainingTargets = pd.TargetVariableTrainingValues;
  Model = model;
  Dampening = dampening;
  Min = trainingTargets.Min();
  Max = trainingTargets.Max();
}
/// <summary>
/// Calculates the impact of every input variable on model quality by replacing
/// the variable's values (according to <paramref name="replacementMethod"/> /
/// <paramref name="factorReplacementMethod"/>) and measuring the quality change.
/// </summary>
public static IEnumerable <Tuple <string, double> > CalculateImpacts(
  IRegressionModel model,
  IRegressionProblemData problemData,
  IEnumerable <double> estimatedValues,
  IEnumerable <int> rows,
  ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle,
  FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best) {
  // guard against a dataset that lacks variables the model needs
  // (e.g. when a different dataset was loaded after the model was created)
  var missingVariables = model.VariablesUsedForPrediction.Except(problemData.Dataset.VariableNames);
  if (missingVariables.Any()) {
    throw new InvalidOperationException(string.Format("Can not calculate variable impacts, because the model uses inputs missing in the dataset ({0})", string.Join(", ", missingVariables)));
  }
  IEnumerable <double> targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
  var originalQuality = CalculateQuality(targetValues, estimatedValues);

  var impacts = new Dictionary <string, double>();
  // consider the allowed inputs plus any further variables the model actually uses
  var inputvariables = new HashSet <string>(problemData.AllowedInputVariables.Union(model.VariablesUsedForPrediction));
  // modifiable clone so variable columns can be swapped out without touching the original dataset
  var modifiableDataset = ((Dataset)(problemData.Dataset).Clone()).ToModifiable();

  foreach (var inputVariable in inputvariables) {
    impacts[inputVariable] = CalculateImpact(inputVariable, model, problemData, modifiableDataset, rows, replacementMethod, factorReplacementMethod, targetValues, originalQuality);
  }

  return(impacts.Select(i => Tuple.Create(i.Key, i.Value)));
}
/// <summary>
/// Builds a decision-tree (or rule-set) regression solution, supplying default
/// implementations for any component that was not specified.
/// </summary>
public static IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData, IRandom random, ILeafModel leafModel = null, ISplitter splitter = null, IPruning pruning = null, bool useHoldout = false, double holdoutSize = 0.2, int minimumLeafSize = 1, bool generateRules = false, ResultCollection results = null, CancellationToken? cancellationToken = null) {
  // fall back to the default component implementations where nothing was supplied
  leafModel = leafModel ?? new LinearLeaf();
  splitter = splitter ?? new Splitter();
  pruning = pruning ?? new ComplexityPruning();
  var token = cancellationToken ?? CancellationToken.None;

  var stateScope = InitializeScope(random, problemData, pruning, minimumLeafSize, leafModel, splitter, generateRules, useHoldout, holdoutSize);
  var model = Build(stateScope, results, token);
  return model.CreateRegressionSolution(problemData);
}
// Creates the pool of univariate basis functions: every allowed input
// variable, optionally raised to the configured exponents, and optionally
// wrapped in each checked nonlinear function. Columns rejected by ok(...)
// are skipped.
private IEnumerable <BasisFunction> CreateUnivariateBases(IRegressionProblemData problemData) {
  var B1 = new List <BasisFunction>();
  var inputVariables = problemData.AllowedInputVariables;
  // exponent 1 (identity) is always valid; more exponents only when enabled
  var validExponents = ConsiderExponentiations ? exponents : new double[] { 1 };
  var validFuncs = ConsiderNonlinearFuncs ? NonlinearFuncs.CheckedItems.Select(val => val.Value) : new List <OpCode>();
  // TODO: add Hinge functions

  foreach (var variableName in inputVariables) {
    foreach (var exp in validExponents) {
      var data = problemData.Dataset.GetDoubleValues(variableName).Select(x => Math.Pow(x, exp)).ToArray();
      if (!ok(data)) { continue; } // skip columns rejected by ok(...)
      var name = expToString(exp, variableName);
      B1.Add(new BasisFunction(name, data, false));
      // additionally wrap each (variable^exp) column in every enabled nonlinear function
      foreach (OpCode _op in validFuncs) {
        var inner_data = data.Select(x => Utils.eval(_op, x)).ToArray();
        if (!ok(inner_data)) { continue; }
        // the name is for later parsing the Basis Functions to an ISymbolicExpressionTree
        var inner_name = OpCodeToString.GetByFirst(_op) + "(" + name + ")";
        B1.Add(new BasisFunction(inner_name, inner_data, true));
      }
    }
  }
  return(B1);
}
// Sets up a tiny 4-row problem (inputs x1, x2; target y) and the variable
// ranges used by the tests.
public void InitTest() {
  double[,] values = {
    { 3, 6, 2 },
    { 5, 2, 1 },
    { 8, 5, 0 },
    { 3, 4, 2 }
  };
  var ds = new Dataset(new string[] { "x1", "x2", "y" }, values);
  problemData = (IRegressionProblemData) new RegressionProblemData(ds, new string[] { "x1", "x2" }, "y");
  variableRanges = new Dictionary <string, Interval> {
    { "x1", new Interval(1, 10) },
    { "x2", new Interval(4, 6) }
  };
}
/// <summary>
/// Grid search with crossvalidation
/// </summary>
/// <param name="problemData">The regression problem data</param>
/// <param name="numberOfFolds">The number of folds for crossvalidation</param>
/// <param name="shuffleFolds">Specifies whether the folds should be shuffled</param>
/// <param name="parameterRanges">The ranges for each parameter in the grid search</param>
/// <param name="seed">The random seed (required by the random forest model)</param>
/// <param name="maxDegreeOfParallelism">The maximum allowed number of threads (to parallelize the grid search)</param>
/// <returns>The best parameter values found by the grid search</returns>
public static RFParameter GridSearch(IRegressionProblemData problemData, int numberOfFolds, bool shuffleFolds, Dictionary <string, IEnumerable <double> > parameterRanges, int seed = 12345, int maxDegreeOfParallelism = 1) {
  DoubleValue mse = new DoubleValue(Double.MaxValue);
  RFParameter bestParameter = new RFParameter();

  // compiled setters (one per parameter name) and fixed CV partitions shared by all evaluations
  var setters = parameterRanges.Keys.Select(GenerateSetter).ToList();
  // NOTE(review): shuffleFolds is not forwarded to GenerateRandomForestPartitions —
  // confirm whether fold shuffling should be honored here
  var partitions = GenerateRandomForestPartitions(problemData, numberOfFolds);
  var crossProduct = parameterRanges.Values.CartesianProduct();

  var locker = new object();
  Parallel.ForEach(crossProduct, new ParallelOptions { MaxDegreeOfParallelism = maxDegreeOfParallelism }, parameterCombination => {
    var parameterValues = parameterCombination.ToList();
    double testMSE;
    var parameters = new RFParameter();
    // apply the current combination to a fresh parameter object
    for (int i = 0; i < setters.Count; ++i) {
      setters[i](parameters, parameterValues[i]);
    }
    CrossValidate(problemData, partitions, parameters.N, parameters.R, parameters.M, seed, out testMSE);

    // keep the combination with the smallest crossvalidated MSE
    lock (locker) {
      if (testMSE < mse.Value) {
        mse.Value = testMSE;
        bestParameter = (RFParameter)parameters.Clone();
      }
    }
  });
  return(bestParameter);
}
/// <summary>
/// Grid search without crossvalidation (since for random forests the out-of-bag estimate is unbiased)
/// </summary>
/// <param name="problemData">The regression problem data</param>
/// <param name="parameterRanges">The ranges for each parameter in the grid search</param>
/// <param name="seed">The random seed (required by the random forest model)</param>
/// <param name="maxDegreeOfParallelism">The maximum allowed number of threads (to parallelize the grid search)</param>
/// <returns>The parameter combination with the smallest out-of-bag RMS error</returns>
public static RFParameter GridSearch(IRegressionProblemData problemData, Dictionary <string, IEnumerable <double> > parameterRanges, int seed = 12345, int maxDegreeOfParallelism = 1) {
  var setters = parameterRanges.Keys.Select(GenerateSetter).ToList();
  var crossProduct = parameterRanges.Values.CartesianProduct();

  double bestOutOfBagRmsError = double.MaxValue;
  RFParameter bestParameters = new RFParameter();

  var locker = new object();
  Parallel.ForEach(crossProduct, new ParallelOptions { MaxDegreeOfParallelism = maxDegreeOfParallelism }, parameterCombination => {
    var parameterValues = parameterCombination.ToList();
    var parameters = new RFParameter();
    // apply the current combination to a fresh parameter object
    for (int i = 0; i < setters.Count; ++i) { setters[i](parameters, parameterValues[i]); }
    double rmsError, outOfBagRmsError, avgRelError, outOfBagAvgRelError;
    RandomForestModel.CreateRegressionModel(problemData, problemData.TrainingIndices, parameters.N, parameters.R, parameters.M, seed,
      out rmsError, out outOfBagRmsError, out avgRelError, out outOfBagAvgRelError);

    // keep the combination with the smallest out-of-bag RMS error
    lock (locker) {
      if (bestOutOfBagRmsError > outOfBagRmsError) {
        bestOutOfBagRmsError = outOfBagRmsError;
        bestParameters = (RFParameter)parameters.Clone();
      }
    }
  });
  return(bestParameters);
}
/// <summary>
/// Checks whether <paramref name="model"/> can be applied to
/// <paramref name="problemData"/>; all detected problems are collected in
/// <paramref name="errorMessage"/>.
/// </summary>
/// <returns>True when no incompatibility was found.</returns>
public static bool IsProblemDataCompatible(IRegressionModel model, IRegressionProblemData problemData, out string errorMessage) {
  if (model == null) { throw new ArgumentNullException("model", "The provided model is null."); }
  if (problemData == null) { throw new ArgumentNullException("problemData", "The provided problemData is null."); }

  errorMessage = string.Empty;
  // the target variables have to match
  if (model.TargetVariable != problemData.TargetVariable) {
    errorMessage = string.Format("The target variable of the model {0} does not match the target variable of the problemData {1}.", model.TargetVariable, problemData.TargetVariable);
  }
  // the dataset must provide everything the model needs for evaluation
  string datasetErrorMessage;
  if (!model.IsDatasetCompatible(problemData.Dataset, out datasetErrorMessage)) {
    errorMessage += datasetErrorMessage;
  }
  return string.IsNullOrEmpty(errorMessage);
}
// Accepts regression problem data dropped onto the list view (possibly wrapped
// in a value parameter or a whole regression problem) and installs a clone of
// it as the ensemble's problem data.
protected override void itemsListView_DragDrop(object sender, DragEventArgs e) {
  if (e.Effect != DragDropEffects.None) {
    var droppedData = e.Data.GetData(HeuristicLab.Common.Constants.DragDropDataFormat);
    // unwrap: value parameter -> contained value; regression problem -> its problem data
    if (droppedData is IValueParameter) { droppedData = ((IValueParameter)droppedData).Value; }
    if (droppedData is IRegressionProblem) { droppedData = ((IRegressionProblem)droppedData).ProblemData; }

    RegressionEnsembleProblemData ensembleProblemData = droppedData as RegressionEnsembleProblemData;
    IRegressionProblemData problemData = droppedData as IRegressionProblemData;
    // prefer the more specific ensemble problem data when available
    if (ensembleProblemData != null) {
      Content.ProblemData = (RegressionEnsembleProblemData)ensembleProblemData.Clone();
    } else if (problemData != null) {
      Content.ProblemData = new RegressionEnsembleProblemData((IRegressionProblemData)problemData.Clone());
    }
  }
}
// Builds a Gaussian process leaf model by running the embedded GP regression
// algorithm up to 'Tries' times and keeping the solution with the lowest
// negative log pseudo-likelihood (LOO-CV).
public override IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int numberOfParameters) {
  if (pd.Dataset.Rows < MinLeafSize(pd)) { throw new ArgumentException("The number of training instances is too small to create a Gaussian process model"); }
  Regression.Problem = new RegressionProblem { ProblemData = pd };
  var cvscore = double.MaxValue;
  GaussianProcessRegressionSolution sol = null;
  for (var i = 0; i < Tries; i++) {
    var res = RegressionTreeUtilities.RunSubAlgorithm(Regression, random.Next(), cancellationToken);
    var t = res.Select(x => x.Value).OfType <GaussianProcessRegressionSolution>().FirstOrDefault();
    var score = ((DoubleValue)res["Negative log pseudo-likelihood (LOO-CV)"].Value).Value;
    // reject worse runs, failed runs and degenerate fits (NaN training R²)
    if (score >= cvscore || t == null || double.IsNaN(t.TrainingRSquared)) { continue; }
    cvscore = score;
    sol = t;
  }
  Regression.Runs.Clear();
  if (sol == null) { throw new ArgumentException("Could not create Gaussian process model"); }
  // parameter count: one per training row, plus one (presumably the noise term —
  // TODO confirm), plus the covariance and mean function hyperparameters
  numberOfParameters = pd.Dataset.Rows + 1
                       + Regression.CovarianceFunction.GetNumberOfParameters(pd.AllowedInputVariables.Count())
                       + Regression.MeanFunction.GetNumberOfParameters(pd.AllowedInputVariables.Count());
  return(sol.Model);
}
// Fills 'series' with the relative frequency of residuals per histogram bin.
// When the series name identifies the training or test samples, only the
// residuals of that partition are considered; otherwise all residuals are used.
// NOTE(review): assumes residualValues are ordered so the partitions can be
// sliced out via Skip/Take — confirm against the caller.
private void CalculateFrequencies(List <double> residualValues, Series series) {
  double roundedMax, intervalWidth;
  CalculateResidualParameters(residualValues, out roundedMax, out intervalWidth);

  IEnumerable <double> relevantResiduals = residualValues;
  IRegressionProblemData problemdata = Content.ProblemData;
  if (series.Name.Equals(TRAINING_SAMPLES)) {
    relevantResiduals = residualValues.Skip(problemdata.TrainingPartition.Start).Take(problemdata.TrainingPartition.Size);
  } else if (series.Name.Equals(TEST_SAMPLES)) {
    relevantResiduals = residualValues.Skip(problemdata.TestPartition.Start).Take(problemdata.TestPartition.Size);
  }

  double intervalCenter = intervalWidth / 2.0;
  double sampleCount = relevantResiduals.Count();
  double current = -roundedMax;
  DataPointCollection seriesPoints = series.Points;

  // one data point per bin, centered at 'current'; the bin borders are stored
  // as custom point properties ("from"/"to")
  for (int i = 0; i <= bins; i++) {
    IEnumerable <double> help = relevantResiduals.Where(x => x >= (current - intervalCenter) && x < (current + intervalCenter));
    seriesPoints.AddXY(current, help.Count() / sampleCount);
    seriesPoints[seriesPoints.Count - 1]["from"] = (current - intervalCenter).ToString();
    seriesPoints[seriesPoints.Count - 1]["to"] = (current + intervalCenter).ToString();
    current += intervalWidth;
  }
}
/// <summary>
/// Trains a support vector machine on the training partition and returns the
/// resulting model together with the number of support vectors.
/// </summary>
public static void Run(IRegressionProblemData problemData, IEnumerable <string> allowedInputVariables, string svmType, string kernelType, double cost, double nu, double gamma, double epsilon, int degree, out ISupportVectorMachineModel model, out int nSv) {
  var dataset = problemData.Dataset;
  string targetVariable = problemData.TargetVariable;
  var trainingRows = problemData.TrainingIndices;

  // translate the string options into libSVM parameters; the remaining fields
  // keep fixed defaults (cache size, stopping eps, shrinking, coef0, ...)
  var parameter = new svm_parameter {
    svm_type = GetSvmType(svmType),
    kernel_type = GetKernelType(kernelType),
    C = cost,
    nu = nu,
    gamma = gamma,
    p = epsilon,
    cache_size = 500,
    probability = 0,
    eps = 0.001,
    degree = degree,
    shrinking = 1,
    coef0 = 0
  };

  var problem = SupportVectorMachineUtil.CreateSvmProblem(dataset, targetVariable, allowedInputVariables, trainingRows);
  // scale the inputs into a common range before training
  var rangeTransform = RangeTransform.Compute(problem);
  var scaledProblem = rangeTransform.Scale(problem);

  var svmModel = svm.svm_train(scaledProblem, parameter);
  nSv = svmModel.SV.Length;
  model = new SupportVectorMachineModel(svmModel, rangeTransform, targetVariable, allowedInputVariables);
}
/// <summary>
/// Convenience wrapper that trains a random forest regression model with the
/// given parameters and reports training and out-of-bag error measures.
/// </summary>
public static RandomForestModel CreateRandomForestRegressionModel(IRegressionProblemData problemData, int nTrees, double r, double m, int seed, out double rmsError, out double avgRelError, out double outOfBagRmsError, out double outOfBagAvgRelError) {
  var model = RandomForestModel.CreateRegressionModel(problemData, nTrees, r, m, seed,
    rmsError: out rmsError,
    outOfBagRmsError: out outOfBagRmsError,
    avgRelError: out avgRelError,
    outOfBagAvgRelError: out outOfBagAvgRelError);
  return model;
}
// Prunes a symbolic expression tree: iterates over the nodes of a cloned tree
// in prefix order and folds every subtree whose quality impact is at most
// nodeImpactThreshold (or ~0 when pruneOnlyZeroImpactNodes is set) into a
// constant holding its precomputed replacement value.
public static ISymbolicExpressionTree Prune(ISymbolicExpressionTree tree, SymbolicRegressionSolutionImpactValuesCalculator impactValuesCalculator, ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, IRegressionProblemData problemData, DoubleLimit estimationLimits, IEnumerable<int> rows, double nodeImpactThreshold = 0.0, bool pruneOnlyZeroImpactNodes = false) {
  var clonedTree = (ISymbolicExpressionTree)tree.Clone();
  var model = new SymbolicRegressionModel(problemData.TargetVariable, clonedTree, interpreter, estimationLimits.Lower, estimationLimits.Upper);

  var nodes = clonedTree.Root.GetSubtree(0).GetSubtree(0).IterateNodesPrefix().ToList(); // skip the nodes corresponding to the ProgramRootSymbol and the StartSymbol

  double qualityForImpactsCalculation = double.NaN; // pass a NaN value initially so the impact calculator will calculate the quality

  for (int i = 0; i < nodes.Count; ++i) {
    var node = nodes[i];
    if (node is ConstantTreeNode) continue; // constants cannot be folded any further

    double impactValue, replacementValue;
    double newQualityForImpactsCalculation;
    impactValuesCalculator.CalculateImpactAndReplacementValues(model, node, problemData, rows, out impactValue, out replacementValue, out newQualityForImpactsCalculation, qualityForImpactsCalculation);

    // two pruning modes: only exact zero-impact nodes, or anything below the threshold
    if (pruneOnlyZeroImpactNodes && !impactValue.IsAlmost(0.0)) continue;
    if (!pruneOnlyZeroImpactNodes && impactValue > nodeImpactThreshold) continue;

    // replace the whole subtree with a constant holding its replacement value
    var constantNode = (ConstantTreeNode)node.Grammar.GetSymbol("Constant").CreateTreeNode();
    constantNode.Value = replacementValue;
    ReplaceWithConstant(node, constantNode);
    i += node.GetLength() - 1; // skip subtrees under the node that was folded

    qualityForImpactsCalculation = newQualityForImpactsCalculation;
  }
  return model.SymbolicExpressionTree;
}
// Creates an ensemble solution where each model has its own training/test
// partition. All three sequences must have the same length.
public RegressionEnsembleSolution(IEnumerable <IRegressionModel> models, IRegressionProblemData problemData, IEnumerable <IntRange> trainingPartitions, IEnumerable <IntRange> testPartitions)
  : base(new RegressionEnsembleModel(Enumerable.Empty <IRegressionModel>()), new RegressionEnsembleProblemData(problemData)) {
  this.trainingPartitions = new Dictionary <IRegressionModel, IntRange>();
  this.testPartitions = new Dictionary <IRegressionModel, IntRange>();
  this.regressionSolutions = new ItemCollection <IRegressionSolution>();

  List <IRegressionSolution> solutions = new List <IRegressionSolution>();
  var modelEnumerator = models.GetEnumerator();
  var trainingPartitionEnumerator = trainingPartitions.GetEnumerator();
  var testPartitionEnumerator = testPartitions.GetEnumerator();

  // non-short-circuiting '&' on purpose: all three enumerators advance in lock-step
  while (modelEnumerator.MoveNext() & trainingPartitionEnumerator.MoveNext() & testPartitionEnumerator.MoveNext()) {
    // each model gets its own clone of the problem data with its individual partitions
    var p = (IRegressionProblemData)problemData.Clone();
    p.TrainingPartition.Start = trainingPartitionEnumerator.Current.Start;
    p.TrainingPartition.End = trainingPartitionEnumerator.Current.End;
    p.TestPartition.Start = testPartitionEnumerator.Current.Start;
    p.TestPartition.End = testPartitionEnumerator.Current.End;
    solutions.Add(modelEnumerator.Current.CreateRegressionSolution(p));
  }
  // if any sequence still has elements, the argument lengths were inconsistent
  // (non-short-circuiting '|' so every enumerator is checked)
  if (modelEnumerator.MoveNext() | trainingPartitionEnumerator.MoveNext() | testPartitionEnumerator.MoveNext()) {
    throw new ArgumentException();
  }

  trainingEvaluationCache = new Dictionary <int, double>(problemData.TrainingIndices.Count());
  testEvaluationCache = new Dictionary <int, double>(problemData.TestIndices.Count());

  RegisterRegressionSolutionsEventHandler();
  regressionSolutions.AddRange(solutions);
}
// Fits linear models on an increasing number of principal components
// (1 .. min(NumberOfComponents, #inputs)) and returns the model with the
// lowest reported rmse; numberOfParameters is the used component count + 1.
public override IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int numberOfParameters) {
  var pca = PrincipleComponentTransformation.CreateProjection(pd.Dataset, pd.TrainingIndices, pd.AllowedInputVariables, normalize: true);
  var pcdata = pca.TransformProblemData(pd);
  ComponentReducedLinearModel bestModel = null;
  var bestCvrmse = double.MaxValue;
  numberOfParameters = 1;
  for (var i = 1; i <= Math.Min(NumberOfComponents, pd.AllowedInputVariables.Count()); i++) {
    var pd2 = (IRegressionProblemData)pcdata.Clone();
    // enable only the first i principal components as inputs
    var inputs = new HashSet <string>(pca.ComponentNames.Take(i));
    foreach (var v in pd2.InputVariables.CheckedItems.ToArray()) { pd2.InputVariables.SetItemCheckedState(v.Value, inputs.Contains(v.Value.Value)); }
    double rmse;
    var model = PreconstructedLinearModel.CreateLinearModel(pd2, out rmse);
    // '>' (not '>=') means ties keep the model with fewer components
    if (rmse > bestCvrmse) { continue; }
    bestModel = new ComponentReducedLinearModel(pd2.TargetVariable, model, pca);
    numberOfParameters = i + 1;
    bestCvrmse = rmse;
  }
  return(bestModel);
}
// Wraps an already-built gradient boosted trees model in a surrogate that also
// stores all parameters needed to recreate it.
public GradientBoostedTreesModelSurrogate(IRegressionProblemData trainingProblemData, uint seed, ILossFunction lossFunction, int iterations, int maxSize, double r, double m, double nu, IGradientBoostedTreesModel model)
  : this(trainingProblemData, seed, lossFunction, iterations, maxSize, r, m, nu) {
  this.actualModel = model;
}
// Copy constructor: takes over dataset, allowed inputs and target of the given
// problem data and mirrors its training/test partition boundaries.
public RegressionEnsembleProblemData(IRegressionProblemData regressionProblemData)
  : base(regressionProblemData.Dataset, regressionProblemData.AllowedInputVariables, regressionProblemData.TargetVariable) {
  TrainingPartition.Start = regressionProblemData.TrainingPartition.Start;
  TrainingPartition.End = regressionProblemData.TrainingPartition.End;
  TestPartition.Start = regressionProblemData.TestPartition.Start;
  TestPartition.End = regressionProblemData.TestPartition.End;
}
// Fits an ordinary least-squares model over ALL rows of the dataset using
// alglib's lrbuild and converts the result into a PreconstructedLinearModel.
private static PreconstructedLinearModel ClassicCalculation(IRegressionProblemData pd) {
  // matrix layout expected by lrbuild: one column per input, target as the last column
  var inputMatrix = pd.Dataset.ToArray(pd.AllowedInputVariables.Concat(new[] { pd.TargetVariable }), pd.AllIndices);

  var nFeatures = inputMatrix.GetLength(1) - 1;
  double[] coefficients;

  alglib.linearmodel lm;
  alglib.lrreport ar;
  int retVal;
  alglib.lrbuild(inputMatrix, inputMatrix.GetLength(0), nFeatures, out retVal, out lm, out ar);
  if (retVal != 1) { throw new ArgumentException("Error in calculation of linear regression solution"); }
  // lrunpack yields the coefficient vector; the intercept sits at index nFeatures
  alglib.lrunpack(lm, out coefficients, out nFeatures);

  // map each input variable to its coefficient (order matches the matrix columns)
  var coeffs = pd.AllowedInputVariables.Zip(coefficients, (s, d) => new { s, d }).ToDictionary(x => x.s, x => x.d);
  var res = new PreconstructedLinearModel(coeffs, coefficients[nFeatures], pd.TargetVariable);
  return(res);
}
/// <summary>
/// Runs an elastic-net path for the supplied lambda values and returns, for
/// each lambda, the model coefficients with the intercept appended as the
/// last element.
/// </summary>
public static double[][] CalculateModelCoefficients(IRegressionProblemData problemData, double penalty, double[] lambda, out double[] trainNMSEs, out double[] testNMSEs, double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity, int maxVars = -1) {
  // run for multiple user-supplied lambdas
  double[,] coeff;
  double[] intercept;
  RunElasticNetLinearRegression(problemData, penalty, lambda.Length, 1.0, lambda, out lambda, out trainNMSEs, out testNMSEs, out coeff, out intercept, coeffLowerBound, coeffUpperBound, maxVars);

  int solutionCount = intercept.Length;
  int coefficientCount = coeff.GetLength(1);
  var solutions = new double[solutionCount][];
  for (int i = 0; i < solutionCount; i++) {
    var row = new double[coefficientCount + 1];
    for (int j = 0; j < coefficientCount; j++) {
      row[j] = coeff[i, j];
    }
    row[coefficientCount] = intercept[i]; // intercept goes last
    solutions[i] = row;
  }
  return solutions;
}
// Creates an ensemble solution for an existing ensemble model; every contained
// model is evaluated on its own clone of the problem data.
public RegressionEnsembleSolution(IRegressionEnsembleModel model, IRegressionProblemData problemData)
  : base(model, new RegressionEnsembleProblemData(problemData)) {
  trainingPartitions = new Dictionary <IRegressionModel, IntRange>();
  testPartitions = new Dictionary <IRegressionModel, IntRange>();
  regressionSolutions = new ItemCollection <IRegressionSolution>();

  // evaluation caches pre-sized for the expected number of rows
  evaluationCache = new Dictionary <int, double>(problemData.Dataset.Rows);
  trainingEvaluationCache = new Dictionary <int, double>(problemData.TrainingIndices.Count());
  testEvaluationCache = new Dictionary <int, double>(problemData.TestIndices.Count());

  var solutions = model.Models.Select(m => m.CreateRegressionSolution((IRegressionProblemData)problemData.Clone()));
  foreach (var solution in solutions) {
    regressionSolutions.Add(solution);
    // remember which partitions each model was evaluated on
    trainingPartitions.Add(solution.Model, solution.ProblemData.TrainingPartition);
    testPartitions.Add(solution.Model, solution.ProblemData.TestPartition);
  }

  RecalculateResults();
  RegisterModelEvents();
  RegisterRegressionSolutionsEventHandler();
}
/// <summary>
/// Converts the coefficient vector of an elastic-net linear model into a
/// symbolic regression solution. Coefficients that are (almost) zero are
/// dropped; the last element of <paramref name="coeff"/> is the offset.
/// </summary>
public static IRegressionSolution CreateSymbolicSolution(double[] coeff, IRegressionProblemData problemData) {
  var ds = problemData.Dataset;
  var allVariables = problemData.AllowedInputVariables.ToArray();
  var doubleVariables = allVariables.Where(ds.VariableHasType <double>);
  var factorVariableNames = allVariables.Where(ds.VariableHasType <string>);
  var factorVariablesAndValues = ds.GetFactorVariableValues(factorVariableNames, Enumerable.Range(0, ds.Rows)); // must consider all factor values (in train and test set)

  List <KeyValuePair <string, IEnumerable <string> > > remainingFactorVariablesAndValues = new List <KeyValuePair <string, IEnumerable <string> > >();
  List <double> factorCoeff = new List <double>();
  List <string> remainingDoubleVariables = new List <string>();
  List <double> doubleVarCoeff = new List <double>();

  {
    // coefficient layout: one entry per factor value, then one per double
    // variable, then the offset (coeff.Last())
    int i = 0;
    // find factor variables & value combinations with non-zero coeff
    foreach (var factorVarAndValues in factorVariablesAndValues) {
      var l = new List <string>();
      foreach (var factorValue in factorVarAndValues.Value) {
        if (!coeff[i].IsAlmost(0.0)) {
          l.Add(factorValue);
          factorCoeff.Add(coeff[i]);
        }
        i++;
      }
      if (l.Any()) { remainingFactorVariablesAndValues.Add(new KeyValuePair <string, IEnumerable <string> >(factorVarAndValues.Key, l)); }
    }
    // find double variables with non-zero coeff
    foreach (var doubleVar in doubleVariables) {
      if (!coeff[i].IsAlmost(0.0)) {
        remainingDoubleVariables.Add(doubleVar);
        doubleVarCoeff.Add(coeff[i]);
      }
      i++;
    }
  }

  var tree = LinearModelToTreeConverter.CreateTree(
    remainingFactorVariablesAndValues, factorCoeff.ToArray(),
    remainingDoubleVariables.ToArray(), doubleVarCoeff.ToArray(),
    coeff.Last());

  SymbolicRegressionSolution solution = new SymbolicRegressionSolution(
    new SymbolicRegressionModel(problemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeInterpreter()),
    (IRegressionProblemData)problemData.Clone());
  solution.Model.Name = "Elastic-net Linear Regression Model";
  // fixed: the name string literal was broken across a line break
  solution.Name = "Elastic-net Linear Regression Solution";
  return solution;
}
/// <summary>
/// Determines good random forest parameters via grid search (out-of-bag based
/// overload) and builds a solution with them on the training partition.
/// </summary>
private static RandomForestRegressionSolution GridSearch(IRegressionProblemData problemData, out RFParameter bestParameters, int seed = 3141519) {
  double rmsError, outOfBagRmsError, avgRelError, outOfBagAvgRelError;
  // (removed an unused MersenneTwister instance that was created but never read)
  bestParameters = RandomForestUtil.GridSearch(problemData, randomForestParameterRanges, seed, maximumDegreeOfParallelism);
  var model = RandomForestModel.CreateRegressionModel(problemData, problemData.TrainingIndices, bestParameters.N, bestParameters.R, bestParameters.M, seed,
    out rmsError, out outOfBagRmsError, out avgRelError, out outOfBagAvgRelError);
  return (RandomForestRegressionSolution)model.CreateRegressionSolution(problemData);
}
/// <summary>
/// Trains a full random forest regression model on the training partition and
/// reports training and out-of-bag error measures.
/// </summary>
public static RandomForestModelFull CreateRandomForestRegressionModel(IRegressionProblemData problemData, int nTrees, double r, double m, int seed, out double rmsError, out double avgRelError, out double outOfBagRmsError, out double outOfBagAvgRelError) {
  // delegate to the overload that takes explicit training rows
  return CreateRandomForestRegressionModel(problemData, problemData.TrainingIndices, nTrees, r, m, seed,
    out rmsError, out avgRelError, out outOfBagRmsError, out outOfBagAvgRelError);
}
/// <summary>
/// Builds a k-nearest-neighbour regression model from the training partition.
/// </summary>
public static INearestNeighbourModel Train(IRegressionProblemData problemData, int k) {
  var model = new NearestNeighbourModel(problemData.Dataset,
                                        problemData.TrainingIndices,
                                        k,
                                        problemData.TargetVariable,
                                        problemData.AllowedInputVariables);
  return model;
}
// Copy constructor: takes over dataset, allowed inputs and target of the given
// problem data and mirrors its training/test partition boundaries.
public RegressionProblemData(IRegressionProblemData regressionProblemData)
  : this(regressionProblemData.Dataset, regressionProblemData.AllowedInputVariables, regressionProblemData.TargetVariable) {
  TrainingPartition.Start = regressionProblemData.TrainingPartition.Start;
  TrainingPartition.End = regressionProblemData.TrainingPartition.End;
  TestPartition.Start = regressionProblemData.TestPartition.Start;
  TestPartition.End = regressionProblemData.TestPartition.End;
}
// Verifies that the variable impacts computed for a constant model match the
// precomputed expected values.
public void ConstantModelVariableImpactTest() {
  var data = LoadDefaultTowerProblem();
  var constantModel = new ConstantModel(5, "y");
  var solution = new RegressionSolution(constantModel, data);
  var expected = GetExpectedValuesForConstantModel();

  CheckDefaultAsserts(solution, expected);
}
/// <summary>
/// Evaluates the two objectives for a symbolic regression solution: the mean
/// squared error (optionally rounded) and the tree length.
/// </summary>
public static double[] Calculate(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, ISymbolicExpressionTree solution, double lowerEstimationLimit, double upperEstimationLimit, IRegressionProblemData problemData, IEnumerable<int> rows, bool applyLinearScaling, int decimalPlaces) {
  double mse = SymbolicRegressionSingleObjectiveMeanSquaredErrorEvaluator.Calculate(interpreter, solution, lowerEstimationLimit, upperEstimationLimit, problemData, rows, applyLinearScaling);
  // a negative decimalPlaces value means "do not round"
  if (decimalPlaces >= 0) {
    mse = Math.Round(mse, decimalPlaces);
  }
  return new[] { mse, (double)solution.Length };
}
// Finds good SVM hyperparameters by crossvalidated grid search and trains a
// final support vector regression solution with them.
private static SupportVectorRegressionSolution SvmGridSearch(IRegressionProblemData problemData, out svm_parameter bestParameters, out int nSv, out double cvMse) {
  bestParameters = SupportVectorMachineUtil.GridSearch(out cvMse, problemData, svmParameterRanges, numberOfFolds, shuffleFolds, maximumDegreeOfParallelism);
  double trainingError, testError;
  // map the numeric libSVM type codes back to the string identifiers the trainer expects
  string svmType = svmTypes[bestParameters.svm_type];
  string kernelType = kernelTypes[bestParameters.kernel_type];
  var svm_solution = SupportVectorRegression.CreateSupportVectorRegressionSolution(problemData, problemData.AllowedInputVariables, svmType, kernelType,
    bestParameters.C, bestParameters.nu, bestParameters.gamma, bestParameters.eps, bestParameters.degree,
    out trainingError, out testError, out nSv);
  return svm_solution;
}
// Builds the algorithm state scope for decision tree regression: copies the
// relevant slice of the problem data, stores the tree parameters, initializes
// the tree operators, creates the (still unbuilt) model and splits the
// training rows into training and pruning sets.
private static IScope InitializeScope(IRandom random, IRegressionProblemData problemData, IPruning pruning, int minLeafSize, ILeafModel leafModel, ISplitter splitter, bool generateRules, bool useHoldout, double holdoutSize) {
  var stateScope = new Scope("RegressionTreeStateScope");

  //reduce RegressionProblemData to AllowedInput & Target column wise and to TrainingSet row wise
  var doubleVars = new HashSet <string>(problemData.Dataset.DoubleVariables);
  var vars = problemData.AllowedInputVariables.Concat(new[] { problemData.TargetVariable }).ToArray();
  if (vars.Any(v => !doubleVars.Contains(v))) { throw new NotSupportedException("Decision tree regression supports only double valued input or output features."); }
  var doubles = vars.Select(v => problemData.Dataset.GetDoubleValues(v, problemData.TrainingIndices).ToArray()).ToArray();
  if (doubles.Any(v => v.Any(x => double.IsNaN(x) || double.IsInfinity(x)))) { throw new NotSupportedException("Decision tree regression does not support NaN or infinity values in the input dataset."); }
  var trainingData = new Dataset(vars, doubles);
  var pd = new RegressionProblemData(trainingData, problemData.AllowedInputVariables, problemData.TargetVariable);
  // the reduced dataset contains only training rows: train on everything, empty test partition
  pd.TrainingPartition.End = pd.TestPartition.Start = pd.TestPartition.End = pd.Dataset.Rows;
  pd.TrainingPartition.Start = 0;

  //store regression tree parameters
  var regressionTreeParams = new RegressionTreeParameters(pruning, minLeafSize, leafModel, pd, random, splitter);
  stateScope.Variables.Add(new Variable(RegressionTreeParameterVariableName, regressionTreeParams));

  //initialize tree operators
  pruning.Initialize(stateScope);
  splitter.Initialize(stateScope);
  leafModel.Initialize(stateScope);

  //store unbuilt model
  IItem model;
  if (generateRules) {
    model = RegressionRuleSetModel.CreateRuleModel(problemData.TargetVariable, regressionTreeParams);
    RegressionRuleSetModel.Initialize(stateScope);
  } else {
    model = RegressionNodeTreeModel.CreateTreeModel(problemData.TargetVariable, regressionTreeParams);
  }
  stateScope.Variables.Add(new Variable(ModelVariableName, model));

  //store training & pruning indices
  IReadOnlyList <int> trainingSet, pruningSet;
  GeneratePruningSet(pd.TrainingIndices.ToArray(), random, useHoldout, holdoutSize, out trainingSet, out pruningSet);
  stateScope.Variables.Add(new Variable(TrainingSetVariableName, new IntArray(trainingSet.ToArray())));
  stateScope.Variables.Add(new Variable(PruningSetVariableName, new IntArray(pruningSet.ToArray())));

  return(stateScope);
}
// Deep-cloning constructor used by HeuristicLab's Cloner infrastructure.
private RegressionTreeParameters(RegressionTreeParameters original, Cloner cloner)
  : base(original, cloner) {
  // value members are copied directly, reference members go through the cloner
  minLeafSize = original.minLeafSize;
  problemData = cloner.Clone(original.problemData);
  random = cloner.Clone(original.random);
  splitter = cloner.Clone(original.splitter);
  pruning = cloner.Clone(original.pruning);
  leafModel = cloner.Clone(original.leafModel);
}
// Runs a cross-validated grid search over the random-forest parameter ranges and
// trains a final model on the full training partition using the best parameters found.
private static RandomForestRegressionSolution GridSearchWithCrossvalidation(IRegressionProblemData problemData, out RFParameter bestParameters, int seed = 3141519) {
  bestParameters = RandomForestUtil.GridSearch(problemData, numberOfFolds, shuffleFolds, randomForestParameterRanges, seed, maximumDegreeOfParallelism);
  double rmsError, avgRelError, outOfBagRmsError, outOfBagAvgRelError;
  var model = RandomForestModel.CreateRegressionModel(
    problemData, problemData.TrainingIndices,
    bestParameters.N, bestParameters.R, bestParameters.M, seed,
    out rmsError, out outOfBagRmsError, out avgRelError, out outOfBagAvgRelError);
  return (RandomForestRegressionSolution)model.CreateRegressionSolution(problemData);
}
// Builds a constant leaf model that predicts the mean of the target variable.
// Sets numberOfParameters to 1 (the mean) and throws ArgumentException when there
// are fewer training rows than the minimum leaf size.
public override IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int numberOfParameters) {
  if (pd.Dataset.Rows < MinLeafSize(pd))
    throw new ArgumentException("The number of training instances is too small to create a linear model");
  numberOfParameters = 1;
  var mean = pd.Dataset.GetDoubleValues(pd.TargetVariable).Average();
  return new PreconstructedLinearModel(mean, pd.TargetVariable);
}
// create only the surrogate model without an actual model
// Stores everything needed to lazily reconstruct the boosted model later.
public GradientBoostedTreesModelSurrogate(IRegressionProblemData trainingProblemData, uint seed, ILossFunction lossFunction, int iterations, int maxSize, double r, double m, double nu)
  : base("Gradient boosted tree model", string.Empty) {
  this.trainingProblemData = trainingProblemData;
  this.lossFunction = lossFunction;
  this.seed = seed;
  this.iterations = iterations;
  this.nu = nu;
  this.maxSize = maxSize;
  this.m = m;
  this.r = r;
}
// Evaluates the tree via the static Calculate overload, temporarily wiring the
// evaluator's parameters to the given execution context.
// Fix: the contexts are now reset in a finally block so the evaluator is not left
// pointing at a stale execution context if Calculate throws.
public override double Evaluate(IExecutionContext context, ISymbolicExpressionTree tree, IRegressionProblemData problemData, IEnumerable<int> rows) {
  SymbolicDataAnalysisTreeInterpreterParameter.ExecutionContext = context;
  EstimationLimitsParameter.ExecutionContext = context;
  try {
    return Calculate(SymbolicDataAnalysisTreeInterpreterParameter.ActualValue, tree, EstimationLimitsParameter.ActualValue.Lower, EstimationLimitsParameter.ActualValue.Upper, problemData, rows);
  } finally {
    SymbolicDataAnalysisTreeInterpreterParameter.ExecutionContext = null;
    EstimationLimitsParameter.ExecutionContext = null;
  }
}
// Evaluates the tree via the static Calculate overload (multi-objective variant),
// temporarily wiring the evaluator's parameters to the given execution context.
// Fix: the contexts are now reset in a finally block so the evaluator is not left
// pointing at a stale execution context if Calculate throws.
public override double[] Evaluate(IExecutionContext context, ISymbolicExpressionTree tree, IRegressionProblemData problemData, IEnumerable<int> rows) {
  SymbolicDataAnalysisTreeInterpreterParameter.ExecutionContext = context;
  EstimationLimitsParameter.ExecutionContext = context;
  ApplyLinearScalingParameter.ExecutionContext = context;
  try {
    return Calculate(SymbolicDataAnalysisTreeInterpreterParameter.ActualValue, tree, EstimationLimitsParameter.ActualValue.Lower, EstimationLimitsParameter.ActualValue.Upper, problemData, rows, ApplyLinearScalingParameter.ActualValue.Value, DecimalPlaces);
  } finally {
    SymbolicDataAnalysisTreeInterpreterParameter.ExecutionContext = null;
    EstimationLimitsParameter.ExecutionContext = null;
    ApplyLinearScalingParameter.ExecutionContext = null;
  }
}
// Deep-cloning constructor used by HeuristicLab's Cloner infrastructure.
private GradientBoostedTreesModelSurrogate(GradientBoostedTreesModelSurrogate original, Cloner cloner)
  : base(original, cloner) {
  // value-type training parameters are copied directly
  seed = original.seed;
  iterations = original.iterations;
  maxSize = original.maxSize;
  r = original.r;
  m = original.m;
  nu = original.nu;
  // reference members go through the cloner so shared object identity is preserved
  trainingProblemData = cloner.Clone(original.trainingProblemData);
  lossFunction = cloner.Clone(original.lossFunction);
  // the actual model is only cloned if it has already been materialized
  if (original.actualModel != null) actualModel = cloner.Clone(original.actualModel);
}
// Computes the mean of log(1 + |estimated - target|) over the given rows, with the
// estimated values clamped to [lowerEstimationLimit, upperEstimationLimit].
// Returns double.NaN when the online mean calculation reports an error.
public static double Calculate(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, ISymbolicExpressionTree solution, double lowerEstimationLimit, double upperEstimationLimit, IRegressionProblemData problemData, IEnumerable<int> rows) {
  var boundedEstimates = interpreter
    .GetSymbolicExpressionTreeValues(solution, problemData.Dataset, rows)
    .LimitToRange(lowerEstimationLimit, upperEstimationLimit);
  var targets = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
  // per-row log residual: log(1 + |e - t|)
  var logRes = boundedEstimates.Zip(targets, (e, t) => Math.Log(1.0 + Math.Abs(e - t)));
  double mlr, variance;
  OnlineCalculatorError errorState, varErrorState;
  OnlineMeanAndVarianceCalculator.Calculate(logRes, out mlr, out variance, out errorState, out varErrorState);
  return errorState == OnlineCalculatorError.None ? mlr : double.NaN;
}
// Computes the mean squared error of the tree's estimates on the given rows, either
// with linear scaling of the estimates or with plain clamping to the estimation limits.
// Returns double.NaN when the online calculation reports an error.
public static double Calculate(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, ISymbolicExpressionTree solution, double lowerEstimationLimit, double upperEstimationLimit, IRegressionProblemData problemData, IEnumerable<int> rows, bool applyLinearScaling) {
  var estimatedValues = interpreter.GetSymbolicExpressionTreeValues(solution, problemData.Dataset, rows);
  var targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
  double mse;
  OnlineCalculatorError errorState;
  if (!applyLinearScaling) {
    // no scaling: clamp the raw estimates and compute the MSE directly
    var boundedEstimatedValues = estimatedValues.LimitToRange(lowerEstimationLimit, upperEstimationLimit);
    mse = OnlineMeanSquaredErrorCalculator.Calculate(targetValues, boundedEstimatedValues, out errorState);
  } else {
    // scaling: the helper feeds scaled, clamped estimates into the online calculator
    var mseCalculator = new OnlineMeanSquaredErrorCalculator();
    CalculateWithScaling(targetValues, estimatedValues, lowerEstimationLimit, upperEstimationLimit, mseCalculator, problemData.Dataset.Rows);
    errorState = mseCalculator.ErrorState;
    mse = mseCalculator.MeanSquaredError;
  }
  return errorState == OnlineCalculatorError.None ? mse : double.NaN;
}
// prepare and allocate buffer variables in ctor
// All per-row and per-column working buffers used during tree construction are
// allocated once here and reused across boosting iterations.
public RegressionTreeBuilder(IRegressionProblemData problemData, IRandom random) {
  this.problemData = problemData;
  this.random = random;

  var rows = problemData.TrainingIndices.Count();

  this.nCols = problemData.AllowedInputVariables.Count();

  allowedVariables = problemData.AllowedInputVariables.ToArray();
  // map variable name -> column index for fast lookups during split search
  varName2Index = new Dictionary<string, int>(allowedVariables.Length);
  for (int i = 0; i < allowedVariables.Length; i++) varName2Index.Add(allowedVariables[i], i);

  sortedIdxAll = new int[nCols][];
  sortedIdx = new int[nCols][];
  sumImprovements = new Dictionary<string, double>();
  internalIdx = new int[rows];
  which = new int[rows];
  leftTmp = new int[rows];
  rightTmp = new int[rows];
  outx = new double[rows];
  outSortedIdx = new int[rows];
  queue = new List<PartitionSplits>(100);
  x = new double[nCols][];
  originalY = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices).ToArray();
  y = new double[originalY.Length];
  Array.Copy(originalY, y, y.Length); // copy values (originalY is fixed, y is changed in gradient boosting)
  curPred = Enumerable.Repeat(0.0, y.Length).ToArray(); // zeros

  int col = 0;
  foreach (var inputVariable in problemData.AllowedInputVariables) {
    x[col] = problemData.Dataset.GetDoubleValues(inputVariable, problemData.TrainingIndices).ToArray();
    // precompute, per column, the training rows ordered by value; reused for every
    // split evaluation on that column
    sortedIdxAll[col] = Enumerable.Range(0, rows).OrderBy(r => x[col][r]).ToArray();
    sortedIdx[col] = new int[rows];
    col++;
  }
}
// Registers the empty result entries for all standard regression quality measures
// (training and test variants); the values are populated later — not visible in this
// block, presumably when the model or problem data changes.
protected RegressionSolutionBase(IRegressionModel model, IRegressionProblemData problemData)
  : base(model, problemData) {
  Add(new Result(TrainingMeanSquaredErrorResultName, TrainingMeanSquaredErrorResultDescription, new DoubleValue()));
  Add(new Result(TestMeanSquaredErrorResultName, TestMeanSquaredErrorResultDescription, new DoubleValue()));
  Add(new Result(TrainingMeanAbsoluteErrorResultName, TrainingMeanAbsoluteErrorResultDescription, new DoubleValue()));
  Add(new Result(TestMeanAbsoluteErrorResultName, TestMeanAbsoluteErrorResultDescription, new DoubleValue()));
  Add(new Result(TrainingSquaredCorrelationResultName, TrainingSquaredCorrelationResultDescription, new DoubleValue()));
  Add(new Result(TestSquaredCorrelationResultName, TestSquaredCorrelationResultDescription, new DoubleValue()));
  // relative errors are reported as percentages
  Add(new Result(TrainingRelativeErrorResultName, TrainingRelativeErrorResultDescription, new PercentValue()));
  Add(new Result(TestRelativeErrorResultName, TestRelativeErrorResultDescription, new PercentValue()));
  Add(new Result(TrainingNormalizedMeanSquaredErrorResultName, TrainingNormalizedMeanSquaredErrorResultDescription, new DoubleValue()));
  Add(new Result(TestNormalizedMeanSquaredErrorResultName, TestNormalizedMeanSquaredErrorResultDescription, new DoubleValue()));
  Add(new Result(TrainingRootMeanSquaredErrorResultName, TrainingRootMeanSquaredErrorResultDescription, new DoubleValue()));
  Add(new Result(TestRootMeanSquaredErrorResultName, TestRootMeanSquaredErrorResultDescription, new DoubleValue()));
}
// Wraps this constant in a solution over a copy of the given problem data.
public IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
  var problemDataCopy = new RegressionProblemData(problemData);
  var model = new ConstantModel(constant);
  return new ConstantRegressionSolution(model, problemDataCopy);
}
// Evaluates the tree with the Pearson R² measure, temporarily wiring the evaluator's
// parameters to the given execution context.
// Fix: the contexts are now reset in a finally block so the evaluator is not left
// pointing at a stale execution context if Calculate throws.
public override double Evaluate(IExecutionContext context, ISymbolicExpressionTree tree, IRegressionProblemData problemData, IEnumerable<int> rows) {
  SymbolicDataAnalysisTreeInterpreterParameter.ExecutionContext = context;
  EstimationLimitsParameter.ExecutionContext = context;
  ApplyLinearScalingParameter.ExecutionContext = context;
  try {
    // Pearson R² evaluator is used on purpose instead of the const-opt evaluator,
    // because Evaluate() is used to get the quality of evolved models on
    // different partitions of the dataset (e.g., best validation model)
    return SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate(SymbolicDataAnalysisTreeInterpreterParameter.ActualValue, tree, EstimationLimitsParameter.ActualValue.Lower, EstimationLimitsParameter.ActualValue.Upper, problemData, rows, ApplyLinearScalingParameter.ActualValue.Value);
  } finally {
    SymbolicDataAnalysisTreeInterpreterParameter.ExecutionContext = null;
    EstimationLimitsParameter.ExecutionContext = null;
    ApplyLinearScalingParameter.ExecutionContext = null;
  }
}
// Computes the squared Pearson correlation (R²) between the model's estimates and the
// target values on the given rows; returns double.NaN on calculator error.
public static double CalculateQualityForImpacts(ISymbolicRegressionModel model, IRegressionProblemData problemData, IEnumerable<int> rows) {
  var estimatedValues = model.GetEstimatedValues(problemData.Dataset, rows); // also bounds the values
  var targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
  OnlineCalculatorError errorState;
  var r = OnlinePearsonsRCalculator.Calculate(targetValues, estimatedValues, out errorState);
  if (errorState != OnlineCalculatorError.None) return double.NaN;
  return r * r;
}
// Factory for a Gaussian process model trained on the problem's training partition.
public static IGaussianProcessModel Create(IRegressionProblemData problemData, double[] hyperparameter, IMeanFunction meanFunction, ICovarianceFunction covarianceFunction, bool scaleInputs = true) {
  var ds = problemData.Dataset;
  return new GaussianProcessModel(ds, problemData.TargetVariable,
    problemData.AllowedInputVariables, problemData.TrainingIndices,
    hyperparameter, meanFunction, covarianceFunction, scaleInputs);
}
// Attempts to inject the problem data into the algorithm's existing regression problem.
// Returns false when the algorithm has no compatible problem assigned.
private static bool TrySetProblemData(IAlgorithm alg, IRegressionProblemData problemData) {
  var prob = alg.Problem as IRegressionProblem;
  if (prob == null) return false;
  // there is already a problem and it is compatible -> just set problem data
  prob.ProblemDataParameter.Value = problemData;
  return true;
}
// Combines the trees of all symbolic models into a single expression of the form
// nu * (m1 + m2 + ... + mk) and wraps it in a symbolic regression solution.
private static ISymbolicRegressionSolution CreateSymbolicSolution(List<IRegressionModel> models, double nu, IRegressionProblemData problemData) {
  var symbModels = models.OfType<ISymbolicRegressionModel>();

  // sum the model bodies (skipping each tree's program-root and start nodes)
  var sumNode = new Addition().CreateTreeNode();
  foreach (var model in symbModels) {
    var body = model.SymbolicExpressionTree.Root.GetSubtree(0).GetSubtree(0); // skip root and start
    sumNode.AddSubtree((ISymbolicExpressionTreeNode)body.Clone());
  }

  // all models are scaled using the same nu
  var scaleNode = (ConstantTreeNode)new Constant().CreateTreeNode();
  scaleNode.Value = nu;
  var productNode = new Multiplication().CreateTreeNode();
  productNode.AddSubtree(sumNode);
  productNode.AddSubtree(scaleNode);

  // wrap in the mandatory program-root/start scaffolding
  var startNode = new StartSymbol().CreateTreeNode();
  startNode.AddSubtree(productNode);
  var rootNode = new ProgramRootSymbol().CreateTreeNode();
  rootNode.AddSubtree(startNode);

  var combinedTree = new SymbolicExpressionTree(rootNode);
  var interpreter = new SymbolicDataAnalysisExpressionTreeLinearInterpreter();
  var combinedModel = new SymbolicRegressionModel(problemData.TargetVariable, combinedTree, interpreter,
    symbModels.Min(m => m.LowerEstimationLimit), symbModels.Max(m => m.UpperEstimationLimit));
  return new SymbolicRegressionSolution(combinedModel, problemData);
}
// Fits a linear combination of the given models on the training rows (via alglib's
// linear regression with zero intercept) and returns the weighted ensemble solution.
private static IRegressionEnsembleSolution CreateEnsembleSolution(List<IRegressionModel> models, IRegressionProblemData problemData) {
  int rows = problemData.TrainingPartition.Size;
  int features = models.Count;
  var inputMatrix = new double[rows, features + 1];

  // column m holds the estimates of model m on the training rows
  for (int m = 0; m < features; m++) {
    int row = 0;
    foreach (var estimate in models[m].GetEstimatedValues(problemData.Dataset, problemData.TrainingIndices)) {
      inputMatrix[row, m] = estimate;
      row++;
    }
  }

  // the last column holds the target values
  int targetRow = 0;
  foreach (var target in problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices)) {
    inputMatrix[targetRow, features] = target;
    targetRow++;
  }

  var lm = new alglib.linearmodel();
  var ar = new alglib.lrreport();
  int retVal;
  alglib.lrbuildz(inputMatrix, rows, features, out retVal, out lm, out ar);
  if (retVal != 1) throw new ArgumentException("Error in calculation of linear regression solution");

  double[] coefficients;
  alglib.lrunpack(lm, out coefficients, out features);

  var ensembleModel = new RegressionEnsembleModel(models, coefficients.Take(models.Count)) { AverageModelEstimates = false };
  return (IRegressionEnsembleSolution)ensembleModel.CreateRegressionSolution(problemData);
}
// Wraps this model in a Gaussian process solution over a copy of the problem data.
public override IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
  var problemDataCopy = new RegressionProblemData(problemData);
  return new GaussianProcessRegressionSolution(this, problemDataCopy);
}
// Wraps this model in a neural network solution over a copy of the problem data.
public INeuralNetworkRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
  var problemDataCopy = new RegressionProblemData(problemData);
  return new NeuralNetworkRegressionSolution(problemDataCopy, this);
}
// Two-objective evaluation: [0] Pearson R² (optionally rounded to decimalPlaces),
// [1] the number of variable references in the tree (a simple size proxy).
public static double[] Calculate(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, ISymbolicExpressionTree solution, double lowerEstimationLimit, double upperEstimationLimit, IRegressionProblemData problemData, IEnumerable<int> rows, bool applyLinearScaling, int decimalPlaces) {
  var r2 = SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate(interpreter, solution, lowerEstimationLimit, upperEstimationLimit, problemData, rows, applyLinearScaling);
  if (decimalPlaces >= 0)
    r2 = Math.Round(r2, decimalPlaces);
  // count the number of variables
  double variableCount = solution.IterateNodesPostfix().OfType<VariableTreeNode>().Count();
  return new[] { r2, variableCount };
}
// for custom stepping & termination
// Creates a fresh gradient-boosting state with the given loss function and settings.
public static IGbmState CreateGbmState(IRegressionProblemData problemData, ILossFunction lossFunction, uint randSeed, int maxSize = 3, double r = 0.66, double m = 0.5, double nu = 0.01) {
  var state = new GbmState(problemData, lossFunction, randSeed, maxSize, r, m, nu);
  return state;
}
// simple interface
// Trains a gradient-boosted trees model for maxIterations boosting steps and returns
// the resulting solution over a clone of the problem data.
public static GradientBoostedTreesSolution TrainGbm(IRegressionProblemData problemData, ILossFunction lossFunction, int maxSize, double nu, double r, double m, int maxIterations, uint randSeed = 31415) {
  // sampling ratio and learning rate must lie in (0, 1]
  Contract.Assert(r > 0);
  Contract.Assert(r <= 1.0);
  Contract.Assert(nu > 0);
  Contract.Assert(nu <= 1.0);

  var state = (GbmState)CreateGbmState(problemData, lossFunction, randSeed, maxSize, r, m, nu);
  for (int i = 0; i < maxIterations; i++)
    MakeStep(state);

  var problemDataClone = (IRegressionProblemData)problemData.Clone();
  return new GradientBoostedTreesSolution(state.GetModel(), problemDataClone);
}
// Two-objective evaluation: [0] Pearson R² (optionally rounded to decimalPlaces),
// [1] the model complexity of the tree.
public static double[] Calculate(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, ISymbolicExpressionTree solution, double lowerEstimationLimit, double upperEstimationLimit, IRegressionProblemData problemData, IEnumerable<int> rows, bool applyLinearScaling, int decimalPlaces) {
  var r2 = SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate(interpreter, solution, lowerEstimationLimit, upperEstimationLimit, problemData, rows, applyLinearScaling);
  if (decimalPlaces >= 0)
    r2 = Math.Round(r2, decimalPlaces);
  var complexity = SymbolicDataAnalysisModelComplexityCalculator.CalculateComplexity(solution);
  return new[] { r2, complexity };
}
// Wraps this constant model in a solution over a copy of the problem data.
public override IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
  var problemDataCopy = new RegressionProblemData(problemData);
  return new ConstantRegressionSolution(this, problemDataCopy);
}
// Optimizes the numeric constants (and optionally the variable weights) of the tree by
// nonlinear least-squares fitting (alglib Levenberg-Marquardt) against the target values
// on the given rows, using AutoDiff for analytic gradients. Returns the Pearson R²
// quality after optimization; falls back to the original constants and quality when the
// fit fails or degrades the quality. When updateConstantsInTree is false the tree is
// restored to its original constants regardless of the outcome.
public static double OptimizeConstants(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, ISymbolicExpressionTree tree, IRegressionProblemData problemData, IEnumerable<int> rows, bool applyLinearScaling, int maxIterations, bool updateVariableWeights = true, double lowerEstimationLimit = double.MinValue, double upperEstimationLimit = double.MaxValue, bool updateConstantsInTree = true) {
  List<AutoDiff.Variable> variables = new List<AutoDiff.Variable>();
  List<AutoDiff.Variable> parameters = new List<AutoDiff.Variable>();
  List<string> variableNames = new List<string>();

  AutoDiff.Term func;
  if (!TryTransformToAutoDiff(tree.Root.GetSubtree(0), variables, parameters, variableNames, updateVariableWeights, out func))
    throw new NotSupportedException("Could not optimize constants of symbolic expression tree due to not supported symbols used in the tree.");
  // no input variables in the tree -> nothing to fit
  if (variableNames.Count == 0) return 0.0;

  AutoDiff.IParametricCompiledTerm compiledFunc = func.Compile(variables.ToArray(), parameters.ToArray());

  // nodes whose values correspond (in prefix order) to the fitted coefficients c[2..]
  List<SymbolicExpressionTreeTerminalNode> terminalNodes = null;
  if (updateVariableWeights)
    terminalNodes = tree.Root.IterateNodesPrefix().OfType<SymbolicExpressionTreeTerminalNode>().ToList();
  else
    terminalNodes = new List<SymbolicExpressionTreeTerminalNode>(tree.Root.IterateNodesPrefix().OfType<ConstantTreeNode>());

  //extract inital constants
  double[] c = new double[variables.Count];
  {
    // c[0]/c[1] start at 0 and 1 — presumably the additive/multiplicative linear-scaling
    // terms introduced by TryTransformToAutoDiff; confirm against that method
    c[0] = 0.0;
    c[1] = 1.0;
    int i = 2;
    foreach (var node in terminalNodes) {
      ConstantTreeNode constantTreeNode = node as ConstantTreeNode;
      VariableTreeNode variableTreeNode = node as VariableTreeNode;
      if (constantTreeNode != null)
        c[i++] = constantTreeNode.Value;
      else if (updateVariableWeights && variableTreeNode != null)
        c[i++] = variableTreeNode.Weight;
    }
  }
  double[] originalConstants = (double[])c.Clone();
  double originalQuality = SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate(interpreter, tree, lowerEstimationLimit, upperEstimationLimit, problemData, rows, applyLinearScaling);

  alglib.lsfitstate state;
  alglib.lsfitreport rep;
  int info;

  // build the design matrix: one row per fitted data row, one column per input variable
  IDataset ds = problemData.Dataset;
  double[,] x = new double[rows.Count(), variableNames.Count];
  int row = 0;
  foreach (var r in rows) {
    for (int col = 0; col < variableNames.Count; col++) {
      x[row, col] = ds.GetDoubleValue(variableNames[col], r);
    }
    row++;
  }
  double[] y = ds.GetDoubleValues(problemData.TargetVariable, rows).ToArray();
  int n = x.GetLength(0);
  int m = x.GetLength(1);
  int k = c.Length;

  alglib.ndimensional_pfunc function_cx_1_func = CreatePFunc(compiledFunc);
  alglib.ndimensional_pgrad function_cx_1_grad = CreatePGrad(compiledFunc);

  try {
    alglib.lsfitcreatefg(x, y, c, n, m, k, false, out state);
    // only the iteration count limits the fit (no epsilon-based stopping criteria)
    alglib.lsfitsetcond(state, 0.0, 0.0, maxIterations);
    //alglib.lsfitsetgradientcheck(state, 0.001);
    alglib.lsfitfit(state, function_cx_1_func, function_cx_1_grad, null, null);
    alglib.lsfitresults(state, out info, out c, out rep);
  } catch (ArithmeticException) {
    // keep the original constants on numeric failure
    return originalQuality;
  } catch (alglib.alglibexception) {
    return originalQuality;
  }

  //info == -7 => constant optimization failed due to wrong gradient
  if (info != -7) UpdateConstants(tree, c.Skip(2).ToArray(), updateVariableWeights);
  var quality = SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate(interpreter, tree, lowerEstimationLimit, upperEstimationLimit, problemData, rows, applyLinearScaling);

  if (!updateConstantsInTree) UpdateConstants(tree, originalConstants.Skip(2).ToArray(), updateVariableWeights);
  // revert when the fitted constants noticeably degrade the quality (or produce NaN)
  if (originalQuality - quality > 0.001 || double.IsNaN(quality)) {
    UpdateConstants(tree, originalConstants.Skip(2).ToArray(), updateVariableWeights);
    return originalQuality;
  }
  return quality;
}
// Explicit interface implementation: forwards to the class's own, more specifically
// typed CreateRegressionSolution overload.
IRegressionSolution IRegressionModel.CreateRegressionSolution(IRegressionProblemData problemData) {
  return this.CreateRegressionSolution(problemData);
}