public static ISymbolicRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData, string modelStructure, int maxIterations)
        {
            var parser     = new InfixExpressionParser();
            var tree       = parser.Parse(modelStructure);
            var simplifier = new SymbolicDataAnalysisExpressionTreeSimplifier();

            if (!SymbolicRegressionConstantOptimizationEvaluator.CanOptimizeConstants(tree))
            {
                throw new ArgumentException("The optimizer does not support the specified model structure.");
            }

            var interpreter = new SymbolicDataAnalysisExpressionTreeLinearInterpreter();

            SymbolicRegressionConstantOptimizationEvaluator.OptimizeConstants(interpreter, tree, problemData, problemData.TrainingIndices,
                                                                              applyLinearScaling: false, maxIterations: maxIterations,
                                                                              updateVariableWeights: false, updateConstantsInTree: true);


            var scaledModel = new SymbolicRegressionModel(problemData.TargetVariable, tree, (ISymbolicDataAnalysisExpressionTreeInterpreter)interpreter.Clone());

            scaledModel.Scale(problemData);
            SymbolicRegressionSolution solution = new SymbolicRegressionSolution(scaledModel, (IRegressionProblemData)problemData.Clone());

            solution.Model.Name = "Regression Model";
            solution.Name       = "Regression Solution";
            return(solution);
        }
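
// Usage sketch (hedged): fit the numeric constants of a fixed model structure.
// "problemData" is assumed to be an existing IRegressionProblemData instance and
// the infix string below is a hypothetical structure with tunable constants.
var solution = CreateRegressionSolution(problemData, "1.0*x1 + 1.0*x2 + 0.0", maxIterations: 10);
Console.WriteLine(solution.TrainingRSquared);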
      public GbmState(IRegressionProblemData problemData, ILossFunction lossFunction, uint randSeed, int maxSize, double r, double m, double nu) {
        // store the boosting parameters (MaxSize, Nu, R and M)
        this.maxSize = maxSize;
        this.nu = nu;
        this.r = r;
        this.m = m;

        this.randSeed = randSeed;
        random = new MersenneTwister(randSeed);
        this.problemData = problemData;
        this.trainingRows = problemData.TrainingIndices.ToArray();
        this.testRows = problemData.TestIndices.ToArray();
        this.lossFunction = lossFunction;

        int nRows = trainingRows.Length;

        y = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, trainingRows).ToArray();

        treeBuilder = new RegressionTreeBuilder(problemData, random);

        activeIdx = Enumerable.Range(0, nRows).ToArray();

        var zeros = Enumerable.Repeat(0.0, nRows).ToArray();
        double f0 = lossFunction.LineSearch(y, zeros, activeIdx, 0, nRows - 1); // initial constant value (mean for squared errors)
        pred = Enumerable.Repeat(f0, nRows).ToArray();
        predTest = Enumerable.Repeat(f0, testRows.Length).ToArray();
        pseudoRes = new double[nRows];

        models = new List<IRegressionModel>();
        weights = new List<double>();
        // add constant model
        models.Add(new ConstantModel(f0, problemData.TargetVariable));
        weights.Add(1.0);
      }
 public RegressionEnsembleSolution(IEnumerable <IRegressionModel> models, IRegressionProblemData problemData)
     : this(models, problemData,
            models.Select(m => (IntRange)problemData.TrainingPartition.Clone()),
            models.Select(m => (IntRange)problemData.TestPartition.Clone())
            )
 {
 }
 protected DampenedModel(IRegressionModel model, IRegressionProblemData pd, double dampening) : base(model.TargetVariable)
 {
     Model     = model;
     Min       = pd.TargetVariableTrainingValues.Min();
     Max       = pd.TargetVariableTrainingValues.Max();
     Dampening = dampening;
 }
        public static IEnumerable <Tuple <string, double> > CalculateImpacts(
            IRegressionModel model,
            IRegressionProblemData problemData,
            IEnumerable <double> estimatedValues,
            IEnumerable <int> rows,
            ReplacementMethodEnum replacementMethod             = ReplacementMethodEnum.Shuffle,
            FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best)
        {
            // fholzing: guard in case a different dataset is loaded; otherwise this check is negligible
            var missingVariables = model.VariablesUsedForPrediction.Except(problemData.Dataset.VariableNames);

            if (missingVariables.Any())
            {
                throw new InvalidOperationException(string.Format("Can not calculate variable impacts, because the model uses inputs missing in the dataset ({0})", string.Join(", ", missingVariables)));
            }
            IEnumerable <double> targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
            var originalQuality = CalculateQuality(targetValues, estimatedValues);

            var impacts           = new Dictionary <string, double>();
            var inputvariables    = new HashSet <string>(problemData.AllowedInputVariables.Union(model.VariablesUsedForPrediction));
            var modifiableDataset = ((Dataset)(problemData.Dataset).Clone()).ToModifiable();

            foreach (var inputVariable in inputvariables)
            {
                impacts[inputVariable] = CalculateImpact(inputVariable, model, problemData, modifiableDataset, rows, replacementMethod, factorReplacementMethod, targetValues, originalQuality);
            }

            return(impacts.Select(i => Tuple.Create(i.Key, i.Value)));
        }
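
// Usage sketch (hedged): rank the input variables of an existing solution by impact.
// "solution" is assumed to be an IRegressionSolution whose estimated training values
// match the training rows; the Shuffle/Best replacement defaults are kept.
var impacts = CalculateImpacts(solution.Model, solution.ProblemData,
                               solution.EstimatedTrainingValues,
                               solution.ProblemData.TrainingIndices);
foreach (var impact in impacts.OrderByDescending(i => i.Item2))
    Console.WriteLine($"{impact.Item1}: {impact.Item2}");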
        public static IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData, IRandom random, ILeafModel leafModel = null, ISplitter splitter = null, IPruning pruning = null,
                                                                   bool useHoldout = false, double holdoutSize = 0.2, int minimumLeafSize = 1, bool generateRules = false, ResultCollection results = null, CancellationToken? cancellationToken = null)
        {
            if (leafModel == null)
            {
                leafModel = new LinearLeaf();
            }
            if (splitter == null)
            {
                splitter = new Splitter();
            }
            if (cancellationToken == null)
            {
                cancellationToken = CancellationToken.None;
            }
            if (pruning == null)
            {
                pruning = new ComplexityPruning();
            }

            var stateScope = InitializeScope(random, problemData, pruning, minimumLeafSize, leafModel, splitter, generateRules, useHoldout, holdoutSize);
            var model      = Build(stateScope, results, cancellationToken.Value);

            return(model.CreateRegressionSolution(problemData));
        }
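
// Usage sketch (hedged): build a model tree with the defaults the method fills in
// (LinearLeaf leaf models, Splitter, ComplexityPruning); the seed is arbitrary.
var treeSolution = CreateRegressionSolution(problemData, new MersenneTwister(42),
                                            useHoldout: true, holdoutSize: 0.2);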
        private IEnumerable <BasisFunction> CreateUnivariateBases(IRegressionProblemData problemData)
        {
            var B1             = new List <BasisFunction>();
            var inputVariables = problemData.AllowedInputVariables;
            var validExponents = ConsiderExponentiations ? exponents : new double[] { 1 };
            var validFuncs     = ConsiderNonlinearFuncs ? NonlinearFuncs.CheckedItems.Select(val => val.Value) : new List <OpCode>();

            // TODO: add Hinge functions

            foreach (var variableName in inputVariables)
            {
                foreach (var exp in validExponents)
                {
                    var data = problemData.Dataset.GetDoubleValues(variableName).Select(x => Math.Pow(x, exp)).ToArray();
                    if (!ok(data))
                    {
                        continue;
                    }
                    var name = expToString(exp, variableName);
                    B1.Add(new BasisFunction(name, data, false));
                    foreach (OpCode _op in validFuncs)
                    {
                        var inner_data = data.Select(x => Utils.eval(_op, x)).ToArray();
                        if (!ok(inner_data))
                        {
                            continue;
                        }
                        // the name is used later to parse the basis functions into an ISymbolicExpressionTree
                        var inner_name = OpCodeToString.GetByFirst(_op) + "(" + name + ")";
                        B1.Add(new BasisFunction(inner_name, inner_data, true));
                    }
                }
            }
            return(B1);
        }
        public void InitTest()
        {
            double[,] arr = new double[4, 3];

            arr[0, 0] = 3;
            arr[0, 1] = 6;
            arr[0, 2] = 2;
            arr[1, 0] = 5;
            arr[1, 1] = 2;
            arr[1, 2] = 1;
            arr[2, 0] = 8;
            arr[2, 1] = 5;
            arr[2, 2] = 0;
            arr[3, 0] = 3;
            arr[3, 1] = 4;
            arr[3, 2] = 2;

            var ds = new Dataset(new string[] { "x1", "x2", "y" }, arr);

            problemData = (IRegressionProblemData) new RegressionProblemData(ds, new string[] { "x1", "x2" }, "y");

            variableRanges = new Dictionary <string, Interval>();
            variableRanges.Add("x1", new Interval(1, 10));
            variableRanges.Add("x2", new Interval(4, 6));
        }
        /// <summary>
        /// Grid search with cross-validation
        /// </summary>
        /// <param name="problemData">The regression problem data</param>
        /// <param name="numberOfFolds">The number of folds for cross-validation</param>
        /// <param name="shuffleFolds">Specifies whether the folds should be shuffled</param>
        /// <param name="parameterRanges">The ranges for each parameter in the grid search</param>
        /// <param name="seed">The random seed (required by the random forest model)</param>
        /// <param name="maxDegreeOfParallelism">The maximum allowed number of threads (to parallelize the grid search)</param>
        /// <returns>The best parameter values found by the grid search</returns>
        public static RFParameter GridSearch(IRegressionProblemData problemData, int numberOfFolds, bool shuffleFolds, Dictionary <string, IEnumerable <double> > parameterRanges, int seed = 12345, int maxDegreeOfParallelism = 1)
        {
            DoubleValue mse           = new DoubleValue(Double.MaxValue);
            RFParameter bestParameter = new RFParameter();

            var setters      = parameterRanges.Keys.Select(GenerateSetter).ToList();
            var partitions   = GenerateRandomForestPartitions(problemData, numberOfFolds);
            var crossProduct = parameterRanges.Values.CartesianProduct();

            var locker = new object();

            Parallel.ForEach(crossProduct, new ParallelOptions {
                MaxDegreeOfParallelism = maxDegreeOfParallelism
            }, parameterCombination => {
                var parameterValues = parameterCombination.ToList();
                double testMSE;
                var parameters = new RFParameter();
                for (int i = 0; i < setters.Count; ++i)
                {
                    setters[i](parameters, parameterValues[i]);
                }
                CrossValidate(problemData, partitions, parameters.N, parameters.R, parameters.M, seed, out testMSE);

                lock (locker) {
                    if (testMSE < mse.Value)
                    {
                        mse.Value     = testMSE;
                        bestParameter = (RFParameter)parameters.Clone();
                    }
                }
            });
            return(bestParameter);
        }
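
// Usage sketch (hedged): a hypothetical grid over the random forest parameters
// N (number of trees), R (row fraction) and M (feature fraction); the keys must
// match the RFParameter property names resolved by GenerateSetter.
var parameterRanges = new Dictionary<string, IEnumerable<double>> {
    { "N", new double[] { 50, 100, 200 } },
    { "R", new double[] { 0.3, 0.5, 0.7 } },
    { "M", new double[] { 0.3, 0.5, 0.7 } }
};
var bestCv = GridSearch(problemData, 5, true, parameterRanges); // 5 shuffled folds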
        /// <summary>
        /// Grid search without cross-validation (since for random forests the out-of-bag estimate is unbiased)
        /// </summary>
        /// <param name="problemData">The regression problem data</param>
        /// <param name="parameterRanges">The ranges for each parameter in the grid search</param>
        /// <param name="seed">The random seed (required by the random forest model)</param>
        /// <param name="maxDegreeOfParallelism">The maximum allowed number of threads (to parallelize the grid search)</param>
        public static RFParameter GridSearch(IRegressionProblemData problemData, Dictionary <string, IEnumerable <double> > parameterRanges, int seed = 12345, int maxDegreeOfParallelism = 1)
        {
            var         setters              = parameterRanges.Keys.Select(GenerateSetter).ToList();
            var         crossProduct         = parameterRanges.Values.CartesianProduct();
            double      bestOutOfBagRmsError = double.MaxValue;
            RFParameter bestParameters       = new RFParameter();

            var locker = new object();

            Parallel.ForEach(crossProduct, new ParallelOptions {
                MaxDegreeOfParallelism = maxDegreeOfParallelism
            }, parameterCombination => {
                var parameterValues = parameterCombination.ToList();
                var parameters      = new RFParameter();
                for (int i = 0; i < setters.Count; ++i)
                {
                    setters[i](parameters, parameterValues[i]);
                }
                double rmsError, outOfBagRmsError, avgRelError, outOfBagAvgRelError;
                RandomForestModel.CreateRegressionModel(problemData, problemData.TrainingIndices, parameters.N, parameters.R, parameters.M, seed, out rmsError, out outOfBagRmsError, out avgRelError, out outOfBagAvgRelError);

                lock (locker) {
                    if (bestOutOfBagRmsError > outOfBagRmsError)
                    {
                        bestOutOfBagRmsError = outOfBagRmsError;
                        bestParameters       = (RFParameter)parameters.Clone();
                    }
                }
            });
            return(bestParameters);
        }
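
// Usage sketch (hedged): the out-of-bag variant reuses the same parameterRanges
// dictionary as above but needs no fold setup, since the OOB estimate replaces
// the cross-validation error.
var bestOob = GridSearch(problemData, parameterRanges);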
        public static bool IsProblemDataCompatible(IRegressionModel model, IRegressionProblemData problemData, out string errorMessage)
        {
            if (model == null)
            {
                throw new ArgumentNullException("model", "The provided model is null.");
            }
            if (problemData == null)
            {
                throw new ArgumentNullException("problemData", "The provided problemData is null.");
            }
            errorMessage = string.Empty;

            if (model.TargetVariable != problemData.TargetVariable)
            {
                errorMessage = string.Format("The target variable of the model {0} does not match the target variable of the problemData {1}.", model.TargetVariable, problemData.TargetVariable);
            }

            var evaluationErrorMessage = string.Empty;
            var datasetCompatible      = model.IsDatasetCompatible(problemData.Dataset, out evaluationErrorMessage);

            if (!datasetCompatible)
            {
                errorMessage += evaluationErrorMessage;
            }

            return(string.IsNullOrEmpty(errorMessage));
        }
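
// Usage sketch (hedged): validate a model against freshly loaded problem data
// before swapping it into a solution; "model" and "newProblemData" are assumed
// to exist in the calling context.
string errorMessage;
if (!IsProblemDataCompatible(model, newProblemData, out errorMessage))
    Console.WriteLine("Incompatible: " + errorMessage);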
        protected override void itemsListView_DragDrop(object sender, DragEventArgs e)
        {
            if (e.Effect != DragDropEffects.None)
            {
                var droppedData = e.Data.GetData(HeuristicLab.Common.Constants.DragDropDataFormat);
                if (droppedData is IValueParameter)
                {
                    droppedData = ((IValueParameter)droppedData).Value;
                }
                if (droppedData is IRegressionProblem)
                {
                    droppedData = ((IRegressionProblem)droppedData).ProblemData;
                }

                RegressionEnsembleProblemData ensembleProblemData = droppedData as RegressionEnsembleProblemData;
                IRegressionProblemData        problemData         = droppedData as IRegressionProblemData;
                if (ensembleProblemData != null)
                {
                    Content.ProblemData = (RegressionEnsembleProblemData)ensembleProblemData.Clone();
                }
                else if (problemData != null)
                {
                    Content.ProblemData = new RegressionEnsembleProblemData((IRegressionProblemData)problemData.Clone());
                }
            }
        }
        public override IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int numberOfParameters)
        {
            if (pd.Dataset.Rows < MinLeafSize(pd))
            {
                throw new ArgumentException("The number of training instances is too small to create a Gaussian process model");
            }
            Regression.Problem = new RegressionProblem {
                ProblemData = pd
            };
            var cvscore = double.MaxValue;
            GaussianProcessRegressionSolution sol = null;

            for (var i = 0; i < Tries; i++)
            {
                var res   = RegressionTreeUtilities.RunSubAlgorithm(Regression, random.Next(), cancellationToken);
                var t     = res.Select(x => x.Value).OfType <GaussianProcessRegressionSolution>().FirstOrDefault();
                var score = ((DoubleValue)res["Negative log pseudo-likelihood (LOO-CV)"].Value).Value;
                if (score >= cvscore || t == null || double.IsNaN(t.TrainingRSquared))
                {
                    continue;
                }
                cvscore = score;
                sol     = t;
            }
            Regression.Runs.Clear();
            if (sol == null)
            {
                throw new ArgumentException("Could not create Gaussian process model");
            }

            numberOfParameters = pd.Dataset.Rows + 1
                                 + Regression.CovarianceFunction.GetNumberOfParameters(pd.AllowedInputVariables.Count())
                                 + Regression.MeanFunction.GetNumberOfParameters(pd.AllowedInputVariables.Count());
            return(sol.Model);
        }
        private void CalculateFrequencies(List <double> residualValues, Series series)
        {
            double roundedMax, intervalWidth;

            CalculateResidualParameters(residualValues, out roundedMax, out intervalWidth);

            IEnumerable <double>   relevantResiduals = residualValues;
            IRegressionProblemData problemdata       = Content.ProblemData;

            if (series.Name.Equals(TRAINING_SAMPLES))
            {
                relevantResiduals = residualValues.Skip(problemdata.TrainingPartition.Start).Take(problemdata.TrainingPartition.Size);
            }
            else if (series.Name.Equals(TEST_SAMPLES))
            {
                relevantResiduals = residualValues.Skip(problemdata.TestPartition.Start).Take(problemdata.TestPartition.Size);
            }

            double intervalCenter            = intervalWidth / 2.0;
            double sampleCount               = relevantResiduals.Count();
            double current                   = -roundedMax;
            DataPointCollection seriesPoints = series.Points;

            for (int i = 0; i <= bins; i++)
            {
                IEnumerable <double> help = relevantResiduals.Where(x => x >= (current - intervalCenter) && x < (current + intervalCenter));
                seriesPoints.AddXY(current, help.Count() / sampleCount);
                seriesPoints[seriesPoints.Count - 1]["from"] = (current - intervalCenter).ToString();
                seriesPoints[seriesPoints.Count - 1]["to"]   = (current + intervalCenter).ToString();
                current += intervalWidth;
            }
        }
        public static void Run(IRegressionProblemData problemData, IEnumerable <string> allowedInputVariables,
                               string svmType, string kernelType, double cost, double nu, double gamma, double epsilon, int degree,
                               out ISupportVectorMachineModel model, out int nSv)
        {
            var               dataset        = problemData.Dataset;
            string            targetVariable = problemData.TargetVariable;
            IEnumerable <int> rows           = problemData.TrainingIndices;

            svm_parameter parameter = new svm_parameter {
                svm_type    = GetSvmType(svmType),
                kernel_type = GetKernelType(kernelType),
                C           = cost,
                nu          = nu,
                gamma       = gamma,
                p           = epsilon,
                cache_size  = 500,
                probability = 0,
                eps         = 0.001,
                degree      = degree,
                shrinking   = 1,
                coef0       = 0
            };

            svm_problem    problem        = SupportVectorMachineUtil.CreateSvmProblem(dataset, targetVariable, allowedInputVariables, rows);
            RangeTransform rangeTransform = RangeTransform.Compute(problem);
            svm_problem    scaledProblem  = rangeTransform.Scale(problem);
            var            svmModel       = svm.svm_train(scaledProblem, parameter);

            nSv = svmModel.SV.Length;

            model = new SupportVectorMachineModel(svmModel, rangeTransform, targetVariable, allowedInputVariables);
        }
 public static RandomForestModel CreateRandomForestRegressionModel(IRegressionProblemData problemData, int nTrees,
                                                                   double r, double m, int seed,
                                                                   out double rmsError, out double avgRelError, out double outOfBagRmsError, out double outOfBagAvgRelError)
 {
     return(RandomForestModel.CreateRegressionModel(problemData, nTrees, r, m, seed,
                                                    rmsError: out rmsError, avgRelError: out avgRelError, outOfBagRmsError: out outOfBagRmsError, outOfBagAvgRelError: out outOfBagAvgRelError));
 }
    public static ISymbolicExpressionTree Prune(ISymbolicExpressionTree tree, SymbolicRegressionSolutionImpactValuesCalculator impactValuesCalculator, ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, IRegressionProblemData problemData, DoubleLimit estimationLimits, IEnumerable<int> rows, double nodeImpactThreshold = 0.0, bool pruneOnlyZeroImpactNodes = false) {
      var clonedTree = (ISymbolicExpressionTree)tree.Clone();
      var model = new SymbolicRegressionModel(problemData.TargetVariable, clonedTree, interpreter, estimationLimits.Lower, estimationLimits.Upper);
      var nodes = clonedTree.Root.GetSubtree(0).GetSubtree(0).IterateNodesPrefix().ToList(); // skip the nodes corresponding to the ProgramRootSymbol and the StartSymbol

      double qualityForImpactsCalculation = double.NaN; // pass a NaN value initially so the impact calculator will calculate the quality

      for (int i = 0; i < nodes.Count; ++i) {
        var node = nodes[i];
        if (node is ConstantTreeNode) continue;

        double impactValue, replacementValue;
        double newQualityForImpactsCalculation;
        impactValuesCalculator.CalculateImpactAndReplacementValues(model, node, problemData, rows, out impactValue, out replacementValue, out newQualityForImpactsCalculation, qualityForImpactsCalculation);

        if (pruneOnlyZeroImpactNodes && !impactValue.IsAlmost(0.0)) continue;
        if (!pruneOnlyZeroImpactNodes && impactValue > nodeImpactThreshold) continue;

        var constantNode = (ConstantTreeNode)node.Grammar.GetSymbol("Constant").CreateTreeNode();
        constantNode.Value = replacementValue;

        ReplaceWithConstant(node, constantNode);
        i += node.GetLength() - 1; // skip subtrees under the node that was folded

        qualityForImpactsCalculation = newQualityForImpactsCalculation;
      }
      return model.SymbolicExpressionTree;
    }
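
// Usage sketch (hedged): fold low-impact subtrees of a trained symbolic solution
// into constants; "solution", "interpreter" and "problemData" are assumed to be in
// scope, and the estimation limits are taken from the model itself.
var prunedTree = Prune(solution.Model.SymbolicExpressionTree,
                       new SymbolicRegressionSolutionImpactValuesCalculator(),
                       interpreter, problemData,
                       new DoubleLimit(solution.Model.LowerEstimationLimit, solution.Model.UpperEstimationLimit),
                       problemData.TrainingIndices, nodeImpactThreshold: 0.001);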
        public RegressionEnsembleSolution(IEnumerable <IRegressionModel> models, IRegressionProblemData problemData, IEnumerable <IntRange> trainingPartitions, IEnumerable <IntRange> testPartitions)
            : base(new RegressionEnsembleModel(Enumerable.Empty <IRegressionModel>()), new RegressionEnsembleProblemData(problemData))
        {
            this.trainingPartitions  = new Dictionary <IRegressionModel, IntRange>();
            this.testPartitions      = new Dictionary <IRegressionModel, IntRange>();
            this.regressionSolutions = new ItemCollection <IRegressionSolution>();

            List <IRegressionSolution> solutions = new List <IRegressionSolution>();
            var modelEnumerator             = models.GetEnumerator();
            var trainingPartitionEnumerator = trainingPartitions.GetEnumerator();
            var testPartitionEnumerator     = testPartitions.GetEnumerator();

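            // non-short-circuiting '&' (and '|' below) so that all three enumerators advance in lockstep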
            while (modelEnumerator.MoveNext() & trainingPartitionEnumerator.MoveNext() & testPartitionEnumerator.MoveNext())
            {
                var p = (IRegressionProblemData)problemData.Clone();
                p.TrainingPartition.Start = trainingPartitionEnumerator.Current.Start;
                p.TrainingPartition.End   = trainingPartitionEnumerator.Current.End;
                p.TestPartition.Start     = testPartitionEnumerator.Current.Start;
                p.TestPartition.End       = testPartitionEnumerator.Current.End;

                solutions.Add(modelEnumerator.Current.CreateRegressionSolution(p));
            }
            if (modelEnumerator.MoveNext() | trainingPartitionEnumerator.MoveNext() | testPartitionEnumerator.MoveNext())
            {
                throw new ArgumentException();
            }

            trainingEvaluationCache = new Dictionary <int, double>(problemData.TrainingIndices.Count());
            testEvaluationCache     = new Dictionary <int, double>(problemData.TestIndices.Count());

            RegisterRegressionSolutionsEventHandler();
            regressionSolutions.AddRange(solutions);
        }
        public override IRegressionModel Build(IRegressionProblemData pd, IRandom random,
                                               CancellationToken cancellationToken, out int numberOfParameters)
        {
            var pca    = PrincipleComponentTransformation.CreateProjection(pd.Dataset, pd.TrainingIndices, pd.AllowedInputVariables, normalize: true);
            var pcdata = pca.TransformProblemData(pd);
            ComponentReducedLinearModel bestModel = null;
            var bestCvrmse = double.MaxValue;

            numberOfParameters = 1;
            for (var i = 1; i <= Math.Min(NumberOfComponents, pd.AllowedInputVariables.Count()); i++)
            {
                var pd2    = (IRegressionProblemData)pcdata.Clone();
                var inputs = new HashSet <string>(pca.ComponentNames.Take(i));
                foreach (var v in pd2.InputVariables.CheckedItems.ToArray())
                {
                    pd2.InputVariables.SetItemCheckedState(v.Value, inputs.Contains(v.Value.Value));
                }
                double rmse;
                var    model = PreconstructedLinearModel.CreateLinearModel(pd2, out rmse);
                if (rmse > bestCvrmse)
                {
                    continue;
                }
                bestModel          = new ComponentReducedLinearModel(pd2.TargetVariable, model, pca);
                numberOfParameters = i + 1;
                bestCvrmse         = rmse;
            }
            return(bestModel);
        }
 // wrap an actual model in a surrogate
 public GradientBoostedTreesModelSurrogate(IRegressionProblemData trainingProblemData, uint seed,
                                           ILossFunction lossFunction, int iterations, int maxSize, double r, double m, double nu,
                                           IGradientBoostedTreesModel model)
     : this(trainingProblemData, seed, lossFunction, iterations, maxSize, r, m, nu)
 {
     this.actualModel = model;
 }
 public RegressionEnsembleProblemData(IRegressionProblemData regressionProblemData)
   : base(regressionProblemData.Dataset, regressionProblemData.AllowedInputVariables, regressionProblemData.TargetVariable) {
   TrainingPartition.Start = regressionProblemData.TrainingPartition.Start;
   TrainingPartition.End = regressionProblemData.TrainingPartition.End;
   TestPartition.Start = regressionProblemData.TestPartition.Start;
   TestPartition.End = regressionProblemData.TestPartition.End;
 }
        private static PreconstructedLinearModel ClassicCalculation(IRegressionProblemData pd)
        {
            var inputMatrix = pd.Dataset.ToArray(pd.AllowedInputVariables.Concat(new[] {
                pd.TargetVariable
            }), pd.AllIndices);

            var nFeatures = inputMatrix.GetLength(1) - 1;

            double[] coefficients;

            alglib.linearmodel lm;
            alglib.lrreport    ar;
            int retVal;

            alglib.lrbuild(inputMatrix, inputMatrix.GetLength(0), nFeatures, out retVal, out lm, out ar);
            if (retVal != 1)
            {
                throw new ArgumentException("Error in calculation of linear regression solution");
            }

            alglib.lrunpack(lm, out coefficients, out nFeatures);
            var coeffs = pd.AllowedInputVariables.Zip(coefficients, (s, d) => new { s, d }).ToDictionary(x => x.s, x => x.d);
            var res    = new PreconstructedLinearModel(coeffs, coefficients[nFeatures], pd.TargetVariable);

            return(res);
        }
        public static double[][] CalculateModelCoefficients(IRegressionProblemData problemData, double penalty, double[] lambda,
                                                            out double[] trainNMSEs, out double[] testNMSEs,
                                                            double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
                                                            int maxVars            = -1)
        {
            // run for multiple user-supplied lambdas
            double[,] coeff;
            double[] intercept;
            RunElasticNetLinearRegression(problemData, penalty, lambda.Length, 1.0, lambda, out lambda, out trainNMSEs, out testNMSEs, out coeff, out intercept, coeffLowerBound, coeffUpperBound, maxVars);

            int nRows = intercept.Length;
            int nCols = coeff.GetLength(1) + 1;

            double[][] sols = new double[nRows][];
            for (int solIdx = 0; solIdx < nRows; solIdx++)
            {
                sols[solIdx] = new double[nCols];
                for (int cIdx = 0; cIdx < nCols - 1; cIdx++)
                {
                    sols[solIdx][cIdx] = coeff[solIdx, cIdx];
                }
                sols[solIdx][nCols - 1] = intercept[solIdx];
            }
            return(sols);
        }
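
// Usage sketch (hedged): solve the model for a few user-supplied lambdas;
// penalty = 1.0 is assumed to select the pure lasso end of the elastic-net
// trade-off, following the usual glmnet convention.
double[] trainNMSEs, testNMSEs;
var lambdas = new[] { 1.0, 0.1, 0.01 };
var paths = CalculateModelCoefficients(problemData, 1.0, lambdas, out trainNMSEs, out testNMSEs);
// paths[i] holds the coefficients for lambdas[i], with the intercept in the last slot.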
        public RegressionEnsembleSolution(IRegressionEnsembleModel model, IRegressionProblemData problemData)
            : base(model, new RegressionEnsembleProblemData(problemData))
        {
            trainingPartitions  = new Dictionary <IRegressionModel, IntRange>();
            testPartitions      = new Dictionary <IRegressionModel, IntRange>();
            regressionSolutions = new ItemCollection <IRegressionSolution>();

            evaluationCache         = new Dictionary <int, double>(problemData.Dataset.Rows);
            trainingEvaluationCache = new Dictionary <int, double>(problemData.TrainingIndices.Count());
            testEvaluationCache     = new Dictionary <int, double>(problemData.TestIndices.Count());


            var solutions = model.Models.Select(m => m.CreateRegressionSolution((IRegressionProblemData)problemData.Clone()));

            foreach (var solution in solutions)
            {
                regressionSolutions.Add(solution);
                trainingPartitions.Add(solution.Model, solution.ProblemData.TrainingPartition);
                testPartitions.Add(solution.Model, solution.ProblemData.TestPartition);
            }

            RecalculateResults();
            RegisterModelEvents();
            RegisterRegressionSolutionsEventHandler();
        }
        public static IRegressionSolution CreateSymbolicSolution(double[] coeff, IRegressionProblemData problemData)
        {
            var ds                       = problemData.Dataset;
            var allVariables             = problemData.AllowedInputVariables.ToArray();
            var doubleVariables          = allVariables.Where(ds.VariableHasType <double>);
            var factorVariableNames      = allVariables.Where(ds.VariableHasType <string>);
            var factorVariablesAndValues = ds.GetFactorVariableValues(factorVariableNames, Enumerable.Range(0, ds.Rows)); // must consider all factor values (in train and test set)

            List <KeyValuePair <string, IEnumerable <string> > > remainingFactorVariablesAndValues = new List <KeyValuePair <string, IEnumerable <string> > >();
            List <double> factorCoeff = new List <double>();
            List <string> remainingDoubleVariables = new List <string>();
            List <double> doubleVarCoeff           = new List <double>();

            {
                int i = 0;
                // find factor variables & value combinations with non-zero coeff
                foreach (var factorVarAndValues in factorVariablesAndValues)
                {
                    var l = new List <string>();
                    foreach (var factorValue in factorVarAndValues.Value)
                    {
                        if (!coeff[i].IsAlmost(0.0))
                        {
                            l.Add(factorValue);
                            factorCoeff.Add(coeff[i]);
                        }
                        i++;
                    }
                    if (l.Any())
                    {
                        remainingFactorVariablesAndValues.Add(new KeyValuePair <string, IEnumerable <string> >(factorVarAndValues.Key, l));
                    }
                }
                // find double variables with non-zero coeff
                foreach (var doubleVar in doubleVariables)
                {
                    if (!coeff[i].IsAlmost(0.0))
                    {
                        remainingDoubleVariables.Add(doubleVar);
                        doubleVarCoeff.Add(coeff[i]);
                    }
                    i++;
                }
            }
            var tree = LinearModelToTreeConverter.CreateTree(
                remainingFactorVariablesAndValues, factorCoeff.ToArray(),
                remainingDoubleVariables.ToArray(), doubleVarCoeff.ToArray(),
                coeff.Last());


            SymbolicRegressionSolution solution = new SymbolicRegressionSolution(
                new SymbolicRegressionModel(problemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeInterpreter()),
                (IRegressionProblemData)problemData.Clone());

            solution.Model.Name = "Elastic-net Linear Regression Model";
            solution.Name       = "Elastic-net Linear Regression Solution";

            return(solution);
        }
 private static RandomForestRegressionSolution GridSearch(IRegressionProblemData problemData, out RFParameter bestParameters, int seed = 3141519) {
   double rmsError, outOfBagRmsError, avgRelError, outOfBagAvgRelError;
   var random = new MersenneTwister();
   bestParameters = RandomForestUtil.GridSearch(problemData, randomForestParameterRanges, seed, maximumDegreeOfParallelism);
   var model = RandomForestModel.CreateRegressionModel(problemData, problemData.TrainingIndices, bestParameters.N, bestParameters.R, bestParameters.M, seed,
                                                       out rmsError, out outOfBagRmsError, out avgRelError, out outOfBagAvgRelError);
   return (RandomForestRegressionSolution)model.CreateRegressionSolution(problemData);
 }
        public static RandomForestModelFull CreateRandomForestRegressionModel(IRegressionProblemData problemData, int nTrees,
                                                                              double r, double m, int seed,
                                                                              out double rmsError, out double avgRelError, out double outOfBagRmsError, out double outOfBagAvgRelError)
        {
            var model = CreateRandomForestRegressionModel(problemData, problemData.TrainingIndices, nTrees, r, m, seed, out rmsError, out avgRelError, out outOfBagRmsError, out outOfBagAvgRelError);

            return(model);
        }
 public static INearestNeighbourModel Train(IRegressionProblemData problemData, int k)
 {
     return(new NearestNeighbourModel(problemData.Dataset,
                                      problemData.TrainingIndices,
                                      k,
                                      problemData.TargetVariable,
                                      problemData.AllowedInputVariables));
 }
 public RegressionProblemData(IRegressionProblemData regressionProblemData)
     : this(regressionProblemData.Dataset, regressionProblemData.AllowedInputVariables, regressionProblemData.TargetVariable)
 {
     TrainingPartition.Start = regressionProblemData.TrainingPartition.Start;
     TrainingPartition.End   = regressionProblemData.TrainingPartition.End;
     TestPartition.Start     = regressionProblemData.TestPartition.Start;
     TestPartition.End       = regressionProblemData.TestPartition.End;
 }
        public void ConstantModelVariableImpactTest()
        {
            IRegressionProblemData      problemData     = LoadDefaultTowerProblem();
            IRegressionModel            model           = new ConstantModel(5, "y");
            IRegressionSolution         solution        = new RegressionSolution(model, problemData);
            Dictionary <string, double> expectedImpacts = GetExpectedValuesForConstantModel();

            CheckDefaultAsserts(solution, expectedImpacts);
        }
    public static double[] Calculate(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, ISymbolicExpressionTree solution, double lowerEstimationLimit, double upperEstimationLimit, IRegressionProblemData problemData, IEnumerable<int> rows, bool applyLinearScaling, int decimalPlaces) {
      var mse = SymbolicRegressionSingleObjectiveMeanSquaredErrorEvaluator.Calculate(interpreter, solution, lowerEstimationLimit,
        upperEstimationLimit, problemData, rows, applyLinearScaling);

      if (decimalPlaces >= 0)
        mse = Math.Round(mse, decimalPlaces);

      return new double[2] { mse, solution.Length };
    }
 private static SupportVectorRegressionSolution SvmGridSearch(IRegressionProblemData problemData, out svm_parameter bestParameters, out int nSv, out double cvMse) {
   bestParameters = SupportVectorMachineUtil.GridSearch(out cvMse, problemData, svmParameterRanges, numberOfFolds, shuffleFolds, maximumDegreeOfParallelism);
   double trainingError, testError;
   string svmType = svmTypes[bestParameters.svm_type];
   string kernelType = kernelTypes[bestParameters.kernel_type];
   var svm_solution = SupportVectorRegression.CreateSupportVectorRegressionSolution(problemData, problemData.AllowedInputVariables, svmType, kernelType,
                      bestParameters.C, bestParameters.nu, bestParameters.gamma, bestParameters.eps, bestParameters.degree, out trainingError, out testError, out nSv);
   return svm_solution;
 }
        private static IScope InitializeScope(IRandom random, IRegressionProblemData problemData, IPruning pruning, int minLeafSize, ILeafModel leafModel, ISplitter splitter, bool generateRules, bool useHoldout, double holdoutSize)
        {
            var stateScope = new Scope("RegressionTreeStateScope");

            // reduce the RegressionProblemData column-wise to AllowedInput & Target and row-wise to the TrainingSet
            var doubleVars = new HashSet <string>(problemData.Dataset.DoubleVariables);
            var vars       = problemData.AllowedInputVariables.Concat(new[] { problemData.TargetVariable }).ToArray();

            if (vars.Any(v => !doubleVars.Contains(v)))
            {
                throw new NotSupportedException("Decision tree regression supports only double valued input or output features.");
            }
            var doubles = vars.Select(v => problemData.Dataset.GetDoubleValues(v, problemData.TrainingIndices).ToArray()).ToArray();

            if (doubles.Any(v => v.Any(x => double.IsNaN(x) || double.IsInfinity(x))))
            {
                throw new NotSupportedException("Decision tree regression does not support NaN or infinity values in the input dataset.");
            }
            var trainingData = new Dataset(vars, doubles);
            var pd           = new RegressionProblemData(trainingData, problemData.AllowedInputVariables, problemData.TargetVariable);

            pd.TrainingPartition.End   = pd.TestPartition.Start = pd.TestPartition.End = pd.Dataset.Rows;
            pd.TrainingPartition.Start = 0;

            //store regression tree parameters
            var regressionTreeParams = new RegressionTreeParameters(pruning, minLeafSize, leafModel, pd, random, splitter);

            stateScope.Variables.Add(new Variable(RegressionTreeParameterVariableName, regressionTreeParams));

            //initialize tree operators
            pruning.Initialize(stateScope);
            splitter.Initialize(stateScope);
            leafModel.Initialize(stateScope);

            //store unbuilt model
            IItem model;

            if (generateRules)
            {
                model = RegressionRuleSetModel.CreateRuleModel(problemData.TargetVariable, regressionTreeParams);
                RegressionRuleSetModel.Initialize(stateScope);
            }
            else
            {
                model = RegressionNodeTreeModel.CreateTreeModel(problemData.TargetVariable, regressionTreeParams);
            }
            stateScope.Variables.Add(new Variable(ModelVariableName, model));

            //store training & pruning indices
            IReadOnlyList <int> trainingSet, pruningSet;

            GeneratePruningSet(pd.TrainingIndices.ToArray(), random, useHoldout, holdoutSize, out trainingSet, out pruningSet);
            stateScope.Variables.Add(new Variable(TrainingSetVariableName, new IntArray(trainingSet.ToArray())));
            stateScope.Variables.Add(new Variable(PruningSetVariableName, new IntArray(pruningSet.ToArray())));

            return(stateScope);
        }
 private RegressionTreeParameters(RegressionTreeParameters original, Cloner cloner) : base(original, cloner)
 {
     problemData = cloner.Clone(original.problemData);
     random      = cloner.Clone(original.random);
     leafModel   = cloner.Clone(original.leafModel);
     splitter    = cloner.Clone(original.splitter);
     pruning     = cloner.Clone(original.pruning);
     minLeafSize = original.minLeafSize;
 }
    private static RandomForestRegressionSolution GridSearchWithCrossvalidation(IRegressionProblemData problemData, out RFParameter bestParameters, int seed = 3141519)
    {
        double rmsError, outOfBagRmsError, avgRelError, outOfBagAvgRelError;

        bestParameters = RandomForestUtil.GridSearch(problemData, numberOfFolds, shuffleFolds, randomForestParameterRanges, seed, maximumDegreeOfParallelism);
        var model = RandomForestModel.CreateRegressionModel(problemData, problemData.TrainingIndices, bestParameters.N, bestParameters.R, bestParameters.M, seed, out rmsError, out outOfBagRmsError, out avgRelError, out outOfBagAvgRelError);

        return((RandomForestRegressionSolution)model.CreateRegressionSolution(problemData));
    }
 public override IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int numberOfParameters)
 {
     if (pd.Dataset.Rows < MinLeafSize(pd))
     {
         throw new ArgumentException("The number of training instances is too small to create a linear model");
     }
     numberOfParameters = 1;
     return(new PreconstructedLinearModel(pd.Dataset.GetDoubleValues(pd.TargetVariable).Average(), pd.TargetVariable));
 }
 // create only the surrogate model without an actual model
 public GradientBoostedTreesModelSurrogate(IRegressionProblemData trainingProblemData, uint seed, ILossFunction lossFunction, int iterations, int maxSize, double r, double m, double nu)
   : base("Gradient boosted tree model", string.Empty) {
   this.trainingProblemData = trainingProblemData;
   this.seed = seed;
   this.lossFunction = lossFunction;
   this.iterations = iterations;
   this.maxSize = maxSize;
   this.r = r;
   this.m = m;
   this.nu = nu;
 }
    public override double Evaluate(IExecutionContext context, ISymbolicExpressionTree tree, IRegressionProblemData problemData, IEnumerable<int> rows) {
      SymbolicDataAnalysisTreeInterpreterParameter.ExecutionContext = context;
      EstimationLimitsParameter.ExecutionContext = context;

      double mlr = Calculate(SymbolicDataAnalysisTreeInterpreterParameter.ActualValue, tree, EstimationLimitsParameter.ActualValue.Lower, EstimationLimitsParameter.ActualValue.Upper, problemData, rows);

      SymbolicDataAnalysisTreeInterpreterParameter.ExecutionContext = null;
      EstimationLimitsParameter.ExecutionContext = null;

      return mlr;
    }
    public override double[] Evaluate(IExecutionContext context, ISymbolicExpressionTree tree, IRegressionProblemData problemData, IEnumerable<int> rows) {
      SymbolicDataAnalysisTreeInterpreterParameter.ExecutionContext = context;
      EstimationLimitsParameter.ExecutionContext = context;
      ApplyLinearScalingParameter.ExecutionContext = context;

      double[] quality = Calculate(SymbolicDataAnalysisTreeInterpreterParameter.ActualValue, tree, EstimationLimitsParameter.ActualValue.Lower, EstimationLimitsParameter.ActualValue.Upper, problemData, rows, ApplyLinearScalingParameter.ActualValue.Value, DecimalPlaces);

      SymbolicDataAnalysisTreeInterpreterParameter.ExecutionContext = null;
      EstimationLimitsParameter.ExecutionContext = null;
      ApplyLinearScalingParameter.ExecutionContext = null;

      return quality;
    }
    private GradientBoostedTreesModelSurrogate(GradientBoostedTreesModelSurrogate original, Cloner cloner)
      : base(original, cloner) {
      if (original.actualModel != null) this.actualModel = cloner.Clone(original.actualModel);

      this.trainingProblemData = cloner.Clone(original.trainingProblemData);
      this.lossFunction = cloner.Clone(original.lossFunction);
      this.seed = original.seed;
      this.iterations = original.iterations;
      this.maxSize = original.maxSize;
      this.r = original.r;
      this.m = original.m;
      this.nu = original.nu;
    }
    public static double Calculate(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, ISymbolicExpressionTree solution, double lowerEstimationLimit, double upperEstimationLimit, IRegressionProblemData problemData, IEnumerable<int> rows) {
      IEnumerable<double> estimatedValues = interpreter.GetSymbolicExpressionTreeValues(solution, problemData.Dataset, rows);
      IEnumerable<double> targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
      IEnumerable<double> boundedEstimatedValues = estimatedValues.LimitToRange(lowerEstimationLimit, upperEstimationLimit);

      var logRes = boundedEstimatedValues.Zip(targetValues, (e, t) => Math.Log(1.0 + Math.Abs(e - t)));

      OnlineCalculatorError errorState;
      OnlineCalculatorError varErrorState;
      double mlr;
      double variance;
      OnlineMeanAndVarianceCalculator.Calculate(logRes, out mlr, out variance, out errorState, out varErrorState);
      if (errorState != OnlineCalculatorError.None) return double.NaN;
      return mlr;
    }
    public static double Calculate(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, ISymbolicExpressionTree solution, double lowerEstimationLimit, double upperEstimationLimit, IRegressionProblemData problemData, IEnumerable<int> rows, bool applyLinearScaling) {
      IEnumerable<double> estimatedValues = interpreter.GetSymbolicExpressionTreeValues(solution, problemData.Dataset, rows);
      IEnumerable<double> targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
      OnlineCalculatorError errorState;

      double mse;
      if (applyLinearScaling) {
        var mseCalculator = new OnlineMeanSquaredErrorCalculator();
        CalculateWithScaling(targetValues, estimatedValues, lowerEstimationLimit, upperEstimationLimit, mseCalculator, problemData.Dataset.Rows);
        errorState = mseCalculator.ErrorState;
        mse = mseCalculator.MeanSquaredError;
      } else {
        IEnumerable<double> boundedEstimatedValues = estimatedValues.LimitToRange(lowerEstimationLimit, upperEstimationLimit);
        mse = OnlineMeanSquaredErrorCalculator.Calculate(targetValues, boundedEstimatedValues, out errorState);
      }
      if (errorState != OnlineCalculatorError.None) return double.NaN;
      return mse;
    }
    // prepare and allocate buffer variables in ctor
    public RegressionTreeBuilder(IRegressionProblemData problemData, IRandom random) {
      this.problemData = problemData;
      this.random = random;

      var rows = problemData.TrainingIndices.Count();

      this.nCols = problemData.AllowedInputVariables.Count();

      allowedVariables = problemData.AllowedInputVariables.ToArray();
      varName2Index = new Dictionary<string, int>(allowedVariables.Length);
      for (int i = 0; i < allowedVariables.Length; i++) varName2Index.Add(allowedVariables[i], i);

      sortedIdxAll = new int[nCols][];
      sortedIdx = new int[nCols][];
      sumImprovements = new Dictionary<string, double>();
      internalIdx = new int[rows];
      which = new int[rows];
      leftTmp = new int[rows];
      rightTmp = new int[rows];
      outx = new double[rows];
      outSortedIdx = new int[rows];
      queue = new List<PartitionSplits>(100);

      x = new double[nCols][];
      originalY = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices).ToArray();
      y = new double[originalY.Length];
      Array.Copy(originalY, y, y.Length); // copy values (originalY is fixed, y is changed in gradient boosting)
      curPred = Enumerable.Repeat(0.0, y.Length).ToArray(); // zeros

      int col = 0;
      foreach (var inputVariable in problemData.AllowedInputVariables) {
        x[col] = problemData.Dataset.GetDoubleValues(inputVariable, problemData.TrainingIndices).ToArray();
        sortedIdxAll[col] = Enumerable.Range(0, rows).OrderBy(r => x[col][r]).ToArray();
        sortedIdx[col] = new int[rows];
        col++;
      }
    }
 protected RegressionSolutionBase(IRegressionModel model, IRegressionProblemData problemData)
   : base(model, problemData) {
   Add(new Result(TrainingMeanSquaredErrorResultName, TrainingMeanSquaredErrorResultDescription, new DoubleValue()));
   Add(new Result(TestMeanSquaredErrorResultName, TestMeanSquaredErrorResultDescription, new DoubleValue()));
   Add(new Result(TrainingMeanAbsoluteErrorResultName, TrainingMeanAbsoluteErrorResultDescription, new DoubleValue()));
   Add(new Result(TestMeanAbsoluteErrorResultName, TestMeanAbsoluteErrorResultDescription, new DoubleValue()));
   Add(new Result(TrainingSquaredCorrelationResultName, TrainingSquaredCorrelationResultDescription, new DoubleValue()));
   Add(new Result(TestSquaredCorrelationResultName, TestSquaredCorrelationResultDescription, new DoubleValue()));
   Add(new Result(TrainingRelativeErrorResultName, TrainingRelativeErrorResultDescription, new PercentValue()));
   Add(new Result(TestRelativeErrorResultName, TestRelativeErrorResultDescription, new PercentValue()));
   Add(new Result(TrainingNormalizedMeanSquaredErrorResultName, TrainingNormalizedMeanSquaredErrorResultDescription, new DoubleValue()));
   Add(new Result(TestNormalizedMeanSquaredErrorResultName, TestNormalizedMeanSquaredErrorResultDescription, new DoubleValue()));
   Add(new Result(TrainingRootMeanSquaredErrorResultName, TrainingRootMeanSquaredErrorResultDescription, new DoubleValue()));
   Add(new Result(TestRootMeanSquaredErrorResultName, TestRootMeanSquaredErrorResultDescription, new DoubleValue()));
 }
 public IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
   return new ConstantRegressionSolution(new ConstantModel(constant), new RegressionProblemData(problemData));
 }
    public override double Evaluate(IExecutionContext context, ISymbolicExpressionTree tree, IRegressionProblemData problemData, IEnumerable<int> rows) {
      SymbolicDataAnalysisTreeInterpreterParameter.ExecutionContext = context;
      EstimationLimitsParameter.ExecutionContext = context;
      ApplyLinearScalingParameter.ExecutionContext = context;

      // Pearson R² evaluator is used on purpose instead of the const-opt evaluator, 
      // because Evaluate() is used to get the quality of evolved models on 
      // different partitions of the dataset (e.g., best validation model)
      double r2 = SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate(SymbolicDataAnalysisTreeInterpreterParameter.ActualValue, tree, EstimationLimitsParameter.ActualValue.Lower, EstimationLimitsParameter.ActualValue.Upper, problemData, rows, ApplyLinearScalingParameter.ActualValue.Value);

      SymbolicDataAnalysisTreeInterpreterParameter.ExecutionContext = null;
      EstimationLimitsParameter.ExecutionContext = null;
      ApplyLinearScalingParameter.ExecutionContext = null;

      return r2;
    }
 public static double CalculateQualityForImpacts(ISymbolicRegressionModel model, IRegressionProblemData problemData, IEnumerable<int> rows) {
   var estimatedValues = model.GetEstimatedValues(problemData.Dataset, rows); // also bounds the values
   var targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
   OnlineCalculatorError errorState;
   var r = OnlinePearsonsRCalculator.Calculate(targetValues, estimatedValues, out errorState);
   var quality = r * r;
   if (errorState != OnlineCalculatorError.None) return double.NaN;
   return quality;
 }
 public static IGaussianProcessModel Create(IRegressionProblemData problemData, double[] hyperparameter, IMeanFunction meanFunction, ICovarianceFunction covarianceFunction, bool scaleInputs = true) {
   return new GaussianProcessModel(problemData.Dataset, problemData.TargetVariable, problemData.AllowedInputVariables, problemData.TrainingIndices, hyperparameter, meanFunction, covarianceFunction, scaleInputs);
 }
 private static bool TrySetProblemData(IAlgorithm alg, IRegressionProblemData problemData) {
   var prob = alg.Problem as IRegressionProblem;
   // there is already a problem and it is compatible -> just set problem data
   if (prob != null) {
     prob.ProblemDataParameter.Value = problemData;
     return true;
   } else return false;
 }
    private static ISymbolicRegressionSolution CreateSymbolicSolution(List<IRegressionModel> models, double nu, IRegressionProblemData problemData) {
      var symbModels = models.OfType<ISymbolicRegressionModel>();
      var lowerLimit = symbModels.Min(m => m.LowerEstimationLimit);
      var upperLimit = symbModels.Max(m => m.UpperEstimationLimit);
      var interpreter = new SymbolicDataAnalysisExpressionTreeLinearInterpreter();
      var progRootNode = new ProgramRootSymbol().CreateTreeNode();
      var startNode = new StartSymbol().CreateTreeNode();

      var addNode = new Addition().CreateTreeNode();
      var mulNode = new Multiplication().CreateTreeNode();
      var scaleNode = (ConstantTreeNode)new Constant().CreateTreeNode(); // all models are scaled using the same nu
      scaleNode.Value = nu;

      foreach (var m in symbModels) {
        var relevantPart = m.SymbolicExpressionTree.Root.GetSubtree(0).GetSubtree(0); // skip root and start
        addNode.AddSubtree((ISymbolicExpressionTreeNode)relevantPart.Clone());
      }

      mulNode.AddSubtree(addNode);
      mulNode.AddSubtree(scaleNode);
      startNode.AddSubtree(mulNode);
      progRootNode.AddSubtree(startNode);
      var t = new SymbolicExpressionTree(progRootNode);
      var combinedModel = new SymbolicRegressionModel(problemData.TargetVariable, t, interpreter, lowerLimit, upperLimit);
      var sol = new SymbolicRegressionSolution(combinedModel, problemData);
      return sol;
    }
    private static IRegressionEnsembleSolution CreateEnsembleSolution(List<IRegressionModel> models,
      IRegressionProblemData problemData) {
      var rows = problemData.TrainingPartition.Size;
      var features = models.Count;
      double[,] inputMatrix = new double[rows, features + 1];

      //add model estimates
      for (int m = 0; m < models.Count; m++) {
        var model = models[m];
        var estimates = model.GetEstimatedValues(problemData.Dataset, problemData.TrainingIndices);
        int estimatesCounter = 0;
        foreach (var estimate in estimates) {
          inputMatrix[estimatesCounter, m] = estimate;
          estimatesCounter++;
        }
      }

      // add target
      var targets = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices);
      int targetCounter = 0;
      foreach (var target in targets) {
        inputMatrix[targetCounter, models.Count] = target;
        targetCounter++;
      }

      alglib.linearmodel lm = new alglib.linearmodel();
      alglib.lrreport ar = new alglib.lrreport();
      double[] coefficients;
      int retVal = 1;
      alglib.lrbuildz(inputMatrix, rows, features, out retVal, out lm, out ar);
      if (retVal != 1) throw new ArgumentException("Error in calculation of linear regression solution");

      alglib.lrunpack(lm, out coefficients, out features);

      var ensembleModel = new RegressionEnsembleModel(models, coefficients.Take(models.Count)) { AverageModelEstimates = false };
      var ensembleSolution = (IRegressionEnsembleSolution)ensembleModel.CreateRegressionSolution(problemData);
      return ensembleSolution;
    }
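Note that alglib.lrbuildz fits a linear model with the constant term constrained to zero, so the unpacked coefficients act as pure per-model weights; with AverageModelEstimates = false the ensemble prediction is the weighted sum of the base models' estimates rather than their average.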
 public override IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
   return new GaussianProcessRegressionSolution(this, new RegressionProblemData(problemData));
 }
 public INeuralNetworkRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
   return new NeuralNetworkRegressionSolution(new RegressionProblemData(problemData), this);
 }
 public static double[] Calculate(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, ISymbolicExpressionTree solution, double lowerEstimationLimit, double upperEstimationLimit, IRegressionProblemData problemData, IEnumerable<int> rows, bool applyLinearScaling, int decimalPlaces) {
   double r2 = SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate(interpreter, solution, lowerEstimationLimit, upperEstimationLimit, problemData, rows, applyLinearScaling);
   if (decimalPlaces >= 0)
     r2 = Math.Round(r2, decimalPlaces);
   return new double[2] { r2, solution.IterateNodesPostfix().OfType<VariableTreeNode>().Count() }; // count the number of variables
 }
 // for custom stepping & termination
 public static IGbmState CreateGbmState(IRegressionProblemData problemData, ILossFunction lossFunction, uint randSeed, int maxSize = 3, double r = 0.66, double m = 0.5, double nu = 0.01) {
   return new GbmState(problemData, lossFunction, randSeed, maxSize, r, m, nu);
 }
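The factory above supports hand-rolled training loops; a sketch of custom stepping with early termination (EvaluateOnValidationSet is a hypothetical helper, everything else mirrors TrainGbm below):

 var state = (GbmState)CreateGbmState(problemData, lossFunction, randSeed: 31415);
 double bestQuality = double.NegativeInfinity;
 for (int iter = 0; iter < maxIterations; iter++) {
   MakeStep(state);
   // stop as soon as an externally computed validation quality stops improving
   double quality = EvaluateOnValidationSet(state.GetModel(), problemData); // hypothetical helper
   if (quality <= bestQuality) break;
   bestQuality = quality;
 }
 var model = state.GetModel();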
    // simple interface
    public static GradientBoostedTreesSolution TrainGbm(IRegressionProblemData problemData, ILossFunction lossFunction, int maxSize, double nu, double r, double m, int maxIterations, uint randSeed = 31415) {
      Contract.Assert(r > 0);
      Contract.Assert(r <= 1.0);
      Contract.Assert(nu > 0);
      Contract.Assert(nu <= 1.0);

      var state = (GbmState)CreateGbmState(problemData, lossFunction, randSeed, maxSize, r, m, nu);

      for (int iter = 0; iter < maxIterations; iter++) {
        MakeStep(state);
      }

      var model = state.GetModel();
      return new GradientBoostedTreesSolution(model, (IRegressionProblemData)problemData.Clone());
    }
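A hedged end-to-end example (SquaredErrorLoss is assumed to be one of the available ILossFunction implementations):

 // Train 100 boosting iterations with conservative shrinkage.
 var solution = TrainGbm(problemData, new SquaredErrorLoss(),
                         maxSize: 10, nu: 0.01, r: 0.66, m: 0.5, maxIterations: 100);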
 public static double[] Calculate(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, ISymbolicExpressionTree solution, double lowerEstimationLimit, double upperEstimationLimit, IRegressionProblemData problemData, IEnumerable<int> rows, bool applyLinearScaling, int decimalPlaces) {
   double r2 = SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate(interpreter, solution, lowerEstimationLimit, upperEstimationLimit, problemData, rows, applyLinearScaling);
   if (decimalPlaces >= 0)
     r2 = Math.Round(r2, decimalPlaces);
   return new double[2] { r2, SymbolicDataAnalysisModelComplexityCalculator.CalculateComplexity(solution) };
 }
Example #58
 public override IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) {
   return new ConstantRegressionSolution(this, new RegressionProblemData(problemData));
 }
    public static double OptimizeConstants(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, ISymbolicExpressionTree tree, IRegressionProblemData problemData, IEnumerable<int> rows, bool applyLinearScaling, int maxIterations, bool updateVariableWeights = true, double lowerEstimationLimit = double.MinValue, double upperEstimationLimit = double.MaxValue, bool updateConstantsInTree = true) {

      List<AutoDiff.Variable> variables = new List<AutoDiff.Variable>();
      List<AutoDiff.Variable> parameters = new List<AutoDiff.Variable>();
      List<string> variableNames = new List<string>();

      AutoDiff.Term func;
      if (!TryTransformToAutoDiff(tree.Root.GetSubtree(0), variables, parameters, variableNames, updateVariableWeights, out func))
        throw new NotSupportedException("Could not optimize constants of the symbolic expression tree because it contains unsupported symbols.");
      if (variableNames.Count == 0) return 0.0;

      AutoDiff.IParametricCompiledTerm compiledFunc = func.Compile(variables.ToArray(), parameters.ToArray());

      List<SymbolicExpressionTreeTerminalNode> terminalNodes = null;
      if (updateVariableWeights)
        terminalNodes = tree.Root.IterateNodesPrefix().OfType<SymbolicExpressionTreeTerminalNode>().ToList();
      else
        terminalNodes = new List<SymbolicExpressionTreeTerminalNode>(tree.Root.IterateNodesPrefix().OfType<ConstantTreeNode>());

      // extract initial constants
      double[] c = new double[variables.Count];
      {
        // c[0] and c[1] hold the linear scaling parameters (offset and slope);
        // the tree's own constants start at index 2 (cf. the Skip(2) calls below)
        c[0] = 0.0;
        c[1] = 1.0;
        int i = 2;
        foreach (var node in terminalNodes) {
          ConstantTreeNode constantTreeNode = node as ConstantTreeNode;
          VariableTreeNode variableTreeNode = node as VariableTreeNode;
          if (constantTreeNode != null)
            c[i++] = constantTreeNode.Value;
          else if (updateVariableWeights && variableTreeNode != null)
            c[i++] = variableTreeNode.Weight;
        }
      }
      double[] originalConstants = (double[])c.Clone();
      double originalQuality = SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate(interpreter, tree, lowerEstimationLimit, upperEstimationLimit, problemData, rows, applyLinearScaling);

      alglib.lsfitstate state;
      alglib.lsfitreport rep;
      int info;

      IDataset ds = problemData.Dataset;
      double[,] x = new double[rows.Count(), variableNames.Count];
      int row = 0;
      foreach (var r in rows) {
        for (int col = 0; col < variableNames.Count; col++) {
          x[row, col] = ds.GetDoubleValue(variableNames[col], r);
        }
        row++;
      }
      double[] y = ds.GetDoubleValues(problemData.TargetVariable, rows).ToArray();
      int n = x.GetLength(0);
      int m = x.GetLength(1);
      int k = c.Length;

      alglib.ndimensional_pfunc function_cx_1_func = CreatePFunc(compiledFunc);
      alglib.ndimensional_pgrad function_cx_1_grad = CreatePGrad(compiledFunc);

      try {
        alglib.lsfitcreatefg(x, y, c, n, m, k, false, out state);
        alglib.lsfitsetcond(state, 0.0, 0.0, maxIterations);
        //alglib.lsfitsetgradientcheck(state, 0.001);
        alglib.lsfitfit(state, function_cx_1_func, function_cx_1_grad, null, null);
        alglib.lsfitresults(state, out info, out c, out rep);
      }
      catch (ArithmeticException) {
        return originalQuality;
      }
      catch (alglib.alglibexception) {
        return originalQuality;
      }

      // info == -7 => constant optimization failed due to a wrong gradient
      if (info != -7) UpdateConstants(tree, c.Skip(2).ToArray(), updateVariableWeights);
      var quality = SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate(interpreter, tree, lowerEstimationLimit, upperEstimationLimit, problemData, rows, applyLinearScaling);

      if (!updateConstantsInTree) UpdateConstants(tree, originalConstants.Skip(2).ToArray(), updateVariableWeights);
      if (originalQuality - quality > 0.001 || double.IsNaN(quality)) {
        UpdateConstants(tree, originalConstants.Skip(2).ToArray(), updateVariableWeights);
        return originalQuality;
      }
      return quality;
    }
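In short: the method optimizes the constants with ALGLIB's least-squares fitting and returns the resulting Pearson R²; it skips the constant update when the gradient is rejected (info == -7), restores the original constants and returns the original quality when ALGLIB throws, the fit regresses by more than 0.001, or the new quality is NaN, and when updateConstantsInTree is false it restores the original constants in the tree while still returning the optimized quality.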
 IRegressionSolution IRegressionModel.CreateRegressionSolution(IRegressionProblemData problemData) {
   return CreateRegressionSolution(problemData);
 }