示例#1
0
        /// <summary>
        /// Partitions the problem data into rows covered / not covered by the given rule and
        /// wraps the covered part, together with the rule's textual representation, in a scope.
        /// </summary>
        /// <param name="regressionRuleModel">Rule whose coverage defines the partition.</param>
        /// <param name="pd">Problem data whose training and test indices are partitioned.</param>
        /// <param name="displayModels">When true, the rule's regression solution is added to the scope.</param>
        /// <param name="notCovered">Problem data built from the uncovered rows, or null when the rule covers every row.</param>
        /// <returns>A scope holding the rule's condition, its training cover count and (optionally) its model.</returns>
        private static IScope CreateRulesResult(RegressionRuleModel regressionRuleModel, IRegressionProblemData pd, bool displayModels, out IRegressionProblemData notCovered)
        {
            // Evaluate Covers(...) only ONCE per row (the original ran two complementary
            // Where passes, calling Covers twice for every index).
            var uncoveredTraining = new List<int>();
            var coveredTraining   = new List<int>();
            foreach (var row in pd.TrainingIndices)
            {
                (regressionRuleModel.Covers(pd.Dataset, row) ? coveredTraining : uncoveredTraining).Add(row);
            }
            var uncoveredTest = new List<int>();
            var coveredTest   = new List<int>();
            foreach (var row in pd.TestIndices)
            {
                (regressionRuleModel.Covers(pd.Dataset, row) ? coveredTest : uncoveredTest).Add(row);
            }

            if (uncoveredTraining.Count > 0 || uncoveredTest.Count > 0)
            {
                // Build a problem-data view over the uncovered rows: training rows first, then test rows.
                var uncoveredRows = uncoveredTraining.Concat(uncoveredTest).ToList();
                var data = new Dataset(pd.Dataset.DoubleVariables, pd.Dataset.DoubleVariables.Select(v => pd.Dataset.GetDoubleValues(v, uncoveredRows).ToArray()));
                notCovered = new RegressionProblemData(data, pd.AllowedInputVariables, pd.TargetVariable);
                notCovered.TestPartition.Start = notCovered.TrainingPartition.End = uncoveredTraining.Count;
                notCovered.TestPartition.End   = uncoveredTraining.Count + uncoveredTest.Count;
            }
            else
            {
                notCovered = null;
            }

            // Same layout for the covered rows.
            var coveredRows = coveredTraining.Concat(coveredTest).ToList();
            var data2   = new Dataset(pd.Dataset.DoubleVariables, pd.Dataset.DoubleVariables.Select(v => pd.Dataset.GetDoubleValues(v, coveredRows).ToArray()));
            var covered = new RegressionProblemData(data2, pd.AllowedInputVariables, pd.TargetVariable);

            covered.TestPartition.Start = covered.TrainingPartition.End = coveredTraining.Count;
            covered.TestPartition.End   = coveredTraining.Count + coveredTest.Count;

            var res2 = new Scope("RuleModels");

            res2.Variables.Add(new Variable(ConditionResultName, new StringValue(regressionRuleModel.ToCompactString())));
            // Count of covered training rows (== total training rows minus the uncovered ones).
            res2.Variables.Add(new Variable(CoverResultName, new IntValue(coveredTraining.Count)));
            if (displayModels)
            {
                res2.Variables.Add(new Variable(RuleModelResultName, regressionRuleModel.CreateRegressionSolution(covered)));
            }
            return(res2);
        }
示例#2
0
        private static void BuildPruningModel(RegressionNodeModel regressionNode, ILeafModel leaf, IReadOnlyList <int> trainingRows, IReadOnlyList <int> pruningRows, PruningState state, RegressionTreeParameters regressionTreeParams, CancellationToken cancellationToken)
        {
            // Build a problem-data view over the pruning rows only: the whole reduced dataset
            // becomes the TEST partition (training partition is empty), so that
            // TestRootMeanSquaredError below is evaluated on the pruning set.
            var columnNames   = regressionTreeParams.AllowedInputVariables.Concat(new[] { regressionTreeParams.TargetVariable }).ToArray();
            var pruningMatrix = new Dataset(columnNames, columnNames.Select(name => regressionTreeParams.Data.GetDoubleValues(name, pruningRows).ToList()));
            var pruningPd     = new RegressionProblemData(pruningMatrix, regressionTreeParams.AllowedInputVariables, regressionTreeParams.TargetVariable);

            pruningPd.TrainingPartition.Start = pruningPd.TrainingPartition.End = pruningPd.TestPartition.Start = 0;
            pruningPd.TestPartition.End       = pruningMatrix.Rows;

            // Fit the leaf model on the training rows, then score it on the pruning rows.
            int paramCount;
            var prunedModel = leaf.BuildModel(trainingRows, regressionTreeParams, cancellationToken, out paramCount);
            var pruningRmse = prunedModel.CreateRegressionSolution(pruningPd).TestRootMeanSquaredError;

            // Record size, error and model complexity for this node.
            state.pruningSizes.Add(regressionNode, pruningRows.Count);
            state.modelErrors.Add(regressionNode, pruningRmse);
            state.modelComplexities.Add(regressionNode, paramCount);

            // Leaf: node complexity equals the model complexity (overwrites any previous entry).
            // Inner node: complexity of both children plus one for the split (entry must be new).
            if (regressionNode.IsLeaf)
            {
                state.nodeComplexities[regressionNode] = state.modelComplexities[regressionNode];
            }
            else
            {
                state.nodeComplexities.Add(regressionNode, state.nodeComplexities[regressionNode.Left] + state.nodeComplexities[regressionNode.Right] + 1);
            }
        }
        /// <summary>
        /// Builds a linear-regression baseline over exactly the variables the current model
        /// uses: double variables, full string factors, and single binary-factor columns.
        /// </summary>
        /// <returns>The baseline solution, or null when the problem has no training rows.</returns>
        /// <exception cref="InvalidOperationException">Thrown when no content is loaded.</exception>
        private IRegressionSolution CreateLinearRegressionSolution()
        {
            if (Content == null)
            {
                throw new InvalidOperationException();
            }
            double rmse, cvRmsError;
            var    problemData = (IRegressionProblemData)ProblemData.Clone();

            if (!problemData.TrainingIndices.Any())
            {
                return(null);                              // don't create an LR model if the problem does not have a training set (e.g. loaded into an existing model)
            }
            var usedVariables = Content.Model.VariablesUsedForPrediction;

            var usedDoubleVariables = usedVariables
                                      .Where(name => problemData.Dataset.VariableHasType <double>(name))
                                      .Distinct();

            var usedFactorVariables = usedVariables
                                      .Where(name => problemData.Dataset.VariableHasType <string>(name))
                                      .Distinct();

            // gkronber: for binary factors we actually produce a binary variable in the new dataset
            // but only if the variable is not used as a full factor anyway (LR creates binary columns anyway)
            // BUGFIX: the tuple must pair the variable NAME with the value — Item1 is used below
            // both for the "name=value" column header and for GetReadOnlyStringValues(t.Item1).
            // The original paired the value with itself. Distinct() avoids duplicate columns
            // when the same binary factor appears in several tree nodes.
            var usedBinaryFactors =
                Content.Model.SymbolicExpressionTree.IterateNodesPostfix().OfType <BinaryFactorVariableTreeNode>()
                .Where(node => !usedFactorVariables.Contains(node.VariableName))
                .Select(node => Tuple.Create(node.VariableName, node.VariableValue))
                .Distinct();

            // create a new problem and dataset: inputs first, target variable last
            var variableNames =
                usedDoubleVariables
                .Concat(usedFactorVariables)
                .Concat(usedBinaryFactors.Select(t => t.Item1 + "=" + t.Item2))
                .Concat(new string[] { problemData.TargetVariable })
                .ToArray();
            var variableValues =
                usedDoubleVariables.Select(name => (IList)problemData.Dataset.GetDoubleValues(name).ToList())
                .Concat(usedFactorVariables.Select(name => problemData.Dataset.GetStringValues(name).ToList()))
                .Concat(
                    // create binary variable: 1.0 where the factor has the given value, 0.0 elsewhere
                    usedBinaryFactors.Select(t => problemData.Dataset.GetReadOnlyStringValues(t.Item1).Select(val => val == t.Item2 ? 1.0 : 0.0).ToList())
                    )
                .Concat(new[] { problemData.Dataset.GetDoubleValues(problemData.TargetVariable).ToList() });

            var newDs          = new Dataset(variableNames, variableValues);
            var newProblemData = new RegressionProblemData(newDs, variableNames.Take(variableNames.Length - 1), variableNames.Last());

            // Same partitions as the original problem.
            newProblemData.TrainingPartition.Start = problemData.TrainingPartition.Start;
            newProblemData.TrainingPartition.End   = problemData.TrainingPartition.End;
            newProblemData.TestPartition.Start     = problemData.TestPartition.Start;
            newProblemData.TestPartition.End       = problemData.TestPartition.End;

            var solution = LinearRegression.CreateLinearRegressionSolution(newProblemData, out rmse, out cvRmsError);

            solution.Name = "Baseline (linear subset)";
            return(solution);
        }
示例#4
0
        /// <summary>
        /// Benchmarks the given crossover operator: performs one crossover per individual in
        /// the population and prints the average time per operation.
        /// </summary>
        private static void SymbolicDataAnalysisCrossoverPerformanceTest(ISymbolicDataAnalysisExpressionCrossover <IRegressionProblemData> crossover)
        {
            var rng     = new MersenneTwister(31415);
            var dataset = Util.CreateRandomDataset(rng, Rows, Columns);
            var grammar = new FullFunctionalExpressionGrammar();
            var timer   = new Stopwatch();

            // No automatically defined functions for this benchmark.
            grammar.MaximumFunctionArguments   = 0;
            grammar.MaximumFunctionDefinitions = 0;
            grammar.MinimumFunctionArguments   = 0;
            grammar.MinimumFunctionDefinitions = 0;

            var trees = Util.CreateRandomTrees(rng, dataset, grammar, PopulationSize, 1, MaxTreeLength, 0, 0);
            foreach (ISymbolicExpressionTree tree in trees)
            {
                Util.InitTree(tree, rng, new List <string>(dataset.VariableNames));
            }

            var problemData = new RegressionProblemData(dataset, dataset.VariableNames, dataset.VariableNames.Last());
            var problem     = new SymbolicRegressionSingleObjectiveProblem();
            problem.ProblemData = problemData;

            var globalScope = new Scope("Global Scope");
            globalScope.Variables.Add(new Core.Variable("Random", rng));

            // Chain two execution contexts: problem context first, crossover context on top.
            var context = new ExecutionContext(null, problem, globalScope);
            context = new ExecutionContext(context, crossover, globalScope);

            timer.Start();
            for (int iteration = 0; iteration != PopulationSize; ++iteration)
            {
                // Parents are cloned so the originals survive for the validity check below.
                var firstParent = (ISymbolicExpressionTree)trees.SampleRandom(rng).Clone();
                var firstScope  = new Scope();
                firstScope.Variables.Add(new Core.Variable(crossover.ParentsParameter.ActualName, firstParent));
                context.Scope.SubScopes.Add(firstScope);

                var secondParent = (ISymbolicExpressionTree)trees.SampleRandom(rng).Clone();
                var secondScope  = new Scope();
                secondScope.Variables.Add(new Core.Variable(crossover.ParentsParameter.ActualName, secondParent));
                context.Scope.SubScopes.Add(secondScope);

                crossover.Execute(context, new CancellationToken());

                // Clean the scope in preparation for the next iteration.
                context.Scope.SubScopes.Remove(firstScope);
                context.Scope.SubScopes.Remove(secondScope);
            }
            timer.Stop();

            double msPerCrossover = 2 * timer.ElapsedMilliseconds / (double)PopulationSize;

            Console.WriteLine(crossover.Name + ": " + Environment.NewLine +
                              msPerCrossover + " ms per crossover (~" + Math.Round(1000.0 / (msPerCrossover)) + " crossover operations / s)");

            foreach (var tree in trees)
            {
                HeuristicLab.Encodings.SymbolicExpressionTreeEncoding.Tests.Util.IsValid(tree);
            }
        }
示例#5
0
        /// <summary>
        /// Creates the state scope for decision-tree regression: a training-only copy of the
        /// problem data, the tree parameters, initialized operators, the (still unbuilt)
        /// model, and the training/pruning index split.
        /// </summary>
        private static IScope InitializeScope(IRandom random, IRegressionProblemData problemData, IPruning pruning, int minLeafSize, ILeafModel leafModel, ISplitter splitter, bool generateRules, bool useHoldout, double holdoutSize)
        {
            var stateScope = new Scope("RegressionTreeStateScope");

            // Reduce the problem data column-wise (allowed inputs + target) and
            // row-wise (training set only). Only double-valued columns are supported.
            var doubleColumns = new HashSet <string>(problemData.Dataset.DoubleVariables);
            var columns       = problemData.AllowedInputVariables.Concat(new[] { problemData.TargetVariable }).ToArray();
            if (columns.Any(c => !doubleColumns.Contains(c)))
            {
                throw new NotSupportedException("Decision tree regression supports only double valued input or output features.");
            }

            var columnValues = columns.Select(c => problemData.Dataset.GetDoubleValues(c, problemData.TrainingIndices).ToArray()).ToArray();
            if (columnValues.Any(col => col.Any(x => double.IsNaN(x) || double.IsInfinity(x))))
            {
                throw new NotSupportedException("Decision tree regression does not support NaN or infinity values in the input dataset.");
            }

            var trainingData = new Dataset(columns, columnValues);
            var pd           = new RegressionProblemData(trainingData, problemData.AllowedInputVariables, problemData.TargetVariable);

            // Every row is training data; the test partition is empty.
            pd.TrainingPartition.Start = 0;
            pd.TrainingPartition.End   = pd.TestPartition.Start = pd.TestPartition.End = pd.Dataset.Rows;

            // Store the regression-tree parameters in the state scope.
            var regressionTreeParams = new RegressionTreeParameters(pruning, minLeafSize, leafModel, pd, random, splitter);
            stateScope.Variables.Add(new Variable(RegressionTreeParameterVariableName, regressionTreeParams));

            // Initialize the tree operators.
            pruning.Initialize(stateScope);
            splitter.Initialize(stateScope);
            leafModel.Initialize(stateScope);

            // Store the unbuilt model: either a rule set or a tree.
            IItem model;
            if (generateRules)
            {
                model = RegressionRuleSetModel.CreateRuleModel(problemData.TargetVariable, regressionTreeParams);
                RegressionRuleSetModel.Initialize(stateScope);
            }
            else
            {
                model = RegressionNodeTreeModel.CreateTreeModel(problemData.TargetVariable, regressionTreeParams);
            }
            stateScope.Variables.Add(new Variable(ModelVariableName, model));

            // Split the training indices into the actual training set and the pruning set.
            IReadOnlyList <int> trainingSet, pruningSet;
            GeneratePruningSet(pd.TrainingIndices.ToArray(), random, useHoldout, holdoutSize, out trainingSet, out pruningSet);
            stateScope.Variables.Add(new Variable(TrainingSetVariableName, new IntArray(trainingSet.ToArray())));
            stateScope.Variables.Add(new Variable(PruningSetVariableName, new IntArray(pruningSet.ToArray())));

            return(stateScope);
        }
        /// <summary>
        /// Wraps the given dataset in a new problem data instance, copying name and
        /// partition boundaries from the source problem data.
        /// </summary>
        private static IRegressionProblemData CreateProblemData(IRegressionProblemData pd, IDataset data, IReadOnlyList <string> allowedNames)
        {
            var result = new RegressionProblemData(data, allowedNames, pd.TargetVariable);

            result.Name = pd.Name;
            result.TrainingPartition.Start = pd.TrainingPartition.Start;
            result.TrainingPartition.End   = pd.TrainingPartition.End;
            result.TestPartition.Start     = pd.TestPartition.Start;
            result.TestPartition.End       = pd.TestPartition.End;
            return result;
        }
示例#7
0
        /// <summary>
        /// Reduces the problem data to the given rows (training rows first, then test rows)
        /// and sets the partitions of the resulting problem data accordingly.
        /// </summary>
        private static IRegressionProblemData Subselect(IRegressionProblemData data, IReadOnlyList <int> training, IReadOnlyList <int> test)
        {
            var rows    = training.Concat(test).ToList();
            var reduced = RegressionTreeUtilities.ReduceDataset(data.Dataset, rows, data.AllowedInputVariables.ToList(), data.TargetVariable);
            var result  = new RegressionProblemData(reduced, data.AllowedInputVariables, data.TargetVariable);

            result.TrainingPartition.Start = 0;
            result.TrainingPartition.End   = training.Count;
            result.TestPartition.Start     = training.Count;
            result.TestPartition.End       = training.Count + test.Count;
            return result;
        }
        /// <summary>
        /// Builds regression problem data over the given dataset, copying name, description
        /// and partition boundaries from this instance.
        /// </summary>
        public IRegressionProblemData GenerateRegressionData(Dataset dataset)
        {
            var result = new RegressionProblemData(dataset, AllowedInputVariables, TargetVariable) {
                Name        = Name,
                Description = Description
            };

            result.TrainingPartition.Start = TrainingPartitionStart;
            result.TrainingPartition.End   = TrainingPartitionEnd;
            result.TestPartition.Start     = TestPartitionStart;
            result.TestPartition.End       = TestPartitionEnd;
            return result;
        }
示例#9
0
        /// <summary>
        /// Greedy backward attribute elimination for a linear model: after removing colinear
        /// attributes, repeatedly drops the attribute with the smallest variance-scaled
        /// coefficient while the Akaike criterion improves, then returns the final model.
        /// </summary>
        private static PreconstructedLinearModel FindBestModel(Dictionary <string, double> variances, Dictionary <string, double> means, double yMean, double yVariance, IRegressionProblemData pd, IList <string> variables)
        {
            Dictionary <string, double> coeffs;
            double intercept;

            // Re-run the regression until no further colinear attributes get deselected.
            do
            {
                coeffs    = DoRegression(pd, variables, variances, means, yMean, 1.0e-8, out intercept);
                variables = DeselectColinear(variances, coeffs, yVariance, pd, variables);
            }while (coeffs.Count != variables.Count);
            var numAtts = variables.Count;
            var numInst = pd.TrainingIndices.Count();
            var fullMse = CalculateSE(coeffs, intercept, pd, variables);
            var akaike  = 1.0 * (numInst - numAtts) + 2 * numAtts;

            var improved             = true;
            var currentNumAttributes = numAtts;

            while (improved && currentNumAttributes > 1)
            {
                improved = false;
                currentNumAttributes--;
                // Find attribute with smallest SC (variance-scaled coefficient)
                var candidate = variables.ToDictionary(v => v, v => Math.Abs(coeffs[v] * Math.Sqrt(variances[v] / yVariance)))
                                .OrderBy(x => x.Value).Select(x => x.Key).First();

                var currVariables = variables.Where(v => !v.Equals(candidate)).ToList();
                double currentIntercept; // assigned by DoRegression via out
                var currentCoeffs = DoRegression(pd, currVariables, variances, means, yMean, 1.0e-8, out currentIntercept);
                var currentMse    = CalculateSE(currentCoeffs, currentIntercept, pd, currVariables);
                var currentAkaike = currentMse / fullMse * (numInst - numAtts) + 2 * currentNumAttributes;

                // Keep the reduced model only if it improves the Akaike criterion.
                if (!(currentAkaike < akaike))
                {
                    continue;
                }
                improved  = true;
                akaike    = currentAkaike;
                coeffs    = currentCoeffs;
                intercept = currentIntercept;
                variables = currVariables;
            }

            // NOTE(review): the original built a RegressionProblemData (pd2) with copied
            // partitions here but never used it; that dead construction has been removed.
            return(new PreconstructedLinearModel(coeffs, intercept, pd.TargetVariable));
        }
        /// <summary>
        /// Builds new regression problem data from the exported dataset, keeping the old
        /// target variable when it still exists in the current data and falling back to the
        /// first available variable otherwise.
        /// </summary>
        private IDataAnalysisProblemData CreateRegressionData(RegressionProblemData oldProblemData)
        {
            var target = context.Data.VariableNames.Contains(oldProblemData.TargetVariable)
                ? oldProblemData.TargetVariable
                : context.Data.VariableNames.First();
            var inputs = GetDoubleInputVariables(target);

            return new RegressionProblemData(ExportedDataset, inputs, target, Transformations);
        }
示例#11
0
        /// <summary>
        /// Runs a GA-based symbolic regression on a CSV data file and blocks until the
        /// algorithm has finished. The data file can be supplied as the first command line
        /// argument; otherwise the hard-coded default path is used.
        /// </summary>
        static void Main(string[] args)
        {
            // Generalized: take the data path from the command line when provided.
            var path = args.Length > 0 ? args[0] : @"D:\Data\Sinus\SimpleSinusHeuristicLab.csv";

            var parser = new TableFileParser();
            parser.Parse(path, columnNamesInFirstLine: true);

            var dataset = new Dataset(parser.VariableNames, parser.Values);

            // Last double variable is the target; the preceding columns are inputs.
            var problemData = new RegressionProblemData(dataset, dataset.DoubleVariables.Take(dataset.Columns - 1), dataset.DoubleVariables.Last());

            var grammar = new TypeCoherentExpressionGrammar();
            grammar.ConfigureAsDefaultRegressionGrammar();

            var problem = new SymbolicRegressionSingleObjectiveProblem {
                ProblemData = problemData,
                SymbolicExpressionTreeInterpreter = new SymbolicDataAnalysisExpressionTreeLinearInterpreter(),
                SymbolicExpressionTreeGrammar     = grammar,
                EvaluatorParameter       = { Value = new SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator() },
                SolutionCreatorParameter = { Value = new SymbolicDataAnalysisExpressionTreeCreator() }
            };

            var ga = new GeneticAlgorithm {
                PopulationSize      = { Value = 1000 },
                MaximumGenerations  = { Value = 100 },
                Problem             = problem,
                MutationProbability = new PercentValue(25),
                Engine = new SequentialEngine()
            };
            var manipulator = new MultiSymbolicExpressionTreeManipulator();

            ga.MutatorParameter.ValidValues.Add(manipulator);

            ga.ExecutionStateChanged += OnExecutionStateChanged;
            ga.ExecutionTimeChanged  += OnExecutionTimeChanged;

            ga.Prepare();
            ga.Start();
            // Block until an event handler releases the mutex when the run completes.
            mutex.WaitOne();
        }
        /// <summary>
        /// Trains a single-iteration gradient-boosted tree on the given matrix (first
        /// variable is the target, the rest are inputs) and prints the tree model.
        /// </summary>
        private void BuildTree(double[,] xy, string[] allVariables, int maxSize)
        {
            var rowCount    = xy.GetLength(0);
            var dataset     = new Dataset(allVariables, xy);
            var problemData = new RegressionProblemData(dataset, allVariables.Skip(1), allVariables.First());

            // Train on all rows; the (empty) test partition starts after the last row.
            problemData.TrainingPartition.Start = 0;
            problemData.TrainingPartition.End   = rowCount;
            problemData.TestPartition.Start     = rowCount;
            problemData.TestPartition.End       = rowCount;

            var solution  = GradientBoostedTreesAlgorithmStatic.TrainGbm(problemData, new SquaredErrorLoss(), maxSize, nu: 1, r: 1, m: 1, maxIterations: 1, randSeed: 31415);
            var treeModel = solution.Model.Models.Skip(1).First() as RegressionTreeModel;

            Console.WriteLine(treeModel.ToString());
            Console.WriteLine();
        }
        /// <summary>
        /// Imports a parsed CSV file as regression problem data, using the configured
        /// training percentage for the partition split and disabling inputs that are
        /// constant within the training partition.
        /// </summary>
        protected override IRegressionProblemData ImportData(string path, RegressionImportType type, TableFileParser csvFileParser)
        {
            List <IList> values = csvFileParser.Values;

            if (type.Shuffle)
            {
                values = Shuffle(values);
            }
            var dataset = new Dataset(csvFileParser.VariableNames, values);

            // At least one row must end up in the training partition.
            int trainingPartEnd = (csvFileParser.Rows * type.TrainingPercentage) / 100;
            if (trainingPartEnd <= 0)
            {
                trainingPartEnd = 1;
            }
            var trainingIndizes = Enumerable.Range(0, trainingPartEnd);

            // Turn off input variables that are constant in the training partition.
            // With fewer than two training rows every variable looks constant, so in that
            // case all non-target double variables are allowed instead.
            var allowedInputVars = new List <string>();
            if (trainingIndizes.Count() >= 2)
            {
                foreach (var variableName in dataset.DoubleVariables)
                {
                    if (dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 &&
                        variableName != type.TargetVariable)
                    {
                        allowedInputVars.Add(variableName);
                    }
                }
            }
            else
            {
                allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(type.TargetVariable)));
            }

            var regressionData = new RegressionProblemData(dataset, allowedInputVars, type.TargetVariable);

            regressionData.TrainingPartition.Start = 0;
            regressionData.TrainingPartition.End   = trainingPartEnd;
            regressionData.TestPartition.Start     = trainingPartEnd;
            regressionData.TestPartition.End       = csvFileParser.Rows;

            regressionData.Name = Path.GetFileName(path);

            return regressionData;
        }
        /// <summary>
        /// Imports a CSV file as regression problem data. The last double variable becomes
        /// the target; the first two thirds of the rows form the training partition.
        /// </summary>
        public override IRegressionProblemData ImportData(string path)
        {
            TableFileParser csvFileParser = new TableFileParser();

            csvFileParser.Parse(path, csvFileParser.AreColumnNamesInFirstLine(path));

            Dataset dataset   = new Dataset(csvFileParser.VariableNames, csvFileParser.Values);
            string  targetVar = dataset.DoubleVariables.Last();

            // Training partition covers the first two thirds of the rows, but never fewer
            // than one row (the original's Enumerable.Range(0, 0).First()/Last() threw on
            // files with fewer than two rows).
            int trainingPartEnd = (csvFileParser.Rows * 2) / 3;
            trainingPartEnd = trainingPartEnd > 0 ? trainingPartEnd : 1;
            var trainingIndizes = Enumerable.Range(0, trainingPartEnd);

            // turn off input variables that are constant in the training partition
            var allowedInputVars = new List <string>();

            if (trainingIndizes.Count() >= 2)
            {
                foreach (var variableName in dataset.DoubleVariables)
                {
                    if (dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 &&
                        variableName != targetVar)
                    {
                        allowedInputVars.Add(variableName);
                    }
                }
            }
            else
            {
                allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(targetVar)));
            }

            IRegressionProblemData regressionData = new RegressionProblemData(dataset, allowedInputVars, targetVar);

            // BUGFIX: the original used trainingIndizes.Last() (== trainingPartEnd - 1) as the
            // boundary, which moved the last training row into the test partition (off by
            // one). Partitions use exclusive end indices, matching the sibling ImportData
            // overload above.
            regressionData.TrainingPartition.Start = 0;
            regressionData.TrainingPartition.End   = trainingPartEnd;
            regressionData.TestPartition.Start     = trainingPartEnd;
            regressionData.TestPartition.End       = csvFileParser.Rows;

            regressionData.Name = Path.GetFileName(path);

            return(regressionData);
        }
        /// <summary>
        /// Measures how fast variable impacts are computed for a linear regression solution
        /// on a random 20000x77 dataset and logs the throughput.
        /// </summary>
        public void PerformanceVariableImpactRegressionTest()
        {
            int rows    = 20000;
            int columns = 77;
            var dataSet = OnlineCalculatorPerformanceTest.CreateRandomDataset(new MersenneTwister(1234), rows, columns);
            IRegressionProblemData problemData = new RegressionProblemData(dataSet, dataSet.VariableNames.Except("y".ToEnumerable()), "y");
            double rmsError;
            double cvRmsError;
            var    solution = LinearRegression.CreateSolution(problemData, out rmsError, out cvRmsError);

            Stopwatch watch = new Stopwatch();

            watch.Start();
            // Result intentionally discarded: only the elapsed time is of interest here.
            RegressionSolutionVariableImpactsCalculator.CalculateImpacts(solution);
            watch.Stop();

            // BUGFIX: guard against integer DivideByZeroException when the computation
            // finishes in under one millisecond.
            long elapsedMs = Math.Max(1L, watch.ElapsedMilliseconds);

            TestContext.WriteLine("");
            TestContext.WriteLine("Calculated cells per millisecond: {0}.", rows * columns / elapsedMs);
        }
示例#16
0
        /// <summary>
        /// Builds a leaf model over the given rows: the dataset is reduced to those rows,
        /// all of which form the training partition, and the result is optionally dampened.
        /// </summary>
        public IRegressionModel BuildModel(IReadOnlyList <int> rows, RegressionTreeParameters parameters, CancellationToken cancellation, out int numberOfParameters)
        {
            // Restrict the data to the requested rows; the (empty) test partition starts
            // and ends after the last row.
            var subset      = RegressionTreeUtilities.ReduceDataset(parameters.Data, rows, parameters.AllowedInputVariables.ToArray(), parameters.TargetVariable);
            var problemData = new RegressionProblemData(subset, parameters.AllowedInputVariables.ToArray(), parameters.TargetVariable);

            problemData.TrainingPartition.Start = 0;
            problemData.TrainingPartition.End   = problemData.TestPartition.Start = problemData.TestPartition.End = subset.Rows;

            int parameterCount;
            var model = Build(problemData, parameters.Random, cancellation, out parameterCount);

            // Optionally dampen the model (DampenModel's semantics are defined elsewhere).
            if (UseDampening && Dampening > 0.0)
            {
                model = DampenedModel.DampenModel(model, problemData, Dampening);
            }

            numberOfParameters = parameterCount;
            cancellation.ThrowIfCancellationRequested();
            return model;
        }
示例#17
0
        // wraps the list of basis functions into an IRegressionProblemData object
        private static IRegressionProblemData PrepareData(IRegressionProblemData problemData, IEnumerable <BasisFunction> basisFunctions)
        {
            // BUGFIX: names and value columns must stay aligned pairwise. The original kept
            // names in a HashSet (unspecified iteration order, silently deduplicating) while
            // always appending to the value list, so duplicate basis-function variables
            // produced misaligned columns. Here a column is added only together with its
            // (first occurrence of the) name.
            var variableNames = new List <string>();
            var seenNames     = new HashSet <string>();
            var variableVals  = new List <IList>();

            foreach (var basisFunc in basisFunctions)
            {
                if (seenNames.Add(basisFunc.Var))
                {
                    variableNames.Add(basisFunc.Var);
                    variableVals.Add(new List <double>(basisFunc.Val));
                }
            }
            var matrix = new ModifiableDataset(variableNames, variableVals);

            // add the unmodified target variable to the matrix
            matrix.AddVariable(problemData.TargetVariable, problemData.TargetVariableValues.ToList());
            IEnumerable <string>   allowedInputVars = matrix.VariableNames.Where(x => !x.Equals(problemData.TargetVariable)).ToArray();
            IRegressionProblemData rpd = new RegressionProblemData(matrix, allowedInputVars, problemData.TargetVariable);

            // copy target and partition boundaries from the source problem data
            rpd.TargetVariable          = problemData.TargetVariable;
            rpd.TrainingPartition.Start = problemData.TrainingPartition.Start;
            rpd.TrainingPartition.End   = problemData.TrainingPartition.End;
            rpd.TestPartition.Start     = problemData.TestPartition.Start;
            rpd.TestPartition.End       = problemData.TestPartition.End;
            return(rpd);
        }
示例#18
0
        protected override void Run(CancellationToken cancellationToken)
        {
            // Set up the algorithm
            if (SetSeedRandomly)
            {
                Seed = RandomSeedGenerator.GetSeed();
            }
            var rand = new MersenneTwister((uint)Seed);

            // Set up the results display
            var iterations = new IntValue(0);

            Results.Add(new Result("Iterations", iterations));

            var table = new DataTable("Qualities");

            table.Rows.Add(new DataRow("R² (train)"));
            table.Rows.Add(new DataRow("R² (test)"));
            Results.Add(new Result("Qualities", table));
            var curLoss     = new DoubleValue();
            var curTestLoss = new DoubleValue();

            Results.Add(new Result("R² (train)", curLoss));
            Results.Add(new Result("R² (test)", curTestLoss));
            var runCollection = new RunCollection();

            if (StoreRuns)
            {
                Results.Add(new Result("Runs", runCollection));
            }

            // init
            var problemData       = Problem.ProblemData;
            var targetVarName     = problemData.TargetVariable;
            var activeVariables   = problemData.AllowedInputVariables.Concat(new string[] { problemData.TargetVariable });
            var modifiableDataset = new ModifiableDataset(
                activeVariables,
                activeVariables.Select(v => problemData.Dataset.GetDoubleValues(v).ToList()));

            var trainingRows = problemData.TrainingIndices;
            var testRows     = problemData.TestIndices;
            var yPred        = new double[trainingRows.Count()];
            var yPredTest    = new double[testRows.Count()];
            var y            = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices).ToArray();
            var curY         = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices).ToArray();

            var yTest    = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TestIndices).ToArray();
            var curYTest = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TestIndices).ToArray();
            var nu       = Nu;
            var mVars    = (int)Math.Ceiling(M * problemData.AllowedInputVariables.Count());
            var rRows    = (int)Math.Ceiling(R * problemData.TrainingIndices.Count());
            var alg      = RegressionAlgorithm;
            List <IRegressionModel> models = new List <IRegressionModel>();

            try {
                // Loop until iteration limit reached or canceled.
                for (int i = 0; i < Iterations; i++)
                {
                    cancellationToken.ThrowIfCancellationRequested();

                    modifiableDataset.RemoveVariable(targetVarName);
                    modifiableDataset.AddVariable(targetVarName, curY.Concat(curYTest).ToList());

                    SampleTrainingData(rand, modifiableDataset, rRows, problemData.Dataset, curY, problemData.TargetVariable, problemData.TrainingIndices); // all training indices from the original problem data are allowed
                    var modifiableProblemData = new RegressionProblemData(modifiableDataset,
                                                                          problemData.AllowedInputVariables.SampleRandomWithoutRepetition(rand, mVars),
                                                                          problemData.TargetVariable);
                    modifiableProblemData.TrainingPartition.Start = 0;
                    modifiableProblemData.TrainingPartition.End   = rRows;
                    modifiableProblemData.TestPartition.Start     = problemData.TestPartition.Start;
                    modifiableProblemData.TestPartition.End       = problemData.TestPartition.End;

                    if (!TrySetProblemData(alg, modifiableProblemData))
                    {
                        throw new NotSupportedException("The algorithm cannot be used with GBM.");
                    }

                    IRegressionModel model;
                    IRun             run;

                    // try to find a model. The algorithm might fail to produce a model. In this case we just retry until the iterations are exhausted
                    if (TryExecute(alg, rand.Next(), RegressionAlgorithmResult, out model, out run))
                    {
                        int row = 0;
                        // update predictions for training and test
                        // update new targets (in the case of squared error loss we simply use negative residuals)
                        foreach (var pred in model.GetEstimatedValues(problemData.Dataset, trainingRows))
                        {
                            yPred[row] = yPred[row] + nu * pred;
                            curY[row]  = y[row] - yPred[row];
                            row++;
                        }
                        row = 0;
                        foreach (var pred in model.GetEstimatedValues(problemData.Dataset, testRows))
                        {
                            yPredTest[row] = yPredTest[row] + nu * pred;
                            curYTest[row]  = yTest[row] - yPredTest[row];
                            row++;
                        }
                        // determine quality
                        OnlineCalculatorError error;
                        var trainR = OnlinePearsonsRCalculator.Calculate(yPred, y, out error);
                        var testR  = OnlinePearsonsRCalculator.Calculate(yPredTest, yTest, out error);

                        // iteration results
                        curLoss.Value     = error == OnlineCalculatorError.None ? trainR * trainR : 0.0;
                        curTestLoss.Value = error == OnlineCalculatorError.None ? testR * testR : 0.0;

                        models.Add(model);
                    }

                    if (StoreRuns)
                    {
                        runCollection.Add(run);
                    }
                    table.Rows["R² (train)"].Values.Add(curLoss.Value);
                    table.Rows["R² (test)"].Values.Add(curTestLoss.Value);
                    iterations.Value = i + 1;
                }

                // produce solution
                if (CreateSolution)
                {
                    // when all our models are symbolic models we can easily combine them to a single model
                    if (models.All(m => m is ISymbolicRegressionModel))
                    {
                        Results.Add(new Result("Solution", CreateSymbolicSolution(models, Nu, (IRegressionProblemData)problemData.Clone())));
                    }
                    // just produce an ensemble solution for now (TODO: correct scaling or linear regression for ensemble model weights)

                    var ensembleSolution = CreateEnsembleSolution(models, (IRegressionProblemData)problemData.Clone());
                    Results.Add(new Result("EnsembleSolution", ensembleSolution));
                }
            }
            finally {
                // reset everything
                alg.Prepare(true);
            }
        }
    private static void SymbolicDataAnalysisCrossoverPerformanceTest(ISymbolicDataAnalysisExpressionCrossover<IRegressionProblemData> crossover) {
      // Benchmarks the given crossover operator: performs PopulationSize crossover
      // operations on randomly generated expression trees, prints the measured
      // throughput, and finally verifies that all trees are still structurally valid.
      var rng = new MersenneTwister(31415);
      var data = Util.CreateRandomDataset(rng, Rows, Columns);
      var exprGrammar = new FullFunctionalExpressionGrammar();
      var timer = new Stopwatch();

      // Disable automatically defined functions so only plain expression trees are generated.
      exprGrammar.MaximumFunctionArguments = 0;
      exprGrammar.MaximumFunctionDefinitions = 0;
      exprGrammar.MinimumFunctionArguments = 0;
      exprGrammar.MinimumFunctionDefinitions = 0;

      var randomTrees = Util.CreateRandomTrees(rng, data, exprGrammar, PopulationSize, 1, MaxTreeLength, 0, 0);
      foreach (var tree in randomTrees) {
        Util.InitTree(tree, rng, new List<string>(data.VariableNames));
      }

      var regressionData = new RegressionProblemData(data, data.VariableNames, data.VariableNames.Last());
      var problem = new SymbolicRegressionSingleObjectiveProblem();
      problem.ProblemData = regressionData;

      // Build a nested execution context (problem -> crossover) sharing one global scope,
      // so the crossover operator can resolve its parameters at execution time.
      var rootScope = new Scope("Global Scope");
      rootScope.Variables.Add(new Core.Variable("Random", rng));
      var executionContext = new ExecutionContext(null, problem, rootScope);
      executionContext = new ExecutionContext(executionContext, crossover, rootScope);

      timer.Start();
      for (int iteration = 0; iteration < PopulationSize; iteration++) {
        // The crossover expects each parent tree wrapped in its own sub-scope.
        var firstParent = (ISymbolicExpressionTree)randomTrees.SampleRandom(rng).Clone();
        var firstParentScope = new Scope();
        firstParentScope.Variables.Add(new Core.Variable(crossover.ParentsParameter.ActualName, firstParent));
        executionContext.Scope.SubScopes.Add(firstParentScope);

        var secondParent = (ISymbolicExpressionTree)randomTrees.SampleRandom(rng).Clone();
        var secondParentScope = new Scope();
        secondParentScope.Variables.Add(new Core.Variable(crossover.ParentsParameter.ActualName, secondParent));
        executionContext.Scope.SubScopes.Add(secondParentScope);

        crossover.Execute(executionContext, new CancellationToken());

        // Detach the parent scopes again so the next iteration starts from a clean scope.
        executionContext.Scope.SubScopes.Remove(firstParentScope);
        executionContext.Scope.SubScopes.Remove(secondParentScope);
      }
      timer.Stop();

      // NOTE(review): the factor 2 presumably accounts for the two parents handled per
      // Execute call — confirm the intended unit before relying on this number.
      double msPerCrossover = 2 * timer.ElapsedMilliseconds / (double)PopulationSize;
      Console.WriteLine(crossover.Name + ": " + Environment.NewLine +
                        msPerCrossover + " ms per crossover (~" + Math.Round(1000.0 / (msPerCrossover)) + " crossover operations / s)");

      // Sanity check: every tree must still be a valid expression tree after all crossovers.
      foreach (var tree in randomTrees)
        HeuristicLab.Encodings.SymbolicExpressionTreeEncoding.Tests.Util.IsValid(tree);
    }