private static IScope CreateRulesResult(RegressionRuleModel regressionRuleModel, IRegressionProblemData pd, bool displayModels, out IRegressionProblemData notCovered) {
  // Splits the problem data rows into those covered by the given rule and those not
  // covered. The covered part is packaged into a result scope (condition string,
  // number of covered training rows and optionally the rule's regression solution);
  // the uncovered remainder is returned through notCovered (null when the rule
  // covers every row).
  var uncoveredTraining = pd.TrainingIndices.Where(r => !regressionRuleModel.Covers(pd.Dataset, r)).ToArray();
  var uncoveredTest = pd.TestIndices.Where(r => !regressionRuleModel.Covers(pd.Dataset, r)).ToArray();
  if (uncoveredTraining.Length == 0 && uncoveredTest.Length == 0) {
    notCovered = null;
  } else {
    // build a reduced dataset containing only the uncovered rows (training rows first)
    var uncoveredRows = uncoveredTraining.Concat(uncoveredTest);
    var uncoveredData = new Dataset(pd.Dataset.DoubleVariables,
      pd.Dataset.DoubleVariables.Select(v => pd.Dataset.GetDoubleValues(v, uncoveredRows).ToArray()));
    notCovered = new RegressionProblemData(uncoveredData, pd.AllowedInputVariables, pd.TargetVariable);
    notCovered.TestPartition.Start = notCovered.TrainingPartition.End = uncoveredTraining.Length;
    notCovered.TestPartition.End = uncoveredTraining.Length + uncoveredTest.Length;
  }

  // build the covered problem data the same way (training rows first, then test rows)
  var coveredTraining = pd.TrainingIndices.Where(r => regressionRuleModel.Covers(pd.Dataset, r)).ToArray();
  var coveredTest = pd.TestIndices.Where(r => regressionRuleModel.Covers(pd.Dataset, r)).ToArray();
  var coveredRows = coveredTraining.Concat(coveredTest);
  var coveredData = new Dataset(pd.Dataset.DoubleVariables,
    pd.Dataset.DoubleVariables.Select(v => pd.Dataset.GetDoubleValues(v, coveredRows).ToArray()));
  var covered = new RegressionProblemData(coveredData, pd.AllowedInputVariables, pd.TargetVariable);
  covered.TestPartition.Start = covered.TrainingPartition.End = coveredTraining.Length;
  covered.TestPartition.End = coveredTraining.Length + coveredTest.Length;

  var result = new Scope("RuleModels");
  result.Variables.Add(new Variable(ConditionResultName, new StringValue(regressionRuleModel.ToCompactString())));
  // covered training rows = all training rows minus the uncovered training rows
  result.Variables.Add(new Variable(CoverResultName, new IntValue(pd.TrainingIndices.Count() - uncoveredTraining.Length)));
  if (displayModels) {
    result.Variables.Add(new Variable(RuleModelResultName, regressionRuleModel.CreateRegressionSolution(covered)));
  }
  return result;
}
private static void BuildPruningModel(RegressionNodeModel regressionNode, ILeafModel leaf, IReadOnlyList <int> trainingRows, IReadOnlyList <int> pruningRows, PruningState state, RegressionTreeParameters regressionTreeParams, CancellationToken cancellationToken) {
  // Builds the model a leaf would produce for this node, evaluates it on the
  // held-out pruning rows and records the error/complexity figures in the
  // pruning state.

  // create regression problem data from the pruning rows; the pruning data
  // forms the test partition while the training partition stays empty
  var columnNames = regressionTreeParams.AllowedInputVariables.Concat(new[] { regressionTreeParams.TargetVariable }).ToArray();
  var pruningData = new Dataset(columnNames, columnNames.Select(c => regressionTreeParams.Data.GetDoubleValues(c, pruningRows).ToList()));
  var pruningProblemData = new RegressionProblemData(pruningData, regressionTreeParams.AllowedInputVariables, regressionTreeParams.TargetVariable);
  pruningProblemData.TrainingPartition.Start = pruningProblemData.TrainingPartition.End = pruningProblemData.TestPartition.Start = 0;
  pruningProblemData.TestPartition.End = pruningData.Rows;

  // build the candidate model on the training rows
  int modelParamCount;
  var candidateModel = leaf.BuildModel(trainingRows, regressionTreeParams, cancellationToken, out modelParamCount);

  // record pruning-set RMSE and model complexity for this node
  var pruningRmse = candidateModel.CreateRegressionSolution(pruningProblemData).TestRootMeanSquaredError;
  state.pruningSizes.Add(regressionNode, pruningRows.Count);
  state.modelErrors.Add(regressionNode, pruningRmse);
  state.modelComplexities.Add(regressionNode, modelParamCount);
  if (regressionNode.IsLeaf) {
    // leaves: node complexity equals the model complexity (indexer overwrites any prior value)
    state.nodeComplexities[regressionNode] = state.modelComplexities[regressionNode];
  } else {
    // inner nodes: aggregate the complexities of both subtrees plus one
    state.nodeComplexities.Add(regressionNode, state.nodeComplexities[regressionNode.Left] + state.nodeComplexities[regressionNode.Right] + 1);
  }
}
private IRegressionSolution CreateLinearRegressionSolution() {
  // Creates a linear-regression baseline solution restricted to the variables the
  // current symbolic model actually uses (double variables, full factor variables
  // and individual binary factor levels).
  // Returns null when the cloned problem data has no training rows; throws
  // InvalidOperationException when no content is loaded.
  if (Content == null) { throw new InvalidOperationException(); }
  double rmse, cvRmsError;
  var problemData = (IRegressionProblemData)ProblemData.Clone();
  if (!problemData.TrainingIndices.Any()) {
    return null; // don't create an LR model if the problem does not have a training set (e.g. loaded into an existing model)
  }

  var usedVariables = Content.Model.VariablesUsedForPrediction;

  var usedDoubleVariables = usedVariables
    .Where(name => problemData.Dataset.VariableHasType<double>(name))
    .Distinct();

  var usedFactorVariables = usedVariables
    .Where(name => problemData.Dataset.VariableHasType<string>(name))
    .Distinct();

  // gkronber: for binary factors we actually produce a binary variable in the new dataset
  // but only if the variable is not used as a full factor anyway (LR creates binary columns anyway)
  // BUG FIX: the tuple must pair the variable NAME with the factor value. It previously
  // paired the value with itself (VariableValue twice), which made the
  // GetReadOnlyStringValues(t.Item1) lookup below use a factor level instead of a
  // dataset variable name and produced wrong "name=value" column headers.
  var usedBinaryFactors =
    Content.Model.SymbolicExpressionTree.IterateNodesPostfix().OfType<BinaryFactorVariableTreeNode>()
    .Where(node => !usedFactorVariables.Contains(node.VariableName))
    .Select(node => Tuple.Create(node.VariableName, node.VariableValue));

  // create a new problem and dataset (target variable is always the last column)
  var variableNames =
    usedDoubleVariables
    .Concat(usedFactorVariables)
    .Concat(usedBinaryFactors.Select(t => t.Item1 + "=" + t.Item2))
    .Concat(new string[] { problemData.TargetVariable })
    .ToArray();
  var variableValues =
    usedDoubleVariables.Select(name => (IList)problemData.Dataset.GetDoubleValues(name).ToList())
    .Concat(usedFactorVariables.Select(name => problemData.Dataset.GetStringValues(name).ToList()))
    .Concat(
      // create a 0/1 indicator column for each used binary factor level
      usedBinaryFactors.Select(t => problemData.Dataset.GetReadOnlyStringValues(t.Item1).Select(val => val == t.Item2 ? 1.0 : 0.0).ToList())
    )
    .Concat(new[] { problemData.Dataset.GetDoubleValues(problemData.TargetVariable).ToList() });

  var newDs = new Dataset(variableNames, variableValues);
  var newProblemData = new RegressionProblemData(newDs, variableNames.Take(variableNames.Length - 1), variableNames.Last());
  // keep the original partition layout so the baseline is evaluated on the same rows
  newProblemData.TrainingPartition.Start = problemData.TrainingPartition.Start;
  newProblemData.TrainingPartition.End = problemData.TrainingPartition.End;
  newProblemData.TestPartition.Start = problemData.TestPartition.Start;
  newProblemData.TestPartition.End = problemData.TestPartition.End;

  var solution = LinearRegression.CreateLinearRegressionSolution(newProblemData, out rmse, out cvRmsError);
  solution.Name = "Baseline (linear subset)";
  return solution;
}
// Measures the throughput of the given crossover operator on randomly generated
// trees and verifies that every tree is still structurally valid afterwards.
private static void SymbolicDataAnalysisCrossoverPerformanceTest(ISymbolicDataAnalysisExpressionCrossover <IRegressionProblemData> crossover) {
  var twister = new MersenneTwister(31415); // fixed seed for reproducible timings
  var dataset = Util.CreateRandomDataset(twister, Rows, Columns);
  var grammar = new FullFunctionalExpressionGrammar();
  var stopwatch = new Stopwatch();
  // disable automatically defined functions so only plain expression trees are generated
  grammar.MaximumFunctionArguments = 0;
  grammar.MaximumFunctionDefinitions = 0;
  grammar.MinimumFunctionArguments = 0;
  grammar.MinimumFunctionDefinitions = 0;
  var trees = Util.CreateRandomTrees(twister, dataset, grammar, PopulationSize, 1, MaxTreeLength, 0, 0);
  foreach (ISymbolicExpressionTree tree in trees) {
    Util.InitTree(tree, twister, new List <string>(dataset.VariableNames));
  }
  // last variable serves as the regression target
  var problemData = new RegressionProblemData(dataset, dataset.VariableNames, dataset.VariableNames.Last());
  var problem = new SymbolicRegressionSingleObjectiveProblem();
  problem.ProblemData = problemData;
  // build the execution context hierarchy the operator expects (problem -> crossover)
  var globalScope = new Scope("Global Scope");
  globalScope.Variables.Add(new Core.Variable("Random", twister));
  var context = new ExecutionContext(null, problem, globalScope);
  context = new ExecutionContext(context, crossover, globalScope);
  stopwatch.Start();
  for (int i = 0; i != PopulationSize; ++i) {
    // each parent is placed in its own sub-scope, as the crossover operator expects
    var parent0 = (ISymbolicExpressionTree)trees.SampleRandom(twister).Clone();
    var scopeParent0 = new Scope();
    scopeParent0.Variables.Add(new Core.Variable(crossover.ParentsParameter.ActualName, parent0));
    context.Scope.SubScopes.Add(scopeParent0);
    var parent1 = (ISymbolicExpressionTree)trees.SampleRandom(twister).Clone();
    var scopeParent1 = new Scope();
    scopeParent1.Variables.Add(new Core.Variable(crossover.ParentsParameter.ActualName, parent1));
    context.Scope.SubScopes.Add(scopeParent1);
    crossover.Execute(context, new CancellationToken());
    context.Scope.SubScopes.Remove(scopeParent0); // clean the scope in preparation for the next iteration
    context.Scope.SubScopes.Remove(scopeParent1); // clean the scope in preparation for the next iteration
  }
  stopwatch.Stop();
  // NOTE(review): the elapsed time is doubled here — presumably to account for the two
  // parents processed per Execute call; confirm the intended metric.
  double msPerCrossover = 2 * stopwatch.ElapsedMilliseconds / (double)PopulationSize;
  Console.WriteLine(crossover.Name + ": " + Environment.NewLine + msPerCrossover + " ms per crossover (~" + Math.Round(1000.0 / (msPerCrossover)) + " crossover operations / s)");
  foreach (var tree in trees) {
    HeuristicLab.Encodings.SymbolicExpressionTreeEncoding.Tests.Util.IsValid(tree);
  }
}
// Creates and populates the state scope for decision-tree regression: a reduced
// training-only problem data set, the shared tree parameters, initialized
// pruning/splitter/leaf operators, the (yet unbuilt) tree or rule-set model, and
// the training/pruning row index sets.
// Throws NotSupportedException for non-double variables or NaN/infinity values.
private static IScope InitializeScope(IRandom random, IRegressionProblemData problemData, IPruning pruning, int minLeafSize, ILeafModel leafModel, ISplitter splitter, bool generateRules, bool useHoldout, double holdoutSize) {
  var stateScope = new Scope("RegressionTreeStateScope");

  //reduce RegressionProblemData to AllowedInput & Target column wise and to TrainingSet row wise
  var doubleVars = new HashSet <string>(problemData.Dataset.DoubleVariables);
  var vars = problemData.AllowedInputVariables.Concat(new[] { problemData.TargetVariable }).ToArray();
  if (vars.Any(v => !doubleVars.Contains(v))) {
    throw new NotSupportedException("Decision tree regression supports only double valued input or output features.");
  }
  var doubles = vars.Select(v => problemData.Dataset.GetDoubleValues(v, problemData.TrainingIndices).ToArray()).ToArray();
  if (doubles.Any(v => v.Any(x => double.IsNaN(x) || double.IsInfinity(x)))) {
    throw new NotSupportedException("Decision tree regression does not support NaN or infinity values in the input dataset.");
  }
  var trainingData = new Dataset(vars, doubles);
  var pd = new RegressionProblemData(trainingData, problemData.AllowedInputVariables, problemData.TargetVariable);
  // all rows of the reduced dataset are training rows; the test partition is empty
  pd.TrainingPartition.End = pd.TestPartition.Start = pd.TestPartition.End = pd.Dataset.Rows;
  pd.TrainingPartition.Start = 0;

  //store regression tree parameters
  var regressionTreeParams = new RegressionTreeParameters(pruning, minLeafSize, leafModel, pd, random, splitter);
  stateScope.Variables.Add(new Variable(RegressionTreeParameterVariableName, regressionTreeParams));

  //initialize tree operators
  pruning.Initialize(stateScope);
  splitter.Initialize(stateScope);
  leafModel.Initialize(stateScope);

  //store unbuilt model (rule set or tree, depending on the requested mode)
  IItem model;
  if (generateRules) {
    model = RegressionRuleSetModel.CreateRuleModel(problemData.TargetVariable, regressionTreeParams);
    RegressionRuleSetModel.Initialize(stateScope);
  } else {
    model = RegressionNodeTreeModel.CreateTreeModel(problemData.TargetVariable, regressionTreeParams);
  }
  stateScope.Variables.Add(new Variable(ModelVariableName, model));

  //store training & pruning indices (optionally held out from the training rows)
  IReadOnlyList <int> trainingSet, pruningSet;
  GeneratePruningSet(pd.TrainingIndices.ToArray(), random, useHoldout, holdoutSize, out trainingSet, out pruningSet);
  stateScope.Variables.Add(new Variable(TrainingSetVariableName, new IntArray(trainingSet.ToArray())));
  stateScope.Variables.Add(new Variable(PruningSetVariableName, new IntArray(pruningSet.ToArray())));
  return stateScope;
}
private static IRegressionProblemData CreateProblemData(IRegressionProblemData pd, IDataset data, IReadOnlyList <string> allowedNames) {
  // Creates a new problem data instance backed by the given dataset and allowed
  // input variables, copying the partition layout and name from the template pd.
  var clone = new RegressionProblemData(data, allowedNames, pd.TargetVariable);
  clone.TrainingPartition.Start = pd.TrainingPartition.Start;
  clone.TrainingPartition.End = pd.TrainingPartition.End;
  clone.TestPartition.Start = pd.TestPartition.Start;
  clone.TestPartition.End = pd.TestPartition.End;
  clone.Name = pd.Name;
  return clone;
}
private static IRegressionProblemData Subselect(IRegressionProblemData data, IReadOnlyList <int> training, IReadOnlyList <int> test) {
  // Reduces the dataset to the given rows (training rows first, then test rows)
  // and sets the partitions of the resulting problem data accordingly.
  var selectedRows = training.Concat(test).ToList();
  var reducedDataset = RegressionTreeUtilities.ReduceDataset(data.Dataset, selectedRows, data.AllowedInputVariables.ToList(), data.TargetVariable);
  var result = new RegressionProblemData(reducedDataset, data.AllowedInputVariables, data.TargetVariable);
  result.TrainingPartition.Start = 0;
  result.TrainingPartition.End = training.Count;
  result.TestPartition.Start = training.Count;
  result.TestPartition.End = training.Count + test.Count;
  return result;
}
public IRegressionProblemData GenerateRegressionData(Dataset dataset) {
  // Wraps the given dataset in regression problem data configured with this
  // instance's variables, name, description and partition boundaries.
  var regData = new RegressionProblemData(dataset, AllowedInputVariables, TargetVariable);
  regData.Name = Name;
  regData.Description = Description;
  regData.TrainingPartition.Start = TrainingPartitionStart;
  regData.TrainingPartition.End = TrainingPartitionEnd;
  regData.TestPartition.Start = TestPartitionStart;
  regData.TestPartition.End = TestPartitionEnd;
  return regData;
}
private static PreconstructedLinearModel FindBestModel(Dictionary <string, double> variances, Dictionary <string, double> means, double yMean, double yVariance, IRegressionProblemData pd, IList <string> variables) {
  // Fits a linear model by iterative regression: first repeatedly removes
  // colinear attributes, then greedily eliminates the attribute with the
  // smallest variance-scaled coefficient while the Akaike criterion improves.
  Dictionary <string, double> coeffs;
  double intercept;
  // repeat until DeselectColinear removes no further attribute
  do {
    coeffs = DoRegression(pd, variables, variances, means, yMean, 1.0e-8, out intercept);
    variables = DeselectColinear(variances, coeffs, yVariance, pd, variables);
  } while (coeffs.Count != variables.Count);

  var numAtts = variables.Count;
  var numInst = pd.TrainingIndices.Count();
  var fullMse = CalculateSE(coeffs, intercept, pd, variables);
  var akaike = 1.0 * (numInst - numAtts) + 2 * numAtts;

  var improved = true;
  var currentNumAttributes = numAtts;
  while (improved && currentNumAttributes > 1) {
    improved = false;
    currentNumAttributes--;
    // Find attribute with smallest SC (variance-scaled coefficient)
    var candidate = variables.ToDictionary(v => v, v => Math.Abs(coeffs[v] * Math.Sqrt(variances[v] / yVariance)))
                             .OrderBy(x => x.Value).Select(x => x.Key).First();
    var currVariables = variables.Where(v => !v.Equals(candidate)).ToList();
    double currentIntercept; // no initialization needed: DoRegression assigns via out
    var currentCoeffs = DoRegression(pd, currVariables, variances, means, yMean, 1.0e-8, out currentIntercept);
    var currentMse = CalculateSE(currentCoeffs, currentIntercept, pd, currVariables);
    var currentAkaike = currentMse / fullMse * (numInst - numAtts) + 2 * currentNumAttributes;
    // keep the candidate only if it strictly improves the criterion;
    // otherwise the loop terminates because improved stays false
    if (!(currentAkaike < akaike)) { continue; }
    improved = true;
    akaike = currentAkaike;
    coeffs = currentCoeffs;
    intercept = currentIntercept;
    variables = currVariables;
  }

  // NOTE(review): the original code constructed an additional RegressionProblemData
  // (pd2) mirroring pd's partitions here, but it was never used — removed as dead code.
  return new PreconstructedLinearModel(coeffs, intercept, pd.TargetVariable);
}
private IDataAnalysisProblemData CreateRegressionData(RegressionProblemData oldProblemData) {
  // Rebuilds regression problem data against the exported dataset; falls back to
  // the first available variable when the old target no longer exists in the data.
  var targetVariable = context.Data.VariableNames.Contains(oldProblemData.TargetVariable)
    ? oldProblemData.TargetVariable
    : context.Data.VariableNames.First();
  var inputVariables = GetDoubleInputVariables(targetVariable);
  return new RegressionProblemData(ExportedDataset, inputVariables, targetVariable, Transformations);
}
static void Main(string[] args) {
  // Loads a simple sinus CSV dataset, configures a symbolic regression GA and runs
  // it to completion; blocks on the mutex until the event handlers signal the end.
  var path = @"D:\Data\Sinus\SimpleSinusHeuristicLab.csv";
  var parser = new TableFileParser();
  parser.Parse(path, columnNamesInFirstLine: true);
  var dataset = new Dataset(parser.VariableNames, parser.Values);
  // last double column is the target, all preceding columns are inputs
  var problemData = new RegressionProblemData(dataset, dataset.DoubleVariables.Take(dataset.Columns - 1), dataset.DoubleVariables.Last());
  var grammar = new TypeCoherentExpressionGrammar();
  // removed: unused local `var add = new Addition();` and commented-out
  // Subtraction/Multiplication/Division locals (dead code)
  grammar.ConfigureAsDefaultRegressionGrammar();
  var problem = new SymbolicRegressionSingleObjectiveProblem {
    ProblemData = problemData,
    SymbolicExpressionTreeInterpreter = new SymbolicDataAnalysisExpressionTreeLinearInterpreter(),
    SymbolicExpressionTreeGrammar = grammar,
    EvaluatorParameter = { Value = new SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator() },
    SolutionCreatorParameter = { Value = new SymbolicDataAnalysisExpressionTreeCreator() }
  };
  var ga = new GeneticAlgorithm {
    PopulationSize = { Value = 1000 },
    MaximumGenerations = { Value = 100 },
    Problem = problem,
    MutationProbability = new PercentValue(25),
    Engine = new SequentialEngine()
  };
  var manipulator = new MultiSymbolicExpressionTreeManipulator();
  ga.MutatorParameter.ValidValues.Add(manipulator);
  ga.ExecutionStateChanged += OnExecutionStateChanged;
  ga.ExecutionTimeChanged += OnExecutionTimeChanged;
  ga.Prepare();
  ga.Start();
  // wait until the algorithm signals completion (mutex released in the handlers)
  mutex.WaitOne();
}
private void BuildTree(double[,] xy, string[] allVariables, int maxSize) {
  // Trains a single-iteration GBM on the given matrix (first column = target,
  // remaining columns = inputs) and prints the resulting regression tree.
  int rowCount = xy.GetLength(0);
  var dataset = new Dataset(allVariables, xy);
  var inputVariables = allVariables.Skip(1);
  var problemData = new RegressionProblemData(dataset, inputVariables, allVariables.First());
  // use every row for training; the test partition is empty
  problemData.TrainingPartition.Start = 0;
  problemData.TrainingPartition.End = rowCount;
  problemData.TestPartition.Start = rowCount;
  problemData.TestPartition.End = rowCount;
  var solution = GradientBoostedTreesAlgorithmStatic.TrainGbm(problemData, new SquaredErrorLoss(), maxSize, nu: 1, r: 1, m: 1, maxIterations: 1, randSeed: 31415);
  // the tree of interest is the second model in the ensemble (the first is skipped —
  // presumably an initial offset model; confirm against TrainGbm's output structure)
  var tree = solution.Model.Models.Skip(1).First() as RegressionTreeModel;
  Console.WriteLine(tree.ToString());
  Console.WriteLine();
}
protected override IRegressionProblemData ImportData(string path, RegressionImportType type, TableFileParser csvFileParser) {
  // Builds regression problem data from parsed CSV content, optionally shuffling
  // rows and excluding input variables that are constant on the training partition.
  List <IList> values = csvFileParser.Values;
  if (type.Shuffle) {
    values = Shuffle(values);
  }
  var dataset = new Dataset(csvFileParser.VariableNames, values);

  // the training partition covers the requested percentage of rows, but at least one row
  int trainingPartEnd = (csvFileParser.Rows * type.TrainingPercentage) / 100;
  if (trainingPartEnd < 1) trainingPartEnd = 1;
  var trainingIndizes = Enumerable.Range(0, trainingPartEnd);

  // turn off input variables that are constant in the training partition
  var allowedInputVars = new List <string>();
  if (trainingIndizes.Count() >= 2) {
    foreach (var variableName in dataset.DoubleVariables) {
      if (dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 && variableName != type.TargetVariable) {
        allowedInputVars.Add(variableName);
      }
    }
  } else {
    // too few rows to detect constants — keep every non-target double variable
    allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(type.TargetVariable)));
  }

  var regressionData = new RegressionProblemData(dataset, allowedInputVars, type.TargetVariable);
  regressionData.TrainingPartition.Start = 0;
  regressionData.TrainingPartition.End = trainingPartEnd;
  regressionData.TestPartition.Start = trainingPartEnd;
  regressionData.TestPartition.End = csvFileParser.Rows;
  regressionData.Name = Path.GetFileName(path);
  return regressionData;
}
public override IRegressionProblemData ImportData(string path) {
  // Imports a CSV file as regression problem data. The last double column is used
  // as the target, the first two thirds of the rows as the training partition, and
  // input variables that are constant over the training rows are excluded.
  TableFileParser csvFileParser = new TableFileParser();
  csvFileParser.Parse(path, csvFileParser.AreColumnNamesInFirstLine(path));
  Dataset dataset = new Dataset(csvFileParser.VariableNames, csvFileParser.Values);
  string targetVar = dataset.DoubleVariables.Last();

  // training partition covers the first two thirds of the rows, but at least one
  // row (same convention as the typed ImportData overload)
  int trainingPartEnd = (csvFileParser.Rows * 2) / 3;
  if (trainingPartEnd < 1) trainingPartEnd = 1;
  var trainingIndizes = Enumerable.Range(0, trainingPartEnd);

  // turn off input variables that are constant in the training partition
  var allowedInputVars = new List <string>();
  if (trainingIndizes.Count() >= 2) {
    foreach (var variableName in dataset.DoubleVariables) {
      if (dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 && variableName != targetVar) {
        allowedInputVars.Add(variableName);
      }
    }
  } else {
    // too few rows to detect constants — keep every non-target double variable
    allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(targetVar)));
  }

  IRegressionProblemData regressionData = new RegressionProblemData(dataset, allowedInputVars, targetVar);
  // BUG FIX: the partition bounds previously used trainingIndizes.Last(), i.e. the
  // last training *index*. Since partition ends are exclusive this silently dropped
  // the last training row, and Last()/First() threw on datasets with < 2 rows.
  regressionData.TrainingPartition.Start = 0;
  regressionData.TrainingPartition.End = trainingPartEnd;
  regressionData.TestPartition.Start = trainingPartEnd;
  regressionData.TestPartition.End = csvFileParser.Rows;
  regressionData.Name = Path.GetFileName(path);
  return regressionData;
}
public void PerformanceVariableImpactRegressionTest() {
  // Measures variable-impact calculation throughput on a large random dataset
  // with target variable "y", reporting processed cells per millisecond.
  const int rows = 20000;
  const int columns = 77;
  var dataSet = OnlineCalculatorPerformanceTest.CreateRandomDataset(new MersenneTwister(1234), rows, columns);
  IRegressionProblemData problemData = new RegressionProblemData(dataSet, dataSet.VariableNames.Except("y".ToEnumerable()), "y");
  // out values are required by the API but not used here
  double rmsError, cvRmsError;
  var solution = LinearRegression.CreateSolution(problemData, out rmsError, out cvRmsError);

  var watch = Stopwatch.StartNew();
  var results = RegressionSolutionVariableImpactsCalculator.CalculateImpacts(solution);
  watch.Stop();

  TestContext.WriteLine("");
  TestContext.WriteLine("Calculated cells per millisecond: {0}.", rows * columns / watch.ElapsedMilliseconds);
}
public IRegressionModel BuildModel(IReadOnlyList <int> rows, RegressionTreeParameters parameters, CancellationToken cancellation, out int numberOfParameters) {
  // Builds a leaf model on the given row subset. The reduced problem data treats
  // all selected rows as training data (empty test partition).
  var reducedData = RegressionTreeUtilities.ReduceDataset(parameters.Data, rows, parameters.AllowedInputVariables.ToArray(), parameters.TargetVariable);
  var pd = new RegressionProblemData(reducedData, parameters.AllowedInputVariables.ToArray(), parameters.TargetVariable);
  pd.TrainingPartition.Start = 0;
  pd.TrainingPartition.End = pd.TestPartition.Start = pd.TestPartition.End = reducedData.Rows;

  int parameterCount;
  var model = Build(pd, parameters.Random, cancellation, out parameterCount);
  // optionally dampen the built model
  if (UseDampening && Dampening > 0.0) {
    model = DampenedModel.DampenModel(model, pd, Dampening);
  }
  numberOfParameters = parameterCount;
  cancellation.ThrowIfCancellationRequested();
  return model;
}
// wraps the list of basis functions into an IRegressionProblemData object
private static IRegressionProblemData PrepareData(IRegressionProblemData problemData, IEnumerable <BasisFunction> basisFunctions) {
  // one dataset column per basis function, plus the unmodified target column
  var names = new HashSet <string>();
  var columns = new List <IList>();
  foreach (var basisFunc in basisFunctions) {
    // NOTE(review): names is a set while columns is a list — duplicate basis
    // function names would make the counts diverge; confirm inputs are unique
    names.Add(basisFunc.Var);
    columns.Add(new List <double>(basisFunc.Val));
  }
  var matrix = new ModifiableDataset(names, columns);
  // add the unmodified target variable to the matrix
  matrix.AddVariable(problemData.TargetVariable, problemData.TargetVariableValues.ToList());

  var allowedInputVars = matrix.VariableNames.Where(x => !x.Equals(problemData.TargetVariable)).ToArray();
  IRegressionProblemData rpd = new RegressionProblemData(matrix, allowedInputVars, problemData.TargetVariable);
  rpd.TargetVariable = problemData.TargetVariable;
  // copy the partition layout of the original problem data
  rpd.TrainingPartition.Start = problemData.TrainingPartition.Start;
  rpd.TrainingPartition.End = problemData.TrainingPartition.End;
  rpd.TestPartition.Start = problemData.TestPartition.Start;
  rpd.TestPartition.End = problemData.TestPartition.End;
  return rpd;
}
// Gradient-boosting driver: repeatedly fits the wrapped regression algorithm to
// the current residuals on sampled rows/variables, accumulates the dampened
// predictions, tracks R² per iteration and finally emits an ensemble (and, when
// all members are symbolic, a combined symbolic) solution.
protected override void Run(CancellationToken cancellationToken) {
  // Set up the algorithm
  if (SetSeedRandomly) {
    Seed = RandomSeedGenerator.GetSeed();
  }
  var rand = new MersenneTwister((uint)Seed);

  // Set up the results display
  var iterations = new IntValue(0);
  Results.Add(new Result("Iterations", iterations));
  var table = new DataTable("Qualities");
  table.Rows.Add(new DataRow("R² (train)"));
  table.Rows.Add(new DataRow("R² (test)"));
  Results.Add(new Result("Qualities", table));
  var curLoss = new DoubleValue();
  var curTestLoss = new DoubleValue();
  Results.Add(new Result("R² (train)", curLoss));
  Results.Add(new Result("R² (test)", curTestLoss));
  var runCollection = new RunCollection();
  if (StoreRuns) {
    Results.Add(new Result("Runs", runCollection));
  }

  // init: copy all active variables into a modifiable dataset so the target
  // column can be swapped for the current residuals each iteration
  var problemData = Problem.ProblemData;
  var targetVarName = problemData.TargetVariable;
  var activeVariables = problemData.AllowedInputVariables.Concat(new string[] { problemData.TargetVariable });
  var modifiableDataset = new ModifiableDataset(
    activeVariables,
    activeVariables.Select(v => problemData.Dataset.GetDoubleValues(v).ToList()));
  var trainingRows = problemData.TrainingIndices;
  var testRows = problemData.TestIndices;
  // yPred/yPredTest accumulate the boosted predictions; curY/curYTest hold the
  // current residuals (y - yPred) that each new model is trained against
  var yPred = new double[trainingRows.Count()];
  var yPredTest = new double[testRows.Count()];
  var y = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices).ToArray();
  var curY = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices).ToArray();
  var yTest = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TestIndices).ToArray();
  var curYTest = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TestIndices).ToArray();
  var nu = Nu; // learning rate applied to each model's predictions
  var mVars = (int)Math.Ceiling(M * problemData.AllowedInputVariables.Count()); // variables sampled per iteration
  var rRows = (int)Math.Ceiling(R * problemData.TrainingIndices.Count());       // rows sampled per iteration
  var alg = RegressionAlgorithm;
  List <IRegressionModel> models = new List <IRegressionModel>();
  try {
    // Loop until iteration limit reached or canceled.
    for (int i = 0; i < Iterations; i++) {
      cancellationToken.ThrowIfCancellationRequested();

      // replace the target column with the current residuals and sample rows
      modifiableDataset.RemoveVariable(targetVarName);
      modifiableDataset.AddVariable(targetVarName, curY.Concat(curYTest).ToList());

      SampleTrainingData(rand, modifiableDataset, rRows, problemData.Dataset, curY, problemData.TargetVariable, problemData.TrainingIndices);
      // all training indices from the original problem data are allowed
      var modifiableProblemData = new RegressionProblemData(modifiableDataset,
                                                            problemData.AllowedInputVariables.SampleRandomWithoutRepetition(rand, mVars),
                                                            problemData.TargetVariable);
      modifiableProblemData.TrainingPartition.Start = 0;
      modifiableProblemData.TrainingPartition.End = rRows;
      modifiableProblemData.TestPartition.Start = problemData.TestPartition.Start;
      modifiableProblemData.TestPartition.End = problemData.TestPartition.End;

      if (!TrySetProblemData(alg, modifiableProblemData)) {
        throw new NotSupportedException("The algorithm cannot be used with GBM.");
      }

      IRegressionModel model;
      IRun run;
      // try to find a model. The algorithm might fail to produce a model.
      // In this case we just retry until the iterations are exhausted
      if (TryExecute(alg, rand.Next(), RegressionAlgorithmResult, out model, out run)) {
        int row = 0;
        // update predictions for training and test
        // update new targets (in the case of squared error loss we simply use negative residuals)
        foreach (var pred in model.GetEstimatedValues(problemData.Dataset, trainingRows)) {
          yPred[row] = yPred[row] + nu * pred;
          curY[row] = y[row] - yPred[row];
          row++;
        }
        row = 0;
        foreach (var pred in model.GetEstimatedValues(problemData.Dataset, testRows)) {
          yPredTest[row] = yPredTest[row] + nu * pred;
          curYTest[row] = yTest[row] - yPredTest[row];
          row++;
        }
        // determine quality
        // NOTE(review): `error` is overwritten by the second Calculate call, so the
        // train-R² validity check below actually tests the *test* calculator's error
        // state — confirm whether separate error variables were intended.
        OnlineCalculatorError error;
        var trainR = OnlinePearsonsRCalculator.Calculate(yPred, y, out error);
        var testR = OnlinePearsonsRCalculator.Calculate(yPredTest, yTest, out error);
        // iteration results
        curLoss.Value = error == OnlineCalculatorError.None ? trainR * trainR : 0.0;
        curTestLoss.Value = error == OnlineCalculatorError.None ? testR * testR : 0.0;
        models.Add(model);
      }

      if (StoreRuns) {
        runCollection.Add(run);
      }
      table.Rows["R² (train)"].Values.Add(curLoss.Value);
      table.Rows["R² (test)"].Values.Add(curTestLoss.Value);
      iterations.Value = i + 1;
    }

    // produce solution
    if (CreateSolution) {
      // when all our models are symbolic models we can easily combine them to a single model
      if (models.All(m => m is ISymbolicRegressionModel)) {
        Results.Add(new Result("Solution", CreateSymbolicSolution(models, Nu, (IRegressionProblemData)problemData.Clone())));
      }
      // just produce an ensemble solution for now (TODO: correct scaling or linear regression for ensemble model weights)
      var ensembleSolution = CreateEnsembleSolution(models, (IRegressionProblemData)problemData.Clone());
      Results.Add(new Result("EnsembleSolution", ensembleSolution));
    }
  } finally {
    // reset everything
    alg.Prepare(true);
  }
}
// Measures the throughput of the given crossover operator on randomly generated
// trees and verifies that every tree is still structurally valid afterwards.
// NOTE(review): this method duplicates an identical helper elsewhere in the file;
// consider consolidating.
private static void SymbolicDataAnalysisCrossoverPerformanceTest(ISymbolicDataAnalysisExpressionCrossover<IRegressionProblemData> crossover) {
  var twister = new MersenneTwister(31415); // fixed seed for reproducible timings
  var dataset = Util.CreateRandomDataset(twister, Rows, Columns);
  var grammar = new FullFunctionalExpressionGrammar();
  var stopwatch = new Stopwatch();
  // disable automatically defined functions so only plain expression trees are generated
  grammar.MaximumFunctionArguments = 0;
  grammar.MaximumFunctionDefinitions = 0;
  grammar.MinimumFunctionArguments = 0;
  grammar.MinimumFunctionDefinitions = 0;
  var trees = Util.CreateRandomTrees(twister, dataset, grammar, PopulationSize, 1, MaxTreeLength, 0, 0);
  foreach (ISymbolicExpressionTree tree in trees) {
    Util.InitTree(tree, twister, new List<string>(dataset.VariableNames));
  }
  // last variable serves as the regression target
  var problemData = new RegressionProblemData(dataset, dataset.VariableNames, dataset.VariableNames.Last());
  var problem = new SymbolicRegressionSingleObjectiveProblem();
  problem.ProblemData = problemData;
  // build the execution context hierarchy the operator expects (problem -> crossover)
  var globalScope = new Scope("Global Scope");
  globalScope.Variables.Add(new Core.Variable("Random", twister));
  var context = new ExecutionContext(null, problem, globalScope);
  context = new ExecutionContext(context, crossover, globalScope);
  stopwatch.Start();
  for (int i = 0; i != PopulationSize; ++i) {
    // each parent is placed in its own sub-scope, as the crossover operator expects
    var parent0 = (ISymbolicExpressionTree)trees.SampleRandom(twister).Clone();
    var scopeParent0 = new Scope();
    scopeParent0.Variables.Add(new Core.Variable(crossover.ParentsParameter.ActualName, parent0));
    context.Scope.SubScopes.Add(scopeParent0);
    var parent1 = (ISymbolicExpressionTree)trees.SampleRandom(twister).Clone();
    var scopeParent1 = new Scope();
    scopeParent1.Variables.Add(new Core.Variable(crossover.ParentsParameter.ActualName, parent1));
    context.Scope.SubScopes.Add(scopeParent1);
    crossover.Execute(context, new CancellationToken());
    context.Scope.SubScopes.Remove(scopeParent0); // clean the scope in preparation for the next iteration
    context.Scope.SubScopes.Remove(scopeParent1); // clean the scope in preparation for the next iteration
  }
  stopwatch.Stop();
  // NOTE(review): the elapsed time is doubled here — presumably to account for the two
  // parents processed per Execute call; confirm the intended metric.
  double msPerCrossover = 2 * stopwatch.ElapsedMilliseconds / (double)PopulationSize;
  Console.WriteLine(crossover.Name + ": " + Environment.NewLine + msPerCrossover + " ms per crossover (~" + Math.Round(1000.0 / (msPerCrossover)) + " crossover operations / s)");
  foreach (var tree in trees)
    HeuristicLab.Encodings.SymbolicExpressionTreeEncoding.Tests.Util.IsValid(tree);
}