// Type initializer: builds the default benchmark dataset (Koza F1 polynomial)
// and the shared placeholder instance used before real problem data is loaded.
static RegressionProblemData() {
  defaultDataset = new Dataset(new string[] { "y", "x" }, kozaF1);
  defaultDataset.Name = "Fourth-order Polynomial Function Benchmark Dataset";
  defaultDataset.Description = "f(x) = x^4 + x^3 + x^2 + x^1";
  defaultAllowedInputVariables = new List<string>() { "x" };
  defaultTargetVariable = "y";

  // The placeholder carries empty parameters so it can be displayed/serialized
  // without any actual data behind it.
  var placeholder = new RegressionProblemData();
  placeholder.Parameters.Clear();
  placeholder.Name = "Empty Regression ProblemData";
  placeholder.Description = "This ProblemData acts as place holder before the correct problem data is loaded.";
  placeholder.isEmpty = true;
  placeholder.Parameters.Add(new FixedValueParameter<Dataset>(DatasetParameterName, "", new Dataset()));
  placeholder.Parameters.Add(new FixedValueParameter<ReadOnlyCheckedItemList<StringValue>>(InputVariablesParameterName, ""));
  placeholder.Parameters.Add(new FixedValueParameter<IntRange>(TrainingPartitionParameterName, "", (IntRange)new IntRange(0, 0).AsReadOnly()));
  placeholder.Parameters.Add(new FixedValueParameter<IntRange>(TestPartitionParameterName, "", (IntRange)new IntRange(0, 0).AsReadOnly()));
  placeholder.Parameters.Add(new ConstrainedValueParameter<StringValue>(TargetVariableParameterName, new ItemSet<StringValue>()));
  emptyProblemData = placeholder;
}
/// <summary>
/// Imports a CSV file as regression problem data. The last double variable is used
/// as the target; the first two thirds of the rows become the training partition.
/// Input variables that are constant over the training rows are disabled.
/// </summary>
/// <param name="path">Path of the CSV file to parse.</param>
/// <returns>The imported problem data, named after the file.</returns>
public override IRegressionProblemData ImportData(string path) {
  TableFileParser csvFileParser = new TableFileParser();
  csvFileParser.Parse(path, csvFileParser.AreColumnNamesInFirstLine(path));
  Dataset dataset = new Dataset(csvFileParser.VariableNames, csvFileParser.Values);
  string targetVar = dataset.DoubleVariables.Last();

  // Use the first 2/3 of the rows for training, but always at least one row
  // (previously an empty range caused trainingIndizes.Last() to throw for tiny files).
  int trainingPartEnd = (csvFileParser.Rows * 2) / 3;
  trainingPartEnd = trainingPartEnd > 0 ? trainingPartEnd : 1;
  var trainingIndizes = Enumerable.Range(0, trainingPartEnd);

  // turn off input variables that are constant in the training partition
  var allowedInputVars = new List<string>();
  if (trainingPartEnd >= 2) {
    foreach (var variableName in dataset.DoubleVariables) {
      if (dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 && variableName != targetVar)
        allowedInputVars.Add(variableName);
    }
  } else {
    // too few rows to detect constant columns; allow everything except the target
    allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(targetVar)));
  }

  IRegressionProblemData regressionData = new RegressionProblemData(dataset, allowedInputVars, targetVar);
  // BUGFIX: partition ends are exclusive (see the sibling ImportData overload, which
  // uses the row *count*). The old code used trainingIndizes.Last() (== count - 1),
  // which dropped the last training row and overlapped it into the test partition.
  regressionData.TrainingPartition.Start = 0;
  regressionData.TrainingPartition.End = trainingPartEnd;
  regressionData.TestPartition.Start = trainingPartEnd;
  regressionData.TestPartition.End = csvFileParser.Rows;
  regressionData.Name = Path.GetFileName(path);
  return regressionData;
}
// Re-creates regression problem data against the currently exported dataset.
// Keeps the previous target variable if it still exists in the current data,
// otherwise falls back to the first available variable.
private IDataAnalysisProblemData CreateRegressionData(RegressionProblemData oldProblemData) {
  var targetVariable = context.Data.VariableNames.Contains(oldProblemData.TargetVariable)
    ? oldProblemData.TargetVariable
    : context.Data.VariableNames.First();
  var inputVariables = GetDoubleInputVariables(targetVariable);
  return new RegressionProblemData(ExportedDataset, inputVariables, targetVariable, Transformations);
}
// Builds a RegressionProblemData for the given dataset, copying this instance's
// name, description, allowed inputs, target variable, and partition boundaries.
public IRegressionProblemData GenerateRegressionData(Dataset dataset) {
  var result = new RegressionProblemData(dataset, AllowedInputVariables, TargetVariable);
  result.Name = this.Name;
  result.Description = this.Description;
  result.TrainingPartition.Start = this.TrainingPartitionStart;
  result.TrainingPartition.End = this.TrainingPartitionEnd;
  result.TestPartition.Start = this.TestPartitionStart;
  result.TestPartition.End = this.TestPartitionEnd;
  return result;
}
// Validates that the supplied problem data is non-null and is regression problem
// data before delegating the actual property adjustment to the base class.
public override void AdjustProblemDataProperties(IDataAnalysisProblemData problemData) {
  if (problemData == null) {
    throw new ArgumentNullException("problemData", "The provided problemData is null.");
  }
  if (!(problemData is RegressionProblemData)) {
    throw new ArgumentException("The problem data is not a regression problem data. Instead a " + problemData.GetType().GetPrettyName() + " was provided.", "problemData");
  }
  base.AdjustProblemDataProperties(problemData);
}
// Type initializer: sets up the shared default benchmark dataset and the
// "empty" placeholder problem data used before real data is loaded.
static RegressionProblemData() {
  // Koza F1: fourth-order polynomial f(x) = x^4 + x^3 + x^2 + x
  defaultDataset = new Dataset(new string[] { "y", "x" }, kozaF1);
  defaultDataset.Name = "Fourth-order Polynomial Function Benchmark Dataset";
  defaultDataset.Description = "f(x) = x^4 + x^3 + x^2 + x^1";
  defaultAllowedInputVariables = new List<string>() { "x" };
  defaultTargetVariable = "y";
  // Build a placeholder instance whose parameters are all empty.
  var problemData = new RegressionProblemData();
  problemData.Parameters.Clear();
  problemData.Name = "Empty Regression ProblemData";
  problemData.Description = "This ProblemData acts as place holder before the correct problem data is loaded.";
  problemData.isEmpty = true;
  // Re-add the standard parameter set, but backed by an empty dataset,
  // empty input list, zero-length partitions, and an empty target candidate set.
  problemData.Parameters.Add(new FixedValueParameter<Dataset>(DatasetParameterName, "", new Dataset()));
  problemData.Parameters.Add(new FixedValueParameter<ReadOnlyCheckedItemList<StringValue>>(InputVariablesParameterName, ""));
  problemData.Parameters.Add(new FixedValueParameter<IntRange>(TrainingPartitionParameterName, "", (IntRange)new IntRange(0, 0).AsReadOnly()));
  problemData.Parameters.Add(new FixedValueParameter<IntRange>(TestPartitionParameterName, "", (IntRange)new IntRange(0, 0).AsReadOnly()));
  problemData.Parameters.Add(new ConstrainedValueParameter<StringValue>(TargetVariableParameterName, new ItemSet<StringValue>()));
  emptyProblemData = problemData;
}
// Reacts to a change of the ensemble's problem data: invalidates cached
// evaluations, then propagates the new dataset/partitions to all member
// solutions and to the per-solution partition ranges.
protected override void OnProblemDataChanged() {
  trainingEvaluationCache.Clear();
  testEvaluationCache.Clear();
  evaluationCache.Clear();
  // Fresh copy of the problem data with identical partitions for the members.
  IRegressionProblemData problemData = new RegressionProblemData(ProblemData.Dataset, ProblemData.AllowedInputVariables, ProblemData.TargetVariable);
  problemData.TrainingPartition.Start = ProblemData.TrainingPartition.Start;
  problemData.TrainingPartition.End = ProblemData.TrainingPartition.End;
  problemData.TestPartition.Start = ProblemData.TestPartition.Start;
  problemData.TestPartition.End = ProblemData.TestPartition.End;
  foreach (var solution in RegressionSolutions) {
    // Nested ensembles keep the original problem data; plain solutions get the copy.
    if (solution is RegressionEnsembleSolution) {
      solution.ProblemData = ProblemData;
    } else {
      solution.ProblemData = problemData;
    }
  }
  // Keep the stored per-solution partition ranges in sync with the new data.
  foreach (var trainingPartition in trainingPartitions.Values) {
    trainingPartition.Start = ProblemData.TrainingPartition.Start;
    trainingPartition.End = ProblemData.TrainingPartition.End;
  }
  foreach (var testPartition in testPartitions.Values) {
    testPartition.Start = ProblemData.TestPartition.Start;
    testPartition.End = ProblemData.TestPartition.End;
  }
  base.OnProblemDataChanged();
}
// Deep-cloning constructor (HeuristicLab cloning pattern). Re-registers parameter
// event handlers on the clone — presumably because event subscriptions are not
// copied by the base clone; confirm against the base class.
protected RegressionProblemData(RegressionProblemData original, Cloner cloner) : base(original, cloner) { RegisterParameterEvents(); }
/// <summary>
/// Gradient boosting main loop: repeatedly fits the inner regression algorithm to
/// the current residuals on a random subsample of rows and variables, accumulating
/// the models with learning rate Nu. Reports R² on training and test per iteration.
/// </summary>
/// <param name="cancellationToken">Cancels the loop between iterations.</param>
protected override void Run(CancellationToken cancellationToken) {
  // Set up the algorithm
  if (SetSeedRandomly) Seed = new System.Random().Next();
  var rand = new MersenneTwister((uint)Seed);

  // Set up the results display
  var iterations = new IntValue(0);
  Results.Add(new Result("Iterations", iterations));
  var table = new DataTable("Qualities");
  table.Rows.Add(new DataRow("R² (train)"));
  table.Rows.Add(new DataRow("R² (test)"));
  Results.Add(new Result("Qualities", table));
  var curLoss = new DoubleValue();
  var curTestLoss = new DoubleValue();
  Results.Add(new Result("R² (train)", curLoss));
  Results.Add(new Result("R² (test)", curTestLoss));
  var runCollection = new RunCollection();
  if (StoreRuns)
    Results.Add(new Result("Runs", runCollection));

  // init: copy the active columns into a modifiable dataset so the target column
  // can be swapped for residuals each iteration
  var problemData = Problem.ProblemData;
  var targetVarName = problemData.TargetVariable;
  var activeVariables = problemData.AllowedInputVariables.Concat(new string[] { problemData.TargetVariable });
  var modifiableDataset = new ModifiableDataset(
    activeVariables,
    activeVariables.Select(v => problemData.Dataset.GetDoubleValues(v).ToList()));

  var trainingRows = problemData.TrainingIndices;
  var testRows = problemData.TestIndices;
  var yPred = new double[trainingRows.Count()];
  var yPredTest = new double[testRows.Count()];
  var y = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices).ToArray();
  var curY = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices).ToArray();
  var yTest = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TestIndices).ToArray();
  var curYTest = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TestIndices).ToArray();
  var nu = Nu;
  var mVars = (int)Math.Ceiling(M * problemData.AllowedInputVariables.Count());   // variables per iteration
  var rRows = (int)Math.Ceiling(R * problemData.TrainingIndices.Count());         // rows per iteration
  var alg = RegressionAlgorithm;
  List<IRegressionModel> models = new List<IRegressionModel>();
  try {
    // Loop until iteration limit reached or canceled.
    for (int i = 0; i < Iterations; i++) {
      cancellationToken.ThrowIfCancellationRequested();

      // replace the target column with the current residuals
      modifiableDataset.RemoveVariable(targetVarName);
      modifiableDataset.AddVariable(targetVarName, curY.Concat(curYTest));

      // all training indices from the original problem data are allowed
      SampleTrainingData(rand, modifiableDataset, rRows, problemData.Dataset, curY, problemData.TargetVariable, problemData.TrainingIndices);
      var modifiableProblemData = new RegressionProblemData(modifiableDataset,
        problemData.AllowedInputVariables.SampleRandomWithoutRepetition(rand, mVars),
        problemData.TargetVariable);
      modifiableProblemData.TrainingPartition.Start = 0;
      modifiableProblemData.TrainingPartition.End = rRows;
      modifiableProblemData.TestPartition.Start = problemData.TestPartition.Start;
      modifiableProblemData.TestPartition.End = problemData.TestPartition.End;

      if (!TrySetProblemData(alg, modifiableProblemData))
        throw new NotSupportedException("The algorithm cannot be used with GBM.");

      IRegressionModel model;
      IRun run;

      // try to find a model. The algorithm might fail to produce a model.
      // In this case we just retry until the iterations are exhausted.
      if (TryExecute(alg, rand.Next(), RegressionAlgorithmResult, out model, out run)) {
        int row = 0;
        // update predictions for training and test
        // update new targets (in the case of squared error loss we simply use negative residuals)
        foreach (var pred in model.GetEstimatedValues(problemData.Dataset, trainingRows)) {
          yPred[row] = yPred[row] + nu * pred;
          curY[row] = y[row] - yPred[row];
          row++;
        }
        row = 0;
        foreach (var pred in model.GetEstimatedValues(problemData.Dataset, testRows)) {
          yPredTest[row] = yPredTest[row] + nu * pred;
          curYTest[row] = yTest[row] - yPredTest[row];
          row++;
        }

        // determine quality
        // BUGFIX: use separate error flags per calculation. Previously a single
        // 'error' variable was overwritten by the second Calculate call, so the
        // *training* R² validity was checked against the *test* calculation's error.
        OnlineCalculatorError trainError, testError;
        var trainR = OnlinePearsonsRCalculator.Calculate(yPred, y, out trainError);
        var testR = OnlinePearsonsRCalculator.Calculate(yPredTest, yTest, out testError);

        // iteration results
        curLoss.Value = trainError == OnlineCalculatorError.None ? trainR * trainR : 0.0;
        curTestLoss.Value = testError == OnlineCalculatorError.None ? testR * testR : 0.0;

        models.Add(model);
      }

      if (StoreRuns)
        runCollection.Add(run);
      table.Rows["R² (train)"].Values.Add(curLoss.Value);
      table.Rows["R² (test)"].Values.Add(curTestLoss.Value);
      iterations.Value = i + 1;
    }

    // produce solution
    if (CreateSolution) {
      // when all our models are symbolic models we can easily combine them to a single model
      if (models.All(m => m is ISymbolicRegressionModel)) {
        Results.Add(new Result("Solution", CreateSymbolicSolution(models, Nu, (IRegressionProblemData)problemData.Clone())));
      }
      // just produce an ensemble solution for now (TODO: correct scaling or linear regression for ensemble model weights)
      var ensembleSolution = CreateEnsembleSolution(models, (IRegressionProblemData)problemData.Clone());
      Results.Add(new Result("EnsembleSolution", ensembleSolution));
    }
  } finally {
    // reset everything
    alg.Prepare(true);
  }
}
// Trains a single-iteration GBM on the given matrix (first column = target,
// remaining columns = inputs) and prints the resulting regression tree.
private void BuildTree(double[,] xy, string[] allVariables, int maxSize) {
  int nRows = xy.GetLength(0);
  var dataset = new Dataset(allVariables, xy);
  var problemData = new RegressionProblemData(dataset, allVariables.Skip(1), allVariables.First());
  // train on every row; leave the test partition empty
  problemData.TrainingPartition.Start = 0;
  problemData.TrainingPartition.End = nRows;
  problemData.TestPartition.Start = nRows;
  problemData.TestPartition.End = nRows;

  var solution = GradientBoostedTreesAlgorithmStatic.TrainGbm(problemData, new SquaredErrorLoss(), maxSize, nu: 1, r: 1, m: 1, maxIterations: 1, randSeed: 31415);
  // skip the first model of the ensemble (presumably a constant/offset model —
  // confirm against GradientBoostedTreesAlgorithmStatic) and print the tree
  var treeM = solution.Model.Models.Skip(1).First() as RegressionTreeModel;
  Console.WriteLine(treeM.ToString());
  Console.WriteLine();
}
/// <summary>
/// Imports already-parsed CSV content as regression problem data, optionally
/// shuffling rows, sizing the training partition from the requested percentage,
/// and disabling inputs that are constant over the training rows.
/// </summary>
protected override IRegressionProblemData ImportData(string path, RegressionImportType type, TableFileParser csvFileParser) {
  List<IList> values = csvFileParser.Values;
  if (type.Shuffle) {
    values = Shuffle(values);
  }
  Dataset dataset = new Dataset(csvFileParser.VariableNames, values);

  // training partition covers the requested percentage of rows, but at least one row
  int trainingPartEnd = (csvFileParser.Rows * type.TrainingPercentage) / 100;
  if (trainingPartEnd < 1) trainingPartEnd = 1;
  var trainingIndizes = Enumerable.Range(0, trainingPartEnd);

  // turn off input variables that are constant in the training partition
  var allowedInputVars = new List<string>();
  if (trainingPartEnd >= 2) {
    allowedInputVars.AddRange(dataset.DoubleVariables.Where(
      v => v != type.TargetVariable && dataset.GetDoubleValues(v, trainingIndizes).Range() > 0));
  } else {
    // a single training row cannot reveal constants; allow everything except the target
    allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(type.TargetVariable)));
  }

  var regressionData = new RegressionProblemData(dataset, allowedInputVars, type.TargetVariable);
  regressionData.TrainingPartition.Start = 0;
  regressionData.TrainingPartition.End = trainingPartEnd;
  regressionData.TestPartition.Start = trainingPartEnd;
  regressionData.TestPartition.End = csvFileParser.Rows;
  regressionData.Name = Path.GetFileName(path);
  return regressionData;
}
// Handles a change of the ensemble's problem data: drops cached evaluations, then
// pushes the new dataset and partition boundaries down to all member solutions
// and to the stored per-solution partition ranges.
protected override void OnProblemDataChanged() {
  trainingEvaluationCache.Clear();
  testEvaluationCache.Clear();
  evaluationCache.Clear();

  // fresh problem-data copy with identical partitions for the plain member solutions
  IRegressionProblemData memberProblemData = new RegressionProblemData(ProblemData.Dataset, ProblemData.AllowedInputVariables, ProblemData.TargetVariable);
  memberProblemData.TrainingPartition.Start = ProblemData.TrainingPartition.Start;
  memberProblemData.TrainingPartition.End = ProblemData.TrainingPartition.End;
  memberProblemData.TestPartition.Start = ProblemData.TestPartition.Start;
  memberProblemData.TestPartition.End = ProblemData.TestPartition.End;

  // nested ensembles keep the original problem data; plain solutions get the copy
  foreach (var solution in RegressionSolutions) {
    solution.ProblemData = solution is RegressionEnsembleSolution ? ProblemData : memberProblemData;
  }

  foreach (var range in trainingPartitions.Values) {
    range.Start = ProblemData.TrainingPartition.Start;
    range.End = ProblemData.TrainingPartition.End;
  }
  foreach (var range in testPartitions.Values) {
    range.Start = ProblemData.TestPartition.Start;
    range.End = ProblemData.TestPartition.End;
  }
  base.OnProblemDataChanged();
}