/// <summary>
/// Type initializer: assigns the default benchmark dataset, default input/target variable names,
/// and builds the shared "empty" placeholder instance stored in <c>emptyProblemData</c>.
/// </summary>
static RegressionProblemData()
        {
            // NOTE(review): the parameterless constructor called below presumably reads these
            // defaults, so they are assigned first — confirm before reordering.
            defaultDataset = new Dataset(new[] { "y", "x" }, kozaF1)
            {
                Name        = "Fourth-order Polynomial Function Benchmark Dataset",
                Description = "f(x) = x^4 + x^3 + x^2 + x^1"
            };
            defaultAllowedInputVariables = new List <string> { "x" };
            defaultTargetVariable        = "y";

            // Build the placeholder: start from a regular instance and replace all of its parameters.
            var placeholder = new RegressionProblemData();

            placeholder.Parameters.Clear();
            placeholder.Name        = "Empty Regression ProblemData";
            placeholder.Description = "This ProblemData acts as place holder before the correct problem data is loaded.";
            placeholder.isEmpty     = true;

            placeholder.Parameters.Add(new FixedValueParameter <Dataset>(DatasetParameterName, "", new Dataset()));
            placeholder.Parameters.Add(new FixedValueParameter <ReadOnlyCheckedItemList <StringValue> >(InputVariablesParameterName, ""));

            // Training and test partition both get a read-only [0, 0] range, in that order.
            foreach (var partitionParameterName in new[] { TrainingPartitionParameterName, TestPartitionParameterName })
            {
                placeholder.Parameters.Add(new FixedValueParameter <IntRange>(partitionParameterName, "", (IntRange) new IntRange(0, 0).AsReadOnly()));
            }

            placeholder.Parameters.Add(new ConstrainedValueParameter <StringValue>(TargetVariableParameterName, new ItemSet <StringValue>()));

            emptyProblemData = placeholder;
        }
    /// <summary>
    /// Imports regression problem data from a table file. The last double variable of the parsed
    /// dataset becomes the target, the first two thirds of the rows (rounded down) form the
    /// training partition and the remaining rows form the test partition.
    /// </summary>
    /// <param name="path">Path of the file that is handed to the <c>TableFileParser</c>.</param>
    /// <returns>The problem data, named after the file name of <paramref name="path"/>.</returns>
    public override IRegressionProblemData ImportData(string path) {
      TableFileParser csvFileParser = new TableFileParser();
      csvFileParser.Parse(path, csvFileParser.AreColumnNamesInFirstLine(path));

      Dataset dataset = new Dataset(csvFileParser.VariableNames, csvFileParser.Values);
      // NOTE(review): Last() throws when the file contains no double variable — confirm callers expect that.
      string targetVar = dataset.DoubleVariables.Last();

      // Number of leading rows used for training. It is also the partition boundary: partition
      // ends are one past the last index (the test partition ends at csvFileParser.Rows).
      // Using the count instead of the last index keeps every row that is analysed below inside
      // the training partition, and avoids calling Last() on an empty range for files with
      // fewer than two rows.
      int trainingPartEnd = (csvFileParser.Rows * 2) / 3;
      var trainingIndizes = Enumerable.Range(0, trainingPartEnd);

      // turn off input variables that are constant in the training partition
      var allowedInputVars = new List<string>();
      if (trainingPartEnd >= 2) {
        foreach (var variableName in dataset.DoubleVariables) {
          // cheap name comparison first, so the target's values are never scanned
          if (variableName != targetVar &&
            dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0) {
            allowedInputVars.Add(variableName);
          }
        }
      } else {
        // too few training rows to judge whether a variable is constant: allow all non-target variables
        allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(targetVar)));
      }

      IRegressionProblemData regressionData = new RegressionProblemData(dataset, allowedInputVars, targetVar);

      regressionData.TrainingPartition.Start = 0;
      regressionData.TrainingPartition.End = trainingPartEnd;
      regressionData.TestPartition.Start = trainingPartEnd;
      regressionData.TestPartition.End = csvFileParser.Rows;

      regressionData.Name = Path.GetFileName(path);

      return regressionData;
    }
 /// <summary>
 /// Creates new regression problem data over <c>ExportedDataset</c>. The target variable of
 /// <paramref name="oldProblemData"/> is kept when <c>context.Data</c> still contains a variable of
 /// that name; otherwise the first variable name of <c>context.Data</c> is used.
 /// </summary>
 /// <param name="oldProblemData">Problem data whose target variable is carried over if possible.</param>
 /// <returns>A <c>RegressionProblemData</c> built with <c>Transformations</c>.</returns>
 private IDataAnalysisProblemData CreateRegressionData(RegressionProblemData oldProblemData) {
   var previousTarget = oldProblemData.TargetVariable;
   var availableNames = context.Data.VariableNames;

   // fall back to the first available variable when the old target no longer exists
   var target = availableNames.Contains(previousTarget)
     ? previousTarget
     : availableNames.First();

   return new RegressionProblemData(ExportedDataset, GetDoubleInputVariables(target), target, Transformations);
 }
 /// <summary>
 /// Builds regression problem data over <paramref name="dataset"/> using this instance's allowed
 /// input variables and target variable, then copies this instance's name, description and
 /// training/test partition bounds onto the result.
 /// </summary>
 /// <param name="dataset">Dataset the new problem data is created for.</param>
 /// <returns>The configured problem data.</returns>
 public IRegressionProblemData GenerateRegressionData(Dataset dataset) {
   var result = new RegressionProblemData(dataset, AllowedInputVariables, TargetVariable) {
     Name = this.Name,
     Description = this.Description
   };

   var training = result.TrainingPartition;
   training.Start = TrainingPartitionStart;
   training.End = TrainingPartitionEnd;

   var test = result.TestPartition;
   test.Start = TestPartitionStart;
   test.End = TestPartitionEnd;

   return result;
 }
        /// <summary>
        /// Checks that <paramref name="problemData"/> is a non-null <c>RegressionProblemData</c> and
        /// then forwards it to the base implementation.
        /// </summary>
        /// <param name="problemData">The problem data to adjust.</param>
        /// <exception cref="ArgumentNullException"><paramref name="problemData"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="problemData"/> is not a <c>RegressionProblemData</c>.</exception>
        public override void AdjustProblemDataProperties(IDataAnalysisProblemData problemData)
        {
            if (problemData == null)
            {
                throw new ArgumentNullException("problemData", "The provided problemData is null.");
            }

            bool isRegressionData = problemData is RegressionProblemData;
            if (!isRegressionData)
            {
                // report the actual type so the caller can see what was passed
                var providedTypeName = problemData.GetType().GetPrettyName();
                throw new ArgumentException("The problem data is not a regression problem data. Instead a " + providedTypeName + " was provided.", "problemData");
            }

            base.AdjustProblemDataProperties(problemData);
        }
    /// <summary>
    /// Type initializer: assigns the default benchmark dataset, default input/target variable names,
    /// and builds the shared "empty" placeholder instance stored in <c>emptyProblemData</c>.
    /// </summary>
    static RegressionProblemData() {
      // NOTE(review): the parameterless constructor called below presumably reads these
      // defaults, so they are assigned first — confirm before reordering.
      var benchmark = new Dataset(new[] { "y", "x" }, kozaF1);
      benchmark.Name = "Fourth-order Polynomial Function Benchmark Dataset";
      benchmark.Description = "f(x) = x^4 + x^3 + x^2 + x^1";
      defaultDataset = benchmark;

      defaultAllowedInputVariables = new List<string> { "x" };
      defaultTargetVariable = "y";

      // Build the placeholder: start from a regular instance and replace all of its parameters.
      var placeholder = new RegressionProblemData();
      placeholder.Parameters.Clear();
      placeholder.Name = "Empty Regression ProblemData";
      placeholder.Description = "This ProblemData acts as place holder before the correct problem data is loaded.";
      placeholder.isEmpty = true;

      var parameters = placeholder.Parameters;
      parameters.Add(new FixedValueParameter<Dataset>(DatasetParameterName, "", new Dataset()));
      parameters.Add(new FixedValueParameter<ReadOnlyCheckedItemList<StringValue>>(InputVariablesParameterName, ""));
      // Training and test partition both get a read-only [0, 0] range, in that order.
      foreach (var partitionParameterName in new[] { TrainingPartitionParameterName, TestPartitionParameterName }) {
        parameters.Add(new FixedValueParameter<IntRange>(partitionParameterName, "", (IntRange)new IntRange(0, 0).AsReadOnly()));
      }
      parameters.Add(new ConstrainedValueParameter<StringValue>(TargetVariableParameterName, new ItemSet<StringValue>()));

      emptyProblemData = placeholder;
    }
        /// <summary>
        /// Reacts to a change of <c>ProblemData</c>: clears the three evaluation caches, pushes
        /// problem data to every solution in <c>RegressionSolutions</c>, copies the new partition
        /// bounds into all stored training/test partitions and finally calls the base implementation.
        /// </summary>
        protected override void OnProblemDataChanged()
        {
            trainingEvaluationCache.Clear();
            testEvaluationCache.Clear();
            evaluationCache.Clear();

            var sourceTraining = ProblemData.TrainingPartition;
            var sourceTest     = ProblemData.TestPartition;

            // Plain regression problem data with the same dataset, inputs, target and partitions;
            // handed to every solution that is not itself an ensemble solution.
            IRegressionProblemData plainProblemData = new RegressionProblemData(ProblemData.Dataset,
                                                                                ProblemData.AllowedInputVariables,
                                                                                ProblemData.TargetVariable);

            plainProblemData.TrainingPartition.Start = sourceTraining.Start;
            plainProblemData.TrainingPartition.End   = sourceTraining.End;
            plainProblemData.TestPartition.Start     = sourceTest.Start;
            plainProblemData.TestPartition.End       = sourceTest.End;

            foreach (var solution in RegressionSolutions)
            {
                if (!(solution is RegressionEnsembleSolution))
                {
                    solution.ProblemData = plainProblemData;
                    continue;
                }

                // nested ensemble solutions receive this object's problem data unchanged
                solution.ProblemData = ProblemData;
            }

            foreach (var storedPartition in trainingPartitions.Values)
            {
                storedPartition.Start = sourceTraining.Start;
                storedPartition.End   = sourceTraining.End;
            }

            foreach (var storedPartition in testPartitions.Values)
            {
                storedPartition.Start = sourceTest.Start;
                storedPartition.End   = sourceTest.End;
            }

            base.OnProblemDataChanged();
        }
 /// <summary>
 /// Cloning constructor: forwards <paramref name="original"/> and <paramref name="cloner"/> to the
 /// base constructor and then calls <c>RegisterParameterEvents</c> on the new instance.
 /// </summary>
 /// <param name="original">Instance that is being cloned.</param>
 /// <param name="cloner">Cloner passed through to the base constructor.</param>
 protected RegressionProblemData(RegressionProblemData original, Cloner cloner)
   : base(original, cloner) {
   // NOTE(review): presumably event subscriptions are not carried over by cloning and must be re-registered — confirm.
   RegisterParameterEvents();
 }
    /// <summary>
    /// Runs the boosting loop. In every iteration the configured regression algorithm is trained
    /// on a sample of the training rows against the current residuals, the returned model's
    /// predictions (scaled by <c>Nu</c>) are added to the running predictions, and the squared
    /// Pearson correlation for training and test rows is recorded. After the loop, solutions are
    /// added to <c>Results</c> when <c>CreateSolution</c> is set. The algorithm is always reset
    /// via <c>Prepare(true)</c> on exit.
    /// </summary>
    /// <param name="cancellationToken">Checked at the start of every iteration.</param>
    /// <exception cref="NotSupportedException">Thrown when <c>TrySetProblemData</c> returns false for the configured algorithm.</exception>
    protected override void Run(CancellationToken cancellationToken) {
      // Set up the algorithm
      if (SetSeedRandomly) Seed = new System.Random().Next();
      var rand = new MersenneTwister((uint)Seed);

      // Set up the results display
      var iterations = new IntValue(0);
      Results.Add(new Result("Iterations", iterations));

      var table = new DataTable("Qualities");
      table.Rows.Add(new DataRow("R² (train)"));
      table.Rows.Add(new DataRow("R² (test)"));
      Results.Add(new Result("Qualities", table));
      var curLoss = new DoubleValue();
      var curTestLoss = new DoubleValue();
      Results.Add(new Result("R² (train)", curLoss));
      Results.Add(new Result("R² (test)", curTestLoss));
      var runCollection = new RunCollection();
      if (StoreRuns)
        Results.Add(new Result("Runs", runCollection));

      // init
      var problemData = Problem.ProblemData;
      var targetVarName = problemData.TargetVariable;
      // materialized once: the sequence is consumed twice by the dataset constructor call below
      var activeVariables = problemData.AllowedInputVariables.Concat(new string[] { problemData.TargetVariable }).ToArray();
      var modifiableDataset = new ModifiableDataset(
        activeVariables,
        activeVariables.Select(v => problemData.Dataset.GetDoubleValues(v).ToList()));

      var trainingRows = problemData.TrainingIndices;
      var testRows = problemData.TestIndices;
      // accumulated (scaled) predictions of all models found so far
      var yPred = new double[trainingRows.Count()];
      var yPredTest = new double[testRows.Count()];
      // y / yTest: original targets; curY / curYTest: current targets (residuals) for the next model
      var y = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices).ToArray();
      var curY = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices).ToArray();

      var yTest = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TestIndices).ToArray();
      var curYTest = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TestIndices).ToArray();
      var nu = Nu;
      // number of input variables / training rows sampled per iteration (fractions M and R, rounded up)
      var mVars = (int)Math.Ceiling(M * problemData.AllowedInputVariables.Count());
      var rRows = (int)Math.Ceiling(R * problemData.TrainingIndices.Count());
      var alg = RegressionAlgorithm;
      List<IRegressionModel> models = new List<IRegressionModel>();
      try {

        // Loop until iteration limit reached or canceled.
        for (int i = 0; i < Iterations; i++) {
          cancellationToken.ThrowIfCancellationRequested();

          // replace the target column with the current residuals
          modifiableDataset.RemoveVariable(targetVarName);
          modifiableDataset.AddVariable(targetVarName, curY.Concat(curYTest));

          SampleTrainingData(rand, modifiableDataset, rRows, problemData.Dataset, curY, problemData.TargetVariable, problemData.TrainingIndices); // all training indices from the original problem data are allowed 
          var modifiableProblemData = new RegressionProblemData(modifiableDataset,
            problemData.AllowedInputVariables.SampleRandomWithoutRepetition(rand, mVars),
            problemData.TargetVariable);
          modifiableProblemData.TrainingPartition.Start = 0;
          modifiableProblemData.TrainingPartition.End = rRows;
          modifiableProblemData.TestPartition.Start = problemData.TestPartition.Start;
          modifiableProblemData.TestPartition.End = problemData.TestPartition.End;

          if (!TrySetProblemData(alg, modifiableProblemData))
            throw new NotSupportedException("The algorithm cannot be used with GBM.");

          IRegressionModel model;
          IRun run;

          // try to find a model. The algorithm might fail to produce a model. In this case we just retry until the iterations are exhausted
          if (TryExecute(alg, rand.Next(), RegressionAlgorithmResult, out model, out run)) {
            int row = 0;
            // update predictions for training and test
            // update new targets (in the case of squared error loss we simply use negative residuals)
            foreach (var pred in model.GetEstimatedValues(problemData.Dataset, trainingRows)) {
              yPred[row] = yPred[row] + nu * pred;
              curY[row] = y[row] - yPred[row];
              row++;
            }
            row = 0;
            foreach (var pred in model.GetEstimatedValues(problemData.Dataset, testRows)) {
              yPredTest[row] = yPredTest[row] + nu * pred;
              curYTest[row] = yTest[row] - yPredTest[row];
              row++;
            }
            // determine quality
            // Each calculation gets its own error flag. With a single shared flag the status of
            // the test calculation overwrote the training status, so the training R² was judged
            // by the wrong result.
            OnlineCalculatorError trainError;
            OnlineCalculatorError testError;
            var trainR = OnlinePearsonsRCalculator.Calculate(yPred, y, out trainError);
            var testR = OnlinePearsonsRCalculator.Calculate(yPredTest, yTest, out testError);

            // iteration results
            curLoss.Value = trainError == OnlineCalculatorError.None ? trainR * trainR : 0.0;
            curTestLoss.Value = testError == OnlineCalculatorError.None ? testR * testR : 0.0;

            models.Add(model);
          }

          // NOTE(review): run is also stored when TryExecute returned false — confirm it is non-null in that case.
          if (StoreRuns)
            runCollection.Add(run);
          // when no model was found the previous quality values are repeated in the table
          table.Rows["R² (train)"].Values.Add(curLoss.Value);
          table.Rows["R² (test)"].Values.Add(curTestLoss.Value);
          iterations.Value = i + 1;
        }

        // produce solution 
        if (CreateSolution) {
          // when all our models are symbolic models we can easily combine them to a single model
          if (models.All(m => m is ISymbolicRegressionModel)) {
            Results.Add(new Result("Solution", CreateSymbolicSolution(models, Nu, (IRegressionProblemData)problemData.Clone())));
          }
          // just produce an ensemble solution for now (TODO: correct scaling or linear regression for ensemble model weights)

          var ensembleSolution = CreateEnsembleSolution(models, (IRegressionProblemData)problemData.Clone());
          Results.Add(new Result("EnsembleSolution", ensembleSolution));
        }
      }
      finally {
        // reset everything
        alg.Prepare(true);
      }
    }
// Example no. 10
// 0
    /// <summary>
    /// Trains a gradient boosted trees model for a single iteration on the given data, using all
    /// rows for training and an empty test partition, and writes the second model of the resulting
    /// ensemble to the console followed by an empty line.
    /// </summary>
    /// <param name="xy">Data matrix; its first dimension is used as the number of rows.</param>
    /// <param name="allVariables">Variable names; the first one is the target, the rest are inputs.</param>
    /// <param name="maxSize">Passed through to <c>TrainGbm</c>.</param>
    private void BuildTree(double[,] xy, string[] allVariables, int maxSize) {
      int rowCount = xy.GetLength(0);
      var inputNames = allVariables.Skip(1);
      var data = new Dataset(allVariables, xy);
      var targetName = allVariables.First();
      var problemData = new RegressionProblemData(data, inputNames, targetName);

      // train on every row: [0, rowCount)
      var training = problemData.TrainingPartition;
      training.Start = 0;
      training.End = rowCount;

      // empty test partition: [rowCount, rowCount)
      var test = problemData.TestPartition;
      test.Start = rowCount;
      test.End = rowCount;

      var solution = GradientBoostedTreesAlgorithmStatic.TrainGbm(problemData, new SquaredErrorLoss(), maxSize, nu: 1, r: 1, m: 1, maxIterations: 1, randSeed: 31415);

      // NOTE(review): the first model of the ensemble is skipped — presumably it is not a tree; confirm.
      var tree = solution.Model.Models.Skip(1).First() as RegressionTreeModel;

      Console.WriteLine(tree.ToString());
      Console.WriteLine();
    }
    /// <summary>
    /// Builds regression problem data from an already parsed table file. Rows are optionally
    /// shuffled, the leading <c>type.TrainingPercentage</c> percent of the rows (at least one row)
    /// become the training partition and the remaining rows the test partition.
    /// </summary>
    /// <param name="path">Source path; only its file name is used, as the name of the result.</param>
    /// <param name="type">Import settings: shuffle flag, training percentage and target variable.</param>
    /// <param name="csvFileParser">Parser that supplies variable names, values and the row count.</param>
    /// <returns>The configured problem data.</returns>
    protected override IRegressionProblemData ImportData(string path, RegressionImportType type, TableFileParser csvFileParser) {
      List<IList> parsedValues = csvFileParser.Values;
      List<IList> values = type.Shuffle ? Shuffle(parsedValues) : parsedValues;
      Dataset dataset = new Dataset(csvFileParser.VariableNames, values);

      // the training partition always contains at least one row
      int trainingPartEnd = (csvFileParser.Rows * type.TrainingPercentage) / 100;
      if (trainingPartEnd <= 0) {
        trainingPartEnd = 1;
      }
      var trainingRows = Enumerable.Range(0, trainingPartEnd);

      // turn off input variables that are constant in the training partition;
      // with fewer than two training rows every non-target double variable is allowed
      IEnumerable<string> selectedInputs = trainingPartEnd >= 2
        ? dataset.DoubleVariables.Where(name => dataset.GetDoubleValues(name, trainingRows).Range() > 0 &&
                                                name != type.TargetVariable)
        : dataset.DoubleVariables.Where(x => !x.Equals(type.TargetVariable));
      var allowedInputVars = selectedInputs.ToList();

      RegressionProblemData regressionData = new RegressionProblemData(dataset, allowedInputVars, type.TargetVariable);

      var training = regressionData.TrainingPartition;
      training.Start = 0;
      training.End = trainingPartEnd;

      var test = regressionData.TestPartition;
      test.Start = trainingPartEnd;
      test.End = csvFileParser.Rows;

      regressionData.Name = Path.GetFileName(path);

      return regressionData;
    }
 /// <summary>
 /// Cloning constructor: forwards <paramref name="original"/> and <paramref name="cloner"/> to the
 /// base constructor and then calls <c>RegisterParameterEvents</c> on the new instance.
 /// </summary>
 /// <param name="original">Instance that is being cloned.</param>
 /// <param name="cloner">Cloner passed through to the base constructor.</param>
 protected RegressionProblemData(RegressionProblemData original, Cloner cloner)
     : base(original, cloner)
 {
     // NOTE(review): presumably event subscriptions are not carried over by cloning and must be re-registered — confirm.
     RegisterParameterEvents();
 }
    /// <summary>
    /// Reacts to a change of <c>ProblemData</c>: clears the three evaluation caches, pushes
    /// problem data to every solution in <c>RegressionSolutions</c>, copies the new partition
    /// bounds into all stored training/test partitions and finally calls the base implementation.
    /// </summary>
    protected override void OnProblemDataChanged() {
      trainingEvaluationCache.Clear();
      testEvaluationCache.Clear();
      evaluationCache.Clear();

      var sourceTraining = ProblemData.TrainingPartition;
      var sourceTest = ProblemData.TestPartition;

      // Plain regression problem data with the same dataset, inputs, target and partitions;
      // handed to every solution that is not itself an ensemble solution.
      IRegressionProblemData plainProblemData = new RegressionProblemData(
        ProblemData.Dataset,
        ProblemData.AllowedInputVariables,
        ProblemData.TargetVariable);
      plainProblemData.TrainingPartition.Start = sourceTraining.Start;
      plainProblemData.TrainingPartition.End = sourceTraining.End;
      plainProblemData.TestPartition.Start = sourceTest.Start;
      plainProblemData.TestPartition.End = sourceTest.End;

      foreach (var solution in RegressionSolutions) {
        if (!(solution is RegressionEnsembleSolution)) {
          solution.ProblemData = plainProblemData;
          continue;
        }
        // nested ensemble solutions receive this object's problem data unchanged
        solution.ProblemData = ProblemData;
      }

      foreach (var storedPartition in trainingPartitions.Values) {
        storedPartition.Start = sourceTraining.Start;
        storedPartition.End = sourceTraining.End;
      }

      foreach (var storedPartition in testPartitions.Values) {
        storedPartition.Start = sourceTest.Start;
        storedPartition.End = sourceTest.End;
      }

      base.OnProblemDataChanged();
    }