/// <summary>
/// Parses a model given as infix expression, optimizes its numeric constants on the training
/// partition and returns the linearly scaled model wrapped in a regression solution.
/// </summary>
/// <param name="problemData">Problem data; <c>TrainingIndices</c> are used for constant optimization and the data is used for scaling.</param>
/// <param name="modelStructure">The model as infix expression.</param>
/// <param name="maxIterations">Maximum number of iterations handed to the constant optimizer.</param>
/// <returns>A solution holding the optimized, scaled model and a clone of <paramref name="problemData"/>.</returns>
/// <exception cref="ArgumentException">The constant optimizer reports that it cannot handle the parsed expression tree.</exception>
public static ISymbolicRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData, string modelStructure, int maxIterations) {
  var parser = new InfixExpressionParser();
  var tree = parser.Parse(modelStructure);

  if (!SymbolicRegressionConstantOptimizationEvaluator.CanOptimizeConstants(tree)) {
    throw new ArgumentException("The optimizer does not support the specified model structure.");
  }

  var interpreter = new SymbolicDataAnalysisExpressionTreeLinearInterpreter();
  // constants are written back into the tree; linear scaling is applied afterwards via Scale()
  SymbolicRegressionConstantOptimizationEvaluator.OptimizeConstants(interpreter, tree, problemData, problemData.TrainingIndices,
    applyLinearScaling: false, maxIterations: maxIterations, updateVariableWeights: false, updateConstantsInTree: true);

  var scaledModel = new SymbolicRegressionModel(problemData.TargetVariable, tree, (ISymbolicDataAnalysisExpressionTreeInterpreter)interpreter.Clone());
  scaledModel.Scale(problemData);

  SymbolicRegressionSolution solution = new SymbolicRegressionSolution(scaledModel, (IRegressionProblemData)problemData.Clone());
  solution.Model.Name = "Regression Model";
  solution.Name = "Regression Solution";
  return solution;
}
/// <summary>
/// Combines all symbolic regression models of an ensemble into a single symbolic model of the
/// form <c>(m1 + m2 + ... + mk) * nu</c>.
/// </summary>
/// <param name="models">Ensemble members; entries that are not <c>ISymbolicRegressionModel</c> are ignored.</param>
/// <param name="nu">Scaling constant applied to the sum of all models.</param>
/// <param name="problemData">Problem data for the resulting solution (passed on as-is, not cloned).</param>
/// <returns>A solution containing the combined model.</returns>
private static ISymbolicRegressionSolution CreateSymbolicSolution(List<IRegressionModel> models, double nu, IRegressionProblemData problemData) {
  // materialize once: the filtered sequence is consumed by Min, Max and the loop below
  var symbModels = models.OfType<ISymbolicRegressionModel>().ToList();
  var lowerLimit = symbModels.Min(m => m.LowerEstimationLimit);
  var upperLimit = symbModels.Max(m => m.UpperEstimationLimit);
  var interpreter = new SymbolicDataAnalysisExpressionTreeLinearInterpreter();

  var progRootNode = new ProgramRootSymbol().CreateTreeNode();
  var startNode = new StartSymbol().CreateTreeNode();
  var addNode = new Addition().CreateTreeNode();
  var mulNode = new Multiplication().CreateTreeNode();
  var scaleNode = (ConstantTreeNode)new Constant().CreateTreeNode();
  // all models are scaled using the same nu
  scaleNode.Value = nu;

  foreach (var m in symbModels) {
    // skip root and start
    var relevantPart = m.SymbolicExpressionTree.Root.GetSubtree(0).GetSubtree(0);
    addNode.AddSubtree((ISymbolicExpressionTreeNode)relevantPart.Clone());
  }

  mulNode.AddSubtree(addNode);
  mulNode.AddSubtree(scaleNode);
  startNode.AddSubtree(mulNode);
  progRootNode.AddSubtree(startNode);

  var t = new SymbolicExpressionTree(progRootNode);
  var combinedModel = new SymbolicRegressionModel(problemData.TargetVariable, t, interpreter, lowerLimit, upperLimit);
  var sol = new SymbolicRegressionSolution(combinedModel, problemData);
  return sol;
}
/// <summary>
/// Builds a symbolic regression solution from a coefficient vector of a linear model.
/// The vector is read in this order: one coefficient per factor-variable value, then one per
/// double variable; its last element is used as the constant. Terms whose coefficient is
/// almost zero are left out of the tree.
/// </summary>
/// <param name="coeff">Coefficient vector laid out as described above.</param>
/// <param name="problemData">Problem data providing the dataset and the allowed input variables.</param>
/// <returns>A solution holding the symbolic model and a clone of <paramref name="problemData"/>.</returns>
public static IRegressionSolution CreateSymbolicSolution(double[] coeff, IRegressionProblemData problemData) {
  var dataset = problemData.Dataset;
  var inputNames = problemData.AllowedInputVariables.ToArray();
  var numericInputs = inputNames.Where(dataset.VariableHasType<double>);
  var factorInputs = inputNames.Where(dataset.VariableHasType<string>);
  // must consider all factor values (in train and test set)
  var factorLevels = dataset.GetFactorVariableValues(factorInputs, Enumerable.Range(0, dataset.Rows));

  var keptFactors = new List<KeyValuePair<string, IEnumerable<string>>>();
  var keptFactorCoeff = new List<double>();
  var keptNumericInputs = new List<string>();
  var keptNumericCoeff = new List<double>();

  // running position in coeff, shared by both passes
  int pos = 0;

  // pass 1: factor variable / value combinations with non-zero coefficient
  foreach (var factor in factorLevels) {
    var keptLevels = new List<string>();
    foreach (var level in factor.Value) {
      double c = coeff[pos];
      pos++;
      if (c.IsAlmost(0.0)) {
        continue;
      }
      keptLevels.Add(level);
      keptFactorCoeff.Add(c);
    }
    if (keptLevels.Count > 0) {
      keptFactors.Add(new KeyValuePair<string, IEnumerable<string>>(factor.Key, keptLevels));
    }
  }

  // pass 2: double variables with non-zero coefficient
  foreach (var name in numericInputs) {
    double c = coeff[pos];
    pos++;
    if (c.IsAlmost(0.0)) {
      continue;
    }
    keptNumericInputs.Add(name);
    keptNumericCoeff.Add(c);
  }

  var tree = LinearModelToTreeConverter.CreateTree(
    keptFactors, keptFactorCoeff.ToArray(),
    keptNumericInputs.ToArray(), keptNumericCoeff.ToArray(),
    coeff.Last());

  var model = new SymbolicRegressionModel(problemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeInterpreter());
  var result = new SymbolicRegressionSolution(model, (IRegressionProblemData)problemData.Clone());
  result.Model.Name = "Elastic-net Linear Regression Model";
  result.Name = "Elastic-net Linear Regression Solution";
  return result;
}
/// <summary>
/// Click handler: back-transforms the model of the current content using the transformations
/// stored in its problem data and shows the resulting solution in the main form.
/// </summary>
private void transformModelButton_Click(object sender, EventArgs e) {
  var backtransformator = new SymbolicExpressionTreeBacktransformator(new TransformationToSymbolicTreeMapper());

  var appliedTransformations = Content.ProblemData.Transformations;
  var targetVariableName = Content.ProblemData.TargetVariable;
  var backtransformedModel = (ISymbolicRegressionModel)backtransformator.Backtransform(Content.Model, appliedTransformations, targetVariableName);

  // the new solution gets its own copy of the problem data
  var clonedProblemData = (IRegressionProblemData)Content.ProblemData.Clone();
  MainFormManager.MainForm.ShowContent(new SymbolicRegressionSolution(backtransformedModel, clonedProblemData));
}
/// <summary>
/// Handles the click on the transform-model button. The current model is back-transformed with
/// the problem data's transformations for the target variable, wrapped into a new solution
/// (with cloned problem data) and displayed via the main form.
/// </summary>
private void transformModelButton_Click(object sender, EventArgs e) {
  var treeMapper = new TransformationToSymbolicTreeMapper();
  var backtransformator = new SymbolicExpressionTreeBacktransformator(treeMapper);

  var transformations = Content.ProblemData.Transformations;
  var target = Content.ProblemData.TargetVariable;

  var model = (ISymbolicRegressionModel)backtransformator.Backtransform(Content.Model, transformations, target);
  var solution = new SymbolicRegressionSolution(model, (IRegressionProblemData)Content.ProblemData.Clone());

  MainFormManager.MainForm.ShowContent(solution);
}
/// <summary>
/// Fits a linear regression model on the training partition using ALGLIB and converts it into
/// a symbolic regression solution. String (factor) variables are encoded through
/// <c>dataset.ToArray(factorVariables, rows)</c>; their columns precede the double variables.
/// </summary>
/// <param name="problemData">Problem data providing dataset, target, allowed inputs and training rows.</param>
/// <param name="rmsError">Receives <c>rmserror</c> of the ALGLIB report.</param>
/// <param name="cvRmsError">Receives <c>cvrmserror</c> of the ALGLIB report.</param>
/// <returns>A solution holding the linear model as expression tree and a clone of <paramref name="problemData"/>.</returns>
/// <exception cref="NotSupportedException">The input matrix contains NaN or infinity.</exception>
/// <exception cref="ArgumentException">ALGLIB returned a code other than 1 from <c>lrbuild</c>.</exception>
public static ISymbolicRegressionSolution CreateLinearRegressionSolution(IRegressionProblemData problemData, out double rmsError, out double cvRmsError) {
  var dataset = problemData.Dataset;
  string targetVariable = problemData.TargetVariable;
  IEnumerable<string> allowedInputVariables = problemData.AllowedInputVariables;
  IEnumerable<int> rows = problemData.TrainingIndices;

  // materialized once: used for the matrix columns, the coefficient count and the tree
  string[] doubleVariables = allowedInputVariables.Where(dataset.VariableHasType<double>).ToArray();
  var factorVariableNames = allowedInputVariables.Where(dataset.VariableHasType<string>);
  var factorVariables = dataset.GetFactorVariableValues(factorVariableNames, rows);

  double[,] binaryMatrix = dataset.ToArray(factorVariables, rows);
  // target variable is appended as the last column
  double[,] doubleVarMatrix = dataset.ToArray(doubleVariables.Concat(new string[] { targetVariable }), rows);
  var inputMatrix = binaryMatrix.HorzCat(doubleVarMatrix);

  if (inputMatrix.Cast<double>().Any(x => double.IsNaN(x) || double.IsInfinity(x))) {
    throw new NotSupportedException("Linear regression does not support NaN or infinity values in the input dataset.");
  }

  int nRows = inputMatrix.GetLength(0);
  int nFeatures = inputMatrix.GetLength(1) - 1;

  // all of these are assigned by the out arguments below, no pre-initialization needed
  alglib.linearmodel lm;
  alglib.lrreport ar;
  int retVal;
  alglib.lrbuild(inputMatrix, nRows, nFeatures, out retVal, out lm, out ar);
  if (retVal != 1) {
    throw new ArgumentException("Error in calculation of linear regression solution");
  }
  rmsError = ar.rmserror;
  cvRmsError = ar.cvrmserror;

  double[] coefficients; // last coefficient is for the constant
  alglib.lrunpack(lm, out coefficients, out nFeatures);

  int nFactorCoeff = binaryMatrix.GetLength(1);
  int nVarCoeff = doubleVariables.Length;
  var tree = LinearModelToTreeConverter.CreateTree(
    factorVariables, coefficients.Take(nFactorCoeff).ToArray(),
    doubleVariables, coefficients.Skip(nFactorCoeff).Take(nVarCoeff).ToArray(),
    @const: coefficients[nFeatures]);

  SymbolicRegressionSolution solution = new SymbolicRegressionSolution(
    new SymbolicRegressionModel(problemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeLinearInterpreter()),
    (IRegressionProblemData)problemData.Clone());
  solution.Model.Name = "Linear Regression Model";
  solution.Name = "Linear Regression Solution";
  return solution;
}
/// <summary>
/// Fits a linear regression model with ALGLIB on the matrix produced by <c>PrepareData</c> and
/// converts it into a symbolic regression solution. Coefficients are read as factor-variable
/// coefficients first, then double-variable coefficients, then the constant.
/// </summary>
/// <param name="problemData">Problem data handed to <c>PrepareData</c> and cloned into the solution.</param>
/// <param name="rmsError">Receives <c>rmserror</c> of the ALGLIB report.</param>
/// <param name="cvRmsError">Receives <c>cvrmserror</c> of the ALGLIB report.</param>
/// <returns>A solution holding the linear model as expression tree and a clone of <paramref name="problemData"/>.</returns>
/// <exception cref="ArgumentException">ALGLIB returned a code other than 1 from <c>lrbuild</c>.</exception>
public static ISymbolicRegressionSolution CreateLinearRegressionSolution(IRegressionProblemData problemData, out double rmsError, out double cvRmsError) {
  IEnumerable<string> doubleVariables;
  IEnumerable<KeyValuePair<string, IEnumerable<string>>> factorVariables;
  double[,] inputMatrix;
  PrepareData(problemData, out inputMatrix, out doubleVariables, out factorVariables);

  int nRows = inputMatrix.GetLength(0);
  int nFeatures = inputMatrix.GetLength(1) - 1;

  // all of these are assigned by the out arguments below, no pre-initialization needed
  alglib.linearmodel lm;
  alglib.lrreport ar;
  int retVal;
  alglib.lrbuild(inputMatrix, nRows, nFeatures, out retVal, out lm, out ar);
  if (retVal != 1) {
    throw new ArgumentException("Error in calculation of linear regression solution");
  }
  rmsError = ar.rmserror;
  cvRmsError = ar.cvrmserror;

  double[] coefficients; // last coefficient is for the constant
  alglib.lrunpack(lm, out coefficients, out nFeatures);

  // enumerate the variable names only once; count and tree must agree
  string[] doubleVariableNames = doubleVariables.ToArray();
  int nFactorCoeff = factorVariables.Sum(kvp => kvp.Value.Count());
  int nVarCoeff = doubleVariableNames.Length;
  var tree = LinearModelToTreeConverter.CreateTree(
    factorVariables, coefficients.Take(nFactorCoeff).ToArray(),
    doubleVariableNames, coefficients.Skip(nFactorCoeff).Take(nVarCoeff).ToArray(),
    @const: coefficients[nFeatures]);

  SymbolicRegressionSolution solution = new SymbolicRegressionSolution(
    new SymbolicRegressionModel(problemData.TargetVariable, tree, new SymbolicDataAnalysisExpressionTreeLinearInterpreter()),
    (IRegressionProblemData)problemData.Clone());
  solution.Model.Name = "Linear Regression Model";
  solution.Name = "Linear Regression Solution";
  return solution;
}
/// <summary>
/// Fits a model to the data by optimizing the numeric constants.
/// Model is specified as infix expression containing variable names and numbers.
/// If a random number generator is specified, every constant c in the expression is replaced by s * c * f before
/// optimization, where f = exp(NormalDistributedRandom.NextDouble(rand, 0, 1)) and s is -1 or +1 with equal
/// probability. Otherwise the user specified constants are used as a starting point.
/// </summary>
/// <param name="problemData">Training and test data</param>
/// <param name="modelStructure">The function as infix expression</param>
/// <param name="maxIterations">Number of constant optimization iterations (using Levenberg-Marquardt algorithm)</param>
/// <param name="applyLinearScaling">Passed on to the constant optimizer; if true the resulting model is additionally scaled on the problem data.</param>
/// <param name="rand">Optional random number generator for random initialization of numeric constants.</param>
/// <returns>A solution holding the fitted model and a clone of <paramref name="problemData"/>.</returns>
/// <exception cref="ArgumentException">
/// A factor variable node in the expression has a number of weights different from the number of values of
/// that variable, or the constant optimizer does not support the model structure.
/// </exception>
public static ISymbolicRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData, string modelStructure, int maxIterations, bool applyLinearScaling, IRandom rand = null) {
  var parser = new InfixExpressionParser();
  var tree = parser.Parse(modelStructure);

  // parser handles double and string variables equally by creating a VariableTreeNode
  // post-process to replace VariableTreeNodes by FactorVariableTreeNodes for all string variables
  var factorSymbol = new FactorVariable();
  factorSymbol.VariableNames =
    problemData.AllowedInputVariables.Where(name => problemData.Dataset.VariableHasType<string>(name));
  factorSymbol.AllVariableNames = factorSymbol.VariableNames;
  factorSymbol.VariableValues =
    factorSymbol.VariableNames.Select(name =>
      new KeyValuePair<string, Dictionary<string, int>>(name,
        problemData.Dataset.GetReadOnlyStringValues(name).Distinct()
          .Select((n, i) => Tuple.Create(n, i))
          .ToDictionary(tup => tup.Item1, tup => tup.Item2)));

  // iterate over a snapshot because subtrees are replaced while walking the tree
  foreach (var parent in tree.IterateNodesPrefix().ToArray()) {
    for (int i = 0; i < parent.SubtreeCount; i++) {
      var child = parent.GetSubtree(i);
      var varChild = child as VariableTreeNode;
      var factorVarChild = child as FactorVariableTreeNode;

      if (varChild != null && factorSymbol.VariableNames.Contains(varChild.VariableName)) {
        parent.RemoveSubtree(i);
        var factorTreeNode = (FactorVariableTreeNode)factorSymbol.CreateTreeNode();
        factorTreeNode.VariableName = varChild.VariableName;
        // weight = 1.0 for each value
        factorTreeNode.Weights =
          factorTreeNode.Symbol.GetVariableValues(factorTreeNode.VariableName).Select(_ => 1.0).ToArray();
        parent.InsertSubtree(i, factorTreeNode);
      } else if (factorVarChild != null && factorSymbol.VariableNames.Contains(factorVarChild.VariableName)) {
        int expectedWeights = factorSymbol.GetVariableValues(factorVarChild.VariableName).Count();
        if (expectedWeights != factorVarChild.Weights.Length) {
          throw new ArgumentException(
            string.Format("Factor variable {0} needs exactly {1} weights",
              factorVarChild.VariableName,
              expectedWeights));
        }
        // re-create the node from factorSymbol, keeping the weights given by the user
        parent.RemoveSubtree(i);
        var factorTreeNode = (FactorVariableTreeNode)factorSymbol.CreateTreeNode();
        factorTreeNode.VariableName = factorVarChild.VariableName;
        factorTreeNode.Weights = factorVarChild.Weights;
        parent.InsertSubtree(i, factorTreeNode);
      }
    }
  }

  if (!SymbolicRegressionConstantOptimizationEvaluator.CanOptimizeConstants(tree)) {
    throw new ArgumentException("The optimizer does not support the specified model structure.");
  }

  // initialize constants randomly
  if (rand != null) {
    foreach (var node in tree.IterateNodesPrefix().OfType<ConstantTreeNode>()) {
      double f = Math.Exp(NormalDistributedRandom.NextDouble(rand, 0, 1));
      double s = rand.NextDouble() < 0.5 ? -1 : 1;
      node.Value = s * node.Value * f;
    }
  }

  var interpreter = new SymbolicDataAnalysisExpressionTreeLinearInterpreter();
  SymbolicRegressionConstantOptimizationEvaluator.OptimizeConstants(interpreter, tree, problemData, problemData.TrainingIndices,
    applyLinearScaling: applyLinearScaling, maxIterations: maxIterations,
    updateVariableWeights: false, updateConstantsInTree: true);

  var model = new SymbolicRegressionModel(problemData.TargetVariable, tree, (ISymbolicDataAnalysisExpressionTreeInterpreter)interpreter.Clone());
  if (applyLinearScaling) {
    model.Scale(problemData);
  }

  SymbolicRegressionSolution solution = new SymbolicRegressionSolution(model, (IRegressionProblemData)problemData.Clone());
  solution.Model.Name = "Regression Model";
  solution.Name = "Regression Solution";
  return solution;
}
/// <summary>
/// Fits a linear regression model on the training partition using ALGLIB and returns it as a
/// symbolic regression solution: an addition node with one weighted variable node per allowed
/// input variable plus a constant node.
/// </summary>
/// <param name="problemData">Problem data providing dataset, target, allowed inputs and training rows.</param>
/// <param name="rmsError">Receives <c>rmserror</c> of the ALGLIB report.</param>
/// <param name="cvRmsError">Receives <c>cvrmserror</c> of the ALGLIB report.</param>
/// <returns>A solution holding the linear model as expression tree and a clone of <paramref name="problemData"/>.</returns>
/// <exception cref="NotSupportedException">The input matrix contains NaN or infinity.</exception>
/// <exception cref="ArgumentException">ALGLIB returned a code other than 1 from <c>lrbuild</c>.</exception>
public static ISymbolicRegressionSolution CreateLinearRegressionSolution(IRegressionProblemData problemData, out double rmsError, out double cvRmsError) {
  var dataset = problemData.Dataset;
  string targetVariable = problemData.TargetVariable;
  // materialized once so that matrix columns and tree nodes are built from the same sequence
  string[] allowedInputVariables = problemData.AllowedInputVariables.ToArray();
  IEnumerable<int> rows = problemData.TrainingIndices;

  // target variable is appended as the last column
  double[,] inputMatrix = AlglibUtil.PrepareInputMatrix(dataset, allowedInputVariables.Concat(new string[] { targetVariable }), rows);
  if (inputMatrix.Cast<double>().Any(x => double.IsNaN(x) || double.IsInfinity(x))) {
    throw new NotSupportedException("Linear regression does not support NaN or infinity values in the input dataset.");
  }

  int nRows = inputMatrix.GetLength(0);
  int nFeatures = inputMatrix.GetLength(1) - 1;

  // all of these are assigned by the out arguments below, no pre-initialization needed
  alglib.linearmodel lm;
  alglib.lrreport ar;
  int retVal;
  alglib.lrbuild(inputMatrix, nRows, nFeatures, out retVal, out lm, out ar);
  if (retVal != 1) {
    throw new ArgumentException("Error in calculation of linear regression solution");
  }
  rmsError = ar.rmserror;
  cvRmsError = ar.cvrmserror;

  double[] coefficients; // last coefficient is for the constant
  alglib.lrunpack(lm, out coefficients, out nFeatures);

  ISymbolicExpressionTree tree = new SymbolicExpressionTree(new ProgramRootSymbol().CreateTreeNode());
  ISymbolicExpressionTreeNode startNode = new StartSymbol().CreateTreeNode();
  tree.Root.AddSubtree(startNode);
  ISymbolicExpressionTreeNode addition = new Addition().CreateTreeNode();
  startNode.AddSubtree(addition);

  // one weighted variable node per input column, in matrix column order
  int col = 0;
  foreach (string column in allowedInputVariables) {
    VariableTreeNode vNode = (VariableTreeNode)new HeuristicLab.Problems.DataAnalysis.Symbolic.Variable().CreateTreeNode();
    vNode.VariableName = column;
    vNode.Weight = coefficients[col];
    addition.AddSubtree(vNode);
    col++;
  }

  ConstantTreeNode cNode = (ConstantTreeNode)new Constant().CreateTreeNode();
  cNode.Value = coefficients[coefficients.Length - 1];
  addition.AddSubtree(cNode);

  SymbolicRegressionSolution solution = new SymbolicRegressionSolution(
    new SymbolicRegressionModel(tree, new SymbolicDataAnalysisExpressionTreeInterpreter()),
    (IRegressionProblemData)problemData.Clone());
  solution.Model.Name = "Linear Regression Model";
  solution.Name = "Linear Regression Solution";
  return solution;
}