public static double Calculate(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, ISymbolicExpressionTree tree, double lowerEstimationLimit, double upperEstimationLimit, IClassificationProblemData problemData,
                                       IEnumerable <int> rows, bool applyLinearScaling, ISymbolicClassificationModelCreator modelCreator, double normalizedMeanSquaredErrorWeightingFactor, double falseNegativeRateWeightingFactor, double falsePositiveRateWeightingFactor)
        {
            var estimatedValues        = interpreter.GetSymbolicExpressionTreeValues(tree, problemData.Dataset, rows);
            var targetClassValues      = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
            var boundedEstimatedValues = estimatedValues.LimitToRange(lowerEstimationLimit, upperEstimationLimit).ToArray();
            OnlineCalculatorError errorState;
            double nmse;

            // calculate performance measures
            string positiveClassName = problemData.PositiveClass;

            double[]             classValues, thresholds;
            IEnumerable <double> estimatedClassValues = null;
            ISymbolicDiscriminantFunctionClassificationModel m;

            var model = modelCreator.CreateSymbolicClassificationModel(problemData.TargetVariable, tree, interpreter, lowerEstimationLimit, upperEstimationLimit);

            if ((m = model as ISymbolicDiscriminantFunctionClassificationModel) != null)
            {
                m.ThresholdCalculator.Calculate(problemData, boundedEstimatedValues, targetClassValues, out classValues, out thresholds);
                m.SetThresholdsAndClassValues(thresholds, classValues);
                estimatedClassValues = m.GetEstimatedClassValues(boundedEstimatedValues);
            }
            else
            {
                model.RecalculateModelParameters(problemData, rows);
                estimatedClassValues = model.GetEstimatedClassValues(problemData.Dataset, rows);
            }

            var performanceCalculator = new ClassificationPerformanceMeasuresCalculator(positiveClassName, problemData.GetClassValue(positiveClassName));

            performanceCalculator.Calculate(targetClassValues, estimatedClassValues);
            if (performanceCalculator.ErrorState != OnlineCalculatorError.None)
            {
                return double.NaN;
            }
            double falseNegativeRate = 1 - performanceCalculator.TruePositiveRate;
            double falsePositiveRate = performanceCalculator.FalsePositiveRate;

            if (applyLinearScaling)
            {
                throw new NotSupportedException("The Weighted Performance Measures Evaluator does not support linear scaling!");
            }
            nmse = OnlineNormalizedMeanSquaredErrorCalculator.Calculate(targetClassValues, boundedEstimatedValues, out errorState);
            if (errorState != OnlineCalculatorError.None)
            {
                return double.NaN;
            }
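            // weighted objective (lower is better for non-negative weights): regression error (NMSE)
            // plus the weighted false negative and false positive rates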
            return normalizedMeanSquaredErrorWeightingFactor * nmse + falseNegativeRateWeightingFactor * falseNegativeRate + falsePositiveRateWeightingFactor * falsePositiveRate;
        }
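        // Hedged usage sketch (not part of the original source): how the weighted objective above might
        // be evaluated on the training partition. The equal weights are illustrative assumptions; linear
        // scaling must stay disabled because the evaluator above rejects it, and TrainingIndices is
        // assumed to be exposed by the problem data as in other HeuristicLab evaluators.
        public static double CalculateOnTrainingSketch(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter,
                                                       ISymbolicExpressionTree tree,
                                                       IClassificationProblemData problemData,
                                                       ISymbolicClassificationModelCreator modelCreator)
        {
            // unbounded estimation limits, no linear scaling, equal weights for NMSE, FNR and FPR
            return Calculate(interpreter, tree, double.MinValue, double.MaxValue,
                             problemData, problemData.TrainingIndices,
                             false, modelCreator,
                             normalizedMeanSquaredErrorWeightingFactor: 1.0,
                             falseNegativeRateWeightingFactor: 1.0,
                             falsePositiveRateWeightingFactor: 1.0);
        }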
        /// <summary>
        /// Elastic net with squared-error-loss for dense predictor matrix, runs the full path of all lambdas
        /// </summary>
        /// <param name="problemData">Predictor target matrix x and target vector y</param>
        /// <param name="penalty">Penalty for balance between ridge (0.0) and lasso (1.0) regression</param>
        /// <param name="nlam">Maximum number of lambda values (default 100)</param>
        /// <param name="flmin">User control of lambda values (&lt;1.0 => minimum lambda = flmin * (largest lambda value), >= 1.0 => use supplied lambda values</param>
        /// <param name="ulam">User supplied lambda values</param>
        /// <param name="lambda">Output lambda values</param>
        /// <param name="trainNMSE">Vector of normalized mean of squared error (NMSE = Variance(res) / Variance(y)) values on the training set for each set of coefficients along the path</param>
        /// <param name="testNMSE">Vector of normalized mean of squared error (NMSE = Variance(res) / Variance(y)) values on the test set for each set of coefficients along the path</param>
        /// <param name="coeff">Vector of coefficient vectors for each solution along the path</param>
        /// <param name="intercept">Vector of intercepts for each solution along the path</param>
        /// <param name="coeffLowerBound">Optional lower bound for all coefficients</param>
        /// <param name="coeffUpperBound">Optional upper bound for all coefficients</param>
        /// <param name="maxVars">Maximum allowed number of variables in each solution along the path (-1 => all variables are allowed)</param>
        private static void RunElasticNetLinearRegression(IRegressionProblemData problemData, double penalty,
                                                          int nlam, double flmin, double[] ulam, out double[] lambda, out double[] trainNMSE, out double[] testNMSE, out double[,] coeff, out double[] intercept,
                                                          double coeffLowerBound = double.NegativeInfinity, double coeffUpperBound = double.PositiveInfinity,
                                                          int maxVars            = -1
                                                          )
        {
            if (penalty < 0.0 || penalty > 1.0)
            {
                throw new ArgumentException("0 <= penalty <= 1", "penalty");
            }

            double[,] trainX;
            double[,] testX;
            double[] trainY;
            double[] testY;

            PrepareData(problemData, out trainX, out trainY, out testX, out testY);
            var numTrainObs = trainX.GetLength(1);
            var numTestObs  = testX.GetLength(1);
            var numVars     = trainX.GetLength(0);

            int    ka   = 1; // => covariance updating algorithm
            double parm = penalty;

            double[] w  = Enumerable.Repeat(1.0, numTrainObs).ToArray(); // all observations have the same weight
            int[]    jd = new int[1];                                    // jd[0] = 0 => do not exclude any predictor variables
            double[] vp = Enumerable.Repeat(1.0, numVars).ToArray();     // relative penalty factor 1.0 for every predictor variable (all penalized equally)
            double[,] cl = new double[numVars, 2];                       // use the same bounds for all coefficients
            for (int i = 0; i < numVars; i++)
            {
                cl[i, 0] = coeffLowerBound;
                cl[i, 1] = coeffUpperBound;
            }

            int    ne    = maxVars > 0 ? maxVars : numVars;
            int    nx    = numVars;
            double thr   = 1.0e-5; // default value as recommended in glmnet
            int    isd   = 1;      //  => regression on standardized predictor variables
            int    intr  = 1;      // => do include intercept in model
            int    maxit = 100000; // default value as recommended in glmnet
            // outputs
            int lmu = -1;

            double[,] ca;
            int[] ia;
            int[] nin;
            int   nlp  = -99;
            int   jerr = -99;

            double[] trainR2;
            Glmnet.elnet(ka, parm, numTrainObs, numVars, trainX, trainY, w, jd, vp, cl, ne, nx, nlam, flmin, ulam, thr, isd, intr, maxit, out lmu, out intercept, out ca, out ia, out nin, out trainR2, out lambda, out nlp, out jerr);
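            // lmu is the actual number of lambda values (solutions) computed by glmnet;
            // jerr is glmnet's error flag (0 = no error) and is not inspected here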

            trainNMSE = new double[lmu]; // elnet reports R**2 for each solution; NMSE = 1 - R**2
            testNMSE  = new double[lmu];
            coeff     = new double[lmu, numVars];
            for (int solIdx = 0; solIdx < lmu; solIdx++)
            {
                trainNMSE[solIdx] = 1.0 - trainR2[solIdx];

                // uncompress coefficients of solution
                int      selectedNin = nin[solIdx];
                double[] coefficients;
                double[] selectedCa = new double[nx];
                for (int i = 0; i < nx; i++)
                {
                    selectedCa[i] = ca[solIdx, i];
                }

                // apply to test set to calculate test NMSE values for each lambda step
                double[] fn;
                Glmnet.modval(intercept[solIdx], selectedCa, ia, selectedNin, numTestObs, testX, out fn);
                OnlineCalculatorError error;
                var nmse = OnlineNormalizedMeanSquaredErrorCalculator.Calculate(testY, fn, out error);
                if (error != OnlineCalculatorError.None)
                {
                    nmse = double.NaN;
                }
                testNMSE[solIdx] = nmse;

                // uncompress coefficients
                Glmnet.uncomp(numVars, selectedCa, ia, selectedNin, out coefficients);
                for (int i = 0; i < coefficients.Length; i++)
                {
                    coeff[solIdx, i] = coefficients[i];
                }
            }
        }
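        // Hedged usage sketch (not part of the original source): a hypothetical helper showing how
        // RunElasticNetLinearRegression might be called to trace the full lasso path and pick the
        // path point with the lowest test NMSE. The concrete argument values (nlam = 100,
        // flmin = 1e-4, an empty ulam because glmnet derives its own lambda sequence when flmin < 1.0)
        // are illustrative assumptions only.
        private static void ElasticNetPathSketch(IRegressionProblemData problemData)
        {
            double[]  lambda, trainNMSE, testNMSE, intercept;
            double[,] coeff;
            RunElasticNetLinearRegression(problemData, 1.0 /* penalty: pure lasso */,
                                          100 /* nlam */, 1.0e-4 /* flmin */, new double[0] /* ulam (ignored for flmin < 1.0) */,
                                          out lambda, out trainNMSE, out testNMSE, out coeff, out intercept);

            // pick the solution along the path with the lowest test NMSE (NaN entries compare false and are skipped)
            int best = 0;
            for (int i = 1; i < testNMSE.Length; i++)
            {
                if (testNMSE[i] < testNMSE[best]) best = i;
            }
            Console.WriteLine("lambda = {0}, test NMSE = {1}, intercept = {2}", lambda[best], testNMSE[best], intercept[best]);
        }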
Example #3
        public override IOperation Apply()
        {
            var operation   = base.Apply();
            var paretoFront = TrainingBestSolutionsParameter.ActualValue;

            IResult     result;
            ScatterPlot qualityToTreeSize;

            if (!ResultCollection.TryGetValue("Pareto Front Analysis", out result))
            {
                qualityToTreeSize = new ScatterPlot("Quality vs Tree Size", "");
                qualityToTreeSize.VisualProperties.XAxisMinimumAuto = false;
                qualityToTreeSize.VisualProperties.XAxisMaximumAuto = false;
                qualityToTreeSize.VisualProperties.YAxisMinimumAuto = false;
                qualityToTreeSize.VisualProperties.YAxisMaximumAuto = false;

                qualityToTreeSize.VisualProperties.XAxisMinimumFixedValue = 0;
                qualityToTreeSize.VisualProperties.XAxisMaximumFixedValue = MaximumSymbolicExpressionTreeLengthParameter.ActualValue.Value;
                qualityToTreeSize.VisualProperties.YAxisMinimumFixedValue = 0;
                qualityToTreeSize.VisualProperties.YAxisMaximumFixedValue = 2;
                ResultCollection.Add(new Result("Pareto Front Analysis", qualityToTreeSize));
            }
            else
            {
                qualityToTreeSize = (ScatterPlot)result.Value;
            }


            int previousTreeLength = -1;
            var sizeParetoFront    = new LinkedList <ISymbolicRegressionSolution>();

            foreach (var solution in paretoFront.OrderBy(s => s.Model.SymbolicExpressionTree.Length))
            {
                int treeLength = solution.Model.SymbolicExpressionTree.Length;
                if (!sizeParetoFront.Any())
                {
                    sizeParetoFront.AddLast(solution);
                }
                if (solution.TrainingNormalizedMeanSquaredError < sizeParetoFront.Last.Value.TrainingNormalizedMeanSquaredError)
                {
                    if (treeLength == previousTreeLength)
                    {
                        sizeParetoFront.RemoveLast();
                    }
                    sizeParetoFront.AddLast(solution);
                }
                previousTreeLength = treeLength;
            }
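            // sizeParetoFront now approximates the quality vs. tree size Pareto front: solutions are
            // ordered by increasing tree length and each kept solution has a lower training NMSE than
            // the previously kept one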

            qualityToTreeSize.Rows.Clear();
            var trainingRow = new ScatterPlotDataRow("Training NMSE", "", sizeParetoFront.Select(x => new Point2D <double>(x.Model.SymbolicExpressionTree.Length, x.TrainingNormalizedMeanSquaredError, x)));

            trainingRow.VisualProperties.PointSize = 8;
            qualityToTreeSize.Rows.Add(trainingRow);

            if (AnalyzeTestError)
            {
                var testRow = new ScatterPlotDataRow("Test NMSE", "",
                                                     sizeParetoFront.Select(x => new Point2D <double>(x.Model.SymbolicExpressionTree.Length, x.TestNormalizedMeanSquaredError, x)));
                testRow.VisualProperties.PointSize = 8;
                qualityToTreeSize.Rows.Add(testRow);
            }

            var validationPartition = ValidationPartitionParameter.ActualValue;

            if (validationPartition.Size != 0)
            {
                var problemData       = ProblemDataParameter.ActualValue;
                var validationIndizes = Enumerable.Range(validationPartition.Start, validationPartition.Size).ToList();
                var targetValues      = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, validationIndizes).ToList();
                OnlineCalculatorError error;
                var validationRow = new ScatterPlotDataRow("Validation NMSE", "",
                                                           sizeParetoFront.Select(x => new Point2D <double>(x.Model.SymbolicExpressionTree.Length,
                                                                                                            OnlineNormalizedMeanSquaredErrorCalculator.Calculate(targetValues, x.GetEstimatedValues(validationIndizes), out error))));
                validationRow.VisualProperties.PointSize = 7;
                qualityToTreeSize.Rows.Add(validationRow);
            }

            return operation;
        }