Esempio n. 1
0
        /// <summary>
        /// Copies the dataset of <paramref name="problemData"/> into <paramref name="datasetWorksheet"/>.
        /// Worksheet row 1 receives the variable names; each following worksheet row holds one dataset row.
        /// </summary>
        /// <param name="datasetWorksheet">Worksheet whose cells are filled.</param>
        /// <param name="problemData">Problem data that provides the dataset to export.</param>
        protected void WriteDatasetToExcel(ExcelWorksheet datasetWorksheet, IDataAnalysisProblemData problemData)
        {
            // Note: EPPlus becomes dramatically slower when cells are not written row by row
            // (left to right) because of its internal indices, so both loops keep that order.
            IDataset source       = problemData.Dataset;
            var      names        = source.VariableNames.ToList();
            var      numericNames = new HashSet <string>(source.DoubleVariables);

            // header line (worksheet cells are addressed 1-based)
            int headerColumn = 1;
            foreach (var header in names)
            {
                datasetWorksheet.Cells[1, headerColumn].Value = header;
                headerColumn++;
            }

            // data lines start in worksheet row 2
            for (int r = 0; r < source.Rows; r++)
            {
                int sheetRow = r + 2;
                for (int c = 0; c < names.Count; c++)
                {
                    var name        = names[c];
                    int sheetColumn = c + 1;
                    if (!numericNames.Contains(name))
                    {
                        // non-double columns are read through the generic accessor
                        datasetWorksheet.Cells[sheetRow, sheetColumn].Value = source.GetValue(r, c);
                    }
                    else
                    {
                        datasetWorksheet.Cells[sheetRow, sheetColumn].Value = source.GetDoubleValue(name, r);
                    }
                }
            }
        }
        /// <summary>
        /// Maps the component values of the given rows back into variable space:
        /// each output value is the sum over all components of (component value * Matrix[variable, component]),
        /// afterwards multiplied by the variable's deviation and shifted by its mean.
        /// </summary>
        /// <param name="dataset">Dataset that contains the columns named in ComponentNames.</param>
        /// <param name="rows">Row indices of <paramref name="dataset"/> to revert.</param>
        /// <returns>A matrix with one row per requested row and one column per entry of VariableNames.</returns>
        public double[,] RevertData(IDataset dataset, IEnumerable <int> rows)
        {
            var rowIds         = rows.ToArray();
            var componentNames = ComponentNames;
            int nVariables     = VariableNames.Length;
            var reverted       = new double[rowIds.Length, nVariables];

            for (var r = 0; r < rowIds.Length; r++)
            {
                // accumulate the weighted component contributions for this row
                for (var c = 0; c < componentNames.Length; c++)
                {
                    var componentValue = dataset.GetDoubleValue(componentNames[c], rowIds[r]);
                    for (var v = 0; v < nVariables; v++)
                    {
                        reverted[r, v] += componentValue * Matrix[v, c];
                    }
                }

                // undo the scaling and shifting once the row is fully accumulated
                for (var v = 0; v < nVariables; v++)
                {
                    reverted[r, v] *= Deviations[v];
                    reverted[r, v] += Means[v];
                }
            }

            return(reverted);
        }
Esempio n. 3
0
    /// <summary>
    /// Determines for every row the center with the smallest squared Euclidean distance.
    /// </summary>
    /// <param name="centers">Center coordinates; each array must have one entry per allowed input variable.</param>
    /// <param name="dataset">Dataset the row values are read from.</param>
    /// <param name="allowedInputVariables">Names of the variables that span the distance space.</param>
    /// <param name="rows">Row indices of <paramref name="dataset"/> to assign.</param>
    /// <returns>
    /// One entry per row: the 1-based position of the closest center in <paramref name="centers"/>,
    /// or 0 when no center produced a distance below <see cref="double.MaxValue"/> (e.g. no centers at all).
    /// </returns>
    /// <exception cref="ArgumentException">A center does not have one coordinate per allowed input variable.</exception>
    public static IEnumerable<int> FindClosestCenters(IEnumerable<double[]> centers, IDataset dataset, IEnumerable<string> allowedInputVariables, IEnumerable<int> rows) {
      // Materialize both sequences once: they are walked again for every center (and every row),
      // which would re-evaluate lazy sequences and could yield inconsistent lengths.
      int[] rowArray = rows.ToArray();
      string[] variables = allowedInputVariables.ToArray();
      int nRows = rowArray.Length;
      int nCols = variables.Length;
      int[] closestCenter = new int[nRows];
      double[] bestCenterDistance = Enumerable.Repeat(double.MaxValue, nRows).ToArray();
      int centerIndex = 1; // reported center indices are 1-based

      foreach (double[] center in centers) {
        if (nCols != center.Length)
          throw new ArgumentException("Every center must have exactly one coordinate per allowed input variable.", "centers");
        for (int rowIndex = 0; rowIndex < nRows; rowIndex++) {
          // squared Euclidean distance of the point to the center
          double centerDistance = 0;
          for (int col = 0; col < nCols; col++) {
            double d = center[col] - dataset.GetDoubleValue(variables[col], rowArray[rowIndex]);
            centerDistance += d * d;
            // the partial sum can only grow, so stop as soon as this center cannot win anymore
            if (centerDistance > bestCenterDistance[rowIndex]) break;
          }
          if (centerDistance < bestCenterDistance[rowIndex]) {
            bestCenterDistance[rowIndex] = centerDistance;
            closestCenter[rowIndex] = centerIndex;
          }
        }
        centerIndex++;
      }
      return closestCenter;
    }
Esempio n. 4
0
 /// <summary>
 /// Estimates the value for a single row: inner nodes delegate to the left child when the
 /// split attribute value is less than or equal to the split value and to the right child
 /// otherwise; leaves evaluate their model.
 /// </summary>
 /// <exception cref="NotSupportedException">The node is a leaf without a model.</exception>
 private double GetEstimatedValue(IDataset dataset, int row)
 {
     if (IsLeaf)
     {
         if (Model == null)
         {
             throw new NotSupportedException("The model has not been built correctly");
         }
         // the model works on row sequences, so wrap the single row and take its only estimate
         var estimates = Model.GetEstimatedValues(dataset, new[] { row });
         return(estimates.First());
     }

     var goesLeft = dataset.GetDoubleValue(SplitAttribute, row) <= SplitValue;
     var child    = goesLeft ? Left : Right;
     return(child.GetEstimatedValue(dataset, row));
 }
Esempio n. 5
0
    /// <summary>
    /// Calculates the sum of squared Euclidean distances between every point and the mean of
    /// the cluster it was assigned to by <paramref name="model"/>.
    /// </summary>
    /// <param name="model">Clustering model that supplies the cluster values and the allowed input variables.</param>
    /// <param name="dataset">Dataset the point coordinates are read from.</param>
    /// <param name="rows">Row indices of <paramref name="dataset"/> to evaluate.</param>
    /// <returns>The intra-cluster sum of squares accumulated over all clusters (0 when there are no rows).</returns>
    public static double CalculateIntraClusterSumOfSquares(KMeansClusteringModel model, IDataset dataset, IEnumerable<int> rows) {
      // Materialize rows once so that the cluster values and the collected points are guaranteed
      // to refer to the same rows even when a lazily evaluated sequence is passed in.
      int[] rowArray = rows.ToArray();
      List<int> clusterValues = model.GetClusterValues(dataset, rowArray).ToList();
      List<string> allowedInputVariables = model.AllowedInputVariables.ToList();
      int nCols = allowedInputVariables.Count;

      // collect points of clusters (keys are inserted in order of first occurrence)
      Dictionary<int, List<double[]>> clusterPoints = new Dictionary<int, List<double[]>>();
      for (int r = 0; r < rowArray.Length; r++) {
        double[] p = new double[nCols];
        for (int i = 0; i < nCols; i++) {
          p[i] = dataset.GetDoubleValue(allowedInputVariables[i], rowArray[r]);
        }
        List<double[]> points;
        if (!clusterPoints.TryGetValue(clusterValues[r], out points)) {
          points = new List<double[]>();
          clusterPoints.Add(clusterValues[r], points);
        }
        points.Add(p);
      }

      double allCenterDistances = 0;
      foreach (var pair in clusterPoints) {
        List<double[]> points = pair.Value;

        // calculate cluster mean
        double[] mean = new double[nCols];
        foreach (var p in points) {
          for (int i = 0; i < nCols; i++) {
            mean[i] += p[i];
          }
        }
        for (int i = 0; i < nCols; i++) {
          mean[i] /= points.Count;
        }

        // accumulate squared distances of the cluster's points to its mean
        double centerDistances = 0;
        foreach (var clusterPoint in points) {
          double centerDistance = 0;
          for (int i = 0; i < nCols; i++) {
            double d = mean[i] - clusterPoint[i];
            centerDistance += d * d;
          }
          centerDistances += centerDistance;
        }
        allCenterDistances += centerDistances;
      }
      return allCenterDistances;
    }
        /// <summary>
        /// Transforms <paramref name="dataset"/> into a data structure as needed by libSVM.
        /// </summary>
        /// <param name="dataset">The source dataset</param>
        /// <param name="targetVariable">The target variable; when null or empty a zero vector is used as target.</param>
        /// <param name="inputVariables">The selected input variables to include in the svm_problem.</param>
        /// <param name="rowIndices">The rows of the dataset that should be contained in the resulting SVM-problem</param>
        /// <returns>A problem data type that can be used to train a support vector machine.</returns>
        public static svm_problem CreateSvmProblem(IDataset dataset, string targetVariable, IEnumerable <string> inputVariables, IEnumerable <int> rowIndices)
        {
            // Materialize both sequences once; they are traversed several times below and a lazily
            // evaluated sequence would otherwise be re-evaluated on every traversal.
            int[]    rows   = rowIndices.ToArray();
            string[] inputs = inputVariables.ToArray();

            double[] targetVector;
            if (string.IsNullOrEmpty(targetVariable))
            {
                // if the target variable is not set (e.g. for prediction of a trained model) we just use a zero vector
                targetVector = new double[rows.Length];
            }
            else
            {
                targetVector = dataset.GetDoubleValues(targetVariable, rows).ToArray();
            }

            svm_node[][] nodes = new svm_node[rows.Length][];
            for (int r = 0; r < rows.Length; r++)
            {
                List <svm_node> sparseRow = new List <svm_node>(inputs.Length);
                for (int c = 0; c < inputs.Length; c++)
                {
                    double value = dataset.GetDoubleValue(inputs[c], rows[r]);
                    // SVM also works with missing values
                    // => don't add NaN values in the dataset to the sparse SVM matrix representation
                    if (!double.IsNaN(value))
                    {
                        // node indices start at 1 and, because columns are visited in order,
                        // the nodes of a row are sorted ascending by index
                        sparseRow.Add(new svm_node()
                        {
                            index = c + 1, value = value
                        });
                    }
                }
                nodes[r] = sparseRow.ToArray();
            }
            return(new svm_problem {
                l = targetVector.Length, y = targetVector, x = nodes
            });
        }
        /// <summary>
        /// Standardizes the variable values of the given rows (subtract the mean, divide by the deviation)
        /// and multiplies the standardized row vector with Matrix.
        /// </summary>
        /// <param name="dataset">Dataset that contains the columns named in VariableNames.</param>
        /// <param name="rows">Row indices of <paramref name="dataset"/> to transform.</param>
        /// <returns>A matrix with one row per requested row and VariableNames.Length columns.</returns>
        public double[,] TransformData(IDataset dataset, IEnumerable <int> rows)
        {
            var rowIds      = rows.ToArray();
            int size        = VariableNames.Length;
            var transformed = new double[rowIds.Length, size];

            int r = 0;
            foreach (var rowId in rowIds)
            {
                for (var v = 0; v < size; v++)
                {
                    var raw          = dataset.GetDoubleValue(VariableNames[v], rowId);
                    var standardized = (raw - Means[v]) / Deviations[v];

                    // spread the standardized value over all output columns
                    for (var c = 0; c < size; c++)
                    {
                        transformed[r, c] += standardized * Matrix[v, c];
                    }
                }
                r++;
            }
            return(transformed);
        }
        /// <summary>
        /// Builds a PCA basis for the selected rows and variables with alglib.pcabuildbasis and
        /// returns the row values multiplied with the resulting basis matrix.
        /// </summary>
        /// <param name="dataset">Dataset the values are read from.</param>
        /// <param name="rows">Row indices to include.</param>
        /// <param name="variables">Names of the variables to include.</param>
        /// <returns>A matrix with one row per input row and one column per column of the basis matrix.</returns>
        private static double[,] PCAReduce(IDataset dataset, IEnumerable <int> rows, IEnumerable <string> variables)
        {
            var rowIds = rows.ToArray();
            var names  = variables.ToArray();

            // copy the selected columns into a dense matrix (allocated one column wider than needed)
            var data = new double[rowIds.Length, names.Length + 1];
            for (int col = 0; col < names.Length; col++)
            {
                int rowPos = 0;
                foreach (var value in dataset.GetDoubleValues(names[col], rowIds))
                {
                    data[rowPos, col] = value;
                    rowPos++;
                }
            }

            int       info;
            double[]  variances;
            double[,] matrix;
            alglib.pcabuildbasis(data, rowIds.Length, names.Length, out info, out variances, out matrix);

            // multiply each row vector with the basis matrix
            int nComponents = matrix.GetLength(1);
            var projected   = new double[rowIds.Length, nComponents];
            for (int r = 0; r < rowIds.Length; r++)
            {
                for (int a = 0; a < names.Length; a++)
                {
                    double val = dataset.GetDoubleValue(names[a], rowIds[r]);
                    for (int c = 0; c < nComponents; c++)
                    {
                        projected[r, c] += val * matrix[a, c];
                    }
                }
            }

            return(projected);
        }
Esempio n. 9
0
        /// <summary>
        /// Assigns each row to the center with the smallest squared Euclidean distance.
        /// </summary>
        /// <param name="centers">Center coordinates; each array must have one entry per allowed input variable.</param>
        /// <param name="dataset">Dataset the row values are read from.</param>
        /// <param name="allowedInputVariables">Names of the variables that span the distance space.</param>
        /// <param name="rows">Row indices of <paramref name="dataset"/> to assign.</param>
        /// <returns>
        /// One entry per row: the 1-based position of the closest center in <paramref name="centers"/>,
        /// or 0 when no center produced a distance below <see cref="double.MaxValue"/>.
        /// </returns>
        /// <exception cref="ArgumentException">A center does not have one coordinate per allowed input variable.</exception>
        public static IEnumerable <int> FindClosestCenters(IEnumerable <double[]> centers, IDataset dataset, IEnumerable <string> allowedInputVariables, IEnumerable <int> rows)
        {
            // Both sequences are walked once per center (and per row); materialize them up front so
            // lazily evaluated sequences are not re-evaluated and all lengths stay consistent.
            int[]    rowArray  = rows.ToArray();
            string[] variables = allowedInputVariables.ToArray();

            int[]    closestCenter      = new int[rowArray.Length];
            double[] bestCenterDistance = Enumerable.Repeat(double.MaxValue, rowArray.Length).ToArray();
            int      centerIndex        = 1; // reported center indices are 1-based

            foreach (double[] center in centers)
            {
                if (variables.Length != center.Length)
                {
                    throw new ArgumentException("Every center must have exactly one coordinate per allowed input variable.", "centers");
                }
                for (int rowIndex = 0; rowIndex < rowArray.Length; rowIndex++)
                {
                    int row = rowArray[rowIndex];

                    // calc squared euclidean distance of point to center
                    double centerDistance = 0;
                    for (int col = 0; col < variables.Length; col++)
                    {
                        double d = center[col] - dataset.GetDoubleValue(variables[col], row);
                        centerDistance += d * d;
                        // the partial sum only grows, so this center cannot become the best anymore
                        if (centerDistance > bestCenterDistance[rowIndex])
                        {
                            break;
                        }
                    }
                    if (centerDistance < bestCenterDistance[rowIndex])
                    {
                        bestCenterDistance[rowIndex] = centerDistance;
                        closestCenter[rowIndex]      = centerIndex;
                    }
                }
                centerIndex++;
            }
            return(closestCenter);
        }
        /// <summary>
        /// Checks the interpreter results for lagged variables, integrals, derivatives and time lags
        /// against reference values computed directly from columns "A" and "B" of <paramref name="ds"/>.
        /// </summary>
        private void EvaluateLaggedOperations(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, IDataset ds)
        {
            // shorthand accessors for the two columns the reference values are built from
            System.Func <int, double> a = r => ds.GetDoubleValue("A", r);
            System.Func <int, double> b = r => ds.GetDoubleValue("B", r);

            // lag
            Evaluate(interpreter, ds, "(lagVariable 1.0 a -1) ", 1, a(0));
            Evaluate(interpreter, ds, "(lagVariable 1.0 a -1) ", 2, a(1));
            Evaluate(interpreter, ds, "(lagVariable 1.0 a 0) ", 2, a(2));
            Evaluate(interpreter, ds, "(lagVariable 1.0 a 1) ", 0, a(1));

            // integral
            Evaluate(interpreter, ds, "(integral -1.0 (variable 1.0 a)) ", 1, a(0) + a(1));
            Evaluate(interpreter, ds, "(integral -1.0 (lagVariable 1.0 a 1)) ", 1, a(1) + a(2));
            Evaluate(interpreter, ds, "(integral -2.0 (variable 1.0 a)) ", 2, a(0) + a(1) + a(2));
            Evaluate(interpreter, ds, "(integral -1.0 (* (variable 1.0 a) (variable 1.0 b)))", 1, a(0) * b(0) + a(1) * b(1));
            Evaluate(interpreter, ds, "(integral -2.0 3.0)", 1, 9.0);

            // derivative
            // (f_0 + 2 * f_1 - 2 * f_3 - f_4) / 8; // h = 1
            Evaluate(interpreter, ds, "(diff (variable 1.0 a)) ", 5, (a(5) + 2 * a(4) - 2 * a(2) - a(1)) / 8.0);
            Evaluate(interpreter, ds, "(diff (variable 1.0 b)) ", 5, (b(5) + 2 * b(4) - 2 * b(2) - b(1)) / 8.0);
            Evaluate(interpreter, ds, "(diff (* (variable 1.0 a) (variable 1.0 b)))", 5,
                     (a(5) * b(5) +
                      2 * a(4) * b(4) -
                      2 * a(2) * b(2) -
                      a(1) * b(1)) / 8.0);
            Evaluate(interpreter, ds, "(diff -2.0 3.0)", 5, 0.0);

            // timelag
            Evaluate(interpreter, ds, "(lag -1.0 (lagVariable 1.0 a 2)) ", 1, a(2));
            Evaluate(interpreter, ds, "(lag -2.0 (lagVariable 1.0 a 2)) ", 2, a(2));
            Evaluate(interpreter, ds, "(lag -1.0 (* (lagVariable 1.0 a 1) (lagVariable 1.0 b 2)))", 1, a(1) * b(2));
            Evaluate(interpreter, ds, "(lag -2.0 3.0)", 1, 3.0);
        }
        /// <summary>
        /// Tunes the numeric constants of <paramref name="tree"/> with alglib's least-squares fitter (lsfit)
        /// and returns the quality computed by SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate.
        /// </summary>
        /// <param name="interpreter">Interpreter passed to the quality calculation.</param>
        /// <param name="tree">Tree whose constants are optimized; it is modified via UpdateConstants.</param>
        /// <param name="problemData">Supplies the dataset and the target variable.</param>
        /// <param name="rows">Rows used for fitting and for the quality calculation.</param>
        /// <param name="applyLinearScaling">When true, two extra coefficients (initialized to 0.0 and 1.0) are put in front of the tree's constants.</param>
        /// <param name="maxIterations">Iteration limit handed to alglib.lsfitsetcond.</param>
        /// <param name="updateVariableWeights">Forwarded to the tree conversion and to UpdateConstants.</param>
        /// <param name="lowerEstimationLimit">Forwarded to the quality calculation.</param>
        /// <param name="upperEstimationLimit">Forwarded to the quality calculation.</param>
        /// <param name="updateConstantsInTree">When false, the initial constants are written back to the tree after the quality was calculated.</param>
        /// <param name="iterationCallback">Optional callback; iteration reports are only enabled when it is not null.</param>
        /// <param name="counter">Optional counter that receives the per-row evaluation counts divided by the number of rows.</param>
        /// <returns>
        /// 0.0 when the converted tree has no parameters; the quality before optimization when alglib throws,
        /// when the new quality is NaN or when it is worse by more than 0.001; otherwise the quality after optimization.
        /// </returns>
        /// <exception cref="NotSupportedException">The tree cannot be converted by TreeToAutoDiffTermConverter.TryConvertToAutoDiff.</exception>
        public static double OptimizeConstants(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter,
                                               ISymbolicExpressionTree tree, IRegressionProblemData problemData, IEnumerable <int> rows, bool applyLinearScaling,
                                               int maxIterations, bool updateVariableWeights = true,
                                               double lowerEstimationLimit = double.MinValue, double upperEstimationLimit              = double.MaxValue,
                                               bool updateConstantsInTree  = true, Action <double[], double, object> iterationCallback = null, EvaluationsCounter counter = null)
        {
            // numeric constants in the tree become variables for constant opt
            // variables in the tree become parameters (fixed values) for constant opt
            // for each parameter (variable in the original tree) we store the
            // variable name, variable value (for factor vars) and lag as a DataForVariable object.
            // A dictionary is used to find parameters
            double[] initialConstants;
            var      parameters = new List <TreeToAutoDiffTermConverter.DataForVariable>();

            TreeToAutoDiffTermConverter.ParametricFunction         func;
            TreeToAutoDiffTermConverter.ParametricFunctionGradient func_grad;
            if (!TreeToAutoDiffTermConverter.TryConvertToAutoDiff(tree, updateVariableWeights, applyLinearScaling, out parameters, out initialConstants, out func, out func_grad))
            {
                throw new NotSupportedException("Could not optimize constants of symbolic expression tree due to not supported symbols used in the tree.");
            }
            if (parameters.Count == 0)
            {
                return(0.0);                             // gkronber: constant expressions always have a R² of 0.0
            }
            var parameterEntries = parameters.ToArray(); // order of entries must be the same for x

            // extract initial constants; with linear scaling two extra coefficients (0.0 and 1.0) come first
            double[] c;
            if (applyLinearScaling)
            {
                c    = new double[initialConstants.Length + 2];
                c[0] = 0.0;
                c[1] = 1.0;
                Array.Copy(initialConstants, 0, c, 2, initialConstants.Length);
            }
            else
            {
                c = (double[])initialConstants.Clone();
            }

            // quality before optimization; returned whenever the optimization fails or makes things worse
            double originalQuality = SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate(interpreter, tree, lowerEstimationLimit, upperEstimationLimit, problemData, rows, applyLinearScaling);

            if (counter == null)
            {
                counter = new EvaluationsCounter();
            }
            // passed to alglib.lsfitfit as user object; its totals are divided by the row count below
            var rowEvaluationsCounter = new EvaluationsCounter();

            alglib.lsfitstate  state;
            alglib.lsfitreport rep;
            int retVal;

            IDataset ds = problemData.Dataset;

            // x: one line per data row, one column per parameter entry
            double[,] x = new double[rows.Count(), parameters.Count];
            int row = 0;

            foreach (var r in rows)
            {
                int col = 0;
                foreach (var info in parameterEntries)
                {
                    if (ds.VariableHasType <double>(info.variableName))
                    {
                        // double variables are read at the lagged row
                        x[row, col] = ds.GetDoubleValue(info.variableName, r + info.lag);
                    }
                    else if (ds.VariableHasType <string>(info.variableName))
                    {
                        // string variables become a 0/1 indicator for the stored variable value (the lag is not applied here)
                        x[row, col] = ds.GetStringValue(info.variableName, r) == info.variableValue ? 1 : 0;
                    }
                    else
                    {
                        throw new InvalidProgramException("found a variable of unknown type");
                    }
                    col++;
                }
                row++;
            }
            double[] y = ds.GetDoubleValues(problemData.TargetVariable, rows).ToArray();
            int      n = x.GetLength(0);
            int      m = x.GetLength(1);
            int      k = c.Length;

            alglib.ndimensional_pfunc function_cx_1_func = CreatePFunc(func);
            alglib.ndimensional_pgrad function_cx_1_grad = CreatePGrad(func_grad);
            // xrep dereferences iterationCallback without a null check; reports are only switched on
            // below (lsfitsetxrep) when iterationCallback is not null
            alglib.ndimensional_rep   xrep = (p, f, obj) => iterationCallback(p, f, obj);

            try {
                alglib.lsfitcreatefg(x, y, c, n, m, k, false, out state);
                alglib.lsfitsetcond(state, 0.0, 0.0, maxIterations);
                alglib.lsfitsetxrep(state, iterationCallback != null);
                //alglib.lsfitsetgradientcheck(state, 0.001);
                alglib.lsfitfit(state, function_cx_1_func, function_cx_1_grad, xrep, rowEvaluationsCounter);
                alglib.lsfitresults(state, out retVal, out c, out rep);
            } catch (ArithmeticException) {
                // numeric failure: report the quality measured before optimization (tree is unchanged at this point)
                return(originalQuality);
            } catch (alglib.alglibexception) {
                return(originalQuality);
            }

            // convert the row-wise counts into counts per evaluation of all n rows
            counter.FunctionEvaluations += rowEvaluationsCounter.FunctionEvaluations / n;
            counter.GradientEvaluations += rowEvaluationsCounter.GradientEvaluations / n;

            //retVal == -7  => constant optimization failed due to wrong gradient
            if (retVal != -7)
            {
                if (applyLinearScaling)
                {
                    // drop the two leading scaling coefficients before writing the constants to the tree
                    var tmp = new double[c.Length - 2];
                    Array.Copy(c, 2, tmp, 0, tmp.Length);
                    UpdateConstants(tree, tmp, updateVariableWeights);
                }
                else
                {
                    UpdateConstants(tree, c, updateVariableWeights);
                }
            }
            var quality = SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate(interpreter, tree, lowerEstimationLimit, upperEstimationLimit, problemData, rows, applyLinearScaling);

            // caller only wants the quality: restore the constants the tree had on entry
            if (!updateConstantsInTree)
            {
                UpdateConstants(tree, initialConstants, updateVariableWeights);
            }

            // reject the result if it is NaN or worse than before by more than 0.001
            if (originalQuality - quality > 0.001 || double.IsNaN(quality))
            {
                UpdateConstants(tree, initialConstants, updateVariableWeights);
                return(originalQuality);
            }
            return(quality);
        }
Esempio n. 12
0
 /// <summary>
 /// Returns the estimated variance for a single row. Inner nodes delegate to the left child when the
 /// split attribute value is less than or equal to the split value and to the right child otherwise;
 /// leaves ask their model. Both are accessed through IConfidenceRegressionModel.
 /// </summary>
 private double GetEstimatedVariance(IDataset dataset, int row)
 {
     if (IsLeaf)
     {
         var leafModel = (IConfidenceRegressionModel)Model;
         return(leafModel.GetEstimatedVariances(dataset, new[] { row }).First());
     }

     var goesLeft = dataset.GetDoubleValue(SplitAttribute, row) <= SplitValue;
     var branch   = (IConfidenceRegressionModel)(goesLeft ? Left : Right);
     return(branch.GetEstimatedVariances(dataset, row.ToEnumerable()).Single());
 }
Esempio n. 13
0
        /// <summary>
        /// Sums, over all clusters assigned by <paramref name="model"/>, the squared Euclidean
        /// distances between each point and the mean of its cluster.
        /// </summary>
        /// <param name="model">Clustering model that supplies the cluster values and the allowed input variables.</param>
        /// <param name="dataset">Dataset the point coordinates are read from.</param>
        /// <param name="rows">Row indices of <paramref name="dataset"/> to evaluate.</param>
        /// <returns>The intra-cluster sum of squares (0 when there are no rows).</returns>
        public static double CalculateIntraClusterSumOfSquares(KMeansClusteringModel model, IDataset dataset, IEnumerable <int> rows)
        {
            // The rows are needed twice (cluster assignment and point collection). Materializing them
            // keeps both passes aligned even if a lazily evaluated sequence is passed in.
            int[]         rowArray              = rows.ToArray();
            List <int>    clusterValues         = model.GetClusterValues(dataset, rowArray).ToList();
            List <string> allowedInputVariables = model.AllowedInputVariables.ToList();
            int           nCols = allowedInputVariables.Count;

            // collect points of clusters; a cluster's list is created when its first point shows up
            Dictionary <int, List <double[]> > clusterPoints = new Dictionary <int, List <double[]> >();
            int clusterValueIndex = 0;

            foreach (int row in rowArray)
            {
                double[] p = new double[nCols];
                for (int i = 0; i < nCols; i++)
                {
                    p[i] = dataset.GetDoubleValue(allowedInputVariables[i], row);
                }

                int              cluster = clusterValues[clusterValueIndex++];
                List <double[]> members;
                if (!clusterPoints.TryGetValue(cluster, out members))
                {
                    members = new List <double[]>();
                    clusterPoints.Add(cluster, members);
                }
                members.Add(p);
            }

            double allCenterDistances = 0;

            foreach (var pair in clusterPoints)
            {
                List <double[]> members = pair.Value;

                // calculate cluster mean
                double[] mean = new double[nCols];
                foreach (var p in members)
                {
                    for (int i = 0; i < nCols; i++)
                    {
                        mean[i] += p[i];
                    }
                }
                for (int i = 0; i < nCols; i++)
                {
                    mean[i] /= members.Count;
                }

                // calculate distances of the cluster's points to its mean
                double centerDistances = 0;
                foreach (var clusterPoint in members)
                {
                    double centerDistance = 0;
                    for (int i = 0; i < nCols; i++)
                    {
                        double d = mean[i] - clusterPoint[i];
                        centerDistance += d * d;
                    }
                    centerDistances += centerDistance;
                }
                allCenterDistances += centerDistances;
            }
            return(allCenterDistances);
        }
 /// <summary>
 /// Evaluates the linear model for one row: Intercept plus the sum of each coefficient
 /// multiplied by the dataset value of the variable it is keyed by.
 /// </summary>
 private double GetEstimatedValue(IDataset dataset, int row)
 {
     // an empty coefficient collection simply contributes nothing
     double weightedSum = 0;
     foreach (var term in Coefficients)
     {
         weightedSum += term.Value * dataset.GetDoubleValue(term.Key, row);
     }
     return(Intercept + weightedSum);
 }
        /// <summary>
        /// Tunes the constants (and, if requested, the variable weights) of <paramref name="tree"/> with
        /// alglib's least-squares fitter (lsfit) using an AutoDiff-compiled form of the tree, and returns the
        /// quality computed by SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate.
        /// </summary>
        /// <param name="interpreter">Interpreter passed to the quality calculation.</param>
        /// <param name="tree">Tree whose constants are optimized; it is modified via UpdateConstants.</param>
        /// <param name="problemData">Supplies the dataset and the target variable.</param>
        /// <param name="rows">Rows used for fitting and for the quality calculation.</param>
        /// <param name="applyLinearScaling">Forwarded to the quality calculation.</param>
        /// <param name="maxIterations">Iteration limit handed to alglib.lsfitsetcond.</param>
        /// <param name="updateVariableWeights">When true, weights of variable nodes are optimized together with the constants.</param>
        /// <param name="lowerEstimationLimit">Forwarded to the quality calculation.</param>
        /// <param name="upperEstimationLimit">Forwarded to the quality calculation.</param>
        /// <param name="updateConstantsInTree">When false, the original constants are written back to the tree after the quality was calculated.</param>
        /// <returns>
        /// 0.0 when the tree references no variables; the quality before optimization when alglib throws,
        /// when the new quality is NaN or when it is worse by more than 0.001; otherwise the quality after optimization.
        /// </returns>
        /// <exception cref="NotSupportedException">The tree cannot be transformed by TryTransformToAutoDiff.</exception>
        public static double OptimizeConstants(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, ISymbolicExpressionTree tree, IRegressionProblemData problemData, IEnumerable <int> rows, bool applyLinearScaling, int maxIterations, bool updateVariableWeights = true, double lowerEstimationLimit = double.MinValue, double upperEstimationLimit = double.MaxValue, bool updateConstantsInTree = true)
        {
            List <AutoDiff.Variable> variables     = new List <AutoDiff.Variable>();
            List <AutoDiff.Variable> parameters    = new List <AutoDiff.Variable>();
            List <string>            variableNames = new List <string>();

            AutoDiff.Term func;
            if (!TryTransformToAutoDiff(tree.Root.GetSubtree(0), variables, parameters, variableNames, updateVariableWeights, out func))
            {
                throw new NotSupportedException("Could not optimize constants of symbolic expression tree due to not supported symbols used in the tree.");
            }
            if (variableNames.Count == 0)
            {
                return(0.0);
            }

            AutoDiff.IParametricCompiledTerm compiledFunc = func.Compile(variables.ToArray(), parameters.ToArray());

            // nodes whose values are tunable: all terminals when weights are optimized too, otherwise only constants
            List <SymbolicExpressionTreeTerminalNode> terminalNodes = null;

            if (updateVariableWeights)
            {
                terminalNodes = tree.Root.IterateNodesPrefix().OfType <SymbolicExpressionTreeTerminalNode>().ToList();
            }
            else
            {
                terminalNodes = new List <SymbolicExpressionTreeTerminalNode>(tree.Root.IterateNodesPrefix().OfType <ConstantTreeNode>());
            }

            // extract initial constants
            // NOTE(review): the first two slots are set to 0.0 and 1.0 unconditionally, which presumes that
            // TryTransformToAutoDiff registers at least two leading variables — confirm against that method
            double[] c = new double[variables.Count];
            {
                c[0] = 0.0;
                c[1] = 1.0;
                int i = 2;
                foreach (var node in terminalNodes)
                {
                    ConstantTreeNode constantTreeNode = node as ConstantTreeNode;
                    VariableTreeNode variableTreeNode = node as VariableTreeNode;
                    if (constantTreeNode != null)
                    {
                        c[i++] = constantTreeNode.Value;
                    }
                    else if (updateVariableWeights && variableTreeNode != null)
                    {
                        c[i++] = variableTreeNode.Weight;
                    }
                }
            }
            // keep a copy for restoring the tree; alglib.lsfitresults overwrites c below
            double[] originalConstants = (double[])c.Clone();
            double   originalQuality   = SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate(interpreter, tree, lowerEstimationLimit, upperEstimationLimit, problemData, rows, applyLinearScaling);

            alglib.lsfitstate  state;
            alglib.lsfitreport rep;
            int info;

            IDataset ds = problemData.Dataset;

            // x: one line per data row, one column per referenced variable name
            double[,] x = new double[rows.Count(), variableNames.Count];
            int row = 0;

            foreach (var r in rows)
            {
                for (int col = 0; col < variableNames.Count; col++)
                {
                    x[row, col] = ds.GetDoubleValue(variableNames[col], r);
                }
                row++;
            }
            double[] y = ds.GetDoubleValues(problemData.TargetVariable, rows).ToArray();
            int      n = x.GetLength(0);
            int      m = x.GetLength(1);
            int      k = c.Length;

            alglib.ndimensional_pfunc function_cx_1_func = CreatePFunc(compiledFunc);
            alglib.ndimensional_pgrad function_cx_1_grad = CreatePGrad(compiledFunc);

            try {
                alglib.lsfitcreatefg(x, y, c, n, m, k, false, out state);
                alglib.lsfitsetcond(state, 0.0, 0.0, maxIterations);
                //alglib.lsfitsetgradientcheck(state, 0.001);
                alglib.lsfitfit(state, function_cx_1_func, function_cx_1_grad, null, null);
                alglib.lsfitresults(state, out info, out c, out rep);
            }
            catch (ArithmeticException) {
                // numeric failure: report the quality measured before optimization (tree is unchanged at this point)
                return(originalQuality);
            }
            catch (alglib.alglibexception) {
                return(originalQuality);
            }

            //info == -7  => constant optimization failed due to wrong gradient
            if (info != -7)
            {
                // the first two entries are not written to the tree
                UpdateConstants(tree, c.Skip(2).ToArray(), updateVariableWeights);
            }
            var quality = SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate(interpreter, tree, lowerEstimationLimit, upperEstimationLimit, problemData, rows, applyLinearScaling);

            // caller only wants the quality: restore the values the tree had on entry
            if (!updateConstantsInTree)
            {
                UpdateConstants(tree, originalConstants.Skip(2).ToArray(), updateVariableWeights);
            }
            // reject the result if it is NaN or worse than before by more than 0.001
            if (originalQuality - quality > 0.001 || double.IsNaN(quality))
            {
                UpdateConstants(tree, originalConstants.Skip(2).ToArray(), updateVariableWeights);
                return(originalQuality);
            }
            return(quality);
        }
Esempio n. 16
0
 /// <summary>
 /// Tells whether the given row satisfies every split condition, i.e. whether each comparison
 /// of the row's split attribute value with the corresponding split value succeeds.
 /// </summary>
 /// <returns>True when no condition fails (also when there are no conditions); otherwise false.</returns>
 public bool Covers(IDataset dataset, int row)
 {
     int i = 0;
     foreach (var attribute in SplitAttributes)
     {
         // stop at the first condition that is violated
         if (!Comparisons[i].Compare(dataset.GetDoubleValue(attribute, row), SplitValues[i]))
         {
             return(false);
         }
         i++;
     }
     return(true);
 }
    /// <summary>
    /// Computes a PCA basis for the selected rows and variables via alglib.pcabuildbasis and
    /// returns the row values multiplied with that basis matrix.
    /// </summary>
    /// <param name="dataset">Dataset the values are read from.</param>
    /// <param name="rows">Row indices to include.</param>
    /// <param name="variables">Names of the variables to include.</param>
    /// <returns>A matrix with one row per input row and one column per column of the basis matrix.</returns>
    private static double[,] PCAReduce(IDataset dataset, IEnumerable<int> rows, IEnumerable<string> variables) {
      int[] rowIds = rows.ToArray();
      string[] names = variables.ToArray();

      // dense copy of the selected columns (allocated one column wider than the variable count)
      var data = new double[rowIds.Length, names.Length + 1];
      for (int col = 0; col < names.Length; col++) {
        int rowPos = 0;
        foreach (var value in dataset.GetDoubleValues(names[col], rowIds)) {
          data[rowPos, col] = value;
          rowPos++;
        }
      }

      int info;
      double[] variances;
      double[,] matrix;
      alglib.pcabuildbasis(data, rowIds.Length, names.Length, out info, out variances, out matrix);

      // result = row vector * basis matrix, accumulated attribute by attribute
      int width = matrix.GetLength(1);
      var reduced = new double[rowIds.Length, width];
      for (int r = 0; r < rowIds.Length; r++) {
        for (int a = 0; a < names.Length; a++) {
          double val = dataset.GetDoubleValue(names[a], rowIds[r]);
          for (int c = 0; c < width; c++) {
            reduced[r, c] += val * matrix[a, c];
          }
        }
      }

      return reduced;
    }