private void BuildPca(IDataset dataset, IEnumerable <int> rows, IEnumerable <string> variables, bool normalize)
        {
            var instances  = rows.ToArray();
            var attributes = variables.ToArray();

            Means = normalize
                ? attributes.Select(v => dataset.GetDoubleValues(v, instances).Average()).ToArray()
                : attributes.Select(x => 0.0).ToArray();
            Deviations = normalize
                ? attributes.Select(v => dataset.GetDoubleValues(v, instances).StandardDeviationPop()).Select(x => x.IsAlmost(0.0) ? 1 : x).ToArray()
                : attributes.Select(x => 1.0).ToArray();

            var data = new double[instances.Length, attributes.Length];

            for (var j = 0; j < attributes.Length; j++)
            {
                var i = 0;
                foreach (var v in dataset.GetDoubleValues(attributes[j], instances))
                {
                    data[i, j] = (v - Means[j]) / Deviations[j];
                    i++;
                }
            }

            int info;

            double[] variances;
            double[,] matrix;
            alglib.pcabuildbasis(data, instances.Length, attributes.Length, out info, out variances, out matrix);
            Matrix        = matrix;
            Variances     = variances;
            VariableNames = attributes;
        }
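The normalization above is a per-column z-score: subtract the column mean and divide by the population standard deviation, with zero-variance columns left unscaled (their deviation is forced to 1, so constant inputs cannot cause a division by zero). A minimal standalone sketch of that step, assuming using System; using System.Linq; (an exact zero check stands in for the library's IsAlmost tolerance):

static double[] ZScore(double[] column)
{
    double mean = column.Average();
    double dev = Math.Sqrt(column.Select(v => (v - mean) * (v - mean)).Average()); // population std. dev.
    if (dev == 0.0) dev = 1.0; // constant column: leave values unscaled, as above
    return column.Select(v => (v - mean) / dev).ToArray();
}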
Example #2
        private static double[,] ExtractData(IDataset dataset, IEnumerable <int> rows, IReadOnlyCollection <string> allowedInputVariables, ITransformation <double>[] scaling = null)
        {
            double[][] variables;
            if (scaling != null)
            {
                variables =
                    allowedInputVariables.Select((var, i) => scaling[i].Apply(dataset.GetDoubleValues(var, rows)).ToArray())
                    .ToArray();
            }
            else
            {
                variables =
                    allowedInputVariables.Select(var => dataset.GetDoubleValues(var, rows).ToArray()).ToArray();
            }
            int n   = variables.First().Length;
            var res = new double[n, variables.Length];

            for (int r = 0; r < n; r++)
            {
                for (int c = 0; c < variables.Length; c++)
                {
                    res[r, c] = variables[c][r];
                }
            }
            return(res);
        }
Example #3
        public IEnumerable <IEnumerable <double> > GetSymbolicExpressionTreeValues(ISymbolicExpressionTree tree, IDataset dataset, IEnumerable <int> rows, IEnumerable <int> horizons)
        {
            if (CheckExpressionsWithIntervalArithmetic.Value)
            {
                throw new NotSupportedException("Interval arithmetic is not yet supported in the symbolic data analysis interpreter.");
            }
            if (targetVariableCache == null || targetVariableCache.GetLength(0) < dataset.Rows)
            {
                targetVariableCache = dataset.GetDoubleValues(TargetVariable).ToArray();
            }
            if (invalidateCacheIndexes == null)
            {
                invalidateCacheIndexes = new List <int>(10);
            }

            string targetVariable = TargetVariable;

            lock (EvaluatedSolutions) {
                EvaluatedSolutions.Value++; // increment the evaluated solutions counter
            }
            var state              = PrepareInterpreterState(tree, dataset, targetVariableCache, TargetVariable);
            var rowsEnumerator     = rows.GetEnumerator();
            var horizonsEnumerator = horizons.GetEnumerator();

            // produce an n-step forecast for all rows
            while (rowsEnumerator.MoveNext() & horizonsEnumerator.MoveNext())
            {
                int      row     = rowsEnumerator.Current;
                int      horizon = horizonsEnumerator.Current;
                double[] vProgs  = new double[horizon];

                for (int i = 0; i < horizon; i++)
                {
                    int localRow = i + row; // create a local variable for the ref parameter
                    vProgs[i] = Evaluate(dataset, ref localRow, state);
                    targetVariableCache[localRow] = vProgs[i];
                    invalidateCacheIndexes.Add(localRow);
                    state.Reset();
                }
                yield return(vProgs);

                int j = 0;
                foreach (var targetValue in dataset.GetDoubleValues(targetVariable, invalidateCacheIndexes))
                {
                    targetVariableCache[invalidateCacheIndexes[j]] = targetValue;
                    j++;
                }
                invalidateCacheIndexes.Clear();
            }

            if (rowsEnumerator.MoveNext() || horizonsEnumerator.MoveNext())
            {
                throw new ArgumentException("Number of elements in rows and horizon enumerations doesn't match.");
            }
        }
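The loop above produces an iterated multi-step forecast: each prediction is written into targetVariableCache so later horizon steps can read it as a lagged input, and the touched cache entries are restored from the true target values before the next row is processed. A compressed sketch of that pattern (the predict delegate is a hypothetical stand-in, not part of the interpreter API):

static double[] NStepForecast(Func<double[], int, double> predict, double[] cache, int row, int horizon)
{
    var forecasts = new double[horizon];
    for (int i = 0; i < horizon; i++)
    {
        forecasts[i] = predict(cache, row + i);
        cache[row + i] = forecasts[i]; // later steps read this prediction as a lagged input
    }
    // the caller restores cache[row..row+horizon-1] from the true values afterwards
    return forecasts;
}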
        // uses sorting to return the values in the order of rows, instead of using nested for loops
        // to avoid O(n²) runtime
        public override IEnumerable <double> GetEstimatedClassValues(IDataset dataset, IEnumerable <int> rows)
        {
            var values    = dataset.GetDoubleValues(Variable, rows).ToArray();
            var rowsArray = rows.ToArray();
            var order     = Enumerable.Range(0, rowsArray.Length).ToArray();

            double[] estimated = new double[rowsArray.Length];
            Array.Sort(rowsArray, order);
            Array.Sort(values, rowsArray);
            int curSplit = 0, curIndex = 0;

            while (curIndex < values.Length && Double.IsNaN(values[curIndex]))
            {
                estimated[curIndex] = MissingValuesClass;
                curIndex++;
            }
            while (curSplit < Splits.Length)
            {
                while (curIndex < values.Length && Splits[curSplit] > values[curIndex])
                {
                    estimated[curIndex] = classes[curSplit];
                    curIndex++;
                }
                curSplit++;
            }
            Array.Sort(rowsArray, estimated);
            Array.Sort(order, estimated);
            return(estimated);
        }
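The paired Array.Sort calls are a permutation trick: sort the row indices while carrying an identity permutation, compute the estimates in sorted order, then sort the permutation back to 0..n-1 so the results come out in the caller's original order. The trick in isolation, with illustrative values:

int[] keys = { 42, 7, 19 };
int[] perm = { 0, 1, 2 };
Array.Sort(keys, perm);               // keys: 7, 19, 42   perm: 1, 2, 0
double[] results = { 0.7, 1.9, 4.2 }; // computed in sorted-key order
Array.Sort(perm, results);            // results: 4.2, 0.7, 1.9 (original key order)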
Example #5
        public static double[,] ToArray(this IDataset dataset, IEnumerable <string> variables,
                                        IEnumerable <ITransformation <double> > transformations, IEnumerable <int> rows)
        {
            string[] variablesArr = variables.ToArray();
            int[]    rowsArr      = rows.ToArray();
            ITransformation <double>[] transformArr = transformations.ToArray();
            if (transformArr.Length != variablesArr.Length)
            {
                throw new ArgumentException("Number of variables and number of transformations must match.");
            }

            double[,] matrix = new double[rowsArr.Length, variablesArr.Length];

            for (int i = 0; i < variablesArr.Length; i++)
            {
                var origValues = dataset.GetDoubleValues(variablesArr[i], rowsArr);
                var values     = transformArr[i] != null ? transformArr[i].Apply(origValues) : origValues;
                int row        = 0;
                foreach (var value in values)
                {
                    matrix[row, i] = value;
                    row++;
                }
            }

            return(matrix);
        }
Example #6
        private IEnumerable <double> InterpretRec(ISymbolicExpressionTreeNode node, IDataset dataset, IEnumerable <int> rows)
        {
            Func <ISymbolicExpressionTreeNode, ISymbolicExpressionTreeNode, Func <double, double, double>, IEnumerable <double> > binaryEval =
                (left, right, f) => InterpretRec(left, dataset, rows).Zip(InterpretRec(right, dataset, rows), f);

            switch (node.Symbol.Name)
            {
            case "+": return(binaryEval(node.GetSubtree(0), node.GetSubtree(1), (x, y) => x + y));

            case "*": return(binaryEval(node.GetSubtree(0), node.GetSubtree(1), (x, y) => x * y));

            case "-": return(binaryEval(node.GetSubtree(0), node.GetSubtree(1), (x, y) => x - y));

            case "%": return(binaryEval(node.GetSubtree(0), node.GetSubtree(1), (x, y) => y.IsAlmost(0.0) ? 0.0 : x / y)); // protected division

            default: {
                double erc;
                if (double.TryParse(node.Symbol.Name, out erc))
                {
                    return(rows.Select(_ => erc));
                }
                else
                {
                    // assume that this is a variable name
                    return(dataset.GetDoubleValues(node.Symbol.Name, rows));
                }
            }
            }
        }
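The "%" case is protected division, a common genetic-programming convention: instead of letting division by zero produce ±Infinity or NaN, the result is defined as 0. Standalone (the epsilon is an assumption; IsAlmost uses the library's own tolerance):

static double ProtectedDivision(double x, double y)
{
    return Math.Abs(y) < 1e-12 ? 0.0 : x / y; // y near 0 yields 0 instead of ±Infinity
}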
Example #7
 public Scaling(IDataset ds, IEnumerable<string> variables, IEnumerable<int> rows) {
   foreach (var variable in variables) {
     var values = ds.GetDoubleValues(variable, rows);
     var min = values.Where(x => !double.IsNaN(x)).Min();
     var max = values.Where(x => !double.IsNaN(x)).Max();
     scalingParameters[variable] = Tuple.Create(min, max);
   }
 }
Example #8
        private static ITransformation <double>[] CreateScaling(IDataset dataset, int[] rows, IReadOnlyCollection <string> allowedInputVariables)
        {
            var trans = new ITransformation <double> [allowedInputVariables.Count];
            int i     = 0;

            foreach (var variable in allowedInputVariables)
            {
                var lin = new LinearTransformation(allowedInputVariables);
                var max = dataset.GetDoubleValues(variable, rows).Max();
                var min = dataset.GetDoubleValues(variable, rows).Min();
                lin.Multiplier = 1.0 / (max - min);
                lin.Addend     = -min / (max - min);
                trans[i]       = lin;
                i++;
            }
            return(trans);
        }
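The Multiplier/Addend pair is min-max scaling written as a single linear map: x * 1/(max - min) - min/(max - min) = (x - min)/(max - min), so min maps to 0 and max to 1. Note the divisions are unguarded; a constant variable (max == min) would yield infinite coefficients. A quick numeric check with illustrative values:

double min = 2.0, max = 10.0;
double multiplier = 1.0 / (max - min);     //  0.125
double addend = -min / (max - min);        // -0.25
double scaled = 6.0 * multiplier + addend; // (6 - 2) / 8 = 0.5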
Example #9
        public static void SplitRows(IReadOnlyList <int> rows, IDataset data, string splitAttr, double splitValue, out IReadOnlyList <int> leftRows, out IReadOnlyList <int> rightRows)
        {
            //TODO check and revert?: points at borders are now used multiple times (assignment: 0 = left, 1 = right, 2 = on the border, sent to both sides)
            var assignment = data.GetDoubleValues(splitAttr, rows).Select(x => x.IsAlmost(splitValue) ? 2 : x < splitValue ? 0 : 1).ToArray();

            leftRows  = rows.Zip(assignment, (i, b) => new { i, b }).Where(x => x.b == 0 || x.b == 2).Select(x => x.i).ToList();
            rightRows = rows.Zip(assignment, (i, b) => new { i, b }).Where(x => x.b > 0).Select(x => x.i).ToList();
        }
 private void InitCache(IDataset dataset)
 {
     this.dataset = dataset;
     cachedData   = new Dictionary <string, double[]>();
     foreach (var v in dataset.DoubleVariables)
     {
         cachedData[v] = dataset.GetDoubleValues(v).ToArray();
     }
 }
Example #12
        public IEnumerable <double> GetScaledValues(IDataset ds, string variable, IEnumerable <int> rows)
        {
            double min = scalingParameters[variable].Item1;
            double max = scalingParameters[variable].Item2;

            if (min.IsAlmost(max))
            {
                return(rows.Select(i => 0.0));                                               // return enumerable of zeros
            }
            return(ds.GetDoubleValues(variable, rows).Select(x => (x - min) / (max - min))); // scale to range [0..1]
        }
Example #13
        public static Dictionary <string, Interval> GetVariableRanges(IDataset dataset, IEnumerable <int> rows = null)
        {
            Dictionary <string, Interval> variableRanges = new Dictionary <string, Interval>();

            foreach (var variable in dataset.VariableNames)
            {
                IEnumerable <double> values = null;

                if (rows == null)
                {
                    values = dataset.GetDoubleValues(variable);
                }
                else
                {
                    values = dataset.GetDoubleValues(variable, rows);
                }

                var range = Interval.GetInterval(values);
                variableRanges.Add(variable, range);
            }

            return(variableRanges);
        }
        private IDataset CreateRevertedDataset(IDataset data, double[,] pcs)
        {
            var n         = VariableNames;
            var nDouble   = data.DoubleVariables.Where(x => !ComponentNames.Contains(x)).ToArray();
            var nDateTime = data.DateTimeVariables.ToArray();
            var nString   = data.StringVariables.ToArray();

            IEnumerable <IList> nData         = n.Select((_, x) => Enumerable.Range(0, pcs.GetLength(0)).Select(r => pcs[r, x]).ToList());
            IEnumerable <IList> nDoubleData   = nDouble.Select(x => data.GetDoubleValues(x).ToList());
            IEnumerable <IList> nDateTimeData = nDateTime.Select(x => data.GetDateTimeValues(x).ToList());
            IEnumerable <IList> nStringData   = nString.Select(x => data.GetStringValues(x).ToList());

            return(new Dataset(n.Concat(nDouble).Concat(nDateTime).Concat(nString), nData.Concat(nDoubleData).Concat(nDateTimeData).Concat(nStringData).ToArray()));
        }
Example #15
        /// <summary>
        /// Transforms <paramref name="dataset"/> into a data structure as needed by libSVM.
        /// </summary>
        /// <param name="dataset">The source dataset</param>
        /// <param name="targetVariable">The target variable</param>
        /// <param name="inputVariables">The selected input variables to include in the svm_problem.</param>
        /// <param name="rowIndices">The rows of the dataset that should be contained in the resulting SVM-problem</param>
        /// <returns>A problem data type that can be used to train a support vector machine.</returns>
        public static svm_problem CreateSvmProblem(IDataset dataset, string targetVariable, IEnumerable <string> inputVariables, IEnumerable <int> rowIndices)
        {
            double[] targetVector;
            var      nRows = rowIndices.Count();

            if (string.IsNullOrEmpty(targetVariable))
            {
                // if the target variable is not set (e.g. for prediction of a trained model) we just use a zero vector
                targetVector = new double[nRows];
            }
            else
            {
                targetVector = dataset.GetDoubleValues(targetVariable, rowIndices).ToArray();
            }
            svm_node[][]  nodes              = new svm_node[nRows][];
            int           maxNodeIndex       = 0;
            int           svmProblemRowIndex = 0;
            List <string> inputVariablesList = inputVariables.ToList();

            foreach (int row in rowIndices)
            {
                List <svm_node> tempRow  = new List <svm_node>();
                int             colIndex = 1; // make sure the smallest node index for SVM = 1
                foreach (var inputVariable in inputVariablesList)
                {
                    double value = dataset.GetDoubleValue(inputVariable, row);
                    // SVM also works with missing values
                    // => don't add NaN values in the dataset to the sparse SVM matrix representation
                    if (!double.IsNaN(value))
                    {
                        tempRow.Add(new svm_node()
                        {
                            index = colIndex, value = value
                        });
                        // nodes must be sorted in ascending order by column index
                        if (colIndex > maxNodeIndex)
                        {
                            maxNodeIndex = colIndex;
                        }
                    }
                    colIndex++;
                }
                nodes[svmProblemRowIndex++] = tempRow.ToArray();
            }
            return(new svm_problem {
                l = targetVector.Length, y = targetVector, x = nodes
            });
        }
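Each dense row is converted to libSVM's sparse format: column indices are 1-based and NaN entries are simply omitted. The same mapping expressed with plain LINQ, assuming using System.Linq; (illustrative, not the method's actual code path):

double[] denseRow = { 0.5, double.NaN, 2.0 };
var sparse = denseRow
             .Select((value, i) => new { index = i + 1, value }) // 1-based column index
             .Where(n => !double.IsNaN(n.value))
             .ToArray(); // yields (1, 0.5) and (3, 2.0); the NaN column is dropped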
        public static IEnumerable <string> CheckVariablesForPossibleTargetVariables(IDataset dataset)
        {
            int maxSamples           = Math.Min(InspectedRowsToDetermineTargets, dataset.Rows);
            var validTargetVariables = (from v in dataset.DoubleVariables
                                        let distinctValues = dataset.GetDoubleValues(v)
                                                             .Take(maxSamples)
                                                             .Distinct()
                                                             .Count()
                                        where distinctValues <= MaximumNumberOfClasses
                                        select v).ToArray();

            if (!validTargetVariables.Any())
            {
                throw new ArgumentException("Import of classification problem data was not successful, because no target variable was found." +
                                            " A target variable must have at most " + MaximumNumberOfClasses + " distinct values to be applicable to classification.");
            }
            return(validTargetVariables);
        }
Example #17
    public static double[,] PrepareInputMatrix(IDataset dataset, IEnumerable<string> variables, IEnumerable<int> rows) {
      List<string> variablesList = variables.ToList();
      List<int> rowsList = rows.ToList();

      double[,] matrix = new double[rowsList.Count, variablesList.Count];

      int col = 0;
      foreach (string column in variables) {
        var values = dataset.GetDoubleValues(column, rows);
        int row = 0;
        foreach (var value in values) {
          matrix[row, col] = value;
          row++;
        }
        col++;
      }

      return matrix;
    }
        private static double[,] PCAReduce(IDataset dataset, IEnumerable <int> rows, IEnumerable <string> variables)
        {
            var instances  = rows.ToArray();
            var attributes = variables.ToArray();
            var data       = new double[instances.Length, attributes.Length + 1];

            for (int j = 0; j < attributes.Length; j++)
            {
                int i      = 0;
                var values = dataset.GetDoubleValues(attributes[j], instances);
                foreach (var v in values)
                {
                    data[i++, j] = v;
                }
            }
            int info;

            double[] variances;
            var      matrix = new double[0, 0];

            alglib.pcabuildbasis(data, instances.Length, attributes.Length, out info, out variances, out matrix);

            var result = new double[instances.Length, matrix.GetLength(1)];
            int r      = 0;

            foreach (var inst in instances)
            {
                int i = 0;
                foreach (var attrib in attributes)
                {
                    double val = dataset.GetDoubleValue(attrib, inst);
                    for (int j = 0; j < result.GetLength(1); j++)
                    {
                        result[r, j] += val * matrix[i, j];
                    }
                    i++;
                }
                r++;
            }

            return(result);
        }
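The nested loops at the end compute the score matrix S = X * B, where B is the basis returned by alglib.pcabuildbasis. (Note that data is allocated with attributes.Length + 1 columns but only attributes.Length of them are filled and passed to pcabuildbasis; the extra zero column is unused.) The projection step in isolation:

static double[,] Project(double[,] x, double[,] basis)
{
    int n = x.GetLength(0), m = x.GetLength(1), k = basis.GetLength(1);
    var scores = new double[n, k];
    for (int r = 0; r < n; r++)
        for (int i = 0; i < m; i++)
            for (int j = 0; j < k; j++)
                scores[r, j] += x[r, i] * basis[i, j]; // S = X * B
    return scores;
}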
Example #20
        public double[,] Reduce(IDataset dataset, IEnumerable <int> rows)
        {
            var data = dataset.ToArray(allowedInputVariables, rows);

            var targets = dataset.GetDoubleValues(TargetVariable, rows).ToArray();
            var result  = new double[data.GetLength(0), transformationMatrix.GetLength(1) + 1];

            for (int i = 0; i < data.GetLength(0); i++)
            {
                for (int j = 0; j < data.GetLength(1); j++)
                {
                    for (int x = 0; x < transformationMatrix.GetLength(1); x++)
                    {
                        result[i, x] += data[i, j] * transformationMatrix[j, x];
                    }
                    result[i, transformationMatrix.GetLength(1)] = targets[i];
                }
            }
            return(result);
        }
        private void InitCache(IDataset dataset)
        {
            this.dataset = dataset;

            // free handles to old data
            if (cachedData != null)
            {
                foreach (var gch in cachedData.Values)
                {
                    gch.Free();
                }
                cachedData = null;
            }

            // cache new data
            cachedData = new Dictionary <string, GCHandle>();
            foreach (var v in dataset.DoubleVariables)
            {
                var values = dataset.GetDoubleValues(v).ToArray();
                var gch    = GCHandle.Alloc(values, GCHandleType.Pinned);
                cachedData[v] = gch;
            }
        }
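This cache variant pins the arrays, presumably so native code can hold stable pointers into them; a pinned GCHandle prevents the garbage collector from moving or reclaiming the array until Free() is called, which is why the method frees the old handles before building a new cache. The essential lifecycle, standalone, assuming using System; using System.Runtime.InteropServices;:

var data = new double[] { 1.0, 2.0, 3.0 };
var handle = GCHandle.Alloc(data, GCHandleType.Pinned);
try
{
    IntPtr address = handle.AddrOfPinnedObject(); // stable pointer to data[0]
    // ... hand 'address' to native code ...
}
finally
{
    handle.Free(); // unpin; a forgotten Free() keeps the array alive and pinned indefinitely
}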
Example #23
        private KernelRidgeRegressionModel(IDataset dataset, string targetVariable, IEnumerable <string> allowedInputVariables, int[] rows,
                                           bool scaleInputs, ICovarianceFunction kernel, double lambda = 0.1) : base(targetVariable)
        {
            this.allowedInputVariables = allowedInputVariables.ToArray();
            if (kernel.GetNumberOfParameters(this.allowedInputVariables.Length) > 0)
            {
                throw new ArgumentException("All parameters in the kernel function must be specified.");
            }
            name        = ItemName;
            description = ItemDescription;

            this.kernel = (ICovarianceFunction)kernel.Clone();
            this.lambda = lambda;
            if (scaleInputs)
            {
                scaling = CreateScaling(dataset, rows, this.allowedInputVariables);
            }
            trainX = ExtractData(dataset, rows, this.allowedInputVariables, scaling);
            var y = dataset.GetDoubleValues(targetVariable, rows).ToArray();

            yOffset = y.Average();
            yScale  = 1.0 / y.StandardDeviation();
            alpha   = new double[trainX.GetLength(0)];
        }
Example #26
        public NearestNeighbourModel(IDataset dataset, IEnumerable <int> rows, int k, string targetVariable, IEnumerable <string> allowedInputVariables, IEnumerable <double> weights = null, double[] classValues = null)
            : base(targetVariable)
        {
            Name        = ItemName;
            Description = ItemDescription;
            this.k      = k;
            this.allowedInputVariables = allowedInputVariables.ToArray();
            double[,] inputMatrix;
            if (IsCompatibilityLoaded)
            {
                // no scaling
                inputMatrix = dataset.ToArray(
                    this.allowedInputVariables.Concat(new string[] { targetVariable }),
                    rows);
            }
            else
            {
                this.offsets = this.allowedInputVariables
                               .Select(name => dataset.GetDoubleValues(name, rows).Average() * -1)
                               .Concat(new double[] { 0 }) // no offset for target variable
                               .ToArray();
                if (weights == null)
                {
                    // automatic determination of weights (all features should have variance = 1)
                    this.weights = this.allowedInputVariables
                                   .Select(name => 1.0 / dataset.GetDoubleValues(name, rows).StandardDeviationPop())
                                   .Concat(new double[] { 1.0 }) // no scaling for target variable
                                   .ToArray();
                }
                else
                {
                    // user specified weights (+ 1 for target)
                    this.weights = weights.Concat(new double[] { 1.0 }).ToArray();
                    if (this.weights.Length - 1 != this.allowedInputVariables.Length)
                    {
                        throw new ArgumentException("The number of elements in the weight vector must match the number of input variables");
                    }
                }
                inputMatrix = CreateScaledData(dataset, this.allowedInputVariables.Concat(new string[] { targetVariable }), rows, this.offsets, this.weights);
            }

            if (inputMatrix.Cast <double>().Any(x => double.IsNaN(x) || double.IsInfinity(x)))
            {
                throw new NotSupportedException(
                          "Nearest neighbour model does not support NaN or infinity values in the input dataset.");
            }

            this.kdTree = new alglib.nearestneighbor.kdtree();

            var nRows     = inputMatrix.GetLength(0);
            var nFeatures = inputMatrix.GetLength(1) - 1;

            if (classValues != null)
            {
                this.classValues = (double[])classValues.Clone();
                int nClasses = classValues.Length;
                // map original class values to values [0..nClasses-1]
                var classIndices = new Dictionary <double, double>();
                for (int i = 0; i < nClasses; i++)
                {
                    classIndices[classValues[i]] = i;
                }

                for (int row = 0; row < nRows; row++)
                {
                    inputMatrix[row, nFeatures] = classIndices[inputMatrix[row, nFeatures]];
                }
            }
            alglib.nearestneighbor.kdtreebuild(inputMatrix, nRows, inputMatrix.GetLength(1) - 1, 1, 2, kdTree);
        }
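In the scaling branch, offset = -mean and weight = 1/StandardDeviationPop, while the appended target column gets offset 0 and weight 1 and stays untouched. Assuming CreateScaledData applies (x + offset) * weight, this standardizes every feature to zero mean and unit variance before the kd-tree is built, as a quick check with illustrative numbers shows:

double mean = 5.0, stdDev = 2.0, x = 9.0;
double offset = -mean;                 // as above: Average() * -1
double weight = 1.0 / stdDev;          // as above: 1 / StandardDeviationPop()
double scaled = (x + offset) * weight; // (9 - 5) / 2 = 2.0 standard deviations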
Example #27
        private static IndexedDataTable <double> CoefficientGraph(double[,] coeff, double[] lambda, IEnumerable <string> allowedVars, IDataset ds, bool showOnlyRelevantBasisFuncs = true)
        {
            var coeffTable = new IndexedDataTable <double>("Coefficients", "The paths of standardized coefficient values over different lambda values");

            coeffTable.VisualProperties.YAxisMaximumAuto = false;
            coeffTable.VisualProperties.YAxisMinimumAuto = false;
            coeffTable.VisualProperties.XAxisMaximumAuto = false;
            coeffTable.VisualProperties.XAxisMinimumAuto = false;

            coeffTable.VisualProperties.XAxisLogScale    = true;
            coeffTable.VisualProperties.XAxisTitle       = "Lambda";
            coeffTable.VisualProperties.YAxisTitle       = "Coefficients";
            coeffTable.VisualProperties.SecondYAxisTitle = "Number of variables";

            var nLambdas         = lambda.Length;
            var nCoeff           = coeff.GetLength(1);
            var dataRows         = new IndexedDataRow <double> [nCoeff];
            var numNonZeroCoeffs = new int[nLambdas];

            var doubleVariables          = allowedVars.Where(ds.VariableHasType <double>);
            var factorVariableNames      = allowedVars.Where(ds.VariableHasType <string>);
            var factorVariablesAndValues = ds.GetFactorVariableValues(factorVariableNames, Enumerable.Range(0, ds.Rows)); //must consider all factor values (in train and test set)

            for (int i = 0; i < coeff.GetLength(0); i++)
            {
                for (int j = 0; j < coeff.GetLength(1); j++)
                {
                    if (!coeff[i, j].IsAlmost(0.0))
                    {
                        numNonZeroCoeffs[i]++;
                    }
                }
            }

            {
                int i = 0;
                foreach (var factorVariableAndValues in factorVariablesAndValues)
                {
                    foreach (var factorValue in factorVariableAndValues.Value)
                    {
                        double sigma = ds.GetStringValues(factorVariableAndValues.Key)
                                       .Select(s => s == factorValue ? 1.0 : 0.0)
                                       .StandardDeviation(); // calc std dev of binary indicator
                        var path = Enumerable.Range(0, nLambdas).Select(r => Tuple.Create(lambda[r], coeff[r, i] * sigma)).ToArray();
                        dataRows[i] = new IndexedDataRow <double>(factorVariableAndValues.Key + "=" + factorValue, factorVariableAndValues.Key + "=" + factorValue, path);
                        i++;
                    }
                }

                foreach (var doubleVariable in doubleVariables)
                {
                    double sigma = ds.GetDoubleValues(doubleVariable).StandardDeviation();
                    var    path  = Enumerable.Range(0, nLambdas).Select(r => Tuple.Create(lambda[r], coeff[r, i] * sigma)).ToArray();
                    dataRows[i] = new IndexedDataRow <double>(doubleVariable, doubleVariable, path);
                    i++;
                }

                // add rows to coeffTable ordered by total weight (larger area under the curve => more important)
                foreach (var r in dataRows.OrderByDescending(r => r.Values.Select(t => t.Item2).Sum(x => Math.Abs(x))))
                {
                    coeffTable.Rows.Add(r);
                }
            }

            if (lambda.Length > 2)
            {
                coeffTable.VisualProperties.XAxisMinimumFixedValue = Math.Pow(10, Math.Floor(Math.Log10(lambda.Last())));
                coeffTable.VisualProperties.XAxisMaximumFixedValue = Math.Pow(10, Math.Ceiling(Math.Log10(lambda.Skip(1).First())));
            }

            coeffTable.Rows.Add(new IndexedDataRow <double>("Number of variables", "The number of non-zero coefficients for each step in the path", lambda.Zip(numNonZeroCoeffs, (l, v) => Tuple.Create(l, (double)v))));
            coeffTable.Rows["Number of variables"].VisualProperties.ChartType   = DataRowVisualProperties.DataRowChartType.Points;
            coeffTable.Rows["Number of variables"].VisualProperties.SecondYAxis = true;

            return(coeffTable);
        }
        public static double OptimizeConstants(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, ISymbolicExpressionTree tree, IRegressionProblemData problemData, IEnumerable <int> rows, bool applyLinearScaling, int maxIterations, bool updateVariableWeights = true, double lowerEstimationLimit = double.MinValue, double upperEstimationLimit = double.MaxValue, bool updateConstantsInTree = true)
        {
            List <AutoDiff.Variable> variables     = new List <AutoDiff.Variable>();
            List <AutoDiff.Variable> parameters    = new List <AutoDiff.Variable>();
            List <string>            variableNames = new List <string>();

            AutoDiff.Term func;
            if (!TryTransformToAutoDiff(tree.Root.GetSubtree(0), variables, parameters, variableNames, updateVariableWeights, out func))
            {
                throw new NotSupportedException("Could not optimize constants of symbolic expression tree due to not supported symbols used in the tree.");
            }
            if (variableNames.Count == 0)
            {
                return(0.0);
            }

            AutoDiff.IParametricCompiledTerm compiledFunc = func.Compile(variables.ToArray(), parameters.ToArray());

            List <SymbolicExpressionTreeTerminalNode> terminalNodes = null;

            if (updateVariableWeights)
            {
                terminalNodes = tree.Root.IterateNodesPrefix().OfType <SymbolicExpressionTreeTerminalNode>().ToList();
            }
            else
            {
                terminalNodes = new List <SymbolicExpressionTreeTerminalNode>(tree.Root.IterateNodesPrefix().OfType <ConstantTreeNode>());
            }

            // extract initial constants
            double[] c = new double[variables.Count];
            {
                c[0] = 0.0;
                c[1] = 1.0;
                int i = 2;
                foreach (var node in terminalNodes)
                {
                    ConstantTreeNode constantTreeNode = node as ConstantTreeNode;
                    VariableTreeNode variableTreeNode = node as VariableTreeNode;
                    if (constantTreeNode != null)
                    {
                        c[i++] = constantTreeNode.Value;
                    }
                    else if (updateVariableWeights && variableTreeNode != null)
                    {
                        c[i++] = variableTreeNode.Weight;
                    }
                }
            }
            double[] originalConstants = (double[])c.Clone();
            double   originalQuality   = SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate(interpreter, tree, lowerEstimationLimit, upperEstimationLimit, problemData, rows, applyLinearScaling);

            alglib.lsfitstate  state;
            alglib.lsfitreport rep;
            int info;

            IDataset ds = problemData.Dataset;

            double[,] x = new double[rows.Count(), variableNames.Count];
            int row = 0;

            foreach (var r in rows)
            {
                for (int col = 0; col < variableNames.Count; col++)
                {
                    x[row, col] = ds.GetDoubleValue(variableNames[col], r);
                }
                row++;
            }
            double[] y = ds.GetDoubleValues(problemData.TargetVariable, rows).ToArray();
            int      n = x.GetLength(0);
            int      m = x.GetLength(1);
            int      k = c.Length;

            alglib.ndimensional_pfunc function_cx_1_func = CreatePFunc(compiledFunc);
            alglib.ndimensional_pgrad function_cx_1_grad = CreatePGrad(compiledFunc);

            try {
                alglib.lsfitcreatefg(x, y, c, n, m, k, false, out state);
                alglib.lsfitsetcond(state, 0.0, 0.0, maxIterations);
                //alglib.lsfitsetgradientcheck(state, 0.001);
                alglib.lsfitfit(state, function_cx_1_func, function_cx_1_grad, null, null);
                alglib.lsfitresults(state, out info, out c, out rep);
            }
            catch (ArithmeticException) {
                return(originalQuality);
            }
            catch (alglib.alglibexception) {
                return(originalQuality);
            }

            //info == -7  => constant optimization failed due to wrong gradient
            if (info != -7)
            {
                UpdateConstants(tree, c.Skip(2).ToArray(), updateVariableWeights);
            }
            var quality = SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate(interpreter, tree, lowerEstimationLimit, upperEstimationLimit, problemData, rows, applyLinearScaling);

            if (!updateConstantsInTree)
            {
                UpdateConstants(tree, originalConstants.Skip(2).ToArray(), updateVariableWeights);
            }
            if (originalQuality - quality > 0.001 || double.IsNaN(quality))
            {
                UpdateConstants(tree, originalConstants.Skip(2).ToArray(), updateVariableWeights);
                return(originalQuality);
            }
            return(quality);
        }
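Indices 0 and 1 of the parameter vector c are reserved for the linear-scaling offset and slope (initialized to 0 and 1); only c[2..] hold the actual tree constants, which is why UpdateConstants always receives c.Skip(2). The fitted constants are then kept only if the quality did not drop by more than 0.001 and is not NaN; otherwise the originals are restored. The layout convention spelled out with illustrative values, assuming using System.Linq;:

double[] c = { 0.0, 1.0, 3.14, -0.5 };        // [offset, slope, tree constant 1, tree constant 2]
double[] treeConstants = c.Skip(2).ToArray(); // the part UpdateConstants writes back into the tree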
Example #29
        private void CalculateModel(IDataset ds, IEnumerable <int> rows, bool scaleInputs = true)
        {
            this.trainingDataset = (IDataset)ds.Clone();
            this.trainingRows    = rows.ToArray();
            this.inputScaling    = scaleInputs ? new Scaling(ds, allowedInputVariables, rows) : null;

            x = GetData(ds, this.allowedInputVariables, this.trainingRows, this.inputScaling);

            IEnumerable <double> y;

            y = ds.GetDoubleValues(targetVariable, rows);

            int n = x.GetLength(0);

            // calculate cholesky decomposed (lower triangular) covariance matrix
            var cov = covarianceFunction.GetParameterizedCovarianceFunction(covarianceParameter, Enumerable.Range(0, x.GetLength(1)));

            this.l = CalculateL(x, cov, sqrSigmaNoise);

            // calculate mean
            var mean = meanFunction.GetParameterizedMeanFunction(meanParameter, Enumerable.Range(0, x.GetLength(1)));

            double[] m = Enumerable.Range(0, x.GetLength(0))
                         .Select(r => mean.Mean(x, r))
                         .ToArray();

            // calculate sum of diagonal elements for likelihood
            double diagSum = Enumerable.Range(0, n).Select(i => Math.Log(l[i, i])).Sum();

            // solve for alpha
            double[] ym = y.Zip(m, (a, b) => a - b).ToArray();

            int info;

            alglib.densesolverreport denseSolveRep;

            alglib.spdmatrixcholeskysolve(l, n, false, ym, out info, out denseSolveRep, out alpha);
            for (int i = 0; i < alpha.Length; i++)
            {
                alpha[i] = alpha[i] / sqrSigmaNoise;
            }
            negativeLogLikelihood = 0.5 * Util.ScalarProd(ym, alpha) + diagSum + (n / 2.0) * Math.Log(2.0 * Math.PI * sqrSigmaNoise);

            // derivatives
            int nAllowedVariables = x.GetLength(1);

            alglib.matinvreport matInvRep;
            double[,] lCopy = new double[l.GetLength(0), l.GetLength(1)];
            Array.Copy(l, lCopy, lCopy.Length);

            alglib.spdmatrixcholeskyinverse(ref lCopy, n, false, out info, out matInvRep);
            if (info != 1)
            {
                throw new ArgumentException("Can't invert matrix to calculate gradients.");
            }
            for (int i = 0; i < n; i++)
            {
                for (int j = 0; j <= i; j++)
                {
                    lCopy[i, j] = lCopy[i, j] / sqrSigmaNoise - alpha[i] * alpha[j];
                }
            }

            double noiseGradient = sqrSigmaNoise * Enumerable.Range(0, n).Select(i => lCopy[i, i]).Sum();

            double[] meanGradients = new double[meanFunction.GetNumberOfParameters(nAllowedVariables)];
            for (int k = 0; k < meanGradients.Length; k++)
            {
                var meanGrad = Enumerable.Range(0, alpha.Length)
                               .Select(r => mean.Gradient(x, r, k));
                meanGradients[k] = -Util.ScalarProd(meanGrad, alpha);
            }

            double[] covGradients = new double[covarianceFunction.GetNumberOfParameters(nAllowedVariables)];
            if (covGradients.Length > 0)
            {
                for (int i = 0; i < n; i++)
                {
                    for (int j = 0; j < i; j++)
                    {
                        var g = cov.CovarianceGradient(x, i, j).ToArray();
                        for (int k = 0; k < covGradients.Length; k++)
                        {
                            covGradients[k] += lCopy[i, j] * g[k];
                        }
                    }

                    var gDiag = cov.CovarianceGradient(x, i, i).ToArray();
                    for (int k = 0; k < covGradients.Length; k++)
                    {
                        // diag
                        covGradients[k] += 0.5 * lCopy[i, i] * gDiag[k];
                    }
                }
            }

            hyperparameterGradients =
                meanGradients
                .Concat(covGradients)
                .Concat(new double[] { noiseGradient }).ToArray();
        }
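For reference, the likelihood line above matches the standard Gaussian-process negative log marginal likelihood. Assuming CalculateL returns the Cholesky factor L of C = (K + sigma^2 * I) / sigma^2, the code's alpha equals (K + sigma^2 * I)^-1 * ym, and

    -log p(y|X) = 0.5 * ym' * alpha + 0.5 * log|K + sigma^2 * I| + (n/2) * log(2*pi)
                = 0.5 * Util.ScalarProd(ym, alpha) + sum_i log L[i,i] + (n/2) * log(2*pi*sigma^2)

since log|K + sigma^2 * I| = 2 * sum_i log L[i,i] + n * log sigma^2; the middle sum is exactly diagSum.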
Example #32
    public double[,] Reduce(IDataset dataset, IEnumerable<int> rows) {
      var data = AlglibUtil.PrepareInputMatrix(dataset, allowedInputVariables, rows);

      var targets = dataset.GetDoubleValues(targetVariable, rows).ToArray();
      var result = new double[data.GetLength(0), transformationMatrix.GetLength(1) + 1];
      for (int i = 0; i < data.GetLength(0); i++)
        for (int j = 0; j < data.GetLength(1); j++) {
          for (int x = 0; x < transformationMatrix.GetLength(1); x++) {
            result[i, x] += data[i, j] * transformationMatrix[j, x];
          }
          result[i, transformationMatrix.GetLength(1)] = targets[i];
        }
      return result;
    }
Example #33
 public static IDataset ReduceDataset(IDataset data, IReadOnlyList <int> rows, IReadOnlyList <string> inputVariables, string target)
 {
     return(new Dataset(inputVariables.Concat(new[] { target }), inputVariables.Concat(new[] { target }).Select(x => data.GetDoubleValues(x, rows).ToList())));
 }
        public static double OptimizeConstants(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter,
                                               ISymbolicExpressionTree tree, IRegressionProblemData problemData, IEnumerable <int> rows, bool applyLinearScaling,
                                               int maxIterations, bool updateVariableWeights = true,
                                               double lowerEstimationLimit = double.MinValue, double upperEstimationLimit              = double.MaxValue,
                                               bool updateConstantsInTree  = true, Action <double[], double, object> iterationCallback = null, EvaluationsCounter counter = null)
        {
            // numeric constants in the tree become variables for constant opt
            // variables in the tree become parameters (fixed values) for constant opt
            // for each parameter (variable in the original tree) we store the
            // variable name, variable value (for factor vars) and lag as a DataForVariable object.
            // A dictionary is used to find parameters
            double[] initialConstants;
            var      parameters = new List <TreeToAutoDiffTermConverter.DataForVariable>();

            TreeToAutoDiffTermConverter.ParametricFunction         func;
            TreeToAutoDiffTermConverter.ParametricFunctionGradient func_grad;
            if (!TreeToAutoDiffTermConverter.TryConvertToAutoDiff(tree, updateVariableWeights, applyLinearScaling, out parameters, out initialConstants, out func, out func_grad))
            {
                throw new NotSupportedException("Could not optimize constants of symbolic expression tree due to not supported symbols used in the tree.");
            }
            if (parameters.Count == 0)
            {
                return(0.0);                             // gkronber: constant expressions always have an R² of 0.0
            }
            var parameterEntries = parameters.ToArray(); // order of entries must be the same for x

            // extract initial constants
            double[] c;
            if (applyLinearScaling)
            {
                c    = new double[initialConstants.Length + 2];
                c[0] = 0.0;
                c[1] = 1.0;
                Array.Copy(initialConstants, 0, c, 2, initialConstants.Length);
            }
            else
            {
                c = (double[])initialConstants.Clone();
            }

            double originalQuality = SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate(interpreter, tree, lowerEstimationLimit, upperEstimationLimit, problemData, rows, applyLinearScaling);

            if (counter == null)
            {
                counter = new EvaluationsCounter();
            }
            var rowEvaluationsCounter = new EvaluationsCounter();

            alglib.lsfitstate  state;
            alglib.lsfitreport rep;
            int retVal;

            IDataset ds = problemData.Dataset;

            double[,] x = new double[rows.Count(), parameters.Count];
            int row = 0;

            foreach (var r in rows)
            {
                int col = 0;
                foreach (var info in parameterEntries)
                {
                    if (ds.VariableHasType <double>(info.variableName))
                    {
                        x[row, col] = ds.GetDoubleValue(info.variableName, r + info.lag);
                    }
                    else if (ds.VariableHasType <string>(info.variableName))
                    {
                        x[row, col] = ds.GetStringValue(info.variableName, r) == info.variableValue ? 1 : 0;
                    }
                    else
                    {
                        throw new InvalidProgramException("found a variable of unknown type");
                    }
                    col++;
                }
                row++;
            }
            double[] y = ds.GetDoubleValues(problemData.TargetVariable, rows).ToArray();
            int      n = x.GetLength(0);
            int      m = x.GetLength(1);
            int      k = c.Length;

            alglib.ndimensional_pfunc function_cx_1_func = CreatePFunc(func);
            alglib.ndimensional_pgrad function_cx_1_grad = CreatePGrad(func_grad);
            alglib.ndimensional_rep   xrep = (p, f, obj) => iterationCallback(p, f, obj);

            try {
                alglib.lsfitcreatefg(x, y, c, n, m, k, false, out state);
                alglib.lsfitsetcond(state, 0.0, 0.0, maxIterations);
                alglib.lsfitsetxrep(state, iterationCallback != null);
                //alglib.lsfitsetgradientcheck(state, 0.001);
                alglib.lsfitfit(state, function_cx_1_func, function_cx_1_grad, xrep, rowEvaluationsCounter);
                alglib.lsfitresults(state, out retVal, out c, out rep);
            } catch (ArithmeticException) {
                return(originalQuality);
            } catch (alglib.alglibexception) {
                return(originalQuality);
            }

            counter.FunctionEvaluations += rowEvaluationsCounter.FunctionEvaluations / n;
            counter.GradientEvaluations += rowEvaluationsCounter.GradientEvaluations / n;

            //retVal == -7  => constant optimization failed due to wrong gradient
            if (retVal != -7)
            {
                if (applyLinearScaling)
                {
                    var tmp = new double[c.Length - 2];
                    Array.Copy(c, 2, tmp, 0, tmp.Length);
                    UpdateConstants(tree, tmp, updateVariableWeights);
                }
                else
                {
                    UpdateConstants(tree, c, updateVariableWeights);
                }
            }
            var quality = SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate(interpreter, tree, lowerEstimationLimit, upperEstimationLimit, problemData, rows, applyLinearScaling);

            if (!updateConstantsInTree)
            {
                UpdateConstants(tree, initialConstants, updateVariableWeights);
            }

            if (originalQuality - quality > 0.001 || double.IsNaN(quality))
            {
                UpdateConstants(tree, initialConstants, updateVariableWeights);
                return(originalQuality);
            }
            return(quality);
        }
Example #35
        public static KernelRidgeRegressionModel Create(IDataset dataset, string targetVariable, IEnumerable <string> allowedInputVariables, IEnumerable <int> rows,
                                                        bool scaleInputs, ICovarianceFunction kernel, double lambda = 0.1)
        {
            var trainingRows = rows.ToArray();
            var model        = new KernelRidgeRegressionModel(dataset, targetVariable, allowedInputVariables, trainingRows, scaleInputs, kernel, lambda);

            try {
                int info;
                int n = model.trainX.GetLength(0);
                alglib.densesolverreport denseSolveRep;
                var gram = BuildGramMatrix(model.trainX, lambda, kernel);
                var l    = new double[n, n];
                Array.Copy(gram, l, l.Length);

                double[] alpha = new double[n];
                double[,] invG;
                var y = dataset.GetDoubleValues(targetVariable, trainingRows).ToArray();
                for (int i = 0; i < y.Length; i++)
                {
                    y[i] -= model.yOffset;
                    y[i] *= model.yScale;
                }
                // cholesky decomposition
                var res = alglib.trfac.spdmatrixcholesky(ref l, n, false);
                if (res == false) // try LU decomposition if the Cholesky decomposition failed
                {
                    int[] pivots;
                    var   lua = new double[n, n];
                    Array.Copy(gram, lua, lua.Length);
                    alglib.rmatrixlu(ref lua, n, n, out pivots);
                    alglib.rmatrixlusolve(lua, pivots, n, y, out info, out denseSolveRep, out alpha);
                    if (info != 1)
                    {
                        throw new ArgumentException("Could not create model.");
                    }
                    alglib.matinvreport rep;
                    invG = lua; // rename
                    alglib.rmatrixluinverse(ref invG, pivots, n, out info, out rep);
                }
                else
                {
                    alglib.spdmatrixcholeskysolve(l, n, false, y, out info, out denseSolveRep, out alpha);
                    if (info != 1)
                    {
                        throw new ArgumentException("Could not create model.");
                    }
                    // for LOO-CV we need to build the inverse of the gram matrix
                    alglib.matinvreport rep;
                    invG = l; // rename
                    alglib.spdmatrixcholeskyinverse(ref invG, n, false, out info, out rep);
                }
                if (info != 1)
                {
                    throw new ArgumentException("Could not invert Gram matrix.");
                }

                var ssqLooError = 0.0;
                for (int i = 0; i < n; i++)
                {
                    var pred_i    = Util.ScalarProd(Util.GetRow(gram, i).ToArray(), alpha);
                    var looPred_i = pred_i - alpha[i] / invG[i, i];
                    var error     = (y[i] - looPred_i) / model.yScale;
                    ssqLooError += error * error;
                }

                Array.Copy(alpha, model.alpha, n);
                model.LooCvRMSE = Math.Sqrt(ssqLooError / n);
            } catch (alglib.alglibexception ae) {
                // wrap exception so that calling code doesn't have to know about alglib implementation
                throw new ArgumentException("There was a problem in the calculation of the kernel ridge regression model", ae);
            }
            return(model);
        }
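The loop before Array.Copy uses the closed-form leave-one-out identity for (kernel) ridge regression: with alpha = G^-1 * y and invG = G^-1, the prediction for row i when that row is held out is looPred_i = pred_i - alpha[i] / invG[i,i], so the LOO-CV RMSE comes out of a single fit instead of n refits. As a tiny hypothetical helper (not part of the model's API):

static double LeaveOneOutPrediction(double prediction, double alphaI, double invGii)
{
    return prediction - alphaI / invGii; // exact LOO prediction for ridge-type models
}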
Example #36
 private IDataset ReduceDataset(IDataset data, IReadOnlyList <int> rows)
 {
     return(new Dataset(data.DoubleVariables, data.DoubleVariables.Select(v => data.GetDoubleValues(v, rows).ToList())));
 }
Example #37
    private void CalculateModel(IDataset ds, IEnumerable<int> rows, bool scaleInputs = true) {
      this.trainingDataset = (IDataset)ds.Clone();
      this.trainingRows = rows.ToArray();
      this.inputScaling = scaleInputs ? new Scaling(ds, allowedInputVariables, rows) : null;

      x = GetData(ds, this.allowedInputVariables, this.trainingRows, this.inputScaling);

      IEnumerable<double> y;
      y = ds.GetDoubleValues(TargetVariable, rows);

      int n = x.GetLength(0);

      var columns = Enumerable.Range(0, x.GetLength(1)).ToArray();
      // calculate cholesky decomposed (lower triangular) covariance matrix
      var cov = covarianceFunction.GetParameterizedCovarianceFunction(covarianceParameter, columns);
      this.l = CalculateL(x, cov, sqrSigmaNoise);

      // calculate mean
      var mean = meanFunction.GetParameterizedMeanFunction(meanParameter, columns);
      double[] m = Enumerable.Range(0, x.GetLength(0))
        .Select(r => mean.Mean(x, r))
        .ToArray();

      // calculate sum of diagonal elements for likelihood
      double diagSum = Enumerable.Range(0, n).Select(i => Math.Log(l[i, i])).Sum();

      // solve for alpha
      double[] ym = y.Zip(m, (a, b) => a - b).ToArray();

      int info;
      alglib.densesolverreport denseSolveRep;

      alglib.spdmatrixcholeskysolve(l, n, false, ym, out info, out denseSolveRep, out alpha);
      for (int i = 0; i < alpha.Length; i++)
        alpha[i] = alpha[i] / sqrSigmaNoise;
      negativeLogLikelihood = 0.5 * Util.ScalarProd(ym, alpha) + diagSum + (n / 2.0) * Math.Log(2.0 * Math.PI * sqrSigmaNoise);

      // derivatives
      int nAllowedVariables = x.GetLength(1);

      alglib.matinvreport matInvRep;
      double[,] lCopy = new double[l.GetLength(0), l.GetLength(1)];
      Array.Copy(l, lCopy, lCopy.Length);

      alglib.spdmatrixcholeskyinverse(ref lCopy, n, false, out info, out matInvRep);
      if (info != 1) throw new ArgumentException("Can't invert matrix to calculate gradients.");
      for (int i = 0; i < n; i++) {
        for (int j = 0; j <= i; j++)
          lCopy[i, j] = lCopy[i, j] / sqrSigmaNoise - alpha[i] * alpha[j];
      }

      double noiseGradient = sqrSigmaNoise * Enumerable.Range(0, n).Select(i => lCopy[i, i]).Sum();

      double[] meanGradients = new double[meanFunction.GetNumberOfParameters(nAllowedVariables)];
      for (int k = 0; k < meanGradients.Length; k++) {
        var meanGrad = new double[alpha.Length];
        for (int g = 0; g < meanGrad.Length; g++)
          meanGrad[g] = mean.Gradient(x, g, k);
        meanGradients[k] = -Util.ScalarProd(meanGrad, alpha);
      }

      double[] covGradients = new double[covarianceFunction.GetNumberOfParameters(nAllowedVariables)];
      if (covGradients.Length > 0) {
        for (int i = 0; i < n; i++) {
          for (int j = 0; j < i; j++) {
            var g = cov.CovarianceGradient(x, i, j);
            for (int k = 0; k < covGradients.Length; k++) {
              covGradients[k] += lCopy[i, j] * g[k];
            }
          }

          var gDiag = cov.CovarianceGradient(x, i, i);
          for (int k = 0; k < covGradients.Length; k++) {
            // diag
            covGradients[k] += 0.5 * lCopy[i, i] * gDiag[k];
          }
        }
      }

      hyperparameterGradients =
        meanGradients
        .Concat(covGradients)
        .Concat(new double[] { noiseGradient }).ToArray();

    }