/// <summary>
/// Copies the dataset of <paramref name="problemData"/> into <paramref name="datasetWorksheet"/>.
/// Worksheet row 1 receives the variable names; dataset row <c>r</c> is written to worksheet row <c>r + 2</c>.
/// </summary>
/// <param name="datasetWorksheet">Worksheet whose cells are filled.</param>
/// <param name="problemData">Problem data providing the dataset to export.</param>
protected void WriteDatasetToExcel(ExcelWorksheet datasetWorksheet, IDataAnalysisProblemData problemData) {
  // Remark: the performance of EPPlus drops dramatically if the data is not written
  // row wise (from left to right) due to the internal indices used.
  IDataset dataset = problemData.Dataset;
  var columnNames = dataset.VariableNames.ToList();
  var numericNames = new HashSet<string>(dataset.DoubleVariables);
  int columnCount = columnNames.Count;

  // Header row with the variable names.
  for (int c = 0; c < columnCount; c++) {
    datasetWorksheet.Cells[1, c + 1].Value = columnNames[c];
  }

  // Data rows, written left to right, top to bottom.
  for (int r = 0; r < dataset.Rows; r++) {
    int sheetRow = r + 2;
    for (int c = 0; c < columnCount; c++) {
      string name = columnNames[c];
      if (numericNames.Contains(name)) {
        // Double variables are written as numbers.
        datasetWorksheet.Cells[sheetRow, c + 1].Value = dataset.GetDoubleValue(name, r);
      } else {
        // Every other variable is written as whatever GetValue returns for the cell.
        datasetWorksheet.Cells[sheetRow, c + 1].Value = dataset.GetValue(r, c);
      }
    }
  }
}
/// <summary>
/// Maps component values read from <paramref name="dataset"/> back to the variables in <c>VariableNames</c>:
/// each output value is the sum over all components of <c>componentValue * Matrix[variable, component]</c>,
/// then multiplied by <c>Deviations[variable]</c> and shifted by <c>Means[variable]</c>.
/// </summary>
/// <param name="dataset">Dataset containing a double column for every entry of <c>ComponentNames</c>.</param>
/// <param name="rows">Dataset rows to revert; the result keeps their order.</param>
/// <returns>A matrix with one row per requested row and one column per entry of <c>VariableNames</c>.</returns>
public double[,] RevertData(IDataset dataset, IEnumerable<int> rows) {
  var rowIndices = rows.ToArray();
  var componentNames = ComponentNames;
  int variableCount = VariableNames.Length;
  var reverted = new double[rowIndices.Length, variableCount];

  for (int r = 0; r < rowIndices.Length; r++) {
    int datasetRow = rowIndices[r];

    // Accumulate the contribution of every component to every variable.
    for (int c = 0; c < componentNames.Length; c++) {
      double componentValue = dataset.GetDoubleValue(componentNames[c], datasetRow);
      for (int v = 0; v < variableCount; v++) {
        reverted[r, v] += componentValue * Matrix[v, c];
      }
    }

    // Rescale and shift the accumulated values.
    for (int v = 0; v < variableCount; v++) {
      reverted[r, v] *= Deviations[v];
      reverted[r, v] += Means[v];
    }
  }

  return reverted;
}
/// <summary>
/// Assigns every row to the center with the smallest squared Euclidean distance.
/// </summary>
/// <param name="centers">Candidate centers; each must have one entry per allowed input variable.</param>
/// <param name="dataset">Dataset the row values are read from.</param>
/// <param name="allowedInputVariables">Names of the double variables that span the distance space.</param>
/// <param name="rows">Dataset rows to assign.</param>
/// <returns>
/// For each row (in the order of <paramref name="rows"/>) the 1-based index of the closest center,
/// or 0 when no center produced a smaller distance than the initial <see cref="double.MaxValue"/>
/// (for example when <paramref name="centers"/> is empty).
/// </returns>
/// <exception cref="ArgumentException">A center's length differs from the number of allowed input variables.</exception>
public static IEnumerable<int> FindClosestCenters(IEnumerable<double[]> centers, IDataset dataset, IEnumerable<string> allowedInputVariables, IEnumerable<int> rows) {
  // Materialize once: both sequences are walked again for every center, and the
  // result arrays are sized from them, so they must not change between passes.
  int[] rowIndices = rows.ToArray();
  string[] inputVariables = allowedInputVariables.ToArray();
  int nRows = rowIndices.Length;
  int nCols = inputVariables.Length;

  int[] closestCenter = new int[nRows];
  double[] bestCenterDistance = Enumerable.Repeat(double.MaxValue, nRows).ToArray();

  int centerIndex = 1; // reported center indices are 1-based
  foreach (double[] center in centers) {
    if (nCols != center.Length) {
      throw new ArgumentException("The length of each center must match the number of allowed input variables.", "centers");
    }

    for (int rowIndex = 0; rowIndex < nRows; rowIndex++) {
      // Squared Euclidean distance of the point to the center.
      double centerDistance = 0;
      for (int col = 0; col < nCols; col++) {
        double d = center[col] - dataset.GetDoubleValue(inputVariables[col], rowIndices[rowIndex]);
        centerDistance += d * d;
        // Already worse than the best known center: no need to finish the sum.
        if (centerDistance > bestCenterDistance[rowIndex]) {
          break;
        }
      }

      if (centerDistance < bestCenterDistance[rowIndex]) {
        bestCenterDistance[rowIndex] = centerDistance;
        closestCenter[rowIndex] = centerIndex;
      }
    }

    centerIndex++;
  }

  return closestCenter;
}
/// <summary>
/// Returns the estimate for a single row. Inner nodes delegate to <c>Left</c> when the row's
/// <c>SplitAttribute</c> value is less than or equal to <c>SplitValue</c>, otherwise to <c>Right</c>;
/// leaves ask their <c>Model</c>.
/// </summary>
/// <param name="dataset">Dataset the row belongs to.</param>
/// <param name="row">Index of the row to estimate.</param>
/// <exception cref="NotSupportedException">This node is a leaf without a model.</exception>
private double GetEstimatedValue(IDataset dataset, int row) {
  if (IsLeaf) {
    if (Model == null) {
      throw new NotSupportedException("The model has not been built correctly");
    }
    return Model.GetEstimatedValues(dataset, new[] { row }).First();
  }

  // Inner node: follow the branch selected by the split condition.
  var child = dataset.GetDoubleValue(SplitAttribute, row) <= SplitValue ? Left : Right;
  return child.GetEstimatedValue(dataset, row);
}
/// <summary>
/// Computes the sum of squared Euclidean distances of all points to the mean of the cluster
/// that <paramref name="model"/> assigns them to.
/// </summary>
/// <param name="model">Clustering model providing the cluster value of each row and the allowed input variables.</param>
/// <param name="dataset">Dataset the point coordinates are read from.</param>
/// <param name="rows">Dataset rows to include.</param>
/// <returns>The intra-cluster sum of squares over all clusters; 0 when <paramref name="rows"/> is empty.</returns>
public static double CalculateIntraClusterSumOfSquares(KMeansClusteringModel model, IDataset dataset, IEnumerable<int> rows) {
  // Materialize once: cluster values and points are paired by position, so the
  // sequence handed to the model and the one iterated here must be identical.
  int[] rowIndices = rows.ToArray();
  List<int> clusterValues = model.GetClusterValues(dataset, rowIndices).ToList();
  List<string> allowedInputVariables = model.AllowedInputVariables.ToList();
  int nCols = allowedInputVariables.Count;

  // Collect the points of each cluster.
  var clusterPoints = new Dictionary<int, List<double[]>>();
  for (int r = 0; r < rowIndices.Length; r++) {
    double[] p = new double[nCols];
    for (int i = 0; i < nCols; i++) {
      p[i] = dataset.GetDoubleValue(allowedInputVariables[i], rowIndices[r]);
    }

    int clusterValue = clusterValues[r];
    List<double[]> points;
    if (!clusterPoints.TryGetValue(clusterValue, out points)) {
      points = new List<double[]>();
      clusterPoints.Add(clusterValue, points);
    }
    points.Add(p);
  }

  double allCenterDistances = 0;
  foreach (List<double[]> points in clusterPoints.Values) {
    // Mean of the cluster.
    double[] mean = new double[nCols];
    foreach (double[] p in points) {
      for (int i = 0; i < nCols; i++) {
        mean[i] += p[i];
      }
    }
    for (int i = 0; i < nCols; i++) {
      mean[i] /= points.Count;
    }

    // Squared distances of the cluster's points to that mean.
    double centerDistances = 0;
    foreach (double[] clusterPoint in points) {
      double centerDistance = 0;
      for (int i = 0; i < nCols; i++) {
        double d = mean[i] - clusterPoint[i];
        centerDistance += d * d;
      }
      centerDistances += centerDistance;
    }
    allCenterDistances += centerDistances;
  }

  return allCenterDistances;
}
/// <summary>
/// Transforms <paramref name="dataset"/> into a data structure as needed by libSVM.
/// </summary>
/// <param name="dataset">The source dataset</param>
/// <param name="targetVariable">The target variable; when null or empty a zero vector is used as target.</param>
/// <param name="inputVariables">The selected input variables to include in the svm_problem.</param>
/// <param name="rowIndices">The rows of the dataset that should be contained in the resulting SVM-problem</param>
/// <returns>A problem data type that can be used to train a support vector machine.</returns>
public static svm_problem CreateSvmProblem(IDataset dataset, string targetVariable, IEnumerable<string> inputVariables, IEnumerable<int> rowIndices) {
  // Materialize once: the rows are needed for sizing, for the target vector and for the
  // node matrix, and all three must agree.
  int[] rows = rowIndices.ToArray();
  int nRows = rows.Length;

  // If the target variable is not set (e.g. for prediction of a trained model) we just use a zero vector.
  double[] targetVector = string.IsNullOrEmpty(targetVariable)
    ? new double[nRows]
    : dataset.GetDoubleValues(targetVariable, rows).ToArray();

  List<string> inputVariablesList = inputVariables.ToList();
  svm_node[][] nodes = new svm_node[nRows][];

  for (int svmProblemRowIndex = 0; svmProblemRowIndex < nRows; svmProblemRowIndex++) {
    int row = rows[svmProblemRowIndex];
    List<svm_node> tempRow = new List<svm_node>(inputVariablesList.Count);
    int colIndex = 1; // make sure the smallest node index for SVM = 1
    foreach (var inputVariable in inputVariablesList) {
      double value = dataset.GetDoubleValue(inputVariable, row);
      // SVM also works with missing values
      // => don't add NaN values in the dataset to the sparse SVM matrix representation
      if (!double.IsNaN(value)) {
        // Nodes must be sorted in ascending order by column index; appending while
        // colIndex only increases keeps them sorted.
        tempRow.Add(new svm_node() { index = colIndex, value = value });
      }
      colIndex++;
    }
    nodes[svmProblemRowIndex] = tempRow.ToArray();
  }

  return new svm_problem { l = targetVector.Length, y = targetVector, x = nodes };
}
/// <summary>
/// Standardizes the <c>VariableNames</c> columns of the requested rows with <c>Means</c> and
/// <c>Deviations</c> and multiplies the standardized row vector with <c>Matrix</c>.
/// </summary>
/// <param name="dataset">Dataset containing a double column for every entry of <c>VariableNames</c>.</param>
/// <param name="rows">Dataset rows to transform; the result keeps their order.</param>
/// <returns>A matrix with one row per requested row and one column per entry of <c>VariableNames</c>.</returns>
public double[,] TransformData(IDataset dataset, IEnumerable<int> rows) {
  var rowIndices = rows.ToArray();
  int dimension = VariableNames.Length;
  var transformed = new double[rowIndices.Length, dimension];

  int outputRow = 0;
  foreach (int datasetRow in rowIndices) {
    for (int source = 0; source < dimension; source++) {
      double raw = dataset.GetDoubleValue(VariableNames[source], datasetRow);
      double standardized = (raw - Means[source]) / Deviations[source];
      // Distribute the standardized value over all output columns.
      for (int target = 0; target < dimension; target++) {
        transformed[outputRow, target] += standardized * Matrix[source, target];
      }
    }
    outputRow++;
  }

  return transformed;
}
/// <summary>
/// Builds a PCA basis with <c>alglib.pcabuildbasis</c> from the given rows and variables and
/// returns the rows multiplied with that basis.
/// </summary>
/// <param name="dataset">Dataset the values are read from.</param>
/// <param name="rows">Dataset rows to use; the result keeps their order.</param>
/// <param name="variables">Names of the double variables to use.</param>
/// <returns>A matrix with one row per requested row and one column per column of the basis matrix.</returns>
private static double[,] PCAReduce(IDataset dataset, IEnumerable<int> rows, IEnumerable<string> variables) {
  var rowIndices = rows.ToArray();
  var variableNames = variables.ToArray();
  int nRows = rowIndices.Length;
  int nVars = variableNames.Length;

  // Column-wise copy of the data. NOTE(review): one extra column is allocated and never
  // written; presumably unnecessary since only nVars columns are announced below - confirm.
  var observations = new double[nRows, nVars + 1];
  for (int col = 0; col < nVars; col++) {
    int rowPos = 0;
    foreach (var value in dataset.GetDoubleValues(variableNames[col], rowIndices)) {
      observations[rowPos++, col] = value;
    }
  }

  // NOTE(review): the info code returned by alglib is not inspected.
  int info;
  double[] variances;
  double[,] basis;
  alglib.pcabuildbasis(observations, nRows, nVars, out info, out variances, out basis);

  // Multiply the raw (as read from the dataset) values with the basis.
  int nComponents = basis.GetLength(1);
  var projected = new double[nRows, nComponents];
  for (int r = 0; r < nRows; r++) {
    for (int v = 0; v < nVars; v++) {
      double value = dataset.GetDoubleValue(variableNames[v], rowIndices[r]);
      for (int c = 0; c < nComponents; c++) {
        projected[r, c] += value * basis[v, c];
      }
    }
  }

  return projected;
}
/// <summary>
/// Determines for every row which center is closest in terms of squared Euclidean distance.
/// </summary>
/// <param name="centers">Candidate centers; each must have one entry per allowed input variable.</param>
/// <param name="dataset">Dataset the row values are read from.</param>
/// <param name="allowedInputVariables">Names of the double variables that span the distance space.</param>
/// <param name="rows">Dataset rows to assign.</param>
/// <returns>
/// One entry per row, in the order of <paramref name="rows"/>: the 1-based index of the closest
/// center, or 0 when no center produced a distance below the initial <see cref="double.MaxValue"/>.
/// </returns>
/// <exception cref="ArgumentException">A center's length differs from the number of allowed input variables.</exception>
public static IEnumerable<int> FindClosestCenters(IEnumerable<double[]> centers, IDataset dataset, IEnumerable<string> allowedInputVariables, IEnumerable<int> rows) {
  // Both sequences are walked once per center and the bookkeeping arrays are sized from
  // them, so take a snapshot instead of re-enumerating the caller's sequences.
  int[] rowIndices = rows.ToArray();
  string[] inputVariables = allowedInputVariables.ToArray();
  int nRows = rowIndices.Length;
  int nCols = inputVariables.Length;

  int[] closestCenter = new int[nRows];
  double[] bestCenterDistance = Enumerable.Repeat(double.MaxValue, nRows).ToArray();

  int centerIndex = 1; // center indices in the result start at 1
  foreach (double[] center in centers) {
    if (nCols != center.Length) {
      throw new ArgumentException("The length of each center must match the number of allowed input variables.", "centers");
    }

    for (int rowIndex = 0; rowIndex < nRows; rowIndex++) {
      int row = rowIndices[rowIndex];

      // calc squared euclidian distance of point to center
      double centerDistance = 0;
      for (int col = 0; col < nCols; col++) {
        double d = center[col] - dataset.GetDoubleValue(inputVariables[col], row);
        d = d * d; // square
        centerDistance += d;
        if (centerDistance > bestCenterDistance[rowIndex]) {
          break; // cannot beat the best center found so far
        }
      }

      if (centerDistance < bestCenterDistance[rowIndex]) {
        bestCenterDistance[rowIndex] = centerDistance;
        closestCenter[rowIndex] = centerIndex;
      }
    }

    centerIndex++;
  }

  return closestCenter;
}
/// <summary>
/// Checks the lag, integral, derivative and time-lag symbols of <paramref name="interpreter"/>
/// against expected values computed directly from columns "A" and "B" of <paramref name="ds"/>.
/// </summary>
/// <param name="interpreter">The interpreter under test.</param>
/// <param name="ds">Dataset with double columns "A" and "B"; rows 0 to 5 are read.</param>
/// <remarks>
/// Each call passes the expression, a row index and the expected result to <c>Evaluate</c>
/// (presumably the row at which the expression is evaluated - confirm against <c>Evaluate</c>).
/// </remarks>
private void EvaluateLaggedOperations(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, IDataset ds) {
  // lag
  Evaluate(interpreter, ds, "(lagVariable 1.0 a -1) ", 1, ds.GetDoubleValue("A", 0));
  Evaluate(interpreter, ds, "(lagVariable 1.0 a -1) ", 2, ds.GetDoubleValue("A", 1));
  Evaluate(interpreter, ds, "(lagVariable 1.0 a 0) ", 2, ds.GetDoubleValue("A", 2));
  Evaluate(interpreter, ds, "(lagVariable 1.0 a 1) ", 0, ds.GetDoubleValue("A", 1));

  // integral
  Evaluate(interpreter, ds, "(integral -1.0 (variable 1.0 a)) ", 1, ds.GetDoubleValue("A", 0) + ds.GetDoubleValue("A", 1));
  Evaluate(interpreter, ds, "(integral -1.0 (lagVariable 1.0 a 1)) ", 1, ds.GetDoubleValue("A", 1) + ds.GetDoubleValue("A", 2));
  Evaluate(interpreter, ds, "(integral -2.0 (variable 1.0 a)) ", 2, ds.GetDoubleValue("A", 0) + ds.GetDoubleValue("A", 1) + ds.GetDoubleValue("A", 2));
  Evaluate(interpreter, ds, "(integral -1.0 (* (variable 1.0 a) (variable 1.0 b)))", 1,
    ds.GetDoubleValue("A", 0) * ds.GetDoubleValue("B", 0) + ds.GetDoubleValue("A", 1) * ds.GetDoubleValue("B", 1));
  Evaluate(interpreter, ds, "(integral -2.0 3.0)", 1, 9.0);

  // derivative
  // (f_0 + 2 * f_1 - 2 * f_3 - f_4) / 8; // h = 1
  Evaluate(interpreter, ds, "(diff (variable 1.0 a)) ", 5,
    (ds.GetDoubleValue("A", 5) + 2 * ds.GetDoubleValue("A", 4) - 2 * ds.GetDoubleValue("A", 2) - ds.GetDoubleValue("A", 1)) / 8.0);
  Evaluate(interpreter, ds, "(diff (variable 1.0 b)) ", 5,
    (ds.GetDoubleValue("B", 5) + 2 * ds.GetDoubleValue("B", 4) - 2 * ds.GetDoubleValue("B", 2) - ds.GetDoubleValue("B", 1)) / 8.0);
  Evaluate(interpreter, ds, "(diff (* (variable 1.0 a) (variable 1.0 b)))", 5,
    (ds.GetDoubleValue("A", 5) * ds.GetDoubleValue("B", 5)
     + 2 * ds.GetDoubleValue("A", 4) * ds.GetDoubleValue("B", 4)
     - 2 * ds.GetDoubleValue("A", 2) * ds.GetDoubleValue("B", 2)
     - ds.GetDoubleValue("A", 1) * ds.GetDoubleValue("B", 1)) / 8.0);
  Evaluate(interpreter, ds, "(diff -2.0 3.0)", 5, 0.0);

  // timelag
  // The expression literal below must stay on a single line: a regular string literal
  // cannot contain a line break.
  Evaluate(interpreter, ds, "(lag -1.0 (lagVariable 1.0 a 2)) ", 1, ds.GetDoubleValue("A", 2));
  Evaluate(interpreter, ds, "(lag -2.0 (lagVariable 1.0 a 2)) ", 2, ds.GetDoubleValue("A", 2));
  Evaluate(interpreter, ds, "(lag -1.0 (* (lagVariable 1.0 a 1) (lagVariable 1.0 b 2)))", 1, ds.GetDoubleValue("A", 1) * ds.GetDoubleValue("B", 2));
  Evaluate(interpreter, ds, "(lag -2.0 3.0)", 1, 3.0);
}
/// <summary>
/// Tunes the numeric constants of <paramref name="tree"/> with alglib's least-squares fitter
/// (<c>lsfit*</c>) against the target variable of <paramref name="problemData"/> and returns the
/// quality reported by <c>SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate</c>.
/// </summary>
/// <param name="interpreter">Interpreter passed to the quality evaluator.</param>
/// <param name="tree">Tree whose constants are optimized; it is modified through <c>UpdateConstants</c>.</param>
/// <param name="problemData">Supplies the dataset and the target variable.</param>
/// <param name="rows">Dataset rows used for fitting and for quality evaluation.</param>
/// <param name="applyLinearScaling">When true, two extra leading coefficients (initialized to 0.0 and 1.0) are optimized as well and stripped again before the tree is updated.</param>
/// <param name="maxIterations">Iteration limit passed to <c>alglib.lsfitsetcond</c>.</param>
/// <param name="updateVariableWeights">Forwarded to the tree conversion and to <c>UpdateConstants</c>.</param>
/// <param name="lowerEstimationLimit">Forwarded to the quality evaluator.</param>
/// <param name="upperEstimationLimit">Forwarded to the quality evaluator.</param>
/// <param name="updateConstantsInTree">When false, the initial constants are written back to the tree after the new quality has been calculated.</param>
/// <param name="iterationCallback">Optional callback; iteration reports are only enabled in alglib when it is not null.</param>
/// <param name="counter">Optional counter that receives the function and gradient evaluation counts divided by the number of rows.</param>
/// <returns>
/// 0.0 when the converted tree has no parameters; the original quality when alglib throws or when the
/// new quality is NaN or more than 0.001 worse than the original; otherwise the new quality.
/// </returns>
/// <exception cref="NotSupportedException">The tree cannot be converted by <c>TreeToAutoDiffTermConverter.TryConvertToAutoDiff</c>.</exception>
/// <exception cref="InvalidProgramException">A parameter refers to a variable that is neither of type double nor string.</exception>
public static double OptimizeConstants(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, ISymbolicExpressionTree tree, IRegressionProblemData problemData, IEnumerable <int> rows, bool applyLinearScaling, int maxIterations, bool updateVariableWeights = true, double lowerEstimationLimit = double.MinValue, double upperEstimationLimit = double.MaxValue, bool updateConstantsInTree = true, Action <double[], double, object> iterationCallback = null, EvaluationsCounter counter = null) {
  // numeric constants in the tree become variables for constant opt
  // variables in the tree become parameters (fixed values) for constant opt
  // for each parameter (variable in the original tree) we store the
  // variable name, variable value (for factor vars) and lag as a DataForVariable object.
  // A dictionary is used to find parameters
  double[] initialConstants;
  var parameters = new List <TreeToAutoDiffTermConverter.DataForVariable>();
  TreeToAutoDiffTermConverter.ParametricFunction func;
  TreeToAutoDiffTermConverter.ParametricFunctionGradient func_grad;
  if (!TreeToAutoDiffTermConverter.TryConvertToAutoDiff(tree, updateVariableWeights, applyLinearScaling, out parameters, out initialConstants, out func, out func_grad)) {
    throw new NotSupportedException("Could not optimize constants of symbolic expression tree due to not supported symbols used in the tree.");
  }
  if (parameters.Count == 0) {
    return(0.0); // gkronber: constant expressions always have a R² of 0.0
  }
  var parameterEntries = parameters.ToArray(); // order of entries must be the same for x
  // extract initial constants; with linear scaling two extra coefficients (0.0 and 1.0) are prepended
  double[] c;
  if (applyLinearScaling) {
    c = new double[initialConstants.Length + 2];
    c[0] = 0.0;
    c[1] = 1.0;
    Array.Copy(initialConstants, 0, c, 2, initialConstants.Length);
  } else {
    c = (double[])initialConstants.Clone();
  }
  // quality before optimization; used as fallback result and for the regression check at the end
  double originalQuality = SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate(interpreter, tree, lowerEstimationLimit, upperEstimationLimit, problemData, rows, applyLinearScaling);
  if (counter == null) {
    counter = new EvaluationsCounter();
  }
  // separate counter handed to alglib; its totals are divided by the row count below
  var rowEvaluationsCounter = new EvaluationsCounter();
  alglib.lsfitstate state;
  alglib.lsfitreport rep;
  int retVal;
  IDataset ds = problemData.Dataset;
  // input matrix: one row per dataset row, one column per parameter entry
  double[,] x = new double[rows.Count(), parameters.Count];
  int row = 0;
  foreach (var r in rows) {
    int col = 0;
    foreach (var info in parameterEntries) {
      if (ds.VariableHasType <double>(info.variableName)) {
        // double variable: value at the lagged row
        x[row, col] = ds.GetDoubleValue(info.variableName, r + info.lag);
      } else if (ds.VariableHasType <string>(info.variableName)) {
        // string variable: 1 if the row's value equals the parameter's value, else 0 (lag is not applied here)
        x[row, col] = ds.GetStringValue(info.variableName, r) == info.variableValue ? 1 : 0;
      } else {
        throw new InvalidProgramException("found a variable of unknown type");
      }
      col++;
    }
    row++;
  }
  double[] y = ds.GetDoubleValues(problemData.TargetVariable, rows).ToArray();
  int n = x.GetLength(0);
  int m = x.GetLength(1);
  int k = c.Length;
  alglib.ndimensional_pfunc function_cx_1_func = CreatePFunc(func);
  alglib.ndimensional_pgrad function_cx_1_grad = CreatePGrad(func_grad);
  // forwards to iterationCallback without a null check; reports are only enabled below when the callback is not null
  alglib.ndimensional_rep xrep = (p, f, obj) => iterationCallback(p, f, obj);
  try {
    alglib.lsfitcreatefg(x, y, c, n, m, k, false, out state);
    alglib.lsfitsetcond(state, 0.0, 0.0, maxIterations);
    alglib.lsfitsetxrep(state, iterationCallback != null);
    //alglib.lsfitsetgradientcheck(state, 0.001);
    alglib.lsfitfit(state, function_cx_1_func, function_cx_1_grad, xrep, rowEvaluationsCounter);
    alglib.lsfitresults(state, out retVal, out c, out rep);
  } catch (ArithmeticException) {
    // the tree has not been modified at this point
    return(originalQuality);
  } catch (alglib.alglibexception) {
    return(originalQuality);
  }
  counter.FunctionEvaluations += rowEvaluationsCounter.FunctionEvaluations / n;
  counter.GradientEvaluations += rowEvaluationsCounter.GradientEvaluations / n;
  //retVal == -7 => constant optimization failed due to wrong gradient
  if (retVal != -7) {
    if (applyLinearScaling) {
      // drop the two leading scaling coefficients before writing the constants to the tree
      var tmp = new double[c.Length - 2];
      Array.Copy(c, 2, tmp, 0, tmp.Length);
      UpdateConstants(tree, tmp, updateVariableWeights);
    } else {
      UpdateConstants(tree, c, updateVariableWeights);
    }
  }
  var quality = SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate(interpreter, tree, lowerEstimationLimit, upperEstimationLimit, problemData, rows, applyLinearScaling);
  if (!updateConstantsInTree) {
    // caller only wants the quality: restore the initial constants
    UpdateConstants(tree, initialConstants, updateVariableWeights);
  }
  if (originalQuality - quality > 0.001 || double.IsNaN(quality)) {
    // optimization made things worse (or produced NaN): restore and report the original quality
    UpdateConstants(tree, initialConstants, updateVariableWeights);
    return(originalQuality);
  }
  return(quality);
}
/// <summary>
/// Returns the estimated variance for a single row. Inner nodes delegate to <c>Left</c> when the
/// row's <c>SplitAttribute</c> value is less than or equal to <c>SplitValue</c>, otherwise to
/// <c>Right</c>; leaves ask their <c>Model</c>. The queried object is cast to
/// <c>IConfidenceRegressionModel</c> in both cases.
/// </summary>
/// <param name="dataset">Dataset the row belongs to.</param>
/// <param name="row">Index of the row.</param>
private double GetEstimatedVariance(IDataset dataset, int row) {
  if (IsLeaf) {
    var leafModel = (IConfidenceRegressionModel)Model;
    return leafModel.GetEstimatedVariances(dataset, new[] { row }).First();
  }

  // Inner node: follow the branch selected by the split condition.
  var child = (IConfidenceRegressionModel)(dataset.GetDoubleValue(SplitAttribute, row) <= SplitValue ? Left : Right);
  return child.GetEstimatedVariances(dataset, row.ToEnumerable()).Single();
}
/// <summary>
/// Sums, over all clusters assigned by <paramref name="model"/>, the squared Euclidean distances
/// of the cluster's points to the cluster mean.
/// </summary>
/// <param name="model">Clustering model providing the cluster value of each row and the allowed input variables.</param>
/// <param name="dataset">Dataset the point coordinates are read from.</param>
/// <param name="rows">Dataset rows to include.</param>
/// <returns>The intra-cluster sum of squares; 0 when <paramref name="rows"/> is empty.</returns>
public static double CalculateIntraClusterSumOfSquares(KMeansClusteringModel model, IDataset dataset, IEnumerable<int> rows) {
  // Snapshot of the rows: the cluster values returned by the model are matched to the
  // points by position, so both must be derived from the very same sequence.
  int[] rowIndices = rows.ToArray();
  List<int> clusterValues = model.GetClusterValues(dataset, rowIndices).ToList();
  List<string> allowedInputVariables = model.AllowedInputVariables.ToList();
  int nCols = allowedInputVariables.Count;

  var clusterPoints = new Dictionary<int, List<double[]>>();
  var clusterMeans = new Dictionary<int, double[]>();

  // collect points of clusters
  for (int index = 0; index < rowIndices.Length; index++) {
    double[] p = new double[nCols];
    for (int i = 0; i < nCols; i++) {
      p[i] = dataset.GetDoubleValue(allowedInputVariables[i], rowIndices[index]);
    }

    List<double[]> members;
    if (!clusterPoints.TryGetValue(clusterValues[index], out members)) {
      members = new List<double[]>();
      clusterPoints.Add(clusterValues[index], members);
    }
    members.Add(p);
  }

  // calculate cluster means
  foreach (var pair in clusterPoints) {
    double[] mean = new double[nCols];
    foreach (var p in pair.Value) {
      for (int i = 0; i < nCols; i++) {
        mean[i] += p[i];
      }
    }
    for (int i = 0; i < nCols; i++) {
      mean[i] /= pair.Value.Count;
    }
    clusterMeans[pair.Key] = mean;
  }

  // calculate distances
  double allCenterDistances = 0;
  foreach (var pair in clusterMeans) {
    double[] mean = pair.Value;
    double centerDistances = 0;
    foreach (var clusterPoint in clusterPoints[pair.Key]) {
      double centerDistance = 0;
      for (int i = 0; i < nCols; i++) {
        double d = mean[i] - clusterPoint[i];
        centerDistance += d * d;
      }
      centerDistances += centerDistance;
    }
    allCenterDistances += centerDistances;
  }

  return allCenterDistances;
}
/// <summary>
/// Evaluates the linear model for one row: <c>Intercept</c> plus the sum of every coefficient
/// multiplied by the value of the variable named by the coefficient's key.
/// </summary>
/// <param name="dataset">Dataset the variable values are read from.</param>
/// <param name="row">Index of the row to estimate.</param>
private double GetEstimatedValue(IDataset dataset, int row) {
  double weightedSum = 0;
  if (Coefficients.Count != 0) {
    foreach (var term in Coefficients) {
      weightedSum += term.Value * dataset.GetDoubleValue(term.Key, row);
    }
  }
  return Intercept + weightedSum;
}
/// <summary>
/// Tunes the numeric constants of <paramref name="tree"/> with alglib's least-squares fitter
/// (<c>lsfit*</c>) against the target variable of <paramref name="problemData"/> and returns the
/// quality reported by <c>SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate</c>.
/// </summary>
/// <param name="interpreter">Interpreter passed to the quality evaluator.</param>
/// <param name="tree">Tree whose constants are optimized; it is modified through <c>UpdateConstants</c>.</param>
/// <param name="problemData">Supplies the dataset and the target variable.</param>
/// <param name="rows">Dataset rows used for fitting and for quality evaluation.</param>
/// <param name="applyLinearScaling">Forwarded to the quality evaluator only; the coefficient vector always starts with the two values 0.0 and 1.0.</param>
/// <param name="maxIterations">Iteration limit passed to <c>alglib.lsfitsetcond</c>.</param>
/// <param name="updateVariableWeights">When true, variable weights are optimized in addition to constants.</param>
/// <param name="lowerEstimationLimit">Forwarded to the quality evaluator.</param>
/// <param name="upperEstimationLimit">Forwarded to the quality evaluator.</param>
/// <param name="updateConstantsInTree">When false, the original constants are written back to the tree after the new quality has been calculated.</param>
/// <returns>
/// 0.0 when the tree references no variables; the original quality when alglib throws or when the
/// new quality is NaN or more than 0.001 worse than the original; otherwise the new quality.
/// </returns>
/// <exception cref="NotSupportedException">The tree cannot be converted by <c>TryTransformToAutoDiff</c>.</exception>
public static double OptimizeConstants(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, ISymbolicExpressionTree tree, IRegressionProblemData problemData, IEnumerable <int> rows, bool applyLinearScaling, int maxIterations, bool updateVariableWeights = true, double lowerEstimationLimit = double.MinValue, double upperEstimationLimit = double.MaxValue, bool updateConstantsInTree = true) {
  // filled by TryTransformToAutoDiff while it walks the tree
  List <AutoDiff.Variable> variables = new List <AutoDiff.Variable>();
  List <AutoDiff.Variable> parameters = new List <AutoDiff.Variable>();
  List <string> variableNames = new List <string>();
  AutoDiff.Term func;
  if (!TryTransformToAutoDiff(tree.Root.GetSubtree(0), variables, parameters, variableNames, updateVariableWeights, out func)) {
    throw new NotSupportedException("Could not optimize constants of symbolic expression tree due to not supported symbols used in the tree.");
  }
  if (variableNames.Count == 0) {
    // nothing to fit against when the tree uses no dataset variable
    return(0.0);
  }
  AutoDiff.IParametricCompiledTerm compiledFunc = func.Compile(variables.ToArray(), parameters.ToArray());
  // terminal nodes that carry an optimizable value: all terminals when weights are tuned, constants only otherwise
  List <SymbolicExpressionTreeTerminalNode> terminalNodes = null;
  if (updateVariableWeights) {
    terminalNodes = tree.Root.IterateNodesPrefix().OfType <SymbolicExpressionTreeTerminalNode>().ToList();
  } else {
    terminalNodes = new List <SymbolicExpressionTreeTerminalNode>(tree.Root.IterateNodesPrefix().OfType <ConstantTreeNode>());
  }
  // extract initial constants
  // NOTE(review): assumes variables.Count >= 2, i.e. that the conversion registered two leading
  // coefficients (presumably offset and scale) - confirm against TryTransformToAutoDiff
  double[] c = new double[variables.Count];
  {
    c[0] = 0.0;
    c[1] = 1.0;
    int i = 2;
    foreach (var node in terminalNodes) {
      ConstantTreeNode constantTreeNode = node as ConstantTreeNode;
      VariableTreeNode variableTreeNode = node as VariableTreeNode;
      if (constantTreeNode != null) {
        c[i++] = constantTreeNode.Value;
      } else if (updateVariableWeights && variableTreeNode != null) {
        c[i++] = variableTreeNode.Weight;
      }
    }
  }
  // copy kept for restoring the tree later
  double[] originalConstants = (double[])c.Clone();
  // quality before optimization; used as fallback result and for the regression check at the end
  double originalQuality = SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate(interpreter, tree, lowerEstimationLimit, upperEstimationLimit, problemData, rows, applyLinearScaling);
  alglib.lsfitstate state;
  alglib.lsfitreport rep;
  int info;
  IDataset ds = problemData.Dataset;
  // input matrix: one row per dataset row, one column per referenced variable
  double[,] x = new double[rows.Count(), variableNames.Count];
  int row = 0;
  foreach (var r in rows) {
    for (int col = 0; col < variableNames.Count; col++) {
      x[row, col] = ds.GetDoubleValue(variableNames[col], r);
    }
    row++;
  }
  double[] y = ds.GetDoubleValues(problemData.TargetVariable, rows).ToArray();
  int n = x.GetLength(0);
  int m = x.GetLength(1);
  int k = c.Length;
  alglib.ndimensional_pfunc function_cx_1_func = CreatePFunc(compiledFunc);
  alglib.ndimensional_pgrad function_cx_1_grad = CreatePGrad(compiledFunc);
  try {
    alglib.lsfitcreatefg(x, y, c, n, m, k, false, out state);
    alglib.lsfitsetcond(state, 0.0, 0.0, maxIterations);
    //alglib.lsfitsetgradientcheck(state, 0.001);
    alglib.lsfitfit(state, function_cx_1_func, function_cx_1_grad, null, null);
    alglib.lsfitresults(state, out info, out c, out rep);
  } catch (ArithmeticException) {
    // the tree has not been modified at this point
    return(originalQuality);
  } catch (alglib.alglibexception) {
    return(originalQuality);
  }
  //info == -7 => constant optimization failed due to wrong gradient
  if (info != -7) {
    // skip the two leading coefficients; only the remaining values belong to tree nodes
    UpdateConstants(tree, c.Skip(2).ToArray(), updateVariableWeights);
  }
  var quality = SymbolicRegressionSingleObjectivePearsonRSquaredEvaluator.Calculate(interpreter, tree, lowerEstimationLimit, upperEstimationLimit, problemData, rows, applyLinearScaling);
  if (!updateConstantsInTree) {
    // caller only wants the quality: restore the original constants
    UpdateConstants(tree, originalConstants.Skip(2).ToArray(), updateVariableWeights);
  }
  if (originalQuality - quality > 0.001 || double.IsNaN(quality)) {
    // optimization made things worse (or produced NaN): restore and report the original quality
    UpdateConstants(tree, originalConstants.Skip(2).ToArray(), updateVariableWeights);
    return(originalQuality);
  }
  return(quality);
}
/// <summary>
/// Tests whether a row satisfies every split condition: for each entry of <c>SplitAttributes</c>
/// the comparison at the same index is applied to the row's value and the split value at that index.
/// </summary>
/// <param name="dataset">Dataset the row belongs to.</param>
/// <param name="row">Index of the row to test.</param>
/// <returns>True when no condition fails (also when there are no conditions); otherwise false.</returns>
public bool Covers(IDataset dataset, int row) {
  int index = 0;
  foreach (var attribute in SplitAttributes) {
    bool satisfied = Comparisons[index].Compare(dataset.GetDoubleValue(attribute, row), SplitValues[index]);
    if (!satisfied) {
      // The first failing condition decides the result.
      return false;
    }
    index++;
  }
  return true;
}
/// <summary>
/// Computes a PCA basis for the selected rows and variables via <c>alglib.pcabuildbasis</c>
/// and returns the selected data multiplied with that basis.
/// </summary>
/// <param name="dataset">Dataset the values are read from.</param>
/// <param name="rows">Dataset rows to use; the result keeps their order.</param>
/// <param name="variables">Names of the double variables to use.</param>
/// <returns>A matrix with one row per requested row and one column per column of the basis matrix.</returns>
private static double[,] PCAReduce(IDataset dataset, IEnumerable<int> rows, IEnumerable<string> variables) {
  var selectedRows = rows.ToArray();
  var selectedVariables = variables.ToArray();
  int rowCount = selectedRows.Length;
  int variableCount = selectedVariables.Length;

  // Fill the input matrix column by column. NOTE(review): the matrix has one more column
  // than there are variables and that column stays zero; presumably unneeded - confirm.
  var input = new double[rowCount, variableCount + 1];
  for (int column = 0; column < variableCount; column++) {
    int position = 0;
    foreach (var value in dataset.GetDoubleValues(selectedVariables[column], selectedRows)) {
      input[position, column] = value;
      position++;
    }
  }

  // NOTE(review): the returned info code is ignored.
  int info;
  double[] variances;
  double[,] basis;
  alglib.pcabuildbasis(input, rowCount, variableCount, out info, out variances, out basis);

  // Multiply the values as stored in the dataset with the basis.
  int componentCount = basis.GetLength(1);
  var reduced = new double[rowCount, componentCount];
  for (int r = 0; r < rowCount; r++) {
    int datasetRow = selectedRows[r];
    for (int v = 0; v < variableCount; v++) {
      double value = dataset.GetDoubleValue(selectedVariables[v], datasetRow);
      for (int c = 0; c < componentCount; c++) {
        reduced[r, c] += value * basis[v, c];
      }
    }
  }

  return reduced;
}