/// <summary>
/// Checks whether a classification model can be used together with the given problem data.
/// </summary>
/// <param name="model">The model to check; must not be null.</param>
/// <param name="problemData">The problem data to check the model against; must not be null.</param>
/// <param name="errorMessage">
/// Receives the collected incompatibility messages. The dataset message is appended directly
/// (without a separator) to the target variable message. Empty if nothing was reported.
/// </param>
/// <returns>True if the resulting error message is null or empty; otherwise false.</returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="model"/> or <paramref name="problemData"/> is null.</exception>
public static bool IsProblemDataCompatible(IClassificationModel model, IClassificationProblemData problemData, out string errorMessage) {
  if (model == null) {
    throw new ArgumentNullException("model", "The provided model is null.");
  }
  if (problemData == null) {
    throw new ArgumentNullException("problemData", "The provided problemData is null.");
  }

  // First part of the message: mismatch of the target variables (if any).
  string targetMessage = model.TargetVariable != problemData.TargetVariable
    ? string.Format("The target variable of the model {0} does not match the target variable of the problemData {1}.", model.TargetVariable, problemData.TargetVariable)
    : string.Empty;

  // Second part of the message: whatever the model reports about the dataset.
  string datasetMessage;
  bool datasetIsCompatible = model.IsDatasetCompatible(problemData.Dataset, out datasetMessage);

  errorMessage = datasetIsCompatible ? targetMessage : targetMessage + datasetMessage;
  return string.IsNullOrEmpty(errorMessage);
}
/// <summary>
/// Grid search without crossvalidation (since for random forests the out-of-bag estimate is unbiased).
/// Every combination of the given parameter values is used to train a random forest on the training
/// partition, and the combination with the lowest out-of-bag RMS error is kept.
/// </summary>
/// <param name="problemData">The classification problem data</param>
/// <param name="parameterRanges">The ranges for each parameter in the grid search</param>
/// <param name="seed">The random seed (required by the random forest model)</param>
/// <param name="maxDegreeOfParallelism">The maximum allowed number of threads (to parallelize the grid search)</param>
/// <returns>
/// A clone of the best parameter set, or a default-constructed <c>RFParameter</c> if no combination
/// produced an out-of-bag RMS error below <c>double.MaxValue</c>.
/// </returns>
public static RFParameter GridSearch(IClassificationProblemData problemData, Dictionary<string, IEnumerable<double>> parameterRanges, int seed = 12345, int maxDegreeOfParallelism = 1) {
  var parameterSetters = parameterRanges.Keys.Select(GenerateSetter).ToList();
  var combinations = parameterRanges.Values.CartesianProduct();

  double lowestOutOfBagRmsError = double.MaxValue;
  RFParameter winner = new RFParameter();
  var sync = new object();
  var options = new ParallelOptions { MaxDegreeOfParallelism = maxDegreeOfParallelism };

  Parallel.ForEach(combinations, options, combination => {
    var values = combination.ToList();

    // apply the i-th value of the combination through the i-th setter
    var candidate = new RFParameter();
    int position = 0;
    foreach (var setter in parameterSetters) {
      setter(candidate, values[position]);
      position++;
    }

    double rmsError, outOfBagRmsError, avgRelError, outOfBagAvgRelError;
    RandomForestModel.CreateClassificationModel(problemData, problemData.TrainingIndices, candidate.N, candidate.R, candidate.M, seed,
      out rmsError, out outOfBagRmsError, out avgRelError, out outOfBagAvgRelError);

    // the best-so-far state is shared between the worker threads
    lock (sync) {
      if (outOfBagRmsError < lowestOutOfBagRmsError) {
        lowestOutOfBagRmsError = outOfBagRmsError;
        winner = (RFParameter)candidate.Clone();
      }
    }
  });

  return winner;
}
/// <summary>
/// Grid search with crossvalidation. Every combination of the given parameter values is
/// crossvalidated on the generated partitions and the combination with the highest test accuracy is kept.
/// </summary>
/// <param name="problemData">The classification problem data</param>
/// <param name="numberOfFolds">The number of folds for crossvalidation</param>
/// <param name="shuffleFolds">Specifies whether the folds should be shuffled</param>
/// <param name="parameterRanges">The ranges for each parameter in the grid search</param>
/// <param name="seed">The random seed (passed on to the crossvalidation)</param>
/// <param name="maxDegreeOfParallelism">The maximum allowed number of threads (to parallelize the grid search)</param>
/// <returns>
/// A clone of the best parameter set, or a default-constructed <c>RFParameter</c> if no combination
/// reached a test accuracy greater than zero.
/// </returns>
public static RFParameter GridSearch(IClassificationProblemData problemData, int numberOfFolds, bool shuffleFolds, Dictionary<string, IEnumerable<double>> parameterRanges, int seed = 12345, int maxDegreeOfParallelism = 1) {
  DoubleValue highestAccuracy = new DoubleValue(0);
  RFParameter winner = new RFParameter();

  var parameterSetters = parameterRanges.Keys.Select(GenerateSetter).ToList();
  var combinations = parameterRanges.Values.CartesianProduct();
  var folds = GenerateRandomForestPartitions(problemData, numberOfFolds, shuffleFolds);

  var sync = new object();
  var options = new ParallelOptions { MaxDegreeOfParallelism = maxDegreeOfParallelism };

  Parallel.ForEach(combinations, options, combination => {
    var values = combination.ToList();

    // apply the i-th value of the combination through the i-th setter
    var candidate = new RFParameter();
    int position = 0;
    foreach (var setter in parameterSetters) {
      setter(candidate, values[position]);
      position++;
    }

    double candidateAccuracy;
    CrossValidate(problemData, folds, candidate.N, candidate.R, candidate.M, seed, out candidateAccuracy);

    // the best-so-far state is shared between the worker threads
    lock (sync) {
      if (candidateAccuracy > highestAccuracy.Value) {
        highestAccuracy.Value = candidateAccuracy;
        winner = (RFParameter)candidate.Clone();
      }
    }
  });

  return winner;
}
/// <summary>
/// Calculates an impact value for every input variable of a classification model.
/// </summary>
/// <param name="model">The model whose variable impacts are calculated.</param>
/// <param name="problemData">The problem data providing the dataset, target variable and allowed inputs.</param>
/// <param name="estimatedClassValues">The class values estimated by the model; compared to the targets to obtain the baseline quality.</param>
/// <param name="rows">The dataset rows that are evaluated.</param>
/// <param name="replacementMethod">Replacement method passed on to the per-variable impact calculation.</param>
/// <param name="factorReplacementMethod">Factor replacement method passed on to the per-variable impact calculation.</param>
/// <returns>
/// One (variable name, impact) tuple for every variable in the union of the allowed input
/// variables and the variables used by the model.
/// </returns>
/// <exception cref="InvalidOperationException">Thrown if the model uses variables that are not contained in the dataset.</exception>
public static IEnumerable<Tuple<string, double>> CalculateImpacts(
  IClassificationModel model,
  IClassificationProblemData problemData,
  IEnumerable<double> estimatedClassValues,
  IEnumerable<int> rows,
  ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Shuffle,
  FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best) {
  // Fail early if the dataset (e.g. a different, newly loaded one) lacks variables the model needs.
  // Materialized once because the result is used for the check and for the message.
  var missingVariables = model.VariablesUsedForPrediction.Except(problemData.Dataset.VariableNames).ToList();
  if (missingVariables.Count > 0) {
    throw new InvalidOperationException(string.Format("Can not calculate variable impacts, because the model uses inputs missing in the dataset ({0})", string.Join(", ", missingVariables)));
  }

  // Materialized once: the targets are read for the baseline quality and again for every input variable.
  IEnumerable<double> targetValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows).ToList();
  var originalQuality = CalculateQuality(targetValues, estimatedClassValues);

  var impacts = new Dictionary<string, double>();
  var inputvariables = new HashSet<string>(problemData.AllowedInputVariables.Union(model.VariablesUsedForPrediction));

  // All per-variable calculations share one modifiable copy of the dataset.
  var modifiableDataset = ((Dataset)(problemData.Dataset).Clone()).ToModifiable();

  foreach (var inputVariable in inputvariables) {
    impacts[inputVariable] = CalculateImpact(inputVariable, model, problemData, modifiableDataset, rows, replacementMethod, factorReplacementMethod, targetValues, originalQuality);
  }

  return impacts.Select(i => Tuple.Create(i.Key, i.Value));
}
/// <summary>
/// Runs the random forest grid search on the configured parameter ranges and trains a
/// classification model on the training partition with the best parameters found.
/// </summary>
/// <param name="problemData">The classification problem data.</param>
/// <param name="bestParameters">Receives the parameters selected by the grid search.</param>
/// <param name="seed">The random seed used for the grid search and for the final model.</param>
/// <returns>The classification solution created from the final model.</returns>
private static RandomForestClassificationSolution GridSearch(IClassificationProblemData problemData, out RFParameter bestParameters, int seed = 3141519) {
  bestParameters = RandomForestUtil.GridSearch(problemData, randomForestParameterRanges, seed, maximumDegreeOfParallelism);

  // the error estimates are required by the factory method but are not used here
  double rmsError, outOfBagRmsError, relClassificationError, outOfBagRelClassificationError;
  var forest = RandomForestModel.CreateClassificationModel(
    problemData, problemData.TrainingIndices,
    bestParameters.N, bestParameters.R, bestParameters.M, seed,
    out rmsError, out outOfBagRmsError, out relClassificationError, out outOfBagRelClassificationError);

  var solution = forest.CreateClassificationSolution(problemData);
  return (RandomForestClassificationSolution)solution;
}
/// <summary>
/// Creates the initial matrix by running alglib's Fisher LDA (<c>fisherldan</c>) on the training
/// partition, using the allowed input variables as features and the target variable as class label.
/// </summary>
/// <param name="data">The classification problem data.</param>
/// <param name="dimensions">Not used by this implementation.</param>
/// <returns>The matrix returned by <c>alglib.fisherldan</c>.</returns>
public override double[,] Initialize(IClassificationProblemData data, int dimensions) {
  var rowCount = data.TrainingIndices.Count();
  var featureCount = data.AllowedInputVariables.Count();

  // inputs first, target variable in the last column
  var columns = data.AllowedInputVariables.Concat(data.TargetVariable.ToEnumerable());
  var samples = data.Dataset.ToArray(columns, data.TrainingIndices);

  // map class values to sequential natural numbers (required by alglib)
  var classNumbers = new Dictionary<double, int>();
  foreach (var classValue in data.Dataset.GetDoubleValues(data.TargetVariable, data.TrainingIndices).Distinct()) {
    classNumbers.Add(classValue, classNumbers.Count);
  }
  for (int r = 0; r < rowCount; r++) {
    samples[r, featureCount] = classNumbers[samples[r, featureCount]];
  }

  // NOTE: the info code reported by alglib is not inspected
  int info;
  double[,] basis;
  alglib.fisherldan(samples, rowCount, featureCount, classNumbers.Count, out info, out basis);
  return basis;
}
/// <summary>
/// Builds one <c>OneFactorClassificationModel</c> per string-typed allowed input variable and returns
/// the one that classifies the most training rows correctly.
/// </summary>
/// <param name="problemData">The classification problem data.</param>
/// <returns>
/// The best model, or null if there is no string-typed allowed input variable or if no candidate
/// classifies at least one training row correctly.
/// </returns>
private static OneFactorClassificationModel FindBestFactorModel(IClassificationProblemData problemData) {
  // Materialized once: the class values are read for the default class and twice per candidate variable.
  IEnumerable<double> classValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices).ToList();
  var defaultClass = FindMostFrequentClassValue(classValues);

  // only select string variables (materialized because the result is checked and then iterated)
  var allowedInputVariables = problemData.AllowedInputVariables.Where(problemData.Dataset.VariableHasType<string>).ToList();
  if (allowedInputVariables.Count == 0) {
    return null;
  }

  OneFactorClassificationModel bestModel = null;
  var bestModelNumCorrect = 0;

  foreach (var variable in allowedInputVariables) {
    var variableValues = problemData.Dataset.GetStringValues(variable, problemData.TrainingIndices);

    // for every distinct value of the variable: the most frequent class among the rows having that value
    var groupedClassValues = variableValues
      .Zip(classValues, (v, c) => new KeyValuePair<string, double>(v, c))
      .GroupBy(kvp => kvp.Key)
      .ToDictionary(g => g.Key, g => FindMostFrequentClassValue(g.Select(kvp => kvp.Value)));

    var model = new OneFactorClassificationModel(
      problemData.TargetVariable, variable,
      groupedClassValues.Select(kvp => kvp.Key).ToArray(),
      groupedClassValues.Select(kvp => kvp.Value).ToArray(),
      defaultClass);

    var modelEstimatedValues = model.GetEstimatedClassValues(problemData.Dataset, problemData.TrainingIndices);
    var modelNumCorrect = classValues.Zip(modelEstimatedValues, (a, b) => a.IsAlmost(b)).Count(e => e);

    // strict comparison: on ties the first candidate wins
    if (modelNumCorrect > bestModelNumCorrect) {
      bestModelNumCorrect = modelNumCorrect;
      bestModel = model;
    }
  }

  return bestModel;
}
/// <summary>
/// Creates a solution for the given discriminant function model and problem data, creates the
/// (initially empty) evaluation caches and calculates the regression and classification results.
/// </summary>
/// <param name="model">The discriminant function classification model.</param>
/// <param name="problemData">The classification problem data.</param>
public DiscriminantFunctionClassificationSolution(IDiscriminantFunctionClassificationModel model, IClassificationProblemData problemData)
  : base(model, problemData) {
  // both caches map an int key to a double value and are created before any result is calculated
  classValueEvaluationCache = new Dictionary<int, double>();
  valueEvaluationCache = new Dictionary<int, double>();

  CalculateRegressionResults();
  CalculateClassificationResults();
}
/// <summary>
/// Convenience overload that trains the random forest classification model on the training
/// partition of the problem data.
/// </summary>
/// <param name="problemData">The classification problem data.</param>
/// <param name="nTrees">Passed on unchanged to the overload taking explicit training rows.</param>
/// <param name="r">Passed on unchanged to the overload taking explicit training rows.</param>
/// <param name="m">Passed on unchanged to the overload taking explicit training rows.</param>
/// <param name="seed">The random seed.</param>
/// <param name="rmsError">Receives the RMS error reported by the called overload.</param>
/// <param name="avgRelError">Receives the average relative error reported by the called overload.</param>
/// <param name="outOfBagRmsError">Receives the out-of-bag RMS error reported by the called overload.</param>
/// <param name="outOfBagAvgRelError">Receives the out-of-bag average relative error reported by the called overload.</param>
/// <returns>The created model.</returns>
public static RandomForestModelFull CreateRandomForestClassificationModel(IClassificationProblemData problemData, int nTrees, double r, double m, int seed, out double rmsError, out double avgRelError, out double outOfBagRmsError, out double outOfBagAvgRelError) {
  return CreateRandomForestClassificationModel(
    problemData, problemData.TrainingIndices,
    nTrees, r, m, seed,
    out rmsError, out avgRelError, out outOfBagRmsError, out outOfBagAvgRelError);
}
/// <summary>
/// Initializes the base solution and adds the mean squared error and R² results (training and
/// test partition), each with a fresh <c>DoubleValue</c>, before registering the event handler.
/// </summary>
/// <param name="model">The discriminant function classification model.</param>
/// <param name="problemData">The classification problem data.</param>
protected DiscriminantFunctionClassificationSolutionBase(IDiscriminantFunctionClassificationModel model, IClassificationProblemData problemData)
  : base(model, problemData) {
  var regressionResults = new Result[] {
    new Result(TrainingMeanSquaredErrorResultName, "Mean of squared errors of the model on the training partition", new DoubleValue()),
    new Result(TestMeanSquaredErrorResultName, "Mean of squared errors of the model on the test partition", new DoubleValue()),
    new Result(TrainingRSquaredResultName, "Squared Pearson's correlation coefficient of the model output and the actual values on the training partition", new DoubleValue()),
    new Result(TestRSquaredResultName, "Squared Pearson's correlation coefficient of the model output and the actual values on the test partition", new DoubleValue())
  };

  // results are added in the order listed above
  foreach (Result result in regressionResults) {
    Add(result);
  }

  RegisterEventHandler();
}
/// <summary>
/// Creates ensemble problem data from existing classification problem data. Dataset, allowed input
/// variables and target variable are handed to the base constructor; the training/test partition
/// bounds and the positive class are copied afterwards.
/// </summary>
/// <param name="classificationProblemData">The problem data to take the values from.</param>
public ClassificationEnsembleProblemData(IClassificationProblemData classificationProblemData)
  : base(classificationProblemData.Dataset, classificationProblemData.AllowedInputVariables, classificationProblemData.TargetVariable) {
  // copy the training partition bounds
  var sourceTraining = classificationProblemData.TrainingPartition;
  TrainingPartition.Start = sourceTraining.Start;
  TrainingPartition.End = sourceTraining.End;

  // copy the test partition bounds
  var sourceTest = classificationProblemData.TestPartition;
  TestPartition.Start = sourceTest.Start;
  TestPartition.End = sourceTest.End;

  PositiveClass = classificationProblemData.PositiveClass;
}
/// <summary>
/// Creates a symbolic classification solution. The grammar of every top-level node of the model's
/// expression tree is set to null, the model length and depth results are added and all results
/// are recalculated.
/// </summary>
/// <param name="model">The symbolic classification model.</param>
/// <param name="problemData">The classification problem data.</param>
public SymbolicClassificationSolution(ISymbolicClassificationModel model, IClassificationProblemData problemData)
  : base(model, problemData) {
  var topLevelNodes = model.SymbolicExpressionTree.Root
    .IterateNodesPrefix()
    .OfType<SymbolicExpressionTreeTopLevelNode>();
  foreach (var topLevelNode in topLevelNodes) {
    topLevelNode.SetGrammar(null);
  }

  var lengthResult = new Result(ModelLengthResultName, "Length of the symbolic classification model.", new IntValue());
  Add(lengthResult);
  var depthResult = new Result(ModelDepthResultName, "Depth of the symbolic classification model.", new IntValue());
  Add(depthResult);

  RecalculateResults();
}
/// <summary>
/// Creates a nearest neighbour model from the training partition of the given problem data.
/// </summary>
/// <param name="problemData">The classification problem data providing dataset, training rows, target, inputs and class values.</param>
/// <param name="k">The k value handed to the model.</param>
/// <returns>The created nearest neighbour model.</returns>
public static INearestNeighbourModel Train(IClassificationProblemData problemData, int k) {
  var dataset = problemData.Dataset;
  var trainingRows = problemData.TrainingIndices;
  var targetVariable = problemData.TargetVariable;
  var inputVariables = problemData.AllowedInputVariables;
  var classValues = problemData.ClassValues.ToArray();

  return new NearestNeighbourModel(dataset, trainingRows, k, targetVariable, inputVariables, classValues);
}
/// <summary>
/// Calculates the two objective values of a symbolic expression tree: the mean squared error of
/// its (range-limited) output with respect to the target variable, and the tree length.
/// </summary>
/// <param name="interpreter">The interpreter used to evaluate the tree.</param>
/// <param name="solution">The symbolic expression tree to evaluate.</param>
/// <param name="lowerEstimationLimit">Lower limit applied to the estimated values.</param>
/// <param name="upperEstimationLimit">Upper limit applied to the estimated values.</param>
/// <param name="problemData">The classification problem data.</param>
/// <param name="rows">The dataset rows to evaluate.</param>
/// <returns>
/// An array with two elements: the MSE (NaN if the calculator reported an error) and the tree length.
/// </returns>
public static double[] Calculate(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, ISymbolicExpressionTree solution, double lowerEstimationLimit, double upperEstimationLimit, IClassificationProblemData problemData, IEnumerable<int> rows) {
  IEnumerable<double> rawEstimates = interpreter.GetSymbolicExpressionTreeValues(solution, problemData.Dataset, rows);
  IEnumerable<double> targets = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
  IEnumerable<double> limitedEstimates = rawEstimates.LimitToRange(lowerEstimationLimit, upperEstimationLimit);

  OnlineCalculatorError calculatorError;
  double meanSquaredError = OnlineMeanSquaredErrorCalculator.Calculate(targets, limitedEstimates, out calculatorError);

  // a failed calculation is reported as NaN
  double quality = calculatorError == OnlineCalculatorError.None ? meanSquaredError : double.NaN;
  return new double[2] { quality, solution.Length };
}
/// <summary>
/// Checks the variable impacts of a constant model (constant 5, target "y") on the iris problem
/// against the expected values for constant models.
/// </summary>
public void ConstantModelVariableImpactTest() {
  // expected values
  Dictionary<string, double> expected = GetExpectedValuesForConstantModel();

  // solution under test
  IClassificationProblemData irisData = LoadIrisProblem();
  IClassificationModel constantModel = new ConstantModel(5, "y");
  IClassificationSolution constantSolution = new ClassificationSolution(constantModel, irisData);

  CheckDefaultAsserts(constantSolution, expected);
}
/// <summary>
/// Runs the SVM grid search on the configured parameter ranges and creates a support vector
/// classification solution with the best parameters found.
/// </summary>
/// <param name="problemData">The classification problem data.</param>
/// <param name="bestParameters">Receives the parameters selected by the grid search.</param>
/// <param name="nSv">Receives the number of support vectors reported when creating the solution.</param>
/// <param name="cvMse">Receives the value reported by the grid search.</param>
/// <returns>The created support vector classification solution.</returns>
private static SupportVectorClassificationSolution SvmGridSearch(IClassificationProblemData problemData, out svm_parameter bestParameters, out int nSv, out double cvMse) {
  bestParameters = SupportVectorMachineUtil.GridSearch(out cvMse, problemData, svmParameterRanges, numberOfFolds, shuffleFolds, maximumDegreeOfParallelism);

  // look up the string names for the selected svm type and kernel type
  string svmTypeName = svmTypes[bestParameters.svm_type];
  string kernelTypeName = kernelTypes[bestParameters.kernel_type];

  // the errors are required by the factory method but are not used here
  double trainingError, testError;
  return SupportVectorClassification.CreateSupportVectorClassificationSolution(
    problemData, problemData.AllowedInputVariables, svmTypeName, kernelTypeName,
    bestParameters.C, bestParameters.nu, bestParameters.gamma, bestParameters.degree,
    out trainingError, out testError, out nSv);
}
/// <summary>
/// Recalculates thresholds and class values with the threshold calculator, based on the model's
/// estimated values and the target values of the given rows, and applies them to the model.
/// </summary>
/// <param name="problemData">The classification problem data.</param>
/// <param name="rows">The dataset rows used for the recalculation.</param>
public override void RecalculateModelParameters(IClassificationProblemData problemData, IEnumerable<int> rows) {
  var dataset = problemData.Dataset;
  var actualClassValues = dataset.GetDoubleValues(problemData.TargetVariable, rows);
  var estimatedValues = GetEstimatedValues(dataset, rows);

  double[] newClassValues, newThresholds;
  thresholdCalculator.Calculate(problemData, estimatedValues, actualClassValues, out newClassValues, out newThresholds);

  SetThresholdsAndClassValues(newThresholds, newClassValues);
}
/// <summary>
/// Searches the configured random forest parameter ranges for the best parameters and builds a
/// classification solution from a model trained with them on the training partition.
/// </summary>
/// <param name="problemData">The classification problem data.</param>
/// <param name="bestParameters">Receives the parameters selected by the grid search.</param>
/// <param name="seed">The random seed used for the grid search and for the final model.</param>
/// <returns>The classification solution created from the final model.</returns>
private static RandomForestClassificationSolution GridSearch(IClassificationProblemData problemData, out RFParameter bestParameters, int seed = 3141519) {
  bestParameters = RandomForestUtil.GridSearch(problemData, randomForestParameterRanges, seed, maximumDegreeOfParallelism);

  // only the model is needed; the reported error values are discarded
  double rmsError, outOfBagRmsError, relClassificationError, outOfBagRelClassificationError;
  var trainedModel = RandomForestModel.CreateClassificationModel(
    problemData, problemData.TrainingIndices,
    bestParameters.N, bestParameters.R, bestParameters.M, seed,
    out rmsError, out outOfBagRmsError, out relClassificationError, out outOfBagRelClassificationError);

  var classificationSolution = trainedModel.CreateClassificationSolution(problemData);
  return (RandomForestClassificationSolution)classificationSolution;
}
/// <summary>
/// Checks the variable impacts of a nearest neighbour solution (k = 3) on the iris problem
/// against the expected values.
/// </summary>
public void KNNIrisVariableImpactTest() {
  IClassificationProblemData irisData = LoadIrisProblem();
  IClassificationSolution knnSolution = NearestNeighbourClassification.CreateNearestNeighbourClassificationSolution(irisData, 3);

  // the result of this first calculation is not inspected
  ClassificationSolutionVariableImpactsCalculator.CalculateImpacts(knnSolution);

  Dictionary<string, double> expected = GetExpectedValuesForIrisKNNModel();
  CheckDefaultAsserts(knnSolution, expected);
}
/// <summary>
/// Checks the variable impacts of a linear discriminant analysis solution on the iris problem
/// against the expected values.
/// </summary>
public void LDAIrisVariableImpactTest() {
  IClassificationProblemData irisData = LoadIrisProblem();
  IClassificationSolution ldaSolution = LinearDiscriminantAnalysis.CreateLinearDiscriminantAnalysisSolution(irisData);

  // the result of this first calculation is not inspected
  ClassificationSolutionVariableImpactsCalculator.CalculateImpacts(ldaSolution);

  Dictionary<string, double> expected = GetExpectedValuesForIrisLDAModel();
  CheckDefaultAsserts(ldaSolution, expected);
}
/// <summary>
/// Initializes the base solution and adds the classification results: training/test accuracy,
/// training/test normalized Gini coefficient and the classification performance measures collection.
/// </summary>
/// <param name="model">The classification model.</param>
/// <param name="problemData">The classification problem data.</param>
protected ClassificationSolutionBase(IClassificationModel model, IClassificationProblemData problemData)
  : base(model, problemData) {
  // accuracies
  var trainingAccuracy = new Result(TrainingAccuracyResultName, "Accuracy of the model on the training partition (percentage of correctly classified instances).", new PercentValue());
  Add(trainingAccuracy);
  var testAccuracy = new Result(TestAccuracyResultName, "Accuracy of the model on the test partition (percentage of correctly classified instances).", new PercentValue());
  Add(testAccuracy);

  // normalized Gini coefficients
  var trainingGini = new Result(TrainingNormalizedGiniCoefficientResultName, "Normalized Gini coefficient of the model on the training partition.", new DoubleValue());
  Add(trainingGini);
  var testGini = new Result(TestNormalizedGiniCoefficientResultName, "Normalized Gini coefficient of the model on the test partition.", new DoubleValue());
  Add(testGini);

  // performance measures collection
  var performanceMeasures = new Result(ClassificationPerformanceMeasuresResultName, @"Classification performance measures.\n In a multiclass classification all misclassifications of the negative class will be treated as true negatives except on positive class estimations.", new ClassificationPerformanceMeasuresResultCollection());
  Add(performanceMeasures);
}
/// <summary>
/// Creates a nearest neighbour model from the training partition of the given problem data.
/// </summary>
/// <param name="problemData">The classification problem data providing dataset, training rows, target, inputs and class values.</param>
/// <param name="k">The k value handed to the model.</param>
/// <param name="selfMatch">Handed to the model unchanged (default: false).</param>
/// <param name="weights">Handed to the model unchanged (default: null).</param>
/// <returns>The created nearest neighbour model.</returns>
public static INearestNeighbourModel Train(IClassificationProblemData problemData, int k, bool selfMatch = false, double[] weights = null) {
  var dataset = problemData.Dataset;
  var trainingRows = problemData.TrainingIndices;
  var targetVariable = problemData.TargetVariable;
  var inputVariables = problemData.AllowedInputVariables;
  var classValues = problemData.ClassValues.ToArray();

  return new NearestNeighbourModel(dataset, trainingRows, k, selfMatch, targetVariable, inputVariables, weights, classValues);
}
/// <summary>
/// Rebuilds the chart annotations: for every finite threshold of the model a movable horizontal
/// line is added together with four text annotations showing the names of the two classes that
/// are looked up for that threshold (at the left and right end of the x axis).
/// </summary>
private void AddThresholds() {
  chart.Annotations.Clear();

  int classIndex = 1;
  IClassificationProblemData problemData = Content.ProblemData;
  var classValues = Content.Model.ClassValues.ToArray();
  Axis yAxis = chart.ChartAreas[0].AxisY;
  Axis xAxis = chart.ChartAreas[0].AxisX;

  foreach (double threshold in Content.Model.Thresholds) {
    // infinite thresholds get no annotation and do not advance the class index
    if (double.IsInfinity(threshold)) {
      continue;
    }

    var thresholdLine = new HorizontalLineAnnotation {
      AllowMoving = true,
      AllowResizing = false,
      LineWidth = 2,
      LineColor = Color.Red,
      IsInfinitive = true,
      ClipToChartArea = chart.ChartAreas[0].Name,
      Tag = classIndex, //save classIndex as Tag to avoid moving the threshold across class boundaries
      AxisX = chart.ChartAreas[0].AxisX,
      AxisY = yAxis,
      Y = threshold
    };

    // labels for the class at classIndex - 1
    string nameBeneath = problemData.GetClassName(classValues[classIndex - 1]);
    TextAnnotation beneathLeft = CreateTextAnnotation(nameBeneath, classIndex, xAxis, yAxis, xAxis.Minimum, threshold, ContentAlignment.TopLeft);
    TextAnnotation beneathRight = CreateTextAnnotation(nameBeneath, classIndex, xAxis, yAxis, xAxis.Maximum, threshold, ContentAlignment.TopRight);

    // labels for the class at classIndex
    string nameAbove = problemData.GetClassName(classValues[classIndex]);
    TextAnnotation aboveLeft = CreateTextAnnotation(nameAbove, classIndex, xAxis, yAxis, xAxis.Minimum, threshold, ContentAlignment.BottomLeft);
    TextAnnotation aboveRight = CreateTextAnnotation(nameAbove, classIndex, xAxis, yAxis, xAxis.Maximum, threshold, ContentAlignment.BottomRight);

    chart.Annotations.Add(thresholdLine);
    chart.Annotations.Add(beneathLeft);
    chart.Annotations.Add(aboveLeft);
    chart.Annotations.Add(beneathRight);
    chart.Annotations.Add(aboveRight);

    // size the labels to their text first, then negate the extents of the right/above labels
    beneathLeft.ResizeToContent();
    beneathRight.ResizeToContent();
    aboveLeft.ResizeToContent();
    aboveRight.ResizeToContent();

    beneathRight.Width = -beneathRight.Width;
    aboveLeft.Height = -aboveLeft.Height;
    aboveRight.Height = -aboveRight.Height;
    aboveRight.Width = -aboveRight.Width;

    classIndex++;
  }
}
/// <summary>
/// Evaluates a tree inside the given execution context: the context is assigned to the interpreter
/// and estimation limits parameters, the quality is calculated with their actual values, and both
/// contexts are reset to null afterwards.
/// </summary>
/// <param name="context">The execution context used to resolve the parameter values.</param>
/// <param name="tree">The symbolic expression tree to evaluate.</param>
/// <param name="problemData">The classification problem data.</param>
/// <param name="rows">The dataset rows to evaluate.</param>
/// <returns>The quality values returned by <c>Calculate</c>.</returns>
public override double[] Evaluate(IExecutionContext context, ISymbolicExpressionTree tree, IClassificationProblemData problemData, IEnumerable<int> rows) {
  SymbolicDataAnalysisTreeInterpreterParameter.ExecutionContext = context;
  EstimationLimitsParameter.ExecutionContext = context;

  // NOTE(review): ApplyLinearScalingParameter is read without an execution context being set here — confirm this is intended
  double[] quality = Calculate(
    SymbolicDataAnalysisTreeInterpreterParameter.ActualValue,
    tree,
    EstimationLimitsParameter.ActualValue.Lower,
    EstimationLimitsParameter.ActualValue.Upper,
    problemData,
    rows,
    ApplyLinearScalingParameter.ActualValue.Value);

  // detach the context again (not reached if Calculate throws)
  SymbolicDataAnalysisTreeInterpreterParameter.ExecutionContext = null;
  EstimationLimitsParameter.ExecutionContext = null;

  return quality;
}
/// <summary>
/// Determines the best SVM parameters with a grid search over the configured ranges and creates
/// a support vector classification solution with them.
/// </summary>
/// <param name="problemData">The classification problem data.</param>
/// <param name="bestParameters">Receives the parameters selected by the grid search.</param>
/// <param name="nSv">Receives the number of support vectors reported when creating the solution.</param>
/// <param name="cvMse">Receives the value reported by the grid search.</param>
/// <returns>The created support vector classification solution.</returns>
private static SupportVectorClassificationSolution SvmGridSearch(IClassificationProblemData problemData, out svm_parameter bestParameters, out int nSv, out double cvMse) {
  bestParameters = SupportVectorMachineUtil.GridSearch(out cvMse, problemData, svmParameterRanges, numberOfFolds, shuffleFolds, maximumDegreeOfParallelism);

  // the solution factory expects the type names, not the numeric ids stored in the parameters
  string selectedSvmType = svmTypes[bestParameters.svm_type];
  string selectedKernelType = kernelTypes[bestParameters.kernel_type];

  // training and test error are reported by the factory but discarded here
  double trainingError, testError;
  return SupportVectorClassification.CreateSupportVectorClassificationSolution(
    problemData, problemData.AllowedInputVariables, selectedSvmType, selectedKernelType,
    bestParameters.C, bestParameters.nu, bestParameters.gamma, bestParameters.degree,
    out trainingError, out testError, out nSv);
}
/// <summary>
/// Calculates variable impacts for a nearest neighbour solution (k = 3) trained on the iris problem,
/// then replaces the solution's problem data with the mammography problem and calculates again.
/// No assertions are made in the method body.
/// </summary>
public void WrongDataSetVariableImpactClassificationTest() {
  IClassificationProblemData irisData = LoadIrisProblem();
  IClassificationSolution knnSolution = NearestNeighbourClassification.CreateNearestNeighbourClassificationSolution(irisData, 3);

  // first calculation with the original problem data
  ClassificationSolutionVariableImpactsCalculator.CalculateImpacts(knnSolution);
  Dictionary<string, double> expected = GetExpectedValuesForIrisKNNModel(); // retrieved but not compared

  // second calculation after swapping in a different dataset
  knnSolution.ProblemData = LoadMammographyProblem();
  ClassificationSolutionVariableImpactsCalculator.CalculateImpacts(knnSolution);
}
/// <summary>
/// Creates a matrix with one row per allowed input variable and <paramref name="dimensions"/>
/// columns, filled row by row with values drawn from the random parameter's generator.
/// </summary>
/// <param name="data">The classification problem data (only the number of allowed input variables is used).</param>
/// <param name="dimensions">The number of columns of the matrix.</param>
/// <returns>The randomly initialized matrix.</returns>
public override double[,] Initialize(IClassificationProblemData data, int dimensions) {
  var rowCount = data.AllowedInputVariables.Count();
  var generator = RandomParameter.ActualValue;

  var result = new double[rowCount, dimensions];
  for (int row = 0; row < rowCount; row++) {
    for (int column = 0; column < dimensions; column++) {
      result[row, column] = generator.NextDouble();
    }
  }
  return result;
}
/// <summary>
/// Creates a symbolic classification solution: clears the grammar of all top-level nodes of the
/// model's expression tree, adds the model length and model depth results and recalculates the results.
/// </summary>
/// <param name="model">The symbolic classification model.</param>
/// <param name="problemData">The classification problem data.</param>
public SymbolicClassificationSolution(ISymbolicClassificationModel model, IClassificationProblemData problemData)
  : base(model, problemData) {
  // set the grammar of every top-level node to null
  var nodesInPrefixOrder = model.SymbolicExpressionTree.Root.IterateNodesPrefix();
  foreach (var topLevelNode in nodesInPrefixOrder.OfType<SymbolicExpressionTreeTopLevelNode>()) {
    topLevelNode.SetGrammar(null);
  }

  var modelLength = new Result(ModelLengthResultName, "Length of the symbolic classification model.", new IntValue());
  Add(modelLength);

  var modelDepth = new Result(ModelDepthResultName, "Depth of the symbolic classification model.", new IntValue());
  Add(modelDepth);

  RecalculateResults();
}
/// <summary>
/// Trains a multinomial logit model with alglib (<c>mnltrainh</c>) on the training partition and
/// wraps it into a classification solution that holds a clone of the problem data.
/// </summary>
/// <param name="problemData">The classification problem data.</param>
/// <param name="rmsError">Receives the RMS error reported by alglib for the training matrix.</param>
/// <param name="relClassError">Receives the relative classification error reported by alglib for the training matrix.</param>
/// <returns>The created multinomial logit classification solution.</returns>
/// <exception cref="NotSupportedException">Thrown if the input matrix contains NaN or infinity values.</exception>
/// <exception cref="ArgumentException">Thrown if alglib reports an info code other than 1.</exception>
public static IClassificationSolution CreateLogitClassificationSolution(IClassificationProblemData problemData, out double rmsError, out double relClassError) {
  var dataset = problemData.Dataset;
  string targetVariable = problemData.TargetVariable;
  var doubleVariableNames = problemData.AllowedInputVariables.Where(dataset.VariableHasType<double>);
  var factorVariableNames = problemData.AllowedInputVariables.Where(dataset.VariableHasType<string>);
  IEnumerable<int> rows = problemData.TrainingIndices;

  // column layout: factor columns | double columns | target (last column)
  double[,] numericAndTarget = dataset.ToArray(doubleVariableNames.Concat(new string[] { targetVariable }), rows);
  var factorVariableValues = dataset.GetFactorVariableValues(factorVariableNames, rows);
  var factorColumns = dataset.ToArray(factorVariableValues, rows);
  double[,] inputMatrix = factorColumns.HorzCat(numericAndTarget);

  foreach (double cell in inputMatrix) {
    if (double.IsNaN(cell) || double.IsInfinity(cell)) {
      throw new NotSupportedException("Multinomial logit classification does not support NaN or infinity values in the input dataset.");
    }
  }

  alglib.logitmodel lm = new alglib.logitmodel();
  alglib.mnlreport rep = new alglib.mnlreport();
  int nRows = inputMatrix.GetLength(0);
  int nFeatures = inputMatrix.GetLength(1) - 1;

  // class values are taken from the target variable of the whole dataset, in ascending order
  double[] classValues = dataset.GetDoubleValues(targetVariable).Distinct().OrderBy(x => x).ToArray();
  int nClasses = classValues.Length;

  // map original class values to values [0..nClasses-1]
  var classIndices = new Dictionary<double, double>();
  for (int c = 0; c < nClasses; c++) {
    classIndices[classValues[c]] = c;
  }
  for (int r = 0; r < nRows; r++) {
    inputMatrix[r, nFeatures] = classIndices[inputMatrix[r, nFeatures]];
  }

  int info;
  alglib.mnltrainh(inputMatrix, nRows, nFeatures, nClasses, out info, out lm, out rep);
  if (info != 1) {
    throw new ArgumentException("Error in calculation of logit classification solution");
  }

  rmsError = alglib.mnlrmserror(lm, inputMatrix, nRows);
  relClassError = alglib.mnlrelclserror(lm, inputMatrix, nRows);

  var logitModel = new MultinomialLogitModel(lm, targetVariable, doubleVariableNames, factorVariableValues, classValues);
  var clonedProblemData = (IClassificationProblemData)problemData.Clone();
  return new MultinomialLogitClassificationSolution(logitModel, clonedProblemData);
}
/// <summary>
/// Stratified fold generation from classification data. Stratification means that we ensure the same distribution of class labels for each fold.
/// The training samples are grouped by class label and each group is split into <paramref name="numberOfFolds"/> parts. The final folds are
/// formed by joining the corresponding parts from each class label.
/// </summary>
/// <param name="problemData">The classification problem data.</param>
/// <param name="numberOfFolds">The number of folds in which to split the data.</param>
/// <param name="random">The random generator used to order the rows inside each fold randomly.</param>
/// <returns>A lazily generated sequence of folds, where a fold is represented by a list of row indices.</returns>
private static IEnumerable<IEnumerable<int>> GenerateFoldsStratified(IClassificationProblemData problemData, int numberOfFolds, IRandom random) {
  var classLabels = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices);
  var labelledRows = problemData.TrainingIndices
    .Zip(classLabels, (i, v) => new { Index = i, Value = v })
    .ToList();

  // one sequence of folds per class label
  IEnumerable<IEnumerable<IEnumerable<int>>> foldsPerClass = labelledRows
    .GroupBy(x => x.Value, x => x.Index)
    .Select(g => GenerateFolds(g, g.Count(), numberOfFolds));

  // advance all per-class sequences in lockstep; stop as soon as one of them is exhausted
  var cursors = foldsPerClass.Select(f => f.GetEnumerator()).ToList();
  while (cursors.All(c => c.MoveNext())) {
    var joinedFold = cursors
      .SelectMany(c => c.Current)
      .OrderBy(x => random.Next())
      .ToList();
    yield return joinedFold;
  }
}
/// <summary>
/// Checks the variable impacts of a symbolic nearest neighbour model (k = 3) built from the custom
/// "no influence of X1" expression tree against the expected values.
/// </summary>
public void CustomModelVariableImpactNoInfluenceTest() {
  IClassificationProblemData defaultProblem = CreateDefaultProblem();
  ISymbolicExpressionTree noInfluenceTree = CreateCustomExpressionTreeNoInfluenceX1();

  var nearestNeighbourModel = new SymbolicNearestNeighbourClassificationModel(
    defaultProblem.TargetVariable, 3, noInfluenceTree, new SymbolicDataAnalysisExpressionTreeInterpreter());
  nearestNeighbourModel.RecalculateModelParameters(defaultProblem, defaultProblem.TrainingIndices);

  // the solution works on its own clone of the problem data
  IClassificationSolution solutionUnderTest = new ClassificationSolution(nearestNeighbourModel, (IClassificationProblemData)defaultProblem.Clone());

  Dictionary<string, double> expected = GetExpectedValuesForCustomProblemNoInfluence();
  CheckDefaultAsserts(solutionUnderTest, expected);
}
/// <summary>
/// Creates the initial matrix by building a PCA basis with alglib (<c>pcabuildbasis</c>) from the
/// allowed input variables of the training partition.
/// </summary>
/// <param name="data">The classification problem data.</param>
/// <param name="dimensions">Not used by this implementation.</param>
/// <returns>The basis matrix returned by <c>alglib.pcabuildbasis</c>.</returns>
public override double[,] Initialize(IClassificationProblemData data, int dimensions) {
  var rowCount = data.TrainingIndices.Count();
  var featureCount = data.AllowedInputVariables.Count();
  var inputMatrix = AlglibUtil.PrepareInputMatrix(data.Dataset, data.AllowedInputVariables, data.TrainingIndices);

  // NOTE: the info code and the variance values reported by alglib are not used
  int info;
  double[] variances;
  double[,] basis;
  alglib.pcabuildbasis(inputMatrix, rowCount, featureCount, out info, out variances, out basis);

  return basis;
}
/// <summary>
/// Calculates the average classification penalty of a model on the given rows. For every row the
/// penalty between the actual class value and the model's estimated class value is looked up in
/// the problem data.
/// </summary>
/// <param name="model">The classification model providing the estimated class values.</param>
/// <param name="problemData">The problem data providing the dataset, the target variable and the penalties.</param>
/// <param name="rows">The dataset rows to evaluate.</param>
/// <returns>
/// The sum of the penalties divided by the number of rows, or NaN if the model yields no estimations.
/// </returns>
public static double Calculate(IClassificationModel model, IClassificationProblemData problemData, IEnumerable<int> rows) {
  // The enumerator is advanced manually in lockstep with the rows, so it has to be disposed explicitly.
  using (var estimations = model.GetEstimatedClassValues(problemData.Dataset, rows).GetEnumerator()) {
    if (!estimations.MoveNext()) {
      return double.NaN;
    }

    var penalty = 0.0;
    var count = 0;
    foreach (var r in rows) {
      var actualClass = problemData.Dataset.GetDoubleValue(problemData.TargetVariable, r);
      penalty += problemData.GetClassificationPenalty(actualClass, estimations.Current);
      estimations.MoveNext();
      count++;
    }

    return penalty / count;
  }
}
/// <summary>
/// Trains a support vector machine on the training partition of the problem data. Every class gets
/// a weight equal to the sum of the classification penalties between it and all other classes.
/// The data is range-scaled before training.
/// </summary>
/// <param name="problemData">The classification problem data.</param>
/// <param name="allowedInputVariables">The input variables used for training.</param>
/// <param name="svmType">The libsvm svm type id.</param>
/// <param name="kernelType">The libsvm kernel type id.</param>
/// <param name="cost">Assigned to the C parameter.</param>
/// <param name="nu">Assigned to the nu parameter.</param>
/// <param name="gamma">Assigned to the gamma parameter.</param>
/// <param name="degree">Assigned to the degree parameter.</param>
/// <param name="model">Receives the trained support vector machine model.</param>
/// <param name="nSv">Receives the number of support vectors of the trained model.</param>
public static void Run(IClassificationProblemData problemData, IEnumerable<string> allowedInputVariables, int svmType, int kernelType, double cost, double nu, double gamma, int degree, out ISupportVectorMachineModel model, out int nSv) {
  var dataset = problemData.Dataset;
  string targetVariable = problemData.TargetVariable;
  IEnumerable<int> rows = problemData.TrainingIndices;

  svm_parameter parameter = new svm_parameter {
    svm_type = svmType,
    kernel_type = kernelType,
    C = cost,
    nu = nu,
    gamma = gamma,
    cache_size = 500,
    probability = 0,
    eps = 0.001,
    degree = degree,
    shrinking = 1,
    coef0 = 0
  };

  // class weights: summed penalties against every other class
  var labels = new List<int>();
  var labelWeights = new List<double>();
  foreach (double classValue in problemData.ClassValues) {
    double penaltySum = 0.0;
    foreach (double other in problemData.ClassValues) {
      if (classValue.IsAlmost(other)) {
        continue;
      }
      penaltySum += problemData.GetClassificationPenalty(classValue, other);
    }
    labels.Add((int)classValue);
    labelWeights.Add(penaltySum);
  }
  parameter.weight_label = labels.ToArray();
  parameter.weight = labelWeights.ToArray();

  // scale the problem and train on the scaled data
  svm_problem rawProblem = SupportVectorMachineUtil.CreateSvmProblem(dataset, targetVariable, allowedInputVariables, rows);
  RangeTransform scaling = RangeTransform.Compute(rawProblem);
  svm_problem scaledProblem = scaling.Scale(rawProblem);
  var trainedSvm = svm.svm_train(scaledProblem, parameter);

  nSv = trainedSvm.SV.Length;
  model = new SupportVectorMachineModel(trainedSvm, scaling, targetVariable, allowedInputVariables, problemData.ClassValues);
}
/// <summary>
/// Shared assertions for the variable impact tests: the impacts calculated from the solution and
/// from its model (on the training rows) must be equal, and they must match the expected impacts.
/// </summary>
/// <param name="solution">The solution whose variable impacts are checked.</param>
/// <param name="expectedImpacts">The expected impact per variable name.</param>
private void CheckDefaultAsserts(IClassificationSolution solution, Dictionary<string, double> expectedImpacts) {
  IClassificationProblemData problemData = solution.ProblemData;
  IEnumerable<double> estimatedValues = solution.GetEstimatedClassValues(solution.ProblemData.TrainingIndices);

  var solutionImpacts = ClassificationSolutionVariableImpactsCalculator.CalculateImpacts(solution);
  var modelImpacts = ClassificationSolutionVariableImpactsCalculator.CalculateImpacts(solution.Model, problemData, estimatedValues, problemData.TrainingIndices);

  //Both ways should return equal results
  Assert.IsTrue(solutionImpacts.SequenceEqual(modelImpacts));

  //Check if impacts are as expected (expected value first, so that failure messages are labelled correctly)
  Assert.AreEqual(expectedImpacts.Count, modelImpacts.Count());
  Assert.IsTrue(modelImpacts.All(v => v.Item2.IsAlmost(expectedImpacts[v.Item1])));
}
/// <summary>
/// Builds a PCA basis with alglib (<c>pcabuildbasis</c>) from the allowed input variables of the
/// training partition and returns it as the initial matrix.
/// </summary>
/// <param name="data">The classification problem data.</param>
/// <param name="dimensions">Not used by this implementation.</param>
/// <returns>The basis matrix returned by <c>alglib.pcabuildbasis</c>.</returns>
public override double[,] Initialize(IClassificationProblemData data, int dimensions) {
  var sampleCount = data.TrainingIndices.Count();
  var variableCount = data.AllowedInputVariables.Count();
  var trainingMatrix = AlglibUtil.PrepareInputMatrix(data.Dataset, data.AllowedInputVariables, data.TrainingIndices);

  // only the basis is returned; info code and variance values are ignored
  int info;
  double[] varianceValues;
  double[,] basisMatrix;
  alglib.pcabuildbasis(trainingMatrix, sampleCount, variableCount, out info, out varianceValues, out basisMatrix);

  return basisMatrix;
}
/// <summary>
/// Creates a ZeroR solution: a constant model that always predicts the class value occurring most
/// often in the training partition.
/// </summary>
/// <param name="problemData">The classification problem data.</param>
/// <returns>The classification solution created from the constant model.</returns>
public static IClassificationSolution CreateZeroRSolution(IClassificationProblemData problemData) {
  var dataset = problemData.Dataset;
  string target = problemData.TargetVariable;
  var targetValues = dataset.GetDoubleValues(target, problemData.TrainingIndices);

  // number of observations per class value
  var observationsPerClass = targetValues
    .GroupBy(x => x)
    .ToDictionary(g => g.Key, g => g.Count());

  // if multiple classes have the same number of observations then simply take the first one
  var dominantClass = observationsPerClass
    .MaxItems(kvp => kvp.Value)
    .Select(x => x.Key)
    .First();

  var constantModel = new ConstantModel(dominantClass);
  return constantModel.CreateClassificationSolution(problemData);
}
/// <summary>
/// Measures how long the variable impact calculation takes for a nearest neighbour solution (k = 3)
/// on a generated problem with 1500 rows and 77 columns, and writes the throughput
/// (cells per millisecond) to the test output.
/// </summary>
public void PerformanceVariableImpactClassificationTest() {
  int rows = 1500;
  int columns = 77;
  IClassificationProblemData problemData = CreateDefaultProblem(rows, columns);
  IClassificationSolution solution = NearestNeighbourClassification.CreateNearestNeighbourClassificationSolution(problemData, 3);

  Stopwatch watch = Stopwatch.StartNew();
  ClassificationSolutionVariableImpactsCalculator.CalculateImpacts(solution);
  watch.Stop();

  // Divide by the fractional elapsed time: the integer ElapsedMilliseconds can be 0 for a very
  // fast run, which would throw a DivideByZeroException, and it truncates the reported throughput.
  double cellsPerMillisecond = rows * columns / watch.Elapsed.TotalMilliseconds;

  TestContext.WriteLine("");
  TestContext.WriteLine("Calculated cells per millisecond: {0}.", cellsPerMillisecond);
}
/// <summary>
/// Trains a support vector machine on the problem data of the algorithm, optionally adds the
/// resulting classification solution to the results, and adds the training accuracy, test accuracy
/// and the number of support vectors as results.
/// </summary>
protected override void Run() {
  IClassificationProblemData problemData = Problem.ProblemData;
  IEnumerable<string> selectedInputVariables = problemData.AllowedInputVariables;
  int nSv;
  ISupportVectorMachineModel model;
  Run(problemData, selectedInputVariables, GetSvmType(SvmType.Value), GetKernelType(KernelType.Value), Cost.Value, Nu.Value, Gamma.Value, Degree.Value, out model, out nSv);

  if (CreateSolution) {
    // the solution gets its own clone of the problem data
    var solution = new SupportVectorClassificationSolution((SupportVectorMachineModel)model, (IClassificationProblemData)problemData.Clone());
    Results.Add(new Result("Support vector classification solution", "The support vector classification solution.", solution));
  }

  {
    // calculate classification metrics
    var ds = problemData.Dataset;
    var trainRows = problemData.TrainingIndices;
    var testRows = problemData.TestIndices;
    var yTrain = ds.GetDoubleValues(problemData.TargetVariable, trainRows);
    var yTest = ds.GetDoubleValues(problemData.TargetVariable, testRows);
    var yPredTrain = model.GetEstimatedClassValues(ds, trainRows);
    var yPredTest = model.GetEstimatedClassValues(ds, testRows);

    // a failed accuracy calculation is reported as double.MaxValue
    OnlineCalculatorError error;
    var trainAccuracy = OnlineAccuracyCalculator.Calculate(yPredTrain, yTrain, out error);
    if (error != OnlineCalculatorError.None) {
      trainAccuracy = double.MaxValue;
    }
    var testAccuracy = OnlineAccuracyCalculator.Calculate(yPredTest, yTest, out error);
    if (error != OnlineCalculatorError.None) {
      testAccuracy = double.MaxValue;
    }

    // The descriptions previously spoke of "mean of squared errors of the SVR solution",
    // which does not match the accuracy values that are actually stored.
    Results.Add(new Result("Accuracy (training)", "The accuracy of the support vector classification solution on the training partition.", new DoubleValue(trainAccuracy)));
    Results.Add(new Result("Accuracy (test)", "The accuracy of the support vector classification solution on the test partition.", new DoubleValue(testAccuracy)));
    Results.Add(new Result("Number of support vectors", "The number of support vectors of the support vector classification solution.", new IntValue(nSv)));
  }
}
/// <summary>
/// If no classification solutions are present, creates one solution per model of the ensemble
/// from a clone of the problem data whose training and test partitions are set to the ranges
/// stored for that model. Afterwards calls RegisterClassificationSolutionsEventHandler.
/// </summary>
private void AfterDeserialization() {
    bool solutionsMissing = !classificationSolutions.Any();
    if (solutionsMissing) {
        foreach (var member in Model.Models) {
            var memberData = (IClassificationProblemData)ProblemData.Clone();

            // apply the training range stored for this model
            var storedTraining = trainingPartitions[member];
            memberData.TrainingPartition.Start = storedTraining.Start;
            memberData.TrainingPartition.End = storedTraining.End;

            // apply the test range stored for this model
            var storedTest = testPartitions[member];
            memberData.TestPartition.Start = storedTest.Start;
            memberData.TestPartition.End = storedTest.End;

            var memberSolution = member.CreateClassificationSolution(memberData);
            classificationSolutions.Add(memberSolution);
        }
    }

    RegisterClassificationSolutionsEventHandler();
}
/// <summary>
/// Evaluates the tree on the given rows and returns the mean squared error between the
/// target values and the estimated values.
/// </summary>
/// <param name="interpreter">Interpreter used to evaluate the tree.</param>
/// <param name="solution">The symbolic expression tree to evaluate.</param>
/// <param name="lowerEstimationLimit">Lower limit applied to the estimated values.</param>
/// <param name="upperEstimationLimit">Upper limit applied to the estimated values.</param>
/// <param name="problemData">Provides the dataset and the target variable.</param>
/// <param name="rows">The rows to evaluate.</param>
/// <param name="applyLinearScaling">When true the error is obtained through CalculateWithScaling; otherwise the estimates are only limited to the given range.</param>
/// <returns>The mean squared error, or NaN if the calculator reports an error.</returns>
public static double Calculate(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, ISymbolicExpressionTree solution, double lowerEstimationLimit, double upperEstimationLimit, IClassificationProblemData problemData, IEnumerable<int> rows, bool applyLinearScaling) {
    IEnumerable<double> predicted = interpreter.GetSymbolicExpressionTreeValues(solution, problemData.Dataset, rows);
    IEnumerable<double> observed = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);

    OnlineCalculatorError calculationError;
    double meanSquaredError;

    if (!applyLinearScaling) {
        IEnumerable<double> clamped = predicted.LimitToRange(lowerEstimationLimit, upperEstimationLimit);
        meanSquaredError = OnlineMeanSquaredErrorCalculator.Calculate(observed, clamped, out calculationError);
    } else {
        var calculator = new OnlineMeanSquaredErrorCalculator();
        CalculateWithScaling(observed, predicted, lowerEstimationLimit, upperEstimationLimit, calculator, problemData.Dataset.Rows);
        calculationError = calculator.ErrorState;
        meanSquaredError = calculator.MeanSquaredError;
    }

    return calculationError == OnlineCalculatorError.None ? meanSquaredError : Double.NaN;
}
/// <summary>
/// Evaluates the tree on the given rows and returns two values: the squared Pearson
/// correlation between target and estimated values, and the length of the tree.
/// </summary>
/// <param name="interpreter">Interpreter used to evaluate the tree.</param>
/// <param name="solution">The symbolic expression tree to evaluate.</param>
/// <param name="lowerEstimationLimit">Lower limit applied to the estimated values.</param>
/// <param name="upperEstimationLimit">Upper limit applied to the estimated values.</param>
/// <param name="problemData">Provides the dataset and the target variable.</param>
/// <param name="rows">The rows to evaluate.</param>
/// <param name="applyLinearScaling">When true the correlation is obtained through CalculateWithScaling; otherwise the estimates are only limited to the given range.</param>
/// <returns>An array { R squared, tree length }; the first entry is NaN if the calculator reports an error.</returns>
public static double[] Calculate(ISymbolicDataAnalysisExpressionTreeInterpreter interpreter, ISymbolicExpressionTree solution, double lowerEstimationLimit, double upperEstimationLimit, IClassificationProblemData problemData, IEnumerable<int> rows, bool applyLinearScaling) {
    IEnumerable<double> predicted = interpreter.GetSymbolicExpressionTreeValues(solution, problemData.Dataset, rows);
    IEnumerable<double> observed = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);

    OnlineCalculatorError calculationError;
    double correlation;

    if (!applyLinearScaling) {
        IEnumerable<double> clamped = predicted.LimitToRange(lowerEstimationLimit, upperEstimationLimit);
        correlation = OnlinePearsonsRCalculator.Calculate(observed, clamped, out calculationError);
    } else {
        var calculator = new OnlinePearsonsRCalculator();
        CalculateWithScaling(observed, predicted, lowerEstimationLimit, upperEstimationLimit, calculator, problemData.Dataset.Rows);
        calculationError = calculator.ErrorState;
        correlation = calculator.R;
    }

    // an error from the calculator invalidates the correlation
    if (calculationError != OnlineCalculatorError.None) {
        correlation = double.NaN;
    }

    double rSquared = correlation * correlation;
    return new double[2] { rSquared, solution.Length };
}
/// <summary>
/// Builds the input matrix (allowed input variables followed by the target variable) for the
/// training rows, replaces the class values by sequential indices and returns the matrix
/// computed by alglib.fisherldan.
/// </summary>
/// <param name="data">The classification problem data.</param>
/// <param name="dimensions">Not used by this implementation.</param>
/// <returns>The matrix produced by alglib.fisherldan.</returns>
public override double[,] Initialize(IClassificationProblemData data, int dimensions) {
    int rowCount = data.TrainingIndices.Count();
    int inputCount = data.AllowedInputVariables.Count();

    var columnNames = data.AllowedInputVariables.Concat(data.TargetVariable.ToEnumerable());
    var samples = AlglibUtil.PrepareInputMatrix(data.Dataset, columnNames, data.TrainingIndices);

    // map class values to sequential natural numbers (required by alglib),
    // numbered in order of first occurrence in the training rows
    var classIndexByValue = new Dictionary<double, int>();
    foreach (var classValue in data.Dataset.GetDoubleValues(data.TargetVariable, data.TrainingIndices).Distinct()) {
        classIndexByValue.Add(classValue, classIndexByValue.Count);
    }

    // the target is the last column of the prepared matrix
    for (int r = 0; r < rowCount; r++) {
        samples[r, inputCount] = classIndexByValue[samples[r, inputCount]];
    }

    int info;
    double[,] projection;
    alglib.fisherldan(samples, rowCount, inputCount, classIndexByValue.Count, out info, out projection);
    return projection;
}
/// <summary>
/// Creates a classification solution for the given model and problem data, allocates the
/// evaluation cache and calculates the classification results.
/// </summary>
/// <param name="model">The classification model of the solution.</param>
/// <param name="problemData">The problem data of the solution.</param>
public ClassificationSolution(IClassificationModel model, IClassificationProblemData problemData)
    : base(model, problemData) {
    // the cache is pre-sized with the number of rows in the dataset
    int initialCapacity = problemData.Dataset.Rows;
    evaluationCache = new Dictionary<int, double>(initialCapacity);

    CalculateClassificationResults();
}
/// <summary>
/// Implemented by subclasses to produce a matrix from the given classification problem data.
/// </summary>
/// <param name="data">The classification problem data to initialize from.</param>
/// <param name="dimensions">NOTE(review): presumably the requested number of dimensions of the result; confirm against the implementations.</param>
/// <returns>A two-dimensional array of doubles computed by the concrete implementation.</returns>
public abstract double[,] Initialize(IClassificationProblemData data, int dimensions);
/// <summary>
/// Creates a OneR classification solution for this model.
/// </summary>
/// <param name="problemData">The problem data; a new ClassificationProblemData instance is constructed from it for the solution.</param>
/// <returns>The newly created solution.</returns>
public override IClassificationSolution CreateClassificationSolution(IClassificationProblemData problemData) {
    var solutionData = new ClassificationProblemData(problemData);
    return new OneRClassificationSolution(this, solutionData);
}
/// <summary>
/// Creates a classification solution for this model by delegating to
/// CreateDiscriminantClassificationSolution.
/// </summary>
/// <param name="problemData">The problem data for the solution.</param>
/// <returns>The created discriminant function classification solution.</returns>
public override ISymbolicClassificationSolution CreateClassificationSolution(IClassificationProblemData problemData) {
    var discriminantSolution = CreateDiscriminantClassificationSolution(problemData);
    return discriminantSolution;
}
/// <summary>
/// Creates a classification solution for the given neural network ensemble model and problem data.
/// All work is done by the base class constructor.
/// </summary>
/// <param name="nnModel">The neural network ensemble model.</param>
/// <param name="problemData">The classification problem data.</param>
public NeuralNetworkEnsembleClassificationSolution(INeuralNetworkEnsembleModel nnModel, IClassificationProblemData problemData)
    : base(nnModel, problemData) {
}
/// <summary>
/// Creates a symbolic discriminant function classification solution for this model.
/// </summary>
/// <param name="problemData">The problem data; a new ClassificationProblemData instance is constructed from it for the solution.</param>
/// <returns>The newly created solution.</returns>
public SymbolicDiscriminantFunctionClassificationSolution CreateDiscriminantClassificationSolution(IClassificationProblemData problemData) {
    var solutionData = new ClassificationProblemData(problemData);
    return new SymbolicDiscriminantFunctionClassificationSolution(this, solutionData);
}
/// <summary>
/// Recomputes the class values and thresholds of this model with the threshold calculator,
/// using the target values and this model's estimated values on the given rows.
/// </summary>
/// <param name="problemData">Provides the dataset and the target variable.</param>
/// <param name="rows">The rows used for the recalculation.</param>
public override void RecalculateModelParameters(IClassificationProblemData problemData, IEnumerable<int> rows) {
    var observedClasses = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, rows);
    var modelOutputs = GetEstimatedValues(problemData.Dataset, rows);

    double[] newClassValues;
    double[] newThresholds;
    thresholdCalculator.Calculate(problemData, modelOutputs, observedClasses, out newClassValues, out newThresholds);

    SetThresholdsAndClassValues(newThresholds, newClassValues);
}
/// <summary>
/// Explicit interface implementation that delegates to CreateDiscriminantClassificationSolution.
/// </summary>
/// <param name="problemData">The problem data for the solution.</param>
/// <returns>The created discriminant function classification solution.</returns>
IDiscriminantFunctionClassificationSolution IDiscriminantFunctionClassificationModel.CreateDiscriminantFunctionClassificationSolution(IClassificationProblemData problemData) {
    var discriminantSolution = CreateDiscriminantClassificationSolution(problemData);
    return discriminantSolution;
}
/// <summary>
/// Calculates class values and thresholds by forwarding all arguments to
/// AccuracyMaximizationThresholdCalculator.CalculateThresholds.
/// </summary>
/// <param name="problemData">The classification problem data.</param>
/// <param name="estimatedValues">The estimated values.</param>
/// <param name="targetClassValues">The target class values.</param>
/// <param name="classValues">Receives the calculated class values.</param>
/// <param name="thresholds">Receives the calculated thresholds.</param>
public override void Calculate(
    IClassificationProblemData problemData,
    IEnumerable<double> estimatedValues,
    IEnumerable<double> targetClassValues,
    out double[] classValues,
    out double[] thresholds) {
    AccuracyMaximizationThresholdCalculator.CalculateThresholds(
        problemData,
        estimatedValues,
        targetClassValues,
        out classValues,
        out thresholds);
}
/// <summary>
/// Calculates class values and class thresholds from estimated values and target class values.
/// The class values are the distinct target class values ordered by the median of the estimated
/// values observed for each class. thresholds[0] is always negative infinity; every further
/// threshold is found by scanning candidate thresholds and scoring each candidate with the sum of
/// problemData.GetClassificationPenalty over all (estimated, target) pairs.
/// </summary>
/// <remarks>
/// For class index i (starting at 1) the scan starts at the larger of thresholds[i - 1] and the
/// smallest estimated value and advances while the candidate is below the largest estimated value.
/// The step is the larger of (max - min) / 100 and 10e-5.
/// The lowest score wins. The first candidate reaching the best score is remembered as the lowest
/// best threshold; the highest best threshold is extended only while directly following candidates
/// have an equal score (absolute difference below double.Epsilon).
/// The resulting threshold is the average of the lowest and highest best threshold, weighted by
/// the penalties GetClassificationPenalty(classValues[i - 1], classValues[i]) and
/// GetClassificationPenalty(classValues[i], classValues[i - 1]) respectively.
/// If the scan loop is never entered (start candidate not below the largest estimated value),
/// both best thresholds keep their initial NaN value and thresholds[i] becomes NaN.
/// Max() and Min() are called on the materialized estimated values, so an empty sequence makes
/// them throw before any result is produced.
/// NOTE(review): the branch for pairs at or below the lower threshold reads classValues[i - 2];
/// for i == 1 that index is -1. The branch can only be taken then when an estimated value is
/// negative infinity (the lower threshold is negative infinity) - confirm that callers never pass
/// such values.
/// NOTE(review): if both weighting penalties are zero the final division is 0 / 0 - verify the
/// penalty matrix rules this out.
/// </remarks>
/// <param name="problemData">Supplies the misclassification penalties via GetClassificationPenalty.</param>
/// <param name="estimatedValues">The estimated values; enumerated once into a list.</param>
/// <param name="targetClassValues">The target class values, paired with the estimated values by position.</param>
/// <param name="classValues">Receives the class values ordered by the median of their estimated values.</param>
/// <param name="thresholds">Receives one threshold per class value; the first one is negative infinity.</param>
public static void CalculateThresholds(IClassificationProblemData problemData, IEnumerable<double> estimatedValues, IEnumerable<double> targetClassValues, out double[] classValues, out double[] thresholds) { const int slices = 100; const double minThresholdInc = 10e-5; // necessary to prevent infinite loop when maxEstimated - minEstimated is effectively zero (constant model) List<double> estimatedValuesList = estimatedValues.ToList(); double maxEstimatedValue = estimatedValuesList.Max(); double minEstimatedValue = estimatedValuesList.Min(); double thresholdIncrement = Math.Max((maxEstimatedValue - minEstimatedValue) / slices, minThresholdInc); var estimatedAndTargetValuePairs = estimatedValuesList.Zip(targetClassValues, (x, y) => new { EstimatedValue = x, TargetClassValue = y }) .OrderBy(x => x.EstimatedValue).ToList(); classValues = estimatedAndTargetValuePairs.GroupBy(x => x.TargetClassValue) .Select(x => new { Median = x.Select(y => y.EstimatedValue).Median(), Class = x.Key }) .OrderBy(x => x.Median).Select(x => x.Class).ToArray(); int nClasses = classValues.Length; thresholds = new double[nClasses]; thresholds[0] = double.NegativeInfinity; // incrementally calculate accuracy of all possible thresholds for (int i = 1; i < thresholds.Length; i++) { double lowerThreshold = thresholds[i - 1]; double actualThreshold = Math.Max(lowerThreshold, minEstimatedValue); double lowestBestThreshold = double.NaN; double highestBestThreshold = double.NaN; double bestClassificationScore = double.PositiveInfinity; bool seriesOfEqualClassificationScores = false; while (actualThreshold < maxEstimatedValue) { double classificationScore = 0.0; foreach (var pair in estimatedAndTargetValuePairs) { //all positives if (pair.TargetClassValue.IsAlmost(classValues[i - 1])) { if (pair.EstimatedValue > lowerThreshold && pair.EstimatedValue <= actualThreshold) //true positive classificationScore += problemData.GetClassificationPenalty(pair.TargetClassValue, pair.TargetClassValue); else //false 
negative classificationScore += problemData.GetClassificationPenalty(pair.TargetClassValue, classValues[i]); } //all negatives else { //false positive if (pair.EstimatedValue > lowerThreshold && pair.EstimatedValue <= actualThreshold) classificationScore += problemData.GetClassificationPenalty(pair.TargetClassValue, classValues[i - 1]); else if (pair.EstimatedValue <= lowerThreshold) classificationScore += problemData.GetClassificationPenalty(pair.TargetClassValue, classValues[i - 2]); else if (pair.EstimatedValue > actualThreshold) { if (pair.TargetClassValue < classValues[i - 1]) //negative in wrong class, consider upper class classificationScore += problemData.GetClassificationPenalty(pair.TargetClassValue, classValues[i]); else //true negative, must be optimized by the other thresholds classificationScore += problemData.GetClassificationPenalty(pair.TargetClassValue, pair.TargetClassValue); } } } //new best classification score found if (classificationScore < bestClassificationScore) { bestClassificationScore = classificationScore; lowestBestThreshold = actualThreshold; highestBestThreshold = actualThreshold; seriesOfEqualClassificationScores = true; } //equal classification scores => if seriesOfEqualClassifcationScores == true update highest threshold else if (Math.Abs(classificationScore - bestClassificationScore) < double.Epsilon && seriesOfEqualClassificationScores) highestBestThreshold = actualThreshold; //worse classificatoin score found reset seriesOfEqualClassifcationScores else seriesOfEqualClassificationScores = false; actualThreshold += thresholdIncrement; } //scale lowest thresholds and highest found optimal threshold according to the misclassification matrix double falseNegativePenalty = problemData.GetClassificationPenalty(classValues[i], classValues[i - 1]); double falsePositivePenalty = problemData.GetClassificationPenalty(classValues[i - 1], classValues[i]); thresholds[i] = (lowestBestThreshold * falsePositivePenalty + highestBestThreshold * 
falseNegativePenalty) / (falseNegativePenalty + falsePositivePenalty); } }
/// <summary>
/// Calculates the accuracy of the model's estimated class values against the target class
/// values on the given rows, for use in impact calculations.
/// </summary>
/// <param name="model">The model whose estimated class values are evaluated.</param>
/// <param name="problemData">Provides the dataset and the target variable.</param>
/// <param name="rows">The rows to evaluate.</param>
/// <returns>The accuracy, or 0.0 if the accuracy calculator reports an error.</returns>
public static double CalculateQualityForImpacts(ISymbolicClassificationModel model, IClassificationProblemData problemData, IEnumerable<int> rows) {
    var data = problemData.Dataset;
    var expectedClasses = data.GetDoubleValues(problemData.TargetVariable, rows);
    var predictedClasses = model.GetEstimatedClassValues(data, rows);

    OnlineCalculatorError calculationError;
    var accuracy = OnlineAccuracyCalculator.Calculate(expectedClasses, predictedClasses, out calculationError);

    if (calculationError == OnlineCalculatorError.None) {
        return accuracy;
    }
    return 0.0;
}
/// <summary>
/// Creates a classification solution for the given NCA model and problem data.
/// All work is done by the base class constructor.
/// </summary>
/// <param name="ncaModel">The NCA model.</param>
/// <param name="problemData">The classification problem data.</param>
public NcaClassificationSolution(INcaModel ncaModel, IClassificationProblemData problemData)
    : base(ncaModel, problemData) {
}
/// <summary>
/// Creates a constant classification solution for this model.
/// </summary>
/// <param name="problemData">The problem data; a new ClassificationProblemData instance is constructed from it for the solution.</param>
/// <returns>The newly created solution.</returns>
public IClassificationSolution CreateClassificationSolution(IClassificationProblemData problemData) {
    var solutionData = new ClassificationProblemData(problemData);
    return new ConstantClassificationSolution(this, solutionData);
}
/// <summary>
/// Implemented by subclasses to calculate class values and thresholds from estimated values
/// and target class values.
/// </summary>
/// <param name="problemData">The classification problem data.</param>
/// <param name="estimatedValues">The estimated values.</param>
/// <param name="targetClassValues">The target class values.</param>
/// <param name="classValues">Receives the calculated class values.</param>
/// <param name="thresholds">Receives the calculated thresholds.</param>
public abstract void Calculate(IClassificationProblemData problemData, IEnumerable<double> estimatedValues, IEnumerable<double> targetClassValues, out double[] classValues, out double[] thresholds);
/// <summary>
/// Creates a classification solution for the given problem data and random forest model.
/// Note that the parameter order is (problemData, model), while the base constructor is
/// called with (model, problemData).
/// </summary>
/// <param name="problemData">The classification problem data.</param>
/// <param name="randomForestModel">The random forest model.</param>
public RandomForestClassificationSolution(IClassificationProblemData problemData, IRandomForestModel randomForestModel)
    : base(randomForestModel, problemData) {
}