private IClassificationProblemData CreateDefaultProblem(int rows, int columns) {
  List<string> allowedInputVariables = Enumerable.Range(0, columns - 1).Select(x => "x" + x.ToString()).ToList();
  string targetVariable = "y";
  var variableNames = allowedInputVariables.Union(targetVariable.ToEnumerable());
  double[,] variableValues = new double[rows, columns];
  FastRandom random = new FastRandom(12345);
  int len0 = variableValues.GetLength(0);
  int len1 = variableValues.GetLength(1);
  for (int i = 0; i < len0; i++) {
    for (int j = 0; j < len1; j++) {
      if (j == len1 - 1) {
        // last column is the target: alternating class values 0/1
        variableValues[i, j] = (j + i) % 2;
      } else {
        variableValues[i, j] = random.Next(1, 100);
      }
    }
  }
  Dataset dataset = new Dataset(variableNames, variableValues);
  var ret = new ClassificationProblemData(dataset, allowedInputVariables, targetVariable);
  ret.SetClassName(0, "NOK");
  ret.SetClassName(1, "OK");
  return ret;
}
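// A minimal usage sketch for CreateDefaultProblem (hypothetical harness code, not part of the
// original source; it assumes the standard HeuristicLab IClassificationProblemData members
// TargetVariable, AllowedInputVariables, and ClassNames):
private void PrintDefaultProblemSummary() {
  var problemData = CreateDefaultProblem(100, 6); // inputs x0..x4 plus target y
  Console.WriteLine(problemData.TargetVariable);                           // "y"
  Console.WriteLine(string.Join(", ", problemData.AllowedInputVariables)); // "x0, x1, x2, x3, x4"
  Console.WriteLine(string.Join(", ", problemData.ClassNames));            // "NOK, OK"
}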
protected override void itemsListView_DragDrop(object sender, DragEventArgs e) {
  if (e.Effect != DragDropEffects.None) {
    var droppedData = e.Data.GetData(HeuristicLab.Common.Constants.DragDropDataFormat);
    if (droppedData is IValueParameter) {
      droppedData = ((IValueParameter)droppedData).Value;
    } else if (droppedData is IClassificationProblem) {
      droppedData = ((IClassificationProblem)droppedData).ProblemData;
    }

    ClassificationEnsembleProblemData ensembleProblemData = droppedData as ClassificationEnsembleProblemData;
    ClassificationProblemData problemData = droppedData as ClassificationProblemData;
    if (ensembleProblemData != null) {
      Content.ProblemData = (ClassificationEnsembleProblemData)ensembleProblemData.Clone();
    } else if (problemData != null) {
      Content.ProblemData = new ClassificationEnsembleProblemData((IClassificationProblemData)problemData.Clone());
    }
  }
}
public IClassificationProblemData GenerateClassificationData(Dataset dataset) {
  IClassificationProblemData claData = new ClassificationProblemData(dataset, AllowedInputVariables, TargetVariable);
  claData.Name = this.Name;
  claData.Description = this.Description;
  claData.TrainingPartition.Start = this.TrainingPartitionStart;
  claData.TrainingPartition.End = this.TrainingPartitionEnd;
  claData.TestPartition.Start = this.TestPartitionStart;
  claData.TestPartition.End = this.TestPartitionEnd;
  return claData;
}
protected void SetPossibleTargetVariables() {
  var dataset = PreviewDatasetMatrix.Content as Dataset;
  if (dataset != null) {
    IEnumerable<string> possibleTargetVariables = ClassificationProblemData.CheckVariablesForPossibleTargetVariables(dataset);
    // Remove " (Double)" at the end of the variable name (last 9 chars)
    TargetVariableComboBox.DataSource = possibleTargetVariables.Select(x => x.Substring(0, x.Length - 9)).ToList();
  }
}
private IDataAnalysisProblemData CreateClassificationData(ClassificationProblemData oldProblemData) {
  var targetVariable = oldProblemData.TargetVariable;
  if (!context.Data.VariableNames.Contains(targetVariable)) {
    targetVariable = context.Data.VariableNames.First();
  }
  var inputVariables = GetDoubleInputVariables(targetVariable);
  var newProblemData = new ClassificationProblemData(ExportedDataset, inputVariables, targetVariable, Transformations);
  newProblemData.PositiveClass = oldProblemData.PositiveClass;
  return newProblemData;
}
private IEnumerable<IResult> ExtractAndAggregateClassificationSolutions(IEnumerable<KeyValuePair<string, IItem>> resultCollections) {
  Dictionary<string, List<IClassificationSolution>> resultSolutions = new Dictionary<string, List<IClassificationSolution>>();
  foreach (var result in resultCollections) {
    var classificationSolution = result.Value as IClassificationSolution;
    if (classificationSolution != null) {
      if (resultSolutions.ContainsKey(result.Key)) {
        resultSolutions[result.Key].Add(classificationSolution);
      } else {
        resultSolutions.Add(result.Key, new List<IClassificationSolution>() { classificationSolution });
      }
    }
  }

  var aggregatedResults = new List<IResult>();
  foreach (KeyValuePair<string, List<IClassificationSolution>> solutions in resultSolutions) {
    // at least one algorithm (GBT with logistic regression loss) produces a classification solution
    // even though the original problem is a regression problem.
    var targetVariable = solutions.Value.First().ProblemData.TargetVariable;
    var dataset = (Dataset)Problem.ProblemData.Dataset;
    if (ShuffleSamples.Value) {
      var random = new FastRandom(seed);
      dataset = dataset.Shuffle(random);
    }
    var problemDataClone = new ClassificationProblemData(dataset, Problem.ProblemData.AllowedInputVariables, targetVariable);
    // set partitions of problem data clone correctly
    problemDataClone.TrainingPartition.Start = SamplesStart.Value;
    problemDataClone.TrainingPartition.End = SamplesEnd.Value;
    problemDataClone.TestPartition.Start = SamplesStart.Value;
    problemDataClone.TestPartition.End = SamplesEnd.Value;
    // clone models
    var ensembleSolution = new ClassificationEnsembleSolution(problemDataClone);
    ensembleSolution.AddClassificationSolutions(solutions.Value);
    aggregatedResults.Add(new Result(solutions.Key + " (ensemble)", ensembleSolution));
  }

  List<IResult> flattenedResults = new List<IResult>();
  CollectResultsRecursively("", aggregatedResults, flattenedResults);
  return flattenedResults;
}
protected override IClassificationProblemData ImportData(string path, ClassificationImportType type, TableFileParser csvFileParser) {
  int trainingPartEnd = (csvFileParser.Rows * type.TrainingPercentage) / 100;
  List<IList> values = csvFileParser.Values;
  if (type.Shuffle) {
    values = Shuffle(values);
    if (type.UniformlyDistributeClasses) {
      values = Shuffle(values, csvFileParser.VariableNames.ToList().FindIndex(x => x.Equals(type.TargetVariable)), type.TrainingPercentage, out trainingPartEnd);
    }
  }
  Dataset dataset = new Dataset(csvFileParser.VariableNames, values);

  // turn off input variables that are constant in the training partition
  var allowedInputVars = new List<string>();
  var trainingIndizes = Enumerable.Range(0, trainingPartEnd);
  if (trainingIndizes.Count() >= 2) {
    foreach (var variableName in dataset.DoubleVariables) {
      if (dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 && variableName != type.TargetVariable) {
        allowedInputVars.Add(variableName);
      }
    }
  } else {
    allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(type.TargetVariable)));
  }

  ClassificationProblemData classificationData = new ClassificationProblemData(dataset, allowedInputVars, type.TargetVariable);
  classificationData.TrainingPartition.Start = 0;
  classificationData.TrainingPartition.End = trainingPartEnd;
  classificationData.TestPartition.Start = trainingPartEnd;
  classificationData.TestPartition.End = csvFileParser.Rows;
  classificationData.Name = Path.GetFileName(path);
  return classificationData;
}
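// For reference, the partition arithmetic above with hypothetical numbers, assuming the
// HeuristicLab convention that a partition's Start is inclusive and its End is exclusive:
//   Rows = 100, TrainingPercentage = 66
//   trainingPartEnd = (100 * 66) / 100 = 66   (integer division)
//   TrainingPartition = [0, 66)   -> rows 0..65
//   TestPartition     = [66, 100) -> rows 66..99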
public override IClassificationProblemData ImportData(string path) {
  TableFileParser csvFileParser = new TableFileParser();
  csvFileParser.Parse(path, csvFileParser.AreColumnNamesInFirstLine(path));
  Dataset dataset = new Dataset(csvFileParser.VariableNames, csvFileParser.Values);
  string targetVar = dataset.DoubleVariables.Last();

  // turn off input variables that are constant in the training partition
  var allowedInputVars = new List<string>();
  var trainingIndizes = Enumerable.Range(0, (csvFileParser.Rows * 2) / 3);
  if (trainingIndizes.Count() >= 2) {
    foreach (var variableName in dataset.DoubleVariables) {
      if (dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 && variableName != targetVar) {
        allowedInputVars.Add(variableName);
      }
    }
  } else {
    allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(targetVar)));
  }

  ClassificationProblemData classificationData = new ClassificationProblemData(dataset, allowedInputVars, targetVar);
  // note: trainingPartEnd is the last training row *index* ((2 * Rows / 3) - 1), not the row count
  int trainingPartEnd = trainingIndizes.Last();
  classificationData.TrainingPartition.Start = trainingIndizes.First();
  classificationData.TrainingPartition.End = trainingPartEnd;
  classificationData.TestPartition.Start = trainingPartEnd;
  classificationData.TestPartition.End = csvFileParser.Rows;
  classificationData.Name = Path.GetFileName(path);
  return classificationData;
}
private IClassificationProblemData CreateDefaultProblem() {
  List<string> allowedInputVariables = new List<string>() { "x1", "x2", "x3", "x4", "x5" };
  string targetVariable = "y";
  var variableNames = allowedInputVariables.Union(targetVariable.ToEnumerable());
  double[,] variableValues = new double[100, variableNames.Count()];
  FastRandom random = new FastRandom(12345);
  int len0 = variableValues.GetLength(0);
  int len1 = variableValues.GetLength(1);
  for (int i = 0; i < len0; i++) {
    for (int j = 0; j < len1; j++) {
      if (j == len1 - 1) {
        // last column is the target: alternating class values 0/1
        variableValues[i, j] = (j + i) % 2;
      } else {
        variableValues[i, j] = random.Next(1, 100);
      }
    }
  }
  Dataset dataset = new Dataset(variableNames, variableValues);
  var ret = new ClassificationProblemData(dataset, allowedInputVariables, targetVariable);
  ret.SetClassName(0, "NOK");
  ret.SetClassName(1, "OK");
  return ret;
}
protected override void Run(CancellationToken cancellationToken) {
  // Set up the algorithm
  if (SetSeedRandomly) {
    Seed = new System.Random().Next();
  }

  // Set up the results display
  var iterations = new IntValue(0);
  Results.Add(new Result("Iterations", iterations));

  var table = new DataTable("Qualities");
  table.Rows.Add(new DataRow("Loss (train)"));
  table.Rows.Add(new DataRow("Loss (test)"));
  Results.Add(new Result("Qualities", table));

  var curLoss = new DoubleValue();
  Results.Add(new Result("Loss (train)", curLoss));

  // init
  var problemData = (IRegressionProblemData)Problem.ProblemData.Clone();
  var lossFunction = LossFunctionParameter.Value;
  var state = GradientBoostedTreesAlgorithmStatic.CreateGbmState(problemData, lossFunction, (uint)Seed, MaxSize, R, M, Nu);

  var updateInterval = UpdateIntervalParameter.Value.Value;
  // Loop until iteration limit reached or canceled.
  for (int i = 0; i < Iterations; i++) {
    cancellationToken.ThrowIfCancellationRequested();

    GradientBoostedTreesAlgorithmStatic.MakeStep(state);

    // iteration results
    if (i % updateInterval == 0) {
      curLoss.Value = state.GetTrainLoss();
      table.Rows["Loss (train)"].Values.Add(curLoss.Value);
      table.Rows["Loss (test)"].Values.Add(state.GetTestLoss());
      iterations.Value = i;
    }
  }

  // final results
  iterations.Value = Iterations;
  curLoss.Value = state.GetTrainLoss();
  table.Rows["Loss (train)"].Values.Add(curLoss.Value);
  table.Rows["Loss (test)"].Values.Add(state.GetTestLoss());

  // produce variable relevance
  var orderedImpacts = state.GetVariableRelevance().Select(t => new { name = t.Key, impact = t.Value }).ToList();

  var impacts = new DoubleMatrix();
  var matrix = impacts as IStringConvertibleMatrix;
  matrix.Rows = orderedImpacts.Count;
  matrix.RowNames = orderedImpacts.Select(x => x.name);
  matrix.Columns = 1;
  matrix.ColumnNames = new string[] { "Relative variable relevance" };

  int rowIdx = 0;
  foreach (var p in orderedImpacts) {
    matrix.SetValue(string.Format("{0:N2}", p.impact), rowIdx++, 0);
  }

  Results.Add(new Result("Variable relevance", impacts));
  Results.Add(new Result("Loss (test)", new DoubleValue(state.GetTestLoss())));

  // produce solution
  if (CreateSolution) {
    var model = state.GetModel();

    // for logistic regression we produce a classification solution
    if (lossFunction is LogisticRegressionLoss) {
      var classificationModel = new DiscriminantFunctionClassificationModel(model, new AccuracyMaximizationThresholdCalculator());
      var classificationProblemData = new ClassificationProblemData(problemData.Dataset, problemData.AllowedInputVariables, problemData.TargetVariable, problemData.Transformations);
      classificationModel.RecalculateModelParameters(classificationProblemData, classificationProblemData.TrainingIndices);
      var classificationSolution = new DiscriminantFunctionClassificationSolution(classificationModel, classificationProblemData);
      Results.Add(new Result("Solution", classificationSolution));
    } else {
      // otherwise we produce a regression solution
      Results.Add(new Result("Solution", new RegressionSolution(model, problemData)));
    }
  }
}
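// A sketch of driving the static GBT API directly, mirroring the loop in Run above.
// The loss function and parameter values are hypothetical; the argument order follows the
// CreateGbmState call in Run: problem data, loss, seed, maxSize, r, m, nu.
private void RunGbtDirectly(IRegressionProblemData problemData) {
  var state = GradientBoostedTreesAlgorithmStatic.CreateGbmState(
    problemData, new SquaredErrorLoss(), 1234u, 10, 0.5, 0.5, 0.1);
  for (int i = 0; i < 100; i++) {
    GradientBoostedTreesAlgorithmStatic.MakeStep(state); // adds one tree to the ensemble
  }
  Console.WriteLine("train loss: {0}, test loss: {1}", state.GetTrainLoss(), state.GetTestLoss());
  var model = state.GetModel(); // final additive tree model
}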
protected override IEnumerable<IClassificationSolution> GenerateClassificationSolutions() {
  var solutionsBase = base.GenerateClassificationSolutions();
  var solutions = new List<IClassificationSolution>();

  var symbolicSolution = Content;
  // does not support lagged variables
  if (symbolicSolution.Model.SymbolicExpressionTree.IterateNodesPrefix().OfType<LaggedVariableTreeNode>().Any()) {
    return solutionsBase;
  }

  var problemData = (IClassificationProblemData)symbolicSolution.ProblemData.Clone();
  if (!problemData.TrainingIndices.Any()) {
    return null; // don't create comparison models if the problem does not have a training set (e.g. loaded into an existing model)
  }

  var usedVariables = Content.Model.SymbolicExpressionTree.IterateNodesPostfix()
    .OfType<IVariableTreeNode>()
    .Select(node => node.VariableName).ToArray();

  var usedDoubleVariables = usedVariables
    .Where(name => problemData.Dataset.VariableHasType<double>(name))
    .Distinct();

  var usedFactorVariables = usedVariables
    .Where(name => problemData.Dataset.VariableHasType<string>(name))
    .Distinct();

  // gkronber: for binary factors we actually produce a binary variable in the new dataset
  // but only if the variable is not used as a full factor anyway (LR creates binary columns anyway)
  var usedBinaryFactors = Content.Model.SymbolicExpressionTree.IterateNodesPostfix().OfType<BinaryFactorVariableTreeNode>()
    .Where(node => !usedFactorVariables.Contains(node.VariableName))
    .Select(node => Tuple.Create(node.VariableName, node.VariableValue)); // (variable name, value) pairs for the indicator columns below

  // create a new problem and dataset
  var variableNames = usedDoubleVariables
    .Concat(usedFactorVariables)
    .Concat(usedBinaryFactors.Select(t => t.Item1 + "=" + t.Item2))
    .Concat(new string[] { problemData.TargetVariable })
    .ToArray();

  var variableValues = usedDoubleVariables.Select(name => (IList)problemData.Dataset.GetDoubleValues(name).ToList())
    .Concat(usedFactorVariables.Select(name => problemData.Dataset.GetStringValues(name).ToList()))
    .Concat(
      // create a 0/1 indicator column for each binary factor
      usedBinaryFactors.Select(t => problemData.Dataset.GetReadOnlyStringValues(t.Item1).Select(val => val == t.Item2 ? 1.0 : 0.0).ToList())
    )
    .Concat(new[] { problemData.Dataset.GetDoubleValues(problemData.TargetVariable).ToList() });

  var newDs = new Dataset(variableNames, variableValues);
  var newProblemData = new ClassificationProblemData(newDs, variableNames.Take(variableNames.Length - 1), variableNames.Last());
  newProblemData.PositiveClass = problemData.PositiveClass;
  newProblemData.TrainingPartition.Start = problemData.TrainingPartition.Start;
  newProblemData.TrainingPartition.End = problemData.TrainingPartition.End;
  newProblemData.TestPartition.Start = problemData.TestPartition.Start;
  newProblemData.TestPartition.End = problemData.TestPartition.End;

  try {
    var oneR = OneR.CreateOneRSolution(newProblemData);
    oneR.Name = "OneR Classification Solution (subset)";
    solutions.Add(oneR);
  } catch (NotSupportedException) { } catch (ArgumentException) { }
  try {
    var lda = LinearDiscriminantAnalysis.CreateLinearDiscriminantAnalysisSolution(newProblemData);
    lda.Name = "Linear Discriminant Analysis Solution (subset)";
    solutions.Add(lda);
  } catch (NotSupportedException) { } catch (ArgumentException) { }
  return solutionsBase.Concat(solutions);
}
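// Toy illustration of the binary-factor expansion used above: a string column is mapped to a
// 0/1 indicator column named "variable=value" (hypothetical standalone data, plain LINQ):
private static void BinaryFactorExpansionExample() {
  var colors = new List<string> { "red", "green", "red", "blue" };
  // indicator column for the binary factor "color=red":
  var indicator = colors.Select(val => val == "red" ? 1.0 : 0.0).ToList();
  // indicator is { 1.0, 0.0, 1.0, 0.0 }
}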