Beispiel #1
0
        /// <summary>
        /// Builds a synthetic two-class problem of the requested size.
        /// </summary>
        /// <param name="rows">Number of rows of the generated value matrix.</param>
        /// <param name="columns">Total number of columns; the last column is the target "y", all others are inputs named "x0", "x1", ...</param>
        /// <returns>Problem data whose class values 0 and 1 are named "NOK" and "OK".</returns>
        private IClassificationProblemData CreateDefaultProblem(int rows, int columns)
        {
            const string targetName = "y";

            List<string> inputNames = Enumerable.Range(0, columns - 1)
                                                .Select(index => "x" + index.ToString())
                                                .ToList();
            var allNames = inputNames.Union(targetName.ToEnumerable());

            var data = new double[rows, columns];
            var generator = new FastRandom(12345);

            int rowCount = data.GetLength(0);
            int targetColumn = data.GetLength(1) - 1;

            for (int row = 0; row < rowCount; row++)
            {
                // Input columns come first and are filled from the fixed-seed generator.
                for (int col = 0; col < targetColumn; col++)
                {
                    data[row, col] = generator.Next(1, 100);
                }

                // The last column holds the class label, alternating between 0 and 1 from row to row.
                data[row, targetColumn] = (targetColumn + row) % 2;
            }

            var problem = new ClassificationProblemData(new Dataset(allNames, data), inputNames, targetName);
            problem.SetClassName(0, "NOK");
            problem.SetClassName(1, "OK");
            return problem;
        }
        /// <summary>
        /// Handles a drop on the items list view: unwraps the dropped object and assigns a
        /// copy of its classification problem data to <c>Content.ProblemData</c>.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Drag-and-drop event data carrying the dropped object.</param>
        protected override void itemsListView_DragDrop(object sender, DragEventArgs e)
        {
            if (e.Effect == DragDropEffects.None)
            {
                return;
            }

            object payload = e.Data.GetData(HeuristicLab.Common.Constants.DragDropDataFormat);

            // A value parameter or a whole classification problem may be dropped; unwrap either.
            var parameter = payload as IValueParameter;
            if (parameter != null)
            {
                payload = parameter.Value;
            }
            else
            {
                var problem = payload as IClassificationProblem;
                if (problem != null)
                {
                    payload = problem.ProblemData;
                }
            }

            // Ensemble problem data is checked first and cloned as is.
            var ensembleData = payload as ClassificationEnsembleProblemData;
            if (ensembleData != null)
            {
                Content.ProblemData = (ClassificationEnsembleProblemData)ensembleData.Clone();
                return;
            }

            // Plain classification problem data is cloned and wrapped into ensemble problem data.
            var plainData = payload as ClassificationProblemData;
            if (plainData != null)
            {
                Content.ProblemData = new ClassificationEnsembleProblemData((IClassificationProblemData)plainData.Clone());
            }
        }
Beispiel #3
0
        /// <summary>
        /// Creates classification problem data for <paramref name="dataset"/> using this
        /// instance's allowed input variables, target variable, name, description and partitions.
        /// </summary>
        /// <param name="dataset">Dataset the problem data is built on.</param>
        /// <returns>The configured classification problem data.</returns>
        public IClassificationProblemData GenerateClassificationData(Dataset dataset)
        {
            IClassificationProblemData result = new ClassificationProblemData(dataset, AllowedInputVariables, TargetVariable);

            // Descriptive metadata.
            result.Name = Name;
            result.Description = Description;

            // Training partition, then test partition.
            result.TrainingPartition.Start = TrainingPartitionStart;
            result.TrainingPartition.End = TrainingPartitionEnd;
            result.TestPartition.Start = TestPartitionStart;
            result.TestPartition.End = TestPartitionEnd;

            return result;
        }
Beispiel #4
0
        /// <summary>
        /// Fills the target variable combo box with the variables of the previewed dataset
        /// that qualify as classification targets. Does nothing when the preview content is
        /// not a <c>Dataset</c>.
        /// </summary>
        protected void SetPossibleTargetVariables()
        {
            // Suffix that the variable names are expected to carry and that must not be shown.
            const string doubleTypeSuffix = " (Double)";

            var dataset = PreviewDatasetMatrix.Content as Dataset;
            if (dataset == null)
            {
                return;
            }

            IEnumerable<string> possibleTargetVariables = ClassificationProblemData.CheckVariablesForPossibleTargetVariables(dataset);

            // Strip the suffix only when it is actually present. Blindly cutting the last
            // nine characters throws for shorter names and mangles names without the suffix.
            TargetVariableComboBox.DataSource = possibleTargetVariables
                .Select(x => x.EndsWith(doubleTypeSuffix, System.StringComparison.Ordinal)
                                 ? x.Substring(0, x.Length - doubleTypeSuffix.Length)
                                 : x)
                .ToList();
        }
Beispiel #5
0
        /// <summary>
        /// Creates new classification problem data on the exported dataset, carrying over the
        /// target variable (when it still exists) and the positive class of the old problem data.
        /// </summary>
        /// <param name="oldProblemData">Problem data whose target variable and positive class are reused.</param>
        /// <returns>The newly created classification problem data.</returns>
        private IDataAnalysisProblemData CreateClassificationData(ClassificationProblemData oldProblemData)
        {
            string previousTarget = oldProblemData.TargetVariable;

            // Fall back to the first variable when the previous target is no longer available.
            string target = context.Data.VariableNames.Contains(previousTarget)
                ? previousTarget
                : context.Data.VariableNames.First();

            var inputs = GetDoubleInputVariables(target);
            var result = new ClassificationProblemData(ExportedDataset, inputs, target, Transformations);
            result.PositiveClass = oldProblemData.PositiveClass;
            return result;
        }
Beispiel #6
0
        /// <summary>
        /// Collects all classification solutions from the given results, groups them by result
        /// name and combines each group into one ensemble solution.
        /// </summary>
        /// <param name="resultCollections">Name/item pairs; items that are not classification solutions are ignored.</param>
        /// <returns>The flattened list of aggregated results, one " (ensemble)" entry per result name.</returns>
        private IEnumerable<IResult> ExtractAndAggregateClassificationSolutions(IEnumerable<KeyValuePair<string, IItem>> resultCollections)
        {
            // Group the classification solutions by the name of the result they were stored under.
            var solutionsByName = new Dictionary<string, List<IClassificationSolution>>();
            foreach (var entry in resultCollections)
            {
                var solution = entry.Value as IClassificationSolution;
                if (solution == null)
                {
                    continue;
                }

                List<IClassificationSolution> bucket;
                if (!solutionsByName.TryGetValue(entry.Key, out bucket))
                {
                    bucket = new List<IClassificationSolution>();
                    solutionsByName.Add(entry.Key, bucket);
                }
                bucket.Add(solution);
            }

            var ensembles = new List<IResult>();
            foreach (var group in solutionsByName)
            {
                // at least one algorithm (GBT with logistic regression loss) produces a classification solution even though the original problem is a regression problem.
                var targetVariable = group.Value.First().ProblemData.TargetVariable;

                var dataset = (Dataset)Problem.ProblemData.Dataset;
                if (ShuffleSamples.Value)
                {
                    dataset = dataset.Shuffle(new FastRandom(seed));
                }

                var ensembleProblemData = new ClassificationProblemData(dataset, Problem.ProblemData.AllowedInputVariables, targetVariable);

                // Training and test partition both span the configured sample range.
                ensembleProblemData.TrainingPartition.Start = SamplesStart.Value;
                ensembleProblemData.TrainingPartition.End = SamplesEnd.Value;
                ensembleProblemData.TestPartition.Start = SamplesStart.Value;
                ensembleProblemData.TestPartition.End = SamplesEnd.Value;

                var ensemble = new ClassificationEnsembleSolution(ensembleProblemData);
                ensemble.AddClassificationSolutions(group.Value);

                ensembles.Add(new Result(group.Key + " (ensemble)", ensemble));
            }

            var flattened = new List<IResult>();
            CollectResultsRecursively("", ensembles, flattened);
            return flattened;
        }
Beispiel #7
0
        /// <summary>
        /// Builds classification problem data from an already parsed table file.
        /// </summary>
        /// <param name="path">Path of the imported file; its file name becomes the name of the problem data.</param>
        /// <param name="type">Import settings: target variable, training percentage and shuffle options.</param>
        /// <param name="csvFileParser">Parser holding the variable names, values and row count of the file.</param>
        /// <returns>Problem data with the training partition followed by the test partition.</returns>
        protected override IClassificationProblemData ImportData(string path, ClassificationImportType type, TableFileParser csvFileParser)
        {
            int trainingPartEnd = (csvFileParser.Rows * type.TrainingPercentage) / 100;
            List<IList> values = csvFileParser.Values;

            if (type.Shuffle)
            {
                values = Shuffle(values);
                if (type.UniformlyDistributeClasses)
                {
                    // The class-aware shuffle also determines where the training partition ends.
                    int targetIndex = csvFileParser.VariableNames.ToList().FindIndex(x => x.Equals(type.TargetVariable));
                    values = Shuffle(values, targetIndex, type.TrainingPercentage, out trainingPartEnd);
                }
            }

            var dataset = new Dataset(csvFileParser.VariableNames, values);

            // turn off input variables that are constant in the training partition
            var trainingRows = Enumerable.Range(0, trainingPartEnd);
            var allowedInputVars = new List<string>();
            if (trainingRows.Count() < 2)
            {
                // Too few training rows to judge constancy: allow every double variable except the target.
                allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(type.TargetVariable)));
            }
            else
            {
                allowedInputVars.AddRange(
                    dataset.DoubleVariables.Where(name => dataset.GetDoubleValues(name, trainingRows).Range() > 0 &&
                                                          name != type.TargetVariable));
            }

            var result = new ClassificationProblemData(dataset, allowedInputVars, type.TargetVariable);

            result.TrainingPartition.Start = 0;
            result.TrainingPartition.End = trainingPartEnd;
            result.TestPartition.Start = trainingPartEnd;
            result.TestPartition.End = csvFileParser.Rows;

            result.Name = Path.GetFileName(path);

            return result;
        }
Beispiel #8
0
        /// <summary>
        /// Parses the table file at <paramref name="path"/> and builds classification problem
        /// data from it. The last double variable becomes the target and the first two thirds
        /// of the rows form the training partition; the remaining rows form the test partition.
        /// </summary>
        /// <param name="path">Path of the file to import; its file name becomes the name of the problem data.</param>
        /// <returns>The imported classification problem data.</returns>
        public override IClassificationProblemData ImportData(string path)
        {
            TableFileParser csvFileParser = new TableFileParser();

            csvFileParser.Parse(path, csvFileParser.AreColumnNamesInFirstLine(path));

            Dataset dataset   = new Dataset(csvFileParser.VariableNames, csvFileParser.Values);
            string  targetVar = dataset.DoubleVariables.Last();

            // Number of training rows; also the exclusive end of the training partition.
            // Computing it directly (instead of taking the last training index) keeps the
            // partition in sync with the rows inspected below and avoids calling First()/Last()
            // on an empty sequence when the file has fewer than two rows.
            int trainingPartEnd = (csvFileParser.Rows * 2) / 3;
            var trainingIndizes = Enumerable.Range(0, trainingPartEnd);

            // turn off input variables that are constant in the training partition
            var allowedInputVars = new List<string>();

            if (trainingPartEnd >= 2)
            {
                foreach (var variableName in dataset.DoubleVariables)
                {
                    if (dataset.GetDoubleValues(variableName, trainingIndizes).Range() > 0 &&
                        variableName != targetVar)
                    {
                        allowedInputVars.Add(variableName);
                    }
                }
            }
            else
            {
                // Too few training rows to judge constancy: allow every double variable except the target.
                allowedInputVars.AddRange(dataset.DoubleVariables.Where(x => !x.Equals(targetVar)));
            }

            ClassificationProblemData classificationData = new ClassificationProblemData(dataset, allowedInputVars, targetVar);

            // The test partition ends at Rows, so End is an exclusive bound; the training
            // partition therefore ends at the training row count, not at the last training index.
            classificationData.TrainingPartition.Start = 0;
            classificationData.TrainingPartition.End   = trainingPartEnd;
            classificationData.TestPartition.Start     = trainingPartEnd;
            classificationData.TestPartition.End       = csvFileParser.Rows;

            classificationData.Name = Path.GetFileName(path);

            return(classificationData);
        }
Beispiel #9
0
        /// <summary>
        /// Builds a synthetic two-class problem with 100 rows, the inputs "x1" to "x5" and
        /// the target "y".
        /// </summary>
        /// <returns>Problem data whose class values 0 and 1 are named "NOK" and "OK".</returns>
        private IClassificationProblemData CreateDefaultProblem()
        {
            const string targetName = "y";
            var inputNames = new List<string> { "x1", "x2", "x3", "x4", "x5" };
            var allNames = inputNames.Union(targetName.ToEnumerable());

            var data = new double[100, allNames.Count()];
            var generator = new FastRandom(12345);

            int rowCount = data.GetLength(0);
            int targetColumn = data.GetLength(1) - 1;

            for (int row = 0; row < rowCount; row++)
            {
                // Input columns come first and are filled from the fixed-seed generator.
                for (int col = 0; col < targetColumn; col++)
                {
                    data[row, col] = generator.Next(1, 100);
                }

                // The last column holds the class label, alternating between 0 and 1 from row to row.
                data[row, targetColumn] = (targetColumn + row) % 2;
            }

            var problem = new ClassificationProblemData(new Dataset(allNames, data), inputNames, targetName);
            problem.SetClassName(0, "NOK");
            problem.SetClassName(1, "OK");
            return problem;
        }
Beispiel #10
0
        /// <summary>
        /// Runs gradient boosting for the configured number of iterations, records the
        /// train/test loss, reports the variable relevance and optionally stores the
        /// resulting solution in <c>Results</c>.
        /// </summary>
        /// <param name="cancellationToken">Checked at the start of every iteration; cancellation aborts the run by throwing.</param>
        protected override void Run(CancellationToken cancellationToken)
        {
            const string trainLossName = "Loss (train)";
            const string testLossName = "Loss (test)";

            // Set up the algorithm
            if (SetSeedRandomly)
            {
                Seed = new System.Random().Next();
            }

            // Set up the results display
            var iterationCounter = new IntValue(0);
            Results.Add(new Result("Iterations", iterationCounter));

            var qualities = new DataTable("Qualities");
            qualities.Rows.Add(new DataRow(trainLossName));
            qualities.Rows.Add(new DataRow(testLossName));
            Results.Add(new Result("Qualities", qualities));

            var trainLoss = new DoubleValue();
            Results.Add(new Result(trainLossName, trainLoss));

            // init
            var problemData = (IRegressionProblemData)Problem.ProblemData.Clone();
            var lossFunction = LossFunctionParameter.Value;
            var state = GradientBoostedTreesAlgorithmStatic.CreateGbmState(problemData, lossFunction, (uint)Seed, MaxSize, R, M, Nu);
            var updateInterval = UpdateIntervalParameter.Value.Value;

            // Loop until iteration limit reached or canceled.
            for (int step = 0; step < Iterations; step++)
            {
                cancellationToken.ThrowIfCancellationRequested();

                GradientBoostedTreesAlgorithmStatic.MakeStep(state);

                // Intermediate results are only published every updateInterval-th step.
                if (step % updateInterval != 0)
                {
                    continue;
                }

                trainLoss.Value = state.GetTrainLoss();
                qualities.Rows[trainLossName].Values.Add(trainLoss.Value);
                qualities.Rows[testLossName].Values.Add(state.GetTestLoss());
                iterationCounter.Value = step;
            }

            // final results
            iterationCounter.Value = Iterations;
            trainLoss.Value = state.GetTrainLoss();
            qualities.Rows[trainLossName].Values.Add(trainLoss.Value);
            qualities.Rows[testLossName].Values.Add(state.GetTestLoss());

            // produce variable relevance
            var relevances = state.GetVariableRelevance()
                                  .Select(t => new { Variable = t.Key, Relevance = t.Value })
                                  .ToList();

            var impacts = new DoubleMatrix();
            var matrix = impacts as IStringConvertibleMatrix;

            matrix.Rows = relevances.Count;
            matrix.RowNames = relevances.Select(r => r.Variable);
            matrix.Columns = 1;
            matrix.ColumnNames = new string[] { "Relative variable relevance" };

            for (int row = 0; row < relevances.Count; row++)
            {
                matrix.SetValue(string.Format("{0:N2}", relevances[row].Relevance), row, 0);
            }

            Results.Add(new Result("Variable relevance", impacts));
            Results.Add(new Result(testLossName, new DoubleValue(state.GetTestLoss())));

            // produce solution
            if (!CreateSolution)
            {
                return;
            }

            var model = state.GetModel();

            if (lossFunction is LogisticRegressionLoss)
            {
                // for logistic regression we produce a classification solution
                var thresholdCalculator = new AccuracyMaximizationThresholdCalculator();
                var classificationModel = new DiscriminantFunctionClassificationModel(model, thresholdCalculator);
                var classificationProblemData = new ClassificationProblemData(
                    problemData.Dataset,
                    problemData.AllowedInputVariables,
                    problemData.TargetVariable,
                    problemData.Transformations);
                classificationModel.RecalculateModelParameters(classificationProblemData, classificationProblemData.TrainingIndices);

                Results.Add(new Result("Solution", new DiscriminantFunctionClassificationSolution(classificationModel, classificationProblemData)));
            }
            else
            {
                // otherwise we produce a regression solution
                Results.Add(new Result("Solution", new RegressionSolution(model, problemData)));
            }
        }
Beispiel #11
0
        /// <summary>
        /// Extends the comparison solutions of the base class with OneR and linear discriminant
        /// analysis solutions that are trained only on the variables used by the symbolic model.
        /// </summary>
        /// <returns>
        /// The base solutions followed by the subset solutions; only the base solutions when the
        /// model contains lagged variables; <c>null</c> when the problem has no training rows.
        /// </returns>
        protected override IEnumerable <IClassificationSolution> GenerateClassificationSolutions()
        {
            var solutionsBase = base.GenerateClassificationSolutions();
            var solutions     = new List <IClassificationSolution>();

            var symbolicSolution = Content;

            // does not support lagged variables
            if (symbolicSolution.Model.SymbolicExpressionTree.IterateNodesPrefix().OfType <LaggedVariableTreeNode>().Any())
            {
                return(solutionsBase);
            }

            var problemData = (IClassificationProblemData)symbolicSolution.ProblemData.Clone();

            if (!problemData.TrainingIndices.Any())
            {
                return(null);                              // don't create any comparison models if the problem does not have a training set (e.g. loaded into an existing model)
            }
            var usedVariables = Content.Model.SymbolicExpressionTree.IterateNodesPostfix()
                                .OfType <IVariableTreeNode>()
                                .Select(node => node.VariableName).ToArray();

            // The variable lists are materialized because each of them is enumerated several
            // times below (for the column names, the column values and the factor lookup).
            var usedDoubleVariables = usedVariables
                                      .Where(name => problemData.Dataset.VariableHasType <double>(name))
                                      .Distinct()
                                      .ToArray();

            var usedFactorVariables = usedVariables
                                      .Where(name => problemData.Dataset.VariableHasType <string>(name))
                                      .Distinct()
                                      .ToArray();

            // gkronber: for binary factors we actually produce a binary variable in the new dataset
            // but only if the variable is not used as a full factor anyway (LR creates binary columns anyway)
            // Item1 is the variable NAME and Item2 the factor VALUE: Item1 is used below to read
            // the column from the dataset and to build the "name=value" column name.
            // Distinct() prevents the same name/value pair from producing the same column twice.
            var usedBinaryFactors =
                Content.Model.SymbolicExpressionTree.IterateNodesPostfix().OfType <BinaryFactorVariableTreeNode>()
                .Where(node => !usedFactorVariables.Contains(node.VariableName))
                .Select(node => Tuple.Create(node.VariableName, node.VariableValue))
                .Distinct()
                .ToArray();

            // create a new problem and dataset
            var variableNames =
                usedDoubleVariables
                .Concat(usedFactorVariables)
                .Concat(usedBinaryFactors.Select(t => t.Item1 + "=" + t.Item2))
                .Concat(new string[] { problemData.TargetVariable })
                .ToArray();
            var variableValues =
                usedDoubleVariables.Select(name => (IList)problemData.Dataset.GetDoubleValues(name).ToList())
                .Concat(usedFactorVariables.Select(name => problemData.Dataset.GetStringValues(name).ToList()))
                .Concat(
                    // create binary variable: 1.0 where the factor has the given value, 0.0 otherwise
                    usedBinaryFactors.Select(t => problemData.Dataset.GetReadOnlyStringValues(t.Item1).Select(val => val == t.Item2 ? 1.0 : 0.0).ToList())
                    )
                .Concat(new[] { problemData.Dataset.GetDoubleValues(problemData.TargetVariable).ToList() });

            // The target variable is the last column; every other column is an allowed input.
            var newDs          = new Dataset(variableNames, variableValues);
            var newProblemData = new ClassificationProblemData(newDs, variableNames.Take(variableNames.Length - 1), variableNames.Last());

            newProblemData.PositiveClass           = problemData.PositiveClass;
            newProblemData.TrainingPartition.Start = problemData.TrainingPartition.Start;
            newProblemData.TrainingPartition.End   = problemData.TrainingPartition.End;
            newProblemData.TestPartition.Start     = problemData.TestPartition.Start;
            newProblemData.TestPartition.End       = problemData.TestPartition.End;

            // The comparison models are best effort: an algorithm that cannot handle the
            // reduced problem is skipped instead of failing the whole operation.
            try {
                var oneR = OneR.CreateOneRSolution(newProblemData);
                oneR.Name = "OneR Classification Solution (subset)";
                solutions.Add(oneR);
            } catch (NotSupportedException) { } catch (ArgumentException) { }
            try {
                var lda = LinearDiscriminantAnalysis.CreateLinearDiscriminantAnalysisSolution(newProblemData);
                lda.Name = "Linear Discriminant Analysis Solution (subset)";
                solutions.Add(lda);
            } catch (NotSupportedException) { } catch (ArgumentException) { }

            return(solutionsBase.Concat(solutions));
        }