/// <summary>
/// Runs a k-fold style evaluation of the decision tree learner: for each of the
/// <paramref name="k"/> partitions produced by <c>DataLoader.get_K_Partitions</c>, one
/// partition is held out as the test set, the remaining rows are split into a training
/// half and a validation half, a tree is built with <c>RunC4_5</c>, pruned, and then
/// repeatedly mutated against random validation subsets. Per-fold results are stored in
/// the Accuracies_* / tree_sizes / rmse_list / trees collections, a report is written
/// to the console, and the tree with the highest non-zero test accuracy is exported
/// through <c>DOT_file_generator</c>.
/// </summary>
/// <param name="file_path">Path of the dataset, passed straight to <c>get_K_Partitions</c>.</param>
/// <param name="k">Number of partitions (and therefore of folds).</param>
/// <param name="train">Average of the non-zero training accuracies (NaN when there are none).</param>
/// <param name="valid">Average of the non-zero validation accuracies (NaN when there are none).</param>
/// <param name="test">Average of the non-zero testing accuracies (NaN when there are none).</param>
/// <param name="size">Average over folds of (flattened tree length / 2) (NaN when there are no folds).</param>
/// <param name="rmse">Average of the non-zero test-set RMSE values (NaN when there are none).</param>
public void run_KMeans(string file_path, int k, out double train, out double valid, out double test, out double size, out double rmse)
{
    // Number of mutation attempts per fold and how often the validation subset is re-drawn.
    const int optimisation_iterations = 10000;
    const int resample_interval = 100;

    // Start from a clean slate: results of a previous run must not leak into this one.
    Accuracies_training.Clear();
    Accuracies_validation.Clear();
    Accuracies_testing.Clear();
    trees.Clear();
    tree_sizes.Clear();
    rmse_list.Clear();

    DataLoader d = new DataLoader();
    DecisionTree tree = new DecisionTree();
    Accuracy a = new Accuracy();

    d.get_K_Partitions(file_path, k); //fills d.partitions with k even partitions of the dataset (each contains a header row)

    for (int i = 0; i < k; i++) //for each partition configuration
    {
        Console.WriteLine("Partition " + i + " / " + k + " ---------------------------------------------------------------");

        List<string> training_data = new List<string>();
        List<string> testing_data = new List<string>();
        List<string> validation_data = new List<string>();

        training_data.Add(d.title_row); //Add title row to top of training set

        for (int j = 0; j < k; j++)
        {
            //Iteratively keep one partition (j == i) to be used as the test set; all others feed training
            List<string> destination = (j != i) ? training_data : testing_data;
            for (int z = 0; z < d.partitions[j].Length; z++)
            {
                destination.Add(d.partitions[j][z]);
            }
        }

        //Reserve 50% of the training data to be the validation set (move the rows to validation_data).
        //The half is taken from the tail, so the title row at index 0 stays in training_data.
        int s = training_data.Count / 2;
        validation_data = training_data.GetRange(training_data.Count - s, s);
        training_data.RemoveRange(training_data.Count - s, s);

        DataTable x = d.CreateTable(training_data.ToArray()); //input: string[] output: DataTable
        List<DataColumn> all_attributes = d.getAllAttributes(x);

        Node root = tree.root = tree.RunC4_5(x, all_attributes);
        root.isRoot = true; //Set identifier of the root
        root.pruneTree(root);

        //The title row was only needed to build the DataTable; drop it before scoring.
        training_data.RemoveAt(0);

        List<string> validation_subset = getValidationSubset(validation_data);

        //Optimise with respect to the validation set
        for (int it = 0; it < optimisation_iterations; it++)
        {
            /////////////////////////////////////////////////SELECT OBJECTIVE FUNCTION///////////////////////////////////////////////////////////
            //root = root.randomMutateAndRebuild_Accuracy(root);   //Objective Function: Maximise Accuracy (regardless of size)
            //root = root.randomMutateAndRebuild_RMSE(root);       //Objective Function: Minimise RMSE (For regression trees, regardless of size)

            //PARETO FRONT
            //The below objective function is a pareto front. It minimises the size of the tree while also
            //increasing accuracy (if either remain stable, the change is accepted)
            if ((it % resample_interval) == 0)
            {
                validation_subset = getValidationSubset(validation_data); //Randomise validation subset every x iterations
            }
            root = root.randomMutateAndRebuild_Size(root, validation_subset.ToArray()); //Objective Function: Minimise size of the tree (number of nodes)

            //force a mutation here if the accuracy has not improved in the last 100 iterations, for instance...
        }

        //Store the tree only after optimisation: 'root' is reassigned in the loop above, and the
        //accuracies recorded below are measured on this final tree, so this is the one to visualise.
        trees.Add(root);

        //Save the accuracies of each partition
        Accuracies_training.Add(a.GetAccuracy(root, training_data.ToArray()));
        Accuracies_validation.Add(a.GetAccuracy(root, validation_data.ToArray()));
        Accuracies_testing.Add(a.GetAccuracy(root, testing_data.ToArray()));
        tree_sizes.Add(Convert.ToDouble(root.flattenTree(root).Length));
        rmse_list.Add(a.getRMSE(root, testing_data.ToArray()));

        x.Clear(); //Clear DataTable so that we can begin the next C4.5 run - on the next partition
    }

    Console.WriteLine("\n\n");
    Console.WriteLine("Final report: ");

    //Zero entries are treated as "no result" for a fold: they are dropped from the lists and
    //excluded from the averages. Dividing by an empty list's Count yields NaN, not an exception.
    double training_total = sumAndDropZeros(Accuracies_training);
    double average_training_accuracy = training_total / Accuracies_training.Count;

    double validation_total = sumAndDropZeros(Accuracies_validation);
    double average_validation_accuracy = validation_total / Accuracies_validation.Count;

    //Testing accuracies: find the best fold BEFORE removing anything, so that the index still
    //lines up with 'trees' (which keeps one entry per fold and is never pruned).
    double testing_total = 0;
    double highest_acc = double.NegativeInfinity;
    int highest_acc_index = -1;
    for (int t = 0; t < Accuracies_testing.Count; t++)
    {
        double fold_accuracy = Accuracies_testing[t];
        if (fold_accuracy != 0)
        {
            testing_total += fold_accuracy;
            if (fold_accuracy > highest_acc)
            {
                highest_acc = fold_accuracy;
                highest_acc_index = t;
            }
        }
    }
    if (highest_acc_index < 0 && Accuracies_testing.Count > 0)
    {
        //No fold produced a usable maximum; fall back to the first fold.
        highest_acc_index = 0;
        highest_acc = Accuracies_testing[0];
    }
    //Now drop the zero entries; walking backwards keeps the remaining indices valid while removing.
    for (int t = Accuracies_testing.Count - 1; t >= 0; t--)
    {
        if (Accuracies_testing[t] == 0)
        {
            Accuracies_testing.RemoveAt(t);
        }
    }
    double average_testing_accuracy = testing_total / Accuracies_testing.Count;

    //NOTE(review): each tree size is halved before averaging; the reason is not visible in this
    //method (possibly flattenTree counts entries twice) - confirm against Node.flattenTree.
    double tot = 0;
    foreach (double tree_size in tree_sizes)
    {
        tot += tree_size / 2;
    }
    double average_size = tot / tree_sizes.Count;

    double tot_rmse = sumAndDropZeros(rmse_list);
    double average_rmse = tot_rmse / rmse_list.Count;

    //Set 'out' variables for collection
    train = average_training_accuracy;
    valid = average_validation_accuracy;
    test = average_testing_accuracy;
    size = average_size;
    rmse = average_rmse;

    Console.WriteLine("Training accuracies:");
    foreach (double p in Accuracies_training)
    {
        Console.WriteLine(p);
    }
    Console.WriteLine("Validation accuracies:");
    foreach (double p in Accuracies_validation)
    {
        Console.WriteLine(p);
    }
    Console.WriteLine("Testing accuracies:");
    foreach (double p in Accuracies_testing)
    {
        Console.WriteLine(p);
    }

    Console.WriteLine("Average training accuracy: " + average_training_accuracy);
    Console.WriteLine("Average validation accuracy: " + average_validation_accuracy);
    Console.WriteLine("Average testing accuracy: " + average_testing_accuracy);
    Console.WriteLine("Average tree size: " + average_size);

    //Visualise the tree with the highest test accuracy (skipped when no fold was run)
    if (highest_acc_index >= 0 && highest_acc_index < trees.Count)
    {
        Console.WriteLine("Printed tree (highest test accuracy) : " + highest_acc);
        DOT_file_generator df = new DOT_file_generator();
        df.createDOTfile(trees[highest_acc_index]);
    }
}

/// <summary>
/// Adds up every non-zero entry of <paramref name="values"/> and removes the zero entries
/// from the list in place. The list is walked from the end towards the start so that
/// removing an element never shifts an entry that has not been visited yet.
/// </summary>
/// <param name="values">List to sum and prune; modified in place.</param>
/// <returns>The sum of the entries that were kept.</returns>
private static double sumAndDropZeros(IList<double> values)
{
    double total = 0;
    for (int index = values.Count - 1; index >= 0; index--)
    {
        if (values[index] != 0)
        {
            total += values[index];
        }
        else
        {
            values.RemoveAt(index);
        }
    }
    return total;
}