/// <summary>
/// Rebuilds both sub-trees under <paramref name="pivot"/> after its numeric threshold
/// has been mutated: the pivot's rows are re-split into &lt;= / &gt; partitions, each side
/// is regrown with C4.5, and the tree's flattened bookkeeping is refreshed.
/// </summary>
/// <param name="root">Root of the whole tree; returned as the (rebuilt) tree handle.</param>
/// <param name="pivot">Continuous-attribute node whose <c>threshold</c> was changed.</param>
/// <param name="nodes">Scratch array; only the local copy is reassigned (parameter is not
/// <c>ref</c>/<c>out</c>, so the caller's array is untouched — kept for interface compatibility).</param>
/// <returns>The tree root.</returns>
public Node buildBranch_Thresh(Node root, Node pivot, Node[] nodes)
{
    int colID = pivot.attribute.Ordinal;

    // Re-partition the pivot's rows around the new threshold.
    DataTable lowerOrEqualTable2 = pivot.subTable.Clone();
    DataTable higherTable2 = pivot.subTable.Clone();
    foreach (DataRow r in pivot.subTable.Rows)
    {
        string value = r[colID].ToString();
        double d = double.Parse(value); // NOTE(review): culture-sensitive parse — confirm the data always uses '.' decimals
        if (d <= pivot.threshold)
        {
            lowerOrEqualTable2.ImportRow(r);
        }
        else
        {
            higherTable2.ImportRow(r);
        }
    }

    // Build new branch using new threshold.
    DecisionTree dt = new DecisionTree();
    pivot.children.Nodes[0] = dt.RunC4_5(lowerOrEqualTable2, pivot.subTableAttributes); // Rebuild left sub-tree
    pivot.children.Nodes[0].parent = pivot;
    pivot.children.Nodes[0].parentRef = 0;
    pivot.children.Nodes[1] = dt.RunC4_5(higherTable2, pivot.subTableAttributes); // Rebuild right sub-tree
    pivot.children.Nodes[1].parent = pivot;
    pivot.children.Nodes[1].parentRef = 1;

    nodeList.Clear();
    nodes = root.flattenTree(root); // local-only assignment; presumably flattenTree refreshes nodeList as a side effect — verify
    nodeList.Add(root);

    // Replace old branch with new branch.
    if (pivot.parent != null && pivot.children.Nodes.Count != 0)
    {
        // Selected node is neither the root nor a leaf.
        pivot.parent.children.Nodes[pivot.parentRef] = pivot;
    }
    else
    {
        // BUG FIX: the original wrote ("..." + pivot.parent == null) — '+' binds tighter
        // than '==', so it compared the concatenated string to null and always printed
        // "False". Parenthesize so the intended boolean is printed.
        Console.WriteLine("pivot.parent == null: " + (pivot.parent == null));
        return root;
    }
    return root;
}
/// <summary>
/// Regrows the sub-tree rooted at <paramref name="pivot"/> with C4.5 after its split
/// attribute was mutated, then refreshes the flattened-tree bookkeeping under
/// <paramref name="root"/>.
/// </summary>
/// <param name="root">Root of the whole tree; returned unchanged.</param>
/// <param name="pivot">Node whose attribute changed; a freshly built sub-tree replaces the local reference.</param>
/// <param name="nodes">Scratch array parameter; only the local copy is reassigned.</param>
/// <returns>The tree root.</returns>
public Node buildBranch_Attribute(Node root, Node pivot, Node[] nodes)
{
    int savedParentRef = pivot.parentRef;

    // Rebuild the branch from the pivot's own rows, forcing the new split attribute.
    DecisionTree builder = new DecisionTree();
    pivot = builder.RunC4_5(pivot.subTable, pivot.subTableAttributes, pivot.attribute);

    // Parent and parent ref will remain the same as before the rebuild.
    pivot.parentRef = savedParentRef;

    // Refresh the flattened node list for the whole tree.
    nodeList.Clear();
    nodes = root.flattenTree(root);
    nodeList.Add(root);

    return root;
}
/// <summary>
/// Runs k-fold cross-validation (despite the name this is k-fold CV, not k-means
/// clustering — name kept for caller compatibility). The dataset is split into k
/// partitions; each partition serves once as the test set while the remainder is
/// split 50/50 into training and validation data. A C4.5 tree is grown per fold,
/// pruned, then hill-climbed for 10,000 mutations against a validation subset.
/// Per-fold results are averaged into the out parameters and the best-testing
/// tree is exported as a DOT file.
/// </summary>
/// <param name="file_path">Path of the dataset file to load and partition.</param>
/// <param name="k">Number of folds/partitions.</param>
/// <param name="train">Average training accuracy over folds with non-zero results.</param>
/// <param name="valid">Average validation accuracy over folds with non-zero results.</param>
/// <param name="test">Average testing accuracy over folds with non-zero results.</param>
/// <param name="size">Average tree size (node counts are halved before averaging;
/// presumably flattenTree double-counts — TODO confirm).</param>
/// <param name="rmse">Average test-set RMSE over folds with non-zero results.</param>
public void run_KMeans(string file_path, int k, out double train, out double valid, out double test, out double size, out double rmse)
{
    // Reset per-run accumulators.
    Accuracies_training.Clear();
    Accuracies_validation.Clear();
    Accuracies_testing.Clear();
    trees.Clear();
    tree_sizes.Clear();
    rmse_list.Clear();

    DataLoader d = new DataLoader();
    DecisionTree tree = new DecisionTree();
    Accuracy a = new Accuracy();

    d.get_K_Partitions(file_path, k); // fills d.partitions with k even partitions of the dataset (each contains a header row)

    for (int i = 0; i < k; i++) // for each partition configuration
    {
        Console.WriteLine("Partition " + i + " / " + k + " ---------------------------------------------------------------");

        List<string> training_data = new List<string>();
        List<string> testing_data = new List<string>();
        List<string> validation_data = new List<string>();

        training_data.Add(d.title_row); // Add title row to top of training set
        for (int j = 0; j < k; j++)
        {
            if (j != i) // Iteratively keep one partition to be used as the test set
            {
                for (int z = 0; z < d.partitions[j].Length; z++)
                {
                    training_data.Add(d.partitions[j][z]);
                }
            }
            else
            {
                for (int z = 0; z < d.partitions[j].Length; z++)
                {
                    testing_data.Add(d.partitions[j][z]);
                }
            }
        }

        // Reserve 50% of the training data to be the validation set (move the rows to validation_data).
        int s = training_data.Count / 2;
        validation_data = training_data.GetRange(training_data.Count - s, s);
        training_data.RemoveRange(training_data.Count - s, s);

        DataTable x = d.CreateTable(training_data.ToArray()); // input: string[]  output: DataTable
        List<DataColumn> all_attributes = d.getAllAttributes(x);

        Node root = tree.root = tree.RunC4_5(x, all_attributes);
        root.isRoot = true; // Set identifier of the root
        root.pruneTree(root);
        trees.Add(root);

        training_data.RemoveAt(0); // drop the title row before measuring accuracy
        List<string> validation_subset = getValidationSubset(validation_data);

        // Optimise with respect to the validation set.
        for (int it = 0; it < 10000; it++)
        {
            ///////////////////////////////////////////// SELECT OBJECTIVE FUNCTION /////////////////////////////////////////////
            //root = root.randomMutateAndRebuild_Accuracy(root); //Objective Function: Maximise Accuracy (regardless of size)
            //root = root.randomMutateAndRebuild_RMSE(root);     //Objective Function: Minimise RMSE (For regression trees, regardless of size)
            //PARETO FRONT
            //The below objective function is a pareto front. It minimises the size of the tree while also
            //increasing accuracy (if either remain stable, the change is accepted).
            if ((it % 100) == 0)
            {
                validation_subset = getValidationSubset(validation_data); // Randomise validation subset every x iterations
            }
            root = root.randomMutateAndRebuild_Size(root, validation_subset.ToArray()); //Objective Function: Minimise size of the tree (number of nodes)
            //force a mutation here if the accuracy has not improved in the last 100 iterations, for instance...
        }

        // Save the accuracies of each partition.
        Accuracies_training.Add(a.GetAccuracy(root, training_data.ToArray()));
        Accuracies_validation.Add(a.GetAccuracy(root, validation_data.ToArray()));
        Accuracies_testing.Add(a.GetAccuracy(root, testing_data.ToArray()));
        tree_sizes.Add(Convert.ToDouble(root.flattenTree(root).Length));
        rmse_list.Add(a.getRMSE(root, testing_data.ToArray()));

        x.Clear(); // Clear DataTable so that we can begin the next C4.5 run - on the next partition
    }

    Console.WriteLine("\n\n");
    Console.WriteLine("Final report: ");

    // Locate the best test fold BEFORE any zero-filtering so the index still lines up
    // with the unfiltered 'trees' list. (The original computed it while removing
    // entries mid-loop, which could de-sync the index from 'trees'.)
    double highest_acc = double.NegativeInfinity;
    int highest_acc_index = 0;
    for (int t = 0; t < Accuracies_testing.Count; t++)
    {
        if (Accuracies_testing[t] > highest_acc)
        {
            highest_acc = Accuracies_testing[t];
            highest_acc_index = t;
        }
    }

    // BUG FIX: the original filtered zeros by calling Remove/RemoveAt while iterating;
    // the RemoveAt variant never decremented the loop index, silently skipping the
    // element after each removal. RemoveAll (inside the helper) filters safely.
    double average_training_accuracy = AverageAfterDroppingZeros(Accuracies_training);
    double average_validation_accuracy = AverageAfterDroppingZeros(Accuracies_validation);
    double average_testing_accuracy = AverageAfterDroppingZeros(Accuracies_testing);
    double average_rmse = AverageAfterDroppingZeros(rmse_list);

    double tot = 0;
    foreach (double i in tree_sizes)
    {
        tot += i / 2; // NOTE(review): sizes are halved — presumably flattenTree double-counts nodes; confirm
    }
    double average_size = tot / tree_sizes.Count;

    // Set 'out' variables for collection.
    train = average_training_accuracy;
    valid = average_validation_accuracy;
    test = average_testing_accuracy;
    size = average_size;
    rmse = average_rmse;

    Console.WriteLine("Training accuracies:");
    foreach (double p in Accuracies_training) { Console.WriteLine(p); }
    Console.WriteLine("Validation accuracies:");
    foreach (double p in Accuracies_validation) { Console.WriteLine(p); }
    Console.WriteLine("Testing accuracies:");
    foreach (double p in Accuracies_testing) { Console.WriteLine(p); }
    Console.WriteLine("Average training accuracy: " + average_training_accuracy);
    Console.WriteLine("Average validation accuracy: " + average_validation_accuracy);
    Console.WriteLine("Average testing accuracy: " + average_testing_accuracy);
    Console.WriteLine("Average tree size: " + average_size);
    Console.WriteLine("Printed tree (highest test accuracy) : " + highest_acc); // print the remembered value; the filtered list's indices may have shifted

    // Visualise the tree with the highest test accuracy.
    DOT_file_generator df = new DOT_file_generator();
    df.createDOTfile(trees[highest_acc_index]);
}

/// <summary>
/// Removes zero entries from <paramref name="values"/> in place (zeros denote failed
/// folds and are excluded from the printed lists too, as in the original), then
/// returns the mean of what remains. Yields NaN when every entry was zero, matching
/// the original's divide-by-zero-count behaviour.
/// </summary>
private static double AverageAfterDroppingZeros(List<double> values)
{
    values.RemoveAll(v => v == 0);
    double total = 0;
    foreach (double v in values)
    {
        total += v;
    }
    return total / values.Count;
}