/// <summary>
/// Runs the k-fold cross-validation experiment a fixed number of times,
/// prints per-iteration statistics, writes them to an Excel workbook via
/// ClosedXML, and reports the total and average run times.
/// </summary>
public void runProgram()
{
    const int Iterations = 10;      // number of full cross-validation experiments to average over
    const int FoldsPerRun = 10;     // k passed to run_KMeans; one tree is trained per fold

    var watch = new System.Diagnostics.Stopwatch();
    watch.Start();

    List<double> trains = new List<double>();
    List<double> valids = new List<double>();
    List<double> tests = new List<double>();
    List<double> sizes = new List<double>();
    List<double> rmses = new List<double>();

    DecisionTree tree = new DecisionTree();
    for (int i = 0; i < Iterations; i++)
    {
        tree.run_KMeans("PATH_TO_TARGET_DATASET", FoldsPerRun,
            out double train, out double valid, out double test, out double size, out double rmse);
        trains.Add(train);
        valids.Add(valid);
        tests.Add(test);
        sizes.Add(size);
        rmses.Add(rmse);
        Console.WriteLine("---------------------------------- ITERATION: " + i + " -----------------------------------");
    }

    for (int i = 0; i < Iterations; i++)
    {
        Console.WriteLine("\n");
        Console.WriteLine("Iteration: " + i + "\n"
            + "Train: " + trains[i] + "\n"
            + "Valid: " + valids[i] + "\n"
            + "Test: " + tests[i] + "\n"
            + "Size: " + sizes[i] + "\n"
            + "RMSE: " + rmses[i]);
    }

    // Create new Excel file containing test results:
    // one row per iteration, one column per statistic.
    XLWorkbook workbook = new XLWorkbook();
    DataTable table = new DataTable("table");
    string[] columnNames = { "Training Accuracy", "Validation Accuracy", "Test Accuracy", "Average Size", "RMSE" };
    foreach (string columnName in columnNames)
    {
        table.Columns.Add(new DataColumn
        {
            DataType = typeof(double),
            ColumnName = columnName,
            Unique = false,
            AllowDBNull = false
        });
    }

    for (int i = 0; i < trains.Count; i++)
    {
        DataRow row = table.NewRow();
        row[0] = trains[i];
        row[1] = valids[i];
        row[2] = tests[i];
        row[3] = sizes[i];
        row[4] = rmses[i];
        table.Rows.Add(row);
    }

    workbook.Worksheets.Add(table);
    // ClosedXML infers the save format from the file extension and throws an
    // ArgumentException on an extension-less path, so spell it out explicitly.
    workbook.SaveAs("Example_statistics_output.xlsx");

    watch.Stop();
    TimeSpan ts = watch.Elapsed;
    string elapsedTime = String.Format("{0:00}:{1:00}:{2:00}", ts.Hours, ts.Minutes, ts.Seconds);
    Console.WriteLine("RunTime " + elapsedTime);

    // Average training time per tree: Iterations runs * FoldsPerRun trees
    // trained per run (replaces the former hard-coded divisor of 100).
    TimeSpan tt = watch.Elapsed / (Iterations * FoldsPerRun);
    string elapsedTime2 = String.Format("{0:00}:{1:00}:{2:00}.{3:00}", tt.Hours, tt.Minutes, tt.Seconds, tt.Milliseconds / 10);
    Console.WriteLine("Avg. training time (seconds) : " + elapsedTime2);
}
/// <summary>Program entry point: builds a <see cref="DecisionTree"/> and runs the experiment.</summary>
static void Main(string[] args)
{
    var program = new DecisionTree();
    program.runProgram();
}
/// <summary>
/// Runs k-fold cross-validation: partitions the dataset at
/// <paramref name="file_path"/> into <paramref name="k"/> folds, trains a
/// C4.5 tree per fold, mutates it against a validation subset, then averages
/// the statistics over all folds and emits a DOT file for the best tree.
/// </summary>
/// <param name="file_path">Path of the dataset to load.</param>
/// <param name="k">Number of cross-validation folds.</param>
/// <param name="train">Average training accuracy over the non-zero folds.</param>
/// <param name="valid">Average validation accuracy over the non-zero folds.</param>
/// <param name="test">Average testing accuracy over the non-zero folds.</param>
/// <param name="size">Average tree size (halved flattenTree length).</param>
/// <param name="rmse">Average RMSE over the non-zero folds.</param>
public void run_KMeans(string file_path, int k, out double train, out double valid, out double test, out double size, out double rmse)
{
    // Reset per-run state accumulated on this instance.
    Accuracies_training.Clear();
    Accuracies_validation.Clear();
    Accuracies_testing.Clear();
    trees.Clear();
    tree_sizes.Clear();
    rmse_list.Clear();

    DataLoader d = new DataLoader();
    DecisionTree tree = new DecisionTree();
    Accuracy a = new Accuracy();

    d.get_K_Partitions(file_path, k); // fills d.partitions with k even partitions of the dataset (each contains a header row)

    for (int i = 0; i < k; i++) // for each partition configuration
    {
        Console.WriteLine("Partition " + i + " / " + k + " ---------------------------------------------------------------");

        List<string> training_data = new List<string>();
        List<string> testing_data = new List<string>();
        List<string> validation_data = new List<string>();

        training_data.Add(d.title_row); // Add title row to top of training set

        // Fold i becomes the test set; every other fold goes into training.
        for (int j = 0; j < k; j++)
        {
            if (j != i)
            {
                for (int z = 0; z < d.partitions[j].Length; z++)
                {
                    training_data.Add(d.partitions[j][z]);
                }
            }
            else
            {
                for (int z = 0; z < d.partitions[j].Length; z++)
                {
                    testing_data.Add(d.partitions[j][z]);
                }
            }
        }

        // Reserve 50% of the training data to be the validation set
        // (move the trailing rows to validation_data).
        int s = training_data.Count / 2;
        validation_data = training_data.GetRange(training_data.Count - s, s);
        training_data.RemoveRange(training_data.Count - s, s);

        DataTable x = d.CreateTable(training_data.ToArray()); // input: string[]  output: DataTable
        List<DataColumn> all_attributes = d.getAllAttributes(x);

        Node root = tree.root = tree.RunC4_5(x, all_attributes);
        root.isRoot = true; // Set identifier of the root
        root.pruneTree(root);
        trees.Add(root);

        training_data.RemoveAt(0); // drop the title row again before scoring

        List<string> validation_subset = getValidationSubset(validation_data);

        // Optimise with respect to the validation set.
        for (int it = 0; it < 10000; it++)
        {
            // SELECT OBJECTIVE FUNCTION — alternatives kept for reference:
            //root = root.randomMutateAndRebuild_Accuracy(root); // Maximise accuracy (regardless of size)
            //root = root.randomMutateAndRebuild_RMSE(root);     // Minimise RMSE (regression trees, regardless of size)
            // The active objective is a Pareto front: minimise tree size while
            // accuracy does not degrade (stable changes are accepted).
            if ((it % 100) == 0) { validation_subset = getValidationSubset(validation_data); } // Randomise validation subset every 100 iterations
            root = root.randomMutateAndRebuild_Size(root, validation_subset.ToArray());
        }

        // Save the statistics of this partition.
        Accuracies_training.Add(a.GetAccuracy(root, training_data.ToArray()));
        Accuracies_validation.Add(a.GetAccuracy(root, validation_data.ToArray()));
        Accuracies_testing.Add(a.GetAccuracy(root, testing_data.ToArray()));
        tree_sizes.Add(Convert.ToDouble(root.flattenTree(root).Length));
        rmse_list.Add(a.getRMSE(root, testing_data.ToArray()));

        x.Clear(); // Clear DataTable so that we can begin the next C4.5 run - on the next partition
    }

    Console.WriteLine("\n\n");
    Console.WriteLine("Final report: ");

    // Zero accuracies are treated as failed folds: dropped from the list and
    // excluded from the averages.
    double training_total = SumNonZero(Accuracies_training);
    double average_training_accuracy = training_total / Accuracies_training.Count;

    double validation_total = SumNonZero(Accuracies_validation);
    double average_validation_accuracy = validation_total / Accuracies_validation.Count;

    // Testing accuracies additionally track the best-scoring fold so its
    // tree can be visualised below.
    double testing_total = 0;
    double highest_acc = double.NegativeInfinity;
    int highest_acc_index = 0;
    for (int t = 0; t < Accuracies_testing.Count; t++)
    {
        if (Accuracies_testing[t] != 0)
        {
            testing_total += Accuracies_testing[t];
            if (Accuracies_testing[t] > highest_acc)
            {
                highest_acc = Accuracies_testing[t];
                highest_acc_index = t;
            }
        }
        else
        {
            // BUG FIX: RemoveAt shifts subsequent elements down one slot, so
            // the index must step back or the next element is skipped.
            Accuracies_testing.RemoveAt(t);
            t--;
            // NOTE(review): removing entries desynchronises highest_acc_index
            // from the parallel 'trees' list (which is never pruned) — confirm
            // whether a zero test accuracy can actually occur here.
        }
    }
    double average_testing_accuracy = testing_total / Accuracies_testing.Count;

    double tot = 0;
    // Sizes are halved — presumably flattenTree counts each node twice; TODO confirm.
    foreach (double i in tree_sizes) { tot += i / 2; }
    double average_size = tot / tree_sizes.Count;

    double tot_rmse = SumNonZero(rmse_list);
    double average_rmse = tot_rmse / rmse_list.Count;

    // Set 'out' variables for collection by the caller.
    train = average_training_accuracy;
    valid = average_validation_accuracy;
    test = average_testing_accuracy;
    size = average_size;
    rmse = average_rmse;

    Console.WriteLine("Training accuracies:");
    foreach (double p in Accuracies_training) { Console.WriteLine(p); }
    Console.WriteLine("Validation accuracies:");
    foreach (double p in Accuracies_validation) { Console.WriteLine(p); }
    Console.WriteLine("Testing accuracies:");
    foreach (double p in Accuracies_testing) { Console.WriteLine(p); }
    Console.WriteLine("Average training accuracy: " + average_training_accuracy);
    Console.WriteLine("Average validation accuracy: " + average_validation_accuracy);
    Console.WriteLine("Average testing accuracy: " + average_testing_accuracy);
    Console.WriteLine("Average tree size: " + average_size);
    Console.WriteLine("Printed tree (highest test accuracy) : " + Accuracies_testing[highest_acc_index]);

    // Visualise the tree with the highest test accuracy.
    DOT_file_generator df = new DOT_file_generator();
    df.createDOTfile(trees[highest_acc_index]);
}

/// <summary>
/// Removes zero entries from <paramref name="values"/> in place and returns
/// the sum of the remaining values. Replaces three identical
/// foreach-over-Reverse sum/remove stanzas.
/// </summary>
private static double SumNonZero(List<double> values)
{
    values.RemoveAll(v => v == 0);
    double sum = 0;
    foreach (double v in values)
    {
        sum += v;
    }
    return sum;
}