// Computes the gain ratio of splitting S on wanted_attribute: gain(...) divided by splitInfo(...).
//
// S                 - instances the split is evaluated on.
// wanted_attribute  - attribute the set would be split on.
// targetAttribute   - attribute that holds the classifier.
// possible_values   - values of wanted_attribute handed to gain(...) and splitInfo(...).
//
// Returns gain / splitInfo, or 0 when splitInfo is exactly 0 (the ratio would be undefined).
//
// NOTE(review): no missing-data adjustment (e.g. a (1 - missingFraction) weighting) is applied
// to the result here - confirm that is intended.
public static double gainRatio(List<DataInstance> S, string wanted_attribute, string targetAttribute, List<string> possible_values)
{
    double gain_result = gain(S, wanted_attribute, targetAttribute, possible_values);
    double splitinfo = splitInfo(S, wanted_attribute, possible_values);

    // Guard against dividing by zero.
    if (splitinfo == 0)
    {
        return 0;
    }

    return gain_result / splitinfo;
}
// Replaces removeNode, together with everything below it, by a single new leaf on removeNode's parent.
//
// All instances stored in data_locations for the leaves underneath removeNode are collected
// (their data_locations entries are removed), and the new leaf predicts the most common
// classifier of that combined set. Its uncertainty is subset_errors / instance count.
//
// removeNode - the node to prune. When it has no parent (the root) a message is printed
//              and the tree is returned unchanged.
//
// Returns this tree.
public DecisionTree replaceNodeByNewLeaf(Node removeNode)
{
    Node parent = removeNode.getParent();
    if (parent == null)
    {
        Console.WriteLine("Tried to prune root. You sure this Decision Tree makes sense?");
        return this;
    }

    // Breadth-first walk over removeNode and its descendant nodes, gathering every instance
    // that the new leaf has to cover.
    List<DataInstance> total_set = new List<DataInstance>();
    Queue<Node> pending = new Queue<Node>();
    pending.Enqueue(removeNode);
    while (pending.Count > 0)
    {
        Node node = pending.Dequeue();

        foreach (Leaf child in node.getLeafChildren())
        {
            total_set.AddRange(this.data_locations[child]);
            this.data_locations.Remove(child);
        }

        // Queue the child nodes so their leaves get collected as well.
        foreach (Node childNode in node.getNodeChildren())
        {
            pending.Enqueue(childNode);
        }
    }

    // Make the new leaf.
    // NOTE(review): if no instances were collected the division below yields NaN - confirm
    // that a node without any leaf data cannot reach this point.
    string prediction = SetHelper.mostCommonClassifier(total_set, this.target_attribute);
    double uncertainty = (double)SetHelper.subset_errors(total_set, this.target_attribute) / (double)total_set.Count;
    Leaf newleaf = this.addUncertainLeaf(removeNode.value_splitter, prediction, parent, uncertainty);

    // Make sure we can access this leaf's new subset.
    this.data_locations[newleaf] = total_set;

    // Finally detach the old node from its parent.
    parent.removeChildNode(removeNode);

    return this;
}
// Adds a leaf to parent that predicts the most common classifier of subset, and registers
// subset as that leaf's data in tree.data_locations.
//
// The value passed to addUncertainLeaf is
//   (average instance weight of subset) * (share of subset that has the most common classifier).
//
// tree           - tree that receives the leaf.
// subset         - instances that end up in the leaf.
// parent         - node the leaf is attached to.
// value_splitter - value split that leads to this leaf.
//
// Returns the same tree instance that was passed in.
private DecisionTree addEstimationLeaf(DecisionTree tree, List<DataInstance> subset, Node parent, string value_splitter)
{
    string majorityClass = SetHelper.mostCommonClassifier(subset, target_attribute);

    // Share of the subset that actually carries the majority classifier.
    int majorityCount = subset.Count(member => member.getProperty(target_attribute) == majorityClass);
    double majorityShare = (double)majorityCount / (double)subset.Count;

    // Average weight of the instances in the subset.
    double weightTotal = 0;
    foreach (DataInstance member in subset)
    {
        weightTotal += member.getWeight();
    }
    double averageWeight = weightTotal / (double)subset.Count;

    double combinedCertainty = averageWeight * majorityShare;

    Leaf estimationLeaf = tree.addUncertainLeaf(value_splitter, majorityClass, parent, combinedCertainty);
    tree.data_locations[estimationLeaf] = subset;
    return tree;
}
// Works through queue front to back and, for each node, decides whether it should be
// replaced by a single leaf (see considerNodeForPruning).
//
// tree             - tree being pruned.
// queue            - nodes still to consider; entries are removed from the front as they are handled.
// target_attribute - attribute that holds the classifier.
//
// Returns the tree after all queued nodes have been considered. An empty queue returns the
// tree untouched.
private DecisionTree pruneIterate(DecisionTree tree, List<Node> queue, string target_attribute)
{
    // Iterative on purpose: one stack frame per queued node is not needed, since nothing
    // happens after handling the rest of the queue.
    while (queue.Count > 0)
    {
        Node node = queue[0];
        queue.RemoveAt(0);
        tree = this.considerNodeForPruning(tree, node, target_attribute);
    }

    return tree;
}

// Compares the estimated error of keeping node's leaves with the estimated error of
// collapsing node into one leaf, and prunes the node when collapsing scores strictly lower.
private DecisionTree considerNodeForPruning(DecisionTree tree, Node node, string target_attribute)
{
    agent.THINK("consider-node-for-pruning").finish();

    // Sum the estimated errors of the individual leaves, and gather all their instances.
    List<DataInstance> node_set = new List<DataInstance>();
    double leaf_estimated_errors = 0;
    foreach (Leaf child in SetHelper.all_leaf_children(node))
    {
        List<DataInstance> leaf_set = tree.data_locations[child];
        node_set.AddRange(leaf_set);

        int my_errors = SetHelper.subset_errors(leaf_set, target_attribute);
        double errorRate = Calculator.confidenceIntervalExact(my_errors, leaf_set.Count, this.confidence);
        leaf_estimated_errors += errorRate * leaf_set.Count;
    }

    // Estimated error when the whole node is treated as one set.
    int node_errors = SetHelper.subset_errors(node_set, target_attribute);
    double nodeErrorRate = Calculator.confidenceIntervalExact(node_errors, node_set.Count, this.confidence);
    double nodeEstimatedError = nodeErrorRate * node_set.Count;

    Node parentNode = node.getParent();
    ContinuousNode continuousNode = node as ContinuousNode;
    ContinuousNode continuousParent = parentNode as ContinuousNode;

    Dictionary<string, object> state = StateRecording.generateState(
        "estimated_prune_errors", nodeEstimatedError,
        "estimated_keep_errors", leaf_estimated_errors,
        "node_attribute", node.label,
        "node_data_size", node_set.Count,
        "node_id", node.identifier,
        "node_value_splitter", (node.value_splitter != null) ? node.value_splitter : "NULL",
        "node_threshold", (continuousNode != null) ? (double?)continuousNode.threshold : null,
        "parent_id", (parentNode != null) ? parentNode.identifier : "NULL",
        "parent_attribute", (parentNode != null) ? parentNode.label : "NULL",
        "parent_threshold", (continuousParent != null) ? (double?)continuousParent.threshold : null);

    // If the node as a whole has a lower estimated error than its leaves, it should be pruned.
    if (nodeEstimatedError < leaf_estimated_errors)
    {
        this.prepareSnapshot(node);
        agent.THINK("prune-node").setState(state).finish();
        tree = tree.replaceNodeByNewLeaf(node);
    }
    else
    {
        agent.THINK("keep-node").setState(state).finish();
    }

    return tree;
}
// Grows the tree for sets_todo: picks the attribute with the highest gain, adds a node for it
// and then handles the subset of every possible value of that attribute (leaf when the subset
// has a uniform classifier, recursion otherwise).
//
// tree                    - tree being built.
// sets_todo               - instances that still have to be placed in the tree.
// considerable_attributes - attributes that may still be used to split; not modified.
// parent_node             - node the new node hangs under, or null at the root.
// parent_value_splitter   - value of the parent's attribute that led here, or null at the root.
//
// Returns the tree after this set has been handled.
// Throws Exception when no attribute yields a gain above 0, i.e. the set cannot be split further.
public DecisionTree iterate(DecisionTree tree, List<DataInstance> sets_todo, List<string> considerable_attributes, Node parent_node, string parent_value_splitter)
{
    agent.THINK("iterate").finish();

    // Work on a copy so the caller's list stays intact for sibling branches.
    List<string> attributes_copy = new List<string>(considerable_attributes);

    // Find the best possible way to split this set: calculate the gain for each attribute
    // and select the highest.
    string best_attr = "UNDETERMINED";
    double highest_gain = 0;
    foreach (string attr in attributes_copy)
    {
        agent.THINK("consider-attribute").finish();
        double my_gain = Calculator.gain(sets_todo, attr, this.target_attribute, this.possible_attribute_values[attr]);

        Dictionary<string, object> state = StateRecording.generateState(
            "current_best_attribute", best_attr,
            "competing_attribute", attr,
            "current_best_gain", highest_gain,
            "competing_gain", my_gain,
            "parent_id", (parent_node != null) ? parent_node.identifier : "NULL",
            "parent_attribute", (parent_node != null) ? parent_node.label : "NULL",
            "previous_value_split", parent_value_splitter ?? "NULL");

        if (my_gain > highest_gain)
        {
            agent.THINK("set-new-best-attribute").setState(state).finish();
            best_attr = attr;
            highest_gain = my_gain;
        }
        else
        {
            agent.THINK("keep-old-attribute").setState(state).finish();
        }
    }
    agent.THINK("end-attribute-loop").finish();

    // highest_gain starts at exactly 0 and is only ever replaced by a strictly greater value,
    // so comparing with == is exact here.
    if (highest_gain == 0)
    {
        // This set cannot be split further: all attributes have been tried. This happens when
        // instances have all attributes the same except for the classifier.
        // A 'best guess leaf' could address this, but that is not part of the algorithm
        // description and has therefore been left out.
        throw new Exception("This dataset contains instances with exactly the same attribute values but different classifiers, which this algorithm does not support.");
    }

    // The best attribute to split this set is now saved in best_attr. Create a node for that.
    agent.THINK("add-node").finish();

    // This attribute may not be used as a splitter again further down this branch.
    attributes_copy.Remove(best_attr);

    // The parent value splitter tells a node what its parent split on; decision rules need that.
    Node new_node = tree.addNode(best_attr, parent_value_splitter, parent_node);

    // Create subsets for each possible value of the attribute we created a node for.
    foreach (string value_splitter in this.possible_attribute_values[best_attr])
    {
        agent.THINK("subset-on-value").finish();
        List<DataInstance> subset = sets_todo.Where(A => A.getProperty(best_attr) == value_splitter).ToList();

        Dictionary<string, object> considering_state = StateRecording.generateState(
            "node_attribute", best_attr,
            "value_split", value_splitter,
            "current_node_id", new_node.identifier,
            "parent_node_id", (parent_node != null) ? parent_node.identifier : "NULL",
            "parent_attribute", (parent_node != null) ? parent_node.label : "NULL",
            "previous_value_split", parent_value_splitter ?? "NULL");

        if (subset.Count == 0)
        {
            // No instances have this value; skip it.
            agent.THINK("ignore-value").setState(considering_state).finish();
            continue;
        }

        if (SetHelper.hasUniformClassifier(subset, target_attribute))
        {
            // This subset doesn't have to be split anymore: add it to the node as a leaf.
            // Each leaf represents one decision rule.
            agent.THINK("add-leaf").setState(considering_state).finish();
            string classifier_value = subset.First().getProperty(target_attribute);
            Leaf leaf = tree.addLeaf(value_splitter, classifier_value, new_node);
            tree.data_locations[leaf] = subset;
        }
        else
        {
            // Still mixed classifiers: split this subset again on the remaining attributes.
            agent.THINK("iterate-further").setState(considering_state).finish();
            tree = this.iterate(tree, subset, attributes_copy, new_node, value_splitter);
        }
    }
    agent.THINK("end-value-loop").finish();

    if (parent_node != null)
    {
        agent.THINK("return-tree-to-self").finish();
    }

    // All examples have been split on this attribute. Return the tree in its current state.
    return tree;
}
// Grows the tree for set using gain ratios and thresholds: finds the attribute to split on
// (see findAttributeSplit), adds a nominal or continuous node for it and then handles every
// subset of that split (leaf, estimation leaf or recursion).
//
// tree       - tree being built.
// set        - instances that still have to be placed in the tree.
// attributes - attribute name -> attribute kind; the value "continuous" marks a continuous attribute.
// parent     - node the new node hangs under, or null at the root.
// last_split - value split of the parent that led here, or null at the root.
//
// Returns the tree after this set has been handled.
private DecisionTree iterate(DecisionTree tree, List<DataInstance> set, Dictionary<string, string> attributes, Node parent, string last_split)
{
    // Sentinel returned by findAttributeSplit when no acceptable split exists.
    const string attribute_not_found = "[INTERNAL_VARIABLE]-NOTFOUND";

    this.agent.THINK("iterate").finish();

    // Calculate gains and thresholds.
    Dictionary<string, Dictionary<string, double>> gains_and_thresholds = calculate_attribute_gain_ratios(set, target_attribute, attributes);
    Dictionary<string, double> thresholds = gains_and_thresholds["thresholds"];

    // We need to know what the best attribute to split on is, and what the subsets of
    // splitting on it would be.
    Tuple<string, Dictionary<string, List<DataInstance>>> attributeFound = this.findAttributeSplit(tree, set, attributes, gains_and_thresholds, parent, last_split);
    string best_split_attribute = attributeFound.Item1;
    Dictionary<string, List<DataInstance>> subsets = attributeFound.Item2;

    // This is to come to the same result as J48 [TODO: This has to go at some point]
    // NOTE(review): after this override `subsets` still holds the partition that was made for
    // "petal-length", while the node below is created for "petal-width" with its threshold.
    // Confirm both partitions coincide for the data this is used on, or recompute the subsets.
    if (!have_been_at_root && best_split_attribute == "petal-length")
    {
        have_been_at_root = true;
        Console.WriteLine("Adjust to J48");
        best_split_attribute = "petal-width";
    }

    bool attribute_was_found = best_split_attribute != attribute_not_found;
    bool split_on_continuous = attribute_was_found && attributes[best_split_attribute] == "continuous";
    ContinuousNode continuousParent = parent as ContinuousNode;

    // Check if a split attribute could even be found.
    Dictionary<string, object> checkBestAttributeWasPossibleState = StateRecording.generateState(
        "attribute_was_found", attribute_was_found ? "TRUE" : "FALSE",
        "best_attribute", best_split_attribute,
        "suggested_threshold", (split_on_continuous) ? (double?)thresholds[best_split_attribute] : null,
        "parent_id", (parent != null) ? parent.identifier : "NULL",
        "parent_attribute", (parent != null) ? parent.label : "NULL",
        "previous_value_split", last_split ?? "",
        "parent_threshold", (continuousParent != null) ? (double?)continuousParent.threshold : null);

    if (!attribute_was_found)
    {
        // The subset we received could not be split without creating too small a leaf.
        // Therefore we make an estimation leaf and move up.
        agent.THINK("add-estimation-leaf").setState(checkBestAttributeWasPossibleState).finish();
        return this.addEstimationLeaf(tree, set, parent, last_split);
    }

    // We found a suitable attribute to split on.
    agent.THINK("add-node").setState(checkBestAttributeWasPossibleState).finish();

    // A nominal attribute is used up by this split; a continuous one stays available further down.
    Dictionary<string, string> attributes_for_further_iteration = AttributeHelper.CopyAttributeDictionary(attributes);
    Node newnode;
    if (split_on_continuous)
    {
        newnode = tree.addContinuousNode(best_split_attribute, last_split, thresholds[best_split_attribute], parent);
    }
    else
    {
        newnode = tree.addNode(best_split_attribute, last_split, parent);
        attributes_for_further_iteration.Remove(best_split_attribute);
    }

    // Each key of subsets is a value split, each value the instances that fall under it.
    foreach (string subset_splitter in subsets.Keys)
    {
        List<DataInstance> subset = subsets[subset_splitter];
        agent.THINK("subset-on-value").finish();

        bool uniformClassifier = subset.Count > 0 && SetHelper.hasUniformClassifier(subset, target_attribute);

        Dictionary<string, object> state = StateRecording.generateState(
            "set_count", subset.Count,
            "set_has_uniform_classifier", (subset.Count > 0) ? (uniformClassifier ? "TRUE" : "FALSE") : "EMPTY SET",
            "chosen_attribute", best_split_attribute,
            "value_split", subset_splitter,
            "possible_attribute_count", attributes_for_further_iteration.Count,
            "chosen_threshold", (split_on_continuous) ? (double?)thresholds[best_split_attribute] : null,
            "current_node_id", newnode.identifier,
            "parent_id", (parent != null) ? parent.identifier : "NULL",
            "parent_attribute", (parent != null) ? parent.label : "NULL",
            "previous_value_split", last_split ?? "",
            "parent_threshold", (continuousParent != null) ? (double?)continuousParent.threshold : null);

        if (subset.Count == 0)
        {
            // No instances fall under this value split; skip it.
            agent.THINK("ignore-value").setState(state).finish();
            continue;
        }

        if (uniformClassifier)
        {
            // This subset doesn't have to be split anymore: add it to the node as a leaf.
            // Each leaf represents one decision rule.
            string classifier_value = subset.First().getProperty(target_attribute);

            // The certainty of this leaf is the average weight of its instances.
            double certainty = 0;
            foreach (DataInstance instance in subset)
            {
                certainty += instance.getWeight();
            }
            certainty /= (double)subset.Count;

            agent.THINK("add-leaf").setState(state).finish();
            Leaf leaf = tree.addUncertainLeaf(subset_splitter, classifier_value, newnode, certainty);
            tree.data_locations[leaf] = subset;
        }
        else if (attributes_for_further_iteration.Count == 0)
        {
            // Mixed classifiers but no attributes left to split on: settle for a majority leaf.
            agent.THINK("add-majority-leaf").setState(state).finish();
            tree = this.addEstimationLeaf(tree, subset, newnode, subset_splitter);
        }
        else
        {
            // We still have attributes left, so this subset can be split further.
            agent.THINK("iterate-further").setState(state).finish();
            tree = this.iterate(tree, subset, attributes_for_further_iteration, newnode, subset_splitter);
        }
    }

    // The set that we have received has been dealt with completely. We can now move up.
    agent.THINK("end-value-loop").finish();
    if (parent != null)
    {
        agent.THINK("return-tree-to-self").finish();
    }

    return tree;
}
// Selects the attribute to split set on, together with the subsets that split produces.
//
// Attributes are considered in the order of attributes.Keys. A candidate replaces the current
// best only when its gain ratio is strictly higher AND at least two of its subsets contain
// minimum_leaf_size instances or more.
//
// tree                 - not used by this method.
// set                  - instances to split.
// attributes           - attribute name -> attribute kind; "continuous" marks a continuous attribute.
// gains_and_thresholds - must contain the entries "gains" and "thresholds", both keyed by attribute name.
// parent, last_split   - only used for the recorded state.
//
// Returns (attribute name, subsets per value split). When no candidate was accepted the name
// is "[INTERNAL_VARIABLE]-NOTFOUND" and the subsets are null.
private Tuple<string, Dictionary<string, List<DataInstance>>> findAttributeSplit(DecisionTree tree, List<DataInstance> set, Dictionary<string, string> attributes, Dictionary<string, Dictionary<string, double>> gains_and_thresholds, Node parent, string last_split)
{
    Dictionary<string, double> gains = gains_and_thresholds["gains"];
    Dictionary<string, double> thresholds = gains_and_thresholds["thresholds"];

    // Current winner; starts out as "nothing found".
    string winner = "[INTERNAL_VARIABLE]-NOTFOUND";
    double winnerRatio = -1;
    bool winnerIsContinuous = false;
    double winnerThreshold = 0;
    Dictionary<string, List<DataInstance>> winnerSubsets = null;

    List<string> candidates = attributes.Keys.ToList();
    for (int index = 0; index < candidates.Count; index++)
    {
        string candidate = candidates[index];
        agent.THINK("consider-attribute").finish();

        double candidateRatio = gains[candidate];
        bool candidateIsContinuous = (attributes[candidate] == "continuous");

        Dictionary<string, object> comparisonState = StateRecording.generateState(
            "current_best_attribute", winner,
            "competing_attribute", candidate,
            "current_best_gain", winnerRatio,
            "competing_gain", candidateRatio,
            "current_best_threshold", (winnerIsContinuous) ? (double?)winnerThreshold : null,
            "competing_threshold", (candidateIsContinuous) ? (double?)thresholds[candidate] : null,
            "parent_id", (parent != null) ? parent.identifier : "NULL",
            "parent_attribute", (parent != null) ? parent.label : "NULL",
            "previous_value_split", (last_split != null) ? last_split : "NULL",
            "parent_threshold", (parent != null && parent is ContinuousNode) ? (double?)((ContinuousNode)parent).threshold : null);

        // Written as a negated '>' so a NaN ratio is rejected exactly like any non-improving ratio.
        if (!(candidateRatio > winnerRatio))
        {
            // The current best attribute has the better (or an equal) gain ratio.
            agent.THINK("keep-best-attribute").setState(comparisonState).finish();
            continue;
        }

        // The candidate scores better, but it is only accepted when the split does not leave
        // us with too few subsets of at least the minimum leaf size.
        agent.THINK("propose-competing-attribute").setState(comparisonState).finish();

        Dictionary<string, List<DataInstance>> candidateSubsets;
        if (candidateIsContinuous)
        {
            candidateSubsets = SetHelper.subsetOnAttributeContinuous(set, candidate, thresholds[candidate]);
        }
        else
        {
            candidateSubsets = SetHelper.subsetOnAttributeNominal(set, candidate, possible_nominal_values[candidate]);
        }

        int largeEnoughSubsets = candidateSubsets.Values.Count(part => part.Count >= minimum_leaf_size);

        // How J48 decides the number of too-small subsets it tolerates is taken from
        // https://stackoverflow.com/questions/21762161/what-does-the-minnumobj-parameter-do-in-j48-classifier-weka
        // NOTE(review): "chosen_attribute" and "suggested_threshold" below describe the current
        // best attribute, not the candidate under verification - confirm that is intended.
        Dictionary<string, object> verificationState = StateRecording.generateState(
            "minimum_objects", minimum_leaf_size,
            "valid_subset_count", largeEnoughSubsets,
            "chosen_attribute", winner,
            "suggested_threshold", (winnerIsContinuous) ? (double?)thresholds[winner] : null,
            "parent_id", (parent != null) ? parent.identifier : "NULL",
            "parent_attribute", (parent != null) ? parent.label : "NULL",
            "previous_value_split", (last_split != null) ? last_split : "",
            "parent_threshold", (parent != null && parent is ContinuousNode) ? (double?)((ContinuousNode)parent).threshold : null);

        if (largeEnoughSubsets < 2)
        {
            // Better gain ratio, but fewer than two subsets reach the minimum leaf size,
            // so this attribute cannot be chosen.
            agent.THINK("disregard-competing-attribute").setState(verificationState).finish();
            continue;
        }

        agent.THINK("allow-competing-attribute").setState(verificationState).finish();
        winnerRatio = candidateRatio;
        winner = candidate;
        winnerIsContinuous = candidateIsContinuous;
        winnerSubsets = candidateSubsets;
        if (winnerIsContinuous)
        {
            winnerThreshold = thresholds[candidate];
        }
    }

    agent.THINK("end-attribute-loop").finish();
    return new Tuple<string, Dictionary<string, List<DataInstance>>>(winner, winnerSubsets);
}
// Finds the binary split (value <= threshold versus value > threshold) on a continuous
// attribute that yields the highest gain.
//
// S                - instances to evaluate; instances without a value for wanted_attribute
//                    end up on neither side of a split but still count in the proportions.
// wanted_attribute - continuous attribute to split on.
// target_attribute - attribute that holds the classifier.
// supplied_values  - candidate thresholds, tried in the given order. When empty, the distinct
//                    values present in S are used, in ascending numeric order.
//
// Returns { best threshold, gain of that threshold }. Despite the method name the second
// element is the gain, not the gain ratio: the split is selected by gain, like J48 does it.
// When no candidate beats the initial gain of -1 the result is { 0, -1 } and a message is printed.
public static double[] best_split_and_ratio_for_continuous(List<DataInstance> S, string wanted_attribute, string target_attribute, List<double> supplied_values)
{
    double total_set_entropy = entropy(S, target_attribute);

    // Only instances that have a value can fall on either side of a split; filter them once.
    List<DataInstance> with_value = S.Where(o => o.getProperty(wanted_attribute) != null).ToList();

    List<double> possible_values;
    if (supplied_values.Count == 0)
    {
        // Nothing supplied: derive the candidates from the instances themselves.
        // Ordering is numeric, so that among equally good splits the lowest threshold wins.
        possible_values = with_value
            .Select(o => o.getPropertyAsDouble(wanted_attribute))
            .Distinct()
            .OrderBy(value => value)
            .ToList();
    }
    else
    {
        // Safe to reference the supplied list directly: it is never modified here.
        possible_values = supplied_values;
    }

    double total_count = (double)S.Count;

    double best_split = 0;
    double best_split_gain = -1;
    bool found_better_than_nothing = false;
    foreach (double binary_split in possible_values)
    {
        // Subsets below-or-equal and above the current binary split.
        List<DataInstance> s_below_or_equal = with_value.Where(o => o.getPropertyAsDouble(wanted_attribute) <= binary_split).ToList();
        List<DataInstance> s_above = with_value.Where(o => o.getPropertyAsDouble(wanted_attribute) > binary_split).ToList();

        double proportion_below_or_equal = ((double)s_below_or_equal.Count) / total_count;
        double proportion_above = ((double)s_above.Count) / total_count;

        // Gain of splitting on this binary split.
        double gain_on_this_split = total_set_entropy
            - (proportion_below_or_equal * entropy(s_below_or_equal, target_attribute))
            - (proportion_above * entropy(s_above, target_attribute));

        // Strictly greater: on ties the earlier candidate is kept.
        if (gain_on_this_split > best_split_gain)
        {
            found_better_than_nothing = true;
            best_split_gain = gain_on_this_split;
            best_split = binary_split;
        }
    }

    if (!found_better_than_nothing)
    {
        Console.WriteLine($"No gain ratio could be found for this attribute {wanted_attribute}");
    }

    return new double[] { best_split, best_split_gain };
}