Esempio n. 1
0
        private DecisionTree pruneIterate(DecisionTree tree, List <Node> queue, string target_attribute)
        {
            // Manage queue.
            Node node = queue[0];

            queue.RemoveAt(0);

            agent.THINK("consider-node-for-pruning").finish();

            // Lets consider this node.
            List <DataInstance> node_set = new List <DataInstance>();

            // Calculate error estimate of the leafs
            double leaf_estimated_errors = 0;
            int    leaf_actual_errors    = 0;

            foreach (Leaf child in SetHelper.all_leaf_children(node))
            {
                List <DataInstance> leaf_set = tree.data_locations[child];
                node_set.AddRange(leaf_set);

                // Calculate estimated error.
                int my_errors = SetHelper.subset_errors(leaf_set, target_attribute);
                leaf_actual_errors += my_errors;
                double errorRate      = Calculator.confidenceIntervalExact(my_errors, leaf_set.Count, this.confidence);
                double estimatedError = errorRate * leaf_set.Count;
                leaf_estimated_errors += estimatedError;
            }

            // Calculate estimated error of node.
            int    node_errors        = SetHelper.subset_errors(node_set, target_attribute);
            double nodeErrorRate      = Calculator.confidenceIntervalExact(node_errors, node_set.Count, this.confidence);
            double nodeEstimatedError = nodeErrorRate * node_set.Count;

            // Compare
            // If a node has a lower estimated error than its leafs, it should be pruned.
            Dictionary <string, object> state = StateRecording.generateState("estimated_prune_errors", nodeEstimatedError, "estimated_keep_errors", leaf_estimated_errors,
                                                                             "node_attribute", node.label, "node_data_size", node_set.Count, "node_id", node.identifier, "node_value_splitter", (node.value_splitter != null) ? node.value_splitter : "NULL",
                                                                             "node_threshold", (node is ContinuousNode) ? (double?)(node as ContinuousNode).threshold : null,
                                                                             "parent_id", (node.getParent() != null) ? node.getParent().identifier : "NULL", "parent_attribute", (node.getParent() != null) ? node.getParent().label : "NULL", "parent_threshold", (node.getParent() != null && node.getParent() is ContinuousNode) ? (double?)((ContinuousNode)node.getParent()).threshold : null);

            if (nodeEstimatedError < leaf_estimated_errors)
            {
                // We need to prune!
                this.prepareSnapshot(node);
                agent.THINK("prune-node").setState(state).finish();

                tree = tree.replaceNodeByNewLeaf(node);
            }
            else
            {
                agent.THINK("keep-node").setState(state).finish();
            }

            // Iterate further if necessary.
            if (queue.Count > 0)
            {
                tree = this.pruneIterate(tree, queue, target_attribute);
            }
            return(tree);
        }
        private DecisionTree iterate(DecisionTree tree, List <DataInstance> set, Dictionary <string, string> attributes, Node parent, string last_split)
        {
            this.agent.THINK("iterate").finish();

            // Calculate gains and thresholds.
            Dictionary <string, Dictionary <string, double> > gains_and_thresholds = calculate_attribute_gain_ratios(set, target_attribute, attributes);
            Dictionary <string, double> thresholds = gains_and_thresholds["thresholds"];

            Tuple <string, Dictionary <string, List <DataInstance> > > attributeFound = this.findAttributeSplit(tree, set, attributes, gains_and_thresholds, parent, last_split);

            // We need to know what the best attribute to split on is, and  what the subsets of splitting on it would be.
            string best_split_attribute = attributeFound.Item1;
            Dictionary <string, List <DataInstance> > subsets = attributeFound.Item2;

            double threshold = -1000000;

            // This is to come to the same result as J48 [TODO: This has to go at some point]
            if (!have_been_at_root && best_split_attribute == "petal-length")
            {
                have_been_at_root = true;
                Console.WriteLine("Adjust to J48");
                best_split_attribute = "petal-width";
                threshold            = thresholds[best_split_attribute];
            }

            bool split_on_continuous = (best_split_attribute != "[INTERNAL_VARIABLE]-NOTFOUND") ? attributes[best_split_attribute] == "continuous" : false;

            // Check if a split attribute could even be found
            Dictionary <string, object> checkBestAttributeWasPossibleState = StateRecording.generateState("attribute_was_found", best_split_attribute == "[INTERNAL_VARIABLE]-NOTFOUND" ? "TRUE" : "FALSE", "best_attribute", best_split_attribute,
                                                                                                          "suggested_threshold", (split_on_continuous) ? (double?)thresholds[best_split_attribute] : null,
                                                                                                          "parent_id", (parent != null) ? parent.identifier : "NULL", "parent_attribute", (parent != null) ? parent.label : "NULL", "previous_value_split", (last_split != null) ? last_split : "", "parent_threshold", (parent != null && parent is ContinuousNode) ? (double?)((ContinuousNode)parent).threshold : null);

            if (best_split_attribute == "[INTERNAL_VARIABLE]-NOTFOUND")
            {
                // Okay so the subset we received could not be split such that it did not create too small of a leaf.
                // Therefore we will make an estimation leaf and move up.
                agent.THINK("add-estimation-leaf").setState(checkBestAttributeWasPossibleState).finish();
                tree = this.addEstimationLeaf(tree, set, parent, last_split);
                return(tree);
            }
            // If we got here then we did not return an estimation leaf and therefore we found a suitable attribute to split on!
            agent.THINK("add-node").setState(checkBestAttributeWasPossibleState).finish();

            if (split_on_continuous)
            {
                threshold = thresholds[best_split_attribute];
            }

            // Get started on making a node

            Dictionary <string, string> attributes_for_further_iteration = AttributeHelper.CopyAttributeDictionary(attributes);

            // We now know the best splitting attribute and how to split it.
            Node newnode = null;

            if (split_on_continuous)
            {
                newnode = tree.addContinuousNode(best_split_attribute, last_split, threshold, parent);
            }
            else
            {
                newnode = tree.addNode(best_split_attribute, last_split, parent);
                attributes_for_further_iteration.Remove(best_split_attribute);
            }

            // We now have a dictionary where each string represents the value split and the list of datainstances is the subset.
            foreach (string subset_splitter in subsets.Keys)
            {
                List <DataInstance> subset = subsets[subset_splitter];
                agent.THINK("subset-on-value").finish();

                bool uniformClassifier = false;

                if (subset.Count > 0)
                {
                    uniformClassifier = SetHelper.hasUniformClassifier(subset, target_attribute);
                }
                Dictionary <string, object> state = StateRecording.generateState("set_count", subset.Count, "set_has_uniform_classifier", (subset.Count > 0) ? (uniformClassifier ? "TRUE" : "FALSE") : "EMPTY SET", "chosen_attribute", best_split_attribute, "value_split", subset_splitter, "possible_attribute_count", attributes_for_further_iteration.Count,
                                                                                 "chosen_threshold", (split_on_continuous) ? (double?)thresholds[best_split_attribute] : null, "current_node_id", newnode.identifier,
                                                                                 "parent_id", (parent != null) ? parent.identifier : "NULL", "parent_attribute", (parent != null) ? parent.label : "NULL", "previous_value_split", (last_split != null) ? last_split : "", "parent_threshold", (parent != null && parent is ContinuousNode) ? (double?)((ContinuousNode)parent).threshold : null);

                if (subset.Count == 0)
                {
                    // There are no more of this subset. We need to skip this iteration.
                    agent.THINK("ignore-value").setState(state).finish();
                    continue;
                }

                if (uniformClassifier)
                {
                    // This subset doesn't have to be split anymore. We can just add it to the node as a leaf.
                    // Each leaf represents one decision rule.
                    string classifier_value = subset.First().getProperty(target_attribute);
                    double certainty        = 0;

                    // Calculate the certainty of this leaf. It's the average weight of the dataset.
                    foreach (DataInstance instance in subset)
                    {
                        certainty += instance.getWeight();
                    }
                    certainty /= (double)subset.Count;

                    agent.THINK("add-leaf").setState(state).finish();
                    Leaf leaf = tree.addUncertainLeaf(subset_splitter, classifier_value, newnode, certainty);
                    tree.data_locations[leaf] = subset;
                }
                else
                {
                    // We still haven't resolved this set. We need to iterate upon it to split it again.
                    if (attributes_for_further_iteration.Count == 0)
                    {
                        // If this happens than we have no more
                        agent.THINK("add-majority-leaf").setState(state).finish();
                        tree = this.addEstimationLeaf(tree, subset, newnode, subset_splitter);
                    }
                    else
                    {
                        // We still have attributes left, we can continue further!
                        agent.THINK("iterate-further").setState(state).finish();
                        tree = this.iterate(tree, subset, attributes_for_further_iteration, newnode, subset_splitter);
                    }
                    // If we got here in the code then the set that was previously not all the same classifier has been resolved.
                    // Therefore we can let the foreach continue further!
                }
            }

            // The set that we have received has been dealt with completely. We can now move up!
            agent.THINK("end-value-loop").finish();
            if (parent != null)
            {
                agent.THINK("return-tree-to-self").finish();
            }
            return(tree);
        }
        public DecisionTree iterate(DecisionTree tree, List <DataInstance> sets_todo, List <string> considerable_attributes, Node parent_node, string parent_value_splitter)
        {
            agent.THINK("iterate").finish();
            List <string> attributes_copy = new List <string>(considerable_attributes.ToArray());
            // Find best possible way to split these sets. For each attribute we will calculate the gain, and select the highest.
            string best_attr    = "UNDETERMINED";
            double highest_gain = 0;

            foreach (string attr in attributes_copy)
            {
                agent.THINK("consider-attribute").finish();
                double my_gain = Calculator.gain(sets_todo, attr, this.target_attribute, this.possible_attribute_values[attr]);

                Dictionary <string, object> state = StateRecording.generateState("current_best_attribute", best_attr, "competing_attribute", attr, "current_best_gain", highest_gain, "competing_gain", my_gain, "parent_id", (parent_node != null) ? parent_node.identifier : "NULL", "parent_attribute", (parent_node != null) ? parent_node.label : "NULL", "previous_value_split", (parent_value_splitter != null) ? parent_value_splitter : "NULL");
                if (my_gain > highest_gain)
                {
                    agent.THINK("set-new-best-attribute").setState(state).finish();
                    best_attr    = attr;
                    highest_gain = my_gain;
                }
                else
                {
                    agent.THINK("keep-old-attribute").setState(state).finish();
                }
            }
            agent.THINK("end-attribute-loop").finish();

            if (highest_gain == 0)
            {
                // This set cannot be split further.
                // We have tried all attributes so we can't go further. The tree ends here my friend.
                // This happens when instances have all attributes the same except for the classifier.

                throw new Exception("This dataset contains instances with exactly the same attribute values but different classifiers, which this algorithm does not support.");

                // I previously made an implementation of the algorithm that adds a 'Best Guess leaf' to address this problem,
                // but this is not described as such in the algorithm description and has therefore been left out for the experimentation.


                agent.THINK("add-best-guess-leaf").set("best_attribute", best_attr).set("highest_gain", 0d).set("possible_attributes", attributes_copy.Count).finish();
                string classifier_value = SetHelper.mostCommonClassifier(sets_todo, target_attribute);
                Leaf   leaf             = tree.addBestGuessLeaf(parent_value_splitter, classifier_value, parent_node);
                tree.data_locations[leaf] = sets_todo;
                return(tree);
            }

            // The best attribute to split this set is now saved in best_attr. Create a node for that.
            agent.THINK("add-node").finish();

            // Remove this attribute as a splitter for the dataset.
            attributes_copy.RemoveAt(considerable_attributes.IndexOf(best_attr));

            // Parent value splitter is to give a node an idea what it's parent splitted on. For decision rules this is needed information.
            Node new_node = tree.addNode(best_attr, parent_value_splitter, parent_node);

            // Create subsets for each possible value of the attribute we created a node for.
            int values_left = this.possible_attribute_values[best_attr].Count;

            foreach (string value_splitter in this.possible_attribute_values[best_attr])
            {
                agent.THINK("subset-on-value").finish();
                List <DataInstance>         subset            = sets_todo.Where(A => A.getProperty(best_attr) == value_splitter).ToList();
                Dictionary <string, object> considering_state = StateRecording.generateState("node_attribute", best_attr, "value_split", value_splitter, "current_node_id", new_node.identifier, "parent_node_id", (parent_node != null) ? parent_node.identifier : "NULL", "parent_attribute", (parent_node != null) ? parent_node.label : "NULL", "previous_value_split", (parent_value_splitter != null) ? parent_value_splitter : "NULL");
                if (subset.Count == 0)
                {
                    // There are no more of this subset. We need to skip this iteration.
                    agent.THINK("ignore-value").setState(considering_state).finish();
                    continue;
                }
                if (SetHelper.hasUniformClassifier(subset, target_attribute))
                {
                    // This subset doesn't have to be split anymore. We can just add it to the node as a leaf.
                    // Each leaf represents one decision rule.
                    agent.THINK("add-leaf").setState(considering_state).finish();
                    string classifier_value = subset.First().getProperty(target_attribute);
                    Leaf   leaf             = tree.addLeaf(value_splitter, classifier_value, new_node);
                    tree.data_locations[leaf] = subset;
                }
                else
                {
                    // We still haven't resolved this set. We need to iterate upon it to split it again.
                    agent.THINK("iterate-further").setState(considering_state).finish();
                    tree = this.iterate(tree, subset, attributes_copy, new_node, value_splitter);
                    // If we got here in the code then the set that was previously not all the same classifier has been resolved. We need to move up.
                }
                values_left -= 1;
            }
            agent.THINK("end-value-loop").finish();
            if (parent_node != null)
            {
                agent.THINK("return-tree-to-self").finish();
            }
            // We have succesfully split all examples on this attribute. Return the tree in its current state.
            return(tree);
        }
        private Tuple <string, Dictionary <string, List <DataInstance> > > findAttributeSplit(DecisionTree tree, List <DataInstance> set, Dictionary <string, string> attributes, Dictionary <string, Dictionary <string, double> > gains_and_thresholds, Node parent, string last_split)
        {
            // Get gains and thresholds from parameters
            Dictionary <string, double> gains      = gains_and_thresholds["gains"];
            Dictionary <string, double> thresholds = gains_and_thresholds["thresholds"];

            // Select the best attribute to split on
            double  highest_gain_ratio   = -1;
            string  best_split_attribute = "[INTERNAL_VARIABLE]-NOTFOUND";
            Boolean split_on_continuous  = false;
            double  threshold            = 0;
            Dictionary <string, List <DataInstance> > subsets = null;

            foreach (string competing_attribute in attributes.Keys.ToList())
            {
                agent.THINK("consider-attribute").finish();
                double my_gain_ratio           = gains[competing_attribute];
                bool   competing_is_continuous = (attributes[competing_attribute] == "continuous");
                Dictionary <string, object> comparingAttributeState = StateRecording.generateState("current_best_attribute", best_split_attribute, "competing_attribute", competing_attribute, "current_best_gain", highest_gain_ratio, "competing_gain", my_gain_ratio,
                                                                                                   "current_best_threshold", (split_on_continuous) ? (double?)threshold : null, "competing_threshold", (competing_is_continuous) ? (double?)thresholds[competing_attribute] : null,
                                                                                                   "parent_id", (parent != null) ? parent.identifier : "NULL", "parent_attribute", (parent != null) ? parent.label : "NULL", "previous_value_split", (last_split != null) ? last_split : "NULL", "parent_threshold", (parent != null && parent is ContinuousNode) ? (double?)((ContinuousNode)parent).threshold : null);

                if (my_gain_ratio > highest_gain_ratio)
                {
                    // This attribute has the potential to become the new best attribute, but first we need to make sure splitting on this
                    // attribute will not result in a leaf that has a subset lower than the minimum leaf size.
                    agent.THINK("propose-competing-attribute").setState(comparingAttributeState).finish();

                    Dictionary <string, List <DataInstance> > competing_subsets = (competing_is_continuous) ? SetHelper.subsetOnAttributeContinuous(set, competing_attribute, thresholds[competing_attribute]) : SetHelper.subsetOnAttributeNominal(set, competing_attribute, possible_nominal_values[competing_attribute]);
                    int subsets_above_minimum_requirement = 0;

                    foreach (string value_splitter in competing_subsets.Keys.ToList())
                    {
                        List <DataInstance> subset = competing_subsets[value_splitter];

                        // If at least one of these subsets has less instances than the minimum leaf size, then this split should NOT happen.
                        if (subset.Count >= minimum_leaf_size)
                        {
                            subsets_above_minimum_requirement++;
                        }
                    }

                    // I could not find proof of how J48 determines how many 'wrong' subsets are allowed.
                    // I found a suggestion (https://stackoverflow.com/questions/21762161/what-does-the-minnumobj-parameter-do-in-j48-classifier-weka)
                    // And that works perfectly like J48 so I assume that this is how they do it.
                    Dictionary <string, object> verifyCompetingAttributeState = StateRecording.generateState("minimum_objects", minimum_leaf_size, "valid_subset_count", subsets_above_minimum_requirement, "chosen_attribute", best_split_attribute,
                                                                                                             "suggested_threshold", (split_on_continuous) ? (double?)thresholds[best_split_attribute] : null,
                                                                                                             "parent_id", (parent != null) ? parent.identifier : "NULL", "parent_attribute", (parent != null) ? parent.label : "NULL", "previous_value_split", (last_split != null) ? last_split : "", "parent_threshold", (parent != null && parent is ContinuousNode) ? (double?)((ContinuousNode)parent).threshold : null);

                    if (subsets_above_minimum_requirement < 2)
                    {
                        // Although this attribute has a better gain ratio than the best one we have now, it also forces us to create a leaf
                        // that is below the minimum leaf size and therefore we cannot choose this one!
                        agent.THINK("disregard-competing-attribute").setState(verifyCompetingAttributeState).finish();
                    }
                    else
                    {
                        agent.THINK("allow-competing-attribute").setState(verifyCompetingAttributeState).finish();
                        highest_gain_ratio   = my_gain_ratio;
                        best_split_attribute = competing_attribute;
                        split_on_continuous  = competing_is_continuous;
                        subsets = competing_subsets;
                        if (split_on_continuous)
                        {
                            threshold = thresholds[competing_attribute];
                        }
                    }
                }
                else
                {
                    // Previous attribute had a better gain ratio
                    agent.THINK("keep-best-attribute").setState(comparingAttributeState).finish();
                }
            }
            agent.THINK("end-attribute-loop").finish();
            return(new Tuple <string, Dictionary <string, List <DataInstance> > >(best_split_attribute, subsets));
        }