Example #1
        public static int subset_errors(List <DataInstance> set, string target_attribute)
        {
            if (!hasUniformClassifier(set, target_attribute))
            {
                // No uniform classifier: count the instances that disagree with the majority value.
                string majority_value = SetHelper.mostCommonClassifier(set, target_attribute);
                List <DataInstance> set_without_majority_classifier = set.Where(A => A.getProperty(target_attribute) != majority_value).ToList();
                return(set_without_majority_classifier.Count);
            }

            // If this subset has a uniform classifier, it has 0 errors.
            return(0);
        }
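
        // Illustrative sketch only: the intended contract of subset_errors on a tiny
        // hypothetical subset. The makeInstance helper does not exist in this project;
        // it merely stands in for however DataInstances are constructed here.
        //
        //     List<DataInstance> subset = new List<DataInstance>();
        //     for (int i = 0; i < 7; i++) subset.Add(makeInstance("play", "yes"));
        //     for (int i = 0; i < 3; i++) subset.Add(makeInstance("play", "no"));
        //     int errors = subset_errors(subset, "play"); // majority is "yes", so errors == 3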
        public DecisionTree replaceNodeByNewLeaf(Node removeNode)
        {
            if (removeNode.getParent() == null)
            {
                Console.WriteLine("Tried to prune root. You sure this Decision Tree makes sense?");
                return(this);
            }
            // Create the new leaf
            List <DataInstance> total_set = new List <DataInstance>();
            List <Node>         queue     = new List <Node>();

            queue.Add(removeNode);

            // Breadth-first walk over the subtree: collect every instance its leaves currently cover.
            while (queue.Count > 0)
            {
                Node node = queue[0];
                queue.RemoveAt(0);

                foreach (Leaf child in node.getLeafChildren())
                {
                    total_set.AddRange(this.data_locations[child]);
                    this.data_locations.Remove(child);
                }

                // Add child nodes to the queue so their leaves are collected as well.
                queue.AddRange(node.getNodeChildren());
            }

            Node parent = removeNode.getParent();

            // The early return above guarantees the parent exists here.
            // Make the new leaf.
            string prediction  = SetHelper.mostCommonClassifier(total_set, this.target_attribute);
            double uncertainty = (double)SetHelper.subset_errors(total_set, this.target_attribute) / (double)total_set.Count;
            Leaf   newleaf     = this.addUncertainLeaf(removeNode.value_splitter, prediction, parent, uncertainty);
            // Make sure we can access this leaf's new subset!
            this.data_locations[newleaf] = total_set;

            // Remove the old node from its parent.
            parent.removeChildNode(removeNode);
            return(this);
        }
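
        // Worked example of the uncertainty assigned above (hypothetical numbers, not
        // taken from any real dataset): if the collapsed subtree covered 20 training
        // instances and 5 of them disagree with the majority prediction, then
        // subset_errors returns 5 and the new leaf gets uncertainty 5 / 20 = 0.25.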
        private DecisionTree addEstimationLeaf(DecisionTree tree, List <DataInstance> subset, Node parent, string value_splitter)
        {
            // We are out of attributes to split on! We need to identify the most common classifier.
            string most_common_classifier = SetHelper.mostCommonClassifier(subset, target_attribute);

            // Adjust for the uncertainty that comes with this prediction: combine the certainty of the classifier (its share of the subset) with the certainty that these instances belong here (their average weight).
            double percentage_with_this_classifier = (double)subset.Where(A => A.getProperty(target_attribute) == most_common_classifier).ToList().Count / (double)subset.Count;
            double certainty = 0;

            foreach (DataInstance instance in subset)
            {
                certainty += instance.getWeight();
            }
            certainty /= (double)subset.Count;
            certainty  = certainty * percentage_with_this_classifier;
            Leaf leaf = tree.addUncertainLeaf(value_splitter, most_common_classifier, parent, certainty);

            tree.data_locations[leaf] = subset;
            return(tree);
        }
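
        // Worked example of the certainty formula above (hypothetical numbers):
        // for a subset of 10 instances where 8 carry the most common classifier
        // and the instance weights sum to 9.0,
        //     percentage_with_this_classifier = 8 / 10   = 0.8
        //     average weight                  = 9.0 / 10 = 0.9
        //     certainty                       = 0.9 * 0.8 = 0.72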
        public DecisionTree iterate(DecisionTree tree, List <DataInstance> sets_todo, List <string> considerable_attributes, Node parent_node, string parent_value_splitter)
        {
            agent.THINK("iterate").finish();
            List <string> attributes_copy = new List <string>(considerable_attributes);
            // Find best possible way to split these sets. For each attribute we will calculate the gain, and select the highest.
            string best_attr    = "UNDETERMINED";
            double highest_gain = 0;

            foreach (string attr in attributes_copy)
            {
                agent.THINK("consider-attribute").finish();
                double my_gain = Calculator.gain(sets_todo, attr, this.target_attribute, this.possible_attribute_values[attr]);

                Dictionary <string, object> state = StateRecording.generateState("current_best_attribute", best_attr, "competing_attribute", attr, "current_best_gain", highest_gain, "competing_gain", my_gain, "parent_id", (parent_node != null) ? parent_node.identifier : "NULL", "parent_attribute", (parent_node != null) ? parent_node.label : "NULL", "previous_value_split", (parent_value_splitter != null) ? parent_value_splitter : "NULL");
                if (my_gain > highest_gain)
                {
                    agent.THINK("set-new-best-attribute").setState(state).finish();
                    best_attr    = attr;
                    highest_gain = my_gain;
                }
                else
                {
                    agent.THINK("keep-old-attribute").setState(state).finish();
                }
            }
            agent.THINK("end-attribute-loop").finish();

            if (highest_gain == 0)
            {
                // This set cannot be split any further: every attribute has been tried.
                // This happens when instances share all attribute values but differ in classifier.

                throw new Exception("This dataset contains instances with exactly the same attribute values but different classifiers, which this algorithm does not support.");

                // An earlier implementation added a 'best guess leaf' here instead of throwing, but that
                // behaviour is not described in the algorithm and was therefore left out of the experimentation.
                // The now-unreachable code is kept below, commented out, for reference.

                // agent.THINK("add-best-guess-leaf").set("best_attribute", best_attr).set("highest_gain", 0d).set("possible_attributes", attributes_copy.Count).finish();
                // string classifier_value = SetHelper.mostCommonClassifier(sets_todo, target_attribute);
                // Leaf   leaf             = tree.addBestGuessLeaf(parent_value_splitter, classifier_value, parent_node);
                // tree.data_locations[leaf] = sets_todo;
                // return(tree);
            }

            // The best attribute to split this set is now saved in best_attr. Create a node for that.
            agent.THINK("add-node").finish();

            // Remove this attribute as a splitter for the subsets below this node.
            attributes_copy.Remove(best_attr);

            // The parent value splitter tells a node which value its parent split on; decision rules need that information.
            Node new_node = tree.addNode(best_attr, parent_value_splitter, parent_node);

            // Create subsets for each possible value of the attribute we created a node for.

            foreach (string value_splitter in this.possible_attribute_values[best_attr])
            {
                agent.THINK("subset-on-value").finish();
                List <DataInstance>         subset            = sets_todo.Where(A => A.getProperty(best_attr) == value_splitter).ToList();
                Dictionary <string, object> considering_state = StateRecording.generateState("node_attribute", best_attr, "value_split", value_splitter, "current_node_id", new_node.identifier, "parent_node_id", (parent_node != null) ? parent_node.identifier : "NULL", "parent_attribute", (parent_node != null) ? parent_node.label : "NULL", "previous_value_split", (parent_value_splitter != null) ? parent_value_splitter : "NULL");
                if (subset.Count == 0)
                {
                    // This value does not occur in the current set; skip it.
                    agent.THINK("ignore-value").setState(considering_state).finish();
                    continue;
                }
                if (SetHelper.hasUniformClassifier(subset, target_attribute))
                {
                    // This subset doesn't have to be split anymore. We can just add it to the node as a leaf.
                    // Each leaf represents one decision rule.
                    agent.THINK("add-leaf").setState(considering_state).finish();
                    string classifier_value = subset.First().getProperty(target_attribute);
                    Leaf   leaf             = tree.addLeaf(value_splitter, classifier_value, new_node);
                    tree.data_locations[leaf] = subset;
                }
                else
                {
                    // This subset is not resolved yet; recurse to split it further.
                    agent.THINK("iterate-further").setState(considering_state).finish();
                    tree = this.iterate(tree, subset, attributes_copy, new_node, value_splitter);
                    // Once the recursive call returns, this subset has been fully resolved; move back up.
                }
            }
            agent.THINK("end-value-loop").finish();
            if (parent_node != null)
            {
                agent.THINK("return-tree-to-self").finish();
            }
            // We have successfully split all examples on this attribute. Return the tree in its current state.
            return(tree);
        }
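
        // Sketch of how iterate is typically kicked off to build the whole tree
        // (an assumption about the calling code, which is not shown in this example;
        // the DecisionTree constructor and the training_set / attributes locals are
        // hypothetical):
        //
        //     DecisionTree tree = new DecisionTree();
        //     tree = this.iterate(tree, training_set, attributes, null, null);
        //
        // The root call passes null for both parent_node and parent_value_splitter,
        // which is why iterate null-checks them when recording state.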