/// <summary>
/// Captures this node's split information (split variable name plus child nodes),
/// then rebuilds the node's partition and re-splits it by the class variable,
/// routing information gain through <c>HandleCost</c> when a cost handler is attached.
/// </summary>
/// <returns>
/// A pair of (this node's split variable name, its children dictionary), captured
/// before the partition is re-split.
/// </returns>
public KeyValuePair <string, Dictionary <string, DecisionTreeNode <T> > > Prune()
{
    // Snapshot the node's structure up front so the returned pair reflects the
    // state before the partition update below.
    KeyValuePair <string, Dictionary <string, DecisionTreeNode <T> > > tree_info =
        new KeyValuePair <string, Dictionary <string, DecisionTreeNode <T> > >(mSplitVariableName, mChildren);

    // Rebuild this node's partition and split it on the class variable.
    // The lambda applies the attribute-cost adjustment only when a handler is set.
    DecisionTreePartition <T> node_partition = DecisionTreePartition <T> .Create(this);
    Dictionary <string, DecisionTreePartition <T> > class_sub_partitions =
        DecisionTreeMethods.SplitPartitionByClassVariable <T>(
            node_partition,
            (gain, feature_name) => HandleCost == null ? gain : HandleCost(gain, feature_name));
    node_partition.UpdateSubPartitions(class_sub_partitions, DecisionTree <T> .ClassVariableName);

    return tree_info;
}
/// <summary>
/// Recursively grows the decision tree by splitting <paramref name="partition"/> on the
/// feature variable with the highest (cost-adjusted) information gain. When no features
/// remain, or no candidate feature yields a positive gain, the partition is instead
/// split by the class variable, making its children leaf nodes.
/// </summary>
/// <typeparam name="T">Data record type held by the partition.</typeparam>
/// <param name="partition">The partition of records to split.</param>
/// <param name="feature_variable_names">
/// Feature variables still available for splitting. Outside a random forest, the
/// selected feature is removed from this set before recursing.
/// </param>
/// <param name="handle_cost">
/// Callback that adjusts a feature's information gain by its attribute cost.
/// </param>
public static void Split <T>(DecisionTreePartition <T> partition, HashSet <string> feature_variable_names, DoAttributeCostGainHandle handle_cost) where T : DDataRecord
{
    // No features left: finalize this branch by splitting on the class variable.
    if (feature_variable_names.Count == 0)
    {
        Dictionary <string, DecisionTreePartition <T> > sub_partitions_by_class_variable = SplitPartitionByClassVariable(partition, handle_cost);
        partition.UpdateSubPartitions(sub_partitions_by_class_variable, DecisionTree <T> .ClassVariableName);
        return;
    }

    HashSet <string> temp_feature_variable_names;
    if (partition.Forest == null)
    {
        temp_feature_variable_names = feature_variable_names.Clone();
    }
    else // if in random forest, select a random subset of features to split
    {
        // BUG FIX: clamp the subset size to the number of remaining features;
        // previously FeatureSubsetSize > Count indexed past the end of the list
        // and threw ArgumentOutOfRangeException.
        int subset_feature_variable_count = Math.Min(partition.Forest.FeatureSubsetSize, feature_variable_names.Count);
        List <string> temp_names = feature_variable_names.ToList();
        temp_names.Shuffle();
        temp_feature_variable_names = new HashSet <string>();
        for (int i = 0; i < subset_feature_variable_count; ++i)
        {
            temp_feature_variable_names.Add(temp_names[i]);
        }
    }

    // Pick the candidate feature with the highest cost-adjusted information gain.
    double max_information_gain = 0;
    string selected_feature_name = null;
    foreach (string variable_name in temp_feature_variable_names)
    {
        double g = CalcInformationGain(partition, variable_name, handle_cost);
        if (max_information_gain < g)
        {
            max_information_gain = g;
            selected_feature_name = variable_name;
        }
    }
    partition.SplitInformationGain = max_information_gain;

    if (selected_feature_name == null) // no positive gain: the children of this node are leaf nodes
    {
        Dictionary <string, DecisionTreePartition <T> > sub_partitions_by_class_variable = SplitPartitionByClassVariable(partition, handle_cost);
        partition.UpdateSubPartitions(sub_partitions_by_class_variable, DecisionTree <T> .ClassVariableName);
    }
    else
    {
        // If not in a random forest, consume the used feature variable; otherwise keep
        // the full set so each subsequent split draws a fresh random subset.
        if (partition.Forest == null)
        {
            feature_variable_names.Remove(selected_feature_name);
        }

        Dictionary <string, DecisionTreePartition <T> > sub_partitions_by_feature_name = SplitPartitionByFeatureVariable(partition, selected_feature_name, handle_cost);
        partition.UpdateSubPartitions(sub_partitions_by_feature_name, selected_feature_name);

        if (feature_variable_names.Count > 0)
        {
            foreach (DecisionTreePartition <T> sub_partition_by_feature_name in sub_partitions_by_feature_name.Values)
            {
                // Each child branch receives its own copy so sibling splits do not
                // interfere with one another's remaining-feature bookkeeping.
                Split(sub_partition_by_feature_name, feature_variable_names.Clone(), handle_cost);
            }
        }
    }
}