/// <summary>
/// Builds one randomly initialised node, gives it a fresh confusion matrix and
/// registers it (without children) in <paramref name="tree"/>.
/// </summary>
/// <param name="ga_mgr">Supplies the random source, the GA options and the class list.</param>
/// <param name="ShouldForceTerminal">When true a <see cref="ClassificationTreeNode"/> is always produced.</param>
/// <param name="tree">Tree that receives the node; its current node count also caps growth.</param>
/// <returns>The newly created node.</returns>
public static TreeNode TreeNodeFactory(GeneticAlgorithmManager ga_mgr, bool ShouldForceTerminal, Tree tree)
{
    // The random draw always happens first; the remaining conditions short-circuit.
    // NOTE(review): with '>' a terminal is drawn when the sample exceeds prob_node_terminal,
    // which looks inverted relative to the option's name - confirm the intended meaning.
    //TODO: consider changing this or using some other scheme to prevent runaway initial trees.
    bool makeTerminal =
        ga_mgr.rando.NextDouble() > ga_mgr._gaOptions.prob_node_terminal
        || ShouldForceTerminal
        || tree._nodes.Count > ga_mgr._gaOptions.max_node_count_for_new_tree;

    TreeNode created;
    if (!makeTerminal)
    {
        var decision = new YesNoMissingTreeNode();
        decision.CreateRandom(ga_mgr);
        created = decision;
    }
    else
    {
        var leaf = new ClassificationTreeNode();
        leaf.CreateRandom(ga_mgr);
        created = leaf;
    }

    //TODO there might be a better place for this
    created.matrix = new ConfusionMatrix(ga_mgr.dataPointMgr.classes.Length);
    tree.AddNodeWithoutChildren(created);
    return created;
}
/// <summary>
/// Creates a new <see cref="YesNoMissingTreeNode"/> carrying a copy of this node's test.
/// Child and parent links are not set on the returned node.
/// </summary>
/// <returns>The unlinked copy.</returns>
public override TreeNode CopyNonLinkingData()
{
    return new YesNoMissingTreeNode
    {
        Test = this.Test.Copy()
    };
}
/// <summary>
/// Recursively copies this node and its true/false/missing children, wiring the
/// copied children to the copied node and vice versa.
/// </summary>
/// <returns>The copy of this node, with all three child links and their parent links set.</returns>
public override TreeNode ReturnFullyLinkedCopyOfSelf()
{
    // The cast is safe: CopyNonLinkingData on this type always builds a YesNoMissingTreeNode.
    var clone = (YesNoMissingTreeNode)this.CopyNonLinkingData();

    // Copy the children first, in true / false / missing order.
    TreeNode trueClone = _trueNode.ReturnFullyLinkedCopyOfSelf();
    TreeNode falseClone = _falseNode.ReturnFullyLinkedCopyOfSelf();
    TreeNode missingClone = _missingNode.ReturnFullyLinkedCopyOfSelf();

    // Then link each child in both directions.
    clone._trueNode = trueClone;
    trueClone._parent = clone;

    clone._falseNode = falseClone;
    falseClone._parent = clone;

    clone._missingNode = missingClone;
    missingClone._parent = clone;

    return clone;
}
/// <summary>
/// Replaces a classification node with a new randomly created decision node that has
/// three classification children (true / false / missing), then attempts to optimise
/// the new node's test.
/// </summary>
/// <param name="nodeToReplace">The classification node being swapped out.</param>
/// <param name="ga_mgr">Supplies randomness and the data used during optimisation.</param>
/// <returns>The new decision node that took the place of <paramref name="nodeToReplace"/>.</returns>
public static TreeNode SplitClassificationNode(ClassificationTreeNode nodeToReplace, GeneticAlgorithmManager ga_mgr)
{
    //TODO unhardcode this items
    const double placeholderClass = 0.0;

    var decision = new YesNoMissingTreeNode();
    decision.CreateRandom(ga_mgr);

    // Every branch starts out as a leaf carrying the placeholder class.
    decision._trueNode = new ClassificationTreeNode { Classification = placeholderClass, _parent = decision };
    decision._falseNode = new ClassificationTreeNode { Classification = placeholderClass, _parent = decision };
    decision._missingNode = new ClassificationTreeNode { Classification = placeholderClass, _parent = decision };

    ReplaceOneNodeWithAnother(nodeToReplace, decision);

    // Try to optimise the node; whether it succeeded is not used here.
    OptimizeTest(decision, ga_mgr);

    return decision;
}
/// <summary>
/// Copies a randomly chosen tree, picks one of its decision nodes whose test is a
/// <see cref="LessThanEqualTreeTest"/> (uniform weights) and tries to optimise that
/// node's test value.
/// </summary>
/// <param name="ga_mgr">Supplies the random source and the data used by the optimisation.</param>
/// <param name="treesInPopulation">Population to draw the source tree from.</param>
/// <returns>
/// The modified copy (with <c>_source</c> set to "optimize value") when the optimisation
/// succeeds; otherwise an empty sequence. An empty population also yields nothing.
/// </returns>
public static IEnumerable<Tree> OptimizeSplitForNode(GeneticAlgorithmManager ga_mgr, List<Tree> treesInPopulation)
{
    // Without this guard rando.Next(0) returns 0 and indexing the empty list throws.
    if (treesInPopulation.Count == 0)
    {
        yield break;
    }

    //pick a random tree and work on a copy so the population member is left alone
    Random rando = ga_mgr.rando;
    Tree tree1_copy = treesInPopulation[rando.Next(treesInPopulation.Count)].Copy();

    // Materialise once: the candidates are checked for emptiness and then handed to the selector.
    var candidates = tree1_copy.GetNodesOfType<YesNoMissingTreeNode>()
        .Where(c => c.Test is LessThanEqualTreeTest)
        .ToList();

    if (candidates.Count == 0)
    {
        yield break;
    }

    var node_picker = new WeightedSelector<YesNoMissingTreeNode>(
        candidates.Select(c => Tuple.Create(c, 1.0))
    );

    YesNoMissingTreeNode node1_copy = node_picker.PickRandom(rando);
    if (node1_copy == null)
    {
        yield break;
    }

    //iterate through all values for the node and use the one with the best impurity
    if (OptimizeTest(node1_copy, ga_mgr))
    {
        tree1_copy._source = "optimize value";
        yield return tree1_copy;
    }
}
/// <summary>
/// Picks a classification node from a random tree (weighted by traverse count) and, on a
/// copy of that tree, replaces it with a random decision node that has three
/// classification children; the new node's test is then optimised.
/// </summary>
/// <param name="ga_mgr">Supplies the random source and the data used by the optimisation.</param>
/// <param name="treesInPopulation">Population to draw the source tree from.</param>
/// <returns>
/// The modified tree copy, with <c>_source</c> set to "node split" (plus " w/ op" when the
/// optimisation succeeded). Nothing is yielded when the population is empty, no node could
/// be picked, the picked node has no confusion matrix, or the matrix has fewer than two rows.
/// </returns>
public static IEnumerable<Tree> SplitNodeAndOptimizeTests(GeneticAlgorithmManager ga_mgr, List<Tree> treesInPopulation)
{
    // Without this guard rando.Next(0) returns 0 and indexing the empty list throws.
    if (treesInPopulation.Count == 0)
    {
        yield break;
    }

    //find and grab a tree
    Tree tree1 = treesInPopulation[ga_mgr.rando.Next(treesInPopulation.Count)];

    //uses the traversal count for selecting
    var node_picker = new WeightedSelector<ClassificationTreeNode>(
        tree1.GetNodesOfType<ClassificationTreeNode>().Select(c => Tuple.Create(c, (double)c._traverseCount))
    );

    ClassificationTreeNode node1 = node_picker.PickRandom(ga_mgr.rando);

    tree1.SetStructuralLocationsForNodes();

    // The selector may hand back nothing, and a node without a matrix cannot be inspected;
    // previously both cases fell through to a null dereference.
    if (node1 == null || node1.matrix == null)
    {
        yield break;
    }

    var matrix_rows = node1.matrix.GetRowsOrderedByCount().ToList();

    //trap is here in case there are fewer than 2 "top" rows to split on
    if (matrix_rows.Count < 2)
    {
        yield break;
    }

    //TODO improve this structural business to be cleaner and more obvious if it belongs to the Node or Tree
    Tree tree1_copy = tree1.Copy();
    TreeNode node1_copy = tree1_copy.GetNodeAtStructualLocation(node1._structuralLocation);

    //create a random test for the current node (change from classification to decision)
    //TODO create a proper factory for this code
    var node1_decision = new YesNoMissingTreeNode();
    node1_decision.CreateRandom(ga_mgr);
    node1_decision._parent = node1_copy._parent;

    // The true branch keeps the class of the node being split; the false and missing
    // branches currently get a hard-coded class of 0.
    var item1 = node1.Classification;

    var node1a_class = new ClassificationTreeNode();
    node1a_class.Classification = item1;
    node1a_class._parent = node1_decision;

    var node1b_class = new ClassificationTreeNode();
    node1b_class.Classification = 0;
    node1b_class._parent = node1_decision;

    var node1c_class = new ClassificationTreeNode();
    node1c_class.Classification = 0;
    node1c_class._parent = node1_decision;

    node1_decision._trueNode = node1a_class;
    node1_decision._falseNode = node1b_class;
    node1_decision._missingNode = node1c_class;

    //TODO create a "replace node" operation to standardize this code
    tree1_copy.RemoveNodeWithChildren(node1_copy);
    node1_copy.UpdateParentReference(node1_decision);
    tree1_copy.AddNodeWithChildren(node1_decision);

    //try to optimize the node
    bool opTest = OptimizeTest(node1_decision, ga_mgr);

    tree1_copy._source = "node split";
    if (opTest)
    {
        tree1_copy._source += " w/ op";
    }

    //return the new tree with that change
    yield return tree1_copy;
}
/// <summary>
/// Searches for the test value of <paramref name="node1_copy"/> that gives the largest
/// Gini gain (node impurity minus the count-weighted impurity of its sub-nodes) over
/// <c>ga_mgr.dataPointMgr._pointsToTest</c>, and stores it on the node's test.
/// </summary>
/// <param name="node1_copy">Decision node whose test value is tuned in place.</param>
/// <param name="ga_mgr">Supplies the data points and is used to run them through the node's tree.</param>
/// <returns>
/// True when a best value was found and applied. False when the test is neither a
/// <see cref="LessThanEqualTreeTest"/> nor an <see cref="EqualTreeTest"/>, or when no candidate
/// produced a comparable gain; in the latter case the original test value is restored
/// (previously it was overwritten with NaN and true was returned).
/// </returns>
/// <remarks>
/// NOTE(review): the tree is re-processed once per candidate, so afterwards the confusion
/// matrices reflect the last candidate evaluated, not necessarily the chosen one.
/// </remarks>
public static bool OptimizeTest(YesNoMissingTreeNode node1_copy, GeneticAlgorithmManager ga_mgr)
{
    double best_split;

    LessThanEqualTreeTest lessThanTest = node1_copy.Test as LessThanEqualTreeTest;
    if (lessThanTest != null)
    {
        var originalValue = lessThanTest.valTest;

        // Sorted distinct non-missing values of the tested parameter.
        var all_uniques = ga_mgr.dataPointMgr._pointsToTest
            .Select(c => c._data[lessThanTest.param])
            .Where(c => !c._isMissing)
            .Select(c => c._value)
            .Distinct()
            .OrderBy(c => c)
            .ToArray();

        // Candidate thresholds are the midpoints between neighbouring values.
        List<double> all_splits = new List<double>();
        for (int i = 1; i < all_uniques.Length; i++)
        {
            all_splits.Add(0.5 * (all_uniques[i] + all_uniques[i - 1]));
        }

        //TODO improve this selection for how many split points to consider
        var candidates = all_splits.TakeEvery(all_splits.Count / 10 + 1);

        if (!FindBestSplitByGiniGain(node1_copy, ga_mgr, candidates, v => lessThanTest.valTest = v, out best_split))
        {
            lessThanTest.valTest = originalValue;
            return false;
        }

        lessThanTest.valTest = best_split;
        return true;
    }

    EqualTreeTest equalTest = node1_copy.Test as EqualTreeTest;
    if (equalTest != null)
    {
        var originalValue = equalTest._valTest;

        // Materialised once; it is counted and then iterated.
        List<double> all_uniques = ga_mgr.dataPointMgr._pointsToTest
            .Select(c => c._data[equalTest._param])
            .Where(c => !c._isMissing)
            .Select(c => c._value)
            .Distinct()
            .OrderBy(c => c)
            .ToList();

        //TODO improve this selection for how many split points to consider
        IEnumerable<double> candidates = all_uniques;
        if (all_uniques.Count > 10)
        {
            candidates = all_uniques.TakeEvery(all_uniques.Count / 10 + 1);
        }

        if (!FindBestSplitByGiniGain(node1_copy, ga_mgr, candidates, v => equalTest._valTest = v, out best_split))
        {
            equalTest._valTest = originalValue;
            return false;
        }

        equalTest._valTest = best_split;
        return true;
    }

    return false;
}

/// <summary>
/// Applies each candidate through <paramref name="applySplit"/>, measures the resulting
/// Gini gain and reports the candidate with the largest gain.
/// </summary>
/// <returns>False when there were no candidates or none produced a gain above double.MinValue (e.g. every gain was NaN).</returns>
private static bool FindBestSplitByGiniGain(YesNoMissingTreeNode node, GeneticAlgorithmManager ga_mgr, IEnumerable<double> candidates, Action<double> applySplit, out double bestSplit)
{
    bestSplit = double.NaN;
    double bestGain = double.MinValue;
    bool found = false;

    foreach (double candidate in candidates)
    {
        //change the test value and find the best purity
        applySplit(candidate);
        double gain = MeasureGiniGainOfCurrentSplit(node, ga_mgr);

        // A NaN gain never compares greater, so such candidates are skipped.
        if (gain > bestGain)
        {
            bestSplit = candidate;
            bestGain = gain;
            found = true;
        }
    }

    return found;
}

/// <summary>
/// Runs the test points through the node's tree and returns the node's impurity minus the
/// count-weighted average impurity of its sub-nodes.
/// </summary>
/// <returns>The gain, or NaN when the sub-nodes' matrix counts sum to zero.</returns>
private static double MeasureGiniGainOfCurrentSplit(YesNoMissingTreeNode node, GeneticAlgorithmManager ga_mgr)
{
    var results = new GeneticAlgorithmRunResults(ga_mgr);
    node._tree.ProcessDataThroughTree(ga_mgr.dataPointMgr, results, ga_mgr.dataPointMgr._pointsToTest);

    //check the result of the split
    var gini_d = node.matrix.GiniImpuritySqrt;

    double gini_split = 0.0;
    int count = 0;
    foreach (var child in node._subNodes)
    {
        gini_split += child.matrix._count * child.matrix.GiniImpuritySqrt;
        count += child.matrix._count;
    }

    if (count == 0)
    {
        // Same value the 0/0 division would give; stated explicitly for readers.
        return double.NaN;
    }

    gini_split /= count;
    return gini_d - gini_split;
}