/// <summary>
/// Terminal step of a traversal: records the data point against this node and
/// scores the node's classification against the point's actual class.
/// </summary>
/// <param name="point">The data point that reached this node.</param>
/// <param name="results">Run results that collect the point/node association and counters.</param>
/// <returns>
/// <c>null</c> when this node's classification is the -1 "no classification" marker,
/// otherwise this node.
/// </returns>
public override TreeNode TraverseData(DataPoint point, GeneticAlgorithmRunResults results)
{
    this._traverseCount++;

    // A classification of -1 is the "no classification" route: nothing is recorded.
    // TODO consider returning the node instead of null so callers can still use it.
    if (this.Classification == -1.0)
    {
        return null;
    }

    results.node_assoc.Add(Tuple.Create(point, this));
    results.count_classedData++;

    // Classes come from a codebook, so the stored value is cast to an int.
    // TODO less hacky handling: a point without a classification is treated as class 0.
    int observedClass = point._classification != null
        ? (int)point._classification._value
        : 0;

    this.ProcessResultFromClassification(observedClass, this.Classification);
    return this;
}
/// <summary>
/// Counts the visit, evaluates this node's test against the point and forwards
/// the traversal to the matching child.
/// </summary>
/// <param name="point">The data point being routed through the tree.</param>
/// <param name="results">Run results passed down to the child traversal.</param>
/// <returns>Whatever the chosen child's traversal returns.</returns>
public override TreeNode TraverseData(DataPoint point, GeneticAlgorithmRunResults results)
{
    this._traverseCount++;

    if (this.Test.isTrueTest(point))
    {
        return this._trueNode.TraverseData(point, results);
    }
    else
    {
        return this._falseNode.TraverseData(point, results);
    }
}
/// <summary>
/// Scores every dirty tree against the test points and returns the trees worth keeping.
/// </summary>
/// <param name="trees">Candidate trees; null entries are skipped.</param>
/// <param name="generation">Current generation number (not used by this method).</param>
/// <returns>
/// Trees that were already scored (not dirty), plus re-scored trees that had no previous
/// result or whose metric improved on the previous result.
/// </returns>
private List<Tree> ScoreTreesAndReturnKept(IEnumerable<Tree> trees, int generation)
{
    var keptTrees = new List<Tree>();

    Logger.WriteLine("point count: " + dataPointMgr._pointsToTest.Count);

    foreach (Tree candidate in trees)
    {
        // TODO figure out why null trees (and a NullReferenceException) showed up here.
        if (candidate == null)
        {
            continue;
        }

        // Trees that have not changed since their last scoring are kept as-is.
        if (!candidate._isDirty)
        {
            candidate._source = "pass through";
            keptTrees.Add(candidate);
            continue;
        }

        GeneticAlgorithmRunResults runResults = new GeneticAlgorithmRunResults(this);
        candidate.ProcessDataThroughTree(dataPointMgr, runResults, dataPointMgr._pointsToTest);
        candidate.RemoveZeroCountNodes();
        GeneticOperations.PruneTreeOfUselessNodes(candidate);

        // Keep the tree only when it is new or its score beat the previous run.
        bool isNewOrImproved = candidate._prevResults == null
            || runResults.MetricResult > candidate._prevResults.MetricResult;
        if (isNewOrImproved)
        {
            keptTrees.Add(candidate);
        }

        candidate._isDirty = false;
        candidate._prevResults = runResults;

        // TODO cross validation step is currently disabled: decide when to run it, how many
        // points to use and what to do with the results (consider the impact of node level
        // matrices changing). The disabled version processed every 5th point of
        // dataPointMgr._pointsNotUsedToTest and logged results.AverageLoss / cv_results.AverageLoss.
    }

    return keptTrees;
}
/// <summary>
/// Loads a single tree from an XML file, runs all data points through it and logs the results.
/// </summary>
/// <param name="path">Path of the XML file holding the tree.</param>
public void LoadTreeAndGenerateResults(string path)
{
    // TODO generalize the file name/folder handling.
    Tree tree = Tree.ReadFromXmlFile(path);

    // Every data point is processed through this one tree.
    var results = new GeneticAlgorithmRunResults(ga_mgr);
    tree.ProcessDataThroughTree(data_mgr, results, data_mgr._dataPoints);

    Logger.WriteLine(results);
}
/// <summary>
/// Loads every tree found in a folder, averages their probability predictions for each
/// data point and writes the result to a "submission_&lt;ticks&gt;.csv" file in the
/// current directory.
/// </summary>
/// <param name="folderPath">Folder whose files are each read as an XML tree.</param>
public void GeneratePredictionsForDataWithAllTrees(string folderPath)
{
    List<Tree> treesToTest = new List<Tree>();
    foreach (var file in Directory.GetFiles(folderPath))
    {
        var tree = Tree.ReadFromXmlFile(file);
        treesToTest.Add(tree);
        Debug.WriteLine(tree);
    }

    // Outer loop over data points, inner loop over trees.
    // Each entry holds the point ID and its averaged probability.
    var probs = new List<Tuple<string, double>>();
    foreach (var dataPoint in data_mgr._dataPoints)
    {
        double pred_value = 0.0;
        var results = new GeneticAlgorithmRunResults(ga_mgr);
        int count = 0;

        foreach (var tree in treesToTest)
        {
            // Only traversals that end on a classification node contribute to the average.
            var termNode = tree._root.TraverseData(dataPoint, results) as ClassificationTreeNode;
            if (termNode == null)
            {
                continue;
            }

            pred_value += termNode.ProbPrediction;
            count++;
        }

        if (count == 0)
        {
            // 0.0 / 0 below evaluates to NaN; surface it instead of letting it pass silently.
            Debug.WriteLine("no tree produced a classification for point " + dataPoint._id + "; prediction will be NaN");
        }

        probs.Add(Tuple.Create(dataPoint._id, pred_value / count));
    }

    using (StreamWriter sw = new StreamWriter("submission_" + DateTime.Now.Ticks + ".csv"))
    {
        sw.WriteLine("ID,PredictedProb");
        foreach (var prob in probs)
        {
            // Format with the invariant culture: under a comma-decimal culture the
            // probability would otherwise be written as "0,1234" and break the CSV columns.
            sw.WriteLine(string.Format(
                System.Globalization.CultureInfo.InvariantCulture,
                "{0},{1:0.0000}",
                prob.Item1,
                prob.Item2));
        }
    }
}
/// <summary>
/// Resets per-node statistics, runs every supplied data point through the tree and
/// finalises the run results.
/// </summary>
/// <param name="dataPointMgr">Supplies the class list used to size each node's confusion matrix.</param>
/// <param name="results">Run results that are filled in and then stored as the tree's current results.</param>
/// <param name="dataPoints">The data points to traverse through the tree.</param>
public void ProcessDataThroughTree(
    DataPointManager dataPointMgr,
    GeneticAlgorithmRunResults results,
    IEnumerable<DataPoint> dataPoints)
{
    // Rebuild the node list from the root while clearing each node's counters.
    // TODO clean this up to determine which of the two traverse-count checks is failing.
    _nodes.Clear();

    var pending = new Stack<TreeNode>();
    pending.Push(_root);

    while (pending.Count != 0)
    {
        TreeNode current = pending.Pop();

        // TODO determine why this Add fails at times... seems to be related to root nodes.
        _nodes.Add(current);

        current._traverseCount = 0;
        current.matrix = new ConfusionMatrix(dataPointMgr.classes.Length);

        foreach (var child in current._subNodes)
        {
            pending.Push(child);
        }
    }

    foreach (var sample in dataPoints)
    {
        results.count_allData++;
        TraverseData(sample, results);
    }

    // All points have been traversed: compute the scores and snapshot the tree state.
    results.ProcessScoresAfterTraverse();
    results.tree_nodeCount = _nodes.Count;
    results._matrix = this._root.matrix;

    // Keep the results around for future use.
    _currentResults = results;
}
/// <summary>
/// Searches for the test value that maximises the Gini gain of the node's split and
/// assigns it to the node's test.
/// </summary>
/// <remarks>
/// Every candidate is evaluated by re-processing the test points through the node's tree,
/// so after this call the per-node matrices reflect the last candidate evaluated, not
/// necessarily the best one.
/// </remarks>
/// <param name="node1_copy">Node whose test value is tuned in place.</param>
/// <param name="ga_mgr">Manager supplying the test data points.</param>
/// <returns>
/// <c>true</c> when a best value was found and assigned; <c>false</c> when the test type
/// is not supported or no usable candidate exists (the test value is then left unchanged).
/// </returns>
public static bool OptimizeTest(YesNoMissingTreeNode node1_copy, GeneticAlgorithmManager ga_mgr)
{
    var lessThanTest = node1_copy.Test as LessThanEqualTreeTest;
    if (lessThanTest != null)
    {
        // Candidate thresholds are the midpoints between consecutive distinct values.
        var values = ga_mgr.dataPointMgr._pointsToTest.Select(c => c._data[lessThanTest.param]);
        var all_uniques = values.Where(c => !c._isMissing).Select(c => c._value).Distinct().OrderBy(c => c).ToArray();

        List<double> all_splits = new List<double>();
        for (int i = 1; i < all_uniques.Length; i++)
        {
            all_splits.Add(0.5 * (all_uniques[i] + all_uniques[i - 1]));
        }

        var originalValue = lessThanTest.valTest;
        double best_split = double.NaN;
        double best_purity = double.MinValue;

        // TODO improve this selection for how many split points to consider
        foreach (var split in all_splits.TakeEvery(all_splits.Count / 10 + 1))
        {
            lessThanTest.valTest = split;
            double gini_gain = ComputeSplitGiniGain(node1_copy, ga_mgr);
            if (gini_gain > best_purity)
            {
                best_split = split;
                best_purity = gini_gain;
            }
        }

        if (double.IsNaN(best_split))
        {
            // No candidate (fewer than two distinct values) or every gain was NaN:
            // restore the test instead of leaving it set to NaN.
            lessThanTest.valTest = originalValue;
            return false;
        }

        lessThanTest.valTest = best_split;
        return true;
    }

    var equalTest = node1_copy.Test as EqualTreeTest;
    if (equalTest != null)
    {
        // Candidates are the distinct non-missing values themselves. Materialised once so
        // the query is not re-run while the tree is being re-processed.
        var values = ga_mgr.dataPointMgr._pointsToTest.Select(c => c._data[equalTest._param]);
        List<double> unique_values = values.Where(c => !c._isMissing).Select(c => c._value).Distinct().OrderBy(c => c).ToList();

        IEnumerable<double> all_uniques = unique_values;
        if (unique_values.Count > 10)
        {
            all_uniques = unique_values.TakeEvery(unique_values.Count / 10 + 1);
        }

        var originalValue = equalTest._valTest;
        double best_split = double.NaN;
        double best_purity = double.MinValue;

        // TODO improve this selection for how many split points to consider
        foreach (var split in all_uniques)
        {
            equalTest._valTest = split;
            double gini_gain = ComputeSplitGiniGain(node1_copy, ga_mgr);
            if (gini_gain > best_purity)
            {
                best_split = split;
                best_purity = gini_gain;
            }
        }

        if (double.IsNaN(best_split))
        {
            // Nothing usable was found: keep the test as it was rather than assigning NaN.
            equalTest._valTest = originalValue;
            return false;
        }

        equalTest._valTest = best_split;
        return true;
    }

    return false;
}

// Re-processes the test points through the node's tree using the node's current test
// value and returns the node's impurity minus the count-weighted impurity of its sub nodes.
private static double ComputeSplitGiniGain(YesNoMissingTreeNode node, GeneticAlgorithmManager ga_mgr)
{
    var results = new GeneticAlgorithmRunResults(ga_mgr);
    node._tree.ProcessDataThroughTree(ga_mgr.dataPointMgr, results, ga_mgr.dataPointMgr._pointsToTest);

    var gini_d = node.matrix.GiniImpuritySqrt;

    double gini_split = 0.0;
    int count = 0;
    foreach (var subNode in node._subNodes)
    {
        gini_split += subNode.matrix._count * subNode.matrix.GiniImpuritySqrt;
        count += subNode.matrix._count;
    }

    // When no point reached any sub node this is 0.0 / 0 = NaN; the caller's
    // "gain > best" comparison is then false, so the candidate is ignored.
    gini_split /= count;

    return gini_d - gini_split;
}
/// <summary>
/// Runs a single data point through the tree, starting at the root node.
/// </summary>
/// <param name="point">The data point to traverse with.</param>
/// <param name="results">Run results handed to the root node's traversal.</param>
/// <returns>The value returned by the root node's traversal.</returns>
public TreeNode TraverseData(DataPoint point, GeneticAlgorithmRunResults results)
{
    TreeNode reached = _root.TraverseData(point, results);
    return reached;
}
/// <summary>
/// Routes a data point through this node; each concrete node type supplies the behaviour.
/// </summary>
/// <param name="point">The data point being traversed.</param>
/// <param name="results">Run results object made available to the traversal.</param>
/// <returns>
/// A <see cref="TreeNode"/> produced by the traversal. NOTE(review): the overrides visible
/// nearby return the node the point ended on, or null when that node has no
/// classification - confirm callers handle the null case.
/// </returns>
public abstract TreeNode TraverseData(DataPoint point, GeneticAlgorithmRunResults results);