/// <summary>
/// Terminal step of a traversal: records the data point against this node
/// unless the node carries the "no classification" marker.
/// </summary>
/// <param name="point">Data point that reached this node.</param>
/// <param name="results">Run results that collect the node association and the classified-point count.</param>
/// <returns>This node when the point was classified; null when <c>Classification</c> is -1.</returns>
public override TreeNode TraverseData(DataPoint point, GeneticAlgorithmRunResults results)
        {
            this._traverseCount++;

            // A classification of -1 is the "no classification" route for now.
            // TODO consider returning the node instead of null so that callers can use it.
            if (this.Classification == -1.0)
            {
                return null;
            }

            results.node_assoc.Add(Tuple.Create(point, this));
            results.count_classedData++;

            // Class values are known to be ints since they are classes from a Codebook.
            // TODO fix this to be less hacky: a point without a class is currently treated as class 0.
            int observedClass = point._classification != null
                ? (int)point._classification._value
                : 0;

            this.ProcessResultFromClassification(observedClass, this.Classification);

            return this;
        }
Esempio n. 2
0
        /// <summary>
        /// Counts the visit, evaluates this node's test against the point and
        /// forwards the traversal to the matching child.
        /// </summary>
        /// <param name="point">Data point being routed.</param>
        /// <param name="results">Run results passed on to the chosen child.</param>
        /// <returns>Whatever the chosen child's traversal returns.</returns>
        public override TreeNode TraverseData(DataPoint point, GeneticAlgorithmRunResults results)
        {
            this._traverseCount++;

            if (this.Test.isTrueTest(point))
            {
                return this._trueNode.TraverseData(point, results);
            }

            return this._falseNode.TraverseData(point, results);
        }
Esempio n. 3
0
        /// <summary>
        /// Scores every dirty tree against the current test points and collects
        /// the trees worth keeping.
        /// </summary>
        /// <param name="trees">Candidate trees; null entries are skipped.</param>
        /// <param name="generation">Generation number (not used by this method).</param>
        /// <returns>
        /// Trees that were already clean, plus freshly scored trees that either
        /// had no previous result or improved on their previous metric.
        /// </returns>
        private List <Tree> ScoreTreesAndReturnKept(IEnumerable <Tree> trees, int generation)
        {
            var survivors = new List <Tree>();

            Logger.WriteLine("point count: " + dataPointMgr._pointsToTest.Count);

            foreach (Tree candidate in trees)
            {
                //TODO figure out why a NullRefExc came through here
                if (candidate == null)
                {
                    continue;
                }

                // Trees that were already processed pass straight through.
                if (!candidate._isDirty)
                {
                    candidate._source = "pass through";
                    survivors.Add(candidate);
                    continue;
                }

                var runResults = new GeneticAlgorithmRunResults(this);
                candidate.ProcessDataThroughTree(dataPointMgr, runResults, dataPointMgr._pointsToTest);

                candidate.RemoveZeroCountNodes();
                GeneticOperations.PruneTreeOfUselessNodes(candidate);

                // Keep the tree only when there is no earlier score to compare against
                // or when the new score beats the earlier one.
                bool isFirstScore = candidate._prevResults == null;
                if (isFirstScore || runResults.MetricResult > candidate._prevResults.MetricResult)
                {
                    survivors.Add(candidate);
                }

                candidate._isDirty     = false;
                candidate._prevResults = runResults;

                //TODO determine when to do the cross validation (CV) step, how many points to use,
                //and what to do with the results; consider the impact of node level matrices
                //changing before bringing a CV pass back here.
            }

            return survivors;
        }
Esempio n. 4
0
        /// <summary>
        /// Reads one tree from an XML file, runs every data point held by the
        /// data manager through it and logs the resulting run results.
        /// </summary>
        /// <param name="path">Path of the XML file describing the tree.</param>
        public void LoadTreeAndGenerateResults(string path)
        {
            //TODO generalize this file name/folder
            Tree loadedTree = Tree.ReadFromXmlFile(path);
            GeneticAlgorithmRunResults runResults = new GeneticAlgorithmRunResults(ga_mgr);

            // Process all of the data through the single loaded tree.
            loadedTree.ProcessDataThroughTree(data_mgr, runResults, data_mgr._dataPoints);

            // For now the results are only logged.
            Logger.WriteLine(runResults);
        }
Esempio n. 5
0
        /// <summary>
        /// Loads every tree file found in <paramref name="folderPath"/>, averages the
        /// probability predictions of all trees for each data point and writes the
        /// result to a "submission_&lt;ticks&gt;.csv" file in the current directory.
        /// </summary>
        /// <param name="folderPath">Folder whose files are each read as an XML tree.</param>
        /// <remarks>
        /// Rows are formatted with the invariant culture so that the decimal separator
        /// is always '.' and can never collide with the CSV column separator.
        /// </remarks>
        public void GeneratePredictionsForDataWithAllTrees(string folderPath)
        {
            List <Tree> treesToTest = new List <Tree>();

            foreach (var file in Directory.GetFiles(folderPath))
            {
                var tree = Tree.ReadFromXmlFile(file);
                treesToTest.Add(tree);
                Debug.WriteLine(tree);
            }

            // One (ID, averaged probability) pair per data point.
            var probs = new List <Tuple <string, double> >();

            foreach (var dataPoint in data_mgr._dataPoints)
            {
                double pred_value = 0.0;
                var    results    = new GeneticAlgorithmRunResults(ga_mgr);
                int    count      = 0;

                foreach (var tree in treesToTest)
                {
                    // Only traversals that end in a classification node contribute a prediction.
                    ClassificationTreeNode termNode =
                        tree._root.TraverseData(dataPoint, results) as ClassificationTreeNode;

                    if (termNode == null)
                    {
                        continue;
                    }

                    pred_value += termNode.ProbPrediction;
                    count++;
                }

                // When no tree classified the point this is 0.0 / 0, i.e. NaN, which is
                // written to the file as "NaN" rather than being silently replaced.
                probs.Add(Tuple.Create(dataPoint._id, pred_value / count));
            }

            using (StreamWriter sw = new StreamWriter("submission_" + DateTime.Now.Ticks + ".csv"))
            {
                sw.WriteLine("ID,PredictedProb");
                foreach (var prob in probs)
                {
                    // StreamWriter.WriteLine(format, ...) would use the current culture; a
                    // comma-decimal culture would then emit "0,1234" and corrupt the CSV row.
                    sw.WriteLine(string.Format(
                        System.Globalization.CultureInfo.InvariantCulture,
                        "{0},{1:0.0000}",
                        prob.Item1,
                        prob.Item2));
                }
            }
        }
Esempio n. 6
0
        /// <summary>
        /// Rebuilds the node list, resets every node's traverse count and confusion
        /// matrix, then sends each data point through the tree and finalizes the scores.
        /// </summary>
        /// <param name="dataPointMgr">Data manager; its class list sizes each node's confusion matrix.</param>
        /// <param name="results">Run results that are filled in and then stored as the current results.</param>
        /// <param name="dataPoints">Data points to traverse.</param>
        public void ProcessDataThroughTree(
            DataPointManager dataPointMgr,
            GeneticAlgorithmRunResults results,
            IEnumerable <DataPoint> dataPoints)
        {
            //TODO clean this up to determine which one is failing
            _nodes.Clear();

            // Depth-first walk from the root that re-registers and resets every node.
            var pending = new Stack <TreeNode>();
            pending.Push(_root);

            while (pending.Count != 0)
            {
                TreeNode current = pending.Pop();

                //TODO determine why this line will fail at times... seems to be related to root nodes
                _nodes.Add(current);
                current._traverseCount = 0;
                current.matrix         = new ConfusionMatrix(dataPointMgr.classes.Length);

                foreach (var child in current._subNodes)
                {
                    pending.Push(child);
                }
            }

            foreach (var point in dataPoints)
            {
                results.count_allData++;
                TraverseData(point, results);
            }

            // All points have been traversed; turn the collected data into a score.
            results.ProcessScoresAfterTraverse();

            results.tree_nodeCount = _nodes.Count;
            results._matrix        = this._root.matrix;

            // Keep the results around for future use.
            _currentResults = results;
        }
Esempio n. 7
0
        /// <summary>
        /// Searches for the test value of <paramref name="node1_copy"/> that yields the
        /// largest impurity gain over the current test points and stores it in the test.
        /// Only <c>LessThanEqualTreeTest</c> and <c>EqualTreeTest</c> are supported.
        /// </summary>
        /// <param name="node1_copy">Node whose test value is tuned; its tree is re-processed once per candidate.</param>
        /// <param name="ga_mgr">Manager that supplies the data points and seeds the run results.</param>
        /// <returns>
        /// True when a best candidate was found and assigned. False when the test type is
        /// unsupported, or when no candidate produced a comparable gain; in the latter
        /// case the test value that was present on entry is restored instead of being
        /// overwritten with NaN.
        /// </returns>
        /// <remarks>
        /// NOTE(review): the node matrices left behind reflect the last candidate that was
        /// evaluated, not necessarily the value finally assigned — confirm that callers
        /// re-process the tree before relying on them.
        /// </remarks>
        public static bool OptimizeTest(YesNoMissingTreeNode node1_copy, GeneticAlgorithmManager ga_mgr)
        {
            LessThanEqualTreeTest lessEqualTest = node1_copy.Test as LessThanEqualTreeTest;
            if (lessEqualTest != null)
            {
                double originalValue = lessEqualTest.valTest;

                // Distinct, sorted, non-missing values of the tested parameter.
                var values      = ga_mgr.dataPointMgr._pointsToTest.Select(c => c._data[lessEqualTest.param]);
                var all_uniques = values.Where(c => !c._isMissing).Select(c => c._value).Distinct().OrderBy(c => c).ToArray();

                // Candidate thresholds are the midpoints between neighbouring values.
                List <double> all_splits = new List <double>();
                for (int i = 1; i < all_uniques.Length; i++)
                {
                    all_splits.Add(0.5 * (all_uniques[i] + all_uniques[i - 1]));
                }

                double best_split  = double.NaN;
                double best_purity = double.MinValue;

                //TODO improve this selection for how many split points to consider
                foreach (var split in all_splits.TakeEvery(all_splits.Count / 10 + 1))
                {
                    lessEqualTest.valTest = split;

                    double gini_gain = MeasureGiniGain(node1_copy, ga_mgr);
                    if (gini_gain > best_purity)
                    {
                        best_split  = split;
                        best_purity = gini_gain;
                    }
                }

                if (double.IsNaN(best_split))
                {
                    // Nothing usable was found (e.g. fewer than two distinct values):
                    // leave the test exactly as it was handed in.
                    lessEqualTest.valTest = originalValue;
                    return false;
                }

                lessEqualTest.valTest = best_split;
                return true;
            }

            EqualTreeTest equalTest = node1_copy.Test as EqualTreeTest;
            if (equalTest != null)
            {
                double originalValue = equalTest._valTest;

                // Materialize once: the query is needed for both the count and the loop,
                // and a deferred query would re-scan all data points each time.
                var values = ga_mgr.dataPointMgr._pointsToTest.Select(c => c._data[equalTest._param]);
                double[] all_uniques = values.Where(c => !c._isMissing).Select(c => c._value).Distinct().OrderBy(c => c).ToArray();

                IEnumerable <double> candidates = all_uniques;
                if (all_uniques.Length > 10)
                {
                    candidates = candidates.TakeEvery(all_uniques.Length / 10 + 1);
                }

                double best_split  = double.NaN;
                double best_purity = double.MinValue;

                //TODO improve this selection for how many split points to consider
                foreach (var split in candidates)
                {
                    equalTest._valTest = split;

                    double gini_gain = MeasureGiniGain(node1_copy, ga_mgr);
                    if (gini_gain > best_purity)
                    {
                        best_split  = split;
                        best_purity = gini_gain;
                    }
                }

                if (double.IsNaN(best_split))
                {
                    // No candidate value was available or comparable: restore the input state.
                    equalTest._valTest = originalValue;
                    return false;
                }

                equalTest._valTest = best_split;
                return true;
            }

            return false;
        }

        /// <summary>
        /// Re-processes the node's tree with the node's current test value and returns
        /// the node's impurity minus the count-weighted impurity of its sub nodes.
        /// </summary>
        private static double MeasureGiniGain(YesNoMissingTreeNode node, GeneticAlgorithmManager ga_mgr)
        {
            var results = new GeneticAlgorithmRunResults(ga_mgr);
            node._tree.ProcessDataThroughTree(ga_mgr.dataPointMgr, results, ga_mgr.dataPointMgr._pointsToTest);

            var gini_d = node.matrix.GiniImpuritySqrt;

            double gini_split = 0.0;
            int    count      = 0;

            foreach (var child in node._subNodes)
            {
                gini_split += child.matrix._count * child.matrix.GiniImpuritySqrt;
                count      += child.matrix._count;
            }

            // With a zero count this yields NaN; the callers' "greater than" comparison
            // is false for NaN, so such a candidate is never selected.
            gini_split /= count;

            return gini_d - gini_split;
        }
Esempio n. 8
0
 /// <summary>
 /// Starts a traversal for <paramref name="point"/> at the root node.
 /// </summary>
 /// <param name="point">Data point to traverse.</param>
 /// <param name="results">Run results passed through to the root node.</param>
 /// <returns>The value returned by the root node's traversal.</returns>
 public TreeNode TraverseData(DataPoint point, GeneticAlgorithmRunResults results)
 {
     TreeNode reached = _root.TraverseData(point, results);

     return reached;
 }
Esempio n. 9
0
 /// <summary>
 /// Handles a data point arriving at this node; each concrete node type supplies the behavior.
 /// </summary>
 /// <param name="point">The data point being traversed.</param>
 /// <param name="results">Run results object made available to the traversal.</param>
 /// <returns>
 /// A <see cref="TreeNode"/>. NOTE(review): presumably the node at which the traversal
 /// ended, or null when the point was not classified — confirm against the overrides.
 /// </returns>
 public abstract TreeNode TraverseData(DataPoint point, GeneticAlgorithmRunResults results);