//TODO: add the ability to test against another value in the data; this would work for the balance scale data
        /// <summary>
        /// Creates a new <see cref="LessThanEqualTreeTest"/> whose <c>param</c>,
        /// <c>valTest</c> and <c>_testCol</c> members are assigned from this instance.
        /// </summary>
        /// <remarks>
        /// The members are assigned as-is; nothing is cloned beyond what plain
        /// assignment does for their types.
        /// </remarks>
        /// <returns>The newly created test, returned as a <see cref="TreeTest"/>.</returns>
        public override TreeTest Copy()
        {
            return new LessThanEqualTreeTest
            {
                param    = this.param,
                valTest  = this.valTest,
                _testCol = this._testCol
            };
        }
Пример #2
0
        /// <summary>
        /// Builds a test for a randomly selected column of <paramref name="dataPointMgr"/>.
        /// A NUMBER column yields a <see cref="LessThanEqualTreeTest"/>; a CATEGORY column
        /// yields an <see cref="EqualTreeTest"/>. In both cases the test value comes from
        /// the column's <c>GetTestValue</c>.
        /// </summary>
        /// <param name="dataPointMgr">Supplies the <c>_columns</c> collection a column is drawn from.</param>
        /// <param name="rando">Random source used to pick the column and passed on to <c>GetTestValue</c>.</param>
        /// <returns>The new test, with its <c>_testCol</c> set to the chosen column.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// The chosen column's <c>_type</c> is neither NUMBER nor CATEGORY.
        /// </exception>
        public static TreeTest TreeTestFactory(DataPointManager dataPointMgr, Random rando)
        {
            //TODO this should take a ga_mgr instead of the parts
            //TODO clean up this mess once the DataColumns quit using the TYPE part

            TreeTest output;

            int        col_param = rando.Next(dataPointMgr._columns.Count);
            DataColumn column    = dataPointMgr._columns[col_param];

            switch (column._type)
            {
            case DataColumn.DataValueTypes.NUMBER:
                LessThanEqualTreeTest test = new LessThanEqualTreeTest();
                test.param   = col_param;
                test.valTest = column.GetTestValue(rando);
                output       = test;
                break;

            case DataColumn.DataValueTypes.CATEGORY:
                // The codebook lookup that used to live here was never used: its results
                // were discarded and the unchecked cast could only add a null dereference.
                //TODO toss a coin to decide between an equality test and a subset test
                EqualTreeTest test_eq = new EqualTreeTest();
                test_eq._param   = col_param;
                test_eq._valTest = column.GetTestValue(rando);
                output           = test_eq;
                break;

            default:
                throw new ArgumentOutOfRangeException(
                          "dataPointMgr",
                          column._type,
                          "Column " + col_param + " has an unsupported data value type.");
            }

            output._testCol = column;
            return(output);
        }
Пример #3
0
        /// <summary>
        /// Searches for a better test value for the test attached to
        /// <paramref name="node1_copy"/>. Candidate values are derived from the non-missing
        /// values found in <c>ga_mgr.dataPointMgr._pointsToTest</c> for the tested column,
        /// and each candidate is scored by the Gini gain it produces at the node.
        /// </summary>
        /// <remarks>
        /// For a <see cref="LessThanEqualTreeTest"/> the candidates are the midpoints between
        /// neighbouring distinct values; for an <see cref="EqualTreeTest"/> they are the
        /// distinct values themselves. Roughly ten candidates at most are evaluated.
        /// Every evaluation re-runs the data through <c>node1_copy._tree</c>.
        /// NOTE(review): on return the tree's matrices appear to reflect the last candidate
        /// evaluated, not necessarily the chosen one - confirm callers re-process the tree.
        /// </remarks>
        /// <param name="node1_copy">Node whose test value is tuned in place.</param>
        /// <param name="ga_mgr">Supplies the data points and is passed on to the tree run.</param>
        /// <returns>
        /// <c>true</c> when a best split was found and stored in the test; <c>false</c> when
        /// the test type is not supported, or when no candidate could be scored (in which
        /// case the test keeps the value it had on entry instead of being set to NaN).
        /// </returns>
        public static bool OptimizeTest(YesNoMissingTreeNode node1_copy, GeneticAlgorithmManager ga_mgr)
        {
            LessThanEqualTreeTest lte_test = node1_copy.Test as LessThanEqualTreeTest;
            if (lte_test != null)
            {
                var values      = ga_mgr.dataPointMgr._pointsToTest.Select(c => c._data[lte_test.param]);
                var all_uniques = values.Where(c => !c._isMissing).Select(c => c._value).Distinct().OrderBy(c => c).ToArray();

                // Candidate thresholds sit halfway between neighbouring distinct values.
                List <double> all_splits = new List <double>();
                for (int i = 1; i < all_uniques.Length; i++)
                {
                    all_splits.Add(0.5 * (all_uniques[i] + all_uniques[i - 1]));
                }

                var original_value = lte_test.valTest;

                //TODO improve this selection for how many split points to consider
                double best_split = FindBestSplitByGiniGain(
                    node1_copy,
                    ga_mgr,
                    all_splits.TakeEvery(all_splits.Count / 10 + 1),
                    split => lte_test.valTest = split);

                if (double.IsNaN(best_split))
                {
                    // Nothing could be scored: keep the caller's threshold rather than storing NaN.
                    lte_test.valTest = original_value;
                    return(false);
                }

                lte_test.valTest = best_split;
                return(true);
            }

            EqualTreeTest eq_test = node1_copy.Test as EqualTreeTest;
            if (eq_test != null)
            {
                var values = ga_mgr.dataPointMgr._pointsToTest.Select(c => c._data[eq_test._param]);

                // Materialised once so the query is not re-run for the count and again for the loop.
                double[] all_uniques = values.Where(c => !c._isMissing).Select(c => c._value).Distinct().OrderBy(c => c).ToArray();

                //TODO improve this selection for how many split points to consider
                IEnumerable <double> candidates = all_uniques;
                if (all_uniques.Length > 10)
                {
                    candidates = candidates.TakeEvery(all_uniques.Length / 10 + 1);
                }

                var original_value = eq_test._valTest;

                double best_split = FindBestSplitByGiniGain(
                    node1_copy,
                    ga_mgr,
                    candidates,
                    split => eq_test._valTest = split);

                if (double.IsNaN(best_split))
                {
                    // Nothing could be scored: keep the caller's value rather than storing NaN.
                    eq_test._valTest = original_value;
                    return(false);
                }

                eq_test._valTest = best_split;
                return(true);
            }

            return(false);
        }

        /// <summary>
        /// Applies each candidate through <paramref name="applySplit"/>, re-runs the data
        /// through the node's tree, and returns the candidate with the largest Gini gain
        /// (node impurity minus the count-weighted impurity of its sub-nodes).
        /// </summary>
        /// <returns>The best candidate, or <c>double.NaN</c> when none could be scored.</returns>
        private static double FindBestSplitByGiniGain(YesNoMissingTreeNode node, GeneticAlgorithmManager ga_mgr, IEnumerable <double> candidates, System.Action <double> applySplit)
        {
            double best_split  = double.NaN;
            double best_purity = double.MinValue;

            foreach (var split in candidates)
            {
                //change the test value and find the best purity
                applySplit(split);

                var results = new GeneticAlgorithmRunResults(ga_mgr);
                node._tree.ProcessDataThroughTree(ga_mgr.dataPointMgr, results, ga_mgr.dataPointMgr._pointsToTest);

                //check the result of the split
                var gini_d = node.matrix.GiniImpuritySqrt;

                double gini_split = 0.0;
                int    count      = 0;

                foreach (var child in node._subNodes)
                {
                    gini_split += child.matrix._count * child.matrix.GiniImpuritySqrt;
                    count      += child.matrix._count;
                }

                if (count == 0)
                {
                    // No points counted in the sub-nodes: the weighted impurity would be 0/0.
                    continue;
                }

                gini_split /= count;

                double gini_gain = gini_d - gini_split;

                if (gini_gain > best_purity)
                {
                    best_split  = split;
                    best_purity = gini_gain;
                }
            }

            return(best_split);
        }