Beispiel #1
0
        /// <summary>
        /// Computes the variance and bias of a collection of tree ensembles, averaged over every
        /// data point in <paramref name="trainingData"/>.
        /// </summary>
        /// <remarks>
        /// For each data point, one prediction is collected per entry of
        /// <paramref name="listOfTreesToRunTestOn"/>. The per-point bias is the squared difference
        /// between the real output and the average of those predictions. The per-point variance is
        /// the mean squared difference between each prediction and the value returned by
        /// <c>ModeFinder.FindMode</c> for those predictions.
        /// </remarks>
        /// <param name="trainingData">Parsed data set whose <c>Values</c> are evaluated.</param>
        /// <param name="listOfTreesToRunTestOn">One list of trees per ensemble; each list yields one prediction per data point.</param>
        /// <param name="bias">Receives the bias averaged over all data points.</param>
        /// <returns>The variance averaged over all data points.</returns>
        public static double CalculateBiasAndVariance(ParserResults trainingData, List<List<DecisionTreeLevel>> listOfTreesToRunTestOn, out double bias)
        {
            double varianceTotal = 0;
            bias = 0;

            // Accumulate the per-point variance and bias across the whole data set
            foreach (var sample in trainingData.Values)
            {
                double actual = Transformer.BoolToDouble(sample.Output);

                // One prediction per ensemble, kept in ensemble order
                List<double> votes = listOfTreesToRunTestOn
                    .Select(ensemble => Transformer.BoolToDouble(DecisionTreeScorer.CalculatePrediction(ensemble, sample)))
                    .ToList();

                double mainPrediction = ModeFinder.FindMode(votes);
                double meanPrediction = votes.Average();

                // Variance for this point: mean squared distance of each prediction from the mode
                double squaredSpread = 0;
                for (int i = 0; i < votes.Count; i++)
                {
                    double deviation = votes[i] - mainPrediction;
                    squaredSpread += deviation * deviation;
                }

                // Bias for this point: squared distance of the real value from the average prediction
                double error = actual - meanPrediction;

                varianceTotal += squaredSpread / votes.Count;
                bias += error * error;
            }

            // Turn the running totals into per-data-point averages
            int sampleCount = trainingData.Values.Count;
            bias = bias / sampleCount;
            return varianceTotal / sampleCount;
        }
Beispiel #2
0
        /// <summary>
        /// Entry point: trains groups of decision trees on resampled copies of the training data
        /// and prints the score of each group against the test data set.
        /// </summary>
        /// <remarks>
        /// The run is aborted with an explanatory message when either <c>DataSetPath</c> or
        /// <c>TestSetPath</c> does not exist.
        /// </remarks>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string errorMessage = "";

            // In C# interpolation the placeholder is "{expr}". A leading '$' inside the literal is an
            // ordinary character and would be printed verbatim in front of the path/variable name.
            if (!File.Exists(DataSetPath))
            {
                errorMessage += $"Failed to find file {DataSetPath} - please update variable {nameof(DataSetPath)} or create that file.\n";
            }
            if (!File.Exists(TestSetPath))
            {
                errorMessage += $"Failed to find file {TestSetPath} - please update variable {nameof(TestSetPath)} or create that file.\n";
            }

            if (errorMessage != "")
            {
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine("Not all files available - not running!");
                Console.WriteLine(errorMessage);
                Console.ResetColor();
                Console.WriteLine("Press any key to continue...");
                Console.ReadKey();
                return;
            }

            // A single Random instance is shared by every resampling call below
            Random rnd = new Random();

            Console.WriteLine("Reading training data...");
            ParserResults trainingData = ParserUtils.ParseData(DataSetPath);

            Console.WriteLine("Validating data set");
            DataSetCleaner.ValidateDataSet(trainingData.Attributes, trainingData.Values);

            // One group of resampled data sets per requested count (1, 3, 5, 10 and 20)
            List<List<List<DataSetValue>>> dataSetValuesForBagging = new List<List<List<DataSetValue>>>()
            {
                Bagging.ProduceDifferentDataSets(trainingData.Values, 1, rnd),
                Bagging.ProduceDifferentDataSets(trainingData.Values, 3, rnd),
                Bagging.ProduceDifferentDataSets(trainingData.Values, 5, rnd),
                Bagging.ProduceDifferentDataSets(trainingData.Values, 10, rnd),
                Bagging.ProduceDifferentDataSets(trainingData.Values, 20, rnd),
            };

            // Initialize the required trees: one tree per resampled data set, grouped like the data sets
            List<List<DecisionTreeLevel>> listOfTreesToRunTestOn = new List<List<DecisionTreeLevel>>();

            foreach (var dataSetForBagging in dataSetValuesForBagging)
            {
                listOfTreesToRunTestOn.Add(dataSetForBagging.Select(x => new DecisionTreeLevel(ChiTestLimit, trainingData.Attributes, x)).ToList());
            }

            // Every tree is built and trimmed independently, so the groups are flattened for parallel processing
            Console.WriteLine("Runnind D3 on all trees in parallel...");
            Parallel.ForEach(listOfTreesToRunTestOn.SelectMany(s => s), l => l.D3());

            Console.WriteLine("Deleting unecessary nodes...");
            Parallel.ForEach(listOfTreesToRunTestOn.SelectMany(s => s), l => l.TrimTree());

            Console.WriteLine("Getting test data set...");
            ParserResults testData = ParserUtils.ParseData(TestSetPath);

            // Each group of trees is scored as a unit against the test data
            Console.WriteLine("Evaluating trees against test data...");
            foreach (List<DecisionTreeLevel> baggingSetOfTrees in listOfTreesToRunTestOn)
            {
                DecisionTreeScore score = DecisionTreeScorer.ScoreWithTreeWithTestSet(baggingSetOfTrees, testData.Values);
                score.PrintTotalScore();
            }

            Console.WriteLine("Press any key to quit...");
            Console.ReadKey();
        }
Beispiel #3
0
        /// <summary>
        /// Trains groups of depth-limited decision trees on resampled training data and prints their
        /// variance, bias and average scores against the training and test data.
        /// </summary>
        /// <param name="trainingData">Parsed training data; its values are resampled and also used for scoring.</param>
        /// <param name="rnd">Random source passed to every resampling call.</param>
        /// <param name="treeDepth">Value passed as <c>maximumDepth</c> to each tree.</param>
        /// <param name="sizeOfBaggers">Number of data sets requested for each group; 1 uses the first-layer sample directly.</param>
        /// <param name="testData">Parsed test data used for the test score.</param>
        private static void RunWithTreeLevels(ParserResults trainingData, Random rnd, int treeDepth, int sizeOfBaggers, ParserResults testData)
        {
            // Phase 1: draw the data sets (two-layer sampling), one group per requested sample
            var sampledGroups = new List<List<List<DataSetValue>>>();

            for (int sampleIndex = 0; sampleIndex < TotalSamplesForBiasAndVariance; sampleIndex++)
            {
                // First layer: a single resample of the full training data
                List<DataSetValue> firstLayer = Bagging.ProduceDifferentDataSets(trainingData.Values, 1, rnd).Single();

                // Second layer: resample the first layer again unless only one data set is wanted
                List<List<DataSetValue>> group = sizeOfBaggers == 1
                    ? new List<List<DataSetValue>> { firstLayer }
                    : Bagging.ProduceDifferentDataSets(firstLayer, sizeOfBaggers, rnd);

                sampledGroups.Add(group);
            }

            // Phase 2: for each group, for each data set, create a new tree
            List<List<DecisionTreeLevel>> ensembles = sampledGroups
                .Select(group => group
                    .Select(dataSet => new DecisionTreeLevel(0, trainingData.Attributes, dataSet, maximumDepth: treeDepth))
                    .ToList())
                .ToList();

            // Phase 3: build and trim every tree in parallel
            Parallel.ForEach(ensembles.SelectMany(trees => trees), tree => tree.D3());
            Parallel.ForEach(ensembles.SelectMany(trees => trees), tree => tree.TrimTree());

            // Phase 4: score each group against both data sets and average over the groups
            double trainingScoreSum = 0;
            double testScoreSum = 0;

            foreach (List<DecisionTreeLevel> ensemble in ensembles)
            {
                DecisionTreeScore onTraining = DecisionTreeScorer.ScoreWithTreeWithTestSet(ensemble, trainingData.Values);
                DecisionTreeScore onTest = DecisionTreeScorer.ScoreWithTreeWithTestSet(ensemble, testData.Values);

                trainingScoreSum += onTraining.GetTotalScore();
                testScoreSum += onTest.GetTotalScore();
            }

            double averageTrainingScore = trainingScoreSum / ensembles.Count;
            double averageTestScore = testScoreSum / ensembles.Count;

            // Phase 5: bias and variance are measured against the training data
            double bias;
            double variance = BiasAndVarianceCalculator.CalculateBiasAndVariance(trainingData, ensembles, out bias);

            Console.WriteLine("Variance: {0:0.00000}. Bias: {1:0.00000}. ScoreTraining : {2:0.00000}, ScoreTest : {3:0.00000}", variance, bias, averageTrainingScore, averageTestScore);
        }
Beispiel #4
0
        /// <summary>
        /// Entry point: trains one decision tree per chi-test limit (0.99, 0.95 and 0) on the training
        /// data, then prints each tree's score against the test data and against the training data.
        /// </summary>
        /// <remarks>
        /// The run is aborted with an explanatory message when either <c>DataSetPath</c> or
        /// <c>TestSetPath</c> does not exist.
        /// </remarks>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string errorMessage = "";

            // In C# interpolation the placeholder is "{expr}". A leading '$' inside the literal is an
            // ordinary character and would be printed verbatim in front of the path/variable name.
            if (!File.Exists(DataSetPath))
            {
                errorMessage += $"Failed to find file {DataSetPath} - please update variable {nameof(DataSetPath)} or create that file.\n";
            }
            if (!File.Exists(TestSetPath))
            {
                errorMessage += $"Failed to find file {TestSetPath} - please update variable {nameof(TestSetPath)} or create that file.\n";
            }

            if (errorMessage != "")
            {
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine("Not all files available - not running!");
                Console.WriteLine(errorMessage);
                Console.ResetColor();
                Console.WriteLine("Press any key to continue...");
                Console.ReadKey();
                return;
            }

            Console.WriteLine("Reading training data...");
            ParserResults trainingData = ParserUtils.ParseData(DataSetPath);

            // Optimizations are optional
            // DataSetOptimizerForExtraCredit.OptimizeDataSetForExtraCredit(trainingData.Attributes, trainingData.Values);

            Console.WriteLine("Validating data set");
            DataSetCleaner.ValidateDataSet(trainingData.Attributes, trainingData.Values);

            // Initialize the required trees with their respective chiTestLimits
            List<DecisionTreeLevel> listOfTreesToRunTestOn = new List<DecisionTreeLevel>()
            {
                new DecisionTreeLevel(chiTestLimit: 0.99),
                new DecisionTreeLevel(chiTestLimit: 0.95),
                new DecisionTreeLevel(chiTestLimit: 0),
            };

            // Each tree is built and trimmed independently of the others, so both steps run in parallel
            Console.WriteLine("Runnind D3...");
            Parallel.ForEach(listOfTreesToRunTestOn, l => l.D3(trainingData.Attributes, trainingData.Values));

            Console.WriteLine("Deleting unecessary nodes...");
            Parallel.ForEach(listOfTreesToRunTestOn, l => l.TrimTree());

            Console.WriteLine("Getting test data set...");
            ParserResults testData = ParserUtils.ParseData(TestSetPath);

            // Optimizations are optional
            // DataSetOptimizerForExtraCredit.OptimizeDataSetForExtraCredit(testData.Attributes, testData.Values);

            Console.WriteLine("Evaluating trees against test data...");
            List<DecisionTreeScore> scores = listOfTreesToRunTestOn.AsParallel().Select(t => DecisionTreeScorer.ScoreWithTreeWithTestSet(t, testData.Values)).ToList();

            //Console.WriteLine("Writing trees to text files (for debugging/visualization)...");
            // Dump the trees to a txt file for debugging/visualization
            // NOTE: This won't work for the Chi=0 case - the JSON file generated is too big
            // Parallel.ForEach(listOfTreesToRunTestOn, l => File.WriteAllText("Chi" + Convert.ToInt64(l.ChiTestLimit * 10000000000000) + ".json", l.SerializeDecisionTree()));

            // Both score lists are computed before anything is printed
            List<DecisionTreeScore> trainingDataScores = listOfTreesToRunTestOn.AsParallel().Select(t => DecisionTreeScorer.ScoreWithTreeWithTestSet(t, trainingData.Values)).ToList();

            // Print the results to console
            foreach (var score in scores)
            {
                score.PrintTotalScore();
            }

            Console.WriteLine("Evaluating trees against training data:");
            foreach (var score in trainingDataScores)
            {
                score.PrintTotalScore();
            }

            Console.WriteLine("Press any key to quit...");
            Console.ReadKey();
        }