Example #1
        /// <summary>
        /// Classifies case c by a weighted majority vote across the ensemble's trees, assuming
        /// that the case's last attribute is the target label. The target attribute is required
        /// so the method knows how many label variants there are. Unlike an ID3 node, the
        /// ensemble is a complete learner, so it carries its own testing functions.
        /// </summary>
        /// <param name="c">The case to classify.</param>
        /// <param name="target">The target attribute; its variant count sizes the voting pool.</param>
        /// <returns>The index of the winning variant of the target attribute.</returns>
        public int TestEnsembleClassificaiton(Case c, DAttribute target)
        {
            double[] voting = new double[target.numVariants()];

            for (int i = 0; i < VoteWeights.Length; i++)
            {
                int currentResult = ID3Tools.TestWithTree(c, Trees[i]);
                voting[currentResult] += VoteWeights[i]; //add the tree's voting power to the bucket for its answer
            }

            //find the majority vote in the voting pool

            int    max     = -1;
            double highest = -1;

            for (int i = 0; i < target.numVariants(); i++)
            {
                if (voting[i] > highest)
                {
                    max     = i;
                    highest = voting[i];
                }
            }

            //max should contain the winning variant number for the attribute.

            return(max);
        }
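
A minimal usage sketch of the voting tester follows. The setup names (learner, trainingData, testCase) are hypothetical stand-ins; only the EnsembleLearner, Case, and DAttribute types from these examples are assumed.

        // hypothetical setup: an ensemble built elsewhere (e.g. via AdaBoost, Example #3)
        EnsembleLearner learner = EnsembleTools.AdaBoost(50, trainingData, attributes);

        // the target attribute is the last one; it tells the tester how many label variants exist
        DAttribute targetAttribute = attributes[attributes.Count - 1];

        // classify one case; the return value is the index of the winning label variant
        int predicted = learner.TestEnsembleClassificaiton(testCase, targetAttribute);
        Console.WriteLine("Predicted variant: " + predicted);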
Example #2
        /// <summary>
        /// Like Bagging, but also imposes limits on which attributes the decision tree is allowed to use at each level.
        /// </summary>
        public static EnsembleLearner RandomForest(int numTrees, int subSize, bool AllowDuplicates, int RNGseed, int subAttSize, List <Case> data, List <DAttribute> attributes)
        {
            Random Gen = new Random(RNGseed);

            ID3_Node[]  Trees = new ID3_Node[numTrees];
            List <Case> subset;

            for (int i = 0; i < numTrees; i++)
            {
                subset   = GetRandomSubset(!AllowDuplicates, subSize, Gen, data);                                            //pick some random items to use for this current tree
                Trees[i] = ID3Tools.ID3(attributes, subset, Gen, subAttSize, int.MaxValue, ID3Tools.EntropyCalucalation.IG); //specified to use information gain
            }

            return(new EnsembleLearner(Trees)); //output
        }
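
GetRandomSubset is called here but not shown among these examples. Below is a plausible sketch, under the assumption that the first parameter requests sampling without replacement (consistent with the !AllowDuplicates call above); this is an illustration, not the repository's actual implementation.

        /// <summary>
        /// Hypothetical sketch: returns subSize cases drawn from data, without
        /// replacement when unique is true, with replacement otherwise.
        /// </summary>
        public static List <Case> GetRandomSubset(bool unique, int subSize, Random Gen, List <Case> data)
        {
            List <Case> result = new List <Case>(subSize);
            if (unique)
            {
                //sample without replacement by removing picks from a working copy of the list
                List <Case> pool = data.ToList();
                for (int i = 0; i < subSize && pool.Count > 0; i++)
                {
                    int pick = Gen.Next(pool.Count);
                    result.Add(pool[pick]);
                    pool.RemoveAt(pick);
                }
            }
            else
            {
                //sample with replacement: the same case may be chosen more than once
                for (int i = 0; i < subSize; i++)
                {
                    result.Add(data[Gen.Next(data.Count)]);
                }
            }
            return result;
        }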
Example #3
        public static EnsembleLearner AdaBoost(int numTrees, List <Case> data, List <DAttribute> attributes)
        {
            List <Case> dataCopy = data.ToList(); //shallow copy of the list; the Case objects (and their weights) are still shared with the source, so callers must re-parse the data to reset weights.

            ID3_Node[] FullLearner = new ID3_Node[numTrees];
            double[]   votes       = new double[numTrees];
            //each index gets a decision stump and a corresponding weight based on its accuracy

            Case.NormalizeWeights(dataCopy);

            for (int i = 0; i < numTrees; i++)
            { //generate that many trees
                ID3_Node current = ID3Tools.ID3(attributes, dataCopy, 1, ID3Tools.EntropyCalucalation.IG);
                double   error   = ID3Tools.FindTestError(dataCopy, attributes, current);

                if (error < .5) //learner is better than random chance (ID3 should ensure that)
                {
                    double vote = .5 * Math.Log((1 - error) / error);
                    //adjust the weight of every case. This repeats work done when calculating the error; it could be coded to avoid the duplication.
                    foreach (Case c in dataCopy)
                    {
                        int treeResult = ID3Tools.TestWithTree(c, current);
                        if (treeResult == c.AttributeVals.Last())
                        {
                            double newWeight = c.getWeight() * Math.Pow(Math.E, -vote);
                            c.setWeight(newWeight); //correct prediction: scale the weight by e^(-vote)
                        }
                        else
                        {
                            double newWeight = c.getWeight() * Math.Pow(Math.E, vote);
                            c.setWeight(newWeight); //incorrect prediction: scale the weight by e^(vote)
                        }
                    }

                    Case.NormalizeWeights(dataCopy);

                    FullLearner[i] = current;
                    votes[i]       = vote;
                }
                else
                {
                    throw new Exception("Something went terribly wrong and the weak learner is worse than guessing when considering the current weights.");
                }
            }
            return(new EnsembleLearner(FullLearner, votes));
        }
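
To make the weight update concrete, here is a small worked example with an illustrative error value (not taken from the examples above):

        double error = 0.2;                                //illustrative weighted error
        double vote  = .5 * Math.Log((1 - error) / error); //0.5 * ln(4) ≈ 0.693
        double correctScale   = Math.Pow(Math.E, -vote);   //≈ 0.5, applied to correctly classified cases
        double incorrectScale = Math.Pow(Math.E, vote);    //≈ 2.0, applied to misclassified cases

After NormalizeWeights rescales the weights to sum to 1, each misclassified case carries roughly four times its previous relative weight into the next round.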
Example #4
        public void TestDepthLimit()
        {
            List <DAttribute> attributes = new List <DAttribute>(5);

            string[] alphabet = new string[2];
            alphabet[0] = "0"; alphabet[1] = "1";
            attributes.Add(new DAttribute("X_1", 0, new List <string>(alphabet), false, false));
            attributes.Add(new DAttribute("X_2", 1, new List <string>(alphabet), false, false));
            attributes.Add(new DAttribute("X_3", 2, new List <string>(alphabet), false, false));
            attributes.Add(new DAttribute("X_4", 3, new List <string>(alphabet), false, false));
            attributes.Add(new DAttribute("VarFinal", 4, new List <string>(alphabet), false, true));

            List <Case> TestData = DRT.ParseCSV(attributes.ToArray(), TestPath + @"\simple\simple1.txt");


            ID3_Node Tree = ID3Tools.ID3(attributes, TestData, 1, ID3Tools.EntropyCalucalation.IG);

            //with a depth limit of 1, the result should be a single decision stump

            System.Console.WriteLine(Tree.PrintTree(attributes.ToArray()));
            int i = 0; // breakpoint anchor so the printed tree can be inspected before the method returns
        }
Example #5
        public void TestSimple()
        {
            //Initialize the attributes beforehand so they are easier to read when debugging

            List <DAttribute> attributes = new List <DAttribute>(5);

            string[] alphabet = new string[2];
            alphabet[0] = "0"; alphabet[1] = "1";
            attributes.Add(new DAttribute("X_1", 0, new List <string>(alphabet), false, false));
            attributes.Add(new DAttribute("X_2", 1, new List <string>(alphabet), false, false));
            attributes.Add(new DAttribute("X_3", 2, new List <string>(alphabet), false, false));
            attributes.Add(new DAttribute("X_4", 3, new List <string>(alphabet), false, false));
            attributes.Add(new DAttribute("VarFinal", 4, new List <string>(alphabet), false, true));

            List <Case> TestData = DRT.ParseCSV(attributes.ToArray(), TestPath + @"\simple\simple1.txt");


            ID3_Node Tree = ID3Tools.ID3(attributes, TestData, 999, ID3Tools.EntropyCalucalation.IG);

            System.Console.WriteLine(Tree.PrintTree(attributes.ToArray()));

            Assert.AreEqual(0, ID3Tools.TestWithTree(TestData[6], Tree));
        }
Example #6
        public static void Main()
        {
            List <DAttribute> attributeBank = new List <DAttribute>(7);

            //Once again, the variants could be auto-detected, but doing so makes the data harder to read. Furthermore, auto-detection doesn't work for filling in missing values.
            //The data descriptions below come from data-desc.txt, located alongside this training data.

            string[] AVariants;

            //age is numeric, so its actual variants are determined at run time; the variant set is overwritten when the testing data is pulled in.
            attributeBank.Add(new DAttribute("age", 0, null, DAttribute.Type.BinaryNumeric, false));
            AVariants = new string[] { "admin.", "unknown", "unemployed", "management", "housemaid", "entrepreneur", "student",
                                       "blue-collar", "self-employed", "retired", "technician", "services" };
            attributeBank.Add(new DAttribute("job", 1, new List <string>(AVariants), DAttribute.Type.Categorical, false));
            AVariants = new string[] { "married", "divorced", "single" };
            attributeBank.Add(new DAttribute("marital", 2, new List <string>(AVariants), DAttribute.Type.Categorical, false));
            AVariants = new string[] { "unknown", "secondary", "primary", "tertiary" };
            attributeBank.Add(new DAttribute("education", 3, new List <string>(AVariants), DAttribute.Type.Categorical, false));
            AVariants = new string[] { "yes", "no" };
            attributeBank.Add(new DAttribute("default", 4, new List <string>(AVariants), DAttribute.Type.Categorical, false));

            attributeBank.Add(new DAttribute("balance", 5, null, DAttribute.Type.BinaryNumeric, false));
            AVariants = new string[] { "yes", "no" };
            attributeBank.Add(new DAttribute("housing", 6, new List <string>(AVariants), DAttribute.Type.Categorical, false));
            AVariants = new string[] { "yes", "no" };
            attributeBank.Add(new DAttribute("loan", 7, new List <string>(AVariants), DAttribute.Type.Categorical, false));
            AVariants = new string[] { "unknown", "telephone", "cellular" };
            attributeBank.Add(new DAttribute("contact", 8, new List <string>(AVariants), DAttribute.Type.Categorical, false));

            attributeBank.Add(new DAttribute("day", 9, null, DAttribute.Type.BinaryNumeric, false));
            AVariants = new string[] { "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec" };
            attributeBank.Add(new DAttribute("month", 10, new List <string>(AVariants), DAttribute.Type.Categorical, false));

            attributeBank.Add(new DAttribute("duration", 11, null, DAttribute.Type.BinaryNumeric, false));

            attributeBank.Add(new DAttribute("campaign", 12, null, DAttribute.Type.BinaryNumeric, false));

            attributeBank.Add(new DAttribute("pdays", 13, null, DAttribute.Type.BinaryNumeric, false));

            attributeBank.Add(new DAttribute("previous", 14, null, DAttribute.Type.BinaryNumeric, false));
            AVariants = new string[] { "unknown", "other", "failure", "success" }; //If unknown needs to be filled in, remove it from this list.
            attributeBank.Add(new DAttribute("poutcome", 15, new List <string>(AVariants), DAttribute.Type.Categorical, false));
            AVariants = new string[] { "yes", "no" };
            attributeBank.Add(new DAttribute("result", 16, new List <string>(AVariants), DAttribute.Type.Categorical, true));



            List <Case> TrainBank = DRT.ParseCSV(attributeBank.ToArray(), TestPath + @"\bank\train.csv", true);
            List <Case> TestBank  = DRT.ParseCSV(attributeBank.ToArray(), TestPath + @"\bank\test.csv", false);



            if (UseBoost)
            {
                StringBuilder output     = new StringBuilder();
                StringBuilder outputTree = new StringBuilder();

                output.Append("T(rees),Training Error,Testing Error\n"); //going to generate a csv file the ensemble learner's performance

                EnsembleLearner current = null;                          //initialized to null; reassigned on the first loop iteration
                for (int i = 1; i < NumIterations; i++)                  //Assignment specifies 1000 iterations
                {
                    current = EnsembleTools.AdaBoost(i, TrainBank, attributeBank);

                    double TrainingError = current.TestEnsembleClassMass(TrainBank, attributeBank);
                    double TestingError  = current.TestEnsembleClassMass(TestBank, attributeBank);


                    Console.WriteLine("Built an AdaBoost Learner with " + i + " Trees.");
                    output.Append(i + "," + TrainingError + "," + TestingError + "\n");                     //write a new line for the CSV file
                    TrainBank = DRT.ParseCSV(attributeBank.ToArray(), TestPath + @"\bank\train.csv", true); //re-parse the training data to reset the case weights that AdaBoost mutated
                }

                outputTree.Append("Tree#,Training Error,Testing Error\n");
                for (int i = 0; i < NumIterations - 1; i++)
                {
                    ID3_Node node = current.Trees[i];

                    double TrainingError = ID3Tools.FindTestError(TrainBank, attributeBank, node);
                    double TestingError  = ID3Tools.FindTestError(TestBank, attributeBank, node);

                    int index = i + 1;
                    outputTree.Append(index + "," + TrainingError + "," + TestingError + "\n"); //write a new line for the CSV file
                }

                Console.WriteLine("Writing all results to Ensemble\\ Learning/TestingData/RunResults/ResultsBankBoost.csv");
                System.IO.File.WriteAllText(TestPath + @"/RunResults/ResultsBankBoost.csv", output.ToString());
                System.IO.File.WriteAllText(TestPath + @"/RunResults/ResultsBankBoostTrees.csv", outputTree.ToString());
            }

            if (UseBag)
            {
                StringBuilder output     = new StringBuilder();
                StringBuilder outputTree = new StringBuilder();

                output.Append("T(rees),Training Error,Testing Error\n"); //going to generate a csv file the ensemble learner's performance

                EnsembleLearner current = null;                          //initialized to null; reassigned on the first loop iteration
                for (int i = 1; i < NumIterations; i++)                  //Assignment specifies 1000 iterations
                {
                    current = EnsembleTools.Bagging(i, TrainBank.Count, true, RNGseed, TrainBank, attributeBank);

                    double TrainingError = current.TestEnsembleClassMass(TrainBank, attributeBank);
                    double TestingError  = current.TestEnsembleClassMass(TestBank, attributeBank);

                    Console.WriteLine("Built a Bagged Learner with " + i + " Trees.");

                    output.Append(i + "," + TrainingError + "," + TestingError + "\n"); //write a new line for the CSV file
                }

                outputTree.Append("Tree#,Training Error,Testing Error\n");
                for (int i = 0; i < NumIterations - 1; i++)
                {
                    ID3_Node node = current.Trees[i];

                    double TrainingError = ID3Tools.FindTestError(TrainBank, attributeBank, node);
                    double TestingError  = ID3Tools.FindTestError(TestBank, attributeBank, node);

                    int index = i + 1;
                    outputTree.Append(index + "," + TrainingError + "," + TestingError + "\n"); //write a new line for the CSV file
                }

                Console.WriteLine("Writing all results to Ensemble\\ Learning/TestingData/RunResults/ResultsBankBag.csv");
                System.IO.File.WriteAllText(TestPath + @"/RunResults/ResultsBankBagTemp.csv", output.ToString());
                System.IO.File.WriteAllText(TestPath + @"/RunResults/ResultsBankBagTreesTemp.csv", outputTree.ToString());
            }

            if (UseBagBias)
            {
                Random Gen = new Random(RNGseed);

                double averageResult = 0;
                foreach (Case c in TrainBank)
                {
                    averageResult += c.AttributeVals[16]; //add target label value
                }
                averageResult = averageResult / (double)TrainBank.Count;

                double AverageTreeVariance = 0;
                double AverageBagVariance  = 0;

                double AverageTreeBias = 0;
                double AverageBagBias  = 0;

                StringBuilder output = new StringBuilder();
                output.Append("TreeBias,EnsBias,TreeVar,EnsVar\n");

                for (int i = 1; i < 101; i++)
                {
                    List <Case>     Sample  = EnsembleTools.GetRandomSubset(true, 1000, Gen, TrainBank);               //draw 1000 training cases without replacement
                    EnsembleLearner current = EnsembleTools.Bagging(1000, 1000, true, RNGseed, Sample, attributeBank); //bag 1000 trees over the sample, drawing with replacement
                    //Calculate bias first

                    double Bias = 0;//tree
                    foreach (Case c in TrainBank)
                    {
                        //for binary labels, a miss contributes (label - prediction)^2 = 1
                        if (ID3Tools.TestWithTree(c, current.Trees[0]) != c.AttributeVals[16]) //Incorrect guess
                        {
                            Bias += 1;
                        }
                    }
                    Bias = Bias / (double)TrainBank.Count;
                    output.Append(Bias + ",");
                    AverageTreeBias += Bias;

                    Bias = 0;//Ensemble
                    foreach (Case c in TrainBank)
                    {
                        //for binary labels, a miss contributes (label - prediction)^2 = 1
                        if (current.TestEnsembleClassificaiton(c, attributeBank[16]) != c.AttributeVals[16]) //Incorrect guess
                        {
                            Bias += 1;
                        }
                    }
                    Bias            = Bias / (double)TrainBank.Count;
                    AverageBagBias += Bias;
                    output.Append(Bias + ",");

                    //now variance
                    double Variance = 0;//tree
                    foreach (Case c in TrainBank)
                    {
                        Variance += Math.Pow(ID3Tools.TestWithTree(c, current.Trees[0]) - averageResult, 2); //squared deviation of the prediction from the average label
                    }
                    Variance = Variance / (double)(TrainBank.Count);

                    AverageTreeVariance += Variance;
                    output.Append(Variance + ",");


                    Variance = 0;//ensemble
                    foreach (Case c in TrainBank)
                    {
                        Variance += Math.Pow(current.TestEnsembleClassificaiton(c, attributeBank[16]) - averageResult, 2); //squared deviation of the prediction from the average label
                    }
                    Variance = Variance / (double)(TrainBank.Count);

                    AverageBagVariance += Variance;
                    output.Append(Variance + "\n");

                    Console.WriteLine("Completed Bias and Variance calculations for Bagged Learner number " + i);
                }

                AverageTreeVariance = AverageTreeVariance / 100;
                AverageTreeBias     = AverageTreeBias / 100;
                AverageBagVariance  = AverageBagVariance / 100;
                AverageBagBias      = AverageBagBias / 100;

                output.Append("FinalVals\n" + AverageTreeBias + "," + AverageBagBias + "," + AverageTreeVariance + "," + AverageBagVariance);
                Console.WriteLine();
                System.IO.File.WriteAllText(TestPath + @"/RunResults/ResultsBankBagAnalysis.csv", output.ToString());
            }

            if (UseRandTrees)
            {
                for (int numAttributes = 2; numAttributes < 7; numAttributes += 2)
                {
                    StringBuilder output  = new StringBuilder();
                    StringBuilder output2 = new StringBuilder();

                    output.Append("T(rees),Training Error,Testing Error\n"); //going to generate a csv file the ensemble learner's performance

                    EnsembleLearner current = null;                          //initialized to null; reassigned on the first loop iteration
                    for (int i = 1; i < NumIterations; i++)                  //Assignment specifies 1000 iterations
                    {
                        current = EnsembleTools.RandomForest(i, TrainBank.Count, true, RNGseed, numAttributes, TrainBank, attributeBank);

                        double TrainingError = current.TestEnsembleClassMass(TrainBank, attributeBank);
                        double TestingError  = current.TestEnsembleClassMass(TestBank, attributeBank);

                        Console.WriteLine("Built a Random Forest Learner with " + i + " Trees.");

                        output.Append(i + "," + TrainingError + "," + TestingError + "\n"); //write a new line for the CSV file
                    }

                    output2.Append("Tree#,Training Error,Testing Error\n");
                    for (int i = 0; i < NumIterations - 1; i++)
                    {
                        ID3_Node node = current.Trees[i];

                        double TrainingError = ID3Tools.FindTestError(TrainBank, attributeBank, node);
                        double TestingError  = ID3Tools.FindTestError(TestBank, attributeBank, node);

                        int index = i + 1;
                        output2.Append(index + "," + TrainingError + "," + TestingError + "\n"); //write a new line for the CSV file
                    }

                    Console.WriteLine("Writing all results to Ensemble\\ Learning/TestingData/RunResults/ResultsBankRForest.csv");
                    System.IO.File.WriteAllText(TestPath + @"/RunResults/ResultsBankRForest" + numAttributes + ".csv", output.ToString());
                    System.IO.File.WriteAllText(TestPath + @"/RunResults/ResultsBankRForest" + numAttributes + "Trees.csv", output2.ToString());
                }
            }


            if (UseRForestBias)
            {
                Random Gen = new Random(RNGseed);

                double averageResult = 0;
                foreach (Case c in TrainBank)
                {
                    averageResult += c.AttributeVals[16]; //add target label value
                }
                averageResult = averageResult / (double)TrainBank.Count;

                double AverageTreeVariance    = 0;
                double AverageRForestVariance = 0;

                double AverageTreeBias    = 0;
                double AverageRForestBias = 0;

                StringBuilder output = new StringBuilder();
                output.Append("TreeBias,EnsBias,TreeVar,EnsVar\n");

                for (int i = 1; i < 101; i++)
                {
                    List <Case>     Sample  = EnsembleTools.GetRandomSubset(true, 1000, Gen, TrainBank);                       //draw 1000 training cases without replacement
                    EnsembleLearner current = EnsembleTools.RandomForest(1000, 1000, true, RNGseed, 4, Sample, attributeBank); //build 1000 random-forest trees over the sample, drawing with replacement
                    //Calculate bias first

                    double Bias = 0;//tree
                    foreach (Case c in TrainBank)
                    {
                        //for binary labels, a miss contributes (label - prediction)^2 = 1
                        if (ID3Tools.TestWithTree(c, current.Trees[0]) != c.AttributeVals[16]) //Incorrect guess
                        {
                            Bias += 1;
                        }
                    }
                    Bias = Bias / (double)TrainBank.Count;
                    output.Append(Bias + ",");
                    AverageTreeBias += Bias;

                    Bias = 0;//Ensemble
                    foreach (Case c in TrainBank)
                    {
                        //for binary labels, a miss contributes (label - prediction)^2 = 1
                        if (current.TestEnsembleClassificaiton(c, attributeBank[16]) != c.AttributeVals[16]) //Incorrect guess
                        {
                            Bias += 1;
                        }
                    }
                    Bias = Bias / (double)TrainBank.Count;
                    AverageRForestBias += Bias;
                    output.Append(Bias + ",");

                    //now variance
                    double Variance = 0;//tree
                    foreach (Case c in TrainBank)
                    {
                        Variance += Math.Pow(ID3Tools.TestWithTree(c, current.Trees[0]) - averageResult, 2); //squared deviation of the prediction from the average label
                    }
                    Variance = Variance / (double)(TrainBank.Count);

                    AverageTreeVariance += Variance;
                    output.Append(Variance + ",");


                    Variance = 0;//ensemble
                    foreach (Case c in TrainBank)
                    {
                        Variance += Math.Pow(current.TestEnsembleClassificaiton(c, attributeBank[16]) - averageResult, 2); //squared deviation of the prediction from the average label
                    }
                    Variance = Variance / (double)(TrainBank.Count);

                    AverageRForestVariance += Variance;
                    output.Append(Variance + "\n");

                    Console.WriteLine("Completed Bias and Variance calculations for RForest Learner number " + i);
                }

                AverageTreeVariance    = AverageTreeVariance / 100;
                AverageTreeBias        = AverageTreeBias / 100;
                AverageRForestVariance = AverageRForestVariance / 100;
                AverageRForestBias     = AverageRForestBias / 100;

                output.Append("FinalVals\n" + AverageTreeBias + "," + AverageRForestBias + "," + AverageTreeVariance + "," + AverageRForestVariance);
                Console.WriteLine();
                System.IO.File.WriteAllText(TestPath + @"/RunResults/ResultsBankRForest4Analysis.csv", output.ToString());
            }
        }
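
Case.NormalizeWeights, which the AdaBoost loop in Example #3 calls between rounds, is not shown in these snippets. A minimal sketch consistent with that usage follows; the getWeight/setWeight accessors match the calls above, but the body itself is an assumption, not the repository's code.

        /// <summary>
        /// Hypothetical sketch: rescales all case weights so they sum to 1,
        /// matching how the AdaBoost loop in Example #3 uses this method.
        /// </summary>
        public static void NormalizeWeights(List <Case> cases)
        {
            double total = 0;
            foreach (Case c in cases)
            {
                total += c.getWeight();
            }
            if (total <= 0)
            {
                return; //nothing sensible to normalize
            }
            foreach (Case c in cases)
            {
                c.setWeight(c.getWeight() / total); //each weight becomes its share of the total
            }
        }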