Example #1
0
        /// <summary>
        /// Builds a leaf node for a decision tree: FinalAttributeID identifies the final (label)
        /// attribute and FinalValue is the variant of that attribute this leaf predicts.
        /// </summary>
        /// <returns>A childless ID3_Node whose Value is FinalValue.</returns>
        public static ID3_Node Leaf(int FinalAttributeID, int FinalValue)
        {
            ID3_Node leaf = new ID3_Node(FinalAttributeID);
            leaf.Value = FinalValue;
            return leaf;
        }
Example #2
0
        /// <summary>
        /// Walks the decision tree rooted at Tree using the attribute values of the given Case and
        /// returns the label variant number the tree assigns to that case.
        /// </summary>
        /// <returns>The predicted label variant for the case.</returns>
        public static int TestWithTree(Case test, ID3_Node Tree)
        {
            // A node without children is a leaf; its Value holds the predicted label variant.
            if (ReferenceEquals(Tree.getChildren(), null))
            {
                return Tree.Value;
            }

            // Attribute values are assumed to be integer variant indices (the tree does not support
            // purely numeric attributes), so the case's value for this node's attribute directly
            // selects which child branch to descend into.
            int branch = (int)test.AttributeVals[Tree.AttributeID];
            return TestWithTree(test, Tree.getChildren()[branch]);
        }
        /// <summary>
        /// Sweeps tree depths 1..depthLimit-1, builds one ID3 tree per depth with the given entropy
        /// calculation, measures train/test error, and appends a printable report of each tree.
        /// </summary>
        /// <param name="layout">Accumulates the per-tree report text.</param>
        /// <param name="attributes">Attribute descriptions for the dataset (label last).</param>
        /// <param name="train">Training cases used to build each tree.</param>
        /// <param name="test">Held-out cases used to measure test error.</param>
        /// <param name="depthLimit">Exclusive upper bound on the depth sweep.</param>
        /// <param name="calc">Entropy calculation to use for every tree in the sweep.</param>
        /// <param name="label">Report prefix, e.g. "Information Gain Cars".</param>
        /// <param name="finishedMsg">Console progress message printed after each tree.</param>
        private static void RunDepthSweep(StringBuilder layout, List <DAttribute> attributes, List <Case> train, List <Case> test,
                                          int depthLimit, ID3Tools.EntropyCalucalation calc, string label, string finishedMsg)
        {
            for (int depth = 1; depth < depthLimit; depth++)
            {
                ID3_Node Tree = ID3Tools.ID3(attributes, train, depth, calc);

                Double TrainError = ID3Tools.FindTestError(train, attributes, Tree);
                Double TestError  = ID3Tools.FindTestError(test, attributes, Tree);

                layout.Append(label + ", Max Depth of " + depth + ". Test Error = " + TestError + ". TrainError = " + TrainError + " \n \n" + Tree.PrintTree(attributes.ToArray()) + "\n ----------------------------------------------------------------- \n");
                Console.WriteLine(finishedMsg);
            }
        }

        /// <summary>
        /// Entry point: builds depth-limited ID3 trees for the car and bank datasets under three
        /// entropy measures (IG, GI, ME) and writes the resulting reports to text files.
        /// </summary>
        public static void Main()
        {
            // ========= Part 1 ============= //

            if (BuildCarTrees)
            {
                //This is the car example.
                List <DAttribute> attributeCars = new List <DAttribute>(7);
                //While auto-detection is possible, naming the DataAttributes ahead of time makes the trees much easier to read.
                //Below data descriptions come from data-desc.txt, located near the data for this training data.
                string[] AVariants = new string[] { "vhigh", "high", "med", "low" }; //array of attribute variants to pass in to an attribute


                attributeCars.Add(new DAttribute("buying", 0, new List <string>(AVariants), DAttribute.Type.Categorical, false));
                attributeCars.Add(new DAttribute("maint", 1, new List <string>(AVariants), DAttribute.Type.Categorical, false));

                AVariants = new string[] { "2", "3", "4", "5more" };
                attributeCars.Add(new DAttribute("doors", 2, new List <string>(AVariants), DAttribute.Type.Categorical, false));
                AVariants = new string[] { "2", "4", "more" };
                attributeCars.Add(new DAttribute("persons", 3, new List <string>(AVariants), DAttribute.Type.Categorical, false));
                AVariants = new string[] { "small", "med", "big" };
                attributeCars.Add(new DAttribute("lug_boot", 4, new List <string>(AVariants), DAttribute.Type.Categorical, false));
                AVariants = new string[] { "low", "med", "high" };
                attributeCars.Add(new DAttribute("safety", 5, new List <string>(AVariants), DAttribute.Type.Categorical, false));
                AVariants = new string[] { "unacc", "acc", "good", "vgood" };
                attributeCars.Add(new DAttribute("label", 6, new List <string>(AVariants), DAttribute.Type.Categorical, true));


                List <Case> TrainCars = DRT.ParseCSV(attributeCars.ToArray(), TestPath + @"\car\train.csv");
                List <Case> TestCars  = DRT.ParseCSV(attributeCars.ToArray(), TestPath + @"\car\test.csv");

                StringBuilder TreeLayout = new StringBuilder();

                //One depth sweep per entropy measure; each appends its trees to TreeLayout.
                RunDepthSweep(TreeLayout, attributeCars, TrainCars, TestCars, 7, ID3Tools.EntropyCalucalation.IG, "Information Gain Cars", "Finished an IG Tree");
                RunDepthSweep(TreeLayout, attributeCars, TrainCars, TestCars, 7, ID3Tools.EntropyCalucalation.GI, "Gini Index Cars", "Finished a GI Tree");
                RunDepthSweep(TreeLayout, attributeCars, TrainCars, TestCars, 7, ID3Tools.EntropyCalucalation.ME, "Majority Error Cars", "Finished an ME Tree");

                Console.WriteLine("Writing all results to DecisionTree/TestingData/RunResults/ResultsCars.txt");
                System.IO.File.WriteAllText(TestPath + @"/RunResults/ResultsCars.txt", TreeLayout.ToString());
            }

            // ========= Part 2 ============= //
            // bank information
            if (BuildBankTrees)
            {
                List <DAttribute> attributeBank = new List <DAttribute>(7);
                //Once again, could auto detect, but doing so makes the data harder to read. Furthermore, autodetecting doesn't work for filling in missing values.
                //below data descriptions come from data-desc.txt, located near the data for this training data.

                string[] AVariants;

                //age being numeric means that the actual variants will be figured out at run time. The variant will be overwritten when we pull in the testing data.
                attributeBank.Add(new DAttribute("age", 0, null, DAttribute.Type.BinaryNumeric, false));
                AVariants = new string[] { "admin.", "unknown", "unemployed", "management", "housemaid", "entrepreneur", "student",
                                           "blue-collar", "self-employed", "retired", "technician", "services" };
                attributeBank.Add(new DAttribute("job", 1, new List <string>(AVariants), DAttribute.Type.Categorical, false));
                AVariants = new string[] { "married", "divorced", "single" };
                attributeBank.Add(new DAttribute("marital", 2, new List <string>(AVariants), DAttribute.Type.Categorical, false));
                AVariants = new string[] { "unknown", "secondary", "primary", "tertiary" };
                attributeBank.Add(new DAttribute("education", 3, new List <string>(AVariants), DAttribute.Type.Categorical, false));
                AVariants = new string[] { "yes", "no" };
                attributeBank.Add(new DAttribute("default", 4, new List <string>(AVariants), DAttribute.Type.Categorical, false));

                attributeBank.Add(new DAttribute("balance", 5, null, DAttribute.Type.BinaryNumeric, false));
                AVariants = new string[] { "yes", "no" };
                attributeBank.Add(new DAttribute("housing", 6, new List <string>(AVariants), DAttribute.Type.Categorical, false));
                AVariants = new string[] { "yes", "no" };
                attributeBank.Add(new DAttribute("loan", 7, new List <string>(AVariants), DAttribute.Type.Categorical, false));
                AVariants = new string[] { "unknown", "telephone", "cellular" };
                attributeBank.Add(new DAttribute("contact", 8, new List <string>(AVariants), DAttribute.Type.Categorical, false));

                attributeBank.Add(new DAttribute("day", 9, null, DAttribute.Type.BinaryNumeric, false));
                AVariants = new string[] { "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec" };
                attributeBank.Add(new DAttribute("month", 10, new List <string>(AVariants), DAttribute.Type.Categorical, false));

                attributeBank.Add(new DAttribute("duration", 11, null, DAttribute.Type.BinaryNumeric, false));

                attributeBank.Add(new DAttribute("campaign", 12, null, DAttribute.Type.BinaryNumeric, false));

                attributeBank.Add(new DAttribute("pdays", 13, null, DAttribute.Type.BinaryNumeric, false));

                attributeBank.Add(new DAttribute("previous", 14, null, DAttribute.Type.BinaryNumeric, false));
                AVariants = new string[] { "unknown", "other", "failure", "success" }; //If unknown needs to be filled in, remove it from this list.
                attributeBank.Add(new DAttribute("poutcome", 15, new List <string>(AVariants), DAttribute.Type.Categorical, false));
                AVariants = new string[] { "yes", "no" };
                attributeBank.Add(new DAttribute("result", 16, new List <string>(AVariants), DAttribute.Type.Categorical, true));

                if (BuildBankTreeNormal)
                {
                    List <Case> TrainBank = DRT.ParseCSV(attributeBank.ToArray(), TestPath + @"\bank\train.csv", true);
                    List <Case> TestBank  = DRT.ParseCSV(attributeBank.ToArray(), TestPath + @"\bank\test.csv", false);

                    StringBuilder TreeLayout = new StringBuilder();

                    RunDepthSweep(TreeLayout, attributeBank, TrainBank, TestBank, 17, ID3Tools.EntropyCalucalation.IG, "Information Gain Bank", "Finished an IG Tree");
                    RunDepthSweep(TreeLayout, attributeBank, TrainBank, TestBank, 17, ID3Tools.EntropyCalucalation.GI, "Gini Index Bank", "Finished a GI Tree");
                    RunDepthSweep(TreeLayout, attributeBank, TrainBank, TestBank, 17, ID3Tools.EntropyCalucalation.ME, "Majority Error Bank", "Finished an ME Tree");

                    Console.WriteLine("Writing all results to DecisionTree/TestingData/RunResults/ResultsBankNormal.txt");
                    System.IO.File.WriteAllText(TestPath + @"/RunResults/ResultsBankNormal.txt", TreeLayout.ToString());
                }
                if (BuildBankMissingVals)
                {
                    //In this case, the "unknown" values in poutcome
                    //NOTE(review): this rebuild uses the exact same variant list (still containing "unknown")
                    //as the original attribute, so it appears to be a no-op. Per the comment above, treating
                    //"unknown" as a missing value probably requires removing it from this list — confirm intent.
                    attributeBank[15] = new DAttribute("poutcome", 15, new List <string>(new string[] { "unknown", "other", "failure", "success" }), DAttribute.Type.Categorical, false);

                    //Now we rebuild all the datasets, which will have elements filled in by the majority elements.
                    List <Case> TrainBank = DRT.ParseCSV(attributeBank.ToArray(), TestPath + @"\bank\train.csv", true);
                    List <Case> TestBank  = DRT.ParseCSV(attributeBank.ToArray(), TestPath + @"\bank\test.csv", false);

                    StringBuilder TreeLayout = new StringBuilder();

                    RunDepthSweep(TreeLayout, attributeBank, TrainBank, TestBank, 17, ID3Tools.EntropyCalucalation.IG, "Information Gain Bank", "Finished an IG Tree");
                    RunDepthSweep(TreeLayout, attributeBank, TrainBank, TestBank, 17, ID3Tools.EntropyCalucalation.GI, "Gini Index Bank", "Finished a GI Tree");
                    RunDepthSweep(TreeLayout, attributeBank, TrainBank, TestBank, 17, ID3Tools.EntropyCalucalation.ME, "Majority Error Bank", "Finished an ME Tree");

                    Console.WriteLine("Writing all results to DecisionTree/TestingData/RunResults/ResultsBankMissingVals.txt");
                    System.IO.File.WriteAllText(TestPath + @"/RunResults/ResultsBankMissingVals.txt", TreeLayout.ToString());
                }
            }
        }
Example #4
0
        /// <summary>
        /// Creates the root of a decision tree and recursively builds the rest. This is the method that actually does it, while the other copies
        /// just call this one with different input parameters.
        /// </summary>
        /// <param name="attributes">A list of the attributes that describe the data, in the order that they do so. The final (label) attribute must be last.</param>
        /// <param name="data">A list of cases representing the training data</param>
        /// <param name="Gen">A random number generator used to determine which attributes to use at any given level</param>
        /// <param name="subAttSize">The number of attributes the tree is allowed to use at any given level</param>
        /// <param name="limitAttributes">Bool representing whether the tree should limit the number of attributes used at any level</param>
        /// <param name="DepthRemaining">The allowed number of levels for the tree to reach</param>
        /// <param name="calc">The desired method of entropy calculation. IG = Information Gain. GI = Gini Index. ME = Majority Error</param>
        /// <returns>The root node of the (sub)tree, or null when depth, attributes, or data are exhausted.</returns>
        private static ID3_Node ID3(List <DAttribute> attributes, List <Case> data, Random Gen, int subAttSize, bool limitAttributes, int DepthRemaining, EntropyCalucalation calc)
        {
            // Stop when out of depth, when only the label attribute remains, or when there is no data.
            // Callers treat a null return as "no subtree here" and patch it up afterwards.
            if (DepthRemaining <= -1 || attributes.Count <= 1 || data.Count == 0)
            {
                return(null);
            }

            //The next line assumes that the last attribute is always the final one, and the only final one.
            //DataAttributes that are final are marked, so you should be able to change that behavior very easily.
            double[] proportions = GetLabelDistribution(data, attributes.Last());
            double   entropy     = CalculateEntropy(proportions, calc);

            if (entropy == 0) //set only has one output label (exact zero: a pure set contributes no entropy terms)
            {
                int result = 0; //default
                for (int i = 0; i < proportions.Length; i++)
                {                           //track down the single label variant that makes up the whole set
                    if (proportions[i] > 0) //found the only one that's greater than 0
                    {
                        result = i;
                        break;
                    }
                }
                return(ID3_Node.Leaf(attributes.Last().ID, result)); //Entire data set is of the same label. Return a single leaf node.
            }

            //Find the attribute that results in the lowest entropy and divide on that.
            List <DAttribute> WorkingAttributes = attributes; //by default, consider all remaining attributes
            if (limitAttributes && attributes.Count > subAttSize)
            {
                //Pick subAttSize distinct non-label attributes at random (random-forest style subsetting).
                //BUG FIX: the original compared usedAttributes[i] instead of [j], appended candidates inside
                //the duplicate-check loop (adding up to subAttSize copies per slot), never recorded the chosen
                //index, assigned attributes[i] instead of attributes[random], and omitted the label attribute
                //that all downstream code assumes sits last in the list.
                WorkingAttributes = new List <DAttribute>(subAttSize + 1);
                int[] usedAttributes = new int[subAttSize];
                for (int i = 0; i < subAttSize; i++)
                {
                    usedAttributes[i] = -1; //initialize all to unusable values
                }
                for (int i = 0; i < subAttSize; i++)
                {
                    int  random = Gen.Next(attributes.Count - 1); //random index over the non-label attributes
                    bool repeat = false;                          //true if this index was already chosen
                    for (int j = 0; j < i; j++)
                    {
                        if (usedAttributes[j] == random)
                        { //found a copy.
                            repeat = true;
                            break;
                        }
                    }

                    if (repeat)
                    {
                        i--; //decrement i so that we can try again at the same i value.
                        continue;
                    }

                    usedAttributes[i] = random;
                    WorkingAttributes.Add(attributes[random]);
                }
                WorkingAttributes.Add(attributes.Last()); //keep the label attribute last, as the code below assumes
            }

            List <Case>[] LowestEntropySets = null;                    //default
            double        LowestEntropy     = double.PositiveInfinity; //beaten by any real entropy
            int           BestAttID         = -1;                      //ID of the winning attribute

            for (int a = 0; a < WorkingAttributes.Count - 1; a++) //check all attributes but the last (the label; we don't split on that)
            {
                int           numVars     = WorkingAttributes[a].numVariants();
                int           aID         = WorkingAttributes[a].ID; //position in a case's data for this attribute's value
                List <Case>[] CurrentSets = new List <Case> [numVars];

                for (int i = 0; i < numVars; i++) //initialize all the lists
                {
                    CurrentSets[i] = new List <Case>();
                }

                foreach (Case c in data) //populate lists by dividing by attribute value
                {
                    //BUG FIX: original indexed attributes[a] here while iterating WorkingAttributes,
                    //which points at the wrong attribute whenever the random subset is in use.
                    if (WorkingAttributes[a].AttType == DAttribute.Type.Categorical || WorkingAttributes[a].AttType == DAttribute.Type.BinaryNumeric)
                    {
                        CurrentSets[(int)c.AttributeVals[aID]].Add(c); //Add the case to the list for its attribute value.
                    }
                    //Tree not compatible with pure numeric attributes.
                    else if (WorkingAttributes[a].AttType == DAttribute.Type.Numeric)
                    {
                        throw new Exception("ID3 algorithm cannot build a tree with a purely numeric input");
                    }
                }

                //now that the data is split, calculate each set's entropy, weight it, and recombine it all
                double sumEntropy = 0;

                foreach (List <Case> set in CurrentSets)
                {
                    double weight = (double)(set.Count) / (double)data.Count; //percentage of data represented by 'set'
                    if (set.Count > 0)
                    {
                        double[] distribution = GetLabelDistribution(set, attributes.Last()); //assuming the last attribute is the final one.
                        sumEntropy += weight * CalculateEntropy(distribution, calc);          //weighted entropy value
                    }
                }

                if (sumEntropy < LowestEntropy) //strict inequality: ties favor the earlier attribute
                {
                    LowestEntropy     = sumEntropy;
                    LowestEntropySets = CurrentSets;
                    BestAttID         = aID;
                }
            }

            //Copy the attribute list and remove the winning attribute (dividing by it again wouldn't help).
            List <DAttribute> UpdatedAttributes = attributes.ToList();

            for (int i = 0; i < UpdatedAttributes.Count; i++)
            {
                if (UpdatedAttributes[i].ID == BestAttID)
                {
                    UpdatedAttributes.RemoveAt(i);
                    break;
                }
            }

            //create children recursively by use of the ID3 algorithm; null children preserve branch order
            List <ID3_Node> Children = new List <ID3_Node>();
            for (int i = 0; i < LowestEntropySets.Length; i++)
            {
                //Copy the list before passing it in so recursive calls cannot mutate our copy.
                //BUG FIX: the original recursed into the 4-argument overload, silently dropping
                //Gen/subAttSize/limitAttributes, so attribute limiting only applied at the root level.
                Children.Add(ID3(UpdatedAttributes.ToList(), LowestEntropySets[i], Gen, subAttSize, limitAttributes, DepthRemaining - 1, calc));
            }

            //Point null children to a real child. In effect, the tree guesses where to go when a value
            //not present in the training data shows up.
            List <int> nullChildren = new List <int>();
            int        nonNullChild = 0;

            for (int i = 0; i < Children.Count; i++)
            {
                if (Children[i] == null)
                {
                    nullChildren.Add(i);
                }
                else
                {
                    nonNullChild = i;
                }
            }

            foreach (int i in nullChildren)
            {
                Children[i] = Children[nonNullChild];
            }

            if (nullChildren.Count == Children.Count) //All children are null: emit a leaf with the majority label
            {
                double[] proportion = GetLabelDistribution(data, attributes.Last());
                double   max        = 0;
                int      mode       = -1; //most common final label in the current dataset
                for (int i = 0; i < proportion.Length; i++)
                {
                    if (proportion[i] > max)
                    {
                        max  = proportion[i];
                        mode = i;
                    }
                }
                return(ID3_Node.Leaf(attributes.Last().ID, mode));
            }

            ID3_Node output = new ID3_Node(BestAttID);
            output.addChildren(Children.ToArray());
            return(output);
        }
Example #5
0
        /// <summary>
        /// Finds the weighted error rate of the input tree over a set of labeled test cases:
        /// 1 minus the weight-fraction of cases the tree classifies correctly.
        /// </summary>
        /// <param name="TestCases">Labeled cases to classify (last attribute value is the expected label).</param>
        /// <param name="attributes">Attribute descriptions for the dataset (kept for interface parity).</param>
        /// <param name="Tree">Root of the decision tree to evaluate.</param>
        /// <returns>The weighted misclassification rate in [0, 1].</returns>
        public static double FindTestError(List <Case> TestCases, List <DAttribute> attributes, ID3_Node Tree)
        {
            double correctWeight = 0;
            double totalWeight   = 0;

            foreach (Case example in TestCases)
            {
                double w = example.getWeight(); //usually one, but boosting can reweight cases
                totalWeight += w;

                //the case's final attribute value is its true label; compare against the tree's prediction
                if (example.AttributeVals.Last() == TestWithTree(example, Tree))
                {
                    correctWeight += w;
                }
            }

            return 1.0 - (correctWeight / totalWeight);
        }