Example #1
0
        /// <summary>
        /// Given an array of the number of occurrences of each label, calculates entropy using the method
        /// indicated by <paramref name="CalcType"/>.
        /// Throws an InvalidOperationException if given an unrecognized EntropyCalucalation value.
        /// </summary>
        /// <param name="numLabelType">Counts (or proportions) of each label value present in the data set.</param>
        /// <param name="CalcType">Which impurity measure to apply: information gain, Gini index, or majority error.</param>
        /// <returns>The impurity of the distribution. Note that not every measure is normalized to [0, 1],
        /// but zero is always the minimum and higher values always mean higher impurity, which is all the
        /// tree-building code relies on.</returns>
        public static double CalculateEntropy(double[] numLabelType, EntropyCalucalation CalcType)
        { //Jackson suggested I use an enum. It's defined at the bottom of the class if you're wondering. I like the look of it.
            switch (CalcType)
            {
            case EntropyCalucalation.IG:
                return InformationGain(numLabelType);

            case EntropyCalucalation.GI:
                return GiniIndex(numLabelType);

            case EntropyCalucalation.ME:
                return MajorityError(numLabelType);
            }

            //No valid calculation type detected. (Message typo "EntroypyCalculation" fixed.)
            throw new InvalidOperationException("CalculateEntropy() must have a designated EntropyCalculation.");
        }
Example #2
0
        /// <summary>
        /// Creates the root of a decision tree and recursively builds the rest. This is the method that actually does it, while the other copies
        /// just call this one with different input parameters.
        /// </summary>
        /// <param name="attributes">A list of the attributes that describe the data, in the order that they do so.
        /// The last entry is assumed to be the final (label) attribute.</param>
        /// <param name="data">A list of cases representing the training data</param>
        /// <param name="Gen">A random number generator used to determine which attributes to use at any given level.
        /// May be null when limitAttributes is false.</param>
        /// <param name="subAttSize">The number of attributes the tree is allowed to use at any given level</param>
        /// <param name="limitAttributes">Bool representing whether the tree should limit the number of attributes used at any level</param>
        /// <param name="DepthRemaining">The allowed number of levels for the tree to reach</param>
        /// <param name="calc">The desired method of entropy calculation. IG = Information Gain. GI = Gini Index. ME = Majority Error</param>
        /// <returns>The root node of the (sub)tree, or null when the depth limit is exhausted or no usable
        /// attributes/data remain.</returns>
        private static ID3_Node ID3(List <DAttribute> attributes, List <Case> data, Random Gen, int subAttSize, bool limitAttributes, int DepthRemaining, EntropyCalucalation calc)
        {
            if (DepthRemaining > -1 && attributes.Count > 1 && data.Count > 0) // Check more layers are allowed and if there are usable attributes remaining, and that there's at least one data point
            {
                //The next line assumes that the last attribute is always the final one, and the only final one.
                //DataAttributes that are final are marked, so you should be able to change that behavior very easily.
                double[] proportions = GetLabelDistribution(data, attributes.Last());
                double   entropy     = CalculateEntropy(proportions, calc);
                if (entropy == 0)               //set only has one output label
                {
                    int result = 0;             //default
                    for (int i = 0; i < proportions.Length; i++)
                    {                           //track down the item that makes up the whole set
                        if (proportions[i] > 0) //found the only one that's greater than 0
                        {
                            result = i;
                            break;
                        }
                    }
                    return(ID3_Node.Leaf(attributes.Last().ID, result)); //Entire data set is of the same label. Tree is a single leaf node. Return a leaf node.
                }
                else
                {                                                     //find the attribute that results in the lowest entropy and divide on that.
                    List <DAttribute> WorkingAttributes = attributes; //by default, the usable attributes will be all the remaining attributes
                    if (limitAttributes)                              //if we're to limit the attributes
                    {
                        if (attributes.Count <= subAttSize)
                        {
                            //just use all of them since that's how many we're to use
                        }

                        else
                        {//pick subAttSize distinct attributes at random to use
                            WorkingAttributes = new List <DAttribute>(subAttSize + 1);
                            int[] usedAttributes = new int[subAttSize];
                            for (int i = 0; i < subAttSize; i++)
                            {
                                usedAttributes[i] = -1; //initialize all to unusable values
                            }
                            for (int i = 0; i < subAttSize; i++)
                            {
                                int  random = Gen.Next(attributes.Count - 1); //get a random index for attributes, not including the final label.
                                bool repeat = false;                          //track if random has already used this index for this set we're building.
                                for (int j = 0; j < i; j++)                   //only slots [0, i) hold chosen indices so far
                                {
                                    //BUGFIX: compare each previously *used* slot (usedAttributes[j]) against the new pick;
                                    //the old code compared usedAttributes[i] (always -1) and added attributes inside this loop.
                                    if (usedAttributes[j] == random)
                                    { //found a copy.
                                        repeat = true;
                                        break;
                                    }
                                }

                                if (repeat)
                                {
                                    i--; //decrement i so that we can try again at the same i value.
                                    continue;
                                }

                                usedAttributes[i] = random;                //BUGFIX: record the chosen index so later duplicate checks work
                                WorkingAttributes.Add(attributes[random]); //add the new item to working attributes exactly once
                            }
                            //BUGFIX: append the label attribute so the "skip the last attribute" convention below
                            //and the implicit "label is last" assumption stay valid for the limited subset.
                            WorkingAttributes.Add(attributes.Last());
                        }
                    }
                    List <Case>[] LowestEntropySets = null;                    //default
                    double        LowestEntropy     = double.PositiveInfinity; //max value by default.
                    int           BestAttNum        = 0;                       //default
                    int           BestAttID         = -1;

                    for (int a = 0; a < WorkingAttributes.Count - 1; a++) //check all attributes, but the last (that's the label, and we don't want to judge by that.)
                    {
                        int           numVars     = WorkingAttributes[a].numVariants();
                        int           aID         = WorkingAttributes[a].ID; //refers to the position in a case's data to find the desired attribute value.
                        List <Case>[] CurrentSets = new List <Case> [numVars];

                        for (int i = 0; i < numVars; i++) //initialize all the lists
                        {
                            CurrentSets[i] = new List <Case>();
                        }

                        foreach (Case c in data) // populate lists by dividing by attribute value
                        {                        //Tree not compatible with pure numeric attributes. Assume that it is not pure numeric
                            //BUGFIX: index WorkingAttributes (which 'a' enumerates), not 'attributes' — the two lists
                            //differ whenever limitAttributes selected a random subset above.
                            if (WorkingAttributes[a].AttType == DAttribute.Type.Categorical || WorkingAttributes[a].AttType == DAttribute.Type.BinaryNumeric)
                            {
                                CurrentSets[(int)c.AttributeVals[aID]].Add(c); //Add the case to a list pertaining to its attribute value.
                            }
                            //We can safely assume that there is an attribute variant for the integer represented by the value.
                            else if (WorkingAttributes[a].AttType == DAttribute.Type.Numeric)
                            {
                                throw new Exception("ID3 algorithm cannot build a tree with a purely numeric input");
                            }
                        }

                        //now that the data is split, calculate each set's entropy, weight it, and recombine it all

                        double sumEntropy = 0;

                        foreach (List <Case> set in CurrentSets)
                        {
                            //We're doing the same entropy calculation as the one at the top. We need to know the distribution of outputs.
                            double weight = (double)(set.Count) / (double)data.Count;  //percentage of data represented by 'set'
                            if (set.Count > 0)
                            {
                                double[] distribution = GetLabelDistribution(set, attributes.Last()); //assuming the last attribute is the final one.
                                sumEntropy += weight * CalculateEntropy(distribution, calc);          //weighted entropy value
                            }
                        }

                        if (sumEntropy < LowestEntropy)     //Lowest entropy so far
                        {                                   //keep track of the sets created and attribute number
                            LowestEntropy     = sumEntropy; //if equal, favor the pre-existing value
                            LowestEntropySets = CurrentSets;
                            BestAttNum        = a;
                            BestAttID         = aID;
                        }

                        //check all of them. \_:)_/
                    }

                    List <ID3_Node>   Children          = new List <ID3_Node>();
                    List <DAttribute> UpdatedAttributes = attributes.ToList(); //Copy the list and remove the winning attribute (dividing by it again wouldn't be helpful.)

                    for (int i = 0; i < UpdatedAttributes.Count; i++)
                    {
                        if (UpdatedAttributes[i].ID == BestAttID) //remove by ID, since position differs when attributes were subsetted
                        {
                            UpdatedAttributes.RemoveAt(i);
                            break;
                        }
                    }

                    //create children recursively by use of the ID3 algorithm
                    for (int i = 0; i < LowestEntropySets.Length; i++)
                    {
                        //Notice how we're copying the list before we pass it in. Things get weird if you don't do that.
                        //BUGFIX: forward Gen/subAttSize/limitAttributes through the recursion. The old call went through
                        //the 4-argument overload, which silently disabled random-attribute limiting below the root —
                        //contradicting the documented "at any given level" behavior of the random-forest variant.
                        ID3_Node child = ID3(UpdatedAttributes.ToList(), LowestEntropySets[i], Gen, subAttSize, limitAttributes, DepthRemaining - 1, calc);
                        //if we let the children be null, then it maintains order in the child array.
                        Children.Add(child);
                    }

                    //point null children to real children. In effect, the tree guesses where to go in the event that values not present in the training data show up.
                    List <int> nullChildren = new List <int>();
                    int        nonNullChild = 0;

                    for (int i = 0; i < Children.Count; i++)
                    {
                        if (Children[i] == null)
                        {
                            nullChildren.Add(i);
                        }
                        else
                        {
                            nonNullChild = i;
                        }
                    }

                    foreach (int i in nullChildren)
                    {
                        Children[i] = Children[nonNullChild];
                    }

                    ID3_Node output = new ID3_Node(BestAttID);

                    if (nullChildren.Count == Children.Count) //All children are null: fall back to a leaf holding the most common label
                    {
                        double[] proportion = GetLabelDistribution(data, attributes.Last());
                        double   max        = 0;
                        int      mode       = -1; //number representing most common Final label in the current dataset
                        for (int i = 0; i < proportion.Length; i++)
                        {
                            if (proportion[i] > max)
                            {
                                max  = proportion[i];
                                mode = i;
                            }
                        }
                        return(ID3_Node.Leaf(attributes.Last().ID, mode));
                    }


                    else  //add children to the interior node
                    {
                        output.addChildren(Children.ToArray());
                    }
                    return(output);
                }
            }
            //Only reach this spot if the program builds the tree beyond the depth limit. Return null.
            return(null);
        }
Example #3
0
 /// <summary>
 /// Creates the root of a decision tree and recursively builds the rest. This particular variant is adjusted for the random
 /// forest ensemble learning algorithm: it always turns on per-level attribute limiting.
 /// </summary>
 /// <param name="attributes">A list of the attributes that describe the data, in the order that they do so.</param>
 /// <param name="data">A list of cases representing the training data</param>
 /// <param name="Gen">A random number generator used to determine which attributes to use at any given level</param>
 /// <param name="subAttSize">The number of attributes the tree is allowed to use at any given level</param>
 /// <param name="DepthRemaining">The allowed number of levels for the tree to reach</param>
 /// <param name="calc">The desired method of entropy calculation. IG = Information Gain. GI = Gini Index. ME = Majority Error</param>
 /// <returns>The root node of the constructed tree.</returns>
 public static ID3_Node ID3(List <DAttribute> attributes, List <Case> data, Random Gen, int subAttSize, int DepthRemaining, EntropyCalucalation calc)
 {
     //Delegate to the full implementation with attribute limiting switched on.
     return ID3(attributes, data, Gen, subAttSize, true, DepthRemaining, calc);
 }
Example #4
0
 /// <summary>
 /// Creates the root of a decision tree and recursively builds the rest. This is the plain variant: every remaining
 /// attribute is considered at every level.
 /// </summary>
 /// <param name="attributes">A list of the attributes that describe the data, in the order that they do so.</param>
 /// <param name="data">A list of cases representing the training data</param>
 /// <param name="DepthRemaining">The allowed number of levels for the tree to reach</param>
 /// <param name="calc">The desired method of entropy calculation. IG = Information Gain. GI = Gini Index. ME = Majority Error</param>
 /// <returns>The root node of the constructed tree.</returns>
 public static ID3_Node ID3(List <DAttribute> attributes, List <Case> data, int DepthRemaining, EntropyCalucalation calc)
 {
     //Delegate to the full implementation: no RNG and no attribute limiting are needed here.
     return ID3(attributes, data, null, 0, false, DepthRemaining, calc);
 }