Example #1
        public void Test(string arffFilePath, ID3Node root)
        {
            // Load the examples into S
            Instances S = new weka.core.Instances(new java.io.FileReader(arffFilePath));

            this.Test(S, root);
        }
Example #2
        public ID3Node Train(Instances S, double confidenceLevel, int maxDepth = 0)
        {
            int targetAttributeIndex = S.numAttributes() - 1;

            // Store the attribute indexes in a list. They will get removed as we split on attributes.
            List <int> attributeIndexes = new List <int>();

            for (int i = 0; i < S.numAttributes() - 1; i++)
            {
                attributeIndexes.Add(i);
            }

            this.RootNode       = new ID3Node();
            this.RootNode.Depth = 1;

            this.TrainRecursive(this.RootNode, S, targetAttributeIndex, attributeIndexes, confidenceLevel, maxDepth);

            if (Log.NodeOn == true)
            {
                ID3Node.DFS(this.RootNode, S);
            }

            Log.LogNode("Number of Nodes is {0}", ID3Node.NodeCount(this.RootNode));
            Log.LogNode("Max Tree Depth including leaves is {0}", ID3Node.MaxDepth(this.RootNode));

            return(this.RootNode);
        }
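A minimal usage sketch for the methods above follows. The containing class name (here ID3Tree), the ARFF paths, the 0.95 confidence level, and the maximum depth of 10 are illustrative assumptions rather than values taken from the original examples.

        public static void RunExample()
        {
            // Load the training examples with Weka's Instances class (IKVM interop), as in Example #1
            Instances S = new weka.core.Instances(new java.io.FileReader("data/train.arff"));

            // Hypothetical class holding the Train/Test/Predict methods shown in these examples
            ID3Tree tree = new ID3Tree();

            // Train with an assumed chi-squared confidence level of 0.95 and a maximum depth of 10
            ID3Node root = tree.Train(S, 0.95, maxDepth: 10);

            // Evaluate on a held-out ARFF file (see Example #1 and Example #8)
            tree.Test("data/test.arff", root);
        }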
Example #3
        public int Predict(ID3Node node, Instance example)
        {
            // If the node is a leaf, return the value
            if (node.IsLeaf)
            {
                return(node.TargetValue);
            }

            // Else, figure out which path to take
            double attributeValue = example.value(node.SplitAttributeIndex);

            if (Double.IsNaN(attributeValue))
            {
                // TODO: use a fractional test based on weights (a hypothetical sketch follows this example);
                // for now, fall back to the child branch with the highest weight
                int highestWeightedChild = 0;
                for (int i = 0; i < node.ChildNodes.Count(); i++)
                {
                    if (node.ChildNodes[i].Weight > node.ChildNodes[highestWeightedChild].Weight)
                    {
                        highestWeightedChild = i;
                    }
                }

                attributeValue = highestWeightedChild;
            }

            ID3Node nextNode = node.ChildNodes[(int)attributeValue];

            return(this.Predict(nextNode, example));
        }
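The TODO above mentions a fractional test for missing attribute values. Below is a hypothetical sketch of that idea, using only the fields that appear in these examples (IsLeaf, TargetValue, SplitAttributeIndex, ChildNodes, Weight); it is not the original code. Instead of following a single branch, it descends into every child when the value is missing and splits the prediction weight across the branches. A caller would seed it with a weight of 1.0 and an empty dictionary, then pick the target value with the largest accumulated vote.

        // Hypothetical sketch of the fractional test mentioned in the TODO of Example #3.
        // Accumulates per-class votes in "votes", weighted by how much of the example's
        // weight flows down each branch.
        private void PredictFractional(ID3Node node, Instance example, double weight, Dictionary <int, double> votes)
        {
            if (node.IsLeaf)
            {
                if (votes.ContainsKey(node.TargetValue) == false)
                {
                    votes[node.TargetValue] = 0;
                }

                votes[node.TargetValue] += weight;
                return;
            }

            double attributeValue = example.value(node.SplitAttributeIndex);

            if (Double.IsNaN(attributeValue))
            {
                // Missing value: split the weight across all children according to their weights
                for (int i = 0; i < node.ChildNodes.Count(); i++)
                {
                    this.PredictFractional(node.ChildNodes[i], example, weight * node.ChildNodes[i].Weight, votes);
                }
            }
            else
            {
                this.PredictFractional(node.ChildNodes[(int)attributeValue], example, weight, votes);
            }
        }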
Example #4
        public static void DFS(ID3Node root, Instances instances)
        {
            ID3Node.Print(root, instances);

            if (root.IsLeaf)
            {
                return;
            }

            for (int i = 0; i < root.ChildNodes.Count(); i++)
            {
                ID3Node.DFS(root.ChildNodes[i], instances);
            }
        }
Example #5
        public static int NodeCount(ID3Node node)
        {
            if (node.IsLeaf)
            {
                return(1);
            }

            // Count this node itself plus all nodes in its subtrees
            int nodeCount = 1;

            for (int i = 0; i < node.ChildNodes.Count(); i++)
            {
                nodeCount += NodeCount(node.ChildNodes[i]);
            }

            return(nodeCount);
        }
Example #6
        public static int MaxDepth(ID3Node node)
        {
            if (node.IsLeaf)
            {
                return(1);
            }

            int maxDepth = 0;

            for (int i = 0; i < node.ChildNodes.Count(); i++)
            {
                int childMaxDepth = MaxDepth(node.ChildNodes[i]);
                if (childMaxDepth > maxDepth)
                {
                    maxDepth = childMaxDepth;
                }
            }

            return(maxDepth + 1);
        }
Example #7
        public static void Print(ID3Node node, Instances instances)
        {
            int targetAttribute = instances.numAttributes() - 1;

            string output;

            if (node.IsLeaf == true)
            {
                string value = instances.attribute(targetAttribute).value(node.TargetValue);
                output = String.Format("Leaf {0} with Weight {1}", value, node.Weight);
            }
            else
            {
                int           numChildren = node.ChildNodes.Count();
                List <string> childValues = new List <string>();
                for (int i = 0; i < instances.attribute(node.SplitAttributeIndex).numValues(); i++)
                {
                    childValues.Add(instances.attribute(node.SplitAttributeIndex).value(i));
                }
                output = String.Format("Split {0} with {1} children: {2}", instances.attribute(node.SplitAttributeIndex).name(), numChildren, String.Join(",", childValues));
            }

            Console.WriteLine(output);
        }
Example #8
        public void Test(Instances S, ID3Node root)
        {
            int targetAttributeIndex = S.numAttributes() - 1;

            // Evaluate each example
            int truePositive      = 0;
            int falseNegative     = 0;
            int falsePositive     = 0;
            int trueNegative      = 0;
            int actualPositive    = 0;
            int actualNegative    = 0;
            int predictedPositive = 0;
            int predictedNegative = 0;

            for (int i = 0; i < S.numInstances(); i++)
            {
                // Compare predicted value to actual value
                int predictedValue = this.Predict(root, S.instance(i));
                int actualValue    = (int)S.instance(i).value(targetAttributeIndex);

                // Classify it as TP, TN, FP, FN
                if (actualValue == 0)
                {
                    // Actual value is the positive class (index 0)
                    actualPositive++;

                    if (predictedValue == 0)
                    {
                        predictedPositive++;
                        truePositive++;
                    }
                    else if (predictedValue == 1)
                    {
                        predictedNegative++;
                        falseNegative++;
                    }
                    else
                    {
                        throw new Exception(String.Format("Unexpected predicted value of {0}", predictedValue));
                    }
                }
                else if (actualValue == 1)
                {
                    // Actual value is the negative class (index 1)
                    actualNegative++;

                    if (predictedValue == 0)
                    {
                        predictedPositive++;
                        falsePositive++;
                    }
                    else if (predictedValue == 1)
                    {
                        predictedNegative++;
                        trueNegative++;
                    }
                    else
                    {
                        throw new Exception(String.Format("Unexpected predicted value of {0}", predictedValue));
                    }
                }
                else
                {
                    throw new Exception(String.Format("Unexpected actual value of {0}", actualValue));
                }
            }

            Log.LogStats("truePositive: {0}", truePositive);
            Log.LogStats("falseNegative: {0}", falseNegative);
            Log.LogStats("falsePositive: {0}", falsePositive);
            Log.LogStats("trueNegative: {0}", trueNegative);
            Log.LogStats("actualPositive: {0}", actualPositive);
            Log.LogStats("actualNegative: {0}", actualNegative);
            Log.LogStats("predictedPositive: {0}", predictedPositive);
            Log.LogStats("predictedNegative: {0}", predictedNegative);

            double precision = truePositive / (double)(truePositive + falsePositive);
            double recall    = truePositive / (double)(truePositive + falseNegative);
            double accuracy  = (truePositive + trueNegative) / (double)(S.numInstances());

            Log.LogStats("precision: {0}", precision);
            Log.LogStats("recall: {0}", recall);
            Log.LogStats("accuracy: {0}", accuracy);
        }
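The precision and recall computed above could also be combined into an F1 score; the helper below is a hypothetical addition, not part of the original examples.

        // Hypothetical helper: F1 is the harmonic mean of precision and recall
        public static double F1Score(double precision, double recall)
        {
            if (precision + recall == 0)
            {
                return(0);
            }

            return(2 * precision * recall / (precision + recall));
        }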
Example #9
        public void TrainRecursive(ID3Node root, Instances S, int targetAttributeIndex, List <int> attributeList, double confidenceLevel, int maxDepth = 0)
        {
            // For each possible discrete value of the target attribute, collect the examples in S that have that value
            Dictionary <int, Instances> targetValueCounts = new Dictionary <int, Instances>();

            for (int i = 0; i < S.attribute(targetAttributeIndex).numValues(); i++)
            {
                targetValueCounts.Add(i, new Instances(S, 0, 0));
            }

            // Check the most common target attribute value of every example in S
            // and keep track of whether all target values are the same value
            int  countOfS                = S.numInstances();
            int  firstTargetValue        = (int)S.instance(0).value(targetAttributeIndex);
            bool allTargetValuesAreEqual = true;

            for (int i = 0; i < countOfS; i++)
            {
                if (Double.IsNaN(S.instance(i).value(targetAttributeIndex)))
                {
                    // For target values, this shouldn't happen
                    throw new Exception(String.Format("Value at targetAttributeIndex {0} is NaN", targetAttributeIndex));
                }

                int value = (int)S.instance(i).value(targetAttributeIndex);
                targetValueCounts[value].add(S.instance(i));

                if (firstTargetValue != value)
                {
                    allTargetValuesAreEqual = false;
                }
            }

            // If all target values are the same we can make this a leaf with that value and return
            if (allTargetValuesAreEqual == true)
            {
                root.IsLeaf      = true;
                root.TargetValue = firstTargetValue;
                Log.LogInfo("All Targets Equal. Node with split {0}, value {1}, leaf {2}, weight {3}", root.SplitAttributeIndex, root.TargetValue, root.IsLeaf, root.Weight);
                return;
            }

            // Find the most common target attribute value
            int mostCommonTargetValue = 0;

            for (int i = 0; i < targetValueCounts.Count(); i++)
            {
                if (targetValueCounts[i].numInstances() > targetValueCounts[mostCommonTargetValue].numInstances())
                {
                    mostCommonTargetValue = i;
                }
            }

            // Check if the attribute list is empty and if so return most common target value
            if (attributeList.Count == 0)
            {
                // Now set the node to this target value and return
                root.IsLeaf      = true;
                root.TargetValue = mostCommonTargetValue;
                Log.LogInfo("Attribute List Empty. Node with split {0}, value {1}, leaf {2}, weight {3}", root.SplitAttributeIndex, root.TargetValue, root.IsLeaf, root.Weight);
                return;
            }

            // Figure out which attribute will give us the most gain
            double gainSum = 0;
            SortedList <double, int> sortedGainList = new SortedList <double, int>();

            for (int i = 0; i < attributeList.Count(); i++)
            {
                // attributeList holds attribute indexes into S, so resolve the index before computing gain
                int    attributeIndex = attributeList[i];
                double gain           = this.CalculateGain(S, attributeIndex, targetAttributeIndex);
                gainSum += gain;

                // We use a sorted list, which must have unique keys. Since the key is gain, it might not be unique
                // across all attributes. If we encounter a duplicate key, figure out which attribute has the higher
                // gain ratio; whichever has the higher gain ratio wins and gets into the list. Later we pick the
                // attribute with the highest gain ratio from the list anyway, so we won't lose any information.
                if (sortedGainList.ContainsKey(gain))
                {
                    double oldGainRatio = this.CalculateGainRatio(S, sortedGainList[gain], targetAttributeIndex);
                    double newGainRatio = this.CalculateGainRatio(S, attributeIndex, targetAttributeIndex);

                    if (newGainRatio > oldGainRatio)
                    {
                        // Replace the old value with the one that has the higher gain ratio
                        sortedGainList[gain] = attributeIndex;
                    }
                }
                else
                {
                    sortedGainList.Add(gain, attributeIndex);
                }
            }

            double maxGain          = sortedGainList.Last().Key;
            int    maxGainAttribute = sortedGainList.Last().Value;
            double averageGain      = gainSum / attributeList.Count();

            // Use gain ratio on the top N% of the sorted gain list and calculate maxGainRatio
            double maxGainRatio          = 0;
            int    maxGainRatioAttribute = maxGainAttribute; // default to the attribute with the largest gain
            double NPercent    = 0.2;
            int    topNPercent = (int)Math.Ceiling(NPercent * sortedGainList.Count());

            for (int i = 0; i < topNPercent; i++)
            {
                int reverse_i = sortedGainList.Count() - 1 - i; // since we are searching the list from highest gain downwards

                int    index     = sortedGainList.ElementAt(reverse_i).Value;
                double gainRatio = this.CalculateGainRatio(S, index, targetAttributeIndex);

                if (gainRatio > maxGainRatio)
                {
                    maxGainRatio          = gainRatio;
                    maxGainRatioAttribute = index;
                }
            }

            // Now we know which attribute to split on
            Log.LogGain("MaxGainRatio {0} from attrib {1}. Max Gain {2} from attrib {3}. Avg Gain {4}.", maxGainRatio, maxGainRatioAttribute, maxGain, maxGainAttribute, averageGain);

            // Check if we should stop splitting
            if (ChiSquare.ChiSquaredTest(confidenceLevel, S, maxGainRatioAttribute, targetAttributeIndex) == false)
            {
                root.IsLeaf      = true;
                root.TargetValue = mostCommonTargetValue;
                Log.LogInfo("ChiSquared stop split. Node with split {0}, value {1}, leaf {2}, weight {3}", root.SplitAttributeIndex, root.TargetValue, root.IsLeaf, root.Weight);
                return;
            }

            // We are going to split. Create a new list of attributes that won't include the attribute we split on.
            root.SplitAttributeIndex = maxGainRatioAttribute;
            List <int> newAttributeList = new List <int>(attributeList);

            newAttributeList.Remove(maxGainRatioAttribute); // remove by value: the list holds attribute indexes, not positions

            // Partition the examples by their attribute value
            Dictionary <int, Instances> examplesVi = new Dictionary <int, Instances>();

            // Initialize the examplesVi dictionary
            for (int i = 0; i < S.attribute(maxGainRatioAttribute).numValues(); i++)
            {
                examplesVi.Add(i, new Instances(S, 0, 0));
            }

            // Fill the examplesVi dictionary
            int totalExamplesVi = 0;

            for (int i = 0; i < S.numInstances(); i++)
            {
                if (Double.IsNaN(S.instance(i).value(maxGainRatioAttribute)))
                {
                    Log.LogVerbose("IsNaN encountered for instance {0} of maxGainAttribute {1}", i, maxGainRatioAttribute);
                    continue;
                }

                int value = (int)S.instance(i).value(maxGainRatioAttribute);
                examplesVi[value].add(S.instance(i));
                totalExamplesVi++;
            }

            // Split
            for (int i = 0; i < S.attribute(maxGainRatioAttribute).numValues(); i++)
            {
                ID3Node newChild = new ID3Node();
                newChild.Depth = root.Depth + 1;
                root.ChildNodes.Add(newChild);

                if (examplesVi[i].numInstances() == 0) // no more examples to split on
                {
                    newChild.IsLeaf      = true;
                    newChild.TargetValue = mostCommonTargetValue;
                    Log.LogInfo("No instances to split on. Create new leaf child from parent split {0}, new value {1}", root.SplitAttributeIndex, newChild.TargetValue, root.IsLeaf, root.Weight);
                }
                else if (maxDepth > 0 && newChild.Depth > maxDepth) // we hit max depth
                {
                    newChild.IsLeaf      = true;
                    newChild.TargetValue = mostCommonTargetValue;
                    Log.LogInfo("Hit max depth of {0}. Create new leaf child from parent split {1}, new value {2}", maxDepth, root.SplitAttributeIndex, newChild.TargetValue, root.IsLeaf, root.Weight);
                }
                else
                {
                    Log.LogInfo("Splitting from node with split {0}, value {1}, leaf {2}, weight {3}", root.SplitAttributeIndex, root.TargetValue, root.IsLeaf, root.Weight);

                    newChild.IsLeaf = false;
                    newChild.SplitAttributeIndex = i;
                    newChild.Weight = examplesVi[i].numInstances() / (double)totalExamplesVi;
                    this.TrainRecursive(newChild, examplesVi[i], targetAttributeIndex, newAttributeList, confidenceLevel, maxDepth);
                }
            }
        }
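Example #9 calls CalculateGain and CalculateGainRatio, which are not shown among the examples above. The sketch below is one standard way to compute entropy and information gain over a Weka Instances set, assuming the same usings as the examples (weka.core via IKVM, System); it illustrates the textbook ID3 formulas rather than reproducing the original helpers, skips missing attribute values, and assumes target values are always present, as the original code enforces.

        // Illustrative only: textbook entropy and information gain over nominal attributes.
        // This is not the original CalculateGain; missing attribute values are simply skipped.
        public static double Entropy(Instances S, int targetAttributeIndex)
        {
            int[] counts = new int[S.attribute(targetAttributeIndex).numValues()];

            for (int i = 0; i < S.numInstances(); i++)
            {
                counts[(int)S.instance(i).value(targetAttributeIndex)]++;
            }

            double entropy = 0;

            for (int v = 0; v < counts.Length; v++)
            {
                if (counts[v] == 0)
                {
                    continue;
                }

                double p = counts[v] / (double)S.numInstances();
                entropy -= p * Math.Log(p, 2);
            }

            return(entropy);
        }

        // Gain(S, A) = Entropy(S) - sum over values v of A of (|Sv| / |S|) * Entropy(Sv)
        public static double InformationGain(Instances S, int attributeIndex, int targetAttributeIndex)
        {
            double gain = Entropy(S, targetAttributeIndex);

            for (int v = 0; v < S.attribute(attributeIndex).numValues(); v++)
            {
                // Collect the subset Sv of examples whose attribute value equals v
                Instances Sv = new Instances(S, 0, 0);

                for (int i = 0; i < S.numInstances(); i++)
                {
                    if (Double.IsNaN(S.instance(i).value(attributeIndex)))
                    {
                        continue; // skip missing values in this sketch
                    }

                    if ((int)S.instance(i).value(attributeIndex) == v)
                    {
                        Sv.add(S.instance(i));
                    }
                }

                if (Sv.numInstances() > 0)
                {
                    gain -= (Sv.numInstances() / (double)S.numInstances()) * Entropy(Sv, targetAttributeIndex);
                }
            }

            return(gain);
        }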