/// <summary>
/// Runs a chi-squared independence test between the given attribute and the target attribute.
/// Returns true when the association is statistically significant at the given confidence level
/// (p-value at or below 1 - confidenceInterval), i.e. the split is worth making.
/// Returns false when the p-value is NaN or exceeds the threshold.
/// </summary>
/// <param name="confidenceInterval">Confidence level, e.g. 0.95; the rejection threshold is 1 - confidenceInterval.</param>
/// <param name="S">The example set to test.</param>
/// <param name="attributeIndex">Index of the candidate split attribute.</param>
/// <param name="targetAttributeIndex">Index of the target (class) attribute.</param>
public static bool ChiSquaredTest(double confidenceInterval, Instances S, int attributeIndex, int targetAttributeIndex)
{
    double threshold = 1 - confidenceInterval;

    // Degrees of freedom: one less than the number of discrete attribute values.
    // NOTE(review): assumes a binary target, so the (columns - 1) factor is 1 — confirm against ID3 usage.
    int df = S.attribute(attributeIndex).numValues() - 1;
    double chiSquaredStatistic = ChiSquare.ApproximateChiSquared(S, attributeIndex, targetAttributeIndex);
    double pValue = ChiSquareUtils.pochisq(chiSquaredStatistic, df);
    Log.LogInfo("ChiSquared pValue is {0} and threshold is {1}", pValue, threshold);

    // A NaN p-value gives no evidence of association, so treat it as "do not split".
    if (Double.IsNaN(pValue))
    {
        return false;
    }

    return pValue <= threshold;
}
/// <summary>
/// Approximates the chi-squared statistic for the given attribute against a binary target.
/// Examples are partitioned by attribute value; for each partition the observed
/// positive/negative counts are compared with the counts expected under independence.
/// Examples with a missing (NaN) attribute value are dropped.
/// </summary>
/// <param name="S">The example set.</param>
/// <param name="attributeIndex">Index of the attribute being tested.</param>
/// <param name="targetAttributeIndex">Index of the target (class) attribute.</param>
/// <returns>The chi-squared statistic summed over all attribute-value partitions.</returns>
/// <exception cref="InvalidOperationException">Thrown when a target value is neither positive nor negative.</exception>
private static double ApproximateChiSquared(Instances S, int attributeIndex, int targetAttributeIndex)
{
    // Each bucket holds [positiveCount, negativeCount] for one attribute value.
    const int indexOfPositive = 0;
    const int indexOfNegative = 1;
    Dictionary<int, int[]> examplesList = new Dictionary<int, int[]>();
    for (int i = 0; i < S.attribute(attributeIndex).numValues(); i++)
    {
        examplesList.Add(i, new int[2]);
    }

    // Partition each example by its attribute value and tally overall positive/negative totals.
    double p = 0;
    double n = 0;
    int droppedExamples = 0; // count of missing-value examples that were skipped
    for (int i = 0; i < S.numInstances(); i++)
    {
        double value = S.instance(i).value(attributeIndex);
        if (Double.IsNaN(value))
        {
            // Drop missing/unknown values but keep track of how many are dropped
            droppedExamples++;
            Log.LogVerbose("IsNaN encountered calculating chi-squared stat for attribute {0}", attributeIndex);
            continue;
        }

        int targetValue = (int)S.instance(i).value(targetAttributeIndex);
        if (targetValue == ID3.PositiveTargetValue)
        {
            p++;
            examplesList[(int)value][indexOfPositive]++;
        }
        else if (targetValue == ID3.NegativeTargetValue)
        {
            n++;
            examplesList[(int)value][indexOfNegative]++;
        }
        else
        {
            // Was "throw new Exception": use the specific standard exception type instead.
            throw new InvalidOperationException(String.Format("Unexpected targetValue value of {0}", targetValue));
        }
    }

    // Sum each partition's chi-squared contribution, guarding against division by zero
    // (an expected count of 0 contributes 0 rather than NaN/Infinity).
    double chiSquaredStatistic = 0;
    for (int i = 0; i < S.attribute(attributeIndex).numValues(); i++)
    {
        double pi = examplesList[i][indexOfPositive];
        double ni = examplesList[i][indexOfNegative];
        double expectedPi = ChiSquare.ExpectedPi(p, n, pi, ni);
        double expectedNi = ChiSquare.ExpectedNi(p, n, pi, ni);
        double piTerm = expectedPi == 0 ? 0 : Math.Pow(pi - expectedPi, 2) / expectedPi;
        double niTerm = expectedNi == 0 ? 0 : Math.Pow(ni - expectedNi, 2) / expectedNi;
        chiSquaredStatistic += piTerm + niTerm;
    }

    return chiSquaredStatistic;
}
/// <summary>
/// Recursively trains an ID3 decision tree rooted at <paramref name="root"/> from the examples in S.
/// Stops and creates a leaf (labeled with the majority target value) when: all targets agree,
/// no attributes remain, the chi-squared test rejects the best split, a partition is empty,
/// or the maximum depth is reached. Otherwise splits on the attribute with the best gain ratio
/// among the top 20% of attributes by information gain.
/// </summary>
/// <param name="root">The node to train; populated in place.</param>
/// <param name="S">The training examples for this subtree; must be non-empty.</param>
/// <param name="targetAttributeIndex">Index of the target (class) attribute.</param>
/// <param name="attributeList">Attribute indices still available for splitting.</param>
/// <param name="confidenceLevel">Confidence level for the chi-squared split-stopping test.</param>
/// <param name="maxDepth">Maximum tree depth; 0 means unlimited.</param>
/// <exception cref="InvalidOperationException">Thrown when a target value is missing (NaN).</exception>
public void TrainRecursive(ID3Node root, Instances S, int targetAttributeIndex, List<int> attributeList, double confidenceLevel, int maxDepth = 0)
{
    // For each possible discrete target value, collect the examples carrying it.
    Dictionary<int, Instances> targetValueCounts = new Dictionary<int, Instances>();
    for (int i = 0; i < S.attribute(targetAttributeIndex).numValues(); i++)
    {
        targetValueCounts.Add(i, new Instances(S, 0, 0));
    }

    // Bucket every example by target value and track whether all targets agree.
    int countOfS = S.numInstances();
    int firstTargetValue = (int)S.instance(0).value(targetAttributeIndex);
    bool allTargetValuesAreEqual = true;
    for (int i = 0; i < countOfS; i++)
    {
        if (Double.IsNaN(S.instance(i).value(targetAttributeIndex)))
        {
            // Target values must never be missing.
            throw new InvalidOperationException(String.Format("Value at targetAttributeIndex {0} is NaN", targetAttributeIndex));
        }

        int value = (int)S.instance(i).value(targetAttributeIndex);
        targetValueCounts[value].add(S.instance(i));
        if (firstTargetValue != value)
        {
            allTargetValuesAreEqual = false;
        }
    }

    // Base case: homogeneous target -> leaf with that value.
    if (allTargetValuesAreEqual)
    {
        root.IsLeaf = true;
        root.TargetValue = firstTargetValue;
        Log.LogInfo("All Targets Equal. Node with split {0}, value {1}, leaf {2}, weight {3}", root.SplitAttributeIndex, root.TargetValue, root.IsLeaf, root.Weight);
        return;
    }

    // Find the most common target value (majority vote).
    int mostCommonTargetValue = 0;
    for (int i = 0; i < targetValueCounts.Count; i++)
    {
        if (targetValueCounts[i].numInstances() > targetValueCounts[mostCommonTargetValue].numInstances())
        {
            mostCommonTargetValue = i;
        }
    }

    // Base case: no attributes left -> leaf with the majority value.
    if (attributeList.Count == 0)
    {
        root.IsLeaf = true;
        root.TargetValue = mostCommonTargetValue;
        Log.LogInfo("Attribute List Empty. Node with split {0}, value {1}, leaf {2}, weight {3}", root.SplitAttributeIndex, root.TargetValue, root.IsLeaf, root.Weight);
        return;
    }

    // Rank the remaining candidate attributes by information gain.
    // BUG FIX: iterate over the attribute indices contained in attributeList. The
    // original used the positional loop index as the attribute index, which is only
    // correct before any attribute has been removed from the list.
    double gainSum = 0;
    SortedList<double, int> sortedGainList = new SortedList<double, int>();
    foreach (int attributeIndex in attributeList)
    {
        double gain = this.CalculateGain(S, attributeIndex, targetAttributeIndex);
        gainSum += gain;

        // SortedList keys must be unique, and the key here is the gain value, which may
        // collide across attributes. On a tie keep whichever attribute has the higher
        // gain ratio — we only ever read the best-gain-ratio entries later, so no
        // information that matters is lost.
        if (sortedGainList.ContainsKey(gain))
        {
            double oldGainRatio = this.CalculateGainRatio(S, sortedGainList[gain], targetAttributeIndex);
            double newGainRatio = this.CalculateGainRatio(S, attributeIndex, targetAttributeIndex);
            if (newGainRatio > oldGainRatio)
            {
                // Replace the old value with the one that has higher gain ratio.
                sortedGainList[gain] = attributeIndex;
            }
        }
        else
        {
            sortedGainList.Add(gain, attributeIndex);
        }
    }

    double maxGain = sortedGainList.Last().Key;
    int maxGainAttribute = sortedGainList.Last().Value;
    double averageGain = gainSum / attributeList.Count;

    // Among the top N% of attributes by gain, pick the one with the best gain ratio.
    double maxGainRatio = 0;
    // BUG FIX: default to the highest-gain attribute index; the original defaulted to
    // "sortedGainList.Count() - 1", which is a list position, not an attribute index.
    int maxGainRatioAttribute = maxGainAttribute;
    double NPercent = 0.2;
    int topNPercent = (int)Math.Ceiling(NPercent * sortedGainList.Count);
    for (int i = 0; i < topNPercent; i++)
    {
        int reverse_i = sortedGainList.Count - 1 - i; // scan from highest gain downward
        int index = sortedGainList.ElementAt(reverse_i).Value;
        double gainRatio = this.CalculateGainRatio(S, index, targetAttributeIndex);
        if (gainRatio > maxGainRatio)
        {
            maxGainRatio = gainRatio;
            maxGainRatioAttribute = index;
        }
    }

    // Now we know which attribute to split on.
    Log.LogGain("MaxGainRatio {0} from attrib {1}. Max Gain {2} from attrib {3}. Avg Gain {4}.", maxGainRatio, maxGainRatioAttribute, maxGain, maxGainAttribute, averageGain);

    // Chi-squared pre-pruning: stop splitting if the split is not statistically significant.
    if (ChiSquare.ChiSquaredTest(confidenceLevel, S, maxGainRatioAttribute, targetAttributeIndex) == false)
    {
        root.IsLeaf = true;
        root.TargetValue = mostCommonTargetValue;
        Log.LogInfo("ChiSquared stop split. Node with split {0}, value {1}, leaf {2}, weight {3}", root.SplitAttributeIndex, root.TargetValue, root.IsLeaf, root.Weight);
        return;
    }

    // We are going to split. Children must not reconsider the chosen attribute.
    root.SplitAttributeIndex = maxGainRatioAttribute;
    List<int> newAttributeList = new List<int>(attributeList);
    // BUG FIX: remove by value (attribute index), not by list position; RemoveAt could
    // delete the wrong attribute or throw ArgumentOutOfRangeException.
    newAttributeList.Remove(maxGainRatioAttribute);

    // Partition the examples by their value of the split attribute.
    Dictionary<int, Instances> examplesVi = new Dictionary<int, Instances>();
    for (int i = 0; i < S.attribute(maxGainRatioAttribute).numValues(); i++)
    {
        examplesVi.Add(i, new Instances(S, 0, 0));
    }

    int totalExamplesVi = 0;
    for (int i = 0; i < S.numInstances(); i++)
    {
        if (Double.IsNaN(S.instance(i).value(maxGainRatioAttribute)))
        {
            // Examples with a missing split-attribute value are dropped from the partition.
            Log.LogVerbose("IsNaN encountered for instance {0} of maxGainAttribute {1}", i, maxGainRatioAttribute);
            continue;
        }

        int value = (int)S.instance(i).value(maxGainRatioAttribute);
        examplesVi[value].add(S.instance(i));
        totalExamplesVi++;
    }

    // Create one child per attribute value and recurse into the non-trivial ones.
    for (int i = 0; i < S.attribute(maxGainRatioAttribute).numValues(); i++)
    {
        ID3Node newChild = new ID3Node();
        newChild.Depth = root.Depth + 1;
        root.ChildNodes.Add(newChild);

        if (examplesVi[i].numInstances() == 0) // no more examples to split on
        {
            newChild.IsLeaf = true;
            newChild.TargetValue = mostCommonTargetValue;
            Log.LogInfo("No instances to split on. Create new leaf child from parent split {0}, new value {1}", root.SplitAttributeIndex, newChild.TargetValue, root.IsLeaf, root.Weight);
        }
        else if (maxDepth > 0 && newChild.Depth > maxDepth) // we hit max depth
        {
            newChild.IsLeaf = true;
            newChild.TargetValue = mostCommonTargetValue;
            Log.LogInfo("Hit max depth of {0}. Create new leaf child from parent split {1}, new value {2}", maxDepth, root.SplitAttributeIndex, newChild.TargetValue, root.IsLeaf, root.Weight);
        }
        else
        {
            Log.LogInfo("Splitting from node with split {0}, value {1}, leaf {2}, weight {3}", root.SplitAttributeIndex, root.TargetValue, root.IsLeaf, root.Weight);
            newChild.IsLeaf = false;
            // NOTE(review): this records the branch's attribute-value index; it is
            // overwritten by the recursive call if the child itself splits — confirm intent.
            newChild.SplitAttributeIndex = i;
            newChild.Weight = examplesVi[i].numInstances() / (double)totalExamplesVi;
            this.TrainRecursive(newChild, examplesVi[i], targetAttributeIndex, newAttributeList, confidenceLevel, maxDepth);
        }
    }
}