/// <summary>
/// Computes the entropy of the split itself, looking only at how the
/// weight is spread over the bags and ignoring the class values.
/// </summary>
/// <param name="bags">distribution whose per-bag totals are examined</param>
/// <returns>
/// logFunc of the overall total minus the sum of logFunc over every bag total
/// </returns>
public double splitEnt(Distribution bags)
{
    double perBagSum = 0;

    for (int bagIndex = 0; bagIndex < bags.numBags(); bagIndex++)
    {
        perBagSum += logFunc(bags.perBag(bagIndex));
    }

    double totalTerm = logFunc(bags.total());
    return totalTerm - perBagSum;
}
/// <summary>
/// Computes the information gain in the same way C4.5 does: the raw gain
/// (oldEnt minus newEnt) is scaled by the fraction of weight that has a
/// known value and then divided by the weight held in the distribution.
/// </summary>
/// <param name="bags">the distribution to evaluate</param>
/// <param name="totalNoInst">
/// weight of ALL instances (including the ones with missing values)
/// </param>
/// <returns>
/// 0 when the scaled gain equals zero according to Utils.eq; otherwise
/// the scaled gain divided by bags.total()
/// </returns>
public double splitCritValue(Distribution bags, double totalNoInst)
{
    // Weight not represented in the distribution, i.e. the unknown part.
    double unknownWeight = totalNoInst - bags.total();
    double unknownRate = unknownWeight / totalNoInst;

    double scaledGain = (1 - unknownRate) * (oldEnt(bags) - newEnt(bags));

    // Splits with no gain are useless.
    if (Utils.eq(scaledGain, 0))
    {
        return 0;
    }

    return scaledGain / bags.total();
}
/// <summary>
/// Computes the entropy of the distribution before any split is applied,
/// based on the per-class totals.
/// </summary>
/// <param name="bags">distribution whose per-class totals are examined</param>
/// <returns>
/// logFunc of the overall total minus the sum of logFunc over every class total
/// </returns>
public double oldEnt(Distribution bags)
{
    double perClassSum = 0;

    for (int classIndex = 0; classIndex < bags.numClasses(); classIndex++)
    {
        perClassSum += logFunc(bags.perClass(classIndex));
    }

    double totalTerm = logFunc(bags.total());
    return totalTerm - perClassSum;
}
/// <summary>
/// Straightforward implementation of the information gain criterion for
/// the given distribution. The result is meant to be minimised, so the
/// reciprocal of the per-weight gain is returned.
/// </summary>
/// <param name="bags">the distribution to evaluate</param>
/// <returns>
/// System.Double.MaxValue when the gain equals zero according to Utils.eq;
/// otherwise bags.total() divided by the gain
/// </returns>
public override double splitCritValue(Distribution bags)
{
    double gain = oldEnt(bags) - newEnt(bags);

    if (!Utils.eq(gain, 0))
    {
        // Reciprocal, because callers minimise the criterion's value.
        return bags.total() / gain;
    }

    // Splits with no gain are useless: report the worst possible value.
    // (This constant replaces java.lang.Double.MAX_VALUE from the code
    // this file was converted from.)
    return System.Double.MaxValue;
}
/// <summary>
/// Selects a C4.5-type binary split for the given dataset.
/// </summary>
/// <param name="data">the instances for which a split is chosen</param>
/// <returns>
/// A NoSplit model when there is too little weight to split, when all
/// weight belongs to one class, or when no useful split is found;
/// otherwise the chosen BinC45Split. Returns null if an exception was
/// caught (its stack trace and message are written to the console).
/// </returns>
public override ClassifierSplitModel selectModel(Instances data)
{
    try
    {
        // Give up early if there is not enough weight to split, or if
        // all of the weight already sits in the majority class.
        Distribution classCounts = new Distribution(data);
        NoSplit noSplit = new NoSplit(classCounts);
        bool tooFewInstances = Utils.sm(classCounts.total(), 2 * m_minNoObj);
        if (tooFewInstances
            || Utils.eq(classCounts.total(), classCounts.perClass(classCounts.maxClass())))
        {
            return noSplit;
        }

        // Stays true only if every attribute is nominal and has a lot of
        // values relative to the size of the full dataset.
        bool allNominalManyValued = true;
        System.Collections.IEnumerator attributes = data.enumerateAttributes();
        while (enumeratorHasNext(attributes))
        {
            weka.core.Attribute current = (weka.core.Attribute) attributes.Current;
            if (current.Numeric
                || Utils.sm((double) current.numValues(), 0.3 * (double) m_allData.numInstances()))
            {
                allNominalManyValued = false;
                break;
            }
        }

        BinC45Split[] candidates = new BinC45Split[data.numAttributes()];
        double weightSum = data.sumOfWeights();
        double infoGainSum = 0;
        int usableCount = 0;

        // Build one candidate split per attribute, apart from the class.
        for (int attIdx = 0; attIdx < data.numAttributes(); attIdx++)
        {
            if (attIdx == data.classIndex())
            {
                candidates[attIdx] = null;
                continue;
            }

            candidates[attIdx] = new BinC45Split(attIdx, m_minNoObj, weightSum);
            candidates[attIdx].buildClassifier(data);

            // Only useful splits count towards the average gain, and
            // enumerated attributes with a lot of values are left out
            // unless every attribute is of that kind.
            if (candidates[attIdx].checkModel()
                && (data.attribute(attIdx).Numeric
                    || allNominalManyValued
                    || Utils.sm((double) data.attribute(attIdx).numValues(), 0.3 * (double) m_allData.numInstances())))
            {
                infoGainSum = infoGainSum + candidates[attIdx].infoGain();
                usableCount++;
            }
        }

        // Check if any useful split was found.
        if (usableCount == 0)
        {
            return noSplit;
        }
        double meanInfoGain = infoGainSum / (double) usableCount;

        // Find the "best" attribute: highest gain ratio among candidates
        // whose info gain is at least (roughly) average.
        BinC45Split winner = null;
        double bestGainRatio = 0;
        for (int attIdx = 0; attIdx < data.numAttributes(); attIdx++)
        {
            if (attIdx == data.classIndex() || !candidates[attIdx].checkModel())
            {
                continue;
            }

            // Use 1E-3 here to get a closer approximation to the original
            // implementation.
            if (candidates[attIdx].infoGain() >= (meanInfoGain - 1e-3)
                && Utils.gr(candidates[attIdx].gainRatio(), bestGainRatio))
            {
                winner = candidates[attIdx];
                bestGainRatio = candidates[attIdx].gainRatio();
            }
        }

        // Check if a useful split was found.
        if (Utils.eq(bestGainRatio, 0))
        {
            return noSplit;
        }

        // Add all instances with unknown values for the chosen attribute
        // to the model's distribution, so that the complete distribution
        // is stored with the model.
        winner.distribution().addInstWithUnknown(data, winner.attIndex());

        // Set the split point analogue to C45 if the attribute is numeric.
        winner.SplitPoint = m_allData;
        return winner;
    }
    catch (System.Exception e)
    {
        System.Console.WriteLine(e.StackTrace + " " + e.Message);
    }

    return null;
}
/// <summary>
/// Computes the estimated number of errors for a leaf holding the given
/// distribution.
/// </summary>
/// <param name="theDistribution">the distribution at the leaf</param>
/// <returns>
/// 0 when the distribution's total equals zero according to Utils.eq;
/// otherwise the number of incorrect instances plus the value returned
/// by Stats.addErrs for that total, that error count and m_CF
/// </returns>
private double getEstimatedErrorsForDistribution(Distribution theDistribution)
{
    // An empty distribution contributes no errors.
    if (Utils.eq(theDistribution.total(), 0))
    {
        return 0;
    }

    double observedErrors = theDistribution.numIncorrect();
    double addedErrors = Stats.addErrs(theDistribution.total(), theDistribution.numIncorrect(), m_CF);
    return observedErrors + addedErrors;
}
/// <summary>
/// Creates a binary split on the numeric attribute m_attIndex. On success
/// m_numSubsets, m_splitPoint, m_distribution, m_infoGain and m_gainRatio
/// describe the best split; when no useful split exists the method
/// returns early without setting m_numSubsets.
/// </summary>
/// <param name="trainInstances">
/// the training instances; the scan stops at the first instance whose
/// value for the attribute is missing, so instances with known values
/// are presumably expected to come first, ordered by that attribute
/// (NOTE(review): confirm against the caller)
/// </param>
/// <exception cref="Exception">if something goes wrong</exception>
private void handleNumericAttribute(Instances trainInstances)
{
    // Current attribute is numeric: start with two bags and put every
    // instance with a known value into bag 1.
    m_distribution = new Distribution(2, trainInstances.numClasses());

    int firstMiss = 0;
    System.Collections.IEnumerator instances = trainInstances.enumerateInstances();
    while (instances.MoveNext())
    {
        Instance current = (Instance) instances.Current;
        if (current.isMissing(m_attIndex))
        {
            break;
        }
        m_distribution.add(1, current);
        firstMiss++;
    }

    // Minimum weight required in each subset: 10% of the known weight
    // per class, kept within the range [m_minNoObj, 25].
    double minSplit = 0.1 * (m_distribution.total()) / ((double) trainInstances.numClasses());
    if (Utils.smOrEq(minSplit, m_minNoObj))
    {
        minSplit = m_minNoObj;
    }
    else if (Utils.gr(minSplit, 25))
    {
        minSplit = 25;
    }

    // Enough instances with known values?
    if (Utils.sm((double) firstMiss, 2 * minSplit))
    {
        return;
    }

    // Evaluate the criterion at every possible split index.
    double defaultEnt = infoGainCrit.oldEnt(m_distribution);
    int last = 0;
    int splitIndex = -1;
    for (int next = 1; next < firstMiss; next++)
    {
        // A split point can only lie between two instances whose values
        // differ by more than 1e-5.
        bool valuesDiffer =
            trainInstances.instance(next - 1).value_Renamed(m_attIndex) + 1e-5
            < trainInstances.instance(next).value_Renamed(m_attIndex);
        if (!valuesDiffer)
        {
            continue;
        }

        // Move class values for all instances up to the next possible
        // split point from bag 1 to bag 0.
        m_distribution.shiftRange(1, 0, trainInstances, last, next);

        // Only consider the split if both subsets are large enough.
        if (Utils.grOrEq(m_distribution.perBag(0), minSplit)
            && Utils.grOrEq(m_distribution.perBag(1), minSplit))
        {
            double candidateGain = infoGainCrit.splitCritValue(m_distribution, m_sumOfWeights, defaultEnt);
            if (Utils.gr(candidateGain, m_infoGain))
            {
                m_infoGain = candidateGain;
                splitIndex = next - 1;
            }
            m_index++;
        }
        last = next;
    }

    // Was there any useful split?
    if (m_index == 0)
    {
        return;
    }

    // Compute modified information gain for the best split.
    m_infoGain = m_infoGain - (Utils.log2(m_index) / m_sumOfWeights);
    if (Utils.smOrEq(m_infoGain, 0))
    {
        return;
    }

    // Record the best split: the split point is the midpoint between the
    // two neighbouring values.
    m_numSubsets = 2;
    double upperValue = trainInstances.instance(splitIndex + 1).value_Renamed(m_attIndex);
    double lowerValue = trainInstances.instance(splitIndex).value_Renamed(m_attIndex);
    m_splitPoint = (upperValue + lowerValue) / 2;

    // In case we have a numerical precision problem we need to choose
    // the smaller value.
    if (m_splitPoint == upperValue)
    {
        m_splitPoint = lowerValue;
    }

    // Restore the distribution for the best split.
    m_distribution = new Distribution(2, trainInstances.numClasses());
    m_distribution.addRange(0, trainInstances, 0, splitIndex + 1);
    m_distribution.addRange(1, trainInstances, splitIndex + 1, firstMiss);

    // Compute modified gain ratio for the best split.
    m_gainRatio = gainRatioCrit.splitCritValue(m_distribution, m_sumOfWeights, m_infoGain);
}
/// <summary>
/// Help method for computing the split entropy when part of the total
/// weight is not represented in the distribution.
/// </summary>
/// <param name="bags">distribution whose per-bag totals are examined</param>
/// <param name="totalnoInst">
/// total weight; the difference to bags.total() is treated as the
/// weight with unknown values
/// </param>
/// <returns>
/// 0 when bags.total() is not greater than 0 according to Utils.gr;
/// otherwise logFunc(totalnoInst) minus logFunc of every bag total and
/// minus logFunc of the unknown weight
/// </returns>
private double splitEnt(Distribution bags, double totalnoInst)
{
    double unknownWeight = totalnoInst - bags.total();

    // Nothing to compute for an empty distribution.
    if (!Utils.gr(bags.total(), 0))
    {
        return 0;
    }

    double entropy = 0;
    for (int bagIndex = 0; bagIndex < bags.numBags(); bagIndex++)
    {
        entropy -= logFunc(bags.perBag(bagIndex));
    }
    entropy -= logFunc(unknownWeight);
    entropy += logFunc(totalnoInst);

    return entropy;
}