/// <summary>
/// Evaluates the gain ratio criterion for the given distribution and
/// returns its reciprocal, because callers minimise the criterion value.
/// </summary>
/// <param name="bags">the distribution to evaluate</param>
/// <returns>
/// split entropy divided by information gain, or
/// <c>System.Double.MaxValue</c> when <c>Utils.eq</c> reports either the
/// gain or the split entropy as equal to zero
/// </returns>
public override double splitCritValue(Distribution bags)
{
    double infoGain = oldEnt(bags) - newEnt(bags);

    // A split that yields no gain is useless.
    if (Utils.eq(infoGain, 0))
    {
        return System.Double.MaxValue;
    }

    double splitInfo = splitEnt(bags);

    // A split whose split entropy is zero is trivial.
    if (Utils.eq(splitInfo, 0))
    {
        return System.Double.MaxValue;
    }

    // Reciprocal of the gain ratio: smaller is better.
    return splitInfo / infoGain;
}
/// <summary>
/// Computes the entropy of the distribution before splitting.
/// </summary>
/// <param name="bags">the distribution to evaluate</param>
/// <returns>
/// <c>logFunc</c> of the total weight minus the sum of <c>logFunc</c>
/// over the per-class weights
/// </returns>
public double oldEnt(Distribution bags)
{
    double classSum = 0;

    for (int cls = 0; cls < bags.numClasses(); cls++)
    {
        classSum += logFunc(bags.perClass(cls));
    }

    return logFunc(bags.total()) - classSum;
}
/// <summary>
/// Computes the entropy after splitting, ignoring the class values.
/// </summary>
/// <param name="bags">the distribution to evaluate</param>
/// <returns>
/// <c>logFunc</c> of the total weight minus the sum of <c>logFunc</c>
/// over the per-bag weights
/// </returns>
public double splitEnt(Distribution bags)
{
    double bagSum = 0;

    for (int bag = 0; bag < bags.numBags(); bag++)
    {
        bagSum += logFunc(bags.perBag(bag));
    }

    return logFunc(bags.total()) - bagSum;
}
/// <summary>
/// Computes the entropy of the distribution after splitting.
/// </summary>
/// <param name="bags">the distribution to evaluate</param>
/// <returns>
/// the negated accumulation, over all bags, of the per-class
/// <c>logFunc</c> values minus the <c>logFunc</c> of the bag weight
/// </returns>
public double newEnt(Distribution bags)
{
    double accumulated = 0;

    for (int bag = 0; bag < bags.numBags(); bag++)
    {
        for (int cls = 0; cls < bags.numClasses(); cls++)
        {
            accumulated += logFunc(bags.perClassPerBag(bag, cls));
        }

        accumulated -= logFunc(bags.perBag(bag));
    }

    return -accumulated;
}
/// <summary>
/// Helper that computes the split entropy, treating the weight not
/// covered by the distribution as one additional component.
/// </summary>
/// <param name="bags">the distribution to evaluate</param>
/// <param name="totalnoInst">the weight of all instances</param>
/// <returns>
/// the split entropy, or 0 when the distribution's total weight is not
/// greater than zero according to <c>Utils.gr</c>
/// </returns>
private double splitEnt(Distribution bags, double totalnoInst)
{
    // Weight that is present overall but absent from the distribution.
    double uncoveredWeight = totalnoInst - bags.total();

    if (!Utils.gr(bags.total(), 0))
    {
        return 0;
    }

    double entropy = 0;

    for (int bag = 0; bag < bags.numBags(); bag++)
    {
        entropy -= logFunc(bags.perBag(bag));
    }

    entropy -= logFunc(uncoveredWeight);
    entropy += logFunc(totalnoInst);

    return entropy;
}
/// <summary>
/// Computes the gain ratio in the same way C4.5 does.
/// </summary>
/// <param name="bags">the distribution</param>
/// <param name="totalnoInst">the weight of ALL instances</param>
/// <param name="numerator">the info gain</param>
/// <returns>
/// the info gain divided by the split info normalised by
/// <paramref name="totalnoInst"/>, or 0 when <c>Utils.eq</c> reports the
/// split info as equal to zero
/// </returns>
public double splitCritValue(Distribution bags, double totalnoInst, double numerator)
{
    double splitInfo = splitEnt(bags, totalnoInst);

    // A zero split info means the split is trivial.
    if (Utils.eq(splitInfo, 0))
    {
        return 0;
    }

    double normalisedSplitInfo = splitInfo / totalnoInst;

    return numerator / normalisedSplitInfo;
}
/// <summary>
/// Computes the information gain in the same way C4.5 does.
/// </summary>
/// <param name="bags">the distribution</param>
/// <param name="totalNoInst">weight of ALL instances</param>
/// <param name="oldEnt">entropy with respect to the "no-split" model</param>
/// <returns>
/// the gain, scaled by the fraction of weight covered by
/// <paramref name="bags"/> and divided by the distribution's total weight;
/// 0 when <c>Utils.eq</c> reports the scaled gain as equal to zero
/// </returns>
public double splitCritValue(Distribution bags, double totalNoInst, double oldEnt)
{
    // Fraction of the overall weight that is missing from the distribution.
    double uncoveredWeight = totalNoInst - bags.total();
    double uncoveredFraction = uncoveredWeight / totalNoInst;

    double gain = (1 - uncoveredFraction) * (oldEnt - newEnt(bags));

    // Splits with no gain are useless.
    if (Utils.eq(gain, 0))
    {
        return 0;
    }

    return gain / bags.total();
}
/// <summary>
/// Computes the entropy of the test distribution with respect to the
/// training distribution.
/// </summary>
/// <param name="train">the training distribution</param>
/// <param name="test">the test distribution</param>
/// <returns>the accumulated value divided by <c>log2</c></returns>
public override double splitCritValue(Distribution train, Distribution test)
{
    // Count the classes that carry weight in either distribution.
    int relevantClasses = 0;
    for (int cls = 0; cls < test.numClasses(); cls++)
    {
        if (Utils.gr(train.perClass(cls), 0) || Utils.gr(test.perClass(cls), 0))
        {
            relevantClasses++;
        }
    }

    // Accumulate over every test bag that carries weight.
    double entropy = 0;
    for (int bag = 0; bag < test.numBags(); bag++)
    {
        if (!Utils.gr(test.perBag(bag), 0))
        {
            continue;
        }

        for (int cls = 0; cls < test.numClasses(); cls++)
        {
            if (Utils.gr(test.perClassPerBag(bag, cls), 0))
            {
                entropy -= test.perClassPerBag(bag, cls)
                    * System.Math.Log(train.perClassPerBag(bag, cls) + 1);
            }
        }

        entropy += test.perBag(bag) * System.Math.Log(train.perBag(bag) + relevantClasses);
    }

    return entropy / log2;
}
/// <summary>
/// Builds the tree structure using a hold-out set.
/// </summary>
/// <param name="train">the data for which the tree structure is to be generated</param>
/// <param name="test">the test data for potential pruning</param>
/// <param name="keepData">whether the training data is to be kept</param>
/// <exception cref="Exception">if something goes wrong</exception>
public virtual void buildTree(Instances train, Instances test, bool keepData)
{
    if (keepData)
    {
        m_train = train;
    }

    m_isLeaf = false;
    m_isEmpty = false;
    m_sons = null;
    m_localModel = m_toSelectModel.selectModel(train, test);
    m_test = new Distribution(test, m_localModel);

    if (m_localModel.numSubsets() <= 1)
    {
        // No split was selected: this node becomes a leaf.
        m_isLeaf = true;
        if (Utils.eq(train.sumOfWeights(), 0))
        {
            m_isEmpty = true;
        }
        train = null;
        test = null;
        return;
    }

    Instances[] trainSubsets = m_localModel.split(train);
    Instances[] testSubsets = m_localModel.split(test);

    // Drop the references to the unsplit data before recursing.
    train = null;
    test = null;

    m_sons = new ClassifierTree[m_localModel.numSubsets()];
    for (int son = 0; son < m_sons.Length; son++)
    {
        m_sons[son] = getNewTree(trainSubsets[son], testSubsets[son]);

        // Release each subset as soon as its subtree has been built.
        trainSubsets[son] = null;
        testSubsets[son] = null;
    }
}
/// <summary>
/// Computes the result of the splitting criterion for the given training
/// and test distributions and the given default distribution.
/// This base implementation ignores its arguments.
/// </summary>
/// <param name="train">the training distribution (unused here)</param>
/// <param name="test">the test distribution (unused here)</param>
/// <param name="defC">the default distribution (unused here)</param>
/// <returns>value of the splitting criterion; always 0 in this implementation</returns>
public virtual double splitCritValue(Distribution train, Distribution test, Distribution defC)
{
    const double defaultValue = 0;

    return defaultValue;
}
/// <summary>
/// Builds the tree structure.
/// </summary>
/// <param name="data">the data for which the tree structure is to be generated</param>
/// <param name="keepData">whether the training data is to be kept</param>
/// <exception cref="Exception">if something goes wrong</exception>
public virtual void buildTree(Instances data, bool keepData)
{
    if (keepData)
    {
        m_train = data;
    }

    m_test = null;
    m_isLeaf = false;
    m_isEmpty = false;
    m_sons = null;
    m_localModel = m_toSelectModel.selectModel(data);

    if (m_localModel.numSubsets() <= 1)
    {
        // No split was selected: this node becomes a leaf.
        m_isLeaf = true;
        if (Utils.eq(data.sumOfWeights(), 0))
        {
            m_isEmpty = true;
        }
        data = null;
        return;
    }

    Instances[] subsets = m_localModel.split(data);

    // Drop the reference to the unsplit data before recursing.
    data = null;

    m_sons = new ClassifierTree[m_localModel.numSubsets()];
    for (int son = 0; son < m_sons.Length; son++)
    {
        m_sons[son] = getNewTree(subsets[son]);

        // Release each subset as soon as its subtree has been built.
        subsets[son] = null;
    }
}
/// <summary>
/// Computes the result of the splitting criterion for the given
/// distribution. This base implementation ignores its argument.
/// </summary>
/// <param name="bags">the distribution (unused here)</param>
/// <returns>value of the splitting criterion; always 0 in this implementation</returns>
public virtual double splitCritValue(Distribution bags)
{
    const double defaultValue = 0;

    return defaultValue;
}
/// <summary>
/// Computes the result of the splitting criterion for the given training
/// and test distributions and the given number of classes.
/// This base implementation ignores its arguments.
/// </summary>
/// <param name="train">the training distribution (unused here)</param>
/// <param name="test">the test distribution (unused here)</param>
/// <param name="noClassesDefault">the number of classes (unused here)</param>
/// <returns>value of the splitting criterion; always 0 in this implementation</returns>
public virtual double splitCritValue(Distribution train, Distribution test, int noClassesDefault)
{
    const double defaultValue = 0;

    return defaultValue;
}
/// <summary>
/// Creates a distribution with only one bag by merging all bags of the
/// given distribution.
/// </summary>
/// <param name="toMerge">the distribution whose bags are merged</param>
public Distribution(Distribution toMerge)
{
    int classCount = toMerge.numClasses();

    totaL = toMerge.totaL;

    m_perClass = new double[classCount];
    Array.Copy(toMerge.m_perClass, 0, m_perClass, 0, classCount);

    // The single bag holds the same per-class weights as the whole
    // distribution. The row is allocated directly at its final size; no
    // zero-length placeholder row is created first.
    m_perClassPerBag = new double[1][];
    m_perClassPerBag[0] = new double[classCount];
    Array.Copy(toMerge.m_perClass, 0, m_perClassPerBag[0], 0, classCount);

    m_perBag = new double[1];
    m_perBag[0] = totaL;
}
/// <summary>
/// Computes the entropy for the given distribution by delegating to
/// <c>newEnt</c>.
/// </summary>
/// <param name="bags">the distribution to evaluate</param>
/// <returns>the value returned by <c>newEnt</c> for <paramref name="bags"/></returns>
public override double splitCritValue(Distribution bags)
{
    double entropy = newEnt(bags);

    return entropy;
}
/// <summary>
/// Creates a distribution with two bags: bag 0 is a copy of the indicated
/// bag, bag 1 is the merge of all other bags.
/// </summary>
/// <param name="toMerge">the distribution to take the weights from</param>
/// <param name="index">index of the bag of <paramref name="toMerge"/> that is kept separate</param>
public Distribution(Distribution toMerge, int index)
{
    int classCount = toMerge.numClasses();

    totaL = toMerge.totaL;

    m_perClass = new double[classCount];
    Array.Copy(toMerge.m_perClass, 0, m_perClass, 0, classCount);

    // Both rows are allocated directly at their final size; no zero-length
    // placeholder rows are created first.
    m_perClassPerBag = new double[2][];

    // Bag 0: the indicated bag, copied as is.
    m_perClassPerBag[0] = new double[classCount];
    Array.Copy(toMerge.m_perClassPerBag[index], 0, m_perClassPerBag[0], 0, classCount);

    // Bag 1: everything else, i.e. the class totals minus bag 0.
    m_perClassPerBag[1] = new double[classCount];
    for (int cls = 0; cls < classCount; cls++)
    {
        m_perClassPerBag[1][cls] = toMerge.m_perClass[cls] - m_perClassPerBag[0][cls];
    }

    m_perBag = new double[2];
    m_perBag[0] = toMerge.m_perBag[index];
    m_perBag[1] = totaL - m_perBag[0];
}
/// <summary>
/// Cleans up in order to save memory: replaces the stored training data
/// with the given header-only data, drops the test distribution, and does
/// the same recursively for all sons of a non-leaf node.
/// </summary>
/// <param name="justHeaderInfo">the data stored in place of the training data</param>
public void cleanup(Instances justHeaderInfo)
{
    m_train = justHeaderInfo;
    m_test = null;

    if (m_isLeaf)
    {
        return;
    }

    foreach (ClassifierTree son in m_sons)
    {
        son.cleanup(justHeaderInfo);
    }
}
/// <summary>
/// Creates a binary split on an enumerated attribute: each attribute
/// value is tried as "this value versus all others".
/// </summary>
/// <param name="trainInstances">the training instances to build the split from</param>
/// <exception cref="Exception">if something goes wrong</exception>
private void handleEnumeratedAttribute(Instances trainInstances)
{
    int valueCount = trainInstances.attribute(m_attIndex).numValues();
    Distribution perValueDistribution = new Distribution(valueCount, trainInstances.numClasses());

    // Only instances with a known value for the attribute are relevant.
    System.Collections.IEnumerator instances = trainInstances.enumerateInstances();
    while (instances.MoveNext())
    {
        Instance current = (Instance) instances.Current;
        if (current.isMissing(m_attIndex))
        {
            continue;
        }

        // The attribute value is narrowed to int and used as the bag index.
        perValueDistribution.add((int) current.value_Renamed(m_attIndex), current);
    }
    m_distribution = perValueDistribution;

    // Try every attribute value as the separated bag.
    for (int valueIndex = 0; valueIndex < valueCount; valueIndex++)
    {
        if (!Utils.grOrEq(perValueDistribution.perBag(valueIndex), m_minNoObj))
        {
            continue;
        }

        Distribution binaryDistribution = new Distribution(perValueDistribution, valueIndex);

        // Check the minimum number of instances in the two subsets.
        if (!binaryDistribution.check(m_minNoObj))
        {
            continue;
        }

        m_numSubsets = 2;
        double candidateInfoGain = m_infoGainCrit.splitCritValue(binaryDistribution, m_sumOfWeights);
        double candidateGainRatio = m_gainRatioCrit.splitCritValue(binaryDistribution, m_sumOfWeights, candidateInfoGain);

        if ((valueIndex == 0) || Utils.gr(candidateGainRatio, m_gainRatio))
        {
            m_gainRatio = candidateGainRatio;
            m_infoGain = candidateInfoGain;
            m_splitPoint = (double) valueIndex;
            m_distribution = binaryDistribution;
        }
    }
}
/// <summary>
/// Subtracts the given distribution from this one. The result has only
/// one bag.
/// </summary>
/// <param name="toSubstract">the distribution to subtract</param>
/// <returns>a new single-bag distribution holding the differences</returns>
public Distribution subtract(Distribution toSubstract)
{
    int classCount = m_perClass.Length;
    Distribution difference = new Distribution(1, classCount);

    double remainingWeight = totaL - toSubstract.totaL;
    difference.m_perBag[0] = remainingWeight;
    difference.totaL = remainingWeight;

    for (int cls = 0; cls < classCount; cls++)
    {
        double remainingClassWeight = m_perClass[cls] - toSubstract.m_perClass[cls];
        difference.m_perClassPerBag[0][cls] = remainingClassWeight;
        difference.m_perClass[cls] = remainingClassWeight;
    }

    return difference;
}
/// <summary>
/// Clones the distribution (deep copy of all weight arrays).
/// </summary>
/// <returns>a new <c>Distribution</c> holding copies of this one's values</returns>
public virtual System.Object Clone()
{
    int bagCount = m_perBag.Length;
    int classCount = m_perClass.Length;
    Distribution copy = new Distribution(bagCount, classCount);

    copy.totaL = totaL;

    for (int cls = 0; cls < classCount; cls++)
    {
        copy.m_perClass[cls] = m_perClass[cls];
    }

    for (int bag = 0; bag < bagCount; bag++)
    {
        copy.m_perBag[bag] = m_perBag[bag];

        for (int cls = 0; cls < classCount; cls++)
        {
            copy.m_perClassPerBag[bag][cls] = m_perClassPerBag[bag][cls];
        }
    }

    return copy;
}
/// <summary>
/// Creates a multi-way split on an enumerated attribute, with
/// <c>m_complexityIndex</c> bags.
/// </summary>
/// <param name="trainInstances">the training instances to build the split from</param>
/// <exception cref="Exception">if something goes wrong</exception>
private void handleEnumeratedAttribute(Instances trainInstances)
{
    m_distribution = new Distribution(m_complexityIndex, trainInstances.numClasses());

    // Only instances with a known value for the attribute are relevant.
    System.Collections.IEnumerator instances = trainInstances.enumerateInstances();
    while (instances.MoveNext())
    {
        Instance current = (Instance) instances.Current;
        if (current.isMissing(m_attIndex))
        {
            continue;
        }

        // The attribute value is narrowed to int and used as the bag index.
        m_distribution.add((int) current.value_Renamed(m_attIndex), current);
    }

    // Without the minimum number of instances in enough subsets there is
    // no split; the criteria are left untouched.
    if (!m_distribution.check(m_minNoObj))
    {
        return;
    }

    m_numSubsets = m_complexityIndex;
    m_infoGain = infoGainCrit.splitCritValue(m_distribution, m_sumOfWeights);
    m_gainRatio = gainRatioCrit.splitCritValue(m_distribution, m_sumOfWeights, m_infoGain);
}
/// <summary>
/// Creates a "no-split" split for the given distribution. The stored
/// distribution is built with the single-argument <c>Distribution</c>
/// constructor, and the number of subsets is set to 1.
/// </summary>
/// <param name="distribution">the distribution the split is created for</param>
public NoSplit(Distribution distribution)
{
    m_numSubsets = 1;
    m_distribution = new Distribution(distribution);
}
/// <summary>
/// Creates a binary split on a numeric attribute. Candidate split points
/// lie between consecutive instances whose attribute values differ by
/// more than 1e-5; the candidate with the highest information gain is
/// kept, and the gain ratio is then computed for it.
/// </summary>
/// <param name="trainInstances">
/// the training instances. NOTE(review): the code stops counting at the
/// first instance with a missing value and compares neighbouring
/// instances, so it presumably expects the instances to be sorted by this
/// attribute with missing values last — verify against the caller.
/// </param>
/// <exception cref="Exception">if something goes wrong</exception>
private void handleNumericAttribute(Instances trainInstances)
{
    int firstMiss;
    int next = 1;
    int last = 0;
    int splitIndex = - 1;
    double currentInfoGain;
    double defaultEnt;
    double minSplit;
    Instance instance;
    int i;

    // Current attribute is a numeric attribute: two bags (bag 0 and bag 1).
    m_distribution = new Distribution(2, trainInstances.numClasses());

    // Only Instances with known values are relevant. All of them start in
    // bag 1; counting stops at the first instance with a missing value.
    System.Collections.IEnumerator enu = trainInstances.enumerateInstances();
    i = 0;
    // Converted from java.util.Enumeration.hasMoreElements()/nextElement().
    while (enu.MoveNext())
    {
        instance = (Instance) enu.Current;
        if (instance.isMissing(m_attIndex))
            break;
        m_distribution.add(1, instance);
        i++;
    }
    // Index of the first instance with a missing value (= number of
    // instances added above).
    firstMiss = i;

    // Compute minimum number of Instances required in each
    // subset: 10% of the known weight per class, but at least m_minNoObj
    // and at most 25.
    minSplit = 0.1 * (m_distribution.total()) / ((double) trainInstances.numClasses());
    if (Utils.smOrEq(minSplit, m_minNoObj))
        minSplit = m_minNoObj;
    else if (Utils.gr(minSplit, 25))
        minSplit = 25;

    // Enough Instances with known values?
    if (Utils.sm((double) firstMiss, 2 * minSplit))
        return ;

    // Compute values of criteria for all possible split
    // indices.
    defaultEnt = infoGainCrit.oldEnt(m_distribution);
    while (next < firstMiss)
    {
        // A split point is only possible between two instances whose
        // attribute values differ by more than 1e-5.
        if (trainInstances.instance(next - 1).value_Renamed(m_attIndex) + 1e-5 < trainInstances.instance(next).value_Renamed(m_attIndex))
        {
            // Move class values for all Instances up to next
            // possible split point (from bag 1 to bag 0).
            m_distribution.shiftRange(1, 0, trainInstances, last, next);

            // Check if enough Instances in each subset and compute
            // values for criteria.
            if (Utils.grOrEq(m_distribution.perBag(0), minSplit) && Utils.grOrEq(m_distribution.perBag(1), minSplit))
            {
                currentInfoGain = infoGainCrit.splitCritValue(m_distribution, m_sumOfWeights, defaultEnt);
                if (Utils.gr(currentInfoGain, m_infoGain))
                {
                    m_infoGain = currentInfoGain;
                    splitIndex = next - 1;
                }
                // m_index counts the admissible candidate split points.
                m_index++;
            }
            last = next;
        }
        next++;
    }

    // Was there any useful split?
    if (m_index == 0)
        return ;

    // Compute modified information gain for best split: the gain is
    // reduced by log2(number of candidates) / total weight.
    m_infoGain = m_infoGain - (Utils.log2(m_index) / m_sumOfWeights);
    if (Utils.smOrEq(m_infoGain, 0))
        return ;

    // Set instance variables' values to values for
    // best split. The split point is the midpoint between the two
    // neighbouring attribute values.
    m_numSubsets = 2;
    m_splitPoint = (trainInstances.instance(splitIndex + 1).value_Renamed(m_attIndex) + trainInstances.instance(splitIndex).value_Renamed(m_attIndex)) / 2;

    // In case we have a numerical precision problem we need to choose the
    // smaller value
    if (m_splitPoint == trainInstances.instance(splitIndex + 1).value_Renamed(m_attIndex))
    {
        m_splitPoint = trainInstances.instance(splitIndex).value_Renamed(m_attIndex);
    }

    // Restore distribution for best split: instances [0, splitIndex] go to
    // bag 0, instances [splitIndex + 1, firstMiss) go to bag 1.
    m_distribution = new Distribution(2, trainInstances.numClasses());
    m_distribution.addRange(0, trainInstances, 0, splitIndex + 1);
    m_distribution.addRange(1, trainInstances, splitIndex + 1, firstMiss);

    // Compute modified gain ratio for best split.
    m_gainRatio = gainRatioCrit.splitCritValue(m_distribution, m_sumOfWeights, m_infoGain);
}
/// <summary>
/// Sets the distribution associated with the model. The distribution is
/// built from the instances that <c>whichSubset</c> assigns to a subset
/// (result greater than -1); <c>addInstWithUnknown</c> is then applied
/// with the full data set and this model's attribute index.
/// </summary>
/// <param name="data">the instances to compute the distribution from</param>
public override void resetDistribution(Instances data)
{
    Instances assigned = new Instances(data, data.numInstances());

    for (int idx = 0; idx < data.numInstances(); idx++)
    {
        if (whichSubset(data.instance(idx)) <= - 1)
        {
            continue;
        }

        assigned.add(data.instance(idx));
    }

    Distribution rebuilt = new Distribution(assigned, this);
    rebuilt.addInstWithUnknown(data, m_attIndex);

    m_distribution = rebuilt;
}
/// <summary>
/// Creates a "no-split" split for a given set of instances: the
/// distribution is built from the instances and the number of subsets is
/// set to 1.
/// </summary>
/// <param name="instances">the instances to build the distribution from</param>
/// <exception cref="Exception">if the split can't be built successfully</exception>
public override void buildClassifier(Instances instances)
{
    Distribution wholeSet = new Distribution(instances);

    m_distribution = wholeSet;
    m_numSubsets = 1;
}
/// <summary>
/// Computes the estimated errors for a leaf with the given distribution.
/// </summary>
/// <param name="theDistribution">the distribution at the leaf</param>
/// <returns>
/// 0 when <c>Utils.eq</c> reports the total weight as equal to zero;
/// otherwise the number of incorrect instances plus the value returned by
/// <c>Stats.addErrs</c> for the total, the number incorrect and <c>m_CF</c>
/// </returns>
private double getEstimatedErrorsForDistribution(Distribution theDistribution)
{
    // An empty distribution contributes no errors.
    if (Utils.eq(theDistribution.total(), 0))
    {
        return 0;
    }

    return theDistribution.numIncorrect()
        + Stats.addErrs(theDistribution.total(), theDistribution.numIncorrect(), m_CF);
}
/// <summary>
/// Selects a C4.5-type binary split for the given dataset.
/// </summary>
/// <param name="data">the instances to select a split for</param>
/// <returns>
/// the best <c>BinC45Split</c> found; a <c>NoSplit</c> model when the data
/// is too small, belongs to a single class, or no useful split exists; or
/// <c>null</c> when an exception was caught (the stack trace and message
/// are written to the console in that case)
/// </returns>
public override ClassifierSplitModel selectModel(Instances data)
{
    double minResult;
    BinC45Split[] currentModel;
    BinC45Split bestModel = null;
    NoSplit noSplitModel = null;
    double averageInfoGain = 0;
    int validModels = 0;
    bool multiVal = true;
    Distribution checkDistribution;
    double sumOfWeights;
    int i;

    try
    {
        // Check if all Instances belong to one class or if not
        // enough Instances to split.
        checkDistribution = new Distribution(data);
        noSplitModel = new NoSplit(checkDistribution);
        if (Utils.sm(checkDistribution.total(), 2 * m_minNoObj) || Utils.eq(checkDistribution.total(), checkDistribution.perClass(checkDistribution.maxClass())))
            return noSplitModel;

        // Check if all attributes are nominal and have a
        // lot of values. multiVal stays true only if no attribute is
        // numeric and none has fewer values than 30% of the number of
        // instances in m_allData.
        System.Collections.IEnumerator enu = data.enumerateAttributes();
        // Converted from java.util.Enumeration.hasMoreElements()/nextElement().
        while (enu.MoveNext())
        {
            weka.core.Attribute attribute = (weka.core.Attribute)enu.Current;
            if ((attribute.Numeric) || (Utils.sm((double) attribute.numValues(), (0.3 * (double) m_allData.numInstances()))))
            {
                multiVal = false;
                break;
            }
        }
        currentModel = new BinC45Split[data.numAttributes()];
        sumOfWeights = data.sumOfWeights();

        // For each attribute.
        for (i = 0; i < data.numAttributes(); i++)
        {
            // Apart from class attribute.
            if (i != (data).classIndex())
            {
                // Get models for current attribute.
                currentModel[i] = new BinC45Split(i, m_minNoObj, sumOfWeights);
                currentModel[i].buildClassifier(data);

                // Check if useful split for current attribute
                // exists and check for enumerated attributes with
                // a lot of values. Only models passing both checks
                // contribute to the average information gain.
                if (currentModel[i].checkModel())
                    if ((data.attribute(i).Numeric) || (multiVal || Utils.sm((double) data.attribute(i).numValues(), (0.3 * (double) m_allData.numInstances()))))
                    {
                        averageInfoGain = averageInfoGain + currentModel[i].infoGain();
                        validModels++;
                    }
            }
            else
                // No model is built for the class attribute.
                currentModel[i] = null;
        }

        // Check if any useful split was found.
        if (validModels == 0)
            return noSplitModel;
        averageInfoGain = averageInfoGain / (double) validModels;

        // Find "best" attribute to split on: highest gain ratio among the
        // models whose information gain is at least (about) average.
        minResult = 0;
        for (i = 0; i < data.numAttributes(); i++)
        {
            // The class-index test comes first, so the null entry stored
            // for the class attribute is never dereferenced.
            if ((i != (data).classIndex()) && (currentModel[i].checkModel()))
                // Use 1E-3 here to get a closer approximation to the original
                // implementation.
                if ((currentModel[i].infoGain() >= (averageInfoGain - 1e-3)) && Utils.gr(currentModel[i].gainRatio(), minResult))
                {
                    bestModel = currentModel[i];
                    minResult = currentModel[i].gainRatio();
                }
        }

        // Check if useful split was found. minResult is still 0 when no
        // model was selected above, so bestModel is only used below after
        // this check.
        if (Utils.eq(minResult, 0))
            return noSplitModel;

        // Add all Instances with unknown values for the corresponding
        // attribute to the distribution for the model, so that
        // the complete distribution is stored with the model.
        bestModel.distribution().addInstWithUnknown(data, bestModel.attIndex());

        // Set the split point analogue to C45 if attribute numeric.
        bestModel.SplitPoint = m_allData;
        return bestModel;
    }
    catch (System.Exception e)
    {
        // The exception is reported on the console and not rethrown;
        // the method then falls through and returns null.
        System.Console.WriteLine(e.StackTrace + " " + e.Message);
    }
    return null;
}
/// <summary>
/// Sets the distribution associated with the model, rebuilding it from
/// the given instances and this model.
/// </summary>
/// <param name="data">the instances to compute the distribution from</param>
public virtual void resetDistribution(Instances data)
{
    Distribution rebuilt = new Distribution(data, this);

    m_distribution = rebuilt;
}