public void TestEntropy() { Assert.AreEqual(1.4591, smallDistribution.Entropy(), 0.0001); }
/** * <summary> The DecisionNode method takes {@link InstanceList} data as input and then it sets the class label parameter by finding * the most occurred class label of given data, it then gets distinct class labels as class labels List. Later, it adds ordered * indices to the indexList and shuffles them randomly. Then, it gets the class distribution of given data and finds the best entropy value * of these class distribution. * <p/> * If an attribute of given data is {@link DiscreteIndexedAttribute}, it creates a Distribution according to discrete indexed attribute class distribution * and finds the entropy. If it is better than the last best entropy it reassigns the best entropy, best attribute and best split value according to * the newly founded best entropy's index. At the end, it also add new distribution to the class distribution . * <p/> * If an attribute of given data is {@link DiscreteAttribute}, it directly finds the entropy. If it is better than the last best entropy it * reassigns the best entropy, best attribute and best split value according to the newly founded best entropy's index. * <p/> * If an attribute of given data is {@link ContinuousAttribute}, it creates two distributions; left and right according to class distribution * and discrete distribution respectively, and finds the entropy. If it is better than the last best entropy it reassigns the best entropy, * best attribute and best split value according to the newly founded best entropy's index. At the end, it also add new distribution to * the right distribution and removes from left distribution.</summary> * * <param name="data"> {@link InstanceList} input.</param> * <param name="condition">{@link DecisionCondition} to check.</param> * <param name="parameter">RandomForestParameter like seed, ensembleSize, attributeSubsetSize.</param> * <param name="isStump"> Refers to decision trees with only 1 splitting rule.</param> */ public DecisionNode(InstanceList.InstanceList data, DecisionCondition condition, RandomForestParameter parameter, bool isStump) { int bestAttribute = -1, size; double bestSplitValue = 0; this._condition = condition; this._data = data; _classLabel = Classifier.Classifier.GetMaximum(data.GetClassLabels()); _leaf = true; var classLabels = data.GetDistinctClassLabels(); if (classLabels.Count == 1) { return; } if (isStump && condition != null) { return; } var indexList = new List <int>(); for (var i = 0; i < data.Get(0).AttributeSize(); i++) { indexList.Add(i); } if (parameter != null && parameter.GetAttributeSubsetSize() < data.Get(0).AttributeSize()) { size = parameter.GetAttributeSubsetSize(); } else { size = data.Get(0).AttributeSize(); } var classDistribution = data.ClassDistribution(); var bestEntropy = data.ClassDistribution().Entropy(); for (var j = 0; j < size; j++) { var index = indexList[j]; double entropy; if (data.Get(0).GetAttribute(index) is DiscreteIndexedAttribute) { for (var k = 0; k < ((DiscreteIndexedAttribute)data.Get(0).GetAttribute(index)).GetMaxIndex(); k++) { var distribution = data.DiscreteIndexedAttributeClassDistribution(index, k); if (distribution.GetSum() > 0) { classDistribution.RemoveDistribution(distribution); entropy = (classDistribution.Entropy() * classDistribution.GetSum() + distribution.Entropy() * distribution.GetSum()) / data.Size(); if (entropy < bestEntropy) { bestEntropy = entropy; bestAttribute = index; bestSplitValue = k; } classDistribution.AddDistribution(distribution); } } } else { if (data.Get(0).GetAttribute(index) is DiscreteAttribute) { entropy = EntropyForDiscreteAttribute(index); if (entropy < bestEntropy) { bestEntropy = entropy; bestAttribute = index; } } else { if (data.Get(0).GetAttribute(index) is ContinuousAttribute) { data.Sort(index); var previousValue = double.MinValue; var leftDistribution = data.ClassDistribution(); var rightDistribution = new DiscreteDistribution(); for (var k = 0; k < data.Size(); k++) { var instance = data.Get(k); if (k == 0) { previousValue = ((ContinuousAttribute)instance.GetAttribute(index)).GetValue(); } else { if (((ContinuousAttribute)instance.GetAttribute(index)).GetValue() != previousValue) { var splitValue = (previousValue + ((ContinuousAttribute)instance.GetAttribute(index)) .GetValue()) / 2; previousValue = ((ContinuousAttribute)instance.GetAttribute(index)).GetValue(); entropy = (leftDistribution.GetSum() / data.Size()) * leftDistribution.Entropy() + (rightDistribution.GetSum() / data.Size()) * rightDistribution.Entropy(); if (entropy < bestEntropy) { bestEntropy = entropy; bestSplitValue = splitValue; bestAttribute = index; } } } leftDistribution.RemoveItem(instance.GetClassLabel()); rightDistribution.AddItem(instance.GetClassLabel()); } } } } } if (bestAttribute != -1) { _leaf = false; if (data.Get(0).GetAttribute(bestAttribute) is DiscreteIndexedAttribute) { CreateChildrenForDiscreteIndexed(bestAttribute, (int)bestSplitValue, parameter, isStump); } else { if (data.Get(0).GetAttribute(bestAttribute) is DiscreteAttribute) { CreateChildrenForDiscrete(bestAttribute, parameter, isStump); } else { if (data.Get(0).GetAttribute(bestAttribute) is ContinuousAttribute) { CreateChildrenForContinuous(bestAttribute, bestSplitValue, parameter, isStump); } } } } }