/**
 * <summary> The createChildrenForDiscrete method builds one child DecisionNode per distinct value of the given
 * discrete attribute, partitioning this node's data with respect to that attribute.</summary>
 *
 * <param name="attributeIndex">Index of the attribute.</param>
 * <param name="parameter">     RandomForestParameter like seed, ensembleSize, attributeSubsetSize.</param>
 * <param name="isStump">       Refers to decision trees with only 1 splitting rule.</param>
 */
private void CreateChildrenForDiscrete(int attributeIndex, RandomForestParameter parameter, bool isStump)
{
    var values = _data.GetAttributeValueList(attributeIndex);
    var partition = _data.DivideWithRespectToAttribute(attributeIndex);
    _children = new List<DecisionNode>();
    var position = 0;
    foreach (var value in values)
    {
        // Child i holds the instances whose attribute equals the i-th distinct value.
        var condition = new DecisionCondition(attributeIndex, new DiscreteAttribute(value));
        _children.Add(new DecisionNode(partition.Get(position), condition, parameter, isStump));
        position++;
    }
}
/// <summary>
/// Trains a RandomForest (seed 1, ensemble size 100, attribute subset size 35) on each data set in turn,
/// reusing the same classifier instance, and checks the training-set error rate (as a percentage)
/// against the known expected value with a 0.01 tolerance.
/// </summary>
public void TestTrain()
{
    var forest = new RandomForest();
    var parameter = new RandomForestParameter(1, 100, 35);

    forest.Train(iris.GetInstanceList(), parameter);
    var irisError = 100 * forest.Test(iris.GetInstanceList()).GetErrorRate();
    Assert.AreEqual(2.00, irisError, 0.01);

    forest.Train(bupa.GetInstanceList(), parameter);
    var bupaError = 100 * forest.Test(bupa.GetInstanceList()).GetErrorRate();
    Assert.AreEqual(42.03, bupaError, 0.01);

    forest.Train(dermatology.GetInstanceList(), parameter);
    var dermatologyError = 100 * forest.Test(dermatology.GetInstanceList()).GetErrorRate();
    Assert.AreEqual(2.46, dermatologyError, 0.01);

    forest.Train(car.GetInstanceList(), parameter);
    var carError = 100 * forest.Test(car.GetInstanceList()).GetErrorRate();
    Assert.AreEqual(0.0, carError, 0.01);

    forest.Train(tictactoe.GetInstanceList(), parameter);
    var tictactoeError = 100 * forest.Test(tictactoe.GetInstanceList()).GetErrorRate();
    Assert.AreEqual(0.0, tictactoeError, 0.01);

    forest.Train(nursery.GetInstanceList(), parameter);
    var nurseryError = 100 * forest.Test(nursery.GetInstanceList()).GetErrorRate();
    Assert.AreEqual(0.0, nurseryError, 0.01);
}
/**
 * <summary> The createChildrenForDiscreteIndexed method builds a two-way partition with respect to the indexed
 * attribute and creates the two corresponding child DecisionNodes: one for instances matching attributeValue and
 * one (index -1) for all remaining instances.</summary>
 *
 * <param name="attributeIndex">Index of the attribute.</param>
 * <param name="attributeValue">Value of the attribute.</param>
 * <param name="parameter">     RandomForestParameter like seed, ensembleSize, attributeSubsetSize.</param>
 * <param name="isStump">       Refers to decision trees with only 1 splitting rule.</param>
 */
private void CreateChildrenForDiscreteIndexed(int attributeIndex, int attributeValue, RandomForestParameter parameter, bool isStump)
{
    var partition = _data.DivideWithRespectToIndexedAttribute(attributeIndex, attributeValue);
    // Both conditions carry the same max index, read once from the first instance's attribute.
    var maxIndex = ((DiscreteIndexedAttribute)_data.Get(0).GetAttribute(attributeIndex)).GetMaxIndex();
    var matchCondition = new DecisionCondition(attributeIndex, new DiscreteIndexedAttribute("", attributeValue, maxIndex));
    // Index -1 stands for "any other value" in the complement branch.
    var restCondition = new DecisionCondition(attributeIndex, new DiscreteIndexedAttribute("", -1, maxIndex));
    _children = new List<DecisionNode>
    {
        new DecisionNode(partition.Get(0), matchCondition, parameter, isStump),
        new DecisionNode(partition.Get(1), restCondition, parameter, isStump)
    };
}
/**
 * <summary> The DecisionNode constructor takes an {@link InstanceList} as input, sets the class label to the most
 * frequent class label of the given data, and marks the node as a leaf. If all instances share a single class
 * label, or if this is a stump that already carries a condition, no split is searched and the node stays a leaf.
 * <p/>
 * Otherwise it evaluates the first `size` candidate attributes (where `size` is the parameter's attribute subset
 * size when that is smaller than the total attribute count), computing for each the entropy of the split it would
 * induce and tracking the best (lowest) entropy, best attribute and best split value:
 * <p/>
 * {@link DiscreteIndexedAttribute}: each index value k is tried as a one-vs-rest split; the k-distribution is
 * temporarily removed from the class distribution to compute the weighted two-part entropy, then added back.
 * <p/>
 * {@link DiscreteAttribute}: the entropy of the full multi-way split is computed directly.
 * <p/>
 * {@link ContinuousAttribute}: the data is sorted by the attribute and every midpoint between consecutive distinct
 * values is tried as a binary split, sliding instances one by one from one distribution to the other.
 * <p/>
 * If a best attribute was found, the node becomes internal and children are created according to the attribute's
 * kind.</summary>
 *
 * <param name="data">     {@link InstanceList} input.</param>
 * <param name="condition">{@link DecisionCondition} to check.</param>
 * <param name="parameter">RandomForestParameter like seed, ensembleSize, attributeSubsetSize.</param>
 * <param name="isStump">  Refers to decision trees with only 1 splitting rule.</param>
 */
public DecisionNode(InstanceList.InstanceList data, DecisionCondition condition, RandomForestParameter parameter, bool isStump)
{
    int bestAttribute = -1, size;
    double bestSplitValue = 0;
    this._condition = condition;
    this._data = data;
    // Majority class label of this node's data; used for prediction when the node is (or stays) a leaf.
    _classLabel = Classifier.Classifier.GetMaximum(data.GetClassLabels());
    _leaf = true;
    var classLabels = data.GetDistinctClassLabels();
    if (classLabels.Count == 1)
    {
        // Pure node: a single class label remains, nothing to split.
        return;
    }
    if (isStump && condition != null)
    {
        // Stump trees stop after the first split: a non-null condition means one split already happened.
        return;
    }
    // Candidate attribute indices 0..attributeSize-1.
    // NOTE(review): the original summary claimed these indices are shuffled randomly before taking the first
    // `size` of them, but no shuffle occurs here — the subset is always the FIRST `size` attributes. Random
    // forests normally draw a random attribute subset per node (seeded via the parameter); confirm against the
    // reference implementation before relying on ensemble diversity.
    var indexList = new List<int>();
    for (var i = 0; i < data.Get(0).AttributeSize(); i++)
    {
        indexList.Add(i);
    }
    if (parameter != null && parameter.GetAttributeSubsetSize() < data.Get(0).AttributeSize())
    {
        size = parameter.GetAttributeSubsetSize();
    }
    else
    {
        size = data.Get(0).AttributeSize();
    }
    // classDistribution is mutated (remove/add) during the indexed-attribute trials below and restored each time.
    // NOTE(review): data.ClassDistribution() is computed twice here; the second call presumably yields an equal
    // distribution — confirm it is a pure accessor.
    var classDistribution = data.ClassDistribution();
    var bestEntropy = data.ClassDistribution().Entropy();
    for (var j = 0; j < size; j++)
    {
        var index = indexList[j];
        double entropy;
        if (data.Get(0).GetAttribute(index) is DiscreteIndexedAttribute)
        {
            // Try each index value k as a one-vs-rest binary split.
            for (var k = 0; k < ((DiscreteIndexedAttribute)data.Get(0).GetAttribute(index)).GetMaxIndex(); k++)
            {
                var distribution = data.DiscreteIndexedAttributeClassDistribution(index, k);
                if (distribution.GetSum() > 0)
                {
                    // Temporarily split off the k-part, compute the size-weighted entropy, then restore.
                    classDistribution.RemoveDistribution(distribution);
                    entropy = (classDistribution.Entropy() * classDistribution.GetSum() + distribution.Entropy() * distribution.GetSum()) / data.Size();
                    if (entropy < bestEntropy)
                    {
                        bestEntropy = entropy;
                        bestAttribute = index;
                        bestSplitValue = k;
                    }
                    classDistribution.AddDistribution(distribution);
                }
            }
        }
        else
        {
            if (data.Get(0).GetAttribute(index) is DiscreteAttribute)
            {
                // Multi-way split on all distinct values; no split value needed.
                entropy = EntropyForDiscreteAttribute(index);
                if (entropy < bestEntropy)
                {
                    bestEntropy = entropy;
                    bestAttribute = index;
                }
            }
            else
            {
                if (data.Get(0).GetAttribute(index) is ContinuousAttribute)
                {
                    // Sort by this attribute, then scan instances in order, trying the midpoint between each pair
                    // of consecutive distinct values as a candidate binary split.
                    data.Sort(index);
                    var previousValue = double.MinValue;
                    var leftDistribution = data.ClassDistribution();
                    var rightDistribution = new DiscreteDistribution();
                    for (var k = 0; k < data.Size(); k++)
                    {
                        var instance = data.Get(k);
                        if (k == 0)
                        {
                            previousValue = ((ContinuousAttribute)instance.GetAttribute(index)).GetValue();
                        }
                        else
                        {
                            if (((ContinuousAttribute)instance.GetAttribute(index)).GetValue() != previousValue)
                            {
                                var splitValue = (previousValue + ((ContinuousAttribute)instance.GetAttribute(index)).GetValue()) / 2;
                                previousValue = ((ContinuousAttribute)instance.GetAttribute(index)).GetValue();
                                // At step k, rightDistribution holds instances 0..k-1 (below the split) and
                                // leftDistribution holds the rest, because each instance is moved right after
                                // being passed (see the two lines at the bottom of this loop).
                                entropy = (leftDistribution.GetSum() / data.Size()) * leftDistribution.Entropy() + (rightDistribution.GetSum() / data.Size()) * rightDistribution.Entropy();
                                if (entropy < bestEntropy)
                                {
                                    bestEntropy = entropy;
                                    bestSplitValue = splitValue;
                                    bestAttribute = index;
                                }
                            }
                        }
                        // Slide the current instance from the "left" distribution to the "right" one.
                        leftDistribution.RemoveItem(instance.GetClassLabel());
                        rightDistribution.AddItem(instance.GetClassLabel());
                    }
                }
            }
        }
    }
    if (bestAttribute != -1)
    {
        // A split improved on the node's own entropy: become an internal node and create children.
        _leaf = false;
        if (data.Get(0).GetAttribute(bestAttribute) is DiscreteIndexedAttribute)
        {
            CreateChildrenForDiscreteIndexed(bestAttribute, (int)bestSplitValue, parameter, isStump);
        }
        else
        {
            if (data.Get(0).GetAttribute(bestAttribute) is DiscreteAttribute)
            {
                CreateChildrenForDiscrete(bestAttribute, parameter, isStump);
            }
            else
            {
                if (data.Get(0).GetAttribute(bestAttribute) is ContinuousAttribute)
                {
                    CreateChildrenForContinuous(bestAttribute, bestSplitValue, parameter, isStump);
                }
            }
        }
    }
}
/**
 * <summary> The createChildrenForContinuous method partitions this node's data on a continuous attribute at the
 * given split value and creates the two corresponding child DecisionNodes (below / above the split).</summary>
 *
 * <param name="attributeIndex">Index of the attribute.</param>
 * <param name="splitValue">    Split value is used for partitioning.</param>
 * <param name="parameter">     RandomForestParameter like seed, ensembleSize, attributeSubsetSize.</param>
 * <param name="isStump">       Refers to decision trees with only 1 splitting rule.</param>
 */
private void CreateChildrenForContinuous(int attributeIndex, double splitValue, RandomForestParameter parameter, bool isStump)
{
    var partition = _data.DivideWithRespectToAttribute(attributeIndex, splitValue);
    var belowCondition = new DecisionCondition(attributeIndex, '<', new ContinuousAttribute(splitValue));
    var aboveCondition = new DecisionCondition(attributeIndex, '>', new ContinuousAttribute(splitValue));
    _children = new List<DecisionNode>
    {
        new DecisionNode(partition.Get(0), belowCondition, parameter, isStump),
        new DecisionNode(partition.Get(1), aboveCondition, parameter, isStump)
    };
}