private void split(DecisionNode root, double[][] input, int[] output, int height)
{
    // 2. If all examples are for the same class, return the single-node
    //    tree with the output label corresponding to this common class.
    double entropy = Measures.Entropy(output, outputClasses);

    if (entropy == 0)
    {
        if (output.Length > 0)
            root.Output = output[0];
        return;
    }

    // 3. If the set of predicting attributes is empty, then return the
    //    single-node tree with the output label corresponding to the most
    //    common value of the target attribute in the examples.

    // Check how many variables have been used less than the limit
    int[] candidates = Matrix.Find(attributeUsageCount, x => x < join);

    if (candidates.Length == 0 || (maxHeight > 0 && height == maxHeight))
    {
        root.Output = Measures.Mode(output);
        return;
    }

    // 4. Otherwise, try to select the attribute which best explains the
    //    data sample subset. If the tree is part of a random forest, only
    //    consider a subset of the candidate attributes at each split point.
    if (MaxVariables > 0 && candidates.Length > MaxVariables)
        candidates = Vector.Sample(candidates, MaxVariables);

    var scores = new double[candidates.Length];
    var thresholds = new double[candidates.Length];
    var partitions = new int[candidates.Length][][];

    // Compute the gain ratio for each candidate attribute
    Parallel.For(0, scores.Length, ParallelOptions, i =>
    {
        scores[i] = computeGainRatio(input, output, candidates[i],
            entropy, out partitions[i], out thresholds[i]);
    });

    // Select the attribute with maximum gain ratio
    int maxGainIndex;
    scores.Max(out maxGainIndex);

    var maxGainPartition = partitions[maxGainIndex];
    var maxGainAttribute = candidates[maxGainIndex];
    var maxGainRange = inputRanges[maxGainAttribute];
    var maxGainThreshold = thresholds[maxGainIndex];

    // Mark this attribute as already used
    attributeUsageCount[maxGainAttribute]++;

    double[][] inputSubset;
    int[] outputSubset;

    // Now, create the next nodes and pass those partitions as their responsibilities.
    if (tree.Attributes[maxGainAttribute].Nature == DecisionVariableKind.Discrete)
    {
        // This is a discrete-nature attribute. We will branch at each
        // possible value of the discrete variable and call recursion.
        DecisionNode[] children = new DecisionNode[maxGainPartition.Length];

        // Create a branch for each possible value
        for (int i = 0; i < children.Length; i++)
        {
            children[i] = new DecisionNode(tree)
            {
                Parent = root,
                Value = i + maxGainRange.Min,
                Comparison = ComparisonKind.Equal,
            };

            inputSubset = input.Get(maxGainPartition[i]);
            outputSubset = output.Get(maxGainPartition[i]);
            split(children[i], inputSubset, outputSubset, height + 1); // recursion
        }

        root.Branches.AttributeIndex = maxGainAttribute;
        root.Branches.AddRange(children);
    }
    else if (maxGainPartition.Length > 1)
    {
        // This is a continuous-nature attribute, and the partitioning scheme
        // produced two partitions. We will branch on two possible settings:
        // either the value is greater than the currently detected optimal
        // threshold, or it is less than or equal to it.
        DecisionNode[] children =
        {
            new DecisionNode(tree)
            {
                Parent = root,
                Value = maxGainThreshold,
                Comparison = ComparisonKind.LessThanOrEqual
            },

            new DecisionNode(tree)
            {
                Parent = root,
                Value = maxGainThreshold,
                Comparison = ComparisonKind.GreaterThan
            }
        };

        // Create a branch for lower values
        inputSubset = input.Get(maxGainPartition[0]);
        outputSubset = output.Get(maxGainPartition[0]);
        split(children[0], inputSubset, outputSubset, height + 1);

        // Create a branch for higher values
        inputSubset = input.Get(maxGainPartition[1]);
        outputSubset = output.Get(maxGainPartition[1]);
        split(children[1], inputSubset, outputSubset, height + 1);

        root.Branches.AttributeIndex = maxGainAttribute;
        root.Branches.AddRange(children);
    }
    else
    {
        // This is a continuous nature attribute, but all variables are equal
        // to a constant. If there is only a constant value as the predictor
        // and there are multiple output labels associated with this constant
        // value, there isn't much we can do. This node will be a leaf.

        // We will set the class label for this node as the
        // majority of the currently selected output classes.
        outputSubset = output.Get(maxGainPartition[0]);
        root.Output = Measures.Mode(outputSubset);
    }

    attributeUsageCount[maxGainAttribute]--;
}
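// For reference, computeGainRatio (defined elsewhere in this class) scores a
// candidate attribute by Quinlan's gain ratio: the information gain of the
// induced partition divided by its split information. The helper below is a
// minimal, self-contained sketch of that measure for discrete partitions; the
// name gainRatioSketch is hypothetical and this is not the actual implementation.
private static double gainRatioSketch(int[] output, int[][] partition, double entropy, int classes)
{
    double gain = entropy; // start from the parent node's entropy
    double splitInfo = 0;

    foreach (int[] subset in partition)
    {
        double p = subset.Length / (double)output.Length;
        if (p == 0)
            continue;

        // Information gain: subtract the size-weighted entropy of each child
        gain -= p * Measures.Entropy(output.Get(subset), classes);

        // Split information: the entropy of the partition sizes themselves
        splitInfo -= p * Math.Log(p, 2);
    }

    // Guard against division by zero when all samples fall into one subset
    return splitInfo == 0 ? 0 : gain / splitInfo;
}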
private void split(DecisionNode root, int[][] input, int[] output, int height)
{
    // 2. If all examples are for the same class, return the single-node
    //    tree with the output label corresponding to this common class.
    double entropy = Measures.Entropy(output, Model.NumberOfClasses);

    if (entropy == 0)
    {
        if (output.Length > 0)
            root.Output = output[0];
        return;
    }

    // 3. If the set of predicting attributes is empty, then return the
    //    single-node tree with the output label corresponding to the most
    //    common value of the target attribute in the examples.

    // Check how many variables have been used less than the limit
    // (a limit of zero means there is no limit)
    int candidateCount = AttributeUsageCount.Count(x => Join == 0 || x < Join);

    if (candidateCount == 0 || (MaxHeight > 0 && height == MaxHeight))
    {
        root.Output = Measures.Mode(output);
        return;
    }

    // 4. Otherwise, try to select the attribute which
    //    best explains the data sample subset.
    double[] scores = new double[candidateCount];
    int[][][] partitions = new int[candidateCount][][];
    int[][][] outputSubs = new int[candidateCount][][];

    // Retrieve the candidate attribute indices, using the same
    // predicate that was used to count the candidates above
    int[] candidates = new int[candidateCount];
    for (int i = 0, k = 0; i < AttributeUsageCount.Length; i++)
    {
        if (Join == 0 || AttributeUsageCount[i] < Join)
            candidates[k++] = i;
    }

    // Compute the gain ratio for each candidate attribute
    Parallel.For(0, scores.Length, ParallelOptions, i =>
    {
        scores[i] = computeGainRatio(input, output, candidates[i],
            entropy, out partitions[i], out outputSubs[i]);
    });

    // Select the attribute with maximum gain ratio
    int maxGainIndex;
    scores.Max(out maxGainIndex);

    var maxGainPartition = partitions[maxGainIndex];
    var maxGainOutputs = outputSubs[maxGainIndex];
    var maxGainAttribute = candidates[maxGainIndex];
    var maxGainRange = inputRanges[maxGainAttribute];

    // Mark this attribute as already used
    AttributeUsageCount[maxGainAttribute]++;

    // Now, create the next nodes and pass those partitions as their responsibilities.
    DecisionNode[] children = new DecisionNode[maxGainPartition.Length];

    for (int i = 0; i < children.Length; i++)
    {
        children[i] = new DecisionNode(Model)
        {
            Parent = root,
            Comparison = ComparisonKind.Equal,
            Value = i + maxGainRange.Min
        };

        int[][] inputSubset = input.Get(maxGainPartition[i]);
        int[] outputSubset = maxGainOutputs[i];
        split(children[i], inputSubset, outputSubset, height + 1); // recursion

        if (children[i].IsLeaf)
        {
            // If the resulting node is a leaf and has not been assigned an
            // output value because there were no available output samples
            // in its category, assign it the most common label of the
            // current node.
            if (!Rejection && !children[i].Output.HasValue)
                children[i].Output = Measures.Mode(output);
        }
    }

    AttributeUsageCount[maxGainAttribute]--;

    root.Branches.AttributeIndex = maxGainAttribute;
    root.Branches.AddRange(children);
}
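// The stopping criterion above relies on Shannon entropy: a value of zero
// means every remaining sample shares the same label. Below is a minimal
// sketch of the measure, assuming labels are zero-based class indices; the
// name entropySketch is hypothetical, and Accord's Measures.Entropy is what
// the methods in this class actually call.
private static double entropySketch(int[] output, int classCount)
{
    // Build a histogram of the class labels
    var counts = new int[classCount];
    foreach (int label in output)
        counts[label]++;

    // H = -sum_i p_i * log2(p_i), taking 0 * log(0) as 0
    double h = 0;
    foreach (int c in counts)
    {
        if (c == 0)
            continue;
        double p = c / (double)output.Length;
        h -= p * Math.Log(p, 2);
    }

    return h;
}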
private void split(DecisionNode root, double[][] inputs, int[] outputs, double[] weights, int height)
{
    // 2. If all examples are for the same class, return the single-node
    //    tree with the output label corresponding to this common class.
    double entropy = Measures.WeightedEntropy(outputs, weights, Model.NumberOfClasses);

    if (entropy == 0)
    {
        if (outputs.Length > 0)
            root.Output = outputs[0];
        return;
    }

    // 3. If the set of predicting attributes is empty, then return the
    //    single-node tree with the output label corresponding to the most
    //    common value of the target attribute in the examples.

    // Check how many variables have been used less than the limit
    // (a limit of zero means there is no limit)
    int[] candidates = Matrix.Find(AttributeUsageCount, x => Join == 0 || x < Join);

    if (candidates.Length == 0
        || (MaxHeight > 0 && height == MaxHeight)
        || (minimumSplitSize > 0 && inputs.Length < minimumSplitSize))
    {
        root.Output = Measures.WeightedMode(outputs, weights);
        return;
    }

    // 4. Otherwise, try to select the attribute which best explains the
    //    data sample subset. If the tree is part of a random forest, only
    //    consider a subset of the candidate attributes at each split point.
    if (MaxVariables > 0 && candidates.Length > MaxVariables)
        candidates = Vector.Sample(candidates, MaxVariables);

    var scores = new double[candidates.Length];
    var thresholds = new double[candidates.Length];
    var partitions = new List<int>[candidates.Length][];

    if (ParallelOptions.MaxDegreeOfParallelism == 1)
    {
        // Compute the gain ratio for each candidate attribute
        for (int i = 0; i < scores.Length; i++)
        {
            scores[i] = computeGainRatio(inputs, outputs, weights, candidates[i],
                entropy, out partitions[i], out thresholds[i]);
        }
    }
    else
    {
        // Compute the gain ratio for each candidate attribute, in parallel
        Parallel.For(0, scores.Length, ParallelOptions, i =>
        {
            scores[i] = computeGainRatio(inputs, outputs, weights, candidates[i],
                entropy, out partitions[i], out thresholds[i]);
        });
    }

    // Select the attribute with maximum gain ratio
    int maxGainIndex;
    scores.Max(out maxGainIndex);

    var maxGainPartition = partitions[maxGainIndex];
    var maxGainAttribute = candidates[maxGainIndex];
    var maxGainRange = inputRanges[maxGainAttribute];
    var maxGainThreshold = thresholds[maxGainIndex];

    // Mark this attribute as already used
    AttributeUsageCount[maxGainAttribute]++;

    double[][] inputSubset;
    int[] outputSubset;
    double[] weightSubset;

    // Now, create the next nodes and pass those partitions as their responsibilities.
    if (Model.Attributes[maxGainAttribute].Nature == DecisionVariableKind.Discrete)
    {
        // This is a discrete-nature attribute. We will branch at each
        // possible value of the discrete variable and call recursion.
        var children = new DecisionNode[maxGainPartition.Length];

        // Create a branch for each possible value
        for (int i = 0; i < children.Length; i++)
        {
            children[i] = new DecisionNode(Model)
            {
                Parent = root,
                Value = i + maxGainRange.Min,
                Comparison = ComparisonKind.Equal,
            };

            inputSubset = inputs.Get(maxGainPartition[i]);
            outputSubset = outputs.Get(maxGainPartition[i]);
            weightSubset = weights.Get(maxGainPartition[i]);

            if (outputSubset.Length == 0)
            {
                // In this case we have no samples for this category, but we
                // still want to be able to make a decision, so we give this
                // branch the best output of the current node.
                outputSubset = new int[] { Measures.WeightedMode(outputs, weights) };
                weightSubset = new double[] { 1 }; // the actual weight does not matter
            }

            split(children[i], inputSubset, outputSubset, weightSubset, height + 1); // recursion
        }

        root.Branches.AttributeIndex = maxGainAttribute;
        root.Branches.AddRange(children);
    }
    else if (Model.Attributes[maxGainAttribute].Nature == DecisionVariableKind.Continuous)
    {
        List<int> partitionBelowThreshold = maxGainPartition[0];
        List<int> partitionAboveThreshold = maxGainPartition[1];

        if (partitionBelowThreshold != null && partitionAboveThreshold != null)
        {
            // Before we branch, test whether each child partition is big
            // enough; if not, we stop here and make this node a leaf.
            if (partitionAboveThreshold.Count < minimumLeafSize
                || partitionBelowThreshold.Count < minimumLeafSize)
            {
                root.Output = Measures.WeightedMode(outputs, weights);
            }
            else
            {
                // This is a continuous-nature attribute, and the partitioning
                // scheme produced two partitions. We will branch on two
                // possible settings: either the value is greater than the
                // currently detected optimal threshold, or it is less than
                // or equal to it.
                DecisionNode[] children =
                {
                    new DecisionNode(Model)
                    {
                        Parent = root,
                        Value = maxGainThreshold,
                        Comparison = ComparisonKind.LessThanOrEqual
                    },

                    new DecisionNode(Model)
                    {
                        Parent = root,
                        Value = maxGainThreshold,
                        Comparison = ComparisonKind.GreaterThan
                    }
                };

                // Create a branch for lower values
                inputSubset = inputs.Get(partitionBelowThreshold);
                outputSubset = outputs.Get(partitionBelowThreshold);
                weightSubset = weights.Get(partitionBelowThreshold);
                split(children[0], inputSubset, outputSubset, weightSubset, height + 1);

                // Create a branch for higher values
                inputSubset = inputs.Get(partitionAboveThreshold);
                outputSubset = outputs.Get(partitionAboveThreshold);
                weightSubset = weights.Get(partitionAboveThreshold);
                split(children[1], inputSubset, outputSubset, weightSubset, height + 1);

                root.Branches.AttributeIndex = maxGainAttribute;
                root.Branches.AddRange(children);
            }
        }
        else
        {
            // This is a continuous-nature attribute, but all variables are
            // equal to a constant. If there is only a constant value as the
            // predictor and there are multiple output labels associated with
            // this constant value, there isn't much we can do. This node
            // will be a leaf.

            // We will set the class label for this node as the
            // majority of the currently selected output classes.
            var outputIndices = partitionBelowThreshold ?? partitionAboveThreshold;
            outputSubset = outputs.Get(outputIndices);
            root.Output = Measures.Mode(outputSubset);
        }
    }

    AttributeUsageCount[maxGainAttribute]--;
}
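// Usage sketch: assuming the methods above live in one of Accord.NET's
// decision tree learners (e.g. C45Learning), the recursion is driven by the
// public Learn method. A hypothetical caller could look like:
//
//   double[][] inputs =
//   {
//       new double[] { 0, 1 },
//       new double[] { 1, 0 },
//       new double[] { 1, 1 },
//   };
//   int[] outputs = { 0, 1, 1 };
//
//   var teacher = new C45Learning() { MaxHeight = 5 };
//   DecisionTree tree = teacher.Learn(inputs, outputs);
//   int[] predicted = tree.Decide(inputs);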