/// <summary>
///   Recursively grows the decision tree below <paramref name="root"/> using the
///   ID3 procedure over discrete attributes: stop on pure nodes, exhausted
///   attributes or maximum height; otherwise branch on the attribute with the
///   highest gain ratio and recurse into each partition.
/// </summary>
/// <param name="root">The node whose subtree is being built.</param>
/// <param name="input">The discrete input samples reaching this node.</param>
/// <param name="output">The class labels corresponding to <paramref name="input"/>.</param>
/// <param name="height">The current depth of <paramref name="root"/> in the tree.</param>
private void split(DecisionNode root, int[][] input, int[] output, int height)
{
    // 2. If all examples are for the same class, return the single-node
    //    tree with the output label corresponding to this common class.
    double entropy = Measures.Entropy(output, Model.NumberOfClasses);

    if (entropy == 0)
    {
        if (output.Length > 0)
            root.Output = output[0];
        return;
    }

    // 3. If number of predicting attributes is empty, then return the single-node
    //    tree with the output label corresponding to the most common value of
    //    the target attributes in the examples.

    // How many variables have been used less than the limit
    // (a Join of 0 means there is no limit on attribute reuse).
    int candidateCount = AttributeUsageCount.Count(x => Join == 0 ? true : x < Join);

    if (candidateCount == 0 || (MaxHeight > 0 && height == MaxHeight))
    {
        root.Output = Measures.Mode(output);
        return;
    }

    // 4. Otherwise, try to select the attribute which
    //    best explains the data sample subset.
    double[] scores = new double[candidateCount];
    int[][][] partitions = new int[candidateCount][][];
    int[][][] outputSubs = new int[candidateCount][][];

    // Retrieve candidate attribute indices.
    // BUGFIX: this filter must match the predicate used to compute
    // candidateCount above. Previously it tested only
    // "AttributeUsageCount[i] < Join", which is never true when Join == 0
    // (unlimited reuse), leaving the candidates array zero-filled so that
    // only attribute 0 was ever evaluated.
    int[] candidates = new int[candidateCount];
    for (int i = 0, k = 0; i < AttributeUsageCount.Length; i++)
    {
        if (Join == 0 || AttributeUsageCount[i] < Join)
            candidates[k++] = i;
    }

    // For each candidate attribute, compute its gain ratio in parallel.
    Parallel.For(0, scores.Length, ParallelOptions, i =>
    {
        scores[i] = computeGainRatio(input, output, candidates[i],
            entropy, out partitions[i], out outputSubs[i]);
    });

    // Select the attribute with maximum gain ratio
    int maxGainIndex;
    scores.Max(out maxGainIndex);
    var maxGainPartition = partitions[maxGainIndex];
    var maxGainOutputs = outputSubs[maxGainIndex];
    var maxGainAttribute = candidates[maxGainIndex];
    var maxGainRange = inputRanges[maxGainAttribute];

    // Mark the attribute as used while building this subtree.
    AttributeUsageCount[maxGainAttribute]++;

    // Now, create next nodes and pass those partitions as their responsibilities.
    DecisionNode[] children = new DecisionNode[maxGainPartition.Length];

    for (int i = 0; i < children.Length; i++)
    {
        children[i] = new DecisionNode(Model)
        {
            Parent = root,
            Comparison = ComparisonKind.Equal,
            Value = i + maxGainRange.Min
        };

        int[][] inputSubset = input.Get(maxGainPartition[i]);
        int[] outputSubset = maxGainOutputs[i];

        split(children[i], inputSubset, outputSubset, height + 1); // recursion

        if (children[i].IsLeaf)
        {
            // If the resulting node is a leaf, and it has not
            // been assigned a value because there were no available
            // output samples in this category, we will be assigning
            // the most common label for the current node to it.
            if (!Rejection && !children[i].Output.HasValue)
                children[i].Output = Measures.Mode(output);
        }
    }

    // Release the attribute so sibling subtrees may reuse it.
    AttributeUsageCount[maxGainAttribute]--;

    root.Branches.AttributeIndex = maxGainAttribute;
    root.Branches.AddRange(children);
}
/// <summary>
///   Evaluates the information gain obtained by splitting a continuous attribute,
///   scanning the precomputed candidate thresholds (subsampled by splitStep) and
///   keeping the cut point with the highest gain.
/// </summary>
/// <param name="input">The input samples reaching the current node.</param>
/// <param name="output">The class labels corresponding to <paramref name="input"/>.</param>
/// <param name="attributeIndex">The continuous attribute being evaluated.</param>
/// <param name="partitions">The sample-index partitions induced by the best threshold.</param>
/// <param name="threshold">The best threshold found for this attribute.</param>
/// <returns>The (negated, entropy-weighted) gain of the best split found.</returns>
private double computeInfoContinuous(double[][] input, int[] output,
    int attributeIndex, out int[][] partitions, out double threshold)
{
    // Candidate cut points previously computed for this attribute.
    double[] cuts = thresholds[attributeIndex];
    double bestGain = Double.NegativeInfinity;

    // No candidate thresholds means all samples share the same value,
    // so the entire subset forms one single partition.
    if (cuts.Length == 0)
    {
        partitions = new int[][] { Vector.Range(input.Length) };
        threshold = Double.NegativeInfinity;
        return bestGain;
    }

    double bestThreshold = cuts[0];
    partitions = null;

    var lowerIndices = new List<int>(input.Length);
    var upperIndices = new List<int>(input.Length);
    var lowerOutputs = new List<int>(input.Length);
    var upperOutputs = new List<int>(input.Length);

    // Cache this attribute's column to avoid repeated jagged-array
    // indexing in the inner partitioning loop.
    double[] column = new double[input.Length];
    for (int i = 0; i < column.Length; i++)
        column[i] = input[i][attributeIndex];

    // For each possible splitting point of the attribute
    for (int i = 0; i < cuts.Length; i += splitStep)
    {
        // Partition the remaining data set
        // according to the current cut point.
        double cut = cuts[i];

        lowerIndices.Clear();
        upperIndices.Clear();
        lowerOutputs.Clear();
        upperOutputs.Clear();

        for (int j = 0; j < column.Length; j++)
        {
            double x = column[j];

            if (x <= cut)
            {
                lowerIndices.Add(j);
                lowerOutputs.Add(output[j]);
            }
            else if (x > cut)
            {
                upperIndices.Add(j);
                upperOutputs.Add(output[j]);
            }
        }

        double pLower = (double)lowerOutputs.Count / output.Length;
        double pUpper = (double)upperOutputs.Count / output.Length;

        double gain = -pLower * Measures.Entropy(lowerOutputs, outputClasses)
                      - pUpper * Measures.Entropy(upperOutputs, outputClasses);

        if (gain > bestGain)
        {
            bestThreshold = cut;
            bestGain = gain;

            // Only non-empty sides are materialized into the partitions.
            if (lowerIndices.Count > 0 && upperIndices.Count > 0)
                partitions = new int[][] { lowerIndices.ToArray(), upperIndices.ToArray() };
            else if (lowerIndices.Count > 0)
                partitions = new int[][] { lowerIndices.ToArray() };
            else if (upperIndices.Count > 0)
                partitions = new int[][] { upperIndices.ToArray() };
            else
                partitions = new int[][] { };
        }
    }

    threshold = bestThreshold;
    return bestGain;
}
/// <summary>
///   Evaluates the information gain obtained by splitting a continuous attribute,
///   taking sample weights into account. Samples whose value for this attribute
///   is missing (NaN) are excluded from the partitioning and reported separately.
/// </summary>
/// <param name="input">The input samples reaching the current node.</param>
/// <param name="output">The class labels corresponding to <paramref name="input"/>.</param>
/// <param name="weight">The importance weight of each sample.</param>
/// <param name="attributeIndex">The continuous attribute being evaluated.</param>
/// <param name="partitions">Two-element array {below, above}; an empty side is null.</param>
/// <param name="missingValues">Indices of samples with a NaN value for this attribute.</param>
/// <param name="threshold">The best threshold found for this attribute.</param>
/// <returns>The (negated, entropy-weighted) gain of the best split found.</returns>
private double computeInfoContinuous(double[][] input, int[] output, double[] weight,
    int attributeIndex, out List<int>[] partitions, out List<int> missingValues, out double threshold)
{
    // Compute the information gain obtained by using
    // this current attribute as the next decision node.
    double[] t = thresholds[attributeIndex];
    double bestGain = Double.NegativeInfinity;

    // Collect samples with a missing (NaN) value for this attribute so the
    // caller can distribute them among the resulting branches.
    missingValues = new List<int>();
    for (int j = 0; j < input.Length; j++)
    {
        if (Double.IsNaN(input[j][attributeIndex]))
            missingValues.Add(j);
    }

    // If there are no possible thresholds that we can use
    // to split the data (i.e. if all values are the same)
    if (t.Length == 0)
    {
        // Then they all belong to the same partition
        partitions = new[] { new List<int>(Vector.Range(input.Length)), null };
        threshold = Double.NegativeInfinity;
        return bestGain;
    }

    partitions = null;
    double bestThreshold = t[0];

    var indicesBelowThreshold = new List<int>(input.Length);
    var indicesAboveThreshold = new List<int>(input.Length);
    var output1 = new List<int>(input.Length);
    var output2 = new List<int>(input.Length);
    var weights1 = new List<double>(input.Length);
    var weights2 = new List<double>(input.Length);

    // PERF: the total weight does not depend on the threshold being tested,
    // so compute it once instead of re-summing on every loop iteration.
    // NOTE(review): this total includes the weights of missing-value samples
    // as well — presumably intentional, but worth confirming.
    double weightSum = weight.Sum();

    // For each possible splitting point of the attribute
    for (int i = 0; i < t.Length; i += splitStep)
    {
        // Partition the remaining data set
        // according to the threshold value
        double value = t[i];

        for (int j = 0; j < input.Length; j++)
        {
            double x = input[j][attributeIndex];

            if (Double.IsNaN(x))
            {
                continue; // missing values are handled separately
            }
            else if (x <= value)
            {
                indicesBelowThreshold.Add(j);
                output1.Add(output[j]);
                weights1.Add(weight[j]);
            }
            else if (x > value)
            {
                indicesAboveThreshold.Add(j);
                output2.Add(output[j]);
                weights2.Add(weight[j]);
            }
        }

        double p1 = weights1.Sum() / weightSum;
        double p2 = weights2.Sum() / weightSum;

        double splitGain =
            -p1 * Measures.WeightedEntropy(output1, weights1, Model.NumberOfClasses) +
            -p2 * Measures.WeightedEntropy(output2, weights2, Model.NumberOfClasses);

        if (splitGain > bestGain)
        {
            bestThreshold = value;
            bestGain = splitGain;

            // An empty side is represented as null so the caller can tell
            // a one-sided split apart from a two-sided one.
            if (indicesBelowThreshold.Count == 0)
                indicesBelowThreshold = null;
            if (indicesAboveThreshold.Count == 0)
                indicesAboveThreshold = null;

            partitions = new[] { indicesBelowThreshold, indicesAboveThreshold };

            // The lists are now owned by `partitions`; start fresh ones.
            indicesBelowThreshold = new List<int>(input.Length);
            indicesAboveThreshold = new List<int>(input.Length);
        }
        else
        {
            indicesBelowThreshold.Clear();
            indicesAboveThreshold.Clear();
        }

        output1.Clear();
        output2.Clear();
        weights1.Clear();
        weights2.Clear();
    }

    threshold = bestThreshold;
    return bestGain;
}
/// <summary>
///   Recursively grows the decision tree below <paramref name="root"/> using the
///   C4.5 procedure: discrete attributes branch on each value, continuous
///   attributes branch on a learned threshold (less-than-or-equal / greater-than).
/// </summary>
/// <param name="root">The node whose subtree is being built.</param>
/// <param name="input">The input samples reaching this node.</param>
/// <param name="output">The class labels corresponding to <paramref name="input"/>.</param>
/// <param name="height">The current depth of <paramref name="root"/> in the tree.</param>
private void split(DecisionNode root, double[][] input, int[] output, int height)
{
    // 2. If all examples are for the same class, return the single-node
    //    tree with the output label corresponding to this common class.
    double entropy = Measures.Entropy(output, outputClasses);

    if (entropy == 0)
    {
        if (output.Length > 0)
            root.Output = output[0];
        return;
    }

    // 3. If number of predicting attributes is empty, then return the single-node
    //    tree with the output label corresponding to the most common value of
    //    the target attributes in the examples.

    // how many variables have been used less than the limit
    int[] candidates = Matrix.Find(attributeUsageCount, x => x < join);

    if (candidates.Length == 0 || (maxHeight > 0 && height == maxHeight))
    {
        root.Output = Measures.Mode(output);
        return;
    }

    // 4. Otherwise, try to select the attribute which
    //    best explains the data sample subset. If the tree
    //    is part of a random forest, only consider a percentage
    //    of the candidate attributes at each split point.
    // BUGFIX: only subsample when there are more candidates than
    // MaxVariables, matching the weighted overload of this method;
    // previously Vector.Sample was invoked even when the candidate
    // pool was already at or below the limit.
    if (MaxVariables > 0 && candidates.Length > MaxVariables)
        candidates = Vector.Sample(candidates, MaxVariables);

    var scores = new double[candidates.Length];
    var thresholds = new double[candidates.Length];
    var partitions = new int[candidates.Length][][];

    // For each candidate attribute, compute its gain ratio in parallel.
    Parallel.For(0, scores.Length, ParallelOptions, i =>
    {
        scores[i] = computeGainRatio(input, output, candidates[i],
            entropy, out partitions[i], out thresholds[i]);
    });

    // Select the attribute with maximum gain ratio
    int maxGainIndex;
    scores.Max(out maxGainIndex);
    var maxGainPartition = partitions[maxGainIndex];
    var maxGainAttribute = candidates[maxGainIndex];
    var maxGainRange = inputRanges[maxGainAttribute];
    var maxGainThreshold = thresholds[maxGainIndex];

    // Mark this attribute as already used
    attributeUsageCount[maxGainAttribute]++;

    double[][] inputSubset;
    int[] outputSubset;

    // Now, create next nodes and pass those partitions as their responsibilities.
    if (tree.Attributes[maxGainAttribute].Nature == DecisionVariableKind.Discrete)
    {
        // This is a discrete nature attribute. We will branch at each
        // possible value for the discrete variable and call recursion.
        DecisionNode[] children = new DecisionNode[maxGainPartition.Length];

        // Create a branch for each possible value
        for (int i = 0; i < children.Length; i++)
        {
            children[i] = new DecisionNode(tree)
            {
                Parent = root,
                Value = i + maxGainRange.Min,
                Comparison = ComparisonKind.Equal,
            };

            inputSubset = input.Get(maxGainPartition[i]);
            outputSubset = output.Get(maxGainPartition[i]);
            split(children[i], inputSubset, outputSubset, height + 1); // recursion
        }

        root.Branches.AttributeIndex = maxGainAttribute;
        root.Branches.AddRange(children);
    }
    else if (maxGainPartition.Length > 1)
    {
        // This is a continuous nature attribute, and we achieved two partitions
        // using the partitioning scheme. We will branch on two possible settings:
        // either the value is greater than a currently detected optimal threshold
        // or it is less.
        DecisionNode[] children =
        {
            new DecisionNode(tree)
            {
                Parent = root, Value = maxGainThreshold,
                Comparison = ComparisonKind.LessThanOrEqual
            },
            new DecisionNode(tree)
            {
                Parent = root, Value = maxGainThreshold,
                Comparison = ComparisonKind.GreaterThan
            }
        };

        // Create a branch for lower values
        inputSubset = input.Get(maxGainPartition[0]);
        outputSubset = output.Get(maxGainPartition[0]);
        split(children[0], inputSubset, outputSubset, height + 1);

        // Create a branch for higher values
        inputSubset = input.Get(maxGainPartition[1]);
        outputSubset = output.Get(maxGainPartition[1]);
        split(children[1], inputSubset, outputSubset, height + 1);

        root.Branches.AttributeIndex = maxGainAttribute;
        root.Branches.AddRange(children);
    }
    else
    {
        // This is a continuous nature attribute, but all variables are equal
        // to a constant. If there is only a constant value as the predictor
        // and there are multiple output labels associated with this constant
        // value, there isn't much we can do. This node will be a leaf.
        // We will set the class label for this node as the
        // majority of the currently selected output classes.
        outputSubset = output.Get(maxGainPartition[0]);
        root.Output = Measures.Mode(outputSubset);
    }

    // Release the attribute so sibling subtrees may reuse it.
    attributeUsageCount[maxGainAttribute]--;
}
/// <summary>
///   Recursively grows the decision tree below <paramref name="root"/> using the
///   weighted C4.5 procedure: stopping criteria use weighted entropy/mode, and
///   continuous splits are rejected when a side falls below the minimum leaf size.
/// </summary>
/// <param name="root">The node whose subtree is being built.</param>
/// <param name="inputs">The input samples reaching this node.</param>
/// <param name="outputs">The class labels corresponding to <paramref name="inputs"/>.</param>
/// <param name="weights">The importance weight of each sample.</param>
/// <param name="height">The current depth of <paramref name="root"/> in the tree.</param>
private void split(DecisionNode root, double[][] inputs, int[] outputs, double[] weights, int height)
{
    // 2. If all examples are for the same class, return the single-node
    //    tree with the output label corresponding to this common class.
    double entropy = Measures.WeightedEntropy(outputs, weights, Model.NumberOfClasses);

    if (entropy == 0)
    {
        if (outputs.Length > 0)
            root.Output = outputs[0];
        return;
    }

    // 3. If number of predicting attributes is empty, then return the single-node
    //    tree with the output label corresponding to the most common value of
    //    the target attributes in the examples.

    // How many variables have been used less than the limit
    // (a Join of 0 means there is no limit on attribute reuse).
    int[] candidates = Matrix.Find(AttributeUsageCount, x => Join == 0 ? true : x < Join);

    if (candidates.Length == 0
        || (MaxHeight > 0 && height == MaxHeight)
        || (minimumSplitSize > 0 && inputs.Length < minimumSplitSize))
    {
        root.Output = Measures.WeightedMode(outputs, weights);
        return;
    }

    // 4. Otherwise, try to select the attribute which
    //    best explains the data sample subset. If the tree
    //    is part of a random forest, only consider a percentage
    //    of the candidate attributes at each split point.
    if (MaxVariables > 0 && candidates.Length > MaxVariables)
        candidates = Vector.Sample(candidates, MaxVariables);

    var scores = new double[candidates.Length];
    var thresholds = new double[candidates.Length];
    var partitions = new List<int>[candidates.Length][];

    if (ParallelOptions.MaxDegreeOfParallelism == 1)
    {
        // For each attribute in the data set (sequentially)
        for (int i = 0; i < scores.Length; i++)
        {
            scores[i] = computeGainRatio(inputs, outputs, weights, candidates[i],
                entropy, out partitions[i], out thresholds[i]);
        }
    }
    else
    {
        // For each attribute in the data set (in parallel)
        Parallel.For(0, scores.Length, ParallelOptions, i =>
        {
            scores[i] = computeGainRatio(inputs, outputs, weights, candidates[i],
                entropy, out partitions[i], out thresholds[i]);
        });
    }

    // Select the attribute with maximum gain ratio
    int maxGainIndex;
    scores.Max(out maxGainIndex);
    var maxGainPartition = partitions[maxGainIndex];
    var maxGainAttribute = candidates[maxGainIndex];
    var maxGainRange = inputRanges[maxGainAttribute];
    var maxGainThreshold = thresholds[maxGainIndex];

    // Mark this attribute as already used
    AttributeUsageCount[maxGainAttribute]++;

    double[][] inputSubset;
    int[] outputSubset;
    double[] weightSubset;

    // Now, create next nodes and pass those partitions as their responsibilities.
    if (Model.Attributes[maxGainAttribute].Nature == DecisionVariableKind.Discrete)
    {
        // This is a discrete nature attribute. We will branch at each
        // possible value for the discrete variable and call recursion.
        var children = new DecisionNode[maxGainPartition.Length];

        // Create a branch for each possible value
        for (int i = 0; i < children.Length; i++)
        {
            children[i] = new DecisionNode(Model)
            {
                Parent = root,
                Value = i + maxGainRange.Min,
                Comparison = ComparisonKind.Equal,
            };

            inputSubset = inputs.Get(maxGainPartition[i]);
            outputSubset = outputs.Get(maxGainPartition[i]);
            weightSubset = weights.Get(maxGainPartition[i]);

            if (outputSubset.Length == 0)
            {
                // In this case we have no samples for this category, but we
                // still want to be able to make a decision, so we will give
                // the best label of the current node as the child's output.
                outputSubset = new int[1] { Measures.WeightedMode(outputs, weights) };
                weightSubset = new double[1] { 1 }; // actual value does not matter
            }

            split(children[i], inputSubset, outputSubset, weightSubset, height + 1); // recursion
        }

        root.Branches.AttributeIndex = maxGainAttribute;
        root.Branches.AddRange(children);
    }
    else if (Model.Attributes[maxGainAttribute].Nature == DecisionVariableKind.Continuous)
    {
        List<int> partitionBelowThreshold = maxGainPartition[0];
        List<int> partitionAboveThreshold = maxGainPartition[1];

        if (partitionBelowThreshold != null && partitionAboveThreshold != null)
        {
            // Before we branch, we test whether each side is big enough;
            // if not, we stop here and make this node a leaf.
            if (partitionAboveThreshold.Count < minimumLeafSize
                || partitionBelowThreshold.Count < minimumLeafSize)
            {
                root.Output = Measures.WeightedMode(outputs, weights);
            }
            else
            {
                // This is a continuous nature attribute, and we achieved two partitions
                // using the partitioning scheme. We will branch on two possible settings:
                // either the value is greater than a currently detected optimal threshold
                // or it is less.
                DecisionNode[] children =
                {
                    new DecisionNode(Model)
                    {
                        Parent = root, Value = maxGainThreshold,
                        Comparison = ComparisonKind.LessThanOrEqual
                    },
                    new DecisionNode(Model)
                    {
                        Parent = root, Value = maxGainThreshold,
                        Comparison = ComparisonKind.GreaterThan
                    }
                };

                // Create a branch for lower values
                inputSubset = inputs.Get(partitionBelowThreshold);
                outputSubset = outputs.Get(partitionBelowThreshold);
                weightSubset = weights.Get(partitionBelowThreshold);
                split(children[0], inputSubset, outputSubset, weightSubset, height + 1);

                // Create a branch for higher values
                inputSubset = inputs.Get(partitionAboveThreshold);
                outputSubset = outputs.Get(partitionAboveThreshold);
                weightSubset = weights.Get(partitionAboveThreshold);
                split(children[1], inputSubset, outputSubset, weightSubset, height + 1);

                root.Branches.AttributeIndex = maxGainAttribute;
                root.Branches.AddRange(children);
            }
        }
        else
        {
            // This is a continuous nature attribute, but all variables are equal
            // to a constant. If there is only a constant value as the predictor
            // and there are multiple output labels associated with this constant
            // value, there isn't much we can do. This node will be a leaf.
            // We will set the class label for this node as the
            // majority of the currently selected output classes.
            var outputIndices = partitionBelowThreshold ?? partitionAboveThreshold;
            outputSubset = outputs.Get(outputIndices);

            // BUGFIX: use the weighted mode here for consistency with every
            // other leaf assignment in this weighted learning procedure
            // (previously the unweighted Measures.Mode was used).
            root.Output = Measures.WeightedMode(outputSubset, weights.Get(outputIndices));
        }
    }

    // Release the attribute so sibling subtrees may reuse it.
    AttributeUsageCount[maxGainAttribute]--;
}