Example #1
        private void split(DecisionNode root, int[][] input, int[] output, int height)
        {
            // 2. If all examples are for the same class, return the single-node
            //    tree with the output label corresponding to this common class.
            double entropy = Measures.Entropy(output, Model.NumberOfClasses);

            if (entropy == 0)
            {
                if (output.Length > 0)
                {
                    root.Output = output[0];
                }
                return;
            }

            // 3. If the set of predicting attributes is empty, then return the single-node
            //    tree with the output label corresponding to the most common value of
            //    the target attribute in the examples.

            // how many variables have been used less than the limit (if there is a limit)
            int candidateCount = AttributeUsageCount.Count(x => Join == 0 || x < Join);

            if (candidateCount == 0 || (MaxHeight > 0 && height == MaxHeight))
            {
                root.Output = Measures.Mode(output);
                return;
            }


            // 4. Otherwise, try to select the attribute which
            //    best explains the data sample subset.

            double[]  scores     = new double[candidateCount];
            int[][][] partitions = new int[candidateCount][][];
            int[][][] outputSubs = new int[candidateCount][][];

            // Retrieve candidate attribute indices
            int[] candidates = new int[candidateCount];
            for (int i = 0, k = 0; i < AttributeUsageCount.Length; i++)
            {
                if (Join == 0 || AttributeUsageCount[i] < Join) // must match the predicate used for candidateCount above
                {
                    candidates[k++] = i;
                }
            }


            // For each attribute in the data set
            Parallel.For(0, scores.Length, ParallelOptions, i =>
            {
                scores[i] = computeGainRatio(input, output, candidates[i],
                                             entropy, out partitions[i], out outputSubs[i]);
            });

            // Select the attribute with maximum gain ratio
            int maxGainIndex;
            scores.Max(out maxGainIndex);
            var maxGainPartition = partitions[maxGainIndex];
            var maxGainOutputs   = outputSubs[maxGainIndex];
            var maxGainAttribute = candidates[maxGainIndex];
            var maxGainRange     = inputRanges[maxGainAttribute];

            AttributeUsageCount[maxGainAttribute]++;

            // Now, create next nodes and pass those partitions as their responsibilities.
            DecisionNode[] children = new DecisionNode[maxGainPartition.Length];

            for (int i = 0; i < children.Length; i++)
            {
                children[i] = new DecisionNode(Model)
                {
                    Parent     = root,
                    Comparison = ComparisonKind.Equal,
                    Value      = i + maxGainRange.Min
                };


                int[][] inputSubset  = input.Get(maxGainPartition[i]);
                int[]   outputSubset = maxGainOutputs[i];

                split(children[i], inputSubset, outputSubset, height + 1); // recursion

                if (children[i].IsLeaf)
                {
                    // If the resulting node is a leaf, and it has not
                    // been assigned a value because there were no available
                    // output samples in this category, we will be assigning
                    // the most common label for the current node to it.
                    if (!Rejection && !children[i].Output.HasValue)
                    {
                        children[i].Output = Measures.Mode(output);
                    }
                }
            }


            AttributeUsageCount[maxGainAttribute]--;

            root.Branches.AttributeIndex = maxGainAttribute;
            root.Branches.AddRange(children);
        }
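
The stopping test in step 2 relies on Shannon entropy: a value of zero means every remaining sample carries the same label. Below is a minimal sketch of what a Measures.Entropy-style helper computes, assuming class labels are integers in the range [0, classCount); the library's actual implementation may differ in signature and edge-case handling.

        private static double SimpleEntropy(int[] labels, int classCount)
        {
            // Sketch only: assumes labels are integers in [0, classCount)
            if (labels.Length == 0)
                return 0;

            var counts = new int[classCount];
            foreach (int label in labels)
                counts[label]++;

            double entropy = 0;
            foreach (int count in counts)
            {
                if (count > 0)
                {
                    double p = (double)count / labels.Length;
                    entropy -= p * Math.Log(p, 2); // requires using System;
                }
            }
            return entropy;
        }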
Example #2
        private double computeInfoContinuous(double[][] input, int[] output,
                                             int attributeIndex, out int[][] partitions, out double threshold)
        {
            // Compute the information gain obtained by using
            // this current attribute as the next decision node.
            double[] t        = thresholds[attributeIndex];
            double   bestGain = Double.NegativeInfinity;

            // If there are no possible thresholds that we can use
            // to split the data (i.e. if all values are the same)
            if (t.Length == 0)
            {
                // Then they all belong to the same partition
                partitions = new int[][] { Vector.Range(input.Length) };
                threshold  = Double.NegativeInfinity;
                return bestGain;
            }

            double bestThreshold = t[0];

            partitions = null;

            var idx1 = new List<int>(input.Length);
            var idx2 = new List<int>(input.Length);

            var output1 = new List<int>(input.Length);
            var output2 = new List<int>(input.Length);

            double[] values = new double[input.Length];
            for (int i = 0; i < values.Length; i++)
            {
                values[i] = input[i][attributeIndex];
            }

            // For each possible splitting point of the attribute
            for (int i = 0; i < t.Length; i += splitStep)
            {
                // Partition the remaining data set
                // according to the threshold value
                double value = t[i];

                idx1.Clear();
                idx2.Clear();

                output1.Clear();
                output2.Clear();

                for (int j = 0; j < values.Length; j++)
                {
                    double x = values[j];

                    if (x <= value)
                    {
                        idx1.Add(j);
                        output1.Add(output[j]);
                    }
                    else if (x > value)
                    {
                        idx2.Add(j);
                        output2.Add(output[j]);
                    }
                }

                double p1 = (double)output1.Count / output.Length;
                double p2 = (double)output2.Count / output.Length;

                double splitGain =
                    -p1 * Measures.Entropy(output1, outputClasses) +
                    -p2 * Measures.Entropy(output2, outputClasses);

                if (splitGain > bestGain)
                {
                    bestThreshold = value;
                    bestGain      = splitGain;

                    if (idx1.Count > 0 && idx2.Count > 0)
                    {
                        partitions = new int[][] { idx1.ToArray(), idx2.ToArray() };
                    }
                    else if (idx1.Count > 0)
                    {
                        partitions = new int[][] { idx1.ToArray() };
                    }
                    else if (idx2.Count > 0)
                    {
                        partitions = new int[][] { idx2.ToArray() };
                    }
                    else
                    {
                        partitions = new int[][] { };
                    }
                }
            }

            threshold = bestThreshold;
            return bestGain;
        }
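
Note that the loop above does not maximize the information gain itself: it maximizes splitGain = -(p1 · H(S1) + p2 · H(S2)). Since Gain(S, t) = H(S) - (p1 · H(S1) + p2 · H(S2)) and the parent entropy H(S) is the same for every candidate threshold t, both criteria select the same split. A caller that wants the actual gain only needs to add the parent entropy back; a hypothetical caller-side sketch:

        int[][] partitions;
        double  threshold;

        // Hypothetical usage: 'entropy' is H(S) of the parent node, as computed in split()
        double gain = entropy + computeInfoContinuous(input, output, attributeIndex,
                                                      out partitions, out threshold);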
Example #3
        private double computeInfoContinuous(double[][] input, int[] output, double[] weight,
                                             int attributeIndex, out List<int>[] partitions, out List<int> missingValues, out double threshold)
        {
            // Compute the information gain obtained by using
            // this current attribute as the next decision node.
            double[] t        = thresholds[attributeIndex];
            double   bestGain = Double.NegativeInfinity;

            missingValues = new List<int>();
            for (int j = 0; j < input.Length; j++)
            {
                if (Double.IsNaN(input[j][attributeIndex]))
                {
                    missingValues.Add(j);
                }
            }

            // If there are no possible thresholds that we can use
            // to split the data (i.e. if all values are the same)
            if (t.Length == 0)
            {
                // Then they all belong to the same partition
                partitions = new[] { new List<int>(Vector.Range(input.Length)), null };
                threshold  = Double.NegativeInfinity;
                return bestGain;
            }

            partitions = null;

            double bestThreshold = t[0];

            var indicesBelowThreshold = new List<int>(input.Length);
            var indicesAboveThreshold = new List<int>(input.Length);

            var output1 = new List<int>(input.Length);
            var output2 = new List<int>(input.Length);

            var weights1 = new List<double>(input.Length);
            var weights2 = new List<double>(input.Length);

            // The total sample weight does not depend on the threshold, so compute it once
            double weightSum = weight.Sum();

            // For each possible splitting point of the attribute
            for (int i = 0; i < t.Length; i += splitStep)
            {
                // Partition the remaining data set
                // according to the threshold value
                double value = t[i];

                for (int j = 0; j < input.Length; j++)
                {
                    double x = input[j][attributeIndex];

                    if (Double.IsNaN(x))
                    {
                        continue;
                    }
                    else if (x <= value)
                    {
                        indicesBelowThreshold.Add(j);
                        output1.Add(output[j]);
                        weights1.Add(weight[j]);
                    }
                    else if (x > value)
                    {
                        indicesAboveThreshold.Add(j);
                        output2.Add(output[j]);
                        weights2.Add(weight[j]);
                    }
                }

                double p1 = weights1.Sum() / weightSum;
                double p2 = weights2.Sum() / weightSum;

                double splitGain =
                    -p1 * Measures.WeightedEntropy(output1, weights1, Model.NumberOfClasses) +
                    -p2 * Measures.WeightedEntropy(output2, weights2, Model.NumberOfClasses);

                if (splitGain > bestGain)
                {
                    bestThreshold = value;
                    bestGain      = splitGain;

                    if (indicesBelowThreshold.Count == 0)
                    {
                        indicesBelowThreshold = null;
                    }
                    if (indicesAboveThreshold.Count == 0)
                    {
                        indicesAboveThreshold = null;
                    }
                    partitions = new[] { indicesBelowThreshold, indicesAboveThreshold };

                    indicesBelowThreshold = new List<int>(input.Length);
                    indicesAboveThreshold = new List<int>(input.Length);
                }
                else
                {
                    indicesBelowThreshold.Clear();
                    indicesAboveThreshold.Clear();
                }

                output1.Clear();
                output2.Clear();
                weights1.Clear();
                weights2.Clear();
            }

            threshold = bestThreshold;
            return bestGain;
        }
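
The weighted variant replaces class counts with sums of sample weights. A minimal sketch of a Measures.WeightedEntropy-style computation, under the same integer-label assumption as before (again, the library's own implementation may differ):

        private static double SimpleWeightedEntropy(IList<int> labels, IList<double> weights, int classCount)
        {
            // Sketch only: accumulate the total weight per class
            // (uses System and System.Collections.Generic)
            var classWeights = new double[classCount];
            double total = 0;
            for (int i = 0; i < labels.Count; i++)
            {
                classWeights[labels[i]] += weights[i];
                total += weights[i];
            }

            if (total == 0)
                return 0;

            double entropy = 0;
            foreach (double w in classWeights)
            {
                if (w > 0)
                {
                    double p = w / total;
                    entropy -= p * Math.Log(p, 2);
                }
            }
            return entropy;
        }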
Example #4
        private void split(DecisionNode root, double[][] input, int[] output, int height)
        {
            // 2. If all examples are for the same class, return the single-node
            //    tree with the output label corresponding to this common class.
            double entropy = Measures.Entropy(output, outputClasses);

            if (entropy == 0)
            {
                if (output.Length > 0)
                {
                    root.Output = output[0];
                }
                return;
            }

            // 3. If the set of predicting attributes is empty, then return the single-node
            //    tree with the output label corresponding to the most common value of
            //    the target attribute in the examples.

            // how many variables have been used less than the limit
            int[] candidates = Matrix.Find(attributeUsageCount, x => x < join);

            if (candidates.Length == 0 || (maxHeight > 0 && height == maxHeight))
            {
                root.Output = Measures.Mode(output);
                return;
            }


            // 4. Otherwise, try to select the attribute which
            //    best explains the data sample subset. If the tree
            //    is part of a random forest, only consider a percentage
            //    of the candidate attributes at each split point

            if (MaxVariables > 0)
            {
                candidates = Vector.Sample(candidates, MaxVariables);
            }

            var scores     = new double[candidates.Length];
            var thresholds = new double[candidates.Length];
            var partitions = new int[candidates.Length][][];

            // For each attribute in the data set
            Parallel.For(0, scores.Length, ParallelOptions, i =>
            {
                scores[i] = computeGainRatio(input, output, candidates[i],
                                             entropy, out partitions[i], out thresholds[i]);
            });

            // Select the attribute with maximum gain ratio
            int maxGainIndex;
            scores.Max(out maxGainIndex);
            var maxGainPartition = partitions[maxGainIndex];
            var maxGainAttribute = candidates[maxGainIndex];
            var maxGainRange     = inputRanges[maxGainAttribute];
            var maxGainThreshold = thresholds[maxGainIndex];

            // Mark this attribute as already used
            attributeUsageCount[maxGainAttribute]++;

            double[][] inputSubset;
            int[]      outputSubset;

            // Now, create next nodes and pass those partitions as their responsibilities.
            if (tree.Attributes[maxGainAttribute].Nature == DecisionVariableKind.Discrete)
            {
                // This is a discrete nature attribute. We will branch at each
                // possible value for the discrete variable and call recursion.
                DecisionNode[] children = new DecisionNode[maxGainPartition.Length];

                // Create a branch for each possible value
                for (int i = 0; i < children.Length; i++)
                {
                    children[i] = new DecisionNode(tree)
                    {
                        Parent     = root,
                        Value      = i + maxGainRange.Min,
                        Comparison = ComparisonKind.Equal,
                    };

                    inputSubset  = input.Get(maxGainPartition[i]);
                    outputSubset = output.Get(maxGainPartition[i]);
                    split(children[i], inputSubset, outputSubset, height + 1); // recursion
                }

                root.Branches.AttributeIndex = maxGainAttribute;
                root.Branches.AddRange(children);
            }

            else if (maxGainPartition.Length > 1)
            {
                // This is a continuous nature attribute, and we achieved two partitions
                // using the partitioning scheme. We will branch on two possible settings:
                // either the value is greater than a currently detected optimal threshold
                // or it is less.

                DecisionNode[] children =
                {
                    new DecisionNode(tree)
                    {
                        Parent     = root, Value = maxGainThreshold,
                        Comparison = ComparisonKind.LessThanOrEqual
                    },

                    new DecisionNode(tree)
                    {
                        Parent     = root, Value = maxGainThreshold,
                        Comparison = ComparisonKind.GreaterThan
                    }
                };

                // Create a branch for lower values
                inputSubset  = input.Get(maxGainPartition[0]);
                outputSubset = output.Get(maxGainPartition[0]);
                split(children[0], inputSubset, outputSubset, height + 1);

                // Create a branch for higher values
                inputSubset  = input.Get(maxGainPartition[1]);
                outputSubset = output.Get(maxGainPartition[1]);
                split(children[1], inputSubset, outputSubset, height + 1);

                root.Branches.AttributeIndex = maxGainAttribute;
                root.Branches.AddRange(children);
            }
            else
            {
                // This is a continuous nature attribute, but all variables are equal
                // to a constant. If there is only a constant value as the predictor
                // and there are multiple output labels associated with this constant
                // value, there isn't much we can do. This node will be a leaf.

                // We will set the class label for this node as the
                // majority of the currently selected output classes.

                outputSubset = output.Get(maxGainPartition[0]);
                root.Output  = Measures.Mode(outputSubset);
            }

            attributeUsageCount[maxGainAttribute]--;
        }
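
Both continuous-attribute routines assume a precomputed thresholds array per attribute, which these examples do not show being built. A common scheme, sketched here as an assumption rather than necessarily what this code base does, is to take the midpoints between consecutive distinct sorted values; this also yields an empty array when all values are equal, matching the t.Length == 0 case handled above:

        private static double[] CandidateThresholds(double[][] inputs, int attributeIndex)
        {
            // Distinct, sorted, non-missing values of the attribute (uses System.Linq)
            double[] values = inputs
                .Select(row => row[attributeIndex])
                .Where(v => !Double.IsNaN(v))
                .Distinct()
                .OrderBy(v => v)
                .ToArray();

            // Midpoints between consecutive distinct values
            var thresholds = new double[Math.Max(values.Length - 1, 0)];
            for (int i = 0; i < thresholds.Length; i++)
                thresholds[i] = (values[i] + values[i + 1]) / 2.0;

            return thresholds;
        }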
Example #5
        private void split(DecisionNode root, double[][] inputs, int[] outputs, double[] weights, int height)
        {
            // 2. If all examples are for the same class, return the single-node
            //    tree with the output label corresponding to this common class.
            double entropy = Measures.WeightedEntropy(outputs, weights, Model.NumberOfClasses);

            if (entropy == 0)
            {
                if (outputs.Length > 0)
                {
                    root.Output = outputs[0];
                }
                return;
            }

            // 3. If the set of predicting attributes is empty, then return the single-node
            //    tree with the output label corresponding to the most common value of
            //    the target attribute in the examples.

            // how many variables have been used less than the limit (if there is a limit)
            int[] candidates = Matrix.Find(AttributeUsageCount, x => Join == 0 || x < Join);

            if (candidates.Length == 0 || (MaxHeight > 0 && height == MaxHeight) ||
                (minimumSplitSize > 0 && inputs.Length < minimumSplitSize))
            {
                root.Output = Measures.WeightedMode(outputs, weights);
                return;
            }


            // 4. Otherwise, try to select the attribute which
            //    best explains the data sample subset. If the tree
            //    is part of a random forest, only consider a percentage
            //    of the candidate attributes at each split point

            if (MaxVariables > 0 && candidates.Length > MaxVariables)
            {
                candidates = Vector.Sample(candidates, MaxVariables);
            }

            var scores     = new double[candidates.Length];
            var thresholds = new double[candidates.Length];
            var partitions = new List<int>[candidates.Length][];

            if (ParallelOptions.MaxDegreeOfParallelism == 1)
            {
                // For each attribute in the data set
                for (int i = 0; i < scores.Length; i++)
                {
                    scores[i] = computeGainRatio(inputs, outputs, weights, candidates[i],
                                                 entropy, out partitions[i], out thresholds[i]);
                }
            }
            else
            {
                // For each attribute in the data set
                Parallel.For(0, scores.Length, ParallelOptions, i =>
                {
                    scores[i] = computeGainRatio(inputs, outputs, weights, candidates[i],
                                                 entropy, out partitions[i], out thresholds[i]);
                });
            }

            // Select the attribute with maximum gain ratio
            int maxGainIndex;
            scores.Max(out maxGainIndex);
            var maxGainPartition = partitions[maxGainIndex];
            var maxGainAttribute = candidates[maxGainIndex];
            var maxGainRange     = inputRanges[maxGainAttribute];
            var maxGainThreshold = thresholds[maxGainIndex];

            // Mark this attribute as already used
            AttributeUsageCount[maxGainAttribute]++;

            double[][] inputSubset;
            int[]      outputSubset;
            double[]   weightSubset;

            // Now, create next nodes and pass those partitions as their responsibilities.
            if (Model.Attributes[maxGainAttribute].Nature == DecisionVariableKind.Discrete)
            {
                // This is a discrete nature attribute. We will branch at each
                // possible value for the discrete variable and call recursion.
                var children = new DecisionNode[maxGainPartition.Length];

                // Create a branch for each possible value
                for (int i = 0; i < children.Length; i++)
                {
                    children[i] = new DecisionNode(Model)
                    {
                        Parent     = root,
                        Value      = i + maxGainRange.Min,
                        Comparison = ComparisonKind.Equal,
                    };

                    inputSubset  = inputs.Get(maxGainPartition[i]);
                    outputSubset = outputs.Get(maxGainPartition[i]);
                    weightSubset = weights.Get(maxGainPartition[i]);

                    if (outputSubset.Length == 0)
                    {
                        // In this case we have no samples for this category, but we still
                        // want to be able to make a decision, so we assign the best guess
                        // of the current node (the weighted mode of its outputs) to it.
                        outputSubset = new int[] { Measures.WeightedMode(outputs, weights) };
                        weightSubset = new double[] { 1 }; // the actual weight does not matter here
                    }

                    split(children[i], inputSubset, outputSubset, weightSubset, height + 1); // recursion
                }

                root.Branches.AttributeIndex = maxGainAttribute;
                root.Branches.AddRange(children);
            }
            else if (Model.Attributes[maxGainAttribute].Nature == DecisionVariableKind.Continuous)
            {
                List<int> partitionBelowThreshold = maxGainPartition[0];
                List<int> partitionAboveThreshold = maxGainPartition[1];


                if (partitionBelowThreshold != null && partitionAboveThreshold != null)
                {
                    // Before branching, test whether each partition is big enough;
                    // if not, stop here and make this node a leaf.
                    if (partitionAboveThreshold.Count < minimumLeafSize || partitionBelowThreshold.Count < minimumLeafSize)
                    {
                        root.Output = Measures.WeightedMode(outputs, weights);
                    }
                    else
                    {
                        // This is a continuous nature attribute, and we achieved two partitions
                        // using the partitioning scheme. We will branch on two possible settings:
                        // either the value is greater than a currently detected optimal threshold
                        // or it is less.

                        DecisionNode[] children =
                        {
                            new DecisionNode(Model)
                            {
                                Parent     = root, Value = maxGainThreshold,
                                Comparison = ComparisonKind.LessThanOrEqual
                            },

                            new DecisionNode(Model)
                            {
                                Parent     = root, Value = maxGainThreshold,
                                Comparison = ComparisonKind.GreaterThan
                            }
                        };

                        // Create a branch for lower values
                        inputSubset  = inputs.Get(partitionBelowThreshold);
                        outputSubset = outputs.Get(partitionBelowThreshold);
                        weightSubset = weights.Get(partitionBelowThreshold);
                        split(children[0], inputSubset, outputSubset, weightSubset, height + 1);

                        // Create a branch for higher values
                        inputSubset  = inputs.Get(partitionAboveThreshold);
                        outputSubset = outputs.Get(partitionAboveThreshold);
                        weightSubset = weights.Get(partitionAboveThreshold);
                        split(children[1], inputSubset, outputSubset, weightSubset, height + 1);

                        root.Branches.AttributeIndex = maxGainAttribute;
                        root.Branches.AddRange(children);
                    }
                }
                else
                {
                    // This is a continuous nature attribute, but all variables are equal
                    // to a constant. If there is only a constant value as the predictor
                    // and there are multiple output labels associated with this constant
                    // value, there isn't much we can do. This node will be a leaf.

                    // We will set the class label for this node as the
                    // majority of the currently selected output classes.

                    var outputIndices = partitionBelowThreshold ?? partitionAboveThreshold;
                    outputSubset = outputs.Get(outputIndices);
                    root.Output  = Measures.Mode(outputSubset);
                }
            }

            AttributeUsageCount[maxGainAttribute]--;
        }
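
All five examples delegate attribute scoring to computeGainRatio helpers that are not shown here. For reference, C4.5's gain ratio normalizes the information gain by the split information, the entropy of the partition sizes themselves, which penalizes attributes that fragment the data into many tiny subsets. A sketch of that normalization step, assuming the gain and the partition sizes are already known:

        private static double GainRatio(double infoGain, int[] partitionSizes, int totalSamples)
        {
            // Split information: entropy of the partition proportions
            double splitInfo = 0;
            foreach (int size in partitionSizes)
            {
                if (size > 0)
                {
                    double p = (double)size / totalSamples;
                    splitInfo -= p * Math.Log(p, 2);
                }
            }

            // Guard against division by zero when one branch receives everything
            return splitInfo > 0 ? infoGain / splitInfo : 0;
        }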