Example #1
        private double computeInfoDiscrete(double[][] input, int[] output, double[] weight,
                                           int attributeIndex, out List<int>[] partitions, out List<int> missingValues)
        {
            // Compute the information gain obtained by using
            // this current attribute as the next decision node.
            double info = 0;

            IntRange valueRange             = inputRanges[attributeIndex];
            int      numberOfDistinctValues = valueRange.Length + 1;
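            // (both endpoints of the range are inclusive, hence the +1
            //  to obtain the count of distinct attribute values)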

            partitions = new List<int>[numberOfDistinctValues];

            missingValues = new List<int>();
            for (int j = 0; j < input.Length; j++)
            {
                if (Double.IsNaN(input[j][attributeIndex]))
                {
                    missingValues.Add(j);
                }
            }

            // For each possible value of the attribute
            for (int i = 0; i < numberOfDistinctValues; i++)
            {
                int value = valueRange.Min + i;

                // Partition the remaining data set
                // according to the attribute values
                var indicesInPartition = new List<int>();

                double weightTotalSum  = 0;
                double weightSubsetSum = 0;

                for (int j = 0; j < input.Length; j++)
                {
                    double x = input[j][attributeIndex];
                    if (!Double.IsNaN(x) && x == value)
                    {
                        indicesInPartition.Add(j);
                        weightSubsetSum += weight[j];
                    }
                    weightTotalSum += weight[j];
                }

                // For each of the instances under responsibility
                // of this node, check which have the same value
                int[]    outputSubset = output.Get(indicesInPartition);
                double[] weightSubset = weight.Get(indicesInPartition);

                // Check the entropy gain originating from this partitioning
                double e = Measures.WeightedEntropy(outputSubset, weightSubset, Model.NumberOfClasses);
                info += (weightSubsetSum / weightTotalSum) * e;

                partitions[i] = indicesInPartition;
            }

            return info;
        }
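The quantity accumulated in `info` is the weighted conditional entropy of the labels given the attribute: each partition contributes its weighted class entropy, scaled by its share of the total weight. The sketch below reproduces that computation in isolation; the `WeightedEntropy` helper is hypothetical and only mimics what Accord's `Measures.WeightedEntropy` computes.

using System;
using System.Linq;

static class InfoSketch
{
    // Weighted Shannon entropy: H = -sum_c p_c * log2(p_c), where p_c is
    // the weight share of class c among the given samples.
    static double WeightedEntropy(int[] labels, double[] weights, int numClasses)
    {
        double total = weights.Sum();
        double h = 0;
        for (int c = 0; c < numClasses; c++)
        {
            double pc = 0;
            for (int i = 0; i < labels.Length; i++)
                if (labels[i] == c)
                    pc += weights[i];
            pc /= total;
            if (pc > 0)
                h -= pc * Math.Log(pc, 2);
        }
        return h;
    }

    static void Main()
    {
        // Two attribute values: value 0 holds labels {0,0}, value 1 holds {0,1}.
        int[][] partitions = { new[] { 0, 0 }, new[] { 0, 1 } };
        double[][] weights = { new[] { 1.0, 1.0 }, new[] { 1.0, 1.0 } };
        double totalWeight = 4.0;

        double info = 0;
        for (int v = 0; v < partitions.Length; v++)
        {
            double subsetWeight = weights[v].Sum();
            info += (subsetWeight / totalWeight)
                  * WeightedEntropy(partitions[v], weights[v], numClasses: 2);
        }

        Console.WriteLine(info); // 0.5: only the second partition is impure
    }
}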
Example #2
        private double computeInfoContinuous(double[][] input, int[] output, double[] weight,
                                             int attributeIndex, out List<int>[] partitions, out List<int> missingValues, out double threshold)
        {
            // Compute the information gain obtained by using
            // this current attribute as the next decision node.
            double[] t        = thresholds[attributeIndex];
            double   bestGain = Double.NegativeInfinity;

            missingValues = new List<int>();
            for (int j = 0; j < input.Length; j++)
            {
                if (Double.IsNaN(input[j][attributeIndex]))
                {
                    missingValues.Add(j);
                }
            }

            // If there are no possible thresholds that we can use
            // to split the data (i.e. if all values are the same)
            if (t.Length == 0)
            {
                // Then they all belong to the same partition
                partitions = new[] { new List<int>(Vector.Range(input.Length)), null };
                threshold  = Double.NegativeInfinity;
                return bestGain;
            }

            partitions = null;

            double bestThreshold = t[0];

            var indicesBelowThreshold = new List<int>(input.Length);
            var indicesAboveThreshold = new List<int>(input.Length);

            var output1 = new List<int>(input.Length);
            var output2 = new List<int>(input.Length);

            var weights1 = new List<double>(input.Length);
            var weights2 = new List<double>(input.Length);

            // The total sample weight is the same for every candidate
            // threshold, so compute it once outside the loop
            double weightSum = weight.Sum();

            // For each possible splitting point of the attribute
            for (int i = 0; i < t.Length; i += splitStep)
            {
                // Partition the remaining data set
                // according to the threshold value
                double value = t[i];

                for (int j = 0; j < input.Length; j++)
                {
                    double x = input[j][attributeIndex];

                    if (Double.IsNaN(x))
                    {
                        continue;
                    }
                    else if (x <= value)
                    {
                        indicesBelowThreshold.Add(j);
                        output1.Add(output[j]);
                        weights1.Add(weight[j]);
                    }
                    else if (x > value)
                    {
                        indicesAboveThreshold.Add(j);
                        output2.Add(output[j]);
                        weights2.Add(weight[j]);
                    }
                }

                double p1 = weights1.Sum() / weightSum;
                double p2 = weights2.Sum() / weightSum;

                double splitGain =
                    -p1 * Measures.WeightedEntropy(output1, weights1, Model.NumberOfClasses) +
                    -p2 * Measures.WeightedEntropy(output2, weights2, Model.NumberOfClasses);
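                // Note: splitGain is the negative weighted conditional entropy
                // of the split; the parent entropy is constant across candidate
                // thresholds, so maximizing splitGain maximizes information gain.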

                if (splitGain > bestGain)
                {
                    bestThreshold = value;
                    bestGain      = splitGain;

                    if (indicesBelowThreshold.Count == 0)
                    {
                        indicesBelowThreshold = null;
                    }
                    if (indicesAboveThreshold.Count == 0)
                    {
                        indicesAboveThreshold = null;
                    }
                    partitions = new[] { indicesBelowThreshold, indicesAboveThreshold };

                    indicesBelowThreshold = new List<int>(input.Length);
                    indicesAboveThreshold = new List<int>(input.Length);
                }
                else
                {
                    indicesBelowThreshold.Clear();
                    indicesAboveThreshold.Clear();
                }

                output1.Clear();
                output2.Clear();
                weights1.Clear();
                weights2.Clear();
            }

            threshold = bestThreshold;
            return bestGain;
        }
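The candidate cut points in `thresholds[attributeIndex]` are precomputed elsewhere in the class. One common C4.5-style choice, assumed in the sketch below rather than taken from that code, is the set of midpoints between consecutive distinct sorted values of the attribute:

using System;
using System.Linq;

static class ThresholdSketch
{
    // Candidate cut points for one continuous attribute: midpoints between
    // consecutive distinct sorted values; NaN entries (missing values) are skipped.
    static double[] CandidateThresholds(double[] values)
    {
        double[] distinct = values.Where(v => !Double.IsNaN(v))
                                  .Distinct()
                                  .OrderBy(v => v)
                                  .ToArray();

        var cuts = new double[Math.Max(0, distinct.Length - 1)];
        for (int i = 0; i < cuts.Length; i++)
            cuts[i] = (distinct[i] + distinct[i + 1]) / 2;
        return cuts;
    }

    static void Main()
    {
        double[] column = { 1.0, 3.0, 3.0, 7.0, Double.NaN };
        Console.WriteLine(string.Join(", ", CandidateThresholds(column))); // 2, 5

        // A column with a single distinct value produces an empty array,
        // which is exactly the t.Length == 0 case handled above.
    }
}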
Example #3
        private void split(DecisionNode root, double[][] inputs, int[] outputs, double[] weights, int height)
        {
            // 2. If all examples are for the same class, return the single-node
            //    tree with the output label corresponding to this common class.
            double entropy = Measures.WeightedEntropy(outputs, weights, Model.NumberOfClasses);

            if (entropy == 0)
            {
                if (outputs.Length > 0)
                {
                    root.Output = outputs[0];
                }
                return;
            }

            // 3. If number of predicting attributes is empty, then return the single-node
            //    tree with the output label corresponding to the most common value of
            //    the target attributes in the examples.

            // How many attributes have been used fewer times than the
            // limit, if there is a limit (Join == 0 means unlimited)
            int[] candidates = Matrix.Find(AttributeUsageCount, x => Join == 0 || x < Join);

            // Stop if no attributes remain, the maximum height has been
            // reached, or there are too few samples left to split
            if (candidates.Length == 0 || (MaxHeight > 0 && height == MaxHeight) ||
                (minimumSplitSize > 0 && inputs.Length < minimumSplitSize))
            {
                root.Output = Measures.WeightedMode(outputs, weights);
                return;
            }


            // 4. Otherwise, try to select the attribute which
            //    best explains the data sample subset. If the tree
            //    is part of a random forest, only consider a percentage
            //    of the candidate attributes at each split point

            if (MaxVariables > 0 && candidates.Length > MaxVariables)
            {
                candidates = Vector.Sample(candidates, MaxVariables);
            }

            var scores     = new double[candidates.Length];
            var thresholds = new double[candidates.Length];
            var partitions = new List<int>[candidates.Length][];

            if (ParallelOptions.MaxDegreeOfParallelism == 1)
            {
                // For each attribute in the data set
                for (int i = 0; i < scores.Length; i++)
                {
                    scores[i] = computeGainRatio(inputs, outputs, weights, candidates[i],
                                                 entropy, out partitions[i], out thresholds[i]);
                }
            }
            else
            {
                // For each attribute in the data set
                Parallel.For(0, scores.Length, ParallelOptions, i =>
                {
                    scores[i] = computeGainRatio(inputs, outputs, weights, candidates[i],
                                                 entropy, out partitions[i], out thresholds[i]);
                });
            }

            // Select the attribute with maximum gain ratio
            int maxGainIndex;
            scores.Max(out maxGainIndex);
            var maxGainPartition = partitions[maxGainIndex];
            var maxGainAttribute = candidates[maxGainIndex];
            var maxGainRange     = inputRanges[maxGainAttribute];
            var maxGainThreshold = thresholds[maxGainIndex];

            // Mark this attribute as already used
            AttributeUsageCount[maxGainAttribute]++;

            double[][] inputSubset;
            int[]      outputSubset;
            double[]   weightSubset;

            // Now, create next nodes and pass those partitions as their responsibilities.
            if (Model.Attributes[maxGainAttribute].Nature == DecisionVariableKind.Discrete)
            {
                // This is a discrete nature attribute. We will branch at each
                // possible value for the discrete variable and call recursion.
                var children = new DecisionNode[maxGainPartition.Length];

                // Create a branch for each possible value
                for (int i = 0; i < children.Length; i++)
                {
                    children[i] = new DecisionNode(Model)
                    {
                        Parent     = root,
                        Value      = i + maxGainRange.Min,
                        Comparison = ComparisonKind.Equal,
                    };

                    inputSubset  = inputs.Get(maxGainPartition[i]);
                    outputSubset = outputs.Get(maxGainPartition[i]);
                    weightSubset = weights.Get(maxGainPartition[i]);

                    if (outputSubset.Length == 0)
                    {
                        // In this case we have no samples for this category, but
                        // we still want to be able to make a decision, so we use
                        // the weighted mode of the current node as the output
                        outputSubset = new[] { Measures.WeightedMode(outputs, weights) };
                        weightSubset = new[] { 1.0 }; // the actual weight does not matter
                    }

                    split(children[i], inputSubset, outputSubset, weightSubset, height + 1); // recursion
                }

                root.Branches.AttributeIndex = maxGainAttribute;
                root.Branches.AddRange(children);
            }
            else if (Model.Attributes[maxGainAttribute].Nature == DecisionVariableKind.Continuous)
            {
                List<int> partitionBelowThreshold = maxGainPartition[0];
                List<int> partitionAboveThreshold = maxGainPartition[1];


                if (partitionBelowThreshold != null && partitionAboveThreshold != null)
                {
                    // Before branching, test whether each child partition is
                    // big enough; if not, stop here and make this node a leaf
                    if (partitionAboveThreshold.Count < minimumLeafSize || partitionBelowThreshold.Count < minimumLeafSize)
                    {
                        root.Output = Measures.WeightedMode(outputs, weights);
                    }
                    else
                    {
                        // This is a continuous nature attribute, and we achieved two partitions
                        // using the partitioning scheme. We will branch on two possible settings:
                        // either the value is greater than a currently detected optimal threshold
                        // or it is less.

                        DecisionNode[] children =
                        {
                            new DecisionNode(Model)
                            {
                                Parent     = root, Value = maxGainThreshold,
                                Comparison = ComparisonKind.LessThanOrEqual
                            },

                            new DecisionNode(Model)
                            {
                                Parent     = root, Value = maxGainThreshold,
                                Comparison = ComparisonKind.GreaterThan
                            }
                        };

                        // Create a branch for lower values
                        inputSubset  = inputs.Get(partitionBelowThreshold);
                        outputSubset = outputs.Get(partitionBelowThreshold);
                        weightSubset = weights.Get(partitionBelowThreshold);
                        split(children[0], inputSubset, outputSubset, weightSubset, height + 1);

                        // Create a branch for higher values
                        inputSubset  = inputs.Get(partitionAboveThreshold);
                        outputSubset = outputs.Get(partitionAboveThreshold);
                        weightSubset = weights.Get(partitionAboveThreshold);
                        split(children[1], inputSubset, outputSubset, weightSubset, height + 1);

                        root.Branches.AttributeIndex = maxGainAttribute;
                        root.Branches.AddRange(children);
                    }
                }
                else
                {
                    // This is a continuous nature attribute, but all variables are equal
                    // to a constant. If there is only a constant value as the predictor
                    // and there are multiple output labels associated with this constant
                    // value, there isn't much we can do. This node will be a leaf.

                    // We will set the class label for this node as the
                    // majority of the currently selected output classes.

                    var outputIndices = partitionBelowThreshold ?? partitionAboveThreshold;
                    outputSubset = outputs.Get(outputIndices);
                    root.Output  = Measures.Mode(outputSubset);
                }
            }

            // Undo the usage mark: the Join limit applies along a single path
            // from the root, so once this subtree is complete the attribute
            // becomes available to other branches again
            AttributeUsageCount[maxGainAttribute]--;
        }
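These private methods appear to belong to Accord.NET's `C45Learning` class, whose public `Learn` method drives the `split` recursion shown above. Assuming that context (the Accord.NET 3.x API), a typical entry point looks roughly like this sketch:

using Accord.MachineLearning.DecisionTrees;
using Accord.MachineLearning.DecisionTrees.Learning;

class Program
{
    static void Main()
    {
        // Toy dataset: one continuous attribute, two classes.
        double[][] inputs =
        {
            new[] { 1.0 }, new[] { 2.0 }, new[] { 8.0 }, new[] { 9.0 }
        };
        int[] outputs = { 0, 0, 1, 1 };

        // Learn() drives the recursion shown above: it calls split(),
        // which in turn scores candidate attributes via
        // computeInfoDiscrete() and computeInfoContinuous().
        var teacher = new C45Learning();
        DecisionTree tree = teacher.Learn(inputs, outputs);

        int prediction = tree.Decide(new[] { 1.5 }); // expected: class 0
    }
}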