Example #1
0
        /// <summary>
        /// Recursively normalizes <paramref name="node"/> and its subtree: applies the optional
        /// per-label weights, records the (weighted) sample total in TrainingDataCount,
        /// rescales the distribution to sum to one, and stores its Shannon entropy (base 2).
        /// </summary>
        /// <param name="node">Root of the subtree to normalize; branches recurse into both children.</param>
        private void normalize(DecisionTreeNode <T, D> node)
        {
            // Apply per-label weighting when the tree was trained with weights.
            if (_labelWeights != null)
            {
                for (int i = 0; i < _labelWeights.Length; i++)
                {
                    node.Distribution[i] *= _labelWeights[i];
                }
            }
            float sum = 0;

            for (int i = 0; i < _numLabels; i++)
            {
                sum += node.Distribution[i];
            }
            node.TrainingDataCount = sum;
            Decider <T, D> .Normalize(node.Distribution);

            float entropy = 0;

            for (int i = 0; i < LabelCount; i++)
            {
                // Guard against zero probabilities: Math.Log(0, 2) is -Infinity and
                // 0 * -Infinity is NaN, which would poison node.Entropy. By the standard
                // convention 0 * log(0) contributes 0 to entropy, so we simply skip it.
                if (node.Distribution[i] > 0)
                {
                    entropy += node.Distribution[i] * (float)Math.Log(node.Distribution[i], 2);
                }
            }
            node.Entropy = -entropy;
            if (node.NodeType == NodeType.Branch)
            {
                normalize(node.Left);
                normalize(node.Right);
            }
        }
Example #2
0
 /// <summary>
 /// Creates a branch node that routes samples with <paramref name="decider"/> into
 /// the <paramref name="left"/> or <paramref name="right"/> subtree.
 /// </summary>
 /// <param name="decider">Decision function applied at this node.</param>
 /// <param name="left">Subtree receiving samples the decider sends left.</param>
 /// <param name="right">Subtree receiving samples the decider sends right.</param>
 public DecisionTreeNode(Decider <T, D> decider, DecisionTreeNode <T, D> left, DecisionTreeNode <T, D> right)
 {
     _decider = decider;
     _left = left;
     _right = right;
     _type = NodeType.Branch;
 }
Example #3
0
        /// <summary>
        /// Recursively trains a decision (sub)tree depth-first. For each call it evaluates
        /// <paramref name="numFeatures"/> candidate features in parallel, keeps the decider
        /// with the best threshold score, splits the data, and recurses on both partitions.
        /// </summary>
        /// <param name="node">Node to train in place; also the return value on success.</param>
        /// <param name="data">Training points routed to this node.</param>
        /// <param name="factory">Factory producing candidate features.</param>
        /// <param name="numFeatures">Number of candidate features to sample at this node.</param>
        /// <param name="numThresholds">Number of thresholds tried per feature.</param>
        /// <param name="numLabels">Total number of class labels.</param>
        /// <param name="labelWeights">Optional per-label weights; null for unweighted training.</param>
        /// <param name="depth">Current depth, used for logging and the maximum-depth stop.</param>
        /// <returns>The trained node, a leaf node at a stopping condition, or null when no data remains.</returns>
        private static DecisionTreeNode <T, D> computeDepthFirst(DecisionTreeNode <T, D> node, List <T> data, IFeatureFactory <T, D> factory, int numFeatures, int numThresholds, int numLabels, float[] labelWeights, int depth)
        {
            // NOTE(review): explicit GC.Collect() per recursion is an anti-pattern and likely
            // hurts throughput; kept for behavioral parity — confirm before removing.
            GC.Collect();
            if (data.Count == 0)
            {
                UpdateManager.WriteLine("No data at depth {0}", depth);
                return(null);
            }
            if (data[0] is IComparable <T> )
            {
                data.Sort();
            }
            // Pure node: all points share one label, so emit a delta-distribution leaf.
            if (checkDelta(data))
            {
                UpdateManager.WriteLine("Delta function at depth {0}", depth);
                int     label = data[0].Label;
                float[] dist  = new float[numLabels];
                dist[label] = 1;
                return(new DecisionTreeNode <T, D>(dist));
            }
            int            dataCount   = data.Count;
            Decider <T, D> bestDecider = null;
            float          bestScore   = float.MinValue;

            float[] bestLeftDistribution  = null;
            float[] bestRightDistribution = null;
            // Each worker thread tracks its own best candidate; the per-thread winners are
            // merged sequentially below via ThreadLocal.Values (trackAllValues: true).
            using (ThreadLocal <DecisionResult> results = new ThreadLocal <DecisionResult>(() => new DecisionResult {
                Score = bestScore
            }, true))
            {
                Parallel.For(0, numFeatures, i =>
                {
                    float[] leftDistribution;
                    float[] rightDistribution;
                    Decider <T, D> decider = new Decider <T, D>(factory);
                    decider.LoadData(data);
                    float score = decider.ChooseThreshold(numThresholds, numLabels, labelWeights, out leftDistribution, out rightDistribution);
                    if (score > results.Value.Score)
                    {
                        results.Value = new DecisionResult {
                            LeftDistribution = leftDistribution, RightDistribution = rightDistribution, Decider = decider, Score = score
                        };
                    }
                });
                // Sequential reduction over the per-thread bests.
                foreach (var result in results.Values)
                {
                    if (result.Score > bestScore)
                    {
                        bestLeftDistribution  = result.LeftDistribution;
                        bestRightDistribution = result.RightDistribution;
                        bestDecider           = result.Decider;
                        bestScore             = result.Score;
                    }
                }
            }

            // Support is the weighted sample count when weights are supplied.
            float support = 0;

            if (labelWeights != null)
            {
                foreach (T point in data)
                {
                    support += labelWeights[point.Label];
                }
            }
            else
            {
                support = dataCount;
            }
            // bestScore == float.MinValue is an exact sentinel compare (no feature scored),
            // not a floating-point tolerance bug.
            if (bestScore == float.MinValue || dataCount < MinimumSupport)
            {
                UpdateManager.WriteLine("Stopping due to lack of data at depth {0}, {1} < {2}", depth, dataCount, MinimumSupport);
                float[] distribution = new float[numLabels];
                for (int i = 0; i < dataCount; i++)
                {
                    distribution[data[i].Label]++;
                }
                if (labelWeights != null)
                {
                    for (int i = 0; i < distribution.Length; i++)
                    {
                        distribution[i] *= labelWeights[i];
                    }
                }
                return(new DecisionTreeNode <T, D>(distribution));
            }
            // One level before the depth limit: children become leaves holding the
            // distributions produced by the best split, instead of recursing.
            if (depth == MaximumDepth - 2)
            {
                UpdateManager.WriteLine("Last branch node trained at depth {0}", depth);
                node.Left     = new DecisionTreeNode <T, D>(bestLeftDistribution);
                node.Right    = new DecisionTreeNode <T, D>(bestRightDistribution);
                node.NodeType = NodeType.Branch;
                node.Decider  = bestDecider;
                return(node);
            }
            Decision[] decisions = bestDecider.Decide(data);
            List <T>   leftData  = new List <T>();
            List <T>   rightData = new List <T>();

            for (int i = 0; i < decisions.Length; i++)
            {
                if (decisions[i] == Decision.Left)
                {
                    leftData.Add(data[i]);
                }
                else
                {
                    rightData.Add(data[i]);
                }
            }
            if (leftData.Count == 0 || rightData.Count == 0)
            {
                // A decider that scored best must separate the data; an empty side means the
                // threshold search produced an unusable split.
                throw new InvalidOperationException(
                          string.Format("Best decider at depth {0} produced a degenerate split ({1} left, {2} right).", depth, leftData.Count, rightData.Count));
            }
            UpdateManager.WriteLine("Branch node at depth {0} trained.", depth);
            node.Left     = computeDepthFirst(new DecisionTreeNode <T, D>(), leftData, factory, numFeatures, numThresholds, numLabels, labelWeights, depth + 1);
            node.Right    = computeDepthFirst(new DecisionTreeNode <T, D>(), rightData, factory, numFeatures, numThresholds, numLabels, labelWeights, depth + 1);
            node.Decider  = bestDecider;
            node.NodeType = NodeType.Branch;
            return(node);
        }
Example #4
0
        /// <summary>
        /// Trains a decision tree breadth-first. All open leaves ("split candidates") live in a
        /// queue; each outer iteration evaluates numFeatures features against every candidate,
        /// records the best split per candidate, then replaces split candidates with their two
        /// children. When no candidate improves, the gain threshold is lowered and retried until
        /// the retry budget (_numberOfTries) is exhausted. Remaining queue entries become leaves.
        /// NOTE(review): the queue is rotated by dequeuing and re-enqueuing exactly `count`
        /// entries per pass — the relative order of candidates is load-bearing across the three
        /// passes of each iteration.
        /// </summary>
        /// <param name="threshold">Initial minimum entropy gain required to accept a split.</param>
        /// <param name="data">All training points; candidates reference them by index.</param>
        /// <param name="factory">Factory producing candidate features.</param>
        /// <param name="numFeatures">Number of features evaluated per iteration.</param>
        /// <param name="numThresholds">Number of thresholds tried per feature.</param>
        /// <param name="numLabels">Total number of class labels.</param>
        /// <param name="labelWeights">Optional per-label weights; null for unweighted training.</param>
        /// <returns>The root of the trained tree, assembled by buildTree from the recorded deciders.</returns>
        private static DecisionTreeNode <T, D> computeBreadthFirst(float threshold, List <T> data, IFeatureFactory <T, D> factory, int numFeatures, int numThresholds, int numLabels, float[] labelWeights)
        {
            string id = "DecisionTree.ComputeBreadthFirst";
            Queue <SplitCandidate> candidates = new Queue <SplitCandidate>();
            // Root candidate (index 1, level 0) covers every data point.
            SplitCandidate         start      = new SplitCandidate(new List <int>(), 1, 0);

            for (int i = 0; i < data.Count; i++)
            {
                start.Indices.Add(i);
            }
            start.Entropy = calculateEntropy(data, start.Indices, labelWeights, numLabels);
            start.Support = calculateSupport(data, start.Indices, labelWeights);
            candidates.Enqueue(start);
            bool changed = true;

            float[] leftDistribution, rightDistribution;
            int     tries     = (int)_numberOfTries;
            // Each failed retry lowers the threshold by this amount, reaching ~0 on the last try.
            float   increment = threshold / tries;
            // Deciders keyed by heap-style node index (left child = 2i, right child = 2i + 1).
            Dictionary <int, Decider <T, D> > deciders = new Dictionary <int, Decider <T, D> >();

            while (tries > 0)
            {
                // Only relax the threshold after an iteration that added no nodes.
                if (!changed)
                {
                    threshold -= increment;
                    UpdateManager.WriteLine("Decreasing threshold to {0}", threshold);
                }
                GC.Collect();
                int count = candidates.Count;
                // Pass 1: rotate the queue once, lazily allocating per-candidate scratch
                // buffers for candidates that are still eligible to split.
                for (int i = 0; i < count; i++)
                {
                    SplitCandidate candidate = candidates.Dequeue();

                    // Pure node — nothing to split.
                    if (candidate.Delta)
                    {
                        candidates.Enqueue(candidate);
                        continue;
                    }
                    // At the depth limit.
                    if (MaximumDepth > 0 && candidate.Level >= MaximumDepth - 1)
                    {
                        candidates.Enqueue(candidate);
                        continue;
                    }
                    // Too little (weighted) data.
                    if (candidate.Support < MinimumSupport)
                    {
                        candidates.Enqueue(candidate);
                        continue;
                    }
                    int dataCount = candidate.Indices.Count;
                    if (candidate.Values == null)
                    {
                        candidate.Values = new float[dataCount];
                    }
                    if (candidate.Labels == null)
                    {
                        candidate.Labels = new int[dataCount];
                    }
                    if (candidate.Weights == null)
                    {
                        candidate.Weights = new float[dataCount];
                    }

                    candidates.Enqueue(candidate);
                }
                float bestGain = float.MinValue;
                // Pass 2: for each sampled feature, rotate the queue once and score the
                // feature's best threshold against every eligible candidate.
                for (int k = 0; k < numFeatures; k++)
                {
                    UpdateManager.RaiseProgress(k, numFeatures);
                    Decider <T, D> decider = new Decider <T, D>(factory);
                    // Compute this feature's value for every data point once, up front.
                    decider.ApplyFeature(data);
                    for (int i = 0; i < count; i++)
                    {
                        SplitCandidate candidate = candidates.Dequeue();
                        if (MaximumDepth > 0 && candidate.Level >= MaximumDepth - 1)
                        {
                            candidates.Enqueue(candidate);
                            continue;
                        }
                        if (candidate.Delta)
                        {
                            candidates.Enqueue(candidate);
                            continue;
                        }
                        if (candidate.Support < MinimumSupport)
                        {
                            candidates.Enqueue(candidate);
                            continue;
                        }
                        List <int> indices   = candidate.Indices;
                        int        dataCount = indices.Count;
                        // Gather this candidate's slice of the data into its scratch buffers.
                        for (int j = 0; j < dataCount; j++)
                        {
                            T point = data[indices[j]];
                            candidate.Values[j]  = point.FeatureValue;
                            candidate.Labels[j]  = point.Label;
                            candidate.Weights[j] = point.Weight;
                        }
                        decider.SetData(candidate.Values, candidate.Weights, candidate.Labels);
                        // Gain = parent entropy plus the (negative) post-split score term.
                        float gain = candidate.Entropy + decider.ChooseThreshold(numThresholds, numLabels, labelWeights, out leftDistribution, out rightDistribution);
                        bestGain = Math.Max(gain, bestGain);
                        // Accept if over threshold (or still under the minimum depth) and
                        // better than any split already recorded for this candidate.
                        if ((gain > threshold || candidate.Level < MinimumDepth) && gain > candidate.EntropyGain)
                        {
                            candidate.EntropyGain = gain;
                            candidate.Decider     = new Decider <T, D>(decider.Feature, decider.Threshold);
                        }
                        candidates.Enqueue(candidate);
                    }
                }
                UpdateManager.WriteLine(id, "\rNodes Added:");
                changed = false;
                // Pass 3: rotate the queue once more, materializing each accepted split into
                // two child candidates and recording its decider.
                for (int i = 0; i < count; i++)
                {
                    SplitCandidate candidate = candidates.Dequeue();
                    if (candidate.Decider == null)
                    {
                        candidates.Enqueue(candidate);
                        continue;
                    }
                    changed = true;
                    List <int> indices   = candidate.Indices;
                    int        dataCount = candidate.Indices.Count;
                    List <T>   points    = new List <T>();
                    for (int j = 0; j < dataCount; j++)
                    {
                        points.Add(data[indices[j]]);
                    }
                    Decision[] decisions = candidate.Decider.Decide(points);
                    List <int> left      = new List <int>();
                    List <int> right     = new List <int>();
                    for (int j = 0; j < dataCount; j++)
                    {
                        if (decisions[j] == Decision.Left)
                        {
                            left.Add(indices[j]);
                        }
                        else
                        {
                            right.Add(indices[j]);
                        }
                    }
                    // Children use heap-style indices: 2i and 2i + 1.
                    SplitCandidate leftCandidate  = new SplitCandidate(left, 2 * candidate.Index, candidate.Level + 1);
                    SplitCandidate rightCandidate = new SplitCandidate(right, 2 * candidate.Index + 1, candidate.Level + 1);
                    leftCandidate.Entropy  = calculateEntropy(data, left, labelWeights, numLabels);
                    leftCandidate.Support  = calculateSupport(data, left, labelWeights);
                    leftCandidate.Delta    = calculateDelta(data, left);
                    rightCandidate.Entropy = calculateEntropy(data, right, labelWeights, numLabels);
                    rightCandidate.Support = calculateSupport(data, right, labelWeights);
                    rightCandidate.Delta   = calculateDelta(data, right);
                    UpdateManager.WriteLine(id, "{3:00000}:{0:0.000}|{1:0.000} {2:0.000} {4}", leftCandidate.Support / candidate.Support, rightCandidate.Support / candidate.Support, candidate.EntropyGain, candidate.Index, candidate.Decider);
                    deciders[candidate.Index] = candidate.Decider;
                    candidates.Enqueue(leftCandidate);
                    candidates.Enqueue(rightCandidate);
                }
                if (!changed)
                {
                    UpdateManager.WriteLine("No new nodes added, best entropy gain was {0}", bestGain);
                    tries--;
                }
                // Sentinel compare: no candidate was scored at all, so further retries are futile.
                if (bestGain == float.MinValue)
                {
                    break;
                }
            }
            // Whatever remains in the queue becomes the leaves of the tree.
            Dictionary <int, List <int> > leafIndices = new Dictionary <int, List <int> >();

            while (candidates.Count > 0)
            {
                SplitCandidate candidate = candidates.Dequeue();
                leafIndices[candidate.Index] = candidate.Indices;
            }
            return(buildTree(new DecisionTreeNode <T, D>(), 1, deciders, leafIndices, data, numLabels, labelWeights));
        }
Example #5
0
 /// <summary>
 /// Initializes a fresh decider-search state: a new candidate decider built from
 /// <paramref name="factory"/> and a worst-possible best energy, so any real
 /// candidate evaluated later will improve on it.
 /// </summary>
 /// <param name="factory">Factory used to create the initial candidate decider.</param>
 public DeciderState(IFeatureFactory <T, D> factory)
 {
     BestEnergy = float.MaxValue;
     Current = new Decider <T, D>(factory);
 }