private void normalize(DecisionTreeNode<T, D> node)
{
    // Re-weight the raw label counts at this node, if per-label weights were provided.
    if (_labelWeights != null)
    {
        for (int i = 0; i < _labelWeights.Length; i++)
        {
            node.Distribution[i] *= _labelWeights[i];
        }
    }

    // Record the (weighted) total count at this node before normalizing.
    float sum = 0;
    for (int i = 0; i < _numLabels; i++)
    {
        sum += node.Distribution[i];
    }
    node.TrainingDataCount = sum;

    // Normalize the distribution to sum to one, then compute its Shannon entropy.
    Decider<T, D>.Normalize(node.Distribution);
    float entropy = 0;
    for (int i = 0; i < LabelCount; i++)
    {
        // Skip zero-probability labels: 0 * log2(0) is taken to be 0, and
        // Math.Log(0, 2) would otherwise poison the sum with NaN.
        if (node.Distribution[i] > 0)
        {
            entropy += node.Distribution[i] * (float)Math.Log(node.Distribution[i], 2);
        }
    }
    node.Entropy = -entropy;

    // Recurse into both children of branch nodes.
    if (node.NodeType == NodeType.Branch)
    {
        normalize(node.Left);
        normalize(node.Right);
    }
}
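// Not part of the original source: a minimal standalone sketch of the entropy
// calculation used in normalize() above, i.e. Shannon entropy
// H = -sum_i p[i] * log2(p[i]) over an already-normalized distribution, with
// the usual convention that a zero-probability label contributes nothing.
private static float shannonEntropy(float[] distribution)
{
    float entropy = 0;
    for (int i = 0; i < distribution.Length; i++)
    {
        float p = distribution[i];
        if (p > 0)
        {
            entropy -= p * (float)Math.Log(p, 2);
        }
    }
    return entropy;
}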
// Constructs a branch node from a trained decider and its two subtrees.
public DecisionTreeNode(Decider<T, D> decider, DecisionTreeNode<T, D> left, DecisionTreeNode<T, D> right)
{
    _type = NodeType.Branch;
    _decider = decider;
    _left = left;
    _right = right;
}
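// Illustration only (not in the original source): the branch constructor above can be
// combined with the leaf constructor that takes a raw distribution (used in
// computeDepthFirst below) to assemble a depth-1 "stump" by hand. The decider and the
// two distributions are assumed to come from training code elsewhere in this class.
private static DecisionTreeNode<T, D> makeStump(Decider<T, D> decider,
                                                float[] leftDistribution,
                                                float[] rightDistribution)
{
    DecisionTreeNode<T, D> left = new DecisionTreeNode<T, D>(leftDistribution);
    DecisionTreeNode<T, D> right = new DecisionTreeNode<T, D>(rightDistribution);
    return new DecisionTreeNode<T, D>(decider, left, right);
}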
private static DecisionTreeNode<T, D> computeDepthFirst(
    DecisionTreeNode<T, D> node,
    List<T> data,
    IFeatureFactory<T, D> factory,
    int numFeatures,
    int numThresholds,
    int numLabels,
    float[] labelWeights,
    int depth)
{
    GC.Collect();
    if (data.Count == 0)
    {
        UpdateManager.WriteLine("No data at depth {0}", depth);
        return null;
    }

    if (data[0] is IComparable<T>)
    {
        data.Sort();
    }

    // If every point carries the same label, emit a pure leaf (a delta distribution).
    if (checkDelta(data))
    {
        UpdateManager.WriteLine("Delta function at depth {0}", depth);
        int label = data[0].Label;
        float[] dist = new float[numLabels];
        dist[label] = 1;
        return new DecisionTreeNode<T, D>(dist);
    }

    int dataCount = data.Count;
    Decider<T, D> bestDecider = null;
    float bestScore = float.MinValue;
    float[] bestLeftDistribution = null;
    float[] bestRightDistribution = null;

    // Evaluate numFeatures candidate features in parallel, keeping each thread's best
    // split, then reduce over the per-thread results.
    using (ThreadLocal<DecisionResult> results =
        new ThreadLocal<DecisionResult>(() => new DecisionResult { Score = bestScore }, true))
    {
        Parallel.For(0, numFeatures, i =>
        {
            float[] leftDistribution;
            float[] rightDistribution;
            Decider<T, D> decider = new Decider<T, D>(factory);
            decider.LoadData(data);
            float score = decider.ChooseThreshold(numThresholds, numLabels, labelWeights,
                                                  out leftDistribution, out rightDistribution);
            if (score > results.Value.Score)
            {
                results.Value = new DecisionResult
                {
                    LeftDistribution = leftDistribution,
                    RightDistribution = rightDistribution,
                    Decider = decider,
                    Score = score
                };
            }
        });

        foreach (var result in results.Values)
        {
            if (result.Score > bestScore)
            {
                bestLeftDistribution = result.LeftDistribution;
                bestRightDistribution = result.RightDistribution;
                bestDecider = result.Decider;
                bestScore = result.Score;
            }
        }
    }

    // Weighted support of this node; falls back to the raw count when no label weights are set.
    float support = 0;
    if (labelWeights != null)
    {
        foreach (T point in data)
        {
            support += labelWeights[point.Label];
        }
    }
    else
    {
        support = dataCount;
    }

    // Stop and emit a leaf when no split improved the score or the node's support is too small.
    if (bestScore == float.MinValue || support < MinimumSupport)
    {
        UpdateManager.WriteLine("Stopping due to lack of data at depth {0}, {1} < {2}", depth, support, MinimumSupport);
        float[] distribution = new float[numLabels];
        for (int i = 0; i < dataCount; i++)
        {
            distribution[data[i].Label]++;
        }
        if (labelWeights != null)
        {
            for (int i = 0; i < distribution.Length; i++)
            {
                distribution[i] *= labelWeights[i];
            }
        }
        return new DecisionTreeNode<T, D>(distribution);
    }

    // At the last allowed branch level, attach the best split's two distributions as leaves.
    if (depth == MaximumDepth - 2)
    {
        UpdateManager.WriteLine("Last branch node trained at depth {0}", depth);
        node.Left = new DecisionTreeNode<T, D>(bestLeftDistribution);
        node.Right = new DecisionTreeNode<T, D>(bestRightDistribution);
        node.NodeType = NodeType.Branch;
        node.Decider = bestDecider;
        return node;
    }

    // Partition the data with the best decider and recurse on each side.
    Decision[] decisions = bestDecider.Decide(data);
    List<T> leftData = new List<T>();
    List<T> rightData = new List<T>();
    for (int i = 0; i < decisions.Length; i++)
    {
        if (decisions[i] == Decision.Left)
        {
            leftData.Add(data[i]);
        }
        else
        {
            rightData.Add(data[i]);
        }
    }
    if (leftData.Count == 0 || rightData.Count == 0)
    {
        throw new Exception("The chosen split sent all points to one side; cannot create a branch.");
    }

    UpdateManager.WriteLine("Branch node at depth {0} trained.", depth);
    node.Left = computeDepthFirst(new DecisionTreeNode<T, D>(), leftData, factory, numFeatures,
                                  numThresholds, numLabels, labelWeights, depth + 1);
    node.Right = computeDepthFirst(new DecisionTreeNode<T, D>(), rightData, factory, numFeatures,
                                   numThresholds, numLabels, labelWeights, depth + 1);
    node.Decider = bestDecider;
    node.NodeType = NodeType.Branch;
    return node;
}
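// Assumption: the DecisionResult type referenced by the ThreadLocal reduction in
// computeDepthFirst is not shown in this listing; judging from its usage it is a
// simple per-thread record of the best split seen so far, roughly:
private class DecisionResult
{
    public float[] LeftDistribution { get; set; }
    public float[] RightDistribution { get; set; }
    public Decider<T, D> Decider { get; set; }
    public float Score { get; set; }
}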
private static DecisionTreeNode<T, D> computeBreadthFirst(
    float threshold,
    List<T> data,
    IFeatureFactory<T, D> factory,
    int numFeatures,
    int numThresholds,
    int numLabels,
    float[] labelWeights)
{
    string id = "DecisionTree.ComputeBreadthFirst";

    // Seed the frontier with a single candidate containing every training point.
    Queue<SplitCandidate> candidates = new Queue<SplitCandidate>();
    SplitCandidate start = new SplitCandidate(new List<int>(), 1, 0);
    for (int i = 0; i < data.Count; i++)
    {
        start.Indices.Add(i);
    }
    start.Entropy = calculateEntropy(data, start.Indices, labelWeights, numLabels);
    start.Support = calculateSupport(data, start.Indices, labelWeights);
    candidates.Enqueue(start);

    bool changed = true;
    float[] leftDistribution, rightDistribution;
    int tries = (int)_numberOfTries;
    float increment = threshold / tries;
    Dictionary<int, Decider<T, D>> deciders = new Dictionary<int, Decider<T, D>>();

    while (tries > 0)
    {
        // If the previous pass added no nodes, relax the gain threshold and try again.
        if (!changed)
        {
            threshold -= increment;
            UpdateManager.WriteLine("Decreasing threshold to {0}", threshold);
        }
        GC.Collect();

        // First pass: allocate scratch buffers for every candidate that is still splittable.
        int count = candidates.Count;
        for (int i = 0; i < count; i++)
        {
            SplitCandidate candidate = candidates.Dequeue();
            if (candidate.Delta)
            {
                candidates.Enqueue(candidate);
                continue;
            }
            if (MaximumDepth > 0 && candidate.Level >= MaximumDepth - 1)
            {
                candidates.Enqueue(candidate);
                continue;
            }
            if (candidate.Support < MinimumSupport)
            {
                candidates.Enqueue(candidate);
                continue;
            }
            int dataCount = candidate.Indices.Count;
            if (candidate.Values == null)
            {
                candidate.Values = new float[dataCount];
            }
            if (candidate.Labels == null)
            {
                candidate.Labels = new int[dataCount];
            }
            if (candidate.Weights == null)
            {
                candidate.Weights = new float[dataCount];
            }
            candidates.Enqueue(candidate);
        }

        // Second pass: evaluate every candidate feature against every splittable node,
        // remembering the best decider per node.
        float bestGain = float.MinValue;
        for (int k = 0; k < numFeatures; k++)
        {
            UpdateManager.RaiseProgress(k, numFeatures);
            Decider<T, D> decider = new Decider<T, D>(factory);
            decider.ApplyFeature(data);
            for (int i = 0; i < count; i++)
            {
                SplitCandidate candidate = candidates.Dequeue();
                if (MaximumDepth > 0 && candidate.Level >= MaximumDepth - 1)
                {
                    candidates.Enqueue(candidate);
                    continue;
                }
                if (candidate.Delta)
                {
                    candidates.Enqueue(candidate);
                    continue;
                }
                if (candidate.Support < MinimumSupport)
                {
                    candidates.Enqueue(candidate);
                    continue;
                }
                List<int> indices = candidate.Indices;
                int dataCount = indices.Count;
                for (int j = 0; j < dataCount; j++)
                {
                    T point = data[indices[j]];
                    candidate.Values[j] = point.FeatureValue;
                    candidate.Labels[j] = point.Label;
                    candidate.Weights[j] = point.Weight;
                }
                decider.SetData(candidate.Values, candidate.Weights, candidate.Labels);
                float gain = candidate.Entropy + decider.ChooseThreshold(numThresholds, numLabels, labelWeights,
                                                                         out leftDistribution, out rightDistribution);
                bestGain = Math.Max(gain, bestGain);
                if ((gain > threshold || candidate.Level < MinimumDepth) && gain > candidate.EntropyGain)
                {
                    candidate.EntropyGain = gain;
                    candidate.Decider = new Decider<T, D>(decider.Feature, decider.Threshold);
                }
                candidates.Enqueue(candidate);
            }
        }

        // Third pass: split every node that found an acceptable decider and enqueue its children.
        UpdateManager.WriteLine(id, "\rNodes Added:");
        changed = false;
        for (int i = 0; i < count; i++)
        {
            SplitCandidate candidate = candidates.Dequeue();
            if (candidate.Decider == null)
            {
                candidates.Enqueue(candidate);
                continue;
            }
            changed = true;
            List<int> indices = candidate.Indices;
            int dataCount = candidate.Indices.Count;
            List<T> points = new List<T>();
            for (int j = 0; j < dataCount; j++)
            {
                points.Add(data[indices[j]]);
            }
            Decision[] decisions = candidate.Decider.Decide(points);
            List<int> left = new List<int>();
            List<int> right = new List<int>();
            for (int j = 0; j < dataCount; j++)
            {
                if (decisions[j] == Decision.Left)
                {
                    left.Add(indices[j]);
                }
                else
                {
                    right.Add(indices[j]);
                }
            }
            SplitCandidate leftCandidate = new SplitCandidate(left, 2 * candidate.Index, candidate.Level + 1);
            SplitCandidate rightCandidate = new SplitCandidate(right, 2 * candidate.Index + 1, candidate.Level + 1);
            leftCandidate.Entropy = calculateEntropy(data, left, labelWeights, numLabels);
            leftCandidate.Support = calculateSupport(data, left, labelWeights);
            leftCandidate.Delta = calculateDelta(data, left);
            rightCandidate.Entropy = calculateEntropy(data, right, labelWeights, numLabels);
            rightCandidate.Support = calculateSupport(data, right, labelWeights);
            rightCandidate.Delta = calculateDelta(data, right);
            UpdateManager.WriteLine(id, "{3:00000}:{0:0.000}|{1:0.000} {2:0.000} {4}",
                                    leftCandidate.Support / candidate.Support,
                                    rightCandidate.Support / candidate.Support,
                                    candidate.EntropyGain,
                                    candidate.Index,
                                    candidate.Decider);
            deciders[candidate.Index] = candidate.Decider;
            candidates.Enqueue(leftCandidate);
            candidates.Enqueue(rightCandidate);
        }

        // When nothing was added, spend one of the remaining tries (the threshold is lowered
        // on the next pass); if no candidate could be evaluated at all, give up entirely.
        if (!changed)
        {
            UpdateManager.WriteLine("No new nodes added, best entropy gain was {0}", bestGain);
            tries--;
        }
        if (bestGain == float.MinValue)
        {
            break;
        }
    }

    // Whatever remains in the queue becomes the leaves of the tree.
    Dictionary<int, List<int>> leafIndices = new Dictionary<int, List<int>>();
    while (candidates.Count > 0)
    {
        SplitCandidate candidate = candidates.Dequeue();
        leafIndices[candidate.Index] = candidate.Indices;
    }
    return buildTree(new DecisionTreeNode<T, D>(), 1, deciders, leafIndices, data, numLabels, labelWeights);
}
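// Assumption: SplitCandidate is not shown in this listing either; the fields below are
// inferred from how computeBreadthFirst uses it (heap-style node numbering where the
// children of node i are 2*i and 2*i + 1, cached per-point buffers reused across
// features, and bookkeeping for the threshold search). Defaults are guesses.
private class SplitCandidate
{
    public List<int> Indices;     // indices into the training data that reach this node
    public int Index;             // heap-style node index within the tree
    public int Level;             // depth of the node (root = 0)
    public float Entropy;         // entropy of the node's label distribution
    public float Support;         // weighted point count at the node
    public bool Delta;            // true when every point at the node shares one label
    public float EntropyGain;     // best gain found for the node so far
    public Decider<T, D> Decider; // decider achieving that gain, if any
    public float[] Values;        // scratch buffers filled per candidate feature
    public int[] Labels;
    public float[] Weights;

    public SplitCandidate(List<int> indices, int index, int level)
    {
        Indices = indices;
        Index = index;
        Level = level;
        EntropyGain = float.MinValue; // assumed sentinel so any evaluated gain can replace it
    }
}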
public DeciderState(IFeatureFactory<T, D> factory)
{
    // Start with a fresh decider and a sentinel energy so any real evaluation improves on it.
    Current = new Decider<T, D>(factory);
    BestEnergy = float.MaxValue;
}