/// <summary>
/// Fits a single threshold split on one column of <paramref name="train"/>, or marks this
/// node as a leaf when the labels are already uniform or no usable split exists.
/// </summary>
/// <remarks>
/// For every column the rows are ordered by that column's value. Each midpoint between two
/// neighbouring, distinguishable values is scored as (largest label count on the "less" side)
/// + (largest label count on the "more" side). The first candidate with the strictly highest
/// score wins; its split value, column and per-side classifications are written to this
/// instance as soon as it is found.
/// </remarks>
/// <param name="train">Training set providing labels, the data matrix and per-label counts.</param>
public void Train(Datas.Useable train)
{
    var labelTotals = train.getLabelCounts();
    int largestClass = labelTotals.Values.Max();

    // If the largest class accounts for every row, there is nothing left to separate.
    bool hasMixedLabels = labelTotals.Values.Sum() != largestClass;
    if (hasMixedLabels)
    {
        var pairs = new Tuple<float, float>[train._Labels.Count];
        int columnCount = train._CountColumns;
        int rowCount = train._CountRows;
        int topScore = int.MinValue;

        for (int column = 0; column < columnCount; column++)
        {
            // Item1 = row label, Item2 = row value in the current column.
            for (int row = 0; row < rowCount; row++)
            {
                pairs[row] = new Tuple<float, float>(train._Labels[row], train._Data[row, column]);
            }
            Array.Sort(pairs, (x, y) => x.Item2.CompareTo(y.Item2));

            // Running label counts on each side of the moving threshold:
            // everything starts on the "more" side and migrates row by row.
            var belowCounts = new Dictionary<float, int>();
            var aboveCounts = new Dictionary<float, int>();
            foreach (var entry in labelTotals)
            {
                belowCounts[entry.Key] = 0;
                aboveCounts[entry.Key] = entry.Value;
            }

            // 'next' indexes the right-hand neighbour; the row at next - 1 moves to the "less" side.
            for (int next = 1; next < rowCount; next++)
            {
                var moved = pairs[next - 1];
                float movedLabel = moved.Item1;
                float lowValue = moved.Item2;
                belowCounts[movedLabel] += 1;
                aboveCounts[movedLabel] -= 1;

                float highValue = pairs[next].Item2;
                float threshold = (lowValue + highValue) / 2;

                // A threshold is only usable when it actually falls between the two neighbours;
                // identical values (or a midpoint that rounds onto the lower one) are skipped.
                bool separates = (lowValue < threshold) != (highValue < threshold);
                if (separates)
                {
                    int score = belowCounts.Values.Max() + aboveCounts.Values.Max();
                    if (score > topScore)
                    {
                        topScore = score;
                        this._BranchSplitValue = threshold;
                        this._BranchColumn = column;
                        // ArgMax presumably yields the label with the highest count -- confirm against its definition.
                        this._BranchLessClassification = belowCounts.ArgMax();
                        this._BranchMoreClassification = aboveCounts.ArgMax();
                    }
                }
            }
        }

        // A score was recorded, so the branch fields above describe a valid split.
        if (topScore != int.MinValue)
        {
            return;
        }
    }

    // Uniform labels or no separating threshold in any column: become a leaf.
    this._LeafClassification = labelTotals.ArgMax();
    this._IsLeaf = true;
}
/// <summary>
/// Trains this node on <paramref name="train"/> and, when a split is found, recursively
/// trains two child nodes on the resulting partitions.
/// </summary>
/// <remarks>
/// The node becomes a leaf when all rows share one label, when <c>_MaxDepth</c> equals
/// <paramref name="current_depth"/>, or when no column offers a separating threshold.
/// Otherwise every column is scanned in value order and each midpoint between two
/// neighbouring, distinguishable values is scored by the row-share-weighted sum of the
/// <c>Entropy()</c> of the label counts on each side; the first candidate with the strictly
/// lowest score is used.
/// </remarks>
/// <param name="train">Training set providing labels, the data matrix and per-label counts.</param>
/// <param name="current_depth">Depth of this node; children are trained with this value plus one.</param>
public void Train(Datas.Useable train, int current_depth)
{
    var labelTotals = train.getLabelCounts();
    int largestClass = labelTotals.Values.Max();

    // Search for a split only when labels are mixed and the depth limit has not been hit
    // (the limit is an equality test against _MaxDepth).
    bool keepSplitting =
        (labelTotals.Values.Sum() != largestClass) &&
        (this._MaxDepth != current_depth);

    if (keepSplitting)
    {
        var pairs = new Tuple<float, float>[train._Labels.Count];
        int columnCount = train._CountColumns;
        int rowCount = train._CountRows;
        double lowestEntropy = double.MaxValue;
        int chosenColumn = -1;
        float chosenThreshold = -1;

        for (int column = 0; column < columnCount; column++)
        {
            // Item1 = row label, Item2 = row value in the current column.
            for (int row = 0; row < rowCount; row++)
            {
                pairs[row] = new Tuple<float, float>(train._Labels[row], train._Data[row, column]);
            }
            Array.Sort(pairs, (x, y) => x.Item2.CompareTo(y.Item2));

            // Running label counts on each side of the moving threshold:
            // everything starts on the "more" side and migrates row by row.
            var belowCounts = new Dictionary<float, int>();
            var aboveCounts = new Dictionary<float, int>();
            foreach (var entry in labelTotals)
            {
                belowCounts[entry.Key] = 0;
                aboveCounts[entry.Key] = entry.Value;
            }

            // 'next' indexes the right-hand neighbour; it also equals the number of rows
            // on the "less" side once the row at next - 1 has been moved over.
            for (int next = 1; next < rowCount; next++)
            {
                var moved = pairs[next - 1];
                float movedLabel = moved.Item1;
                float lowValue = moved.Item2;
                belowCounts[movedLabel] += 1;
                aboveCounts[movedLabel] -= 1;

                float highValue = pairs[next].Item2;
                float threshold = (lowValue + highValue) / 2;

                // A threshold is only usable when it actually falls between the two neighbours;
                // identical values (or a midpoint that rounds onto the lower one) are skipped.
                bool separates = (lowValue < threshold) != (highValue < threshold);
                if (separates)
                {
                    double belowShare = (double)next / rowCount;
                    double aboveShare = 1 - belowShare;
                    double weighted = belowShare * belowCounts.Values.Entropy() + aboveShare * aboveCounts.Values.Entropy();
                    if (weighted < lowestEntropy)
                    {
                        lowestEntropy = weighted;
                        chosenThreshold = threshold;
                        chosenColumn = column;
                    }
                }
            }
        }

        if (chosenColumn != -1)
        {
            this._BranchSplitValue = chosenThreshold;
            this._BranchColumn = chosenColumn;

            // Partition the training set on the chosen column/threshold.
            Datas.Useable lowerPart, upperPart;
            train.Split(
                this._BranchColumn,
                this._BranchSplitValue,
                out lowerPart,
                out upperPart);

            // Children inherit the same depth limit and are trained one level deeper.
            this._BranchLess = new DecisionTree(this._MaxDepth);
            this._BranchMore = new DecisionTree(this._MaxDepth);
            this._BranchLess.Train(lowerPart, current_depth + 1);
            this._BranchMore.Train(upperPart, current_depth + 1);
            return;
        }
    }

    // Uniform labels, depth limit reached, or no separating threshold: become a leaf.
    // ArgMax presumably yields the label with the highest count -- confirm against its definition.
    this._LeafClassification = labelTotals.ArgMax();
    this._IsLeaf = true;
}