private void bwLoadData_DoWork(object sender, DoWorkEventArgs e)
{
    var args = e.Argument as ToBackgroundWorkerArgs;

    // Build the projection matrix for the selected pre-processing transform.
    var mat = Matrix<float>.Build.DenseOfColumns(
        this.GetForMatrix(args._PreProcessTransform._Transform, args._Count));

    // Project each dataset through the transform; labels pass through unchanged.
    var du = new Datas.Useable[args._PreProcessTransform._Data.Length];
    for (int i = 0; i < du.Length; i++)
    {
        du[i] = new Datas.Useable(
            args._PreProcessTransform._Data[i]._Data * mat,
            args._PreProcessTransform._Data[i]._Labels);
    }

    if (this.bwLoadData.CancellationPending)
    {
        e.Result = null;
    }
    else
    {
        e.Result = du;
    }
}
public void SetData(Datas.Useable train, Datas.Useable test)
{
    this._Train = train;
    this._Test = test;
    this._Loaded = true;
    this.LoadData();
}
private void bwLoadData_DoWork(object sender, DoWorkEventArgs e)
{
    try
    {
        var args = e.Argument as ToBackgroundWorkerArgs;
        var train = args._Data[0];

        // Balance classes by oversampling: every label is repeated until it
        // matches the most frequent label, capped at 5x the original row count.
        var label_counts = train.getLabelCounts();
        var max_labels = label_counts.Values.Max();
        var total_rows = max_labels * label_counts.Count;
        if (total_rows > 5 * train._CountRows)
        {
            max_labels = 5 * train._CountRows / label_counts.Count;
            total_rows = max_labels * label_counts.Count;
        }

        var new_train_data = Matrix<float>.Build.Dense(total_rows, train._CountColumns);
        var new_train_labels = Vector<float>.Build.Dense(total_rows);

        int new_dex = 0;
        foreach (var key in label_counts.Keys)
        {
            int used = 0;
            int old_dex = 0;
            while (used < max_labels)
            {
                if (train._Labels[old_dex] == key)
                {
                    new_train_labels[new_dex] = key;
                    new_train_data.SetRow(new_dex, train._Data.Row(old_dex));
                    new_dex++;
                    used++;
                }
                // Wrap around so minority classes are sampled repeatedly.
                old_dex = (old_dex + 1) % train._CountRows;
            }
        }

        if (this.bwLoadData.CancellationPending)
        {
            e.Result = null;
        }
        else
        {
            var ret = args._Data.Clone() as Datas.Useable[];
            ret[0] = new Datas.Useable(new_train_data, new_train_labels);
            e.Result = ret;
        }
    }
    catch (Exception exc)
    {
        e.Result = exc.ToString();
    }
}
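// Hedged sketch: getLabelCounts() is used above and by the tree trainers
// below but is not part of this listing. Presumably it tallies how many
// rows carry each distinct label; the body below is an assumption, not
// the original implementation.
public Dictionary<float, int> getLabelCounts()
{
    var counts = new Dictionary<float, int>();
    foreach (var label in this._Labels)
    {
        int count;
        if (!counts.TryGetValue(label, out count))
        {
            count = 0;
        }
        counts[label] = count + 1; // one more row with this label
    }
    return counts;
}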
public ConfusionMatrix(Func<float[], float> func, Datas.Useable data)
{
    // Point.X is the actual label; Point.Y is the predicted label.
    var counts = new Dictionary<PointF, int>();
    int rows = data._CountRows;
    int cols = data._CountColumns;
    int count = 0;
    var parameters = new float[cols];

    // Tally (actual, predicted) pairs over every row.
    for (int r = 0; r < rows; r++)
    {
        for (int c = 0; c < cols; c++)
        {
            parameters[c] = data._Data[r, c];
        }
        var key = new PointF(data._Labels[r], func(parameters));
        if (!counts.TryGetValue(key, out count))
        {
            count = 0;
        }
        counts[key] = ++count;
    }

    // Collect every label seen as actual or predicted, and index them in sorted order.
    var hs = new HashSet<float>();
    foreach (var pt in counts.Keys)
    {
        hs.Add(pt.X);
        hs.Add(pt.Y);
    }
    this._KeysIndexToValues = hs.ToArray();
    Array.Sort(this._KeysIndexToValues);

    this._KeysValuesToIndex = new Dictionary<float, int>();
    count = 0;
    foreach (var value in this._KeysIndexToValues)
    {
        this._KeysValuesToIndex[value] = count++;
    }

    // Fill the square matrix: rows are actual labels, columns are predictions.
    this._Matrix = Matrix<float>.Build.Dense(
        this._KeysIndexToValues.Length, this._KeysIndexToValues.Length, 0);
    foreach (var kvp in counts)
    {
        this._Matrix[this._KeysValuesToIndex[kvp.Key.X],
                     this._KeysValuesToIndex[kvp.Key.Y]] = kvp.Value;
    }
    this._TotalPoints = rows;
}
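// Hedged sketch of how the confusion matrix might be consumed: overall
// accuracy is the trace (diagonal sum) divided by the total point count.
// This Accuracy method is illustrative and not part of the original class.
public float Accuracy()
{
    float correct = 0;
    for (int i = 0; i < this._KeysIndexToValues.Length; i++)
    {
        correct += this._Matrix[i, i]; // diagonal: actual == predicted
    }
    return correct / this._TotalPoints;
}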
public ToBackgroundWorkerArgsTree(Datas.Useable train, Datas.Useable test, int max_depth)
{
    this._Train = train;
    this._Test = test;
    this._MaxDepth = max_depth;
}

public ToBackgroundWorkerkNN(Datas.Useable train, Datas.Useable test, int kNN)
{
    this._Train = train;
    this._Test = test;
    this._kNN = kNN;
}

public ToBackgroundWorkerArgsForest(Datas.Useable train, Datas.Useable test, int max_depth, int tree_count)
{
    this._Train = train;
    this._Test = test;
    this._MaxDepth = max_depth;
    this._TreeCount = tree_count;
}

public ToBackgroundWorkerArgsAdaBoost(Datas.Useable train, Datas.Useable test, Func<Classifiers.BoostableClassifiers.BoostableClassifier> f, int boosts)
{
    this._Train = train;
    this._Test = test;
    this._Factory = f;
    this._Boosts = boosts;
}
/// <summary>
/// Split the data on a single column, as in a decision tree.
/// </summary>
/// <param name="split_column">Index of the column to split on.</param>
/// <param name="split_value">Threshold separating the two branches.</param>
/// <param name="less">Rows where data[r, split_column] &lt; split_value.</param>
/// <param name="more">All remaining rows.</param>
internal void Split(
    int split_column,
    float split_value,
    out Datas.Useable less,
    out Datas.Useable more)
{
    // First pass: count how many rows fall on each side of the split.
    int count_less = 0;
    int rows = this._CountRows;
    int cols = this._CountColumns;
    for (int r = 0; r < rows; r++)
    {
        if (this._Data[r, split_column] < split_value)
        {
            count_less++;
        }
    }
    int count_more = rows - count_less;

    Vector<float> labels_less = Vector<float>.Build.Dense(count_less);
    Matrix<float> data_less = Matrix<float>.Build.Dense(count_less, cols);
    Vector<float> labels_more = Vector<float>.Build.Dense(count_more);
    Matrix<float> data_more = Matrix<float>.Build.Dense(count_more, cols);

    // Second pass: copy each row and its label to the matching side.
    int dex_less = 0;
    int dex_more = 0;
    for (int r = 0; r < rows; r++)
    {
        if (this._Data[r, split_column] < split_value)
        {
            data_less.SetRow(dex_less, this._Data.Row(r));
            labels_less[dex_less] = this._Labels[r];
            dex_less++;
        }
        else
        {
            data_more.SetRow(dex_more, this._Data.Row(r));
            labels_more[dex_more] = this._Labels[r];
            dex_more++;
        }
    }

    less = new Datas.Useable(data_less, labels_less);
    more = new Datas.Useable(data_more, labels_more);
}
/// <summary>
/// Randomly split the data by percentage.
/// </summary>
/// <param name="percent_first">Fraction of rows (0..1) assigned to first.</param>
/// <param name="first">Receives at least one row.</param>
/// <param name="second">Receives the remaining rows.</param>
public void Split(
    float percent_first,
    out Datas.Useable first,
    out Datas.Useable second)
{
    int count_first = Math.Max(1, (int)Math.Round(percent_first * this._CountRows));
    this.Split(count_first, out first, out second);
}
/// <summary>
/// Randomly split the data by row count.
/// </summary>
/// <param name="count_first">Number of rows assigned to first.</param>
/// <param name="first">Receives count_first randomly chosen rows.</param>
/// <param name="second">Receives the remaining rows.</param>
public void Split(
    int count_first,
    out Datas.Useable first,
    out Datas.Useable second)
{
    int rows = this._CountRows;
    int cols = this._CountColumns;
    int count_second = rows - count_first;

    // bools[r] is true for rows that go to first.
    var bools = Util.PickRandom(count_first, count_second);

    Vector<float> labels_first = Vector<float>.Build.Dense(count_first);
    Vector<float> labels_second = Vector<float>.Build.Dense(count_second);
    Matrix<float> data_first = Matrix<float>.Build.Dense(count_first, cols);
    Matrix<float> data_second = Matrix<float>.Build.Dense(count_second, cols);

    int dex_first = 0;
    int dex_second = 0;
    for (int r = 0; r < rows; r++)
    {
        if (bools[r])
        {
            data_first.SetRow(dex_first, this._Data.Row(r));
            labels_first[dex_first] = this._Labels[r];
            dex_first++;
        }
        else
        {
            data_second.SetRow(dex_second, this._Data.Row(r));
            labels_second[dex_second] = this._Labels[r];
            dex_second++;
        }
    }

    first = new Datas.Useable(data_first, labels_first);
    second = new Datas.Useable(data_second, labels_second);
}
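// Hedged sketch: Util.PickRandom is used above and by the forest trainer
// below but is not shown in this listing. The assumed contract is a bool
// array with count_true trues and count_false falses in random order; a
// Fisher-Yates shuffle is one way to satisfy it. The ref overload used by
// the forest presumably does the same while reusing the caller's buffer.
public static bool[] PickRandom(int count_true, int count_false)
{
    var rand = new Random();
    var bools = new bool[count_true + count_false];
    for (int i = 0; i < count_true; i++)
    {
        bools[i] = true;
    }
    for (int i = bools.Length - 1; i > 0; i--) // Fisher-Yates shuffle
    {
        int j = rand.Next(i + 1);
        bool tmp = bools[i];
        bools[i] = bools[j];
        bools[j] = tmp;
    }
    return bools;
}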
private void bwLoadData_DoWork(object sender, DoWorkEventArgs e)
{
    var args = e.Argument as ToBackgroundWorkerArgs;

    // Split assigns the _PercentTest fraction to its first out parameter,
    // so data[1] holds the test portion and data[0] the training portion.
    var data = new Datas.Useable[2];
    args._Data.Split(args._PercentTest, out data[1], out data[0]);

    if (this.bwLoadData.CancellationPending)
    {
        e.Result = null;
    }
    else
    {
        e.Result = data;
    }
}
public void Train(Datas.Useable train)
{
    int rows = train._CountRows;
    int cols = train._CountColumns;
    int count = Math.Max(2, (int)Math.Round(this._HoldOut * train._CountRows));

    // These buffers are reused for every tree; DecisionTree.Train copies
    // what it needs, so the next iteration may overwrite them safely.
    Matrix<float> data = Matrix<float>.Build.Dense(count, cols);
    Vector<float> labels = Vector<float>.Build.Dense(count);
    var subset = new Datas.Useable(data, labels);

    bool[] bools = null;
    for (int i = 0; i < this._TreeCount; i++)
    {
        // Draw a fresh random subset of `count` rows for this tree.
        int good_dex = 0;
        int main_dex = 0;
        Util.PickRandom(count, rows - count, ref bools);
        foreach (var b in bools)
        {
            if (b)
            {
                data.SetRow(good_dex, train._Data.Row(main_dex));
                labels[good_dex] = train._Labels[main_dex];
                good_dex++;
            }
            main_dex++;
        }
        this._Trees[i] = new DecisionTree(this._MaxDepth);
        this._Trees[i].Train(subset);
    }
}
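// Hedged sketch of the matching forest prediction, which is not shown in
// this listing: each tree classifies the point and the plurality label
// wins. The Compile name mirrors the Func<float[], float> the trainer UI
// below passes to ConfusionMatrix; the body is an assumption.
public float Compile(float[] parameters)
{
    var votes = new Dictionary<float, int>();
    foreach (var tree in this._Trees)
    {
        float label = tree.Compile(parameters);
        int count;
        if (!votes.TryGetValue(label, out count))
        {
            count = 0;
        }
        votes[label] = count + 1; // one vote per tree
    }
    return votes.ArgMax(); // plurality vote
}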
public void Train(Datas.Useable train)
{
    int rows = train._CountRows;
    int cols = train._CountColumns;
    var weights = new float[rows];
    var predictions = new bool[rows];
    var parameters = new float[cols];

    // Start with uniform example weights.
    for (int r = 0; r < rows; r++)
    {
        weights[r] = 1.0f / rows;
    }

    this._Classifiers = new BoostableClassifier[this._Boosts];
    this._ClassifierWeights = new float[this._Boosts];
    for (int i = 0; i < this._Boosts; i++)
    {
        // Train a weak learner against the current weights.
        var classy = this._Factory();
        classy.Train(train, weights);

        // The weighted error is the total weight of misclassified rows.
        float error = 0;
        for (int r = 0; r < rows; r++)
        {
            for (int c = 0; c < cols; c++)
            {
                parameters[c] = train._Data[r, c];
            }
            predictions[r] = train._Labels[r] == classy.Predict(parameters);
            if (!predictions[r])
            {
                error += weights[r];
            }
        }
        if (error == 0)
        {
            string err = "AdaBoost error is 0";
            Console.WriteLine(err);
            throw new Exception(err);
        }

        // alpha = 0.5 * ln((1 - error) / error) is this learner's vote weight.
        float alpha = 0.5f * (float)Math.Log((1 - error) / error);

        // Reweight: misclassified rows gain weight, correct rows lose it,
        // then normalize so the weights again sum to 1. Degenerate updates
        // (infinite, NaN, or zero) leave the old weight in place.
        float sum_weights = 0;
        for (int r = 0; r < rows; r++)
        {
            float learning = predictions[r] ? 1 : -1;
            float new_weight = weights[r] * (float)Math.Exp(-alpha * learning);
            if (!float.IsInfinity(new_weight) && !float.IsNaN(new_weight) && (new_weight != 0))
            {
                weights[r] = new_weight;
            }
            sum_weights += weights[r];
        }
        for (int r = 0; r < rows; r++)
        {
            weights[r] /= sum_weights;
        }

        this._ClassifierWeights[i] = alpha;
        this._Classifiers[i] = classy;
    }
}
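// Hedged sketch of the matching AdaBoost prediction, not shown here: each
// weak classifier votes with weight alpha and the heaviest label wins.
// Compile mirrors the delegate passed to ConfusionMatrix below; the body
// is an assumption, not the original.
public float Compile(float[] parameters)
{
    var votes = new Dictionary<float, float>();
    for (int i = 0; i < this._Classifiers.Length; i++)
    {
        float label = this._Classifiers[i].Predict(parameters);
        float current;
        if (!votes.TryGetValue(label, out current))
        {
            current = 0;
        }
        votes[label] = current + this._ClassifierWeights[i]; // alpha-weighted vote
    }
    return votes.ArgMax();
}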
public void Train(Datas.Useable train)
{
    this.Train(train, 0);
}
private void bwLoadData_DoWork(object sender, DoWorkEventArgs e)
{
    try
    {
        Classifier classif = null;
        Datas.Useable train = null;
        Datas.Useable test = null;
        ConfusionMatrix conf_train, conf_test;

        if (e.Argument is ToBackgroundWorkerArgsTree)
        {
            var args = e.Argument as ToBackgroundWorkerArgsTree;
            train = args._Train;
            test = args._Test;
            classif = new DecisionTree(args._MaxDepth);
            classif.Train(train);
            conf_train = new ConfusionMatrix(classif.Compile, train);
            conf_test = new ConfusionMatrix(classif.Compile, test);
        }
        else if (e.Argument is ToBackgroundWorkerArgsForest)
        {
            var args = e.Argument as ToBackgroundWorkerArgsForest;
            train = args._Train;
            test = args._Test;
            classif = new RandomForest(args._MaxDepth, args._TreeCount);
            classif.Train(train);
            conf_train = new ConfusionMatrix(classif.Compile, train);
            conf_test = new ConfusionMatrix(classif.Compile, test);
        }
        else if (e.Argument is ToBackgroundWorkerArgsAdaBoost)
        {
            var args = e.Argument as ToBackgroundWorkerArgsAdaBoost;
            train = args._Train;
            test = args._Test;
            classif = new AdaBoost(args._Factory, args._Boosts);
            classif.Train(train);
            conf_train = new ConfusionMatrix(classif.Compile, train);
            conf_test = new ConfusionMatrix(classif.Compile, test);
        }
        else if (e.Argument is ToBackgroundWorkerkNN)
        {
            var args = e.Argument as ToBackgroundWorkerkNN;
            train = args._Train;
            test = args._Test;
            classif = new kNN(args._kNN);
            classif.Train(train);
            // kNN gets no training-set confusion matrix.
            conf_train = null;
            conf_test = new ConfusionMatrix(classif.Compile, test);
        }
        else
        {
            throw new Exception("Unrecognized background worker argument type");
        }

        if (this.bwLoadData.CancellationPending)
        {
            e.Result = null;
        }
        else
        {
            e.Result = new TrainerReturn(conf_train, conf_test, classif);
        }
    }
    catch (Exception exc)
    {
        e.Result = exc.ToString();
    }
}
public void Train(Datas.Useable train, int current_depth)
{
    var branch_score = train.getLabelCounts();
    int max_correct_branch = branch_score.Values.Max();

    if ((branch_score.Values.Sum() != max_correct_branch) && // More than one label present.
        (this._MaxDepth != current_depth))                   // Depth limit not yet reached.
    {
        var tups = new Tuple<float, float>[train._Labels.Count];
        int cols = train._CountColumns;
        int rows = train._CountRows;

        double best_entropy = double.MaxValue;
        int best_column = -1;
        float best_split = -1;

        for (int c = 0; c < cols; c++)
        {
            // Sort (label, value) pairs by this column's value.
            for (int r = 0; r < rows; r++)
            {
                tups[r] = new Tuple<float, float>(train._Labels[r], train._Data[r, c]);
            }
            Array.Sort(tups, (a, b) => a.Item2.CompareTo(b.Item2));

            // Start with everything on the "more" side, then sweep the split
            // point left to right, moving one row across per step.
            var branch_less_data = new Dictionary<float, int>();
            var branch_more_data = new Dictionary<float, int>();
            foreach (var kvp in branch_score)
            {
                branch_less_data[kvp.Key] = 0;
                branch_more_data[kvp.Key] = kvp.Value;
            }

            for (int split_point = 0; split_point < rows - 1; split_point++)
            {
                var tup = tups[split_point];
                float this_label = tup.Item1;
                float this_value = tup.Item2;
                branch_less_data[this_label]++;
                branch_more_data[this_label]--;

                // Skip identical values: the midpoint would not separate them.
                float next_value = tups[split_point + 1].Item2;
                float split_value = (this_value + next_value) / 2;
                if ((this_value < split_value) == (next_value < split_value))
                {
                    continue;
                }

                // Score the split by the weighted entropy of the two children.
                double p_less = (split_point + 1.0) / rows;
                double p_more = 1 - p_less;
                double entropy = p_less * branch_less_data.Values.Entropy() +
                                 p_more * branch_more_data.Values.Entropy();
                if (entropy < best_entropy)
                {
                    best_entropy = entropy;
                    best_split = split_value;
                    best_column = c;
                }
            }
        }

        if (best_column != -1)
        {
            this._BranchSplitValue = best_split;
            this._BranchColumn = best_column;
            Datas.Useable less, more;
            train.Split(this._BranchColumn, this._BranchSplitValue, out less, out more);
            this._BranchLess = new DecisionTree(this._MaxDepth);
            this._BranchMore = new DecisionTree(this._MaxDepth);
            this._BranchLess.Train(less, current_depth + 1);
            this._BranchMore.Train(more, current_depth + 1);
            return;
        }
    }

    // No usable split: become a leaf with the majority label.
    this._LeafClassification = branch_score.ArgMax();
    this._IsLeaf = true;
}
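// Hedged sketch: the Entropy() extension used above is not in this
// listing. Presumably it computes the Shannon entropy of a collection of
// class counts (or class weights, for the boosted tree below); both
// overloads here are assumptions. Requires using System.Linq.
public static double Entropy(this IEnumerable<int> counts)
{
    return counts.Select(c => (float)c).Entropy();
}

public static double Entropy(this IEnumerable<float> counts)
{
    double total = counts.Sum();
    double entropy = 0;
    foreach (var count in counts)
    {
        if (count <= 0)
        {
            continue; // p * log(p) tends to 0 as p -> 0
        }
        double p = count / total;
        entropy -= p * Math.Log(p, 2); // Shannon entropy in bits
    }
    return entropy;
}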
public void Train(Datas.Useable train)
{
    var branch_score = train.getLabelCounts();
    int max_correct_branch = branch_score.Values.Max();

    if (branch_score.Values.Sum() != max_correct_branch) // More than one label present.
    {
        var tups = new Tuple<float, float>[train._Labels.Count];
        int cols = train._CountColumns;
        int rows = train._CountRows;
        int best_correct = int.MinValue;

        for (int c = 0; c < cols; c++)
        {
            // Sort (label, value) pairs by this column's value.
            for (int r = 0; r < rows; r++)
            {
                tups[r] = new Tuple<float, float>(train._Labels[r], train._Data[r, c]);
            }
            Array.Sort(tups, (a, b) => a.Item2.CompareTo(b.Item2));

            var branch_less_data = new Dictionary<float, int>();
            var branch_more_data = new Dictionary<float, int>();
            foreach (var kvp in branch_score)
            {
                branch_less_data[kvp.Key] = 0;
                branch_more_data[kvp.Key] = kvp.Value;
            }

            for (int split_point = 0; split_point < rows - 1; split_point++)
            {
                var tup = tups[split_point];
                float this_label = tup.Item1;
                float this_value = tup.Item2;
                branch_less_data[this_label]++;
                branch_more_data[this_label]--;

                // Skip identical values: the midpoint would not separate them.
                float next_value = tups[split_point + 1].Item2;
                float split_value = (this_value + next_value) / 2;
                if ((this_value < split_value) == (next_value < split_value))
                {
                    continue;
                }

                // Score the split by how many rows it classifies correctly
                // when each side predicts its own majority label.
                int correct = branch_less_data.Values.Max() + branch_more_data.Values.Max();
                if (correct > best_correct)
                {
                    best_correct = correct;
                    this._BranchSplitValue = split_value;
                    this._BranchColumn = c;
                    this._BranchLessClassification = branch_less_data.ArgMax();
                    this._BranchMoreClassification = branch_more_data.ArgMax();
                }
            }
        }

        // A split was found: this stump is a branch, not a leaf.
        if (best_correct != int.MinValue)
        {
            return;
        }
    }

    this._LeafClassification = branch_score.ArgMax();
    this._IsLeaf = true;
}
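// Hedged sketch: the ArgMax() dictionary extension used by every trainer
// in this listing is not shown. The assumed behavior is "return the key
// with the largest value"; this implementation is illustrative only.
public static TKey ArgMax<TKey, TValue>(this Dictionary<TKey, TValue> dict)
    where TValue : IComparable<TValue>
{
    TKey best_key = default(TKey);
    TValue best_value = default(TValue);
    bool first = true;
    foreach (var kvp in dict)
    {
        if (first || kvp.Value.CompareTo(best_value) > 0)
        {
            best_key = kvp.Key;
            best_value = kvp.Value;
            first = false;
        }
    }
    return best_key;
}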
public ToBackgroundWorkerArgs(Datas.Useable data, float percent_test)
{
    this._Data = data;
    this._PercentTest = percent_test;
}
public void Train(Datas.Useable data, float[] weights)
{
    // Accumulate the total weight per label instead of raw counts.
    var branch_score = new Dictionary<float, float>();
    for (int i = 0; i < weights.Length; i++)
    {
        float f = data._Labels[i];
        float count;
        if (!branch_score.TryGetValue(f, out count))
        {
            count = 0;
        }
        branch_score[f] = count + weights[i];
    }

    float max_correct_branch = branch_score.Values.Max();
    if (branch_score.Values.Sum() != max_correct_branch) // More than one label present.
    {
        int cols = data._CountColumns;
        int rows = data._CountRows;
        var tups = new Tuple<float, float, float>[rows];
        float max_correct = max_correct_branch;

        for (int c = 0; c < cols; c++)
        {
            // Sort (label, value, weight) triples by this column's value.
            for (int r = 0; r < rows; r++)
            {
                tups[r] = new Tuple<float, float, float>(
                    data._Labels[r], data._Data[r, c], weights[r]);
            }
            Array.Sort(tups, (a, b) => a.Item2.CompareTo(b.Item2));

            var branch_less_data = new Dictionary<float, float>();
            var branch_more_data = new Dictionary<float, float>();
            foreach (var kvp in branch_score)
            {
                branch_less_data[kvp.Key] = 0;
                branch_more_data[kvp.Key] = kvp.Value;
            }

            for (int split_point = 0; split_point < rows - 1; split_point++)
            {
                var tup = tups[split_point];
                float this_label = tup.Item1;
                float this_value = tup.Item2;
                branch_less_data[this_label] += tup.Item3;
                branch_more_data[this_label] -= tup.Item3;

                // Skip identical values: the midpoint would not separate them.
                float next_value = tups[split_point + 1].Item2;
                float split_value = (this_value + next_value) / 2;
                if ((this_value < split_value) == (next_value < split_value))
                {
                    continue;
                }

                // Weighted accuracy if each side predicts its majority label.
                float correct = branch_less_data.Values.Max() + branch_more_data.Values.Max();
                if (correct > max_correct)
                {
                    max_correct = correct;
                    this._BranchSplitValue = split_value;
                    this._BranchColumn = c;
                    this._BranchLess = branch_less_data.ArgMax();
                    this._BranchMore = branch_more_data.ArgMax();
                }
            }
        }

        // A better option exists, so split the branch.
        if (max_correct != max_correct_branch)
        {
            return;
        }
    }

    this._LeafClassification = branch_score.ArgMax();
    this._IsLeaf = true;
}
private void bwLoadData_DoWork(object sender, DoWorkEventArgs e)
{
    var args = e.Argument as ToBackgroundWorkerArgs;
    var ret = new Datas.Useable[args._Data.Length];
    int ret_dex = 0;

    // Count the columns marked for pass-through.
    int new_columns = 0;
    foreach (var b in args._PassThrough)
    {
        if (b)
        {
            new_columns++;
        }
    }

    // Map each kept column index back to its original column index.
    int new_dex = 0;
    int old_dex = 0;
    var new_columns_indices = new int[new_columns];
    foreach (var b in args._PassThrough)
    {
        if (b)
        {
            new_columns_indices[new_dex++] = old_dex;
        }
        old_dex++;
    }

    // Copy the kept columns into dense matrices; the label column becomes
    // the label vector.
    foreach (var di in args._Data)
    {
        int rows = di._Rows;
        var data = Matrix<float>.Build.Dense(rows, new_columns, 0);
        var data_labels = Vector<float>.Build.Dense(rows);
        for (int r = 0; r < rows; r++)
        {
            var row = di._DataPoints[r];
            for (int c = 0; c < new_columns; c++)
            {
                data[r, c] = row[new_columns_indices[c]];
            }
            data_labels[r] = row[args._LabelIndex];
        }
        ret[ret_dex++] = new Datas.Useable(data, data_labels);
    }

    if (this.bwLoadData.CancellationPending)
    {
        e.Result = null;
    }
    else
    {
        e.Result = ret;
    }
}
public void Train(Datas.Useable data, float[] weights)
{
    // Accumulate the total weight per label instead of raw counts.
    var branch_score = new Dictionary<float, float>();
    for (int i = 0; i < weights.Length; i++)
    {
        float f = data._Labels[i];
        float count;
        if (!branch_score.TryGetValue(f, out count))
        {
            count = 0;
        }
        branch_score[f] = count + weights[i];
    }

    float max_correct_branch = branch_score.Values.Max();
    if ((branch_score.Values.Sum() != max_correct_branch) && // More than one label present.
        (this._Depth != this._MaxDepth))                     // Depth limit not yet reached.
    {
        int cols = data._CountColumns;
        int rows = data._CountRows;
        var tups = new Tuple<float, float, float>[rows];

        double best_entropy = double.MaxValue;
        int best_column = -1;
        float best_split = -1;

        for (int c = 0; c < cols; c++)
        {
            // Sort (label, value, weight) triples by this column's value.
            for (int r = 0; r < rows; r++)
            {
                tups[r] = new Tuple<float, float, float>(
                    data._Labels[r], data._Data[r, c], weights[r]);
            }
            Array.Sort(tups, (a, b) => a.Item2.CompareTo(b.Item2));

            var branch_less_data = new Dictionary<float, float>();
            var branch_more_data = new Dictionary<float, float>();
            foreach (var kvp in branch_score)
            {
                branch_less_data[kvp.Key] = 0;
                branch_more_data[kvp.Key] = kvp.Value;
            }

            for (int split_point = 0; split_point < rows - 1; split_point++)
            {
                var tup = tups[split_point];
                float this_label = tup.Item1;
                float this_value = tup.Item2;
                branch_less_data[this_label] += tup.Item3;
                branch_more_data[this_label] -= tup.Item3;

                // Skip identical values: the midpoint would not separate them.
                float next_value = tups[split_point + 1].Item2;
                float split_value = (this_value + next_value) / 2;
                if ((this_value < split_value) == (next_value < split_value))
                {
                    continue;
                }

                // Score the split by the weighted entropy of the two children.
                double p_less = (split_point + 1.0) / rows;
                double p_more = 1 - p_less;
                double entropy = p_less * branch_less_data.Values.Entropy() +
                                 p_more * branch_more_data.Values.Entropy();
                if (entropy < best_entropy)
                {
                    best_entropy = entropy;
                    best_split = split_value;
                    best_column = c;
                }
            }
        }

        // A split was found, so split the branch.
        if (best_entropy != double.MaxValue)
        {
            this._BranchSplitValue = best_split;
            this._BranchColumn = best_column;

            // Re-tally the weighted label counts on each side of the split.
            int count_less = 0;
            int count_more = 0;
            var branch_less_data = new Dictionary<float, float>();
            var branch_more_data = new Dictionary<float, float>();
            for (int r = 0; r < rows; r++)
            {
                float f = data._Labels[r];
                float count;
                if (data._Data[r, this._BranchColumn] < this._BranchSplitValue)
                {
                    count_less++;
                    if (!branch_less_data.TryGetValue(f, out count))
                    {
                        count = 0;
                    }
                    branch_less_data[f] = count + weights[r];
                }
                else
                {
                    count_more++;
                    if (!branch_more_data.TryGetValue(f, out count))
                    {
                        count = 0;
                    }
                    branch_more_data[f] = count + weights[r];
                }
            }

            // Only recurse on the less side if we need to: at the depth limit,
            // or when that side is already effectively pure, plant a leaf.
            if ((this._Depth == this._MaxDepth - 1) ||
                (branch_less_data.Values.NumberOfNonZeros() == branch_less_data.Count - 1))
            {
                this._BranchLess = new DecisionTree(branch_less_data.ArgMax());
            }
            else
            {
                var labels_less = Vector<float>.Build.Dense(count_less);
                var weights_less = new float[count_less];
                var data_less = Matrix<float>.Build.Dense(count_less, cols);
                int dex_less = 0;
                for (int r = 0; r < rows; r++)
                {
                    if (data._Data[r, this._BranchColumn] < this._BranchSplitValue)
                    {
                        data_less.SetRow(dex_less, data._Data.Row(r));
                        labels_less[dex_less] = data._Labels[r];
                        weights_less[dex_less] = weights[r];
                        dex_less++;
                    }
                }
                var less = new Datas.Useable(data_less, labels_less);
                this._BranchLess = new DecisionTree(this._Depth + 1, this._MaxDepth);
                this._BranchLess.Train(less, weights_less);
            }

            // Likewise, only recurse on the more side if we need to.
            if ((this._Depth == this._MaxDepth - 1) ||
                (branch_more_data.Values.NumberOfNonZeros() == branch_more_data.Count - 1))
            {
                this._BranchMore = new DecisionTree(branch_more_data.ArgMax());
            }
            else
            {
                var labels_more = Vector<float>.Build.Dense(count_more);
                var weights_more = new float[count_more];
                var data_more = Matrix<float>.Build.Dense(count_more, cols);
                int dex_more = 0;
                for (int r = 0; r < rows; r++)
                {
                    if (data._Data[r, this._BranchColumn] >= this._BranchSplitValue)
                    {
                        data_more.SetRow(dex_more, data._Data.Row(r));
                        labels_more[dex_more] = data._Labels[r];
                        weights_more[dex_more] = weights[r];
                        dex_more++;
                    }
                }
                var more = new Datas.Useable(data_more, labels_more);
                this._BranchMore = new DecisionTree(this._Depth + 1, this._MaxDepth);
                this._BranchMore.Train(more, weights_more);
            }
            return;
        }
    }

    this._LeafClassification = branch_score.ArgMax();
    this._IsLeaf = true;
}