/// <summary>
/// Clusters a copy of <paramref name="features"/> with the algorithm named by <c>_algorithm</c>
/// ("k" for k-means; "single", "complete" or "average" for HAC) and, when <c>_outputFile</c>
/// is set, writes a report describing the resulting clusters to it.
/// </summary>
/// <param name="features">Rows to cluster; copied in full into <c>_features</c>.</param>
/// <param name="labels">Copied in full into <c>_labels</c> when its <c>Data</c> is non-null.</param>
/// <exception cref="Exception">Thrown when <c>_algorithm</c> is not one of the supported names.</exception>
/// <remarks>
/// <c>_outputFile</c> (when set) is closed before this method returns, including when an
/// exception propagates out of training.
/// </remarks>
public override void VTrain(VMatrix features, VMatrix labels)
{
    try
    {
        _features = new VMatrix(features, 0, 0, features.Rows(), features.Cols());
        if (labels.Data != null)
        {
            _labels = new VMatrix(labels, 0, 0, labels.Rows(), labels.Cols());
        }

        _clusters = new List<Cluster>();

        // NOTE(review): only the k-means path finishes this console line; the HAC paths
        // write their header to _outputFile instead. Left as-is pending confirmation.
        Console.Write("Algorithm: ");
        if (_algorithm == "k")
        {
            TrainKMeansClusters();
        }
        else if (_algorithm == "single" || _algorithm == "complete" || _algorithm == "average")
        {
            TrainHacClusters();
        }
        else
        {
            throw new Exception("Invalid Algorithm - " + _algorithm);
        }

        if (_outputFile != null)
        {
            WriteClusterReport();
        }
    }
    finally
    {
        // Close the report even when training throws, so the file handle is not leaked.
        if (_outputFile != null)
        {
            _outputFile.Close();
        }
    }
}

// Runs k-means: creates _k clusters, then alternates TrainK() with centroid recalculation
// until the value returned by TrainK() is the same on two consecutive passes.
private void TrainKMeansClusters()
{
    Console.WriteLine("k-means (k = " + _k + ")");

    // Features.Shuffle(Rand, Labels);

    // Create the initial clusters; cluster k is constructed from row index k
    // (presumably that row seeds its centroid -- confirm against Cluster's constructor).
    for (var k = 0; k < _k; k++)
    {
        var cluster = new Cluster(k, _features, k, _ignore);
        _clusters.Add(cluster);
        if (_outputFile != null)
        {
            cluster.PrintCentroid(_outputFile);
        }
    }

    double lastSsd = double.MinValue;
    for (;;)
    {
        var ssd = TrainK();
        if (_outputFile != null)
        {
            _outputFile.WriteLine(string.Format("Sum squared-distance of each row with its centroid={0}", ssd));
        }

        // An unchanged value is the stopping criterion. The clusters keep the instances
        // from this final pass because ClearInstances() is only called when continuing.
        if (ssd == lastSsd)
        {
            break;
        }

        lastSsd = ssd;
        if (_outputFile != null)
        {
            _outputFile.WriteLine("Recomputing the centroids of each cluster...");
        }

        foreach (var cluster in _clusters)
        {
            cluster.Recalculate();
            cluster.ClearInstances();
            if (_outputFile != null)
            {
                cluster.PrintCentroid(_outputFile);
            }
        }
    }
}

// Runs HAC for the linkage named by _algorithm ("single", "complete" or "average"):
// starts with one cluster per row and calls the matching Train* step until at most
// _k clusters remain.
private void TrainHacClusters()
{
    if (_outputFile != null)
    {
        // _algorithm is exactly the linkage name here, so this yields e.g. "HAC single (k = 3)".
        _outputFile.WriteLine("HAC " + _algorithm + " (k = " + _k + ")");
    }

    var rowCount = _features.Rows();

    // Create the initial clusters: one per row, each holding only that row.
    for (var row = 0; row < rowCount; row++)
    {
        var cluster = new Cluster(0, _features, row, _ignore);
        cluster.AddInstance(row);
        _clusters.Add(cluster);
    }

    // Create the symmetric row-to-row distance matrix. The diagonal stays at the
    // array's default of 0, so only the upper triangle is computed and mirrored.
    _distances = new double[rowCount, rowCount];
    for (var row = 0; row < rowCount; row++)
    {
        for (var row2 = row + 1; row2 < rowCount; row2++)
        {
            double distance = _clusters[row].GetDistance(_features.Row(row2));
            _distances[row, row2] = distance;
            _distances[row2, row] = distance;
        }
    }

    // Check the cluster count BEFORE each step: when there are already no more than
    // _k clusters (e.g. rows <= _k, or no rows at all) no step must run.
    var iteration = 0;
    while (_clusters.Count > _k)
    {
        if (_algorithm == "single")
        {
            TrainSingle(iteration);
        }
        else if (_algorithm == "complete")
        {
            TrainComplete(iteration);
        }
        else
        {
            TrainAverage(iteration);
        }

        iteration++;
    }
}

// Writes the final report to _outputFile: per-cluster instance counts, centroid values
// for every non-ignored column, per-cluster and total SSE, and the value of GetDBI().
// Callers must ensure _outputFile is non-null.
private void WriteClusterReport()
{
    _outputFile.WriteLine();
    _outputFile.WriteLine("Cluster centroids:");

    _outputFile.Write("Cluster#\t\t\t");
    for (var c = 0; c < _clusters.Count; c++)
    {
        _outputFile.Write("\t\t" + c);
    }
    _outputFile.WriteLine();

    _outputFile.Write("# of instances:\t\t\t");
    for (var c = 0; c < _clusters.Count; c++)
    {
        _outputFile.Write("\t\t" + _clusters[c].Instances.Count);
    }
    _outputFile.WriteLine();

    _outputFile.WriteLine("==========================================================================================================");
    for (var col = 0; col < _features.Cols(); col++)
    {
        if (_ignore.Contains(col))
        {
            continue;
        }

        _outputFile.Write(_features.AttrName(col));
        foreach (var cluster in _clusters)
        {
            if (cluster.Centroid[col] == Matrix.MISSING)
            {
                _outputFile.Write("\t?");
            }
            else if (_features.ValueCount(col) < 2)
            {
                // continuous
                _outputFile.Write(string.Format("\t{0:0.#####}", cluster.Centroid[col]));
            }
            else
            {
                // Nominal column: the centroid stores a value index, printed by name.
                _outputFile.Write("\t" + _features.AttrValue(col, (int)cluster.Centroid[col]));
            }
        }
        _outputFile.WriteLine();
    }

    double sse = 0;
    _outputFile.Write("Sum squared error:\t");
    foreach (var cluster in _clusters)
    {
        var error = cluster.GetSSE();
        sse += error;
        _outputFile.Write(string.Format("\t{0:0.#####}", error));
    }
    _outputFile.WriteLine();

    _outputFile.WriteLine("Number of clusters: " + _clusters.Count);
    _outputFile.WriteLine(string.Format("Total sum squared error: {0:0.#####}", sse));
    _outputFile.WriteLine(string.Format("DBI: {0}", GetDBI()));
}
/// <summary>
/// Evaluates this learner on the given rows by calling <c>Predict</c> once per row.
/// For a continuous label (<c>labels.ValueCount(0) == 0</c>) the result is the root mean
/// squared error; otherwise it is the fraction of scored rows predicted correctly.
/// </summary>
/// <param name="features">Feature rows; must have the same row count as <paramref name="labels"/>.</param>
/// <param name="labels">Single-column label matrix.</param>
/// <param name="confusion">
/// Optional. For nominal labels it is resized to labelValues x labelValues and filled with
/// (target, prediction) counts; pass <c>null</c> to skip. Not touched for continuous labels.
/// </param>
/// <returns>RMSE for continuous labels, predictive accuracy for nominal labels.</returns>
/// <exception cref="Exception">
/// Thrown when the row counts differ, the labels have more than one column, there are no
/// rows, or a nominal label value is not below the label's value count.
/// </exception>
/// <remarks>
/// When this instance is a <c>BPTT</c>, the first <c>m_k - 1</c> rows are excluded from
/// scoring (rows with a label are still passed to <c>Predict</c>). Nominal rows whose label
/// is <c>Matrix.MISSING</c> are neither predicted nor scored.
/// </remarks>
public double VMeasureAccuracy(VMatrix features, VMatrix labels, Matrix confusion)
{
    if (features.Rows() != labels.Rows())
    {
        throw new Exception("Expected the features and labels to have the same number of rows");
    }
    if (labels.Cols() != 1)
    {
        throw new Exception("Sorry, this method currently only supports one-dimensional labels");
    }
    if (features.Rows() == 0)
    {
        throw new Exception("Expected at least one row");
    }

    // In verbose mode, remember the cursor column so the row counter overwrites itself.
    var cl = 0;
    if (Parameters.Verbose)
    {
        Console.Write("VMeasureAccuracy ");
        cl = Console.CursorLeft;
    }

    // count is the number of rows that contribute to the score (the denominator).
    var count = features.Rows();
    var begRow = 0;
    var learner = this as BPTT;
    if (learner != null)
    {
        begRow = learner.m_k - 1;
        count -= begRow;
    }

    var labelValues = labels.ValueCount(0);
    if (labelValues == 0) // If the label is continuous...
    {
        // The label is continuous, so measure root mean squared error
        var pred = new double[1];
        var sse = 0.0;
        for (var i = 0; i < features.Rows(); i++)
        {
            if (Parameters.Verbose)
            {
                Console.SetCursorPosition(cl, Console.CursorTop);
                Console.Write(i);
            }

            var feat = features.Row(i);
            var targ = labels.Row(i);
            pred[0] = 0.0; // make sure the prediction is not biased by a previous prediction
            Predict(feat, pred);
            if (i >= begRow)
            {
                var delta = targ[0] - pred[0];
                sse += delta * delta;
            }
        }

        if (Parameters.Verbose)
        {
            Console.WriteLine();
        }

        return Math.Sqrt(sse / count);
    }

    // The label is nominal, so measure predictive accuracy
    if (confusion != null)
    {
        confusion.SetSize(labelValues, labelValues);
        for (var i = 0; i < labelValues; i++)
        {
            confusion.SetAttrName(i, labels.AttrValue(0, i));
        }
    }

    var correctCount = 0;
    var prediction = new double[1];
    for (var i = 0; i < features.Rows(); i++)
    {
        if (Parameters.Verbose)
        {
            Console.SetCursorPosition(cl, Console.CursorTop);
            Console.Write(i);
        }

        var feat = features.Row(i);
        var lab = labels.Get(i, 0);
        if (lab == Matrix.MISSING)
        {
            // Rows before begRow were already removed from count above, so only a
            // scored row with a missing label may shrink the denominator further.
            if (i >= begRow)
            {
                count--;
            }
            continue;
        }

        var targ = (int)lab;
        if (targ >= labelValues)
        {
            throw new Exception("The label is out of range");
        }

        Predict(feat, prediction);
        if (i >= begRow)
        {
            var pred = (int)prediction[0];
            if (confusion != null)
            {
                confusion.Set(targ, pred, confusion.Get(targ, pred) + 1);
            }
            if (pred == targ)
            {
                correctCount++;
            }
        }
    }

    if (Parameters.Verbose)
    {
        Console.WriteLine();
    }

    return (double)correctCount / count;
}