/// <summary>
/// Clusters the rows of <paramref name="features"/> using the algorithm named by
/// <c>_algorithm</c>: <c>"k"</c> (k-means), or <c>"single"</c>, <c>"complete"</c>,
/// <c>"average"</c> (hierarchical agglomerative clustering with the matching linkage).
/// When <c>_outputFile</c> is set, progress and a final cluster report are written to it,
/// and it is closed before this method returns or throws.
/// </summary>
/// <param name="features">Feature matrix; wrapped in a new <c>VMatrix</c> spanning all rows and columns and stored in <c>_features</c>.</param>
/// <param name="labels">Label matrix; stored in <c>_labels</c> only when its <c>Data</c> is non-null.</param>
/// <exception cref="InvalidOperationException"><c>_algorithm</c> is not one of the supported names.</exception>
public override void VTrain(VMatrix features, VMatrix labels)
{
    try
    {
        _features = new VMatrix(features, 0, 0, features.Rows(), features.Cols());
        if (labels.Data != null)
        {
            _labels = new VMatrix(labels, 0, 0, labels.Rows(), labels.Cols());
        }

        _clusters = new List<Cluster>();

        // NOTE(review): only the k-means branch finishes this console line; the HAC
        // branches write their algorithm name to _outputFile instead. Kept as-is
        // because callers may depend on the current console/file output.
        Console.Write("Algorithm: ");

        switch (_algorithm)
        {
            case "k":
                RunKMeans();
                break;

            case "single":
                InitializeHac("single");
                // Pre-checked loop: no merge happens when there are already <= _k clusters.
                for (var iteration = 0; _clusters.Count > _k; iteration++)
                {
                    TrainSingle(iteration);
                }
                break;

            case "complete":
                InitializeHac("complete");
                for (var iteration = 0; _clusters.Count > _k; iteration++)
                {
                    TrainComplete(iteration);
                }
                break;

            case "average":
                InitializeHac("average");
                for (var iteration = 0; _clusters.Count > _k; iteration++)
                {
                    TrainAverage(iteration);
                }
                break;

            default:
                throw new InvalidOperationException("Invalid Algorithm - " + _algorithm);
        }

        if (_outputFile != null)
        {
            WriteClusterReport();
        }
    }
    finally
    {
        // Close the report writer on every exit path, including exceptions.
        if (_outputFile != null)
        {
            _outputFile.Close();
        }
    }
}

/// <summary>
/// Runs k-means: creates <c>_k</c> clusters, then repeats <c>TrainK</c> until the
/// sum of squared distances it returns stops changing.
/// </summary>
private void RunKMeans()
{
    Console.WriteLine("k-means (k = " + _k + ")");

    // Rows are not shuffled first: cluster i is constructed with row index i
    // (presumably seeding its centroid from that row - confirm in Cluster).
    for (var k = 0; k < _k; k++)
    {
        var cluster = new Cluster(k, _features, k, _ignore);
        _clusters.Add(cluster);
        if (_outputFile != null)
        {
            cluster.PrintCentroid(_outputFile);
        }
    }

    double lastSsd = double.MinValue;
    while (true)
    {
        var ssd = TrainK();
        if (_outputFile != null)
        {
            _outputFile.WriteLine(string.Format("Sum squared-distance of each row with its centroid={0}", ssd));
        }

        // Exact comparison is intentional: the loop stops when two consecutive
        // passes yield the identical value. Breaking here (before ClearInstances)
        // leaves the final assignments in place for the report.
        if (ssd == lastSsd)
        {
            break;
        }

        lastSsd = ssd;
        if (_outputFile != null)
        {
            _outputFile.WriteLine("Recomputing the centroids of each cluster...");
        }

        foreach (var cluster in _clusters)
        {
            cluster.Recalculate();
            cluster.ClearInstances();
            if (_outputFile != null)
            {
                cluster.PrintCentroid(_outputFile);
            }
        }
    }
}

/// <summary>
/// Shared setup for the HAC variants: writes the header line to <c>_outputFile</c> (if set),
/// creates one cluster per row, and fills the symmetric row-to-row distance matrix
/// <c>_distances</c>.
/// </summary>
/// <param name="linkage">Linkage name used in the header line ("single", "complete" or "average").</param>
private void InitializeHac(string linkage)
{
    if (_outputFile != null)
    {
        _outputFile.WriteLine("HAC " + linkage + " (k = " + _k + ")");
    }

    var rowCount = _features.Rows();

    // One cluster per row, each holding just that row.
    for (var row = 0; row < rowCount; row++)
    {
        var cluster = new Cluster(0, _features, row, _ignore);
        cluster.AddInstance(row);
        _clusters.Add(cluster);
    }

    // Compute only the upper triangle and mirror it; the diagonal keeps the
    // array's default value of 0.
    _distances = new double[rowCount, rowCount];
    for (var row = 0; row < rowCount; row++)
    {
        for (var other = row + 1; other < rowCount; other++)
        {
            double distance = _clusters[row].GetDistance(_features.Row(other));
            _distances[row, other] = distance;
            _distances[other, row] = distance;
        }
    }
}

/// <summary>
/// Writes the final report to <c>_outputFile</c>: cluster numbers, instance counts,
/// one centroid row per non-ignored attribute, per-cluster and total SSE, and the
/// value returned by <c>GetDBI</c>. The caller must ensure <c>_outputFile</c> is non-null.
/// </summary>
private void WriteClusterReport()
{
    _outputFile.WriteLine();
    _outputFile.WriteLine("Cluster centroids:");

    _outputFile.Write("Cluster#\t\t\t");
    for (var c = 0; c < _clusters.Count; c++)
    {
        _outputFile.Write("\t\t" + c);
    }
    _outputFile.WriteLine();

    _outputFile.Write("# of instances:\t\t\t");
    for (var c = 0; c < _clusters.Count; c++)
    {
        _outputFile.Write("\t\t" + _clusters[c].Instances.Count);
    }
    _outputFile.WriteLine();

    _outputFile.WriteLine("==========================================================================================================");

    for (var col = 0; col < _features.Cols(); col++)
    {
        if (_ignore.Contains(col))
        {
            continue;
        }

        _outputFile.Write(_features.AttrName(col));
        foreach (var cluster in _clusters)
        {
            if (cluster.Centroid[col] == Matrix.MISSING)
            {
                _outputFile.Write("\t?");
            }
            else if (_features.ValueCount(col) < 2)
            {
                // continuous attribute: print the numeric centroid value
                _outputFile.Write(string.Format("\t{0:0.#####}", cluster.Centroid[col]));
            }
            else
            {
                // otherwise the centroid value is used as an index into the attribute's value names
                _outputFile.Write("\t" + _features.AttrValue(col, (int)cluster.Centroid[col]));
            }
        }
        _outputFile.WriteLine();
    }

    double sse = 0;
    _outputFile.Write("Sum squared error:\t");
    foreach (var cluster in _clusters)
    {
        var error = cluster.GetSSE();
        sse += error;
        _outputFile.Write(string.Format("\t{0:0.#####}", error));
    }
    _outputFile.WriteLine();

    _outputFile.WriteLine("Number of clusters: " + _clusters.Count);
    _outputFile.WriteLine(string.Format("Total sum squared error: {0:0.#####}", sse));
    _outputFile.WriteLine(string.Format("DBI: {0}", GetDBI()));
}