예제 #1
0
        /// <summary>
        /// Clusters the supplied data with the algorithm selected by <c>_algorithm</c>
        /// ("k", "single", "complete" or "average") and, when <c>_outputFile</c> is set,
        /// writes a report of the resulting clusters to it.
        /// </summary>
        /// <param name="features">
        /// Feature matrix. A new <c>VMatrix</c> spanning all of its rows and columns is
        /// stored in <c>_features</c>.
        /// </param>
        /// <param name="labels">
        /// Label matrix. A new <c>VMatrix</c> spanning all of its rows and columns is
        /// stored in <c>_labels</c>, but only when <c>labels.Data</c> is not null.
        /// </param>
        /// <exception cref="InvalidOperationException">
        /// <c>_algorithm</c> is not one of the supported algorithm names.
        /// </exception>
        /// <remarks>
        /// <c>_outputFile</c> (when set) is closed before this method returns, whether
        /// training succeeds or throws.
        /// </remarks>
        public override void VTrain(VMatrix features, VMatrix labels)
        {
            try
            {
                _features = new VMatrix(features, 0, 0, features.Rows(), features.Cols());
                if (labels.Data != null)
                {
                    _labels = new VMatrix(labels, 0, 0, labels.Rows(), labels.Cols());
                }
                _clusters = new List<Cluster>();

                // NOTE(review): only the k-means branch finishes this console line; the HAC
                // branches report the algorithm name to the output file instead. Kept as-is
                // because callers may depend on the current output - confirm intent.
                Console.Write("Algorithm: ");

                if (_algorithm == "k")
                {
                    Console.WriteLine("k-means (k = " + _k + ")");
                    RunKMeans();
                }
                else if (_algorithm == "single")
                {
                    PrepareHac("HAC single");

                    // Merge only while there are more clusters than requested, so an input
                    // that already has <= _k rows is left untouched.
                    for (var iteration = 0; _clusters.Count > _k; iteration++)
                    {
                        TrainSingle(iteration);
                    }
                }
                else if (_algorithm == "complete")
                {
                    PrepareHac("HAC complete");

                    for (var iteration = 0; _clusters.Count > _k; iteration++)
                    {
                        TrainComplete(iteration);
                    }
                }
                else if (_algorithm == "average")
                {
                    PrepareHac("HAC average");

                    for (var iteration = 0; _clusters.Count > _k; iteration++)
                    {
                        TrainAverage(iteration);
                    }
                }
                else
                {
                    throw new InvalidOperationException("Inavlid Algorithm - " + _algorithm);
                }

                if (_outputFile != null)
                {
                    WriteClusterReport();
                }
            }
            finally
            {
                // Release the output file even when training throws.
                if (_outputFile != null)
                {
                    _outputFile.Close();
                }
            }
        }

        /// <summary>
        /// Runs k-means: seeds <c>_k</c> clusters, then alternates <c>TrainK()</c> with
        /// centroid recalculation until two consecutive passes return the same value.
        /// </summary>
        private void RunKMeans()
        {
            // Create the initial clusters; cluster k is constructed with row index k.
            for (var k = 0; k < _k; k++)
            {
                var cluster = new Cluster(k, _features, k, _ignore);
                _clusters.Add(cluster);
                if (_outputFile != null)
                {
                    cluster.PrintCentroid(_outputFile);
                }
            }

            double lastSsd = double.MinValue;

            for (;;)
            {
                var ssd = TrainK();
                if (_outputFile != null)
                {
                    _outputFile.WriteLine(string.Format("Sum squared-distance of each row with its centroid={0}", ssd));
                }

                // Exact comparison is intentional: the loop stops once a pass reproduces
                // the previous pass's value exactly.
                if (ssd == lastSsd)
                {
                    break;
                }

                lastSsd = ssd;
                if (_outputFile != null)
                {
                    _outputFile.WriteLine("Recomputing the centroids of each cluster...");
                }

                foreach (var cluster in _clusters)
                {
                    cluster.Recalculate();
                    cluster.ClearInstances();
                    if (_outputFile != null)
                    {
                        cluster.PrintCentroid(_outputFile);
                    }
                }
            }
        }

        /// <summary>
        /// Shared setup for the HAC variants: writes the header line, creates one
        /// cluster per row of <c>_features</c>, and fills the symmetric row-to-row
        /// distance matrix <c>_distances</c>.
        /// </summary>
        /// <param name="algorithmLabel">Text written before " (k = ...)" in the header line.</param>
        private void PrepareHac(string algorithmLabel)
        {
            if (_outputFile != null)
            {
                _outputFile.WriteLine(algorithmLabel + " (k = " + _k + ")");
            }

            var rowCount = _features.Rows();

            // Create the initial clusters: one per row, each holding just that row.
            for (var row = 0; row < rowCount; row++)
            {
                var cluster = new Cluster(0, _features, row, _ignore);
                cluster.AddInstance(row);
                _clusters.Add(cluster);
            }

            // Create the distance matrix. The diagonal keeps the array's default of 0;
            // each off-diagonal distance is computed once and mirrored.
            _distances = new double[rowCount, rowCount];

            for (var row = 0; row < rowCount; row++)
            {
                for (var row2 = row + 1; row2 < rowCount; row2++)
                {
                    double distance = _clusters[row].GetDistance(_features.Row(row2));
                    _distances[row, row2] = distance;
                    _distances[row2, row] = distance;
                }
            }
        }

        /// <summary>
        /// Writes the final report to <c>_outputFile</c>: cluster indices, instance
        /// counts, one centroid row per non-ignored column, per-cluster and total
        /// sum squared error, and the value of <c>GetDBI()</c>.
        /// Callers must ensure <c>_outputFile</c> is not null.
        /// </summary>
        private void WriteClusterReport()
        {
            _outputFile.WriteLine();
            _outputFile.WriteLine("Cluster centroids:");

            _outputFile.Write("Cluster#\t\t\t");
            for (var c = 0; c < _clusters.Count; c++)
            {
                _outputFile.Write("\t\t" + c);
            }
            _outputFile.WriteLine();

            _outputFile.Write("# of instances:\t\t\t");
            for (var c = 0; c < _clusters.Count; c++)
            {
                _outputFile.Write("\t\t" + _clusters[c].Instances.Count);
            }
            _outputFile.WriteLine();

            _outputFile.WriteLine("==========================================================================================================");
            for (var col = 0; col < _features.Cols(); col++)
            {
                if (_ignore.Contains(col))
                {
                    continue;
                }

                _outputFile.Write(_features.AttrName(col));
                foreach (var cluster in _clusters)
                {
                    if (cluster.Centroid[col] == Matrix.MISSING)
                    {
                        _outputFile.Write("\t?");
                    }
                    else if (_features.ValueCount(col) < 2)
                    {
                        // continuous
                        _outputFile.Write(string.Format("\t{0:0.#####}", cluster.Centroid[col]));
                    }
                    else
                    {
                        // Otherwise the centroid value is used as an index into the
                        // column's attribute values.
                        _outputFile.Write("\t" + _features.AttrValue(col, (int)cluster.Centroid[col]));
                    }
                }
                _outputFile.WriteLine();
            }

            double sse = 0;
            _outputFile.Write("Sum squared error:\t");
            foreach (var cluster in _clusters)
            {
                var error = cluster.GetSSE();
                sse += error;
                _outputFile.Write(string.Format("\t{0:0.#####}", error));
            }
            _outputFile.WriteLine();

            _outputFile.WriteLine("Number of clusters: " + _clusters.Count);
            _outputFile.WriteLine(string.Format("Total sum squared error: {0:0.#####}", sse));
            _outputFile.WriteLine(string.Format("DBI: {0}", GetDBI()));
        }