/// <summary>
///   Initializes the centroids by trying several random candidate sets and
///   keeping the one with the lowest distortion on a validation batch.
/// </summary>
///
/// <param name="data">The model inputs.</param>
/// <param name="weights">The weight of importance for each input sample.</param>
///
private void InitializeCentroids(double[][] data, double[] weights)
{
    int rows = data.Length;
    int cols = data[0].Length;

    // Hoisted: InitializationBatchSize is a nullable int; read .Value once.
    int batchSize = this.InitializationBatchSize.Value;

    // indices of candidate centroids are placed in centroidIndices
    int[] centroidIndices = MakeBatch(rows, batchSize);

    // indices of the points in the validation batch are placed in validationIndices
    int[] validationIndices = MakeBatch(rows, batchSize);

    // the actual candidate centroids
    double[][] centroidBatch = new double[batchSize][];

    // the actual points in the validation batch
    double[][] validationBatch = new double[batchSize][];

    // the weights of the validation points
    double[] validationWeights = new double[batchSize];

    // Best (lowest-distortion) centroid set found so far. A null reference is
    // used as the "nothing found yet" sentinel instead of comparing a double
    // against -1; the original also pre-allocated this array with size
    // numberOfInitializations, which is unrelated to the number of centroids
    // and would clobber Clusters.Centroids with an empty array when
    // numberOfInitializations == 0.
    double minDistortion = 0;
    double[][] bestCentroids = null;

    for (int i = 0; i < this.numberOfInitializations; i++)
    {
        // Copy the selected points into the candidate and validation batches.
        for (int j = 0; j < batchSize; j++)
        {
            int centroidIndex = centroidIndices[j];
            int validationIndex = validationIndices[j];

            centroidBatch[j] = new double[cols];
            validationBatch[j] = new double[cols];

            data[centroidIndex].CopyTo(centroidBatch[j], 0);
            data[validationIndex].CopyTo(validationBatch[j], 0);

            validationWeights[j] = weights[validationIndex];
        }

        // Computing distortion of the current centroid set.
        Clusters.Randomize(centroidBatch, UseSeeding);
        double distortion = Clusters.Distortion(validationBatch, weights: validationWeights);

        // If this is the very first centroid set, or it is better than the
        // best so far, we remember it.
        if (bestCentroids == null || distortion < minDistortion)
        {
            minDistortion = distortion;
            bestCentroids = Clusters.Centroids;
        }
    }

    // Setting the initial centroids to the best found set. If no
    // initialization round ran, keep the current centroids untouched
    // instead of overwriting them with an empty array.
    if (bestCentroids != null)
        Clusters.Centroids = bestCentroids;
}
/// <summary>
///   Learns a model that can map the given inputs to the desired outputs.
/// </summary>
///
/// <param name="x">The model inputs.</param>
/// <param name="weights">The weight of importance for each input sample.</param>
///
/// <returns>A model that has learned how to produce suitable outputs
///   given the input data <paramref name="x" />.</returns>
///
/// <exception cref="ArgumentNullException">x</exception>
/// <exception cref="ArgumentException">Not enough points. There should be more points than the number K of clusters.</exception>
///
public KModesClusterCollection<T> Learn(T[][] x, double[] weights = null)
{
    // Initial argument checking
    if (x == null)
        throw new ArgumentNullException(nameof(x)); // was "points": wrong parameter name

    if (x.Length < K)
        throw new ArgumentException("Not enough points. There should be more points than the number K of clusters.");

    int k = this.K;
    int rows = x.Length;
    int cols = x[0].Length;

    // Perform a random initialization of the clusters
    // if the algorithm has not been initialized before.
    if (this.Clusters.Centroids[0] == null)
    {
        Clusters.Randomize(x);
    }

    // Initial variables
    int[] labels = new int[rows];
    double[] proportions = Clusters.Proportions;
    T[][] centroids = Clusters.Centroids;
    T[][] newCentroids = new T[k][];
    for (int i = 0; i < newCentroids.Length; i++)
        newCentroids[i] = new T[cols];

    var clusters = new ConcurrentBag<T[]>[k];

    this.Iterations = 0;

    do // Main loop
    {
        // Reset the centroids and the cluster member counters
        for (int i = 0; i < k; i++)
            clusters[i] = new ConcurrentBag<T[]>();

        // First we will accumulate the data points into their nearest
        // clusters, storing this information into the clusters variable.

        // For each point in the data set,
        Parallel.For(0, x.Length, ParallelOptions, i =>
        {
            // Get the point
            T[] point = x[i];

            // Compute the nearest cluster centroid
            int c = labels[i] = Clusters.Decide(x[i]);

            // Accumulate in the corresponding centroid
            clusters[c].Add(point);
        });

        // Next we will compute each cluster's new centroid
        // value by computing the mode in each cluster.
        Parallel.For(0, k, ParallelOptions, i =>
        {
            if (clusters[i].Count == 0)
            {
                // BUG FIX: keep the previous centroid by copying its values.
                // The original assigned the array reference
                // (newCentroids[i] = centroids[i]), which aliased the two
                // buffers; later in-place mode writes would then mutate the
                // old centroids and corrupt the convergence test.
                for (int d = 0; d < cols; d++)
                    newCentroids[i][d] = centroids[i][d];
            }
            else
            {
                T[][] p = Matrix.Transpose<ConcurrentBag<T[]>, T>(clusters[i]);

                // For each dimension
                for (int d = 0; d < this.Dimension; d++)
                    newCentroids[i][d] = p[d].Mode(alreadySorted: false, inPlace: true);
            }
        });

        // The algorithm stops when there is no further change in the
        // centroids (relative difference is less than the threshold).
        if (converged(centroids, newCentroids))
            break;

        // go to next generation. BUG FIX: copy element values rather than
        // array references so centroids and newCentroids keep distinct
        // buffers; reference assignment made the next generation's writes
        // mutate centroids in place, so converged() compared an array with
        // itself and always reported convergence on the following pass.
        for (int i = 0; i < centroids.Length; i++)
            for (int d = 0; d < cols; d++)
                centroids[i][d] = newCentroids[i][d];
    }
    while (true);

    // Compute cluster information (optional)
    for (int i = 0; i < k; i++)
    {
        // Compute the proportion of samples in the cluster
        proportions[i] = clusters[i].Count / (double)x.Length;
    }

    if (ComputeError)
    {
        // Compute the average error
        Error = Clusters.Distortion(x, labels);
    }

    Accord.Diagnostics.Debug.Assert(Clusters.NumberOfClasses == K);
    Accord.Diagnostics.Debug.Assert(Clusters.NumberOfOutputs == K);
    Accord.Diagnostics.Debug.Assert(Clusters.NumberOfInputs == x[0].Length);

    // Return the classification result
    return Clusters;
}
/// <summary>
///   Divides the input data into K clusters.
/// </summary>
///
/// <param name="points">The data where to compute the algorithm.</param>
///
/// <returns>The cluster label assigned to each input point.</returns>
///
/// <exception cref="ArgumentNullException">points</exception>
/// <exception cref="ArgumentException">Not enough points. There should be more points than the number K of clusters.</exception>
///
public int[] Compute(T[][] points)
{
    // Initial argument checking
    if (points == null)
        throw new ArgumentNullException(nameof(points));

    if (points.Length < K)
        throw new ArgumentException("Not enough points. There should be more points than the number K of clusters.");

    int k = this.K;
    int rows = points.Length;
    int cols = points[0].Length;

    // Perform a random initialization of the clusters
    // if the algorithm has not been initialized before.
    if (this.Clusters.Centroids[0] == null)
    {
        Clusters.Randomize(points);
    }

    // Initial variables
    int[] labels = new int[rows];
    double[] proportions = Clusters.Proportions;
    T[][] centroids = Clusters.Centroids;
    T[][] newCentroids = new T[k][];
    for (int i = 0; i < newCentroids.Length; i++)
        newCentroids[i] = new T[cols];

    var clusters = new List<T[]>[k];
    for (int i = 0; i < k; i++)
        clusters[i] = new List<T[]>();

    Iterations = 0;

    do // Main loop
    {
        // Reset the centroids and the cluster member counters
        for (int i = 0; i < k; i++)
            clusters[i].Clear();

        // First we will accumulate the data points into their nearest
        // clusters, storing this information into the clusters variable.

        // For each point in the data set,
        for (int i = 0; i < points.Length; i++)
        {
            // Get the point
            T[] point = points[i];

            // Compute the nearest cluster centroid
            int c = labels[i] = Clusters.Nearest(points[i]);

            // Accumulate in the corresponding centroid
            clusters[c].Add(point);
        }

        // Next we will compute each cluster's new centroid
        // value by computing the mode in each cluster.
        for (int i = 0; i < k; i++)
        {
            if (clusters[i].Count == 0)
            {
                // BUG FIX: keep the previous centroid by copying its values.
                // The original assigned the array reference
                // (newCentroids[i] = centroids[i]), aliasing the buffers so
                // that a later generation's writes would mutate the old
                // centroids and corrupt the convergence test.
                for (int d = 0; d < cols; d++)
                    newCentroids[i][d] = centroids[i][d];
                continue;
            }

            T[][] p = clusters[i].ToArray();

            // For each dimension
            for (int d = 0; d < Dimension; d++)
            {
                T[] values = p.GetColumn(d);
                T mode = values.Mode(alreadySorted: false, inPlace: true);
                newCentroids[i][d] = mode;
            }
        }

        // The algorithm stops when there is no further change in the
        // centroids (relative difference is less than the threshold).
        if (converged(centroids, newCentroids))
            break;

        // go to next generation. BUG FIX: copy element values rather than
        // array references so centroids and newCentroids keep distinct
        // buffers; reference assignment made the next generation's writes
        // mutate centroids in place, so converged() compared an array with
        // itself and always reported convergence on the following pass.
        for (int i = 0; i < centroids.Length; i++)
            for (int d = 0; d < cols; d++)
                centroids[i][d] = newCentroids[i][d];
    }
    while (true);

    // Compute cluster information (optional)
    for (int i = 0; i < k; i++)
    {
        // Compute the proportion of samples in the cluster
        proportions[i] = clusters[i].Count / (double)points.Length;
    }

    if (ComputeError)
    {
        // Compute the average error
        Error = Clusters.Distortion(points, labels);
    }

    // Return the classification result
    return labels;
}
/// <summary>
///   Learns a model that can map the given inputs to the desired outputs.
/// </summary>
///
/// <param name="x">The model inputs.</param>
/// <param name="weights">The weight of importance for each input sample.</param>
///
/// <returns>A model that has learned how to produce suitable outputs
///   given the input data <paramref name="x" />.</returns>
///
/// <exception cref="ArgumentNullException">x</exception>
/// <exception cref="ArgumentException">Not enough points. There should be more points than the number K of clusters.</exception>
///
public KMedoidsClusterCollection<T> Learn(T[][] x, double[] weights = null)
{
    // Initial argument checking
    if (x == null)
        throw new ArgumentNullException(nameof(x)); // was "points": wrong parameter name

    if (x.Length < K)
        throw new ArgumentException("Not enough points. There should be more points than the number K of clusters.");

    // Perform initialization of the clusters
    int[] currentMedoidIndicesArray = Clusters.Randomize(x, Initialization, ParallelOptions);

    // Detect initial medoid indices: find, for each centroid, the data
    // point that matches it exactly (distance zero).
    if (currentMedoidIndicesArray == null)
    {
        currentMedoidIndicesArray = Vector.Create(value: -1, size: K);
        Parallel.For(0, x.Length, ParallelOptions, i =>
        {
            for (int j = 0; j < Clusters.Centroids.Length; j++)
            {
                if (Distance.Distance(Clusters.Centroids[j], x[i]) == 0)
                {
                    // Atomically claim medoid slot j for point i; a non -1
                    // previous value means another point already claimed it.
                    int prev = Interlocked.CompareExchange(ref currentMedoidIndicesArray[j], i, -1);
                    if (prev != -1)
                        throw new Exception("Duplicate medoid #{0} detected: {1} and {2}".Format(j, prev, i));
                    break;
                }
            }
        });
    }

    for (int i = 0; i < currentMedoidIndicesArray.Length; ++i)
    {
        if (currentMedoidIndicesArray[i] == -1)
            throw new Exception("Medoid #{0} not found.".Format(i));
    }

    Iterations = 0;

    int[] labels = new int[x.Length];

    // Special case - one medoid.
    if (K == 1)
    {
        // Arrange point with minimal total cost as medoid.
        int imin = -1;
        double min = Double.PositiveInfinity;
        for (int i = 0; i < x.Length; i++)
        {
            double cost = 0.0;
            for (int j = 0; j < x.Length; j++)
                cost += Distance.Distance(x[i], x[j]);

            if (cost < min)
            {
                // BUG FIX: 'min' was never updated here, so every point
                // passed the test and the LAST point (not the one with
                // minimal total cost) was selected as the medoid.
                min = cost;
                imin = i;
            }
        }

        Clusters.Centroids[0] = x[imin];
    }
    else
    {
        Compute(x, labels, currentMedoidIndicesArray);
    }

    // Miscellaneous final computations
    if (ComputeError)
    {
        // Compute the average error
#if DEBUG
        var expected = Clusters.Decide(x);
        if (!expected.IsEqual(labels))
            throw new Exception();
#endif
        Error = Clusters.Distortion(x, labels);
    }

    Accord.Diagnostics.Debug.Assert(Clusters.NumberOfClasses == K);
    Accord.Diagnostics.Debug.Assert(Clusters.NumberOfOutputs == K);
    Accord.Diagnostics.Debug.Assert(Clusters.NumberOfInputs == x[0].Length);

    // Return the classification result
    return Clusters;
}