/// <summary>
///   Learns a model that can map the given inputs to the desired outputs.
/// </summary>
/// <param name="x">The model inputs.</param>
/// <param name="weights">The weight of importance for each input sample.
///   This method does not read the weights.</param>
/// <returns>A model that has learned how to produce suitable outputs
///   given the input data <paramref name="x" />.</returns>
/// <exception cref="ArgumentNullException"><paramref name="x"/> is null.</exception>
/// <exception cref="ArgumentException">Not enough points. There should be more points than the number K of clusters.</exception>
public KModesClusterCollection<T> Learn(T[][] x, double[] weights = null)
{
    // Initial argument checking
    if (x == null)
    {
        throw new ArgumentNullException("x");
    }

    if (x.Length < K)
    {
        throw new ArgumentException("Not enough points. There should be more points than the number K of clusters.");
    }

    int k = this.K;
    int rows = x.Length;
    int cols = x[0].Length;

    // Perform a random initialization of the clusters
    // if the algorithm has not been initialized before.
    if (this.Clusters.Centroids[0] == null)
    {
        Clusters.Randomize(x);
    }

    // Initial variables
    int[] labels = new int[rows];
    double[] proportions = Clusters.Proportions;
    T[][] centroids = Clusters.Centroids;

    T[][] newCentroids = new T[k][];
    for (int i = 0; i < newCentroids.Length; i++)
    {
        newCentroids[i] = new T[cols];
    }

    var clusters = new ConcurrentBag<T[]>[k];

    this.Iterations = 0;

    do // Main loop
    {
        // Start every pass with empty member bags.
        for (int i = 0; i < k; i++)
        {
            clusters[i] = new ConcurrentBag<T[]>();
        }

        // First we will accumulate the data points
        // into the bag of their nearest cluster.
        Parallel.For(0, x.Length, ParallelOptions, i =>
        {
            // Get the point
            T[] point = x[i];

            // Compute the nearest cluster centroid
            int c = labels[i] = Clusters.Decide(x[i]);

            // Accumulate in the corresponding cluster
            clusters[c].Add(point);
        });

        // Next we will compute each cluster's new centroid
        // value by computing the mode in each cluster.
        Parallel.For(0, k, ParallelOptions, i =>
        {
            if (clusters[i].Count == 0)
            {
                // An empty cluster keeps its previous centroid. The values are
                // copied (not the array reference) so that the old and the new
                // centroid tables never share a row.
                newCentroids[i] = (T[])centroids[i].Clone();
            }
            else
            {
                T[][] p = Matrix.Transpose<ConcurrentBag<T[]>, T>(clusters[i]);

                // For each dimension
                for (int d = 0; d < this.Dimension; d++)
                {
                    newCentroids[i][d] = p[d].Mode(alreadySorted: false, inPlace: true);
                }
            }
        });

        // The algorithm stops when there is no further change in the centroids.
        if (converged(centroids, newCentroids))
        {
            break;
        }

        // Go to next generation. Each row is cloned: assigning the reference
        // would make centroids[i] and newCentroids[i] the same array, so the
        // next pass would overwrite the old centroid while computing the new
        // one and the convergence test would compare a row with itself.
        for (int i = 0; i < centroids.Length; i++)
        {
            centroids[i] = (T[])newCentroids[i].Clone();
        }
    }
    while (true);

    // Compute cluster information (optional)
    for (int i = 0; i < k; i++)
    {
        // Compute the proportion of samples in the cluster
        proportions[i] = clusters[i].Count / (double)x.Length;
    }

    if (ComputeError)
    {
        // Compute the average error
        Error = Clusters.Distortion(x, labels);
    }

    Accord.Diagnostics.Debug.Assert(Clusters.NumberOfClasses == K);
    Accord.Diagnostics.Debug.Assert(Clusters.NumberOfOutputs == K);
    Accord.Diagnostics.Debug.Assert(Clusters.NumberOfInputs == x[0].Length);

    // Return the classification result
    return Clusters;
}
/// <summary>
///   Divides the input data into K clusters.
/// </summary>
///
/// <param name="data">The data where to compute the algorithm.</param>
/// <param name="weights">The weight to consider for each data sample. This is used in weighted K-Means.</param>
/// <param name="weightSum">The total sum of the weights in <paramref name="weights"/>.</param>
///
/// <returns>The index of the cluster assigned to each sample of
///   <paramref name="data"/> during the last pass.</returns>
///
private int[] Compute(double[][] data, double[] weights, double weightSum)
{
    this.Iterations = 0;

    // TODO: Implement a faster version using the triangle
    // inequality to reduce the number of distance calculations
    //
    //  - http://www-cse.ucsd.edu/~elkan/kmeansicml03.pdf
    //  - http://mloss.org/software/view/48/
    //

    int k = this.K;
    int sampleCount = data.Length;
    int dimension = data[0].Length;

    // Centroids that were never set get a random initialization.
    if (this.Clusters.Centroids[0] == null)
    {
        Randomize(data);
    }

    // Working state
    int[] labels = new int[sampleCount];
    double[] clusterWeight = new double[k];
    double[][] current = clusters.Centroids;

    double[][] updated = new double[k][];
    for (int c = 0; c < updated.Length; c++)
    {
        updated[c] = new double[dimension];
    }

    // One lock object per cluster guards its accumulator row and weight.
    Object[] gates = new Object[K];
    for (int c = 0; c < gates.Length; c++)
    {
        gates[c] = new Object();
    }

    Iterations = 0;

    bool finished;

    do // Main loop
    {
        // Zero the accumulators left over from the previous pass.
        Array.Clear(clusterWeight, 0, clusterWeight.Length);
        foreach (double[] row in updated)
        {
            Array.Clear(row, 0, row.Length);
        }

        // Accumulation step: add every weighted sample
        // to the running sum of its nearest cluster.
        Parallel.For(0, data.Length, ParallelOptions, n =>
        {
            double[] sample = data[n];
            double w = weights[n];

            // Nearest centroid, also recorded as the sample's label
            int nearest = Clusters.Decide(sample);
            labels[n] = nearest;

            double[] accumulator = updated[nearest];

            lock (gates[nearest])
            {
                // Total weight gathered by the cluster
                clusterWeight[nearest] += w;

                // Weighted coordinate sums
                for (int j = 0; j < sample.Length; j++)
                {
                    accumulator[j] += sample[j] * w;
                }
            }
        });

        // Averaging step: divide each sum by the weight the cluster
        // received. Clusters that received no weight are left as cleared.
        Parallel.For(0, updated.Length, ParallelOptions, c =>
        {
            double total = clusterWeight[c];

            if (total > 0)
            {
                double[] row = updated[c];
                for (int j = 0; j < row.Length; j++)
                {
                    row[j] /= total;
                }
            }
        });

        // The convergence test runs before the centroids are overwritten.
        finished = converged(current, updated);

        // Go to next generation: copy the new values into the model's centroids.
        Parallel.For(0, current.Length, ParallelOptions, c =>
        {
            for (int j = 0; j < current[c].Length; j++)
            {
                current[c][j] = updated[c][j];
            }
        });
    }
    while (!finished);

    // Proportion of the total weight that ended up in each cluster
    for (int c = 0; c < clusters.Centroids.Length; c++)
    {
        clusters.Proportions[c] = clusterWeight[c] / weightSum;
    }

    ComputeInformation(data, labels);

    return labels;
}
/// <summary>
///   Learns a model that can map the given inputs to the desired outputs.
/// </summary>
/// <param name="x">The model inputs.</param>
/// <param name="weights">The weight of importance for each input sample. When null,
///   a vector of ones is used. The weights are validated here but are not passed
///   to the splitting step.</param>
/// <returns>A model that has learned how to produce suitable outputs
///   given the input data <paramref name="x" />.</returns>
/// <exception cref="ArgumentNullException"><paramref name="x"/> is null.</exception>
/// <exception cref="ArgumentException">There are fewer points than clusters, the
///   weights vector has a different length than <paramref name="x"/>, or the
///   weights do not sum to a positive value.</exception>
/// <exception cref="DimensionMismatchException">The rows of <paramref name="x"/>
///   do not all have the same length.</exception>
public override KMeansClusterCollection Learn(double[][] x, double[] weights = null)
{
    // Initial argument checking
    if (x == null)
    {
        throw new ArgumentNullException("x");
    }

    if (x.Length < K)
    {
        throw new ArgumentException("Not enough points. There should be more points than the number K of clusters.");
    }

    if (weights == null)
    {
        weights = Vector.Ones(x.Length);
    }

    if (x.Length != weights.Length)
    {
        throw new ArgumentException("Data weights vector must be the same length as data samples.");
    }

    double weightSum = weights.Sum();
    if (weightSum <= 0)
    {
        throw new ArgumentException("The sum of the data weights must be greater than zero.");
    }

    int cols = x.Columns();
    for (int i = 0; i < x.Length; i++)
    {
        if (x[i].Length != cols)
        {
            throw new DimensionMismatchException("x",
                "The points matrix should be rectangular. The vector at position " + i
                + " has a different length than previous ones.");
        }
    }

    int k = Clusters.Count;

    // Two-cluster K-Means used to split one cluster at a time. It inherits
    // the distance and stopping settings of this learner.
    KMeans kmeans = new KMeans(2)
    {
        Distance = (IDistance<double[]>)Clusters.Distance,
        ComputeError = false,
        ComputeCovariances = false,
        UseSeeding = UseSeeding,
        Tolerance = Tolerance,
        MaxIterations = MaxIterations,
    };

    var centroids = Clusters.Centroids;
    var clusters = new double[k][][];
    var distortions = new double[k];

    // 1. Start with all data points in one cluster
    clusters[0] = x;

    // 2. Repeat steps 3 to 6 (k-1) times to obtain K centroids
    for (int current = 1; current < k; current++)
    {
        // 3. Choose cluster with largest distortion
        int chosen;
        distortions.Max(current, out chosen);

        // 4. Split cluster into two sub-clusters
        var splits = split(clusters[chosen], kmeans);

        clusters[chosen] = splits.Item1;
        clusters[current] = splits.Item2;

        // 5. Replace chosen centroid and add a new one
        centroids[chosen] = kmeans.Clusters.Centroids[0];
        centroids[current] = kmeans.Clusters.Centroids[1];

        // Recompute distortions for the updated clusters
        distortions[chosen] = kmeans.Clusters[0].Distortion(clusters[chosen]);
        distortions[current] = kmeans.Clusters[1].Distortion(clusters[current]);

        // 6. Increment cluster count (current = current + 1)
    }

    Clusters.NumberOfInputs = cols;

    Accord.Diagnostics.Debug.Assert(Clusters.NumberOfClasses == K);
    Accord.Diagnostics.Debug.Assert(Clusters.NumberOfOutputs == K);
    Accord.Diagnostics.Debug.Assert(Clusters.NumberOfInputs == x[0].Length);

    if (ComputeProportions)
    {
        // Label every point with the final model and turn
        // the label histogram into cluster proportions.
        int[] y = Clusters.Decide(x);
        int[] counts = y.Histogram();
        counts.Divide(y.Length, result: Clusters.Proportions);

        ComputeInformation(x, y);
    }
    else
    {
        ComputeInformation(x);
    }

    return Clusters;
}
/// <summary>
///   Learns a model that can map the given inputs to the desired outputs.
/// </summary>
/// <param name="x">The model inputs.</param>
/// <param name="weights">The weight of importance for each input sample.
///   This method does not read the weights.</param>
/// <returns>A model that has learned how to produce suitable outputs
///   given the input data <paramref name="x" />.</returns>
/// <exception cref="ArgumentNullException"><paramref name="x"/> is null.</exception>
/// <exception cref="ArgumentException">Not enough points. There should be more points than the number K of clusters.</exception>
/// <exception cref="Exception">An initial medoid could not be matched to a point of
///   <paramref name="x"/>. (A duplicate match is raised from inside a parallel loop.)</exception>
public KMedoidsClusterCollection<T> Learn(T[][] x, double[] weights = null)
{
    // Initial argument checking
    if (x == null)
    {
        throw new ArgumentNullException("x");
    }

    if (x.Length < K)
    {
        throw new ArgumentException("Not enough points. There should be more points than the number K of clusters.");
    }

    // Perform initialization of the clusters
    int[] currentMedoidIndicesArray = Clusters.Randomize(x, Initialization, ParallelOptions);

    // Detect initial medoid indices
    if (currentMedoidIndicesArray == null)
    {
        // No indices were returned: look for the points of x that lie at
        // distance zero from each existing centroid. -1 marks "not found yet".
        currentMedoidIndicesArray = Vector.Create(value: -1, size: K);

        Parallel.For(0, x.Length, ParallelOptions, i =>
        {
            for (int j = 0; j < Clusters.Centroids.Length; j++)
            {
                if (Distance.Distance(Clusters.Centroids[j], x[i]) == 0)
                {
                    // Claim slot j atomically; a slot that was already
                    // claimed means two points coincide with the same medoid.
                    int prev = Interlocked.CompareExchange(ref currentMedoidIndicesArray[j], i, -1);
                    if (prev != -1)
                    {
                        throw new Exception("Duplicate medoid #{0} detected: {1} and {2}".Format(j, prev, i));
                    }

                    break;
                }
            }
        });
    }

    for (int i = 0; i < currentMedoidIndicesArray.Length; ++i)
    {
        if (currentMedoidIndicesArray[i] == -1)
        {
            throw new Exception("Medoid #{0} not found.".Format(i));
        }
    }

    Iterations = 0;

    int[] labels = new int[x.Length];

    // Special case - one medoid.
    if (K == 1)
    {
        // Arrange point with minimal total cost as medoid.
        int imin = -1;
        double min = Double.PositiveInfinity;

        for (int i = 0; i < x.Length; i++)
        {
            // Total distance from candidate i to every point
            double cost = 0.0;
            for (int j = 0; j < x.Length; j++)
            {
                cost += Distance.Distance(x[i], x[j]);
            }

            if (cost < min)
            {
                // Remember the best cost as well as its index; without
                // updating min every later candidate would be accepted.
                min = cost;
                imin = i;
            }
        }

        Clusters.Centroids[0] = x[imin];
    }
    else
    {
        Compute(x, labels, currentMedoidIndicesArray);
    }

    // Miscellaneous final computations
    if (ComputeError)
    {
        // Compute the average error
#if DEBUG
        var expected = Clusters.Decide(x);
        if (!expected.IsEqual(labels))
        {
            throw new Exception();
        }
#endif

        Error = Clusters.Distortion(x, labels);
    }

    Accord.Diagnostics.Debug.Assert(Clusters.NumberOfClasses == K);
    Accord.Diagnostics.Debug.Assert(Clusters.NumberOfOutputs == K);
    Accord.Diagnostics.Debug.Assert(Clusters.NumberOfInputs == x[0].Length);

    // Return the classification result
    return Clusters;
}
/// <summary>
///   Learns a model that can map the given inputs to the desired outputs.
/// </summary>
/// <param name="x">The model inputs.</param>
/// <param name="weights">The weight of importance for each input sample.
///   When null, a vector of ones is used.</param>
/// <returns>A model that has learned how to produce suitable outputs
///   given the input data <paramref name="x" />.</returns>
/// <exception cref="ArgumentNullException"><paramref name="x"/> is null.</exception>
/// <exception cref="ArgumentException">There are fewer points than clusters, the
///   weights do not sum to a positive value, or the batch size is larger than
///   the number of points.</exception>
/// <exception cref="DimensionMismatchException">The weights vector has a different
///   length than <paramref name="x"/>, or <paramref name="x"/> is not rectangular.</exception>
public override KMeansClusterCollection Learn(double[][] x, double[] weights = null)
{
    if (x == null)
    {
        throw new ArgumentNullException("x", "No data supplied to Mini-Batch K-Means. The parameter x cannot be null.");
    }

    if (x.Length < K)
    {
        // ArgumentException takes (message, paramName), in that order.
        throw new ArgumentException("Not enough points. There should be more points than the number K of clusters.", "x");
    }

    if (weights == null)
    {
        weights = Vector.Ones(x.Length);
    }
    else
    {
        if (x.Length != weights.Length)
        {
            throw new DimensionMismatchException("weights", "Data weights vector must be the same length as data samples.");
        }
    }

    double weightSum = weights.Sum();
    if (weightSum <= 0)
    {
        throw new ArgumentException("The sum of the data weights must be greater than zero.", "weights");
    }

    if (!x.IsRectangular())
    {
        throw new DimensionMismatchException("x", "The points matrix should be rectangular. At least one vector has a different length than the others.");
    }

    if (this.batchSize > x.Length)
    {
        throw new ArgumentException("Not enough points. There should be more points in the dataset than in a batch. ");
    }

    // Apply the default initialization batch size first and clamp afterwards,
    // so that the default is also limited by the number of available points.
    if (this.initializationBatchSize.HasValue == false)
    {
        this.InitializationBatchSize = 3 * this.K;
    }

    if (this.InitializationBatchSize > x.Length)
    {
        this.InitializationBatchSize = x.Length;
    }

    int k = this.K;
    int rows = x.Length;
    int cols = x[0].Length;

    // Initial variables
    int[] labels = new int[rows];
    double[] count = new double[k]; // zero-initialized; number of updates received by each cluster
    double[][] centroids = Clusters.Centroids;
    double[][] newCentroids = Jagged.Zeros(k, cols);

    Iterations = 0;
    bool shouldStop = false;

    // NOTE(review): the initialization runs whenever the centroid array exists.
    // Confirm whether "not yet initialized" was the intended condition.
    if (Clusters.Centroids != null)
    {
        InitializeCentroids(x, weights);
    }

    while (!shouldStop) // Main loop
    {
        Iterations = Iterations + 1;

        // Getting indices of the points in this iteration's batch
        int[] batchIndices = MakeBatch(rows, this.batchSize);

        foreach (int index in batchIndices)
        {
            // Caching the center nearest to the point x[index]
            // and storing it in labels[index].
            double[] point = x[index];
            int clusterIndex = Clusters.Decide(point);
            labels[index] = clusterIndex;
        }

        // The centroids from the previous iteration remain in the variable
        // centroids. The refined centroids are built in newCentroids,
        // starting from a copy of the previous values.
        for (int i = 0; i < centroids.Length; i++)
        {
            for (int j = 0; j < centroids[i].Length; j++)
            {
                newCentroids[i][j] = centroids[i][j];
            }
        }

        // Updating the centroids.
        foreach (int pointIndex in batchIndices)
        {
            double[] point = x[pointIndex];
            int clusterIndex = labels[pointIndex];

            // Per-cluster learning rate: the reciprocal of the number
            // of updates the cluster has received so far.
            count[clusterIndex]++;
            double eta = 1.0 / count[clusterIndex];

            // Gradient step.
            for (int i = 0; i < newCentroids[clusterIndex].Length; i++)
            {
                newCentroids[clusterIndex][i] = (1.0 - eta) * newCentroids[clusterIndex][i]
                    + eta * point[i] * weights[pointIndex];
            }
        }

        // The algorithm stops when there is no further change in the
        // centroids (relative difference is lower than the threshold).
        shouldStop = converged(centroids, newCentroids);

        // Copying the refined centroids
        // from the variable newCentroids to the variable centroids.
        for (int i = 0; i < centroids.Length; i++)
        {
            for (int j = 0; j < centroids[i].Length; j++)
            {
                centroids[i][j] = newCentroids[i][j];
            }
        }
    }

    // NOTE(review): labels are only written for points that were drawn into a
    // batch, and count accumulates over every iteration, so the proportions
    // below are not guaranteed to sum to one. Consider deciding for every
    // point in x once the loop has finished.
    for (int i = 0; i < Clusters.Centroids.Length; i++)
    {
        // Computing the proportion of samples in the cluster.
        Clusters.Proportions[i] = count[i] / weightSum;
    }

    this.Labels = labels;

    ComputeInformation(x, labels);

    Accord.Diagnostics.Debug.Assert(Clusters.NumberOfClasses == K);
    Accord.Diagnostics.Debug.Assert(Clusters.NumberOfOutputs == K);
    Accord.Diagnostics.Debug.Assert(Clusters.NumberOfInputs == x[0].Length);

    return Clusters;
}