/// <summary>
///   Divides the input data into K clusters.
/// </summary>
///
/// <param name="points">The data on which to run the algorithm.</param>
/// <param name="threshold">The relative convergence threshold
///   for the algorithm. Default is 1e-5.</param>
///
public int[] Compute(double[][] points, double threshold)
{
    int k = Clusters.Count;

    KMeans kmeans = new KMeans(2, Clusters.Distance);

    double[][] centroids = Clusters.Centroids;
    double[][][] clusters = new double[k][][];
    double[] distortions = new double[k];

    // 1. Start with all data points in one cluster
    clusters[0] = points;

    // 2. Repeat steps 3 to 6 (k-1) times to obtain K centroids
    for (int current = 1; current < k; current++)
    {
        // 3. Choose the cluster with the largest distortion
        int chosen;
        distortions.Max(current, out chosen);

        // 4. Split the cluster into two sub-clusters
        var splits = split(clusters[chosen], kmeans, threshold);
        clusters[chosen] = splits.Item1;
        clusters[current] = splits.Item2;

        // 5. Replace the chosen centroid and add a new one
        centroids[chosen] = kmeans.Clusters.Centroids[0];
        centroids[current] = kmeans.Clusters.Centroids[1];

        // Recompute distortions for the updated clusters
        distortions[chosen] = kmeans.Clusters[0].Distortion(clusters[chosen]);
        distortions[current] = kmeans.Clusters[1].Distortion(clusters[current]);

        // 6. The cluster count is incremented by the loop (current++)
    }

    // Label each point with its nearest of the K final centroids
    return Clusters.Nearest(points);
}
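// --- Usage sketch (illustrative, not part of the original source) ---
// A minimal example of calling the Compute overload above. The
// BinarySplit class name and its (int k) constructor are assumptions
// based on how this method uses Clusters and split().
static void BinarySplitUsageExample()
{
    double[][] points =
    {
        new double[] { 0.0, 0.1 },
        new double[] { 0.2, 0.0 },
        new double[] { 5.0, 5.1 },
        new double[] { 5.2, 4.9 },
    };

    var binarySplit = new BinarySplit(2); // hypothetical constructor: k = 2 clusters

    // labels[i] holds the cluster index assigned to points[i]
    int[] labels = binarySplit.Compute(points, threshold: 1e-5);
}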
/// <summary>
///   Returns the index of the nearest cluster for each of the given points.
/// </summary>
///
public int[] Nearest(double[][] points)
{
    return Clusters.Nearest(points);
}
/// <summary>
///   Returns the index of the cluster nearest to the given point.
/// </summary>
///
public int Nearest(double[] point)
{
    return Clusters.Nearest(point);
}
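// --- Usage sketch (illustrative, not part of the original source) ---
// Once the model has been fit with Compute, the Nearest wrappers above
// classify new observations against the learned centroids without
// re-running the clustering. The KMeans parameter type is an assumption.
static void NearestUsageExample(KMeans model)
{
    // A single previously unseen observation
    double[] observation = { 4.8, 5.0 };
    int cluster = model.Nearest(observation);

    // Or label a whole batch at once
    int[] assignments = model.Nearest(new[]
    {
        new double[] { 4.8, 5.0 },
        new double[] { 0.1, 0.2 },
    });
}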
/// <summary>
///   Divides the input data into K clusters.
/// </summary>
///
/// <param name="data">The data on which to run the algorithm.</param>
/// <param name="threshold">The relative convergence threshold
///   for the algorithm. Default is 1e-5.</param>
/// <param name="computeInformation">Pass <c>true</c> to compute additional information
///   when the algorithm finishes, such as cluster variances and proportions; <c>false</c>
///   otherwise. Default is <c>true</c>.</param>
///
public int[] Compute(double[][] data, double threshold = 1e-5, bool computeInformation = true)
{
    // Initial argument checking
    if (data == null)
        throw new ArgumentNullException("data");

    if (data.Length < K)
        throw new ArgumentException("Not enough points. There should be at least as many points as the number K of clusters.");

    if (threshold < 0)
        throw new ArgumentException("Threshold should be a non-negative number.", "threshold");

    // TODO: Implement a faster version using the triangle
    // inequality to reduce the number of distance calculations
    //
    //  - http://www-cse.ucsd.edu/~elkan/kmeansicml03.pdf
    //  - http://mloss.org/software/view/48/
    //

    int k = this.K;
    int rows = data.Length;
    int cols = data[0].Length;

    // Perform a random initialization of the clusters
    // if the algorithm has not been initialized before.
    if (this.Clusters.Centroids[0] == null)
        Randomize(data, useSeeding: false);

    // Initial variables
    int[] count = new int[k];
    int[] labels = new int[rows];
    double[][] centroids = clusters.Centroids;
    double[][] newCentroids = new double[k][];
    for (int i = 0; i < newCentroids.Length; i++)
        newCentroids[i] = new double[cols];

    Object[] syncObjects = new Object[k];
    for (int i = 0; i < syncObjects.Length; i++)
        syncObjects[i] = new Object();

    bool shouldStop = false;

    while (!shouldStop) // Main loop
    {
        // Reset the centroids and the member counters
        for (int i = 0; i < newCentroids.Length; i++)
            Array.Clear(newCentroids[i], 0, newCentroids[i].Length);
        Array.Clear(count, 0, count.Length);

        // First we will accumulate the data points
        // into their nearest clusters, storing this
        // information into the newCentroids variable.

        // For each point in the data set,
        Parallel.For(0, data.Length, i =>
        {
            // Get the point
            double[] point = data[i];

            // Get the nearest cluster centroid
            int c = labels[i] = Clusters.Nearest(point);

            // Increase the cluster's sample counter
            Interlocked.Increment(ref count[c]);

            // Get the closest cluster centroid
            double[] centroid = newCentroids[c];

            lock (syncObjects[c])
            {
                // Accumulate in the cluster centroid
                for (int j = 0; j < point.Length; j++)
                    centroid[j] += point[j];
            }
        });

        // Next we will compute each cluster's new centroid
        // by dividing the accumulated sums by the number of
        // samples in each cluster, thus averaging its members.
        for (int i = 0; i < newCentroids.Length; i++)
        {
            double clusterCount = count[i];

            if (clusterCount != 0)
            {
                for (int j = 0; j < newCentroids[i].Length; j++)
                    newCentroids[i][j] /= clusterCount;
            }
        }

        // The algorithm stops when there is no further change in the
        // centroids (relative difference is less than the threshold).
        shouldStop = converged(centroids, newCentroids, threshold);

        // go to next generation
        for (int i = 0; i < centroids.Length; i++)
            for (int j = 0; j < centroids[i].Length; j++)
                centroids[i][j] = newCentroids[i][j];
    }

    for (int i = 0; i < centroids.Length; i++)
    {
        // Compute the proportion of samples in the cluster
        clusters.Proportions[i] = count[i] / (double)data.Length;
    }

    if (computeInformation)
    {
        // Compute cluster information (optional)
        for (int i = 0; i < centroids.Length; i++)
        {
            // Extract the data for the current cluster
            double[][] sub = data.Submatrix(labels.Find(x => x == i));

            if (sub.Length > 0)
            {
                // Compute the current cluster covariance
                clusters.Covariances[i] = Statistics.Tools.Covariance(sub, centroids[i]);
            }
            else
            {
                // The cluster doesn't have any samples
                clusters.Covariances[i] = new double[cols, cols];
            }
        }
    }

    // Return the classification result
    return labels;
}
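// --- Usage sketch (illustrative, not part of the original source) ---
// A minimal example of the Compute overload above; the KMeans(int k)
// constructor is an assumption based on other code in this file.
static void KMeansUsageExample()
{
    double[][] data =
    {
        new double[] { 1.0, 2.0 },
        new double[] { 1.1, 1.9 },
        new double[] { 8.0, 8.2 },
        new double[] { 7.9, 8.1 },
    };

    var kmeans = new KMeans(2); // two clusters

    // With computeInformation: true, cluster covariances and
    // proportions are also filled in before the call returns.
    int[] labels = kmeans.Compute(data, threshold: 1e-5, computeInformation: true);
}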
/// <summary>
///   Divides the input data into K clusters.
/// </summary>
///
/// <param name="data">The data on which to run the algorithm.</param>
/// <param name="weights">The weight associated with each data point.</param>
///
public override int[] Compute(double[][] data, double[] weights)
{
    // Initial argument checking
    if (data == null)
        throw new ArgumentNullException("data");

    if (data.Length < K)
        throw new ArgumentException("Not enough points. There should be at least as many points as the number K of clusters.");

    if (weights == null)
        throw new ArgumentNullException("weights");

    if (data.Length != weights.Length)
        throw new ArgumentException("Data weights vector must be the same length as data samples.");

    double weightSum = weights.Sum();
    if (weightSum <= 0)
        throw new ArgumentException("The sample weights must sum to a positive number.", "weights");

    int cols = data[0].Length;
    for (int i = 0; i < data.Length; i++)
    {
        if (data[i].Length != cols)
        {
            throw new DimensionMismatchException("data",
                "The points matrix should be rectangular. The vector at position " + i +
                " has a different length than previous ones.");
        }
    }

    int k = Clusters.Count;

    KMeans kmeans = new KMeans(2)
    {
        Distance = (IDistance<double[]>)Clusters.Distance,
        ComputeError = false,
        ComputeCovariances = false,
        UseSeeding = UseSeeding,
        Tolerance = Tolerance,
        MaxIterations = MaxIterations,
    };

    double[][] centroids = Clusters.Centroids;
    double[][][] clusters = new double[k][][];
    double[] distortions = new double[k];

    // 1. Start with all data points in one cluster
    clusters[0] = data;

    // 2. Repeat steps 3 to 6 (k-1) times to obtain K centroids
    for (int current = 1; current < k; current++)
    {
        // 3. Choose the cluster with the largest distortion
        int chosen;
        distortions.Max(current, out chosen);

        // 4. Split the cluster into two sub-clusters
        var splits = split(clusters[chosen], kmeans);
        clusters[chosen] = splits.Item1;
        clusters[current] = splits.Item2;

        // 5. Replace the chosen centroid and add a new one
        centroids[chosen] = kmeans.Clusters.Centroids[0];
        centroids[current] = kmeans.Clusters.Centroids[1];

        // Recompute distortions for the updated clusters
        distortions[chosen] = kmeans.Clusters[0].Distortion(clusters[chosen]);
        distortions[current] = kmeans.Clusters[1].Distortion(clusters[current]);

        // 6. The cluster count is incremented by the loop (current++)
    }

    return Clusters.Nearest(data);
}
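// --- Usage sketch (illustrative, not part of the original source) ---
// The weighted overload above lets duplicated or more reliable samples
// exert proportionally more influence on the centroids. The weights
// must sum to a positive value, as enforced by the argument checks.
// The BinarySplit parameter type is an assumption.
static void WeightedComputeUsageExample(BinarySplit clusterer)
{
    double[][] data =
    {
        new double[] { 0.0, 0.0 },
        new double[] { 1.0, 1.0 },
        new double[] { 9.0, 9.0 },
    };

    // The last sample counts three times as much as the others
    double[] weights = { 1.0, 1.0, 3.0 };

    int[] labels = clusterer.Compute(data, weights);
}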
/// <summary>
///   Divides the input data into K clusters.
/// </summary>
///
/// <param name="points">The data on which to run the algorithm.</param>
///
public int[] Compute(T[][] points)
{
    // Initial argument checking
    if (points == null)
        throw new ArgumentNullException("points");

    if (points.Length < K)
        throw new ArgumentException("Not enough points. There should be at least as many points as the number K of clusters.");

    int k = this.K;
    int rows = points.Length;
    int cols = points[0].Length;

    // Perform a random initialization of the clusters
    // if the algorithm has not been initialized before.
    //
    if (this.Clusters.Centroids[0] == null)
        Clusters.Randomize(points);

    // Initial variables
    int[] labels = new int[rows];
    double[] proportions = Clusters.Proportions;
    T[][] centroids = Clusters.Centroids;
    T[][] newCentroids = new T[k][];
    for (int i = 0; i < newCentroids.Length; i++)
        newCentroids[i] = new T[cols];

    var clusters = new List<T[]>[k];
    for (int i = 0; i < k; i++)
        clusters[i] = new List<T[]>();

    Iterations = 0;

    do // Main loop
    {
        // Reset the cluster member lists
        for (int i = 0; i < k; i++)
            clusters[i].Clear();

        // First we will accumulate the data points
        // into their nearest clusters, storing this
        // information into the clusters variable.

        // For each point in the data set,
        for (int i = 0; i < points.Length; i++)
        {
            // Get the point
            T[] point = points[i];

            // Compute the nearest cluster centroid
            int c = labels[i] = Clusters.Nearest(points[i]);

            // Accumulate in the corresponding cluster
            clusters[c].Add(point);
        }

        // Next we will compute each cluster's new centroid
        // value by computing the mode in each cluster.
        for (int i = 0; i < k; i++)
        {
            if (clusters[i].Count == 0)
            {
                newCentroids[i] = centroids[i];
                continue;
            }

            T[][] p = clusters[i].ToArray();

            // For each dimension
            for (int d = 0; d < Dimension; d++)
            {
                T[] values = p.GetColumn(d);

                T mode = values.Mode(alreadySorted: false, inPlace: true);

                newCentroids[i][d] = mode;
            }
        }

        // The algorithm stops when there is no further change in the
        // centroids (relative difference is less than the threshold).
        if (converged(centroids, newCentroids))
            break;

        // go to next generation
        for (int i = 0; i < centroids.Length; i++)
            centroids[i] = newCentroids[i];

    } while (true);

    // Compute cluster information (optional)
    for (int i = 0; i < k; i++)
    {
        // Compute the proportion of samples in the cluster
        proportions[i] = clusters[i].Count / (double)points.Length;
    }

    if (ComputeError)
    {
        // Compute the average error
        Error = Clusters.Distortion(points, labels);
    }

    // Return the classification result
    return labels;
}
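// --- Usage sketch (illustrative, not part of the original source) ---
// The overload above is the categorical (K-Modes) variant: centroids
// are per-dimension modes rather than means, so it can cluster rows of
// symbols. The KModes<int> class name and constructor are assumptions
// suggested by the generic T[][] signature.
static void KModesUsageExample()
{
    int[][] answers =
    {
        new[] { 1, 0, 1 },
        new[] { 1, 0, 0 },
        new[] { 0, 1, 1 },
        new[] { 0, 1, 0 },
    };

    var kmodes = new KModes<int>(2); // hypothetical constructor: k = 2

    // Each centroid ends up as the per-column mode of its members
    int[] labels = kmodes.Compute(answers);
}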
/// <summary>
///   Performs the actual clustering, given a set of data points and
///   a convergence threshold. The remaining parameters must be set
///   before the method returns.
/// </summary>
///
protected virtual void PerformClustering(double[][] data, double threshold,
    double[][] newCentroids, int[] count, int[] labels, double[][] centroids)
{
    Object[] syncObjects = new Object[K];
    for (int i = 0; i < syncObjects.Length; i++)
        syncObjects[i] = new Object();

    bool shouldStop = false;

    while (!shouldStop) // Main loop
    {
        // Reset the centroids and the member counters
        for (int i = 0; i < newCentroids.Length; i++)
            Array.Clear(newCentroids[i], 0, newCentroids[i].Length);
        Array.Clear(count, 0, count.Length);

        // First we will accumulate the data points
        // into their nearest clusters, storing this
        // information into the newCentroids variable.

        // For each point in the data set,
        Parallel.For(0, data.Length, i =>
        {
            // Get the point
            double[] point = data[i];

            // Get the nearest cluster centroid
            int c = labels[i] = Clusters.Nearest(point);

            // Increase the cluster's sample counter
            Interlocked.Increment(ref count[c]);

            // Get the closest cluster centroid
            double[] centroid = newCentroids[c];

            lock (syncObjects[c])
            {
                // Accumulate in the cluster centroid
                for (int j = 0; j < point.Length; j++)
                    centroid[j] += point[j];
            }
        });

        // Next we will compute each cluster's new centroid
        // by dividing the accumulated sums by the number of
        // samples in each cluster, thus averaging its members.
        for (int i = 0; i < newCentroids.Length; i++)
        {
            double clusterCount = count[i];

            if (clusterCount != 0)
            {
                for (int j = 0; j < newCentroids[i].Length; j++)
                    newCentroids[i][j] /= clusterCount;
            }
        }

        // The algorithm stops when there is no further change in the
        // centroids (relative difference is less than the threshold).
        shouldStop = converged(centroids, newCentroids, threshold);

        // go to next generation
        for (int i = 0; i < centroids.Length; i++)
            for (int j = 0; j < centroids[i].Length; j++)
                centroids[i][j] = newCentroids[i][j];
    }
}
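// --- Illustrative sketch (not part of the original source) ---
// PerformClustering relies on a converged(...) helper that is not shown
// in this excerpt. The method below is a guess at a relative-difference
// test consistent with the comments above, not the library's actual code.
private static bool ConvergedSketch(double[][] oldCentroids, double[][] newCentroids, double threshold)
{
    for (int i = 0; i < oldCentroids.Length; i++)
    {
        for (int j = 0; j < oldCentroids[i].Length; j++)
        {
            double oldValue = oldCentroids[i][j];
            double newValue = newCentroids[i][j];

            // Relative change, guarding against division by zero
            double change = System.Math.Abs(oldValue - newValue);
            double scale = System.Math.Max(System.Math.Abs(oldValue), 1e-10);

            if (change / scale > threshold)
                return false; // at least one centroid is still moving
        }
    }

    return true; // no coordinate moved more than the relative threshold
}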
/// <summary>
///   Divides the input data into K clusters.
/// </summary>
///
/// <param name="points">The data on which to run the algorithm.</param>
/// <param name="threshold">The relative convergence threshold
///   for the algorithm. Default is 1e-5.</param>
///
public int[] Compute(TData[] points, double threshold = 1e-5)
{
    // Initial argument checking
    if (points == null)
        throw new ArgumentNullException("points");

    if (threshold < 0)
        throw new ArgumentException("Threshold should be a non-negative number.", "threshold");

    int k = this.K;
    int rows = points.Length;

    // Perform a random initialization of the clusters
    // if the algorithm has not been initialized before.
    if (Clusters.Centroids[0] == null)
        Randomize(points);

    // Initial variables
    int[] labels = new int[rows];
    double[] proportions = Clusters.Proportions;
    TData[] centroids = Clusters.Centroids;
    TData[] newCentroids = new TData[k];

    List<TData>[] clusters = new List<TData>[k];
    for (int i = 0; i < k; i++)
        clusters[i] = new List<TData>();

    do // Main loop
    {
        // Reset the cluster member lists
        for (int i = 0; i < k; i++)
            clusters[i].Clear();

        // First we will accumulate the data points
        // into their nearest clusters, storing this
        // information into the clusters variable.

        // For each point in the data set,
        for (int i = 0; i < points.Length; i++)
        {
            // Get the point
            TData point = points[i];

            // Compute the nearest cluster centroid
            int c = labels[i] = Clusters.Nearest(points[i]);

            // Accumulate in the corresponding cluster
            clusters[c].Add(point);
        }

        // Next we will compute each cluster's new centroid
        // value by computing the mode in each cluster.
        for (int i = 0; i < k; i++)
            newCentroids[i] = Accord.Statistics.Tools.Mode<TData>(clusters[i].ToArray());

        // The algorithm stops when there is no further change in the
        // centroids (relative difference is less than the threshold).
        if (converged(centroids, newCentroids, threshold))
            break;

        // go to next generation
        for (int i = 0; i < centroids.Length; i++)
            centroids[i] = newCentroids[i];

    } while (true);

    // Compute cluster information (optional)
    for (int i = 0; i < k; i++)
    {
        // Compute the proportion of samples in the cluster
        proportions[i] = clusters[i].Count / (double)points.Length;
    }

    // Return the classification result
    return labels;
}
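// --- Illustrative sketch (not part of the original source) ---
// The centroid update above is a mode rather than a mean. For intuition,
// here is a standalone frequency-count mode, equivalent in spirit to the
// Tools.Mode call; the library's implementation may differ.
static T ModeOf<T>(System.Collections.Generic.IEnumerable<T> values)
{
    var counts = new System.Collections.Generic.Dictionary<T, int>();
    T best = default(T);
    int bestCount = -1;

    foreach (T v in values)
    {
        int c;
        counts.TryGetValue(v, out c);
        counts[v] = ++c;

        // Track the most frequent value seen so far
        if (c > bestCount)
        {
            bestCount = c;
            best = v;
        }
    }

    return best; // the most frequent value (ties go to the value that reached the count first)
}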
/// <summary>
///   Divides the input data into K clusters.
/// </summary>
///
public int[] Compute(double[][] data, double threshold = 1e-5, bool computeInformation = true)
{
    // Initial argument checking
    if (data == null)
        throw new ArgumentNullException("data");

    if (data.Length < K)
        throw new ArgumentException("Not enough points. There should be at least as many points as the number K of clusters.");

    if (threshold < 0)
        throw new ArgumentException("Threshold should be a non-negative number.", "threshold");

    int k = this.K;
    int rows = data.Length;
    int cols = data[0].Length;

    // Perform a random initialization of the clusters
    // if the algorithm has not been initialized before.
    if (this.Clusters.Centroids[0] == null)
        Randomize(data, useSeeding: false);

    // Initial variables
    int[] count = new int[k];
    int[] labels = new int[rows];
    double[][] centroids = clusters.Centroids;
    double[][] newCentroids = new double[k][];
    for (int i = 0; i < newCentroids.Length; i++)
        newCentroids[i] = new double[cols];

    double[][,] covariances = clusters.Covariances;
    double[] proportions = clusters.Proportions;

    bool shouldStop = false;

    while (!shouldStop) // Main loop
    {
        // Reset the centroids and the member counters
        for (int i = 0; i < newCentroids.Length; i++)
            Array.Clear(newCentroids[i], 0, newCentroids[i].Length);
        Array.Clear(count, 0, count.Length);

        // Accumulate each data point into its nearest cluster
        for (int i = 0; i < data.Length; i++)
        {
            // Get the point
            double[] point = data[i];

            // Get the nearest cluster centroid
            int c = labels[i] = Clusters.Nearest(point);

            // Increase the cluster's sample counter
            count[c]++;

            // Accumulate in the corresponding centroid
            for (int j = 0; j < point.Length; j++)
                newCentroids[c][j] += point[j];
        }

        // Average the accumulated sums to obtain the new centroids
        for (int i = 0; i < newCentroids.Length; i++)
        {
            double clusterCount = count[i];

            if (clusterCount != 0)
            {
                for (int j = 0; j < newCentroids[i].Length; j++)
                    newCentroids[i][j] /= clusterCount;
            }
        }

        // The algorithm stops when there is no further change in the
        // centroids (relative difference is less than the threshold).
        shouldStop = converged(centroids, newCentroids, threshold);

        // go to next generation
        for (int i = 0; i < centroids.Length; i++)
            for (int j = 0; j < centroids[i].Length; j++)
                centroids[i][j] = newCentroids[i][j];
    }

    if (computeInformation)
    {
        // Compute cluster information (optional)
        for (int i = 0; i < centroids.Length; i++)
        {
            // Extract the data for the current cluster
            double[][] sub = data.Submatrix(labels.Find(x => x == i));

            if (sub.Length > 0)
            {
                // Compute the current cluster covariance
                covariances[i] = Accord.Statistics.Tools.Covariance(sub, centroids[i]);
            }
            else
            {
                // The cluster doesn't have any samples
                covariances[i] = new double[cols, cols];
            }

            // Compute the proportion of samples in the cluster
            proportions[i] = (double)sub.Length / data.Length;
        }
    }

    clusters.Centroids = centroids;

    // Return the classification result
    return labels;
}
/// <summary>
///   Divides the input data into K clusters.
/// </summary>
///
/// <param name="data">The data on which to run the algorithm.</param>
/// <param name="weights">The weight to consider for each data sample. This is used in weighted K-Means.</param>
/// <param name="weightSum">The total sum of the weights in <paramref name="weights"/>.</param>
///
protected virtual int[] Compute(double[][] data, double[] weights, double weightSum)
{
    this.Iterations = 0;

    // TODO: Implement a faster version using the triangle
    // inequality to reduce the number of distance calculations
    //
    //  - http://www-cse.ucsd.edu/~elkan/kmeansicml03.pdf
    //  - http://mloss.org/software/view/48/
    //

    int k = this.K;
    int rows = data.Length;
    int cols = data[0].Length;

    // Perform a random initialization of the clusters
    // if the algorithm has not been initialized before.
    //
    if (this.Clusters.Centroids[0] == null)
        Randomize(data);

    // Initial variables
    int[] labels = new int[rows];
    double[] count = new double[k];
    double[][] centroids = clusters.Centroids;
    double[][] newCentroids = new double[k][];
    for (int i = 0; i < newCentroids.Length; i++)
        newCentroids[i] = new double[cols];

    Object[] syncObjects = new Object[K];
    for (int i = 0; i < syncObjects.Length; i++)
        syncObjects[i] = new Object();

    bool shouldStop = false;

    while (!shouldStop) // Main loop
    {
        Array.Clear(count, 0, count.Length);
        for (int i = 0; i < newCentroids.Length; i++)
            Array.Clear(newCentroids[i], 0, newCentroids[i].Length);

        // First we will accumulate the data points
        // into their nearest clusters, storing this
        // information into the newCentroids variable.

        // For each point in the data set,
        Parallel.For(0, data.Length, i =>
        {
            // Get the point and its weight
            double[] point = data[i];
            double weight = weights[i];

            // Get the nearest cluster centroid
            int c = labels[i] = Clusters.Nearest(point);

            // Get the closest cluster centroid
            double[] centroid = newCentroids[c];

            lock (syncObjects[c])
            {
                // Increase the cluster's weighted sample counter
                count[c] += weight;

                // Accumulate in the cluster centroid
                for (int j = 0; j < point.Length; j++)
                    centroid[j] += point[j] * weight;
            }
        });

        // Next we will compute each cluster's new centroid
        // by dividing the accumulated sums by the total weight
        // in each cluster, thus averaging its members.
        Parallel.For(0, newCentroids.Length, i =>
        {
            double sum = count[i];

            if (sum > 0)
            {
                for (int j = 0; j < newCentroids[i].Length; j++)
                    newCentroids[i][j] /= sum;
            }
        });

        // The algorithm stops when there is no further change in the
        // centroids (relative difference is less than the threshold).
        shouldStop = converged(centroids, newCentroids);

        // go to next generation
        Parallel.For(0, centroids.Length, i =>
        {
            for (int j = 0; j < centroids[i].Length; j++)
                centroids[i][j] = newCentroids[i][j];
        });
    }

    for (int i = 0; i < clusters.Centroids.Length; i++)
    {
        // Compute the proportion of samples in the cluster
        clusters.Proportions[i] = count[i] / weightSum;
    }

    return labels;
}
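// --- Illustrative sketch (not part of the original source) ---
// The accumulation and division above implement a weighted mean: each
// new centroid coordinate is sum_i(w_i * x_ij) / sum_i(w_i) over the
// cluster's members. The standalone helper below shows the same update
// for a single cluster, sequentially and without locking.
static double[] WeightedCentroid(double[][] members, double[] memberWeights)
{
    int dims = members[0].Length;
    double[] centroid = new double[dims];
    double totalWeight = 0;

    for (int i = 0; i < members.Length; i++)
    {
        totalWeight += memberWeights[i];

        // Accumulate the weighted coordinates
        for (int j = 0; j < dims; j++)
            centroid[j] += members[i][j] * memberWeights[i];
    }

    // Divide by the total weight (callers ensure it is positive)
    for (int j = 0; j < dims; j++)
        centroid[j] /= totalWeight;

    return centroid;
}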