/// <summary>
/// Initializes the first iteration by constructing each starting cluster from a point
/// picked uniformly at random from the given array of points.
/// </summary>
/// <param name="points">Data points to pick the starting means from.</param>
/// <param name="numClusters">Number of starting clusters to create.</param>
/// <returns>
/// An array of <paramref name="numClusters"/> clusters. Picks are independent, so the same
/// point can end up being used for more than one cluster.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="numClusters"/> is negative.</exception>
/// <exception cref="ArgumentNullException">
/// <paramref name="points"/> is null while at least one cluster is requested.
/// </exception>
/// <exception cref="ArgumentException">
/// <paramref name="points"/> is empty while at least one cluster is requested.
/// </exception>
protected virtual KMeansCluster[] pickStartingClusters(int[] points, int numClusters)
{
    if (numClusters < 0)
    {
        throw new ArgumentOutOfRangeException("numClusters", "The number of clusters cannot be negative.");
    }

    KMeansCluster[] currentClusters = new KMeansCluster[numClusters];
    if (numClusters == 0)
    {
        // nothing to pick; points is deliberately not inspected in this case
        return currentClusters;
    }

    if (points == null)
    {
        throw new ArgumentNullException("points");
    }
    if (points.Length == 0)
    {
        throw new ArgumentException("At least one point is required to pick starting clusters.", "points");
    }

    Random rnd = new Random();
    for (int i = 0; i < numClusters; i++)
    {
        // Next(n) returns an index in [0, n), i.e. a uniformly chosen element of points
        currentClusters[i] = new KMeansCluster(points[rnd.Next(points.Length)]);
    }

    return currentClusters;
}
/// <summary>
/// Initializes the first iteration by choosing the first mean point uniformly at random and
/// the following ones with the k-means++ seeding algorithm.
/// </summary>
/// <param name="points">Data points to pick the starting means from.</param>
/// <param name="numClusters">Number of starting clusters to create.</param>
/// <returns>An array of <paramref name="numClusters"/> clusters, each constructed from one of the given points.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="numClusters"/> is negative.</exception>
/// <exception cref="ArgumentNullException">
/// <paramref name="points"/> is null while at least one cluster is requested.
/// </exception>
/// <exception cref="ArgumentException">
/// <paramref name="points"/> is empty while at least one cluster is requested.
/// </exception>
/// <remarks>
/// The exact algorithm is as follows:
/// 1 - Choose one center uniformly at random from among the data points.
/// 2 - For each data point x, compute D(x), the distance between x and the nearest center that has already been chosen.
/// 3 - Choose one new data point at random as a new center, using a weighted probability distribution
///     where a point x is chosen with probability proportional to D(x)^2.
/// 4 - Repeat steps 2 and 3 until k centers have been chosen.
/// 5 - Now that the initial centers have been chosen, proceed using standard k-means clustering.
/// If every point is at distance zero from an already chosen center, the weighted distribution is
/// empty and the next center is picked uniformly at random instead.
/// </remarks>
protected override KMeansCluster[] pickStartingClusters(int[] points, int numClusters)
{
    if (numClusters < 0)
    {
        throw new ArgumentOutOfRangeException("numClusters", "The number of clusters cannot be negative.");
    }

    KMeansCluster[] currentClusters = new KMeansCluster[numClusters];
    if (numClusters == 0)
    {
        // nothing to pick; return an empty array instead of failing on currentClusters[0]
        return currentClusters;
    }

    if (points == null)
    {
        throw new ArgumentNullException("points");
    }
    if (points.Length == 0)
    {
        throw new ArgumentException("At least one point is required to pick starting clusters.", "points");
    }

    Random rnd = new Random();

    // first item is chosen uniformly at random
    currentClusters[0] = new KMeansCluster(points[rnd.Next(points.Length)]);

    // cumulativeWeights[p] = sum of squared min distances of points[0..p]; rewritten for every new cluster
    double[] cumulativeWeights = new double[points.Length];

    for (int i = 1; i < numClusters; i++)
    {
        // accumulated in double so that long runs of small weights are not lost to float rounding
        double totalWeight = 0.0;
        // index of the last point with a strictly positive weight, -1 if there is none
        int lastWeightedIdx = -1;

        for (int pointIdx = 0; pointIdx < points.Length; pointIdx++)
        {
            // find the minimum distance between the current point and all clusters chosen so far
            float minDistance = currentClusters[0].distanceFromMean(points[pointIdx]);
            for (int clusterIdx = 1; clusterIdx < i; clusterIdx++)
            {
                float currentDistance = currentClusters[clusterIdx].distanceFromMean(points[pointIdx]);
                if (currentDistance < minDistance)
                {
                    minDistance = currentDistance;
                }
            }

            double weight = (double)minDistance * minDistance;
            if (weight > 0.0)
            {
                lastWeightedIdx = pointIdx;
            }

            totalWeight += weight;
            cumulativeWeights[pointIdx] = totalWeight;
        }

        int chosenIdx;
        if (lastWeightedIdx < 0)
        {
            // all weights are zero: no weighted distribution to sample from, fall back to a uniform pick
            chosenIdx = rnd.Next(points.Length);
        }
        else
        {
            // pick a random position inside the distribution of squared min distances
            double target = rnd.NextDouble() * totalWeight;

            // Point p owns the interval [cumulativeWeights[p - 1], cumulativeWeights[p]). The strict
            // comparison means a zero-weight point (empty interval) is never selected. The default
            // covers the case where rounding pushes target up to totalWeight.
            chosenIdx = lastWeightedIdx;
            for (int pointIdx = 0; pointIdx < lastWeightedIdx; pointIdx++)
            {
                if (cumulativeWeights[pointIdx] > target)
                {
                    chosenIdx = pointIdx;
                    break;
                }
            }
        }

        // create the new cluster from the selected point
        currentClusters[i] = new KMeansCluster(points[chosenIdx]);
    }

    return currentClusters;
}
/// <summary>
/// Classifies an array of points into <paramref name="numClusters"/> clusters.
/// </summary>
/// <param name="points">Points to classify.</param>
/// <param name="numClusters">Number of clusters to divide the points into.</param>
/// <returns>The array of clusters produced by the last refinement pass.</returns>
public KMeansCluster[] Run(int[] points, int numClusters)
{
    KMeansCluster[] previous;
    KMeansCluster[] candidate = pickStartingClusters(points, numClusters);

    // Refine until stable: each pass starts from a fresh set of empty clusters, which
    // assingPointsToClusters fills using the clusters of the previous pass.
    do
    {
        previous = candidate;

        candidate = new KMeansCluster[numClusters];
        for (int slot = 0; slot < numClusters; slot++)
        {
            candidate[slot] = new KMeansCluster();
        }

        assingPointsToClusters(points, previous, candidate);
        dumpClustersToConsole(candidate);
    }
    while (!isStable(previous, candidate));

    return candidate;
}