/// <summary>
/// Recomputes the per-cluster means, item counts and centroid indices for the current clustering
/// assignment and returns the summed distance of every data row to the mean of its cluster.
/// </summary>
/// <param name="data">the data rows; row <c>i</c> is assigned to cluster <c>clustering[i]</c></param>
/// <param name="clustering">the cluster index assigned to each row of <paramref name="data"/></param>
/// <param name="means">one mean vector per cluster; overwritten in place with the freshly computed means</param>
/// <param name="centroidIdx">per cluster, receives the index of the row with the smallest distance to the cluster mean;
/// entries of clusters that have no rows assigned are left unchanged</param>
/// <param name="clusterCount">the number of clusters</param>
/// <param name="clusterItemCount">per cluster, receives the number of rows assigned to it (reset to zero before counting)</param>
/// <param name="calculateDistanceFunction">the distance function used to compare a row with a cluster mean</param>
/// <returns>the sum of the distances between each row and the mean of its assigned cluster</returns>
private static double CalculateClusteringInformation(double[][] data, int[] clustering, ref double[][] means, ref int[] centroidIdx, int clusterCount, ref int[] clusterItemCount, KMeansCalculateDistanceDelegate calculateDistanceFunction)
{
    // Reset both accumulators together: the means are rebuilt from sums and counts, so a stale
    // count left over from an earlier call would silently skew the averages.
    foreach (var mean in means)
    {
        for (int i = 0; i < mean.Length; i++)
        {
            mean[i] = 0;
        }
    }

    for (int k = 0; k < clusterItemCount.Length; k++)
    {
        clusterItemCount[k] = 0;
    }

    // Phase 1: sum up all rows per cluster and count the members of each cluster
    for (int i = 0; i < data.Length; i++)
    {
        var row = data[i];
        var clusterIdx = clustering[i]; // What cluster is data i assigned to

        ++clusterItemCount[clusterIdx];

        for (int j = 0; j < row.Length; j++)
        {
            means[clusterIdx][j] += row[j];
        }
    }

    // Phase 2: divide the sums by the member count to get the average
    for (int k = 0; k < means.Length; k++)
    {
        // An empty cluster keeps its all-zero mean; dividing by 1 avoids a division by zero
        int itemCount = clusterItemCount[k];
        int divisor = itemCount > 0 ? itemCount : 1;

        for (int a = 0; a < means[k].Length; a++)
        {
            means[k][a] /= divisor;
        }
    }

    // Find, per cluster, the row that lies closest to the mean (the centroid) and total up the distances
    double totalDistance = 0;
    double[] minDistances = new double[clusterCount];
    for (int k = 0; k < minDistances.Length; k++)
    {
        minDistances[k] = double.MaxValue;
    }

    for (int i = 0; i < data.Length; i++)
    {
        var clusterIdx = clustering[i]; // What cluster is data i assigned to
        var distance = calculateDistanceFunction(data[i], means[clusterIdx]);
        totalDistance += distance;

        if (distance < minDistances[clusterIdx])
        {
            minDistances[clusterIdx] = distance;
            centroidIdx[clusterIdx] = i;
        }
    }

    return totalDistance;
}
/// <summary>
/// Re-assigns every row of <paramref name="data"/> to the cluster whose centroid row (looked up in
/// <paramref name="data"/> through <paramref name="centroidIdx"/>) has the smallest distance to it.
/// </summary>
/// <param name="data">the data rows to assign</param>
/// <param name="clustering">the cluster index of each row; updated in place when a row moves to another cluster</param>
/// <param name="centroidIdx">per cluster, the index into <paramref name="data"/> of the row acting as centroid</param>
/// <param name="clusterCount">the number of clusters to consider</param>
/// <param name="calculateDistanceFunction">the distance function used to compare a row with a centroid row</param>
/// <returns>true if any clustering arrangement has changed, false if clustering did not change.</returns>
private static bool AssignClustering(double[][] data, int[] clustering, int[] centroidIdx, int clusterCount, KMeansCalculateDistanceDelegate calculateDistanceFunction)
{
    bool anyMoved = false;

    for (int row = 0; row < data.Length; row++)
    {
        double[] point = data[row];
        int nearestCluster = -1;
        double nearestDistance = double.MaxValue;

        for (int cluster = 0; cluster < clusterCount; cluster++)
        {
            double[] centroid = data[centroidIdx[cluster]];
            double candidate = calculateDistanceFunction(point, centroid);

            // Strict comparison: on a tie the lower cluster index wins
            if (candidate < nearestDistance)
            {
                nearestCluster = cluster;
                nearestDistance = candidate;
            }

            // todo: track outliers here as well and maintain an average and std calculation for the distances!
        }

        // Nothing to do when no cluster produced a usable distance or the row already sits in the nearest cluster
        if (nearestCluster == -1 || clustering[row] == nearestCluster)
        {
            continue;
        }

        clustering[row] = nearestCluster;
        anyMoved = true;
    }

    return anyMoved;
}
/// <summary>
/// Clusters the given item set into the desired number of clusters.
/// </summary>
/// <typeparam name="T">the type of the items being clustered</typeparam>
/// <param name="items">the list of data items that should be processed, this can be an array of primitive values such as <c>double[]</c>
/// or a class / struct that exposes properties using the <see cref="KMeansValueAttribute"/></param>
/// <param name="clusterCount">the desired number of clusters, must be at least 1</param>
/// <param name="maxIterations">the maximum number of iterations to perform</param>
/// <param name="calculateDistanceFunction">optional, custom distance function, if omitted then the euclidean distance will be used as default</param>
/// <param name="randomSeed">optional, a seed for the random generator that initially arranges the clustering of the nodes (specify the same value to ensure that the start ordering will be the same)</param>
/// <param name="initialCentroidIndices">optional, the initial centroid configuration (as indices into the <paramref name="items"/> array).
/// It is only honoured when its length equals <paramref name="clusterCount"/>; in that case the initial clustering is derived from these centroids.
/// The array is copied and never modified.
/// Experiment with this as the initial arrangements of the centroids has a huge impact on the final cluster arrangement.</param>
/// <returns>a result containing the items arranged into clusters as well as the centroids converged on and the total distance value for the cluster nodes.
/// When the iteration limit is reached before the clustering has settled, the means, centroids and total distance are those computed
/// in the last iteration, i.e. before the final re-assignment of the items.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="items"/> is null</exception>
/// <exception cref="System.ArgumentException"><paramref name="items"/> is empty</exception>
/// <exception cref="System.ArgumentOutOfRangeException"><paramref name="clusterCount"/> is less than 1</exception>
public static KMeansResults<T> Cluster<T>(T[] items, int clusterCount, int maxIterations, KMeansCalculateDistanceDelegate calculateDistanceFunction = null, int randomSeed = 0, int[] initialCentroidIndices = null)
{
    // Fail early with a descriptive exception instead of an index/null error deep inside the algorithm
    if (items == null)
    {
        throw new System.ArgumentNullException("items");
    }

    if (items.Length == 0)
    {
        throw new System.ArgumentException("At least one item is required for clustering.", "items");
    }

    if (clusterCount < 1)
    {
        throw new System.ArgumentOutOfRangeException("clusterCount", clusterCount, "The number of clusters must be at least 1.");
    }

    double[][] data = ConvertEntities(items);

    // Use the built in Euclidean distance calculation if no custom one is specified
    if (calculateDistanceFunction == null)
    {
        calculateDistanceFunction = CalculateDistance;
    }

    bool hasChanges = true;
    int iteration = 0;
    double totalDistance = 0;
    int numData = data.Length;
    int numAttributes = data[0].Length;

    // Create the initial clustering assignment
    int[] clustering = InitializeClustering(numData, clusterCount, randomSeed);

    // Create cluster means and centroids
    double[][] means = CreateMatrix(clusterCount, numAttributes);
    int[] centroidIdx = new int[clusterCount];
    int[] clusterItemCount = new int[clusterCount];

    // If we specify initial centroid indices then let's assign clustering based on those immediately
    if (initialCentroidIndices != null && initialCentroidIndices.Length == clusterCount)
    {
        // Work on a private copy: the centroid indices are rewritten on every iteration
        // and those writes must not leak into the caller's array.
        centroidIdx = (int[])initialCentroidIndices.Clone();
        AssignClustering(data, clustering, centroidIdx, clusterCount, calculateDistanceFunction);
    }

    // Perform the clustering
    while (hasChanges && iteration < maxIterations)
    {
        clusterItemCount = new int[clusterCount];
        totalDistance = CalculateClusteringInformation(data, clustering, ref means, ref centroidIdx, clusterCount, ref clusterItemCount, calculateDistanceFunction);
        hasChanges = AssignClustering(data, clustering, centroidIdx, clusterCount, calculateDistanceFunction);
        ++iteration;
    }

    // Size the result arrays from the FINAL assignment. The counts gathered inside the loop are taken
    // before the last re-assignment (and never at all when no iteration ran), so they can disagree
    // with the clustering that is actually returned.
    int[] finalItemCount = new int[clusterCount];
    for (int i = 0; i < clustering.Length; i++)
    {
        ++finalItemCount[clustering[i]];
    }

    // Create the final clusters
    T[][] clusters = new T[clusterCount][];
    for (int k = 0; k < clusters.Length; k++)
    {
        clusters[k] = new T[finalItemCount[k]];
    }

    // Distribute the items, keeping their original relative order within each cluster
    int[] clustersCurIdx = new int[clusterCount];
    for (int i = 0; i < clustering.Length; i++)
    {
        int clusterIdx = clustering[i];
        clusters[clusterIdx][clustersCurIdx[clusterIdx]] = items[i];
        ++clustersCurIdx[clusterIdx];
    }

    // Return the results
    return new KMeansResults<T>(clusters, means, centroidIdx, totalDistance);
}