Exemple #1
0
        /// <summary>
        /// Recomputes the per-cluster statistics for the current <paramref name="clustering"/>: the mean of every cluster,
        /// the number of items in every cluster and, per cluster, the index of the data row that lies closest to that mean.
        /// </summary>
        /// <param name="data">the data rows, one array of values per item</param>
        /// <param name="clustering">for each row of <paramref name="data"/>, the index of the cluster the row is assigned to</param>
        /// <param name="means">receives the mean of each cluster; the existing arrays are overwritten in place.
        /// A cluster without any rows ends up with an all-zero mean.</param>
        /// <param name="centroidIdx">receives, per cluster, the index of the row nearest to the cluster mean. An entry is only
        /// written when some row of that cluster has a distance below <c>double.MaxValue</c>, so the entry of an empty cluster keeps its previous value.</param>
        /// <param name="clusterCount">the number of clusters</param>
        /// <param name="clusterItemCount">receives the number of rows assigned to each cluster; zeroed and recounted on every call</param>
        /// <param name="calculateDistanceFunction">the function used to measure the distance between a row and a cluster mean</param>
        /// <returns>the sum of the distances between every row and the mean of the cluster it is assigned to</returns>
        private static double CalculateClusteringInformation(double[][] data, int[] clustering, ref double[][] means, ref int[] centroidIdx,
                                                             int clusterCount, ref int[] clusterItemCount, KMeansCalculateDistanceDelegate calculateDistanceFunction)
        {
            // Reset the means to zero for all clusters
            foreach (var mean in means)
            {
                for (int i = 0; i < mean.Length; i++)
                {
                    mean[i] = 0;
                }
            }

            // Reset the counts as well. They are incremented below, so leftovers from a previous pass
            // would inflate the divisor and silently skew every mean.
            for (int k = 0; k < clusterItemCount.Length; k++)
            {
                clusterItemCount[k] = 0;
            }

            // Calculate the means for each cluster
            // Do this in two phases, first sum them all up and then divide by the count in each cluster
            for (int i = 0; i < data.Length; i++)
            {
                var row        = data[i];
                var clusterIdx = clustering[i]; // What cluster is data i assigned to
                ++clusterItemCount[clusterIdx]; // Increment the count of the cluster that row i is assigned to
                for (int j = 0; j < row.Length; j++)
                {
                    means[clusterIdx][j] += row[j];
                }
            }

            // Now divide to get the average
            for (int k = 0; k < means.Length; k++)
            {
                // An empty cluster has an all-zero sum; dividing by 1 keeps it at zero instead of producing NaN
                int itemCount = clusterItemCount[k];
                int divisor   = itemCount > 0 ? itemCount : 1;
                for (int a = 0; a < means[k].Length; a++)
                {
                    means[k][a] /= divisor;
                }
            }

            double totalDistance = 0;

            // Calc the centroids: per cluster, the row with the smallest distance to the cluster mean
            double[] minDistances = new double[clusterCount];
            for (int k = 0; k < minDistances.Length; k++)
            {
                minDistances[k] = double.MaxValue;
            }

            for (int i = 0; i < data.Length; i++)
            {
                var clusterIdx = clustering[i]; // What cluster is data i assigned to
                var distance   = calculateDistanceFunction(data[i], means[clusterIdx]);
                totalDistance += distance;
                if (distance < minDistances[clusterIdx])
                {
                    minDistances[clusterIdx] = distance;
                    centroidIdx[clusterIdx]  = i;
                }
            }

            return totalDistance;
        }
Exemple #2
0
        /// <summary>
        /// Measures every item in <paramref name="data"/> against the data rows that <paramref name="centroidIdx"/> points at and
        /// moves the item into the cluster whose centroid row is nearest.
        /// </summary>
        /// <param name="data">the rows to assign, one array of values per item</param>
        /// <param name="clustering">the cluster index of each row; updated in place when a nearer centroid is found</param>
        /// <param name="centroidIdx">for each cluster, the index into <paramref name="data"/> of the row acting as its centroid</param>
        /// <param name="clusterCount">the number of clusters, i.e. how many entries of <paramref name="centroidIdx"/> are considered</param>
        /// <param name="calculateDistanceFunction">the function used to measure the distance between two rows</param>
        /// <returns>true when at least one item switched cluster, false when the arrangement stayed the same.</returns>
        private static bool AssignClustering(double[][] data, int[] clustering, int[] centroidIdx, int clusterCount, KMeansCalculateDistanceDelegate calculateDistanceFunction)
        {
            const int NoCluster = -1;
            bool anyMoved = false;

            for (int row = 0; row < data.Length; row++)
            {
                double[] item            = data[row];
                int      nearestCluster  = NoCluster;
                double   nearestDistance = double.MaxValue;

                // Find the cluster whose centroid row is closest to this item.
                // todo: track outliers here as well and maintain an average and std calculation for the distances!
                for (int cluster = 0; cluster < clusterCount; cluster++)
                {
                    double candidate = calculateDistanceFunction(item, data[centroidIdx[cluster]]);
                    if (candidate < nearestDistance)
                    {
                        nearestCluster  = cluster;
                        nearestDistance = candidate;
                    }
                }

                // Nothing to do when no centroid beat the starting distance or the item is already in the nearest cluster
                if (nearestCluster == NoCluster || clustering[row] == nearestCluster)
                {
                    continue;
                }

                clustering[row] = nearestCluster;
                anyMoved        = true;
            }

            return anyMoved;
        }
Exemple #3
0
        /// <summary>
        /// Clusters the given item set into the desired number of clusters.
        /// </summary>
        /// <typeparam name="T">the type of the items being clustered</typeparam>
        /// <param name="items">the list of data items that should be processed, this can be an array of primitive values such as <c>double[]</c>
        /// or a class struct that exposes properties using the <see cref="KMeansValueAttribute"/></param>
        /// <param name="clusterCount">the desired number of clusters</param>
        /// <param name="maxIterations">the maximum number of iterations to perform</param>
        /// <param name="calculateDistanceFunction">optional, custom distance function, if omitted then the built in <c>CalculateDistance</c> function is used</param>
        /// <param name="randomSeed">optional, a seed for the random generator that initially arranges the clustering of the nodes (specify the same value to ensure that the start ordering will be the same)</param>
        /// <param name="initialCentroidIndices">optional, the initial centroid configuration (as indices into the <paramref name="items"/> array).
        /// Only honoured when its length equals <paramref name="clusterCount"/>; the items are then assigned to the nearest of these centroids before
        /// the first iteration. The array itself is never modified.
        /// Experiment with this as the initial arrangements of the centroids has a huge impact on the final cluster arrangement.</param>
        /// <returns>a result containing the items arranged into clusters as well as the centroids converged on and the total distance value for the cluster nodes.
        /// The means, centroids and total distance are those of the last statistics pass that was run.</returns>
        public static KMeansResults <T> Cluster <T>(T[] items, int clusterCount, int maxIterations, KMeansCalculateDistanceDelegate calculateDistanceFunction = null, int randomSeed = 0, int[] initialCentroidIndices = null)
        {
            double[][] data = ConvertEntities(items);

            // Use the built in distance calculation if no custom one is specified
            if (calculateDistanceFunction == null)
            {
                calculateDistanceFunction = CalculateDistance;
            }

            bool   hasChanges    = true;
            int    iteration     = 0;
            double totalDistance = 0;
            int    numData       = data.Length;
            int    numAttributes = data[0].Length;

            // Create a random initial clustering assignment
            int[] clustering = InitializeClustering(numData, clusterCount, randomSeed);

            // Create cluster means and centroids
            double[][] means            = CreateMatrix(clusterCount, numAttributes);
            int[]      centroidIdx      = new int[clusterCount];
            int[]      clusterItemCount = new int[clusterCount];

            // If we specify initial centroid indices then let's assign clustering based on those immediately
            if (initialCentroidIndices != null && initialCentroidIndices.Length == clusterCount)
            {
                // Work on a copy: the centroid indices are overwritten on every iteration below and
                // the caller's array must not change underneath them.
                centroidIdx = (int[])initialCentroidIndices.Clone();
                AssignClustering(data, clustering, centroidIdx, clusterCount, calculateDistanceFunction);
            }

            // Perform the clustering
            while (hasChanges && iteration < maxIterations)
            {
                clusterItemCount = new int[clusterCount];
                totalDistance    = CalculateClusteringInformation(data, clustering, ref means, ref centroidIdx, clusterCount, ref clusterItemCount, calculateDistanceFunction);

                hasChanges = AssignClustering(data, clustering, centroidIdx, clusterCount, calculateDistanceFunction);
                ++iteration;
            }

            // Recount the cluster sizes from the final assignment. The counts gathered inside the loop predate the
            // last AssignClustering call, so they are stale whenever the loop stopped on maxIterations while items
            // were still moving (and all zero when the loop never ran), which would mis-size the arrays below.
            clusterItemCount = new int[clusterCount];
            for (int i = 0; i < clustering.Length; i++)
            {
                ++clusterItemCount[clustering[i]];
            }

            // Create the final clusters
            T[][] clusters = new T[clusterCount][];
            for (int k = 0; k < clusters.Length; k++)
            {
                clusters[k] = new T[clusterItemCount[k]];
            }

            // Fill each cluster in item order, tracking the next free slot per cluster
            int[] clustersCurIdx = new int[clusterCount];
            for (int i = 0; i < clustering.Length; i++)
            {
                int clusterIdx = clustering[i];
                clusters[clusterIdx][clustersCurIdx[clusterIdx]] = items[i];
                ++clustersCurIdx[clusterIdx];
            }

            // Return the results
            return new KMeansResults <T>(clusters, means, centroidIdx, totalDistance);
        }