Example #1
0
        /// <summary>
        /// Clusters <paramref name="data"/> into <paramref name="k"/> clusters (K-means++ seeding, earth mover's
        /// distance as the metric) and returns an array where the element at index i is the index of the
        /// cluster that <c>data[i]</c> was assigned to.
        /// </summary>
        /// <param name="data">Elements to cluster; one row per element.</param>
        /// <param name="k">Number of clusters.</param>
        /// <param name="nofRuns">
        /// Number of independent clustering runs. The assignment with the lowest average distance seen in any
        /// iteration of any run is the one returned.
        /// </param>
        /// <param name="_bestCenters">
        /// Optional previous assignment (cluster index per element). When supplied, the first run derives its
        /// centers from this assignment instead of calling <c>FindStartingCentersEMD</c>; later runs are
        /// seeded normally.
        /// </param>
        /// <returns>
        /// The cluster index for every element of <paramref name="data"/>. Only indices are returned; the
        /// cluster centers themselves are discarded. An empty array is returned for empty input.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        public int[] ClusterEMD(float[][] data, int k, int nofRuns, int[] _bestCenters = null)
        {
            // How many elements a worker processes between two progress/total-distance flushes.
            const int ProgressReportInterval = 100000;

            if (data == null)
            {
                throw new ArgumentNullException(nameof(data));
            }

            Console.WriteLine("K-means++ (EMD) clustering {0} elements into {1} clusters with {2} runs...", data.Length, k, nofRuns);

            if (data.Length == 0)
            {
                // Nothing to cluster. Without this guard the average distance below would be 0/0 = NaN,
                // and NaN never compares equal, so the convergence loop would not terminate.
                return new int[0];
            }

            int      filenameId = RandomGen.Next(0, 10000000);
            DateTime start      = DateTime.UtcNow;

            int[] bestCenters   = new int[data.Length];
            int[] recordCenters = new int[data.Length]; // we return indices only, the centers are discarded

            // load previous indices if passed
            bool skipInit = false;

            if (_bestCenters != null)
            {
                skipInit = true;
                Array.Copy(_bestCenters, bestCenters, _bestCenters.Length);
                Array.Copy(_bestCenters, recordCenters, _bestCenters.Length);
            }

            double recordDistance = double.MaxValue;

            for (int run = 0; run < nofRuns; ++run)
            {
                Console.WriteLine("K-means++ starting clustering...");
                double lastDistance    = double.MaxValue;
                bool   distanceChanged = true;

                float[,] centers;
                if (!skipInit)
                {
                    bestCenters = new int[data.Length];
                    centers     = FindStartingCentersEMD(data, k);
                }
                else
                {
                    // find new cluster centers // todo: it isnt theoretically sound to take the mean when using EMD distance metric
                    centers  = CalculateNewCenters(data, bestCenters, k);
                    skipInit = false; // only the very first run is seeded from the caller's assignment
                }
                float[,] centerCenterDistances = new float[k, k];

                while (distanceChanged)
                {
                    // calculate cluster-cluster distances to use triangle inequality
                    CalculateClusterDistancesEMD(centerCenterDistances, centers);

                    // find closest cluster for each element
                    long   sharedLoopCounter = 0;
                    double totalDistance     = 0;
                    using (var progress = new ProgressBar())
                    {
                        Parallel.For(0, Global.NOF_THREADS,
                                     i =>
                        {
                            // The work range of this worker does not change while it runs, so look it up once
                            // instead of on every loop-condition check.
                            var workRange = Util.GetWorkItemsIndices(data.Length, Global.NOF_THREADS, i);

                            double threadDistance = 0;
                            long iter             = 0;
                            for (int j = workRange.Item1; j < workRange.Item2; ++j)
                            {
                                // assume previous cluster was good, this is better for the triangle inequality
                                double distance = GetEarthMoverDistance(data, centers, j, bestCenters[j]);
                                int bestIndex   = bestCenters[j];
                                for (int m = 0; m < k; m++)  // go through centers
                                {
                                    // Triangle inequality: a center at least 2 * distance away from the current
                                    // best center cannot be closer to the element, so its EMD is not computed.
                                    if (centerCenterDistances[bestIndex, m] < 2 * distance && bestIndex != m)
                                    {
                                        double tempDistance = GetEarthMoverDistance(data, centers, j, m);
                                        if (tempDistance < distance)
                                        {
                                            distance  = tempDistance;
                                            bestIndex = m;
                                        }
                                    }
                                }
                                bestCenters[j]  = bestIndex;
                                threadDistance += distance;
                                iter++;

                                if (iter % ProgressReportInterval == 0)
                                {
                                    // Interlocked.Add returns the updated total, which gives a consistent
                                    // snapshot for both arguments of the progress report.
                                    long processed = Interlocked.Add(ref sharedLoopCounter, ProgressReportInterval);
                                    AddDouble(ref totalDistance, threadDistance);
                                    threadDistance = 0;
                                    progress.Report((double)processed / data.Length, processed);
                                }
                            }

                            // flush whatever was accumulated since the last periodic report
                            long finished = Interlocked.Add(ref sharedLoopCounter, iter % ProgressReportInterval);
                            progress.Report((double)finished / data.Length, finished);

                            AddDouble(ref totalDistance, threadDistance);
                        });
                    }

                    centers         = CalculateNewCenters(data, bestCenters, k);
                    totalDistance  /= data.Length;

                    // NOTE(review): convergence is detected by exact floating-point equality of two consecutive
                    // average distances; presumably this relies on the assignment becoming stable - confirm
                    // that the summation order in AddDouble cannot make the value flicker between iterations.
                    distanceChanged = totalDistance != lastDistance;

                    double diff = lastDistance - totalDistance;

                    // Update the record before saving, so the intermediate file always holds the best
                    // assignment found so far rather than the one from before this iteration.
                    if (totalDistance < recordDistance)
                    {
                        recordDistance = totalDistance;
                        Array.Copy(bestCenters, recordCenters, recordCenters.Length);
                    }

                    Console.WriteLine("Saving intermediate table to file...");

                    FileHandler.SaveToFile(recordCenters, "EMDTable_temp_" + filenameId + ".txt");

                    Console.WriteLine("Current average distance: {0} Improvement: {1}, {2}%", totalDistance, diff,
                                      100.0 * (1.0 - totalDistance / lastDistance));

                    lastDistance = totalDistance;
                }
            }
            Console.WriteLine("Best distance found: " + recordDistance);
            TimeSpan elapsed = DateTime.UtcNow - start;

            Console.WriteLine("K-means++ clustering (EMD) completed in {0}d {1}h {2}m {3}s", elapsed.Days, elapsed.Hours, elapsed.Minutes, elapsed.Seconds);

            return recordCenters;
        }