/// <summary>
/// Clusters <paramref name="data"/> into <paramref name="k"/> clusters using the Earth Mover's Distance
/// and returns, for every element, the index of the cluster it was assigned to. The element at index i
/// of the result is the cluster index of <c>data[i]</c>. Only the assignment is returned; the cluster
/// centers themselves are discarded.
/// </summary>
/// <param name="data">The elements to cluster, one inner array per element.</param>
/// <param name="k">The number of clusters.</param>
/// <param name="nofRuns">
/// How many independent clustering runs to perform. The assignment with the lowest average distance seen
/// in any iteration of any run is returned. With zero runs the result is the initial table (all zeros,
/// or a copy of <paramref name="_bestCenters"/> when that is supplied).
/// </param>
/// <param name="_bestCenters">
/// Optional assignment from a previous clustering. When supplied, the first run derives its starting
/// centers from this assignment instead of calling <c>FindStartingCentersEMD</c>; later runs start fresh.
/// </param>
/// <returns>The best cluster assignment found, one cluster index per element of <paramref name="data"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
public int[] ClusterEMD(float[][] data, int k, int nofRuns, int[] _bestCenters = null)
{
    if (data == null)
    {
        throw new ArgumentNullException("data");
    }

    // Each worker publishes its progress and partial distance sum once per this many elements.
    const int reportInterval = 100000;

    Console.WriteLine("K-means++ (EMD) clustering {0} elements into {1} clusters with {2} runs...", data.Length, k, nofRuns);

    int filenameId = RandomGen.Next(0, 10000000);
    DateTime start = DateTime.UtcNow;

    int[] bestCenters = new int[data.Length];
    int[] recordCenters = new int[data.Length]; // we return indices only, the centers are discarded

    // load previous indices if passed
    bool skipInit = false;
    if (_bestCenters != null)
    {
        skipInit = true;
        Array.Copy(_bestCenters, bestCenters, _bestCenters.Length);
        Array.Copy(_bestCenters, recordCenters, _bestCenters.Length);
    }

    double recordDistance = double.MaxValue;

    for (int run = 0; run < nofRuns; ++run)
    {
        float[,] centers = new float[k, data[0].Length];

        Console.WriteLine("K-means++ starting clustering...");
        double lastDistance = double.MaxValue;
        bool distanceChanged = true;

        if (!skipInit)
        {
            bestCenters = new int[data.Length];
            centers = FindStartingCentersEMD(data, k);
        }
        else
        {
            // find new cluster centers
            // todo: it isnt theoretically sound to take the mean when using EMD distance metric
            centers = CalculateNewCenters(data, bestCenters, k);
            // Only the first run is seeded from the supplied assignment.
            skipInit = false;
        }

        float[,] centerCenterDistances = new float[k, k];

        while (distanceChanged)
        {
            // calculate cluster-cluster distances to use triangle inequality
            CalculateClusterDistancesEMD(centerCenterDistances, centers);

            // find closest cluster for each element
            long sharedLoopCounter = 0;
            double totalDistance = 0;
            using (var progress = new ProgressBar())
            {
                Parallel.For(0, Global.NOF_THREADS, i =>
                {
                    double threadDistance = 0;
                    long iter = 0;

                    // Resolve this worker's slice of the data once instead of on every loop iteration.
                    var workRange = Util.GetWorkItemsIndices(data.Length, Global.NOF_THREADS, i);
                    int firstItem = workRange.Item1;
                    var endItem = workRange.Item2;

                    for (int j = firstItem; j < endItem; ++j)
                    {
                        // go through all data
                        // assume previous cluster was good, this is better for the triangle inequality
                        double distance = GetEarthMoverDistance(data, centers, j, bestCenters[j]);
                        int bestIndex = bestCenters[j];

                        for (int m = 0; m < k; m++) // go through centers
                        {
                            // Triangle inequality: a center at least twice as far from the current best
                            // center as the element is cannot be closer, so its distance is not computed.
                            if (centerCenterDistances[bestIndex, m] < 2 * distance && bestIndex != m)
                            {
                                double tempDistance = GetEarthMoverDistance(data, centers, j, m);
                                if (tempDistance < distance)
                                {
                                    distance = tempDistance;
                                    bestIndex = m;
                                }
                            }
                        }

                        bestCenters[j] = bestIndex;
                        threadDistance += distance;

                        iter++;
                        if (iter % reportInterval == 0)
                        {
                            Interlocked.Add(ref sharedLoopCounter, reportInterval);
                            AddDouble(ref totalDistance, threadDistance);
                            threadDistance = 0;
                            progress.Report((double)Interlocked.Read(ref sharedLoopCounter) / data.Length, sharedLoopCounter);
                        }
                    }

                    // Flush whatever was accumulated since the last full report interval.
                    Interlocked.Add(ref sharedLoopCounter, iter % reportInterval);
                    progress.Report((double)Interlocked.Read(ref sharedLoopCounter) / data.Length, sharedLoopCounter);
                    AddDouble(ref totalDistance, threadDistance);
                });
            }

            centers = CalculateNewCenters(data, bestCenters, k);
            totalDistance /= data.Length;

            // NOTE(review): convergence relies on exact floating-point equality of two averages that are
            // summed in a thread-dependent order; confirm this terminates reliably on large inputs.
            distanceChanged = !(totalDistance == lastDistance);
            double diff = lastDistance - totalDistance;

            // Refresh the record before checkpointing so the file holds the best assignment found so far,
            // including the one produced by this iteration.
            if (totalDistance < recordDistance)
            {
                recordDistance = totalDistance;
                Array.Copy(bestCenters, recordCenters, recordCenters.Length);
            }

            Console.WriteLine("Saving intermediate table to file...");
            FileHandler.SaveToFile(recordCenters, "EMDTable_temp_" + filenameId + ".txt");

            Console.WriteLine("Current average distance: {0} Improvement: {1}, {2}%", totalDistance, diff,
                100.0 * (1.0 - totalDistance / lastDistance));

            lastDistance = totalDistance;
        }
    }

    Console.WriteLine("Best distance found: " + recordDistance);
    TimeSpan elapsed = DateTime.UtcNow - start;
    Console.WriteLine("K-means++ clustering (EMD) completed in {0}d {1}h {2}m {3}s", elapsed.Days, elapsed.Hours, elapsed.Minutes, elapsed.Seconds);

    // print starting hand chart
    return recordCenters;
}