/// <summary>
/// Prepares the Points and Centroids blobs in Azure storage ahead of the first k-means iteration.
/// </summary>
/// <remarks>
/// If the job supplies no points blob, a new one is created and filled with N random points;
/// otherwise the supplied blob is used and N is overwritten with that blob's stream length.
/// The centroids blob is always created fresh and filled with K random centroids.
/// The whole operation runs inside <c>AzureHelper.LogPerformance</c>.
/// </remarks>
public void InitializeStorage()
{
    AzureHelper.LogPerformance(() =>
    {
        Random rng = new Random();
        string containerName = jobData.JobID.ToString();

        if (jobData.Points != null)
        {
            // A points blob was provided: adopt it and derive N from its length.
            Points = AzureHelper.GetBlob(jobData.Points);

            using (var pointReader = new ObjectStreamReader<ClusterPoint>(Points, ClusterPoint.FromByteArray, ClusterPoint.Size))
            {
                jobData.N = (int)pointReader.Length;
            }
        }
        else
        {
            // No points blob was provided: generate N random, unassigned points.
            Points = AzureHelper.CreateBlob(containerName, AzureHelper.PointsBlob);

            using (var pointWriter = new ObjectStreamWriter<ClusterPoint>(Points, p => p.ToByteArray(), ClusterPoint.Size))
            {
                for (int pointIndex = 0; pointIndex < jobData.N; pointIndex++)
                {
                    // Each coordinate is drawn from [-50, 50); x is drawn before y.
                    double x = rng.NextDouble() * 100 - 50;
                    double y = rng.NextDouble() * 100 - 50;

                    // Guid.Empty: the point is not assigned to any centroid yet.
                    pointWriter.Write(new ClusterPoint(x, y, Guid.Empty));
                }
            }
        }

        // Create the centroids blob and fill it with K random centroids.
        Centroids = AzureHelper.CreateBlob(containerName, AzureHelper.CentroidsBlob);

        using (var centroidWriter = new ObjectStreamWriter<Centroid>(Centroids, c => c.ToByteArray(), Centroid.Size))
        {
            for (int centroidIndex = 0; centroidIndex < jobData.K; centroidIndex++)
            {
                // Keep the ID / x / y evaluation order: the ID is generated before the coordinates are drawn.
                Guid centroidID = Guid.NewGuid();
                int x = rng.Next(-PointRange, PointRange);
                int y = rng.Next(-PointRange, PointRange);

                centroidWriter.Write(new Centroid(centroidID, x, y));
            }
        }
    },
    jobID: jobData.JobID.ToString(),
    methodName: "InitializeStorage",
    iterationCount: IterationCount,
    // Points and Centroids are assigned inside the action above, so their URIs are passed lazily
    // (presumably so they are read only after the action has run — confirm against LogPerformance).
    points: new Lazy<string>(() => Points.Uri.ToString()),
    centroids: new Lazy<string>(() => Centroids.Uri.ToString()),
    machineID: MachineID);
}
/// <summary>
/// Handles a worker's TaskResult from a running k-means job. Adds up the partial sums from the TaskResult
/// and, if no tasks remain running afterwards, starts the next iteration.
/// </summary>
/// <param name="taskResult">The result reported by a worker for one task of this job.</param>
/// <param name="workers">The workers passed on to <c>NextIteration</c> when this result completes the current iteration.</param>
/// <returns>
/// Always <c>true</c>. A result that does not match a running task (for example a duplicate queue message)
/// is ignored and <c>true</c> is still returned.
/// NOTE(review): earlier documentation promised <c>false</c> for already-counted results; the code has never
/// done that. How callers use the return value is not visible here — confirm before changing it.
/// </returns>
/// <exception cref="System.IO.InvalidDataException">
/// The blob referenced by <c>taskResult.PointsBlockListBlob</c> did not deserialize to a list of strings.
/// </exception>
public bool ProcessWorkerResponse(KMeansTaskResult taskResult, IEnumerable<Worker> workers)
{
    // Make sure we're actually still waiting for a result for this task.
    // If not, this might be a duplicate queue message.
    if (!TaskResultMatchesRunningTask(taskResult))
    {
        return true;
    }

    AzureHelper.LogPerformance(() =>
    {
        KMeansTask task = TaskResultWithTaskID(taskResult.TaskID);

        // The task has returned a response, which means that it has stopped running.
        task.Running = false;

        // Add the worker's updated points blocks.
        if (taskResult.PointsBlockListBlob != null)
        {
            using (Stream stream = AzureHelper.GetBlob(taskResult.PointsBlockListBlob).OpenRead())
            {
                // SECURITY(review): BinaryFormatter is unsafe on untrusted input. The blob is written by
                // this job's own workers, but consider migrating to a safer serializer.
                BinaryFormatter bf = new BinaryFormatter();
                List<string> pointsBlockList = bf.Deserialize(stream) as List<string>;

                // Fail with a descriptive error instead of letting AddRange(null) throw an
                // ArgumentNullException that hides the real cause (a malformed block-list blob).
                if (pointsBlockList == null)
                {
                    throw new InvalidDataException(
                        "Points block list blob '" + taskResult.PointsBlockListBlob +
                        "' did not contain a serialized List<string>.");
                }

                pointsBlockIDs.AddRange(pointsBlockList);
            }
        }

        // Copy out and integrate the data from the worker response.
        AddDataFromTaskResult(taskResult);
    },
    jobData.JobID.ToString(),
    methodName: "ProcessWorkerResponse",
    iterationCount: IterationCount,
    points: Points.Uri.ToString(),
    centroids: Centroids.Uri.ToString(),
    machineID: MachineID);

    // If this is the last worker to return, this iteration is done and we should start the next one.
    if (NoMoreRunningTasks())
    {
        NextIteration(workers);
    }

    return true;
}
/// <summary>
/// Assigns every ClusterPoint in this task's share of the points blob to its nearest centroid and records
/// the outcome in TaskResult: the blob holding the written block IDs, the per-centroid partial sums and
/// counts, and the number of points whose centroid changed.
/// </summary>
private void ProcessPoints()
{
    CloudBlockBlob pointsBlob = AzureHelper.GetBlob(task.Points);

    // The work is split into one sub-partition per logical processor. Every sub-partition writes only to
    // its own slot of the arrays below; the slots are combined after the parallel loop finishes.
    int slotCount = Environment.ProcessorCount;
    var partialSums = new PointsProcessedData[slotCount, task.K];
    var changedCounts = new int[slotCount];
    var blockIDsBySlot = new string[slotCount][];

    System.Threading.Tasks.Parallel.For(0, slotCount, slot =>
    {
        // Caching: the reader below reads this sub-partition from the Points blob or, if one exists, from
        // an on-disk cache file for the current iteration. That cache file was produced during the previous
        // iteration (if any) by the writer further down, which writes blocks to the Points blob and also to
        // the cache file for the *next* iteration (note the "task.Iteration + 1").
        //
        // A cache file cannot be written while it is being read, hence one cache file per iteration.
        //
        // TODO: Separate cache files waste disk space. Ideally only the cache files of the current and the
        // next iteration would be kept.
        using (var reader = new ObjectCachedStreamReader<ClusterPoint>(
            pointsBlob,
            ClusterPoint.FromByteArray,
            ClusterPoint.Size,
            AzureHelper.GetLocalResourceRootPath("cache"),
            task.JobID.ToString(),
            task.PartitionNumber,
            task.M,
            subPartitionNumber: slot,
            subTotalPartitions: slotCount,
            iterationNumber: task.Iteration))
        {
            // Log cache hit or miss.
            string cachePath = reader.CacheFilePath;
            string cacheOutcome = reader.UsingCache ? "hit" : "miss";
            System.Diagnostics.Trace.TraceInformation("[WorkerRole] Cache {1} for file {0}", cachePath, cacheOutcome);

            // Process the points, writing the updated ones to the blob and to next iteration's cache file.
            using (var writer = new ObjectCachedBlockWriter<ClusterPoint>(
                pointsBlob,
                p => p.ToByteArray(),
                ClusterPoint.Size,
                AzureHelper.GetCachedFilePath(
                    AzureHelper.GetLocalResourceRootPath("cache"),
                    task.JobID.ToString(),
                    task.PartitionNumber,
                    task.M,
                    slot,
                    task.Iteration + 1)))
            {
                foreach (var clusterPoint in reader)
                {
                    // Assign the point to the nearest centroid.
                    Guid previousCentroidID = clusterPoint.CentroidID;
                    int nearestIndex = centroids.MinIndex(candidate => Point.Distance(clusterPoint, candidate));
                    Guid assignedCentroidID = centroids[nearestIndex].ID;
                    clusterPoint.CentroidID = assignedCentroidID;

                    // Write the updated point.
                    writer.Write(clusterPoint);

                    // Count the point if its assignment changed.
                    if (previousCentroidID != assignedCentroidID)
                    {
                        changedCounts[slot]++;
                    }

                    // Fold the point into this slot's running totals for the chosen centroid,
                    // creating the accumulator on first use.
                    PointsProcessedData accumulator = partialSums[slot, nearestIndex];
                    if (accumulator == null)
                    {
                        accumulator = new PointsProcessedData();
                        partialSums[slot, nearestIndex] = accumulator;
                    }

                    accumulator.PartialPointSum += clusterPoint;
                    accumulator.NumPointsProcessed++;
                }

                // Flush the writer, then collect the IDs of the blocks it wrote.
                writer.FlushBlock();
                blockIDsBySlot[slot] = writer.BlockList.ToArray();
            }
        }
    });

    // Merge the per-slot block lists (in slot order), write the full list to a new blob,
    // and reference that blob from TaskResult.
    List<string> blockIDs = new List<string>();
    for (int slot = 0; slot < blockIDsBySlot.Length; slot++)
    {
        blockIDs.AddRange(blockIDsBySlot[slot]);
    }

    CloudBlob blockIDsBlob = AzureHelper.CreateBlob(task.JobID.ToString(), Guid.NewGuid().ToString());
    using (Stream blockListStream = blockIDsBlob.OpenWrite())
    {
        BinaryFormatter formatter = new BinaryFormatter();
        formatter.Serialize(blockListStream, blockIDs);
    }

    TaskResult.PointsBlockListBlob = blockIDsBlob.Uri;

    // Total up the per-slot sums and counts for each centroid. Every centroid gets an entry,
    // even if no point was assigned to it.
    TaskResult.PointsProcessedDataByCentroid = new Dictionary<Guid, PointsProcessedData>();
    for (int centroidIndex = 0; centroidIndex < task.K; ++centroidIndex)
    {
        Guid centroidID = centroids[centroidIndex].ID;
        PointsProcessedData total = new PointsProcessedData();
        TaskResult.PointsProcessedDataByCentroid[centroidID] = total;

        for (int slot = 0; slot < slotCount; ++slot)
        {
            PointsProcessedData partial = partialSums[slot, centroidIndex];
            if (partial == null)
            {
                continue;
            }

            total.PartialPointSum += partial.PartialPointSum;
            total.NumPointsProcessed += partial.NumPointsProcessed;
        }
    }

    // Total up the per-slot changed-point counts.
    TaskResult.NumPointsChanged = 0;
    for (int slot = 0; slot < changedCounts.Length; slot++)
    {
        TaskResult.NumPointsChanged += changedCounts[slot];
    }
}
/// <summary>
/// Reads all centroids from the blob referenced by task.Centroids and stores them in the centroids list.
/// </summary>
private void InitializeCentroids()
{
    var centroidsBlob = AzureHelper.GetBlob(task.Centroids);

    using (var reader = new ObjectStreamReader<Centroid>(centroidsBlob, Centroid.FromByteArray, Centroid.Size))
    {
        // Materialize the list while the reader is still open.
        centroids = reader.ToList();
    }
}