/// <summary>
/// Flushes the current block to Azure storage and adds the new Block ID to BlockList.
/// </summary>
public void FlushBlock()
{
    if (blockStream.Length == 0)
    {
        return;
    }

    string blockID = AzureHelper.GenerateRandomBlockID();

    blockStream.Position = 0;
    blob.PutBlock(blockID, blockStream, null);
    _blockList.Add(blockID);

    blockStream.Close();
    blockStream = new MemoryStream();
}
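// AzureHelper.GenerateRandomBlockID() is not shown in this listing. Below is a minimal sketch of one
// plausible implementation, offered only as an illustration: Azure block blobs require every block ID to be
// a Base64-encoded string, and IDs of equal length are easiest to manage, so a Base64-encoded GUID is a
// common choice. The real helper may differ.
public static string GenerateRandomBlockID()
{
    // 16 random bytes -> a constant-length 24-character Base64 string, unique per block
    return Convert.ToBase64String(Guid.NewGuid().ToByteArray());
}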
/// <summary>
/// Sets up the Azure storage (Points and Centroids) for the first k-means iteration.
/// </summary>
public void InitializeStorage()
{
    AzureHelper.LogPerformance(() =>
    {
        Random random = new Random();

        if (jobData.Points == null)
        {
            // Initialize the points blob with N random ClusterPoints
            Points = AzureHelper.CreateBlob(jobData.JobID.ToString(), AzureHelper.PointsBlob);
            using (ObjectStreamWriter<ClusterPoint> stream = new ObjectStreamWriter<ClusterPoint>(Points, point => point.ToByteArray(), ClusterPoint.Size))
            {
                for (int i = 0; i < jobData.N; i++)
                {
                    stream.Write(new ClusterPoint(
                        random.NextDouble() * 100 - 50,
                        random.NextDouble() * 100 - 50,
                        Guid.Empty));
                }
            }
        }
        else
        {
            // Use the given points blob
            Points = AzureHelper.GetBlob(jobData.Points);

            // Initialize N based on that
            using (ObjectStreamReader<ClusterPoint> stream = new ObjectStreamReader<ClusterPoint>(Points, ClusterPoint.FromByteArray, ClusterPoint.Size))
            {
                jobData.N = (int)stream.Length;
            }
        }

        // Initialize the centroids blob with K random Centroids
        Centroids = AzureHelper.CreateBlob(jobData.JobID.ToString(), AzureHelper.CentroidsBlob);
        using (ObjectStreamWriter<Centroid> stream = new ObjectStreamWriter<Centroid>(Centroids, point => point.ToByteArray(), Centroid.Size))
        {
            for (int i = 0; i < jobData.K; i++)
            {
                stream.Write(new Centroid(
                    Guid.NewGuid(),
                    random.Next(-PointRange, PointRange),
                    random.Next(-PointRange, PointRange)));
            }
        }
    },
    jobID: jobData.JobID.ToString(),
    methodName: "InitializeStorage",
    iterationCount: IterationCount,
    points: new Lazy<string>(() => Points.Uri.ToString()),
    centroids: new Lazy<string>(() => Centroids.Uri.ToString()),
    machineID: MachineID);
}
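// ObjectStreamWriter<T> and ObjectStreamReader<T> are project-specific helpers that are not part of this
// listing. The sketch below only illustrates the pattern they appear to implement -- fixed-size records
// serialized sequentially into a blob stream -- and is an assumption, not the project's actual class.
public class ObjectStreamWriterSketch<T> : IDisposable
{
    private readonly Stream stream;
    private readonly Func<T, byte[]> serialize;
    private readonly int recordSize;

    public ObjectStreamWriterSketch(CloudBlob blob, Func<T, byte[]> serialize, int recordSize)
    {
        this.stream = blob.OpenWrite();   // write records straight into the blob
        this.serialize = serialize;
        this.recordSize = recordSize;
    }

    public void Write(T item)
    {
        byte[] bytes = serialize(item);   // expected to produce exactly recordSize bytes
        stream.Write(bytes, 0, recordSize);
    }

    public void Dispose()
    {
        stream.Close();                   // closing the blob stream commits the data
    }
}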
private void RecalculateCentroids()
{
    AzureHelper.LogPerformance(() =>
    {
        // Initialize the output blob
        CloudBlob writeBlob = AzureHelper.CreateBlob(jobData.JobID.ToString(), Guid.NewGuid().ToString());

        // Do the mapping and write the new blob
        using (ObjectStreamReader<Centroid> stream = new ObjectStreamReader<Centroid>(Centroids, Centroid.FromByteArray, Centroid.Size))
        {
            var newCentroids = stream.Select(c =>
            {
                Point newCentroidPoint;
                if (totalPointsProcessedDataByCentroid.ContainsKey(c.ID)
                    && totalPointsProcessedDataByCentroid[c.ID].NumPointsProcessed != 0)
                {
                    newCentroidPoint = totalPointsProcessedDataByCentroid[c.ID].PartialPointSum
                        / (double)totalPointsProcessedDataByCentroid[c.ID].NumPointsProcessed;
                }
                else
                {
                    newCentroidPoint = new Point();
                }

                c.X = newCentroidPoint.X;
                c.Y = newCentroidPoint.Y;

                return c;
            });

            using (ObjectStreamWriter<Centroid> writeStream = new ObjectStreamWriter<Centroid>(writeBlob, point => point.ToByteArray(), Centroid.Size))
            {
                foreach (Centroid c in newCentroids)
                {
                    writeStream.Write(c);
                }
            }
        }

        // Copy the contents of the new blob back into the old blob
        Centroids.CopyFromBlob(writeBlob);

        System.Diagnostics.Trace.TraceInformation("[ServerRole] Finished RecalculateCentroids(). Total points changed: {0}", TotalNumPointsChanged);

        ResetPointChangedCounts();
    },
    jobData.JobID.ToString(),
    methodName: "RecalculateCentroids",
    iterationCount: IterationCount,
    points: Points.Uri.ToString(),
    centroids: Centroids.Uri.ToString(),
    machineID: MachineID);
}
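// The averaging above (PartialPointSum / NumPointsProcessed) relies on arithmetic operators on Point and on
// the PointsProcessedData accumulator, neither of which appears in this listing. The sketch below shows the
// minimal shape such types would need; the "Sketch" suffix marks these as assumptions, not the project's
// actual definitions (in the real code, ClusterPoint presumably derives from Point).
public class PointSketch
{
    public double X, Y;

    // Component-wise sum, used when accumulating PartialPointSum += point
    public static PointSketch operator +(PointSketch a, PointSketch b)
    {
        return new PointSketch { X = a.X + b.X, Y = a.Y + b.Y };
    }

    // New centroid = sum of the assigned points divided by their count
    public static PointSketch operator /(PointSketch a, double divisor)
    {
        return new PointSketch { X = a.X / divisor, Y = a.Y / divisor };
    }
}

public class PointsProcessedDataSketch
{
    public PointSketch PartialPointSum = new PointSketch();   // running sum of the points assigned to one centroid
    public int NumPointsProcessed;                            // number of points folded into that sum
}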
/// <summary>
/// Enqueues M messages into a queue. Each message is an instruction to a worker to process a partition of the k-means data.
/// </summary>
public void EnqueueTasks(IEnumerable<Worker> workers)
{
    AzureHelper.LogPerformance(() =>
    {
        int workerNumber = 0;

        // Loop through the known workers and give them each a chunk of the points.
        // Note: This loop must execute in the same order every time, otherwise caching will not work --
        // the workers would get a different workerNumber each time and therefore a different chunk of the points.
        // We use OrderBy on the PartitionKey to guarantee stable ordering.
        foreach (Worker worker in workers.OrderBy(worker => worker.PartitionKey))
        {
            KMeansTaskData taskData = new KMeansTaskData(jobData, Guid.NewGuid(), workerNumber++, workers.Count(), Centroids.Uri, DateTime.UtcNow, IterationCount, worker.BuddyGroupID);
            taskData.Points = Points.Uri;

            tasks.Add(new KMeansTask(taskData));
            AzureHelper.EnqueueMessage(AzureHelper.GetWorkerRequestQueue(worker.PartitionKey), taskData, true);
        }
    },
    jobData.JobID.ToString(),
    methodName: "EnqueueTasks",
    iterationCount: IterationCount,
    points: Points.Uri.ToString(),
    centroids: Centroids.Uri.ToString(),
    machineID: MachineID);
}
/// <summary>
/// Checks whether to move into the next iteration, and performs the appropriate actions to make it happen.
/// </summary>
private void NextIteration(IEnumerable<Worker> workers)
{
    System.Diagnostics.Trace.TraceInformation("[ServerRole] NextIteration() JobID={0}", jobData.JobID);

    CommitPointsBlob();

    IterationCount++;
    if (!string.IsNullOrEmpty(jobData.ProgressEmail))
    {
        AzureHelper.SendStatusEmail(jobData.ProgressEmail, jobData.JobID, IterationCount);
    }

    if (NumPointsChangedAboveThreshold() && !MaxIterationCountExceeded())
    {
        RecalculateCentroids();
        EnqueueTasks(workers);
    }
    else
    {
        ReturnResults();
    }
}
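// NumPointsChangedAboveThreshold() and MaxIterationCountExceeded() are the two stopping conditions for the
// k-means loop, but their implementations are not included in this listing. The sketch below shows the kind
// of checks they would perform; the jobData.PointsChangedThreshold and jobData.MaxIterationCount fields are
// assumptions introduced only for illustration.
private bool NumPointsChangedAboveThreshold()
{
    // Keep iterating while more points switched centroids in the last iteration than the configured threshold
    return TotalNumPointsChanged > jobData.PointsChangedThreshold;
}

private bool MaxIterationCountExceeded()
{
    // Stop once the configured iteration cap has been reached (a cap of 0 meaning "no limit")
    return jobData.MaxIterationCount > 0 && IterationCount >= jobData.MaxIterationCount;
}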
/// <summary>
/// Handles a worker's TaskResult from a running k-means job. Adds up the partial sums from the TaskResult.
/// </summary>
/// <param name="taskResult">The worker's partial result for one task.</param>
/// <param name="workers">The known workers, used to start the next iteration if this was the last outstanding task.</param>
/// <returns>False if the given task result has already been counted, true otherwise.</returns>
public bool ProcessWorkerResponse(KMeansTaskResult taskResult, IEnumerable<Worker> workers)
{
    // Make sure we're actually still waiting for a result for this task.
    // If not, this might be a duplicate queue message.
    if (!TaskResultMatchesRunningTask(taskResult))
    {
        return true;
    }

    AzureHelper.LogPerformance(() =>
    {
        KMeansTask task = TaskResultWithTaskID(taskResult.TaskID);
        task.Running = false; // The task has returned a response, which means that it has stopped running

        // Add the worker's updated points blocks
        if (taskResult.PointsBlockListBlob != null)
        {
            using (Stream stream = AzureHelper.GetBlob(taskResult.PointsBlockListBlob).OpenRead())
            {
                BinaryFormatter bf = new BinaryFormatter();
                List<string> pointsBlockList = bf.Deserialize(stream) as List<string>;
                pointsBlockIDs.AddRange(pointsBlockList);
            }
        }

        // Copy out and integrate the data from the worker response
        AddDataFromTaskResult(taskResult);
    },
    jobData.JobID.ToString(),
    methodName: "ProcessWorkerResponse",
    iterationCount: IterationCount,
    points: Points.Uri.ToString(),
    centroids: Centroids.Uri.ToString(),
    machineID: MachineID);

    // If this is the last worker to return, this iteration is done and we should start the next one
    if (NoMoreRunningTasks())
    {
        NextIteration(workers);
    }

    return true;
}
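// TaskResultMatchesRunningTask, TaskResultWithTaskID, and NoMoreRunningTasks are not shown in this listing.
// The sketches below give one plausible implementation over the "tasks" list populated in EnqueueTasks; the
// KMeansTask.TaskData property name is an assumption made for illustration.
private KMeansTask TaskResultWithTaskID(Guid taskID)
{
    return tasks.FirstOrDefault(t => t.TaskData.TaskID == taskID);
}

private bool TaskResultMatchesRunningTask(KMeansTaskResult taskResult)
{
    // A result only counts if we still have a matching task that is marked as running
    KMeansTask task = TaskResultWithTaskID(taskResult.TaskID);
    return task != null && task.Running;
}

private bool NoMoreRunningTasks()
{
    return !tasks.Any(t => t.Running);
}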
/// <summary>
/// Assigns each ClusterPoint in the "points" blob to the nearest centroid, recording results into TaskResult.
/// </summary>
private void ProcessPoints()
{
    CloudBlockBlob pointsBlob = AzureHelper.GetBlob(task.Points);

    // Do the mapping and write the new blob
    int numThreads = Environment.ProcessorCount;
    PointsProcessedData[,] pointSumsPerCentroidPerThread = new PointsProcessedData[numThreads, task.K];
    int[] pointsChangedPerThread = new int[numThreads];
    string[][] blockIDsPerThread = new string[numThreads][];

    System.Threading.Tasks.Parallel.For(0, numThreads, threadID =>
    {
        // A note about caching: The outer block, using (ObjectCachedStreamReader...), reads from the appropriate
        // partition in the Points blob, or, if it exists, a cache file on disk corresponding to the current iteration.
        // In the previous iteration (if any), this cache file was generated by the inner block, using
        // (ObjectCachedBlockWriter...), which writes to a set of blocks on the Points blob, as well as to the cache
        // file on disk corresponding to the *next* iteration. Note the "task.Iteration + 1" in that using statement.
        //
        // The reason why we have separate cache files for each iteration is that we can't write to a cache file
        // while we're reading it. So instead we have to write to a separate file.
        //
        // TODO: Having separate cache files is wasteful of disk space. Ideally we would only keep cache files from
        // the current iteration and the next one.
        using (ObjectCachedStreamReader<ClusterPoint> stream = new ObjectCachedStreamReader<ClusterPoint>(pointsBlob, ClusterPoint.FromByteArray, ClusterPoint.Size,
            AzureHelper.GetLocalResourceRootPath("cache"), task.JobID.ToString(), task.PartitionNumber, task.M,
            subPartitionNumber: threadID, subTotalPartitions: numThreads, iterationNumber: task.Iteration))
        {
            // Log cache hit or miss
            System.Diagnostics.Trace.TraceInformation("[WorkerRole] Cache {1} for file {0}", stream.CacheFilePath, stream.UsingCache ? "hit" : "miss");

            // Process the points
            using (ObjectCachedBlockWriter<ClusterPoint> writeStream = new ObjectCachedBlockWriter<ClusterPoint>(pointsBlob, point => point.ToByteArray(), ClusterPoint.Size,
                AzureHelper.GetCachedFilePath(AzureHelper.GetLocalResourceRootPath("cache"), task.JobID.ToString(), task.PartitionNumber, task.M, threadID, task.Iteration + 1)))
            {
                foreach (var point in stream)
                {
                    // Assign the point to the nearest centroid
                    Guid oldCentroidID = point.CentroidID;
                    int closestCentroidIndex = centroids.MinIndex(centroid => Point.Distance(point, centroid));
                    Guid newCentroidID = point.CentroidID = centroids[closestCentroidIndex].ID;

                    // Write the updated point to the writeStream
                    writeStream.Write(point);

                    // Update the number of points changed
                    if (oldCentroidID != newCentroidID)
                    {
                        pointsChangedPerThread[threadID]++;
                    }

                    // Update the point sums
                    if (pointSumsPerCentroidPerThread[threadID, closestCentroidIndex] == null)
                    {
                        pointSumsPerCentroidPerThread[threadID, closestCentroidIndex] = new PointsProcessedData();
                    }
                    pointSumsPerCentroidPerThread[threadID, closestCentroidIndex].PartialPointSum += point;
                    pointSumsPerCentroidPerThread[threadID, closestCentroidIndex].NumPointsProcessed++;
                }

                // Collect the block IDs from writeStream
                writeStream.FlushBlock();
                blockIDsPerThread[threadID] = writeStream.BlockList.ToArray();
            }
        }
    });

    // Combine the per-thread block lists and write the full block list to a blob,
    // then include that blob's URI as part of TaskResult
    List<string> blockIDs = new List<string>();
    foreach (string[] blockIDsFromThread in blockIDsPerThread)
    {
        blockIDs.AddRange(blockIDsFromThread);
    }

    CloudBlob blockIDsBlob = AzureHelper.CreateBlob(task.JobID.ToString(), Guid.NewGuid().ToString());
    using (Stream stream = blockIDsBlob.OpenWrite())
    {
        BinaryFormatter bf = new BinaryFormatter();
        bf.Serialize(stream, blockIDs);
    }
    TaskResult.PointsBlockListBlob = blockIDsBlob.Uri;

    // Total up the per-thread pointSumsPerCentroid
    TaskResult.PointsProcessedDataByCentroid = new Dictionary<Guid, PointsProcessedData>();
    for (int i = 0; i < task.K; ++i)
    {
        Guid centroidID = centroids[i].ID;
        TaskResult.PointsProcessedDataByCentroid[centroidID] = new PointsProcessedData();

        for (int j = 0; j < numThreads; ++j)
        {
            if (pointSumsPerCentroidPerThread[j, i] != null)
            {
                TaskResult.PointsProcessedDataByCentroid[centroidID].PartialPointSum += pointSumsPerCentroidPerThread[j, i].PartialPointSum;
                TaskResult.PointsProcessedDataByCentroid[centroidID].NumPointsProcessed += pointSumsPerCentroidPerThread[j, i].NumPointsProcessed;
            }
        }
    }

    // Total up the per-thread numPointsChanged
    TaskResult.NumPointsChanged = 0;
    foreach (int threadPointsChanged in pointsChangedPerThread)
    {
        TaskResult.NumPointsChanged += threadPointsChanged;
    }
}
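// ProcessPoints uses centroids.MinIndex(...) above; that extension method is not included in this listing.
// A minimal sketch of such a helper follows (an illustration, not necessarily the project's implementation):
public static class EnumerableExtensionsSketch
{
    /// <summary>Returns the index of the element with the smallest key, or -1 if the list is empty.</summary>
    public static int MinIndex<T>(this IList<T> list, Func<T, double> keySelector)
    {
        int minIndex = -1;
        double minKey = double.PositiveInfinity;

        for (int i = 0; i < list.Count; i++)
        {
            double key = keySelector(list[i]);
            if (key < minKey)
            {
                minKey = key;
                minIndex = i;
            }
        }

        return minIndex;
    }
}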
private void InitializeCentroids()
{
    using (ObjectStreamReader<Centroid> stream = new ObjectStreamReader<Centroid>(AzureHelper.GetBlob(task.Centroids), Centroid.FromByteArray, Centroid.Size))
    {
        centroids = stream.ToList();
    }
}
private void CommitPointsBlob()
{
    AzureHelper.CommitBlockBlob(Points, pointsBlockIDs);
    pointsBlockIDs.Clear();
}
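// AzureHelper.CommitBlockBlob is not part of this listing. Because the workers upload uncommitted blocks with
// PutBlock (see FlushBlock above) and the server collects the block IDs from their TaskResults, committing the
// points blob should reduce to a single PutBlockList call. A hedged sketch under that assumption follows
// (the CloudBlockBlob parameter type is itself assumed):
public static void CommitBlockBlob(CloudBlockBlob blob, IEnumerable<string> blockIDs)
{
    // PutBlockList atomically replaces the blob's committed content with the listed blocks, in the given order
    blob.PutBlockList(blockIDs);
}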