/// <summary>
/// Adds the given partial results into the running total for the given centroid,
/// creating the entry if this is the first data recorded for that centroid.
/// </summary>
private void AddPointsProcessedDataForCentroid(Guid centroidID, PointsProcessedData data)
{
    if (!totalPointsProcessedDataByCentroid.ContainsKey(centroidID))
    {
        totalPointsProcessedDataByCentroid[centroidID] = new PointsProcessedData();
    }

    totalPointsProcessedDataByCentroid[centroidID] += data;
}
public void op_AdditionTest()
{
    PointsProcessedData d1 = new PointsProcessedData();
    PointsProcessedData d2 = new PointsProcessedData
    {
        NumPointsProcessed = 5,
        PartialPointSum = new Point(5, 5)
    };
    PointsProcessedData expected = new PointsProcessedData
    {
        NumPointsProcessed = 5,
        PartialPointSum = new Point(5, 5)
    };

    // Adding a freshly constructed (empty) PointsProcessedData should leave d2's totals unchanged
    PointsProcessedData actual = d1 + d2;

    Assert.AreEqual(expected.NumPointsProcessed, actual.NumPointsProcessed);
    Assert.AreEqual(expected.PartialPointSum.X, actual.PartialPointSum.X);
    Assert.AreEqual(expected.PartialPointSum.Y, actual.PartialPointSum.Y);
}
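// For context, a minimal sketch of what PointsProcessedData plausibly looks like, inferred from
// the += in AddPointsProcessedDataForCentroid and the fields exercised by op_AdditionTest above.
// The real class is defined elsewhere in the codebase; the field types, zero-initialization, and
// operator body shown here are assumptions, not the actual implementation.
public class PointsProcessedData
{
    public int NumPointsProcessed { get; set; }
    public Point PartialPointSum { get; set; }

    public PointsProcessedData()
    {
        // Assumed: a new instance represents "no points processed yet"
        PartialPointSum = new Point(0, 0);
    }

    public static PointsProcessedData operator +(PointsProcessedData a, PointsProcessedData b)
    {
        // Combining two partial results adds the counts and the coordinate sums, so the eventual
        // centroid average (PartialPointSum divided by NumPointsProcessed) stays correct. This is
        // what makes the per-thread and per-task totals in the surrounding code safe to merge.
        return new PointsProcessedData
        {
            NumPointsProcessed = a.NumPointsProcessed + b.NumPointsProcessed,
            PartialPointSum = new Point(
                a.PartialPointSum.X + b.PartialPointSum.X,
                a.PartialPointSum.Y + b.PartialPointSum.Y)
        };
    }
}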
/// <summary>
/// Assigns each ClusterPoint in the "points" blob to the nearest centroid, recording results into TaskResult.
/// </summary>
private void ProcessPoints()
{
    CloudBlockBlob pointsBlob = AzureHelper.GetBlob(task.Points);

    // Do the mapping and write the new blob
    int numThreads = Environment.ProcessorCount;
    PointsProcessedData[,] pointSumsPerCentroidPerThread = new PointsProcessedData[numThreads, task.K];
    int[] pointsChangedPerThread = new int[numThreads];
    string[][] blockIDsPerThread = new string[numThreads][];

    System.Threading.Tasks.Parallel.For(0, numThreads, threadID =>
    {
        // A note about caching: The outer block, using (ObjectCachedStreamReader...), reads from the
        // appropriate partition in the Points blob or, if it exists, from a cache file on disk
        // corresponding to the current iteration.
        //
        // In the previous iteration (if any), that cache file was generated by the inner block,
        // using (ObjectCachedBlockWriter...), which writes to a set of blocks on the Points blob, as
        // well as to the cache file on disk corresponding to the *next* iteration. Note the
        // "task.Iteration + 1" in that using statement.
        //
        // We keep separate cache files for each iteration because we can't write to a cache file
        // while we're reading from it, so each iteration has to write to a separate file.
        //
        // TODO: Having separate cache files is wasteful of disk space. Ideally we would only keep
        // the cache files from the current iteration and the next one.
        using (ObjectCachedStreamReader<ClusterPoint> stream = new ObjectCachedStreamReader<ClusterPoint>(
            pointsBlob, ClusterPoint.FromByteArray, ClusterPoint.Size,
            AzureHelper.GetLocalResourceRootPath("cache"), task.JobID.ToString(),
            task.PartitionNumber, task.M,
            subPartitionNumber: threadID, subTotalPartitions: numThreads,
            iterationNumber: task.Iteration))
        {
            // Log cache hit or miss
            System.Diagnostics.Trace.TraceInformation("[WorkerRole] Cache {1} for file {0}",
                stream.CacheFilePath, stream.UsingCache ? "hit" : "miss");

            // Process the points
            using (ObjectCachedBlockWriter<ClusterPoint> writeStream = new ObjectCachedBlockWriter<ClusterPoint>(
                pointsBlob, point => point.ToByteArray(), ClusterPoint.Size,
                AzureHelper.GetCachedFilePath(AzureHelper.GetLocalResourceRootPath("cache"),
                    task.JobID.ToString(), task.PartitionNumber, task.M, threadID, task.Iteration + 1)))
            {
                foreach (var point in stream)
                {
                    // Assign the point to the nearest centroid
                    Guid oldCentroidID = point.CentroidID;
                    int closestCentroidIndex = centroids.MinIndex(centroid => Point.Distance(point, centroid));
                    Guid newCentroidID = point.CentroidID = centroids[closestCentroidIndex].ID;

                    // Write the updated point to the writeStream
                    writeStream.Write(point);

                    // Update the number of points changed
                    if (oldCentroidID != newCentroidID)
                    {
                        pointsChangedPerThread[threadID]++;
                    }

                    // Update the point sums
                    if (pointSumsPerCentroidPerThread[threadID, closestCentroidIndex] == null)
                    {
                        pointSumsPerCentroidPerThread[threadID, closestCentroidIndex] = new PointsProcessedData();
                    }

                    pointSumsPerCentroidPerThread[threadID, closestCentroidIndex].PartialPointSum += point;
                    pointSumsPerCentroidPerThread[threadID, closestCentroidIndex].NumPointsProcessed++;
                }

                // Collect the block IDs from writeStream
                writeStream.FlushBlock();
                blockIDsPerThread[threadID] = writeStream.BlockList.ToArray();
            }
        }
    });

    // Combine the per-thread block lists, write the full block list to a blob,
    // and include that blob's URI as part of TaskResult
    List<string> blockIDs = new List<string>();
    foreach (string[] blockIDsFromThread in blockIDsPerThread)
    {
        blockIDs.AddRange(blockIDsFromThread);
    }

    CloudBlob blockIDsBlob = AzureHelper.CreateBlob(task.JobID.ToString(), Guid.NewGuid().ToString());
    using (Stream stream = blockIDsBlob.OpenWrite())
    {
        BinaryFormatter bf = new BinaryFormatter();
        bf.Serialize(stream, blockIDs);
    }
    TaskResult.PointsBlockListBlob = blockIDsBlob.Uri;

    // Total up the per-thread pointSumsPerCentroid
    TaskResult.PointsProcessedDataByCentroid = new Dictionary<Guid, PointsProcessedData>();
    for (int i = 0; i < task.K; ++i)
    {
        Guid centroidID = centroids[i].ID;
        TaskResult.PointsProcessedDataByCentroid[centroidID] = new PointsProcessedData();

        for (int j = 0; j < numThreads; ++j)
        {
            if (pointSumsPerCentroidPerThread[j, i] != null)
            {
                TaskResult.PointsProcessedDataByCentroid[centroidID].PartialPointSum += pointSumsPerCentroidPerThread[j, i].PartialPointSum;
                TaskResult.PointsProcessedDataByCentroid[centroidID].NumPointsProcessed += pointSumsPerCentroidPerThread[j, i].NumPointsProcessed;
            }
        }
    }

    // Total up the per-thread numPointsChanged
    TaskResult.NumPointsChanged = 0;
    foreach (int threadPointsChanged in pointsChangedPerThread)
    {
        TaskResult.NumPointsChanged += threadPointsChanged;
    }
}
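// ProcessPoints relies on a MinIndex extension method that is not shown in this section. Below is
// a hypothetical sketch of it; the class name and exact signature are assumptions. It returns the
// index of the element that minimizes the given key, which in ProcessPoints is the index of the
// centroid nearest to the point being processed.
using System;
using System.Collections.Generic;

public static class EnumerableExtensions
{
    public static int MinIndex<T>(this IList<T> source, Func<T, double> keySelector)
    {
        int minIndex = -1;
        double minKey = double.PositiveInfinity;

        // Single linear scan: track the smallest key seen so far and where it occurred
        for (int i = 0; i < source.Count; i++)
        {
            double key = keySelector(source[i]);
            if (key < minKey)
            {
                minKey = key;
                minIndex = i;
            }
        }

        return minIndex;
    }
}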