Example #1
0
        private void AddPointsProcessedDataForCentroid(Guid centroidID, PointsProcessedData data)
        {
            if (!totalPointsProcessedDataByCentroid.ContainsKey(centroidID))
            {
                totalPointsProcessedDataByCentroid[centroidID] = new PointsProcessedData();
            }

            totalPointsProcessedDataByCentroid[centroidID] += data;
        }
        public void op_AdditionTest()
        {
            PointsProcessedData d1 = new PointsProcessedData();
            PointsProcessedData d2 = new PointsProcessedData {
                NumPointsProcessed = 5,
                PartialPointSum = new Point(5, 5)
            };

            PointsProcessedData expected = new PointsProcessedData
            {
                NumPointsProcessed = 5,
                PartialPointSum = new Point(5, 5)
            };

            PointsProcessedData actual;
            actual = (d1 + d2);

            Assert.AreEqual(expected.NumPointsProcessed, actual.NumPointsProcessed);
            Assert.AreEqual(expected.PartialPointSum.X, actual.PartialPointSum.X);
            Assert.AreEqual(expected.PartialPointSum.Y, actual.PartialPointSum.Y);
        }
        /// <summary>
        /// Assigns each ClusterPoint in the "points" blob to the nearest centroid, recording results into TaskResult.
        /// </summary>
        private void ProcessPoints()
        {
            CloudBlockBlob pointsBlob = AzureHelper.GetBlob(task.Points);

            // Do the mapping and write the new blob
            int numThreads = Environment.ProcessorCount;

            PointsProcessedData[,] pointSumsPerCentroidPerThread = new PointsProcessedData[numThreads, task.K];
            int[]      pointsChangedPerThread = new int[numThreads];
            string[][] blockIDsPerThread      = new string[numThreads][];

            System.Threading.Tasks.Parallel.For(0, numThreads, threadID =>
            {
                // A note about caching: The outer block, using (ObjectCachedStreamReader...), reads from the appropriate partition in the Points blob, or, if it exists, a cache file on disk corresponding to the current iteration.
                // In the previous iteration (if any), this cache file was generated by the inner block, using (ObjectCachedBlockWriter...), which writes to a set of block on the Points blob, as well as to the cache file on disk corresponding to the *next iteration*.
                // Note the "task.Iteration + 1" in this using statement.
                //
                // The reason why we have separate cache files for each iteration is that we can't write to a cache file while we're reading it. So instead we have to write to a separate file.
                //
                // TODO: Having separate cache files is wasteful of disk space. Ideally we would only keep cache files from the current iteration and the next one.

                using (ObjectCachedStreamReader <ClusterPoint> stream = new ObjectCachedStreamReader <ClusterPoint>(pointsBlob, ClusterPoint.FromByteArray, ClusterPoint.Size,
                                                                                                                    AzureHelper.GetLocalResourceRootPath("cache"), task.JobID.ToString(), task.PartitionNumber, task.M, subPartitionNumber: threadID, subTotalPartitions: numThreads, iterationNumber: task.Iteration))
                {
                    // Log cache hit or miss
                    System.Diagnostics.Trace.TraceInformation("[WorkerRole] Cache {1} for file {0}", stream.CacheFilePath, stream.UsingCache ? "hit" : "miss");

                    // Process the points
                    using (ObjectCachedBlockWriter <ClusterPoint> writeStream = new ObjectCachedBlockWriter <ClusterPoint>(pointsBlob, point => point.ToByteArray(), ClusterPoint.Size,
                                                                                                                           AzureHelper.GetCachedFilePath(AzureHelper.GetLocalResourceRootPath("cache"), task.JobID.ToString(), task.PartitionNumber, task.M, threadID, task.Iteration + 1)))
                    {
                        foreach (var point in stream)
                        {
                            // Assign the point to the nearest centroid
                            Guid oldCentroidID       = point.CentroidID;
                            int closestCentroidIndex = centroids.MinIndex(centroid => Point.Distance(point, centroid));
                            Guid newCentroidID       = point.CentroidID = centroids[closestCentroidIndex].ID;

                            // Write the updated point to the writeStream
                            writeStream.Write(point);

                            // Update the number of points changed
                            if (oldCentroidID != newCentroidID)
                            {
                                pointsChangedPerThread[threadID]++;
                            }

                            // Update the point sums
                            if (pointSumsPerCentroidPerThread[threadID, closestCentroidIndex] == null)
                            {
                                pointSumsPerCentroidPerThread[threadID, closestCentroidIndex] = new PointsProcessedData();
                            }
                            pointSumsPerCentroidPerThread[threadID, closestCentroidIndex].PartialPointSum += point;
                            pointSumsPerCentroidPerThread[threadID, closestCentroidIndex].NumPointsProcessed++;
                        }

                        // Collect the block IDs from writeStream
                        writeStream.FlushBlock();
                        blockIDsPerThread[threadID] = writeStream.BlockList.ToArray();
                    }
                }
            });

            // Combine the per-thread block lists and write the full block list to a blob. Then include that as part of TaskResult
            List <string> blockIDs = new List <string>();

            foreach (string[] blockIDsFromThread in blockIDsPerThread)
            {
                blockIDs.AddRange(blockIDsFromThread);
            }
            CloudBlob blockIDsBlob = AzureHelper.CreateBlob(task.JobID.ToString(), Guid.NewGuid().ToString());

            using (Stream stream = blockIDsBlob.OpenWrite())
            {
                BinaryFormatter bf = new BinaryFormatter();
                bf.Serialize(stream, blockIDs);
            }
            TaskResult.PointsBlockListBlob = blockIDsBlob.Uri;

            // Total up the per-thread pointSumsPerCentroid
            TaskResult.PointsProcessedDataByCentroid = new Dictionary <Guid, PointsProcessedData>();
            for (int i = 0; i < task.K; ++i)
            {
                Guid centroidID = centroids[i].ID;
                TaskResult.PointsProcessedDataByCentroid[centroidID] = new PointsProcessedData();

                for (int j = 0; j < numThreads; ++j)
                {
                    if (pointSumsPerCentroidPerThread[j, i] != null)
                    {
                        TaskResult.PointsProcessedDataByCentroid[centroidID].PartialPointSum    += pointSumsPerCentroidPerThread[j, i].PartialPointSum;
                        TaskResult.PointsProcessedDataByCentroid[centroidID].NumPointsProcessed += pointSumsPerCentroidPerThread[j, i].NumPointsProcessed;
                    }
                }
            }

            // Total up the per-thread numPointsChanged
            TaskResult.NumPointsChanged = 0;
            foreach (int threadPointsChanged in pointsChangedPerThread)
            {
                TaskResult.NumPointsChanged += threadPointsChanged;
            }
        }
Example #4
0
        private void AddPointsProcessedDataForCentroid(Guid centroidID, PointsProcessedData data)
        {
            if (!totalPointsProcessedDataByCentroid.ContainsKey(centroidID))
            {
                totalPointsProcessedDataByCentroid[centroidID] = new PointsProcessedData();
            }

            totalPointsProcessedDataByCentroid[centroidID] += data;
        }
        /// <summary>
        /// Assigns each ClusterPoint in the "points" blob to the nearest centroid, recording results into TaskResult.
        /// </summary>
        private void ProcessPoints()
        {
            CloudBlockBlob pointsBlob = AzureHelper.GetBlob(task.Points);

            // Do the mapping and write the new blob
            int numThreads = Environment.ProcessorCount;
            PointsProcessedData[,] pointSumsPerCentroidPerThread = new PointsProcessedData[numThreads, task.K];
            int[] pointsChangedPerThread = new int[numThreads];
            string[][] blockIDsPerThread = new string[numThreads][];

            System.Threading.Tasks.Parallel.For(0, numThreads, threadID =>
            {
                // A note about caching: The outer block, using (ObjectCachedStreamReader...), reads from the appropriate partition in the Points blob, or, if it exists, a cache file on disk corresponding to the current iteration.
                // In the previous iteration (if any), this cache file was generated by the inner block, using (ObjectCachedBlockWriter...), which writes to a set of block on the Points blob, as well as to the cache file on disk corresponding to the *next iteration*.
                // Note the "task.Iteration + 1" in this using statement.
                //
                // The reason why we have separate cache files for each iteration is that we can't write to a cache file while we're reading it. So instead we have to write to a separate file.
                //
                // TODO: Having separate cache files is wasteful of disk space. Ideally we would only keep cache files from the current iteration and the next one.

                using (ObjectCachedStreamReader<ClusterPoint> stream = new ObjectCachedStreamReader<ClusterPoint>(pointsBlob, ClusterPoint.FromByteArray, ClusterPoint.Size,
                    AzureHelper.GetLocalResourceRootPath("cache"), task.JobID.ToString(), task.PartitionNumber, task.M, subPartitionNumber: threadID, subTotalPartitions: numThreads, iterationNumber: task.Iteration))
                {
                    // Log cache hit or miss
                    System.Diagnostics.Trace.TraceInformation("[WorkerRole] Cache {1} for file {0}", stream.CacheFilePath, stream.UsingCache ? "hit" : "miss");

                    // Process the points
                    using (ObjectCachedBlockWriter<ClusterPoint> writeStream = new ObjectCachedBlockWriter<ClusterPoint>(pointsBlob, point => point.ToByteArray(), ClusterPoint.Size,
                        AzureHelper.GetCachedFilePath(AzureHelper.GetLocalResourceRootPath("cache"), task.JobID.ToString(), task.PartitionNumber, task.M, threadID, task.Iteration + 1)))
                    {
                        foreach (var point in stream)
                        {
                            // Assign the point to the nearest centroid
                            Guid oldCentroidID = point.CentroidID;
                            int closestCentroidIndex = centroids.MinIndex(centroid => Point.Distance(point, centroid));
                            Guid newCentroidID = point.CentroidID = centroids[closestCentroidIndex].ID;

                            // Write the updated point to the writeStream
                            writeStream.Write(point);

                            // Update the number of points changed
                            if (oldCentroidID != newCentroidID)
                            {
                                pointsChangedPerThread[threadID]++;
                            }

                            // Update the point sums
                            if (pointSumsPerCentroidPerThread[threadID, closestCentroidIndex] == null)
                            {
                                pointSumsPerCentroidPerThread[threadID, closestCentroidIndex] = new PointsProcessedData();
                            }
                            pointSumsPerCentroidPerThread[threadID, closestCentroidIndex].PartialPointSum += point;
                            pointSumsPerCentroidPerThread[threadID, closestCentroidIndex].NumPointsProcessed++;
                        }

                        // Collect the block IDs from writeStream
                        writeStream.FlushBlock();
                        blockIDsPerThread[threadID] = writeStream.BlockList.ToArray();
                    }
                }
            });

            // Combine the per-thread block lists and write the full block list to a blob. Then include that as part of TaskResult
            List<string> blockIDs = new List<string>();
            foreach (string[] blockIDsFromThread in blockIDsPerThread)
            {
                blockIDs.AddRange(blockIDsFromThread);
            }
            CloudBlob blockIDsBlob = AzureHelper.CreateBlob(task.JobID.ToString(), Guid.NewGuid().ToString());
            using (Stream stream = blockIDsBlob.OpenWrite())
            {
                BinaryFormatter bf = new BinaryFormatter();
                bf.Serialize(stream, blockIDs);
            }
            TaskResult.PointsBlockListBlob =  blockIDsBlob.Uri;

            // Total up the per-thread pointSumsPerCentroid
            TaskResult.PointsProcessedDataByCentroid = new Dictionary<Guid, PointsProcessedData>();
            for (int i = 0; i < task.K; ++i)
            {
                Guid centroidID = centroids[i].ID;
                TaskResult.PointsProcessedDataByCentroid[centroidID] = new PointsProcessedData();

                for (int j = 0; j < numThreads; ++j)
                {
                    if (pointSumsPerCentroidPerThread[j, i] != null)
                    {
                        TaskResult.PointsProcessedDataByCentroid[centroidID].PartialPointSum += pointSumsPerCentroidPerThread[j, i].PartialPointSum;
                        TaskResult.PointsProcessedDataByCentroid[centroidID].NumPointsProcessed += pointSumsPerCentroidPerThread[j, i].NumPointsProcessed;
                    }
                }
            }

            // Total up the per-thread numPointsChanged
            TaskResult.NumPointsChanged = 0;
            foreach (int threadPointsChanged in pointsChangedPerThread)
            {
                TaskResult.NumPointsChanged += threadPointsChanged;
            }
        }