        /// <summary>
        /// Flushes the current block to Azure storage and adds the new Block ID to BlockList.
        /// </summary>
        public void FlushBlock()
        {
            if (blockStream.Length == 0)
            {
                return;
            }

            string blockID = AzureHelper.GenerateRandomBlockID();

            blockStream.Position = 0;
            blob.PutBlock(blockID, blockStream, null);
            _blockList.Add(blockID);

            blockStream.Close();
            blockStream = new MemoryStream();
        }
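
The helper AzureHelper.GenerateRandomBlockID is not shown in this listing. Azure block blobs require every block ID within a blob to be a Base64 string, and all IDs within a blob to have equal length, so a plausible sketch (an assumption, not the project's actual helper) is:

        // Hypothetical sketch: a GUID's 16 bytes always encode to the same
        // 24-character Base64 string, satisfying Azure's equal-length rule.
        public static string GenerateRandomBlockID()
        {
            return Convert.ToBase64String(Guid.NewGuid().ToByteArray());
        }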
Example n. 2
        /// <summary>
        /// Sets up the Azure storage (Points and Centroids) for the first k-means iteration.
        /// </summary>
        public void InitializeStorage()
        {
            AzureHelper.LogPerformance(() =>
            {
                Random random = new Random();

                if (jobData.Points == null)
                {
                    // Initialize the points blob with N random ClusterPoints
                    Points = AzureHelper.CreateBlob(jobData.JobID.ToString(), AzureHelper.PointsBlob);
                    using (ObjectStreamWriter<ClusterPoint> stream = new ObjectStreamWriter<ClusterPoint>(Points, point => point.ToByteArray(), ClusterPoint.Size))
                    {
                        for (int i = 0; i < jobData.N; i++)
                        {
                            stream.Write(new ClusterPoint(
                                             random.NextDouble() * 100 - 50,
                                             random.NextDouble() * 100 - 50,
                                             Guid.Empty));
                        }
                    }
                }
                else
                {
                    // Use the given points blob
                    Points = AzureHelper.GetBlob(jobData.Points);

                    // Initialize N based on that
                    using (ObjectStreamReader<ClusterPoint> stream = new ObjectStreamReader<ClusterPoint>(Points, ClusterPoint.FromByteArray, ClusterPoint.Size))
                    {
                        jobData.N = (int)stream.Length;
                    }
                }

                // Initialize the centroids blob with K random Centroids
                Centroids = AzureHelper.CreateBlob(jobData.JobID.ToString(), AzureHelper.CentroidsBlob);
                using (ObjectStreamWriter<Centroid> stream = new ObjectStreamWriter<Centroid>(Centroids, point => point.ToByteArray(), Centroid.Size))
                {
                    for (int i = 0; i < jobData.K; i++)
                    {
                        stream.Write(new Centroid(
                                         Guid.NewGuid(),
                                         random.Next(-PointRange, PointRange),
                                         random.Next(-PointRange, PointRange)));
                    }
                }
            }, jobID: jobData.JobID.ToString(), methodName: "InitializeStorage", iterationCount: IterationCount, points: new Lazy<string>(() => Points.Uri.ToString()), centroids: new Lazy<string>(() => Centroids.Uri.ToString()), machineID: MachineID);
        }
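
InitializeStorage recovers N from stream.Length without scanning the blob, which works because every record has a fixed size. A hypothetical helper showing how that length can be derived (the real ObjectStreamReader source is not part of this listing):

        // Sketch only: with fixed-size records, the logical length in records
        // falls out of the blob's byte length.
        public static long RecordCount(CloudBlob blob, int recordSize)
        {
            blob.FetchAttributes();                      // refresh Properties.Length
            return blob.Properties.Length / recordSize;  // bytes -> record count
        }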
Example n. 3
        private void RecalculateCentroids()
        {
            AzureHelper.LogPerformance(() =>
            {
                // Initialize the output blob
                CloudBlob writeBlob = AzureHelper.CreateBlob(jobData.JobID.ToString(), Guid.NewGuid().ToString());

                // Do the mapping and write the new blob
                using (ObjectStreamReader<Centroid> stream = new ObjectStreamReader<Centroid>(Centroids, Centroid.FromByteArray, Centroid.Size))
                {
                    var newCentroids = stream.Select(c =>
                    {
                        Point newCentroidPoint;
                        if (totalPointsProcessedDataByCentroid.ContainsKey(c.ID) && totalPointsProcessedDataByCentroid[c.ID].NumPointsProcessed != 0)
                        {
                            newCentroidPoint = totalPointsProcessedDataByCentroid[c.ID].PartialPointSum
                                               / (double)totalPointsProcessedDataByCentroid[c.ID].NumPointsProcessed;
                        }
                        else
                        {
                            newCentroidPoint = new Point();
                        }

                        c.X = newCentroidPoint.X;
                        c.Y = newCentroidPoint.Y;

                        return c;
                    });

                    using (ObjectStreamWriter<Centroid> writeStream = new ObjectStreamWriter<Centroid>(writeBlob, point => point.ToByteArray(), Centroid.Size))
                    {
                        foreach (Centroid c in newCentroids)
                        {
                            writeStream.Write(c);
                        }
                    }
                }

                // Copy the contents of the new blob back into the old blob
                Centroids.CopyFromBlob(writeBlob);

                System.Diagnostics.Trace.TraceInformation("[ServerRole] Finished RecalculateCentroids(). Total points changed: {0}", TotalNumPointsChanged);

                ResetPointChangedCounts();
            }, jobData.JobID.ToString(), methodName: "RecalculateCentroids", iterationCount: IterationCount, points: Points.Uri.ToString(), centroids: Centroids.Uri.ToString(), machineID: MachineID);
        }
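
The expressions PartialPointSum / (double)NumPointsProcessed here and PartialPointSum += point in ProcessPoints below imply that Point overloads arithmetic operators. A minimal sketch of the operators this code relies on; the project's actual Point and ClusterPoint types are not shown in this listing:

        public class Point
        {
            public double X { get; set; }
            public double Y { get; set; }

            public Point() { }                                 // origin, used for empty centroids
            public Point(double x, double y) { X = x; Y = y; }

            // Accumulates a running sum of assigned points
            public static Point operator +(Point a, Point b)
            {
                return new Point(a.X + b.X, a.Y + b.Y);
            }

            // Turns a partial sum into a mean in RecalculateCentroids
            public static Point operator /(Point p, double divisor)
            {
                return new Point(p.X / divisor, p.Y / divisor);
            }
        }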
Example n. 4
        /// <summary>
        /// Enqueues one task message per worker. Each message instructs that worker to process its partition of the k-means data.
        /// </summary>
        public void EnqueueTasks(IEnumerable<Worker> workers)
        {
            AzureHelper.LogPerformance(() =>
            {
                int workerNumber = 0;

                // Loop through the known workers and give them each a chunk of the points.
                // Note: This loop must execute in the same order every time, otherwise caching will not work -- the workers will get a different workerNumber each time and therefore a different chunk of the points.
                // We use OrderBy on the PartitionKey to guarantee stable ordering.
                foreach (Worker worker in workers.OrderBy(worker => worker.PartitionKey))
                {
                    KMeansTaskData taskData = new KMeansTaskData(jobData, Guid.NewGuid(), workerNumber++, workers.Count(), Centroids.Uri, DateTime.UtcNow, IterationCount, worker.BuddyGroupID);
                    taskData.Points         = Points.Uri;

                    tasks.Add(new KMeansTask(taskData));

                    AzureHelper.EnqueueMessage(AzureHelper.GetWorkerRequestQueue(worker.PartitionKey), taskData, true);
                }
            }, jobData.JobID.ToString(), methodName: "EnqueueTasks", iterationCount: IterationCount, points: Points.Uri.ToString(), centroids: Centroids.Uri.ToString(), machineID: MachineID);
        }
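
Each task message carries workerNumber and workers.Count(), from which the worker-side reader derives its slice of the points. The actual arithmetic lives in ObjectCachedStreamReader and is not shown in this listing; a plausible sketch (names are illustrative):

        // Hypothetical helper: worker i of m takes a contiguous chunk of the n points.
        private static void PartitionRange(long n, int m, int i, out long first, out long count)
        {
            long perPartition = (n + m - 1) / m;                    // ceiling division
            first = i * perPartition;
            count = Math.Max(0, Math.Min(perPartition, n - first)); // last chunk may be short
        }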
Example n. 5
        /// <summary>
        /// Decides whether to move into the next iteration and acts on it: either recalculates centroids and enqueues the next round of tasks, or returns the final results.
        /// </summary>
        private void NextIteration(IEnumerable<Worker> workers)
        {
            System.Diagnostics.Trace.TraceInformation("[ServerRole] NextIteration() JobID={0}", jobData.JobID);

            CommitPointsBlob();

            IterationCount++;
            if (!string.IsNullOrEmpty(jobData.ProgressEmail))
            {
                AzureHelper.SendStatusEmail(jobData.ProgressEmail, jobData.JobID, IterationCount);
            }

            if (NumPointsChangedAboveThreshold() && !MaxIterationCountExceeded())
            {
                RecalculateCentroids();
                EnqueueTasks(workers);
            }
            else
            {
                ReturnResults();
            }
        }
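
NumPointsChangedAboveThreshold and MaxIterationCountExceeded are not included in this listing. Minimal sketches, assuming jobData exposes a change threshold and an iteration cap (both property names below are illustrative, not from the source):

        // Sketch: keep iterating while enough points are still switching clusters.
        private bool NumPointsChangedAboveThreshold()
        {
            return TotalNumPointsChanged > jobData.NumPointsChangedThreshold;
        }

        // Sketch: treat a non-positive cap as "no limit".
        private bool MaxIterationCountExceeded()
        {
            return jobData.MaxIterationCount > 0 && IterationCount >= jobData.MaxIterationCount;
        }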
Example n. 6
        /// <summary>
        /// Handles a worker's TaskResult from a running k-means job. Adds up the partial sums from the TaskResult.
        /// </summary>
        /// <param name="taskResult">The result message returned by the worker.</param>
        /// <param name="workers">The known workers, used to enqueue the next iteration's tasks.</param>
        /// <returns>True in all cases; a duplicate taskData result is ignored rather than reprocessed.</returns>
        public bool ProcessWorkerResponse(KMeansTaskResult taskResult, IEnumerable<Worker> workers)
        {
            // Make sure we're actually still waiting for a result for this taskData
            // If not, this might be a duplicate queue message
            if (!TaskResultMatchesRunningTask(taskResult))
            {
                return true;
            }

            AzureHelper.LogPerformance(() =>
            {
                KMeansTask task = TaskResultWithTaskID(taskResult.TaskID);
                task.Running    = false; // The task has returned a response, which means that it has stopped running

                // Add the worker's updated points blocks
                if (taskResult.PointsBlockListBlob != null)
                {
                    using (Stream stream = AzureHelper.GetBlob(taskResult.PointsBlockListBlob).OpenRead())
                    {
                        BinaryFormatter bf            = new BinaryFormatter();
                        List<string> pointsBlockList  = bf.Deserialize(stream) as List<string>;
                        pointsBlockIDs.AddRange(pointsBlockList);
                    }
                }

                // Copy out and integrate the data from the worker response
                AddDataFromTaskResult(taskResult);
            }, jobData.JobID.ToString(), methodName: "ProcessWorkerResponse", iterationCount: IterationCount, points: Points.Uri.ToString(), centroids: Centroids.Uri.ToString(), machineID: MachineID);

            // If this is the last worker to return, this iteration is done and we should start the next one
            if (NoMoreRunningTasks())
            {
                NextIteration(workers);
            }

            return true;
        }
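
TaskResultMatchesRunningTask, TaskResultWithTaskID, and NoMoreRunningTasks are presumably simple lookups over the tasks list built in EnqueueTasks. Plausible sketches under that assumption (the KMeansTask.TaskData property name is a guess):

        // Sketch: a result counts only if it matches a task we still consider running.
        private bool TaskResultMatchesRunningTask(KMeansTaskResult taskResult)
        {
            KMeansTask task = TaskResultWithTaskID(taskResult.TaskID);
            return task != null && task.Running;
        }

        private KMeansTask TaskResultWithTaskID(Guid taskID)
        {
            return tasks.FirstOrDefault(t => t.TaskData.TaskID == taskID);
        }

        private bool NoMoreRunningTasks()
        {
            return !tasks.Any(t => t.Running);
        }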
Example n. 7
        /// <summary>
        /// Assigns each ClusterPoint in the "points" blob to the nearest centroid, recording results into TaskResult.
        /// </summary>
        private void ProcessPoints()
        {
            CloudBlockBlob pointsBlob = AzureHelper.GetBlob(task.Points);

            // Do the mapping and write the new blob
            int numThreads = Environment.ProcessorCount;

            PointsProcessedData[,] pointSumsPerCentroidPerThread = new PointsProcessedData[numThreads, task.K];
            int[]      pointsChangedPerThread = new int[numThreads];
            string[][] blockIDsPerThread      = new string[numThreads][];

            System.Threading.Tasks.Parallel.For(0, numThreads, threadID =>
            {
                // A note about caching: The outer block, using (ObjectCachedStreamReader...), reads from the appropriate partition in the Points blob, or, if it exists, a cache file on disk corresponding to the current iteration.
                // In the previous iteration (if any), this cache file was generated by the inner block, using (ObjectCachedBlockWriter...), which writes to a set of blocks on the Points blob, as well as to the cache file on disk corresponding to the *next iteration*.
                // Note the "task.Iteration + 1" in this using statement.
                //
                // The reason why we have separate cache files for each iteration is that we can't write to a cache file while we're reading it. So instead we have to write to a separate file.
                //
                // TODO: Having separate cache files is wasteful of disk space. Ideally we would only keep cache files from the current iteration and the next one.

                using (ObjectCachedStreamReader<ClusterPoint> stream = new ObjectCachedStreamReader<ClusterPoint>(pointsBlob, ClusterPoint.FromByteArray, ClusterPoint.Size,
                                                                                                                  AzureHelper.GetLocalResourceRootPath("cache"), task.JobID.ToString(), task.PartitionNumber, task.M, subPartitionNumber: threadID, subTotalPartitions: numThreads, iterationNumber: task.Iteration))
                {
                    // Log cache hit or miss
                    System.Diagnostics.Trace.TraceInformation("[WorkerRole] Cache {1} for file {0}", stream.CacheFilePath, stream.UsingCache ? "hit" : "miss");

                    // Process the points
                    using (ObjectCachedBlockWriter<ClusterPoint> writeStream = new ObjectCachedBlockWriter<ClusterPoint>(pointsBlob, point => point.ToByteArray(), ClusterPoint.Size,
                                                                                                                         AzureHelper.GetCachedFilePath(AzureHelper.GetLocalResourceRootPath("cache"), task.JobID.ToString(), task.PartitionNumber, task.M, threadID, task.Iteration + 1)))
                    {
                        foreach (var point in stream)
                        {
                            // Assign the point to the nearest centroid
                            Guid oldCentroidID       = point.CentroidID;
                            int closestCentroidIndex = centroids.MinIndex(centroid => Point.Distance(point, centroid));
                            Guid newCentroidID       = point.CentroidID = centroids[closestCentroidIndex].ID;

                            // Write the updated point to the writeStream
                            writeStream.Write(point);

                            // Update the number of points changed
                            if (oldCentroidID != newCentroidID)
                            {
                                pointsChangedPerThread[threadID]++;
                            }

                            // Update the point sums
                            if (pointSumsPerCentroidPerThread[threadID, closestCentroidIndex] == null)
                            {
                                pointSumsPerCentroidPerThread[threadID, closestCentroidIndex] = new PointsProcessedData();
                            }
                            pointSumsPerCentroidPerThread[threadID, closestCentroidIndex].PartialPointSum += point;
                            pointSumsPerCentroidPerThread[threadID, closestCentroidIndex].NumPointsProcessed++;
                        }

                        // Collect the block IDs from writeStream
                        writeStream.FlushBlock();
                        blockIDsPerThread[threadID] = writeStream.BlockList.ToArray();
                    }
                }
            });

            // Combine the per-thread block lists and write the full block list to a blob. Then include that as part of TaskResult
            List<string> blockIDs = new List<string>();

            foreach (string[] blockIDsFromThread in blockIDsPerThread)
            {
                blockIDs.AddRange(blockIDsFromThread);
            }
            CloudBlob blockIDsBlob = AzureHelper.CreateBlob(task.JobID.ToString(), Guid.NewGuid().ToString());

            using (Stream stream = blockIDsBlob.OpenWrite())
            {
                BinaryFormatter bf = new BinaryFormatter();
                bf.Serialize(stream, blockIDs);
            }
            TaskResult.PointsBlockListBlob = blockIDsBlob.Uri;

            // Total up the per-thread pointSumsPerCentroid
            TaskResult.PointsProcessedDataByCentroid = new Dictionary<Guid, PointsProcessedData>();
            for (int i = 0; i < task.K; ++i)
            {
                Guid centroidID = centroids[i].ID;
                TaskResult.PointsProcessedDataByCentroid[centroidID] = new PointsProcessedData();

                for (int j = 0; j < numThreads; ++j)
                {
                    if (pointSumsPerCentroidPerThread[j, i] != null)
                    {
                        TaskResult.PointsProcessedDataByCentroid[centroidID].PartialPointSum    += pointSumsPerCentroidPerThread[j, i].PartialPointSum;
                        TaskResult.PointsProcessedDataByCentroid[centroidID].NumPointsProcessed += pointSumsPerCentroidPerThread[j, i].NumPointsProcessed;
                    }
                }
            }

            // Total up the per-thread numPointsChanged
            TaskResult.NumPointsChanged = 0;
            foreach (int threadPointsChanged in pointsChangedPerThread)
            {
                TaskResult.NumPointsChanged += threadPointsChanged;
            }
        }
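
centroids.MinIndex(...) above is a custom extension method, not part of LINQ. A sketch of the presumed behavior, returning the index of the element that minimizes the given key:

        public static class ListExtensions
        {
            // Assumed semantics: index of the minimizing element (first wins on ties).
            public static int MinIndex<T>(this IList<T> list, Func<T, double> key)
            {
                int minIndex = 0;
                double minValue = key(list[0]);
                for (int i = 1; i < list.Count; i++)
                {
                    double value = key(list[i]);
                    if (value < minValue)
                    {
                        minValue = value;
                        minIndex = i;
                    }
                }
                return minIndex;
            }
        }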
Example n. 8
        /// <summary>
        /// Loads this task's centroids from the Centroids blob into memory.
        /// </summary>
        private void InitializeCentroids()
        {
            using (ObjectStreamReader<Centroid> stream = new ObjectStreamReader<Centroid>(AzureHelper.GetBlob(task.Centroids), Centroid.FromByteArray, Centroid.Size))
            {
                centroids = stream.ToList();
            }
        }
Example n. 9
        /// <summary>
        /// Commits the accumulated block IDs to the Points blob, making the workers' updated points visible, then resets the list for the next iteration.
        /// </summary>
        private void CommitPointsBlob()
        {
            AzureHelper.CommitBlockBlob(Points, pointsBlockIDs);
            pointsBlockIDs.Clear();
        }