// shuffle the data and write it out to different partitions (different files on disk for now)
public static List<DataVector> ShuffleDataAndGetInitialCentriods(string originalDataFile, int partitionsNum, int clustersNum, string executionDirectory)
{
    List<DataVector> data = DataPartitionCache.ReadDataFile(originalDataFile);

    // shuffle; not truly random, but sufficient for our purpose
    data = data.OrderBy(a => Guid.NewGuid()).ToList();

    string dataDirectory = Path.Combine(executionDirectory, Constants.DataDirectory);

    // clean things up first
    if (Directory.Exists(dataDirectory))
    {
        Directory.Delete(dataDirectory, true);
    }
    Directory.CreateDirectory(dataDirectory);

    int residualCount = data.Count;
    int batchSize = data.Count / partitionsNum;
    for (int i = 0; i < partitionsNum; i++)
    {
        // the last partition absorbs the remainder so that no vectors are silently dropped
        int linesCount = i == partitionsNum - 1 ? residualCount : batchSize;
        using (StreamWriter writer = new StreamWriter(
            File.OpenWrite(Path.Combine(dataDirectory, i.ToString(CultureInfo.InvariantCulture)))))
        {
            for (int j = i * batchSize; j < (i * batchSize) + linesCount; j++)
            {
                writer.WriteLine(data[j].ToString());
            }
        }

        residualCount -= linesCount;
    }

    return InitializeCentroids(clustersNum, data, executionDirectory);
}
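// NOTE: InitializeCentroids is called above but not shown here. The following is an
// illustrative sketch only, assuming the initial centroids are simply the first
// clustersNum vectors of the already-shuffled data, persisted to Constants.CentroidsFile
// (the same file the LegacyKMeansTask constructor below reads back). The real
// implementation may pick its centroids differently.
private static List<DataVector> InitializeCentroids(int clustersNum, List<DataVector> data, string executionDirectory)
{
    List<DataVector> centroids = data.Take(clustersNum).ToList();

    string centroidFile = Path.Combine(executionDirectory, Constants.CentroidsFile);
    if (File.Exists(centroidFile))
    {
        File.Delete(centroidFile);
    }

    using (StreamWriter writer = new StreamWriter(File.OpenWrite(centroidFile)))
    {
        foreach (DataVector centroid in centroids)
        {
            writer.WriteLine(centroid.ToString());
        }
    }

    return centroids;
}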
public LegacyKMeansTask(
    DataPartitionCache dataPartition,
    [Parameter(Value = typeof(KMeansConfiguratioinOptions.K))] int clustersNumber,
    [Parameter(Value = typeof(KMeansConfiguratioinOptions.ExecutionDirectory))] string executionDirectory)
{
    _dataPartition = dataPartition;
    _clustersNum = clustersNumber;
    _kMeansExecutionDirectory = executionDirectory;

    if (_centroids == null)
    {
        string centroidFile = Path.Combine(_kMeansExecutionDirectory, Constants.CentroidsFile);
        _centroids = new Centroids(DataPartitionCache.ReadDataFile(centroidFile));
    }
}
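// NOTE: illustrative usage sketch only, showing how the pieces above are assumed to fit
// together on the launcher side: the data is shuffled and partitioned once, and each
// LegacyKMeansTask is then constructed (normally via Tang injection) with its partition,
// the cluster count, and the shared execution directory. The input file name and the
// partition/cluster counts below are example values, not taken from the real driver code.
private static void ExampleSetup()
{
    string executionDirectory = Path.Combine(Path.GetTempPath(), "kmeans-run");
    int partitionsNum = 4;
    int clustersNum = 3;

    List<DataVector> initialCentroids =
        ShuffleDataAndGetInitialCentriods("kmeans-input.txt", partitionsNum, clustersNum, executionDirectory);
    Console.WriteLine("Wrote {0} initial centroids and {1} partition files", initialCentroids.Count, partitionsNum);

    // each task then reads its own partition file plus the centroids file written above,
    // e.g. new LegacyKMeansTask(dataPartitionForTaskI, clustersNum, executionDirectory)
}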
public byte[] Call(byte[] memento)
{
    // TODO: this belongs to a dedicated data loader layer, will refactor once we have that
    _groupCommClient.Initialize();

    string centroidFile = Path.Combine(_kMeansExecutionDirectory, Constants.CentroidsFile);
    _centroids = new Centroids(DataPartitionCache.ReadDataFile(centroidFile));

    float loss = float.MaxValue;
    float newLoss;

    while (true)
    {
        if (_isInitialIteration)
        {
            // broadcast initial centroids to all slave nodes
            Logger.Log(Level.Info, "Broadcasting INITIAL centroids to all slave nodes: " + _centroids);
            _isInitialIteration = false;
        }
        else
        {
            ProcessedResults results = _meansReducerReceiver.Reduce();
            _centroids = new Centroids(results.Means.Select(m => m.Mean).ToList());
            Logger.Log(Level.Info, "Broadcasting new centroids to all slave nodes: " + _centroids);

            newLoss = results.Loss;
            Logger.Log(Level.Info, string.Format(CultureInfo.InvariantCulture, "The new loss value is {0} at iteration {1}", newLoss, _iteration));
            if (newLoss > loss)
            {
                _controlBroadcastSender.Send(ControlMessage.STOP);
                throw new InvalidOperationException(
                    string.Format(CultureInfo.InvariantCulture,
                                  "The new loss {0} is larger than the previous loss {1}, but the loss function must be monotonically decreasing across iterations",
                                  newLoss,
                                  loss));
            }
            else if (newLoss.Equals(loss))
            {
                Logger.Log(Level.Info, string.Format(CultureInfo.InvariantCulture, "KMeans clustering has converged with a loss value of {0} at iteration {1}", newLoss, _iteration));
                break;
            }
            else
            {
                loss = newLoss;
            }
        }

        _controlBroadcastSender.Send(ControlMessage.RECEIVE);
        _dataBroadcastSender.Send(_centroids);
        _iteration++;
    }

    _controlBroadcastSender.Send(ControlMessage.STOP);
    return null;
}
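// NOTE: illustrative sketch only of the slave-side loop that the master loop above assumes.
// The member names _controlBroadcastReceiver, _dataBroadcastReceiver and _meansReducerSender
// are hypothetical stand-ins for the corresponding Group Communication operators; the real
// slave task may be structured differently.
public byte[] CallAsSlave(byte[] memento)
{
    while (_controlBroadcastReceiver.Receive() == ControlMessage.RECEIVE)
    {
        // receive the (initial or updated) centroids broadcast by the master
        Centroids centroids = _dataBroadcastReceiver.Receive();

        // assign each local vector to its nearest centroid, then send the partial means
        // (and the partial loss they imply) back through the reducer
        _dataPartition.LabelData(centroids);
        _meansReducerSender.Send(ComputePartialMeans());
    }

    // ControlMessage.STOP was received: either converged or aborted by the master
    return null;
}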
public byte[] CallWithWritingToFileSystem(byte[] memento)
{
    string centroidFile = Path.Combine(_kMeansExecutionDirectory, Constants.CentroidsFile);
    _centroids = new Centroids(DataPartitionCache.ReadDataFile(centroidFile));

    _dataPartition.LabelData(_centroids);
    _partialMeans = ComputePartialMeans();

    // should be replaced with Group Communication
    using (StreamWriter writer = new StreamWriter(
        File.OpenWrite(Path.Combine(_kMeansExecutionDirectory, Constants.DataDirectory, Constants.PartialMeanFilePrefix + _dataPartition.Partition))))
    {
        for (int i = 0; i < _partialMeans.Count; i++)
        {
            writer.WriteLine(_partialMeans[i].ToString());
        }
    }

    return null;
}
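// NOTE: ComputePartialMeans is called above but not shown. Illustrative sketch only: it
// assumes LabelData stores a cluster label on each vector (hypothetical Label property),
// that the cache exposes its vectors (hypothetical DataVectors property), that DataVector
// offers a static averaging helper (hypothetical DataVector.Mean), and that PartialMean
// wraps the per-cluster mean (the Mean property used in Call above) together with the
// number of vectors behind it. The real helper is likely different.
private List<PartialMean> ComputePartialMeans()
{
    return _dataPartition.DataVectors
        .GroupBy(v => v.Label)                                            // one group per cluster
        .Select(g => new PartialMean(DataVector.Mean(g.ToList()), g.Count()))
        .ToList();
}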