Exemplo n.º 1
0
        /// <summary>
        /// Reads the data vectors from <paramref name="originalDataFile"/>, shuffles them, and writes
        /// them out as <paramref name="partitionsNum"/> partition files (plain files on disk for now,
        /// one per partition, named by the partition index) under the data directory of
        /// <paramref name="executionDirectory"/>. Every vector is written to exactly one partition;
        /// when the data does not divide evenly, the leading partitions receive one extra vector.
        /// </summary>
        /// <param name="originalDataFile">Path of the file holding the full data set.</param>
        /// <param name="partitionsNum">Number of partition files to produce; must be positive.</param>
        /// <param name="clustersNum">Number of clusters, passed through to InitializeCentroids.</param>
        /// <param name="executionDirectory">Directory under which the data directory is (re)created.</param>
        /// <returns>The value returned by InitializeCentroids for the shuffled data.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// Thrown when <paramref name="partitionsNum"/> is zero or negative.
        /// </exception>
        public static List<DataVector> ShuffleDataAndGetInitialCentriods(string originalDataFile, int partitionsNum, int clustersNum, string executionDirectory)
        {
            // Fail before touching the file system: the directory below is deleted recursively.
            if (partitionsNum <= 0)
            {
                throw new ArgumentOutOfRangeException("partitionsNum", partitionsNum, "The number of partitions must be positive.");
            }

            List<DataVector> data = DataPartitionCache.ReadDataFile(originalDataFile);

            // shuffle, not truly random, but sufficient for our purpose
            data = data.OrderBy(a => Guid.NewGuid()).ToList();
            string dataDirectory = Path.Combine(executionDirectory, Constants.DataDirectory);

            // clean things up first
            if (Directory.Exists(dataDirectory))
            {
                Directory.Delete(dataDirectory, true);
            }
            Directory.CreateDirectory(dataDirectory);

            int batchSize = data.Count / partitionsNum;

            // Vectors left over after the even split; spread one each over the first partitions
            // so that no data is silently dropped.
            int remainder = data.Count % partitionsNum;
            int start     = 0;

            for (int i = 0; i < partitionsNum; i++)
            {
                int    linesCount    = batchSize + (i < remainder ? 1 : 0);
                string partitionFile = Path.Combine(dataDirectory, i.ToString(CultureInfo.InvariantCulture));

                // The using block flushes and closes the writer (and the underlying stream).
                using (StreamWriter writer = new StreamWriter(File.OpenWrite(partitionFile)))
                {
                    for (int j = start; j < start + linesCount; j++)
                    {
                        writer.WriteLine(data[j].ToString());
                    }
                }

                start += linesCount;
            }
            return(InitializeCentroids(clustersNum, data, executionDirectory));
        }
Exemplo n.º 2
0
 /// <summary>
 /// Creates the task, storing the data partition, the number of clusters and the execution
 /// directory. If no centroids are set yet, they are loaded from the centroids file located
 /// in the execution directory.
 /// </summary>
 /// <param name="dataPartition">The data partition this task works on.</param>
 /// <param name="clustersNumber">Number of clusters (bound to the K configuration option).</param>
 /// <param name="executionDirectory">Directory that contains the centroids file.</param>
 public LegacyKMeansTask(
     DataPartitionCache dataPartition,
     [Parameter(Value = typeof(KMeansConfiguratioinOptions.K))] int clustersNumber,
     [Parameter(Value = typeof(KMeansConfiguratioinOptions.ExecutionDirectory))] string executionDirectory)
 {
     _kMeansExecutionDirectory = executionDirectory;
     _clustersNum              = clustersNumber;
     _dataPartition            = dataPartition;

     // Nothing more to do when centroids are already available.
     if (_centroids != null)
     {
         return;
     }

     _centroids = new Centroids(
         DataPartitionCache.ReadDataFile(
             Path.Combine(_kMeansExecutionDirectory, Constants.CentroidsFile)));
 }
Exemplo n.º 3
0
        /// <summary>
        /// Drives the clustering loop: loads the initial centroids from the centroids file,
        /// then repeatedly broadcasts the current centroids and reduces the results sent back
        /// until the loss stops changing. A STOP control message is broadcast when the loop ends.
        /// </summary>
        /// <param name="memento">Not used by this method.</param>
        /// <returns>Always <c>null</c>.</returns>
        /// <exception cref="InvalidOperationException">
        /// Thrown (after broadcasting STOP) when a reduced loss is larger than the previous one.
        /// </exception>
        public byte[] Call(byte[] memento)
        {
            // TODO: this belongs to dedicated data loader layer, will refactor once we have that
            _groupCommClient.Initialize();

            string centroidsPath = Path.Combine(_kMeansExecutionDirectory, Constants.CentroidsFile);
            _centroids = new Centroids(DataPartitionCache.ReadDataFile(centroidsPath));

            float previousLoss = float.MaxValue;

            while (true)
            {
                if (_isInitialIteration)
                {
                    // first pass: there is nothing to reduce yet, just announce the initial centroids
                    Logger.Log(Level.Info, "Broadcasting INITIAL centroids to all slave nodes: " + _centroids);
                    _isInitialIteration = false;
                }
                else
                {
                    ProcessedResults reduced = _meansReducerReceiver.Reduce();
                    _centroids = new Centroids(reduced.Means.Select(m => m.Mean).ToList());
                    Logger.Log(Level.Info, "Broadcasting new centroids to all slave nodes: " + _centroids);

                    float currentLoss = reduced.Loss;
                    string lossMessage = string.Format(CultureInfo.InvariantCulture, "The new loss value {0} at iteration {1} ", currentLoss, _iteration);
                    Logger.Log(Level.Info, lossMessage);

                    if (currentLoss > previousLoss)
                    {
                        // the loss went up: tell the slaves to stop before reporting the failure
                        _controlBroadcastSender.Send(ControlMessage.STOP);
                        string failure = string.Format(CultureInfo.InvariantCulture, "The new loss {0} is larger than previous loss {1}, while loss function must be monotonically decreasing across iterations", currentLoss, previousLoss);
                        throw new InvalidOperationException(failure);
                    }

                    if (currentLoss.Equals(previousLoss))
                    {
                        // unchanged loss means the clustering has converged
                        string converged = string.Format(CultureInfo.InvariantCulture, "KMeans clustering has converged with a loss value of {0} at iteration {1} ", currentLoss, _iteration);
                        Logger.Log(Level.Info, converged);
                        break;
                    }

                    previousLoss = currentLoss;
                }

                // ask the slaves to receive, then hand them the current centroids
                _controlBroadcastSender.Send(ControlMessage.RECEIVE);
                _dataBroadcastSender.Send(_centroids);
                _iteration++;
            }

            _controlBroadcastSender.Send(ControlMessage.STOP);
            return null;
        }
Exemplo n.º 4
0
 /// <summary>
 /// Creates the slave task: stores the data partition, the group communication client and the
 /// number of clusters, then looks up the K-Means communication group and obtains from it the
 /// centroids broadcast receiver, the means reduce sender and the control message broadcast
 /// receiver. The whole constructor body runs inside a Logger.LogFunction scope.
 /// </summary>
 /// <param name="dataPartition">The data partition this task works on.</param>
 /// <param name="clustersNumber">
 /// Number of clusters. NOTE(review): this is bound to the TotalNumEvaluators option rather
 /// than K — confirm that this is intended.
 /// </param>
 /// <param name="groupCommClient">Client used to obtain the communication group.</param>
 public KMeansSlaveTask(
     DataPartitionCache dataPartition,
     [Parameter(typeof(KMeansConfiguratioinOptions.TotalNumEvaluators))] int clustersNumber,
     IGroupCommClient groupCommClient)
 {
     using (Logger.LogFunction("KMeansSlaveTask::KMeansSlaveTask"))
     {
         // plain state handed in by the caller
         _dataPartition   = dataPartition;
         _groupCommClient = groupCommClient;
         _clustersNum     = clustersNumber;

         // resolve the communication group once, then fetch the three operators from it
         var group = groupCommClient.GetCommunicationGroup(Constants.KMeansCommunicationGroupName);
         _commGroup = group;

         _dataBroadcastReceiver = group.GetBroadcastReceiver<Centroids>(
             Constants.CentroidsBroadcastOperatorName);
         _partialMeansSender = group.GetReduceSender<ProcessedResults>(
             Constants.MeansReduceOperatorName);
         _controlBroadcastReceiver = group.GetBroadcastReceiver<ControlMessage>(
             Constants.ControlMessageBroadcastOperatorName);
     }
 }
Exemplo n.º 5
0
        /// <summary>
        /// Loads the centroids from the centroids file, labels the data partition with them,
        /// computes the partial means and writes them, one per line, to this partition's
        /// partial-mean file in the data directory.
        /// </summary>
        /// <param name="memento">Not used by this method.</param>
        /// <returns>Always <c>null</c>.</returns>
        public byte[] CallWithWritingToFileSystem(byte[] memento)
        {
            string centroidFile = Path.Combine(_kMeansExecutionDirectory, Constants.CentroidsFile);

            _centroids = new Centroids(DataPartitionCache.ReadDataFile(centroidFile));

            _dataPartition.LabelData(_centroids);
            _partialMeans = ComputePartialMeans();

            string partialMeansFile = Path.Combine(
                _kMeansExecutionDirectory,
                Constants.DataDirectory,
                Constants.PartialMeanFilePrefix + _dataPartition.Partition);

            // should be replaced with Group Communication
            // File.Create truncates an existing file. File.OpenWrite would keep the old bytes, so a
            // shorter result written over a previous one would leave stale trailing content behind.
            using (StreamWriter writer = new StreamWriter(File.Create(partialMeansFile)))
            {
                for (int i = 0; i < _partialMeans.Count; i++)
                {
                    writer.WriteLine(_partialMeans[i].ToString());
                }
                // no explicit Close: disposing the writer flushes and closes it
            }

            return(null);
        }