/// <summary>
/// Prepares the K-means driver: creates a per-run execution directory, shuffles the
/// input data into partitions, stores the injected collaborators and declares the
/// group-communication operators (two broadcasts and one reduce) on the default group.
/// </summary>
/// <param name="numPartitions">Number of data partitions; the driver tracks one evaluator more than this.</param>
/// <param name="groupCommDriver">Group-communication driver whose default group receives the operators.</param>
/// <param name="evaluatorRequestor">Requestor stored for later use by the driver.</param>
/// <param name="arguments">Command-line arguments; the first one is used as the data file.</param>
private KMeansDriverHandlers(
            [Parameter(typeof(NumPartitions))] int numPartitions,
            GroupCommDriver groupCommDriver,
            IEvaluatorRequestor evaluatorRequestor,
            CommandLineArguments arguments)
        {
            // Per-run directory: <current dir>/<K-means base dir>/<first four characters of a new GUID>.
            string runSuffix = Guid.NewGuid().ToString("N").Substring(0, 4);
            _executionDirectory = Path.Combine(
                Directory.GetCurrentDirectory(),
                Constants.KMeansExecutionBaseDirectory,
                runSuffix);

            // The first command-line argument is taken as the input data file.
            string inputFile = arguments.Arguments.First();
            DataVector.ShuffleDataAndGetInitialCentriods(inputFile, numPartitions, _clustersNumber, _executionDirectory);

            _groupCommDriver = groupCommDriver;
            _evaluatorRequestor = evaluatorRequestor;
            _totalEvaluators = numPartitions + 1;

            // Streaming codec configurations, one per message type.
            _centroidCodecConf = CodecToStreamingCodecConfiguration<Centroids>.Conf
                .Set(CodecToStreamingCodecConfiguration<Centroids>.Codec, GenericType<CentroidsCodec>.Class)
                .Build();
            _controlMessageCodecConf = CodecToStreamingCodecConfiguration<ControlMessage>.Conf
                .Set(CodecToStreamingCodecConfiguration<ControlMessage>.Codec, GenericType<ControlMessageCodec>.Class)
                .Build();
            _processedResultsCodecConf = CodecToStreamingCodecConfiguration<ProcessedResults>.Conf
                .Set(CodecToStreamingCodecConfiguration<ProcessedResults>.Codec, GenericType<ProcessedResultsCodec>.Class)
                .Build();

            // Pipeline data converters: every operator uses the default converter for its message type.
            IConfiguration centroidsConverterConf = PipelineDataConverterConfiguration<Centroids>.Conf
                .Set(PipelineDataConverterConfiguration<Centroids>.DataConverter, GenericType<DefaultPipelineDataConverter<Centroids>>.Class)
                .Build();
            IConfiguration controlMessageConverterConf = PipelineDataConverterConfiguration<ControlMessage>.Conf
                .Set(PipelineDataConverterConfiguration<ControlMessage>.DataConverter, GenericType<DefaultPipelineDataConverter<ControlMessage>>.Class)
                .Build();
            IConfiguration processedResultsConverterConf = PipelineDataConverterConfiguration<ProcessedResults>.Conf
                .Set(PipelineDataConverterConfiguration<ProcessedResults>.DataConverter, GenericType<DefaultPipelineDataConverter<ProcessedResults>>.Class)
                .Build();

            // The reduce operator combines results with KMeansMasterTask.AggregateMeans.
            IConfiguration meansReduceFunctionConf = ReduceFunctionConfiguration<ProcessedResults>.Conf
                .Set(ReduceFunctionConfiguration<ProcessedResults>.ReduceFunction, GenericType<KMeansMasterTask.AggregateMeans>.Class)
                .Build();

            // All three operators use a flat topology and name the master task id as their second argument.
            _commGroup = _groupCommDriver.DefaultGroup
                .AddBroadcast<Centroids>(Constants.CentroidsBroadcastOperatorName, Constants.MasterTaskId, TopologyTypes.Flat, centroidsConverterConf)
                .AddBroadcast<ControlMessage>(Constants.ControlMessageBroadcastOperatorName, Constants.MasterTaskId, TopologyTypes.Flat, controlMessageConverterConf)
                .AddReduce<ProcessedResults>(Constants.MeansReduceOperatorName, Constants.MasterTaskId, TopologyTypes.Flat, meansReduceFunctionConf, processedResultsConverterConf)
                .Build();

            _groupCommTaskStarter = new TaskStarter(_groupCommDriver, _totalEvaluators);
        }
Beispiel #2
0
        /// <summary>
        /// Returns a new vector whose components are this vector's components multiplied by
        /// <paramref name="scalar"/>. This vector is left unchanged.
        /// </summary>
        /// <param name="scalar">Factor applied to every component.</param>
        /// <returns>A scaled vector carrying the same label as this one.</returns>
        public DataVector MultiplyScalar(float scalar)
        {
            // Scale a private copy of the components. Handing Data itself to the constructor and
            // then mutating result.Data would also rescale this vector whenever the constructor
            // keeps the list it is given instead of copying it.
            List<float> scaled = new List<float>(Data);

            for (int i = 0; i < scaled.Count; i++)
            {
                scaled[i] *= scalar;
            }
            return new DataVector(scaled, Label);
        }
Beispiel #3
0
        /// <summary>
        /// Prepares the K-means driver: sets the driver identifier, creates a per-run execution
        /// directory, shuffles the input data into partitions, and declares the group-communication
        /// operators (two broadcasts and one reduce) on the default communication group.
        /// </summary>
        /// <param name="numPartitions">Number of data partitions; the driver tracks one evaluator more than this.</param>
        /// <param name="groupCommDriver">Group-communication driver whose default group receives the operators.</param>
        public KMeansDriverHandlers([Parameter(typeof(NumPartitions))] int numPartitions, GroupCommDriver groupCommDriver)
        {
            Identifier          = "KMeansDriverId";
            // Per-run directory: <current dir>/<K-means base dir>/<first four characters of a new GUID>.
            _executionDirectory = Path.Combine(Directory.GetCurrentDirectory(), Constants.KMeansExecutionBaseDirectory, Guid.NewGuid().ToString("N").Substring(0, 4));
            ISet <string> arguments = ClrHandlerHelper.GetCommandLineArguments();
            // Exactly one argument must start with "DataFile" (Single throws otherwise); its value is the
            // text between the first and second ':'.
            // NOTE(review): a value that itself contains ':' would be cut short by Split(':')[1] - confirm inputs never do.
            string        dataFile  = arguments.Single(a => a.StartsWith("DataFile", StringComparison.Ordinal)).Split(':')[1];

            // The data file is looked up under <current dir>/reef/global.
            DataVector.ShuffleDataAndGetInitialCentriods(
                Path.Combine(Directory.GetCurrentDirectory(), "reef", "global", dataFile),
                numPartitions,
                _clustersNumber,
                _executionDirectory);

            // One evaluator more than there are partitions.
            _totalEvaluators = numPartitions + 1;

            _groupCommDriver = groupCommDriver;

            // NOTE(review): the three codec configurations below start from CodecToStreamingCodecConfiguration<T>.Conf
            // but set CodecConfiguration<T>.Codec - confirm that parameter belongs to the module being configured.
            _centroidCodecConf = CodecToStreamingCodecConfiguration <Centroids> .Conf
                                 .Set(CodecConfiguration <Centroids> .Codec, GenericType <CentroidsCodec> .Class)
                                 .Build();

            // Default pipeline data converter for the centroids broadcast.
            IConfiguration dataConverterConfig1 = PipelineDataConverterConfiguration <Centroids> .Conf
                                                  .Set(PipelineDataConverterConfiguration <Centroids> .DataConverter, GenericType <DefaultPipelineDataConverter <Centroids> > .Class)
                                                  .Build();

            _controlMessageCodecConf = CodecToStreamingCodecConfiguration <ControlMessage> .Conf
                                       .Set(CodecConfiguration <ControlMessage> .Codec, GenericType <ControlMessageCodec> .Class)
                                       .Build();

            // Default pipeline data converter for the control-message broadcast.
            IConfiguration dataConverterConfig2 = PipelineDataConverterConfiguration <ControlMessage> .Conf
                                                  .Set(PipelineDataConverterConfiguration <ControlMessage> .DataConverter, GenericType <DefaultPipelineDataConverter <ControlMessage> > .Class)
                                                  .Build();

            _processedResultsCodecConf = CodecToStreamingCodecConfiguration <ProcessedResults> .Conf
                                         .Set(CodecConfiguration <ProcessedResults> .Codec, GenericType <ProcessedResultsCodec> .Class)
                                         .Build();

            // The reduce operator combines results with KMeansMasterTask.AggregateMeans.
            IConfiguration reduceFunctionConfig = ReduceFunctionConfiguration <ProcessedResults> .Conf
                                                  .Set(ReduceFunctionConfiguration <ProcessedResults> .ReduceFunction, GenericType <KMeansMasterTask.AggregateMeans> .Class)
                                                  .Build();

            // Default pipeline data converter for the means reduce.
            IConfiguration dataConverterConfig3 = PipelineDataConverterConfiguration <ProcessedResults> .Conf
                                                  .Set(PipelineDataConverterConfiguration <ProcessedResults> .DataConverter, GenericType <DefaultPipelineDataConverter <ProcessedResults> > .Class)
                                                  .Build();

            // All three operators use a flat topology and name the master task id as their second argument.
            _commGroup = _groupCommDriver.DefaultGroup
                         .AddBroadcast <Centroids>(Constants.CentroidsBroadcastOperatorName, Constants.MasterTaskId, TopologyTypes.Flat, dataConverterConfig1)
                         .AddBroadcast <ControlMessage>(Constants.ControlMessageBroadcastOperatorName, Constants.MasterTaskId, TopologyTypes.Flat, dataConverterConfig2)
                         .AddReduce <ProcessedResults>(Constants.MeansReduceOperatorName, Constants.MasterTaskId, TopologyTypes.Flat, reduceFunctionConfig, dataConverterConfig3)
                         .Build();

            _groupCommTaskStarter = new TaskStarter(_groupCommDriver, _totalEvaluators);

            // Runs last, after every field above has been assigned; its body is not visible here.
            CreateClassHierarchy();
        }
Beispiel #4
0
 /// <summary>
 /// Checks that <paramref name="other"/> can be combined component-wise with this vector.
 /// </summary>
 /// <param name="other">The second operand of the arithmetic operation.</param>
 /// <exception cref="ArgumentNullException">The operand or its data is null.</exception>
 /// <exception cref="InvalidOperationException">The two vectors have a different number of components.</exception>
 private void VectorsArithmeticPrecondition(DataVector other)
 {
     // A missing operand and an operand without data are reported the same way.
     if (other == null)
     {
         throw new ArgumentNullException("other");
     }
     if (other.Data == null)
     {
         throw new ArgumentNullException("other");
     }

     int ownLength = Data.Count;
     int otherLength = other.Data.Count;
     if (ownLength != otherLength)
     {
         throw new InvalidOperationException("vector dimentionality mismatch");
     }
 }
Beispiel #5
0
        /// <summary>
        /// Squared Euclidean distance between this vector and <paramref name="other"/>.
        /// Deliberately naive: no data normalization, no overflow protection, no tuning for speed.
        /// </summary>
        /// <param name="other">Vector to measure against; must have the same number of components.</param>
        /// <returns>The sum of the squared component-wise differences.</returns>
        public float DistanceTo(DataVector other)
        {
            VectorsArithmeticPrecondition(other);

            float sumOfSquares = 0;
            int index = 0;
            while (index < Data.Count)
            {
                float delta = Data[index] - other.Data[index];
                sumOfSquares += delta * delta;
                index++;
            }
            return sumOfSquares;
        }
Beispiel #6
0
        /// <summary>
        /// Adds up, over all centroids, the distance between a centroid and the labeled
        /// vectors that share its label.
        /// </summary>
        /// <param name="centroids">Centroids whose Points are visited in order.</param>
        /// <param name="labeledData">Data vectors, each carrying a cluster label.</param>
        /// <returns>The accumulated distance.</returns>
        private float ComputeLossFunction(Centroids centroids, List <DataVector> labeledData)
        {
            float totalLoss = 0;

            foreach (DataVector center in centroids.Points)
            {
                // Members of this centroid's cluster, materialized before measuring.
                List<DataVector> members =
                    (from vector in labeledData
                     where vector.Label == center.Label
                     select vector).ToList();

                totalLoss += center.DistanceTo(members);
            }
            return totalLoss;
        }
Beispiel #7
0
 /// <summary>
 /// Averages a non-empty list of vectors component-wise, ignoring their labels.
 /// </summary>
 /// <param name="vectors">Vectors to average; the first one determines the dimension.</param>
 /// <returns>The accumulated sum normalized by the number of vectors.</returns>
 /// <exception cref="ArgumentNullException">The list is null or empty.</exception>
 public static DataVector Mean(List<DataVector> vectors)
 {
     bool nothingToAverage = vectors == null || vectors.Count == 0;
     if (nothingToAverage)
     {
         throw new ArgumentNullException("vectors");
     }

     // Start from a fresh vector of the right dimension and fold every input into it.
     DataVector runningSum = new DataVector(vectors[0].Dimension);
     foreach (DataVector vector in vectors)
     {
         runningSum = runningSum.Add(vector, ignoreLabel: true);
     }

     return runningSum.Normalize(vectors.Count);
 }
Beispiel #8
0
 /// <summary>
 /// Parses a PartialMean from text of the form "&lt;vector&gt;#&lt;size&gt;": the part before '#'
 /// is handed to DataVector.FromString and the part after it is parsed as an invariant-culture integer.
 /// </summary>
 /// <param name="str">Serialized partial mean; must contain exactly one '#'.</param>
 /// <returns>The deserialized partial mean.</returns>
 /// <exception cref="ArgumentException">The input is null, blank, or does not split into exactly two parts.</exception>
 /// <exception cref="FormatException">The size part is not a valid integer.</exception>
 public static PartialMean FromString(string str)
 {
     if (string.IsNullOrWhiteSpace(str))
     {
         // Use the (message, paramName) overload: the single-argument constructor would
         // treat "str" as the exception message instead of as the parameter name.
         throw new ArgumentException("Cannot deserialize PartialMean from a null or blank string", "str");
     }

     // Split never returns null, so only the number of parts needs checking.
     string[] parts = str.Split('#');
     if (parts.Length != 2)
     {
         throw new ArgumentException("Cannot deserialize PartialMean from string " + str);
     }

     DataVector mean = DataVector.FromString(parts[0]);
     int size = int.Parse(parts[1], CultureInfo.InvariantCulture);
     return new PartialMean(mean, size);
 }
Beispiel #9
0
        /// <summary>
        /// Returns a new vector whose components are this vector's components divided by
        /// <paramref name="normalizationFactor"/>. This vector is left unchanged.
        /// </summary>
        /// <param name="normalizationFactor">Non-zero divisor applied to every component.</param>
        /// <returns>A normalized vector carrying the same label as this one.</returns>
        /// <exception cref="InvalidOperationException">The factor is zero.</exception>
        public DataVector Normalize(float normalizationFactor)
        {
            if (normalizationFactor == 0)
            {
                throw new InvalidOperationException("normalizationFactor is zero");
            }

            // Divide a private copy of the components. Handing Data itself to the constructor and
            // then mutating result.Data would also rescale this vector whenever the constructor
            // keeps the list it is given instead of copying it.
            List<float> normalized = new List<float>(Data);

            for (int i = 0; i < normalized.Count; i++)
            {
                normalized[i] /= normalizationFactor;
            }
            return new DataVector(normalized, Label);
        }
Beispiel #10
0
        /// <summary>
        /// Component-wise average of a non-empty list of vectors; labels are ignored while summing.
        /// </summary>
        /// <param name="vectors">Vectors to average; the first one determines the dimension.</param>
        /// <returns>The accumulated sum normalized by the number of vectors.</returns>
        /// <exception cref="ArgumentNullException">The list is null or empty.</exception>
        public static DataVector Mean(List <DataVector> vectors)
        {
            if (vectors == null)
            {
                throw new ArgumentNullException("vectors");
            }
            if (vectors.Count == 0)
            {
                throw new ArgumentNullException("vectors");
            }

            DataVector accumulated = new DataVector(vectors[0].Dimension);
            int position = 0;
            while (position < vectors.Count)
            {
                accumulated = accumulated.Add(vectors[position], ignoreLabel: true);
                position++;
            }

            DataVector average = accumulated.Normalize(vectors.Count);
            return average;
        }
        /// <summary>
        /// For every cluster label from 0 to _clustersNum - 1, computes the mean of the vectors in
        /// the local data partition that carry that label, paired with the number of such vectors.
        /// </summary>
        /// <returns>One PartialMean per cluster, stored at the index equal to its label.</returns>
        public List <PartialMean> ComputePartialMeans()
        {
            List<PartialMean> partialMeans = new PartialMean[_clustersNum].ToList();

            for (int i = 0; i < _clustersNum; i++)
            {
                List<DataVector> slices = _dataPartition.DataVectors.Where(d => d.Label == i).ToList();

                // Fallback for a cluster with no local members: a freshly constructed vector of the
                // partition's dimension, reported together with a size of 0.
                DataVector average = new DataVector(_dataPartition.DataVectors[0].Dimension);

                // Any non-empty cluster gets a real mean. The previous "> 1" test left a cluster
                // with exactly one member on the fallback vector while still reporting size 1.
                if (slices.Count > 0)
                {
                    average = DataVector.Mean(slices);
                }
                average.Label   = i;
                partialMeans[i] = new PartialMean(average, slices.Count);
            }
            return partialMeans;
        }
Beispiel #12
0
        /// <summary>
        /// Component-wise sum of this vector and <paramref name="other"/>, returned as a new vector.
        /// </summary>
        /// <param name="other">Vector to add; must have the same number of components.</param>
        /// <param name="ignoreLabel">
        /// When false, both vectors must share a label and the result keeps it;
        /// when true, labels are not compared and the result is labeled -1.
        /// </param>
        /// <returns>A new vector holding the sums.</returns>
        /// <exception cref="InvalidOperationException">Labels differ and <paramref name="ignoreLabel"/> is false.</exception>
        public DataVector Add(DataVector other, bool ignoreLabel = false)
        {
            VectorsArithmeticPrecondition(other);

            bool labelsMustMatch = !ignoreLabel;
            if (labelsMustMatch && Label != other.Label)
            {
                throw new InvalidOperationException("by default cannot apply addition operation on data of different labels.");
            }

            // Work on a copy so this vector's own components are untouched.
            List<float> totals = new List<float>(Data);
            int index = 0;
            while (index < Data.Count)
            {
                totals[index] = totals[index] + other.Data[index];
                index++;
            }

            int resultLabel;
            if (ignoreLabel)
            {
                resultLabel = -1;
            }
            else
            {
                resultLabel = Label;
            }
            return new DataVector(totals, resultLabel);
        }
Beispiel #13
0
        /// <summary>
        /// For every cluster label from 0 to _clustersNum - 1, computes the mean of the vectors in
        /// the local data partition that carry that label, paired with the number of such vectors.
        /// Each partial mean is logged as it is stored.
        /// </summary>
        /// <returns>One PartialMean per cluster, stored at the index equal to its label.</returns>
        private List <PartialMean> ComputePartialMeans()
        {
            Logger.Log(Level.Verbose, "cluster number " + _clustersNum);
            List<PartialMean> partialMeans = new PartialMean[_clustersNum].ToList();

            for (int i = 0; i < _clustersNum; i++)
            {
                List<DataVector> slices = _dataPartition.DataVectors.Where(d => d.Label == i).ToList();

                // Fallback for a cluster with no local members: a freshly constructed vector of the
                // partition's dimension, reported together with a size of 0.
                DataVector average = new DataVector(_dataPartition.DataVectors[0].Dimension);

                // Any non-empty cluster gets a real mean. The previous "> 1" test left a cluster
                // with exactly one member on the fallback vector while still reporting size 1.
                if (slices.Count > 0)
                {
                    average = DataVector.Mean(slices);
                }
                average.Label   = i;
                partialMeans[i] = new PartialMean(average, slices.Count);
                Logger.Log(Level.Info, "Adding to partial means list: " + partialMeans[i]);
            }
            return partialMeans;
        }
        /// <summary>
        /// Reads the file at <paramref name="path"/> and converts every non-blank line into a
        /// DataVector via DataVector.FromString.
        /// </summary>
        /// <param name="path">Path of the data file; opened read-only with shared read access.</param>
        /// <param name="seperator">
        /// Currently unused: lines are parsed by DataVector.FromString(line) without it.
        /// Kept so existing callers continue to compile.
        /// </param>
        /// <returns>The parsed vectors in file order.</returns>
        // NOTE(review): the original comment said the vectors are marked as unlabeled (not associated
        // with any centroid); that depends on DataVector.FromString, which is not visible here.
        public static List <DataVector> ReadDataFile(string path, char seperator = ',')
        {
            List<DataVector> data = new List<DataVector>();

            // Both the stream and the reader are owned by using statements, so the file handle is
            // released even if constructing the reader or parsing a line throws.
            using (FileStream file = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read))
            using (StreamReader reader = new StreamReader(file))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    if (!string.IsNullOrWhiteSpace(line))
                    {
                        data.Add(DataVector.FromString(line));
                    }
                }
            }

            return data;
        }
Beispiel #15
0
 /// <summary>
 /// Computes the squared Euclidean distance from this vector to <paramref name="other"/>.
 /// This is a naive implementation: it does not normalize the data, guard against overflow,
 /// or try to be efficient.
 /// </summary>
 /// <param name="other">Vector to measure against; must have the same number of components.</param>
 /// <returns>The sum of the squared component-wise differences.</returns>
 public float DistanceTo(DataVector other)
 {
     VectorsArithmeticPrecondition(other);

     float squaredDistance = 0;
     for (int position = 0, length = Data.Count; position < length; position++)
     {
         float gap = Data[position] - other.Data[position];
         squaredDistance += gap * gap;
     }

     return squaredDistance;
 }
Beispiel #16
0
 /// <summary>
 /// Adds <paramref name="other"/> to this vector component by component and returns the result
 /// as a new vector.
 /// </summary>
 /// <param name="other">Vector to add; must have the same number of components.</param>
 /// <param name="ignoreLabel">
 /// When false, both vectors must share a label and the result keeps it;
 /// when true, labels are not compared and the result is labeled -1.
 /// </param>
 /// <returns>A new vector holding the sums.</returns>
 /// <exception cref="InvalidOperationException">Labels differ and <paramref name="ignoreLabel"/> is false.</exception>
 public DataVector Add(DataVector other, bool ignoreLabel = false)
 {
     VectorsArithmeticPrecondition(other);

     if (!(ignoreLabel || Label == other.Label))
     {
         throw new InvalidOperationException("by default cannot apply addition operation on data of different labels.");
     }

     // Start from a copy of this vector's components and add the other operand's onto it;
     // the precondition above guarantees both sides have the same number of components.
     List<float> sum = new List<float>(Data);
     int slot = 0;
     foreach (float addend in other.Data)
     {
         sum[slot] += addend;
         slot++;
     }

     if (ignoreLabel)
     {
         return new DataVector(sum, -1);
     }
     return new DataVector(sum, Label);
 }
Beispiel #17
0
        /// <summary>
        /// For every cluster label from 0 to _clustersNum - 1, computes the mean of the vectors in
        /// the local data partition that carry that label, paired with the number of such vectors.
        /// Each partial mean is logged as it is stored.
        /// </summary>
        /// <returns>One PartialMean per cluster, stored at the index equal to its label.</returns>
        private List<PartialMean> ComputePartialMeans()
        {
            Logger.Log(Level.Verbose, "cluster number " + _clustersNum);
            List<PartialMean> partialMeans = new PartialMean[_clustersNum].ToList();
            for (int i = 0; i < _clustersNum; i++)
            {
                List<DataVector> slices = _dataPartition.DataVectors.Where(d => d.Label == i).ToList();

                // Fallback for a cluster with no local members: a freshly constructed vector of the
                // partition's dimension, reported together with a size of 0.
                DataVector average = new DataVector(_dataPartition.DataVectors[0].Dimension);

                // Any non-empty cluster gets a real mean. The previous "> 1" test left a cluster
                // with exactly one member on the fallback vector while still reporting size 1.
                if (slices.Count > 0)
                {
                    average = DataVector.Mean(slices);
                }
                average.Label = i;
                partialMeans[i] = new PartialMean(average, slices.Count);
                Logger.Log(Level.Info, "Adding to partial means list: " + partialMeans[i]);
            }
            return partialMeans;
        }
Beispiel #18
0
 /// <summary>
 /// Creates a partial mean from a vector and a size.
 /// </summary>
 /// <param name="vector">Stored as <c>Mean</c>.</param>
 /// <param name="size">Stored as <c>Size</c>.</param>
 public PartialMean(DataVector vector, int size)
 {
     this.Mean = vector;
     this.Size = size;
 }
Beispiel #19
0
 /// <summary>
 /// Returns a new vector whose components are this vector's components divided by
 /// <paramref name="normalizationFactor"/>. This vector is left unchanged.
 /// </summary>
 /// <param name="normalizationFactor">Non-zero divisor applied to every component.</param>
 /// <returns>A normalized vector carrying the same label as this one.</returns>
 /// <exception cref="InvalidOperationException">The factor is zero.</exception>
 public DataVector Normalize(float normalizationFactor)
 {
     if (normalizationFactor == 0)
     {
         throw new InvalidOperationException("normalizationFactor is zero");
     }

     // Divide a private copy of the components. Handing Data itself to the constructor and
     // then mutating result.Data would also rescale this vector whenever the constructor
     // keeps the list it is given instead of copying it.
     List<float> normalized = new List<float>(Data);
     for (int i = 0; i < normalized.Count; i++)
     {
         normalized[i] /= normalizationFactor;
     }
     return new DataVector(normalized, Label);
 }
Beispiel #20
0
 /// <summary>
 /// Returns a new vector whose components are this vector's components multiplied by
 /// <paramref name="scalar"/>. This vector is left unchanged.
 /// </summary>
 /// <param name="scalar">Factor applied to every component.</param>
 /// <returns>A scaled vector carrying the same label as this one.</returns>
 public DataVector MultiplyScalar(float scalar)
 {
     // Scale a private copy of the components. Handing Data itself to the constructor and
     // then mutating result.Data would also rescale this vector whenever the constructor
     // keeps the list it is given instead of copying it.
     List<float> scaled = new List<float>(Data);
     for (int i = 0; i < scaled.Count; i++)
     {
         scaled[i] *= scalar;
     }
     return new DataVector(scaled, Label);
 }
Beispiel #21
0
 /// <summary>
 /// Guard shared by the vector arithmetic operations: the operand must exist, must have
 /// data, and must have as many components as this vector.
 /// </summary>
 /// <param name="other">The second operand of the arithmetic operation.</param>
 /// <exception cref="ArgumentNullException">The operand or its data is null.</exception>
 /// <exception cref="InvalidOperationException">The component counts differ.</exception>
 private void VectorsArithmeticPrecondition(DataVector other)
 {
     bool operandMissing = other == null || other.Data == null;
     if (operandMissing)
     {
         throw new ArgumentNullException("other");
     }

     bool sameLength = Data.Count == other.Data.Count;
     if (!sameLength)
     {
         throw new InvalidOperationException("vector dimentionality mismatch");
     }
 }
Beispiel #22
0
 /// <summary>
 /// Initializes a partial mean with the given vector and size.
 /// </summary>
 /// <param name="vector">Value assigned to <c>Mean</c>.</param>
 /// <param name="size">Value assigned to <c>Size</c>.</param>
 public PartialMean(DataVector vector, int size)
 {
     this.Mean = vector;
     this.Size = size;
 }