/// <summary>
/// Wires up the KMeans driver: shuffles the input data into per-partition files,
/// prepares codec / pipeline / reduce configurations and builds the group
/// communication topology (two broadcasts plus one reduce rooted at the master task).
/// </summary>
/// <param name="numPartitions">Number of data partitions; one worker evaluator each.</param>
/// <param name="groupCommDriver">Group communication driver used to build the topology.</param>
/// <param name="evaluatorRequestor">Used later to request the evaluators.</param>
/// <param name="arguments">Command line arguments; the first one is the input data file.</param>
private KMeansDriverHandlers(
    [Parameter(typeof(NumPartitions))] int numPartitions,
    GroupCommDriver groupCommDriver,
    IEvaluatorRequestor evaluatorRequestor,
    CommandLineArguments arguments)
{
    // Per-run scratch directory; short random suffix avoids collisions between runs.
    _executionDirectory = Path.Combine(
        Directory.GetCurrentDirectory(),
        Constants.KMeansExecutionBaseDirectory,
        Guid.NewGuid().ToString("N").Substring(0, 4));

    string inputDataFile = arguments.Arguments.First();
    DataVector.ShuffleDataAndGetInitialCentriods(
        inputDataFile,
        numPartitions,
        _clustersNumber,
        _executionDirectory);

    // One evaluator per partition plus one for the master task.
    _totalEvaluators = numPartitions + 1;
    _groupCommDriver = groupCommDriver;
    _evaluatorRequestor = evaluatorRequestor;

    // Streaming codec for broadcasting the current centroids.
    _centroidCodecConf = CodecToStreamingCodecConfiguration<Centroids>.Conf
        .Set(CodecToStreamingCodecConfiguration<Centroids>.Codec, GenericType<CentroidsCodec>.Class)
        .Build();
    IConfiguration centroidsPipelineConf = PipelineDataConverterConfiguration<Centroids>.Conf
        .Set(PipelineDataConverterConfiguration<Centroids>.DataConverter,
            GenericType<DefaultPipelineDataConverter<Centroids>>.Class)
        .Build();

    // Streaming codec for the per-iteration control messages.
    _controlMessageCodecConf = CodecToStreamingCodecConfiguration<ControlMessage>.Conf
        .Set(CodecToStreamingCodecConfiguration<ControlMessage>.Codec, GenericType<ControlMessageCodec>.Class)
        .Build();
    IConfiguration controlMessagePipelineConf = PipelineDataConverterConfiguration<ControlMessage>.Conf
        .Set(PipelineDataConverterConfiguration<ControlMessage>.DataConverter,
            GenericType<DefaultPipelineDataConverter<ControlMessage>>.Class)
        .Build();

    // Streaming codec plus reduce function for aggregating partial means on the master.
    _processedResultsCodecConf = CodecToStreamingCodecConfiguration<ProcessedResults>.Conf
        .Set(CodecToStreamingCodecConfiguration<ProcessedResults>.Codec, GenericType<ProcessedResultsCodec>.Class)
        .Build();
    IConfiguration meansReduceFunctionConf = ReduceFunctionConfiguration<ProcessedResults>.Conf
        .Set(ReduceFunctionConfiguration<ProcessedResults>.ReduceFunction,
            GenericType<KMeansMasterTask.AggregateMeans>.Class)
        .Build();
    IConfiguration processedResultsPipelineConf = PipelineDataConverterConfiguration<ProcessedResults>.Conf
        .Set(PipelineDataConverterConfiguration<ProcessedResults>.DataConverter,
            GenericType<DefaultPipelineDataConverter<ProcessedResults>>.Class)
        .Build();

    // Flat topology rooted at the master: broadcast centroids and control messages
    // down, reduce the partial means back up.
    _commGroup = _groupCommDriver.DefaultGroup
        .AddBroadcast<Centroids>(
            Constants.CentroidsBroadcastOperatorName,
            Constants.MasterTaskId,
            TopologyTypes.Flat,
            centroidsPipelineConf)
        .AddBroadcast<ControlMessage>(
            Constants.ControlMessageBroadcastOperatorName,
            Constants.MasterTaskId,
            TopologyTypes.Flat,
            controlMessagePipelineConf)
        .AddReduce<ProcessedResults>(
            Constants.MeansReduceOperatorName,
            Constants.MasterTaskId,
            TopologyTypes.Flat,
            meansReduceFunctionConf,
            processedResultsPipelineConf)
        .Build();

    _groupCommTaskStarter = new TaskStarter(_groupCommDriver, _totalEvaluators);
}
/// <summary>
/// Returns a new vector equal to this one with every component multiplied
/// by <paramref name="scalar"/>. This vector is left unchanged.
/// </summary>
/// <param name="scalar">The multiplier applied to each component.</param>
/// <returns>A new, scaled <c>DataVector</c> carrying the same label.</returns>
public DataVector MultiplyScalar(float scalar)
{
    // Copy the backing list first, as Add() does: passing Data straight to the
    // constructor risks aliasing it, which would scale this vector in place.
    // (If the constructor already copies, the extra copy is harmless.)
    DataVector result = new DataVector(new List<float>(Data), Label);
    for (int i = 0; i < Data.Count; i++)
    {
        result.Data[i] *= scalar;
    }
    return result;
}
/// <summary>
/// Configures the KMeans driver: locates the input data file from the command line,
/// shuffles it into per-partition files, prepares codec / pipeline / reduce
/// configurations and builds the group communication topology.
/// </summary>
/// <param name="numPartitions">Number of data partitions; one worker evaluator each.</param>
/// <param name="groupCommDriver">Group communication driver used to build the topology.</param>
public KMeansDriverHandlers([Parameter(typeof(NumPartitions))] int numPartitions, GroupCommDriver groupCommDriver)
{
    Identifier = "KMeansDriverId";

    // Per-run scratch directory; short random suffix avoids collisions between runs.
    _executionDirectory = Path.Combine(
        Directory.GetCurrentDirectory(),
        Constants.KMeansExecutionBaseDirectory,
        Guid.NewGuid().ToString("N").Substring(0, 4));

    ISet<string> arguments = ClrHandlerHelper.GetCommandLineArguments();

    // Fix: split at the first ':' only, so "DataFile:" values that themselves contain
    // colons (e.g. Windows paths like C:\data\input.txt) are not truncated.
    string dataFile = arguments
        .Single(a => a.StartsWith("DataFile", StringComparison.Ordinal))
        .Split(new[] { ':' }, 2)[1];

    DataVector.ShuffleDataAndGetInitialCentriods(
        Path.Combine(Directory.GetCurrentDirectory(), "reef", "global", dataFile),
        numPartitions,
        _clustersNumber,
        _executionDirectory);

    // One evaluator per partition plus one for the master task.
    _totalEvaluators = numPartitions + 1;
    _groupCommDriver = groupCommDriver;

    // Consistency fix: set the Codec parameter declared by
    // CodecToStreamingCodecConfiguration (as the sibling constructor does) —
    // CodecConfiguration<T>.Codec belongs to a different configuration module
    // and is not a parameter of the Conf being built here.
    _centroidCodecConf = CodecToStreamingCodecConfiguration<Centroids>.Conf
        .Set(CodecToStreamingCodecConfiguration<Centroids>.Codec, GenericType<CentroidsCodec>.Class)
        .Build();
    IConfiguration dataConverterConfig1 = PipelineDataConverterConfiguration<Centroids>.Conf
        .Set(PipelineDataConverterConfiguration<Centroids>.DataConverter,
            GenericType<DefaultPipelineDataConverter<Centroids>>.Class)
        .Build();

    _controlMessageCodecConf = CodecToStreamingCodecConfiguration<ControlMessage>.Conf
        .Set(CodecToStreamingCodecConfiguration<ControlMessage>.Codec, GenericType<ControlMessageCodec>.Class)
        .Build();
    IConfiguration dataConverterConfig2 = PipelineDataConverterConfiguration<ControlMessage>.Conf
        .Set(PipelineDataConverterConfiguration<ControlMessage>.DataConverter,
            GenericType<DefaultPipelineDataConverter<ControlMessage>>.Class)
        .Build();

    _processedResultsCodecConf = CodecToStreamingCodecConfiguration<ProcessedResults>.Conf
        .Set(CodecToStreamingCodecConfiguration<ProcessedResults>.Codec, GenericType<ProcessedResultsCodec>.Class)
        .Build();
    IConfiguration reduceFunctionConfig = ReduceFunctionConfiguration<ProcessedResults>.Conf
        .Set(ReduceFunctionConfiguration<ProcessedResults>.ReduceFunction,
            GenericType<KMeansMasterTask.AggregateMeans>.Class)
        .Build();
    IConfiguration dataConverterConfig3 = PipelineDataConverterConfiguration<ProcessedResults>.Conf
        .Set(PipelineDataConverterConfiguration<ProcessedResults>.DataConverter,
            GenericType<DefaultPipelineDataConverter<ProcessedResults>>.Class)
        .Build();

    // Flat topology rooted at the master: broadcast centroids and control messages
    // down, reduce the partial means back up.
    _commGroup = _groupCommDriver.DefaultGroup
        .AddBroadcast<Centroids>(
            Constants.CentroidsBroadcastOperatorName,
            Constants.MasterTaskId,
            TopologyTypes.Flat,
            dataConverterConfig1)
        .AddBroadcast<ControlMessage>(
            Constants.ControlMessageBroadcastOperatorName,
            Constants.MasterTaskId,
            TopologyTypes.Flat,
            dataConverterConfig2)
        .AddReduce<ProcessedResults>(
            Constants.MeansReduceOperatorName,
            Constants.MasterTaskId,
            TopologyTypes.Flat,
            reduceFunctionConfig,
            dataConverterConfig3)
        .Build();

    _groupCommTaskStarter = new TaskStarter(_groupCommDriver, _totalEvaluators);

    CreateClassHierarchy();
}
/// <summary>
/// Guard for element-wise vector operations: rejects a null operand and
/// operands whose dimensionality differs from this vector's.
/// </summary>
/// <param name="other">The other operand of the arithmetic operation.</param>
/// <exception cref="ArgumentNullException">If <paramref name="other"/> or its data is null.</exception>
/// <exception cref="InvalidOperationException">If the dimensions do not match.</exception>
private void VectorsArithmeticPrecondition(DataVector other)
{
    if (other == null || other.Data == null)
    {
        // nameof keeps the parameter name refactor-safe.
        throw new ArgumentNullException(nameof(other));
    }
    if (Data.Count != other.Data.Count)
    {
        // Fixed typo in the original message ("dimentionality").
        throw new InvalidOperationException("vector dimensionality mismatch");
    }
}
// Squared Euclidean distance between this vector and "other".
// Naive implementation: no data normalization or overflow handling,
// and not tuned for efficiency.
public float DistanceTo(DataVector other)
{
    VectorsArithmeticPrecondition(other);
    float total = 0;
    for (int i = 0; i < Data.Count; i++)
    {
        float delta = Data[i] - other.Data[i];
        total += delta * delta;
    }
    return total;
}
// Total loss: for each centroid, sum the (squared-distance) cost of the data
// points currently labeled with that centroid's cluster.
private float ComputeLossFunction(Centroids centroids, List<DataVector> labeledData)
{
    float loss = 0;
    foreach (DataVector centroid in centroids.Points)
    {
        List<DataVector> members = labeledData.Where(v => v.Label == centroid.Label).ToList();
        loss += centroid.DistanceTo(members);
    }
    return loss;
}
/// <summary>
/// Component-wise mean of a non-empty list of vectors (labels are ignored
/// during accumulation).
/// </summary>
/// <param name="vectors">Vectors to average; must be non-null and non-empty.</param>
/// <returns>The mean vector.</returns>
public static DataVector Mean(List<DataVector> vectors)
{
    // NOTE(review): an empty list also raises ArgumentNullException here; kept
    // as-is for caller compatibility, though ArgumentException would be more precise.
    if (vectors == null || vectors.Count == 0)
    {
        throw new ArgumentNullException("vectors");
    }
    DataVector accumulator = new DataVector(vectors[0].Dimension);
    foreach (DataVector vector in vectors)
    {
        accumulator = accumulator.Add(vector, ignoreLabel: true);
    }
    return accumulator.Normalize(vectors.Count);
}
/// <summary>
/// Parses a <c>PartialMean</c> from its serialized form
/// "&lt;vector&gt;#&lt;size&gt;" (the inverse of the string produced elsewhere).
/// </summary>
/// <param name="str">Serialized partial mean.</param>
/// <returns>The deserialized <c>PartialMean</c>.</returns>
/// <exception cref="ArgumentException">If the input is blank or not in the expected two-field form.</exception>
public static PartialMean FromString(string str)
{
    if (string.IsNullOrWhiteSpace(str))
    {
        throw new ArgumentException("str");
    }
    // string.Split never returns null, so only the field count needs checking
    // (the original also had a dead "parts == null" test).
    string[] parts = str.Split('#');
    if (parts.Length != 2)
    {
        throw new ArgumentException("Cannot deserialize PartialMean from string " + str);
    }
    // InvariantCulture: the size field is machine-written, not user locale text.
    return new PartialMean(DataVector.FromString(parts[0]), int.Parse(parts[1], CultureInfo.InvariantCulture));
}
/// <summary>
/// Returns a new vector with every component divided by
/// <paramref name="normalizationFactor"/>. This vector is left unchanged.
/// </summary>
/// <param name="normalizationFactor">Divisor; must be non-zero.</param>
/// <returns>A new, normalized <c>DataVector</c> carrying the same label.</returns>
/// <exception cref="InvalidOperationException">If the factor is zero.</exception>
public DataVector Normalize(float normalizationFactor)
{
    if (normalizationFactor == 0)
    {
        throw new InvalidOperationException("normalizationFactor is zero");
    }
    // Copy the backing list before scaling, mirroring Add(): constructing with the
    // existing list risks aliasing this vector's data and dividing it in place.
    // (If the constructor already copies, the extra copy is harmless.)
    DataVector result = new DataVector(new List<float>(Data), Label);
    for (int i = 0; i < Data.Count; i++)
    {
        result.Data[i] /= normalizationFactor;
    }
    return result;
}
/// <summary>
/// Component-wise mean of a non-empty vector list; labels are ignored
/// while summing.
/// </summary>
/// <param name="vectors">Vectors to average; must be non-null and non-empty.</param>
/// <returns>The mean vector.</returns>
public static DataVector Mean(List<DataVector> vectors)
{
    if (vectors == null || vectors.Count == 0)
    {
        throw new ArgumentNullException("vectors");
    }
    // Fold all vectors into a zero vector of the right dimension, then divide.
    DataVector zero = new DataVector(vectors[0].Dimension);
    DataVector total = vectors.Aggregate(zero, (acc, v) => acc.Add(v, ignoreLabel: true));
    return total.Normalize(vectors.Count);
}
/// <summary>
/// Computes, for every cluster, the mean of this partition's points labeled with
/// that cluster together with the count of contributing points.
/// </summary>
/// <returns>One <c>PartialMean</c> per cluster, indexed by cluster label.</returns>
public List<PartialMean> ComputePartialMeans()
{
    List<PartialMean> partialMeans = new PartialMean[_clustersNum].ToList();
    for (int i = 0; i < _clustersNum; i++)
    {
        List<DataVector> slices = _dataPartition.DataVectors.Where(d => d.Label == i).ToList();
        // Empty clusters report a zero vector with Size = 0.
        DataVector average = new DataVector(_dataPartition.DataVectors[0].Dimension);
        // Bug fix: the original tested "slices.Count > 1", so a singleton cluster
        // reported the zero vector with Size = 1, skewing the global aggregate.
        if (slices.Count > 0)
        {
            average = DataVector.Mean(slices);
        }
        average.Label = i;
        partialMeans[i] = new PartialMean(average, slices.Count);
    }
    return partialMeans;
}
/// <summary>
/// Component-wise addition. Returns a new vector; neither operand is modified.
/// </summary>
/// <param name="other">Vector to add; must have the same dimension.</param>
/// <param name="ignoreLabel">
/// When false (the default), both vectors must carry the same label and the result
/// keeps it; when true, labels are ignored and the result is unlabeled (-1).
/// </param>
/// <returns>The element-wise sum.</returns>
public DataVector Add(DataVector other, bool ignoreLabel = false)
{
    VectorsArithmeticPrecondition(other);
    if (!ignoreLabel && Label != other.Label)
    {
        throw new InvalidOperationException("by default cannot apply addition operation on data of different labels.");
    }
    // Work on a copy so this vector's data is never mutated.
    List<float> summed = new List<float>(Data);
    for (int i = 0; i < summed.Count; i++)
    {
        summed[i] += other.Data[i];
    }
    return new DataVector(summed, ignoreLabel ? -1 : Label);
}
/// <summary>
/// Computes, for every cluster, the mean of this partition's points labeled with
/// that cluster together with the count of contributing points, logging each result.
/// </summary>
/// <returns>One <c>PartialMean</c> per cluster, indexed by cluster label.</returns>
private List<PartialMean> ComputePartialMeans()
{
    Logger.Log(Level.Verbose, "cluster number " + _clustersNum);
    List<PartialMean> partialMeans = new PartialMean[_clustersNum].ToList();
    for (int i = 0; i < _clustersNum; i++)
    {
        List<DataVector> slices = _dataPartition.DataVectors.Where(d => d.Label == i).ToList();
        // Empty clusters report a zero vector with Size = 0.
        DataVector average = new DataVector(_dataPartition.DataVectors[0].Dimension);
        // Bug fix: the original tested "slices.Count > 1", so a singleton cluster
        // reported the zero vector with Size = 1, skewing the global aggregate.
        if (slices.Count > 0)
        {
            average = DataVector.Mean(slices);
        }
        average.Label = i;
        partialMeans[i] = new PartialMean(average, slices.Count);
        Logger.Log(Level.Info, "Adding to partial means list: " + partialMeans[i]);
    }
    return partialMeans;
}
// Reads initial data from file; every non-blank row becomes an unlabeled vector
// (not yet associated with any centroid).
// NOTE(review): the "seperator" parameter is unused here (DataVector.FromString
// does the splitting); kept for interface compatibility with named-argument callers.
public static List<DataVector> ReadDataFile(string path, char seperator = ',')
{
    List<DataVector> data = new List<DataVector>();
    // File.OpenRead opens with FileMode.Open / FileAccess.Read / FileShare.Read —
    // same as the original — and nesting it inside the using guarantees the stream
    // is released even if the StreamReader constructor or parsing throws (the
    // original leaked the FileStream on such failures). The explicit Close() was
    // redundant inside a using block and is dropped.
    using (StreamReader reader = new StreamReader(File.OpenRead(path)))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            if (!string.IsNullOrWhiteSpace(line))
            {
                data.Add(DataVector.FromString(line));
            }
        }
    }
    return data;
}
// Squared Euclidean distance to "other".
// Naive implementation: no normalization or overflow handling, not tuned for speed.
// Accumulates in float, in element order, exactly like a hand-written index loop.
public float DistanceTo(DataVector other)
{
    VectorsArithmeticPrecondition(other);
    float acc = 0;
    foreach (float difference in Data.Zip(other.Data, (a, b) => a - b))
    {
        acc += difference * difference;
    }
    return acc;
}
/// <summary>
/// Component-wise addition. Returns a new vector; neither operand is modified.
/// </summary>
/// <param name="other">Vector to add; must have the same dimension.</param>
/// <param name="ignoreLabel">
/// When false (the default), both vectors must carry the same label and the result
/// keeps it; when true, labels are ignored and the result is unlabeled (-1).
/// </param>
/// <returns>The element-wise sum.</returns>
public DataVector Add(DataVector other, bool ignoreLabel = false)
{
    VectorsArithmeticPrecondition(other);
    if (!ignoreLabel && Label != other.Label)
    {
        throw new InvalidOperationException("by default cannot apply addition operation on data of different labels.");
    }
    // The precondition guarantees equal lengths, so Zip pairs every component.
    List<float> pairwiseSums = Data.Zip(other.Data, (a, b) => a + b).ToList();
    return new DataVector(pairwiseSums, ignoreLabel ? -1 : Label);
}
/// <summary>
/// Computes, for every cluster, the mean of this partition's points labeled with
/// that cluster together with the count of contributing points, logging each result.
/// </summary>
/// <returns>One <c>PartialMean</c> per cluster, indexed by cluster label.</returns>
private List<PartialMean> ComputePartialMeans()
{
    Logger.Log(Level.Verbose, "cluster number " + _clustersNum);
    List<PartialMean> partialMeans = new PartialMean[_clustersNum].ToList();
    for (int i = 0; i < _clustersNum; i++)
    {
        List<DataVector> slices = _dataPartition.DataVectors.Where(d => d.Label == i).ToList();
        // Empty clusters report a zero vector with Size = 0.
        DataVector average = new DataVector(_dataPartition.DataVectors[0].Dimension);
        // Bug fix: the original tested "slices.Count > 1", so a singleton cluster
        // reported the zero vector with Size = 1, skewing the global aggregate.
        if (slices.Count > 0)
        {
            average = DataVector.Mean(slices);
        }
        average.Label = i;
        partialMeans[i] = new PartialMean(average, slices.Count);
        Logger.Log(Level.Info, "Adding to partial means list: " + partialMeans[i]);
    }
    return partialMeans;
}
/// <summary>
/// Creates a partial mean: the mean vector of one cluster slice together with
/// the number of data points that contributed to it.
/// </summary>
/// <param name="vector">The (partial) mean vector of the slice.</param>
/// <param name="size">Number of data points the mean was computed over.</param>
public PartialMean(DataVector vector, int size) { Mean = vector; Size = size; }
/// <summary>
/// Returns a new vector with every component divided by
/// <paramref name="normalizationFactor"/>. This vector is left unchanged.
/// </summary>
/// <param name="normalizationFactor">Divisor; must be non-zero.</param>
/// <returns>A new, normalized <c>DataVector</c> carrying the same label.</returns>
/// <exception cref="InvalidOperationException">If the factor is zero.</exception>
public DataVector Normalize(float normalizationFactor)
{
    if (normalizationFactor == 0)
    {
        throw new InvalidOperationException("normalizationFactor is zero");
    }
    // Copy the backing list before scaling, mirroring Add(): constructing with the
    // existing list risks aliasing this vector's data and dividing it in place.
    // (If the constructor already copies, the extra copy is harmless.)
    DataVector result = new DataVector(new List<float>(Data), Label);
    for (int i = 0; i < Data.Count; i++)
    {
        result.Data[i] /= normalizationFactor;
    }
    return result;
}
/// <summary>
/// Returns a new vector equal to this one with every component multiplied
/// by <paramref name="scalar"/>. This vector is left unchanged.
/// </summary>
/// <param name="scalar">The multiplier applied to each component.</param>
/// <returns>A new, scaled <c>DataVector</c> carrying the same label.</returns>
public DataVector MultiplyScalar(float scalar)
{
    // Copy the backing list first, as Add() does: passing Data straight to the
    // constructor risks aliasing it, which would scale this vector in place.
    // (If the constructor already copies, the extra copy is harmless.)
    DataVector result = new DataVector(new List<float>(Data), Label);
    for (int i = 0; i < Data.Count; i++)
    {
        result.Data[i] *= scalar;
    }
    return result;
}