/// <summary>
/// Fit the centers to the provided data.
/// </summary>
/// <remarks>
/// Partitions the data into jobs of at most <c>JobSize</c> points, seeds the centers
/// (via <c>seedCentersParallel</c> when the data holds more than <c>PARALLEL_THRESHOLD</c>
/// points, otherwise via <c>seedCenters</c>), and then alternates between assigning points
/// to centers and recomputing each center as the (weighted) mean of its points for as long
/// as <c>runJobs(assignCenters)</c> returns true.
/// A center that receives no weight in an iteration keeps its previous position instead of
/// being divided by zero.
/// </remarks>
/// <param name="data">The data to fit</param>
/// <param name="weights">Weights for the data. If not provided, all points are equally weighted.</param>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="weights"/> has fewer entries than <paramref name="data"/>.</exception>
public void Fit(List<LabelVector> data, float[] weights = null)
{
    if (data == null)
    {
        throw new ArgumentNullException("data");
    }

    // A short weights array would otherwise fail with an index error half-way through
    // the first center update; a longer one is tolerated (extra entries are ignored).
    if (weights != null && weights.Length < data.Count)
    {
        throw new ArgumentException("weights must contain at least one entry per data point.", "weights");
    }

    UpdateManager.WriteLine("Fitting centers using k-means clustering...");
    _data = data;
    _minDist = new float[data.Count];

    // Split the index range [0, data.Count) into contiguous jobs of at most JobSize points.
    _jobs = new List<Job>();
    if (data.Count > JobSize)
    {
        for (int start = 0; start < _data.Count; start += JobSize)
        {
            _jobs.Add(new Job { StartIndex = start, EndIndex = Math.Min(start + JobSize, _data.Count) });
        }
    }
    else
    {
        _jobs.Add(new Job { StartIndex = 0, EndIndex = data.Count });
    }

    UpdateManager.WriteLine("Data partitioned into {0} job{1}.", _jobs.Count, _jobs.Count == 1 ? "" : "s");

    _centers = new List<LabelVector>();
    UpdateManager.WriteLine("Initializing centers...");
    UpdateManager.AddIndent();
    if (data.Count > PARALLEL_THRESHOLD)
    {
        seedCentersParallel();
    }
    else
    {
        seedCenters();
    }
    UpdateManager.RemoveIndent();

    // Each center's label is its index in _centers; points refer to centers through it.
    for (short label = 0; label < _centers.Count; label++)
    {
        _centers[label].Label = label;
    }

    UpdateManager.WriteLine("Running Lloyd's algorithm...");
    Stopwatch sw = new Stopwatch();
    UpdateManager.Write("Building initial clusters...");
    sw.Start();

    // NOTE(review): runJobs(assignCenters) presumably returns true while some point
    // changed its assigned center — confirm against assignCenters.
    while (runJobs(o => assignCenters(o)))
    {
        sw.Stop();
        UpdateManager.WriteLine("Done [{0}ms]", sw.ElapsedMilliseconds);

        // Accumulate into scratch vectors so the previous centers stay available for
        // clusters that end up empty. FeatureValue of each accumulator carries the
        // total weight added to it (assumes a new LabelVector starts at 0 — TODO confirm).
        LabelVector[] sums = new LabelVector[_centers.Count];
        for (int i = 0; i < sums.Length; i++)
        {
            sums[i] = new LabelVector(_data[0].Count);
        }

        sw.Reset();
        UpdateManager.Write("Recomputing cluster centers...");
        sw.Start();

        for (int i = 0; i < data.Count; i++)
        {
            LabelVector x = data[i];
            if (weights != null)
            {
                sums[x.Label] += weights[i] * x;
                sums[x.Label].FeatureValue += weights[i];
            }
            else
            {
                sums[x.Label] += x;
                sums[x.Label].FeatureValue += 1;
            }
        }

        for (short i = 0; i < _centers.Count; i++)
        {
            LabelVector sum = sums[i];

            // Exact comparison is intended: only a total weight of exactly zero makes
            // the division below undefined. In that case the old center is kept.
            if (sum.FeatureValue != 0)
            {
                sum /= sum.FeatureValue;
                _centers[i] = sum;
            }

            _centers[i].Label = i;
        }

        sw.Stop();
        UpdateManager.WriteLine("Done [{0}ms]", sw.ElapsedMilliseconds);

        // The timer started here is stopped either at the top of the next iteration
        // or after the loop, once the assignment pass has finished.
        sw.Reset();
        UpdateManager.Write("Rebuilding clusters...");
        sw.Start();
    }

    sw.Stop();
    UpdateManager.WriteLine("Done and done [{0}ms]", sw.ElapsedMilliseconds);
    UpdateManager.WriteLine("Fitting complete.");
}
/// <summary>
/// Seeds <c>_centers</c> for large data sets (logged as "scalable K-Means++ initialization").
/// </summary>
/// <remarks>
/// Starts from one randomly selected data point, then repeatedly gathers additional
/// center candidates via <c>findCenterCandidates</c>, recomputing <c>_phi</c> (the sum of
/// every point's <c>FeatureValue</c> after <c>calculatePhi</c> has run) after each round.
/// The number of rounds is <c>MaxIterations</c> when that is non-zero, otherwise the
/// truncated natural logarithm of the initial <c>_phi</c>.
/// Finally each data point is assigned to a candidate, each candidate is weighted by the
/// number of points assigned to it, and a nested <c>KMeans(_k)</c> is fitted to the weighted
/// candidates; its centers become <c>_centers</c>.
/// </remarks>
private void seedCentersParallel()
{
    UpdateManager.WriteLine("Performing scalable K-Means++ initialization...");

    // Direct cast rather than "as": an unexpected Clone() result fails right here
    // instead of surfacing as a null dereference on the next statement.
    LabelVector center = (LabelVector)_data.SelectRandom().Clone();
    center.Label = 0;
    _centers.Add(center);

    runJobs(o => calculatePhi(o));
    _phi = _data.Sum(o => o.FeatureValue);

    // Math.Log is non-positive for _phi <= 1 (and -Infinity/NaN for 0 or NaN, whose
    // conversion to int is unspecified), so those cases are pinned to zero rounds.
    int N = _phi > 1 ? (int)Math.Log(_phi) : 0;
    if (MaxIterations != 0)
    {
        N = MaxIterations;
    }

    Stopwatch sw = new Stopwatch();
    for (int i = 0; i < N; i++)
    {
        sw.Reset();
        UpdateManager.Write("Gathering center candidates ({0} of {1})...", i + 1, N);
        sw.Start();
        List<LabelVector> newCenters = runJobs(new Func<object, List<LabelVector>>(findCenterCandidates));
        _centers.AddRange(newCenters);

        // Keep every center's label equal to its index after the list has grown.
        for (short s = 0; s < _centers.Count; s++)
        {
            _centers[s].Label = s;
        }
        sw.Stop();
        UpdateManager.WriteLine("Done [{0}ms]", sw.ElapsedMilliseconds);

        sw.Reset();
        UpdateManager.Write("Recalculating phi...");
        sw.Start();
        runJobs(o => calculatePhi(o));
        _phi = _data.Sum(o => o.FeatureValue);
        sw.Stop();
        UpdateManager.WriteLine("Done [{0}ms]", sw.ElapsedMilliseconds);
    }

    sw.Reset();
    UpdateManager.Write("Assigning centers...");
    sw.Start();
    for (short s = 0; s < _centers.Count; s++)
    {
        _centers[s].Label = s;
    }
    runJobs(o => assignCenters(o));
    sw.Stop();
    UpdateManager.WriteLine("Done [{0}ms]", sw.ElapsedMilliseconds);

    // Weight each candidate by how many data points carry its label; candidates that
    // own no points end up with weight 0.
    float[] weights = new float[_centers.Count];
    foreach (LabelVector x in _data)
    {
        weights[x.Label] += 1;
    }

    // Reduce the weighted candidates with a nested k-means constructed from _k.
    KMeans clusterCenters = new KMeans(_k);
    UpdateManager.AddIndent();
    clusterCenters.Fit(_centers, weights);
    UpdateManager.RemoveIndent();
    _centers = clusterCenters.Centers;
}