예제 #1
0
        /// <summary>
        /// Fits the cluster centers to the provided data.  The data is partitioned into jobs,
        /// initial centers are seeded, and then the method alternates between an assignment
        /// pass (<c>assignCenters</c> run over every job) and recomputing each center as the
        /// (optionally weighted) mean of the points labelled with it.  Iteration stops as soon
        /// as the assignment pass returns <c>false</c>.
        /// </summary>
        /// <param name="data">The data to fit.  Must not be null.</param>
        /// <param name="weights">
        /// Weights for the data, indexed in step with <paramref name="data"/>.  If not provided,
        /// all points are equally weighted (weight 1).
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="weights"/> is provided but has fewer entries than <paramref name="data"/>.
        /// </exception>
        public void Fit(List <LabelVector> data, float[] weights = null)
        {
            // Fail fast on bad arguments instead of crashing part-way through an iteration.
            if (data == null)
            {
                throw new ArgumentNullException("data");
            }
            if (weights != null && weights.Length < data.Count)
            {
                throw new ArgumentException("Weights must contain one entry per data point.", "weights");
            }

            UpdateManager.WriteLine("Fitting centers using k-means clustering...");
            _data    = data;
            _minDist = new float[data.Count];

            // Partition the data into consecutive index ranges of at most JobSize points each.
            _jobs = new List <Job>();
            if (data.Count > JobSize)
            {
                for (int i = 0; i < _data.Count; i += JobSize)
                {
                    _jobs.Add(new Job {
                        StartIndex = i, EndIndex = Math.Min(i + JobSize, _data.Count)
                    });
                }
            }
            else
            {
                // Small (or empty) input: a single job covering everything.
                _jobs.Add(new Job {
                    StartIndex = 0, EndIndex = data.Count
                });
            }

            UpdateManager.WriteLine("Data partitioned into {0} job{1}.", _jobs.Count, _jobs.Count == 1 ? "" : "s");

            _centers = new List <LabelVector>();

            UpdateManager.WriteLine("Initializing centers...");
            UpdateManager.AddIndent();
            if (data.Count > PARALLEL_THRESHOLD)
            {
                seedCentersParallel();
            }
            else
            {
                seedCenters();
            }
            UpdateManager.RemoveIndent();

            // Give every seeded center a label equal to its index in _centers.
            // NOTE(review): the counter is a short, so this presumes the number of centers
            // fits in a short - confirm against the type of LabelVector.Label.
            for (short label = 0; label < _centers.Count; label++)
            {
                _centers[label].Label = label;
            }

            UpdateManager.WriteLine("Running Lloyd's algorithm...");
            Stopwatch sw = new Stopwatch();

            UpdateManager.Write("Building initial clusters...");
            sw.Start();

            // Each pass of the loop condition runs the assignment step over all jobs; the loop
            // body then recomputes the centers.  Presumably runJobs returns true while the
            // assignment is still changing - verify against runJobs/assignCenters.
            while (runJobs(o => assignCenters(o)))
            {
                sw.Stop();
                UpdateManager.WriteLine("Done [{0}ms]", sw.ElapsedMilliseconds);

                // Remember the current centers so a cluster that accumulates no weight can
                // keep its position instead of being divided by zero below.
                List <LabelVector> previousCenters = new List <LabelVector>(_centers);
                for (int i = 0; i < _centers.Count; i++)
                {
                    _centers[i] = new LabelVector(_data[0].Count);
                }

                UpdateManager.Write("Recomputing cluster centers...");
                sw.Restart();
                for (int i = 0; i < data.Count; i++)
                {
                    LabelVector x      = data[i];
                    float       weight = weights != null ? weights[i] : 1;

                    // FeatureValue of a center is used here as the running total weight of
                    // its cluster.  Capture it before the addition and write it back
                    // explicitly so the total does not depend on how LabelVector's
                    // operators treat FeatureValue.
                    float accumulated = _centers[x.Label].FeatureValue;
                    if (weights != null)
                    {
                        _centers[x.Label] += weight * x;
                    }
                    else
                    {
                        _centers[x.Label] += x;
                    }
                    _centers[x.Label].FeatureValue = accumulated + weight;
                }

                // Turn the weighted sums into weighted means.
                for (short i = 0; i < _centers.Count; i++)
                {
                    float totalWeight = _centers[i].FeatureValue;
                    if (totalWeight == 0f)
                    {
                        // Exact zero means nothing contributed to this center; dividing
                        // would produce NaN/Infinity coordinates, so keep the old center.
                        _centers[i] = previousCenters[i];
                    }
                    else
                    {
                        _centers[i] /= totalWeight;
                    }
                    _centers[i].Label = i;
                }
                sw.Stop();
                UpdateManager.WriteLine("Done [{0}ms]", sw.ElapsedMilliseconds);

                // Time the next assignment pass, which runs in the loop condition.
                UpdateManager.Write("Rebuilding clusters...");
                sw.Restart();
            }
            sw.Stop();
            UpdateManager.WriteLine("Done and done [{0}ms]", sw.ElapsedMilliseconds);

            UpdateManager.WriteLine("Fitting complete.");
        }
예제 #2
0
        /// <summary>
        /// Seeds the centers using the "scalable K-Means++" scheme announced in the log output:
        /// starts from a clone of one randomly selected data point, gathers additional candidate
        /// centers over a number of rounds, then reduces the candidates to the final centers by
        /// fitting a second <c>KMeans</c> (constructed with <c>_k</c>) over the candidates,
        /// weighting each candidate by the number of data points assigned to it.
        /// </summary>
        /// <remarks>
        /// The number of rounds is <c>MaxIterations</c> when that is non-zero, otherwise the
        /// integer part of ln(phi), where phi is the sum of <c>FeatureValue</c> over the data
        /// after <c>calculatePhi</c> has run.  Presumably <c>calculatePhi</c> stores each
        /// point's cost relative to the current centers in <c>FeatureValue</c> - verify there.
        /// </remarks>
        private void seedCentersParallel()
        {
            UpdateManager.WriteLine("Performing scalable K-Means++ initialization...");

            // A direct cast reports an unexpected Clone() result right here rather than as a
            // NullReferenceException on the following line.
            LabelVector center = (LabelVector)_data.SelectRandom().Clone();

            center.Label = 0;
            _centers.Add(center);
            runJobs(o => calculatePhi(o));
            _phi = _data.Sum(o => o.FeatureValue);

            int rounds;
            if (MaxIterations != 0)
            {
                rounds = MaxIterations;
            }
            else
            {
                // ln(phi) is -Infinity or NaN for phi of 0 or NaN, and converting those to int
                // is unspecified.  Any phi that is not greater than 1 yields no rounds, which
                // matches the truncated logarithm wherever that is well defined.
                rounds = _phi > 1 ? (int)Math.Log(_phi) : 0;
            }

            Stopwatch sw = new Stopwatch();

            for (int i = 0; i < rounds; i++)
            {
                UpdateManager.Write("Gathering center candidates ({0} of {1})...", i + 1, rounds);
                sw.Restart();
                List <LabelVector> newCenters = runJobs(new Func <object, List <LabelVector> >(findCenterCandidates));
                _centers.AddRange(newCenters);

                // Keep every center's label equal to its index in _centers.
                // NOTE(review): the counter is a short, so this presumes the candidate count
                // fits in a short - confirm against the type of LabelVector.Label.
                for (short s = 0; s < _centers.Count; s++)
                {
                    _centers[s].Label = s;
                }
                sw.Stop();
                UpdateManager.WriteLine("Done [{0}ms]", sw.ElapsedMilliseconds);

                // The candidate set grew, so phi has to be recomputed before the next round.
                UpdateManager.Write("Recalculating phi...");
                sw.Restart();
                runJobs(o => calculatePhi(o));
                _phi = _data.Sum(o => o.FeatureValue);
                sw.Stop();
                UpdateManager.WriteLine("Done [{0}ms]", sw.ElapsedMilliseconds);
            }

            UpdateManager.Write("Assigning centers...");
            sw.Restart();
            for (short s = 0; s < _centers.Count; s++)
            {
                _centers[s].Label = s;
            }
            runJobs(o => assignCenters(o));
            sw.Stop();
            UpdateManager.WriteLine("Done [{0}ms]", sw.ElapsedMilliseconds);

            // Weight each candidate by how many data points carry its label.
            float[] weights = new float[_centers.Count];
            foreach (LabelVector x in _data)
            {
                weights[x.Label] += 1;
            }

            // Cluster the weighted candidates and adopt the resulting centers.
            KMeans clusterCenters = new KMeans(_k);

            UpdateManager.AddIndent();
            clusterCenters.Fit(_centers, weights);
            UpdateManager.RemoveIndent();

            _centers = clusterCenters.Centers;
        }