Beispiel #1
0
        /// <summary>
        /// Produces the sum of two vectors of the same size.
        /// </summary>
        /// <param name="lhs">The left operand; the result object starts out as a clone of it</param>
        /// <param name="rhs">The right operand</param>
        /// <returns>The clone of <paramref name="lhs"/> after <c>DoAdd</c> has been given it as the output vector</returns>
        public static LabelVector operator +(LabelVector lhs, LabelVector rhs)
        {
            // Clone() always builds a LabelVector, so a direct cast is sufficient here.
            LabelVector sum = (LabelVector)lhs.Clone();
            lhs.DoAdd(rhs, sum);

            return sum;
        }
Beispiel #2
0
        /// <summary>
        /// Produces the product of a vector and a scalar value.
        /// </summary>
        /// <param name="x">The vector to scale; the result object starts out as a clone of it</param>
        /// <param name="scalar">The scalar factor handed to <c>DoMultiply</c></param>
        /// <returns>The clone of <paramref name="x"/> after <c>DoMultiply</c> has been given it as the output vector</returns>
        public static LabelVector operator *(LabelVector x, double scalar)
        {
            // Clone() always builds a LabelVector, so a direct cast is sufficient here.
            LabelVector scaled = (LabelVector)x.Clone();
            x.DoMultiply(scalar, scaled);

            return scaled;
        }
Beispiel #3
0
        /// <summary>
        /// Produces the difference of two vectors of the same size.
        /// </summary>
        /// <param name="lhs">The left operand; the result object starts out as a clone of it</param>
        /// <param name="rhs">The right operand, handed to <c>DoSubtract</c></param>
        /// <returns>The clone of <paramref name="lhs"/> after <c>DoSubtract</c> has been given it as the output vector</returns>
        public static LabelVector operator -(LabelVector lhs, LabelVector rhs)
        {
            // Clone() always builds a LabelVector, so a direct cast is sufficient here.
            LabelVector difference = (LabelVector)lhs.Clone();
            lhs.DoSubtract(rhs, difference);

            return difference;
        }
Beispiel #4
0
        /// <summary>
        /// Builds a copy of this data point as it currently stands: the first
        /// <c>Count</c> entries of <c>Values</c> are copied into a fresh array, and the
        /// label, feature value and weight are carried over.
        /// </summary>
        /// <returns>The new <c>LabelVector</c>, typed as <see cref="object"/></returns>
        public new object Clone()
        {
            // Copy the values into a new array so the copy gets its own buffer.
            double[] copiedValues = new double[Count];
            Array.Copy(Values, copiedValues, Count);

            return new LabelVector(copiedValues)
            {
                Label        = Label,
                FeatureValue = FeatureValue,
                Weight       = Weight
            };
        }
Beispiel #5
0
        /// <summary>
        /// For each data point in the job's index range, finds the nearest center and
        /// stores the dot product of (point - center) with itself in the point's
        /// <c>FeatureValue</c>.
        /// </summary>
        /// <param name="arg">The <c>Job</c> whose <c>StartIndex</c> (inclusive) and <c>EndIndex</c> (exclusive) bound the points to process</param>
        private void calculatePhi(object arg)
        {
            // Work against cloned centers rather than the shared _centers list.
            List<LabelVector> centerCopies = _centers.Select(c => (LabelVector)c.Clone()).ToList();
            Job range = arg as Job;

            int index = range.StartIndex;
            while (index < range.EndIndex)
            {
                LabelVector point  = _data[index];
                LabelVector offset = point - findNearest(point, centerCopies);

                point.FeatureValue = (float)offset.DotProduct(offset);
                index++;
            }
        }
Beispiel #6
0
        /// <summary>
        /// Returns the entry of <paramref name="centers"/> with the smallest
        /// <c>SquaredDistance</c> to <paramref name="x"/>.  On ties the earliest
        /// entry wins, because only a strictly smaller distance replaces the
        /// current best.
        /// </summary>
        /// <param name="x">The point to match</param>
        /// <param name="centers">The candidate centers; the first element is read unconditionally</param>
        /// <returns>The nearest center found</returns>
        private LabelVector findNearest(LabelVector x, List<LabelVector> centers)
        {
            LabelVector best         = centers[0];
            double      bestDistance = x.SquaredDistance(best);

            for (int index = 1; index < centers.Count; index++)
            {
                LabelVector candidate = centers[index];

                // The current best is passed along as the second argument
                // (presumably an early-exit bound -- confirm against SquaredDistance).
                double candidateDistance = x.SquaredDistance(candidate, bestDistance);
                if (candidateDistance < bestDistance)
                {
                    best         = candidate;
                    bestDistance = candidateDistance;
                }
            }

            return best;
        }
Beispiel #7
0
        /// <summary>
        /// Gives every data point in the job's index range the label of its nearest
        /// center.
        /// </summary>
        /// <param name="arg">The <c>Job</c> whose <c>StartIndex</c> (inclusive) and <c>EndIndex</c> (exclusive) bound the points to process</param>
        /// <returns><c>true</c> if at least one point's label was changed, otherwise <c>false</c></returns>
        private bool assignCenters(object arg)
        {
            // Work against cloned centers rather than the shared _centers list.
            List<LabelVector> centerCopies = _centers.Select(c => (LabelVector)c.Clone()).ToList();
            bool anyRelabeled = false;
            Job  range        = arg as Job;

            for (int index = range.StartIndex; index < range.EndIndex; index++)
            {
                LabelVector point        = _data[index];
                var         nearestLabel = findNearest(point, centerCopies).Label;

                if (point.Label == nearestLabel)
                {
                    continue;
                }

                point.Label  = nearestLabel;
                anyRelabeled = true;
            }

            return anyRelabeled;
        }
Beispiel #8
0
        /// <summary>
        /// Collects clones of the data points in the job's index range that pass a
        /// random test.  The value handed to <c>ThreadsafeRandom.Test</c> is the
        /// point's <c>FeatureValue</c> multiplied by <c>OversampleRate / _phi</c>.
        /// </summary>
        /// <param name="arg">The <c>Job</c> whose <c>StartIndex</c> (inclusive) and <c>EndIndex</c> (exclusive) bound the points to examine</param>
        /// <returns>The cloned points that were selected; empty when none passed</returns>
        private List<LabelVector> findCenterCandidates(object arg)
        {
            Job range = arg as Job;
            List<LabelVector> candidates = new List<LabelVector>();
            double scale = OversampleRate / _phi;

            for (int index = range.StartIndex; index < range.EndIndex; index++)
            {
                LabelVector point = _data[index];
                if (!ThreadsafeRandom.Test(point.FeatureValue * scale))
                {
                    continue;
                }

                // Store a copy so later changes to the data point do not affect the candidate.
                candidates.Add((LabelVector)point.Clone());
            }

            return candidates;
        }
Beispiel #9
0
        /// <summary>
        /// Seeds <c>_centers</c> with <c>_k</c> entries.  The first is a clone of a
        /// randomly selected data point (labelled 0).  For each further center,
        /// <c>calculatePhi</c> is run through <c>runJobs</c> to refresh every point's
        /// <c>FeatureValue</c>, those values are passed through <c>Normalize()</c>, and
        /// the index returned by <c>Sample()</c> picks the data point to clone
        /// (presumably sampling proportionally to the values -- confirm against
        /// those helpers).
        /// </summary>
        private void seedCenters()
        {
            UpdateManager.WriteLine("Performing K-Means++ initialization...");

            LabelVector first = _data.SelectRandom().Clone() as LabelVector;
            first.Label = 0;
            _centers.Add(first);

            Stopwatch timer = new Stopwatch();
            int seeded = 1;

            while (seeded < _k)
            {
                timer.Reset();
                UpdateManager.Write("Recalculating phi...");
                timer.Start();
                runJobs(job => calculatePhi(job));
                timer.Stop();
                UpdateManager.WriteLine("Done [{0}ms]", timer.ElapsedMilliseconds);

                float[] distribution = _data.Select(point => point.FeatureValue).ToArray().Normalize();
                LabelVector chosen = _data[distribution.Sample()];
                _centers.Add(chosen.Clone() as LabelVector);

                seeded++;
            }
        }
Beispiel #10
0
        /// <summary>
        /// Fit the centers to the provided data.
        /// </summary>
        /// <remarks>
        /// The data is split into jobs of at most <c>JobSize</c> points and the centers
        /// are seeded (<c>seedCentersParallel</c> when there are more than
        /// <c>PARALLEL_THRESHOLD</c> points, <c>seedCenters</c> otherwise).  Assignment
        /// (<c>assignCenters</c> via <c>runJobs</c>) and center recomputation then
        /// alternate until the assignment step returns <c>false</c>.  Each point's
        /// <c>Label</c> is overwritten with the index of its assigned center.
        /// A cluster whose accumulated weight is zero keeps its previous center
        /// instead of being divided by zero.
        /// </remarks>
        /// <param name="data">The data to fit</param>
        /// <param name="weights">Weights for the data.  If not provided, all points are equally weighted.</param>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="weights"/> has fewer entries than <paramref name="data"/>.</exception>
        public void Fit(List<LabelVector> data, float[] weights = null)
        {
            if (data == null)
            {
                throw new ArgumentNullException("data");
            }
            if (weights != null && weights.Length < data.Count)
            {
                throw new ArgumentException("weights must contain at least one entry per data point.", "weights");
            }

            UpdateManager.WriteLine("Fitting centers using k-means clustering...");
            _data    = data;
            _minDist = new float[data.Count];

            // Partition the index range [0, data.Count) into jobs of at most JobSize points.
            _jobs = new List<Job>();
            if (data.Count > JobSize)
            {
                for (int start = 0; start < data.Count; start += JobSize)
                {
                    _jobs.Add(new Job {
                        StartIndex = start, EndIndex = Math.Min(start + JobSize, data.Count)
                    });
                }
            }
            else
            {
                _jobs.Add(new Job {
                    StartIndex = 0, EndIndex = data.Count
                });
            }

            UpdateManager.WriteLine("Data partitioned into {0} job{1}.", _jobs.Count, _jobs.Count == 1 ? "" : "s");

            _centers = new List<LabelVector>();

            UpdateManager.WriteLine("Initializing centers...");
            UpdateManager.AddIndent();
            if (data.Count > PARALLEL_THRESHOLD)
            {
                seedCentersParallel();
            }
            else
            {
                seedCenters();
            }
            UpdateManager.RemoveIndent();

            // Labels double as indices into _centers below, so make them match positions.
            for (short label = 0; label < _centers.Count; label++)
            {
                _centers[label].Label = label;
            }

            UpdateManager.WriteLine("Running Lloyd's algorithm...");
            Stopwatch sw = new Stopwatch();

            UpdateManager.Write("Building initial clusters...");
            sw.Start();
            while (runJobs(o => assignCenters(o)))
            {
                sw.Stop();
                UpdateManager.WriteLine("Done [{0}ms]", sw.ElapsedMilliseconds);

                // Remember the current centers so a cluster that accumulates no weight
                // can keep its position instead of being divided by zero.
                List<LabelVector> previousCenters = new List<LabelVector>(_centers);
                int dimension = data[0].Count;
                for (int i = 0; i < _centers.Count; i++)
                {
                    _centers[i] = new LabelVector(dimension);
                }

                sw.Reset();
                UpdateManager.Write("Recomputing cluster centers...");
                sw.Start();

                // Accumulate the (weighted) sum of each cluster's points; FeatureValue
                // of the accumulator carries the total weight seen so far.
                for (int i = 0; i < data.Count; i++)
                {
                    LabelVector x = data[i];
                    if (weights != null)
                    {
                        _centers[x.Label] += weights[i] * x;
                        _centers[x.Label].FeatureValue += weights[i];
                    }
                    else
                    {
                        _centers[x.Label] += x;
                        _centers[x.Label].FeatureValue += 1;
                    }
                }

                for (short i = 0; i < _centers.Count; i++)
                {
                    float totalWeight = _centers[i].FeatureValue;
                    if (totalWeight != 0)
                    {
                        _centers[i] /= totalWeight;
                    }
                    else
                    {
                        // Nothing was accumulated for this cluster: a zero sum divided by
                        // zero would give NaN coordinates, so fall back to the old center.
                        _centers[i] = previousCenters[i];
                    }
                    _centers[i].Label = i;
                }
                sw.Stop();
                UpdateManager.WriteLine("Done [{0}ms]", sw.ElapsedMilliseconds);
                sw.Reset();
                UpdateManager.Write("Rebuilding clusters...");
                sw.Start();
            }
            sw.Stop();
            UpdateManager.WriteLine("Done and done [{0}ms]", sw.ElapsedMilliseconds);

            UpdateManager.WriteLine("Fitting complete.");
        }
Beispiel #11
0
        /// <summary>
        /// Seeds <c>_centers</c> using the oversampling scheme announced as "scalable
        /// K-Means++": starting from a clone of one randomly selected data point, it
        /// repeatedly gathers candidates via <c>findCenterCandidates</c> and refreshes
        /// <c>_phi</c> (the sum of all points' <c>FeatureValue</c> after
        /// <c>calculatePhi</c>).  The number of rounds is <c>MaxIterations</c> when that
        /// is non-zero, otherwise <c>(int)Math.Log(_phi)</c>.  Afterwards every data
        /// point is assigned to a candidate, the candidates are weighted by how many
        /// points they received, and a nested <c>KMeans(_k)</c> fit over the candidates
        /// supplies the final centers.
        /// </summary>
        private void seedCentersParallel()
        {
            UpdateManager.WriteLine("Performing scalable K-Means++ initialization...");

            LabelVector first = _data.SelectRandom().Clone() as LabelVector;
            first.Label = 0;
            _centers.Add(first);

            runJobs(job => calculatePhi(job));
            _phi = _data.Sum(point => point.FeatureValue);

            int rounds = (int)Math.Log(_phi);
            if (MaxIterations != 0)
            {
                // An explicit iteration count overrides the log(phi) default.
                rounds = MaxIterations;
            }

            Stopwatch timer = new Stopwatch();

            for (int round = 0; round < rounds; round++)
            {
                timer.Reset();
                UpdateManager.Write("Gathering center candidates ({0} of {1})...", round + 1, rounds);
                timer.Start();
                List<LabelVector> candidates = runJobs(new Func<object, List<LabelVector>>(findCenterCandidates));
                _centers.AddRange(candidates);
                for (short label = 0; label < _centers.Count; label++)
                {
                    _centers[label].Label = label;
                }
                timer.Stop();
                UpdateManager.WriteLine("Done [{0}ms]", timer.ElapsedMilliseconds);

                timer.Reset();
                UpdateManager.Write("Recalculating phi...");
                timer.Start();
                runJobs(job => calculatePhi(job));
                _phi = _data.Sum(point => point.FeatureValue);
                timer.Stop();
                UpdateManager.WriteLine("Done [{0}ms]", timer.ElapsedMilliseconds);
            }

            timer.Reset();
            UpdateManager.Write("Assigning centers...");
            timer.Start();
            // Labels must equal list positions, because they index the counts array below.
            for (short label = 0; label < _centers.Count; label++)
            {
                _centers[label].Label = label;
            }
            runJobs(job => assignCenters(job));
            timer.Stop();
            UpdateManager.WriteLine("Done [{0}ms]", timer.ElapsedMilliseconds);

            // Weight each candidate by the number of data points assigned to it.
            float[] assignedCounts = new float[_centers.Count];
            foreach (LabelVector point in _data)
            {
                assignedCounts[point.Label] += 1;
            }

            // Reduce the weighted candidates to _k centers with a nested fit.
            KMeans reducer = new KMeans(_k);

            UpdateManager.AddIndent();
            reducer.Fit(_centers, assignedCounts);
            UpdateManager.RemoveIndent();

            _centers = reducer.Centers;
        }