예제 #1
0
        /// <summary>
        ///   Partitions the input data into K clusters by repeatedly bisecting
        ///   the cluster that currently has the largest distortion.
        /// </summary>
        ///
        /// <param name="points">The data points to be clustered.</param>
        /// <param name="threshold">The relative convergence threshold handed
        ///   to every two-way split.</param>
        ///
        /// <returns>The result of <c>Clusters.Nearest</c> for the input points,
        ///   evaluated after all centroids have been replaced.</returns>
        ///
        public int[] Compute(double[][] points, double threshold)
        {
            int total = Clusters.Count;

            // Two-cluster K-Means instance reused for every bisection
            KMeans bisector = new KMeans(2, Clusters.Distance);

            double[][] centers = Clusters.Centroids;
            double[] errors = new double[total];
            double[][][] groups = new double[total][][];

            // Initially, every point belongs to a single group
            groups[0] = points;

            // Each pass bisects one group, so (total - 1) passes yield all groups
            int next = 1;
            while (next < total)
            {
                // Select the group with the largest distortion among those created so far
                int worst;
                errors.Max(next, out worst);

                // Bisect the selected group
                var halves = split(groups[worst], bisector, threshold);
                groups[worst] = halves.Item1;
                groups[next] = halves.Item2;

                // The first half replaces the selected centroid; the second one is new
                centers[worst] = bisector.Clusters.Centroids[0];
                centers[next] = bisector.Clusters.Centroids[1];

                // Refresh the distortions of the two groups that were just produced
                errors[worst] = bisector.Clusters[0].Distortion(groups[worst]);
                errors[next] = bisector.Clusters[1].Distortion(groups[next]);

                next++;
            }

            int[] labels = Clusters.Nearest(points);
            return labels;
        }
예제 #2
0
 /// <summary>
 ///   Forwards the given points to <c>Clusters.Nearest</c>
 ///   and returns its result unchanged.
 /// </summary>
 ///
 /// <param name="points">The points to be looked up.</param>
 ///
 public int[] Nearest(double[][] points)
 {
     int[] labels = Clusters.Nearest(points);
     return labels;
 }
예제 #3
0
 /// <summary>
 ///   Forwards the given point to <c>Clusters.Nearest</c>
 ///   and returns its result unchanged.
 /// </summary>
 ///
 /// <param name="point">The point to be looked up.</param>
 ///
 public int Nearest(double[] point)
 {
     int label = Clusters.Nearest(point);
     return label;
 }
예제 #4
0
        /// <summary>
        ///   Runs the K-Means iteration over the given data: every point is assigned
        ///   to its nearest centroid and the centroids are re-estimated as the mean
        ///   of their members until the convergence test succeeds.
        /// </summary>
        ///
        /// <param name="data">The data points to be clustered.</param>
        /// <param name="threshold">The relative convergence threshold passed to the
        ///   convergence test. Must not be negative. Default is 1e-5.</param>
        /// <param name="computeInformation">When <c>true</c>, the covariance of each
        ///   cluster is also computed after the iteration finishes. Cluster proportions
        ///   are computed regardless of this flag. Default is true.</param>
        ///
        /// <returns>The cluster index assigned to each point during the last pass.</returns>
        ///
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        /// <exception cref="ArgumentException">There are fewer points than clusters,
        ///   or <paramref name="threshold"/> is negative.</exception>
        ///
        public int[] Compute(double[][] data, double threshold = 1e-5, bool computeInformation = true)
        {
            // Validate the arguments before doing any work
            if (data == null)
            {
                throw new ArgumentNullException("data");
            }
            if (data.Length < K)
            {
                throw new ArgumentException("Not enough points. There should be more points than the number K of clusters.");
            }
            if (threshold < 0)
            {
                throw new ArgumentException("Threshold should be a positive number.", "threshold");
            }

            // TODO: Implement a faster version using the triangle
            // inequality to reduce the number of distance calculations
            //
            //  - http://www-cse.ucsd.edu/~elkan/kmeansicml03.pdf
            //  - http://mloss.org/software/view/48/
            //

            int clusterCount = this.K;
            int sampleCount = data.Length;
            int dimensions = data[0].Length;

            // A null first centroid means no initialization has happened yet
            if (this.Clusters.Centroids[0] == null)
            {
                Randomize(data, useSeeding: false);
            }

            // Working buffers
            int[] members = new int[clusterCount];
            int[] labels = new int[sampleCount];
            double[][] current = clusters.Centroids;
            double[][] updated = new double[clusterCount][];
            for (int c = 0; c < updated.Length; c++)
            {
                updated[c] = new double[dimensions];
            }

            // One lock object per cluster guards that cluster's accumulator
            Object[] gates = new Object[clusterCount];
            for (int c = 0; c < gates.Length; c++)
            {
                gates[c] = new Object();
            }

            bool done;
            do
            {
                // Start each pass from zeroed accumulators and counters
                for (int c = 0; c < updated.Length; c++)
                {
                    Array.Clear(updated[c], 0, updated[c].Length);
                }
                Array.Clear(members, 0, members.Length);

                // Assignment step: label every point and add it to its cluster's sum
                Parallel.For(0, data.Length, index =>
                {
                    double[] sample = data[index];

                    int nearest = labels[index] = Clusters.Nearest(sample);

                    Interlocked.Increment(ref members[nearest]);

                    double[] accumulator = updated[nearest];

                    lock (gates[nearest])
                    {
                        for (int d = 0; d < sample.Length; d++)
                        {
                            accumulator[d] += sample[d];
                        }
                    }
                });

                // Update step: turn the sums into means (empty clusters are left at zero)
                for (int c = 0; c < updated.Length; c++)
                {
                    double size = members[c];

                    if (size == 0)
                    {
                        continue;
                    }

                    for (int d = 0; d < updated[c].Length; d++)
                    {
                        updated[c][d] /= size;
                    }
                }

                // Test for convergence before the new values overwrite the current ones
                done = converged(current, updated, threshold);

                // Copy the new centroid values into the current centroids
                for (int c = 0; c < current.Length; c++)
                {
                    double[] target = current[c];
                    double[] source = updated[c];

                    for (int d = 0; d < target.Length; d++)
                    {
                        target[d] = source[d];
                    }
                }
            }
            while (!done);

            // Fraction of the samples that ended up in each cluster
            for (int c = 0; c < current.Length; c++)
            {
                clusters.Proportions[c] = members[c] / (double)data.Length;
            }

            if (computeInformation)
            {
                for (int c = 0; c < current.Length; c++)
                {
                    // Gather the points that were labelled with this cluster
                    double[][] subset = data.Submatrix(labels.Find(x => x == c));

                    if (subset.Length > 0)
                    {
                        clusters.Covariances[c] = Statistics.Tools.Covariance(subset, current[c]);
                    }
                    else
                    {
                        // No members: store an all-zero matrix
                        clusters.Covariances[c] = new double[dimensions, dimensions];
                    }
                }
            }

            return labels;
        }
예제 #5
0
        /// <summary>
        ///   Divides the input data into K clusters by repeatedly splitting
        ///   the cluster that currently has the largest distortion in two.
        /// </summary>
        ///
        /// <param name="data">The data where to compute the algorithm.</param>
        /// <param name="weights">The weight associated with each data point.</param>
        ///
        /// <returns>The result of <c>Clusters.Nearest</c> for the input data,
        ///   evaluated after all centroids have been replaced.</returns>
        ///
        /// <exception cref="ArgumentNullException"><paramref name="data"/> or
        ///   <paramref name="weights"/> is null.</exception>
        /// <exception cref="ArgumentException">There are fewer points than clusters, the
        ///   weights vector has a different length than the data, or the weights do not
        ///   sum to a positive value.</exception>
        /// <exception cref="DimensionMismatchException">The rows of
        ///   <paramref name="data"/> do not all have the same length.</exception>
        ///
        public override int[] Compute(double[][] data, double[] weights)
        {
            // Initial argument checking
            if (data == null)
            {
                throw new ArgumentNullException("data");
            }

            if (data.Length < K)
            {
                throw new ArgumentException("Not enough points. There should be more points than the number K of clusters.");
            }

            if (weights == null)
            {
                throw new ArgumentNullException("weights");
            }

            if (data.Length != weights.Length)
            {
                throw new ArgumentException("Data weights vector must be the same length as data samples.");
            }

            double weightSum = weights.Sum();

            if (weightSum <= 0)
            {
                // This check is about the weights, so report it as such
                throw new ArgumentException("The sum of the weights must be a positive number.", "weights");
            }

            int cols = data[0].Length;

            // Every row must have the same length as the first one
            for (int i = 0; i < data.Length; i++)
            {
                if (data[i].Length != cols)
                {
                    throw new DimensionMismatchException("data", "The points matrix should be rectangular. The vector at position "
                        + i + " has a different length than previous ones.");
                }
            }


            int k = Clusters.Count;

            // Two-cluster K-Means used to perform each individual split;
            // it inherits this instance's distance and iteration settings.
            KMeans kmeans = new KMeans(2)
            {
                Distance           = (IDistance <double[]>)Clusters.Distance,
                ComputeError       = false,
                ComputeCovariances = false,
                UseSeeding         = UseSeeding,
                Tolerance          = Tolerance,
                MaxIterations      = MaxIterations,
            };

            double[][]   centroids   = Clusters.Centroids;
            double[][][] clusters    = new double[k][][];
            double[]     distortions = new double[k];

            // NOTE(review): the weights are validated above but are not passed to
            // split() below — confirm whether the split is meant to be weighted.

            // 1. Start with all data points in one cluster
            clusters[0] = data;

            // 2. Repeat steps 3 to 6 (k-1) times to obtain K centroids
            for (int current = 1; current < k; current++)
            {
                // 3. Choose cluster with largest distortion
                //    (only the clusters created so far are considered)
                int choosen; distortions.Max(current, out choosen);

                // 4. Split cluster into two sub-clusters
                var splits = split(clusters[choosen], kmeans);

                clusters[choosen] = splits.Item1;
                clusters[current] = splits.Item2;

                // 5. Replace chosen centroid and add a new one
                centroids[choosen] = kmeans.Clusters.Centroids[0];
                centroids[current] = kmeans.Clusters.Centroids[1];

                // Recompute distortions for the updated clusters
                distortions[choosen] = kmeans.Clusters[0].Distortion(clusters[choosen]);
                distortions[current] = kmeans.Clusters[1].Distortion(clusters[current]);

                // 6. Increment cluster count (current = current + 1)
            }


            return(Clusters.Nearest(data));
        }
예제 #6
0
        /// <summary>
        ///   Divides the input data into K clusters, using the per-dimension
        ///   mode of each cluster's members as that cluster's centroid.
        /// </summary>
        ///
        /// <param name="points">The data where to compute the algorithm.</param>
        ///
        /// <returns>The cluster index assigned to each point during the last pass.</returns>
        ///
        /// <exception cref="ArgumentNullException"><paramref name="points"/> is null.</exception>
        /// <exception cref="ArgumentException">There are fewer points than clusters.</exception>
        ///
        public int[] Compute(T[][] points)
        {
            // Initial argument checking
            if (points == null)
            {
                throw new ArgumentNullException("points");
            }

            if (points.Length < K)
            {
                throw new ArgumentException("Not enough points. There should be more points than the number K of clusters.");
            }

            int k    = this.K;
            int rows = points.Length;
            int cols = points[0].Length;

            // Perform a random initialization of the clusters
            // if the algorithm has not been initialized before.
            //
            if (this.Clusters.Centroids[0] == null)
            {
                Clusters.Randomize(points);
            }

            // Initial variables
            int[]    labels       = new int[rows];
            double[] proportions  = Clusters.Proportions;
            T[][]    centroids    = Clusters.Centroids;
            T[][]    newCentroids = new T[k][];
            for (int i = 0; i < newCentroids.Length; i++)
            {
                newCentroids[i] = new T[cols];
            }

            var clusters = new List <T[]> [k];

            for (int i = 0; i < k; i++)
            {
                clusters[i] = new List <T[]>();
            }

            Iterations = 0;

            do // Main loop
            {
                // Reset the cluster member lists
                for (int i = 0; i < k; i++)
                {
                    clusters[i].Clear();
                }


                // First we will assign the data points to their
                // nearest clusters, storing the members of each
                // cluster in the clusters variable.

                // For each point in the data set,
                for (int i = 0; i < points.Length; i++)
                {
                    // Get the point
                    T[] point = points[i];

                    // Compute the nearest cluster centroid
                    int c = labels[i] = Clusters.Nearest(points[i]);

                    // Add the point to the corresponding cluster
                    clusters[c].Add(point);
                }

                // Next we will compute each cluster's new centroid
                //  value by computing the mode in each cluster.

                for (int i = 0; i < k; i++)
                {
                    if (clusters[i].Count == 0)
                    {
                        // An empty cluster keeps its current centroid. Store a copy
                        // so that both generations never share the same array.
                        newCentroids[i] = (T[])centroids[i].Clone();
                        continue;
                    }

                    T[][] p = clusters[i].ToArray();

                    // For each dimension
                    for (int d = 0; d < Dimension; d++)
                    {
                        T[] values = p.GetColumn(d);

                        T mode = values.Mode(alreadySorted: false, inPlace: true);

                        newCentroids[i][d] = mode;
                    }
                }


                // The algorithm stops when there is no further
                //  change between the current and new centroids.
                if (converged(centroids, newCentroids))
                {
                    break;
                }


                // go to next generation
                for (int i = 0; i < centroids.Length; i++)
                {
                    // Hand the new centroid over and allocate a fresh buffer for the
                    // next pass. Reusing the same array for both generations would let
                    // the next pass overwrite the current centroids while computing the
                    // new ones, and the convergence test would compare an array with itself.
                    centroids[i]    = newCentroids[i];
                    newCentroids[i] = new T[cols];
                }
            }while (true);


            // Compute the proportion of samples in each cluster
            for (int i = 0; i < k; i++)
            {
                proportions[i] = clusters[i].Count / (double)points.Length;
            }

            if (ComputeError)
            {
                // Compute the average error
                Error = Clusters.Distortion(points, labels);
            }

            // Return the classification result
            return(labels);
        }
예제 #7
0
        /// <summary>
        ///   Runs the clustering iteration proper: points are assigned to their nearest
        ///   centroid, the centroids are re-estimated as the mean of their members, and
        ///   the process repeats until the convergence test succeeds. The results are
        ///   written into the arrays supplied by the caller.
        /// </summary>
        ///
        /// <param name="data">The data points to be clustered.</param>
        /// <param name="threshold">The convergence threshold passed to the convergence test.</param>
        /// <param name="newCentroids">Work buffer that receives the re-estimated centroids
        ///   in each pass; it is cleared at the start of every pass.</param>
        /// <param name="count">Receives the number of points assigned to each cluster.</param>
        /// <param name="labels">Receives the cluster index assigned to each point.</param>
        /// <param name="centroids">The current centroids, overwritten in place after each pass.</param>
        ///
        protected virtual void PerformClustering(double[][] data, double threshold,
                                                 double[][] newCentroids, int[] count,
                                                 int[] labels, double[][] centroids)
        {
            // One lock object per cluster guards that cluster's accumulator
            Object[] gates = new Object[K];
            for (int c = 0; c < gates.Length; c++)
            {
                gates[c] = new Object();
            }

            bool done;
            do
            {
                // Start each pass from zeroed accumulators and counters
                for (int c = 0; c < newCentroids.Length; c++)
                {
                    Array.Clear(newCentroids[c], 0, newCentroids[c].Length);
                }
                Array.Clear(count, 0, count.Length);

                // Assignment step: label every point and add it to its cluster's sum
                Parallel.For(0, data.Length, index =>
                {
                    double[] sample = data[index];

                    int nearest = labels[index] = Clusters.Nearest(sample);

                    Interlocked.Increment(ref count[nearest]);

                    double[] accumulator = newCentroids[nearest];

                    lock (gates[nearest])
                    {
                        for (int d = 0; d < sample.Length; d++)
                        {
                            accumulator[d] += sample[d];
                        }
                    }
                });

                // Update step: turn the sums into means (empty clusters are left at zero)
                for (int c = 0; c < newCentroids.Length; c++)
                {
                    double size = count[c];

                    if (size == 0)
                    {
                        continue;
                    }

                    for (int d = 0; d < newCentroids[c].Length; d++)
                    {
                        newCentroids[c][d] /= size;
                    }
                }

                // Test for convergence before the new values overwrite the current ones
                done = converged(centroids, newCentroids, threshold);

                // Copy the new centroid values into the current centroids
                for (int c = 0; c < centroids.Length; c++)
                {
                    double[] target = centroids[c];
                    double[] source = newCentroids[c];

                    for (int d = 0; d < target.Length; d++)
                    {
                        target[d] = source[d];
                    }
                }
            }
            while (!done);
        }
예제 #8
0
        /// <summary>
        ///   Divides the input data into K clusters, using the mode
        ///   of each cluster's members as that cluster's centroid.
        /// </summary>
        ///
        /// <param name="points">The data points to be clustered.</param>
        /// <param name="threshold">The convergence threshold passed to the
        ///   convergence test. Must not be negative. Default is 1e-5.</param>
        ///
        /// <returns>The cluster index assigned to each point during the last pass.</returns>
        ///
        /// <exception cref="ArgumentNullException"><paramref name="points"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="threshold"/> is negative.</exception>
        ///
        public int[] Compute(TData[] points, double threshold = 1e-5)
        {
            // Validate the arguments before doing any work
            if (points == null)
            {
                throw new ArgumentNullException("points");
            }

            if (threshold < 0)
            {
                throw new ArgumentException("Threshold should be a positive number.", "threshold");
            }

            int clusterCount = this.K;
            int sampleCount = points.Length;

            // A null first centroid means no initialization has happened yet
            if (Clusters.Centroids[0] == null)
            {
                Randomize(points);
            }

            // Working buffers
            int[] labels = new int[sampleCount];
            double[] shares = Clusters.Proportions;
            TData[] current = Clusters.Centroids;
            TData[] updated = new TData[clusterCount];

            List<TData>[] members = new List<TData>[clusterCount];
            for (int c = 0; c < clusterCount; c++)
            {
                members[c] = new List<TData>();
            }

            bool finished;
            do
            {
                // Empty the member lists left over from the previous pass
                for (int c = 0; c < clusterCount; c++)
                {
                    members[c].Clear();
                }

                // Assignment step: place every point in the list of its nearest cluster
                for (int index = 0; index < points.Length; index++)
                {
                    TData sample = points[index];

                    int nearest = labels[index] = Clusters.Nearest(points[index]);

                    members[nearest].Add(sample);
                }

                // Update step: the new centroid of each cluster is the mode of its members
                for (int c = 0; c < clusterCount; c++)
                {
                    updated[c] = Accord.Statistics.Tools.Mode<TData>(members[c].ToArray());
                }

                // The current centroids are replaced only when another pass is required
                finished = converged(current, updated, threshold);

                if (!finished)
                {
                    for (int c = 0; c < current.Length; c++)
                    {
                        current[c] = updated[c];
                    }
                }
            }
            while (!finished);

            // Fraction of the samples that ended up in each cluster
            for (int c = 0; c < clusterCount; c++)
            {
                shares[c] = members[c].Count / (double)points.Length;
            }

            return labels;
        }
예제 #9
0
            /// <summary>
            ///   Runs the K-Means iteration sequentially over the given data: every point
            ///   is assigned to its nearest centroid and the centroids are re-estimated as
            ///   the mean of their members until the convergence test succeeds.
            /// </summary>
            ///
            /// <param name="data">The data points to be clustered.</param>
            /// <param name="threshold">The convergence threshold passed to the
            ///   convergence test. Must not be negative. Default is 1e-5.</param>
            /// <param name="computeInformation">When <c>true</c>, the covariance and the
            ///   proportion of each cluster are computed after the iteration finishes.
            ///   Default is true.</param>
            ///
            /// <returns>The cluster index assigned to each point during the last pass.</returns>
            ///
            public int[] Compute(double[][] data, double threshold = 1e-5, bool computeInformation = true)
            {
                // Validate the arguments before doing any work
                if (data == null)
                {
                    throw new ArgumentNullException("data");
                }
                if (data.Length < K)
                {
                    throw new ArgumentException("Not enough points. There should be more points than the number K of clusters.");
                }
                if (threshold < 0)
                {
                    throw new ArgumentException("Threshold should be a positive number.", "threshold");
                }

                int clusterCount = this.K;
                int sampleCount = data.Length;
                int dimensions = data[0].Length;

                // A null first centroid means no initialization has happened yet
                if (this.Clusters.Centroids[0] == null)
                {
                    Randomize(data, useSeeding: false);
                }

                // Working buffers
                int[] members = new int[clusterCount];
                int[] labels = new int[sampleCount];
                double[][] current = clusters.Centroids;
                double[][] updated = new double[clusterCount][];
                for (int c = 0; c < updated.Length; c++)
                {
                    updated[c] = new double[dimensions];
                }

                double[][,] spreads = clusters.Covariances;
                double[] shares = clusters.Proportions;

                bool done;
                do
                {
                    // Start each pass from zeroed accumulators and counters
                    for (int c = 0; c < updated.Length; c++)
                    {
                        Array.Clear(updated[c], 0, updated[c].Length);
                    }
                    Array.Clear(members, 0, members.Length);

                    // Assignment step: label every point and add it to its cluster's sum
                    for (int index = 0; index < data.Length; index++)
                    {
                        double[] sample = data[index];

                        int nearest = labels[index] = Clusters.Nearest(sample);

                        members[nearest]++;

                        for (int d = 0; d < sample.Length; d++)
                        {
                            updated[nearest][d] += sample[d];
                        }
                    }

                    // Update step: turn the sums into means (empty clusters are left at zero)
                    for (int c = 0; c < updated.Length; c++)
                    {
                        double size = members[c];

                        if (size == 0)
                        {
                            continue;
                        }

                        for (int d = 0; d < updated[c].Length; d++)
                        {
                            updated[c][d] /= size;
                        }
                    }

                    // Test for convergence before the new values overwrite the current ones
                    done = converged(current, updated, threshold);

                    // Copy the new centroid values into the current centroids
                    for (int c = 0; c < current.Length; c++)
                    {
                        double[] target = current[c];
                        double[] source = updated[c];

                        for (int d = 0; d < target.Length; d++)
                        {
                            target[d] = source[d];
                        }
                    }
                }
                while (!done);

                if (computeInformation)
                {
                    for (int c = 0; c < current.Length; c++)
                    {
                        // Gather the points that were labelled with this cluster
                        double[][] subset = data.Submatrix(labels.Find(x => x == c));

                        if (subset.Length > 0)
                        {
                            spreads[c] = Accord.Statistics.Tools.Covariance(subset, current[c]);
                        }
                        else
                        {
                            // No members: store an all-zero matrix
                            spreads[c] = new double[dimensions, dimensions];
                        }

                        // Fraction of the samples that ended up in this cluster
                        shares[c] = (double)subset.Length / data.Length;
                    }
                }

                clusters.Centroids = current;

                return labels;
            }
예제 #10
0
        /// <summary>
        ///   Runs the weighted K-Means iteration: every point is assigned to its nearest
        ///   centroid and each centroid is re-estimated as the weighted mean of its
        ///   members until the convergence test succeeds.
        /// </summary>
        ///
        /// <param name="data">The data points to be clustered.</param>
        /// <param name="weights">The weight of each data sample; each point contributes
        ///   to its cluster's sum and counter scaled by its weight.</param>
        /// <param name="weightSum">The total sum of the weights in <paramref name="weights"/>,
        ///   used to normalize the cluster proportions.</param>
        ///
        /// <returns>The cluster index assigned to each point during the last pass.</returns>
        ///
        protected virtual int[] Compute(double[][] data, double[] weights, double weightSum)
        {
            this.Iterations = 0;

            // TODO: Implement a faster version using the triangle
            // inequality to reduce the number of distance calculations
            //
            //  - http://www-cse.ucsd.edu/~elkan/kmeansicml03.pdf
            //  - http://mloss.org/software/view/48/
            //

            int clusterCount = this.K;
            int sampleCount = data.Length;
            int dimensions = data[0].Length;

            // A null first centroid means no initialization has happened yet
            if (this.Clusters.Centroids[0] == null)
            {
                Randomize(data);
            }

            // Working buffers
            int[] labels = new int[sampleCount];
            double[] mass = new double[clusterCount];
            double[][] current = clusters.Centroids;
            double[][] updated = new double[clusterCount][];
            for (int c = 0; c < updated.Length; c++)
            {
                updated[c] = new double[dimensions];
            }

            // One lock object per cluster guards that cluster's accumulator and counter
            Object[] gates = new Object[K];
            for (int c = 0; c < gates.Length; c++)
            {
                gates[c] = new Object();
            }

            // Reset the iteration counter again once initialization is done
            Iterations = 0;

            bool done;
            do
            {
                // Start each pass from zeroed counters and accumulators
                Array.Clear(mass, 0, mass.Length);
                for (int c = 0; c < updated.Length; c++)
                {
                    Array.Clear(updated[c], 0, updated[c].Length);
                }

                // Assignment step: label every point and add its weighted
                // contribution to the sum and counter of its cluster
                Parallel.For(0, data.Length, index =>
                {
                    double[] sample = data[index];
                    double importance = weights[index];

                    int nearest = labels[index] = Clusters.Nearest(sample);

                    double[] accumulator = updated[nearest];

                    lock (gates[nearest])
                    {
                        mass[nearest] += importance;

                        for (int d = 0; d < sample.Length; d++)
                        {
                            accumulator[d] += sample[d] * importance;
                        }
                    }
                });

                // Update step: divide each sum by the accumulated weight of its
                // cluster (clusters without positive weight are left at zero)
                Parallel.For(0, updated.Length, c =>
                {
                    double total = mass[c];

                    if (total > 0)
                    {
                        for (int d = 0; d < updated[c].Length; d++)
                        {
                            updated[c][d] /= total;
                        }
                    }
                });

                // Test for convergence before the new values overwrite the current ones
                done = converged(current, updated);

                // Copy the new centroid values into the current centroids
                Parallel.For(0, current.Length, c =>
                {
                    for (int d = 0; d < current[c].Length; d++)
                    {
                        current[c][d] = updated[c][d];
                    }
                });
            }
            while (!done);

            // Share of the total weight that ended up in each cluster
            for (int c = 0; c < clusters.Centroids.Length; c++)
            {
                clusters.Proportions[c] = mass[c] / weightSum;
            }

            return labels;
        }