Exemple #1
0
        /// <summary>
        /// Learns a model that can map the given inputs to the desired outputs.
        /// </summary>
        /// <remarks>
        /// Each pass assigns every point to the cluster returned by <c>Clusters.Decide</c>
        /// and then rebuilds every centroid from the per-dimension mode of its members.
        /// The loop ends as soon as <c>converged</c> returns <c>true</c>.
        /// </remarks>
        /// <param name="x">The model inputs.</param>
        /// <param name="weights">The weight of importance for each input sample.
        /// This method does not read this argument.</param>
        /// <returns>A model that has learned how to produce suitable outputs
        /// given the input data <paramref name="x" />.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="x"/> is null.</exception>
        /// <exception cref="ArgumentException">Not enough points. There should be more points than the number K of clusters.</exception>
        public KModesClusterCollection <T> Learn(T[][] x, double[] weights = null)
        {
            // Initial argument checking
            if (x == null)
            {
                // Report the actual parameter name (it used to say "points").
                throw new ArgumentNullException("x");
            }

            if (x.Length < K)
            {
                throw new ArgumentException("Not enough points. There should be more points than the number K of clusters.");
            }

            int k    = this.K;
            int rows = x.Length;
            int cols = x[0].Length;

            // Perform a random initialization of the clusters
            // if the algorithm has not been initialized before.
            //
            if (this.Clusters.Centroids[0] == null)
            {
                Clusters.Randomize(x);
            }

            // Initial variables
            int[]    labels       = new int[rows];
            double[] proportions  = Clusters.Proportions;
            T[][]    centroids    = Clusters.Centroids;
            T[][]    newCentroids = new T[k][];
            for (int i = 0; i < newCentroids.Length; i++)
            {
                newCentroids[i] = new T[cols];
            }

            // One bag of member points per cluster, refilled on every pass.
            var clusters = new ConcurrentBag <T[]> [k];

            this.Iterations = 0;

            do // Main loop
            {
                // Start every pass with empty member bags
                for (int i = 0; i < k; i++)
                {
                    clusters[i] = new ConcurrentBag <T[]>();
                }

                // First we will distribute the data points into the
                // bag of their nearest cluster, remembering the chosen
                // cluster index of each point in the labels array.
                Parallel.For(0, x.Length, ParallelOptions, i =>
                {
                    // Get the point
                    T[] point = x[i];

                    // Compute the nearest cluster centroid
                    int c = labels[i] = Clusters.Decide(point);

                    // Store the point in the corresponding cluster
                    clusters[c].Add(point);
                });

                // Next we will compute each cluster's new centroid
                //  value by computing the mode in each cluster.
                Parallel.For(0, k, ParallelOptions, i =>
                {
                    if (clusters[i].Count == 0)
                    {
                        // An empty cluster keeps its current centroid. The values are
                        // copied into a separate array: sharing the reference would make
                        // later passes overwrite the live centroid while it is being
                        // compared against its own replacement.
                        newCentroids[i] = (T[])centroids[i].Clone();
                    }
                    else
                    {
                        // Transposing gives one array per dimension, holding
                        // that dimension's value for every member of the cluster.
                        T[][] p = Matrix.Transpose <ConcurrentBag <T[]>, T>(clusters[i]);

                        // For each dimension
                        for (int d = 0; d < this.Dimension; d++)
                        {
                            newCentroids[i][d] = p[d].Mode(alreadySorted: false, inPlace: true);
                        }
                    }
                });


                // Stop as soon as the convergence test accepts the new centroids.
                if (converged(centroids, newCentroids))
                {
                    break;
                }


                // go to next generation: copy the values instead of the references so
                // that the previous and the next generation never share an array,
                // otherwise the convergence test would compare an array with itself.
                for (int i = 0; i < centroids.Length; i++)
                {
                    centroids[i] = (T[])newCentroids[i].Clone();
                }
            }while (true);


            // Compute cluster information (optional)
            for (int i = 0; i < k; i++)
            {
                // Compute the proportion of samples in the cluster
                proportions[i] = clusters[i].Count / (double)x.Length;
            }

            if (ComputeError)
            {
                // Compute the average error
                Error = Clusters.Distortion(x, labels);
            }

            Accord.Diagnostics.Debug.Assert(Clusters.NumberOfClasses == K);
            Accord.Diagnostics.Debug.Assert(Clusters.NumberOfOutputs == K);
            Accord.Diagnostics.Debug.Assert(Clusters.NumberOfInputs == x[0].Length);

            // Return the classification result
            return(Clusters);
        }
Exemple #2
0
        /// <summary>
        ///   Divides the input data into K clusters.
        /// </summary>
        ///
        /// <param name="data">The data where to compute the algorithm.</param>
        /// <param name="weights">The weight to consider for each data sample. This is used in weighted K-Means</param>
        /// <param name="weightSum">The total sum of the weights in <paramref name="weights"/>.</param>
        ///
        /// <returns>The cluster index assigned to each sample of <paramref name="data"/>
        ///   during the last pass of the main loop.</returns>
        ///
        private int[] Compute(double[][] data, double[] weights, double weightSum)
        {
            this.Iterations = 0;

            // TODO: Implement a faster version using the triangle
            // inequality to reduce the number of distance calculations
            //
            //  - http://www-cse.ucsd.edu/~elkan/kmeansicml03.pdf
            //  - http://mloss.org/software/view/48/
            //

            int k    = this.K;
            int rows = data.Length;
            int cols = data[0].Length;

            // Perform a random initialization of the clusters
            // if the algorithm has not been initialized before.
            //
            if (this.Clusters.Centroids[0] == null)
            {
                Randomize(data);
            }

            // Initial variables
            int[]      labels       = new int[rows];
            double[]   count        = new double[k];
            double[][] centroids    = clusters.Centroids;
            double[][] newCentroids = new double[k][];
            for (int i = 0; i < newCentroids.Length; i++)
            {
                newCentroids[i] = new double[cols];
            }

            // One lock object per cluster, so that threads only contend
            // when they accumulate into the same cluster.
            Object[] syncObjects = new Object[K];
            for (int i = 0; i < syncObjects.Length; i++)
            {
                syncObjects[i] = new Object();
            }

            bool shouldStop = false;

            while (!shouldStop) // Main loop
            {
                // Reset the accumulators from the previous pass
                Array.Clear(count, 0, count.Length);
                for (int i = 0; i < newCentroids.Length; i++)
                {
                    Array.Clear(newCentroids[i], 0, newCentroids[i].Length);
                }

                // First we will accumulate the data points
                // into their nearest clusters, storing the
                // weighted sums into the newCentroids variable.

                // For each point in the data set,
                Parallel.For(0, data.Length, ParallelOptions, i =>
                {
                    // Get the point
                    double[] point = data[i];
                    double weight  = weights[i];

                    // Get the nearest cluster centroid
                    int c = labels[i] = Clusters.Decide(point);

                    // Get the accumulator of that cluster
                    double[] centroid = newCentroids[c];

                    lock (syncObjects[c])
                    {
                        // Increase the cluster's sample counter
                        count[c] += weight;

                        // Accumulate in the cluster centroid
                        for (int j = 0; j < point.Length; j++)
                        {
                            centroid[j] += point[j] * weight;
                        }
                    }
                });

                // Next we will compute each cluster's new centroid
                //  by dividing the accumulated sums by the total weight
                //  of the samples in each cluster, thus averaging its members.
                Parallel.For(0, newCentroids.Length, ParallelOptions, i =>
                {
                    double sum = count[i];

                    if (sum > 0)
                    {
                        for (int j = 0; j < newCentroids[i].Length; j++)
                        {
                            newCentroids[i][j] /= sum;
                        }
                    }
                    else
                    {
                        // Nothing was accumulated for this cluster. Keep its previous
                        // centroid instead of the cleared (all-zero) accumulator, which
                        // would silently move the centroid to the origin.
                        for (int j = 0; j < centroids[i].Length; j++)
                        {
                            newCentroids[i][j] = centroids[i][j];
                        }
                    }
                });

                // Ask the convergence test whether another pass is needed.
                shouldStop = converged(centroids, newCentroids);

                // go to next generation (values are copied, the arrays stay distinct)
                Parallel.For(0, centroids.Length, ParallelOptions, i =>
                {
                    for (int j = 0; j < centroids[i].Length; j++)
                    {
                        centroids[i][j] = newCentroids[i][j];
                    }
                });
            }

            for (int i = 0; i < clusters.Centroids.Length; i++)
            {
                // Compute the proportion of samples in the cluster
                clusters.Proportions[i] = count[i] / weightSum;
            }

            ComputeInformation(data, labels);

            return(labels);
        }
Exemple #3
0
        /// <summary>
        /// Learns a model that can map the given inputs to the desired outputs.
        /// </summary>
        /// <remarks>
        /// Starts with all points in a single cluster and repeatedly splits the cluster
        /// with the largest distortion in two using a 2-cluster <c>KMeans</c>, until
        /// <c>Clusters.Count</c> clusters have been produced.
        /// </remarks>
        /// <param name="x">The model inputs.</param>
        /// <param name="weights">The weight of importance for each input sample. When null,
        /// a vector of ones is used. In this method the weights are only validated.</param>
        /// <returns>A model that has learned how to produce suitable outputs
        /// given the input data <paramref name="x" />.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="x"/> is null.</exception>
        /// <exception cref="ArgumentException">There are fewer points than clusters, the weights
        /// vector has a different length than <paramref name="x"/>, or the weights do not
        /// sum to a positive value.</exception>
        /// <exception cref="DimensionMismatchException">The rows of <paramref name="x"/>
        /// do not all have the same length.</exception>
        public override KMeansClusterCollection Learn(double[][] x, double[] weights = null)
        {
            // Initial argument checking
            if (x == null)
            {
                throw new ArgumentNullException("x");
            }

            if (x.Length < K)
            {
                throw new ArgumentException("Not enough points. There should be more points than the number K of clusters.");
            }

            if (weights == null)
            {
                weights = Vector.Ones(x.Length);
            }

            if (x.Length != weights.Length)
            {
                throw new ArgumentException("Data weights vector must be the same length as data samples.");
            }

            double weightSum = weights.Sum();

            if (weightSum <= 0)
            {
                // This used to reuse the unrelated "Not enough points" message.
                throw new ArgumentException("The sum of the sample weights must be positive.");
            }

            int cols = x.Columns();

            for (int i = 0; i < x.Length; i++)
            {
                if (x[i].Length != cols)
                {
                    // Fill in the offending row index; the message used to carry a literal "{}".
                    throw new DimensionMismatchException("x", String.Format(
                        "The points matrix should be rectangular. The vector at position {0} has a different length than previous ones.", i));
                }
            }

            int k = Clusters.Count;

            // Helper learner used to split one cluster into two sub-clusters.
            KMeans kmeans = new KMeans(2)
            {
                Distance           = (IDistance <double[]>)Clusters.Distance,
                ComputeError       = false,
                ComputeCovariances = false,
                UseSeeding         = UseSeeding,
                Tolerance          = Tolerance,
                MaxIterations      = MaxIterations,
            };

            var centroids   = Clusters.Centroids;
            var clusters    = new double[k][][];
            var distortions = new double[k];

            // 1. Start with all data points in one cluster
            clusters[0] = x;

            // 2. Repeat steps 3 to 6 (k-1) times to obtain K centroids
            for (int current = 1; current < k; current++)
            {
                // 3. Choose cluster with largest distortion
                //    (only the first `current` entries are populated so far)
                int chosen; distortions.Max(current, out chosen);

                // 4. Split cluster into two sub-clusters
                var splits = split(clusters[chosen], kmeans);

                clusters[chosen]  = splits.Item1;
                clusters[current] = splits.Item2;

                // 5. Replace chosen centroid and add a new one
                centroids[chosen]  = kmeans.Clusters.Centroids[0];
                centroids[current] = kmeans.Clusters.Centroids[1];

                // Recompute distortions for the updated clusters
                distortions[chosen]  = kmeans.Clusters[0].Distortion(clusters[chosen]);
                distortions[current] = kmeans.Clusters[1].Distortion(clusters[current]);

                // 6. Increment cluster count (current = current + 1)
            }

            Clusters.NumberOfInputs = cols;

            Accord.Diagnostics.Debug.Assert(Clusters.NumberOfClasses == K);
            Accord.Diagnostics.Debug.Assert(Clusters.NumberOfOutputs == K);
            Accord.Diagnostics.Debug.Assert(Clusters.NumberOfInputs == x[0].Length);

            if (ComputeProportions)
            {
                // Proportions are the normalized histogram of the final assignments.
                int[] y      = Clusters.Decide(x);
                int[] counts = y.Histogram();
                counts.Divide(y.Length, result: Clusters.Proportions);

                ComputeInformation(x, y);
            }
            else
            {
                ComputeInformation(x);
            }

            return(Clusters);
        }
Exemple #4
0
        /// <summary>
        /// Learns a model that can map the given inputs to the desired outputs.
        /// </summary>
        /// <remarks>
        /// When a single medoid is requested (K == 1) the point with the smallest total
        /// distance to all points is selected directly; otherwise the work is delegated
        /// to <c>Compute</c>.
        /// </remarks>
        /// <param name="x">The model inputs.</param>
        /// <param name="weights">The weight of importance for each input sample.
        /// This method does not read this argument.</param>
        /// <returns>A model that has learned how to produce suitable outputs
        /// given the input data <paramref name="x" />.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="x"/> is null.</exception>
        /// <exception cref="ArgumentException">Not enough points. There should be more points than the number K of clusters.</exception>
        public KMedoidsClusterCollection <T> Learn(T[][] x, double[] weights = null)
        {
            // Initial argument checking
            if (x == null)
            {
                // Report the actual parameter name (it used to say "points").
                throw new ArgumentNullException("x");
            }

            if (x.Length < K)
            {
                throw new ArgumentException("Not enough points. There should be more points than the number K of clusters.");
            }

            // Perform initialization of the clusters
            int[] currentMedoidIndicesArray = Clusters.Randomize(x, Initialization, ParallelOptions);

            // Detect initial medoid indices: when the initialization did not return
            // them, look for the points lying at zero distance from each centroid.
            if (currentMedoidIndicesArray == null)
            {
                currentMedoidIndicesArray = Vector.Create(value: -1, size: K);
                Parallel.For(0, x.Length, ParallelOptions, i =>
                {
                    for (int j = 0; j < Clusters.Centroids.Length; j++)
                    {
                        if (Distance.Distance(Clusters.Centroids[j], x[i]) == 0)
                        {
                            // Claim slot j only if it is still unassigned (-1); any other
                            // previous value means a second point matched the same medoid.
                            int prev = Interlocked.CompareExchange(ref currentMedoidIndicesArray[j], i, -1);
                            if (prev != -1)
                            {
                                throw new Exception("Duplicate medoid #{0} detected: {1} and {2}".Format(j, prev, i));
                            }
                            break;
                        }
                    }
                });
            }

            for (int i = 0; i < currentMedoidIndicesArray.Length; ++i)
            {
                if (currentMedoidIndicesArray[i] == -1)
                {
                    throw new Exception("Medoid #{0} not found.".Format(i));
                }
            }



            Iterations = 0;

            int[] labels = new int[x.Length];

            // Special case - one medoid.
            if (K == 1)
            {
                // Arrange point with minimal total cost as medoid.
                int    imin = -1;
                double min  = Double.PositiveInfinity;
                for (int i = 0; i < x.Length; i++)
                {
                    double cost = 0.0;
                    for (int j = 0; j < x.Length; j++)
                    {
                        cost += Distance.Distance(x[i], x[j]);
                    }
                    if (cost < min)
                    {
                        // Remember the best cost as well as its index. Without this
                        // update every finite cost beats the initial infinity and
                        // the last point would always be selected.
                        min  = cost;
                        imin = i;
                    }
                }

                Clusters.Centroids[0] = x[imin];
            }
            else
            {
                Compute(x, labels, currentMedoidIndicesArray);
            }

            // Miscellaneous final computations
            if (ComputeError)
            {
                // Compute the average error
#if DEBUG
                // Sanity check: the labels must agree with the final model's decisions.
                var expected = Clusters.Decide(x);
                if (!expected.IsEqual(labels))
                {
                    throw new Exception();
                }
#endif

                Error = Clusters.Distortion(x, labels);
            }

            Accord.Diagnostics.Debug.Assert(Clusters.NumberOfClasses == K);
            Accord.Diagnostics.Debug.Assert(Clusters.NumberOfOutputs == K);
            Accord.Diagnostics.Debug.Assert(Clusters.NumberOfInputs == x[0].Length);

            // Return the classification result
            return(Clusters);
        }
        /// <summary>
        /// Learns a model that can map the given inputs to the desired outputs.
        /// </summary>
        /// <remarks>
        /// Every pass draws a batch of point indices with <c>MakeBatch</c>, assigns those
        /// points to their nearest centroid and moves each affected centroid towards its
        /// batch points with a step of 1 / (number of points the cluster has received so far).
        /// </remarks>
        /// <param name="x">The model inputs.</param>
        /// <param name="weights">The weight of importance for each input sample.
        /// When null, a vector of ones is used.</param>
        /// <returns>A model that has learned how to produce suitable outputs
        /// given the input data <paramref name="x" />.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="x"/> is null.</exception>
        /// <exception cref="ArgumentException">There are fewer points than clusters, the weights
        /// do not sum to a positive value, or the batch is larger than the data set.</exception>
        /// <exception cref="DimensionMismatchException">The weights vector has a different length
        /// than <paramref name="x"/>, or <paramref name="x"/> is not rectangular.</exception>
        public override KMeansClusterCollection Learn(double[][] x, double[] weights = null)
        {
            if (x == null)
            {
                throw new ArgumentNullException("x", "No data supplied to Mini-Batch K-Means. The parameter x cannot be null.");
            }

            if (x.Length < K)
            {
                // ArgumentException takes (message, paramName), the reverse of
                // ArgumentNullException; the two arguments used to be swapped.
                throw new ArgumentException("Not enough points. There should be more points than the number K of clusters.", "x");
            }

            if (weights == null)
            {
                weights = Vector.Ones(x.Length);
            }
            else if (x.Length != weights.Length)
            {
                throw new DimensionMismatchException("weights", "Data weights vector must be the same length as data samples.");
            }

            double weightSum = weights.Sum();

            if (weightSum <= 0)
            {
                // This used to reuse the unrelated "Not enough points" message.
                throw new ArgumentException("The sum of the sample weights must be positive.", "weights");
            }

            if (!x.IsRectangular())
            {
                // The old message carried a literal "{}" placeholder that was never filled in.
                throw new DimensionMismatchException("x", "The points matrix should be rectangular. All vectors must have the same length.");
            }

            if (this.batchSize > x.Length)
            {
                throw new ArgumentException("Not enough points. There should be more points in the dataset than in a batch. ");
            }

            // Pick the default initialization batch size first and clamp afterwards,
            // so that the default itself can never exceed the number of points.
            if (this.initializationBatchSize.HasValue == false)
            {
                this.InitializationBatchSize = 3 * this.K;
            }

            if (this.InitializationBatchSize > x.Length)
            {
                this.InitializationBatchSize = x.Length;
            }

            int k    = this.K;
            int rows = x.Length;
            int cols = x[0].Length;

            // Initial variables (count starts at zero for every cluster)
            int[]      labels       = new int[rows];
            double[]   count        = new double[k];
            double[][] centroids    = Clusters.Centroids;
            double[][] newCentroids = Jagged.Zeros(k, cols);

            Iterations = 0;

            bool shouldStop = false;

            // NOTE(review): Clusters.Centroids was already read above, so this test
            // looks like it is always true - confirm the intended condition.
            if (Clusters.Centroids != null)
            {
                InitializeCentroids(x, weights);
            }

            while (!shouldStop) // Main loop
            {
                Iterations = Iterations + 1;

                // Getting indices of the points in this iteration's batch
                int[] batchIndices = MakeBatch(rows, this.batchSize);

                foreach (int index in batchIndices)
                {
                    // Caching the center nearest to the point x[index]
                    // and storing it in labels[index].
                    labels[index] = Clusters.Decide(x[index]);
                }

                // The centroids from the previous iteration will remain in the variable centroids.
                // The refined centroids will be stored in the variable newCentroids on which we are going to operate
                // in this iteration.
                for (int i = 0; i < centroids.Length; i++)
                {
                    for (int j = 0; j < centroids[i].Length; j++)
                    {
                        newCentroids[i][j] = centroids[i][j];
                    }
                }

                // Updating the centroids.
                foreach (int pointIndex in batchIndices)
                {
                    double[] point        = x[pointIndex];
                    int      clusterIndex = labels[pointIndex];
                    double[] target       = newCentroids[clusterIndex];

                    // The step size shrinks as the cluster receives more points.
                    count[clusterIndex]++;
                    double eta = 1.0 / count[clusterIndex];

                    // Gradient step.
                    for (int i = 0; i < target.Length; i++)
                    {
                        target[i] = (1.0 - eta) * target[i] + eta * point[i] * weights[pointIndex];
                    }
                }

                // Ask the convergence test whether another pass is needed.
                shouldStop = converged(centroids, newCentroids);

                // Copying the refined centroids
                // from the variable newCentroids to the variable centroids.
                for (int i = 0; i < centroids.Length; i++)
                {
                    for (int j = 0; j < centroids[i].Length; j++)
                    {
                        centroids[i][j] = newCentroids[i][j];
                    }
                }
            }
            // NOTE(review): only the points drawn into a batch have had their label
            // computed; all others still hold the default 0 - decide for every point in x?

            for (int i = 0; i < Clusters.Centroids.Length; i++)
            {
                // Computing the proportion of samples in the cluster.
                // NOTE(review): count accumulates over every batch of every iteration,
                // so these values are not guaranteed to sum to one - confirm intent.
                Clusters.Proportions[i] = count[i] / weightSum;
            }

            this.Labels = labels;

            ComputeInformation(x, labels);

            Accord.Diagnostics.Debug.Assert(Clusters.NumberOfClasses == K);
            Accord.Diagnostics.Debug.Assert(Clusters.NumberOfOutputs == K);
            Accord.Diagnostics.Debug.Assert(Clusters.NumberOfInputs == x[0].Length);

            return(Clusters);
        }