Ejemplo n.º 1
0
        /// <summary>
        /// Initializes the centroids by drawing several candidate centroid sets
        /// and keeping the one with the lowest distortion on a validation batch.
        /// </summary>
        /// <param name="data">The model inputs.</param>
        /// <param name="weights">The weight of importance for each input sample.</param>
        private void InitializeCentroids(double[][] data, double[] weights)
        {
            int rows = data.Length;
            int cols = data[0].Length;
            int batchSize = this.InitializationBatchSize.Value;

            // indices of the points the centroids are drawn from
            int[] centroidIndices = MakeBatch(rows, batchSize);

            // indices of the points in the validation batch
            int[] validationIndices = MakeBatch(rows, batchSize);

            // the actual candidate points for the centroids
            double[][] centroidBatch = new double[batchSize][];

            // the actual points in the validation batch
            double[][] validationBatch = new double[batchSize][];

            // the weights of the validation points
            double[] validationWeights = new double[batchSize];

            // Best candidate found so far; null means "none recorded yet".
            // (Previously this was pre-allocated with numberOfInitializations
            // rows, which is unrelated to the number of centroids and ended up
            // being assigned to Clusters.Centroids when the loop never ran.)
            double minDistortion = Double.PositiveInfinity;
            double[][] bestCentroids = null;

            for (int i = 0; i < this.numberOfInitializations; i++)
            {
                for (int j = 0; j < batchSize; j++)
                {
                    // Copying points to batches
                    int centroidIndex = centroidIndices[j];
                    int validationIndex = validationIndices[j];
                    centroidBatch[j] = new double[cols];
                    validationBatch[j] = new double[cols];
                    data[centroidIndex].CopyTo(centroidBatch[j]);
                    data[validationIndex].CopyTo(validationBatch[j]);
                    validationWeights[j] = weights[validationIndex];
                }

                // Computing distortion of the current centroid set.
                Clusters.Randomize(centroidBatch, UseSeeding);
                double distortion = Clusters.Distortion(validationBatch, weights: validationWeights);

                // The very first candidate is always recorded; later ones
                // only when they are strictly better than the best so far.
                if (bestCentroids == null || distortion < minDistortion)
                {
                    minDistortion = distortion;

                    // Take a snapshot instead of keeping a reference to
                    // Clusters.Centroids. NOTE(review): Randomize is not
                    // visible here; if it rewrites the existing centroid
                    // arrays in place, a plain reference would silently
                    // turn into the last candidate rather than the best.
                    double[][] current = Clusters.Centroids;
                    double[][] snapshot = new double[current.Length][];
                    for (int c = 0; c < current.Length; c++)
                    {
                        snapshot[c] = current[c] == null ? null : (double[])current[c].Clone();
                    }

                    bestCentroids = snapshot;
                }
            }

            // Setting the initial centroids to the best found set. When no
            // initialization round ran, the existing centroids are left as is.
            if (bestCentroids != null)
            {
                Clusters.Centroids = bestCentroids;
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Learns a model that can map the given inputs to the desired outputs.
        /// </summary>
        /// <param name="x">The model inputs.</param>
        /// <param name="weights">The weight of importance for each input sample.
        /// This argument is not read anywhere in this method.</param>
        /// <returns>A model that has learned how to produce suitable outputs
        /// given the input data <paramref name="x" />.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="x"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="x"/> has fewer rows than the number K of clusters.</exception>
        public KModesClusterCollection<T> Learn(T[][] x, double[] weights = null)
        {
            // Initial argument checking
            if (x == null)
            {
                // Report the actual parameter name (it used to say "points").
                throw new ArgumentNullException("x");
            }

            if (x.Length < K)
            {
                throw new ArgumentException("Not enough points. There should be more points than the number K of clusters.");
            }

            int k = this.K;
            int rows = x.Length;
            int cols = x[0].Length;

            // Perform a random initialization of the clusters
            // if the algorithm has not been initialized before.
            if (this.Clusters.Centroids[0] == null)
            {
                Clusters.Randomize(x);
            }

            // Initial variables
            int[] labels = new int[rows];
            double[] proportions = Clusters.Proportions;
            T[][] centroids = Clusters.Centroids;
            T[][] newCentroids = new T[k][];
            for (int i = 0; i < newCentroids.Length; i++)
            {
                newCentroids[i] = new T[cols];
            }

            var clusters = new ConcurrentBag<T[]>[k];

            this.Iterations = 0;

            do // Main loop
            {
                // Start every pass with empty cluster member bags.
                for (int i = 0; i < k; i++)
                {
                    clusters[i] = new ConcurrentBag<T[]>();
                }

                // First we will accumulate the data points into their
                // nearest clusters, storing them in the clusters bags.
                Parallel.For(0, x.Length, ParallelOptions, i =>
                {
                    // Get the point
                    T[] point = x[i];

                    // Compute the nearest cluster centroid
                    int c = labels[i] = Clusters.Decide(point);

                    // Accumulate in the corresponding cluster
                    clusters[c].Add(point);
                });

                // Next we will compute each cluster's new centroid
                //  value by computing the mode in each cluster.
                Parallel.For(0, k, ParallelOptions, i =>
                {
                    if (clusters[i].Count == 0)
                    {
                        // Empty cluster: keep the previous centroid, but as a
                        // copy. Sharing the array would make later writes to
                        // newCentroids[i] change centroids[i] as well.
                        newCentroids[i] = (T[])centroids[i].Clone();
                    }
                    else
                    {
                        T[][] p = Matrix.Transpose<ConcurrentBag<T[]>, T>(clusters[i]);

                        // For each dimension
                        for (int d = 0; d < this.Dimension; d++)
                        {
                            newCentroids[i][d] = p[d].Mode(alreadySorted: false, inPlace: true);
                        }
                    }
                });

                // The algorithm stops when there is no further change in the
                //  centroids (relative difference is less than the threshold).
                if (converged(centroids, newCentroids))
                {
                    break;
                }

                // Go to next generation. The values are copied rather than the
                // references: assigning newCentroids[i] directly made both
                // arrays the same object, so from the second pass on the
                // convergence test compared every centroid with itself and
                // the loop always stopped, whether or not the centroids had
                // actually settled.
                for (int i = 0; i < centroids.Length; i++)
                {
                    centroids[i] = (T[])newCentroids[i].Clone();
                }
            }
            while (true);

            // Compute cluster information (optional)
            for (int i = 0; i < k; i++)
            {
                // Compute the proportion of samples in the cluster
                proportions[i] = clusters[i].Count / (double)x.Length;
            }

            if (ComputeError)
            {
                // Compute the average error
                Error = Clusters.Distortion(x, labels);
            }

            Accord.Diagnostics.Debug.Assert(Clusters.NumberOfClasses == K);
            Accord.Diagnostics.Debug.Assert(Clusters.NumberOfOutputs == K);
            Accord.Diagnostics.Debug.Assert(Clusters.NumberOfInputs == x[0].Length);

            // Return the classification result
            return Clusters;
        }
Ejemplo n.º 3
0
        /// <summary>
        ///   Divides the input data into K clusters.
        /// </summary>
        ///
        /// <param name="points">The data where to compute the algorithm.</param>
        ///
        /// <returns>The index of the cluster assigned to each row of
        /// <paramref name="points"/>, as computed in the last pass.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="points"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="points"/> has fewer rows than the number K of clusters.</exception>
        public int[] Compute(T[][] points)
        {
            // Initial argument checking
            if (points == null)
            {
                throw new ArgumentNullException("points");
            }

            if (points.Length < K)
            {
                throw new ArgumentException("Not enough points. There should be more points than the number K of clusters.");
            }

            int k = this.K;
            int rows = points.Length;
            int cols = points[0].Length;

            // Perform a random initialization of the clusters
            // if the algorithm has not been initialized before.
            if (this.Clusters.Centroids[0] == null)
            {
                Clusters.Randomize(points);
            }

            // Initial variables
            int[] labels = new int[rows];
            double[] proportions = Clusters.Proportions;
            T[][] centroids = Clusters.Centroids;
            T[][] newCentroids = new T[k][];
            for (int i = 0; i < newCentroids.Length; i++)
            {
                newCentroids[i] = new T[cols];
            }

            var clusters = new List<T[]>[k];
            for (int i = 0; i < k; i++)
            {
                clusters[i] = new List<T[]>();
            }

            Iterations = 0;

            do // Main loop
            {
                // Start every pass with empty cluster member lists.
                for (int i = 0; i < k; i++)
                {
                    clusters[i].Clear();
                }

                // First we will accumulate the data points into their
                // nearest clusters, storing them in the clusters lists.
                for (int i = 0; i < points.Length; i++)
                {
                    // Get the point
                    T[] point = points[i];

                    // Compute the nearest cluster centroid
                    int c = labels[i] = Clusters.Nearest(point);

                    // Accumulate in the corresponding cluster
                    clusters[c].Add(point);
                }

                // Next we will compute each cluster's new centroid
                //  value by computing the mode in each cluster.
                for (int i = 0; i < k; i++)
                {
                    if (clusters[i].Count == 0)
                    {
                        // Empty cluster: keep the previous centroid, but as a
                        // copy. Sharing the array would make later writes to
                        // newCentroids[i] change centroids[i] as well.
                        newCentroids[i] = (T[])centroids[i].Clone();
                        continue;
                    }

                    T[][] p = clusters[i].ToArray();

                    // For each dimension
                    for (int d = 0; d < Dimension; d++)
                    {
                        T[] values = p.GetColumn(d);

                        newCentroids[i][d] = values.Mode(alreadySorted: false, inPlace: true);
                    }
                }

                // The algorithm stops when there is no further change in the
                //  centroids (relative difference is less than the threshold).
                if (converged(centroids, newCentroids))
                {
                    break;
                }

                // Go to next generation. The values are copied rather than the
                // references: assigning newCentroids[i] directly made both
                // arrays the same object, so from the second pass on the
                // convergence test compared every centroid with itself and
                // the loop always stopped, whether or not the centroids had
                // actually settled.
                for (int i = 0; i < centroids.Length; i++)
                {
                    centroids[i] = (T[])newCentroids[i].Clone();
                }
            }
            while (true);

            // Compute cluster information (optional)
            for (int i = 0; i < k; i++)
            {
                // Compute the proportion of samples in the cluster
                proportions[i] = clusters[i].Count / (double)points.Length;
            }

            if (ComputeError)
            {
                // Compute the average error
                Error = Clusters.Distortion(points, labels);
            }

            // Return the classification result
            return labels;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Learns a model that can map the given inputs to the desired outputs.
        /// </summary>
        /// <param name="x">The model inputs.</param>
        /// <param name="weights">The weight of importance for each input sample.
        /// This argument is not read anywhere in this method.</param>
        /// <returns>A model that has learned how to produce suitable outputs
        /// given the input data <paramref name="x" />.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="x"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="x"/> has fewer rows than the number K of clusters.</exception>
        /// <exception cref="Exception">An initial medoid could not be matched to any
        /// input point, or (raised from within the parallel search loop) two input
        /// points matched the same medoid.</exception>
        public KMedoidsClusterCollection<T> Learn(T[][] x, double[] weights = null)
        {
            // Initial argument checking
            if (x == null)
            {
                // Report the actual parameter name (it used to say "points").
                throw new ArgumentNullException("x");
            }

            if (x.Length < K)
            {
                throw new ArgumentException("Not enough points. There should be more points than the number K of clusters.");
            }

            // Perform initialization of the clusters
            int[] currentMedoidIndicesArray = Clusters.Randomize(x, Initialization, ParallelOptions);

            // Randomize returned no indices: recover them by looking for the
            // input points that lie at distance zero from each centroid.
            if (currentMedoidIndicesArray == null)
            {
                currentMedoidIndicesArray = Vector.Create(value: -1, size: K);
                Parallel.For(0, x.Length, ParallelOptions, i =>
                {
                    for (int j = 0; j < Clusters.Centroids.Length; j++)
                    {
                        if (Distance.Distance(Clusters.Centroids[j], x[i]) == 0)
                        {
                            // Claim slot j atomically; a slot that was already
                            // claimed means two points coincide with medoid j.
                            int prev = Interlocked.CompareExchange(ref currentMedoidIndicesArray[j], i, -1);
                            if (prev != -1)
                            {
                                throw new Exception("Duplicate medoid #{0} detected: {1} and {2}".Format(j, prev, i));
                            }
                            break;
                        }
                    }
                });
            }

            // Every medoid must correspond to one of the input points.
            for (int i = 0; i < currentMedoidIndicesArray.Length; ++i)
            {
                if (currentMedoidIndicesArray[i] == -1)
                {
                    throw new Exception("Medoid #{0} not found.".Format(i));
                }
            }

            Iterations = 0;

            int[] labels = new int[x.Length];

            // Special case - one medoid.
            if (K == 1)
            {
                // Arrange point with minimal total cost as medoid.
                int imin = -1;
                double min = Double.PositiveInfinity;
                for (int i = 0; i < x.Length; i++)
                {
                    double cost = 0.0;
                    for (int j = 0; j < x.Length; j++)
                    {
                        cost += Distance.Distance(x[i], x[j]);
                    }

                    if (cost < min)
                    {
                        // Remember the best cost as well as its index. The
                        // running minimum was never updated before, so every
                        // finite cost passed the test above and the last
                        // point was chosen instead of the cheapest one.
                        min = cost;
                        imin = i;
                    }
                }

                // With a single cluster all labels keep their default of 0.
                Clusters.Centroids[0] = x[imin];
            }
            else
            {
                Compute(x, labels, currentMedoidIndicesArray);
            }

            // Miscellaneous final computations
            if (ComputeError)
            {
                // Compute the average error
#if DEBUG
                var expected = Clusters.Decide(x);
                if (!expected.IsEqual(labels))
                {
                    throw new Exception();
                }
#endif

                Error = Clusters.Distortion(x, labels);
            }

            Accord.Diagnostics.Debug.Assert(Clusters.NumberOfClasses == K);
            Accord.Diagnostics.Debug.Assert(Clusters.NumberOfOutputs == K);
            Accord.Diagnostics.Debug.Assert(Clusters.NumberOfInputs == x[0].Length);

            // Return the classification result
            return Clusters;
        }