Example No. 1
        /// <summary>
        /// Computes the Davies-Bouldin index to assess the quality of a given partition
        /// of the specified data.
        /// </summary>
        /// <remarks>
        /// <inheritdoc cref="MinimumCentroidLinkage"
        /// path="para[@id='parameterDescription']"/>
        /// <para>
        /// The Davies-Bouldin index takes on small values for partitions
        /// with high similarity among observations in each part and
        /// low similarity among parts.
        /// As a consequence, the best partition is considered the one with
        /// the smallest Davies–Bouldin index.
        /// </para>
        /// <para>
        /// This method applies Euclidean distances. The intra-cluster distance is
        /// implemented as the centroid diameter, or the average distance
        /// between the elements
        /// in the cluster and the cluster centroid.
        /// The inter-cluster distance is implemented as the
        /// centroid linkage, i.e. the distance between cluster
        /// centroids.
        /// </para>
        /// </remarks>
        /// <param name="data">The data whose rows represent the available
        /// observations.</param>
        /// <param name="partition">The data partition to evaluate.</param>
        /// <returns>The Davies-Bouldin index for the given data partition.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="data"/>.<br/>
        /// -or-<br/>
        /// <paramref name="partition"/> is <b>null</b>.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// A part in <paramref name="partition"/> contains a position
        /// which is not valid as a row index of <paramref name="data"/>.
        /// </exception>
        /// <seealso href="http://en.wikipedia.org/wiki/Davies%E2%80%93Bouldin_index"/>
        public static double DaviesBouldinIndex(DoubleMatrix data,
                                                IndexPartition<double> partition)
        {
            if (data is null)
            {
                throw new ArgumentNullException(nameof(data));
            }

            if (partition is null)
            {
                throw new ArgumentNullException(nameof(partition));
            }

            int numberOfClusters = partition.Count;

            double[] categories = partition.Identifiers.ToArray();
            double   distance;
            int      size;

            // Compute cluster centroids, their distance matrix, and,
            // in each cluster, the average distance to
            // the corresponding centroid.
            DoubleMatrix centroids = DoubleMatrix.Dense(
                numberOfClusters, data.NumberOfColumns);
            DoubleMatrix averageDistanceFromCentroid =
                DoubleMatrix.Dense(numberOfClusters, 1);

            int numberOfObservations = data.NumberOfRows;

            for (int i = 0; i < numberOfClusters; i++)
            {
                var currentPart = partition[categories[i]];
                if (currentPart.Max >= numberOfObservations)
                {
                    throw new ArgumentException(
                              ImplementationServices.GetResourceString(
                                  "STR_EXCEPT_INP_PART_CONTAINS_INVALID_INDEX"),
                              nameof(partition));
                }
                DoubleMatrix cluster = data[currentPart, ":"];
                centroids[i, ":"] = Stat.Mean(cluster, DataOperation.OnColumns);
                size     = cluster.NumberOfRows;
                distance = 0.0;

                for (int r = 0; r < size; r++)
                {
                    distance += Distance.Euclidean(cluster[r, ":"], centroids[i, ":"]);
                }
                averageDistanceFromCentroid[i] = distance / size;
            }

            DoubleMatrix centroidDistances = Distance.Euclidean(centroids);

            // DBI = (1 / numberOfClusters) Σ  max  { (S(i) + S(j)) / d(C(i), C(j)) },
            //                              i  j≠i
            // where S(i) is the average distance of cluster i elements from
            // its centroid, and d(C(i), C(j)) the distance between centroids.

            double daviesBouldinIndex = 0.0;

            double[] max = new double[numberOfClusters];
            double   current;

            for (int i = 0; i < numberOfClusters; i++)
            {
                max[i] = Double.NegativeInfinity;
                for (int j = 0; j < numberOfClusters; j++)
                {
                    if (j != i)
                    {
                        current = (averageDistanceFromCentroid[i] +
                                   averageDistanceFromCentroid[j])
                                  / centroidDistances[i, j];
                        if (max[i] < current)
                        {
                            max[i] = current;
                        }
                    }
                }
                daviesBouldinIndex += max[i];
            }

            daviesBouldinIndex /= numberOfClusters;

            return daviesBouldinIndex;
        }
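
A minimal usage sketch for the method above, with made-up data. It assumes the DoubleMatrix.Dense factory (column-major data) and the IndexPartition.Create overload that builds a partition from a matrix of part labels, as also used by the Discover method below; the values themselves are hypothetical.

        using System;
        using Novacta.Analytics;

        // Six observations of two variables: three near (1, 1),
        // three near (5, 5), stored column-major.
        var data = DoubleMatrix.Dense(6, 2, new double[] {
            1.0, 1.1, 0.9, 5.0, 5.2, 4.9,    // variable 0
            1.0, 0.9, 1.2, 5.0, 4.8, 5.1 }); // variable 1

        // One part label per row: rows {0, 1, 2} form one part,
        // rows {3, 4, 5} the other.
        var elements = DoubleMatrix.Dense(6, 1,
            new double[] { 0, 0, 0, 1, 1, 1 });
        var partition = IndexPartition.Create(elements);

        // Small values signal compact, well-separated parts.
        double dbi = IndexPartition.DaviesBouldinIndex(data, partition);
        Console.WriteLine($"Davies-Bouldin index: {dbi:F4}");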
Example No. 2
        /// <summary>
        /// Computes the Dunn index to assess the quality of a given partition
        /// of the specified data.
        /// </summary>
        /// <param name="data">The data whose rows represent the available
        /// observations.</param>
        /// <param name="partition">The data partition to evaluate.</param>
        /// <remarks>
        /// <inheritdoc cref="MinimumCentroidLinkage"
        /// path="para[@id='parameterDescription']"/>
        /// <para>
        /// The Dunn index aims to identify dense and well-separated clusters.
        /// It is defined as the ratio of the minimal inter-cluster distance
        /// to the maximal intra-cluster distance.
        /// Since this criterion seeks clusters with high intra-cluster similarity
        /// and low inter-cluster similarity, partitions with a
        /// high Dunn index are more desirable.
        /// </para>
        /// <para>
        /// This method applies Euclidean distances. The intra-cluster distance is
        /// implemented as the maximal distance between any pair of elements
        /// in the cluster. The inter-cluster distance is implemented as the
        /// single linkage, i.e. the shortest distance between pairs of
        /// individuals belonging to different clusters.
        /// </para>
        /// </remarks>
        /// <returns>The Dunn index for the given data partition.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="data"/>.<br/>
        /// -or-<br/>
        /// <paramref name="partition"/> is <b>null</b>.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// A part in <paramref name="partition"/> contains a position
        /// which is not valid as a row index of <paramref name="data"/>.
        /// </exception>
        /// <seealso href="http://en.wikipedia.org/wiki/Dunn_index"/>
        public static double DunnIndex(DoubleMatrix data, IndexPartition<double> partition)
        {
            if (data is null)
            {
                throw new ArgumentNullException(nameof(data));
            }

            if (partition is null)
            {
                throw new ArgumentNullException(nameof(partition));
            }

            int numberOfClusters = partition.Count;

            double[] categories = partition.Identifiers.ToArray();

            DoubleMatrix[] clusters = new DoubleMatrix[numberOfClusters];
            double         diameter, maxDiameter;

            int numberOfObservations = data.NumberOfRows;

            // Compute clusters, cluster diameters and their maximum
            maxDiameter = Double.NegativeInfinity;
            for (int i = 0; i < numberOfClusters; i++)
            {
                var currentPart = partition[categories[i]];
                if (currentPart.Max >= numberOfObservations)
                {
                    throw new ArgumentException(
                              ImplementationServices.GetResourceString(
                                  "STR_EXCEPT_INP_PART_CONTAINS_INVALID_INDEX"),
                              nameof(partition));
                }
                clusters[i] = data[currentPart, ":"];
                diameter    = Distance.CompleteDiameter(clusters[i]);
                if (maxDiameter < diameter)
                {
                    maxDiameter = diameter;
                }
            }

            double min = Double.PositiveInfinity;
            double current;

            for (int i = 0; i < numberOfClusters; i++)
            {
                for (int j = i + 1; j < numberOfClusters; j++)
                {
                    current = Distance.SingleLinkage(clusters[i], clusters[j]);
                    if (current < min)
                    {
                        min = current;
                    }
                }
            }

            // DI = min  min  { D(i, j) / max { D(k) } },
            //       i   j≠i               k
            // where D(i, j) is the single linkage between clusters i and j,
            // and D(k) is the diameter of cluster k.

            double dunnIndex = min / maxDiameter;

            return dunnIndex;
        }
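
A matching usage sketch, assuming DunnIndex is exposed on the same IndexPartition class as DaviesBouldinIndex above; data and part labels are hypothetical.

        using System;
        using Novacta.Analytics;

        // Same toy data as in the previous sketch: two tight groups.
        var data = DoubleMatrix.Dense(6, 2, new double[] {
            1.0, 1.1, 0.9, 5.0, 5.2, 4.9,
            1.0, 0.9, 1.2, 5.0, 4.8, 5.1 });
        var partition = IndexPartition.Create(
            DoubleMatrix.Dense(6, 1, new double[] { 0, 0, 0, 1, 1, 1 }));

        // Large values signal dense, well-separated parts.
        double di = IndexPartition.DunnIndex(data, partition);
        Console.WriteLine($"Dunn index: {di:F4}");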
Example No. 3
        /// <summary>
        /// Discovers optimal clusters
        /// in a data set.
        /// </summary>
        /// <param name="maximumNumberOfParts">
        /// The maximum number of parts allowed in the optimal
        /// partition.
        /// </param>
        /// <param name="data">
        /// The matrix whose rows contain the observations for the
        /// items under study.
        /// </param>
        /// <remarks>
        /// <para>
        /// Method <see cref="Discover(DoubleMatrix, int)"/> partitions
        /// a collection of items into no more than the specified
        /// <paramref name="maximumNumberOfParts"/> parts,
        /// given the specified <paramref name="data"/>, by minimizing the sum of
        /// intra-cluster squared deviations.
        /// </para>
        /// <para>
        /// This method uses a default Cross-Entropy context of
        /// type <see cref="PartitionOptimizationContext"/> to identify the
        /// optimal partition.
        /// If different partitioning criteria need to be applied,
        /// or extra control on the
        /// parameters of the underlying algorithm is required,
        /// a specialized <see cref="PartitionOptimizationContext"/> can be
        /// can be instantiated and hence exploited executing
        /// method <see cref="SystemPerformanceOptimizer.Optimize(
        /// SystemPerformanceOptimizationContext, double, int)">Optimize</see>
        /// on a <see cref="SystemPerformanceOptimizer"/> object.
        /// See the documentation about <see cref="PartitionOptimizationContext"/>
        /// for additional examples.
        /// </para>
        /// </remarks>
        /// <example>
        /// <para>
        /// In the following example, a partition that optimally splits
        /// 12 items is discovered, given an artificial data set
        /// regarding the items under study.
        /// </para>
        /// <para>
        /// <code title="Optimal partitioning of a data set."
        /// source="..\Novacta.Analytics.CodeExamples\ClustersDiscoverExample0.cs.txt"
        /// language="cs" />
        /// </para>
        /// </example>
        /// <returns>
        /// A partition of the row indexes valid for <paramref name="data"/>.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="data"/> is <b>null</b>.
        /// </exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="maximumNumberOfParts"/> is not greater than one.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="maximumNumberOfParts"/> is not less than
        /// the number of rows in <paramref name="data"/>.
        /// </exception>
        /// <seealso cref="PartitionOptimizationContext"/>
        /// <seealso cref="SystemPerformanceOptimizer"/>
        public static IndexPartition<double> Discover(
            DoubleMatrix data,
            int maximumNumberOfParts)
        {
            #region Input validation

            if (data is null)
            {
                throw new ArgumentNullException(nameof(data));
            }

            if (maximumNumberOfParts < 2)
            {
                throw new ArgumentOutOfRangeException(
                          nameof(maximumNumberOfParts),
                          string.Format(
                              CultureInfo.InvariantCulture,
                              ImplementationServices.GetResourceString(
                                  "STR_EXCEPT_PAR_MUST_BE_GREATER_THAN_VALUE"),
                              "1"));
            }

            int stateDimension = data.NumberOfRows;

            if (stateDimension <= maximumNumberOfParts)
            {
                throw new ArgumentException(
                          string.Format(
                              CultureInfo.InvariantCulture,
                              ImplementationServices.GetResourceString(
                                  "STR_EXCEPT_PAR_MUST_BE_LESS_THAN_OTHER_ROWS"),
                              nameof(maximumNumberOfParts),
                              nameof(data)),
                          nameof(maximumNumberOfParts)
                          );
            }

            #endregion

            double objectiveFunction(DoubleMatrix x)
            {
                double performance = 0.0;
                var    partition   = IndexPartition.Create(x);

                foreach (double category in partition.Identifiers)
                {
                    performance += Stat.Sum(
                        Stat.SumOfSquaredDeviations(
                            data[partition[category], ":"],
                            DataOperation.OnColumns));
                }

                return performance;
            }

            var optimizer =
                new SystemPerformanceOptimizer();

            var context = new PartitionOptimizationContext(
                objectiveFunction: objectiveFunction,
                stateDimension: stateDimension,
                partitionDimension: maximumNumberOfParts,
                probabilitySmoothingCoefficient: .8,
                optimizationGoal: OptimizationGoal.Minimization,
                minimumNumberOfIterations: 3,
                maximumNumberOfIterations: 1000);

            double rarity = .01;

            int sampleSize = 500 * maximumNumberOfParts;

            var results = optimizer.Optimize(
                context,
                rarity,
                sampleSize);

            return IndexPartition.Create(results.OptimalState);
        }
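
A usage sketch for Discover, with hypothetical data. The containing class is assumed to be named Clusters, as the example file name ClustersDiscoverExample0.cs suggests.

        using System;
        using Novacta.Analytics;

        // Ten observations of two variables, forming two loose groups.
        var data = DoubleMatrix.Dense(10, 2, new double[] {
            1.0, 1.2, 0.8, 1.1, 0.9, 6.0, 6.2, 5.8, 6.1, 5.9,
            1.0, 0.9, 1.1, 1.2, 0.8, 6.0, 5.9, 6.1, 6.2, 5.8 });

        // Allow at most 3 parts; the optimizer may settle on fewer.
        // Note that the Cross-Entropy search is stochastic, so repeated
        // runs can return different partitions.
        IndexPartition<double> optimalPartition =
            Clusters.Discover(data, maximumNumberOfParts: 3);

        foreach (double id in optimalPartition.Identifiers)
        {
            Console.WriteLine($"Part {id}: {optimalPartition[id]}");
        }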
Example No. 4
        /// <summary>
        /// Computes the minimum centroid linkage among parts
        /// in the given partition of the specified data.
        /// </summary>
        /// <param name="data">The data whose rows represent the available
        /// observations.</param>
        /// <param name="partition">The data partition to evaluate.</param>
        /// <returns>The minimum value of
        /// <see cref="Distance.CentroidLinkage">CentroidLinkage</see>
        /// over the pairs of clusters corresponding to parts
        /// in <paramref name="partition" />.</returns>
        /// <remarks>
        /// <para id='parameterDescription'>
        /// Each column of <paramref name="data"/> is associated to one of
        /// the variables under study, while
        /// its rows are associated to the individuals. The
        /// <paramref name="partition"/> is intended to define parts which
        /// contains row indexes valid for <paramref name="data"/>.
        /// </para>
        /// <para>
        /// This method applies Euclidean distances.
        /// </para>
        /// </remarks>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="data"/>.<br/>
        /// -or-<br/>
        /// <paramref name="partition"/> is <b>null</b>.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// A part in <paramref name="partition"/> contains a position
        /// which is not valid as a row index of <paramref name="data"/>.
        /// </exception>
        public static double MinimumCentroidLinkage(DoubleMatrix data,
                                                    IndexPartition<double> partition)
        {
            if (data is null)
            {
                throw new ArgumentNullException(nameof(data));
            }

            if (partition is null)
            {
                throw new ArgumentNullException(nameof(partition));
            }

            double minimumMeanLinkage, currentMeanLinkage;

            DoubleMatrix left, right;

            minimumMeanLinkage = double.PositiveInfinity;
            double[] categories = partition.Identifiers.ToArray();

            int numberOfClusters     = categories.Length;
            int numberOfObservations = data.NumberOfRows;

            for (int i = 0; i < numberOfClusters; i++)
            {
                var currentLeftPart = partition[categories[i]];
                if (i == 0)
                {
                    if (currentLeftPart.Max >= numberOfObservations)
                    {
                        throw new ArgumentException(
                                  ImplementationServices.GetResourceString(
                                      "STR_EXCEPT_INP_PART_CONTAINS_INVALID_INDEX"),
                                  nameof(partition));
                    }
                }
                left = data[currentLeftPart, ":"];
                for (int j = i + 1; j < numberOfClusters; j++)
                {
                    var currentRightPart = partition[categories[j]];
                    if (i == 0)
                    {
                        if (currentRightPart.Max >= numberOfObservations)
                        {
                            throw new ArgumentException(
                                      ImplementationServices.GetResourceString(
                                          "STR_EXCEPT_INP_PART_CONTAINS_INVALID_INDEX"),
                                      nameof(partition));
                        }
                    }
                    right = data[currentRightPart, ":"];
                    currentMeanLinkage = Distance.CentroidLinkage(left, right);
                    if (currentMeanLinkage < minimumMeanLinkage)
                    {
                        minimumMeanLinkage = currentMeanLinkage;
                    }
                }
            }

            return minimumMeanLinkage;
        }
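
A usage sketch with hypothetical data; MinimumCentroidLinkage is assumed to live on the same IndexPartition class as the other validation indexes above.

        using System;
        using Novacta.Analytics;

        var data = DoubleMatrix.Dense(6, 2, new double[] {
            1.0, 1.1, 0.9, 5.0, 5.2, 4.9,
            1.0, 0.9, 1.2, 5.0, 4.8, 5.1 });
        var partition = IndexPartition.Create(
            DoubleMatrix.Dense(6, 1, new double[] { 0, 0, 0, 1, 1, 1 }));

        // The smallest Euclidean distance between the centroids of any
        // two distinct parts.
        double linkage =
            IndexPartition.MinimumCentroidLinkage(data, partition);
        Console.WriteLine($"Minimum centroid linkage: {linkage:F4}");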
Example No. 5
        /// <summary>
        /// Explains existing clusters by selecting
        /// a number of features from the specified corresponding data set.
        /// </summary>
        /// <param name="data">
        /// The matrix whose columns contain the features observed at the
        /// items under study.
        /// </param>
        /// <param name="partition">
        /// A partition of the row indexes valid for <paramref name="data"/>.
        /// </param>
        /// <param name="numberOfExplanatoryFeatures">
        /// The number of features to be selected.
        /// </param>
        /// <remarks>
        /// <para>
        /// Method <see cref="Explain(
        /// DoubleMatrix, IndexPartition{double}, int)"/>
        /// selects the specified <paramref name="numberOfExplanatoryFeatures"/>
        /// from the given
        /// <paramref name="data"/>, by minimizing the Davies-Bouldin
        /// Index corresponding to
        /// the <paramref name="partition"/> of the items under study.
        /// </para>
        /// <para>
        /// This method uses a default Cross-Entropy context of
        /// type <see cref="CombinationOptimizationContext"/> to identify the
        /// optimal features.
        /// If different selection criteria need to be applied,
        /// or extra control on the
        /// parameters of the underlying algorithm is required,
        /// a specialized <see cref="CombinationOptimizationContext"/> can be
        /// can be instantiated and hence exploited executing
        /// method <see cref="SystemPerformanceOptimizer.Optimize(
        /// SystemPerformanceOptimizationContext, double, int)">Optimize</see>
        /// on a <see cref="SystemPerformanceOptimizer"/> object.
        /// See the documentation about <see cref="CombinationOptimizationContext"/>
        /// for additional examples.
        /// </para>
        /// </remarks>
        /// <example>
        /// <para>
        /// In the following example, an existing partition of 12 items is explained
        /// by selecting 2 features out of the seven available in
        /// an artificial data set regarding the items under study.
        /// </para>
        /// <para>
        /// <code title="Selecting features from a data set to explain a given partition."
        /// source="..\Novacta.Analytics.CodeExamples\ClustersExplainExample0.cs.txt"
        /// language="cs" />
        /// </para>
        /// </example>
        /// <returns>
        /// The collection of column indexes, valid for <paramref name="data"/>, that
        /// correspond to the features selected to explain the
        /// given <paramref name="partition"/> of row indexes.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="data"/> is <b>null</b>.<br/>
        /// -or-<br/>
        /// <paramref name="partition"/> is <b>null</b>.
        /// </exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="numberOfExplanatoryFeatures"/> is not positive.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="numberOfExplanatoryFeatures"/> is not less than
        /// the number of columns in <paramref name="data"/>.<br/>
        /// -or-<br/>
        /// A part in <paramref name="partition"/> contains a position
        /// which is not valid as a row index of <paramref name="data"/>.
        /// </exception>
        /// <seealso cref="IndexPartition.DaviesBouldinIndex(
        /// DoubleMatrix, IndexPartition{double})"/>
        /// <seealso cref="CombinationOptimizationContext"/>
        /// <seealso cref="SystemPerformanceOptimizer"/>
        public static IndexCollection Explain(
            DoubleMatrix data,
            IndexPartition<double> partition,
            int numberOfExplanatoryFeatures)
        {
            #region Input validation

            if (data is null)
            {
                throw new ArgumentNullException(nameof(data));
            }

            if (numberOfExplanatoryFeatures < 1)
            {
                throw new ArgumentOutOfRangeException(
                          nameof(numberOfExplanatoryFeatures),
                          ImplementationServices.GetResourceString(
                              "STR_EXCEPT_PAR_MUST_BE_POSITIVE"));
            }

            int stateDimension = data.NumberOfColumns;

            if (stateDimension <= numberOfExplanatoryFeatures)
            {
                throw new ArgumentException(
                          string.Format(
                              CultureInfo.InvariantCulture,
                              ImplementationServices.GetResourceString(
                                  "STR_EXCEPT_PAR_MUST_BE_LESS_THAN_OTHER_COLUMNS"),
                              nameof(numberOfExplanatoryFeatures),
                              nameof(data)),
                          nameof(numberOfExplanatoryFeatures)
                          );
            }

            if (partition is null)
            {
                throw new ArgumentNullException(nameof(partition));
            }

            #endregion

            double objectiveFunction(DoubleMatrix x)
            {
                IndexCollection selected = x.FindNonzero();

                double performance =
                    IndexPartition.DaviesBouldinIndex(
                        data: data[":", selected],
                        partition: partition);

                return performance;
            }

            var optimizer =
                new SystemPerformanceOptimizer();

            var context = new CombinationOptimizationContext(
                objectiveFunction: objectiveFunction,
                stateDimension: stateDimension,
                combinationDimension: numberOfExplanatoryFeatures,
                probabilitySmoothingCoefficient: .8,
                optimizationGoal: OptimizationGoal.Minimization,
                minimumNumberOfIterations: 3,
                maximumNumberOfIterations: 1000);

            double rarity = .01;

            int sampleSize = 1000 * stateDimension;

            var results = optimizer.Optimize(
                context,
                rarity,
                sampleSize);

            var optimalState = results.OptimalState;

            return optimalState.FindNonzero();
        }
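
A usage sketch for Explain, with hypothetical data. The containing class is assumed to be named Clusters, as the example file name ClustersExplainExample0.cs suggests.

        using System;
        using Novacta.Analytics;

        // Six observations of three features; only the first two
        // actually separate the parts, the third is noise.
        var data = DoubleMatrix.Dense(6, 3, new double[] {
            1.0, 1.1, 0.9, 5.0, 5.2, 4.9,    // feature 0: separating
            1.0, 0.9, 1.2, 5.0, 4.8, 5.1,    // feature 1: separating
            3.0, 3.1, 2.9, 3.0, 3.2, 2.8 }); // feature 2: noise

        var partition = IndexPartition.Create(
            DoubleMatrix.Dense(6, 1, new double[] { 0, 0, 0, 1, 1, 1 }));

        // Select the 2 features whose restriction of the data minimizes
        // the Davies-Bouldin index of the given partition.
        IndexCollection selected = Clusters.Explain(
            data: data,
            partition: partition,
            numberOfExplanatoryFeatures: 2);

        Console.WriteLine($"Selected feature indexes: {selected}");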