/// <summary>
/// Computes the Davies-Bouldin index to assess the quality of a given partition
/// of the specified data.
/// </summary>
/// <remarks>
/// <inheritdoc cref="MinimumCentroidLinkage"
/// path="para[@id='parameterDescription']"/>
/// <para>
/// The Davies-Bouldin index takes on small values for partitions
/// with high similarity among observations in each part and
/// low similarities among parts.
/// As a consequence, the best partition is considered the one with
/// the smallest Davies–Bouldin index.
/// </para>
/// <para>
/// This method applies Euclidean distances. The intra-cluster distance is
/// implemented as the centroid diameter, or the average distance
/// between the elements in the cluster and the cluster centroid.
/// The inter-cluster distance is implemented as the
/// centroid linkage, i.e. the distance between cluster centroids.
/// </para>
/// </remarks>
/// <param name="data">The data whose rows represent the available
/// observations.</param>
/// <param name="partition">The data partition to evaluate.</param>
/// <returns>The Davies-Bouldin index for the given data partition.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="data"/>.<br/>
/// -or-<br/>
/// <paramref name="partition"/> is <b>null</b>.
/// </exception>
/// <exception cref="ArgumentException">
/// A part in <paramref name="partition"/> contains a position
/// which is not valid as a row index of <paramref name="data"/>.
/// </exception>
/// <seealso href="http://en.wikipedia.org/wiki/Davies%E2%80%93Bouldin_index"/>
public static double DaviesBouldinIndex(
    DoubleMatrix data,
    IndexPartition<double> partition)
{
    if (data is null)
    {
        throw new ArgumentNullException(nameof(data));
    }

    if (partition is null)
    {
        throw new ArgumentNullException(nameof(partition));
    }

    int numberOfClusters = partition.Count;
    double[] categories = partition.Identifiers.ToArray();

    // Compute cluster centroids and, in each cluster, the average
    // distance between its observations and the corresponding centroid
    // (the centroid diameter, used as the intra-cluster distance S(i)).
    DoubleMatrix centroids = DoubleMatrix.Dense(
        numberOfClusters, data.NumberOfColumns);
    DoubleMatrix averageDistanceFromCentroid =
        DoubleMatrix.Dense(numberOfClusters, 1);

    int numberOfObservations = data.NumberOfRows;

    for (int i = 0; i < numberOfClusters; i++)
    {
        var currentPart = partition[categories[i]];

        if (currentPart.Max >= numberOfObservations)
        {
            throw new ArgumentException(
                ImplementationServices.GetResourceString(
                    "STR_EXCEPT_INP_PART_CONTAINS_INVALID_INDEX"),
                nameof(partition));
        }

        DoubleMatrix cluster = data[currentPart, ":"];
        centroids[i, ":"] = Stat.Mean(cluster, DataOperation.OnColumns);

        int size = cluster.NumberOfRows;
        double distance = 0.0;
        for (int r = 0; r < size; r++)
        {
            distance += Distance.Euclidean(cluster[r, ":"], centroids[i, ":"]);
        }

        averageDistanceFromCentroid[i] = distance / size;
    }

    // Inter-cluster distances: pairwise Euclidean distances
    // between cluster centroids (centroid linkage).
    DoubleMatrix centroidDistances = Distance.Euclidean(centroids);

    // DBI = (1/numberOfClusters) Σ_i max_{j≠i} { (S(i) + S(j)) / d(C(i), C(j)) }
    //
    // NOTE: the normalizing factor is the number of clusters,
    // not the number of observations.
    double daviesBouldinIndex = 0.0;
    double[] max = new double[numberOfClusters];
    double current;
    for (int i = 0; i < numberOfClusters; i++)
    {
        max[i] = Double.NegativeInfinity;
        for (int j = 0; j < numberOfClusters; j++)
        {
            if (j != i)
            {
                current =
                    (averageDistanceFromCentroid[i]
                        + averageDistanceFromCentroid[j])
                    / centroidDistances[i, j];
                if (max[i] < current)
                {
                    max[i] = current;
                }
            }
        }

        daviesBouldinIndex += max[i];
    }

    daviesBouldinIndex /= numberOfClusters;

    return daviesBouldinIndex;
}
/// <summary>
/// Computes the Dunn index to assess the quality of a given partition
/// of the specified data.
/// </summary>
/// <param name="data">The data whose rows represent the available
/// observations.</param>
/// <param name="partition">The data partition to evaluate.</param>
/// <remarks>
/// <inheritdoc cref="MinimumCentroidLinkage"
/// path="para[@id='parameterDescription']"/>
/// <para>
/// The Dunn index aims to identify dense and well-separated clusters.
/// It is defined as the ratio between the minimal inter-cluster distance to
/// maximal intra-cluster distance.
/// Since this criterion seeks clusters with high intra-cluster similarity
/// and low inter-cluster similarity, clusters with
/// high Dunn index are more desirable.
/// </para>
/// <para>
/// This method applies Euclidean distances. The intra-cluster distance is
/// implemented as the maximal distance between any pair of elements
/// in the cluster. The inter-cluster distance is implemented as the
/// single linkage, i.e. the shortest distance between pairs of
/// individuals belonging to different clusters.
/// </para>
/// </remarks>
/// <returns>The Dunn Index for the given data partition.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="data"/>.<br/>
/// -or-<br/>
/// <paramref name="partition"/> is <b>null</b>.
/// </exception>
/// <exception cref="ArgumentException">
/// A part in <paramref name="partition"/> contains a position
/// which is not valid as a row index of <paramref name="data"/>.
/// </exception>
/// <seealso href="http://en.wikipedia.org/wiki/Dunn_index"/>
public static double DunnIndex(
    DoubleMatrix data,
    IndexPartition<double> partition)
{
    if (data is null)
    {
        throw new ArgumentNullException(nameof(data));
    }

    if (partition is null)
    {
        throw new ArgumentNullException(nameof(partition));
    }

    int numberOfClusters = partition.Count;
    double[] categories = partition.Identifiers.ToArray();
    int numberOfObservations = data.NumberOfRows;

    // First pass: materialize each cluster, validate its row indexes,
    // and track the largest complete diameter (the maximal intra-cluster
    // distance over all clusters).
    DoubleMatrix[] clusters = new DoubleMatrix[numberOfClusters];
    double largestDiameter = Double.NegativeInfinity;

    for (int i = 0; i < numberOfClusters; i++)
    {
        var part = partition[categories[i]];

        if (part.Max >= numberOfObservations)
        {
            throw new ArgumentException(
                ImplementationServices.GetResourceString(
                    "STR_EXCEPT_INP_PART_CONTAINS_INVALID_INDEX"),
                nameof(partition));
        }

        clusters[i] = data[part, ":"];

        double clusterDiameter = Distance.CompleteDiameter(clusters[i]);
        if (largestDiameter < clusterDiameter)
        {
            largestDiameter = clusterDiameter;
        }
    }

    // Second pass: smallest single-linkage distance over all
    // unordered pairs of distinct clusters.
    double smallestLinkage = Double.PositiveInfinity;

    for (int i = 0; i < numberOfClusters; i++)
    {
        for (int j = i + 1; j < numberOfClusters; j++)
        {
            double linkage = Distance.SingleLinkage(clusters[i], clusters[j]);
            if (linkage < smallestLinkage)
            {
                smallestLinkage = linkage;
            }
        }
    }

    // DI = min over cluster pairs of { D(i,j) } / max over clusters of { D(k) }.
    return smallestLinkage / largestDiameter;
}
/// <summary>
/// Discovers optimal clusters in a data set.
/// </summary>
/// <param name="maximumNumberOfParts">
/// The maximum number of parts allowed in the optimal partition.
/// </param>
/// <param name="data">
/// The matrix whose rows contain the observations for the
/// items under study.
/// </param>
/// <remarks>
/// <para>
/// Method <see cref="Discover(DoubleMatrix, int)"/> partitions
/// a collection of items in no more than the specified
/// <paramref name="maximumNumberOfParts"/>,
/// given the specified <paramref name="data"/>, by minimizing the sum of
/// intra-cluster squared deviations.
/// </para>
/// <para>
/// This method uses a default Cross-Entropy context of
/// type <see cref="PartitionOptimizationContext"/> to identify the
/// optimal partition.
/// If different partitioning criteria need to be applied,
/// or extra control on the
/// parameters of the underlying algorithm is required,
/// a specialized <see cref="PartitionOptimizationContext"/> can be
/// instantiated and hence exploited executing
/// method <see cref="SystemPerformanceOptimizer.Optimize(
/// SystemPerformanceOptimizationContext, double, int)">Optimize</see>
/// on a <see cref="SystemPerformanceOptimizer"/> object.
/// See the documentation about <see cref="PartitionOptimizationContext"/>
/// for additional examples.
/// </para>
/// </remarks>
/// <example>
/// <para>
/// In the following example, a partition that optimally split 12 items
/// is discovered given
/// an artificial data set regarding the items under study.
/// </para>
/// <para>
/// <code title="Optimal partitioning of a data set."
/// source="..\Novacta.Analytics.CodeExamples\ClustersDiscoverExample0.cs.txt"
/// language="cs" />
/// </para>
/// </example>
/// <returns>
/// A partition of the row indexes valid for <paramref name="data"/>.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="data"/> is <b>null</b>.
/// </exception>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="maximumNumberOfParts"/> is not greater than one.
/// </exception>
/// <exception cref="ArgumentException">
/// <paramref name="maximumNumberOfParts"/> is not less than
/// the number of rows in <paramref name="data"/>.
/// </exception>
/// <seealso cref="PartitionOptimizationContext"/>
/// <seealso cref="SystemPerformanceOptimizer"/>
public static IndexPartition<double> Discover(
    DoubleMatrix data,
    int maximumNumberOfParts)
{
    #region Input validation

    if (data is null)
    {
        throw new ArgumentNullException(nameof(data));
    }

    if (maximumNumberOfParts < 2)
    {
        throw new ArgumentOutOfRangeException(
            nameof(maximumNumberOfParts),
            string.Format(
                CultureInfo.InvariantCulture,
                ImplementationServices.GetResourceString(
                    "STR_EXCEPT_PAR_MUST_BE_GREATER_THAN_VALUE"),
                "1"));
    }

    int stateDimension = data.NumberOfRows;

    if (stateDimension <= maximumNumberOfParts)
    {
        throw new ArgumentException(
            string.Format(
                CultureInfo.InvariantCulture,
                ImplementationServices.GetResourceString(
                    "STR_EXCEPT_PAR_MUST_BE_LESS_THAN_OTHER_ROWS"),
                nameof(maximumNumberOfParts),
                nameof(data)),
            nameof(maximumNumberOfParts));
    }

    #endregion

    // Objective: the sum, over the parts encoded by state x, of the
    // intra-cluster sums of squared deviations. Minimized below.
    double objectiveFunction(DoubleMatrix x)
    {
        double performance = 0.0;
        var partition = IndexPartition.Create(x);

        foreach (double category in partition.Identifiers)
        {
            performance += Stat.Sum(
                Stat.SumOfSquaredDeviations(
                    data[partition[category], ":"],
                    DataOperation.OnColumns));
        }

        return performance;
    }

    var optimizer = new SystemPerformanceOptimizer();

    var context = new PartitionOptimizationContext(
        objectiveFunction: objectiveFunction,
        stateDimension: stateDimension,
        partitionDimension: maximumNumberOfParts,
        probabilitySmoothingCoefficient: .8,
        optimizationGoal: OptimizationGoal.Minimization,
        minimumNumberOfIterations: 3,
        maximumNumberOfIterations: 1000);

    double rarity = .01;
    int sampleSize = 500 * maximumNumberOfParts;

    var results = optimizer.Optimize(
        context,
        rarity,
        sampleSize);

    return IndexPartition.Create(results.OptimalState);
}
/// <summary>
/// Computes the minimum centroid linkage among parts
/// in the given partition of the specified data.
/// </summary>
/// <param name="data">The data whose rows represent the available
/// observations.</param>
/// <param name="partition">The data partition to evaluate.</param>
/// <returns>Returns the minimum value of
/// <see cref="Distance.CentroidLinkage">CentroidLinkage</see>
/// over the pairs of clusters corresponding to parts
/// in <paramref name="partition" />.</returns>
/// <remarks>
/// <para id='parameterDescription'>
/// Each column of <paramref name="data"/> is associated to one of
/// the variables under study, while
/// its rows are associated to the individuals. The
/// <paramref name="partition"/> is intended to define parts which
/// contains row indexes valid for <paramref name="data"/>.
/// </para>
/// <para>
/// This method applies Euclidean distances.
/// </para>
/// </remarks>
/// <exception cref="ArgumentNullException">
/// <paramref name="data"/>.<br/>
/// -or-<br/>
/// <paramref name="partition"/> is <b>null</b>.
/// </exception>
/// <exception cref="ArgumentException">
/// A part in <paramref name="partition"/> contains a position
/// which is not valid as a row index of <paramref name="data"/>.
/// </exception>
public static double MinimumCentroidLinkage(
    DoubleMatrix data,
    IndexPartition<double> partition)
{
    if (data is null)
    {
        throw new ArgumentNullException(nameof(data));
    }

    if (partition is null)
    {
        throw new ArgumentNullException(nameof(partition));
    }

    double[] categories = partition.Identifiers.ToArray();
    int numberOfClusters = categories.Length;
    int numberOfObservations = data.NumberOfRows;

    double minimumLinkage = double.PositiveInfinity;

    // Scan every unordered pair of distinct parts, tracking the
    // smallest centroid linkage observed.
    //
    // Index validation is performed only while i == 0: during the first
    // outer iteration, part 0 is checked directly and every remaining
    // part is checked as a right operand, so all parts get validated
    // exactly once.
    for (int i = 0; i < numberOfClusters; i++)
    {
        var leftPart = partition[categories[i]];

        if (i == 0)
        {
            if (leftPart.Max >= numberOfObservations)
            {
                throw new ArgumentException(
                    ImplementationServices.GetResourceString(
                        "STR_EXCEPT_INP_PART_CONTAINS_INVALID_INDEX"),
                    nameof(partition));
            }
        }

        DoubleMatrix leftCluster = data[leftPart, ":"];

        for (int j = i + 1; j < numberOfClusters; j++)
        {
            var rightPart = partition[categories[j]];

            if (i == 0)
            {
                if (rightPart.Max >= numberOfObservations)
                {
                    throw new ArgumentException(
                        ImplementationServices.GetResourceString(
                            "STR_EXCEPT_INP_PART_CONTAINS_INVALID_INDEX"),
                        nameof(partition));
                }
            }

            DoubleMatrix rightCluster = data[rightPart, ":"];

            double linkage = Distance.CentroidLinkage(leftCluster, rightCluster);
            if (linkage < minimumLinkage)
            {
                minimumLinkage = linkage;
            }
        }
    }

    return minimumLinkage;
}
/// <summary>
/// Explains existing clusters by selecting
/// a number of features from the specified corresponding data set.
/// </summary>
/// <param name="data">
/// The matrix whose columns contain the features observed at the
/// items under study.
/// </param>
/// <param name="partition">
/// A partition of the row indexes valid for <paramref name="data"/>.
/// </param>
/// <param name="numberOfExplanatoryFeatures">
/// The number of features to be selected.
/// </param>
/// <remarks>
/// <para>
/// Method <see cref="Explain(
/// DoubleMatrix, IndexPartition{double}, int)"/>
/// selects the specified <paramref name="numberOfExplanatoryFeatures"/>
/// from the given
/// <paramref name="data"/>, by minimizing the Davies-Bouldin
/// Index corresponding to
/// the <paramref name="partition"/> of the items under study.
/// </para>
/// <para>
/// This method uses a default Cross-Entropy context of
/// type <see cref="CombinationOptimizationContext"/> to identify the
/// optimal features.
/// If different selection criteria need to be applied,
/// or extra control on the
/// parameters of the underlying algorithm is required,
/// a specialized <see cref="CombinationOptimizationContext"/> can be
/// instantiated and hence exploited executing
/// method <see cref="SystemPerformanceOptimizer.Optimize(
/// SystemPerformanceOptimizationContext, double, int)">Optimize</see>
/// on a <see cref="SystemPerformanceOptimizer"/> object.
/// See the documentation about <see cref="CombinationOptimizationContext"/>
/// for additional examples.
/// </para>
/// </remarks>
/// <example>
/// <para>
/// In the following example, an existing partition of 12 items is explained
/// by selecting 2 features out of the seven ones available in
/// an artificial data set regarding the items under study.
/// </para>
/// <para>
/// <code title="Selecting features from a data set to explain a given partition."
/// source="..\Novacta.Analytics.CodeExamples\ClustersExplainExample0.cs.txt"
/// language="cs" />
/// </para>
/// </example>
/// <returns>
/// The collection of column indexes, valid for <paramref name="data"/>, that
/// correspond to the features selected to explain the
/// given <paramref name="partition"/> of row indexes.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="data"/> is <b>null</b>.<br/>
/// -or-<br/>
/// <paramref name="partition"/> is <b>null</b>.
/// </exception>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="numberOfExplanatoryFeatures"/> is not positive.
/// </exception>
/// <exception cref="ArgumentException">
/// <paramref name="numberOfExplanatoryFeatures"/> is not less than
/// the number of columns in <paramref name="data"/>.<br/>
/// -or-<br/>
/// A part in <paramref name="partition"/> contains a position
/// which is not valid as a row index of <paramref name="data"/>.
/// </exception>
/// <seealso cref="IndexPartition.DaviesBouldinIndex(
/// DoubleMatrix, IndexPartition{double})"/>
/// <seealso cref="CombinationOptimizationContext"/>
/// <seealso cref="SystemPerformanceOptimizer"/>
public static IndexCollection Explain(
    DoubleMatrix data,
    IndexPartition<double> partition,
    int numberOfExplanatoryFeatures)
{
    #region Input validation

    if (data is null)
    {
        throw new ArgumentNullException(nameof(data));
    }

    if (numberOfExplanatoryFeatures < 1)
    {
        throw new ArgumentOutOfRangeException(
            nameof(numberOfExplanatoryFeatures),
            ImplementationServices.GetResourceString(
                "STR_EXCEPT_PAR_MUST_BE_POSITIVE"));
    }

    int stateDimension = data.NumberOfColumns;

    if (stateDimension <= numberOfExplanatoryFeatures)
    {
        throw new ArgumentException(
            string.Format(
                CultureInfo.InvariantCulture,
                ImplementationServices.GetResourceString(
                    "STR_EXCEPT_PAR_MUST_BE_LESS_THAN_OTHER_COLUMNS"),
                nameof(numberOfExplanatoryFeatures),
                nameof(data)),
            nameof(numberOfExplanatoryFeatures));
    }

    if (partition is null)
    {
        throw new ArgumentNullException(nameof(partition));
    }

    #endregion

    // Objective: the Davies-Bouldin index of the given partition,
    // restricted to the feature columns selected by state x.
    // Minimized below.
    double objectiveFunction(DoubleMatrix x)
    {
        IndexCollection selected = x.FindNonzero();

        double performance = IndexPartition.DaviesBouldinIndex(
            data: data[":", selected],
            partition: partition);

        return performance;
    }

    var optimizer = new SystemPerformanceOptimizer();

    var context = new CombinationOptimizationContext(
        objectiveFunction: objectiveFunction,
        stateDimension: stateDimension,
        combinationDimension: numberOfExplanatoryFeatures,
        probabilitySmoothingCoefficient: .8,
        optimizationGoal: OptimizationGoal.Minimization,
        minimumNumberOfIterations: 3,
        maximumNumberOfIterations: 1000);

    double rarity = .01;
    int sampleSize = 1000 * stateDimension;

    var results = optimizer.Optimize(
        context,
        rarity,
        sampleSize);

    var optimalState = results.OptimalState;

    return optimalState.FindNonzero();
}