/// <summary>
/// Finds the partition of a collection of items that best
/// clusters the observations in a data set.
/// </summary>
/// <param name="data">
/// The matrix whose rows store the observations for the
/// items under study.
/// </param>
/// <param name="maximumNumberOfParts">
/// The largest number of parts the optimal partition
/// is allowed to contain.
/// </param>
/// <remarks>
/// <para>
/// Method <see cref="Discover(DoubleMatrix, int)"/> splits
/// the items described by <paramref name="data"/> in no more than
/// <paramref name="maximumNumberOfParts"/> parts, selecting the
/// partition that minimizes the sum of
/// intra-cluster squared deviations.
/// </para>
/// <para>
/// The search relies on a default Cross-Entropy context of
/// type <see cref="PartitionOptimizationContext"/>, optimized
/// with a rarity of 0.01 and a sample size equal to 500 times
/// <paramref name="maximumNumberOfParts"/>.
/// When a different partitioning criterion is needed,
/// or the parameters of the underlying algorithm must be
/// controlled more closely,
/// a specialized <see cref="PartitionOptimizationContext"/> can be
/// instantiated and passed to
/// method <see cref="SystemPerformanceOptimizer.Optimize(SystemPerformanceOptimizationContext, double, int)">Optimize</see>
/// of a <see cref="SystemPerformanceOptimizer"/> object.
/// See the documentation about <see cref="PartitionOptimizationContext"/>
/// for additional examples.
/// </para>
/// </remarks>
/// <example>
/// <para>
/// In the following example, a partition that optimally splits 12 items
/// is discovered, given an artificial data set regarding
/// the items under study.
/// </para>
/// <para>
/// <code title="Optimal partitioning of a data set."
/// source="..\Novacta.Analytics.CodeExamples\ClustersDiscoverExample0.cs.txt"
/// language="cs" />
/// </para>
/// </example>
/// <returns>
/// A partition of the row indexes valid for <paramref name="data"/>.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="data"/> is <b>null</b>.
/// </exception>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="maximumNumberOfParts"/> is not greater than one.
/// </exception>
/// <exception cref="ArgumentException">
/// <paramref name="maximumNumberOfParts"/> is not less than
/// the number of rows in <paramref name="data"/>.
/// </exception>
/// <seealso cref="PartitionOptimizationContext"/>
/// <seealso cref="SystemPerformanceOptimizer"/>
public static IndexPartition<double> Discover(
    DoubleMatrix data,
    int maximumNumberOfParts)
{
    #region Input validation

    if (data is null)
    {
        throw new ArgumentNullException(nameof(data));
    }

    // At least two parts are required.
    if (maximumNumberOfParts <= 1)
    {
        string tooFewPartsMessage = string.Format(
            CultureInfo.InvariantCulture,
            ImplementationServices.GetResourceString(
                "STR_EXCEPT_PAR_MUST_BE_GREATER_THAN_VALUE"),
            "1");

        throw new ArgumentOutOfRangeException(
            nameof(maximumNumberOfParts),
            tooFewPartsMessage);
    }

    // Each row of data is an item to be assigned to a part, so the
    // number of rows is the dimension of the optimization states.
    int numberOfItems = data.NumberOfRows;

    // The allowed parts must be strictly fewer than the items.
    if (maximumNumberOfParts >= numberOfItems)
    {
        string tooManyPartsMessage = string.Format(
            CultureInfo.InvariantCulture,
            ImplementationServices.GetResourceString(
                "STR_EXCEPT_PAR_MUST_BE_LESS_THAN_OTHER_ROWS"),
            nameof(maximumNumberOfParts),
            nameof(data));

        throw new ArgumentException(
            tooManyPartsMessage,
            nameof(maximumNumberOfParts));
    }

    #endregion

    // Performance of a candidate state: for every part of the partition
    // the state represents, select the corresponding rows of data, compute
    // the column-wise sums of squared deviations, and add them all up.
    double SumOfIntraClusterSquaredDeviations(DoubleMatrix state)
    {
        var candidate = IndexPartition.Create(state);
        double total = 0.0;

        foreach (double identifier in candidate.Identifiers)
        {
            var partObservations = data[candidate[identifier], ":"];

            total += Stat.Sum(
                Stat.SumOfSquaredDeviations(
                    partObservations,
                    DataOperation.OnColumns));
        }

        return total;
    }

    var optimizer = new SystemPerformanceOptimizer();

    var context = new PartitionOptimizationContext(
        objectiveFunction: SumOfIntraClusterSquaredDeviations,
        stateDimension: numberOfItems,
        partitionDimension: maximumNumberOfParts,
        probabilitySmoothingCoefficient: .8,
        optimizationGoal: OptimizationGoal.Minimization,
        minimumNumberOfIterations: 3,
        maximumNumberOfIterations: 1000);

    // The sample size grows linearly with the number of allowed parts.
    var optimizationResults = optimizer.Optimize(
        context,
        .01,
        500 * maximumNumberOfParts);

    // The optimal state is converted back into a partition of row indexes.
    return IndexPartition.Create(optimizationResults.OptimalState);
}
/// <summary>
/// Builds an artificial data set of 12 items and 7 features, searches
/// for the partition in at most 3 parts that minimizes the Davies-Bouldin
/// Index by means of a Cross-Entropy optimizer, and writes the data set
/// and the optimization results to the console.
/// </summary>
public void Main()
{
    // Size of the artificial data set: items are rows,
    // features are columns.
    const int itemCount = 12;
    int featureCount = 7;

    // The matrix storing the observations which
    // partition discovery will be based on.
    var data = DoubleMatrix.Dense(
        numberOfRows: itemCount,
        numberOfColumns: featureCount);

    // Rows are filled in groups of four consecutive items
    // (0 to 3, 4 to 7, and 8 to 11). Each group is sampled from a
    // Gaussian distribution whose mean is 5 units greater than that
    // of the previous group: these groups are the three parts
    // expected to be included in the optimal partition.
    const int itemsPerGroup = 4;
    double mean = 1.0;
    var gaussian = new GaussianDistribution(mu: mean, sigma: .01);

    for (int firstItem = 0; firstItem < itemCount; firstItem += itemsPerGroup)
    {
        // The first group uses the initial mean; later ones shift it.
        if (firstItem > 0)
        {
            mean += 5.0;
            gaussian.Mu = mean;
        }

        IndexCollection groupRows = IndexCollection.Range(
            firstItem,
            firstItem + itemsPerGroup - 1);

        for (int feature = 0; feature < featureCount; feature++)
        {
            data[groupRows, feature] =
                gaussian.Sample(sampleSize: groupRows.Count);
        }
    }

    Console.WriteLine("The data set:");
    Console.WriteLine(data);

    // The optimization problem is the minimization of the
    // Davies-Bouldin Index of a candidate partition.
    // An argument x has 12 entries, each belonging
    // to the set {0,...,k-1}, where k is the
    // maximum number of allowed parts, so
    // x[j]==i signals that, at x, item j
    // has been assigned to part i.
    double DaviesBouldinIndexOf(DoubleMatrix x) =>
        IndexPartition.DaviesBouldinIndex(
            data: data,
            partition: IndexPartition.Create(x));

    // The maximum number of parts allowed in the
    // partition to be discovered.
    int maximumNumberOfParts = 3;

    // The context describing the optimization problem.
    var context = new PartitionOptimizationContext(
        objectiveFunction: DaviesBouldinIndexOf,
        stateDimension: itemCount,
        partitionDimension: maximumNumberOfParts,
        probabilitySmoothingCoefficient: .8,
        optimizationGoal: OptimizationGoal.Minimization,
        minimumNumberOfIterations: 3,
        maximumNumberOfIterations: 1000);

    // The optimizer, with the degree of parallelism of both
    // performance evaluation and sample generation set to -1.
    var optimizer = new SystemPerformanceOptimizer();
    optimizer.PerformanceEvaluationParallelOptions.MaxDegreeOfParallelism = -1;
    optimizer.SampleGenerationParallelOptions.MaxDegreeOfParallelism = -1;

    // Solve the problem with the chosen rarity and sample size.
    var results = optimizer.Optimize(
        context,
        0.01,
        2000);

    var optimalPartition = IndexPartition.Create(results.OptimalState);

    // Report the results.
    Console.WriteLine(
        "The Cross-Entropy optimizer has converged: {0}.",
        results.HasConverged);
    Console.WriteLine();

    Console.WriteLine("Initial guess parameter:");
    Console.WriteLine(context.InitialParameter);
    Console.WriteLine();

    Console.WriteLine("The minimizer of the performance is:");
    Console.WriteLine(results.OptimalState);
    Console.WriteLine();

    Console.WriteLine("The optimal partition is:");
    Console.WriteLine(optimalPartition);
    Console.WriteLine();

    Console.WriteLine("The minimum performance is:");
    Console.WriteLine(results.OptimalPerformance);
    Console.WriteLine();

    // The Dunn Index is computed only after its header has been written.
    Console.WriteLine("The Dunn Index for the optimal partition is:");
    Console.WriteLine(
        IndexPartition.DunnIndex(
            data,
            optimalPartition));
}