/// <summary>
/// Discovers optimal clusters in a data set.
/// </summary>
/// <param name="data">
/// The matrix whose rows contain the observations for the
/// items under study.
/// </param>
/// <param name="maximumNumberOfParts">
/// The maximum number of parts allowed in the optimal
/// partition.
/// </param>
/// <remarks>
/// <para>
/// Method <see cref="Discover(DoubleMatrix, int)"/> partitions
/// a collection of items in no more than the specified
/// <paramref name="maximumNumberOfParts"/>,
/// given the specified <paramref name="data"/>, by minimizing the sum of
/// intra-cluster squared deviations.
/// </para>
/// <para>
/// The optimal partition is searched for by means of a default
/// Cross-Entropy context of
/// type <see cref="PartitionOptimizationContext"/>.
/// If different partitioning criteria need to be applied,
/// or extra control on the
/// parameters of the underlying algorithm is required,
/// a specialized <see cref="PartitionOptimizationContext"/>
/// can be instantiated and then exploited by executing
/// method <see cref="SystemPerformanceOptimizer.Optimize(
/// SystemPerformanceOptimizationContext, double, int)">Optimize</see>
/// on a <see cref="SystemPerformanceOptimizer"/> object.
/// See the documentation about <see cref="PartitionOptimizationContext"/>
/// for additional examples.
/// </para>
/// </remarks>
/// <example>
/// <para>
/// In the following example, a partition that optimally splits 12 items
/// is discovered given an artificial data set regarding the items
/// under study.
/// </para>
/// <para>
/// <code title="Optimal partitioning of a data set."
/// source="..\Novacta.Analytics.CodeExamples\ClustersDiscoverExample0.cs.txt"
/// language="cs" />
/// </para>
/// </example>
/// <returns>
/// A partition of the row indexes valid for <paramref name="data"/>.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="data"/> is <b>null</b>.
/// </exception>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="maximumNumberOfParts"/> is not greater than one.
/// </exception>
/// <exception cref="ArgumentException">
/// <paramref name="maximumNumberOfParts"/> is not less than
/// the number of rows in <paramref name="data"/>.
/// </exception>
/// <seealso cref="PartitionOptimizationContext"/>
/// <seealso cref="SystemPerformanceOptimizer"/>
public static IndexPartition<double> Discover(
    DoubleMatrix data,
    int maximumNumberOfParts)
{
    #region Input validation

    if (data is null)
    {
        throw new ArgumentNullException(nameof(data));
    }

    // At least two parts must be allowed.
    if (maximumNumberOfParts <= 1)
    {
        string rangeMessage = string.Format(
            CultureInfo.InvariantCulture,
            ImplementationServices.GetResourceString(
                "STR_EXCEPT_PAR_MUST_BE_GREATER_THAN_VALUE"),
            "1");

        throw new ArgumentOutOfRangeException(
            nameof(maximumNumberOfParts),
            rangeMessage);
    }

    // One item per row of data: the allowed number of parts must be
    // strictly less than the number of items.
    int numberOfItems = data.NumberOfRows;

    if (maximumNumberOfParts >= numberOfItems)
    {
        string sizeMessage = string.Format(
            CultureInfo.InvariantCulture,
            ImplementationServices.GetResourceString(
                "STR_EXCEPT_PAR_MUST_BE_LESS_THAN_OTHER_ROWS"),
            nameof(maximumNumberOfParts),
            nameof(data));

        throw new ArgumentException(
            sizeMessage,
            nameof(maximumNumberOfParts));
    }

    #endregion

    // Performance of a candidate state: for each part of the partition
    // created from the state, the column-wise sums of squared deviations
    // of the selected rows of data are added up; the grand total is
    // the value to be minimized.
    double SumOfSquaredDeviationsWithinParts(DoubleMatrix state)
    {
        var candidate = IndexPartition.Create(state);
        double total = 0.0;

        foreach (double identifier in candidate.Identifiers)
        {
            var partObservations = data[candidate[identifier], ":"];

            var deviationsByColumn = Stat.SumOfSquaredDeviations(
                partObservations,
                DataOperation.OnColumns);

            total += Stat.Sum(deviationsByColumn);
        }

        return total;
    }

    var optimizer = new SystemPerformanceOptimizer();

    var context = new PartitionOptimizationContext(
        objectiveFunction: SumOfSquaredDeviationsWithinParts,
        stateDimension: numberOfItems,
        partitionDimension: maximumNumberOfParts,
        probabilitySmoothingCoefficient: .8,
        optimizationGoal: OptimizationGoal.Minimization,
        minimumNumberOfIterations: 3,
        maximumNumberOfIterations: 1000);

    // Settings passed to the optimizer: the sample size grows linearly
    // with the maximum number of parts.
    double rarityLevel = .01;
    int drawsPerIteration = 500 * maximumNumberOfParts;

    var optimizationResults = optimizer.Optimize(
        context,
        rarityLevel,
        drawsPerIteration);

    return IndexPartition.Create(optimizationResults.OptimalState);
}