示例#1
0
        /// <summary>
        /// Discovers an optimal clustering of the items observed
        /// in a data set.
        /// </summary>
        /// <param name="data">
        /// The matrix whose rows contain the observations for the
        /// items under study.
        /// </param>
        /// <param name="maximumNumberOfParts">
        /// The maximum number of parts allowed in the optimal
        /// partition.
        /// </param>
        /// <returns>
        /// A partition of the row indexes valid for <paramref name="data"/>.
        /// </returns>
        /// <remarks>
        /// <para>
        /// Method <see cref="Discover(DoubleMatrix, int)"/> partitions
        /// a collection of items in no more than the specified
        /// <paramref name="maximumNumberOfParts"/>,
        /// given the specified <paramref name="data"/>, by minimizing the sum of
        /// intra-cluster squared deviations.
        /// </para>
        /// <para>
        /// The optimal partition is identified by means of a default
        /// Cross-Entropy context of
        /// type <see cref="PartitionOptimizationContext"/>.
        /// If different partitioning criteria need to be applied,
        /// or extra control on the
        /// parameters of the underlying algorithm is required,
        /// a specialized <see cref="PartitionOptimizationContext"/>
        /// can be instantiated and hence exploited by executing
        /// method <see cref="SystemPerformanceOptimizer.Optimize(
        /// SystemPerformanceOptimizationContext, double, int)">Optimize</see>
        /// on a <see cref="SystemPerformanceOptimizer"/> object.
        /// See the documentation about <see cref="PartitionOptimizationContext"/>
        /// for additional examples.
        /// </para>
        /// </remarks>
        /// <example>
        /// <para>
        /// In the following example, a partition that optimally splits 12 items
        /// is discovered given an artificial data set regarding
        /// the items under study.
        /// </para>
        /// <para>
        /// <code title="Optimal partitioning of a data set."
        /// source="..\Novacta.Analytics.CodeExamples\ClustersDiscoverExample0.cs.txt"
        /// language="cs" />
        /// </para>
        /// </example>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="data"/> is <b>null</b>.
        /// </exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="maximumNumberOfParts"/> is not greater than one.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="maximumNumberOfParts"/> is not less than
        /// the number of rows in <paramref name="data"/>.
        /// </exception>
        /// <seealso cref="PartitionOptimizationContext"/>
        /// <seealso cref="SystemPerformanceOptimizer"/>
        public static IndexPartition<double> Discover(
            DoubleMatrix data,
            int maximumNumberOfParts)
        {
            // Guard clauses: every argument is checked before any work is done.

            if (data is null)
            {
                throw new ArgumentNullException(nameof(data));
            }

            // At least two parts are required.
            if (maximumNumberOfParts <= 1)
            {
                string tooFewPartsMessage = string.Format(
                    CultureInfo.InvariantCulture,
                    ImplementationServices.GetResourceString(
                        "STR_EXCEPT_PAR_MUST_BE_GREATER_THAN_VALUE"),
                    "1");

                throw new ArgumentOutOfRangeException(
                    nameof(maximumNumberOfParts),
                    tooFewPartsMessage);
            }

            // Each row of data is an item to be assigned to a part.
            int numberOfItems = data.NumberOfRows;

            // The number of parts must be strictly less than the number of rows.
            if (maximumNumberOfParts >= numberOfItems)
            {
                string tooManyPartsMessage = string.Format(
                    CultureInfo.InvariantCulture,
                    ImplementationServices.GetResourceString(
                        "STR_EXCEPT_PAR_MUST_BE_LESS_THAN_OTHER_ROWS"),
                    nameof(maximumNumberOfParts),
                    nameof(data));

                throw new ArgumentException(
                    tooManyPartsMessage,
                    nameof(maximumNumberOfParts));
            }

            // Performance of a candidate state: the state is converted to a
            // partition and, for each of its parts, the column-wise sums of
            // squared deviations of the corresponding rows of data are
            // added to the running total.
            double SumOfSquaredDeviationsWithinParts(DoubleMatrix state)
            {
                var candidate = IndexPartition.Create(state);
                double total = 0.0;

                foreach (double identifier in candidate.Identifiers)
                {
                    var rowsInPart = data[candidate[identifier], ":"];

                    var deviationsByColumn = Stat.SumOfSquaredDeviations(
                        rowsInPart,
                        DataOperation.OnColumns);

                    total += Stat.Sum(deviationsByColumn);
                }

                return total;
            }

            var optimizer = new SystemPerformanceOptimizer();

            // Default context: the objective above is minimized over the
            // partitions of the data rows.
            var context = new PartitionOptimizationContext(
                objectiveFunction: SumOfSquaredDeviationsWithinParts,
                stateDimension: numberOfItems,
                partitionDimension: maximumNumberOfParts,
                probabilitySmoothingCoefficient: .8,
                optimizationGoal: OptimizationGoal.Minimization,
                minimumNumberOfIterations: 3,
                maximumNumberOfIterations: 1000);

            // The rarity is fixed at .01, while the sample size grows
            // linearly with the maximum number of parts.
            var results = optimizer.Optimize(
                context,
                .01,
                500 * maximumNumberOfParts);

            return IndexPartition.Create(results.OptimalState);
        }