示例#1
0
        /// <summary>
        /// Discovers optimal clusters
        /// in a data set.
        /// </summary>
        /// <param name="maximumNumberOfParts">
        /// The maximum number of parts allowed in the optimal
        /// partition.
        /// </param>
        /// <param name="data">
        /// The matrix whose rows contain the observations for the
        /// items under study.
        /// </param>
        /// <remarks>
        /// <para>
        /// Method <see cref="Discover(DoubleMatrix, int)"/> partitions
        /// a collection of items in no more than the specified
        /// <paramref name="maximumNumberOfParts"/>,
        /// given the specified <paramref name="data"/>, by minimizing the sum of
        /// intra-cluster squared deviations.
        /// </para>
        /// <para>
        /// This method uses a default Cross-Entropy context of
        /// type <see cref="PartitionOptimizationContext"/> to identify the
        /// optimal partition.
        /// If different partitioning criteria need to be applied,
        /// or extra control on the
        /// parameters of the underlying algorithm is required,
        /// a specialized <see cref="PartitionOptimizationContext"/> can be
        /// instantiated and then exploited by executing
        /// method <see cref="SystemPerformanceOptimizer.Optimize(
        /// SystemPerformanceOptimizationContext, double, int)">Optimize</see>
        /// on a <see cref="SystemPerformanceOptimizer"/> object.
        /// See the documentation about <see cref="PartitionOptimizationContext"/>
        /// for additional examples.
        /// </para>
        /// </remarks>
        /// <example>
        /// <para>
        /// In the following example, a partition that optimally splits 12 items
        /// is discovered
        /// given
        /// an artificial data set regarding the items under study.
        /// </para>
        /// <para>
        /// <code title="Optimal partitioning of a data set."
        /// source="..\Novacta.Analytics.CodeExamples\ClustersDiscoverExample0.cs.txt"
        /// language="cs" />
        /// </para>
        /// </example>
        /// <returns>
        /// A partition of the row indexes valid for <paramref name="data"/>.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="data"/> is <b>null</b>.
        /// </exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="maximumNumberOfParts"/> is not greater than one.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="maximumNumberOfParts"/> is not less than
        /// the number of rows in <paramref name="data"/>.
        /// </exception>
        /// <seealso cref="PartitionOptimizationContext"/>
        /// <seealso cref="SystemPerformanceOptimizer"/>
        public static IndexPartition <double> Discover(
            DoubleMatrix data,
            int maximumNumberOfParts)
        {
            #region Input validation

            // The observations are mandatory.
            if (data is null)
            {
                throw new ArgumentNullException(nameof(data));
            }

            // At least two parts must be allowed.
            if (maximumNumberOfParts < 2)
            {
                string outOfRangeMessage = string.Format(
                    CultureInfo.InvariantCulture,
                    ImplementationServices.GetResourceString(
                        "STR_EXCEPT_PAR_MUST_BE_GREATER_THAN_VALUE"),
                    "1");

                throw new ArgumentOutOfRangeException(
                    nameof(maximumNumberOfParts),
                    outOfRangeMessage);
            }

            // Each row of data is an item to be assigned to a part.
            int numberOfItems = data.NumberOfRows;

            // The number of allowed parts must be less than
            // the number of items.
            if (maximumNumberOfParts >= numberOfItems)
            {
                string tooManyPartsMessage = string.Format(
                    CultureInfo.InvariantCulture,
                    ImplementationServices.GetResourceString(
                        "STR_EXCEPT_PAR_MUST_BE_LESS_THAN_OTHER_ROWS"),
                    nameof(maximumNumberOfParts),
                    nameof(data));

                throw new ArgumentException(
                    tooManyPartsMessage,
                    nameof(maximumNumberOfParts));
            }

            #endregion

            // Performance of a candidate state: for each part of the
            // partition created from the state, the squared deviations
            // are computed on the columns of the data rows in that part,
            // and their sums are accumulated over all parts.
            double intraClusterSquaredDeviations(DoubleMatrix state)
            {
                var    candidate = IndexPartition.Create(state);
                double total     = 0.0;

                foreach (double identifier in candidate.Identifiers)
                {
                    var rowsInPart = data[candidate[identifier], ":"];

                    var deviationsByColumn = Stat.SumOfSquaredDeviations(
                        rowsInPart,
                        DataOperation.OnColumns);

                    total += Stat.Sum(deviationsByColumn);
                }

                return total;
            }

            var optimizer = new SystemPerformanceOptimizer();

            // The context asks for the minimization of the
            // performance defined above.
            var context = new PartitionOptimizationContext(
                objectiveFunction: intraClusterSquaredDeviations,
                stateDimension: numberOfItems,
                partitionDimension: maximumNumberOfParts,
                probabilitySmoothingCoefficient: .8,
                optimizationGoal: OptimizationGoal.Minimization,
                minimumNumberOfIterations: 3,
                maximumNumberOfIterations: 1000);

            // Optimization parameters: the sample size grows
            // with the number of allowed parts.
            const double rarity     = .01;
            int          sampleSize = 500 * maximumNumberOfParts;

            var results = optimizer.Optimize(
                context,
                rarity,
                sampleSize);

            // The optimal state is returned as a partition.
            return IndexPartition.Create(results.OptimalState);
        }
        public void Main()
        {
            // Dimensions of the artificial data set:
            // one row per item, one column per feature.
            const int itemCount    = 12;
            int       featureCount = 7;

            // Matrix storing the observations which
            // partition discovery will be based on.
            var data = DoubleMatrix.Dense(
                numberOfRows: itemCount,
                numberOfColumns: featureCount);

            // Items 0 to 3, 4 to 7, and 8 to 11 are sampled from
            // Gaussian distributions having different means: these
            // groups are the three parts expected to be included
            // in the optimal partition.
            double mean     = 1.0;
            var    gaussian = new GaussianDistribution(mu: mean, sigma: .01);

            // Fills, feature by feature, the data rows whose indexes
            // go from firstItem to lastItem by sampling from the
            // Gaussian distribution in its current state.
            void fillItems(int firstItem, int lastItem)
            {
                IndexCollection items =
                    IndexCollection.Range(firstItem, lastItem);

                for (int feature = 0; feature < featureCount; feature++)
                {
                    data[items, feature] =
                        gaussian.Sample(sampleSize: items.Count);
                }
            }

            fillItems(0, 3);

            mean       += 5.0;
            gaussian.Mu = mean;
            fillItems(4, 7);

            mean       += 5.0;
            gaussian.Mu = mean;
            fillItems(8, 11);

            Console.WriteLine("The data set:");
            Console.WriteLine(data);

            // The performance to be minimized is the
            // Davies-Bouldin Index of a candidate partition.
            // A state x has 12 entries, each belonging
            // to the set {0,...,k-1}, where k is the
            // maximum number of allowed parts, so
            // x[j]==i signals that, at x, item j
            // has been assigned to part i.
            double daviesBouldinPerformance(DoubleMatrix x) =>
                IndexPartition.DaviesBouldinIndex(
                    data: data,
                    partition: IndexPartition.Create(x));

            // Maximum number of parts allowed in the
            // partition to be discovered.
            int maximumNumberOfParts = 3;

            // Context representing the optimization problem.
            var context = new PartitionOptimizationContext(
                objectiveFunction: daviesBouldinPerformance,
                stateDimension: itemCount,
                partitionDimension: maximumNumberOfParts,
                probabilitySmoothingCoefficient: .8,
                optimizationGoal: OptimizationGoal.Minimization,
                minimumNumberOfIterations: 3,
                maximumNumberOfIterations: 1000);

            // Optimizer with both parallel options set to -1
            // as their maximum degree of parallelism.
            var optimizer = new SystemPerformanceOptimizer();
            optimizer.PerformanceEvaluationParallelOptions
                .MaxDegreeOfParallelism = -1;
            optimizer.SampleGenerationParallelOptions
                .MaxDegreeOfParallelism = -1;

            // Optimization parameters.
            double rarity     = 0.01;
            int    sampleSize = 2000;

            // Run the optimization.
            var results = optimizer.Optimize(
                context,
                rarity,
                sampleSize);

            // Represent the optimal state as a partition.
            IndexPartition <double> bestPartition =
                IndexPartition.Create(results.OptimalState);

            // Report the outcome.
            Console.WriteLine(
                "The Cross-Entropy optimizer has converged: {0}.",
                results.HasConverged);

            Console.WriteLine();
            Console.WriteLine("Initial guess parameter:");
            Console.WriteLine(context.InitialParameter);

            Console.WriteLine();
            Console.WriteLine("The minimizer of the performance is:");
            Console.WriteLine(results.OptimalState);

            Console.WriteLine();
            Console.WriteLine("The optimal partition is:");
            Console.WriteLine(bestPartition);

            Console.WriteLine();
            Console.WriteLine("The minimum performance is:");
            Console.WriteLine(results.OptimalPerformance);

            // The Dunn Index of the optimal partition is also shown.
            Console.WriteLine();
            Console.WriteLine("The Dunn Index for the optimal partition is:");
            var dunnIndex = IndexPartition.DunnIndex(
                data,
                bestPartition);

            Console.WriteLine(dunnIndex);
        }