Ejemplo n.º 1
0
        /// <summary>
        /// Explains existing clusters by selecting
        /// a number of features from the specified corresponding data set.
        /// </summary>
        /// <param name="data">
        /// The matrix whose columns contain the features observed at the
        /// items under study.
        /// </param>
        /// <param name="partition">
        /// A partition of the row indexes valid for <paramref name="data"/>.
        /// </param>
        /// <param name="numberOfExplanatoryFeatures">
        /// The number of features to be selected.
        /// </param>
        /// <remarks>
        /// <para>
        /// Method <see cref="Explain(
        /// DoubleMatrix, IndexPartition{double}, int)"/>
        /// selects the specified <paramref name="numberOfExplanatoryFeatures"/>
        /// from the given
        /// <paramref name="data"/>, by minimizing the Davies-Bouldin
        /// Index corresponding to
        /// the <paramref name="partition"/> of the items under study.
        /// </para>
        /// <para>
        /// This method uses a default Cross-Entropy context of
        /// type <see cref="CombinationOptimizationContext"/> to identify the
        /// optimal features.
        /// If different selection criteria need to be applied,
        /// or extra control on the
        /// parameters of the underlying algorithm is required,
        /// a specialized <see cref="CombinationOptimizationContext"/> can be
        /// instantiated and hence exploited by executing
        /// method <see cref="SystemPerformanceOptimizer.Optimize(
        /// SystemPerformanceOptimizationContext, double, int)">Optimize</see>
        /// on a <see cref="SystemPerformanceOptimizer"/> object.
        /// See the documentation about <see cref="CombinationOptimizationContext"/>
        /// for additional examples.
        /// </para>
        /// </remarks>
        /// <example>
        /// <para>
        /// In the following example, an existing partition of 12 items is explained
        /// by selecting 2 features out of the seven ones available in
        /// an artificial data set regarding the items under study.
        /// </para>
        /// <para>
        /// <code title="Selecting features from a data set to explain a given partition."
        /// source="..\Novacta.Analytics.CodeExamples\ClustersExplainExample0.cs.txt"
        /// language="cs" />
        /// </para>
        /// </example>
        /// <returns>
        /// The collection of column indexes, valid for <paramref name="data"/>, that
        /// correspond to the features selected to explain the
        /// given <paramref name="partition"/> of row indexes.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="data"/> is <b>null</b>.<br/>
        /// -or-<br/>
        /// <paramref name="partition"/> is <b>null</b>.
        /// </exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="numberOfExplanatoryFeatures"/> is not positive.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="numberOfExplanatoryFeatures"/> is not less than
        /// the number of columns in <paramref name="data"/>.<br/>
        /// -or-<br/>
        /// A part in <paramref name="partition"/> contains a position
        /// which is not valid as a row index of <paramref name="data"/>.
        /// </exception>
        /// <seealso cref="IndexPartition.DaviesBouldinIndex(
        /// DoubleMatrix, IndexPartition{double})"/>
        /// <seealso cref="CombinationOptimizationContext"/>
        /// <seealso cref="SystemPerformanceOptimizer"/>
        public static IndexCollection Explain(
            DoubleMatrix data,
            IndexPartition<double> partition,
            int numberOfExplanatoryFeatures)
        {
            // Argument checks. Their relative order is kept on purpose: it
            // decides which exception surfaces when several arguments are
            // invalid at the same time.

            if (data is null)
            {
                throw new ArgumentNullException(nameof(data));
            }

            if (numberOfExplanatoryFeatures <= 0)
            {
                string notPositiveMessage =
                    ImplementationServices.GetResourceString(
                        "STR_EXCEPT_PAR_MUST_BE_POSITIVE");

                throw new ArgumentOutOfRangeException(
                    nameof(numberOfExplanatoryFeatures),
                    notPositiveMessage);
            }

            // Every column of the data matrix is a candidate feature.
            int candidateFeatureCount = data.NumberOfColumns;

            // The number of features to select must be strictly less than
            // the number of available columns.
            if (numberOfExplanatoryFeatures >= candidateFeatureCount)
            {
                string tooManyFeaturesMessage = string.Format(
                    CultureInfo.InvariantCulture,
                    ImplementationServices.GetResourceString(
                        "STR_EXCEPT_PAR_MUST_BE_LESS_THAN_OTHER_COLUMNS"),
                    nameof(numberOfExplanatoryFeatures),
                    nameof(data));

                throw new ArgumentException(
                    tooManyFeaturesMessage,
                    nameof(numberOfExplanatoryFeatures));
            }

            if (partition is null)
            {
                throw new ArgumentNullException(nameof(partition));
            }

            // Performance of a candidate state: the Davies-Bouldin index of
            // the partition, computed on the columns of the data matrix
            // located at the nonzero entries of the state.
            double daviesBouldinOf(DoubleMatrix state)
            {
                IndexCollection chosenColumns = state.FindNonzero();

                return IndexPartition.DaviesBouldinIndex(
                    data: data[":", chosenColumns],
                    partition: partition);
            }

            var optimizer = new SystemPerformanceOptimizer();

            // The context asks for the combination of columns that
            // minimizes the performance function defined above.
            var context = new CombinationOptimizationContext(
                objectiveFunction: daviesBouldinOf,
                stateDimension: candidateFeatureCount,
                combinationDimension: numberOfExplanatoryFeatures,
                probabilitySmoothingCoefficient: .8,
                optimizationGoal: OptimizationGoal.Minimization,
                minimumNumberOfIterations: 3,
                maximumNumberOfIterations: 1000);

            // Optimization parameters: the sample size grows linearly
            // with the number of candidate features.
            const double rarity = .01;
            int sampleSize = 1000 * candidateFeatureCount;

            var results = optimizer.Optimize(context, rarity, sampleSize);

            // The nonzero positions of the optimal state are the indexes
            // of the selected columns.
            return results.OptimalState.FindNonzero();
        }
        public void Main()
        {
            // Size of the artificial data set: items are stored as rows,
            // features as columns.
            const int numberOfItems = 12;
            int numberOfFeatures = 7;

            // The partition to be explained. Items are labeled so that
            // three parts (clusters) are obtained: items 0 to 3,
            // items 4 to 7, and items 8 to 11.
            var partition = IndexPartition.Create(
                new double[numberOfItems]
                {
                    0, 0, 0, 0,
                    1, 1, 1, 1,
                    2, 2, 2, 2
                });

            // Matrix of observations on which the explanation is based:
            // 12 items (rows) by 7 features (columns).
            var data = DoubleMatrix.Dense(
                numberOfRows: numberOfItems,
                numberOfColumns: numberOfFeatures);

            var gaussian = new GaussianDistribution(mu: 0, sigma: .01);

            // The leading 5 columns are filled with samples drawn from one
            // and the same distribution, so they are almost surely
            // non informative about the partition.
            const int numberOfNoiseFeatures = 5;

            for (int column = 0; column < numberOfNoiseFeatures; column++)
            {
                data[":", column] = gaussian.Sample(sampleSize: numberOfItems);
            }

            // The remaining columns (5 and 6) are informative: for each part,
            // and for each such column, the rows in the part are sampled
            // from a distribution having its own mean, which is increased
            // by 2 after every fill.
            var identifiers = partition.Identifiers;
            double currentMean = 1.0;

            for (int p = 0; p < identifiers.Count; p++)
            {
                var part = partition[identifiers[p]];
                int partSize = part.Count;

                for (int column = numberOfNoiseFeatures;
                     column < numberOfFeatures;
                     column++)
                {
                    gaussian.Mu = currentMean;
                    data[part, column] = gaussian.Sample(sampleSize: partSize);
                    currentMean += 2.0;
                }
            }

            Console.WriteLine("The data set:");
            Console.WriteLine(data);

            // The selection problem is stated as the maximization of
            // the Dunn Index. A state has nonzero entries at the positions
            // of the selected features, zero entries elsewhere.
            double dunnIndexOf(DoubleMatrix state)
            {
                IndexCollection chosenColumns = state.FindNonzero();

                return IndexPartition.DunnIndex(
                    data: data[":", chosenColumns],
                    partition: partition);
            }

            // How many features must be selected for explanation.
            int numberOfExplanatoryFeatures = 2;

            // The context describing the optimization problem.
            var context = new CombinationOptimizationContext(
                objectiveFunction: dunnIndexOf,
                stateDimension: numberOfFeatures,
                combinationDimension: numberOfExplanatoryFeatures,
                probabilitySmoothingCoefficient: .8,
                optimizationGoal: OptimizationGoal.Maximization,
                minimumNumberOfIterations: 3,
                maximumNumberOfIterations: 1000);

            // The optimizer, with both of its parallel options set to
            // a maximum degree of parallelism of -1.
            var optimizer = new SystemPerformanceOptimizer();
            optimizer.PerformanceEvaluationParallelOptions.MaxDegreeOfParallelism = -1;
            optimizer.SampleGenerationParallelOptions.MaxDegreeOfParallelism = -1;

            // Optimization parameters.
            const double rarity = 0.01;
            const int sampleSize = 1000;

            // Solve the problem.
            var results = optimizer.Optimize(context, rarity, sampleSize);

            IndexCollection selectedColumns =
                results.OptimalState.FindNonzero();

            // Report the outcome.
            Console.WriteLine(
                "The Cross-Entropy optimizer has converged: {0}.",
                results.HasConverged);

            Console.WriteLine();
            Console.WriteLine("Initial guess parameter:");
            Console.WriteLine(context.InitialParameter);

            Console.WriteLine();
            Console.WriteLine("The maximizer of the performance is:");
            Console.WriteLine(results.OptimalState);

            Console.WriteLine();
            Console.WriteLine(
                "The {0} features best explaining the given partition have column indexes:",
                numberOfExplanatoryFeatures);
            Console.WriteLine(selectedColumns);

            Console.WriteLine();
            Console.WriteLine("The maximum performance is:");
            Console.WriteLine(results.OptimalPerformance);

            // Recompute the Dunn Index directly on the selected columns.
            Console.WriteLine();
            Console.WriteLine("This is the Dunn Index for the selected features:");
            var dunnIndex = IndexPartition.DunnIndex(
                data[":", selectedColumns],
                partition);

            Console.WriteLine(dunnIndex);
        }