Example 1
    /// <summary>
    /// Computes all per-label AUCs as well as the micro- and macro-averaged AUCs.
    /// </summary>
    /// <param name="confusionMatrix">The confusion matrix.</param>
    /// <param name="evaluator">The classifier evaluator.</param>
    /// <param name="x">The x vector of the ground truth.</param>
    /// <param name="y">The y of the ground truth.</param>
    /// <param name="yPredicDistrib">The predictive distributions.</param>
    /// <param name="microAuc">The micro-averaged area under the receiver operating characteristic curve.</param>
    /// <param name="macroAuc">The macro-averaged area under the receiver operating characteristic curve.</param>
    /// <param name="macroAucClassLabelCount">The number of class labels for which the AUC if defined.</param>
    /// <returns>The area under the receiver operating characteristic curve for each class label.</returns>
    /// <remarks>Adapted from MicrosoftResearch.Infer.Learners</remarks>
    private IDictionary<string, double> ComputeLabelAuc(
        ConfusionMatrix<string> confusionMatrix,
        ClassifierEvaluator<IList<Vector>, int, IList<string>, string> evaluator,
        Vector[] x,
        IList<string> y,
        IEnumerable<IDictionary<string, double>> yPredicDistrib,
        out double microAuc,
        out double macroAuc,
        out int macroAucClassLabelCount)
    {
        int instanceCount   = yPredicDistrib.Count();
        var classLabels     = confusionMatrix.ClassLabelSet.Elements.ToArray();
        int classLabelCount = classLabels.Length;
        var labelAuc        = new Dictionary<string, double>();

        // Compute per-label AUC
        macroAucClassLabelCount = classLabelCount;
        foreach (var classLabel in classLabels)
        {
            // One versus rest
            double auc;
            try
            {
                auc = evaluator.AreaUnderRocCurve(classLabel, x, y, yPredicDistrib);
            }
            catch (ArgumentException)
            {
                auc = double.NaN;
                macroAucClassLabelCount--;
            }

            labelAuc.Add(classLabel, auc);
        }

        // Compute micro- and macro-averaged AUC
        microAuc = 0;
        macroAuc = 0;
        foreach (var label in classLabels)
        {
            if (double.IsNaN(labelAuc[label]))
            {
                continue;
            }

            microAuc += confusionMatrix.TrueLabelCount(label) * labelAuc[label] / instanceCount;
            macroAuc += labelAuc[label] / macroAucClassLabelCount;
        }

        return labelAuc;
    }
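Note on the averaging in ComputeLabelAuc: the micro-average weights each label's AUC by its true-label count (TrueLabelCount(label) / instanceCount), while the macro-average weights every defined label equally; labels whose AUC came out as NaN are skipped by both. A minimal calling sketch, assuming the confusion matrix, evaluator, features, labels, and predictive distributions were produced beforehand (all variable names here are hypothetical):

    double microAuc, macroAuc;
    int definedLabelCount;
    IDictionary<string, double> perLabelAuc = this.ComputeLabelAuc(
        confusionMatrix, evaluator, x, y, yPredicDistrib,
        out microAuc, out macroAuc, out definedLabelCount);

    // NaN entries mark labels whose AUC was undefined (for example, a label
    // that never occurs in the ground truth); the averages above skip them.
    foreach (var pair in perLabelAuc)
    {
        Console.WriteLine("{0}: AUC = {1:0.0000}", pair.Key, pair.Value);
    }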
Example 2
        /// <summary>
        /// Computes all per-label AUCs as well as the micro- and macro-averaged AUCs.
        /// </summary>
        /// <param name="confusionMatrix">The confusion matrix.</param>
        /// <param name="evaluator">The classifier evaluator.</param>
        /// <param name="groundTruth">The ground truth.</param>
        /// <param name="predictiveDistributions">The predictive distributions.</param>
        /// <param name="microAuc">The micro-averaged area under the receiver operating characteristic curve.</param>
        /// <param name="macroAuc">The macro-averaged area under the receiver operating characteristic curve.</param>
        /// <param name="macroAucClassLabelCount">The number of class labels for which the AUC if defined.</param>
        /// <returns>The area under the receiver operating characteristic curve for each class label.</returns>
        private IDictionary<string, double> ComputeLabelAuc(
            ConfusionMatrix<string> confusionMatrix,
            ClassifierEvaluator<IList<LabeledFeatureValues>, LabeledFeatureValues, IList<LabelDistribution>, string> evaluator,
            IList<LabeledFeatureValues> groundTruth,
            ICollection<IDictionary<string, double>> predictiveDistributions,
            out double microAuc,
            out double macroAuc,
            out int macroAucClassLabelCount)
        {
            int instanceCount   = predictiveDistributions.Count;
            var classLabels     = confusionMatrix.ClassLabelSet.Elements.ToArray();
            int classLabelCount = classLabels.Length;
            var labelAuc        = new Dictionary<string, double>();

            // Compute per-label AUC
            macroAucClassLabelCount = classLabelCount;
            foreach (var classLabel in classLabels)
            {
                // One versus rest
                double auc;
                try
                {
                    auc = evaluator.AreaUnderRocCurve(classLabel, groundTruth, predictiveDistributions);
                }
                catch (ArgumentException)
                {
                    auc = double.NaN;
                    macroAucClassLabelCount--;
                }

                labelAuc.Add(classLabel, auc);
            }

            // Compute micro- and macro-averaged AUC
            microAuc = 0;
            macroAuc = 0;
            foreach (var label in classLabels)
            {
                if (double.IsNaN(labelAuc[label]))
                {
                    continue;
                }

                microAuc += confusionMatrix.TrueLabelCount(label) * labelAuc[label] / instanceCount;
                macroAuc += labelAuc[label] / macroAucClassLabelCount;
            }

            return labelAuc;
        }
Example 3
    /// <summary>
    /// Writes the evaluation results using the specified writer.
    /// </summary>
    /// <param name="writer">The writer to which the report is written.</param>
    /// <param name="evaluator">The classifier evaluator.</param>
    /// <param name="x">The ground-truth feature vectors.</param>
    /// <param name="y">The ground-truth labels.</param>
    /// <param name="yPredicDistrib">The predictive distributions.</param>
    /// <param name="yPredicLabel">The predicted labels.</param>
    /// <remarks>Adapted from MicrosoftResearch.Infer.Learners</remarks>
    private void WriteReport(
        StreamWriter writer,
        ClassifierEvaluator<IList<Vector>, int, IList<string>, string> evaluator,
        Vector[] x,
        IList<string> y,
        IEnumerable<IDictionary<string, double>> yPredicDistrib,
        IEnumerable <string> yPredicLabel)
    {
        // Compute confusion matrix
        var confusionMatrix = evaluator.ConfusionMatrix(x, y, yPredicLabel);

        // Compute mean negative log probability
        double meanNegativeLogProbability =
            evaluator.Evaluate(x, y, yPredicDistrib, Metrics.NegativeLogProbability) / yPredicDistrib.Count();

        // Compute M-measure (averaged pairwise AUC)
        IDictionary<string, IDictionary<string, double>> aucMatrix;
        double auc = evaluator.AreaUnderRocCurve(x, y, yPredicDistrib, out aucMatrix);

        // Compute per-label AUC as well as micro- and macro-averaged AUC
        double microAuc;
        double macroAuc;
        int    macroAucClassLabelCount;
        var    labelAuc = this.ComputeLabelAuc(
            confusionMatrix,
            evaluator,
            x,
            y,
            yPredicDistrib,
            out microAuc,
            out macroAuc,
            out macroAucClassLabelCount);

        // Instance-averaged performance
        this.WriteInstanceAveragedPerformance(writer, confusionMatrix, meanNegativeLogProbability, microAuc);

        // Class-averaged performance
        this.WriteClassAveragedPerformance(writer, confusionMatrix, auc, macroAuc, macroAucClassLabelCount);

        // Performance on individual classes
        this.WriteIndividualClassPerformance(writer, confusionMatrix, labelAuc);

        // Confusion matrix
        this.WriteConfusionMatrix(writer, confusionMatrix);

        // Pairwise AUC
        this.WriteAucMatrix(writer, aucMatrix);
    }
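A minimal sketch of calling WriteReport, assuming the evaluator, ground truth, and predictions from the surrounding class are already in scope (the report file name is hypothetical):

    using (var writer = new StreamWriter("evaluation-report.txt"))
    {
        this.WriteReport(writer, evaluator, x, y, yPredicDistrib, yPredicLabel);
    }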
Example 4
        /// <summary>
        /// Writes the evaluation results using the specified writer.
        /// </summary>
        /// <param name="writer">The writer to which the report is written.</param>
        /// <param name="evaluator">The classifier evaluator.</param>
        /// <param name="groundTruth">The ground truth.</param>
        /// <param name="predictiveDistributions">The predictive distributions.</param>
        /// <param name="predictedLabels">The predicted labels.</param>
        private void WriteReport(
            StreamWriter writer,
            ClassifierEvaluator<IList<LabeledFeatureValues>, LabeledFeatureValues, IList<LabelDistribution>, string> evaluator,
            IList<LabeledFeatureValues> groundTruth,
            ICollection<IDictionary<string, double>> predictiveDistributions,
            IEnumerable <string> predictedLabels)
        {
            // Compute confusion matrix
            var confusionMatrix = evaluator.ConfusionMatrix(groundTruth, predictedLabels);

            // Compute mean negative log probability
            double meanNegativeLogProbability =
                evaluator.Evaluate(groundTruth, predictiveDistributions, Metrics.NegativeLogProbability) / predictiveDistributions.Count;

            // Compute M-measure (averaged pairwise AUC)
            IDictionary<string, IDictionary<string, double>> aucMatrix;
            double auc = evaluator.AreaUnderRocCurve(groundTruth, predictiveDistributions, out aucMatrix);

            // Compute per-label AUC as well as micro- and macro-averaged AUC
            double microAuc;
            double macroAuc;
            int    macroAucClassLabelCount;
            var    labelAuc = this.ComputeLabelAuc(
                confusionMatrix,
                evaluator,
                groundTruth,
                predictiveDistributions,
                out microAuc,
                out macroAuc,
                out macroAucClassLabelCount);

            // Instance-averaged performance
            this.WriteInstanceAveragedPerformance(writer, confusionMatrix, meanNegativeLogProbability, microAuc);

            // Class-averaged performance
            this.WriteClassAveragedPerformance(writer, confusionMatrix, auc, macroAuc, macroAucClassLabelCount);

            // Performance on individual classes
            this.WriteIndividualClassPerformance(writer, confusionMatrix, labelAuc);

            // Confusion matrix
            this.WriteConfusionMatrix(writer, confusionMatrix);

            // Pairwise AUC
            this.WriteAucMatrix(writer, aucMatrix);
        }
Example 5
        /// <summary>
        /// Runs the module.
        /// </summary>
        /// <param name="args">The command line arguments for the module.</param>
        /// <param name="usagePrefix">The prefix to print before the usage string.</param>
        /// <returns>True if the run was successful, false otherwise.</returns>
        public override bool Run(string[] args, string usagePrefix)
        {
            string dataSetFile = string.Empty;
            string resultsFile = string.Empty;
            int    crossValidationFoldCount = 5;
            int    iterationCount           = BayesPointMachineClassifierTrainingSettings.IterationCountDefault;
            int    batchCount           = BayesPointMachineClassifierTrainingSettings.BatchCountDefault;
            bool   computeModelEvidence = BayesPointMachineClassifierTrainingSettings.ComputeModelEvidenceDefault;

            var parser = new CommandLineParser();

            parser.RegisterParameterHandler("--data-set", "FILE", "File with training data", v => dataSetFile           = v, CommandLineParameterType.Required);
            parser.RegisterParameterHandler("--results", "FILE", "File with cross-validation results", v => resultsFile = v, CommandLineParameterType.Required);
            parser.RegisterParameterHandler("--folds", "NUM", "Number of cross-validation folds (defaults to " + crossValidationFoldCount + ")", v => crossValidationFoldCount = v, CommandLineParameterType.Optional);
            parser.RegisterParameterHandler("--iterations", "NUM", "Number of training algorithm iterations (defaults to " + iterationCount + ")", v => iterationCount         = v, CommandLineParameterType.Optional);
            parser.RegisterParameterHandler("--batches", "NUM", "Number of batches to split the training data into (defaults to " + batchCount + ")", v => batchCount          = v, CommandLineParameterType.Optional);
            parser.RegisterParameterHandler("--compute-evidence", "Compute model evidence (defaults to " + computeModelEvidence + ")", () => computeModelEvidence = true);

            if (!parser.TryParse(args, usagePrefix))
            {
                return false;
            }

            // Load and shuffle data
            var dataSet = ClassifierPersistenceUtils.LoadLabeledFeatureValues(dataSetFile);

            BayesPointMachineClassifierModuleUtilities.WriteDataSetInfo(dataSet);

            Rand.Restart(562);
            Rand.Shuffle(dataSet);

            // Create evaluator
            var evaluatorMapping = Mappings.Classifier.ForEvaluation();
            var evaluator        = new ClassifierEvaluator<IList<LabeledFeatureValues>, LabeledFeatureValues, IList<LabelDistribution>, string>(evaluatorMapping);

            // Create performance metrics
            var accuracy = new List<double>();
            var negativeLogProbability = new List<double>();
            var auc             = new List<double>();
            var evidence        = new List<double>();
            var iterationCounts = new List<double>();
            var trainingTime    = new List<double>();

            // Run cross-validation
            int validationSetSize = dataSet.Count / crossValidationFoldCount;

            Console.WriteLine("Running {0}-fold cross-validation on {1}", crossValidationFoldCount, dataSetFile);

            // TODO: Use chained mapping to implement cross-validation
            for (int fold = 0; fold < crossValidationFoldCount; fold++)
            {
                // Construct training and validation sets for fold
                int validationSetStart = fold * validationSetSize;
                int validationSetEnd   = (fold + 1 == crossValidationFoldCount)
                                           ? dataSet.Count
                                           : (fold + 1) * validationSetSize;

                var trainingSet   = new List<LabeledFeatureValues>();
                var validationSet = new List<LabeledFeatureValues>();

                for (int instance = 0; instance < dataSet.Count; instance++)
                {
                    if (validationSetStart <= instance && instance < validationSetEnd)
                    {
                        validationSet.Add(dataSet[instance]);
                    }
                    else
                    {
                        trainingSet.Add(dataSet[instance]);
                    }
                }

                // Print info
                Console.WriteLine("   Fold {0} [validation set instances {1} - {2}]", fold + 1, validationSetStart, validationSetEnd - 1);

                // Create classifier
                var classifier = BayesPointMachineClassifier.CreateBinaryClassifier(Mappings.Classifier);
                classifier.Settings.Training.IterationCount       = iterationCount;
                classifier.Settings.Training.BatchCount           = batchCount;
                classifier.Settings.Training.ComputeModelEvidence = computeModelEvidence;

                int currentIterationCount = 0;
                classifier.IterationChanged += (sender, eventArgs) => { currentIterationCount = eventArgs.CompletedIterationCount; };

                // Train classifier
                var stopWatch = new Stopwatch();
                stopWatch.Start();
                classifier.Train(trainingSet);
                stopWatch.Stop();

                // Produce predictions
                var predictions     = classifier.PredictDistribution(validationSet).ToList();
                var predictedLabels = predictions.Select(
                    prediction => prediction.Aggregate((aggregate, next) => next.Value > aggregate.Value ? next : aggregate).Key).ToList();

                // Iteration count
                iterationCounts.Add(currentIterationCount);

                // Training time
                trainingTime.Add(stopWatch.ElapsedMilliseconds);

                // Compute accuracy
                accuracy.Add(1 - (evaluator.Evaluate(validationSet, predictedLabels, Metrics.ZeroOneError) / predictions.Count));

                // Compute mean negative log probability
                negativeLogProbability.Add(evaluator.Evaluate(validationSet, predictions, Metrics.NegativeLogProbability) / predictions.Count);

                // Compute M-measure (averaged pairwise AUC)
                auc.Add(evaluator.AreaUnderRocCurve(validationSet, predictions));

                // Compute log evidence if desired
                evidence.Add(computeModelEvidence ? classifier.LogModelEvidence : double.NaN);

                // Persist performance metrics
                Console.WriteLine(
                    "      Accuracy = {0,5:0.0000}   NegLogProb = {1,5:0.0000}   AUC = {2,5:0.0000}{3}   Iterations = {4}   Training time = {5}",
                    accuracy[fold],
                    negativeLogProbability[fold],
                    auc[fold],
                    computeModelEvidence ? string.Format("   Log evidence = {0,5:0.0000}", evidence[fold]) : string.Empty,
                    iterationCounts[fold],
                    BayesPointMachineClassifierModuleUtilities.FormatElapsedTime(trainingTime[fold]));

                BayesPointMachineClassifierModuleUtilities.SavePerformanceMetrics(
                    resultsFile, accuracy, negativeLogProbability, auc, evidence, iterationCounts, trainingTime);
            }

            return true;
        }
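Assuming the module is hosted by the usual command-line runner, an invocation with the flags registered above might look like this (the executable name, subcommand, and file names are hypothetical):

    Learner.exe CrossValidate --data-set training.dat --results metrics.csv --folds 10 --iterations 50 --batches 2 --compute-evidence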
Example 6
    /// <summary>
    /// Cross-validates a Bayes point machine classifier on the given data.
    /// </summary>
    /// <param name="x">The ground-truth feature vectors.</param>
    /// <param name="y">The ground-truth labels.</param>
    /// <param name="mapping">The classifier mapping.</param>
    /// <param name="reportFileName">The name of the file to save the performance metrics to.</param>
    /// <param name="crossValidationFoldCount">The number of cross-validation folds.</param>
    /// <param name="iterationCount">The number of training algorithm iterations.</param>
    /// <param name="computeModelEvidence">If true, model evidence is computed during training.</param>
    /// <param name="batchCount">The number of batches to split the training data into.</param>
    /// <remarks>Adapted from MicrosoftResearch.Infer.Learners</remarks>
    public CrossValidateMapped(
        Vector[] x,
        IList<string> y,
        GenericClassifierMapping mapping,
        string reportFileName,
        int crossValidationFoldCount,
        int iterationCount,
        bool computeModelEvidence,
        int batchCount)
    {
        Debug.Assert(x != null, "The feature vector must not be null.");
        Debug.Assert(y != null, "The target variable must not be null.");
        Debug.Assert(x.Length == y.Count, "The numbers of feature vectors and labels must match.");
        Debug.Assert(mapping != null, "The classifier map must not be null.");
        Debug.Assert(!string.IsNullOrEmpty(reportFileName), "The report file name must not be null/empty.");
        Debug.Assert(iterationCount > 0, "The iteration count must be greater than zero.");
        Debug.Assert(batchCount > 0, "The batch count must be greater than zero.");

        // Shuffle dataset: permute features and labels with a single
        // Fisher-Yates pass so that every instance keeps its label
        for (int i = x.Length - 1; i > 0; i--)
        {
            int j = Rand.Int(i + 1);
            Vector tempX = x[i]; x[i] = x[j]; x[j] = tempX;
            string tempY = y[i]; y[i] = y[j]; y[j] = tempY;
        }

        // Create evaluator
        var evaluatorMapping = mapping.ForEvaluation();
        var evaluator        = new ClassifierEvaluator<
            IList<Vector>,          // the type of the instance source
            int,                    // the type of an instance
            IList<string>,          // the type of the label source
            string>(                // the type of a label
            evaluatorMapping);


        // Create performance metrics
        var accuracy = new List<double>();
        var negativeLogProbability = new List<double>();
        var auc             = new List<double>();
        var evidence        = new List<double>();
        var iterationCounts = new List<double>();
        var trainingTime    = new List<double>();

        // Run cross-validation
        int validationSetSize     = x.Length / crossValidationFoldCount;
        int trainingSetSize       = x.Length - validationSetSize;
        int validationFoldSetSize = 0;
        int trainingFoldSetSize   = 0;

        Console.WriteLine(
            "Running {0}-fold cross-validation", crossValidationFoldCount);

        if (validationSetSize == 0 || trainingSetSize == 0)
        {
            Console.WriteLine("Invalid number of folds");
            Console.ReadKey();
            System.Environment.Exit(1);
        }

        for (int fold = 0; fold < crossValidationFoldCount; fold++)
        {
            // Construct training and validation sets for fold
            int validationSetStart = fold * validationSetSize;
            int validationSetEnd   = (fold + 1 == crossValidationFoldCount)
                                       ? x.Length
                                       : (fold + 1) * validationSetSize;


            validationFoldSetSize = validationSetEnd - validationSetStart;
            trainingFoldSetSize   = x.Length - validationFoldSetSize;

            Vector[]      trainingSet         = new Vector[trainingFoldSetSize];
            Vector[]      validationSet       = new Vector[validationFoldSetSize];
            IList<string> trainingSetLabels   = new List<string>();
            IList<string> validationSetLabels = new List<string>();

            for (int instance = 0, iv = 0, it = 0; instance < x.Length; instance++)
            {
                if (validationSetStart <= instance && instance < validationSetEnd)
                {
                    validationSet[iv++] = x[instance];
                    validationSetLabels.Add(y[instance]);
                }
                else
                {
                    trainingSet[it++] = x[instance];
                    trainingSetLabels.Add(y[instance]);
                }
            }

            // Print info
            Console.WriteLine("   Fold {0} [validation set instances {1} - {2}]", fold + 1, validationSetStart, validationSetEnd - 1);

            // Create classifier
            var classifier = BayesPointMachineClassifier.CreateBinaryClassifier(mapping);
            classifier.Settings.Training.IterationCount       = iterationCount;
            classifier.Settings.Training.BatchCount           = batchCount;
            classifier.Settings.Training.ComputeModelEvidence = computeModelEvidence;

            int currentIterationCount = 0;
            classifier.IterationChanged += (sender, eventArgs) => { currentIterationCount = eventArgs.CompletedIterationCount; };

            // Train classifier
            var stopWatch = new Stopwatch();
            stopWatch.Start();
            classifier.Train(trainingSet, trainingSetLabels);
            stopWatch.Stop();

            // Produce predictions (materialized so that the lazy sequence is
            // not recomputed on every enumeration below)
            var predictions     = classifier.PredictDistribution(validationSet).ToList();
            var predictedLabels = classifier.Predict(validationSet);

            // Iteration count
            iterationCounts.Add(currentIterationCount);

            // Training time
            trainingTime.Add(stopWatch.ElapsedMilliseconds);

            // Compute accuracy
            accuracy.Add(1 - (evaluator.Evaluate(validationSet, validationSetLabels, predictedLabels, Metrics.ZeroOneError) / predictions.Count()));

            // Compute mean negative log probability
            negativeLogProbability.Add(evaluator.Evaluate(validationSet, validationSetLabels, predictions, Metrics.NegativeLogProbability) / predictions.Count());

            // Compute M-measure (averaged pairwise AUC)
            auc.Add(evaluator.AreaUnderRocCurve(validationSet, validationSetLabels, predictions));

            // Compute log evidence if desired
            evidence.Add(computeModelEvidence ? classifier.LogModelEvidence : double.NaN);

            // Persist performance metrics
            Console.WriteLine(
                "      Accuracy = {0,5:0.0000}   NegLogProb = {1,5:0.0000}   AUC = {2,5:0.0000}{3}   Iterations = {4}   Training time = {5}",
                accuracy[fold],
                negativeLogProbability[fold],
                auc[fold],
                computeModelEvidence ? string.Format("   Log evidence = {0,5:0.0000}", evidence[fold]) : string.Empty,
                iterationCounts[fold],
                FormatElapsedTime(trainingTime[fold]));

            SavePerformanceMetrics(
                reportFileName, accuracy, negativeLogProbability, auc, evidence, iterationCounts, trainingTime);
        }
    }
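A sketch of kicking off the cross-validation, assuming x, y, and mapping were loaded and constructed elsewhere (the report file name and settings are hypothetical; the parameter order follows the constructor above):

    var crossValidation = new CrossValidateMapped(
        x, y, mapping, "cv-report.csv",
        crossValidationFoldCount: 5,
        iterationCount: 30,
        computeModelEvidence: false,
        batchCount: 1);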