Esempio n. 1
0
        /// <summary>
        /// Upper bound, in bytes, on the size of the training data matrix built by CalculateWeights.
        /// CalculateWeights divides this by 8 (bytes per double) and by the number of columns per
        /// training row to get the maximum number of training peaks.
        /// </summary>
        private const int MAX_TRAINING_MEMORY = 512 * 1024 * 1024; // 512 MB

        /// <summary>
        /// Calculate new weight factors for one iteration of the refinement process.  This is the heart
        /// of the MProphet algorithm.
        /// </summary>
        /// <remarks>
        /// The method (1) optionally moves second-best target groups into the decoy set, (2) selects
        /// true target peaks with a q-value cutoff and decoy peaks via SelectMaxPeaks, (3) packs them
        /// into a training matrix whose size is capped by MAX_TRAINING_MEMORY (sampling in random order
        /// when the cap is reached), (4) runs Fisher LDA on that matrix, (5) writes the resulting
        /// weights back into <paramref name="weights"/>, and (6) rescores both sets, negating the
        /// weights if the target mean ends up below the decoy mean.
        /// </remarks>
        /// <param name="iteration">Iteration number (special processing happens for iteration 0).</param>
        /// <param name="targetTransitionGroups">Target transition groups.</param>
        /// <param name="decoyTransitionGroups">Decoy transition groups.  Second-best target groups are added to this set when <paramref name="includeSecondBest"/> is true.</param>
        /// <param name="includeSecondBest">Include the second best peaks in the targets as additional decoys?</param>
        /// <param name="weights">Array of weights per calculator.  Entries that are NaN are excluded from training and left as NaN; all other entries are overwritten with the weights produced by LDA.  Element 0 is reset on entry (NaN on iteration 0, otherwise 0).</param>
        /// <param name="decoyMean">Output mean of decoy transition groups.</param>
        /// <param name="decoyStdev">Output standard deviation of decoy transition groups.</param>
        /// <param name="colinearWarning">Set to true if colinearity was detected.  Never reset to false here.</param>
        private void CalculateWeights(
            int iteration,
            ScoredGroupPeaksSet targetTransitionGroups,
            ScoredGroupPeaksSet decoyTransitionGroups,
            bool includeSecondBest,
            double[] weights,
            out double decoyMean,
            out double decoyStdev,
            ref bool colinearWarning)
        {
            if (includeSecondBest)
            {
                // Split the targets; every second-best group is added to the decoy set.
                ScoredGroupPeaksSet secondBestTransitionGroups;
                targetTransitionGroups.SelectTargetsAndDecoys(out targetTransitionGroups, out secondBestTransitionGroups);
                foreach (var secondBestGroup in secondBestTransitionGroups.ScoredGroupPeaksList)
                {
                    decoyTransitionGroups.Add(secondBestGroup);
                }
            }

            // Select true target peaks using a q-value cutoff filter (a looser cutoff on iteration 0).
            double qValueCutoff = iteration == 0 ? 0.15 : 0.02;
            var truePeaks = targetTransitionGroups.SelectTruePeaks(qValueCutoff, Lambda, decoyTransitionGroups);
            var decoyPeaks = decoyTransitionGroups.SelectMaxPeaks();

            // Omit first feature during first iteration, since it is used as the initial score value.
            weights[0] = (iteration == 0) ? double.NaN : 0;
            int featureCount = weights.Count(w => !double.IsNaN(w));

            // Size the training data from the peaks that are actually copied below (true peaks plus
            // decoy peaks), so the matrix can neither overflow nor keep unused all-zero rows.
            int totalTrainingPeaks = truePeaks.Count + decoyPeaks.Count;
            // Calculate the maximum number of training peaks (8 bytes per score - double, featureCount + 1 scores per peak)
            int maxTrainingPeaks = MAX_TRAINING_MEMORY / 8 / (featureCount + 1);

            var trainData = new double[Math.Min(totalTrainingPeaks, maxTrainingPeaks), featureCount + 1];

            if (totalTrainingPeaks < maxTrainingPeaks)
            {
                // Everything fits: true peaks first (category 1), decoy peaks after them (category 0).
                for (int row = 0; row < truePeaks.Count; row++)
                {
                    CopyToTrainData(truePeaks[row].Features, trainData, weights, row, 1);
                }
                for (int row = 0; row < decoyPeaks.Count; row++)
                {
                    CopyToTrainData(decoyPeaks[row].Features, trainData, weights, truePeaks.Count + row, 0);
                }
            }
            else
            {
                // Too many peaks: sample in random order, keeping the true/decoy proportion.
                double proportionTrue = truePeaks.Count * 1.0 / totalTrainingPeaks;
                int truePeakCount = (int)Math.Round(maxTrainingPeaks * proportionTrue);
                int decoyPeakCount = maxTrainingPeaks - truePeakCount;

                int copied = 0;
                foreach (var peak in truePeaks.RandomOrder())
                {
                    if (copied >= truePeakCount)
                    {
                        break;
                    }
                    CopyToTrainData(peak.Features, trainData, weights, copied, 1);
                    copied++;
                }

                copied = 0;
                foreach (var peak in decoyPeaks.RandomOrder())
                {
                    if (copied >= decoyPeakCount)
                    {
                        break;
                    }
                    CopyToTrainData(peak.Features, trainData, weights, truePeakCount + copied, 0);
                    copied++;
                }
            }

            // Use Linear Discriminant Analysis to find weights that separate true and decoy peak scores.
            // The last column of trainData is not counted as a variable (hence the "- 1").
            int info;
            double[] weightsFromLda;
            alglib.fisherlda(
                trainData,
                trainData.GetLength(0),
                trainData.GetLength(1) - 1,
                2,
                out info,
                out weightsFromLda);

            // Check for colinearity.
            // NOTE(review): other values of info (e.g. failure codes) are not inspected here - confirm
            // against the ALGLIB documentation whether weightsFromLda is usable in those cases.
            if (info == 2)
            {
                colinearWarning = true;
            }

            // Unpack weights array: LDA weights are consumed in order by the non-NaN slots only.
            int ldaIndex = 0;
            for (int i = 0; i < weights.Length; i++)
            {
                if (!double.IsNaN(weights[i]))
                {
                    weights[i] = weightsFromLda[ldaIndex++];
                }
            }

            // Recalculate all peak scores.
            targetTransitionGroups.ScorePeaks(weights);
            decoyTransitionGroups.ScorePeaks(weights);

            // If the mean target score is less than the mean decoy score, then the
            // weights came out negative, and all the weights and scores must be negated to
            // restore the proper ordering.
            if (targetTransitionGroups.Mean < decoyTransitionGroups.Mean)
            {
                for (int i = 0; i < weights.Length; i++)
                {
                    weights[i] *= -1;
                }
                targetTransitionGroups.ScorePeaks(weights);
                decoyTransitionGroups.ScorePeaks(weights);
            }

            decoyMean = decoyTransitionGroups.Mean;
            decoyStdev = decoyTransitionGroups.Stdev;
        }
Esempio n. 2
0
        /// <summary>
        /// Upper bound, in bytes, on the size of the training data matrix built by CalculateWeights.
        /// CalculateWeights divides this by 8 (bytes per double) and by the number of columns per
        /// training row to get the maximum number of training peaks.
        /// </summary>
        private const int MAX_TRAINING_MEMORY = 512 * 1024 * 1024; // 512 MB

        /// <summary>
        /// Calculate new weight factors for one iteration of the refinement process.  This is the heart
        /// of the MProphet algorithm.
        /// </summary>
        /// <remarks>
        /// The method (1) optionally moves second-best target groups into the decoy set, (2) selects
        /// true target peaks with the given q-value cutoff and decoy peaks via SelectMaxPeaks,
        /// (3) rejects badly unbalanced target/decoy counts, (4) packs the peaks into a training matrix
        /// whose size is capped by MAX_TRAINING_MEMORY (sampling in seeded random order when the cap is
        /// reached), (5) runs Fisher LDA on that matrix, (6) writes the resulting weights back into
        /// <paramref name="weights"/>, and (7) rescores both sets, negating the weights if the target
        /// mean ends up below the decoy mean.
        /// </remarks>
        /// <param name="documentPath">The path to the current document for writing score distributions</param>
        /// <param name="targetTransitionGroups">Target transition groups.</param>
        /// <param name="decoyTransitionGroups">Decoy transition groups.  Second-best target groups are added to this set when <paramref name="includeSecondBest"/> is true.</param>
        /// <param name="includeSecondBest">Include the second best peaks in the targets as additional decoys?</param>
        /// <param name="nonParametricPValues">Non-parametric p values used in selecting true peaks if true</param>
        /// <param name="qValueCutoff">The q value cut-off for true peaks in the training</param>
        /// <param name="weights">Array of weights per calculator.  Entries that are NaN are excluded from training and left as NaN; all other entries are overwritten with the weights produced by LDA.</param>
        /// <param name="decoyMean">Output mean of decoy transition groups.</param>
        /// <param name="decoyStdev">Output standard deviation of decoy transition groups.</param>
        /// <param name="colinearWarning">Set to true if colinearity was detected.  Never reset to false here.</param>
        /// <returns>The number of true target peaks selected at the q-value cutoff.</returns>
        /// <exception cref="InvalidDataException">
        /// Thrown when the selected target peaks are fewer than 0.01% of the decoy peaks, or the decoy
        /// peaks are fewer than 0.1% of the selected target peaks.
        /// </exception>
        private int CalculateWeights(string documentPath,
                                     ScoredGroupPeaksSet targetTransitionGroups,
                                     ScoredGroupPeaksSet decoyTransitionGroups,
                                     bool includeSecondBest,
                                     bool nonParametricPValues,
                                     double qValueCutoff,
                                     double[] weights,
                                     out double decoyMean,
                                     out double decoyStdev,
                                     ref bool colinearWarning)
        {
            if (includeSecondBest)
            {
                // Split the targets; every second-best group is added to the decoy set.
                ScoredGroupPeaksSet secondBestTransitionGroups;
                targetTransitionGroups.SelectTargetsAndDecoys(out targetTransitionGroups, out secondBestTransitionGroups);
                foreach (var secondBestGroup in secondBestTransitionGroups.ScoredGroupPeaksList)
                {
                    decoyTransitionGroups.Add(secondBestGroup);
                }
            }

            // Select true target peaks using a q-value cutoff filter.
            var truePeaks = targetTransitionGroups.SelectTruePeaks(decoyTransitionGroups, qValueCutoff, Lambda, nonParametricPValues);
            var decoyPeaks = decoyTransitionGroups.SelectMaxPeaks();

            WriteDistributionInfo(documentPath, targetTransitionGroups, decoyTransitionGroups); // Only if asked to do so in command-line arguments

            // Better to let a really poor model through for the user to see than to give an error message here
            if (((double)truePeaks.Count) * 10 * 1000 < decoyPeaks.Count) // Targets must be at least 0.01% of decoys (still rejects zero)
            {
                throw new InvalidDataException(string.Format(Resources.MProphetPeakScoringModel_CalculateWeights_Insufficient_target_peaks___0__with__1__decoys__detected_at__2___FDR_to_continue_training_, truePeaks.Count, decoyPeaks.Count, qValueCutoff * 100));
            }
            if (((double)decoyPeaks.Count) * 1000 < truePeaks.Count) // Decoys must be at least 0.1% of targets
            {
                throw new InvalidDataException(string.Format(Resources.MProphetPeakScoringModel_CalculateWeights_Insufficient_decoy_peaks___0__with__1__targets__to_continue_training_, decoyPeaks.Count, truePeaks.Count));
            }

            // Only weights that are not NaN take part in training.
            int featureCount = weights.Count(w => !double.IsNaN(w));

            // Size the training data from the peaks that are actually copied below (true peaks plus
            // decoy peaks), so the matrix can neither overflow nor keep unused all-zero rows.
            int totalTrainingPeaks = truePeaks.Count + decoyPeaks.Count;
            // Calculate the maximum number of training peaks (8 bytes per score - double, featureCount + 1 scores per peak)
            int maxTrainingPeaks = MAX_TRAINING_MEMORY / 8 / (featureCount + 1);

            var trainData = new double[Math.Min(totalTrainingPeaks, maxTrainingPeaks), featureCount + 1];

            if (totalTrainingPeaks < maxTrainingPeaks)
            {
                // Everything fits: true peaks first (category 1), decoy peaks after them (category 0).
                for (int row = 0; row < truePeaks.Count; row++)
                {
                    CopyToTrainData(truePeaks[row].Features, trainData, weights, row, 1);
                }
                for (int row = 0; row < decoyPeaks.Count; row++)
                {
                    CopyToTrainData(decoyPeaks[row].Features, trainData, weights, truePeaks.Count + row, 0);
                }
            }
            else
            {
                // Too many peaks: sample in (seeded) random order, keeping the true/decoy proportion.
                double proportionTrue = truePeaks.Count * 1.0 / totalTrainingPeaks;
                int truePeakCount = (int)Math.Round(maxTrainingPeaks * proportionTrue);
                int decoyPeakCount = maxTrainingPeaks - truePeakCount;

                int copied = 0;
                foreach (var peak in truePeaks.RandomOrder(ArrayUtil.RANDOM_SEED))
                {
                    if (copied >= truePeakCount)
                    {
                        break;
                    }
                    CopyToTrainData(peak.Features, trainData, weights, copied, 1);
                    copied++;
                }

                copied = 0;
                foreach (var peak in decoyPeaks.RandomOrder(ArrayUtil.RANDOM_SEED))
                {
                    if (copied >= decoyPeakCount)
                    {
                        break;
                    }
                    CopyToTrainData(peak.Features, trainData, weights, truePeakCount + copied, 0);
                    copied++;
                }
            }

            // Use Linear Discriminant Analysis to find weights that separate true and decoy peak scores.
            // The last column of trainData is not counted as a variable (hence the "- 1").
            int info;
            double[] weightsFromLda;
            alglib.fisherlda(
                trainData,
                trainData.GetLength(0),
                trainData.GetLength(1) - 1,
                2,
                out info,
                out weightsFromLda);

            // Check for colinearity.
            // NOTE(review): other values of info (e.g. failure codes) are not inspected here - confirm
            // against the ALGLIB documentation whether weightsFromLda is usable in those cases.
            if (info == 2)
            {
                colinearWarning = true;
            }

            // Unpack weights array: LDA weights are consumed in order by the non-NaN slots only.
            int ldaIndex = 0;
            for (int i = 0; i < weights.Length; i++)
            {
                if (!double.IsNaN(weights[i]))
                {
                    weights[i] = weightsFromLda[ldaIndex++];
                }
            }

            // Recalculate all peak scores.
            targetTransitionGroups.ScorePeaks(weights);
            decoyTransitionGroups.ScorePeaks(weights);

            // If the mean target score is less than the mean decoy score, then the
            // weights came out negative, and all the weights and scores must be negated to
            // restore the proper ordering.
            if (targetTransitionGroups.Mean < decoyTransitionGroups.Mean)
            {
                for (int i = 0; i < weights.Length; i++)
                {
                    weights[i] *= -1;
                }
                targetTransitionGroups.ScorePeaks(weights);
                decoyTransitionGroups.ScorePeaks(weights);
            }

            decoyMean = decoyTransitionGroups.Mean;
            decoyStdev = decoyTransitionGroups.Stdev;
            return truePeaks.Count;
        }