Пример #1
0
        /// <summary>
        /// Reads a table of peak features from <paramref name="filePath"/> and distributes the peaks
        /// into target and decoy transition groups.
        /// </summary>
        /// <remarks>
        /// Header names are trimmed and lower-cased before matching. The header must contain a column
        /// starting with "main_var", at least one column starting with "var_", a "decoy" column and a
        /// "transition_group_id" column; otherwise an assertion fails. If several columns start with
        /// "main_var", the last one wins.
        /// </remarks>
        /// <param name="filePath">Path passed to the <c>Data</c> reader.</param>
        /// <param name="targetTransitionGroups">Receives the groups whose decoy flag is neither "1" nor "true".</param>
        /// <param name="decoyTransitionGroups">Receives the groups whose decoy flag is "1" or "true".</param>
        private void LoadData(
            string filePath,
            out ScoredGroupPeaksSet targetTransitionGroups,
            out ScoredGroupPeaksSet decoyTransitionGroups)
        {
            var data = new Data(filePath);

            // Find columns of interest in the data file header.
            var mainVarColumn           = -1;
            var decoyColumn             = -1;
            var transitionGroupIdColumn = -1;
            var varColumns = new List <int>();

            for (int i = 0; i < data.Header.Length; i++)
            {
                // Headers are machine-readable identifiers, so compare them ordinally rather than
                // with the rules of whatever culture the current thread happens to run under.
                var heading = data.Header[i].Trim().ToLowerInvariant();
                if (heading.StartsWith("main_var", System.StringComparison.Ordinal))    // Not L10N
                {
                    mainVarColumn = i;
                }
                else if (heading.StartsWith("var_", System.StringComparison.Ordinal))   // Not L10N
                {
                    varColumns.Add(i);
                }
                else if (heading == "decoy")                // Not L10N
                {
                    decoyColumn = i;
                }
                else if (heading == "transition_group_id")  // Not L10N
                {
                    transitionGroupIdColumn = i;
                }
            }

            Assert.AreNotEqual(-1, mainVarColumn);
            Assert.AreNotEqual(-1, decoyColumn);
            Assert.AreNotEqual(-1, transitionGroupIdColumn);
            Assert.AreNotEqual(0, varColumns.Count);

            // Create transition groups to be filled from data file.
            targetTransitionGroups = new ScoredGroupPeaksSet();
            decoyTransitionGroups  = new ScoredGroupPeaksSet();
            var featuresCount             = varColumns.Count + 1;
            var transitionGroupDictionary = new Dictionary <string, ScoredGroupPeaks>();

            // Process each row containing features for a peak.
            for (int i = 0; i < data.Items.GetLength(0); i++)
            {
                // Invariant lower-casing keeps the flag (and the group key built from it) independent
                // of the current culture, matching how the header names are normalized above.
                var decoy             = data.Items[i, decoyColumn].Trim().ToLowerInvariant();
                var transitionGroupId = data.Items[i, transitionGroupIdColumn] + decoy; // Append decoy to make unique groups of decoy/target peaks.

                // The peak belongs to a transition group.  Have we seen this group before?
                // A single TryGetValue replaces the former ContainsKey + indexer double lookup.
                ScoredGroupPeaks transitionGroup;
                if (!transitionGroupDictionary.TryGetValue(transitionGroupId, out transitionGroup))
                {
                    // Create a new transition group.
                    transitionGroup = new ScoredGroupPeaks {
                        Id = transitionGroupId
                    };
                    transitionGroupDictionary.Add(transitionGroupId, transitionGroup);

                    // Add the new group to the collection of decoy or target groups.
                    if (decoy == "1" || decoy == "true")    // Not L10N
                    {
                        decoyTransitionGroups.Add(transitionGroup);
                    }
                    else
                    {
                        targetTransitionGroups.Add(transitionGroup);
                    }
                }

                // Parse feature values for this peak: main_var first, then the var_ columns in header order.
                var features = new float[featuresCount];
                features[0] = (float)double.Parse(data.Items[i, mainVarColumn], CultureInfo.InvariantCulture);
                for (int j = 0; j < varColumns.Count; j++)
                {
                    features[j + 1] = (float)double.Parse(data.Items[i, varColumns[j]], CultureInfo.InvariantCulture);
                }

                // Add the peak to its transition group.
                transitionGroup.Add(new ScoredPeak(features));
            }
        }
Пример #2
0
        /// <summary>
        /// Produces a copy of this model whose weights are taken from <c>DEFAULT_WEIGHTS</c> and then
        /// rescaled by the mean and standard deviation of the decoy scores.
        /// </summary>
        /// <param name="targets">Target transition groups.</param>
        /// <param name="decoys">Decoy transition groups.</param>
        /// <param name="initParameters">
        /// Consulted only for which weights are NaN: those stay NaN, every other weight is replaced by
        /// the default weight at the same index.
        /// </param>
        /// <param name="includeSecondBest">Include the second best peaks in the targets as decoys?</param>
        /// <param name="preTrain">Not read by this implementation.</param>
        /// <param name="progressMonitor">Not read by this implementation.</param>
        /// <returns>The result of <c>ChangeProp</c> applied to the modified clone.</returns>
        public override IPeakScoringModel Train(IList<IList<float[]>> targets, IList<IList<float[]>> decoys, LinearModelParams initParameters,
            bool includeSecondBest = false, bool preTrain = true, IProgressMonitor progressMonitor = null)
        {
            return ChangeProp(ImClone(this), im =>
            {
                // NaN slots stay NaN; all remaining slots take their default weight.
                int nWeights = initParameters.Weights.Count;
                var defaultedWeights = new double[nWeights];
                for (int index = 0; index < nWeights; ++index)
                {
                    if (double.IsNaN(initParameters.Weights[index]))
                        defaultedWeights[index] = double.NaN;
                    else
                        defaultedWeights[index] = DEFAULT_WEIGHTS[index];
                }
                var modelParams = new LinearModelParams(defaultedWeights);

                var decoyGroups = new ScoredGroupPeaksSet(decoys);
                var targetGroups = new ScoredGroupPeaksSet(targets);

                // Targets must be scored before the best / second-best split below.
                targetGroups.ScorePeaks(modelParams.Weights);
                if (includeSecondBest)
                {
                    ScoredGroupPeaksSet runnerUpGroups;
                    targetGroups.SelectTargetsAndDecoys(out targetGroups, out runnerUpGroups);
                    foreach (var runnerUp in runnerUpGroups.ScoredGroupPeaksList)
                        decoyGroups.Add(runnerUp);
                }

                // Score decoys only after any second-best groups have joined them.
                decoyGroups.ScorePeaks(modelParams.Weights);

                im.UsesDecoys = decoys.Count > 0;
                im.UsesSecondBest = includeSecondBest;

                var decoyMean = decoyGroups.Mean;
                var decoyStdev = decoyGroups.Stdev;
                im.Parameters = modelParams.RescaleParameters(decoyMean, decoyStdev);
            });
        }
Пример #3
0
        /// <summary>
        /// Trains the model by repeatedly recalculating the weights that separate target transition
        /// groups from decoy transition groups.
        /// </summary>
        /// <param name="targets">Target transition groups; empty groups are ignored.</param>
        /// <param name="decoys">Decoy transition groups; empty groups are ignored.</param>
        /// <param name="initParameters">
        /// Initial model parameters (weights and bias). When null, parameters sized to this model's
        /// feature calculators are created.
        /// </param>
        /// <param name="includeSecondBest">Include the second best peaks in the targets as decoys?</param>
        /// <param name="preTrain">Use a pre-trained model to bootstrap the learning.</param>
        /// <param name="progressMonitor">
        /// Optional monitor that receives a progress update per iteration and is polled for
        /// cancellation; may be null.
        /// </param>
        /// <returns>Immutable model with new weights.</returns>
        /// <exception cref="OperationCanceledException">
        /// Thrown from the training callback when the progress monitor reports cancellation.
        /// </exception>
        public override IPeakScoringModel Train(IList<IList<float[]>> targets, IList<IList<float[]>> decoys, LinearModelParams initParameters,
            bool includeSecondBest = false, bool preTrain = true, IProgressMonitor progressMonitor = null)
        {
            initParameters = initParameters ?? new LinearModelParams(_peakFeatureCalculators.Count);

            return ChangeProp(ImClone(this), im =>
                {
                    // Drop empty groups before building the peak sets.
                    targets = targets.Where(group => group.Count > 0).ToList();
                    decoys = decoys.Where(group => group.Count > 0).ToList();
                    var targetGroups = new ScoredGroupPeaksSet(targets);
                    var decoyGroups = new ScoredGroupPeaksSet(decoys);

                    // Bootstrap from the pre-trained legacy model
                    if (preTrain)
                    {
                        // Weights that are NaN in the initial parameters stay NaN; the rest start at zero.
                        var bootstrapWeights = new double[initParameters.Weights.Count];
                        for (int w = 0; w < bootstrapWeights.Length; ++w)
                        {
                            if (double.IsNaN(initParameters.Weights[w]))
                                bootstrapWeights[w] = double.NaN;
                        }

                        // Use whichever legacy calculator list has more enabled entries (standards win ties).
                        int enabledStandard = GetEnabledCount(LegacyScoringModel.StandardFeatureCalculators, initParameters.Weights);
                        int enabledAnalyte = GetEnabledCount(LegacyScoringModel.AnalyteFeatureCalculators, initParameters.Weights);
                        var legacyCalculators = enabledStandard >= enabledAnalyte
                            ? LegacyScoringModel.StandardFeatureCalculators
                            : LegacyScoringModel.AnalyteFeatureCalculators;
                        for (int c = 0; c < legacyCalculators.Length; ++c)
                        {
                            var calculatorType = legacyCalculators[c].GetType();
                            // The retention time prediction calculator is left out of the bootstrap weights.
                            if (calculatorType != typeof (MQuestRetentionTimePredictionCalc))
                                SetCalculatorValue(calculatorType, LegacyScoringModel.DEFAULT_WEIGHTS[c], bootstrapWeights);
                        }

                        targetGroups.ScorePeaks(bootstrapWeights);
                        decoyGroups.ScorePeaks(bootstrapWeights);
                    }

                    // Iteratively refine a private copy of the initial weights.
                    double[] refinedWeights = initParameters.Weights.ToArray();
                    double decoyMean = 0;
                    double decoyStdev = 0;
                    bool colinearWarning = false;

                    // This may take a long time between progress updates, but just measure progress by cycles through the training
                    bool reportProgress = progressMonitor != null;
                    var status = new ProgressStatus(Resources.MProphetPeakScoringModel_Train_Training_peak_scoring_model);
                    if (reportProgress)
                        progressMonitor.UpdateProgress(status);

                    for (int iteration = 0; iteration < MAX_ITERATIONS; iteration++)
                    {
                        if (reportProgress)
                        {
                            if (progressMonitor.IsCanceled)
                                throw new OperationCanceledException();

                            string message = string.Format(
                                Resources.MProphetPeakScoringModel_Train_Training_peak_scoring_model__iteration__0__of__1__,
                                iteration + 1, MAX_ITERATIONS);
                            int percent = (iteration + 1) * 100 / (MAX_ITERATIONS + 1);
                            status = status.ChangeMessage(message).ChangePercentComplete(percent);
                            progressMonitor.UpdateProgress(status);
                        }

                        im.CalculateWeights(iteration, targetGroups, decoyGroups,
                                            includeSecondBest, refinedWeights, out decoyMean, out decoyStdev, ref colinearWarning);

                        GC.Collect();   // Each loop generates a number of large objects. GC helps to keep private bytes under control
                    }

                    if (reportProgress)
                        progressMonitor.UpdateProgress(status.ChangePercentComplete(100));

                    // Store the refined weights, rescaled by the decoy statistics of the last iteration.
                    im.Parameters = new LinearModelParams(refinedWeights).RescaleParameters(decoyMean, decoyStdev);
                    im.ColinearWarning = colinearWarning;
                    im.UsesSecondBest = includeSecondBest;
                    im.UsesDecoys = decoys.Count > 0;
                });
        }
Пример #4
0
        /// <summary>
        /// Calculate new weight factors for one iteration of the refinement process.  This is the heart
        /// of the MProphet algorithm.
        /// </summary>
        /// <remarks>
        /// Side effects visible to the caller: <paramref name="weights"/> is overwritten in place, and
        /// the peaks of the decoy set (and of the target set actually used) are re-scored with the new
        /// weights.
        /// NOTE(review): when <paramref name="includeSecondBest"/> is true the second-best groups are
        /// added to the <paramref name="decoyTransitionGroups"/> instance passed in; assuming
        /// ScoredGroupPeaksSet is a reference type, a caller that reuses the same set across
        /// iterations will see it grow on every call - confirm this is intended.
        /// </remarks>
        /// <param name="iteration">Iteration number (special processing happens for iteration 0).</param>
        /// <param name="targetTransitionGroups">Target transition groups.</param>
        /// <param name="decoyTransitionGroups">Decoy transition groups.</param>
        /// <param name="includeSecondBest">Include the second best peaks in the targets as additional decoys?</param>
        /// <param name="weights">Array of weights per calculator; NaN entries are excluded from training and stay NaN.</param>
        /// <param name="decoyMean">Output mean of decoy transition groups.</param>
        /// <param name="decoyStdev">Output standard deviation of decoy transition groups.</param>
        /// <param name="colinearWarning">Set to true if colinearity was detected; never reset to false here.</param>
        private void CalculateWeights(
            int iteration,
            ScoredGroupPeaksSet targetTransitionGroups,
            ScoredGroupPeaksSet decoyTransitionGroups,
            bool includeSecondBest,
            double[] weights,
            out double decoyMean,
            out double decoyStdev,
            ref bool colinearWarning)
        {
            if (includeSecondBest)
            {
                // Split the targets into best and second-best groups; the second-best groups are
                // treated as additional decoys.  Only the local target reference is replaced.
                ScoredGroupPeaksSet secondBestTransitionGroups;
                targetTransitionGroups.SelectTargetsAndDecoys(out targetTransitionGroups, out secondBestTransitionGroups);
                foreach (var secondBestGroup in secondBestTransitionGroups.ScoredGroupPeaksList)
                {
                    decoyTransitionGroups.Add(secondBestGroup);
                }

            }

            // Select true target peaks using a q-value cutoff filter.
            var qValueCutoff = (iteration == 0 ? 0.15 : 0.02);
            var truePeaks = targetTransitionGroups.SelectTruePeaks(qValueCutoff, Lambda, decoyTransitionGroups);
            var decoyPeaks = decoyTransitionGroups.SelectMaxPeaks();

            // Omit first feature during first iteration, since it is used as the initial score value.
            weights[0] = (iteration == 0) ? double.NaN : 0;
            var featureCount = weights.Count(w => !double.IsNaN(w));

            // Copy target and decoy peaks to training data array.
            // Size the array by the peaks that are actually copied below (decoyPeaks), not by the
            // number of decoy groups: if the two counts ever differ, sizing by groups would either
            // leave all-zero rows labelled as decoys or overrun the array.
            int totalTrainingPeaks = truePeaks.Count + decoyPeaks.Count;
            // Calculate the maximum number of training peaks (8 bytes per score - double, featureCount + 1 scores per peak)
            int maxTrainingPeaks = MAX_TRAINING_MEMORY/8/(featureCount + 1);

            var trainData = new double[Math.Min(totalTrainingPeaks, maxTrainingPeaks), featureCount + 1];
            if (totalTrainingPeaks < maxTrainingPeaks)
            {
                // Everything fits: true peaks first (class 1), then decoy peaks (class 0).
                for (int i = 0; i < truePeaks.Count; i++)
                    CopyToTrainData(truePeaks[i].Features, trainData, weights, i, 1);
                for (int i = 0; i < decoyPeaks.Count; i++)
                    CopyToTrainData(decoyPeaks[i].Features, trainData, weights, i + truePeaks.Count, 0);
            }
            else
            {
                // Too many peaks for the memory budget: take a random sample that keeps the
                // true/decoy proportion of the full set.
                double proportionTrue = truePeaks.Count*1.0/totalTrainingPeaks;
                int truePeakCount = (int) Math.Round(maxTrainingPeaks*proportionTrue);
                int i = 0;
                foreach (var peak in truePeaks.RandomOrder())
                {
                    if (i < truePeakCount)
                        CopyToTrainData(peak.Features, trainData, weights, i, 1);
                    else
                        break;
                    i++;
                }
                int decoyPeakCount = maxTrainingPeaks - truePeakCount;
                i = 0;
                foreach (var peak in decoyPeaks.RandomOrder())
                {
                    if (i < decoyPeakCount)
                        CopyToTrainData(peak.Features, trainData, weights, i + truePeakCount, 0);
                    else
                        break;
                    i++;
                }
            }

            // Use Linear Discriminant Analysis to find weights that separate true and decoy peak scores.
            // The last column of trainData holds the class label, hence GetLength(1) - 1 variables.
            int info;
            double[] weightsFromLda;
            alglib.fisherlda(
                trainData,
                trainData.GetLength(0),
                trainData.GetLength(1) - 1,
                2,
                out info,
                out weightsFromLda);

            // Check for colinearity.
            if (info == 2)
            {
                colinearWarning = true;
            }

            // Unpack weights array: LDA weights are dense, so skip the NaN (disabled) slots.
            for (int i = 0, j = 0; i < weights.Length; i++)
            {
                if (!double.IsNaN(weights[i]))
                    weights[i] = weightsFromLda[j++];
            }

            // Recalculate all peak scores.
            targetTransitionGroups.ScorePeaks(weights);
            decoyTransitionGroups.ScorePeaks(weights);

            // If the mean target score is less than the mean decoy score, then the
            // weights came out negative, and all the weights and scores must be negated to
            // restore the proper ordering.
            if (targetTransitionGroups.Mean < decoyTransitionGroups.Mean)
            {
                for (int i = 0; i < weights.Length; i++)
                    weights[i] *= -1;
                targetTransitionGroups.ScorePeaks(weights);
                decoyTransitionGroups.ScorePeaks(weights);
            }

            decoyMean = decoyTransitionGroups.Mean;
            decoyStdev = decoyTransitionGroups.Stdev;
        }
        /// <summary>
        /// Loads peak feature rows from <paramref name="filePath"/> and groups them into target and
        /// decoy transition groups.
        /// </summary>
        /// <remarks>
        /// Header names are trimmed and lower-cased before matching. An assertion fails unless the
        /// header has a column starting with "main_var", at least one column starting with "var_", a
        /// "decoy" column and a "transition_group_id" column. When several columns start with
        /// "main_var", the last one is used.
        /// </remarks>
        /// <param name="filePath">Path passed to the <c>Data</c> reader.</param>
        /// <param name="targetTransitionGroups">Receives the groups whose decoy flag is neither "1" nor "true".</param>
        /// <param name="decoyTransitionGroups">Receives the groups whose decoy flag is "1" or "true".</param>
        private void LoadData(
            string filePath,
            out ScoredGroupPeaksSet targetTransitionGroups,
            out ScoredGroupPeaksSet decoyTransitionGroups)
        {
            var data = new Data(filePath);

            // Find columns of interest in the data file header.
            var mainVarColumn = -1;
            var decoyColumn = -1;
            var transitionGroupIdColumn = -1;
            var varColumns = new List<int>();
            for (int i = 0; i < data.Header.Length; i++)
            {
                // Column names are identifiers, not user text: match them ordinally so the result
                // does not depend on the current thread culture.
                var heading = data.Header[i].Trim().ToLowerInvariant();
                if (heading.StartsWith("main_var", System.StringComparison.Ordinal))    // Not L10N
                    mainVarColumn = i;
                else if (heading.StartsWith("var_", System.StringComparison.Ordinal))   // Not L10N
                    varColumns.Add(i);
                else if (heading == "decoy")                // Not L10N
                    decoyColumn = i;
                else if (heading == "transition_group_id")  // Not L10N
                    transitionGroupIdColumn = i;
            }

            Assert.AreNotEqual(-1, mainVarColumn);
            Assert.AreNotEqual(-1, decoyColumn);
            Assert.AreNotEqual(-1, transitionGroupIdColumn);
            Assert.AreNotEqual(0, varColumns.Count);

            // Create transition groups to be filled from data file.
            targetTransitionGroups = new ScoredGroupPeaksSet();
            decoyTransitionGroups = new ScoredGroupPeaksSet();
            var featuresCount = varColumns.Count + 1;
            var transitionGroupDictionary = new Dictionary<string, ScoredGroupPeaks>();

            // Process each row containing features for a peak.
            for (int i = 0; i < data.Items.GetLength(0); i++)
            {
                // Normalize the flag with the invariant culture, like the header names above, so the
                // group key built from it is the same on every machine.
                var decoy = data.Items[i, decoyColumn].Trim().ToLowerInvariant();
                var transitionGroupId = data.Items[i, transitionGroupIdColumn] + decoy; // Append decoy to make unique groups of decoy/target peaks.

                // The peak belongs to a transition group.  Have we seen this group before?
                // One TryGetValue call replaces the former ContainsKey + indexer double lookup.
                ScoredGroupPeaks transitionGroup;
                if (!transitionGroupDictionary.TryGetValue(transitionGroupId, out transitionGroup))
                {
                    // Create a new transition group.
                    transitionGroup = new ScoredGroupPeaks { Id = transitionGroupId };
                    transitionGroupDictionary.Add(transitionGroupId, transitionGroup);

                    // Add the new group to the collection of decoy or target groups.
                    if (decoy == "1" || decoy == "true")    // Not L10N
                        decoyTransitionGroups.Add(transitionGroup);
                    else
                        targetTransitionGroups.Add(transitionGroup);
                }

                // Parse feature values for this peak: main_var first, then the var_ columns in header order.
                var features = new float[featuresCount];
                features[0] = (float) double.Parse(data.Items[i, mainVarColumn], CultureInfo.InvariantCulture);
                for (int j = 0; j < varColumns.Count; j++)
                    features[j + 1] = (float) double.Parse(data.Items[i, varColumns[j]], CultureInfo.InvariantCulture);

                // Add the peak to its transition group.
                transitionGroup.Add(new ScoredPeak(features));
            }
        }