/// <summary>
/// Reads a table of peak features through <c>Data</c> and groups its rows into target and
/// decoy transition groups.
/// </summary>
/// <param name="filePath">Path passed to the <c>Data</c> constructor.</param>
/// <param name="targetTransitionGroups">Receives the groups whose decoy flag is neither "1" nor "true".</param>
/// <param name="decoyTransitionGroups">Receives the groups whose decoy flag is "1" or "true".</param>
/// <remarks>
/// Asserts that the header has a "main_var*" column, a "decoy" column, a "transition_group_id"
/// column and at least one "var_*" column.
/// </remarks>
private void LoadData(
    string filePath,
    out ScoredGroupPeaksSet targetTransitionGroups,
    out ScoredGroupPeaksSet decoyTransitionGroups)
{
    var data = new Data(filePath);

    // Find columns of interest in the data file header.
    var mainVarColumn = -1;
    var decoyColumn = -1;
    var transitionGroupIdColumn = -1;
    var varColumns = new List<int>();
    for (int i = 0; i < data.Header.Length; i++)
    {
        var heading = data.Header[i].Trim().ToLowerInvariant();

        // Column names are machine identifiers: match them ordinally, not by current culture.
        if (heading.StartsWith("main_var", System.StringComparison.Ordinal))    // Not L10N
        {
            mainVarColumn = i;
        }
        else if (heading.StartsWith("var_", System.StringComparison.Ordinal))   // Not L10N
        {
            varColumns.Add(i);
        }
        else if (heading == "decoy")    // Not L10N
        {
            decoyColumn = i;
        }
        else if (heading == "transition_group_id")  // Not L10N
        {
            transitionGroupIdColumn = i;
        }
    }

    Assert.AreNotEqual(-1, mainVarColumn);
    Assert.AreNotEqual(-1, decoyColumn);
    Assert.AreNotEqual(-1, transitionGroupIdColumn);
    Assert.AreNotEqual(0, varColumns.Count);

    // Create transition groups to be filled from data file.
    targetTransitionGroups = new ScoredGroupPeaksSet();
    decoyTransitionGroups = new ScoredGroupPeaksSet();
    var featuresCount = varColumns.Count + 1;   // main_var plus every var_ column
    var transitionGroupDictionary = new Dictionary<string, ScoredGroupPeaks>();

    // Process each row containing features for a peak.
    for (int i = 0; i < data.Items.GetLength(0); i++)
    {
        // Normalize the flag the same culture-independent way the header is normalized.
        var decoy = data.Items[i, decoyColumn].Trim().ToLowerInvariant();

        // Append decoy to make unique groups of decoy/target peaks.
        var transitionGroupId = data.Items[i, transitionGroupIdColumn] + decoy;

        // The peak belongs to a transition group.  Have we seen this group before?
        ScoredGroupPeaks transitionGroup;
        if (!transitionGroupDictionary.TryGetValue(transitionGroupId, out transitionGroup))
        {
            // Create a new transition group.
            transitionGroup = new ScoredGroupPeaks { Id = transitionGroupId };
            transitionGroupDictionary.Add(transitionGroupId, transitionGroup);

            // Add the new group to the collection of decoy or target groups.
            if (decoy == "1" || decoy == "true")    // Not L10N
            {
                decoyTransitionGroups.Add(transitionGroup);
            }
            else
            {
                targetTransitionGroups.Add(transitionGroup);
            }
        }

        // Parse feature values for this peak: slot 0 holds main_var, the rest follow var_ column order.
        var features = new float[featuresCount];
        features[0] = (float)double.Parse(data.Items[i, mainVarColumn], CultureInfo.InvariantCulture);
        for (int j = 0; j < varColumns.Count; j++)
        {
            features[j + 1] = (float)double.Parse(data.Items[i, varColumns[j]], CultureInfo.InvariantCulture);
        }

        // Add the peak to its transition group.
        transitionGroup.Add(new ScoredPeak(features));
    }
}
/// <summary>
/// Produces a copy of this model whose weights are taken from <c>DEFAULT_WEIGHTS</c>
/// (NaN wherever the initial parameters are NaN) and rescaled by the mean and standard
/// deviation of the scored decoy groups.
/// </summary>
/// <param name="targets">Target transition groups.</param>
/// <param name="decoys">Decoy transition groups.</param>
/// <param name="initParameters">Initial parameters; only the NaN pattern of its weights is read.</param>
/// <param name="includeSecondBest">When true, the second set returned by <c>SelectTargetsAndDecoys</c> is added to the decoys.</param>
/// <param name="preTrain">Not used by this implementation.</param>
/// <param name="progressMonitor">Not used by this implementation.</param>
/// <returns>The model returned by <c>ChangeProp</c> for the modified clone.</returns>
public override IPeakScoringModel Train(IList<IList<float[]>> targets, IList<IList<float[]>> decoys, LinearModelParams initParameters,
    bool includeSecondBest = false, bool preTrain = true, IProgressMonitor progressMonitor = null)
{
    return ChangeProp(ImClone(this), im =>
    {
        // A NaN initial weight stays NaN; every other slot takes the default weight.
        int weightCount = initParameters.Weights.Count;
        var defaultedWeights = new double[weightCount];
        for (int w = 0; w < weightCount; ++w)
        {
            if (double.IsNaN(initParameters.Weights[w]))
            {
                defaultedWeights[w] = double.NaN;
            }
            else
            {
                defaultedWeights[w] = DEFAULT_WEIGHTS[w];
            }
        }
        var modelParams = new LinearModelParams(defaultedWeights);

        var decoyGroups = new ScoredGroupPeaksSet(decoys);
        var targetGroups = new ScoredGroupPeaksSet(targets);

        // Targets are scored before the optional target / second-best split.
        targetGroups.ScorePeaks(modelParams.Weights);
        if (includeSecondBest)
        {
            ScoredGroupPeaksSet runnerUpGroups;
            targetGroups.SelectTargetsAndDecoys(out targetGroups, out runnerUpGroups);
            foreach (var runnerUp in runnerUpGroups.ScoredGroupPeaksList)
            {
                decoyGroups.Add(runnerUp);
            }
        }

        // Decoys (including any groups added above) are scored last; their statistics drive the rescale.
        decoyGroups.ScorePeaks(modelParams.Weights);

        im.UsesDecoys = decoys.Count > 0;
        im.UsesSecondBest = includeSecondBest;
        im.Parameters = modelParams.RescaleParameters(decoyGroups.Mean, decoyGroups.Stdev);
    });
}
/// <summary>
/// Trains the model by iteratively recalculating weights that separate target and decoy
/// transition groups.
/// </summary>
/// <param name="targets">Target transition groups; empty groups are ignored.</param>
/// <param name="decoys">Decoy transition groups; empty groups are ignored.</param>
/// <param name="initParameters">Initial model parameters (weights and bias). When null, parameters sized to the feature calculator count are used.</param>
/// <param name="includeSecondBest">Include the second best peaks in the targets as decoys?</param>
/// <param name="preTrain">Score the peaks with the legacy default weights before the first iteration.</param>
/// <param name="progressMonitor">Optional monitor that receives per-iteration progress and is polled for cancellation.</param>
/// <returns>Immutable model with new weights.</returns>
public override IPeakScoringModel Train(IList<IList<float[]>> targets, IList<IList<float[]>> decoys, LinearModelParams initParameters,
    bool includeSecondBest = false, bool preTrain = true, IProgressMonitor progressMonitor = null)
{
    if (initParameters == null)
    {
        initParameters = new LinearModelParams(_peakFeatureCalculators.Count);
    }

    return ChangeProp(ImClone(this), im =>
    {
        // Drop groups that contain no peaks before building the scored sets.
        targets = targets.Where(list => list.Count > 0).ToList();
        decoys = decoys.Where(list => list.Count > 0).ToList();
        var targetGroups = new ScoredGroupPeaksSet(targets);
        var decoyGroups = new ScoredGroupPeaksSet(decoys);

        // Bootstrap from the pre-trained legacy model.
        if (preTrain)
        {
            // Start from all zeros, carrying over NaN wherever the initial weight is NaN.
            int legacyWeightCount = initParameters.Weights.Count;
            var legacyWeights = new double[legacyWeightCount];
            for (int w = 0; w < legacyWeightCount; ++w)
            {
                if (double.IsNaN(initParameters.Weights[w]))
                {
                    legacyWeights[w] = double.NaN;
                }
            }

            // Use whichever legacy calculator list has at least as many enabled entries; ties go to the standard list.
            int enabledStandards = GetEnabledCount(LegacyScoringModel.StandardFeatureCalculators, initParameters.Weights);
            int enabledAnalytes = GetEnabledCount(LegacyScoringModel.AnalyteFeatureCalculators, initParameters.Weights);
            var legacyCalculators = enabledStandards >= enabledAnalytes
                ? LegacyScoringModel.StandardFeatureCalculators
                : LegacyScoringModel.AnalyteFeatureCalculators;

            for (int c = 0; c < legacyCalculators.Length; ++c)
            {
                var calculatorType = legacyCalculators[c].GetType();
                // The retention time prediction calculator gets no pre-trained weight.
                if (calculatorType != typeof(MQuestRetentionTimePredictionCalc))
                {
                    SetCalculatorValue(calculatorType, LegacyScoringModel.DEFAULT_WEIGHTS[c], legacyWeights);
                }
            }

            targetGroups.ScorePeaks(legacyWeights);
            decoyGroups.ScorePeaks(legacyWeights);
        }

        // Working copy of the initial weights, refined in place by every iteration.
        int weightCount = initParameters.Weights.Count;
        var calcWeights = new double[weightCount];
        for (int w = 0; w < weightCount; ++w)
        {
            calcWeights[w] = initParameters.Weights[w];
        }

        double decoyMean = 0;
        double decoyStdev = 0;
        bool colinearWarning = false;

        // This may take a long time between progress updates, but just measure progress by cycles through the training
        bool reportProgress = progressMonitor != null;
        var status = new ProgressStatus(Resources.MProphetPeakScoringModel_Train_Training_peak_scoring_model);
        if (reportProgress)
        {
            progressMonitor.UpdateProgress(status);
        }

        for (int iteration = 0; iteration < MAX_ITERATIONS; iteration++)
        {
            if (reportProgress)
            {
                if (progressMonitor.IsCanceled)
                {
                    throw new OperationCanceledException();
                }

                var message = string.Format(
                    Resources.MProphetPeakScoringModel_Train_Training_peak_scoring_model__iteration__0__of__1__,
                    iteration + 1, MAX_ITERATIONS);
                var percentComplete = (iteration + 1) * 100 / (MAX_ITERATIONS + 1);
                status = status.ChangeMessage(message).ChangePercentComplete(percentComplete);
                progressMonitor.UpdateProgress(status);
            }

            im.CalculateWeights(iteration, targetGroups, decoyGroups, includeSecondBest, calcWeights,
                out decoyMean, out decoyStdev, ref colinearWarning);

            // Each loop generates a number of large objects. GC helps to keep private bytes under control
            GC.Collect();
        }

        if (reportProgress)
        {
            progressMonitor.UpdateProgress(status.ChangePercentComplete(100));
        }

        // Rescale the final weights by the decoy statistics from the last iteration.
        im.Parameters = new LinearModelParams(calcWeights).RescaleParameters(decoyMean, decoyStdev);
        im.ColinearWarning = colinearWarning;
        im.UsesSecondBest = includeSecondBest;
        im.UsesDecoys = decoys.Count > 0;
    });
}
/// <summary>
/// Calculate new weight factors for one iteration of the refinement process.  This is the heart
/// of the MProphet algorithm.
/// </summary>
/// <param name="iteration">Iteration number (special processing happens for iteration 0).</param>
/// <param name="targetTransitionGroups">Target transition groups.</param>
/// <param name="decoyTransitionGroups">Decoy transition groups.</param>
/// <param name="includeSecondBest">Include the second best peaks in the targets as additional decoys?</param>
/// <param name="weights">Array of weights per calculator; NaN entries are excluded from training and the rest are overwritten with the new weights.</param>
/// <param name="decoyMean">Output mean of decoy transition groups.</param>
/// <param name="decoyStdev">Output standard deviation of decoy transition groups.</param>
/// <param name="colinearWarning">Set to true if colinearity was detected; never reset to false here.</param>
private void CalculateWeights(
    int iteration,
    ScoredGroupPeaksSet targetTransitionGroups,
    ScoredGroupPeaksSet decoyTransitionGroups,
    bool includeSecondBest,
    double[] weights,
    out double decoyMean,
    out double decoyStdev,
    ref bool colinearWarning)
{
    if (includeSecondBest)
    {
        // NOTE(review): this adds to the caller's decoy set on every call, and Train passes the same
        // set for every iteration - confirm that repeated additions across iterations are intended.
        ScoredGroupPeaksSet secondBestTransitionGroups;
        targetTransitionGroups.SelectTargetsAndDecoys(out targetTransitionGroups, out secondBestTransitionGroups);
        foreach (var secondBestGroup in secondBestTransitionGroups.ScoredGroupPeaksList)
        {
            decoyTransitionGroups.Add(secondBestGroup);
        }
    }

    // Select true target peaks using a q-value cutoff filter (looser on the first iteration).
    var qValueCutoff = (iteration == 0 ? 0.15 : 0.02);
    var truePeaks = targetTransitionGroups.SelectTruePeaks(qValueCutoff, Lambda, decoyTransitionGroups);
    var decoyPeaks = decoyTransitionGroups.SelectMaxPeaks();

    // Omit first feature during first iteration, since it is used as the initial score value.
    weights[0] = (iteration == 0) ? double.NaN : 0;
    var featureCount = weights.Count(w => !double.IsNaN(w));

    // Copy target and decoy peaks to training data array.
    // Rows are sized from the decoy peaks actually copied below, so the matrix never contains
    // unwritten all-zero rows (which would be read as class 0) and is never overrun.
    int totalTrainingPeaks = truePeaks.Count + decoyPeaks.Count;
    // Calculate the maximum number of training peaks (8 bytes per score - double, featureCount + 1 scores per peak)
    int maxTrainingPeaks = MAX_TRAINING_MEMORY/8/(featureCount + 1);

    // One row per peak: featureCount feature columns followed by the class label (1 = true, 0 = decoy).
    var trainData = new double[Math.Min(totalTrainingPeaks, maxTrainingPeaks), featureCount + 1];
    if (totalTrainingPeaks < maxTrainingPeaks)
    {
        // Everything fits: true peaks first, decoy peaks after them.
        for (int i = 0; i < truePeaks.Count; i++)
        {
            CopyToTrainData(truePeaks[i].Features, trainData, weights, i, 1);
        }
        for (int i = 0; i < decoyPeaks.Count; i++)
        {
            CopyToTrainData(decoyPeaks[i].Features, trainData, weights, i + truePeaks.Count, 0);
        }
    }
    else
    {
        // Too many peaks: sample in random order, keeping the true/decoy proportion.
        double proportionTrue = truePeaks.Count*1.0/totalTrainingPeaks;
        int truePeakCount = (int) Math.Round(maxTrainingPeaks*proportionTrue);
        int i = 0;
        foreach (var peak in truePeaks.RandomOrder())
        {
            if (i < truePeakCount)
            {
                CopyToTrainData(peak.Features, trainData, weights, i, 1);
            }
            else
            {
                break;
            }
            i++;
        }

        int decoyPeakCount = maxTrainingPeaks - truePeakCount;
        i = 0;
        foreach (var peak in decoyPeaks.RandomOrder())
        {
            if (i < decoyPeakCount)
            {
                CopyToTrainData(peak.Features, trainData, weights, i + truePeakCount, 0);
            }
            else
            {
                break;
            }
            i++;
        }
    }

    // Use Linear Discriminant Analysis to find weights that separate true and decoy peak scores.
    int info;
    double[] weightsFromLda;
    alglib.fisherlda(
        trainData,
        trainData.GetLength(0),
        trainData.GetLength(1) - 1,
        2,
        out info,
        out weightsFromLda);

    // Check for colinearity.
    if (info == 2)
    {
        colinearWarning = true;
    }

    // Unpack weights array: LDA weights are dense, so only the non-NaN slots consume one each.
    for (int i = 0, j = 0; i < weights.Length; i++)
    {
        if (!double.IsNaN(weights[i]))
        {
            weights[i] = weightsFromLda[j++];
        }
    }

    // Recalculate all peak scores.
    targetTransitionGroups.ScorePeaks(weights);
    decoyTransitionGroups.ScorePeaks(weights);

    // If the mean target score is less than the mean decoy score, then the
    // weights came out negative, and all the weights and scores must be negated to
    // restore the proper ordering.
    if (targetTransitionGroups.Mean < decoyTransitionGroups.Mean)
    {
        for (int i = 0; i < weights.Length; i++)
        {
            weights[i] *= -1;
        }
        targetTransitionGroups.ScorePeaks(weights);
        decoyTransitionGroups.ScorePeaks(weights);
    }

    decoyMean = decoyTransitionGroups.Mean;
    decoyStdev = decoyTransitionGroups.Stdev;
}
/// <summary>
/// Loads peak feature rows from the file at <paramref name="filePath"/> and sorts them into
/// target and decoy transition groups.
/// </summary>
/// <param name="filePath">Path passed to the <c>Data</c> constructor.</param>
/// <param name="targetTransitionGroups">Receives the groups whose decoy flag is neither "1" nor "true".</param>
/// <param name="decoyTransitionGroups">Receives the groups whose decoy flag is "1" or "true".</param>
/// <remarks>
/// Asserts that the header has a "main_var*" column, a "decoy" column, a "transition_group_id"
/// column and at least one "var_*" column.
/// </remarks>
private void LoadData(
    string filePath,
    out ScoredGroupPeaksSet targetTransitionGroups,
    out ScoredGroupPeaksSet decoyTransitionGroups)
{
    var data = new Data(filePath);

    // Find columns of interest in the data file header.
    var mainVarColumn = -1;
    var decoyColumn = -1;
    var transitionGroupIdColumn = -1;
    var varColumns = new List<int>();
    for (int i = 0; i < data.Header.Length; i++)
    {
        var heading = data.Header[i].Trim().ToLowerInvariant();

        // Headings are machine identifiers, so prefix matching is ordinal rather than culture-sensitive.
        if (heading.StartsWith("main_var", System.StringComparison.Ordinal))    // Not L10N
            mainVarColumn = i;
        else if (heading.StartsWith("var_", System.StringComparison.Ordinal))   // Not L10N
            varColumns.Add(i);
        else if (heading == "decoy")    // Not L10N
            decoyColumn = i;
        else if (heading == "transition_group_id")  // Not L10N
            transitionGroupIdColumn = i;
    }

    Assert.AreNotEqual(-1, mainVarColumn);
    Assert.AreNotEqual(-1, decoyColumn);
    Assert.AreNotEqual(-1, transitionGroupIdColumn);
    Assert.AreNotEqual(0, varColumns.Count);

    // Create transition groups to be filled from data file.
    targetTransitionGroups = new ScoredGroupPeaksSet();
    decoyTransitionGroups = new ScoredGroupPeaksSet();
    var featuresCount = varColumns.Count + 1;   // main_var plus every var_ column
    var transitionGroupDictionary = new Dictionary<string, ScoredGroupPeaks>();

    // Process each row containing features for a peak.
    for (int i = 0; i < data.Items.GetLength(0); i++)
    {
        // Lower-case the flag invariantly, matching how the header is normalized above.
        var decoy = data.Items[i, decoyColumn].Trim().ToLowerInvariant();
        bool isDecoy = decoy == "1" || decoy == "true";    // Not L10N

        // Append decoy to make unique groups of decoy/target peaks.
        var transitionGroupId = data.Items[i, transitionGroupIdColumn] + decoy;

        // The peak belongs to a transition group.  Reuse the group when it has been seen before.
        ScoredGroupPeaks transitionGroup;
        if (!transitionGroupDictionary.TryGetValue(transitionGroupId, out transitionGroup))
        {
            // Create a new transition group.
            transitionGroup = new ScoredGroupPeaks { Id = transitionGroupId };
            transitionGroupDictionary.Add(transitionGroupId, transitionGroup);

            // Add the new group to the collection of decoy or target groups.
            if (isDecoy)
                decoyTransitionGroups.Add(transitionGroup);
            else
                targetTransitionGroups.Add(transitionGroup);
        }

        // Parse feature values for this peak: slot 0 holds main_var, the rest follow var_ column order.
        var features = new float[featuresCount];
        features[0] = (float) double.Parse(data.Items[i, mainVarColumn], CultureInfo.InvariantCulture);
        for (int j = 0; j < varColumns.Count; j++)
            features[j + 1] = (float) double.Parse(data.Items[i, varColumns[j]], CultureInfo.InvariantCulture);

        // Add the peak to its transition group.
        transitionGroup.Add(new ScoredPeak(features));
    }
}