/// <summary>
/// Loads the search-result XML for the given raw file, extracts the PSM data from it,
/// and folds the parsed metrics into the supplied container.
/// </summary>
/// <param name="searchMetrics">Container to populate with search-derived metrics.</param>
/// <param name="parameters">Workflow parameters (supply the search algorithm and QC settings).</param>
/// <param name="rawFileName">Name of the raw file whose search results are parsed.</param>
/// <returns>The same <paramref name="searchMetrics"/> instance, updated in place.</returns>
public static SearchMetricsContainer ParseSearchResults(SearchMetricsContainer searchMetrics, WorkflowParameters parameters, string rawFileName)
{
    // Load the result document, pull the PSMs out of it, then let the
    // container parse them into metrics.
    XElement searchResults = LoadSearchResults(parameters, rawFileName);
    PsmDataCollection psmCollection = ExtractPsmData(searchResults, parameters.QcParams.SearchAlgorithm);

    searchMetrics.ParsePSMs(psmCollection, parameters);

    return searchMetrics;
}
/// <summary>
/// Calculates chemical labeling efficiency at the peptide N-terminus and at the amino
/// acids targeted by the configured N-terminal (NMod), lysine (KMod) and "X" (XMod)
/// modifications, writes the tallies to the console, and stores the efficiencies and
/// site counts on <paramref name="searchMetrics"/>.
/// </summary>
/// <param name="searchMetrics">Container that receives the efficiency values and site counts.</param>
/// <param name="psms">Peptide-spectrum matches whose labeling sites are tallied.</param>
/// <param name="parameters">Workflow parameters supplying the mod strings in "mass@AA" form ("[" = N-terminus).</param>
public static void GetModificationFrequency(this SearchMetricsContainer searchMetrics, IEnumerable<PsmData> psms, WorkflowParameters parameters)
{
    string nmod = parameters.QcParams.NMod;
    string kmod = parameters.QcParams.KMod;
    string xmod = parameters.QcParams.XMod;

    // Per-site tallies, keyed by residue letter ("[" denotes the N-terminus).
    // NOTE(review): the original also built a mass->residue dictionary here that was
    // never read; it has been removed.
    Dictionary<string, int> TotalLabelingSites = new Dictionary<string, int>();
    Dictionary<string, int> LabelingSitesHit = new Dictionary<string, int>();
    Dictionary<string, double> LabelingEfficiency = new Dictionary<string, double>();
    List<string> AminosOfInterest = new List<string>();

    string[] Mods = new string[] { nmod, kmod, xmod };

    // "Prime" the dictionaries: each configured mod is "mass@AA", so register the
    // AA part (the text after '@') with zeroed counters.
    foreach (var item in Mods)
    {
        if (item == null)
        {
            continue;
        }
        var splitString = item.Split('@');
        TotalLabelingSites.Add(splitString.Last(), 0);
        LabelingSitesHit.Add(splitString.Last(), 0);
        AminosOfInterest.Add(splitString.Last());
    }

    foreach (PsmData psm in psms)
    {
        List<Modification> mods = psm.Mods;

        // Check the sequence in two steps: first the N-terminus, then the rest of
        // the sequence (starting after position 0 when NMod is configured).
        // FIRST STEP: N-TERMINUS
        if (nmod != null)
        {
            if (psm.Seq[0] == 'K')
            {
                // An N-terminal lysine carries two potential labeling sites
                // (the terminus itself and the lysine side chain).
                IEnumerable<Modification> nMods = from x in mods where x.Loc == 0 select x;
                int numMods = nMods.Count();
                if (numMods == 1)
                {
                    // Exactly one label on a residue with two reactive sites: we can't
                    // know which site is modified, so skip this ambiguous peptide.
                    continue;
                }
                if (numMods == 0)
                {
                    // Nothing labeled: count the potential sites, no hits.
                    TotalLabelingSites["["] += 1;
                    if (AminosOfInterest.Contains("K"))
                    {
                        TotalLabelingSites["K"] += 1;
                    }
                }
                if (numMods == 2)
                {
                    // Both sites labeled: count the sites and the hits.
                    TotalLabelingSites["["] += 1;
                    LabelingSitesHit["["] += 1;
                    if (AminosOfInterest.Contains("K"))
                    {
                        TotalLabelingSites["K"] += 1;
                        LabelingSitesHit["K"] += 1;
                    }
                }
            }
            else
            {
                // First residue is not lysine, so site assignment is unambiguous.
                IEnumerable<Modification> nMods = from x in mods where x.Loc == 0 select x;

                // The N-terminus itself is always a potential site.
                TotalLabelingSites["["] += 1;

                // The first residue's side chain may also be a site of interest.
                string residue = psm.Seq[0].ToString();
                if (AminosOfInterest.Contains(residue))
                {
                    TotalLabelingSites[residue] += 1;
                }

                // Attribute each position-0 modification to the N-terminus (when its
                // mass appears in the NMod string) or to the residue's side chain.
                foreach (Modification mod in nMods)
                {
                    if (nmod.Contains(mod.Mass.ToString()))
                    {
                        LabelingSitesHit["["] += 1;
                    }
                    else
                    {
                        if (AminosOfInterest.Contains(mod.AA))
                        {
                            LabelingSitesHit[mod.AA] += 1;
                        }
                    }
                }
            }
        }

        // Skip position 0 in the residue scan when it was already handled above.
        int start = (nmod != null) ? 1 : 0;

        // SECOND STEP: the remainder of the sequence.
        for (int i = start; i < psm.Seq.Length; i++)
        {
            string aa = psm.Seq[i].ToString();
            if (AminosOfInterest.Contains(aa))
            {
                // One more potential labeling site.
                TotalLabelingSites[aa] += 1;

                // Non-terminal residues have a single reactive site, so exactly one
                // modification at this location counts as a hit.
                bool hit = (from x in mods where x.Loc == i select 1).Count() == 1;
                if (hit)
                {
                    LabelingSitesHit[aa] += 1;
                }
            }
        }
    }

    // Spit out some metrics to the console: totals first, then misses.
    foreach (string aa in AminosOfInterest)
    {
        if (aa == "[")
        {
            Console.WriteLine("Total N-term sites: {0}", TotalLabelingSites["["]);
        }
        else
        {
            Console.WriteLine("Total {0} sites: {1}", aa, TotalLabelingSites[aa]);
        }
    }
    foreach (string aa in AminosOfInterest)
    {
        if (aa == "[")
        {
            Console.WriteLine("Missed modifications at N-term: {0}", TotalLabelingSites["["] - LabelingSitesHit["["]);
        }
        else
        {
            Console.WriteLine("Missed modifications at {0}: {1}", aa, TotalLabelingSites[aa] - LabelingSitesHit[aa]);
        }
    }

    // Calculate labeling efficiency for each site and store it on the container.
    foreach (var aa in AminosOfInterest)
    {
        // NOTE(review): if a site never occurs in any PSM this is 0/0 and yields NaN.
        double efficiency = (double)LabelingSitesHit[aa] / TotalLabelingSites[aa];
        LabelingEfficiency.Add(aa, efficiency);
        if (aa == "[")
        {
            Console.WriteLine("Modification frequency at N-term: {0}", efficiency);
        }
        else
        {
            Console.WriteLine("Modification frequency at {0}: {1}", aa, efficiency);
        }

        // Route N-term and K to their dedicated attributes; anything else is "X".
        if (aa == "[")
        {
            searchMetrics.LabelingEfficiencyAtNTerm = efficiency;
            searchMetrics.SearchData.NLabelSites = TotalLabelingSites[aa];
            searchMetrics.SearchData.NLabelSitesHit = LabelingSitesHit[aa];
        }
        else
        {
            if (aa == "K")
            {
                searchMetrics.LabelingEfficiencyAtK = efficiency;
                searchMetrics.SearchData.KLabelSites = TotalLabelingSites[aa];
                searchMetrics.SearchData.KLabelSitesHit = LabelingSitesHit[aa];
            }
            else
            {
                searchMetrics.LabelingEfficiencyAtX = efficiency;
                searchMetrics.LabelX = aa;
                searchMetrics.SearchData.XLabelSites = TotalLabelingSites[aa];
                searchMetrics.SearchData.XLabelSitesHit = LabelingSitesHit[aa];
            }
        }
    }
}
/// <summary>
/// Filters PSMs against a decoy-derived hyperscore cutoff and computes identification,
/// charge-state, digestion and (optionally) labeling metrics, storing the results on
/// <paramref name="searchMetrics"/>.
/// </summary>
/// <param name="searchMetrics">Container that receives the computed metrics.</param>
/// <param name="psmCollection">All PSMs (decoys included) from the search engine.</param>
/// <param name="parameters">Workflow parameters (number of spectra searched, mod settings).</param>
public static void ParsePSMs(this SearchMetricsContainer searchMetrics, PsmDataCollection psmCollection, WorkflowParameters parameters)
{
    int numGoodPSMs, pepsWithNoMissedCleavages;
    IEnumerable<int> charges;
    double IdRate, chargeRatio3to2, chargeRatio4to2;
    double digestionEfficiency, topDecoyScore;
    double missedCleavageRate;
    Dictionary<int, int> numCharges = new Dictionary<int, int>();
    List<PsmData> psms;
    List<PsmData> goodPsms;
    IEnumerable<PsmData> nonDecoys;
    int numSearched = parameters.QcParams.NumberSpectra;

    // Convert the dictionary to a list for easy parsing.
    psms = psmCollection.Values.ToList();

    // The score cutoff is the 95th percentile of the decoy hyperscores.
    topDecoyScore = (from x in psms where x.Decoy select x.Hyperscore)
        .ToArray().Percentile(95);

    // Get the non-decoys (only used for console reporting below).
    nonDecoys = from x in psms where !x.Decoy select x;

    // Confident PSMs: non-decoy hits scoring above the decoy cutoff.
    // Materialized once up front — the original deferred query was re-filtered on
    // every one of the ~8 enumerations below.
    goodPsms = (from x in psms where !x.Decoy && x.Hyperscore > topDecoyScore select x).ToList();

    Console.WriteLine("Total hits: {0}", psms.Count());
    Console.WriteLine("Top decoy score: {0}", topDecoyScore);
    Console.WriteLine("Non-decoy hits: {0}", nonDecoys.Count());
    Console.WriteLine("Non-decoy hits above top decoy score: {0}", goodPsms.Count());

    // Tally the number of PSMs at each precursor charge of interest (2+, 3+, 4+).
    charges = from x in goodPsms select x.Charge;
    foreach (int charge in new List<int>() { 2, 3, 4 })
    {
        numCharges.Add(charge, (from x in charges where x == charge select 1).Count());
    }

    // Charge ratios. NOTE(review): NaN/Infinity when no 2+ PSMs were observed.
    chargeRatio3to2 = Convert.ToDouble(numCharges[3]) / Convert.ToDouble(numCharges[2]);
    chargeRatio4to2 = Convert.ToDouble(numCharges[4]) / Convert.ToDouble(numCharges[2]);

    // Parse out the missed cleavage data.
    pepsWithNoMissedCleavages = (from x in goodPsms where x.MissedCleavages == 0 select 1).Sum();

    // Number of confident PSMs is the length of this collection.
    numGoodPSMs = goodPsms.Count();

    // Fraction of confident PSMs with zero missed cleavages.
    digestionEfficiency = (double)pepsWithNoMissedCleavages / numGoodPSMs;
    Console.WriteLine("Digestion efficiency: {0}", digestionEfficiency);

    // Missed cleavage rate, i.e. number of missed cleavages per PSM.
    missedCleavageRate = (double)(from x in goodPsms select x.MissedCleavages).Sum() / numGoodPSMs;
    Console.WriteLine("Missed cleavage rate (/PSM): {0}", missedCleavageRate);

    // Identification rate relative to the number of spectra searched.
    IdRate = (double)numGoodPSMs / numSearched;
    Console.WriteLine("IDrate: {0}", IdRate);

    // Labeling-efficiency metrics, only when at least one mod is configured.
    if ((parameters.QcParams.NMod != null) || (parameters.QcParams.KMod != null) || (parameters.QcParams.XMod != null))
    {
        searchMetrics.GetModificationFrequency(goodPsms, parameters);
    }

    // Median mass drift across the confident PSMs.
    searchMetrics.MedianMassDrift = (from x in goodPsms select x.MassDrift)
        .ToArray().Percentile(50);

    searchMetrics.SearchData.PSMsWithNoMissedCleavages = pepsWithNoMissedCleavages;
    searchMetrics.SearchData.TotalNumGoodPSMs = numGoodPSMs;
    searchMetrics.SearchData.NumCharge2 = numCharges[2];
    searchMetrics.SearchData.NumCharge3 = numCharges[3];
    searchMetrics.SearchData.NumCharge4 = numCharges[4];
    searchMetrics.IdentificationRate = IdRate;
    searchMetrics.MissedCleavageRate = missedCleavageRate;
    searchMetrics.DigestionEfficiency = digestionEfficiency;
    searchMetrics.ChargeRatio3to2 = chargeRatio3to2;
    searchMetrics.ChargeRatio4to2 = chargeRatio4to2;
}
/// <summary>
/// End-to-end DDA workflow for one raw file: extracts scan indices and MS data,
/// analyzes precursor peaks, and — depending on <paramref name="parameters"/> —
/// writes metrics, parse/quant matrices, MGF, chromatograms, and QC data.
/// </summary>
/// <param name="rawFileThreadManager">Thread manager giving access to the raw file.</param>
/// <param name="parameters">Workflow parameters selecting which outputs to produce.</param>
/// <param name="qcDataCollection">QC collection; reloaded/created internally when QC is enabled.</param>
public static void UniversalDDA(IRawFileThreadManager rawFileThreadManager, WorkflowParameters parameters, QcDataCollection qcDataCollection)
{
    MethodDataContainer methodData;
    CentroidStreamCollection centroidStreams;
    SegmentScanCollection segmentScans;
    TrailerExtraCollection trailerExtras;
    PrecursorMassCollection precursorMasses;
    RetentionTimeCollection retentionTimes;
    ScanEventReactionCollection reactions;
    // Only populated when parsing/quant/metrics/refinement/QC is requested.
    ScanMetaDataCollectionDDA metaData = null;
    PrecursorPeakCollection peakData = null;
    int nScans;

    // NOTE(review): this accessor is never disposed in this method — TODO confirm
    // whether the caller owns its lifetime.
    var staticRawFile = rawFileThreadManager.CreateThreadAccessor();
    staticRawFile.SelectInstrument(Device.MS, 1);

    // Bail out early (with console + log output) if the raw file reports an error.
    var err = staticRawFile.FileError;
    if (err.HasError)
    {
        Console.WriteLine("ERROR: {0} reports error code: {1}. The associated message is: {2}",
            Path.GetFileName(staticRawFile.FileName), err.ErrorCode, err.ErrorMessage);
        Console.WriteLine("Skipping this file");
        Log.Error("{FILE} reports error code: {ERRORCODE}. The associated message is: {ERRORMESSAGE}",
            Path.GetFileName(staticRawFile.FileName), err.ErrorCode, err.ErrorMessage);
        return;
    }

    //staticRawFile.CheckIfBoxcar();

    // Scan indices, precursor scans and scan-dependents are extracted in parallel.
    (ScanIndex Index, PrecursorScanCollection precursorScans, ScanDependentsCollections scanDependents) =
        Extract.ScanIndicesPrecursorsDependents(rawFileThreadManager, MaxProcesses: parameters.MaxProcesses);

    nScans = Index.ScanEnumerators[MSOrderType.Ms2].Length;

    // Pull everything else out of the raw file with a short-lived accessor.
    using (var rawFile = rawFileThreadManager.CreateThreadAccessor())
    {
        reactions = Extract.ScanEvents(rawFile, Index);
        methodData = Extract.MethodData(rawFile, Index);
        (centroidStreams, segmentScans) = Extract.MsData(rawFile: rawFile, index: Index);
        trailerExtras = Extract.TrailerExtras(rawFile, Index);
        precursorMasses = Extract.PrecursorMasses(rawFile, precursorScans, trailerExtras, Index);
        retentionTimes = Extract.RetentionTimes(rawFile, Index);
    }

    // Peak analysis + meta-data aggregation are needed by parse, quant, metrics,
    // mass/charge refinement and QC alike.
    if (parameters.ParseParams.Parse | parameters.ParseParams.Quant | parameters.ParseParams.Metrics |
        parameters.RefineMassCharge | parameters.QcParams.QcDirectory != null)
    {
        peakData = AnalyzePeaks.AnalyzeAllPeaks(centroidStreams, retentionTimes, precursorMasses, precursorScans,
            Index, parameters.MaxProcesses);

        // Optionally refine monoisotopic mass and charge assignments in place.
        if (parameters.RefineMassCharge)
        {
            MonoIsoPredictor.RefineMonoIsoMassChargeValues(parameters, centroidStreams, precursorMasses,
                trailerExtras, peakData, precursorScans);
        }

        metaData = MetaDataProcessingDDA.AggregateMetaDataDDA(centroidStreams, segmentScans, methodData,
            precursorScans, trailerExtras, precursorMasses, retentionTimes, scanDependents, reactions,
            Index, parameters.MaxProcesses);
    }

    // Reporter-ion quantification (optional).
    QuantDataCollection quantData = null;
    if (parameters.ParseParams.Quant)
    {
        quantData = Quantification.Quantify(centroidStreams, segmentScans, parameters, methodData, Index);
    }

    // Raw metrics are needed both for the metrics output and for QC.
    RawMetricsDataDDA rawMetrics = null;
    if (parameters.ParseParams.Metrics | parameters.QcParams.QcDirectory != null)
    {
        rawMetrics = MetaDataProcessingDDA.GetMetricsDataDDA(metaData, methodData, staticRawFile.FileName,
            retentionTimes, Index, peakData, precursorScans, quantData);
    }

    if (parameters.ParseParams.Metrics)
    {
        MetricsWriter.WriteMatrix(rawMetrics, null, staticRawFile.FileName, parameters.ParseParams.OutputDirectory);
    }

    // Parse/quant matrix output.
    if (parameters.ParseParams.Parse | parameters.ParseParams.Quant)
    {
        string matrixFileName = ReadWrite.GetPathToFile(parameters.ParseParams.OutputDirectory,
            staticRawFile.FileName, "_Matrix.txt");

        /*
         * ParseWriter writerDDA = new ParseWriter(matrixFileName, centroidStreams, segmentScans, metaData, retentionTimes,
         * precursorMasses, precursorScans, peakData, trailerExtras, Index, quantData);
         * writerDDA.WriteMatrixDDA(methodData.AnalysisOrder);
         */

        MatrixWriter.ParseQuantDDA(matrixFileName, centroidStreams, segmentScans, metaData, retentionTimes,
            precursorMasses, precursorScans, peakData, trailerExtras, Index, quantData);
    }

    // MGF output.
    if (parameters.ParseParams.WriteMgf)
    {
        //ParseWriter writerMGF = new ParseWriter(centroidStreams, segmentScans, parameters, retentionTimes, precursorMasses, precursorScans, trailerExtras, methodData, Index);
        //writerMGF.WriteMGF(staticRawFile.FileName);
        MgfWriter.WriteMGF(staticRawFile.FileName, centroidStreams, segmentScans, parameters, retentionTimes,
            precursorMasses, precursorScans, trailerExtras, methodData, Index);
    }

    // Chromatogram output.
    if (parameters.ParseParams.Chromatogram != null)
    {
        ChromatogramWriter.WriteChromatogram(centroidStreams, segmentScans, retentionTimes, methodData, Index,
            parameters, staticRawFile.FileName);
    }

    // QC: load/create the collection, optionally run a search, and append the results.
    if (parameters.QcParams.QcDirectory != null)
    {
        // The passed-in collection parameter is overwritten here; the loaded/created
        // collection is what gets updated below.
        qcDataCollection = QC.QcWorkflow.LoadOrCreateQcCollection(parameters);

        SearchMetricsContainer searchMetrics = new SearchMetricsContainer(staticRawFile.FileName,
            staticRawFile.CreationDate, methodData);

        // Check if the raw file already exists in the QC data with a different name
        // (matched by creation date/time); if so, skip it.
        if (QcWorkflow.CheckIfFilePresentInQcCollection(staticRawFile.FileName, qcDataCollection))
        {
            Log.Information("A file with the same creation date and time as {File} already exists in the QC data",
                staticRawFile.FileName);
            Console.WriteLine("A file with the same creation date and time as {File} already exists in the QC data. Skipping to next file.",
                staticRawFile.FileName);
        }
        else
        {
            // Optional database search feeding the search-based QC metrics.
            if (parameters.QcParams.PerformSearch)
            {
                Search.WriteSearchMGF(parameters, centroidStreams, segmentScans, retentionTimes, precursorMasses,
                    precursorScans, trailerExtras, methodData, Index, staticRawFile.FileName,
                    parameters.QcParams.FixedScans);
                Search.RunSearch(parameters, methodData, staticRawFile.FileName);
                searchMetrics = SearchQC.ParseSearchResults(searchMetrics, parameters, staticRawFile.FileName, nScans);
            }

            QcDataContainer qcData = new QcDataContainer();
            qcData.DDA = rawMetrics;
            qcData.SearchMetrics = searchMetrics;

            QC.QcWorkflow.UpdateQcCollection(qcDataCollection, qcData, methodData, staticRawFile.FileName);
        }
    }
}
/// <summary>
/// Computes the observed frequency of each configured variable modification
/// ("mass@AA" entries in <c>QcParams.VariableMods</c>, "[" = N-terminus) across the
/// supplied PSMs, printing each frequency and storing it in
/// <c>searchMetrics.ModificationFrequency</c> keyed by the "mass@AA" string.
/// </summary>
/// <param name="searchMetrics">Container whose ModificationFrequency map is populated.</param>
/// <param name="psms">Peptide-spectrum matches to tally modification sites from.</param>
/// <param name="parameters">Workflow parameters supplying the comma-separated VariableMods string.</param>
public static void GetModificationFrequency(this SearchMetricsContainer searchMetrics, IEnumerable <PsmData> psms, WorkflowParameters parameters)
{
    // Modifications: target ("K", "[", ...) -> mass string.
    // TotalModificationsSites: target residue char -> count of potential sites.
    // ModificationSitesHit: "mass@AA" -> count of observed modifications.
    // AmbiguousSites: target -> sites excluded because the assignment was ambiguous.
    // NOTE(review): ModificationEfficiency is declared but never used in this method.
    // NOTE(review): the keying assumes each mod target after '@' is a single
    // character; a multi-character target would desynchronize the char-keyed and
    // string-keyed dictionaries — TODO confirm the VariableMods format.
    Dictionary <string, string> Modifications = new Dictionary <string, string>();
    Dictionary <char, int> TotalModificationsSites = new Dictionary <char, int>();
    Dictionary <string, int> ModificationSitesHit = new Dictionary <string, int>();
    Dictionary <string, double> ModificationEfficiency = new Dictionary <string, double>();
    Dictionary <string, int> AmbiguousSites = new Dictionary <string, int>();
    List <Modification> mods;
    List <char> AminosOfInterest = new List <char>();

    string[] Mods = parameters.QcParams.VariableMods.Split(',');

    // "Prime" the dictionaries from each "mass@AA" entry.
    foreach (var item in Mods)
    {
        if (item == null)
        {
            continue;
        }
        var splitString = item.Split('@');
        Modifications.Add(splitString.Last(), splitString.First());
        TotalModificationsSites.Add(splitString.Last().Last(), 0);
        AmbiguousSites.Add(splitString.Last(), 0);
        ModificationSitesHit.Add(item, 0);
        AminosOfInterest.Add(splitString.Last().Last());
    }

    bool nTermIsVariable = Modifications.ContainsKey("[");

    foreach (PsmData psm in psms)
    {
        // Sort by location so the position-0 mods (if any) come first; the
        // startLoc skipping below relies on this ordering.
        mods = psm.Mods.OrderBy(x => x.Loc).ToList();
        int startLoc = 0;

        // Ambiguity check: when the first residue is itself a variable-mod target
        // and carries the same mass as the N-terminal mod, a single position-0 hit
        // cannot be assigned to either site.
        if (AminosOfInterest.Contains(psm.Seq[0]) && nTermIsVariable && Modifications["["] == Modifications[psm.Seq[0].ToString()])
        {
            int hits = (from x in mods where x.Loc == 0 select 1).Count();
            if (hits == 1)
            {
                // Ambiguous: skip this one mod and exclude both sites from totals.
                startLoc = 1;
                AmbiguousSites["["]++;
                AmbiguousSites[psm.Seq[0].ToString()]++;
            }
            // Neither site modified: nothing to skip.
            else if (hits == 0)
            {
                startLoc = 0;
            }
            else
            {
                // Both sites modified: credit both and skip the first two mods
                // (they are the position-0 pair, given the sort above).
                startLoc = 2;
                ModificationSitesHit[Modifications[psm.Seq[0].ToString()] + "@["]++;
                ModificationSitesHit[Modifications[psm.Seq[0].ToString()] + "@" + psm.Seq[0].ToString()]++;
            }
        }

        // Credit the remaining mods: a position-0 mod matching a registered
        // N-terminal key counts against "[", otherwise use the mod's own
        // mass@AA key when registered.
        for (int i = startLoc; i < mods.Count; i++)
        {
            var mod = mods[i];
            if (mod.Loc == 0 & ModificationSitesHit.ContainsKey(mod.Mass.ToString() + "@["))
            {
                ModificationSitesHit[mod.Mass.ToString() + "@["]++;
            }
            else if (ModificationSitesHit.ContainsKey(mod.MassAtAa))
            {
                ModificationSitesHit[mod.MassAtAa]++;
            }
        }

        // Now add to total modification sites: every peptide has one N-terminus,
        // plus one site per residue of interest in the sequence.
        if (nTermIsVariable)
        {
            TotalModificationsSites['[']++;
        }
        foreach (var aa in psm.Seq)
        {
            if (AminosOfInterest.Contains(aa))
            {
                if (TotalModificationsSites.ContainsKey(aa))
                {
                    TotalModificationsSites[aa]++;
                }
            }
        }
    }

    // Remove ambiguous modification sites from total modification sites.
    foreach (var site in TotalModificationsSites.Keys.ToList())
    {
        TotalModificationsSites[site] = TotalModificationsSites[site] - AmbiguousSites[site.ToString()];
    }

    // Report each modification's frequency and store it on the container.
    foreach (var hits in ModificationSitesHit)
    {
        // hits.Key is "mass@AA", so Key.Last() is the target residue char.
        // NOTE(review): 0/0 yields NaN when a target never occurs in any PSM.
        double efficiency = (double)hits.Value / TotalModificationsSites[hits.Key.Last()];
        Console.WriteLine("{0} modification frequency: {1}", hits.Key, efficiency);
        searchMetrics.ModificationFrequency[hits.Key] = efficiency;
    }
}
/// <summary>
/// Runs the DDA QC workflow for one raw file: extracts scan and meta data, analyzes
/// precursor peaks, computes raw metrics, optionally runs a database search, and
/// appends the results to the QC data collection.
/// </summary>
/// <param name="rawFileThreadManager">Thread manager giving access to the raw file.</param>
/// <param name="parameters">Workflow parameters controlling refinement, search and QC settings.</param>
public static void QcDDA(IRawFileThreadManager rawFileThreadManager, WorkflowParameters parameters)
{
    MethodDataContainer methodData;
    CentroidStreamCollection centroidStreams;
    SegmentScanCollection segmentScans;
    TrailerExtraCollection trailerExtras;
    PrecursorMassCollection precursorMasses;
    RetentionTimeCollection retentionTimes;
    ScanEventReactionCollection reactions;

    // Wrapped in using so the accessor is disposed even when a step throws;
    // the original only called Dispose() on the normal completion path.
    using (var staticRawFile = rawFileThreadManager.CreateThreadAccessor())
    {
        staticRawFile.SelectInstrument(Device.MS, 1);
        staticRawFile.CheckIfBoxcar();

        // Scan indices, precursor scans and scan-dependents come out together.
        (ScanIndex Index, PrecursorScanCollection precursorScans, ScanDependentsCollections scanDependents) =
            Extract.ScanIndicesPrecursorsDependents(rawFileThreadManager);

        // Pull the remaining data out of the raw file with a short-lived accessor.
        using (var rawFile = rawFileThreadManager.CreateThreadAccessor())
        {
            methodData = Extract.MethodData(rawFile, Index);
            reactions = Extract.ScanEvents(rawFile, Index);
            (centroidStreams, segmentScans) = Extract.MsData(rawFile: rawFile, index: Index);
            trailerExtras = Extract.TrailerExtras(rawFile, Index);
            precursorMasses = Extract.PrecursorMasses(rawFile, precursorScans, trailerExtras, Index);
            retentionTimes = Extract.RetentionTimes(rawFile, Index);
        }

        // Analyze precursor peaks, optionally refining monoisotopic mass/charge in place.
        PrecursorPeakCollection peakData = AnalyzePeaks.AnalyzeAllPeaks(centroidStreams, retentionTimes,
            precursorMasses, precursorScans, Index);
        if (parameters.RefineMassCharge)
        {
            MonoIsoPredictor.RefineMonoIsoMassChargeValues(centroidStreams, precursorMasses, trailerExtras,
                peakData, precursorScans);
        }

        ScanMetaDataCollectionDDA metaData = MetaDataProcessingDDA.AggregateMetaDataDDA(centroidStreams,
            segmentScans, methodData, precursorScans, trailerExtras, precursorMasses, retentionTimes,
            scanDependents, reactions, Index);

        RawMetricsDataDDA rawMetrics = MetaDataProcessingDDA.GetMetricsDataDDA(metaData, methodData,
            staticRawFile.FileName, retentionTimes, Index, peakData, precursorScans);

        QcDataCollection qcDataCollection = QC.QcWorkflow.LoadOrCreateQcCollection(parameters);

        SearchMetricsContainer searchMetrics = new SearchMetricsContainer(staticRawFile.FileName,
            staticRawFile.CreationDate, methodData);

        // Optional database search feeding the search-based QC metrics.
        if (parameters.QcParams.PerformSearch)
        {
            Search.WriteSearchMGF(parameters, centroidStreams, segmentScans, retentionTimes, precursorMasses,
                precursorScans, trailerExtras, methodData, Index, staticRawFile.FileName,
                parameters.QcParams.FixedScans);
            Search.RunSearch(parameters, methodData, staticRawFile.FileName);
            searchMetrics = SearchQC.ParseSearchResults(searchMetrics, parameters, staticRawFile.FileName);
        }

        QcDataContainer qcData = new QcDataContainer();
        qcData.DDA = rawMetrics;
        qcData.SearchMetrics = searchMetrics;

        QC.QcWorkflow.UpdateQcCollection(qcDataCollection, qcData, methodData, staticRawFile.FileName);
    }
}