public static void ParseSearchResults(this QcDataContainer qcData, RawDataCollection rawData, IRawDataPlus rawFile, QcParameters qcParameters)
{
    XElement results = LoadSearchResults(qcParameters, rawData);

    PsmDataCollection Psms = ExtractPsmData(results, qcParameters.searchParameters.SearchAlgorithm);

    qcData.ParsePSMs(Psms, qcParameters);
}
public static void UpdateQcCollection(QcDataCollection qcDataCollection, QcDataContainer newQcData, MethodDataContainer methodData, string rawFileName)
{
    qcDataCollection.QcData.Add(methodData.CreationDate, newQcData);
    qcDataCollection.ProcessedRawFiles.Add(Path.GetFileName(rawFileName));
    qcDataCollection.WriteQcToTable();
    Console.WriteLine("QC data written to csv file.");

    try
    {
        XmlSerialization.WriteToXmlFile<QcDataCollection>(qcDataCollection.QcFile, qcDataCollection);
        Log.Information("QC file saved successfully");
        Console.WriteLine("QC file saved successfully");
    }
    catch (Exception e)
    {
        Log.Error(e, "Failed during serialization of QC data");
        Console.WriteLine("ERROR: failure during serialization of QC data.");
        Environment.Exit(1);
    }
}
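// XmlSerialization.WriteToXmlFile<T> is a project helper whose implementation is not
// shown here; from its use above it takes a file path and the object to serialize.
// Below is a minimal sketch of such a helper built on System.Xml.Serialization,
// named WriteToXmlFileSketch to mark it as illustrative rather than the project's code.
private static void WriteToXmlFileSketch<T>(string filePath, T objectToWrite) where T : new()
{
    var serializer = new System.Xml.Serialization.XmlSerializer(typeof(T));
    using (var writer = new System.IO.StreamWriter(filePath))
    {
        // serialize the object graph to XML, overwriting any existing file
        serializer.Serialize(writer, objectToWrite);
    }
}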
public static void ChromIntMetrics(this QcDataContainer qcData, RawDataCollection rawData, MetricsData metrics)
{
    double firstRtToExceed10 = 0;
    double lastRtToExceed10 = 0;
    double proportionCovered;
    var scans = rawData.scanIndex.ScanEnumerators[MSOrderType.Ms];
    var totalIntList = (from x in scans select rawData.metaData[x].SummedIntensity).ToArray();

    // set the threshold at 10% of the maximum summed intensity across all MS1 scans
    double threshold = totalIntList.Max() / 10;

    // get the first RT at which the smoothed intensity exceeds the threshold
    for (int i = 0; i < scans.Length; i++)
    {
        int scan = scans[i];
        if (totalIntList.MovingAverage(i, 20) > threshold)
        {
            firstRtToExceed10 = rawData.retentionTimes[scan];
            break;
        }
    }

    // and the last RT at which it exceeds the threshold, scanning from the end
    for (int i = scans.Length - 1; i >= 0; i--)
    {
        int scan = scans[i];
        if (totalIntList.MovingAverage(i, 20) > threshold)
        {
            lastRtToExceed10 = rawData.retentionTimes[scan];
            break;
        }
    }

    // get the proportion of the run encompassed by these times
    proportionCovered = (lastRtToExceed10 - firstRtToExceed10) / rawData.retentionTimes[scans.Last()];

    qcData.TimeBeforeFirstScanToExceedPoint1MaxIntensity = firstRtToExceed10;
    qcData.TimeAfterLastScanToExceedPoint1MaxIntensity = rawData.retentionTimes[scans.Last()] - lastRtToExceed10;
    qcData.FractionOfRunAbovePoint1MaxIntensity = proportionCovered;
}
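// MovingAverage is a project extension defined elsewhere; its implementation is not
// shown in this file. The sketch below illustrates the behavior assumed above — the
// mean of a trailing window of up to `width` points ending at `index`, clamped at the
// start of the array. It is named MovingAverageSketch because it is a hypothetical
// reference, not the project's implementation.
private static double MovingAverageSketch(double[] values, int index, int width)
{
    int start = Math.Max(0, index - width + 1); // clamp the window at the start of the array
    double sum = 0;
    for (int i = start; i <= index; i++)
    {
        sum += values[i];
    }
    return sum / (index - start + 1);
}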
public static void GetModificationFrequency(this QcDataContainer qcData, IEnumerable<PsmData> psms, SearchParameters searchParameters)
{
    string nmod = searchParameters.NMod;
    string kmod = searchParameters.KMod;
    string xmod = searchParameters.XMod;

    Dictionary<string, string> Modifications = new Dictionary<string, string>();
    Dictionary<string, int> TotalLabelingSites = new Dictionary<string, int>();
    Dictionary<string, int> LabelingSitesHit = new Dictionary<string, int>();
    Dictionary<string, double> LabelingEfficiency = new Dictionary<string, double>();
    List<Modification> mods;
    List<string> AminosOfInterest = new List<string>();
    string[] Mods = new string[] { nmod, kmod, xmod };

    // "Prime" the dictionaries
    foreach (var item in Mods)
    {
        if (item == null)
        {
            continue;
        }

        var splitString = item.Split('@');

        // add the key:value pairs as mass@AA:AA
        Modifications.Add(item, splitString.Last());

        // and AA:int
        TotalLabelingSites.Add(splitString.Last(), 0);
        LabelingSitesHit.Add(splitString.Last(), 0);
        AminosOfInterest.Add(splitString.Last());
    }

    foreach (PsmData psm in psms)
    {
        mods = psm.Mods;

        // check the sequence in two steps: first the N-terminus, then the rest of the sequence

        // FIRST STEP: N-TERMINUS
        if (nmod != null)
        {
            // check if the first residue is lysine
            if (psm.Seq[0] == 'K')
            {
                // if so, see whether it was labeled only once. A single label is ambiguous
                // (it could sit on the N-terminus or the lysine side chain), so skip the PSM.
                IEnumerable<Modification> nMods = from x in mods where x.Loc == 0 select x;
                int numMods = nMods.Count();

                if (numMods == 1)
                {
                    // we can't know which reactive site is modified, so don't include this peptide
                    continue;
                }
                if (numMods == 0)
                {
                    // nothing is labeled
                    TotalLabelingSites["["] += 1;
                    if (AminosOfInterest.Contains("K"))
                    {
                        TotalLabelingSites["K"] += 1;
                    }
                }
                if (numMods == 2)
                {
                    // both sites are labeled
                    TotalLabelingSites["["] += 1;
                    LabelingSitesHit["["] += 1;
                    if (AminosOfInterest.Contains("K"))
                    {
                        TotalLabelingSites["K"] += 1;
                        LabelingSitesHit["K"] += 1;
                    }
                }
            }
            // if the first residue is not lysine
            else
            {
                IEnumerable<Modification> nMods = from x in mods where x.Loc == 0 select x;

                // add 1 to total N-termini, because the N-terminus is always present
                TotalLabelingSites["["] += 1;

                // get the amino acid residue letter
                string residue = psm.Seq[0].ToString();

                // see if it is of interest
                if (AminosOfInterest.Contains(residue))
                {
                    // if so, add 1 to its total sites
                    TotalLabelingSites[residue] += 1;
                }

                // now go through each detected modification
                foreach (Modification mod in nMods)
                {
                    if (nmod.Contains(mod.Mass.ToString()))
                    {
                        LabelingSitesHit["["] += 1;
                    }
                    else
                    {
                        if (AminosOfInterest.Contains(mod.AA))
                        {
                            LabelingSitesHit[mod.AA] += 1;
                        }
                    }
                }
            }
        }

        // if the N-terminal residue was handled above, start the per-residue pass at the second residue
        int start = (nmod != null) ? 1 : 0;

        // now continue with the rest of the sequence
        for (int i = start; i < psm.Seq.Length; i++)
        {
            // check if we care about this amino acid
            string aa = psm.Seq[i].ToString();

            if (AminosOfInterest.Contains(aa))
            {
                // add one to potential labeling sites
                TotalLabelingSites[aa] += 1;

                // there should only ever be one modification on each non-terminal residue,
                // so we can look it up by location to see if it exists
                bool hit = (from x in mods where x.Loc == i select 1).Count() == 1;

                if (hit)
                {
                    LabelingSitesHit[aa] += 1;
                }
            }
        }
    }

    // write some metrics to the console
    foreach (string aa in AminosOfInterest)
    {
        if (aa == "[")
        {
            Console.WriteLine("Total N-term sites: {0}", TotalLabelingSites["["]);
        }
        else
        {
            Console.WriteLine("Total {0} sites: {1}", aa, TotalLabelingSites[aa]);
        }
    }

    foreach (string aa in AminosOfInterest)
    {
        if (aa == "[")
        {
            Console.WriteLine("Missed modifications at N-term: {0}", TotalLabelingSites["["] - LabelingSitesHit["["]);
        }
        else
        {
            Console.WriteLine("Missed modifications at {0}: {1}", aa, TotalLabelingSites[aa] - LabelingSitesHit[aa]);
        }
    }

    // calculate labeling efficiency for each site
    foreach (var aa in AminosOfInterest)
    {
        double efficiency = (double)LabelingSitesHit[aa] / TotalLabelingSites[aa];
        LabelingEfficiency.Add(aa, efficiency);

        if (aa == "[")
        {
            Console.WriteLine("Modification frequency at N-term: {0}", efficiency);
        }
        else
        {
            Console.WriteLine("Modification frequency at {0}: {1}", aa, efficiency);
        }

        // N-term and K efficiencies go to their own attributes; any other residue is the "X" mod
        if (aa == "[")
        {
            qcData.LabelingEfficiencyAtNTerm = efficiency;
        }
        else if (aa == "K")
        {
            qcData.LabelingEfficiencyAtK = efficiency;
        }
        else
        {
            qcData.LabelingEfficiencyAtX = efficiency;
            qcData.LabelX = aa;
        }
    }
}
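// Usage sketch: the mod strings above are expected in "mass@site" form, where "["
// denotes the peptide N-terminus (it is used as a dictionary key throughout the
// method). The mass value below is illustrative (a TMT-style label), not taken
// from the project's defaults.
private static void ModSpecExample()
{
    string kmod = "229.16293@K";          // hypothetical lysine label spec
    string site = kmod.Split('@').Last(); // "K" — the dictionary key used above
    Console.WriteLine("Label targets site: {0}", site);
}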
public static void ParsePSMs(this QcDataContainer qcData, PsmDataCollection psmCollection, QcParameters qcParameters)
{
    int numGoodPSMs, pepsWithNoMissedCleavages;
    IEnumerable<int> charges;
    double IdRate, chargeRatio3to2, chargeRatio4to2;
    double digestionEfficiency, topDecoyScore;
    double missedCleavageRate;
    Dictionary<int, int> numCharges = new Dictionary<int, int>();
    SearchParameters searchParameters = qcParameters.searchParameters;
    int numSearched = searchParameters.NumSpectra;
    List<PsmData> psms;
    IEnumerable<PsmData> goodPsms, nonDecoys;

    // convert the dictionary to a list for easy parsing
    psms = psmCollection.Values.ToList();

    // get the 95th percentile of the decoy hyperscores, which serves as the score cutoff
    topDecoyScore = (from x in psms
                     where x.Decoy
                     select x.Hyperscore)
                     .ToArray().Percentile(95);

    // get the non-decoys
    nonDecoys = from x in psms where !x.Decoy select x;

    // and select the non-decoy hits which score above the decoy cutoff
    goodPsms = from x in psms where !x.Decoy && x.Hyperscore > topDecoyScore select x;

    Console.WriteLine("Total hits: {0}", psms.Count());
    Console.WriteLine("95th percentile decoy score: {0}", topDecoyScore);
    Console.WriteLine("Non-decoy hits: {0}", nonDecoys.Count());
    Console.WriteLine("Non-decoy hits above the decoy cutoff: {0}", goodPsms.Count());

    // parse out the charges
    charges = from x in goodPsms select x.Charge;

    // count each charge state of interest and add it to a dictionary
    foreach (int charge in new List<int>() { 2, 3, 4 })
    {
        numCharges.Add(charge, (from x in charges where x == charge select 1).Count());
    }

    // calculate charge ratios
    chargeRatio3to2 = Convert.ToDouble(numCharges[3]) / Convert.ToDouble(numCharges[2]);
    chargeRatio4to2 = Convert.ToDouble(numCharges[4]) / Convert.ToDouble(numCharges[2]);

    // parse out the missed cleavage data
    pepsWithNoMissedCleavages = (from x in goodPsms where x.MissedCleavages == 0 select 1).Sum();

    // the number of PSMs is the length of this collection
    numGoodPSMs = goodPsms.Count();

    // digestion efficiency is the fraction of PSMs with no missed cleavages
    digestionEfficiency = (double)pepsWithNoMissedCleavages / numGoodPSMs;
    Console.WriteLine("Digestion efficiency: {0}", digestionEfficiency);

    // missed cleavage rate, i.e. number of missed cleavages per PSM
    missedCleavageRate = (double)(from x in goodPsms select x.MissedCleavages).Sum() / numGoodPSMs;
    Console.WriteLine("Missed cleavage rate (/PSM): {0}", missedCleavageRate);

    // calculate the ID rate
    IdRate = (double)numGoodPSMs / numSearched;
    Console.WriteLine("ID rate: {0}", IdRate);

    // get labeling efficiency metrics if any label mods were specified
    if ((searchParameters.NMod != null) || (searchParameters.KMod != null) || (searchParameters.XMod != null))
    {
        qcData.GetModificationFrequency(goodPsms, searchParameters);
    }

    // get the median mass drift
    qcData.MedianMassDrift = (from x in goodPsms
                              select x.MassDrift)
                              .ToArray().Percentile(50);

    qcData.IdentificationRate = IdRate;
    qcData.MissedCleavageRate = missedCleavageRate;
    qcData.DigestionEfficiency = digestionEfficiency;
    qcData.ChargeRatio3to2 = chargeRatio3to2;
    qcData.ChargeRatio4to2 = chargeRatio4to2;
}
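// Percentile is another project extension not shown in this file. The sketch below
// illustrates the behavior assumed above (Percentile(95) for the decoy cutoff,
// Percentile(50) for medians), using linear interpolation between the two closest
// ranks. It is named PercentileSketch because it is illustrative, not the project's
// implementation, which may interpolate differently.
private static double PercentileSketch(double[] values, double percentile)
{
    double[] sorted = values.OrderBy(x => x).ToArray(); // work on a sorted copy
    double rank = (percentile / 100.0) * (sorted.Length - 1);
    int lower = (int)Math.Floor(rank);
    int upper = (int)Math.Ceiling(rank);

    // interpolate between the two bracketing values
    return sorted[lower] + (rank - lower) * (sorted[upper] - sorted[lower]);
}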
public static QcDataContainer ProcessQcData(this QcDataCollection Data, RawDataCollection rawData, IRawDataPlus rawFile, string qcDirectory, string fastaDB = null)
{
    DateTime dateAcquired = rawFile.CreationDate;

    MetricsData metricsData = new MetricsData();
    metricsData.GetMetricsData(metaData: rawData.metaData, rawData: rawData, rawFile: rawFile);

    QcDataContainer qcData = new QcDataContainer(rawData.rawFileName, dateAcquired);

    qcData.Instrument = rawData.instrument;
    qcData.ExperimentMsOrder = rawData.methodData.AnalysisOrder;
    qcData.Ms1Analyzer = rawData.methodData.MassAnalyzers[MSOrderType.Ms].ToString();
    qcData.Ms2Analyzer = rawData.methodData.MassAnalyzers[MSOrderType.Ms2].ToString();

    if (qcData.ExperimentMsOrder == MSOrderType.Ms3)
    {
        qcData.Ms3Analyzer = rawData.methodData.MassAnalyzers[MSOrderType.Ms3].ToString();
    }
    else
    {
        qcData.Ms3Analyzer = "None";
    }

    qcData.TotalScans = metricsData.TotalScans;
    qcData.NumMs1Scans = metricsData.MS1Scans;
    qcData.NumMs2Scans = metricsData.MS2Scans;
    qcData.NumMs3Scans = metricsData.MS3Scans;
    qcData.Ms1ScanRate = metricsData.MS1ScanRate;
    qcData.Ms2ScanRate = metricsData.MS2ScanRate;
    qcData.MeanDutyCycle = metricsData.MeanDutyCycle;
    qcData.MeanTopN = metricsData.MeanTopN;
    qcData.MedianPrecursorIntensity = metricsData.MedianPrecursorIntensity;
    qcData.MedianSummedMs2Intensity = metricsData.MedianSummedMS2Intensity;
    qcData.MedianMs1IsolationInterference = metricsData.MedianMs1IsolationInterference;
    qcData.MedianMs2FractionConsumingTop80PercentTotalIntensity = metricsData.MedianMs2FractionConsumingTop80PercentTotalIntensity;
    qcData.NumEsiStabilityFlags = NumberOfEsiFlags(rawData);
    qcData.QuantMeta = metricsData.QuantMeta;
    qcData.GradientTime = metricsData.Gradient;
    qcData.ColumnPeakCapacity = metricsData.PeakCapacity;

    qcData.ChromIntMetrics(rawData, metricsData);

    if (!rawData.isBoxCar)
    {
        qcData.PeakShape.Asymmetry.P10 = rawData.peakData.PeakShapeMedians.Asymmetry.P10;
        qcData.PeakShape.Asymmetry.P50 = rawData.peakData.PeakShapeMedians.Asymmetry.P50;
        qcData.PeakShape.Width.P10 = rawData.peakData.PeakShapeMedians.Width.P10;
        qcData.PeakShape.Width.P50 = rawData.peakData.PeakShapeMedians.Width.P50;
    }

    // add the median summed MS1 intensity: sum the centroid intensities of each MS1 scan,
    // then take the 50th percentile of those sums
    qcData.MedianSummedMs1Intensity = (from x in rawData.scanIndex.ScanEnumerators[MSOrderType.Ms]
                                       select rawData.centroidStreams[x].Intensities.Sum())
                                       .ToArray().Percentile(50);

    // add the fill-time distributions. This is straightforward: collect the injection times
    // for each MS order in an array and use it to instantiate a new distribution.
    qcData.Ms1FillTime = new Distribution((from x in rawData.scanIndex.ScanEnumerators[MSOrderType.Ms]
                                           select rawData.trailerExtras[x].InjectionTime).ToArray());
    qcData.Ms2FillTime = new Distribution((from x in rawData.scanIndex.ScanEnumerators[MSOrderType.Ms2]
                                           select rawData.trailerExtras[x].InjectionTime).ToArray());
    qcData.Ms3FillTime = new Distribution((from x in rawData.scanIndex.ScanEnumerators[MSOrderType.Ms3]
                                           select rawData.trailerExtras[x].InjectionTime).ToArray());

    return qcData;
}
public static void DoQc(QcParameters qcParameters)
{
    QcDataCollection qcDataCollection;
    string dataDirectory = qcParameters.RawFileDirectory;
    string qcDirectory = qcParameters.QcDirectory;
    SearchParameters searchParameters = qcParameters.searchParameters;

    // our qc file
    string qcFile = Path.Combine(qcDirectory, "QC.xml");

    // see if the file exists
    if (File.Exists(qcFile))
    {
        // if so, open it
        try
        {
            qcDataCollection = XmlSerialization.ReadFromXmlFile<QcDataCollection>(qcFile);
            Log.Information("QC data file loaded successfully");
        }
        catch (Exception e)
        {
            Log.Error(e, "Failed while loading QC data");
            throw;
        }
    }
    else
    {
        // if not, check if the directory exists
        if (!Directory.Exists(qcDirectory))
        {
            Directory.CreateDirectory(qcDirectory);
        }

        qcDataCollection = new QcDataCollection(dataDirectory, qcDirectory);
        Log.Information("Appears to be a new QC directory. New QC data collection created.");
    }

    // get our list of new raw files: every raw file in the directory that is not already in the QC data
    var fileList = Directory.GetFiles(dataDirectory, "*.*", SearchOption.TopDirectoryOnly)
        .Where(s => s.EndsWith(".raw", StringComparison.OrdinalIgnoreCase)).ToList();

    if (fileList.Count() == 0)
    {
        Log.Error("No raw files found in {Directory}", dataDirectory);
        Console.WriteLine("{0} contains no raw files!", dataDirectory);
        Environment.Exit(1);
    }

    fileList.RemoveAll(s => qcDataCollection.ProcessedRawFiles.Contains(Path.GetFileName(s)));
    Log.Information("Raw files in QC queue: {Files}", fileList);

    if (fileList.Count() == 0)
    {
        Log.Information("No new files to QC");
        Console.WriteLine("No new files in the directory to QC!");
        Environment.Exit(0);
    }

    Console.WriteLine("{0} file(s) to process", fileList.Count());

    foreach (string fileName in fileList)
    {
        Console.WriteLine("Processing {0}", fileName);

        IFileHeader rawHeader;

        // try to open the raw file header
        try
        {
            rawHeader = FileHeaderReaderFactory.ReadFile(fileName);
        }
        catch (Exception)
        {
            Log.Information("{File} is not a valid raw file", fileName);
            Console.WriteLine("{0} is not a valid raw file, continuing to next file.", fileName);
            continue;
        }

        // is it a real raw file?
        if (rawHeader.FileType == FileType.RawFile)
        {
            Log.Information("{File} is a valid raw file", fileName);
            Log.Information("Creation date: {Date}", rawHeader.CreationDate);
            Log.Information("File description: {Description}", rawHeader.FileDescription);
        }
        else
        {
            Log.Information("{File} is not a valid raw file", fileName);
            Console.WriteLine("{0} is not a valid raw file, continuing to next file.", fileName);
            continue;
        }

        // okay, it is probably a real raw file, let's do the QC

        // check if the raw file already exists in the QC data under a different name
        if (qcDataCollection.QcData.Keys.Contains(rawHeader.CreationDate))
        {
            Log.Information("A file with the same creation date and time as {File} already exists in the QC data", fileName);
            Console.WriteLine("{0} appears to already exist in the QC data with the name {1}. Skipping to next file.",
                fileName, qcDataCollection.QcData[rawHeader.CreationDate].RawFile);
            continue;
        }

        using (IRawDataPlus rawFile = RawFileReaderFactory.ReadFile(fileName))
        {
            rawFile.SelectInstrument(Device.MS, 1);

            RawDataCollection rawData = new RawDataCollection(rawFile);
            rawData.ExtractAll(rawFile);

            /*
            if (idpyPars?.QuantMods != null)
            {
                rawData.quantData.Quantify(rawData, rawFile, )
            }
            */

            QcDataContainer newQcData = ProcessQcData(Data: qcDataCollection, rawData: rawData, rawFile: rawFile, qcDirectory: qcDirectory);

            if (searchParameters != null)
            {
                Search.WriteSearchMGF(qcParameters, rawData, rawFile, searchParameters.FixedScans);
                Search.RunSearch(qcParameters, rawData, rawFile);
                newQcData.ParseSearchResults(rawData, rawFile, qcParameters);

                /*
                if (searchParameters.SearchAlgorithm == SearchAlgorithm.XTandem)
                {
                    SearchQC.ParseXTandem(newQcData, qcParameters);
                    newQcData.IdentipyParameters = String.Format("\"Algorithm: X!Tandem; fmods: {0}; nmod: {1}; kmod: {2}; xmod: {3}; fastaDB: {4}; xtandemDirectory: {5}\"",
                        searchParameters.FixedMods, searchParameters.NMod, searchParameters.KMod, searchParameters.XMod, searchParameters.FastaDatabase, searchParameters.XTandemDirectory);
                }
                else
                {
                    SearchQC.ParseIdentipy(newQcData, rawData, rawFile, qcParameters);
                    newQcData.IdentipyParameters = String.Format("\"Algorithm: IdentiPy; fmods: {0}; nmod: {1}; kmod: {2}; xmod: {3}; fastaDB: {4}; pythonExecutable: {5}; identipyScript: {6}\"",
                        searchParameters.FixedMods, searchParameters.NMod, searchParameters.KMod, searchParameters.XMod, searchParameters.FastaDatabase, searchParameters.PythonExecutable, searchParameters.IdentipyScript);
                }
                */
            }

            qcDataCollection.QcData.Add(rawFile.CreationDate, newQcData);
            qcDataCollection.ProcessedRawFiles.Add(Path.GetFileName(rawData.rawFileName));
            qcDataCollection.WriteQcToTable();
        }

        Log.Information("QC finished: {File}", fileName);
    }

    Log.Information("QC of all files completed");
    Console.WriteLine("QC of all files completed!");

    try
    {
        XmlSerialization.WriteToXmlFile<QcDataCollection>(qcFile, qcDataCollection);
        Log.Information("QC file saved successfully");
        Console.WriteLine("QC file saved successfully");
    }
    catch (Exception e)
    {
        Log.Error(e, "Failed during serialization of QC data");
        throw;
    }
}
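// Hypothetical usage sketch of the QC entry point. It assumes QcParameters exposes
// settable members matching the properties referenced in DoQc (RawFileDirectory,
// QcDirectory, searchParameters) and a parameterless constructor; the paths are
// illustrative, not defaults from the project.
private static void RunQcExample()
{
    var qcParameters = new QcParameters
    {
        RawFileDirectory = @"D:\data\raw",  // directory scanned for *.raw files
        QcDirectory = @"D:\data\raw\QC",    // where QC.xml and the csv table are written
        searchParameters = null             // null skips the database-search metrics
    };

    DoQc(qcParameters);
}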