/// <summary>
/// Builds a <see cref="PsmDataCollection"/> from parsed search results.
/// Only <see cref="SearchAlgorithm.XTandem"/> output is handled by this method;
/// for any other algorithm an empty collection is returned.
/// </summary>
/// <param name="results">Root element of the search results XML.</param>
/// <param name="searchAlgorithm">Search engine that produced <paramref name="results"/>.</param>
/// <returns>The extracted PSMs, keyed by the id of each PSM.</returns>
public static PsmDataCollection ExtractPsmData(XElement results, SearchAlgorithm searchAlgorithm)
{
    PsmDataCollection psms = new PsmDataCollection();

    if (searchAlgorithm != SearchAlgorithm.XTandem)
    {
        return psms;
    }

    // The results file is machine-written, so numbers are parsed with the invariant
    // culture; the current thread culture (e.g. one using ',' as the decimal
    // separator) must not influence how the values are read.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    // only "group" elements that directly contain a "protein" element describe a PSM
    foreach (XElement hit in results.Descendants("group").Where(g => g.Element("protein") != null))
    {
        PsmData psm = new PsmData();

        psm.Id = Convert.ToInt32(hit.Attribute("id").Value, invariant);
        psm.Decoy = hit.Attribute("label").Value.StartsWith("DECOY_", StringComparison.Ordinal);

        // It is possible for each "group" in the file to have more than one protein. This just
        // means the peptide isn't unique to a single protein. However, the scoring and
        // modifications are identical (since it is the same PSM), so we can just use the first
        // protein. That is what we do below.
        XElement domain = hit.Element("protein").Element("peptide").Element("domain");

        psm.Seq = domain.Attribute("seq").Value;
        psm.Start = Convert.ToInt32(domain.Attribute("start").Value, invariant);
        psm.End = Convert.ToInt32(domain.Attribute("end").Value, invariant);
        psm.Hyperscore = Convert.ToDouble(domain.Attribute("hyperscore").Value, invariant);
        psm.ExpectationValue = Convert.ToDouble(domain.Attribute("expect").Value, invariant);

        // Relative difference between the group's "mh" and the domain's "mh", scaled by 1e6.
        // A missing "mh" on the group converts to 0, exactly as Convert.ToDouble(null) does.
        double groupMh = Convert.ToDouble(hit.Attribute("mh")?.Value, invariant);
        double domainMh = Convert.ToDouble(domain.Attribute("mh").Value, invariant);
        psm.MassDrift = (groupMh - domainMh) / domainMh * 1e6;

        psm.Charge = Convert.ToInt32(hit.Attribute("z").Value, invariant);
        psm.MissedCleavages = GetMissedCleavages(psm.Seq);

        // add the modifications, if there are any (Elements() is simply empty when there are none)
        foreach (XElement aa in domain.Elements("aa"))
        {
            Modification mod = new Modification();

            // we convert the location to a zero-based index of the peptide
            mod.Loc = Convert.ToInt32(aa.Attribute("at").Value, invariant) - psm.Start;
            mod.AA = aa.Attribute("type").Value;
            mod.Mass = Convert.ToDouble(aa.Attribute("modified").Value, invariant);

            psm.Mods.Add(mod);
        }

        psms.Add(psm.Id, psm);
    }

    return psms;
}
/// <summary>
/// Loads the search results, extracts the PSMs they contain and passes them to
/// <c>ParsePSMs</c> on <paramref name="qcData"/>.
/// </summary>
/// <param name="qcData">Container that receives the parsed PSM information.</param>
/// <param name="rawData">Forwarded to <c>LoadSearchResults</c>.</param>
/// <param name="rawFile">Not used by this method.</param>
/// <param name="qcParameters">
/// Supplies the search algorithm (via <c>searchParameters.SearchAlgorithm</c>) and is
/// forwarded to both <c>LoadSearchResults</c> and <c>ParsePSMs</c>.
/// </param>
public static void ParseSearchResults(
    this QcDataContainer qcData,
    RawDataCollection rawData,
    IRawDataPlus rawFile,
    QcParameters qcParameters)
{
    // step 1: read the search output
    XElement searchOutput = LoadSearchResults(qcParameters, rawData);

    // step 2: turn the XML into PSM records for the configured search engine
    PsmDataCollection extractedPsms = ExtractPsmData(
        searchOutput,
        qcParameters.searchParameters.SearchAlgorithm);

    // step 3: hand the records to the container
    qcData.ParsePSMs(extractedPsms, qcParameters);
}
/// <summary>
/// Loads the search results, extracts the PSMs they contain and passes them to
/// <c>ParsePSMs</c> on <paramref name="searchMetrics"/>.
/// </summary>
/// <param name="searchMetrics">Container that receives the parsed PSM information.</param>
/// <param name="parameters">
/// Supplies the search algorithm (via <c>QcParams.SearchAlgorithm</c>) and is forwarded to
/// both <c>LoadSearchResults</c> and <c>ParsePSMs</c>.
/// </param>
/// <param name="rawFileName">Forwarded to <c>LoadSearchResults</c>.</param>
/// <returns>The same <paramref name="searchMetrics"/> instance that was passed in.</returns>
public static SearchMetricsContainer ParseSearchResults(
    SearchMetricsContainer searchMetrics,
    WorkflowParameters parameters,
    string rawFileName)
{
    // step 1: read the search output
    XElement searchOutput = LoadSearchResults(parameters, rawFileName);

    // step 2: turn the XML into PSM records for the configured search engine
    PsmDataCollection extractedPsms = ExtractPsmData(
        searchOutput,
        parameters.QcParams.SearchAlgorithm);

    // step 3: hand the records to the container
    searchMetrics.ParsePSMs(extractedPsms, parameters);

    return searchMetrics;
}
/// <summary>
/// Builds a <see cref="PsmDataCollection"/> from parsed search results produced by either
/// <see cref="SearchAlgorithm.XTandem"/> or <see cref="SearchAlgorithm.IdentiPy"/>.
/// For any other algorithm an empty collection is returned.
/// </summary>
/// <param name="results">Root element of the search results XML.</param>
/// <param name="searchAlgorithm">Search engine that produced <paramref name="results"/>.</param>
/// <returns>The extracted PSMs, keyed by the id of each PSM.</returns>
public static PsmDataCollection ExtractPsmData(XElement results, SearchAlgorithm searchAlgorithm)
{
    PsmDataCollection psms = new PsmDataCollection();

    // The results file is machine-written, so numbers are parsed with the invariant
    // culture; the current thread culture (e.g. one using ',' as the decimal
    // separator) must not influence how the values are read.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    if (searchAlgorithm == SearchAlgorithm.XTandem)
    {
        // only "group" elements that directly contain a "protein" element describe a PSM
        foreach (XElement hit in results.Descendants("group").Where(g => g.Element("protein") != null))
        {
            PsmData psm = new PsmData();

            psm.Id = Convert.ToInt32(hit.Attribute("id").Value, invariant);
            psm.Decoy = hit.Attribute("label").Value.StartsWith("DECOY_", StringComparison.Ordinal);

            // It is possible for each "group" in the file to have more than one protein. This
            // just means the peptide isn't unique to a single protein. However, the scoring and
            // modifications are identical (since it is the same PSM), so we can just use the
            // first protein. That is what we do below.
            XElement domain = hit.Element("protein").Element("peptide").Element("domain");

            psm.Seq = domain.Attribute("seq").Value;
            psm.Start = Convert.ToInt32(domain.Attribute("start").Value, invariant);
            psm.End = Convert.ToInt32(domain.Attribute("end").Value, invariant);
            psm.Hyperscore = Convert.ToDouble(domain.Attribute("hyperscore").Value, invariant);
            psm.ExpectationValue = Convert.ToDouble(domain.Attribute("expect").Value, invariant);

            // Relative difference between the group's "mh" and the domain's "mh", scaled by 1e6.
            // A missing "mh" on the group converts to 0, exactly as Convert.ToDouble(null) does.
            double groupMh = Convert.ToDouble(hit.Attribute("mh")?.Value, invariant);
            double domainMh = Convert.ToDouble(domain.Attribute("mh").Value, invariant);
            psm.MassDrift = (groupMh - domainMh) / domainMh * 1e6;

            psm.Charge = Convert.ToInt32(hit.Attribute("z").Value, invariant);
            psm.MissedCleavages = GetMissedCleavages(psm.Seq);

            // add the modifications, if there are any (Elements() is simply empty when there are none)
            foreach (XElement aa in domain.Elements("aa"))
            {
                Modification mod = new Modification();

                // we convert the location to a zero-based index of the peptide
                mod.Loc = Convert.ToInt32(aa.Attribute("at").Value, invariant) - psm.Start;
                mod.AA = aa.Attribute("type").Value;
                mod.Mass = Convert.ToDouble(aa.Attribute("modified").Value, invariant);

                psm.Mods.Add(mod);
            }

            psms.Add(psm.Id, psm);
        }
    }

    if (searchAlgorithm == SearchAlgorithm.IdentiPy)
    {
        XNamespace nsp = "http://regis-web.systemsbiology.net/pepXML";

        // First we need to make a dictionary of modification masses etc. for the identipy results.
        // The keys are the amino acid mass after modification, which is what identipy reports.
        // The values are the mass difference values, which is what is given in the mass@aa
        // arguments to the CLI.
        // NOTE(review): the keys are doubles matched exactly, so a lookup only succeeds when the
        // mass text in a search hit parses to the same value as in the search summary.
        XElement summary = results.Descendants(nsp + "search_summary").First();
        Dictionary<double, double> modInfo = new Dictionary<double, double>();

        var declaredMods = summary.Elements(nsp + "aminoacid_modification")
            .Concat(summary.Elements(nsp + "terminal_modification"));

        foreach (XElement declared in declaredMods)
        {
            modInfo.Add(
                Convert.ToDouble(declared.Attribute("mass").Value, invariant),
                Convert.ToDouble(declared.Attribute("massdiff").Value, invariant));
        }

        // now we can parse out the data
        foreach (XElement query in results.Descendants(nsp + "spectrum_query"))
        {
            PsmData psm = new PsmData();

            psm.Id = Convert.ToInt32(query.Attribute("index").Value, invariant);

            XElement searchHit = query.Element(nsp + "search_result").Element(nsp + "search_hit");

            psm.Decoy = searchHit.Attribute("protein").Value.StartsWith("DECOY_", StringComparison.Ordinal);
            psm.Seq = searchHit.Attribute("peptide").Value;

            // start and end positions are not read for identipy results
            psm.Start = -1;
            psm.End = -1;

            psm.Hyperscore = Convert.ToDouble(
                searchHit.Elements(nsp + "search_score")
                    .First(s => s.Attribute("name").Value == "hyperscore")
                    .Attribute("value").Value,
                invariant);

            psm.ExpectationValue = Convert.ToDouble(
                searchHit.Elements(nsp + "search_score")
                    .First(s => s.Attribute("name").Value == "expect")
                    .Attribute("value").Value,
                invariant);

            // "massdiff" relative to the precursor neutral mass, scaled by 1e6
            psm.MassDrift = Convert.ToDouble(searchHit.Attribute("massdiff").Value, invariant)
                / Convert.ToDouble(query.Attribute("precursor_neutral_mass").Value, invariant) * 1e6;

            psm.Charge = Convert.ToInt32(query.Attribute("assumed_charge").Value, invariant);
            psm.MissedCleavages = GetMissedCleavages(psm.Seq);

            // add the modifications, if there are any
            XElement modificationInfo = searchHit.Element(nsp + "modification_info");

            if (modificationInfo != null)
            {
                XAttribute nTermMass = modificationInfo.Attribute("mod_nterm_mass");

                if (nTermMass != null)
                {
                    Modification mod = new Modification();

                    mod.Loc = 0; // it's the n-terminus
                    mod.AA = psm.Seq[0].ToString();
                    mod.Mass = modInfo[Convert.ToDouble(nTermMass.Value, invariant)];

                    psm.Mods.Add(mod);
                }

                foreach (XElement aa in modificationInfo.Elements(nsp + "mod_aminoacid_mass"))
                {
                    Modification mod = new Modification();

                    // we convert the location to a zero-based index of the peptide
                    mod.Loc = Convert.ToInt32(aa.Attribute("position").Value, invariant) - 1;
                    mod.AA = psm.Seq[mod.Loc].ToString();
                    mod.Mass = modInfo[Convert.ToDouble(aa.Attribute("mass").Value, invariant)];

                    psm.Mods.Add(mod);
                }
            }

            psms.Add(psm.Id, psm);
        }
    }

    return psms;
}
/// <summary>
/// Derives identification metrics from a collection of PSMs, prints a summary to the
/// console and stores the results on <paramref name="searchMetrics"/>.
/// </summary>
/// <remarks>
/// "Good" PSMs are the non-decoy hits whose hyperscore is greater than
/// <c>Percentile(95)</c> of the decoy hyperscores. All ratios are plain floating-point
/// divisions without zero guards, so a zero denominator (for example no good PSMs, or no
/// charge-2 PSMs) produces NaN or infinity rather than an exception.
/// </remarks>
/// <param name="searchMetrics">Container that receives the computed metrics.</param>
/// <param name="psmCollection">PSMs to evaluate; only its <c>Values</c> are read.</param>
/// <param name="parameters">
/// Supplies <c>QcParams.NumberSpectra</c> (denominator of the identification rate) and the
/// <c>NMod</c>/<c>KMod</c>/<c>XMod</c> settings that decide whether modification frequencies
/// are computed.
/// </param>
public static void ParsePSMs(this SearchMetricsContainer searchMetrics, PsmDataCollection psmCollection, WorkflowParameters parameters)
{
    int numSearched = parameters.QcParams.NumberSpectra;

    // convert the dictionary to a list for easy parsing
    List<PsmData> psms = psmCollection.Values.ToList();

    // get the top decoy score
    double topDecoyScore = psms
        .Where(p => p.Decoy)
        .Select(p => p.Hyperscore)
        .ToArray()
        .Percentile(95);

    // count the non-decoys
    int numNonDecoys = psms.Count(p => !p.Decoy);

    // Select the non-decoy hits which are above the top decoy score. The result is
    // materialized once because it is read many times below; leaving it as a deferred
    // query would re-filter the whole list on every use.
    IEnumerable<PsmData> goodPsms = psms
        .Where(p => !p.Decoy && p.Hyperscore > topDecoyScore)
        .ToList();

    // number of PSMs is the length of this collection
    int numGoodPSMs = goodPsms.Count();

    Console.WriteLine("Total hits: {0}", psms.Count);
    Console.WriteLine("Top decoy score: {0}", topDecoyScore);
    Console.WriteLine("Non-decoy hits: {0}", numNonDecoys);
    Console.WriteLine("Non-decoy hits above top decoy score: {0}", numGoodPSMs);

    // get the number of good PSMs at each charge state of interest
    Dictionary<int, int> numCharges = new Dictionary<int, int>();
    foreach (int charge in new List<int>() { 2, 3, 4 })
    {
        numCharges.Add(charge, goodPsms.Count(p => p.Charge == charge));
    }

    // calculate charge ratios
    double chargeRatio3to2 = Convert.ToDouble(numCharges[3]) / Convert.ToDouble(numCharges[2]);
    double chargeRatio4to2 = Convert.ToDouble(numCharges[4]) / Convert.ToDouble(numCharges[2]);

    // parse out the missed cleavage data
    int pepsWithNoMissedCleavages = goodPsms.Count(p => p.MissedCleavages == 0);

    // fraction of good PSMs without any missed cleavage
    double digestionEfficiency = (double)pepsWithNoMissedCleavages / numGoodPSMs;
    Console.WriteLine("Digestion efficiency: {0}", digestionEfficiency);

    // get missed cleavage rate, i.e. number of missed cleavages per psm
    double missedCleavageRate = (double)goodPsms.Select(p => p.MissedCleavages).Sum() / numGoodPSMs;
    Console.WriteLine("Missed cleavage rate (/PSM): {0}", missedCleavageRate);

    // calculate ID rate
    double IdRate = (double)numGoodPSMs / numSearched;
    Console.WriteLine("IDrate: {0}", IdRate);

    // get labeling efficiency metrics, but only if at least one modification is configured
    if (parameters.QcParams.NMod != null
        || parameters.QcParams.KMod != null
        || parameters.QcParams.XMod != null)
    {
        searchMetrics.GetModificationFrequency(goodPsms, parameters);
    }

    // get median mass drift
    searchMetrics.MedianMassDrift = goodPsms
        .Select(p => p.MassDrift)
        .ToArray()
        .Percentile(50);

    searchMetrics.SearchData.PSMsWithNoMissedCleavages = pepsWithNoMissedCleavages;
    searchMetrics.SearchData.TotalNumGoodPSMs = numGoodPSMs;
    searchMetrics.SearchData.NumCharge2 = numCharges[2];
    searchMetrics.SearchData.NumCharge3 = numCharges[3];
    searchMetrics.SearchData.NumCharge4 = numCharges[4];

    searchMetrics.IdentificationRate = IdRate;
    searchMetrics.MissedCleavageRate = missedCleavageRate;
    searchMetrics.DigestionEfficiency = digestionEfficiency;
    searchMetrics.ChargeRatio3to2 = chargeRatio3to2;
    searchMetrics.ChargeRatio4to2 = chargeRatio4to2;
}