Пример #1
0
        /// <summary>
        /// Extracts peptide-spectrum match (PSM) data from parsed search results into a collection keyed by PSM id.
        /// Only <see cref="SearchAlgorithm.XTandem"/> is handled; for any other algorithm an empty collection is returned.
        /// </summary>
        /// <param name="results">Root element of the search results XML.</param>
        /// <param name="searchAlgorithm">The search algorithm that produced <paramref name="results"/>.</param>
        /// <returns>The extracted PSMs, each added under the value of its group's <c>id</c> attribute.</returns>
        public static PsmDataCollection ExtractPsmData(XElement results, SearchAlgorithm searchAlgorithm)
        {
            PsmDataCollection psms = new PsmDataCollection();

            // The XML stores numbers in a machine-readable form, so they must be parsed with the invariant
            // culture; parsing with the current culture breaks on systems that use ',' as the decimal separator.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            if (searchAlgorithm == SearchAlgorithm.XTandem)
            {
                // only "group" elements which directly contain a "protein" element are PSMs
                foreach (XElement group in results.Descendants("group").Where(g => g.Element("protein") != null))
                {
                    PsmData psm = new PsmData();

                    psm.Id = Convert.ToInt32(group.Attribute("id").Value, invariant);

                    psm.Decoy = group.Attribute("label").Value.StartsWith("DECOY_", StringComparison.Ordinal);

                    // it is possible for each "group" in the pepXML file to have more than one protein. This just means the peptide isn't
                    // unique to a single protein. However, the scoring and modifications are identical (since it is the same PSM), so we
                    // can just use the first protein. That is what we do below.
                    XElement domain = group.Element("protein").Element("peptide").Element("domain");

                    psm.Seq = domain.Attribute("seq").Value;

                    psm.Start = Convert.ToInt32(domain.Attribute("start").Value, invariant);

                    psm.End = Convert.ToInt32(domain.Attribute("end").Value, invariant);

                    psm.Hyperscore = Convert.ToDouble(domain.Attribute("hyperscore").Value, invariant);

                    psm.ExpectationValue = Convert.ToDouble(domain.Attribute("expect").Value, invariant);

                    // mass drift in ppm: (group mh - domain mh) / domain mh * 1e6.
                    // A missing "mh" attribute on the group is converted to 0 (Convert.ToDouble of a null string).
                    double groupMh  = Convert.ToDouble(group.Attribute("mh")?.Value, invariant);
                    double domainMh = Convert.ToDouble(domain.Attribute("mh").Value, invariant);

                    psm.MassDrift = (groupMh - domainMh) / domainMh * 1e6;

                    psm.Charge = Convert.ToInt32(group.Attribute("z").Value, invariant);

                    psm.MissedCleavages = GetMissedCleavages(psm.Seq);

                    // add the modifications, if there are any (Elements() yields an empty sequence when there are none)
                    foreach (XElement aa in domain.Elements("aa"))
                    {
                        Modification mod = new Modification();

                        // we convert the location to a zero-based index of the peptide
                        mod.Loc = Convert.ToInt32(aa.Attribute("at").Value, invariant) - psm.Start;

                        mod.AA = aa.Attribute("type").Value;

                        mod.Mass = Convert.ToDouble(aa.Attribute("modified").Value, invariant);

                        psm.Mods.Add(mod);
                    }

                    psms.Add(psm.Id, psm);
                }
            }

            return psms;
        }
Пример #2
0
        /// <summary>
        /// Loads the search results, extracts the PSMs from them and passes those PSMs to
        /// <c>ParsePSMs</c> on <paramref name="qcData"/>.
        /// </summary>
        /// <param name="qcData">The QC data container which receives the parsed PSM information.</param>
        /// <param name="rawData">Raw data collection handed to <c>LoadSearchResults</c>.</param>
        /// <param name="rawFile">Not used by this method.</param>
        /// <param name="qcParameters">QC parameters; supplies the search algorithm and is forwarded to the helpers.</param>
        public static void ParseSearchResults(this QcDataContainer qcData, RawDataCollection rawData, IRawDataPlus rawFile, QcParameters qcParameters)
        {
            // step 1: read the search output as XML
            XElement searchOutput = LoadSearchResults(qcParameters, rawData);

            // step 2: turn the XML into PSM records for the configured search algorithm
            var algorithm = qcParameters.searchParameters.SearchAlgorithm;
            PsmDataCollection extractedPsms = ExtractPsmData(searchOutput, algorithm);

            // step 3: hand the PSMs to the container for parsing
            qcData.ParsePSMs(extractedPsms, qcParameters);
        }
Пример #3
0
        /// <summary>
        /// Loads the search results for a raw file, extracts the PSMs from them and passes those PSMs to
        /// <c>ParsePSMs</c> on <paramref name="searchMetrics"/>.
        /// </summary>
        /// <param name="searchMetrics">The metrics container which receives the parsed PSM information.</param>
        /// <param name="parameters">Workflow parameters; supplies the search algorithm and is forwarded to the helpers.</param>
        /// <param name="rawFileName">Raw file name handed to <c>LoadSearchResults</c>.</param>
        /// <returns>The same <paramref name="searchMetrics"/> instance that was passed in.</returns>
        public static SearchMetricsContainer ParseSearchResults(SearchMetricsContainer searchMetrics, WorkflowParameters parameters, string rawFileName)
        {
            // step 1: read the search output as XML
            XElement searchOutput = LoadSearchResults(parameters, rawFileName);

            // step 2: turn the XML into PSM records for the configured search algorithm
            var algorithm = parameters.QcParams.SearchAlgorithm;
            PsmDataCollection extractedPsms = ExtractPsmData(searchOutput, algorithm);

            // step 3: hand the PSMs to the container for parsing
            searchMetrics.ParsePSMs(extractedPsms, parameters);

            return searchMetrics;
        }
Пример #4
0
        /// <summary>
        /// Extracts peptide-spectrum match (PSM) data from parsed search results into a collection keyed by PSM id.
        /// Handles <see cref="SearchAlgorithm.XTandem"/> and <see cref="SearchAlgorithm.IdentiPy"/> results;
        /// for any other algorithm an empty collection is returned.
        /// </summary>
        /// <param name="results">Root element of the search results XML.</param>
        /// <param name="searchAlgorithm">The search algorithm that produced <paramref name="results"/>.</param>
        /// <returns>
        /// The extracted PSMs. XTandem PSMs are keyed by the group's <c>id</c> attribute, IdentiPy PSMs by the
        /// spectrum query's <c>index</c> attribute.
        /// </returns>
        public static PsmDataCollection ExtractPsmData(XElement results, SearchAlgorithm searchAlgorithm)
        {
            PsmDataCollection psms = new PsmDataCollection();

            // The XML stores numbers in a machine-readable form, so they must be parsed with the invariant
            // culture; parsing with the current culture breaks on systems that use ',' as the decimal separator.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            if (searchAlgorithm == SearchAlgorithm.XTandem)
            {
                // only "group" elements which directly contain a "protein" element are PSMs
                foreach (XElement group in results.Descendants("group").Where(g => g.Element("protein") != null))
                {
                    PsmData psm = new PsmData();

                    psm.Id = Convert.ToInt32(group.Attribute("id").Value, invariant);

                    psm.Decoy = group.Attribute("label").Value.StartsWith("DECOY_", StringComparison.Ordinal);

                    // it is possible for each "group" in the pepXML file to have more than one protein. This just means the peptide isn't
                    // unique to a single protein. However, the scoring and modifications are identical (since it is the same PSM), so we
                    // can just use the first protein. That is what we do below.
                    XElement domain = group.Element("protein").Element("peptide").Element("domain");

                    psm.Seq = domain.Attribute("seq").Value;

                    psm.Start = Convert.ToInt32(domain.Attribute("start").Value, invariant);

                    psm.End = Convert.ToInt32(domain.Attribute("end").Value, invariant);

                    psm.Hyperscore = Convert.ToDouble(domain.Attribute("hyperscore").Value, invariant);

                    psm.ExpectationValue = Convert.ToDouble(domain.Attribute("expect").Value, invariant);

                    // mass drift in ppm: (group mh - domain mh) / domain mh * 1e6.
                    // A missing "mh" attribute on the group is converted to 0 (Convert.ToDouble of a null string).
                    double groupMh  = Convert.ToDouble(group.Attribute("mh")?.Value, invariant);
                    double domainMh = Convert.ToDouble(domain.Attribute("mh").Value, invariant);

                    psm.MassDrift = (groupMh - domainMh) / domainMh * 1e6;

                    psm.Charge = Convert.ToInt32(group.Attribute("z").Value, invariant);

                    psm.MissedCleavages = GetMissedCleavages(psm.Seq);

                    // add the modifications, if there are any (Elements() yields an empty sequence when there are none)
                    foreach (XElement aa in domain.Elements("aa"))
                    {
                        Modification mod = new Modification();

                        // we convert the location to a zero-based index of the peptide
                        mod.Loc = Convert.ToInt32(aa.Attribute("at").Value, invariant) - psm.Start;

                        mod.AA = aa.Attribute("type").Value;

                        mod.Mass = Convert.ToDouble(aa.Attribute("modified").Value, invariant);

                        psm.Mods.Add(mod);
                    }

                    psms.Add(psm.Id, psm);
                }
            }

            if (searchAlgorithm == SearchAlgorithm.IdentiPy)
            {
                XNamespace nsp = "http://regis-web.systemsbiology.net/pepXML";

                // first we need to make a dictionary of modification masses etc for the identipy results
                // the keys are the amino acid mass after modification, which is what identipy reports
                // the values are the mass difference values, which is what is given in the mass@aa arguments to the CLI
                XElement summary = results.Descendants(nsp + "search_summary").First();
                Dictionary <double, double> modInfo = new Dictionary <double, double>();

                // amino acid modifications are added first, then terminal modifications; a repeated mass throws on Add
                foreach (XElement modDefinition in summary.Elements(nsp + "aminoacid_modification")
                                                          .Concat(summary.Elements(nsp + "terminal_modification")))
                {
                    modInfo.Add(Convert.ToDouble(modDefinition.Attribute("mass").Value, invariant),
                                Convert.ToDouble(modDefinition.Attribute("massdiff").Value, invariant));
                }

                // now we can parse out the data
                foreach (XElement query in results.Descendants(nsp + "spectrum_query"))
                {
                    PsmData psm = new PsmData();

                    psm.Id = Convert.ToInt32(query.Attribute("index").Value, invariant);

                    // only the first search hit of the first search result is used
                    XElement searchHit = query.Element(nsp + "search_result").Element(nsp + "search_hit");

                    psm.Decoy = searchHit.Attribute("protein").Value.StartsWith("DECOY_", StringComparison.Ordinal);

                    psm.Seq = searchHit.Attribute("peptide").Value;

                    // start and end positions are not read for identipy results; -1 is stored for both
                    psm.Start = -1;

                    psm.End = -1;

                    psm.Hyperscore = Convert.ToDouble(searchHit.Elements(nsp + "search_score")
                                                      .First(score => score.Attribute("name").Value == "hyperscore")
                                                      .Attribute("value").Value, invariant);

                    psm.ExpectationValue = Convert.ToDouble(searchHit.Elements(nsp + "search_score")
                                                            .First(score => score.Attribute("name").Value == "expect")
                                                            .Attribute("value").Value, invariant);

                    // mass drift in ppm: massdiff / precursor_neutral_mass * 1e6
                    psm.MassDrift = Convert.ToDouble(searchHit.Attribute("massdiff").Value, invariant) /
                                    Convert.ToDouble(query.Attribute("precursor_neutral_mass").Value, invariant) * 1e6;

                    psm.Charge = Convert.ToInt32(query.Attribute("assumed_charge").Value, invariant);

                    psm.MissedCleavages = GetMissedCleavages(psm.Seq);

                    // add the modifications, if there are any
                    XElement modificationInfo = searchHit.Element(nsp + "modification_info");

                    if (modificationInfo != null)
                    {
                        XAttribute nTermMass = modificationInfo.Attribute("mod_nterm_mass");

                        if (nTermMass != null)
                        {
                            Modification nTermMod = new Modification();

                            nTermMod.Loc = 0; // its the n-terminus

                            nTermMod.AA = psm.Seq[0].ToString();

                            // translate the reported modified mass back to its mass difference
                            nTermMod.Mass = modInfo[Convert.ToDouble(nTermMass.Value, invariant)];

                            psm.Mods.Add(nTermMod);
                        }

                        foreach (XElement aa in modificationInfo.Elements(nsp + "mod_aminoacid_mass"))
                        {
                            Modification mod = new Modification();

                            // we convert the location to a zero-based index of the peptide
                            mod.Loc = Convert.ToInt32(aa.Attribute("position").Value, invariant) - 1;

                            mod.AA = psm.Seq[mod.Loc].ToString();

                            // translate the reported modified mass back to its mass difference
                            mod.Mass = modInfo[Convert.ToDouble(aa.Attribute("mass").Value, invariant)];

                            psm.Mods.Add(mod);
                        }
                    }

                    psms.Add(psm.Id, psm);
                }
            }

            return psms;
        }
Пример #5
0
        /// <summary>
        /// Computes search quality metrics from a collection of PSMs and stores them on <paramref name="searchMetrics"/>.
        /// A PSM counts as "good" when it is not a decoy and its hyperscore is above the value returned by
        /// <c>Percentile(95)</c> over the decoy hyperscores. Several intermediate values are written to the console.
        /// </summary>
        /// <param name="searchMetrics">The container which receives the calculated metrics.</param>
        /// <param name="psmCollection">The PSMs to evaluate.</param>
        /// <param name="parameters">
        /// Workflow parameters; <c>QcParams.NumberSpectra</c> is the denominator of the identification rate and
        /// <c>QcParams.NMod</c>/<c>KMod</c>/<c>XMod</c> decide whether modification frequencies are calculated.
        /// </param>
        public static void ParsePSMs(this SearchMetricsContainer searchMetrics, PsmDataCollection psmCollection, WorkflowParameters parameters)
        {
            int numSearched = parameters.QcParams.NumberSpectra;

            // convert the dictionary to a list for easy parsing
            List <PsmData> psms = psmCollection.Values.ToList();

            // get the top decoy score
            double topDecoyScore = psms.Where(p => p.Decoy)
                                   .Select(p => p.Hyperscore)
                                   .ToArray().Percentile(95);

            // count the non-decoys
            int numNonDecoys = psms.Count(p => !p.Decoy);

            // and select the non-decoy hits which are above the top decoy score. The result is materialized once
            // because it is consumed many times below; a deferred query would re-filter the whole list each time.
            List <PsmData> goodPsms = psms.Where(p => !p.Decoy && p.Hyperscore > topDecoyScore).ToList();

            // number of PSMs is the length of this collection
            int numGoodPSMs = goodPsms.Count;

            Console.WriteLine("Total hits: {0}", psms.Count);
            Console.WriteLine("Top decoy score: {0}", topDecoyScore);
            Console.WriteLine("Non-decoy hits: {0}", numNonDecoys);
            Console.WriteLine("Non-decoy hits above top decoy score: {0}", numGoodPSMs);

            // get the number of PSMs with charge 2, 3 and 4, add to a dictionary
            Dictionary <int, int> numCharges = new Dictionary <int, int>();

            foreach (int charge in new[] { 2, 3, 4 })
            {
                numCharges.Add(charge, goodPsms.Count(p => p.Charge == charge));
            }

            // calculate charge ratios. These are floating point divisions, so a zero count for charge 2
            // gives Infinity or NaN rather than an exception.
            double chargeRatio3to2 = Convert.ToDouble(numCharges[3]) / Convert.ToDouble(numCharges[2]);
            double chargeRatio4to2 = Convert.ToDouble(numCharges[4]) / Convert.ToDouble(numCharges[2]);

            // parse out the missed cleavage data
            int pepsWithNoMissedCleavages = goodPsms.Count(p => p.MissedCleavages == 0);

            // fraction of good PSMs without any missed cleavage (NaN when there are no good PSMs)
            double digestionEfficiency = (double)pepsWithNoMissedCleavages / numGoodPSMs;
            Console.WriteLine("Digestion efficiency: {0}", digestionEfficiency);

            // get missed cleavage rate, i.e. number of missed cleavages per psm
            double missedCleavageRate = (double)goodPsms.Sum(p => p.MissedCleavages) / numGoodPSMs;
            Console.WriteLine("Missed cleavage rate (/PSM): {0}", missedCleavageRate);

            // calculate ID rate
            double IdRate = (double)numGoodPSMs / numSearched;
            Console.WriteLine("IDrate: {0}", IdRate);

            // get labeling efficiency metrics
            if ((parameters.QcParams.NMod != null) || (parameters.QcParams.KMod != null) || (parameters.QcParams.XMod != null))
            {
                searchMetrics.GetModificationFrequency(goodPsms, parameters);
            }

            // get median mass drift
            searchMetrics.MedianMassDrift = goodPsms.Select(p => p.MassDrift)
                                            .ToArray().Percentile(50);

            searchMetrics.SearchData.PSMsWithNoMissedCleavages = pepsWithNoMissedCleavages;
            searchMetrics.SearchData.TotalNumGoodPSMs          = numGoodPSMs;
            searchMetrics.SearchData.NumCharge2 = numCharges[2];
            searchMetrics.SearchData.NumCharge3 = numCharges[3];
            searchMetrics.SearchData.NumCharge4 = numCharges[4];

            searchMetrics.IdentificationRate  = IdRate;
            searchMetrics.MissedCleavageRate  = missedCleavageRate;
            searchMetrics.DigestionEfficiency = digestionEfficiency;
            searchMetrics.ChargeRatio3to2     = chargeRatio3to2;
            searchMetrics.ChargeRatio4to2     = chargeRatio4to2;
        }