예제 #1
0
    /// <summary>
    /// Checks that a title of the form "Cmpd N, ..." is parsed so that the
    /// captured number N is reported as both the first and the last scan.
    /// </summary>
    public void TestCmpd2()
    {
      // The rawFile pattern is empty here; the sample title contains no file name.
      ParserFormat format = new ParserFormat();
      format.FormatName = "TurboRAW2MGF, Cmpd";
      format.Add(new ParserItem("rawFile", ""));
      format.Add(new ParserItem("scanNumber", @"Cmpd\s*(\d+)\s*,"));

      TitleParser parser = new TitleParser(format);
      SequestFilename sf = parser.GetValue("Cmpd 2345, xxxxx");

      // The scanNumber pattern has a single capture group, yet both bounds are asserted.
      Assert.AreEqual(2345, sf.FirstScan);
      Assert.AreEqual(2345, sf.LastScan);
    }
예제 #2
0
    /// <summary>
    /// Checks that a DTA-style title ("name.first.last.charge.dta") is split into
    /// the experiment name, the first scan and the last scan.
    /// </summary>
    public void TestDta()
    {
      ParserFormat format = new ParserFormat();
      format.FormatName = "TurboRAW2MGF, DTA Format";

      // rawFile captures everything before the two scan numbers;
      // scanNumber captures the two scan numbers themselves.
      format.Add(new ParserItem("rawFile", @"(.+)\.\d+\.\d+\.\d\.(?:dta|DTA)"));
      format.Add(new ParserItem("scanNumber", @".+\.(\d+)\.(\d+)\.\d\.(?:dta|DTA)"));

      TitleParser parser = new TitleParser(format);
      SequestFilename sf = parser.GetValue("TEST.2345.2346.1.dta");

      Assert.AreEqual("TEST", sf.Experimental);
      Assert.AreEqual(2345, sf.FirstScan);
      Assert.AreEqual(2346, sf.LastScan);
    }
예제 #3
0
        /// <summary>
        /// Loads an mzIdentML-style XML document and builds one identified spectrum
        /// for every SpectrumIdentificationResult element that still has items after
        /// FilterItems has been applied.
        /// </summary>
        /// <param name="fileName">Path of the XML file to load.</param>
        /// <returns>The list of spectra built from the file, in document order.</returns>
        /// <exception cref="Exception">
        /// Thrown when a spectrum ends up with a FirstScan of 0 after all scan sources were tried.
        /// </exception>
        public List <IIdentifiedSpectrum> ReadFromFile(string fileName)
        {
            // Attribute values come from a machine-written XML file, so numbers are parsed
            // with the invariant culture rather than the culture of the current thread
            // (a comma-decimal locale would otherwise misread values such as "1234.56").
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            XElement root = XElement.Load(fileName);

            // NOTE(review): 'name' is never read below. The lookup is kept because removing
            // it could change how files without this element chain fail - confirm before deleting.
            var name = root.FindElement("AnalysisSoftwareList").
                       FindElement("AnalysisSoftware").
                       FindElement("SoftwareName").
                       FindElement("cvParam").Attribute("name").Value;

            // Default experiment name: the file name with every known extension suffix stripped.
            var defaultExp = Path.GetFileNameWithoutExtension(fileName);

            foreach (var ext in extensions)
            {
                // Invariant lower-casing plus an ordinal comparison keeps the match
                // independent of the current culture.
                if (defaultExp.ToLowerInvariant().EndsWith(ext, StringComparison.Ordinal))
                {
                    defaultExp = defaultExp.Substring(0, defaultExp.Length - ext.Length);
                }
            }

            //parsing identification protocol first
            var protocols = root.FindElement("AnalysisProtocolCollection");
            var sip       = protocols.FindElement("SpectrumIdentificationProtocol");
            var modMap    = ParseSearchModificationMap(sip.FindElement("ModificationParams"));
            var proteases = ParseEnzymes(sip.FindElement("Enzymes"));
            var protease  = proteases.FirstOrDefault();

            //parsing sequence collection, including the protein/peptide map
            var seqs = root.FindElement("SequenceCollection");

            // DBSequence id -> { Id, Accession, DB }
            var proteinMap = (from ele in seqs.FindElements("DBSequence")
                              let id = ele.Attribute("id").Value
                              let accession = ParseAccession(ele.Attribute("accession").Value)
                              let db = ele.Attribute("searchDatabase_ref").Value
                              select new { Id = id, Accession = accession, DB = db }).ToDictionary(m => m.Id);

            // Peptide id -> peptide item; modifications are ordered by descending location.
            var peptideMap = (from ele in seqs.FindElements("Peptide")
                              let id = ele.Attribute("id").Value
                              let seq = ele.FindElement("PeptideSequence").Value
                              let mods = (from modEle in ele.FindElements("Modification")
                                          let mod = ParseModification(modEle, modMap)
                                          where mod != null
                                          orderby mod.Location descending
                                          select mod).ToArray()
                              let numMiss = protease == null ? 0 : protease.GetMissCleavageSiteCount(seq)
                              select new MzIdentPeptideItem()
                              {
                                  Id = id,
                                  PureSequence = seq,
                                  Modifications = mods,
                                  Sequence = GetModifiedSequence(seq, mods),
                                  NumMissCleavage = numMiss
                              }).ToDictionary(m => m.Id);

            // PeptideEvidence id -> evidence item; when an id occurs more than once only
            // the first occurrence is kept, so ToDictionary cannot see a duplicate key.
            var peptideEvidenceMap = (from ele in seqs.FindElements("PeptideEvidence")
                                      select new MzIdentPeptideEvidenceItem()
                                      {
                                          Id = ele.Attribute("id").Value,
                                          PeptideRef = ele.Attribute("peptide_ref").Value,
                                          DbRef = ele.Attribute("dBSequence_ref").Value,
                                          Pre = ele.Attribute("pre").Value,
                                          Post = ele.Attribute("post").Value
                                      })
                                     .GroupBy(m => m.Id)
                                     .Select(g => g.First())
                                     .ToDictionary(m => m.Id);

            //now parsing data
            var data = root.FindElement("DataCollection");

            var result       = new List <IIdentifiedSpectrum>();
            var analysisData = data.FindElement("AnalysisData");
            var idList       = analysisData.FindElement("SpectrumIdentificationList");

            foreach (var sir in idList.FindElements("SpectrumIdentificationResult"))
            {
                var items = FilterItems(sir.FindElements("SpectrumIdentificationItem"), peptideMap, peptideEvidenceMap);

                if (items.Count == 0)
                {
                    continue;
                }

                var spectrum = new IdentifiedSpectrum();
                result.Add(spectrum);

                var spectrumId = sir.Attribute("spectrumID").Value;

                var    sirCvParams = GetCvParams(sir);
                string value;

                // MS:1000796 is presumably the PSI-MS "spectrum title" term - verify against the CV.
                if (sirCvParams.TryGetValue("MS:1000796", out value))
                {
                    spectrum.Query.FileScan = TitleParser.GetValue(value);
                }
                else if (spectrumId.StartsWith("index=", StringComparison.Ordinal) ||
                         spectrumId.StartsWith("scan=", StringComparison.Ordinal))
                {
                    // No title available: take the number after '=' as the scan and
                    // fall back to the file-derived experiment name.
                    spectrum.Query.FileScan.Experimental = defaultExp;
                    spectrum.Query.FileScan.FirstScan    = int.Parse(spectrumId.StringAfter("="), invariant);
                    spectrum.Query.FileScan.LastScan     = spectrum.Query.FileScan.FirstScan;
                }
                else
                {
                    spectrum.Query.FileScan.Experimental = spectrumId;
                }

                // MS:1001115 is presumably the PSI-MS "scan number(s)" term - verify against the CV.
                // When present it overrides whatever FirstScan was derived above.
                if (sirCvParams.TryGetValue("MS:1001115", out value))
                {
                    spectrum.Query.FileScan.FirstScan = int.Parse(value, invariant);
                }

                if (spectrum.Query.FileScan.FirstScan == 0)
                {
                    throw new Exception(string.Format("Cannot find scan information in file {0}", fileName));
                }

                bool isFirstItem = true;

                foreach (var sit in items)
                {
                    if (isFirstItem) //spectrum-level values and scores are taken from the first item only
                    {
                        spectrum.Id             = sit.Attribute("id").Value;
                        spectrum.Charge         = int.Parse(sit.Attribute("chargeState").Value, invariant);
                        spectrum.TheoreticalMH  = PrecursorUtils.MzToMH(double.Parse(sit.Attribute("calculatedMassToCharge").Value, invariant), spectrum.Charge, true);
                        spectrum.ExperimentalMH = PrecursorUtils.MzToMH(double.Parse(sit.Attribute("experimentalMassToCharge").Value, invariant), spectrum.Charge, true);

                        var cvParams = GetCvParams(sit);
                        if (cvParams.TryGetValue("MS:1001121", out value))
                        {
                            spectrum.MatchedIonCount = int.Parse(value, invariant);
                        }

                        // The MS:1001362 value is added to the matched count to obtain the
                        // theoretical count, so this must run after MatchedIonCount is set.
                        if (cvParams.TryGetValue("MS:1001362", out value))
                        {
                            spectrum.TheoreticalIonCount = int.Parse(value, invariant) + spectrum.MatchedIonCount;
                        }

                        ParseScore(spectrum, cvParams);

                        var userParams = GetUserParams(sit);
                        ParseUserParams(spectrum, userParams);

                        isFirstItem = false;
                    }

                    var peptide = new IdentifiedPeptide(spectrum);
                    var pep_ref = sit.Attribute("peptide_ref").Value;
                    var pep     = peptideMap[pep_ref];

                    // pep.Modifications is stored in descending location order; Reverse()
                    // restores ascending order. These two spectrum-level values are
                    // overwritten for every item, so the last item wins.
                    spectrum.Modifications = (from m in pep.Modifications
                                              select string.Format("{0}:{1}", m.Location, m.Item.Name)).Reverse().Merge(",");
                    spectrum.NumMissedCleavages = pep.NumMissCleavage;

                    foreach (var per in sit.FindElements("PeptideEvidenceRef"))
                    {
                        var pe_ref = per.Attribute("peptideEvidence_ref").Value;
                        var pe     = peptideEvidenceMap[pe_ref];

                        // The flanking residues come from the evidence entry; with several
                        // evidence references the last one determines the stored sequence.
                        peptide.Sequence = pe.Pre + "." + pep.Sequence + "." + pe.Post;

                        var protein = proteinMap[pe.DbRef];
                        peptide.AddProtein(protein.Accession);
                    }
                }
            }

            return result;
        }
예제 #4
0
        /// <summary>
        /// Reads spectra through MascotPeptideTextFormat, filters them, updates their
        /// modifications, merges consecutive entries that share a title, and finally
        /// rebuilds each entry's file/scan information from that title.
        /// </summary>
        /// <param name="fileName">Path of the text file to read.</param>
        /// <returns>The merged list of spectra.</returns>
        public List <IIdentifiedSpectrum> ReadFromFile(string fileName)
        {
            var result = new MascotPeptideTextFormat().ReadFromFile(fileName);

            FilterSpectra(result);
            UpdateModifications(result);

            // Move the protein list out of the annotations and onto the peptide.
            foreach (var spectrum in result)
            {
                var proteinText = spectrum.Annotations[PROTEIN_KEY] as string;
                spectrum.Peptide.AssignProteins(proteinText.Split(';'));
                spectrum.Annotations.Remove(PROTEIN_KEY);
                spectrum.TheoreticalMass = spectrum.ExperimentalMass;
            }

            // Collapse runs of consecutive entries with the same title into the first entry of the run.
            var index = 0;
            while (index < result.Count - 1)
            {
                var currentTitle = result[index].Annotations[TITLE_KEY] as string;

                while (index < result.Count - 1 &&
                       currentTitle.Equals(result[index + 1].Annotations[TITLE_KEY] as string))
                {
                    // Walk the follower's peptides from the end; presumably AddPeptide may
                    // detach a peptide from its previous owner - confirm before changing the order.
                    for (int p = result[index + 1].Peptides.Count - 1; p >= 0; p--)
                    {
                        result[index].AddPeptide(result[index + 1].Peptides[p]);
                    }

                    result.RemoveAt(index + 1);
                }

                index++;
            }

            // Replace the FileScan of each entry with the one parsed from its title.
            foreach (var spectrum in result)
            {
                var title = spectrum.Annotations[TITLE_KEY] as string;
                spectrum.Annotations.Remove(TITLE_KEY);

                // The charge is carried over because the FileScan object is replaced.
                var charge = spectrum.Query.FileScan.Charge;
                spectrum.Query.FileScan        = TitleParser.GetValue(title);
                spectrum.Query.FileScan.Charge = charge;

                if (string.IsNullOrEmpty(spectrum.Query.FileScan.Experimental))
                {
                    spectrum.Query.FileScan.Experimental = Path.GetFileNameWithoutExtension(fileName);
                }

                // Only the part before '-' of the retention-time annotation is used.
                // NOTE(review): double.Parse uses the current culture here; confirm which
                // number format the text file is written in.
                var retention = spectrum.Annotations[RT_KEY] as string;
                if (!string.IsNullOrWhiteSpace(retention))
                {
                    spectrum.Query.FileScan.RetentionTime = double.Parse(retention.StringBefore("-"));
                }
                spectrum.Annotations.Remove(RT_KEY);
            }

            return result;
        }
예제 #5
0
        /// <summary>
        /// Loads a pepXML-style document and creates one identified spectrum for every
        /// spectrum_query element for which FindSearchHit returns a hit.
        /// </summary>
        /// <param name="fileName">Path of the XML file to load.</param>
        /// <returns>The spectra read from all msms_run_summary elements, in document order.</returns>
        /// <exception cref="Exception">Thrown when no TitleParser has been assigned.</exception>
        public virtual List <IIdentifiedSpectrum> ReadFromFile(string fileName)
        {
            if (this.TitleParser == null)
            {
                throw new Exception("Title format is not defined for parsing " + fileName);
            }

            // Integer attributes are machine-written; parse them independent of the thread culture.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            List <IIdentifiedSpectrum> result = new List <IIdentifiedSpectrum>();

            XElement root = XElement.Load(fileName);

            foreach (var msms_run_summary in root.FindDescendants("msms_run_summary"))
            {
                // Values shared by every spectrum of this run.
                var search_summary = msms_run_summary.FindFirstDescendant("search_summary");
                var engine         = search_summary.Attribute("search_engine").Value;
                var ismono         = search_summary.Attribute("precursor_mass_type").Value.Equals("monoisotopic");

                var enzyme        = ParseProtease(msms_run_summary.FindFirstDescendant("sample_enzyme"));
                var modifications = ParseModifications(msms_run_summary);

                foreach (var sp in msms_run_summary.FindDescendants("spectrum_query"))
                {
                    IdentifiedSpectrum sph = new IdentifiedSpectrum();

                    sph.IsPrecursorMonoisotopic = ismono;

                    // Prefer the spectrumNativeID attribute; fall back to spectrum when it is absent.
                    var titleAttribute = sp.Attribute("spectrumNativeID") ?? sp.Attribute("spectrum");
                    SequestFilename sf = TitleParser.GetValue(titleAttribute.Value);
                    sph.Query.FileScan.LongFileName = sf.LongFileName;

                    sph.ExperimentalMass = MyConvert.ToDouble(sp.Attribute("precursor_neutral_mass").Value);
                    sph.Query.Charge     = int.Parse(sp.Attribute("assumed_charge").Value, invariant);

                    var searchResult = sp.FindFirstDescendant("search_result");
                    var searchHit    = FindSearchHit(searchResult);

                    // Queries without a hit are dropped and never reach the result list.
                    if (searchHit == null)
                    {
                        continue;
                    }

                    sph.TheoreticalMass = MyConvert.ToDouble(searchHit.Attribute("calc_neutral_pep_mass").Value);
                    sph.MatchedIonCount = int.Parse(searchHit.Attribute("num_matched_ions").Value, invariant);
                    sph.Rank            = int.Parse(searchHit.Attribute("hit_rank").Value, invariant);

                    // tot_num_ions is optional; a missing attribute is recorded as 0.
                    var ticAtt = searchHit.Attribute("tot_num_ions");
                    sph.TheoreticalIonCount = ticAtt == null ? 0 : int.Parse(ticAtt.Value, invariant);

                    ParseSearchHit(sph, searchHit, modifications);

                    sph.Engine         = engine;
                    sph.DigestProtease = enzyme;

                    result.Add(sph);
                }
            }

            return result;
        }