/// <summary>
/// Checks that a title of the form "Cmpd N, ..." is parsed so that N is reported
/// as both the first and the last scan when the format has an empty raw-file
/// pattern and a scan pattern with a single capture group.
/// </summary>
public void TestCmpd2()
{
    // Build the title format: empty raw-file pattern, one-group scan-number pattern.
    ParserFormat format = new ParserFormat();
    format.FormatName = "TurboRAW2MGF, Cmpd";
    format.Add(new ParserItem("rawFile", ""));
    format.Add(new ParserItem("scanNumber", @"Cmpd\s*(\d+)\s*,"));

    TitleParser parser = new TitleParser(format);
    SequestFilename sf = parser.GetValue("Cmpd 2345, xxxxx");

    // Only one number is captured, so it is expected for both ends of the scan range.
    Assert.AreEqual(2345, sf.FirstScan);
    Assert.AreEqual(2345, sf.LastScan);
}
/// <summary>
/// Checks that a DTA-style title ("NAME.first.last.charge.dta") is parsed into the
/// experiment name, the first scan and the last scan.
/// </summary>
public void TestDta()
{
    // Build the title format: the raw-file pattern captures the leading name,
    // the scan-number pattern captures two numbers (first and last scan).
    ParserFormat format = new ParserFormat();
    format.FormatName = "TurboRAW2MGF, DTA Format";
    format.Add(new ParserItem("rawFile", @"(.+)\.\d+\.\d+\.\d\.(?:dta|DTA)"));
    format.Add(new ParserItem("scanNumber", @".+\.(\d+)\.(\d+)\.\d\.(?:dta|DTA)"));

    TitleParser parser = new TitleParser(format);
    SequestFilename sf = parser.GetValue("TEST.2345.2346.1.dta");

    Assert.AreEqual("TEST", sf.Experimental);
    Assert.AreEqual(2345, sf.FirstScan);
    Assert.AreEqual(2346, sf.LastScan);
}
/// <summary>
/// Loads an XML identification file and converts every SpectrumIdentificationResult
/// that still has items after <c>FilterItems</c> into an identified spectrum.
/// </summary>
/// <param name="fileName">Path of the XML file to load.</param>
/// <returns>The list of spectra built from the file, in document order.</returns>
/// <exception cref="Exception">
/// Thrown when, after all scan sources have been tried, a spectrum's first scan is still 0.
/// </exception>
public List<IIdentifiedSpectrum> ReadFromFile(string fileName)
{
    // Numbers in the XML are machine-written; parse them independently of the
    // current thread culture so that "1234.56" is not misread on comma-decimal locales.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    XElement root = XElement.Load(fileName);

    // NOTE(review): `name` is never read below. The lookup is kept unchanged in case
    // callers rely on it failing for files without this element chain - confirm before removing.
    var name = root.FindElement("AnalysisSoftwareList").
               FindElement("AnalysisSoftware").
               FindElement("SoftwareName").
               FindElement("cvParam").Attribute("name").Value;

    // Default experiment name: the file name with every known extension stripped.
    // Lower-casing is culture-invariant and the suffix match is ordinal so the
    // result does not depend on the machine locale.
    var defaultExp = Path.GetFileNameWithoutExtension(fileName);
    foreach (var ext in extensions)
    {
        if (defaultExp.ToLowerInvariant().EndsWith(ext, StringComparison.Ordinal))
        {
            defaultExp = defaultExp.Substring(0, defaultExp.Length - ext.Length);
        }
    }

    // Parse the identification protocol first.
    var protocols = root.FindElement("AnalysisProtocolCollection");
    var sip = protocols.FindElement("SpectrumIdentificationProtocol");
    var modMap = ParseSearchModificationMap(sip.FindElement("ModificationParams"));
    var proteases = ParseEnzymes(sip.FindElement("Enzymes"));
    var protease = proteases.FirstOrDefault();

    // Parse the sequence collection, including the protein <-> peptide map.
    var seqs = root.FindElement("SequenceCollection");

    var proteinMap = (from ele in seqs.FindElements("DBSequence")
                      let id = ele.Attribute("id").Value
                      let accession = ParseAccession(ele.Attribute("accession").Value)
                      let db = ele.Attribute("searchDatabase_ref").Value
                      select new { Id = id, Accession = accession, DB = db }).ToDictionary(m => m.Id);

    var peptideMap = (from ele in seqs.FindElements("Peptide")
                      let id = ele.Attribute("id").Value
                      let seq = ele.FindElement("PeptideSequence").Value
                      let mods = (from modEle in ele.FindElements("Modification")
                                  let mod = ParseModification(modEle, modMap)
                                  where mod != null
                                  orderby mod.Location descending
                                  select mod).ToArray()
                      // Without a protease the missed-cleavage count is reported as 0.
                      let numMiss = protease == null ? 0 : protease.GetMissCleavageSiteCount(seq)
                      select new MzIdentPeptideItem()
                      {
                          Id = id,
                          PureSequence = seq,
                          Modifications = mods,
                          Sequence = GetModifiedSequence(seq, mods),
                          NumMissCleavage = numMiss
                      }).ToDictionary(m => m.Id);

    // Evidence entries are grouped by id and only the first of each group is kept,
    // so a repeated id does not make ToDictionary throw.
    var peptideEvidenceMap = (from g in (from ele in seqs.FindElements("PeptideEvidence")
                                         select new MzIdentPeptideEvidenceItem()
                                         {
                                             Id = ele.Attribute("id").Value,
                                             PeptideRef = ele.Attribute("peptide_ref").Value,
                                             DbRef = ele.Attribute("dBSequence_ref").Value,
                                             Pre = ele.Attribute("pre").Value,
                                             Post = ele.Attribute("post").Value
                                         }).GroupBy(m => m.Id)
                              select g.First()).ToDictionary(m => m.Id);

    // Now parse the identification data itself.
    var data = root.FindElement("DataCollection");
    var result = new List<IIdentifiedSpectrum>();
    var analysisData = data.FindElement("AnalysisData");
    var idList = analysisData.FindElement("SpectrumIdentificationList");

    foreach (var sir in idList.FindElements("SpectrumIdentificationResult"))
    {
        var items = FilterItems(sir.FindElements("SpectrumIdentificationItem"), peptideMap, peptideEvidenceMap);
        if (items.Count == 0)
        {
            continue;
        }

        var spectrum = new IdentifiedSpectrum();
        result.Add(spectrum);

        var spectrumId = sir.Attribute("spectrumID").Value;
        var sirCvParams = GetCvParams(sir);
        string value;

        if (sirCvParams.TryGetValue("MS:1000796", out value))
        {
            // The value of this cvParam is handed to the title parser to obtain file/scan info.
            spectrum.Query.FileScan = TitleParser.GetValue(value);
        }
        else
        {
            if (spectrumId.StartsWith("index=", StringComparison.Ordinal) || spectrumId.StartsWith("scan=", StringComparison.Ordinal))
            {
                // "index=N" / "scan=N": use the default experiment name and N for both scan ends.
                spectrum.Query.FileScan.Experimental = defaultExp;
                spectrum.Query.FileScan.FirstScan = int.Parse(spectrumId.StringAfter("="), invariant);
                spectrum.Query.FileScan.LastScan = spectrum.Query.FileScan.FirstScan;
            }
            else
            {
                spectrum.Query.FileScan.Experimental = spectrumId;
            }
        }

        // When present, this cvParam overrides the first scan obtained above.
        if (sirCvParams.TryGetValue("MS:1001115", out value))
        {
            spectrum.Query.FileScan.FirstScan = int.Parse(value, invariant);
        }

        if (spectrum.Query.FileScan.FirstScan == 0)
        {
            throw new Exception(string.Format("Cannot find scan information in file {0}", fileName));
        }

        bool bFirst = true;
        foreach (var sit in items)
        {
            // Only parse the scores once, from the first item.
            if (bFirst)
            {
                spectrum.Id = sit.Attribute("id").Value;
                spectrum.Charge = int.Parse(sit.Attribute("chargeState").Value, invariant);
                spectrum.TheoreticalMH = PrecursorUtils.MzToMH(double.Parse(sit.Attribute("calculatedMassToCharge").Value, invariant), spectrum.Charge, true);
                spectrum.ExperimentalMH = PrecursorUtils.MzToMH(double.Parse(sit.Attribute("experimentalMassToCharge").Value, invariant), spectrum.Charge, true);

                var cvParams = GetCvParams(sit);
                if (cvParams.TryGetValue("MS:1001121", out value))
                {
                    spectrum.MatchedIonCount = int.Parse(value, invariant);
                }

                // This cvParam's value is added to the matched count to give the theoretical count.
                if (cvParams.TryGetValue("MS:1001362", out value))
                {
                    spectrum.TheoreticalIonCount = int.Parse(value, invariant) + spectrum.MatchedIonCount;
                }

                ParseScore(spectrum, cvParams);

                var userParams = GetUserParams(sit);
                ParseUserParams(spectrum, userParams);

                bFirst = false;
            }

            var peptide = new IdentifiedPeptide(spectrum);
            var pep_ref = sit.Attribute("peptide_ref").Value;
            var pep = peptideMap[pep_ref];

            // pep.Modifications is ordered by location descending; Reverse() lists them ascending.
            spectrum.Modifications = (from m in pep.Modifications
                                      select string.Format("{0}:{1}", m.Location, m.Item.Name)).Reverse().Merge(",");
            spectrum.NumMissedCleavages = pep.NumMissCleavage;

            foreach (var per in sit.FindElements("PeptideEvidenceRef"))
            {
                var pe_ref = per.Attribute("peptideEvidence_ref").Value;
                var pe = peptideEvidenceMap[pe_ref];

                // The sequence is overwritten for every evidence entry; the last one wins.
                peptide.Sequence = pe.Pre + "." + pep.Sequence + "." + pe.Post;

                var protein = proteinMap[pe.DbRef];
                peptide.AddProtein(protein.Accession);
            }
        }
    }

    return result;
}
/// <summary>
/// Reads spectra through <c>MascotPeptideTextFormat</c>, filters them, merges
/// neighbouring entries that share the same title annotation, and fills in the
/// file/scan information from that title.
/// </summary>
/// <param name="fileName">Path of the text file to read.</param>
/// <returns>The filtered and merged list of spectra.</returns>
public List<IIdentifiedSpectrum> ReadFromFile(string fileName)
{
    var result = new MascotPeptideTextFormat().ReadFromFile(fileName);
    FilterSpectra(result);
    UpdateModifications(result);

    // Move the ';'-separated protein annotation onto the peptide and copy the
    // experimental mass into the theoretical mass.
    foreach (var peptide in result)
    {
        peptide.Peptide.AssignProteins((peptide.Annotations[PROTEIN_KEY] as string).Split(';'));
        peptide.Annotations.Remove(PROTEIN_KEY);
        peptide.TheoreticalMass = peptide.ExperimentalMass;
    }

    // Merge runs of adjacent entries with an identical title into the first entry of the run.
    var i = 0;
    while (i < result.Count - 1)
    {
        var ititle = result[i].Annotations[TITLE_KEY] as string;
        while (i < result.Count - 1)
        {
            var jtitle = result[i + 1].Annotations[TITLE_KEY] as string;
            if (!ititle.Equals(jtitle))
            {
                // Different title: the next entry starts a new run.
                i++;
                break;
            }

            // Iterates from the end - presumably because AddPeptide may detach the
            // peptide from its current spectrum; confirm against AddPeptide.
            for (int l = result[i + 1].Peptides.Count - 1; l >= 0; l--)
            {
                result[i].AddPeptide(result[i + 1].Peptides[l]);
            }

            result.RemoveAt(i + 1);
        }
    }

    foreach (var peptide in result)
    {
        var title = peptide.Annotations[TITLE_KEY] as string;
        peptide.Annotations.Remove(TITLE_KEY);

        // Replacing FileScan would lose the charge, so carry it over.
        var oldCharge = peptide.Query.FileScan.Charge;
        peptide.Query.FileScan = TitleParser.GetValue(title);
        peptide.Query.FileScan.Charge = oldCharge;

        // Fall back to the file name when the title did not provide an experiment name.
        if (string.IsNullOrEmpty(peptide.Query.FileScan.Experimental))
        {
            peptide.Query.FileScan.Experimental = Path.GetFileNameWithoutExtension(fileName);
        }

        var rtstr = peptide.Annotations[RT_KEY] as string;
        if (!string.IsNullOrWhiteSpace(rtstr))
        {
            // Only the part before "-" is used. Parsed with the invariant culture so a
            // '.' decimal separator is read correctly regardless of the machine locale.
            // NOTE(review): assumes the input file always uses '.' as decimal separator - confirm.
            peptide.Query.FileScan.RetentionTime = double.Parse(rtstr.StringBefore("-"), System.Globalization.CultureInfo.InvariantCulture);
        }

        peptide.Annotations.Remove(RT_KEY);
    }

    return result;
}
/// <summary>
/// Loads an XML search-result file and builds one identified spectrum for every
/// spectrum_query for which <c>FindSearchHit</c> returns a hit.
/// </summary>
/// <param name="fileName">Path of the XML file to load.</param>
/// <returns>The spectra collected from all msms_run_summary elements.</returns>
/// <exception cref="Exception">Thrown when no title parser has been assigned.</exception>
public virtual List<IIdentifiedSpectrum> ReadFromFile(string fileName)
{
    if (this.TitleParser == null)
    {
        throw new Exception("Title format is not defined for parsing " + fileName);
    }

    var spectra = new List<IIdentifiedSpectrum>();
    XElement document = XElement.Load(fileName);

    foreach (var runSummary in document.FindDescendants("msms_run_summary"))
    {
        // Settings shared by every query of this run.
        var searchSummary = runSummary.FindFirstDescendant("search_summary");
        var engineName = searchSummary.Attribute("search_engine").Value;
        var isMonoisotopic = searchSummary.Attribute("precursor_mass_type").Value.Equals("monoisotopic");
        var digestEnzyme = ParseProtease(runSummary.FindFirstDescendant("sample_enzyme"));
        var runModifications = ParseModifications(runSummary);

        foreach (var query in runSummary.FindDescendants("spectrum_query"))
        {
            var spectrum = new IdentifiedSpectrum();
            spectrum.IsPrecursorMonoisotopic = isMonoisotopic;

            // Prefer the native id as the title source; otherwise use the "spectrum" attribute.
            var titleAttribute = query.Attribute("spectrumNativeID") ?? query.Attribute("spectrum");
            SequestFilename parsedTitle = TitleParser.GetValue(titleAttribute.Value);

            spectrum.Query.FileScan.LongFileName = parsedTitle.LongFileName;
            spectrum.ExperimentalMass = MyConvert.ToDouble(query.Attribute("precursor_neutral_mass").Value);
            spectrum.Query.Charge = int.Parse(query.Attribute("assumed_charge").Value);

            var hit = FindSearchHit(query.FindFirstDescendant("search_result"));
            if (hit == null)
            {
                // No usable hit: the query is not added to the result.
                continue;
            }

            spectrum.TheoreticalMass = MyConvert.ToDouble(hit.Attribute("calc_neutral_pep_mass").Value);
            spectrum.MatchedIonCount = int.Parse(hit.Attribute("num_matched_ions").Value);
            spectrum.Rank = int.Parse(hit.Attribute("hit_rank").Value);

            // tot_num_ions may be absent; the count is then reported as 0.
            var totalIonsAttribute = hit.Attribute("tot_num_ions");
            if (totalIonsAttribute == null)
            {
                spectrum.TheoreticalIonCount = 0;
            }
            else
            {
                spectrum.TheoreticalIonCount = int.Parse(totalIonsAttribute.Value);
            }

            ParseSearchHit(spectrum, hit, runModifications);

            spectrum.Engine = engineName;
            spectrum.DigestProtease = digestEnzyme;

            spectra.Add(spectrum);
        }
    }

    return spectra;
}