/// <summary>
/// Builds a small fixture of three proteins sharing two peptides:
/// Protein1 holds both peptides, Protein2 only the first, Protein3 only the second.
/// </summary>
/// <returns>The proteins in the order Protein3, Protein2, Protein1.</returns>
private static List<IIdentifiedProtein> InitProteins()
{
    // First spectrum/peptide pair.
    var firstSpectrum = new IdentifiedSpectrum();
    firstSpectrum.Query.FileScan.Experimental = "EXP1";
    var firstPeptide = new IdentifiedPeptide(firstSpectrum);
    firstPeptide.AddProtein("Protein1");
    firstPeptide.AddProtein("Protein2");
    firstPeptide.Sequence = "SEQ1";

    // Second spectrum/peptide pair.
    var secondSpectrum = new IdentifiedSpectrum();
    secondSpectrum.Query.FileScan.Experimental = "EXP2";
    var secondPeptide = new IdentifiedPeptide(secondSpectrum);
    secondPeptide.AddProtein("Protein1");
    secondPeptide.AddProtein("Protein3");
    secondPeptide.Sequence = "SEQ2";

    var protein1 = new IdentifiedProtein("Protein1");
    protein1.Peptides.Add(firstPeptide);
    protein1.Peptides.Add(secondPeptide);

    var protein2 = new IdentifiedProtein("Protein2");
    protein2.Peptides.Add(firstPeptide);

    var protein3 = new IdentifiedProtein("Protein3");
    protein3.Peptides.Add(secondPeptide);

    // Deliberately returned in reverse name order.
    return new List<IIdentifiedProtein> { protein3, protein2, protein1 };
}
/// <summary>
/// Reads one search_hit element into a new peptide attached to <paramref name="sph"/>:
/// sequence (with modification characters and flanking residues), cleavage
/// counts, the protein plus any alternative proteins, and finally the scores.
/// </summary>
/// <param name="sph">Spectrum that receives the peptide and the parsed values.</param>
/// <param name="searchHit">The search_hit XML element.</param>
/// <param name="ppmods">Modification definitions passed to FindModificationChar.</param>
private void ParseSearchHit(IIdentifiedSpectrum sph, XElement searchHit, PepXmlModifications ppmods)
{
    var peptide = new IdentifiedPeptide(sph);

    var modInfo = searchHit.FindFirstDescendant("modification_info");
    string sequence = searchHit.Attribute("peptide").Value;

    if (modInfo != null)
    {
        var modifiedPeptide = modInfo.Attribute("modified_peptide");
        bool takeModifiedPeptide = modifiedPeptide != null && !modReg.Match(modifiedPeptide.Value).Success;

        if (takeModifiedPeptide)
        {
            sequence = modifiedPeptide.Value;
        }
        else
        {
            var unmodified = sequence;
            var modifiedResidues = PeptideProphetUtils.ParseModificationAminoacidMass(modInfo);
            if (modifiedResidues != null && modifiedResidues.Count > 0)
            {
                // Reversed before inserting, so an insertion does not shift the
                // positions of the entries still to be processed.
                modifiedResidues.Reverse();
                foreach (var residue in modifiedResidues)
                {
                    string marker = FindModificationChar(ppmods, residue, unmodified);
                    sequence = sequence.Insert(residue.Position, marker);
                }
            }
        }
    }

    // Flanking residues are only written when the previous residue is given.
    var previousResidue = searchHit.Attribute("peptide_prev_aa");
    if (previousResidue == null)
    {
        peptide.Sequence = sequence;
    }
    else
    {
        peptide.Sequence = previousResidue.Value + "." + sequence + "." + searchHit.Attribute("peptide_next_aa").Value;
    }

    sph.NumMissedCleavages = GetAttributeValue(searchHit, "num_missed_cleavages", 0);
    sph.NumProteaseTermini = GetAttributeValue(searchHit, "num_tol_term", 2);

    peptide.AddProtein(searchHit.Attribute("protein").Value);

    var totalProteins = int.Parse(searchHit.Attribute("num_tot_proteins").Value);
    if (totalProteins > 1)
    {
        foreach (var alternative in searchHit.FindDescendants("alternative_protein"))
        {
            peptide.AddProtein(alternative.Attribute("protein").Value);
        }
    }

    ParseScoreAndOtherInformation(sph, searchHit);
}
/// <summary>
/// Round-trips the "Reference" column: reading joins peptides with " ! " and
/// proteins with "/", and writing redistributes the proteins over the peptides.
/// </summary>
public void Test()
{
    IPropertyConverter<IdentifiedSpectrum> converter = new IdentifiedSpectrumReferenceConverter<IdentifiedSpectrum>();

    var spectrum = new IdentifiedSpectrum();

    var singleProteinPeptide = new IdentifiedPeptide(spectrum);
    singleProteinPeptide.AddProtein("11111");

    var twoProteinPeptide = new IdentifiedPeptide(spectrum);
    twoProteinPeptide.AddProtein("22222");
    twoProteinPeptide.AddProtein("33333");

    // Reading.
    Assert.AreEqual("Reference", converter.Name);
    Assert.AreEqual("11111 ! 22222/33333", converter.GetProperty(spectrum));

    // Writing: the first peptide now has two proteins, the second has one.
    converter.SetProperty(spectrum, "44444/55555 ! 66666");

    Assert.AreEqual(2, spectrum.Peptides[0].Proteins.Count);
    Assert.AreEqual("44444", spectrum.Peptides[0].Proteins[0]);
    Assert.AreEqual("55555", spectrum.Peptides[0].Proteins[1]);

    Assert.AreEqual(1, spectrum.Peptides[1].Proteins.Count);
    Assert.AreEqual("66666", spectrum.Peptides[1].Proteins[0]);
}
/// <summary>
/// Checks that GetProteinString joins the proteins of one peptide with "/"
/// and separates peptides with " ! ".
/// </summary>
public void TestGetProteinString()
{
    var spectrum = new IdentifiedSpectrum();

    var firstPeptide = new IdentifiedPeptide(spectrum);
    firstPeptide.AddProtein("P1");

    var secondPeptide = new IdentifiedPeptide(spectrum);
    secondPeptide.AddProtein("P2");
    secondPeptide.AddProtein("P3");

    string actual = MascotPeptideHitTextWriter.GetProteinString(spectrum);

    Assert.AreEqual("P1 ! P2/P3", actual);
}
/// <summary>
/// Two spectra whose peptides share "Protein1" must yield three proteins:
/// Protein1 with both peptides, Protein2 and Protein3 with one peptide each.
/// </summary>
public void TestBuildProteins()
{
    var spectrumA = new IdentifiedSpectrum();
    spectrumA.Query.FileScan.Experimental = "EXP1";
    var peptideA = new IdentifiedPeptide(spectrumA);
    peptideA.AddProtein("Protein1");
    peptideA.AddProtein("Protein2");

    var spectrumB = new IdentifiedSpectrum();
    spectrumB.Query.FileScan.Experimental = "EXP2";
    var peptideB = new IdentifiedPeptide(spectrumB);
    peptideB.AddProtein("Protein1");
    peptideB.AddProtein("Protein3");

    var spectra = new List<IIdentifiedSpectrum> { spectrumA, spectrumB };

    List<IIdentifiedProtein> proteins = MascotUtils.BuildProteins(spectra);

    Assert.AreEqual(3, proteins.Count);
    foreach (IdentifiedProtein protein in proteins)
    {
        switch (protein.Name)
        {
            case "Protein1":
                Assert.AreEqual(2, protein.Peptides.Count);
                break;
            case "Protein2":
                Assert.AreEqual(1, protein.Peptides.Count);
                Assert.AreEqual(peptideA, protein.Peptides[0]);
                break;
            case "Protein3":
                Assert.AreEqual(1, protein.Peptides.Count);
                Assert.AreEqual(peptideB, protein.Peptides[0]);
                break;
        }
    }
}
/// <summary>
/// Distributes the protein names in <paramref name="value"/> over the peptides of
/// <paramref name="t"/>. The value is first split into one group per peptide with
/// <c>reg</c>; each group is then split into protein names with <c>chars</c>.
/// </summary>
/// <param name="t">Spectrum whose peptides receive the protein names.</param>
/// <param name="value">Text holding the protein groups.</param>
public override void SetProperty(T t, string value)
{
    string[] groups = reg.Split(value);

    if (t.Peptides.Count == groups.Length)
    {
        // Same number of peptides: keep them and only replace their proteins.
        for (int index = 0; index < groups.Length; index++)
        {
            string[] names = groups[index].Split(chars);
            var target = t.Peptides[index];
            target.ClearProteins();
            foreach (string name in names)
            {
                target.AddProtein(name);
            }
        }
        return;
    }

    // Peptide count differs: discard the peptides and create one per group.
    t.ClearPeptides();
    foreach (string group in groups)
    {
        IIdentifiedPeptide created = new IdentifiedPeptide(t);
        foreach (string name in group.Split(chars))
        {
            created.AddProtein(name);
        }
    }
}
/// <summary>
/// Reads every <c>psm</c> element below <c>psms</c> in the given XML file and
/// converts it into an identified spectrum carrying one peptide.
/// </summary>
/// <param name="fileName">Path of the XML file to load.</param>
/// <returns>One spectrum per <c>psm</c> element, in document order.</returns>
public List<IIdentifiedSpectrum> ReadFromFile(string fileName)
{
    // The numbers are machine-written XML values, so they must not be parsed with
    // the current thread culture (a ',' decimal-separator locale would misread them).
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    var result = new List<IIdentifiedSpectrum>();

    XElement root = XElement.Load(fileName);
    var psms = root.FindElement("psms").FindElements("psm");
    foreach (var psm in psms)
    {
        IIdentifiedSpectrum spec = new IdentifiedSpectrum();

        // Only the part of the id after "decoy_" is kept (via StringAfter).
        spec.Id = psm.FindAttribute("psm_id").Value.StringAfter("decoy_");
        spec.FromDecoy = psm.FindAttribute("decoy").Value.Equals("true");

        spec.SpScore = double.Parse(psm.FindElement("svm_score").Value, invariant);
        spec.QValue = double.Parse(psm.FindElement("q_value").Value, invariant);
        spec.Score = double.Parse(psm.FindElement("pep").Value, invariant);
        spec.Probability = double.Parse(psm.FindElement("p_value").Value, invariant);
        spec.TheoreticalMH = double.Parse(psm.FindElement("calc_mass").Value, invariant);

        // The experiment name is the file name up to the first '.'.
        spec.Query.FileScan.Experimental = Path.GetFileName(fileName).StringBefore(".");

        var pep = new IdentifiedPeptide(spec);
        var pepseq = psm.FindElement("peptide_seq");
        pep.Sequence = pepseq.FindAttribute("seq").Value;
        pep.AddProtein(psm.FindElement("protein_id").Value);

        result.Add(spec);
    }

    return result;
}
/// <summary>
/// Verifies that BuildProteins groups peptides by protein name: the shared
/// "Protein1" gets both peptides, the other two proteins get one peptide each.
/// </summary>
public void TestBuildProteins()
{
    // Arrange: two spectra, one peptide each, both pointing at Protein1.
    var hit1 = new IdentifiedSpectrum();
    hit1.Query.FileScan.Experimental = "EXP1";
    var pep1 = new IdentifiedPeptide(hit1);
    pep1.AddProtein("Protein1");
    pep1.AddProtein("Protein2");

    var hit2 = new IdentifiedSpectrum();
    hit2.Query.FileScan.Experimental = "EXP2";
    var pep2 = new IdentifiedPeptide(hit2);
    pep2.AddProtein("Protein1");
    pep2.AddProtein("Protein3");

    var hits = new List<IIdentifiedSpectrum>();
    hits.AddRange(new IIdentifiedSpectrum[] { hit1, hit2 });

    // Act
    List<IIdentifiedProtein> built = MascotUtils.BuildProteins(hits);

    // Assert
    Assert.AreEqual(3, built.Count);
    foreach (IdentifiedProtein current in built)
    {
        string proteinName = current.Name;
        if (proteinName.Equals("Protein1"))
        {
            Assert.AreEqual(2, current.Peptides.Count);
        }
        else if (proteinName.Equals("Protein2"))
        {
            Assert.AreEqual(1, current.Peptides.Count);
            Assert.AreEqual(pep1, current.Peptides[0]);
        }
        else if (proteinName.Equals("Protein3"))
        {
            Assert.AreEqual(1, current.Peptides.Count);
            Assert.AreEqual(pep2, current.Peptides[0]);
        }
    }
}
/// <summary>
/// Checks that a tab inside a protein name is stored as a space, both when the
/// name is added and when it is replaced via SetProtein.
/// </summary>
public void TestAddProtein()
{
    var peptide = new IdentifiedPeptide(new IdentifiedSpectrum());

    // Adding a name that contains a tab.
    peptide.AddProtein("AAAAA\tBBBBB");
    Assert.AreEqual(1, peptide.Proteins.Count);
    Assert.AreEqual("AAAAA BBBBB", peptide.Proteins[0]);

    // Replacing it with another name that contains a tab.
    peptide.SetProtein(0, "CCCCC\tDDDDD");
    Assert.AreEqual(1, peptide.Proteins.Count);
    Assert.AreEqual("CCCCC DDDDD", peptide.Proteins[0]);
}
/// <summary>
/// Creates the protein fixture used by the tests: two peptides (SEQ1 from EXP1,
/// SEQ2 from EXP2) spread over three proteins.
/// </summary>
/// <returns>A list containing Protein3, Protein2 and Protein1, in that order.</returns>
private static List<IIdentifiedProtein> InitProteins()
{
    var spectrumOfSeq1 = new IdentifiedSpectrum();
    spectrumOfSeq1.Query.FileScan.Experimental = "EXP1";

    var seq1 = new IdentifiedPeptide(spectrumOfSeq1);
    seq1.AddProtein("Protein1");
    seq1.AddProtein("Protein2");
    seq1.Sequence = "SEQ1";

    var spectrumOfSeq2 = new IdentifiedSpectrum();
    spectrumOfSeq2.Query.FileScan.Experimental = "EXP2";

    var seq2 = new IdentifiedPeptide(spectrumOfSeq2);
    seq2.AddProtein("Protein1");
    seq2.AddProtein("Protein3");
    seq2.Sequence = "SEQ2";

    // Protein1 is shared by both peptides.
    var shared = new IdentifiedProtein("Protein1");
    shared.Peptides.Add(seq1);
    shared.Peptides.Add(seq2);

    var onlySeq1 = new IdentifiedProtein("Protein2");
    onlySeq1.Peptides.Add(seq1);

    var onlySeq2 = new IdentifiedProtein("Protein3");
    onlySeq2.Peptides.Add(seq2);

    var proteins = new List<IIdentifiedProtein>();
    proteins.AddRange(new IIdentifiedProtein[] { onlySeq2, onlySeq1, shared });
    return proteins;
}
/// <summary>
/// Builds a composite converter from a header line, then checks that a
/// spectrum with two peptides is written as the expected text line and that a
/// line set through the converter is read back unchanged.
/// </summary>
public void TestNoredundant()
{
    string header = "\t\"File, Scan(s)\"\tSequence\tMH+\tDiff(MH+)\tCharge\tRank\tScore\tDeltaScore\tExpectValue\tQuery\tIons\tReference\tDIFF_MODIFIED_CANDIDATE\tPI\tMissCleavage\tModification";

    IPropertyConverter<IIdentifiedSpectrum> converter = IdentifiedSpectrumPropertyConverterFactory.GetInstance().GetConverters(header, '\t');
    Assert.AreEqual(header, converter.Name);

    IIdentifiedSpectrum spectrum = new IdentifiedSpectrum();
    spectrum.Query.FileScan.ShortFileName = "AAA,1-2";

    var firstPeptide = new IdentifiedPeptide(spectrum);
    firstPeptide.Sequence = "AAAAA";
    firstPeptide.AddProtein("PROTEIN1");
    firstPeptide.AddProtein("PROTEIN2");

    var secondPeptide = new IdentifiedPeptide(spectrum);
    secondPeptide.Sequence = "BBBBB";
    secondPeptide.AddProtein("PROTEIN3");

    spectrum.TheoreticalMH = 1000.00102;
    spectrum.ExperimentalMH = 1000.0;
    spectrum.Query.Charge = 2;
    spectrum.Rank = 1;
    spectrum.Score = 100.2;
    spectrum.DeltaScore = 0.5;
    spectrum.ExpectValue = 1.1e-2;
    spectrum.Query.QueryId = 10;
    spectrum.NumMissedCleavages = 1;
    spectrum.Modifications = "O18(1)";

    // Writing: the spectrum is rendered as one line.
    string expectedLine = " AAA,1 - 2 AAAAA ! BBBBB 1000.00102 0.00102 2 1 100.2 0.5 1.10E-002 10 0|0 PROTEIN1/PROTEIN2 ! PROTEIN3 0.00 1 O18(1)";
    Assert.AreEqual(expectedLine, converter.GetProperty(spectrum));

    // Reading: a new line is applied and must round-trip.
    string replacementLine = " BBB,2 - 3 BBBBB 1002.00783 -0.00200 3 2 200.2 0.6 1.20E-003 20 0|0 PROTEIN2/PROTEIN4 0.00 2 O18(2)";
    converter.SetProperty(spectrum, replacementLine);
    Assert.AreEqual(replacementLine, converter.GetProperty(spectrum));
}
/// <summary>
/// Exercises the reference converter in both directions: GetProperty renders
/// the proteins of all peptides, SetProperty assigns new proteins per peptide.
/// </summary>
public void Test()
{
    // Arrange
    var hit = new IdentifiedSpectrum();

    var pepA = new IdentifiedPeptide(hit);
    pepA.AddProtein("11111");

    var pepB = new IdentifiedPeptide(hit);
    pepB.AddProtein("22222");
    pepB.AddProtein("33333");

    IPropertyConverter<IdentifiedSpectrum> referenceConverter = new IdentifiedSpectrumReferenceConverter<IdentifiedSpectrum>();

    // Act + assert: read direction.
    Assert.AreEqual("Reference", referenceConverter.Name);
    Assert.AreEqual("11111 ! 22222/33333", referenceConverter.GetProperty(hit));

    // Act + assert: write direction.
    referenceConverter.SetProperty(hit, "44444/55555 ! 66666");

    var firstProteins = hit.Peptides[0].Proteins;
    Assert.AreEqual(2, firstProteins.Count);
    Assert.AreEqual("44444", firstProteins[0]);
    Assert.AreEqual("55555", firstProteins[1]);

    var secondProteins = hit.Peptides[1].Proteins;
    Assert.AreEqual(1, secondProteins.Count);
    Assert.AreEqual("66666", secondProteins[0]);
}
/// <summary>
/// Parses one hit line of an out file into <paramref name="entry"/>.
/// Example line:
/// <c>1. 1 / 1 0 1964.9940 0.0000 5.6970 2133.9 21/30 sw|P02666|CASBBOVIN +1 K.FQSEEQQQTEDELQDK.I</c>
/// </summary>
/// <param name="line">The raw text line.</param>
/// <param name="entry">Spectrum that receives the parsed values and a new peptide.</param>
/// <returns>
/// <c>false</c> when the line has too few fields (nothing but
/// <c>IsProteinFromOutFile</c> is modified); otherwise <c>true</c>.
/// </returns>
protected bool ParseFromOutfileLine(string line, IdentifiedSpectrum entry)
{
    entry.IsProteinFromOutFile = true;

    // '/' is turned into a space so that "1 / 1" and "21/30" split into separate fields.
    string sLine = line.Trim().Replace('/', ' ');
    string[] sLines = this.reg.Split(sLine);
    if (sLines.Length < itemIndex.MinCount)
    {
        return false;
    }

    // A field starting with '+' at the sequence position is a duplicate count
    // ("+1"); the sequence then sits in the following field.
    string sequenceField = sLines[itemIndex.SequenceIndex];
    bool hasDuplicatedCount = '+' == sequenceField[0];
    if (hasDuplicatedCount && itemIndex.SequenceIndex + 1 >= sLines.Length)
    {
        // The shifted sequence field is missing: report a failed parse instead of
        // running past the end of the array.
        return false;
    }

    entry.Rank = int.Parse(sLines[itemIndex.RankIndex]);
    entry.SpRank = int.Parse(sLines[itemIndex.SpRankIndex]);
    entry.TheoreticalMH = MyConvert.ToDouble(sLines[itemIndex.TheoreticalMHIndex]);
    entry.DeltaScore = MyConvert.ToDouble(sLines[itemIndex.DeltaScoreIndex]);
    entry.Score = MyConvert.ToDouble(sLines[itemIndex.ScoreIndex]);
    entry.SpScore = MyConvert.ToDouble(sLines[itemIndex.SpScoreIndex]);
    entry.MatchedIonCount = int.Parse(sLines[itemIndex.MatchedIonCountIndex]);
    entry.TheoreticalIonCount = int.Parse(sLines[itemIndex.TheoreticalIonCountIndex]);
    entry.ClearPeptides();

    string sequence;
    if (hasDuplicatedCount)
    {
        entry.DuplicatedCount = int.Parse(sequenceField.Substring(1));
        sequence = sLines[itemIndex.SequenceIndex + 1];
    }
    else
    {
        entry.DuplicatedCount = 0;
        sequence = sequenceField;
    }

    CheckSequenceValid(ref sequence);

    var sp = new IdentifiedPeptide(entry);
    sp.Sequence = sequence;
    sp.AddProtein(sLines[itemIndex.ProteinIndex]);

    return true;
}
/// <summary>
/// Fills <paramref name="entry"/> from the already split fields of an out-file
/// hit line that carries an id column. Fields 0 and 3 are not used.
/// </summary>
/// <param name="sLines">The fields of the line, in file order.</param>
/// <param name="entry">Spectrum that receives the parsed values and a new peptide.</param>
/// <returns>
/// <c>false</c> (leaving <paramref name="entry"/> untouched) when there are too
/// few fields; otherwise <c>true</c>.
/// </returns>
protected bool ParseFromOutfileLineWithId(List<string> sLines, IdentifiedSpectrum entry)
{
    if (sLines.Count < 12)
    {
        return false;
    }

    // Field 11 is either the sequence or a "+N" duplicate count; in the latter
    // case the sequence is field 12, which therefore has to exist as well.
    bool hasDuplicatedCount = '+' == sLines[11][0];
    if (hasDuplicatedCount && sLines.Count < 13)
    {
        return false;
    }

    entry.Rank = int.Parse(sLines[1]);
    entry.SpRank = int.Parse(sLines[2]);
    entry.TheoreticalMH = MyConvert.ToDouble(sLines[4]);
    entry.DeltaScore = MyConvert.ToDouble(sLines[5]);
    entry.Score = MyConvert.ToDouble(sLines[6]);
    entry.SpScore = MyConvert.ToDouble(sLines[7]);
    entry.MatchedIonCount = int.Parse(sLines[8]);
    entry.TheoreticalIonCount = int.Parse(sLines[9]);
    entry.ClearPeptides();

    string sequence;
    if (hasDuplicatedCount)
    {
        entry.DuplicatedCount = int.Parse(sLines[11].Substring(1));
        sequence = sLines[12];
    }
    else
    {
        entry.DuplicatedCount = 0;
        sequence = sLines[11];
    }

    CheckSequenceValid(ref sequence);

    var sp = new IdentifiedPeptide(entry);
    sp.Sequence = sequence;
    sp.AddProtein(sLines[10]);

    return true;
}
/// <summary>
/// Reads all <c>peptideSpectrumMatch</c> elements of every <c>fragSpectrumScan</c>
/// in the given XML file and converts each into an identified spectrum with one
/// peptide.
/// </summary>
/// <param name="fileName">Path of the XML file to load.</param>
/// <returns>One spectrum per peptide-spectrum match, in document order.</returns>
public List<IIdentifiedSpectrum> ReadFromFile(string fileName)
{
    // The numbers are machine-written XML values, so they must not be parsed with
    // the current thread culture (a ',' decimal-separator locale would misread them).
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    var result = new List<IIdentifiedSpectrum>();

    XElement root = XElement.Load(fileName);

    // Locate the position of the missed-cleavage feature within the feature list.
    var features = root.FindElement("featureDescriptions");
    var descriptions = features.FindElements("featureDescription");
    var missIndex = FindIndex(fileName, descriptions, "# Missed Cleavages");

    var scans = root.FindElements("fragSpectrumScan");
    foreach (var scan in scans)
    {
        var scanNumber = int.Parse(scan.FindAttribute("scanNumber").Value);

        var psms = scan.FindElements("peptideSpectrumMatch");
        foreach (var psm in psms)
        {
            IIdentifiedSpectrum spec = new IdentifiedSpectrum();
            spec.Query.QueryId = scanNumber;

            // Only the part of the id after "decoy_" is kept (via StringAfter).
            spec.Id = psm.FindAttribute("id").Value.StringAfter("decoy_");
            spec.FromDecoy = psm.FindAttribute("isDecoy").Value.Equals("true");

            // NOTE(review): the mass-to-charge attributes are stored in the MH
            // properties without conversion - confirm this is intended.
            spec.TheoreticalMH = double.Parse(psm.FindAttribute("calculatedMassToCharge").Value, invariant);
            spec.ExperimentalMH = double.Parse(psm.FindAttribute("experimentalMassToCharge").Value, invariant);
            spec.Query.Charge = int.Parse(psm.FindAttribute("chargeState").Value);

            var pep = new IdentifiedPeptide(spec);
            pep.Sequence = psm.FindElement("peptide").FindElement("peptideSequence").Value;
            pep.AddProtein(psm.FindElement("occurence").FindAttribute("proteinId").Value);

            var featureEles = psm.FindElement("features").FindElements("feature");

            // The first feature is the score.
            spec.Score = double.Parse(featureEles[0].Value, invariant);
            spec.NumMissedCleavages = int.Parse(featureEles[missIndex].Value);

            result.Add(spec);
        }
    }

    return result;
}
/// <summary>
/// Reads every <c>peptide</c> element below <c>peptides</c> in the given XML file
/// and converts it into an identified spectrum carrying one peptide.
/// </summary>
/// <param name="fileName">Path of the XML file to load.</param>
/// <returns>One spectrum per <c>peptide</c> element, in document order.</returns>
public List<IIdentifiedSpectrum> ReadFromFile(string fileName)
{
    // The numbers are machine-written XML values, so they must not be parsed with
    // the current thread culture (a ',' decimal-separator locale would misread them).
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    var result = new List<IIdentifiedSpectrum>();

    XElement root = XElement.Load(fileName);
    var peptides = root.FindElement("peptides").FindElements("peptide");
    foreach (var peptide in peptides)
    {
        IIdentifiedSpectrum spec = new IdentifiedSpectrum();

        // The peptide id attribute is used as the sequence.
        var pep = new IdentifiedPeptide(spec);
        pep.Sequence = peptide.FindAttribute("peptide_id").Value;

        spec.FromDecoy = peptide.FindAttribute("decoy").Value.Equals("true");
        spec.SpScore = double.Parse(peptide.FindElement("svm_score").Value, invariant);
        spec.QValue = double.Parse(peptide.FindElement("q_value").Value, invariant);
        spec.Score = double.Parse(peptide.FindElement("pep").Value, invariant);
        spec.TheoreticalMass = double.Parse(peptide.FindElement("calc_mass").Value, invariant);

        pep.AddProtein(peptide.FindElement("protein_id").Value);

        spec.Probability = double.Parse(peptide.FindElement("p_value").Value, invariant);

        result.Add(spec);
    }

    return result;
}
/// <summary>
/// Parses a tab-separated export in which peptide rows are followed, for every
/// protein seen for the first time, by two further lines whose second line
/// holds the protein details. Each accepted peptide row becomes a peptide (with
/// its own spectrum) attached to its protein.
/// </summary>
/// <param name="fileName">Path of the tab-separated text file.</param>
/// <returns>
/// The distinct proteins found in the file; an empty list when the file has no
/// header line.
/// </returns>
/// <exception cref="InvalidDataException">
/// The file ends before the detail line of a newly seen protein.
/// </exception>
public override List<IIdentifiedProtein> ParseProteins(string fileName)
{
    var proteinMap = new Dictionary<string, IIdentifiedProtein>();

    using (StreamReader sr = new StreamReader(fileName))
    {
        string line = sr.ReadLine();
        if (line == null)
        {
            // Completely empty file: nothing to parse.
            return new List<IIdentifiedProtein>();
        }

        // Resolve the column positions from the header line.
        // NOTE(review): the "XCorr" column is never read here, so spectrum.Score
        // is not populated by this parser - confirm whether that is intended.
        string[] headerParts = line.Split('\t');
        int seqIndex = Array.FindIndex(headerParts, (m => m == "Sequence"));
        int proIndex = Array.FindIndex(headerParts, (m => m == "Protein Accessions"));
        int modIndex = Array.FindIndex(headerParts, (m => m == "Modifications"));
        int deltaIndex = Array.FindIndex(headerParts, (m => m.EndsWith(" Score")));
        int chargeIndex = Array.FindIndex(headerParts, (m => m == "Charge"));
        int obsIndex = Array.FindIndex(headerParts, (m => m == "m/z [Da]"));
        int mhIndex = Array.FindIndex(headerParts, (m => m == "MH+ [Da]"));
        int fscanIndex = Array.FindIndex(headerParts, (m => m == "First Scan"));
        int lscanIndex = Array.FindIndex(headerParts, (m => m == "Last Scan"));
        int ionIndex = Array.FindIndex(headerParts, (m => m == "Ions Matched"));
        int fileIndex = Array.FindIndex(headerParts, (m => m == "Spectrum File"));

        Progress.SetRange(0, sr.BaseStream.Length);
        Progress.SetMessage("Parsing file ...");

        while ((line = sr.ReadLine()) != null)
        {
            // A blank line ends the data.
            if (line.Trim().Length == 0)
            {
                break;
            }

            string[] parts = line.Split('\t');
            if (parts[0].Length == 0)
            {
                continue;
            }

            Progress.SetPosition(sr.BaseStream.Position);

            string seq = parts[seqIndex];
            string deltaCn = parts[deltaIndex];
            if (deltaCn.Length == 0) // rank > 1
            {
                continue;
            }

            string protein = parts[proIndex];

            IIdentifiedProtein pro;
            if (!proteinMap.TryGetValue(protein, out pro))
            {
                // First occurrence of this protein: skip one line, then read the
                // protein detail line.
                sr.ReadLine();
                string proLine = sr.ReadLine();
                if (proLine == null)
                {
                    throw new InvalidDataException(
                        string.Format("Unexpected end of file {0} while reading details of protein {1}", fileName, protein));
                }

                string[] proParts = proLine.Split('\t');

                var p = new IdentifiedProtein(protein);
                p.Coverage = MyConvert.ToDouble(proParts[2]);
                p.MolecularWeight = MyConvert.ToDouble(proParts[5]) * 1000;
                p.IsoelectricPoint = MyConvert.ToDouble(proParts[6]);
                p.Score = MyConvert.ToDouble(proParts[7]);
                p.Description = proParts[8];

                proteinMap[protein] = p;
                pro = p;
            }

            IdentifiedSpectrum spectrum = new IdentifiedSpectrum();
            IdentifiedPeptide peptide = new IdentifiedPeptide(spectrum);
            peptide.Sequence = seq.ToUpper();
            peptide.AddProtein(protein);

            spectrum.Modifications = parts[modIndex];
            spectrum.DeltaScore = MyConvert.ToDouble(deltaCn);
            spectrum.Charge = Convert.ToInt32(parts[chargeIndex]);
            spectrum.ObservedMz = MyConvert.ToDouble(parts[obsIndex]);
            spectrum.TheoreticalMH = MyConvert.ToDouble(parts[mhIndex]);
            spectrum.Ions = parts[ionIndex];
            spectrum.Query.FileScan.FirstScan = Convert.ToInt32(parts[fscanIndex]);
            spectrum.Query.FileScan.LastScan = Convert.ToInt32(parts[lscanIndex]);
            spectrum.Query.FileScan.Experimental = FileUtils.RemoveAllExtension(parts[fileIndex]);

            pro.Peptides.Add(peptide);
        }
    }

    var proteins = proteinMap.Values.ToList();
    return proteins;
}
/// <summary>
/// Reads an mzIdentML-style XML file: the identification protocol
/// (modifications, enzymes), the sequence collection (proteins, peptides,
/// peptide evidences) and finally every spectrum identification result, which
/// becomes one identified spectrum with one peptide per accepted item.
/// </summary>
/// <param name="fileName">Path of the XML file to load.</param>
/// <returns>The spectra that have at least one accepted identification item.</returns>
/// <exception cref="Exception">
/// A spectrum ends up with a first scan number of 0, i.e. no scan information
/// could be determined for it.
/// </exception>
public List<IIdentifiedSpectrum> ReadFromFile(string fileName)
{
    // The m/z attributes are machine-written XML numbers, so they must not be
    // parsed with the current thread culture.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    XElement root = XElement.Load(fileName);

    // The value is not used below; the lookup itself is kept as in the original.
    var name = root.FindElement("AnalysisSoftwareList").
               FindElement("AnalysisSoftware").
               FindElement("SoftwareName").
               FindElement("cvParam").Attribute("name").Value;

    // Default experiment name: file name without extension and without any of
    // the known trailing extensions.
    var defaultExp = Path.GetFileNameWithoutExtension(fileName);
    foreach (var ext in extensions)
    {
        if (defaultExp.ToLower().EndsWith(ext))
        {
            defaultExp = defaultExp.Substring(0, defaultExp.Length - ext.Length);
        }
    }

    // Parsing identification protocol first.
    var protocols = root.FindElement("AnalysisProtocolCollection");
    var sip = protocols.FindElement("SpectrumIdentificationProtocol");
    var modMap = ParseSearchModificationMap(sip.FindElement("ModificationParams"));
    var proteases = ParseEnzymes(sip.FindElement("Enzymes"));
    var protease = proteases.FirstOrDefault();

    // Parsing sequence collection, including the protein<->peptide map.
    var seqs = root.FindElement("SequenceCollection");

    var proteinMap = (from ele in seqs.FindElements("DBSequence")
                      let id = ele.Attribute("id").Value
                      let accession = ParseAccession(ele.Attribute("accession").Value)
                      let db = ele.Attribute("searchDatabase_ref").Value
                      select new { Id = id, Accession = accession, DB = db }).ToDictionary(m => m.Id);

    var peptideMap = (from ele in seqs.FindElements("Peptide")
                      let id = ele.Attribute("id").Value
                      let seq = ele.FindElement("PeptideSequence").Value
                      let mods = (from modEle in ele.FindElements("Modification")
                                  let mod = ParseModification(modEle, modMap)
                                  where mod != null
                                  orderby mod.Location descending
                                  select mod).ToArray()
                      let numMiss = protease == null ? 0 : protease.GetMissCleavageSiteCount(seq)
                      select new MzIdentPeptideItem()
                      {
                          Id = id,
                          PureSequence = seq,
                          Modifications = mods,
                          Sequence = GetModifiedSequence(seq, mods),
                          NumMissCleavage = numMiss
                      }).ToDictionary(m => m.Id);

    // When several PeptideEvidence elements share an id, only the first is kept.
    var peptideEvidenceMap = (from g in
                                  (from ele in seqs.FindElements("PeptideEvidence")
                                   select new MzIdentPeptideEvidenceItem()
                                   {
                                       Id = ele.Attribute("id").Value,
                                       PeptideRef = ele.Attribute("peptide_ref").Value,
                                       DbRef = ele.Attribute("dBSequence_ref").Value,
                                       Pre = ele.Attribute("pre").Value,
                                       Post = ele.Attribute("post").Value
                                   }).GroupBy(m => m.Id)
                              select g.First()).ToDictionary(m => m.Id);

    // Now parsing data.
    var data = root.FindElement("DataCollection");

    var result = new List<IIdentifiedSpectrum>();
    var analysisData = data.FindElement("AnalysisData");
    var idList = analysisData.FindElement("SpectrumIdentificationList");
    foreach (var sir in idList.FindElements("SpectrumIdentificationResult"))
    {
        var items = FilterItems(sir.FindElements("SpectrumIdentificationItem"), peptideMap, peptideEvidenceMap);
        if (items.Count == 0)
        {
            continue;
        }

        var spectrum = new IdentifiedSpectrum();
        result.Add(spectrum);

        var spectrumId = sir.Attribute("spectrumID").Value;
        var sirCvParams = GetCvParams(sir);

        // Scan information: prefer cvParam MS:1000796 (parsed with TitleParser),
        // otherwise fall back to the spectrumID attribute.
        string value;
        if (sirCvParams.TryGetValue("MS:1000796", out value))
        {
            spectrum.Query.FileScan = TitleParser.GetValue(value);
        }
        else
        {
            if (spectrumId.StartsWith("index=") || spectrumId.StartsWith("scan="))
            {
                spectrum.Query.FileScan.Experimental = defaultExp;
                spectrum.Query.FileScan.FirstScan = int.Parse(spectrumId.StringAfter("="));
                spectrum.Query.FileScan.LastScan = spectrum.Query.FileScan.FirstScan;
            }
            else
            {
                spectrum.Query.FileScan.Experimental = spectrumId;
            }
        }

        // cvParam MS:1001115, when present, overrides the first scan number.
        if (sirCvParams.TryGetValue("MS:1001115", out value))
        {
            spectrum.Query.FileScan.FirstScan = int.Parse(value);
        }

        if (spectrum.Query.FileScan.FirstScan == 0)
        {
            throw new Exception(string.Format("Cannot find scan information in file {0}", fileName));
        }

        bool bFirst = true;
        foreach (var sit in items)
        {
            if (bFirst) // only parse score once
            {
                spectrum.Id = sit.Attribute("id").Value;
                spectrum.Charge = int.Parse(sit.Attribute("chargeState").Value);
                spectrum.TheoreticalMH = PrecursorUtils.MzToMH(double.Parse(sit.Attribute("calculatedMassToCharge").Value, invariant), spectrum.Charge, true);
                spectrum.ExperimentalMH = PrecursorUtils.MzToMH(double.Parse(sit.Attribute("experimentalMassToCharge").Value, invariant), spectrum.Charge, true);

                var cvParams = GetCvParams(sit);
                if (cvParams.TryGetValue("MS:1001121", out value))
                {
                    spectrum.MatchedIonCount = int.Parse(value);
                }

                // MS:1001362 is added to the matched count to obtain the total.
                if (cvParams.TryGetValue("MS:1001362", out value))
                {
                    spectrum.TheoreticalIonCount = int.Parse(value) + spectrum.MatchedIonCount;
                }

                ParseScore(spectrum, cvParams);

                var userParams = GetUserParams(sit);
                ParseUserParams(spectrum, userParams);

                bFirst = false;
            }

            var peptide = new IdentifiedPeptide(spectrum);
            var pep_ref = sit.Attribute("peptide_ref").Value;
            var pep = peptideMap[pep_ref];

            // Modifications were ordered by descending location; Reverse() lists
            // them by ascending location.
            spectrum.Modifications = (from m in pep.Modifications
                                      select string.Format("{0}:{1}", m.Location, m.Item.Name)).Reverse().Merge(",");
            spectrum.NumMissedCleavages = pep.NumMissCleavage;

            // Every evidence contributes a protein; the sequence (with flanking
            // residues) is overwritten by each evidence, so the last one wins.
            foreach (var per in sit.FindElements("PeptideEvidenceRef"))
            {
                var pe_ref = per.Attribute("peptideEvidence_ref").Value;
                var pe = peptideEvidenceMap[pe_ref];

                peptide.Sequence = pe.Pre + "." + pep.Sequence + "." + pe.Post;

                var protein = proteinMap[pe.DbRef];
                peptide.AddProtein(protein.Accession);
            }
        }
    }

    return result;
}
/// <summary>
/// Parses the peptide hits of every query in a Mascot dat file.
/// </summary>
/// <param name="datFilename">Path of the dat file.</param>
/// <param name="minRank">Highest rank that is still kept; parsing of a query stops once the rank exceeds it.</param>
/// <param name="minScore">Hits scoring below this value end the parsing of a query.</param>
/// <param name="isDecoy">When true the "decoy_" sections are read and protein names get the decoy prefix.</param>
/// <returns>
/// Map from query id to the hits of that query. Empty when decoy hits are
/// requested but the file's parameters do not have DECOY set to "1".
/// </returns>
/// <exception cref="UserTerminatedException">Cancellation was requested through Progress.</exception>
/// <exception cref="Exception">A peptide line does not match the expected format, or the "_terms" entry of a hit is missing.</exception>
public Dictionary<int, List<IIdentifiedSpectrum>> DoParsePeptides(string datFilename, int minRank, double minScore, bool isDecoy)
{
    var result = new Dictionary<int, List<IIdentifiedSpectrum>>();

    int queryCount;
    Dictionary<int, MascotQueryItem> queryItems;
    Dictionary<string, string> peptideSection;

    var sectionPrefix = isDecoy ? "decoy_" : "";

    // First pass: parameters, masses, enzyme, header, query summaries and the peptide section.
    using (var reader = new StreamReader(datFilename))
    {
        InitializeBoundary(reader);

        CurrentParameters = ParseSection(reader, "parameters");
        var hasDecoy = CurrentParameters.ContainsKey("DECOY") && CurrentParameters["DECOY"].Equals("1");
        if (isDecoy && !hasDecoy)
        {
            return result;
        }

        var masses = ParseSection(reader, "masses");
        CurrentModifications = ParseModification(masses);

        // The enzyme is parsed out of order, so the reader position is restored afterwards.
        long savedPosition = reader.GetCharpos();
        CurrentProtease = ParseEnzyme(reader);
        reader.SetCharpos(savedPosition);

        Dictionary<string, string> headers = ParseSection(reader, "header");
        queryCount = int.Parse(headers["queries"]);

        queryItems = ParseQueryItems(reader, queryCount, sectionPrefix);
        peptideSection = ParseSection(reader, sectionPrefix + "peptides", !isDecoy);
    }

    string file = CurrentParameters["FILE"];
    if (file.StartsWith("File Name: "))
    {
        file = file.Substring(10).Trim();
    }
    string defaultExperimental = FileUtils.ChangeExtension(new FileInfo(file).Name, "");

    // Monoisotopic unless a MASS parameter says otherwise.
    bool isPrecursorMonoisotopic = !CurrentParameters.ContainsKey("MASS") || CurrentParameters["MASS"].Equals("Monoisotopic");

    // Second pass: the per-query sections are read in order while the hits are assembled.
    using (var reader = new StreamReader(datFilename))
    {
        for (int queryId = 1; queryId <= queryCount; queryId++)
        {
            if (Progress.IsCancellationPending())
            {
                throw new UserTerminatedException();
            }

            MascotQueryItem queryItem = queryItems[queryId];

            var hits = new List<IIdentifiedSpectrum>();
            result[queryId] = hits;

            IIdentifiedSpectrum lastHit = null;
            int rank = 0;
            for (int k = 1; k <= 10; k++)
            {
                string key = "q" + queryId + "_p" + k;

                // No (further) hit for this query: the previous hit gets the maximum delta score.
                string line;
                if (!peptideSection.TryGetValue(key, out line) || line == null || line.Equals("-1"))
                {
                    if (null != lastHit)
                    {
                        lastHit.DeltaScore = 1.0;
                    }
                    break;
                }

                Match detail = this.peptideRegex.Match(line);
                if (!detail.Success)
                {
                    throw new Exception("Wrong format of peptides : " + line);
                }

                double score = MyConvert.ToDouble(detail.Groups["Score"].Value);
                if (score < minScore)
                {
                    if (null != lastHit)
                    {
                        lastHit.DeltaScore = 1.0 - score / lastHit.Score;
                    }
                    break;
                }

                // Hits with exactly the same score share one rank and one spectrum object.
                bool sameRank = null != lastHit && score == lastHit.Score;
                if (!sameRank)
                {
                    if (null != lastHit)
                    {
                        lastHit.DeltaScore = 1.0 - score / lastHit.Score;
                    }

                    rank++;
                    if (rank > minRank)
                    {
                        break;
                    }
                }

                IIdentifiedSpectrum hit;
                if (sameRank)
                {
                    hit = lastHit;
                }
                else
                {
                    hit = new IdentifiedSpectrum();
                    hit.IsPrecursorMonoisotopic = isPrecursorMonoisotopic;
                    hit.Rank = rank;
                    hit.NumMissedCleavages = int.Parse(detail.Groups["MissCleavage"].Value);
                    hit.TheoreticalMass = MyConvert.ToDouble(detail.Groups["TheoreticalMass"].Value);
                    hit.ExperimentalMass = queryItem.ExperimentalMass;
                    hit.Score = score;
                    hit.ExpectValue = ExpectValueCalculator.Calc(hit.Score, queryItem.MatchCount, 0.05);
                    hit.Query.QueryId = queryId;
                    hit.Query.ObservedMz = queryItem.Observed;
                    hit.Query.Charge = queryItem.Charge;
                    hit.Query.MatchCount = queryItem.MatchCount;

                    if (queryItem.HomologyScore != 0)
                    {
                        hit.Annotations[HomologyScoreKey] = queryItem.HomologyScore;
                    }

                    if (CurrentProtease.IsSemiSpecific)
                    {
                        hit.NumProteaseTermini = 1;
                    }

                    lastHit = hit;
                }

                var pureSeq = detail.Groups["Sequence"].Value;
                string modification = detail.Groups["Modification"].Value;
                var modifiedSeq = ModifySequence(pureSeq, modification);
                AssignModification(hit, modification, CurrentModifications);

                string proteinNames = detail.Groups["ProteinNames"].Value;
                Match proteinNameMatch = this.proteinNameRegex.Match(proteinNames);

                string termsKey = key + "_terms";
                string termsValue;
                if (!peptideSection.TryGetValue(termsKey, out termsValue))
                {
                    throw new Exception("Mascot version is too old. It's not supported.");
                }
                Match termsMatch = this.termsRegex.Match(termsValue);

                // Protein names and flanking residues are walked in parallel.
                int numProteaseTermini = 0;
                while (proteinNameMatch.Success && termsMatch.Success)
                {
                    var fullSeq = MyConvert.Format("{0}.{1}.{2}", termsMatch.Groups[1].Value, modifiedSeq, termsMatch.Groups[2].Value);
                    var proteinName = proteinNameMatch.Groups[1].Value.Replace("/", "_");
                    if (isDecoy)
                    {
                        proteinName = DECOY_PREFIX + proteinName;
                    }

                    // Look for a peptide of this hit that already has the same full sequence.
                    int existingIndex = -1;
                    for (int i = 0; i < hit.Peptides.Count; i++)
                    {
                        if (hit.Peptides[i].Sequence == fullSeq)
                        {
                            existingIndex = i;
                            break;
                        }
                    }

                    if (existingIndex >= 0)
                    {
                        hit.Peptides[existingIndex].AddProtein(proteinName);
                    }
                    else
                    {
                        var peptide = new IdentifiedPeptide(hit);
                        peptide.Sequence = fullSeq;
                        peptide.AddProtein(proteinName);

                        if (CurrentProtease.IsSemiSpecific)
                        {
                            int position = Convert.ToInt32(proteinNameMatch.Groups[2].Value);
                            int count = CurrentProtease.GetNumProteaseTermini(termsMatch.Groups[1].Value[0], pureSeq, termsMatch.Groups[2].Value[0], '-', position);
                            numProteaseTermini = Math.Max(numProteaseTermini, count);
                        }
                    }

                    proteinNameMatch = proteinNameMatch.NextMatch();
                    termsMatch = termsMatch.NextMatch();
                }

                if (CurrentProtease.IsSemiSpecific)
                {
                    hit.NumProteaseTermini = Math.Max(hit.NumProteaseTermini, numProteaseTermini);
                }

                if (!sameRank)
                {
                    hits.Add(hit);
                }
            }

            // The query section supplies the title, from which the file/scan information is derived.
            string querySectionName = "query" + queryId;
            Dictionary<string, string> querySection = ParseSection(reader, querySectionName);
            string title = Uri.UnescapeDataString(querySection["title"]);

            SequestFilename sf = this.TitleParser.GetValue(title);
            sf.Charge = queryItem.Charge;
            if (string.IsNullOrEmpty(sf.Experimental))
            {
                sf.Experimental = defaultExperimental;
            }

            foreach (IIdentifiedSpectrum spectrum in hits)
            {
                spectrum.Query.Title = title;
                spectrum.Query.FileScan = sf;
            }
        }
    }

    return result;
}
/// <summary>
/// Gets the query/peptide map from a pFind result file.
/// </summary>
/// <param name="filename">pFind proteins file</param>
/// <param name="minRank">Highest rank of a peptide that is still kept for a spectrum</param>
/// <param name="minScore">Minimum score of a peptide identified in a spectrum</param>
/// <returns>Query/peptide map; spectra without valid candidates have no entry</returns>
public Dictionary<int, List<IIdentifiedSpectrum>> ParsePeptides(string filename, int minRank, double minScore)
{
    var result = new Dictionary<int, List<IIdentifiedSpectrum>>();

    var sourceDir = GetSourceFile(filename);

    using (var reader = new StreamReader(filename))
    {
        var parameters = ParseSection(reader, "Search");
        var mm = ParseModification(parameters);

        // Every dynamic modification not seen before gets the next modification character.
        foreach (var dynamicMod in mm.DynamicModification)
        {
            if (this.ModificationCharMap.ContainsKey(dynamicMod.Modification))
            {
                continue;
            }
            this.ModificationCharMap[dynamicMod.Modification] = ModificationConsts.MODIFICATION_CHAR[this.ModificationCharMap.Count + 1];
        }

        var headers = ParseSection(reader, "Total");
        var queryCount = int.Parse(headers["Spectra"]);

        Progress.SetRange(1, queryCount);
        for (int queryId = 1; queryId <= queryCount; queryId++)
        {
            if (Progress.IsCancellationPending())
            {
                throw new UserTerminatedException();
            }
            Progress.SetPosition(queryId);

            var spectrumSectionName = MyConvert.Format("Spectrum{0}", queryId);
            var peptideSection = ParseSection(reader, spectrumSectionName);

            int candidateCount = int.Parse(peptideSection["ValidCandidate"]);
            if (candidateCount == 0)
            {
                continue;
            }

            var expMH = MyConvert.ToDouble(peptideSection["MH"]);
            var expMz = MyConvert.ToDouble(peptideSection["MZ"]);
            var charge = int.Parse(peptideSection["Charge"]);

            var hits = new List<IIdentifiedSpectrum>();
            result[queryId] = hits;

            IIdentifiedSpectrum lastHit = null;
            int rank = 0;
            for (int k = 1; k <= candidateCount; k++)
            {
                string key = "NO" + k;

                var scoreKey = key + "_Score";
                if (!peptideSection.ContainsKey(scoreKey))
                {
                    if (null != lastHit)
                    {
                        lastHit.DeltaScore = 1.0;
                    }
                    break;
                }

                double score = MyConvert.ToDouble(peptideSection[scoreKey]);
                if (score < minScore || score == 0.0)
                {
                    if (null != lastHit)
                    {
                        lastHit.DeltaScore = 1.0 - score / lastHit.Score;
                    }
                    break;
                }

                // Candidates with exactly the same score share one rank and one spectrum object.
                bool sameRank = null != lastHit && score == lastHit.Score;
                if (!sameRank)
                {
                    if (null != lastHit)
                    {
                        lastHit.DeltaScore = 1.0 - score / lastHit.Score;
                    }

                    rank++;
                    if (rank > minRank)
                    {
                        break;
                    }
                }

                IIdentifiedSpectrum hit;
                if (sameRank)
                {
                    hit = lastHit;
                }
                else
                {
                    hit = new IdentifiedSpectrum();
                    hit.Rank = rank;
                    hit.Score = score;
                    hit.ExpectValue = MyConvert.ToDouble(peptideSection[key + "_EValue"]);

                    // "_MH" is preferred; "_Mass" is the fallback key.
                    var mhKey = key + "_MH";
                    hit.TheoreticalMH = MyConvert.ToDouble(
                        peptideSection.ContainsKey(mhKey) ? peptideSection[mhKey] : peptideSection[key + "_Mass"]);

                    var matchedPeaksKey = key + "_Matched_Peaks";
                    if (peptideSection.ContainsKey(matchedPeaksKey))
                    {
                        hit.MatchedIonCount = int.Parse(peptideSection[matchedPeaksKey]);
                        hit.MatchedTIC = MyConvert.ToDouble(peptideSection[key + "_Matched_Intensity"]);
                    }

                    var missCleaveKey = key + "_MissCleave";
                    if (peptideSection.ContainsKey(missCleaveKey))
                    {
                        hit.NumMissedCleavages = int.Parse(peptideSection[missCleaveKey]);
                    }

                    hit.ExperimentalMH = expMH;
                    hit.DeltaScore = 1.0;
                    hit.Query.QueryId = queryId;
                    hit.Query.ObservedMz = expMz;
                    hit.Query.Charge = charge;

                    lastHit = hit;
                }

                var peptide = new IdentifiedPeptide(hit);
                peptide.Sequence = peptideSection[key + "_SQ"];

                string modificationPos = peptideSection[key + "_Modify_Pos"];
                string modificationName = peptideSection[key + "_Modify_Name"];
                Dictionary<int, string> modifications = GetModifications(modificationPos, modificationName);

                ModifySequence(peptide, modifications, mm);
                AssignModification(hit, modifications, mm);

                // The first comma-separated entry is skipped; the rest are protein names.
                string[] proteinParts = peptideSection[key + "_Proteins"].Split(',');
                for (int i = 1; i < proteinParts.Length; i++)
                {
                    peptide.AddProtein(proteinParts[i]);
                }

                if (!sameRank)
                {
                    hits.Add(hit);
                }
            }

            var title = new FileInfo(peptideSection["Input"]).Name;
            SequestFilename sf = this.TitleParser.GetValue(title);
            sf.Charge = charge;
            if (string.IsNullOrEmpty(sf.Experimental))
            {
                sf.Experimental = sourceDir;
            }

            foreach (IIdentifiedSpectrum spectrum in hits)
            {
                spectrum.Query.Title = title;
                spectrum.Query.FileScan.LongFileName = sf.LongFileName;
            }
        }
    }

    return result;
}
/// <summary>
/// Builds an <see cref="IdentifiedProtein"/> from <paramref name="proteinContent"/> and attaches
/// every peptide hit found in it that passes the peptide filter.
/// </summary>
/// <param name="proteinContent">Content handed to GetProtein and GetPeptideInfoContentList.</param>
/// <returns>
/// The protein returned by GetProtein, with one peptide added for each hit accepted by
/// GetPeptideFilter().
/// </returns>
/// <remarks>
/// Side effect: every modification name recognised by modificationReg that is not yet a key of
/// this.modifications is registered there with a blank (' ') placeholder value.
/// </remarks>
protected IdentifiedProtein ParseProtein(String proteinContent)
{
    IdentifiedProtein result = GetProtein(proteinContent);

    List<String> peptideInfoContentList = GetPeptideInfoContentList(proteinContent);
    foreach (String peptideInfoContent in peptideInfoContentList)
    {
        List<String> peptideInfo = GetPeptideInfo(peptideInfoContent);
        if (0 == peptideInfo.Count)
        {
            continue;
        }

        IIdentifiedSpectrum mphit = new IdentifiedSpectrum();

        // Group 0 : peptide mass from observed m/z
        mphit.ExperimentalMass = MyConvert.ToDouble(peptideInfo[0]);

        // Group 1 : observed m/z
        mphit.Query.ObservedMz = MyConvert.ToDouble(peptideInfo[1]);

        // Group 2 : charge
        int charge = int.Parse(peptideInfo[2]);
        mphit.Query.Charge = charge;

        // Group 3 : title (stored URI-escaped, so it is unescaped before use)
        String title = Uri.UnescapeDataString(peptideInfo[3]).Trim();
        mphit.Query.Title = title;

        SequestFilename sf = MascotUtils.ParseTitle(title, charge);
        if (sf != null)
        {
            mphit.Query.FileScan.LongFileName = sf.LongFileName;
        }

        // Group 4 : query
        mphit.Query.QueryId = int.Parse(peptideInfo[4]);

        // Group 5 equals Group 1
        // Group 6 equals Group 0

        // Group 7 : calculated peptide mass
        mphit.TheoreticalMass = MyConvert.ToDouble(peptideInfo[7]);

        // Group 8 : difference between observed peptide mass and calculated peptide mass

        // Group 9 : miss cleavage
        mphit.NumMissedCleavages = int.Parse(peptideInfo[9]);

        // Group 10: score
        // NOTE(review): parsed as an integer; confirm the input never carries fractional scores.
        mphit.Score = int.Parse(peptideInfo[10]);

        // Group 11: expect p value
        mphit.ExpectValue = MyConvert.ToDouble(peptideInfo[11]);

        // Group 12: rank
        mphit.Rank = int.Parse(peptideInfo[12]);

        // Group 13: peptide sequence, optionally followed by "+ <modifications>", e.g.
        // K.YEINVLR<u>.</u>N + Label:18O(2) (C-term)
        String seq = peptideInfo[13].Replace(" ", "");

        var mpep = new IdentifiedPeptide(mphit);

        string[] parts = Regex.Split(seq, "\\+");
        if (parts.Length > 1)
        {
            seq = parts[0].Trim();
            mphit.Modifications = parts[1].Trim();

            string[] mods = parts[1].Trim().Split(new[] { ';' }, StringSplitOptions.RemoveEmptyEntries);
            foreach (string mod in mods)
            {
                Match m = this.modificationReg.Match(mod.Trim());
                if (!m.Success)
                {
                    // A failed match yields an empty Groups[1].Value; without this guard an
                    // empty-string key would be registered as a modification.
                    continue;
                }

                if (!this.modifications.ContainsKey(m.Groups[1].Value))
                {
                    this.modifications[m.Groups[1].Value] = ' ';
                }
            }
        }

        mpep.Sequence = seq;

        // Only hits accepted by the filter are linked to the protein.
        if (GetPeptideFilter().Accept(mphit))
        {
            mpep.AddProtein(result.Name);
            result.Peptides.Add(mpep);
        }
    }

    return result;
}
/// <summary>
/// Reads identified proteins and their peptides from the first worksheet of an Excel workbook.
/// </summary>
/// <param name="fileName">Path of the workbook to open.</param>
/// <returns>
/// One protein per distinct column-B value among the accepted peptide rows, each carrying the
/// peptides read for it.
/// </returns>
/// <remarks>
/// Rows are scanned from row 2 down to the last row whose column B is non-null. A row is treated
/// as a peptide row only when both column A (sequence) and column I (delta score) are non-null.
/// The workbook is closed without saving and the Excel instance is shut down when parsing ends,
/// whether it succeeds or throws.
/// </remarks>
public override List<IIdentifiedProtein> ParseProteins(string fileName)
{
    Dictionary<string, IIdentifiedProtein> proteinMap = new Dictionary<string, IIdentifiedProtein>();

    Application xApp = new Application();
    try
    {
        // Obtain the Workbook object by opening the existing file.
        Workbook xBook = xApp.Workbooks._Open(fileName,
                                              Missing.Value, Missing.Value, Missing.Value, Missing.Value,
                                              Missing.Value, Missing.Value, Missing.Value, Missing.Value,
                                              Missing.Value, Missing.Value, Missing.Value, Missing.Value);
        try
        {
            Worksheet xSheet = (Worksheet)xBook.Sheets[1];

            int fromRow = 2;
            int endRow = fromRow;

            // Rows.Count is a COM call and does not change while reading, so query it once.
            int rowLimit = xSheet.Rows.Count;
            for (; endRow <= rowLimit; endRow++)
            {
                string b = xSheet.Value('B', endRow);
                if (null == b)
                {
                    break;
                }
            }
            endRow--;

            Progress.SetRange(fromRow, endRow);
            Progress.SetMessage("Parsing file ...");

            for (int i = fromRow; i <= endRow; i++)
            {
                Progress.SetPosition(i);

                string seq = xSheet.Value('A', i);
                if (null == seq) // protein information row
                {
                    continue;
                }

                string deltaCn = xSheet.Value('I', i);
                if (null == deltaCn) // rank > 1
                {
                    continue;
                }

                string protein = xSheet.Value('B', i);
                if (!proteinMap.ContainsKey(protein))
                {
                    // NOTE(review): protein-level values are read from row i + 2; presumably the
                    // protein summary sits two rows below its first peptide row. Confirm against
                    // the sheet layout.
                    var p = new IdentifiedProtein(protein);
                    p.Coverage = MyConvert.ToDouble(xSheet.Value('C', i + 2));
                    p.MolecularWeight = MyConvert.ToDouble(xSheet.Value('F', i + 2)) * 1000;
                    p.IsoelectricPoint = MyConvert.ToDouble(xSheet.Value('G', i + 2));
                    p.Score = MyConvert.ToDouble(xSheet.Value('H', i + 2));
                    p.Description = xSheet.Value('I', i + 2);
                    proteinMap[protein] = p;
                }

                var pro = proteinMap[protein];

                IdentifiedSpectrum spectrum = new IdentifiedSpectrum();
                IdentifiedPeptide peptide = new IdentifiedPeptide(spectrum);

                peptide.Sequence = seq.ToUpper();
                peptide.AddProtein(protein);

                spectrum.Modifications = xSheet.Value('F', i);
                spectrum.DeltaScore = MyConvert.ToDouble(deltaCn);
                spectrum.Charge = Convert.ToInt32(xSheet.Value('K', i));
                spectrum.ObservedMz = MyConvert.ToDouble(xSheet.Value('L', i));
                spectrum.TheoreticalMH = MyConvert.ToDouble(xSheet.Value('M', i));
                spectrum.Ions = xSheet.Value('S', i);
                spectrum.Query.FileScan.FirstScan = Convert.ToInt32(xSheet.Value('P', i));
                spectrum.Query.FileScan.LastScan = Convert.ToInt32(xSheet.Value('Q', i));
                spectrum.Query.FileScan.Experimental = FileUtils.RemoveAllExtension(xSheet.Value('T', i));

                pro.Peptides.Add(peptide);
            }
        }
        finally
        {
            // Close without saving changes.
            xBook.Close(false, Type.Missing, Type.Missing);
        }
    }
    finally
    {
        // Without Quit the Excel instance created above stays alive after this method returns.
        xApp.Quit();
    }

    var proteins = proteinMap.Values.ToList();
    return proteins;
}
/// <summary>
/// Loads a search-result XML file and returns one spectrum per hit set that has hits,
/// keeping only the peptide hit(s) with the lowest e-value for each spectrum.
/// </summary>
/// <param name="fileName">Path of the XML file to load.</param>
/// <returns>
/// The spectra in document order. Each spectrum's score is -ln(e-value), and its masses are the
/// reported values divided by the MSResponse_scale factor.
/// </returns>
/// <remarks>
/// Floating-point values are parsed with the invariant culture, because the XML is machine-written
/// and must not be interpreted according to the user's regional settings.
/// </remarks>
public List<IIdentifiedSpectrum> ReadFromFile(string fileName)
{
    // Fully qualified so the method does not depend on an extra using directive.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    XElement root = XElement.Load(fileName);

    XElement request = root.FindElement("MSSearch_request");

    // Parse the identification protocol first: variable modifications and the enzyme.
    var modMap = ParseSearchModificationMap(request.FindFirstDescendant("MSSearchSettings_variable"));
    var protease = ParseProtease(request.FindFirstDescendant("MSSearchSettings_enzyme"));

    // Without a known protease the missed-cleavage count is reported as 0.
    Func<string, int> missCalc;
    if (protease == null)
    {
        missCalc = m => 0;
    }
    else
    {
        missCalc = m => protease.GetMissCleavageSiteCount(m);
    }

    // Parse the sequence collection, including the protein<->peptide map.
    var result = new List<IIdentifiedSpectrum>();

    var response = root.FindElement("MSSearch_response");
    var scale = double.Parse(response.FindFirstDescendant("MSResponse_scale").Value, invariant);
    var idList = response.FindFirstDescendant("MSResponse_hitsets");

    foreach (var sir in idList.FindElements("MSHitSet"))
    {
        var hits = sir.FindElement("MSHitSet_hits");
        if (hits == null)
        {
            continue;
        }

        var spectrum = new IdentifiedSpectrum();
        result.Add(spectrum);

        var title = sir.FindElement("MSHitSet_ids").FindElement("MSHitSet_ids_E").Value;
        spectrum.Query.FileScan = this.TitleParser.GetValue(title);

        foreach (var hit in hits.FindElements("MSHits"))
        {
            var evalue = double.Parse(hit.FindElement("MSHits_evalue").Value, invariant);

            if (spectrum.Peptides.Count > 0)
            {
                // Worse than the current best: ignore this hit.
                if (evalue > spectrum.ExpectValue)
                {
                    continue;
                }

                // Strictly better: discard the peptides collected so far. Ties are kept.
                if (evalue < spectrum.ExpectValue)
                {
                    spectrum.ClearPeptides();
                }
            }

            spectrum.ExpectValue = evalue;
            spectrum.Score = -Math.Log(spectrum.ExpectValue);

            if (spectrum.Query.Charge == 0) // trust the charge from title
            {
                spectrum.Query.Charge = int.Parse(hit.FindElement("MSHits_charge").Value);
            }

            spectrum.ExperimentalMass = double.Parse(hit.FindElement("MSHits_mass").Value, invariant) / scale;
            spectrum.TheoreticalMass = double.Parse(hit.FindElement("MSHits_theomass").Value, invariant) / scale;

            var peptide = new IdentifiedPeptide(spectrum);

            var seq = hit.FindElement("MSHits_pepstring").Value;
            spectrum.NumMissedCleavages = missCalc(seq);

            var mods = hit.FindElement("MSHits_mods");
            if (mods != null)
            {
                // Insert from the highest site downwards so earlier insertions do not shift
                // the positions of the sites still to be processed.
                var modsloc = (from ele in mods.FindElements("MSModHit")
                               let loc = int.Parse(ele.FindElement("MSModHit_site").Value)
                               let modtype = ele.FindElement("MSModHit_modtype").FindElement("MSMod").Value
                               orderby loc descending
                               select new { Location = loc, ModType = modtype }).ToList();

                foreach (var modloc in modsloc)
                {
                    seq = seq.Insert(modloc.Location + 1, modMap[modloc.ModType]);
                }
            }

            peptide.Sequence = hit.FindElement("MSHits_pepstart").Value + "." + seq + "." + hit.FindElement("MSHits_pepstop").Value;

            foreach (var pep in hit.FindElement("MSHits_pephits").FindElements("MSPepHit"))
            {
                // The protein name is the defline text up to the first space or tab.
                var proteinName = pep.FindElement("MSPepHit_defline").Value.StringBefore(" ").StringBefore("\t");
                peptide.AddProtein(proteinName);
            }
        }
    }

    return result;
}