// Reads the TestBuilder.peptides fixture (located relative to the test directory reported by
// TestContext) and checks each stage of the build chain: spectra -> proteins -> protein groups.
// The result builder is finally run over the groups; its output is not asserted on.
public void TestBuild()
{
    string peptideFile = TestContext.CurrentContext.TestDirectory + "/../../../data/TestBuilder.peptides";

    List<IIdentifiedSpectrum> spectra = new SequestPeptideTextFormat().ReadFromFile(peptideFile);
    Assert.AreEqual(4, spectra.Count);

    IAccessNumberParser parser = AccessNumberParserFactory.FindOrCreateParser(@"(IPI\d+)", "IPI");

    var proteinBuilder = new IdentifiedProteinBuilder();
    List<IIdentifiedProtein> proteins = proteinBuilder.Build(spectra);
    Assert.AreEqual(4, proteins.Count);

    var groupBuilder = new IdentifiedProteinGroupBuilder();
    List<IIdentifiedProteinGroup> groups = groupBuilder.Build(proteins);
    Assert.AreEqual(2, groups.Count);

    // first group: a single protein
    Assert.AreEqual(1, groups[0].Count);
    Assert.AreEqual("IPI:IPI00784154.1|SW", groups[0][0].Name);

    // second group: the two REVERSED entries, in this order
    Assert.AreEqual(2, groups[1].Count);
    Assert.AreEqual("REVERSED_00000001", groups[1][0].Name);
    Assert.AreEqual("REVERSED_00000002", groups[1][1].Name);

    // Only checks that building the final result from the groups does not throw.
    var resultBuilder = new IdentifiedResultBuilder(parser, "");
    IIdentifiedResult built = resultBuilder.Build(groups);
}
// Reads the peptides in sourceFileName, then reads the peptides in fileName and drops every
// spectrum whose FileScan.LongFileName also occurs in the source file. What remains is written
// to sourceFileName + ".subtracted", and that path is returned as the only result.
// NOTE(review): the output name is derived from sourceFileName rather than fileName, so
// processing several files would write to the same path -- confirm this is intended.
public override IEnumerable<string> Process(string fileName)
{
    IFileFormat<List<IIdentifiedSpectrum>> peptideFormat = new SequestPeptideTextFormat();

    Progress.SetMessage("Reading peptides from " + sourceFileName + " ...");
    List<IIdentifiedSpectrum> sourceSpectra = peptideFormat.ReadFromFile(sourceFileName);

    // long file names present in the source file
    var sourceScanNames = new HashSet<string>();
    foreach (IIdentifiedSpectrum sourceSpectrum in sourceSpectra)
    {
        sourceScanNames.Add(sourceSpectrum.Query.FileScan.LongFileName);
    }

    Progress.SetMessage("Reading peptides from " + fileName + " ...");
    List<IIdentifiedSpectrum> remaining = peptideFormat.ReadFromFile(fileName);

    Progress.SetMessage("Subtracting peptides ...");
    remaining.RemoveAll(candidate => sourceScanNames.Contains(candidate.Query.FileScan.LongFileName));

    string subtractedFileName = sourceFileName + ".subtracted";
    peptideFormat.WriteToFile(subtractedFileName, remaining);

    Progress.SetMessage("Finished.");

    return new string[] { subtractedFileName };
}
// Parses one peptide line with a format built from an explicit header, checks every parsed
// field, and finally checks that formatting the parsed spectrum reproduces the input line.
public void TestParse()
{
    SequestPeptideTextFormat format = new SequestPeptideTextFormat("\t\"File, Scan(s)\"\tSequence\tMH+\tDiff(MH+)\tCharge\tRank\tXC\tDeltaCn\tSp\tRSp\tIons\tReference\tDIFF_MODIFIED_CANDIDATE\tPI\tMissCleavage\tMatchedTIC");

    string spLine = "\tJWH_SAX_35_050906,10755\tK.GLEAEATY*PYEGKDGPCR.Y ! K.GLEAEATYPY*EGKDGPCR.Y\t2094.09819\t-1.27181\t2\t1\t2.71\t0.28\t127.40\t34\t15|51\tIPI:IPI00126770.2|SWISS-PROT:Q9R014|TREMBL:Q91XK6|REFSEQ_NP:NP_036137|ENSEMBL: ! IPI:IPI00126770.2|SWISS-PROT:Q9R014|TREMBL:Q91XK6|REFSEQ_NP:NP_036137|ENSEMBL:\tK.GLEAEAT*YPYEGKDGPCR.Y(2.6670,0.0154) ! K.GLEAEATYPY*EGKDGPCR.Y(2.5464,0.0599)\t4.41\t1\t100.01";

    IIdentifiedSpectrum peptide = format.PeptideFormat.ParseString(spLine);

    // "File, Scan(s)" column
    var scan = peptide.Query.FileScan;
    Assert.AreEqual("JWH_SAX_35_050906", scan.Experimental);
    Assert.AreEqual(10755, scan.FirstScan);
    Assert.AreEqual(10755, scan.LastScan);

    // "Sequence" column: two alternatives separated by " ! "
    Assert.AreEqual(2, peptide.Peptides.Count);
    Assert.AreEqual("K.GLEAEATY*PYEGKDGPCR.Y", peptide.Peptides[0].Sequence);
    Assert.AreEqual("K.GLEAEATYPY*EGKDGPCR.Y", peptide.Peptides[1].Sequence);

    // mass and score columns
    Assert.AreEqual(2094.09819, peptide.TheoreticalMH);
    Assert.AreEqual(1, peptide.Rank);
    Assert.AreEqual(2.71, peptide.Score, 0.01);
    Assert.AreEqual(0.28, peptide.DeltaScore, 0.01);
    Assert.AreEqual(127.4, peptide.SpScore);
    Assert.AreEqual(34, peptide.SpRank);

    // "Ions" column: matched|theoretical
    Assert.AreEqual(15, peptide.MatchedIonCount);
    Assert.AreEqual(51, peptide.TheoreticalIonCount);

    Assert.AreEqual(2, peptide.DiffModificationSiteCandidates.Count);
    Assert.AreEqual(1, peptide.NumMissedCleavages);

    // "DIFF_MODIFIED_CANDIDATE" column: sequence(score,deltaScore) entries
    var firstCandidate = peptide.DiffModificationSiteCandidates[0];
    Assert.AreEqual("K.GLEAEAT*YPYEGKDGPCR.Y", firstCandidate.Sequence);
    Assert.AreEqual(2.6670, firstCandidate.Score);
    Assert.AreEqual(0.0154, firstCandidate.DeltaScore);

    var secondCandidate = peptide.DiffModificationSiteCandidates[1];
    Assert.AreEqual("K.GLEAEATYPY*EGKDGPCR.Y", secondCandidate.Sequence);
    Assert.AreEqual(2.5464, secondCandidate.Score);
    Assert.AreEqual(0.0599, secondCandidate.DeltaScore);

    Assert.AreEqual(100.01, peptide.MatchedTIC, 0.01);

    // round trip: writing the parsed spectrum must give back the original line
    Assert.AreEqual(spLine, format.PeptideFormat.GetString(peptide));
}
// Keeps the spectra of every peptide sequence that has at least one spectrum with a matching
// *.jpg file in imageDirectory, and writes them to fileName + ".validated".
// Matching compares the lower-cased FileScan.LongFileName against the lower-cased image file
// name after it has been passed through FileUtils.ChangeExtension(name, "").
public override IEnumerable<string> Process(string fileName)
{
    var peptideFormat = new SequestPeptideTextFormat();

    var imageNames = new HashSet<string>();
    foreach (string imagePath in Directory.GetFiles(imageDirectory, "*.jpg"))
    {
        string bareName = FileUtils.ChangeExtension(new FileInfo(imagePath).Name, "");
        imageNames.Add(bareName.ToLower());
    }

    List<IIdentifiedSpectrum> spectra = peptideFormat.ReadFromFile(fileName);

    // sequences for which at least one spectrum has an image
    var imagedSequences = new HashSet<string>(
        spectra
            .Where(s => imageNames.Contains(s.Query.FileScan.LongFileName.ToLower()))
            .Select(s => s.Peptide.Sequence));

    // every spectrum of those sequences is kept, with or without its own image
    List<IIdentifiedSpectrum> validated = spectra.FindAll(s => imagedSequences.Contains(s.Peptide.Sequence));

    string validatedFileName = fileName + ".validated";
    peptideFormat.WriteToFile(validatedFileName, validated);

    return new string[] { validatedFileName };
}
// Reads two peptide fixture files and checks the number of spectra parsed from each.
public void TestReadPeptideFromFile()
{
    // Resolve fixtures against the test directory reported by TestContext rather than the
    // process working directory, which depends on how the test runner was launched.
    string dataDirectory = TestContext.CurrentContext.TestDirectory + "/../../../data/";

    var format = new SequestPeptideTextFormat();

    List<IIdentifiedSpectrum> spl = format.ReadFromFile(dataDirectory + "Standard_Protein_FIT_060222.peptides");
    Assert.AreEqual(287, spl.Count);

    List<IIdentifiedSpectrum> spl2 = format.ReadFromFile(dataDirectory + "Nmix_27_C13.peptides");
    Assert.AreEqual(1093, spl2.Count);
}
// Builds an HTML index for a peptide file. Spectra are grouped by Peptide.PureSequence; for
// each sequence (in sorted order) a page imageDirectory + sequence + ".html" is written that
// lists every spectrum line followed by an <img> tag, and the index page
// (peptideFilename with extension ".images.html") gets a link to it under relativeDirectory.
// Side effect: FileScan.Extension of every spectrum is set to "jpg" while writing.
public override IEnumerable<string> Process(string peptideFilename)
{
    var peptideFormat = new SequestPeptideTextFormat();
    List<IIdentifiedSpectrum> spectra = peptideFormat.ReadFromFile(peptideFilename);

    string indexFilename = FileUtils.ChangeExtension(peptideFilename, ".images.html");

    // group the spectra by pure sequence
    var spectraBySequence = new Dictionary<string, List<IIdentifiedSpectrum>>();
    foreach (IIdentifiedSpectrum spectrum in spectra)
    {
        string key = spectrum.Peptide.PureSequence;

        List<IIdentifiedSpectrum> bucket;
        if (!spectraBySequence.TryGetValue(key, out bucket))
        {
            bucket = new List<IIdentifiedSpectrum>();
            spectraBySequence[key] = bucket;
        }

        bucket.Add(spectrum);
    }

    var orderedSequences = new List<string>(spectraBySequence.Keys);
    orderedSequences.Sort();

    using (StreamWriter indexWriter = new StreamWriter(indexFilename))
    {
        indexWriter.WriteLine("<html>");

        foreach (string sequence in orderedSequences)
        {
            string pageName = sequence + ".html";

            using (StreamWriter pageWriter = new StreamWriter(imageDirectory + pageName))
            {
                pageWriter.WriteLine("<html>");

                foreach (IIdentifiedSpectrum spectrum in spectraBySequence[sequence])
                {
                    // the extension is switched first, so LongFileName is read with "jpg" set
                    var scan = spectrum.Query.FileScan;
                    scan.Extension = "jpg";
                    string imageFilename = scan.LongFileName;

                    pageWriter.WriteLine(peptideFormat.PeptideFormat.GetString(spectrum) + "<br>");
                    pageWriter.WriteLine("<img src=\"{0}\"><br>", imageFilename);
                }

                pageWriter.WriteLine("</html>");
            }

            indexWriter.WriteLine("<a href=\"{0}\" target=\"_blank\">{1}</a><br>", relativeDirectory + pageName, sequence);
        }

        indexWriter.WriteLine("</html>");
    }

    return new string[] { indexFilename };
}
// Reads the peptide file, reduces it with KeepMaxScorePeptideOnly and writes the outcome to
// fileName + ".unique". Returns that path as the only result.
public override IEnumerable<string> Process(string fileName)
{
    IFileFormat<List<IIdentifiedSpectrum>> peptideFormat = new SequestPeptideTextFormat();
    string uniqueFileName = fileName + ".unique";

    List<IIdentifiedSpectrum> kept = KeepMaxScorePeptideOnly(peptideFormat.ReadFromFile(fileName));
    peptideFormat.WriteToFile(uniqueFileName, kept);

    return new string[] { uniqueFileName };
}
// Reads the peptide file, groups the peptides per raw file and, in sorted raw-name order,
// hands each group to ExtractSingleRaw (exactly one path registered in filePathMap) or
// ExtractMultipleRaw (any other number of paths).
// Throws UserTerminatedException when cancellation is pending, and Exception when a raw name
// has no entry in filePathMap. Always returns an empty list.
public override IEnumerable<string> Process(string peptidesFilename)
{
    List<IIdentifiedSpectrum> peptides = new SequestPeptideTextFormat().ReadFromFile(peptidesFilename);
    Dictionary<string, List<IIdentifiedSpectrum>> peptidesByRaw = IdentifiedSpectrumUtils.GetRawPeptideMap(peptides);

    Progress.SetRange(1, 1, peptidesByRaw.Count);

    var sortedRaws = new List<string>(peptidesByRaw.Keys);
    sortedRaws.Sort();

    int totalRaws = peptidesByRaw.Count;
    int processed = 0;

    foreach (string raw in sortedRaws)
    {
        if (Progress.IsCancellationPending())
        {
            throw new UserTerminatedException();
        }

        // the position is reported before the increment, the message shows the 1-based index
        Progress.SetPosition(1, processed);
        processed++;

        List<IIdentifiedSpectrum> rawPeptides = peptidesByRaw[raw];
        Progress.SetMessage(1, MyConvert.Format("{0}/{1}, Extracting {2} for {3} peptides ...", processed, totalRaws, raw, rawPeptides.Count));

        if (!filePathMap.ContainsKey(raw))
        {
            throw new Exception("Cannot find raw dtas/outs file for " + raw);
        }

        var rawPaths = filePathMap[raw];
        if (rawPaths.Count != 1)
        {
            ExtractMultipleRaw(raw, rawPaths, rawPeptides);
        }
        else
        {
            ExtractSingleRaw(raw, rawPaths[0], rawPeptides);
        }
    }

    Progress.SetPosition(1, processed);

    return new List<string>();
}
// Reads the TestBuilder.peptides fixture and checks each stage of the build chain:
// spectra -> proteins -> protein groups. The result builder is finally run over the groups;
// its output is not asserted on.
public void TestBuild()
{
    // Resolve the fixture against the test directory reported by TestContext rather than the
    // process working directory, which depends on how the test runner was launched.
    string peptideFile = TestContext.CurrentContext.TestDirectory + "/../../../data/TestBuilder.peptides";

    List<IIdentifiedSpectrum> spectra = new SequestPeptideTextFormat().ReadFromFile(peptideFile);
    Assert.AreEqual(4, spectra.Count);

    IAccessNumberParser parser = AccessNumberParserFactory.FindOrCreateParser(@"(IPI\d+)", "IPI");

    List<IIdentifiedProtein> proteins = new IdentifiedProteinBuilder().Build(spectra);
    Assert.AreEqual(4, proteins.Count);

    List<IIdentifiedProteinGroup> groups = new IdentifiedProteinGroupBuilder().Build(proteins);
    Assert.AreEqual(2, groups.Count);

    Assert.AreEqual(1, groups[0].Count);
    Assert.AreEqual("IPI:IPI00784154.1|SW", groups[0][0].Name);

    Assert.AreEqual(2, groups[1].Count);
    Assert.AreEqual("REVERSED_00000001", groups[1][0].Name);
    Assert.AreEqual("REVERSED_00000002", groups[1][1].Name);

    // Only checks that building the final result from the groups does not throw.
    IIdentifiedResult result = new IdentifiedResultBuilder(parser, "").Build(groups);
}
// Reads an identified result, and for every protein group keeps only the spectra returned by
// UniquePeptideDistiller.KeepMaxScorePeptideOnly for that group's peptides. The reduced result
// is written to fileName + ".unique" and its spectra to fileName + ".unique.peptides" (using
// the peptide header of the result format). Returns both paths.
public override IEnumerable<string> Process(string fileName)
{
    var format = new SequestResultTextFormat();
    format.Progress = this.Progress;

    Progress.SetMessage("Reading identified result from " + fileName + " ...");
    IIdentifiedResult ir = format.ReadFromFile(fileName);

    Progress.SetMessage("Removing duplicated peptide ...");
    Progress.SetRange(0, ir.Count);
    for (int i = 0; i < ir.Count; i++)
    {
        Progress.SetPosition(i);

        IIdentifiedProteinGroup group = ir[i];
        List<IIdentifiedSpectrum> peps = UniquePeptideDistiller.KeepMaxScorePeptideOnly(group.GetPeptides());

        // Membership is tested once per peptide of every protein in the group; a set keeps
        // each test O(1) instead of scanning the kept list every time.
        var keptSpectra = new HashSet<IIdentifiedSpectrum>(peps);

        foreach (var protein in group)
        {
            protein.Peptides.RemoveAll(m => !keptSpectra.Contains(m.Spectrum));
        }
    }

    string resultFileName = fileName + ".unique";
    Progress.SetMessage("Saving proteins to " + resultFileName + " ...");
    format.WriteToFile(resultFileName, ir);

    List<IIdentifiedSpectrum> spectra = ir.GetSpectra();
    var peptideFormat = new SequestPeptideTextFormat(format.PeptideFormat.GetHeader());
    string peptideFileName = fileName + ".unique.peptides";
    Progress.SetMessage("Saving peptides to " + peptideFileName + " ...");
    peptideFormat.WriteToFile(peptideFileName, spectra);

    Progress.SetMessage("Finished.");

    return new[] { resultFileName, peptideFileName };
}
// Reads a peptide file, splits the spectra by charge and draws one Score / DeltaScore scatter
// plot per charge into filename + "." + charge + ".png". Each spectrum is drawn as a 3x3 dot
// in the color returned by GetColor. Returns the paths of the images written.
public IEnumerable<string> Process(string filename)
{
    List<IIdentifiedSpectrum> sphs = new SequestPeptideTextFormat().ReadFromFile(filename);
    Dictionary<int, List<IIdentifiedSpectrum>> chargeSphMap = IdentifiedSpectrumUtils.GetChargeMap(sphs);

    var result = new List<string>();
    foreach (KeyValuePair<int, List<IIdentifiedSpectrum>> chargeEntry in chargeSphMap)
    {
        int charge = chargeEntry.Key;
        List<IIdentifiedSpectrum> sphList = chargeEntry.Value;

        double maxDeltaScore = 1.0;

        // the score axis ends one unit above the best score of this charge
        double maxScore = 0.0;
        foreach (IIdentifiedSpectrum sph in sphList)
        {
            if (sph.Score > maxScore)
            {
                maxScore = sph.Score;
            }
        }
        maxScore += 1.0;

        string resultFilename = filename + "." + charge + ".png";

        // Bitmap, Graphics, Font and SolidBrush all hold native GDI+ handles; they are released
        // deterministically here because this loop creates a new set for every charge.
        using (var bmp = new Bitmap(this.width, this.height))
        {
            using (Graphics g = Graphics.FromImage(bmp))
            using (var font = new Font("Times New Roman", 8))
            {
                g.FillRectangle(Brushes.White, new Rectangle(0, 0, this.width, this.height));

                double fontHeight = font.GetHeight(g);

                g.DrawString("Score", font, Brushes.Black, GetX(maxScore, maxScore) - 20, GetY(maxDeltaScore, 0) + 10);
                g.DrawString("DeltaScore", font, Brushes.Black, GetX(maxScore, 0) - 20, GetY(maxDeltaScore, maxDeltaScore) - (int)fontHeight - 5);

                DrawXScale(maxScore, maxDeltaScore, g, font);
                DrawYScale(maxScore, maxDeltaScore, g, font);

                // distinct colors actually used, passed on to DrawColorTitle
                var colors = new HashSet<Color>();
                foreach (IIdentifiedSpectrum sph in sphList)
                {
                    Color color = GetColor(sph);
                    colors.Add(color);

                    int x = GetX(maxScore, sph.Score);
                    var y = (int)(this.height - sph.DeltaScore / maxDeltaScore * this.height - this.top);

                    using (var dotBrush = new SolidBrush(color))
                    {
                        g.FillEllipse(dotBrush, new Rectangle(x - 1, y - 1, 3, 3));
                    }
                }

                DrawColorTitle(maxScore, maxDeltaScore, g, font, colors);
            }

            // The Graphics object is disposed (and thereby flushed) before the bitmap is saved.
            // The former g.Save() call only captured a GraphicsState that was never restored.
            bmp.Save(resultFilename, ImageFormat.Png);
        }

        result.Add(resultFilename);
    }

    return result;
}
// Splits a grouped result file into a ".light" and a ".heavy" file.
// The first two lines are copied to both outputs; the second one is also used as the peptide
// header. After that, lines starting with "$" are protein lines and all other lines are
// peptide lines. A peptide goes to the light or heavy list depending on which calculator's
// mass for its matched sequence lies within 0.1 of the experimental mass (light is tested
// first). Groups are emitted through WriteGroup; reading stops at the first blank line.
// If filename + ".fasta" exists, its sequences are loaded and ".light.fasta" / ".heavy.fasta"
// writers are passed to WriteGroup as well (otherwise they are null).
// Throws Exception when a peptide matches neither mass.
// Returns the light and heavy result paths, in that order.
public IEnumerable<string> Process(string filename)
{
    var proteins = new List<string>();
    var lightPeptides = new List<string>();
    var heavyPeptides = new List<string>();

    var seqMap = new Dictionary<string, Sequence>();
    if (File.Exists(filename + ".fasta"))
    {
        List<Sequence> seqs = SequenceUtils.Read(ff, filename + ".fasta");
        foreach (Sequence seq in seqs)
        {
            seqMap[seq.Name] = seq;
        }
    }

    string lightResult = filename + ".light";
    string heavyResult = filename + ".heavy";

    StreamWriter swLightFasta = null;
    StreamWriter swHeavyFasta = null;
    try
    {
        // Opened inside the try block so that the first writer is still released when
        // opening the second one fails.
        if (seqMap.Count > 0)
        {
            swLightFasta = new StreamWriter(lightResult + ".fasta");
            swHeavyFasta = new StreamWriter(heavyResult + ".fasta");
        }

        using (StreamWriter swLight = new StreamWriter(lightResult))
        using (StreamWriter swHeavy = new StreamWriter(heavyResult))
        using (StreamReader sr = new StreamReader(filename))
        {
            // first line: copied verbatim
            string line = sr.ReadLine();
            swLight.WriteLine(line);
            swHeavy.WriteLine(line);

            // second line: peptide header, copied and used to build the line parser
            line = sr.ReadLine();
            SequestPeptideTextFormat format = new SequestPeptideTextFormat(line);
            swLight.WriteLine(line);
            swHeavy.WriteLine(line);

            bool bIsProtein = true;
            bool lastGroupWritten = false;
            while ((line = sr.ReadLine()) != null)
            {
                if (line.Trim().Length == 0)
                {
                    // a blank line ends the group section
                    WriteGroup(proteins, lightPeptides, heavyPeptides, swLight, swHeavy, swLightFasta, swHeavyFasta, seqMap);
                    lastGroupWritten = true;
                    break;
                }

                if (line.StartsWith("$"))
                {
                    if (bIsProtein)
                    {
                        // consecutive protein lines belong to the same group
                        proteins.Add(line);
                        continue;
                    }

                    // a protein line after peptide lines starts the next group
                    WriteGroup(proteins, lightPeptides, heavyPeptides, swLight, swHeavy, swLightFasta, swHeavyFasta, seqMap);
                    proteins.Clear();
                    lightPeptides.Clear();
                    heavyPeptides.Clear();
                    proteins.Add(line);
                    bIsProtein = true;
                    continue;
                }

                bIsProtein = false;

                IIdentifiedSpectrum sph = format.PeptideFormat.ParseString(line);
                string matchedSeq = PeptideUtils.GetMatchedSequence(sph.Sequence);
                double lightMass = lightCalc.GetMass(matchedSeq);
                double heavyMass = heavyCalc.GetMass(matchedSeq);

                if (Math.Abs(lightMass - sph.ExperimentalMass) < 0.1)
                {
                    lightPeptides.Add(line);
                    continue;
                }

                if (Math.Abs(heavyMass - sph.ExperimentalMass) < 0.1)
                {
                    heavyPeptides.Add(line);
                    continue;
                }

                throw new Exception(MyConvert.Format("Mass={0:0.0000}; {1:0.0000}; {2:0.0000}", sph.ExperimentalMass, lightMass, heavyMass));
            }

            // The file ended without a terminating blank line: the group collected so far
            // would otherwise be dropped silently.
            if (!lastGroupWritten && proteins.Count > 0)
            {
                WriteGroup(proteins, lightPeptides, heavyPeptides, swLight, swHeavy, swLightFasta, swHeavyFasta, seqMap);
            }
        }
    }
    finally
    {
        try
        {
            if (swLightFasta != null)
            {
                swLightFasta.Dispose();
            }
        }
        finally
        {
            if (swHeavyFasta != null)
            {
                swHeavyFasta.Dispose();
            }
        }
    }

    return new List<string> { lightResult, heavyResult };
}
// Creates a merger for the given peptide files. The files are read and written with a
// default-constructed SequestPeptideTextFormat. The array is stored as passed, not copied.
public IdentifiedPeptidesMerger(string[] sourceFiles)
{
    this.sourceFiles = sourceFiles;
    this.format = new SequestPeptideTextFormat();
}
// Parses every path in options.PathNames into identified spectra and returns them all.
// The distiller is chosen per path:
//   - existing directory containing *.outs or *.outs.zip files -> SequestOutsDistiller
//   - any other existing directory                             -> SequestOutDirectoryDistiller
//   - path ending in ".xml"                                    -> CometSpectraDistiller
//   - path ending in ".msf"                                    -> MsfSpectraDistiller
//   - anything else is treated as a zip file                   -> SequestOutZipDistiller
// Spectra rejected by options.GetFilter() (when it is not null) are dropped; the rest are
// tagged with options.Name and the engine name. After every path a memory / finish-time
// estimate is written to the console (from the second path on).
// Throws UserTerminatedException when cancellation is pending.
protected override List<IIdentifiedSpectrum> DoParse()
{
    // NOTE(review): the parser is not used below; the call is kept in case it has side
    // effects or is relied on to fail early -- confirm before removing.
    IAccessNumberParser parser = options.Parent.Database.GetAccessNumberParser();

    var peptideFormat = new SequestPeptideTextFormat() { Progress = this.Progress };

    Progress.SetRange(0, options.PathNames.Count + 1);

    var result = new List<IIdentifiedSpectrum>();

    IFilter<IIdentifiedSpectrum> spectrumFilter = options.GetFilter();

    SequestOutDirectoryParser outDirParser;
    SequestOutsParser outsParser;
    SequestOutZipParser outZipParser;

    string modStr = "";
    if (options.SkipSamePeptideButDifferentModificationSite)
    {
        modStr = MyConvert.Format(".M{0:0.00}", options.MaxModificationDeltaCn);
        outsParser = new SequestOutsParser(true, options.MaxModificationDeltaCn);
        outDirParser = new SequestOutDirectoryParser(true, options.MaxModificationDeltaCn);
        outZipParser = new SequestOutZipParser(true, options.MaxModificationDeltaCn);
    }
    else
    {
        outsParser = new SequestOutsParser(true);
        outDirParser = new SequestOutDirectoryParser(true);
        outZipParser = new SequestOutZipParser(true);
    }

    outsParser.Progress = Progress;
    outDirParser.Progress = Progress;
    outZipParser.Progress = Progress;

    // baseline taken after the first path; later paths are extrapolated from it
    long afterFirstMemory = 0;
    DateTime afterFirstTime = DateTime.Now;

    int stepCount = 0;
    foreach (string pathName in options.PathNames)
    {
        stepCount++;

        if (Progress.IsCancellationPending())
        {
            throw new UserTerminatedException();
        }

        AbstractSequestSpectraDistiller distiller;
        string engine;

        if (Directory.Exists(pathName))
        {
            var dir = new DirectoryInfo(pathName);
            if (dir.GetFiles("*.outs").Length > 0 || dir.GetFiles("*.outs.zip").Length > 0)
            {
                distiller = new SequestOutsDistiller(outsParser, peptideFormat);
            }
            else
            {
                distiller = new SequestOutDirectoryDistiller(outDirParser, peptideFormat);
            }
            engine = "SEQUEST";
        }
        else if (pathName.ToLower().EndsWith(".xml"))
        {
            distiller = new CometSpectraDistiller(peptideFormat);
            engine = "COMET";
        }
        else if (pathName.ToLower().EndsWith(".msf"))
        {
            distiller = new MsfSpectraDistiller(peptideFormat);
            engine = "PD";
        }
        else //zipfile
        {
            // a zip holding *.out entries is parsed with the out-zip parser, otherwise
            // with the outs parser
            ISpectrumParser zipParser;
            if (ZipUtils.HasFile(pathName, m => m.ToLower().EndsWith(".out")))
            {
                zipParser = outZipParser;
            }
            else
            {
                zipParser = outsParser;
            }
            distiller = new SequestOutZipDistiller(zipParser, peptideFormat);
            engine = "SEQUEST";
        }

        distiller.Progress = this.Progress;

        List<IIdentifiedSpectrum> curPeptides = distiller.ParseSpectra(pathName, modStr, stepCount, options.PathNames.Count);

        if (null != spectrumFilter)
        {
            curPeptides.RemoveAll(m => !spectrumFilter.Accept(m));
        }

        curPeptides.ForEach(m =>
        {
            m.Tag = options.Name;
            m.Engine = engine;
        });

        result.AddRange(curPeptides);
        curPeptides = null;

        // collect before sampling the working set so the estimate reflects retained memory
        GC.Collect();
        GC.WaitForPendingFinalizers();

        // Process wraps an OS handle, so it is disposed after reading the working set.
        long currMemory;
        using (Process currentProcess = Process.GetCurrentProcess())
        {
            currMemory = currentProcess.WorkingSet64 / (1024 * 1024);
        }

        if (stepCount == 1)
        {
            afterFirstMemory = currMemory;
            afterFirstTime = DateTime.Now;
        }
        else
        {
            double averageCost = (double)(currMemory - afterFirstMemory) / (stepCount - 1);
            double estimatedCost = afterFirstMemory + averageCost * options.PathNames.Count;

            DateTime currTime = DateTime.Now;
            var averageTime = currTime.Subtract(afterFirstTime).TotalMinutes / (stepCount - 1);
            var finishTime = afterFirstTime.AddMinutes(averageTime * (options.PathNames.Count - 1));

            // finishTime is passed as a DateTime: a pre-formatted string would make the
            // {5:MM-dd HH:mm:ss} specifier a no-op.
            Console.WriteLine("{0}/{1}, cost {2}M, avg {3:0.0}M, need {4:0.0}M, will finish at {5:MM-dd HH:mm:ss}", stepCount, options.PathNames.Count, currMemory, averageCost, estimatedCost, finishTime);
        }
    }

    return result;
}
// Reduces a peptide file to one spectrum per (Sequence, Charge) and writes it to
// fileName + ".single".
// 1. "Positive" spectra: sequence not ambiguous (per IsAmbigiousSequence) and either no
//    modification-site candidate, or exactly one whose matched sequence equals the sequence.
// 2. Of the remaining spectra, those are kept whose sequence -- and whose matched sequence
//    (taken from the first candidate when the sequence is ambiguous) -- is not among the
//    matched sequences of the positives.
// 3. Within each Sequence + "_" + Charge group the highest-scoring spectrum is kept.
public override IEnumerable<string> Process(string fileName)
{
    var peptideFormat = new SequestPeptideTextFormat();
    List<IIdentifiedSpectrum> spectra = peptideFormat.ReadFromFile(fileName);

    List<IIdentifiedSpectrum> positives = spectra.FindAll(m =>
    {
        if (IsAmbigiousSequence(m.Sequence))
        {
            return false;
        }

        switch (m.DiffModificationSiteCandidates.Count)
        {
            case 0:
                return true;
            case 1:
                return PeptideUtils.GetMatchedSequence(m.DiffModificationSiteCandidates[0].Sequence).Equals(m.Sequence);
            default:
                return false;
        }
    });

    // matched sequences of all positives
    var positiveSeqs = new HashSet<string>();
    foreach (IIdentifiedSpectrum positive in positives)
    {
        positiveSeqs.Add(PeptideUtils.GetMatchedSequence(positive.Sequence));
    }

    // Collected into a separate list first: positives must not change while the
    // Except query over it is being enumerated.
    var keptAmbigious = new List<IIdentifiedSpectrum>();
    foreach (IIdentifiedSpectrum other in spectra.Except(positives))
    {
        if (positiveSeqs.Contains(other.Sequence))
        {
            continue;
        }

        // NOTE(review): assumes an ambiguous sequence always carries at least one
        // modification-site candidate; index 0 throws otherwise -- confirm.
        string sequenceToMatch = IsAmbigiousSequence(other.Sequence)
            ? other.DiffModificationSiteCandidates[0].Sequence
            : other.Sequence;

        if (!positiveSeqs.Contains(PeptideUtils.GetMatchedSequence(sequenceToMatch)))
        {
            keptAmbigious.Add(other);
        }
    }

    positives.AddRange(keptAmbigious);

    // best-scoring spectrum of every sequence/charge combination
    List<IIdentifiedSpectrum> final = positives
        .GroupBy(p => p.Sequence + "_" + p.Charge)
        .Select(bin => bin.OrderByDescending(m => m.Score).First())
        .ToList();

    string singleFileName = fileName + ".single";
    peptideFormat.WriteToFile(singleFileName, final);

    return new string[] { singleFileName };
}