/// <summary>
/// Quantifies the given spectra one raw file at a time using a
/// <c>SilacQuantificationFileBuilder</c>.
/// </summary>
/// <param name="detailDir">Directory passed through to the builder's <c>Quantify</c> call.</param>
/// <param name="querys">Spectra to quantify; grouped by <c>IdentifiedSpectrumUtils.GetRawPeptideMap</c>.</param>
/// <exception cref="UserTerminatedException">
/// Thrown when the progress callback reports a pending cancellation.
/// </exception>
private void DoQuantify(string detailDir, List<IIdentifiedSpectrum> querys)
{
    Dictionary<string, List<IIdentifiedSpectrum>> peptidesByExperimental = IdentifiedSpectrumUtils.GetRawPeptideMap(querys);

    int processed = 0;
    foreach (KeyValuePair<string, List<IIdentifiedSpectrum>> entry in peptidesByExperimental)
    {
        processed++;

        if (Progress.IsCancellationPending())
        {
            throw new UserTerminatedException();
        }

        string rawFilename = option.RawFormat.GetRawFile(option.RawDir, entry.Key);
        Progress.SetMessage(MyConvert.Format("{0}/{1} : Processing {2} ...", processed, peptidesByExperimental.Keys.Count, rawFilename));

        // A fresh builder per raw file, configured from this instance.
        SilacQuantificationFileBuilder fileBuilder = new SilacQuantificationFileBuilder(option);
        fileBuilder.Progress = this.Progress;
        fileBuilder.SoftwareVersion = this.SoftwareVersion;
        fileBuilder.MinScanNumber = this.MinScanNumber;

        // Order by the IsExtendedIdentification flag (false sorts before true).
        List<IIdentifiedSpectrum> filePeptides = entry.Value;
        filePeptides.Sort((left, right) => left.IsExtendedIdentification().CompareTo(right.IsExtendedIdentification()));

        fileBuilder.Quantify(rawFilename, filePeptides, detailDir);
    }
}
/// <summary>
/// Reads a peptide text file, groups its spectra into an identified result and
/// rewrites the description of the first entry of every group using the
/// access-number reference map read from the fasta file.
/// </summary>
/// <param name="fileName">Peptide file to read with <c>MascotPeptideTextFormat</c>.</param>
/// <returns>The grouped result with rewritten descriptions.</returns>
protected override IIdentifiedResult GetIdentifiedResult(string fileName)
{
    format = new MascotPeptideTextFormat();
    var spectra = format.ReadFromFile(fileName);

    // Site-level analysis groups by peptide, otherwise by unique peptide.
    IIdentifiedResult result = isSiteLevel
        ? (IIdentifiedResult)IdentifiedSpectrumUtils.BuildGroupByPeptide(spectra)
        : IdentifiedSpectrumUtils.BuildGroupByUniquePeptide(spectra);

    var referenceByAccessNumber = SequenceUtils.ReadAccessNumberReferenceMap(new FastaFormat(), this.fastaFile, this.parser);

    foreach (var group in result)
    {
        // The description holds '/'-separated names; each is mapped to its
        // reference through the access number extracted by the parser.
        var references = group[0].Description
            .Split('/')
            .Select(name => referenceByAccessNumber[parser.GetValue(name)])
            .ToList();

        group[0].Description = references.Merge(" ! ");
    }

    return result;
}
/// <summary>
/// Gets the list of spectra that remain after filtering and after removing
/// identical spectra that were assumed to have different charges.
/// </summary>
/// <returns>
/// The list returned by <c>GetOptimalSpectra</c> after it has been passed to
/// <c>IdentifiedSpectrumUtils.FilterSameSpectrumWithDifferentCharge</c>.
/// </returns>
public List<IIdentifiedSpectrum> GetUnconflictedOptimalSpectra()
{
    List<IIdentifiedSpectrum> optimalSpectra = GetOptimalSpectra();

    // The filter works on the list passed in; the same list is returned.
    IdentifiedSpectrumUtils.FilterSameSpectrumWithDifferentCharge(optimalSpectra);

    return optimalSpectra;
}
/// <summary>
/// Chooses the precursor PPM tolerance for a set of spectra.
/// </summary>
/// <param name="spectra">Spectra from which the delta-precursor-PPM accumulator is built.</param>
/// <returns>
/// Three times the accumulator's standard deviation when there are at least
/// five spectra and that deviation is a finite number; otherwise
/// <c>option.PPMTolerance</c>.
/// </returns>
private double GetPrecursorPPM(List<IIdentifiedSpectrum> spectra)
{
    double configuredTolerance = option.PPMTolerance;

    // Fewer than five spectra: fall back to the configured tolerance.
    if (spectra.Count < 5)
    {
        return configuredTolerance;
    }

    var accumulator = IdentifiedSpectrumUtils.GetDeltaPrecursorPPMAccumulator(spectra);
    var stdDev = accumulator.StdDev;

    if (double.IsInfinity(stdDev) || double.IsNaN(stdDev))
    {
        return configuredTolerance;
    }

    return stdDev * 3;
}
/// <summary>
/// Writes the spectra to <paramref name="filename"/> (header line plus one line
/// per spectrum), optionally followed by a summary section, and then writes
/// the companion "<paramref name="filename"/>.enzyme" file.
/// </summary>
/// <param name="filename">Target peptide file.</param>
/// <param name="t">Spectra to write.</param>
public override void WriteToFile(string filename, List<IIdentifiedSpectrum> t)
{
    using (var writer = new StreamWriter(filename))
    {
        writer.WriteLine(PeptideFormat.GetHeader());
        foreach (IIdentifiedSpectrum spectrum in t)
        {
            writer.WriteLine(PeptideFormat.GetString(spectrum));
        }

        bool exportSummary = !NotExportSummary;
        if (exportSummary)
        {
            writer.WriteLine();
            writer.WriteLine("----- summary -----");

            var totalCount = IdentifiedSpectrumUtils.GetSpectrumCount(t);
            var totalUniqueCount = IdentifiedSpectrumUtils.GetUniquePeptideCount(t);
            writer.WriteLine("Total spectra: " + totalCount);
            writer.WriteLine("Total peptides: " + totalUniqueCount);

            // The per-tag table is only written when more than one distinct tag exists.
            var distinctTags = t.Select(s => s.Tag).Distinct().ToList();
            if (distinctTags.Count > 1)
            {
                distinctTags.Sort();

                writer.WriteLine();
                writer.WriteLine("Tag\tSpectra\tPeptides");
                writer.WriteLine("All\t{0}\t{1}", totalCount, totalUniqueCount);

                foreach (var tag in distinctTags)
                {
                    var taggedSpectra = t.Where(s => s.Tag == tag);
                    var taggedCount = IdentifiedSpectrumUtils.GetSpectrumCount(taggedSpectra);
                    var taggedUniqueCount = IdentifiedSpectrumUtils.GetUniquePeptideCount(taggedSpectra);
                    writer.WriteLine("{0}\t{1}\t{2}", tag, taggedCount, taggedUniqueCount);
                }
            }
        }
    }

    string enzymeFile = filename + ".enzyme";
    new ProteaseFile().Write(enzymeFile, t);
}
/// <summary>
/// Reads a peptide file, groups the peptides by raw file name and runs the
/// single- or multiple-source extraction for every raw file in sorted order.
/// </summary>
/// <param name="peptidesFilename">Peptide file read with <c>SequestPeptideTextFormat</c>.</param>
/// <returns>Always an empty list.</returns>
/// <exception cref="UserTerminatedException">Thrown when cancellation is pending.</exception>
/// <exception cref="Exception">Thrown when <c>filePathMap</c> has no entry for a raw file.</exception>
public override IEnumerable<string> Process(string peptidesFilename)
{
    List<IIdentifiedSpectrum> peptides = new SequestPeptideTextFormat().ReadFromFile(peptidesFilename);
    Dictionary<string, List<IIdentifiedSpectrum>> rawPeptideMap = IdentifiedSpectrumUtils.GetRawPeptideMap(peptides);

    int totalRaws = rawPeptideMap.Count;
    Progress.SetRange(1, 1, totalRaws);

    List<string> sortedRaws = new List<string>(rawPeptideMap.Keys);
    sortedRaws.Sort();

    for (int index = 0; index < sortedRaws.Count; index++)
    {
        string raw = sortedRaws[index];

        if (Progress.IsCancellationPending())
        {
            throw new UserTerminatedException();
        }

        // The position is zero-based, the message counts from one.
        Progress.SetPosition(1, index);

        List<IIdentifiedSpectrum> rawPeptides = rawPeptideMap[raw];
        Progress.SetMessage(1, MyConvert.Format("{0}/{1}, Extracting {2} for {3} peptides ...", index + 1, totalRaws, raw, rawPeptides.Count));

        if (!filePathMap.ContainsKey(raw))
        {
            throw new Exception("Cannot find raw dtas/outs file for " + raw);
        }

        var sourcePaths = filePathMap[raw];
        if (sourcePaths.Count == 1)
        {
            ExtractSingleRaw(raw, sourcePaths[0], rawPeptides);
        }
        else
        {
            ExtractMultipleRaw(raw, sourcePaths, rawPeptides);
        }
    }

    Progress.SetPosition(1, sortedRaws.Count);

    return new List<string>();
}
/// <summary>
/// Post-processes freshly parsed spectra in place: removes ambiguous
/// assignments, optionally filters by dynamic precursor tolerance and by
/// contamination name, optionally merges results from different search
/// parameters, resets protein names through the access number parser and
/// finally sorts the peptides of every spectrum.
/// </summary>
/// <param name="result">Spectra to process; the list itself is modified.</param>
/// <exception cref="InvalidOperationException">
/// Thrown when <c>Options.Parent</c> is null at the point where the access
/// number parser is required.
/// </exception>
protected void DoAfterParse(List<IIdentifiedSpectrum> result)
{
    Progress.SetMessage("Remove same spectrum matched with different peptides ...");
    IdentifiedSpectrumUtils.RemoveSpectrumWithAmbigiousAssignment(result);

    if (Options.FilterByPrecursor && Options.FilterByPrecursorDynamicTolerance)
    {
        Progress.SetMessage(MyConvert.Format("Filtering by precursor mass tolerance ...", result.Count));

        // The dynamic filter is built from the high-confidence peptides only.
        List<IIdentifiedSpectrum> highconfidents = Options.SearchEngine.GetFactory().GetHighConfidentPeptides(result);
        DynamicPrecursorPPMFilter precursorFilter = new DynamicPrecursorPPMFilter(Options.PrecursorPPMTolerance, Options.FilterByPrecursorIsotopic);
        var spectrumFilter = precursorFilter.GetFilter(highconfidents);

        result.RemoveAll(m => !spectrumFilter.Accept(m));
    }

    if (Options.Parent != null && Options.Parent.Database.RemoveContamination)
    {
        var contaminationFilter = Options.Parent.Database.GetContaminationNameFilter();
        if (null != contaminationFilter)
        {
            Progress.SetMessage(MyConvert.Format("Filtering by contamination name : {0}", contaminationFilter.ToString()));

            // Spectra accepted by the contamination filter are the ones removed.
            result.RemoveAll(m => contaminationFilter.Accept(m));
        }
    }

    if (Options.SearchedByDifferentParameters)
    {
        Progress.SetMessage(MyConvert.Format("Merging same spectrum from different search parameters ...", result.Count));
        IdentifiedSpectrumUtils.KeepTopPeptideFromSameEngineDifferentParameters(result, Options.ScoreFunction);

        // Shrink the list and force a collection after the merge.
        result.TrimExcess();
        GC.Collect();
        GC.WaitForPendingFinalizers();

        Progress.SetMessage(MyConvert.Format("Total {0} peptides passed minimum tolerance criteria.", result.Count));
    }

    Progress.SetMessage("Parsing protein access number ...");

    // The contamination step above tolerates a null parent, but the access
    // number parser cannot be obtained without one. Fail with an explicit
    // message instead of a NullReferenceException.
    var parentOptions = Options.Parent;
    if (parentOptions == null)
    {
        throw new InvalidOperationException("Options.Parent is required to obtain the protein access number parser.");
    }

    IdentifiedSpectrumUtils.ResetProteinByAccessNumberParser(result, parentOptions.Database.GetAccessNumberParser());
    Progress.SetMessage("Parsing protein access number finished.");

    Progress.SetMessage("Sorting peptide sequence ...");
    result.ForEach(m => m.SortPeptides());
    Progress.SetMessage("Sorting peptide sequence finished.");
}
/// <summary>
/// Writes three files: the peptide file itself (header plus one line per
/// spectrum), a ".summary" file with the spectrum and unique peptide counts,
/// and an ".enzyme" file produced by <c>ProteaseFile</c>.
/// </summary>
/// <param name="filename">Target peptide file; companion files append their suffix to it.</param>
/// <param name="spectra">Spectra to write.</param>
public void WriteToFile(string filename, List<IIdentifiedSpectrum> spectra)
{
    using (var peptideWriter = new StreamWriter(filename))
    {
        peptideWriter.WriteLine(PeptideFormat.GetHeader());
        foreach (IIdentifiedSpectrum spectrum in spectra)
        {
            peptideWriter.WriteLine(PeptideFormat.GetString(spectrum));
        }
    }

    string summaryFile = filename + ".summary";
    using (var summaryWriter = new StreamWriter(summaryFile))
    {
        var spectrumCount = IdentifiedSpectrumUtils.GetSpectrumCount(spectra);
        var uniquePeptideCount = IdentifiedSpectrumUtils.GetUniquePeptideCount(spectra);
        WriteSummary(summaryWriter, spectrumCount, uniquePeptideCount);
    }

    new ProteaseFile().Write(filename + ".enzyme", spectra);
}
/// <summary>
/// Merges the spectra of all source files into one target file. The header is
/// written once, using the peptide format available after the first source
/// file has been read, and a summary is appended at the end.
/// </summary>
/// <param name="targetFilename">File that receives the merged content.</param>
/// <returns>A single-element array containing <paramref name="targetFilename"/>.</returns>
public override IEnumerable<string> Process(string targetFilename)
{
    using (StreamWriter writer = new StreamWriter(targetFilename))
    {
        var uniquePeptides = new HashSet<string>();
        int spectrumTotal = 0;
        LineFormat<IIdentifiedSpectrum> lineFormat = null;

        Progress.SetRange(1, sourceFiles.Length);

        int filesDone = 0;
        foreach (string sourceFile in sourceFiles)
        {
            Progress.SetMessage("Processing " + sourceFile + " ...");

            var spectra = format.ReadFromFile(sourceFile);
            spectrumTotal += spectra.Count;
            uniquePeptides.UnionWith(IdentifiedSpectrumUtils.GetUniquePeptide(spectra));

            // The line format is taken once, right after the first file is read.
            bool isFirstFile = filesDone == 0;
            if (isFirstFile)
            {
                lineFormat = format.PeptideFormat;
                writer.WriteLine(lineFormat.GetHeader());
            }

            foreach (var spectrum in spectra)
            {
                writer.WriteLine(lineFormat.GetString(spectrum));
            }

            filesDone++;
            Progress.SetPosition(filesDone);
        }

        format.WriteSummary(writer, spectrumTotal, uniquePeptides.Count);
        Progress.End();
    }

    return new[] { targetFilename };
}
/// <summary>
/// Reads a peptide file and renders one Score / DeltaScore scatter plot per
/// charge state, saved as "&lt;filename&gt;.&lt;charge&gt;.png".
/// </summary>
/// <param name="filename">Peptide file read with <c>SequestPeptideTextFormat</c>.</param>
/// <returns>The names of the PNG files that were written.</returns>
public IEnumerable<string> Process(string filename)
{
    List<IIdentifiedSpectrum> sphs = new SequestPeptideTextFormat().ReadFromFile(filename);
    Dictionary<int, List<IIdentifiedSpectrum>> chargeSphMap = IdentifiedSpectrumUtils.GetChargeMap(sphs);

    var result = new List<string>();
    foreach (KeyValuePair<int, List<IIdentifiedSpectrum>> entry in chargeSphMap)
    {
        int charge = entry.Key;
        List<IIdentifiedSpectrum> sphList = entry.Value;

        // The delta score axis is fixed at 1.0; the score axis runs one unit
        // past the largest score in this charge state (never below 1.0).
        double maxDeltaScore = 1.0;
        double maxScore = 0.0;
        foreach (IIdentifiedSpectrum sph in sphList)
        {
            if (sph.Score > maxScore)
            {
                maxScore = sph.Score;
            }
        }
        maxScore += 1.0;

        string resultFilename = filename + "." + charge + ".png";

        // Bitmap, Graphics, Font and SolidBrush are all IDisposable; they are
        // released per image so the handles do not accumulate across charges.
        using (var bmp = new Bitmap(this.width, this.height))
        {
            using (Graphics g = Graphics.FromImage(bmp))
            using (var font = new Font("Times New Roman", 8))
            {
                g.FillRectangle(Brushes.White, new Rectangle(0, 0, this.width, this.height));

                double fontHeight = font.GetHeight(g);
                g.DrawString("Score", font, Brushes.Black, GetX(maxScore, maxScore) - 20, GetY(maxDeltaScore, 0) + 10);
                g.DrawString("DeltaScore", font, Brushes.Black, GetX(maxScore, 0) - 20, GetY(maxDeltaScore, maxDeltaScore) - (int)fontHeight - 5);

                DrawXScale(maxScore, maxDeltaScore, g, font);
                DrawYScale(maxScore, maxDeltaScore, g, font);

                // Every colour that was used is collected for the legend.
                var colors = new HashSet<Color>();
                foreach (IIdentifiedSpectrum sph in sphList)
                {
                    Color color = GetColor(sph);
                    colors.Add(color);

                    int x = GetX(maxScore, sph.Score);
                    var y = (int)(this.height - sph.DeltaScore / maxDeltaScore * this.height - this.top);

                    using (var pointBrush = new SolidBrush(color))
                    {
                        g.FillEllipse(pointBrush, new Rectangle(x - 1, y - 1, 3, 3));
                    }
                }

                DrawColorTitle(maxScore, maxDeltaScore, g, font, colors);
            }

            // Save after the Graphics object has been disposed, so all
            // drawing has been completed on the bitmap.
            bmp.Save(resultFilename, ImageFormat.Png);
        }

        result.Add(resultFilename);
    }

    return result;
}
/// <summary>
/// Collects the identified spectra of every data file in
/// <c>options.PathNames</c>. When a peptide file already exists for a data
/// file it is read instead of re-parsing the search result; otherwise the
/// search result is parsed and the peptide file is written for later runs.
/// </summary>
/// <returns>
/// All spectra that pass <c>options.GetFilter()</c> (when a filter is set),
/// tagged with the dataset name and the search engine.
/// </returns>
/// <exception cref="UserTerminatedException">Thrown when cancellation is pending.</exception>
/// <exception cref="Exception">
/// Thrown when a non-empty peptide file carries no protein information and no
/// companion ".protein" file exists.
/// </exception>
protected override List<IIdentifiedSpectrum> DoParse()
{
    var peptideFormat = GetPeptideFormat();

    Progress.SetRange(0, options.PathNames.Count + 1);

    var result = new List<IIdentifiedSpectrum>();
    var spectrumFilter = options.GetFilter();

    int stepCount = 0;
    foreach (string dataFile in options.PathNames)
    {
        stepCount++;

        if (Progress.IsCancellationPending())
        {
            throw new UserTerminatedException();
        }

        var dataParser = GetParser(dataFile);
        dataParser.Progress = this.Progress;

        var titleOptions = options as AbstractTitleDatasetOptions;
        if (titleOptions != null)
        {
            dataParser.TitleParser = titleOptions.GetTitleParser();
        }

        List<IIdentifiedSpectrum> curPeptides;
        string peptideFilename = GetPeptideFile(dataFile);
        if (File.Exists(peptideFilename))
        {
            Progress.SetMessage(MyConvert.Format("{0}/{1} : Reading peptides file {2}", stepCount, options.PathNames.Count, Path.GetFileName(peptideFilename)));
            curPeptides = peptideFormat.ReadFromFile(peptideFilename);

            // Enumerable.All is true for an empty list, so an empty peptide
            // file must be excluded here; otherwise it would be reported as
            // "no protein information" although there is nothing to fill.
            if (curPeptides.Count > 0 && curPeptides.All(m => m.Proteins.Count == 0))
            {
                var proteinFile = peptideFilename + ".protein";
                if (File.Exists(proteinFile))
                {
                    IdentifiedSpectrumUtils.FillProteinInformation(curPeptides, proteinFile);
                }
                else
                {
                    throw new Exception(string.Format("No protein information in peptides file {0} and no corresponding protein file exists {1}", peptideFilename, proteinFile));
                }
            }

            // No spectrum names its experimental: fall back to the data file name.
            if (curPeptides.All(m => string.IsNullOrEmpty(m.Query.FileScan.Experimental)))
            {
                curPeptides.ForEach(m => m.Query.FileScan.Experimental = Path.GetFileNameWithoutExtension(dataFile));
            }

            // No spectrum has a first scan: take it from the Id when every Id
            // is a positive integer, otherwise from the query id when every
            // query id is positive.
            if (curPeptides.All(m => m.Query.FileScan.FirstScan == 0))
            {
                if (curPeptides.All(m =>
                {
                    int value;
                    return int.TryParse(m.Id, out value) && value > 0;
                }))
                {
                    curPeptides.ForEach(m => m.Query.FileScan.FirstScan = int.Parse(m.Id));
                }
                else if (curPeptides.All(m => m.Query.QueryId > 0))
                {
                    curPeptides.ForEach(m => m.Query.FileScan.FirstScan = m.Query.QueryId);
                }
            }
        }
        else
        {
            Progress.SetMessage(MyConvert.Format("{0}/{1} : Parsing {2} file {3}", stepCount, options.PathNames.Count, dataParser.Engine, dataFile));
            curPeptides = dataParser.ReadFromFile(dataFile);

            // Written before filtering, so the peptide file holds the full parse.
            peptideFormat.WriteToFile(peptideFilename, curPeptides);
        }

        if (null != spectrumFilter)
        {
            curPeptides.RemoveAll(m => !spectrumFilter.Accept(m));
        }

        curPeptides.ForEach(m =>
        {
            m.Tag = options.Name;
            m.Engine = options.SearchEngine.ToString();
        });

        result.AddRange(curPeptides);

        // Drop the per-file list and force a collection before the next file.
        curPeptides = null;
        GC.Collect();
        GC.WaitForPendingFinalizers();
    }

    return result;
}
/// <summary>
/// Rebuilds this dataset list from the given options: parses every dataset,
/// optionally assigns decoys and filters each dataset by peptide FDR, then
/// (for more than one dataset) resolves results from the same engine,
/// determines overlaps between search engines and applies the combined FDR
/// filter.
/// </summary>
/// <param name="dsOptions">Dataset options; one dataset is built per entry.</param>
/// <param name="progress">Receives progress messages and is handed to every builder.</param>
/// <param name="paramFile">
/// Parameter file name; its extension is replaced by ".conflicted.peps" when
/// conflicting spectra are written.
/// </param>
/// <exception cref="Exception">
/// Thrown when FDR filtering is enabled and a dataset contains no decoy hit.
/// </exception>
public void InitFromOptions(DatasetListOptions dsOptions, IProgressCallback progress, string paramFile)
{
    this.Clear();

    this.conflictFunc = dsOptions.Options.GetConflictFunc();

    var fdrOptions = dsOptions.Options.FalseDiscoveryRate;
    this.fdrCalc = fdrOptions.GetFalseDiscoveryRateCalculator();

    IFilter<IIdentifiedSpectrum> decoyFilter = null;
    if (fdrOptions.FilterByFdr)
    {
        decoyFilter = dsOptions.Options.GetDecoySpectrumFilter();
    }

    this.dsOptions = dsOptions;

    // Baseline taken after the first dataset; later datasets are measured
    // against it to estimate memory need and finish time.
    long afterFirstMemory = 0;
    DateTime afterFirstTime = DateTime.Now;
    var totalCount = dsOptions.Sum(l => l.PathNames.Count);
    var usedCount = 0;

    for (int i = 0; i < dsOptions.Count; i++)
    {
        var datasetOptions = dsOptions[i];
        var builder = datasetOptions.GetBuilder();
        builder.Progress = progress;

        Dataset ds = new Dataset(datasetOptions);

        // First, get all spectra that passed the fixed filter criteria.
        ds.Spectra = builder.ParseFromSearchResult();
        ds.PSMPassedFixedCriteriaCount = ds.Spectra.Count;

        if (fdrOptions.FilterByFdr)
        {
            // Mark every spectrum as coming from the decoy database or not.
            progress.SetMessage("Assigning decoy information...");
            DecoyPeptideBuilder.AssignDecoy(ds.Spectra, decoyFilter);
            var decoyCount = ds.Spectra.Count(l => l.FromDecoy);
            if (decoyCount == 0)
            {
                throw new Exception(string.Format("No decoy protein found at dataset {0}, make sure the protein access number parser and the decoy pattern are correctly defined!", datasetOptions.Name));
            }

            progress.SetMessage("{0} decoys out of {1} hits found", decoyCount, ds.Spectra.Count);

            ds.BuildSpectrumBin();
            ds.CalculateCurrentFdr();
            ds.PushCurrentOptimalResults(string.Format("Before maximum peptide fdr {0}", fdrOptions.MaxPeptideFdr));

            progress.SetMessage("Filtering by maximum peptide fdr {0} ...", fdrOptions.MaxPeptideFdr);
            ds.FilterByFdr(fdrOptions.MaxPeptideFdr);
            ds.Spectra = ds.GetUnconflictedOptimalSpectra();
            ds.BuildSpectrumBin();
            ds.CalculateCurrentFdr();
            ds.PushCurrentOptimalResults(string.Format("After maximum peptide fdr {0}", fdrOptions.MaxPeptideFdr));
        }

        this.Add(ds);

        if (i == 0)
        {
            afterFirstMemory = GetWorkingSetMegabytes();
            afterFirstTime = DateTime.Now;
        }
        else
        {
            usedCount += datasetOptions.PathNames.Count;

            // The averages divide by usedCount. With no files processed since
            // the baseline they would be infinite or NaN, which
            // DateTime.AddMinutes rejects, so the estimate is skipped.
            if (usedCount > 0)
            {
                long currMemory = GetWorkingSetMegabytes();
                double averageCost = (double)(currMemory - afterFirstMemory) / usedCount;
                double estimatedCost = afterFirstMemory + averageCost * totalCount;

                DateTime currTime = DateTime.Now;
                var averageTime = currTime.Subtract(afterFirstTime).TotalMinutes / usedCount;
                var finishTime = afterFirstTime.AddMinutes(averageTime * (totalCount - dsOptions[0].PathNames.Count));

                progress.SetMessage("{0}/{1} datasets, cost {2}M, avg {3:0.0}M, need {4:0.0}M, will finish at {5:MM-dd HH:mm:ss}", (i + 1), dsOptions.Count, currMemory, averageCost, estimatedCost, finishTime);
            }
        }
    }

    // Initialize the experimental list of every dataset.
    this.ForEach(m => m.InitExperimentals());

    if (dsOptions.Count > 1)
    {
        if (dsOptions.Options.KeepTopPeptideFromSameEngineButDifferentSearchParameters)
        {
            // Merge/remove results obtained from the same search engine with different parameters.
            ProcessDatasetFromSameEngine(progress, (peptides, score) => IdentifiedSpectrumUtils.KeepTopPeptideFromSameEngineDifferentParameters(peptides, score), false);
        }
        else
        {
            ProcessDatasetFromSameEngine(progress, (peptides, score) => IdentifiedSpectrumUtils.KeepUnconflictPeptidesFromSameEngineDifferentParameters(peptides, score), true);
        }

        // Initialize the overlap relation between datasets searched by different engines.
        this.OverlapBySearchEngine = FindOverlap((m1, m2) => m1.Options.SearchEngine != m2.Options.SearchEngine);

        // Initialize the datasets that do not overlap with any other.
        var overlaps = new HashSet<Dataset>(from overlapGroup in OverlapBySearchEngine
                                            from dataset in overlapGroup
                                            select dataset);
        this.NoOverlaps = this.Where(m => !overlaps.Contains(m)).ToList();

        if (OverlapBySearchEngine.Count > 0 && fdrOptions.FilterByFdr)
        {
            // Filter by the maximum FDR.
            progress.SetMessage("Filtering PSMs by maximum fdr {0}, considering multiple engine overlap...", fdrOptions.MaxPeptideFdr);
            var realFdr = this.FilterByFdr(fdrOptions.MaxPeptideFdr);
            if (realFdr.ConflictSpectra.Count > 0)
            {
                new MascotPeptideTextFormat(UniformHeader.PEPTIDE_HEADER).WriteToFile(Path.ChangeExtension(paramFile, ".conflicted.peps"), realFdr.ConflictSpectra);
            }

            // Keep the filtered result as each dataset's spectra, for use in later iterations.
            this.ForEach(m =>
            {
                m.Spectra = m.GetUnconflictedOptimalSpectra();
            });
        }
    }
    else
    {
        this.NoOverlaps = new List<Dataset>(this);
        this.OverlapBySearchEngine = new List<List<Dataset>>();
    }
}

/// <summary>
/// Returns the working set of the current process in whole megabytes and
/// disposes the Process object used to read it.
/// </summary>
private static long GetWorkingSetMegabytes()
{
    using (var currentProcess = Process.GetCurrentProcess())
    {
        return currentProcess.WorkingSet64 / (1024 * 1024);
    }
}