/// <summary>
/// Loads the build options from <paramref name="parameterFile"/>, initializes
/// <c>BuildResult</c> from the enabled datasets and filters the PSMs with a single
/// protein-level FDR pass. A report is written to the file whose name is obtained by
/// <c>FileUtils.ChangeExtension(parameterFile, ".optimal")</c>.
/// </summary>
/// <param name="parameterFile">Path of the parameter file the options are read from.</param>
/// <returns>
/// A result holding the accepted spectra together with the peptide FDR and protein FDR
/// reported by the protein FDR calculator.
/// </returns>
public IdentifiedSpectrumBuilderResult Build(string parameterFile)
{
    Options = new BuildSummaryOptions(parameterFile);
    Options.DatasetList.RemoveDisabled();

    var fdrCalc = Options.FalseDiscoveryRate.GetFalseDiscoveryRateCalculator();

    BuildResult = new DatasetList();

    // Initialize from the configuration.
    BuildResult.InitFromOptions(Options.DatasetList, this.Progress, parameterFile);

    // NOTE(review): the returned count is not used anywhere in this method. The call is
    // retained because its side effects (if any) are not visible here - confirm and remove.
    BuildResult.GetOptimalSpectrumCount();

    string optimalResultFile = FileUtils.ChangeExtension(parameterFile, ".optimal");

    using (var sw = new StreamWriter(optimalResultFile))
    {
        new OptimalFileTextWriter().WriteToStream(sw, BuildResult);

        UniformProteinFdrOptimalResultCalculator proteinCalc = new UniformProteinFdrOptimalResultCalculator(fdrCalc, Options.GetDecoyGroupFilter())
        {
            Progress = this.Progress
        };

        Progress.SetMessage("Filtering PSMs by protein fdr {0}, using peptide fdr {1}...", Options.FalseDiscoveryRate.FdrValue, Options.FalseDiscoveryRate.MaxPeptideFdr);

        // The one-hit-wonder filter is only applied when enabled in the options;
        // otherwise no group filter (null) is handed to the calculator.
        var groupFilter = Options.FalseDiscoveryRate.FilterOneHitWonder
            ? new IdentifiedProteinGroupSingleWonderPeptideCountFilter(Options.FalseDiscoveryRate.MinOneHitWonderPeptideCount)
            : null;

        var ret = proteinCalc.GetOptimalResultForGroupFilter(BuildResult, Options.FalseDiscoveryRate.MaxPeptideFdr, Options.FalseDiscoveryRate.FdrValue, groupFilter);

        // Keep only the PSMs contained in proteins that passed the filter.
        BuildResult.KeepOptimalResultInSetOnly(ret.AcceptedSpectra);

        // Explicit collection after the spectra have been discarded above.
        GC.Collect();
        GC.WaitForPendingFinalizers();

        sw.WriteLine("After SimpleProteinFDR filter {0} with condition {1}, required peptide fdr = {2} ", ret.ProteinFdr, ret.ProteinCondition, ret.PeptideFdr);

        BuildResult.ForEach(ds =>
        {
            sw.WriteLine("Dataset {0}", ds.Options.Name);
            OptimalResultConditionUtils.WriteSpectrumBin(sw, ds, f1, f2);
        });

        return new IdentifiedSpectrumBuilderResult()
        {
            Spectra = ret.AcceptedSpectra.ToList(),
            PeptideFDR = ret.PeptideFdr,
            ProteinFDR = ret.ProteinFdr
        };
    }
}
/// <summary>
/// Initializes <c>BuildResult</c> from the configured datasets, writes it to the file whose
/// name is obtained by <c>FileUtils.ChangeExtension(parameterFile, ".optimal")</c> and
/// returns the spectra of <c>BuildResult</c>.
/// </summary>
/// <param name="parameterFile">Path of the parameter file; also used to derive the report file name.</param>
/// <returns>
/// A result holding the spectra of <c>BuildResult</c> and the configured maximum peptide FDR.
/// </returns>
protected override IdentifiedSpectrumBuilderResult DoBuild(string parameterFile)
{
    BuildResult = new DatasetList();

    // Initialize from the configuration.
    BuildResult.InitFromOptions(Options.DatasetList, this.Progress, parameterFile);

    string optimalFile = FileUtils.ChangeExtension(parameterFile, ".optimal");
    new OptimalFileTextWriter().WriteToFile(optimalFile, BuildResult);

    Progress.SetMessage("Peptide fdr filter done ...");

    return new IdentifiedSpectrumBuilderResult()
    {
        Spectra = BuildResult.GetSpectra(),
        PeptideFDR = Options.FalseDiscoveryRate.MaxPeptideFdr
    };
}
/// <summary>
/// Loads the build options from <paramref name="parameterFile"/>, initializes
/// <c>BuildResult</c> from the enabled datasets and filters the PSMs by protein FDR in
/// several passes. The required peptide count starts at the larger of the configured
/// <c>FdrPeptideCount</c> and 2 and is decreased by one per pass down to 2. One summary line
/// per pass is written to the file whose name is obtained by
/// <c>FileUtils.ChangeExtension(parameterFile, ".optimal")</c>; the pass with the highest
/// <c>TotalProteinCount</c> is kept as the final result (the earlier pass wins a tie).
/// </summary>
/// <param name="parameterFile">Path of the parameter file the options are read from.</param>
/// <returns>
/// A result holding the spectra of the selected pass (extended by retrieved peptides when
/// <c>Options.PeptideRetrieval</c> is set), the peptide FDR of its unique-peptide step and
/// the configured protein FDR. If no pass was selected, a result with an empty spectrum list.
/// </returns>
public IdentifiedSpectrumBuilderResult Build(string parameterFile)
{
    Options = new BuildSummaryOptions(parameterFile);
    Options.DatasetList.RemoveDisabled();

    IIdentifiedProteinGroupFilter conFilter = Options.Database.GetNotContaminationDescriptionFilter(this.Progress);
    var fdrCalc = Options.FalseDiscoveryRate.GetFalseDiscoveryRateCalculator();

    BuildResult = new DatasetList();

    // Initialize from the configuration.
    BuildResult.InitFromOptions(Options.DatasetList, this.Progress, parameterFile);

    // NOTE(review): the returned count is not used anywhere in this method. The call is
    // retained because its side effects (if any) are not visible here - confirm and remove.
    BuildResult.GetOptimalSpectrumCount();

    string optimalResultFile = FileUtils.ChangeExtension(parameterFile, ".optimal");

    using (var sw = new StreamWriter(optimalResultFile))
    {
        new OptimalFileTextWriter().WriteToStream(sw, BuildResult);

        UniformProteinFdrOptimalResultCalculator proteinCalc = new UniformProteinFdrOptimalResultCalculator(fdrCalc, Options.GetDecoyGroupFilter())
        {
            Progress = this.Progress
        };

        sw.WriteLine(OptimalFilteredItem.GetHeader());

        var uniqueFilter = new IdentifiedProteinGroupUniquePeptideCountFilter(2);

        OptimalFilteredItem finalItem = null;

        // Snapshot of all spectra taken before any filtering; only needed for peptide retrieval.
        List<IIdentifiedSpectrum> allSpectrum = Options.PeptideRetrieval ? BuildResult.GetSpectra() : null;

        // The loop below always runs at least the peptide-count == 2 pass.
        int fdrPeptideCount = Math.Max(Options.FalseDiscoveryRate.FdrPeptideCount, 2);

        double firstStepFdr = Options.FalseDiscoveryRate.MaxPeptideFdr;

        for (int curPeptideCount = fdrPeptideCount; curPeptideCount >= 2; curPeptideCount--)
        {
            // Rebuild the spectrum bins from the retained spectra (every pass except the first).
            if (curPeptideCount != fdrPeptideCount)
            {
                BuildResult.BuildSpectrumBin();
            }

            var curItem = new OptimalFilteredItem();

            IIdentifiedProteinGroupFilter groupFilter;

            bool bNeedFirstStep = curPeptideCount > 2;
            if (bNeedFirstStep)
            {
                Progress.SetMessage("Filtering PSMs by protein fdr {0}, unique peptide count >= 2 and peptide count >= {1} using peptide fdr {2}...", Options.FalseDiscoveryRate.FdrValue, curPeptideCount, firstStepFdr);

                // Step 1: filter by unique peptide count and peptide count to obtain the
                // peptide FDR that corresponds to the required protein FDR.
                var countFilter = new IdentifiedProteinGroupPeptideCountFilter(curPeptideCount);

                if (conFilter != null)
                {
                    groupFilter = new AndIdentifiedProteinGroupFilter(new IIdentifiedProteinGroupFilter[] { conFilter, uniqueFilter, countFilter });
                }
                else
                {
                    groupFilter = new AndIdentifiedProteinGroupFilter(new IIdentifiedProteinGroupFilter[] { uniqueFilter, countFilter });
                }

                curItem.Unique2CountResult = proteinCalc.GetOptimalResultForGroupFilter(BuildResult, firstStepFdr, Options.FalseDiscoveryRate.FdrValue, groupFilter);

                // The peptide FDR found here is carried over into the following passes.
                firstStepFdr = curItem.Unique2CountResult.PeptideFdr;

                // Keep only the PSMs that are not contained in proteins that passed the filter.
                BuildResult.KeepOptimalResultInSetOnly(new HashSet<IIdentifiedSpectrum>(curItem.Unique2CountResult.RejectedSpectra));

                GC.Collect();
                GC.WaitForPendingFinalizers();
            }
            else
            {
                curItem.Unique2CountResult = new ProteinFdrFilteredItem();
            }

            Progress.SetMessage("Filtering PSMs by protein fdr {0}, unique peptide count >= 2 using peptide fdr {1}...", Options.FalseDiscoveryRate.FdrValue, firstStepFdr);

            // Step 2: filter by unique peptide count to compute the result that satisfies
            // the given protein FDR. When step 1 ran in this pass the configured maximum
            // peptide FDR is used, otherwise the peptide FDR carried over from step 1.
            double secondStepFdr = bNeedFirstStep ? Options.FalseDiscoveryRate.MaxPeptideFdr : firstStepFdr;

            if (conFilter != null)
            {
                groupFilter = new AndIdentifiedProteinGroupFilter(new IIdentifiedProteinGroupFilter[] { conFilter, uniqueFilter });
            }
            else
            {
                groupFilter = uniqueFilter;
            }

            curItem.Unique2Result = proteinCalc.GetOptimalResultForGroupFilter(BuildResult, secondStepFdr, Options.FalseDiscoveryRate.FdrValue, groupFilter);

            // Keep only the PSMs that are not contained in proteins that passed the filter.
            BuildResult.KeepOptimalResultInSetOnly(new HashSet<IIdentifiedSpectrum>(curItem.Unique2Result.RejectedSpectra));

            GC.Collect();
            GC.WaitForPendingFinalizers();

            curItem.Unique1Result = FilterOneHitWonders(conFilter, proteinCalc);

            GC.Collect();
            GC.WaitForPendingFinalizers();

            sw.WriteLine(curItem.ToString());

            // Strict comparison: on equal protein counts the earlier pass is kept.
            if (finalItem == null || finalItem.TotalProteinCount < curItem.TotalProteinCount)
            {
                finalItem = curItem;
            }

            // Drop the local reference before collecting so a pass that was not selected
            // is no longer referenced from this method.
            curItem = null;
            GC.Collect();
            GC.WaitForPendingFinalizers();

            Console.WriteLine(MyConvert.Format("Filtering PSMs by protein fdr {0}, unique peptide count >= 2 and peptide count >= {1} using peptide fdr {2}...cost {3}.", Options.FalseDiscoveryRate.FdrValue, curPeptideCount, firstStepFdr, SystemUtils.CostMemory()));
        }

        Progress.SetMessage("Filtering PSMs by protein fdr {0} finished, free memory...", Options.FalseDiscoveryRate.FdrValue);

        if (finalItem == null)
        {
            return new IdentifiedSpectrumBuilderResult()
            {
                Spectra = new List<IIdentifiedSpectrum>()
            };
        }

        sw.WriteLine();
        sw.WriteLine("Final result : ");

        WriteScoreMap(sw, BuildResult, finalItem.Unique2CountResult);
        WriteScoreMap(sw, BuildResult, finalItem.Unique2Result);
        WriteScoreMap(sw, BuildResult, finalItem.Unique1Result);

        var finalSpectra = finalItem.GetSpectra();

        if (Options.PeptideRetrieval)
        {
            Progress.SetMessage("Retrivaling peptides passed maximum peptide FDR for proteins passed protein FDR...");

            var proteinBuilder = new IdentifiedProteinBuilder();
            var groupBuilder = new IdentifiedProteinGroupBuilder();

            List<IIdentifiedProtein> proteins = proteinBuilder.Build(finalSpectra);
            List<IIdentifiedProteinGroup> groups = groupBuilder.Build(proteins);

            // Protein name -> protein group containing that protein.
            var proteinMap = new Dictionary<string, IIdentifiedProteinGroup>();
            foreach (var g in groups)
            {
                foreach (var p in g)
                {
                    proteinMap[p.Name] = g;
                }
            }

            // Snapshot of the already accepted spectra, taken before anything is appended.
            // NOTE(review): assumes GetSpectra() yields the same spectra on every call - confirm.
            var savedSpectra = new HashSet<IIdentifiedSpectrum>(finalSpectra);

            foreach (var spectrum in allSpectrum)
            {
                if (savedSpectra.Contains(spectrum))
                {
                    continue;
                }

                var pgs = new HashSet<IIdentifiedProteinGroup>();
                foreach (var protein in spectrum.Proteins)
                {
                    IIdentifiedProteinGroup pg;
                    if (proteinMap.TryGetValue(protein, out pg))
                    {
                        pgs.Add(pg);
                    }
                }

                // If the spectrum doesn't map to a protein that passed the FDR filter, ignore it.
                // If the spectrum maps to multiple groups, ignore it as well.
                if (pgs.Count != 1)
                {
                    continue;
                }

                // The spectrum should map to all proteins in the group.
                if (pgs.First().All(l => spectrum.Proteins.Contains(l.Name)))
                {
                    finalSpectra.Add(spectrum);
                }
            }
        }

        BuildResult.ClearSpectra();
        GC.Collect();
        GC.WaitForPendingFinalizers();

        return new IdentifiedSpectrumBuilderResult()
        {
            Spectra = finalSpectra,
            PeptideFDR = finalItem.Unique2Result.PeptideFdr,
            ProteinFDR = Options.FalseDiscoveryRate.FdrValue
        };
    }
}