/// <summary>
/// Loads the build options from <paramref name="parameterFile"/>, runs one protein-FDR filtering
/// call over the configured datasets and returns the spectra accepted by it.
/// </summary>
/// <remarks>
/// Replaces <c>Options</c> and <c>BuildResult</c>, and writes a report to the file whose path is
/// produced by <c>FileUtils.ChangeExtension(parameterFile, ".optimal")</c>.
/// </remarks>
/// <param name="parameterFile">
/// Parameter file passed to <c>BuildSummaryOptions</c> and to <c>BuildResult.InitFromOptions</c>.
/// </param>
/// <returns>
/// A result holding the accepted spectra plus the peptide and protein FDR reported by the calculator.
/// </returns>
public IdentifiedSpectrumBuilderResult Build(string parameterFile)
{
    Options = new BuildSummaryOptions(parameterFile);
    Options.DatasetList.RemoveDisabled();

    var fdrOptions = Options.FalseDiscoveryRate;
    var fdrCalc = fdrOptions.GetFalseDiscoveryRateCalculator();

    BuildResult = new DatasetList();

    // Initialize from configuration.
    BuildResult.InitFromOptions(Options.DatasetList, this.Progress, parameterFile);

    // NOTE(review): the returned count was never read. The call is kept only in case it has
    // side effects on the dataset list - confirm and remove if it is a pure query.
    BuildResult.GetOptimalSpectrumCount();

    string optimalResultFile = FileUtils.ChangeExtension(parameterFile, ".optimal");

    using (var sw = new StreamWriter(optimalResultFile))
    {
        // Snapshot of the datasets before filtering.
        new OptimalFileTextWriter().WriteToStream(sw, BuildResult);

        UniformProteinFdrOptimalResultCalculator proteinCalc = new UniformProteinFdrOptimalResultCalculator(fdrCalc, Options.GetDecoyGroupFilter())
        {
            Progress = this.Progress
        };

        Progress.SetMessage("Filtering PSMs by protein fdr {0}, using peptide fdr {1}...", fdrOptions.FdrValue, fdrOptions.MaxPeptideFdr);

        // A group filter is only supplied when one-hit-wonder filtering is switched on;
        // otherwise null is handed to the calculator.
        var groupFilter = fdrOptions.FilterOneHitWonder
            ? new IdentifiedProteinGroupSingleWonderPeptideCountFilter(fdrOptions.MinOneHitWonderPeptideCount)
            : null;

        var ret = proteinCalc.GetOptimalResultForGroupFilter(BuildResult, fdrOptions.MaxPeptideFdr, fdrOptions.FdrValue, groupFilter);

        // Keep only the PSMs contained in proteins that passed the filter.
        BuildResult.KeepOptimalResultInSetOnly(ret.AcceptedSpectra);

        // Explicit collection after the rejected spectra have been dropped.
        GC.Collect();
        GC.WaitForPendingFinalizers();

        sw.WriteLine("After SimpleProteinFDR filter {0} with condition {1}, required peptide fdr = {2} ", ret.ProteinFdr, ret.ProteinCondition, ret.PeptideFdr);

        BuildResult.ForEach(ds =>
        {
            sw.WriteLine("Dataset {0}", ds.Options.Name);
            OptimalResultConditionUtils.WriteSpectrumBin(sw, ds, f1, f2);
        });

        return new IdentifiedSpectrumBuilderResult()
        {
            Spectra = ret.AcceptedSpectra.ToList(),
            PeptideFDR = ret.PeptideFdr,
            ProteinFDR = ret.ProteinFdr
        };
    }
}
/// <summary>
/// Builds proteins, protein groups and an identified result from <paramref name="spectra"/>,
/// writes that result to <paramref name="noredundantFile"/> using <c>MascotResultTextFormat</c>,
/// and then writes <paramref name="iniFile"/>: one section per visible sample amino acid,
/// followed by the modification entries.
/// </summary>
/// <param name="shortModMap">
/// Modifications whose values are written, ordered by <c>Symbol</c>, after the amino-acid sections;
/// an NTERM and a CTERM entry are appended after them.
/// </param>
/// <param name="noredundantFile">Output path for the identified result.</param>
/// <param name="iniFile">Output path for the amino-acid / modification ini text.</param>
/// <param name="spectra">Spectra from which proteins and groups are built.</param>
public void BuildResult(Dictionary<string, MaxQuantModificationItem> shortModMap, string noredundantFile, string iniFile, List<IIdentifiedSpectrum> spectra)
{
    // Spectra -> proteins -> protein groups -> identified result.
    var proteinList = new IdentifiedProteinBuilder().Build(spectra);
    var proteinGroups = new IdentifiedProteinGroupBuilder().Build(proteinList);
    var identifiedResult = new IdentifiedResultBuilder(DefaultAccessNumberParser.GetInstance(), "").Build(proteinGroups);
    new MascotResultTextFormat().WriteToFile(noredundantFile, identifiedResult);

    Aminoacids sampleSet;
    Aminoacids referenceSet;
    InitializeAminoacids(out sampleSet, out referenceSet);

    using (var writer = new StreamWriter(iniFile))
    {
        // The two sets are walked in parallel by index; only visible sample entries get a section.
        for (var i = 0; i < sampleSet.Count; i++)
        {
            var sample = sampleSet[i];
            var reference = referenceSet[i];

            if (!sample.Visible)
            {
                continue;
            }

            writer.WriteLine("<{0}>\tSAM\tREF", sample.OneName);

            WriteAtom(writer, sample, reference, new[] { Atom.C }, "C");
            WriteAtom(writer, sample, reference, new[] { Atom.H }, "H");
            WriteAtom(writer, sample, reference, new[] { Atom.O }, "O");
            WriteAtom(writer, sample, reference, new[] { Atom.N }, "N");
            WriteAtom(writer, sample, reference, new[] { Atom.S }, "S");
            WriteAtom(writer, sample, reference, new[] { Atom.P }, "P");
            WriteAtom(writer, sample, reference, new[] { Atom.N15, Atom.Nx }, "15N");
            WriteAtom(writer, sample, reference, new[] { Atom.H2, Atom.Hx }, "2H");
            WriteAtom(writer, sample, reference, new[] { Atom.C13, Atom.Cx }, "13C");
            // The last row is the only one that passes the extra 'false' argument.
            WriteAtom(writer, sample, reference, new[] { Atom.O18, Atom.Ox }, "18O", false);

            writer.WriteLine();
        }

        var nTerm = new MaxQuantModificationItem()
        {
            Symbol = "NTERM",
            Composition = new AtomComposition("H")
        };
        var cTerm = new MaxQuantModificationItem()
        {
            Symbol = "CTERM",
            Composition = new AtomComposition("OH")
        };

        // Caller-supplied modifications sorted by symbol, then the two terminal entries.
        var modsToWrite = shortModMap.Values.OrderBy(m => m.Symbol).ToList();
        modsToWrite.Add(nTerm);
        modsToWrite.Add(cTerm);

        foreach (var item in modsToWrite)
        {
            item.WriteToSilacINI(writer);
        }
    }
}
/// <summary>
/// Reads the peptide fixture file, builds proteins and protein groups from it, checks their
/// counts and names, and finally builds an identified result from the groups.
/// </summary>
public void TestBuild()
{
    // Fixture file: four spectra are expected.
    List<IIdentifiedSpectrum> loadedSpectra = new SequestPeptideTextFormat().ReadFromFile(@"../../../data/TestBuilder.peptides");
    Assert.AreEqual(4, loadedSpectra.Count);

    IAccessNumberParser ipiParser = AccessNumberParserFactory.FindOrCreateParser(@"(IPI\d+)", "IPI");

    List<IIdentifiedProtein> builtProteins = new IdentifiedProteinBuilder().Build(loadedSpectra);
    Assert.AreEqual(4, builtProteins.Count);

    List<IIdentifiedProteinGroup> builtGroups = new IdentifiedProteinGroupBuilder().Build(builtProteins);
    Assert.AreEqual(2, builtGroups.Count);

    // First group: a single protein.
    IIdentifiedProteinGroup firstGroup = builtGroups[0];
    Assert.AreEqual(1, firstGroup.Count);
    Assert.AreEqual("IPI:IPI00784154.1|SW", firstGroup[0].Name);

    // Second group: two proteins, in this order.
    IIdentifiedProteinGroup secondGroup = builtGroups[1];
    Assert.AreEqual(2, secondGroup.Count);
    Assert.AreEqual("REVERSED_00000001", secondGroup[0].Name);
    Assert.AreEqual("REVERSED_00000002", secondGroup[1].Name);

    // Nothing is asserted on the built result; the call only has to complete without throwing.
    IIdentifiedResult builtResult = new IdentifiedResultBuilder(ipiParser, "").Build(builtGroups);
}
/// <summary>
/// Loads the build options from <paramref name="parameterFile"/> and filters PSMs by protein FDR
/// in several passes, one pass per minimum peptide count from max(FdrPeptideCount, 2) down to 2.
/// </summary>
/// <remarks>
/// Each pass runs up to three filtering steps (unique-peptide count plus peptide count, unique-peptide
/// count only, then <c>FilterOneHitWonders</c>); after the first two steps only the rejected spectra
/// are kept in <c>BuildResult</c> for the following step. The pass with the largest
/// <c>TotalProteinCount</c> wins, and on a tie the earlier pass is kept. When
/// <c>Options.PeptideRetrieval</c> is set, additional spectra are appended by
/// <c>RetrievePeptidesForAcceptedGroups</c>.
/// Side effects: replaces <c>Options</c> and <c>BuildResult</c>, clears the spectra of
/// <c>BuildResult</c> before returning, writes a report to the path produced by
/// <c>FileUtils.ChangeExtension(parameterFile, ".optimal")</c>, and prints one line per pass to the console.
/// </remarks>
/// <param name="parameterFile">
/// Parameter file passed to <c>BuildSummaryOptions</c> and to <c>BuildResult.InitFromOptions</c>.
/// </param>
/// <returns>
/// The spectra of the winning pass (plus any retrieved spectra), the peptide FDR of its
/// unique-peptide step, and the configured protein FDR value.
/// </returns>
public IdentifiedSpectrumBuilderResult Build(string parameterFile)
{
    Options = new BuildSummaryOptions(parameterFile);
    Options.DatasetList.RemoveDisabled();

    IIdentifiedProteinGroupFilter conFilter = Options.Database.GetNotContaminationDescriptionFilter(this.Progress);

    var fdrOptions = Options.FalseDiscoveryRate;
    var fdrCalc = fdrOptions.GetFalseDiscoveryRateCalculator();

    BuildResult = new DatasetList();

    // Initialize from configuration.
    BuildResult.InitFromOptions(Options.DatasetList, this.Progress, parameterFile);

    // NOTE(review): the returned count was never read. The call is kept only in case it has
    // side effects on the dataset list - confirm and remove if it is a pure query.
    BuildResult.GetOptimalSpectrumCount();

    string optimalResultFile = FileUtils.ChangeExtension(parameterFile, ".optimal");

    using (var sw = new StreamWriter(optimalResultFile))
    {
        new OptimalFileTextWriter().WriteToStream(sw, BuildResult);

        UniformProteinFdrOptimalResultCalculator proteinCalc = new UniformProteinFdrOptimalResultCalculator(fdrCalc, Options.GetDecoyGroupFilter())
        {
            Progress = this.Progress
        };

        sw.WriteLine(OptimalFilteredItem.GetHeader());

        var uniqueFilter = new IdentifiedProteinGroupUniquePeptideCountFilter(2);

        OptimalFilteredItem finalItem = null;

        // Taken before any filtering so that peptide retrieval can look at every spectrum later.
        List<IIdentifiedSpectrum> allSpectrum = Options.PeptideRetrieval ? BuildResult.GetSpectra() : null;

        int fdrPeptideCount = Math.Max(fdrOptions.FdrPeptideCount, 2);
        double firstStepFdr = fdrOptions.MaxPeptideFdr;

        for (int curPeptideCount = fdrPeptideCount; curPeptideCount >= 2; curPeptideCount--)
        {
            // Rebuild the SpectrumBin from the retained spectra. Skipped on the first pass,
            // where no spectra have been removed by this method yet.
            if (curPeptideCount != fdrPeptideCount)
            {
                BuildResult.BuildSpectrumBin();
            }

            var curItem = new OptimalFilteredItem();

            IIdentifiedProteinGroupFilter groupFilter;

            bool bNeedFirstStep = curPeptideCount > 2;
            if (bNeedFirstStep)
            {
                Progress.SetMessage("Filtering PSMs by protein fdr {0}, unique peptide count >= 2 and peptide count >= {1} using peptide fdr {2}...", fdrOptions.FdrValue, curPeptideCount, firstStepFdr);

                // Step 1: filter by UniquePeptideCount and PeptideCount to obtain the peptide FDR
                // that corresponds to the required protein FDR.
                var countFilter = new IdentifiedProteinGroupPeptideCountFilter(curPeptideCount);

                if (conFilter != null)
                {
                    groupFilter = new AndIdentifiedProteinGroupFilter(new IIdentifiedProteinGroupFilter[] { conFilter, uniqueFilter, countFilter });
                }
                else
                {
                    groupFilter = new AndIdentifiedProteinGroupFilter(new IIdentifiedProteinGroupFilter[] { uniqueFilter, countFilter });
                }

                curItem.Unique2CountResult = proteinCalc.GetOptimalResultForGroupFilter(BuildResult, firstStepFdr, fdrOptions.FdrValue, groupFilter);

                // The peptide FDR found here is carried into the following passes.
                firstStepFdr = curItem.Unique2CountResult.PeptideFdr;

                // Keep only the PSMs contained in proteins that did NOT pass the filter.
                BuildResult.KeepOptimalResultInSetOnly(new HashSet<IIdentifiedSpectrum>(curItem.Unique2CountResult.RejectedSpectra));

                GC.Collect();
                GC.WaitForPendingFinalizers();
            }
            else
            {
                curItem.Unique2CountResult = new ProteinFdrFilteredItem();
            }

            Progress.SetMessage("Filtering PSMs by protein fdr {0}, unique peptide count >= 2 using peptide fdr {1}...", fdrOptions.FdrValue, firstStepFdr);

            // Step 2: filter by UniquePeptideCount only and compute the result that satisfies
            // the given protein FDR.
            double secondStepFdr = bNeedFirstStep ? fdrOptions.MaxPeptideFdr : firstStepFdr;
            if (conFilter != null)
            {
                groupFilter = new AndIdentifiedProteinGroupFilter(new IIdentifiedProteinGroupFilter[] { conFilter, uniqueFilter });
            }
            else
            {
                groupFilter = uniqueFilter;
            }

            curItem.Unique2Result = proteinCalc.GetOptimalResultForGroupFilter(BuildResult, secondStepFdr, fdrOptions.FdrValue, groupFilter);

            // Keep only the PSMs contained in proteins that did NOT pass the filter.
            BuildResult.KeepOptimalResultInSetOnly(new HashSet<IIdentifiedSpectrum>(curItem.Unique2Result.RejectedSpectra));

            GC.Collect();
            GC.WaitForPendingFinalizers();

            // Step 3: one-hit wonders, delegated to FilterOneHitWonders.
            curItem.Unique1Result = FilterOneHitWonders(conFilter, proteinCalc);

            GC.Collect();
            GC.WaitForPendingFinalizers();

            sw.WriteLine(curItem.ToString());

            // Strictly-greater comparison: on a tie the earlier pass stays the winner.
            if (finalItem == null || finalItem.TotalProteinCount < curItem.TotalProteinCount)
            {
                finalItem = curItem;
            }

            curItem = null;
            GC.Collect();
            GC.WaitForPendingFinalizers();

            Console.WriteLine(MyConvert.Format("Filtering PSMs by protein fdr {0}, unique peptide count >= 2 and peptide count >= {1} using peptide fdr {2}...cost {3}.", fdrOptions.FdrValue, curPeptideCount, firstStepFdr, SystemUtils.CostMemory()));
        }

        Progress.SetMessage("Filtering PSMs by protein fdr {0} finished, free memory...", fdrOptions.FdrValue);

        if (finalItem != null)
        {
            sw.WriteLine();
            sw.WriteLine("Final result : ");

            WriteScoreMap(sw, BuildResult, finalItem.Unique2CountResult);
            WriteScoreMap(sw, BuildResult, finalItem.Unique2Result);
            WriteScoreMap(sw, BuildResult, finalItem.Unique1Result);

            var finalSpectra = finalItem.GetSpectra();

            if (Options.PeptideRetrieval)
            {
                Progress.SetMessage("Retrivaling peptides passed maximum peptide FDR for proteins passed protein FDR...");

                RetrievePeptidesForAcceptedGroups(finalSpectra, allSpectrum);
            }

            BuildResult.ClearSpectra();
            GC.Collect();
            GC.WaitForPendingFinalizers();

            return new IdentifiedSpectrumBuilderResult()
            {
                Spectra = finalSpectra,
                PeptideFDR = finalItem.Unique2Result.PeptideFdr,
                ProteinFDR = fdrOptions.FdrValue
            };
        }
        else
        {
            // The loop above always runs at least once (its start value is at least 2), so this
            // branch is only a fallback.
            return new IdentifiedSpectrumBuilderResult()
            {
                Spectra = new List<IIdentifiedSpectrum>()
            };
        }
    }
}

/// <summary>
/// Appends to <paramref name="finalSpectra"/> every spectrum of <paramref name="allSpectrum"/> that
/// is not already in it, maps to exactly one protein group built from the accepted spectra, and
/// lists every protein of that group.
/// </summary>
/// <param name="finalSpectra">Accepted spectra; protein groups are built from it and it is extended in place.</param>
/// <param name="allSpectrum">Candidate spectra to examine.</param>
private void RetrievePeptidesForAcceptedGroups(List<IIdentifiedSpectrum> finalSpectra, List<IIdentifiedSpectrum> allSpectrum)
{
    var proteinBuilder = new IdentifiedProteinBuilder();
    var groupBuilder = new IdentifiedProteinGroupBuilder();

    List<IIdentifiedProtein> proteins = proteinBuilder.Build(finalSpectra);
    List<IIdentifiedProteinGroup> groups = groupBuilder.Build(proteins);

    // Protein name -> the group that contains it.
    var groupByProteinName = new Dictionary<string, IIdentifiedProteinGroup>();
    foreach (var g in groups)
    {
        foreach (var p in g)
        {
            groupByProteinName[p.Name] = g;
        }
    }

    // Snapshot of the accepted spectra taken before anything is appended.
    var alreadyAccepted = new HashSet<IIdentifiedSpectrum>(finalSpectra);

    foreach (var spectrum in allSpectrum)
    {
        if (alreadyAccepted.Contains(spectrum))
        {
            continue;
        }

        var matchedGroups = new HashSet<IIdentifiedProteinGroup>();
        foreach (var proteinName in spectrum.Proteins)
        {
            IIdentifiedProteinGroup matched;
            if (groupByProteinName.TryGetValue(proteinName, out matched))
            {
                matchedGroups.Add(matched);
            }
        }

        // Ignore spectra that map to no accepted group, and spectra that map to several groups.
        if (matchedGroups.Count != 1)
        {
            continue;
        }

        // The spectrum must map to all proteins in the group.
        if (matchedGroups.First().All(member => spectrum.Proteins.Contains(member.Name)))
        {
            finalSpectra.Add(spectrum);
        }
    }
}