/// <summary>
/// Runs the protein-FDR search over <c>BuildResult</c> with the FDR level temporarily switched to
/// <c>FalseDiscoveryRateLevel.UniquePeptide</c>; the previous level is always restored on exit.
/// </summary>
/// <param name="conFilter">Optional group filter; when non-null it is combined with the other filters.</param>
/// <param name="proteinCalc">Calculator that performs the actual search.</param>
/// <returns>The item returned by <c>proteinCalc.GetOptimalResultForGroupFilter</c>.</returns>
protected ProteinFdrFilteredItem FilterOneHitWonders(IIdentifiedProteinGroupFilter conFilter, UniformProteinFdrOptimalResultCalculator proteinCalc)
{
    var savedLevel = Options.FalseDiscoveryRate.FdrLevel;
    try
    {
        var activeFilters = new List<IIdentifiedProteinGroupFilter>();

        if (conFilter != null)
        {
            activeFilters.Add(conFilter);
        }

        // A peptide-count restriction is only added when it is enabled and asks for more than one peptide.
        bool restrictPeptideCount = Options.FalseDiscoveryRate.FilterOneHitWonder
                                    && Options.FalseDiscoveryRate.MinOneHitWonderPeptideCount > 1;
        if (restrictPeptideCount)
        {
            activeFilters.Add(new IdentifiedProteinGroupPeptideCountFilter(Options.FalseDiscoveryRate.MinOneHitWonderPeptideCount));
        }

        var combinedFilter = new AndIdentifiedProteinGroupFilter(activeFilters);

        Progress.SetMessage("Filtering PSMs by protein fdr {0} using unique peptide fdr {0} ...", Options.FalseDiscoveryRate.FdrValue);

        // The same FDR value serves both as the starting peptide FDR and as the protein FDR ceiling.
        Options.FalseDiscoveryRate.FdrLevel = FalseDiscoveryRateLevel.UniquePeptide;
        return proteinCalc.GetOptimalResultForGroupFilter(
            BuildResult,
            Options.FalseDiscoveryRate.FdrValue,
            Options.FalseDiscoveryRate.FdrValue,
            combinedFilter);
    }
    finally
    {
        Options.FalseDiscoveryRate.FdrLevel = savedLevel;
    }
}
/// <summary>
/// Creates a calculator that uses the given FDR calculator and decoy filter, with fresh protein and
/// protein-group builders.
/// </summary>
/// <param name="fdrCalc">FDR calculator stored for later use.</param>
/// <param name="decoyFilter">Group filter stored for later use; presumably it accepts decoy groups (confirm against its implementation).</param>
public UniformProteinFdrOptimalResultCalculator(IFalseDiscoveryRateCalculator fdrCalc, IIdentifiedProteinGroupFilter decoyFilter)
{
    // Builders owned by this instance.
    proteinBuilder = new IdentifiedProteinBuilder();
    groupBuilder = new IdentifiedProteinGroupBuilder();

    // Collaborators supplied by the caller.
    this.decoyFilter = decoyFilter;
    this.fdrCalc = fdrCalc;
}
/// <summary>
/// Returns the contamination group filter, creating and caching it on first use.
/// </summary>
/// <param name="progress">Progress callback forwarded to the access-number lookup when the filter has to be built.</param>
/// <returns>
/// The cached filter, or <c>null</c> when <c>HasContaminationDescriptionFilter()</c> is false.
/// </returns>
public IIdentifiedProteinGroupFilter GetContaminationDescriptionFilter(IProgressCallback progress)
{
    if (!HasContaminationDescriptionFilter())
    {
        return null;
    }

    if (_contaminationGroupFilter != null)
    {
        return _contaminationGroupFilter;
    }

    // First request: build the access-number map once and keep the resulting filter.
    var parser = GetAccessNumberParser();
    var contaminationMap = IdentifiedResultUtils.GetContaminationAccessNumbers(parser, Location, ContaminationDescriptionPattern, progress);
    _contaminationGroupFilter = new IdentifiedProteinGroupContaminationMapFilter(parser, contaminationMap);

    return _contaminationGroupFilter;
}
/// <summary>
/// Splits <paramref name="groups"/> into decoy and target groups and computes the FDR from the two counts.
/// </summary>
/// <param name="groups">Protein groups to classify.</param>
/// <param name="decoyFilter">A group accepted by this filter is counted as decoy; every other group is counted as target.</param>
/// <param name="calc">Calculator invoked as <c>Calculate(decoyCount, targetCount)</c>.</param>
/// <param name="targetCount">Receives the number of groups not accepted by <paramref name="decoyFilter"/>.</param>
/// <returns>The value returned by <paramref name="calc"/>.</returns>
private static double CalculateProteinFdr(List<IIdentifiedProteinGroup> groups, IIdentifiedProteinGroupFilter decoyFilter, IFalseDiscoveryRateCalculator calc, out int targetCount)
{
    targetCount = 0;
    int decoys = 0;

    for (int i = 0; i < groups.Count; i++)
    {
        if (!decoyFilter.Accept(groups[i]))
        {
            targetCount++;
            continue;
        }

        decoys++;
    }

    return calc.Calculate(decoys, targetCount);
}
/// <summary>
/// Builds an <see cref="IdentificationSummary"/> for a protein result file: spectrum and peptide counts/FDRs
/// are read from the sibling ".peptides" file, protein-group counts/FDRs from <paramref name="proteinFile"/>.
/// </summary>
/// <param name="proteinFile">
/// Path of the protein result file. The ".param" and ".peptides" companions are located by swapping its
/// extension through <c>FileUtils.ChangeExtension</c>.
/// </param>
/// <param name="defaultDecoyPattern">
/// Regex pattern used to separate target from decoy spectra, and to build the decoy group filter when the
/// parameter file does not provide one.
/// </param>
/// <param name="defaultCalc">FDR calculator used when the parameter file does not provide one.</param>
/// <returns>
/// The populated summary. Fields belonging to a file that does not exist are left at whatever value
/// <c>new IdentificationSummary()</c> gave them.
/// </returns>
public static IdentificationSummary Parse(string proteinFile, string defaultDecoyPattern, IFalseDiscoveryRateCalculator defaultCalc)
{
    IdentificationSummary result = new IdentificationSummary();
    result.FileName = FileUtils.ChangeExtension(new FileInfo(proteinFile).Name, "");

    // NOTE(review): spectra are always classified with the default pattern, even when the parameter file
    // supplies its own decoy group filter below - confirm this asymmetry is intended.
    Regex decoyReg = new Regex(defaultDecoyPattern);

    IIdentifiedProteinGroupFilter decoyFilter = null;
    IFalseDiscoveryRateCalculator curCalc = null;

    // Prefer the decoy filter and calculator recorded in the parameter file, if FDR filtering was enabled.
    var paramFile = FileUtils.ChangeExtension(proteinFile, ".param");
    if (File.Exists(paramFile))
    {
        BuildSummaryOptions options = BuildSummaryOptionsUtils.LoadFromFile(paramFile);
        if (options.FalseDiscoveryRate.FilterByFdr)
        {
            decoyFilter = options.GetDecoyGroupFilter();
            curCalc = options.FalseDiscoveryRate.GetFalseDiscoveryRateCalculator();
        }
    }

    // Fall back to the caller-supplied defaults (filter and calculator are replaced together).
    if (decoyFilter == null)
    {
        decoyFilter = new IdentifiedProteinGroupNameRegexFilter(defaultDecoyPattern, false);
        curCalc = defaultCalc;
    }

    var peptideFile = FileUtils.ChangeExtension(proteinFile, ".peptides");
    if (File.Exists(peptideFile))
    {
        var peptides = new MascotPeptideTextFormat().ReadFromFile(peptideFile);

        // "Full" and "semi" sets are selected by GetSpectraByNPT with 2 and 1 respectively
        // (presumably the number of proteolytic termini - confirm against GetSpectraByNPT).
        var fullSpectra = GetSpectraByNPT(peptides, 2);
        var fullTargetSpectra = GetTargetSpectra(decoyReg, fullSpectra);
        var semiSpectra = GetSpectraByNPT(peptides, 1);
        var semiTargetSpectra = GetTargetSpectra(decoyReg, semiSpectra);

        result.FullSpectrumCount = GetSpectrumCount(fullSpectra);
        result.FullTargetSpectrumCount = GetSpectrumCount(fullTargetSpectra);
        result.SemiSpectrumCount = GetSpectrumCount(semiSpectra);
        result.SemiTargetSpectrumCount = GetSpectrumCount(semiTargetSpectra);

        result.FullPeptideCount = IdentifiedSpectrumUtils.GetUniquePeptideCount(fullSpectra);
        result.FullTargetPeptideCount = IdentifiedSpectrumUtils.GetUniquePeptideCount(fullTargetSpectra);
        result.SemiPeptideCount = IdentifiedSpectrumUtils.GetUniquePeptideCount(semiSpectra);
        result.SemiTargetPeptideCount = IdentifiedSpectrumUtils.GetUniquePeptideCount(semiTargetSpectra);

        // Decoy count is total minus target; the calculator receives (decoy, target).
        result.FullSpectrumFdr = curCalc.Calculate(result.FullSpectrumCount - result.FullTargetSpectrumCount, result.FullTargetSpectrumCount);
        result.SemiSpectrumFdr = curCalc.Calculate(result.SemiSpectrumCount - result.SemiTargetSpectrumCount, result.SemiTargetSpectrumCount);
        result.FullPeptideFdr = curCalc.Calculate(result.FullPeptideCount - result.FullTargetPeptideCount, result.FullTargetPeptideCount);
        result.SemiPeptideFdr = curCalc.Calculate(result.SemiPeptideCount - result.SemiTargetPeptideCount, result.SemiTargetPeptideCount);
    }

    if (File.Exists(proteinFile))
    {
        var ir = new MascotResultTextFormat().ReadFromFile(proteinFile);
        ir.InitUniquePeptideCount();

        // Groups are classified by the unique peptide count of their first protein.
        var u2proteins = (from p in ir where p[0].UniquePeptideCount > 1 select p).ToList();
        var u1proteins = (from p in ir where p[0].UniquePeptideCount == 1 select p).ToList();

        result.ProteinGroupCount = ir.Count;
        result.Unique2ProteinGroupCount = u2proteins.Count;

        // Use the same calculator as the spectrum/peptide FDRs above. Previously defaultCalc was passed
        // here, so a calculator configured in the parameter file was ignored for protein-level FDR.
        int targetCount;
        result.Unique2ProteinFdr = CalculateProteinFdr(u2proteins, decoyFilter, curCalc, out targetCount);
        result.Unique2ProteinGroupTargetCount = targetCount;

        result.Unique1ProteinFdr = CalculateProteinFdr(u1proteins, decoyFilter, curCalc, out targetCount);
        result.Unique1ProteinGroupTargetCount = targetCount;
    }

    return result;
}
/// <summary>
/// Creates the filter around <paramref name="filter"/>.
/// </summary>
/// <param name="filter">
/// Filter stored in the <c>filter</c> field. Going by the class name its verdict is presumably inverted
/// elsewhere in the class; that logic is not part of this constructor.
/// </param>
public IdentifiedProteinGroupNotFilter(IIdentifiedProteinGroupFilter filter)
{
    this.filter = filter;
}
/// <summary>
/// Starting at <paramref name="initFdr"/>, repeatedly filters the dataset by peptide FDR, builds protein
/// groups from the surviving spectra and lowers the peptide FDR until the protein FDR of the groups that
/// pass <paramref name="groupFilter"/> is no greater than <paramref name="maxProteinFdr"/>.
/// </summary>
/// <param name="dsList">Dataset list that is filtered at each candidate peptide FDR.</param>
/// <param name="initFdr">Peptide FDR tried first.</param>
/// <param name="maxProteinFdr">Largest protein FDR that is accepted.</param>
/// <param name="groupFilter">Optional group filter; <c>null</c> keeps every group.</param>
/// <returns>
/// The accepted condition, FDR values and protein count, the spectra of the first protein of every accepted
/// group, and the remaining spectra that are neither accepted nor assigned to an accepted protein name.
/// </returns>
/// <exception cref="UserTerminatedException">Thrown when the progress callback reports a pending cancellation.</exception>
public ProteinFdrFilteredItem GetOptimalResultForGroupFilter(DatasetList dsList, double initFdr, double maxProteinFdr, IIdentifiedProteinGroupFilter groupFilter)
{
    var result = new ProteinFdrFilteredItem();
    result.PeptideBeforeFdr = dsList.GetOptimalSpectrumCount();

    double curFdr = initFdr;
    for (;;)
    {
        if (Progress.IsCancellationPending())
        {
            throw new UserTerminatedException();
        }

        string condition;
        if (groupFilter != null)
        {
            condition = "[" + groupFilter.FilterCondition + "]";
        }
        else
        {
            condition = "[]";
        }

        string task = MyConvert.Format("Filtering {0} protein ... PeptideFdr={1:0.0000}", condition, curFdr);
        Progress.SetMessage(task);

        GC.Collect();
        GC.WaitForPendingFinalizers();

        var candidateSpectra = dsList.FilterByFdr(curFdr).Spectra;
        candidateSpectra.TrimExcess();

        GC.Collect();
        GC.WaitForPendingFinalizers();

        Progress.SetMessage(task + ", building protein list from peptides...");
        List<IIdentifiedProtein> proteins = proteinBuilder.Build(candidateSpectra);

        Progress.SetMessage(task + ", building protein group from protein list...");
        List<IIdentifiedProteinGroup> groups = groupBuilder.Build(proteins);

        List<IIdentifiedProteinGroup> keptGroups;
        if (groupFilter != null)
        {
            keptGroups = groups.FindAll(g => groupFilter.Accept(g));
        }
        else
        {
            keptGroups = groups;
        }

        Progress.SetMessage(task + ", calculating protein fdr...");
        double proteinFdr = CalculateProteinGroupFdr(keptGroups);

        // Written as a negated "<=" on purpose: a NaN protein FDR must take the step-down path,
        // exactly as it would with "if (proteinFdr <= maxProteinFdr) ... else ...".
        if (!(proteinFdr <= maxProteinFdr))
        {
            curFdr -= CalculateStepFdr(curFdr);
            continue;
        }

        result.ProteinCondition = condition;
        result.PeptideFdr = curFdr;
        result.ProteinFdr = proteinFdr;
        result.ProteinCount = keptGroups.Count;

        Progress.SetMessage(task + ", accepted, processing corresponding PSMs...");

        // Accepted spectra are taken from the first protein of every accepted group.
        foreach (IIdentifiedProteinGroup accepted in keptGroups)
        {
            result.AcceptedSpectra.UnionWith(accepted[0].GetSpectra());
        }
        result.AcceptedSpectra.TrimExcess();

        // Drop the spectra that already belong to an accepted group.
        candidateSpectra.RemoveAll(m => result.AcceptedSpectra.Contains(m));

        // Drop the spectra that map to any protein of an accepted group
        // (original note: "but did not pass the initial peptide filter").
        var acceptedProteinNames = new HashSet<string>();
        foreach (var acceptedGroup in keptGroups)
        {
            foreach (var protein in acceptedGroup)
            {
                acceptedProteinNames.Add(protein.Name);
            }
        }

        candidateSpectra.RemoveAll(m =>
        {
            foreach (var pep in m.Peptides)
            {
                foreach (var proteinName in pep.Proteins)
                {
                    if (acceptedProteinNames.Contains(proteinName))
                    {
                        return true;
                    }
                }
            }

            return false;
        });
        candidateSpectra.TrimExcess();

        result.RejectedSpectra = candidateSpectra;

        // NOTE(review): the groups built from the rejected spectra are not used by this method.
        // Confirm that neither Build call has a required side effect before removing this.
        List<IIdentifiedProtein> rejectProteins = proteinBuilder.Build(candidateSpectra);
        groupBuilder.Build(rejectProteins);

        GC.Collect();
        GC.WaitForPendingFinalizers();

        Progress.SetMessage(task + " finished.");

        return result;
    }
}
/// <summary>
/// Loads the options from <paramref name="parameterFile"/>, initialises <c>BuildResult</c> from them and,
/// for each peptide count from max(FdrPeptideCount, 2) down to 2, runs up to three protein-FDR searches
/// (unique-peptide count plus peptide count, unique-peptide count only, then <c>FilterOneHitWonders</c>).
/// The pass with the largest <c>TotalProteinCount</c> supplies the returned spectra. A report is written
/// to the ".optimal" file derived from <paramref name="parameterFile"/>.
/// </summary>
/// <param name="parameterFile">Path of the parameter file.</param>
/// <returns>
/// The spectra of the best pass together with its peptide FDR and the configured protein FDR, or a result
/// with an empty spectrum list when no pass produced an item.
/// </returns>
public IdentifiedSpectrumBuilderResult Build(string parameterFile)
{
    Options = new BuildSummaryOptions(parameterFile);
    Options.DatasetList.RemoveDisabled();

    IIdentifiedProteinGroupFilter conFilter = Options.Database.GetNotContaminationDescriptionFilter(this.Progress);
    var fdrCalc = Options.FalseDiscoveryRate.GetFalseDiscoveryRateCalculator();

    BuildResult = new DatasetList();

    // Initialise from the configuration.
    BuildResult.InitFromOptions(Options.DatasetList, this.Progress, parameterFile);

    // NOTE(review): the value is not used below; the call is kept in case it has side effects.
    var totalCount = BuildResult.GetOptimalSpectrumCount();

    string optimalResultFile = FileUtils.ChangeExtension(parameterFile, ".optimal");
    using (var sw = new StreamWriter(optimalResultFile))
    {
        new OptimalFileTextWriter().WriteToStream(sw, BuildResult);

        var proteinCalc = new UniformProteinFdrOptimalResultCalculator(fdrCalc, Options.GetDecoyGroupFilter());
        proteinCalc.Progress = this.Progress;

        sw.WriteLine(OptimalFilteredItem.GetHeader());

        var uniqueFilter = new IdentifiedProteinGroupUniquePeptideCountFilter(2);
        OptimalFilteredItem finalItem = null;

        // Only needed for the peptide retrieval stage at the end.
        List<IIdentifiedSpectrum> allSpectrum = Options.PeptideRetrieval ? BuildResult.GetSpectra() : null;

        int fdrPeptideCount = Math.Max(Options.FalseDiscoveryRate.FdrPeptideCount, 2);
        double firstStepFdr = Options.FalseDiscoveryRate.MaxPeptideFdr;

        for (int curPeptideCount = fdrPeptideCount; curPeptideCount >= 2; curPeptideCount--)
        {
            // Every pass after the first rebuilds the spectrum bins from the spectra that were kept.
            if (curPeptideCount != fdrPeptideCount)
            {
                BuildResult.BuildSpectrumBin();
            }

            var curItem = new OptimalFilteredItem();
            IIdentifiedProteinGroupFilter groupFilter;

            bool bNeedFirstStep = curPeptideCount > 2;
            if (!bNeedFirstStep)
            {
                curItem.Unique2CountResult = new ProteinFdrFilteredItem();
            }
            else
            {
                Progress.SetMessage("Filtering PSMs by protein fdr {0}, unique peptide count >= 2 and peptide count >= {1} using peptide fdr {2}...", Options.FalseDiscoveryRate.FdrValue, curPeptideCount, firstStepFdr);

                // Step one: filter by unique peptide count and peptide count to find the peptide FDR
                // that satisfies the required protein FDR.
                var countFilter = new IdentifiedProteinGroupPeptideCountFilter(curPeptideCount);
                IIdentifiedProteinGroupFilter[] firstStepParts = conFilter != null
                    ? new IIdentifiedProteinGroupFilter[] { conFilter, uniqueFilter, countFilter }
                    : new IIdentifiedProteinGroupFilter[] { uniqueFilter, countFilter };
                groupFilter = new AndIdentifiedProteinGroupFilter(firstStepParts);

                curItem.Unique2CountResult = proteinCalc.GetOptimalResultForGroupFilter(BuildResult, firstStepFdr, Options.FalseDiscoveryRate.FdrValue, groupFilter);
                firstStepFdr = curItem.Unique2CountResult.PeptideFdr;

                // Keep only the PSMs not covered by the proteins that passed the filter.
                BuildResult.KeepOptimalResultInSetOnly(new HashSet<IIdentifiedSpectrum>(curItem.Unique2CountResult.RejectedSpectra));

                GC.Collect();
                GC.WaitForPendingFinalizers();
            }

            Progress.SetMessage("Filtering PSMs by protein fdr {0}, unique peptide count >= 2 using peptide fdr {1}...", Options.FalseDiscoveryRate.FdrValue, firstStepFdr);

            // Step two: filter by unique peptide count only and compute the result that satisfies the
            // requested protein FDR. After a first step the search restarts from the maximum peptide FDR.
            double secondStepFdr = firstStepFdr;
            if (bNeedFirstStep)
            {
                secondStepFdr = Options.FalseDiscoveryRate.MaxPeptideFdr;
            }

            if (conFilter == null)
            {
                groupFilter = uniqueFilter;
            }
            else
            {
                groupFilter = new AndIdentifiedProteinGroupFilter(new IIdentifiedProteinGroupFilter[] { conFilter, uniqueFilter });
            }

            curItem.Unique2Result = proteinCalc.GetOptimalResultForGroupFilter(BuildResult, secondStepFdr, Options.FalseDiscoveryRate.FdrValue, groupFilter);

            // Keep only the PSMs not covered by the proteins that passed the filter.
            BuildResult.KeepOptimalResultInSetOnly(new HashSet<IIdentifiedSpectrum>(curItem.Unique2Result.RejectedSpectra));

            GC.Collect();
            GC.WaitForPendingFinalizers();

            curItem.Unique1Result = FilterOneHitWonders(conFilter, proteinCalc);

            GC.Collect();
            GC.WaitForPendingFinalizers();

            sw.WriteLine(curItem.ToString());

            // Remember the pass with the strictly largest total protein count.
            if (finalItem == null || finalItem.TotalProteinCount < curItem.TotalProteinCount)
            {
                finalItem = curItem;
            }

            // Drop the local reference before forcing a collection.
            curItem = null;
            GC.Collect();
            GC.WaitForPendingFinalizers();

            Console.WriteLine(MyConvert.Format("Filtering PSMs by protein fdr {0}, unique peptide count >= 2 and peptide count >= {1} using peptide fdr {2}...cost {3}.", Options.FalseDiscoveryRate.FdrValue, curPeptideCount, firstStepFdr, SystemUtils.CostMemory()));
        }

        Progress.SetMessage("Filtering PSMs by protein fdr {0} finished, free memory...", Options.FalseDiscoveryRate.FdrValue);

        if (finalItem == null)
        {
            return new IdentifiedSpectrumBuilderResult()
            {
                Spectra = new List<IIdentifiedSpectrum>()
            };
        }

        sw.WriteLine();
        sw.WriteLine("Final result : ");

        WriteScoreMap(sw, BuildResult, finalItem.Unique2CountResult);
        WriteScoreMap(sw, BuildResult, finalItem.Unique2Result);
        WriteScoreMap(sw, BuildResult, finalItem.Unique1Result);

        var finalSpectra = finalItem.GetSpectra();

        if (Options.PeptideRetrieval)
        {
            Progress.SetMessage("Retrivaling peptides passed maximum peptide FDR for proteins passed protein FDR...");

            var proteinBuilder = new IdentifiedProteinBuilder();
            var groupBuilder = new IdentifiedProteinGroupBuilder();
            List<IIdentifiedProtein> proteins = proteinBuilder.Build(finalSpectra);
            List<IIdentifiedProteinGroup> groups = groupBuilder.Build(proteins);

            // Protein name -> group containing it.
            var proteinMap = new Dictionary<string, IIdentifiedProteinGroup>();
            foreach (var g in groups)
            {
                foreach (var p in g)
                {
                    proteinMap[p.Name] = g;
                }
            }

            var savedSpectra = new HashSet<IIdentifiedSpectrum>(finalItem.GetSpectra());
            foreach (var spectrum in allSpectrum)
            {
                if (savedSpectra.Contains(spectrum))
                {
                    continue;
                }

                var pgs = new HashSet<IIdentifiedProteinGroup>();
                foreach (var protein in spectrum.Proteins)
                {
                    IIdentifiedProteinGroup pg;
                    if (proteinMap.TryGetValue(protein, out pg))
                    {
                        pgs.Add(pg);
                    }
                }

                // Ignore spectra that map to no group that passed the FDR filter, or to several groups.
                if (pgs.Count != 1)
                {
                    continue;
                }

                // The spectrum should map to all proteins in the group.
                if (pgs.First().All(l => spectrum.Proteins.Contains(l.Name)))
                {
                    finalSpectra.Add(spectrum);
                }
            }
        }

        BuildResult.ClearSpectra();

        GC.Collect();
        GC.WaitForPendingFinalizers();

        return new IdentifiedSpectrumBuilderResult()
        {
            Spectra = finalSpectra,
            PeptideFDR = finalItem.Unique2Result.PeptideFdr,
            ProteinFDR = Options.FalseDiscoveryRate.FdrValue
        };
    }
}