/// <summary>
/// Removes the given conflicted spectra from the matching optimal-result bins,
/// decrements the target/decoy counts of each affected bin and recalculates
/// that bin's false discovery rate.
/// </summary>
/// <param name="conflicted">Spectra to remove; nothing happens when the list is empty.</param>
public void RemoveConflictSpectrum(List<IIdentifiedSpectrum> conflicted)
{
    if (conflicted.Count == 0)
    {
        return;
    }

    IFalseDiscoveryRateCalculator fdrCalculator = Options.Parent.FalseDiscoveryRate.GetFalseDiscoveryRateCalculator();

    // Index the current optimal results by condition so each conflicted bin
    // can be matched to the stored bin it has to be subtracted from.
    var resultsByCondition = OptimalResults.ToDictionary(m => m.Condition);
    var conflictedBins = Options.Parent.Classification.BuildSpectrumBin(conflicted);

    foreach (var conflictedBin in conflictedBins)
    {
        var storedBin = resultsByCondition[conflictedBin.Condition];

        foreach (var spectrum in conflictedBin.Spectra)
        {
            if (spectrum.FromDecoy)
            {
                storedBin.Result.PeptideCountFromDecoyDB--;
            }
            else
            {
                storedBin.Result.PeptideCountFromTargetDB--;
            }

            storedBin.Spectra.Remove(spectrum);
        }

        storedBin.Result.FalseDiscoveryRate = fdrCalculator.Calculate(
            storedBin.Result.PeptideCountFromDecoyDB,
            storedBin.Result.PeptideCountFromTargetDB);
    }
}
/// <summary>
/// Estimates the false discovery rate of a subset distribution. For every
/// condition of <paramref name="subset"/>, entries are joined by score with the
/// entries of this distribution; the subset's target count is summed directly,
/// and its decoy count is estimated by scaling the subset target count with the
/// decoy/target ratio of the matching entry in this distribution.
/// </summary>
/// <param name="subset">Distribution whose conditions are looked up in this distribution.</param>
/// <param name="calc">Calculator that turns (decoy, target) counts into an FDR.</param>
/// <returns>The value returned by <paramref name="calc"/> for the accumulated counts.</returns>
public double CalculateSubsetFdr(ScoreDistribution subset, IFalseDiscoveryRateCalculator calc)
{
    double targetCount = 0;
    double decoyCount = 0;

    foreach (OptimalResultCondition cond in subset.Keys)
    {
        var subsetOrs = subset[cond];
        var totalOrs = this[cond];

        // Materialise once: the query is consumed twice below and would
        // otherwise re-run the join for each enumeration.
        var counts = (from s in subsetOrs
                      join t in totalOrs on s.Score equals t.Score
                      select new
                      {
                          TargetCount = (int)s.PeptideCountFromTargetDB,
                          // A zero target count in the full distribution would make the
                          // ratio NaN/Infinity and poison the sum; such an entry
                          // contributes no estimated decoys.
                          DecoyCount = t.PeptideCountFromTargetDB == 0
                              ? 0.0
                              : (double)s.PeptideCountFromTargetDB * t.PeptideCountFromDecoyDB / t.PeptideCountFromTargetDB
                      }).ToList();

        targetCount += counts.Sum(c => c.TargetCount);
        decoyCount += counts.Sum(c => c.DecoyCount);
    }

    return calc.Calculate((int)decoyCount, (int)targetCount);
}
/// <summary>
/// Counts target and decoy spectra, counting each distinct
/// <c>Query.FileScan.LongFileName</c> only once (first occurrence wins), then
/// stores both counts and the resulting false discovery rate in <c>Result</c>.
/// </summary>
/// <param name="calc">Calculator that turns (decoy, target) counts into an FDR.</param>
public void CalculateFdr(IFalseDiscoveryRateCalculator calc)
{
    var seenScans = new HashSet<string>();
    int targetHits = 0;
    int decoyHits = 0;

    foreach (var candidate in this.Spectra)
    {
        // Add returns false when the scan name was already recorded.
        if (!seenScans.Add(candidate.Query.FileScan.LongFileName))
        {
            continue;
        }

        if (candidate.FromDecoy)
        {
            decoyHits++;
        }
        else
        {
            targetHits++;
        }
    }

    this.Result.PeptideCountFromDecoyDB = decoyHits;
    this.Result.PeptideCountFromTargetDB = targetHits;
    this.Result.FalseDiscoveryRate = calc.Calculate(decoyHits, targetHits);
}
/// <summary>
/// Creates a parser whose options record whether <paramref name="calc"/> is a
/// <c>TargetFalseDiscoveryRateCalculator</c> and which decoy pattern to use.
/// </summary>
/// <param name="calc">Calculator inspected only for its runtime type.</param>
/// <param name="decoyPattern">Pattern stored as the options' decoy pattern.</param>
public BuildSummaryResultParser(IFalseDiscoveryRateCalculator calc, string decoyPattern)
{
    bool isTargetFdr = calc is TargetFalseDiscoveryRateCalculator;

    var parserOptions = new BuildSummaryResultParserOptions();
    parserOptions.TargetFDR = isTargetFdr;
    parserOptions.DecoyPattern = decoyPattern;

    this.options = parserOptions;
}
/// <summary>
/// Recalculates the FDR of every optimal result using the calculator
/// configured in the parent options, then calls <c>CalculateToleranceScore</c>.
/// </summary>
public void CalculateCurrentFdr()
{
    var fdrCalculator = Options.Parent.FalseDiscoveryRate.GetFalseDiscoveryRateCalculator();

    foreach (var optimalResult in OptimalResults)
    {
        optimalResult.CalculateFdr(fdrCalculator);
    }

    CalculateToleranceScore();
}
/// <summary>
/// Creates the calculator with the given FDR calculator and decoy group filter,
/// and instantiates its own protein and protein-group builders.
/// </summary>
/// <param name="fdrCalc">FDR calculator stored for later use.</param>
/// <param name="decoyFilter">Protein-group filter stored for later use.</param>
public UniformProteinFdrOptimalResultCalculator(IFalseDiscoveryRateCalculator fdrCalc, IIdentifiedProteinGroupFilter decoyFilter)
{
    // Internally owned collaborators.
    this.proteinBuilder = new IdentifiedProteinBuilder();
    this.groupBuilder = new IdentifiedProteinGroupBuilder();

    // Injected collaborators.
    this.fdrCalc = fdrCalc;
    this.decoyFilter = decoyFilter;
}
/// <summary>
/// Runs the configured q-value function over <paramref name="spectra"/>, passing
/// the score function from the options and the FDR calculator from the parent options.
/// </summary>
/// <param name="spectra">Spectra handed to the q-value function.</param>
public virtual void InitializeQValue(List<IIdentifiedSpectrum> spectra)
{
    var scoring = Options.ScoreFunction;
    var fdrSettings = Options.Parent.FalseDiscoveryRate;

    CalculateQValueFunc computeQValues = fdrSettings.GetQValueFunction();
    IFalseDiscoveryRateCalculator calculator = fdrSettings.GetFalseDiscoveryRateCalculator();

    computeQValues.Invoke(spectra, scoring, calculator);
}
/// <summary>
/// Sums the target and decoy peptide counts (each truncated to <c>int</c>) over
/// every entry of every value collection and returns the FDR computed from the totals.
/// </summary>
/// <param name="calc">Calculator that turns (decoy, target) counts into an FDR.</param>
/// <returns>The value returned by <paramref name="calc"/> for the summed counts.</returns>
public double CalculateFdr(IFalseDiscoveryRateCalculator calc)
{
    int targetTotal = this.Values
        .SelectMany(entries => entries)
        .Sum(entry => (int)entry.PeptideCountFromTargetDB);

    int decoyTotal = this.Values
        .SelectMany(entries => entries)
        .Sum(entry => (int)entry.PeptideCountFromDecoyDB);

    return calc.Calculate(decoyTotal, targetTotal);
}
/// <summary>
/// Assigns decoy flags to <paramref name="result"/>, counts decoy and target
/// spectra, computes the FDR and writes the filter strings, both counts and the
/// FDR to a file named after <paramref name="parameterFile"/> with the
/// <c>.optimal</c> extension.
/// </summary>
/// <param name="parameterFile">Path whose extension is replaced to build the output file name.</param>
/// <param name="conf">Options supplying the FDR calculator, decoy filter and filter strings.</param>
/// <param name="result">Spectra to flag and count.</param>
protected void WriteFdrFile(string parameterFile, BuildSummaryOptions conf, List<IIdentifiedSpectrum> result)
{
    Progress.SetMessage("Calculating identified peptide false discovery rate ...");

    IFalseDiscoveryRateCalculator fdrCalculator = conf.FalseDiscoveryRate.GetFalseDiscoveryRateCalculator();

    // Decoy flags must be assigned before the spectra are counted.
    DecoyPeptideBuilder.AssignDecoy(result, conf.GetDecoySpectrumFilter());

    int decoys = 0;
    int targets = 0;
    foreach (IIdentifiedSpectrum spectrum in result)
    {
        if (!spectrum.FromDecoy)
        {
            targets++;
            continue;
        }

        decoys++;
    }

    double estimatedFdr = fdrCalculator.Calculate(decoys, targets);

    string outputPath = FileUtils.ChangeExtension(parameterFile, ".optimal");
    using (var writer = new StreamWriter(outputPath))
    {
        foreach (string filterLine in conf.GetFilterString())
        {
            writer.WriteLine(filterLine);
        }

        writer.WriteLine("DecoyCount\t{0}", decoys);
        writer.WriteLine("TargetCount\t{0}", targets);
        writer.WriteLine("FDR\t{0:0.######}", estimatedFdr);
    }
}
/// <summary>
/// Splits <paramref name="groups"/> into decoy groups (accepted by
/// <paramref name="decoyFilter"/>) and target groups (all others) and returns
/// the FDR computed from the two counts.
/// </summary>
/// <param name="groups">Protein groups to classify.</param>
/// <param name="decoyFilter">Filter whose <c>Accept</c> marks a group as decoy.</param>
/// <param name="calc">Calculator that turns (decoy, target) counts into an FDR.</param>
/// <param name="targetCount">Receives the number of groups not accepted by the filter.</param>
/// <returns>The value returned by <paramref name="calc"/> for the two counts.</returns>
private static double CalculateProteinFdr(List<IIdentifiedProteinGroup> groups, IIdentifiedProteinGroupFilter decoyFilter, IFalseDiscoveryRateCalculator calc, out int targetCount)
{
    targetCount = 0;
    int decoyGroups = 0;

    foreach (var proteinGroup in groups)
    {
        if (!decoyFilter.Accept(proteinGroup))
        {
            targetCount++;
            continue;
        }

        decoyGroups++;
    }

    return calc.Calculate(decoyGroups, targetCount);
}
/// <summary>
/// Builds an <c>IdentificationSummary</c> for a protein result file and its
/// sibling <c>.param</c> and <c>.peptides</c> files.
/// </summary>
/// <remarks>
/// When a <c>.param</c> file exists and its options filter by FDR, the decoy
/// group filter and FDR calculator are taken from those options; otherwise a
/// regex filter built from <paramref name="defaultDecoyPattern"/> and
/// <paramref name="defaultCalc"/> are used. Spectrum- and peptide-level numbers
/// are filled only when the <c>.peptides</c> file exists, and protein-level
/// numbers only when <paramref name="proteinFile"/> exists.
/// </remarks>
/// <param name="proteinFile">Path of the protein result file.</param>
/// <param name="defaultDecoyPattern">Regex pattern used to separate target spectra and as fallback decoy group pattern.</param>
/// <param name="defaultCalc">FDR calculator used when none is loaded from the parameter file.</param>
/// <returns>The populated summary.</returns>
public static IdentificationSummary Parse(string proteinFile, string defaultDecoyPattern, IFalseDiscoveryRateCalculator defaultCalc)
{
    IdentificationSummary result = new IdentificationSummary();
    result.FileName = FileUtils.ChangeExtension(new FileInfo(proteinFile).Name, "");

    Regex decoyReg = new Regex(defaultDecoyPattern);

    IIdentifiedProteinGroupFilter decoyFilter = null;
    IFalseDiscoveryRateCalculator curCalc = null;

    var paramFile = FileUtils.ChangeExtension(proteinFile, ".param");
    if (File.Exists(paramFile))
    {
        BuildSummaryOptions options = BuildSummaryOptionsUtils.LoadFromFile(paramFile);
        if (options.FalseDiscoveryRate.FilterByFdr)
        {
            decoyFilter = options.GetDecoyGroupFilter();
            curCalc = options.FalseDiscoveryRate.GetFalseDiscoveryRateCalculator();
        }
    }

    // Fall back to the caller-supplied defaults when the parameter file gave no filter.
    if (decoyFilter == null)
    {
        decoyFilter = new IdentifiedProteinGroupNameRegexFilter(defaultDecoyPattern, false);
        curCalc = defaultCalc;
    }

    var peptideFile = FileUtils.ChangeExtension(proteinFile, ".peptides");
    if (File.Exists(peptideFile))
    {
        var peptides = new MascotPeptideTextFormat().ReadFromFile(peptideFile);

        var fullSpectra = GetSpectraByNPT(peptides, 2);
        var fullTargetSpectra = GetTargetSpectra(decoyReg, fullSpectra);
        var semiSpectra = GetSpectraByNPT(peptides, 1);
        var semiTargetSpectra = GetTargetSpectra(decoyReg, semiSpectra);

        result.FullSpectrumCount = GetSpectrumCount(fullSpectra);
        result.FullTargetSpectrumCount = GetSpectrumCount(fullTargetSpectra);
        result.SemiSpectrumCount = GetSpectrumCount(semiSpectra);
        result.SemiTargetSpectrumCount = GetSpectrumCount(semiTargetSpectra);

        result.FullPeptideCount = IdentifiedSpectrumUtils.GetUniquePeptideCount(fullSpectra);
        result.FullTargetPeptideCount = IdentifiedSpectrumUtils.GetUniquePeptideCount(fullTargetSpectra);
        result.SemiPeptideCount = IdentifiedSpectrumUtils.GetUniquePeptideCount(semiSpectra);
        result.SemiTargetPeptideCount = IdentifiedSpectrumUtils.GetUniquePeptideCount(semiTargetSpectra);

        // Decoy count is derived as (all - target) for each category.
        result.FullSpectrumFdr = curCalc.Calculate(result.FullSpectrumCount - result.FullTargetSpectrumCount, result.FullTargetSpectrumCount);
        result.SemiSpectrumFdr = curCalc.Calculate(result.SemiSpectrumCount - result.SemiTargetSpectrumCount, result.SemiTargetSpectrumCount);
        result.FullPeptideFdr = curCalc.Calculate(result.FullPeptideCount - result.FullTargetPeptideCount, result.FullTargetPeptideCount);
        result.SemiPeptideFdr = curCalc.Calculate(result.SemiPeptideCount - result.SemiTargetPeptideCount, result.SemiTargetPeptideCount);
    }

    if (File.Exists(proteinFile))
    {
        var ir = new MascotResultTextFormat().ReadFromFile(proteinFile);
        ir.InitUniquePeptideCount();

        var u2proteins = (from p in ir where p[0].UniquePeptideCount > 1 select p).ToList();
        var u1proteins = (from p in ir where p[0].UniquePeptideCount == 1 select p).ToList();

        result.ProteinGroupCount = ir.Count;
        result.Unique2ProteinGroupCount = u2proteins.Count;

        // Use the calculator that was selected together with decoyFilter, so the
        // protein-level FDR honours the parameter file like the other levels do.
        int targetCount;
        result.Unique2ProteinFdr = CalculateProteinFdr(u2proteins, decoyFilter, curCalc, out targetCount);
        result.Unique2ProteinGroupTargetCount = targetCount;

        result.Unique1ProteinFdr = CalculateProteinFdr(u1proteins, decoyFilter, curCalc, out targetCount);
        result.Unique1ProteinGroupTargetCount = targetCount;
    }

    return result;
}
/// <summary>
/// Assigns q-values based on the number of distinct peptide sequences. Spectra
/// are sorted with <paramref name="scoreFuncs"/> and walked in that order;
/// spectra sharing a score form a group, and each group receives the FDR
/// computed from the distinct decoy and target sequences accumulated up to and
/// including that group.
/// </summary>
/// <param name="peptides">Spectra to sort in place and annotate; an empty list is left untouched.</param>
/// <param name="scoreFuncs">Provides sorting and score extraction.</param>
/// <param name="fdrCalc">Calculator that turns (decoy, target) counts into an FDR.</param>
public static void CalculateUniqueQValue(List<IIdentifiedSpectrum> peptides, IScoreFunction scoreFuncs, IFalseDiscoveryRateCalculator fdrCalc)
{
    if (peptides.Count == 0)
    {
        return;
    }

    scoreFuncs.SortSpectrum(peptides);

    var pendingGroup = new List<IIdentifiedSpectrum>();
    var targetSequences = new HashSet<string>();
    var decoySequences = new HashSet<string>();

    double groupScore = scoreFuncs.GetScore(peptides[0]);

    foreach (IIdentifiedSpectrum current in peptides)
    {
        double currentScore = scoreFuncs.GetScore(current);

        // A score change closes the previous group: give it its q-value
        // before the current spectrum is added to the running totals.
        if (currentScore != groupScore)
        {
            double groupQValue = fdrCalc.Calculate(decoySequences.Count, targetSequences.Count);
            foreach (IIdentifiedSpectrum member in pendingGroup)
            {
                member.QValue = groupQValue;
            }

            pendingGroup.Clear();
            groupScore = currentScore;
        }

        pendingGroup.Add(current);

        HashSet<string> sequences = current.FromDecoy ? decoySequences : targetSequences;
        sequences.Add(current.Peptide.PureSequence);
    }

    // Flush the final group.
    double finalQValue = fdrCalc.Calculate(decoySequences.Count, targetSequences.Count);
    foreach (IIdentifiedSpectrum member in pendingGroup)
    {
        member.QValue = finalQValue;
    }
}
/// <summary>
/// Calculates the QValue of each identified spectrum in the list, using the given
/// score sorting function and FDR calculator.
/// </summary>
/// <remarks>
/// Target/decoy totals count each distinct <c>Query.FileScan.LongFileName</c>
/// once (first occurrence in sorted order). The list is then walked from the
/// last element to the first; whenever the score changes, the q-value is
/// recomputed from the totals still remaining, and the walk stops as soon as a
/// recomputed q-value is exactly 0 (earlier spectra keep the initial 0).
/// </remarks>
/// <param name="peptides">Spectrum list; sorted in place. An empty list is left untouched.</param>
/// <param name="scoreFuncs">Object providing score extraction and sorting.</param>
/// <param name="fdrCalc">FDR calculator.</param>
public static void CalculateQValue(List<IIdentifiedSpectrum> peptides, IScoreFunction scoreFuncs, IFalseDiscoveryRateCalculator fdrCalc)
{
    if (peptides.Count == 0)
    {
        return;
    }

    scoreFuncs.SortSpectrum(peptides);

    int totalTarget = 0;
    int totalDecoy = 0;

    // counted[i] records whether peptides[i] contributed to the totals, so the
    // backward pass removes exactly what was added. Decrementing for every
    // spectrum would let the totals drift (even below zero) when several
    // spectra share the same scan.
    bool[] counted = new bool[peptides.Count];
    HashSet<string> filenames = new HashSet<string>();
    for (int i = 0; i < peptides.Count; i++)
    {
        IIdentifiedSpectrum spectrum = peptides[i];
        spectrum.QValue = 0.0;

        if (!filenames.Add(spectrum.Query.FileScan.LongFileName))
        {
            continue;
        }

        counted[i] = true;
        if (spectrum.FromDecoy)
        {
            totalDecoy++;
        }
        else
        {
            totalTarget++;
        }
    }

    double lastScore = scoreFuncs.GetScore(peptides[peptides.Count - 1]);
    double lastQvalue = fdrCalc.Calculate(totalDecoy, totalTarget);

    for (int i = peptides.Count - 1; i >= 0; i--)
    {
        double score = scoreFuncs.GetScore(peptides[i]);
        if (score != lastScore)
        {
            lastScore = score;
            lastQvalue = fdrCalc.Calculate(totalDecoy, totalTarget);
            if (lastQvalue == 0.0)
            {
                // Everything from here to the front already holds QValue 0.
                break;
            }
        }

        peptides[i].QValue = lastQvalue;

        if (counted[i])
        {
            if (peptides[i].FromDecoy)
            {
                totalDecoy--;
            }
            else
            {
                totalTarget--;
            }
        }
    }
}
/// <summary>
/// Rebuilds this dataset list from <paramref name="dsOptions"/>: parses every
/// dataset, optionally filters each one by the maximum peptide FDR, then
/// resolves results shared between datasets and records which datasets overlap
/// across search engines.
/// </summary>
/// <param name="dsOptions">Dataset options; one <c>Dataset</c> is created per entry.</param>
/// <param name="progress">Callback that receives progress messages.</param>
/// <param name="paramFile">Path used to derive the <c>.conflicted.peps</c> output file name.</param>
/// <exception cref="Exception">
/// Thrown when FDR filtering is enabled and a dataset contains no decoy spectrum.
/// </exception>
public void InitFromOptions(DatasetListOptions dsOptions, IProgressCallback progress, string paramFile)
{
    this.Clear();

    this.conflictFunc = dsOptions.Options.GetConflictFunc();
    this.fdrCalc = dsOptions.Options.FalseDiscoveryRate.GetFalseDiscoveryRateCalculator();

    IFilter<IIdentifiedSpectrum> decoyFilter = null;
    if (dsOptions.Options.FalseDiscoveryRate.FilterByFdr)
    {
        decoyFilter = dsOptions.Options.GetDecoySpectrumFilter();
    }

    this.dsOptions = dsOptions;

    // Baseline memory/time taken after the first dataset; used for the
    // progress estimate printed for every later dataset.
    long afterFirstMemory = 0;
    DateTime afterFirstTime = DateTime.Now;
    var totalCount = dsOptions.Sum(l => l.PathNames.Count);
    var usedCount = 0;

    for (int i = 0; i < dsOptions.Count; i++)
    {
        var m = dsOptions[i];
        var builder = m.GetBuilder();
        builder.Progress = progress;

        Dataset ds = new Dataset(m);

        // First, get all spectra that passed the fixed filtering criteria.
        ds.Spectra = builder.ParseFromSearchResult();
        ds.PSMPassedFixedCriteriaCount = ds.Spectra.Count;

        if (dsOptions.Options.FalseDiscoveryRate.FilterByFdr)
        {
            // Flag each spectrum as coming from the decoy database or not.
            progress.SetMessage("Assigning decoy information...");
            DecoyPeptideBuilder.AssignDecoy(ds.Spectra, decoyFilter);
            var decoyCount = ds.Spectra.Count(l => l.FromDecoy);
            if (decoyCount == 0)
            {
                throw new Exception(string.Format("No decoy protein found at dataset {0}, make sure the protein access number parser and the decoy pattern are correctly defined!", m.Name));
            }

            progress.SetMessage("{0} decoys out of {1} hits found", decoyCount, ds.Spectra.Count);

            ds.BuildSpectrumBin();
            ds.CalculateCurrentFdr();
            ds.PushCurrentOptimalResults(string.Format("Before maximum peptide fdr {0}", dsOptions.Options.FalseDiscoveryRate.MaxPeptideFdr));

            progress.SetMessage("Filtering by maximum peptide fdr {0} ...", dsOptions.Options.FalseDiscoveryRate.MaxPeptideFdr);
            ds.FilterByFdr(dsOptions.Options.FalseDiscoveryRate.MaxPeptideFdr);
            ds.Spectra = ds.GetUnconflictedOptimalSpectra();

            ds.BuildSpectrumBin();
            ds.CalculateCurrentFdr();
            ds.PushCurrentOptimalResults(string.Format("After maximum peptide fdr {0}", dsOptions.Options.FalseDiscoveryRate.MaxPeptideFdr));
        }

        this.Add(ds);

        if (i == 0)
        {
            // Process is IDisposable; release it instead of leaving one per dataset to the finalizer.
            using (var process = Process.GetCurrentProcess())
            {
                afterFirstMemory = process.WorkingSet64 / (1024 * 1024);
            }

            afterFirstTime = DateTime.Now;
        }
        else
        {
            usedCount += m.PathNames.Count;

            long currMemory;
            using (var process = Process.GetCurrentProcess())
            {
                currMemory = process.WorkingSet64 / (1024 * 1024);
            }

            // usedCount stays 0 while the datasets after the first have no path
            // names; dividing by it would give NaN/Infinity and make AddMinutes
            // throw, so the averages fall back to 0 in that case.
            double averageCost = usedCount > 0 ? (double)(currMemory - afterFirstMemory) / usedCount : 0.0;
            double estimatedCost = afterFirstMemory + averageCost * totalCount;

            DateTime currTime = DateTime.Now;
            var averageTime = usedCount > 0 ? currTime.Subtract(afterFirstTime).TotalMinutes / usedCount : 0.0;
            var finishTime = afterFirstTime.AddMinutes(averageTime * (totalCount - dsOptions[0].PathNames.Count));

            progress.SetMessage("{0}/{1} datasets, cost {2}M, avg {3:0.0}M, need {4:0.0}M, will finish at {5:MM-dd HH:mm:ss}", (i + 1), dsOptions.Count, currMemory, averageCost, estimatedCost, finishTime);
        }
    }

    // Initialize the experimental list of every dataset.
    this.ForEach(m => m.InitExperimentals());

    if (dsOptions.Count > 1)
    {
        if (dsOptions.Options.KeepTopPeptideFromSameEngineButDifferentSearchParameters)
        {
            // Merge/remove results obtained from the same search engine with different parameters.
            ProcessDatasetFromSameEngine(progress, (peptides, score) => IdentifiedSpectrumUtils.KeepTopPeptideFromSameEngineDifferentParameters(peptides, score), false);
        }
        else
        {
            ProcessDatasetFromSameEngine(progress, (peptides, score) => IdentifiedSpectrumUtils.KeepUnconflictPeptidesFromSameEngineDifferentParameters(peptides, score), true);
        }

        // Initialize the overlap relationships between datasets searched by different search engines.
        this.OverlapBySearchEngine = FindOverlap((m1, m2) => m1.Options.SearchEngine != m2.Options.SearchEngine);

        // Initialize the datasets that have no overlap.
        var overlaps = new HashSet<Dataset>(from m in OverlapBySearchEngine from s in m select s);
        this.NoOverlaps = this.Where(m => !overlaps.Contains(m)).ToList();

        if (OverlapBySearchEngine.Count > 0 && dsOptions.Options.FalseDiscoveryRate.FilterByFdr)
        {
            // Filter by the maximum FDR.
            progress.SetMessage("Filtering PSMs by maximum fdr {0}, considering multiple engine overlap...", dsOptions.Options.FalseDiscoveryRate.MaxPeptideFdr);
            var realFdr = this.FilterByFdr(dsOptions.Options.FalseDiscoveryRate.MaxPeptideFdr);
            if (realFdr.ConflictSpectra.Count > 0)
            {
                new MascotPeptideTextFormat(UniformHeader.PEPTIDE_HEADER).WriteToFile(Path.ChangeExtension(paramFile, ".conflicted.peps"), realFdr.ConflictSpectra);
            }

            // Keep each dataset's spectra as the filtered result, for use in later iterations.
            this.ForEach(m => { m.Spectra = m.GetUnconflictedOptimalSpectra(); });
        }
    }
    else
    {
        this.NoOverlaps = new List<Dataset>(this);
        this.OverlapBySearchEngine = new List<List<Dataset>>();
    }
}
/// <summary>
/// Creates a q-value calculator that keeps the given score function and FDR calculator.
/// </summary>
/// <param name="scoreFunc">Score function stored for later use.</param>
/// <param name="fdrCalc">FDR calculator stored for later use.</param>
public QValueCalculator(IScoreFunction scoreFunc, IFalseDiscoveryRateCalculator fdrCalc)
{
    this.fdrCalc = fdrCalc;
    this.scoreFunc = scoreFunc;
}