public static void TestComputePEPValue() { var variableModifications = new List <Modification>(); var fixedModifications = new List <Modification>(); var origDataFile = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\TaGe_SA_HeLa_04_subset_longestSeq.mzML"); MyFileManager myFileManager = new MyFileManager(true); CommonParameters CommonParameters = new CommonParameters(digestionParams: new DigestionParams()); var myMsDataFile = myFileManager.LoadFile(origDataFile, CommonParameters); var searchModes = new SinglePpmAroundZeroSearchMode(5); List <Protein> proteinList = ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\hela_snip_for_unitTest.fasta"), true, DecoyType.Reverse, false, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var dbErrors, -1); var listOfSortedms2Scans = MetaMorpheusTask.GetMs2Scans(myMsDataFile, null, CommonParameters).OrderBy(b => b.PrecursorMass).ToArray(); PeptideSpectralMatch[] allPsmsArray = new PeptideSpectralMatch[listOfSortedms2Scans.Length]; new ClassicSearchEngine(allPsmsArray, listOfSortedms2Scans, variableModifications, fixedModifications, null, proteinList, searchModes, CommonParameters, new List <string>()).Run(); FdrAnalysisResults fdrResultsClassicDelta = (FdrAnalysisResults)(new FdrAnalysisEngine(allPsmsArray.Where(p => p != null).ToList(), 1, CommonParameters, new List <string>()).Run()); var nonNullPsms = allPsmsArray.Where(p => p != null).ToList(); var nonNullPsmsOriginalCopy = allPsmsArray.Where(p => p != null).ToList(); var accessionCounts = PEP_Analysis.GetAccessionCounts(nonNullPsms); var maxScore = nonNullPsms.Select(n => n.Score).Max(); var maxScorePsm = nonNullPsms.Where(n => n.Score == maxScore).First(); Dictionary <string, int> sequenceToPsmCount = new Dictionary <string, int>(); List <string> sequences = new List <string>(); foreach (PeptideSpectralMatch psm in nonNullPsms) { var ss = psm.BestMatchingPeptides.Select(b => b.Peptide.FullSequence).ToList(); sequences.Add(String.Join("|", ss)); } var s = sequences.GroupBy(i => i); foreach (var grp in s) { sequenceToPsmCount.Add(grp.Key, grp.Count()); } var maxPsmData = PEP_Analysis.CreateOnePsmDataFromPsm(maxScorePsm, accessionCounts, sequenceToPsmCount); Assert.That(maxScorePsm.PeptidesToMatchingFragments.Count, Is.EqualTo(maxPsmData.Ambiguity)); Assert.That(maxScorePsm.DeltaScore, Is.EqualTo(maxPsmData.DeltaScore).Within(0.05)); Assert.That((float)(maxScorePsm.Score - (int)maxScorePsm.Score), Is.EqualTo(maxPsmData.Intensity).Within(0.05)); Assert.That(maxScorePsm.BestMatchingPeptides.Select(p => p.Peptide).First().MissedCleavages, Is.EqualTo(maxPsmData.MissedCleavagesCount)); Assert.That(maxScorePsm.BestMatchingPeptides.Select(p => p.Peptide).First().AllModsOneIsNterminus.Values.Count(), Is.EqualTo(maxPsmData.ModsCount)); Assert.That(maxScorePsm.Notch ?? 0, Is.EqualTo(maxPsmData.Notch)); Assert.That(maxScorePsm.PsmCount, Is.EqualTo(maxPsmData.PsmCount)); Assert.That(maxScorePsm.ScanPrecursorCharge, Is.EqualTo(maxPsmData.ScanPrecursorCharge)); PEP_Analysis.ComputePEPValuesForAllPSMsGeneric(nonNullPsms); int trueCount = 0; foreach (var item in allPsmsArray.Where(p => p != null)) { var b = item.FdrInfo.PEP; if (b >= 0.5) { trueCount++; } } Assert.GreaterOrEqual(32, trueCount); }
public static void TestComputePEPValueTopDown() { //just making sure that topdown data goes through the pep calculator without crashing. CommonParameters CommonParameters = new CommonParameters( digestionParams: new DigestionParams(protease: "top-down"), scoreCutoff: 1, assumeOrphanPeaksAreZ1Fragments: false); var variableModifications = new List <Modification>(); var fixedModifications = new List <Modification>(); List <Protein> proteinList = ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\HeLaFakeTopDown.fasta"), true, DecoyType.Reverse, false, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var dbErrors, -1); var origDataFile = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\TaGe_SA_HeLa_04_subset_longestSeq.mzML"); MyFileManager myFileManager = new MyFileManager(true); var myMsDataFile = myFileManager.LoadFile(origDataFile, CommonParameters); var searchMode = new SinglePpmAroundZeroSearchMode(5); Tolerance DeconvolutionMassTolerance = new PpmTolerance(5); var listOfSortedms2Scans = MetaMorpheusTask.GetMs2Scans(myMsDataFile, null, new CommonParameters()).OrderBy(b => b.PrecursorMass).ToArray(); PeptideSpectralMatch[] allPsmsArray = new PeptideSpectralMatch[listOfSortedms2Scans.Length]; new ClassicSearchEngine(allPsmsArray, listOfSortedms2Scans, variableModifications, fixedModifications, null, null, null, proteinList, searchMode, CommonParameters, new List <string>()).Run(); var nonNullPsms = allPsmsArray.Where(p => p != null).ToList(); List <PeptideSpectralMatch> moreNonNullPSMs = new List <PeptideSpectralMatch>(); int reps = 3; for (int i = 0; i < reps; i++) { foreach (PeptideSpectralMatch psm in nonNullPsms) { moreNonNullPSMs.Add(psm); } } FdrAnalysisResults fdrResultsClassicDelta = (FdrAnalysisResults)(new FdrAnalysisEngine(moreNonNullPSMs.Where(p => p != null).ToList(), 1, CommonParameters, new List <string>()).Run()); var maxScore = nonNullPsms.Select(n => n.Score).Max(); PeptideSpectralMatch maxScorePsm = nonNullPsms.Where(n => n.Score == maxScore).First(); Dictionary <string, int> sequenceToPsmCount = new Dictionary <string, int>(); List <string> sequences = new List <string>(); foreach (PeptideSpectralMatch psm in nonNullPsms) { var ss = psm.BestMatchingPeptides.Select(b => b.Peptide.FullSequence).ToList(); sequences.Add(String.Join("|", ss)); } var s = sequences.GroupBy(i => i); foreach (var grp in s) { sequenceToPsmCount.Add(grp.Key, grp.Count()); } Dictionary <string, Dictionary <int, Tuple <double, double> > > fileSpecificRetTimeHI_behavior = new Dictionary <string, Dictionary <int, Tuple <double, double> > >(); Dictionary <string, Dictionary <int, Tuple <double, double> > > fileSpecificRetTemHI_behaviorModifiedPeptides = new Dictionary <string, Dictionary <int, Tuple <double, double> > >(); string[] trainingVariables = PsmData.trainingInfos["topDown"]; int chargeStateMode = 4; var(notch, pwsm) = maxScorePsm.BestMatchingPeptides.First(); var maxPsmData = PEP_Analysis.CreateOnePsmDataEntry(maxScorePsm, sequenceToPsmCount, fileSpecificRetTimeHI_behavior, fileSpecificRetTemHI_behaviorModifiedPeptides, chargeStateMode, pwsm, trainingVariables, notch, !pwsm.Protein.IsDecoy); Assert.That(maxScorePsm.PeptidesToMatchingFragments.Count - 1, Is.EqualTo(maxPsmData.Ambiguity)); Assert.That(maxScorePsm.DeltaScore, Is.EqualTo(maxPsmData.DeltaScore).Within(0.05)); Assert.That((float)(maxScorePsm.Score - (int)maxScorePsm.Score), Is.EqualTo(maxPsmData.Intensity).Within(0.05)); Assert.AreEqual(maxPsmData.HydrophobicityZScore, float.NaN); Assert.That(maxScorePsm.BestMatchingPeptides.Select(p => p.Peptide).First().MissedCleavages, Is.EqualTo(maxPsmData.MissedCleavagesCount)); Assert.That(maxScorePsm.BestMatchingPeptides.Select(p => p.Peptide).First().AllModsOneIsNterminus.Values.Count(), Is.EqualTo(maxPsmData.ModsCount)); Assert.That(maxScorePsm.Notch ?? 0, Is.EqualTo(maxPsmData.Notch)); Assert.That(maxScorePsm.PsmCount, Is.EqualTo(maxPsmData.PsmCount * reps)); Assert.That(-Math.Abs(chargeStateMode - maxScorePsm.ScanPrecursorCharge), Is.EqualTo(maxPsmData.PrecursorChargeDiffToMode)); Assert.AreEqual((float)0, maxPsmData.IsVariantPeptide); }
private void DoFalseDiscoveryRateAnalysis(FdrAnalysisResults myAnalysisResults) { // Stop if canceled if (GlobalVariables.StopLoops) { return; } // calculate FDR on a per-protease basis (targets and decoys for a specific protease) var psmsGroupedByProtease = AllPsms.GroupBy(p => p.DigestionParams.Protease); foreach (var proteasePsms in psmsGroupedByProtease) { var psms = proteasePsms.ToList(); //determine if Score or DeltaScore performs better if (UseDeltaScore) { const double qValueCutoff = 0.01; //optimize to get the most PSMs at a 1% FDR List <PeptideSpectralMatch> scoreSorted = psms.OrderByDescending(b => b.Score).ThenBy(b => b.PeptideMonisotopicMass.HasValue ? Math.Abs(b.ScanPrecursorMass - b.PeptideMonisotopicMass.Value) : double.MaxValue).GroupBy(b => new Tuple <string, int, double?>(b.FullFilePath, b.ScanNumber, b.PeptideMonisotopicMass)).Select(b => b.First()).ToList(); int ScorePSMs = GetNumPSMsAtqValueCutoff(scoreSorted, qValueCutoff); scoreSorted = psms.OrderByDescending(b => b.DeltaScore).ThenBy(b => b.PeptideMonisotopicMass.HasValue ? Math.Abs(b.ScanPrecursorMass - b.PeptideMonisotopicMass.Value) : double.MaxValue).GroupBy(b => new Tuple <string, int, double?>(b.FullFilePath, b.ScanNumber, b.PeptideMonisotopicMass)).Select(b => b.First()).ToList(); int DeltaScorePSMs = GetNumPSMsAtqValueCutoff(scoreSorted, qValueCutoff); //sort by best method myAnalysisResults.DeltaScoreImprovement = DeltaScorePSMs > ScorePSMs; psms = myAnalysisResults.DeltaScoreImprovement ? psms.OrderByDescending(b => b.DeltaScore).ThenBy(b => b.PeptideMonisotopicMass.HasValue ? Math.Abs(b.ScanPrecursorMass - b.PeptideMonisotopicMass.Value) : double.MaxValue).ToList() : psms.OrderByDescending(b => b.Score).ThenBy(b => b.PeptideMonisotopicMass.HasValue ? Math.Abs(b.ScanPrecursorMass - b.PeptideMonisotopicMass.Value) : double.MaxValue).ToList(); } else //sort by score { psms = psms.OrderByDescending(b => b.Score).ThenBy(b => b.PeptideMonisotopicMass.HasValue ? Math.Abs(b.ScanPrecursorMass - b.PeptideMonisotopicMass.Value) : double.MaxValue).ToList(); } double cumulativeTarget = 0; double cumulativeDecoy = 0; //set up arrays for local FDRs double[] cumulativeTargetPerNotch = new double[MassDiffAcceptorNumNotches + 1]; double[] cumulativeDecoyPerNotch = new double[MassDiffAcceptorNumNotches + 1]; //Assign FDR values to PSMs for (int i = 0; i < psms.Count; i++) { // Stop if canceled if (GlobalVariables.StopLoops) { break; } PeptideSpectralMatch psm = psms[i]; int notch = psm.Notch ?? MassDiffAcceptorNumNotches; if (psm.IsDecoy) { // the PSM can be ambiguous between a target and a decoy sequence // in that case, count it as the fraction of decoy hits // e.g. if the PSM matched to 1 target and 2 decoys, it counts as 2/3 decoy double decoyHits = 0; double totalHits = 0; var hits = psm.BestMatchingPeptides.GroupBy(p => p.Peptide.FullSequence); foreach (var hit in hits) { if (hit.First().Peptide.Protein.IsDecoy) { decoyHits++; } totalHits++; } cumulativeDecoy += decoyHits / totalHits; cumulativeDecoyPerNotch[notch] += decoyHits / totalHits; } else { cumulativeTarget++; cumulativeTargetPerNotch[notch]++; } double qValue = Math.Min(1, cumulativeDecoy / cumulativeTarget); double qValueNotch = Math.Min(1, cumulativeDecoyPerNotch[notch] / cumulativeTargetPerNotch[notch]); double pep = psm.FdrInfo == null ? double.NaN : psm.FdrInfo.PEP; double pepQValue = psm.FdrInfo == null ? double.NaN : psm.FdrInfo.PEP_QValue; psm.SetFdrValues(cumulativeTarget, cumulativeDecoy, qValue, cumulativeTargetPerNotch[notch], cumulativeDecoyPerNotch[notch], qValueNotch, pep, pepQValue); } // set q-value thresholds such that a lower scoring PSM can't have // a higher confidence than a higher scoring PSM //Populate min qValues double qValueThreshold = 1.0; double[] qValueNotchThreshold = new double[MassDiffAcceptorNumNotches + 1]; for (int i = 0; i < qValueNotchThreshold.Length; i++) { qValueNotchThreshold[i] = 1.0; } for (int i = psms.Count - 1; i >= 0; i--) { PeptideSpectralMatch psm = psms[i]; // threshold q-values if (psm.FdrInfo.QValue > qValueThreshold) { psm.FdrInfo.QValue = qValueThreshold; } else if (psm.FdrInfo.QValue < qValueThreshold) { qValueThreshold = psm.FdrInfo.QValue; } // threshold notch q-values int notch = psm.Notch ?? MassDiffAcceptorNumNotches; if (psm.FdrInfo.QValueNotch > qValueNotchThreshold[notch]) { psm.FdrInfo.QValueNotch = qValueNotchThreshold[notch]; } else if (psm.FdrInfo.QValueNotch < qValueNotchThreshold[notch]) { qValueNotchThreshold[notch] = psm.FdrInfo.QValueNotch; } } } if (AnalysisType == "PSM") { CountPsm(); if (AllPsms.Count > 0) { myAnalysisResults.BinarySearchTreeMetrics = PEP_Analysis.ComputePEPValuesForAllPSMsGeneric(AllPsms); Compute_PEPValue_Based_QValue(AllPsms); } } if (AnalysisType == "Peptide") { Compute_PEPValue_Based_QValue(AllPsms); } }
public static void TestComputePEPValue() { var variableModifications = new List <Modification>(); var fixedModifications = new List <Modification>(); var origDataFile = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\TaGe_SA_HeLa_04_subset_longestSeq.mzML"); MyFileManager myFileManager = new MyFileManager(true); CommonParameters CommonParameters = new CommonParameters(digestionParams: new DigestionParams()); var myMsDataFile = myFileManager.LoadFile(origDataFile, CommonParameters); var searchModes = new SinglePpmAroundZeroSearchMode(5); List <Protein> proteinList = ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\hela_snip_for_unitTest.fasta"), true, DecoyType.Reverse, false, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var dbErrors, -1); var listOfSortedms2Scans = MetaMorpheusTask.GetMs2Scans(myMsDataFile, @"TestData\TaGe_SA_HeLa_04_subset_longestSeq.mzML", CommonParameters).OrderBy(b => b.PrecursorMass).ToArray(); PeptideSpectralMatch[] allPsmsArray = new PeptideSpectralMatch[listOfSortedms2Scans.Length]; new ClassicSearchEngine(allPsmsArray, listOfSortedms2Scans, variableModifications, fixedModifications, null, null, null, proteinList, searchModes, CommonParameters, new List <string>()).Run(); FdrAnalysisResults fdrResultsClassicDelta = (FdrAnalysisResults)(new FdrAnalysisEngine(allPsmsArray.Where(p => p != null).ToList(), 1, CommonParameters, new List <string>()).Run()); var nonNullPsms = allPsmsArray.Where(p => p != null).ToList(); var nonNullPsmsOriginalCopy = allPsmsArray.Where(p => p != null).ToList(); var maxScore = nonNullPsms.Select(n => n.Score).Max(); PeptideSpectralMatch maxScorePsm = nonNullPsms.Where(n => n.Score == maxScore).First(); Dictionary <string, int> sequenceToPsmCount = new Dictionary <string, int>(); List <string> sequences = new List <string>(); foreach (PeptideSpectralMatch psm in nonNullPsms) { var ss = psm.BestMatchingPeptides.Select(b => b.Peptide.FullSequence).ToList(); sequences.Add(String.Join("|", ss)); } var s = sequences.GroupBy(i => i); foreach (var grp in s) { sequenceToPsmCount.Add(grp.Key, grp.Count()); } Dictionary <string, Dictionary <int, Tuple <double, double> > > fileSpecificRetTimeHI_behavior = new Dictionary <string, Dictionary <int, Tuple <double, double> > >(); Dictionary <string, Dictionary <int, Tuple <double, double> > > fileSpecificRetTemHI_behaviorModifiedPeptides = new Dictionary <string, Dictionary <int, Tuple <double, double> > >(); //average hydrophobicity, standard deviation hydrophobicity Tuple <double, double> at = new Tuple <double, double>(33.0, 1.0); Dictionary <int, Tuple <double, double> > HI_Time_avg_dev = new Dictionary <int, Tuple <double, double> > { { 154, at } }; fileSpecificRetTimeHI_behavior.Add(@"TestData\TaGe_SA_HeLa_04_subset_longestSeq.mzML", HI_Time_avg_dev); string[] trainingVariables = new[] { "HydrophobicityZScore", "Intensity", "ScanPrecursorCharge", "DeltaScore", "Notch", "PsmCount", "ModsCount", "MissedCleavagesCount", "Ambiguity", "LongestFragmentIonSeries", "IsVariantPeptide" }; int chargeStateMode = 4; var(notch, pwsm) = maxScorePsm.BestMatchingPeptides.First(); var maxPsmData = PEP_Analysis.CreateOnePsmDataEntry(maxScorePsm, sequenceToPsmCount, fileSpecificRetTimeHI_behavior, fileSpecificRetTemHI_behaviorModifiedPeptides, chargeStateMode, pwsm, trainingVariables, notch, !pwsm.Protein.IsDecoy); Assert.That(maxScorePsm.PeptidesToMatchingFragments.Count - 1, Is.EqualTo(maxPsmData.Ambiguity)); Assert.That(maxScorePsm.DeltaScore, Is.EqualTo(maxPsmData.DeltaScore).Within(0.05)); Assert.That((float)(maxScorePsm.Score - (int)maxScorePsm.Score), Is.EqualTo(maxPsmData.Intensity).Within(0.05)); Assert.That(maxPsmData.HydrophobicityZScore, Is.EqualTo(5.170955).Within(0.05)); Assert.That(maxScorePsm.BestMatchingPeptides.Select(p => p.Peptide).First().MissedCleavages, Is.EqualTo(maxPsmData.MissedCleavagesCount)); Assert.That(maxScorePsm.BestMatchingPeptides.Select(p => p.Peptide).First().AllModsOneIsNterminus.Values.Count(), Is.EqualTo(maxPsmData.ModsCount)); Assert.That(maxScorePsm.Notch ?? 0, Is.EqualTo(maxPsmData.Notch)); Assert.That(maxScorePsm.PsmCount, Is.EqualTo(maxPsmData.PsmCount)); Assert.That(-Math.Abs(chargeStateMode - maxScorePsm.ScanPrecursorCharge), Is.EqualTo(maxPsmData.PrecursorChargeDiffToMode)); Assert.AreEqual((float)0, maxPsmData.IsVariantPeptide); PEP_Analysis.ComputePEPValuesForAllPSMsGeneric(nonNullPsms); int trueCount = 0; foreach (var item in allPsmsArray.Where(p => p != null)) { var b = item.FdrInfo.PEP; if (b >= 0.5) { trueCount++; } } List <PeptideSpectralMatch> moreNonNullPSMs = new List <PeptideSpectralMatch>(); for (int i = 0; i < 3; i++) { foreach (PeptideSpectralMatch psm in nonNullPsms) { moreNonNullPSMs.Add(psm); } } string expectedMetrics = "************************************************************\r\n* Metrics for Determination of PEP Using Binary Classification \r\n" + "*-----------------------------------------------------------\r\n* Accuracy: 1\r\n* Area Under Curve: 1\r\n* Area under Precision recall Curve: 1\r\n* F1Score: 1\r\n" + "* LogLoss: 2.60551851621861E-10\r\n* LogLossReduction: 0.999999999599165\r\n* PositivePrecision: 1\r\n* PositiveRecall: 1\r\n* NegativePrecision: 1\r\n" + "* NegativeRecall: 1\r\n* Count of Ambiguous Peptides Removed: 0\r\n************************************************************\r\n"; string metrics = PEP_Analysis.ComputePEPValuesForAllPSMsGeneric(moreNonNullPSMs); Assert.AreEqual(expectedMetrics, metrics); Assert.GreaterOrEqual(32, trueCount); //Test Variant Peptide as Input is identified as such as part of PEP calculation input much of the next several lines simply necessry to create a psm. var anMzSpectrum = new MzSpectrum(new double[] { 1, 1 }, new double[] { 2, 2 }, true); Ms2ScanWithSpecificMass scan = new Ms2ScanWithSpecificMass(new MsDataScan(anMzSpectrum, 1, 1, true, Polarity.Negative, 2, null, "", MZAnalyzerType.Orbitrap, 2, null, null, null), 1, 1, "path", new CommonParameters()); Protein variantProtein = new Protein("MPEPPPTIDE", "protein3", sequenceVariations: new List <SequenceVariation> { new SequenceVariation(4, 6, "PPP", "P", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }); PeptideWithSetModifications varPep = variantProtein.GetVariantProteins().SelectMany(p => p.Digest(CommonParameters.DigestionParams, null, null)).FirstOrDefault(); PeptideSpectralMatch variantPSM = new PeptideSpectralMatch(varPep, 0, maxScorePsm.Score, maxScorePsm.ScanIndex, scan, new DigestionParams(), null); sequenceToPsmCount = new Dictionary <string, int>(); sequences = new List <string>(); nonNullPsms.Add(variantPSM); foreach (PeptideSpectralMatch psm in nonNullPsms) { var ss = psm.BestMatchingPeptides.Select(b => b.Peptide.FullSequence).ToList(); sequences.Add(String.Join("|", ss)); } s = sequences.GroupBy(i => i); foreach (var grp in s) { sequenceToPsmCount.Add(grp.Key, grp.Count()); } var(vnotch, vpwsm) = variantPSM.BestMatchingPeptides.First(); PsmData variantPsmData = PEP_Analysis.CreateOnePsmDataEntry(variantPSM, sequenceToPsmCount, fileSpecificRetTimeHI_behavior, fileSpecificRetTemHI_behaviorModifiedPeptides, chargeStateMode, vpwsm, trainingVariables, vnotch, !maxScorePsm.IsDecoy); Assert.AreEqual((float)1, variantPsmData.IsVariantPeptide); }