Example #1
0
        /// <summary>
        /// Extracts peptide-spectrum match (PSM) records from parsed search-engine results.
        /// </summary>
        /// <remarks>
        /// Only <see cref="SearchAlgorithm.XTandem"/> results are handled; for any other algorithm the returned
        /// collection is empty. Required attributes are read without null checks, so a "group" element that has
        /// a "protein" child but lacks one of the expected attributes/elements causes a
        /// <see cref="NullReferenceException"/>. All numbers are parsed with the invariant culture because the
        /// input is machine-written XML, not user-facing text.
        /// </remarks>
        /// <param name="results">Root element of the search results to scan for "group" elements.</param>
        /// <param name="searchAlgorithm">Search engine that produced <paramref name="results"/>.</param>
        /// <returns>The extracted PSMs, each added to the collection under its <c>Id</c>.</returns>
        public static PsmDataCollection ExtractPsmData(XElement results, SearchAlgorithm searchAlgorithm)
        {
            // Parsing with the current thread culture would break on systems whose decimal separator is ','.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            PsmDataCollection psms = new PsmDataCollection();

            if (searchAlgorithm != SearchAlgorithm.XTandem)
            {
                return psms;
            }

            // Only "group" elements that carry a "protein" child describe a PSM.
            foreach (XElement group in results.Descendants("group").Where(g => g.Element("protein") != null))
            {
                PsmData psm = new PsmData();

                psm.Id = Convert.ToInt32(group.Attribute("id").Value, invariant);

                // Ordinal comparison: the decoy prefix is an identifier, not linguistic text.
                psm.Decoy = group.Attribute("label").Value.StartsWith("DECOY_", StringComparison.Ordinal);

                // A "group" may list more than one protein, which just means the peptide isn't unique to a
                // single protein. The scoring and modifications are identical (it is the same PSM), so only
                // the first protein is used.
                XElement domain = group.Element("protein").Element("peptide").Element("domain");

                psm.Seq              = domain.Attribute("seq").Value;
                psm.Start            = Convert.ToInt32(domain.Attribute("start").Value, invariant);
                psm.End              = Convert.ToInt32(domain.Attribute("end").Value, invariant);
                psm.Hyperscore       = Convert.ToDouble(domain.Attribute("hyperscore").Value, invariant);
                psm.ExpectationValue = Convert.ToDouble(domain.Attribute("expect").Value, invariant);

                // Relative difference between the group's "mh" and the domain's "mh", scaled by 1e6.
                // A missing group-level "mh" attribute yields null, which Convert.ToDouble maps to 0.
                double groupMh  = Convert.ToDouble(group.Attribute("mh")?.Value, invariant);
                double domainMh = Convert.ToDouble(domain.Attribute("mh").Value, invariant);
                psm.MassDrift = (groupMh - domainMh) / domainMh * 1e6;

                psm.Charge = Convert.ToInt32(group.Attribute("z").Value, invariant);

                psm.MissedCleavages = GetMissedCleavages(psm.Seq);

                // Add the modifications, if there are any (Elements() returns an empty sequence when none exist).
                foreach (XElement aa in domain.Elements("aa"))
                {
                    Modification mod = new Modification();

                    // Convert the reported location to a zero-based index within the peptide.
                    mod.Loc  = Convert.ToInt32(aa.Attribute("at").Value, invariant) - psm.Start;
                    mod.AA   = aa.Attribute("type").Value;
                    mod.Mass = Convert.ToDouble(aa.Attribute("modified").Value, invariant);

                    psm.Mods.Add(mod);
                }

                psms.Add(psm.Id, psm);
            }

            return psms;
        }
Example #2
0
        // Verifies the static lookup tables on PsmData (training variables per search type and the assumed
        // direction of each attribute) and the tab-delimited text produced by PsmData.ToString(searchType).
        public static void TestPsmData()
        {
            // Training variables registered for the "standard" search type.
            string[] actualStandard   = PsmData.trainingInfos["standard"];
            string[] expectedStandard =
            {
                "TotalMatchingFragmentCount", "Intensity", "PrecursorChargeDiffToMode", "DeltaScore", "Notch", "PsmCount",
                "ModsCount", "MissedCleavagesCount", "Ambiguity", "LongestFragmentIonSeries", "HydrophobicityZScore", "IsVariantPeptide"
            };
            Assert.AreEqual(expectedStandard, actualStandard);

            // Training variables registered for the "topDown" search type.
            string[] actualTopDown   = PsmData.trainingInfos["topDown"];
            string[] expectedTopDown =
            {
                "TotalMatchingFragmentCount", "Intensity", "PrecursorChargeDiffToMode", "DeltaScore", "Notch", "PsmCount",
                "ModsCount", "Ambiguity", "LongestFragmentIonSeries"
            };
            Assert.AreEqual(expectedTopDown, actualTopDown);

            // Attributes expected to carry a direction of +1 ...
            string[] expectedPositive =
            {
                "TotalMatchingFragmentCount", "Intensity", "PrecursorChargeDiffToMode", "DeltaScore", "PsmCount", "LongestFragmentIonSeries"
            };
            foreach (string name in expectedPositive)
            {
                Assert.AreEqual(1, PsmData.assumedAttributeDirection[name]);
            }

            // ... and those expected to carry a direction of -1.
            string[] expectedNegative =
            {
                "Notch", "ModsCount", "MissedCleavagesCount", "Ambiguity", "HydrophobicityZScore", "IsVariantPeptide"
            };
            foreach (string name in expectedNegative)
            {
                Assert.AreEqual(-1, PsmData.assumedAttributeDirection[name]);
            }

            // Every feature gets a distinct value (except the two zeros) so the column order of ToString is visible.
            PsmData sample = new PsmData
            {
                TotalMatchingFragmentCount = 0,
                Intensity                  = 1,
                PrecursorChargeDiffToMode  = 2,
                DeltaScore                 = 3,
                Notch                      = 4,
                PsmCount                   = 5,
                ModsCount                  = 6,
                MissedCleavagesCount       = 7,
                Ambiguity                  = 8,
                LongestFragmentIonSeries   = 9,
                HydrophobicityZScore       = 10,
                IsVariantPeptide           = 0,
                Label                      = false
            };

            // "standard" prints all twelve training variables, each preceded by a tab.
            Assert.AreEqual("\t0\t1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t0", sample.ToString("standard"));

            // "topDown" prints only its nine training variables (no 7, 10 or trailing 0).
            Assert.AreEqual("\t0\t1\t2\t3\t4\t5\t6\t8\t9", sample.ToString("topDown"));
        }
Example #3
0
        /// <summary>
        /// Extracts peptide-spectrum match (PSM) records from parsed search-engine results.
        /// </summary>
        /// <remarks>
        /// Handles <see cref="SearchAlgorithm.XTandem"/> and <see cref="SearchAlgorithm.IdentiPy"/> results; for
        /// any other algorithm the returned collection is empty. Required attributes and elements are read
        /// without null checks, so malformed input causes a <see cref="NullReferenceException"/> (or an
        /// <see cref="InvalidOperationException"/> from <c>First</c> when an expected element is absent). All
        /// numbers are parsed with the invariant culture because the input is machine-written XML, not
        /// user-facing text.
        /// </remarks>
        /// <param name="results">Root element of the search results.</param>
        /// <param name="searchAlgorithm">Search engine that produced <paramref name="results"/>.</param>
        /// <returns>The extracted PSMs, each added to the collection under its <c>Id</c>.</returns>
        public static PsmDataCollection ExtractPsmData(XElement results, SearchAlgorithm searchAlgorithm)
        {
            // Parsing with the current thread culture would break on systems whose decimal separator is ','.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            PsmDataCollection psms = new PsmDataCollection();

            if (searchAlgorithm == SearchAlgorithm.XTandem)
            {
                // Only "group" elements that carry a "protein" child describe a PSM.
                foreach (XElement group in results.Descendants("group").Where(g => g.Element("protein") != null))
                {
                    PsmData psm = new PsmData();

                    psm.Id = Convert.ToInt32(group.Attribute("id").Value, invariant);

                    // Ordinal comparison: the decoy prefix is an identifier, not linguistic text.
                    psm.Decoy = group.Attribute("label").Value.StartsWith("DECOY_", StringComparison.Ordinal);

                    // A "group" may list more than one protein, which just means the peptide isn't unique to a
                    // single protein. The scoring and modifications are identical (it is the same PSM), so only
                    // the first protein is used.
                    XElement domain = group.Element("protein").Element("peptide").Element("domain");

                    psm.Seq              = domain.Attribute("seq").Value;
                    psm.Start            = Convert.ToInt32(domain.Attribute("start").Value, invariant);
                    psm.End              = Convert.ToInt32(domain.Attribute("end").Value, invariant);
                    psm.Hyperscore       = Convert.ToDouble(domain.Attribute("hyperscore").Value, invariant);
                    psm.ExpectationValue = Convert.ToDouble(domain.Attribute("expect").Value, invariant);

                    // Relative difference between the group's "mh" and the domain's "mh", scaled by 1e6.
                    // A missing group-level "mh" attribute yields null, which Convert.ToDouble maps to 0.
                    double groupMh  = Convert.ToDouble(group.Attribute("mh")?.Value, invariant);
                    double domainMh = Convert.ToDouble(domain.Attribute("mh").Value, invariant);
                    psm.MassDrift = (groupMh - domainMh) / domainMh * 1e6;

                    psm.Charge = Convert.ToInt32(group.Attribute("z").Value, invariant);

                    psm.MissedCleavages = GetMissedCleavages(psm.Seq);

                    // Add the modifications, if there are any (Elements() returns an empty sequence when none exist).
                    foreach (XElement aa in domain.Elements("aa"))
                    {
                        Modification mod = new Modification();

                        // Convert the reported location to a zero-based index within the peptide.
                        mod.Loc  = Convert.ToInt32(aa.Attribute("at").Value, invariant) - psm.Start;
                        mod.AA   = aa.Attribute("type").Value;
                        mod.Mass = Convert.ToDouble(aa.Attribute("modified").Value, invariant);

                        psm.Mods.Add(mod);
                    }

                    psms.Add(psm.Id, psm);
                }
            }

            if (searchAlgorithm == SearchAlgorithm.IdentiPy)
            {
                XNamespace nsp = "http://regis-web.systemsbiology.net/pepXML";

                // First build a lookup of modification masses for the IdentiPy results.
                // Keys are the amino acid mass after modification, which is what IdentiPy reports per hit.
                // Values are the mass differences, which is what is given in the mass@aa arguments to the CLI.
                // Dictionary.Add throws ArgumentException if two declared modifications share the same mass.
                XElement summary = results.Descendants(nsp + "search_summary").First();
                Dictionary<double, double> modInfo = new Dictionary<double, double>();

                // Amino-acid modifications are registered first, then terminal modifications.
                IEnumerable<XElement> declaredMods = summary.Elements(nsp + "aminoacid_modification")
                                                            .Concat(summary.Elements(nsp + "terminal_modification"));

                foreach (XElement declared in declaredMods)
                {
                    modInfo.Add(Convert.ToDouble(declared.Attribute("mass").Value, invariant),
                                Convert.ToDouble(declared.Attribute("massdiff").Value, invariant));
                }

                // Now the PSM data itself can be parsed.
                foreach (XElement query in results.Descendants(nsp + "spectrum_query"))
                {
                    PsmData psm = new PsmData();

                    psm.Id = Convert.ToInt32(query.Attribute("index").Value, invariant);

                    XElement searchHit = query.Element(nsp + "search_result").Element(nsp + "search_hit");

                    psm.Decoy = searchHit.Attribute("protein").Value.StartsWith("DECOY_", StringComparison.Ordinal);

                    psm.Seq = searchHit.Attribute("peptide").Value;

                    // Start/End are not read from these results; -1 is stored as a placeholder.
                    psm.Start = -1;
                    psm.End   = -1;

                    psm.Hyperscore = Convert.ToDouble(
                        searchHit.Elements(nsp + "search_score").First(s => s.Attribute("name").Value == "hyperscore").Attribute("value").Value,
                        invariant);

                    psm.ExpectationValue = Convert.ToDouble(
                        searchHit.Elements(nsp + "search_score").First(s => s.Attribute("name").Value == "expect").Attribute("value").Value,
                        invariant);

                    psm.MassDrift = Convert.ToDouble(searchHit.Attribute("massdiff").Value, invariant)
                                    / Convert.ToDouble(query.Attribute("precursor_neutral_mass").Value, invariant) * 1e6;

                    psm.Charge = Convert.ToInt32(query.Attribute("assumed_charge").Value, invariant);

                    psm.MissedCleavages = GetMissedCleavages(psm.Seq);

                    // Add the modifications, if there are any. The element is absent for unmodified hits.
                    XElement modificationInfo = searchHit.Element(nsp + "modification_info");

                    if (modificationInfo != null)
                    {
                        XAttribute nTermMass = modificationInfo.Attribute("mod_nterm_mass");

                        if (nTermMass != null)
                        {
                            Modification mod = new Modification();

                            mod.Loc  = 0; // it's the N-terminus
                            mod.AA   = psm.Seq[0].ToString();
                            mod.Mass = modInfo[Convert.ToDouble(nTermMass.Value, invariant)];

                            psm.Mods.Add(mod);
                        }

                        foreach (XElement aa in modificationInfo.Elements(nsp + "mod_aminoacid_mass"))
                        {
                            Modification mod = new Modification();

                            // Convert the one-based position to a zero-based index within the peptide.
                            mod.Loc  = Convert.ToInt32(aa.Attribute("position").Value, invariant) - 1;
                            mod.AA   = psm.Seq[mod.Loc].ToString();
                            mod.Mass = modInfo[Convert.ToDouble(aa.Attribute("mass").Value, invariant)];

                            psm.Mods.Add(mod);
                        }
                    }

                    psms.Add(psm.Id, psm);
                }
            }

            return psms;
        }
Example #4
0
        /// <summary>
        /// Trains a FastTree binary classifier on the supplied PSMs, uses it to assign a posterior error
        /// probability (PEP) to every non-null PSM, and removes ambiguous peptide candidates whose predicted
        /// probability is too far below the best candidate of the same PSM.
        /// </summary>
        /// <param name="psms">
        /// PSMs to train on and to score. Null entries are skipped when PEP values are assigned; scored entries
        /// are modified in place (<c>FdrInfo.PEP</c> is set and ambiguous peptides may be removed).
        /// </param>
        /// <returns>
        /// The text produced by <c>PrintBinaryClassificationMetrics</c> for the held-out test split, or an empty
        /// string if evaluating the model throws.
        /// </returns>
        public static string ComputePEPValuesForAllPSMsGeneric(List <PeptideSpectralMatch> psms)
        {
            string   searchType        = DetermineSearchType(psms);
            string[] trainingVariables = PsmData.trainingInfos[searchType];

            // These two dictionaries contain the average and standard deviation of hydrophobicities measured in
            // 1 minute increments across each raw file separately. An individually measured hydrophobicity
            // calculated for a specific PSM sequence is compared to these values by computing the z-score. That
            // z-score is used in the machine learning. Separate dictionaries are created for peptides with
            // modifications because SSRCalc doesn't really do a good job predicting their hydrophobicity.
            //
            // Outer key: the file name.
            // Inner key: a retention time rounded to the nearest minute.
            // Inner value: (average, standard deviation) of the predicted hydrophobicities of the observed
            //              peptides eluting at that rounded retention time.
            var hydrophobicityUnmodified = new Dictionary<string, Dictionary<int, Tuple<double, double>>>();
            var hydrophobicityModified   = new Dictionary<string, Dictionary<int, Tuple<double, double>>>();

            // Only pay for the hydrophobicity profiling when the chosen search type actually trains on it.
            if (trainingVariables.Contains("HydrophobicityZScore"))
            {
                hydrophobicityUnmodified = ComputeHydrophobicityValues(psms, false);
                hydrophobicityModified   = ComputeHydrophobicityValues(psms, true);
            }

            Dictionary<string, int> sequenceToPsmCount = GetSequenceToPSMCount(psms);
            int chargeStateMode = GetChargeStateMode(psms);

            MLContext mlContext = new MLContext();
            IDataView dataView  = mlContext.Data.LoadFromEnumerable(
                CreatePsmData(psms, sequenceToPsmCount, hydrophobicityUnmodified, hydrophobicityModified, chargeStateMode, trainingVariables));

            // Hold out 10% of the rows for evaluation. No sampling key column is used, so rows are not grouped.
            // The seed, 42, is not random but fixed for consistency between runs. (According to the
            // supercomputer Deep Thought, 42 is the answer to the question of life, the universe and
            // everything in Douglas Adams's Hitchhiker's Guide to the Galaxy.)
            TrainTestData trainTestSplit = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.1, samplingKeyColumnName: null, seed: 42);
            IDataView     trainingData   = trainTestSplit.TrainSet;
            IDataView     testData       = trainTestSplit.TestSet;

            // This instance is only used for its ToString() in the metrics report; the pipeline below builds
            // its own, identically configured, trainer.
            var trainer = mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features");

            var pipeline = mlContext.Transforms.Concatenate("Features", trainingVariables)
                           .Append(mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features"));

            var trainedModel = pipeline.Fit(trainingData);

            var predictionEngine = mlContext.Model.CreatePredictionEngine<PsmData, TruePositivePrediction>(trainedModel);

            int ambiguousPeptidesRemovedCount = 0;

            foreach (PeptideSpectralMatch psm in psms)
            {
                if (psm == null)
                {
                    continue;
                }

                // Snapshot the candidates once: peptides are removed from the PSM further down, and the
                // snapshot keeps each prediction paired with the candidate it was computed for.
                var candidates    = psm.BestMatchingPeptides.ToList();
                var probabilities = new List<double>(candidates.Count);

                // Compute the prediction for each ambiguous peptide in the PSM.
                foreach (var (Notch, Peptide) in candidates)
                {
                    PsmData pd = CreateOnePsmDataEntry(psm, sequenceToPsmCount, hydrophobicityUnmodified, hydrophobicityModified, chargeStateMode, Peptide, trainingVariables, Notch, !Peptide.Protein.IsDecoy);

                    // A raw score is also available on the prediction object (.Score) if ever needed.
                    probabilities.Add(predictionEngine.Predict(pd).Probability);
                }

                // NOTE: Max() throws InvalidOperationException if the PSM has no best-matching peptides.
                double highestProbability = probabilities.Max();
                var    retained           = new List<double>(probabilities.Count);

                // Candidates whose probability is further than the threshold from the best one are removed
                // from the PSM, in their original order; the rest keep contributing to the PEP.
                for (int i = 0; i < candidates.Count; i++)
                {
                    if (Math.Abs(highestProbability - probabilities[i]) > AbsoluteProbabilityThatDistinguishesPeptides)
                    {
                        var (notch, pwsm) = candidates[i];
                        psm.RemoveThisAmbiguousPeptide(notch, pwsm);
                        ambiguousPeptidesRemovedCount++;
                    }
                    else
                    {
                        retained.Add(probabilities[i]);
                    }
                }

                // PEP is the complement of the best remaining true-positive probability.
                psm.FdrInfo.PEP = 1 - retained.Max();
            }

            var predictions = trainedModel.Transform(testData);

            try
            {
                CalibratedBinaryClassificationMetrics metrics = mlContext.BinaryClassification.Evaluate(data: predictions, labelColumnName: "Label", scoreColumnName: "Score");
                return PrintBinaryClassificationMetrics(trainer.ToString(), metrics, ambiguousPeptidesRemovedCount);
            }
            catch
            {
                // Deliberately best-effort: the PEP values have already been assigned above, so a failure
                // while evaluating the held-out split only costs the metrics report.
                return "";
            }

            //if you want to save a model, you can use this example
            //mlContext.Model.Save(trainedModel, trainingData.Schema, @"C:\Users\User\Downloads\TrainedModel.zip");
        }
Example #5
0
        // End-to-end test of the PEP machinery: runs a classic search plus FDR analysis on a small mzML/FASTA
        // pair, checks the PsmData features built for the top-scoring PSM, runs
        // PEP_Analysis.ComputePEPValuesForAllPSMsGeneric and compares its metrics report against a fixed
        // string, and finally checks that a PSM built from a variant peptide is flagged as such.
        public static void TestComputePEPValue()
        {
            // --- Search setup: no modifications, 5 ppm search mode, reversed decoys generated from the FASTA ---
            var              variableModifications = new List <Modification>();
            var              fixedModifications    = new List <Modification>();
            var              origDataFile          = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\TaGe_SA_HeLa_04_subset_longestSeq.mzML");
            MyFileManager    myFileManager         = new MyFileManager(true);
            CommonParameters CommonParameters      = new CommonParameters(digestionParams: new DigestionParams());
            var              myMsDataFile          = myFileManager.LoadFile(origDataFile, CommonParameters);
            var              searchModes           = new SinglePpmAroundZeroSearchMode(5);
            List <Protein>   proteinList           = ProteinDbLoader.LoadProteinFasta(Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\hela_snip_for_unitTest.fasta"), true, DecoyType.Reverse, false, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex,
                                                                                      ProteinDbLoader.UniprotOrganismRegex, out var dbErrors, -1);
            // Scans are ordered by precursor mass; note the relative path passed here is also used below as the
            // key of the hydrophobicity dictionary.
            var listOfSortedms2Scans = MetaMorpheusTask.GetMs2Scans(myMsDataFile, @"TestData\TaGe_SA_HeLa_04_subset_longestSeq.mzML", CommonParameters).OrderBy(b => b.PrecursorMass).ToArray();

            // The search engine fills allPsmsArray (one slot per scan; slots may stay null), then the FDR
            // engine is run over the non-null PSMs. The FDR results object itself is not referenced again.
            PeptideSpectralMatch[] allPsmsArray = new PeptideSpectralMatch[listOfSortedms2Scans.Length];
            new ClassicSearchEngine(allPsmsArray, listOfSortedms2Scans, variableModifications, fixedModifications, null, null, null, proteinList, searchModes, CommonParameters, new List <string>()).Run();
            FdrAnalysisResults fdrResultsClassicDelta = (FdrAnalysisResults)(new FdrAnalysisEngine(allPsmsArray.Where(p => p != null).ToList(), 1, CommonParameters, new List <string>()).Run());

            // nonNullPsmsOriginalCopy is not referenced again in this method.
            var nonNullPsms             = allPsmsArray.Where(p => p != null).ToList();
            var nonNullPsmsOriginalCopy = allPsmsArray.Where(p => p != null).ToList();

            // Pick the first PSM that has the highest score; its features are checked below.
            var maxScore = nonNullPsms.Select(n => n.Score).Max();
            PeptideSpectralMatch maxScorePsm = nonNullPsms.Where(n => n.Score == maxScore).First();

            // --- Build sequence -> PSM count; the key is every best-matching full sequence of a PSM joined by '|' ---
            Dictionary <string, int> sequenceToPsmCount = new Dictionary <string, int>();

            List <string> sequences = new List <string>();

            foreach (PeptideSpectralMatch psm in nonNullPsms)
            {
                var ss = psm.BestMatchingPeptides.Select(b => b.Peptide.FullSequence).ToList();
                sequences.Add(String.Join("|", ss));
            }

            var s = sequences.GroupBy(i => i);

            foreach (var grp in s)
            {
                sequenceToPsmCount.Add(grp.Key, grp.Count());
            }

            // --- Hand-made hydrophobicity profile: one entry (key 154) for the test file; the dictionary for
            //     modified peptides is left empty ---
            Dictionary <string, Dictionary <int, Tuple <double, double> > > fileSpecificRetTimeHI_behavior = new Dictionary <string, Dictionary <int, Tuple <double, double> > >();
            Dictionary <string, Dictionary <int, Tuple <double, double> > > fileSpecificRetTemHI_behaviorModifiedPeptides = new Dictionary <string, Dictionary <int, Tuple <double, double> > >();

            //average hydrophobicity, standard deviation hydrophobicity
            Tuple <double, double> at = new Tuple <double, double>(33.0, 1.0);

            Dictionary <int, Tuple <double, double> > HI_Time_avg_dev = new Dictionary <int, Tuple <double, double> >
            {
                { 154, at }
            };

            fileSpecificRetTimeHI_behavior.Add(@"TestData\TaGe_SA_HeLa_04_subset_longestSeq.mzML", HI_Time_avg_dev);

            // Training variables passed explicitly to CreateOnePsmDataEntry for this test.
            string[] trainingVariables = new[] { "HydrophobicityZScore", "Intensity", "ScanPrecursorCharge", "DeltaScore", "Notch", "PsmCount", "ModsCount", "MissedCleavagesCount", "Ambiguity", "LongestFragmentIonSeries", "IsVariantPeptide" };

            int chargeStateMode = 4;

            // --- Feature extraction for the top-scoring PSM, using its first best-matching peptide ---
            var(notch, pwsm) = maxScorePsm.BestMatchingPeptides.First();
            var maxPsmData = PEP_Analysis.CreateOnePsmDataEntry(maxScorePsm, sequenceToPsmCount, fileSpecificRetTimeHI_behavior, fileSpecificRetTemHI_behaviorModifiedPeptides, chargeStateMode, pwsm, trainingVariables, notch, !pwsm.Protein.IsDecoy);

            // Each feature is compared with the value derived directly from the PSM.
            Assert.That(maxScorePsm.PeptidesToMatchingFragments.Count - 1, Is.EqualTo(maxPsmData.Ambiguity));
            Assert.That(maxScorePsm.DeltaScore, Is.EqualTo(maxPsmData.DeltaScore).Within(0.05));
            // Intensity is compared with the fractional part of the PSM score.
            Assert.That((float)(maxScorePsm.Score - (int)maxScorePsm.Score), Is.EqualTo(maxPsmData.Intensity).Within(0.05));
            Assert.That(maxPsmData.HydrophobicityZScore, Is.EqualTo(5.170955).Within(0.05));
            Assert.That(maxScorePsm.BestMatchingPeptides.Select(p => p.Peptide).First().MissedCleavages, Is.EqualTo(maxPsmData.MissedCleavagesCount));
            Assert.That(maxScorePsm.BestMatchingPeptides.Select(p => p.Peptide).First().AllModsOneIsNterminus.Values.Count(), Is.EqualTo(maxPsmData.ModsCount));
            Assert.That(maxScorePsm.Notch ?? 0, Is.EqualTo(maxPsmData.Notch));
            Assert.That(maxScorePsm.PsmCount, Is.EqualTo(maxPsmData.PsmCount));
            Assert.That(-Math.Abs(chargeStateMode - maxScorePsm.ScanPrecursorCharge), Is.EqualTo(maxPsmData.PrecursorChargeDiffToMode));
            Assert.AreEqual((float)0, maxPsmData.IsVariantPeptide);

            // --- First PEP run: the returned metrics string is discarded; only FdrInfo.PEP on the PSMs is inspected ---
            PEP_Analysis.ComputePEPValuesForAllPSMsGeneric(nonNullPsms);

            // Count the PSMs whose PEP is at least 0.5.
            int trueCount = 0;

            foreach (var item in allPsmsArray.Where(p => p != null))
            {
                var b = item.FdrInfo.PEP;
                if (b >= 0.5)
                {
                    trueCount++;
                }
            }

            // --- Second PEP run on a list holding every PSM three times (the same object references) ---
            List <PeptideSpectralMatch> moreNonNullPSMs = new List <PeptideSpectralMatch>();

            for (int i = 0; i < 3; i++)
            {
                foreach (PeptideSpectralMatch psm in nonNullPsms)
                {
                    moreNonNullPSMs.Add(psm);
                }
            }

            // The expected report is compared verbatim, including "\r\n" line endings and full-precision numbers.
            string expectedMetrics = "************************************************************\r\n*       Metrics for Determination of PEP Using Binary Classification      \r\n" +
                                     "*-----------------------------------------------------------\r\n*       Accuracy:  1\r\n*       Area Under Curve:  1\r\n*       Area under Precision recall Curve:  1\r\n*       F1Score:  1\r\n" +
                                     "*       LogLoss:  2.60551851621861E-10\r\n*       LogLossReduction:  0.999999999599165\r\n*       PositivePrecision:  1\r\n*       PositiveRecall:  1\r\n*       NegativePrecision:  1\r\n" +
                                     "*       NegativeRecall:  1\r\n*       Count of Ambiguous Peptides Removed:  0\r\n************************************************************\r\n";

            string metrics = PEP_Analysis.ComputePEPValuesForAllPSMsGeneric(moreNonNullPSMs);

            Assert.AreEqual(expectedMetrics, metrics);
            // NUnit's GreaterOrEqual(arg1, arg2) asserts arg1 >= arg2, so this passes when trueCount is at most 32.
            // NOTE(review): confirm that an upper bound (rather than a lower bound) on trueCount is intended.
            Assert.GreaterOrEqual(32, trueCount);

            // Test that a variant peptide given as input is identified as such in the PEP calculation input.
            // Most of the next several lines are simply necessary to create a PSM.

            var anMzSpectrum             = new MzSpectrum(new double[] { 1, 1 }, new double[] { 2, 2 }, true);
            Ms2ScanWithSpecificMass scan = new Ms2ScanWithSpecificMass(new MsDataScan(anMzSpectrum, 1, 1, true, Polarity.Negative, 2, null, "", MZAnalyzerType.Orbitrap, 2, null, null, null), 1, 1, "path", new CommonParameters());
            // Protein with one sequence variation spanning positions 4-6 ("PPP" -> "P").
            Protein variantProtein       = new Protein("MPEPPPTIDE", "protein3", sequenceVariations: new List <SequenceVariation> {
                new SequenceVariation(4, 6, "PPP", "P", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null)
            });
            // The first peptide digested from the variant proteins is wrapped in a PSM that reuses the score
            // and scan index of the top-scoring PSM.
            PeptideWithSetModifications varPep     = variantProtein.GetVariantProteins().SelectMany(p => p.Digest(CommonParameters.DigestionParams, null, null)).FirstOrDefault();
            PeptideSpectralMatch        variantPSM = new PeptideSpectralMatch(varPep, 0, maxScorePsm.Score, maxScorePsm.ScanIndex, scan, new DigestionParams(), null);

            // Rebuild the sequence -> PSM count with the variant PSM included.
            sequenceToPsmCount = new Dictionary <string, int>();
            sequences          = new List <string>();
            nonNullPsms.Add(variantPSM);
            foreach (PeptideSpectralMatch psm in nonNullPsms)
            {
                var ss = psm.BestMatchingPeptides.Select(b => b.Peptide.FullSequence).ToList();
                sequences.Add(String.Join("|", ss));
            }

            s = sequences.GroupBy(i => i);

            foreach (var grp in s)
            {
                sequenceToPsmCount.Add(grp.Key, grp.Count());
            }
            var(vnotch, vpwsm) = variantPSM.BestMatchingPeptides.First();
            // NOTE(review): the last argument is derived from maxScorePsm.IsDecoy, not from variantPSM — confirm
            // this is intended; the assertion below only inspects IsVariantPeptide.
            PsmData variantPsmData = PEP_Analysis.CreateOnePsmDataEntry(variantPSM, sequenceToPsmCount, fileSpecificRetTimeHI_behavior, fileSpecificRetTemHI_behaviorModifiedPeptides, chargeStateMode, vpwsm, trainingVariables, vnotch, !maxScorePsm.IsDecoy);

            Assert.AreEqual((float)1, variantPsmData.IsVariantPeptide);
        }
Example #6
0
        public static string ComputePEPValuesForAllPSMsGeneric(List <PeptideSpectralMatch> psms)
        {
            Dictionary <string, int> accessionAppearances = GetAccessionCounts(psms);
            Dictionary <string, int> sequenceToPsmCount   = GetSequenceToPSMCount(psms);

            MLContext mlContext = new MLContext();
            IDataView dataView  = mlContext.Data.LoadFromEnumerable(CreatePsmData(psms, accessionAppearances, sequenceToPsmCount));

            //
            // Summary:
            //     Split the dataset into the train set and test set according to the given fraction.
            //     Respects the samplingKeyColumnName if provided.
            //
            // Parameters:
            //   data:
            //     The dataset to split.
            //
            //   testFraction:
            //     The fraction of data to go into the test set.
            //
            //   samplingKeyColumnName:
            //     Name of a column to use for grouping rows. If two examples share the same value
            //     of the samplingKeyColumnName, they are guaranteed to appear in the same subset
            //     (train or test). This can be used to ensure no label leakage from the train to
            //     the test set. If null no row grouping will be performed.
            //
            //   seed:
            //     Seed for the random number generator used to select rows for the train-test split.
            TrainTestData trainTestSplit = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.1);
            IDataView     trainingData   = trainTestSplit.TrainSet;
            IDataView     testData       = trainTestSplit.TestSet;

            var trainer = mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features");

            var pipeline = mlContext.Transforms.Concatenate("Features", "Intensity", "ScanPrecursorCharge", "DeltaScore", "Notch", "PsmCount", "ModsCount", "MissedCleavagesCount", "Ambiguity", "AccessionAppearances", "LongestFragmentIonSeries")
                           .Append(mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features"));

            var trainedModel = pipeline.Fit(trainingData);

            var predictionEngine = mlContext.Model.CreatePredictionEngine <PsmData, TruePositivePrediction>(trainedModel);

            string ambiguousScans = "";

            //For Debug
            List <string> someOut = new List <string>();

            someOut.Add("Accessions|Ambiguity|DeltaScore|Intensity|Label|LongestSeries|MissedCleavages|ModsCount|Notch|PsmCount|PrecursorCharge|call|pepValue|Score|QValue");

            foreach (PeptideSpectralMatch psm in psms)
            {
                if (psm != null)
                {
                    List <int> indiciesOfPeptidesToRemove = new List <int>();
                    List <(int notch, PeptideWithSetModifications pwsm)> bestMatchingPeptidesToRemove = new List <(int notch, PeptideWithSetModifications pwsm)>();
                    List <double> pepValuePredictions = new List <double>();

                    //Here we compute the pepvalue prediction for each ambiguous peptide in a PSM. Ambiguous peptides with lower pepvalue predictions are removed from the PSM.
                    foreach (var(Notch, Peptide) in psm.BestMatchingPeptides)
                    {
                        PsmData pd = CreateOnePsmDataFromPsm2(psm, Notch, Peptide, accessionAppearances, sequenceToPsmCount);

                        var pepValuePrediction = predictionEngine.Predict(pd);

                        someOut.Add(pd.AccessionAppearances.ToString() + "|" + pd.Ambiguity.ToString() + "|" + pd.DeltaScore.ToString() + "|" + pd.Intensity.ToString() + "|" + pd.Label + "|" + pd.LongestFragmentIonSeries + "|" + pd.MissedCleavagesCount + "|" + pd.ModsCount + "|" + pd.Notch + "|" + pd.PsmCount + "|" + pd.ScanPrecursorCharge + "|" + pepValuePrediction.Prediction + "|" + pepValuePrediction.Probability + "|" + pepValuePrediction.Score);

                        pepValuePredictions.Add(pepValuePrediction.Probability);
                        //A score is available using the variable pepValuePrediction.Score
                    }

                    double highestPredictedPEPValue = pepValuePredictions.Max();
                    int    numberOfPredictions      = pepValuePredictions.Count - 1;

                    for (int i = numberOfPredictions; i >= 0; i--)
                    {
                        if (Math.Abs(highestPredictedPEPValue - pepValuePredictions[i]) > 0.000001)
                        {
                            indiciesOfPeptidesToRemove.Add(i);
                            pepValuePredictions.RemoveAt(i);
                            //pValuePredictionStrings.RemoveAt(i);
                        }
                    }

                    int index = 0;

                    foreach (var(Notch, Peptide) in psm.BestMatchingPeptides)
                    {
                        if (indiciesOfPeptidesToRemove.Contains(index))
                        {
                            bestMatchingPeptidesToRemove.Add((Notch, Peptide));
                        }
                        index++;
                    }

                    foreach (var(notch, pwsm) in bestMatchingPeptidesToRemove)
                    {
                        ambiguousScans = ambiguousScans + psm.ScanNumber + "|";
                        psm.RemoveThisAmbiguousPeptide(notch, pwsm);
                    }

                    psm.FdrInfo.PEP = 1 - pepValuePredictions[0]; //they should all be the same at this point so it doesn't matter which you take. First is good.
                }
            }

            //For debug
            //File.WriteAllLines(@"C:\Users\Michael Shortreed\Downloads\psmDataVAlues.txt", someOut, System.Text.Encoding.UTF8);

            var predictions = trainedModel.Transform(testData);

            CalibratedBinaryClassificationMetrics metrics;

            try
            {
                metrics = mlContext.BinaryClassification.Evaluate(data: predictions, labelColumnName: "Label", scoreColumnName: "Score");
                return(PrintBinaryClassificationMetrics(trainer.ToString(), metrics));
            }
            catch
            {
                return("");
            }

            //if you want to save a model, you can use this example
            //mlContext.Model.Save(trainedModel, trainingData.Schema, @"C:\Users\User\Downloads\TrainedModel.zip");
        }