Ejemplo n.º 1
0
        /// <summary>
        /// Digests every protein read from the yeast FASTA resource, shuffles the resulting peptides with a
        /// fixed seed, and then groups the proteins repeatedly while a growing leading fraction of the
        /// shuffled peptides is treated as "identified". The peptide and protein-group counts of each run
        /// are written to the console.
        /// </summary>
        /// <param name="protease">Protease handed to the digestion and to the protein grouping.</param>
        /// <param name="percentIdentifiedSteps">Increase of the identified fraction between runs (0.05 = 5 %); must be greater than zero.</param>
        /// <param name="maxMissed">Forwarded to both the digestion and the protein grouping.</param>
        /// <param name="minLength">Forwarded to the digestion.</param>
        /// <param name="maxLength">Forwarded to the digestion.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// Thrown when <paramref name="percentIdentifiedSteps"/> is not greater than zero.
        /// </exception>
        public static void StartRamp(IProtease protease, double percentIdentifiedSteps = 0.05, int maxMissed = 3, int minLength = 5, int maxLength = 35)
        {
            // A zero or negative step would never move the ramp towards 100 % (endless loop).
            if (!(percentIdentifiedSteps > 0))
            {
                throw new ArgumentOutOfRangeException("percentIdentifiedSteps", percentIdentifiedSteps, "The step must be greater than zero.");
            }

            List<Peptide> peps = new List<Peptide>();
            List<Protein> proteins = new List<Protein>();

            using (FastaReader reader = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
            {
                foreach (Protein protein in reader.ReadNextProtein())
                {
                    peps.AddRange(protein.Digest(protease, maxMissed, minLength, maxLength));
                    proteins.Add(protein);
                }
            }

            // Fixed seed to make it reproducible
            Random random = new Random(480912341);

            peps = peps.OrderBy(x => random.Next()).ToList();

            // Count the steps with an integer: repeatedly adding a double step accumulates rounding
            // error, which can push the counter just past 1 and silently drop the final run.
            // The small epsilon absorbs the rounding of 1 / step itself.
            long lastStep = (long)System.Math.Floor(1.0 / percentIdentifiedSteps + 1e-9);

            for (long step = 0; step <= lastStep; step++)
            {
                // Clamp so the epsilon above can never produce a fraction larger than 100 %.
                double percentIdentified = System.Math.Min(1.0, step * percentIdentifiedSteps);

                // Take the first x % to act as our identified peptides
                List<Peptide> identifiedPeptides = peps.Take((int)(peps.Count * percentIdentified)).ToList();

                List<ProteinGroup> proteinGroups = ProteinGroup.GroupProteins(proteins, protease, identifiedPeptides, new AminoAcidLeucineSequenceComparer(), maxMissed).ToList();
                Console.WriteLine("{0} peptides {1} protein groups", identifiedPeptides.Count, proteinGroups.Count);
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Timed demonstration of protein grouping: digests every protein read from the yeast FASTA
        /// resource, picks a seeded-random subset of the peptides as the "identified" set, groups the
        /// proteins with that subset, and reports counts and elapsed times on the console.
        /// </summary>
        /// <param name="protease">Protease handed to the digestion and to the protein grouping.</param>
        /// <param name="percentIdentified">Fraction of all digested peptides to treat as identified (0.01 = 1 %).</param>
        /// <param name="maxMissed">Forwarded to both the digestion and the protein grouping.</param>
        /// <param name="minLength">Forwarded to the digestion.</param>
        /// <param name="maxLength">Forwarded to the digestion.</param>
        public static void ExampleProteinGrouping(IProtease protease, double percentIdentified = 0.01, int maxMissed = 3, int minLength = 5, int maxLength = 50)
        {
            Stopwatch timer = Stopwatch.StartNew();

            // Pre-sized so the lists do not have to grow repeatedly while loading.
            var allPeptides = new List<Peptide>(1000000);
            var allProteins = new List<Protein>(7000);

            using (var fasta = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
            {
                foreach (Protein entry in fasta.ReadNextProtein())
                {
                    allPeptides.AddRange(entry.Digest(protease, maxMissed, minLength, maxLength));
                    allProteins.Add(entry);
                }
            }

            Console.WriteLine("Loaded {0:N0} peptides from {1:N0} proteins in {2} ms", allPeptides.Count, allProteins.Count, timer.ElapsedMilliseconds);

            // From here on the stopwatch measures only the selection and grouping.
            timer.Restart();

            var shuffler = new Random(480912341);

            // Shuffle by a random sort key and keep the first x % as the identified peptides.
            int identifiedCount = (int)(allPeptides.Count * percentIdentified);
            List<Peptide> identifiedPeptides = allPeptides
                .OrderBy(p => shuffler.Next())
                .Take(identifiedCount)
                .ToList();

            List<ProteinGroup> groups = ProteinGroup
                .GroupProteins(allProteins, protease, identifiedPeptides, new AminoAcidLeucineSequenceComparer(), maxMissed)
                .ToList();

            timer.Stop();

            Console.WriteLine("{0:N0} proteins produced {1:N0} protein groups from {2:N0} identified sequences", allProteins.Count, groups.Count, identifiedPeptides.Count);
            Console.WriteLine();
            Console.WriteLine("Time elapsed: {0} ms", timer.ElapsedMilliseconds);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Smoke test for the ToString implementations of the quantification types. Only the text of the
        /// indexed peak is asserted; every other call merely has to return without throwing.
        /// </summary>
        public static void TestToString()
        {
            var peak = new IndexedMassSpectralPeak(1.0, 2.0, 3, 4);
            string peakText = peak.ToString();

            Assert.That(peakText.Equals("1.000; 4; 3"));

            // The remaining strings are intentionally unused: the calls themselves are the test.
            var fileInfo = new SpectraFileInfo("myFullPath", "", 0, 0, 0);
            string fileInfoText = fileInfo.ToString();

            var group = new ProteinGroup("Accession", "Gene", "Organism");
            var filesForGroup = new List<SpectraFileInfo> { fileInfo };
            string groupText = group.ToString(filesForGroup);

            var groupsForId = new List<ProteinGroup> { group };
            var id = new Identification(fileInfo, "PEPTIDE", "PEPTIDE", 1.0, 2.0, 3, groupsForId);
            string idText = id.ToString();

            var chromatographicPeak = new ChromatographicPeak(id, false, fileInfo);
            string textBeforeIntensity = chromatographicPeak.ToString();

            // Render once more after the intensity has been calculated.
            chromatographicPeak.CalculateIntensityForThisFeature(true);
            string textAfterIntensity = chromatographicPeak.ToString();
        }
Ejemplo n.º 4
0
        /// <summary>
        /// End-to-end FlashLFQ test: quantifies the same peptide in a .raw and an .mzML file (both assigned
        /// condition "a"), checks that each file yields exactly one non-MBR peak with positive peptide and
        /// protein intensities, checks that the two peak intensities agree after normalization, and
        /// finally writes the result tables to the test directory.
        /// </summary>
        public static void TestFlashLfq()
        {
            const string sequence = "EGFQVADGPLYR";
            const double monoMass = 1350.65681;
            string testDir = TestContext.CurrentContext.TestDirectory;

            // get the raw file paths
            var raw = new SpectraFileInfo(Path.Combine(testDir, @"sliced-raw.raw"), "a", 0, 0, 0);
            var mzml = new SpectraFileInfo(Path.Combine(testDir, @"sliced-mzml.mzml"), "a", 0, 1, 0);

            // create some PSMs: two per file, at two different retention times
            var pg = new ProteinGroup("MyProtein", "gene", "org");
            var id1 = new Identification(raw, sequence, sequence, monoMass, 94.12193, 2, new List<ProteinGroup> { pg });
            var id2 = new Identification(raw, sequence, sequence, monoMass, 94.05811, 2, new List<ProteinGroup> { pg });
            var id3 = new Identification(mzml, sequence, sequence, monoMass, 94.12193, 2, new List<ProteinGroup> { pg });
            var id4 = new Identification(mzml, sequence, sequence, monoMass, 94.05811, 2, new List<ProteinGroup> { pg });

            // create the FlashLFQ engine
            var psms = new List<Identification> { id1, id2, id3, id4 };
            var engine = new FlashLFQEngine(psms, normalize: true);

            // run the engine
            var results = engine.Run();

            // check raw results
            Assert.That(results.Peaks[raw].Count == 1);
            var rawPeak = results.Peaks[raw].First();
            Assert.That(rawPeak.Intensity > 0);
            Assert.That(!rawPeak.IsMbrFeature);
            Assert.That(results.PeptideBaseSequences[sequence].GetIntensity(raw) > 0);
            Assert.That(results.PeptideModifiedSequences[sequence].GetIntensity(raw) > 0);
            Assert.That(results.ProteinGroups["MyProtein"].GetIntensity(raw) > 0);

            // check mzml results
            Assert.That(results.Peaks[mzml].Count == 1);
            var mzmlPeak = results.Peaks[mzml].First();
            Assert.That(mzmlPeak.Intensity > 0);
            Assert.That(!mzmlPeak.IsMbrFeature);
            Assert.That(results.PeptideBaseSequences[sequence].GetIntensity(mzml) > 0);
            Assert.That(results.PeptideModifiedSequences[sequence].GetIntensity(mzml) > 0);
            Assert.That(results.ProteinGroups["MyProtein"].GetIntensity(mzml) > 0);

            // check that condition normalization worked: intensities must match once rounded to integers
            int roundedMzml = (int)System.Math.Round(results.Peaks[mzml].First().Intensity, 0);
            int roundedRaw = (int)System.Math.Round(results.Peaks[raw].First().Intensity, 0);

            Assert.That(roundedMzml == roundedRaw);

            // test peak output
            string peaksPath = Path.Combine(testDir, @"peaks.tsv");
            string modSeqPath = Path.Combine(testDir, @"modSeq.tsv");
            string baseSeqPath = Path.Combine(testDir, @"baseSeq.tsv");
            string proteinPath = Path.Combine(testDir, @"protein.tsv");

            results.WriteResults(peaksPath, modSeqPath, baseSeqPath, proteinPath);
        }
        /// <summary>
        /// Walks the given entity names and, for every entity of the requested type that has duplicates,
        /// removes those duplicates from <paramref name="entities"/> and adds a single group node in
        /// their place.
        /// </summary>
        /// <param name="entities">Name-to-node lookup; modified in place.</param>
        /// <param name="entityNames">Names to examine, in order. Names no longer present in <paramref name="entities"/> are skipped.</param>
        /// <param name="entityType">Exact runtime type to group; must be <c>typeof(Protein)</c> or <c>typeof(Peptide)</c>.</param>
        /// <param name="globalIDTracker">Passed through to the constructor of each new group.</param>
        /// <exception cref="ArgumentException">
        /// Thrown when duplicates are found for an entity whose type is neither Protein nor Peptide.
        /// </exception>
        void GroupProteinsOrPeptides(
            IDictionary<string, Node> entities,
            IReadOnlyList<string> entityNames,
            Type entityType,
            GlobalIDContainer globalIDTracker)
        {
            for (var index = 0; index < entityNames.Count; index++)
            {
                // One lookup instead of ContainsKey + indexer. Names whose node was already removed
                // as part of an earlier group simply are not found and are skipped.
                Node entity;
                if (!entities.TryGetValue(entityNames[index], out entity) || entity == null)
                {
                    continue;
                }

                // Only proceed if the node is exactly the requested type
                if (entity.GetType() != entityType)
                {
                    continue;
                }

                // Look for duplicates and add to a duplicate list
                var duplicates = new NodeChildren<Node>();
                duplicates.AddRange(FindDuplicates(entity));

                // A list of at most one node means there is nothing to merge
                if (duplicates.Count <= 1)
                {
                    continue;
                }

                // Create a protein or peptide group from the duplicates
                Group newGroup;

                if (entityType == typeof(Protein))
                {
                    newGroup = new ProteinGroup(duplicates, globalIDTracker);
                }
                else if (entityType == typeof(Peptide))
                {
                    newGroup = new PeptideGroup(duplicates, globalIDTracker);
                }
                else
                {
                    throw new ArgumentException("Invalid type: must be Protein or Peptide");
                }

                // Remove the grouped entities from the library, then add the new group
                foreach (var duplicateItem in duplicates)
                {
                    entities.Remove(duplicateItem.NodeName);
                }

                entities.Add(newGroup.NodeName, newGroup);
            }
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Writes a synthetic .mzML file of ten MS1 scans whose intensity profile rises to an apex, falls,
        /// and then rises again, quantifies a single PSM placed near the apex, and checks that the
        /// resulting peak is split to the right of the apex: apex at RT 1.3, split at RT 1.6, no envelopes
        /// after the split, six envelopes in total.
        /// </summary>
        public static void TestPeakSplittingRight()
        {
            string fileToWrite = "myMzml.mzML";
            string peptide = "PEPTIDE";
            double intensity = 1e6;
            string outputPath = Path.Combine(TestContext.CurrentContext.TestDirectory, fileToWrite);

            Loaders.LoadElements(Path.Combine(TestContext.CurrentContext.TestDirectory, @"elements.dat"));

            // generate mzml file

            // one MS1 scan per multiplier; the dip at index 6-7 followed by a rise is the split point
            double[] intensityMultipliers = { 1, 3, 5, 10, 5, 3, 1, 1, 3, 1 };
            MsDataScan[] scans = new MsDataScan[10];

            for (int scanIndex = 0; scanIndex < scans.Length; scanIndex++)
            {
                int scanNumber = scanIndex + 1;

                ChemicalFormula formula = new Proteomics.AminoAcidPolymer.Peptide(peptide).GetChemicalFormula();
                IsotopicDistribution distribution = IsotopicDistribution.GetDistribution(formula, 0.125, 1e-8);

                double[] mz = distribution.Masses.Select(mass => mass.ToMz(1)).ToArray();
                double[] intensities = distribution.Intensities
                    .Select(abundance => abundance * intensity * intensityMultipliers[scanIndex])
                    .ToArray();

                var spectrum = new MzSpectrum(mz, intensities, false);

                // add the scan
                scans[scanIndex] = new MsDataScan(
                    massSpectrum: spectrum,
                    oneBasedScanNumber: scanNumber,
                    msnOrder: 1,
                    isCentroid: true,
                    polarity: Polarity.Positive,
                    retentionTime: 1.0 + scanIndex / 10.0,
                    scanWindowRange: new MzRange(400, 1600),
                    scanFilter: "f",
                    mzAnalyzer: MZAnalyzerType.Orbitrap,
                    totalIonCurrent: intensities.Sum(),
                    injectionTime: 1.0,
                    noiseData: null,
                    nativeId: "scan=" + scanNumber);
            }

            // write the .mzML
            IO.MzML.MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(new FakeMsDataFile(scans), outputPath, false);

            // set up spectra file info
            SpectraFileInfo file1 = new SpectraFileInfo(outputPath, "", 0, 0, 0);

            // create some PSMs
            var pg = new ProteinGroup("MyProtein", "gene", "org");
            double monoisotopicMass = new Proteomics.AminoAcidPolymer.Peptide(peptide).MonoisotopicMass;

            Identification id1 = new Identification(file1, peptide, peptide, monoisotopicMass, 1.3 + 0.001, 1, new List<ProteinGroup> { pg });

            // create the FlashLFQ engine
            FlashLFQEngine engine = new FlashLFQEngine(new List<Identification> { id1 });

            // run the engine
            var results = engine.Run();
            ChromatographicPeak quantifiedPeak = results.Peaks.First().Value.First();

            Assert.That(quantifiedPeak.Apex.RetentionTime == 1.3);
            Assert.That(quantifiedPeak.SplitRT == 1.6);
            Assert.That(!quantifiedPeak.IsotopicEnvelopes.Any(envelope => envelope.RetentionTime > 1.6));
            Assert.That(quantifiedPeak.IsotopicEnvelopes.Count == 6);
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Builds the same protein group twice, once without and once with an explicit
        /// <paramref name="groupToCompare"/>, and checks that both expose the expected baseline peptide
        /// sequences, in count and in content.
        /// </summary>
        public void ProteinGroup_getBaseLinePeptides_Pass(string proteinAccession, List<Peptide> peptides, List<string> groups, int reqNumUnmodPeptides, int reqNumModPeptides, int reqNumOfPepeptides,
                                                          Boolean useBaselinePeptides, int reqNumBaselinePeptides, double correlationCutOff, Boolean compareUnmod, int minNumStoichiometries, string groupToCompare, List<string> baselinePeptideSeq)
        {
            Extensions.IncludeSharedPeptides(peptides, true); //set isUnique

            var allGroups = new ProteinGroup(
                proteinAccession, peptides, groups, reqNumUnmodPeptides, reqNumModPeptides, reqNumOfPepeptides,
                useBaselinePeptides, reqNumBaselinePeptides, 3, correlationCutOff, compareUnmod, minNumStoichiometries);

            var singleGroup = new ProteinGroup(
                proteinAccession, peptides, groups, reqNumUnmodPeptides, reqNumModPeptides, reqNumOfPepeptides,
                useBaselinePeptides, reqNumBaselinePeptides, 3, correlationCutOff, compareUnmod, minNumStoichiometries, groupToCompare);

            int expectedCount = baselinePeptideSeq.Count;

            // Counts first, then the sequences themselves.
            Assert.AreEqual(expectedCount, allGroups.BaselinePeptides.Count());
            Assert.AreEqual(expectedCount, singleGroup.BaselinePeptides.Count());
            Assert.AreEqual(baselinePeptideSeq, allGroups.BaselinePeptides.Select(pep => pep.Sequence));
            Assert.AreEqual(baselinePeptideSeq, singleGroup.BaselinePeptides.Select(pep => pep.Sequence));
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Builds four protein groups (baseline flag true or false, each without and with an explicit
        /// <paramref name="groupToCompare"/>) and checks the <c>useProt</c> flag of each against the
        /// expected value for its mode.
        /// </summary>
        public void ProteinGroup_useProt_Pass(string proteinAccession, List<Peptide> peptides, List<string> groups, int reqNumUnmodPeptides, int reqNumModPeptides, int reqNumOfPepeptides,
                                              double correlationCutOff, Boolean compareUnmod, int minNumStoichiometries, string groupToCompare, bool useProtBaseline, bool useProtPeptidePeptide)
        {
            // Baseline mode (seventh argument true).
            var baselineAll = new ProteinGroup(
                proteinAccession, peptides, groups, reqNumUnmodPeptides, reqNumModPeptides, reqNumOfPepeptides,
                true, reqNumUnmodPeptides, 3, correlationCutOff, compareUnmod, minNumStoichiometries);
            var baselineSet = new ProteinGroup(
                proteinAccession, peptides, groups, reqNumUnmodPeptides, reqNumModPeptides, reqNumOfPepeptides,
                true, reqNumUnmodPeptides, 3, correlationCutOff, compareUnmod, minNumStoichiometries, groupToCompare);

            // Peptide-to-peptide mode (seventh argument false).
            var peptidePeptideAll = new ProteinGroup(
                proteinAccession, peptides, groups, reqNumUnmodPeptides, reqNumModPeptides, reqNumOfPepeptides,
                false, reqNumUnmodPeptides, 3, correlationCutOff, compareUnmod, minNumStoichiometries);
            var peptidePeptideSet = new ProteinGroup(
                proteinAccession, peptides, groups, reqNumUnmodPeptides, reqNumModPeptides, reqNumOfPepeptides,
                false, reqNumUnmodPeptides, 3, correlationCutOff, compareUnmod, minNumStoichiometries, groupToCompare);

            Assert.AreEqual(useProtBaseline, baselineAll.useProt);
            Assert.AreEqual(useProtBaseline, baselineSet.useProt);
            Assert.AreEqual(useProtPeptidePeptide, peptidePeptideAll.useProt);
            Assert.AreEqual(useProtPeptidePeptide, peptidePeptideSet.useProt);
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Adds protein "B" and then protein "A" to a protein group and checks that the group name lists
        /// the accessions as "A|B", while the first entry of the group's protein collection is still "B".
        /// </summary>
        public static void TestProteinGroupsAccessionOutputOrder()
        {
            var geneNames = new List<Tuple<string, string>>();

            // B is created and added before A on purpose
            var proteinB = new Protein("-----F----*", "B", null, geneNames, new Dictionary<int, List<Modification>>(), isDecoy: true);
            var proteinA = new Protein("-----F----**", "A", null, geneNames, new Dictionary<int, List<Modification>>(), isDecoy: true);

            var members = new HashSet<Protein>();
            members.Add(proteinB);
            members.Add(proteinA);

            // put both proteins into one protein group
            var group = new ProteinGroup(members, null, null);

            // the name is ordered A then B, not B then A
            Assert.That(group.ProteinGroupName.Equals("A|B"));
            Assert.That(group.Proteins.First().Accession.Equals("B"));
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Builds four protein groups with fixed settings (baseline flag true or false, each without and
        /// with an explicit <paramref name="groupToCompare"/>) and checks how many pairwise comparisons
        /// each one produced.
        /// </summary>
        public void ProteinGroup_calcComparison_Pass(string proteinAccession, List<Peptide> peptides, List<string> groups, string groupToCompare,
                                                     int numPairwiseCompairisonsBaselineAllGroup, int numPairwiseCompairisonsBaselineSetGroup, int numPairwiseCompairisonsPeptidePeptideAllGroup,
                                                     int numPairwiseCompairisonsPeptidePeptideSetGroup)
        {
            Extensions.IncludeSharedPeptides(peptides, true); //set isUnique

            // Baseline mode (seventh argument true).
            var baselineAll = new ProteinGroup(proteinAccession, peptides, groups, 3, 1, 4, true, 3, 3, 0.5, false, 3);
            var baselineSet = new ProteinGroup(proteinAccession, peptides, groups, 3, 1, 4, true, 3, 3, 0.5, false, 3, groupToCompare);

            // Peptide-to-peptide mode (seventh argument false); note the two calls use different sixth arguments.
            var peptidePeptideAll = new ProteinGroup(proteinAccession, peptides, groups, 1, 1, 4, false, 0, 3, 0.5, false, 3);
            var peptidePeptideSet = new ProteinGroup(proteinAccession, peptides, groups, 1, 1, 2, false, 0, 3, 0.5, false, 3, groupToCompare);

            Assert.AreEqual(numPairwiseCompairisonsBaselineAllGroup, baselineAll.ProteinPairwiseComparisons.Count());
            Assert.AreEqual(numPairwiseCompairisonsBaselineSetGroup, baselineSet.ProteinPairwiseComparisons.Count());
            Assert.AreEqual(numPairwiseCompairisonsPeptidePeptideAllGroup, peptidePeptideAll.ProteinPairwiseComparisons.Count());
            Assert.AreEqual(numPairwiseCompairisonsPeptidePeptideSetGroup, peptidePeptideSet.ProteinPairwiseComparisons.Count());
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Console demonstration of protein grouping: digests every protein read from the yeast FASTA
        /// resource, picks a seeded-random subset of the peptides as the "identified" set, groups the
        /// proteins with that subset, and prints counts, total elapsed time, and the process working set.
        /// </summary>
        /// <param name="protease">Protease handed to the digestion and to the protein grouping.</param>
        /// <param name="percentIdentified">Fraction of all digested peptides to treat as identified (0.05 = 5 %).</param>
        /// <param name="maxMissed">Forwarded to both the digestion and the protein grouping.</param>
        /// <param name="minLength">Forwarded to the digestion.</param>
        /// <param name="maxLength">Forwarded to the digestion.</param>
        public static void Start(IProtease protease, double percentIdentified = 0.05, int maxMissed = 3, int minLength = 5, int maxLength = 35)
        {
            Console.WriteLine("**Start Protein Grouping**");
            Stopwatch timer = Stopwatch.StartNew();

            var allPeptides = new List<Peptide>();
            var allProteins = new List<Protein>();

            using (var fasta = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
            {
                foreach (Protein entry in fasta.ReadNextProtein())
                {
                    allPeptides.AddRange(entry.Digest(protease, maxMissed, minLength, maxLength));
                    allProteins.Add(entry);
                }
            }

            Console.WriteLine("Loaded {0:N0} peptides from {1:N0} proteins in {2} ms", allPeptides.Count, allProteins.Count, timer.ElapsedMilliseconds);

            // Fixed seed to make it reproducible
            var shuffler = new Random(480912341);

            // Shuffle by a random sort key and keep the first x % as the identified peptides.
            int identifiedCount = (int)(allPeptides.Count * percentIdentified);
            List<Peptide> identifiedPeptides = allPeptides
                .OrderBy(p => shuffler.Next())
                .Take(identifiedCount)
                .ToList();

            List<ProteinGroup> groups = ProteinGroup
                .GroupProteins(allProteins, protease, identifiedPeptides, new AminoAcidLeucineSequenceComparer(), maxMissed)
                .ToList();

            // The timer is never restarted, so the elapsed time covers loading as well as grouping.
            timer.Stop();

            Console.WriteLine("{0:N0} proteins produced {1:N0} protein groups from {2:N0} identified sequences", allProteins.Count, groups.Count, identifiedPeptides.Count);
            Console.WriteLine("Time elapsed: {0}", timer.Elapsed);
            Console.WriteLine("Memory used: {0:N0} MB", System.Environment.WorkingSet / (1024 * 1024));
            Console.WriteLine("**END Protein Grouping**");
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Reads a tab-separated PSM file and turns every usable row into an <see cref="Identification"/>.
        /// </summary>
        /// <remarks>
        /// The first line is treated as the header and is passed to <c>GetFileTypeFromHeader</c>; all
        /// later lines are interpreted according to the detected file type. Rows are skipped when they
        /// are above the notch q-value cutoff, are decoys, have an ambiguous sequence, conflict with a
        /// previously stored monoisotopic mass for the same modified sequence, have a negative retention
        /// time, or belong to a spectra file that is not in <paramref name="rawfiles"/>.
        /// Calling this method replaces the static <c>_modSequenceToMonoMass</c> lookup.
        /// </remarks>
        /// <param name="filepath">Path of the PSM file to read.</param>
        /// <param name="silent">When true, no progress or warning text is written to the console.</param>
        /// <param name="rawfiles">Spectra files to match PSMs against, by file name without extension.</param>
        /// <returns>
        /// The identifications that were read. An empty list is returned when the file cannot be opened
        /// or when any line fails to parse.
        /// </returns>
        /// <exception cref="Exception">Thrown when the header does not identify a known PSM file type.</exception>
        public static List<Identification> ReadPsms(string filepath, bool silent, List<SpectraFileInfo> rawfiles)
        {
            Dictionary<string, ProteinGroup> allProteinGroups = new Dictionary<string, ProteinGroup>();

            _modSequenceToMonoMass = new Dictionary<string, double>();
            List<Identification> ids = new List<Identification>();
            PsmFileType fileType = PsmFileType.Unknown;

            string[] delim = new string[] { ";", ",", " or ", "\"", "|" };

            if (!silent)
            {
                Console.WriteLine("Opening PSM file " + filepath);
            }

            StreamReader reader;

            try
            {
                reader = new StreamReader(filepath);
            }
            catch (Exception e)
            {
                if (!silent)
                {
                    Console.WriteLine("Error reading file " + filepath + "\n" + e.Message);
                }

                return new List<Identification>();
            }

            // Dispose the reader on every exit path, including the early return on an unreadable line
            // and the exception thrown below for an unrecognized header.
            using (reader)
            {
                int lineNum = 0;
                string line;

                // ReadLine returns null only at end of file; unlike Peek() > 0 it does not stop early
                // on a NUL character.
                while ((line = reader.ReadLine()) != null)
                {
                    // Count the line up front so every skip path below keeps the number in step with the file.
                    lineNum++;

                    try
                    {
                        if (lineNum == 1)
                        {
                            fileType = GetFileTypeFromHeader(line);
                            continue;
                        }

                        // Nothing can be parsed without a recognized header; the check after the loop throws.
                        if (fileType == PsmFileType.Unknown)
                        {
                            break;
                        }

                        string[] param = line.Split('\t');

                        // only quantify PSMs below the q-value cutoff; reading STOPS at the first row above it
                        // NOTE(review): this presumably relies on the file being sorted by q-value - confirm
                        if (fileType == PsmFileType.MetaMorpheus && double.Parse(param[_qValueCol]) > 0.01)
                        {
                            break;
                        }
                        else if (fileType == PsmFileType.Morpheus && double.Parse(param[_qValueCol]) > 1.00)
                        {
                            break;
                        }

                        // only quantify PSMs below 1% notch FDR
                        if (fileType == PsmFileType.MetaMorpheus && double.Parse(param[_qValueNotchCol]) > 0.01)
                        {
                            continue;
                        }

                        // skip decoys
                        if ((fileType == PsmFileType.MetaMorpheus || fileType == PsmFileType.Morpheus) &&
                            param[_decoyCol].Contains("D"))
                        {
                            continue;
                        }

                        // spectrum file name
                        string fileName = param[_fileNameCol];

                        // base sequence
                        string baseSequence = param[_baseSequCol];

                        // skip ambiguous sequence in MetaMorpheus output
                        if (fileType == PsmFileType.MetaMorpheus && (baseSequence.Contains(" or ") || baseSequence.Contains("|")))
                        {
                            continue;
                        }

                        // modified sequence
                        string modSequence = param[_fullSequCol];
                        if (fileType == PsmFileType.TdPortal)
                        {
                            modSequence = baseSequence + modSequence;
                        }

                        // skip ambiguous sequence in MetaMorpheus output
                        if (fileType == PsmFileType.MetaMorpheus && (modSequence.Contains(" or ") || modSequence.Contains("|") || modSequence.Contains("too long")))
                        {
                            continue;
                        }

                        // monoisotopic mass
                        double monoisotopicMass = double.Parse(param[_monoMassCol]);

                        if (_modSequenceToMonoMass.TryGetValue(modSequence, out double storedMonoisotopicMass))
                        {
                            // the first mass seen for a modified sequence wins; conflicting rows are dropped
                            if (storedMonoisotopicMass != monoisotopicMass)
                            {
                                if (!silent)
                                {
                                    Console.WriteLine("Caution! PSM with sequence " + modSequence + " at line " +
                                                      lineNum + " could not be read; " +
                                                      "a peptide with the same modified sequence but a different monoisotopic mass has already been added");
                                }

                                continue;
                            }
                        }
                        else
                        {
                            _modSequenceToMonoMass.Add(modSequence, monoisotopicMass);
                        }

                        // retention time
                        double ms2RetentionTime = double.Parse(param[_msmsRetnCol]);
                        if (fileType == PsmFileType.PeptideShaker)
                        {
                            // peptide shaker RT is in seconds - convert to minutes
                            ms2RetentionTime = ms2RetentionTime / 60.0;
                        }

                        if (ms2RetentionTime < 0)
                        {
                            if (!silent)
                            {
                                Console.WriteLine("Caution! PSM with sequence " + modSequence + " at line " +
                                                  lineNum + " could not be read; retention time was negative");
                            }

                            continue;
                        }

                        // charge state
                        int chargeState;
                        if (fileType == PsmFileType.TdPortal)
                        {
                            chargeState = 1;
                        }
                        else if (fileType == PsmFileType.PeptideShaker)
                        {
                            // keep only the digits of the charge column before parsing
                            string charge = new String(param[_chargeStCol].Where(Char.IsDigit).ToArray());
                            chargeState = int.Parse(charge);
                        }
                        else
                        {
                            chargeState = (int)double.Parse(param[_chargeStCol]);
                        }

                        // protein groups
                        List<string> proteinGroupStrings = new List<string>();
                        if (fileType == PsmFileType.MetaMorpheus || fileType == PsmFileType.PeptideShaker)
                        {
                            // MetaMorpheus and Peptide Shaker - use all proteins listed
                            foreach (string proteinName in param[_protNameCol].Split(delim, StringSplitOptions.RemoveEmptyEntries))
                            {
                                proteinGroupStrings.Add(proteinName.Trim());
                            }
                        }
                        else if (fileType == PsmFileType.Morpheus)
                        {
                            // Morpheus - only one protein listed, use it
                            proteinGroupStrings.Add(param[_protNameCol].Trim());
                        }
                        else if (fileType == PsmFileType.MaxQuant)
                        {
                            // MaxQuant - use the first protein listed
                            string[] proteinNames = param[_protNameCol].Split(delim, StringSplitOptions.RemoveEmptyEntries);
                            if (proteinNames.Length > 0)
                            {
                                proteinGroupStrings.Add(proteinNames[0].Trim());
                            }
                        }
                        else if (fileType == PsmFileType.TdPortal)
                        {
                            // TDPortal - use base sequence as protein group
                            proteinGroupStrings.Add(baseSequence);
                        }
                        else
                        {
                            proteinGroupStrings.Add(param[_protNameCol]);
                        }

                        // reuse one ProteinGroup instance per name across all PSMs in this file
                        List<ProteinGroup> proteinGroups = new List<ProteinGroup>();
                        foreach (string proteinGroupName in proteinGroupStrings)
                        {
                            if (!allProteinGroups.TryGetValue(proteinGroupName, out ProteinGroup proteinGroup))
                            {
                                proteinGroup = new ProteinGroup(proteinGroupName, "", "");
                                allProteinGroups.Add(proteinGroupName, proteinGroup);
                            }

                            proteinGroups.Add(proteinGroup);
                        }

                        // construct id
                        var fileNameNoExt = Path.GetFileNameWithoutExtension(fileName);
                        var rawFileInfoToUse = rawfiles.FirstOrDefault(p => p.FilenameWithoutExtension.Equals(fileNameNoExt));
                        if (rawFileInfoToUse == null)
                        {
                            // skip PSMs for files with no spectrum data input
                            continue;
                        }

                        var ident = new Identification(rawFileInfoToUse, baseSequence, modSequence, monoisotopicMass, ms2RetentionTime, chargeState, proteinGroups);
                        ids.Add(ident);
                    }
                    catch (Exception)
                    {
                        // Any parsing problem abandons the whole file and yields an empty result.
                        if (!silent)
                        {
                            Console.WriteLine("Problem reading line " + lineNum + " of the identification file");
                        }

                        return new List<Identification>();
                    }
                }
            }

            if (fileType == PsmFileType.Unknown)
            {
                throw new Exception("Could not interpret PSM header labels from file: " + filepath);
            }

            if (!silent)
            {
                Console.WriteLine("Done reading PSMs; found " + ids.Count);
            }

            return ids;
        }
Ejemplo n.º 13
0
        /// <summary>
        /// Runs FlashLFQ twice over the same two spectra file paths (the SpectraFileInfo objects of the
        /// first run are created with "a" as their second constructor argument, those of the second run
        /// with "b"), merges the second result set into the first, and checks the merged counts.
        /// </summary>
        public static void TestFlashLfqMergeResults()
        {
            const string peptide = "EGFQVADGPLYR";
            const double peptideMass = 1350.65681;

            string testDirectory = TestContext.CurrentContext.TestDirectory;
            string rawPath = Path.Combine(testDirectory, @"sliced-raw.raw");
            string mzmlPath = Path.Combine(testDirectory, @"sliced-mzml.mzml");

            // Builds one charge-2 PSM; each PSM receives its own single-element protein group list.
            Identification BuildPsm(SpectraFileInfo file, double retentionTime, ProteinGroup owner)
            {
                return new Identification(file, peptide, peptide, peptideMass, retentionTime, 2, new List<ProteinGroup> { owner });
            }

            // ---- first run ----
            var firstRaw = new SpectraFileInfo(rawPath, "a", 0, 0, 0);
            var firstMzml = new SpectraFileInfo(mzmlPath, "a", 0, 1, 0);
            var firstProteinGroup = new ProteinGroup("MyProtein", "gene", "org");

            var firstPsms = new List<Identification>
            {
                BuildPsm(firstRaw, 94.12193, firstProteinGroup),
                BuildPsm(firstRaw, 94.05811, firstProteinGroup),
                BuildPsm(firstMzml, 94.12193, firstProteinGroup),
                BuildPsm(firstMzml, 94.05811, firstProteinGroup)
            };

            var firstEngine = new FlashLFQEngine(firstPsms);
            var mergedResults = firstEngine.Run();

            // ---- second run ----
            var secondRaw = new SpectraFileInfo(rawPath, "b", 0, 0, 0);
            var secondMzml = new SpectraFileInfo(mzmlPath, "b", 0, 1, 0);
            var secondProteinGroup = new ProteinGroup("MyProtein", "gene", "org");

            var secondPsms = new List<Identification>
            {
                BuildPsm(secondRaw, 94.12193, secondProteinGroup),
                BuildPsm(secondRaw, 94.05811, secondProteinGroup),
                BuildPsm(secondMzml, 94.12193, secondProteinGroup),
                BuildPsm(secondMzml, 94.05811, secondProteinGroup)
            };

            var secondEngine = new FlashLFQEngine(secondPsms);
            var secondResults = secondEngine.Run();

            // ---- merge and verify ----
            mergedResults.MergeResultsWith(secondResults);

            Assert.AreEqual(4, mergedResults.Peaks.Count);
            Assert.AreEqual(1, mergedResults.PeptideBaseSequences.Count);
            Assert.AreEqual(1, mergedResults.PeptideModifiedSequences.Count);
            Assert.AreEqual(1, mergedResults.ProteinGroups.Count);
            Assert.AreEqual(4, mergedResults.SpectraFiles.Count);
        }
Ejemplo n.º 14
0
        /// <summary>
        /// Writes two synthetic .mzML files containing one MS1 scan for each of three peptides, runs
        /// FlashLFQ with advancedProteinQuant enabled and normalization disabled, and checks that the
        /// protein intensity reported for each file falls inside the expected window.
        /// </summary>
        public static void TestFlashLfqAdvancedProteinQuant()
        {
            var fileStems = new List<string> { "mzml_1", "mzml_2" };
            var sequences = new List<string> { "PEPTIDE", "MYPEPTIDE", "VVVVVPEPTIDE" };

            // amounts[file, peptide]: intensity scale applied to each peptide's isotopic envelope
            var amounts = new double[2, 3]
            {
                { 1000000, 1000000, 1000000 },
                { 2000000, 2000000, 900000 }
            };

            string testDirectory = TestContext.CurrentContext.TestDirectory;
            Loaders.LoadElements(Path.Combine(testDirectory, @"elements.dat"));

            // generate the mzML files, one MS1 scan per peptide
            for (int fileIndex = 0; fileIndex < fileStems.Count; fileIndex++)
            {
                var scans = new MsDataScan[sequences.Count];

                for (int pepIndex = 0; pepIndex < sequences.Count; pepIndex++)
                {
                    double scale = amounts[fileIndex, pepIndex];
                    int scanNumber = pepIndex + 1;

                    ChemicalFormula formula = new Proteomics.AminoAcidPolymer.Peptide(sequences[pepIndex]).GetChemicalFormula();
                    IsotopicDistribution envelope = IsotopicDistribution.GetDistribution(formula, 0.125, 1e-8);
                    double[] mzValues = envelope.Masses.Select(m => m.ToMz(1)).ToArray();
                    double[] scaledIntensities = envelope.Intensities.Select(i => i * scale).ToArray();

                    scans[pepIndex] = new MsDataScan(
                        massSpectrum: new MzSpectrum(mzValues, scaledIntensities, false),
                        oneBasedScanNumber: scanNumber,
                        msnOrder: 1,
                        isCentroid: true,
                        polarity: Polarity.Positive,
                        retentionTime: 1.0 + (pepIndex / 10.0),
                        scanWindowRange: new MzRange(400, 1600),
                        scanFilter: "f",
                        mzAnalyzer: MZAnalyzerType.Orbitrap,
                        totalIonCurrent: scaledIntensities.Sum(),
                        injectionTime: 1.0,
                        noiseData: null,
                        nativeId: "scan=" + scanNumber);
                }

                string outputPath = Path.Combine(testDirectory, fileStems[fileIndex] + ".mzML");
                IO.MzML.MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(new FakeMsDataFile(scans), outputPath, false);
            }

            // spectra file descriptors for the two files just written
            var file1 = new SpectraFileInfo(Path.Combine(testDirectory, fileStems[0] + ".mzML"), "a", 0, 0, 0);
            var file2 = new SpectraFileInfo(Path.Combine(testDirectory, fileStems[1] + ".mzML"), "a", 1, 0, 0);

            // one charge-1 PSM per peptide per file, all assigned to the same protein group
            var proteinGroup = new ProteinGroup("MyProtein", "gene", "org");
            double[] monoMasses = { 799.35996, 1093.46377, 1294.70203 };
            double[] psmRetentionTimes = { 1.01, 1.11, 1.21 };

            var psms = new List<Identification>();
            foreach (SpectraFileInfo file in new[] { file1, file2 })
            {
                for (int pepIndex = 0; pepIndex < sequences.Count; pepIndex++)
                {
                    psms.Add(new Identification(file, sequences[pepIndex], sequences[pepIndex], monoMasses[pepIndex],
                                                psmRetentionTimes[pepIndex], 1, new List<ProteinGroup> { proteinGroup }));
                }
            }

            // create and run the FlashLFQ engine
            var engine = new FlashLFQEngine(psms, normalize: false, advancedProteinQuant: true);
            var results = engine.Run();

            // third peptide should be low-weighted
            // protein should be ~sum of first two peptide intensities (a little lower, because some smaller isotope peaks get skipped)
            double intensityInFile1 = results.ProteinGroups["MyProtein"].GetIntensity(file1);

            Assert.That(intensityInFile1 < 2e6);
            Assert.That(intensityInFile1 > 1e6);

            double intensityInFile2 = results.ProteinGroups["MyProtein"].GetIntensity(file2);

            Assert.That(intensityInFile2 < 4e6);
            Assert.That(intensityInFile2 > 3e6);
        }
Ejemplo n.º 15
0
        /// <summary>
        /// Reads a tab-delimited PSM file and turns every usable row into an <see cref="Identification"/>.
        /// </summary>
        /// <remarks>
        /// The first line is handed to GetFileTypeFromHeader; every later line is split on tabs and read
        /// through the _xxxCol column-index fields (presumably populated by GetFileTypeFromHeader - confirm).
        /// Reading stops at the first MetaMorpheus row whose q-value exceeds 0.01, or the first Morpheus row
        /// whose q-value exceeds 1.00. A row is skipped when it is a decoy, fails the MetaMorpheus notch
        /// q-value cut, has an ambiguous MetaMorpheus sequence, has an uninterpretable mass, retention time
        /// or charge, conflicts with a previously stored mass for the same modified sequence, or refers to
        /// a spectra file that is not in <paramref name="rawfiles"/>.
        /// The static caches _modSequenceToMonoMass and allProteinGroups are created on first use and are
        /// updated while rows are read.
        /// All numeric columns are parsed with the invariant culture so the result does not depend on the
        /// machine's regional settings.
        /// </remarks>
        /// <param name="filepath">Path of the PSM file to read.</param>
        /// <param name="silent">When true, nothing is written to the console.</param>
        /// <param name="rawfiles">
        /// Spectra files to match PSMs against, keyed by FilenameWithoutExtension (duplicate keys make
        /// ToDictionary throw before any reading starts).
        /// </param>
        /// <returns>
        /// The identifications that were read. An empty list is returned when the file cannot be opened or
        /// when processing any line raises an exception (including an unrecognized header).
        /// </returns>
        public static List<Identification> ReadPsms(string filepath, bool silent, List<SpectraFileInfo> rawfiles)
        {
            if (_modSequenceToMonoMass == null)
            {
                _modSequenceToMonoMass = new Dictionary<string, double>();
            }

            if (allProteinGroups == null)
            {
                allProteinGroups = new Dictionary<string, ProteinGroup>();
            }

            // Console output is suppressed entirely in silent mode.
            void Report(string message)
            {
                if (!silent)
                {
                    Console.WriteLine(message);
                }
            }

            // double.TryParse(string, out double) uses these styles; they are kept, only the culture is pinned.
            const NumberStyles FloatStyles = NumberStyles.Float | NumberStyles.AllowThousands;

            var rawFileDictionary = rawfiles.ToDictionary(p => p.FilenameWithoutExtension, v => v);
            List<Identification> ids = new List<Identification>();
            PsmFileType fileType = PsmFileType.Unknown;

            Report("Opening PSM file " + filepath);

            StreamReader reader;

            try
            {
                reader = new StreamReader(filepath);
            }
            catch (Exception e)
            {
                Report("Error reading file " + filepath + "\n" + e.Message);

                return new List<Identification>();
            }

            // 'using' guarantees the file handle is released on every exit path, including the
            // early return taken when a line cannot be processed.
            using (reader)
            {
                int lineNum = 0;
                string line;

                while ((line = reader.ReadLine()) != null)
                {
                    lineNum++;

                    try
                    {
                        if (lineNum == 1)
                        {
                            fileType = GetFileTypeFromHeader(line);

                            if (fileType == PsmFileType.Unknown)
                            {
                                // NOTE(review): this is thrown inside the try block, so the catch-all below
                                // converts it into an empty result rather than letting it reach the caller.
                                throw new Exception("Could not interpret PSM header labels from file: " + filepath);
                            }

                            continue;
                        }

                        var param = line.Split('\t');

                        // only quantify PSMs below 1% FDR; reading stops at the first row above the cutoff
                        // (the Morpheus cutoff is 1.00 - presumably its q-values are percentages; confirm)
                        if (fileType == PsmFileType.MetaMorpheus && double.Parse(param[_qValueCol], CultureInfo.InvariantCulture) > 0.01)
                        {
                            break;
                        }
                        else if (fileType == PsmFileType.Morpheus && double.Parse(param[_qValueCol], CultureInfo.InvariantCulture) > 1.00)
                        {
                            break;
                        }

                        // only quantify PSMs below 1% notch FDR
                        if (fileType == PsmFileType.MetaMorpheus && double.Parse(param[_qValueNotchCol], CultureInfo.InvariantCulture) > 0.01)
                        {
                            continue;
                        }

                        // skip decoys
                        if ((fileType == PsmFileType.MetaMorpheus || fileType == PsmFileType.Morpheus) &&
                            param[_decoyCol].Contains("D"))
                        {
                            continue;
                        }

                        // spectrum file name, base sequence, modified sequence
                        string fileName = param[_fileNameCol];
                        string baseSequence = param[_baseSequCol];
                        string modSequence = param[_fullSequCol];

                        // skip ambiguous sequence in MetaMorpheus output
                        if (fileType == PsmFileType.MetaMorpheus && (modSequence.Contains(" or ") || modSequence.Contains("|") || modSequence.ToLowerInvariant().Contains("too long")))
                        {
                            continue;
                        }

                        // monoisotopic mass
                        if (!double.TryParse(param[_monoMassCol], NumberStyles.Number, CultureInfo.InvariantCulture, out double monoisotopicMass))
                        {
                            Report("PSM with sequence " + modSequence + " at line " +
                                   lineNum + " could not be read; " +
                                   "monoisotopic mass was not interpretable: \"" + param[_monoMassCol] + "\"");

                            continue;
                        }

                        if (_modSequenceToMonoMass.TryGetValue(modSequence, out double storedMonoisotopicMass))
                        {
                            // a modified sequence must always map to the same mass
                            if (storedMonoisotopicMass != monoisotopicMass)
                            {
                                Report("Caution! PSM with sequence " + modSequence + " at line " +
                                       lineNum + " could not be read; " +
                                       "a peptide with the same modified sequence but a different monoisotopic mass has already been added");

                                continue;
                            }
                        }
                        else
                        {
                            _modSequenceToMonoMass.Add(modSequence, monoisotopicMass);
                        }

                        // retention time (invariant culture, consistent with the mass and q-value columns)
                        if (!double.TryParse(param[_msmsRetnCol], FloatStyles, CultureInfo.InvariantCulture, out double ms2RetentionTime))
                        {
                            Report("PSM with sequence " + modSequence + " at line " +
                                   lineNum + " could not be read; " +
                                   "retention time was not interpretable: \"" + param[_msmsRetnCol] + "\"");

                            continue;
                        }

                        if (fileType == PsmFileType.PeptideShaker)
                        {
                            // peptide shaker RT is in seconds - convert to minutes
                            ms2RetentionTime = ms2RetentionTime / 60.0;
                        }

                        if (ms2RetentionTime < 0)
                        {
                            Report("Caution! PSM with sequence " + modSequence + " at line " +
                                   lineNum + " could not be read; retention time was negative");

                            continue;
                        }

                        // charge state
                        int chargeState;
                        if (fileType == PsmFileType.PeptideShaker)
                        {
                            // keep only the digits of the charge column before parsing
                            string chargeStringNumbersOnly = new String(param[_chargeStCol].Where(Char.IsDigit).ToArray());

                            if (string.IsNullOrWhiteSpace(chargeStringNumbersOnly) ||
                                !int.TryParse(chargeStringNumbersOnly, NumberStyles.Integer, CultureInfo.InvariantCulture, out chargeState))
                            {
                                Report("PSM with sequence " + modSequence + " at line " +
                                       lineNum + " could not be read; " +
                                       "charge state was not interpretable: \"" + param[_chargeStCol] + "\"");

                                continue;
                            }
                        }
                        else
                        {
                            if (!double.TryParse(param[_chargeStCol], FloatStyles, CultureInfo.InvariantCulture, out double chargeStateDouble))
                            {
                                Report("PSM with sequence " + modSequence + " at line " +
                                       lineNum + " could not be read; " +
                                       "charge state was not interpretable: \"" + param[_chargeStCol] + "\"");

                                continue;
                            }

                            // the column may be written as a floating-point value; truncate to int
                            chargeState = (int)chargeStateDouble;
                        }

                        // protein groups
                        // use all proteins listed
                        List<ProteinGroup> proteinGroups = new List<ProteinGroup>();
                        var proteins = param[_protNameCol].Split(delimiters[fileType], StringSplitOptions.None);

                        // a negative column index means the file has no such column
                        string[] genes = null;
                        if (_geneNameCol >= 0)
                        {
                            genes = param[_geneNameCol].Split(delimiters[fileType], StringSplitOptions.None);
                        }

                        string[] organisms = null;
                        if (_organismCol >= 0)
                        {
                            organisms = param[_organismCol].Split(delimiters[fileType], StringSplitOptions.None);
                        }

                        for (int pr = 0; pr < proteins.Length; pr++)
                        {
                            string proteinName = proteins[pr];
                            string gene = "";
                            string organism = "";

                            // one gene for all proteins, one gene per protein, or (single protein) the raw column text
                            if (genes != null)
                            {
                                if (genes.Length == 1)
                                {
                                    gene = genes[0];
                                }
                                else if (genes.Length == proteins.Length)
                                {
                                    gene = genes[pr];
                                }
                                else if (proteins.Length == 1)
                                {
                                    gene = param[_geneNameCol];
                                }
                            }

                            // same selection rule for the organism column
                            if (organisms != null)
                            {
                                if (organisms.Length == 1)
                                {
                                    organism = organisms[0];
                                }
                                else if (organisms.Length == proteins.Length)
                                {
                                    organism = organisms[pr];
                                }
                                else if (proteins.Length == 1)
                                {
                                    organism = param[_organismCol];
                                }
                            }

                            // reuse the cached protein group for this name, or create and cache a new one
                            if (allProteinGroups.TryGetValue(proteinName, out ProteinGroup pg))
                            {
                                proteinGroups.Add(pg);
                            }
                            else
                            {
                                ProteinGroup newPg = new ProteinGroup(proteinName, gene, organism);
                                allProteinGroups.Add(proteinName, newPg);
                                proteinGroups.Add(newPg);
                            }
                        }

                        // get file name and look up file name object
                        var fileNameNoExt = Path.GetFileNameWithoutExtension(fileName);

                        if (!rawFileDictionary.TryGetValue(fileNameNoExt, out SpectraFileInfo spectraFileInfoToUse))
                        {
                            // skip PSMs for files with no spectrum data input
                            continue;
                        }

                        // construct id
                        var ident = new Identification(spectraFileInfoToUse, baseSequence, modSequence, monoisotopicMass, ms2RetentionTime, chargeState, proteinGroups);
                        ids.Add(ident);
                    }
                    catch (Exception)
                    {
                        // any failure on a line abandons the whole file and discards what was read so far
                        Report("Problem reading line " + lineNum + " of the identification file");

                        return new List<Identification>();
                    }
                }
            }

            Report("Done reading PSMs; found " + ids.Count);

            return ids;
        }
Ejemplo n.º 16
0
        /// <summary>
        /// Builds protein groups from the supplied proteins in four passes: (1) proteins whose peptide
        /// sets are equal (indistinguishable) are merged into one group, (2) groups whose peptides are
        /// all shared (subsumable) are removed, (3) groups with fewer than MinPeptidesPerGroup peptides
        /// are removed, and (4) the surviving groups are sorted and those passing false discovery
        /// filtering are flagged.
        /// </summary>
        /// <param name="proteins">
        /// A list of unique proteins to group together. Proteins that are merged into another protein's
        /// group are removed from this list.
        /// </param>
        /// <param name="printMessages">When true, a message is written through Log after each pass.</param>
        /// <returns>
        /// The sorted protein groups left after passes 1-3; groups yielded by the FDR filter have
        /// PassesFDR set to true.
        /// </returns>
        private List<ProteinGroup> GroupProteins(List<Protein> proteins, bool printMessages = true)
        {
            if (printMessages)
            {
                Log("Grouping proteins into protein groups...");
            }

            // Holds the distinct protein groups by the time this method returns.
            List<ProteinGroup> groups = new List<ProteinGroup>();

            if (printMessages)
            {
                Log("{0:N0} original proteins (maximum proteins identified)", proteins.Count);
            }

            // ---- 1) Indistinguishable proteins ----
            // Proteins with the same peptide set are merged into one group; every other protein
            // becomes a group of its own.
            //   A 1 2 3 4
            //   B 1 2 3 4
            //   C 1   3 4
            // A and B share the peptide set {1,2,3,4} and become PG1 [A,B]; C becomes PG2 [C].
            for (int seedIndex = 0; seedIndex < proteins.Count; seedIndex++)
            {
                Protein seed = proteins[seedIndex];
                HashSet<Peptide> seedPeptides = seed.Peptides;

                // The seed protein starts a new group with its peptides.
                ProteinGroup currentGroup = new ProteinGroup(seed, seedPeptides);

                // Compare against every later protein. The index only advances when nothing was
                // removed, because RemoveAt shifts the next candidate into the current slot.
                int candidateIndex = seedIndex + 1;
                while (candidateIndex < proteins.Count)
                {
                    Protein candidate = proteins[candidateIndex];

                    if (!candidate.Peptides.SetEquals(seedPeptides))
                    {
                        candidateIndex++;
                        continue;
                    }

                    // Indistinguishable from the seed: absorb it and drop it from the list so it
                    // does not start a group of its own later.
                    currentGroup.Add(candidate);
                    proteins.RemoveAt(candidateIndex);
                }

                groups.Add(currentGroup);
            }

            if (printMessages)
            {
                Log("{0:N0} protein groups are left after combining indistinguishable proteins (having the exact same set of peptides)", groups.Count);
            }

            // ---- 2) Subsumable groups ----
            // Groups are visited worst score first (lower p-values mean better scores), so the worst
            // group is removed first.
            //   0.1  A 1 2
            //   0.05 B 1   3
            //   0.01 C   2 3
            // A is removed because B and C explain all of its peptides; B and C remain and share
            // peptide 3, while peptides 1 and 2 become unshared.
            // NOTE(review): IsShared presumably relies on Peptide.ProteinGroups having been populated
            // before this point - confirm against the caller.
            groups.ForEach(g => g.UpdatePValue(PScoreCalculationMethod, UseConservativePScore));

            // Sort the groups on decreasing p-values.
            groups.Sort(ProteinGroup.CompareDecreasing);

            for (int i = 0; i < groups.Count;)
            {
                ProteinGroup candidate = groups[i];
                HashSet<Peptide> candidatePeptides = candidate.Peptides;

                // A group holding at least one unshared peptide is kept.
                if (candidatePeptides.Any(pep => !pep.IsShared))
                {
                    i++;
                    continue;
                }

                // Every peptide is shared: detach the group from its peptides, then drop it.
                foreach (Peptide pep in candidatePeptides)
                {
                    pep.ProteinGroups.Remove(candidate);
                }

                groups.RemoveAt(i);
            }

            if (printMessages)
            {
                Log("{0:N0} protein groups are left after removing subsumable groups (peptides can be explain by other groups)", groups.Count);
            }

            // ---- 3) Minimum group size ----
            // Nothing to filter when the minimum is one or less.
            if (MinPeptidesPerGroup > 1)
            {
                for (int i = 0; i < groups.Count;)
                {
                    ProteinGroup candidate = groups[i];

                    // Groups with enough peptides are kept.
                    if (candidate.Peptides.Count >= MinPeptidesPerGroup)
                    {
                        i++;
                        continue;
                    }

                    // Too few peptides: detach the group from its peptides, then drop it.
                    foreach (Peptide pep in candidate.Peptides)
                    {
                        pep.ProteinGroups.Remove(candidate);
                    }

                    groups.RemoveAt(i);
                }

                if (printMessages)
                {
                    Log("{0:N0} protein groups are left after removing groups with < {1:N0} peptides [parsimonious proteins]", groups.Count, MinPeptidesPerGroup);
                }
            }

            // ---- 4) Protein-level false discovery filtering ----
            groups.Sort();

            // Flag each group yielded by the filter and count them for the log message.
            int passingCount = 0;
            foreach (ProteinGroup passing in FalseDiscoveryRate<ProteinGroup, double>.Filter(groups, MaxFdr / 100, true))
            {
                passing.PassesFDR = true;
                passingCount++;
            }

            if (printMessages)
            {
                Log("{0:N0} protein groups are left after applying FDR of {1:N2}% [parsimonious proteins filtered]", passingCount, MaxFdr);
            }

            return groups;
        }
Ejemplo n.º 17
0
        /// <summary>
        /// Checks protein-level quantification when match-between-runs (MBR) is enabled.
        /// Two synthetic mzML files are written, each holding one MS1 scan per peptide. Every peptide is
        /// identified in file 1; in file 2 the identification for PEPTIDEVV is deliberately left out, so
        /// that peptide can only be picked up there via MBR. PEPTIDEVV is the only peptide assigned to
        /// the "MyMbrProtein" group, and the test asserts that this group has a positive intensity in
        /// file 1 and an intensity of exactly zero in file 2, with both the default and the
        /// <c>advancedProteinQuant</c> engine settings.
        /// </summary>
        public static void TestFlashLfqMatchBetweenRunsProteinQuant()
        {
            List<string> filesToWrite = new List<string> { "mzml_1", "mzml_2" };
            List<string> pepSequences = new List<string> { "PEPTIDE", "PEPTIDEV", "PEPTIDEVV", "PEPTIDEVVV", "PEPTIDEVVVV" };
            double intensity = 1e6;

            // Retention time of each peptide's scan, per file (indexed like pepSequences)
            double[] file1Rt = new double[] { 1.01, 1.02, 1.03, 1.04, 1.05 };
            double[] file2Rt = new double[] { 1.015, 1.030, 1.036, 1.050, 1.065 };

            Loaders.LoadElements(Path.Combine(TestContext.CurrentContext.TestDirectory, @"elements.dat"));

            // Resolve every output path once so the mzML writer and the SpectraFileInfo objects
            // below are guaranteed to point at the same files.
            List<string> mzmlPaths = filesToWrite
                .Select(name => Path.Combine(TestContext.CurrentContext.TestDirectory, name + ".mzML"))
                .ToList();

            // generate mzml files (one MS1 scan per peptide)
            for (int f = 0; f < filesToWrite.Count; f++)
            {
                double[] retentionTimes = f == 0 ? file1Rt : file2Rt;

                // Sized from the peptide list (rather than a literal) so both stay in sync
                MsDataScan[] scans = new MsDataScan[pepSequences.Count];

                for (int p = 0; p < pepSequences.Count; p++)
                {
                    // Build the scan's peaks from the peptide's isotopic distribution at charge 1
                    ChemicalFormula cf = new Proteomics.AminoAcidPolymer.Peptide(pepSequences[p]).GetChemicalFormula();
                    IsotopicDistribution dist = IsotopicDistribution.GetDistribution(cf, 0.125, 1e-8);
                    double[] mz = dist.Masses.Select(v => v.ToMz(1)).ToArray();
                    double[] intensities = dist.Intensities.Select(v => v * intensity).ToArray();

                    // add the scan
                    scans[p] = new MsDataScan(massSpectrum: new MzSpectrum(mz, intensities, false), oneBasedScanNumber: p + 1, msnOrder: 1, isCentroid: true,
                                              polarity: Polarity.Positive, retentionTime: retentionTimes[p], scanWindowRange: new MzRange(400, 1600), scanFilter: "f",
                                              mzAnalyzer: MZAnalyzerType.Orbitrap, totalIonCurrent: intensities.Sum(), injectionTime: 1.0, noiseData: null, nativeId: "scan=" + (p + 1));
                }

                // write the .mzML
                IO.MzML.MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(new FakeMsDataFile(scans), mzmlPaths[f], false);
            }

            // set up spectra file info
            SpectraFileInfo file1 = new SpectraFileInfo(mzmlPaths[0], "a", 0, 0, 0);
            SpectraFileInfo file2 = new SpectraFileInfo(mzmlPaths[1], "a", 1, 0, 0);

            // create some PSMs; each ID's retention time sits 0.001 after the matching scan
            var pg = new ProteinGroup("MyProtein", "gene", "org");
            var myMbrProteinGroup = new ProteinGroup("MyMbrProtein", "MbrGene", "org");

            Identification id1 = new Identification(file1, "PEPTIDE", "PEPTIDE",
                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDE").MonoisotopicMass, file1Rt[0] + 0.001, 1,
                new List<ProteinGroup> { pg });
            Identification id2 = new Identification(file1, "PEPTIDEV", "PEPTIDEV",
                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEV").MonoisotopicMass, file1Rt[1] + 0.001, 1,
                new List<ProteinGroup> { pg });
            // the only peptide that belongs to the MBR protein group
            Identification id3 = new Identification(file1, "PEPTIDEVV", "PEPTIDEVV",
                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVV").MonoisotopicMass, file1Rt[2] + 0.001, 1,
                new List<ProteinGroup> { myMbrProteinGroup });
            Identification id4 = new Identification(file1, "PEPTIDEVVV", "PEPTIDEVVV",
                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file1Rt[3] + 0.001, 1,
                new List<ProteinGroup> { pg });
            Identification id5 = new Identification(file1, "PEPTIDEVVVV", "PEPTIDEVVVV",
                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVV").MonoisotopicMass, file1Rt[4] + 0.001, 1,
                new List<ProteinGroup> { pg });

            Identification id6 = new Identification(file2, "PEPTIDE", "PEPTIDE",
                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDE").MonoisotopicMass, file2Rt[0] + 0.001, 1,
                new List<ProteinGroup> { pg });
            Identification id7 = new Identification(file2, "PEPTIDEV", "PEPTIDEV",
                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEV").MonoisotopicMass, file2Rt[1] + 0.001, 1,
                new List<ProteinGroup> { pg });
            // missing ID 8 - MBR feature (PEPTIDEVV is intentionally not identified in file 2)
            Identification id9 = new Identification(file2, "PEPTIDEVVV", "PEPTIDEVVV",
                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file2Rt[3] + 0.001, 1,
                new List<ProteinGroup> { pg });
            Identification id10 = new Identification(file2, "PEPTIDEVVVV", "PEPTIDEVVVV",
                new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVV").MonoisotopicMass, file2Rt[4] + 0.001, 1,
                new List<ProteinGroup> { pg });

            // test with top3 protein quant engine
            // (a fresh list is built for each engine in case the engine modifies the list it is given)
            FlashLFQEngine engine = new FlashLFQEngine(
                new List<Identification> { id1, id2, id3, id4, id5, id6, id7, id9, id10 },
                matchBetweenRuns: true);
            var results = engine.Run();

            Assert.That(results.ProteinGroups["MyMbrProtein"].GetIntensity(file1) > 0,
                "MyMbrProtein should have a positive intensity in file 1 (default protein quant)");
            Assert.That(results.ProteinGroups["MyMbrProtein"].GetIntensity(file2) == 0,
                "MyMbrProtein should have zero intensity in file 2 (default protein quant)");

            // test with advanced protein quant engine
            engine = new FlashLFQEngine(
                new List<Identification> { id1, id2, id3, id4, id5, id6, id7, id9, id10 },
                matchBetweenRuns: true, advancedProteinQuant: true);
            results = engine.Run();

            Assert.That(results.ProteinGroups["MyMbrProtein"].GetIntensity(file1) > 0,
                "MyMbrProtein should have a positive intensity in file 1 (advanced protein quant)");
            Assert.That(results.ProteinGroups["MyMbrProtein"].GetIntensity(file2) == 0,
                "MyMbrProtein should have zero intensity in file 2 (advanced protein quant)");
        }
Ejemplo n.º 18
0
        /// <summary>
        /// Runs FlashLFQ with <c>normalize: true</c> on the same peptide identified in a .raw file and
        /// in an .mzml file, under four different experimental-design layouts, and asserts that the
        /// resulting intensities are positive and equal between the two files.
        /// The layouts differ only in the trailing SpectraFileInfo arguments; judging by the scenario
        /// names these are condition, biorep, techrep and fraction, in that order (NOTE(review):
        /// confirm against the SpectraFileInfo constructor).
        /// </summary>
        public static void TestFlashLfqNormalization()
        {
            // The single peptide used for every identification in this test
            const string sequence = "EGFQVADGPLYR";
            const double monoisotopicMass = 1350.65681;
            const double retentionTime = 94.12193;
            const int charge = 2;

            string rawPath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"sliced-raw.raw");
            string mzmlPath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"sliced-mzml.mzml");

            // ********************************* check biorep normalization *********************************
            var bioRepRaw = new SpectraFileInfo(rawPath, "a", 0, 0, 0);
            var bioRepMzml = new SpectraFileInfo(mzmlPath, "a", 1, 0, 0);

            var proteinGroup = new ProteinGroup("MyProtein", "gene", "org");

            var bioRepRawId = new Identification(bioRepRaw, sequence, sequence, monoisotopicMass, retentionTime, charge,
                new List<ProteinGroup> { proteinGroup });
            var bioRepMzmlId = new Identification(bioRepMzml, sequence, sequence, monoisotopicMass, retentionTime, charge,
                new List<ProteinGroup> { proteinGroup });

            var bioRepEngine = new FlashLFQEngine(new List<Identification> { bioRepRawId, bioRepMzmlId }, normalize: true);
            var bioRepResults = bioRepEngine.Run();

            // Compare the first peak of each file, rounded to a whole number
            int bioRepMzmlIntensity = (int)System.Math.Round(bioRepResults.Peaks[bioRepMzml].First().Intensity, 0);
            int bioRepRawIntensity = (int)System.Math.Round(bioRepResults.Peaks[bioRepRaw].First().Intensity, 0);

            Assert.That(bioRepMzmlIntensity > 0);
            Assert.That(bioRepMzmlIntensity == bioRepRawIntensity);

            // ********************************* check condition normalization *********************************
            var conditionRaw = new SpectraFileInfo(rawPath, "a", 0, 0, 0);
            var conditionMzml = new SpectraFileInfo(mzmlPath, "b", 0, 0, 0);

            var conditionRawId = new Identification(conditionRaw, sequence, sequence, monoisotopicMass, retentionTime, charge,
                new List<ProteinGroup> { proteinGroup });
            var conditionMzmlId = new Identification(conditionMzml, sequence, sequence, monoisotopicMass, retentionTime, charge,
                new List<ProteinGroup> { proteinGroup });

            var conditionEngine = new FlashLFQEngine(new List<Identification> { conditionRawId, conditionMzmlId }, normalize: true);
            var conditionResults = conditionEngine.Run();

            int conditionMzmlIntensity = (int)System.Math.Round(conditionResults.Peaks[conditionMzml].First().Intensity, 0);
            int conditionRawIntensity = (int)System.Math.Round(conditionResults.Peaks[conditionRaw].First().Intensity, 0);

            Assert.That(conditionMzmlIntensity > 0);
            Assert.That(conditionMzmlIntensity == conditionRawIntensity);

            // ********************************* check techrep normalization *********************************
            var techRepRaw = new SpectraFileInfo(rawPath, "a", 0, 0, 0);
            var techRepMzml = new SpectraFileInfo(mzmlPath, "a", 0, 1, 0);

            var techRepRawId = new Identification(techRepRaw, sequence, sequence, monoisotopicMass, retentionTime, charge,
                new List<ProteinGroup> { proteinGroup });
            var techRepMzmlId = new Identification(techRepMzml, sequence, sequence, monoisotopicMass, retentionTime, charge,
                new List<ProteinGroup> { proteinGroup });

            var techRepEngine = new FlashLFQEngine(new List<Identification> { techRepRawId, techRepMzmlId }, normalize: true);
            var techRepResults = techRepEngine.Run();

            int techRepMzmlIntensity = (int)System.Math.Round(techRepResults.Peaks[techRepMzml].First().Intensity, 0);
            int techRepRawIntensity = (int)System.Math.Round(techRepResults.Peaks[techRepRaw].First().Intensity, 0);

            Assert.That(techRepMzmlIntensity > 0);
            Assert.That(techRepMzmlIntensity == techRepRawIntensity);

            // The three layouts above must all arrive at the same normalized value
            Assert.That(bioRepMzmlIntensity == conditionMzmlIntensity);
            Assert.That(bioRepMzmlIntensity == techRepMzmlIntensity);

            // ********************************* check fraction normalization *********************************
            var rawFraction0 = new SpectraFileInfo(rawPath, "a", 0, 0, 0);
            var rawFraction1 = new SpectraFileInfo(rawPath, "a", 0, 0, 1);

            var mzmlFraction0 = new SpectraFileInfo(mzmlPath, "a", 1, 0, 0);
            var mzmlFraction1 = new SpectraFileInfo(mzmlPath, "a", 1, 0, 1);

            var rawFraction0Id = new Identification(rawFraction0, sequence, sequence, monoisotopicMass, retentionTime, charge,
                new List<ProteinGroup> { proteinGroup });
            var rawFraction1Id = new Identification(rawFraction1, sequence, sequence, monoisotopicMass, retentionTime, charge,
                new List<ProteinGroup> { proteinGroup });
            var mzmlFraction0Id = new Identification(mzmlFraction0, sequence, sequence, monoisotopicMass, retentionTime, charge,
                new List<ProteinGroup> { proteinGroup });
            var mzmlFraction1Id = new Identification(mzmlFraction1, sequence, sequence, monoisotopicMass, retentionTime, charge,
                new List<ProteinGroup> { proteinGroup });

            var fractionEngine = new FlashLFQEngine(
                new List<Identification> { rawFraction0Id, rawFraction1Id, mzmlFraction0Id, mzmlFraction1Id },
                normalize: true);
            var fractionResults = fractionEngine.Run();

            // Here the peptide-level intensities are summed over both fractions of each sample
            int rawSummedIntensity = (int)System.Math.Round(
                fractionResults.PeptideBaseSequences["EGFQVADGPLYR"].GetIntensity(rawFraction0)
                + fractionResults.PeptideBaseSequences["EGFQVADGPLYR"].GetIntensity(rawFraction1));
            int mzmlSummedIntensity = (int)System.Math.Round(
                fractionResults.PeptideBaseSequences["EGFQVADGPLYR"].GetIntensity(mzmlFraction0)
                + fractionResults.PeptideBaseSequences["EGFQVADGPLYR"].GetIntensity(mzmlFraction1));

            Assert.That(rawSummedIntensity > 0);
            Assert.That(rawSummedIntensity == mzmlSummedIntensity);
        }