Пример #1
0
        /// <summary>
        /// Loads xml2.xml, rewrites it with one extra modification registered for accession P53863
        /// at position 2, reloads the rewritten file and compares the two protein lists.
        /// The first protein is expected to report 2 modified positions before the rewrite and 3 after.
        /// </summary>
        public void Test_write_with_custom_mods()
        {
            string testDirectory = TestContext.CurrentContext.TestDirectory;
            string sourceXml = Path.Combine(testDirectory, @"xml2.xml");
            string rewrittenXml = Path.Combine(testDirectory, @"rewrite_xml2.xml");

            ModificationMotif.TryGetMotif("S", out ModificationMotif sMotif);
            ModificationMotif.TryGetMotif("T", out ModificationMotif tMotif);
            ModificationMotif.TryGetMotif("X", out ModificationMotif xMotif);

            // Modifications the loader is told about when reading either file.
            var knownMods = new List<Modification>();
            knownMods.Add(new ModificationWithLocation("fayk", "mt", xMotif, TerminusLocalization.Any, null));
            knownMods.Add(new ModificationWithLocation("Phosphoserine", "mt", sMotif, TerminusLocalization.Any, null));
            knownMods.Add(new ModificationWithLocation("Phosphothreonine", "mt", tMotif, TerminusLocalization.Any, null));

            // The additional modification handed to the writer.
            ModificationMotif.TryGetMotif("K", out ModificationMotif kMotif);
            var neutralLosses = new List<double> { -1 };
            ModificationWithMass customMod = new ModificationWithMass("mod", "mt", kMotif, TerminusLocalization.Any, 1, neutralLosses: neutralLosses);

            var modsForP53863 = new HashSet<Tuple<int, Modification>>();
            modsForP53863.Add(new Tuple<int, Modification>(2, customMod));
            var additionalMods = new Dictionary<string, HashSet<Tuple<int, Modification>>>();
            additionalMods.Add("P53863", modsForP53863);

            List<Protein> original = ProteinDbLoader.LoadProteinXML(sourceXml, true, DecoyType.None, knownMods, false, new List<string>(), out Dictionary<string, Modification> unknownMods);
            var writtenEntries = ProteinDbWriter.WriteXmlDatabase(additionalMods, original, rewrittenXml);
            Assert.AreEqual(1, writtenEntries.Count);

            List<Protein> reloaded = ProteinDbLoader.LoadProteinXML(rewrittenXml, true, DecoyType.None, knownMods, false, new List<string>(), out unknownMods);
            Assert.AreEqual(original.Count, reloaded.Count);

            bool sequencesMatch = Enumerable.Range(0, original.Count)
                                            .All(index => original[index].BaseSequence == reloaded[index].BaseSequence);
            Assert.True(sequencesMatch);

            Assert.AreEqual(2, original[0].OneBasedPossibleLocalizedModifications.Count);
            Assert.AreEqual(3, reloaded[0].OneBasedPossibleLocalizedModifications.Count);
        }
Пример #2
0
        /// <summary>
        /// Reads an Ensembl pep.all FASTA file, writes it out as XML, reloads the XML and verifies
        /// that accessions, gene names, full names, database paths and proteolysis-product bounds
        /// are the same for both the FASTA-loaded and the XML-loaded proteins.
        /// </summary>
        public void Test_read_Ensembl_pepAllFasta()
        {
            string testDirectory = TestContext.CurrentContext.TestDirectory;
            string fastaPath = Path.Combine(testDirectory, @"test_ensembl.pep.all.fasta");
            string xmlPath = Path.Combine(testDirectory, @"rewrite_test_ensembl.pep.all.xml");

            // Values expected for the first two entries, regardless of which file they were read from.
            const string firstAccession = "ENSP00000381386";
            const string secondAccession = "ENSP00000215773";
            const string geneName = "ENSG00000099977";
            const string firstFullName = "pep:known chromosome:GRCh37:22:24313554:24316773:-1 gene:ENSG00000099977 transcript:ENST00000398344 gene_biotype:protein_coding transcript_biotype:protein_coding";
            const string secondFullName = "pep:known chromosome:GRCh37:22:24313554:24322019:-1 gene:ENSG00000099977 transcript:ENST00000350608 gene_biotype:protein_coding transcript_biotype:protein_coding";

            ModificationMotif.TryGetMotif("X", out ModificationMotif xMotif);
            var knownMods = new List<Modification>();
            knownMods.Add(new ModificationWithLocation("fayk", "mt", xMotif, TerminusLocalization.Any, null));

            List<Protein> fromFasta = ProteinDbLoader.LoadProteinFasta(fastaPath, true, DecoyType.None, false,
                ProteinDbLoader.ensembl_accession_expression,
                ProteinDbLoader.ensembl_fullName_expression,
                ProteinDbLoader.ensembl_accession_expression,
                ProteinDbLoader.ensembl_gene_expression);

            ProteinDbWriter.WriteXmlDatabase(new Dictionary<string, HashSet<Tuple<int, Modification>>>(), fromFasta, xmlPath);
            List<Protein> fromXml = ProteinDbLoader.LoadProteinXML(xmlPath, true, DecoyType.None, knownMods, false, null, out Dictionary<string, Modification> unknownMods);

            Assert.AreEqual(fromFasta.Count, fromXml.Count);
            Assert.True(Enumerable.Range(0, fromFasta.Count).All(index => fromFasta[index].BaseSequence == fromXml[index].BaseSequence));

            // Proteins as read from the FASTA file.
            Assert.AreEqual(firstAccession, fromFasta[0].Accession);
            Assert.AreEqual(secondAccession, fromFasta[1].Accession);
            Assert.AreEqual(geneName, fromFasta[0].GeneNames.First().Item2);
            Assert.AreEqual(geneName, fromFasta[1].GeneNames.First().Item2);
            Assert.AreEqual(firstFullName, fromFasta[0].FullName);
            Assert.AreEqual(secondFullName, fromFasta[1].FullName);
            Assert.AreEqual(fastaPath, fromFasta[0].DatabaseFilePath);

            // Proteins as read back from the rewritten XML file.
            Assert.AreEqual(firstAccession, fromXml[0].Accession);
            Assert.AreEqual(secondAccession, fromXml[1].Accession);
            Assert.AreEqual(geneName, fromXml[0].GeneNames.First().Item2);
            Assert.AreEqual(geneName, fromXml[1].GeneNames.First().Item2);
            Assert.AreEqual(firstFullName, fromXml[0].FullName);
            Assert.AreEqual(secondFullName, fromXml[1].FullName);
            Assert.AreEqual(xmlPath, fromXml[0].DatabaseFilePath);

            // Any proteolysis product with a begin/end position must lie within its protein (1-based).
            foreach (List<Protein> proteins in new[] { fromFasta, fromXml })
            {
                Assert.True(proteins.All(protein => protein.ProteolysisProducts.All(product =>
                    product.OneBasedBeginPosition == null
                    || (product.OneBasedBeginPosition > 0 && product.OneBasedBeginPosition <= protein.Length))));
                Assert.True(proteins.All(protein => protein.ProteolysisProducts.All(product =>
                    product.OneBasedEndPosition == null
                    || (product.OneBasedEndPosition > 0 && product.OneBasedEndPosition <= protein.Length))));
            }
        }
Пример #3
0
        /// <summary>
        /// Writes two minimally constructed proteins (one without and one with a name) to an XML
        /// database and checks that accession and name are the same after reading the file back.
        /// </summary>
        public void TestEmptyProteins()
        {
            string databasePath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"differentlyConstuctedProteins.xml");

            var withoutName = new Protein("SEQENCE", "p1");
            Assert.AreEqual("p1||", withoutName.FullDescription);

            var withName = new Protein("SEQENCE", "p2", name: "namep2");

            // Write both proteins; no additional modifications are supplied to the writer.
            var written = new List<Protein>();
            written.Add(withoutName);
            written.Add(withName);
            ProteinDbWriter.WriteXmlDatabase(null, written, databasePath);

            // Read them back with empty known-modification and excluded-type lists.
            IEnumerable<Modification> noKnownMods = new List<Modification>();
            IEnumerable<string> nothingExcluded = new List<string>();
            List<Protein> reloaded = ProteinDbLoader.LoadProteinXML(databasePath, true, DecoyType.None, noKnownMods, false, nothingExcluded, out Dictionary<string, Modification> unknownMods);

            Assert.AreEqual(withoutName.Accession, reloaded[0].Accession);
            Assert.AreEqual(withName.Accession, reloaded[1].Accession);
            Assert.AreEqual(withoutName.Name, reloaded[0].Name);
            Assert.AreEqual(withName.Name, reloaded[1].Name);
        }
Пример #4
0
        /// <summary>
        /// Reads splices1.xml, rewrites it and confirms that the first splice site of the first
        /// protein is the same after the round trip, both with <c>Description.Novel</c> unset (null)
        /// and after it has been set to true.
        /// </summary>
        public void TestReadWriteSpliceSites()
        {
            string databaseDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests");
            string inputPath = Path.Combine(databaseDirectory, @"splices1.xml");
            string rewrittenPath = Path.Combine(databaseDirectory, @"rewrite_spliceSite.xml");

            List<Protein> original = ProteinDbLoader.LoadProteinXML(inputPath, true, DecoyType.None, null, false, new List<string>(), out Dictionary<string, Modification> unknownMods);
            Assert.IsNull(original[0].SpliceSites.First().Description.Novel);

            // First round trip: Novel has not been set.
            ProteinDbWriter.WriteXmlDatabase(new Dictionary<string, HashSet<Tuple<int, Modification>>>(), original, rewrittenPath);
            List<Protein> reloaded = ProteinDbLoader.LoadProteinXML(rewrittenPath, true, DecoyType.None, null, false, new List<string>(), out unknownMods);

            Assert.AreEqual(original[0].SpliceSites.Count(), reloaded[0].SpliceSites.Count());
            Assert.AreEqual(original[0].SpliceSites.First().OneBasedBeginPosition, reloaded[0].SpliceSites.First().OneBasedBeginPosition);
            Assert.AreEqual(original[0].SpliceSites.First().OneBasedEndPosition, reloaded[0].SpliceSites.First().OneBasedEndPosition);
            Assert.AreEqual(original[0].SpliceSites.First().Description, reloaded[0].SpliceSites.First().Description);
            Assert.IsNull(reloaded[0].SpliceSites.First().Description.Novel);

            // Second round trip: flag the splice site as novel and make sure the flag is read back.
            original[0].SpliceSites.First().Description.Novel = true;
            ProteinDbWriter.WriteXmlDatabase(new Dictionary<string, HashSet<Tuple<int, Modification>>>(), original, rewrittenPath);
            reloaded = ProteinDbLoader.LoadProteinXML(rewrittenPath, true, DecoyType.None, null, false, new List<string>(), out unknownMods);

            Assert.IsTrue(reloaded[0].SpliceSites.First().Description.Novel);
        }
Пример #5
0
        /// <summary>
        /// Loads xml2.xml and checks proteolysis-product bounds, that no base sequence contains
        /// whitespace, and that each of the two target proteins has exactly one "EnsemblFungi"
        /// database reference.
        /// </summary>
        public void XmlTest_2entry()
        {
            var nice = new List <Modification>
            {
                new ModificationWithLocation("fayk", null, null, ModificationSites.A, null, null)
            };

            var ok = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, @"xml2.xml"), true, nice, false, null, out Dictionary <string, Modification> un);

            // Begin positions, when present, are 1-based; end positions, when present, stay within the protein.
            Assert.True(ok.All(p => p.ProteolysisProducts.All(d => d.OneBasedBeginPosition == null || d.OneBasedBeginPosition > 0)));
            Assert.True(ok.All(p => p.ProteolysisProducts.All(d => d.OneBasedEndPosition == null || d.OneBasedEndPosition <= p.Length)));

            // No protein may contain whitespace in its sequence. The previous form,
            // Assert.False(ok.All(...)), passed as soon as a single protein was clean and therefore
            // could not detect whitespace in the others.
            Assert.False(ok.Any(p => p.BaseSequence.Contains(" ")));
            Assert.False(ok.Any(p => p.BaseSequence.Contains("\t")));
            Assert.False(ok.Any(p => p.BaseSequence.Contains("\n")));

            // Database reference checks on the non-decoy proteins
            List <Protein> targets = ok.Where(p => !p.IsDecoy).ToList();

            Assert.AreEqual(2, targets.Count);
            Assert.AreEqual(1, targets[0].DatabaseReferences.Count(dbRef => dbRef.Type == "EnsemblFungi"));
            Assert.AreEqual(1, targets[1].DatabaseReferences.Count(dbRef => dbRef.Type == "EnsemblFungi"));
        }
Пример #6
0
        /// <summary>
        /// Loads xml.xml while excluding one modification type and inspects the modification types
        /// found on the first loaded protein.
        /// </summary>
        /// <param name="excludeString">Modification type passed to the loader as the type to exclude.</param>
        /// <param name="isExcluded">
        /// Value the test expects for "the type <c>exclude_me</c> is present on the first protein";
        /// the opposite value is expected for <c>exclude_me_not</c>.
        /// </param>
        [TestCase("exclude_me", false)] // first argument: type to exclude; second argument: expected result of the assertion
        //[TestCase("exclude_me_not", true)]
        public static void Read_xml_exclude_mods(string excludeString, bool isExcluded)
        {
            string databasePath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"xml.xml");

            ModificationMotif.TryGetMotif("X", out ModificationMotif xMotif);

            // Two modifications with the same id that differ only in their modification type.
            Modification excludable = new Modification("N-acetylserine", null, "exclude_me", null, xMotif, "Anywhere.", null, 10, null, null, null, null, null, null);
            Modification retained = new Modification("N-acetylserine", null, "exclude_me_not", null, xMotif, "Anywhere.", null, 10, null, null, null, null, null, null);
            var knownMods = new List<Modification> { excludable, retained };

            Assert.That(knownMods[0].ValidModification);

            var loaded = ProteinDbLoader.LoadProteinXML(databasePath, true, DecoyType.Reverse, knownMods, false,
                                                        new[] { excludeString }, out Dictionary<string, Modification> unknownMods);

            // Collect the modification types seen at each modified position of the first protein
            // (distinct within a position, as in the per-position loop this replaces).
            List<string> observedTypes = loaded[0].OneBasedPossibleLocalizedModifications
                .SelectMany(position => position.Value.Select(mod => mod.ModificationType).Distinct())
                .ToList();

            Assert.AreEqual(isExcluded, observedTypes.Contains("exclude_me"));
            Assert.AreEqual(!isExcluded, observedTypes.Contains("exclude_me_not"));
        }
Пример #7
0
        /// <summary>
        /// Loads StopGained.xml twice: once with the default allele-depth filter, expecting a
        /// reference protein plus a truncated variant, and once with <c>minAlleleDepth: 400</c>,
        /// expecting only the truncated variant.
        /// </summary>
        public static void StopGained()
        {
            string databasePath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml");
            const int variantPosition = 161; // one-based position used by the assertions below

            var defaultDepth = ProteinDbLoader.LoadProteinXML(databasePath, true, DecoyType.None, null, false, null, out var unknownModifications);

            Assert.AreEqual(2, defaultDepth.Count);

            // First protein: one known variation, none applied.
            Assert.AreEqual(1, defaultDepth[0].SequenceVariations.Count());
            Assert.AreEqual(1, defaultDepth[0].SequenceVariations.Select(variation => variation.SimpleString()).Distinct().Count());
            Assert.AreEqual(0, defaultDepth[0].AppliedSequenceVariations.Count());
            Assert.AreEqual(0, defaultDepth[0].AppliedSequenceVariations.Select(variation => variation.SimpleString()).Distinct().Count());

            // Second protein: the variation has been applied.
            Assert.AreEqual(1, defaultDepth[1].AppliedSequenceVariations.Count());
            Assert.AreEqual(1, defaultDepth[1].AppliedSequenceVariations.Select(variation => variation.SimpleString()).Distinct().Count());

            Assert.AreEqual(191, defaultDepth[0].Length);
            Assert.AreEqual('Q', defaultDepth[0][variantPosition - 1]);
            Assert.AreEqual(variantPosition - 1, defaultDepth[1].Length);
            Assert.AreNotEqual(defaultDepth[0].Length, defaultDepth[1].Length);

            // Same file again, this time with an explicit minimum allele depth.
            var filteredDepth = ProteinDbLoader.LoadProteinXML(databasePath, true, DecoyType.None, null, false, null, out unknownModifications, minAlleleDepth: 400);

            Assert.AreEqual(1, filteredDepth.Count);
            Assert.AreEqual(1, filteredDepth[0].AppliedSequenceVariations.Count());
            Assert.AreEqual(1, filteredDepth[0].AppliedSequenceVariations.Select(variation => variation.SimpleString()).Distinct().Count());
            Assert.AreEqual(variantPosition - 1, filteredDepth[0].Length);
        }
Пример #8
0
        /// <summary>
        /// Builds a protein that already carries the modifications read from z.txt at position 2,
        /// passes an equal modification for the same accession and position to the writer as an
        /// "additional" mod, and checks that the writer reports no new entries and that the reloaded
        /// protein still has exactly one modification.
        /// </summary>
        public void DoNotWriteSameModTwiceAndDoNotWriteInHeaderSinceDifferent()
        {
            string testDirectory = TestContext.CurrentContext.TestDirectory;
            string outputPath = Path.Combine(testDirectory, "test_modifications_with_proteins3.xml");

            Loaders.LoadElements(Path.Combine(testDirectory, "elements2.dat"));
            var sampleModList = PtmListLoader.ReadModsFromFile(Path.Combine(testDirectory, "z.txt")).ToList();

            Protein protein = new Protein("MCSSSSSSSSSS", "accession", new List <Tuple <string, string> >(), new Dictionary <int, List <Modification> > {
                { 2, sampleModList.OfType <Modification>().ToList() }
            }, null, "name", "full_name", false, false, new List <DatabaseReference>(), new List <SequenceVariation>(), new List <DisulfideBond>());

            Assert.AreEqual(1, protein.OneBasedPossibleLocalizedModifications[2].OfType <ModificationWithMass>().Count());

            // The members read below exist only on ModificationWithMassAndCf. Fail with a clear message
            // if the file yields a different type, instead of a NullReferenceException on the next line.
            var modReadFromFile = sampleModList.First() as ModificationWithMassAndCf;
            Assert.IsNotNull(modReadFromFile, "The first modification in z.txt was expected to be a ModificationWithMassAndCf.");

            // Construct an independent modification that should compare equal to the one read from file.
            ModificationMotif.TryGetMotif("C", out ModificationMotif motif);
            ModificationWithMass newMod = new ModificationWithMassAndCf("Palmitoylation of C", modReadFromFile.modificationType, motif, TerminusLocalization.Any, modReadFromFile.chemicalFormula, modReadFromFile.monoisotopicMass, null, null, null);

            // Equality is checked in both directions.
            Assert.AreEqual(newMod, sampleModList.First());
            Assert.AreEqual(sampleModList.First(), newMod);

            // Offer the equal modification to the writer for the same accession and position.
            var dictWithThisMod = new Dictionary <string, HashSet <Tuple <int, Modification> > >
            {
                { "accession", new HashSet <Tuple <int, Modification> > { new Tuple <int, Modification>(2, newMod) } }
            };

            var newModResEntries = ProteinDbWriter.WriteXmlDatabase(dictWithThisMod, new List <Protein> {
                protein
            }, outputPath);

            Assert.AreEqual(0, newModResEntries.Count);

            List <Protein> new_proteins = ProteinDbLoader.LoadProteinXML(outputPath, true, DecoyType.None, new List <Modification>(), false, new List <string>(), out Dictionary <string, Modification> um);

            Assert.AreEqual(1, new_proteins.Count);
            Assert.AreEqual(1, new_proteins[0].OneBasedPossibleLocalizedModifications.Count);
            Assert.AreEqual(1, new_proteins[0].OneBasedPossibleLocalizedModifications.SelectMany(kv => kv.Value).Count());
        }
Пример #9
0
        /// <summary>
        /// Loads MultipleAlternateAlleles.xml twice: once with the default allele-depth filter,
        /// expecting a reference protein (K at position 63) and a variant protein (R at position 63),
        /// and once with <c>minAlleleDepth: 10</c>, expecting only the reference protein.
        /// </summary>
        public static void MultipleAlternateAlleles()
        {
            string databasePath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "MultipleAlternateAlleles.xml");
            const int variantPosition = 63; // one-based position shared by both variations

            var defaultDepth = ProteinDbLoader.LoadProteinXML(databasePath, true, DecoyType.None, null, false, null, out var unknownModifications);

            Assert.AreEqual(2, defaultDepth.Count);

            // Two variations are listed on the first protein, both at the same position.
            Assert.AreEqual(2, defaultDepth[0].SequenceVariations.Count());
            Assert.AreEqual(2, defaultDepth[0].SequenceVariations.Select(variation => variation.SimpleString()).Distinct().Count());
            Assert.IsTrue(defaultDepth[0].SequenceVariations.All(variation => variation.OneBasedBeginPosition == variantPosition));

            // There are two alternate alleles (1 and 2), but only 2 is in the genotype, so only that one is applied.
            Assert.AreEqual(1, defaultDepth[1].AppliedSequenceVariations.Count());
            Assert.AreEqual(1, defaultDepth[1].AppliedSequenceVariations.Select(variation => variation.SimpleString()).Distinct().Count());

            Assert.AreEqual(72, defaultDepth[0].Length);
            Assert.AreEqual(72, defaultDepth[1].Length);
            Assert.AreEqual('K', defaultDepth[0][variantPosition - 1]);
            Assert.AreEqual('R', defaultDepth[1][variantPosition - 1]);

            // Same file again, this time with an explicit minimum allele depth.
            var filteredDepth = ProteinDbLoader.LoadProteinXML(databasePath, true, DecoyType.None, null, false, null, out unknownModifications, minAlleleDepth: 10);

            Assert.AreEqual(1, filteredDepth.Count);
            Assert.AreEqual(0, filteredDepth[0].AppliedSequenceVariations.Count());
            Assert.AreEqual(0, filteredDepth[0].AppliedSequenceVariations.Select(variation => variation.SimpleString()).Distinct().Count());
            Assert.AreEqual('K', filteredDepth[0][variantPosition - 1]); // reference only
        }
Пример #10
0
        /// <summary>
        /// Writes a single protein carrying one modification at position 3 to an XML database and
        /// reads it back with reverse decoys. Previously the test asserted nothing; it now checks
        /// that the file has content and that the target protein is read back with the same
        /// accession and sequence.
        /// </summary>
        public static void Test_MetaMorpheusStyleProteinDatabaseWriteAndREad()
        {
            string proteinDbFilePath = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestProteinSplitAcrossFiles.xml");

            ModificationMotif.TryGetMotif("D", out ModificationMotif motif);
            Modification mod = new Modification(_originalId: "mod1", _modificationType: "mt", _target: motif, _locationRestriction: "Anywhere.", _monoisotopicMass: 10);

            IDictionary <int, List <Modification> > oneBasedModification = new Dictionary <int, List <Modification> >
            {
                { 3, new List <Modification> {
                      mod
                  } }
            };

            Protein        prot1       = new Protein("MEDEEK", "prot1", oneBasedModifications: oneBasedModification);
            List <Protein> proteinList = new List <Protein> {
                prot1
            };

            ProteinDbWriter.WriteXmlDatabase(new Dictionary <string, HashSet <Tuple <int, Modification> > >(), proteinList, proteinDbFilePath);

            // The writer must have produced a file with content.
            var lines = File.ReadAllLines(proteinDbFilePath);
            Assert.AreNotEqual(0, lines.Length);

            // No known modifications are supplied to the loader; decoys are requested, so the target is
            // located by its IsDecoy flag rather than by list position.
            List <Protein> newProteinList = ProteinDbLoader.LoadProteinXML(proteinDbFilePath, true, DecoyType.Reverse, new List <Modification>(), false, new List <string>(), out var um, -1);
            Assert.IsTrue(newProteinList.Any(p => !p.IsDecoy && p.Accession == prot1.Accession && p.BaseSequence == prot1.BaseSequence));
        }
Пример #11
0
        /// <summary>
        /// Writes an XML database for a protein with one sequence variation (P to K at position 3)
        /// and one modification registered for the reference accession and one for the variant
        /// accession, reloads it with reverse decoys, checks where the modifications end up on the
        /// target and the decoy, and finally runs a search task against an mzML file generated from
        /// one digested variant peptide.
        /// </summary>
        public static void TestSearchPtmVariantDatabase()
        {
            // Search task under test
            SearchTask task1 = new SearchTask
            {
                SearchParameters = new SearchParameters
                {
                    SearchTarget         = true,
                    MassDiffAcceptorType = MassDiffAcceptorType.Exact,
                },
                CommonParameters = new CommonParameters(digestionParams: new DigestionParams(minPeptideLength: 5))
            };

            var taskList = new List <(string, MetaMorpheusTask)> {
                ("task1", task1)
            };

            // Variable modifications selected by the task's common parameters
            List <Modification> variableModifications = GlobalVariables.AllModsKnown.OfType <Modification>().Where
                                                            (b => task1.CommonParameters.ListOfModsVariable.Contains((b.ModificationType, b.IdWithMotif))).ToList();

            // Protein with a single sequence variation
            ModificationMotif.TryGetMotif("P", out ModificationMotif motifP);
            ModificationMotif.TryGetMotif("K", out ModificationMotif motifK);
            var     variant            = new SequenceVariation(3, "P", "K", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G|||||||||||||||||||\tGT:AD:DP\t1/1:30,30:30");
            Protein testProteinWithMod = new Protein("PEPTID", "accession1", sequenceVariations: new List <SequenceVariation> {
                variant
            });
            string variantAcc = VariantApplication.GetAccession(testProteinWithMod, new[] { variant });

            string xmlName = "oblm.xml";

            // One modification registered for the reference accession (position 1) and one for the
            // variant accession (position 3); both are handed to the writer.
            var modList = new Dictionary <string, HashSet <Tuple <int, Modification> > >();
            var hash    = new HashSet <Tuple <int, Modification> >
            {
                new Tuple <int, Modification>(1, new Modification(_originalId: "acetyl on P", _modificationType: "type", _target: motifP, _monoisotopicMass: 42, _locationRestriction: "Anywhere.")),
            };
            var hashVar = new HashSet <Tuple <int, Modification> >
            {
                new Tuple <int, Modification>(3, new Modification(_originalId: "acetyl on K", _modificationType: "type", _target: motifK, _monoisotopicMass: 42, _locationRestriction: "Anywhere.")),
            };

            modList.Add(testProteinWithMod.Accession, hash);
            modList.Add(variantAcc, hashVar);
            ProteinDbWriter.WriteXmlDatabase(modList, new List <Protein> {
                testProteinWithMod
            }, xmlName);

            // Read the database back, generating reverse decoys
            var variantProteins = ProteinDbLoader.LoadProteinXML(xmlName, true, DecoyType.Reverse, null, false, null, out var unknownModifications);

            Assert.AreEqual(0, unknownModifications.Count);

            // Assert the count before indexing so that a wrong count is reported as an assertion
            // failure rather than as an out-of-range exception.
            Assert.AreEqual(2, variantProteins.Count); // target & decoy
            var variantProtein = variantProteins[0];
            var variantDecoy   = variantProteins[1];

            Assert.AreEqual(2, variantProtein.OneBasedPossibleLocalizedModifications.Count);
            List <int> foundResidueIndicies   = variantProtein.OneBasedPossibleLocalizedModifications.Select(k => k.Key).ToList();
            List <int> expectedResidueIndices = new List <int>()
            {
                1, 3
            };

            Assert.That(foundResidueIndicies, Is.EquivalentTo(expectedResidueIndices));

            Assert.AreEqual(2, variantDecoy.OneBasedPossibleLocalizedModifications.Count);
            foundResidueIndicies   = variantDecoy.OneBasedPossibleLocalizedModifications.Select(k => k.Key).ToList();
            expectedResidueIndices = new List <int>()
            {
                4, 6
            };                                                 // originally modified residues are now at the end in the decoy
            Assert.That(foundResidueIndicies, Is.EquivalentTo(expectedResidueIndices));

            // Digest the first variant form of the target protein
            var digestedList = variantProtein.GetVariantProteins()[0].Digest(task1.CommonParameters.DigestionParams, new List <Modification>(), variableModifications).ToList();

            Assert.AreEqual(4, digestedList.Count);

            // The second digestion product is expected to carry the modification at position 3
            PeptideWithSetModifications pepWithSetMods1 = digestedList[1];

            Assert.AreEqual("PEK[type:acetyl on K]TID", pepWithSetMods1.FullSequence);

            // Write an mzML file built from that peptide
            MsDataFile myMsDataFile = new TestDataFile(new List <PeptideWithSetModifications> {
                pepWithSetMods1
            });
            string mzmlName = @"hello.mzML";

            IO.MzML.MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(myMsDataFile, mzmlName, false);

            // Run the task against the generated spectra file and database
            var engine = new EverythingRunnerEngine(taskList, new List <string> {
                mzmlName
            },
                                                    new List <DbForTask> {
                new DbForTask(xmlName, false)
            }, Environment.CurrentDirectory);

            engine.Run();
        }
Пример #12
0
        /// <summary>
        /// Runs a search task with <c>WritePrunedDatabase</c> enabled against a protein that carries
        /// "ConnorMod" at positions 1 and 3, then loads the pruned database written by the task and
        /// checks that only the modification at position 3 is present.
        /// </summary>
        public static void TestPrunedDatabase()
        {
            // Search task under test
            SearchTask task1 = new SearchTask
            {
                SearchParameters = new SearchParameters
                {
                    WritePrunedDatabase  = true,
                    SearchTarget         = true,
                    MassDiffAcceptorType = MassDiffAcceptorType.Exact,
                    ModsToWriteSelection = new Dictionary <string, int>
                    {
                        { "ConnorModType", 1 }
                    }
                },
                CommonParameters = new CommonParameters(digestionParams: new DigestionParams(minPeptideLength: 5))
            };

            List <(string, MetaMorpheusTask)> taskList = new List <(string, MetaMorpheusTask)>
            {
                ("task1", task1)
            };

            // Register the custom modification globally
            ModificationMotif.TryGetMotif("P", out ModificationMotif motif);

            var connorMod = new ModificationWithMass("ConnorMod", "ConnorModType", motif, TerminusLocalization.Any, 10);

            GlobalVariables.AddMods(new List <ModificationWithLocation>
            {
                connorMod
            });

            // Variable modifications selected by the task's common parameters
            List <ModificationWithMass> variableModifications = GlobalVariables.AllModsKnown.OfType <ModificationWithMass>().Where
                                                                    (b => task1.CommonParameters.ListOfModsVariable.Contains((b.modificationType, b.id))).ToList();

            // The same modification is attached to the protein at positions 1 and 3
            var dictHere = new Dictionary <int, List <Modification> >
            {
                { 1, new List <Modification> { connorMod } },
                { 3, new List <Modification> { connorMod } }
            };

            Protein testProteinWithMod = new Protein("PEPTID", "accession1", "organism", new List <Tuple <string, string> >(), dictHere);

            string xmlName = "okkk.xml";

            // Additional modifications handed to the writer, keyed by the accession "test"
            Dictionary <string, HashSet <Tuple <int, Modification> > > modList = new Dictionary <string, HashSet <Tuple <int, Modification> > >();
            var additionalMods = new HashSet <Tuple <int, Modification> >
            {
                new Tuple <int, Modification>(3, connorMod)
            };

            modList.Add("test", additionalMods);
            ProteinDbWriter.WriteXmlDatabase(modList, new List <Protein> {
                testProteinWithMod
            }, xmlName);

            // Read the database back and digest the first protein
            var protein = ProteinDbLoader.LoadProteinXML(xmlName, true,
                                                         DecoyType.Reverse, new List <Modification>(), false, new List <string>(), out Dictionary <string, Modification> ok);
            var digestedList = protein[0].Digest(task1.CommonParameters.DigestionParams, new List <ModificationWithMass>(),
                                                 variableModifications).ToList();

            Assert.AreEqual(4, digestedList.Count);

            // The second digestion product is expected to carry the modification at position 3
            PeptideWithSetModifications pepWithSetMods1 = digestedList[1];

            Assert.AreEqual("PEP[ConnorModType:ConnorMod]TID", pepWithSetMods1.Sequence);

            // Write an mzML file built from that peptide
            MsDataFile myMsDataFile = new TestDataFile(new List <PeptideWithSetModifications> {
                pepWithSetMods1
            });
            string mzmlName = @"hello.mzML";

            IO.MzML.MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(myMsDataFile, mzmlName, false);

            // Run the task against the generated spectra file and database
            var engine = new EverythingRunnerEngine(taskList, new List <string> {
                mzmlName
            },
                                                    new List <DbForTask> {
                new DbForTask(xmlName, false)
            }, Environment.CurrentDirectory);

            engine.Run();

            // Load the pruned database written by the task
            string final = Path.Combine(MySetUpClass.outputFolder, "task1", "okkkpruned.xml");

            var proteins = ProteinDbLoader.LoadProteinXML(final, true, DecoyType.Reverse, new List <Modification>(), false, new List <string>(), out ok);

            // Assert.AreEqual takes (expected, actual); the arguments were previously swapped, which
            // produces misleading failure messages.
            Assert.AreEqual(1, proteins[0].OneBasedPossibleLocalizedModifications.Count);
            Assert.IsTrue(proteins[0].OneBasedPossibleLocalizedModifications.ContainsKey(3));
            List <Modification> listOfMods = proteins[0].OneBasedPossibleLocalizedModifications[3];

            // Check the count before indexing into the list, then the type and id of the single entry
            Assert.AreEqual(1, listOfMods.Count);
            Assert.AreEqual("ConnorModType", listOfMods[0].modificationType);
            Assert.AreEqual("ConnorMod", listOfMods[0].id);
        }
Пример #13
0
        /// <summary>
        /// End-to-end check of pruned-database writing when a protein carries a sequence variant.
        /// Builds two proteins (the first with one sequence variation), writes them to XML, runs a
        /// SearchTask followed by a PostSearchAnalysisTask with WritePrunedDatabase enabled, then
        /// reloads the written databases and asserts on protein and modification counts.
        /// </summary>
        public static void TestProteinPrunedWithModSelectionAndVariants()
        {
            // Pick one known mod of each type; First() throws if GlobalVariables.AllModsKnown has no match.
            var modToWrite    = GlobalVariables.AllModsKnown.Where(p => p.ModificationType == "UniProt" && p.Target.ToString() == "T").First();
            var modToNotWrite = GlobalVariables.AllModsKnown.Where(p => p.ModificationType == "Common Artifact" && p.Target.ToString() == "X").First();
            Dictionary <int, List <Modification> > variantMods = new Dictionary <int, List <Modification> >();

            // Modification attached to the sequence variation itself (keyed by position 1).
            variantMods.Add(1, new List <Modification>()
            {
                modToNotWrite
            });

            // Single variation at position 4 (V -> T) described by a VCF-style annotation line.
            List <SequenceVariation> variants = new List <SequenceVariation> {
                new SequenceVariation(4, 4, "V", "T", @"20\t41168825\t.\tT\tC\t14290.77\t.\tANN=C|missense_variant|MODERATE|PLCG1|ENSG00000124181|transcript|ENST00000244007.7|protein_coding|22/33|c.2438T>C|p.Ile813Thr|2635/5285|2438/3876|813/1291||\tGT:AD:DP:GQ:PL\t1/1:1,392:393:99:14319,1142,0", variantMods)
            };

            // protein1 carries the variation; both proteins have modToNotWrite at residue 1 and
            // modToWrite at their last residue.
            var protein1 = new Protein("PEPVIDEKPEPT", "1", oneBasedModifications: new Dictionary <int, List <Modification> > {
                { 1, new List <Modification> {
                      modToNotWrite
                  } }, { 12, new List <Modification> {
                             modToWrite
                         } }
            }, sequenceVariations: variants);
            var protein2 = new Protein("PEPIDPEPT", "2", oneBasedModifications: new Dictionary <int, List <Modification> > {
                { 1, new List <Modification> {
                      modToNotWrite
                  } }, { 9, new List <Modification> {
                             modToWrite
                         } }
            });
            var protein1Variants = protein1.GetVariantProteins(1, 0);

            // Relative path: this database is written to (and later re-read from) the current working directory.
            string path = @"temp";

            var proteinList = new List <Protein> {
                protein1, protein2
            };

            proteinList.AddRange(protein1Variants);


            ProteinDbWriter.WriteXmlDatabase(new Dictionary <string, HashSet <Tuple <int, Modification> > >(), proteinList, path);

            Directory.CreateDirectory(Path.Combine(TestContext.CurrentContext.TestDirectory, @"PrunedDbTestVariant"));

            // NOTE(review): modList is still empty when it is passed to WriteXmlDatabase below; Hash is only
            // added to it later and modList is not read again in this method - confirm this is intended.
            Dictionary <string, HashSet <Tuple <int, Modification> > > modList = new Dictionary <string, HashSet <Tuple <int, Modification> > >();
            var Hash = new HashSet <Tuple <int, Modification> >
            {
                new Tuple <int, Modification>(1, modToWrite),
                new Tuple <int, Modification>(2, modToNotWrite),
            };

            // NOTE(review): the database is written under "PrunedDbTestVariant", but the tasks below are pointed
            // at "PrunedDbTest/fakeDb.xml" - presumably that file is produced by another test; verify.
            var db = ProteinDbWriter.WriteXmlDatabase(modList, proteinList, Path.Combine(TestContext.CurrentContext.TestDirectory, @"PrunedDbTestVariant/fakeDb.xml"));

            // The "observed" peptide is the PEPT peptide digested from the first variant protein.
            var peptideObserved = protein1Variants.First().Digest(new DigestionParams(minPeptideLength: 1), new List <Modification>(), new List <Modification>())
                                  .Where(p => p.BaseSequence == "PEPT").First();
            PostSearchAnalysisParameters testPostTaskParameters = new PostSearchAnalysisParameters();
            CommonParameters             commonParam            = new CommonParameters(useDeltaScore: false);

            // NOTE(review): 10000 x 10000 doubles is 10^8 elements * 8 bytes, i.e. roughly 800 MB for a
            // spectrum with a single non-zero value - confirm such a large array is really required.
            double[,] noiseData = new double[10000, 10000];
            noiseData[0, 0]     = 1.0;
            // No matched fragment ions are supplied for the synthetic PSM.
            List <Proteomics.Fragmentation.MatchedFragmentIon> matchedFragmentIons = new List <Proteomics.Fragmentation.MatchedFragmentIon>()
            {
            };
            MzSpectrum spectrum = new MzSpectrum(noiseData);
            MsDataScan scan     = new MsDataScan(spectrum, 1, 1, true, Polarity.Unknown, 2, new MzLibUtil.MzRange(10, 1000), "", MZAnalyzerType.Orbitrap, 10000, null, noiseData, "");

            // Hand-assemble the post-search parameters: one PSM for the observed peptide, pruned
            // database writing on, quantification and mzID output off.
            testPostTaskParameters.ProteinList = proteinList;
            testPostTaskParameters.AllPsms     = new List <PeptideSpectralMatch> {
                new PeptideSpectralMatch(peptideObserved, 0, 20, 1, new Ms2ScanWithSpecificMass(scan, 100, 1, @"", commonParam), commonParam, matchedFragmentIons)
            };
            testPostTaskParameters.SearchParameters = new SearchParameters();
            testPostTaskParameters.SearchParameters.WritePrunedDatabase = true;
            testPostTaskParameters.SearchParameters.DoQuantification    = false;
            testPostTaskParameters.SearchParameters.WriteMzId           = false;
            testPostTaskParameters.DatabaseFilenameList = new List <DbForTask>()
            {
                new DbForTask(Path.Combine(TestContext.CurrentContext.TestDirectory, @"PrunedDbTest/fakeDb.xml"), false)
            };
            testPostTaskParameters.OutputFolder = Path.Combine(TestContext.CurrentContext.TestDirectory, @"PrunedDbTest");
            Directory.CreateDirectory(Path.Combine(TestContext.CurrentContext.TestDirectory, @"PrunedDbTest/individual"));
            testPostTaskParameters.IndividualResultsOutputFolder = Path.Combine(TestContext.CurrentContext.TestDirectory, @"PrunedDbTest/individual");
            // Two-element counter array registered under the empty file name.
            int[] stuffForSpectraFile = new int[2];
            stuffForSpectraFile[0] = 10;
            stuffForSpectraFile[1] = 10;
            Dictionary <string, int[]> numSpectraPerFile = new Dictionary <string, int[]>();

            numSpectraPerFile.Add("", stuffForSpectraFile);
            testPostTaskParameters.NumMs2SpectraPerFile = numSpectraPerFile;

            // Write a synthetic mzML containing the observed peptide (relative path, current working directory).
            MsDataFile myMsDataFile = new TestDataFile(new List <PeptideWithSetModifications>
            {
                peptideObserved
            });
            string mzmlName = @"newMzml.mzML";

            IO.MzML.MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(myMsDataFile, mzmlName, false);

            modList.Add("test", Hash);

            testPostTaskParameters.CurrentRawFileList = new List <string>()
            {
                mzmlName
            };

            // Run an exact-mass target search with pruned database writing; its results feed the post-search task.
            SearchTask task5 = new SearchTask
            {
                SearchParameters = new SearchParameters
                {
                    WritePrunedDatabase  = true,
                    SearchTarget         = true,
                    MassDiffAcceptorType = MassDiffAcceptorType.Exact,
                },
                CommonParameters = new CommonParameters()
            };

            var test = task5.RunTask(Path.Combine(TestContext.CurrentContext.TestDirectory, @"PrunedDbTest"), new List <DbForTask>()
            {
                new DbForTask(Path.Combine(TestContext.CurrentContext.TestDirectory, @"PrunedDbTest/fakeDb.xml"), false)
            }, new List <string>()
            {
                mzmlName
            }, "name");

            testPostTaskParameters.SearchTaskResults = test;

            PostSearchAnalysisTask testPostTask = new PostSearchAnalysisTask();

            testPostTask.Parameters             = testPostTaskParameters;
            testPostTask.CommonParameters       = commonParam;
            // NOTE(review): "newMzMl.mzml" differs in letter case from mzmlName ("newMzml.mzML") - confirm
            // whether the file-specific parameters are expected to match the raw file name.
            testPostTask.FileSpecificParameters = new List <(string FileName, CommonParameters Parameters)> {
                ("newMzMl.mzml", commonParam)
            };
            testPostTask.Run();

            // Reload the unpruned database that was written to "temp" before the tasks ran.
            var proteinsLoaded = ProteinDbLoader.LoadProteinXML(path, true, DecoyType.None, GlobalVariables.AllModsKnown, false, new List <string>(), out var unknownMods);

            // assert that mods on proteins are the same before/after task is run
            Assert.AreEqual(protein1Variants.First().Accession, proteinsLoaded.First().Accession);
            Assert.AreEqual(protein1Variants.First().OneBasedPossibleLocalizedModifications.Count(), proteinsLoaded.First().OneBasedPossibleLocalizedModifications.Count());
            Assert.AreEqual(protein2.OneBasedPossibleLocalizedModifications.Count(), proteinsLoaded.ElementAt(1).OneBasedPossibleLocalizedModifications.Count());

            // assert that protein pruned DB has correct proteins mods
            var proteinPruned = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, @"PrunedDbTest/fakeDbproteinPruned.xml"), true, DecoyType.None, GlobalVariables.AllModsKnown, false, new List <string>(), out var unknownMods1);

            Assert.That(proteinPruned.Count().Equals(1));
            Assert.That(proteinPruned.FirstOrDefault().OneBasedPossibleLocalizedModifications.Count().Equals(1));
            // assert that mod-pruned DB has correct proteins and mods
            var modPruned = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, @"PrunedDbTest/fakeDbpruned.xml"), true, DecoyType.None, GlobalVariables.AllModsKnown, false, new List <string>(), out var unknownMods2);

            Assert.That(modPruned.Count().Equals(2));
            Assert.That(modPruned.ElementAt(0).OneBasedPossibleLocalizedModifications.Count().Equals(1));
            Assert.That(modPruned.ElementAt(1).OneBasedPossibleLocalizedModifications.Count().Equals(1));
        }
Пример #14
0
 /// <summary>
 /// Smoke test: loads xml2.xml while passing null for both list arguments of
 /// LoadProteinXML. There are no assertions; the test passes if no exception is thrown.
 /// </summary>
 public void ReadXmlNulls()
 {
     string databasePath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"xml2.xml");
     Dictionary<string, Modification> un;

     var ok = ProteinDbLoader.LoadProteinXML(databasePath, true, DecoyType.None, null, false, null, out un);
 }
Пример #15
0
        /// <summary>
        /// Merges two or more protein XML databases given on the command line into a single
        /// XML database named "merged_database_&lt;timestamp&gt;.xml" in the application's base directory.
        /// </summary>
        /// <param name="args">
        /// Candidate database paths. Only paths that exist on disk and end in ".xml" or ".xml.gz"
        /// are used; if fewer than two remain, a usage message is printed and nothing is written.
        /// </param>
        static void Main(string[] args)
        {
            // Path.GetExtension only returns the last extension (".gz" for "db.xml.gz"), so a
            // comparison against ".xml.gz" can never succeed; gzipped databases are recognised
            // by their full suffix instead.
            List <string> files = args.Where(f => File.Exists(f) && (Path.GetExtension(f) == ".xml" || f.EndsWith(".xml.gz", StringComparison.Ordinal))).ToList();

            if (files.Count < 2)
            {
                Console.WriteLine("Please enter at least two protein .xml or .xml.gz databases.");
                return;
            }

            // Build the output name from a single clock reading so the fields cannot straddle a
            // second/minute/day boundary; the invariant culture keeps the Gregorian yyyy-MM-dd-HH-mm-ss layout.
            string timestamp = DateTime.Now.ToString("yyyy-MM-dd-HH-mm-ss", System.Globalization.CultureInfo.InvariantCulture);
            string outpath   = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "merged_database_" + timestamp + ".xml");

            // merge databases
            Loaders.LoadElements(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "elements.dat"));
            List <Protein> merged = ProteinDbLoader.merge_proteins(files.SelectMany(f => ProteinDbLoader.LoadProteinXML(f, false, new List <Modification>(), false, new List <string>(), out Dictionary <string, Modification> un))).ToList();

            ProteinDbWriter.WriteXmlDatabase(new Dictionary <string, HashSet <Tuple <int, Modification> > >(), merged, outpath);
        }
        /// <summary>
        /// Rebuilds the theoretical database: reads the PTM lists and protein XML databases among the
        /// input files, merges in the modification files from the "Mods" folder, rebuilds the
        /// modification and PTM-set lookups, expands the protein entries and hands them to
        /// process_entries / process_decoys. Returns immediately when ready_to_make_database is false.
        /// </summary>
        /// <param name="current_directory">
        /// Directory that contains the "Mods" folder; also passed to ready_to_make_database.
        /// </param>
        public void get_theoretical_proteoforms(string current_directory)
        {
            if (!ready_to_make_database(current_directory))
            {
                return;
            }

            //Clear out data from potential previous runs
            foreach (ProteoformCommunity community in SaveState.lollipop.decoy_proteoform_communities.Values)
            {
                community.theoretical_proteoforms = new TheoreticalProteoform[0];
            }
            theoretical_proteins.Clear();

            //Read the UniProt-XML and ptmlist
            List <ModificationWithLocation> all_known_modifications = SaveState.lollipop.get_files(SaveState.lollipop.input_files, Purpose.PtmList).SelectMany(file => PtmListLoader.ReadModsFromFile(file.complete_path)).ToList();

            uniprotModifications = make_modification_dictionary(all_known_modifications);

            Parallel.ForEach(SaveState.lollipop.get_files(SaveState.lollipop.input_files, Purpose.ProteinDatabase).ToList(), database =>
            {
                // Both steps share ONE lock. LoadProteinXML is handed all_known_modifications, so
                // guarding the AddRange with a different lock (as before) allowed one thread to grow
                // the list while another was still reading it. The load-then-read-PTM-list order is
                // kept exactly as it was.
                lock (theoretical_proteins)
                {
                    theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinXML(database.complete_path, false, all_known_modifications, database.ContaminantDB, SaveState.lollipop.mod_types_to_exclude, out Dictionary <string, Modification> um).ToArray());
                    all_known_modifications.AddRange(ProteinDbLoader.GetPtmListFromProteinXml(database.complete_path).OfType <ModificationWithLocation>().Where(m => !SaveState.lollipop.mod_types_to_exclude.Contains(m.modificationType)));
                }
            });

            foreach (string filename in Directory.GetFiles(Path.Combine(current_directory, "Mods")))
            {
                var new_mods = !filename.EndsWith("variable.txt") || SaveState.lollipop.methionine_oxidation ?
                               PtmListLoader.ReadModsFromFile(filename) :
                               new List <ModificationWithLocation>(); // Empty variable modifications if not selected
                if (filename.EndsWith("variable.txt"))
                {
                    variableModifications = new_mods.OfType <ModificationWithMass>().ToList();
                }
                if (filename.EndsWith("intact_mods.txt"))
                {
                    // Hash set instead of a list: the membership test below runs once per candidate mod.
                    HashSet <double> old_mods = new HashSet <double>(all_known_modifications.OfType <ModificationWithMass>().Select(m => m.monoisotopicMass));
                    new_mods = new_mods.OfType <ModificationWithMass>().Where(m => !old_mods.Contains(m.monoisotopicMass)); // get rid of the unlocalized mods if they're already present
                }
                all_known_modifications.AddRange(new_mods);
            }

            // De-duplicate, then rebuild every lookup that depends on the full modification list.
            all_known_modifications = new HashSet <ModificationWithLocation>(all_known_modifications).ToList();
            uniprotModifications    = make_modification_dictionary(all_known_modifications);
            all_mods_with_mass      = uniprotModifications.SelectMany(kv => kv.Value).OfType <ModificationWithMass>().Concat(variableModifications).ToList();
            SaveState.lollipop.modification_ranks = rank_mods(theoretical_proteins, variableModifications, all_mods_with_mass);

            unlocalized_lookup = make_unlocalized_lookup(all_mods_with_mass.Concat(new List <ModificationWithMass> {
                new Ptm().modification
            }));
            // NOTE(review): this path is built from Environment.CurrentDirectory, not current_directory - confirm intended.
            load_unlocalized_names(Path.Combine(Environment.CurrentDirectory, "Mods", "stored_mods.modnames"));

            //Generate all two-member sets and all three-member (or greater) sets of the same modification (three-member combinatorics gets out of hand for assignment)
            all_possible_ptmsets = PtmCombos.generate_all_ptmsets(Math.Min(2, SaveState.lollipop.max_ptms), all_mods_with_mass, SaveState.lollipop.modification_ranks, SaveState.lollipop.mod_rank_first_quartile / 2).ToList();
            for (int i = 2; i < SaveState.lollipop.max_ptms + 1; i++)
            {
                all_possible_ptmsets.AddRange(all_mods_with_mass.Select(m => new PtmSet(Enumerable.Repeat(new Ptm(-1, m), i).ToList(), SaveState.lollipop.modification_ranks, SaveState.lollipop.mod_rank_first_quartile / 2)));
            }

            //Generate lookup table for ptm sets based on rounded mass of eligible PTMs -- used in forming ET relations
            possible_ptmset_dictionary = make_ptmset_dictionary();

            expanded_proteins = expand_protein_entries(theoretical_proteins.Values.SelectMany(p => p).ToArray());
            aaIsotopeMassList = new AminoAcidMasses(SaveState.lollipop.carbamidomethylation, SaveState.lollipop.natural_lysine_isotope_abundance, SaveState.lollipop.neucode_light_lysine, SaveState.lollipop.neucode_heavy_lysine).AA_Masses;
            if (SaveState.lollipop.combine_identical_sequences)
            {
                expanded_proteins = group_proteins_by_sequence(expanded_proteins);
            }

            // Entries are sorted by ascending count of possible localized modifications before processing.
            expanded_proteins = expanded_proteins.OrderBy(x => x.OneBasedPossibleLocalizedModifications.Count).ToArray();
            process_entries(expanded_proteins, variableModifications);
            process_decoys(expanded_proteins, variableModifications);

            if (SaveState.lollipop.combine_theoretical_proteoforms_byMass)
            {
                SaveState.lollipop.target_proteoform_community.theoretical_proteoforms = group_proteoforms_by_mass(SaveState.lollipop.target_proteoform_community.theoretical_proteoforms);
                foreach (ProteoformCommunity community in SaveState.lollipop.decoy_proteoform_communities.Values)
                {
                    community.theoretical_proteoforms = group_proteoforms_by_mass(community.theoretical_proteoforms);
                }
            }
        }
Пример #17
0
        /// <summary>
        /// Checks that sequence variations (single-residue, multi-residue, insertion, deletion and an
        /// insertion carrying a modification) are applied identically when variant proteins are
        /// generated twice in memory and when the proteins are written to XML and read back.
        /// </summary>
        public static void AppliedVariants()
        {
            // Every variation in this test shares the same VCF-style description line.
            const string vcfLine = @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30";

            ModificationMotif.TryGetMotif("P", out ModificationMotif motifP);
            Modification mp = new Modification("mod", null, "type", null, motifP, "Anywhere.", null, 42.01, new Dictionary<string, IList<string>>(), null, null, null, null, null);

            // Wraps a single variation into a protein with the given sequence and accession.
            Protein WithVariation(string sequence, string accession, SequenceVariation variation)
            {
                return new Protein(sequence, accession, sequenceVariations: new List<SequenceVariation> { variation });
            }

            var proteinsWithSeqVars = new List<Protein>();
            proteinsWithSeqVars.Add(WithVariation("MPEPTIDE", "protein1", new SequenceVariation(4, 4, "P", "V", vcfLine, null)));
            proteinsWithSeqVars.Add(WithVariation("MPEPTIDE", "protein2", new SequenceVariation(4, 5, "PT", "KT", vcfLine, null)));
            proteinsWithSeqVars.Add(WithVariation("MPEPTIDE", "protein3", new SequenceVariation(4, 4, "P", "PPP", vcfLine, null)));
            proteinsWithSeqVars.Add(WithVariation("MPEPPPTIDE", "protein4", new SequenceVariation(4, 6, "PPP", "P", vcfLine, null)));
            proteinsWithSeqVars.Add(WithVariation("MPEPTIDE", "protein5", new SequenceVariation(4, 4, "P", "PPP", vcfLine,
                new Dictionary<int, List<Modification>> { { 5, new List<Modification> { mp } } })));

            // Two in-memory expansions (the second verifies the result is stable) ...
            var firstExpansion  = proteinsWithSeqVars.SelectMany(p => p.GetVariantProteins()).ToList();
            var secondExpansion = proteinsWithSeqVars.SelectMany(p => p.GetVariantProteins()).ToList();

            // ... and one round trip through the XML writer and loader.
            string xml = Path.Combine(TestContext.CurrentContext.TestDirectory, "AppliedVariants.xml");
            ProteinDbWriter.WriteXmlDatabase(null, proteinsWithSeqVars, xml);
            var reloaded = ProteinDbLoader.LoadProteinXML(xml, true, DecoyType.None, null, false, null, out var un);

            foreach (var variantProteins in new[] { firstExpansion, secondExpansion, reloaded })
            {
                // sequences
                Assert.AreEqual("MPEVTIDE", variantProteins[0].BaseSequence);
                Assert.AreEqual("MPEKTIDE", variantProteins[1].BaseSequence);
                Assert.AreEqual("MPEPPPTIDE", variantProteins[2].BaseSequence);
                Assert.AreEqual("MPEPTIDE", variantProteins[3].BaseSequence);
                Assert.AreEqual("MPEPPPTIDE", variantProteins[4].BaseSequence);
                Assert.AreEqual(5, variantProteins[4].OneBasedPossibleLocalizedModifications.Single().Key);

                // SAV
                var singleResidue = variantProteins[0].AppliedSequenceVariations.Single();
                Assert.AreEqual(4, singleResidue.OneBasedBeginPosition);
                Assert.AreEqual(4, singleResidue.OneBasedEndPosition);

                // MNV
                var multiResidue = variantProteins[1].AppliedSequenceVariations.Single();
                Assert.AreEqual(4, multiResidue.OneBasedBeginPosition);
                Assert.AreEqual(5, multiResidue.OneBasedEndPosition);

                // insertion
                var insertion = variantProteins[2].AppliedSequenceVariations.Single();
                Assert.AreEqual(4, insertion.OneBasedBeginPosition);
                Assert.AreEqual(6, insertion.OneBasedEndPosition);

                // deletion
                var deletion = variantProteins[3].AppliedSequenceVariations.Single();
                Assert.AreEqual(4, deletion.OneBasedBeginPosition);
                Assert.AreEqual(4, deletion.OneBasedEndPosition);
            }
        }
        /// <summary>
        /// Prints a summary of a variant-containing protein database to the console: the number of
        /// canonical and variant protein entries, and the number of unique applied sequence
        /// variations per category (synonymous, missense SNV/MNV, frameshift, insertion, deletion,
        /// stop gain, stop loss). Categories are detected by searching each variation's description text.
        /// </summary>
        /// <param name="sourceXmlPath">Path of the source protein XML; it is loaded, but its proteins are not used in the summary.</param>
        /// <param name="destinationXmlPath">Path of the protein XML database that is summarised.</param>
        public static void DatabaseSummary(string sourceXmlPath, string destinationXmlPath)
        {
            var uniprotPtms = ProteinAnnotation.GetUniProtMods(Environment.CurrentDirectory);

            // The source load is kept as-is even though its result is unused, so any effect of
            // loading that file (including failing on a bad path) is unchanged.
            var uniprot         = ProteinDbLoader.LoadProteinXML(sourceXmlPath, true, DecoyType.None, uniprotPtms, false, null, out var un);
            var spritz          = ProteinDbLoader.LoadProteinXML(destinationXmlPath, true, DecoyType.None, uniprotPtms, false, null, out un);
            var spritzCanonical = spritz.Select(p => p.NonVariantProtein).Distinct().ToList();
            int numberOfCanonicalProteinEntries = spritzCanonical.Count;
            int numberOfVariantProteinEntries   = spritz.Count - spritzCanonical.Count;

            // Collect the unique applied variations per canonical accession.
            var allVariants = new Dictionary <string, List <SequenceVariation> >();

            foreach (var spritzEntry in spritz)
            {
                if (spritzEntry.AppliedSequenceVariations.Count == 0)
                {
                    continue;
                }

                var accession = spritzEntry.NonVariantProtein.Accession;
                if (allVariants.TryGetValue(accession, out List <SequenceVariation> known))
                {
                    foreach (var variant in spritzEntry.AppliedSequenceVariations)
                    {
                        if (!known.Contains(variant))
                        {
                            known.Add(variant);
                        }
                    }
                }
                else
                {
                    // Store a copy: registering the protein's own list would let the Add above
                    // silently append other entries' variations to this protein.
                    allVariants.Add(accession, new List <SequenceVariation>(spritzEntry.AppliedSequenceVariations));
                }
            }

            // Annotation terms are machine-generated tokens, so they are matched ordinally
            // (case-insensitive) rather than with culture-specific rules.
            bool Mentions(SequenceVariation variant, string effect)
            {
                return variant.Description.Description.IndexOf(effect, StringComparison.OrdinalIgnoreCase) >= 0;
            }

            int synonymousCount  = 0;
            int missenseSnvCount = 0;
            int missenseMnvCount = 0;
            int insertionCount   = 0;
            int deletionCount    = 0;
            int frameshiftCount  = 0;
            int stopGainCount    = 0;
            int stopLossCount    = 0;

            // Each variation is counted in the first category it matches; the order of the checks matters.
            foreach (var variant in allVariants.Values.SelectMany(variants => variants))
            {
                if (Mentions(variant, "synonymous_variant"))
                {
                    synonymousCount++;
                }
                else if (Mentions(variant, "missense_variant"))
                {
                    // A missense change with single-base reference and alternate alleles is an SNV; anything else is an MNV.
                    if (variant.Description.ReferenceAlleleString.Length == 1 && variant.Description.AlternateAlleleString.Length == 1)
                    {
                        missenseSnvCount++;
                    }
                    else
                    {
                        missenseMnvCount++;
                    }
                }
                else if (Mentions(variant, "frameshift_variant"))
                {
                    frameshiftCount++;
                }
                else if (Mentions(variant, "stop_gained"))
                {
                    stopGainCount++;
                }
                else if (Mentions(variant, "conservative_inframe_insertion") || Mentions(variant, "disruptive_inframe_insertion"))
                {
                    insertionCount++;
                }
                else if (Mentions(variant, "conservative_inframe_deletion") || Mentions(variant, "disruptive_inframe_deletion"))
                {
                    deletionCount++;
                }
                else if (Mentions(variant, "stop_lost") || Mentions(variant, "stop_loss"))
                {
                    // The Sequence Ontology term used in ANN annotations is "stop_lost"; the previous
                    // "stop_loss" spelling is still accepted so no formerly matching input is lost.
                    stopLossCount++;
                }
            }

            // Only categorised variations contribute to the total, exactly as before.
            int totalVariants = synonymousCount + missenseSnvCount + missenseMnvCount + frameshiftCount
                                + stopGainCount + insertionCount + deletionCount + stopLossCount;

            Console.WriteLine($"Spritz Database Summary");
            Console.WriteLine($"--------------------------------------------------------------");
            Console.WriteLine($"{numberOfCanonicalProteinEntries}\tTotal number of canonical protein entries (before applying variations)");
            Console.WriteLine($"{spritz.Count}\tTotal number of protein entries");
            Console.WriteLine($"{numberOfVariantProteinEntries}\tTotal number of variant containing protein entries");
            Console.WriteLine($"{totalVariants}\tTotal number of unique variants");
            Console.WriteLine($"{synonymousCount}\tTotal number of unique synonymous variants");
            Console.WriteLine($"{(totalVariants - synonymousCount)}\tTotal number of unique nonsynonymous variants");
            Console.WriteLine($"{missenseSnvCount}\tNumber of unique SNV missense variants");
            Console.WriteLine($"{missenseMnvCount}\tNumber of unique MNV missense variants");
            Console.WriteLine($"{frameshiftCount}\tNumber of unique frameshift variants");
            Console.WriteLine($"{insertionCount}\tNumber of unique insertion variants");
            Console.WriteLine($"{deletionCount}\tNumber of unique deletion variants");
            Console.WriteLine($"{stopGainCount}\tNumber of unique stop gain variants");
            Console.WriteLine($"{stopLossCount}\tNumber of unique stop loss variants");
        }
Пример #19
0
        /// <summary>
        /// Rebuilds the theoretical database: reads the PSI-MOD file, the PTM lists and the protein
        /// databases (XML or FASTA) among the input files, merges in the modification files from the
        /// "Mods" folder, rebuilds the modification and PTM-set lookups, and finally calls
        /// make_theoretical_proteoforms. Returns immediately when ready_to_make_database is false.
        /// </summary>
        /// <param name="current_directory">
        /// Directory that contains the "Mods" folder; also passed to ready_to_make_database.
        /// </param>
        public void get_theoretical_proteoforms(string current_directory)
        {
            if (!ready_to_make_database(current_directory))
            {
                return;
            }

            //Clear out data from potential previous runs
            foreach (ProteoformCommunity community in Sweet.lollipop.decoy_proteoform_communities.Values)
            {
                community.theoretical_proteoforms = new TheoreticalProteoform[0];
            }

            theoretical_proteins.Clear();

            //Read the UniProt-XML and ptmlist
            var psiModDeserialized = Loaders.LoadPsiMod(Path.Combine(current_directory, "Mods", "PSI-MOD.obo.xml"));
            Dictionary <string, int> formalChargesDictionary = Loaders.GetFormalChargesDictionary(psiModDeserialized);

            List <Modification> all_known_modifications = Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.PtmList)
                                                          .SelectMany(file => PtmListLoader.ReadModsFromFile(file.complete_path, formalChargesDictionary, out List <(Modification, string)> filteredModificationsWithWarnings))
                                                          .ToList();

            uniprotModifications = make_modification_dictionary(all_known_modifications);
            Parallel.ForEach(Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.ProteinDatabase).ToList(), database =>
            {
                if (database.extension == ".xml")
                {
                    // Both steps share ONE lock. LoadProteinXML is handed all_known_modifications, so
                    // guarding the AddRange with a different lock (as before) allowed one thread to
                    // grow the list while another was still reading it. The statement order is unchanged.
                    lock (theoretical_proteins)
                    {
                        theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinXML(database.complete_path, true, DecoyType.None, all_known_modifications, database.ContaminantDB, Sweet.lollipop.mod_types_to_exclude, out Dictionary <string, Modification> um).ToArray());
                        all_known_modifications.AddRange(ProteinDbLoader.GetPtmListFromProteinXml(database.complete_path).Where(m => !Sweet.lollipop.mod_types_to_exclude.Contains(m.ModificationType)));
                    }
                }
                else if (database.extension == ".fasta")
                {
                    // NOTE(review): UniprotFullNameRegex is passed for two consecutive regex arguments -
                    // confirm against the LoadProteinFasta signature that the second is not meant to be a different regex.
                    lock (theoretical_proteins)
                    {
                        theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinFasta(database.complete_path, true, DecoyType.None, database.ContaminantDB, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex,
                                                                                            ProteinDbLoader.UniprotOrganismRegex, out var dbErrors).ToArray());
                    }
                }
            });

            foreach (string filename in Directory.GetFiles(Path.Combine(current_directory, "Mods")))
            {
                List <Modification> new_mods = !filename.EndsWith("variable.txt") || Sweet.lollipop.methionine_oxidation ?
                                               PtmListLoader.ReadModsFromFile(filename, formalChargesDictionary, out List <(Modification, string)> filteredModificationsWithWarnings).ToList() :
                                               new List <Modification>(); // Empty variable modifications if not selected
                if (filename.EndsWith("variable.txt"))
                {
                    variableModifications = new_mods;
                }
                all_known_modifications.AddRange(new_mods);
            }

            // De-duplicate, then rebuild every lookup that depends on the full modification list.
            all_known_modifications = new HashSet <Modification>(all_known_modifications).ToList();
            uniprotModifications    = make_modification_dictionary(all_known_modifications);

            all_mods_with_mass = uniprotModifications.SelectMany(kv => kv.Value).Concat(variableModifications).ToList();
            Sweet.lollipop.modification_ranks = rank_mods(theoretical_proteins, variableModifications, all_mods_with_mass);

            unlocalized_lookup = make_unlocalized_lookup(all_mods_with_mass.Concat(new List <Modification> {
                new Ptm().modification
            }));
            // NOTE(review): this path is built from Environment.CurrentDirectory, not current_directory - confirm intended.
            load_unlocalized_names(Path.Combine(Environment.CurrentDirectory, "Mods", "stored_mods.modnames"));

            //this is for ptmsets --> used in RELATIONS
            all_possible_ptmsets = PtmCombos.generate_all_ptmsets(2, all_mods_with_mass, Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2).ToList();
            for (int i = 2; i <= Math.Max(ptmset_max_number_of_a_kind, Sweet.lollipop.max_ptms); i++) // the method above doesn't make 2 or more of a kind, so we make it here
            {
                all_possible_ptmsets.AddRange(all_mods_with_mass.Select(m => new PtmSet(Enumerable.Repeat(new Ptm(-1, m), i).ToList(), Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2)));
            }

            //Generate lookup table for ptm sets based on rounded mass of eligible PTMs -- used in forming ET relations
            possible_ptmset_dictionary = make_ptmset_dictionary();
            make_theoretical_proteoforms();
        }
Пример #20
0
        /// <summary>
        /// Round-trip test: builds a protein carrying modifications, gene names, proteolysis products,
        /// database references, sequence variations and disulfide bonds, writes it to an XML database,
        /// reads it back, and checks that the loaded protein matches the original field by field.
        /// </summary>
        public void TestFullProteinReadWrite()
        {
            // Three modifications of the same type: one without a motif, one on E, one on N (the last has the value 10 in its 8th argument).
            Modification motiflessMod = new Modification("mod1", null, "modType1", null, null, null, null, null, null, null, null, null, null, null);

            ModificationMotif.TryGetMotif("E", out ModificationMotif motifE);
            Modification modOnE = new Modification("mod2 on E", null, "modType1", null, motifE, "Anywhere.", null, null, null, null, null, null, null, null);

            ModificationMotif.TryGetMotif("N", out ModificationMotif motifN);
            Modification modOnN = new Modification("mod3 on N", null, "modType1", null, motifN, "Anywhere.", null, 10, null, null, null, null, null, null);

            var geneNamePairs = new List<Tuple<string, string>>
            {
                new Tuple<string, string>("a", "b")
            };

            // One modification at each of positions 3, 4 and 5.
            IDictionary<int, List<Modification>> modsByPosition = new Dictionary<int, List<Modification>>
            {
                [3] = new List<Modification> { motiflessMod },
                [4] = new List<Modification> { modOnE },
                [5] = new List<Modification> { modOnN }
            };

            var products = new List<ProteolysisProduct>
            {
                new ProteolysisProduct(1, 2, "propeptide")
            };

            string shortName = "testName";
            string longName = "testFullName";

            var references = new List<DatabaseReference>
            {
                new DatabaseReference("type1", "id1", new List<Tuple<string, string>>
                {
                    new Tuple<string, string>("e1", "e2")
                })
            };

            var variations = new List<SequenceVariation>
            {
                new SequenceVariation(3, "Q", "N", "replace Q by N"),
                new SequenceVariation(3, 4, "QE", "NN", "replace QE by NN")
            };

            var bonds = new List<DisulfideBond>
            {
                new DisulfideBond(1, "ds1"),
                new DisulfideBond(2, 3, "ds2")
            };

            // The same file is used as the protein's recorded source path, the write target and the read source.
            string xmlPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"bnueiwhf.xml");

            Protein p1 = new Protein(
                "SEQENCE",
                "a1",
                geneNames: geneNamePairs,
                oneBasedModifications: modsByPosition,
                proteolysisProducts: products,
                name: shortName,
                fullName: longName,
                isDecoy: false,
                isContaminant: true,
                databaseReferences: references,
                sequenceVariations: variations,
                disulfideBonds: bonds,
                databaseFilePath: xmlPath);

            // Write the single protein out, with no additional modifications to append.
            var noExtraMods = new Dictionary<string, HashSet<Tuple<int, Modification>>>();
            ProteinDbWriter.WriteXmlDatabase(noExtraMods, new List<Protein> { p1 }, xmlPath);

            // Read it back with no known modifications and nothing excluded.
            IEnumerable<string> excludedModTypes = new List<string>();
            IEnumerable<Modification> knownMods = new List<Modification>();
            List<Protein> ok = ProteinDbLoader.LoadProteinXML(xmlPath, true, DecoyType.None, knownMods, true, excludedModTypes,
                                                              out Dictionary<string, Modification> unknownModifications);
            Protein loaded = ok[0];

            Assert.AreEqual(p1.Accession, loaded.Accession);
            Assert.AreEqual(p1.BaseSequence, loaded.BaseSequence);

            // Database reference and its first property
            var expectedReference = p1.DatabaseReferences.First();
            var actualReference = loaded.DatabaseReferences.First();
            Assert.AreEqual(expectedReference.Id, actualReference.Id);
            var expectedProperty = expectedReference.Properties.First();
            var actualProperty = actualReference.Properties.First();
            Assert.AreEqual(expectedProperty.Item1, actualProperty.Item1);
            Assert.AreEqual(expectedProperty.Item2, actualProperty.Item2);
            Assert.AreEqual(expectedReference.Type, actualReference.Type);

            // Disulfide bonds: first and last
            var expectedFirstBond = p1.DisulfideBonds.First();
            var actualFirstBond = loaded.DisulfideBonds.First();
            Assert.AreEqual(expectedFirstBond.Description, actualFirstBond.Description);
            Assert.AreEqual(expectedFirstBond.OneBasedBeginPosition, actualFirstBond.OneBasedBeginPosition);
            Assert.AreEqual(expectedFirstBond.OneBasedEndPosition, actualFirstBond.OneBasedEndPosition);
            var expectedLastBond = p1.DisulfideBonds.Last();
            var actualLastBond = loaded.DisulfideBonds.Last();
            Assert.AreEqual(expectedLastBond.Description, actualLastBond.Description);
            Assert.AreEqual(expectedLastBond.OneBasedBeginPosition, actualLastBond.OneBasedBeginPosition);
            Assert.AreEqual(expectedLastBond.OneBasedEndPosition, actualLastBond.OneBasedEndPosition);

            // Scalar protein properties
            Assert.AreEqual(p1.FullDescription, loaded.FullDescription);
            Assert.AreEqual(p1.FullName, loaded.FullName);
            Assert.AreEqual(p1.GeneNames, loaded.GeneNames);
            Assert.AreEqual(p1.IsContaminant, loaded.IsContaminant);
            Assert.AreEqual(p1.IsDecoy, loaded.IsDecoy);
            Assert.AreEqual(p1.Length, loaded.Length);
            Assert.AreEqual(p1.Name, loaded.Name);
            Assert.AreEqual(p1.Organism, loaded.Organism);
            Assert.AreEqual(p1.DatabaseFilePath, loaded.DatabaseFilePath);

            // Both the constructed and the reloaded protein are expected to expose exactly one
            // modified position, and the modification at position 5 must match.
            var expectedModPositions = p1.OneBasedPossibleLocalizedModifications;
            var actualModPositions = loaded.OneBasedPossibleLocalizedModifications;
            Assert.AreEqual(1, expectedModPositions.Keys.Count);
            Assert.AreEqual(1, actualModPositions.Keys.Count);
            Assert.AreEqual(expectedModPositions.Keys.First(), actualModPositions.Keys.First());
            Assert.IsTrue(expectedModPositions[5][0].Equals(actualModPositions[5][0]));

            // Proteolysis product
            var expectedProduct = p1.ProteolysisProducts.First();
            var actualProduct = loaded.ProteolysisProducts.First();
            Assert.AreEqual(expectedProduct.OneBasedBeginPosition, actualProduct.OneBasedBeginPosition);
            Assert.AreEqual(expectedProduct.OneBasedEndPosition, actualProduct.OneBasedEndPosition);
            Assert.AreEqual(expectedProduct.Type, actualProduct.Type);

            // Sequence variations: first and last
            var expectedFirstVariation = p1.SequenceVariations.First();
            var actualFirstVariation = loaded.SequenceVariations.First();
            Assert.AreEqual(expectedFirstVariation.Description, actualFirstVariation.Description);
            Assert.AreEqual(expectedFirstVariation.OneBasedBeginPosition, actualFirstVariation.OneBasedBeginPosition);
            Assert.AreEqual(expectedFirstVariation.OneBasedEndPosition, actualFirstVariation.OneBasedEndPosition);
            Assert.AreEqual(expectedFirstVariation.OriginalSequence, actualFirstVariation.OriginalSequence);
            Assert.AreEqual(expectedFirstVariation.VariantSequence, actualFirstVariation.VariantSequence);
            var expectedLastVariation = p1.SequenceVariations.Last();
            var actualLastVariation = loaded.SequenceVariations.Last();
            Assert.AreEqual(expectedLastVariation.Description, actualLastVariation.Description);
            Assert.AreEqual(expectedLastVariation.OneBasedBeginPosition, actualLastVariation.OneBasedBeginPosition);
            Assert.AreEqual(expectedLastVariation.OneBasedEndPosition, actualLastVariation.OneBasedEndPosition);
            Assert.AreEqual(expectedLastVariation.OriginalSequence, actualLastVariation.OriginalSequence);
            Assert.AreEqual(expectedLastVariation.VariantSequence, actualLastVariation.VariantSequence);
        }
        /// <summary>
        /// Rebuilds the theoretical database: clears state left by a previous run, loads every
        /// protein database input file (.xml or .fasta), ranks modifications, regenerates the PTM set
        /// lookup structures, groups bottom-up PSMs by accession, and finally creates the theoretical proteoforms.
        /// </summary>
        /// <param name="current_directory">Directory passed to <c>get_mods</c> and used as the root of the "Mods" folder.</param>
        public void get_theoretical_proteoforms(string current_directory)
        {
            // Reset anything left over from an earlier run
            foreach (ProteoformCommunity decoy_community in Sweet.lollipop.decoy_proteoform_communities.Values)
            {
                decoy_community.theoretical_proteoforms = new TheoreticalProteoform[0];
            }

            theoretical_proteins.Clear();

            // Load the modification list, then each protein database
            List<Modification> all_known_modifications = get_mods(current_directory);

            var protein_databases = Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.ProteinDatabase).ToList();
            foreach (var database in protein_databases)
            {
                if (database.extension == ".xml")
                {
                    lock (theoretical_proteins)
                    {
                        var xml_proteins = ProteinDbLoader.LoadProteinXML(database.complete_path, true, DecoyType.None, all_known_modifications, database.ContaminantDB, Sweet.lollipop.mod_types_to_exclude, out Dictionary<string, Modification> unknown_mods).ToArray();
                        theoretical_proteins.Add(database, xml_proteins);
                    }

                    // Modifications declared inside the XML itself extend the known list, minus excluded types
                    lock (all_known_modifications)
                    {
                        all_known_modifications.AddRange(
                            ProteinDbLoader.GetPtmListFromProteinXml(database.complete_path)
                                           .Where(xml_mod => !Sweet.lollipop.mod_types_to_exclude.Contains(xml_mod.ModificationType)));
                    }
                }
                else if (database.extension == ".fasta")
                {
                    lock (theoretical_proteins)
                    {
                        var fasta_proteins = ProteinDbLoader.LoadProteinFasta(
                            database.complete_path,
                            true,
                            DecoyType.None,
                            database.ContaminantDB,
                            ProteinDbLoader.UniprotAccessionRegex,
                            ProteinDbLoader.UniprotFullNameRegex,
                            ProteinDbLoader.UniprotFullNameRegex,
                            ProteinDbLoader.UniprotGeneNameRegex,
                            ProteinDbLoader.UniprotOrganismRegex,
                            out var fasta_errors).ToArray();
                        theoretical_proteins.Add(database, fasta_proteins);
                    }
                }
            }

            Sweet.lollipop.modification_ranks = rank_mods(theoretical_proteins, variableModifications, all_mods_with_mass);

            // Unlocalized lookup covers every mass-bearing modification plus the modification of a default Ptm
            var default_ptm_modification = new List<Modification>
            {
                new Ptm().modification
            };
            unlocalized_lookup = make_unlocalized_lookup(all_mods_with_mass.Concat(default_ptm_modification));
            load_unlocalized_names(Path.Combine(current_directory, "Mods", "stored_mods.modnames"));

            // PTM sets used when forming relations
            all_possible_ptmsets = PtmCombos.generate_all_ptmsets(2, all_mods_with_mass, Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2).ToList();

            // generate_all_ptmsets does not produce two or more of a kind, so those sets are added here
            for (int copies = 2; copies <= Math.Max(ptmset_max_number_of_a_kind, Sweet.lollipop.max_ptms); copies++)
            {
                all_possible_ptmsets.AddRange(
                    all_mods_with_mass.Select(mod => new PtmSet(
                        Enumerable.Repeat(new Ptm(-1, mod), copies).ToList(),
                        Sweet.lollipop.modification_ranks,
                        Sweet.lollipop.mod_rank_first_quartile / 2)));
            }

            // Lookup table of PTM sets keyed by rounded mass -- used in forming ET relations
            possible_ptmset_dictionary = make_ptmset_dictionary();

            // Group bottom-up PSMs by accession
            bottom_up_psm_by_accession.Clear();
            foreach (var bottom_up_file in Sweet.lollipop.input_files.Where(f => f.purpose == Purpose.BottomUp))
            {
                foreach (var psm in Sweet.lollipop.bottomupReader.ReadTDFile(bottom_up_file))
                {
                    // Keep only the part of the accession before the first '_' and then before the first '-'
                    string before_underscore = psm.accession.Split('_')[0];
                    string accession = before_underscore.Split('-')[0];

                    bottom_up_psm_by_accession.TryGetValue(accession, out var existing_psms);
                    if (existing_psms != null)
                    {
                        existing_psms.Add(psm);
                    }
                    else
                    {
                        bottom_up_psm_by_accession.Add(accession, new List<SpectrumMatch> { psm });
                    }
                }
            }

            // Build the theoreticals from everything loaded above
            make_theoretical_proteoforms();
        }
Пример #22
0
        /// <summary>
        /// End-to-end check of the per-modification-type write selection for the pruned database.
        /// Four modification types are registered with selection values 0-3, a search task is run
        /// against a small XML database, and the resulting pruned XML is reloaded to verify which
        /// modifications were kept (see the assertions at the end for the expected outcome).
        /// </summary>
        public static void TestUserModSelectionInPrunedDB()
        {
            List<(string, string)> listOfModsFixed = new List<(string, string)>
            {
                ("Common Fixed", "Carbamidomethyl of C"), ("Common Fixed", "Carbamidomethyl of U")
            };

            //Create Search Task
            SearchTask task5 = new SearchTask
            {
                SearchParameters = new SearchParameters
                {
                    WritePrunedDatabase = true,
                    SearchTarget = true,
                    MassDiffAcceptorType = MassDiffAcceptorType.Exact,
                },
                CommonParameters = new CommonParameters(listOfModsFixed: listOfModsFixed)
            };

            // One selection value per modification type exercised by this test
            task5.SearchParameters.ModsToWriteSelection["Mod"] = 0;
            task5.SearchParameters.ModsToWriteSelection["Common Fixed"] = 1;
            task5.SearchParameters.ModsToWriteSelection["Glycan"] = 2;
            task5.SearchParameters.ModsToWriteSelection["missing"] = 3;

            //add task5 to task list
            List<(string, MetaMorpheusTask)> taskList = new List<(string, MetaMorpheusTask)>
            {
                ("task5", task5)
            };

            ModificationMotif.TryGetMotif("P", out ModificationMotif motif);
            ModificationMotif.TryGetMotif("E", out ModificationMotif motif2);

            var connorMod = new Modification(_originalId: "ModToNotAppear", _modificationType: "Mod", _target: motif, _locationRestriction: "Anywhere.", _monoisotopicMass: 10);
            var connorMod2 = new Modification(_originalId: "Default(Mod in DB and Observed)", _modificationType: "Common Fixed", _target: motif, _locationRestriction: "Anywhere.", _monoisotopicMass: 10);
            var connorMod3 = new Modification(_originalId: "ModToAlwaysAppear", _modificationType: "Glycan", _target: motif, _locationRestriction: "Anywhere.", _monoisotopicMass: 10);
            var connorMod4 = new Modification(_originalId: "ModObservedNotinDB", _modificationType: "missing", _target: motif2, _locationRestriction: "Anywhere.", _monoisotopicMass: 5);

            GlobalVariables.AddMods(new List<Modification>
            {
                connorMod,
                connorMod2,
                connorMod3,
                connorMod4
            }, false);

            //create modification lists
            List<Modification> variableModifications = GlobalVariables.AllModsKnown.OfType<Modification>()
                .Where(b => task5.CommonParameters.ListOfModsVariable.Contains((b.ModificationType, b.IdWithMotif))).ToList();
            List<Modification> fixedModifications = GlobalVariables.AllModsKnown.OfType<Modification>()
                .Where(b => task5.CommonParameters.ListOfModsFixed.Contains((b.ModificationType, b.IdWithMotif))).ToList();

            //add modification to Protein object
            var dictHere = new Dictionary<int, List<Modification>>();
            Modification modToAdd = connorMod;
            Modification modToAdd2 = connorMod2;
            Modification modToAdd3 = connorMod3;
            Modification modToAdd4 = connorMod4;

            //add fixed modification so we can test a mod that is observed but not in the DB
            fixedModifications.Add(connorMod4);
            listOfModsFixed.Add((connorMod4.ModificationType, connorMod4.IdWithMotif));

            dictHere.Add(1, new List<Modification> { modToAdd });
            dictHere.Add(2, new List<Modification> { modToAdd2 }); //default
            dictHere.Add(3, new List<Modification> { modToAdd3 }); //always appear

            var dictHere2 = new Dictionary<int, List<Modification>>
            {
                { 1, new List<Modification> { modToAdd } },
                { 2, new List<Modification> { modToAdd2 } }, //default
                { 3, new List<Modification> { modToAdd3 } }, //always appear
                { 4, new List<Modification> { modToAdd4 } }  //observed
            };

            //protein creation (one for the input DB, one that additionally carries the observed-only mod)
            Protein TestProteinWithModForDB = new Protein("PPPPPPPPPPE", "accession1", "organism", new List<Tuple<string, string>>(), dictHere);
            Protein TestProteinWithModObsevred = new Protein("PPPPPPPPPPE", "accession1", "organism", new List<Tuple<string, string>>(), dictHere2);

            //First Write XML Database
            string xmlName = "selectedMods.xml";
            string xmlName2 = "selectedModsObvs.xml";

            //Add Mod to list and write XML input database
            Dictionary<string, HashSet<Tuple<int, Modification>>> modList = new Dictionary<string, HashSet<Tuple<int, Modification>>>();
            var Hash = new HashSet<Tuple<int, Modification>>
            {
                new Tuple<int, Modification>(1, modToAdd),
                new Tuple<int, Modification>(2, modToAdd2),
                new Tuple<int, Modification>(3, modToAdd3),
                new Tuple<int, Modification>(4, modToAdd4), //Observed Only
            };

            modList.Add("test", Hash);
            ProteinDbWriter.WriteXmlDatabase(modList, new List<Protein> { TestProteinWithModForDB }, xmlName);

            //Add Observed Only
            modList.Add("test2", Hash);
            ProteinDbWriter.WriteXmlDatabase(modList, new List<Protein> { TestProteinWithModObsevred }, xmlName2);

            //now create MZML data from the database that includes the observed-only mod
            var protein = ProteinDbLoader.LoadProteinXML(xmlName2, true, DecoyType.Reverse, new List<Modification>(), false, new List<string>(), out Dictionary<string, Modification> ok);
            var digestedList = protein[0].Digest(task5.CommonParameters.DigestionParams, fixedModifications, variableModifications).ToList();

            //Take the first five digestion products as the peptides to simulate
            PeptideWithSetModifications pepWithSetMods1 = digestedList[0];
            PeptideWithSetModifications pepWithSetMods2 = digestedList[1];
            PeptideWithSetModifications pepWithSetMods3 = digestedList[2];
            PeptideWithSetModifications pepWithSetMods4 = digestedList[3];
            PeptideWithSetModifications pepWithSetMods5 = digestedList[4];

            //CUSTOM PEP
            MsDataFile myMsDataFile = new TestDataFile(new List<PeptideWithSetModifications>
            {
                pepWithSetMods1, pepWithSetMods2, pepWithSetMods3, pepWithSetMods4, pepWithSetMods5
            });
            string mzmlName = @"newMzml.mzML";

            IO.MzML.MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(myMsDataFile, mzmlName, false);

            //make sure this runs correctly
            //run!
            string outputFolder = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestUserModSelectionInPrunedDB");
            var engine = new EverythingRunnerEngine(
                taskList,
                new List<string> { mzmlName },
                new List<DbForTask> { new DbForTask(xmlName, false) },
                outputFolder);

            engine.Run();

            // NOTE(review): the pruned file is read from MySetUpClass.outputFolder rather than the local
            // outputFolder -- presumably the set-up class tracks the engine's output location; confirm.
            string final = Path.Combine(MySetUpClass.outputFolder, "task5", "selectedModspruned.xml");
            var proteins = ProteinDbLoader.LoadProteinXML(final, true, DecoyType.Reverse, new List<Modification>(), false, new List<string>(), out ok);
            var Dlist = proteins[0].GetVariantProteins().SelectMany(vp => vp.Digest(task5.CommonParameters.DigestionParams, fixedModifications, variableModifications)).ToList();

            // NUnit's contract is Assert.AreEqual(expected, actual); keeping that order makes
            // failure messages report the right value as "expected".
            Assert.AreEqual(1, Dlist[0].NumFixedMods);

            //check length
            Assert.AreEqual(3, proteins[0].OneBasedPossibleLocalizedModifications.Count);
            List<Modification> listOfLocalMods = new List<Modification>();

            listOfLocalMods.AddRange(proteins[0].OneBasedPossibleLocalizedModifications[2]);
            listOfLocalMods.AddRange(proteins[0].OneBasedPossibleLocalizedModifications[3]);
            listOfLocalMods.AddRange(proteins[0].OneBasedPossibleLocalizedModifications[11]);

            //check Type, count, ID
            Assert.AreEqual("Common Fixed", listOfLocalMods[0].ModificationType);
            Assert.AreEqual("missing", listOfLocalMods[2].ModificationType);
            Assert.IsFalse(listOfLocalMods.Contains(connorMod)); //make sure that mod set not to show up is not in mod list

            Assert.AreEqual("Default(Mod in DB and Observed) on P", listOfLocalMods[0].IdWithMotif);
            Assert.AreEqual("ModToAlwaysAppear on P", listOfLocalMods[1].IdWithMotif);
            //Makes sure Mod that was not in the DB but was observed is in pruned DB
            Assert.AreEqual("ModObservedNotinDB on E", listOfLocalMods[2].IdWithMotif);
            Assert.AreEqual(3, listOfLocalMods.Count);

            // Clean up the files and folder this test created
            Directory.Delete(outputFolder, true);
            File.Delete(mzmlName);
            File.Delete(xmlName);
            File.Delete(xmlName2);
        }
        /// <summary>
        /// Loads the source XML database and the destination ("spritz") XML database, writes two
        /// tab-separated tables describing the variant-containing entries of the destination database,
        /// and prints a summary of entry, modification and variant counts to the console.
        /// </summary>
        /// <param name="sourceXmlPath">Path of the XML database whose modification total is reported as the "out of" figure.</param>
        /// <param name="destinationXmlPath">Path of the XML database being summarized.</param>
        /// <param name="destinationAccessionToNameTable">Output file: one "accession, full name, base sequence" row per variant-containing entry.</param>
        /// <param name="variantDescriptionTable">Output file: one "accession, variant, description" row per applied variation of each entry.</param>
        /// <param name="target">When true the databases are loaded with <c>DecoyType.None</c>, otherwise with <c>DecoyType.Reverse</c>.</param>
        public static void DatabaseSummary(string sourceXmlPath, string destinationXmlPath, string destinationAccessionToNameTable, string variantDescriptionTable, bool target)
        {
            // The searched terms are fixed annotation tokens, not user-facing text, so they are matched
            // ordinally (case-insensitive) rather than with the rules of the machine's current culture.
            bool Mentions(string annotation, string term)
            {
                return CultureInfo.InvariantCulture.CompareInfo.IndexOf(annotation, term, CompareOptions.OrdinalIgnoreCase) >= 0;
            }

            var decoyType = target ? DecoyType.None : DecoyType.Reverse;
            var uniprotPtms = ProteinAnnotation.GetUniProtMods(Environment.CurrentDirectory);
            var uniprot = ProteinDbLoader.LoadProteinXML(sourceXmlPath, true, decoyType, uniprotPtms, false, null, out var un);
            var spritz = ProteinDbLoader.LoadProteinXML(destinationXmlPath, true, decoyType, uniprotPtms, false, null, out un);
            var spritzCanonical = spritz.Select(p => p.NonVariantProtein).Distinct().ToList();
            int numberOfCanonicalProteinEntries = spritzCanonical.Count;
            int numberOfVariantProteinEntries = spritz.Count - spritzCanonical.Count;

            int synonymousCount = 0;
            int totalVariants = 0;
            int missenseSnvCount = 0;
            int missenseMnvCount = 0;
            int insertionCount = 0;
            int deletionCount = 0;
            int frameshiftCount = 0;
            int stopGainCount = 0;
            int stopLossCount = 0;
            List<string> accessionNameList = new List<string>();
            List<string> variantDescList = new List<string>();

            // Unique applied variations, keyed by the accession of the non-variant protein
            Dictionary<string, List<SequenceVariation>> allVariants = new Dictionary<string, List<SequenceVariation>>();

            foreach (var spritzEntry in spritz)
            {
                if (spritzEntry.AppliedSequenceVariations.Count == 0)
                {
                    continue;
                }

                // Make pivot tables
                accessionNameList.Add($"{spritzEntry.Accession}\t{spritzEntry.FullName}\t{spritzEntry.BaseSequence}");
                foreach (SequenceVariation variant in spritzEntry.AppliedSequenceVariations)
                {
                    variantDescList.Add($"{spritzEntry.Accession}\t{variant.SimpleString()}\t{variant.Description}");
                }

                string canonicalAccession = spritzEntry.NonVariantProtein.Accession;
                if (allVariants.TryGetValue(canonicalAccession, out List<SequenceVariation> knownVariants))
                {
                    foreach (SequenceVariation variant in spritzEntry.AppliedSequenceVariations)
                    {
                        if (!knownVariants.Contains(variant))
                        {
                            knownVariants.Add(variant);
                        }
                    }
                }
                else
                {
                    // Store a copy: the list is extended for later entries with the same canonical
                    // accession, and that must not modify this protein's own applied variations.
                    allVariants.Add(canonicalAccession, new List<SequenceVariation>(spritzEntry.AppliedSequenceVariations));
                }
            }
            File.WriteAllLines(destinationAccessionToNameTable, accessionNameList);
            File.WriteAllLines(variantDescriptionTable, variantDescList);

            // Classify each unique variant by the first matching annotation term, in this order
            foreach (var entry in allVariants)
            {
                foreach (var variant in entry.Value)
                {
                    string annotation = variant.Description.Description;

                    if (Mentions(annotation, "synonymous_variant"))
                    {
                        synonymousCount++;
                    }
                    else if (Mentions(annotation, "missense_variant"))
                    {
                        // Single-character reference and alternate alleles count as an SNV, anything else as an MNV
                        if (variant.Description.ReferenceAlleleString.Length == 1 && variant.Description.AlternateAlleleString.Length == 1)
                        {
                            missenseSnvCount++;
                        }
                        else
                        {
                            missenseMnvCount++;
                        }
                    }
                    else if (Mentions(annotation, "frameshift_variant"))
                    {
                        frameshiftCount++;
                    }
                    else if (Mentions(annotation, "stop_gained"))
                    {
                        stopGainCount++;
                    }
                    else if (Mentions(annotation, "conservative_inframe_insertion") || Mentions(annotation, "disruptive_inframe_insertion"))
                    {
                        insertionCount++;
                    }
                    else if (Mentions(annotation, "conservative_inframe_deletion") || Mentions(annotation, "disruptive_inframe_deletion"))
                    {
                        deletionCount++;
                    }
                    else if (Mentions(annotation, "stop_lost"))
                    {
                        stopLossCount++;
                    }
                    else
                    {
                        // Variants with none of the recognized terms are left out of every count, including the total
                        continue;
                    }

                    totalVariants++;
                }
            }

            Console.WriteLine($"Spritz Database Summary");
            Console.WriteLine($"--------------------------------------------------------------");
            Console.WriteLine($"{numberOfCanonicalProteinEntries}\tTotal number of canonical protein entries (before applying variations)");
            Console.WriteLine($"{spritz.Count}\tTotal number of protein entries");
            Console.WriteLine($"{spritzCanonical.Sum(p => p.OneBasedPossibleLocalizedModifications.Values.Sum(b => b.Count))}\tTotal modifications appended from UniProt out of {uniprot.Sum(p => p.OneBasedPossibleLocalizedModifications.Values.Sum(b => b.Count))}");
            Console.WriteLine($"{numberOfVariantProteinEntries}\tTotal number of variant containing protein entries");
            Console.WriteLine($"{totalVariants}\tTotal number of unique variants");
            Console.WriteLine($"{synonymousCount}\tTotal number of unique synonymous variants");
            Console.WriteLine($"{(totalVariants - synonymousCount)}\tTotal number of unique nonsynonymous variants");
            Console.WriteLine($"{missenseSnvCount}\tNumber of unique SNV missense variants");
            Console.WriteLine($"{missenseMnvCount}\tNumber of unique MNV missense variants");
            Console.WriteLine($"{frameshiftCount}\tNumber of unique frameshift variants");
            Console.WriteLine($"{insertionCount}\tNumber of unique insertion variants");
            Console.WriteLine($"{deletionCount}\tNumber of unique deletion variants");
            Console.WriteLine($"{stopGainCount}\tNumber of unique stop gain variants");
            Console.WriteLine($"{stopLossCount}\tNumber of unique stop loss variants");
        }
Пример #24
0
        public static void TestPrunedDatabase()
        {
            //Create Search Task
            SearchTask task1 = new SearchTask
            {
                SearchParameters = new SearchParameters
                {
                    WritePrunedDatabase  = true,
                    SearchTarget         = true,
                    MassDiffAcceptorType = MassDiffAcceptorType.Exact,
                    ModsToWriteSelection = new Dictionary <string, int>
                    {
                        { "ConnorModType", 1 }
                    }
                },
                CommonParameters = new CommonParameters(digestionParams: new DigestionParams(minPeptideLength: 5))
            };

            //add task to task list
            List <(string, MetaMorpheusTask)> taskList = new List <(string, MetaMorpheusTask)>
            {
                ("task1", task1)
            };

            ModificationMotif.TryGetMotif("P", out ModificationMotif motif);

            var connorMod = new Modification(_originalId: "ConnorMod on P", _modificationType: "ConnorModType", _target: motif, _locationRestriction: "Anywhere.", _monoisotopicMass: 10);

            GlobalVariables.AddMods(new List <Modification>
            {
                connorMod
            }, false);

            //create modification lists
            List <Modification> variableModifications = GlobalVariables.AllModsKnown.OfType <Modification>()
                                                        .Where(b => task1.CommonParameters.ListOfModsVariable.Contains((b.ModificationType, b.IdWithMotif))).ToList();

            //add modification to Protein object
            var          dictHere  = new Dictionary <int, List <Modification> >();
            Modification modToAdd  = connorMod;
            Modification modToAdd2 = connorMod;

            dictHere.Add(1, new List <Modification> {
                modToAdd
            });
            dictHere.Add(3, new List <Modification> {
                modToAdd2
            });

            //protein Creation (One with mod and one without)
            Protein TestProteinWithMod = new Protein("PEPTID", "accession1", "organism", new List <Tuple <string, string> >(), dictHere);

            //First Write XML Database
            string xmlName = "okkk.xml";

            //Add Mod to list and write XML input database
            Dictionary <string, HashSet <Tuple <int, Modification> > > modList = new Dictionary <string, HashSet <Tuple <int, Modification> > >();
            var Hash = new HashSet <Tuple <int, Modification> >
            {
                new Tuple <int, Modification>(3, modToAdd)
            };

            modList.Add("test", Hash);
            ProteinDbWriter.WriteXmlDatabase(modList, new List <Protein> {
                TestProteinWithMod
            }, xmlName);

            //now write MZML file
            var protein = ProteinDbLoader.LoadProteinXML(xmlName, true,
                                                         DecoyType.Reverse, new List <Modification>(), false, new List <string>(), out Dictionary <string, Modification> ok);

            //Dictionary 'ok' contains unknown modifications. There are no unknown modifications in this test.
            Assert.AreEqual(0, ok.Count);
            //One protein is read from the .xml database and one decoy is created. Therefore, the list of proteins contains 2 entries.
            Assert.AreEqual(2, protein.Count);
            //The original database had two localized mods on the protein. Therefore. both protein and decoy should have two mods.
            Assert.AreEqual(2, protein[0].OneBasedPossibleLocalizedModifications.Count);
            List <int> foundResidueIndicies   = protein[0].OneBasedPossibleLocalizedModifications.Select(k => k.Key).ToList();
            List <int> expectedResidueIndices = new List <int>()
            {
                1, 3
            };

            Assert.That(foundResidueIndicies, Is.EquivalentTo(expectedResidueIndices));
            Assert.AreEqual(2, protein[1].OneBasedPossibleLocalizedModifications.Count);
            foundResidueIndicies   = protein[1].OneBasedPossibleLocalizedModifications.Select(k => k.Key).ToList();
            expectedResidueIndices = new List <int>()
            {
                4, 6
            };                                                 //originally modified residues are now at the end in the decoy
            Assert.That(foundResidueIndicies, Is.EquivalentTo(expectedResidueIndices));

            var thisOk = ok;                                                      //for debugging
            var commonParamsAtThisPoint = task1.CommonParameters.DigestionParams; //for debugging

            var digestedList = protein[0].Digest(task1.CommonParameters.DigestionParams, new List <Modification> {
            },
                                                 variableModifications).ToList();

            Assert.AreEqual(4, digestedList.Count);

            //Set Peptide with 1 mod at position 3
            PeptideWithSetModifications pepWithSetMods1 = digestedList[1];

            //Finally Write MZML file
            Assert.AreEqual("PEP[ConnorModType:ConnorMod on P]TID", pepWithSetMods1.FullSequence);//this might be base sequence
            MsDataFile myMsDataFile = new TestDataFile(new List <PeptideWithSetModifications> {
                pepWithSetMods1
            });
            string mzmlName = @"hello.mzML";

            IO.MzML.MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(myMsDataFile, mzmlName, false);

            //run!
            string outputFolder = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestPrunedDatabase");
            var    engine       = new EverythingRunnerEngine(taskList, new List <string> {
                mzmlName
            },
                                                             new List <DbForTask> {
                new DbForTask(xmlName, false)
            }, outputFolder);

            engine.Run();

            string final = Path.Combine(MySetUpClass.outputFolder, "task1", "okkkpruned.xml");

            var proteins = ProteinDbLoader.LoadProteinXML(final, true, DecoyType.Reverse, new List <Modification>(), false, new List <string>(), out ok);

            //check length
            Assert.AreEqual(1, proteins[0].OneBasedPossibleLocalizedModifications.Count);
            //check location (key)
            Assert.AreEqual(true, proteins[0].OneBasedPossibleLocalizedModifications.ContainsKey(3));
            List <Modification> listOfMods = proteins[0].OneBasedPossibleLocalizedModifications[3];

            //check Type, count, ID
            Assert.AreEqual(listOfMods[0].ModificationType, "ConnorModType");
            Assert.AreEqual(listOfMods[0].IdWithMotif, "ConnorMod on P");
            Assert.AreEqual(listOfMods.Count, 1);
            Directory.Delete(outputFolder, true);
            File.Delete(xmlName);
            File.Delete(mzmlName);
        }
        /// <summary>
        /// Summarizes the applied sequence variants of a protein XML database and writes the
        /// result to "SpritzDatabaseSummary.txt" in the directory of <paramref name="destinationXmlPath"/>.
        /// Variants are collected per non-variant accession (without repeats across entries that share
        /// an accession) and each variant is assigned to at most one category, based on the first
        /// effect term found in its description.
        /// </summary>
        /// <param name="sourceXmlPath">
        /// Path of the source protein XML. It is loaded, but its contents do not feed the summary.
        /// </param>
        /// <param name="destinationXmlPath">
        /// Path of the protein XML to summarize; the summary file is written next to it.
        /// </param>
        public static void DatabaseSummary(string sourceXmlPath, string destinationXmlPath)
        {
            var uniprotPtms = ProteinAnnotation.GetUniProtMods(Environment.CurrentDirectory);

            // NOTE(review): the proteins loaded from sourceXmlPath are never used below. The call is
            // kept so that an unreadable source file still fails here as before; confirm whether it
            // can be dropped.
            ProteinDbLoader.LoadProteinXML(sourceXmlPath, true, DecoyType.None, uniprotPtms, false, null, out _);
            var spritz = ProteinDbLoader.LoadProteinXML(destinationXmlPath, true, DecoyType.None, uniprotPtms, false, null, out _);

            var spritzCanonical = spritz.Select(p => p.NonVariantProtein).Distinct().ToList();
            int numberOfCanonicalProteinEntries = spritzCanonical.Count;
            int numberOfVariantProteinEntries = spritz.Count - spritzCanonical.Count;

            // Gather the applied variants of every entry, grouped by the accession of its non-variant protein.
            var allVariants = new Dictionary<string, List<SequenceVariation>>();
            foreach (var spritzEntry in spritz)
            {
                if (spritzEntry.AppliedSequenceVariations.Count == 0)
                {
                    continue;
                }

                string accession = spritzEntry.NonVariantProtein.Accession;
                if (allVariants.TryGetValue(accession, out List<SequenceVariation> knownVariants))
                {
                    foreach (var variant in spritzEntry.AppliedSequenceVariations)
                    {
                        if (!knownVariants.Contains(variant))
                        {
                            knownVariants.Add(variant);
                        }
                    }
                }
                else
                {
                    // Store a copy: later additions for the same accession must not modify
                    // the list that belongs to the loaded protein.
                    allVariants.Add(accession, new List<SequenceVariation>(spritzEntry.AppliedSequenceVariations));
                }
            }

            // Effect terms are fixed identifiers, so they are matched ordinally (case-insensitive)
            // rather than with the current culture. A null description matches nothing.
            bool Mentions(string description, string effect)
            {
                return description != null && description.IndexOf(effect, StringComparison.OrdinalIgnoreCase) >= 0;
            }

            int synonymousCount = 0;
            int totalVariants = 0;
            int missenseCount = 0;
            int insertionCount = 0;
            int deletionCount = 0;
            int frameshiftCount = 0;
            int stopGainCount = 0;
            int stopLossCount = 0;

            foreach (var entry in allVariants)
            {
                foreach (var variant in entry.Value)
                {
                    string description = variant.Description.Description;

                    // One single else-if chain: a variant contributes to exactly one category and
                    // to the total at most once, even when its description lists several effects.
                    if (Mentions(description, "synonymous_variant"))
                    {
                        synonymousCount++;
                        totalVariants++;
                    }
                    else if (Mentions(description, "missense_variant"))
                    {
                        missenseCount++;
                        totalVariants++;
                    }
                    else if (Mentions(description, "frameshift_variant"))
                    {
                        frameshiftCount++;
                        totalVariants++;
                    }
                    else if (Mentions(description, "stop_gained"))
                    {
                        stopGainCount++;
                        totalVariants++;
                    }
                    else if (Mentions(description, "conservative_inframe_insertion") || Mentions(description, "disruptive_inframe_insertion"))
                    {
                        insertionCount++;
                        totalVariants++;
                    }
                    else if (Mentions(description, "conservative_inframe_deletion") || Mentions(description, "disruptive_inframe_deletion"))
                    {
                        deletionCount++;
                        totalVariants++;
                    }
                    else if (Mentions(description, "stop_loss"))
                    {
                        stopLossCount++;
                        totalVariants++;
                    }
                }
            }

            // The array holds exactly the lines of the report, so no empty trailing lines are written.
            string[] summary =
            {
                "Spritz Database Summary",
                "--------------------------------------------------------------",
                $"Total number of protein entries in the database: {spritz.Count}",
                $"Total number of canonical protein entries in the database: {numberOfCanonicalProteinEntries}",
                $"Total number of variant containing protein entries in the database: {numberOfVariantProteinEntries}",
                $"  Total number of unique variants in the database: {totalVariants}",
                $"      Total number of unique synonymous variants in the database: {synonymousCount}",
                $"      Total number of unique nonsynonymous variants in the database: {(totalVariants - synonymousCount)}",
                $"          Number of unique missense variants in the database: {missenseCount}",
                $"          Number of unique frameshift variants in the database: {frameshiftCount}",
                $"         Number of unique insertion variants in the database: {insertionCount}",
                $"         Number of unique deletion variants in the database: {deletionCount}",
                $"         Number of unique stop gain variants in the database: {stopGainCount}",
                $"         Number of unique stop loss variants in the database: {stopLossCount}",
            };

            File.WriteAllLines(Path.Combine(Path.GetDirectoryName(destinationXmlPath), "SpritzDatabaseSummary.txt"), summary);
        }