/// <summary>
/// Loads xml2.xml, rewrites it while attaching one extra modification to accession P53863,
/// reloads the rewritten file, and checks that the sequences are unchanged and that the first
/// protein has three modified positions after the rewrite versus two before.
/// </summary>
public void Test_write_with_custom_mods()
{
    string testDirectory = TestContext.CurrentContext.TestDirectory;
    string inputPath = Path.Combine(testDirectory, @"xml2.xml");
    string rewrittenPath = Path.Combine(testDirectory, @"rewrite_xml2.xml");

    // Modifications supplied to the loader for both reads.
    ModificationMotif.TryGetMotif("S", out ModificationMotif motifS);
    ModificationMotif.TryGetMotif("T", out ModificationMotif motifT);
    ModificationMotif.TryGetMotif("X", out ModificationMotif motifX);
    var knownMods = new List<Modification>
    {
        new ModificationWithLocation("fayk", "mt", motifX, TerminusLocalization.Any, null),
        new ModificationWithLocation("Phosphoserine", "mt", motifS, TerminusLocalization.Any, null),
        new ModificationWithLocation("Phosphothreonine", "mt", motifT, TerminusLocalization.Any, null)
    };

    // The extra modification handed to the writer, keyed by protein accession.
    ModificationMotif.TryGetMotif("K", out ModificationMotif motifK);
    ModificationWithMass extraMod = new ModificationWithMass("mod", "mt", motifK, TerminusLocalization.Any, 1, neutralLosses: new List<double> { -1 });
    var extraModsByAccession = new Dictionary<string, HashSet<Tuple<int, Modification>>>
    {
        { "P53863", new HashSet<Tuple<int, Modification>> { new Tuple<int, Modification>(2, extraMod) } }
    };

    List<Protein> original = ProteinDbLoader.LoadProteinXML(inputPath, true, DecoyType.None, knownMods, false, new List<string>(), out Dictionary<string, Modification> unknownMods);

    var newModResEntries = ProteinDbWriter.WriteXmlDatabase(extraModsByAccession, original, rewrittenPath);
    Assert.AreEqual(1, newModResEntries.Count);

    List<Protein> reloaded = ProteinDbLoader.LoadProteinXML(rewrittenPath, true, DecoyType.None, knownMods, false, new List<string>(), out unknownMods);

    Assert.AreEqual(original.Count, reloaded.Count);
    Assert.True(Enumerable.Range(0, original.Count).All(idx => original[idx].BaseSequence == reloaded[idx].BaseSequence));
    Assert.AreEqual(2, original[0].OneBasedPossibleLocalizedModifications.Count);
    Assert.AreEqual(3, reloaded[0].OneBasedPossibleLocalizedModifications.Count);
}
/// <summary>
/// Reads an Ensembl pep.all FASTA file, writes it out as XML, reloads the XML, and checks that
/// accessions, gene names, full names, source paths and proteolysis-product bounds hold for both
/// the FASTA-loaded and the XML-reloaded proteins.
/// </summary>
public void Test_read_Ensembl_pepAllFasta()
{
    const string firstAccession = "ENSP00000381386";
    const string secondAccession = "ENSP00000215773";
    const string geneId = "ENSG00000099977";
    const string firstFullName = "pep:known chromosome:GRCh37:22:24313554:24316773:-1 gene:ENSG00000099977 transcript:ENST00000398344 gene_biotype:protein_coding transcript_biotype:protein_coding";
    const string secondFullName = "pep:known chromosome:GRCh37:22:24313554:24322019:-1 gene:ENSG00000099977 transcript:ENST00000350608 gene_biotype:protein_coding transcript_biotype:protein_coding";

    string testDirectory = TestContext.CurrentContext.TestDirectory;
    string fastaPath = Path.Combine(testDirectory, @"test_ensembl.pep.all.fasta");
    string xmlPath = Path.Combine(testDirectory, @"rewrite_test_ensembl.pep.all.xml");

    ModificationMotif.TryGetMotif("X", out ModificationMotif motifX);
    var knownMods = new List<Modification>
    {
        new ModificationWithLocation("fayk", "mt", motifX, TerminusLocalization.Any, null)
    };

    List<Protein> fromFasta = ProteinDbLoader.LoadProteinFasta(
        fastaPath,
        true,
        DecoyType.None,
        false,
        ProteinDbLoader.ensembl_accession_expression,
        ProteinDbLoader.ensembl_fullName_expression,
        ProteinDbLoader.ensembl_accession_expression,
        ProteinDbLoader.ensembl_gene_expression);

    ProteinDbWriter.WriteXmlDatabase(new Dictionary<string, HashSet<Tuple<int, Modification>>>(), fromFasta, xmlPath);

    List<Protein> fromXml = ProteinDbLoader.LoadProteinXML(xmlPath, true, DecoyType.None, knownMods, false, null, out Dictionary<string, Modification> unknownMods);

    Assert.AreEqual(fromFasta.Count, fromXml.Count);
    Assert.True(Enumerable.Range(0, fromFasta.Count).All(idx => fromFasta[idx].BaseSequence == fromXml[idx].BaseSequence));

    // Expectations on the proteins read straight from the FASTA file.
    Assert.AreEqual(firstAccession, fromFasta[0].Accession);
    Assert.AreEqual(secondAccession, fromFasta[1].Accession);
    Assert.AreEqual(geneId, fromFasta[0].GeneNames.First().Item2);
    Assert.AreEqual(geneId, fromFasta[1].GeneNames.First().Item2);
    Assert.AreEqual(firstFullName, fromFasta[0].FullName);
    Assert.AreEqual(secondFullName, fromFasta[1].FullName);
    Assert.AreEqual(fastaPath, fromFasta[0].DatabaseFilePath);

    // The same expectations on the proteins reloaded from the rewritten XML.
    Assert.AreEqual(firstAccession, fromXml[0].Accession);
    Assert.AreEqual(secondAccession, fromXml[1].Accession);
    Assert.AreEqual(geneId, fromXml[0].GeneNames.First().Item2);
    Assert.AreEqual(geneId, fromXml[1].GeneNames.First().Item2);
    Assert.AreEqual(firstFullName, fromXml[0].FullName);
    Assert.AreEqual(secondFullName, fromXml[1].FullName);
    Assert.AreEqual(xmlPath, fromXml[0].DatabaseFilePath);

    // Every proteolysis product must have null or in-range begin/end positions, in both lists.
    foreach (List<Protein> proteins in new[] { fromFasta, fromXml })
    {
        Assert.True(proteins.All(protein => protein.ProteolysisProducts.All(product =>
            product.OneBasedBeginPosition == null
            || (product.OneBasedBeginPosition > 0 && product.OneBasedBeginPosition <= protein.Length))));
        Assert.True(proteins.All(protein => protein.ProteolysisProducts.All(product =>
            product.OneBasedEndPosition == null
            || (product.OneBasedEndPosition > 0 && product.OneBasedEndPosition <= protein.Length))));
    }
}
/// <summary>
/// Writes two minimally constructed proteins (one without and one with a name) to XML and
/// checks that accession and name survive the reload.
/// </summary>
public void TestEmptyProteins()
{
    string dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"differentlyConstuctedProteins.xml");

    Protein unnamed = new Protein("SEQENCE", "p1");
    Assert.AreEqual("p1||", unnamed.FullDescription);
    Protein named = new Protein("SEQENCE", "p2", name: "namep2");

    // Generate the database file; a null dictionary is passed for the additional modifications.
    var proteinsToWrite = new List<Protein> { unnamed, named };
    ProteinDbWriter.WriteXmlDatabase(null, proteinsToWrite, dbPath);

    IEnumerable<string> modTypesToExclude = new List<string>();
    IEnumerable<Modification> allKnownModifications = new List<Modification>();
    List<Protein> reloaded = ProteinDbLoader.LoadProteinXML(dbPath, true, DecoyType.None, allKnownModifications, false, modTypesToExclude, out Dictionary<string, Modification> unknownMods);

    Assert.AreEqual(unnamed.Accession, reloaded[0].Accession);
    Assert.AreEqual(named.Accession, reloaded[1].Accession);
    Assert.AreEqual(unnamed.Name, reloaded[0].Name);
    Assert.AreEqual(named.Name, reloaded[1].Name);
}
/// <summary>
/// Round-trips splices1.xml and checks that the first splice site's positions and description
/// are preserved; then sets <c>Novel</c> on that description, rewrites, and checks it reloads as true.
/// </summary>
public void TestReadWriteSpliceSites()
{
    string databaseFolder = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests");
    string inputPath = Path.Combine(databaseFolder, @"splices1.xml");
    string rewrittenPath = Path.Combine(databaseFolder, @"rewrite_spliceSite.xml");

    List<Protein> original = ProteinDbLoader.LoadProteinXML(inputPath, true, DecoyType.None, null, false, new List<string>(), out Dictionary<string, Modification> unknownMods);
    Assert.IsNull(original[0].SpliceSites.First().Description.Novel);

    // First round trip: nothing changed, so the splice site must come back identical.
    ProteinDbWriter.WriteXmlDatabase(new Dictionary<string, HashSet<Tuple<int, Modification>>>(), original, rewrittenPath);
    List<Protein> reloaded = ProteinDbLoader.LoadProteinXML(rewrittenPath, true, DecoyType.None, null, false, new List<string>(), out unknownMods);

    Assert.AreEqual(original[0].SpliceSites.Count(), reloaded[0].SpliceSites.Count());
    Assert.AreEqual(original[0].SpliceSites.First().OneBasedBeginPosition, reloaded[0].SpliceSites.First().OneBasedBeginPosition);
    Assert.AreEqual(original[0].SpliceSites.First().OneBasedEndPosition, reloaded[0].SpliceSites.First().OneBasedEndPosition);
    Assert.AreEqual(original[0].SpliceSites.First().Description, reloaded[0].SpliceSites.First().Description);
    Assert.IsNull(reloaded[0].SpliceSites.First().Description.Novel);

    // Second round trip: flag the site as novel and overwrite the same output file.
    original[0].SpliceSites.First().Description.Novel = true;
    ProteinDbWriter.WriteXmlDatabase(new Dictionary<string, HashSet<Tuple<int, Modification>>>(), original, rewrittenPath);
    reloaded = ProteinDbLoader.LoadProteinXML(rewrittenPath, true, DecoyType.None, null, false, new List<string>(), out unknownMods);

    Assert.IsTrue(reloaded[0].SpliceSites.First().Description.Novel);
}
/// <summary>
/// Loads xml2.xml and checks proteolysis-product bounds, that no base sequence contains
/// whitespace, and that each of the two target proteins has exactly one "EnsemblFungi"
/// database reference.
/// </summary>
public void XmlTest_2entry()
{
    var nice = new List<Modification>
    {
        new ModificationWithLocation("fayk", null, null, ModificationSites.A, null, null)
    };

    var ok = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, @"xml2.xml"), true, nice, false, null, out Dictionary<string, Modification> un);

    Assert.True(ok.All(p => p.ProteolysisProducts.All(d => d.OneBasedBeginPosition == null || d.OneBasedBeginPosition > 0)));
    Assert.True(ok.All(p => p.ProteolysisProducts.All(d => d.OneBasedEndPosition == null || d.OneBasedEndPosition <= p.Length)));

    // Use Any, not All: Assert.False(All(...)) already passes when a single protein is clean,
    // so it would not notice whitespace in the remaining sequences.
    Assert.False(ok.Any(p => p.BaseSequence.Contains(" ")));
    Assert.False(ok.Any(p => p.BaseSequence.Contains("\t")));
    Assert.False(ok.Any(p => p.BaseSequence.Contains("\n")));

    // Database-reference checks on the non-decoy proteins.
    List<Protein> targets = ok.Where(p => !p.IsDecoy).ToList();
    Assert.AreEqual(2, targets.Count);
    Assert.AreEqual(1, targets[0].DatabaseReferences.Count(dbRef => dbRef.Type == "EnsemblFungi"));
    Assert.AreEqual(1, targets[1].DatabaseReferences.Count(dbRef => dbRef.Type == "EnsemblFungi"));
}
/// <summary>
/// Loads xml.xml with one modification type on the exclusion list and checks which of the two
/// "N-acetylserine" modification types is present on the first loaded protein.
/// </summary>
/// <param name="excludeString">Modification type placed on the loader's exclusion list.</param>
/// <param name="isExcluded">
/// Expected result of looking up "exclude_me" among the loaded modification types; the lookup
/// of "exclude_me_not" is expected to give the opposite result.
/// </param>
[TestCase("exclude_me", false)] // arguments: the mod type to exclude, then the expected value used by the assertions
//[TestCase("exclude_me_not", true)]
public static void Read_xml_exclude_mods(string excludeString, bool isExcluded)
{
    ModificationMotif.TryGetMotif("X", out ModificationMotif motifX);

    // Two modifications that differ only in their modification type.
    var knownMods = new List<Modification>
    {
        new Modification("N-acetylserine", null, "exclude_me", null, motifX, "Anywhere.", null, 10, null, null, null, null, null, null),
        new Modification("N-acetylserine", null, "exclude_me_not", null, motifX, "Anywhere.", null, 10, null, null, null, null, null, null)
    };
    Assert.That(knownMods[0].ValidModification);

    string dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"xml.xml");
    var loaded = ProteinDbLoader.LoadProteinXML(dbPath, true, DecoyType.Reverse, knownMods, false, new[] { excludeString }, out Dictionary<string, Modification> unknownMods);

    // Collect the distinct modification types found at each modified position of the first protein.
    List<string> loadedModTypes = loaded[0].OneBasedPossibleLocalizedModifications
        .SelectMany(site => site.Value.Select(mod => mod.ModificationType).Distinct())
        .ToList();

    Assert.AreEqual(isExcluded, loadedModTypes.Contains("exclude_me"));
    Assert.AreEqual(!isExcluded, loadedModTypes.Contains("exclude_me_not"));
}
/// <summary>
/// Loads StopGained.xml twice: with default settings (two proteins expected, the second one
/// truncated to 160 residues) and with <c>minAlleleDepth: 400</c> (only the truncated protein expected).
/// </summary>
public static void StopGained()
{
    string dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml");

    var loaded = ProteinDbLoader.LoadProteinXML(dbPath, true, DecoyType.None, null, false, null, out var unknownModifications);
    Assert.AreEqual(2, loaded.Count);

    // First protein: one variation listed, none applied.
    Assert.AreEqual(1, loaded[0].SequenceVariations.Count()); // total, may include redundant entries
    Assert.AreEqual(1, loaded[0].SequenceVariations.Select(variation => variation.SimpleString()).Distinct().Count()); // unique changes
    Assert.AreEqual(0, loaded[0].AppliedSequenceVariations.Count()); // total, may include redundant entries
    Assert.AreEqual(0, loaded[0].AppliedSequenceVariations.Select(variation => variation.SimpleString()).Distinct().Count()); // unique changes

    // Second protein: the variation has been applied.
    Assert.AreEqual(1, loaded[1].AppliedSequenceVariations.Count()); // total, may include redundant entries
    Assert.AreEqual(1, loaded[1].AppliedSequenceVariations.Select(variation => variation.SimpleString()).Distinct().Count()); // unique changes

    Assert.AreEqual(191, loaded[0].Length);
    Assert.AreEqual('Q', loaded[0][161 - 1]);
    Assert.AreEqual(161 - 1, loaded[1].Length);
    Assert.AreNotEqual(loaded[0].Length, loaded[1].Length);

    // Reload with a minimum allele depth of 400.
    loaded = ProteinDbLoader.LoadProteinXML(dbPath, true, DecoyType.None, null, false, null, out unknownModifications, minAlleleDepth: 400);
    Assert.AreEqual(1, loaded.Count);
    Assert.AreEqual(1, loaded[0].AppliedSequenceVariations.Count()); // total, may include redundant entries
    Assert.AreEqual(1, loaded[0].AppliedSequenceVariations.Select(variation => variation.SimpleString()).Distinct().Count()); // unique changes
    Assert.AreEqual(161 - 1, loaded[0].Length);
}
/// <summary>
/// Builds a protein carrying the modifications read from z.txt at position 2, then asks the
/// writer to add an equal modification at the same position. Checks that the writer reports no
/// new entries and that the reloaded protein still has a single modification at one position.
/// </summary>
public void DoNotWriteSameModTwiceAndDoNotWriteInHeaderSinceDifferent()
{
    string testDirectory = TestContext.CurrentContext.TestDirectory;
    string dbPath = Path.Combine(testDirectory, "test_modifications_with_proteins3.xml");

    Loaders.LoadElements(Path.Combine(testDirectory, "elements2.dat"));
    var sampleModList = PtmListLoader.ReadModsFromFile(Path.Combine(testDirectory, "z.txt")).ToList();

    Protein protein = new Protein(
        "MCSSSSSSSSSS",
        "accession",
        new List<Tuple<string, string>>(),
        new Dictionary<int, List<Modification>> { { 2, sampleModList.OfType<Modification>().ToList() } },
        null,
        "name",
        "full_name",
        false,
        false,
        new List<DatabaseReference>(),
        new List<SequenceVariation>(),
        new List<DisulfideBond>());

    Assert.AreEqual(1, protein.OneBasedPossibleLocalizedModifications[2].OfType<ModificationWithMass>().Count());

    Dictionary<string, HashSet<Tuple<int, Modification>>> dictWithThisMod = new Dictionary<string, HashSet<Tuple<int, Modification>>>();
    HashSet<Tuple<int, Modification>> value = new HashSet<Tuple<int, Modification>>();

    // Fail with a clear assertion (instead of a NullReferenceException on the next use) if the
    // first modification in the file is not a ModificationWithMassAndCf.
    var modReadFromFile = sampleModList.First() as ModificationWithMassAndCf;
    Assert.IsNotNull(modReadFromFile, "The first modification read from z.txt is not a ModificationWithMassAndCf.");

    // Re-create the modification from its parts; it must compare equal to the one read from file.
    ModificationMotif.TryGetMotif("C", out ModificationMotif motif);
    ModificationWithMass newMod = new ModificationWithMassAndCf("Palmitoylation of C", modReadFromFile.modificationType, motif, TerminusLocalization.Any, modReadFromFile.chemicalFormula, modReadFromFile.monoisotopicMass, null, null, null);

    // Equality is checked in both directions.
    Assert.AreEqual(newMod, sampleModList.First());
    Assert.AreEqual(sampleModList.First(), newMod);

    value.Add(new Tuple<int, Modification>(2, newMod));
    dictWithThisMod.Add("accession", value);

    var newModResEntries = ProteinDbWriter.WriteXmlDatabase(dictWithThisMod, new List<Protein> { protein }, dbPath);
    Assert.AreEqual(0, newModResEntries.Count);

    List<Protein> new_proteins = ProteinDbLoader.LoadProteinXML(dbPath, true, DecoyType.None, new List<Modification>(), false, new List<string>(), out Dictionary<string, Modification> um);

    Assert.AreEqual(1, new_proteins.Count);
    Assert.AreEqual(1, new_proteins[0].OneBasedPossibleLocalizedModifications.Count);
    Assert.AreEqual(1, new_proteins[0].OneBasedPossibleLocalizedModifications.SelectMany(kv => kv.Value).Count());
}
/// <summary>
/// Loads MultipleAlternateAlleles.xml twice: with default settings (reference plus one variant
/// protein expected, differing at position 63) and with <c>minAlleleDepth: 10</c> (reference only).
/// </summary>
public static void MultipleAlternateAlleles()
{
    string dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "MultipleAlternateAlleles.xml");

    var loaded = ProteinDbLoader.LoadProteinXML(dbPath, true, DecoyType.None, null, false, null, out var unknownModifications);
    Assert.AreEqual(2, loaded.Count);

    Assert.AreEqual(2, loaded[0].SequenceVariations.Count()); // total, may include redundant entries
    Assert.AreEqual(2, loaded[0].SequenceVariations.Select(variation => variation.SimpleString()).Distinct().Count()); // unique changes
    Assert.IsTrue(loaded[0].SequenceVariations.All(variation => variation.OneBasedBeginPosition == 63)); // there are two alternate alleles (1 and 2), but only 2 is in the genotype, so only that's applied

    Assert.AreEqual(1, loaded[1].AppliedSequenceVariations.Count()); // total, may include redundant entries
    Assert.AreEqual(1, loaded[1].AppliedSequenceVariations.Select(variation => variation.SimpleString()).Distinct().Count()); // unique changes

    Assert.AreEqual(72, loaded[0].Length);
    Assert.AreEqual(72, loaded[1].Length);
    Assert.AreEqual('K', loaded[0][63 - 1]);
    Assert.AreEqual('R', loaded[1][63 - 1]);

    // Reload with a minimum allele depth of 10.
    loaded = ProteinDbLoader.LoadProteinXML(dbPath, true, DecoyType.None, null, false, null, out unknownModifications, minAlleleDepth: 10);
    Assert.AreEqual(1, loaded.Count);
    Assert.AreEqual(0, loaded[0].AppliedSequenceVariations.Count()); // total, may include redundant entries
    Assert.AreEqual(0, loaded[0].AppliedSequenceVariations.Select(variation => variation.SimpleString()).Distinct().Count()); // unique changes
    Assert.AreEqual('K', loaded[0][63 - 1]); // reference only
}
/// <summary>
/// Writes a single protein with one modification at position 3 to an XML database, reads the
/// raw file, and reloads it with reverse decoys. The test contains no assertions, so it only
/// verifies that none of these calls throws.
/// </summary>
public static void Test_MetaMorpheusStyleProteinDatabaseWriteAndREad()
{
    string databasePath = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestProteinSplitAcrossFiles.xml");

    ModificationMotif.TryGetMotif("D", out ModificationMotif motifD);
    Modification modOnD = new Modification(_originalId: "mod1", _modificationType: "mt", _target: motifD, _locationRestriction: "Anywhere.", _monoisotopicMass: 10);

    IDictionary<int, List<Modification>> modsByPosition = new Dictionary<int, List<Modification>>
    {
        { 3, new List<Modification> { modOnD } }
    };
    Protein modifiedProtein = new Protein("MEDEEK", "prot1", oneBasedModifications: modsByPosition);
    List<Protein> proteinsToWrite = new List<Protein> { modifiedProtein };

    ProteinDbWriter.WriteXmlDatabase(new Dictionary<string, HashSet<Tuple<int, Modification>>>(), proteinsToWrite, databasePath);

    // Both results are unused; the calls are kept because they throw if the file is missing or unreadable.
    var writtenLines = File.ReadAllLines(databasePath);
    List<Protein> reloadedProteins = ProteinDbLoader.LoadProteinXML(databasePath, true, DecoyType.Reverse, new List<Modification>(), false, new List<string>(), out var unknownMods, -1);
}
/// <summary>
/// Writes a database holding one protein with a P-to-K sequence variation at position 3 and a
/// modification on both the reference and the variant accession, reloads it with reverse decoys,
/// checks the modified positions on target and decoy, digests the variant protein, writes one
/// modified peptide to an mzML file and runs a search task over it.
/// </summary>
public static void TestSearchPtmVariantDatabase()
{
    // Search task and the task list handed to the runner.
    SearchTask searchTask = new SearchTask
    {
        SearchParameters = new SearchParameters
        {
            SearchTarget = true,
            MassDiffAcceptorType = MassDiffAcceptorType.Exact,
        },
        CommonParameters = new CommonParameters(digestionParams: new DigestionParams(minPeptideLength: 5))
    };
    var tasks = new List<(string, MetaMorpheusTask)> { ("task1", searchTask) };

    // Variable modifications selected by the task's common parameters.
    List<Modification> variableMods = GlobalVariables.AllModsKnown
        .OfType<Modification>()
        .Where(known => searchTask.CommonParameters.ListOfModsVariable.Contains((known.ModificationType, known.IdWithMotif)))
        .ToList();

    // Protein with a single sequence variation.
    ModificationMotif.TryGetMotif("P", out ModificationMotif motifP);
    ModificationMotif.TryGetMotif("K", out ModificationMotif motifK);
    var variation = new SequenceVariation(3, "P", "K", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G|||||||||||||||||||\tGT:AD:DP\t1/1:30,30:30");
    Protein referenceProtein = new Protein("PEPTID", "accession1", sequenceVariations: new List<SequenceVariation> { variation });
    string variantAccession = VariantApplication.GetAccession(referenceProtein, new[] { variation });

    // Input database: one modification keyed by the reference accession, one by the variant accession.
    string databaseName = "oblm.xml";
    var modsByAccession = new Dictionary<string, HashSet<Tuple<int, Modification>>>();
    var referenceMods = new HashSet<Tuple<int, Modification>>
    {
        new Tuple<int, Modification>(1, new Modification(_originalId: "acetyl on P", _modificationType: "type", _target: motifP, _monoisotopicMass: 42, _locationRestriction: "Anywhere.")),
    };
    var variantMods = new HashSet<Tuple<int, Modification>>
    {
        new Tuple<int, Modification>(3, new Modification(_originalId: "acetyl on K", _modificationType: "type", _target: motifK, _monoisotopicMass: 42, _locationRestriction: "Anywhere.")),
    };
    modsByAccession.Add(referenceProtein.Accession, referenceMods);
    modsByAccession.Add(variantAccession, variantMods);
    ProteinDbWriter.WriteXmlDatabase(modsByAccession, new List<Protein> { referenceProtein }, databaseName);

    // Reload the database with reverse decoys.
    var loadedProteins = ProteinDbLoader.LoadProteinXML(databaseName, true, DecoyType.Reverse, null, false, null, out var unknownModifications);
    var target = loadedProteins[0];
    var decoy = loadedProteins[1];
    Assert.AreEqual(0, unknownModifications.Count);
    Assert.AreEqual(2, loadedProteins.Count); // target & decoy

    Assert.AreEqual(2, loadedProteins[0].OneBasedPossibleLocalizedModifications.Count);
    List<int> observedPositions = target.OneBasedPossibleLocalizedModifications.Select(site => site.Key).ToList();
    List<int> expectedPositions = new List<int>() { 1, 3 };
    Assert.That(observedPositions, Is.EquivalentTo(expectedPositions));

    Assert.AreEqual(2, decoy.OneBasedPossibleLocalizedModifications.Count);
    observedPositions = decoy.OneBasedPossibleLocalizedModifications.Select(site => site.Key).ToList();
    expectedPositions = new List<int>() { 4, 6 }; // originally modified residues are now at the end in the decoy
    Assert.That(observedPositions, Is.EquivalentTo(expectedPositions));

    var unknownModsSnapshot = unknownModifications; // for debugging
    var digestionParamsSnapshot = searchTask.CommonParameters.DigestionParams; // for debugging

    // Digest the first variant protein and pick the peptide carrying the modification at position 3.
    var peptides = loadedProteins[0].GetVariantProteins()[0]
        .Digest(searchTask.CommonParameters.DigestionParams, new List<Modification>(), variableMods)
        .ToList();
    Assert.AreEqual(4, peptides.Count);
    PeptideWithSetModifications modifiedPeptide = peptides[1];
    Assert.AreEqual("PEK[type:acetyl on K]TID", modifiedPeptide.FullSequence);

    // Write the spectra file for that peptide.
    MsDataFile spectraFile = new TestDataFile(new List<PeptideWithSetModifications> { modifiedPeptide });
    string spectraFileName = @"hello.mzML";
    IO.MzML.MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(spectraFile, spectraFileName, false);

    // Run the task list against the written database and spectra file.
    var runner = new EverythingRunnerEngine(tasks, new List<string> { spectraFileName }, new List<DbForTask> { new DbForTask(databaseName, false) }, Environment.CurrentDirectory);
    runner.Run();
}
/// <summary>
/// Runs a search task with <c>WritePrunedDatabase</c> enabled over a protein carrying
/// "ConnorMod" at positions 1 and 3, using spectra generated for the peptide modified at
/// position 3, and checks that the pruned database keeps exactly that one modification.
/// </summary>
public static void TestPrunedDatabase()
{
    // Create the search task.
    SearchTask task1 = new SearchTask
    {
        SearchParameters = new SearchParameters
        {
            WritePrunedDatabase = true,
            SearchTarget = true,
            MassDiffAcceptorType = MassDiffAcceptorType.Exact,
            ModsToWriteSelection = new Dictionary<string, int>
            {
                { "ConnorModType", 1 }
            }
        },
        CommonParameters = new CommonParameters(digestionParams: new DigestionParams(minPeptideLength: 5))
    };

    // Add the task to the task list.
    List<(string, MetaMorpheusTask)> taskList = new List<(string, MetaMorpheusTask)> { ("task1", task1) };

    // Register the custom modification globally.
    ModificationMotif.TryGetMotif("P", out ModificationMotif motif);
    var connorMod = new ModificationWithMass("ConnorMod", "ConnorModType", motif, TerminusLocalization.Any, 10);
    GlobalVariables.AddMods(new List<ModificationWithLocation> { connorMod });

    // Variable modifications selected by the task's common parameters.
    List<ModificationWithMass> variableModifications = GlobalVariables.AllModsKnown
        .OfType<ModificationWithMass>()
        .Where(b => task1.CommonParameters.ListOfModsVariable.Contains((b.modificationType, b.id)))
        .ToList();

    // Attach the modification to positions 1 and 3 of the protein.
    var dictHere = new Dictionary<int, List<Modification>>();
    ModificationWithMass modToAdd = connorMod;
    ModificationWithMass modToAdd2 = connorMod;
    dictHere.Add(1, new List<Modification> { modToAdd });
    dictHere.Add(3, new List<Modification> { modToAdd2 });
    Protein testProteinWithMod = new Protein("PEPTID", "accession1", "organism", new List<Tuple<string, string>>(), dictHere);

    // Write the XML input database.
    string xmlName = "okkk.xml";
    Dictionary<string, HashSet<Tuple<int, Modification>>> modList = new Dictionary<string, HashSet<Tuple<int, Modification>>>();
    var hash = new HashSet<Tuple<int, Modification>>
    {
        new Tuple<int, Modification>(3, modToAdd)
    };
    modList.Add("test", hash);
    ProteinDbWriter.WriteXmlDatabase(modList, new List<Protein> { testProteinWithMod }, xmlName);

    // Reload the database and digest the first protein.
    var protein = ProteinDbLoader.LoadProteinXML(xmlName, true, DecoyType.Reverse, new List<Modification>(), false, new List<string>(), out Dictionary<string, Modification> ok);
    var digestedList = protein[0].Digest(task1.CommonParameters.DigestionParams, new List<ModificationWithMass> { }, variableModifications).ToList();
    Assert.AreEqual(4, digestedList.Count);

    // Pick the peptide with one modification at position 3.
    PeptideWithSetModifications pepWithSetMods1 = digestedList[1];
    Assert.AreEqual("PEP[ConnorModType:ConnorMod]TID", pepWithSetMods1.Sequence);

    // Write the mzML file for that peptide.
    MsDataFile myMsDataFile = new TestDataFile(new List<PeptideWithSetModifications> { pepWithSetMods1 });
    string mzmlName = @"hello.mzML";
    IO.MzML.MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(myMsDataFile, mzmlName, false);

    // Run the task.
    var engine = new EverythingRunnerEngine(taskList, new List<string> { mzmlName }, new List<DbForTask> { new DbForTask(xmlName, false) }, Environment.CurrentDirectory);
    engine.Run();

    // Load the pruned database written by the task.
    string final = Path.Combine(MySetUpClass.outputFolder, "task1", "okkkpruned.xml");
    var proteins = ProteinDbLoader.LoadProteinXML(final, true, DecoyType.Reverse, new List<Modification>(), false, new List<string>(), out ok);

    // NUnit's AreEqual takes (expected, actual); passing them in that order keeps failure messages correct.
    // Exactly one modified position, and it is position 3.
    Assert.AreEqual(1, proteins[0].OneBasedPossibleLocalizedModifications.Count);
    Assert.IsTrue(proteins[0].OneBasedPossibleLocalizedModifications.ContainsKey(3));

    // Check the count before indexing so an empty list fails as an assertion, not an exception.
    List<Modification> listOfMods = proteins[0].OneBasedPossibleLocalizedModifications[3];
    Assert.AreEqual(1, listOfMods.Count);
    Assert.AreEqual("ConnorModType", listOfMods[0].modificationType);
    Assert.AreEqual("ConnorMod", listOfMods[0].id);
}
/// <summary>
/// Writes a database containing two proteins plus the variant forms of the first, runs a search
/// task and a post-search analysis task with pruned-database output enabled, and checks the
/// proteins and modification counts in the protein-pruned and mod-pruned databases.
/// </summary>
public static void TestProteinPrunedWithModSelectionAndVariants()
{
    // Every artifact of this test lives in one folder, so the tasks read the database that this
    // test itself writes rather than a file left in another directory.
    string outputFolder = Path.Combine(TestContext.CurrentContext.TestDirectory, @"PrunedDbTestVariant");
    string individualFolder = Path.Combine(outputFolder, "individual");
    string fakeDbPath = Path.Combine(outputFolder, "fakeDb.xml");

    var modToWrite = GlobalVariables.AllModsKnown.Where(p => p.ModificationType == "UniProt" && p.Target.ToString() == "T").First();
    var modToNotWrite = GlobalVariables.AllModsKnown.Where(p => p.ModificationType == "Common Artifact" && p.Target.ToString() == "X").First();

    // Sequence variation (V to T at position 4) carrying its own modification at position 1.
    Dictionary<int, List<Modification>> variantMods = new Dictionary<int, List<Modification>>();
    variantMods.Add(1, new List<Modification>() { modToNotWrite });
    List<SequenceVariation> variants = new List<SequenceVariation>
    {
        new SequenceVariation(4, 4, "V", "T", @"20\t41168825\t.\tT\tC\t14290.77\t.\tANN=C|missense_variant|MODERATE|PLCG1|ENSG00000124181|transcript|ENST00000244007.7|protein_coding|22/33|c.2438T>C|p.Ile813Thr|2635/5285|2438/3876|813/1291||\tGT:AD:DP:GQ:PL\t1/1:1,392:393:99:14319,1142,0", variantMods)
    };

    var protein1 = new Protein(
        "PEPVIDEKPEPT",
        "1",
        oneBasedModifications: new Dictionary<int, List<Modification>>
        {
            { 1, new List<Modification> { modToNotWrite } },
            { 12, new List<Modification> { modToWrite } }
        },
        sequenceVariations: variants);
    var protein2 = new Protein(
        "PEPIDPEPT",
        "2",
        oneBasedModifications: new Dictionary<int, List<Modification>>
        {
            { 1, new List<Modification> { modToNotWrite } },
            { 9, new List<Modification> { modToWrite } }
        });
    var protein1Variants = protein1.GetVariantProteins(1, 0);

    // Reference copy of the protein list, reloaded at the end for the before/after comparison.
    string path = @"temp";
    var proteinList = new List<Protein> { protein1, protein2 };
    proteinList.AddRange(protein1Variants);
    ProteinDbWriter.WriteXmlDatabase(new Dictionary<string, HashSet<Tuple<int, Modification>>>(), proteinList, path);

    // Database that the tasks search.
    Directory.CreateDirectory(outputFolder);
    Dictionary<string, HashSet<Tuple<int, Modification>>> modList = new Dictionary<string, HashSet<Tuple<int, Modification>>>();
    var hash = new HashSet<Tuple<int, Modification>>
    {
        new Tuple<int, Modification>(1, modToWrite),
        new Tuple<int, Modification>(2, modToNotWrite),
    };
    var db = ProteinDbWriter.WriteXmlDatabase(modList, proteinList, fakeDbPath);

    // The peptide "observed" in the spectra comes from the first variant protein.
    var peptideObserved = protein1Variants.First()
        .Digest(new DigestionParams(minPeptideLength: 1), new List<Modification>(), new List<Modification>())
        .Where(p => p.BaseSequence == "PEPT")
        .First();

    PostSearchAnalysisParameters testPostTaskParameters = new PostSearchAnalysisParameters();
    CommonParameters commonParam = new CommonParameters(useDeltaScore: false);

    // NOTE(review): a 10000 x 10000 double array is roughly 800 MB; confirm that this size is required.
    double[,] noiseData = new double[10000, 10000];
    noiseData[0, 0] = 1.0;
    List<Proteomics.Fragmentation.MatchedFragmentIon> matchedFragmentIons = new List<Proteomics.Fragmentation.MatchedFragmentIon>() { };
    MzSpectrum spectrum = new MzSpectrum(noiseData);
    MsDataScan scan = new MsDataScan(spectrum, 1, 1, true, Polarity.Unknown, 2, new MzLibUtil.MzRange(10, 1000), "", MZAnalyzerType.Orbitrap, 10000, null, noiseData, "");

    // Parameters for the post-search analysis task.
    testPostTaskParameters.ProteinList = proteinList;
    testPostTaskParameters.AllPsms = new List<PeptideSpectralMatch>
    {
        new PeptideSpectralMatch(peptideObserved, 0, 20, 1, new Ms2ScanWithSpecificMass(scan, 100, 1, @"", commonParam), commonParam, matchedFragmentIons)
    };
    testPostTaskParameters.SearchParameters = new SearchParameters();
    testPostTaskParameters.SearchParameters.WritePrunedDatabase = true;
    testPostTaskParameters.SearchParameters.DoQuantification = false;
    testPostTaskParameters.SearchParameters.WriteMzId = false;
    testPostTaskParameters.DatabaseFilenameList = new List<DbForTask>() { new DbForTask(fakeDbPath, false) };
    testPostTaskParameters.OutputFolder = outputFolder;
    Directory.CreateDirectory(individualFolder);
    testPostTaskParameters.IndividualResultsOutputFolder = individualFolder;

    int[] stuffForSpectraFile = new int[2];
    stuffForSpectraFile[0] = 10;
    stuffForSpectraFile[1] = 10;
    Dictionary<string, int[]> numSpectraPerFile = new Dictionary<string, int[]>();
    numSpectraPerFile.Add("", stuffForSpectraFile);
    testPostTaskParameters.NumMs2SpectraPerFile = numSpectraPerFile;

    // Spectra file for the observed peptide.
    MsDataFile myMsDataFile = new TestDataFile(new List<PeptideWithSetModifications> { peptideObserved });
    string mzmlName = @"newMzml.mzML";
    IO.MzML.MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(myMsDataFile, mzmlName, false);

    // Added only after fakeDb.xml was written, so it does not affect the written database.
    modList.Add("test", hash);
    testPostTaskParameters.CurrentRawFileList = new List<string>() { mzmlName };

    // Run the search task, then feed its results into the post-search analysis task.
    SearchTask task5 = new SearchTask
    {
        SearchParameters = new SearchParameters
        {
            WritePrunedDatabase = true,
            SearchTarget = true,
            MassDiffAcceptorType = MassDiffAcceptorType.Exact,
        },
        CommonParameters = new CommonParameters()
    };
    var test = task5.RunTask(outputFolder, new List<DbForTask>() { new DbForTask(fakeDbPath, false) }, new List<string>() { mzmlName }, "name");
    testPostTaskParameters.SearchTaskResults = test;

    PostSearchAnalysisTask testPostTask = new PostSearchAnalysisTask();
    testPostTask.Parameters = testPostTaskParameters;
    testPostTask.CommonParameters = commonParam;
    testPostTask.FileSpecificParameters = new List<(string FileName, CommonParameters Parameters)> { ("newMzMl.mzml", commonParam) };
    testPostTask.Run();

    var proteinsLoaded = ProteinDbLoader.LoadProteinXML(path, true, DecoyType.None, GlobalVariables.AllModsKnown, false, new List<string>(), out var unknownMods);

    // Mods on the proteins are the same before and after the task is run.
    Assert.AreEqual(protein1Variants.First().Accession, proteinsLoaded.First().Accession);
    Assert.AreEqual(protein1Variants.First().OneBasedPossibleLocalizedModifications.Count(), proteinsLoaded.First().OneBasedPossibleLocalizedModifications.Count());
    Assert.AreEqual(protein2.OneBasedPossibleLocalizedModifications.Count(), proteinsLoaded.ElementAt(1).OneBasedPossibleLocalizedModifications.Count());

    // The protein-pruned database has the expected protein and mod counts.
    var proteinPruned = ProteinDbLoader.LoadProteinXML(Path.Combine(outputFolder, "fakeDbproteinPruned.xml"), true, DecoyType.None, GlobalVariables.AllModsKnown, false, new List<string>(), out var unknownMods1);
    Assert.That(proteinPruned.Count().Equals(1));
    Assert.That(proteinPruned.FirstOrDefault().OneBasedPossibleLocalizedModifications.Count().Equals(1));

    // The mod-pruned database has the expected protein and mod counts.
    var modPruned = ProteinDbLoader.LoadProteinXML(Path.Combine(outputFolder, "fakeDbpruned.xml"), true, DecoyType.None, GlobalVariables.AllModsKnown, false, new List<string>(), out var unknownMods2);
    Assert.That(modPruned.Count().Equals(2));
    Assert.That(modPruned.ElementAt(0).OneBasedPossibleLocalizedModifications.Count().Equals(1));
    Assert.That(modPruned.ElementAt(1).OneBasedPossibleLocalizedModifications.Count().Equals(1));
}
/// <summary>
/// Loads xml2.xml while passing null for both the known-modification list and the list of
/// modification types to exclude. The test has no assertions, so it only verifies that the
/// loader does not throw.
/// </summary>
public void ReadXmlNulls()
{
    string dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"xml2.xml");

    var loaded = ProteinDbLoader.LoadProteinXML(dbPath, true, DecoyType.None, null, false, null, out Dictionary<string, Modification> unknownMods);
}
/// <summary>
/// Merges two or more protein XML databases named on the command line and writes the
/// result to "merged_database_&lt;timestamp&gt;.xml" in the application's base directory.
/// </summary>
/// <param name="args">
/// Candidate database paths. Entries that do not exist on disk, or that do not end in
/// ".xml" or ".xml.gz", are ignored. Fewer than two usable paths prints a usage message.
/// </param>
static void Main(string[] args)
{
    // Path.GetExtension returns only the final extension (".gz" for "db.xml.gz"), so the
    // compressed case must be matched against the full suffix instead.
    List<string> files = args
        .Where(f => File.Exists(f)
            && (Path.GetExtension(f) == ".xml" || f.EndsWith(".xml.gz", StringComparison.Ordinal)))
        .ToList();

    if (files.Count < 2)
    {
        Console.WriteLine("Please enter at least two protein .xml or .xml.gz databases.");
        return;
    }

    // Take a single clock reading so the timestamp fields cannot straddle a tick.
    // The invariant culture keeps the Gregorian year and ASCII digits in the file name.
    string timestamp = DateTime.Now.ToString("yyyy-MM-dd-HH-mm-ss", System.Globalization.CultureInfo.InvariantCulture);
    string outpath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "merged_database_" + timestamp + ".xml");

    // merge databases
    Loaders.LoadElements(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "elements.dat"));
    List<Protein> merged = ProteinDbLoader.merge_proteins(
        files.SelectMany(f => ProteinDbLoader.LoadProteinXML(f, false, new List<Modification>(), false, new List<string>(), out Dictionary<string, Modification> un))).ToList();
    ProteinDbWriter.WriteXmlDatabase(new Dictionary<string, HashSet<Tuple<int, Modification>>>(), merged, outpath);
}
/// <summary>
/// Rebuilds the theoretical proteoform database: clears previous results, loads the PTM
/// lists and protein XML databases, builds the modification lookups and PTM sets, expands
/// the protein entries, and generates target and decoy theoretical proteoforms.
/// Returns immediately when <c>ready_to_make_database</c> reports false.
/// </summary>
/// <param name="current_directory">Directory that contains the "Mods" folder read by this method.</param>
public void get_theoretical_proteoforms(string current_directory)
{
    if (!ready_to_make_database(current_directory))
    {
        return;
    }

    //Clear out data from potential previous runs
    foreach (ProteoformCommunity community in SaveState.lollipop.decoy_proteoform_communities.Values)
    {
        community.theoretical_proteoforms = new TheoreticalProteoform[0];
    }
    theoretical_proteins.Clear();

    //Read the UniProt-XML and ptmlist
    List<ModificationWithLocation> all_known_modifications = SaveState.lollipop.get_files(SaveState.lollipop.input_files, Purpose.PtmList).SelectMany(file => PtmListLoader.ReadModsFromFile(file.complete_path)).ToList();
    uniprotModifications = make_modification_dictionary(all_known_modifications);

    // The databases are loaded one after another on purpose. The previous Parallel.ForEach
    // handed all_known_modifications to LoadProteinXML under one lock while another
    // iteration could AddRange to the same list under a different lock, and both locks
    // already serialised all of the real work. The lock statements are retained in case
    // other code synchronises on these objects.
    foreach (var database in SaveState.lollipop.get_files(SaveState.lollipop.input_files, Purpose.ProteinDatabase).ToList())
    {
        lock (theoretical_proteins)
        {
            theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinXML(database.complete_path, false, all_known_modifications, database.ContaminantDB, SaveState.lollipop.mod_types_to_exclude, out Dictionary<string, Modification> um).ToArray());
        }
        lock (all_known_modifications)
        {
            all_known_modifications.AddRange(ProteinDbLoader.GetPtmListFromProteinXml(database.complete_path).OfType<ModificationWithLocation>().Where(m => !SaveState.lollipop.mod_types_to_exclude.Contains(m.modificationType)));
        }
    }

    foreach (string filename in Directory.GetFiles(Path.Combine(current_directory, "Mods")))
    {
        var new_mods = !filename.EndsWith("variable.txt") || SaveState.lollipop.methionine_oxidation ?
            PtmListLoader.ReadModsFromFile(filename) :
            new List<ModificationWithLocation>(); // Empty variable modifications if not selected

        if (filename.EndsWith("variable.txt"))
        {
            variableModifications = new_mods.OfType<ModificationWithMass>().ToList();
        }

        if (filename.EndsWith("intact_mods.txt"))
        {
            // Snapshot of the masses already known; a set keeps the membership test below constant-time.
            HashSet<double> old_mods = new HashSet<double>(all_known_modifications.OfType<ModificationWithMass>().Select(m => m.monoisotopicMass));
            new_mods = new_mods.OfType<ModificationWithMass>().Where(m => !old_mods.Contains(m.monoisotopicMass)); // get rid of the unlocalized mods if they're already present
        }

        all_known_modifications.AddRange(new_mods);
    }

    // Remove duplicates before rebuilding the lookups.
    all_known_modifications = new HashSet<ModificationWithLocation>(all_known_modifications).ToList();
    uniprotModifications = make_modification_dictionary(all_known_modifications);
    all_mods_with_mass = uniprotModifications.SelectMany(kv => kv.Value).OfType<ModificationWithMass>().Concat(variableModifications).ToList();
    SaveState.lollipop.modification_ranks = rank_mods(theoretical_proteins, variableModifications, all_mods_with_mass);
    unlocalized_lookup = make_unlocalized_lookup(all_mods_with_mass.Concat(new List<ModificationWithMass> { new Ptm().modification }));
    // NOTE(review): this path is rooted at Environment.CurrentDirectory while the mod files
    // above are read from current_directory -- confirm that the difference is intended.
    load_unlocalized_names(Path.Combine(Environment.CurrentDirectory, "Mods", "stored_mods.modnames"));

    //Generate all two-member sets and all three-member (or greater) sets of the same modification (three-member combinatorics gets out of hand for assignment)
    all_possible_ptmsets = PtmCombos.generate_all_ptmsets(Math.Min(2, SaveState.lollipop.max_ptms), all_mods_with_mass, SaveState.lollipop.modification_ranks, SaveState.lollipop.mod_rank_first_quartile / 2).ToList();
    for (int i = 2; i < SaveState.lollipop.max_ptms + 1; i++)
    {
        all_possible_ptmsets.AddRange(all_mods_with_mass.Select(m => new PtmSet(Enumerable.Repeat(new Ptm(-1, m), i).ToList(), SaveState.lollipop.modification_ranks, SaveState.lollipop.mod_rank_first_quartile / 2)));
    }

    //Generate lookup table for ptm sets based on rounded mass of eligible PTMs -- used in forming ET relations
    possible_ptmset_dictionary = make_ptmset_dictionary();

    expanded_proteins = expand_protein_entries(theoretical_proteins.Values.SelectMany(p => p).ToArray());
    aaIsotopeMassList = new AminoAcidMasses(SaveState.lollipop.carbamidomethylation, SaveState.lollipop.natural_lysine_isotope_abundance, SaveState.lollipop.neucode_light_lysine, SaveState.lollipop.neucode_heavy_lysine).AA_Masses;
    if (SaveState.lollipop.combine_identical_sequences)
    {
        expanded_proteins = group_proteins_by_sequence(expanded_proteins);
    }

    expanded_proteins = expanded_proteins.OrderBy(x => x.OneBasedPossibleLocalizedModifications.Count).ToArray(); // Take on harder problems first to use parallelization more effectively
    process_entries(expanded_proteins, variableModifications);
    process_decoys(expanded_proteins, variableModifications);

    if (SaveState.lollipop.combine_theoretical_proteoforms_byMass)
    {
        SaveState.lollipop.target_proteoform_community.theoretical_proteoforms = group_proteoforms_by_mass(SaveState.lollipop.target_proteoform_community.theoretical_proteoforms);
        foreach (ProteoformCommunity community in SaveState.lollipop.decoy_proteoform_communities.Values)
        {
            community.theoretical_proteoforms = group_proteoforms_by_mass(community.theoretical_proteoforms);
        }
    }
}
/// <summary>
/// Applies five kinds of sequence variation (single-residue substitution, two-residue
/// substitution, insertion, deletion, insertion carrying a modification) and checks the
/// resulting base sequences and applied-variation coordinates. The same expectations are
/// checked for two in-memory expansions and for a write/read round trip through XML.
/// </summary>
public static void AppliedVariants()
{
    ModificationMotif.TryGetMotif("P", out ModificationMotif motifP);
    Modification mp = new Modification("mod", null, "type", null, motifP, "Anywhere.", null, 42.01, new Dictionary<string, IList<string>>(), null, null, null, null, null);

    // Every variation in this test carries the same description line.
    const string variantDescription = @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30";

    var modsOnInsertion = new Dictionary<int, List<Modification>>
    {
        { 5, new List<Modification> { mp } }
    };

    List<Protein> proteinsWithSeqVars = new List<Protein>
    {
        new Protein("MPEPTIDE", "protein1",
            sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P", "V", variantDescription, null) }),
        new Protein("MPEPTIDE", "protein2",
            sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 5, "PT", "KT", variantDescription, null) }),
        new Protein("MPEPTIDE", "protein3",
            sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P", "PPP", variantDescription, null) }),
        new Protein("MPEPPPTIDE", "protein4",
            sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 6, "PPP", "P", variantDescription, null) }),
        new Protein("MPEPTIDE", "protein5",
            sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P", "PPP", variantDescription, modsOnInsertion) }),
    };

    // Expanding twice must give the same result (the expansion should be stable).
    var firstExpansion = proteinsWithSeqVars.SelectMany(p => p.GetVariantProteins()).ToList();
    var secondExpansion = proteinsWithSeqVars.SelectMany(p => p.GetVariantProteins()).ToList();

    // Round trip through an XML database on disk.
    string xml = Path.Combine(TestContext.CurrentContext.TestDirectory, "AppliedVariants.xml");
    ProteinDbWriter.WriteXmlDatabase(null, proteinsWithSeqVars, xml);
    var reloaded = ProteinDbLoader.LoadProteinXML(xml, true, DecoyType.None, null, false, null, out var un);

    foreach (var variantProteins in new[] { firstExpansion, secondExpansion, reloaded })
    {
        // sequences
        Assert.AreEqual("MPEVTIDE", variantProteins[0].BaseSequence);
        Assert.AreEqual("MPEKTIDE", variantProteins[1].BaseSequence);
        Assert.AreEqual("MPEPPPTIDE", variantProteins[2].BaseSequence);
        Assert.AreEqual("MPEPTIDE", variantProteins[3].BaseSequence);
        Assert.AreEqual("MPEPPPTIDE", variantProteins[4].BaseSequence);
        Assert.AreEqual(5, variantProteins[4].OneBasedPossibleLocalizedModifications.Single().Key);

        // SAV
        Assert.AreEqual(4, variantProteins[0].AppliedSequenceVariations.Single().OneBasedBeginPosition);
        Assert.AreEqual(4, variantProteins[0].AppliedSequenceVariations.Single().OneBasedEndPosition);

        // MNV
        Assert.AreEqual(4, variantProteins[1].AppliedSequenceVariations.Single().OneBasedBeginPosition);
        Assert.AreEqual(5, variantProteins[1].AppliedSequenceVariations.Single().OneBasedEndPosition);

        // insertion
        Assert.AreEqual(4, variantProteins[2].AppliedSequenceVariations.Single().OneBasedBeginPosition);
        Assert.AreEqual(6, variantProteins[2].AppliedSequenceVariations.Single().OneBasedEndPosition);

        // deletion
        Assert.AreEqual(4, variantProteins[3].AppliedSequenceVariations.Single().OneBasedBeginPosition);
        Assert.AreEqual(4, variantProteins[3].AppliedSequenceVariations.Single().OneBasedEndPosition);
    }
}
/// <summary>
/// Loads the source and destination protein XML databases, collects the distinct applied
/// sequence variations per non-variant accession from the destination database, classifies
/// each by keywords in its description, and prints a summary table to the console.
/// </summary>
/// <param name="sourceXmlPath">Path of the source protein XML database (loaded; its contents are not otherwise used here).</param>
/// <param name="destinationXmlPath">Path of the protein XML database that is summarized.</param>
public static void DatabaseSummary(string sourceXmlPath, string destinationXmlPath)
{
    var culture = CultureInfo.CurrentCulture;
    var uniprotPtms = ProteinAnnotation.GetUniProtMods(Environment.CurrentDirectory);
    // NOTE(review): the result of this load is never read; it is kept so that a missing or
    // malformed source database still fails here as before -- confirm whether it is needed.
    var uniprot = ProteinDbLoader.LoadProteinXML(sourceXmlPath, true, DecoyType.None, uniprotPtms, false, null, out var un);
    var spritz = ProteinDbLoader.LoadProteinXML(destinationXmlPath, true, DecoyType.None, uniprotPtms, false, null, out un);
    var spritzCanonical = spritz.Select(p => p.NonVariantProtein).Distinct().ToList();
    int numberOfCanonicalProteinEntries = spritzCanonical.Count;
    int numberOfVariantProteinEntries = spritz.Count - spritzCanonical.Count;
    int synonymousCount = 0;
    int totalVariants = 0;
    int missenseSnvCount = 0;
    int missenseMnvCount = 0;
    int insertionCount = 0;
    int deletionCount = 0;
    int frameshiftCount = 0;
    int stopGainCount = 0;
    int stopLossCount = 0;

    // Distinct applied variations, keyed by the accession of the non-variant protein.
    Dictionary<string, List<SequenceVariation>> allVariants = new Dictionary<string, List<SequenceVariation>>();
    foreach (var spritzEntry in spritz)
    {
        if (spritzEntry.AppliedSequenceVariations.Count == 0)
        {
            continue;
        }

        string accession = spritzEntry.NonVariantProtein.Accession;
        if (allVariants.TryGetValue(accession, out List<SequenceVariation> known))
        {
            foreach (var variant in spritzEntry.AppliedSequenceVariations)
            {
                if (!known.Contains(variant))
                {
                    known.Add(variant);
                }
            }
        }
        else
        {
            // Store a copy: later additions for this accession must not be appended to the
            // protein's own AppliedSequenceVariations list.
            allVariants.Add(accession, new List<SequenceVariation>(spritzEntry.AppliedSequenceVariations));
        }
    }

    // Case-insensitive keyword search in the variant's description text (current culture).
    bool Mentions(SequenceVariation variant, string keyword)
    {
        return culture.CompareInfo.IndexOf(variant.Description.Description, keyword, CompareOptions.IgnoreCase) >= 0;
    }

    foreach (var entry in allVariants)
    {
        foreach (var variant in entry.Value)
        {
            // The first matching category wins; variants matching none are not counted.
            if (Mentions(variant, "synonymous_variant"))
            {
                synonymousCount++;
            }
            else if (Mentions(variant, "missense_variant"))
            {
                bool singleResidue = variant.Description.ReferenceAlleleString.Length == 1
                    && variant.Description.AlternateAlleleString.Length == 1;
                if (singleResidue)
                {
                    missenseSnvCount++;
                }
                else
                {
                    missenseMnvCount++;
                }
            }
            else if (Mentions(variant, "frameshift_variant"))
            {
                frameshiftCount++;
            }
            else if (Mentions(variant, "stop_gained"))
            {
                stopGainCount++;
            }
            else if (Mentions(variant, "conservative_inframe_insertion") || Mentions(variant, "disruptive_inframe_insertion"))
            {
                insertionCount++;
            }
            else if (Mentions(variant, "conservative_inframe_deletion") || Mentions(variant, "disruptive_inframe_deletion"))
            {
                deletionCount++;
            }
            else if (Mentions(variant, "stop_loss"))
            {
                stopLossCount++;
            }
            else
            {
                continue;
            }

            totalVariants++;
        }
    }

    Console.WriteLine($"Spritz Database Summary");
    Console.WriteLine($"--------------------------------------------------------------");
    Console.WriteLine($"{numberOfCanonicalProteinEntries}\tTotal number of canonical protein entries (before applying variations)");
    Console.WriteLine($"{spritz.Count}\tTotal number of protein entries");
    Console.WriteLine($"{numberOfVariantProteinEntries}\tTotal number of variant containing protein entries");
    Console.WriteLine($"{totalVariants}\tTotal number of unique variants");
    Console.WriteLine($"{synonymousCount}\tTotal number of unique synonymous variants");
    Console.WriteLine($"{(totalVariants - synonymousCount)}\tTotal number of unique nonsynonymous variants");
    Console.WriteLine($"{missenseSnvCount}\tNumber of unique SNV missense variants");
    Console.WriteLine($"{missenseMnvCount}\tNumber of unique MNV missense variants");
    Console.WriteLine($"{frameshiftCount}\tNumber of unique frameshift variants");
    Console.WriteLine($"{insertionCount}\tNumber of unique insertion variants");
    Console.WriteLine($"{deletionCount}\tNumber of unique deletion variants");
    Console.WriteLine($"{stopGainCount}\tNumber of unique stop gain variants");
    Console.WriteLine($"{stopLossCount}\tNumber of unique stop loss variants");
}
/// <summary>
/// Rebuilds the theoretical proteoform database: clears previous results, loads the PTM
/// lists and the protein databases (.xml or .fasta), builds the modification lookups and
/// PTM sets, and finally calls <c>make_theoretical_proteoforms</c>.
/// Returns immediately when <c>ready_to_make_database</c> reports false.
/// </summary>
/// <param name="current_directory">Directory that contains the "Mods" folder read by this method.</param>
public void get_theoretical_proteoforms(string current_directory)
{
    if (!ready_to_make_database(current_directory))
    {
        return;
    }

    //Clear out data from potential previous runs
    foreach (ProteoformCommunity community in Sweet.lollipop.decoy_proteoform_communities.Values)
    {
        community.theoretical_proteoforms = new TheoreticalProteoform[0];
    }
    theoretical_proteins.Clear();

    //Read the UniProt-XML and ptmlist
    var psiModDeserialized = Loaders.LoadPsiMod(Path.Combine(current_directory, "Mods", "PSI-MOD.obo.xml"));
    Dictionary<string, int> formalChargesDictionary = Loaders.GetFormalChargesDictionary(psiModDeserialized);
    List<Modification> all_known_modifications = Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.PtmList)
        .SelectMany(file => PtmListLoader.ReadModsFromFile(file.complete_path, formalChargesDictionary, out List<(Modification, string)> filteredModificationsWithWarnings))
        .ToList();
    uniprotModifications = make_modification_dictionary(all_known_modifications);

    // The databases are loaded one after another on purpose. The previous Parallel.ForEach
    // handed all_known_modifications to LoadProteinXML under one lock while another
    // iteration could AddRange to the same list under a different lock. The lock
    // statements are retained in case other code synchronises on these objects.
    foreach (var database in Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.ProteinDatabase).ToList())
    {
        if (database.extension == ".xml")
        {
            lock (theoretical_proteins)
            {
                theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinXML(database.complete_path, true, DecoyType.None, all_known_modifications, database.ContaminantDB, Sweet.lollipop.mod_types_to_exclude, out Dictionary<string, Modification> um).ToArray());
            }
            lock (all_known_modifications)
            {
                all_known_modifications.AddRange(ProteinDbLoader.GetPtmListFromProteinXml(database.complete_path).Where(m => !Sweet.lollipop.mod_types_to_exclude.Contains(m.ModificationType)));
            }
        }
        else if (database.extension == ".fasta")
        {
            lock (theoretical_proteins)
            {
                theoretical_proteins.Add(database, ProteinDbLoader.LoadProteinFasta(database.complete_path, true, DecoyType.None, database.ContaminantDB, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex,
                    ProteinDbLoader.UniprotOrganismRegex, out var dbErrors).ToArray());
            }
        }
    }

    foreach (string filename in Directory.GetFiles(Path.Combine(current_directory, "Mods")))
    {
        List<Modification> new_mods = !filename.EndsWith("variable.txt") || Sweet.lollipop.methionine_oxidation ?
            PtmListLoader.ReadModsFromFile(filename, formalChargesDictionary, out List<(Modification, string)> filteredModificationsWithWarnings).ToList() :
            new List<Modification>(); // Empty variable modifications if not selected

        if (filename.EndsWith("variable.txt"))
        {
            variableModifications = new_mods;
        }

        all_known_modifications.AddRange(new_mods);
    }

    // Remove duplicates before rebuilding the lookups.
    all_known_modifications = new HashSet<Modification>(all_known_modifications).ToList();
    uniprotModifications = make_modification_dictionary(all_known_modifications);
    all_mods_with_mass = uniprotModifications.SelectMany(kv => kv.Value).Concat(variableModifications).ToList();
    Sweet.lollipop.modification_ranks = rank_mods(theoretical_proteins, variableModifications, all_mods_with_mass);
    unlocalized_lookup = make_unlocalized_lookup(all_mods_with_mass.Concat(new List<Modification> { new Ptm().modification }));
    // NOTE(review): this path is rooted at Environment.CurrentDirectory while the mod files
    // above are read from current_directory -- confirm that the difference is intended.
    load_unlocalized_names(Path.Combine(Environment.CurrentDirectory, "Mods", "stored_mods.modnames"));

    //this is for ptmsets --> used in RELATIONS
    all_possible_ptmsets = PtmCombos.generate_all_ptmsets(2, all_mods_with_mass, Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2).ToList();
    for (int i = 2; i <= Math.Max(ptmset_max_number_of_a_kind, Sweet.lollipop.max_ptms); i++) // the method above doesn't make 2 or more of a kind, so we make it here
    {
        all_possible_ptmsets.AddRange(all_mods_with_mass.Select(m => new PtmSet(Enumerable.Repeat(new Ptm(-1, m), i).ToList(), Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2)));
    }

    //Generate lookup table for ptm sets based on rounded mass of eligible PTMs -- used in forming ET relations
    possible_ptmset_dictionary = make_ptmset_dictionary();

    make_theoretical_proteoforms();
}
/// <summary>
/// Round-trip test: builds a protein that populates every optional field, writes it to an
/// XML database, reads it back, and compares the loaded protein with the original field by
/// field. Both the original and the reloaded protein are expected to expose exactly one
/// modified position (position 5).
/// </summary>
public void TestFullProteinReadWrite()
{
    // The same file serves as the protein's recorded database path and as the write/read target.
    string databasePath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", @"bnueiwhf.xml");

    Modification mod = new Modification("mod1", null, "modType1", null, null, null, null, null, null, null, null, null, null, null);
    ModificationMotif.TryGetMotif("E", out ModificationMotif motif);
    Modification mod2 = new Modification("mod2 on E", null, "modType1", null, motif, "Anywhere.", null, null, null, null, null, null, null, null);
    ModificationMotif.TryGetMotif("N", out ModificationMotif motif3);
    Modification mod3 = new Modification("mod3 on N", null, "modType1", null, motif3, "Anywhere.", null, 10, null, null, null, null, null, null);

    var geneNames = new List<Tuple<string, string>> { new Tuple<string, string>("a", "b") };
    IDictionary<int, List<Modification>> modsByPosition = new Dictionary<int, List<Modification>>
    {
        { 3, new List<Modification> { mod } },
        { 4, new List<Modification> { mod2 } },
        { 5, new List<Modification> { mod3 } }
    };
    var products = new List<ProteolysisProduct> { new ProteolysisProduct(1, 2, "propeptide") };
    string proteinName = "testName";
    string proteinFullName = "testFullName";
    var references = new List<DatabaseReference>
    {
        new DatabaseReference("type1", "id1", new List<Tuple<string, string>> { new Tuple<string, string>("e1", "e2") })
    };
    var variations = new List<SequenceVariation>
    {
        new SequenceVariation(3, "Q", "N", "replace Q by N"),
        new SequenceVariation(3, 4, "QE", "NN", "replace QE by NN")
    };
    var bonds = new List<DisulfideBond>
    {
        new DisulfideBond(1, "ds1"),
        new DisulfideBond(2, 3, "ds2")
    };

    Protein p1 = new Protein(
        "SEQENCE",
        "a1",
        geneNames: geneNames,
        oneBasedModifications: modsByPosition,
        proteolysisProducts: products,
        name: proteinName,
        fullName: proteinFullName,
        isDecoy: false,
        isContaminant: true,
        databaseReferences: references,
        sequenceVariations: variations,
        disulfideBonds: bonds,
        databaseFilePath: databasePath);

    // Generate data for files
    ProteinDbWriter.WriteXmlDatabase(new Dictionary<string, HashSet<Tuple<int, Modification>>>(), new List<Protein> { p1 }, databasePath);

    IEnumerable<string> modTypesToExclude = new List<string>();
    IEnumerable<Modification> allKnownModifications = new List<Modification>();
    List<Protein> ok = ProteinDbLoader.LoadProteinXML(databasePath, true, DecoyType.None, allKnownModifications, true, modTypesToExclude, out Dictionary<string, Modification> unknownModifications);
    Protein loaded = ok[0];

    Assert.AreEqual(p1.Accession, loaded.Accession);
    Assert.AreEqual(p1.BaseSequence, loaded.BaseSequence);

    // database references
    var expectedReference = p1.DatabaseReferences.First();
    var loadedReference = loaded.DatabaseReferences.First();
    Assert.AreEqual(expectedReference.Id, loadedReference.Id);
    Assert.AreEqual(expectedReference.Properties.First().Item1, loadedReference.Properties.First().Item1);
    Assert.AreEqual(expectedReference.Properties.First().Item2, loadedReference.Properties.First().Item2);
    Assert.AreEqual(expectedReference.Type, loadedReference.Type);

    // disulfide bonds: first, then last
    var expectedBond = p1.DisulfideBonds.First();
    var loadedBond = loaded.DisulfideBonds.First();
    Assert.AreEqual(expectedBond.Description, loadedBond.Description);
    Assert.AreEqual(expectedBond.OneBasedBeginPosition, loadedBond.OneBasedBeginPosition);
    Assert.AreEqual(expectedBond.OneBasedEndPosition, loadedBond.OneBasedEndPosition);
    expectedBond = p1.DisulfideBonds.Last();
    loadedBond = loaded.DisulfideBonds.Last();
    Assert.AreEqual(expectedBond.Description, loadedBond.Description);
    Assert.AreEqual(expectedBond.OneBasedBeginPosition, loadedBond.OneBasedBeginPosition);
    Assert.AreEqual(expectedBond.OneBasedEndPosition, loadedBond.OneBasedEndPosition);

    // scalar properties
    Assert.AreEqual(p1.FullDescription, loaded.FullDescription);
    Assert.AreEqual(p1.FullName, loaded.FullName);
    Assert.AreEqual(p1.GeneNames, loaded.GeneNames);
    Assert.AreEqual(p1.IsContaminant, loaded.IsContaminant);
    Assert.AreEqual(p1.IsDecoy, loaded.IsDecoy);
    Assert.AreEqual(p1.Length, loaded.Length);
    Assert.AreEqual(p1.Name, loaded.Name);
    Assert.AreEqual(p1.Organism, loaded.Organism);
    Assert.AreEqual(p1.DatabaseFilePath, loaded.DatabaseFilePath);

    // modifications
    Assert.AreEqual(1, p1.OneBasedPossibleLocalizedModifications.Keys.Count);
    Assert.AreEqual(1, loaded.OneBasedPossibleLocalizedModifications.Keys.Count);
    Assert.AreEqual(p1.OneBasedPossibleLocalizedModifications.Keys.First(), loaded.OneBasedPossibleLocalizedModifications.Keys.First());
    Assert.IsTrue(p1.OneBasedPossibleLocalizedModifications[5][0].Equals(loaded.OneBasedPossibleLocalizedModifications[5][0]));

    // proteolysis products
    var expectedProduct = p1.ProteolysisProducts.First();
    var loadedProduct = loaded.ProteolysisProducts.First();
    Assert.AreEqual(expectedProduct.OneBasedBeginPosition, loadedProduct.OneBasedBeginPosition);
    Assert.AreEqual(expectedProduct.OneBasedEndPosition, loadedProduct.OneBasedEndPosition);
    Assert.AreEqual(expectedProduct.Type, loadedProduct.Type);

    // sequence variations: first, then last
    var expectedVariation = p1.SequenceVariations.First();
    var loadedVariation = loaded.SequenceVariations.First();
    Assert.AreEqual(expectedVariation.Description, loadedVariation.Description);
    Assert.AreEqual(expectedVariation.OneBasedBeginPosition, loadedVariation.OneBasedBeginPosition);
    Assert.AreEqual(expectedVariation.OneBasedEndPosition, loadedVariation.OneBasedEndPosition);
    Assert.AreEqual(expectedVariation.OriginalSequence, loadedVariation.OriginalSequence);
    Assert.AreEqual(expectedVariation.VariantSequence, loadedVariation.VariantSequence);
    expectedVariation = p1.SequenceVariations.Last();
    loadedVariation = loaded.SequenceVariations.Last();
    Assert.AreEqual(expectedVariation.Description, loadedVariation.Description);
    Assert.AreEqual(expectedVariation.OneBasedBeginPosition, loadedVariation.OneBasedBeginPosition);
    Assert.AreEqual(expectedVariation.OneBasedEndPosition, loadedVariation.OneBasedEndPosition);
    Assert.AreEqual(expectedVariation.OriginalSequence, loadedVariation.OriginalSequence);
    Assert.AreEqual(expectedVariation.VariantSequence, loadedVariation.VariantSequence);
}
/// <summary>
/// Rebuilds the theoretical proteoform database: clears previous results, loads the
/// modifications and the protein databases (.xml or .fasta), builds the modification
/// ranks and PTM sets, indexes bottom-up PSMs by accession, and finally calls
/// <c>make_theoretical_proteoforms</c>.
/// </summary>
/// <param name="current_directory">Directory passed to <c>get_mods</c> and used to locate "Mods/stored_mods.modnames".</param>
public void get_theoretical_proteoforms(string current_directory)
{
    // Reset whatever an earlier run left behind.
    foreach (ProteoformCommunity decoy_community in Sweet.lollipop.decoy_proteoform_communities.Values)
    {
        decoy_community.theoretical_proteoforms = new TheoreticalProteoform[0];
    }
    theoretical_proteins.Clear();

    // Load the modifications, then every protein database in turn.
    List<Modification> all_known_modifications = get_mods(current_directory);
    var protein_databases = Sweet.lollipop.get_files(Sweet.lollipop.input_files, Purpose.ProteinDatabase).ToList();
    foreach (var db in protein_databases)
    {
        if (db.extension == ".xml")
        {
            lock (theoretical_proteins)
            {
                var xml_proteins = ProteinDbLoader.LoadProteinXML(db.complete_path, true, DecoyType.None, all_known_modifications, db.ContaminantDB, Sweet.lollipop.mod_types_to_exclude, out Dictionary<string, Modification> um).ToArray();
                theoretical_proteins.Add(db, xml_proteins);
            }

            // Modifications listed in the XML itself are added too, minus the excluded types.
            lock (all_known_modifications)
            {
                var embedded_mods = ProteinDbLoader.GetPtmListFromProteinXml(db.complete_path)
                    .Where(m => !Sweet.lollipop.mod_types_to_exclude.Contains(m.ModificationType));
                all_known_modifications.AddRange(embedded_mods);
            }
        }
        else if (db.extension == ".fasta")
        {
            lock (theoretical_proteins)
            {
                var fasta_proteins = ProteinDbLoader.LoadProteinFasta(db.complete_path, true, DecoyType.None, db.ContaminantDB, ProteinDbLoader.UniprotAccessionRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotFullNameRegex, ProteinDbLoader.UniprotGeneNameRegex, ProteinDbLoader.UniprotOrganismRegex, out var dbErrors).ToArray();
                theoretical_proteins.Add(db, fasta_proteins);
            }
        }
    }

    Sweet.lollipop.modification_ranks = rank_mods(theoretical_proteins, variableModifications, all_mods_with_mass);
    var mods_plus_default = all_mods_with_mass.Concat(new List<Modification> { new Ptm().modification });
    unlocalized_lookup = make_unlocalized_lookup(mods_plus_default);
    load_unlocalized_names(Path.Combine(current_directory, "Mods", "stored_mods.modnames"));

    // PTM sets, used when forming relations.
    all_possible_ptmsets = PtmCombos.generate_all_ptmsets(2, all_mods_with_mass, Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2).ToList();

    // generate_all_ptmsets does not produce two or more of the same modification, so add those here.
    for (int copies = 2; copies <= Math.Max(ptmset_max_number_of_a_kind, Sweet.lollipop.max_ptms); copies++)
    {
        all_possible_ptmsets.AddRange(all_mods_with_mass.Select(m => new PtmSet(Enumerable.Repeat(new Ptm(-1, m), copies).ToList(), Sweet.lollipop.modification_ranks, Sweet.lollipop.mod_rank_first_quartile / 2)));
    }

    // Lookup table of PTM sets by rounded mass of eligible PTMs, used in forming ET relations.
    possible_ptmset_dictionary = make_ptmset_dictionary();

    // Read the bottom-up PSMs and group them by accession.
    bottom_up_psm_by_accession.Clear();
    foreach (var file in Sweet.lollipop.input_files.Where(f => f.purpose == Purpose.BottomUp))
    {
        foreach (var psm in Sweet.lollipop.bottomupReader.ReadTDFile(file))
        {
            // The key is the accession text before the first '_' and before the first '-'.
            string before_underscore = psm.accession.Split('_')[0];
            string accession = before_underscore.Split('-')[0];

            bottom_up_psm_by_accession.TryGetValue(accession, out var psms);
            if (psms != null)
            {
                psms.Add(psm);
                continue;
            }

            bottom_up_psm_by_accession.Add(accession, new List<SpectrumMatch>() { psm });
        }
    }

    // Make the theoreticals.
    make_theoretical_proteoforms();
}
/// <summary>
/// End-to-end test of the per-modification-type write selection for the pruned database.
/// Four modification types are registered with selections 0 to 3, an input XML database
/// and a synthetic mzML are written, the search task is run through
/// <c>EverythingRunnerEngine</c>, and the resulting "selectedModspruned.xml" is checked:
/// the "Mod"-type modification must be absent, while the "Common Fixed", "Glycan" and
/// "missing" modifications must be present at positions 2, 3 and 11.
/// </summary>
public static void TestUserModSelectionInPrunedDB()
{
    List<(string, string)> listOfModsFixed = new List<(string, string)> { ("Common Fixed", "Carbamidomethyl of C"), ("Common Fixed", "Carbamidomethyl of U") };

    //Create Search Task
    SearchTask task5 = new SearchTask
    {
        SearchParameters = new SearchParameters
        {
            WritePrunedDatabase = true,
            SearchTarget = true,
            MassDiffAcceptorType = MassDiffAcceptorType.Exact,
        },
        CommonParameters = new CommonParameters(listOfModsFixed: listOfModsFixed)
    };

    // One selection value per modification type exercised by this test.
    task5.SearchParameters.ModsToWriteSelection["Mod"] = 0;
    task5.SearchParameters.ModsToWriteSelection["Common Fixed"] = 1;
    task5.SearchParameters.ModsToWriteSelection["Glycan"] = 2;
    task5.SearchParameters.ModsToWriteSelection["missing"] = 3;

    //add task 1 to task list
    List<(string, MetaMorpheusTask)> taskList = new List<(string, MetaMorpheusTask)> { ("task5", task5) };

    ModificationMotif.TryGetMotif("P", out ModificationMotif motif);
    ModificationMotif.TryGetMotif("E", out ModificationMotif motif2);

    var connorMod = new Modification(_originalId: "ModToNotAppear", _modificationType: "Mod", _target: motif, _locationRestriction: "Anywhere.", _monoisotopicMass: 10);
    var connorMod2 = new Modification(_originalId: "Default(Mod in DB and Observed)", _modificationType: "Common Fixed", _target: motif, _locationRestriction: "Anywhere.", _monoisotopicMass: 10);
    var connorMod3 = new Modification(_originalId: "ModToAlwaysAppear", _modificationType: "Glycan", _target: motif, _locationRestriction: "Anywhere.", _monoisotopicMass: 10);
    var connorMod4 = new Modification(_originalId: "ModObservedNotinDB", _modificationType: "missing", _target: motif2, _locationRestriction: "Anywhere.", _monoisotopicMass: 5);

    GlobalVariables.AddMods(new List<Modification> { connorMod, connorMod2, connorMod3, connorMod4 }, false);

    //create modification lists
    List<Modification> variableModifications = GlobalVariables.AllModsKnown.OfType<Modification>().Where(b => task5.CommonParameters.ListOfModsVariable.Contains
        ((b.ModificationType, b.IdWithMotif))).ToList();
    List<Modification> fixedModifications = GlobalVariables.AllModsKnown.OfType<Modification>().Where(b => task5.CommonParameters.ListOfModsFixed.Contains
        ((b.ModificationType, b.IdWithMotif))).ToList();

    //add modification to Protein object
    var dictHere = new Dictionary<int, List<Modification>>();

    Modification modToAdd = connorMod;
    Modification modToAdd2 = connorMod2;
    Modification modToAdd3 = connorMod3;
    Modification modToAdd4 = connorMod4;

    //add fixed modification so we can test a mod that is observed but not in the DB
    fixedModifications.Add(connorMod4);
    listOfModsFixed.Add((connorMod4.ModificationType, connorMod4.IdWithMotif));

    dictHere.Add(1, new List<Modification> { modToAdd });
    dictHere.Add(2, new List<Modification> { modToAdd2 }); //default
    dictHere.Add(3, new List<Modification> { modToAdd3 }); //always appear

    var dictHere2 = new Dictionary<int, List<Modification>>
    {
        { 1, new List<Modification> { modToAdd } },
        { 2, new List<Modification> { modToAdd2 } }, //default
        { 3, new List<Modification> { modToAdd3 } }, //always appear
        { 4, new List<Modification> { modToAdd4 } }  //observed
    };

    //protein creation (one for the input DB, one carrying the observed-only mod as well)
    Protein TestProteinWithModForDB = new Protein("PPPPPPPPPPE", "accession1", "organism", new List<Tuple<string, string>>(), dictHere);
    Protein TestProteinWithModObsevred = new Protein("PPPPPPPPPPE", "accession1", "organism", new List<Tuple<string, string>>(), dictHere2);

    //First Write XML Database
    string xmlName = "selectedMods.xml";
    string xmlName2 = "selectedModsObvs.xml";

    //Add Mod to list and write XML input database
    Dictionary<string, HashSet<Tuple<int, Modification>>> modList = new Dictionary<string, HashSet<Tuple<int, Modification>>>();
    var Hash = new HashSet<Tuple<int, Modification>>
    {
        new Tuple<int, Modification>(1, modToAdd),
        new Tuple<int, Modification>(2, modToAdd2),
        new Tuple<int, Modification>(3, modToAdd3),
        new Tuple<int, Modification>(4, modToAdd4), //Observed Only
    };

    modList.Add("test", Hash);
    ProteinDbWriter.WriteXmlDatabase(modList, new List<Protein> { TestProteinWithModForDB }, xmlName);

    //Add Observed Only
    modList.Add("test2", Hash);
    ProteinDbWriter.WriteXmlDatabase(modList, new List<Protein> { TestProteinWithModObsevred }, xmlName2);

    //now create MZML data
    var protein = ProteinDbLoader.LoadProteinXML(xmlName2, true, DecoyType.Reverse, new List<Modification>(), false, new List<string>(), out Dictionary<string, Modification> ok);
    var digestedList = protein[0].Digest(task5.CommonParameters.DigestionParams, fixedModifications, variableModifications).ToList();

    // The first five digestion products become the spectra of the synthetic data file.
    PeptideWithSetModifications pepWithSetMods1 = digestedList[0];
    PeptideWithSetModifications pepWithSetMods2 = digestedList[1];
    PeptideWithSetModifications pepWithSetMods3 = digestedList[2];
    PeptideWithSetModifications pepWithSetMods4 = digestedList[3];
    PeptideWithSetModifications pepWithSetMods5 = digestedList[4];

    //CUSTOM PEP
    MsDataFile myMsDataFile = new TestDataFile(new List<PeptideWithSetModifications> { pepWithSetMods1, pepWithSetMods2, pepWithSetMods3, pepWithSetMods4, pepWithSetMods5 });
    string mzmlName = @"newMzml.mzML";
    IO.MzML.MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(myMsDataFile, mzmlName, false);

    //make sure this runs correctly
    //run!
    string outputFolder = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestUserModSelectionInPrunedDB");
    var engine = new EverythingRunnerEngine(taskList, new List<string> { mzmlName }, new List<DbForTask> { new DbForTask(xmlName, false) }, outputFolder);
    engine.Run();

    // NOTE(review): the pruned database is read from MySetUpClass.outputFolder rather than
    // the local outputFolder -- presumably set while the engine runs; confirm.
    string final = Path.Combine(MySetUpClass.outputFolder, "task5", "selectedModspruned.xml");
    var proteins = ProteinDbLoader.LoadProteinXML(final, true, DecoyType.Reverse, new List<Modification>(), false, new List<string>(), out ok);
    var Dlist = proteins[0].GetVariantProteins().SelectMany(vp => vp.Digest(task5.CommonParameters.DigestionParams, fixedModifications, variableModifications)).ToList();

    // Assertions use NUnit's (expected, actual) order so failure messages read correctly.
    Assert.AreEqual(1, Dlist[0].NumFixedMods);

    //check length
    Assert.AreEqual(3, proteins[0].OneBasedPossibleLocalizedModifications.Count);
    List<Modification> listOfLocalMods = new List<Modification>();
    listOfLocalMods.AddRange(proteins[0].OneBasedPossibleLocalizedModifications[2]);
    listOfLocalMods.AddRange(proteins[0].OneBasedPossibleLocalizedModifications[3]);
    listOfLocalMods.AddRange(proteins[0].OneBasedPossibleLocalizedModifications[11]);

    //check Type, count, ID
    Assert.AreEqual("Common Fixed", listOfLocalMods[0].ModificationType);
    Assert.AreEqual("missing", listOfLocalMods[2].ModificationType);
    Assert.IsFalse(listOfLocalMods.Contains(connorMod)); //make sure that mod set not to show up is not in mod list

    Assert.AreEqual("Default(Mod in DB and Observed) on P", listOfLocalMods[0].IdWithMotif);
    Assert.AreEqual("ModToAlwaysAppear on P", listOfLocalMods[1].IdWithMotif);
    //Makes sure Mod that was not in the DB but was observed is in pruned DB
    Assert.AreEqual("ModObservedNotinDB on E", listOfLocalMods[2].IdWithMotif);
    Assert.AreEqual(3, listOfLocalMods.Count);

    // Remove the files and folders this test created.
    Directory.Delete(outputFolder, true);
    File.Delete(mzmlName);
    File.Delete(xmlName);
    File.Delete(xmlName2);
}
/// <summary>
/// Loads a source and a destination protein XML database, writes two tab-separated
/// tables describing the variant-containing entries of the destination database,
/// and prints entry, modification, and per-category variant counts to the console.
/// </summary>
/// <param name="sourceXmlPath">Protein XML loaded as the reference database; only its modification total is reported.</param>
/// <param name="destinationXmlPath">Protein XML database being summarized.</param>
/// <param name="destinationAccessionToNameTable">Output file: one "accession, full name, base sequence" row (tab-separated) per entry that has applied sequence variations.</param>
/// <param name="variantDescriptionTable">Output file: one "accession, variant, description" row (tab-separated) per applied sequence variation.</param>
/// <param name="target">When true both databases are loaded with DecoyType.None; otherwise with DecoyType.Reverse.</param>
public static void DatabaseSummary(string sourceXmlPath, string destinationXmlPath, string destinationAccessionToNameTable, string variantDescriptionTable, bool target)
{
    // The annotation terms are fixed identifiers, so match them ordinally instead of
    // with the current UI culture (whose casing rules can differ, e.g. for 'i'/'I').
    bool Mentions(string annotation, string term) => annotation.IndexOf(term, StringComparison.OrdinalIgnoreCase) >= 0;

    DecoyType decoyType = target ? DecoyType.None : DecoyType.Reverse;
    var uniprotPtms = ProteinAnnotation.GetUniProtMods(Environment.CurrentDirectory);
    var uniprot = ProteinDbLoader.LoadProteinXML(sourceXmlPath, true, decoyType, uniprotPtms, false, null, out var un);
    var spritz = ProteinDbLoader.LoadProteinXML(destinationXmlPath, true, decoyType, uniprotPtms, false, null, out un);
    var spritzCanonical = spritz.Select(p => p.NonVariantProtein).Distinct().ToList();

    int numberOfCanonicalProteinEntries = spritzCanonical.Count;
    int numberOfVariantProteinEntries = spritz.Count - spritzCanonical.Count;
    int synonymousCount = 0;
    int totalVariants = 0;
    int missenseSnvCount = 0;
    int missenseMnvCount = 0;
    int insertionCount = 0;
    int deletionCount = 0;
    int frameshiftCount = 0;
    int stopGainCount = 0;
    int stopLossCount = 0;

    List<string> accessionNameList = new List<string>();
    List<string> variantDescList = new List<string>();

    // Distinct applied variations, grouped by the accession of the non-variant protein.
    Dictionary<string, List<SequenceVariation>> allVariants = new Dictionary<string, List<SequenceVariation>>();

    foreach (var spritzEntry in spritz)
    {
        if (spritzEntry.AppliedSequenceVariations.Count == 0)
        {
            continue;
        }

        // Rows for the two pivot tables
        accessionNameList.Add($"{spritzEntry.Accession}\t{spritzEntry.FullName}\t{spritzEntry.BaseSequence}");
        foreach (SequenceVariation variant in spritzEntry.AppliedSequenceVariations)
        {
            variantDescList.Add($"{spritzEntry.Accession}\t{variant.SimpleString()}\t{variant.Description}");
        }

        string canonicalAccession = spritzEntry.NonVariantProtein.Accession;
        if (allVariants.TryGetValue(canonicalAccession, out List<SequenceVariation> knownVariants))
        {
            foreach (SequenceVariation variant in spritzEntry.AppliedSequenceVariations)
            {
                if (!knownVariants.Contains(variant))
                {
                    knownVariants.Add(variant);
                }
            }
        }
        else
        {
            // Store a copy: later entries append to this list, and that must not
            // modify the protein's own AppliedSequenceVariations.
            allVariants.Add(canonicalAccession, new List<SequenceVariation>(spritzEntry.AppliedSequenceVariations));
        }
    }

    File.WriteAllLines(destinationAccessionToNameTable, accessionNameList);
    File.WriteAllLines(variantDescriptionTable, variantDescList);

    // Each variant is counted in the first category whose term appears in its description.
    foreach (var entry in allVariants)
    {
        foreach (var variant in entry.Value)
        {
            string annotation = variant.Description.Description;

            if (Mentions(annotation, "synonymous_variant"))
            {
                synonymousCount++;
                totalVariants++;
            }
            else if (Mentions(annotation, "missense_variant"))
            {
                // A missense variant with single-character reference and alternate
                // alleles is tallied as an SNV, any other missense variant as an MNV.
                bool singleNucleotide = variant.Description.ReferenceAlleleString.Length == 1
                    && variant.Description.AlternateAlleleString.Length == 1;
                if (singleNucleotide)
                {
                    missenseSnvCount++;
                }
                else
                {
                    missenseMnvCount++;
                }
                totalVariants++;
            }
            else if (Mentions(annotation, "frameshift_variant"))
            {
                frameshiftCount++;
                totalVariants++;
            }
            else if (Mentions(annotation, "stop_gained"))
            {
                stopGainCount++;
                totalVariants++;
            }
            else if (Mentions(annotation, "conservative_inframe_insertion") || Mentions(annotation, "disruptive_inframe_insertion"))
            {
                insertionCount++;
                totalVariants++;
            }
            else if (Mentions(annotation, "conservative_inframe_deletion") || Mentions(annotation, "disruptive_inframe_deletion"))
            {
                deletionCount++;
                totalVariants++;
            }
            else if (Mentions(annotation, "stop_lost"))
            {
                stopLossCount++;
                totalVariants++;
            }
        }
    }

    int appendedModificationCount = spritzCanonical.Sum(p => p.OneBasedPossibleLocalizedModifications.Values.Sum(b => b.Count));
    int sourceModificationCount = uniprot.Sum(p => p.OneBasedPossibleLocalizedModifications.Values.Sum(b => b.Count));

    Console.WriteLine($"Spritz Database Summary");
    Console.WriteLine($"--------------------------------------------------------------");
    Console.WriteLine($"{numberOfCanonicalProteinEntries}\tTotal number of canonical protein entries (before applying variations)");
    Console.WriteLine($"{spritz.Count}\tTotal number of protein entries");
    Console.WriteLine($"{appendedModificationCount}\tTotal modifications appended from UniProt out of {sourceModificationCount}");
    Console.WriteLine($"{numberOfVariantProteinEntries}\tTotal number of variant containing protein entries");
    Console.WriteLine($"{totalVariants}\tTotal number of unique variants");
    Console.WriteLine($"{synonymousCount}\tTotal number of unique synonymous variants");
    Console.WriteLine($"{(totalVariants - synonymousCount)}\tTotal number of unique nonsynonymous variants");
    Console.WriteLine($"{missenseSnvCount}\tNumber of unique SNV missense variants");
    Console.WriteLine($"{missenseMnvCount}\tNumber of unique MNV missense variants");
    Console.WriteLine($"{frameshiftCount}\tNumber of unique frameshift variants");
    Console.WriteLine($"{insertionCount}\tNumber of unique insertion variants");
    Console.WriteLine($"{deletionCount}\tNumber of unique deletion variants");
    Console.WriteLine($"{stopGainCount}\tNumber of unique stop gain variants");
    Console.WriteLine($"{stopLossCount}\tNumber of unique stop loss variants");
}
/// <summary>
/// Writes a one-protein XML database carrying "ConnorMod on P" at residues 1 and 3,
/// builds an mzML file from a peptide modified at residue 3, runs a search task with
/// WritePrunedDatabase enabled, and checks that the pruned database keeps only the
/// modification at residue 3.
/// </summary>
public static void TestPrunedDatabase()
{
    //Create Search Task
    SearchTask task1 = new SearchTask
    {
        SearchParameters = new SearchParameters
        {
            WritePrunedDatabase = true,
            SearchTarget = true,
            MassDiffAcceptorType = MassDiffAcceptorType.Exact,
            ModsToWriteSelection = new Dictionary<string, int>
            {
                { "ConnorModType", 1 }
            }
        },
        CommonParameters = new CommonParameters(digestionParams: new DigestionParams(minPeptideLength: 5))
    };

    //add task to task list
    List<(string, MetaMorpheusTask)> taskList = new List<(string, MetaMorpheusTask)> { ("task1", task1) };

    ModificationMotif.TryGetMotif("P", out ModificationMotif motif);
    var connorMod = new Modification(_originalId: "ConnorMod on P", _modificationType: "ConnorModType", _target: motif, _locationRestriction: "Anywhere.", _monoisotopicMass: 10);
    GlobalVariables.AddMods(new List<Modification> { connorMod }, false);

    //create modification lists
    List<Modification> variableModifications = GlobalVariables.AllModsKnown.OfType<Modification>()
        .Where(b => task1.CommonParameters.ListOfModsVariable.Contains((b.ModificationType, b.IdWithMotif))).ToList();

    //add modification to Protein object (same mod at residues 1 and 3)
    var dictHere = new Dictionary<int, List<Modification>>();
    Modification modToAdd = connorMod;
    Modification modToAdd2 = connorMod;
    dictHere.Add(1, new List<Modification> { modToAdd });
    dictHere.Add(3, new List<Modification> { modToAdd2 });

    //protein creation
    Protein TestProteinWithMod = new Protein("PEPTID", "accession1", "organism", new List<Tuple<string, string>>(), dictHere);

    //First Write XML Database
    string xmlName = "okkk.xml";

    //Add Mod to list and write XML input database
    Dictionary<string, HashSet<Tuple<int, Modification>>> modList = new Dictionary<string, HashSet<Tuple<int, Modification>>>();
    var Hash = new HashSet<Tuple<int, Modification>>
    {
        new Tuple<int, Modification>(3, modToAdd)
    };
    modList.Add("test", Hash);
    ProteinDbWriter.WriteXmlDatabase(modList, new List<Protein> { TestProteinWithMod }, xmlName);

    //now read the database back before writing the MZML file
    var protein = ProteinDbLoader.LoadProteinXML(xmlName, true, DecoyType.Reverse, new List<Modification>(), false, new List<string>(), out Dictionary<string, Modification> ok);

    //Dictionary 'ok' contains unknown modifications. There are no unknown modifications in this test.
    Assert.AreEqual(0, ok.Count);

    //One protein is read from the .xml database and one decoy is created. Therefore, the list of proteins contains 2 entries.
    Assert.AreEqual(2, protein.Count);

    //The original database had two localized mods on the protein. Therefore, both protein and decoy should have two mods.
    Assert.AreEqual(2, protein[0].OneBasedPossibleLocalizedModifications.Count);
    List<int> foundResidueIndicies = protein[0].OneBasedPossibleLocalizedModifications.Select(k => k.Key).ToList();
    List<int> expectedResidueIndices = new List<int>() { 1, 3 };
    Assert.That(foundResidueIndicies, Is.EquivalentTo(expectedResidueIndices));

    Assert.AreEqual(2, protein[1].OneBasedPossibleLocalizedModifications.Count);
    foundResidueIndicies = protein[1].OneBasedPossibleLocalizedModifications.Select(k => k.Key).ToList();
    expectedResidueIndices = new List<int>() { 4, 6 }; //originally modified residues are now at the end in the decoy
    Assert.That(foundResidueIndicies, Is.EquivalentTo(expectedResidueIndices));

    var digestedList = protein[0].Digest(task1.CommonParameters.DigestionParams, new List<Modification> { }, variableModifications).ToList();
    Assert.AreEqual(4, digestedList.Count);

    //Set Peptide with 1 mod at position 3
    PeptideWithSetModifications pepWithSetMods1 = digestedList[1];
    Assert.AreEqual("PEP[ConnorModType:ConnorMod on P]TID", pepWithSetMods1.FullSequence);

    //Finally Write MZML file
    MsDataFile myMsDataFile = new TestDataFile(new List<PeptideWithSetModifications> { pepWithSetMods1 });
    string mzmlName = @"hello.mzML";
    IO.MzML.MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(myMsDataFile, mzmlName, false);

    //run!
    string outputFolder = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestPrunedDatabase");
    var engine = new EverythingRunnerEngine(taskList, new List<string> { mzmlName }, new List<DbForTask> { new DbForTask(xmlName, false) }, outputFolder);
    engine.Run();

    string final = Path.Combine(MySetUpClass.outputFolder, "task1", "okkkpruned.xml");
    var proteins = ProteinDbLoader.LoadProteinXML(final, true, DecoyType.Reverse, new List<Modification>(), false, new List<string>(), out ok);

    //check length
    Assert.AreEqual(1, proteins[0].OneBasedPossibleLocalizedModifications.Count);

    //check location (key)
    Assert.IsTrue(proteins[0].OneBasedPossibleLocalizedModifications.ContainsKey(3));
    List<Modification> listOfMods = proteins[0].OneBasedPossibleLocalizedModifications[3];

    //check Type, ID, count (expected value first so failure messages read correctly)
    Assert.AreEqual("ConnorModType", listOfMods[0].ModificationType);
    Assert.AreEqual("ConnorMod on P", listOfMods[0].IdWithMotif);
    Assert.AreEqual(1, listOfMods.Count);

    //remove the files and folder this test created
    Directory.Delete(outputFolder, true);
    File.Delete(xmlName);
    File.Delete(mzmlName);
}
/// <summary>
/// Loads a source and a destination protein XML database (without decoys), counts the
/// distinct applied sequence variations of the destination database by annotation
/// category, and writes the counts to "SpritzDatabaseSummary.txt" in the directory of
/// <paramref name="destinationXmlPath"/>.
/// </summary>
/// <param name="sourceXmlPath">Protein XML database that is loaded first; none of the reported figures use its contents.</param>
/// <param name="destinationXmlPath">Protein XML database being summarized; its directory receives the summary file.</param>
public static void DatabaseSummary(string sourceXmlPath, string destinationXmlPath)
{
    // The annotation terms are fixed identifiers, so match them ordinally instead of
    // with the current UI culture (whose casing rules can differ, e.g. for 'i'/'I').
    bool Mentions(string annotation, string term) => annotation.IndexOf(term, StringComparison.OrdinalIgnoreCase) >= 0;

    var uniprotPtms = ProteinAnnotation.GetUniProtMods(Environment.CurrentDirectory);

    // NOTE(review): the source database is loaded but its proteins are never read;
    // the call is kept so a missing or unreadable source still fails as before.
    var uniprot = ProteinDbLoader.LoadProteinXML(sourceXmlPath, true, DecoyType.None, uniprotPtms, false, null, out var un);
    var spritz = ProteinDbLoader.LoadProteinXML(destinationXmlPath, true, DecoyType.None, uniprotPtms, false, null, out un);
    var spritzCanonical = spritz.Select(p => p.NonVariantProtein).Distinct().ToList();

    int numberOfCanonicalProteinEntries = spritzCanonical.Count;
    int numberOfVariantProteinEntries = spritz.Count - spritzCanonical.Count;
    int synonymousCount = 0;
    int totalVariants = 0;
    int missenseCount = 0;
    int insertionCount = 0;
    int deletionCount = 0;
    int frameshiftCount = 0;
    int stopGainCount = 0;
    int stopLossCount = 0;

    // Distinct applied variations, grouped by the accession of the non-variant protein.
    Dictionary<string, List<SequenceVariation>> allVariants = new Dictionary<string, List<SequenceVariation>>();

    foreach (var spritzEntry in spritz)
    {
        if (spritzEntry.AppliedSequenceVariations.Count == 0)
        {
            continue;
        }

        string canonicalAccession = spritzEntry.NonVariantProtein.Accession;
        if (allVariants.TryGetValue(canonicalAccession, out List<SequenceVariation> knownVariants))
        {
            foreach (var variant in spritzEntry.AppliedSequenceVariations)
            {
                if (!knownVariants.Contains(variant))
                {
                    knownVariants.Add(variant);
                }
            }
        }
        else
        {
            // Store a copy: later entries append to this list, and that must not
            // modify the protein's own AppliedSequenceVariations.
            allVariants.Add(canonicalAccession, new List<SequenceVariation>(spritzEntry.AppliedSequenceVariations));
        }
    }

    // Each variant is counted once, in the first category whose term appears in its
    // description, so totalVariants stays a count of variants rather than of matches.
    foreach (var entry in allVariants)
    {
        foreach (var variant in entry.Value)
        {
            string annotation = variant.Description.Description;

            if (Mentions(annotation, "synonymous_variant"))
            {
                synonymousCount++;
                totalVariants++;
            }
            else if (Mentions(annotation, "missense_variant"))
            {
                missenseCount++;
                totalVariants++;
            }
            else if (Mentions(annotation, "frameshift_variant"))
            {
                frameshiftCount++;
                totalVariants++;
            }
            else if (Mentions(annotation, "stop_gained"))
            {
                stopGainCount++;
                totalVariants++;
            }
            else if (Mentions(annotation, "conservative_inframe_insertion") || Mentions(annotation, "disruptive_inframe_insertion"))
            {
                insertionCount++;
                totalVariants++;
            }
            else if (Mentions(annotation, "conservative_inframe_deletion") || Mentions(annotation, "disruptive_inframe_deletion"))
            {
                deletionCount++;
                totalVariants++;
            }
            else if (Mentions(annotation, "stop_lost")) // the annotation term is "stop_lost", not "stop_loss"
            {
                stopLossCount++;
                totalVariants++;
            }
        }
    }

    // Sized to exactly the lines produced, so no blank lines trail the report.
    string[] summary =
    {
        $"Spritz Database Summary",
        $"--------------------------------------------------------------",
        $"Total number of protein entries in the database: {spritz.Count}",
        $"Total number of canonical protein entries in the database: {numberOfCanonicalProteinEntries}",
        $"Total number of variant containing protein entries in the database: {numberOfVariantProteinEntries}",
        $" Total number of unique variants in the database: {totalVariants}",
        $" Total number of unique synonymous variants in the database: {synonymousCount}",
        $" Total number of unique nonsynonymous variants in the database: {(totalVariants - synonymousCount)}",
        $" Number of unique missense variants in the database: {missenseCount}",
        $" Number of unique frameshift variants in the database: {frameshiftCount}",
        $" Number of unique insertion variants in the database: {insertionCount}",
        $" Number of unique deletion variants in the database: {deletionCount}",
        $" Number of unique stop gain variants in the database: {stopGainCount}",
        $" Number of unique stop loss variants in the database: {stopLossCount}"
    };

    File.WriteAllLines(Path.Combine(Path.GetDirectoryName(destinationXmlPath), "SpritzDatabaseSummary.txt"), summary);
}