Ejemplo n.º 1
0
        /// <summary>
        /// Writes a protein database in mzLibProteinDb XML format, adding the extra modifications supplied in
        /// <paramref name="additionalModsToAddToProteins"/> to the matching entries.
        /// </summary>
        /// <remarks>
        /// Only the de-duplicated <c>NonVariantProtein</c> of each input protein is written as an entry; its sequence
        /// variations are emitted as "sequence variant" features of that entry.
        /// </remarks>
        /// <param name="additionalModsToAddToProteins">
        /// Extra (one-based residue index, modification) pairs keyed by accession. A key is matched either against a protein
        /// accession or against the accession <c>VariantApplication.GetAccession</c> yields for one of its sequence variations.
        /// May be null, which is treated as an empty dictionary.
        /// </param>
        /// <param name="proteinList">Proteins to write.</param>
        /// <param name="outputFileName">Path handed to <c>XmlWriter.Create</c> for the output file.</param>
        /// <returns>
        /// For each additional modification (keyed by its <c>IdWithMotif</c>), how many "modified residue" entries were newly
        /// added because of <paramref name="additionalModsToAddToProteins"/>.
        /// </returns>
        public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSet<Tuple<int, Modification>>> additionalModsToAddToProteins,
                                                               List<Protein> proteinList, string outputFileName)
        {
            additionalModsToAddToProteins = additionalModsToAddToProteins ?? new Dictionary<string, HashSet<Tuple<int, Modification>>>();

            // write nonvariant proteins (for cases where variants aren't applied, this just gets the protein itself)
            List<Protein> nonVariantProteins = proteinList.Select(p => p.NonVariantProtein).Distinct().ToList();

            var xmlWriterSettings = new XmlWriterSettings
            {
                Indent      = true,
                IndentChars = "  "
            };

            Dictionary<string, int> newModResEntries = new Dictionary<string, int>();

            using (XmlWriter writer = XmlWriter.Create(outputFileName, xmlWriterSettings))
            {
                writer.WriteStartDocument();
                writer.WriteStartElement("mzLibProteinDb");

                // Every accession that can carry additional mods in this file: each protein's own accession plus the
                // accession of each of its single sequence variations. Built once so the filter below is a hash lookup
                // instead of a rescan of all proteins per dictionary key.
                var writtenAccessions = new HashSet<string>(
                    nonVariantProteins.SelectMany(p => p.SequenceVariations
                        .Select(sv => VariantApplication.GetAccession(p, new[] { sv }))
                        .Concat(new[] { p.Accession })));

                // All modification definitions referenced by any entry, variant, or applicable additional mod.
                HashSet<Modification> allRelevantModifications = new HashSet<Modification>(
                    nonVariantProteins
                        .SelectMany(p => p.SequenceVariations.SelectMany(sv => sv.OneBasedModifications).Concat(p.OneBasedPossibleLocalizedModifications).SelectMany(kv => kv.Value))
                        .Concat(additionalModsToAddToProteins
                            .Where(kv => writtenAccessions.Contains(kv.Key))
                            .SelectMany(kv => kv.Value.Select(v => v.Item2))));

                foreach (Modification mod in allRelevantModifications.OrderBy(m => m.IdWithMotif))
                {
                    writer.WriteStartElement("modification");
                    writer.WriteString(mod.ToString() + Environment.NewLine + "//");
                    writer.WriteEndElement();
                }

                foreach (Protein protein in nonVariantProteins)
                {
                    writer.WriteStartElement("entry");
                    writer.WriteStartElement("accession");
                    writer.WriteString(protein.Accession);
                    writer.WriteEndElement();

                    if (protein.Name != null)
                    {
                        writer.WriteStartElement("name");
                        writer.WriteString(protein.Name);
                        writer.WriteEndElement();
                    }

                    if (protein.FullName != null)
                    {
                        writer.WriteStartElement("protein");
                        writer.WriteStartElement("recommendedName");
                        writer.WriteStartElement("fullName");
                        writer.WriteString(protein.FullName);
                        writer.WriteEndElement(); // fullName
                        writer.WriteEndElement(); // recommendedName
                        writer.WriteEndElement(); // protein
                    }

                    writer.WriteStartElement("gene");
                    foreach (var gene_name in protein.GeneNames)
                    {
                        writer.WriteStartElement("name");
                        writer.WriteAttributeString("type", gene_name.Item1);
                        writer.WriteString(gene_name.Item2);
                        writer.WriteEndElement();
                    }
                    writer.WriteEndElement(); // gene

                    if (protein.Organism != null)
                    {
                        writer.WriteStartElement("organism");
                        writer.WriteStartElement("name");
                        writer.WriteAttributeString("type", "scientific");
                        writer.WriteString(protein.Organism);
                        writer.WriteEndElement(); // name
                        writer.WriteEndElement(); // organism
                    }

                    foreach (var dbRef in protein.DatabaseReferences)
                    {
                        writer.WriteStartElement("dbReference");
                        writer.WriteAttributeString("type", dbRef.Type);
                        writer.WriteAttributeString("id", dbRef.Id);
                        foreach (Tuple<string, string> property in dbRef.Properties)
                        {
                            writer.WriteStartElement("property");
                            writer.WriteAttributeString("type", property.Item1);
                            writer.WriteAttributeString("value", property.Item2);
                            writer.WriteEndElement();
                        }
                        writer.WriteEndElement(); // dbReference
                    }

                    // Proteolysis products always get an explicit begin/end pair, even when both are equal.
                    foreach (var proteolysisProduct in protein.ProteolysisProducts)
                    {
                        writer.WriteStartElement("feature");
                        writer.WriteAttributeString("type", proteolysisProduct.Type);
                        writer.WriteStartElement("location");
                        WritePositionElement(writer, "begin", proteolysisProduct.OneBasedBeginPosition.ToString());
                        WritePositionElement(writer, "end", proteolysisProduct.OneBasedEndPosition.ToString());
                        writer.WriteEndElement(); // location
                        writer.WriteEndElement(); // feature
                    }

                    // Modified residues of the protein itself (seqvar argument is null), in residue order.
                    foreach (var hm in GetModsForThisProtein(protein, null, additionalModsToAddToProteins, newModResEntries).OrderBy(b => b.Key))
                    {
                        foreach (var modId in hm.Value)
                        {
                            writer.WriteStartElement("feature");
                            writer.WriteAttributeString("type", "modified residue");
                            writer.WriteAttributeString("description", modId);
                            writer.WriteStartElement("location");
                            WritePositionElement(writer, "position", hm.Key.ToString(CultureInfo.InvariantCulture));
                            writer.WriteEndElement(); // location
                            writer.WriteEndElement(); // feature
                        }
                    }

                    foreach (var hm in protein.SequenceVariations)
                    {
                        writer.WriteStartElement("feature");
                        writer.WriteAttributeString("type", "sequence variant");
                        writer.WriteAttributeString("description", hm.Description.ToString());
                        writer.WriteStartElement("original");
                        writer.WriteString(hm.OriginalSequence);
                        writer.WriteEndElement(); // original
                        writer.WriteStartElement("variation");
                        writer.WriteString(hm.VariantSequence);
                        writer.WriteEndElement(); // variation
                        writer.WriteStartElement("location");
                        WriteLocationBounds(writer, hm.OneBasedBeginPosition == hm.OneBasedEndPosition,
                                            hm.OneBasedBeginPosition.ToString(), hm.OneBasedEndPosition.ToString());

                        // Mods that belong to this variation are nested as subfeatures inside the still-open <location> element.
                        foreach (var hmm in GetModsForThisProtein(protein, hm, additionalModsToAddToProteins, newModResEntries).OrderBy(b => b.Key))
                        {
                            foreach (var modId in hmm.Value)
                            {
                                writer.WriteStartElement("subfeature");
                                writer.WriteAttributeString("type", "modified residue");
                                writer.WriteAttributeString("description", modId);
                                writer.WriteStartElement("location");
                                writer.WriteStartElement("subposition");
                                writer.WriteAttributeString("subposition", hmm.Key.ToString(CultureInfo.InvariantCulture));
                                writer.WriteEndElement(); // subposition
                                writer.WriteEndElement(); // location
                                writer.WriteEndElement(); // subfeature
                            }
                        }
                        writer.WriteEndElement(); // location
                        writer.WriteEndElement(); // feature
                    }

                    foreach (var hm in protein.DisulfideBonds)
                    {
                        writer.WriteStartElement("feature");
                        writer.WriteAttributeString("type", "disulfide bond");
                        writer.WriteAttributeString("description", hm.Description);
                        writer.WriteStartElement("location");
                        WriteLocationBounds(writer, hm.OneBasedBeginPosition == hm.OneBasedEndPosition,
                                            hm.OneBasedBeginPosition.ToString(), hm.OneBasedEndPosition.ToString());
                        writer.WriteEndElement(); // location
                        writer.WriteEndElement(); // feature
                    }

                    foreach (var hm in protein.SpliceSites)
                    {
                        writer.WriteStartElement("feature");
                        writer.WriteAttributeString("type", "splice site");
                        writer.WriteAttributeString("description", hm.Description.ToString());
                        writer.WriteStartElement("location");
                        WriteLocationBounds(writer, hm.OneBasedBeginPosition == hm.OneBasedEndPosition,
                                            hm.OneBasedBeginPosition.ToString(), hm.OneBasedEndPosition.ToString());
                        writer.WriteEndElement(); // location
                        writer.WriteEndElement(); // feature
                    }

                    foreach (var hm in protein.SpliceVariants)
                    {
                        writer.WriteStartElement("feature");
                        writer.WriteAttributeString("type", "splice variant"); //TODO should id be stored or not written?
                        writer.WriteAttributeString("description", hm.Description);
                        if (!hm.VariantSequence.Equals(""))
                        {
                            writer.WriteStartElement("original");
                            writer.WriteString(hm.OriginalSequence);
                            writer.WriteEndElement(); // original
                            writer.WriteStartElement("variation");
                            writer.WriteString(hm.VariantSequence);
                            writer.WriteEndElement(); // variation
                        }
                        writer.WriteStartElement("location");
                        WriteLocationBounds(writer, hm.OneBasedBeginPosition == hm.OneBasedEndPosition,
                                            hm.OneBasedBeginPosition.ToString(), hm.OneBasedEndPosition.ToString());
                        writer.WriteEndElement(); // location
                        writer.WriteEndElement(); // feature
                    }

                    writer.WriteStartElement("sequence");
                    writer.WriteAttributeString("length", protein.Length.ToString(CultureInfo.InvariantCulture));
                    writer.WriteString(protein.BaseSequence);
                    writer.WriteEndElement(); // sequence
                    writer.WriteEndElement(); // entry
                }

                writer.WriteEndElement(); // mzLibProteinDb
                writer.WriteEndDocument();
            }
            return newModResEntries;

            // Writes an empty element of the form <elementName position="..."/>.
            void WritePositionElement(XmlWriter xml, string elementName, string position)
            {
                xml.WriteStartElement(elementName);
                xml.WriteAttributeString("position", position);
                xml.WriteEndElement();
            }

            // Writes the children of a <location> element: one <position> when the feature covers a single residue,
            // otherwise a <begin>/<end> pair. The caller opens and closes <location> itself.
            void WriteLocationBounds(XmlWriter xml, bool isSingleResidue, string begin, string end)
            {
                if (isSingleResidue)
                {
                    WritePositionElement(xml, "position", begin);
                }
                else
                {
                    WritePositionElement(xml, "begin", begin);
                    WritePositionElement(xml, "end", end);
                }
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Collects the modification IDs to write for a protein, or for one of its sequence variations, grouped by
        /// one-based residue index.
        /// </summary>
        /// <param name="protein">Protein being written.</param>
        /// <param name="seqvar">
        /// Sequence variation to collect for. When null, the protein's own <c>OneBasedPossibleLocalizedModifications</c>
        /// and <c>Accession</c> are used; otherwise the variation's <c>OneBasedModifications</c> and the accession
        /// returned by <c>VariantApplication.GetAccession</c> for that single variation.
        /// </param>
        /// <param name="additionalModsToAddToProteins">Extra (residue index, modification) pairs keyed by accession. Must not be null.</param>
        /// <param name="newModResEntries">
        /// Running tally, keyed by <c>IdWithMotif</c>, of additional modifications that were not already present at their
        /// residue. Updated in place.
        /// </param>
        /// <returns>Residue index mapped to the set of <c>IdWithMotif</c> strings to write at that residue.</returns>
        private static Dictionary<int, HashSet<string>> GetModsForThisProtein(Protein protein, SequenceVariation seqvar, Dictionary<string, HashSet<Tuple<int, Modification>>> additionalModsToAddToProteins, Dictionary<string, int> newModResEntries)
        {
            var modsToWriteForThisSpecificProtein = new Dictionary<int, HashSet<string>>();

            var primaryModDict = seqvar == null ? protein.OneBasedPossibleLocalizedModifications : seqvar.OneBasedModifications;

            // Seed the result with the modifications already annotated on the protein (or variation).
            foreach (var mods in primaryModDict)
            {
                foreach (var mod in mods.Value)
                {
                    if (!modsToWriteForThisSpecificProtein.TryGetValue(mods.Key, out HashSet<string> idsAtResidue))
                    {
                        idsAtResidue = new HashSet<string>();
                        modsToWriteForThisSpecificProtein.Add(mods.Key, idsAtResidue);
                    }
                    idsAtResidue.Add(mod.IdWithMotif);
                }
            }

            string accession = seqvar == null ? protein.Accession : VariantApplication.GetAccession(protein, new[] { seqvar });

            // Single lookup instead of ContainsKey followed by the indexer.
            if (!additionalModsToAddToProteins.TryGetValue(accession, out HashSet<Tuple<int, Modification>> additionalMods))
            {
                return modsToWriteForThisSpecificProtein;
            }

            foreach (var ye in additionalMods)
            {
                int    additionalModResidueIndex = ye.Item1;
                string additionalModId           = ye.Item2.IdWithMotif;

                // Fetch the ID set for this residue, creating it when nothing is written there yet.
                if (!modsToWriteForThisSpecificProtein.TryGetValue(additionalModResidueIndex, out HashSet<string> idsAtResidue))
                {
                    idsAtResidue = new HashSet<string>();
                    modsToWriteForThisSpecificProtein.Add(additionalModResidueIndex, idsAtResidue);
                }

                // HashSet.Add returns false when the ID is already present, so only genuinely new entries are tallied.
                if (idsAtResidue.Add(additionalModId))
                {
                    // TryGetValue leaves the count at 0 for an ID not seen before.
                    newModResEntries.TryGetValue(additionalModId, out int timesAdded);
                    newModResEntries[additionalModId] = timesAdded + 1;
                }
            }
            return modsToWriteForThisSpecificProtein;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Round-trips a protein with one sequence variation and two additional modifications (one on the protein, one on
        /// the variant) through the XML database writer and loader, checks the loaded target and decoy, then writes an
        /// mzML file for a variant peptide and runs a search task against it.
        /// </summary>
        public static void TestSearchPtmVariantDatabase()
        {
            //Create Search Task
            SearchTask task1 = new SearchTask
            {
                SearchParameters = new SearchParameters
                {
                    SearchTarget         = true,
                    MassDiffAcceptorType = MassDiffAcceptorType.Exact,
                },
                CommonParameters = new CommonParameters(digestionParams: new DigestionParams(minPeptideLength: 5))
            };

            //add task to task list
            var taskList = new List<(string, MetaMorpheusTask)> {
                ("task1", task1)
            };

            //create modification lists
            List<Modification> variableModifications = GlobalVariables.AllModsKnown.OfType<Modification>().Where
                                                           (b => task1.CommonParameters.ListOfModsVariable.Contains((b.ModificationType, b.IdWithMotif))).ToList();

            //protein creation: a single protein carrying a P->K sequence variation at position 3
            ModificationMotif.TryGetMotif("P", out ModificationMotif motifP);
            ModificationMotif.TryGetMotif("K", out ModificationMotif motifK);
            var     variant            = new SequenceVariation(3, "P", "K", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G|||||||||||||||||||\tGT:AD:DP\t1/1:30,30:30");
            Protein testProteinWithMod = new Protein("PEPTID", "accession1", sequenceVariations: new List<SequenceVariation> {
                variant
            });
            string variantAcc = VariantApplication.GetAccession(testProteinWithMod, new[] { variant });

            //name of the XML database written below
            string xmlName = "oblm.xml";

            //additional mods: one keyed by the protein accession (residue 1), one keyed by the variant accession (residue 3)
            var modList = new Dictionary<string, HashSet<Tuple<int, Modification>>>();
            var hash    = new HashSet<Tuple<int, Modification>>
            {
                new Tuple<int, Modification>(1, new Modification(_originalId: "acetyl on P", _modificationType: "type", _target: motifP, _monoisotopicMass: 42, _locationRestriction: "Anywhere.")),
            };
            var hashVar = new HashSet<Tuple<int, Modification>>
            {
                new Tuple<int, Modification>(3, new Modification(_originalId: "acetyl on K", _modificationType: "type", _target: motifK, _monoisotopicMass: 42, _locationRestriction: "Anywhere.")),
            };

            modList.Add(testProteinWithMod.Accession, hash);
            modList.Add(variantAcc, hashVar);
            ProteinDbWriter.WriteXmlDatabase(modList, new List<Protein> {
                testProteinWithMod
            }, xmlName);

            //read the database back, generating reversed decoys
            var variantProteins = ProteinDbLoader.LoadProteinXML(xmlName, true, DecoyType.Reverse, null, false, null, out var unknownModifications);

            Assert.AreEqual(0, unknownModifications.Count);

            //check the count before indexing so a wrong count is reported as an assertion failure, not an index exception
            Assert.AreEqual(2, variantProteins.Count); // target & decoy
            var variantProtein = variantProteins[0];
            var variantDecoy   = variantProteins[1];

            Assert.AreEqual(2, variantProtein.OneBasedPossibleLocalizedModifications.Count);
            List<int> foundResidueIndices    = variantProtein.OneBasedPossibleLocalizedModifications.Select(k => k.Key).ToList();
            List<int> expectedResidueIndices = new List<int>()
            {
                1, 3
            };

            Assert.That(foundResidueIndices, Is.EquivalentTo(expectedResidueIndices));
            Assert.AreEqual(2, variantDecoy.OneBasedPossibleLocalizedModifications.Count);
            foundResidueIndices    = variantDecoy.OneBasedPossibleLocalizedModifications.Select(k => k.Key).ToList();
            expectedResidueIndices = new List<int>()
            {
                4, 6
            };                                                 //originally modified residues are now at the end in the decoy
            Assert.That(foundResidueIndices, Is.EquivalentTo(expectedResidueIndices));

            var digestedList = variantProtein.GetVariantProteins()[0].Digest(task1.CommonParameters.DigestionParams, new List<Modification>(), variableModifications).ToList();

            Assert.AreEqual(4, digestedList.Count);

            //pick the digestion product expected to carry the single mod at position 3
            PeptideWithSetModifications pepWithSetMods1 = digestedList[1];

            Assert.AreEqual("PEK[type:acetyl on K]TID", pepWithSetMods1.FullSequence);

            //write an mzML file built from that peptide
            MsDataFile myMsDataFile = new TestDataFile(new List<PeptideWithSetModifications> {
                pepWithSetMods1
            });
            string mzmlName = @"hello.mzML";

            IO.MzML.MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(myMsDataFile, mzmlName, false);

            //run!
            var engine = new EverythingRunnerEngine(taskList, new List<string> {
                mzmlName
            },
                                                    new List<DbForTask> {
                new DbForTask(xmlName, false)
            }, Environment.CurrentDirectory);

            engine.Run();
        }