Esempio n. 1
0
        //TODO: Generate all the proteolytic products as distinct proteins during XML reading and delete the ProteolysisProducts code
        /// <summary>
        /// Reads proteins from a protein XML database, generates decoys from them, and returns the result of
        /// calling <c>GetVariantProteins</c> on every selected protein.
        /// </summary>
        /// <param name="proteinDbLocation">Path of the database file. A path ending in "gz" is read through a <see cref="GZipStream"/>.</param>
        /// <param name="generateTargets">When true, both targets and decoys are expanded and returned; when false, only decoys.</param>
        /// <param name="decoyType">Forwarded to <c>DecoyProteinGenerator.GenerateDecoys</c>.</param>
        /// <param name="allKnownModifications">Modifications combined with those returned by <c>GetPtmListFromProteinXml</c>; null is treated as empty.</param>
        /// <param name="isContaminant">Forwarded to <c>ProteinXmlEntry.ParseEndElement</c>.</param>
        /// <param name="modTypesToExclude">Forwarded to <c>ProteinXmlEntry.ParseEndElement</c>; null is treated as empty.</param>
        /// <param name="unknownModifications">A new dictionary that is handed to <c>ProteinXmlEntry.ParseEndElement</c> for every closed element.</param>
        /// <param name="maxThreads">Forwarded to <c>DecoyProteinGenerator.GenerateDecoys</c>.</param>
        /// <param name="maxHeterozygousVariants">Forwarded to <c>Protein.GetVariantProteins</c>.</param>
        /// <param name="minAlleleDepth">Forwarded to <c>Protein.GetVariantProteins</c>.</param>
        /// <returns>The variant proteins of the targets followed by those of the decoys, or of the decoys only.</returns>
        /// <remarks>
        /// <c>IdToPossibleMods</c> and <c>IdWithMotifToMod</c> are reassigned only when at least one modification is available;
        /// otherwise they keep whatever value they had before the call.
        /// </remarks>
        public static List <Protein> LoadProteinXML(string proteinDbLocation, bool generateTargets, DecoyType decoyType, IEnumerable <Modification> allKnownModifications,
                                                    bool isContaminant, IEnumerable <string> modTypesToExclude, out Dictionary <string, Modification> unknownModifications, int maxThreads = -1,
                                                    int maxHeterozygousVariants = 4, int minAlleleDepth = 1)
        {
            List <Modification> prespecified = GetPtmListFromProteinXml(proteinDbLocation);

            modTypesToExclude = modTypesToExclude ?? new List <string>();

            // Materialize once: the caller-supplied sequence may be lazy, and it was previously enumerated three times.
            List <Modification> combinedModifications = prespecified
                                                        .Concat(allKnownModifications ?? Enumerable.Empty <Modification>())
                                                        .ToList();

            if (combinedModifications.Count > 0)
            {
                IdToPossibleMods = GetModificationDict(new HashSet <Modification>(combinedModifications));
                IdWithMotifToMod = GetModificationDictWithMotifs(new HashSet <Modification>(combinedModifications));
            }

            List <Protein> targets = new List <Protein>();

            unknownModifications = new Dictionary <string, Modification>();

            using (var stream = new FileStream(proteinDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read))
            // allow for .bgz and .tgz, which are (rarely) used
            using (Stream uniprotXmlFileStream = proteinDbLocation.EndsWith("gz", System.StringComparison.Ordinal) ?
                                                 (Stream)(new GZipStream(stream, CompressionMode.Decompress)) :
                                                 stream)
            using (XmlReader xml = XmlReader.Create(uniprotXmlFileStream))
            {
                ProteinXmlEntry block = new ProteinXmlEntry();

                while (xml.Read())
                {
                    if (xml.NodeType == XmlNodeType.Element)
                    {
                        block.ParseElement(xml.Name, xml);
                    }

                    // An empty element (<x/>) has no separate end node, so it is closed right after it was opened.
                    if (xml.NodeType == XmlNodeType.EndElement || xml.IsEmptyElement)
                    {
                        Protein newProtein = block.ParseEndElement(xml, modTypesToExclude, unknownModifications, isContaminant, proteinDbLocation);
                        if (newProtein != null)
                        {
                            targets.Add(newProtein);
                        }
                    }
                }
            }

            List <Protein>        decoys           = DecoyProteinGenerator.GenerateDecoys(targets, decoyType, maxThreads);
            IEnumerable <Protein> proteinsToExpand = generateTargets ? targets.Concat(decoys) : decoys;

            return(proteinsToExpand.SelectMany(p => p.GetVariantProteins(maxHeterozygousVariants, minAlleleDepth)).ToList());
        }
Esempio n. 2
0
        /// <summary>
        /// Load a protein fasta database, using regular expressions to get various aspects of the headers. The first regex capture group is used as each field.
        /// </summary>
        /// <param name="proteinDbLocation">Path of the fasta file. A path ending in "gz" is read through a <see cref="GZipStream"/>.</param>
        /// <param name="generateTargets">When true, the target proteins are included in the returned list.</param>
        /// <param name="decoyType">Forwarded to <c>DecoyProteinGenerator.GenerateDecoys</c>; decoys are returned unless it is <c>DecoyType.None</c>.</param>
        /// <param name="isContaminant">Forwarded to the <c>Protein</c> constructor.</param>
        /// <param name="accessionRegex">Applied to each header line; when it yields null or "", the header text after '>' (trailing whitespace removed) is used as accession.</param>
        /// <param name="fullNameRegex">Applied to each header line to obtain the full name.</param>
        /// <param name="nameRegex">Applied to each header line to obtain the name.</param>
        /// <param name="geneNameRegex">Applied to each header line; a non-null result is stored as the "primary" gene name.</param>
        /// <param name="organismRegex">Applied to each header line to obtain the organism.</param>
        /// <param name="errors">Receives one message per zero-length protein that was skipped, and one message when no protein was read at all.</param>
        /// <param name="maxThreads">Forwarded to <c>DecoyProteinGenerator.GenerateDecoys</c>.</param>
        /// <returns>The targets (if requested) followed by the decoys (if a decoy type was requested).</returns>
        public static List <Protein> LoadProteinFasta(string proteinDbLocation, bool generateTargets, DecoyType decoyType, bool isContaminant,
                                                      FastaHeaderFieldRegex accessionRegex, FastaHeaderFieldRegex fullNameRegex, FastaHeaderFieldRegex nameRegex,
                                                      FastaHeaderFieldRegex geneNameRegex, FastaHeaderFieldRegex organismRegex, out List <string> errors, int maxThreads = -1)
        {
            HashSet <string> unique_accessions = new HashSet <string>();
            int    unique_identifier           = 1;
            string accession = null;
            string name      = null;
            string fullName  = null;
            string organism  = null;
            List <Tuple <string, string> > geneName = new List <Tuple <string, string> >();

            errors = new List <string>();
            Regex substituteWhitespace = new Regex(@"\s+");

            List <Protein> targets = new List <Protein>();

            using (var stream = new FileStream(proteinDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read))
            // allow for .bgz and .tgz, which are (rarely) used
            using (Stream fastaFileStream = proteinDbLocation.EndsWith("gz", StringComparison.Ordinal) ?
                                            (Stream)(new GZipStream(stream, CompressionMode.Decompress)) :
                                            stream)
            using (StreamReader fasta = new StreamReader(fastaFileStream))
            {
                StringBuilder sb           = null;
                string        previousLine = null;

                // StreamReader.Peek() is documented to return -1 not only at the end of the input but also when the
                // stream does not support seeking (as with GZipStream). Record boundaries are therefore detected from
                // the lines themselves: a record ends when the next header line or the end of the input is read.
                while (true)
                {
                    string line       = fasta.ReadLine();
                    bool   endOfInput = line == null;
                    bool   isHeader   = !endOfInput && line.StartsWith(">", StringComparison.Ordinal);

                    if ((endOfInput || isHeader) && accession != null && sb != null)
                    {
                        string sequence = substituteWhitespace.Replace(sb.ToString(), "");

                        // Note: the suffix is appended to the already-suffixed accession, so repeated collisions
                        // produce chained suffixes (e.g. "A_1_2"); kept as-is so emitted accessions do not change.
                        while (unique_accessions.Contains(accession))
                        {
                            accession += "_" + unique_identifier.ToString();
                            unique_identifier++;
                        }
                        unique_accessions.Add(accession);
                        Protein protein = new Protein(sequence, accession, organism, geneName, name: name, fullName: fullName,
                                                      isContaminant: isContaminant, databaseFilePath: proteinDbLocation);
                        if (protein.Length == 0)
                        {
                            // previousLine is the last line that belonged to this record
                            errors.Add("Line" + previousLine + ", Protein Length of 0: " + protein.Name + " was skipped from database: " + proteinDbLocation);
                        }
                        else
                        {
                            targets.Add(protein);
                        }

                        accession = null;
                        name      = null;
                        fullName  = null;
                        organism  = null;
                        geneName  = new List <Tuple <string, string> >();
                    }

                    // no input left
                    if (endOfInput)
                    {
                        break;
                    }

                    if (isHeader)
                    {
                        accession = ApplyRegex(accessionRegex, line);
                        fullName  = ApplyRegex(fullNameRegex, line);
                        name      = ApplyRegex(nameRegex, line);
                        organism  = ApplyRegex(organismRegex, line);
                        string geneNameString = ApplyRegex(geneNameRegex, line);
                        if (geneNameString != null)
                        {
                            geneName.Add(new Tuple <string, string>("primary", geneNameString));
                        }

                        if (string.IsNullOrEmpty(accession))
                        {
                            accession = line.Substring(1).TrimEnd();
                        }

                        sb = new StringBuilder();
                    }
                    else if (sb != null)
                    {
                        // lines before the first header are ignored because sb is still null
                        sb.Append(line.Trim());
                    }

                    previousLine = line;
                }
            }
            if (targets.Count == 0)
            {
                errors.Add("Error: No proteins could be read from the database: " + proteinDbLocation);
            }
            List <Protein> decoys = DecoyProteinGenerator.GenerateDecoys(targets, decoyType, maxThreads);

            return((generateTargets ? targets : new List <Protein>()).Concat(decoyType != DecoyType.None ? decoys : new List <Protein>()).ToList());
        }
Esempio n. 3
0
        /// <summary>
        /// Load a protein fasta database, using regular expressions to get various aspects of the headers. The first regex capture group is used as each field.
        /// </summary>
        /// <param name="proteinDbLocation">Path of the fasta file. A path ending in "gz" is read through a <see cref="GZipStream"/>.</param>
        /// <param name="generateTargets">When true, targets followed by decoys are returned; when false, only the decoys.</param>
        /// <param name="decoyType">Forwarded to <c>DecoyProteinGenerator.GenerateDecoys</c>.</param>
        /// <param name="isContaminant">Forwarded to the <c>Protein</c> constructor.</param>
        /// <param name="accessionRegex">Applied to each header line; when it yields null or "", the header text after '>' (trailing whitespace removed) is used as accession.</param>
        /// <param name="fullNameRegex">Applied to each header line to obtain the full name.</param>
        /// <param name="nameRegex">Applied to each header line to obtain the name.</param>
        /// <param name="geneNameRegex">Applied to each header line; a non-null result is stored as the "primary" gene name.</param>
        /// <param name="organismRegex">Applied to each header line to obtain the organism.</param>
        /// <param name="errors">Receives one message per zero-length protein that was skipped, and one message when no protein was read at all.</param>
        /// <param name="maxThreads">Forwarded to <c>DecoyProteinGenerator.GenerateDecoys</c>.</param>
        /// <returns>A new list of targets followed by decoys, or the decoy list itself when targets are not requested.</returns>
        public static List <Protein> LoadProteinFasta(string proteinDbLocation, bool generateTargets, DecoyType decoyType, bool isContaminant,
                                                      FastaHeaderFieldRegex accessionRegex, FastaHeaderFieldRegex fullNameRegex, FastaHeaderFieldRegex nameRegex,
                                                      FastaHeaderFieldRegex geneNameRegex, FastaHeaderFieldRegex organismRegex, out List <string> errors, int maxThreads = -1)
        {
            HashSet <string> unique_accessions = new HashSet <string>();
            string accession = null;
            string name      = null;
            string fullName  = null;
            string organism  = null;
            List <Tuple <string, string> > geneName = new List <Tuple <string, string> >();

            errors = new List <string>();
            Regex substituteWhitespace = new Regex(@"\s+");

            List <Protein> targets = new List <Protein>();

            using (var stream = new FileStream(proteinDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read))
            // allow for .bgz and .tgz, which are (rarely) used
            using (Stream fastaFileStream = proteinDbLocation.EndsWith("gz", StringComparison.Ordinal) ?
                                            (Stream)(new GZipStream(stream, CompressionMode.Decompress)) :
                                            stream)
            using (StreamReader fasta = new StreamReader(fastaFileStream))
            {
                StringBuilder sb           = null;
                string        previousLine = null;

                // StreamReader.Peek() is documented to return -1 not only at the end of the input but also when the
                // stream does not support seeking (as with GZipStream). Record boundaries are therefore detected from
                // the lines themselves: a record ends when the next header line or the end of the input is read.
                while (true)
                {
                    string line       = fasta.ReadLine();
                    bool   endOfInput = line == null;
                    bool   isHeader   = !endOfInput && line.StartsWith(">", StringComparison.Ordinal);

                    if ((endOfInput || isHeader) && accession != null && sb != null)
                    {
                        string sequence = substituteWhitespace.Replace(sb.ToString(), "");

                        // sanitize the sequence to replace unexpected characters with X (unknown amino acid)
                        // sometimes strange characters get added by RNA sequencing software, etc.
                        sequence = SanitizeAminoAcidSequence(sequence, 'X');

                        if (unique_accessions.Contains(accession))      //this will happen for isoforms
                        {
                            //the first will be "accession", the next will be "accession_2", then "accession_3", ...
                            string originalAccession = accession;
                            int    unique_identifier = 2;
                            accession = originalAccession + "_" + unique_identifier.ToString();
                            while (unique_accessions.Contains(accession))   //if that number was already added
                            {
                                unique_identifier++;                        //keep increasing it
                                accession = originalAccession + "_" + unique_identifier.ToString();
                            }
                        }
                        unique_accessions.Add(accession);
                        Protein protein = new Protein(sequence, accession, organism, geneName, name: name, fullName: fullName,
                                                      isContaminant: isContaminant, databaseFilePath: proteinDbLocation);
                        if (protein.Length == 0)
                        {
                            // previousLine is the last line that belonged to this record
                            errors.Add("Line" + previousLine + ", Protein Length of 0: " + protein.Name + " was skipped from database: " + proteinDbLocation);
                        }
                        else
                        {
                            targets.Add(protein);
                        }

                        accession = null;
                        name      = null;
                        fullName  = null;
                        organism  = null;
                        geneName  = new List <Tuple <string, string> >();
                    }

                    // no input left
                    if (endOfInput)
                    {
                        break;
                    }

                    if (isHeader)
                    {
                        accession = ApplyRegex(accessionRegex, line);
                        fullName  = ApplyRegex(fullNameRegex, line);
                        name      = ApplyRegex(nameRegex, line);
                        organism  = ApplyRegex(organismRegex, line);
                        string geneNameString = ApplyRegex(geneNameRegex, line);
                        if (geneNameString != null)
                        {
                            geneName.Add(new Tuple <string, string>("primary", geneNameString));
                        }

                        if (string.IsNullOrEmpty(accession))
                        {
                            accession = line.Substring(1).TrimEnd();
                        }

                        sb = new StringBuilder();
                    }
                    else if (sb != null)
                    {
                        // lines before the first header are ignored because sb is still null
                        sb.Append(line.Trim());
                    }

                    previousLine = line;
                }
            }
            if (targets.Count == 0)
            {
                errors.Add("Error: No proteins could be read from the database: " + proteinDbLocation);
            }
            List <Protein> decoys = DecoyProteinGenerator.GenerateDecoys(targets, decoyType, maxThreads);

            return(generateTargets ? targets.Concat(decoys).ToList() : decoys);
        }