//TODO: Generate all the proteolytic products as distinct proteins during XML reading and delete the ProteolysisProducts code
/// <summary>
/// Load a protein XML database. Every XML element is handed to a <c>ProteinXmlEntry</c>; each protein it
/// returns at an end element (or empty element) becomes a target. Decoys are then generated from the targets,
/// and every selected protein is expanded through <c>GetVariantProteins</c>.
/// </summary>
/// <param name="proteinDbLocation">Path of the XML database. A path ending in "gz" is read through a <see cref="GZipStream"/>.</param>
/// <param name="generateTargets">When true the result is built from targets followed by decoys; when false from decoys only.</param>
/// <param name="decoyType">Forwarded to <c>DecoyProteinGenerator.GenerateDecoys</c>.</param>
/// <param name="allKnownModifications">Modifications combined with those listed in the file itself; null is treated as empty.</param>
/// <param name="isContaminant">Forwarded to <c>ParseEndElement</c> for every protein.</param>
/// <param name="modTypesToExclude">Forwarded to <c>ParseEndElement</c>; null is treated as empty.</param>
/// <param name="unknownModifications">Set to a new dictionary that is passed to <c>ParseEndElement</c> (presumably filled with modifications it could not resolve - confirm against ProteinXmlEntry).</param>
/// <param name="maxThreads">Forwarded to <c>DecoyProteinGenerator.GenerateDecoys</c>.</param>
/// <param name="maxHeterozygousVariants">Forwarded to <c>GetVariantProteins</c>.</param>
/// <param name="minAlleleDepth">Forwarded to <c>GetVariantProteins</c>.</param>
/// <returns>The variant-expanded list of the selected proteins.</returns>
public static List<Protein> LoadProteinXML(string proteinDbLocation, bool generateTargets, DecoyType decoyType, IEnumerable<Modification> allKnownModifications,
    bool isContaminant, IEnumerable<string> modTypesToExclude, out Dictionary<string, Modification> unknownModifications, int maxThreads = -1,
    int maxHeterozygousVariants = 4, int minAlleleDepth = 1)
{
    List<Modification> prespecified = GetPtmListFromProteinXml(proteinDbLocation);
    allKnownModifications = allKnownModifications ?? new List<Modification>();
    modTypesToExclude = modTypesToExclude ?? new List<string>();

    // Combine both modification sources exactly once: the caller's sequence may be lazy, so it must not be
    // enumerated repeatedly. The set is non-empty exactly when either source contains at least one entry.
    HashSet<Modification> combinedMods = new HashSet<Modification>(prespecified.Concat(allKnownModifications));
    if (combinedMods.Count > 0)
    {
        IdToPossibleMods = GetModificationDict(combinedMods);
        IdWithMotifToMod = GetModificationDictWithMotifs(combinedMods);
    }
    // NOTE(review): when no modifications are supplied, IdToPossibleMods / IdWithMotifToMod keep whatever an
    // earlier call assigned. Confirm whether they should be cleared in that case.

    List<Protein> targets = new List<Protein>();
    unknownModifications = new Dictionary<string, Modification>();

    using (FileStream stream = new FileStream(proteinDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read))
    // allow for .bgz and .tgz, which are (rarely) used
    using (Stream uniprotXmlFileStream = proteinDbLocation.EndsWith("gz", StringComparison.Ordinal)
        ? (Stream)new GZipStream(stream, CompressionMode.Decompress)
        : stream)
    using (XmlReader xml = XmlReader.Create(uniprotXmlFileStream))
    {
        ProteinXmlEntry block = new ProteinXmlEntry();
        while (xml.Read())
        {
            if (xml.NodeType == XmlNodeType.Element)
            {
                block.ParseElement(xml.Name, xml);
            }

            if (xml.NodeType == XmlNodeType.EndElement || xml.IsEmptyElement)
            {
                // ParseEndElement returns null unless this element completes a protein.
                Protein newProtein = block.ParseEndElement(xml, modTypesToExclude, unknownModifications, isContaminant, proteinDbLocation);
                if (newProtein != null)
                {
                    targets.Add(newProtein);
                }
            }
        }
    }

    List<Protein> decoys = DecoyProteinGenerator.GenerateDecoys(targets, decoyType, maxThreads);
    IEnumerable<Protein> proteinsToExpand = generateTargets ? targets.Concat(decoys) : decoys;
    return proteinsToExpand.SelectMany(p => p.GetVariantProteins(maxHeterozygousVariants, minAlleleDepth)).ToList();
}
/// <summary>
/// Load a protein fasta database, using regular expressions to get various aspects of the headers. The first regex capture group is used as each field.
/// </summary>
/// <param name="proteinDbLocation">Path of the fasta file. A path ending in "gz" is read through a <see cref="GZipStream"/>.</param>
/// <param name="generateTargets">When true the parsed proteins are included in the result.</param>
/// <param name="decoyType">Forwarded to <c>DecoyProteinGenerator.GenerateDecoys</c>; decoys are only returned when this is not <c>DecoyType.None</c>.</param>
/// <param name="isContaminant">Passed to every created <c>Protein</c>.</param>
/// <param name="accessionRegex">Applied to each header line for the accession; when it yields null or "", the header text after '>' (trailing whitespace trimmed) is used.</param>
/// <param name="fullNameRegex">Applied to each header line for the full name.</param>
/// <param name="nameRegex">Applied to each header line for the name.</param>
/// <param name="geneNameRegex">Applied to each header line; a non-null result is stored as the "primary" gene name.</param>
/// <param name="organismRegex">Applied to each header line for the organism.</param>
/// <param name="errors">Receives a message for every zero-length protein that was skipped, and one if no protein could be read at all.</param>
/// <param name="maxThreads">Forwarded to <c>DecoyProteinGenerator.GenerateDecoys</c>.</param>
/// <returns>Targets (if requested) followed by decoys (if a decoy type was requested).</returns>
public static List<Protein> LoadProteinFasta(string proteinDbLocation, bool generateTargets, DecoyType decoyType, bool isContaminant,
    FastaHeaderFieldRegex accessionRegex, FastaHeaderFieldRegex fullNameRegex, FastaHeaderFieldRegex nameRegex,
    FastaHeaderFieldRegex geneNameRegex, FastaHeaderFieldRegex organismRegex, out List<string> errors, int maxThreads = -1)
{
    HashSet<string> uniqueAccessions = new HashSet<string>();
    int uniqueIdentifier = 1;

    string accession = null;
    string name = null;
    string fullName = null;
    string organism = null;
    List<Tuple<string, string>> geneName = new List<Tuple<string, string>>();
    errors = new List<string>();
    Regex substituteWhitespace = new Regex(@"\s+");

    List<Protein> targets = new List<Protein>();

    using (FileStream stream = new FileStream(proteinDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read))
    // allow for .bgz and .tgz, which are (rarely) used
    using (Stream fastaFileStream = proteinDbLocation.EndsWith("gz", StringComparison.Ordinal)
        ? (Stream)new GZipStream(stream, CompressionMode.Decompress)
        : stream)
    using (StreamReader fasta = new StreamReader(fastaFileStream))
    {
        StringBuilder sb = null;

        // Look one line ahead with ReadLine instead of StreamReader.Peek: Peek is documented to be able to
        // return -1 on streams that do not support seeking (such as GZipStream) even though data remains,
        // which would end the read early.
        string line = fasta.ReadLine();
        while (line != null)
        {
            string nextLine = fasta.ReadLine();

            if (line.StartsWith(">", StringComparison.Ordinal))
            {
                accession = ApplyRegex(accessionRegex, line);
                fullName = ApplyRegex(fullNameRegex, line);
                name = ApplyRegex(nameRegex, line);
                organism = ApplyRegex(organismRegex, line);
                string geneNameString = ApplyRegex(geneNameRegex, line);
                if (geneNameString != null)
                {
                    geneName.Add(new Tuple<string, string>("primary", geneNameString));
                }

                if (string.IsNullOrEmpty(accession))
                {
                    accession = line.Substring(1).TrimEnd();
                }

                sb = new StringBuilder();
            }
            else if (sb != null)
            {
                // Sequence lines appearing before the first header are ignored (sb is still null then).
                sb.Append(line.Trim());
            }

            // The current record is complete when the next line is a header or the input is exhausted.
            bool recordComplete = nextLine == null || nextLine.StartsWith(">", StringComparison.Ordinal);
            if (recordComplete && accession != null && sb != null)
            {
                string sequence = substituteWhitespace.Replace(sb.ToString(), "");

                // Make the accession unique by appending "_<counter>" until it is unused; the counter is
                // shared by all proteins of this file and is never reset.
                while (uniqueAccessions.Contains(accession))
                {
                    accession += "_" + uniqueIdentifier.ToString();
                    uniqueIdentifier++;
                }
                uniqueAccessions.Add(accession);

                Protein protein = new Protein(sequence, accession, organism, geneName, name: name, fullName: fullName, isContaminant: isContaminant, databaseFilePath: proteinDbLocation);
                if (protein.Length == 0)
                {
                    errors.Add("Line" + line + ", Protein Length of 0: " + protein.Name + " was skipped from database: " + proteinDbLocation);
                }
                else
                {
                    targets.Add(protein);
                }

                // Reset the per-record header fields; the gene-name list now belongs to the protein just created.
                accession = null;
                name = null;
                fullName = null;
                organism = null;
                geneName = new List<Tuple<string, string>>();
            }

            line = nextLine;
        }
    }

    if (!targets.Any())
    {
        errors.Add("Error: No proteins could be read from the database: " + proteinDbLocation);
    }

    List<Protein> decoys = DecoyProteinGenerator.GenerateDecoys(targets, decoyType, maxThreads);
    IEnumerable<Protein> selectedTargets = generateTargets ? targets : new List<Protein>();
    IEnumerable<Protein> selectedDecoys = decoyType != DecoyType.None ? decoys : new List<Protein>();
    return selectedTargets.Concat(selectedDecoys).ToList();
}
/// <summary>
/// Load a protein fasta database, using regular expressions to get various aspects of the headers. The first regex capture group is used as each field.
/// </summary>
/// <param name="proteinDbLocation">Path of the fasta file. A path ending in "gz" is read through a <see cref="GZipStream"/>.</param>
/// <param name="generateTargets">When true the result is targets followed by decoys; when false only the decoys are returned.</param>
/// <param name="decoyType">Forwarded to <c>DecoyProteinGenerator.GenerateDecoys</c>.</param>
/// <param name="isContaminant">Passed to every created <c>Protein</c>.</param>
/// <param name="accessionRegex">Applied to each header line for the accession; when it yields null or "", the header text after '>' (trailing whitespace trimmed) is used.</param>
/// <param name="fullNameRegex">Applied to each header line for the full name.</param>
/// <param name="nameRegex">Applied to each header line for the name.</param>
/// <param name="geneNameRegex">Applied to each header line; a non-null result is stored as the "primary" gene name.</param>
/// <param name="organismRegex">Applied to each header line for the organism.</param>
/// <param name="errors">Receives a message for every zero-length protein that was skipped, and one if no protein could be read at all.</param>
/// <param name="maxThreads">Forwarded to <c>DecoyProteinGenerator.GenerateDecoys</c>.</param>
/// <returns>Targets followed by decoys, or only the decoys when <paramref name="generateTargets"/> is false.</returns>
public static List<Protein> LoadProteinFasta(string proteinDbLocation, bool generateTargets, DecoyType decoyType, bool isContaminant,
    FastaHeaderFieldRegex accessionRegex, FastaHeaderFieldRegex fullNameRegex, FastaHeaderFieldRegex nameRegex,
    FastaHeaderFieldRegex geneNameRegex, FastaHeaderFieldRegex organismRegex, out List<string> errors, int maxThreads = -1)
{
    HashSet<string> uniqueAccessions = new HashSet<string>();

    string accession = null;
    string name = null;
    string fullName = null;
    string organism = null;
    List<Tuple<string, string>> geneName = new List<Tuple<string, string>>();
    errors = new List<string>();
    Regex substituteWhitespace = new Regex(@"\s+");

    List<Protein> targets = new List<Protein>();

    using (FileStream stream = new FileStream(proteinDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read))
    // allow for .bgz and .tgz, which are (rarely) used
    using (Stream fastaFileStream = proteinDbLocation.EndsWith("gz", StringComparison.Ordinal)
        ? (Stream)new GZipStream(stream, CompressionMode.Decompress)
        : stream)
    using (StreamReader fasta = new StreamReader(fastaFileStream))
    {
        StringBuilder sb = null;

        // Look one line ahead with ReadLine instead of StreamReader.Peek: Peek is documented to be able to
        // return -1 on streams that do not support seeking (such as GZipStream) even though data remains,
        // which would end the read early.
        string line = fasta.ReadLine();
        while (line != null)
        {
            string nextLine = fasta.ReadLine();

            if (line.StartsWith(">", StringComparison.Ordinal))
            {
                accession = ApplyRegex(accessionRegex, line);
                fullName = ApplyRegex(fullNameRegex, line);
                name = ApplyRegex(nameRegex, line);
                organism = ApplyRegex(organismRegex, line);
                string geneNameString = ApplyRegex(geneNameRegex, line);
                if (geneNameString != null)
                {
                    geneName.Add(new Tuple<string, string>("primary", geneNameString));
                }

                if (string.IsNullOrEmpty(accession))
                {
                    accession = line.Substring(1).TrimEnd();
                }

                sb = new StringBuilder();
            }
            else if (sb != null)
            {
                // Sequence lines appearing before the first header are ignored (sb is still null then).
                sb.Append(line.Trim());
            }

            // The current record is complete when the next line is a header or the input is exhausted.
            bool recordComplete = nextLine == null || nextLine.StartsWith(">", StringComparison.Ordinal);
            if (recordComplete && accession != null && sb != null)
            {
                string sequence = substituteWhitespace.Replace(sb.ToString(), "");

                // sanitize the sequence to replace unexpected characters with X (unknown amino acid)
                // sometimes strange characters get added by RNA sequencing software, etc.
                sequence = SanitizeAminoAcidSequence(sequence, 'X');

                if (uniqueAccessions.Contains(accession)) //this will happen for isoforms
                {
                    // for isoforms. the first will be "accession", the next will be "accession_2"
                    string originalAccession = accession;
                    int isoformNumber = 2;
                    accession = originalAccession + "_" + isoformNumber.ToString();
                    while (uniqueAccessions.Contains(accession)) //if that number was already added
                    {
                        isoformNumber++; //keep increasing it
                        accession = originalAccession + "_" + isoformNumber.ToString(); //try the new number
                    }
                }
                uniqueAccessions.Add(accession);

                Protein protein = new Protein(sequence, accession, organism, geneName, name: name, fullName: fullName, isContaminant: isContaminant, databaseFilePath: proteinDbLocation);
                if (protein.Length == 0)
                {
                    errors.Add("Line" + line + ", Protein Length of 0: " + protein.Name + " was skipped from database: " + proteinDbLocation);
                }
                else
                {
                    targets.Add(protein);
                }

                // Reset the per-record header fields; the gene-name list now belongs to the protein just created.
                accession = null;
                name = null;
                fullName = null;
                organism = null;
                geneName = new List<Tuple<string, string>>();
            }

            line = nextLine;
        }
    }

    if (!targets.Any())
    {
        errors.Add("Error: No proteins could be read from the database: " + proteinDbLocation);
    }

    List<Protein> decoys = DecoyProteinGenerator.GenerateDecoys(targets, decoyType, maxThreads);
    return generateTargets ? targets.Concat(decoys).ToList() : decoys;
}