/// <summary>
/// Extracts one header field from a FASTA header line.
/// </summary>
/// <param name="regex">Field extractor supplying the pattern, the index of the match to use, and the index of the group within that match; may be null.</param>
/// <param name="line">The header line to search.</param>
/// <returns>
/// The value of the selected group of the selected match, or null when <paramref name="regex"/> is null
/// or the line does not produce enough matches/groups to reach the requested indices.
/// </returns>
private static string ApplyRegex(FastaHeaderFieldRegex regex, string line)
{
    // No extractor configured for this field.
    if (regex == null)
    {
        return null;
    }

    var allMatches = regex.Regex.Matches(line);

    // The requested match does not exist on this line.
    if (allMatches.Count <= regex.Match)
    {
        return null;
    }

    var groups = allMatches[regex.Match].Groups;
    return groups.Count > regex.Group
        ? groups[regex.Group].Value
        : null;
}
/// <summary>
/// Load a protein fasta database (plain or gzip-compressed), using regular expressions to get various
/// aspects of the headers. For each field, the match index and group index configured on the
/// corresponding <see cref="FastaHeaderFieldRegex"/> select the value that is used.
/// </summary>
/// <param name="proteinDbLocation">Path of the fasta file. A path ending in "gz" is decompressed with <see cref="GZipStream"/>.</param>
/// <param name="generateTargets">When true, the proteins read from the file are part of the returned list.</param>
/// <param name="decoyType">Forwarded to <c>DecoyProteinGenerator.GenerateDecoys</c>; generated decoys are returned unless this is <c>DecoyType.None</c>.</param>
/// <param name="isContaminant">Forwarded to every created <see cref="Protein"/>.</param>
/// <param name="accessionRegex">Extracts the accession. If it yields null or an empty string, the header text after '&gt;' (trailing whitespace removed) is used instead.</param>
/// <param name="fullNameRegex">Extracts the full name; may be null.</param>
/// <param name="nameRegex">Extracts the name; may be null.</param>
/// <param name="geneNameRegex">Extracts the gene name, stored as a ("primary", value) tuple; may be null.</param>
/// <param name="organismRegex">Extracts the organism; may be null.</param>
/// <param name="errors">Receives one message per zero-length protein that was skipped, and one message if no protein could be read at all.</param>
/// <param name="maxThreads">Forwarded to <c>DecoyProteinGenerator.GenerateDecoys</c>.</param>
/// <returns>The target proteins (if <paramref name="generateTargets"/> is true) followed by the decoys (if <paramref name="decoyType"/> is not <c>DecoyType.None</c>).</returns>
public static List<Protein> LoadProteinFasta(string proteinDbLocation, bool generateTargets, DecoyType decoyType, bool isContaminant,
    FastaHeaderFieldRegex accessionRegex, FastaHeaderFieldRegex fullNameRegex, FastaHeaderFieldRegex nameRegex,
    FastaHeaderFieldRegex geneNameRegex, FastaHeaderFieldRegex organismRegex, out List<string> errors, int maxThreads = -1)
{
    HashSet<string> unique_accessions = new HashSet<string>();
    int unique_identifier = 1;
    string accession = null;
    string name = null;
    string fullName = null;
    string organism = null;
    List<Tuple<string, string>> geneName = new List<Tuple<string, string>>();
    errors = new List<string>();
    Regex substituteWhitespace = new Regex(@"\s+");
    List<Protein> targets = new List<Protein>();

    using (var stream = new FileStream(proteinDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        // allow for .bgz and .tgz, which are (rarely) used
        Stream fastaFileStream = proteinDbLocation.EndsWith("gz", StringComparison.Ordinal)
            ? (Stream)new GZipStream(stream, CompressionMode.Decompress)
            : stream;

        // Disposing the reader also disposes the (possibly gzip) stream it wraps.
        using (StreamReader fasta = new StreamReader(fastaFileStream))
        {
            StringBuilder sb = null;
            string lastLineOfEntry = null; // most recent line consumed; reported in the zero-length error message

            while (true)
            {
                // End of input is detected by ReadLine() returning null rather than by Peek():
                // Peek() is documented to return -1 in situations other than a true end of stream,
                // which could cut a compressed database short.
                string line = fasta.ReadLine();
                bool endOfFile = line == null;
                bool isHeader = !endOfFile && line.StartsWith(">", StringComparison.Ordinal);

                // The next header (or the end of the file) closes the entry collected so far.
                if ((endOfFile || isHeader) && accession != null && sb != null)
                {
                    string sequence = substituteWhitespace.Replace(sb.ToString(), "");

                    // Make duplicate accessions unique by appending a running number.
                    while (unique_accessions.Contains(accession))
                    {
                        accession += "_" + unique_identifier.ToString();
                        unique_identifier++;
                    }
                    unique_accessions.Add(accession);

                    Protein protein = new Protein(sequence, accession, organism, geneName, name: name, fullName: fullName,
                        isContaminant: isContaminant, databaseFilePath: proteinDbLocation);
                    if (protein.Length == 0)
                    {
                        errors.Add("Line" + lastLineOfEntry + ", Protein Length of 0: " + protein.Name + " was skipped from database: " + proteinDbLocation);
                    }
                    else
                    {
                        targets.Add(protein);
                    }

                    accession = null;
                    name = null;
                    fullName = null;
                    organism = null;
                    geneName = new List<Tuple<string, string>>();
                }

                // no input left
                if (endOfFile)
                {
                    break;
                }

                if (isHeader)
                {
                    accession = ApplyRegex(accessionRegex, line);
                    fullName = ApplyRegex(fullNameRegex, line);
                    name = ApplyRegex(nameRegex, line);
                    organism = ApplyRegex(organismRegex, line);
                    string geneNameString = ApplyRegex(geneNameRegex, line);
                    if (geneNameString != null)
                    {
                        geneName.Add(new Tuple<string, string>("primary", geneNameString));
                    }

                    // Fall back to the whole header text when no accession could be extracted.
                    if (accession == null || accession == "")
                    {
                        accession = line.Substring(1).TrimEnd();
                    }
                    sb = new StringBuilder();
                }
                else if (sb != null)
                {
                    // Sequence line; lines that precede the first header are ignored (sb is still null).
                    sb.Append(line.Trim());
                }

                lastLineOfEntry = line;
            }
        }
    }

    if (!targets.Any())
    {
        errors.Add("Error: No proteins could be read from the database: " + proteinDbLocation);
    }

    List<Protein> decoys = DecoyProteinGenerator.GenerateDecoys(targets, decoyType, maxThreads);
    return (generateTargets ? targets : new List<Protein>())
        .Concat(decoyType != DecoyType.None ? decoys : new List<Protein>())
        .ToList();
}
/// <summary>
/// Load a protein fasta database (plain or gzip-compressed), using regular expressions to get various
/// aspects of the headers. For each field, the match index and group index configured on the
/// corresponding <see cref="FastaHeaderFieldRegex"/> select the value that is used.
/// </summary>
/// <param name="proteinDbLocation">Path of the fasta file. A path ending in "gz" is decompressed with <see cref="GZipStream"/>.</param>
/// <param name="generateTargets">When true, the proteins read from the file precede the decoys in the returned list; when false only the decoys are returned.</param>
/// <param name="decoyType">Forwarded to <c>DecoyProteinGenerator.GenerateDecoys</c>.</param>
/// <param name="isContaminant">Forwarded to every created <see cref="Protein"/>.</param>
/// <param name="accessionRegex">Extracts the accession. If it yields null or an empty string, the header text after '&gt;' (trailing whitespace removed) is used instead.</param>
/// <param name="fullNameRegex">Extracts the full name; may be null.</param>
/// <param name="nameRegex">Extracts the name; may be null.</param>
/// <param name="geneNameRegex">Extracts the gene name, stored as a ("primary", value) tuple; may be null.</param>
/// <param name="organismRegex">Extracts the organism; may be null.</param>
/// <param name="errors">Receives one message per zero-length protein that was skipped, and one message if no protein could be read at all.</param>
/// <param name="maxThreads">Forwarded to <c>DecoyProteinGenerator.GenerateDecoys</c>.</param>
/// <returns>The targets followed by the generated decoys when <paramref name="generateTargets"/> is true; otherwise the generated decoys only.</returns>
public static List<Protein> LoadProteinFasta(string proteinDbLocation, bool generateTargets, DecoyType decoyType, bool isContaminant,
    FastaHeaderFieldRegex accessionRegex, FastaHeaderFieldRegex fullNameRegex, FastaHeaderFieldRegex nameRegex,
    FastaHeaderFieldRegex geneNameRegex, FastaHeaderFieldRegex organismRegex, out List<string> errors, int maxThreads = -1)
{
    HashSet<string> unique_accessions = new HashSet<string>();
    string accession = null;
    string name = null;
    string fullName = null;
    string organism = null;
    List<Tuple<string, string>> geneName = new List<Tuple<string, string>>();
    errors = new List<string>();
    Regex substituteWhitespace = new Regex(@"\s+");
    List<Protein> targets = new List<Protein>();

    using (var stream = new FileStream(proteinDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        // allow for .bgz and .tgz, which are (rarely) used
        Stream fastaFileStream = proteinDbLocation.EndsWith("gz", StringComparison.Ordinal)
            ? (Stream)new GZipStream(stream, CompressionMode.Decompress)
            : stream;

        // Disposing the reader also disposes the (possibly gzip) stream it wraps.
        using (StreamReader fasta = new StreamReader(fastaFileStream))
        {
            StringBuilder sb = null;
            string lastLineOfEntry = null; // most recent line consumed; reported in the zero-length error message

            while (true)
            {
                // End of input is detected by ReadLine() returning null rather than by Peek():
                // Peek() is documented to return -1 in situations other than a true end of stream,
                // which could cut a compressed database short.
                string line = fasta.ReadLine();
                bool endOfFile = line == null;
                bool isHeader = !endOfFile && line.StartsWith(">", StringComparison.Ordinal);

                // The next header (or the end of the file) closes the entry collected so far.
                if ((endOfFile || isHeader) && accession != null && sb != null)
                {
                    string sequence = substituteWhitespace.Replace(sb.ToString(), "");

                    // sanitize the sequence to replace unexpected characters with X (unknown amino acid)
                    // sometimes strange characters get added by RNA sequencing software, etc.
                    sequence = SanitizeAminoAcidSequence(sequence, 'X');

                    if (unique_accessions.Contains(accession)) // this will happen for isoforms
                    {
                        // The first occurrence keeps "accession"; later ones become "accession_2",
                        // "accession_3", ... using the lowest number that is still unused.
                        string originalAccession = accession;
                        int unique_identifier = 2;
                        accession = originalAccession + "_" + unique_identifier.ToString();
                        while (unique_accessions.Contains(accession))
                        {
                            unique_identifier++;
                            accession = originalAccession + "_" + unique_identifier.ToString();
                        }
                    }
                    unique_accessions.Add(accession);

                    Protein protein = new Protein(sequence, accession, organism, geneName, name: name, fullName: fullName,
                        isContaminant: isContaminant, databaseFilePath: proteinDbLocation);
                    if (protein.Length == 0)
                    {
                        errors.Add("Line" + lastLineOfEntry + ", Protein Length of 0: " + protein.Name + " was skipped from database: " + proteinDbLocation);
                    }
                    else
                    {
                        targets.Add(protein);
                    }

                    accession = null;
                    name = null;
                    fullName = null;
                    organism = null;
                    geneName = new List<Tuple<string, string>>();
                }

                // no input left
                if (endOfFile)
                {
                    break;
                }

                if (isHeader)
                {
                    accession = ApplyRegex(accessionRegex, line);
                    fullName = ApplyRegex(fullNameRegex, line);
                    name = ApplyRegex(nameRegex, line);
                    organism = ApplyRegex(organismRegex, line);
                    string geneNameString = ApplyRegex(geneNameRegex, line);
                    if (geneNameString != null)
                    {
                        geneName.Add(new Tuple<string, string>("primary", geneNameString));
                    }

                    // Fall back to the whole header text when no accession could be extracted.
                    if (accession == null || accession == "")
                    {
                        accession = line.Substring(1).TrimEnd();
                    }
                    sb = new StringBuilder();
                }
                else if (sb != null)
                {
                    // Sequence line; lines that precede the first header are ignored (sb is still null).
                    sb.Append(line.Trim());
                }

                lastLineOfEntry = line;
            }
        }
    }

    if (!targets.Any())
    {
        errors.Add("Error: No proteins could be read from the database: " + proteinDbLocation);
    }

    List<Protein> decoys = DecoyProteinGenerator.GenerateDecoys(targets, decoyType, maxThreads);
    return generateTargets ? targets.Concat(decoys).ToList() : decoys;
}