/// <summary>
/// Performs an in silico digestion of all the proteins found within the fasta file and
/// maps every unique peptide (read from the input csv files) to the proteins it occurs in.
/// Falls back to a brute-force substring search for any peptide the enzymatic digestion misses.
/// </summary>
/// <param name="fastaFile">The fasta filename to perform the digestion on</param>
/// <param name="uniquePeptides">The unique peptides that were read in from the csv files, keyed by leucine sequence (all I's converted to L's)</param>
/// <param name="proteases">The proteases (enzymes) to digest each protein with</param>
/// <param name="semiDigestion">Perform a Semi Digestion</param>
/// <returns>The list of all proteins that were mapped by at least one unique peptide (targets and decoys)</returns>
/// <exception cref="ArgumentException">Thrown when one or more peptides cannot be mapped to any protein, even by brute force</exception>
private List<Protein> GetMappedProteinsFromFasta(string fastaFile, Dictionary<string, Peptide> uniquePeptides, IList<Protease> proteases, bool semiDigestion = false)
{
    string fastaFileniceName = Path.GetFileName(fastaFile);

    // Build a comma-separated list of protease names purely for the log message below
    StringBuilder sb = new StringBuilder();
    foreach (Protease protease in proteases)
    {
        sb.Append(protease.Name);
        sb.Append(',');
    }
    if (sb.Length > 0)
    {
        // Drop the trailing comma
        sb.Remove(sb.Length - 1, 1);
    }
    Log("Performing {0}{1} digestion on {2}...", semiDigestion ? "semi " : "", sb, fastaFileniceName);

    //Peptide.MappedCount = 0;
    // Counters: totals read vs. totals mapped, split target/decoy; pepsMapped counts
    // unique peptides matched at least once (used to decide if brute force is needed)
    int forwardProteins = 0, decoyProteins = 0, forwardProteinsMapped = 0, decoyProteinsMapped = 0, fastaCounter = 0, pepsMapped = 0;
    // File size in bytes; only used by the (currently disabled) progress-bar updates below
    long totalBytes = new FileInfo(fastaFile).Length;

    // A hashset of all proteins that have a peptide that was in the input files
    // (Dictionary used as a set so the canonical Protein instance can be retrieved later)
    Dictionary<Protein, Protein> proteins = new Dictionary<Protein, Protein>(1 << 13);

    // Min and Max length of peptides to generate during digestion (speed improvement:
    // skip peptides that cannot possibly match anything in the input set).
    // NOTE(review): the -1/+1 widening of the observed bounds looks deliberate
    // (presumably the Digest bounds are exclusive) — confirm against AminoAcidPolymer.Digest.
    int minLength = semiDigestion ? 1 : _smallestPeptide - 1;
    int maxLength = semiDigestion ? int.MaxValue : _largestPeptide + 1;

    // Open the reader for the protein database in the .fasta format
    using (FastaReader reader = new FastaReader(fastaFile))
    {
        // Read in each protein one-by-one
        foreach (Fasta fasta in reader.ReadNextFasta())
        {
            // The number of fasta (proteins) read in (for progress bar feedback)
            fastaCounter++;

            // Create a new protein from the fasta
            Protein prot = new Protein(fasta.Description, fasta.Sequence);

            // Check if the protein is a decoy protein or not
            if (prot.IsDecoy)
            {
                decoyProteins++;
            }
            else
            {
                forwardProteins++;
            }

            // Loop over each protease
            foreach (Protease protease in proteases)
            {
                // Digest the protein's leucine sequences (all I's are now L's) with the given proteases,
                // max missed cleavages, limiting it to the smallest and largest peptide observed (speed improvement)
                // *Note each peptide sequence (pep_seq) will be leucine sequences as well
                foreach (string pepSeq in AminoAcidPolymer.Digest(prot.Sequence, protease, MaxMissedCleavages, minLength, maxLength, semiDigestion: semiDigestion))
                {
                    // Is this one of the unique peptide sequences in the csv files? If not, we don't care about it
                    Peptide pep;
                    if (!uniquePeptides.TryGetValue(pepSeq.Replace('I', 'L'), out pep))
                        continue;

                    // Check to see if this protein has already been added to the list of proteins hit
                    if (!proteins.ContainsKey(prot)) // returns true if the protein is new to the hashset of proteins
                    {
                        proteins.Add(prot, prot);
                        if (prot.IsDecoy)
                        {
                            decoyProteinsMapped++;
                        }
                        else
                        {
                            forwardProteinsMapped++;
                        }
                    }

                    // Add the peptide to the protein (internally hashed, so don't worry about duplicates)
                    prot.AddPeptide(pep);

                    // Mark that this peptide was successfully mapped, this is for error checking purposes
                    if (!pep.IsMapped)
                    {
                        pepsMapped++;
                        pep.IsMapped = true;
                    }
                    //pep.MarkAsMapped();
                }
            }

            // Only call every 100 proteins otherwise you are wasting a lot of time refreshing and not doing actual work
            //if (fastaCounter > 100)
            //{
            //    fastaCounter = 0;
            //    ProgressUpdate((double)reader.Position / totalBytes);
            //}
        }
    }

    // Check to see if every peptide is matched, if not try using a brute force search method instead
    if (uniquePeptides.Count > pepsMapped)
    {
        // Get all the peptides that weren't mapped
        List<Peptide> unMapedPeptides = uniquePeptides.Values.Where(p => !p.IsMapped).ToList();
        Log("[WARNING] Couldn't find every peptide using digestion method (wrong enzyme perhaps?), trying brute force search instead on the remaining {0} peptides...", unMapedPeptides.Count);
        ProgressUpdate(0.1);

        using (FastaReader reader = new FastaReader(fastaFile))
        {
            fastaCounter = 0;
            // Read in each protein one-by-one
            foreach (Fasta fasta in reader.ReadNextFasta())
            {
                // Compare in leucine space, matching how the peptide keys were normalized above
                string seq = fasta.Sequence.Replace('I', 'L');
                foreach (Peptide pep2 in unMapedPeptides)
                {
                    if (!seq.Contains(pep2.LeucineSequence))
                        continue;

                    Protein prot = new Protein(fasta.Description, fasta.Sequence);
                    Protein realProt;
                    if (proteins.TryGetValue(prot, out realProt))
                    {
                        // Protein was already hit during digestion: add the peptide to the canonical instance
                        realProt.AddPeptide(pep2);
                    }
                    else
                    {
                        proteins.Add(prot, prot);
                        if (prot.IsDecoy)
                        {
                            decoyProteinsMapped++;
                        }
                        else
                        {
                            forwardProteinsMapped++;
                        }
                        // Add the peptide to the protein
                        prot.AddPeptide(pep2);
                    }

                    // Mark that this peptide was successfully mapped, this is for error checking purposes
                    pep2.IsMapped = true;
                    //pep2.MarkAsMapped();
                }
                fastaCounter++;

                //// Only call every 100 proteins otherwise you are wasting a lot of time refreshing and not doing actual work
                //if (fastaCounter > 100)
                //{
                //    fastaCounter = 0;
                //    ProgressUpdate((double)reader.BaseStream.Position / totalBytes);
                //}
            }
        }

        // Still missing peptides? Then nothing in the database contains them: report each one and abort.
        if (unMapedPeptides.Any(p => !p.IsMapped))
        {
            int count = 0;
            foreach (Peptide pep2 in unMapedPeptides)
            {
                if (pep2.IsMapped)
                    continue;
                count++;
                Log("[ERROR]\tPeptide {0} was not isMapped", pep2);
            }
            throw new ArgumentException(
                string.Format(
                    "[ERROR] Unable to map every peptide ({0}) to {1}. You might be using either the wrong database, enzyme, or max missed cleavages!",
                    count, fastaFileniceName));
        }
    }

    Log("Every unique peptide was successfully mapped to at least one protein");
    // NOTE(review): if the database contains no decoy (or no target) proteins the percentage
    // below is a 0.0/0.0 double division and prints NaN rather than throwing
    Log("{0:N0} of {1:N0} ({2:F2}%) target proteins were mapped at least once", forwardProteinsMapped, forwardProteins, 100.0 * (double)forwardProteinsMapped / (double)forwardProteins);
    Log("{0:N0} of {1:N0} ({2:F2}%) decoy proteins were mapped at least once", decoyProteinsMapped, decoyProteins, 100.0 * (double)decoyProteinsMapped / (double)decoyProteins);

    // force the progress bar to go into marquee mode
    ProgressUpdate(0.0);

    // Return a list of all the proteins that were mapped at least once
    return proteins.Values.ToList();
}
/// <summary>
/// Reads every protein from the given fasta file and writes it to the supplied writer,
/// converting entries to decoy sequences when the requested output database type is Decoy.
/// Entries whose headers do not match the UniProt format are skipped; when
/// <c>Options.EnforceUniprot</c> is set, each invalid header is also appended to a log file
/// and reported via <c>InvalidHeader</c>.
/// </summary>
/// <param name="fasta_file">Path of the fasta file to read proteins from</param>
/// <param name="Writer">The writer that receives each (possibly decoy) fasta entry</param>
public void WriteFasta(string fasta_file, FastaWriter Writer)
{
    // The original branching tested "Decoy || Concatenated" in an else-if AFTER an if that
    // had already matched Concatenated, making the second Concatenated test unreachable dead
    // code. Net effect (preserved here): only OutputType == Decoy produces decoy sequences.
    // NOTE(review): if Concatenated was meant to emit BOTH target and decoy entries, that was
    // never happening — confirm the intended behavior before changing it.
    bool makeDecoy = Options.OutputType == DatabaseType.Decoy;

    // Hoisted out of the per-protein loop: the pattern never changes, so build the regex once
    // instead of once per fasta entry. Matches canonical UniProt headers:
    // db|accession|entryName description OS=organism [GN=gene] [PE=n SV=n]
    Regex uniprotRegex = new Regex(@"(.+)\|(.+)\|(.+?)\s(.+?)\sOS=(.+?)(?:\sGN=(.+?))?(?:$|PE=(\d+)\sSV=(\d+))", RegexOptions.ExplicitCapture);

    // Invalid headers are appended (append: true) to this file next to the first input file
    string headerFile = "InvalidUniprotheaders.txt";

    using (FastaReader reader = new FastaReader(fasta_file))
    {
        foreach (Fasta fasta in reader.ReadNextFasta())
        {
            Match uniprotMatch = uniprotRegex.Match(fasta.Description);

            if (Options.EnforceUniprot && !uniprotMatch.Success)
            {
                // Record the offending header, then notify; the entry is still skipped below
                // because uniprotMatch.Success is false.
                string headerFolder = Path.GetDirectoryName(Options.InputFiles[0]);
                using (StreamWriter log = new StreamWriter(Path.Combine(headerFolder, headerFile), true))
                {
                    log.WriteLine("Invalid Header:");
                    log.WriteLine();
                    log.WriteLine(fasta.Description);
                    log.WriteLine();
                    InvalidHeader(fasta);
                }
            }

            if (uniprotMatch.Success)
            {
                // Excluding only the N-terminal methionine applies when we are not already
                // excluding the entire N-terminal residue (the latter supersedes it).
                bool excludeMethionine = Options.ExcludeNTerminalMethionine && !Options.ExcludeNTerminalResidue;

                if (makeDecoy)
                {
                    Writer.Write(fasta.ToDecoy(Options.DecoyPrefix, Options.DecoyType,
                        (excludeMethionine || Options.ExcludeNTerminalResidue), Options.ExcludeNTerminalMethionine));
                }
                else
                {
                    Writer.Write(fasta);
                }
            }
        }
    }
}