ReadNextFasta() public method

public ReadNextFasta ( ) : IEnumerable
return IEnumerable
Exemplo n.º 1
0
        /// <summary>
        /// Performs an in silico digestion of all the proteins found within the fasta file and maps the
        /// supplied unique peptides onto the proteins that produce them. Any peptide the digestion does
        /// not place is retried with a brute-force substring search over the same fasta file.
        /// </summary>
        /// <param name="fastaFile">The fasta filename to perform the digestion on</param>
        /// <param name="uniquePeptides">The unique peptides that were read in from the csv files; looked up by peptide sequence with every 'I' replaced by 'L'</param>
        /// <param name="proteases">The proteases used to digest each protein</param>
        /// <param name="semiDigestion">Perform a Semi Digestion</param>
        /// <returns>A list of every protein that had at least one of the unique peptides mapped to it</returns>
        /// <exception cref="ArgumentException">Thrown when at least one peptide could not be mapped to any protein, even after the brute-force search</exception>
        private List<Protein> GetMappedProteinsFromFasta(string fastaFile, Dictionary<string, Peptide> uniquePeptides, IList<Protease> proteases, bool semiDigestion = false)
        {
            string fastaFileniceName = Path.GetFileName(fastaFile);
            string proteaseNames = string.Join(",", proteases.Select(p => p.Name));
            Log("Performing {0}{1} digestion on {2}...", semiDigestion ? "semi " : "", proteaseNames, fastaFileniceName);

            int forwardProteins = 0, decoyProteins = 0, forwardProteinsMapped = 0, decoyProteinsMapped = 0, pepsMapped = 0;

            // Every protein that has at least one peptide from the input files. The value is the stored
            // instance for its key, so an equal protein built later can be resolved back to it.
            Dictionary<Protein, Protein> proteins = new Dictionary<Protein, Protein>(1 << 13);

            // Min and Max length of peptides
            int minLength = semiDigestion ? 1 : _smallestPeptide - 1;
            int maxLength = semiDigestion ? int.MaxValue : _largestPeptide + 1;

            // Open the reader for the protein database in the .fasta format
            using (FastaReader reader = new FastaReader(fastaFile))
            {
                // Read in each protein one-by-one (per-protein progress reporting is currently disabled)
                foreach (Fasta fasta in reader.ReadNextFasta())
                {
                    // Create a new protein from the fasta
                    Protein prot = new Protein(fasta.Description, fasta.Sequence);

                    // Count the protein as decoy or forward (target)
                    if (prot.IsDecoy)
                    {
                        decoyProteins++;
                    }
                    else
                    {
                        forwardProteins++;
                    }

                    // Loop over each protease
                    foreach (Protease protease in proteases)
                    {
                        // Digest the protein sequence with the given protease and max missed cleavages, limited to the
                        // smallest and largest peptide observed (speed improvement). Each resulting peptide sequence is
                        // converted to its leucine form (I -> L) before it is looked up.
                        foreach (string pepSeq in AminoAcidPolymer.Digest(prot.Sequence, protease, MaxMissedCleavages, minLength, maxLength, semiDigestion: semiDigestion))
                        {
                            // Is this one of the unique peptide sequences in the csv files? If not, we don't care about it
                            Peptide pep;
                            if (!uniquePeptides.TryGetValue(pepSeq.Replace('I', 'L'), out pep))
                            {
                                continue;
                            }

                            // Resolve the stored instance for this protein, registering it on first hit
                            Protein mappedProt;
                            if (!proteins.TryGetValue(prot, out mappedProt))
                            {
                                mappedProt = prot;
                                proteins.Add(prot, prot);
                                if (prot.IsDecoy)
                                {
                                    decoyProteinsMapped++;
                                }
                                else
                                {
                                    forwardProteinsMapped++;
                                }
                            }

                            // Add the peptide to the stored protein so the mapping shows up on the instance that is
                            // returned (same approach as the brute-force pass below)
                            mappedProt.AddPeptide(pep);

                            // Mark that this peptide was successfully mapped, this is for error checking purposes
                            if (!pep.IsMapped)
                            {
                                pepsMapped++;
                                pep.IsMapped = true;
                            }
                        }
                    }
                }
            }

            // Check to see if every peptide is matched, if not try using a brute force search method instead
            if (uniquePeptides.Count > pepsMapped)
            {
                // Get all the peptides that weren't mapped
                List<Peptide> unmappedPeptides = uniquePeptides.Values.Where(p => !p.IsMapped).ToList();

                Log("[WARNING] Couldn't find every peptide using digestion method (wrong enzyme perhaps?), trying brute force search instead on the remaining {0} peptides...", unmappedPeptides.Count);

                ProgressUpdate(0.1);
                using (FastaReader reader = new FastaReader(fastaFile))
                {
                    // Read in each protein one-by-one
                    foreach (Fasta fasta in reader.ReadNextFasta())
                    {
                        string seq = fasta.Sequence.Replace('I', 'L');

                        foreach (Peptide pep2 in unmappedPeptides)
                        {
                            if (!seq.Contains(pep2.LeucineSequence))
                            {
                                continue;
                            }

                            // Resolve the stored instance for this protein, registering it on first hit
                            Protein candidate = new Protein(fasta.Description, fasta.Sequence);
                            Protein mappedProt;
                            if (!proteins.TryGetValue(candidate, out mappedProt))
                            {
                                mappedProt = candidate;
                                proteins.Add(candidate, candidate);
                                if (candidate.IsDecoy)
                                {
                                    decoyProteinsMapped++;
                                }
                                else
                                {
                                    forwardProteinsMapped++;
                                }
                            }

                            // Add the peptide to the protein
                            mappedProt.AddPeptide(pep2);

                            // Mark that this peptide was successfully mapped, this is for error checking purposes
                            pep2.IsMapped = true;
                        }
                    }
                }

                // Still missing peptides? Log each one, then fail with the total.
                int count = 0;
                foreach (Peptide pep2 in unmappedPeptides)
                {
                    if (pep2.IsMapped)
                    {
                        continue;
                    }
                    count++;
                    Log("[ERROR]\tPeptide {0} was not isMapped", pep2);
                }

                if (count > 0)
                {
                    throw new ArgumentException(
                        string.Format(
                            "[ERROR] Unable to map every peptide ({0}) to {1}. You might be using either the wrong database, enzyme, or max missed cleavages!",
                            count, fastaFileniceName));
                }
            }

            // Guard the percentages: with zero proteins of a kind the division would be 0/0 and log "NaN"
            double forwardPercent = forwardProteins > 0 ? 100.0 * forwardProteinsMapped / forwardProteins : 0.0;
            double decoyPercent = decoyProteins > 0 ? 100.0 * decoyProteinsMapped / decoyProteins : 0.0;

            Log("Every unique peptide was successfully mapped to at least one protein");
            Log("{0:N0} of {1:N0} ({2:F2}%) target proteins were mapped at least once", forwardProteinsMapped, forwardProteins, forwardPercent);
            Log("{0:N0} of {1:N0} ({2:F2}%) decoy proteins were mapped at least once", decoyProteinsMapped, decoyProteins, decoyPercent);

            // force the progress bar to go into marquee mode
            ProgressUpdate(0.0);

            // Return a list of all the proteins that were mapped at least once
            return proteins.Values.ToList();
        }
Exemplo n.º 2
0
        /// <summary>
        /// Reads every entry of the given fasta file and writes it, or its decoy form, to the supplied writer.
        /// Only entries whose description matches the UniProt header pattern are written. When
        /// Options.EnforceUniprot is set, each non-matching description is appended to
        /// "InvalidUniprotheaders.txt" in the directory of the first input file and passed to InvalidHeader.
        /// </summary>
        /// <param name="fasta_file">Path of the fasta file to read</param>
        /// <param name="Writer">Writer that receives the target or decoy entries</param>
        public void WriteFasta(string fasta_file, FastaWriter Writer)
        {
            const string invalidHeaderFileName = "InvalidUniprotheaders.txt";

            // Decoy entries are written only for the Decoy output type; every other type writes the entry unchanged.
            // NOTE(review): Concatenated writes target entries only from this method - confirm the decoy half
            // of a concatenated database is produced by the caller.
            bool makeDecoy = Options.OutputType == DatabaseType.Decoy;

            // Built once and reused for every entry instead of being re-created per iteration
            Regex uniprotRegex = new Regex(@"(.+)\|(.+)\|(.+?)\s(.+?)\sOS=(.+?)(?:\sGN=(.+?))?(?:$|PE=(\d+)\sSV=(\d+))", RegexOptions.ExplicitCapture);

            using (FastaReader reader = new FastaReader(fasta_file))
            {
                foreach (Fasta fasta in reader.ReadNextFasta())
                {
                    bool isUniprotHeader = uniprotRegex.Match(fasta.Description).Success;

                    if (!isUniprotHeader)
                    {
                        if (Options.EnforceUniprot)
                        {
                            // Resolve the log location only when there is actually something to log
                            string headerFolder = Path.GetDirectoryName(Options.InputFiles[0]);
                            using (StreamWriter log = new StreamWriter(Path.Combine(headerFolder, invalidHeaderFileName), true))
                            {
                                log.WriteLine("Invalid Header:");
                                log.WriteLine();
                                log.WriteLine(fasta.Description);
                                log.WriteLine();
                                InvalidHeader(fasta);
                            }
                        }

                        // Entries without a UniProt header are never written
                        continue;
                    }

                    if (makeDecoy)
                    {
                        // Equivalent to the former (ExcludeNTerminalMethionine && !ExcludeNTerminalResidue) || ExcludeNTerminalResidue
                        bool excludeNTerminus = Options.ExcludeNTerminalMethionine || Options.ExcludeNTerminalResidue;
                        Writer.Write(fasta.ToDecoy(Options.DecoyPrefix, Options.DecoyType, excludeNTerminus, Options.ExcludeNTerminalMethionine));
                    }
                    else
                    {
                        Writer.Write(fasta);
                    }
                }
            }
        }