FastaReader, CSMSL.IO C# (CSharp)代码示例

示例#1

0

显示文件

文件： PsmReader.cs 项目： yhayirsevertr/CSMSL

 public void LoadProteins(string fastaFile)
 {
     using (FastaReader reader = new FastaReader(fastaFile))
     {
         LoadProteins(reader.ReadNextProtein());
     }
 }

示例#2

0

显示文件

文件： ProteinGroupingExample.cs 项目： dbaileychess/CSMSL

        public static void Start(IProtease protease, double percentIdentified = 0.05, int maxMissed = 3, int minLength = 5, int maxLength = 35)
        {
            Console.WriteLine("**Start Protein Grouping**");
            Stopwatch watch = new Stopwatch();
            watch.Start();
            List<Peptide> peps = new List<Peptide>();
            List<Protein> proteins = new List<Protein>();

            using (FastaReader reader = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
            {
                foreach (Protein protein in reader.ReadNextProtein())
                {
                    foreach (Peptide peptide in protein.Digest(protease, maxMissed, minLength, maxLength))
                    {
                        peps.Add(peptide);
                    }
                    proteins.Add(protein);
                }
            }
            Console.WriteLine("Loaded {0:N0} peptides from {1:N0} proteins in {2} ms", peps.Count, proteins.Count, watch.ElapsedMilliseconds);

            // Fixed seed to make it reproducible
            Random random = new Random(480912341);

            // Take the first x % to act as our identified peptides
            List<Peptide> identifiedPeptides = peps.OrderBy(x => random.Next()).Take((int) (peps.Count*percentIdentified)).ToList();

            List<ProteinGroup> proteinGroups = ProteinGroup.GroupProteins(proteins, protease, identifiedPeptides, new AminoAcidLeucineSequenceComparer(), maxMissed).ToList();

            watch.Stop();
            Console.WriteLine("{0:N0} proteins produced {1:N0} protein groups from {2:N0} identified sequences", proteins.Count, proteinGroups.Count, identifiedPeptides.Count);
            Console.WriteLine("Time elapsed: {0}", watch.Elapsed);
            Console.WriteLine("Memory used: {0:N0} MB", System.Environment.WorkingSet/(1024*1024));
            Console.WriteLine("**END Protein Grouping**");
        }

示例#3

0

显示文件

文件： MorpheusSearch.cs 项目： dbaileychess/CSMSL

        public static void Start(IProtease protease, int maxMissed = 3, int minLength = 5, int maxLength = 35)
        {
            Console.WriteLine("**Start Morpheus Search**");
            Stopwatch watch = new Stopwatch();
            watch.Start();
            List<int> hashCodes = new List<int>();
            // Generate peptide candidates

            HashSet<Peptide> peptides = new HashSet<Peptide>();
            using (FastaReader reader = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
            {
                foreach (Protein protein in reader.ReadNextProtein())
                {
                    foreach (Peptide peptide in protein.Digest(protease, maxMissed, minLength, maxLength))
                    {
                        peptides.Add(peptide);
                    }
                }
            }

            MSSearchEngine engine = new MorpheusSearchEngine();
            engine.PrecursorMassTolerance = Tolerance.FromPPM(100);
            engine.ProductMassTolerance = Tolerance.FromPPM(10);

            engine.LoadPeptides(peptides);

            watch.Stop();
            Console.WriteLine("Time elapsed: {0}", watch.Elapsed);
            Console.WriteLine("Memory used: {0:N0} MB", Environment.WorkingSet/(1024*1024));
            Console.WriteLine("**End Morpheus Search**");
        }

示例#4

0

显示文件

文件： ProteinGroupingExample.cs 项目： dbaileychess/CSMSL

        public static void ExampleProteinGrouping(IProtease protease, double percentIdentified = 0.01, int maxMissed = 3, int minLength = 5, int maxLength = 50)
        {
            Stopwatch watch = new Stopwatch();
            watch.Start();
            List<Peptide> peps = new List<Peptide>(1000000);
            List<Protein> proteins = new List<Protein>(7000);

            using (FastaReader reader = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
            {
                foreach (Protein protein in reader.ReadNextProtein())
                {
                    peps.AddRange(protein.Digest(protease, maxMissed, minLength, maxLength));
                    proteins.Add(protein);
                }
            }
            Console.WriteLine("Loaded {0:N0} peptides from {1:N0} proteins in {2} ms", peps.Count, proteins.Count, watch.ElapsedMilliseconds);
            watch.Restart();

            Random random = new Random(480912341);

            // Take the first x % to act as our identified peptides
            List<Peptide> identifiedPeptides = peps.OrderBy(x => random.Next()).Take((int) (peps.Count*percentIdentified)).ToList();

            List<ProteinGroup> proteinGroups = ProteinGroup.GroupProteins(proteins, protease, identifiedPeptides, new AminoAcidLeucineSequenceComparer(), maxMissed).ToList();

            watch.Stop();

            Console.WriteLine("{0:N0} proteins produced {1:N0} protein groups from {2:N0} identified sequences", proteins.Count, proteinGroups.Count, identifiedPeptides.Count);
            Console.WriteLine();
            Console.WriteLine("Time elapsed: {0} ms", watch.ElapsedMilliseconds);
        }

示例#5

0

显示文件

文件： TrypticDigestion.cs 项目： B-Rich/CSMSL

 public static void Start(IProtease protease, int maxMissed = 1, int minLength = 0, int maxLength = int.MaxValue, bool storeSequenceString = true)
 {
     Console.WriteLine("**Start Digestion**");
     Stopwatch watch = new Stopwatch();
     watch.Start();
     List<Peptide> peps = new List<Peptide>();
     List<Protein> prots = new List<Protein>();
     List<double> allMzs = new List<double>();
     AminoAcidPolymer.StoreSequenceString = storeSequenceString;
     using (FastaReader reader = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
     {
         foreach (Protein protein in reader.ReadNextProtein())
         {
             foreach (Peptide peptide in protein.Digest(protease, maxMissed, minLength, maxLength))
             {
                 peps.Add(peptide);
                 allMzs.Add(peptide.ToMz(2)); // forces the calculation of the mass and thus chemical formula
             }
             prots.Add(protein);
         }
     }
     watch.Stop();
     Console.WriteLine("{0:N0} proteins produced {1:N0} peptides using {2:N0} missed cleavages", prots.Count, peps.Count, maxMissed);
     Console.WriteLine("Time elapsed: {0}", watch.Elapsed);
     Console.WriteLine("Memory used: {0:N0} MB", System.Environment.WorkingSet / (1024 * 1024));
     Console.WriteLine("**End Digestion**");
 }

示例#6

0

显示文件

文件： FileOutputExamples.cs 项目： B-Rich/CSMSL

        public static void BasicCsvWriting(string outputFilePath)
        {
            Console.WriteLine("Writing file to: " + outputFilePath);

            // Create a stream writer to output data to a stream. In this case, the stream points to a file path on the
            // computer and is saved as a file. The using statement is the same as doing the following:
            // Stream writer = new StreamWriter(outputFilePath);
            // writer.Open();
            // writer.WriteLine("Hello World");
            // writer.Close();
            using (StreamWriter writer = new StreamWriter(outputFilePath))
            {
                // Now that the stream is open, we can a line of text to it to serve as the header row.
                // CSV formats are just text files with fields seperated by commas.
                writer.WriteLine("Protein Name,# of Amino Acids,Mass (da)");

                // Open a connection to a fasta file (very similar syntax as the StreamWriter above)
                using (FastaReader reader = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
                {
                    // Loop over each protein in the fasta file
                    foreach (Protein protein in reader.ReadNextProtein())
                    {
                        // StringBuilder objects are an effective tool for constructing strings to write to files.
                        StringBuilder sb = new StringBuilder();

                        // To add items to the string builder, just call the append method with whatever you want to add
                        sb.Append(protein.Description);
                        sb.Append(','); // we need to add the delimiter after each field we add as well.

                        // Add the next item
                        sb.Append(protein.Length);
                        sb.Append(',');

                        // The append method can take any object, it will simply call the .ToString() method on the object supplied.
                        sb.Append(protein.MonoisotopicMass);

                        // No delimiter is needed after the last field is added.

                        // Now to write this string to the file itself.
                        // We convert the StringBuilder object (named: sb) to a string, and then write it on its own line in the
                        // file writer (named: writer)
                        writer.WriteLine(sb.ToString());
                    }
                }
            } // The file is automatically written and closed after exiting the using {} block.
        }

示例#7

0

显示文件

文件： MorpheusSearch.cs 项目： B-Rich/CSMSL

        public static void Start(IProtease protease, int maxMissed = 3, int minLength = 5, int maxLength = 35)
        {
            Console.WriteLine("**Start Morpheus Search**");
            Stopwatch watch = new Stopwatch();
            watch.Start();
            List<int> hashCodes = new List<int>();
            // Generate peptide candidates

            HashSet<Peptide> peptides = new HashSet<Peptide>();
            using (FastaReader reader = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
            {
                foreach (Protein protein in reader.ReadNextProtein())
                {
                    foreach (Peptide peptide in protein.Digest(protease, maxMissed, minLength, maxLength))
                    {
                        peptides.Add(peptide);
                    }
                }
            }

            MSSearchEngine engine = new MorpheusSearchEngine();
            engine.PrecursorMassTolerance = Tolerance.FromPPM(100);
            engine.ProductMassTolerance = Tolerance.FromPPM(10);

            engine.LoadPeptides(peptides);
            using (MSDataFile msDataFile = new ThermoRawFile("Resources/ThermoRawFileMS1MS2.raw"))
            {
                //SortedMaxSizedContainer<PeptideSpectralMatch> psms = engine.Search(msDataFile.Where(scan => scan.MsnOrder > 1));

                //foreach (MSDataScan scan in msDataFile.Where(scan => scan.MsnOrder > 1))
                //{
                //    List<PeptideSpectralMatch> psms = engine.Search(scan);
                //    Console.WriteLine("{0} {1}", scan.SpectrumNumber, psms.Count);
                //}
            }
            watch.Stop();
            Console.WriteLine("Time elapsed: {0}", watch.Elapsed);
            Console.WriteLine("Memory used: {0:N0} MB", System.Environment.WorkingSet / (1024 * 1024));
            Console.WriteLine("**End Morpheus Search**");
        }

示例#8

0

显示文件

文件： TrypticDigestion.cs 项目： dbaileychess/CSMSL

 public static void ExampleDigestion()
 {
     const string fastaFilePath = "Resources/yeast_uniprot_120226.fasta";
     IProtease trypsin = Protease.GetProtease("Trypsin");
     const int maxMissedCleavages = 3;
     const int minPeptideLength = 5;
     const int maxPeptideLength = 50;
     List<double> masses = new List<double>();
     Stopwatch watch = new Stopwatch();
     watch.Start();
     using (FastaReader reader = new FastaReader(fastaFilePath))
     {
         foreach (Protein protein in reader.ReadNextProtein())
         {
             foreach (Peptide peptide in protein.Digest(trypsin, maxMissedCleavages, minPeptideLength, maxPeptideLength))
             {
                 masses.Add(peptide.MonoisotopicMass);
             }
         }
     }
     //Console.WriteLine("Average Peptide Mass = {0:F4}", masses.Average());
     watch.Stop();
     Console.WriteLine("Time elapsed: {0}", watch.Elapsed);
 }

示例#9

0

显示文件

文件： PsmReader.cs 项目： kmmbvnr/CSMSL

 public void LoadProteins(string fastaFile)
 {
     using (FastaReader reader = new FastaReader(fastaFile))
     {
         LoadProteins(reader.ReadNextProtein());
     }
 }

示例#10

0

显示文件

文件： ProteinHoarder.cs 项目： dbaileychess/Compass

        /// <summary>
        /// Performs an in silico digestion of all the proteins found within the fasta file.
        /// </summary>
        /// <param name="fastaFile">The fasta filename to perfrom the digestion on</param>
        /// <param name="uniquePeptides">The unique peptides that were read in from the csv files</param>
        /// <param name="proteases"></param>
        /// <param name="semiDigestion">Perform a Semi Digestion</param>
        /// <returns>True if all the unique peptides get isMapped to at least one protein, false otherwise</returns>
        private List<Protein> GetMappedProteinsFromFasta(string fastaFile, Dictionary<string, Peptide> uniquePeptides, IList<Protease> proteases, bool semiDigestion = false)
        {
            string fastaFileniceName = Path.GetFileName(fastaFile);
            StringBuilder sb = new StringBuilder();
            foreach (Protease protease in proteases)
            {
                sb.Append(protease.Name);
                sb.Append(',');
            }
            if (sb.Length > 0) { sb.Remove(sb.Length - 1, 1); }
            Log("Performing {0}{1} digestion on {2}...", semiDigestion ? "semi " : "", sb, fastaFileniceName);
            //Peptide.MappedCount = 0;
            int forwardProteins = 0, decoyProteins = 0, forwardProteinsMapped = 0, decoyProteinsMapped = 0, fastaCounter = 0, pepsMapped = 0;
            long totalBytes = new FileInfo(fastaFile).Length;

            // A hashset of all proteins that have a peptide that was in the input files
            Dictionary<Protein, Protein> proteins = new Dictionary<Protein, Protein>(1 << 13);

            // Min and Max length of peptides
            int minLength = semiDigestion ? 1 : _smallestPeptide - 1;
            int maxLength = semiDigestion ? int.MaxValue : _largestPeptide + 1;

            // Open the reader for the protein database in the .fasta format
            using (FastaReader reader = new FastaReader(fastaFile))
            {
                // Read in each protein one-by-one
                foreach (Fasta fasta in reader.ReadNextFasta())
                {
                    // The number of fasta (proteins) read in (for progress bar feedback)
                    fastaCounter++;

                    // Create a new protein from the fasta
                    Protein prot = new Protein(fasta.Description, fasta.Sequence);

                    // Check if the protein is a decoy protein or not
                    if (prot.IsDecoy)
                    {
                        decoyProteins++;
                    }
                    else
                    {
                        forwardProteins++;
                    }

                    // Loop over each protease
                    foreach (Protease protease in proteases)
                    {

                        // Digest the protein's leucine sequences (all I's are now L's) with the given proteases, max missed cleavages, limiting it to the smallest and largest peptide observed (speed improvement)
                        // *Note each peptide sequence (pep_seq) will be leucine sequences as well
                        foreach (string pepSeq in AminoAcidPolymer.Digest(prot.Sequence, protease, MaxMissedCleavages, minLength, maxLength, semiDigestion: semiDigestion))
                        {
                            // Is this one of the unique peptide sequences in the csv files? If not, we don't care about it
                            Peptide pep;
                            if (!uniquePeptides.TryGetValue(pepSeq.Replace('I', 'L'), out pep))
                                continue;

                            // Check to see if this protein has already been added to the list of proteins hit
                            if(!proteins.ContainsKey(prot)) // returns true if the protein is new to the hashset of proteins
                            {
                                proteins.Add(prot, prot);
                                if (prot.IsDecoy)
                                {
                                    decoyProteinsMapped++;
                                }
                                else
                                {
                                    forwardProteinsMapped++;
                                }
                            }

                            // Add the peptide to the protein (internally hashed, so don't worry about duplicates)
                            prot.AddPeptide(pep);

                            // Mark that this peptide was successfully mapped, this is for error checking purposes
                            if (!pep.IsMapped)
                            {
                                pepsMapped++;
                                pep.IsMapped = true;
                            }
                            //pep.MarkAsMapped();
                        }

                    }

                    // Only call every 100 proteins otherwise you are wasting a lot of time refreshing and not doing actual work
                    //if (fastaCounter > 100)
                    //{
                    //    fastaCounter = 0;
                    //    ProgressUpdate((double)reader.Position / totalBytes);
                    //}
                }
            }

            // Check to see if every peptide is matched, if not try using a brute force search method instead
            if (uniquePeptides.Count > pepsMapped)
            {
                // Get all the peptides that weren't mapped
                List<Peptide> unMapedPeptides = uniquePeptides.Values.Where(p => !p.IsMapped).ToList();

                Log("[WARNING] Couldn't find every peptide using digestion method (wrong enzyme perhaps?), trying brute force search instead on the remaining {0} peptides...", unMapedPeptides.Count);

                ProgressUpdate(0.1);
                using (FastaReader reader = new FastaReader(fastaFile))
                {
                    fastaCounter = 0;

                    // Read in each protein one-by-one
                    foreach (Fasta fasta in reader.ReadNextFasta())
                    {
                        string seq = fasta.Sequence.Replace('I', 'L');

                        foreach (Peptide pep2 in unMapedPeptides)
                        {
                            if (!seq.Contains(pep2.LeucineSequence))
                                continue;

                            Protein prot = new Protein(fasta.Description, fasta.Sequence);
                            Protein realProt;
                            if (proteins.TryGetValue(prot, out realProt))
                            {
                                // Add the peptide to the protein
                                realProt.AddPeptide(pep2);
                            }
                            else
                            {
                                proteins.Add(prot, prot);
                                if (prot.IsDecoy)
                                {
                                    decoyProteinsMapped++;
                                }
                                else
                                {
                                    forwardProteinsMapped++;
                                }

                                // Add the peptide to the protein
                                prot.AddPeptide(pep2);
                            }

                            // Mark that this peptide was successfully isMapped, this is for error checking purposes
                            pep2.IsMapped = true;
                            //pep2.MarkAsMapped();
                        }

                        fastaCounter++;

                        //// Only call every 100 proteins otherwise you are wasting a lot of time refreshing and not doing actual work
                        //if (fastaCounter > 100)
                        //{
                        //    fastaCounter = 0;
                        //    ProgressUpdate((double)reader.BaseStream.Position / totalBytes);
                        //}

                    }
                }

                // Still missing peptides?
                if (unMapedPeptides.Any(p => !p.IsMapped))
                {
                    int count = 0;
                    foreach (Peptide pep2 in unMapedPeptides)
                    {
                        if (pep2.IsMapped)
                            continue;
                        count++;
                        Log("[ERROR]\tPeptide {0} was not isMapped", pep2);
                    }
                    throw new ArgumentException(
                        string.Format(
                            "[ERROR] Unable to map every peptide ({0}) to {1}. You might be using either the wrong database, enzyme, or max missed cleavages!",
                            count, fastaFileniceName));
                }

            }

            Log("Every unique peptide was successfully mapped to at least one protein");
            Log("{0:N0} of {1:N0} ({2:F2}%) target proteins were mapped at least once", forwardProteinsMapped, forwardProteins, 100.0 * (double)forwardProteinsMapped / (double)forwardProteins);
            Log("{0:N0} of {1:N0} ({2:F2}%) decoy proteins were mapped at least once", decoyProteinsMapped, decoyProteins, 100.0 * (double)decoyProteinsMapped / (double)decoyProteins);

            // force the progress bar to go into marquee mode
            ProgressUpdate(0.0);

            // Return a list of all the proteins that were isMapped at least once
            return proteins.Values.ToList();
        }

示例#11

0

显示文件

文件： ProteinGroupingExample.cs 项目： dbaileychess/CSMSL

        public static void StartRamp(IProtease protease, double percentIdentifiedSteps = 0.05, int maxMissed = 3, int minLength = 5, int maxLength = 35)
        {
            List<Peptide> peps = new List<Peptide>();
            List<Protein> proteins = new List<Protein>();

            using (FastaReader reader = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
            {
                foreach (Protein protein in reader.ReadNextProtein())
                {
                    foreach (Peptide peptide in protein.Digest(protease, maxMissed, minLength, maxLength))
                    {
                        peps.Add(peptide);
                    }
                    proteins.Add(protein);
                }
            }

            // Fixed seed to make it reproducible
            Random random = new Random(480912341);
            peps = peps.OrderBy(x => random.Next()).ToList();

            for (double percentIdentified = 0; percentIdentified <= 1; percentIdentified += percentIdentifiedSteps)
            {
                // Take the first x % to act as our identified peptides
                List<Peptide> identifiedPeptides = peps.Take((int) (peps.Count*percentIdentified)).ToList();

                List<ProteinGroup> proteinGroups = ProteinGroup.GroupProteins(proteins, protease, identifiedPeptides, new AminoAcidLeucineSequenceComparer(), maxMissed).ToList();
                Console.WriteLine("{0} peptides {1} protein groups", identifiedPeptides.Count, proteinGroups.Count);
            }
        }

示例#12

0

显示文件

文件： DatabaseMaker.cs 项目： dbaileychess/Compass

        public void WriteFasta(string fasta_file, FastaWriter Writer)
        {
            bool MakeDecoy = false;

            if (Options.OutputType == DatabaseType.Target || Options.OutputType == DatabaseType.Concatenated)
            {
                MakeDecoy = false;
            }
            else if (Options.OutputType == DatabaseType.Decoy || Options.OutputType == DatabaseType.Concatenated)
            {
                MakeDecoy = true;
            }

            using (FastaReader reader = new FastaReader(fasta_file))
            {

                foreach (Fasta fasta in reader.ReadNextFasta())
                {
                    Regex uniprotRegex = new Regex(@"(.+)\|(.+)\|(.+?)\s(.+?)\sOS=(.+?)(?:\sGN=(.+?))?(?:$|PE=(\d+)\sSV=(\d+))", RegexOptions.ExplicitCapture);
                    Match UniprotMatch = uniprotRegex.Match(fasta.Description);
                    string HeaderFile = "InvalidUniprotheaders.txt";
                    string headerFolder = Path.GetDirectoryName(Options.InputFiles[0]);

                    if (Options.EnforceUniprot && !UniprotMatch.Success)
                    {
                        using (StreamWriter log = new StreamWriter(Path.Combine(headerFolder, HeaderFile), true))
                        {
                            log.WriteLine("Invalid Header:");
                            log.WriteLine();
                            log.WriteLine(fasta.Description);
                            log.WriteLine();
                            InvalidHeader(fasta);
                        }
                    }

                    if (UniprotMatch.Success)
                    {
                        bool excludeMethionine = false;
                        if (Options.ExcludeNTerminalMethionine && !Options.ExcludeNTerminalResidue)
                        {
                            excludeMethionine = true;
                        }

                        if (MakeDecoy)
                        {
                            Writer.Write(fasta.ToDecoy(Options.DecoyPrefix, Options.DecoyType, (excludeMethionine || Options.ExcludeNTerminalResidue), Options.ExcludeNTerminalMethionine));
                        }

                        else
                        {
                            Writer.Write(fasta);
                        }

                    }

                }

            }
        }

C# (CSharp) CSMSL.IO FastaReader示例