/// <summary>
/// Opens the given fasta file and hands the proteins it yields to the
/// <c>LoadProteins</c> overload that accepts the reader's protein sequence.
/// </summary>
/// <param name="fastaFile">Path of the fasta file to read.</param>
public void LoadProteins(string fastaFile)
{
    using (FastaReader fastaReader = new FastaReader(fastaFile))
    {
        // The overload is invoked while the reader is still open; the reader
        // is disposed as soon as that call returns.
        var proteinsFromFile = fastaReader.ReadNextProtein();
        LoadProteins(proteinsFromFile);
    }
}
/// <summary>
/// Protein grouping example: digests every protein of the bundled yeast fasta file,
/// picks a reproducible random subset of the peptides as "identified" and groups the
/// proteins on that subset, printing counts, elapsed time and working set to the console.
/// </summary>
/// <param name="protease">Protease used for the digestion and passed on to the grouping.</param>
/// <param name="percentIdentified">Fraction of all digested peptides to treat as identified.</param>
/// <param name="maxMissed">Maximum number of missed cleavages.</param>
/// <param name="minLength">Minimum peptide length passed to the digestion.</param>
/// <param name="maxLength">Maximum peptide length passed to the digestion.</param>
public static void Start(IProtease protease, double percentIdentified = 0.05, int maxMissed = 3, int minLength = 5, int maxLength = 35)
{
    Console.WriteLine("**Start Protein Grouping**");
    Stopwatch timer = Stopwatch.StartNew();

    List<Peptide> allPeptides = new List<Peptide>();
    List<Protein> allProteins = new List<Protein>();

    using (FastaReader fasta = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
    {
        foreach (Protein entry in fasta.ReadNextProtein())
        {
            allPeptides.AddRange(entry.Digest(protease, maxMissed, minLength, maxLength));
            allProteins.Add(entry);
        }
    }

    Console.WriteLine("Loaded {0:N0} peptides from {1:N0} proteins in {2} ms", allPeptides.Count, allProteins.Count, timer.ElapsedMilliseconds);

    // Fixed seed so that the same peptides are selected on every run.
    Random rng = new Random(480912341);

    // Shuffle by a random sort key, then keep the leading x % as the identified peptides.
    int identifiedCount = (int) (allPeptides.Count * percentIdentified);
    List<Peptide> identifiedPeptides = allPeptides
        .OrderBy(p => rng.Next())
        .Take(identifiedCount)
        .ToList();

    List<ProteinGroup> groups = ProteinGroup
        .GroupProteins(allProteins, protease, identifiedPeptides, new AminoAcidLeucineSequenceComparer(), maxMissed)
        .ToList();

    timer.Stop();

    Console.WriteLine("{0:N0} proteins produced {1:N0} protein groups from {2:N0} identified sequences", allProteins.Count, groups.Count, identifiedPeptides.Count);
    Console.WriteLine("Time elapsed: {0}", timer.Elapsed);
    Console.WriteLine("Memory used: {0:N0} MB", System.Environment.WorkingSet / (1024 * 1024));
    Console.WriteLine("**END Protein Grouping**");
}
/// <summary>
/// Morpheus search example: digests every protein of the bundled yeast fasta file into a
/// set of distinct peptide candidates, loads them into a <c>MorpheusSearchEngine</c> and
/// prints the elapsed time and working set to the console.
/// </summary>
/// <param name="protease">Protease used for the digestion.</param>
/// <param name="maxMissed">Maximum number of missed cleavages.</param>
/// <param name="minLength">Minimum peptide length passed to the digestion.</param>
/// <param name="maxLength">Maximum peptide length passed to the digestion.</param>
public static void Start(IProtease protease, int maxMissed = 3, int minLength = 5, int maxLength = 35)
{
    Console.WriteLine("**Start Morpheus Search**");
    Stopwatch watch = new Stopwatch();
    watch.Start();

    // Generate peptide candidates; the set drops peptides it considers equal.
    HashSet<Peptide> peptides = new HashSet<Peptide>();
    using (FastaReader reader = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
    {
        foreach (Protein protein in reader.ReadNextProtein())
        {
            foreach (Peptide peptide in protein.Digest(protease, maxMissed, minLength, maxLength))
            {
                peptides.Add(peptide);
            }
        }
    }

    MSSearchEngine engine = new MorpheusSearchEngine();
    engine.PrecursorMassTolerance = Tolerance.FromPPM(100);
    engine.ProductMassTolerance = Tolerance.FromPPM(10);
    engine.LoadPeptides(peptides);

    // The timing covers both the digestion and loading the peptides into the engine.
    watch.Stop();
    Console.WriteLine("Time elapsed: {0}", watch.Elapsed);
    Console.WriteLine("Memory used: {0:N0} MB", Environment.WorkingSet / (1024 * 1024));
    Console.WriteLine("**End Morpheus Search**");
}
/// <summary>
/// Example of protein grouping: digests the bundled yeast fasta file, selects a reproducible
/// random subset of the resulting peptides as "identified" and groups the proteins on it.
/// Loading and grouping are timed separately and reported on the console.
/// </summary>
/// <param name="protease">Protease used for the digestion and passed on to the grouping.</param>
/// <param name="percentIdentified">Fraction of all digested peptides to treat as identified.</param>
/// <param name="maxMissed">Maximum number of missed cleavages.</param>
/// <param name="minLength">Minimum peptide length passed to the digestion.</param>
/// <param name="maxLength">Maximum peptide length passed to the digestion.</param>
public static void ExampleProteinGrouping(IProtease protease, double percentIdentified = 0.01, int maxMissed = 3, int minLength = 5, int maxLength = 50)
{
    Stopwatch timer = Stopwatch.StartNew();

    // Pre-sized so the lists do not have to grow repeatedly while loading.
    List<Peptide> digested = new List<Peptide>(1000000);
    List<Protein> loaded = new List<Protein>(7000);

    using (FastaReader fasta = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
    {
        foreach (Protein entry in fasta.ReadNextProtein())
        {
            foreach (Peptide piece in entry.Digest(protease, maxMissed, minLength, maxLength))
            {
                digested.Add(piece);
            }

            loaded.Add(entry);
        }
    }

    Console.WriteLine("Loaded {0:N0} peptides from {1:N0} proteins in {2} ms", digested.Count, loaded.Count, timer.ElapsedMilliseconds);

    // From here on only the selection and grouping are timed.
    timer.Restart();

    Random rng = new Random(480912341);

    // Shuffle by a random sort key, then keep the leading x % as the identified peptides.
    int identifiedCount = (int) (digested.Count * percentIdentified);
    List<Peptide> identifiedPeptides = digested
        .OrderBy(p => rng.Next())
        .Take(identifiedCount)
        .ToList();

    List<ProteinGroup> groups = ProteinGroup
        .GroupProteins(loaded, protease, identifiedPeptides, new AminoAcidLeucineSequenceComparer(), maxMissed)
        .ToList();

    timer.Stop();

    Console.WriteLine("{0:N0} proteins produced {1:N0} protein groups from {2:N0} identified sequences", loaded.Count, groups.Count, identifiedPeptides.Count);
    Console.WriteLine();
    Console.WriteLine("Time elapsed: {0} ms", timer.ElapsedMilliseconds);
}
/// <summary>
/// Digestion example: digests every protein of the bundled yeast fasta file, computes the
/// m/z at charge 2 of every peptide and prints counts, elapsed time and working set.
/// </summary>
/// <remarks>
/// Overwrites the static <c>AminoAcidPolymer.StoreSequenceString</c> flag with
/// <paramref name="storeSequenceString"/>; the previous value is not restored.
/// </remarks>
/// <param name="protease">Protease used for the digestion.</param>
/// <param name="maxMissed">Maximum number of missed cleavages.</param>
/// <param name="minLength">Minimum peptide length passed to the digestion.</param>
/// <param name="maxLength">Maximum peptide length passed to the digestion.</param>
/// <param name="storeSequenceString">Value assigned to <c>AminoAcidPolymer.StoreSequenceString</c> before digesting.</param>
public static void Start(IProtease protease, int maxMissed = 1, int minLength = 0, int maxLength = int.MaxValue, bool storeSequenceString = true)
{
    Console.WriteLine("**Start Digestion**");
    Stopwatch timer = Stopwatch.StartNew();

    List<Peptide> digested = new List<Peptide>();
    List<Protein> loaded = new List<Protein>();
    List<double> mzValues = new List<double>();

    AminoAcidPolymer.StoreSequenceString = storeSequenceString;

    using (FastaReader fasta = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
    {
        foreach (Protein entry in fasta.ReadNextProtein())
        {
            foreach (Peptide piece in entry.Digest(protease, maxMissed, minLength, maxLength))
            {
                digested.Add(piece);

                // Asking for the m/z forces the calculation of the mass and thus the chemical formula.
                double doublyChargedMz = piece.ToMz(2);
                mzValues.Add(doublyChargedMz);
            }

            loaded.Add(entry);
        }
    }

    timer.Stop();

    Console.WriteLine("{0:N0} proteins produced {1:N0} peptides using {2:N0} missed cleavages", loaded.Count, digested.Count, maxMissed);
    Console.WriteLine("Time elapsed: {0}", timer.Elapsed);
    Console.WriteLine("Memory used: {0:N0} MB", System.Environment.WorkingSet / (1024 * 1024));
    Console.WriteLine("**End Digestion**");
}
/// <summary>
/// Writes a CSV file with one row per protein of the bundled yeast fasta file, holding the
/// protein description, its number of amino acids and its monoisotopic mass.
/// </summary>
/// <param name="outputFilePath">Path of the CSV file to write.</param>
public static void BasicCsvWriting(string outputFilePath)
{
    Console.WriteLine("Writing file to: " + outputFilePath);

    // Create a stream writer to output data to a stream. In this case, the stream points to a file path on the
    // computer and is saved as a file. The using statement guarantees that the writer is disposed (flushed and
    // closed) when the block is left, even if an exception is thrown. It is the same as doing the following:
    //     StreamWriter writer = new StreamWriter(outputFilePath);
    //     try { writer.WriteLine("Hello World"); }
    //     finally { writer.Dispose(); }
    using (StreamWriter writer = new StreamWriter(outputFilePath))
    {
        // Now that the stream is open, we can write a line of text to it to serve as the header row.
        // CSV formats are just text files with fields separated by commas.
        writer.WriteLine("Protein Name,# of Amino Acids,Mass (da)");

        // Open a connection to a fasta file (very similar syntax as the StreamWriter above)
        using (FastaReader reader = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
        {
            // Loop over each protein in the fasta file
            foreach (Protein protein in reader.ReadNextProtein())
            {
                // StringBuilder objects are an effective tool for constructing strings to write to files.
                StringBuilder sb = new StringBuilder();

                // Fasta descriptions are free text and may themselves contain commas or quotes, which would
                // shift the columns of the row. Such fields are wrapped in double quotes before being added.
                sb.Append(EscapeCsvField(protein.Description));
                sb.Append(','); // we need to add the delimiter after each field we add as well.

                // Add the next item
                sb.Append(protein.Length);
                sb.Append(',');

                // Format the mass with the invariant culture: with the current culture a comma could be used
                // as the decimal separator, which would split the mass into two CSV fields.
                sb.Append(Convert.ToString(protein.MonoisotopicMass, System.Globalization.CultureInfo.InvariantCulture));
                // No delimiter is needed after the last field is added.

                // Now to write this string to the file itself.
                // We convert the StringBuilder object (named: sb) to a string, and then write it on its own line in the
                // file writer (named: writer)
                writer.WriteLine(sb.ToString());
            }
        }
    }

    // The file is automatically written and closed after exiting the using {} block.
}

/// <summary>
/// Returns <paramref name="field"/> in a form that is safe to place between CSV delimiters:
/// fields containing a comma, a double quote or a line break are enclosed in double quotes
/// with embedded double quotes doubled; all other fields are returned unchanged.
/// </summary>
private static string EscapeCsvField(string field)
{
    if (string.IsNullOrEmpty(field))
    {
        return string.Empty;
    }

    if (field.IndexOfAny(new[] { ',', '"', '\r', '\n' }) < 0)
    {
        return field;
    }

    return "\"" + field.Replace("\"", "\"\"") + "\"";
}
/// <summary>
/// Morpheus search example: digests every protein of the bundled yeast fasta file into a
/// set of distinct peptide candidates, loads them into a <c>MorpheusSearchEngine</c>, opens
/// the bundled Thermo raw file and prints the elapsed time and working set to the console.
/// The actual search over the raw file is currently disabled (see the comment in the body).
/// </summary>
/// <param name="protease">Protease used for the digestion.</param>
/// <param name="maxMissed">Maximum number of missed cleavages.</param>
/// <param name="minLength">Minimum peptide length passed to the digestion.</param>
/// <param name="maxLength">Maximum peptide length passed to the digestion.</param>
public static void Start(IProtease protease, int maxMissed = 3, int minLength = 5, int maxLength = 35)
{
    Console.WriteLine("**Start Morpheus Search**");
    Stopwatch watch = new Stopwatch();
    watch.Start();

    // Generate peptide candidates; the set drops peptides it considers equal.
    HashSet<Peptide> peptides = new HashSet<Peptide>();
    using (FastaReader reader = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
    {
        foreach (Protein protein in reader.ReadNextProtein())
        {
            foreach (Peptide peptide in protein.Digest(protease, maxMissed, minLength, maxLength))
            {
                peptides.Add(peptide);
            }
        }
    }

    MSSearchEngine engine = new MorpheusSearchEngine();
    engine.PrecursorMassTolerance = Tolerance.FromPPM(100);
    engine.ProductMassTolerance = Tolerance.FromPPM(10);
    engine.LoadPeptides(peptides);

    using (MSDataFile msDataFile = new ThermoRawFile("Resources/ThermoRawFileMS1MS2.raw"))
    {
        // The search itself is disabled; the raw file is only constructed and disposed.
        //SortedMaxSizedContainer<PeptideSpectralMatch> psms = engine.Search(msDataFile.Where(scan => scan.MsnOrder > 1));
        //foreach (MSDataScan scan in msDataFile.Where(scan => scan.MsnOrder > 1))
        //{
        //    List<PeptideSpectralMatch> psms = engine.Search(scan);
        //    Console.WriteLine("{0} {1}", scan.SpectrumNumber, psms.Count);
        //}
    }

    watch.Stop();
    Console.WriteLine("Time elapsed: {0}", watch.Elapsed);
    Console.WriteLine("Memory used: {0:N0} MB", System.Environment.WorkingSet / (1024 * 1024));
    Console.WriteLine("**End Morpheus Search**");
}
/// <summary>
/// Example of a tryptic digestion: digests every protein of the bundled yeast fasta file
/// (up to 3 missed cleavages, peptide lengths 5 to 50), collects the monoisotopic mass of
/// every peptide and prints the elapsed time to the console.
/// </summary>
public static void ExampleDigestion()
{
    const string fastaFilePath = "Resources/yeast_uniprot_120226.fasta";
    const int maxMissedCleavages = 3;
    const int minPeptideLength = 5;
    const int maxPeptideLength = 50;

    IProtease trypsin = Protease.GetProtease("Trypsin");
    List<double> peptideMasses = new List<double>();

    Stopwatch timer = Stopwatch.StartNew();

    using (FastaReader fasta = new FastaReader(fastaFilePath))
    {
        foreach (Protein entry in fasta.ReadNextProtein())
        {
            var pieces = entry.Digest(trypsin, maxMissedCleavages, minPeptideLength, maxPeptideLength);
            foreach (Peptide piece in pieces)
            {
                peptideMasses.Add(piece.MonoisotopicMass);
            }
        }
    }

    //Console.WriteLine("Average Peptide Mass = {0:F4}", peptideMasses.Average());

    timer.Stop();
    Console.WriteLine("Time elapsed: {0}", timer.Elapsed);
}
/// <summary>
/// Performs an in silico digestion of all the proteins found within the fasta file and maps the
/// supplied unique peptides onto the proteins that contain them. Peptides that the digestion does
/// not reach are searched for a second time with a plain substring (brute force) search.
/// </summary>
/// <param name="fastaFile">The fasta filename to perform the digestion on</param>
/// <param name="uniquePeptides">The unique peptides that were read in from the csv files; they are looked up by peptide sequence with every 'I' replaced by 'L'</param>
/// <param name="proteases">The proteases used to digest every protein</param>
/// <param name="semiDigestion">Perform a Semi Digestion</param>
/// <returns>The proteins that had at least one of the unique peptides mapped to them</returns>
/// <exception cref="ArgumentException">Thrown when a peptide could not be mapped to any protein, even by the brute force search</exception>
private List<Protein> GetMappedProteinsFromFasta(string fastaFile, Dictionary<string, Peptide> uniquePeptides, IList<Protease> proteases, bool semiDigestion = false)
{
    string fastaFileniceName = Path.GetFileName(fastaFile);

    // Comma separated list of the protease names, used in the log message only
    StringBuilder sb = new StringBuilder();
    foreach (Protease protease in proteases)
    {
        sb.Append(protease.Name);
        sb.Append(',');
    }
    if (sb.Length > 0)
    {
        sb.Remove(sb.Length - 1, 1);
    }
    Log("Performing {0}{1} digestion on {2}...", semiDigestion ? "semi " : "", sb, fastaFileniceName);

    int forwardProteins = 0, decoyProteins = 0, forwardProteinsMapped = 0, decoyProteinsMapped = 0, pepsMapped = 0;

    // All proteins that have a peptide that was in the input files. The value is the instance that
    // collects the peptides, so that an equal protein read later can be redirected to it.
    Dictionary<Protein, Protein> proteins = new Dictionary<Protein, Protein>(1 << 13);

    // Min and Max length of peptides
    int minLength = semiDigestion ? 1 : _smallestPeptide - 1;
    int maxLength = semiDigestion ? int.MaxValue : _largestPeptide + 1;

    // Open the reader for the protein database in the .fasta format
    using (FastaReader reader = new FastaReader(fastaFile))
    {
        // Read in each protein one-by-one
        foreach (Fasta fasta in reader.ReadNextFasta())
        {
            // Create a new protein from the fasta
            Protein prot = new Protein(fasta.Description, fasta.Sequence);

            // Check if the protein is a decoy protein or not
            if (prot.IsDecoy)
            {
                decoyProteins++;
            }
            else
            {
                forwardProteins++;
            }

            // Loop over each protease
            foreach (Protease protease in proteases)
            {
                // Digest the protein sequence with the given protease and max missed cleavages, limiting it
                // to the smallest and largest peptide observed (speed improvement)
                foreach (string pepSeq in AminoAcidPolymer.Digest(prot.Sequence, protease, MaxMissedCleavages, minLength, maxLength, semiDigestion: semiDigestion))
                {
                    // Is this one of the unique peptide sequences in the csv files? If not, we don't care about it.
                    // The lookup uses the leucine sequence (all I's replaced by L's).
                    Peptide pep;
                    if (!uniquePeptides.TryGetValue(pepSeq.Replace('I', 'L'), out pep))
                    {
                        continue;
                    }

                    // Find the stored instance of this protein, registering the protein when it is new.
                    // The peptide must go to the stored instance: adding it to 'prot' when an equal
                    // protein is already stored would attach it to an object that is never returned.
                    Protein mappedProt;
                    if (!proteins.TryGetValue(prot, out mappedProt))
                    {
                        proteins.Add(prot, prot);
                        mappedProt = prot;

                        if (prot.IsDecoy)
                        {
                            decoyProteinsMapped++;
                        }
                        else
                        {
                            forwardProteinsMapped++;
                        }
                    }

                    // Add the peptide to the protein
                    mappedProt.AddPeptide(pep);

                    // Mark that this peptide was successfully mapped, this is for error checking purposes
                    if (!pep.IsMapped)
                    {
                        pepsMapped++;
                        pep.IsMapped = true;
                    }
                }
            }
        }
    }

    // Check to see if every peptide is matched, if not try using a brute force search method instead
    if (uniquePeptides.Count > pepsMapped)
    {
        // Get all the peptides that weren't mapped
        List<Peptide> unMapedPeptides = uniquePeptides.Values.Where(p => !p.IsMapped).ToList();

        Log("[WARNING] Couldn't find every peptide using digestion method (wrong enzyme perhaps?), trying brute force search instead on the remaining {0} peptides...", unMapedPeptides.Count);
        ProgressUpdate(0.1);

        using (FastaReader reader = new FastaReader(fastaFile))
        {
            // Read in each protein one-by-one
            foreach (Fasta fasta in reader.ReadNextFasta())
            {
                string seq = fasta.Sequence.Replace('I', 'L');

                // Every remaining peptide is tested against every protein, also after its first hit,
                // so that it ends up on all proteins that contain it.
                foreach (Peptide pep2 in unMapedPeptides)
                {
                    if (!seq.Contains(pep2.LeucineSequence))
                    {
                        continue;
                    }

                    Protein prot = new Protein(fasta.Description, fasta.Sequence);
                    Protein realProt;
                    if (proteins.TryGetValue(prot, out realProt))
                    {
                        // Add the peptide to the protein that is already stored
                        realProt.AddPeptide(pep2);
                    }
                    else
                    {
                        proteins.Add(prot, prot);

                        if (prot.IsDecoy)
                        {
                            decoyProteinsMapped++;
                        }
                        else
                        {
                            forwardProteinsMapped++;
                        }

                        // Add the peptide to the protein
                        prot.AddPeptide(pep2);
                    }

                    // Mark that this peptide was successfully mapped, this is for error checking purposes
                    pep2.IsMapped = true;
                }
            }
        }

        // Still missing peptides? Log each of them, then fail.
        int count = 0;
        foreach (Peptide pep2 in unMapedPeptides)
        {
            if (pep2.IsMapped)
            {
                continue;
            }

            count++;
            Log("[ERROR]\tPeptide {0} was not isMapped", pep2);
        }

        if (count > 0)
        {
            throw new ArgumentException(
                string.Format(
                    "[ERROR] Unable to map every peptide ({0}) to {1}.  You might be using either the wrong database, enzyme, or max missed cleavages!",
                    count, fastaFileniceName));
        }
    }

    // A database without target or without decoy entries would otherwise be reported as "NaN%"
    double forwardPercent = forwardProteins > 0 ? 100.0 * forwardProteinsMapped / forwardProteins : 0.0;
    double decoyPercent = decoyProteins > 0 ? 100.0 * decoyProteinsMapped / decoyProteins : 0.0;

    Log("Every unique peptide was successfully mapped to at least one protein");
    Log("{0:N0} of {1:N0} ({2:F2}%) target proteins were mapped at least once", forwardProteinsMapped, forwardProteins, forwardPercent);
    Log("{0:N0} of {1:N0} ({2:F2}%) decoy proteins were mapped at least once", decoyProteinsMapped, decoyProteins, decoyPercent);

    // force the progress bar to go into marquee mode
    ProgressUpdate(0.0);

    // Return a list of all the proteins that were mapped at least once
    return proteins.Values.ToList();
}
/// <summary>
/// Protein grouping ramp: digests every protein of the bundled yeast fasta file, shuffles the
/// peptides reproducibly and then groups the proteins repeatedly while the fraction of peptides
/// treated as identified is raised from 0 to 1 in steps of <paramref name="percentIdentifiedSteps"/>.
/// One line with the number of peptides and protein groups is printed per step.
/// </summary>
/// <param name="protease">Protease used for the digestion and passed on to the grouping.</param>
/// <param name="percentIdentifiedSteps">Increment of the identified fraction per step; must be greater than zero.</param>
/// <param name="maxMissed">Maximum number of missed cleavages.</param>
/// <param name="minLength">Minimum peptide length passed to the digestion.</param>
/// <param name="maxLength">Maximum peptide length passed to the digestion.</param>
/// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="percentIdentifiedSteps"/> is not greater than zero (or is NaN).</exception>
public static void StartRamp(IProtease protease, double percentIdentifiedSteps = 0.05, int maxMissed = 3, int minLength = 5, int maxLength = 35)
{
    // A step of zero or less would never reach 1 and loop forever; written as a negated
    // comparison so that NaN is rejected as well.
    if (!(percentIdentifiedSteps > 0))
    {
        throw new ArgumentOutOfRangeException("percentIdentifiedSteps", percentIdentifiedSteps, "The step size must be greater than zero.");
    }

    List<Peptide> peps = new List<Peptide>();
    List<Protein> proteins = new List<Protein>();

    using (FastaReader reader = new FastaReader("Resources/yeast_uniprot_120226.fasta"))
    {
        foreach (Protein protein in reader.ReadNextProtein())
        {
            foreach (Peptide peptide in protein.Digest(protease, maxMissed, minLength, maxLength))
            {
                peps.Add(peptide);
            }
            proteins.Add(protein);
        }
    }

    // Fixed seed to make it reproducible
    Random random = new Random(480912341);
    peps = peps.OrderBy(x => random.Next()).ToList();

    // The fraction is derived from an integer step index instead of being accumulated in a
    // double: repeated addition drifts (ten additions of 0.1 give 0.9999999999999999), which
    // made the last step miss the full peptide list or be skipped altogether.
    // The small epsilon keeps steps that divide 1 evenly from losing their final step.
    double lastStep = Math.Floor(1.0 / percentIdentifiedSteps + 1e-9);
    for (long step = 0; step <= lastStep; step++)
    {
        double percentIdentified = Math.Min(1.0, step * percentIdentifiedSteps);

        // Take the first x % to act as our identified peptides
        List<Peptide> identifiedPeptides = peps.Take((int) (peps.Count * percentIdentified)).ToList();

        List<ProteinGroup> proteinGroups = ProteinGroup.GroupProteins(proteins, protease, identifiedPeptides, new AminoAcidLeucineSequenceComparer(), maxMissed).ToList();

        Console.WriteLine("{0} peptides {1} protein groups", identifiedPeptides.Count, proteinGroups.Count);
    }
}
/// <summary>
/// Reads every entry of <paramref name="fasta_file"/> and writes the entries whose description
/// matches the UniProt header pattern to <paramref name="Writer"/>, either unchanged or converted
/// to a decoy when <c>Options.OutputType</c> is <c>DatabaseType.Decoy</c>. When
/// <c>Options.EnforceUniprot</c> is set, entries with a non-matching description are appended to
/// "InvalidUniprotheaders.txt" in the folder of the first input file and passed to <c>InvalidHeader</c>.
/// </summary>
/// <param name="fasta_file">Path of the fasta file to read.</param>
/// <param name="Writer">Writer that receives the accepted (target or decoy) entries.</param>
public void WriteFasta(string fasta_file, FastaWriter Writer)
{
    // Only a pure decoy database converts the entries. The original condition also tested
    // Concatenated in its else-branch, but that test could never be reached.
    // NOTE(review): for DatabaseType.Concatenated only the target entries are written here -
    // confirm that the decoy half is produced by the caller.
    bool makeDecoy = Options.OutputType == DatabaseType.Decoy;

    const string invalidHeaderFileName = "InvalidUniprotheaders.txt";

    // The pattern is the same for every entry, so the regex is built once instead of per entry.
    Regex uniprotRegex = new Regex(@"(.+)\|(.+)\|(.+?)\s(.+?)\sOS=(.+?)(?:\sGN=(.+?))?(?:$|PE=(\d+)\sSV=(\d+))", RegexOptions.ExplicitCapture);

    using (FastaReader reader = new FastaReader(fasta_file))
    {
        foreach (Fasta fasta in reader.ReadNextFasta())
        {
            if (!uniprotRegex.IsMatch(fasta.Description))
            {
                if (Options.EnforceUniprot)
                {
                    // The log lives next to the first input file; the file is opened in append mode.
                    string headerFolder = Path.GetDirectoryName(Options.InputFiles[0]);
                    using (StreamWriter log = new StreamWriter(Path.Combine(headerFolder, invalidHeaderFileName), true))
                    {
                        log.WriteLine("Invalid Header:");
                        log.WriteLine();
                        log.WriteLine(fasta.Description);
                        log.WriteLine();
                        InvalidHeader(fasta);
                    }
                }

                // NOTE(review): entries with a non-UniProt header are dropped even when
                // EnforceUniprot is off - confirm that this is intended.
                continue;
            }

            if (makeDecoy)
            {
                // Equivalent to the original "(ExcludeNTerminalMethionine && !ExcludeNTerminalResidue) || ExcludeNTerminalResidue"
                bool excludeNTerminus = Options.ExcludeNTerminalMethionine || Options.ExcludeNTerminalResidue;
                Writer.Write(fasta.ToDecoy(Options.DecoyPrefix, Options.DecoyType, excludeNTerminus, Options.ExcludeNTerminalMethionine));
            }
            else
            {
                Writer.Write(fasta);
            }
        }
    }
}