Example #1
0
        /// <summary>
        /// Extracts one header field from <paramref name="line"/> using the supplied field regex.
        /// </summary>
        /// <param name="regex">Field definition supplying the pattern (<c>Regex</c>), the index of the match to use (<c>Match</c>) and the index of the capture group to use (<c>Group</c>). May be null.</param>
        /// <param name="line">The text the pattern is run against.</param>
        /// <returns>
        /// The value of the requested group in the requested match, or null when <paramref name="regex"/> is null,
        /// there are not enough matches, or the chosen match does not have enough groups.
        /// </returns>
        private static string ApplyRegex(FastaHeaderFieldRegex regex, string line)
        {
            // No regex configured for this field: nothing to extract.
            if (regex == null)
            {
                return null;
            }

            var allMatches = regex.Regex.Matches(line);

            // The requested match index must exist before it can be indexed.
            if (allMatches.Count <= regex.Match)
            {
                return null;
            }

            var groups = allMatches[regex.Match].Groups;
            return groups.Count > regex.Group ? groups[regex.Group].Value : null;
        }
Example #2
0
        /// <summary>
        /// Load a protein fasta database, using regular expressions to get various aspects of the headers.
        /// Each header field is extracted with <c>ApplyRegex</c>, i.e. from the match and capture group configured on its field regex.
        /// </summary>
        /// <param name="proteinDbLocation">Path of the fasta file. A path ending in "gz" is read through a <c>GZipStream</c>.</param>
        /// <param name="generateTargets">When true, the proteins read from the file are included in the returned list.</param>
        /// <param name="decoyType">Passed to <c>DecoyProteinGenerator.GenerateDecoys</c>; decoys are included in the result unless this is <c>DecoyType.None</c>.</param>
        /// <param name="isContaminant">Passed through to every created <c>Protein</c>.</param>
        /// <param name="accessionRegex">Field regex for the accession. If it yields null or an empty string, the header text after '&gt;' (trailing whitespace trimmed) is used instead.</param>
        /// <param name="fullNameRegex">Field regex for the full name.</param>
        /// <param name="nameRegex">Field regex for the name.</param>
        /// <param name="geneNameRegex">Field regex for the gene name; a non-null result is stored as a ("primary", value) tuple.</param>
        /// <param name="organismRegex">Field regex for the organism.</param>
        /// <param name="errors">Receives a message for every zero-length protein that was skipped, and one if no proteins were read at all.</param>
        /// <param name="maxThreads">Passed to <c>DecoyProteinGenerator.GenerateDecoys</c>.</param>
        /// <returns>A new list holding the targets (if <paramref name="generateTargets"/>) followed by the decoys (if <paramref name="decoyType"/> is not <c>DecoyType.None</c>).</returns>
        public static List<Protein> LoadProteinFasta(string proteinDbLocation, bool generateTargets, DecoyType decoyType, bool isContaminant,
                                                     FastaHeaderFieldRegex accessionRegex, FastaHeaderFieldRegex fullNameRegex, FastaHeaderFieldRegex nameRegex,
                                                     FastaHeaderFieldRegex geneNameRegex, FastaHeaderFieldRegex organismRegex, out List<string> errors, int maxThreads = -1)
        {
            HashSet<string> uniqueAccessions = new HashSet<string>();
            int uniqueIdentifier = 1;
            string accession = null;
            string name = null;
            string fullName = null;
            string organism = null;
            List<Tuple<string, string>> geneName = new List<Tuple<string, string>>();

            errors = new List<string>();
            Regex substituteWhitespace = new Regex(@"\s+");

            List<Protein> targets = new List<Protein>();

            using (var stream = new FileStream(proteinDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                // allow for .bgz and .tgz, which are (rarely) used
                Stream fastaFileStream = proteinDbLocation.EndsWith("gz", StringComparison.Ordinal)
                    ? (Stream)new GZipStream(stream, CompressionMode.Decompress)
                    : stream;

                // Disposing the reader also disposes the (possibly gzip) stream it wraps.
                using (var fasta = new StreamReader(fastaFileStream))
                {
                    StringBuilder sb = null;     // sequence of the record being read; null until the first header
                    string previousLine = null;  // last line that belonged to the pending record (used in error text)

                    while (true)
                    {
                        // Record boundaries are detected from the line that was actually read rather than
                        // from StreamReader.Peek(): Peek() is documented to return -1 not only at end of
                        // input but also when the reader cannot seek, which could cut a gzip database short.
                        string line = fasta.ReadLine();
                        bool endOfInput = line == null;
                        bool isHeader = !endOfInput && line.StartsWith(">", StringComparison.Ordinal);

                        // A new header or the end of the input completes the pending record, if any.
                        if ((endOfInput || isHeader) && accession != null && sb != null)
                        {
                            string sequence = substituteWhitespace.Replace(sb.ToString(), "");

                            // Make the accession unique. The counter is shared by the whole file and never
                            // reset, and on repeated collisions the suffix is appended to the already
                            // suffixed candidate.
                            while (uniqueAccessions.Contains(accession))
                            {
                                accession += "_" + uniqueIdentifier.ToString();
                                uniqueIdentifier++;
                            }
                            uniqueAccessions.Add(accession);

                            Protein protein = new Protein(sequence, accession, organism, geneName, name: name, fullName: fullName,
                                                          isContaminant: isContaminant, databaseFilePath: proteinDbLocation);
                            if (protein.Length == 0)
                            {
                                errors.Add("Line" + previousLine + ", Protein Length of 0: " + protein.Name + " was skipped from database: " + proteinDbLocation);
                            }
                            else
                            {
                                targets.Add(protein);
                            }

                            // Reset the per-record state so nothing leaks into the next record.
                            accession = null;
                            name = null;
                            fullName = null;
                            organism = null;
                            geneName = new List<Tuple<string, string>>();
                        }

                        // no input left
                        if (endOfInput)
                        {
                            break;
                        }

                        if (isHeader)
                        {
                            accession = ApplyRegex(accessionRegex, line);
                            fullName = ApplyRegex(fullNameRegex, line);
                            name = ApplyRegex(nameRegex, line);
                            organism = ApplyRegex(organismRegex, line);
                            string geneNameString = ApplyRegex(geneNameRegex, line);
                            if (geneNameString != null)
                            {
                                geneName.Add(new Tuple<string, string>("primary", geneNameString));
                            }

                            // Fall back to the whole header text when no accession could be extracted.
                            if (string.IsNullOrEmpty(accession))
                            {
                                accession = line.Substring(1).TrimEnd();
                            }

                            sb = new StringBuilder();
                        }
                        else if (sb != null)
                        {
                            // Sequence line; lines before the first header are ignored because sb is still null.
                            sb.Append(line.Trim());
                        }

                        previousLine = line;
                    }
                }
            }

            if (!targets.Any())
            {
                errors.Add("Error: No proteins could be read from the database: " + proteinDbLocation);
            }
            List<Protein> decoys = DecoyProteinGenerator.GenerateDecoys(targets, decoyType, maxThreads);

            return (generateTargets ? targets : new List<Protein>()).Concat(decoyType != DecoyType.None ? decoys : new List<Protein>()).ToList();
        }
Example #3
0
        /// <summary>
        /// Load a protein fasta database, using regular expressions to get various aspects of the headers.
        /// Each header field is extracted with <c>ApplyRegex</c> from the header line using the corresponding field regex.
        /// </summary>
        /// <param name="proteinDbLocation">Path of the fasta file. A path ending in "gz" is read through a <c>GZipStream</c>.</param>
        /// <param name="generateTargets">When true, the proteins read from the file precede the decoys in the returned list; when false only the decoys are returned.</param>
        /// <param name="decoyType">Passed to <c>DecoyProteinGenerator.GenerateDecoys</c>.</param>
        /// <param name="isContaminant">Passed through to every created <c>Protein</c>.</param>
        /// <param name="accessionRegex">Field regex for the accession. If it yields null or an empty string, the header text after '&gt;' (trailing whitespace trimmed) is used instead.</param>
        /// <param name="fullNameRegex">Field regex for the full name.</param>
        /// <param name="nameRegex">Field regex for the name.</param>
        /// <param name="geneNameRegex">Field regex for the gene name; a non-null result is stored as a ("primary", value) tuple.</param>
        /// <param name="organismRegex">Field regex for the organism.</param>
        /// <param name="errors">Receives a message for every zero-length protein that was skipped, and one if no proteins were read at all.</param>
        /// <param name="maxThreads">Passed to <c>DecoyProteinGenerator.GenerateDecoys</c>.</param>
        /// <returns>
        /// When <paramref name="generateTargets"/> is true, a new list of the targets followed by the decoys;
        /// otherwise the list returned by <c>DecoyProteinGenerator.GenerateDecoys</c>.
        /// </returns>
        public static List<Protein> LoadProteinFasta(string proteinDbLocation, bool generateTargets, DecoyType decoyType, bool isContaminant,
                                                     FastaHeaderFieldRegex accessionRegex, FastaHeaderFieldRegex fullNameRegex, FastaHeaderFieldRegex nameRegex,
                                                     FastaHeaderFieldRegex geneNameRegex, FastaHeaderFieldRegex organismRegex, out List<string> errors, int maxThreads = -1)
        {
            HashSet<string> uniqueAccessions = new HashSet<string>();
            int uniqueIdentifier = 2; //for isoforms. the first will be "accession", the next will be "accession_2"
            string accession = null;
            string name = null;
            string fullName = null;
            string organism = null;
            List<Tuple<string, string>> geneName = new List<Tuple<string, string>>();

            errors = new List<string>();
            Regex substituteWhitespace = new Regex(@"\s+");

            List<Protein> targets = new List<Protein>();

            using (var stream = new FileStream(proteinDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                // allow for .bgz and .tgz, which are (rarely) used
                Stream fastaFileStream = proteinDbLocation.EndsWith("gz", StringComparison.Ordinal)
                    ? (Stream)new GZipStream(stream, CompressionMode.Decompress)
                    : stream;

                // Disposing the reader also disposes the (possibly gzip) stream it wraps.
                using (var fasta = new StreamReader(fastaFileStream))
                {
                    StringBuilder sb = null;     // sequence of the record being read; null until the first header
                    string previousLine = null;  // last line that belonged to the pending record (used in error text)

                    while (true)
                    {
                        // Record boundaries are detected from the line that was actually read rather than
                        // from StreamReader.Peek(): Peek() is documented to return -1 not only at end of
                        // input but also when the reader cannot seek, which could cut a gzip database short.
                        string line = fasta.ReadLine();
                        bool endOfInput = line == null;
                        bool isHeader = !endOfInput && line.StartsWith(">", StringComparison.Ordinal);

                        // A new header or the end of the input completes the pending record, if any.
                        if ((endOfInput || isHeader) && accession != null && sb != null)
                        {
                            string sequence = substituteWhitespace.Replace(sb.ToString(), "");

                            // sanitize the sequence to replace unexpected characters with X (unknown amino acid)
                            // sometimes strange characters get added by RNA sequencing software, etc.
                            sequence = SanitizeAminoAcidSequence(sequence, 'X');

                            if (uniqueAccessions.Contains(accession))                                  //this will happen for isoforms
                            {
                                string originalAccession = accession;                                  //save the original
                                accession += "_" + uniqueIdentifier.ToString();                        //add a number onto it
                                while (uniqueAccessions.Contains(accession))                           //if that number was already added
                                {
                                    uniqueIdentifier++;                                                //keep increasing it
                                    accession = originalAccession + "_" + uniqueIdentifier.ToString(); //try the new number
                                }
                                uniqueIdentifier = 2;                                                  //reset
                            }
                            uniqueAccessions.Add(accession);

                            Protein protein = new Protein(sequence, accession, organism, geneName, name: name, fullName: fullName,
                                                          isContaminant: isContaminant, databaseFilePath: proteinDbLocation);
                            if (protein.Length == 0)
                            {
                                errors.Add("Line" + previousLine + ", Protein Length of 0: " + protein.Name + " was skipped from database: " + proteinDbLocation);
                            }
                            else
                            {
                                targets.Add(protein);
                            }

                            // Reset the per-record state so nothing leaks into the next record.
                            accession = null;
                            name = null;
                            fullName = null;
                            organism = null;
                            geneName = new List<Tuple<string, string>>();
                        }

                        // no input left
                        if (endOfInput)
                        {
                            break;
                        }

                        if (isHeader)
                        {
                            accession = ApplyRegex(accessionRegex, line);
                            fullName = ApplyRegex(fullNameRegex, line);
                            name = ApplyRegex(nameRegex, line);
                            organism = ApplyRegex(organismRegex, line);
                            string geneNameString = ApplyRegex(geneNameRegex, line);
                            if (geneNameString != null)
                            {
                                geneName.Add(new Tuple<string, string>("primary", geneNameString));
                            }

                            // Fall back to the whole header text when no accession could be extracted.
                            if (string.IsNullOrEmpty(accession))
                            {
                                accession = line.Substring(1).TrimEnd();
                            }

                            sb = new StringBuilder();
                        }
                        else if (sb != null)
                        {
                            // Sequence line; lines before the first header are ignored because sb is still null.
                            sb.Append(line.Trim());
                        }

                        previousLine = line;
                    }
                }
            }

            if (!targets.Any())
            {
                errors.Add("Error: No proteins could be read from the database: " + proteinDbLocation);
            }
            List<Protein> decoys = DecoyProteinGenerator.GenerateDecoys(targets, decoyType, maxThreads);

            return generateTargets ? targets.Concat(decoys).ToList() : decoys;
        }