/// <summary>
/// Builds the output FASTA database from <c>Options.InputFiles</c> and, when requested,
/// generates a log file and a BLAST database for the result.
/// </summary>
/// <remarks>
/// The output name is the full path of <c>Options.OutputFastaFile</c> with its extension removed,
/// followed by a database-type suffix (unless <c>Options.DoNotAppendDatabaseType</c> is set) and the
/// original extension (".fasta" when none was supplied).
/// When <c>Options.DoNotMergeFiles</c> is set and there is more than one input, every input is written
/// to its own file named <c>{output}_{n}_{input name}{ext}</c>; otherwise all inputs go to one file.
/// A <see cref="DirectoryNotFoundException"/> is reported through <c>System.Diagnostics.Trace</c>
/// instead of being propagated, so the method keeps its non-throwing behaviour for that case.
/// </remarks>
/// <exception cref="ArgumentNullException">No input files were supplied.</exception>
/// <exception cref="ArgumentException">An output path resolves to the same file as an input.</exception>
public void CreateDatabase()
{
    try
    {
        // Validate Options are kosher
        if (Options.InputFiles == null || Options.InputFiles.Count == 0)
        {
            throw new ArgumentNullException("Input Files");
        }

        string ext = Path.GetExtension(Options.OutputFastaFile);
        if (string.IsNullOrEmpty(ext))
        {
            ext = ".fasta";
        }

        // One suffix shared by the FASTA and the log file names.
        string typeSuffix;
        switch (Options.OutputType)
        {
            case DatabaseType.Target:
                typeSuffix = "_TARGET";
                break;
            case DatabaseType.Decoy:
                typeSuffix = "_DECOY";
                break;
            case DatabaseType.Concatenated:
            default:
                typeSuffix = "_CONCAT";
                break;
        }

        // Strip the extension before re-appending it; otherwise "db.fasta" turns into
        // "db.fasta_TARGET.fasta" (or "db.fasta.fasta"), which also defeats the collision check below.
        string outputBase = Path.ChangeExtension(Path.GetFullPath(Options.OutputFastaFile), null);
        string output_filename = Options.DoNotAppendDatabaseType
            ? outputBase + ext
            : outputBase + typeSuffix + ext;

        string outputFolder = Path.GetDirectoryName(Options.OutputFastaFile);
        if (!Directory.Exists(outputFolder))
        {
            outputFolder = Directory.GetCurrentDirectory();
        }

        // output_filename is rooted, so Path.Combine returns it unchanged.
        string outputPath = Path.Combine(outputFolder, output_filename);

        // Decide where each input goes. Unmerged output needs one destination per input;
        // writing them all to outputPath would leave a single file holding only part of the data.
        bool writeSeparately = Options.DoNotMergeFiles && Options.InputFiles.Count > 1;
        string[] destinations;
        if (writeSeparately)
        {
            string destinationFolder = Path.GetDirectoryName(outputPath) ?? outputFolder;
            string destinationStem = Path.GetFileNameWithoutExtension(outputPath);
            destinations = new string[Options.InputFiles.Count];
            for (int i = 0; i < destinations.Length; i++)
            {
                // The running number keeps names unique even when two inputs share a file name.
                string inputStem = Path.GetFileNameWithoutExtension(Options.InputFiles[i]);
                destinations[i] = Path.Combine(destinationFolder, destinationStem + "_" + (i + 1) + "_" + inputStem + ext);
            }
        }
        else
        {
            destinations = new[] { outputPath };
        }

        // Never open an input for writing: compare resolved paths, not the raw option strings.
        foreach (string inputFile in Options.InputFiles)
        {
            string inputPath = Path.GetFullPath(inputFile);
            foreach (string destination in destinations)
            {
                if (string.Equals(inputPath, destination, StringComparison.OrdinalIgnoreCase))
                {
                    throw new ArgumentException("Output file path cannot be the same as an input file.");
                }
            }
        }

        if (Options.GenerateLogFile)
        {
            string logFilename = Path.GetFileNameWithoutExtension(Options.OutputFastaFile) + typeSuffix + ".log";
            GenerateLog(outputFolder, logFilename);
        }

        if (writeSeparately)
        {
            for (int i = 0; i < destinations.Length; i++)
            {
                using (FastaWriter writer = new FastaWriter(destinations[i]))
                {
                    WriteFasta(Options.InputFiles[i], writer);
                }
            }
        }
        else
        {
            using (FastaWriter writer = new FastaWriter(outputPath))
            {
                foreach (string fastaFile in Options.InputFiles)
                {
                    WriteFasta(fastaFile, writer);
                }
            }
        }

        if (Options.BlastDatabase)
        {
            foreach (string destination in destinations)
            {
                MakeBlastDatabase(outputFolder, destination, Path.GetFileNameWithoutExtension(destination));
            }
        }
    }
    catch (DirectoryNotFoundException ex)
    {
        // Callers have never seen this exception, so keep swallowing it, but leave a trace:
        // a silently missing database is much harder to diagnose than a logged failure.
        System.Diagnostics.Trace.TraceError("Unable to create the FASTA database: {0}", ex.Message);
    }
}
/// <summary>
/// Writes a text sequence-coverage map for every non-decoy protein group, ordered by
/// <c>LongestProteinLen</c>, plus three companion files in the same directory:
/// "data.csv" (one row per contiguous covered stretch), "identifiedSequences.fasta"
/// (covered residues, uncovered ones replaced by spaces) and "observedProteins.fasta"
/// (the full representative sequences).
/// </summary>
/// <param name="proteinGroups">Protein groups to report; groups flagged as decoy are skipped.</param>
/// <param name="outputDirectory">Directory that receives the four output files.</param>
private void WriteSequenceMaps(List<ProteinGroup> proteinGroups, string outputDirectory)
{
    string fileName = Path.Combine(outputDirectory, "Sequence Coverage Map.txt");
    Log("Writing file " + fileName);

    string csvFile = Path.Combine(outputDirectory, "data.csv");
    string identifiedFile = Path.Combine(outputDirectory, "identifiedSequences.fasta");
    string observedFile = Path.Combine(outputDirectory, "observedProteins.fasta");

    // NOTE(review): CSV fields are written unescaped and with the current culture; a description
    // containing a comma or a culture with a decimal comma would break the column layout.
    const string csvRowFormat = "{0},{1},{2},{3},{4},{5}";

    using (StreamWriter csvWriter = new StreamWriter(csvFile))
    using (FastaWriter identifiedWriter = new FastaWriter(identifiedFile))
    using (FastaWriter observedWriter = new FastaWriter(observedFile))
    using (StreamWriter writer = new StreamWriter(fileName))
    {
        csvWriter.WriteLine("protein,position,count,coverage,protein name,# peptides");

        int proteinID = 0;
        foreach (ProteinGroup proteinGroup in proteinGroups.Where(g => !g.IsDecoy).OrderBy(g => g.LongestProteinLen))
        {
            proteinID++;

            string sequence = proteinGroup.RepresentativeProtein.Sequence;
            string leusequence = proteinGroup.RepresentativeProtein.LeucineSequence;
            int length = sequence.Length;
            int[] bits = proteinGroup.RepresentativeProtein.GetSequenceCoverage(proteinGroup.Peptides);
            ISet<Peptide> peptides = proteinGroup.Peptides;

            // Write the header data
            writer.WriteLine("========");
            writer.WriteLine("Proteins = {0}", proteinGroup.Count);
            writer.WriteLine(proteinGroup.RepresentativeProtein.Description);
            foreach (Protein prot in proteinGroup)
            {
                if (prot != proteinGroup.RepresentativeProtein)
                {
                    writer.WriteLine(prot.Description);
                }
            }
            writer.WriteLine("Length = {0}", proteinGroup.RepresentativeProtein.Length);
            writer.WriteLine("Redundacy = {0:g3}%", proteinGroup.SequenceRedundacy);
            writer.WriteLine("Coverage = {0:g3}%, {1} AA", proteinGroup.SequenceCoverage, bits.Count(bit => bit > 0));

            int shared = peptides.Count(pep => pep.IsShared);
            if (shared > 0)
            {
                // Coverage recomputed from the peptides that are not shared.
                int[] bits2 = proteinGroup.RepresentativeProtein.GetSequenceCoverage(peptides.Where(pep => !pep.IsShared));
                int observedAminoAcids = bits2.Count(bit => bit > 0);
                double coverage = (double)observedAminoAcids / bits2.Length * 100.0;
                writer.WriteLine("Coverage = {0:g3}%, {1} AA (unshared only)", coverage, observedAminoAcids);
                writer.WriteLine("Peptides = {0}, {1} are shared (marked by *)", peptides.Count, shared);
            }
            else
            {
                writer.WriteLine("Peptides = {0}", peptides.Count);
            }
            writer.WriteLine("========");

            if (proteinGroup.Count > 1)
            {
                writer.WriteLine("Protein Sequences (Differences marked by *)");
                writer.Write(' ');
                for (int i = 0; i < sequence.Length; i++)
                {
                    // A column is "same" only if every protein in the group has this residue here.
                    bool same = true;
                    char c = sequence[i];
                    foreach (Protein prot in proteinGroup)
                    {
                        if (i >= prot.Length || prot.Sequence[i] != c)
                        {
                            same = false;
                            break;
                        }
                    }
                    writer.Write(same ? ' ' : '*');
                }
                writer.WriteLine();
                foreach (Protein prot in proteinGroup)
                {
                    writer.Write(' ');
                    writer.WriteLine(prot.Sequence);
                }
                writer.WriteLine("========");
            }

            // Write the amino acid numbers: each label starts in the column of the residue it names.
            writer.Write(" 1");
            int previousWidth = 2;
            for (int i = 10; i < length; i += 10)
            {
                string label = i.ToString();
                writer.Write(new string(' ', 10 - previousWidth));
                writer.Write(label);
                // Use the real label width so the ruler stays aligned for any number of digits.
                previousWidth = label.Length;
            }
            writer.WriteLine();

            // Write the complete sequence
            writer.WriteLine(" " + sequence);
            observedWriter.Write(sequence, proteinGroup.RepresentativeProtein.Description);

            // Write the combined mapped sequence and one CSV row per contiguous covered stretch
            StringBuilder compressedSequence = new StringBuilder();
            writer.Write(" ");
            int startIndex = -1;   // zero-based index where the current covered stretch began
            bool started = false;
            for (int i = 0; i < bits.Length; i++)
            {
                if (bits[i] > 0)
                {
                    writer.Write(sequence[i]);
                    compressedSequence.Append(sequence[i]);
                    if (!started)
                    {
                        startIndex = i;
                        started = true;
                    }
                }
                else
                {
                    if (started)
                    {
                        started = false;
                        csvWriter.WriteLine(csvRowFormat, proteinID, startIndex, i - startIndex, proteinGroup.SequenceCoverage, proteinGroup.Description, peptides.Count);
                    }
                    writer.Write(' ');
                    compressedSequence.Append(' ');
                }
            }
            if (started)
            {
                // The last stretch runs to the end of the sequence.
                csvWriter.WriteLine(csvRowFormat, proteinID, startIndex, bits.Length - startIndex, proteinGroup.SequenceCoverage, proteinGroup.Description, peptides.Count);
            }
            identifiedWriter.Write(compressedSequence.ToString(), proteinGroup.RepresentativeProtein.Description);
            writer.WriteLine();
            writer.WriteLine();

            // Write each peptide, one line per occurrence in the representative sequence
            foreach (Peptide peptide in peptides
                .OrderBy(pep => leusequence.IndexOf(pep.LeucineSequence, StringComparison.Ordinal))
                .ThenByDescending(pep => pep.Length))
            {
                string peptideSequence = peptide.LeucineSequence;
                if (string.IsNullOrEmpty(peptideSequence))
                {
                    // An empty pattern matches at every index and would run the search past the end.
                    continue;
                }

                string marker = peptide.IsShared ? "*" : " ";
                int searchStart = 0;
                while (true)
                {
                    int index = leusequence.IndexOf(peptideSequence, searchStart, StringComparison.Ordinal);
                    if (index < 0)
                    {
                        break;
                    }

                    // The marker column is emitted on every occurrence line (and only when there is
                    // an occurrence); otherwise repeated hits are shifted one column to the left and
                    // an unmatched peptide leaves a stray marker that shifts the next peptide's line.
                    writer.Write(marker);
                    writer.Write(new string(' ', index));
                    writer.WriteLine(peptideSequence);

                    // Advance by one so overlapping occurrences are found as well.
                    searchStart = index + 1;
                }
            }

            // Give some room between proteins
            writer.WriteLine();
        }
    }
}
/// <summary>
/// Reads every entry of <paramref name="fasta_file"/> and writes it to <paramref name="Writer"/>
/// as a target entry, a decoy entry, or both, depending on <c>Options.OutputType</c>.
/// </summary>
/// <remarks>
/// Only entries whose description matches the Uniprot header pattern are written.
/// When <c>Options.EnforceUniprot</c> is set, every non-matching description is also appended to
/// "InvalidUniprotheaders.txt" next to the first input file and reported through <c>InvalidHeader</c>.
/// </remarks>
/// <param name="fasta_file">Path of the FASTA file to read.</param>
/// <param name="Writer">Destination writer; it is not disposed by this method.</param>
public void WriteFasta(string fasta_file, FastaWriter Writer)
{
    // A concatenated database contains the target AND the decoy entries. Any value that is not
    // explicitly Target or Decoy is treated as concatenated, matching the "_CONCAT" naming default.
    bool writeTarget = Options.OutputType != DatabaseType.Decoy;
    bool writeDecoy = Options.OutputType != DatabaseType.Target;

    // Same value the original passed: (methionine && !residue) || residue == methionine || residue.
    bool excludeNTerminus = Options.ExcludeNTerminalMethionine || Options.ExcludeNTerminalResidue;

    // Built once per file instead of once per entry; only Match.Success is consulted.
    Regex uniprotRegex = new Regex(@"(.+)\|(.+)\|(.+?)\s(.+?)\sOS=(.+?)(?:\sGN=(.+?))?(?:$|PE=(\d+)\sSV=(\d+))", RegexOptions.ExplicitCapture);
    const string HeaderFile = "InvalidUniprotheaders.txt";

    using (FastaReader reader = new FastaReader(fasta_file))
    {
        foreach (Fasta fasta in reader.ReadNextFasta())
        {
            if (!uniprotRegex.IsMatch(fasta.Description))
            {
                if (Options.EnforceUniprot)
                {
                    // GetDirectoryName yields null for a root path; fall back to a relative file name.
                    string headerFolder = Path.GetDirectoryName(Options.InputFiles[0]) ?? string.Empty;
                    using (StreamWriter log = new StreamWriter(Path.Combine(headerFolder, HeaderFile), true))
                    {
                        log.WriteLine("Invalid Header:");
                        log.WriteLine();
                        log.WriteLine(fasta.Description);
                        log.WriteLine();
                        InvalidHeader(fasta);
                    }
                }

                // NOTE(review): non-Uniprot entries are dropped even when EnforceUniprot is off;
                // confirm that this is intended before relying on it.
                continue;
            }

            if (writeTarget)
            {
                Writer.Write(fasta);
            }

            if (writeDecoy)
            {
                Writer.Write(fasta.ToDecoy(Options.DecoyPrefix, Options.DecoyType, excludeNTerminus, Options.ExcludeNTerminalMethionine));
            }
        }
    }
}