/// <summary>
/// Rebuilds the sequence and annotation meta files from the FASTA database file.
/// </summary>
/// <remarks>
/// Existing meta files are deleted first. For every protein entry, the delimiter-prefixed
/// sequence is appended to the sequence file and one line holding offset, residue count,
/// name and description (separated by <c>AnnotationDelimiter</c>) is appended to the
/// annotation file. Both files end with the file format id and the hash code of the
/// database file's last write time (UTC).
/// If the database file cannot be opened, the method returns right after the (empty)
/// meta files have been created.
/// </remarks>
private void GenerateMetaFiles()
{
    if (File.Exists(_seqFilePath))
    {
        File.Delete(_seqFilePath);
    }

    if (File.Exists(_annoFilePath))
    {
        File.Delete(_annoFilePath);
    }

    using (var seqWriter = new BinaryWriter(File.Open(_seqFilePath, FileMode.CreateNew)))
    using (var annoWriter = new StreamWriter(_annoFilePath))
    {
        var reader = new FastaFileReader();
        if (!reader.OpenFile(_databaseFilePath))
        {
            return;
        }

        try
        {
            // Running position of the current entry inside the sequence file
            long offset = 0;

            while (reader.ReadNextProteinEntry())
            {
                var name = reader.ProteinName;
                var description = reader.ProteinDescription;

                // Each stored sequence is preceded by the delimiter; the annotated
                // length counts the residues only, not the delimiter
                var sequence = (char)Delimiter + reader.ProteinSequence;
                var length = reader.ProteinSequence.Length;

                seqWriter.Write(Encoding.GetBytes(sequence));

                // The name must not contain the annotation delimiter, otherwise the line could not be split again
                annoWriter.WriteLine("{0}{1}{2}{3}{4}{5}{6}",
                    offset, AnnotationDelimiter,
                    length, AnnotationDelimiter,
                    name.Replace(AnnotationDelimiter, '-'), AnnotationDelimiter,
                    description);

                offset += sequence.Length;
            }

            // Closing delimiter after the last sequence
            seqWriter.Write((char)Delimiter);

            // Write file format Id
            seqWriter.Write(FileFormatId);

            // Write the hash of the database file's last write time
            var hashCode = File.GetLastWriteTimeUtc(_databaseFilePath).GetHashCode();
            seqWriter.Write(hashCode);
            annoWriter.Write("{0}{1}{2}", FileFormatId, AnnotationDelimiter, hashCode);
        }
        finally
        {
            // Release the FASTA reader even when reading or writing throws
            reader.CloseFile();
        }
    }
}
/// <summary>
/// Create the decoy version of this database
/// </summary>
/// <remarks>
/// Every protein of the database file is written to the decoy file with a header of the form
/// <c>&gt;{DecoyProteinPrefix}_{name} {description}</c>, followed by its decoy sequence.
/// Nothing is written when the database file cannot be opened.
/// </remarks>
/// <param name="enzyme">
/// Used to determine the decoy file path. When reversing, a cleavable residue (per
/// <c>enzyme.IsCleavable</c>) is placed in front of the residue written just before it;
/// no such swapping happens when this is null or has no residues.
/// </param>
/// <param name="shuffle">
/// False to write the reversed sequence; true to write the sequence after passing it through
/// <c>SimpleStringProcessing.Shuffle</c> and <c>SimpleStringProcessing.Mutate</c>.
/// </param>
public void CreateDecoyDatabase(Enzyme enzyme, bool shuffle)
{
    var reader = new FastaFileReader();
    if (!reader.OpenFile(_databaseFilePath))
    {
        return;
    }

    try
    {
        var decoyDatabaseFileName = GetDecoyDatabasePath(enzyme, shuffle);

        Console.WriteLine("Creating " + decoyDatabaseFileName);
        using (var decoyWriter = new StreamWriter(decoyDatabaseFileName))
        {
            while (reader.ReadNextProteinEntry())
            {
                var name = reader.ProteinName;
                var description = reader.ProteinDescription;
                var sequence = reader.ProteinSequence;

                decoyWriter.WriteLine(">{0}_{1} {2}", FastaDatabaseConstants.DecoyProteinPrefix, name, description);

                if (shuffle)
                {
                    // Shuffled protein sequences
                    decoyWriter.WriteLine(SimpleStringProcessing.Mutate(SimpleStringProcessing.Shuffle(sequence), NumMutations));
                    continue;
                }

                // Reversed protein sequence
                var decoySequence = new StringBuilder();
                for (var i = sequence.Length - 1; i >= 0; i--)
                {
                    var residue = sequence[i];
                    var swapWithPrevious = enzyme != null
                                           && enzyme.Residues.Length > 0
                                           && enzyme.IsCleavable(residue)
                                           && decoySequence.Length > 0;

                    if (swapWithPrevious)
                    {
                        // Put the cleavable residue in front of the residue written last
                        var residueToBeReplaced = decoySequence[decoySequence.Length - 1];
                        decoySequence.Remove(decoySequence.Length - 1, 1);
                        decoySequence.Append((char)residue);
                        decoySequence.Append(residueToBeReplaced);
                    }
                    else
                    {
                        decoySequence.Append((char)residue);
                    }
                }

                decoyWriter.WriteLine(decoySequence);
            }
        }
    }
    finally
    {
        // Release the FASTA reader even when creating or writing the decoy file throws
        reader.CloseFile();
    }
}
/// <summary>
/// Rebuilds the sequence and annotation meta files from the FASTA database file,
/// skipping duplicate proteins and renaming proteins that share a name but differ in sequence.
/// </summary>
/// <remarks>
/// Existing meta files are deleted first. For every retained protein entry, the
/// delimiter-prefixed sequence is appended to the sequence file and one line holding offset,
/// residue count, name and description (separated by the annotation delimiter) is appended
/// to the annotation file. Both files end with the file format id and the hash code of the
/// database file's last write time (UTC).
/// If the database file cannot be opened, the method returns right after the (empty)
/// meta files have been created.
/// </remarks>
private void GenerateMetaFiles()
{
    if (File.Exists(_seqFilePath))
    {
        File.Delete(_seqFilePath);
    }

    if (File.Exists(_annoFilePath))
    {
        File.Delete(_annoFilePath);
    }

    // Keys are protein name
    // Values track the number of times the name has been encountered, the number of residues, and a SHA1 hash of the residues
    var proteinNamesAndStats = new Dictionary<string, ProteinHashInfo>(StringComparer.InvariantCultureIgnoreCase);

    using (var seqWriter = new BinaryWriter(File.Open(_seqFilePath, FileMode.CreateNew)))
    using (var annoWriter = new StreamWriter(_annoFilePath))
    {
        var reader = new FastaFileReader();
        if (!reader.OpenFile(_databaseFilePath))
        {
            return;
        }

        try
        {
            // Running position of the current entry inside the sequence file
            long offset = 0;

            while (reader.ReadNextProteinEntry())
            {
                var name = reader.ProteinName;
                var description = reader.ProteinDescription;
                var sequence = (char)FastaDatabaseConstants.Delimiter + reader.ProteinSequence;

                // The annotated length counts the residues only; the delimiter that is
                // prepended to the stored sequence must not be included
                var length = reader.ProteinSequence.Length;

                var proteinInfoCurrent = new ProteinHashInfo(sequence);

                if (proteinNamesAndStats.TryGetValue(name, out var proteinInfoCached))
                {
                    // Duplicate name; either skip this protein or rename it
                    if (proteinInfoCached.SequenceLength == proteinInfoCurrent.SequenceLength &&
                        string.Equals(proteinInfoCached.SequenceHash, proteinInfoCurrent.SequenceHash))
                    {
                        // Duplicate protein; skip it
                        continue;
                    }

                    // Same name but different sequence; make the name unique
                    name += "_Duplicate" + proteinInfoCached.ObservationCount.ToString("00");
                    proteinInfoCached.ObservationCount++;
                }

                proteinNamesAndStats.Add(name, proteinInfoCurrent);

                seqWriter.Write(Encoding.GetBytes(sequence));

                // The name must not contain the annotation delimiter, otherwise the line could not be split again
                annoWriter.WriteLine("{0}{1}{2}{3}{4}{5}{6}",
                    offset, FastaDatabaseConstants.AnnotationDelimiter,
                    length, FastaDatabaseConstants.AnnotationDelimiter,
                    name.Replace(FastaDatabaseConstants.AnnotationDelimiter, '-'), FastaDatabaseConstants.AnnotationDelimiter,
                    description);

                offset += sequence.Length;
            }

            // Closing delimiter after the last sequence
            seqWriter.Write((char)FastaDatabaseConstants.Delimiter);

            // Write file format Id
            seqWriter.Write(FileFormatId);

            // Write the hash of the database file's last write time
            var hashCode = File.GetLastWriteTimeUtc(_databaseFilePath).GetHashCode();
            seqWriter.Write(hashCode);
            annoWriter.Write("{0}{1}{2}", FileFormatId, FastaDatabaseConstants.AnnotationDelimiter, hashCode);
        }
        finally
        {
            // Release the FASTA reader even when reading or writing throws
            reader.CloseFile();
        }
    }
}
/// <summary>
/// Command line entry point: reads the sequences of the input file, searches every
/// given pattern in them and exports the result.
/// </summary>
/// <remarks>
/// Recognized options: <c>-f|--file</c> (required input file), <c>-t|--fileType</c>
/// (default "FASTA", the only supported value), <c>-e|--exporter</c> ("EXCEL" by default,
/// or "CONSOLE"), <c>-d|--maximalDistance</c> (default 10) and <c>-p|--pattern</c>
/// (may be repeated; at least one is required).
/// </remarks>
/// <param name="args">Raw command line arguments.</param>
/// <exception cref="ArgumentNullException">No pattern or no file name was supplied.</exception>
/// <exception cref="FileNotFoundException">The input file does not exist.</exception>
/// <exception cref="ArgumentOutOfRangeException">The file type or the exporter is unknown.</exception>
static void Main(string[] args)
{
    string fileName = null;
    string fileType = "FASTA";
    int maximalDistance = 10;
    string exporter = "EXCEL";
    IFileReader fileReader;
    IExport export;
    var fundamentPatterns = new List<string>();

    var p = new OptionSet()
    {
        { "f|file=", a => fileName = a },
        { "t|fileType=", a => fileType = a },
        { "e|exporter=", a => exporter = a },
        { "d|maximalDistance=", (int a) => maximalDistance = a },
        {
            "p|pattern=", a =>
            {
                // Ignore empty pattern values
                if (!string.IsNullOrEmpty(a))
                {
                    fundamentPatterns.Add(a);
                }
            }
        },
    };
    p.Parse(args);

    if (fundamentPatterns.Count == 0)
    {
        throw new ArgumentNullException(nameof(fundamentPatterns), "No paterns defined");
    }

    if (string.IsNullOrEmpty(fileName))
    {
        throw new ArgumentNullException(nameof(fileName));
    }

    if (!File.Exists(fileName))
    {
        throw new FileNotFoundException("File not found", fileName);
    }

    if (string.Equals(fileType, "FASTA", StringComparison.OrdinalIgnoreCase))
    {
        fileReader = new FastaFileReader();
    }
    else
    {
        throw new ArgumentOutOfRangeException(nameof(fileType), "Unknown file type");
    }

    var fileContent = fileReader.Read(fileName)
        .Select(a => new ProteinSequenceWithFundaments(a))
        .ToList();

    // Patterns are numbered starting at 1, in the order they were given on the command line
    var matcher = new FundamentsMatcher();
    for (var i = 0; i < fundamentPatterns.Count; i++)
    {
        matcher.FindInSequences(fileContent, i + 1, fundamentPatterns[i], maximalDistance);
    }

    if (string.Equals(exporter, "CONSOLE", StringComparison.OrdinalIgnoreCase))
    {
        export = new ConsoleExport(fundamentPatterns.Count);
    }
    else if (string.Equals(exporter, "EXCEL", StringComparison.OrdinalIgnoreCase))
    {
        export = new ExcelExport(fundamentPatterns.Count);
    }
    else
    {
        // Report the offending argument as the exporter, not the file type
        throw new ArgumentOutOfRangeException(nameof(exporter), "Unknown exporter type");
    }

    export.Export(fileContent);
}
/// <summary>
/// Create the decoy version of this database
/// </summary>
/// <remarks>
/// Every protein of the database file is written to the decoy file with a header of the form
/// <c>&gt;{DecoyProteinPrefix}_{name} {description}</c>, followed by its decoy sequence.
/// Nothing is written when the database file cannot be opened.
/// </remarks>
/// <param name="enzyme">
/// Used to determine the decoy file path. When reversing, a cleavable residue (per
/// <c>enzyme.IsCleavable</c>) is placed in front of the residue written just before it;
/// no such swapping happens when this is null or has no residues.
/// </param>
/// <param name="shuffle">
/// False to write the reversed sequence; true to write the sequence after passing it through
/// <c>SimpleStringProcessing.Shuffle</c> and <c>SimpleStringProcessing.Mutate</c>.
/// </param>
public void CreateDecoyDatabase(Enzyme enzyme, bool shuffle)
{
    var reader = new FastaFileReader();
    if (!reader.OpenFile(_databaseFilePath))
    {
        return;
    }

    try
    {
        var decoyDatabaseFileName = GetDecoyDatabasePath(enzyme, shuffle);

        Console.WriteLine("Creating " + decoyDatabaseFileName);
        using (var decoyWriter = new StreamWriter(decoyDatabaseFileName))
        {
            while (reader.ReadNextProteinEntry())
            {
                var name = reader.ProteinName;
                var description = reader.ProteinDescription;
                var sequence = reader.ProteinSequence;

                decoyWriter.WriteLine(">{0}_{1} {2}", DecoyProteinPrefix, name, description);

                if (!shuffle)
                {
                    // Reversed protein sequence
                    var decoySequence = new StringBuilder();
                    for (var i = sequence.Length - 1; i >= 0; i--)
                    {
                        var residue = sequence[i];
                        if (enzyme != null && enzyme.Residues.Length > 0 && enzyme.IsCleavable(residue) && decoySequence.Length > 0)
                        {
                            // Put the cleavable residue in front of the residue written last
                            // (same result as removing the last residue, appending the
                            // cleavable one and re-appending the removed one)
                            decoySequence.Insert(decoySequence.Length - 1, residue);
                        }
                        else
                        {
                            decoySequence.Append(residue);
                        }
                    }

                    decoyWriter.WriteLine(decoySequence);
                }
                else
                {
                    // Shuffled protein sequences
                    decoyWriter.WriteLine(SimpleStringProcessing.Mutate(SimpleStringProcessing.Shuffle(sequence), NumMutations));
                }
            }
        }
    }
    finally
    {
        // Release the FASTA reader even when creating or writing the decoy file throws
        reader.CloseFile();
    }
}
/// <summary>
/// Writes fresh sequence and annotation meta files for the FASTA database file.
/// </summary>
/// <remarks>
/// Previously generated meta files are removed before writing. Each protein contributes its
/// delimiter-prefixed sequence to the sequence file and one annotation line made of offset,
/// residue count, name and description, joined by <c>AnnotationDelimiter</c>. A trailer with
/// the file format id and the hash code of the database file's last write time (UTC) is
/// appended to both files.
/// When the database file cannot be opened, the method returns and leaves the newly created
/// meta files empty.
/// </remarks>
private void GenerateMetaFiles()
{
    if (File.Exists(_seqFilePath))
    {
        File.Delete(_seqFilePath);
    }

    if (File.Exists(_annoFilePath))
    {
        File.Delete(_annoFilePath);
    }

    using (var seqWriter = new BinaryWriter(File.Open(_seqFilePath, FileMode.CreateNew)))
    using (var annoWriter = new StreamWriter(_annoFilePath))
    {
        var reader = new FastaFileReader();
        if (!reader.OpenFile(_databaseFilePath))
        {
            return;
        }

        try
        {
            // Position of the next entry inside the sequence file
            long offset = 0;

            while (reader.ReadNextProteinEntry())
            {
                var name = reader.ProteinName;
                var description = reader.ProteinDescription;

                // The stored sequence starts with the delimiter, while the annotated
                // length is the residue count without it
                var sequence = (char)Delimiter + reader.ProteinSequence;
                var length = reader.ProteinSequence.Length;

                seqWriter.Write(Encoding.GetBytes(sequence));

                // Replace the delimiter inside the name so the annotation line stays splittable
                annoWriter.WriteLine("{0}{1}{2}{3}{4}{5}{6}",
                    offset, AnnotationDelimiter,
                    length, AnnotationDelimiter,
                    name.Replace(AnnotationDelimiter, '-'), AnnotationDelimiter,
                    description);

                offset += sequence.Length;
            }

            // Terminate the last sequence
            seqWriter.Write((char)Delimiter);

            // Write file format Id
            seqWriter.Write(FileFormatId);

            // Write the hash of the database file's last write time
            var hashCode = File.GetLastWriteTimeUtc(_databaseFilePath).GetHashCode();
            seqWriter.Write(hashCode);
            annoWriter.Write("{0}{1}{2}", FileFormatId, AnnotationDelimiter, hashCode);
        }
        finally
        {
            // Always close the FASTA reader, including when an exception interrupts the export
            reader.CloseFile();
        }
    }
}