/// <summary>
/// Rebuild the binary sequence file and the text annotation file from the FASTA database file
/// </summary>
/// <remarks>
/// Existing sequence/annotation files are deleted first.
/// Each protein is written to the sequence file as the delimiter followed by its residues, and gets one
/// annotation line holding offset, length, name and description separated by the annotation delimiter.
/// The sequence file ends with a final delimiter, the file format ID and a hash of the FASTA file's last write time;
/// the annotation file ends with the file format ID and the same hash.
/// If the FASTA file cannot be opened the method returns, leaving the newly created (empty) output files behind.
/// </remarks>
private void GenerateMetaFiles()
{
    if (File.Exists(_seqFilePath))
    {
        File.Delete(_seqFilePath);
    }

    if (File.Exists(_annoFilePath))
    {
        File.Delete(_annoFilePath);
    }

    // The annotation file is parsed by code, not read by people: format numbers independently of
    // the current culture (the hash code can be negative and the sign character is culture-specific)
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    using (var seqWriter = new BinaryWriter(File.Open(_seqFilePath, FileMode.CreateNew)))
    using (var annoWriter = new StreamWriter(_annoFilePath))
    {
        // Read
        var reader = new FastaFileReader();
        if (!reader.OpenFile(_databaseFilePath))
        {
            return;
        }

        try
        {
            long offset = 0;
            while (reader.ReadNextProteinEntry())
            {
                var name = reader.ProteinName;
                var description = reader.ProteinDescription;

                // Every protein is preceded by the delimiter in the sequence file
                var sequence = (char)Delimiter + reader.ProteinSequence;

                // The annotated length counts residues only (no delimiter)
                var length = reader.ProteinSequence.Length;

                seqWriter.Write(Encoding.GetBytes(sequence));

                // The delimiter is replaced in the name so that the name stays a single field
                annoWriter.WriteLine(string.Format(
                    invariant,
                    "{0}{1}{2}{3}{4}{5}{6}",
                    offset,
                    AnnotationDelimiter,
                    length,
                    AnnotationDelimiter,
                    name.Replace(AnnotationDelimiter, '-'),
                    AnnotationDelimiter,
                    description));

                offset += sequence.Length;
            }

            seqWriter.Write((char)Delimiter);

            // write file format Id
            seqWriter.Write(FileFormatId);

            // write last write time hash
            var hashCode = File.GetLastWriteTimeUtc(_databaseFilePath).GetHashCode();
            seqWriter.Write(hashCode);

            annoWriter.Write(string.Format(invariant, "{0}{1}{2}", FileFormatId, AnnotationDelimiter, hashCode));
        }
        finally
        {
            // Release the FASTA file even when reading or writing fails part-way
            reader.CloseFile();
        }
    }
}
/// <summary>
/// Create the decoy version of this database
/// </summary>
/// <remarks>
/// One decoy entry is written per protein, named with the decoy prefix, an underscore and the original name.
/// Nothing is written when the database file cannot be opened.
/// </remarks>
/// <param name="enzyme">
/// Used only when reversing: each residue the enzyme reports as cleavable is placed ahead of the residue
/// appended just before it. May be null (or have no residues), in which case the sequence is simply reversed.
/// </param>
/// <param name="shuffle">True to write shuffled and mutated sequences; false to write reversed sequences</param>
public void CreateDecoyDatabase(Enzyme enzyme, bool shuffle)
{
    var reader = new FastaFileReader();
    if (!reader.OpenFile(_databaseFilePath))
    {
        return;
    }

    try
    {
        var decoyDatabaseFileName = GetDecoyDatabasePath(enzyme, shuffle);

        Console.WriteLine("Creating " + decoyDatabaseFileName);

        // Does not change while reading, so evaluate it once instead of once per residue
        var swapAtCleavableResidues = enzyme != null && enzyme.Residues.Length > 0;

        using (var decoyWriter = new StreamWriter(decoyDatabaseFileName))
        {
            while (reader.ReadNextProteinEntry())
            {
                var name = reader.ProteinName;
                var description = reader.ProteinDescription;
                var sequence = reader.ProteinSequence;

                decoyWriter.WriteLine(">{0}_{1} {2}", FastaDatabaseConstants.DecoyProteinPrefix, name, description);

                if (shuffle)
                {
                    // Shuffled protein sequences
                    decoyWriter.WriteLine(SimpleStringProcessing.Mutate(SimpleStringProcessing.Shuffle(sequence), NumMutations));
                    continue;
                }

                // Reversed protein sequence
                var decoySequence = new StringBuilder();
                for (var i = sequence.Length - 1; i >= 0; i--)
                {
                    var residue = sequence[i];
                    if (swapAtCleavableResidues && enzyme.IsCleavable(residue) && decoySequence.Length > 0)
                    {
                        // Swap the cleavable residue with the residue appended just before it
                        var residueToBeReplaced = decoySequence[decoySequence.Length - 1];
                        decoySequence.Remove(decoySequence.Length - 1, 1);
                        decoySequence.Append((char)residue);
                        decoySequence.Append(residueToBeReplaced);
                    }
                    else
                    {
                        decoySequence.Append((char)residue);
                    }
                }

                decoyWriter.WriteLine(decoySequence);
            }
        }
    }
    finally
    {
        // Release the FASTA file even if the decoy file cannot be created or a write fails
        reader.CloseFile();
    }
}
/// <summary>
/// Rebuild the binary sequence file and the text annotation file from the FASTA database file,
/// skipping repeated proteins and renaming proteins that reuse a name with a different sequence
/// </summary>
/// <remarks>
/// Existing sequence/annotation files are deleted first.
/// Each protein that is kept is written to the sequence file as the delimiter followed by its residues, and gets
/// one annotation line holding offset, length, name and description separated by the annotation delimiter.
/// The sequence file ends with a final delimiter, the file format ID and a hash of the FASTA file's last write time;
/// the annotation file ends with the file format ID and the same hash.
/// If the FASTA file cannot be opened the method returns, leaving the newly created (empty) output files behind.
/// </remarks>
private void GenerateMetaFiles()
{
    if (File.Exists(_seqFilePath))
    {
        File.Delete(_seqFilePath);
    }

    if (File.Exists(_annoFilePath))
    {
        File.Delete(_annoFilePath);
    }

    // Keys are protein name
    // Values track the number of times the name has been encountered, the number of residues, and a SHA1 hash of the residues
    var proteinNamesAndStats = new Dictionary<string, ProteinHashInfo>(StringComparer.InvariantCultureIgnoreCase);

    // The annotation file is parsed by code, not read by people: format numbers independently of
    // the current culture (the hash code can be negative and the sign character is culture-specific)
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    using (var seqWriter = new BinaryWriter(File.Open(_seqFilePath, FileMode.CreateNew)))
    using (var annoWriter = new StreamWriter(_annoFilePath))
    {
        // Read
        var reader = new FastaFileReader();
        if (!reader.OpenFile(_databaseFilePath))
        {
            return;
        }

        try
        {
            long offset = 0;
            while (reader.ReadNextProteinEntry())
            {
                var name = reader.ProteinName;
                var description = reader.ProteinDescription;

                // Every protein is preceded by the delimiter in the sequence file
                var sequence = (char)FastaDatabaseConstants.Delimiter + reader.ProteinSequence;

                // The annotated length counts residues only; the leading delimiter is not part of the protein
                var length = reader.ProteinSequence.Length;

                var proteinInfoCurrent = new ProteinHashInfo(sequence);

                if (proteinNamesAndStats.TryGetValue(name, out var proteinInfoCached))
                {
                    // Duplicate name; either skip this protein or rename it
                    if (proteinInfoCached.SequenceLength == proteinInfoCurrent.SequenceLength &&
                        string.Equals(proteinInfoCached.SequenceHash, proteinInfoCurrent.SequenceHash))
                    {
                        // Duplicate protein; skip it
                        continue;
                    }

                    // Same name, different sequence: append a numbered suffix.
                    // Keep advancing the counter if the FASTA file already contains the generated name,
                    // otherwise Dictionary.Add below would throw
                    string renamed;
                    do
                    {
                        renamed = name + "_Duplicate" + proteinInfoCached.ObservationCount.ToString("00");
                        proteinInfoCached.ObservationCount++;
                    } while (proteinNamesAndStats.ContainsKey(renamed));

                    name = renamed;
                }

                proteinNamesAndStats.Add(name, proteinInfoCurrent);

                seqWriter.Write(Encoding.GetBytes(sequence));

                // The delimiter is replaced in the name so that the name stays a single field
                annoWriter.WriteLine(string.Format(
                    invariant,
                    "{0}{1}{2}{3}{4}{5}{6}",
                    offset,
                    FastaDatabaseConstants.AnnotationDelimiter,
                    length,
                    FastaDatabaseConstants.AnnotationDelimiter,
                    name.Replace(FastaDatabaseConstants.AnnotationDelimiter, '-'),
                    FastaDatabaseConstants.AnnotationDelimiter,
                    description));

                offset += sequence.Length;
            }

            seqWriter.Write((char)FastaDatabaseConstants.Delimiter);

            // write file format Id
            seqWriter.Write(FileFormatId);

            // write last write time hash
            var hashCode = File.GetLastWriteTimeUtc(_databaseFilePath).GetHashCode();
            seqWriter.Write(hashCode);

            annoWriter.Write(string.Format(
                invariant,
                "{0}{1}{2}",
                FileFormatId,
                FastaDatabaseConstants.AnnotationDelimiter,
                hashCode));
        }
        finally
        {
            // Release the FASTA file even when reading or writing fails part-way
            reader.CloseFile();
        }
    }
}
/// <summary>
/// Create the decoy version of this database
/// </summary>
/// <remarks>
/// One decoy entry is written per protein, named with the decoy prefix, an underscore and the original name.
/// Nothing is written when the database file cannot be opened.
/// </remarks>
/// <param name="enzyme">
/// Used only when reversing: each residue the enzyme reports as cleavable is placed ahead of the residue
/// appended just before it. May be null (or have no residues), in which case the sequence is simply reversed.
/// </param>
/// <param name="shuffle">True to write shuffled and mutated sequences; false to write reversed sequences</param>
public void CreateDecoyDatabase(Enzyme enzyme, bool shuffle)
{
    var reader = new FastaFileReader();
    if (!reader.OpenFile(_databaseFilePath))
    {
        return;
    }

    try
    {
        var decoyDatabaseFileName = GetDecoyDatabasePath(enzyme, shuffle);

        Console.WriteLine("Creating " + decoyDatabaseFileName);

        // Does not change while reading, so evaluate it once instead of once per residue
        var swapAtCleavableResidues = enzyme != null && enzyme.Residues.Length > 0;

        using (var decoyWriter = new StreamWriter(decoyDatabaseFileName))
        {
            while (reader.ReadNextProteinEntry())
            {
                var name = reader.ProteinName;
                var description = reader.ProteinDescription;
                var sequence = reader.ProteinSequence;

                decoyWriter.WriteLine(">{0}_{1} {2}", DecoyProteinPrefix, name, description);

                if (shuffle)
                {
                    // Shuffled protein sequences
                    decoyWriter.WriteLine(SimpleStringProcessing.Mutate(SimpleStringProcessing.Shuffle(sequence), NumMutations));
                    continue;
                }

                // Reversed protein sequence
                var decoySequence = new StringBuilder();
                for (var i = sequence.Length - 1; i >= 0; i--)
                {
                    var residue = sequence[i];
                    if (swapAtCleavableResidues && enzyme.IsCleavable(residue) && decoySequence.Length > 0)
                    {
                        // Swap the cleavable residue with the residue appended just before it
                        var residueToBeReplaced = decoySequence[decoySequence.Length - 1];
                        decoySequence.Remove(decoySequence.Length - 1, 1);
                        decoySequence.Append(residue);
                        decoySequence.Append(residueToBeReplaced);
                    }
                    else
                    {
                        decoySequence.Append(residue);
                    }
                }

                decoyWriter.WriteLine(decoySequence);
            }
        }
    }
    finally
    {
        // Release the FASTA file even if the decoy file cannot be created or a write fails
        reader.CloseFile();
    }
}
/// <summary>
/// Rebuild the binary sequence file and the text annotation file from the FASTA database file
/// </summary>
/// <remarks>
/// Existing sequence/annotation files are deleted first.
/// Each protein is written to the sequence file as the delimiter followed by its residues, and gets one
/// annotation line holding offset, length, name and description separated by the annotation delimiter.
/// The sequence file ends with a final delimiter, the file format ID and a hash of the FASTA file's last write time;
/// the annotation file ends with the file format ID and the same hash.
/// If the FASTA file cannot be opened the method returns, leaving the newly created (empty) output files behind.
/// </remarks>
private void GenerateMetaFiles()
{
    if (File.Exists(_seqFilePath))
    {
        File.Delete(_seqFilePath);
    }

    if (File.Exists(_annoFilePath))
    {
        File.Delete(_annoFilePath);
    }

    // The annotation file is parsed by code, not read by people: format numbers independently of
    // the current culture (the hash code can be negative and the sign character is culture-specific)
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    using (var seqWriter = new BinaryWriter(File.Open(_seqFilePath, FileMode.CreateNew)))
    using (var annoWriter = new StreamWriter(_annoFilePath))
    {
        // Read
        var reader = new FastaFileReader();
        if (!reader.OpenFile(_databaseFilePath))
        {
            return;
        }

        try
        {
            long offset = 0;
            while (reader.ReadNextProteinEntry())
            {
                var name = reader.ProteinName;
                var description = reader.ProteinDescription;

                // Every protein is preceded by the delimiter in the sequence file
                var sequence = (char)Delimiter + reader.ProteinSequence;

                // The annotated length counts residues only (no delimiter)
                var length = reader.ProteinSequence.Length;

                seqWriter.Write(Encoding.GetBytes(sequence));

                // The delimiter is replaced in the name so that the name stays a single field
                annoWriter.WriteLine(string.Format(
                    invariant,
                    "{0}{1}{2}{3}{4}{5}{6}",
                    offset,
                    AnnotationDelimiter,
                    length,
                    AnnotationDelimiter,
                    name.Replace(AnnotationDelimiter, '-'),
                    AnnotationDelimiter,
                    description));

                offset += sequence.Length;
            }

            seqWriter.Write((char)Delimiter);

            // write file format Id
            seqWriter.Write(FileFormatId);

            // write last write time hash
            var hashCode = File.GetLastWriteTimeUtc(_databaseFilePath).GetHashCode();
            seqWriter.Write(hashCode);

            annoWriter.Write(string.Format(invariant, "{0}{1}{2}", FileFormatId, AnnotationDelimiter, hashCode));
        }
        finally
        {
            // Release the FASTA file even when reading or writing fails part-way
            reader.CloseFile();
        }
    }
}