예제 #1
0
        /// <summary>
        /// Rebuild the sequence file (<c>_seqFilePath</c>) and the annotation file (<c>_annoFilePath</c>)
        /// from the FASTA file at <c>_databaseFilePath</c>.
        /// </summary>
        /// <remarks>
        /// Any existing sequence/annotation files are deleted first. If the FASTA file cannot be opened,
        /// the method returns after the (empty) output files have already been created.
        /// Sequence file layout: each protein sequence prefixed by <c>Delimiter</c>, then one trailing
        /// <c>Delimiter</c>, then <c>FileFormatId</c>, then the hash code of the FASTA file's last write time (UTC).
        /// Annotation file layout: one line per protein with offset, sequence length, name and description
        /// separated by <c>AnnotationDelimiter</c>, followed by a final entry holding <c>FileFormatId</c> and the same hash code.
        /// </remarks>
        private void GenerateMetaFiles()
        {
            if (File.Exists(_seqFilePath))
            {
                File.Delete(_seqFilePath);
            }

            if (File.Exists(_annoFilePath))
            {
                File.Delete(_annoFilePath);
            }

            using (var seqWriter = new BinaryWriter(File.Open(_seqFilePath, FileMode.CreateNew)))
            using (var annoWriter = new StreamWriter(_annoFilePath))
            {
                var reader = new FastaFileReader();
                if (!reader.OpenFile(_databaseFilePath))
                {
                    return;
                }

                // Close the reader even when reading or writing throws, so the FASTA file handle is not leaked
                try
                {
                    // Position (in characters) of the current entry's leading delimiter within the sequence data
                    long offset = 0;
                    while (reader.ReadNextProteinEntry())
                    {
                        var name        = reader.ProteinName;
                        var description = reader.ProteinDescription;
                        var sequence    = (char)Delimiter + reader.ProteinSequence;

                        // Length recorded in the annotation excludes the leading delimiter
                        var length      = reader.ProteinSequence.Length;

                        seqWriter.Write(Encoding.GetBytes(sequence));

                        // The delimiter character is replaced in the name so it cannot break the field layout;
                        // the description is the last field and is written unchanged
                        annoWriter.WriteLine("{0}{1}{2}{3}{4}{5}{6}",
                                             offset,
                                             AnnotationDelimiter,
                                             length,
                                             AnnotationDelimiter,
                                             name.Replace(AnnotationDelimiter, '-'),
                                             AnnotationDelimiter,
                                             description);

                        // NOTE(review): this counts characters; it matches the byte offset only if
                        // Encoding emits one byte per character - confirm
                        offset += sequence.Length;
                    }

                    // Terminating delimiter after the last sequence
                    seqWriter.Write((char)Delimiter);

                    // Write file format Id
                    seqWriter.Write(FileFormatId);

                    // Write the hash of the FASTA file's last write time to both files
                    var hashCode = File.GetLastWriteTimeUtc(_databaseFilePath).GetHashCode();

                    seqWriter.Write(hashCode);
                    annoWriter.Write("{0}{1}{2}", FileFormatId, AnnotationDelimiter, hashCode);
                }
                finally
                {
                    reader.CloseFile();
                }
            }
        }
예제 #2
0
        /// <summary>
        /// Create the decoy version of this database
        /// </summary>
        /// <remarks>
        /// Reads every protein from <c>_databaseFilePath</c> and writes a FASTA-style entry for it to the path
        /// returned by <c>GetDecoyDatabasePath</c>. Returns without writing anything if the database cannot be opened.
        /// </remarks>
        /// <param name="enzyme">
        /// Enzyme used to identify cleavable residues while reversing; may be null, in which case
        /// sequences are reversed without any residue swapping. Only consulted when <paramref name="shuffle"/> is false.
        /// </param>
        /// <param name="shuffle">
        /// When true, each decoy sequence is the shuffled and mutated protein sequence;
        /// when false, each decoy sequence is the reversed protein sequence.
        /// </param>
        public void CreateDecoyDatabase(Enzyme enzyme, bool shuffle)
        {
            var reader = new FastaFileReader();

            if (!reader.OpenFile(_databaseFilePath))
            {
                return;
            }

            // Close the reader even if creating the output file or writing to it throws
            try
            {
                var decoyDatabaseFileName = GetDecoyDatabasePath(enzyme, shuffle);

                Console.WriteLine("Creating " + decoyDatabaseFileName);
                using (var decoyWriter = new StreamWriter(decoyDatabaseFileName))
                {
                    while (reader.ReadNextProteinEntry())
                    {
                        var name        = reader.ProteinName;
                        var description = reader.ProteinDescription;
                        var sequence    = reader.ProteinSequence;

                        // Header line: decoy prefix, underscore, original name, space, description
                        decoyWriter.WriteLine(">{0}_{1} {2}", FastaDatabaseConstants.DecoyProteinPrefix, name, description);

                        if (!shuffle)
                        {
                            // Reversed protein sequence; the result has the same length as the input
                            var decoySequence = new StringBuilder(sequence.Length);
                            for (var i = sequence.Length - 1; i >= 0; i--)
                            {
                                var residue = sequence[i];
                                if (enzyme != null && enzyme.Residues.Length > 0 && enzyme.IsCleavable(residue) && decoySequence.Length > 0)
                                {
                                    // Swap the cleavable residue with the residue appended just before it,
                                    // presumably so cleavage sites keep their orientation relative to their neighbor
                                    var residueToBeReplaced = decoySequence[decoySequence.Length - 1];
                                    decoySequence.Remove(decoySequence.Length - 1, 1);
                                    decoySequence.Append((char)residue);
                                    decoySequence.Append(residueToBeReplaced);
                                }
                                else
                                {
                                    decoySequence.Append((char)residue);
                                }
                            }
                            decoyWriter.WriteLine(decoySequence);
                        }
                        else
                        {
                            // Shuffled protein sequences
                            decoyWriter.WriteLine(SimpleStringProcessing.Mutate(SimpleStringProcessing.Shuffle(sequence), NumMutations));
                        }
                    }
                }
            }
            finally
            {
                reader.CloseFile();
            }
        }
예제 #3
0
        /// <summary>
        /// Rebuild the sequence file (<c>_seqFilePath</c>) and the annotation file (<c>_annoFilePath</c>)
        /// from the FASTA file at <c>_databaseFilePath</c>, skipping exact duplicate proteins and
        /// renaming proteins that reuse a name with a different sequence.
        /// </summary>
        /// <remarks>
        /// Any existing sequence/annotation files are deleted first. If the FASTA file cannot be opened,
        /// the method returns after the (empty) output files have already been created.
        /// Protein names are compared case-insensitively (invariant culture).
        /// </remarks>
        private void GenerateMetaFiles()
        {
            if (File.Exists(_seqFilePath))
            {
                File.Delete(_seqFilePath);
            }

            if (File.Exists(_annoFilePath))
            {
                File.Delete(_annoFilePath);
            }

            // Keys are protein name
            // Values track the number of times the name has been encountered, the number of residues, and a SHA1 hash of the residues
            var proteinNamesAndStats = new Dictionary <string, ProteinHashInfo>(StringComparer.InvariantCultureIgnoreCase);

            using (var seqWriter = new BinaryWriter(File.Open(_seqFilePath, FileMode.CreateNew)))
            using (var annoWriter = new StreamWriter(_annoFilePath))
            {
                var reader = new FastaFileReader();
                if (!reader.OpenFile(_databaseFilePath))
                {
                    return;
                }

                // Close the reader even when reading or writing throws, so the FASTA file handle is not leaked
                try
                {
                    // Position (in characters) of the current entry's leading delimiter within the sequence data
                    long offset = 0;
                    while (reader.ReadNextProteinEntry())
                    {
                        var name        = reader.ProteinName;
                        var description = reader.ProteinDescription;
                        var sequence    = (char)FastaDatabaseConstants.Delimiter + reader.ProteinSequence;

                        // sequence carries the leading delimiter, so this length is one more than the residue count
                        // NOTE(review): confirm that consumers of the annotation file expect the delimiter to be counted
                        var length      = sequence.Length;

                        var proteinInfoCurrent = new ProteinHashInfo(sequence);

                        if (proteinNamesAndStats.TryGetValue(name, out var proteinInfoCached))
                        {
                            // Duplicate name; either skip this protein or rename it
                            if (proteinInfoCached.SequenceLength == proteinInfoCurrent.SequenceLength &&
                                string.Equals(proteinInfoCached.SequenceHash, proteinInfoCurrent.SequenceHash))
                            {
                                // Duplicate protein; skip it
                                continue;
                            }

                            // Same name, different sequence: append a numbered suffix taken from the
                            // first entry's observation count, then advance that count
                            name += "_Duplicate" + proteinInfoCached.ObservationCount.ToString("00");
                            proteinInfoCached.ObservationCount++;
                        }

                        // NOTE(review): Add throws if the renamed entry collides with a name already present
                        // (e.g. the FASTA file itself contains "X_Duplicate01") - confirm whether that can occur
                        proteinNamesAndStats.Add(name, proteinInfoCurrent);

                        seqWriter.Write(Encoding.GetBytes((string)sequence));

                        // The delimiter character is replaced in the name so it cannot break the field layout;
                        // the description is the last field and is written unchanged
                        annoWriter.WriteLine("{0}{1}{2}{3}{4}{5}{6}",
                                             offset, FastaDatabaseConstants.AnnotationDelimiter,
                                             length, FastaDatabaseConstants.AnnotationDelimiter,
                                             name.Replace(FastaDatabaseConstants.AnnotationDelimiter, '-'), FastaDatabaseConstants.AnnotationDelimiter,
                                             description);

                        offset += sequence.Length;
                    }

                    // Terminating delimiter after the last sequence
                    seqWriter.Write((char)FastaDatabaseConstants.Delimiter);

                    // Write file format Id
                    seqWriter.Write(FileFormatId);

                    // Write the hash of the FASTA file's last write time to both files
                    var hashCode = File.GetLastWriteTimeUtc(_databaseFilePath).GetHashCode();

                    seqWriter.Write(hashCode);
                    annoWriter.Write("{0}{1}{2}", FileFormatId, FastaDatabaseConstants.AnnotationDelimiter, hashCode);
                }
                finally
                {
                    reader.CloseFile();
                }
            }
        }
        /// <summary>
        /// Command-line entry point: reads a sequence file, searches it for the given patterns, and exports the result.
        /// </summary>
        /// <remarks>
        /// Options: <c>-f|--file</c> input file (required), <c>-t|--fileType</c> (default "FASTA"),
        /// <c>-e|--exporter</c> ("CONSOLE" or "EXCEL", default "EXCEL"), <c>-d|--maximalDistance</c> (default 10),
        /// <c>-p|--pattern</c> (repeatable; at least one non-empty pattern is required).
        /// File type and exporter names are matched case-insensitively.
        /// </remarks>
        /// <param name="args">Command-line arguments.</param>
        /// <exception cref="ArgumentNullException">No pattern was supplied, or no file name was supplied.</exception>
        /// <exception cref="FileNotFoundException">The input file does not exist.</exception>
        /// <exception cref="ArgumentOutOfRangeException">The file type or exporter is not recognized.</exception>
        static void Main(string[] args)
        {
            string      fileName        = null;
            string      fileType        = "FASTA";
            int         maximalDistance = 10;
            string      exporter        = "EXCEL";
            IFileReader fileReader;
            IExport     export;

            var fundamentPatterns = new List <string>();

            var p = new OptionSet()
            {
                { "f|file=", a => fileName = a },
                { "t|fileType=", a => fileType = a },
                { "e|exporter=", a => exporter = a },
                { "d|maximalDistance=", (int a) => maximalDistance = a },
                { "p|pattern=", a => {
                      // Empty pattern values are ignored rather than rejected
                      if (!string.IsNullOrEmpty(a))
                      {
                          fundamentPatterns.Add(a);
                      }
                  } },
            };

            // Unrecognized arguments returned by Parse are intentionally ignored
            p.Parse(args);

            if (fundamentPatterns.Count == 0)
            {
                throw new ArgumentNullException(nameof(fundamentPatterns), "No paterns defined");
            }

            if (string.IsNullOrEmpty(fileName))
            {
                throw new ArgumentNullException(nameof(fileName));
            }
            if (!File.Exists(fileName))
            {
                throw new FileNotFoundException("File not found", fileName);
            }

            if (string.Equals(fileType, "FASTA", StringComparison.OrdinalIgnoreCase))
            {
                fileReader = new FastaFileReader();
            }
            else
            {
                throw new ArgumentOutOfRangeException(nameof(fileType), "Unknown file type");
            }

            var fileContent = fileReader.Read(fileName).Select((a) => new ProteinSequenceWithFundaments(a)).ToList();

            var matcher = new FundamentsMatcher();

            // Patterns are numbered from 1 in the order they were given on the command line
            for (var i = 0; i < fundamentPatterns.Count; i++)
            {
                matcher.FindInSequences(fileContent, i + 1, fundamentPatterns[i], maximalDistance);
            }

            if (string.Equals(exporter, "CONSOLE", StringComparison.OrdinalIgnoreCase))
            {
                export = new ConsoleExport(fundamentPatterns.Count);
            }
            else if (string.Equals(exporter, "EXCEL", StringComparison.OrdinalIgnoreCase))
            {
                export = new ExcelExport(fundamentPatterns.Count);
            }
            else
            {
                // Report the offending option by its own name (previously reported as fileType)
                throw new ArgumentOutOfRangeException(nameof(exporter), "Unknown exporter type");
            }

            export.Export(fileContent);
        }
예제 #5
0
        /// <summary>
        /// Create the decoy version of this database
        /// </summary>
        /// <remarks>
        /// Reads every protein from <c>_databaseFilePath</c> and writes a FASTA-style entry for it to the path
        /// returned by <c>GetDecoyDatabasePath</c>. Returns without writing anything if the database cannot be opened.
        /// </remarks>
        /// <param name="enzyme">
        /// Enzyme used to identify cleavable residues while reversing; may be null, in which case
        /// sequences are reversed without any residue swapping. Only consulted when <paramref name="shuffle"/> is false.
        /// </param>
        /// <param name="shuffle">
        /// When true, each decoy sequence is the shuffled and mutated protein sequence;
        /// when false, each decoy sequence is the reversed protein sequence.
        /// </param>
        public void CreateDecoyDatabase(Enzyme enzyme, bool shuffle)
        {
            var reader = new FastaFileReader();
            if (!reader.OpenFile(_databaseFilePath))
                return;

            // Close the reader even if creating the output file or writing to it throws
            try
            {
                var decoyDatabaseFileName = GetDecoyDatabasePath(enzyme, shuffle);

                Console.WriteLine("Creating " + decoyDatabaseFileName);
                using (var decoyWriter = new StreamWriter(decoyDatabaseFileName))
                {
                    while (reader.ReadNextProteinEntry())
                    {
                        var name = reader.ProteinName;
                        var description = reader.ProteinDescription;
                        var sequence = reader.ProteinSequence;

                        // Header line: decoy prefix, underscore, original name, space, description
                        decoyWriter.WriteLine(">{0}_{1} {2}", DecoyProteinPrefix, name, description);

                        if (!shuffle)
                        {
                            // Reversed protein sequence; the result has the same length as the input
                            var decoySequence = new StringBuilder(sequence.Length);
                            for (var i = sequence.Length - 1; i >= 0; i--)
                            {
                                var residue = sequence[i];
                                if (enzyme != null && enzyme.Residues.Length > 0 && enzyme.IsCleavable(residue) && decoySequence.Length > 0)
                                {
                                    // Swap the cleavable residue with the residue appended just before it,
                                    // presumably so cleavage sites keep their orientation relative to their neighbor
                                    var residueToBeReplaced = decoySequence[decoySequence.Length - 1];
                                    decoySequence.Remove(decoySequence.Length - 1, 1);
                                    decoySequence.Append(residue);
                                    decoySequence.Append(residueToBeReplaced);
                                }
                                else
                                {
                                    decoySequence.Append(residue);
                                }
                            }
                            decoyWriter.WriteLine(decoySequence);
                        }
                        else
                        {
                            // Shuffled protein sequences
                            decoyWriter.WriteLine(SimpleStringProcessing.Mutate(SimpleStringProcessing.Shuffle(sequence), NumMutations));
                        }
                    }
                }
            }
            finally
            {
                reader.CloseFile();
            }
        }
예제 #6
0
        /// <summary>
        /// Rebuild the sequence file (<c>_seqFilePath</c>) and the annotation file (<c>_annoFilePath</c>)
        /// from the FASTA file at <c>_databaseFilePath</c>.
        /// </summary>
        /// <remarks>
        /// Any existing sequence/annotation files are deleted first. If the FASTA file cannot be opened,
        /// the method returns after the (empty) output files have already been created.
        /// Sequence file layout: each protein sequence prefixed by <c>Delimiter</c>, then one trailing
        /// <c>Delimiter</c>, then <c>FileFormatId</c>, then the hash code of the FASTA file's last write time (UTC).
        /// Annotation file layout: one line per protein with offset, sequence length, name and description
        /// separated by <c>AnnotationDelimiter</c>, followed by a final entry holding <c>FileFormatId</c> and the same hash code.
        /// </remarks>
        private void GenerateMetaFiles()
        {
            if (File.Exists(_seqFilePath))
                File.Delete(_seqFilePath);

            if (File.Exists(_annoFilePath))
                File.Delete(_annoFilePath);

            using (var seqWriter = new BinaryWriter(File.Open(_seqFilePath, FileMode.CreateNew)))
            using (var annoWriter = new StreamWriter(_annoFilePath))
            {
                var reader = new FastaFileReader();
                if (!reader.OpenFile(_databaseFilePath))
                    return;

                // Close the reader even when reading or writing throws, so the FASTA file handle is not leaked
                try
                {
                    // Position (in characters) of the current entry's leading delimiter within the sequence data
                    long offset = 0;
                    while (reader.ReadNextProteinEntry())
                    {
                        var name = reader.ProteinName;
                        var description = reader.ProteinDescription;
                        var sequence = (char)Delimiter + reader.ProteinSequence;

                        // Length recorded in the annotation excludes the leading delimiter
                        var length = reader.ProteinSequence.Length;

                        seqWriter.Write(Encoding.GetBytes(sequence));

                        // The delimiter character is replaced in the name so it cannot break the field layout;
                        // the description is the last field and is written unchanged
                        annoWriter.WriteLine("{0}{1}{2}{3}{4}{5}{6}",
                            offset,
                            AnnotationDelimiter,
                            length,
                            AnnotationDelimiter,
                            name.Replace(AnnotationDelimiter, '-'),
                            AnnotationDelimiter,
                            description);

                        // NOTE(review): this counts characters; it matches the byte offset only if
                        // Encoding emits one byte per character - confirm
                        offset += sequence.Length;
                    }

                    // Terminating delimiter after the last sequence
                    seqWriter.Write((char)Delimiter);

                    // Write file format Id
                    seqWriter.Write(FileFormatId);

                    // Write the hash of the FASTA file's last write time to both files
                    var hashCode = File.GetLastWriteTimeUtc(_databaseFilePath).GetHashCode();

                    seqWriter.Write(hashCode);
                    annoWriter.Write("{0}{1}{2}", FileFormatId, AnnotationDelimiter, hashCode);
                }
                finally
                {
                    reader.CloseFile();
                }
            }
        }