Пример #1
0
        private List <MutableTranscript> ParseTranscriptFiles(IChromosome chromosome, string dirPath)
        {
            var transcripts = new List <MutableTranscript>();
            var files       = FileUtilities.GetFileNamesInDir(dirPath, "*_transcripts_data_dumper.txt.gz").ToArray();

            foreach (string dumpPath in VepRootDirectory.GetSortedFiles(files))
            {
                ParseTranscriptDumpFile(chromosome, dumpPath, transcripts);
            }

            return(transcripts);
        }
Пример #2
0
        private static List <IRegulatoryRegion> ParseRegulatoryFiles(IChromosome chromosome, string dirPath)
        {
            var files             = Directory.GetFiles(dirPath, "*_reg_regulatory_regions_data_dumper.txt.gz");
            var regulatoryRegions = new List <IRegulatoryRegion>();

            foreach (string dumpPath in VepRootDirectory.GetSortedFiles(files))
            {
                ParseRegulatoryDumpFile(chromosome, dumpPath, regulatoryRegions);
            }

            return(regulatoryRegions);
        }
        private static ExitCodes ProgramExecution()
        {
            var transcriptSource = GetSource(_transcriptSource);
            var sequenceReader   = new CompressedSequenceReader(FileUtilities.GetReadStream(_inputReferencePath));
            var vepRootDirectory = new VepRootDirectory(sequenceReader.RefNameToChromosome);
            var refIndexToVepDir = vepRootDirectory.GetRefIndexToVepDir(_inputVepDirectory);

            var  genomeAssembly  = GenomeAssemblyHelper.Convert(_genomeAssembly);
            long vepReleaseTicks = DateTime.Parse(_vepReleaseDate).Ticks;
            var  idToGenbank     = GetIdToGenbank(genomeAssembly, transcriptSource);

            // =========================
            // create the pre-cache file
            // =========================

            // process each VEP directory
            int numRefSeqs = sequenceReader.NumRefSeqs;
            var header     = new IntermediateIoHeader(_vepVersion, vepReleaseTicks, transcriptSource, genomeAssembly, numRefSeqs);

            string siftPath       = _outputStub + ".sift.gz";
            string polyphenPath   = _outputStub + ".polyphen.gz";
            string transcriptPath = _outputStub + ".transcripts.gz";
            string regulatoryPath = _outputStub + ".regulatory.gz";

            using (var mergeLogger = new TranscriptMergerLogger(FileUtilities.GetCreateStream(_outputStub + ".merge_transcripts.log")))
                using (var siftWriter = new PredictionWriter(GZipUtilities.GetStreamWriter(siftPath), header, IntermediateIoCommon.FileType.Sift))
                    using (var polyphenWriter = new PredictionWriter(GZipUtilities.GetStreamWriter(polyphenPath), header, IntermediateIoCommon.FileType.Polyphen))
                        using (var transcriptWriter = new MutableTranscriptWriter(GZipUtilities.GetStreamWriter(transcriptPath), header))
                            using (var regulatoryRegionWriter = new RegulatoryRegionWriter(GZipUtilities.GetStreamWriter(regulatoryPath), header))
                            {
                                var converter           = new VepCacheParser(transcriptSource);
                                var emptyPredictionDict = new Dictionary <string, List <int> >();

                                for (ushort refIndex = 0; refIndex < numRefSeqs; refIndex++)
                                {
                                    var chromosome = sequenceReader.RefIndexToChromosome[refIndex];

                                    if (!refIndexToVepDir.TryGetValue(refIndex, out string vepSubDir))
                                    {
                                        siftWriter.Write(chromosome, emptyPredictionDict);
                                        polyphenWriter.Write(chromosome, emptyPredictionDict);
                                        continue;
                                    }

                                    Console.WriteLine("Parsing reference sequence [{0}]:", chromosome.UcscName);

                                    var rawData                 = converter.ParseDumpDirectory(chromosome, vepSubDir);
                                    var mergedTranscripts       = TranscriptMerger.Merge(mergeLogger, rawData.Transcripts, idToGenbank);
                                    var mergedRegulatoryRegions = RegulatoryRegionMerger.Merge(rawData.RegulatoryRegions);

                                    int numRawTranscripts    = rawData.Transcripts.Count;
                                    int numMergedTranscripts = mergedTranscripts.Count;
                                    Console.WriteLine($"- # merged transcripts: {numMergedTranscripts}, # total transcripts: {numRawTranscripts}");

                                    WriteTranscripts(transcriptWriter, mergedTranscripts);
                                    WriteRegulatoryRegions(regulatoryRegionWriter, mergedRegulatoryRegions);
                                    WritePredictions(siftWriter, mergedTranscripts, x => x.SiftData, chromosome);
                                    WritePredictions(polyphenWriter, mergedTranscripts, x => x.PolyphenData, chromosome);
                                }
                            }

            Console.WriteLine("\n{0} directories processed.", refIndexToVepDir.Count);

            return(ExitCodes.Success);
        }