Esempio n. 1
0
        private static IEnumerable <MutableGene> LoadGenes(Stream stream,
                                                           IDictionary <ushort, IChromosome> refIndexToChromosome,
                                                           IDictionary <string, IChromosome> refNameToChromosome38)
        {
            var geneDict = new Dictionary <string, MutableGene>();

            using (var reader = new MutableTranscriptReader(stream, refIndexToChromosome))
            {
                var transcripts = reader.GetTranscripts();

                foreach (var transcript in transcripts)
                {
                    var gene = transcript.Gene;
                    var key  = GetGeneKey(gene);
                    if (geneDict.ContainsKey(key))
                    {
                        continue;
                    }

                    gene.Chromosome = refNameToChromosome38[gene.Chromosome.UcscName];
                    geneDict[key]   = gene;
                }
            }

            return(geneDict.Values.OrderBy(x => x.Chromosome.Index).ThenBy(x => x.Start).ThenBy(x => x.End));
        }
Esempio n. 2
0
        private static ExitCodes ProgramExecution()
        {
            var    logger         = new ConsoleLogger();
            string transcriptPath = _inputPrefix + ".transcripts.gz";
            string siftPath       = _inputPrefix + ".sift.gz";
            string polyphenPath   = _inputPrefix + ".polyphen.gz";
            string regulatoryPath = _inputPrefix + ".regulatory.gz";

            (var refIndexToChromosome, var refNameToChromosome, int numRefSeqs) = SequenceHelper.GetDictionaries(_inputReferencePath);

            using (var transcriptReader = new MutableTranscriptReader(GZipUtilities.GetAppropriateReadStream(transcriptPath), refIndexToChromosome))
                using (var regulatoryReader = new RegulatoryRegionReader(GZipUtilities.GetAppropriateReadStream(regulatoryPath), refIndexToChromosome))
                    using (var siftReader = new PredictionReader(GZipUtilities.GetAppropriateReadStream(siftPath), refIndexToChromosome, IntermediateIoCommon.FileType.Sift))
                        using (var polyphenReader = new PredictionReader(GZipUtilities.GetAppropriateReadStream(polyphenPath), refIndexToChromosome, IntermediateIoCommon.FileType.Polyphen))
                            using (var geneReader = new UgaGeneReader(GZipUtilities.GetAppropriateReadStream(ExternalFiles.UniversalGeneFilePath), refNameToChromosome))
                            {
                                var    genomeAssembly  = transcriptReader.Header.Assembly;
                                var    source          = transcriptReader.Header.Source;
                                long   vepReleaseTicks = transcriptReader.Header.VepReleaseTicks;
                                ushort vepVersion      = transcriptReader.Header.VepVersion;

                                logger.Write("- loading universal gene archive file... ");
                                var genes      = geneReader.GetGenes();
                                var geneForest = CreateGeneForest(genes, numRefSeqs, genomeAssembly);
                                logger.WriteLine($"{genes.Length:N0} loaded.");

                                logger.Write("- loading regulatory region file... ");
                                var regulatoryRegions = regulatoryReader.GetRegulatoryRegions();
                                logger.WriteLine($"{regulatoryRegions.Length:N0} loaded.");

                                logger.Write("- loading transcript file... ");
                                var transcripts           = transcriptReader.GetTranscripts();
                                var transcriptsByRefIndex = transcripts.GetMultiValueDict(x => x.Chromosome.Index);
                                logger.WriteLine($"{transcripts.Length:N0} loaded.");

                                MarkCanonicalTranscripts(logger, transcripts);

                                var predictionBuilder = new PredictionCacheBuilder(logger, genomeAssembly);
                                var predictionCaches  = predictionBuilder.CreatePredictionCaches(transcriptsByRefIndex, siftReader, polyphenReader, numRefSeqs);

                                logger.Write("- writing SIFT prediction cache... ");
                                predictionCaches.Sift.Write(FileUtilities.GetCreateStream(CacheConstants.SiftPath(_outputCacheFilePrefix)));
                                logger.WriteLine("finished.");

                                logger.Write("- writing PolyPhen prediction cache... ");
                                predictionCaches.PolyPhen.Write(FileUtilities.GetCreateStream(CacheConstants.PolyPhenPath(_outputCacheFilePrefix)));
                                logger.WriteLine("finished.");

                                var transcriptBuilder = new TranscriptCacheBuilder(logger, genomeAssembly, source, vepReleaseTicks, vepVersion);
                                var transcriptStaging = transcriptBuilder.CreateTranscriptCache(transcripts, regulatoryRegions, geneForest, numRefSeqs);

                                logger.Write("- writing transcript cache... ");
                                transcriptStaging.Write(FileUtilities.GetCreateStream(CacheConstants.TranscriptPath(_outputCacheFilePrefix)));
                                logger.WriteLine("finished.");
                            }

            return(ExitCodes.Success);
        }