/// <summary>
/// Reads every transcript from <paramref name="stream"/> and collects one gene per
/// gene key (as produced by GetGeneKey). Only the first gene seen for a key is kept;
/// later transcripts carrying the same key are ignored.
/// </summary>
/// <param name="stream">Stream handed to the MutableTranscriptReader.</param>
/// <param name="refIndexToChromosome">Reference-index lookup passed through to the transcript reader.</param>
/// <param name="refNameToChromosome38">
/// Lookup keyed by UCSC chromosome name; each retained gene has its Chromosome replaced
/// with the entry found here. NOTE(review): the "38" suffix presumably refers to the
/// GRCh38 assembly - confirm against the caller.
/// </param>
/// <returns>
/// The retained genes, ordered by chromosome index, then start, then end. The ordering
/// is a deferred LINQ query over the collected values.
/// </returns>
/// <remarks>
/// The chromosome lookup uses the dictionary indexer, so a UCSC name that is absent from
/// <paramref name="refNameToChromosome38"/> surfaces as whatever exception that indexer throws.
/// </remarks>
private static IEnumerable<MutableGene> LoadGenes(Stream stream,
    IDictionary<ushort, IChromosome> refIndexToChromosome,
    IDictionary<string, IChromosome> refNameToChromosome38)
{
    var uniqueGenes = new Dictionary<string, MutableGene>();

    using (var transcriptReader = new MutableTranscriptReader(stream, refIndexToChromosome))
    {
        foreach (var transcript in transcriptReader.GetTranscripts())
        {
            var candidate = transcript.Gene;
            var geneKey = GetGeneKey(candidate);

            // first occurrence wins: only unseen keys are remapped and stored
            if (!uniqueGenes.ContainsKey(geneKey))
            {
                candidate.Chromosome = refNameToChromosome38[candidate.Chromosome.UcscName];
                uniqueGenes.Add(geneKey, candidate);
            }
        }
    }

    return uniqueGenes.Values
        .OrderBy(gene => gene.Chromosome.Index)
        .ThenBy(gene => gene.Start)
        .ThenBy(gene => gene.End);
}
/// <summary>
/// Builds the output cache files from the intermediate input files.
/// Opens the transcript, regulatory region, SIFT, PolyPhen and universal gene archive
/// readers, loads their contents, then writes the SIFT prediction cache, the PolyPhen
/// prediction cache and the transcript cache under the configured output prefix.
/// Progress is reported through a ConsoleLogger.
/// </summary>
/// <returns><c>ExitCodes.Success</c> once all three caches have been written.</returns>
private static ExitCodes ProgramExecution()
{
    var logger = new ConsoleLogger();

    // all intermediate inputs share the same prefix and differ only by suffix
    string transcriptsFile = $"{_inputPrefix}.transcripts.gz";
    string siftFile        = $"{_inputPrefix}.sift.gz";
    string polyphenFile    = $"{_inputPrefix}.polyphen.gz";
    string regulatoryFile  = $"{_inputPrefix}.regulatory.gz";

    (var refIndexToChromosome, var refNameToChromosome, int numRefSeqs) =
        SequenceHelper.GetDictionaries(_inputReferencePath);

    using (var transcriptReader = new MutableTranscriptReader(
        GZipUtilities.GetAppropriateReadStream(transcriptsFile), refIndexToChromosome))
    using (var regulatoryReader = new RegulatoryRegionReader(
        GZipUtilities.GetAppropriateReadStream(regulatoryFile), refIndexToChromosome))
    using (var siftReader = new PredictionReader(
        GZipUtilities.GetAppropriateReadStream(siftFile), refIndexToChromosome,
        IntermediateIoCommon.FileType.Sift))
    using (var polyphenReader = new PredictionReader(
        GZipUtilities.GetAppropriateReadStream(polyphenFile), refIndexToChromosome,
        IntermediateIoCommon.FileType.Polyphen))
    using (var geneReader = new UgaGeneReader(
        GZipUtilities.GetAppropriateReadStream(ExternalFiles.UniversalGeneFilePath),
        refNameToChromosome))
    {
        // metadata from the transcript file header is forwarded to the cache builders
        var header = transcriptReader.Header;
        var genomeAssembly = header.Assembly;
        var source = header.Source;
        long vepReleaseTicks = header.VepReleaseTicks;
        ushort vepVersion = header.VepVersion;

        // ---- load inputs ----
        logger.Write("- loading universal gene archive file... ");
        var universalGenes = geneReader.GetGenes();
        var geneForest = CreateGeneForest(universalGenes, numRefSeqs, genomeAssembly);
        logger.WriteLine($"{universalGenes.Length:N0} loaded.");

        logger.Write("- loading regulatory region file... ");
        var regulatoryRegions = regulatoryReader.GetRegulatoryRegions();
        logger.WriteLine($"{regulatoryRegions.Length:N0} loaded.");

        logger.Write("- loading transcript file... ");
        var transcripts = transcriptReader.GetTranscripts();
        var transcriptsByRefIndex = transcripts.GetMultiValueDict(x => x.Chromosome.Index);
        logger.WriteLine($"{transcripts.Length:N0} loaded.");

        MarkCanonicalTranscripts(logger, transcripts);

        // ---- prediction caches ----
        var predictionBuilder = new PredictionCacheBuilder(logger, genomeAssembly);
        var predictionCaches = predictionBuilder.CreatePredictionCaches(
            transcriptsByRefIndex, siftReader, polyphenReader, numRefSeqs);

        logger.Write("- writing SIFT prediction cache... ");
        var siftStream = FileUtilities.GetCreateStream(CacheConstants.SiftPath(_outputCacheFilePrefix));
        predictionCaches.Sift.Write(siftStream);
        logger.WriteLine("finished.");

        logger.Write("- writing PolyPhen prediction cache... ");
        var polyphenStream = FileUtilities.GetCreateStream(CacheConstants.PolyPhenPath(_outputCacheFilePrefix));
        predictionCaches.PolyPhen.Write(polyphenStream);
        logger.WriteLine("finished.");

        // ---- transcript cache ----
        var transcriptBuilder = new TranscriptCacheBuilder(
            logger, genomeAssembly, source, vepReleaseTicks, vepVersion);
        var transcriptStaging = transcriptBuilder.CreateTranscriptCache(
            transcripts, regulatoryRegions, geneForest, numRefSeqs);

        logger.Write("- writing transcript cache... ");
        var transcriptStream = FileUtilities.GetCreateStream(CacheConstants.TranscriptPath(_outputCacheFilePrefix));
        transcriptStaging.Write(transcriptStream);
        logger.WriteLine("finished.");
    }

    return ExitCodes.Success;
}