/// <summary>
/// Builds a reduced cache for the interval described by <c>_referenceName</c>,
/// <c>_referencePosition</c> and <c>_referenceEndPosition</c>. The transcript, SIFT and
/// PolyPhen staging objects are written via <c>WriteCache</c> and the reference bases via
/// <c>WriteReference</c>, all to paths derived from the output stub.
/// </summary>
/// <returns><see cref="ExitCodes.Success"/> once every step has completed without throwing.</returns>
private static ExitCodes ProgramExecution()
{
    var log        = new ConsoleLogger();
    var dataBundle = DataBundle.GetDataBundle(_inputReferencePath, _inputPrefix);

    int refSeqCount = dataBundle.SequenceReader.NumRefSeqs;
    var chromosome  = ReferenceNameUtilities.GetChromosome(dataBundle.SequenceReader.RefNameToChromosome, _referenceName);
    dataBundle.Load(chromosome);

    string outputStub  = GetOutputStub(chromosome, dataBundle.Source);
    var targetInterval = new ChromosomeInterval(chromosome, _referencePosition, _referenceEndPosition);

    // collect the data for the requested interval
    var transcripts = GetTranscripts(log, dataBundle, targetInterval);

    var siftResult = GetPredictionStaging(log, "SIFT", transcripts, chromosome,
        dataBundle.SiftPredictions, dataBundle.SiftReader, t => t.SiftIndex, refSeqCount);

    var polyphenResult = GetPredictionStaging(log, "PolyPhen", transcripts, chromosome,
        dataBundle.PolyPhenPredictions, dataBundle.PolyPhenReader, t => t.PolyPhenIndex, refSeqCount);

    string referenceBases = GetReferenceBases(log, dataBundle.SequenceReader, targetInterval);

    var regulatoryRegionIntervalArrays =
        GetRegulatoryRegionIntervalArrays(log, dataBundle.TranscriptCache, targetInterval, refSeqCount);

    var transcriptIntervalArrays = PredictionUtilities.UpdateTranscripts(transcripts,
        dataBundle.SiftPredictions, siftResult.Predictions,
        dataBundle.PolyPhenPredictions, polyphenResult.Predictions, refSeqCount);

    var transcriptStaging = GetTranscriptStaging(dataBundle.TranscriptCacheData.Header,
        transcriptIntervalArrays, regulatoryRegionIntervalArrays);

    // write the three cache files, one after the other, followed by the reference bases
    var transcriptStream = FileUtilities.GetCreateStream(CacheConstants.TranscriptPath(outputStub));
    WriteCache(log, transcriptStream, transcriptStaging, "transcript");

    var siftStream = FileUtilities.GetCreateStream(CacheConstants.SiftPath(outputStub));
    WriteCache(log, siftStream, siftResult.Staging, "SIFT");

    var polyphenStream = FileUtilities.GetCreateStream(CacheConstants.PolyPhenPath(outputStub));
    WriteCache(log, polyphenStream, polyphenResult.Staging, "PolyPhen");

    WriteReference(log, CacheConstants.BasesPath(outputStub), dataBundle.SequenceReader, chromosome,
        referenceBases, targetInterval.Start);

    return ExitCodes.Success;
}
/// <summary>
/// Writes the loaded transcripts, regulatory elements, genes, introns, microRNAs and peptide
/// sequences to the transcript cache file derived from <paramref name="outputPrefix"/>.
/// </summary>
/// <param name="outputPrefix">prefix handed to <c>CacheConstants.TranscriptPath</c> to obtain the output path</param>
/// <exception cref="GeneralException">thrown when no data has been loaded beforehand</exception>
public void CreateTranscriptCacheFile(string outputPrefix)
{
    if (!_hasData)
    {
        throw new GeneralException("Data was not loaded before running CreateTranscriptCacheFile");
    }

    Console.Write("- creating transcript cache file... ");
    var timer = new Benchmark();

    // the output header reuses the VEP and assembly information of the input header
    var inputHeader = _transcriptReader.Header;
    var vepHeader   = new GlobalCustomHeader(inputHeader.VepReleaseTicks, inputHeader.VepVersion);
    var fileHeader  = new FileHeader(
        CacheConstants.Identifier,
        CacheConstants.SchemaVersion,
        CacheConstants.DataVersion,
        inputHeader.TranscriptSource,
        _currentTimeTicks,
        inputHeader.GenomeAssembly,
        vepHeader);

    var convertedGenes = ConvertGenes();
    var outputPath     = CacheConstants.TranscriptPath(outputPrefix);

    using (var cacheWriter = new GlobalCacheWriter(outputPath, fileHeader))
    {
        var globalCache = new VD.GlobalCache(
            fileHeader,
            _transcripts.ToArray(),
            _regulatoryElements.ToArray(),
            convertedGenes,
            _introns.ToArray(),
            _microRnas.ToArray(),
            _peptideSeqs.ToArray());

        cacheWriter.Write(globalCache);
    }

    Console.WriteLine("{0}", Benchmark.ToHumanReadable(timer.GetElapsedTime()));
}
/// <summary>
/// Loads the transcript cache for <c>_inputPrefix</c> and passes every regulatory region it
/// contains to <c>WriteRegulatoryFeature</c>, writing to <c>_outputFileName</c>.
/// </summary>
/// <returns><see cref="ExitCodes.Success"/> once all entries have been written.</returns>
private static ExitCodes ProgramExecution()
{
    using (var gffWriter = GZipUtilities.GetStreamWriter(_outputFileName))
    {
        string transcriptCachePath = CacheConstants.TranscriptPath(_inputPrefix);
        var sequenceData           = SequenceHelper.GetDictionaries(_referencePath);

        // load the cache
        Console.Write("- reading {0}... ", Path.GetFileName(transcriptCachePath));
        var cache            = TranscriptCacheHelper.GetCache(transcriptCachePath, sequenceData.refIndexToChromosome);
        var regulatoryArrays = cache.RegulatoryRegionIntervalArrays;
        Console.WriteLine("found {0:N0} reference sequences. ", regulatoryArrays.Length);

        Console.Write("- writing GFF entries... ");

        foreach (var regulatoryArray in regulatoryArrays)
        {
            // null entries are skipped
            if (regulatoryArray != null)
            {
                foreach (var entry in regulatoryArray.Array)
                {
                    WriteRegulatoryFeature(gffWriter, entry.Value);
                }
            }
        }

        Console.WriteLine("finished.");
    }

    return ExitCodes.Success;
}
/// <summary>
/// Loads the transcript cache found under <paramref name="pathPrefix"/> and opens the SIFT and
/// PolyPhen prediction caches that share the same prefix.
/// </summary>
/// <param name="pathPrefix">cache prefix from which the transcript, SIFT and PolyPhen paths are derived</param>
/// <param name="sequenceProvider">supplies the sequence, the chromosome dictionaries and the assembly</param>
/// <param name="conservationProvider">optional; when not null its version is appended to the data source versions</param>
public TranscriptAnnotationProvider(string pathPrefix, ISequenceProvider sequenceProvider,
    ProteinConservationProvider conservationProvider)
{
    Name                  = "Transcript annotation provider";
    _sequence             = sequenceProvider.Sequence;
    _refNameToChromosome  = sequenceProvider.RefNameToChromosome;
    _conservationProvider = conservationProvider;

    // the transcript stream is closed as soon as the cache has been initiated
    var transcriptPath = CacheConstants.TranscriptPath(pathPrefix);
    using (var transcriptStream = PersistentStreamUtils.GetReadStream(transcriptPath))
    {
        (_transcriptCache, TranscriptIntervalArrays, VepVersion) = InitiateCache(
            transcriptStream, sequenceProvider.RefIndexToChromosome, sequenceProvider.Assembly);
    }

    Assembly           = _transcriptCache.Assembly;
    DataSourceVersions = _transcriptCache.DataSourceVersions;

    // TODO: this is not great. We should not be using IEnumerables if we have to resort to strange stuff like this
    if (conservationProvider != null)
    {
        var conservationVersion = new[] { conservationProvider.Version };
        DataSourceVersions      = DataSourceVersions.Concat(conservationVersion);
    }

    // the prediction streams are kept in fields next to the readers that consume them
    _siftStream = PersistentStreamUtils.GetReadStream(CacheConstants.SiftPath(pathPrefix));
    _siftReader = new PredictionCacheReader(_siftStream, PredictionCacheReader.SiftDescriptions);

    _polyphenStream = PersistentStreamUtils.GetReadStream(CacheConstants.PolyPhenPath(pathPrefix));
    _polyphenReader = new PredictionCacheReader(_polyphenStream, PredictionCacheReader.PolyphenDescriptions);
}
public void TranscriptPath_NominalCase()
{
    // the prefix "bob" is expected to map to "bob.transcripts.ndb"
    var observed = CacheConstants.TranscriptPath("bob");

    Assert.Equal("bob.transcripts.ndb", observed);
}
/// <summary>
/// Combines the transcript, SIFT and PolyPhen caches stored under <c>_inputPrefix</c> and
/// <c>_inputPrefix2</c>, one reference sequence at a time, and writes the combined result
/// using <c>_outputPrefix</c>.
/// </summary>
/// <returns><see cref="ExitCodes.Success"/> once all combined caches have been written.</returns>
/// <exception cref="InvalidDataException">
/// thrown when the two transcript caches hold a different number of reference sequences
/// </exception>
private static ExitCodes ProgramExecution()
{
    var sequenceData = SequenceHelper.GetDictionaries(_refSequencePath);
    var logger       = new ConsoleLogger();

    var caches = LoadTranscriptCaches(
        logger,
        CacheConstants.TranscriptPath(_inputPrefix),
        CacheConstants.TranscriptPath(_inputPrefix2),
        sequenceData.refIndexToChromosome);

    var firstCache  = caches.Cache;
    var secondCache = caches.Cache2;

    int numRefSeqs  = firstCache.TranscriptIntervalArrays.Length;
    int numRefSeqs2 = secondCache.TranscriptIntervalArrays.Length;

    if (numRefSeqs != numRefSeqs2)
    {
        throw new InvalidDataException($"Expected the number of reference sequences in cache 1 ({numRefSeqs}) and cache 2 ({numRefSeqs2}) to be the same.");
    }

    var combinedIntervalArrays    = new IntervalArray<ITranscript>[numRefSeqs];
    var siftPredictionsPerRef     = new Prediction[numRefSeqs][];
    var polyphenPredictionsPerRef = new Prediction[numRefSeqs][];

    PredictionHeader siftHeader;
    PredictionHeader polyphenHeader;

    using (var siftReader      = new PredictionCacheReader(FileUtilities.GetReadStream(CacheConstants.SiftPath(_inputPrefix)), PredictionCacheReader.SiftDescriptions))
    using (var siftReader2     = new PredictionCacheReader(FileUtilities.GetReadStream(CacheConstants.SiftPath(_inputPrefix2)), PredictionCacheReader.SiftDescriptions))
    using (var polyphenReader  = new PredictionCacheReader(FileUtilities.GetReadStream(CacheConstants.PolyPhenPath(_inputPrefix)), PredictionCacheReader.PolyphenDescriptions))
    using (var polyphenReader2 = new PredictionCacheReader(FileUtilities.GetReadStream(CacheConstants.PolyPhenPath(_inputPrefix2)), PredictionCacheReader.PolyphenDescriptions))
    {
        // the output headers are taken from the first pair of prediction caches
        siftHeader     = siftReader.Header;
        polyphenHeader = polyphenReader.Header;

        for (ushort refIndex = 0; refIndex < numRefSeqs; refIndex++)
        {
            var chromosome = sequenceData.refIndexToChromosome[refIndex];

            // print the chromosome name in yellow as a section header
            Console.ForegroundColor = ConsoleColor.Yellow;
            logger.WriteLine($"\n{chromosome.UcscName}:");
            Console.ResetColor();

            var combinedSift = CombinePredictions(logger, chromosome, "SIFT", siftReader, siftReader2);
            siftPredictionsPerRef[refIndex] = combinedSift.Predictions;

            var combinedPolyphen = CombinePredictions(logger, chromosome, "PolyPhen", polyphenReader, polyphenReader2);
            polyphenPredictionsPerRef[refIndex] = combinedPolyphen.Predictions;

            var firstTranscripts  = firstCache.TranscriptIntervalArrays[refIndex];
            var secondTranscripts = secondCache.TranscriptIntervalArrays[refIndex];

            combinedIntervalArrays[refIndex] = CombineTranscripts(logger, firstTranscripts, secondTranscripts,
                combinedSift.Offset, combinedPolyphen.Offset);
        }
    }

    logger.WriteLine();

    WritePredictions(logger, "SIFT", CacheConstants.SiftPath(_outputPrefix), siftHeader, siftPredictionsPerRef);
    WritePredictions(logger, "PolyPhen", CacheConstants.PolyPhenPath(_outputPrefix), polyphenHeader, polyphenPredictionsPerRef);

    // the regulatory regions of the first cache are written unchanged
    WriteTranscripts(logger, CloneHeader(firstCache.Header), combinedIntervalArrays,
        firstCache.RegulatoryRegionIntervalArrays);

    return ExitCodes.Success;
}
/// <summary>
/// Initiates the transcript cache stored under <paramref name="pathPrefix"/> and copies its
/// genome assembly and data source versions onto this provider.
/// </summary>
/// <param name="pathPrefix">cache prefix from which the transcript cache path is derived</param>
/// <param name="sequenceProvider">supplies the sequence, chromosome index dictionary, assembly and reference count</param>
public PianoAnnotationProvider(string pathPrefix, ISequenceProvider sequenceProvider)
{
    Name      = "Transcript annotation provider";
    _sequence = sequenceProvider.Sequence;

    var transcriptStream = FileUtilities.GetReadStream(CacheConstants.TranscriptPath(pathPrefix));

    _transcriptCache = InitiateCache(
        transcriptStream,
        sequenceProvider.GetChromosomeIndexDictionary(),
        sequenceProvider.GenomeAssembly,
        sequenceProvider.NumRefSeqs);

    GenomeAssembly     = _transcriptCache.GenomeAssembly;
    DataSourceVersions = _transcriptCache.DataSourceVersions;
}
/// <summary>
/// Prints the schema, data and VEP versions found in the header of the transcript cache
/// that belongs to <c>_inputPrefix</c>.
/// </summary>
/// <returns><see cref="ExitCodes.Success"/> after the versions have been printed.</returns>
private static ExitCodes ProgramExecution()
{
    var versions = GetHeaderInformation(CacheConstants.TranscriptPath(_inputPrefix));

    Console.WriteLine($"Versions: Schema: {versions.Schema}, Data: {versions.Data}, VEP: {versions.Vep}");

    return ExitCodes.Success;
}
/// <summary>
/// Stores the two input prefixes and the output prefix, together with the transcript cache
/// path derived from each of them.
/// </summary>
/// <param name="inputPrefix1">prefix of the first input cache</param>
/// <param name="inputPrefix2">prefix of the second input cache</param>
/// <param name="outputPrefix">prefix of the combined output cache</param>
public CacheCombiner(string inputPrefix1, string inputPrefix2, string outputPrefix)
{
    // first input
    _prefix1    = inputPrefix1;
    _cachePath1 = CacheConstants.TranscriptPath(inputPrefix1);

    // second input
    _prefix2    = inputPrefix2;
    _cachePath2 = CacheConstants.TranscriptPath(inputPrefix2);

    // output
    _outPrefix       = outputPrefix;
    _outputCachePath = CacheConstants.TranscriptPath(outputPrefix);
}
/// <summary>
/// Creates the transcript cache staging object from the supplied header and interval arrays
/// and writes it to the transcript cache path derived from <c>_outputPrefix</c>.
/// </summary>
/// <param name="logger">receives the progress messages</param>
/// <param name="header">header used for the staging object</param>
/// <param name="transcriptIntervalArrays">transcript interval arrays to write</param>
/// <param name="regulatoryRegionIntervalArrays">regulatory region interval arrays to write</param>
private static void WriteTranscripts(ILogger logger, CacheHeader header,
    IntervalArray<ITranscript>[] transcriptIntervalArrays,
    IntervalArray<IRegulatoryRegion>[] regulatoryRegionIntervalArrays)
{
    var transcriptStaging = TranscriptCacheStaging.GetStaging(header, transcriptIntervalArrays,
        regulatoryRegionIntervalArrays);

    logger.Write("- writing transcripts... ");

    var outputStream = FileUtilities.GetCreateStream(CacheConstants.TranscriptPath(_outputPrefix));
    transcriptStaging.Write(outputStream);

    logger.WriteLine("finished.");
}
/// <summary>
/// Defines and validates the command-line options of the SpliceAI supplementary database
/// creator and passes <c>ProgramExecution</c> to the console application builder.
/// </summary>
/// <param name="command">name of the sub-command; used to build the usage example</param>
/// <param name="commandArgs">arguments that were supplied to the sub-command</param>
/// <returns>the exit code returned by the console application builder</returns>
public static ExitCodes Run(string command, string[] commandArgs)
{
    var ops = new OptionSet
    {
        { "ref|r=",   "compressed reference sequence file", v => _compressedReference   = v },
        { "cache|c=", "Transcript cache prefix",            v => _transcriptCachePrefix = v },
        { "gene|g=",  "Gene info data file from NCBI",      v => _geneInfoFile          = v },
        { "in|i=",    "input VCF file path",                v => _inputFile             = v },
        { "out|o=",   "output directory",                   v => _outputDirectory       = v }
    };

    string commandLineExample = $"{command} [options]";

    // --cache takes a prefix; the file that must exist is the transcript cache derived from it,
    // so the labels below name the prefix and the file accordingly
    var exitCode = new ConsoleAppBuilder(commandArgs, ops)
        .Parse()
        .CheckInputFilenameExists(_compressedReference, "compressed reference sequence file name", "--ref")
        .HasRequiredParameter(_transcriptCachePrefix, "transcript cache prefix", "--cache")
        .CheckInputFilenameExists(CacheConstants.TranscriptPath(_transcriptCachePrefix), "transcript cache file", "--cache")
        .HasRequiredParameter(_inputFile, "SpliceAI VCF file", "--in")
        .CheckInputFilenameExists(_inputFile, "SpliceAI VCF file", "--in")
        .HasRequiredParameter(_geneInfoFile, "Gene info data file from NCBI", "--gene")
        .CheckInputFilenameExists(_geneInfoFile, "Gene info data file from NCBI", "--gene")
        .HasRequiredParameter(_outputDirectory, "output directory", "--out")
        .CheckDirectoryExists(_outputDirectory, "output directory", "--out")
        .SkipBanner()
        // the help text describes this command (SpliceAI), not the 1000 Genomes creator it was copied from
        .ShowHelpMenu("Creates a supplementary database containing SpliceAI scores", commandLineExample)
        .ShowErrors()
        .Execute(ProgramExecution);

    return exitCode;
}
/// <summary>
/// Entry point of the peptide annotator: defines the command-line options, validates them and
/// passes <c>Piano.ProgramExecution</c> to the console application builder.
/// </summary>
/// <param name="args">command-line arguments</param>
/// <returns>the builder's exit code converted to an integer</returns>
static int Main(string[] args)
{
    var ops = new OptionSet
    {
        { "cache|c=",  "input cache {prefix}",                        v => ConfigurationSettings.InputCachePrefix             = v },
        { "in|i=",     "input VCF {path}",                            v => ConfigurationSettings.VcfPath                      = v },
        { "out|o=",    "output {file path} ",                         v => ConfigurationSettings.OutputFileName               = v },
        { "ref|r=",    "input compressed reference sequence {path}",  v => ConfigurationSettings.RefSequencePath              = v },
        { "force-mt",  "forces to annotate mitochondria variants",    v => ConfigurationSettings.ForceMitochondrialAnnotation = v != null }
    };

    // the example must only mention options defined above: the cache prefix is passed with -c
    // (the previous text advertised a non-existent -d option)
    var commandLineExample = "-i <vcf path> -c <cache prefix> -r <ref path> -o <base output filename>";

    var piano = new Piano();

    var exitCode = new ConsoleAppBuilder(args, ops)
        .UseVersionProvider(new VersionProvider())
        .Parse()
        .CheckInputFilenameExists(ConfigurationSettings.VcfPath, "vcf", "--in", true, "-")
        .CheckInputFilenameExists(ConfigurationSettings.RefSequencePath, "reference sequence", "--ref")
        .CheckInputFilenameExists(CacheConstants.TranscriptPath(ConfigurationSettings.InputCachePrefix), "transcript cache", "--cache")
        .HasRequiredParameter(ConfigurationSettings.OutputFileName, "output file stub", "--out")
        .ShowBanner(Constants.Authors)
        .ShowHelpMenu("peptide annotation", commandLineExample)
        .ShowErrors()
        .Execute(piano.ProgramExecution);

    return (int)exitCode;
}
/// <summary>
/// Loads the transcript cache for <c>_inputPrefix</c> and lets a <c>GffCreator</c> write its
/// transcript interval arrays to <c>_outputFileName</c>.
/// </summary>
/// <returns><see cref="ExitCodes.Success"/> once the GFF output has been created.</returns>
private static ExitCodes ProgramExecution()
{
    string transcriptCachePath = CacheConstants.TranscriptPath(_inputPrefix);

    // only the first element of the returned tuple is needed here
    var (refIndexToChromosome, _, _) = SequenceHelper.GetDictionaries(_compressedReferencePath);

    var transcriptCache = TranscriptCacheHelper.GetCache(transcriptCachePath, refIndexToChromosome);
    var internalGeneIds = InternalGenes.CreateDictionary(transcriptCache.Genes);

    using (var gffWriter = new GffWriter(GZipUtilities.GetStreamWriter(_outputFileName)))
    {
        var gffCreator = new GffCreator(gffWriter, internalGeneIds);
        gffCreator.Create(transcriptCache.TranscriptIntervalArrays);
    }

    return ExitCodes.Success;
}
/// <summary>
/// Reads the transcript cache for <c>_inputPrefix</c>, writes a VCF header to <c>_outPath</c>
/// and then passes every transcript to <c>CreateVcf</c>.
/// </summary>
public void Create()
{
    using (var cacheReader = new GlobalCacheReader(CacheConstants.TranscriptPath(_inputPrefix)))
    using (var vcfWriter = GZipUtilities.GetStreamWriter(_outPath))
    {
        WriteVcfHeader(vcfWriter);

        var transcripts = cacheReader.Read().Transcripts;
        Console.Write("- found {0} transcripts... ", transcripts.Length);

        foreach (var currentTranscript in transcripts)
        {
            CreateVcf(vcfWriter, currentTranscript);
        }

        Console.WriteLine("finished.");
    }
}
/// <summary>
/// Reads the transcript cache that belongs to <paramref name="cachePrefix"/>.
/// </summary>
/// <param name="cachePrefix">prefix from which the transcript cache path is derived</param>
/// <returns>the cache that was read, or null when the cache file does not exist</returns>
public static GlobalCache LoadCache(string cachePrefix)
{
    var transcriptCachePath = CacheConstants.TranscriptPath(cachePrefix);

    // a missing file is reported to the caller as null rather than as an exception
    if (!File.Exists(transcriptCachePath))
    {
        return null;
    }

    using (var cacheReader = new GlobalCacheReader(FileUtilities.GetReadStream(transcriptCachePath)))
    {
        return cacheReader.Read();
    }
}
/// <summary>
/// Reads the transcript cache for <c>_cachePrefix</c>, registers its genes via
/// <c>AddGenesToDictionary</c> and writes one entry per transcript to
/// <paramref name="outputPath"/>.
/// </summary>
/// <param name="outputPath">path of the output file</param>
public void Create(string outputPath)
{
    using (var gffWriter = GZipUtilities.GetStreamWriter(outputPath))
    {
        Console.Write("- reading {0}... ", Path.GetFileName(_cachePrefix));
        var transcriptCache = GetCache(CacheConstants.TranscriptPath(_cachePrefix));
        Console.WriteLine("found {0:N0} transcripts.", transcriptCache.Transcripts.Length);

        AddGenesToDictionary(transcriptCache.Genes);

        Console.Write("- writing GFF entries... ");

        foreach (var currentTranscript in transcriptCache.Transcripts)
        {
            // the reference name is looked up by the transcript's reference index
            var referenceName = _referenceNames[currentTranscript.ReferenceIndex];
            Write(gffWriter, referenceName, currentTranscript);
        }

        Console.WriteLine("finished.");
    }
}
/// <summary>
/// Defines and validates the command-line options of the gnomAD gene annotation database
/// creator and passes <c>ProgramExecution</c> to the console application builder.
/// </summary>
/// <param name="command">name of the sub-command; used to build the usage example</param>
/// <param name="commandArgs">arguments that were supplied to the sub-command</param>
/// <returns>the exit code returned by the console application builder</returns>
public static ExitCodes Run(string command, string[] commandArgs)
{
    var options = new OptionSet
    {
        { "cache|c=", "Cache prefix",            value => _cachePrefix          = value },
        { "ref|r=",   "Reference sequence path", value => _referenceSequncePath = value },
        { "in|i=",    "input tsv file",          value => _inputFile            = value },
        { "out|o=",   "output directory",        value => _outputDirectory      = value }
    };

    string usageExample = $"{command} [options]";

    return new ConsoleAppBuilder(commandArgs, options)
        .Parse()
        .HasRequiredParameter(_outputDirectory, "output directory", "--out")
        .CheckDirectoryExists(_outputDirectory, "output directory", "--out")
        .HasRequiredParameter(_cachePrefix, "transcript cache prefix", "--cache")
        .CheckInputFilenameExists(CacheConstants.TranscriptPath(_cachePrefix), "transcript cache prefix", "--cache")
        .HasRequiredParameter(_referenceSequncePath, "reference sequence path", "--ref")
        .CheckInputFilenameExists(_referenceSequncePath, "reference sequence path", "--ref")
        .CheckInputFilenameExists(_inputFile, "input TSV file", "--in")
        .SkipBanner()
        .ShowHelpMenu("Creates a gene annotation database from gnomAD data", usageExample)
        .ShowErrors()
        .Execute(ProgramExecution);
}
/// <summary>
/// Loads the transcript cache for <c>_inputPrefix</c> and lets a <c>GffCreator</c>, configured
/// with the source parsed from <c>_transcriptSource</c>, write its transcript interval arrays
/// to <c>_outputFileName</c>.
/// </summary>
/// <returns><see cref="ExitCodes.Success"/> once the GFF output has been created.</returns>
private static ExitCodes ProgramExecution()
{
    Source source = ParseVepCacheDirectoryMain.GetSource(_transcriptSource);

    string transcriptCachePath = CacheConstants.TranscriptPath(_inputPrefix);

    IDictionary<ushort, IChromosome> chromosomesByIndex =
        SequenceHelper.GetDictionaries(_compressedReferencePath).refIndexToChromosome;

    TranscriptCacheData transcriptCache = TranscriptCacheHelper.GetCache(transcriptCachePath, chromosomesByIndex);
    IDictionary<IGene, int> internalGeneIds = InternalGenes.CreateDictionary(transcriptCache.Genes);

    using (var gffWriter = new GffWriter(GZipUtilities.GetStreamWriter(_outputFileName)))
    {
        var gffCreator = new GffCreator(gffWriter, internalGeneIds, source);
        gffCreator.Create(transcriptCache.TranscriptIntervalArrays);
    }

    return ExitCodes.Success;
}
/// <summary>
/// Loads the transcript cache found under <paramref name="pathPrefix"/> and creates the SIFT
/// and PolyPhen prediction cache readers that share the same prefix.
/// </summary>
/// <param name="pathPrefix">cache prefix from which the transcript, SIFT and PolyPhen paths are derived</param>
/// <param name="sequenceProvider">supplies the sequence, the chromosome dictionary and the assembly</param>
public TranscriptAnnotationProvider(string pathPrefix, ISequenceProvider sequenceProvider)
{
    Name      = "Transcript annotation provider";
    _sequence = sequenceProvider.Sequence;

    // The transcript stream was previously left open for the lifetime of the process.
    // NOTE(review): assumes InitiateCache has finished reading the stream when it returns — confirm.
    using (var transcriptStream = PersistentStreamUtils.GetReadStream(CacheConstants.TranscriptPath(pathPrefix)))
    {
        (_transcriptCache, TranscriptIntervalArrays, VepVersion) = InitiateCache(
            transcriptStream, sequenceProvider.RefIndexToChromosome, sequenceProvider.Assembly);
    }

    Assembly           = _transcriptCache.Assembly;
    DataSourceVersions = _transcriptCache.DataSourceVersions;

    // the prediction streams are handed to readers that are stored in fields, so they are not closed here
    var siftStream = PersistentStreamUtils.GetReadStream(CacheConstants.SiftPath(pathPrefix));
    _siftReader    = new PredictionCacheReader(siftStream, PredictionCacheReader.SiftDescriptions);

    var polyphenStream = PersistentStreamUtils.GetReadStream(CacheConstants.PolyPhenPath(pathPrefix));
    _polyphenReader    = new PredictionCacheReader(polyphenStream, PredictionCacheReader.PolyphenDescriptions);
}
/// <summary>
/// Loads the transcript cache for the configured cache prefix and passes each of its
/// regulatory elements to <c>WriteRegulatoryFeature</c>, writing to the configured output file.
/// </summary>
protected override void ProgramExecution()
{
    var ucscReferenceNames = GetUcscReferenceNames(ConfigurationSettings.CompressedReferencePath);

    using (var gffWriter = GZipUtilities.GetStreamWriter(ConfigurationSettings.OutputFileName))
    {
        var transcriptCachePath = CacheConstants.TranscriptPath(ConfigurationSettings.CachePrefix);

        // load the cache
        Console.Write("- reading {0}... ", Path.GetFileName(transcriptCachePath));
        var regulatoryElements = GetCache(transcriptCachePath).RegulatoryElements;
        Console.WriteLine("found {0:N0} regulatory regions. ", regulatoryElements.Length);

        Console.Write("- writing GFF entries... ");

        foreach (var regulatoryElement in regulatoryElements)
        {
            WriteRegulatoryFeature(gffWriter, ucscReferenceNames, regulatoryElement);
        }

        Console.WriteLine("finished.");
    }
}
/// <summary>
/// Opens the cache and reference streams named by <paramref name="annotatorPaths"/>, creates the
/// optional custom/supplementary providers and returns a <c>NirvanaAnnotationSource</c> that is
/// configured according to the boolean arguments in <paramref name="annotatorInfo"/>.
/// </summary>
/// <param name="annotatorInfo">supplies the boolean arguments that switch optional features on</param>
/// <param name="annotatorPaths">supplies the cache prefix, reference path and annotation directories</param>
/// <returns>the configured annotation source</returns>
public IAnnotationSource CreateAnnotationSource(IAnnotatorInfo annotatorInfo, IAnnotatorPaths annotatorPaths)
{
    var conservationScoreReader = new PhylopReader(annotatorPaths.SupplementaryAnnotation);

    // open the four input streams
    var streams = new AnnotationSourceStreams(
        FileUtilities.GetReadStream(CacheConstants.TranscriptPath(annotatorPaths.CachePrefix)),
        FileUtilities.GetReadStream(CacheConstants.SiftPath(annotatorPaths.CachePrefix)),
        FileUtilities.GetReadStream(CacheConstants.PolyPhenPath(annotatorPaths.CachePrefix)),
        FileUtilities.GetReadStream(annotatorPaths.CompressedReference));

    // each optional provider stays null when nothing was configured for it
    CustomAnnotationProvider caProvider = null;
    if (annotatorPaths.CustomAnnotation.Any())
    {
        caProvider = new CustomAnnotationProvider(annotatorPaths.CustomAnnotation);
    }

    CustomIntervalProvider ciProvider = null;
    if (annotatorPaths.CustomIntervals.Any())
    {
        ciProvider = new CustomIntervalProvider(annotatorPaths.CustomIntervals);
    }

    SupplementaryAnnotationProvider saProvider = null;
    if (annotatorPaths.SupplementaryAnnotation != null)
    {
        saProvider = new SupplementaryAnnotationProvider(annotatorPaths.SupplementaryAnnotation);
    }

    //adding the saPath because OMIM needs it
    var annotationSource = new NirvanaAnnotationSource(streams, saProvider, conservationScoreReader,
        caProvider, ciProvider, annotatorPaths.SupplementaryAnnotation);

    // apply the optional switches
    if (annotatorInfo.BooleanArguments.Contains(AnnotatorInfoCommon.ReferenceNoCall))
    {
        bool transcriptOnly = annotatorInfo.BooleanArguments.Contains(AnnotatorInfoCommon.TranscriptOnlyRefNoCall);
        annotationSource.EnableReferenceNoCalls(transcriptOnly);
    }

    if (annotatorInfo.BooleanArguments.Contains(AnnotatorInfoCommon.EnableMitochondrialAnnotation))
    {
        annotationSource.EnableMitochondrialAnnotation();
    }

    if (annotatorInfo.BooleanArguments.Contains(AnnotatorInfoCommon.ReportAllSvOverlappingTranscripts))
    {
        annotationSource.EnableReportAllSvOverlappingTranscripts();
    }

    if (annotatorInfo.BooleanArguments.Contains(AnnotatorInfoCommon.EnableLoftee))
    {
        annotationSource.AddPlugin(new Loftee());
    }

    return annotationSource;
}
/// <summary>
/// Checks that the configured input files and directories exist and that an output file name
/// was supplied, then adjusts the settings that depend on other settings.
/// </summary>
protected override void ValidateCommandLine()
{
    // a VCF path of "-" is not checked for existence
    if (ConfigurationSettings.VcfPath != "-")
    {
        CheckInputFilenameExists(ConfigurationSettings.VcfPath, "vcf", "--in");
    }

    CheckInputFilenameExists(ConfigurationSettings.CompressedReferencePath, "compressed reference sequence", "--ref");

    // all three cache files are derived from the same prefix
    string cachePrefix = ConfigurationSettings.InputCachePrefix;
    CheckInputFilenameExists(CacheConstants.TranscriptPath(cachePrefix), "transcript cache", "--cache");
    CheckInputFilenameExists(CacheConstants.SiftPath(cachePrefix), "SIFT cache", "--cache");
    CheckInputFilenameExists(CacheConstants.PolyPhenPath(cachePrefix), "PolyPhen cache", "--cache");

    CheckDirectoryExists(ConfigurationSettings.SupplementaryAnnotationDirectory, "supplementary annotation", "--sd", false);

    foreach (var annotationDirectory in ConfigurationSettings.CustomAnnotationDirectories)
    {
        CheckDirectoryExists(annotationDirectory, "custom annotation", "--ca", false);
    }

    foreach (var intervalDirectory in ConfigurationSettings.CustomIntervalDirectories)
    {
        CheckDirectoryExists(intervalDirectory, "custom interval", "--ci", false);
    }

    // if we're using stdout, it doesn't make sense to output the VCF and gVCF
    if (ConfigurationSettings.OutputFileName == "-")
    {
        ConfigurationSettings.Vcf        = false;
        ConfigurationSettings.Gvcf       = false;
        PerformanceMetrics.DisableOutput = true;
    }

    HasRequiredParameter(ConfigurationSettings.OutputFileName, "output file stub", "--out");

    // limiting reference no-calls to transcripts switches reference no-calls on
    if (ConfigurationSettings.LimitReferenceNoCallsToTranscripts)
    {
        ConfigurationSettings.EnableReferenceNoCalls = true;
    }
}
/// <summary>
/// Loads the gene symbols from the transcript cache, parses the gnomAD gene file given by
/// <c>_inputFile</c> and writes the parsed items to an NGA file in <c>_outputDirectory</c>.
/// The output file name is built from the name and version found in the ".version" file that
/// accompanies the input.
/// </summary>
/// <returns><see cref="ExitCodes.Success"/> once the NGA file has been written.</returns>
private static ExitCodes ProgramExecution()
{
    Dictionary<string, string> symbolsByGeneId;

    using (var transcriptStream = FileUtilities.GetReadStream(CacheConstants.TranscriptPath(_cachePrefix)))
    using (var cacheReader = new TranscriptCacheReader(transcriptStream))
    using (var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_referenceSequncePath)))
    {
        symbolsByGeneId = LoadGenesFromCache(referenceProvider, cacheReader);
        Console.WriteLine($"Loaded {symbolsByGeneId.Count} gene symbols from cache.");
    }

    var sourceVersion = DataSourceVersionReader.GetSourceVersion(_inputFile + ".version");
    var outputName    = $"{sourceVersion.Name}_{sourceVersion.Version}";
    var outputPath    = Path.Combine(_outputDirectory, outputName + SaCommon.NgaFileSuffix);

    using (var geneParser = new GnomadGeneParser(GZipUtilities.GetAppropriateStreamReader(_inputFile), symbolsByGeneId))
    using (var outputStream = FileUtilities.GetCreateStream(outputPath))
    using (var writer = new NgaWriter(outputStream, sourceVersion, SaCommon.GnomadGeneScoreTag, SaCommon.SchemaVersion, false))
    {
        writer.Write(geneParser.GetItems());
    }

    return ExitCodes.Success;
}
/// <summary>
/// Reads the intermediate transcript, regulatory, SIFT and PolyPhen files that share
/// <c>_inputPrefix</c>, together with the universal gene archive, and writes the SIFT,
/// PolyPhen and transcript caches using <c>_outputCacheFilePrefix</c>.
/// </summary>
/// <returns><see cref="ExitCodes.Success"/> once all three caches have been written.</returns>
private static ExitCodes ProgramExecution()
{
    // intermediate input files
    string transcriptPath = $"{_inputPrefix}.transcripts.gz";
    string siftPath       = $"{_inputPrefix}.sift.gz";
    string polyphenPath   = $"{_inputPrefix}.polyphen.gz";
    string regulatoryPath = $"{_inputPrefix}.regulatory.gz";

    (var refIndexToChromosome, var refNameToChromosome, int numRefSeqs) =
        SequenceHelper.GetDictionaries(_inputReferencePath);

    using (var transcriptReader = new MutableTranscriptReader(GZipUtilities.GetAppropriateReadStream(transcriptPath), refIndexToChromosome))
    using (var regulatoryReader = new RegulatoryRegionReader(GZipUtilities.GetAppropriateReadStream(regulatoryPath), refIndexToChromosome))
    using (var siftReader       = new PredictionReader(GZipUtilities.GetAppropriateReadStream(siftPath), refIndexToChromosome, IntermediateIoCommon.FileType.Sift))
    using (var polyphenReader   = new PredictionReader(GZipUtilities.GetAppropriateReadStream(polyphenPath), refIndexToChromosome, IntermediateIoCommon.FileType.Polyphen))
    using (var geneReader       = new UgaGeneReader(GZipUtilities.GetAppropriateReadStream(ExternalFiles.UniversalGeneFilePath), refNameToChromosome))
    {
        // everything that ends up in the output header comes from the transcript file header
        var assembly           = transcriptReader.Header.Assembly;
        var transcriptSource   = transcriptReader.Header.Source;
        long vepReleaseTicks   = transcriptReader.Header.VepReleaseTicks;
        ushort vepVersion      = transcriptReader.Header.VepVersion;

        Logger.Write("- loading universal gene archive file... ");
        var genes      = geneReader.GetGenes();
        var geneForest = CreateGeneForest(genes, numRefSeqs, assembly);
        Logger.WriteLine($"{genes.Length:N0} loaded.");

        Logger.Write("- loading regulatory region file... ");
        var regulatoryRegions = regulatoryReader.GetRegulatoryRegions();
        Logger.WriteLine($"{regulatoryRegions.Length:N0} loaded.");

        Logger.Write("- loading transcript file... ");
        var transcripts           = transcriptReader.GetTranscripts();
        var transcriptsByRefIndex = transcripts.GetMultiValueDict(x => x.Chromosome.Index);
        Logger.WriteLine($"{transcripts.Length:N0} loaded.");

        MarkCanonicalTranscripts(transcripts);

        // prediction caches
        var predictionBuilder = new PredictionCacheBuilder(assembly);
        var predictionCaches  = predictionBuilder.CreatePredictionCaches(transcriptsByRefIndex, siftReader, polyphenReader, numRefSeqs);

        Logger.Write("- writing SIFT prediction cache... ");
        var siftOutput = FileUtilities.GetCreateStream(CacheConstants.SiftPath(_outputCacheFilePrefix));
        predictionCaches.Sift.Write(siftOutput);
        Logger.WriteLine("finished.");

        Logger.Write("- writing PolyPhen prediction cache... ");
        var polyphenOutput = FileUtilities.GetCreateStream(CacheConstants.PolyPhenPath(_outputCacheFilePrefix));
        predictionCaches.PolyPhen.Write(polyphenOutput);
        Logger.WriteLine("finished.");

        // transcript cache
        var transcriptBuilder = new TranscriptCacheBuilder(assembly, transcriptSource, vepReleaseTicks, vepVersion);
        var transcriptStaging = transcriptBuilder.CreateTranscriptCache(transcripts, regulatoryRegions, geneForest, numRefSeqs);

        Logger.Write("- writing transcript cache... ");
        var transcriptOutput = FileUtilities.GetCreateStream(CacheConstants.TranscriptPath(_outputCacheFilePrefix));
        transcriptStaging.Write(transcriptOutput);
        Logger.WriteLine("finished.");
    }

    return ExitCodes.Success;
}
/// <summary>
/// Reads the transcript cache that belongs to <paramref name="transcriptCachePrefix"/>.
/// </summary>
/// <param name="refIndexToChromosome">chromosome dictionary passed on to the cache reader</param>
/// <param name="transcriptCachePrefix">prefix from which the transcript cache path is derived</param>
/// <returns>the transcript cache data returned by the reader</returns>
public static TranscriptCacheData GetTranscriptData(IDictionary<ushort, IChromosome> refIndexToChromosome,
    string transcriptCachePrefix)
{
    var cacheStream = FileUtilities.GetReadStream(CacheConstants.TranscriptPath(transcriptCachePrefix));

    using (var cacheReader = new TranscriptCacheReader(cacheStream))
    {
        return cacheReader.Read(refIndexToChromosome);
    }
}
public void TranscriptPath_Null_WithNullPrefix()
{
    // a null prefix is expected to yield a null path
    var observed = CacheConstants.TranscriptPath(null);

    Assert.Null(observed);
}
/// <summary>
/// Reads the transcript cache and the gene info file, maps SpliceAI gene symbols to Nirvana gene
/// symbols, parses the SpliceAI VCF given by <c>_inputFile</c> and writes the parsed items to a
/// supplementary annotation file and its index in <c>_outputDirectory</c>.
/// </summary>
/// <returns><see cref="ExitCodes.Success"/> once the output files have been written.</returns>
private static ExitCodes ProgramExecution()
{
    // The reference provider was previously never disposed, which kept the stream opened for
    // it alive until process exit; it is now released when the method finishes.
    using (var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference)))
    {
        TranscriptCacheData transcriptData;
        using (var transcriptCacheReader = new TranscriptCacheReader(FileUtilities.GetReadStream(CacheConstants.TranscriptPath(_transcriptCachePrefix))))
        {
            transcriptData = transcriptCacheReader.Read(referenceProvider.RefIndexToChromosome);
        }

        var spliceIntervals      = SpliceUtilities.GetSpliceIntervals(referenceProvider, transcriptData);
        var nirEnstToGeneSymbols = SpliceUtilities.GetEnstToGeneSymbols(referenceProvider, transcriptData);

        Dictionary<string, string> spliceAiEnstToGeneSymbols;
        using (var reader = new StreamReader(GZipUtilities.GetAppropriateReadStream(_geneInfoFile)))
        {
            spliceAiEnstToGeneSymbols = SpliceUtilities.GetSpliceAiGeneSymbols(reader);
        }

        var spliceAiToNirvanaGeneSymbols = SpliceUtilities.GetSymbolMapping(spliceAiEnstToGeneSymbols, nirEnstToGeneSymbols);
        Console.WriteLine($"Mapped {spliceAiToNirvanaGeneSymbols.Count} spliceAI gene symbols to Nirvana gene symbols (out of {spliceAiEnstToGeneSymbols.Count})");

        // the output file name is built from the name and version in the accompanying ".version" file
        var version        = DataSourceVersionReader.GetSourceVersion(_inputFile + ".version");
        string outFileName = $"{version.Name}_{version.Version}";

        using (var spliceAiParser = new SpliceAiParser(
            GZipUtilities.GetAppropriateReadStream(_inputFile),
            referenceProvider, spliceIntervals, spliceAiToNirvanaGeneSymbols))
        using (var nsaStream   = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix)))
        using (var indexStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + SaCommon.SaFileSuffix + SaCommon.IndexSufix)))
        using (var nsaWriter   = new NsaWriter(nsaStream, indexStream, version, referenceProvider, SaCommon.SpliceAiTag, true, true, SaCommon.SchemaVersion, false))
        {
            nsaWriter.Write(spliceAiParser.GetItems());
        }

        Console.WriteLine($"Total number of entries from Splice AI: {SpliceAiParser.Count}");

        return ExitCodes.Success;
    }
}
/// <summary>
/// Reads the transcript cache and the gene symbol synonyms, parses the SpliceAI VCF given by
/// <c>_inputFile</c> and writes the parsed items to a supplementary annotation file and its
/// index in <c>_outputDirectory</c>.
/// </summary>
/// <returns><see cref="ExitCodes.Success"/> once the output files have been written.</returns>
private static ExitCodes ProgramExecution()
{
    var sequenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference));

    TranscriptCacheData cacheData;
    using (var cacheReader = new TranscriptCacheReader(FileUtilities.GetReadStream(CacheConstants.TranscriptPath(_transcriptCachePrefix))))
    {
        cacheData = cacheReader.Read(sequenceProvider.RefIndexToChromosome);
    }

    var spliceIntervals = SpliceUtilities.GetSpliceIntervals(sequenceProvider, cacheData);
    var geneForest      = SpliceUtilities.GetGeneForest(cacheData);
    Console.WriteLine("Loaded transcripts and generated splice intervals.");

    Dictionary<string, List<string>> synonymsBySymbol;
    using (var geneInfoParser = new GeneInfoParser(GZipUtilities.GetAppropriateStreamReader(_geneInfoFile)))
    {
        synonymsBySymbol = geneInfoParser.GetGeneSymbolSynonyms();
    }

    Console.WriteLine("Loaded gene symbol synonyms");

    // the output file name is built from the name and version in the accompanying ".version" file
    var sourceVersion = DataSourceVersionReader.GetSourceVersion(_inputFile + ".version");
    string outputName = $"{sourceVersion.Name}_{sourceVersion.Version}";

    using (var parser = new SpliceAiParser(
        GZipUtilities.GetAppropriateReadStream(_inputFile),
        sequenceProvider, spliceIntervals, geneForest, synonymsBySymbol))
    using (var nsaStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outputName + SaCommon.SaFileSuffix)))
    using (var indexStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outputName + SaCommon.SaFileSuffix + SaCommon.IndexSufix)))
    {
        var writer = new NsaWriter(
            new ExtendedBinaryWriter(nsaStream),
            new ExtendedBinaryWriter(indexStream),
            sourceVersion, sequenceProvider, SaCommon.SpliceAiTag, true, true, SaCommon.SchemaVersion, false);

        writer.Write(parser.GetItems());
    }

    Console.WriteLine($"Total number of entries from Splice AI: {SpliceAiParser.Count}");

    return ExitCodes.Success;
}
/// <summary>
/// Entry point of the annotator: defines the command-line options, validates them and passes
/// <c>ProgramExecution</c> to the console application builder.
/// </summary>
/// <param name="args">command-line arguments</param>
/// <returns>the builder's exit code converted to an integer</returns>
public static int Main(string[] args)
{
    var options = new OptionSet
    {
        { "cache|c=",              "input cache {prefix}",                         value => _inputCachePrefix = value },
        { "in|i=",                 "input VCF {path}",                             value => _vcfPath = value },
        { "out|o=",                "output {file path}",                           value => _outputFileName = value },
        { "ref|r=",                "input compressed reference sequence {path}",   value => _refSequencePath = value },
        { "sd=",                   "input supplementary annotation {directory}",   value => SupplementaryAnnotationDirectories.Add(value) },
        { "force-mt",              "forces to annotate mitochondrial variants",    value => _forceMitochondrialAnnotation = value != null },
        { "disable-recomposition", "don't recompose function relevant variants",   value => _disableRecomposition = value != null },
        { "legacy-vids",           "enables support for legacy VIDs",              value => _useLegacyVids = value != null },
        { "enable-dq",             "report DQ from VCF samples field",             value => _enableDq = value != null },
        { "str=",                  "user provided STR annotation TSV file",        value => _customStrTsv = value }
    };

    const string usageExample = "-i <vcf path> -c <cache prefix> --sd <sa dir> -r <ref path> -o <base output filename>";

    var builder = new ConsoleAppBuilder(args, options);

    var exitCode = builder
        .UseVersionProvider(new VersionProvider())
        .Parse()
        .CheckInputFilenameExists(_vcfPath, "vcf", "--in", true, "-")
        .CheckInputFilenameExists(_refSequencePath, "reference sequence", "--ref")
        .CheckInputFilenameExists(CacheConstants.TranscriptPath(_inputCachePrefix), "transcript cache", "--cache")
        .CheckInputFilenameExists(CacheConstants.SiftPath(_inputCachePrefix), "SIFT cache", "--cache")
        .CheckInputFilenameExists(CacheConstants.PolyPhenPath(_inputCachePrefix), "PolyPhen cache", "--cache")
        .CheckInputFilenameExists(_customStrTsv, "custom STR annotation TSV", "--str", false)
        .HasRequiredParameter(_outputFileName, "output file stub", "--out")
        .DisableOutput(_outputFileName == "-")
        .ShowBanner(Constants.Authors)
        .ShowHelpMenu("Annotates a set of variants", usageExample)
        .ShowErrors()
        .Execute(ProgramExecution);

    return (int)exitCode;
}
/// <summary>
/// Reads the transcript cache that belongs to <paramref name="inputCachePrefix"/> and wires up
/// a <c>Recomposer</c> from the gene interval forest, codon information, variant generator and
/// position buffer derived from it.
/// </summary>
/// <param name="sequenceProvider">supplies the chromosome dictionary and is passed on to the created objects</param>
/// <param name="inputCachePrefix">prefix from which the transcript cache path is derived</param>
/// <returns>the newly created recomposer</returns>
public static IRecomposer Create(ISequenceProvider sequenceProvider, string inputCachePrefix)
{
    var cacheStream = FileUtilities.GetReadStream(CacheConstants.TranscriptPath(inputCachePrefix));
    var transcriptIntervalArrays = ReadWriteUtilities.ReadCache(cacheStream, sequenceProvider.RefIndexToChromosome);

    // only the first element of the returned tuple is needed here
    var (geneIntervalForest, _) = ReadWriteUtilities.GetIntervalAndTranscriptsForeachGene(transcriptIntervalArrays);

    var codonInfoProvider = CodonInfoProvider.CreateCodonInfoProvider(transcriptIntervalArrays);
    var variantGenerator  = new VariantGenerator(sequenceProvider);
    var positionBuffer    = new PositionBuffer(codonInfoProvider, geneIntervalForest);
    var positionProcessor = new PositionProcessor(positionBuffer, codonInfoProvider, variantGenerator);

    return new Recomposer(positionProcessor, sequenceProvider);
}