/// <summary>
/// Builds the <see cref="AnnotationResources"/> for the supplied configuration and stores the
/// tabix virtual position obtained for the configured annotation range.
/// </summary>
/// <param name="annotationConfig">
/// Configuration providing the genome assembly, supplementary/custom annotation settings,
/// the tabix URL and the annotation range. When <c>supplementaryAnnotations</c> is null,
/// the manifest URL is requested for "latest".
/// </param>
/// <returns>The annotation resources with <c>InputStartVirtualPosition</c> assigned.</returns>
private static AnnotationResources GetAnnotationResources(AnnotationConfig annotationConfig)
{
    var assembly = GenomeAssemblyHelper.Convert(annotationConfig.genomeAssembly);

    string cachePrefix = LambdaUrlHelper.GetCacheFolder()
        .UrlCombine(assembly.ToString())
        .UrlCombine(LambdaUrlHelper.DefaultCacheSource);
    string referenceUrl = LambdaUrlHelper.GetRefUrl(assembly);
    string manifestUrl  = LambdaUtilities.GetManifestUrl(annotationConfig.supplementaryAnnotations ?? "latest", assembly);
    var performanceMetrics = new PerformanceMetrics();

    // the manifest list is only created when a manifest URL is available
    List<string> manifestUrls = null;
    if (manifestUrl != null)
    {
        manifestUrls = new List<string> { manifestUrl };
    }

    var resources = new AnnotationResources(
        referenceUrl,
        cachePrefix,
        manifestUrls,
        annotationConfig.customAnnotations,
        annotationConfig.customStrUrl,
        false,
        false,
        false,
        performanceMetrics);

    using (var tabixStream = PersistentStreamUtils.GetReadStream(annotationConfig.tabixUrl))
    {
        resources.InputStartVirtualPosition = GetTabixVirtualPosition(
            annotationConfig.annotationRange,
            tabixStream,
            resources.SequenceProvider.RefNameToChromosome);
    }

    Logger.WriteLine($"Tabix position :{resources.InputStartVirtualPosition}");
    return resources;
}
/// <summary>
/// Validates the request configuration and attempts to load the custom STR file it references.
/// </summary>
/// <param name="config">Validation request; supplies the id, genome assembly and custom STR URL.</param>
/// <param name="context">Lambda context whose logger is used for log output.</param>
/// <returns>
/// The success output for <c>config.id</c>, or the result produced by <c>HandleException</c>
/// when any exception is thrown while validating or loading.
/// </returns>
public ValidationResult Run(ValidationConfig config, ILambdaContext context)
{
    string snsTopicArn = null;

    try
    {
        LogUtilities.UpdateLogger(context.Logger, null);
        LogUtilities.LogLambdaInfo(context, CommandLineUtilities.InformationalVersion);
        LogUtilities.LogObject("Config", config);

        var loggedKeys = new[] { LambdaUrlHelper.UrlBaseEnvironmentVariableName, LambdaUtilities.SnsTopicKey };
        LogUtilities.Log(loggedKeys);

        LambdaUtilities.GarbageCollect();

        // the SNS topic is resolved before validation so that later failures can reference it
        snsTopicArn = LambdaUtilities.GetEnvironmentVariable(LambdaUtilities.SnsTopicKey);

        config.Validate();

        GenomeAssembly assembly = GenomeAssemblyHelper.Convert(config.genomeAssembly);
        string referenceUrl     = LambdaUrlHelper.GetRefUrl(assembly);
        var sequenceProvider    = ProviderUtilities.GetSequenceProvider(referenceUrl);

        using (var strStream = PersistentStreamUtils.GetReadStream(config.customStrUrl))
        {
            TryLoadStrFile(strStream, assembly, sequenceProvider);
        }
    }
    catch (Exception exception)
    {
        return HandleException(config.id, exception, snsTopicArn);
    }

    return GetSuccessOutput(config.id);
}
// Converting the unrecognized assembly name "dummy" is expected to throw a UserErrorException.
public void Convert_GenomeAssemblyDoesNotExist()
{
    Assert.Throws<UserErrorException>(() =>
    {
        GenomeAssemblyHelper.Convert("dummy");
    });
}
/// <summary>
/// Initializes the header from the supplied values. The assembly name is converted to a
/// <c>GenomeAssembly</c> value via <c>GenomeAssemblyHelper.Convert</c>.
/// </summary>
/// <param name="name">Value stored in <c>Name</c>.</param>
/// <param name="assembly">Genome assembly name to convert and store in <c>GenomeAssembly</c>.</param>
/// <param name="version">Value stored in <c>Version</c>.</param>
/// <param name="releaseDate">Value stored in <c>ReleaseDate</c>.</param>
/// <param name="description">Value stored in <c>Description</c>.</param>
public SaHeader(string name, string assembly, string version, string releaseDate, string description)
{
    Name           = name;
    GenomeAssembly = GenomeAssemblyHelper.Convert(assembly);
    Version        = version;
    Description    = description;
    ReleaseDate    = releaseDate;
}
/// <summary>
/// Validates the header of a custom STR file: the first header line must be
/// <c>#assembly=...</c> and match <paramref name="desiredGenomeAssembly"/>, the second must
/// start with <c>#chrom</c> (compared case-insensitively).
/// </summary>
/// <param name="reader">
/// Reader positioned at the start of the file. Leading empty lines are skipped. When the
/// column-label line is found, the method returns without reading any further line.
/// </param>
/// <param name="desiredGenomeAssembly">The genome assembly the file is required to declare.</param>
/// <exception cref="UserErrorException">
/// The file is empty, the assembly line is missing, malformed or does not match, the second
/// header line is not the column-label line, or more than two header lines are present.
/// </exception>
private static void CheckHeader(TextReader reader, GenomeAssembly desiredGenomeAssembly)
{
    string line = reader.ReadLine();

    // skip leading empty lines
    while (line == string.Empty)
    {
        line = reader.ReadLine();
    }

    if (line == null)
    {
        throw new UserErrorException("The custom STR file provided is empty.");
    }

    GenomeAssembly genomeAssembly = GenomeAssembly.Unknown;
    var headerNum = 0;

    while (line != null && line.StartsWith("#"))
    {
        headerNum++;
        line = line.Trim();

        var columns = line.Split('=', '\t');
        // invariant lowering: header tags are machine-readable, not culture-dependent text
        var tag = columns[0].ToLowerInvariant();

        switch (headerNum)
        {
            case 1:
                // a bare "#assembly" (no '=' or tab) yields a single column; report it as a
                // user error instead of failing with an index-out-of-range exception
                if (tag != "#assembly" || columns.Length < 2)
                {
                    throw new UserErrorException("First line in STR data file has to contain assembly. For example: #assembly=GRCh38");
                }

                genomeAssembly = GenomeAssemblyHelper.Convert(columns[1]);
                if (genomeAssembly != desiredGenomeAssembly)
                {
                    throw new UserErrorException($"Expected {desiredGenomeAssembly} in the STR data file, but found {genomeAssembly}");
                }

                break;

            case 2:
                if (tag != "#chrom")
                {
                    throw new UserErrorException("Second line in TSV has to contain column labels. For example: #Chrom\tStart\tEnd\tPhenotype\t...");
                }

                return; // we should not read the next line

            default:
                throw new UserErrorException($"Unexpected header tag observed:\n{line}");
        }

        line = reader.ReadLine();
    }

    // NOTE(review): when only the assembly line is present, the loop above has already
    // consumed the first non-header line before exiting.
    if (genomeAssembly == GenomeAssembly.Unknown)
    {
        throw new UserErrorException("Genome assembly not specified in STR header. It is a required field.");
    }
}
/// <summary>
/// Creates the compressed reference sequence file: loads the previous reference names, reads
/// the genome assembly report, checks reference index contiguity, reads cytogenetic bands and
/// FASTA files, checks chrY padding, applies 2-bit compression and writes the output file.
/// Progress is reported on the console.
/// </summary>
/// <returns><c>ExitCodes.Success</c> when all steps complete without throwing.</returns>
private static ExitCodes ProgramExecution()
{
    var genomeAssembly = GenomeAssemblyHelper.Convert(_genomeAssembly);

    Console.Write("- loading previous reference names... ");
    List<IChromosome> oldChromosomes = ReferenceNamesReader.GetReferenceNames(FileUtilities.GetReadStream(_referenceNamesPath));
    Console.WriteLine("finished.");

    IDictionary<string, IChromosome> oldRefNameToChromosome = ReferenceDictionaryUtils.GetRefNameToChromosome(oldChromosomes);

    Console.Write("- reading the genome assembly report... ");
    List<IChromosome> chromosomes = AssemblyReader.GetChromosomes(FileUtilities.GetReadStream(_genomeAssemblyReportPath), oldRefNameToChromosome, oldChromosomes.Count);
    int numRefSeqs = chromosomes.Count;
    Console.WriteLine($"{numRefSeqs} references found.");

    Console.Write("- checking reference index contiguity... ");
    CheckReferenceIndexContiguity(chromosomes, oldChromosomes);
    Console.WriteLine("contiguous.");

    IDictionary<string, IChromosome> refNameToChromosome = ReferenceDictionaryUtils.GetRefNameToChromosome(chromosomes);

    Console.Write("- reading cytogenetic bands... ");
    List<Band>[] cytogeneticBandsByRef = CytogeneticBandsReader.GetCytogeneticBands(FileUtilities.GetReadStream(_cytogeneticBandPath), numRefSeqs, refNameToChromosome);
    Console.WriteLine("finished.");

    Console.WriteLine("- reading FASTA files:");
    List<FastaSequence> fastaSequences = GetFastaSequences(_fastaPrefix, refNameToChromosome);
    long genomeLength = GetGenomeLength(fastaSequences);
    Console.WriteLine($"- genome length: {genomeLength:N0}");

    Console.Write("- check if chrY has PAR masking... ");
    CheckChrYPadding(fastaSequences);
    Console.WriteLine("unmasked.");

    Console.Write("- applying 2-bit compression... ");
    List<Creation.ReferenceSequence> referenceSequences = CreateReferenceSequences(fastaSequences, cytogeneticBandsByRef);
    Console.WriteLine("finished.");

    // the progress message is a single-line literal; a regular string literal cannot
    // contain a line break
    Console.Write("- creating reference sequence file... ");
    CreateReferenceSequenceFile(genomeAssembly, _patchLevel, chromosomes, referenceSequences);
    long fileSize = new FileInfo(_outputCompressedPath).Length;
    Console.WriteLine($"{fileSize:N0} bytes");

    return ExitCodes.Success;
}
/// <summary>
/// Reads the first line of the STR data file, takes the text after the first '=' as the
/// genome assembly, verifies it matches <paramref name="desiredGenomeAssembly"/>, and then
/// skips the following header fields line.
/// </summary>
/// <param name="reader">Reader positioned at the start of the STR data file.</param>
/// <param name="desiredGenomeAssembly">The genome assembly the file is required to declare.</param>
/// <exception cref="InvalidDataException">
/// The file is empty, the first line has no '=' separated value, or the declared assembly
/// differs from <paramref name="desiredGenomeAssembly"/>.
/// </exception>
private static void CheckHeader(TextReader reader, GenomeAssembly desiredGenomeAssembly)
{
    string line = reader.ReadLine();

    // ReadLine returns null at end of input; report that rather than dereferencing null
    if (line == null)
    {
        throw new InvalidDataException("The STR data file is empty; expected a genome assembly header line.");
    }

    string[] fields = line.OptimizedSplit('=');

    // without an '=' there is no second field to read the assembly from
    if (fields.Length < 2)
    {
        throw new InvalidDataException($"Expected a genome assembly header (key=value) in the STR data file, but found: {line}");
    }

    var genomeAssembly = GenomeAssemblyHelper.Convert(fields[1]);
    if (genomeAssembly != desiredGenomeAssembly)
    {
        throw new InvalidDataException($"Expected {desiredGenomeAssembly} in the STR data file, but found {genomeAssembly}");
    }

    // skip the header fields line
    reader.ReadLine();
}
/// <summary>
/// Annotates the single variant described by <paramref name="config"/> and returns the
/// result as a stream.
/// </summary>
/// <param name="config">Request providing the id, genome assembly, annotation settings and variant.</param>
/// <param name="context">Lambda context whose logger is used for log output.</param>
/// <returns>
/// The stream created by <c>SingleResult.Create</c> on success; otherwise the stream produced
/// by <c>ExceptionHandler.GetStream</c> for the caught exception.
/// </returns>
// ReSharper disable once UnusedMember.Global
public Stream Run(SingleConfig config, ILambdaContext context)
{
    string snsTopicArn = null;

    try
    {
        LogUtilities.UpdateLogger(context.Logger, null);
        LogUtilities.LogLambdaInfo(context, CommandLineUtilities.InformationalVersion);
        LogUtilities.LogObject("Config", config);

        var loggedKeys = new[] { LambdaUrlHelper.UrlBaseEnvironmentVariableName, LambdaUtilities.SnsTopicKey };
        LogUtilities.Log(loggedKeys);

        LambdaUtilities.GarbageCollect();

        snsTopicArn = LambdaUtilities.GetEnvironmentVariable(LambdaUtilities.SnsTopicKey);

        config.Validate();

        GenomeAssembly assembly = GenomeAssemblyHelper.Convert(config.genomeAssembly);
        var cacheSettings       = new CacheConfiguration(assembly, config.supplementaryAnnotations, config.vepVersion);
        bool needsPreload       = !string.IsNullOrEmpty(config.supplementaryAnnotations);

        AnnotationResources resources = GetAndCacheAnnotationResources(config, cacheSettings);

        // mitochondrial annotation is enabled for every assembly except hg19
        if (assembly != GenomeAssembly.hg19)
        {
            resources.Annotator.EnableMitochondrialAnnotation();
        }

        (IPosition position, string[] sampleNames) =
            config.GetPositionAndSampleNames(resources.SequenceProvider, resources.RefMinorProvider);

        if (position.Chromosome.IsEmpty())
        {
            throw new UserErrorException($"An unknown chromosome was specified ({config.variant.chromosome})");
        }

        string annotationJson = GetPositionAnnotation(position, resources, sampleNames, needsPreload);
        return SingleResult.Create(config.id, LambdaUrlHelper.SuccessMessage, annotationJson);
    }
    catch (Exception exception)
    {
        return ExceptionHandler.GetStream(config.id, snsTopicArn, exception);
    }
}
/// <summary>
/// Creates the <see cref="AnnotationResources"/> for a single-annotation request and tags
/// them with the annotator version ("Nirvana " followed by the assembly version).
/// </summary>
/// <param name="lambdaConfig">Request providing the genome assembly, VEP version and custom annotations.</param>
/// <returns>The newly created annotation resources.</returns>
private static AnnotationResources GetAnnotationResources(SingleConfig lambdaConfig)
{
    GenomeAssembly targetAssembly = GenomeAssemblyHelper.Convert(lambdaConfig.genomeAssembly);

    string cachePrefix  = CacheUtilities.GetCachePathPrefix(lambdaConfig.vepVersion, targetAssembly);
    string referenceUrl = LambdaUrlHelper.GetRefUrl(targetAssembly);
    string versionTag   = "Nirvana " + CommandLineUtilities.GetVersion(Assembly.GetAssembly(typeof(SingleAnnotationLambda)));
    var performanceMetrics = new PerformanceMetrics();

    Logger.WriteLine($"Cache prefix: {cachePrefix}");

    //todo: get customStrTsv from lambdaConfig
    var resources = new AnnotationResources(
        referenceUrl,
        cachePrefix,
        null,
        lambdaConfig.customAnnotations,
        null,
        false,
        false,
        false,
        performanceMetrics);

    resources.AnnotatorVersionTag = versionTag;
    return resources;
}
/// <summary>
/// Parses the genome assembly from an <c>#assembly=...</c> header line and verifies that it
/// is one of the allowed assemblies.
/// </summary>
/// <param name="line">Header line; only its first tab-separated column is inspected.</param>
/// <param name="allowedGenomeAssemblies">The set of assemblies that are accepted.</param>
/// <returns>The parsed genome assembly.</returns>
/// <exception cref="UserErrorException">
/// No assembly value could be extracted from the line, or the assembly is not in
/// <paramref name="allowedGenomeAssemblies"/>.
/// </exception>
public static GenomeAssembly ParseGenomeAssembly(string line, HashSet<GenomeAssembly> allowedGenomeAssemblies)
{
    CheckPrefix(line, "#assembly", "second");

    string leadingColumn = line.OptimizedSplit('\t')[0];
    (_, string assemblyName) = leadingColumn.OptimizedKeyValue();

    if (assemblyName == null)
    {
        throw new UserErrorException("Please provide the genome assembly in the format: #assembly=genomeAssembly.");
    }

    var parsedAssembly = GenomeAssemblyHelper.Convert(assemblyName);

    if (allowedGenomeAssemblies.Contains(parsedAssembly))
    {
        return parsedAssembly;
    }

    throw new UserErrorException("Only GRCh37 and GRCh38 are accepted for genome assembly.");
}
/// <summary>
/// Creates a compressed reference sequence file for a single FASTA sequence: reads the genome
/// assembly report and the FASTA file, gathers the cytogenetic bands for that sequence,
/// applies 2-bit compression and writes the output file. Progress is reported on the console.
/// </summary>
/// <returns><c>ExitCodes.Success</c> when all steps complete without throwing.</returns>
private static ExitCodes ProgramExecution()
{
    var assembly = GenomeAssemblyHelper.Convert(_genomeAssembly);

    Console.Write("- reading the genome assembly report... ");
    // no previous reference names exist here, so an empty lookup and a count of 0 are passed
    var emptyRefNameLookup = new Dictionary<string, IChromosome>();
    List<IChromosome> allChromosomes = AssemblyReader.GetChromosomes(
        FileUtilities.GetReadStream(_genomeAssemblyReportPath),
        emptyRefNameLookup,
        0);
    int referenceCount = allChromosomes.Count;
    Console.WriteLine($"{referenceCount} references found.");

    IDictionary<string, IChromosome> refNameToChromosome = ReferenceDictionaryUtils.GetRefNameToChromosome(allChromosomes);

    Console.Write("- reading FASTA file... ");
    var fasta = GetFastaSequence(_fastaPath, refNameToChromosome);
    Console.WriteLine($"- sequence length: {fasta.Bases.Length:N0}");

    Console.Write("- reading cytogenetic bands... ");
    List<Band> bands = GetCytogeneticBands(fasta.Chromosome.Index, referenceCount, refNameToChromosome);
    Console.WriteLine("finished.");

    Console.Write("- applying 2-bit compression... ");
    var compressedSequence = CreateReferenceSequence(fasta, bands);
    Console.WriteLine("finished.");

    Console.Write("- creating reference sequence file... ");
    // only the chromosome of the FASTA sequence is written to the output
    var singleChromosome = new List<IChromosome> { fasta.Chromosome };
    CreateReferenceSequenceFile(assembly, singleChromosome, compressedSequence);
    long outputSize = new FileInfo(_outputCompressedPath).Length;
    Console.WriteLine($"{outputSize:N0} bytes");

    return ExitCodes.Success;
}
/// <summary>
/// Validates the request, checks that its genome assembly is supported, derives the
/// annotation ranges and produces the overall result. The result is logged before returning.
/// </summary>
/// <param name="config">Request configuration; supplies the genome assembly among other settings.</param>
/// <param name="context">Lambda context whose logger is used for log output.</param>
/// <returns>
/// The result from <c>GetNirvanaResult</c>, or the result produced by <c>HandleException</c>
/// when any exception is thrown.
/// </returns>
// ReSharper disable once UnusedMember.Global
public NirvanaResult Run(NirvanaConfig config, ILambdaContext context)
{
    NirvanaResult result;
    string snsTopicArn = null;
    var runLog = new StringBuilder();

    try
    {
        LogUtilities.UpdateLogger(context.Logger, runLog);
        LogUtilities.LogLambdaInfo(context, CommandLineUtilities.InformationalVersion);
        LogUtilities.LogObject("Config", config);

        var loggedKeys = new[]
        {
            LambdaUrlHelper.UrlBaseEnvironmentVariableName,
            LambdaUtilities.SnsTopicKey,
            "annotation_lambda_arn"
        };
        LogUtilities.Log(loggedKeys);

        LambdaUtilities.GarbageCollect();

        snsTopicArn = LambdaUtilities.GetEnvironmentVariable(LambdaUtilities.SnsTopicKey);
        string annotationLambdaArn = LambdaUtilities.GetEnvironmentVariable(AnnotationLambdaKey);

        config.Validate();

        var assembly = GenomeAssemblyHelper.Convert(config.genomeAssembly);
        bool isSupported = _supportedAssemblies.Contains(assembly);
        if (!isSupported)
        {
            throw new UserErrorException($"Unsupported assembly: {config.genomeAssembly}");
        }

        IEnumerable<AnnotationRange> ranges = GetAnnotationRanges(config, assembly);
        result = GetNirvanaResult(ranges, config, annotationLambdaArn, context, runLog, snsTopicArn);
    }
    catch (Exception exception)
    {
        result = HandleException(runLog, config, exception, snsTopicArn);
    }

    LogUtilities.LogObject("Result", result);
    return result;
}
/// <summary>
/// Creates the pre-cache files (SIFT, PolyPhen, transcripts, regulatory regions and a
/// transcript merge log) by parsing the VEP directory of each reference sequence.
/// Reference sequences without a VEP directory get empty SIFT and PolyPhen entries.
/// </summary>
/// <returns><c>ExitCodes.Success</c> when all reference sequences are processed without throwing.</returns>
private static ExitCodes ProgramExecution()
{
    var source           = GetSource(_transcriptSource);
    var referenceReader  = new CompressedSequenceReader(FileUtilities.GetReadStream(_inputReferencePath));
    var vepRoot          = new VepRootDirectory(referenceReader.RefNameToChromosome);
    var vepDirByRefIndex = vepRoot.GetRefIndexToVepDir(_inputVepDirectory);
    var assembly         = GenomeAssemblyHelper.Convert(_genomeAssembly);
    long releaseTicks    = DateTime.Parse(_vepReleaseDate).Ticks;
    var idToGenbank      = GetIdToGenbank(assembly, source);

    // =========================
    // create the pre-cache file
    // =========================

    // process each VEP directory
    int referenceCount = referenceReader.NumRefSeqs;
    var ioHeader = new IntermediateIoHeader(_vepVersion, releaseTicks, source, assembly, referenceCount);

    string siftFile       = _outputStub + ".sift.gz";
    string polyphenFile   = _outputStub + ".polyphen.gz";
    string transcriptFile = _outputStub + ".transcripts.gz";
    string regulatoryFile = _outputStub + ".regulatory.gz";

    using (var mergeLog         = new TranscriptMergerLogger(FileUtilities.GetCreateStream(_outputStub + ".merge_transcripts.log")))
    using (var siftOut          = new PredictionWriter(GZipUtilities.GetStreamWriter(siftFile), ioHeader, IntermediateIoCommon.FileType.Sift))
    using (var polyphenOut      = new PredictionWriter(GZipUtilities.GetStreamWriter(polyphenFile), ioHeader, IntermediateIoCommon.FileType.Polyphen))
    using (var transcriptOut    = new MutableTranscriptWriter(GZipUtilities.GetStreamWriter(transcriptFile), ioHeader))
    using (var regulatoryOut    = new RegulatoryRegionWriter(GZipUtilities.GetStreamWriter(regulatoryFile), ioHeader))
    {
        var vepParser        = new VepCacheParser(source);
        var noPredictions    = new Dictionary<string, List<int>>();

        for (ushort refIndex = 0; refIndex < referenceCount; refIndex++)
        {
            var chromosome = referenceReader.RefIndexToChromosome[refIndex];

            // no VEP data for this reference: emit empty prediction entries and move on
            if (!vepDirByRefIndex.TryGetValue(refIndex, out string vepSubDir))
            {
                siftOut.Write(chromosome, noPredictions);
                polyphenOut.Write(chromosome, noPredictions);
                continue;
            }

            Console.WriteLine("Parsing reference sequence [{0}]:", chromosome.UcscName);

            var parsed           = vepParser.ParseDumpDirectory(chromosome, vepSubDir);
            var transcripts      = TranscriptMerger.Merge(mergeLog, parsed.Transcripts, idToGenbank);
            var regulatoryRegions = RegulatoryRegionMerger.Merge(parsed.RegulatoryRegions);

            Console.WriteLine($"- # merged transcripts: {transcripts.Count}, # total transcripts: {parsed.Transcripts.Count}");

            WriteTranscripts(transcriptOut, transcripts);
            WriteRegulatoryRegions(regulatoryOut, regulatoryRegions);
            WritePredictions(siftOut, transcripts, x => x.SiftData, chromosome);
            WritePredictions(polyphenOut, transcripts, x => x.PolyphenData, chromosome);
        }
    }

    Console.WriteLine("\n{0} directories processed.", vepDirByRefIndex.Count);
    return ExitCodes.Success;
}
// Converting the assembly name `s` is expected to yield `expectedGenomeAssembly`.
public void Convert_GenomeAssemblyExists(string s, GenomeAssembly expectedGenomeAssembly)
{
    Assert.Equal(expectedGenomeAssembly, GenomeAssemblyHelper.Convert(s));
}