Ejemplo n.º 1
0
        /// <summary>
        /// Creates the annotation resources for one annotation job and stores the tabix virtual position
        /// from which the input should be read.
        /// </summary>
        /// <param name="annotationConfig">Job settings: genome assembly, supplementary and custom annotation sources, tabix URL and annotation range.</param>
        /// <returns>The configured <see cref="AnnotationResources"/>.</returns>
        private static AnnotationResources GetAnnotationResources(AnnotationConfig annotationConfig)
        {
            var assembly = GenomeAssemblyHelper.Convert(annotationConfig.genomeAssembly);

            // <cache folder>/<assembly>/<default cache source>
            var    assemblyCacheFolder = LambdaUrlHelper.GetCacheFolder().UrlCombine(assembly.ToString());
            string cachePrefix         = assemblyCacheFolder.UrlCombine(LambdaUrlHelper.DefaultCacheSource);
            string referenceUrl        = LambdaUrlHelper.GetRefUrl(assembly);

            // fall back to the "latest" supplementary annotation version when none was requested
            string manifestUrl        = LambdaUtilities.GetManifestUrl(annotationConfig.supplementaryAnnotations ?? "latest", assembly);
            var    performanceMetrics = new PerformanceMetrics();

            // the manifest list stays null when no manifest URL is available
            List<string> manifestUrls = null;
            if (manifestUrl != null)
            {
                manifestUrls = new List<string> { manifestUrl };
            }

            var resources = new AnnotationResources(referenceUrl, cachePrefix, manifestUrls,
                                                    annotationConfig.customAnnotations,
                                                    annotationConfig.customStrUrl,
                                                    false, false, false,
                                                    performanceMetrics);

            // the tabix index is only needed long enough to look up the starting virtual position
            using (var tabixIndexStream = PersistentStreamUtils.GetReadStream(annotationConfig.tabixUrl))
            {
                resources.InputStartVirtualPosition = GetTabixVirtualPosition(
                    annotationConfig.annotationRange,
                    tabixIndexStream,
                    resources.SequenceProvider.RefNameToChromosome);
            }

            Logger.WriteLine($"Tabix position :{resources.InputStartVirtualPosition}");

            return resources;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Lambda entry point: logs the invocation, validates the configuration and attempts to load the
        /// custom STR file referenced by <paramref name="config"/>.
        /// </summary>
        /// <param name="config">Validation request; supplies the id, genome assembly and custom STR URL.</param>
        /// <param name="context">Lambda context used for logging.</param>
        /// <returns>The success output for the request id, or the result of the exception handler if any step fails.</returns>
        public ValidationResult Run(ValidationConfig config, ILambdaContext context)
        {
            // stays null until the environment variable has been read, so the handler can tell whether it is available
            string topicArn = null;

            try
            {
                LogUtilities.UpdateLogger(context.Logger, null);
                LogUtilities.LogLambdaInfo(context, CommandLineUtilities.InformationalVersion);
                LogUtilities.LogObject("Config", config);

                var loggedVariables = new[] { LambdaUrlHelper.UrlBaseEnvironmentVariableName, LambdaUtilities.SnsTopicKey };
                LogUtilities.Log(loggedVariables);

                LambdaUtilities.GarbageCollect();
                topicArn = LambdaUtilities.GetEnvironmentVariable(LambdaUtilities.SnsTopicKey);

                config.Validate();
                GenomeAssembly assembly = GenomeAssemblyHelper.Convert(config.genomeAssembly);

                string referenceUrl     = LambdaUrlHelper.GetRefUrl(assembly);
                var    sequenceProvider = ProviderUtilities.GetSequenceProvider(referenceUrl);

                using (var strStream = PersistentStreamUtils.GetReadStream(config.customStrUrl))
                {
                    TryLoadStrFile(strStream, assembly, sequenceProvider);
                }
            }
            catch (Exception exception)
            {
                return HandleException(config.id, exception, topicArn);
            }

            return GetSuccessOutput(config.id);
        }
 /// <summary>
 /// Checks that converting an unrecognized assembly name throws a <see cref="UserErrorException"/>.
 /// </summary>
 public void Convert_GenomeAssemblyDoesNotExist()
 {
     const string unknownAssembly = "dummy";

     Assert.Throws<UserErrorException>(() => { GenomeAssemblyHelper.Convert(unknownAssembly); });
 }
Ejemplo n.º 4
0
 /// <summary>
 /// Initializes the header from its textual fields; the assembly name is converted
 /// through <c>GenomeAssemblyHelper.Convert</c>.
 /// </summary>
 /// <param name="name">Value stored in <c>Name</c>.</param>
 /// <param name="assembly">Genome assembly name to convert.</param>
 /// <param name="version">Value stored in <c>Version</c>.</param>
 /// <param name="releaseDate">Value stored in <c>ReleaseDate</c>.</param>
 /// <param name="description">Value stored in <c>Description</c>.</param>
 public SaHeader(string name, string assembly, string version, string releaseDate, string description)
 {
     this.Name           = name;
     this.GenomeAssembly = GenomeAssemblyHelper.Convert(assembly);
     this.Version        = version;
     this.ReleaseDate    = releaseDate;
     this.Description    = description;
 }
Ejemplo n.º 5
0
        /// <summary>
        /// Validates the two header lines of a custom STR file: an <c>#assembly</c> line whose value must
        /// match <paramref name="desiredGenomeAssembly"/>, followed by a <c>#chrom</c> column-label line.
        /// On success the reader is left positioned right after the column-label line.
        /// </summary>
        /// <param name="reader">Reader positioned at the start of the STR file.</param>
        /// <param name="desiredGenomeAssembly">Assembly the file is required to declare.</param>
        /// <exception cref="UserErrorException">
        /// The file is empty, a header line is missing or malformed, or the declared assembly differs from the desired one.
        /// </exception>
        private static void CheckHeader(TextReader reader, GenomeAssembly desiredGenomeAssembly)
        {
            const string missingAssemblyMessage = "First line in STR data file has to contain assembly. For example: #assembly=GRCh38";

            string line = reader.ReadLine();

            // skip leading empty lines
            while (line == string.Empty)
            {
                line = reader.ReadLine();
            }
            if (line == null)
            {
                throw new UserErrorException("The custom STR file provided is empty.");
            }

            GenomeAssembly genomeAssembly = GenomeAssembly.Unknown;
            var            headerNum      = 0;

            while (line != null && line.StartsWith("#"))
            {
                headerNum++;
                line = line.Trim();
                var columns = line.Split('=', '\t');
                var tag     = columns[0].ToLower();
                switch (headerNum)
                {
                case 1:
                    if (tag != "#assembly")
                    {
                        throw new UserErrorException(missingAssemblyMessage);
                    }
                    // a bare "#assembly" without a value used to escape as an IndexOutOfRangeException
                    if (columns.Length < 2)
                    {
                        throw new UserErrorException(missingAssemblyMessage);
                    }
                    genomeAssembly = GenomeAssemblyHelper.Convert(columns[1]);
                    if (genomeAssembly != desiredGenomeAssembly)
                    {
                        throw new UserErrorException($"Expected {desiredGenomeAssembly} in the STR data file, but found {genomeAssembly}");
                    }
                    break;

                case 2:
                    if (tag != "#chrom")
                    {
                        throw new UserErrorException("Second line in TSV has to contain column labels. For example: #Chrom\tStart\tEnd\tPhenotype\t...");
                    }
                    return;     // we should not read the next line

                default:
                    throw new UserErrorException($"Unexpected header tag observed:\n{line}");
                }
                line = reader.ReadLine();
            }

            // NOTE(review): when the #assembly line is followed by a non-header line, that line has already been
            // consumed here and the missing column-label line is not reported — confirm this is intended.
            if (genomeAssembly == GenomeAssembly.Unknown)
            {
                throw new UserErrorException("Genome assembly not specified in STR header. It is a required field.");
            }
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Builds the compressed reference sequence file: loads the previous reference names, the genome
        /// assembly report, the cytogenetic bands and the FASTA files, then writes the compressed output
        /// while reporting progress on the console.
        /// </summary>
        /// <returns><see cref="ExitCodes.Success"/> when every step completes.</returns>
        private static ExitCodes ProgramExecution()
        {
            var assembly = GenomeAssemblyHelper.Convert(_genomeAssembly);

            Console.Write("- loading previous reference names... ");
            var previousNamesStream = FileUtilities.GetReadStream(_referenceNamesPath);
            List<IChromosome> previousChromosomes = ReferenceNamesReader.GetReferenceNames(previousNamesStream);
            Console.WriteLine("finished.");

            IDictionary<string, IChromosome> previousNameToChromosome = ReferenceDictionaryUtils.GetRefNameToChromosome(previousChromosomes);

            Console.Write("- reading the genome assembly report... ");
            var reportStream = FileUtilities.GetReadStream(_genomeAssemblyReportPath);
            List<IChromosome> currentChromosomes = AssemblyReader.GetChromosomes(reportStream, previousNameToChromosome, previousChromosomes.Count);
            int referenceCount = currentChromosomes.Count;
            Console.WriteLine($"{referenceCount} references found.");

            Console.Write("- checking reference index contiguity... ");
            CheckReferenceIndexContiguity(currentChromosomes, previousChromosomes);
            Console.WriteLine("contiguous.");

            IDictionary<string, IChromosome> nameToChromosome = ReferenceDictionaryUtils.GetRefNameToChromosome(currentChromosomes);

            Console.Write("- reading cytogenetic bands... ");
            var bandStream = FileUtilities.GetReadStream(_cytogeneticBandPath);
            List<Band>[] bandsByReference = CytogeneticBandsReader.GetCytogeneticBands(bandStream, referenceCount, nameToChromosome);
            Console.WriteLine("finished.");

            Console.WriteLine("- reading FASTA files:");
            List<FastaSequence> sequences = GetFastaSequences(_fastaPrefix, nameToChromosome);
            long totalBases = GetGenomeLength(sequences);
            Console.WriteLine($"- genome length: {totalBases:N0}");

            Console.Write("- check if chrY has PAR masking... ");
            CheckChrYPadding(sequences);
            Console.WriteLine("unmasked.");

            Console.Write("- applying 2-bit compression... ");
            List<Creation.ReferenceSequence> compressedSequences = CreateReferenceSequences(sequences, bandsByReference);
            Console.WriteLine("finished.");

            Console.Write("- creating reference sequence file... ");
            CreateReferenceSequenceFile(assembly, _patchLevel, currentChromosomes, compressedSequences);
            var outputInfo = new FileInfo(_outputCompressedPath);
            Console.WriteLine($"{outputInfo.Length:N0} bytes");

            return ExitCodes.Success;
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Reads the first line of an STR data file, takes the text after the first '=' as the genome
        /// assembly, verifies it against <paramref name="desiredGenomeAssembly"/>, and then skips the
        /// header fields line.
        /// </summary>
        /// <param name="reader">Reader positioned at the start of the STR data file.</param>
        /// <param name="desiredGenomeAssembly">Assembly the file is required to declare.</param>
        /// <exception cref="InvalidDataException">
        /// The file is empty, the first line has no '=' separated value, or the declared assembly differs from the desired one.
        /// </exception>
        private static void CheckHeader(TextReader reader, GenomeAssembly desiredGenomeAssembly)
        {
            string line = reader.ReadLine();

            // ReadLine returns null at end of stream; previously this surfaced as a NullReferenceException
            if (line == null)
            {
                throw new InvalidDataException("The STR data file is empty.");
            }

            // a first line without '=' previously surfaced as an IndexOutOfRangeException
            var assemblyFields = line.OptimizedSplit('=');
            if (assemblyFields.Length < 2)
            {
                throw new InvalidDataException($"Expected a key=value genome assembly entry in the first line of the STR data file, but found: {line}");
            }

            var genomeAssembly = GenomeAssemblyHelper.Convert(assemblyFields[1]);

            if (genomeAssembly != desiredGenomeAssembly)
            {
                throw new InvalidDataException($"Expected {desiredGenomeAssembly} in the STR data file, but found {genomeAssembly}");
            }

            // skip the header fields line
            reader.ReadLine();
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Lambda entry point that annotates the single position described by <paramref name="config"/>.
        /// </summary>
        /// <param name="config">Request describing the variant, genome assembly and annotation sources.</param>
        /// <param name="context">Lambda context used for logging.</param>
        /// <returns>A stream with the annotation result, or the stream produced by the exception handler on failure.</returns>
        // ReSharper disable once UnusedMember.Global
        public Stream Run(SingleConfig config, ILambdaContext context)
        {
            // stays null until the environment variable has been read
            string topicArn = null;

            try
            {
                LogUtilities.UpdateLogger(context.Logger, null);
                LogUtilities.LogLambdaInfo(context, CommandLineUtilities.InformationalVersion);
                LogUtilities.LogObject("Config", config);

                var loggedVariables = new[] { LambdaUrlHelper.UrlBaseEnvironmentVariableName, LambdaUtilities.SnsTopicKey };
                LogUtilities.Log(loggedVariables);

                LambdaUtilities.GarbageCollect();

                topicArn = LambdaUtilities.GetEnvironmentVariable(LambdaUtilities.SnsTopicKey);

                config.Validate();

                GenomeAssembly assembly = GenomeAssemblyHelper.Convert(config.genomeAssembly);

                var  cacheSettings = new CacheConfiguration(assembly, config.supplementaryAnnotations, config.vepVersion);
                bool needsPreload  = !string.IsNullOrEmpty(config.supplementaryAnnotations);
                AnnotationResources resources = GetAndCacheAnnotationResources(config, cacheSettings);

                // mitochondrial annotation is enabled for every assembly except hg19
                if (assembly != GenomeAssembly.hg19)
                {
                    resources.Annotator.EnableMitochondrialAnnotation();
                }

                (IPosition position, string[] sampleNames) = config.GetPositionAndSampleNames(resources.SequenceProvider, resources.RefMinorProvider);
                if (position.Chromosome.IsEmpty())
                {
                    throw new UserErrorException($"An unknown chromosome was specified ({config.variant.chromosome})");
                }

                string annotationJson = GetPositionAnnotation(position, resources, sampleNames, needsPreload);
                return SingleResult.Create(config.id, LambdaUrlHelper.SuccessMessage, annotationJson);
            }
            catch (Exception exception)
            {
                return ExceptionHandler.GetStream(config.id, topicArn, exception);
            }
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Creates the annotation resources for a single-position request and tags them with the
        /// annotator version.
        /// </summary>
        /// <param name="lambdaConfig">Request settings: genome assembly, VEP version and custom annotations.</param>
        /// <returns>The configured <see cref="AnnotationResources"/>.</returns>
        private static AnnotationResources GetAnnotationResources(SingleConfig lambdaConfig)
        {
            GenomeAssembly assembly     = GenomeAssemblyHelper.Convert(lambdaConfig.genomeAssembly);
            string         cachePrefix  = CacheUtilities.GetCachePathPrefix(lambdaConfig.vepVersion, assembly);
            string         referenceUrl = LambdaUrlHelper.GetRefUrl(assembly);

            var    lambdaAssembly     = Assembly.GetAssembly(typeof(SingleAnnotationLambda));
            string versionTag         = "Nirvana " + CommandLineUtilities.GetVersion(lambdaAssembly);
            var    performanceMetrics = new PerformanceMetrics();

            Logger.WriteLine($"Cache prefix: {cachePrefix}");

            //todo: get customStrTsv from lambdaConfig
            var resources = new AnnotationResources(referenceUrl, cachePrefix, null,
                                                    lambdaConfig.customAnnotations, null,
                                                    false, false, false,
                                                    performanceMetrics);
            resources.AnnotatorVersionTag = versionTag;

            return resources;
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Parses the <c>#assembly</c> header line and returns the declared genome assembly.
        /// </summary>
        /// <param name="line">Header line; only its first tab-separated column is inspected.</param>
        /// <param name="allowedGenomeAssemblies">Assemblies that are accepted.</param>
        /// <returns>The parsed assembly, which is a member of <paramref name="allowedGenomeAssemblies"/>.</returns>
        /// <exception cref="UserErrorException">The assembly value is missing or is not an allowed assembly.</exception>
        public static GenomeAssembly ParseGenomeAssembly(string line, HashSet<GenomeAssembly> allowedGenomeAssemblies)
        {
            CheckPrefix(line, "#assembly", "second");

            string keyValueColumn = line.OptimizedSplit('\t')[0];
            (_, string assemblyValue) = keyValueColumn.OptimizedKeyValue();

            if (assemblyValue == null)
            {
                throw new UserErrorException("Please provide the genome assembly in the format: #assembly=genomeAssembly.");
            }

            var parsedAssembly = GenomeAssemblyHelper.Convert(assemblyValue);

            if (allowedGenomeAssemblies.Contains(parsedAssembly))
            {
                return parsedAssembly;
            }

            throw new UserErrorException("Only GRCh37 and GRCh38 are accepted for genome assembly.");
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Builds a compressed reference sequence file from a single FASTA file: reads the genome assembly
        /// report, the FASTA sequence and its cytogenetic bands, compresses the sequence and writes the
        /// output while reporting progress on the console.
        /// </summary>
        /// <returns><see cref="ExitCodes.Success"/> when every step completes.</returns>
        private static ExitCodes ProgramExecution()
        {
            var assembly = GenomeAssemblyHelper.Convert(_genomeAssembly);

            Console.Write("- reading the genome assembly report... ");
            // no previous reference names exist here, so an empty lookup and a zero count are supplied
            var emptyNameToChromosome = new Dictionary<string, IChromosome>();
            var reportStream          = FileUtilities.GetReadStream(_genomeAssemblyReportPath);
            List<IChromosome> allChromosomes = AssemblyReader.GetChromosomes(reportStream, emptyNameToChromosome, 0);
            int referenceCount = allChromosomes.Count;
            Console.WriteLine($"{referenceCount} references found.");

            IDictionary<string, IChromosome> nameToChromosome = ReferenceDictionaryUtils.GetRefNameToChromosome(allChromosomes);

            Console.Write("- reading FASTA file... ");
            var fasta = GetFastaSequence(_fastaPath, nameToChromosome);
            Console.WriteLine($"- sequence length: {fasta.Bases.Length:N0}");

            Console.Write("- reading cytogenetic bands... ");
            List<Band> bands = GetCytogeneticBands(fasta.Chromosome.Index, referenceCount, nameToChromosome);
            Console.WriteLine("finished.");

            Console.Write("- applying 2-bit compression... ");
            var compressedSequence = CreateReferenceSequence(fasta, bands);
            Console.WriteLine("finished.");

            Console.Write("- creating reference sequence file... ");
            // only the chromosome present in the FASTA file is written to the output
            var singleChromosome = new List<IChromosome> { fasta.Chromosome };
            CreateReferenceSequenceFile(assembly, singleChromosome, compressedSequence);

            var outputInfo = new FileInfo(_outputCompressedPath);
            Console.WriteLine($"{outputInfo.Length:N0} bytes");

            return ExitCodes.Success;
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Lambda entry point: validates the request, checks that the genome assembly is supported,
        /// derives the annotation ranges and produces the overall result. The result is logged before
        /// it is returned.
        /// </summary>
        /// <param name="config">Annotation request.</param>
        /// <param name="context">Lambda context used for logging and passed on to result generation.</param>
        /// <returns>The annotation result, or the result produced by the exception handler on failure.</returns>
        // ReSharper disable once UnusedMember.Global
        public NirvanaResult Run(NirvanaConfig config, ILambdaContext context)
        {
            var           log      = new StringBuilder();
            string        topicArn = null;
            NirvanaResult outcome;

            try
            {
                LogUtilities.UpdateLogger(context.Logger, log);
                LogUtilities.LogLambdaInfo(context, CommandLineUtilities.InformationalVersion);
                LogUtilities.LogObject("Config", config);

                var loggedVariables = new[] { LambdaUrlHelper.UrlBaseEnvironmentVariableName, LambdaUtilities.SnsTopicKey, "annotation_lambda_arn" };
                LogUtilities.Log(loggedVariables);

                LambdaUtilities.GarbageCollect();

                topicArn = LambdaUtilities.GetEnvironmentVariable(LambdaUtilities.SnsTopicKey);
                string workerLambdaArn = LambdaUtilities.GetEnvironmentVariable(AnnotationLambdaKey);

                config.Validate();

                var assembly    = GenomeAssemblyHelper.Convert(config.genomeAssembly);
                var isSupported = _supportedAssemblies.Contains(assembly);

                if (!isSupported)
                {
                    throw new UserErrorException($"Unsupported assembly: {config.genomeAssembly}");
                }

                IEnumerable<AnnotationRange> ranges = GetAnnotationRanges(config, assembly);
                outcome = GetNirvanaResult(ranges, config, workerLambdaArn, context, log, topicArn);
            }
            catch (Exception exception)
            {
                outcome = HandleException(log, config, exception, topicArn);
            }

            LogUtilities.LogObject("Result", outcome);

            return outcome;
        }
        /// <summary>
        /// Converts a VEP dump directory into the intermediate pre-cache files: SIFT and PolyPhen
        /// predictions, transcripts, regulatory regions and a transcript-merge log, all named after the
        /// output stub.
        /// </summary>
        /// <returns><see cref="ExitCodes.Success"/> once every reference sequence has been visited.</returns>
        private static ExitCodes ProgramExecution()
        {
            var transcriptSource = GetSource(_transcriptSource);
            // NOTE(review): the sequence reader is not disposed anywhere in this method — confirm whether it needs a using block
            var sequenceReader   = new CompressedSequenceReader(FileUtilities.GetReadStream(_inputReferencePath));
            var vepRootDirectory = new VepRootDirectory(sequenceReader.RefNameToChromosome);
            // maps a reference index to its VEP sub-directory; references without one are absent from the map
            var refIndexToVepDir = vepRootDirectory.GetRefIndexToVepDir(_inputVepDirectory);

            var  genomeAssembly  = GenomeAssemblyHelper.Convert(_genomeAssembly);
            // NOTE(review): DateTime.Parse(string) uses the current culture — confirm the release date format is culture-independent
            long vepReleaseTicks = DateTime.Parse(_vepReleaseDate).Ticks;
            var  idToGenbank     = GetIdToGenbank(genomeAssembly, transcriptSource);

            // =========================
            // create the pre-cache file
            // =========================

            // process each VEP directory
            int numRefSeqs = sequenceReader.NumRefSeqs;
            // the same header is shared by all four intermediate writers below
            var header     = new IntermediateIoHeader(_vepVersion, vepReleaseTicks, transcriptSource, genomeAssembly, numRefSeqs);

            string siftPath       = _outputStub + ".sift.gz";
            string polyphenPath   = _outputStub + ".polyphen.gz";
            string transcriptPath = _outputStub + ".transcripts.gz";
            string regulatoryPath = _outputStub + ".regulatory.gz";

            using (var mergeLogger = new TranscriptMergerLogger(FileUtilities.GetCreateStream(_outputStub + ".merge_transcripts.log")))
                using (var siftWriter = new PredictionWriter(GZipUtilities.GetStreamWriter(siftPath), header, IntermediateIoCommon.FileType.Sift))
                    using (var polyphenWriter = new PredictionWriter(GZipUtilities.GetStreamWriter(polyphenPath), header, IntermediateIoCommon.FileType.Polyphen))
                        using (var transcriptWriter = new MutableTranscriptWriter(GZipUtilities.GetStreamWriter(transcriptPath), header))
                            using (var regulatoryRegionWriter = new RegulatoryRegionWriter(GZipUtilities.GetStreamWriter(regulatoryPath), header))
                            {
                                var converter           = new VepCacheParser(transcriptSource);
                                var emptyPredictionDict = new Dictionary <string, List <int> >();

                                for (ushort refIndex = 0; refIndex < numRefSeqs; refIndex++)
                                {
                                    var chromosome = sequenceReader.RefIndexToChromosome[refIndex];

                                    // no VEP directory for this reference: write empty SIFT/PolyPhen entries and move on
                                    // (presumably so the prediction files carry one entry per reference — TODO confirm)
                                    if (!refIndexToVepDir.TryGetValue(refIndex, out string vepSubDir))
                                    {
                                        siftWriter.Write(chromosome, emptyPredictionDict);
                                        polyphenWriter.Write(chromosome, emptyPredictionDict);
                                        continue;
                                    }

                                    Console.WriteLine("Parsing reference sequence [{0}]:", chromosome.UcscName);

                                    // parse the raw dump, then merge transcripts and regulatory regions before writing
                                    var rawData                 = converter.ParseDumpDirectory(chromosome, vepSubDir);
                                    var mergedTranscripts       = TranscriptMerger.Merge(mergeLogger, rawData.Transcripts, idToGenbank);
                                    var mergedRegulatoryRegions = RegulatoryRegionMerger.Merge(rawData.RegulatoryRegions);

                                    int numRawTranscripts    = rawData.Transcripts.Count;
                                    int numMergedTranscripts = mergedTranscripts.Count;
                                    Console.WriteLine($"- # merged transcripts: {numMergedTranscripts}, # total transcripts: {numRawTranscripts}");

                                    // predictions are taken from the merged transcripts, one writer per prediction type
                                    WriteTranscripts(transcriptWriter, mergedTranscripts);
                                    WriteRegulatoryRegions(regulatoryRegionWriter, mergedRegulatoryRegions);
                                    WritePredictions(siftWriter, mergedTranscripts, x => x.SiftData, chromosome);
                                    WritePredictions(polyphenWriter, mergedTranscripts, x => x.PolyphenData, chromosome);
                                }
                            }

            // the count reported is the number of VEP sub-directories found, not the number of references
            Console.WriteLine("\n{0} directories processed.", refIndexToVepDir.Count);

            return(ExitCodes.Success);
        }
Ejemplo n.º 14
0
        /// <summary>
        /// Checks that converting <paramref name="s"/> yields <paramref name="expectedGenomeAssembly"/>.
        /// </summary>
        /// <param name="s">Assembly name to convert.</param>
        /// <param name="expectedGenomeAssembly">Assembly the conversion is expected to return.</param>
        public void Convert_GenomeAssemblyExists(string s, GenomeAssembly expectedGenomeAssembly)
        {
            Assert.Equal(expectedGenomeAssembly, GenomeAssemblyHelper.Convert(s));
        }