예제 #1
0
        /// <summary>
        /// Validates the supplied configuration and runs TryLoadStrFile on the stream opened from
        /// config.customStrUrl. Any exception is converted into a result by HandleException.
        /// </summary>
        /// <param name="config">validation request; its id is passed to the result builders</param>
        /// <param name="context">Lambda context whose logger is handed to LogUtilities</param>
        /// <returns>the output of GetSuccessOutput on success, otherwise the output of HandleException</returns>
        public ValidationResult Run(ValidationConfig config, ILambdaContext context)
        {
            string snsTopicArn = null;

            try
            {
                LogUtilities.UpdateLogger(context.Logger, null);
                LogUtilities.LogLambdaInfo(context, CommandLineUtilities.InformationalVersion);
                LogUtilities.LogObject("Config", config);
                LogUtilities.Log(new[] { LambdaUrlHelper.UrlBaseEnvironmentVariableName, LambdaUtilities.SnsTopicKey });
                LambdaUtilities.GarbageCollect();
                snsTopicArn = LambdaUtilities.GetEnvironmentVariable(LambdaUtilities.SnsTopicKey);

                config.Validate();
                GenomeAssembly genomeAssembly = GenomeAssemblyHelper.Convert(config.genomeAssembly);

                string nirvanaS3Ref = LambdaUrlHelper.GetRefUrl(genomeAssembly);
                var    refProvider  = ProviderUtilities.GetSequenceProvider(nirvanaS3Ref);

                using (var stream = PersistentStreamUtils.GetReadStream(config.customStrUrl))
                {
                    TryLoadStrFile(stream, genomeAssembly, refProvider);
                }
            }
            catch (Exception exception)
            {
                // config can be null here (a null request is one of the failures that lands in this handler);
                // dereferencing it unguarded would throw a second exception that nothing catches.
                // NOTE(review): assumes ValidationConfig.id is a reference type (string) - confirm
                return HandleException(config?.id, exception, snsTopicArn);
            }

            // reaching this point means config.Validate() ran, so config is not null
            return GetSuccessOutput(config.id);
        }
예제 #2
0
        /// <summary>
        /// Prints the data source version to the console and creates the underlying SaTsvWriter
        /// using the ExAC tag.
        /// </summary>
        public ExacTsvWriter(DataSourceVersion version, string outputDirectory, GenomeAssembly genomeAssembly, ISequenceProvider sequenceProvider)
        {
            string versionText = version.ToString();
            Console.WriteLine(versionText);

            // NOTE(review): the schema version is taken from OneKgenSchemaVersion - confirm this is intended here
            string assemblyName = genomeAssembly.ToString();
            _writer = new SaTsvWriter(outputDirectory, version, assemblyName, SaTsvCommon.OneKgenSchemaVersion,
                                      InterimSaCommon.ExacTag, null, true, sequenceProvider);
        }
예제 #3
0
        /// <summary>
        /// Deserializes the index from the supplied stream: the header fields first, followed by
        /// the list of chunks for each chromosome.
        /// </summary>
        /// <param name="stream">stream positioned at the start of the index; it is read to its end</param>
        public ChunkedIndex(Stream stream)
        {
            // A single Stream.Read call may return fewer bytes than requested, and the previous fixed
            // 1 MiB buffer silently truncated larger indices. Copying the remainder of the stream into
            // memory avoids both problems.
            using (var memStream = new MemoryStream())
            {
                stream.CopyTo(memStream);
                memStream.Position = 0;

                using (var memReader = new ExtendedBinaryReader(memStream))
                {
                    Assembly      = (GenomeAssembly)memReader.ReadByte();
                    Version       = DataSourceVersion.Read(memReader);
                    JsonKey       = memReader.ReadAsciiString();
                    MatchByAllele = memReader.ReadBoolean();
                    IsArray       = memReader.ReadBoolean();
                    SchemaVersion = memReader.ReadOptInt32();
                    IsPositional  = memReader.ReadBoolean();

                    var chromCount = memReader.ReadOptInt32();
                    _chromChunks = new Dictionary<ushort, List<Chunk>>(chromCount);

                    for (var i = 0; i < chromCount; i++)
                    {
                        var chromIndex = memReader.ReadOptUInt16();
                        var chunkCount = memReader.ReadOptInt32();

                        // fill a local list so the dictionary is not looked up once per chunk
                        var chunks = new List<Chunk>(chunkCount);
                        for (var j = 0; j < chunkCount; j++)
                        {
                            chunks.Add(new Chunk(memReader));
                        }

                        _chromChunks[chromIndex] = chunks;
                    }
                }
            }
        }
예제 #4
0
        /// <summary>
        /// Prints the data source version to the console and creates the underlying SaTsvWriter.
        /// The JSON key is looked up in _jsonKeyDictionary by sequencing data type.
        /// </summary>
        public GnomadTsvWriter(DataSourceVersion version, string outputDirectory, GenomeAssembly genomeAssembly, ISequenceProvider sequenceProvider, string sequencingDataType)
        {
            string versionText = version.ToString();
            Console.WriteLine(versionText);

            string assemblyName = genomeAssembly.ToString();
            var    schema       = SaTsvCommon.SchemaVersion;
            var    jsonKey      = _jsonKeyDictionary[sequencingDataType];

            _writer = new SaTsvWriter(outputDirectory, version, assemblyName, schema, jsonKey, null, true, sequenceProvider);
        }
예제 #5
0
        /// <summary>
        /// Prints the data source version to the console and creates the underlying SaTsvWriter
        /// using the TOPMed tag.
        /// </summary>
        public TopMedTsvWriter(DataSourceVersion version, string outputFileName, GenomeAssembly genomeAssembly, ISequenceProvider sequenceProvider)
        {
            string versionText = version.ToString();
            Console.WriteLine(versionText);

            string assemblyName = genomeAssembly.ToString();
            _writer = new SaTsvWriter(outputFileName, version, assemblyName, SaTsvCommon.SchemaVersion,
                                      InterimSaCommon.TopMedTag, null, true, sequenceProvider);
        }
예제 #6
0
        /// <summary>
        /// Opens the reference and all interim TSV readers, gathers their headers and reference names,
        /// displays the data sources and checks that the headers agree on the genome assembly.
        /// </summary>
        public InterimTsvsMerger(IEnumerable<string> annotationFiles, IEnumerable<string> intervalFiles, string miscFile, IEnumerable<string> geneFiles, string compressedReference, string outputDirectory)
        {
            // the reference supplies the genome assembly and the name-to-chromosome mapping
            var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(compressedReference));
            _genomeAssembly      = referenceProvider.GenomeAssembly;
            _refNameToChromosome = referenceProvider.RefNameToChromosome;
            _outputDirectory     = outputDirectory;

            // readers
            _tsvReaders      = ReaderUtilities.GetSaTsvReaders(annotationFiles);
            _miscReader      = ReaderUtilities.GetMiscTsvReader(miscFile);
            _geneReaders     = ReaderUtilities.GetGeneReaders(geneFiles);
            _intervalReaders = ReaderUtilities.GetIntervalReaders(intervalFiles);

            // headers: small-variant and interval headers share one list, gene headers are kept apart
            var saHeaders = new List<SaHeader>();
            saHeaders.AddRange(ReaderUtilities.GetTsvHeaders(_tsvReaders));
            saHeaders.AddRange(ReaderUtilities.GetTsvHeaders(_intervalReaders));
            _saHeaders   = saHeaders;
            _geneHeaders = ReaderUtilities.GetTsvHeaders(_geneReaders)?.ToList();

            // reference names seen by any of the readers (the misc reader may be absent)
            var refNames = new HashSet<string>();
            refNames.UnionWith(ReaderUtilities.GetRefNames(_tsvReaders));
            refNames.UnionWith(ReaderUtilities.GetRefNames(_intervalReaders));
            if (_miscReader != null)
            {
                refNames.UnionWith(_miscReader.RefNames);
            }
            _refNames = refNames;

            DisplayDataSources(_saHeaders, _geneHeaders);
            MergeUtilities.CheckAssemblyConsistancy(_saHeaders);
        }
예제 #7
0
        /// <summary>
        /// Builds a ClinvarVariant from a sequence location element.
        /// </summary>
        /// <param name="xElement">element carrying the assembly, chromosome, start, stop and allele attributes</param>
        /// <param name="genomeAssembly">assembly to accept; GenomeAssembly.Unknown accepts any assembly</param>
        /// <param name="refChromDict">chromosome name to chromosome lookup</param>
        /// <param name="variantId">variant id passed through to the ClinvarVariant</param>
        /// <returns>
        /// null when the element is null, when its assembly does not match, or when the span
        /// (stop - start + 1) exceeds MaxVariantLength; otherwise the new variant
        /// </returns>
        private static ClinvarVariant GetClinvarVariant(XElement xElement, GenomeAssembly genomeAssembly, IDictionary<string, IChromosome> refChromDict, int? variantId)
        {
            if (xElement == null)
            {
                return null;
            }
            //<SequenceLocation Assembly="GRCh38" Chr="17" Accession="NC_000017.11" start="43082402" stop="43082402" variantLength="1" referenceAllele="A" alternateAllele="C" />

            if (genomeAssembly.ToString() != xElement.Attribute(AssemblyTag)?.Value &&
                genomeAssembly != GenomeAssembly.Unknown)
            {
                return null;
            }

            // A missing Chr attribute yields a null name; dictionaries reject null keys with an
            // ArgumentNullException, so treat it like an unknown chromosome instead. TryGetValue also
            // avoids the second lookup that ContainsKey + indexer performed.
            string      chromosomeName = xElement.Attribute(ChrTag)?.Value;
            IChromosome chromosome     = null;
            if (chromosomeName != null)
            {
                refChromDict.TryGetValue(chromosomeName, out chromosome);
            }

            // Convert.ToInt32 returns 0 for a null string, so missing attributes become 0
            int    start           = Convert.ToInt32(xElement.Attribute(StartTag)?.Value);
            int    stop            = Convert.ToInt32(xElement.Attribute(StopTag)?.Value);
            string referenceAllele = xElement.Attribute(RefAlleleTag)?.Value;
            string altAllele       = xElement.Attribute(AltAlleleTag)?.Value;

            if (stop - start + 1 > MaxVariantLength)
            {
                return null;
            }

            AdjustVariant(ref start, ref referenceAllele, ref altAllele);

            return new ClinvarVariant(chromosome, start, stop, variantId, referenceAllele, altAllele);
        }
예제 #8
0
 /// <summary>
 /// Stores the supplied assembly, sequence and chromosome lookup in the matching properties.
 /// </summary>
 public SimpleSequenceProvider(GenomeAssembly assembly, ISequence sequence,
                               IDictionary<string, IChromosome> refNameToChromosome)
 {
     RefNameToChromosome = refNameToChromosome;
     Sequence = sequence;
     Assembly = assembly;
 }
예제 #9
0
        /// <summary>
        /// Reads two prediction cache files side by side and merges them one reference sequence at a time.
        /// Also records the genome assembly and reference sequence count of the first file in the
        /// _genomeAssembly and _numRefSeq fields.
        /// </summary>
        /// <param name="path1">path of the first prediction cache</param>
        /// <param name="path2">path of the second prediction cache</param>
        /// <returns>one merged cache per reference sequence of the first file</returns>
        private List<PredictionCache> GetMergedPredictions(string path1, string path2)
        {
            var merged = new List<PredictionCache>();

            using (var firstReader = new PredictionCacheReader(FileUtilities.GetReadStream(path1)))
            {
                using (var secondReader = new PredictionCacheReader(FileUtilities.GetReadStream(path2)))
                {
                    _genomeAssembly = firstReader.FileHeader.GenomeAssembly;
                    _numRefSeq      = firstReader.FileHeader.Index.Size;

                    bool sameAssembly = _genomeAssembly == secondReader.FileHeader.GenomeAssembly;
                    if (!sameAssembly)
                    {
                        throw new UserErrorException($"Observed different genome assemblies: {firstReader.FileHeader.GenomeAssembly}, {secondReader.FileHeader.GenomeAssembly}");
                    }

                    for (ushort refIndex = 0; refIndex < _numRefSeq; refIndex++)
                    {
                        var firstCache  = firstReader.Read(refIndex);
                        var secondCache = secondReader.Read(refIndex);

                        // exactly one of the two being empty means the files are out of step
                        bool firstIsEmpty  = firstCache == PredictionCache.Empty;
                        bool secondIsEmpty = secondCache == PredictionCache.Empty;
                        if (firstIsEmpty != secondIsEmpty)
                        {
                            throw new DataMisalignedException("one of the cache ran out before the other");
                        }

                        merged.Add(firstCache.GetMergedCache(secondCache));
                    }
                    //todo: take care of ref sequences unique to one cache
                }
            }

            return merged;
        }
예제 #10
0
        // Writes a CacheHeader to a memory stream, reads it back and checks that the
        // source, creation time, assembly and VEP version survive the round trip.
        public void CacheHeader_EndToEnd()
        {
            const GenomeAssembly assembly          = GenomeAssembly.hg19;
            const Source         source            = Source.BothRefSeqAndEnsembl;
            const ushort         vepVersion        = ushort.MaxValue;
            const long           creationTimeTicks = long.MaxValue;

            var baseHeader   = new Header("VEP", 1, 2, source, creationTimeTicks, assembly);
            var customHeader = new TranscriptCacheCustomHeader(vepVersion, 0);
            var written      = new CacheHeader(baseHeader, customHeader);

            CacheHeader roundTripped;

            using (var memoryStream = new MemoryStream())
            {
                // leave the memory stream open so it can be rewound and read after the writer is disposed
                using (var binaryWriter = new BinaryWriter(memoryStream, Encoding.UTF8, true))
                {
                    written.Write(binaryWriter);
                }

                memoryStream.Position = 0;
                roundTripped = CacheHeader.Read(memoryStream);
            }

            Assert.NotNull(roundTripped);
            Assert.Equal(source, roundTripped.Source);
            Assert.Equal(creationTimeTicks, roundTripped.CreationTimeTicks);
            Assert.Equal(assembly, roundTripped.Assembly);
            Assert.Equal(vepVersion, roundTripped.Custom.VepVersion);
        }
예제 #11
0
        /// <summary>
        /// Builds the cache path prefix for a genome assembly. VEP version 84 uses an extra
        /// "VEP84/" folder under the cache folder; every other version uses the cache folder directly.
        /// </summary>
        /// <param name="vepVersion">VEP version of the cache</param>
        /// <param name="genomeAssembly">genome assembly used as the first part of the suffix</param>
        /// <returns>the result of UrlCombine on the cache folder and the assembly/source suffix</returns>
        public static string GetCachePathPrefix(int vepVersion, GenomeAssembly genomeAssembly)
        {
            string cacheRoot         = LambdaUrlHelper.GetBaseUrl() + "ab0cf104f39708eabd07b8cb67e149ba-Cache/26/";
            string assemblyAndSource = $"{genomeAssembly}/{LambdaUrlHelper.DefaultCacheSource}";

            if (vepVersion == 84)
            {
                cacheRoot += "VEP84/";
            }

            return UrlCombine(cacheRoot, assemblyAndSource);
        }
예제 #12
0
        /// <summary>
        /// Stores both providers and then sets GenomeAssembly from GetGenomeAssembly().
        /// </summary>
        public PianoAnnotator(IAnnotationProvider taProvider, ISequenceProvider sequenceProvider)
        {
            _sequenceProvider = sequenceProvider;
            _taProvider = taProvider;

            // called last so that both provider fields are already assigned
            GenomeAssembly = GetGenomeAssembly();
        }
예제 #13
0
        /// <summary>
        /// Resolves the genome assembly name and opens a reader for every VCF path that was supplied.
        /// </summary>
        /// <param name="assembly">genome assembly name: "GRCh37", "GRCh38" or "hg19" (exact match)</param>
        /// <param name="oneKGenomeVcf">1000 Genomes VCF path; null or empty leaves the reader null</param>
        /// <param name="clinvarVcf">ClinVar VCF path; null or empty leaves the reader null</param>
        /// <param name="cosmicVcf">COSMIC VCF path; null or empty leaves the reader null</param>
        /// <exception cref="ArgumentException">the assembly name is not one of the accepted values</exception>
        public MustGenotypeExtractor(string assembly, string oneKGenomeVcf, string clinvarVcf, string cosmicVcf)
        {
            switch (assembly)
            {
            case "GRCh37":
                _assembly = GenomeAssembly.GRCh37;
                break;

            case "GRCh38":
                _assembly = GenomeAssembly.GRCh38;
                break;

            case "hg19":
                _assembly = GenomeAssembly.hg19;
                break;

            default:
            {
                // ArgumentException derives from Exception, so existing catch blocks keep working.
                // The message now lists hg19 (which is accepted above) and reports the offending value.
                string observed = assembly ?? "null";
                throw new ArgumentException($"Genome assembly must be GRCh37, GRCh38, or hg19 (observed: {observed})", nameof(assembly));
            }
            }

            try
            {
                _oneKGenomeReader = string.IsNullOrEmpty(oneKGenomeVcf) ? null : GZipUtilities.GetAppropriateStreamReader(oneKGenomeVcf);
                _clinvarReader    = string.IsNullOrEmpty(clinvarVcf) ? null : GZipUtilities.GetAppropriateStreamReader(clinvarVcf);
                _cosmicReader     = string.IsNullOrEmpty(cosmicVcf) ? null : GZipUtilities.GetAppropriateStreamReader(cosmicVcf);
            }
            catch
            {
                // a later file failed to open: release the readers that were already opened, since the
                // caller never receives an instance it could dispose
                _oneKGenomeReader?.Dispose();
                _clinvarReader?.Dispose();
                throw;
            }
        }
예제 #14
0
        /// <summary>
        /// Prints the data source version to the console and creates the underlying SaTsvWriter
        /// using the COSMIC schema version, tag and VCF tag.
        /// </summary>
        public CosmicTsvWriter(DataSourceVersion version, string outputDirectory, GenomeAssembly genomeAssembly, ISequenceProvider sequenceProvider)
        {
            string versionText = version.ToString();
            Console.WriteLine(versionText);

            string assemblyName = genomeAssembly.ToString();
            _writer = new SaTsvWriter(outputDirectory, version, assemblyName, SaTsvCommon.CosmicSchemaVersion,
                                      InterimSaCommon.CosmicTag, InterimSaCommon.CosmicVcfTag, false, sequenceProvider, true);
        }
예제 #15
0
        /// <summary>
        /// Copies the whole stream into memory and deserializes the index from it: the header
        /// fields first, followed by the list of index blocks for each chromosome.
        /// </summary>
        /// <param name="stream">stream containing the serialized index; it is read to its end</param>
        public NsaIndex(Stream stream)
        {
            using (var buffer = new MemoryStream())
            {
                using (var reader = new ExtendedBinaryReader(buffer))
                {
                    // pull every remaining byte into memory, then rewind before parsing
                    stream.CopyTo(buffer);
                    buffer.Seek(0, SeekOrigin.Begin);

                    Assembly      = (GenomeAssembly)reader.ReadByte();
                    Version       = DataSourceVersion.Read(reader);
                    JsonKey       = reader.ReadAsciiString();
                    MatchByAllele = reader.ReadBoolean();
                    IsArray       = reader.ReadBoolean();
                    SchemaVersion = reader.ReadOptInt32();
                    IsPositional  = reader.ReadBoolean();

                    var numChromosomes = reader.ReadOptInt32();
                    _chromBlocks = new Dictionary<ushort, List<NsaIndexBlock>>(numChromosomes);

                    for (var chromNumber = 0; chromNumber < numChromosomes; chromNumber++)
                    {
                        var chromIndex = reader.ReadOptUInt16();
                        var numBlocks  = reader.ReadOptInt32();

                        var blocks = new List<NsaIndexBlock>(numBlocks);
                        _chromBlocks[chromIndex] = blocks;

                        for (var blockNumber = 0; blockNumber < numBlocks; blockNumber++)
                        {
                            blocks.Add(new NsaIndexBlock(reader));
                        }
                    }
                }
            }
        }
예제 #16
0
 /// <summary>
 /// Stores the genome assembly, transcript source, VEP release ticks and VEP version in the matching fields.
 /// </summary>
 public TranscriptCacheBuilder(GenomeAssembly genomeAssembly, Source source, long vepReleaseTicks,
                               ushort vepVersion)
 {
     _vepVersion = vepVersion;
     _vepReleaseTicks = vepReleaseTicks;
     _source = source;
     _genomeAssembly = genomeAssembly;
 }
예제 #17
0
 /// <summary>
 /// Stores the header values in the matching properties and builds an interval forest
 /// from the supplied interval arrays.
 /// </summary>
 private NsiReader(GenomeAssembly assembly, IDataSourceVersion version, string jsonKey, ReportFor reportFor, IntervalArray<string>[] intervalArrays)
 {
     _intervalForest = new IntervalForest<string>(intervalArrays);

     ReportFor = reportFor;
     JsonKey = jsonKey;
     Version = version;
     Assembly = assembly;
 }
예제 #18
0
 /// <summary>
 /// Chains to the four-argument constructor with a null reference sequence name, then stores the
 /// version and output directory and opens a reader on the input wigFix file.
 /// </summary>
 public PhylopWriter(string inputWigFixFile, DataSourceVersion version, GenomeAssembly genomeAssembly,
                     string outputNirvanaDirectory, int intervalLength = PhylopCommon.MaxIntervalLength)
     : this(null, version, genomeAssembly, intervalLength)
 {
     _outputNirvanaDirectory = outputNirvanaDirectory;
     _version = version;
     _reader = GZipUtilities.GetAppropriateStreamReader(inputWigFixFile);
 }
        /// <summary>
        /// Writes two tab-separated header lines to the output file: the global header with the file type,
        /// then the VEP version, VEP release ticks, transcript source and genome assembly.
        /// The enum values are written as bytes.
        /// </summary>
        private static void WriteHeader(StreamWriter writer, GlobalImportCommon.FileType fileType,
                                        TranscriptDataSource transcriptSource, GenomeAssembly genomeAssembly)
        {
            long releaseTicks = DateTime.Parse(ConfigurationSettings.VepReleaseDate).Ticks;

            byte fileTypeCode = (byte)fileType;
            writer.WriteLine("{0}\t{1}", GlobalImportCommon.Header, fileTypeCode);

            byte sourceCode   = (byte)transcriptSource;
            byte assemblyCode = (byte)genomeAssembly;
            writer.WriteLine("{0}\t{1}\t{2}\t{3}", ConfigurationSettings.VepVersion, releaseTicks, sourceCode, assemblyCode);
        }
예제 #20
0
 /// <summary>
 /// Stores the VEP version, VEP release ticks, transcript source and genome assembly
 /// in the matching members.
 /// </summary>
 public GlobalImportHeader(ushort vepVersion, long vepReleaseTicks, TranscriptDataSource transcriptSource,
                           GenomeAssembly genomeAssembly)
 {
     GenomeAssembly = genomeAssembly;
     TranscriptSource = transcriptSource;
     VepReleaseTicks = vepReleaseTicks;
     VepVersion = vepVersion;
 }
예제 #21
0
 /// <summary>
 /// Stores the header values in the matching properties and keeps the supplied dictionary of
 /// interval arrays as is.
 /// </summary>
 private NsiReader(GenomeAssembly assembly, IDataSourceVersion version, string jsonKey, ReportFor reportFor, Dictionary<ushort, IntervalArray<string>> intervalArrays)
 {
     _intervalArrays = intervalArrays;

     ReportFor = reportFor;
     JsonKey = jsonKey;
     Version = version;
     Assembly = assembly;
 }
예제 #22
0
        /// <summary>
        /// Prints the data source version to the console and creates two writers in the output directory:
        /// the SaTsvWriter using the OneKgen tag and the SaMiscTsvWriter using the RefMinor tag.
        /// </summary>
        public OnekgTsvWriter(DataSourceVersion version, string outputDirectory, GenomeAssembly genomeAssembly, ISequenceProvider sequenceProvider)
        {
            string versionText = version.ToString();
            Console.WriteLine(versionText);

            string assemblyName = genomeAssembly.ToString();

            _onekgWriter = new SaTsvWriter(outputDirectory, version, assemblyName, SaTsvCommon.OneKgenSchemaVersion,
                                           InterimSaCommon.OneKgenTag, "AF1000G", true, sequenceProvider);

            _refMinorWriter = new SaMiscTsvWriter(outputDirectory, version, assemblyName,
                                                  InterimSaCommon.RefMinorTag, sequenceProvider);
        }
예제 #23
0
 /// <summary>
 /// Stores the reference sequence name, creation time, data version, data source versions
 /// and genome assembly in the matching members.
 /// </summary>
 public SupplementaryAnnotationHeader(string referenceSequenceName, long creationTimeTicks, ushort dataVersion,
                                      IEnumerable<IDataSourceVersion> dataSourceVersions, GenomeAssembly genomeAssembly)
 {
     GenomeAssembly = genomeAssembly;
     DataSourceVersions = dataSourceVersions;
     DataVersion = dataVersion;
     CreationTimeTicks = creationTimeTicks;
     ReferenceSequenceName = referenceSequenceName;
 }
        /// <summary>
        /// Adds a RemoteFile for the given path. The local path is the SA directory, the genome assembly
        /// name and the file name combined; the description is the file name followed by the assembly
        /// in parentheses.
        /// </summary>
        private static void AddFile(this ICollection<RemoteFile> files, GenomeAssembly genomeAssembly, string saDirectory, string path)
        {
            string fileName     = Path.GetFileName(path);
            string assemblyName = genomeAssembly.ToString();
            string destination  = Path.Combine(saDirectory, assemblyName, fileName);

            // the remote path is the incoming path, unchanged
            var remoteFile = new RemoteFile(path, destination, $"{fileName} ({genomeAssembly})");
            files.Add(remoteFile);
        }
예제 #25
0
 /// <summary>
 /// Loads the repeat expansion phenotypes from the stream returned by GetTsvStream and
 /// builds the matcher from the resulting interval forest. The stream is disposed afterwards.
 /// </summary>
 public RepeatExpansionProvider(GenomeAssembly genomeAssembly, IDictionary<string, IChromosome> refNameToChromosome,
                                int numRefSeqs, string customTsvPath)
 {
     using (Stream tsvStream = GetTsvStream(genomeAssembly, customTsvPath))
     {
         IIntervalForest<RepeatExpansionPhenotype> forest =
             RepeatExpansionReader.Load(tsvStream, genomeAssembly, refNameToChromosome, numRefSeqs);

         _matcher = new Matcher(forest);
     }
 }
예제 #26
0
        // Creates a PhylopWriter with the given interval length and an empty score buffer.
        // The header is written to the supplied writer during construction.
        internal PhylopWriter(string refSeqName, DataSourceVersion version, GenomeAssembly genomeAssembly,
                              int intervalLength, ExtendedBinaryWriter writer)
            : this(refSeqName, version, genomeAssembly, intervalLength)
        {
            _writer = writer;
            _scoreCount = 0; // the score buffer is empty

            WriteHeader();

            _currentInterval = new PhylopInterval(100, 0, 1);
        }
예제 #27
0
 /// <summary>
 /// Stores the VEP version, VEP release ticks, transcript source, genome assembly and
 /// number of reference sequences in the matching members.
 /// </summary>
 public IntermediateIoHeader(ushort vepVersion, long vepReleaseTicks, Source transcriptSource,
                             GenomeAssembly genomeAssembly, int numRefSeqs)
 {
     _numRefSeqs = numRefSeqs;
     GenomeAssembly = genomeAssembly;
     Source = transcriptSource;
     VepReleaseTicks = vepReleaseTicks;
     VepVersion = vepVersion;
 }
        /// <summary>
        /// Builds the three-line error message that lists which genome assembly the reference sequence
        /// provider and the transcript annotation provider are using.
        /// </summary>
        private static string GetAssemblyErrorMessage(GenomeAssembly cacheAssembly, GenomeAssembly refAssembly)
        {
            var builder = StringBuilderCache.Acquire();

            builder.AppendLine("Not all of the data sources have the same genome assembly:")
                   .AppendLine($"- Using {refAssembly}: Reference sequence provider")
                   .AppendLine($"- Using {cacheAssembly}: Transcript annotation provider");

            string message = StringBuilderCache.GetStringAndRelease(builder);
            return message;
        }
예제 #29
0
 /// <summary>
 /// Writes the reference sequences to the stream created for _outputCompressedPath.
 /// The writer is disposed once the sequences have been written.
 /// </summary>
 private static void CreateReferenceSequenceFile(GenomeAssembly genomeAssembly, byte patchLevel,
                                                 IReadOnlyCollection<IChromosome> chromosomes, List<Creation.ReferenceSequence> referenceSequences)
 {
     var outputStream = FileUtilities.GetCreateStream(_outputCompressedPath);

     using (var sequenceWriter = new ReferenceSequenceWriter(outputStream, chromosomes, genomeAssembly, patchLevel))
     {
         sequenceWriter.Write(referenceSequences);
     }
 }
예제 #30
0
        /// <summary>
        /// Creates a mocked ISequenceProvider that returns the given assembly, the shared
        /// ChromosomeUtilities.RefNameToChromosome lookup and a SimpleSequence built from
        /// refSequence with an offset of start - 1.
        /// </summary>
        private static ISequenceProvider GetSequenceProvider(GenomeAssembly assembly, int start, string refSequence)
        {
            var mock     = new Mock<ISequenceProvider>();
            var sequence = new SimpleSequence(refSequence, start - 1);

            mock.Setup(provider => provider.Sequence).Returns(sequence);
            mock.Setup(provider => provider.Assembly).Returns(assembly);
            mock.Setup(provider => provider.RefNameToChromosome).Returns(ChromosomeUtilities.RefNameToChromosome);

            return mock.Object;
        }