/// <summary>
/// Validates the custom STR file referenced by <paramref name="config"/>.
/// </summary>
/// <param name="config">Validation request; its id, genome assembly and custom STR URL are read here.</param>
/// <param name="context">Lambda context used for logging.</param>
/// <returns>
/// The value produced by <c>GetSuccessOutput</c> when every step completes, or the value
/// produced by <c>HandleException</c> when any step inside the try block throws.
/// </returns>
public ValidationResult Run(ValidationConfig config, ILambdaContext context)
{
    // Declared before the try block so the catch handler can forward whatever was resolved.
    string topicArn = null;

    try
    {
        LogUtilities.UpdateLogger(context.Logger, null);
        LogUtilities.LogLambdaInfo(context, CommandLineUtilities.InformationalVersion);
        LogUtilities.LogObject("Config", config);

        var loggedKeys = new[] { LambdaUrlHelper.UrlBaseEnvironmentVariableName, LambdaUtilities.SnsTopicKey };
        LogUtilities.Log(loggedKeys);

        LambdaUtilities.GarbageCollect();

        topicArn = LambdaUtilities.GetEnvironmentVariable(LambdaUtilities.SnsTopicKey);
        config.Validate();

        GenomeAssembly assembly = GenomeAssemblyHelper.Convert(config.genomeAssembly);
        string referenceUrl = LambdaUrlHelper.GetRefUrl(assembly);
        var sequenceProvider = ProviderUtilities.GetSequenceProvider(referenceUrl);

        using (var strStream = PersistentStreamUtils.GetReadStream(config.customStrUrl))
        {
            TryLoadStrFile(strStream, assembly, sequenceProvider);
        }
    }
    catch (Exception exception)
    {
        return HandleException(config.id, exception, topicArn);
    }

    return GetSuccessOutput(config.id);
}
/// <summary>
/// Creates an ExAC TSV writer: prints the data source version to the console and
/// builds the underlying <c>SaTsvWriter</c>.
/// </summary>
/// <param name="version">Data source version; echoed to the console and passed to the writer.</param>
/// <param name="outputDirectory">Passed to the underlying writer as its first argument.</param>
/// <param name="genomeAssembly">Genome assembly; passed to the writer as a string.</param>
/// <param name="sequenceProvider">Sequence provider handed to the underlying writer.</param>
public ExacTsvWriter(DataSourceVersion version, string outputDirectory, GenomeAssembly genomeAssembly, ISequenceProvider sequenceProvider)
{
    Console.WriteLine(version.ToString());

    string assemblyName = genomeAssembly.ToString();

    // NOTE(review): the schema version constant is OneKgenSchemaVersion - confirm this is intended for ExAC.
    _writer = new SaTsvWriter(
        outputDirectory,
        version,
        assemblyName,
        SaTsvCommon.OneKgenSchemaVersion,
        InterimSaCommon.ExacTag,
        null,
        true,
        sequenceProvider);
}
/// <summary>
/// Loads a chunked index from <paramref name="stream"/>.
/// At most 1,048,576 bytes are read from the stream; the header fields and the
/// per-chromosome chunk lists are then parsed from that in-memory copy.
/// </summary>
/// <param name="stream">Stream positioned at the start of the index.</param>
public ChunkedIndex(Stream stream)
{
    const int maxIndexSize = 1048576;
    var buffer = new byte[maxIndexSize];

    // Stream.Read may return fewer bytes than requested even when more data is available,
    // so keep reading until the buffer is full or the stream reports end-of-stream (0 bytes).
    var indexLength = 0;
    int bytesRead;
    while (indexLength < maxIndexSize &&
           (bytesRead = stream.Read(buffer, indexLength, maxIndexSize - indexLength)) > 0)
    {
        indexLength += bytesRead;
    }

    using (var memStream = new MemoryStream(buffer, 0, indexLength))
    using (var memReader = new ExtendedBinaryReader(memStream))
    {
        // Header fields, in the order they are serialized.
        Assembly      = (GenomeAssembly)memReader.ReadByte();
        Version       = DataSourceVersion.Read(memReader);
        JsonKey       = memReader.ReadAsciiString();
        MatchByAllele = memReader.ReadBoolean();
        IsArray       = memReader.ReadBoolean();
        SchemaVersion = memReader.ReadOptInt32();
        IsPositional  = memReader.ReadBoolean();

        var chromCount = memReader.ReadOptInt32();
        _chromChunks = new Dictionary<ushort, List<Chunk>>(chromCount);

        for (var i = 0; i < chromCount; i++)
        {
            var chromIndex = memReader.ReadOptUInt16();
            var chunkCount = memReader.ReadOptInt32();

            // Register the list first, then fill it through the local to avoid a lookup per chunk.
            var chunks = new List<Chunk>(chunkCount);
            _chromChunks[chromIndex] = chunks;

            for (var j = 0; j < chunkCount; j++)
            {
                chunks.Add(new Chunk(memReader));
            }
        }
    }
}
/// <summary>
/// Creates a gnomAD TSV writer: prints the data source version to the console and
/// builds the underlying <c>SaTsvWriter</c>.
/// </summary>
/// <param name="version">Data source version; echoed to the console and passed to the writer.</param>
/// <param name="outputDirectory">Passed to the underlying writer as its first argument.</param>
/// <param name="genomeAssembly">Genome assembly; passed to the writer as a string.</param>
/// <param name="sequenceProvider">Sequence provider handed to the underlying writer.</param>
/// <param name="sequencingDataType">Key used to look up the JSON key in <c>_jsonKeyDictionary</c>.</param>
public GnomadTsvWriter(DataSourceVersion version, string outputDirectory, GenomeAssembly genomeAssembly, ISequenceProvider sequenceProvider, string sequencingDataType)
{
    Console.WriteLine(version.ToString());

    string assemblyName = genomeAssembly.ToString();

    // The indexer lookup selects the JSON key that matches the sequencing data type.
    var jsonKey = _jsonKeyDictionary[sequencingDataType];

    _writer = new SaTsvWriter(
        outputDirectory,
        version,
        assemblyName,
        SaTsvCommon.SchemaVersion,
        jsonKey,
        null,
        true,
        sequenceProvider);
}
/// <summary>
/// Creates a TOPMed TSV writer: prints the data source version to the console and
/// builds the underlying <c>SaTsvWriter</c>.
/// </summary>
/// <param name="version">Data source version; echoed to the console and passed to the writer.</param>
/// <param name="outputFileName">Passed to the underlying writer as its first argument.</param>
/// <param name="genomeAssembly">Genome assembly; passed to the writer as a string.</param>
/// <param name="sequenceProvider">Sequence provider handed to the underlying writer.</param>
public TopMedTsvWriter(DataSourceVersion version, string outputFileName, GenomeAssembly genomeAssembly, ISequenceProvider sequenceProvider)
{
    Console.WriteLine(version.ToString());

    string assemblyName = genomeAssembly.ToString();

    _writer = new SaTsvWriter(
        outputFileName,
        version,
        assemblyName,
        SaTsvCommon.SchemaVersion,
        InterimSaCommon.TopMedTag,
        null,
        true,
        sequenceProvider);
}
/// <summary>
/// Sets up a merger over interim TSV files: opens the reference, creates readers for each
/// input category, collects their headers and reference names, displays the data sources
/// and checks the assembly consistency of the supplementary annotation headers.
/// </summary>
/// <param name="annotationFiles">Files handed to <c>ReaderUtilities.GetSaTsvReaders</c>.</param>
/// <param name="intervalFiles">Files handed to <c>ReaderUtilities.GetIntervalReaders</c>.</param>
/// <param name="miscFile">File handed to <c>ReaderUtilities.GetMiscTsvReader</c>.</param>
/// <param name="geneFiles">Files handed to <c>ReaderUtilities.GetGeneReaders</c>.</param>
/// <param name="compressedReference">Path of the compressed reference that supplies the genome assembly and chromosome dictionary.</param>
/// <param name="outputDirectory">Stored for later use.</param>
public InterimTsvsMerger(IEnumerable<string> annotationFiles, IEnumerable<string> intervalFiles, string miscFile, IEnumerable<string> geneFiles, string compressedReference, string outputDirectory)
{
    _outputDirectory = outputDirectory;

    // Reference: source of the genome assembly and the name -> chromosome map.
    var referenceStream   = FileUtilities.GetReadStream(compressedReference);
    var referenceProvider = new ReferenceSequenceProvider(referenceStream);
    _genomeAssembly       = referenceProvider.GenomeAssembly;
    _refNameToChromosome  = referenceProvider.RefNameToChromosome;

    // Readers, one set per input category.
    _tsvReaders      = ReaderUtilities.GetSaTsvReaders(annotationFiles);
    _miscReader      = ReaderUtilities.GetMiscTsvReader(miscFile);
    _geneReaders     = ReaderUtilities.GetGeneReaders(geneFiles);
    _intervalReaders = ReaderUtilities.GetIntervalReaders(intervalFiles);

    // Headers: small-variant and interval headers share one list, gene headers are kept apart.
    _saHeaders = new List<SaHeader>();
    _saHeaders.AddRange(ReaderUtilities.GetTsvHeaders(_tsvReaders));
    _saHeaders.AddRange(ReaderUtilities.GetTsvHeaders(_intervalReaders));
    _geneHeaders = ReaderUtilities.GetTsvHeaders(_geneReaders)?.ToList();

    // Union of the reference names seen by every reader; the misc reader is only consulted when it is non-null.
    _refNames = new HashSet<string>();
    _refNames.UnionWith(ReaderUtilities.GetRefNames(_tsvReaders));
    _refNames.UnionWith(ReaderUtilities.GetRefNames(_intervalReaders));
    if (_miscReader != null)
    {
        _refNames.UnionWith(_miscReader.RefNames);
    }

    DisplayDataSources(_saHeaders, _geneHeaders);
    MergeUtilities.CheckAssemblyConsistancy(_saHeaders);
}
/// <summary>
/// Builds a <c>ClinvarVariant</c> from a sequence-location XML element.
/// </summary>
/// <param name="xElement">The location element; may be null.</param>
/// <param name="genomeAssembly">
/// Assembly the element must match. When it is <c>GenomeAssembly.Unknown</c> the assembly check is skipped.
/// </param>
/// <param name="refChromDict">Maps chromosome names to chromosome objects.</param>
/// <param name="variantId">Variant id forwarded to the created variant.</param>
/// <returns>
/// Null when <paramref name="xElement"/> is null, when the element's assembly does not match,
/// or when <c>stop - start + 1</c> exceeds <c>MaxVariantLength</c>; otherwise the new variant.
/// The variant's chromosome is null when the chromosome attribute is missing or not in the dictionary.
/// </returns>
private static ClinvarVariant GetClinvarVariant(XElement xElement, GenomeAssembly genomeAssembly, IDictionary<string, IChromosome> refChromDict, int? variantId)
{
    if (xElement == null)
    {
        return null;
    }

    //<SequenceLocation Assembly="GRCh38" Chr="17" Accession="NC_000017.11" start="43082402" stop="43082402" variantLength="1" referenceAllele="A" alternateAllele="C" />
    if (genomeAssembly.ToString() != xElement.Attribute(AssemblyTag)?.Value && genomeAssembly != GenomeAssembly.Unknown)
    {
        return null;
    }

    // A missing Chr attribute yields a null name; dictionary lookups throw on a null key,
    // so treat it like an unknown chromosome instead. TryGetValue also avoids a second lookup.
    string chromosomeName = xElement.Attribute(ChrTag)?.Value;
    IChromosome chromosome = null;
    if (chromosomeName != null)
    {
        refChromDict.TryGetValue(chromosomeName, out chromosome);
    }

    // Convert.ToInt32 returns 0 for a null string, so missing start/stop attributes become 0.
    int start = Convert.ToInt32(xElement.Attribute(StartTag)?.Value);
    int stop  = Convert.ToInt32(xElement.Attribute(StopTag)?.Value);

    string referenceAllele = xElement.Attribute(RefAlleleTag)?.Value;
    string altAllele       = xElement.Attribute(AltAlleleTag)?.Value;

    if (stop - start + 1 > MaxVariantLength)
    {
        return null;
    }

    // May rewrite start and both alleles in place; stop is passed on as read.
    AdjustVariant(ref start, ref referenceAllele, ref altAllele);

    return new ClinvarVariant(chromosome, start, stop, variantId, referenceAllele, altAllele);
}
/// <summary>
/// Creates a sequence provider that simply exposes the supplied values.
/// </summary>
/// <param name="assembly">Value stored in <c>Assembly</c>.</param>
/// <param name="sequence">Value stored in <c>Sequence</c>.</param>
/// <param name="refNameToChromosome">Value stored in <c>RefNameToChromosome</c>.</param>
public SimpleSequenceProvider(
    GenomeAssembly assembly,
    ISequence sequence,
    IDictionary<string, IChromosome> refNameToChromosome)
{
    Assembly            = assembly;
    Sequence            = sequence;
    RefNameToChromosome = refNameToChromosome;
}
/// <summary>
/// Reads two prediction cache files side by side and merges them one reference sequence at a time.
/// Also records the genome assembly and reference sequence count of the first file in
/// <c>_genomeAssembly</c> and <c>_numRefSeq</c>.
/// </summary>
/// <param name="path1">Path of the first prediction cache; it drives the number of iterations.</param>
/// <param name="path2">Path of the second prediction cache.</param>
/// <returns>One merged cache per reference sequence index.</returns>
/// <exception cref="UserErrorException">The two files report different genome assemblies.</exception>
/// <exception cref="DataMisalignedException">Exactly one of the two caches is empty for some index.</exception>
private List<PredictionCache> GetMergedPredictions(string path1, string path2)
{
    var merged = new List<PredictionCache>();

    using (var firstReader = new PredictionCacheReader(FileUtilities.GetReadStream(path1)))
    using (var secondReader = new PredictionCacheReader(FileUtilities.GetReadStream(path2)))
    {
        _genomeAssembly = firstReader.FileHeader.GenomeAssembly;
        _numRefSeq      = firstReader.FileHeader.Index.Size;

        if (_genomeAssembly != secondReader.FileHeader.GenomeAssembly)
        {
            throw new UserErrorException($"Observed different genome assemblies: {firstReader.FileHeader.GenomeAssembly}, {secondReader.FileHeader.GenomeAssembly}");
        }

        for (ushort refIndex = 0; refIndex < _numRefSeq; refIndex++)
        {
            var firstCache  = firstReader.Read(refIndex);
            var secondCache = secondReader.Read(refIndex);

            // Both empty or both populated is fine; a mismatch means the files are out of step.
            bool firstIsEmpty  = firstCache == PredictionCache.Empty;
            bool secondIsEmpty = secondCache == PredictionCache.Empty;
            if (firstIsEmpty != secondIsEmpty)
            {
                throw new DataMisalignedException("one of the cache ran out before the other");
            }

            merged.Add(firstCache.GetMergedCache(secondCache));
        }

        // TODO: handle reference sequences that exist in only one of the caches
    }

    return merged;
}
/// <summary>
/// Round-trip test: writes a <c>CacheHeader</c> to a memory stream, reads it back and
/// checks that source, creation time, assembly and VEP version survive unchanged.
/// </summary>
public void CacheHeader_EndToEnd()
{
    // Arrange
    const Source expectedSource           = Source.BothRefSeqAndEnsembl;
    const long expectedTicks              = long.MaxValue;
    const GenomeAssembly expectedAssembly = GenomeAssembly.hg19;
    const ushort expectedVepVersion       = ushort.MaxValue;

    var baseHeader   = new Header("VEP", 1, 2, expectedSource, expectedTicks, expectedAssembly);
    var customHeader = new TranscriptCacheCustomHeader(expectedVepVersion, 0);
    var original     = new CacheHeader(baseHeader, customHeader);

    // Act
    CacheHeader roundTripped;
    using (var buffer = new MemoryStream())
    {
        // leaveOpen = true: disposing the writer must not close the stream we read back from.
        using (var writer = new BinaryWriter(buffer, Encoding.UTF8, true))
        {
            original.Write(writer);
        }

        buffer.Seek(0, SeekOrigin.Begin);
        roundTripped = CacheHeader.Read(buffer);
    }

    // Assert
    Assert.NotNull(roundTripped);
    Assert.Equal(expectedSource, roundTripped.Source);
    Assert.Equal(expectedTicks, roundTripped.CreationTimeTicks);
    Assert.Equal(expectedAssembly, roundTripped.Assembly);
    Assert.Equal(expectedVepVersion, roundTripped.Custom.VepVersion);
}
/// <summary>
/// Builds the cache path prefix for a VEP version and genome assembly.
/// </summary>
/// <param name="vepVersion">VEP version; version 84 gets an extra <c>VEP84/</c> folder segment.</param>
/// <param name="genomeAssembly">Genome assembly used as the first part of the suffix.</param>
/// <returns>The result of <c>UrlCombine</c> applied to the cache folder and the assembly/source suffix.</returns>
public static string GetCachePathPrefix(int vepVersion, GenomeAssembly genomeAssembly)
{
    string cacheFolder = LambdaUrlHelper.GetBaseUrl() + "ab0cf104f39708eabd07b8cb67e149ba-Cache/26/";
    string suffix      = $"{genomeAssembly}/{LambdaUrlHelper.DefaultCacheSource}";

    // Only VEP 84 lives in its own sub-folder; every other version uses the cache folder directly.
    string folder = cacheFolder;
    if (vepVersion == 84)
    {
        folder = $"{cacheFolder}VEP84/";
    }

    return UrlCombine(folder, suffix);
}
/// <summary>
/// Creates an annotator from a transcript annotation provider and a sequence provider,
/// then sets <c>GenomeAssembly</c> from <c>GetGenomeAssembly()</c>.
/// </summary>
/// <param name="taProvider">Stored in <c>_taProvider</c>.</param>
/// <param name="sequenceProvider">Stored in <c>_sequenceProvider</c>.</param>
public PianoAnnotator(IAnnotationProvider taProvider, ISequenceProvider sequenceProvider)
{
    _taProvider       = taProvider;
    _sequenceProvider = sequenceProvider;

    // Called last, after both provider fields have been assigned.
    GenomeAssembly = GetGenomeAssembly();
}
/// <summary>
/// Creates an extractor for the given genome assembly and opens a reader for each VCF path
/// that is neither null nor empty.
/// </summary>
/// <param name="assembly">Genome assembly name; must be exactly "GRCh37", "GRCh38" or "hg19".</param>
/// <param name="oneKGenomeVcf">Path of the 1000 Genomes VCF, or null/empty to skip it.</param>
/// <param name="clinvarVcf">Path of the ClinVar VCF, or null/empty to skip it.</param>
/// <param name="cosmicVcf">Path of the COSMIC VCF, or null/empty to skip it.</param>
/// <exception cref="ArgumentException"><paramref name="assembly"/> is not one of the supported names.</exception>
public MustGenotypeExtractor(string assembly, string oneKGenomeVcf, string clinvarVcf, string cosmicVcf)
{
    switch (assembly)
    {
        case "GRCh37":
            _assembly = GenomeAssembly.GRCh37;
            break;
        case "GRCh38":
            _assembly = GenomeAssembly.GRCh38;
            break;
        case "hg19":
            _assembly = GenomeAssembly.hg19;
            break;
        default:
            // Fail before any reader is opened. ArgumentException derives from Exception,
            // so existing catch (Exception) handlers keep working.
            throw new ArgumentException($"Genome assembly must be GRCh37, GRCh38 or hg19 (observed: '{assembly}')", nameof(assembly));
    }

    // A null reader means the corresponding data source was not supplied.
    _oneKGenomeReader = string.IsNullOrEmpty(oneKGenomeVcf) ? null : GZipUtilities.GetAppropriateStreamReader(oneKGenomeVcf);
    _clinvarReader    = string.IsNullOrEmpty(clinvarVcf)    ? null : GZipUtilities.GetAppropriateStreamReader(clinvarVcf);
    _cosmicReader     = string.IsNullOrEmpty(cosmicVcf)     ? null : GZipUtilities.GetAppropriateStreamReader(cosmicVcf);
}
/// <summary>
/// Creates a COSMIC TSV writer: prints the data source version to the console and
/// builds the underlying <c>SaTsvWriter</c>.
/// </summary>
/// <param name="version">Data source version; echoed to the console and passed to the writer.</param>
/// <param name="outputDirectory">Passed to the underlying writer as its first argument.</param>
/// <param name="genomeAssembly">Genome assembly; passed to the writer as a string.</param>
/// <param name="sequenceProvider">Sequence provider handed to the underlying writer.</param>
public CosmicTsvWriter(DataSourceVersion version, string outputDirectory, GenomeAssembly genomeAssembly, ISequenceProvider sequenceProvider)
{
    Console.WriteLine(version.ToString());

    string assemblyName = genomeAssembly.ToString();

    // The boolean arguments (false, ..., true) are forwarded as-is; their meaning is defined by SaTsvWriter.
    _writer = new SaTsvWriter(
        outputDirectory,
        version,
        assemblyName,
        SaTsvCommon.CosmicSchemaVersion,
        InterimSaCommon.CosmicTag,
        InterimSaCommon.CosmicVcfTag,
        false,
        sequenceProvider,
        true);
}
/// <summary>
/// Loads an NSA index: copies the whole of <paramref name="stream"/> into memory, then parses
/// the header fields and the per-chromosome index blocks from that copy.
/// </summary>
/// <param name="stream">Stream containing the serialized index.</param>
public NsaIndex(Stream stream)
{
    using (var memStream = new MemoryStream())
    using (var memReader = new ExtendedBinaryReader(memStream))
    {
        // Pull every remaining byte into memory and rewind before parsing.
        stream.CopyTo(memStream);
        memStream.Position = 0;

        // Header fields, in the order they are serialized.
        Assembly      = (GenomeAssembly)memReader.ReadByte();
        Version       = DataSourceVersion.Read(memReader);
        JsonKey       = memReader.ReadAsciiString();
        MatchByAllele = memReader.ReadBoolean();
        IsArray       = memReader.ReadBoolean();
        SchemaVersion = memReader.ReadOptInt32();
        IsPositional  = memReader.ReadBoolean();

        var chromCount = memReader.ReadOptInt32();
        _chromBlocks = new Dictionary<ushort, List<NsaIndexBlock>>(chromCount);

        for (var chrom = 0; chrom < chromCount; chrom++)
        {
            var chromIndex = memReader.ReadOptUInt16();
            var blockCount = memReader.ReadOptInt32();

            // Register the list under its chromosome first, then fill it through the local reference.
            var blocks = new List<NsaIndexBlock>(blockCount);
            _chromBlocks[chromIndex] = blocks;

            for (var block = 0; block < blockCount; block++)
            {
                blocks.Add(new NsaIndexBlock(memReader));
            }
        }
    }
}
/// <summary>
/// Creates a transcript cache builder that remembers the supplied header values.
/// </summary>
/// <param name="genomeAssembly">Stored in <c>_genomeAssembly</c>.</param>
/// <param name="source">Stored in <c>_source</c>.</param>
/// <param name="vepReleaseTicks">Stored in <c>_vepReleaseTicks</c>.</param>
/// <param name="vepVersion">Stored in <c>_vepVersion</c>.</param>
public TranscriptCacheBuilder(
    GenomeAssembly genomeAssembly,
    Source source,
    long vepReleaseTicks,
    ushort vepVersion)
{
    _genomeAssembly  = genomeAssembly;
    _source          = source;
    _vepReleaseTicks = vepReleaseTicks;
    _vepVersion      = vepVersion;
}
/// <summary>
/// Creates a reader from already-loaded metadata and interval arrays; the arrays are wrapped
/// in an <c>IntervalForest</c>.
/// </summary>
/// <param name="assembly">Stored in <c>Assembly</c>.</param>
/// <param name="version">Stored in <c>Version</c>.</param>
/// <param name="jsonKey">Stored in <c>JsonKey</c>.</param>
/// <param name="reportFor">Stored in <c>ReportFor</c>.</param>
/// <param name="intervalArrays">Interval arrays used to build the interval forest.</param>
private NsiReader(
    GenomeAssembly assembly,
    IDataSourceVersion version,
    string jsonKey,
    ReportFor reportFor,
    IntervalArray<string>[] intervalArrays)
{
    Assembly  = assembly;
    Version   = version;
    JsonKey   = jsonKey;
    ReportFor = reportFor;

    _intervalForest = new IntervalForest<string>(intervalArrays);
}
/// <summary>
/// Creates a writer that reads scores from a wigFix file. Chains to the four-argument
/// constructor with a null first argument, then opens the input reader and records the
/// version and output directory.
/// </summary>
/// <param name="inputWigFixFile">Path opened through <c>GZipUtilities.GetAppropriateStreamReader</c>.</param>
/// <param name="version">Data source version; forwarded to the chained constructor and stored in <c>_version</c>.</param>
/// <param name="genomeAssembly">Genome assembly forwarded to the chained constructor.</param>
/// <param name="outputNirvanaDirectory">Stored in <c>_outputNirvanaDirectory</c>.</param>
/// <param name="intervalLength">Interval length forwarded to the chained constructor; defaults to <c>PhylopCommon.MaxIntervalLength</c>.</param>
public PhylopWriter(
    string inputWigFixFile,
    DataSourceVersion version,
    GenomeAssembly genomeAssembly,
    string outputNirvanaDirectory,
    int intervalLength = PhylopCommon.MaxIntervalLength)
    : this(null, version, genomeAssembly, intervalLength)
{
    _version                = version;
    _reader                 = GZipUtilities.GetAppropriateStreamReader(inputWigFixFile);
    _outputNirvanaDirectory = outputNirvanaDirectory;
}
/// <summary>
/// Writes the two-line header to the output file: first the global header tag and the file
/// type, then the VEP version, the VEP release date in ticks, the transcript source and the
/// genome assembly. Fields are tab-separated and the enum values are written as bytes.
/// </summary>
/// <param name="writer">Destination writer.</param>
/// <param name="fileType">File type written on the first line.</param>
/// <param name="transcriptSource">Transcript source written on the second line.</param>
/// <param name="genomeAssembly">Genome assembly written on the second line.</param>
private static void WriteHeader(StreamWriter writer, GlobalImportCommon.FileType fileType, TranscriptDataSource transcriptSource, GenomeAssembly genomeAssembly)
{
    // Parsed up front, so a malformed release date fails before anything is written.
    var vepReleaseTicks = DateTime.Parse(ConfigurationSettings.VepReleaseDate).Ticks;

    byte fileTypeCode = (byte)fileType;
    writer.WriteLine("{0}\t{1}", GlobalImportCommon.Header, fileTypeCode);

    byte sourceCode   = (byte)transcriptSource;
    byte assemblyCode = (byte)genomeAssembly;
    writer.WriteLine("{0}\t{1}\t{2}\t{3}", ConfigurationSettings.VepVersion, vepReleaseTicks, sourceCode, assemblyCode);
}
/// <summary>
/// Creates a global import header from its four components.
/// </summary>
/// <param name="vepVersion">Stored in <c>VepVersion</c>.</param>
/// <param name="vepReleaseTicks">Stored in <c>VepReleaseTicks</c>.</param>
/// <param name="transcriptSource">Stored in <c>TranscriptSource</c>.</param>
/// <param name="genomeAssembly">Stored in <c>GenomeAssembly</c>.</param>
public GlobalImportHeader(
    ushort vepVersion,
    long vepReleaseTicks,
    TranscriptDataSource transcriptSource,
    GenomeAssembly genomeAssembly)
{
    VepVersion       = vepVersion;
    VepReleaseTicks  = vepReleaseTicks;
    TranscriptSource = transcriptSource;
    GenomeAssembly   = genomeAssembly;
}
/// <summary>
/// Creates a reader from already-loaded metadata and a dictionary of interval arrays,
/// which is stored as-is.
/// </summary>
/// <param name="assembly">Stored in <c>Assembly</c>.</param>
/// <param name="version">Stored in <c>Version</c>.</param>
/// <param name="jsonKey">Stored in <c>JsonKey</c>.</param>
/// <param name="reportFor">Stored in <c>ReportFor</c>.</param>
/// <param name="intervalArrays">Interval arrays keyed by an unsigned 16-bit value; stored in <c>_intervalArrays</c>.</param>
private NsiReader(
    GenomeAssembly assembly,
    IDataSourceVersion version,
    string jsonKey,
    ReportFor reportFor,
    Dictionary<ushort, IntervalArray<string>> intervalArrays)
{
    Assembly  = assembly;
    Version   = version;
    JsonKey   = jsonKey;
    ReportFor = reportFor;

    _intervalArrays = intervalArrays;
}
/// <summary>
/// Creates a 1000 Genomes TSV writer: prints the data source version to the console and
/// builds two underlying writers, one for the 1000 Genomes entries and one for the
/// reference-minor entries.
/// </summary>
/// <param name="version">Data source version; echoed to the console and passed to both writers.</param>
/// <param name="outputDirectory">Passed to both underlying writers as their first argument.</param>
/// <param name="genomeAssembly">Genome assembly; passed to both writers as a string.</param>
/// <param name="sequenceProvider">Sequence provider handed to both writers.</param>
public OnekgTsvWriter(DataSourceVersion version, string outputDirectory, GenomeAssembly genomeAssembly, ISequenceProvider sequenceProvider)
{
    Console.WriteLine(version.ToString());

    _onekgWriter = new SaTsvWriter(
        outputDirectory,
        version,
        genomeAssembly.ToString(),
        SaTsvCommon.OneKgenSchemaVersion,
        InterimSaCommon.OneKgenTag,
        "AF1000G",
        true,
        sequenceProvider);

    _refMinorWriter = new SaMiscTsvWriter(
        outputDirectory,
        version,
        genomeAssembly.ToString(),
        InterimSaCommon.RefMinorTag,
        sequenceProvider);
}
/// <summary>
/// Creates a supplementary annotation header from its components.
/// </summary>
/// <param name="referenceSequenceName">Stored in <c>ReferenceSequenceName</c>.</param>
/// <param name="creationTimeTicks">Stored in <c>CreationTimeTicks</c>.</param>
/// <param name="dataVersion">Stored in <c>DataVersion</c>.</param>
/// <param name="dataSourceVersions">Stored in <c>DataSourceVersions</c>; the sequence is not copied.</param>
/// <param name="genomeAssembly">Stored in <c>GenomeAssembly</c>.</param>
public SupplementaryAnnotationHeader(
    string referenceSequenceName,
    long creationTimeTicks,
    ushort dataVersion,
    IEnumerable<IDataSourceVersion> dataSourceVersions,
    GenomeAssembly genomeAssembly)
{
    ReferenceSequenceName = referenceSequenceName;
    CreationTimeTicks     = creationTimeTicks;
    DataVersion           = dataVersion;
    DataSourceVersions    = dataSourceVersions;
    GenomeAssembly        = genomeAssembly;
}
/// <summary>
/// Adds a <c>RemoteFile</c> entry for <paramref name="path"/> to <paramref name="files"/>.
/// The remote path is <paramref name="path"/> itself; the local path is
/// <paramref name="saDirectory"/>/&lt;assembly&gt;/&lt;file name&gt;; the description is
/// "&lt;file name&gt; (&lt;assembly&gt;)".
/// </summary>
/// <param name="files">Collection that receives the new entry.</param>
/// <param name="genomeAssembly">Genome assembly used for the local sub-directory and the description.</param>
/// <param name="saDirectory">Root directory of the local copy.</param>
/// <param name="path">Remote path of the file.</param>
private static void AddFile(this ICollection<RemoteFile> files, GenomeAssembly genomeAssembly, string saDirectory, string path)
{
    string fileName     = Path.GetFileName(path);
    string assemblyName = genomeAssembly.ToString();

    string localPath   = Path.Combine(saDirectory, assemblyName, fileName);
    string description = $"{fileName} ({genomeAssembly})";

    var remoteFile = new RemoteFile(path, localPath, description);
    files.Add(remoteFile);
}
/// <summary>
/// Creates a provider by loading repeat expansion phenotypes from a TSV stream and wrapping
/// the resulting interval forest in a <c>Matcher</c>. The stream is disposed before the
/// constructor returns.
/// </summary>
/// <param name="genomeAssembly">Genome assembly passed to both the stream lookup and the loader.</param>
/// <param name="refNameToChromosome">Chromosome dictionary passed to the loader.</param>
/// <param name="numRefSeqs">Number of reference sequences passed to the loader.</param>
/// <param name="customTsvPath">Custom TSV path passed to <c>GetTsvStream</c>.</param>
public RepeatExpansionProvider(
    GenomeAssembly genomeAssembly,
    IDictionary<string, IChromosome> refNameToChromosome,
    int numRefSeqs,
    string customTsvPath)
{
    using (Stream tsvStream = GetTsvStream(genomeAssembly, customTsvPath))
    {
        IIntervalForest<RepeatExpansionPhenotype> forest =
            RepeatExpansionReader.Load(tsvStream, genomeAssembly, refNameToChromosome, numRefSeqs);

        _matcher = new Matcher(forest);
    }
}
/// <summary>
/// Creates a writer with the given interval length and an empty score buffer that writes to
/// <paramref name="writer"/>. Chains to the four-argument constructor, writes the header
/// immediately and then initializes the current interval.
/// </summary>
/// <param name="refSeqName">Reference sequence name forwarded to the chained constructor.</param>
/// <param name="version">Data source version forwarded to the chained constructor.</param>
/// <param name="genomeAssembly">Genome assembly forwarded to the chained constructor.</param>
/// <param name="intervalLength">Interval length forwarded to the chained constructor.</param>
/// <param name="writer">Binary writer stored in <c>_writer</c>.</param>
internal PhylopWriter(
    string refSeqName,
    DataSourceVersion version,
    GenomeAssembly genomeAssembly,
    int intervalLength,
    ExtendedBinaryWriter writer)
    : this(refSeqName, version, genomeAssembly, intervalLength)
{
    // The score buffer starts out empty.
    _scoreCount = 0;
    _writer     = writer;

    // The writer is assigned before the header is written.
    WriteHeader();

    _currentInterval = new PhylopInterval(100, 0, 1);
}
/// <summary>
/// Creates an intermediate I/O header from its components.
/// </summary>
/// <param name="vepVersion">Stored in <c>VepVersion</c>.</param>
/// <param name="vepReleaseTicks">Stored in <c>VepReleaseTicks</c>.</param>
/// <param name="transcriptSource">Stored in <c>Source</c>.</param>
/// <param name="genomeAssembly">Stored in <c>GenomeAssembly</c>.</param>
/// <param name="numRefSeqs">Stored in <c>_numRefSeqs</c>.</param>
public IntermediateIoHeader(
    ushort vepVersion,
    long vepReleaseTicks,
    Source transcriptSource,
    GenomeAssembly genomeAssembly,
    int numRefSeqs)
{
    VepVersion      = vepVersion;
    VepReleaseTicks = vepReleaseTicks;
    Source          = transcriptSource;
    GenomeAssembly  = genomeAssembly;
    _numRefSeqs     = numRefSeqs;
}
/// <summary>
/// Formats the three-line error message reported when the reference sequence provider and
/// the transcript annotation provider use different genome assemblies.
/// </summary>
/// <param name="cacheAssembly">Assembly reported for the transcript annotation provider.</param>
/// <param name="refAssembly">Assembly reported for the reference sequence provider.</param>
/// <returns>The message; each line ends with a newline.</returns>
private static string GetAssemblyErrorMessage(GenomeAssembly cacheAssembly, GenomeAssembly refAssembly)
{
    var builder = StringBuilderCache.Acquire();

    builder.AppendLine("Not all of the data sources have the same genome assembly:")
           .AppendLine($"- Using {refAssembly}: Reference sequence provider")
           .AppendLine($"- Using {cacheAssembly}: Transcript annotation provider");

    return StringBuilderCache.GetStringAndRelease(builder);
}
/// <summary>
/// Writes the reference sequences to the file at <c>_outputCompressedPath</c> using a
/// <c>ReferenceSequenceWriter</c>, which is disposed when writing finishes.
/// </summary>
/// <param name="genomeAssembly">Genome assembly passed to the writer.</param>
/// <param name="patchLevel">Patch level passed to the writer.</param>
/// <param name="chromosomes">Chromosomes passed to the writer.</param>
/// <param name="referenceSequences">Sequences to write.</param>
private static void CreateReferenceSequenceFile(
    GenomeAssembly genomeAssembly,
    byte patchLevel,
    IReadOnlyCollection<IChromosome> chromosomes,
    List<Creation.ReferenceSequence> referenceSequences)
{
    var outputStream = FileUtilities.GetCreateStream(_outputCompressedPath);

    using (var sequenceWriter = new ReferenceSequenceWriter(outputStream, chromosomes, genomeAssembly, patchLevel))
    {
        sequenceWriter.Write(referenceSequences);
    }
}
/// <summary>
/// Builds a mocked <c>ISequenceProvider</c> for tests. The mock returns the shared chromosome
/// dictionary, the given assembly, and a <c>SimpleSequence</c> built from
/// <paramref name="refSequence"/> with <paramref name="start"/> minus one as its second argument.
/// </summary>
/// <param name="assembly">Value returned by the mock's <c>Assembly</c> property.</param>
/// <param name="start">Start position; <c>start - 1</c> is passed to the <c>SimpleSequence</c> constructor.</param>
/// <param name="refSequence">Bases passed to the <c>SimpleSequence</c> constructor.</param>
/// <returns>The configured mock object.</returns>
private static ISequenceProvider GetSequenceProvider(GenomeAssembly assembly, int start, string refSequence)
{
    var providerMock = new Mock<ISequenceProvider>();

    providerMock.Setup(p => p.RefNameToChromosome).Returns(ChromosomeUtilities.RefNameToChromosome);
    providerMock.Setup(p => p.Assembly).Returns(assembly);

    var sequence = new SimpleSequence(refSequence, start - 1);
    providerMock.Setup(p => p.Sequence).Returns(sequence);

    return providerMock.Object;
}