// Some transcripts have multiple locations in the genome and may have conflicting scores,
// so we need to load them up, check for duplicates, and resolve them.

/// <summary>
/// Creates a protein conservation writer.
/// </summary>
/// <param name="stream">stream that gets wrapped in an <see cref="ExtendedBinaryWriter"/></param>
/// <param name="groupStream">stream kept for the transcript group output</param>
/// <param name="transcriptData">transcript cache data kept for later use by the writer</param>
/// <param name="version">data source version kept for later use by the writer</param>
public ProteinConservationWriter(
    Stream stream,
    Stream groupStream,
    TranscriptCacheData transcriptData,
    DataSourceVersion version)
{
    _transcriptGroupStream = groupStream;
    _writer                = new ExtendedBinaryWriter(stream);
    _transcriptCacheData   = transcriptData;
    _version               = version;
}
/// <summary>
/// Builds a <see cref="TranscriptCacheStaging"/> from the header and the interval arrays.
/// The genes, transcript regions, miRNAs and peptide sequences are taken from the result
/// of GetUniqueData on the transcript interval arrays.
/// </summary>
/// <param name="header">cache header stored in the resulting cache data</param>
/// <param name="transcriptIntervalArrays">transcript interval arrays</param>
/// <param name="regulatoryRegionIntervalArrays">regulatory region interval arrays</param>
/// <returns>a staging object wrapping the assembled cache data</returns>
public static TranscriptCacheStaging GetStaging(
    CacheHeader header,
    IntervalArray<ITranscript>[] transcriptIntervalArrays,
    IntervalArray<IRegulatoryRegion>[] regulatoryRegionIntervalArrays)
{
    var unique = GetUniqueData(transcriptIntervalArrays);

    var stagedData = new TranscriptCacheData(
        header,
        unique.Genes,
        unique.TranscriptRegions,
        unique.Mirnas,
        unique.PeptideSeqs,
        transcriptIntervalArrays,
        regulatoryRegionIntervalArrays);

    return new TranscriptCacheStaging(stagedData);
}
/// <summary>
/// Sets up the fixture: the reference-index-to-chromosome dictionary, the expected cache
/// header and the expected cache data (one gene, three transcript regions, two miRNAs,
/// one peptide sequence and two regulatory regions, all on chr3).
/// </summary>
public TranscriptCacheReaderTests()
{
    // chromosomes
    var chr1 = new Chromosome("chr1", "1", 0);
    var chr2 = new Chromosome("chr2", "2", 1);
    var chr3 = new Chromosome("chr3", "3", 2);

    _refIndexToChromosome = new Dictionary<ushort, IChromosome>
    {
        [chr1.Index] = chr1,
        [chr2.Index] = chr2,
        [chr3.Index] = chr3
    };

    // header
    var baseHeader   = new Header("test", 2, 3, Source.BothRefSeqAndEnsembl, 4, GenomeAssembly.GRCh38);
    var customHeader = new TranscriptCacheCustomHeader(1, 2);
    _expectedHeader  = new CacheHeader(baseHeader, customHeader);

    // lookup tables
    ITranscriptRegion[] transcriptRegions =
    {
        new TranscriptRegion(TranscriptRegionType.Exon,   1, 100, 199, 300, 399),
        new TranscriptRegion(TranscriptRegionType.Intron, 1, 200, 299, 399, 400),
        new TranscriptRegion(TranscriptRegionType.Exon,   2, 300, 399, 400, 499)
    };

    IInterval[] mirnas =
    {
        new Interval(100, 200),
        new Interval(300, 400)
    };

    string[] peptideSeqs = { "MASE*" };

    IGene[] genes =
    {
        new Gene(chr3, 100, 200, true, "TP53", 300, CompactId.Convert("7157"), CompactId.Convert("ENSG00000141510"))
    };

    // regulatory regions
    IRegulatoryRegion[] regulatoryRegions =
    {
        new RegulatoryRegion(chr3, 1200, 1300, CompactId.Convert("123"), RegulatoryRegionType.enhancer),
        new RegulatoryRegion(chr3, 1250, 1450, CompactId.Convert("456"), RegulatoryRegionType.enhancer)
    };
    var regulatoryRegionIntervalArrays = regulatoryRegions.ToIntervalArrays(3);

    // transcripts
    var transcripts              = GetTranscripts(chr3, genes, transcriptRegions, mirnas);
    var transcriptIntervalArrays = transcripts.ToIntervalArrays(3);

    _expectedCacheData = new TranscriptCacheData(
        _expectedHeader,
        genes,
        transcriptRegions,
        mirnas,
        peptideSeqs,
        transcriptIntervalArrays,
        regulatoryRegionIntervalArrays);
}
/// <summary>
/// Builds a small transcript cache (one gene, three transcript regions, two miRNAs,
/// one peptide sequence and two regulatory regions on chr3), writes it into a
/// <see cref="MemoryStream"/> with a <see cref="TranscriptCacheWriter"/> and returns
/// the stream rewound to position 0.
/// </summary>
private static Stream GetCacheStream()
{
    // header
    var baseHeader   = new Header("test", 2, 3, Source.BothRefSeqAndEnsembl, 4, GenomeAssembly.GRCh38);
    var customHeader = new TranscriptCacheCustomHeader(1, 2);
    var cacheHeader  = new CacheHeader(baseHeader, customHeader);

    // lookup tables
    ITranscriptRegion[] transcriptRegions =
    {
        new TranscriptRegion(TranscriptRegionType.Exon,   1, 100, 199, 300, 399),
        new TranscriptRegion(TranscriptRegionType.Intron, 1, 200, 299, 399, 400),
        new TranscriptRegion(TranscriptRegionType.Exon,   2, 300, 399, 400, 499)
    };

    IInterval[] mirnas =
    {
        new Interval(100, 200),
        new Interval(300, 400)
    };

    string[] peptideSeqs = { "MASE*" };

    IGene[] genes =
    {
        new Gene(ChromosomeUtilities.Chr3, 100, 200, true, "TP53", 300, CompactId.Convert("7157"), CompactId.Convert("ENSG00000141510"))
    };

    // regulatory regions
    IRegulatoryRegion[] regulatoryRegions =
    {
        new RegulatoryRegion(ChromosomeUtilities.Chr3, 1200, 1300, CompactId.Convert("123"), RegulatoryRegionType.enhancer),
        new RegulatoryRegion(ChromosomeUtilities.Chr3, 1250, 1450, CompactId.Convert("456"), RegulatoryRegionType.enhancer)
    };
    var regulatoryRegionIntervalArrays = regulatoryRegions.ToIntervalArrays(3);

    // transcripts
    var transcripts              = GetTranscripts(ChromosomeUtilities.Chr3, genes, transcriptRegions, mirnas);
    var transcriptIntervalArrays = transcripts.ToIntervalArrays(3);

    var cacheData = new TranscriptCacheData(
        cacheHeader,
        genes,
        transcriptRegions,
        mirnas,
        peptideSeqs,
        transcriptIntervalArrays,
        regulatoryRegionIntervalArrays);

    var memoryStream = new MemoryStream();

    // The writer is disposed before the stream is rewound. The third argument is
    // presumably "leave open" (the stream is used after disposal) - TODO confirm.
    using (var cacheWriter = new TranscriptCacheWriter(memoryStream, cacheHeader, true))
    {
        cacheWriter.Write(cacheData);
    }

    memoryStream.Position = 0;
    return memoryStream;
}
/// <summary>
/// Loads the transcript cache for the configured input prefix and hands its transcript
/// interval arrays to a <see cref="GffCreator"/>, which writes through a
/// <see cref="GffWriter"/> opened on the configured output file.
/// </summary>
/// <returns><see cref="ExitCodes.Success"/> when the method runs to completion</returns>
private static ExitCodes ProgramExecution()
{
    Source source          = ParseVepCacheDirectoryMain.GetSource(_transcriptSource);
    string transcriptCache = CacheConstants.TranscriptPath(_inputPrefix);

    IDictionary<ushort, IChromosome> chromosomesByRefIndex =
        SequenceHelper.GetDictionaries(_compressedReferencePath).refIndexToChromosome;

    TranscriptCacheData cacheData     = TranscriptCacheHelper.GetCache(transcriptCache, chromosomesByRefIndex);
    IDictionary<IGene, int> geneToId  = InternalGenes.CreateDictionary(cacheData.Genes);

    using (var gffWriter = new GffWriter(GZipUtilities.GetStreamWriter(_outputFileName)))
    {
        var gffCreator = new GffCreator(gffWriter, geneToId, source);
        gffCreator.Create(cacheData.TranscriptIntervalArrays);
    }

    return ExitCodes.Success;
}
/// <summary>
/// Parses the scores file with a <see cref="ProteinConservationParser"/> and writes the
/// parsed items with a <see cref="ProteinConservationWriter"/>. The output file is named
/// "{version.Name}_{version.Version}" plus <c>ProteinConservationCommon.FileSuffix</c>
/// and is created inside the configured output directory.
/// </summary>
/// <returns><see cref="ExitCodes.Success"/> when the method runs to completion</returns>
private static ExitCodes ProgramExecution()
{
    using var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference));

    // we will use the transcript data to validate the protein sequence
    TranscriptCacheData transcriptData =
        AaConservationUtilities.GetTranscriptData(referenceProvider.RefIndexToChromosome, _transcriptCachePrefix);

    var version         = DataSourceVersionReader.GetSourceVersion(_scoresFile + ".version");
    string outFileName  = $"{version.Name}_{version.Version}";

    // read multi-alignments
    // NOTE(review): "transcriptGroups.txt" is a relative path, so it is created in the
    // current working directory rather than in _outputDirectory - confirm this is intended.
    using (var scoresStream = GZipUtilities.GetAppropriateReadStream(_scoresFile))
    using (var scoresParser = new ProteinConservationParser(scoresStream))
    using (var outputStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + ProteinConservationCommon.FileSuffix)))
    using (var transcriptGroupStream = FileUtilities.GetCreateStream("transcriptGroups.txt"))
    using (var conservationWriter = new ProteinConservationWriter(outputStream, transcriptGroupStream, transcriptData, version))
    {
        conservationWriter.Write(scoresParser.GetItems());
    }

    return ExitCodes.Success;
}
/// <summary>
/// Writes the annotations to the current database file: the header first, then the gene,
/// transcript region, miRNA and peptide sequence tables, then the regulatory region
/// interval arrays and finally the transcript interval arrays.
/// </summary>
/// <param name="cacheData">the cache data to serialize</param>
public void Write(TranscriptCacheData cacheData)
{
    _blockStream.WriteHeader(_header.Write);

    // lookup tables
    WriteItems(_writer, cacheData.Genes,             gene    => gene.Write(_writer));
    WriteItems(_writer, cacheData.TranscriptRegions, region  => region.Write(_writer));
    WriteItems(_writer, cacheData.Mirnas,            mirna   => mirna.Write(_writer));
    WriteItems(_writer, cacheData.PeptideSeqs,       peptide => _writer.WriteOptAscii(peptide));

    // indices into the tables above; each transcript's Write receives all four
    var geneIndices             = CreateIndex(cacheData.Genes,             new GeneComparer());
    var transcriptRegionIndices = CreateIndex(cacheData.TranscriptRegions, new TranscriptRegionComparer());
    var microRnaIndices         = CreateIndex(cacheData.Mirnas,            new IntervalComparer());
    var peptideIndices          = CreateIndex(cacheData.PeptideSeqs,       EqualityComparer<string>.Default);

    // interval arrays
    WriteIntervals(
        _writer,
        cacheData.RegulatoryRegionIntervalArrays,
        regulatoryRegion => regulatoryRegion.Write(_writer));

    WriteIntervals(
        _writer,
        cacheData.TranscriptIntervalArrays,
        transcript => transcript.Write(_writer, geneIndices, transcriptRegionIndices, microRnaIndices, peptideIndices));
}
/// <summary>
/// Builds an interval forest of gene symbols. Genes are grouped by chromosome index and,
/// within each chromosome, sorted by begin and then by end position.
/// </summary>
/// <param name="transcriptData">cache data whose <c>Genes</c> are placed in the forest</param>
/// <returns>
/// A forest backed by an array indexed by chromosome index. Slots for chromosome indices
/// without any gene stay null. When there are no genes at all, the backing array is empty.
/// </returns>
public static IntervalForest<string> GetGeneForest(TranscriptCacheData transcriptData)
{
    var intervalsByRefIndex = new Dictionary<ushort, List<Interval<string>>>();

    foreach (var gene in transcriptData.Genes)
    {
        var refIndex = gene.Chromosome.Index;

        // single lookup instead of ContainsKey followed by the indexer
        if (!intervalsByRefIndex.TryGetValue(refIndex, out var chromosomeIntervals))
        {
            chromosomeIntervals           = new List<Interval<string>>();
            intervalsByRefIndex[refIndex] = chromosomeIntervals;
        }

        chromosomeIntervals.Add(new Interval<string>(gene.Start, gene.End, gene.Symbol));
    }

    // Enumerable.Max throws InvalidOperationException on an empty sequence, so the
    // no-genes case is handled explicitly and yields an empty array.
    int numArrays = intervalsByRefIndex.Count == 0 ? 0 : intervalsByRefIndex.Keys.Max() + 1;
    var geneIntervalArrays = new IntervalArray<string>[numArrays];

    foreach (var (index, geneIntervals) in intervalsByRefIndex)
    {
        geneIntervalArrays[index] = new IntervalArray<string>(
            geneIntervals.OrderBy(x => x.Begin).ThenBy(x => x.End).ToArray());
    }

    return new IntervalForest<string>(geneIntervalArrays);
}
GetIdToSymbols(TranscriptCacheData transcriptData)
{
    // Builds two lookups from gene identifier (without version) to gene symbol:
    // one keyed by Entrez gene id and one keyed by Ensembl id. Genes whose id is null
    // or empty are skipped for that lookup. When several genes share an id, the last
    // one in transcriptData.Genes wins.
    //
    // The leftover debugging statement that printed "bug" to the console for Entrez
    // gene id 649330 has been removed.
    var entrezToHgnc  = new Dictionary<string, string>();
    var ensemblToHgnc = new Dictionary<string, string>();

    foreach (var gene in transcriptData.Genes)
    {
        var entrezId = gene.EntrezGeneId.WithoutVersion;
        if (!string.IsNullOrEmpty(entrezId))
        {
            entrezToHgnc[entrezId] = gene.Symbol;
        }

        var ensemblId = gene.EnsemblId.WithoutVersion;
        if (!string.IsNullOrEmpty(ensemblId))
        {
            ensemblToHgnc[ensemblId] = gene.Symbol;
        }
    }

    return (entrezToHgnc, ensemblToHgnc);
}
/// <summary>
/// Maps transcript ids (without version) that start with "ENST" to the symbol of the
/// transcript's gene, over every chromosome index known to the sequence provider.
/// </summary>
/// <param name="sequenceProvider">supplies the chromosome indices to scan</param>
/// <param name="transcriptData">cache data whose transcript interval forest is queried</param>
/// <returns>
/// A dictionary from transcript id to gene symbol. When an id occurs more than once,
/// the last transcript encountered wins.
/// </returns>
public static Dictionary<string, string> GetEnstToGeneSymbols(ISequenceProvider sequenceProvider, TranscriptCacheData transcriptData)
{
    var cache             = transcriptData.GetCache();
    var enstToGeneSymbols = new Dictionary<string, string>();

    foreach (var chromIndex in sequenceProvider.RefIndexToChromosome.Keys)
    {
        // query the whole chromosome
        var overlappingTranscripts = cache.TranscriptIntervalForest.GetAllOverlappingValues(chromIndex, 1, int.MaxValue);
        if (overlappingTranscripts == null)
        {
            continue;
        }

        foreach (var transcript in overlappingTranscripts)
        {
            var transcriptId = transcript.Id.WithoutVersion;

            // Identifier prefixes are compared ordinally. The single-argument StartsWith
            // is culture-sensitive, which is slower and depends on the current culture.
            if (transcriptId.StartsWith("ENST", System.StringComparison.Ordinal))
            {
                enstToGeneSymbols[transcriptId] = transcript.Gene.Symbol;
            }
        }
    }

    return enstToGeneSymbols;
}
/// <summary>
/// Collects, per chromosome index, the intervals of +/- SpliceFlankLength around the
/// internal exon boundaries of every non-predicted transcript. The start of a
/// transcript's first exon and the end of its last exon are not included.
/// </summary>
/// <param name="sequenceProvider">supplies the chromosome indices to scan</param>
/// <param name="transcriptData">cache data whose transcript interval forest is queried</param>
/// <returns>
/// A dictionary from chromosome index to its intervals, sorted by begin and then by end.
/// Chromosome indices for which the forest returns null get no entry.
/// </returns>
public static Dictionary<ushort, IntervalArray<byte>> GetSpliceIntervals(ISequenceProvider sequenceProvider, TranscriptCacheData transcriptData)
{
    var cache              = transcriptData.GetCache();
    var spliceIntervalDict = new Dictionary<ushort, IntervalArray<byte>>(sequenceProvider.RefIndexToChromosome.Count);

    foreach (var chromIndex in sequenceProvider.RefIndexToChromosome.Keys)
    {
        var overlappingTranscripts = cache.TranscriptIntervalForest.GetAllOverlappingValues(chromIndex, 1, int.MaxValue);
        if (overlappingTranscripts == null)
        {
            continue;
        }

        // allocated only once we know there is something to collect
        var spliceIntervals = new List<Interval<byte>>(8 * 1024);

        foreach (var transcript in overlappingTranscripts)
        {
            if (transcript.Id.IsPredictedTranscript())
            {
                continue;
            }

            int  countBeforeTranscript = spliceIntervals.Count;
            bool isFirstExon           = true;

            foreach (var transcriptRegion in transcript.TranscriptRegions)
            {
                if (transcriptRegion.Type != TranscriptRegionType.Exon)
                {
                    continue;
                }

                var firstSplicePosition  = transcriptRegion.Start;
                var secondSplicePosition = transcriptRegion.End;

                // the start of the first exon is not a splice site
                if (!isFirstExon)
                {
                    spliceIntervals.Add(new Interval<byte>(firstSplicePosition - SpliceFlankLength, firstSplicePosition + SpliceFlankLength, 0));
                }

                spliceIntervals.Add(new Interval<byte>(secondSplicePosition - SpliceFlankLength, secondSplicePosition + SpliceFlankLength, 0));
                isFirstExon = false;
            }

            // Remove the last added interval since this is the tail of the last exon,
            // which is not a splice site. Only do so when this transcript actually added
            // something; otherwise a transcript without exon regions would delete a
            // valid interval that belongs to a previous transcript.
            if (spliceIntervals.Count > countBeforeTranscript)
            {
                spliceIntervals.RemoveAt(spliceIntervals.Count - 1);
            }
        }

        spliceIntervalDict[chromIndex] = new IntervalArray<byte>(
            spliceIntervals.OrderBy(x => x.Begin).ThenBy(x => x.End).ToArray());
    }

    return spliceIntervalDict;
}
/// <summary>
/// Collects, per chromosome index, intervals of +/- SpliceFlankLength around the start
/// and the end of every exon region of every non-predicted transcript.
/// </summary>
/// <param name="sequenceProvider">supplies the chromosome indices to scan</param>
/// <param name="transcriptData">cache data whose transcript interval forest is queried</param>
/// <returns>
/// A dictionary from chromosome index to its intervals, sorted by begin and then by end.
/// Chromosome indices for which the forest returns null get no entry.
/// </returns>
public static Dictionary<ushort, IntervalArray<byte>> GetSpliceIntervals(ISequenceProvider sequenceProvider, TranscriptCacheData transcriptData)
{
    var transcriptCache = transcriptData.GetCache();
    var intervalsByRefIndex = new Dictionary<ushort, IntervalArray<byte>>(sequenceProvider.RefIndexToChromosome.Count);

    foreach (var refIndex in sequenceProvider.RefIndexToChromosome.Keys)
    {
        var exonBoundaryIntervals = new List<Interval<byte>>(8 * 1024);

        // query the whole chromosome
        var transcripts = transcriptCache.TranscriptIntervalForest.GetAllOverlappingValues(refIndex, 1, int.MaxValue);
        if (transcripts == null)
        {
            continue;
        }

        foreach (var transcript in transcripts)
        {
            if (transcript.Id.IsPredictedTranscript())
            {
                continue;
            }

            foreach (var region in transcript.TranscriptRegions)
            {
                if (region.Type == TranscriptRegionType.Exon)
                {
                    var exonStart = region.Start;
                    var exonEnd   = region.End;

                    exonBoundaryIntervals.Add(new Interval<byte>(exonStart - SpliceFlankLength, exonStart + SpliceFlankLength, 0));
                    exonBoundaryIntervals.Add(new Interval<byte>(exonEnd - SpliceFlankLength, exonEnd + SpliceFlankLength, 0));
                }
            }
        }

        var sortedIntervals = exonBoundaryIntervals
            .OrderBy(x => x.Begin)
            .ThenBy(x => x.End)
            .ToArray();

        intervalsByRefIndex[refIndex] = new IntervalArray<byte>(sortedIntervals);
    }

    return intervalsByRefIndex;
}
/// <summary>
/// Private constructor that stores the supplied cache data.
/// </summary>
/// <param name="cacheData">the cache data held by this staging object</param>
private TranscriptCacheStaging(TranscriptCacheData cacheData) => _cacheData = cacheData;