// Some transcripts have multiple locations in the genome and may have conflicting scores,
        // so we need to load them all, check for duplicates, and resolve them.


        /// <summary>
        /// Creates a writer from the supplied streams, transcript data, and data source version.
        /// </summary>
        /// <param name="stream">stream that is wrapped in a new <see cref="ExtendedBinaryWriter"/></param>
        /// <param name="groupStream">stream stored as the transcript group stream</param>
        /// <param name="transcriptData">transcript cache data stored on the writer</param>
        /// <param name="version">data source version stored on the writer</param>
        public ProteinConservationWriter(Stream stream, Stream groupStream, TranscriptCacheData transcriptData, DataSourceVersion version)
        {
            // plain references first, then the binary writer that wraps the output stream
            _version               = version;
            _transcriptCacheData   = transcriptData;
            _transcriptGroupStream = groupStream;
            _writer                = new ExtendedBinaryWriter(stream);
        }
Пример #2
0
        /// <summary>
        /// Builds a <see cref="TranscriptCacheStaging"/> from a header and the transcript and
        /// regulatory region interval arrays.
        /// </summary>
        /// <param name="header">header passed through to the cache data</param>
        /// <param name="transcriptIntervalArrays">transcript interval arrays; also the input to <c>GetUniqueData</c></param>
        /// <param name="regulatoryRegionIntervalArrays">regulatory region interval arrays passed through to the cache data</param>
        /// <returns>a staging object wrapping the newly created cache data</returns>
        public static TranscriptCacheStaging GetStaging(CacheHeader header,
                                                        IntervalArray<ITranscript>[] transcriptIntervalArrays,
                                                        IntervalArray<IRegulatoryRegion>[] regulatoryRegionIntervalArrays)
        {
            // genes, transcript regions, miRNAs, and peptide sequences come from GetUniqueData
            var unique = GetUniqueData(transcriptIntervalArrays);

            return new TranscriptCacheStaging(
                new TranscriptCacheData(header, unique.Genes, unique.TranscriptRegions, unique.Mirnas,
                                        unique.PeptideSeqs, transcriptIntervalArrays, regulatoryRegionIntervalArrays));
        }
Пример #3
0
        /// <summary>
        /// Populates the chromosome lookup, the expected header, and the expected cache data fields.
        /// </summary>
        public TranscriptCacheReaderTests()
        {
            var chr1 = new Chromosome("chr1", "1", 0);
            var chr2 = new Chromosome("chr2", "2", 1);
            var chr3 = new Chromosome("chr3", "3", 2);

            // key every chromosome by its own index
            var chromosomesByIndex = new Dictionary<ushort, IChromosome>();
            foreach (var chromosome in new[] { chr1, chr2, chr3 })
            {
                chromosomesByIndex[chromosome.Index] = chromosome;
            }

            _refIndexToChromosome = chromosomesByIndex;

            _expectedHeader = new CacheHeader(
                new Header("test", 2, 3, Source.BothRefSeqAndEnsembl, 4, GenomeAssembly.GRCh38),
                new TranscriptCacheCustomHeader(1, 2));

            var transcriptRegions = new ITranscriptRegion[]
            {
                new TranscriptRegion(TranscriptRegionType.Exon, 1, 100, 199, 300, 399),
                new TranscriptRegion(TranscriptRegionType.Intron, 1, 200, 299, 399, 400),
                new TranscriptRegion(TranscriptRegionType.Exon, 2, 300, 399, 400, 499)
            };

            var mirnas = new IInterval[]
            {
                new Interval(100, 200),
                new Interval(300, 400)
            };

            string[] peptideSeqs = { "MASE*" };

            var genes = new IGene[]
            {
                new Gene(chr3, 100, 200, true, "TP53", 300, CompactId.Convert("7157"),
                         CompactId.Convert("ENSG00000141510"))
            };

            // both regulatory regions and all transcripts are placed on chr3
            var regulatoryRegions = new IRegulatoryRegion[]
            {
                new RegulatoryRegion(chr3, 1200, 1300, CompactId.Convert("123"), RegulatoryRegionType.enhancer),
                new RegulatoryRegion(chr3, 1250, 1450, CompactId.Convert("456"), RegulatoryRegionType.enhancer)
            };
            var regulatoryRegionIntervalArrays = regulatoryRegions.ToIntervalArrays(3);

            var transcriptIntervalArrays = GetTranscripts(chr3, genes, transcriptRegions, mirnas).ToIntervalArrays(3);

            _expectedCacheData = new TranscriptCacheData(_expectedHeader, genes, transcriptRegions, mirnas, peptideSeqs,
                                                         transcriptIntervalArrays, regulatoryRegionIntervalArrays);
        }
Пример #4
0
        /// <summary>
        /// Builds a small transcript cache, writes it into a <see cref="MemoryStream"/> with a
        /// <see cref="TranscriptCacheWriter"/>, and returns the stream rewound to position 0.
        /// </summary>
        /// <returns>a memory stream containing the written cache, positioned at the start</returns>
        private static Stream GetCacheStream()
        {
            var header = new CacheHeader(
                new Header("test", 2, 3, Source.BothRefSeqAndEnsembl, 4, GenomeAssembly.GRCh38),
                new TranscriptCacheCustomHeader(1, 2));

            var transcriptRegions = new ITranscriptRegion[]
            {
                new TranscriptRegion(TranscriptRegionType.Exon, 1, 100, 199, 300, 399),
                new TranscriptRegion(TranscriptRegionType.Intron, 1, 200, 299, 399, 400),
                new TranscriptRegion(TranscriptRegionType.Exon, 2, 300, 399, 400, 499)
            };

            var mirnas = new IInterval[]
            {
                new Interval(100, 200),
                new Interval(300, 400)
            };

            string[] peptideSeqs = { "MASE*" };

            // the gene, both regulatory regions, and the transcripts all use chromosome 3
            var chr3 = ChromosomeUtilities.Chr3;

            var genes = new IGene[]
            {
                new Gene(chr3, 100, 200, true, "TP53", 300, CompactId.Convert("7157"),
                         CompactId.Convert("ENSG00000141510"))
            };

            var regulatoryRegions = new IRegulatoryRegion[]
            {
                new RegulatoryRegion(chr3, 1200, 1300, CompactId.Convert("123"), RegulatoryRegionType.enhancer),
                new RegulatoryRegion(chr3, 1250, 1450, CompactId.Convert("456"), RegulatoryRegionType.enhancer)
            };
            var regulatoryRegionIntervalArrays = regulatoryRegions.ToIntervalArrays(3);

            var transcriptIntervalArrays = GetTranscripts(chr3, genes, transcriptRegions, mirnas).ToIntervalArrays(3);

            var cacheData = new TranscriptCacheData(header, genes, transcriptRegions, mirnas, peptideSeqs,
                                                    transcriptIntervalArrays, regulatoryRegionIntervalArrays);

            var memoryStream = new MemoryStream();

            // The writer is disposed before the stream is rewound.
            // NOTE(review): the final 'true' argument is presumably a leave-open flag, since the stream is used afterwards - confirm.
            using (var cacheWriter = new TranscriptCacheWriter(memoryStream, header, true))
            {
                cacheWriter.Write(cacheData);
            }

            memoryStream.Position = 0;
            return memoryStream;
        }
Пример #5
0
        /// <summary>
        /// Loads the transcript cache for the configured input prefix and passes its transcript
        /// interval arrays to a <see cref="GffCreator"/> that writes to the configured output file.
        /// </summary>
        /// <returns><see cref="ExitCodes.Success"/> when the method runs to completion</returns>
        private static ExitCodes ProgramExecution()
        {
            Source source                = ParseVepCacheDirectoryMain.GetSource(_transcriptSource);
            string transcriptCachePath   = CacheConstants.TranscriptPath(_inputPrefix);

            IDictionary<ushort, IChromosome> chromosomesByRefIndex =
                SequenceHelper.GetDictionaries(_compressedReferencePath).refIndexToChromosome;

            TranscriptCacheData cacheData       = TranscriptCacheHelper.GetCache(transcriptCachePath, chromosomesByRefIndex);
            IDictionary<IGene, int> internalIds = InternalGenes.CreateDictionary(cacheData.Genes);

            // the GFF writer owns the output stream writer and is disposed when the block exits
            using (var gffWriter = new GffWriter(GZipUtilities.GetStreamWriter(_outputFileName)))
            {
                new GffCreator(gffWriter, internalIds, source).Create(cacheData.TranscriptIntervalArrays);
            }

            return ExitCodes.Success;
        }
Пример #6
0
        /// <summary>
        /// Parses the configured scores file and writes its items with a
        /// <see cref="ProteinConservationWriter"/> into the configured output directory.
        /// </summary>
        /// <returns><see cref="ExitCodes.Success"/> when the method runs to completion</returns>
        private static ExitCodes ProgramExecution()
        {
            using var referenceProvider = new ReferenceSequenceProvider(FileUtilities.GetReadStream(_compressedReference));

            // we will use the transcript data to validate the protein sequence
            TranscriptCacheData transcriptData =
                AaConservationUtilities.GetTranscriptData(referenceProvider.RefIndexToChromosome, _transcriptCachePrefix);

            // the output file name is derived from the name and version found in the ".version" companion file
            var    version     = DataSourceVersionReader.GetSourceVersion(_scoresFile + ".version");
            string outFileName = $"{version.Name}_{version.Version}";

            //read multi-alignments
            // using declarations dispose in reverse order of declaration when the method exits
            using var scoreStream = GZipUtilities.GetAppropriateReadStream(_scoresFile);
            using var parser      = new ProteinConservationParser(scoreStream);
            using var outStream   = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, outFileName + ProteinConservationCommon.FileSuffix));
            // NOTE(review): this path has no directory component, so it is presumably created relative to the working directory - confirm this is intended
            using var groupStream = FileUtilities.GetCreateStream("transcriptGroups.txt");
            using var writer      = new ProteinConservationWriter(outStream, groupStream, transcriptData, version);

            writer.Write(parser.GetItems());

            return ExitCodes.Success;
        }
Пример #7
0
        /// <summary>
        /// Writes the header and then the contents of the supplied cache data.
        /// </summary>
        /// <param name="cacheData">cache data whose genes, transcript regions, miRNAs, peptide sequences,
        /// regulatory region interval arrays, and transcript interval arrays are written, in that order</param>
        public void Write(TranscriptCacheData cacheData)
        {
            _blockStream.WriteHeader(_header.Write);

            // the shared item lists are written first
            WriteItems(_writer, cacheData.Genes, gene => gene.Write(_writer));
            WriteItems(_writer, cacheData.TranscriptRegions, region => region.Write(_writer));
            WriteItems(_writer, cacheData.Mirnas, mirna => mirna.Write(_writer));
            WriteItems(_writer, cacheData.PeptideSeqs, peptide => _writer.WriteOptAscii(peptide));

            // one index per shared list, each built with the comparer for its item type;
            // all four are handed to every transcript's Write call below
            var geneIndices             = CreateIndex(cacheData.Genes, new GeneComparer());
            var transcriptRegionIndices = CreateIndex(cacheData.TranscriptRegions, new TranscriptRegionComparer());
            var microRnaIndices         = CreateIndex(cacheData.Mirnas, new IntervalComparer());
            var peptideIndices          = CreateIndex(cacheData.PeptideSeqs, EqualityComparer<string>.Default);

            WriteIntervals(_writer, cacheData.RegulatoryRegionIntervalArrays, region => region.Write(_writer));
            WriteIntervals(_writer, cacheData.TranscriptIntervalArrays,
                           transcript => transcript.Write(_writer, geneIndices, transcriptRegionIndices, microRnaIndices, peptideIndices));
        }
Пример #8
0
        /// <summary>
        /// Builds an interval forest of gene symbols, with one interval array per chromosome index.
        /// </summary>
        /// <param name="transcriptData">cache data whose genes supply the chromosome index, start, end, and symbol</param>
        /// <returns>
        /// an interval forest whose backing array is indexed by chromosome index; entries for
        /// chromosome indices without genes are left null
        /// </returns>
        public static IntervalForest<string> GetGeneForest(TranscriptCacheData transcriptData)
        {
            var geneDictionary = new Dictionary<ushort, List<Interval<string>>>();

            foreach (var gene in transcriptData.Genes)
            {
                ushort chromIndex = gene.Chromosome.Index;

                // single lookup instead of ContainsKey followed by two indexer accesses
                if (!geneDictionary.TryGetValue(chromIndex, out var intervalsOnChromosome))
                {
                    intervalsOnChromosome      = new List<Interval<string>>();
                    geneDictionary[chromIndex] = intervalsOnChromosome;
                }

                intervalsOnChromosome.Add(new Interval<string>(gene.Start, gene.End, gene.Symbol));
            }

            // Max() throws InvalidOperationException on an empty sequence, so a cache without genes
            // gets a zero-length array instead.
            // NOTE(review): assumes IntervalForest accepts an empty array - confirm
            int arrayLength = geneDictionary.Count == 0 ? 0 : geneDictionary.Keys.Max() + 1;
            var geneIntervalArrays = new IntervalArray<string>[arrayLength];

            foreach (var (index, geneIntervals) in geneDictionary)
            {
                // intervals are sorted by begin, then by end, before being wrapped in an interval array
                geneIntervalArrays[index] = new IntervalArray<string>(geneIntervals.OrderBy(x => x.Begin).ThenBy(x => x.End).ToArray());
            }

            return new IntervalForest<string>(geneIntervalArrays);
        }
Пример #9
0
        GetIdToSymbols(TranscriptCacheData transcriptData)
        {
            // Builds two lookups from version-less gene ids to gene symbols: one keyed by
            // Entrez gene id and one keyed by Ensembl id. Genes with a null or empty id are
            // skipped for that lookup; a repeated id keeps the symbol of the last gene seen.
            var entrezToHgnc  = new Dictionary<string, string>();
            var ensemblToHgnc = new Dictionary<string, string>();

            foreach (var gene in transcriptData.Genes)
            {
                // the leftover debugging statement that printed "bug" for one hard-coded
                // Entrez id has been removed; it had no effect on the returned lookups
                string entrezId = gene.EntrezGeneId.WithoutVersion;
                if (!string.IsNullOrEmpty(entrezId))
                {
                    entrezToHgnc[entrezId] = gene.Symbol;
                }

                string ensemblId = gene.EnsemblId.WithoutVersion;
                if (!string.IsNullOrEmpty(ensemblId))
                {
                    ensemblToHgnc[ensemblId] = gene.Symbol;
                }
            }

            return(entrezToHgnc, ensemblToHgnc);
        }
Пример #10
0
        /// <summary>
        /// Maps version-less transcript ids that start with "ENST" to the symbol of the transcript's gene.
        /// </summary>
        /// <param name="sequenceProvider">supplies the chromosome indices that are scanned</param>
        /// <param name="transcriptData">cache data whose transcript interval forest is queried</param>
        /// <returns>
        /// a dictionary from transcript id (without version) to gene symbol; when an id occurs more
        /// than once, the symbol of the last transcript visited is kept
        /// </returns>
        public static Dictionary<string, string> GetEnstToGeneSymbols(ISequenceProvider sequenceProvider, TranscriptCacheData transcriptData)
        {
            var cache             = transcriptData.GetCache();
            var enstToGeneSymbols = new Dictionary<string, string>();

            foreach (var chromIndex in sequenceProvider.RefIndexToChromosome.Keys)
            {
                // query the range 1..int.MaxValue to fetch the transcripts on this chromosome
                var overlappingTranscripts =
                    cache.TranscriptIntervalForest.GetAllOverlappingValues(chromIndex, 1, int.MaxValue);

                if (overlappingTranscripts == null)
                {
                    continue;
                }

                foreach (var transcript in overlappingTranscripts)
                {
                    string transcriptId = transcript.Id.WithoutVersion;

                    // identifiers are compared ordinally; the parameterless StartsWith overload is culture-sensitive
                    if (transcriptId.StartsWith("ENST", System.StringComparison.Ordinal))
                    {
                        enstToGeneSymbols[transcriptId] = transcript.Gene.Symbol;
                    }
                }
            }

            return enstToGeneSymbols;
        }
Пример #11
0
        /// <summary>
        /// Collects, per chromosome index, the flanking intervals around exon boundaries of
        /// non-predicted transcripts. For each transcript, the start boundary of the first exon
        /// visited and the end boundary of the last exon visited are left out.
        /// </summary>
        /// <param name="sequenceProvider">supplies the chromosome indices that are scanned</param>
        /// <param name="transcriptData">cache data whose transcript interval forest is queried</param>
        /// <returns>
        /// a dictionary from chromosome index to an interval array sorted by begin, then end;
        /// chromosomes for which the forest returns null have no entry
        /// </returns>
        public static Dictionary<ushort, IntervalArray<byte>> GetSpliceIntervals(ISequenceProvider sequenceProvider, TranscriptCacheData transcriptData)
        {
            var cache = transcriptData.GetCache();

            var spliceIntervalDict = new Dictionary<ushort, IntervalArray<byte>>(sequenceProvider.RefIndexToChromosome.Count);

            foreach (var chromIndex in sequenceProvider.RefIndexToChromosome.Keys)
            {
                var overlappingTranscripts =
                    cache.TranscriptIntervalForest.GetAllOverlappingValues(chromIndex, 1, int.MaxValue);

                if (overlappingTranscripts == null)
                {
                    continue;
                }

                // allocated only after the null check so skipped chromosomes do not pay for the buffer
                var spliceIntervals = new List<Interval<byte>>(8 * 1024);

                foreach (var transcript in overlappingTranscripts)
                {
                    if (transcript.Id.IsPredictedTranscript())
                    {
                        continue;
                    }

                    // remember where this transcript's intervals begin so the trailing-interval
                    // removal below can never touch an interval added by an earlier transcript
                    int countBeforeTranscript = spliceIntervals.Count;
                    bool isFirstExon          = true;

                    // NOTE(review): assumes TranscriptRegions are ordered so that the first and last
                    // exons visited are the transcript's terminal exons - confirm
                    foreach (var transcriptRegion in transcript.TranscriptRegions)
                    {
                        if (transcriptRegion.Type != TranscriptRegionType.Exon)
                        {
                            continue;
                        }

                        var firstSplicePosition  = transcriptRegion.Start;
                        var secondSplicePosition = transcriptRegion.End;

                        var firstInterval  = new Interval<byte>(firstSplicePosition - SpliceFlankLength, firstSplicePosition + SpliceFlankLength, 0);
                        var secondInterval = new Interval<byte>(secondSplicePosition - SpliceFlankLength, secondSplicePosition + SpliceFlankLength, 0);

                        // the start of the first exon is skipped
                        if (!isFirstExon)
                        {
                            spliceIntervals.Add(firstInterval);
                        }

                        spliceIntervals.Add(secondInterval);
                        isFirstExon = false;
                    }

                    // remove the last added interval since this is the tail of the last exon, which is
                    // not a splice site; only done when this transcript actually contributed intervals
                    if (spliceIntervals.Count > countBeforeTranscript)
                    {
                        spliceIntervals.RemoveAt(spliceIntervals.Count - 1);
                    }
                }

                spliceIntervalDict[chromIndex] = new IntervalArray<byte>(spliceIntervals.OrderBy(x => x.Begin).ThenBy(x => x.End).ToArray());
            }

            return spliceIntervalDict;
        }
Пример #12
0
        /// <summary>
        /// Collects, per chromosome index, a flanking interval around the start and the end of
        /// every exon of every non-predicted transcript.
        /// </summary>
        /// <param name="sequenceProvider">supplies the chromosome indices that are scanned</param>
        /// <param name="transcriptData">cache data whose transcript interval forest is queried</param>
        /// <returns>
        /// a dictionary from chromosome index to an interval array sorted by begin, then end;
        /// chromosomes for which the forest returns null have no entry
        /// </returns>
        public static Dictionary<ushort, IntervalArray<byte>> GetSpliceIntervals(ISequenceProvider sequenceProvider, TranscriptCacheData transcriptData)
        {
            var cache = transcriptData.GetCache();

            var intervalsByChromosome = new Dictionary<ushort, IntervalArray<byte>>(sequenceProvider.RefIndexToChromosome.Count);

            foreach (var chromIndex in sequenceProvider.RefIndexToChromosome.Keys)
            {
                var flanks      = new List<Interval<byte>>(8 * 1024);
                var transcripts = cache.TranscriptIntervalForest.GetAllOverlappingValues(chromIndex, 1, int.MaxValue);

                if (transcripts == null)
                {
                    continue;
                }

                foreach (var transcript in transcripts)
                {
                    if (!transcript.Id.IsPredictedTranscript())
                    {
                        foreach (var region in transcript.TranscriptRegions)
                        {
                            if (region.Type == TranscriptRegionType.Exon)
                            {
                                var exonStart = region.Start;
                                var exonEnd   = region.End;

                                // each exon contributes one flank around its start and one around its end
                                var startFlank = new Interval<byte>(exonStart - SpliceFlankLength, exonStart + SpliceFlankLength, 0);
                                var endFlank   = new Interval<byte>(exonEnd - SpliceFlankLength, exonEnd + SpliceFlankLength, 0);

                                flanks.Add(startFlank);
                                flanks.Add(endFlank);
                            }
                        }
                    }
                }

                // sort by begin, then by end, before building the interval array
                var sortedFlanks = (from flank in flanks
                                    orderby flank.Begin, flank.End
                                    select flank).ToArray();

                intervalsByChromosome[chromIndex] = new IntervalArray<byte>(sortedFlanks);
            }

            return intervalsByChromosome;
        }
Пример #13
0
 /// <summary>
 /// Creates a staging object that holds the supplied cache data.
 /// </summary>
 /// <param name="cacheData">cache data stored on the instance</param>
 private TranscriptCacheStaging(TranscriptCacheData cacheData) => _cacheData = cacheData;