Esempio n. 1
0
        /// <summary>
        /// Writes a log entry for every transcript whose exons and cDNA maps disagree,
        /// either in count or (when the counts match) according to DiffExonsAndCdnaMaps.
        /// The method is purely diagnostic: the list and its transcripts are not modified.
        /// </summary>
        /// <param name="transcripts">the transcripts to inspect</param>
        /// <param name="logger">receives one entry per detected inconsistency</param>
        /// <param name="transcriptId">the ID used to label each log entry</param>
        /// <returns>the same list instance that was passed in</returns>
        public static List<MutableTranscript> InvestigateInconsistentCdnaMaps(this List<MutableTranscript> transcripts,
            TranscriptMergerLogger logger, string transcriptId)
        {
            for (var position = 0; position < transcripts.Count; position++)
            {
                var    current = transcripts[position];
                string strand  = current.Gene.OnReverseStrand ? "R" : "F";

                int numExons    = current.Exons.Length;
                int numCdnaMaps = current.CdnaMaps.Length;

                if (numExons != numCdnaMaps)
                {
                    logger.Log(transcriptId, $"Found different exon & cDNA maps counts ({numExons} vs {numCdnaMaps}) (index: {position}, {strand})");
                }
                else if (DiffExonsAndCdnaMaps(current.Exons, current.CdnaMaps))
                {
                    // the coordinate comparison is only attempted when both arrays have the same length
                    logger.Log(transcriptId, $"Found different start/end coordinates between exons & cDNA maps. (index: {position}, {strand})");
                }
            }

            return transcripts;
        }
Esempio n. 2
0
        /// <summary>
        /// Gives every transcript the same gene symbol source when the group only
        /// disagrees because some entries carry GeneSymbolSource.Unknown.
        /// </summary>
        /// <param name="transcripts">the candidate transcripts; their genes are updated in place</param>
        /// <param name="logger">receives an entry when the source is normalized</param>
        /// <returns>
        /// the original list when there is a single transcript or a single source,
        /// otherwise the result of Unique() on the normalized transcripts
        /// </returns>
        /// <exception cref="NotImplementedException">
        /// thrown when, after discarding Unknown, there is not exactly one source left
        /// </exception>
        public static List<MutableTranscript> FixGeneSymbolSource(this List<MutableTranscript> transcripts,
            TranscriptMergerLogger logger)
        {
            if (transcripts.Count == 1)
            {
                return transcripts;
            }

            var distinctSources = transcripts.GetSet(x => x.Gene.SymbolSource);

            if (distinctSources.Count == 1)
            {
                return transcripts;
            }

            // Unknown never competes with a real source; removing an absent value is a no-op
            distinctSources.Remove(GeneSymbolSource.Unknown);

            if (distinctSources.Count != 1)
            {
                throw new NotImplementedException("Cannot handle multiple gene symbol sources at this time");
            }

            var chosenSource = distinctSources.First();

            foreach (var candidate in transcripts)
            {
                candidate.Gene.SymbolSource = chosenSource;
            }

            logger.Log(transcripts[0].Id, "Normalized gene symbol source");
            return transcripts.Unique();
        }
Esempio n. 3
0
        /// <summary>
        /// Groups the transcripts by a composite key made of ID, start and end, collapses
        /// each group into a single transcript, and returns the survivors sorted by start
        /// and then by end.
        /// </summary>
        /// <param name="logger">receives the decisions taken while merging each group</param>
        /// <param name="transcripts">the raw transcripts to merge</param>
        /// <param name="idToGenbankEntry">GenBank entries keyed by transcript ID, handed to the per-group merge</param>
        /// <returns>one transcript per distinct ID/start/end combination</returns>
        public static List<MutableTranscript> Merge(TranscriptMergerLogger logger, IEnumerable<MutableTranscript> transcripts,
            Dictionary<string, GenbankEntry> idToGenbankEntry)
        {
            var groups = transcripts.GetMultiValueDict(x => $"{x.Id}|{x.Start}|{x.End}");
            var merged = new List<MutableTranscript>();

            foreach (var group in groups)
            {
                merged.Add(Merge(logger, group.Value, idToGenbankEntry));
            }

            return merged.OrderBy(x => x.Start).ThenBy(x => x.End).ToList();
        }
Esempio n. 4
0
        /// <summary>
        /// Picks a gene symbol without any outside evidence and assigns it to every
        /// transcript: the first non-empty symbol that does not start with "LOC" wins,
        /// otherwise the first entry of <paramref name="symbols"/> is used.
        /// </summary>
        /// <param name="transcripts">the candidate transcripts; their genes are updated in place</param>
        /// <param name="logger">receives an entry describing the normalization</param>
        /// <param name="symbols">the candidate gene symbols; must contain at least one entry</param>
        /// <returns>the result of Unique() on the normalized transcripts</returns>
        private static List<MutableTranscript> UnsupervisedFixGeneSymbols(this IReadOnlyList<MutableTranscript> transcripts,
            TranscriptMergerLogger logger, List<string> symbols)
        {
            // Gene symbols are identifiers, so the "LOC" prefix is matched ordinally instead of
            // with the culture-sensitive default of string.StartsWith(string). Find stops at the
            // first match and yields null when every symbol is empty or LOC-prefixed.
            string symbol = symbols.Find(x => !string.IsNullOrEmpty(x) && !x.StartsWith("LOC", StringComparison.Ordinal))
                            ?? symbols[0];

            foreach (var transcript in transcripts)
            {
                transcript.Gene.Symbol = symbol;
            }

            logger.Log(transcripts[0].Id, "Normalized gene symbol (unsupervised)");
            return transcripts.Unique();
        }
Esempio n. 5
0
        /// <summary>
        /// Drops the transcripts whose BAM edit status is "failed", unless that would
        /// leave nothing behind.
        /// </summary>
        /// <param name="transcripts">the candidate transcripts; the list itself is not modified</param>
        /// <param name="logger">receives an entry only when at least one transcript was dropped</param>
        /// <returns>
        /// the original list when it holds a single transcript or when every transcript
        /// failed, otherwise the result of Unique() on the remaining transcripts
        /// </returns>
        public static List<MutableTranscript> RemoveFailedTranscripts(
            this List<MutableTranscript> transcripts, TranscriptMergerLogger logger)
        {
            if (transcripts.Count == 1)
            {
                return transcripts;
            }

            var filteredTranscripts = transcripts.Where(transcript => transcript.BamEditStatus != "failed").ToList();

            // if every transcript failed, keep them all rather than returning an empty list
            if (filteredTranscripts.Count == 0)
            {
                return transcripts;
            }

            // only report the filter when it actually removed something; previously the
            // entry was written for every multi-transcript group, even with no failures
            if (filteredTranscripts.Count != transcripts.Count)
            {
                logger.Log(transcripts[0].Id, "Filtered transcripts with failed BAM status.");
            }

            return filteredTranscripts.Unique();
        }
Esempio n. 6
0
        /// <summary>
        /// Keeps the transcripts that either carry RNA edits or have a BAM edit status
        /// of "ok", unless that would leave nothing behind.
        /// </summary>
        /// <param name="transcripts">the candidate transcripts; the list itself is not modified</param>
        /// <param name="logger">receives an entry only when at least one transcript was dropped</param>
        /// <returns>
        /// the original list when it holds a single transcript or when no transcript
        /// qualifies, otherwise the result of Unique() on the qualifying transcripts
        /// </returns>
        public static List<MutableTranscript> ChooseEditedTranscripts(
            this List<MutableTranscript> transcripts, TranscriptMergerLogger logger)
        {
            if (transcripts.Count == 1)
            {
                return transcripts;
            }

            var filteredTranscripts = transcripts.Where(transcript => transcript.RnaEdits != null || transcript.BamEditStatus == "ok").ToList();

            // if nothing qualifies, keep all candidates rather than returning an empty list
            if (filteredTranscripts.Count == 0)
            {
                return transcripts;
            }

            // only report the filter when it actually removed something; previously the
            // entry was written even when every transcript qualified
            if (filteredTranscripts.Count != transcripts.Count)
            {
                logger.Log(transcripts[0].Id, "Filtered transcripts without RNA edits or BAM edit status");
            }

            return filteredTranscripts.Unique();
        }
Esempio n. 7
0
        /// <summary>
        /// Applies hand-curated tie-breakers for a small set of transcript IDs that the
        /// generic rules cannot resolve. Any other ID leaves the list untouched.
        /// </summary>
        /// <param name="transcripts">the candidate transcripts; the list itself is not modified</param>
        /// <param name="logger">receives an entry naming the rule that was applied</param>
        /// <param name="transcriptId">the transcript ID that selects the rule</param>
        /// <returns>
        /// the original list when it holds a single transcript, when no rule exists for
        /// the ID, or when the rule matches nothing; otherwise the result of Unique() on
        /// the matching transcripts
        /// </returns>
        public static List<MutableTranscript> PickSpecificTranscript(
            this List<MutableTranscript> transcripts, TranscriptMergerLogger logger, string transcriptId)
        {
            if (transcripts.Count == 1)
            {
                return transcripts;
            }

            const int cdnaMapIndex      = 9;
            const int expectedCdnaStart = 25419007;
            const int expectedNumMaps   = 26;

            List<MutableTranscript> filteredTranscripts;
            string logMessage;

            switch (transcriptId)
            {
            case "NM_001005786":
                // a candidate with too few cDNA maps cannot match, so skip it instead of
                // letting the indexer throw IndexOutOfRangeException
                filteredTranscripts = transcripts
                    .Where(transcript => transcript.CdnaMaps.Length > cdnaMapIndex &&
                                         transcript.CdnaMaps[cdnaMapIndex].Start == expectedCdnaStart)
                    .ToList();
                logMessage = $"Filtered on exon 9 start: {transcriptId}";
                break;

            case "NM_001278597":
            case "NM_001278596":
                filteredTranscripts = transcripts.Where(transcript => transcript.CdnaMaps.Length == expectedNumMaps).ToList();
                logMessage          = $"Filtered on exon count (26): {transcriptId}";
                break;

            case "NM_016152":
                // a candidate without exons cannot match; guard the index for the same reason as above
                filteredTranscripts = transcripts
                    .Where(transcript => transcript.Exons.Length > 0 && transcript.Exons[0].Phase == 0)
                    .ToList();
                logMessage = $"Filtered on exon phase (0): {transcriptId}";
                break;

            default:
                return transcripts;
            }

            // a rule that matches nothing must not wipe out all candidates
            if (filteredTranscripts.Count == 0)
            {
                return transcripts;
            }

            logger.Log(transcriptId, logMessage);
            return filteredTranscripts.Unique();
        }
Esempio n. 8
0
        /// <summary>
        /// Keeps only the transcripts that carry the highest version number found in
        /// the list. Note that the supplied list is modified in place (RemoveAll) before
        /// the result is returned.
        /// </summary>
        /// <param name="transcripts">the candidate transcripts; entries with older versions are removed from this list</param>
        /// <param name="logger">receives an entry when older versions were removed</param>
        /// <returns>
        /// the original list when it holds a single transcript or a single version,
        /// otherwise the result of Unique() on the pruned list
        /// </returns>
        public static List<MutableTranscript> RemoveTranscriptsWithLowestVersion(
            this List<MutableTranscript> transcripts, TranscriptMergerLogger logger)
        {
            if (transcripts.Count == 1)
            {
                return transcripts;
            }

            var transcriptsByVersion = transcripts.GetMultiValueDict(x => x.Version);

            if (transcriptsByVersion.Count == 1)
            {
                return transcripts;
            }

            byte newestVersion = transcriptsByVersion.Keys.Max();

            // nothing can exceed the maximum, so "older than newest" is the same as "not newest"
            transcripts.RemoveAll(x => x.Version < newestVersion);

            logger.Log(transcripts[0].Id, "Filtered transcripts with lower versions");
            return transcripts.Unique();
        }
Esempio n. 9
0
        /// <summary>
        /// When the candidates disagree on the cDNA end of their coding region, keeps
        /// the ones whose value equals the coding region end of the GenBank entry.
        /// </summary>
        /// <param name="transcripts">the candidate transcripts; the list itself is not modified</param>
        /// <param name="logger">receives an entry when candidates were filtered</param>
        /// <param name="idToGenbankEntry">GenBank entries keyed by transcript ID; may be null</param>
        /// <param name="transcriptId">the ID used to look up the GenBank entry</param>
        /// <returns>
        /// the original list when no decision can be made (single transcript, no GenBank
        /// entry, all candidates agree, or none matches GenBank); otherwise the result of
        /// Unique() on the matching transcripts
        /// </returns>
        public static List<MutableTranscript> FixCodingRegionCdnaEnd(this List<MutableTranscript> transcripts,
            TranscriptMergerLogger logger, IReadOnlyDictionary<string, GenbankEntry> idToGenbankEntry, string transcriptId)
        {
            if (transcripts.Count == 1 || idToGenbankEntry == null)
            {
                return transcripts;
            }

            if (!idToGenbankEntry.TryGetValue(transcriptId, out var reference))
            {
                return transcripts;
            }

            var transcriptsByCdnaEnd = transcripts.GetMultiValueDict(x => x.CodingRegion.CdnaEnd);

            // either the candidates already agree, or none of them matches the GenBank value
            if (transcriptsByCdnaEnd.Count == 1 ||
                !transcriptsByCdnaEnd.TryGetValue(reference.CodingRegion.End, out var matchingTranscripts))
            {
                return transcripts;
            }

            logger.Log(transcripts[0].Id, "Filtered transcripts by coding region cDNA end");
            return matchingTranscripts.Unique();
        }
Esempio n. 10
0
        /// <summary>
        /// Resolves disagreeing gene IDs without outside evidence: whichever gene ID
        /// enumerates first from the set of distinct IDs is assigned to every transcript.
        /// </summary>
        /// <param name="transcripts">the candidate transcripts; their genes are updated in place</param>
        /// <param name="logger">receives an entry when the gene ID is normalized</param>
        /// <returns>
        /// the original list when it holds a single transcript or a single gene ID,
        /// otherwise the result of Unique() on the normalized transcripts
        /// </returns>
        public static List<MutableTranscript> UnsupervisedFixGeneId(this List<MutableTranscript> transcripts,
            TranscriptMergerLogger logger)
        {
            if (transcripts.Count == 1)
            {
                return transcripts;
            }

            var distinctGeneIds = transcripts.GetSet(x => x.Gene.GeneId).ToList();

            if (distinctGeneIds.Count == 1)
            {
                return transcripts;
            }

            string chosenGeneId = distinctGeneIds[0];

            foreach (var candidate in transcripts)
            {
                candidate.Gene.GeneId = chosenGeneId;
            }

            logger.Log(transcripts[0].Id, "Normalized gene ID (unsupervised)");
            return transcripts.Unique();
        }
Esempio n. 11
0
        /// <summary>
        /// Reduces a group of transcripts that share the same key to a single transcript
        /// by running the filters and normalizers below in a fixed order.
        /// </summary>
        /// <param name="logger">receives the decisions taken by each step</param>
        /// <param name="transcripts">the transcripts in the group; must contain at least one entry</param>
        /// <param name="idToGenbankEntry">GenBank entries keyed by transcript ID, used by the supervised steps</param>
        /// <returns>the only transcript of the group, or the single survivor of the pipeline</returns>
        /// <exception cref="NotImplementedException">
        /// thrown when the pipeline does not end with exactly one transcript
        /// </exception>
        private static MutableTranscript Merge(TranscriptMergerLogger logger, IReadOnlyList<MutableTranscript> transcripts,
            Dictionary<string, GenbankEntry> idToGenbankEntry)
        {
            string transcriptId = transcripts[0].Id;

            // the exon/cDNA consistency check is logged for every group, including singletons
            var inspected = transcripts.Unique().InvestigateInconsistentCdnaMaps(logger, transcriptId);

            if (transcripts.Count == 1)
            {
                return transcripts[0];
            }

            // the order matters: filters run first, then the supervised fixes, and the
            // unsupervised and hand-curated tie-breakers come last
            var survivors = inspected
                .RemoveFailedTranscripts(logger)
                .ChooseEditedTranscripts(logger)
                .RemoveTranscriptsWithLowestVersion(logger)
                .FixCodingRegionCdnaStart(logger, idToGenbankEntry, transcriptId)
                .FixCodingRegionCdnaEnd(logger, idToGenbankEntry, transcriptId)
                .FixGeneSymbolSource(logger)
                .FixBioType(logger)
                .FixGeneId(logger, idToGenbankEntry, transcriptId)
                .FixCanonical(logger)
                .FixHgncId(logger)
                .FixGeneStart(logger)
                .FixGeneEnd(logger)
                .FixGeneSymbols(logger, idToGenbankEntry, transcriptId)
                .UnsupervisedFixGeneId(logger)
                .PickSpecificTranscript(logger, transcriptId);

            if (survivors.Count != 1)
            {
                throw new NotImplementedException($"Could not merge down to one transcript: {survivors.Count} transcripts ({transcriptId})");
            }

            return survivors[0];
        }
        /// <summary>
        /// Converts the VEP dump directories into the intermediate output files. For each
        /// reference sequence it parses the matching VEP directory, merges the transcripts
        /// and regulatory regions, and writes them together with the SIFT and PolyPhen
        /// predictions. Reference sequences without a VEP directory only get empty
        /// prediction entries.
        /// </summary>
        /// <returns>ExitCodes.Success once every reference sequence has been visited</returns>
        private static ExitCodes ProgramExecution()
        {
            var source           = GetSource(_transcriptSource);
            var referenceReader  = new CompressedSequenceReader(FileUtilities.GetReadStream(_inputReferencePath));
            var rootDirectory    = new VepRootDirectory(referenceReader.RefNameToChromosome);
            var refIndexToVepDir = rootDirectory.GetRefIndexToVepDir(_inputVepDirectory);

            var  assembly     = GenomeAssemblyHelper.Convert(_genomeAssembly);
            long releaseTicks = DateTime.Parse(_vepReleaseDate).Ticks;
            var  idToGenbank  = GetIdToGenbank(assembly, source);

            // =========================
            // create the pre-cache file
            // =========================

            // every output file shares the same header
            int numRefSeqs = referenceReader.NumRefSeqs;
            var header     = new IntermediateIoHeader(_vepVersion, releaseTicks, source, assembly, numRefSeqs);

            string siftPath       = _outputStub + ".sift.gz";
            string polyphenPath   = _outputStub + ".polyphen.gz";
            string transcriptPath = _outputStub + ".transcripts.gz";
            string regulatoryPath = _outputStub + ".regulatory.gz";

            using (var mergeLogger = new TranscriptMergerLogger(FileUtilities.GetCreateStream(_outputStub + ".merge_transcripts.log")))
            using (var siftWriter = new PredictionWriter(GZipUtilities.GetStreamWriter(siftPath), header, IntermediateIoCommon.FileType.Sift))
            using (var polyphenWriter = new PredictionWriter(GZipUtilities.GetStreamWriter(polyphenPath), header, IntermediateIoCommon.FileType.Polyphen))
            using (var transcriptWriter = new MutableTranscriptWriter(GZipUtilities.GetStreamWriter(transcriptPath), header))
            using (var regulatoryWriter = new RegulatoryRegionWriter(GZipUtilities.GetStreamWriter(regulatoryPath), header))
            {
                var parser        = new VepCacheParser(source);
                var noPredictions = new Dictionary<string, List<int>>();

                // process each reference sequence in index order
                for (ushort refIndex = 0; refIndex < numRefSeqs; refIndex++)
                {
                    var chromosome = referenceReader.RefIndexToChromosome[refIndex];

                    if (!refIndexToVepDir.TryGetValue(refIndex, out string vepSubDir))
                    {
                        // no VEP data for this reference sequence: still emit empty prediction entries
                        siftWriter.Write(chromosome, noPredictions);
                        polyphenWriter.Write(chromosome, noPredictions);
                        continue;
                    }

                    Console.WriteLine("Parsing reference sequence [{0}]:", chromosome.UcscName);

                    var parsed            = parser.ParseDumpDirectory(chromosome, vepSubDir);
                    var mergedTranscripts = TranscriptMerger.Merge(mergeLogger, parsed.Transcripts, idToGenbank);
                    var mergedRegulatory  = RegulatoryRegionMerger.Merge(parsed.RegulatoryRegions);

                    int numRaw    = parsed.Transcripts.Count;
                    int numMerged = mergedTranscripts.Count;
                    Console.WriteLine($"- # merged transcripts: {numMerged}, # total transcripts: {numRaw}");

                    WriteTranscripts(transcriptWriter, mergedTranscripts);
                    WriteRegulatoryRegions(regulatoryWriter, mergedRegulatory);
                    WritePredictions(siftWriter, mergedTranscripts, x => x.SiftData, chromosome);
                    WritePredictions(polyphenWriter, mergedTranscripts, x => x.PolyphenData, chromosome);
                }
            }

            Console.WriteLine("\n{0} directories processed.", refIndexToVepDir.Count);

            return ExitCodes.Success;
        }
Esempio n. 13
0
        /// <summary>
        /// Reconciles the biotype when the candidates carry exactly two distinct
        /// biotypes and GetDesiredBioType can name a preferred one.
        /// </summary>
        /// <param name="transcripts">the candidate transcripts; their biotype is updated in place</param>
        /// <param name="logger">receives an entry when the biotype is normalized</param>
        /// <returns>
        /// the original list when it holds a single transcript, when the number of
        /// distinct biotypes is not two, or when GetDesiredBioType yields BioType.other;
        /// otherwise the result of Unique() on the normalized transcripts
        /// </returns>
        public static List<MutableTranscript> FixBioType(this List<MutableTranscript> transcripts, TranscriptMergerLogger logger)
        {
            if (transcripts.Count == 1)
            {
                return transcripts;
            }

            var distinctBioTypes = transcripts.GetSet(x => x.BioType);

            // only a two-way disagreement is handled here
            if (distinctBioTypes.Count != 2)
            {
                return transcripts;
            }

            var preferred = GetDesiredBioType(distinctBioTypes);

            // BioType.other is treated as "no preference"
            if (preferred == BioType.other)
            {
                return transcripts;
            }

            foreach (var candidate in transcripts)
            {
                candidate.BioType = preferred;
            }

            logger.Log(transcripts[0].Id, "Normalized biotype");
            return transcripts.Unique();
        }
Esempio n. 14
0
 /// <summary>
 /// Writes one tab-separated line to the logger: the transcript ID followed by the description.
 /// </summary>
 /// <param name="logger">the merge logger to write to</param>
 /// <param name="transcriptId">the transcript the entry refers to</param>
 /// <param name="description">what happened to that transcript</param>
 private static void Log(this TranscriptMergerLogger logger, string transcriptId, string description)
 {
     logger.WriteLine($"{transcriptId}\t{description}");
 }
Esempio n. 15
0
        /// <summary>
        /// When the candidates disagree on the gene ID, assigns the gene ID of the
        /// GenBank entry to every transcript.
        /// </summary>
        /// <param name="transcripts">the candidate transcripts; their genes are updated in place</param>
        /// <param name="logger">receives an entry when the gene ID is normalized</param>
        /// <param name="idToGenbankEntry">GenBank entries keyed by transcript ID; may be null</param>
        /// <param name="transcriptId">the ID used to look up the GenBank entry</param>
        /// <returns>
        /// the original list when it holds a single transcript, when no GenBank entry is
        /// available, or when the candidates already agree; otherwise the result of
        /// Unique() on the normalized transcripts
        /// </returns>
        /// <exception cref="InvalidDataException">
        /// thrown when none of the candidates carries the GenBank gene ID
        /// </exception>
        public static List<MutableTranscript> FixGeneId(this List<MutableTranscript> transcripts, TranscriptMergerLogger logger,
            Dictionary<string, GenbankEntry> idToGenbankEntry, string transcriptId)
        {
            if (transcripts.Count == 1 || idToGenbankEntry == null)
            {
                return transcripts;
            }

            if (!idToGenbankEntry.TryGetValue(transcriptId, out var reference))
            {
                return transcripts;
            }

            var distinctGeneIds = transcripts.GetSet(x => x.Gene.GeneId);

            if (distinctGeneIds.Count == 1)
            {
                return transcripts;
            }

            var referenceGeneId = reference.GeneId;

            // GenBank may only arbitrate between values that some candidate already has
            if (!distinctGeneIds.Contains(referenceGeneId))
            {
                throw new InvalidDataException($"Could not find the Genbank gene ID ({referenceGeneId}) within the transcripts.");
            }

            foreach (var candidate in transcripts)
            {
                candidate.Gene.GeneId = referenceGeneId;
            }

            logger.Log(transcripts[0].Id, "Normalized gene ID");
            return transcripts.Unique();
        }
Esempio n. 16
0
        /// <summary>
        /// When the candidates disagree on the gene end but share one transcript end,
        /// assigns to every gene the gene end that GetClosest selects for that
        /// transcript end.
        /// </summary>
        /// <param name="transcripts">the candidate transcripts; their genes are updated in place</param>
        /// <param name="logger">receives an entry when the gene end is normalized</param>
        /// <returns>
        /// the original list when it holds a single transcript, when the gene ends
        /// already agree, or when the transcript ends differ; otherwise the result of
        /// Unique() on the normalized transcripts
        /// </returns>
        public static List<MutableTranscript> FixGeneEnd(this List<MutableTranscript> transcripts, TranscriptMergerLogger logger)
        {
            if (transcripts.Count == 1)
            {
                return transcripts;
            }

            var distinctGeneEnds = transcripts.GetSet(x => x.Gene.End);

            if (distinctGeneEnds.Count == 1)
            {
                return transcripts;
            }

            var distinctTranscriptEnds = transcripts.GetSet(x => x.End).ToArray();

            // without a single shared transcript end there is no anchor to measure against
            if (distinctTranscriptEnds.Length > 1)
            {
                return transcripts;
            }

            int nearestGeneEnd = GetClosest(distinctGeneEnds, distinctTranscriptEnds[0]);

            foreach (var candidate in transcripts)
            {
                candidate.Gene.End = nearestGeneEnd;
            }

            logger.Log(transcripts[0].Id, "Normalized gene end");
            return transcripts.Unique();
        }
Esempio n. 17
0
        /// <summary>
        /// When the candidates disagree on the HGNC ID, discards the value -1 from the
        /// set of distinct IDs and assigns whichever remaining ID enumerates first to
        /// every transcript.
        /// </summary>
        /// <param name="transcripts">the candidate transcripts; their genes are updated in place</param>
        /// <param name="logger">receives an entry when the HGNC ID is normalized</param>
        /// <returns>
        /// the original list when it holds a single transcript or a single HGNC ID,
        /// otherwise the result of Unique() on the normalized transcripts
        /// </returns>
        public static List<MutableTranscript> FixHgncId(this List<MutableTranscript> transcripts, TranscriptMergerLogger logger)
        {
            if (transcripts.Count == 1)
            {
                return transcripts;
            }

            var distinctHgncIds = transcripts.GetSet(x => x.Gene.HgncId);

            if (distinctHgncIds.Count == 1)
            {
                return transcripts;
            }

            // -1 never wins over another ID; removing an absent value is a no-op
            distinctHgncIds.Remove(-1);

            int chosenHgncId = distinctHgncIds.First();

            foreach (var candidate in transcripts)
            {
                candidate.Gene.HgncId = chosenHgncId;
            }

            logger.Log(transcripts[0].Id, "Normalized HGNC ID");
            return transcripts.Unique();
        }
Esempio n. 18
0
        /// <summary>
        /// When the candidates disagree on the canonical flag, clears the flag on
        /// every transcript.
        /// </summary>
        /// <param name="transcripts">the candidate transcripts; their flag is updated in place</param>
        /// <param name="logger">receives an entry when the flag is normalized</param>
        /// <returns>
        /// the original list when it holds a single transcript or when all flags agree,
        /// otherwise the result of Unique() on the normalized transcripts
        /// </returns>
        public static List<MutableTranscript> FixCanonical(this List<MutableTranscript> transcripts, TranscriptMergerLogger logger)
        {
            if (transcripts.Count == 1)
            {
                return transcripts;
            }

            var distinctFlags = transcripts.GetSet(x => x.IsCanonical);

            if (distinctFlags.Count == 1)
            {
                return transcripts;
            }

            // on disagreement the flag is cleared for everyone
            foreach (var candidate in transcripts)
            {
                candidate.IsCanonical = false;
            }

            logger.Log(transcripts[0].Id, "Normalized canonical flag");
            return transcripts.Unique();
        }
Esempio n. 19
0
        /// <summary>
        /// When the candidates disagree on the gene symbol, assigns the GenBank symbol
        /// to every transcript if one of the candidates already uses it. In every other
        /// case the decision is delegated to UnsupervisedFixGeneSymbols.
        /// </summary>
        /// <param name="transcripts">the candidate transcripts; their genes are updated in place</param>
        /// <param name="logger">receives an entry when the symbol is normalized</param>
        /// <param name="idToGenbankEntry">GenBank entries keyed by transcript ID; may be null</param>
        /// <param name="transcriptId">the ID used to look up the GenBank entry</param>
        /// <returns>
        /// the original list when it holds a single transcript or a single symbol,
        /// otherwise the normalized transcripts as returned by Unique() or by the
        /// unsupervised fallback
        /// </returns>
        public static List<MutableTranscript> FixGeneSymbols(this List<MutableTranscript> transcripts, TranscriptMergerLogger logger,
            Dictionary<string, GenbankEntry> idToGenbankEntry, string transcriptId)
        {
            if (transcripts.Count == 1)
            {
                return transcripts;
            }

            var distinctSymbols = transcripts.GetSet(x => x.Gene.Symbol);

            if (distinctSymbols.Count == 1)
            {
                return transcripts;
            }

            // a missing symbol is never a candidate; removing an absent value is a no-op
            distinctSymbols.Remove(null);

            // fall back to the unsupervised choice when GenBank has no entry for this
            // transcript or proposes a symbol that none of the candidates uses
            if (idToGenbankEntry == null ||
                !idToGenbankEntry.TryGetValue(transcriptId, out var reference) ||
                !distinctSymbols.Contains(reference.Symbol))
            {
                return transcripts.UnsupervisedFixGeneSymbols(logger, distinctSymbols.ToList());
            }

            foreach (var candidate in transcripts)
            {
                candidate.Gene.Symbol = reference.Symbol;
            }

            logger.Log(transcripts[0].Id, "Normalized gene symbol");
            return transcripts.Unique();
        }