/// <summary>
/// Writes a log entry for every transcript whose exons and cDNA maps disagree:
/// either the two arrays differ in length, or (when the lengths match)
/// DiffExonsAndCdnaMaps reports a difference. No transcript is modified.
/// </summary>
/// <param name="transcripts">transcripts to inspect</param>
/// <param name="logger">receives one entry per detected inconsistency</param>
/// <param name="transcriptId">ID written with each log entry</param>
/// <returns>the list that was passed in</returns>
public static List<MutableTranscript> InvestigateInconsistentCdnaMaps(this List<MutableTranscript> transcripts,
    TranscriptMergerLogger logger, string transcriptId)
{
    for (var position = 0; position < transcripts.Count; position++)
    {
        var current = transcripts[position];
        string strand = current.Gene.OnReverseStrand ? "R" : "F";

        var numExons    = current.Exons.Length;
        var numCdnaMaps = current.CdnaMaps.Length;

        if (numExons != numCdnaMaps)
        {
            logger.Log(transcriptId,
                $"Found different exon & cDNA maps counts ({numExons} vs {numCdnaMaps}) (index: {position}, {strand})");
            continue;
        }

        // the coordinates are only compared when both arrays have the same length
        if (DiffExonsAndCdnaMaps(current.Exons, current.CdnaMaps))
        {
            logger.Log(transcriptId,
                $"Found different start/end coordinates between exons & cDNA maps. (index: {position}, {strand})");
        }
    }

    return transcripts;
}
/// <summary>
/// Makes all transcripts share one gene symbol source. GeneSymbolSource.Unknown is
/// ignored when deciding; the single remaining source is assigned to every transcript.
/// </summary>
/// <param name="transcripts">candidate transcripts (their genes are updated in place)</param>
/// <param name="logger">receives an entry when the source was normalized</param>
/// <returns>
/// the input list when there is a single transcript or a single source;
/// otherwise the result of Unique() on the updated transcripts
/// </returns>
/// <exception cref="NotImplementedException">
/// the number of sources left after discarding Unknown is not exactly one
/// </exception>
public static List<MutableTranscript> FixGeneSymbolSource(this List<MutableTranscript> transcripts,
    TranscriptMergerLogger logger)
{
    if (transcripts.Count == 1)
    {
        return transcripts;
    }

    var distinctSources = transcripts.GetSet(x => x.Gene.SymbolSource);
    if (distinctSources.Count == 1)
    {
        return transcripts;
    }

    // Unknown never wins; removing an absent value is a no-op
    distinctSources.Remove(GeneSymbolSource.Unknown);

    if (distinctSources.Count != 1)
    {
        throw new NotImplementedException("Cannot handle multiple gene symbol sources at this time");
    }

    var chosenSource = distinctSources.First();
    transcripts.ForEach(t => t.Gene.SymbolSource = chosenSource);

    logger.Log(transcripts[0].Id, "Normalized gene symbol source");
    return transcripts.Unique();
}
/// <summary>
/// Groups the transcripts by the combination of ID, start, and end, merges each
/// group down to a single transcript, and returns the merged transcripts sorted by
/// start and then by end.
/// </summary>
/// <param name="logger">receives the merge decisions</param>
/// <param name="transcripts">transcripts to merge</param>
/// <param name="idToGenbankEntry">GenBank entries by transcript ID, passed through to the per-group merge</param>
/// <returns>one merged transcript per group</returns>
public static List<MutableTranscript> Merge(TranscriptMergerLogger logger, IEnumerable<MutableTranscript> transcripts,
    Dictionary<string, GenbankEntry> idToGenbankEntry)
{
    var transcriptGroups = transcripts.GetMultiValueDict(x => $"{x.Id}|{x.Start}|{x.End}");

    var merged = new List<MutableTranscript>();
    foreach (var transcriptGroup in transcriptGroups)
    {
        merged.Add(Merge(logger, transcriptGroup.Value, idToGenbankEntry));
    }

    var sorted = from transcript in merged
                 orderby transcript.Start, transcript.End
                 select transcript;

    return sorted.ToList();
}
/// <summary>
/// Assigns one gene symbol to every transcript without consulting GenBank. The first
/// symbol that is neither null/empty nor prefixed with "LOC" is preferred; when no
/// such symbol exists, the first entry of <paramref name="symbols"/> is used.
/// </summary>
/// <param name="transcripts">candidate transcripts (their genes are updated in place)</param>
/// <param name="logger">receives an entry describing the normalization</param>
/// <param name="symbols">candidate gene symbols; must contain at least one entry</param>
/// <returns>the result of Unique() on the updated transcripts</returns>
private static List<MutableTranscript> UnsupervisedFixGeneSymbols(this IReadOnlyList<MutableTranscript> transcripts,
    TranscriptMergerLogger logger, List<string> symbols)
{
    // "LOC" is an identifier prefix, not natural-language text, so the comparison
    // is ordinal: the result must not depend on the current culture
    string preferred = symbols.Find(x => !string.IsNullOrEmpty(x) && !x.StartsWith("LOC", StringComparison.Ordinal));

    // a match is never null, so null means that every candidate was empty or LOC-prefixed
    string symbol = preferred ?? symbols[0];

    foreach (var transcript in transcripts)
    {
        transcript.Gene.Symbol = symbol;
    }

    logger.Log(transcripts[0].Id, "Normalized gene symbol (unsupervised)");
    return transcripts.Unique();
}
/// <summary>
/// Drops the transcripts whose BamEditStatus is "failed". If every transcript failed,
/// nothing is dropped and the input list is returned.
/// </summary>
/// <param name="transcripts">candidate transcripts</param>
/// <param name="logger">receives an entry whenever the filtered list is returned</param>
/// <returns>
/// the input list when it holds a single transcript or no transcript survives;
/// otherwise the result of Unique() on the surviving transcripts
/// </returns>
public static List<MutableTranscript> RemoveFailedTranscripts(this List<MutableTranscript> transcripts,
    TranscriptMergerLogger logger)
{
    if (transcripts.Count == 1)
    {
        return transcripts;
    }

    var survivors = transcripts.FindAll(candidate => candidate.BamEditStatus != "failed");

    // never filter down to nothing
    if (survivors.Count == 0)
    {
        return transcripts;
    }

    logger.Log(transcripts[0].Id, "Filtered transcripts with failed BAM status.");
    return survivors.Unique();
}
/// <summary>
/// Keeps the transcripts that either carry RNA edits (RnaEdits is not null) or have a
/// BamEditStatus of "ok". If no transcript qualifies, the input list is returned.
/// </summary>
/// <param name="transcripts">candidate transcripts</param>
/// <param name="logger">receives an entry whenever the filtered list is returned</param>
/// <returns>
/// the input list when it holds a single transcript or no transcript qualifies;
/// otherwise the result of Unique() on the qualifying transcripts
/// </returns>
public static List<MutableTranscript> ChooseEditedTranscripts(this List<MutableTranscript> transcripts,
    TranscriptMergerLogger logger)
{
    if (transcripts.Count == 1)
    {
        return transcripts;
    }

    var edited = new List<MutableTranscript>();

    foreach (var candidate in transcripts)
    {
        if (candidate.RnaEdits != null || candidate.BamEditStatus == "ok")
        {
            edited.Add(candidate);
        }
    }

    // never filter down to nothing
    if (edited.Count == 0)
    {
        return transcripts;
    }

    logger.Log(transcripts[0].Id, "Filtered transcripts without RNA edits or BAM edit status");
    return edited.Unique();
}
/// <summary>
/// Applies hand-curated tie-breakers for a small set of transcript IDs that the generic
/// merge steps cannot resolve. Any other transcript ID is returned untouched.
/// </summary>
/// <param name="transcripts">candidate transcripts</param>
/// <param name="logger">receives an entry whenever a tie-breaker selected transcripts</param>
/// <param name="transcriptId">transcript ID used to choose the tie-breaker and to tag the log entry</param>
/// <returns>
/// the input list when it holds a single transcript, the ID has no tie-breaker, or the
/// tie-breaker matches nothing; otherwise the result of Unique() on the matching transcripts
/// </returns>
public static List<MutableTranscript> PickSpecificTranscript(this List<MutableTranscript> transcripts,
    TranscriptMergerLogger logger, string transcriptId)
{
    if (transcripts.Count == 1)
    {
        return transcripts;
    }

    List<MutableTranscript> filteredTranscripts;
    string logMessage;

    switch (transcriptId)
    {
        case "NM_001005786":
            // the length guard keeps a candidate with fewer than 10 cDNA maps from
            // throwing IndexOutOfRangeException; such a candidate simply does not match
            filteredTranscripts = transcripts
                .Where(transcript => transcript.CdnaMaps.Length > 9 && transcript.CdnaMaps[9].Start == 25419007)
                .ToList();
            logMessage = $"Filtered on exon 9 start: {transcriptId}";
            break;

        case "NM_001278597":
        case "NM_001278596":
            filteredTranscripts = transcripts
                .Where(transcript => transcript.CdnaMaps.Length == 26)
                .ToList();
            logMessage = $"Filtered on exon count (26): {transcriptId}";
            break;

        case "NM_016152":
            // same reasoning: a candidate without exons cannot match on the first exon's phase
            filteredTranscripts = transcripts
                .Where(transcript => transcript.Exons.Length > 0 && transcript.Exons[0].Phase == 0)
                .ToList();
            logMessage = $"Filtered on exon phase (0): {transcriptId}";
            break;

        default:
            return transcripts;
    }

    // never filter down to nothing
    if (filteredTranscripts.Count == 0)
    {
        return transcripts;
    }

    logger.Log(transcriptId, logMessage);
    return filteredTranscripts.Unique();
}
/// <summary>
/// Keeps only the transcripts that carry the highest version number.
/// </summary>
/// <remarks>
/// When more than one version is present, the lower-versioned transcripts are removed
/// from <paramref name="transcripts"/> itself (the input list is modified in place).
/// </remarks>
/// <param name="transcripts">candidate transcripts</param>
/// <param name="logger">receives an entry when lower versions were removed</param>
/// <returns>
/// the input list when it holds a single transcript or a single version;
/// otherwise the result of Unique() on the pruned input list
/// </returns>
public static List<MutableTranscript> RemoveTranscriptsWithLowestVersion(this List<MutableTranscript> transcripts,
    TranscriptMergerLogger logger)
{
    if (transcripts.Count == 1)
    {
        return transcripts;
    }

    var transcriptsByVersion = transcripts.GetMultiValueDict(x => x.Version);
    if (transcriptsByVersion.Count == 1)
    {
        return transcripts;
    }

    byte newestVersion = transcriptsByVersion.Keys.Max();

    // prune the caller's list directly; at least the newest-version transcripts remain
    transcripts.RemoveAll(candidate => candidate.Version != newestVersion);

    logger.Log(transcripts[0].Id, "Filtered transcripts with lower versions");
    return transcripts.Unique();
}
/// <summary>
/// When the transcripts disagree on the coding region cDNA end, keeps the ones whose
/// cDNA end equals the coding region end of the matching GenBank entry.
/// </summary>
/// <param name="transcripts">candidate transcripts</param>
/// <param name="logger">receives an entry when transcripts were selected</param>
/// <param name="idToGenbankEntry">GenBank entries by transcript ID (may be null)</param>
/// <param name="transcriptId">key used to look up the GenBank entry</param>
/// <returns>
/// the input list when it holds a single transcript, no GenBank entry is available,
/// all transcripts share one cDNA end, or no transcript has the GenBank coding region
/// end; otherwise the result of Unique() on the matching transcripts
/// </returns>
public static List<MutableTranscript> FixCodingRegionCdnaEnd(this List<MutableTranscript> transcripts,
    TranscriptMergerLogger logger, IReadOnlyDictionary<string, GenbankEntry> idToGenbankEntry, string transcriptId)
{
    if (transcripts.Count == 1 || idToGenbankEntry == null)
    {
        return transcripts;
    }

    if (!idToGenbankEntry.TryGetValue(transcriptId, out var genbankEntry))
    {
        return transcripts;
    }

    var transcriptsByCdnaEnd = transcripts.GetMultiValueDict(x => x.CodingRegion.CdnaEnd);

    // the GenBank coding region is only consulted when the transcripts actually disagree
    if (transcriptsByCdnaEnd.Count != 1 &&
        transcriptsByCdnaEnd.TryGetValue(genbankEntry.CodingRegion.End, out var matchingTranscripts))
    {
        logger.Log(transcripts[0].Id, "Filtered transcripts by coding region cDNA end");
        return matchingTranscripts.Unique();
    }

    return transcripts;
}
/// <summary>
/// Makes all transcripts share one gene ID without consulting GenBank: the first gene
/// ID enumerated from the set of distinct IDs is assigned to every transcript.
/// </summary>
/// <param name="transcripts">candidate transcripts (their genes are updated in place)</param>
/// <param name="logger">receives an entry when the gene ID was normalized</param>
/// <returns>
/// the input list when it holds a single transcript or a single gene ID;
/// otherwise the result of Unique() on the updated transcripts
/// </returns>
public static List<MutableTranscript> UnsupervisedFixGeneId(this List<MutableTranscript> transcripts,
    TranscriptMergerLogger logger)
{
    if (transcripts.Count == 1)
    {
        return transcripts;
    }

    var distinctGeneIds = transcripts.GetSet(x => x.Gene.GeneId);
    if (distinctGeneIds.Count == 1)
    {
        return transcripts;
    }

    string chosenGeneId = distinctGeneIds.First();
    transcripts.ForEach(t => t.Gene.GeneId = chosenGeneId);

    logger.Log(transcripts[0].Id, "Normalized gene ID (unsupervised)");
    return transcripts.Unique();
}
/// <summary>
/// Reduces a group of transcripts to a single transcript. The group is first passed
/// through Unique() and checked for exon/cDNA map inconsistencies; a group of one is
/// then returned as is, while larger groups run through the filtering and
/// normalization steps below, in that order.
/// </summary>
/// <param name="logger">receives the decisions taken by the individual steps</param>
/// <param name="transcripts">the transcripts of one group; must not be empty</param>
/// <param name="idToGenbankEntry">GenBank entries by transcript ID, used by the supervised steps</param>
/// <returns>the single remaining transcript</returns>
/// <exception cref="NotImplementedException">more or fewer than one transcript remains after all steps</exception>
private static MutableTranscript Merge(TranscriptMergerLogger logger, IReadOnlyList<MutableTranscript> transcripts,
    Dictionary<string, GenbankEntry> idToGenbankEntry)
{
    string transcriptId = transcripts[0].Id;
    bool hasSingleTranscript = transcripts.Count == 1;

    // both the single- and the multi-transcript path start with the same inspection
    var candidates = transcripts
        .Unique()
        .InvestigateInconsistentCdnaMaps(logger, transcriptId);

    if (hasSingleTranscript)
    {
        return transcripts[0];
    }

    // step 1: discard candidates
    var survivors = candidates
        .RemoveFailedTranscripts(logger)
        .ChooseEditedTranscripts(logger)
        .RemoveTranscriptsWithLowestVersion(logger)
        .FixCodingRegionCdnaStart(logger, idToGenbankEntry, transcriptId)
        .FixCodingRegionCdnaEnd(logger, idToGenbankEntry, transcriptId);

    // step 2: normalize the fields that still differ, then apply the curated tie-breakers
    var remaining = survivors
        .FixGeneSymbolSource(logger)
        .FixBioType(logger)
        .FixGeneId(logger, idToGenbankEntry, transcriptId)
        .FixCanonical(logger)
        .FixHgncId(logger)
        .FixGeneStart(logger)
        .FixGeneEnd(logger)
        .FixGeneSymbols(logger, idToGenbankEntry, transcriptId)
        .UnsupervisedFixGeneId(logger)
        .PickSpecificTranscript(logger, transcriptId);

    if (remaining.Count != 1)
    {
        throw new NotImplementedException($"Could not merge down to one transcript: {remaining.Count} transcripts ({transcriptId})");
    }

    return remaining[0];
}
/// <summary>
/// Creates the pre-cache files: for every reference sequence that has a VEP directory,
/// the dump is parsed, transcripts and regulatory regions are merged, and the results
/// are written to the transcript, regulatory, SIFT, and PolyPhen outputs. Reference
/// sequences without a VEP directory only get empty SIFT and PolyPhen entries.
/// </summary>
/// <returns>ExitCodes.Success once all reference sequences have been handled</returns>
private static ExitCodes ProgramExecution()
{
    var source          = GetSource(_transcriptSource);
    var referenceReader = new CompressedSequenceReader(FileUtilities.GetReadStream(_inputReferencePath));
    var vepRoot         = new VepRootDirectory(referenceReader.RefNameToChromosome);
    var vepDirByRef     = vepRoot.GetRefIndexToVepDir(_inputVepDirectory);
    var assembly        = GenomeAssemblyHelper.Convert(_genomeAssembly);
    long releaseTicks   = DateTime.Parse(_vepReleaseDate).Ticks;
    var genbankById     = GetIdToGenbank(assembly, source);

    // =========================
    // create the pre-cache file
    // =========================

    int numRefSeqs = referenceReader.NumRefSeqs;
    var header     = new IntermediateIoHeader(_vepVersion, releaseTicks, source, assembly, numRefSeqs);

    string siftPath       = _outputStub + ".sift.gz";
    string polyphenPath   = _outputStub + ".polyphen.gz";
    string transcriptPath = _outputStub + ".transcripts.gz";
    string regulatoryPath = _outputStub + ".regulatory.gz";

    using (var mergeLogger = new TranscriptMergerLogger(FileUtilities.GetCreateStream(_outputStub + ".merge_transcripts.log")))
    using (var siftWriter = new PredictionWriter(GZipUtilities.GetStreamWriter(siftPath), header, IntermediateIoCommon.FileType.Sift))
    using (var polyphenWriter = new PredictionWriter(GZipUtilities.GetStreamWriter(polyphenPath), header, IntermediateIoCommon.FileType.Polyphen))
    using (var transcriptWriter = new MutableTranscriptWriter(GZipUtilities.GetStreamWriter(transcriptPath), header))
    using (var regulatoryWriter = new RegulatoryRegionWriter(GZipUtilities.GetStreamWriter(regulatoryPath), header))
    {
        var cacheParser   = new VepCacheParser(source);
        var noPredictions = new Dictionary<string, List<int>>();

        // process each VEP directory
        for (ushort refIndex = 0; refIndex < numRefSeqs; refIndex++)
        {
            var chromosome = referenceReader.RefIndexToChromosome[refIndex];

            if (refIndexToVepDirTryGet(vepDirByRef, refIndex, out string vepSubDir))
            {
                Console.WriteLine("Parsing reference sequence [{0}]:", chromosome.UcscName);

                var rawData   = cacheParser.ParseDumpDirectory(chromosome, vepSubDir);
                var merged    = TranscriptMerger.Merge(mergeLogger, rawData.Transcripts, genbankById);
                var regulatory = RegulatoryRegionMerger.Merge(rawData.RegulatoryRegions);

                int numRawTranscripts    = rawData.Transcripts.Count;
                int numMergedTranscripts = merged.Count;
                Console.WriteLine($"- # merged transcripts: {numMergedTranscripts}, # total transcripts: {numRawTranscripts}");

                WriteTranscripts(transcriptWriter, merged);
                WriteRegulatoryRegions(regulatoryWriter, regulatory);
                WritePredictions(siftWriter, merged, x => x.SiftData, chromosome);
                WritePredictions(polyphenWriter, merged, x => x.PolyphenData, chromosome);
            }
            else
            {
                // no VEP data for this reference sequence: emit empty prediction entries
                siftWriter.Write(chromosome, noPredictions);
                polyphenWriter.Write(chromosome, noPredictions);
            }
        }
    }

    Console.WriteLine("\n{0} directories processed.", vepDirByRef.Count);
    return ExitCodes.Success;

    // thin wrapper so the loop reads as a positive test; forwards to TryGetValue unchanged
    bool refIndexToVepDirTryGet<TDict>(TDict lookup, ushort index, out string subDir)
        where TDict : IReadOnlyDictionary<ushort, string>
        => lookup.TryGetValue(index, out subDir);
}
/// <summary>
/// When exactly two distinct biotypes are present, asks GetDesiredBioType which one to
/// keep and assigns it to every transcript. A result of BioType.other leaves the
/// transcripts unchanged.
/// </summary>
/// <param name="transcripts">candidate transcripts (updated in place)</param>
/// <param name="logger">receives an entry when the biotype was normalized</param>
/// <returns>
/// the input list when nothing was changed; otherwise the result of Unique() on the
/// updated transcripts
/// </returns>
public static List<MutableTranscript> FixBioType(this List<MutableTranscript> transcripts,
    TranscriptMergerLogger logger)
{
    if (transcripts.Count == 1)
    {
        return transcripts;
    }

    var distinctBiotypes = transcripts.GetSet(x => x.BioType);

    // only a two-way disagreement is handled
    if (distinctBiotypes.Count != 2)
    {
        return transcripts;
    }

    var desired = GetDesiredBioType(distinctBiotypes);
    if (desired == BioType.other)
    {
        return transcripts;
    }

    transcripts.ForEach(t => t.BioType = desired);

    logger.Log(transcripts[0].Id, "Normalized biotype");
    return transcripts.Unique();
}
/// <summary>
/// Writes one line to the merge log: the transcript ID, a tab, and the description.
/// </summary>
/// <param name="logger">log to write to</param>
/// <param name="transcriptId">transcript the entry refers to</param>
/// <param name="description">what happened</param>
private static void Log(this TranscriptMergerLogger logger, string transcriptId, string description)
{
    string line = transcriptId + "\t" + description;
    logger.WriteLine(line);
}
/// <summary>
/// When the transcripts disagree on the gene ID, assigns the gene ID of the matching
/// GenBank entry to every transcript.
/// </summary>
/// <param name="transcripts">candidate transcripts (their genes are updated in place)</param>
/// <param name="logger">receives an entry when the gene ID was normalized</param>
/// <param name="idToGenbankEntry">GenBank entries by transcript ID (may be null)</param>
/// <param name="transcriptId">key used to look up the GenBank entry</param>
/// <returns>
/// the input list when it holds a single transcript, no GenBank entry is available, or
/// all transcripts already share one gene ID; otherwise the result of Unique() on the
/// updated transcripts
/// </returns>
/// <exception cref="InvalidDataException">none of the transcripts uses the GenBank gene ID</exception>
public static List<MutableTranscript> FixGeneId(this List<MutableTranscript> transcripts,
    TranscriptMergerLogger logger, Dictionary<string, GenbankEntry> idToGenbankEntry, string transcriptId)
{
    if (transcripts.Count == 1 || idToGenbankEntry == null)
    {
        return transcripts;
    }

    if (!idToGenbankEntry.TryGetValue(transcriptId, out var genbankEntry))
    {
        return transcripts;
    }

    var distinctGeneIds = transcripts.GetSet(x => x.Gene.GeneId);
    if (distinctGeneIds.Count == 1)
    {
        return transcripts;
    }

    string genbankGeneId = genbankEntry.GeneId;

    if (!distinctGeneIds.Contains(genbankGeneId))
    {
        throw new InvalidDataException($"Could not find the Genbank gene ID ({genbankGeneId}) within the transcripts.");
    }

    transcripts.ForEach(t => t.Gene.GeneId = genbankGeneId);

    logger.Log(transcripts[0].Id, "Normalized gene ID");
    return transcripts.Unique();
}
/// <summary>
/// When the transcripts disagree on the gene end but share one transcript end, assigns
/// to every gene the gene end that GetClosest selects for that transcript end.
/// </summary>
/// <param name="transcripts">candidate transcripts (their genes are updated in place)</param>
/// <param name="logger">receives an entry when the gene end was normalized</param>
/// <returns>
/// the input list when it holds a single transcript, all gene ends already agree, or
/// the transcript ends differ; otherwise the result of Unique() on the updated transcripts
/// </returns>
public static List<MutableTranscript> FixGeneEnd(this List<MutableTranscript> transcripts,
    TranscriptMergerLogger logger)
{
    if (transcripts.Count == 1)
    {
        return transcripts;
    }

    var distinctGeneEnds = transcripts.GetSet(x => x.Gene.End);
    if (distinctGeneEnds.Count == 1)
    {
        return transcripts;
    }

    // a single shared transcript end is needed as the reference position
    var distinctTranscriptEnds = transcripts.GetSet(x => x.End);
    if (distinctTranscriptEnds.Count > 1)
    {
        return transcripts;
    }

    int chosenGeneEnd = GetClosest(distinctGeneEnds, distinctTranscriptEnds.First());
    transcripts.ForEach(t => t.Gene.End = chosenGeneEnd);

    logger.Log(transcripts[0].Id, "Normalized gene end");
    return transcripts.Unique();
}
/// <summary>
/// Makes all transcripts share one HGNC ID. The value -1 is discarded from the distinct
/// IDs, and the first remaining ID is assigned to every transcript.
/// </summary>
/// <param name="transcripts">candidate transcripts (their genes are updated in place)</param>
/// <param name="logger">receives an entry when the HGNC ID was normalized</param>
/// <returns>
/// the input list when it holds a single transcript or a single HGNC ID;
/// otherwise the result of Unique() on the updated transcripts
/// </returns>
public static List<MutableTranscript> FixHgncId(this List<MutableTranscript> transcripts,
    TranscriptMergerLogger logger)
{
    if (transcripts.Count == 1)
    {
        return transcripts;
    }

    var distinctHgncIds = transcripts.GetSet(x => x.Gene.HgncId);
    if (distinctHgncIds.Count == 1)
    {
        return transcripts;
    }

    // -1 never wins; removing an absent value is a no-op
    distinctHgncIds.Remove(-1);

    int chosenHgncId = distinctHgncIds.First();
    transcripts.ForEach(t => t.Gene.HgncId = chosenHgncId);

    logger.Log(transcripts[0].Id, "Normalized HGNC ID");
    return transcripts.Unique();
}
/// <summary>
/// When the transcripts disagree on the canonical flag, clears the flag on all of them.
/// </summary>
/// <param name="transcripts">candidate transcripts (updated in place)</param>
/// <param name="logger">receives an entry when the flag was normalized</param>
/// <returns>
/// the input list when it holds a single transcript or the flags already agree;
/// otherwise the result of Unique() on the updated transcripts
/// </returns>
public static List<MutableTranscript> FixCanonical(this List<MutableTranscript> transcripts,
    TranscriptMergerLogger logger)
{
    if (transcripts.Count == 1)
    {
        return transcripts;
    }

    var distinctFlags = transcripts.GetSet(x => x.IsCanonical);
    if (distinctFlags.Count == 1)
    {
        return transcripts;
    }

    transcripts.ForEach(t => t.IsCanonical = false);

    logger.Log(transcripts[0].Id, "Normalized canonical flag");
    return transcripts.Unique();
}
/// <summary>
/// When the transcripts disagree on the gene symbol, assigns the symbol of the matching
/// GenBank entry to every transcript, provided one of the transcripts already uses it.
/// Otherwise the decision is delegated to UnsupervisedFixGeneSymbols.
/// </summary>
/// <param name="transcripts">candidate transcripts (their genes are updated in place)</param>
/// <param name="logger">receives an entry when the symbol was normalized</param>
/// <param name="idToGenbankEntry">GenBank entries by transcript ID (may be null)</param>
/// <param name="transcriptId">key used to look up the GenBank entry</param>
/// <returns>
/// the input list when it holds a single transcript or a single symbol; otherwise the
/// result of Unique() on the updated transcripts, or of the unsupervised fallback
/// </returns>
public static List<MutableTranscript> FixGeneSymbols(this List<MutableTranscript> transcripts,
    TranscriptMergerLogger logger, Dictionary<string, GenbankEntry> idToGenbankEntry, string transcriptId)
{
    if (transcripts.Count == 1)
    {
        return transcripts;
    }

    var distinctSymbols = transcripts.GetSet(x => x.Gene.Symbol);
    if (distinctSymbols.Count == 1)
    {
        return transcripts;
    }

    // a missing symbol is never a candidate; removing an absent value is a no-op
    distinctSymbols.Remove(null);

    bool useGenbankSymbol = false;
    string genbankSymbol = null;

    if (idToGenbankEntry != null && idToGenbankEntry.TryGetValue(transcriptId, out var genbankEntry))
    {
        genbankSymbol    = genbankEntry.Symbol;
        useGenbankSymbol = distinctSymbols.Contains(genbankSymbol);
    }

    // no GenBank entry, or its symbol is not among the candidates
    if (!useGenbankSymbol)
    {
        return transcripts.UnsupervisedFixGeneSymbols(logger, distinctSymbols.ToList());
    }

    transcripts.ForEach(t => t.Gene.Symbol = genbankSymbol);

    logger.Log(transcripts[0].Id, "Normalized gene symbol");
    return transcripts.Unique();
}