/// <summary>
/// Clones the seed gene and widens the clone so that it covers every compatible gene in the list.
/// Each gene that is absorbed is flagged as Invalid so that it is skipped on later passes.
/// </summary>
/// <param name="seedGene">the gene that is cloned to start the flattened gene</param>
/// <param name="genesWithSameGeneId">the candidate genes to absorb</param>
/// <param name="overlapStart">start of the fixed overlap interval, or -1 to use the flattened gene's current interval</param>
/// <param name="overlapEnd">end of the fixed overlap interval, or -1 to use the flattened gene's current interval</param>
/// <returns>the flattened clone of the seed gene</returns>
private static MutableGene GetFlattenedGene(MutableGene seedGene, List<MutableGene> genesWithSameGeneId, int overlapStart, int overlapEnd)
{
    var flattened = MutableGene.Clone(seedGene);

    // the fixed interval is only used when both of its ends were supplied
    bool hasFixedInterval = overlapStart != -1 && overlapEnd != -1;

    foreach (var candidate in genesWithSameGeneId)
    {
        bool isCompatible = !candidate.Invalid
                            && flattened.OnReverseStrand == candidate.OnReverseStrand
                            && flattened.ReferenceIndex == candidate.ReferenceIndex;

        if (!isCompatible)
        {
            continue;
        }

        // without a fixed interval, the comparison interval grows as genes are absorbed
        int intervalStart = hasFixedInterval ? overlapStart : flattened.Start;
        int intervalEnd = hasFixedInterval ? overlapEnd : flattened.End;

        if (!Overlap.Partial(intervalStart, intervalEnd, candidate.Start, candidate.End))
        {
            continue;
        }

        UpdateCoordinates(candidate, flattened);
        candidate.Invalid = true;
    }

    return flattened;
}
/// <summary>
/// Collects the genes that are still valid, share the seed gene's strand and reference index,
/// and partially overlap the running [start, end] interval. The interval begins as the seed
/// gene's coordinates and is widened by every gene that is accepted.
/// </summary>
/// <param name="seedGene">the gene that supplies the initial interval, strand and reference index</param>
/// <param name="genes">the candidate genes, evaluated in list order</param>
/// <param name="start">receives the smallest start seen among the seed and the accepted genes</param>
/// <param name="end">receives the largest end seen among the seed and the accepted genes</param>
/// <returns>the accepted genes in their original order</returns>
private List<MutableGene> GetValidGenes(MutableGene seedGene, List<MutableGene> genes, out int start, out int end)
{
    start = seedGene.Start;
    end = seedGene.End;

    var accepted = new List<MutableGene>();

    foreach (var candidate in genes)
    {
        if (candidate.Invalid)
        {
            continue;
        }

        if (seedGene.OnReverseStrand != candidate.OnReverseStrand)
        {
            continue;
        }

        if (seedGene.ReferenceIndex != candidate.ReferenceIndex)
        {
            continue;
        }

        // the interval used here already includes the genes accepted so far
        if (!Overlap.Partial(start, end, candidate.Start, candidate.End))
        {
            continue;
        }

        accepted.Add(candidate);

        if (candidate.Start < start)
        {
            start = candidate.Start;
        }

        if (candidate.End > end)
        {
            end = candidate.End;
        }
    }

    return accepted;
}
/// <summary>
/// Merges the Ensembl and RefSeq genes that overlap the seed gene. The valid genes are split by
/// data source and flattened over the shared overlap interval. Each flattened Ensembl gene is
/// then paired with the flattened RefSeq gene whose ID is linked to it. Pairs are added to
/// _mergedGenes as a combined gene; everything that cannot be paired is added as an orphan.
/// </summary>
/// <param name="seedGene">the gene that defines the initial overlap interval</param>
/// <param name="genesWithSameSymbol">the candidate genes to merge</param>
private void MergesGenesWithSameSymbol(MutableGene seedGene, List<MutableGene> genesWithSameSymbol)
{
    var validGenes = GetValidGenes(seedGene, genesWithSameSymbol, out int overlapStart, out int overlapEnd);

    var ensemblGenes = GeneUtilities.GetGenesByDataSource(validGenes, TranscriptDataSource.Ensembl);
    var refSeqGenes = GeneUtilities.GetGenesByDataSource(validGenes, TranscriptDataSource.RefSeq);

    var flatEnsemblGenes = new GeneFlattener(ensemblGenes, "Ensembl", false).Flatten(overlapStart, overlapEnd);
    var flatRefSeqGenes = new GeneFlattener(refSeqGenes, "RefSeq", false).Flatten(overlapStart, overlapEnd);

    foreach (var ensemblGene in flatEnsemblGenes)
    {
        // look up the RefSeq partner; it stays null when there is no link or no matching gene
        bool isLinked = _linkedEnsemblIds.TryGetValue(ensemblGene.EnsemblId.ToString(), out string linkedEntrezId);
        var refSeqGene = isLinked ? GeneUtilities.GetRefSeqGeneById(flatRefSeqGenes, linkedEntrezId) : null;

        if (refSeqGene == null)
        {
            // add the unused Ensembl genes
            AddEnsemblOrphan(ensemblGene);
            continue;
        }

        // merge the Ensembl and RefSeq gene
        var mergedGene = MutableGene.Clone(ensemblGene);
        mergedGene.TranscriptDataSource = TranscriptDataSource.BothRefSeqAndEnsembl;
        UpdateCoordinates(refSeqGene, mergedGene);

        // only borrow the RefSeq HGNC ID when the Ensembl gene has none
        if (mergedGene.HgncId == -1 && refSeqGene.HgncId != -1)
        {
            mergedGene.HgncId = refSeqGene.HgncId;
        }

        mergedGene.EntrezGeneId = refSeqGene.EntrezGeneId;
        _mergedGenes.Add(mergedGene);

        refSeqGene.Invalid = true;
        ensemblGene.Invalid = true;
        _numMergedGenes++;
    }

    // add the unused RefSeq genes
    foreach (var refSeqGene in flatRefSeqGenes)
    {
        if (!refSeqGene.Invalid)
        {
            AddRefSeqOrphan(refSeqGene);
        }
    }
}
/// <summary>
/// Widens the destination gene so that its [Start, End] interval also covers the source gene.
/// The destination is only written to when the source extends beyond it.
/// </summary>
/// <param name="source">the gene whose coordinates are read</param>
/// <param name="dest">the gene whose coordinates may be extended</param>
private static void UpdateCoordinates(MutableGene source, MutableGene dest)
{
    bool startsEarlier = source.Start < dest.Start;
    if (startsEarlier)
    {
        dest.Start = source.Start;
    }

    bool endsLater = source.End > dest.End;
    if (endsLater)
    {
        dest.End = source.End;
    }
}
/// <summary>
/// Compares two genes field by field: chromosome index, start, end, strand, gene ID, symbol,
/// HGNC ID and symbol source.
/// </summary>
/// <param name="x">the first gene</param>
/// <param name="y">the second gene</param>
/// <returns>true when all of the compared fields are equal</returns>
private static bool GeneEquals(MutableGene x, MutableGene y)
{
    if (x.Chromosome.Index != y.Chromosome.Index)
    {
        return false;
    }

    if (x.Start != y.Start || x.End != y.End)
    {
        return false;
    }

    if (x.OnReverseStrand != y.OnReverseStrand)
    {
        return false;
    }

    if (x.GeneId != y.GeneId || x.Symbol != y.Symbol)
    {
        return false;
    }

    if (x.HgncId != y.HgncId)
    {
        return false;
    }

    return x.SymbolSource == y.SymbolSource;
}
/// <summary>
/// Combines an Ensembl gene and a RefSeq gene into a single <see cref="UgaGene"/>. The gene whose
/// ID starts with "ENSG" is treated as the Ensembl gene; when geneA's ID does not, geneB is
/// treated as the Ensembl gene. The chromosome, strand, symbol and HGNC ID are taken from the
/// Ensembl gene, and the interval spans both genes.
/// </summary>
/// <param name="geneA">one of the two genes to merge</param>
/// <param name="geneB">the other gene to merge</param>
/// <param name="isGrch37">when true the merged interval is stored as the GRCh37 interval, otherwise as the GRCh38 interval</param>
/// <returns>the merged gene; the interval of the other genome assembly is null</returns>
/// <exception cref="InvalidDataException">thrown when the genes are on different chromosomes or strands</exception>
private static UgaGene GetMergedGene(MutableGene geneA, MutableGene geneB, bool isGrch37)
{
    // gene IDs are machine identifiers, so the prefix is compared ordinally rather than culture-sensitively
    bool geneAIsEnsembl = geneA.GeneId.StartsWith("ENSG", StringComparison.Ordinal);

    MutableGene ensemblGene = geneAIsEnsembl ? geneA : geneB;
    MutableGene refSeqGene = geneAIsEnsembl ? geneB : geneA;

    if (ensemblGene.Chromosome.Index != refSeqGene.Chromosome.Index)
    {
        throw new InvalidDataException($"The two genes are on different chromosomes: {geneA.GeneId} & {geneB.GeneId}");
    }

    if (ensemblGene.OnReverseStrand != refSeqGene.OnReverseStrand)
    {
        throw new InvalidDataException($"Both genes do not have the same orientation: {geneA.GeneId} & {geneB.GeneId}");
    }

    IInterval interval = GetMergedInterval(ensemblGene, refSeqGene);

    // only the interval for the requested genome assembly is populated
    IInterval grch37 = isGrch37 ? interval : null;
    IInterval grch38 = isGrch37 ? null : interval;

    return new UgaGene(ensemblGene.Chromosome, grch37, grch38, ensemblGene.OnReverseStrand, refSeqGene.GeneId,
        ensemblGene.GeneId, ensemblGene.Symbol, ensemblGene.HgncId);
}
/// <summary>
/// Builds a pipe-delimited key from the gene ID, UCSC chromosome name, start, end and strand
/// ('R' for the reverse strand, 'F' otherwise).
/// </summary>
/// <param name="gene">the gene to describe</param>
/// <returns>the key string</returns>
private static string GetGeneKey(MutableGene gene)
{
    char strand = gene.OnReverseStrand ? 'R' : 'F';
    return $"{gene.GeneId}|{gene.Chromosome.UcscName}|{gene.Start}|{gene.End}|{strand}";
}
/// <summary>
/// Writes one tab-delimited "Gene" line: gene ID, UCSC chromosome name, chromosome index, start,
/// end, strand ('R' or 'F'), symbol, the symbol source as an integer, and the HGNC ID.
/// </summary>
/// <param name="writer">the writer that receives the line</param>
/// <param name="gene">the gene to serialize</param>
private static void WriteGene(TextWriter writer, MutableGene gene)
{
    var chromosome = gene.Chromosome;
    char strand = gene.OnReverseStrand ? 'R' : 'F';
    int symbolSource = (int)gene.SymbolSource;

    string line = $"Gene\t{gene.GeneId}\t{chromosome.UcscName}\t{chromosome.Index}\t{gene.Start}\t{gene.End}\t{strand}\t{gene.Symbol}\t{symbolSource}\t{gene.HgncId}";
    writer.WriteLine(line);
}
/// <summary>
/// Adds the gene to _mergedGenes as it is, flags it as Invalid and increments the
/// RefSeq orphan counter.
/// </summary>
/// <param name="gene">the RefSeq gene to add</param>
private void AddRefSeqOrphan(MutableGene gene)
{
    _mergedGenes.Add(gene);

    // flag the gene so that it is not picked up again
    gene.Invalid = true;

    _numOrphanRefSeqGenes += 1;
}
/// <summary>
/// Adds the gene to _mergedGenes as it is, flags it as Invalid and increments the
/// Ensembl orphan counter.
/// </summary>
/// <param name="gene">the Ensembl gene to add</param>
private void AddEnsemblOrphan(MutableGene gene)
{
    _mergedGenes.Add(gene);

    // flag the gene so that it is not picked up again
    gene.Invalid = true;

    _numOrphanEnsemblGenes += 1;
}
/// <summary>
/// Creates the smallest interval that covers both genes: the lower of the two starts
/// through the higher of the two ends.
/// </summary>
/// <param name="geneA">the first gene</param>
/// <param name="geneB">the second gene</param>
/// <returns>the covering interval</returns>
private static IInterval GetMergedInterval(MutableGene geneA, MutableGene geneB)
{
    var mergedStart = Math.Min(geneA.Start, geneB.Start);
    var mergedEnd = Math.Max(geneA.End, geneB.End);
    return new Interval(mergedStart, mergedEnd);
}
/// <summary>
/// Builds a <see cref="MutableTranscript"/> from the key/value pairs of a dumped transcript object.
/// </summary>
/// <param name="objectValue">the transcript object whose Values are processed key by key</param>
/// <param name="chromosome">the chromosome passed to the exon and translation parsers and stored in the resulting gene and transcript</param>
/// <param name="source">passed through unchanged to the resulting transcript</param>
/// <returns>the transcript assembled from the parsed values</returns>
/// <exception cref="InvalidDataException">thrown when a key is missing from KnownKeys or is not handled by the switch</exception>
public static MutableTranscript Parse(ObjectValueNode objectValue, IChromosome chromosome, Source source)
{
    // transcript coordinates and flags
    int start = -1;
    int end = -1;
    bool isCanonical = false;
    var bioType = BioType.other;
    string bamEditStatus = null;

    // IDs
    string transcriptId = null;
    byte transcriptVersion = 1;
    string proteinId = null;
    byte proteinVersion = 0;
    string ccdsId = null;
    string refSeqId = null;

    // gene
    string geneId = null;
    int hgncId = -1;
    int geneStart = -1;
    int geneEnd = -1;
    bool geneOnReverseStrand = false;
    string geneSymbol = null;
    var geneSymbolSource = GeneSymbolSource.Unknown;

    // translation
    int translationStart = -1;
    int translationEnd = -1;
    MutableExon translationStartExon = null;
    MutableExon translationEndExon = null;
    int cdnaCodingStart = -1;
    int cdnaCodingEnd = -1;

    // attributes
    IInterval[] microRnas = null;
    IRnaEdit[] rnaEdits = null;
    bool cdsStartNotFound = false;
    bool cdsEndNotFound = false;

    // exons and the variant effect feature cache
    MutableExon[] exons = null;
    MutableTranscriptRegion[] cdnaMaps = null;
    IInterval[] introns = null;
    string peptideSequence = null;
    string translateableSequence = null;
    string siftData = null;
    string polyphenData = null;
    int[] selenocysteinePositions = null;

    foreach (var node in objectValue.Values)
    {
        // sanity check: every key must be one that we know about
        if (!KnownKeys.Contains(node.Key))
        {
            throw new InvalidDataException($"Encountered an unknown key in the dumper transcript object: {node.Key}");
        }

        switch (node.Key)
        {
            case ImportKeys.BamEditStatus:
                bamEditStatus = node.GetString();
                break;

            case ImportKeys.Attributes:
                (microRnas, rnaEdits, cdsStartNotFound, cdsEndNotFound) = Attribute.ParseList(node);
                break;

            case ImportKeys.Biotype:
                bioType = TranscriptUtilities.GetBiotype(node);
                break;

            case ImportKeys.Ccds:
                ccdsId = node.GetString();
                break;

            case ImportKeys.CdnaCodingEnd:
                cdnaCodingEnd = node.GetInt32();
                break;

            case ImportKeys.CdnaCodingStart:
                cdnaCodingStart = node.GetInt32();
                break;

            case ImportKeys.End:
                end = node.GetInt32();
                break;

            case ImportKeys.GeneHgncId:
                hgncId = node.GetHgncId();
                break;

            case ImportKeys.GeneSymbol:
            case ImportKeys.GeneHgnc: // older key
                geneSymbol = node.GetString();
                break;

            case ImportKeys.GeneSymbolSource:
                geneSymbolSource = GeneSymbolSourceHelper.GetGeneSymbolSource(node.GetString());
                break;

            case ImportKeys.Gene:
                (geneStart, geneEnd, geneId, geneOnReverseStrand) = ImportGene.Parse(node);
                break;

            case ImportKeys.IsCanonical:
                isCanonical = node.GetBool();
                break;

            case ImportKeys.Refseq:
                refSeqId = node.GetString();
                break;

            case ImportKeys.StableId:
                transcriptId = node.GetString();
                break;

            case ImportKeys.Start:
                start = node.GetInt32();
                break;

            case ImportKeys.TransExonArray:
                exons = ImportExon.ParseList(node, chromosome);
                break;

            case ImportKeys.Translation:
                (translationStart, translationEnd, proteinId, proteinVersion, translationStartExon,
                    translationEndExon) = ImportTranslation.Parse(node, chromosome);
                break;

            case ImportKeys.VariationEffectFeatureCache:
                (cdnaMaps, introns, peptideSequence, translateableSequence, siftData, polyphenData,
                    selenocysteinePositions) = ImportVariantEffectFeatureCache.Parse(node);
                break;

            case ImportKeys.Version:
                // NOTE(review): the cast narrows an Int32 to a byte; presumably versions never exceed 255 - confirm
                transcriptVersion = (byte)node.GetInt32();
                break;

            case ImportKeys.CodingRegionEnd:
            case ImportKeys.CodingRegionStart:
            case ImportKeys.CreatedDate:
            case ImportKeys.DbId:
            case ImportKeys.Description:
            case ImportKeys.DisplayXref:
            case ImportKeys.ExternalDb:
            case ImportKeys.ExternalDisplayName:
            case ImportKeys.ExternalName:
            case ImportKeys.ExternalStatus:
            case ImportKeys.GenePhenotype:
            case ImportKeys.GeneStableId:
            case ImportKeys.ModifiedDate:
            case ImportKeys.Protein:
            case ImportKeys.Slice:
            case ImportKeys.Source:
            case ImportKeys.Strand:
            case ImportKeys.SwissProt:
            case ImportKeys.Trembl:
            case ImportKeys.UniParc:
            case ImportKeys.VepLazyLoaded:
                // not used
                break;

            default:
                throw new InvalidDataException($"Unknown key found: {node.Key}");
        }
    }

    var transcriptAccession = AccessionUtilities.GetMaxVersion(transcriptId, transcriptVersion);
    var proteinAccession = AccessionUtilities.GetMaxVersion(proteinId, proteinVersion);

    var gene = new MutableGene(chromosome, geneStart, geneEnd, geneOnReverseStrand, geneSymbol, geneSymbolSource,
        geneId, hgncId);

    var codingStart = GetCodingRegionStart(geneOnReverseStrand, translationStartExon, translationEndExon,
        translationStart, translationEnd);
    var codingEnd = GetCodingRegionEnd(geneOnReverseStrand, translationStartExon, translationEndExon,
        translationStart, translationEnd);
    var codingRegion = new CodingRegion(codingStart, codingEnd, cdnaCodingStart, cdnaCodingEnd, 0);

    int totalExonLength = GetTotalExonLength(exons);

    // int.MinValue is used when there is no translation start exon
    int startExonPhase = translationStartExon?.Phase ?? int.MinValue;

    return new MutableTranscript(chromosome, start, end, transcriptAccession.Id, transcriptAccession.Version,
        ccdsId, refSeqId, bioType, isCanonical, codingRegion, proteinAccession.Id, proteinAccession.Version,
        peptideSequence, source, gene, exons, startExonPhase, totalExonLength, introns, cdnaMaps, siftData,
        polyphenData, translateableSequence, microRnas, cdsStartNotFound, cdsEndNotFound,
        selenocysteinePositions, rnaEdits, bamEditStatus);
}