Exemplo n.º 1
0
        /// <summary>
        /// Clones the seed gene and widens the clone so that it spans every compatible gene in the list.
        /// </summary>
        /// <remarks>
        /// A gene is compatible when it is not flagged invalid, shares the clone's strand and reference
        /// index, and partially overlaps the active window. Every gene that is folded in is flagged invalid.
        /// </remarks>
        /// <param name="seedGene">gene used as the starting point (it is cloned, not modified here)</param>
        /// <param name="genesWithSameGeneId">candidate genes to fold into the clone</param>
        /// <param name="overlapStart">start of a fixed overlap window, or -1 for none</param>
        /// <param name="overlapEnd">end of a fixed overlap window, or -1 for none</param>
        /// <returns>the widened clone</returns>
        private static MutableGene GetFlattenedGene(MutableGene seedGene, List<MutableGene> genesWithSameGeneId,
            int overlapStart, int overlapEnd)
        {
            var merged = MutableGene.Clone(seedGene);

            // without a fixed window, candidates are tested against the (growing) merged gene itself
            bool hasFixedWindow = overlapStart != -1 && overlapEnd != -1;

            foreach (var candidate in genesWithSameGeneId)
            {
                bool compatible = !candidate.Invalid &&
                                  merged.OnReverseStrand == candidate.OnReverseStrand &&
                                  merged.ReferenceIndex == candidate.ReferenceIndex;
                if (!compatible)
                {
                    continue;
                }

                var windowStart = hasFixedWindow ? overlapStart : merged.Start;
                var windowEnd   = hasFixedWindow ? overlapEnd : merged.End;

                if (Overlap.Partial(windowStart, windowEnd, candidate.Start, candidate.End))
                {
                    UpdateCoordinates(candidate, merged);
                    candidate.Invalid = true;
                }
            }

            return merged;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Collects the genes that can be grouped with the seed gene and reports the range they cover.
        /// </summary>
        /// <remarks>
        /// The range starts as the seed gene's coordinates and grows as each accepted gene is added, so
        /// later genes are tested against the widened range.
        /// </remarks>
        /// <param name="seedGene">gene that defines the strand, reference index and initial range</param>
        /// <param name="genes">candidate genes, examined in list order</param>
        /// <param name="start">smallest start among the seed and the accepted genes</param>
        /// <param name="end">largest end among the seed and the accepted genes</param>
        /// <returns>the accepted genes, in the order they were encountered</returns>
        private List<MutableGene> GetValidGenes(MutableGene seedGene, List<MutableGene> genes, out int start,
            out int end)
        {
            var accepted = new List<MutableGene>();

            start = seedGene.Start;
            end   = seedGene.End;

            foreach (var candidate in genes)
            {
                bool eligible = !candidate.Invalid &&
                                seedGene.OnReverseStrand == candidate.OnReverseStrand &&
                                seedGene.ReferenceIndex == candidate.ReferenceIndex &&
                                Overlap.Partial(start, end, candidate.Start, candidate.End);
                if (!eligible)
                {
                    continue;
                }

                accepted.Add(candidate);

                // widen the running range to include the gene we just accepted
                start = candidate.Start < start ? candidate.Start : start;
                end   = candidate.End > end ? candidate.End : end;
            }

            return accepted;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Flattens the Ensembl and RefSeq genes that group with the seed gene, merges linked pairs,
        /// and records every remaining gene as an orphan.
        /// </summary>
        /// <param name="seedGene">gene used to select the valid genes and the overlap range</param>
        /// <param name="genesWithSameSymbol">candidate genes sharing the seed gene's symbol</param>
        private void MergesGenesWithSameSymbol(MutableGene seedGene, List<MutableGene> genesWithSameSymbol)
        {
            int rangeStart, rangeEnd;
            var candidates = GetValidGenes(seedGene, genesWithSameSymbol, out rangeStart, out rangeEnd);

            var ensemblCandidates = GeneUtilities.GetGenesByDataSource(candidates, TranscriptDataSource.Ensembl);
            var refSeqCandidates  = GeneUtilities.GetGenesByDataSource(candidates, TranscriptDataSource.RefSeq);

            var flatEnsemblGenes = new GeneFlattener(ensemblCandidates, "Ensembl", false).Flatten(rangeStart, rangeEnd);
            var flatRefSeqGenes  = new GeneFlattener(refSeqCandidates, "RefSeq", false).Flatten(rangeStart, rangeEnd);

            foreach (var ensemblGene in flatEnsemblGenes)
            {
                // look up the linked Entrez ID and, if there is one, the matching flattened RefSeq gene
                string linkedEntrezId;
                bool hasLink = _linkedEnsemblIds.TryGetValue(ensemblGene.EnsemblId.ToString(), out linkedEntrezId);
                var partner  = hasLink ? GeneUtilities.GetRefSeqGeneById(flatRefSeqGenes, linkedEntrezId) : null;

                // no link or no matching RefSeq gene: the Ensembl gene stands alone
                if (partner == null)
                {
                    AddEnsemblOrphan(ensemblGene);
                    continue;
                }

                // start from the Ensembl gene and fold in the RefSeq coordinates and identifiers
                var combined = MutableGene.Clone(ensemblGene);
                combined.TranscriptDataSource = TranscriptDataSource.BothRefSeqAndEnsembl;
                UpdateCoordinates(partner, combined);

                bool needsHgncId = combined.HgncId == -1 && partner.HgncId != -1;
                if (needsHgncId)
                {
                    combined.HgncId = partner.HgncId;
                }

                combined.EntrezGeneId = partner.EntrezGeneId;
                _mergedGenes.Add(combined);

                partner.Invalid     = true;
                ensemblGene.Invalid = true;
                _numMergedGenes++;
            }

            // whatever RefSeq genes were not consumed above are added as orphans
            foreach (var leftover in flatRefSeqGenes)
            {
                if (!leftover.Invalid)
                {
                    AddRefSeqOrphan(leftover);
                }
            }
        }
Exemplo n.º 4
0
 /// <summary>
 /// Widens <paramref name="dest"/> so that its range also covers <paramref name="source"/>.
 /// Coordinates of <paramref name="dest"/> are only ever moved outwards, never inwards.
 /// </summary>
 /// <param name="source">gene whose coordinates are read</param>
 /// <param name="dest">gene whose start/end may be extended</param>
 private static void UpdateCoordinates(MutableGene source, MutableGene dest)
 {
     // pull the start leftwards if the source begins earlier
     if (dest.Start > source.Start)
     {
         dest.Start = source.Start;
     }

     // push the end rightwards if the source finishes later
     if (dest.End < source.End)
     {
         dest.End = source.End;
     }
 }
Exemplo n.º 5
0
 /// <summary>
 /// Returns true when both genes agree on chromosome index, start, end, strand, gene ID, symbol,
 /// HGNC ID and symbol source.
 /// </summary>
 private static bool GeneEquals(MutableGene x, MutableGene y)
 {
     // where the gene sits
     bool sameLocus = x.Chromosome.Index == y.Chromosome.Index &&
                      x.Start == y.Start &&
                      x.End == y.End &&
                      x.OnReverseStrand == y.OnReverseStrand;
     if (!sameLocus)
     {
         return false;
     }

     // what the gene is called
     return x.GeneId == y.GeneId &&
            x.Symbol == y.Symbol &&
            x.HgncId == y.HgncId &&
            x.SymbolSource == y.SymbolSource;
 }
Exemplo n.º 6
0
        /// <summary>
        /// Combines an Ensembl gene and a RefSeq gene into a single <see cref="UgaGene"/>.
        /// </summary>
        /// <remarks>
        /// Whichever argument has a gene ID starting with "ENSG" is treated as the Ensembl gene; the other
        /// is treated as the RefSeq gene. The merged interval spans both genes and is stored in the GRCh37
        /// or GRCh38 slot depending on <paramref name="isGrch37"/>; the other slot is left null.
        /// </remarks>
        /// <param name="geneA">first gene (either data source)</param>
        /// <param name="geneB">second gene (either data source)</param>
        /// <param name="isGrch37">true to store the interval as GRCh37, false to store it as GRCh38</param>
        /// <returns>the merged gene, using the Ensembl gene's chromosome, strand, symbol and HGNC ID</returns>
        /// <exception cref="InvalidDataException">
        /// thrown when the two genes are on different chromosomes or on different strands
        /// </exception>
        private static UgaGene GetMergedGene(MutableGene geneA, MutableGene geneB, bool isGrch37)
        {
            // gene IDs are machine identifiers: match the prefix ordinally, not with culture-sensitive rules
            bool geneAIsEnsembl = geneA.GeneId.StartsWith("ENSG", System.StringComparison.Ordinal);
            (MutableGene ensemblGene, MutableGene refSeqGene) = geneAIsEnsembl ? (geneA, geneB) : (geneB, geneA);

            if (ensemblGene.Chromosome.Index != refSeqGene.Chromosome.Index)
            {
                throw new InvalidDataException($"The two genes are on different chromosomes: {geneA.GeneId} & {geneB.GeneId}");
            }
            if (ensemblGene.OnReverseStrand != refSeqGene.OnReverseStrand)
            {
                throw new InvalidDataException($"Both genes do not have the same orientation: {geneA.GeneId} & {geneB.GeneId}");
            }

            IInterval interval = GetMergedInterval(ensemblGene, refSeqGene);

            // only the slot for the requested assembly is populated
            (IInterval grch37, IInterval grch38) = isGrch37 ? (interval, null as IInterval) : (null as IInterval, interval);

            return new UgaGene(ensemblGene.Chromosome, grch37, grch38, ensemblGene.OnReverseStrand, refSeqGene.GeneId,
                               ensemblGene.GeneId, ensemblGene.Symbol, ensemblGene.HgncId);
        }
Exemplo n.º 7
0
 /// <summary>
 /// Builds a pipe-delimited key from the gene ID, UCSC chromosome name, start, end and strand
 /// ('R' for reverse, 'F' for forward).
 /// </summary>
 private static string GetGeneKey(MutableGene gene)
 {
     char strand = gene.OnReverseStrand ? 'R' : 'F';
     return $"{gene.GeneId}|{gene.Chromosome.UcscName}|{gene.Start}|{gene.End}|{strand}";
 }
Exemplo n.º 8
0
        /// <summary>
        /// Writes one tab-delimited "Gene" record: gene ID, UCSC chromosome name, chromosome index,
        /// start, end, strand ('R' or 'F'), symbol, numeric symbol source and HGNC ID.
        /// </summary>
        /// <param name="writer">destination for the record</param>
        /// <param name="gene">gene to serialize</param>
        private static void WriteGene(TextWriter writer, MutableGene gene)
        {
            // forward strand unless the gene says otherwise
            char strand = 'F';
            if (gene.OnReverseStrand)
            {
                strand = 'R';
            }

            string line = $"Gene\t{gene.GeneId}\t{gene.Chromosome.UcscName}\t{gene.Chromosome.Index}\t{gene.Start}\t{gene.End}\t{strand}\t{gene.Symbol}\t{(int)gene.SymbolSource}\t{gene.HgncId}";
            writer.WriteLine(line);
        }
Exemplo n.º 9
0
 /// <summary>
 /// Appends the gene to the merged gene list as a RefSeq orphan, flags it invalid, and
 /// increments the RefSeq orphan counter.
 /// </summary>
 private void AddRefSeqOrphan(MutableGene gene)
 {
     _mergedGenes.Add(gene);
     _numOrphanRefSeqGenes += 1;

     // flag the gene so it is skipped by later passes that check Invalid
     gene.Invalid = true;
 }
Exemplo n.º 10
0
 /// <summary>
 /// Appends the gene to the merged gene list as an Ensembl orphan, flags it invalid, and
 /// increments the Ensembl orphan counter.
 /// </summary>
 private void AddEnsemblOrphan(MutableGene gene)
 {
     _mergedGenes.Add(gene);
     _numOrphanEnsemblGenes += 1;

     // flag the gene so it is skipped by later passes that check Invalid
     gene.Invalid = true;
 }
Exemplo n.º 11
0
 /// <summary>
 /// Returns the smallest interval that covers both genes: the lower of the two starts
 /// through the higher of the two ends.
 /// </summary>
 private static IInterval GetMergedInterval(MutableGene geneA, MutableGene geneB)
 {
     var mergedStart = Math.Min(geneA.Start, geneB.Start);
     var mergedEnd   = Math.Max(geneA.End, geneB.End);

     return new Interval(mergedStart, mergedEnd);
 }
Exemplo n.º 12
0
        /// <summary>
        /// Parses the relevant data from a dumped transcript object and builds a
        /// <see cref="MutableTranscript"/> (including its <see cref="MutableGene"/>) from it.
        /// </summary>
        /// <remarks>
        /// Each child node of <paramref name="objectValue"/> is dispatched on its key. A fixed set of keys is
        /// recognized but deliberately ignored; the remaining keys populate the locals declared at the top
        /// of the method, which are then passed positionally to the gene, coding region and transcript
        /// constructors. Any value whose key never appears keeps the default assigned in its declaration.
        /// </remarks>
        /// <param name="objectValue">transcript object whose <c>Values</c> are the key/value nodes to parse</param>
        /// <param name="chromosome">chromosome passed to the exon and translation parsers and to the resulting gene and transcript</param>
        /// <param name="source">data source passed through to the resulting transcript</param>
        /// <returns>the populated transcript</returns>
        /// <exception cref="InvalidDataException">
        /// thrown when a node key is not in <c>KnownKeys</c>, or is not handled by the switch statement
        /// </exception>
        public static MutableTranscript Parse(ObjectValueNode objectValue, IChromosome chromosome, Source source)
        {
            // IDs
            string transcriptId      = null;
            // the transcript version defaults to 1 (not 0) when no Version key is present
            byte   transcriptVersion = 1;
            string proteinId         = null;
            byte   proteinVersion    = 0;
            string ccdsId            = null;
            string refSeqId          = null;
            string geneId            = null;
            // -1 is the "not set" value for the HGNC ID and for the coordinates below
            int    hgncId            = -1;

            // gene
            int    geneStart           = -1;
            int    geneEnd             = -1;
            var    geneOnReverseStrand = false;
            string geneSymbol          = null;
            var    geneSymbolSource    = GeneSymbolSource.Unknown;

            // translation
            int         translationStart     = -1;
            int         translationEnd       = -1;
            MutableExon translationStartExon = null;
            MutableExon translationEndExon   = null;

            // predictions
            string siftData     = null;
            string polyphenData = null;

            var bioType = BioType.other;

            IInterval[] microRnas = null;
            MutableTranscriptRegion[] cdnaMaps = null;
            IInterval[] introns               = null;
            string      peptideSequence       = null;
            string      translateableSequence = null;
            var         isCanonical           = false;
            int         compDnaCodingStart    = -1;
            int         compDnaCodingEnd      = -1;
            int         start = -1;
            int         end   = -1;

            MutableExon[] exons            = null;
            var           cdsStartNotFound = false;
            var           cdsEndNotFound   = false;

            int[]      selenocysteinePositions = null;
            IRnaEdit[] rnaEdits      = null;
            string     bamEditStatus = null;

            foreach (var node in objectValue.Values)
            {
                // sanity check: make sure every key we encounter is one we know about
                if (!KnownKeys.Contains(node.Key))
                {
                    throw new InvalidDataException($"Encountered an unknown key in the dumper transcript object: {node.Key}");
                }

                // handle each key
                switch (node.Key)
                {
                // keys that are recognized but carry nothing this parser needs
                case ImportKeys.CodingRegionEnd:
                case ImportKeys.CodingRegionStart:
                case ImportKeys.CreatedDate:
                case ImportKeys.DbId:
                case ImportKeys.Description:
                case ImportKeys.DisplayXref:
                case ImportKeys.ExternalDb:
                case ImportKeys.ExternalDisplayName:
                case ImportKeys.ExternalName:
                case ImportKeys.ExternalStatus:
                case ImportKeys.GenePhenotype:
                case ImportKeys.GeneStableId:
                case ImportKeys.ModifiedDate:
                case ImportKeys.Protein:
                case ImportKeys.Slice:
                case ImportKeys.Source:
                case ImportKeys.Strand:
                case ImportKeys.SwissProt:
                case ImportKeys.Trembl:
                case ImportKeys.UniParc:
                case ImportKeys.VepLazyLoaded:
                    // not used
                    break;

                case ImportKeys.BamEditStatus:
                    bamEditStatus = node.GetString();
                    break;

                case ImportKeys.Attributes:
                    (microRnas, rnaEdits, cdsStartNotFound, cdsEndNotFound) = Attribute.ParseList(node);
                    break;

                case ImportKeys.Biotype:
                    bioType = TranscriptUtilities.GetBiotype(node);
                    break;

                case ImportKeys.Ccds:
                    ccdsId = node.GetString();
                    break;

                case ImportKeys.CdnaCodingEnd:
                    compDnaCodingEnd = node.GetInt32();
                    break;

                case ImportKeys.CdnaCodingStart:
                    compDnaCodingStart = node.GetInt32();
                    break;

                case ImportKeys.End:
                    end = node.GetInt32();
                    break;

                case ImportKeys.GeneHgncId:
                    hgncId = node.GetHgncId();
                    break;

                // both keys feed the same gene symbol; if both are present, the last one seen wins
                case ImportKeys.GeneSymbol:
                case ImportKeys.GeneHgnc:     // older key
                    geneSymbol = node.GetString();
                    break;

                case ImportKeys.GeneSymbolSource:
                    geneSymbolSource = GeneSymbolSourceHelper.GetGeneSymbolSource(node.GetString());
                    break;

                case ImportKeys.Gene:
                    (geneStart, geneEnd, geneId, geneOnReverseStrand) = ImportGene.Parse(node);
                    break;

                case ImportKeys.IsCanonical:
                    isCanonical = node.GetBool();
                    break;

                case ImportKeys.Refseq:
                    refSeqId = node.GetString();
                    break;

                case ImportKeys.StableId:
                    transcriptId = node.GetString();
                    break;

                case ImportKeys.Start:
                    start = node.GetInt32();
                    break;

                case ImportKeys.TransExonArray:
                    exons = ImportExon.ParseList(node, chromosome);
                    break;

                case ImportKeys.Translation:
                    (translationStart, translationEnd, proteinId, proteinVersion, translationStartExon, translationEndExon) = ImportTranslation.Parse(node, chromosome);
                    break;

                case ImportKeys.VariationEffectFeatureCache:
                    (cdnaMaps, introns, peptideSequence, translateableSequence, siftData, polyphenData, selenocysteinePositions) = ImportVariantEffectFeatureCache.Parse(node);
                    break;

                case ImportKeys.Version:
                    // NOTE(review): narrowing cast from int to byte; confirm versions never exceed 255
                    transcriptVersion = (byte)node.GetInt32();
                    break;

                // reached only when a key is in KnownKeys but has no case above
                default:
                    throw new InvalidDataException($"Unknown key found: {node.Key}");
                }
            }

            // NOTE(review): presumably reconciles a version embedded in the ID with the parsed version;
            // confirm against AccessionUtilities.GetMaxVersion
            var fixedTranscript = AccessionUtilities.GetMaxVersion(transcriptId, transcriptVersion);
            var fixedProtein    = AccessionUtilities.GetMaxVersion(proteinId, proteinVersion);

            var gene = new MutableGene(chromosome, geneStart, geneEnd, geneOnReverseStrand, geneSymbol,
                                       geneSymbolSource, geneId, hgncId);

            // NOTE(review): the meaning of the trailing 0 argument is not visible here; check the CodingRegion constructor
            var codingRegion = new CodingRegion(GetCodingRegionStart(geneOnReverseStrand, translationStartExon, translationEndExon, translationStart, translationEnd),
                                                GetCodingRegionEnd(geneOnReverseStrand, translationStartExon, translationEndExon, translationStart, translationEnd),
                                                compDnaCodingStart, compDnaCodingEnd, 0);

            int totalExonLength = GetTotalExonLength(exons);
            // int.MinValue marks "no start exon phase" when there is no translation start exon
            int startExonPhase  = translationStartExon?.Phase ?? int.MinValue;

            return(new MutableTranscript(chromosome, start, end, fixedTranscript.Id, fixedTranscript.Version, ccdsId,
                                         refSeqId, bioType, isCanonical, codingRegion, fixedProtein.Id, fixedProtein.Version,
                                         peptideSequence, source, gene, exons, startExonPhase, totalExonLength, introns, cdnaMaps,
                                         siftData, polyphenData, translateableSequence, microRnas, cdsStartNotFound, cdsEndNotFound,
                                         selenocysteinePositions, rnaEdits, bamEditStatus));
        }