/// <summary>
/// Groups the metadata of the supplied transcripts by the integer form of their gene ID.
/// </summary>
/// <param name="transcripts">the transcripts to group; each one is dereferenced through its Gene property, so Gene must not be null</param>
/// <returns>a dictionary, ordered by gene ID, mapping each gene ID to the set of transcript metadata observed for it</returns>
private SortedDictionary<int, HashSet<TranscriptMetadata>> GetTranscriptsByEntrezGeneId(IEnumerable<MutableTranscript> transcripts)
        {
            var genes = new SortedDictionary<int, HashSet<TranscriptMetadata>>();

            foreach (var transcript in transcripts)
            {
                string idWithVersion = $"{transcript.Id}.{transcript.Version}";

                // transcripts without a coding region contribute a CDS length of 0
                int  cdsLength        = transcript.CodingRegion?.Length ?? 0;
                // both endpoints are counted, hence the + 1
                int  transcriptLength = transcript.End - transcript.Start + 1;
                bool isLrg            = _lrgTranscriptIds.Contains(transcript.Id);
                int  accession        = AccessionUtilities.GetAccessionNumber(transcript.Id);

                var metadata = new TranscriptMetadata(idWithVersion, accession, transcriptLength, cdsLength, isLrg);
                int geneId   = ConvertGeneIdToInt(transcript.Gene.GeneId);

                // create the per-gene set the first time this gene ID is seen
                if (!genes.TryGetValue(geneId, out var observedMetadata))
                {
                    observedMetadata = new HashSet<TranscriptMetadata>();
                    genes[geneId]    = observedMetadata;
                }

                observedMetadata.Add(metadata);
            }

            return genes;
        }
        public void GetAccessionNumber_ReturnMinusOne()
        {
            // a null ID is expected to produce -1
            int accession = AccessionUtilities.GetAccessionNumber(null);

            Assert.Equal(-1, accession);
        }
        public void GetAccessionNumber_ReturnNumber_Ensembl()
        {
            // the numeric portion of the ID is expected back, without its leading zeros
            int accession = AccessionUtilities.GetAccessionNumber("ENST00000515242");

            Assert.Equal(515242, accession);
        }
        public void GetAccessionNumber_ReturnNumber_RefSeq()
        {
            // the digits after the underscore are expected back, without their leading zeros
            int accession = AccessionUtilities.GetAccessionNumber("NM_004522");

            Assert.Equal(4522, accession);
        }
 public void GetAccessionNumber_ThrowException_IfUnderlineMissingRefSeq()
 {
     // an ID of this form without the underscore separator must be rejected
     Assert.Throws<InvalidDataException>(() =>
     {
         // the return value is irrelevant; only the exception matters
         _ = AccessionUtilities.GetAccessionNumber("NM004522");
     });
 }
        public void GetMaxVersion_Dupl()
        {
            // the ID with its "_dupl" suffix is expected back untouched, paired with the supplied version
            const string wantedId      = "NM_004522.2_dupl6";
            const byte   wantedVersion = 1;

            var actual = AccessionUtilities.GetMaxVersion(wantedId, 1);

            Assert.Equal(wantedId, actual.Id);
            Assert.Equal(wantedVersion, actual.Version);
        }
        public void GetMaxVersion_SuppliedVersionMax()
        {
            // the supplied version (3) exceeds the one embedded in the ID (2): the bare ID and version 3 are expected
            const string wantedId      = "NM_004522";
            const byte   wantedVersion = 3;

            var actual = AccessionUtilities.GetMaxVersion("NM_004522.2", 3);

            Assert.Equal(wantedId, actual.Id);
            Assert.Equal(wantedVersion, actual.Version);
        }
Example #8
0
        /// <summary>
        /// Builds a <see cref="MutableTranscript"/> from a dumper transcript object by visiting each of its
        /// key/value nodes, collecting the values of interest, and assembling the gene, coding region and
        /// transcript from them.
        /// </summary>
        /// <param name="objectValue">the transcript object whose child nodes (<c>Values</c>) are visited</param>
        /// <param name="chromosome">the chromosome handed to the exon and translation parsers and stored on the resulting gene and transcript</param>
        /// <param name="source">the transcript source, passed through unchanged to the resulting transcript</param>
        /// <returns>a new <see cref="MutableTranscript"/> populated from the parsed values</returns>
        /// <exception cref="InvalidDataException">
        /// thrown when a node key is not contained in KnownKeys, or is not handled by any case of the switch
        /// </exception>
        public static MutableTranscript Parse(ObjectValueNode objectValue, IChromosome chromosome, Source source)
        {
            // Everything below is a default that survives only if the corresponding key is absent from the object.

            // IDs
            string transcriptId      = null;
            byte   transcriptVersion = 1;
            string proteinId         = null;
            byte   proteinVersion    = 0;
            string ccdsId            = null;
            string refSeqId          = null;
            string geneId            = null;
            int    hgncId            = -1;

            // gene
            int    geneStart           = -1;
            int    geneEnd             = -1;
            var    geneOnReverseStrand = false;
            string geneSymbol          = null;
            var    geneSymbolSource    = GeneSymbolSource.Unknown;

            // translation
            int         translationStart     = -1;
            int         translationEnd       = -1;
            MutableExon translationStartExon = null;
            MutableExon translationEndExon   = null;

            // predictions
            string siftData     = null;
            string polyphenData = null;

            var bioType = BioType.other;

            IInterval[] microRnas = null;
            MutableTranscriptRegion[] cdnaMaps = null;
            IInterval[] introns               = null;
            string      peptideSequence       = null;
            string      translateableSequence = null;
            var         isCanonical           = false;
            int         compDnaCodingStart    = -1;
            int         compDnaCodingEnd      = -1;
            int         start = -1;
            int         end   = -1;

            MutableExon[] exons            = null;
            var           cdsStartNotFound = false;
            var           cdsEndNotFound   = false;

            int[]      selenocysteinePositions = null;
            IRnaEdit[] rnaEdits      = null;
            string     bamEditStatus = null;

            foreach (var node in objectValue.Values)
            {
                // sanity check: make sure every key we encounter is one we know about
                if (!KnownKeys.Contains(node.Key))
                {
                    throw new InvalidDataException($"Encountered an unknown key in the dumper transcript object: {node.Key}");
                }

                // handle each key
                switch (node.Key)
                {
                // keys that are recognized but deliberately ignored
                case ImportKeys.CodingRegionEnd:
                case ImportKeys.CodingRegionStart:
                case ImportKeys.CreatedDate:
                case ImportKeys.DbId:
                case ImportKeys.Description:
                case ImportKeys.DisplayXref:
                case ImportKeys.ExternalDb:
                case ImportKeys.ExternalDisplayName:
                case ImportKeys.ExternalName:
                case ImportKeys.ExternalStatus:
                case ImportKeys.GenePhenotype:
                case ImportKeys.GeneStableId:
                case ImportKeys.ModifiedDate:
                case ImportKeys.Protein:
                case ImportKeys.Slice:
                case ImportKeys.Source:
                case ImportKeys.Strand:
                case ImportKeys.SwissProt:
                case ImportKeys.Trembl:
                case ImportKeys.UniParc:
                case ImportKeys.VepLazyLoaded:
                    // not used
                    break;

                case ImportKeys.BamEditStatus:
                    bamEditStatus = node.GetString();
                    break;

                case ImportKeys.Attributes:
                    (microRnas, rnaEdits, cdsStartNotFound, cdsEndNotFound) = Attribute.ParseList(node);
                    break;

                case ImportKeys.Biotype:
                    bioType = TranscriptUtilities.GetBiotype(node);
                    break;

                case ImportKeys.Ccds:
                    ccdsId = node.GetString();
                    break;

                case ImportKeys.CdnaCodingEnd:
                    compDnaCodingEnd = node.GetInt32();
                    break;

                case ImportKeys.CdnaCodingStart:
                    compDnaCodingStart = node.GetInt32();
                    break;

                case ImportKeys.End:
                    end = node.GetInt32();
                    break;

                case ImportKeys.GeneHgncId:
                    hgncId = node.GetHgncId();
                    break;

                // both keys feed the same variable, so whichever appears last in the object wins
                case ImportKeys.GeneSymbol:
                case ImportKeys.GeneHgnc:     // older key
                    geneSymbol = node.GetString();
                    break;

                case ImportKeys.GeneSymbolSource:
                    geneSymbolSource = GeneSymbolSourceHelper.GetGeneSymbolSource(node.GetString());
                    break;

                case ImportKeys.Gene:
                    (geneStart, geneEnd, geneId, geneOnReverseStrand) = ImportGene.Parse(node);
                    break;

                case ImportKeys.IsCanonical:
                    isCanonical = node.GetBool();
                    break;

                case ImportKeys.Refseq:
                    refSeqId = node.GetString();
                    break;

                case ImportKeys.StableId:
                    transcriptId = node.GetString();
                    break;

                case ImportKeys.Start:
                    start = node.GetInt32();
                    break;

                case ImportKeys.TransExonArray:
                    exons = ImportExon.ParseList(node, chromosome);
                    break;

                case ImportKeys.Translation:
                    (translationStart, translationEnd, proteinId, proteinVersion, translationStartExon, translationEndExon) = ImportTranslation.Parse(node, chromosome);
                    break;

                case ImportKeys.VariationEffectFeatureCache:
                    (cdnaMaps, introns, peptideSequence, translateableSequence, siftData, polyphenData, selenocysteinePositions) = ImportVariantEffectFeatureCache.Parse(node);
                    break;

                case ImportKeys.Version:
                    // NOTE(review): narrowing cast from int to byte; unless this is compiled in a checked
                    // context, a version outside 0-255 is truncated silently - confirm that cannot occur
                    transcriptVersion = (byte)node.GetInt32();
                    break;

                // reached only when a key passes the KnownKeys check above but has no case in this switch
                default:
                    throw new InvalidDataException($"Unknown key found: {node.Key}");
                }
            }

            // GetMaxVersion yields the Id/Version pair that is stored on the transcript instead of the raw values
            // (presumably it reconciles a version embedded in the ID with the supplied one - confirm in AccessionUtilities)
            var fixedTranscript = AccessionUtilities.GetMaxVersion(transcriptId, transcriptVersion);
            var fixedProtein    = AccessionUtilities.GetMaxVersion(proteinId, proteinVersion);

            var gene = new MutableGene(chromosome, geneStart, geneEnd, geneOnReverseStrand, geneSymbol,
                                       geneSymbolSource, geneId, hgncId);

            // the coding region endpoints are derived from the translation exons and offsets, taking strand into account
            // NOTE(review): the meaning of the trailing 0 argument is not visible here - see the CodingRegion constructor
            var codingRegion = new CodingRegion(GetCodingRegionStart(geneOnReverseStrand, translationStartExon, translationEndExon, translationStart, translationEnd),
                                                GetCodingRegionEnd(geneOnReverseStrand, translationStartExon, translationEndExon, translationStart, translationEnd),
                                                compDnaCodingStart, compDnaCodingEnd, 0);

            int totalExonLength = GetTotalExonLength(exons);
            // int.MinValue marks the phase as unavailable when there is no translation start exon
            int startExonPhase  = translationStartExon?.Phase ?? int.MinValue;

            return(new MutableTranscript(chromosome, start, end, fixedTranscript.Id, fixedTranscript.Version, ccdsId,
                                         refSeqId, bioType, isCanonical, codingRegion, fixedProtein.Id, fixedProtein.Version,
                                         peptideSequence, source, gene, exons, startExonPhase, totalExonLength, introns, cdnaMaps,
                                         siftData, polyphenData, translateableSequence, microRnas, cdsStartNotFound, cdsEndNotFound,
                                         selenocysteinePositions, rnaEdits, bamEditStatus));
        }