/// <summary>
 /// Creates CDS for the transcripts in this gene model, based on the translation from CDS in another model.
 /// </summary>
 /// <remarks>
 /// Each transcript is compared against the protein-coding transcripts that the reference model's
 /// interval tree returns for it; the first reference for which the transcript-level
 /// <c>CreateCDSFromAnnotatedStartCodons</c> call returns true is used and the rest are skipped.
 /// Genes whose chromosome has no tree in the reference model are left untouched.
 /// </remarks>
 /// <param name="referenceGeneModel">Gene model whose protein-coding transcripts supply the annotated CDS.</param>
 public void CreateCDSFromAnnotatedStartCodons(GeneModel referenceGeneModel)
 {
     referenceGeneModel.GenomeForest.Build(); // so we don't need to lock the IntervalTree if we end up parallelizing this method
     foreach (Gene g in Genes)
     {
         // No tree for this chromosome in the reference model means there is nothing to compare against.
         if (!referenceGeneModel.GenomeForest.Forest.TryGetValue(g.Chromosome.FriendlyName, out IntervalTree tree))
         {
             continue;
         }

         foreach (Transcript t in g.Transcripts)
         {
             // Filter while enumerating the query result; only the protein-coding candidates are materialized.
             List<Transcript> referenceTranscriptsWithCDS = tree.Query(t)
                 .OfType<Transcript>()
                 .Where(tt => tt.IsProteinCoding())
                 .ToList();

             foreach (Transcript tWithCds in referenceTranscriptsWithCDS)
             {
                 // NOTE(review): presumably guards the reference transcript against concurrent use
                 // if this loop is ever parallelized - confirm before removing.
                 lock (tWithCds)
                 {
                     if (t.CreateCDSFromAnnotatedStartCodons(tWithCds))
                     {
                         break; // for now, only use the first annotation found if any
                     }
                 }
             }
         }
     }
 }
Exemple #2
0
        /// <summary>
        /// Builds the GTF attribute text for this feature: the parent's attributes followed by
        /// exon_id and exon_version (each only when present in the free-text metadata) and exon_number.
        /// </summary>
        /// <returns>The parent's attribute text, a space, then space-separated <c>key "value";</c> pairs.</returns>
        public override string GetGtfAttributes()
        {
            var parsed = GeneModel.SplitAttributes(FeatureMetadata.FreeText);
            var fields = new List<Tuple<string, string>>();

            // Optional attributes are copied through, in this order, only when the free text carries them.
            foreach (string label in new[] { "exon_id", "exon_version" })
            {
                if (parsed.TryGetValue(label, out string value))
                {
                    fields.Add(Tuple.Create(label, value));
                }
            }

            // exon_number is always emitted: the number of the parent's exons that start at or before this one.
            string ordinal = (Parent as Transcript).Exons.Count(exon => exon.OneBasedStart <= OneBasedStart).ToString();
            fields.Add(Tuple.Create("exon_number", ordinal));

            IEnumerable<string> rendered = fields.Select(pair => pair.Item1 + " \"" + pair.Item2 + "\";");
            return Parent.GetGtfAttributes() + " " + String.Join(" ", rendered);
        }
Exemple #3
0
        /// <summary>
        /// Builds the GTF attribute text for this feature: the parent's attributes followed by
        /// transcript_id, transcript_version and transcript_biotype, each emitted only when it is
        /// present in this feature's free-text metadata.
        /// </summary>
        /// <returns>The parent's attribute text, a space, then space-separated <c>key "value";</c> pairs.</returns>
        public override string GetGtfAttributes()
        {
            var attributes = GeneModel.SplitAttributes(FeatureMetadata.FreeText);
            var attributeSubsections = new List<Tuple<string, string>>();

            // Every label is gated on its own lookup. transcript_biotype used to be gated on the
            // transcript_version lookup, which dropped the biotype when no version was present and
            // emitted an empty biotype when only the version was present.
            string[] labels = { "transcript_id", "transcript_version", "transcript_biotype" };
            foreach (string label in labels)
            {
                if (attributes.TryGetValue(label, out string value))
                {
                    attributeSubsections.Add(new Tuple<string, string>(label, value));
                }
            }

            // Cufflinks-related attributes (nearest_ref, class_code) are not emitted;
            // not using Cufflinks much because stringtie is better.

            return Parent.GetGtfAttributes() + " " + String.Join(" ", attributeSubsections.Select(x => x.Item1 + " \"" + x.Item2 + "\";"));
        }
Exemple #4
0
        /// <summary>
        /// Builds the GTF attribute text for this feature from gene_id, gene_name, gene_version and
        /// gene_biotype, each emitted only when it is present in the free-text metadata.
        /// </summary>
        /// <returns>Space-separated <c>key "value";</c> pairs; empty when none of the labels is present.</returns>
        public override string GetGtfAttributes()
        {
            var parsed = GeneModel.SplitAttributes(FeatureMetadata.FreeText);
            var fields = new List<Tuple<string, string>>();

            // The array order is the output order.
            string[] labels = { "gene_id", "gene_name", "gene_version", "gene_biotype" };
            foreach (string label in labels)
            {
                if (!parsed.TryGetValue(label, out string value))
                {
                    continue;
                }

                fields.Add(Tuple.Create(label, value));
            }

            IEnumerable<string> rendered = fields.Select(pair => pair.Item1 + " \"" + pair.Item2 + "\";");
            return String.Join(" ", rendered);
        }
        } = new Dictionary <string, int>();                                                          // key: mappingStrand + strandFromGene

        /// <summary>
        /// Given a BAM file, try to guess the RNA-Seq experiment:
        ///	1) single-end or paired-end
        ///	2) strand-specific or not
        ///	3) if it is strand-specific, the strandedness of the protocol
        /// The outcome is stored in Protocol, Strandedness, FractionForwardStranded,
        /// FractionReverseStranded and FractionUndetermined; per-key read counts are accumulated in
        /// PairedStrandedness and SingleStrandedness.
        /// </summary>
        /// <param name="bamPath">Path of the BAM file to read.</param>
        /// <param name="geneModelPath">Path handed to the <c>GeneModel</c> constructor; reads are compared against that model.</param>
        /// <param name="genome">Genome handed to the <c>GeneModel</c> constructor.</param>
        /// <param name="minFractionStrandSpecific">Minimum fraction of counted reads that must agree with one orientation for the protocol to be called forward or reverse stranded.</param>
        /// <exception cref="ArgumentException">More than half of the counted reads match neither the forward nor the reverse key set.</exception>
        private void CheckProperties(string bamPath, string geneModelPath, Genome genome, double minFractionStrandSpecific)
        {
            GeneModel gm = new GeneModel(genome, geneModelPath);

            using (var reader = File.OpenRead(bamPath))
            {
                Console.WriteLine("Reading BAM file.");

                // read bam, and filter out reads that are QC failures, unmapped, duplicates, or secondary;
                // filtering while enumerating avoids holding the unfiltered reads in a second list
                BAMParser bam = new BAMParser();
                var reads = bam.Parse(reader)
                    .Where(read =>
                        !read.Flag.HasFlag(SAMFlags.QualityCheckFailure) && !read.Flag.HasFlag(SAMFlags.UnmappedQuery) &&
                        !read.Flag.HasFlag(SAMFlags.Duplicate) && !read.Flag.HasFlag(SAMFlags.NonPrimeAlignment))
                    .ToList();

                Console.WriteLine("Evaluating reads.");

                // NOTE(review): the interval trees are queried from several threads here - confirm that
                // IntervalTree.Query is safe for concurrent readers (e.g. that the forest is already built).
                Parallel.ForEach(reads, read =>
                {
                    // set the interval contained by this read, and get the gene regions nearby
                    bool isReversed = read.Flag.HasFlag(SAMFlags.QueryOnReverseStrand);
                    string mapStrand = isReversed ? "-" : "+";
                    Interval readInterval = new Interval(null, read.RName, "source", mapStrand, read.Pos, read.RefEndPos, null);
                    if (!gm.GenomeForest.Forest.TryGetValue(readInterval.ChromosomeID, out IntervalTree nearbyGeneTree))
                    {
                        return;
                    }

                    List<Interval> nearbyGeneRegions = nearbyGeneTree.Query(readInterval);
                    if (nearbyGeneRegions.Count == 0)
                    {
                        return;
                    }

                    // count up paired-end or single-end read properties
                    bool isPaired = read.Flag.HasFlag(SAMFlags.PairedRead);
                    bool isRead1 = read.Flag.HasFlag(SAMFlags.FirstReadInPair);
                    bool isRead2 = read.Flag.HasFlag(SAMFlags.SecondReadInPair);
                    string readId = isRead1 ? "1" : isRead2 ? "2" : null;

                    // paired key: readId + mappingStrand + strandFromGene; single key: mappingStrand + strandFromGene
                    Dictionary<string, int> dict = isPaired ? PairedStrandedness : SingleStrandedness;
                    string keyPrefix = isPaired ? readId + mapStrand : mapStrand;
                    HashSet<string> strandFromGene = new HashSet<string>(nearbyGeneRegions.Select(x => x.Strand));
                    foreach (string strand in strandFromGene)
                    {
                        string key = keyPrefix + strand;
                        lock (dict)
                        {
                            // TryGetValue leaves count at 0 for an unseen key. The incremented value must be
                            // written back: incrementing only the local left every key stuck at 1.
                            dict.TryGetValue(key, out int count);
                            dict[key] = count + 1;
                        }
                    }
                });

                // From RSeQC:
                //      Not strand specific:
                // This is PairEnd Data
                // Fraction of reads failed to determine: 0.0172
                // Fraction of reads explained by "1++,1--,2+-,2-+": 0.4903
                // Fraction of reads explained by "1+-,1-+,2++,2--": 0.4925
                //      Strand specific:
                // This is PairEnd Data
                // Fraction of reads failed to determine: 0.0072
                // Fraction of reads explained by "1++,1--,2+-,2-+": 0.9441
                // Fraction of reads explained by "1+-,1-+,2++,2--": 0.0487
                int singleForward = SumCounts(SingleStrandedness, "++", "--");
                int singleReverse = SumCounts(SingleStrandedness, "+-", "-+");
                int pairedForward = SumCounts(PairedStrandedness, "1++", "1--", "2+-", "2-+");
                int pairedReverse = SumCounts(PairedStrandedness, "1+-", "1-+", "2++", "2--");
                int singleTotal = SingleStrandedness.Values.Sum();
                int pairedTotal = PairedStrandedness.Values.Sum();

                if (PairedStrandedness.Count > 0 && SingleStrandedness.Count == 0)
                {
                    Protocol = RnaSeqProtocol.PairedEnd;
                    ApplyFractions(pairedForward, pairedReverse, pairedTotal);
                }
                else if (SingleStrandedness.Count > 0 && PairedStrandedness.Count == 0)
                {
                    Protocol = RnaSeqProtocol.SingleEnd;
                    ApplyFractions(singleForward, singleReverse, singleTotal);
                }
                else
                {
                    // Both kinds of reads (or none at all) were counted. The denominator has to cover both
                    // dictionaries; dividing the combined numerator by the paired total alone skewed the fractions.
                    Protocol = RnaSeqProtocol.Mixture;
                    ApplyFractions(singleForward + pairedForward, singleReverse + pairedReverse, singleTotal + pairedTotal);
                }
            }

            // Sums the counts stored under the given keys; keys that were never observed contribute 0.
            int SumCounts(Dictionary<string, int> counts, params string[] keys)
            {
                int sum = 0;
                foreach (string key in keys)
                {
                    counts.TryGetValue(key, out int hits);
                    sum += hits;
                }
                return sum;
            }

            // Stores the forward/reverse/undetermined fractions and derives Strandedness from them.
            void ApplyFractions(int forward, int reverse, int total)
            {
                if (total == 0)
                {
                    // Nothing was counted, so there is no evidence either way. Report everything as
                    // undetermined and unstranded instead of dividing by zero (which produced NaN
                    // fractions); as before, this case does not throw.
                    FractionForwardStranded = 0.0;
                    FractionReverseStranded = 0.0;
                    FractionUndetermined = 1.0;
                    Strandedness = Strandedness.None;
                    return;
                }

                FractionForwardStranded = (double)forward / (double)total;
                FractionReverseStranded = (double)reverse / (double)total;
                FractionUndetermined = 1 - FractionForwardStranded - FractionReverseStranded;
                if (FractionUndetermined > 0.5)
                {
                    throw new ArgumentException("A large number of reads failed to determine the standedness of the protocol within " + bamPath);
                }

                Strandedness = FractionForwardStranded >= minFractionStrandSpecific ? Strandedness.Forward :
                               FractionReverseStranded >= minFractionStrandSpecific ? Strandedness.Reverse :
                               Strandedness.None;
            }
        }