/// <summary>
/// Create CDS for transcripts in this gene model, based on the translation from CDS in another model
/// </summary>
/// <param name="referenceGeneModel">
/// Gene model whose protein-coding transcripts are used as the source of annotated start codons.
/// Its genome forest is built by this call before any query is made.
/// </param>
public void CreateCDSFromAnnotatedStartCodons(GeneModel referenceGeneModel)
{
    referenceGeneModel.GenomeForest.Build(); // so we don't need to lock the IntervalTree if we end up parallelizing this method

    foreach (Gene g in Genes)
    {
        // Skip genes on chromosomes the reference forest has no tree for.
        if (!referenceGeneModel.GenomeForest.Forest.TryGetValue(g.Chromosome.FriendlyName, out IntervalTree tree))
        {
            continue;
        }

        foreach (Transcript t in g.Transcripts)
        {
            // Reference transcripts returned by the tree query for this transcript, restricted to protein-coding ones.
            List<Transcript> referenceTranscriptsWithCDS = tree.Query(t)
                .OfType<Transcript>()
                .Where(tt => tt.IsProteinCoding())
                .ToList();

            foreach (Transcript tWithCds in referenceTranscriptsWithCDS)
            {
                lock (tWithCds)
                {
                    if (t.CreateCDSFromAnnotatedStartCodons(tWithCds))
                    {
                        break; // for now, only use the first annotation found if any
                    }
                }
            }
        }
    }
}
/// <summary>
/// Builds the GTF attribute text for this exon: the parent's attribute text, followed by
/// exon_id and exon_version (each only when present in the free-text metadata) and a
/// computed exon_number.
/// </summary>
/// <returns>The parent's attributes, a space, then the exon attributes as space-separated key "value"; pairs.</returns>
public override string GetGtfAttributes()
{
    var parsed = GeneModel.SplitAttributes(FeatureMetadata.FreeText);
    var exonFields = new List<Tuple<string, string>>();

    // Carry over the exon attributes found on the original record, in a fixed order.
    foreach (string label in new[] { "exon_id", "exon_version" })
    {
        if (parsed.TryGetValue(label, out string value))
        {
            exonFields.Add(Tuple.Create(label, value));
        }
    }

    // exon_number is the number of exons in the parent transcript that start at or before this exon.
    int ordinal = (Parent as Transcript).Exons.Count(x => x.OneBasedStart <= OneBasedStart);
    exonFields.Add(Tuple.Create("exon_number", ordinal.ToString()));

    IEnumerable<string> rendered = exonFields.Select(f => f.Item1 + " \"" + f.Item2 + "\";");
    return Parent.GetGtfAttributes() + " " + String.Join(" ", rendered);
}
/// <summary>
/// Builds the GTF attribute text for this transcript: the parent's attribute text, followed by
/// transcript_id, transcript_version and transcript_biotype, in that order. Each attribute is
/// emitted only when it is present in this feature's free-text metadata.
/// </summary>
/// <returns>The parent's attributes, a space, then the transcript attributes as space-separated key "value"; pairs.</returns>
public override string GetGtfAttributes()
{
    var attributes = GeneModel.SplitAttributes(FeatureMetadata.FreeText);
    var attributeSubsections = new List<Tuple<string, string>>();

    // Each attribute is gated on its own presence. Previously transcript_biotype was gated on
    // transcript_version, which dropped the biotype when no version was present and emitted an
    // empty biotype when only the version was present.
    string[] transcriptLabels = { "transcript_id", "transcript_version", "transcript_biotype" };
    foreach (string label in transcriptLabels)
    {
        if (attributes.TryGetValue(label, out string value))
        {
            attributeSubsections.Add(new Tuple<string, string>(label, value));
        }
    }

    return Parent.GetGtfAttributes() + " " + String.Join(" ", attributeSubsections.Select(x => x.Item1 + " \"" + x.Item2 + "\";"));
}
/// <summary>
/// Builds the GTF attribute text for this gene from its free-text metadata.
/// Emits gene_id, gene_name, gene_version and gene_biotype, in that order, skipping any that are absent.
/// </summary>
/// <returns>Space-separated key "value"; pairs; an empty string when none of the four keys is present.</returns>
public override string GetGtfAttributes()
{
    var parsed = GeneModel.SplitAttributes(FeatureMetadata.FreeText);
    string[] geneLabels = { "gene_id", "gene_name", "gene_version", "gene_biotype" };

    var pieces = new List<string>();
    foreach (string label in geneLabels)
    {
        if (parsed.TryGetValue(label, out string value))
        {
            pieces.Add(label + " \"" + value + "\";");
        }
    }

    return String.Join(" ", pieces);
}
} = new Dictionary <string, int>(); // key: mappingStrand + strandFromGene

/// <summary>
/// Given a BAM file, try to guess the RNA-Seq experiment:
/// 1) single-end or pair-end
/// 2) strand_specific or not
/// 3) if it is strand-specific, what's the strand_ness of the protocol
/// The outcome is written to Protocol, Strandedness, FractionForwardStranded,
/// FractionReverseStranded and FractionUndetermined; the raw tallies are accumulated in
/// SingleStrandedness and PairedStrandedness.
/// </summary>
/// <param name="bamPath">Path of the BAM file to read.</param>
/// <param name="geneModelPath">Path handed to the GeneModel constructor.</param>
/// <param name="genome">Genome handed to the GeneModel constructor.</param>
/// <param name="minFractionStrandSpecific">
/// Minimum forward (or reverse) fraction required to call the protocol forward- (or reverse-) stranded.
/// </param>
/// <exception cref="ArgumentException">
/// Thrown when more than half of the tallied observations match neither the forward nor the reverse pattern.
/// </exception>
private void CheckProperties(string bamPath, string geneModelPath, Genome genome, double minFractionStrandSpecific)
{
    GeneModel gm = new GeneModel(genome, geneModelPath);

    using (var reader = File.OpenRead(bamPath))
    {
        Console.WriteLine("Reading BAM file.");

        // read bam, and filter out reads that are QC failures, unmapped, duplicates, or secondary
        BAMParser bam = new BAMParser();
        var reads = bam.Parse(reader)
            .Where(read =>
                !read.Flag.HasFlag(SAMFlags.QualityCheckFailure) &&
                !read.Flag.HasFlag(SAMFlags.UnmappedQuery) &&
                !read.Flag.HasFlag(SAMFlags.Duplicate) &&
                !read.Flag.HasFlag(SAMFlags.NonPrimeAlignment))
            .ToList();

        Console.WriteLine("Evaluating reads.");

        // NOTE(review): the interval trees are queried concurrently below without locking —
        // confirm the GeneModel's forest is fully built by this point.
        Parallel.ForEach(reads, read =>
        {
            // set the interval contained by this read, and get the gene regions nearby
            bool isReversed = read.Flag.HasFlag(SAMFlags.QueryOnReverseStrand);
            string mapStrand = isReversed ? "-" : "+";
            Interval readInterval = new Interval(null, read.RName, "source", mapStrand, read.Pos, read.RefEndPos, null);

            if (!gm.GenomeForest.Forest.TryGetValue(readInterval.ChromosomeID, out IntervalTree nearbyGeneTree))
            {
                return;
            }

            List<Interval> nearbyGeneRegions = nearbyGeneTree.Query(readInterval);
            if (nearbyGeneRegions.Count == 0)
            {
                return;
            }

            // count up paired-end or single-end read properties
            bool isPaired = read.Flag.HasFlag(SAMFlags.PairedRead);
            bool isRead1 = read.Flag.HasFlag(SAMFlags.FirstReadInPair);
            bool isRead2 = read.Flag.HasFlag(SAMFlags.SecondReadInPair);
            string readId = isRead1 ? "1" : isRead2 ? "2" : null;

            Dictionary<string, int> dict = isPaired ? PairedStrandedness : SingleStrandedness;
            HashSet<string> strandFromGene = new HashSet<string>(nearbyGeneRegions.Select(x => x.Strand));
            foreach (string strand in strandFromGene)
            {
                string key = isPaired ? readId + mapStrand + strand : mapStrand + strand;
                lock (dict)
                {
                    // TryGetValue leaves count at 0 when the key is new, so this covers both cases.
                    // The previous code incremented only the local copy and never stored it back,
                    // which left every tally stuck at 1.
                    dict.TryGetValue(key, out int count);
                    dict[key] = count + 1;
                }
            }
        });

        // From RSeQC:
        // Not strand specific:
        //   This is PairEnd Data
        //   Fraction of reads failed to determine: 0.0172
        //   Fraction of reads explained by "1++,1--,2+-,2-+": 0.4903
        //   Fraction of reads explained by "1+-,1-+,2++,2--": 0.4925
        // Strand specific:
        //   This is PairEnd Data
        //   Fraction of reads failed to determine: 0.0072
        //   Fraction of reads explained by "1++,1--,2+-,2-+": 0.9441
        //   Fraction of reads explained by "1+-,1-+,2++,2--": 0.0487

        // Sums the tallies stored under the given keys; missing keys contribute 0.
        int Tally(Dictionary<string, int> counts, params string[] keys)
        {
            return keys.Sum(k => counts.TryGetValue(k, out int c) ? c : 0);
        }

        bool anyPaired = PairedStrandedness.Count > 0;
        bool anySingle = SingleStrandedness.Count > 0;
        Protocol = anyPaired && !anySingle ? RnaSeqProtocol.PairedEnd
            : anySingle && !anyPaired ? RnaSeqProtocol.SingleEnd
            : RnaSeqProtocol.Mixture;

        // In the pure paired-end or single-end case the other dictionary is empty and contributes
        // nothing, so combining both dictionaries gives the right numerator and denominator for all
        // three protocols. (The Mixture case previously divided by the paired total only.)
        int forward = Tally(SingleStrandedness, "++", "--") + Tally(PairedStrandedness, "1++", "1--", "2+-", "2-+");
        int reverse = Tally(SingleStrandedness, "+-", "-+") + Tally(PairedStrandedness, "1+-", "1-+", "2++", "2--");
        int total = SingleStrandedness.Values.Sum() + PairedStrandedness.Values.Sum();

        // When nothing was tallied (total == 0) these divisions yield NaN; the check below is then
        // false and Strandedness falls through to None.
        FractionForwardStranded = (double)forward / (double)total;
        FractionReverseStranded = (double)reverse / (double)total;
        FractionUndetermined = 1 - FractionForwardStranded - FractionReverseStranded;
        if (FractionUndetermined > 0.5)
        {
            throw new ArgumentException("A large number of reads failed to determine the standedness of the protocol within " + bamPath);
        }

        Strandedness = FractionForwardStranded >= minFractionStrandSpecific ? Strandedness.Forward
            : FractionReverseStranded >= minFractionStrandSpecific ? Strandedness.Reverse
            : Strandedness.None;
    }
}