/// <summary>
/// Loads the compressed reference sequence, resolves the genome assembly and
/// opens a reader for each VCF path that was supplied.
/// </summary>
/// <param name="compressedSeqPath">path of the compressed reference sequence file</param>
/// <param name="oneKGenomeVcf">1000 Genomes VCF path; no reader is opened when null or empty</param>
/// <param name="clinvarVcf">ClinVar VCF path; no reader is opened when null or empty</param>
/// <param name="cosmicVcf">COSMIC VCF path; no reader is opened when null or empty</param>
/// <param name="isHg19">when true and the reference reports GRCh37, the assembly is recorded as hg19</param>
/// <exception cref="Exception">
/// thrown when the resolved assembly is Unknown, or when isHg19 is requested for a GRCh38 reference
/// </exception>
public MustGenotypeExtractor(string compressedSeqPath, string oneKGenomeVcf, string clinvarVcf, string cosmicVcf, bool isHg19 = false)
{
    _compressedSequence = new CompressedSequence();

    // the reader is bound to the sequence object that the data file manager also receives
    var referenceStream = FileUtilities.GetReadStream(compressedSeqPath);
    var sequenceReader  = new CompressedSequenceReader(referenceStream, _compressedSequence);
    _dataFileManager    = new DataFileManager(sequenceReader, _compressedSequence);

    // a GRCh37 reference is relabelled as hg19 only on request
    if (_compressedSequence.GenomeAssembly == GenomeAssembly.GRCh37 && isHg19)
    {
        _assembly = GenomeAssembly.hg19;
    }
    else
    {
        _assembly = _compressedSequence.GenomeAssembly;
    }

    if (_assembly == GenomeAssembly.Unknown)
    {
        throw new Exception("Genome assembly must be either GRCh37 or GRCh38");
    }

    if (_compressedSequence.GenomeAssembly == GenomeAssembly.GRCh38 && isHg19)
    {
        throw new Exception("reference sequence is GRCh38 while generating hg19 files");
    }

    // optional inputs: a missing path leaves the corresponding reader null
    _oneKGenomeReader = !string.IsNullOrEmpty(oneKGenomeVcf)
        ? GZipUtilities.GetAppropriateStreamReader(oneKGenomeVcf)
        : null;

    _clinvarReader = !string.IsNullOrEmpty(clinvarVcf)
        ? GZipUtilities.GetAppropriateStreamReader(clinvarVcf)
        : null;

    _cosmicReader = !string.IsNullOrEmpty(cosmicVcf)
        ? GZipUtilities.GetAppropriateStreamReader(cosmicVcf)
        : null;
}
/// <summary>
/// Builds the spliced transcript sequence by concatenating the bases of every
/// cDNA coordinate map in array order (5' UTR + cDNA + 3' UTR)
/// [Transcript.pm:862 spliced_seq]. The concatenation is reverse complemented
/// when the transcript is on the reverse strand.
/// </summary>
/// <param name="compressedSequence">reference sequence; when null each exon is emitted as a run of 'N'</param>
/// <param name="cdnaMaps">exon coordinate maps supplying GenomicStart and GenomicEnd</param>
/// <param name="onReverseStrand">true if the result should be reverse complemented</param>
/// <returns>the spliced sequence</returns>
private static string GetSplicedSequence(ICompressedSequence compressedSequence, CdnaCoordinateMap[] cdnaMaps, bool onReverseStrand)
{
    var spliced      = new StringBuilder();
    var hasReference = compressedSequence != null;

    for (var mapIndex = 0; mapIndex < cdnaMaps.Length; mapIndex++)
    {
        var map       = cdnaMaps[mapIndex];
        var numBases  = map.GenomicEnd - map.GenomicStart + 1;

        if (hasReference)
        {
            // Substring is called with the genomic start shifted down by one
            spliced.Append(compressedSequence.Substring(map.GenomicStart - 1, numBases));
        }
        else
        {
            // sanity check: no reference has been provided, pad with unknown bases
            spliced.Append('N', numBases);
        }
    }

    var forwardSequence = spliced.ToString();
    if (onReverseStrand)
    {
        return SequenceUtilities.GetReverseComplement(forwardSequence);
    }

    return forwardSequence;
}
/// <summary>
/// Runs the LOFTEE analysis on every Ensembl and RefSeq transcript of each
/// annotated alternate allele and records a non-null result under the
/// "loftee" key of the transcript's AdditionalInfo. Structural variants are skipped.
/// </summary>
/// <param name="variant">variant being annotated; nothing is done when it is a structural variant</param>
/// <param name="transcripts">transcripts handed to CreateTranscriptDictionary</param>
/// <param name="annotatedVariant">supplies the annotated alternate alleles to process</param>
/// <param name="sequence">reference sequence forwarded to LofteeAnalysis</param>
public void AnnotateVariant(IVariantFeature variant, List<Transcript> transcripts, IAnnotatedVariant annotatedVariant, ICompressedSequence sequence)
{
    if (variant.IsStructuralVariant)
    {
        return;
    }

    CreateTranscriptDictionary(transcripts);

    foreach (var allele in annotatedVariant.AnnotatedAlternateAlleles)
    {
        // Ensembl transcripts first ...
        foreach (var ensemblTranscript in allele.EnsemblTranscripts)
        {
            var result = LofteeAnalysis(ensemblTranscript, allele, sequence);
            if (result == null)
            {
                continue;
            }

            // the dictionary is created lazily, only when there is something to store
            if (ensemblTranscript.AdditionalInfo == null)
            {
                ensemblTranscript.AdditionalInfo = new Dictionary<string, string>();
            }

            ensemblTranscript.AdditionalInfo["loftee"] = result;
        }

        // ... then RefSeq transcripts, handled the same way
        foreach (var refSeqTranscript in allele.RefSeqTranscripts)
        {
            var result = LofteeAnalysis(refSeqTranscript, allele, sequence);
            if (result == null)
            {
                continue;
            }

            if (refSeqTranscript.AdditionalInfo == null)
            {
                refSeqTranscript.AdditionalInfo = new Dictionary<string, string>();
            }

            refSeqTranscript.AdditionalInfo["loftee"] = result;
        }
    }
}
/// <summary>
/// Adds the nagnag_site flag when the bases surrounding a single-position
/// allele contain a NAGNAG motif (N being A, T, C or G) on the gene's strand.
/// </summary>
/// <param name="transcript">supplies the strand of the gene (Gene.OnReverseStrand)</param>
/// <param name="allele">allele under test; ignored unless ReferenceBegin and ReferenceEnd are set and equal</param>
/// <param name="flags">set that receives LofteeFilter.Flag.nagnag_site on a match</param>
/// <param name="sequence">reference sequence the flanking bases are read from</param>
private void CheckNagnagSite(Transcript transcript, IAnnotatedAlternateAllele allele, HashSet<LofteeFilter.Flag> flags, ICompressedSequence sequence)
{
    // Inside a character class '|' is a literal, not alternation, so the
    // previous "[A|T|C|G]" also accepted a '|' character. List the bases only.
    const string nagnagPattern = "[ATCG]AG[ATCG]AG";

    if (allele.ReferenceBegin == null
        || allele.ReferenceEnd == null
        || allele.ReferenceBegin.Value != allele.ReferenceEnd.Value)
    {
        return;
    }

    int pos = allele.ReferenceBegin.Value;

    // 6 bases starting at offset pos - 6, followed by 5 bases starting at offset pos
    // NOTE(review): assumes Substring takes a zero-based offset and pos is one-based - confirm
    string upStreamSeq   = sequence.Substring(pos - 6, 6);
    string downStreamSeq = sequence.Substring(pos, 5);

    // evaluate the motif in transcript orientation
    var combineSeq = transcript.Gene.OnReverseStrand
        ? SequenceUtilities.GetReverseComplement(upStreamSeq + downStreamSeq)
        : upStreamSeq + downStreamSeq;

    if (Regex.IsMatch(combineSeq, nagnagPattern))
    {
        flags.Add(LofteeFilter.Flag.nagnag_site);
    }
}
/// <summary>
/// Creates a variant aligner that keeps a reference to the supplied sequence.
/// </summary>
/// <param name="compressedSequence">reference sequence stored in this instance</param>
public VariantAligner(ICompressedSequence compressedSequence)
{
    this._compressedSequence = compressedSequence;
}
/// <summary>
/// Captures the sequence and the reader exposed by the shared fixture.
/// </summary>
/// <param name="fixture">fixture providing the Sequence and Reader used by the tests</param>
public ClinVarXmlReaderTests(ChromosomeRenamerFixture fixture)
{
    this._sequence = fixture.Sequence;
    this._reader   = fixture.Reader;
}
/// <summary>
/// Stores the transcript together with the reference sequence.
/// </summary>
/// <param name="compressedSequence">reference sequence kept by this instance</param>
/// <param name="transcript">transcript kept by this instance</param>
public CacheData(ICompressedSequence compressedSequence, Transcript transcript)
{
    this._transcript         = transcript;
    this._compressedSequence = compressedSequence;
}
/// <summary>
/// Derives the protein begin/end positions from the valid CDS positions and
/// then assigns the codons and amino acids for the annotation.
/// </summary>
/// <param name="ta">annotation whose protein positions, codons and amino acids are filled in</param>
/// <param name="transcript">transcript forwarded to the codon assignment</param>
/// <param name="compressedSequence">reference sequence forwarded to the codon assignment</param>
private void GetProteinPosition(TranscriptAnnotation ta, Transcript transcript, ICompressedSequence compressedSequence)
{
    const int shift = 0;

    // a CDS position maps to a protein position as (cds + shift + 2) / 3, truncated
    if (ta.HasValidCdsStart)
    {
        var shiftedBegin = ta.CodingDnaSequenceBegin + shift + 2.0;
        ta.ProteinBegin  = (int)(shiftedBegin / 3.0);
    }

    if (ta.HasValidCdsEnd)
    {
        var shiftedEnd = ta.CodingDnaSequenceEnd + shift + 2.0;
        ta.ProteinEnd  = (int)(shiftedEnd / 3.0);
    }

    // codons first, then the amino acids
    Codons.AssignExtended(ta, transcript, compressedSequence);
    _aminoAcids.Assign(ta);
}
/// <summary>
/// Computes the CDS positions and, when at least one of them is valid, the
/// protein positions. Nothing is done if neither cDNA endpoint is valid.
/// </summary>
/// <param name="transcript">transcript the annotation belongs to</param>
/// <param name="ta">annotation that is read and updated</param>
/// <param name="compressedSequence">reference sequence forwarded to GetProteinPosition</param>
private void GetCodingAnnotations(Transcript transcript, TranscriptAnnotation ta, ICompressedSequence compressedSequence)
{
    // coding annotations need at least one valid cDNA endpoint
    var hasCdnaEndpoint = ta.HasValidCdnaStart || ta.HasValidCdnaEnd;
    if (!hasCdnaEndpoint)
    {
        return;
    }

    CalculateCdsPositions(transcript, ta);

    // the protein position needs at least one valid CDS endpoint
    var hasCdsEndpoint = ta.HasValidCdsStart || ta.HasValidCdsEnd;
    if (hasCdsEndpoint)
    {
        GetProteinPosition(ta, transcript, compressedSequence);
    }
}
/// <summary>
/// Adds the non_can_splice_surr filter when a splice site next to the affected
/// exon is not canonical: the acceptor dinucleotide must be "AG" and the donor
/// dinucleotide must be "GT". The first exon has no acceptor and the last exon
/// has no donor, so those sides are not checked.
/// </summary>
/// <param name="ta">annotated transcript; its Exons string supplies the affected exon number</param>
/// <param name="transcript">supplies the CdnaMaps, the Introns and the strand of the gene</param>
/// <param name="filters">set that receives LofteeFilter.Filter.non_can_splice_surr</param>
/// <param name="sequence">reference sequence the splice dinucleotides are read from</param>
private void CheckNonCanonicalSpliceSurr(IAnnotatedTranscript ta, Transcript transcript, HashSet<LofteeFilter.Filter> filters, ICompressedSequence sequence)
{
    if (ta.Exons == null)
    {
        return;
    }

    // only the number before the first '/' and the first '-' is used
    // NOTE(review): presumably Exons looks like "3-4/10" - confirm against the producer
    var exonField = ta.Exons.Split('/')[0];
    int affectedExonIndex = Convert.ToInt32(exonField.Split('-')[0]);

    var totalExons = transcript.CdnaMaps.Length;
    if (totalExons <= 1)
    {
        return;
    }

    var onReverseStrand = transcript.Gene.OnReverseStrand;
    string surrAcceptor = null;
    string surrDonor    = null;

    // acceptor: the intron in front of the affected exon (transcript orientation)
    if (affectedExonIndex > 1)
    {
        var intronIndex = onReverseStrand
            ? totalExons - affectedExonIndex
            : affectedExonIndex - 2;
        var upstreamIntron = transcript.Introns[intronIndex];

        int acceptorStart;
        if (onReverseStrand)
        {
            acceptorStart = upstreamIntron.Start;
        }
        else
        {
            acceptorStart = upstreamIntron.End - 1;
        }

        surrAcceptor = ReadSpliceDinucleotide(sequence, acceptorStart, onReverseStrand);
    }

    // donor: the intron behind the affected exon (transcript orientation)
    if (affectedExonIndex < totalExons)
    {
        var intronIndex = onReverseStrand
            ? totalExons - affectedExonIndex - 1
            : affectedExonIndex - 1;
        var downstreamIntron = transcript.Introns[intronIndex];

        int donorStart;
        if (onReverseStrand)
        {
            donorStart = downstreamIntron.End - 1;
        }
        else
        {
            donorStart = downstreamIntron.Start;
        }

        surrDonor = ReadSpliceDinucleotide(sequence, donorStart, onReverseStrand);
    }

    var acceptorIsNonCanonical = surrAcceptor != null && surrAcceptor != "AG";
    var donorIsNonCanonical    = surrDonor != null && surrDonor != "GT";

    if (acceptorIsNonCanonical || donorIsNonCanonical)
    {
        filters.Add(LofteeFilter.Filter.non_can_splice_surr);
    }
}

/// <summary>
/// Reads the two bases beginning at the given start position and reverse
/// complements them for reverse-strand genes.
/// </summary>
private static string ReadSpliceDinucleotide(ICompressedSequence sequence, int start, bool onReverseStrand)
{
    var bases = sequence.Substring(start - 1, 2);
    return onReverseStrand ? SequenceUtilities.GetReverseComplement(bases) : bases;
}
/// <summary>
/// Assigns the reference and alternate codons of the annotation
/// [TranscriptVariationAllele.pm:259 codon]. When the tail of the coding
/// sequence has not been reached and the combined length is not a whole
/// number of triplets, the suffix is extended by up to 45 bases (the original
/// notes mention frameshift and stop loss variants as the motivation).
/// </summary>
/// <param name="ta">annotation that is read and whose codons and HasFrameShift are written</param>
/// <param name="transcript">supplies the coding region, cDNA maps, strand and start exon phase</param>
/// <param name="compressedSequence">reference sequence used to build the coding sequence</param>
public static void AssignExtended(TranscriptAnnotation ta, Transcript transcript, ICompressedSequence compressedSequence)
{
    // sanity check: both CDS endpoints are required, otherwise there are no codons
    if (!ta.HasValidCdsEnd || !ta.HasValidCdsStart)
    {
        ta.ReferenceCodon = null;
        ta.AlternateCodon = null;
        return;
    }

    // CDS coordinates of the first base of the first codon and the last base of the last codon
    var codonBlockStart = ta.ProteinBegin * 3 - 2;
    var codonBlockEnd   = ta.ProteinEnd * 3;
    var prefixLen       = ta.CodingDnaSequenceBegin - codonBlockStart;
    var suffixLen       = codonBlockEnd - ta.CodingDnaSequenceEnd;

    var codingSequence = new CodingSequence(
        compressedSequence,
        transcript.Translation.CodingRegion.GenomicStart,
        transcript.Translation.CodingRegion.GenomicEnd,
        transcript.CdnaMaps,
        transcript.Gene.OnReverseStrand,
        transcript.StartExonPhase);
    var cds = codingSequence.Sequence();

    var prefixStart = codonBlockStart - 1;
    var suffixStart = codonBlockEnd - suffixLen;

    // the suffix cannot run past the end of the coding sequence
    // (the limit is computed before suffixStart is clamped below)
    var maxSuffixLen = cds.Length - suffixStart;
    var atTailEnd    = suffixLen > maxSuffixLen;
    if (atTailEnd)
    {
        suffixLen = maxSuffixLen;
    }

    // clamp negative offsets and lengths to zero
    prefixStart = prefixStart < 0 ? 0 : prefixStart;
    suffixStart = suffixStart < 0 ? 0 : suffixStart;
    prefixLen   = prefixLen < 0 ? 0 : prefixLen;

    // a prefix that would reach the end of the coding sequence is replaced by "AAA"
    var prefix = prefixStart + prefixLen >= cds.Length
        ? "AAA"
        : cds.Substring(prefixStart, prefixLen).ToLower();

    var suffix = suffixLen <= 0
        ? ""
        : cds.Substring(suffixStart, suffixLen).ToLower();

    // extension is considered only when the tail end has not been reached
    var needExtend = false;
    if (!atTailEnd)
    {
        needExtend = !IsTriplet(prefixLen + suffixLen + ta.TranscriptAlternateAllele.Length);
    }

    if (needExtend)
    {
        // at most 45 extra bases, otherwise what is left rounded down to whole triplets
        var remaining   = maxSuffixLen - suffixLen;
        var extendedLen = remaining > 45 ? 45 : remaining / 3 * 3;

        // the extended suffix is taken as-is (not lower-cased)
        suffix = cds.Substring(suffixStart, suffixLen + extendedLen);
    }

    ta.HasFrameShift  = false;
    ta.ReferenceCodon = GetCodon(ta.TranscriptReferenceAllele, prefix, suffix, ref ta.HasFrameShift, atTailEnd);
    ta.AlternateCodon = GetCodon(ta.TranscriptAlternateAllele, prefix, suffix, ref ta.HasFrameShift, atTailEnd);
}
/// <summary>
/// Opens the compressed reference, registers every supplied data source and
/// primes one enumerator per source. Sources whose enumerator yields no item
/// are dropped from the enumerator list.
/// </summary>
/// <param name="compressedReferencePath">path of the compressed reference sequence file</param>
/// <param name="nsdBaseFileName">base file name stored for the supplementary annotation output</param>
/// <param name="dbSnpFileName">dbSNP input; skipped when null</param>
/// <param name="cosmicVcfFile">COSMIC VCF input; used only when cosmicTsvFile is also given</param>
/// <param name="cosmicTsvFile">COSMIC TSV input; used only when cosmicVcfFile is also given</param>
/// <param name="clinVarFileName">ClinVar XML input; its items are read completely and sorted; skipped when null</param>
/// <param name="oneKGenomeAfFileName">1000 Genomes allele frequency input; skipped when null</param>
/// <param name="evsFileName">EVS input; skipped when null</param>
/// <param name="exacFileName">ExAC input; skipped when null</param>
/// <param name="customFiles">custom annotation inputs; skipped when null</param>
/// <param name="dgvFileName">DGV input; skipped when null</param>
/// <param name="oneKSvFileName">1000 Genomes structural variant input; skipped when null</param>
/// <param name="clinGenFileName">ClinGen input; skipped when null</param>
/// <param name="chrWhiteList">comma separated reference names to restrict processing to; null or empty clears the white list</param>
public CreateSupplementaryDatabase(
    string compressedReferencePath,
    string nsdBaseFileName,
    string dbSnpFileName = null,
    string cosmicVcfFile = null,
    string cosmicTsvFile = null,
    string clinVarFileName = null,
    string oneKGenomeAfFileName = null,
    string evsFileName = null,
    string exacFileName = null,
    List<string> customFiles = null,
    string dgvFileName = null,
    string oneKSvFileName = null,
    string clinGenFileName = null,
    string chrWhiteList = null)
{
    _nsdBaseFileName = nsdBaseFileName;
    _dataSources = new List<DataSourceVersion>();

    _iSupplementaryDataItemList = new List<IEnumerator<SupplementaryDataItem>>();
    _supplementaryIntervalList = new List<SupplementaryInterval>();

    Console.WriteLine("Creating supplementary annotation files... Data version: {0}, schema version: {1}",
        SupplementaryAnnotationCommon.DataVersion, SupplementaryAnnotationCommon.SchemaVersion);

    _compressedSequence = new CompressedSequence();
    var compressedSequenceReader = new CompressedSequenceReader(FileUtilities.GetReadStream(compressedReferencePath), _compressedSequence);
    _renamer = _compressedSequence.Renamer;
    _dataFileManager = new DataFileManager(compressedSequenceReader, _compressedSequence);

    // optional restriction to a set of chromosomes
    if (!string.IsNullOrEmpty(chrWhiteList))
    {
        Console.WriteLine("Creating SA for the following chromosomes only:");
        foreach (var refSeq in chrWhiteList.Split(','))
        {
            InputFileParserUtilities.ChromosomeWhiteList.Add(_renamer.GetEnsemblReferenceName(refSeq));
            Console.Write(refSeq + ",");
        }
        Console.WriteLine();
    }
    else
    {
        InputFileParserUtilities.ChromosomeWhiteList = null;
    }

    if (dbSnpFileName != null)
    {
        AddSourceVersion(dbSnpFileName);

        var dbSnpReader = new DbSnpReader(new FileInfo(dbSnpFileName), _renamer);
        var dbSnpEnumerator = dbSnpReader.GetEnumerator();
        _iSupplementaryDataItemList.Add(dbSnpEnumerator);
    }

    // COSMIC needs both the VCF and the TSV file
    if (cosmicVcfFile != null && cosmicTsvFile != null)
    {
        AddSourceVersion(cosmicVcfFile);

        var cosmicReader = new MergedCosmicReader(cosmicVcfFile, cosmicTsvFile, _renamer);
        var cosmicEnumerator = cosmicReader.GetEnumerator();
        _iSupplementaryDataItemList.Add(cosmicEnumerator);
    }

    if (oneKGenomeAfFileName != null)
    {
        AddSourceVersion(oneKGenomeAfFileName);

        var oneKGenReader = new OneKGenReader(new FileInfo(oneKGenomeAfFileName), _renamer);
        var oneKGenEnumerator = oneKGenReader.GetEnumerator();
        _iSupplementaryDataItemList.Add(oneKGenEnumerator);
    }

    if (oneKSvFileName != null)
    {
        // the source version is added here only when the allele frequency file did not already add one
        if (oneKGenomeAfFileName == null)
        {
            AddSourceVersion(oneKSvFileName);
        }

        var oneKGenSvReader = new OneKGenSvReader(new FileInfo(oneKSvFileName), _renamer);
        var oneKGenSvEnumerator = oneKGenSvReader.GetEnumerator();
        _iSupplementaryDataItemList.Add(oneKGenSvEnumerator);
    }

    if (evsFileName != null)
    {
        AddSourceVersion(evsFileName);

        var evsReader = new EvsReader(new FileInfo(evsFileName), _renamer);
        var evsEnumerator = evsReader.GetEnumerator();
        _iSupplementaryDataItemList.Add(evsEnumerator);
    }

    if (exacFileName != null)
    {
        AddSourceVersion(exacFileName);

        var exacReader = new ExacReader(new FileInfo(exacFileName), _renamer);
        var exacEnumerator = exacReader.GetEnumerator();
        _iSupplementaryDataItemList.Add(exacEnumerator);
    }

    if (clinVarFileName != null)
    {
        AddSourceVersion(clinVarFileName);

        // ClinVar items are materialized and sorted before they are enumerated
        var clinVarReader = new ClinVarXmlReader(new FileInfo(clinVarFileName), compressedSequenceReader, _compressedSequence);
        var clinVarList = clinVarReader.ToList();
        clinVarList.Sort();

        Console.WriteLine($"{clinVarList.Count} clinvar items read form XML file");

        IEnumerator<ClinVarItem> clinVarEnumerator = clinVarList.GetEnumerator();
        _iSupplementaryDataItemList.Add(clinVarEnumerator);
    }

    if (dgvFileName != null)
    {
        AddSourceVersion(dgvFileName);

        var dgvReader = new DgvReader(new FileInfo(dgvFileName), _renamer);
        var dgvEnumerator = dgvReader.GetEnumerator();
        _iSupplementaryDataItemList.Add(dgvEnumerator);
    }

    if (clinGenFileName != null)
    {
        AddSourceVersion(clinGenFileName);

        var clinGenReader = new ClinGenReader(new FileInfo(clinGenFileName), _renamer);
        var clinGenEnumerator = clinGenReader.GetEnumerator();
        _iSupplementaryDataItemList.Add(clinGenEnumerator);
    }

    if (customFiles != null)
    {
        foreach (var customFile in customFiles)
        {
            AddSourceVersion(customFile);

            var customReader = new CustomAnnotationReader(new FileInfo(customFile), _renamer);
            var customEnumerator = customReader.GetEnumerator();
            _iSupplementaryDataItemList.Add(customEnumerator);
        }
    }

    // Initialize the enumerators: advance each one to its first item and note
    // the ones that are already exhausted. They are removed in a second pass,
    // because removing from the list while a foreach is enumerating it makes
    // the list enumerator throw InvalidOperationException.
    var exhaustedEnumerators = new List<IEnumerator<SupplementaryDataItem>>();
    foreach (var iDataEnumerator in _iSupplementaryDataItemList)
    {
        if (!iDataEnumerator.MoveNext())
        {
            exhaustedEnumerators.Add(iDataEnumerator);
        }
    }

    foreach (var exhaustedEnumerator in exhaustedEnumerators)
    {
        _iSupplementaryDataItemList.Remove(exhaustedEnumerator);
    }

    _additionalItemsList = new List<SupplementaryDataItem>();
}
/// <summary>
/// Stores the compressed sequence reader together with the sequence object
/// passed alongside it.
/// </summary>
/// <param name="reader">compressed sequence reader kept by this instance</param>
/// <param name="compressedSequence">sequence object kept by this instance</param>
public DataFileManager(CompressedSequenceReader reader, ICompressedSequence compressedSequence)
{
    this._compressedSequenceReader = reader;
    this._compressedSequence       = compressedSequence;
}
/// <summary>
/// Returns the bases of a sub-range of a sequence region, where the region
/// itself may lie on the reverse strand. For reverse-strand regions the
/// sub-range is located relative to the region end and the retrieved bases
/// are reverse complemented.
/// </summary>
/// <param name="seqStart">start of the enclosing region</param>
/// <param name="seqEnd">end of the enclosing region</param>
/// <param name="seqOnReverseStrand">true if the enclosing region is on the reverse strand</param>
/// <param name="subStart">start of the sub-range, relative to the region</param>
/// <param name="subEnd">end of the sub-range, relative to the region</param>
/// <param name="cs">reference sequence the bases are read from</param>
/// <returns>the bases of the sub-range, in the orientation of the region</returns>
public static string GetSubSubstring(int seqStart, int seqEnd, bool seqOnReverseStrand, int subStart, int subEnd, ICompressedSequence cs)
{
    var numBases = subEnd - subStart + 1;

    int start;
    if (seqOnReverseStrand)
    {
        // measured back from the end of the region
        start = seqEnd - subEnd;
    }
    else
    {
        start = seqStart + subStart;
    }

    var bases = cs.Substring(start - 1, numBases);
    if (!seqOnReverseStrand)
    {
        return bases;
    }

    return GetReverseComplement(bases);
}