Ejemplo n.º 1
0
        /// <summary>
        /// Loads the compressed reference, resolves the genome assembly to work with and opens
        /// a reader for every VCF path that was supplied.
        /// </summary>
        /// <param name="compressedSeqPath">path of the compressed reference sequence file</param>
        /// <param name="oneKGenomeVcf">1000 Genomes VCF path; null or empty leaves the reader unset</param>
        /// <param name="clinvarVcf">ClinVar VCF path; null or empty leaves the reader unset</param>
        /// <param name="cosmicVcf">COSMIC VCF path; null or empty leaves the reader unset</param>
        /// <param name="isHg19">when true and the reference is GRCh37, the assembly is treated as hg19</param>
        /// <exception cref="Exception">
        /// the assembly is unknown, or hg19 output was requested for a GRCh38 reference
        /// </exception>
        public MustGenotypeExtractor(string compressedSeqPath, string oneKGenomeVcf, string clinvarVcf, string cosmicVcf, bool isHg19 = false)
        {
            _compressedSequence = new CompressedSequence();

            var referenceStream = FileUtilities.GetReadStream(compressedSeqPath);
            var referenceReader = new CompressedSequenceReader(referenceStream, _compressedSequence);
            _dataFileManager = new DataFileManager(referenceReader, _compressedSequence);

            // hg19 is only substituted when the reference itself reports GRCh37
            if (_compressedSequence.GenomeAssembly == GenomeAssembly.GRCh37 && isHg19)
            {
                _assembly = GenomeAssembly.hg19;
            }
            else
            {
                _assembly = _compressedSequence.GenomeAssembly;
            }

            if (_assembly == GenomeAssembly.Unknown)
            {
                throw new Exception("Genome assembly must be either GRCh37 or GRCh38");
            }

            if (isHg19 && _compressedSequence.GenomeAssembly == GenomeAssembly.GRCh38)
            {
                throw new Exception("reference sequence is GRCh38 while generating hg19 files");
            }

            // each data source is optional: a missing path yields a null reader
            _oneKGenomeReader = !string.IsNullOrEmpty(oneKGenomeVcf)
                ? GZipUtilities.GetAppropriateStreamReader(oneKGenomeVcf)
                : null;
            _clinvarReader = !string.IsNullOrEmpty(clinvarVcf)
                ? GZipUtilities.GetAppropriateStreamReader(clinvarVcf)
                : null;
            _cosmicReader = !string.IsNullOrEmpty(cosmicVcf)
                ? GZipUtilities.GetAppropriateStreamReader(cosmicVcf)
                : null;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds the spliced sequence by concatenating the genomic bases of every entry in
        /// <paramref name="cdnaMaps"/>, in array order, and reverse complementing the result
        /// for reverse-strand transcripts.
        /// This includes 5' UTR + cDNA + 3' UTR [Transcript.pm:862 spliced_seq]
        /// </summary>
        /// <param name="compressedSequence">reference sequence; when null, each exon is filled with 'N'</param>
        /// <param name="cdnaMaps">exon coordinate maps providing GenomicStart and GenomicEnd</param>
        /// <param name="onReverseStrand">true if the result should be reverse complemented</param>
        /// <returns>the concatenated (and possibly reverse complemented) exon bases</returns>
        private static string GetSplicedSequence(ICompressedSequence compressedSequence, CdnaCoordinateMap[] cdnaMaps, bool onReverseStrand)
        {
            var hasReference = compressedSequence != null;
            var spliced      = new StringBuilder();

            for (var i = 0; i < cdnaMaps.Length; i++)
            {
                var map      = cdnaMaps[i];
                var numBases = map.GenomicEnd - map.GenomicStart + 1;

                if (hasReference)
                {
                    // GenomicStart is shifted down by one to form the substring offset
                    spliced.Append(compressedSequence.Substring(map.GenomicStart - 1, numBases));
                }
                else
                {
                    // no reference has been provided: pad with unknown bases
                    spliced.Append(new string('N', numBases));
                }
            }

            if (!onReverseStrand)
            {
                return spliced.ToString();
            }

            return SequenceUtilities.GetReverseComplement(spliced.ToString());
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Runs the LOFTEE analysis on every Ensembl and RefSeq transcript of each alternate
        /// allele and stores any non-null result under the "loftee" key of the transcript's
        /// AdditionalInfo dictionary. Structural variants are skipped entirely.
        /// </summary>
        /// <param name="variant">the variant; nothing is done when it is a structural variant</param>
        /// <param name="transcripts">transcripts used to build the transcript dictionary</param>
        /// <param name="annotatedVariant">supplies the annotated alternate alleles to process</param>
        /// <param name="sequence">reference sequence handed to the LOFTEE analysis</param>
        public void AnnotateVariant(IVariantFeature variant, List<Transcript> transcripts,
                                    IAnnotatedVariant annotatedVariant, ICompressedSequence sequence)
        {
            if (variant.IsStructuralVariant) return;

            CreateTranscriptDictionary(transcripts);

            foreach (var allele in annotatedVariant.AnnotatedAlternateAlleles)
            {
                // Ensembl transcripts first
                foreach (var ensemblTranscript in allele.EnsemblTranscripts)
                {
                    var result = LofteeAnalysis(ensemblTranscript, allele, sequence);
                    if (result == null) continue;

                    // the dictionary is created lazily, only when there is something to store
                    if (ensemblTranscript.AdditionalInfo == null)
                    {
                        ensemblTranscript.AdditionalInfo = new Dictionary<string, string>();
                    }

                    ensemblTranscript.AdditionalInfo["loftee"] = result;
                }

                // then the RefSeq transcripts, handled the same way
                foreach (var refSeqTranscript in allele.RefSeqTranscripts)
                {
                    var result = LofteeAnalysis(refSeqTranscript, allele, sequence);
                    if (result == null) continue;

                    if (refSeqTranscript.AdditionalInfo == null)
                    {
                        refSeqTranscript.AdditionalInfo = new Dictionary<string, string>();
                    }

                    refSeqTranscript.AdditionalInfo["loftee"] = result;
                }
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Adds the nagnag_site flag when the reference bases surrounding the allele contain
        /// an NAGNAG motif (N being A, C, G or T) on the transcript's strand.
        /// Only alleles whose reference begin and end positions are both set and equal are examined.
        /// </summary>
        /// <param name="transcript">transcript whose gene strand decides if the bases are reverse complemented</param>
        /// <param name="allele">allele providing the reference begin/end positions</param>
        /// <param name="flags">flag set that receives nagnag_site on a match</param>
        /// <param name="sequence">reference sequence the flanking bases are read from</param>
        private void CheckNagnagSite(Transcript transcript, IAnnotatedAlternateAllele allele,
                                     HashSet<LofteeFilter.Flag> flags, ICompressedSequence sequence)
        {
            if (allele.ReferenceBegin == null || allele.ReferenceEnd == null ||
                allele.ReferenceBegin.Value != allele.ReferenceEnd.Value)
            {
                return;
            }

            int pos = allele.ReferenceBegin.Value;

            // 6 bases ending just before offset pos, followed by the 5 bases starting at offset pos
            string upStreamSeq   = sequence.Substring(pos - 6, 6);
            string downStreamSeq = sequence.Substring(pos, 5);

            var flankingBases = upStreamSeq + downStreamSeq;
            var combineSeq    = transcript.Gene.OnReverseStrand
                ? SequenceUtilities.GetReverseComplement(flankingBases)
                : flankingBases;

            // inside a character class '|' is a literal, not alternation, so the bases are
            // listed without separators; the original "[A|T|C|G]" also accepted a '|' character
            if (Regex.IsMatch(combineSeq, "[ACGT]AG[ACGT]AG"))
            {
                flags.Add(LofteeFilter.Flag.nagnag_site);
            }
        }
Ejemplo n.º 5
0
 /// <summary>
 /// Creates a variant aligner that keeps a reference to the supplied sequence.
 /// </summary>
 /// <param name="compressedSequence">reference sequence stored for later use</param>
 public VariantAligner(ICompressedSequence compressedSequence)
 {
     this._compressedSequence = compressedSequence;
 }
Ejemplo n.º 6
0
 /// <summary>
 /// Captures the sequence and reader exposed by the shared fixture.
 /// </summary>
 /// <param name="fixture">fixture providing the Sequence and Reader used by the tests</param>
 public ClinVarXmlReaderTests(ChromosomeRenamerFixture fixture)
 {
     this._sequence = fixture.Sequence;
     this._reader   = fixture.Reader;
 }
Ejemplo n.º 7
0
 /// <summary>
 /// Stores the transcript and the reference sequence it is paired with.
 /// </summary>
 /// <param name="compressedSequence">reference sequence to keep</param>
 /// <param name="transcript">transcript to keep</param>
 public CacheData(ICompressedSequence compressedSequence, Transcript transcript)
 {
     this._transcript         = transcript;
     this._compressedSequence = compressedSequence;
 }
Ejemplo n.º 8
0
        /// <summary>
        /// Converts the valid CDS begin/end coordinates of the annotation into protein
        /// positions, then assigns the codons and amino acids.
        /// </summary>
        /// <param name="ta">annotation whose ProteinBegin/ProteinEnd are updated</param>
        /// <param name="transcript">transcript forwarded to the codon assignment</param>
        /// <param name="compressedSequence">reference sequence forwarded to the codon assignment</param>
        private void GetProteinPosition(TranscriptAnnotation ta, Transcript transcript, ICompressedSequence compressedSequence)
        {
            const int cdsOffset = 0;
            const double basesPerCodon = 3.0;

            if (ta.HasValidCdsStart)
            {
                // (cds + 2) / 3, truncated, maps a CDS coordinate onto its codon number
                var shiftedBegin = ta.CodingDnaSequenceBegin + cdsOffset + 2.0;
                ta.ProteinBegin = (int)(shiftedBegin / basesPerCodon);
            }

            if (ta.HasValidCdsEnd)
            {
                var shiftedEnd = ta.CodingDnaSequenceEnd + cdsOffset + 2.0;
                ta.ProteinEnd = (int)(shiftedEnd / basesPerCodon);
            }

            // codons must be assigned before the amino acids are derived
            Codons.AssignExtended(ta, transcript, compressedSequence);
            _aminoAcids.Assign(ta);
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Computes the CDS positions and, when available, the protein positions for the
        /// annotation. Stops early when neither cDNA coordinate is valid, and again when
        /// neither CDS coordinate is valid after the CDS calculation.
        /// </summary>
        /// <param name="transcript">transcript the annotation belongs to</param>
        /// <param name="ta">annotation that is updated in place</param>
        /// <param name="compressedSequence">reference sequence used for the protein position step</param>
        private void GetCodingAnnotations(Transcript transcript, TranscriptAnnotation ta, ICompressedSequence compressedSequence)
        {
            // at least one valid cDNA coordinate is required
            var hasCdnaCoordinate = ta.HasValidCdnaStart || ta.HasValidCdnaEnd;
            if (!hasCdnaCoordinate) return;

            CalculateCdsPositions(transcript, ta);

            // at least one valid CDS coordinate is required for the protein position
            var hasCdsCoordinate = ta.HasValidCdsStart || ta.HasValidCdsEnd;
            if (!hasCdsCoordinate) return;

            GetProteinPosition(ta, transcript, compressedSequence);
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Adds the non_can_splice_surr filter when a splice site flanking the affected exon
        /// is not canonical: the acceptor dinucleotide before the exon must be "AG" and the
        /// donor dinucleotide after it must be "GT" (both read on the transcript strand).
        /// Nothing is checked when the annotation has no exon information or the transcript
        /// has at most one exon.
        /// </summary>
        /// <param name="ta">annotated transcript whose Exons string starts with the affected exon number</param>
        /// <param name="transcript">transcript providing CdnaMaps, Introns and the gene strand</param>
        /// <param name="filters">filter set that receives non_can_splice_surr</param>
        /// <param name="sequence">reference sequence the dinucleotides are read from</param>
        private void CheckNonCanonicalSpliceSurr(IAnnotatedTranscript ta, Transcript transcript,
                                                 HashSet<LofteeFilter.Filter> filters, ICompressedSequence sequence)
        {
            if (ta.Exons == null) return;

            // the exon number is the text before the first '/' and, within that, before the first '-'
            var exonField  = ta.Exons.Split('/').First();
            int exonNumber = Convert.ToInt32(exonField.Split('-').First());
            var numExons   = transcript.CdnaMaps.Length;

            if (numExons <= 1) return;

            var isReverse   = transcript.Gene.OnReverseStrand;
            string acceptor = null;
            string donor    = null;

            // acceptor site: only exists when an exon precedes the affected one
            if (exonNumber > 1)
            {
                var intronIndex = isReverse ? numExons - exonNumber : exonNumber - 2;
                var intron      = transcript.Introns[intronIndex];

                int firstBase;
                if (isReverse)
                {
                    firstBase = intron.Start;
                }
                else
                {
                    firstBase = intron.End - 1;
                }

                var bases = sequence.Substring(firstBase - 1, 2);
                acceptor = isReverse ? SequenceUtilities.GetReverseComplement(bases) : bases;
            }

            // donor site: only exists when an exon follows the affected one
            if (exonNumber < numExons)
            {
                var intronIndex = isReverse ? numExons - exonNumber - 1 : exonNumber - 1;
                var intron      = transcript.Introns[intronIndex];

                int firstBase;
                if (isReverse)
                {
                    firstBase = intron.End - 1;
                }
                else
                {
                    firstBase = intron.Start;
                }

                var bases = sequence.Substring(firstBase - 1, 2);
                donor = isReverse ? SequenceUtilities.GetReverseComplement(bases) : bases;
            }

            var badAcceptor = acceptor != null && acceptor != "AG";
            var badDonor    = donor != null && donor != "GT";

            if (badAcceptor || badDonor)
            {
                filters.Add(LofteeFilter.Filter.non_can_splice_surr);
            }
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Assigns the reference and alternate codons of the annotation
        /// [TranscriptVariationAllele.pm:259 codon].
        /// The codons are built from the transcript alleles surrounded by a prefix and a suffix
        /// taken from the coding sequence. When the variant is not at the tail end of the coding
        /// sequence and prefix + suffix + alternate allele is not a triplet (frameshift, stop
        /// loss), the suffix is extended by up to 45 additional bases for both codons.
        /// When either CDS coordinate is invalid, both codons are set to null.
        /// </summary>
        /// <param name="ta">annotation that is read (protein/CDS positions, alleles) and updated (codons, HasFrameShift)</param>
        /// <param name="transcript">transcript providing the coding region, cDNA maps, strand and start exon phase</param>
        /// <param name="compressedSequence">reference sequence used to build the coding sequence</param>
        public static void AssignExtended(TranscriptAnnotation ta, Transcript transcript, ICompressedSequence compressedSequence)
        {
            // sanity check: make sure this is a coding region
            if (!(ta.HasValidCdsEnd && ta.HasValidCdsStart))
            {
                ta.ReferenceCodon = null;
                ta.AlternateCodon = null;
                return;
            }

            // calculate necessary coordinates and lengths:
            // first base of the codon at ProteinBegin and last base of the codon at ProteinEnd
            var aminoAcidStart = ta.ProteinBegin * 3 - 2;
            var aminoAcidEnd   = ta.ProteinEnd * 3;

            // bases between the codon boundaries and the variant's CDS coordinates
            var prefixLen = ta.CodingDnaSequenceBegin - aminoAcidStart;
            var suffixLen = aminoAcidEnd - ta.CodingDnaSequenceEnd;

            var codingSequence = new CodingSequence(compressedSequence, transcript.Translation.CodingRegion.GenomicStart,
                                                    transcript.Translation.CodingRegion.GenomicEnd, transcript.CdnaMaps, transcript.Gene.OnReverseStrand,
                                                    transcript.StartExonPhase);
            // NOTE(review): despite the name, this presumably holds the nucleotide coding
            // sequence (it is sliced with CDS coordinates below) - confirm
            var aminoAcidSeq = codingSequence.Sequence();

            // offsets into the coding sequence where the prefix and the suffix begin
            var start1 = aminoAcidStart - 1;
            var start2 = aminoAcidEnd - suffixLen;

            // number of bases available from the suffix start to the end of the coding sequence
            var maxSuffixLen = aminoAcidSeq.Length - start2;

            var atTailEnd = false;

            // the suffix would run past the end of the coding sequence: truncate and remember it
            if (suffixLen > maxSuffixLen)
            {
                suffixLen = maxSuffixLen;
                atTailEnd = true;
            }

            // clamp negative offsets and lengths to zero
            if (start1 < 0)
            {
                start1 = 0;
            }
            if (start2 < 0)
            {
                start2 = 0;
            }
            if (prefixLen < 0)
            {
                prefixLen = 0;
            }

            // the prefix is lowercased; "AAA" is used when the prefix range reaches the end
            // of the coding sequence
            // NOTE(review): the reason for the "AAA" placeholder is not visible here - confirm
            var prefix = start1 + prefixLen < aminoAcidSeq.Length
                                ? aminoAcidSeq.Substring(start1, prefixLen).ToLower()
                                : "AAA";

            var suffix = suffixLen > 0
                                ? aminoAcidSeq.Substring(start2, suffixLen).ToLower()
                                : "";

            // extension is needed when not at the tail end and the combined length fails IsTriplet
            var needExtend = !atTailEnd && !IsTriplet(prefixLen + suffixLen + ta.TranscriptAlternateAllele.Length);

            // extend by 45 bases when more than that remain, otherwise by the remaining
            // bases rounded down to a multiple of three
            var extendedLen = maxSuffixLen - suffixLen > 45 ? 45 : (maxSuffixLen - suffixLen) / 3 * 3;

            if (needExtend)
            {
                // NOTE(review): unlike the regular suffix, the extended suffix is not
                // lowercased - confirm this is intentional
                suffix = aminoAcidSeq.Substring(start2, suffixLen + extendedLen);
            }

            // HasFrameShift is reset here and passed by reference to both GetCodon calls
            ta.HasFrameShift  = false;
            ta.ReferenceCodon = GetCodon(ta.TranscriptReferenceAllele, prefix, suffix, ref ta.HasFrameShift, atTailEnd);
            ta.AlternateCodon = GetCodon(ta.TranscriptAlternateAllele, prefix, suffix, ref ta.HasFrameShift, atTailEnd);
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Sets up the creation of a supplementary annotation database: loads the compressed
        /// reference, applies the optional chromosome whitelist, and creates one enumerator per
        /// supplied data source. Every enumerator is advanced to its first item; sources that
        /// yield no items are dropped.
        /// </summary>
        /// <param name="compressedReferencePath">path of the compressed reference sequence file</param>
        /// <param name="nsdBaseFileName">base file name stored for the output</param>
        /// <param name="dbSnpFileName">optional dbSNP input</param>
        /// <param name="cosmicVcfFile">optional COSMIC VCF input; only used together with <paramref name="cosmicTsvFile"/></param>
        /// <param name="cosmicTsvFile">optional COSMIC TSV input; only used together with <paramref name="cosmicVcfFile"/></param>
        /// <param name="clinVarFileName">optional ClinVar XML input; its items are fully read and sorted up front</param>
        /// <param name="oneKGenomeAfFileName">optional 1000 Genomes allele frequency input</param>
        /// <param name="evsFileName">optional EVS input</param>
        /// <param name="exacFileName">optional ExAC input</param>
        /// <param name="customFiles">optional list of custom annotation inputs</param>
        /// <param name="dgvFileName">optional DGV input</param>
        /// <param name="oneKSvFileName">optional 1000 Genomes structural variant input</param>
        /// <param name="clinGenFileName">optional ClinGen input</param>
        /// <param name="chrWhiteList">optional comma-separated chromosome names; null or empty clears the whitelist</param>
        public CreateSupplementaryDatabase(
            string compressedReferencePath,
            string nsdBaseFileName,
            string dbSnpFileName        = null,
            string cosmicVcfFile        = null,
            string cosmicTsvFile        = null,
            string clinVarFileName      = null,
            string oneKGenomeAfFileName = null,
            string evsFileName          = null,
            string exacFileName         = null,
            List<string> customFiles    = null,
            string dgvFileName          = null,
            string oneKSvFileName       = null,
            string clinGenFileName      = null,
            string chrWhiteList         = null)
        {
            _nsdBaseFileName = nsdBaseFileName;
            _dataSources     = new List<DataSourceVersion>();

            _iSupplementaryDataItemList = new List<IEnumerator<SupplementaryDataItem>>();
            _supplementaryIntervalList  = new List<SupplementaryInterval>();

            Console.WriteLine("Creating supplementary annotation files... Data version: {0}, schema version: {1}", SupplementaryAnnotationCommon.DataVersion, SupplementaryAnnotationCommon.SchemaVersion);

            _compressedSequence = new CompressedSequence();
            var compressedSequenceReader = new CompressedSequenceReader(FileUtilities.GetReadStream(compressedReferencePath), _compressedSequence);

            // the renamer is taken from the sequence after the reader has been constructed on it
            _renamer         = _compressedSequence.Renamer;
            _dataFileManager = new DataFileManager(compressedSequenceReader, _compressedSequence);

            if (!string.IsNullOrEmpty(chrWhiteList))
            {
                Console.WriteLine("Creating SA for the following chromosomes only:");
                foreach (var refSeq in chrWhiteList.Split(','))
                {
                    InputFileParserUtilities.ChromosomeWhiteList.Add(_renamer.GetEnsemblReferenceName(refSeq));
                    Console.Write(refSeq + ",");
                }
                Console.WriteLine();
            }
            else
            {
                InputFileParserUtilities.ChromosomeWhiteList = null;
            }

            if (dbSnpFileName != null)
            {
                AddSourceVersion(dbSnpFileName);

                var dbSnpReader     = new DbSnpReader(new FileInfo(dbSnpFileName), _renamer);
                var dbSnpEnumerator = dbSnpReader.GetEnumerator();
                _iSupplementaryDataItemList.Add(dbSnpEnumerator);
            }

            // COSMIC needs both the VCF and the TSV file
            if (cosmicVcfFile != null && cosmicTsvFile != null)
            {
                AddSourceVersion(cosmicVcfFile);

                var cosmicReader     = new MergedCosmicReader(cosmicVcfFile, cosmicTsvFile, _renamer);
                var cosmicEnumerator = cosmicReader.GetEnumerator();
                _iSupplementaryDataItemList.Add(cosmicEnumerator);
            }

            if (oneKGenomeAfFileName != null)
            {
                AddSourceVersion(oneKGenomeAfFileName);

                var oneKGenReader     = new OneKGenReader(new FileInfo(oneKGenomeAfFileName), _renamer);
                var oneKGenEnumerator = oneKGenReader.GetEnumerator();
                _iSupplementaryDataItemList.Add(oneKGenEnumerator);
            }

            if (oneKSvFileName != null)
            {
                // the source version is only added here when the allele frequency file did not add one
                if (oneKGenomeAfFileName == null)
                {
                    AddSourceVersion(oneKSvFileName);
                }

                var oneKGenSvReader     = new OneKGenSvReader(new FileInfo(oneKSvFileName), _renamer);
                var oneKGenSvEnumerator = oneKGenSvReader.GetEnumerator();
                _iSupplementaryDataItemList.Add(oneKGenSvEnumerator);
            }

            if (evsFileName != null)
            {
                AddSourceVersion(evsFileName);

                var evsReader     = new EvsReader(new FileInfo(evsFileName), _renamer);
                var evsEnumerator = evsReader.GetEnumerator();
                _iSupplementaryDataItemList.Add(evsEnumerator);
            }

            if (exacFileName != null)
            {
                AddSourceVersion(exacFileName);

                var exacReader     = new ExacReader(new FileInfo(exacFileName), _renamer);
                var exacEnumerator = exacReader.GetEnumerator();
                _iSupplementaryDataItemList.Add(exacEnumerator);
            }

            if (clinVarFileName != null)
            {
                AddSourceVersion(clinVarFileName);

                var clinVarReader = new ClinVarXmlReader(new FileInfo(clinVarFileName), compressedSequenceReader, _compressedSequence);

                // ClinVar items are materialized and sorted before being enumerated
                var clinVarList = clinVarReader.ToList();

                clinVarList.Sort();
                Console.WriteLine($"{clinVarList.Count} clinvar items read form XML file");

                IEnumerator<ClinVarItem> clinVarEnumerator = clinVarList.GetEnumerator();
                _iSupplementaryDataItemList.Add(clinVarEnumerator);
            }

            if (dgvFileName != null)
            {
                AddSourceVersion(dgvFileName);

                var dgvReader     = new DgvReader(new FileInfo(dgvFileName), _renamer);
                var dgvEnumerator = dgvReader.GetEnumerator();
                _iSupplementaryDataItemList.Add(dgvEnumerator);
            }

            if (clinGenFileName != null)
            {
                AddSourceVersion(clinGenFileName);
                var clinGenReader     = new ClinGenReader(new FileInfo(clinGenFileName), _renamer);
                var clinGenEnumerator = clinGenReader.GetEnumerator();
                _iSupplementaryDataItemList.Add(clinGenEnumerator);
            }

            if (customFiles != null)
            {
                foreach (var customFile in customFiles)
                {
                    AddSourceVersion(customFile);

                    var customReader     = new CustomAnnotationReader(new FileInfo(customFile), _renamer);
                    var customEnumerator = customReader.GetEnumerator();
                    _iSupplementaryDataItemList.Add(customEnumerator);
                }
            }

            // initializing the IEnumerators in the list: advance each one to its first item
            // and drop the ones that have none. An index loop is used because removing from
            // the list while a foreach is enumerating it throws InvalidOperationException.
            var enumeratorIndex = 0;
            while (enumeratorIndex < _iSupplementaryDataItemList.Count)
            {
                if (_iSupplementaryDataItemList[enumeratorIndex].MoveNext())
                {
                    enumeratorIndex++;
                }
                else
                {
                    // the next element slides into this index, so the index is not advanced
                    _iSupplementaryDataItemList.RemoveAt(enumeratorIndex);
                }
            }

            _additionalItemsList = new List<SupplementaryDataItem>();
        }
Ejemplo n.º 13
0
 /// <summary>
 /// Stores the compressed sequence reader together with the sequence object it was given.
 /// </summary>
 /// <param name="reader">compressed sequence reader to keep</param>
 /// <param name="compressedSequence">compressed sequence to keep</param>
 public DataFileManager(CompressedSequenceReader reader, ICompressedSequence compressedSequence)
 {
     this._compressedSequenceReader = reader;
     this._compressedSequence       = compressedSequence;
 }
Ejemplo n.º 14
0
        /// <summary>
        /// Retrieves a sub-range of a sequence region from the compressed sequence.
        /// For forward-strand regions the sub-range is offset from <paramref name="seqStart"/>;
        /// for reverse-strand regions it is measured back from <paramref name="seqEnd"/> and the
        /// retrieved bases are reverse complemented.
        /// </summary>
        /// <param name="seqStart">start of the enclosing region</param>
        /// <param name="seqEnd">end of the enclosing region</param>
        /// <param name="seqOnReverseStrand">true if the enclosing region is on the reverse strand</param>
        /// <param name="subStart">start of the sub-range, relative to the region</param>
        /// <param name="subEnd">end of the sub-range, relative to the region</param>
        /// <param name="cs">compressed sequence the bases are read from</param>
        /// <returns>the bases of the sub-range, oriented to the region's strand</returns>
        public static string GetSubSubstring(int seqStart, int seqEnd, bool seqOnReverseStrand, int subStart, int subEnd, ICompressedSequence cs)
        {
            var numBases = subEnd - subStart + 1;

            if (!seqOnReverseStrand)
            {
                return cs.Substring(seqStart + subStart - 1, numBases);
            }

            // reverse strand: anchor on the region end, then flip the bases
            var reverseBases = cs.Substring(seqEnd - subEnd - 1, numBases);
            return GetReverseComplement(reverseBases);
        }