/// <summary>
/// writes one tab-delimited "gene" record (followed by its key/value attributes)
/// for the specified gene. A gene that has already been observed is skipped.
/// </summary>
/// <param name="writer">the writer receiving the record</param>
/// <param name="ucscReferenceName">the reference name written in the first column</param>
/// <param name="transcriptDataSource">the data source; written in the second column and used to pick the primary gene ID</param>
/// <param name="gene">the gene to write</param>
private void WriteGene(TextWriter writer, string ucscReferenceName, TranscriptDataSource transcriptDataSource, Gene gene)
{
    // each gene is written at most once
    if (_observedGenes.Contains(gene))
    {
        return;
    }

    _observedGenes.Add(gene);

    char strand;
    if (gene.OnReverseStrand)
    {
        strand = '-';
    }
    else
    {
        strand = '+';
    }

    // fixed columns
    writer.Write($"{ucscReferenceName}\t{transcriptDataSource}\tgene\t{gene.Start}\t{gene.End}\t.\t{strand}\t.\t");

    // the primary gene ID comes from whichever data source is being written
    string primaryGeneId;
    if (transcriptDataSource == TranscriptDataSource.Ensembl)
    {
        primaryGeneId = gene.EnsemblId.ToString();
    }
    else
    {
        primaryGeneId = gene.EntrezGeneId.ToString();
    }

    // optional attributes are only written when they carry a value
    if (!string.IsNullOrEmpty(primaryGeneId))
    {
        writer.Write($"gene_id \"{primaryGeneId}\"; ");
    }

    if (!gene.EntrezGeneId.IsEmpty)
    {
        writer.Write($"entrez_gene_id \"{gene.EntrezGeneId}\"; ");
    }

    if (!gene.EnsemblId.IsEmpty)
    {
        writer.Write($"ensembl_gene_id \"{gene.EnsemblId}\"; ");
    }

    if (!string.IsNullOrEmpty(gene.Symbol))
    {
        writer.Write($"gene_name \"{gene.Symbol}\"; ");
    }

    // the internal gene ID is always written and terminates the line
    writer.WriteLine($"internal_gene_id \"{_internalGeneId[gene]}\"; ");
}
/// <summary>
/// looks up the transcript ID prefixes that are accepted for the given data source
/// </summary>
/// <param name="ds">the transcript data source (Ensembl or RefSeq)</param>
/// <returns>the accepted transcript ID prefixes</returns>
/// <exception cref="GeneralException">thrown when the data source is neither Ensembl nor RefSeq</exception>
private static string[] GetTranscriptPrefixWhiteList(TranscriptDataSource ds)
{
    // Transcript IDs in the VEP builds:
    //
    // RefSeq VEP72:      #*, ENSEST*, CCDS*, NC_, NM_, NP_, NR_, XM_, XP_
    // Ensembl VEP72:     ENSE0*, ENSG0*, ENSP0*, ENST0*
    //
    // RefSeq VEP79:      #*, CCDS*, ENSE0*, ENSG0*, ENSP0*, ENST*, [gene names], id*, LOC*, NC_*, NM_*, NP_*, NR_*, XM_*, XP_*, XR_*
    // Ensembl VEP79:     ENSE0*, ENSG0*, ENSP0*, ENST0*
    //
    // RefSeq 2015-04-20: NG_*, NP_*, XP_*, YP_*, NR_*, NM_*, XR_*, XM_*

    if (ds == TranscriptDataSource.Ensembl)
    {
        return new[] { "ENSE0", "ENSG0", "ENSP0", "ENST0" };
    }

    if (ds == TranscriptDataSource.RefSeq)
    {
        return new[] { "NG_", "NM_", "NP_", "NR_", "XM_", "XP_", "XR_", "YP_" };
    }

    throw new GeneralException($"Unhandled import mode found: {ds}");
}
/// <summary>
/// writes the two-line, tab-delimited text header to our output file
/// </summary>
/// <param name="writer">the text writer receiving the header</param>
/// <param name="fileType">the file type, written as its byte value</param>
/// <param name="transcriptSource">the transcript data source, written as its byte value</param>
/// <param name="genomeAssembly">the genome assembly, written as its byte value</param>
private static void WriteHeader(StreamWriter writer, GlobalImportCommon.FileType fileType, TranscriptDataSource transcriptSource, GenomeAssembly genomeAssembly)
{
    // NOTE(review): the release date is parsed with the current culture - confirm that is intended
    var releaseDate      = DateTime.Parse(ConfigurationSettings.VepReleaseDate);
    long vepReleaseTicks = releaseDate.Ticks;

    byte fileTypeCode         = (byte)fileType;
    byte transcriptSourceCode = (byte)transcriptSource;
    byte genomeAssemblyCode   = (byte)genomeAssembly;

    // line 1: header tag and file type
    writer.WriteLine("{0}\t{1}", GlobalImportCommon.Header, fileTypeCode);

    // line 2: VEP version, release ticks, transcript source, and genome assembly
    writer.WriteLine("{0}\t{1}\t{2}\t{3}", ConfigurationSettings.VepVersion, vepReleaseTicks, transcriptSourceCode, genomeAssemblyCode);
}
/// <summary>
/// constructor: stores the supplied header values as-is
/// </summary>
/// <param name="vepVersion">the VEP version</param>
/// <param name="vepReleaseTicks">the VEP release date expressed in ticks</param>
/// <param name="transcriptSource">the transcript data source</param>
/// <param name="genomeAssembly">the genome assembly</param>
public GlobalImportHeader(
    ushort vepVersion,
    long vepReleaseTicks,
    TranscriptDataSource transcriptSource,
    GenomeAssembly genomeAssembly)
{
    // VEP release information
    VepVersion      = vepVersion;
    VepReleaseTicks = vepReleaseTicks;

    // what the data describes
    TranscriptSource = transcriptSource;
    GenomeAssembly   = genomeAssembly;
}
/// <summary>
/// constructor
/// </summary>
/// <param name="genes">the genes handled by this updater; the transcript data source is taken from the first entry</param>
/// <param name="description">the description associated with this updater</param>
/// <param name="geneInfoSource">the gene_info symbol source</param>
/// <param name="hgncSource">the HGNC symbol source</param>
public GeneSymbolUpdater(List<MutableGene> genes, string description, SymbolDataSource geneInfoSource, SymbolDataSource hgncSource)
{
    _genes       = genes;
    _description = description;

    // First() throws if the list is empty, so callers must supply at least one gene
    var firstGene = genes.First();
    _transcriptDataSource = firstGene.TranscriptDataSource;

    _geneInfoSource = geneInfoSource;
    _hgncSource     = hgncSource;
}
/// <summary>
/// constructor: records the cache file details and registers its input files
/// </summary>
/// <param name="path">the cache path</param>
/// <param name="refIndex">the reference index</param>
/// <param name="ds">the transcript data source</param>
/// <param name="type">the mini-cache type</param>
/// <param name="updater">the updater associated with this cache file</param>
private CacheFile(string path, ushort refIndex, TranscriptDataSource ds, MiniCacheType type, IUpdater updater)
{
    CachePath            = path;
    TranscriptDataSource = ds;
    Type                 = type;
    ReferenceIndex       = refIndex;
    Updater              = updater;

    // the input files are derived from the cache path without its extension
    var pathWithoutExtension = FileOperations.GetFullPathWithoutExtension(path);
    AddInputFiles(pathWithoutExtension);
}
/// <summary>
/// replaces the gene symbol with the one registered for the specified gene ID
/// </summary>
/// <param name="source">selects the lookup table: Ensembl IDs for Ensembl, Entrez gene IDs otherwise</param>
/// <param name="gene">the gene whose symbol is updated on success</param>
/// <param name="geneId">the gene ID used as the lookup key</param>
/// <returns>true if the symbol was updated; false if the ID is unknown or its symbol has a conflict</returns>
public bool TryUpdateSymbol(TranscriptDataSource source, MutableGene gene, string geneId)
{
    var symbolsById = source == TranscriptDataSource.Ensembl
        ? _ensemblIdToSymbol
        : _entrezGeneIdToSymbol;

    UniqueString candidate;

    // only an entry that exists and is free of conflicts may be applied
    bool canUpdate = symbolsById.TryGetValue(geneId, out candidate) && !candidate.HasConflict;

    if (canUpdate)
    {
        gene.Symbol = candidate.Value;
    }

    return canUpdate;
}
/// <summary>
/// raises the flag that corresponds to the specified transcript source. Flags are
/// only ever set to true here; any other transcript source leaves both untouched.
/// </summary>
/// <param name="transcriptSource">the transcript source to evaluate</param>
/// <param name="hasEnsembl">set to true when the source is Ensembl</param>
/// <param name="hasRefSeq">set to true when the source is RefSeq</param>
private static void EvaluateTranscriptSource(TranscriptDataSource transcriptSource, ref bool hasEnsembl, ref bool hasRefSeq)
{
    switch (transcriptSource)
    {
        case TranscriptDataSource.Ensembl:
            hasEnsembl = true;
            break;

        case TranscriptDataSource.RefSeq:
            hasRefSeq = true;
            break;
    }
}
/// <summary>
/// constructor
/// </summary>
/// <param name="identifier">the file identifier</param>
/// <param name="schemaVersion">the schema version</param>
/// <param name="dataVersion">the data version</param>
/// <param name="transcriptSource">the transcript data source</param>
/// <param name="creationTimeTicks">the creation time expressed in ticks</param>
/// <param name="genomeAssembly">the genome assembly</param>
/// <param name="index">the prediction custom header, stored in Index</param>
private PredictionCacheHeader(
    string identifier,
    ushort schemaVersion,
    ushort dataVersion,
    TranscriptDataSource transcriptSource,
    long creationTimeTicks,
    GenomeAssembly genomeAssembly,
    PredictionCustomHeader index)
{
    Identifier        = identifier;
    SchemaVersion     = schemaVersion;
    DataVersion       = dataVersion;
    TranscriptSource  = transcriptSource;
    CreationTimeTicks = creationTimeTicks;
    GenomeAssembly    = genomeAssembly;
    Index             = index;

    // NOTE(review): Custom is read here but is not assigned anywhere in this constructor;
    // presumably it is backed by Index or initialized elsewhere - confirm.
    var customSize = Custom.Size;
    Size = Identifier.Length + customSize + InternalSize;
}
/// <summary>
/// constructor
/// </summary>
/// <param name="identifier">the file identifier</param>
/// <param name="schemaVersion">the schema version</param>
/// <param name="dataVersion">the data version</param>
/// <param name="transcriptSource">the transcript data source</param>
/// <param name="creationTimeTicks">the creation time expressed in ticks</param>
/// <param name="genomeAssembly">the genome assembly</param>
/// <param name="customHeader">the custom header, stored in Custom</param>
public FileHeader(
    string identifier,
    ushort schemaVersion,
    ushort dataVersion,
    TranscriptDataSource transcriptSource,
    long creationTimeTicks,
    GenomeAssembly genomeAssembly,
    ICustomFileHeader customHeader)
{
    Identifier        = identifier;
    SchemaVersion     = schemaVersion;
    DataVersion       = dataVersion;
    TranscriptSource  = transcriptSource;
    CreationTimeTicks = creationTimeTicks;
    GenomeAssembly    = genomeAssembly;
    Custom            = customHeader;

    // total size: identifier length + custom header size + fixed internal size + 1
    // NOTE(review): the extra 1 presumably covers the identifier's length prefix - confirm
    Size = identifier.Length + customHeader.Size + InternalSize + 1;
}
/// <summary>
/// verifies that the two transcript sources cover both Ensembl and RefSeq
/// </summary>
/// <param name="ts">the first transcript source</param>
/// <param name="ts2">the second transcript source</param>
/// <exception cref="UserErrorException">thrown unless one source is RefSeq and the other is Ensembl</exception>
private static void CheckTranscriptSource(TranscriptDataSource ts, TranscriptDataSource ts2)
{
    bool foundEnsembl = false;
    bool foundRefSeq  = false;

    foreach (var source in new[] { ts, ts2 })
    {
        EvaluateTranscriptSource(source, ref foundEnsembl, ref foundRefSeq);
    }

    if (foundEnsembl && foundRefSeq)
    {
        return;
    }

    throw new UserErrorException("Expected one RefSeq and one Ensembl cache file. Please revise the --in and --in2 command-line arguments.");
}
/// <summary>
/// writes the binary header to our output file. The fields are written in this order:
/// header tag, file type, VEP version, VEP release ticks, transcript source,
/// genome assembly, and the guard integer.
/// </summary>
/// <param name="writer">the binary writer receiving the header</param>
/// <param name="fileType">the file type, written as a single byte</param>
/// <param name="transcriptSource">the transcript data source, written as a single byte</param>
/// <param name="genomeAssembly">the genome assembly, written as a single byte</param>
private static void WriteHeader(BinaryWriter writer, GlobalImportCommon.FileType fileType, TranscriptDataSource transcriptSource, GenomeAssembly genomeAssembly)
{
    // NOTE(review): the release date is parsed with the current culture - confirm that is intended
    var releaseDate      = DateTime.Parse(ConfigurationSettings.VepReleaseDate);
    long vepReleaseTicks = releaseDate.Ticks;

    byte fileTypeCode         = (byte)fileType;
    byte transcriptSourceCode = (byte)transcriptSource;
    byte genomeAssemblyCode   = (byte)genomeAssembly;

    // file identification
    writer.Write(GlobalImportCommon.Header);
    writer.Write(fileTypeCode);

    // VEP release information
    writer.Write(ConfigurationSettings.VepVersion);
    writer.Write(vepReleaseTicks);

    // data description
    writer.Write(transcriptSourceCode);
    writer.Write(genomeAssemblyCode);

    // the guard integer terminates the header
    writer.Write(CacheConstants.GuardInt);
}
/// <summary>
/// constructor: sets the shared transcript source, then creates the data stores
/// and the per-feature statistics
/// </summary>
/// <param name="ds">the transcript data source assigned to ImportDataStore.TranscriptSource</param>
public VepCacheParser(TranscriptDataSource ds)
{
    // the static transcript source is assigned before any data store is created
    ImportDataStore.TranscriptSource = ds;

    // data stores
    _uniqueDataStore    = new ImportDataStore();
    _nonUniquedataStore = new ImportDataStore();
    _tempDataStore      = new ImportDataStore();

    // feature statistics
    _regulatoryStatistics = new FeatureStatistics("Regulatory");
    _transcriptStatistics = new FeatureStatistics("Transcripts");
    _geneStatistics       = new FeatureStatistics("Genes");
    _intronStatistics     = new FeatureStatistics("Introns");
    _exonStatistics       = new FeatureStatistics("Exons");
    _mirnaStatistics      = new FeatureStatistics("miRNAs");

    // prediction and sequence statistics
    _siftStatistics     = new FeatureStatistics("SIFT matrices");
    _polyphenStatistics = new FeatureStatistics("PolyPhen matrices");
    _cdnaStatistics     = new FeatureStatistics("cDNA seqs");
    _peptideStatistics  = new FeatureStatistics("Peptide seqs");
}
/// <summary>
/// appends a transcript to the list that matches its data source
/// </summary>
/// <param name="transcript">the transcript to add</param>
/// <param name="transcriptDataSource">selects the target list (Ensembl or RefSeq)</param>
/// <exception cref="GeneralException">thrown when the data source is neither Ensembl nor RefSeq</exception>
public void AddTranscript(Transcript transcript, TranscriptDataSource transcriptDataSource)
{
    IList<IAnnotatedTranscript> targetList;

    if (transcriptDataSource == TranscriptDataSource.Ensembl)
    {
        targetList = EnsemblTranscripts;
    }
    else if (transcriptDataSource == TranscriptDataSource.RefSeq)
    {
        targetList = RefSeqTranscripts;
    }
    else
    {
        throw new GeneralException($"Found a transcript ({transcript.TranscriptID}) with an unexpected transcript data source ({transcriptDataSource})");
    }

    targetList.Add(transcript);
}
/// <summary>
/// constructor
/// </summary>
/// <param name="referenceIndex">the reference index (passed to the base constructor)</param>
/// <param name="start">the start position (passed to the base constructor)</param>
/// <param name="end">the end position (passed to the base constructor)</param>
/// <param name="id">the transcript ID</param>
/// <param name="version">the transcript version</param>
/// <param name="translation">the translation</param>
/// <param name="bioType">the biotype</param>
/// <param name="gene">the gene that this transcript belongs to</param>
/// <param name="totalExonLength">
/// kept for backward compatibility only: the stored total exon length is always
/// recomputed from <paramref name="cdnaMaps"/>, so this value is ignored
/// </param>
/// <param name="startExonPhase">the start exon phase</param>
/// <param name="isCanonical">true if this is the canonical transcript</param>
/// <param name="introns">the introns</param>
/// <param name="microRnas">the miRNAs</param>
/// <param name="cdnaMaps">the cDNA coordinate maps; also used to derive the total exon length</param>
/// <param name="siftIndex">the SIFT index</param>
/// <param name="polyPhenIndex">the PolyPhen index</param>
/// <param name="transcriptSource">the transcript data source</param>
public Transcript(ushort referenceIndex, int start, int end, CompactId id, byte version,
    Translation translation, BioType bioType, Gene gene, int totalExonLength, byte startExonPhase,
    bool isCanonical, SimpleInterval[] introns, SimpleInterval[] microRnas, CdnaCoordinateMap[] cdnaMaps,
    int siftIndex, int polyPhenIndex, TranscriptDataSource transcriptSource)
    : base(referenceIndex, start, end)
{
    Id               = id;
    Version          = version;
    Translation      = translation;
    BioType          = bioType;
    Gene             = gene;
    StartExonPhase   = startExonPhase;
    IsCanonical      = isCanonical;
    Introns          = introns;
    MicroRnas        = microRnas;
    CdnaMaps         = cdnaMaps;
    SiftIndex        = siftIndex;
    PolyPhenIndex    = polyPhenIndex;
    TranscriptSource = transcriptSource;

    // The original assigned totalExonLength first and then unconditionally overwrote it
    // with the value below. The dead first assignment has been removed; the resulting
    // state is unchanged.
    TotalExonLength = TranscriptUtilities.GetTotalExonLength(cdnaMaps);
}
/// <summary>
/// constructor: records the transcript source and gene name and, when the breakend
/// falls inside the transcript's coding region, builds the HGVS description
/// </summary>
/// <param name="transcript">the transcript being annotated</param>
/// <param name="breakendPosition">the genomic position of the breakend</param>
/// <param name="isBreakendSuffix">the character compared against the transcript orientation ('+' or '-')</param>
public BreakendTranscriptAnnotation(Transcript transcript, int breakendPosition, char isBreakendSuffix)
{
    var transcriptId = TranscriptUtilities.GetTranscriptId(transcript);

    TranscriptDataSource = transcript.TranscriptSource;
    GeneName             = transcript.Gene.Symbol;

    // nothing more to do when there is no translation or when the breakend lies
    // outside of the coding region
    var translation = transcript.Translation;

    bool outsideCodingRegion = translation == null
        || breakendPosition < translation.CodingRegion.GenomicStart
        || breakendPosition > translation.CodingRegion.GenomicEnd;

    if (outsideCodingRegion)
    {
        InCodingRegion = false;
        return;
    }

    InCodingRegion = true;

    var codingRegion         = translation.CodingRegion;
    var transcriptCdnaLength = codingRegion.CdnaEnd - codingRegion.CdnaStart + 1;

    _referenceName = "chr";
    _codingStart   = codingRegion.GenomicStart;
    _codingEnd     = codingRegion.GenomicEnd;

    // map the cDNA position, then locate the exon/intron
    var complementaryCdnaPosDescription = MapCdnaPosition(transcript, breakendPosition);
    LocateExonIntron(transcript, breakendPosition);

    char transcriptOrientation;
    if (transcript.Gene.OnReverseStrand)
    {
        transcriptOrientation = '-';
    }
    else
    {
        transcriptOrientation = '+';
    }

    ConsistentOrientation = transcriptOrientation == isBreakendSuffix;

    // both descriptions share the same "GENE{transcript}:c." prefix
    var hgvsPrefix = GeneName + "{" + transcriptId + "}" + ":c.";

    if (ConsistentOrientation)
    {
        // from the mapped position to the end of the coding sequence
        HgvsDescription = hgvsPrefix + complementaryCdnaPosDescription + "_" + transcriptCdnaLength;
    }
    else
    {
        // from the first coding base to the mapped position
        HgvsDescription = hgvsPrefix + 1 + "_" + complementaryCdnaPosDescription;
    }
}
/// <summary>
/// constructor: packs the transcript attributes into the _info and _contents bit fields
/// </summary>
/// <param name="bioType">the biotype, stored in the lowest bits of _info</param>
/// <param name="version">the transcript version, shifted by VersionShift</param>
/// <param name="transcriptSource">the transcript data source, shifted by TranscriptSourceShift</param>
/// <param name="isCanonical">sets CanonicalMask in _info</param>
/// <param name="hasSift">sets SiftMask in _contents</param>
/// <param name="hasPolyPhen">sets PolyPhenMask in _contents</param>
/// <param name="hasMicroRnas">sets MirnasMask in _contents</param>
/// <param name="hasIntrons">sets IntronsMask in _contents</param>
/// <param name="hasCdnaMaps">sets CdnaMapsMask in _contents</param>
/// <param name="hasTranslation">sets TranslationMask in _contents</param>
/// <param name="startExonPhase">the start exon phase, shifted by StartExonShift in _contents</param>
public EncodedTranscriptData(BioType bioType, byte version, TranscriptDataSource transcriptSource,
    bool isCanonical, bool hasSift, bool hasPolyPhen, bool hasMicroRnas, bool hasIntrons,
    bool hasCdnaMaps, bool hasTranslation, byte startExonPhase)
{
    // _info: biotype | version | transcript source (| canonical flag)
    _info = (ushort)((ushort)bioType
                     | (ushort)(version << VersionShift)
                     | (ushort)((ushort)transcriptSource << TranscriptSourceShift));

    if (isCanonical)
    {
        _info |= CanonicalMask;
    }

    // _contents: start exon phase plus one flag per optional section
    _contents = (byte)(startExonPhase << StartExonShift);

    if (hasSift)
    {
        _contents |= SiftMask;
    }

    if (hasPolyPhen)
    {
        _contents |= PolyPhenMask;
    }

    if (hasMicroRnas)
    {
        _contents |= MirnasMask;
    }

    if (hasIntrons)
    {
        _contents |= IntronsMask;
    }

    if (hasCdnaMaps)
    {
        _contents |= CdnaMapsMask;
    }

    if (hasTranslation)
    {
        _contents |= TranslationMask;
    }
}
/// <summary>
/// creates the data bundle (reference sequence, cache, prediction readers, and
/// transcript interval forest) for the specified genome assembly and data source
/// </summary>
/// <param name="genomeAssembly">the genome assembly name used to locate the reference and cache files</param>
/// <param name="ds">the transcript data source used to locate the cache files</param>
/// <returns>the populated data bundle, or null if the cache could not be loaded</returns>
private DataBundle GetDataBundle(string genomeAssembly, TranscriptDataSource ds)
{
    var compressedSequencePath = GetCompressedSequencePath(_referenceDir, genomeAssembly);
    var cachePrefix            = GetCachePrefix(_cacheRoot, genomeAssembly, ds, _newVepVersion);

    // Load the cache before opening any stream or reader. Previously the sequence
    // stream and both prediction readers were opened first and then left open
    // whenever the cache turned out to be unavailable.
    var cache = CacheUtilities.LoadCache(cachePrefix);

    if (cache == null)
    {
        return null;
    }

    var sequence = new CompressedSequence();

    var bundle = new DataBundle
    {
        Sequence       = sequence,
        SequenceReader = new CompressedSequenceReader(FileUtilities.GetReadStream(compressedSequencePath), sequence),
        Cache          = cache,
        SiftReader     = CacheUtilities.GetPredictionReader(CacheConstants.SiftPath(cachePrefix)),
        PolyPhenReader = CacheUtilities.GetPredictionReader(CacheConstants.PolyPhenPath(cachePrefix))
    };

    bundle.TranscriptForest = CacheUtilities.GetIntervalForest(bundle.Cache.Transcripts, bundle.Sequence.Renamer.NumRefSeqs);

    return bundle;
}
/// <summary>
/// returns a new list containing only the genes that belong to the desired data source
/// </summary>
/// <param name="genes">the genes to filter</param>
/// <param name="desiredDataSource">the transcript data source to keep</param>
/// <returns>the matching genes, in their original order</returns>
public static List<MutableGene> GetGenesByDataSource(List<MutableGene> genes, TranscriptDataSource desiredDataSource)
{
    var matchingGenes =
        from gene in genes
        where gene.TranscriptDataSource == desiredDataSource
        select gene;

    return matchingGenes.ToList();
}
/// <summary>
/// builds the cache prefix: cache root / genome assembly / data source name followed by the VEP version
/// </summary>
/// <param name="cacheRoot">the cache root directory</param>
/// <param name="genomeAssembly">the genome assembly directory name</param>
/// <param name="ds">the transcript data source</param>
/// <param name="vepVersion">the VEP version</param>
/// <returns>the combined cache prefix path</returns>
private static string GetCachePrefix(string cacheRoot, string genomeAssembly, TranscriptDataSource ds, ushort vepVersion)
{
    // the last path component is the data source name followed by the VEP version
    var sourceAndVersion = $"{ds}{vepVersion}";
    var cachePrefix      = Path.Combine(cacheRoot, genomeAssembly, sourceAndVersion);

    return cachePrefix;
}
/// <summary>
/// processes every cache file that belongs to the specified transcript data source,
/// in ascending reference index order. The data source is skipped entirely when its
/// data bundle cannot be created.
/// </summary>
/// <param name="cacheFiles">all candidate cache files (any data source)</param>
/// <param name="genomeAssembly">the genome assembly used to create the data bundle</param>
/// <param name="ds">the transcript data source to process</param>
private void ProcessTranscriptDataSource(IEnumerable<CacheFile> cacheFiles, string genomeAssembly, TranscriptDataSource ds)
{
    // the counters are reset for each data source
    _numOutdatedFiles = 0;
    _numCurrentFiles  = 0;

    var bundle = GetDataBundle(genomeAssembly, ds);

    if (bundle == null)
    {
        Console.WriteLine("- skipping transcript data source: {0}", ds);
        return;
    }

    Console.WriteLine("- Transcript data source: {0}", ds);

    // sort all of the files first, then keep the ones that match this data source
    var matchingFiles = cacheFiles
        .OrderBy(file => file.ReferenceIndex)
        .Where(file => file.TranscriptDataSource == ds);

    foreach (var matchingFile in matchingFiles)
    {
        ProcessCacheFile(matchingFile, bundle);
    }

    bool nothingOutdated = _numOutdatedFiles == 0;
    bool hasCurrentFiles = _numCurrentFiles > 0;

    if (nothingOutdated && hasCurrentFiles)
    {
        Console.WriteLine(" - All {0} files are already up-to-date.", _numCurrentFiles);
    }
}