Exemple #1
0
        /// <summary>
        /// writes one tab-delimited "gene" line (followed by its key/value attributes) for the
        /// supplied gene. Genes that have already been observed are silently skipped.
        /// </summary>
        /// <param name="writer">the text writer that receives the line</param>
        /// <param name="ucscReferenceName">the reference name written to the first column</param>
        /// <param name="transcriptDataSource">written to the second column; also decides whether the Ensembl or the Entrez ID is used as gene_id</param>
        /// <param name="gene">the gene to write</param>
        private void WriteGene(TextWriter writer, string ucscReferenceName,
                               TranscriptDataSource transcriptDataSource, Gene gene)
        {
            // every gene is emitted at most once
            if (_observedGenes.Contains(gene)) return;
            _observedGenes.Add(gene);

            char strandSymbol;
            if (gene.OnReverseStrand) strandSymbol = '-';
            else strandSymbol = '+';

            // fixed columns: reference, source, feature, start, end, score, strand, frame
            writer.Write($"{ucscReferenceName}\t{transcriptDataSource}\tgene\t{gene.Start}\t{gene.End}\t.\t{strandSymbol}\t.\t");

            // the primary gene ID follows the data source that is being written
            string primaryGeneId;
            if (transcriptDataSource == TranscriptDataSource.Ensembl) primaryGeneId = gene.EnsemblId.ToString();
            else primaryGeneId = gene.EntrezGeneId.ToString();

            // optional attributes are only written when they carry a value
            if (!string.IsNullOrEmpty(primaryGeneId)) writer.Write($"gene_id \"{primaryGeneId}\"; ");
            if (!gene.EntrezGeneId.IsEmpty) writer.Write($"entrez_gene_id \"{gene.EntrezGeneId}\"; ");
            if (!gene.EnsemblId.IsEmpty) writer.Write($"ensembl_gene_id \"{gene.EnsemblId}\"; ");
            if (!string.IsNullOrEmpty(gene.Symbol)) writer.Write($"gene_name \"{gene.Symbol}\"; ");

            // the internal gene ID is always present and terminates the line
            writer.WriteLine($"internal_gene_id \"{_internalGeneId[gene]}\"; ");
        }
Exemple #2
0
        /// <summary>
        /// returns the transcript ID prefixes that are accepted for the specified data source
        /// </summary>
        /// <param name="ds">the transcript data source</param>
        /// <returns>a newly allocated array containing the accepted prefixes</returns>
        /// <exception cref="GeneralException">thrown when the data source is neither Ensembl nor RefSeq</exception>
        private static string[] GetTranscriptPrefixWhiteList(TranscriptDataSource ds)
        {
            // Transcript IDs in the VEP builds (kept for reference):
            //
            // RefSeq  VEP72: #*, ENSEST*, CCDS*, NC_, NM_, NP_, NR_, XM_, XP_
            // Ensembl VEP72: ENSE0*, ENSG0*, ENSP0*, ENST0*
            //
            // RefSeq  VEP79: #*, CCDS*, ENSE0*, ENSG0*, ENSP0*, ENST*, [gene names], id*, LOC*, NC_*, NM_*, NP_*, NR_*, XM_*, XP_*, XR_*
            // Ensembl VEP79: ENSE0*, ENSG0*, ENSP0*, ENST0*
            //
            // RefSeq 2015-04-20: NG_*, NP_*, XP_*, YP_*, NR_*, NM_*, XR_*, XM_*
            if (ds == TranscriptDataSource.Ensembl)
            {
                return new[] { "ENSE0", "ENSG0", "ENSP0", "ENST0" };
            }

            if (ds == TranscriptDataSource.RefSeq)
            {
                return new[] { "NG_", "NM_", "NP_", "NR_", "XM_", "XP_", "XR_", "YP_" };
            }

            throw new GeneralException($"Unhandled import mode found: {ds}");
        }
        /// <summary>
        /// writes the two tab-delimited header lines to our text output file
        /// </summary>
        /// <param name="writer">the stream writer that receives the header</param>
        /// <param name="fileType">the file type, written as its byte value</param>
        /// <param name="transcriptSource">the transcript data source, written as its byte value</param>
        /// <param name="genomeAssembly">the genome assembly, written as its byte value</param>
        private static void WriteHeader(StreamWriter writer, GlobalImportCommon.FileType fileType,
                                        TranscriptDataSource transcriptSource, GenomeAssembly genomeAssembly)
        {
            // NOTE(review): DateTime.Parse uses the current culture here - confirm that the
            // configured release date is meant to be interpreted in the machine's locale
            var releaseDate   = DateTime.Parse(ConfigurationSettings.VepReleaseDate);
            long releaseTicks = releaseDate.Ticks;

            // line 1: header tag and file type
            writer.WriteLine("{0}\t{1}", GlobalImportCommon.Header, (byte)fileType);

            // line 2: VEP version, release date (in ticks), transcript source and genome assembly
            writer.WriteLine("{0}\t{1}\t{2}\t{3}", ConfigurationSettings.VepVersion, releaseTicks, (byte)transcriptSource, (byte)genomeAssembly);
        }
Exemple #4
0
 /// <summary>
 /// constructor: stores the supplied header values unchanged
 /// </summary>
 /// <param name="vepVersion">the VEP version</param>
 /// <param name="vepReleaseTicks">the VEP release date expressed in ticks</param>
 /// <param name="transcriptSource">the transcript data source</param>
 /// <param name="genomeAssembly">the genome assembly</param>
 public GlobalImportHeader(ushort vepVersion, long vepReleaseTicks, TranscriptDataSource transcriptSource,
                           GenomeAssembly genomeAssembly)
 {
     this.VepVersion = vepVersion;
     this.VepReleaseTicks = vepReleaseTicks;
     this.TranscriptSource = transcriptSource;
     this.GenomeAssembly = genomeAssembly;
 }
Exemple #5
0
 /// <summary>
 /// constructor
 /// </summary>
 /// <param name="genes">the genes to update; must contain at least one gene because the transcript data source is taken from the first entry</param>
 /// <param name="description">the description associated with this updater</param>
 /// <param name="geneInfoSource">the gene_info symbol data source</param>
 /// <param name="hgncSource">the HGNC symbol data source</param>
 public GeneSymbolUpdater(List<MutableGene> genes, string description, SymbolDataSource geneInfoSource,
                          SymbolDataSource hgncSource)
 {
     _genes = genes;
     _description = description;

     // the first gene decides the transcript data source (First() throws on an empty list)
     var firstGene = genes.First();
     _transcriptDataSource = firstGene.TranscriptDataSource;

     _geneInfoSource = geneInfoSource;
     _hgncSource = hgncSource;
 }
Exemple #6
0
        /// <summary>
        /// constructor: stores the cache file attributes and registers the associated input files
        /// </summary>
        /// <param name="path">the cache file path</param>
        /// <param name="refIndex">the reference index</param>
        /// <param name="ds">the transcript data source</param>
        /// <param name="type">the mini-cache type</param>
        /// <param name="updater">the updater associated with this cache file</param>
        private CacheFile(string path, ushort refIndex, TranscriptDataSource ds, MiniCacheType type, IUpdater updater)
        {
            CachePath = path;
            TranscriptDataSource = ds;
            Type = type;
            ReferenceIndex = refIndex;
            Updater = updater;

            // the input files are registered using the cache path stripped of its extension
            var pathWithoutExtension = FileOperations.GetFullPathWithoutExtension(path);
            AddInputFiles(pathWithoutExtension);
        }
        /// <summary>
        /// tries to replace the gene symbol with the symbol registered for the supplied gene ID
        /// </summary>
        /// <param name="source">Ensembl selects the Ensembl ID table; any other value selects the Entrez gene ID table</param>
        /// <param name="gene">the gene whose symbol is updated on success</param>
        /// <param name="geneId">the gene ID used as the lookup key</param>
        /// <returns>true if the symbol was updated, false if the ID is unknown or its symbol has a conflict</returns>
        public bool TryUpdateSymbol(TranscriptDataSource source, MutableGene gene, string geneId)
        {
            var symbolsById = source == TranscriptDataSource.Ensembl ? _ensemblIdToSymbol : _entrezGeneIdToSymbol;

            // only an unambiguous entry may overwrite the current symbol
            UniqueString candidate;
            var canUpdate = symbolsById.TryGetValue(geneId, out candidate) && !candidate.HasConflict;

            if (canUpdate)
            {
                gene.Symbol = candidate.Value;
            }

            return canUpdate;
        }
Exemple #8
0
 /// <summary>
 /// sets the flag that corresponds to the supplied transcript source. Flags are only
 /// ever set to true here; any other transcript source leaves both flags untouched.
 /// </summary>
 /// <param name="transcriptSource">the transcript source to evaluate</param>
 /// <param name="hasEnsembl">set to true when the source is Ensembl</param>
 /// <param name="hasRefSeq">set to true when the source is RefSeq</param>
 private static void EvaluateTranscriptSource(TranscriptDataSource transcriptSource, ref bool hasEnsembl,
                                              ref bool hasRefSeq)
 {
     switch (transcriptSource)
     {
         case TranscriptDataSource.Ensembl:
             hasEnsembl = true;
             break;

         case TranscriptDataSource.RefSeq:
             hasRefSeq = true;
             break;
     }
 }
 /// <summary>
 /// constructor: stores the header fields and derives the header size
 /// </summary>
 /// <param name="identifier">the file identifier; its length contributes to the header size</param>
 /// <param name="schemaVersion">the schema version</param>
 /// <param name="dataVersion">the data version</param>
 /// <param name="transcriptSource">the transcript data source</param>
 /// <param name="creationTimeTicks">the creation time expressed in ticks</param>
 /// <param name="genomeAssembly">the genome assembly</param>
 /// <param name="index">the prediction-specific custom header</param>
 private PredictionCacheHeader(string identifier, ushort schemaVersion, ushort dataVersion,
                               TranscriptDataSource transcriptSource, long creationTimeTicks, GenomeAssembly genomeAssembly,
                               PredictionCustomHeader index)
 {
     Identifier = identifier;
     SchemaVersion = schemaVersion;
     DataVersion = dataVersion;
     TranscriptSource = transcriptSource;
     CreationTimeTicks = creationTimeTicks;
     GenomeAssembly = genomeAssembly;
     Index = index;

     // NOTE(review): Custom is not assigned in this constructor - presumably it is
     // backed by Index or initialized elsewhere; confirm before relying on Size
     Size = Identifier.Length + Custom.Size + InternalSize;
 }
Exemple #10
0
 /// <summary>
 /// constructor: stores the header fields and derives the header size
 /// </summary>
 /// <param name="identifier">the file identifier; its length contributes to the header size</param>
 /// <param name="schemaVersion">the schema version</param>
 /// <param name="dataVersion">the data version</param>
 /// <param name="transcriptSource">the transcript data source</param>
 /// <param name="creationTimeTicks">the creation time expressed in ticks</param>
 /// <param name="genomeAssembly">the genome assembly</param>
 /// <param name="customHeader">the custom header; its size contributes to the header size</param>
 public FileHeader(string identifier, ushort schemaVersion, ushort dataVersion,
                   TranscriptDataSource transcriptSource, long creationTimeTicks, GenomeAssembly genomeAssembly,
                   ICustomFileHeader customHeader)
 {
     Identifier = identifier;
     SchemaVersion = schemaVersion;
     DataVersion = dataVersion;
     TranscriptSource = transcriptSource;
     CreationTimeTicks = creationTimeTicks;
     GenomeAssembly = genomeAssembly;
     Custom = customHeader;

     // NOTE(review): the extra 1 presumably accounts for a length prefix on the identifier - confirm
     Size = identifier.Length + customHeader.Size + InternalSize + 1;
 }
Exemple #11
0
        /// <summary>
        /// verifies that the two transcript sources consist of one Ensembl and one RefSeq source
        /// </summary>
        /// <param name="ts">the transcript source of the first cache file</param>
        /// <param name="ts2">the transcript source of the second cache file</param>
        /// <exception cref="UserErrorException">thrown when either Ensembl or RefSeq is missing</exception>
        private static void CheckTranscriptSource(TranscriptDataSource ts, TranscriptDataSource ts2)
        {
            bool sawEnsembl = false;
            bool sawRefSeq  = false;

            EvaluateTranscriptSource(ts, ref sawEnsembl, ref sawRefSeq);
            EvaluateTranscriptSource(ts2, ref sawEnsembl, ref sawRefSeq);

            // both sources are present: nothing to complain about
            if (sawEnsembl && sawRefSeq) return;

            throw new UserErrorException("Expected one RefSeq and one Ensembl cache file. Please revise the --in and --in2 command-line arguments.");
        }
        /// <summary>
        /// writes the binary header to our output file: header tag, file type, VEP version,
        /// VEP release date (in ticks), transcript source, genome assembly and a guard integer
        /// </summary>
        /// <param name="writer">the binary writer that receives the header</param>
        /// <param name="fileType">the file type, written as a single byte</param>
        /// <param name="transcriptSource">the transcript data source, written as a single byte</param>
        /// <param name="genomeAssembly">the genome assembly, written as a single byte</param>
        private static void WriteHeader(BinaryWriter writer, GlobalImportCommon.FileType fileType,
                                        TranscriptDataSource transcriptSource, GenomeAssembly genomeAssembly)
        {
            // NOTE(review): DateTime.Parse uses the current culture here - confirm that the
            // configured release date is meant to be interpreted in the machine's locale
            var releaseDate   = DateTime.Parse(ConfigurationSettings.VepReleaseDate);
            long releaseTicks = releaseDate.Ticks;

            // file identification
            writer.Write(GlobalImportCommon.Header);
            writer.Write((byte)fileType);

            // VEP provenance
            writer.Write(ConfigurationSettings.VepVersion);
            writer.Write(releaseTicks);

            // data description, terminated by the guard integer
            writer.Write((byte)transcriptSource);
            writer.Write((byte)genomeAssembly);
            writer.Write(CacheConstants.GuardInt);
        }
Exemple #13
0
        /// <summary>
        /// constructor: records the transcript source, then creates the data stores and
        /// the per-feature statistics
        /// </summary>
        /// <param name="ds">the transcript data source, stored in the static ImportDataStore.TranscriptSource</param>
        public VepCacheParser(TranscriptDataSource ds)
        {
            // this is static state, set before any of the data stores below are created
            ImportDataStore.TranscriptSource = ds;

            // data stores
            _uniqueDataStore = new ImportDataStore();
            _nonUniquedataStore = new ImportDataStore();
            _tempDataStore = new ImportDataStore();

            // feature statistics
            _regulatoryStatistics = new FeatureStatistics("Regulatory");
            _transcriptStatistics = new FeatureStatistics("Transcripts");
            _geneStatistics = new FeatureStatistics("Genes");
            _intronStatistics = new FeatureStatistics("Introns");
            _exonStatistics = new FeatureStatistics("Exons");
            _mirnaStatistics = new FeatureStatistics("miRNAs");

            // prediction and sequence statistics
            _siftStatistics = new FeatureStatistics("SIFT matrices");
            _polyphenStatistics = new FeatureStatistics("PolyPhen matrices");
            _cdnaStatistics = new FeatureStatistics("cDNA seqs");
            _peptideStatistics = new FeatureStatistics("Peptide seqs");
        }
Exemple #14
0
        /// <summary>
        /// adds a transcript to the transcript list that matches its data source
        /// </summary>
        /// <param name="transcript">the transcript to add</param>
        /// <param name="transcriptDataSource">selects the Ensembl or the RefSeq transcript list</param>
        /// <exception cref="GeneralException">thrown when the data source is neither Ensembl nor RefSeq</exception>
        public void AddTranscript(Transcript transcript, TranscriptDataSource transcriptDataSource)
        {
            IList<IAnnotatedTranscript> destination;

            if (transcriptDataSource == TranscriptDataSource.Ensembl)
            {
                destination = EnsemblTranscripts;
            }
            else if (transcriptDataSource == TranscriptDataSource.RefSeq)
            {
                destination = RefSeqTranscripts;
            }
            else
            {
                throw new GeneralException($"Found a transcript ({transcript.TranscriptID}) with an unexpected transcript data source ({transcriptDataSource})");
            }

            destination.Add(transcript);
        }
Exemple #15
0
 /// <summary>
 /// constructor
 /// </summary>
 /// <param name="referenceIndex">the reference index (passed to the base class)</param>
 /// <param name="start">the start position (passed to the base class)</param>
 /// <param name="end">the end position (passed to the base class)</param>
 /// <param name="id">the transcript ID</param>
 /// <param name="version">the transcript version</param>
 /// <param name="translation">the translation</param>
 /// <param name="bioType">the biotype</param>
 /// <param name="gene">the gene that owns this transcript</param>
 /// <param name="totalExonLength">
 /// not used: the total exon length is always recomputed from <paramref name="cdnaMaps"/>.
 /// The parameter is retained so that existing callers keep compiling.
 /// </param>
 /// <param name="startExonPhase">the start exon phase</param>
 /// <param name="isCanonical">true if this is the canonical transcript</param>
 /// <param name="introns">the introns</param>
 /// <param name="microRnas">the miRNAs</param>
 /// <param name="cdnaMaps">the cDNA coordinate maps; also used to derive the total exon length</param>
 /// <param name="siftIndex">the SIFT index</param>
 /// <param name="polyPhenIndex">the PolyPhen index</param>
 /// <param name="transcriptSource">the transcript data source</param>
 public Transcript(ushort referenceIndex, int start, int end, CompactId id, byte version,
                   Translation translation, BioType bioType, Gene gene, int totalExonLength, byte startExonPhase,
                   bool isCanonical, SimpleInterval[] introns, SimpleInterval[] microRnas, CdnaCoordinateMap[] cdnaMaps,
                   int siftIndex, int polyPhenIndex, TranscriptDataSource transcriptSource) : base(referenceIndex, start, end)
 {
     Id               = id;
     Version          = version;
     Translation      = translation;
     BioType          = bioType;
     Gene             = gene;
     StartExonPhase   = startExonPhase;
     IsCanonical      = isCanonical;
     Introns          = introns;
     MicroRnas        = microRnas;
     CdnaMaps         = cdnaMaps;
     SiftIndex        = siftIndex;
     PolyPhenIndex    = polyPhenIndex;
     TranscriptSource = transcriptSource;

     // The total exon length is always derived from the cDNA maps. The totalExonLength
     // argument used to be stored first and then immediately overwritten by this line,
     // so that redundant assignment has been removed; the resulting value is unchanged.
     TotalExonLength  = TranscriptUtilities.GetTotalExonLength(cdnaMaps);
 }
        /// <summary>
        /// constructor: describes how a breakend relates to a transcript. The HGVS description
        /// and coding-region details are only populated when the transcript has a translation
        /// and the breakend lies inside its coding region.
        /// </summary>
        /// <param name="transcript">the transcript being annotated</param>
        /// <param name="breakendPosition">the genomic position of the breakend</param>
        /// <param name="isBreakendSuffix">compared with the transcript orientation ('+' or '-') to decide which HGVS form is produced</param>
        public BreakendTranscriptAnnotation(Transcript transcript, int breakendPosition, char isBreakendSuffix)
        {
            var transcriptId = TranscriptUtilities.GetTranscriptId(transcript);

            TranscriptDataSource = transcript.TranscriptSource;
            GeneName             = transcript.Gene.Symbol;

            // no translation, or a breakend outside the coding region: nothing more to describe
            var translation = transcript.Translation;
            var isOutsideCodingRegion = translation == null ||
                                        breakendPosition < translation.CodingRegion.GenomicStart ||
                                        breakendPosition > translation.CodingRegion.GenomicEnd;

            InCodingRegion = !isOutsideCodingRegion;
            if (isOutsideCodingRegion) return;

            var codingRegion     = translation.CodingRegion;
            var codingCdnaLength = codingRegion.CdnaEnd - codingRegion.CdnaStart + 1;

            _referenceName = "chr";
            _codingStart   = codingRegion.GenomicStart;
            _codingEnd     = codingRegion.GenomicEnd;

            // map the breakend to its cDNA position, then locate the surrounding exon/intron
            var cdnaPosDescription = MapCdnaPosition(transcript, breakendPosition);
            LocateExonIntron(transcript, breakendPosition);

            var orientation = transcript.Gene.OnReverseStrand ? '-' : '+';
            ConsistentOrientation = orientation == isBreakendSuffix;

            // both HGVS forms share the "gene{transcript}:c." prefix
            var hgvsPrefix = GeneName + "{" + transcriptId + "}" + ":c.";

            HgvsDescription = ConsistentOrientation
                ? hgvsPrefix + cdnaPosDescription + "_" + codingCdnaLength
                : hgvsPrefix + 1 + "_" + cdnaPosDescription;
        }
        /// <summary>
        /// constructor: packs the transcript attributes into two bit fields. _info receives the
        /// biotype, version, transcript source and canonical flag; _contents receives the start
        /// exon phase and one presence flag per optional transcript component.
        /// </summary>
        /// <param name="bioType">the biotype, stored unshifted in _info</param>
        /// <param name="version">the transcript version, shifted by VersionShift</param>
        /// <param name="transcriptSource">the transcript data source, shifted by TranscriptSourceShift</param>
        /// <param name="isCanonical">sets CanonicalMask in _info when true</param>
        /// <param name="hasSift">sets SiftMask in _contents when true</param>
        /// <param name="hasPolyPhen">sets PolyPhenMask in _contents when true</param>
        /// <param name="hasMicroRnas">sets MirnasMask in _contents when true</param>
        /// <param name="hasIntrons">sets IntronsMask in _contents when true</param>
        /// <param name="hasCdnaMaps">sets CdnaMapsMask in _contents when true</param>
        /// <param name="hasTranslation">sets TranslationMask in _contents when true</param>
        /// <param name="startExonPhase">the start exon phase, shifted by StartExonShift</param>
        public EncodedTranscriptData(BioType bioType, byte version, TranscriptDataSource transcriptSource,
                                     bool isCanonical, bool hasSift, bool hasPolyPhen, bool hasMicroRnas, bool hasIntrons, bool hasCdnaMaps,
                                     bool hasTranslation, byte startExonPhase)
        {
            // _info: biotype, then version and transcript source OR-ed in at their shift offsets.
            // NOTE(review): nothing here validates that the shifted values stay within their
            // intended bit ranges - confirm that callers guarantee this.
            _info  = (ushort)bioType;
            _info |= (ushort)(version << VersionShift);
            _info |= (ushort)((ushort)transcriptSource << TranscriptSourceShift);
            if (isCanonical)
            {
                _info |= CanonicalMask;
            }

            // _contents: start exon phase at its shift offset, followed by the presence flags
            _contents = (byte)(startExonPhase << StartExonShift);
            if (hasSift)
            {
                _contents |= SiftMask;
            }
            if (hasPolyPhen)
            {
                _contents |= PolyPhenMask;
            }
            if (hasMicroRnas)
            {
                _contents |= MirnasMask;
            }
            if (hasIntrons)
            {
                _contents |= IntronsMask;
            }
            if (hasCdnaMaps)
            {
                _contents |= CdnaMapsMask;
            }
            if (hasTranslation)
            {
                _contents |= TranslationMask;
            }
        }
        /// <summary>
        /// builds the data bundle (reference sequence, cache, prediction readers and transcript
        /// interval forest) for the specified genome assembly and transcript data source
        /// </summary>
        /// <param name="genomeAssembly">the genome assembly name used to locate the reference and cache files</param>
        /// <param name="ds">the transcript data source used to locate the cache files</param>
        /// <returns>the populated data bundle, or null when the cache could not be loaded</returns>
        private DataBundle GetDataBundle(string genomeAssembly, TranscriptDataSource ds)
        {
            var compressedSequencePath = GetCompressedSequencePath(_referenceDir, genomeAssembly);
            var cachePrefix            = GetCachePrefix(_cacheRoot, genomeAssembly, ds, _newVepVersion);

            // Load the cache before anything else. Previously the reference stream and both
            // prediction readers were opened first and then abandoned (never closed) whenever
            // the cache turned out to be unavailable.
            var cache = CacheUtilities.LoadCache(cachePrefix);

            if (cache == null)
            {
                return(null);
            }

            var sequence = new CompressedSequence();

            var bundle = new DataBundle
            {
                Sequence       = sequence,
                SequenceReader = new CompressedSequenceReader(FileUtilities.GetReadStream(compressedSequencePath), sequence),
                Cache          = cache,
                SiftReader     = CacheUtilities.GetPredictionReader(CacheConstants.SiftPath(cachePrefix)),
                PolyPhenReader = CacheUtilities.GetPredictionReader(CacheConstants.PolyPhenPath(cachePrefix))
            };

            bundle.TranscriptForest = CacheUtilities.GetIntervalForest(bundle.Cache.Transcripts, bundle.Sequence.Renamer.NumRefSeqs);
            return(bundle);
        }
Exemple #19
0
 /// <summary>
 /// returns a new list containing only the genes that belong to the desired data source
 /// </summary>
 /// <param name="genes">the genes to filter</param>
 /// <param name="desiredDataSource">the transcript data source that the returned genes must have</param>
 /// <returns>a new list with the matching genes, in their original order</returns>
 public static List<MutableGene> GetGenesByDataSource(List<MutableGene> genes, TranscriptDataSource desiredDataSource)
 {
     var matchingGenes = from gene in genes
                         where gene.TranscriptDataSource == desiredDataSource
                         select gene;

     return matchingGenes.ToList();
 }
 /// <summary>
 /// returns the cache prefix: cacheRoot/genomeAssembly/ followed by the data source name
 /// concatenated with the VEP version
 /// </summary>
 /// <param name="cacheRoot">the root directory of the cache files</param>
 /// <param name="genomeAssembly">the genome assembly sub-directory</param>
 /// <param name="ds">the transcript data source (first part of the file prefix)</param>
 /// <param name="vepVersion">the VEP version (second part of the file prefix)</param>
 /// <returns>the combined cache prefix path</returns>
 private static string GetCachePrefix(string cacheRoot, string genomeAssembly, TranscriptDataSource ds, ushort vepVersion)
 {
     var filePrefix = $"{ds}{vepVersion}";
     var cachePrefix = Path.Combine(cacheRoot, genomeAssembly, filePrefix);
     return cachePrefix;
 }
        /// <summary>
        /// processes every cache file that belongs to the specified transcript data source,
        /// in ascending reference index order, and reports when all of them are up-to-date
        /// </summary>
        /// <param name="cacheFiles">all candidate cache files; files from other data sources are ignored</param>
        /// <param name="genomeAssembly">the genome assembly used to load the data bundle</param>
        /// <param name="ds">the transcript data source to process</param>
        private void ProcessTranscriptDataSource(IEnumerable<CacheFile> cacheFiles, string genomeAssembly, TranscriptDataSource ds)
        {
            // the counters are per data source; presumably ProcessCacheFile updates them
            _numOutdatedFiles = 0;
            _numCurrentFiles  = 0;

            var bundle = GetDataBundle(genomeAssembly, ds);

            // without a data bundle there is nothing we can compare the cache files against
            if (bundle == null)
            {
                Console.WriteLine("- skipping transcript data source: {0}", ds);
                return;
            }

            Console.WriteLine("- Transcript data source: {0}", ds);

            var matchingFiles = cacheFiles.OrderBy(file => file.ReferenceIndex)
                                          .Where(file => file.TranscriptDataSource == ds);

            foreach (var cacheFile in matchingFiles)
            {
                ProcessCacheFile(cacheFile, bundle);
            }

            var allFilesCurrent = _numOutdatedFiles == 0 && _numCurrentFiles > 0;

            if (allFilesCurrent)
            {
                Console.WriteLine("  - All {0} files are already up-to-date.", _numCurrentFiles);
            }
        }