/// <summary>
/// Builds the transcript annotation provider: loads the transcript cache located under
/// <paramref name="pathPrefix"/> and opens the SIFT and PolyPhen prediction caches.
/// </summary>
/// <param name="pathPrefix">Prefix passed to <c>CacheConstants</c> to derive the transcript, SIFT and PolyPhen paths.</param>
/// <param name="sequenceProvider">Source of the sequence, the chromosome dictionaries and the assembly.</param>
/// <param name="conservationProvider">May be null; when present, its version is appended to <c>DataSourceVersions</c>.</param>
public TranscriptAnnotationProvider(string pathPrefix, ISequenceProvider sequenceProvider, ProteinConservationProvider conservationProvider)
{
    Name                  = "Transcript annotation provider";
    _sequence             = sequenceProvider.Sequence;
    _refNameToChromosome  = sequenceProvider.RefNameToChromosome;
    _conservationProvider = conservationProvider;

    // The transcript stream is only needed while the cache is being loaded.
    string transcriptPath = CacheConstants.TranscriptPath(pathPrefix);
    using (var transcriptStream = PersistentStreamUtils.GetReadStream(transcriptPath))
    {
        (_transcriptCache, TranscriptIntervalArrays, VepVersion) = InitiateCache(
            transcriptStream,
            sequenceProvider.RefIndexToChromosome,
            sequenceProvider.Assembly);
    }

    Assembly           = _transcriptCache.Assembly;
    DataSourceVersions = _transcriptCache.DataSourceVersions;

    // TODO: this is not great. We should not be using IEnumerables if we have to resort to strange stuff like this
    if (conservationProvider != null)
    {
        var conservationVersion = new[] { conservationProvider.Version };
        DataSourceVersions = DataSourceVersions.Concat(conservationVersion);
    }

    // The prediction streams are stored in fields and are not closed by the constructor.
    string siftPath = CacheConstants.SiftPath(pathPrefix);
    _siftStream     = PersistentStreamUtils.GetReadStream(siftPath);
    _siftReader     = new PredictionCacheReader(_siftStream, PredictionCacheReader.SiftDescriptions);

    string polyphenPath = CacheConstants.PolyPhenPath(pathPrefix);
    _polyphenStream     = PersistentStreamUtils.GetReadStream(polyphenPath);
    _polyphenReader     = new PredictionCacheReader(_polyphenStream, PredictionCacheReader.PolyphenDescriptions);
}
/// <summary>
/// Reads two prediction caches side by side and merges them one reference sequence at a time.
/// </summary>
/// <param name="path1">Path of the first prediction cache; its header supplies the genome assembly and the reference sequence count.</param>
/// <param name="path2">Path of the second prediction cache.</param>
/// <returns>One merged cache per reference index of the first cache, in index order.</returns>
/// <remarks>
/// Overwrites the <c>_genomeAssembly</c> and <c>_numRefSeq</c> fields with the values from the first cache.
/// Throws <c>UserErrorException</c> when the genome assemblies differ and <c>DataMisalignedException</c>
/// when exactly one of the two per-reference caches equals <c>PredictionCache.Empty</c>.
/// </remarks>
private List<PredictionCache> GetMergedPredictions(string path1, string path2)
{
    var merged = new List<PredictionCache>();

    using (var firstReader = new PredictionCacheReader(FileUtilities.GetReadStream(path1)))
    using (var secondReader = new PredictionCacheReader(FileUtilities.GetReadStream(path2)))
    {
        _genomeAssembly = firstReader.FileHeader.GenomeAssembly;
        _numRefSeq      = firstReader.FileHeader.Index.Size;

        if (_genomeAssembly != secondReader.FileHeader.GenomeAssembly)
        {
            throw new UserErrorException($"Observed different genome assemblies: {firstReader.FileHeader.GenomeAssembly}, {secondReader.FileHeader.GenomeAssembly}");
        }

        for (ushort refIndex = 0; refIndex < _numRefSeq; refIndex++)
        {
            var firstCache  = firstReader.Read(refIndex);
            var secondCache = secondReader.Read(refIndex);

            // Exactly one side being empty means the two caches are out of step.
            bool firstIsEmpty  = firstCache == PredictionCache.Empty;
            bool secondIsEmpty = secondCache == PredictionCache.Empty;
            if (firstIsEmpty != secondIsEmpty)
            {
                throw new DataMisalignedException("one of the cache ran out before the other");
            }

            merged.Add(firstCache.GetMergedCache(secondCache));
        }

        //todo: take care of ref sequences unique to one cache
    }

    return merged;
}
/// <summary>
/// Combines two caches (transcripts plus SIFT and PolyPhen predictions) reference sequence by
/// reference sequence and writes the combined result under the output prefix.
/// </summary>
/// <returns><c>ExitCodes.Success</c> once all three outputs have been written.</returns>
/// <remarks>
/// Throws <c>InvalidDataException</c> when the two transcript caches do not contain the same number
/// of transcript interval arrays. The SIFT and PolyPhen headers, the transcript header (cloned) and
/// the regulatory region interval arrays are all taken from the first cache.
/// </remarks>
private static ExitCodes ProgramExecution()
{
    var sequenceData = SequenceHelper.GetDictionaries(_refSequencePath);
    var logger       = new ConsoleLogger();

    var caches = LoadTranscriptCaches(
        logger,
        CacheConstants.TranscriptPath(_inputPrefix),
        CacheConstants.TranscriptPath(_inputPrefix2),
        sequenceData.refIndexToChromosome);

    int numRefSeqs  = caches.Cache.TranscriptIntervalArrays.Length;
    int numRefSeqs2 = caches.Cache2.TranscriptIntervalArrays.Length;

    if (numRefSeqs != numRefSeqs2)
    {
        throw new InvalidDataException($"Expected the number of reference sequences in cache 1 ({numRefSeqs}) and cache 2 ({numRefSeqs2}) to be the same.");
    }

    var combinedIntervalArrays    = new IntervalArray<ITranscript>[numRefSeqs];
    var siftPredictionsPerRef     = new Prediction[numRefSeqs][];
    var polyphenPredictionsPerRef = new Prediction[numRefSeqs][];

    PredictionHeader siftHeader;
    PredictionHeader polyphenHeader;

    using (var siftReader      = new PredictionCacheReader(FileUtilities.GetReadStream(CacheConstants.SiftPath(_inputPrefix)), PredictionCacheReader.SiftDescriptions))
    using (var siftReader2     = new PredictionCacheReader(FileUtilities.GetReadStream(CacheConstants.SiftPath(_inputPrefix2)), PredictionCacheReader.SiftDescriptions))
    using (var polyphenReader  = new PredictionCacheReader(FileUtilities.GetReadStream(CacheConstants.PolyPhenPath(_inputPrefix)), PredictionCacheReader.PolyphenDescriptions))
    using (var polyphenReader2 = new PredictionCacheReader(FileUtilities.GetReadStream(CacheConstants.PolyPhenPath(_inputPrefix2)), PredictionCacheReader.PolyphenDescriptions))
    {
        siftHeader     = siftReader.Header;
        polyphenHeader = polyphenReader.Header;

        for (ushort refIndex = 0; refIndex < numRefSeqs; refIndex++)
        {
            var chromosome = sequenceData.refIndexToChromosome[refIndex];

            // Chromosome banner is printed in yellow.
            Console.ForegroundColor = ConsoleColor.Yellow;
            logger.WriteLine($"\n{chromosome.UcscName}:");
            Console.ResetColor();

            var siftResult = CombinePredictions(logger, chromosome, "SIFT", siftReader, siftReader2);
            siftPredictionsPerRef[refIndex] = siftResult.Predictions;

            var polyphenResult = CombinePredictions(logger, chromosome, "PolyPhen", polyphenReader, polyphenReader2);
            polyphenPredictionsPerRef[refIndex] = polyphenResult.Predictions;

            // The offsets returned by CombinePredictions are forwarded to CombineTranscripts.
            var intervalArray  = caches.Cache.TranscriptIntervalArrays[refIndex];
            var intervalArray2 = caches.Cache2.TranscriptIntervalArrays[refIndex];

            combinedIntervalArrays[refIndex] = CombineTranscripts(
                logger,
                intervalArray,
                intervalArray2,
                siftResult.Offset,
                polyphenResult.Offset);
        }
    }

    logger.WriteLine();

    WritePredictions(logger, "SIFT", CacheConstants.SiftPath(_outputPrefix), siftHeader, siftPredictionsPerRef);
    WritePredictions(logger, "PolyPhen", CacheConstants.PolyPhenPath(_outputPrefix), polyphenHeader, polyphenPredictionsPerRef);
    WriteTranscripts(logger, CloneHeader(caches.Cache.Header), combinedIntervalArrays, caches.Cache.RegulatoryRegionIntervalArrays);

    return ExitCodes.Success;
}
/// <summary>
/// Stores the already-opened readers and the loaded transcript cache objects in the bundle.
/// </summary>
/// <param name="sequenceReader">Reference sequence reader, exposed as <c>SequenceReader</c>.</param>
/// <param name="siftReader">SIFT prediction reader, exposed as <c>SiftReader</c>.</param>
/// <param name="polyPhenReader">PolyPhen prediction reader, exposed as <c>PolyPhenReader</c>.</param>
/// <param name="cacheData">Transcript cache data, exposed as <c>TranscriptCacheData</c>.</param>
/// <param name="transcriptCache">Transcript cache, exposed as <c>TranscriptCache</c>.</param>
/// <param name="source">Transcript source, exposed as <c>Source</c>.</param>
private DataBundle(
    CompressedSequenceReader sequenceReader,
    PredictionCacheReader siftReader,
    PredictionCacheReader polyPhenReader,
    VC.TranscriptCacheData cacheData,
    VC.TranscriptCache transcriptCache,
    Source source)
{
    // readers
    SequenceReader = sequenceReader;
    SiftReader     = siftReader;
    PolyPhenReader = polyPhenReader;

    // transcript data
    TranscriptCacheData = cacheData;
    TranscriptCache     = transcriptCache;
    Source              = source;
}
/// <summary>
/// Collects the predictions referenced by the supplied transcripts and wraps them in a
/// <c>PredictionCacheStaging</c> built with the header of <paramref name="reader"/>.
/// Progress is reported through <c>Logger</c> using <paramref name="description"/>.
/// </summary>
/// <param name="description">Label inserted into the progress message.</param>
/// <param name="transcripts">Transcripts handed to <c>GetUniqueIndices</c>.</param>
/// <param name="chromosome">Passed to <c>GetPredictions</c>; its <c>Index</c> selects the returned prediction array.</param>
/// <param name="oldPredictions">Existing predictions handed to <c>GetPredictions</c>.</param>
/// <param name="reader">Only its <c>Header</c> is used here.</param>
/// <param name="indexFunc">Maps a transcript to an index; forwarded to <c>GetUniqueIndices</c>.</param>
/// <param name="numRefSeqs">Forwarded to <c>GetPredictions</c>.</param>
/// <returns>The staging object and the entry of the per-reference predictions at <c>chromosome.Index</c>.</returns>
// NOTE(review): the closing brace of this method is not visible in this part of the file.
private static (PredictionCacheStaging Staging, Prediction[] Predictions) GetPredictionStaging( string description, IEnumerable <ITranscript> transcripts, IChromosome chromosome, IReadOnlyList <Prediction> oldPredictions, PredictionCacheReader reader, Func <ITranscript, int> indexFunc, int numRefSeqs) { Logger.Write($"- retrieving {description} predictions... "); var indexSet = GetUniqueIndices(transcripts, indexFunc); var predictionsPerRef = GetPredictions(indexSet, chromosome, numRefSeqs, oldPredictions); var staging = new PredictionCacheStaging(reader.Header, predictionsPerRef); Logger.WriteLine($"found {indexSet.Count} predictions."); return(staging, predictionsPerRef[chromosome.Index]);
/// <summary>
/// Builds the transcript annotation provider: loads the transcript cache located under
/// <paramref name="pathPrefix"/> and opens the SIFT and PolyPhen prediction readers.
/// </summary>
/// <param name="pathPrefix">Prefix passed to <c>CacheConstants</c> to derive the transcript, SIFT and PolyPhen paths.</param>
/// <param name="sequenceProvider">Source of the sequence, the reference index dictionary and the genome assembly.</param>
public TranscriptAnnotationProvider(string pathPrefix, ISequenceProvider sequenceProvider)
{
    Name      = "Transcript annotation provider";
    _sequence = sequenceProvider.Sequence;

    // The transcript stream is only needed while the cache is loaded; scope it so the file
    // handle is released even when InitiateCache throws.
    using (var transcriptStream = FileUtilities.GetReadStream(CacheConstants.TranscriptPath(pathPrefix)))
    {
        (_transcriptCache, VepVersion) = InitiateCache(transcriptStream, sequenceProvider.RefIndexToChromosome, sequenceProvider.GenomeAssembly);
    }

    GenomeAssembly     = _transcriptCache.GenomeAssembly;
    DataSourceVersions = _transcriptCache.DataSourceVersions;

    // The prediction readers are kept in fields, so their streams stay open after construction.
    _siftReader     = new PredictionCacheReader(FileUtilities.GetReadStream(CacheConstants.SiftPath(pathPrefix)), PredictionCacheReader.SiftDescriptions);
    _polyphenReader = new PredictionCacheReader(FileUtilities.GetReadStream(CacheConstants.PolyPhenPath(pathPrefix)), PredictionCacheReader.PolyphenDescriptions);
}
/// <summary>
/// Reads the prediction cache at <paramref name="path"/> and records the prediction count
/// of every reference sequence.
/// </summary>
/// <param name="path">Path of the prediction cache file.</param>
/// <returns>Map from reference index to the <c>PredictionCount</c> of the cache read for that index.</returns>
/// <remarks>Overwrites the <c>_numRefSeq</c> field with the index size from the file header.</remarks>
private Dictionary<ushort, int> GetPredictionMatrixCount(string path)
{
    var counts = new Dictionary<ushort, int>();

    using (var reader = new PredictionCacheReader(FileUtilities.GetReadStream(path)))
    {
        _numRefSeq = reader.FileHeader.Index.Size;

        ushort refIndex = 0;
        while (refIndex < _numRefSeq)
        {
            counts[refIndex] = reader.Read(refIndex).PredictionCount;
            refIndex++;
        }
    }

    return counts;
}
/// <summary>
/// Opens the reference sequence and the SIFT/PolyPhen prediction caches, loads the transcript
/// cache found under <paramref name="cachePrefix"/>, and packages everything in a <c>DataBundle</c>.
/// </summary>
/// <param name="referencePath">Path of the compressed reference sequence file.</param>
/// <param name="cachePrefix">Prefix passed to <c>CacheConstants</c> to derive the SIFT, PolyPhen and transcript paths.</param>
/// <returns>A bundle holding the three open readers, the transcript cache data, the cache and its source.</returns>
/// <remarks>
/// The transcript reader is disposed before returning. The sequence, SIFT and PolyPhen readers are
/// handed to the returned bundle. If anything throws before the bundle is created, the readers
/// opened so far are disposed and the exception is rethrown.
/// </remarks>
public static DataBundle GetDataBundle(string referencePath, string cachePrefix)
{
    CompressedSequenceReader sequenceReader = null;
    PredictionCacheReader siftReader        = null;
    PredictionCacheReader polyPhenReader    = null;

    try
    {
        sequenceReader = new CompressedSequenceReader(FileUtilities.GetReadStream(referencePath));
        siftReader     = new PredictionCacheReader(FileUtilities.GetReadStream(CacheConstants.SiftPath(cachePrefix)), PredictionCacheReader.SiftDescriptions);
        polyPhenReader = new PredictionCacheReader(FileUtilities.GetReadStream(CacheConstants.PolyPhenPath(cachePrefix)), PredictionCacheReader.PolyphenDescriptions);

        VC.TranscriptCacheData cacheData;
        VC.TranscriptCache cache;
        Source source;

        using (var transcriptReader = new TranscriptCacheReader(FileUtilities.GetReadStream(CacheConstants.TranscriptPath(cachePrefix))))
        {
            cacheData = transcriptReader.Read(sequenceReader.RefIndexToChromosome);
            cache     = cacheData.GetCache();
            source    = transcriptReader.Header.Source;
        }

        return new DataBundle(sequenceReader, siftReader, polyPhenReader, cacheData, cache, source);
    }
    catch
    {
        // Nothing owns the readers yet, so release them here (in reverse order of creation)
        // instead of leaking their file handles.
        polyPhenReader?.Dispose();
        siftReader?.Dispose();
        sequenceReader?.Dispose();
        throw;
    }
}
/// <summary>
/// Builds the transcript annotation provider: loads the transcript cache located under
/// <paramref name="pathPrefix"/> and opens the SIFT and PolyPhen prediction readers.
/// </summary>
/// <param name="pathPrefix">Prefix passed to <c>CacheConstants</c> to derive the transcript, SIFT and PolyPhen paths.</param>
/// <param name="sequenceProvider">Source of the sequence, the reference index dictionary and the assembly.</param>
public TranscriptAnnotationProvider(string pathPrefix, ISequenceProvider sequenceProvider)
{
    Name      = "Transcript annotation provider";
    _sequence = sequenceProvider.Sequence;

    // The transcript stream is only needed while the cache is loaded; scope it so the
    // stream is released even when InitiateCache throws.
    using (var transcriptStream = PersistentStreamUtils.GetReadStream(CacheConstants.TranscriptPath(pathPrefix)))
    {
        (_transcriptCache, TranscriptIntervalArrays, VepVersion) = InitiateCache(transcriptStream, sequenceProvider.RefIndexToChromosome, sequenceProvider.Assembly);
    }

    Assembly           = _transcriptCache.Assembly;
    DataSourceVersions = _transcriptCache.DataSourceVersions;

    // The prediction readers are kept in fields, so their streams stay open after construction.
    var siftStream = PersistentStreamUtils.GetReadStream(CacheConstants.SiftPath(pathPrefix));
    _siftReader    = new PredictionCacheReader(siftStream, PredictionCacheReader.SiftDescriptions);

    var polyphenStream = PersistentStreamUtils.GetReadStream(CacheConstants.PolyPhenPath(pathPrefix));
    _polyphenReader    = new PredictionCacheReader(polyphenStream, PredictionCacheReader.PolyphenDescriptions);
}
/// <summary>
/// Loads the predictions for <paramref name="chromosome"/> from both readers and combines them
/// via the array-based <c>CombinePredictions</c> overload.
/// </summary>
/// <param name="logger">Receives the progress messages.</param>
/// <param name="chromosome">Its <c>Index</c> selects which predictions are read from each reader.</param>
/// <param name="description">Label inserted into the progress message and forwarded to the combining overload.</param>
/// <param name="reader">Reader of the first prediction cache.</param>
/// <param name="reader2">Reader of the second prediction cache.</param>
/// <returns>The combined predictions, and as offset the number of predictions read from <paramref name="reader"/>.</returns>
private static (Prediction[] Predictions, int Offset) CombinePredictions(ILogger logger, IChromosome chromosome, string description, PredictionCacheReader reader, PredictionCacheReader reader2)
{
    logger.Write($"- load {description} predictions... ");

    var firstPredictions  = reader.GetPredictions(chromosome.Index);
    var secondPredictions = reader2.GetPredictions(chromosome.Index);

    logger.WriteLine("finished.");

    var combined = CombinePredictions(logger, description, firstPredictions, secondPredictions);

    // The offset is the length of the first prediction set.
    int offset = firstPredictions.Length;
    return (combined, offset);
}