/// <summary>
/// Groups the transcripts by their prediction string and writes the resulting
/// groups for the given chromosome.
/// </summary>
/// <param name="writer">The prediction writer that receives the grouped data.</param>
/// <param name="transcripts">The transcripts to scan; positions in this list are what gets recorded.</param>
/// <param name="predictionFunc">Returns the prediction string for a transcript, or null if it has none.</param>
/// <param name="chromosome">The chromosome passed through to the writer.</param>
private static void WritePredictions(PredictionWriter writer, IReadOnlyList<MutableTranscript> transcripts,
    Func<MutableTranscript, string> predictionFunc, IChromosome chromosome)
{
    // prediction string -> indices (into transcripts) of every transcript sharing that string
    var predictionDict = new Dictionary<string, List<int>>(StringComparer.Ordinal);

    for (var transcriptIndex = 0; transcriptIndex < transcripts.Count; transcriptIndex++)
    {
        string predictionData = predictionFunc(transcripts[transcriptIndex]);

        // transcripts without prediction data are left out of the dictionary
        if (predictionData == null)
        {
            continue;
        }

        // get-or-create the bucket, then append through a single code path
        if (!predictionDict.TryGetValue(predictionData, out var transcriptIndices))
        {
            transcriptIndices = new List<int>();
            predictionDict[predictionData] = transcriptIndices;
        }

        transcriptIndices.Add(transcriptIndex);
    }

    // the writer is invoked even when no transcript produced prediction data
    writer.Write(chromosome, predictionDict);
}
/// <summary>
/// Walks every reference sequence, parses its VEP dump directory (when one exists), merges the
/// transcripts and regulatory regions, and writes the intermediate transcript, regulatory
/// region, SIFT and PolyPhen files (plus a transcript merge log) next to the output stub.
/// </summary>
/// <returns><see cref="ExitCodes.Success"/> after all reference sequences have been visited.</returns>
private static ExitCodes ProgramExecution()
{
    var transcriptSource = GetSource(_transcriptSource);
    var sequenceReader   = new CompressedSequenceReader(FileUtilities.GetReadStream(_inputReferencePath));
    var vepRootDirectory = new VepRootDirectory(sequenceReader.RefNameToChromosome);
    var refIndexToVepDir = vepRootDirectory.GetRefIndexToVepDir(_inputVepDirectory);
    var genomeAssembly   = GenomeAssemblyHelper.Convert(_genomeAssembly);

    // Parse with the invariant culture so the ticks stored in the file headers do not
    // depend on the regional settings of the machine running the tool.
    long vepReleaseTicks = DateTime.Parse(_vepReleaseDate, System.Globalization.CultureInfo.InvariantCulture).Ticks;

    var idToGenbank = GetIdToGenbank(genomeAssembly, transcriptSource);

    // =========================
    // create the pre-cache file
    // =========================

    int numRefSeqs = sequenceReader.NumRefSeqs;
    var header     = new IntermediateIoHeader(_vepVersion, vepReleaseTicks, transcriptSource, genomeAssembly, numRefSeqs);

    string siftPath       = _outputStub + ".sift.gz";
    string polyphenPath   = _outputStub + ".polyphen.gz";
    string transcriptPath = _outputStub + ".transcripts.gz";
    string regulatoryPath = _outputStub + ".regulatory.gz";

    using (var mergeLogger = new TranscriptMergerLogger(FileUtilities.GetCreateStream(_outputStub + ".merge_transcripts.log")))
    using (var siftWriter = new PredictionWriter(GZipUtilities.GetStreamWriter(siftPath), header, IntermediateIoCommon.FileType.Sift))
    using (var polyphenWriter = new PredictionWriter(GZipUtilities.GetStreamWriter(polyphenPath), header, IntermediateIoCommon.FileType.Polyphen))
    using (var transcriptWriter = new MutableTranscriptWriter(GZipUtilities.GetStreamWriter(transcriptPath), header))
    using (var regulatoryRegionWriter = new RegulatoryRegionWriter(GZipUtilities.GetStreamWriter(regulatoryPath), header))
    {
        var converter           = new VepCacheParser(transcriptSource);
        var emptyPredictionDict = new Dictionary<string, List<int>>();

        // process each VEP directory
        for (ushort refIndex = 0; refIndex < numRefSeqs; refIndex++)
        {
            var chromosome = sequenceReader.RefIndexToChromosome[refIndex];

            if (!refIndexToVepDir.TryGetValue(refIndex, out string vepSubDir))
            {
                // No VEP directory for this reference sequence: still write an empty
                // prediction entry to both prediction files.
                // NOTE(review): presumably the readers expect one entry per reference sequence - confirm.
                siftWriter.Write(chromosome, emptyPredictionDict);
                polyphenWriter.Write(chromosome, emptyPredictionDict);
                continue;
            }

            Console.WriteLine("Parsing reference sequence [{0}]:", chromosome.UcscName);

            var rawData                 = converter.ParseDumpDirectory(chromosome, vepSubDir);
            var mergedTranscripts       = TranscriptMerger.Merge(mergeLogger, rawData.Transcripts, idToGenbank);
            var mergedRegulatoryRegions = RegulatoryRegionMerger.Merge(rawData.RegulatoryRegions);

            int numRawTranscripts    = rawData.Transcripts.Count;
            int numMergedTranscripts = mergedTranscripts.Count;
            Console.WriteLine($"- # merged transcripts: {numMergedTranscripts}, # total transcripts: {numRawTranscripts}");

            WriteTranscripts(transcriptWriter, mergedTranscripts);
            WriteRegulatoryRegions(regulatoryRegionWriter, mergedRegulatoryRegions);
            WritePredictions(siftWriter, mergedTranscripts, x => x.SiftData, chromosome);
            WritePredictions(polyphenWriter, mergedTranscripts, x => x.PolyphenData, chromosome);
        }
    }

    Console.WriteLine("\n{0} directories processed.", refIndexToVepDir.Count);
    return ExitCodes.Success;
}