/// <summary>
/// Writes two lines with <c>GZipUtilities.GetStreamWriter</c>, reads them back with
/// <c>GZipUtilities.GetAppropriateStreamReader</c>, and verifies that exactly those
/// two lines are returned, followed by a null third read.
/// </summary>
public void GZipReadAndWrite()
{
    var expectedLines = new[]
    {
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.",
        "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."
    };

    var randomPath = GetRandomPath();

    using (var output = GZipUtilities.GetStreamWriter(randomPath))
    {
        foreach (string expected in expectedLines)
        {
            output.WriteLine(expected);
        }
    }

    // one extra slot so the read past the last line is captured as well
    var observedLines = new string[expectedLines.Length + 1];

    using (var input = GZipUtilities.GetAppropriateStreamReader(randomPath))
    {
        for (int i = 0; i < observedLines.Length; i++)
        {
            observedLines[i] = input.ReadLine();
        }
    }

    Assert.Equal(expectedLines[0], observedLines[0]);
    Assert.Equal(expectedLines[1], observedLines[1]);
    Assert.Null(observedLines[2]);
}
/// <summary>
/// Loads the transcript cache for the configured input prefix and passes every
/// regulatory region interval it contains to <c>WriteRegulatoryFeature</c>.
/// </summary>
/// <returns><c>ExitCodes.Success</c> once all entries have been written.</returns>
private static ExitCodes ProgramExecution()
{
    // the output writer is opened first, before the cache is read
    using (var gffWriter = GZipUtilities.GetStreamWriter(_outputFileName))
    {
        string cachePath = CacheConstants.TranscriptPath(_inputPrefix);
        var sequenceData = SequenceHelper.GetDictionaries(_referencePath);

        // load the cache
        Console.Write("- reading {0}... ", Path.GetFileName(cachePath));
        var cache = TranscriptCacheHelper.GetCache(cachePath, sequenceData.refIndexToChromosome);
        Console.WriteLine("found {0:N0} reference sequences. ", cache.RegulatoryRegionIntervalArrays.Length);

        Console.Write("- writing GFF entries... ");

        foreach (var regionArray in cache.RegulatoryRegionIntervalArrays)
        {
            // null entries in the array are skipped
            if (regionArray != null)
            {
                foreach (var region in regionArray.Array)
                {
                    WriteRegulatoryFeature(gffWriter, region.Value);
                }
            }
        }

        Console.WriteLine("finished.");
    }

    return ExitCodes.Success;
}
/// <summary>
/// Runs the two mapping phases: first <c>ChromMapper.Map</c> over the source and
/// destination map files (which returns the per-chromosome writers and writes
/// unmapped entries to a temporary leftover file), then <c>LeftoverMapper.Map</c>
/// over that leftover file.
/// </summary>
/// <returns><c>ExitCodes.Success</c> when both phases complete.</returns>
private static ExitCodes ProgramExecution()
{
    const string tempLeftoverFilename = "LeftOvers.vcf.gz";
    Dictionary<string, StreamWriter> writers = null;

    ISequenceProvider srcSequenceProvider = ProviderUtilities.GetSequenceProvider(_srcRefSequence);
    ISequenceProvider desSequenceProvider = ProviderUtilities.GetSequenceProvider(_desRefSequence);

    try
    {
        using (var srcReader = GZipUtilities.GetAppropriateStreamReader(_srcMapFile))
        using (var destReader = GZipUtilities.GetAppropriateStreamReader(_destMapFile))
        using (var leftoverWriter = GZipUtilities.GetStreamWriter(tempLeftoverFilename))
        {
            var chromMapper = new ChromMapper(srcReader, destReader, leftoverWriter, srcSequenceProvider, desSequenceProvider);
            writers = chromMapper.Map();
        }

        // now we will try to map the leftovers
        using (var destReader = GZipUtilities.GetAppropriateStreamReader(_destMapFile))
        using (var leftoverReader = GZipUtilities.GetAppropriateStreamReader(tempLeftoverFilename))
        {
            var leftOverMapper = new LeftoverMapper(leftoverReader, destReader, writers, desSequenceProvider);
            var leftoverCount = leftOverMapper.Map();
            Console.WriteLine($"{leftoverCount} leftovers mapped!!");
        }
    }
    finally
    {
        // The writers are handed back by ChromMapper.Map and are owned here. Dispose
        // them even when a mapping phase throws, so they are not leaked.
        if (writers != null)
        {
            foreach (var writer in writers.Values)
            {
                writer?.Dispose();
            }
        }
    }

    return ExitCodes.Success;
}
/// <summary>
/// Copies to <paramref name="outputTsv"/> every line of <paramref name="intputTsv"/>
/// whose position overlaps an intron flanking region, or for which
/// <c>AnyScorePassTheCutoff</c> returns true.
/// </summary>
/// <param name="intputTsv">Path of the tab-separated input file.</param>
/// <param name="gffFile1">First file passed to <c>GetIntronFlankingRegions</c>.</param>
/// <param name="gffFile2">Second file passed to <c>GetIntronFlankingRegions</c>.</param>
/// <param name="outputTsv">Path of the filtered output file.</param>
public static void Filter(string intputTsv, string gffFile1, string gffFile2, string outputTsv)
{
    var intronFlankingRegions = GetIntronFlankingRegions(gffFile1, gffFile2);

    using (var source = GZipUtilities.GetAppropriateStreamReader(intputTsv))
    using (var sink = GZipUtilities.GetStreamWriter(outputTsv))
    {
        long processed = 0;

        for (string row = source.ReadLine(); row != null; row = source.ReadLine())
        {
            string[] columns = row.TrimEnd().Split('\t');
            ushort chrIndex = GetChrIndex(columns[PredChrColumn]);
            int position = int.Parse(columns[PredPosColumn]);

            // the score check only runs when the position is outside every flanking region
            bool keep = intronFlankingRegions.OverlapsAny(chrIndex, position, position)
                        || AnyScorePassTheCutoff(columns, PredScoreColumns, FreqCutoff);

            if (keep)
            {
                // the untrimmed input line is written, not the trimmed copy
                sink.WriteLine(row);
            }

            processed++;

            // progress report every million lines
            if (processed % 1_000_000 != 0)
            {
                continue;
            }

            Console.WriteLine($"Processed {processed} lines. Current position: {columns[PredChrColumn]}:{columns[PredPosColumn]}");
        }
    }
}
/// <summary>
/// Writes the header followed by one tab-separated, newline-terminated record per
/// item in <c>_clinGenDictionary</c> to <paramref name="outFileName"/>.
/// </summary>
/// <param name="outFileName">Path of the output file.</param>
public void Write(string outFileName)
{
    using (var writer = GZipUtilities.GetStreamWriter(outFileName))
    {
        writer.Write(OutHeader);

        foreach (var item in _clinGenDictionary.Values)
        {
            // The variant type is derived from the observed gain/loss counts;
            // any combination not listed below stays "unknown".
            VariantType varType;

            if (item.ObservedGains > 0 && item.ObservedLosses == 0)
            {
                varType = VariantType.copy_number_gain;
            }
            else if (item.ObservedGains > 0 && item.ObservedLosses > 0)
            {
                varType = VariantType.copy_number_variation;
            }
            else if (item.ObservedGains == 0 && item.ObservedLosses > 0)
            {
                varType = VariantType.copy_number_loss;
            }
            else
            {
                varType = VariantType.unknown;
            }

            string locationColumns = $"{item.Id}\t{item.Chromosome}\t{item.Start}\t{item.End}\t";
            string countColumns = $"{item.ObservedGains}\t{item.ObservedLosses}\t{varType}\t";
            string clinicalColumns = $"{item.ClinicalInterpretation}\t{item.Validated}\t";
            string phenotypeColumns = $"{string.Join(",", item.Phenotypes.ToArray())}\t{string.Join(",", item.PhenotypeIds.ToArray())}\n";

            writer.Write(locationColumns + countColumns + clinicalColumns + phenotypeColumns);
        }
    }
}
/// <summary>
/// Opens the gene annotation TSV writer and immediately writes the header.
/// The file name is the key name, an underscore, the data source version with
/// spaces replaced by underscores, and the ".gene.tsv.gz" suffix.
/// </summary>
/// <param name="outputDirectory">Directory in which the file is created.</param>
/// <param name="dataSourceVersion">Version information; used for the file name and the header.</param>
/// <param name="assembly">Passed through to <c>GetHeader</c>.</param>
/// <param name="dataVersion">Passed through to <c>GetHeader</c>.</param>
/// <param name="keyName">File name prefix; also passed through to <c>GetHeader</c>.</param>
/// <param name="isArray">Passed through to <c>GetHeader</c>.</param>
public GeneAnnotationTsvWriter(string outputDirectory, DataSourceVersion dataSourceVersion, string assembly, int dataVersion, string keyName, bool isArray)
{
    string versionTag = dataSourceVersion.Version.Replace(" ", "_");
    string outputPath = Path.Combine(outputDirectory, string.Concat(keyName, "_", versionTag, ".gene.tsv.gz"));

    _writer = GZipUtilities.GetStreamWriter(outputPath);

    string header = GetHeader(dataSourceVersion, dataVersion, assembly, keyName, isArray);
    _writer.Write(header);
}
/// <summary>
/// Streams the input VCF to the output file. Header lines are copied as they
/// are read. Data lines are buffered and handed to <c>FlushVcfLineBuffer</c>,
/// together with a per-allele conflict flag, whenever the chromosome changes
/// or the position exceeds <c>_maxVidPosition</c>.
/// </summary>
/// <returns>The value of <c>_noLinesRemoved</c> after processing.</returns>
public int RemoveConflictingLines()
{
    using (var reader = GZipUtilities.GetAppropriateStreamReader(_inFile))
    using (var writer = GZipUtilities.GetStreamWriter(_outFile))
    {
        // all lines for the last few positions are tracked in this buffer
        var bufferedLines = new List<string>(VcfBufferSize);

        // true when an allele id has been seen more than once in the current buffer
        var conflictByAllele = new Dictionary<string, bool>();

        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            if (line.StartsWith("#"))
            {
                // header lines are streamed straight through
                writer.WriteLine(line);
                continue;
            }

            // parse the VCF line (everything after the INFO column stays in one field)
            var columns = line.Split(new[] { '\t' }, VcfCommon.InfoIndex + 1);
            var chromosome = columns[VcfCommon.ChromIndex];
            var position = Convert.ToInt32(columns[VcfCommon.PosIndex]);
            var refAllele = columns[VcfCommon.RefIndex];
            var altAlleles = columns[VcfCommon.AltIndex].Split(',');

            // NOTE(review): _maxVidPosition is not assigned in this method; presumably
            // GetAlleleId maintains it - confirm.
            bool startsNewWindow = chromosome != _currentRefSeq || position > _maxVidPosition;

            if (startsNewWindow)
            {
                FlushVcfLineBuffer(bufferedLines, conflictByAllele, writer);
                bufferedLines.Clear();
                conflictByAllele.Clear();
                _currentRefSeq = chromosome;
            }

            foreach (var altAllele in altAlleles)
            {
                var alleleId = GetAlleleId(chromosome, position, refAllele, altAllele);

                // first sighting => false; any later sighting => true (conflict)
                conflictByAllele[alleleId] = conflictByAllele.ContainsKey(alleleId);
            }

            bufferedLines.Add(line);
        }

        // flush whatever is still buffered at end of file
        FlushVcfLineBuffer(bufferedLines, conflictByAllele, writer);
    }

    return _noLinesRemoved;
}
/// <summary>
/// Reads the COSMIC reader line by line and writes a reduced eight-column record
/// for every entry that passes the <c>IsLargeVariants</c> and <c>HasMinCount</c>
/// checks. Does nothing when no COSMIC reader is available.
/// </summary>
private void ExtractFromCosmic()
{
    if (_cosmicReader == null)
    {
        return;
    }

    using (var oncogenicWriter = GZipUtilities.GetStreamWriter(OncogenicFileName))
    {
        for (string line = _cosmicReader.ReadLine(); line != null; line = _cosmicReader.ReadLine())
        {
            // skip empty lines
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            // header lines are delegated to ProcessHeaderLine
            if (line.StartsWith("#"))
            {
                ProcessHeaderLine(oncogenicWriter, line);
                continue;
            }

            string[] fields = line.Split('\t');

            bool rejected = IsLargeVariants(fields[VcfCommon.RefIndex], fields[VcfCommon.AltIndex])
                            || !HasMinCount(fields[VcfCommon.InfoIndex]);

            if (rejected)
            {
                continue;
            }

            // the counter is incremented before the mitochondrial filter below
            _cosmicCount++;

            var chrName = GetChrName(fields[VcfCommon.ChromIndex]);

            // skip mito for hg19
            bool isMitochondrial = chrName == "chrM" || chrName == "MT";

            if (_assembly == GenomeAssembly.hg19 && isMitochondrial)
            {
                continue;
            }

            // the last three columns are written as "."
            oncogenicWriter.Write(
                $"{chrName}\t{fields[VcfCommon.PosIndex]}\t{fields[VcfCommon.IdIndex]}\t{fields[VcfCommon.RefIndex]}\t{fields[VcfCommon.AltIndex]}\t"
                + ".\t.\t.\n");
        }
    }
}
/// <summary>
/// Reads the ClinVar reader line by line and writes a reduced eight-column record
/// for every entry that is not an SNV and whose reference allele passes
/// <c>ValidateReference</c>. Does nothing when no ClinVar reader is available.
/// </summary>
private void ExtractFromClinVar()
{
    if (_clinvarReader == null)
    {
        return;
    }

    using (var indelWriter = GZipUtilities.GetStreamWriter(IsisClinicalIndelFileName))
    {
        for (string line = _clinvarReader.ReadLine(); line != null; line = _clinvarReader.ReadLine())
        {
            // skip empty lines
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            // header lines are delegated to ProcessHeaderLine
            if (line.StartsWith("#"))
            {
                ProcessHeaderLine(indelWriter, line);
                continue;
            }

            string[] fields = line.Split('\t');

            if (IsSnv(fields[VcfCommon.RefIndex], fields[VcfCommon.AltIndex]))
            {
                continue;
            }

            // the counter is incremented before the mitochondrial filter and the
            // reference validation below
            _clinvarCount++;

            var chrName = GetChrName(fields[VcfCommon.ChromIndex]);

            // skip mito for hg19
            bool isMitochondrial = chrName == "chrM" || chrName == "MT";

            if (_assembly == GenomeAssembly.hg19 && isMitochondrial)
            {
                continue;
            }

            var pos = Convert.ToInt32(fields[VcfCommon.PosIndex]);
            var refAllele = fields[VcfCommon.RefIndex];

            if (!ValidateReference(chrName, pos, refAllele))
            {
                continue;
            }

            // the last three columns are written as "."
            indelWriter.Write(
                $"{chrName}\t{pos}\t{fields[VcfCommon.IdIndex]}\t{refAllele}\t{fields[VcfCommon.AltIndex]}\t"
                + ".\t.\t.\n");
        }
    }
}
/// <summary>
/// Writes a VCF line to the writer of the given chromosome, replacing its first
/// two columns with <paramref name="chrom"/> and the absolute value of
/// <paramref name="pos"/>. The writer for "&lt;chrom&gt;.vcf.gz" is created on first use.
/// </summary>
/// <param name="chrom">Chromosome name; also the key into <c>_writers</c>.</param>
/// <param name="pos">Position; written as its absolute value.</param>
/// <param name="vcfLine">Tab-separated line whose third and later columns are kept.</param>
private void WriteRemappedEntry(string chrom, int pos, string vcfLine)
{
    if (!_writers.TryGetValue(chrom, out var chromWriter))
    {
        chromWriter = GZipUtilities.GetStreamWriter(chrom + ".vcf.gz");
        _writers[chrom] = chromWriter;
    }

    // split into at most three parts: the two replaced columns and the untouched remainder
    var columns = vcfLine.Split('\t', 3);

    chromWriter.WriteLine($"{chrom}\t{Math.Abs(pos)}\t{columns[2]}");
}
/// <summary>
/// Writes every Genbank entry to the file at <c>GenbankFilePath</c> through a
/// <c>GenbankWriter</c>, logging progress before and after.
/// </summary>
/// <param name="logger">Receives the progress messages.</param>
/// <param name="entries">Entries to write, in enumeration order.</param>
private static void WriteDictionary(ILogger logger, IEnumerable<GenbankEntry> entries)
{
    var header = new IntermediateIoHeader(0, 0, Source.None, GenomeAssembly.Unknown, 0);

    logger.Write($"- writing Genbank file ({Path.GetFileName(GenbankFilePath)})... ");

    var outputStream = GZipUtilities.GetStreamWriter(GenbankFilePath);

    using (var genbankWriter = new GenbankWriter(outputStream, header))
    {
        foreach (var genbankEntry in entries)
        {
            genbankWriter.Write(genbankEntry);
        }
    }

    logger.WriteLine("finished.");
}
/// <summary>
/// Either builds the index for the input JSON file (when <c>_createIndex</c> is
/// set), or opens a <c>QueryProcessor</c> and runs exactly one of: list
/// chromosomes and sections, print the header only, print a section, or process
/// the query regions - checked in that order.
/// </summary>
/// <returns>
/// <c>ExitCodes.BadArguments</c> when no other mode was selected and no query was
/// supplied; otherwise <c>ExitCodes.Success</c>.
/// </returns>
private static ExitCodes ProgramExecution()
{
    if (_createIndex)
    {
        using (var indexCreator = new IndexCreator(_inputJson))
        {
            indexCreator.CreateIndex();
        }

        return ExitCodes.Success;
    }

    string indexFileName = _inputJson + JasixCommons.FileExt;
    ValidateIndexFile(indexFileName);

    // no output file => the query processor receives a null writer
    var writer = !string.IsNullOrEmpty(_outputFile) ? GZipUtilities.GetStreamWriter(_outputFile) : null;

    var exitCode = ExitCodes.Success;

    using (var queryProcessor = new QueryProcessor(GZipUtilities.GetAppropriateStreamReader(_inputJson), FileUtilities.GetReadStream(indexFileName), writer))
    {
        if (_list)
        {
            queryProcessor.ListChromosomesAndSections();
        }
        else if (_printHeaderOnly)
        {
            queryProcessor.PrintHeaderOnly();
        }
        else if (!string.IsNullOrEmpty(_section))
        {
            queryProcessor.PrintSection(_section);
        }
        else if (Queries == null)
        {
            Console.WriteLine("Please specify query region(s)");
            exitCode = ExitCodes.BadArguments;
        }
        else
        {
            queryProcessor.ProcessQuery(Queries, _printHeader);
        }
    }

    return exitCode;
}
/// <summary>
/// Reads the transcript cache for <c>_inputPrefix</c> and writes a VCF header
/// followed by the output of <c>CreateVcf</c> for every transcript to <c>_outPath</c>.
/// </summary>
public void Create()
{
    string cachePath = CacheConstants.TranscriptPath(_inputPrefix);

    using (var cacheReader = new GlobalCacheReader(cachePath))
    using (var vcfWriter = GZipUtilities.GetStreamWriter(_outPath))
    {
        // the header is written before the cache is read
        WriteVcfHeader(vcfWriter);

        var cache = cacheReader.Read();
        Console.Write("- found {0} transcripts... ", cache.Transcripts.Length);

        foreach (var entry in cache.Transcripts)
        {
            CreateVcf(vcfWriter, entry);
        }

        Console.WriteLine("finished.");
    }
}
/// <summary>
/// Loads the transcript cache, assigns internal ids to its genes and lets a
/// <c>GffCreator</c> write the transcript interval arrays to the output file.
/// </summary>
/// <returns><c>ExitCodes.Success</c> once the output has been written.</returns>
private static ExitCodes ProgramExecution()
{
    string cachePath = CacheConstants.TranscriptPath(_inputPrefix);

    // only the first element of the returned tuple is needed here
    var (refIndexToChromosome, _, _) = SequenceHelper.GetDictionaries(_compressedReferencePath);

    var cache = TranscriptCacheHelper.GetCache(cachePath, refIndexToChromosome);
    var geneToInternalId = InternalGenes.CreateDictionary(cache.Genes);

    var outputStream = GZipUtilities.GetStreamWriter(_outputFileName);

    using (var gffWriter = new GffWriter(outputStream))
    {
        new GffCreator(gffWriter, geneToInternalId).Create(cache.TranscriptIntervalArrays);
    }

    return ExitCodes.Success;
}
/// <summary>
/// Writes <c>ExpectedString</c> with <c>GZipUtilities.GetStreamWriter</c> and checks
/// that <c>GZipUtilities.GetAppropriateStreamReader</c> reads the same line back.
/// </summary>
public void GetAppropriateReadStream_Handle_BlockGZipFile()
{
    string path = RandomPath.GetRandomPath();

    using (var output = GZipUtilities.GetStreamWriter(path))
    {
        output.WriteLine(ExpectedString);
    }

    string firstLine;

    using (var input = GZipUtilities.GetAppropriateStreamReader(path))
    {
        firstLine = input.ReadLine();
    }

    Assert.Equal(ExpectedString, firstLine);
}
/// <summary>
/// Reads the transcript cache for <c>_cachePrefix</c>, registers its genes via
/// <c>AddGenesToDictionary</c> and writes every transcript to
/// <paramref name="outputPath"/>.
/// </summary>
/// <param name="outputPath">Path of the output file.</param>
public void Create(string outputPath)
{
    // the output writer is opened first, before the cache is read
    using (var gffWriter = GZipUtilities.GetStreamWriter(outputPath))
    {
        Console.Write("- reading {0}... ", Path.GetFileName(_cachePrefix));

        string cachePath = CacheConstants.TranscriptPath(_cachePrefix);
        var cache = GetCache(cachePath);

        Console.WriteLine("found {0:N0} transcripts.", cache.Transcripts.Length);

        AddGenesToDictionary(cache.Genes);

        Console.Write("- writing GFF entries... ");

        foreach (var entry in cache.Transcripts)
        {
            // the reference name is looked up by the transcript's reference index
            var referenceName = _referenceNames[entry.ReferenceIndex];
            Write(gffWriter, referenceName, entry);
        }

        Console.WriteLine("finished.");
    }
}
/// <summary>
/// Writes a leftover line to the writer of its chromosome, replacing the first two
/// columns with the "chr"-prefixed chromosome name and the leftover position.
/// A missing writer is created on demand, with a console warning.
/// </summary>
/// <param name="leftoverLocation">Supplies the chromosome and position to write.</param>
/// <param name="line">Tab-separated line whose third and later columns are kept.</param>
private void AppendToChromFile(GenomicLocation leftoverLocation, string line)
{
    var rawName = leftoverLocation.Chrom;

    // writer keys always carry the "chr" prefix
    var chromName = rawName.StartsWith("chr") ? rawName : "chr" + rawName;

    if (!_writers.TryGetValue(chromName, out var chromWriter))
    {
        Console.WriteLine($"Warning!! {chromName} was not present in source but is in destination");
        chromWriter = GZipUtilities.GetStreamWriter(chromName + ".vcf.gz");
        _writers.Add(chromName, chromWriter);
    }

    // split into at most three parts: the two replaced columns and the untouched remainder
    var columns = line.Split('\t', 3);

    chromWriter.WriteLine($"{chromName}\t{leftoverLocation.Position}\t{columns[2]}");
}
/// <summary>
/// Loads the transcript cache, assigns internal ids to its genes and lets a
/// <c>GffCreator</c>, configured with the selected transcript source, write the
/// transcript interval arrays to the output file.
/// </summary>
/// <returns><c>ExitCodes.Success</c> once the output has been written.</returns>
private static ExitCodes ProgramExecution()
{
    Source source = ParseVepCacheDirectoryMain.GetSource(_transcriptSource);
    string transcriptCachePath = CacheConstants.TranscriptPath(_inputPrefix);

    IDictionary<ushort, IChromosome> refIndexToChromosome =
        SequenceHelper.GetDictionaries(_compressedReferencePath).refIndexToChromosome;

    TranscriptCacheData cacheData = TranscriptCacheHelper.GetCache(transcriptCachePath, refIndexToChromosome);
    IDictionary<IGene, int> geneToInternalId = InternalGenes.CreateDictionary(cacheData.Genes);

    var outputStream = GZipUtilities.GetStreamWriter(_outputFileName);

    using (var gffWriter = new GffWriter(outputStream))
    {
        new GffCreator(gffWriter, geneToInternalId, source).Create(cacheData.TranscriptIntervalArrays);
    }

    return ExitCodes.Success;
}
/// <summary>
/// Either builds the index for the input JSON file (when index creation is
/// requested), or opens a <c>QueryProcessor</c> and runs exactly one of: print the
/// chromosome list, print the header, or process the query regions - checked in
/// that order.
/// </summary>
/// <returns>
/// <c>ExitCodes.BadArguments</c> when no other mode was selected and no query was
/// supplied; otherwise <c>ExitCodes.Success</c>.
/// </returns>
private static ExitCodes ProgramExecution()
{
    if (ConfigurationSettings.CreateIndex)
    {
        using (var indexCreator = new IndexCreator(ConfigurationSettings.InputJson))
        {
            indexCreator.CreateIndex();
        }

        return ExitCodes.Success;
    }

    var indexFileName = ConfigurationSettings.InputJson + JasixCommons.FileExt;
    ValidateIndexFile(indexFileName);

    // no output file => the query processor receives a null writer
    var writer = string.IsNullOrEmpty(ConfigurationSettings.OutputFile)
        ? null
        : GZipUtilities.GetStreamWriter(ConfigurationSettings.OutputFile);

    using (var queryProcessor = new QueryProcessor(GZipUtilities.GetAppropriateStreamReader(ConfigurationSettings.InputJson), FileUtilities.GetReadStream(indexFileName), writer))
    {
        if (ConfigurationSettings.ListChromosomeName)
        {
            queryProcessor.PrintChromosomeList();
            return ExitCodes.Success;
        }

        if (ConfigurationSettings.PrintHeaderOnly)
        {
            queryProcessor.PrintHeader();
            return ExitCodes.Success;
        }

        if (ConfigurationSettings.Queries == null)
        {
            // message typo fixed ("Plese" -> "Please")
            Console.WriteLine("Please specify query region");
            return ExitCodes.BadArguments;
        }

        queryProcessor.ProcessQuery(ConfigurationSettings.Queries, ConfigurationSettings.PrintHeader);
    }

    return ExitCodes.Success;
}
/// <summary>
/// Writes the header and then the merged genes to <paramref name="outputPath"/>,
/// one per line, each prefixed with a zero-based index and a tab. Genes are ordered
/// by reference index, start, end and symbol.
/// </summary>
/// <param name="outputPath">Path of the output file.</param>
public void Write(string outputPath)
{
    Console.WriteLine();
    Console.WriteLine("- serializing genes:");

    using (var geneWriter = GZipUtilities.GetStreamWriter(outputPath))
    {
        // lines end with "\n" regardless of the platform default
        geneWriter.NewLine = "\n";
        WriteHeader(geneWriter, _header);

        var orderedGenes = _mergedGenes
            .OrderBy(g => g.ReferenceIndex)
            .ThenBy(g => g.Start)
            .ThenBy(g => g.End)
            .ThenBy(g => g.Symbol);

        int nextIndex = 0;

        foreach (var gene in orderedGenes)
        {
            geneWriter.WriteLine($"{nextIndex}\t{gene}");
            nextIndex++;
        }

        Console.WriteLine(" - {0} genes written.", _mergedGenes.Count);
    }
}
/// <summary>
/// Loads the transcript cache for the configured cache prefix and passes every
/// regulatory element, together with the UCSC reference names, to
/// <c>WriteRegulatoryFeature</c>.
/// </summary>
protected override void ProgramExecution()
{
    var ucscNames = GetUcscReferenceNames(ConfigurationSettings.CompressedReferencePath);

    // the output writer is opened before the cache is read
    using (var gffWriter = GZipUtilities.GetStreamWriter(ConfigurationSettings.OutputFileName))
    {
        var transcriptCachePath = CacheConstants.TranscriptPath(ConfigurationSettings.CachePrefix);

        // load the cache
        Console.Write("- reading {0}... ", Path.GetFileName(transcriptCachePath));
        var cache = GetCache(transcriptCachePath);
        Console.WriteLine("found {0:N0} regulatory regions. ", cache.RegulatoryElements.Length);

        Console.Write("- writing GFF entries... ");

        foreach (var element in cache.RegulatoryElements)
        {
            WriteRegulatoryFeature(gffWriter, ucscNames, element);
        }

        Console.WriteLine("finished.");
    }
}
/// <summary>
/// Entry point. Expects exactly three arguments: the input exon alignment file,
/// the output transcript alignment file and the output conservation scores file.
/// Prints the usage text and returns when the argument count differs.
/// </summary>
/// <param name="args">Command-line arguments.</param>
public static void Main(string[] args)
{
    Console.WriteLine("Aggregate exon alignments into transcript alignments");

    const int expectedArgumentCount = 3;

    if (args.Length != expectedArgumentCount)
    {
        Console.WriteLine("usage: dotnet AminoAcidAligner.dll [input exon alignment FASTA file] [output transcript alignment file] [output AA conservation scores file]");
        return;
    }

    string exonAlignmentPath = args[0];
    string transcriptAlignmentPath = args[1];
    string conservationScoresPath = args[2];

    using (var exonReader = GZipUtilities.GetAppropriateStreamReader(exonAlignmentPath))
    using (var alignmentWriter = GZipUtilities.GetStreamWriter(transcriptAlignmentPath))
    using (var conservationWriter = GZipUtilities.GetStreamWriter(conservationScoresPath))
    {
        var alignmentCount = CreateTranscriptAlignments(exonReader, alignmentWriter, conservationWriter);
        Console.WriteLine($"Created {alignmentCount} transcript alignments");
    }
}
/// <summary>
/// Parses the VEP dump directory of every reference sequence, merges transcripts
/// and regulatory regions, and writes them together with the SIFT and PolyPhen
/// predictions to the ".transcripts.gz", ".regulatory.gz", ".sift.gz" and
/// ".polyphen.gz" files derived from <c>_outputStub</c>. Reference sequences without
/// a VEP directory only get empty prediction entries.
/// </summary>
/// <returns><c>ExitCodes.Success</c> once all reference sequences have been processed.</returns>
private static ExitCodes ProgramExecution()
{
    var transcriptSource = GetSource(_transcriptSource);
    var sequenceReader = new CompressedSequenceReader(FileUtilities.GetReadStream(_inputReferencePath));
    var vepRootDirectory = new VepRootDirectory(sequenceReader.RefNameToChromosome);
    var refIndexToVepDir = vepRootDirectory.GetRefIndexToVepDir(_inputVepDirectory);
    var genomeAssembly = GenomeAssemblyHelper.Convert(_genomeAssembly);
    long vepReleaseTicks = DateTime.Parse(_vepReleaseDate).Ticks;
    var idToGenbank = GetIdToGenbank(genomeAssembly, transcriptSource);

    // =========================
    // create the pre-cache file
    // =========================

    int numRefSeqs = sequenceReader.NumRefSeqs;
    var header = new IntermediateIoHeader(_vepVersion, vepReleaseTicks, transcriptSource, genomeAssembly, numRefSeqs);

    string siftPath = _outputStub + ".sift.gz";
    string polyphenPath = _outputStub + ".polyphen.gz";
    string transcriptPath = _outputStub + ".transcripts.gz";
    string regulatoryPath = _outputStub + ".regulatory.gz";
    string mergeLogPath = _outputStub + ".merge_transcripts.log";

    using (var mergeLogger = new TranscriptMergerLogger(FileUtilities.GetCreateStream(mergeLogPath)))
    using (var siftWriter = new PredictionWriter(GZipUtilities.GetStreamWriter(siftPath), header, IntermediateIoCommon.FileType.Sift))
    using (var polyphenWriter = new PredictionWriter(GZipUtilities.GetStreamWriter(polyphenPath), header, IntermediateIoCommon.FileType.Polyphen))
    using (var transcriptWriter = new MutableTranscriptWriter(GZipUtilities.GetStreamWriter(transcriptPath), header))
    using (var regulatoryRegionWriter = new RegulatoryRegionWriter(GZipUtilities.GetStreamWriter(regulatoryPath), header))
    {
        var parser = new VepCacheParser(transcriptSource);
        var noPredictions = new Dictionary<string, List<int>>();

        // process each reference sequence
        for (ushort refIndex = 0; refIndex < numRefSeqs; refIndex++)
        {
            var chromosome = sequenceReader.RefIndexToChromosome[refIndex];

            if (refIndexToVepDir.TryGetValue(refIndex, out string vepSubDir))
            {
                Console.WriteLine("Parsing reference sequence [{0}]:", chromosome.UcscName);

                var rawData = parser.ParseDumpDirectory(chromosome, vepSubDir);
                var mergedTranscripts = TranscriptMerger.Merge(mergeLogger, rawData.Transcripts, idToGenbank);
                var mergedRegulatoryRegions = RegulatoryRegionMerger.Merge(rawData.RegulatoryRegions);

                int rawCount = rawData.Transcripts.Count;
                int mergedCount = mergedTranscripts.Count;
                Console.WriteLine($"- # merged transcripts: {mergedCount}, # total transcripts: {rawCount}");

                WriteTranscripts(transcriptWriter, mergedTranscripts);
                WriteRegulatoryRegions(regulatoryRegionWriter, mergedRegulatoryRegions);
                WritePredictions(siftWriter, mergedTranscripts, x => x.SiftData, chromosome);
                WritePredictions(polyphenWriter, mergedTranscripts, x => x.PolyphenData, chromosome);
            }
            else
            {
                // no VEP directory for this reference: write empty prediction entries only
                siftWriter.Write(chromosome, noPredictions);
                polyphenWriter.Write(chromosome, noPredictions);
            }
        }
    }

    Console.WriteLine("\n{0} directories processed.", refIndexToVepDir.Count);

    return ExitCodes.Success;
}
/// <summary>
/// Writes the assembly-specific header lines and then a reduced eight-column record
/// for every 1000 Genomes entry for which <c>IsRefMinorPosition</c> returns true.
/// Does nothing when no 1000 Genomes reader is available.
/// </summary>
/// <exception cref="Exception">
/// Thrown when no header lines are available for the current assembly.
/// </exception>
private void ExtractFromOneKg()
{
    if (_oneKGenomeReader == null)
    {
        return;
    }

    using (var refMinorWriter = GZipUtilities.GetStreamWriter(RefMinorFileName))
    {
        // only GRCh37 and hg19 have header lines; anything else yields null
        List<string> headerLines =
            _assembly == GenomeAssembly.hg19 ? _refMinorHg19HeaderLines
            : _assembly == GenomeAssembly.GRCh37 ? _refMinorGrch37HeaderLines
            : null;

        if (headerLines == null)
        {
            throw new Exception("Unknown assembly for RefMinor Extraction");
        }

        foreach (var headerLine in headerLines)
        {
            refMinorWriter.Write(headerLine + "\n");
        }

        for (string line = _oneKGenomeReader.ReadLine(); line != null; line = _oneKGenomeReader.ReadLine())
        {
            // skip empty lines and comments
            if (string.IsNullOrWhiteSpace(line) || line.StartsWith("#"))
            {
                continue;
            }

            string[] fields = line.Split('\t');

            if (!IsRefMinorPosition(fields[VcfCommon.InfoIndex]))
            {
                continue;
            }

            // the counter is incremented before the mitochondrial filter below
            _refMinorCount++;

            var chrName = GetChrName(fields[VcfCommon.ChromIndex]);

            // skip mito for hg19
            bool isMitochondrial = chrName == "chrM" || chrName == "MT";

            if (_assembly == GenomeAssembly.hg19 && isMitochondrial)
            {
                continue;
            }

            // the last three columns are written as "."
            refMinorWriter.Write(
                $"{chrName}\t{fields[VcfCommon.PosIndex]}\t{fields[VcfCommon.IdIndex]}\t{fields[VcfCommon.RefIndex]}\t{fields[VcfCommon.AltIndex]}\t"
                + ".\t.\t.\n");
        }
    }
}
/// <summary>
/// Annotates every variant of the configured VCF file and writes each annotated
/// variant to the configured output file name with ".txt.gz" appended.
/// </summary>
/// <exception cref="FileNotSortedException">
/// Thrown when a reference name reappears after a different reference has been
/// processed, i.e. the input is not grouped by reference.
/// </exception>
/// <remarks>
/// Any exception raised while processing a line is rethrown with the offending line
/// stored under the "VcfLine" key of <c>Exception.Data</c>.
/// </remarks>
protected override void ProgramExecution()
{
    var processedReferences = new HashSet<string>();
    string previousReference = null;

    Console.WriteLine("Running Nirvana on {0}:", Path.GetFileName(ConfigurationSettings.VcfPath));

    var outputFilePath = ConfigurationSettings.OutputFileName + ".txt.gz";
    var annotationCreationTime = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss");

    var reader = new LiteVcfReader(ConfigurationSettings.VcfPath);

    var compressedSequence = new CompressedSequence();
    var compressedSequenceReader = new CompressedSequenceReader(FileUtilities.GetReadStream(ConfigurationSettings.CompressedReferencePath), compressedSequence);

    // The cache stream is created here, so it is disposed here once annotation is
    // finished; previously it was never closed.
    using (var transcriptCacheStream = new FileStream(CacheConstants.TranscriptPath(ConfigurationSettings.InputCachePrefix), FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        // "new" never yields null, so the former null check on the annotator was
        // unreachable and has been removed.
        var annotator = new PianoAnnotationSource(transcriptCacheStream, compressedSequenceReader);

        if (ConfigurationSettings.ForceMitochondrialAnnotation || reader.IsRcrsMitochondrion)
        {
            annotator.EnableMitochondrialAnnotation();
        }

        using (var writer = GZipUtilities.GetStreamWriter(outputFilePath))
        {
            WriteHeader(writer, annotationCreationTime);

            string vcfLine = null;

            try
            {
                while ((vcfLine = reader.ReadLine()) != null)
                {
                    var vcfVariant = CreateVcfVariant(vcfLine, reader.IsGatkGenomeVcf);

                    if (vcfVariant == null)
                    {
                        continue;
                    }

                    // check if the vcf is sorted: a reference may only reappear
                    // while it is still the current one
                    var currentReference = vcfVariant.ReferenceName;

                    if (currentReference != previousReference && processedReferences.Contains(currentReference))
                    {
                        throw new FileNotSortedException(
                            "The current input vcf file is not sorted. Please sort the vcf file before running variant annotation using a tool like vcf-sort in vcftools.");
                    }

                    // HashSet.Add leaves the set unchanged when the item is already present
                    processedReferences.Add(currentReference);
                    previousReference = currentReference;

                    var annotatedVariant = annotator.Annotate(vcfVariant);
                    writer.Write(annotatedVariant.ToString());
                }
            }
            catch (Exception e)
            {
                // embed the vcf line
                e.Data["VcfLine"] = vcfLine;
                throw;
            }
        }
    }
}
/// <summary>
/// Returns the writer for the JSON output: standard output when the configured
/// output file name is "-", otherwise a writer obtained from
/// <c>GZipUtilities.GetStreamWriter</c> for <paramref name="outputPath"/>.
/// </summary>
/// <param name="outputPath">Path used when the output does not go to standard output.</param>
/// <returns>The stream writer to use for the JSON output.</returns>
private static StreamWriter GetJsonStreamWriter(string outputPath)
{
    // the decision is based on the configured name, not on outputPath
    if (ConfigurationSettings.OutputFileName == "-")
    {
        return new StreamWriter(Console.OpenStandardOutput());
    }

    return GZipUtilities.GetStreamWriter(outputPath);
}
/// <summary>
/// Returns the writer for the gVCF output: standard output when
/// <paramref name="outputPath"/> is "-", otherwise a writer obtained from
/// <c>GZipUtilities.GetStreamWriter</c> for the path with ".genome.vcf.gz" appended.
/// </summary>
/// <param name="outputPath">Output path stub, or "-" for standard output.</param>
/// <returns>The stream writer to use for the gVCF output.</returns>
public static StreamWriter GetGvcfOutputWriter(string outputPath)
{
    if (outputPath == "-")
    {
        return new StreamWriter(Console.OpenStandardOutput());
    }

    return GZipUtilities.GetStreamWriter(outputPath + ".genome.vcf.gz");
}
/// <summary>
/// Executes the program: parses every VEP directory in UCSC karyotype order into the
/// intermediate output files derived from the configured output stub, prints the
/// parser statistics and finally converts the SIFT and PolyPhen predictions.
/// </summary>
protected override void ProgramExecution()
{
    var transcriptSource = ConfigurationSettings.ImportRefSeqTranscripts
        ? TranscriptDataSource.RefSeq
        : TranscriptDataSource.Ensembl;

    var referenceIndex = new ReferenceIndex(ConfigurationSettings.InputReferencePath);
    var vepDirectories = referenceIndex.GetUcscKaryotypeOrder(ConfigurationSettings.InputVepDirectory);
    var converter = new VepCacheParser(transcriptSource);
    var genomeAssembly = GenomeAssemblyUtilities.Convert(ConfigurationSettings.GenomeAssembly);

    // =========================
    // create the pre-cache file
    // =========================

    int numDirectoriesProcessed = 0;

    var stub = ConfigurationSettings.OutputStub;
    var transcriptPath = stub + ".transcripts.gz";
    var regulatoryPath = stub + ".regulatory.gz";
    var genePath = stub + ".genes.gz";
    var intronPath = stub + ".introns.gz";
    var exonPath = stub + ".exons.gz";
    var mirnaPath = stub + ".mirnas.gz";
    var siftPath = stub + ".sift.dat";
    var polyphenPath = stub + ".polyphen.dat";
    var cdnaPath = stub + ".cdnas.gz";
    var peptidePath = stub + ".peptides.gz";

    // the two prediction files are written as binary ".tmp" files
    using (var transcriptWriter = GZipUtilities.GetStreamWriter(transcriptPath))
    using (var regulatoryWriter = GZipUtilities.GetStreamWriter(regulatoryPath))
    using (var geneWriter = GZipUtilities.GetStreamWriter(genePath))
    using (var intronWriter = GZipUtilities.GetStreamWriter(intronPath))
    using (var exonWriter = GZipUtilities.GetStreamWriter(exonPath))
    using (var mirnaWriter = GZipUtilities.GetStreamWriter(mirnaPath))
    using (var siftWriter = GZipUtilities.GetBinaryWriter(siftPath + ".tmp"))
    using (var polyphenWriter = GZipUtilities.GetBinaryWriter(polyphenPath + ".tmp"))
    using (var cdnaWriter = GZipUtilities.GetStreamWriter(cdnaPath))
    using (var peptideWriter = GZipUtilities.GetStreamWriter(peptidePath))
    {
        // every text writer ends lines with "\n" regardless of the platform default
        var textWriters = new[]
        {
            transcriptWriter, regulatoryWriter, geneWriter, intronWriter,
            exonWriter, mirnaWriter, cdnaWriter, peptideWriter
        };

        foreach (var textWriter in textWriters)
        {
            textWriter.NewLine = "\n";
        }

        WriteHeader(transcriptWriter, GlobalImportCommon.FileType.Transcript, transcriptSource, genomeAssembly);
        WriteHeader(regulatoryWriter, GlobalImportCommon.FileType.Regulatory, transcriptSource, genomeAssembly);
        WriteHeader(geneWriter, GlobalImportCommon.FileType.Gene, transcriptSource, genomeAssembly);
        WriteHeader(intronWriter, GlobalImportCommon.FileType.Intron, transcriptSource, genomeAssembly);
        WriteHeader(exonWriter, GlobalImportCommon.FileType.Exon, transcriptSource, genomeAssembly);
        WriteHeader(mirnaWriter, GlobalImportCommon.FileType.MicroRna, transcriptSource, genomeAssembly);
        WriteHeader(siftWriter, GlobalImportCommon.FileType.Sift, transcriptSource, genomeAssembly);
        WriteHeader(polyphenWriter, GlobalImportCommon.FileType.PolyPhen, transcriptSource, genomeAssembly);
        WriteHeader(cdnaWriter, GlobalImportCommon.FileType.CDna, transcriptSource, genomeAssembly);
        WriteHeader(peptideWriter, GlobalImportCommon.FileType.Peptide, transcriptSource, genomeAssembly);

        // process each VEP directory; Item1 is used as the reference name and
        // Item2 is handed to the parser
        foreach (var refTuple in vepDirectories)
        {
            Console.WriteLine("Parsing reference sequence [{0}]:", refTuple.Item1);
            numDirectoriesProcessed++;

            var refIndex = referenceIndex.GetIndex(refTuple.Item1);

            converter.ParseDumpDirectory(refIndex, refTuple.Item2, transcriptWriter, regulatoryWriter, geneWriter,
                intronWriter, exonWriter, mirnaWriter, siftWriter, polyphenWriter, cdnaWriter, peptideWriter);
        }
    }

    Console.WriteLine("\n{0} directories processed.", numDirectoriesProcessed);
    converter.DumpStatistics();
    Console.WriteLine();

    // convert our protein function predictions
    var predictionConverter = new PredictionConverter(referenceIndex.NumReferenceSeqs);
    predictionConverter.Convert(siftPath, "SIFT", GlobalImportCommon.FileType.Sift);
    predictionConverter.Convert(polyphenPath, "PolyPhen", GlobalImportCommon.FileType.PolyPhen);
}