Exemplo n.º 1
0
        public void GZipReadAndWrite()
        {
            // Round trip: two lines written through GZipUtilities must be read back verbatim,
            // and a third read must return null because the stream is exhausted.
            const string firstExpected =
                "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.";
            const string secondExpected =
                "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.";

            var path = GetRandomPath();

            using (var output = GZipUtilities.GetStreamWriter(path))
            {
                output.WriteLine(firstExpected);
                output.WriteLine(secondExpected);
            }

            // slots 0 and 1 hold the two written lines, slot 2 the read past the end
            var observed = new string[3];

            using (var input = GZipUtilities.GetAppropriateStreamReader(path))
            {
                for (var i = 0; i < observed.Length; i++)
                {
                    observed[i] = input.ReadLine();
                }
            }

            Assert.Equal(firstExpected, observed[0]);
            Assert.Equal(secondExpected, observed[1]);
            Assert.Null(observed[2]);
        }
        /// <summary>
        /// Loads the transcript cache for the configured input prefix and hands the value of every
        /// regulatory-region interval to <c>WriteRegulatoryFeature</c> together with the output writer.
        /// </summary>
        /// <returns><c>ExitCodes.Success</c> once every entry has been written.</returns>
        private static ExitCodes ProgramExecution()
        {
            using (var gffWriter = GZipUtilities.GetStreamWriter(_outputFileName))
            {
                string transcriptCachePath = CacheConstants.TranscriptPath(_inputPrefix);
                var    dictionaries        = SequenceHelper.GetDictionaries(_referencePath);

                // load the cache
                Console.Write("- reading {0}... ", Path.GetFileName(transcriptCachePath));
                var cacheData    = TranscriptCacheHelper.GetCache(transcriptCachePath, dictionaries.refIndexToChromosome);
                var regionArrays = cacheData.RegulatoryRegionIntervalArrays;
                Console.WriteLine("found {0:N0} reference sequences. ", regionArrays.Length);

                Console.Write("- writing GFF entries... ");
                foreach (var regionArray in regionArrays)
                {
                    // null slots carry no intervals and are skipped
                    if (regionArray != null)
                    {
                        foreach (var region in regionArray.Array)
                        {
                            WriteRegulatoryFeature(gffWriter, region.Value);
                        }
                    }
                }
                Console.WriteLine("finished.");
            }

            return ExitCodes.Success;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Runs the two mapping passes: <c>ChromMapper</c> first, which also fills the temporary
        /// leftover file, then <c>LeftoverMapper</c> over that leftover file.
        /// </summary>
        /// <remarks>
        /// The writers returned by <c>ChromMapper.Map</c> are disposed in a <c>finally</c> block so
        /// they are closed even when the leftover pass throws.
        /// </remarks>
        /// <returns><c>ExitCodes.Success</c> when both passes complete.</returns>
        private static ExitCodes ProgramExecution()
        {
            const string tempLeftoverFilename = "LeftOvers.vcf.gz";
            Dictionary<string, StreamWriter> writers = null;

            ISequenceProvider srcSequenceProvider = ProviderUtilities.GetSequenceProvider(_srcRefSequence);
            ISequenceProvider desSequenceProvider = ProviderUtilities.GetSequenceProvider(_desRefSequence);

            try
            {
                using (var srcReader = GZipUtilities.GetAppropriateStreamReader(_srcMapFile))
                using (var destReader = GZipUtilities.GetAppropriateStreamReader(_destMapFile))
                using (var leftoverWriter = GZipUtilities.GetStreamWriter(tempLeftoverFilename))
                {
                    var chromMapper = new ChromMapper(srcReader, destReader, leftoverWriter, srcSequenceProvider, desSequenceProvider);
                    writers = chromMapper.Map();
                }

                // now we will try to map the leftovers; the destination map is re-opened so it is read from the start
                using (var destReader = GZipUtilities.GetAppropriateStreamReader(_destMapFile))
                using (var leftoverReader = GZipUtilities.GetAppropriateStreamReader(tempLeftoverFilename))
                {
                    var leftOverMapper = new LeftoverMapper(leftoverReader, destReader, writers, desSequenceProvider);
                    var leftoverCount  = leftOverMapper.Map();
                    Console.WriteLine($"{leftoverCount} leftovers mapped!!");
                }
            }
            finally
            {
                // writers stays null when the first pass failed before Map() returned
                if (writers != null)
                {
                    foreach (var writer in writers.Values)
                    {
                        writer.Dispose();
                    }
                }
            }

            return ExitCodes.Success;
        }
Exemplo n.º 4
0
        /// <summary>
        /// Copies a line from <paramref name="intputTsv"/> to <paramref name="outputTsv"/> when its position
        /// overlaps an intron flanking region or <c>AnyScorePassTheCutoff</c> accepts its score columns.
        /// </summary>
        /// <param name="intputTsv">Tab-separated input file to filter.</param>
        /// <param name="gffFile1">First GFF file passed to <c>GetIntronFlankingRegions</c>.</param>
        /// <param name="gffFile2">Second GFF file passed to <c>GetIntronFlankingRegions</c>.</param>
        /// <param name="outputTsv">Destination file for the retained lines.</param>
        public static void Filter(string intputTsv, string gffFile1, string gffFile2, string outputTsv)
        {
            var flankingRegions = GetIntronFlankingRegions(gffFile1, gffFile2);

            using (var reader = GZipUtilities.GetAppropriateStreamReader(intputTsv))
            using (var writer = GZipUtilities.GetStreamWriter(outputTsv))
            {
                long processed = 0;

                for (string row = reader.ReadLine(); row != null; row = reader.ReadLine())
                {
                    var    columns  = row.TrimEnd().Split('\t');
                    ushort chrIndex = GetChrIndex(columns[PredChrColumn]);
                    int    position = int.Parse(columns[PredPosColumn]);

                    bool keep = flankingRegions.OverlapsAny(chrIndex, position, position) ||
                                AnyScorePassTheCutoff(columns, PredScoreColumns, FreqCutoff);
                    if (keep)
                    {
                        // the untrimmed original line is written, not the split columns
                        writer.WriteLine(row);
                    }

                    processed++;
                    if (processed % 1_000_000 != 0)
                    {
                        continue;
                    }

                    // progress report once every million lines
                    Console.WriteLine($"Processed {processed} lines. Current position: {columns[PredChrColumn]}:{columns[PredPosColumn]}");
                }
            }
        }
Exemplo n.º 5
0
        /// <summary>
        /// Writes <c>OutHeader</c> followed by one tab-separated, newline-terminated record per
        /// entry of <c>_clinGenDictionary</c> to <paramref name="outFileName"/>.
        /// </summary>
        /// <param name="outFileName">Path of the file to create.</param>
        public void Write(string outFileName)
        {
            using (var output = GZipUtilities.GetStreamWriter(outFileName))
            {
                output.Write(OutHeader);

                foreach (var item in _clinGenDictionary.Values)
                {
                    var gains  = item.ObservedGains;
                    var losses = item.ObservedLosses;

                    // the three cases are mutually exclusive; anything else stays "unknown"
                    VariantType variantType;
                    if (gains > 0 && losses == 0)
                    {
                        variantType = VariantType.copy_number_gain;
                    }
                    else if (gains > 0 && losses > 0)
                    {
                        variantType = VariantType.copy_number_variation;
                    }
                    else if (gains == 0 && losses > 0)
                    {
                        variantType = VariantType.copy_number_loss;
                    }
                    else
                    {
                        variantType = VariantType.unknown;
                    }

                    var phenotypes   = string.Join(",", item.Phenotypes.ToArray());
                    var phenotypeIds = string.Join(",", item.PhenotypeIds.ToArray());

                    output.Write(
                        $"{item.Id}\t{item.Chromosome}\t{item.Start}\t{item.End}\t" +
                        $"{gains}\t{losses}\t{variantType}\t" +
                        $"{item.ClinicalInterpretation}\t{item.Validated}\t" +
                        $"{phenotypes}\t{phenotypeIds}\n");
                }
            }
        }
Exemplo n.º 6
0
        /// <summary>
        /// Opens the output file inside <paramref name="outputDirectory"/> and immediately writes the header.
        /// The file name is the key name, an underscore, the data source version with spaces replaced by
        /// underscores, and the ".gene.tsv.gz" suffix.
        /// </summary>
        /// <param name="outputDirectory">Directory in which the file is created.</param>
        /// <param name="dataSourceVersion">Supplies the version string for the file name; also passed to <c>GetHeader</c>.</param>
        /// <param name="assembly">Passed to <c>GetHeader</c>.</param>
        /// <param name="dataVersion">Passed to <c>GetHeader</c>.</param>
        /// <param name="keyName">File-name prefix; also passed to <c>GetHeader</c>.</param>
        /// <param name="isArray">Passed to <c>GetHeader</c>.</param>
        public GeneAnnotationTsvWriter(string outputDirectory, DataSourceVersion dataSourceVersion, string assembly, int dataVersion, string keyName,
                                       bool isArray)
        {
            string versionTag = dataSourceVersion.Version.Replace(" ", "_");
            string outputPath = Path.Combine(outputDirectory, $"{keyName}_{versionTag}.gene.tsv.gz");

            _writer = GZipUtilities.GetStreamWriter(outputPath);
            _writer.Write(GetHeader(dataSourceVersion, dataVersion, assembly, keyName, isArray));
        }
Exemplo n.º 7
0
        /// <summary>
        /// Streams the input VCF to the output file. Header lines are copied directly; record lines are
        /// buffered and handed to <c>FlushVcfLineBuffer</c> together with a map that flags allele ids
        /// seen more than once within the buffer.
        /// </summary>
        /// <returns>The value of <c>_noLinesRemoved</c> after the whole input has been processed.</returns>
        public int RemoveConflictingLines()
        {
            using (var input = GZipUtilities.GetAppropriateStreamReader(_inFile))
            using (var output = GZipUtilities.GetStreamWriter(_outFile))
            {
                // record lines collected since the last flush
                var bufferedLines = new List<string>(VcfBufferSize);
                // allele id -> true once that id has shown up a second time in the current buffer
                var conflictByAllele = new Dictionary<string, bool>();

                for (string line = input.ReadLine(); line != null; line = input.ReadLine())
                {
                    if (line.StartsWith("#"))
                    {
                        // header lines bypass the buffer
                        output.WriteLine(line);
                        continue;
                    }

                    // only the columns up to and including INFO are split out
                    var columns = line.Split(new[] { '\t' }, VcfCommon.InfoIndex + 1);

                    var chromosome = columns[VcfCommon.ChromIndex];
                    var position   = Convert.ToInt32(columns[VcfCommon.PosIndex]);
                    var refAllele  = columns[VcfCommon.RefIndex];
                    var altAlleles = columns[VcfCommon.AltIndex].Split(',');

                    bool startNewBuffer = chromosome != _currentRefSeq || position > _maxVidPosition;
                    if (startNewBuffer)
                    {
                        FlushVcfLineBuffer(bufferedLines, conflictByAllele, output);
                        bufferedLines.Clear();
                        conflictByAllele.Clear();

                        _currentRefSeq = chromosome;
                    }

                    foreach (var altAllele in altAlleles)
                    {
                        var alleleId = GetAlleleId(chromosome, position, refAllele, altAllele);

                        // first sighting registers the id as clean (false); any repeat marks it conflicting (true)
                        conflictByAllele[alleleId] = conflictByAllele.ContainsKey(alleleId);
                    }

                    bufferedLines.Add(line);
                }

                // whatever is still buffered at end of input
                FlushVcfLineBuffer(bufferedLines, conflictByAllele, output);
            }

            return _noLinesRemoved;
        }
Exemplo n.º 8
0
        /// <summary>
        /// Reads records from <c>_cosmicReader</c> and writes a reduced eight-column line (chromosome,
        /// position, id, ref, alt and three "." placeholders) for each record that passes the filters.
        /// Does nothing when no reader is configured.
        /// </summary>
        private void ExtractFromCosmic()
        {
            if (_cosmicReader == null)
            {
                return;
            }

            using (var output = GZipUtilities.GetStreamWriter(OncogenicFileName))
            {
                for (string line = _cosmicReader.ReadLine(); line != null; line = _cosmicReader.ReadLine())
                {
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    if (line.StartsWith("#"))
                    {
                        // header handling is delegated to ProcessHeaderLine
                        ProcessHeaderLine(output, line);
                        continue;
                    }

                    var columns = line.Split('\t');

                    bool rejected = IsLargeVariants(columns[VcfCommon.RefIndex], columns[VcfCommon.AltIndex]) ||
                                    !HasMinCount(columns[VcfCommon.InfoIndex]);
                    if (rejected)
                    {
                        continue;
                    }

                    // counted before the mitochondrial filter below, so skipped mito records are included
                    _cosmicCount++;

                    var chrName = GetChrName(columns[VcfCommon.ChromIndex]);

                    // skip mito for hg19
                    bool isMito = chrName == "chrM" || chrName == "MT";
                    if (_assembly == GenomeAssembly.hg19 && isMito)
                    {
                        continue;
                    }

                    output.Write(
                        $"{chrName}\t{columns[VcfCommon.PosIndex]}\t{columns[VcfCommon.IdIndex]}\t" +
                        $"{columns[VcfCommon.RefIndex]}\t{columns[VcfCommon.AltIndex]}\t" +
                        ".\t.\t.\n");
                }
            }
        }
Exemplo n.º 9
0
        /// <summary>
        /// Reads records from <c>_clinvarReader</c> and writes a reduced eight-column line for every
        /// non-SNV record whose reference allele is accepted by <c>ValidateReference</c>.
        /// Does nothing when no reader is configured.
        /// </summary>
        private void ExtractFromClinVar()
        {
            if (_clinvarReader == null)
            {
                return;
            }

            using (var output = GZipUtilities.GetStreamWriter(IsisClinicalIndelFileName))
            {
                for (string line = _clinvarReader.ReadLine(); line != null; line = _clinvarReader.ReadLine())
                {
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    if (line.StartsWith("#"))
                    {
                        // header handling is delegated to ProcessHeaderLine
                        ProcessHeaderLine(output, line);
                        continue;
                    }

                    var columns = line.Split('\t');

                    if (IsSnv(columns[VcfCommon.RefIndex], columns[VcfCommon.AltIndex]))
                    {
                        continue;
                    }

                    // counted before the mitochondrial filter and the reference validation below
                    _clinvarCount++;
                    var chrName = GetChrName(columns[VcfCommon.ChromIndex]);

                    // skip mito for hg19
                    bool isMito = chrName == "chrM" || chrName == "MT";
                    if (_assembly == GenomeAssembly.hg19 && isMito)
                    {
                        continue;
                    }

                    var position  = Convert.ToInt32(columns[VcfCommon.PosIndex]);
                    var refAllele = columns[VcfCommon.RefIndex];

                    if (!ValidateReference(chrName, position, refAllele))
                    {
                        continue;
                    }

                    output.Write(
                        $"{chrName}\t{position}\t{columns[VcfCommon.IdIndex]}\t" +
                        $"{refAllele}\t{columns[VcfCommon.AltIndex]}\t" +
                        ".\t.\t.\n");
                }
            }
        }
Exemplo n.º 10
0
        /// <summary>
        /// Writes <paramref name="vcfLine"/> to the writer registered for <paramref name="chrom"/>, with its
        /// first two columns replaced by <paramref name="chrom"/> and the absolute value of <paramref name="pos"/>.
        /// A writer for "&lt;chrom&gt;.vcf.gz" is created and registered on first use.
        /// </summary>
        /// <param name="chrom">Chromosome name; also the key into <c>_writers</c>.</param>
        /// <param name="pos">Position to write; its sign is dropped.</param>
        /// <param name="vcfLine">Tab-separated line; everything after the second tab is kept as is.</param>
        private void WriteRemappedEntry(string chrom, int pos, string vcfLine)
        {
            if (!_writers.TryGetValue(chrom, out var chromWriter))
            {
                chromWriter     = GZipUtilities.GetStreamWriter(chrom + ".vcf.gz");
                _writers[chrom] = chromWriter;
            }

            // split into at most three parts: CHROM, POS and the untouched remainder
            var columns = vcfLine.Split('\t', 3);

            chromWriter.WriteLine($"{chrom}\t{Math.Abs(pos)}\t{columns[2]}");
        }
Exemplo n.º 11
0
        /// <summary>
        /// Writes every entry in <paramref name="entries"/> to the file at <c>GenbankFilePath</c> through a
        /// <c>GenbankWriter</c>, logging progress before and after.
        /// </summary>
        /// <param name="logger">Receives the progress messages.</param>
        /// <param name="entries">Genbank entries to serialize.</param>
        private static void WriteDictionary(ILogger logger, IEnumerable<GenbankEntry> entries)
        {
            // the header is built from fixed placeholder values (zeros, Source.None, GenomeAssembly.Unknown)
            var placeholderHeader = new IntermediateIoHeader(0, 0, Source.None, GenomeAssembly.Unknown, 0);
            string genbankFileName = Path.GetFileName(GenbankFilePath);

            logger.Write($"- writing Genbank file ({genbankFileName})... ");

            var stream = GZipUtilities.GetStreamWriter(GenbankFilePath);
            using (var genbankWriter = new GenbankWriter(stream, placeholderHeader))
            {
                foreach (var genbankEntry in entries)
                {
                    genbankWriter.Write(genbankEntry);
                }
            }

            logger.WriteLine("finished.");
        }
Exemplo n.º 12
0
        /// <summary>
        /// Either builds the index for the input JSON (when <c>_createIndex</c> is set) or runs exactly one
        /// query-mode action: list, header only, section, or region queries, checked in that order.
        /// </summary>
        /// <returns>
        /// <c>ExitCodes.BadArguments</c> when no action flag is set and no queries were supplied;
        /// otherwise <c>ExitCodes.Success</c>.
        /// </returns>
        private static ExitCodes ProgramExecution()
        {
            if (_createIndex)
            {
                using (var creator = new IndexCreator(_inputJson))
                {
                    creator.CreateIndex();
                }

                return ExitCodes.Success;
            }

            string indexPath = _inputJson + JasixCommons.FileExt;
            ValidateIndexFile(indexPath);

            // with no output file configured the processor receives a null writer
            var outputWriter = !string.IsNullOrEmpty(_outputFile)
                ? GZipUtilities.GetStreamWriter(_outputFile)
                : null;

            var jsonReader  = GZipUtilities.GetAppropriateStreamReader(_inputJson);
            var indexStream = FileUtilities.GetReadStream(indexPath);

            using (var processor = new QueryProcessor(jsonReader, indexStream, outputWriter))
            {
                if (_list)
                {
                    processor.ListChromosomesAndSections();
                }
                else if (_printHeaderOnly)
                {
                    processor.PrintHeaderOnly();
                }
                else if (!string.IsNullOrEmpty(_section))
                {
                    processor.PrintSection(_section);
                }
                else if (Queries == null)
                {
                    Console.WriteLine("Please specify query region(s)");
                    return ExitCodes.BadArguments;
                }
                else
                {
                    processor.ProcessQuery(Queries, _printHeader);
                }
            }

            return ExitCodes.Success;
        }
Exemplo n.º 13
0
        /// <summary>
        /// Writes the VCF header to <c>_outPath</c>, then reads the transcript cache for
        /// <c>_inputPrefix</c> and calls <c>CreateVcf</c> once per transcript.
        /// </summary>
        public void Create()
        {
            string cachePath = CacheConstants.TranscriptPath(_inputPrefix);

            using (var cacheReader = new GlobalCacheReader(cachePath))
            using (var vcfWriter = GZipUtilities.GetStreamWriter(_outPath))
            {
                // the header goes out before the cache is read
                WriteVcfHeader(vcfWriter);

                var transcripts = cacheReader.Read().Transcripts;
                Console.Write("- found {0} transcripts... ", transcripts.Length);

                foreach (var cachedTranscript in transcripts)
                {
                    CreateVcf(vcfWriter, cachedTranscript);
                }

                Console.WriteLine("finished.");
            }
        }
Exemplo n.º 14
0
        /// <summary>
        /// Loads the transcript cache, builds the gene-to-internal-id dictionary and lets a
        /// <c>GffCreator</c> write the transcript interval arrays to <c>_outputFileName</c>.
        /// </summary>
        /// <returns><c>ExitCodes.Success</c> once the output has been written.</returns>
        private static ExitCodes ProgramExecution()
        {
            string transcriptCachePath = CacheConstants.TranscriptPath(_inputPrefix);

            // only the first of the three returned values is needed here
            var (refIndexToChromosome, _, _) = SequenceHelper.GetDictionaries(_compressedReferencePath);
            var cacheData       = TranscriptCacheHelper.GetCache(transcriptCachePath, refIndexToChromosome);
            var internalGeneIds = InternalGenes.CreateDictionary(cacheData.Genes);

            var outputStream = GZipUtilities.GetStreamWriter(_outputFileName);
            using (var gffWriter = new GffWriter(outputStream))
            {
                new GffCreator(gffWriter, internalGeneIds).Create(cacheData.TranscriptIntervalArrays);
            }

            return ExitCodes.Success;
        }
Exemplo n.º 15
0
        public void GetAppropriateReadStream_Handle_BlockGZipFile()
        {
            // A line written with GetStreamWriter must come back unchanged through
            // GetAppropriateStreamReader.
            string path = RandomPath.GetRandomPath();

            using (var output = GZipUtilities.GetStreamWriter(path))
            {
                output.WriteLine(ExpectedString);
            }

            string firstLine;
            using (var input = GZipUtilities.GetAppropriateStreamReader(path))
            {
                firstLine = input.ReadLine();
            }

            Assert.Equal(ExpectedString, firstLine);
        }
Exemplo n.º 16
0
        /// <summary>
        /// Reads the transcript cache for <c>_cachePrefix</c>, registers its genes via
        /// <c>AddGenesToDictionary</c> and writes one entry per transcript to <paramref name="outputPath"/>.
        /// </summary>
        /// <param name="outputPath">Path of the file to create.</param>
        public void Create(string outputPath)
        {
            using (var gffWriter = GZipUtilities.GetStreamWriter(outputPath))
            {
                Console.Write("- reading {0}... ", Path.GetFileName(_cachePrefix));
                var cacheData   = GetCache(CacheConstants.TranscriptPath(_cachePrefix));
                var transcripts = cacheData.Transcripts;
                Console.WriteLine("found {0:N0} transcripts.", transcripts.Length);

                AddGenesToDictionary(cacheData.Genes);

                Console.Write("- writing GFF entries... ");
                foreach (var cachedTranscript in transcripts)
                {
                    // the reference name is looked up by the transcript's reference index
                    var referenceName = _referenceNames[cachedTranscript.ReferenceIndex];
                    Write(gffWriter, referenceName, cachedTranscript);
                }
                Console.WriteLine("finished.");
            }
        }
Exemplo n.º 17
0
        /// <summary>
        /// Writes <paramref name="line"/> to the writer of the chromosome named by
        /// <paramref name="leftoverLocation"/>, replacing its first two columns with the "chr"-prefixed
        /// chromosome name and the location's position. A missing writer is created with a console warning.
        /// </summary>
        /// <param name="leftoverLocation">Supplies the chromosome name and position to write.</param>
        /// <param name="line">Tab-separated line; everything after the second tab is kept as is.</param>
        private void AppendToChromFile(GenomicLocation leftoverLocation, string line)
        {
            var rawName   = leftoverLocation.Chrom;
            var chromName = rawName.StartsWith("chr") ? rawName : "chr" + rawName;

            if (!_writers.TryGetValue(chromName, out var chromWriter))
            {
                Console.WriteLine($"Warning!! {chromName} was not present in source but is in destination");
                chromWriter = GZipUtilities.GetStreamWriter(chromName + ".vcf.gz");
                _writers.Add(chromName, chromWriter);
            }

            // split into at most three parts: CHROM, POS and the untouched remainder
            var columns = line.Split('\t', 3);

            chromWriter.WriteLine($"{chromName}\t{leftoverLocation.Position}\t{columns[2]}");
        }
Exemplo n.º 18
0
        /// <summary>
        /// Loads the transcript cache, builds the gene-to-internal-id dictionary and lets a
        /// <c>GffCreator</c> configured with the parsed transcript source write the transcript
        /// interval arrays to <c>_outputFileName</c>.
        /// </summary>
        /// <returns><c>ExitCodes.Success</c> once the output has been written.</returns>
        private static ExitCodes ProgramExecution()
        {
            Source source              = ParseVepCacheDirectoryMain.GetSource(_transcriptSource);
            string transcriptCachePath = CacheConstants.TranscriptPath(_inputPrefix);

            IDictionary<ushort, IChromosome> chromosomesByRefIndex =
                SequenceHelper.GetDictionaries(_compressedReferencePath).refIndexToChromosome;

            TranscriptCacheData     cacheData       = TranscriptCacheHelper.GetCache(transcriptCachePath, chromosomesByRefIndex);
            IDictionary<IGene, int> internalGeneIds = InternalGenes.CreateDictionary(cacheData.Genes);

            var outputStream = GZipUtilities.GetStreamWriter(_outputFileName);
            using (var gffWriter = new GffWriter(outputStream))
            {
                new GffCreator(gffWriter, internalGeneIds, source).Create(cacheData.TranscriptIntervalArrays);
            }

            return ExitCodes.Success;
        }
Exemplo n.º 19
0
        /// <summary>
        /// Either builds the index for the input JSON (when <c>ConfigurationSettings.CreateIndex</c> is set)
        /// or runs exactly one query-mode action: chromosome list, header only, or region queries,
        /// checked in that order.
        /// </summary>
        /// <returns>
        /// <c>ExitCodes.BadArguments</c> when no action flag is set and no queries were supplied;
        /// otherwise <c>ExitCodes.Success</c>.
        /// </returns>
        private static ExitCodes ProgramExecution()
        {
            if (ConfigurationSettings.CreateIndex)
            {
                using (var indexCreator = new IndexCreator(ConfigurationSettings.InputJson))
                {
                    indexCreator.CreateIndex();
                }

                return ExitCodes.Success;
            }

            var indexFileName = ConfigurationSettings.InputJson + JasixCommons.FileExt;

            ValidateIndexFile(indexFileName);

            // with no output file configured the processor receives a null writer
            var writer = string.IsNullOrEmpty(ConfigurationSettings.OutputFile)
                ? null : GZipUtilities.GetStreamWriter(ConfigurationSettings.OutputFile);

            using (var queryProcessor = new QueryProcessor(GZipUtilities.GetAppropriateStreamReader(ConfigurationSettings.InputJson),
                                                           FileUtilities.GetReadStream(indexFileName), writer))
            {
                if (ConfigurationSettings.ListChromosomeName)
                {
                    queryProcessor.PrintChromosomeList();
                    return ExitCodes.Success;
                }

                if (ConfigurationSettings.PrintHeaderOnly)
                {
                    queryProcessor.PrintHeader();
                    return ExitCodes.Success;
                }

                if (ConfigurationSettings.Queries == null)
                {
                    // user-facing hint; spelling corrected from "Plese"
                    Console.WriteLine("Please specify query region");
                    return ExitCodes.BadArguments;
                }

                queryProcessor.ProcessQuery(ConfigurationSettings.Queries, ConfigurationSettings.PrintHeader);
            }

            return ExitCodes.Success;
        }
Exemplo n.º 20
0
        /// <summary>
        /// Writes the header and then every merged gene, one per line, as "index TAB gene" to
        /// <paramref name="outputPath"/>. Genes are ordered by reference index, start, end and symbol,
        /// and lines end with "\n".
        /// </summary>
        /// <param name="outputPath">Path of the file to create.</param>
        public void Write(string outputPath)
        {
            Console.WriteLine();
            Console.WriteLine("- serializing genes:");

            using (var geneWriter = GZipUtilities.GetStreamWriter(outputPath))
            {
                geneWriter.NewLine = "\n";
                WriteHeader(geneWriter, _header);

                var sortedGenes = from gene in _mergedGenes
                                  orderby gene.ReferenceIndex, gene.Start, gene.End, gene.Symbol
                                  select gene;

                int nextIndex = 0;
                foreach (var gene in sortedGenes)
                {
                    geneWriter.WriteLine($"{nextIndex}\t{gene}");
                    nextIndex++;
                }

                Console.WriteLine("  - {0} genes written.", _mergedGenes.Count);
            }
        }
Exemplo n.º 21
0
        /// <summary>
        /// Loads the transcript cache for the configured cache prefix and passes each regulatory element
        /// to <c>WriteRegulatoryFeature</c> together with the output writer and the UCSC reference names.
        /// </summary>
        protected override void ProgramExecution()
        {
            var ucscNames = GetUcscReferenceNames(ConfigurationSettings.CompressedReferencePath);

            using (var gffWriter = GZipUtilities.GetStreamWriter(ConfigurationSettings.OutputFileName))
            {
                var transcriptCachePath = CacheConstants.TranscriptPath(ConfigurationSettings.CachePrefix);

                // load the cache
                Console.Write("- reading {0}... ", Path.GetFileName(transcriptCachePath));
                var regulatoryElements = GetCache(transcriptCachePath).RegulatoryElements;
                Console.WriteLine("found {0:N0} regulatory regions. ", regulatoryElements.Length);

                Console.Write("- writing GFF entries... ");
                foreach (var element in regulatoryElements)
                {
                    WriteRegulatoryFeature(gffWriter, ucscNames, element);
                }
                Console.WriteLine("finished.");
            }
        }
Exemplo n.º 22
0
        /// <summary>
        /// Entry point: expects exactly three paths (exon alignment input, transcript alignment output,
        /// conservation scores output) and runs <c>CreateTranscriptAlignments</c> over them.
        /// Prints a usage line and returns when the argument count is wrong.
        /// </summary>
        /// <param name="args">Command-line arguments.</param>
        public static void Main(string[] args)
        {
            Console.WriteLine("Aggregate exon alignments into transcript alignments");

            if (args.Length != 3)
            {
                Console.WriteLine("usage: dotnet AminoAcidAligner.dll [input exon alignment FASTA file] [output transcript alignment file] [output AA conservation scores file]");
                return;
            }

            string exonAlignmentPath       = args[0];
            string transcriptAlignmentPath = args[1];
            string conservationScoresPath  = args[2];

            using (var exonReader = GZipUtilities.GetAppropriateStreamReader(exonAlignmentPath))
            using (var alignmentWriter = GZipUtilities.GetStreamWriter(transcriptAlignmentPath))
            using (var scoresWriter = GZipUtilities.GetStreamWriter(conservationScoresPath))
            {
                var alignmentCount = CreateTranscriptAlignments(exonReader, alignmentWriter, scoresWriter);
                Console.WriteLine($"Created {alignmentCount} transcript alignments");
            }
        }
        /// <summary>
        /// Walks every reference sequence, parses its VEP dump directory when one exists, merges the
        /// transcripts and regulatory regions, and writes the results to the ".sift.gz", ".polyphen.gz",
        /// ".transcripts.gz" and ".regulatory.gz" files derived from <c>_outputStub</c>.
        /// </summary>
        /// <returns><c>ExitCodes.Success</c> once all reference sequences have been processed.</returns>
        private static ExitCodes ProgramExecution()
        {
            var source         = GetSource(_transcriptSource);
            var sequenceReader = new CompressedSequenceReader(FileUtilities.GetReadStream(_inputReferencePath));
            var vepRoot        = new VepRootDirectory(sequenceReader.RefNameToChromosome);
            var vepDirByRef    = vepRoot.GetRefIndexToVepDir(_inputVepDirectory);

            var  assembly     = GenomeAssemblyHelper.Convert(_genomeAssembly);
            long releaseTicks = DateTime.Parse(_vepReleaseDate).Ticks;
            var  idToGenbank  = GetIdToGenbank(assembly, source);

            // =========================
            // create the pre-cache file
            // =========================

            int numRefSeqs = sequenceReader.NumRefSeqs;
            var header     = new IntermediateIoHeader(_vepVersion, releaseTicks, source, assembly, numRefSeqs);

            string siftPath       = _outputStub + ".sift.gz";
            string polyphenPath   = _outputStub + ".polyphen.gz";
            string transcriptPath = _outputStub + ".transcripts.gz";
            string regulatoryPath = _outputStub + ".regulatory.gz";

            using (var mergeLogger = new TranscriptMergerLogger(FileUtilities.GetCreateStream(_outputStub + ".merge_transcripts.log")))
            using (var siftWriter = new PredictionWriter(GZipUtilities.GetStreamWriter(siftPath), header, IntermediateIoCommon.FileType.Sift))
            using (var polyphenWriter = new PredictionWriter(GZipUtilities.GetStreamWriter(polyphenPath), header, IntermediateIoCommon.FileType.Polyphen))
            using (var transcriptWriter = new MutableTranscriptWriter(GZipUtilities.GetStreamWriter(transcriptPath), header))
            using (var regulatoryWriter = new RegulatoryRegionWriter(GZipUtilities.GetStreamWriter(regulatoryPath), header))
            {
                var parser        = new VepCacheParser(source);
                var noPredictions = new Dictionary<string, List<int>>();

                // process each VEP directory
                for (ushort refIndex = 0; refIndex < numRefSeqs; refIndex++)
                {
                    var chromosome = sequenceReader.RefIndexToChromosome[refIndex];

                    if (!vepDirByRef.TryGetValue(refIndex, out string vepSubDir))
                    {
                        // no VEP directory for this reference: both prediction files still receive an empty entry
                        siftWriter.Write(chromosome, noPredictions);
                        polyphenWriter.Write(chromosome, noPredictions);
                        continue;
                    }

                    Console.WriteLine("Parsing reference sequence [{0}]:", chromosome.UcscName);

                    var rawData           = parser.ParseDumpDirectory(chromosome, vepSubDir);
                    var mergedTranscripts = TranscriptMerger.Merge(mergeLogger, rawData.Transcripts, idToGenbank);
                    var mergedRegulatory  = RegulatoryRegionMerger.Merge(rawData.RegulatoryRegions);

                    int rawCount    = rawData.Transcripts.Count;
                    int mergedCount = mergedTranscripts.Count;
                    Console.WriteLine($"- # merged transcripts: {mergedCount}, # total transcripts: {rawCount}");

                    WriteTranscripts(transcriptWriter, mergedTranscripts);
                    WriteRegulatoryRegions(regulatoryWriter, mergedRegulatory);
                    WritePredictions(siftWriter, mergedTranscripts, x => x.SiftData, chromosome);
                    WritePredictions(polyphenWriter, mergedTranscripts, x => x.PolyphenData, chromosome);
                }
            }

            Console.WriteLine("\n{0} directories processed.", vepDirByRef.Count);

            return ExitCodes.Success;
        }
Exemplo n.º 24
0
        /// <summary>
        /// Writes the assembly-specific header lines followed by a reduced eight-column line for every
        /// record of <c>_oneKGenomeReader</c> whose INFO field is accepted by <c>IsRefMinorPosition</c>.
        /// Does nothing when no reader is configured.
        /// </summary>
        /// <exception cref="Exception">No header lines are available for the current assembly.</exception>
        private void ExtractFromOneKg()
        {
            if (_oneKGenomeReader == null)
            {
                return;
            }

            using (var output = GZipUtilities.GetStreamWriter(RefMinorFileName))
            {
                List<string> headerLines;
                switch (_assembly)
                {
                    case GenomeAssembly.GRCh37:
                        headerLines = _refMinorGrch37HeaderLines;
                        break;
                    case GenomeAssembly.hg19:
                        headerLines = _refMinorHg19HeaderLines;
                        break;
                    default:
                        headerLines = null;
                        break;
                }

                // also reached when the matching header field itself is null
                if (headerLines == null)
                {
                    throw new Exception("Unknown assembly for RefMinor Extraction");
                }

                foreach (var headerLine in headerLines)
                {
                    output.Write(headerLine + "\n");
                }

                for (string line = _oneKGenomeReader.ReadLine(); line != null; line = _oneKGenomeReader.ReadLine())
                {
                    // empty lines and comment lines carry no record
                    if (string.IsNullOrWhiteSpace(line) || line.StartsWith("#"))
                    {
                        continue;
                    }

                    var columns = line.Split('\t');

                    if (!IsRefMinorPosition(columns[VcfCommon.InfoIndex]))
                    {
                        continue;
                    }

                    // counted before the mitochondrial filter below, so skipped mito records are included
                    _refMinorCount++;
                    var chrName = GetChrName(columns[VcfCommon.ChromIndex]);

                    // skip mito for hg19
                    bool isMito = chrName == "chrM" || chrName == "MT";
                    if (_assembly == GenomeAssembly.hg19 && isMito)
                    {
                        continue;
                    }

                    output.Write(
                        $"{chrName}\t{columns[VcfCommon.PosIndex]}\t{columns[VcfCommon.IdIndex]}\t" +
                        $"{columns[VcfCommon.RefIndex]}\t{columns[VcfCommon.AltIndex]}\t" +
                        ".\t.\t.\n");
                }
            }
        }
Exemplo n.º 25
0
        /// <summary>
        /// Annotates every variant read from the configured VCF file and writes the
        /// string form of each annotated variant to "&lt;OutputFileName&gt;.txt.gz".
        /// </summary>
        /// <exception cref="FileNotSortedException">
        /// Thrown when a reference name shows up again after a different reference
        /// has been processed in between, i.e. the input is not grouped by reference.
        /// </exception>
        /// <remarks>
        /// Any exception raised while processing a line is rethrown with the offending
        /// line stored in <c>Exception.Data["VcfLine"]</c>.
        /// </remarks>
        protected override void ProgramExecution()
        {
            var    processedReferences = new HashSet<string>();
            string previousReference   = null;

            Console.WriteLine("Running Nirvana on {0}:", Path.GetFileName(ConfigurationSettings.VcfPath));

            var outputFilePath         = ConfigurationSettings.OutputFileName + ".txt.gz";
            var annotationCreationTime = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss");

            // NOTE(review): reader and compressedSequenceReader also hold open files; wrap them
            // in using statements as well if their types implement IDisposable - confirm.
            var reader = new LiteVcfReader(ConfigurationSettings.VcfPath);

            var compressedSequence       = new CompressedSequence();
            var compressedSequenceReader = new CompressedSequenceReader(FileUtilities.GetReadStream(ConfigurationSettings.CompressedReferencePath), compressedSequence);

            // the cache stream is owned by this method, so release it on every exit path
            using (var transcriptCacheStream = new FileStream(CacheConstants.TranscriptPath(ConfigurationSettings.InputCachePrefix),
                                                              FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                var annotator = new PianoAnnotationSource(transcriptCacheStream, compressedSequenceReader);

                if (ConfigurationSettings.ForceMitochondrialAnnotation || reader.IsRcrsMitochondrion)
                {
                    annotator.EnableMitochondrialAnnotation();
                }

                using (var writer = GZipUtilities.GetStreamWriter(outputFilePath))
                {
                    WriteHeader(writer, annotationCreationTime);
                    string vcfLine = null;

                    try
                    {
                        while ((vcfLine = reader.ReadLine()) != null)
                        {
                            var vcfVariant = CreateVcfVariant(vcfLine, reader.IsGatkGenomeVcf);

                            // lines that do not produce a variant are skipped
                            if (vcfVariant == null)
                            {
                                continue;
                            }

                            // check if the vcf is sorted: a reference may only be revisited
                            // while it is still the current one
                            var currentReference = vcfVariant.ReferenceName;
                            if (currentReference != previousReference && processedReferences.Contains(currentReference))
                            {
                                throw new FileNotSortedException(
                                          "The current input vcf file is not sorted. Please sort the vcf file before running variant annotation using a tool like vcf-sort in vcftools.");
                            }

                            // HashSet.Add is a no-op for references that were already recorded
                            processedReferences.Add(currentReference);
                            previousReference = currentReference;

                            var annotatedVariant = annotator.Annotate(vcfVariant);

                            writer.Write(annotatedVariant.ToString());
                        }
                    }
                    catch (Exception e)
                    {
                        // embed the vcf line
                        e.Data["VcfLine"] = vcfLine;
                        throw;
                    }
                }
            }
        }
Exemplo n.º 26
0
 /// <summary>
 /// Returns the writer that receives the JSON output: standard output when the
 /// configured output file name is "-", otherwise the writer that
 /// GZipUtilities.GetStreamWriter opens for <paramref name="outputPath"/>.
 /// </summary>
 /// <param name="outputPath">path handed to GZipUtilities.GetStreamWriter when not writing to standard output</param>
 private static StreamWriter GetJsonStreamWriter(string outputPath)
 {
     // the decision is based on the configured output name, not on outputPath
     if (ConfigurationSettings.OutputFileName == "-")
     {
         return new StreamWriter(Console.OpenStandardOutput());
     }

     return GZipUtilities.GetStreamWriter(outputPath);
 }
Exemplo n.º 27
0
 /// <summary>
 /// Returns the writer that receives the genome VCF output: standard output when
 /// <paramref name="outputPath"/> is "-", otherwise the writer that
 /// GZipUtilities.GetStreamWriter opens for the path with ".genome.vcf.gz" appended.
 /// </summary>
 /// <param name="outputPath">"-" for standard output, or the output path without the ".genome.vcf.gz" suffix</param>
 public static StreamWriter GetGvcfOutputWriter(string outputPath)
 {
     if (outputPath == "-")
     {
         return new StreamWriter(Console.OpenStandardOutput());
     }

     var gvcfPath = outputPath + ".genome.vcf.gz";
     return GZipUtilities.GetStreamWriter(gvcfPath);
 }
Exemplo n.º 28
0
        /// <summary>
        /// Executes the program: parses every VEP directory into the pre-cache output
        /// files (all named after the configured output stub), prints the parser
        /// statistics and finally converts the SIFT and PolyPhen predictions.
        /// </summary>
        protected override void ProgramExecution()
        {
            var dataSource = ConfigurationSettings.ImportRefSeqTranscripts
                ? TranscriptDataSource.RefSeq
                : TranscriptDataSource.Ensembl;

            var referenceIndex = new ReferenceIndex(ConfigurationSettings.InputReferencePath);
            var vepDirectories = referenceIndex.GetUcscKaryotypeOrder(ConfigurationSettings.InputVepDirectory);
            var cacheParser    = new VepCacheParser(dataSource);
            var assembly       = GenomeAssemblyUtilities.Convert(ConfigurationSettings.GenomeAssembly);

            // =========================
            // create the pre-cache file
            // =========================

            int directoryCount = 0;

            // every output file shares the same stub and differs only in its suffix
            var outputStub     = ConfigurationSettings.OutputStub;
            var transcriptPath = outputStub + ".transcripts.gz";
            var regulatoryPath = outputStub + ".regulatory.gz";
            var genePath       = outputStub + ".genes.gz";
            var intronPath     = outputStub + ".introns.gz";
            var exonPath       = outputStub + ".exons.gz";
            var mirnaPath      = outputStub + ".mirnas.gz";
            var siftPath       = outputStub + ".sift.dat";
            var polyphenPath   = outputStub + ".polyphen.dat";
            var cdnaPath       = outputStub + ".cdnas.gz";
            var peptidePath    = outputStub + ".peptides.gz";

            // the SIFT and PolyPhen predictions go to binary ".tmp" files at this stage
            using (var transcriptWriter = GZipUtilities.GetStreamWriter(transcriptPath))
            using (var regulatoryWriter = GZipUtilities.GetStreamWriter(regulatoryPath))
            using (var geneWriter = GZipUtilities.GetStreamWriter(genePath))
            using (var intronWriter = GZipUtilities.GetStreamWriter(intronPath))
            using (var exonWriter = GZipUtilities.GetStreamWriter(exonPath))
            using (var mirnaWriter = GZipUtilities.GetStreamWriter(mirnaPath))
            using (var siftWriter = GZipUtilities.GetBinaryWriter(siftPath + ".tmp"))
            using (var polyphenWriter = GZipUtilities.GetBinaryWriter(polyphenPath + ".tmp"))
            using (var cdnaWriter = GZipUtilities.GetStreamWriter(cdnaPath))
            using (var peptideWriter = GZipUtilities.GetStreamWriter(peptidePath))
            {
                // all text outputs use "\n" line endings
                var textWriters = new[]
                {
                    transcriptWriter, regulatoryWriter, geneWriter, intronWriter,
                    exonWriter, mirnaWriter, cdnaWriter, peptideWriter
                };

                foreach (var textWriter in textWriters)
                {
                    textWriter.NewLine = "\n";
                }

                WriteHeader(transcriptWriter, GlobalImportCommon.FileType.Transcript, dataSource, assembly);
                WriteHeader(regulatoryWriter, GlobalImportCommon.FileType.Regulatory, dataSource, assembly);
                WriteHeader(geneWriter, GlobalImportCommon.FileType.Gene, dataSource, assembly);
                WriteHeader(intronWriter, GlobalImportCommon.FileType.Intron, dataSource, assembly);
                WriteHeader(exonWriter, GlobalImportCommon.FileType.Exon, dataSource, assembly);
                WriteHeader(mirnaWriter, GlobalImportCommon.FileType.MicroRna, dataSource, assembly);
                WriteHeader(siftWriter, GlobalImportCommon.FileType.Sift, dataSource, assembly);
                WriteHeader(polyphenWriter, GlobalImportCommon.FileType.PolyPhen, dataSource, assembly);
                WriteHeader(cdnaWriter, GlobalImportCommon.FileType.CDna, dataSource, assembly);
                WriteHeader(peptideWriter, GlobalImportCommon.FileType.Peptide, dataSource, assembly);

                // process each VEP directory (Item1 = reference name, Item2 = handed to the parser)
                foreach (var vepEntry in vepDirectories)
                {
                    Console.WriteLine("Parsing reference sequence [{0}]:", vepEntry.Item1);
                    directoryCount++;

                    var refIndex = referenceIndex.GetIndex(vepEntry.Item1);

                    cacheParser.ParseDumpDirectory(refIndex, vepEntry.Item2, transcriptWriter, regulatoryWriter, geneWriter,
                                                   intronWriter, exonWriter, mirnaWriter, siftWriter, polyphenWriter, cdnaWriter, peptideWriter);
                }
            }

            Console.WriteLine("\n{0} directories processed.", directoryCount);

            cacheParser.DumpStatistics();
            Console.WriteLine();

            // convert our protein function predictions
            var predictionConverter = new PredictionConverter(referenceIndex.NumReferenceSeqs);

            predictionConverter.Convert(siftPath, "SIFT", GlobalImportCommon.FileType.Sift);
            predictionConverter.Convert(polyphenPath, "PolyPhen", GlobalImportCommon.FileType.PolyPhen);
        }