/// <summary>
/// Lazily reads the file referenced by <c>_exacFileInfo</c> line by line and yields
/// every item that <c>ExtractItems</c> produces for a data line.
/// </summary>
/// <returns>
/// A deferred sequence of items. Blank lines, lines starting with '#', and lines for
/// which <c>ExtractItems</c> returns null contribute nothing.
/// </returns>
private IEnumerable<ExacItem> GetExacItems()
{
    using (var reader = GZipUtilities.GetAppropriateStreamReader(_exacFileInfo.FullName))
    {
        for (string row = reader.ReadLine(); row != null; row = reader.ReadLine())
        {
            // blank lines and '#' comment lines carry no data
            bool isDataLine = !string.IsNullOrWhiteSpace(row) && !row.StartsWith("#");
            if (!isDataLine)
            {
                continue;
            }

            var extracted = ExtractItems(row);
            if (extracted != null)
            {
                foreach (var item in extracted)
                {
                    yield return item;
                }
            }
        }
    }
}
/// <summary>
/// Creates the extractor for the given genome assembly and opens a reader for each
/// VCF path that was supplied.
/// </summary>
/// <param name="assembly">Assembly name; "GRCh37", "GRCh38" and "hg19" are recognised (exact match).</param>
/// <param name="oneKGenomeVcf">Path passed to the 1000 Genomes reader; null or empty leaves that reader null.</param>
/// <param name="clinvarVcf">Path passed to the ClinVar reader; null or empty leaves that reader null.</param>
/// <param name="cosmicVcf">Path passed to the COSMIC reader; null or empty leaves that reader null.</param>
/// <exception cref="Exception">Thrown when <paramref name="assembly"/> is not one of the recognised names.</exception>
public MustGenotypeExtractor(string assembly, string oneKGenomeVcf, string clinvarVcf, string cosmicVcf)
{
    switch (assembly)
    {
        case "GRCh37":
            _assembly = GenomeAssembly.GRCh37;
            break;
        case "GRCh38":
            _assembly = GenomeAssembly.GRCh38;
            break;
        case "hg19":
            _assembly = GenomeAssembly.hg19;
            break;
        default:
            _assembly = GenomeAssembly.Unknown;
            break;
    }

    if (_assembly == GenomeAssembly.Unknown)
    {
        // The message lists every value the switch above accepts (the previous text
        // omitted hg19) and reports the rejected value to ease troubleshooting.
        throw new Exception($"Genome assembly must be GRCh37, GRCh38 or hg19 (got '{assembly}')");
    }

    // a missing path simply disables the corresponding data source
    _oneKGenomeReader = string.IsNullOrEmpty(oneKGenomeVcf) ? null : GZipUtilities.GetAppropriateStreamReader(oneKGenomeVcf);
    _clinvarReader = string.IsNullOrEmpty(clinvarVcf) ? null : GZipUtilities.GetAppropriateStreamReader(clinvarVcf);
    _cosmicReader = string.IsNullOrEmpty(cosmicVcf) ? null : GZipUtilities.GetAppropriateStreamReader(cosmicVcf);
}
/// <summary>
/// Lazily reads the file referenced by <c>_oneKGenSvFile</c> and yields one item per
/// data line for which <c>ExtractOneKGenSvItem</c> returns a non-null result.
/// </summary>
/// <returns>A deferred sequence of items; blank lines and lines starting with '#' are skipped.</returns>
private IEnumerable<OneKGenItem> GetOneKGenSvItems()
{
    using (var reader = GZipUtilities.GetAppropriateStreamReader(_oneKGenSvFile.FullName))
    {
        string row = reader.ReadLine();
        while (row != null)
        {
            // only non-blank, non-comment lines are handed to the parser
            if (!string.IsNullOrWhiteSpace(row) && !row.StartsWith("#"))
            {
                var svItem = ExtractOneKGenSvItem(row, _renamer);
                if (svItem != null)
                {
                    yield return svItem;
                }
            }

            row = reader.ReadLine();
        }
    }
}
/// <summary>
/// Opens <paramref name="filePath"/>, wraps it in a <c>GeneInfoReader</c> and returns
/// the genes reported by <c>GetGenes()</c>. Both readers are disposed before returning.
/// </summary>
/// <param name="filePath">Path handed to <c>GZipUtilities.GetAppropriateStreamReader</c>.</param>
/// <returns>The genes read from the file.</returns>
private static IEnumerable<GeneInfo> LoadGeneInfoGenes(string filePath)
{
    using (var textReader = GZipUtilities.GetAppropriateStreamReader(filePath))
    {
        using (var geneInfoReader = new GeneInfoReader(textReader))
        {
            GeneInfo[] loadedGenes = geneInfoReader.GetGenes();
            return loadedGenes;
        }
    }
}
/// <summary>
/// Converts the VCF at <c>_inputFileName</c> into a gzip-compressed, tab-separated
/// file in <c>_outputDirectory</c>. Only lines that have an INFO column and for which
/// <c>GetSvEndString</c> returns a value are written.
/// </summary>
/// <returns><c>ExitCodes.Success</c> once the whole input has been processed.</returns>
private static ExitCodes ProgramExecution()
{
    // NOTE(review): Replace swaps every "vcf" occurrence in the file name, not only the extension.
    string bedFileName = Path.GetFileName(_inputFileName).Replace("vcf", "bed");

    using (var vcfReader = GZipUtilities.GetAppropriateStreamReader(_inputFileName))
    using (var fileStream = FileUtilities.GetCreateStream(Path.Combine(_outputDirectory, bedFileName)))
    using (var gzipStream = new GZipStream(fileStream, CompressionMode.Compress))
    using (var bedWriter = new StreamWriter(gzipStream))
    {
        for (string vcfLine = vcfReader.ReadLine(); vcfLine != null; vcfLine = vcfReader.ReadLine())
        {
            var columns = vcfLine.OptimizedSplit('\t', VcfCommon.InfoIndex + 2);

            // lines without an INFO column cannot describe an interval
            if (columns.Length > VcfCommon.InfoIndex)
            {
                string info = columns[VcfCommon.InfoIndex];
                string end = GetSvEndString(info);

                if (end != null)
                {
                    // The 1K Genome SV records carry a padding base, so POS + 1 is the 1-based
                    // start of the interval. BED starts are 0-based, which takes that 1 away
                    // again, so POS can be written to the BED file unchanged.
                    bedWriter.WriteLine(string.Join('\t',
                        columns[VcfCommon.ChromIndex],
                        columns[VcfCommon.PosIndex],
                        end,
                        columns[VcfCommon.IdIndex],
                        columns[VcfCommon.AltIndex],
                        info));
                }
            }
        }
    }

    return ExitCodes.Success;
}
/// <summary>
/// Lazily reads the file referenced by <c>_customFileInfo</c>. Lines starting with '#'
/// are passed to <c>ParseHeaderLine</c>; every other non-blank line is passed to
/// <c>ExtractCustomItems</c> and its results are yielded one by one.
/// </summary>
/// <returns>A deferred sequence of the extracted items.</returns>
private IEnumerable<CustomItem> GetCustomItems()
{
    using (var reader = GZipUtilities.GetAppropriateStreamReader(_customFileInfo.FullName))
    {
        for (string row = reader.ReadLine(); row != null; row = reader.ReadLine())
        {
            if (string.IsNullOrWhiteSpace(row))
            {
                continue;
            }

            bool isHeader = row.StartsWith("#");
            if (isHeader)
            {
                // header lines are consumed by the header parser and never produce items
                ParseHeaderLine(row);
            }
            else
            {
                var extracted = ExtractCustomItems(row);
                if (extracted != null)
                {
                    foreach (var item in extracted)
                    {
                        yield return item;
                    }
                }
            }
        }
    }
}
/// <summary>
/// Opens the input file and, when a reference-name file is given, loads its name
/// mapping into <c>_refNameDict</c>.
/// </summary>
/// <param name="inputFileInfo">File opened into <c>_reader</c>.</param>
/// <param name="refNameInfo">
/// Optional tab-separated file. For each non-'#' line whose third column equals "YES",
/// the first column is mapped to the second. When null, <c>_refNameDict</c> is not assigned.
/// </param>
public ClinGenUnifier(FileInfo inputFileInfo, FileInfo refNameInfo = null)
{
    _reader = GZipUtilities.GetAppropriateStreamReader(inputFileInfo.FullName);

    if (refNameInfo != null)
    {
        _refNameDict = new Dictionary<string, string>();

        using (var mappingReader = GZipUtilities.GetAppropriateStreamReader(refNameInfo.FullName))
        {
            for (string row = mappingReader.ReadLine(); row != null; row = mappingReader.ReadLine())
            {
                if (row.StartsWith("#"))
                {
                    continue;
                }

                var columns = row.Split('\t');
                string ucsc = columns[0];
                string ensembl = columns[1];

                // only names flagged "YES" in the third column are kept
                if (columns[2].Equals("YES"))
                {
                    _refNameDict[ucsc] = ensembl;
                }
            }
        }
    }
}
/// <summary>
/// Compiles the regular expressions used by the parser, then opens
/// <paramref name="filename"/> and builds the dumper hierarchy from it.
/// </summary>
/// <param name="filename">Path handed to <c>GZipUtilities.GetAppropriateStreamReader</c>.</param>
public DataDumperReader(string filename)
{
    // Compile every pattern once up front.
    // NOTE(review): the "bless(" and "$VAR" tokens suggest Perl Data::Dumper output — confirm.

    // 'key' => ' followed by the characters U+001F, U+FFFD, U+0008
    _binaryKeyValueRegex = new Regex("'([^']+)' => '\x1f\xfffd\x08", RegexOptions.Compiled);
    // }, 'TypeName' )
    _dataTypeRegex = new Regex("}, '([^']+)' \\)", RegexOptions.Compiled);
    // a line holding only digits/dots, with an optional trailing comma
    _digitKeyRegex = new Regex("^\\s*([\\d\\.]+)(?:,?)\\s*$", RegexOptions.Compiled);
    // 'key' => 123
    _digitKeyValueRegex = new Regex("'([^']+)' => (\\d+)", RegexOptions.Compiled);
    // 'key' => []
    _emptyListKeyValueRegex = new Regex("'([^']+)' => \\[\\]", RegexOptions.Compiled);
    // 'key' => {}
    _emptyValueKeyValueRegex = new Regex("'([^']+)' => \\{\\}", RegexOptions.Compiled);
    // 'key' => [
    _listObjectKeyValueRegex = new Regex("'([^']+)' => \\[", RegexOptions.Compiled);
    // 'key' => 'value that is not closed before the end of the line
    _multiLineKeyValueRegex = new Regex("'([^']+)' => '([^']+)$", RegexOptions.Compiled);
    // 'key' => {   or   'key' => bless( {
    _objectKeyValueRegex = new Regex("'([^']+)' => (?:bless\\( )?{", RegexOptions.Compiled);
    // bless( {
    _openBracesRegex = new Regex("bless\\( \\{", RegexOptions.Compiled);
    // a line holding only a $VARn->... reference, with an optional trailing comma
    _referenceStringKeyRegex = new Regex("^\\s*(\\$VAR\\d+->\\S+?)(?:,?)\\s*$", RegexOptions.Compiled);
    // 'key' => $VAR...
    _referenceStringKeyValueRegex = new Regex("'([^']+)' => (\\$VAR\\S+)(?:,?)", RegexOptions.Compiled);
    // $VARn = {   (the pattern allows a single digit after $VAR)
    _rootObjectKeyValueRegex = new Regex("\\$VAR\\d = {", RegexOptions.Compiled);
    // a line holding only a quoted string, with an optional trailing comma
    _stringKeyRegex = new Regex("^\\s*'([^']+)'(?:,?)\\s*$", RegexOptions.Compiled);
    // 'key' => 'value'   (value may be empty)
    _stringKeyValueRegex = new Regex("'([^']+)' => '([^']*)'", RegexOptions.Compiled);
    // 'key' => undef
    _undefKeyValueRegex = new Regex("'([^']+)' => undef", RegexOptions.Compiled);

    // Build the hierarchy while the file is open. The using statement disposes the
    // reader stored in _reader as soon as BuildDumperHierarchy returns, so the field
    // refers to a disposed reader after construction.
    using (_reader = GZipUtilities.GetAppropriateStreamReader(filename))
    {
        BuildDumperHierarchy();
    }

    // debugging aid: dump the tree
    // Console.WriteLine(_rootNode);
}
/// <summary>
/// Lazily reads dbSNP data and yields every item that <c>ExtractItem</c> produces.
/// The data comes from <c>_stream</c> when it is set, otherwise from the file
/// referenced by <c>_dbSnpFile</c>.
/// </summary>
/// <returns>
/// A deferred sequence of items. Blank lines, lines starting with '#', and lines whose
/// extraction result is null or empty contribute nothing.
/// </returns>
private IEnumerable<DbSnpItem> GetDbSnpItems()
{
    using (var reader = _stream != null
        ? new StreamReader(_stream)
        : GZipUtilities.GetAppropriateStreamReader(_dbSnpFile.FullName))
    {
        for (string row = reader.ReadLine(); row != null; row = reader.ReadLine())
        {
            // blank lines and '#' comment lines carry no data
            if (string.IsNullOrWhiteSpace(row) || row.StartsWith("#"))
            {
                continue;
            }

            var extracted = ExtractItem(row);
            bool hasItems = extracted != null && extracted.Count != 0;
            if (!hasItems)
            {
                continue;
            }

            foreach (var item in extracted)
            {
                yield return item;
            }
        }
    }
}
/// <summary>
/// Walks the "ClinVarSet" elements of <c>_inputXmlFile</c> and, for each one in which
/// <c>DetectRcv</c> finds an id, writes the element's outer XML to
/// "&lt;id&gt;.xml" in <c>_outputDir</c>. The walk stops when <c>_rcvIds</c> is empty
/// or no further sibling exists; any ids still in <c>_rcvIds</c> are then reported on
/// the console.
/// </summary>
public void Extract()
{
    using (var textReader = GZipUtilities.GetAppropriateStreamReader(_inputXmlFile))
    using (var xml = XmlReader.Create(textReader, new XmlReaderSettings { DtdProcessing = DtdProcessing.Prohibit, IgnoreWhitespace = true }))
    {
        bool hasClinVarSet = xml.ReadToDescendant("ClinVarSet");

        while (_rcvIds.Count > 0 && hasClinVarSet)
        {
            // ReadOuterXml consumes the current element and advances the reader
            string setXml = xml.ReadOuterXml();
            var matchedRcv = DetectRcv(_rcvIds, setXml);

            if (matchedRcv != null)
            {
                WriteToFile(Path.Combine(_outputDir, matchedRcv + ".xml"), setXml);
            }

            // after ReadOuterXml the reader may already sit on the next ClinVarSet;
            // only search for a sibling when it does not
            if (xml.IsStartElement("ClinVarSet"))
            {
                continue;
            }

            hasClinVarSet = xml.ReadToNextSibling("ClinVarSet");
        }
    }

    if (_rcvIds.Count > 0)
    {
        Console.WriteLine($"Failed to Find {string.Join(',', _rcvIds)}");
    }
}
/// <summary>
/// Lazily reads the file referenced by <c>_dgvFileInfo</c> and yields one
/// <c>DgvItem</c> per line for which <c>ExtractDgvItem</c> returns a non-null result.
/// </summary>
/// <returns>A deferred sequence of items; blank lines and lines recognised by <c>IsDgvHeader</c> are skipped.</returns>
private IEnumerable<DgvItem> GetDgvItems()
{
    using (var reader = GZipUtilities.GetAppropriateStreamReader(_dgvFileInfo.FullName))
    {
        string row;
        while ((row = reader.ReadLine()) != null)
        {
            // header and empty lines carry no item
            bool skip = string.IsNullOrWhiteSpace(row) || IsDgvHeader(row);
            if (skip)
            {
                continue;
            }

            var parsed = ExtractDgvItem(row, _renamer);
            if (parsed != null)
            {
                yield return parsed;
            }
        }
    }
}
/// <summary>
/// Runs the two mapping passes. A <c>ChromMapper</c> first maps the source file against
/// the destination file and writes the entries it hands off to a temporary leftover
/// file. A <c>LeftoverMapper</c> then re-reads that file against the destination. The
/// writers returned by the first pass are disposed at the end.
/// </summary>
/// <returns><c>ExitCodes.Success</c> when both passes complete.</returns>
private static ExitCodes ProgramExecution()
{
    const string tempLeftoverFilename = "LeftOvers.vcf.gz";

    ISequenceProvider sourceSequences = ProviderUtilities.GetSequenceProvider(_srcRefSequence);
    ISequenceProvider destinationSequences = ProviderUtilities.GetSequenceProvider(_desRefSequence);

    Dictionary<string, StreamWriter> outputWriters;

    // pass 1: map source against destination, spilling unmapped entries to the leftover file
    using (var sourceReader = GZipUtilities.GetAppropriateStreamReader(_srcMapFile))
    using (var destinationReader = GZipUtilities.GetAppropriateStreamReader(_destMapFile))
    using (var spillWriter = GZipUtilities.GetStreamWriter(tempLeftoverFilename))
    {
        outputWriters = new ChromMapper(sourceReader, destinationReader, spillWriter, sourceSequences, destinationSequences).Map();
    }

    // pass 2: now try to map the leftovers
    using (var destinationReader = GZipUtilities.GetAppropriateStreamReader(_destMapFile))
    using (var spillReader = GZipUtilities.GetAppropriateStreamReader(tempLeftoverFilename))
    {
        var secondPass = new LeftoverMapper(spillReader, destinationReader, outputWriters, destinationSequences);
        var leftoverCount = secondPass.Map();
        Console.WriteLine($"{leftoverCount} leftovers mapped!!");
    }

    foreach (var entry in outputWriters)
    {
        entry.Value.Dispose();
    }

    return ExitCodes.Success;
}
/// <summary>
/// Lazily reads the file referenced by <c>_omimFileInfo</c>. Header lines are passed to
/// <c>ParseHeader</c>; each content line is split on tabs and turned into an
/// <c>OmimAnnotation</c>.
/// </summary>
/// <returns>
/// A deferred sequence of annotations. Lines rejected by <c>IsContentLine</c> and lines
/// with an empty gene symbol yield nothing.
/// </returns>
private IEnumerable<OmimAnnotation> GetOmimItems()
{
    using (var reader = GZipUtilities.GetAppropriateStreamReader(_omimFileInfo.FullName))
    {
        for (string row = reader.ReadLine(); row != null; row = reader.ReadLine())
        {
            if (IsHeader(row))
            {
                ParseHeader(row);
                continue;
            }

            if (IsContentLine(row))
            {
                var columns = row.Split('\t');

                var mimNumber = Convert.ToInt64(columns[_mimNumberCol]);
                var symbol = columns[_hgncCol];
                // normalise the escaped apostrophe sequence and doubled commas found in the raw text
                var geneDescription = columns[_geneDescriptionCol].Replace(@"\\'", @"'");
                var phenotypes = ParsePhenotypes(columns[_phenotypeCol].Replace(@",,", @","));

                if (!string.IsNullOrEmpty(symbol))
                {
                    yield return new OmimAnnotation(symbol, geneDescription, mimNumber, phenotypes);
                }
            }
        }
    }
}
/// <summary>
/// Checks that the items a <c>DgvReader</c> produces from <c>TestDgvFile</c> equal,
/// in order, the sequence built by <c>CreateTruthDgvItemSequence</c>.
/// (Despite the method name, the reader under test is the DGV reader.)
/// </summary>
public void TestDbSnpReader()
{
    using (var dgvReader = new DgvReader(GZipUtilities.GetAppropriateStreamReader(TestDgvFile), ChromosomeUtilities.RefNameToChromosome))
    {
        var observed = dgvReader.GetItems();
        var expected = CreateTruthDgvItemSequence();

        bool sameSequence = observed.SequenceEqual(expected);
        Assert.True(sameSequence);
    }
}
/// <summary>
/// Reads the cosm5428243 VCF/TSV pair through a <c>MergedCosmicReader</c> and checks
/// the study id, sites and histologies of the first two items returned.
/// </summary>
public void TwoStudyCosmicCoding()
{
    // The readers are opened by this test, so this test disposes them; previously
    // both file handles were left open. Disposing a reader twice is harmless should
    // MergedCosmicReader also close them.
    using (var vcfReader = GZipUtilities.GetAppropriateStreamReader(Resources.TopPath("cosm5428243.vcf")))
    using (var tsvReader = GZipUtilities.GetAppropriateStreamReader(Resources.TopPath("cosm5428243.tsv")))
    {
        var cosmicReader = new MergedCosmicReader(vcfReader, tsvReader, _refChromDict);

        // enumerate inside the using block so the readers are still open while items are produced
        var cosmicItems = cosmicReader.GetCosmicItems();

        var count = 0;
        foreach (var cosmicItem in cosmicItems)
        {
            switch (count)
            {
                case 0:
                    foreach (var study in cosmicItem.Studies)
                    {
                        Assert.Equal("544", study.Id);
                        Assert.Equal(new[] { "haematopoietic_and_lymphoid_tissue" }, study.Sites);
                        Assert.Equal(new[] { "haematopoietic_neoplasm", "acute_myeloid_leukaemia" }, study.Histologies);
                    }
                    break;
                case 1:
                    foreach (var study in cosmicItem.Studies)
                    {
                        Assert.Equal("544", study.Id);
                        Assert.Equal(new[] { "haematopoietic;lymphoid_tissue" }, study.Sites);
                        Assert.Equal(new[] { "haematopoietic_neoplasm", "acute_myeloid_leukaemia" }, study.Histologies);
                    }
                    break;
            }

            count++;
        }
    }
}
/// <summary>
/// Round-trip check: writes two lines with <c>GZipUtilities.GetStreamWriter</c>, reads
/// them back with <c>GZipUtilities.GetAppropriateStreamReader</c>, and verifies that
/// both lines match and a third read returns null.
/// </summary>
public void GZipReadAndWrite()
{
    const string expectedLine1 = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.";
    const string expectedLine2 = "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.";

    var path = GetRandomPath();

    using (var writer = GZipUtilities.GetStreamWriter(path))
    {
        writer.WriteLine(expectedLine1);
        writer.WriteLine(expectedLine2);
    }

    // read one line more than was written to confirm end-of-file is reported
    var observed = new string[3];
    using (var reader = GZipUtilities.GetAppropriateStreamReader(path))
    {
        for (var i = 0; i < observed.Length; i++)
        {
            observed[i] = reader.ReadLine();
        }
    }

    Assert.Equal(expectedLine1, observed[0]);
    Assert.Equal(expectedLine2, observed[1]);
    Assert.Null(observed[2]);
}
/// <summary>
/// Lazily reads the file referenced by <c>_customFileInfo</c> and yields one interval
/// per line for which <c>ExtractCustomInterval</c> returns a non-null result.
/// </summary>
/// <returns>A deferred sequence of intervals; blank lines and lines starting with '#' are skipped.</returns>
public IEnumerable<DataStructures.CustomInterval> GetCustomIntervals()
{
    using (var reader = GZipUtilities.GetAppropriateStreamReader(_customFileInfo.FullName))
    {
        for (string row = reader.ReadLine(); row != null; row = reader.ReadLine())
        {
            bool ignorable = string.IsNullOrWhiteSpace(row) || row.StartsWith("#");
            if (ignorable)
            {
                continue;
            }

            var interval = ExtractCustomInterval(row);
            if (interval != null)
            {
                yield return interval;
            }
        }
    }
}
/// <summary>
/// Copies lines from <paramref name="intputTsv"/> to <paramref name="outputTsv"/>,
/// keeping a line when its position overlaps one of the intron flanking regions or
/// when <c>AnyScorePassTheCutoff</c> accepts it. Progress is printed every million lines.
/// </summary>
/// <param name="intputTsv">Tab-separated input file.</param>
/// <param name="gffFile1">First file passed to <c>GetIntronFlankingRegions</c>.</param>
/// <param name="gffFile2">Second file passed to <c>GetIntronFlankingRegions</c>.</param>
/// <param name="outputTsv">Destination file for the retained lines.</param>
public static void Filter(string intputTsv, string gffFile1, string gffFile2, string outputTsv)
{
    var flankingRegions = GetIntronFlankingRegions(gffFile1, gffFile2);

    using (var input = GZipUtilities.GetAppropriateStreamReader(intputTsv))
    using (var output = GZipUtilities.GetStreamWriter(outputTsv))
    {
        long processed = 0;

        for (string row = input.ReadLine(); row != null; row = input.ReadLine())
        {
            var columns = row.TrimEnd().Split('\t');
            ushort chromosomeIndex = GetChrIndex(columns[PredChrColumn]);
            int position = int.Parse(columns[PredPosColumn]);

            // the untrimmed line is written so the output matches the input verbatim
            bool keep = flankingRegions.OverlapsAny(chromosomeIndex, position, position)
                        || AnyScorePassTheCutoff(columns, PredScoreColumns, FreqCutoff);
            if (keep)
            {
                output.WriteLine(row);
            }

            if (++processed % 1_000_000 == 0)
            {
                Console.WriteLine($"Processed {processed} lines. Current position: {columns[PredChrColumn]}:{columns[PredPosColumn]}");
            }
        }
    }
}
/// <summary>
/// Lazily yields the items of one chromosome. The reader is positioned at the offset
/// recorded for <paramref name="refName"/>, advanced to the first line starting with
/// "<paramref name="refName"/>\t", and then read until a line for another chromosome
/// or the end of the file is reached.
/// </summary>
/// <param name="refName">Chromosome name as it appears in the first column of the file.</param>
/// <returns>
/// A deferred sequence of items; empty when <paramref name="refName"/> has no recorded
/// offset or no matching line is found. Lines for which <c>ExtractItem</c> returns null
/// are skipped.
/// </returns>
public IEnumerable<InterimSaItem> GetItems(string refName)
{
    if (!_refNameOffsets.ContainsKey(refName))
    {
        yield break;
    }

    var offset = _refNameOffsets[refName];

    // every data line of the requested chromosome starts with this prefix
    string linePrefix = refName + "\t";

    using (var reader = GZipUtilities.GetAppropriateStreamReader(_fileName))
    {
        reader.BaseStream.Position = offset;

        // Find the first line of the desired chromosome. This is needed because the
        // recorded offset for a gzip stream may point a few lines before the start
        // of the chromosome.
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            if (string.IsNullOrWhiteSpace(line) || line.StartsWith("#"))
            {
                continue;
            }

            if (line.StartsWith(linePrefix))
            {
                break;
            }
        }

        // Stop at end of file or as soon as the next chromosome begins.
        while (line != null && line.StartsWith(linePrefix))
        {
            var annotationItem = ExtractItem(line);

            // Lines that yield no item are skipped. The next line is read
            // unconditionally below: the previous do/while used 'continue' here,
            // which jumped to the loop condition without reading and re-parsed the
            // same line forever.
            if (annotationItem != null)
            {
                yield return annotationItem;
            }

            string lastLine = line;
            try
            {
                line = reader.ReadLine();
            }
            catch (Exception)
            {
                Console.WriteLine("error while reading line in while loop. Last line read:");
                Console.WriteLine(lastLine);
                throw;
            }
        }
    }
}
/// <summary>
/// Loads the Ensembl genes from <paramref name="filePath"/> and builds an
/// <c>EnsemblGtf</c> holding two lookups keyed by gene id: id → gene and id → symbol.
/// </summary>
/// <param name="filePath">Path handed to <c>GZipUtilities.GetAppropriateStreamReader</c>.</param>
/// <param name="refNameToChromosome">Chromosome lookup forwarded to <c>LoadEnsemblGenes</c>.</param>
/// <returns>The populated <c>EnsemblGtf</c>.</returns>
public static EnsemblGtf Create(string filePath, IDictionary<string, IChromosome> refNameToChromosome)
{
    // The reader is opened here, so it is disposed here; previously the handle was
    // never released by this method. Disposing twice is harmless should
    // LoadEnsemblGenes also close it. The lookups are built inside the using block
    // so the reader is still open if the gene collection is produced lazily.
    using (var reader = GZipUtilities.GetAppropriateStreamReader(filePath))
    {
        var ensemblGenes = LoadEnsemblGenes(reader, refNameToChromosome);
        var ensemblIdToGene = ensemblGenes.GetSingleValueDict(x => x.GeneId);
        var ensemblIdToSymbol = ensemblGenes.GetKeyValueDict(x => x.GeneId, x => x.Symbol);

        return new EnsemblGtf(ensemblIdToGene, ensemblIdToSymbol);
    }
}
/// <summary>
/// Opens the VCF and TSV files and stores the sequence provider together with its
/// reference-name lookup. The study dictionary starts out empty.
/// </summary>
/// <param name="vcfFile">Path opened into <c>_vcfFileReader</c>.</param>
/// <param name="tsvFile">Path opened into <c>_tsvFileReader</c>.</param>
/// <param name="sequenceProvider">Provider whose <c>RefNameToChromosome</c> is kept in <c>_refChromDict</c>.</param>
public MergedCosmicReader(string vcfFile, string tsvFile, ISequenceProvider sequenceProvider)
{
    // the files are opened in the same order as before: VCF first, then TSV
    var vcfReader = GZipUtilities.GetAppropriateStreamReader(vcfFile);
    var tsvReader = GZipUtilities.GetAppropriateStreamReader(tsvFile);

    _vcfFileReader = vcfReader;
    _tsvFileReader = tsvReader;

    _sequenceProvider = sequenceProvider;
    _refChromDict = sequenceProvider.RefNameToChromosome;
    _studies = new Dictionary<string, HashSet<CosmicItem.CosmicStudy>>();
}
/// <summary>
/// Builds a <c>GeneScoreTsvCreator</c> that reads <c>_inputPath</c> and writes through a
/// <c>GeneAnnotationTsvWriter</c> targeting <c>_outputDirectory</c>, then runs it.
/// The data-source version is read from "<c>_inputPath</c>.version".
/// </summary>
/// <returns>The exit code reported by <c>GeneScoreTsvCreator.Create()</c>.</returns>
private static ExitCodes ProgramExecution()
{
    // same evaluation order as the nested call it replaces: reader, version, writer, creator
    var inputReader = GZipUtilities.GetAppropriateStreamReader(_inputPath);
    var sourceVersion = DataSourceVersionReader.GetSourceVersion(_inputPath + ".version");
    var tsvWriter = new GeneAnnotationTsvWriter(_outputDirectory, sourceVersion, null, 0, JsonKeyName, false);

    var creator = new GeneScoreTsvCreator(inputReader, tsvWriter);
    return creator.Create();
}
/// <summary>
/// Creates a writer that reads its input from a wigFix file. Delegates to the other
/// constructor with a null first argument, then stores the version, opens the input
/// file and records the output directory.
/// </summary>
/// <param name="inputWigFixFile">Path opened into <c>_reader</c>.</param>
/// <param name="version">Data-source version; forwarded to the chained constructor and stored in <c>_version</c>.</param>
/// <param name="genomeAssembly">Genome assembly forwarded to the chained constructor.</param>
/// <param name="outputNirvanaDirectory">Directory stored in <c>_outputNirvanaDirectory</c>.</param>
/// <param name="intervalLength">Interval length forwarded to the chained constructor; defaults to <c>PhylopCommon.MaxIntervalLength</c>.</param>
public PhylopWriter(
    string inputWigFixFile,
    DataSourceVersion version,
    GenomeAssembly genomeAssembly,
    string outputNirvanaDirectory,
    int intervalLength = PhylopCommon.MaxIntervalLength)
    : this(null, version, genomeAssembly, intervalLength)
{
    _version = version;

    var wigFixReader = GZipUtilities.GetAppropriateStreamReader(inputWigFixFile);
    _reader = wigFixReader;

    _outputNirvanaDirectory = outputNirvanaDirectory;
}
Accession37, IDictionary <string, IChromosome> Accession38) GetSequenceDictionaries(string referencePath, string assemblyInfo37Path, string assemblyInfo38Path) { var(_, refNameToChromosome, _) = SequenceHelper.GetDictionaries(referencePath); var accession37Dict = AssemblyReader.GetAccessionToChromosome(GZipUtilities.GetAppropriateStreamReader(assemblyInfo37Path), refNameToChromosome); var accession38Dict = AssemblyReader.GetAccessionToChromosome(GZipUtilities.GetAppropriateStreamReader(assemblyInfo38Path), refNameToChromosome); return(refNameToChromosome, accession37Dict, accession38Dict); }
/// <summary>
/// Reads the COSM18152 VCF/TSV pair through a <c>MergedCosmicReader</c> and checks
/// that exactly one item is produced.
/// </summary>
public void CosmicAlleleSpecificIndel()
{
    // The readers are opened by this test, so this test disposes them; previously
    // both file handles were left open. Disposing a reader twice is harmless should
    // MergedCosmicReader also close them.
    using (var vcfReader = GZipUtilities.GetAppropriateStreamReader(Resources.TopPath("COSM18152.vcf")))
    using (var tsvReader = GZipUtilities.GetAppropriateStreamReader(Resources.TopPath("COSM18152.tsv")))
    {
        var cosmicReader = new MergedCosmicReader(vcfReader, tsvReader, _refChromDict);

        // Assert.Single enumerates the items, so it must run while the readers are open
        var items = cosmicReader.GetCosmicItems();
        Assert.Single(items);
    }
}
/// <summary>
/// Streams the VCF at <c>_inFile</c> to <c>_outFile</c>. Header lines are copied
/// straight through. Data lines are buffered per group and handed to
/// <c>FlushVcfLineBuffer</c> along with a map recording which allele ids were seen
/// more than once in the group.
/// </summary>
/// <returns>The value of <c>_noLinesRemoved</c> after processing.</returns>
public int RemoveConflictingLines()
{
    using (var input = GZipUtilities.GetAppropriateStreamReader(_inFile))
    using (var output = GZipUtilities.GetStreamWriter(_outFile))
    {
        // lines buffered for the current group
        var bufferedLines = new List<string>(VcfBufferSize);
        // allele id -> true once the same id has been seen a second time in the group
        var conflictByAllele = new Dictionary<string, bool>();

        for (string row = input.ReadLine(); row != null; row = input.ReadLine())
        {
            if (row.StartsWith("#"))
            {
                // header lines are streamed through unchanged
                output.WriteLine(row);
                continue;
            }

            var columns = row.Split(new[] { '\t' }, VcfCommon.InfoIndex + 1);
            var chromosome = columns[VcfCommon.ChromIndex];
            var position = Convert.ToInt32(columns[VcfCommon.PosIndex]);
            var refAllele = columns[VcfCommon.RefIndex];
            var altAlleles = columns[VcfCommon.AltIndex].Split(',');

            // a new chromosome, or a position beyond _maxVidPosition, closes the current group
            bool startsNewGroup = chromosome != _currentRefSeq || position > _maxVidPosition;
            if (startsNewGroup)
            {
                FlushVcfLineBuffer(bufferedLines, conflictByAllele, output);
                bufferedLines.Clear();
                conflictByAllele.Clear();
                _currentRefSeq = chromosome;
            }

            foreach (var altAllele in altAlleles)
            {
                var alleleId = GetAlleleId(chromosome, position, refAllele, altAllele);

                // first sighting records false; any repeat flips the entry to true
                conflictByAllele[alleleId] = conflictByAllele.ContainsKey(alleleId);
            }

            bufferedLines.Add(row);
        }

        // flush whatever is left from the final group
        FlushVcfLineBuffer(bufferedLines, conflictByAllele, output);
    }

    return _noLinesRemoved;
}
/// <summary>
/// Reads the COSM983708 VCF/TSV pair through a <c>MergedCosmicReader</c> and checks
/// that exactly one item is produced and that its JSON reports reference allele "C".
/// </summary>
public void CosmicAltAllele()
{
    // The readers are opened by this test, so this test disposes them; previously
    // both file handles were left open. Disposing a reader twice is harmless should
    // MergedCosmicReader also close them.
    using (var vcfReader = GZipUtilities.GetAppropriateStreamReader(Resources.TopPath("COSM983708.vcf")))
    using (var tsvReader = GZipUtilities.GetAppropriateStreamReader(Resources.TopPath("COSM983708.tsv")))
    {
        var cosmicReader = new MergedCosmicReader(vcfReader, tsvReader, _refChromDict);

        // materialise the items while the readers are still open
        var items = cosmicReader.GetCosmicItems().ToList();

        Assert.Single(items);
        Assert.Contains("\"refAllele\":\"C\"", items[0].GetJsonString());
    }
}
/// <summary>
/// Opens <c>_fileName</c>, reads its header with <c>ReadHeader</c> and closes the file again.
/// </summary>
/// <returns>The header returned by <c>ReadHeader</c>.</returns>
private SaHeader GetHeader()
{
    using (var headerReader = GZipUtilities.GetAppropriateStreamReader(_fileName))
    {
        return ReadHeader(headerReader);
    }
}
/// <summary>
/// Wires up a <c>TopMedTsvCreator</c> from the input file, the compressed reference and
/// the version read from "<c>_inputFileArg</c>.version", then creates the TSV files in
/// <c>_outputDirArg</c>.
/// </summary>
/// <returns><c>ExitCodes.Success</c> once <c>CreateTsvs()</c> returns.</returns>
private ExitCodes ProgramExecution()
{
    var inputReader = GZipUtilities.GetAppropriateStreamReader(_inputFileArg);

    var referenceStream = FileUtilities.GetReadStream(_compressedReferenceArg);
    var sequenceProvider = new ReferenceSequenceProvider(referenceStream);

    var sourceVersion = DataSourceVersionReader.GetSourceVersion(_inputFileArg + ".version");

    new TopMedTsvCreator(inputReader, sequenceProvider, sourceVersion, _outputDirArg).CreateTsvs();

    return ExitCodes.Success;
}
/// <summary>
/// Opens <paramref name="filePath"/> and lets <c>VepReaderCommon.GetHeader</c> consume
/// its header.
/// </summary>
/// <param name="filePath">File to open; it must exist.</param>
/// <param name="description">Description forwarded to <c>VepReaderCommon.GetHeader</c>.</param>
/// <param name="fileType">File type forwarded to <c>VepReaderCommon.GetHeader</c>.</param>
/// <exception cref="FileNotFoundException">Thrown when <paramref name="filePath"/> does not exist.</exception>
public VepSimpleIntervalReader(string filePath, string description, GlobalImportCommon.FileType fileType)
{
    // fail early with a descriptive message instead of an error from the stream layer
    bool inputExists = File.Exists(filePath);
    if (!inputExists)
    {
        throw new FileNotFoundException($"The specified intron file ({filePath}) does not exist.");
    }

    // open the file and let the shared helper parse the header from the reader
    var fileReader = GZipUtilities.GetAppropriateStreamReader(filePath);
    _reader = fileReader;
    VepReaderCommon.GetHeader(description, filePath, fileType, _reader);
}