/// <summary>
/// Loads a chunked index from <paramref name="stream"/>: the header fields first
/// (assembly, data source version, JSON key, flags, schema version), followed by the
/// list of chunks recorded for each chromosome index.
/// </summary>
/// <param name="stream">
/// Stream positioned at the start of the serialized index. It is read through to its end.
/// </param>
public ChunkedIndex(Stream stream)
{
    // A single Stream.Read call is allowed to return fewer bytes than requested, and a
    // fixed 1 MiB buffer silently truncates any index larger than that. Copying the whole
    // stream into memory avoids both problems and still gives us one in-memory pass.
    using (var memStream = new MemoryStream())
    using (var memReader = new ExtendedBinaryReader(memStream))
    {
        stream.CopyTo(memStream);
        memStream.Position = 0;

        Assembly      = (GenomeAssembly)memReader.ReadByte();
        Version       = DataSourceVersion.Read(memReader);
        JsonKey       = memReader.ReadAsciiString();
        MatchByAllele = memReader.ReadBoolean();
        IsArray       = memReader.ReadBoolean();
        SchemaVersion = memReader.ReadOptInt32();
        IsPositional  = memReader.ReadBoolean();

        var chromCount = memReader.ReadOptInt32();
        _chromChunks = new Dictionary<ushort, List<Chunk>>(chromCount);

        for (var i = 0; i < chromCount; i++)
        {
            var chromIndex = memReader.ReadOptUInt16();
            var chunkCount = memReader.ReadOptInt32();

            // keep a local reference so the dictionary is not re-indexed for every chunk
            var chunks = new List<Chunk>(chunkCount);
            _chromChunks[chromIndex] = chunks;

            for (var j = 0; j < chunkCount; j++)
            {
                chunks.Add(new Chunk(memReader));
            }
        }
    }
}
/// <summary>
/// Builds a reader from values that have already been loaded; nothing is read from a stream.
/// </summary>
/// <param name="version">Stored in <c>Version</c>.</param>
/// <param name="jsonKey">Stored in <c>JsonKey</c>.</param>
/// <param name="isArray">Stored in the <c>_isArray</c> field.</param>
/// <param name="geneSymbolToJsonStrings">
/// Stored as-is (no copy) in the <c>_geneSymbolToJsonStrings</c> field.
/// </param>
private NgaReader(
    IDataSourceVersion version,
    string jsonKey,
    bool isArray,
    Dictionary<string, List<string>> geneSymbolToJsonStrings)
{
    // assigned left to right, i.e. in the same order as four separate statements
    (Version, JsonKey, _isArray, _geneSymbolToJsonStrings) =
        (version, jsonKey, isArray, geneSymbolToJsonStrings);
}
/// <summary>
/// Creates a reader over a database stream and its companion index stream.
/// The index is parsed immediately; its schema version is checked against
/// <c>SaCommon.SchemaVersion</c> and its score map is inverted into a code-to-score lookup.
/// </summary>
/// <param name="dbStream">Stream wrapped by the reader used for the database contents.</param>
/// <param name="indexStream">Stream from which the <c>NpdIndex</c> is constructed.</param>
/// <exception cref="UserErrorException">
/// The schema version recorded in the index differs from <c>SaCommon.SchemaVersion</c>.
/// </exception>
public NpdReader(Stream dbStream, Stream indexStream)
{
    _dbStream    = dbStream;
    _indexStream = indexStream;
    _reader      = new ExtendedBinaryReader(dbStream);
    _index       = new NpdIndex(new ExtendedBinaryReader(indexStream));

    Assembly = _index.Assembly;
    Version  = _index.Version;

    if (_index.SchemaVersion != SaCommon.SchemaVersion)
    {
        throw new UserErrorException($"SA schema version mismatch. Expected {SaCommon.SchemaVersion}, observed {_index.SchemaVersion}");
    }

    // the index yields (score, code) pairs; lookups here go from code to score
    var codeToScore = new Dictionary<byte, double>();
    foreach ((double score, byte code) in _index.ScoreMap)
    {
        codeToScore.Add(code, score);
    }

    _scoreMap = codeToScore.ToImmutableDictionary();
    _zstd     = new Zstandard();
    _scores   = new byte[NpdIndex.MaxChromLength];
}
/// <summary>
/// Loads an index from <paramref name="stream"/>. The whole stream is first copied into
/// memory, then the header fields and the per-chromosome block lists are parsed from that copy.
/// </summary>
/// <param name="stream">Stream containing the serialized index; it is read through to its end.</param>
public NsaIndex(Stream stream)
{
    using (var buffer = new MemoryStream())
    using (var reader = new ExtendedBinaryReader(buffer))
    {
        // pull every byte into memory, then rewind so parsing starts at the beginning
        stream.CopyTo(buffer);
        buffer.Position = 0;

        Assembly      = (GenomeAssembly)reader.ReadByte();
        Version       = DataSourceVersion.Read(reader);
        JsonKey       = reader.ReadAsciiString();
        MatchByAllele = reader.ReadBoolean();
        IsArray       = reader.ReadBoolean();
        SchemaVersion = reader.ReadOptInt32();
        IsPositional  = reader.ReadBoolean();

        var numChromosomes = reader.ReadOptInt32();
        _chromBlocks = new Dictionary<ushort, List<NsaIndexBlock>>(numChromosomes);

        for (var chrom = 0; chrom < numChromosomes; chrom++)
        {
            var chromIndex = reader.ReadOptUInt16();
            var numBlocks  = reader.ReadOptInt32();

            var blocks = new List<NsaIndexBlock>(numBlocks);
            _chromBlocks[chromIndex] = blocks;

            for (var block = 0; block < numBlocks; block++)
            {
                blocks.Add(new NsaIndexBlock(reader));
            }
        }
    }
}
/// <summary>
/// Writes a single VCF record through <c>LiteVcfWriter</c> into an in-memory stream and
/// checks, line by line, that the emitted header and record match the expected output.
/// </summary>
public void Vcf_header_write_as_expected()
{
    // the column header appears both in the input header and in the expected output
    const string columnHeaderLine = "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Mother";
    const string vcfLine = "1 10167 . C A 4 LowGQXHetSNP SNVSB=0.0;SNVHPOL=3;CSQT=1|DDX11L1|ENST00000456328.2|upstream_gene_variant,1|WASH7P|ENST00000438504.2|downstream_gene_variant,1|DDX11L1|NR_046018.2|upstream_gene_variant,1|WASH7P|NR_024540.1|downstream_gene_variant;CSQR=1|ENSR00001576074|regulatory_region_variant,1|ENSR00001576074|regulatory_region_variant GT:GQ:GQX:DP:DPF:AD 0/1:34:8:3:0:2,1";

    var outputStream = new MemoryStream();
    // leaveOpen = true so the memory stream can still be read back after the writer is disposed
    var streamWriter = new StreamWriter(outputStream, Encoding.Default, 1024, true);

    var currentHeaderLines = new List<string>
    {
        "##fileformat=VCFv4.1",
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
        "##source=IsaacVariantCaller",
        columnHeaderLine
    };

    var dataSourceVersions = new IDataSourceVersion[]
    {
        new DataSourceVersion("VEP", "84", DateTime.Parse("2017/7/21").Ticks, "RefSeq"),
        new DataSourceVersion("1000 Genomes Project", "v5", DateTime.Parse("2017/7/21").Ticks),
        new DataSourceVersion("dbSNP", "72", DateTime.Parse("2017/8/15").Ticks),
        // this one should not appear in the output
        new DataSourceVersion("dummy", "2", DateTime.Parse("2017/9/15").Ticks)
    };

    using (var vcfWriter = new LiteVcfWriter(streamWriter, currentHeaderLines, "Illumina Annotation Engine 2.0.4", "84.21.41", dataSourceVersions))
    {
        vcfWriter.Write(vcfLine);
    }

    var expectedLines = new List<string>
    {
        "##fileformat=VCFv4.1",
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
        "##source=IsaacVariantCaller",
        "##annotator=Illumina Annotation Engine 2.0.4",
        "##annotatorDataVersion=84.21.41",
        "##annotatorTranscriptSource=RefSeq",
        "##dataSource=1000 Genomes Project,version:v5,release date:2017-07-21",
        "##dataSource=dbSNP,version:72,release date:2017-08-15"
    };
    expectedLines.AddRange(_infoHeaderLines);
    expectedLines.Add(CsqtHeaderLine);
    expectedLines.Add(CsqrHeaderLine);
    expectedLines.Add(columnHeaderLine);
    expectedLines.Add(vcfLine);

    outputStream.Position = 0;
    using (var reader = new StreamReader(outputStream))
    {
        int lineCount = 0;
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            Assert.Equal(expectedLines[lineCount], line);
            lineCount++;
        }

        Assert.Equal(20, lineCount);
    }
}
/// <summary>
/// Builds a reader from already-loaded values, keeping the supplied dictionary of
/// interval arrays as-is.
/// </summary>
/// <param name="assembly">Stored in <c>Assembly</c>.</param>
/// <param name="version">Stored in <c>Version</c>.</param>
/// <param name="jsonKey">Stored in <c>JsonKey</c>.</param>
/// <param name="reportFor">Stored in <c>ReportFor</c>.</param>
/// <param name="intervalArrays">
/// Interval arrays keyed by a <c>ushort</c>; stored (not copied) in <c>_intervalArrays</c>.
/// </param>
private NsiReader(
    GenomeAssembly assembly,
    IDataSourceVersion version,
    string jsonKey,
    ReportFor reportFor,
    Dictionary<ushort, IntervalArray<string>> intervalArrays)
{
    // assigned left to right, i.e. in the same order as five separate statements
    (Assembly, Version, JsonKey, ReportFor, _intervalArrays) =
        (assembly, version, jsonKey, reportFor, intervalArrays);
}
/// <summary>
/// Builds a reader from already-loaded values, wrapping the supplied interval arrays
/// in an <c>IntervalForest&lt;string&gt;</c>.
/// </summary>
/// <param name="assembly">Stored in <c>Assembly</c>.</param>
/// <param name="version">Stored in <c>Version</c>.</param>
/// <param name="jsonKey">Stored in <c>JsonKey</c>.</param>
/// <param name="reportFor">Stored in <c>ReportFor</c>.</param>
/// <param name="intervalArrays">Passed to the <c>IntervalForest&lt;string&gt;</c> constructor.</param>
private NsiReader(
    GenomeAssembly assembly,
    IDataSourceVersion version,
    string jsonKey,
    ReportFor reportFor,
    IntervalArray<string>[] intervalArrays)
{
    (Assembly, Version, JsonKey, ReportFor) = (assembly, version, jsonKey, reportFor);

    // built last, after the plain property assignments, as before
    _intervalForest = new IntervalForest<string>(intervalArrays);
}
/// <summary>
/// Opens a protein conservation file and reads its header: schema version,
/// genome assembly and data source version, in that order.
/// </summary>
/// <param name="stream">Stream positioned at the start of the file header.</param>
/// <exception cref="InvalidDataException">
/// The schema version stored in the file differs from
/// <c>ProteinConservationCommon.SchemaVersion</c>.
/// </exception>
public ProteinConservationReader(Stream stream)
{
    _reader = new ExtendedBinaryReader(stream);

    var schemaVersion = _reader.ReadOptInt32();
    if (schemaVersion != ProteinConservationCommon.SchemaVersion)
    {
        // A specific exception type instead of the bare System.Exception: callers that
        // catch Exception still see it, while others can tell bad data apart from
        // programming errors.
        throw new InvalidDataException($"Schema version mismatch found. Observed: {schemaVersion}, expected: {ProteinConservationCommon.SchemaVersion}");
    }

    Assembly = (GenomeAssembly)_reader.ReadByte();
    Version  = DataSourceVersion.Read(_reader);
}
/// <summary>
/// Creates an index for writing. The header values are stored on the instance and
/// immediately serialized to <paramref name="indexWriter"/>; the per-chromosome chunk
/// lists start out empty.
/// </summary>
/// <param name="indexWriter">Writer that receives the header now and is kept in <c>_writer</c>.</param>
/// <param name="assembly">Genome assembly; written as a single byte.</param>
/// <param name="version">Data source version; serializes itself via <c>version.Write</c>.</param>
/// <param name="jsonKey">JSON key; written with <c>WriteOptAscii</c>.</param>
/// <param name="matchByAllele">Stored in <c>MatchByAllele</c> and written to the header.</param>
/// <param name="isArray">Stored in <c>IsArray</c> and written to the header.</param>
/// <param name="schemaVersion">Stored in <c>SchemaVersion</c> and written to the header.</param>
/// <param name="isPositional">Stored in <c>IsPositional</c> and written to the header.</param>
public ChunkedIndex(
    ExtendedBinaryWriter indexWriter,
    GenomeAssembly assembly,
    DataSourceVersion version,
    string jsonKey,
    bool matchByAllele,
    bool isArray,
    int schemaVersion,
    bool isPositional)
{
    _writer       = indexWriter;
    MatchByAllele = matchByAllele;
    JsonKey       = jsonKey;
    Version       = version;
    Assembly      = assembly;
    IsArray       = isArray;
    // Previously the schema version was written to the file but never kept on the
    // instance, so SchemaVersion stayed at its default for a writer-side index while the
    // stream-reading constructor populated it.
    SchemaVersion = schemaVersion;
    IsPositional  = isPositional;

    // header layout; the order must match what the stream-reading constructor expects
    indexWriter.Write((byte)assembly);
    version.Write(indexWriter);
    indexWriter.WriteOptAscii(jsonKey);
    indexWriter.Write(matchByAllele);
    indexWriter.Write(isArray);
    indexWriter.WriteOpt(schemaVersion);
    indexWriter.Write(isPositional);

    _chromChunks = new Dictionary<ushort, List<Chunk>>();
}
//todo: filter chromIndex=ushort.Max
/// <summary>
/// Creates a writer that targets a data stream and an index stream. Constructing the
/// writer also constructs the <c>NsaIndex</c> over the index writer, plus a block and an
/// in-memory scratch buffer, both sized by <paramref name="blockSize"/>.
/// </summary>
/// <param name="nsaStream">Destination stream for the data; wrapped in an <c>ExtendedBinaryWriter</c>.</param>
/// <param name="indexStream">Destination stream for the index; wrapped in an <c>ExtendedBinaryWriter</c>.</param>
/// <param name="version">Forwarded to the <c>NsaIndex</c> constructor.</param>
/// <param name="refProvider">
/// Kept in <c>_refProvider</c>. When it is non-null its <c>Assembly</c> is used in preference
/// to <paramref name="assembly"/>.
/// </param>
/// <param name="jsonKey">Forwarded to the <c>NsaIndex</c> constructor.</param>
/// <param name="matchByAllele">Forwarded to the <c>NsaIndex</c> constructor.</param>
/// <param name="isArray">Forwarded to the <c>NsaIndex</c> constructor.</param>
/// <param name="schemaVersion">Forwarded to the <c>NsaIndex</c> constructor.</param>
/// <param name="isPositional">Kept in <c>_isPositional</c> and forwarded to the <c>NsaIndex</c> constructor.</param>
/// <param name="skipIncorrectRefEntries">Kept in <c>_skipIncorrectRefEntries</c>.</param>
/// <param name="throwErrorOnConflicts">Kept in <c>_throwErrorOnConflicts</c>.</param>
/// <param name="blockSize">Size passed to <c>NsaBlock</c> and length of the scratch buffer.</param>
/// <param name="assembly">Fallback assembly, used when <paramref name="refProvider"/> does not supply one.</param>
/// <param name="leaveOpen">Kept in <c>_leaveOpen</c> and passed to both binary writers.</param>
public NsaWriter(
    Stream nsaStream,
    Stream indexStream,
    IDataSourceVersion version,
    ISequenceProvider refProvider,
    string jsonKey,
    bool matchByAllele,
    bool isArray,
    int schemaVersion,
    bool isPositional,
    bool skipIncorrectRefEntries = true,
    bool throwErrorOnConflicts = false,
    int blockSize = SaCommon.DefaultBlockSize,
    GenomeAssembly assembly = GenomeAssembly.Unknown,
    bool leaveOpen = false)
{
    // output streams and their binary writers
    _stream      = nsaStream;
    _indexStream = indexStream;
    _writer      = new ExtendedBinaryWriter(_stream, System.Text.Encoding.Default, leaveOpen);
    _indexWriter = new ExtendedBinaryWriter(_indexStream, System.Text.Encoding.Default, leaveOpen);

    // behavior flags
    _isPositional            = isPositional;
    _skipIncorrectRefEntries = skipIncorrectRefEntries;
    _throwErrorOnConflicts   = throwErrorOnConflicts;
    _refProvider             = refProvider;
    _leaveOpen               = leaveOpen;

    // the reference provider's assembly, when available, wins over the argument
    assembly = _refProvider?.Assembly ?? assembly;

    _block = new NsaBlock(new Zstandard(), blockSize);
    _index = new NsaIndex(_indexWriter, assembly, version, jsonKey, matchByAllele, isArray, schemaVersion, isPositional);

    // scratch buffer with a stream and writer layered on top of it
    _memBuffer = new byte[blockSize];
    _memStream = new MemoryStream(_memBuffer);
    _memWriter = new ExtendedBinaryWriter(_memStream);
}
/// <summary>
/// Reads and validates the file header from <c>_reader</c>, then loads the chromosome
/// intervals and marks the reader as initialized.
/// </summary>
/// <remarks>
/// A wrong identifier or schema version is fatal; a differing data version only prints a
/// warning to the console.
/// </remarks>
/// <exception cref="InvalidDataException">
/// The header identifier or the schema version does not match the values in <c>PhylopCommon</c>.
/// </exception>
private void LoadHeader()
{
    string identifier = _reader.ReadString();
    if (identifier != PhylopCommon.Header)
    {
        throw new InvalidDataException($"Unrecognized file header: {identifier}");
    }

    short schemaVersion = _reader.ReadInt16();
    if (schemaVersion != PhylopCommon.SchemaVersion)
    {
        throw new InvalidDataException($"Expected phylop schema version:{PhylopCommon.SchemaVersion} observed schema version: {schemaVersion}");
    }

    short dataVersion = _reader.ReadInt16();
    if (dataVersion != PhylopCommon.DataVersion)
    {
        // not fatal: report the mismatch and carry on
        Console.WriteLine($"WARNING: Expected phylop data version:{PhylopCommon.DataVersion} observed data version: {dataVersion}");
    }

    _genomeAssembly = (GenomeAssembly)_reader.ReadByte();
    _version        = DataSourceVersion.Read(_reader);

    // the reference name is stored in the header but not needed here; read it to move past it
    _reader.ReadString();

    _intervalListPosition = _reader.ReadInt64();

    CheckGuard();
    LoadChromosomeIntervals();

    IsInitialized = true;
}
/// <summary>
/// Opens a compressed file: reads the entire stream, decompresses it with Zstandard into
/// memory, and parses the header (data source version, JSON key, array flag, schema version).
/// </summary>
/// <param name="stream">Stream holding the compressed file; it is read through to its end.</param>
/// <exception cref="UserErrorException">
/// The schema version stored in the file differs from <c>SaCommon.SchemaVersion</c>.
/// </exception>
public NgaReader(Stream stream)
{
    _nsaStream = stream;

    // Read the whole file. A single Stream.Read call may return fewer bytes than are
    // available, and a fixed 2 MiB buffer would silently truncate a larger file and hand
    // a corrupt frame to the decompressor, so drain the stream to its end instead.
    byte[] compressedBytes;
    using (var compressedStream = new MemoryStream())
    {
        _nsaStream.CopyTo(compressedStream);
        compressedBytes = compressedStream.ToArray();
    }

    var decompressedBytes = new byte[20 * 1024 * 1024];

    var zstd = new Zstandard();
    var decompressedSize = zstd.Decompress(compressedBytes, compressedBytes.Length, decompressedBytes, decompressedBytes.Length);

    _memStream = new MemoryStream(decompressedBytes, 0, decompressedSize);
    _reader    = new ExtendedBinaryReader(_memStream);

    Version  = DataSourceVersion.Read(_reader);
    JsonKey  = _reader.ReadAsciiString();
    _isArray = _reader.ReadBoolean();

    ushort schemaVersion = _reader.ReadOptUInt16();
    if (schemaVersion != SaCommon.SchemaVersion)
    {
        throw new UserErrorException($"Expected schema version: {SaCommon.SchemaVersion}, observed: {schemaVersion} for {JsonKey}");
    }
}