Ejemplo n.º 1
0
        /// <summary>
        /// Deserializes a chunked index from <paramref name="stream"/>. The header is read in this
        /// order: assembly byte, data source version, JSON key, match-by-allele flag, is-array flag,
        /// schema version, is-positional flag. It is followed by the per-chromosome chunk lists.
        /// </summary>
        /// <param name="stream">Stream positioned at the start of the index; it is consumed to its end.</param>
        public ChunkedIndex(Stream stream)
        {
            // Stream.Read may return fewer bytes than requested, and a fixed 1 MiB buffer would
            // silently truncate a larger index. Buffer the complete stream in memory instead.
            using (var memStream = new MemoryStream())
            {
                stream.CopyTo(memStream);
                memStream.Position = 0;

                using (var memReader = new ExtendedBinaryReader(memStream))
                {
                    Assembly      = (GenomeAssembly)memReader.ReadByte();
                    Version       = DataSourceVersion.Read(memReader);
                    JsonKey       = memReader.ReadAsciiString();
                    MatchByAllele = memReader.ReadBoolean();
                    IsArray       = memReader.ReadBoolean();
                    SchemaVersion = memReader.ReadOptInt32();
                    IsPositional  = memReader.ReadBoolean();

                    var chromCount = memReader.ReadOptInt32();
                    _chromChunks = new Dictionary<ushort, List<Chunk>>(chromCount);

                    for (var i = 0; i < chromCount; i++)
                    {
                        var chromIndex = memReader.ReadOptUInt16();
                        var chunkCount = memReader.ReadOptInt32();

                        // register the list first, then fill it from the serialized chunks
                        var chunks = new List<Chunk>(chunkCount);
                        _chromChunks[chromIndex] = chunks;

                        for (var j = 0; j < chunkCount; j++)
                        {
                            chunks.Add(new Chunk(memReader));
                        }
                    }
                }
            }
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Creates a reader directly from already-materialized values; nothing is read from a stream here.
 /// </summary>
 /// <param name="version">Value stored in <c>Version</c>.</param>
 /// <param name="jsonKey">Value stored in <c>JsonKey</c>.</param>
 /// <param name="isArray">Value stored in the <c>_isArray</c> field.</param>
 /// <param name="geneSymbolToJsonStrings">Map kept in the <c>_geneSymbolToJsonStrings</c> field.</param>
 private NgaReader(IDataSourceVersion version, string jsonKey, bool isArray, Dictionary <string, List <string> > geneSymbolToJsonStrings)
 {
     // one-shot assignment; values are evaluated and stored left to right
     (Version, JsonKey, _isArray, _geneSymbolToJsonStrings) =
         (version, jsonKey, isArray, geneSymbolToJsonStrings);
 }
Ejemplo n.º 3
0
        /// <summary>
        /// Opens a reader over a data stream and its companion index stream. The index is parsed
        /// immediately and its schema version is validated against <c>SaCommon.SchemaVersion</c>.
        /// </summary>
        /// <param name="dbStream">Stream wrapped by the binary reader kept in <c>_reader</c>.</param>
        /// <param name="indexStream">Stream from which the <c>NpdIndex</c> is constructed.</param>
        /// <exception cref="UserErrorException">The index schema version differs from <c>SaCommon.SchemaVersion</c>.</exception>
        public NpdReader(Stream dbStream, Stream indexStream)
        {
            _dbStream    = dbStream;
            _indexStream = indexStream;
            _reader      = new ExtendedBinaryReader(dbStream);

            var indexReader = new ExtendedBinaryReader(indexStream);
            _index = new NpdIndex(indexReader);

            Assembly = _index.Assembly;
            Version  = _index.Version;

            bool schemaMatches = _index.SchemaVersion == SaCommon.SchemaVersion;
            if (!schemaMatches)
            {
                throw new UserErrorException($"SA schema version mismatch. Expected {SaCommon.SchemaVersion}, observed {_index.SchemaVersion}");
            }

            // the index stores (score, code) pairs; lookups need the inverse direction (code -> score)
            var codeToScore = new Dictionary<byte, double>();
            foreach ((double value, byte key) in _index.ScoreMap)
            {
                codeToScore.Add(key, value);
            }
            _scoreMap = codeToScore.ToImmutableDictionary();

            _zstd   = new Zstandard();
            _scores = new byte[NpdIndex.MaxChromLength];
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Deserializes an index from <paramref name="stream"/>. The whole stream is first copied into
        /// memory, then the header fields and the per-chromosome block lists are parsed from that copy.
        /// </summary>
        /// <param name="stream">Stream holding the serialized index; it is read to its end.</param>
        public NsaIndex(Stream stream)
        {
            using (var buffered = new MemoryStream())
            {
                using (var reader = new ExtendedBinaryReader(buffered))
                {
                    // pull every byte into memory, then rewind so parsing starts at the first byte
                    stream.CopyTo(buffered);
                    buffered.Position = 0;

                    Assembly      = (GenomeAssembly)reader.ReadByte();
                    Version       = DataSourceVersion.Read(reader);
                    JsonKey       = reader.ReadAsciiString();
                    MatchByAllele = reader.ReadBoolean();
                    IsArray       = reader.ReadBoolean();
                    SchemaVersion = reader.ReadOptInt32();
                    IsPositional  = reader.ReadBoolean();

                    var numChromosomes = reader.ReadOptInt32();
                    _chromBlocks = new Dictionary<ushort, List<NsaIndexBlock>>(numChromosomes);

                    for (var chrom = 0; chrom < numChromosomes; chrom++)
                    {
                        var chromIndex = reader.ReadOptUInt16();
                        var numBlocks  = reader.ReadOptInt32();

                        var blocks = new List<NsaIndexBlock>(numBlocks);
                        _chromBlocks[chromIndex] = blocks;

                        for (var block = 0; block < numBlocks; block++)
                        {
                            blocks.Add(new NsaIndexBlock(reader));
                        }
                    }
                }
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Writes one VCF line through <c>LiteVcfWriter</c> into an in-memory stream, then reads the
        /// stream back and compares every emitted line with the expected header and data lines.
        /// </summary>
        public void Vcf_header_write_as_expected()
        {
            var ms     = new MemoryStream();
            var writer = new StreamWriter(ms, Encoding.Default, 1024, true);

            var currentHeaderLines = new List<string>
            {
                "##fileformat=VCFv4.1",
                "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
                "##source=IsaacVariantCaller",
                "#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  Mother"
            };

            var vepVersion      = new DataSourceVersion("VEP", "84", DateTime.Parse("2017/7/21").Ticks, "RefSeq");
            var oneKGenVersion  = new DataSourceVersion("1000 Genomes Project", "v5", DateTime.Parse("2017/7/21").Ticks);
            var dbSnpVersion    = new DataSourceVersion("dbSNP", "72", DateTime.Parse("2017/8/15").Ticks);
            // this entry is not expected to show up in the output
            var dummyVersion    = new DataSourceVersion("dummy", "2", DateTime.Parse("2017/9/15").Ticks);

            var dataSourceVersions = new IDataSourceVersion[] { vepVersion, oneKGenVersion, dbSnpVersion, dummyVersion };

            const string vcfLine = "1       10167   .       C       A       4       LowGQXHetSNP    SNVSB=0.0;SNVHPOL=3;CSQT=1|DDX11L1|ENST00000456328.2|upstream_gene_variant,1|WASH7P|ENST00000438504.2|downstream_gene_variant,1|DDX11L1|NR_046018.2|upstream_gene_variant,1|WASH7P|NR_024540.1|downstream_gene_variant;CSQR=1|ENSR00001576074|regulatory_region_variant,1|ENSR00001576074|regulatory_region_variant     GT:GQ:GQX:DP:DPF:AD     0/1:34:8:3:0:2,1";

            using (var vcfWriter = new LiteVcfWriter(writer, currentHeaderLines, "Illumina Annotation Engine 2.0.4", "84.21.41", dataSourceVersions))
            {
                vcfWriter.Write(vcfLine);
            }

            // expected output: original meta lines, annotator lines, data sources,
            // INFO/CSQT/CSQR headers, the column header and finally the data line
            var expectedLines = new List<string>
            {
                "##fileformat=VCFv4.1",
                "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
                "##source=IsaacVariantCaller",
                "##annotator=Illumina Annotation Engine 2.0.4",
                "##annotatorDataVersion=84.21.41",
                "##annotatorTranscriptSource=RefSeq",
                "##dataSource=1000 Genomes Project,version:v5,release date:2017-07-21",
                "##dataSource=dbSNP,version:72,release date:2017-08-15"
            };

            expectedLines.AddRange(_infoHeaderLines);
            expectedLines.AddRange(new[]
            {
                CsqtHeaderLine,
                CsqrHeaderLine,
                "#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  Mother",
                vcfLine
            });

            ms.Position = 0;
            using (var reader = new StreamReader(ms))
            {
                var lineCount = 0;
                for (string observed = reader.ReadLine(); observed != null; observed = reader.ReadLine())
                {
                    Assert.Equal(expectedLines[lineCount], observed);
                    lineCount++;
                }

                Assert.Equal(20, lineCount);
            }
        }
Ejemplo n.º 6
0
 /// <summary>
 /// Creates a reader from already-loaded metadata and a per-key collection of interval arrays.
 /// </summary>
 /// <param name="assembly">Value stored in <c>Assembly</c>.</param>
 /// <param name="version">Value stored in <c>Version</c>.</param>
 /// <param name="jsonKey">Value stored in <c>JsonKey</c>.</param>
 /// <param name="reportFor">Value stored in <c>ReportFor</c>.</param>
 /// <param name="intervalArrays">Dictionary kept as-is in the <c>_intervalArrays</c> field.</param>
 private NsiReader(GenomeAssembly assembly, IDataSourceVersion version, string jsonKey, ReportFor reportFor, Dictionary <ushort, IntervalArray <string> > intervalArrays)
 {
     // one-shot assignment; values are evaluated and stored left to right
     (Assembly, Version, JsonKey, ReportFor, _intervalArrays) =
         (assembly, version, jsonKey, reportFor, intervalArrays);
 }
Ejemplo n.º 7
0
 /// <summary>
 /// Creates a reader from already-loaded metadata and wraps the supplied interval arrays in an
 /// <c>IntervalForest</c>.
 /// </summary>
 /// <param name="assembly">Value stored in <c>Assembly</c>.</param>
 /// <param name="version">Value stored in <c>Version</c>.</param>
 /// <param name="jsonKey">Value stored in <c>JsonKey</c>.</param>
 /// <param name="reportFor">Value stored in <c>ReportFor</c>.</param>
 /// <param name="intervalArrays">Arrays handed to the <c>IntervalForest</c> constructor.</param>
 private NsiReader(GenomeAssembly assembly, IDataSourceVersion version, string jsonKey, ReportFor reportFor, IntervalArray <string>[] intervalArrays)
 {
     // metadata first
     (Assembly, Version, JsonKey, ReportFor) = (assembly, version, jsonKey, reportFor);

     // then the search structure built over the supplied arrays
     var forest = new IntervalForest <string>(intervalArrays);
     _intervalForest = forest;
 }
        /// <summary>
        /// Opens a protein conservation reader over <paramref name="stream"/> and reads its header:
        /// schema version, then the assembly byte, then the data source version.
        /// </summary>
        /// <param name="stream">Stream wrapped by the binary reader kept in <c>_reader</c>.</param>
        /// <exception cref="InvalidDataException">
        /// The schema version in the stream differs from <c>ProteinConservationCommon.SchemaVersion</c>.
        /// </exception>
        public ProteinConservationReader(Stream stream)
        {
            _reader = new ExtendedBinaryReader(stream);

            var schemaVersion = _reader.ReadOptInt32();

            if (schemaVersion != ProteinConservationCommon.SchemaVersion)
            {
                // A specific exception type instead of bare System.Exception: the data is malformed for
                // this reader. InvalidDataException derives from Exception, so existing handlers still match.
                throw new InvalidDataException($"Schema version mismatch found. Observed: {schemaVersion}, expected: {ProteinConservationCommon.SchemaVersion}");
            }

            Assembly = (GenomeAssembly)_reader.ReadByte();
            Version  = DataSourceVersion.Read(_reader);
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Creates an empty index for writing and immediately serializes the header to
        /// <paramref name="indexWriter"/> in this order: assembly byte, data source version, JSON key,
        /// match-by-allele flag, is-array flag, schema version, is-positional flag.
        /// </summary>
        /// <param name="indexWriter">Writer that receives the header; also kept in <c>_writer</c>.</param>
        /// <param name="assembly">Genome assembly, written as a single byte.</param>
        /// <param name="version">Data source version; serialized through its own <c>Write</c> method.</param>
        /// <param name="jsonKey">JSON key written with <c>WriteOptAscii</c>.</param>
        /// <param name="matchByAllele">Flag written to the header and stored in <c>MatchByAllele</c>.</param>
        /// <param name="isArray">Flag written to the header and stored in <c>IsArray</c>.</param>
        /// <param name="schemaVersion">Schema version written to the header and stored in <c>SchemaVersion</c>.</param>
        /// <param name="isPositional">Flag written to the header and stored in <c>IsPositional</c>.</param>
        public ChunkedIndex(ExtendedBinaryWriter indexWriter, GenomeAssembly assembly, DataSourceVersion version, string jsonKey, bool matchByAllele, bool isArray, int schemaVersion, bool isPositional)
        {
            _writer       = indexWriter;
            MatchByAllele = matchByAllele;
            JsonKey       = jsonKey;
            Version       = version;
            Assembly      = assembly;
            IsArray       = isArray;
            // Previously the schema version was written to the header but never stored, so an index
            // built for writing reported the default value instead of the one it serialized.
            SchemaVersion = schemaVersion;
            IsPositional  = isPositional;

            indexWriter.Write((byte)assembly);
            version.Write(indexWriter);
            indexWriter.WriteOptAscii(jsonKey);
            indexWriter.Write(matchByAllele);
            indexWriter.Write(isArray);
            indexWriter.WriteOpt(schemaVersion);
            indexWriter.Write(isPositional);

            _chromChunks = new Dictionary <ushort, List <Chunk> >();
        }
Ejemplo n.º 10
0
        //todo: filter chromIndex=ushort.Max
        /// <summary>
        /// Sets up a writer over a data stream and an index stream. Both streams are wrapped in
        /// binary writers, an <c>NsaIndex</c> is created on the index writer, and a block plus an
        /// in-memory scratch buffer of <paramref name="blockSize"/> bytes are allocated.
        /// </summary>
        /// <param name="nsaStream">Destination stream for the data writer.</param>
        /// <param name="indexStream">Destination stream for the index writer.</param>
        /// <param name="version">Data source version forwarded to the index.</param>
        /// <param name="refProvider">Reference provider; when non-null its assembly overrides <paramref name="assembly"/>.</param>
        /// <param name="jsonKey">JSON key forwarded to the index.</param>
        /// <param name="matchByAllele">Flag forwarded to the index.</param>
        /// <param name="isArray">Flag forwarded to the index.</param>
        /// <param name="schemaVersion">Schema version forwarded to the index.</param>
        /// <param name="isPositional">Flag forwarded to the index and stored in <c>_isPositional</c>.</param>
        /// <param name="skipIncorrectRefEntries">Stored in <c>_skipIncorrectRefEntries</c>.</param>
        /// <param name="throwErrorOnConflicts">Stored in <c>_throwErrorOnConflicts</c>.</param>
        /// <param name="blockSize">Size passed to the <c>NsaBlock</c> and used for the scratch buffer.</param>
        /// <param name="assembly">Assembly used when <paramref name="refProvider"/> does not supply one.</param>
        /// <param name="leaveOpen">Passed to both binary writers and stored in <c>_leaveOpen</c>.</param>
        public NsaWriter(Stream nsaStream, Stream indexStream, IDataSourceVersion version, ISequenceProvider refProvider, string jsonKey, bool matchByAllele, bool isArray, int schemaVersion, bool isPositional, bool skipIncorrectRefEntries = true, bool throwErrorOnConflicts = false, int blockSize = SaCommon.DefaultBlockSize, GenomeAssembly assembly = GenomeAssembly.Unknown, bool leaveOpen = false)
        {
            // plain configuration
            _isPositional            = isPositional;
            _skipIncorrectRefEntries = skipIncorrectRefEntries;
            _throwErrorOnConflicts   = throwErrorOnConflicts;
            _refProvider             = refProvider;
            _leaveOpen               = leaveOpen;

            // output streams and their writers
            _stream      = nsaStream;
            _indexStream = indexStream;
            _writer      = new ExtendedBinaryWriter(_stream, System.Text.Encoding.Default, leaveOpen);
            _indexWriter = new ExtendedBinaryWriter(_indexStream, System.Text.Encoding.Default, leaveOpen);

            // the reference provider's assembly wins over the explicit argument when available
            var effectiveAssembly = _refProvider?.Assembly ?? assembly;

            _block = new NsaBlock(new Zstandard(), blockSize);
            _index = new NsaIndex(_indexWriter, effectiveAssembly, version, jsonKey, matchByAllele, isArray, schemaVersion, isPositional);

            // fixed-size scratch area backed by a byte array
            _memBuffer = new byte[blockSize];
            _memStream = new MemoryStream(_memBuffer);
            _memWriter = new ExtendedBinaryWriter(_memStream);
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Reads and validates the file header from <c>_reader</c>: identifier string, schema version,
        /// data version, assembly, data source version, reference name (skipped) and the interval list
        /// position. It then checks the guard, loads the chromosome intervals and marks the reader as
        /// initialized.
        /// </summary>
        /// <exception cref="InvalidDataException">The identifier or the schema version is not the expected one.</exception>
        private void LoadHeader()
        {
            string header = _reader.ReadString();
            if (header != PhylopCommon.Header)
            {
                throw new InvalidDataException($"Unrecognized file header: {header}");
            }

            var observedSchema = _reader.ReadInt16();
            if (observedSchema != PhylopCommon.SchemaVersion)
            {
                throw new InvalidDataException($"Expected phylop schema version:{PhylopCommon.SchemaVersion} observed schema version: {observedSchema}");
            }

            // a data version mismatch is only reported on the console, it is not fatal
            var observedData = _reader.ReadInt16();
            if (observedData != PhylopCommon.DataVersion)
            {
                Console.WriteLine($"WARNING: Expected phylop data version:{PhylopCommon.DataVersion} observed data version: {observedData}");
            }

            _genomeAssembly = (GenomeAssembly)_reader.ReadByte();
            _version        = DataSourceVersion.Read(_reader);

            // the reference name is stored in the header but not needed; read it to advance the reader
            _reader.ReadString();

            _intervalListPosition = _reader.ReadInt64();

            CheckGuard();
            LoadChromosomeIntervals();

            IsInitialized = true;
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Reads the complete compressed content of <paramref name="stream"/>, decompresses it with
        /// Zstandard into an in-memory buffer and parses the header (data source version, JSON key,
        /// is-array flag and schema version) from the decompressed bytes.
        /// </summary>
        /// <param name="stream">Stream holding the compressed data; it is read to its end and kept in <c>_nsaStream</c>.</param>
        /// <exception cref="UserErrorException">The schema version in the file differs from <c>SaCommon.SchemaVersion</c>.</exception>
        public NgaReader(Stream stream)
        {
            _nsaStream = stream;

            // Read the whole file. A single Stream.Read call may return fewer bytes than requested and
            // a fixed-size buffer would silently truncate larger files, so drain the stream instead.
            byte[] compressedBytes;
            using (var compressedStream = new MemoryStream())
            {
                _nsaStream.CopyTo(compressedStream);
                compressedBytes = compressedStream.ToArray();
            }

            var decompressedBytes = new byte[20 * 1024 * 1024];

            var zstd             = new Zstandard();
            var decompressedSize = zstd.Decompress(compressedBytes, compressedBytes.Length, decompressedBytes, decompressedBytes.Length);

            _memStream = new MemoryStream(decompressedBytes, 0, decompressedSize);
            _reader    = new ExtendedBinaryReader(_memStream);

            Version  = DataSourceVersion.Read(_reader);
            JsonKey  = _reader.ReadAsciiString();
            _isArray = _reader.ReadBoolean();
            ushort schemaVersion = _reader.ReadOptUInt16();

            if (schemaVersion != SaCommon.SchemaVersion)
            {
                throw new UserErrorException($"Expected schema version: {SaCommon.SchemaVersion}, observed: {schemaVersion} for {JsonKey}");
            }
        }