/// <summary>
/// Feeds a small VCF whose CHROM column uses bare names ("1", "2") to
/// <c>VcfFilter.FastForward</c> and checks that the buffered line afterwards is the
/// record at position 100 on chromosome 1, i.e. the start of the 1:100-200 range.
/// </summary>
public void FastForward_EnsemblNamingStyle_ChangeReaderStateCorrectly()
{
    var rangeStart = new GenomicPosition(ChromosomeUtilities.Chr1, 100);
    var rangeEnd = new GenomicPosition(ChromosomeUtilities.Chr1, 200);
    var filter = new VcfFilter(new GenomicRange(rangeStart, rangeEnd));

    const string firstLineInRange = "1\t100\t.\tC\tT\t165.00\tPASS\tSNVSB=-12.5;SNVHPOL=2\tGT:GQ:GQX:DP:DPF:AD\t0/1:119:35:25:0:8,17";

    // Header lines, one record on another chromosome, two records before the range,
    // then the expected record followed by one more record inside the range.
    string[] vcfLines =
    {
        "#Header line 1",
        "#Header line 2",
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tMother",
        "2\t150\t.\tG\tA\t5.00\tLowGQXHetSNP\tSNVSB=0.0;SNVHPOL=2\tGT:GQ:GQX:DP:DPF:AD\t0/1:3:1:1:0:0,1",
        "1\t90\t.\tT\tC\t1.00\tLowGQXHetSNP\tSNVSB=0.0;SNVHPOL=2\tGT:GQ:GQX:DP:DPF:AD\t0/1:23:9:3:0:2,1",
        "1\t95\t.\tA\tT\t2.00\tLowGQXHetSNP\tSNVSB=0.0;SNVHPOL=2\tGT:GQ:GQX:DP:DPF:AD\t0/1:23:9:3:0:2,1",
        firstLineInRange,
        "1\t102\t.\tC\tA\t3.00\tLowGQXHetSNP\tSNVSB=0.0;SNVHPOL=5\tGT:GQ:GQX:DP:DPF:AD\t0/1:29:2:2:0:1,1"
    };

    using (var stream = new MemoryStream())
    {
        // The writer must be disposed (and thereby flushed) before the stream is rewound;
        // leaveOpen keeps the MemoryStream usable for the reader below.
        using (var writer = new StreamWriter(stream, Encoding.UTF8, 1024, leaveOpen: true))
        {
            foreach (string vcfLine in vcfLines)
            {
                writer.WriteLine(vcfLine);
            }
        }

        stream.Position = 0;

        using (var reader = new StreamReader(stream))
        {
            filter.FastForward(reader);

            Assert.Equal(firstLineInRange, filter.BufferedLine);
        }
    }
}
/// <summary>
/// Checks that scanning the test VCF for chr1:10019-10290 yields a single
/// chromosome entry holding three positions.
/// </summary>
public void GetPositions_inRange()
{
    // Arrange
    var chr1 = new Chromosome("chr1", "1", 0);
    var rangeStart = new GenomicPosition(chr1, 10019);
    var rangeEnd = new GenomicPosition(chr1, 10290);
    var range = new GenomicRange(rangeStart, rangeEnd);
    var sequenceProvider = ParserTestUtils.GetSequenceProvider(10329, "AC", 'A', _chromDict);

    // Act
    var positionsByChromosome = PreLoadUtilities.GetPositions(GetVcfStream(), range, sequenceProvider);

    // Assert
    Assert.Single(positionsByChromosome);
    Assert.Equal(3, positionsByChromosome[Chrom1].Count);
}
/// <summary>
/// Checks that scanning the test VCF for chromosome 1, positions 10019-10290, with no
/// ref-minor provider yields a single chromosome entry holding three positions.
/// </summary>
public void GetPositions_inRange()
{
    // Arrange
    var rangeStart = new GenomicPosition(ChromosomeUtilities.Chr1, 10019);
    var rangeEnd = new GenomicPosition(ChromosomeUtilities.Chr1, 10290);
    var range = new GenomicRange(rangeStart, rangeEnd);
    var sequenceProvider = ParserTestUtils.GetSequenceProvider(10329, "AC", 'A', ChromosomeUtilities.RefNameToChromosome);

    // Act: only the dictionary half of the returned tuple is of interest here.
    var scanResult = PreLoadUtilities.GetPositions(GetVcfStream(), range, sequenceProvider, null);
    var positionsByChromosome = scanResult.PositionsByChromosome;

    // Assert
    Assert.Single(positionsByChromosome);
    Assert.Equal(3, positionsByChromosome[ChromosomeUtilities.Chr1].Count);
}
/// <summary>
/// For a filter over chromosome 1, positions 100-200, <c>PassedTheEnd</c> is expected
/// to be false up to and including position 200, true at 201, and true for a
/// position on chromosome 2.
/// </summary>
public void PassedTheEnd_AsExpected()
{
    var rangeStart = new GenomicPosition(ChromosomeUtilities.Chr1, 100);
    var rangeEnd = new GenomicPosition(ChromosomeUtilities.Chr1, 200);
    var filter = new VcfFilter(new GenomicRange(rangeStart, rangeEnd));

    // Inside the range, including the inclusive end position.
    Assert.False(filter.PassedTheEnd(ChromosomeUtilities.Chr1, 150));
    Assert.False(filter.PassedTheEnd(ChromosomeUtilities.Chr1, 200));

    // One past the end, and a different chromosome altogether.
    Assert.True(filter.PassedTheEnd(ChromosomeUtilities.Chr1, 201));
    Assert.True(filter.PassedTheEnd(ChromosomeUtilities.Chr2, 150));
}
/// <summary>
/// Populates <c>_variantPositions</c> by scanning <paramref name="vcfStream"/> for the
/// positions within <paramref name="genomicRange"/>.
/// </summary>
/// <param name="vcfStream">VCF stream to scan; when null, <c>_variantPositions</c> is reset to null.</param>
/// <param name="genomicRange">Range forwarded to <c>PreLoadUtilities.GetPositions</c>.</param>
public void GetVariantPositions(Stream vcfStream, GenomicRange genomicRange)
{
    if (vcfStream == null)
    {
        _variantPositions = null;
    }
    else
    {
        // Seek to the block offset encoded in the input's starting virtual position.
        var startVirtualPosition = Tabix.VirtualPosition.From(InputStartVirtualPosition);
        vcfStream.Position = startVirtualPosition.BlockOffset;

        var scannedPositions = PreLoadUtilities.GetPositions(vcfStream, genomicRange, SequenceProvider);
        _variantPositions = scannedPositions.ToImmutableDictionary();
    }
}
/// <summary>
/// For a filter over chr1:100-200, <c>PassedTheEnd</c> is expected to be false up to and
/// including position 200, true at 201, and true for a position on chr2. Each query
/// uses a freshly constructed <c>Chromosome</c> rather than the instance the range was
/// built from.
/// </summary>
public void PassedTheEnd_AsExpected()
{
    // Factories so that every query below receives its own Chromosome instance.
    Chromosome NewChr1() => new Chromosome("chr1", "1", 0);
    Chromosome NewChr2() => new Chromosome("chr2", "2", 1);

    var rangeChromosome = NewChr1();
    var rangeStart = new GenomicPosition(rangeChromosome, 100);
    var rangeEnd = new GenomicPosition(rangeChromosome, 200);
    var filter = new VcfFilter(new GenomicRange(rangeStart, rangeEnd));

    Assert.False(filter.PassedTheEnd(NewChr1(), 150));
    Assert.False(filter.PassedTheEnd(NewChr1(), 200));
    Assert.True(filter.PassedTheEnd(NewChr1(), 201));
    Assert.True(filter.PassedTheEnd(NewChr2(), 150));
}
/// <summary>
/// Populates <c>_variantPositions</c> by scanning <paramref name="vcfStream"/> for the
/// positions within <paramref name="genomicRange"/>, and reports the number of
/// positions found through <c>Metrics</c>.
/// </summary>
/// <param name="vcfStream">VCF stream to scan; when null, <c>_variantPositions</c> is reset to null and nothing is measured.</param>
/// <param name="genomicRange">Range forwarded to <c>PreLoadUtilities.GetPositions</c>.</param>
public void GetVariantPositions(Stream vcfStream, GenomicRange genomicRange)
{
    if (vcfStream == null)
    {
        _variantPositions = null;
        return;
    }

    // Seek to the block offset encoded in the input's starting virtual position.
    var startVirtualPosition = Tabix.VirtualPosition.From(InputStartVirtualPosition);
    vcfStream.Position = startVirtualPosition.BlockOffset;

    // The scan timer is started after seeking, immediately before the scan itself.
    Metrics.SaPositionScan.Start();
    var scan = PreLoadUtilities.GetPositions(vcfStream, genomicRange, SequenceProvider, RefMinorProvider);
    _variantPositions = scan.PositionsByChromosome;

    Metrics.ShowSaPositionScanLoad(scan.Count);
}
/// <summary>
/// Creates a filter for <paramref name="genomicRange"/>, storing the range itself and
/// a <c>GenomicRangeChecker</c> built from it.
/// </summary>
/// <param name="genomicRange">The range this filter operates on.</param>
public VcfFilter(GenomicRange genomicRange) =>
    (_genomicRange, _genomicRangeChecker) = (genomicRange, new GenomicRangeChecker(genomicRange));
/// <summary>
/// Reads <paramref name="vcfStream"/> line by line and collects positions per chromosome
/// for SA pre-loading, writing a progress message and a timing summary to the console.
/// </summary>
/// <param name="vcfStream">VCF stream to scan. It is wrapped in a <c>StreamReader</c> that is disposed when scanning ends.</param>
/// <param name="genomicRange">Range used to build the <c>GenomicRangeChecker</c>; scanning stops at the first record it reports as out of range.</param>
/// <param name="sequenceProvider">Supplies the reference-name lookup and the sequence of the chromosome being processed.</param>
/// <returns>The collected positions keyed by chromosome, after <c>SortPositionsAndGetCount</c> has run.</returns>
public static IDictionary<IChromosome, List<int>> GetPositions(Stream vcfStream, GenomicRange genomicRange, ISequenceProvider sequenceProvider)
{
    var timer = new Benchmark();
    Console.Write("Scanning positions required for SA pre-loading....");

    var positionsByChromosome = new Dictionary<IChromosome, List<int>>();
    var checker = new GenomicRangeChecker(genomicRange);
    var nameToChromosome = sequenceProvider.RefNameToChromosome;

    using (var vcfReader = new StreamReader(vcfStream))
    {
        for (string record = vcfReader.ReadLine(); record != null; record = vcfReader.ReadLine())
        {
            // Lines the helper rejects are skipped without further parsing.
            bool shouldProcess = NeedProcessThisLine(nameToChromosome, record, out var columns, out IChromosome chromosome);
            if (!shouldProcess)
            {
                continue;
            }

            int position = int.Parse(columns[VcfCommon.PosIndex]);

            // The first out-of-range record ends the scan entirely.
            if (checker.OutOfRange(chromosome, position))
            {
                break;
            }

            string reference = columns[VcfCommon.RefIndex];
            string alternate = columns[VcfCommon.AltIndex];

            // Load the chromosome first so that Sequence below refers to it.
            sequenceProvider.LoadChromosome(chromosome);
            UpdateChromToPositions(positionsByChromosome, chromosome, position, reference, alternate, sequenceProvider.Sequence);
        }
    }

    int totalPositions = SortPositionsAndGetCount(positionsByChromosome);
    string elapsed = Benchmark.ToHumanReadable(timer.GetElapsedTime());
    Console.WriteLine($"{totalPositions} positions found in {elapsed}");

    return positionsByChromosome;
}
/// <summary>
/// Reads <paramref name="vcfStream"/> line by line and collects, per chromosome, the
/// positions passed to <c>TryAddPosition</c>.
/// </summary>
/// <param name="vcfStream">VCF stream to scan. It is wrapped in a <c>StreamReader</c> that is disposed when scanning ends.</param>
/// <param name="genomicRange">Range used to build the <c>GenomicRangeChecker</c>; scanning stops at the first record it reports as out of range.</param>
/// <param name="sequenceProvider">Supplies the reference-name lookup and the sequence of the chromosome being processed.</param>
/// <param name="refMinorProvider">Forwarded to <c>IsRefMinor</c> to decide whether a record whose ALT column is "." is kept.</param>
/// <returns>The positions keyed by chromosome, and the count returned by <c>SortPositionsAndGetCount</c>.</returns>
/// <exception cref="InvalidDataException">The POS column of a record cannot be converted to an integer.</exception>
public static (ImmutableDictionary<IChromosome, List<int>> PositionsByChromosome, int Count) GetPositions(Stream vcfStream, GenomicRange genomicRange, ISequenceProvider sequenceProvider, IRefMinorProvider refMinorProvider)
{
    var positionsByChromosome = new Dictionary<IChromosome, List<int>>();
    var rangeChecker = new GenomicRangeChecker(genomicRange);
    var refNameToChrom = sequenceProvider.RefNameToChromosome;

    using (var reader = new StreamReader(vcfStream))
    {
        string line;

        // Cache of the last successfully resolved reference name and its chromosome.
        // Invariant: chromosome is always the lookup result for currentReferenceName.
        // The name starts as null (not "") so that even an empty CHROM column goes
        // through the lookup instead of matching the sentinel with nothing resolved.
        string currentReferenceName = null;
        IChromosome chromosome = null;

        while ((line = reader.ReadLine()) != null)
        {
            if (line.StartsWith('#'))
            {
                continue;
            }

            string[] cols = line.OptimizedSplit('\t');
            string referenceName = cols[VcfCommon.ChromIndex];

            if (referenceName != currentReferenceName)
            {
                // Resolve into a temporary: a failed TryGetValue sets its out argument
                // to null, which must not wipe the cached chromosome while
                // currentReferenceName still names the previously resolved contig.
                if (!refNameToChrom.TryGetValue(referenceName, out var resolvedChromosome))
                {
                    continue;
                }

                chromosome = resolvedChromosome;
                currentReferenceName = referenceName;
            }

            (int position, bool foundError) = cols[VcfCommon.PosIndex].OptimizedParseInt32();
            if (foundError)
            {
                throw new InvalidDataException($"Unable to convert the VCF position to an integer: {cols[VcfCommon.PosIndex]}");
            }

            // The first out-of-range record ends the scan entirely.
            if (rangeChecker.OutOfRange(chromosome, position))
            {
                break;
            }

            string refAllele = cols[VcfCommon.RefIndex];
            string altAllele = cols[VcfCommon.AltIndex];

            // Records without an alternate allele are kept only when IsRefMinor says so.
            if (altAllele == "." && !IsRefMinor(refMinorProvider, chromosome, position))
            {
                continue;
            }

            // Load the chromosome first so that Sequence below refers to it.
            sequenceProvider.LoadChromosome(chromosome);
            TryAddPosition(positionsByChromosome, chromosome, position, refAllele, altAllele, sequenceProvider.Sequence);
        }
    }

    int count = SortPositionsAndGetCount(positionsByChromosome);
    return (positionsByChromosome.ToImmutableDictionary(), count);
}
/// <summary>
/// Creates a checker that keeps a reference to <paramref name="genomicRange"/>.
/// </summary>
/// <param name="genomicRange">The range stored in <c>_genomicRange</c>.</param>
public GenomicRangeChecker(GenomicRange genomicRange) => _genomicRange = genomicRange;