// Verifies that PhasedVcfWriter.WriteHeader() augments an existing Pisces VCF header:
//  * a "##VariantPhaser=Scylla ..." line is inserted directly after the Pisces command line, and
//  * the Scylla "##FILTER" lines implied by the writer config are added next to the existing
//    FILTER lines (scenario 1), or appended just before the "#CHROM" column line when the
//    original header has no FILTER lines at all (scenario 2).
// Both expectations are taken directly from the expected header lists below.
public void FilterHeader()
{
    // These two header lines appear verbatim in every original and expected header below.
    const string piscesCommandLine = "##Pisces_cmdline=\"-B KRAS_42_S1.bam -g -MinimumFrequency 0.01 -MinBaseCallQuality 21 -MaxVariantQScore 100 -MinCoverage 300 -MaxAcceptableStrandBiasFilter 0.5 -MinVariantQScore 20 -VariantQualityFilter 20 -gVCF true -CallMNVs True -out \\myout";
    const string columnHeader = "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HD700n560_miseq1_S7.bam";

    var outputFilePath = Path.Combine(TestPaths.LocalTestDataDirectory, "PhasedVcfFileWriterTests.vcf");
    File.Delete(outputFilePath);

    var variants = new List<CalledAllele>
    {
        TestHelper.CreateDummyAllele("chrX", 123, "A", "C", 1000, 156),
        TestHelper.CreateDummyAllele("chr10", 124, "A", "C", 1000, 156),
    };

    // ---- Scenario 1: diploid config, original header already contains FILTER lines ----
    // note, scylla has no SB or RMxN or R8 filters.
    var diploidConfig = new VcfWriterConfig
    {
        DepthFilterThreshold = 500,
        VariantQualityFilterThreshold = 30,
        FrequencyFilterThreshold = 0.007f,
        ShouldOutputNoCallFraction = true,
        ShouldOutputStrandBiasAndNoiseLevel = true,
        EstimatedBaseCallQuality = 23,
        PloidyModel = PloidyModel.Diploid,
    };

    var originalHeader1 = new List<string>
    {
        "##fileformat=VCFv4.1",
        "##fileDate=20160620",
        "##source=Pisces 1.0.0.0",
        piscesCommandLine,
        "##reference=WholeGenomeFASTA",
        "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">",
        "##FILTER=<ID=q20,Description=\"Quality score less than 20\">",
        "##FILTER=<ID=SB,Description=\"Variant strand bias too high\">",
        "##FILTER=<ID=R5x9,Description=\"Repeats of part or all of the variant allele (max repeat length 5) in the reference greater than or equal to 9\">",
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
        "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">",
        columnHeader
    };

    var expectedHeader1 = new List<string>
    {
        "##fileformat=VCFv4.1",
        "##fileDate=20160620",
        "##source=Pisces 1.0.0.0",
        piscesCommandLine,
        "##VariantPhaser=Scylla 1.0.0.0",
        "##reference=WholeGenomeFASTA",
        "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">",
        "##FILTER=<ID=q20,Description=\"Quality score less than 20\">",
        "##FILTER=<ID=SB,Description=\"Variant strand bias too high\">",
        "##FILTER=<ID=R5x9,Description=\"Repeats of part or all of the variant allele (max repeat length 5) in the reference greater than or equal to 9\">",
        "##FILTER=<ID=q30,Description=\"Quality score less than 30, by Scylla\">",
        "##FILTER=<ID=LowDP,Description=\"Low coverage (DP tag), therefore no genotype called, by Scylla\">",
        "##FILTER=<ID=LowVariantFreq,Description=\"Variant frequency less than 0.0070, by Scylla\">",
        "##FILTER=<ID=MultiAllelicSite,Description=\"Variant does not conform to diploid model, by Scylla\">",
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
        "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">",
        columnHeader
    };

    var writtenHeader1 = WriteVcfAndReadHeader(outputFilePath, diploidConfig, originalHeader1, variants);
    AssertHeaderMatchesIgnoringPhaserVersion(expectedHeader1, writtenHeader1);

    // ---- Scenario 2: somatic config, original header has no FILTER lines ----
    var somaticConfig = new VcfWriterConfig
    {
        DepthFilterThreshold = 500,
        VariantQualityFilterThreshold = 22,
        FrequencyFilterThreshold = 0.007f,
        EstimatedBaseCallQuality = 23,
        PloidyModel = PloidyModel.Somatic,
    };

    var originalHeader2 = new List<string>
    {
        "##fileformat=VCFv4.1",
        "##fileDate=20160620",
        "##source=Pisces 1.0.0.0",
        piscesCommandLine,
        "##reference=WholeGenomeFASTA",
        "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">",
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
        "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">",
        columnHeader
    };

    var expectedHeader2 = new List<string>
    {
        "##fileformat=VCFv4.1",
        "##fileDate=20160620",
        "##source=Pisces 1.0.0.0",
        piscesCommandLine,
        "##VariantPhaser=Scylla 1.0.0.0",
        "##reference=WholeGenomeFASTA",
        "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">",
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
        "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">",
        "##FILTER=<ID=q22,Description=\"Quality score less than 22, by Scylla\">",
        "##FILTER=<ID=LowDP,Description=\"Low coverage (DP tag), therefore no genotype called, by Scylla\">",
        "##FILTER=<ID=LowVariantFreq,Description=\"Variant frequency less than 0.0070, by Scylla\">",
        columnHeader
    };

    var writtenHeader2 = WriteVcfAndReadHeader(outputFilePath, somaticConfig, originalHeader2, variants);
    AssertHeaderMatchesIgnoringPhaserVersion(expectedHeader2, writtenHeader2);
}

/// <summary>
/// Writes a header plus <paramref name="variants"/> to <paramref name="outputFilePath"/> with a
/// <c>PhasedVcfWriter</c> built from <paramref name="config"/>, an empty input context and
/// <paramref name="originalHeader"/>, then reads the file back and returns its header lines.
/// </summary>
private static List<string> WriteVcfAndReadHeader(
    string outputFilePath,
    VcfWriterConfig config,
    List<string> originalHeader,
    List<CalledAllele> variants)
{
    var writer = new PhasedVcfWriter(outputFilePath, config, new VcfWriterInputContext(), originalHeader, null);
    writer.WriteHeader();
    writer.Write(variants);
    // The writer must be disposed before the file is re-opened for reading.
    writer.Dispose();

    var reader = new VcfReader(outputFilePath);
    List<string> writtenHeader = reader.HeaderLines;
    reader.Dispose();

    return writtenHeader;
}

/// <summary>
/// Asserts that <paramref name="written"/> equals <paramref name="expected"/> line by line.
/// The "##VariantPhaser=Scylla" line is only checked by prefix so that the version number
/// may differ between builds.
/// </summary>
private static void AssertHeaderMatchesIgnoringPhaserVersion(List<string> expected, List<string> written)
{
    const string phaserPrefix = "##VariantPhaser=Scylla";

    Assert.Equal(expected.Count, written.Count);

    for (int i = 0; i < expected.Count; i++)
    {
        // let version numbers differ
        if (expected[i].StartsWith(phaserPrefix, StringComparison.Ordinal))
        {
            Assert.True(written[i].StartsWith(phaserPrefix, StringComparison.Ordinal));
            continue;
        }

        Assert.Equal(expected[i], written[i]);
    }
}
// Drives VcfMerger through two neighborhoods (chr2 and chr7) using MergerInput.vcf as the
// original VCF, writes the merged result with a PhasedVcfWriter, and checks that the output
// file matches MergerOutput.vcf line for line.
public void WriteANbhd()
{
    var outputFilePath = Path.Combine(TestPaths.LocalTestDataDirectory, "PhasedVcfFileNbhdWriterTest.vcf");
    var inputFilePath = Path.Combine(TestPaths.LocalTestDataDirectory, "MergerInput.vcf");
    var expectedFilePath = Path.Combine(TestPaths.LocalTestDataDirectory, "MergerOutput.vcf");

    File.Delete(outputFilePath);

    var config = new VcfWriterConfig
    {
        DepthFilterThreshold = 500,
        VariantQualityFilterThreshold = 30,
        FrequencyFilterThreshold = 0.007f,
        ShouldOutputNoCallFraction = true,
        ShouldOutputStrandBiasAndNoiseLevel = true,
        EstimatedBaseCallQuality = 23,
        PloidyModel = PloidyModel.Somatic,
        AllowMultipleVcfLinesPerLoci = true
    };

    // The writer is given an empty input context and an empty original header.
    var writer = new PhasedVcfWriter(outputFilePath, config, new VcfWriterInputContext(), new List<string>() { }, null);
    var reader = new VcfReader(inputFilePath, true);

    // set up the original variants
    var originalVcfVariant1 = TestHelper.CreateDummyAllele("chr2", 116380048, "A", "New", 1000, 156);
    var originalVcfVariant2 = TestHelper.CreateDummyAllele("chr2", 116380048, "AAA", "New", 1000, 156);
    var originalVcfVariant4 = TestHelper.CreateDummyAllele("chr7", 116380051, "A", "New", 1000, 156);
    var originalVcfVariant5 = TestHelper.CreateDummyAllele("chr7", 116380052, "AC", "New", 1000, 156);

    var vs1 = new VariantSite(originalVcfVariant1);
    var vs2 = new VariantSite(originalVcfVariant2);
    var vs4 = new VariantSite(originalVcfVariant4);
    var vs5 = new VariantSite(originalVcfVariant5);

    // have to replace variants at position 116380048 (we call two new MNVs here)
    var nbhd1 = new VcfNeighborhood(new VariantCallingParameters(), 0, "chr2", vs1, vs2, "");
    nbhd1.SetRangeOfInterest();

    // have to replace variants at position 116380051 and 52 (we call one new MNV at 51)
    var nbhd2 = new VcfNeighborhood(new VariantCallingParameters(), 0, "chr7", vs4, vs5, "");
    nbhd2.SetRangeOfInterest();

    VcfMerger merger = new VcfMerger(reader);
    List<CalledAllele> allelesPastNbh = new List<CalledAllele>();

    nbhd1.CalledVariants = new Dictionary<int, List<CalledAllele>>
    {
        { originalVcfVariant1.ReferencePosition, new List<CalledAllele> { originalVcfVariant1, originalVcfVariant2 } }
    };
    nbhd2.CalledVariants = new Dictionary<int, List<CalledAllele>>
    {
        { originalVcfVariant4.ReferencePosition, new List<CalledAllele> { originalVcfVariant4 } }
    };

    // Each merger call returns the alleles it read past its stopping point; they are fed
    // into the next call so nothing from the input VCF is lost.
    allelesPastNbh = merger.WriteVariantsUptoChr(writer, allelesPastNbh, nbhd1.ReferenceName);
    allelesPastNbh = merger.WriteVariantsUptoIncludingNbhd(nbhd1, writer, allelesPastNbh);
    allelesPastNbh = merger.WriteVariantsUptoChr(writer, allelesPastNbh, nbhd2.ReferenceName);
    allelesPastNbh = merger.WriteVariantsUptoIncludingNbhd(nbhd2, writer, allelesPastNbh);
    merger.WriteRemainingVariants(writer, allelesPastNbh);

    // Dispose the writer before reading the output file back.
    writer.Dispose();

    var expectedLines = File.ReadLines(expectedFilePath).ToList();
    var outputLines = File.ReadLines(outputFilePath).ToList();

    Assert.Equal(expectedLines.Count, outputLines.Count);

    for (int i = 0; i < expectedLines.Count; i++)
    {
        Assert.Equal(expectedLines[i], outputLines[i]);
    }
}
// Drives VcfMerger through a single neighborhood at chr1 position 1 using TinyDiploid.vcf as
// the original VCF and a DiploidByThresholding writer config, checks which alleles the merger
// hands back after each step, and finally compares the written file with
// TinyDiploidOutput.vcf line for line.
public void WriteADiploidNbhd()
{
    var outputDir = Path.Combine(TestPaths.LocalScratchDirectory, "MergerWriteADiploidNbhd");
    var outputFilePath = Path.Combine(outputDir, "TinyDiploid.Phased.vcf");
    var inputFilePath = Path.Combine(TestPaths.LocalTestDataDirectory, "TinyDiploid.vcf");
    var expectedFilePath = Path.Combine(TestPaths.LocalTestDataDirectory, "TinyDiploidOutput.vcf");

    TestHelper.RecreateDirectory(outputDir);

    var config = new VcfWriterConfig
    {
        DepthFilterThreshold = 500,
        VariantQualityFilterThreshold = 30,
        FrequencyFilterThreshold = 0.007f,
        ShouldOutputNoCallFraction = true,
        ShouldOutputStrandBiasAndNoiseLevel = true,
        EstimatedBaseCallQuality = 23,
        PloidyModel = PloidyModel.DiploidByThresholding,
        AllowMultipleVcfLinesPerLoci = false
    };

    // The writer is given an empty input context and an empty original header.
    var writer = new PhasedVcfWriter(outputFilePath, config, new VcfWriterInputContext(), new List<string>() { }, null);
    var reader = new AlleleReader(inputFilePath, true);

    // set up the original variants
    var originalVcfVariant1 = TestHelper.CreateDummyAllele("chr1", 1, "A", "G", 1000, 156);
    var originalVcfVariant2 = TestHelper.CreateDummyAllele("chr1", 1, "A", "T", 1000, 156);
    var originalVcfVariant4 = TestHelper.CreateDummyAllele("chr22", 1230237, "GTC", "G", 1000, 156);
    var originalVcfVariant5 = TestHelper.CreateDummyAllele("chr22", 1230237, "GTC", "GTCT", 1000, 156);

    var vs1 = new VariantSite(originalVcfVariant1);
    var vs2 = new VariantSite(originalVcfVariant2);

    // the only neighborhood in this test is built from the two alleles at chr1 position 1
    var nbhd1 = new VcfNeighborhood(0, "chr1", vs1, vs2);
    var calledNbh1 = new CallableNeighborhood(nbhd1, new VariantCallingParameters());

    VcfMerger merger = new VcfMerger(reader);
    List<Tuple<CalledAllele, string>> alleleTuplesPastNbhd = new List<Tuple<CalledAllele, string>>();

    // we will just say, we called the variants that were in the original vcf. Ie, we agree with it.
    calledNbh1.CalledVariants = new Dictionary<int, List<CalledAllele>>
    {
        { originalVcfVariant1.ReferencePosition, new List<CalledAllele> { originalVcfVariant1, originalVcfVariant2 } }
    };

    // Realizes the first nbhd starts at chr1. We have to do something with the first lines of the vcf (chr1 1 . A G,T)
    // so, alleleTuplesPastNbhd = chr1 1 . A G,T
    alleleTuplesPastNbhd = merger.WriteVariantsUptoChr(writer, alleleTuplesPastNbhd, nbhd1.ReferenceName);

    Assert.True(alleleTuplesPastNbhd[0].Item1.IsSameAllele(originalVcfVariant1));
    Assert.True(alleleTuplesPastNbhd[1].Item1.IsSameAllele(originalVcfVariant2));

    // This method writes everything up to the end of nbhd 1,
    // so "(chr1 1 . A G,T)" from the vcf and the variants scylla detected "(chr1 1 . A G,T)" need to be dealt with.
    // Since these 4 variants are actually the same two, we need to remove the vcf ones and only write the scylla ones.
    // Then we peek into the vcf and see the next line is "chr22 1230237 . GTC G,GTCT", clearly outside nbh1.
    // so we write out everything we need for nbhd1, and save the peeked line
    alleleTuplesPastNbhd = merger.WriteVariantsUptoIncludingNbhd(writer, alleleTuplesPastNbhd, calledNbh1);

    Assert.True(alleleTuplesPastNbhd[0].Item1.IsSameAllele(originalVcfVariant4));
    Assert.True(alleleTuplesPastNbhd[1].Item1.IsSameAllele(originalVcfVariant5));

    // now write out
    // chr22 1230237 . GTC G,GTCT 50 DP=1370 GT:GQ:AD:DP:VF:NL:SB:NC:US 1/2:100:185,68:364:0.258:20:-100.0000:0.0000:0,0,0,0,0,0,1,1,0,0,0,2
    // chrX 79 . CG GTG,AA 50 DP=1370 GT:GQ:AD:DP:VF:NL:SB:NC:US 1/2:100:185,68:364:0.258:20:-100.0000:0.0000:0,0,0,0,0,0,1,1,0,0,0,2
    merger.WriteRemainingVariants(writer, alleleTuplesPastNbhd);

    // Dispose the writer before reading the output file back.
    writer.Dispose();

    var expectedLines = File.ReadLines(expectedFilePath).ToList();
    var outputLines = File.ReadLines(outputFilePath).ToList();

    Assert.Equal(expectedLines.Count, outputLines.Count);

    for (int i = 0; i < expectedLines.Count; i++)
    {
        Assert.Equal(expectedLines[i], outputLines[i]);
    }
}