//Test that we get the same results, in the same order, when using multiple samples and intervals.
//First test running the two samples together, then test running the two samples individually,
//then test the two samples together again with the third GenomeProcessor argument set to false (the "thread by chr" run).
//Nothing strange should happen.
//Based on a real bug where a gvcf was found to be out of order; that only happened for multiple-bam runs with different interval files.
public void IntervalTestingWithMultipleSamples()
{
    var bamFile1Path = Path.Combine(UnitTestPaths.TestDataDirectory, "Chr17Chr19.bam"); //has data from chr17,7572952 and chr19,3118883
    var bamFile2Path = Path.Combine(UnitTestPaths.TestDataDirectory, "Chr17again.bam");
    var interval1Path = Path.Combine(UnitTestPaths.TestDataDirectory, "chr17int.picard"); //chr 17 only
    var interval2Path = Path.Combine(UnitTestPaths.TestDataDirectory, "poorlyOrdered.picard"); //disordered, chr 19 first.
    var outDir = Path.Combine(UnitTestPaths.WorkingDirectory, "IntervalTests");
    var vcfFile1Path = Path.Combine(outDir, "Chr17Chr19.genome.vcf"); //only results from chr17
    var vcfFile2Path = Path.Combine(outDir, "Chr17again.genome.vcf"); //show results from chr17 and 19
    var vcfExpectedFile1 = Path.Combine(UnitTestPaths.TestDataDirectory, "Chr17Chr19.expected.genome.vcf"); //only results from chr17
    var vcfExpectedFile2 = Path.Combine(UnitTestPaths.TestDataDirectory, "Chr17again.expected.genome.vcf"); //show results from chr17 and 19
    var genomeDirectory = Path.Combine(UnitTestPaths.TestGenomesDirectory, "fourChrs");

    var twoSampleFactory = MakeFactory(new List<string> { bamFile1Path, bamFile2Path }, new List<string> { interval1Path, interval2Path }, outDir);
    var firstSampleFactory = MakeFactory(new List<string> { bamFile1Path }, new List<string> { interval1Path }, outDir);
    var secondSampleFactory = MakeFactory(new List<string> { bamFile2Path }, new List<string> { interval2Path }, outDir);

    //regular two-sample run mode.
    var genome = twoSampleFactory.GetReferenceGenome(genomeDirectory);
    var genome1 = firstSampleFactory.GetReferenceGenome(genomeDirectory);
    var genome2 = secondSampleFactory.GetReferenceGenome(genomeDirectory);

    var processor = new GenomeProcessor(twoSampleFactory, genome);

    var chrs = genome.ChromosomesToProcess;
    Assert.Equal("chr7", chrs[0]);
    Assert.Equal("chr8", chrs[1]);
    Assert.Equal("chr17", chrs[2]);
    Assert.Equal("chr19", chrs[3]);

    processor.InternalExecute(10);

    //the chromosome order must be unchanged by the run
    chrs = genome.ChromosomesToProcess;
    Assert.Equal("chr7", chrs[0]);
    Assert.Equal("chr8", chrs[1]);
    Assert.Equal("chr17", chrs[2]);
    Assert.Equal("chr19", chrs[3]);

    //just be aware, when we process the samples individually, we use different genome lists.
    Assert.Equal(4, genome.ChromosomesToProcess.Count);
    Assert.Equal(1, genome1.ChromosomesToProcess.Count);
    Assert.Equal(4, genome2.ChromosomesToProcess.Count);
    Assert.Equal("chr17", genome1.ChromosomesToProcess[0]);
    Assert.Equal("chr7", genome2.ChromosomesToProcess[0]);
    Assert.Equal("chr19", genome2.ChromosomesToProcess[3]);

    //the expected results. These readers stay open for the rest of the test and are
    //disposed by the using statements even if an assertion below throws.
    using (var readerExp1 = new VcfReader(vcfExpectedFile1))
    using (var readerExp2 = new VcfReader(vcfExpectedFile2))
    {
        var contigs1Expected = GetContigs(readerExp1);
        var contigs2Expected = GetContigs(readerExp2);
        var vcf1Expected = readerExp1.GetVariants().ToList();
        var vcf2Expected = readerExp2.GetVariants().ToList();

        using (var reader1 = new VcfReader(vcfFile1Path))
        using (var reader2 = new VcfReader(vcfFile2Path))
        {
            var contigs1Results = GetContigs(reader1);
            var contigs2Results = GetContigs(reader2);
            var vcf1Results = reader1.GetVariants().ToList();
            var vcf2Results = reader2.GetVariants().ToList();

            Assert.Equal(4, contigs1Results.Count);
            Assert.Equal(4, contigs2Results.Count);
            Assert.Equal(11, vcf1Results.Count);
            Assert.Equal(71, vcf2Results.Count);

            //check variants and contigs all come out the same
            CheckForOrdering(contigs1Results, contigs2Results, contigs1Expected, contigs2Expected, vcf1Expected, vcf2Expected);
        }

        //the result readers are closed by now, so the output files can be removed before the next run
        File.Delete(vcfFile1Path);
        File.Delete(vcfFile2Path);

        //now check again, processing them separately
        processor = new GenomeProcessor(firstSampleFactory, genome1);
        processor.InternalExecute(10);
        processor = new GenomeProcessor(secondSampleFactory, genome2);
        processor.InternalExecute(10);

        using (var reader1 = new VcfReader(vcfFile1Path))
        using (var reader2 = new VcfReader(vcfFile2Path))
        {
            var contigs1Results = GetContigs(reader1);
            var contigs2Results = GetContigs(reader2);

            //NOTE(review): the result variants are read in full but never compared to anything;
            //CheckForOrdering is only handed the expected variants. Confirm whether a
            //result-vs-expected comparison was intended here.
            reader1.GetVariants().ToList();
            reader2.GetVariants().ToList();

            //check variants all come out the same (the contigs will be different as shown)
            CheckForOrdering(contigs1Results, contigs2Results, new List<string>() { "chr17" }, contigs2Expected, vcf1Expected, vcf2Expected);
        }

        //delete BOTH outputs, so a stale file from this phase cannot satisfy the checks of the next phase
        File.Delete(vcfFile1Path);
        File.Delete(vcfFile2Path);

        //now check again, processing them "thread by chr" way
        processor = new GenomeProcessor(twoSampleFactory, genome, false);
        processor.InternalExecute(10);

        using (var reader1 = new VcfReader(vcfFile1Path))
        using (var reader2 = new VcfReader(vcfFile2Path))
        {
            var contigs1Results = GetContigs(reader1);
            var contigs2Results = GetContigs(reader2);

            //NOTE(review): as above, the result variants are read but not compared.
            reader1.GetVariants().ToList();
            reader2.GetVariants().ToList();

            //check variants all come out the same (the contigs will be back to normal).
            //contigs2Expected is passed as the expectation for both samples, as in the original test.
            CheckForOrdering(contigs1Results, contigs2Results, contigs2Expected, contigs2Expected, vcf1Expected, vcf2Expected);
        }

        File.Delete(vcfFile1Path);
        File.Delete(vcfFile2Path);
    }
}
//Checks the header that PhasedVcfWriter writes when it is handed an existing Pisces header.
//Two configurations are written to the same output file, one after the other:
//  1) a diploid config with an original header that already has ##FILTER lines;
//  2) a somatic config with an original header that has no ##FILTER lines.
//In both cases the test expects the original lines to be kept, a "##VariantPhaser=Scylla ..." line
//to follow the Pisces command line, and the "by Scylla" ##FILTER lines listed in the expected headers.
public void FilterHeader()
{
    var outputFilePath = Path.Combine(TestPaths.LocalTestDataDirectory, "PhasedVcfFileWriterTests.vcf");
    File.Delete(outputFilePath);

    var config = new VcfWriterConfig
    {
        DepthFilterThreshold = 500,
        VariantQualityFilterThreshold = 30,
        FrequencyFilterThreshold = 0.007f,
        ShouldOutputNoCallFraction = true,
        ShouldOutputStrandBiasAndNoiseLevel = true,
        EstimatedBaseCallQuality = 23,
        PloidyModel = PloidyModel.Diploid,
    };

    //note, scylla has no SB or RMxN or R8 filters.
    var variants = new List<CalledAllele>
    {
        TestHelper.CreateDummyAllele("chrX", 123, "A", "C", 1000, 156),
        TestHelper.CreateDummyAllele("chr10", 124, "A", "C", 1000, 156),
    };

    var originalHeader = new List<string>
    {
        "##fileformat=VCFv4.1",
        "##fileDate=20160620",
        "##source=Pisces 1.0.0.0",
        "##Pisces_cmdline=\"-B KRAS_42_S1.bam -g -MinimumFrequency 0.01 -MinBaseCallQuality 21 -MaxVariantQScore 100 -MinCoverage 300 -MaxAcceptableStrandBiasFilter 0.5 -MinVariantQScore 20 -VariantQualityFilter 20 -gVCF true -CallMNVs True -out \\myout",
        "##reference=WholeGenomeFASTA",
        "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">",
        "##FILTER=<ID=q20,Description=\"Quality score less than 20\">",
        "##FILTER=<ID=SB,Description=\"Variant strand bias too high\">",
        "##FILTER=<ID=R5x9,Description=\"Repeats of part or all of the variant allele (max repeat length 5) in the reference greater than or equal to 9\">",
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
        "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">",
        "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HD700n560_miseq1_S7.bam"
    };

    List<string> writtenHeader = WriteAndReadBackHeader(outputFilePath, config, originalHeader, variants);

    var expectedHeader1 = new List<string>
    {
        "##fileformat=VCFv4.1",
        "##fileDate=20160620",
        "##source=Pisces 1.0.0.0",
        "##Pisces_cmdline=\"-B KRAS_42_S1.bam -g -MinimumFrequency 0.01 -MinBaseCallQuality 21 -MaxVariantQScore 100 -MinCoverage 300 -MaxAcceptableStrandBiasFilter 0.5 -MinVariantQScore 20 -VariantQualityFilter 20 -gVCF true -CallMNVs True -out \\myout",
        "##VariantPhaser=Scylla 1.0.0.0",
        "##reference=WholeGenomeFASTA",
        "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">",
        "##FILTER=<ID=q20,Description=\"Quality score less than 20\">",
        "##FILTER=<ID=SB,Description=\"Variant strand bias too high\">",
        "##FILTER=<ID=R5x9,Description=\"Repeats of part or all of the variant allele (max repeat length 5) in the reference greater than or equal to 9\">",
        "##FILTER=<ID=q30,Description=\"Quality score less than 30, by Scylla\">",
        "##FILTER=<ID=LowDP,Description=\"Low coverage (DP tag), therefore no genotype called, by Scylla\">",
        "##FILTER=<ID=LowVariantFreq,Description=\"Variant frequency less than 0.0070, by Scylla\">",
        "##FILTER=<ID=MultiAllelicSite,Description=\"Variant does not conform to diploid model, by Scylla\">",
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
        "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">",
        "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HD700n560_miseq1_S7.bam"
    };

    AssertHeaderMatchesIgnoringScyllaVersion(expectedHeader1, writtenHeader);

    //second pass: somatic model, lower quality threshold, and an original header without any ##FILTER lines
    config = new VcfWriterConfig
    {
        DepthFilterThreshold = 500,
        VariantQualityFilterThreshold = 22,
        FrequencyFilterThreshold = 0.007f,
        EstimatedBaseCallQuality = 23,
        PloidyModel = PloidyModel.Somatic,
    };

    originalHeader = new List<string>
    {
        "##fileformat=VCFv4.1",
        "##fileDate=20160620",
        "##source=Pisces 1.0.0.0",
        "##Pisces_cmdline=\"-B KRAS_42_S1.bam -g -MinimumFrequency 0.01 -MinBaseCallQuality 21 -MaxVariantQScore 100 -MinCoverage 300 -MaxAcceptableStrandBiasFilter 0.5 -MinVariantQScore 20 -VariantQualityFilter 20 -gVCF true -CallMNVs True -out \\myout",
        "##reference=WholeGenomeFASTA",
        "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">",
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
        "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">",
        "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HD700n560_miseq1_S7.bam"
    };

    var expectedHeader2 = new List<string>
    {
        "##fileformat=VCFv4.1",
        "##fileDate=20160620",
        "##source=Pisces 1.0.0.0",
        "##Pisces_cmdline=\"-B KRAS_42_S1.bam -g -MinimumFrequency 0.01 -MinBaseCallQuality 21 -MaxVariantQScore 100 -MinCoverage 300 -MaxAcceptableStrandBiasFilter 0.5 -MinVariantQScore 20 -VariantQualityFilter 20 -gVCF true -CallMNVs True -out \\myout",
        "##VariantPhaser=Scylla 1.0.0.0",
        "##reference=WholeGenomeFASTA",
        "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">",
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
        "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">",
        "##FILTER=<ID=q22,Description=\"Quality score less than 22, by Scylla\">",
        "##FILTER=<ID=LowDP,Description=\"Low coverage (DP tag), therefore no genotype called, by Scylla\">",
        "##FILTER=<ID=LowVariantFreq,Description=\"Variant frequency less than 0.0070, by Scylla\">",
        "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HD700n560_miseq1_S7.bam"
    };

    writtenHeader = WriteAndReadBackHeader(outputFilePath, config, originalHeader, variants);

    AssertHeaderMatchesIgnoringScyllaVersion(expectedHeader2, writtenHeader);
}

//Writes the header and the given variants to outputFilePath with a PhasedVcfWriter built from
//config, an empty VcfWriterInputContext and originalHeader, then re-opens the file and returns
//the header lines that were written.
private static List<string> WriteAndReadBackHeader(string outputFilePath, VcfWriterConfig config, List<string> originalHeader, List<CalledAllele> variants)
{
    var writer = new PhasedVcfWriter(outputFilePath, config, new VcfWriterInputContext(), originalHeader, null);
    try
    {
        writer.WriteHeader();
        writer.Write(variants);
    }
    finally
    {
        //the writer must be disposed before the file is re-opened for reading, and also when a write step throws
        writer.Dispose();
    }

    using (var reader = new VcfReader(outputFilePath))
    {
        return reader.HeaderLines;
    }
}

//Asserts that writtenHeader equals expectedHeader line by line. For the "##VariantPhaser=Scylla"
//line only the prefix is compared, so that version numbers may differ.
private static void AssertHeaderMatchesIgnoringScyllaVersion(List<string> expectedHeader, List<string> writtenHeader)
{
    const string scyllaVersionPrefix = "##VariantPhaser=Scylla";

    Assert.Equal(expectedHeader.Count, writtenHeader.Count);

    for (int i = 0; i < expectedHeader.Count; i++)
    {
        //let version numbers differ
        if (expectedHeader[i].StartsWith(scyllaVersionPrefix, StringComparison.Ordinal))
        {
            Assert.True(writtenHeader[i].StartsWith(scyllaVersionPrefix, StringComparison.Ordinal));
            continue;
        }

        Assert.Equal(expectedHeader[i], writtenHeader[i]);
    }
}
//Runs a single sample (Chr17Chr19.bam) with a chr17-only interval file through the factory made by
//MakeVcfFactory, then checks the filters, contigs and variants of the written vcf against the expected vcf.
public void IntervalTestingWithVcf()
{
    var bamFile1Path = Path.Combine(UnitTestPaths.TestDataDirectory, "Chr17Chr19.bam"); //has data from chr17,7572952 and chr19,3118883
    var interval1Path = Path.Combine(UnitTestPaths.TestDataDirectory, "chr17int.picard"); //chr 17 only
    var outDir = Path.Combine(UnitTestPaths.WorkingDirectory, "IntervalTests");
    var vcfFile1Path = Path.Combine(outDir, "Chr17Chr19.vcf"); //only results from chr17
    var vcfExpectedFile1 = Path.Combine(UnitTestPaths.TestDataDirectory, "Chr17Chr19.expected.vcf"); //only results from chr17
    var genomeDirectory = Path.Combine(UnitTestPaths.TestGenomesDirectory, "fourChrs");

    var factory = MakeVcfFactory(new List<string> { bamFile1Path }, new List<string> { interval1Path }, outDir);
    var genome1 = factory.GetReferenceGenome(genomeDirectory);
    var processor = new GenomeProcessor(factory, genome1);

    var chrs = genome1.ChromosomesToProcess;
    Assert.Equal("chr17", chrs[0]);

    processor.InternalExecute(10);

    //the chromosome list must be unchanged by the run
    Assert.Equal(1, genome1.ChromosomesToProcess.Count);
    Assert.Equal("chr17", genome1.ChromosomesToProcess[0]);

    //both readers are disposed by the using statements even if an assertion below throws
    using (var reader1 = new VcfReader(vcfFile1Path))
    using (var readerExp1 = new VcfReader(vcfExpectedFile1))
    {
        var filters1Results = GetFilters(reader1);
        var contigs1Results = GetContigs(reader1);
        var vcf1Results = reader1.GetVariants().ToList();

        //the expected results:
        var filters1Expected = GetFilters(readerExp1);
        var contigs1Expected = GetContigs(readerExp1);
        var vcf1Expected = readerExp1.GetVariants().ToList();

        Assert.Equal(3, filters1Results.Count);
        Assert.Equal(1, contigs1Results.Count);
        Assert.Equal(1, vcf1Results.Count);

        //the loops below only walk the expected lists, so make sure the results hold nothing extra
        Assert.Equal(contigs1Expected.Count, contigs1Results.Count);
        Assert.Equal(filters1Expected.Count, filters1Results.Count);
        Assert.Equal(vcf1Expected.Count, vcf1Results.Count);

        //check variants and contigs all come out the same
        for (int i = 0; i < contigs1Expected.Count; i++)
        {
            Assert.Equal(contigs1Expected[i], contigs1Results[i]);
        }

        for (int i = 0; i < filters1Expected.Count; i++)
        {
            Assert.Equal(filters1Expected[i].ToString(), filters1Results[i].ToString());
        }

        for (int i = 0; i < vcf1Expected.Count; i++)
        {
            Assert.Equal(vcf1Expected[i].ToString(), vcf1Results[i].ToString());
        }
    }

    //the result reader is closed by now, so the output file can be removed
    File.Delete(vcfFile1Path);
}