Esempio n. 1
0
        //Test we get the same results when using multiple samples and intervals, in the same order.
        //First test running two samples together, then test running two samples individually, then test it the "thread by chr" way.
        //Nothing strange should happen.
        public void IntervalTestingWithMultipleSamples()
        {
            // Regression test based on a real bug: a gvcf was found to be out of order, which only happened
            // for multiple-bam runs with different interval files.
            // The same two samples are processed three times (together, one at a time, and together again with
            // the third GenomeProcessor constructor argument set to false) and every run is checked against the
            // same expected files via CheckForOrdering.
            var bamFile1Path     = Path.Combine(UnitTestPaths.TestDataDirectory, "Chr17Chr19.bam");                 //has data from chr17,7572952 and chr19,3118883
            var bamFile2Path     = Path.Combine(UnitTestPaths.TestDataDirectory, "Chr17again.bam");
            var interval1Path    = Path.Combine(UnitTestPaths.TestDataDirectory, "chr17int.picard");                //chr 17 only
            var interval2Path    = Path.Combine(UnitTestPaths.TestDataDirectory, "poorlyOrdered.picard");           //disordered, chr 19 first.
            var outDir           = Path.Combine(UnitTestPaths.WorkingDirectory, "IntervalTests");
            var vcfFile1Path     = Path.Combine(outDir, "Chr17Chr19.genome.vcf");                                   //only results from chr17
            var vcfFile2Path     = Path.Combine(outDir, "Chr17again.genome.vcf");                                   //show results from chr17 and 19
            var vcfExpectedFile1 = Path.Combine(UnitTestPaths.TestDataDirectory, "Chr17Chr19.expected.genome.vcf"); //only results from chr17
            var vcfExpectedFile2 = Path.Combine(UnitTestPaths.TestDataDirectory, "Chr17again.expected.genome.vcf"); //show results from chr17 and 19

            var genomeDirectory = Path.Combine(UnitTestPaths.TestGenomesDirectory, "fourChrs");

            // one factory for the combined run, plus one per sample for the individual runs
            var twoSampleFactory = MakeFactory(
                new List <string> { bamFile1Path, bamFile2Path },
                new List <string> { interval1Path, interval2Path },
                outDir);

            var firstSampleFactory = MakeFactory(
                new List <string> { bamFile1Path },
                new List <string> { interval1Path },
                outDir);

            var secondSampleFactory = MakeFactory(
                new List <string> { bamFile2Path },
                new List <string> { interval2Path },
                outDir);

            var genome  = twoSampleFactory.GetReferenceGenome(genomeDirectory);
            var genome1 = firstSampleFactory.GetReferenceGenome(genomeDirectory);
            var genome2 = secondSampleFactory.GetReferenceGenome(genomeDirectory);

            //regular two-sample run mode.
            var processor = new GenomeProcessor(twoSampleFactory, genome);

            var expectedChrOrder = new[] { "chr7", "chr8", "chr17", "chr19" };

            // the chromosome order must be the same before ...
            var chrs = genome.ChromosomesToProcess;
            for (int i = 0; i < expectedChrOrder.Length; i++)
            {
                Assert.Equal(expectedChrOrder[i], chrs[i]);
            }

            processor.InternalExecute(10);

            // ... and after processing
            chrs = genome.ChromosomesToProcess;
            for (int i = 0; i < expectedChrOrder.Length; i++)
            {
                Assert.Equal(expectedChrOrder[i], chrs[i]);
            }

            //just be aware, when we process the samples individually, we use different genome lists.
            Assert.Equal(4, genome.ChromosomesToProcess.Count);
            Assert.Equal(1, genome1.ChromosomesToProcess.Count);
            Assert.Equal(4, genome2.ChromosomesToProcess.Count);
            Assert.Equal("chr17", genome1.ChromosomesToProcess[0]);
            Assert.Equal("chr7", genome2.ChromosomesToProcess[0]);
            Assert.Equal("chr19", genome2.ChromosomesToProcess[3]);

            var reader1 = new VcfReader(vcfFile1Path);
            var reader2 = new VcfReader(vcfFile2Path);

            var contigs1Results = GetContigs(reader1);
            var contigs2Results = GetContigs(reader2);
            var vcf1Results     = reader1.GetVariants().ToList();
            var vcf2Results     = reader2.GetVariants().ToList();

            //the expected results (read once, reused by all three runs):
            var readerExp1 = new VcfReader(vcfExpectedFile1);
            var readerExp2 = new VcfReader(vcfExpectedFile2);

            var contigs1Expected = GetContigs(readerExp1);
            var contigs2Expected = GetContigs(readerExp2);
            var vcf1Expected     = readerExp1.GetVariants().ToList();
            var vcf2Expected     = readerExp2.GetVariants().ToList();

            Assert.Equal(4, contigs1Results.Count);
            Assert.Equal(4, contigs2Results.Count);
            Assert.Equal(11, vcf1Results.Count);
            Assert.Equal(71, vcf2Results.Count);

            //check variants and contigs all come out the same
            CheckForOrdering(contigs1Results, contigs2Results, contigs1Expected, contigs2Expected, vcf1Expected, vcf2Expected);

            reader1.Dispose();
            reader2.Dispose();
            File.Delete(vcfFile1Path);
            File.Delete(vcfFile2Path);

            //now check again, processing them separately
            processor = new GenomeProcessor(firstSampleFactory, genome1);
            processor.InternalExecute(10);
            processor = new GenomeProcessor(secondSampleFactory, genome2);
            processor.InternalExecute(10);

            reader1 = new VcfReader(vcfFile1Path);
            reader2 = new VcfReader(vcfFile2Path);

            contigs1Results = GetContigs(reader1);
            contigs2Results = GetContigs(reader2);

            // NOTE(review): the parsed variants are not handed to CheckForOrdering; reading them here only
            // shows that the written files can be parsed - confirm whether a variant comparison was intended.
            vcf1Results = reader1.GetVariants().ToList();
            vcf2Results = reader2.GetVariants().ToList();

            //check variants all come out the same (the contigs will be different as shown)
            CheckForOrdering(contigs1Results, contigs2Results,
                             new List <string>() { "chr17" },
                             contigs2Expected, vcf1Expected, vcf2Expected);

            reader1.Dispose();
            reader2.Dispose();
            File.Delete(vcfFile1Path);
            // also remove the second sample's output, so the next run cannot pass on a stale file
            File.Delete(vcfFile2Path);

            //now check again, processing them "thread by chr" way
            // NOTE(review): presumably the 'false' argument is what selects that mode - confirm against GenomeProcessor.
            processor = new GenomeProcessor(twoSampleFactory, genome, false);
            processor.InternalExecute(10);

            reader1 = new VcfReader(vcfFile1Path);
            reader2 = new VcfReader(vcfFile2Path);

            contigs1Results = GetContigs(reader1);
            contigs2Results = GetContigs(reader2);
            vcf1Results     = reader1.GetVariants().ToList();
            vcf2Results     = reader2.GetVariants().ToList();

            //check variants all come out the same (the contigs will be back to normal)
            // sample 1 is compared against its own expected contigs, exactly as in the first two-sample run
            CheckForOrdering(contigs1Results, contigs2Results,
                             contigs1Expected, contigs2Expected, vcf1Expected, vcf2Expected);

            reader1.Dispose();
            reader2.Dispose();
            // the expected-file readers were previously never released
            readerExp1.Dispose();
            readerExp2.Dispose();
            File.Delete(vcfFile1Path);
            File.Delete(vcfFile2Path);
        }
        public void FilterHeader()
        {
            // Writes two VCFs through PhasedVcfWriter, each seeded with an existing Pisces header, and checks
            // the header that comes back from VcfReader:
            //  - a "##VariantPhaser=Scylla ..." line is expected right after the Pisces command line;
            //  - the writer's own FILTER lines are expected after the existing FILTER lines (first case) or,
            //    when the original header has no FILTER lines, after the FORMAT lines (second case).

            // lines shared by every header in this test; "\\myout" is a single backslash followed by "myout"
            const string piscesCommandLine = "##Pisces_cmdline=\"-B KRAS_42_S1.bam -g -MinimumFrequency 0.01 -MinBaseCallQuality 21 -MaxVariantQScore 100 -MinCoverage 300 -MaxAcceptableStrandBiasFilter 0.5 -MinVariantQScore 20 -VariantQualityFilter 20 -gVCF true -CallMNVs True -out \\myout";
            // tab-separated column line, spelled with \t so editors cannot silently turn the tabs into spaces
            const string columnLine = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tHD700n560_miseq1_S7.bam";

            var outputFilePath = Path.Combine(TestPaths.LocalTestDataDirectory, "PhasedVcfFileWriterTests.vcf");

            // start from a clean slate in case an earlier run left its output behind
            File.Delete(outputFilePath);

            // first case: diploid model with a quality threshold of 30; the expected header below carries the
            // matching q30 / LowDP / LowVariantFreq / MultiAllelicSite filter lines
            var config = new VcfWriterConfig
            {
                DepthFilterThreshold                = 500,
                VariantQualityFilterThreshold       = 30,
                FrequencyFilterThreshold            = 0.007f,
                ShouldOutputNoCallFraction          = true,
                ShouldOutputStrandBiasAndNoiseLevel = true,
                EstimatedBaseCallQuality            = 23,
                PloidyModel = PloidyModel.Diploid,
            };

            //note, scylla has no SB or RMxN or R8 filters.

            var variants = new List <CalledAllele>
            {
                TestHelper.CreateDummyAllele("chrX", 123, "A", "C", 1000, 156),
                TestHelper.CreateDummyAllele("chr10", 124, "A", "C", 1000, 156),
            };

            var originalHeader = new List <string>
            {
                "##fileformat=VCFv4.1",
                "##fileDate=20160620",
                "##source=Pisces 1.0.0.0",
                piscesCommandLine,
                "##reference=WholeGenomeFASTA",
                "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">",
                "##FILTER=<ID=q20,Description=\"Quality score less than 20\">",
                "##FILTER=<ID=SB,Description=\"Variant strand bias too high\">",
                "##FILTER=<ID=R5x9,Description=\"Repeats of part or all of the variant allele (max repeat length 5) in the reference greater than or equal to 9\">",
                "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
                "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">",
                columnLine
            };

            var writer = new PhasedVcfWriter(outputFilePath, config, new VcfWriterInputContext(), originalHeader, null);

            writer.WriteHeader();
            writer.Write(variants);
            writer.Dispose();

            VcfReader     reader        = new VcfReader(outputFilePath);
            List <string> writtenHeader = reader.HeaderLines;

            reader.Dispose();

            var expectedHeader1 = new List <string>
            {
                "##fileformat=VCFv4.1",
                "##fileDate=20160620",
                "##source=Pisces 1.0.0.0",
                piscesCommandLine,
                "##VariantPhaser=Scylla 1.0.0.0",
                "##reference=WholeGenomeFASTA",
                "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">",
                "##FILTER=<ID=q20,Description=\"Quality score less than 20\">",
                "##FILTER=<ID=SB,Description=\"Variant strand bias too high\">",
                "##FILTER=<ID=R5x9,Description=\"Repeats of part or all of the variant allele (max repeat length 5) in the reference greater than or equal to 9\">",
                "##FILTER=<ID=q30,Description=\"Quality score less than 30, by Scylla\">",
                "##FILTER=<ID=LowDP,Description=\"Low coverage (DP tag), therefore no genotype called, by Scylla\">",
                "##FILTER=<ID=LowVariantFreq,Description=\"Variant frequency less than 0.0070, by Scylla\">",
                "##FILTER=<ID=MultiAllelicSite,Description=\"Variant does not conform to diploid model, by Scylla\">",
                "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
                "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">",
                columnLine
            };

            AssertHeaderMatchesIgnoringPhaserVersion(expectedHeader1, writtenHeader);

            // second case: somatic model with a quality threshold of 22, and an original header that has no
            // FILTER lines at all
            config = new VcfWriterConfig
            {
                DepthFilterThreshold          = 500,
                VariantQualityFilterThreshold = 22,
                FrequencyFilterThreshold      = 0.007f,
                EstimatedBaseCallQuality      = 23,
                PloidyModel = PloidyModel.Somatic,
            };

            originalHeader = new List <string>
            {
                "##fileformat=VCFv4.1",
                "##fileDate=20160620",
                "##source=Pisces 1.0.0.0",
                piscesCommandLine,
                "##reference=WholeGenomeFASTA",
                "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">",
                "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
                "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">",
                columnLine
            };

            writer = new PhasedVcfWriter(outputFilePath, config, new VcfWriterInputContext(), originalHeader, null);

            var expectedHeader2 = new List <string>
            {
                "##fileformat=VCFv4.1",
                "##fileDate=20160620",
                "##source=Pisces 1.0.0.0",
                piscesCommandLine,
                "##VariantPhaser=Scylla 1.0.0.0",
                "##reference=WholeGenomeFASTA",
                "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">",
                "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
                "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">",
                "##FILTER=<ID=q22,Description=\"Quality score less than 22, by Scylla\">",
                "##FILTER=<ID=LowDP,Description=\"Low coverage (DP tag), therefore no genotype called, by Scylla\">",
                "##FILTER=<ID=LowVariantFreq,Description=\"Variant frequency less than 0.0070, by Scylla\">",
                columnLine
            };

            writer.WriteHeader();
            writer.Write(variants);
            writer.Dispose();

            reader        = new VcfReader(outputFilePath);
            writtenHeader = reader.HeaderLines;
            reader.Dispose();

            // the second header is now checked against its own expected list when looking for the phaser
            // line; the old loop consulted expectedHeader1 here
            AssertHeaderMatchesIgnoringPhaserVersion(expectedHeader2, writtenHeader);
        }

        /// <summary>
        /// Asserts that two header line lists have the same length and the same content line by line.
        /// Where the expected line is the "##VariantPhaser=Scylla" line, only that prefix is required on
        /// the written line, so the version number is free to differ.
        /// </summary>
        /// <param name="expectedHeader">The header lines the test expects.</param>
        /// <param name="writtenHeader">The header lines read back from the written file.</param>
        private static void AssertHeaderMatchesIgnoringPhaserVersion(List <string> expectedHeader, List <string> writtenHeader)
        {
            const string phaserLinePrefix = "##VariantPhaser=Scylla";

            Assert.Equal(expectedHeader.Count, writtenHeader.Count);

            for (int i = 0; i < expectedHeader.Count; i++)
            {
                //let version numbers differ
                if (expectedHeader[i].StartsWith(phaserLinePrefix, StringComparison.Ordinal))
                {
                    Assert.True(writtenHeader[i].StartsWith(phaserLinePrefix, StringComparison.Ordinal));
                    continue;
                }

                Assert.Equal(expectedHeader[i], writtenHeader[i]);
            }
        }
Esempio n. 3
0
        public void IntervalTestingWithVcf()
        {
            // Single-sample counterpart of the interval tests: processes one bam with a chr17-only interval
            // file through a factory built by MakeVcfFactory, then compares the filters, contigs and variants
            // of the written vcf with an expected file.
            var bamFile1Path     = Path.Combine(UnitTestPaths.TestDataDirectory, "Chr17Chr19.bam");          //has data from chr17,7572952 and chr19,3118883
            var interval1Path    = Path.Combine(UnitTestPaths.TestDataDirectory, "chr17int.picard");         //chr 17 only
            var outDir           = Path.Combine(UnitTestPaths.WorkingDirectory, "IntervalTests");
            var vcfFile1Path     = Path.Combine(outDir, "Chr17Chr19.vcf");                                   //only results from chr17
            var vcfExpectedFile1 = Path.Combine(UnitTestPaths.TestDataDirectory, "Chr17Chr19.expected.vcf"); //only results from chr17

            var genomeDirectory = Path.Combine(UnitTestPaths.TestGenomesDirectory, "fourChrs");

            var factory = MakeVcfFactory(
                new List <string> { bamFile1Path },
                new List <string> { interval1Path },
                outDir);

            var genome1   = factory.GetReferenceGenome(genomeDirectory);
            var processor = new GenomeProcessor(factory, genome1);

            // only chr17 is to be processed, both before ...
            var chrs = genome1.ChromosomesToProcess;
            Assert.Equal("chr17", chrs[0]);

            processor.InternalExecute(10);

            // ... and after the run
            Assert.Equal(1, genome1.ChromosomesToProcess.Count);
            Assert.Equal("chr17", genome1.ChromosomesToProcess[0]);

            var reader1 = new VcfReader(vcfFile1Path);

            var filters1Results = GetFilters(reader1);
            var contigs1Results = GetContigs(reader1);
            var vcf1Results     = reader1.GetVariants().ToList();

            //the expected results:
            var readerExp1 = new VcfReader(vcfExpectedFile1);

            var filters1Expected = GetFilters(readerExp1);
            var contigs1Expected = GetContigs(readerExp1);
            var vcf1Expected     = readerExp1.GetVariants().ToList();

            Assert.Equal(3, filters1Results.Count);
            Assert.Equal(1, contigs1Results.Count);
            Assert.Equal(1, vcf1Results.Count);

            //check variants and contigs all come out the same
            // NOTE(review): each loop walks the expected entries only; the written counts are pinned by the
            // three asserts above rather than compared with the expected counts - confirm that is intended.
            for (int i = 0; i < contigs1Expected.Count; i++)
            {
                Assert.Equal(contigs1Expected[i], contigs1Results[i]);
            }

            for (int i = 0; i < filters1Expected.Count; i++)
            {
                Assert.Equal(filters1Expected[i].ToString(), filters1Results[i].ToString());
            }

            for (int i = 0; i < vcf1Expected.Count; i++)
            {
                Assert.Equal(vcf1Expected[i].ToString(), vcf1Results[i].ToString());
            }

            reader1.Dispose();
            // the expected-file reader was previously never released
            readerExp1.Dispose();
            File.Delete(vcfFile1Path);
        }