Пример #1
0
        // Checks the header PhasedVcfWriter produces for two configurations:
        //  1. a diploid config applied to an original header that already has FILTER lines;
        //  2. a somatic config applied to an original header that has no FILTER lines.
        // In both cases the expected output keeps the original lines, adds a
        // "##VariantPhaser=Scylla ..." line after the Pisces command line, and adds
        // the "by Scylla" FILTER lines listed in the expected headers below.
        public void FilterHeader()
        {
            // Header lines shared verbatim by every original/expected header in this test.
            const string piscesCommandLine = "##Pisces_cmdline=\"-B KRAS_42_S1.bam -g -MinimumFrequency 0.01 -MinBaseCallQuality 21 -MaxVariantQScore 100 -MinCoverage 300 -MaxAcceptableStrandBiasFilter 0.5 -MinVariantQScore 20 -VariantQualityFilter 20 -gVCF true -CallMNVs True -out \\myout";
            const string columnHeaderLine  = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tHD700n560_miseq1_S7.bam";

            var outputFilePath = Path.Combine(TestPaths.LocalTestDataDirectory, "PhasedVcfFileWriterTests.vcf");

            File.Delete(outputFilePath);

            var variants = new List<CalledAllele>
            {
                TestHelper.CreateDummyAllele("chrX", 123, "A", "C", 1000, 156),
                TestHelper.CreateDummyAllele("chr10", 124, "A", "C", 1000, 156),
            };

            // ---- Scenario 1: diploid model, original header already contains FILTER lines ----
            var diploidConfig = new VcfWriterConfig
            {
                DepthFilterThreshold                = 500,
                VariantQualityFilterThreshold       = 30,
                FrequencyFilterThreshold            = 0.007f,
                ShouldOutputNoCallFraction          = true,
                ShouldOutputStrandBiasAndNoiseLevel = true,
                EstimatedBaseCallQuality            = 23,
                PloidyModel = PloidyModel.Diploid,
            };

            //note, scylla has no SB or RMxN or R8 filters.
            var originalHeader = new List<string>
            {
                "##fileformat=VCFv4.1",
                "##fileDate=20160620",
                "##source=Pisces 1.0.0.0",
                piscesCommandLine,
                "##reference=WholeGenomeFASTA",
                "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">",
                "##FILTER=<ID=q20,Description=\"Quality score less than 20\">",
                "##FILTER=<ID=SB,Description=\"Variant strand bias too high\">",
                "##FILTER=<ID=R5x9,Description=\"Repeats of part or all of the variant allele (max repeat length 5) in the reference greater than or equal to 9\">",
                "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
                "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">",
                columnHeaderLine
            };

            var expectedHeader1 = new List<string>
            {
                "##fileformat=VCFv4.1",
                "##fileDate=20160620",
                "##source=Pisces 1.0.0.0",
                piscesCommandLine,
                "##VariantPhaser=Scylla 1.0.0.0",
                "##reference=WholeGenomeFASTA",
                "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">",
                "##FILTER=<ID=q20,Description=\"Quality score less than 20\">",
                "##FILTER=<ID=SB,Description=\"Variant strand bias too high\">",
                "##FILTER=<ID=R5x9,Description=\"Repeats of part or all of the variant allele (max repeat length 5) in the reference greater than or equal to 9\">",
                "##FILTER=<ID=q30,Description=\"Quality score less than 30, by Scylla\">",
                "##FILTER=<ID=LowDP,Description=\"Low coverage (DP tag), therefore no genotype called, by Scylla\">",
                "##FILTER=<ID=LowVariantFreq,Description=\"Variant frequency less than 0.0070, by Scylla\">",
                "##FILTER=<ID=MultiAllelicSite,Description=\"Variant does not conform to diploid model, by Scylla\">",
                "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
                "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">",
                columnHeaderLine
            };

            var writtenHeader = WriteVariantsAndReadHeader(outputFilePath, diploidConfig, originalHeader, variants);
            AssertHeaderLinesMatch(expectedHeader1, writtenHeader);

            // ---- Scenario 2: somatic model, original header has no FILTER lines ----
            var somaticConfig = new VcfWriterConfig
            {
                DepthFilterThreshold          = 500,
                VariantQualityFilterThreshold = 22,
                FrequencyFilterThreshold      = 0.007f,
                EstimatedBaseCallQuality      = 23,
                PloidyModel = PloidyModel.Somatic,
            };

            originalHeader = new List<string>
            {
                "##fileformat=VCFv4.1",
                "##fileDate=20160620",
                "##source=Pisces 1.0.0.0",
                piscesCommandLine,
                "##reference=WholeGenomeFASTA",
                "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">",
                "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
                "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">",
                columnHeaderLine
            };

            var expectedHeader2 = new List<string>
            {
                "##fileformat=VCFv4.1",
                "##fileDate=20160620",
                "##source=Pisces 1.0.0.0",
                piscesCommandLine,
                "##VariantPhaser=Scylla 1.0.0.0",
                "##reference=WholeGenomeFASTA",
                "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">",
                "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
                "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">",
                "##FILTER=<ID=q22,Description=\"Quality score less than 22, by Scylla\">",
                "##FILTER=<ID=LowDP,Description=\"Low coverage (DP tag), therefore no genotype called, by Scylla\">",
                "##FILTER=<ID=LowVariantFreq,Description=\"Variant frequency less than 0.0070, by Scylla\">",
                columnHeaderLine
            };

            writtenHeader = WriteVariantsAndReadHeader(outputFilePath, somaticConfig, originalHeader, variants);

            // The original loop inspected expectedHeader1 here while validating
            // expectedHeader2; the shared helper always checks the list it is given.
            AssertHeaderLinesMatch(expectedHeader2, writtenHeader);
        }

        /// <summary>
        /// Writes the header and the given variants to <paramref name="outputFilePath"/> with a
        /// PhasedVcfWriter built from <paramref name="config"/> and <paramref name="originalHeader"/>
        /// (using a default VcfWriterInputContext), then reads the file back with a VcfReader.
        /// </summary>
        /// <returns>The header lines reported by the reader.</returns>
        private static List<string> WriteVariantsAndReadHeader(
            string outputFilePath,
            VcfWriterConfig config,
            List<string> originalHeader,
            List<CalledAllele> variants)
        {
            var writer = new PhasedVcfWriter(outputFilePath, config, new VcfWriterInputContext(), originalHeader, null);
            try
            {
                writer.WriteHeader();
                writer.Write(variants);
            }
            finally
            {
                // Always release the output file, even when a write throws,
                // so a failure here cannot lock the file for later tests.
                writer.Dispose();
            }

            var reader = new VcfReader(outputFilePath);
            try
            {
                return reader.HeaderLines;
            }
            finally
            {
                reader.Dispose();
            }
        }

        /// <summary>
        /// Asserts that two headers have the same number of lines and match line by line.
        /// The "##VariantPhaser=Scylla" line is compared by prefix only, so the
        /// version number is allowed to differ.
        /// </summary>
        private static void AssertHeaderLinesMatch(List<string> expectedHeader, List<string> writtenHeader)
        {
            const string phaserLinePrefix = "##VariantPhaser=Scylla";

            Assert.Equal(expectedHeader.Count, writtenHeader.Count);

            for (int i = 0; i < expectedHeader.Count; i++)
            {
                //let version numbers differ
                if (expectedHeader[i].StartsWith(phaserLinePrefix, StringComparison.Ordinal))
                {
                    Assert.True(writtenHeader[i].StartsWith(phaserLinePrefix, StringComparison.Ordinal));
                    continue;
                }

                Assert.Equal(expectedHeader[i], writtenHeader[i]);
            }
        }
Пример #2
0
        // Drives a VcfMerger over MergerInput.vcf with two neighborhoods (chr2 and chr7),
        // writing through a PhasedVcfWriter, and checks that the output file matches
        // MergerOutput.vcf line for line.
        public void WriteANbhd()
        {
            var outputFilePath   = Path.Combine(TestPaths.LocalTestDataDirectory, "PhasedVcfFileNbhdWriterTest.vcf");
            var inputFilePath    = Path.Combine(TestPaths.LocalTestDataDirectory, "MergerInput.vcf");
            var expectedFilePath = Path.Combine(TestPaths.LocalTestDataDirectory, "MergerOutput.vcf");

            File.Delete(outputFilePath);

            var config = new VcfWriterConfig
            {
                DepthFilterThreshold                = 500,
                VariantQualityFilterThreshold       = 30,
                FrequencyFilterThreshold            = 0.007f,
                ShouldOutputNoCallFraction          = true,
                ShouldOutputStrandBiasAndNoiseLevel = true,
                EstimatedBaseCallQuality            = 23,
                PloidyModel = PloidyModel.Somatic,
                AllowMultipleVcfLinesPerLoci = true
            };

            var writer = new PhasedVcfWriter(outputFilePath, config, new VcfWriterInputContext(), new List<string>(), null);
            try
            {
                var reader = new VcfReader(inputFilePath, true);
                try
                {
                    //set up the original variants
                    var originalVcfVariant1 = TestHelper.CreateDummyAllele("chr2", 116380048, "A", "New", 1000, 156);
                    var originalVcfVariant2 = TestHelper.CreateDummyAllele("chr2", 116380048, "AAA", "New", 1000, 156);
                    var originalVcfVariant4 = TestHelper.CreateDummyAllele("chr7", 116380051, "A", "New", 1000, 156);
                    var originalVcfVariant5 = TestHelper.CreateDummyAllele("chr7", 116380052, "AC", "New", 1000, 156);

                    var vs1 = new VariantSite(originalVcfVariant1);
                    var vs2 = new VariantSite(originalVcfVariant2);
                    var vs4 = new VariantSite(originalVcfVariant4);
                    var vs5 = new VariantSite(originalVcfVariant5);

                    //have to replace variants at position 116380048 (we call two new MNVs here)
                    var nbhd1 = new VcfNeighborhood(new VariantCallingParameters(), 0, "chr2", vs1, vs2, "");
                    nbhd1.SetRangeOfInterest();

                    //have to replace variants at positions 116380051 and 52 (we call one new MNV at 51)
                    var nbhd2 = new VcfNeighborhood(new VariantCallingParameters(), 0, "chr7", vs4, vs5, "");
                    nbhd2.SetRangeOfInterest();

                    var merger         = new VcfMerger(reader);
                    var allelesPastNbh = new List<CalledAllele>();

                    nbhd1.CalledVariants = new Dictionary<int, List<CalledAllele>>
                    {
                        { originalVcfVariant1.ReferencePosition, new List<CalledAllele> { originalVcfVariant1, originalVcfVariant2 } }
                    };
                    nbhd2.CalledVariants = new Dictionary<int, List<CalledAllele>>
                    {
                        { originalVcfVariant4.ReferencePosition, new List<CalledAllele> { originalVcfVariant4 } }
                    };

                    // Each merger call returns the alleles it carried past the point it
                    // wrote up to; they are fed into the next call.
                    allelesPastNbh = merger.WriteVariantsUptoChr(writer, allelesPastNbh, nbhd1.ReferenceName);
                    allelesPastNbh = merger.WriteVariantsUptoIncludingNbhd(nbhd1, writer, allelesPastNbh);
                    allelesPastNbh = merger.WriteVariantsUptoChr(writer, allelesPastNbh, nbhd2.ReferenceName);
                    allelesPastNbh = merger.WriteVariantsUptoIncludingNbhd(nbhd2, writer, allelesPastNbh);

                    merger.WriteRemainingVariants(writer, allelesPastNbh);
                }
                finally
                {
                    // The input reader was never released in the original test.
                    reader.Dispose();
                }
            }
            finally
            {
                // Must be disposed before the output file is read back below, and
                // also when the merge throws so the file handle is not leaked.
                writer.Dispose();
            }

            var expectedLines = File.ReadLines(expectedFilePath).ToList();
            var outputLines   = File.ReadLines(outputFilePath).ToList();

            Assert.Equal(expectedLines.Count, outputLines.Count);

            for (int i = 0; i < expectedLines.Count; i++)
            {
                Assert.Equal(expectedLines[i], outputLines[i]);
            }
        }
Пример #3
0
        // Drives a VcfMerger over TinyDiploid.vcf with a single chr1 neighborhood under the
        // DiploidByThresholding ploidy model (one VCF line per locus), checks which alleles
        // the merger carries forward after each step, and compares the written file with
        // TinyDiploidOutput.vcf line for line.
        public void WriteADiploidNbhd()
        {
            var outputDir        = Path.Combine(TestPaths.LocalScratchDirectory, "MergerWriteADiploidNbhd");
            var outputFilePath   = Path.Combine(outputDir, "TinyDiploid.Phased.vcf");
            var inputFilePath    = Path.Combine(TestPaths.LocalTestDataDirectory, "TinyDiploid.vcf");
            var expectedFilePath = Path.Combine(TestPaths.LocalTestDataDirectory, "TinyDiploidOutput.vcf");

            TestHelper.RecreateDirectory(outputDir);

            var config = new VcfWriterConfig
            {
                DepthFilterThreshold                = 500,
                VariantQualityFilterThreshold       = 30,
                FrequencyFilterThreshold            = 0.007f,
                ShouldOutputNoCallFraction          = true,
                ShouldOutputStrandBiasAndNoiseLevel = true,
                EstimatedBaseCallQuality            = 23,
                PloidyModel = PloidyModel.DiploidByThresholding,
                AllowMultipleVcfLinesPerLoci = false
            };

            var writer = new PhasedVcfWriter(outputFilePath, config, new VcfWriterInputContext(), new List<string>(), null);
            try
            {
                // NOTE(review): the reader is not disposed here because nothing visible in this
                // test shows AlleleReader exposing Dispose() - confirm and release it if it does.
                var reader = new AlleleReader(inputFilePath, true);

                //set up the original variants
                var originalVcfVariant1 = TestHelper.CreateDummyAllele("chr1", 1, "A", "G", 1000, 156);
                var originalVcfVariant2 = TestHelper.CreateDummyAllele("chr1", 1, "A", "T", 1000, 156);
                var originalVcfVariant4 = TestHelper.CreateDummyAllele("chr22", 1230237, "GTC", "G", 1000, 156);
                var originalVcfVariant5 = TestHelper.CreateDummyAllele("chr22", 1230237, "GTC", "GTCT", 1000, 156);

                var vs1 = new VariantSite(originalVcfVariant1);
                var vs2 = new VariantSite(originalVcfVariant2);

                // The only neighborhood in this test: the two alleles at chr1 position 1.
                var nbhd1      = new VcfNeighborhood(0, "chr1", vs1, vs2);
                var calledNbh1 = new CallableNeighborhood(nbhd1, new VariantCallingParameters());

                var merger               = new VcfMerger(reader);
                var alleleTuplesPastNbhd = new List<Tuple<CalledAllele, string>>();

                //we will just say, we called the variants that were in the original vcf. I.e., we agree with it.
                calledNbh1.CalledVariants = new Dictionary<int, List<CalledAllele>>
                {
                    { originalVcfVariant1.ReferencePosition, new List<CalledAllele> { originalVcfVariant1, originalVcfVariant2 } }
                };

                //Realizes the first nbhd starts at chr1. We have to do something with the first lines of the vcf (chr1 1 . A G,T)
                //so, alleleTuplesPastNbhd = chr1 1 . A G,T
                alleleTuplesPastNbhd = merger.WriteVariantsUptoChr(writer, alleleTuplesPastNbhd, nbhd1.ReferenceName);
                Assert.True(alleleTuplesPastNbhd[0].Item1.IsSameAllele(originalVcfVariant1));
                Assert.True(alleleTuplesPastNbhd[1].Item1.IsSameAllele(originalVcfVariant2));

                //This method writes everything up to the end of nbhd 1,
                //so "(chr1 1 . A G,T)" from the vcf and the variants scylla detected "(chr1 1 . A G,T)" need to be dealt with.
                //Since these 4 variants are actually the same two, we need to remove the vcf ones and only write the scylla ones.
                //Then we peek into the vcf and see the next line is "chr22 1230237 . GTC G,GTCT", clearly outside nbh1.
                //so we write out everything we need for nbhd1, and save the peeked line
                alleleTuplesPastNbhd = merger.WriteVariantsUptoIncludingNbhd(writer, alleleTuplesPastNbhd, calledNbh1);
                Assert.True(alleleTuplesPastNbhd[0].Item1.IsSameAllele(originalVcfVariant4));
                Assert.True(alleleTuplesPastNbhd[1].Item1.IsSameAllele(originalVcfVariant5));

                //now write out
                //chr22   1230237.GTC G,GTCT  50  DP = 1370 GT: GQ: AD: DP: VF: NL: SB: NC: US  1 / 2:100:185,68:364:0.258:20:-100.0000:0.0000:0,0,0,0,0,0,1,1,0,0,0,2
                //chrX    79.CG  GTG,AA  50  DP = 1370 GT: GQ: AD: DP: VF: NL: SB: NC: US  1 / 2:100:185,68:364:0.258:20:-100.0000:0.0000:0,0,0,0,0,0,1,1,0,0,0,2
                merger.WriteRemainingVariants(writer, alleleTuplesPastNbhd);
            }
            finally
            {
                // The assertions above can fail before the write completes; dispose in
                // finally so the output file handle is always released. It must also be
                // disposed before the output file is read back below.
                writer.Dispose();
            }

            var expectedLines = File.ReadLines(expectedFilePath).ToList();
            var outputLines   = File.ReadLines(outputFilePath).ToList();

            Assert.Equal(expectedLines.Count, outputLines.Count);

            for (int i = 0; i < expectedLines.Count; i++)
            {
                Assert.Equal(expectedLines[i], outputLines[i]);
            }
        }