// Test that we get the same results, in the same order, when using multiple samples and intervals.
// First test running two samples together, then test running the two samples individually,
// then test the two-sample run again the "thread by chr" way.
// Nothing strange should happen.
// Based on a real bug where a gvcf was found to be out of order; it only happened for
// multiple-bam runs with different interval files.
public void IntervalTestingWithMultipleSamples()
{
    var bamFile1Path = Path.Combine(TestPaths.LocalTestDataDirectory, "Chr17Chr19.bam"); // has data from chr17,7572952 and chr19,3118883
    var bamFile2Path = Path.Combine(TestPaths.LocalTestDataDirectory, "Chr17again.bam");
    var interval1Path = Path.Combine(TestPaths.LocalTestDataDirectory, "chr17int.picard"); // chr 17 only
    var interval2Path = Path.Combine(TestPaths.LocalTestDataDirectory, "poorlyOrdered.picard"); // disordered, chr 19 first.
    var outDir = Path.Combine(TestPaths.LocalTestDataDirectory, "IntervalTests");
    var vcfFile1Path = Path.Combine(outDir, "Chr17Chr19.genome.vcf"); // only results from chr17
    var vcfFile2Path = Path.Combine(outDir, "Chr17again.genome.vcf"); // show results from chr17 and 19
    var vcfExpectedFile1 = Path.Combine(TestPaths.LocalTestDataDirectory, "Chr17Chr19.expected.genome.vcf"); // only results from chr17
    var vcfExpectedFile2 = Path.Combine(TestPaths.LocalTestDataDirectory, "Chr17again.expected.genome.vcf"); // show results from chr17 and 19
    var genomeDirectory = Path.Combine(TestPaths.SharedGenomesDirectory, "fourChrs");

    var twoSampleFactory = MakeFactory(new List<string> { bamFile1Path, bamFile2Path }, new List<string> { interval1Path, interval2Path }, outDir);
    var firstSampleFactory = MakeFactory(new List<string> { bamFile1Path }, new List<string> { interval1Path }, outDir);
    var secondSampleFactory = MakeFactory(new List<string> { bamFile2Path }, new List<string> { interval2Path }, outDir);

    // regular two-sample run mode.
    var genome = twoSampleFactory.GetReferenceGenome(genomeDirectory);
    var genome1 = firstSampleFactory.GetReferenceGenome(genomeDirectory);
    var genome2 = secondSampleFactory.GetReferenceGenome(genomeDirectory);

    var processor = new GenomeProcessor(twoSampleFactory, genome);

    var chrs = genome.ChromosomesToProcess;
    Assert.Equal("chr7", chrs[0]);
    Assert.Equal("chr8", chrs[1]);
    Assert.Equal("chr17", chrs[2]);
    Assert.Equal("chr19", chrs[3]);

    processor.InternalExecute(10);

    // The chromosome list must be unchanged by the run.
    chrs = genome.ChromosomesToProcess;
    Assert.Equal("chr7", chrs[0]);
    Assert.Equal("chr8", chrs[1]);
    Assert.Equal("chr17", chrs[2]);
    Assert.Equal("chr19", chrs[3]);

    // just be aware, when we process the samples individually, we use different genome lists.
    Assert.Equal(4, genome.ChromosomesToProcess.Count);
    Assert.Equal(1, genome1.ChromosomesToProcess.Count);
    Assert.Equal(4, genome2.ChromosomesToProcess.Count);
    Assert.Equal("chr17", genome1.ChromosomesToProcess[0]);
    Assert.Equal("chr7", genome2.ChromosomesToProcess[0]);
    Assert.Equal("chr19", genome2.ChromosomesToProcess[3]);

    // The expected-result readers were never disposed in the original; the using
    // statements release them even when an assertion below throws.
    using (var readerExp1 = new VcfReader(vcfExpectedFile1))
    using (var readerExp2 = new VcfReader(vcfExpectedFile2))
    {
        // the expected results:
        var contigs1Expected = GetContigs(readerExp1);
        var contigs2Expected = GetContigs(readerExp2);
        var vcf1Expected = readerExp1.GetVariants().ToList();
        var vcf2Expected = readerExp2.GetVariants().ToList();

        using (var reader1 = new VcfReader(vcfFile1Path))
        using (var reader2 = new VcfReader(vcfFile2Path))
        {
            var contigs1Results = GetContigs(reader1);
            var contigs2Results = GetContigs(reader2);
            var vcf1Results = reader1.GetVariants().ToList();
            var vcf2Results = reader2.GetVariants().ToList();

            Assert.Equal(4, contigs1Results.Count);
            Assert.Equal(4, contigs2Results.Count);
            Assert.Equal(11, vcf1Results.Count);
            Assert.Equal(71, vcf2Results.Count);

            // check variants and contigs all come out the same
            CheckForOrdering(contigs1Results, contigs2Results, contigs1Expected, contigs2Expected, vcf1Expected, vcf2Expected);
        }

        // Readers are closed before their files are removed.
        File.Delete(vcfFile1Path);
        File.Delete(vcfFile2Path);

        // now check again, processing them separately
        processor = new GenomeProcessor(firstSampleFactory, genome1);
        processor.InternalExecute(10);

        processor = new GenomeProcessor(secondSampleFactory, genome2);
        processor.InternalExecute(10);

        using (var reader1 = new VcfReader(vcfFile1Path))
        using (var reader2 = new VcfReader(vcfFile2Path))
        {
            var contigs1Results = GetContigs(reader1);
            var contigs2Results = GetContigs(reader2);

            // Enumerate every record, as the original test did, even though only the contigs are passed on.
            reader1.GetVariants().ToList();
            reader2.GetVariants().ToList();

            // check variants all come out the same (the contigs will be different as shown)
            CheckForOrdering(contigs1Results, contigs2Results, new List<string>() { "chr17" }, contigs2Expected, vcf1Expected, vcf2Expected);
        }

        File.Delete(vcfFile1Path);
        // The original left the second output in place here, so a stale file from this phase
        // could satisfy the next phase even if that run wrote nothing for sample two.
        File.Delete(vcfFile2Path);

        // now check again, processing them "thread by chr" way
        // NOTE(review): the third constructor argument is `false`; confirm this is the value that selects thread-by-chr.
        processor = new GenomeProcessor(twoSampleFactory, genome, false);
        processor.InternalExecute(10);

        using (var reader1 = new VcfReader(vcfFile1Path))
        using (var reader2 = new VcfReader(vcfFile2Path))
        {
            var contigs1Results = GetContigs(reader1);
            var contigs2Results = GetContigs(reader2);

            // Enumerate every record, as the original test did.
            reader1.GetVariants().ToList();
            reader2.GetVariants().ToList();

            // check variants all come out the same (the contigs will be back to normal)
            CheckForOrdering(contigs1Results, contigs2Results, contigs2Expected, contigs2Expected, vcf1Expected, vcf2Expected);
        }

        File.Delete(vcfFile1Path);
        File.Delete(vcfFile2Path);
    }
}
// A contig header line that carries an ID but no length field should give an empty result
// from VcfReader.GetChromAndLengthInfo.
public void GetChromAndLengthInfo_ReturnEmptyArray_NoLengthInfo()
{
    const string contigLineWithoutLength = "##contig=<ID=chr1>";

    var chromAndLength = VcfReader.GetChromAndLengthInfo(contigLineWithoutLength);

    Assert.Empty(chromAndLength);
}
// Runs a single sample restricted to a chr17-only interval file through the factory built by
// MakeVcfFactory, then checks the produced vcf (filters, contigs and variants) against the
// expected file.
public void IntervalTestingWithVcf()
{
    var bamFile1Path = Path.Combine(TestPaths.LocalTestDataDirectory, "Chr17Chr19.bam"); // has data from chr17,7572952 and chr19,3118883
    var interval1Path = Path.Combine(TestPaths.LocalTestDataDirectory, "chr17int.picard"); // chr 17 only
    var outDir = Path.Combine(TestPaths.LocalTestDataDirectory, "IntervalTests");
    var vcfFile1Path = Path.Combine(outDir, "Chr17Chr19.vcf"); // only results from chr17
    var vcfExpectedFile1 = Path.Combine(TestPaths.LocalTestDataDirectory, "Chr17Chr19.expected.vcf"); // only results from chr17
    var genomeDirectory = Path.Combine(TestPaths.SharedGenomesDirectory, "fourChrs");

    var factory = MakeVcfFactory(new List<string> { bamFile1Path }, new List<string> { interval1Path }, outDir);
    var genome1 = factory.GetReferenceGenome(genomeDirectory);
    var processor = new GenomeProcessor(factory, genome1);

    var chrs = genome1.ChromosomesToProcess;
    Assert.Equal("chr17", chrs[0]);

    processor.InternalExecute(10);

    // The chromosome list must still be chr17 only after the run.
    Assert.Equal(1, genome1.ChromosomesToProcess.Count);
    Assert.Equal("chr17", genome1.ChromosomesToProcess[0]);

    // Both readers are wrapped in using statements: the original never disposed the
    // expected-file reader and leaked the result reader whenever an assertion threw.
    using (var reader1 = new VcfReader(vcfFile1Path))
    using (var readerExp1 = new VcfReader(vcfExpectedFile1))
    {
        var filters1Results = GetFilters(reader1);
        var contigs1Results = GetContigs(reader1);
        var vcf1Results = reader1.GetVariants().ToList();

        // the expected results:
        var filters1Expected = GetFilters(readerExp1);
        var contigs1Expected = GetContigs(readerExp1);
        var vcf1Expected = readerExp1.GetVariants().ToList();

        Assert.Equal(4, filters1Results.Count);
        Assert.Equal(1, contigs1Results.Count);
        Assert.Equal(1, vcf1Results.Count);

        // check variants and contigs all come out the same
        for (int i = 0; i < contigs1Expected.Count; i++)
        {
            Assert.Equal(contigs1Expected[i], contigs1Results[i]);
        }

        for (int i = 0; i < filters1Expected.Count; i++)
        {
            Assert.Equal(filters1Expected[i].ToString(), filters1Results[i].ToString());
        }

        for (int i = 0; i < vcf1Expected.Count; i++)
        {
            Assert.Equal(vcf1Expected[i].ToString(), vcf1Results[i].ToString());
        }
    }

    // The reader is closed before the output file is removed.
    File.Delete(vcfFile1Path);
}
// A header line that does not start with the contig prefix (here a fileformat line) should
// give an empty result from VcfReader.GetChromAndLengthInfo.
public void GetChromAndLengthInfo_ReturnEmptyArray_NoProperPrefix()
{
    const string nonContigHeaderLine = "##fileformat=VCFv";

    var chromAndLength = VcfReader.GetChromAndLengthInfo(nonContigHeaderLine);

    Assert.Empty(chromAndLength);
}
// Parameterized check: parsing `line` with VcfReader.GetChromAndLengthInfo must give exactly `info`.
//   line - the vcf header line to parse.
//   info - the expected parse result for that line.
public void GetChromAndLength_AsExpect(string line, string[] info)
{
    var parsedInfo = VcfReader.GetChromAndLengthInfo(line);

    Assert.Equal(info, parsedInfo);
}
// Merges two neighborhoods of newly called variants into the variants read from MergerInput.vcf,
// writes the result with a PhasedVcfWriter, and compares the output line by line with MergerOutput.vcf.
public void WriteANbhd()
{
    var outputFilePath = Path.Combine(UnitTestPaths.TestDataDirectory, "PhasedVcfFileNbhdWriterTest.vcf");
    var inputFilePath = Path.Combine(UnitTestPaths.TestDataDirectory, "MergerInput.vcf");
    var expectedFilePath = Path.Combine(UnitTestPaths.TestDataDirectory, "MergerOutput.vcf");

    // Remove any output left behind by a previous run.
    File.Delete(outputFilePath);

    var config = new VcfWriterConfig
    {
        DepthFilterThreshold = 500,
        VariantQualityFilterThreshold = 30,
        FrequencyFilterThreshold = 0.007f,
        ShouldOutputNoCallFraction = true,
        ShouldOutputStrandBiasAndNoiseLevel = true,
        EstimatedBaseCallQuality = 23,
        PloidyModel = PloidyModel.Somatic,
        AllowMultipleVcfLinesPerLoci = true
    };

    // NOTE(review): the original also built a populated VcfWriterInputContext (command line,
    // sample name, reference name, contigs) but never passed it anywhere; that unused local is
    // removed. The writer still receives an empty context, exactly as before — presumably the
    // expected file was generated that way; confirm before wiring in a populated context.
    using (var writer = new PhasedVcfWriter(outputFilePath, config, new VcfWriterInputContext(), new List<string>() { }, null))
    using (var reader = new VcfReader(inputFilePath, true)) // the original never disposed this reader
    {
        //set up the original variants
        var originalVcfVariant1 = PhasedVariantTestUtilities.CreateDummyAllele("chr2", 116380048, "A", "New", 1000, 156);
        var originalVcfVariant2 = PhasedVariantTestUtilities.CreateDummyAllele("chr2", 116380048, "AAA", "New", 1000, 156);
        var originalVcfVariant4 = PhasedVariantTestUtilities.CreateDummyAllele("chr7", 116380051, "A", "New", 1000, 156);
        var originalVcfVariant5 = PhasedVariantTestUtilities.CreateDummyAllele("chr7", 116380052, "AC", "New", 1000, 156);

        var vs1 = new VariantSite((originalVcfVariant1));
        var vs2 = new VariantSite((originalVcfVariant2));
        var vs4 = new VariantSite((originalVcfVariant4));
        var vs5 = new VariantSite((originalVcfVariant5));

        //have to replace variants at position 116380048 (we call two new MNVs here)
        var nbhd1 = new VcfNeighborhood(new VariantCallingParameters(), "chr2", vs1, vs2, "");
        nbhd1.SetRangeOfInterest();

        //have to replace variants at position 116380051 and 52 (we call one new MNV at 51)
        var nbhd2 = new VcfNeighborhood(new VariantCallingParameters(), "chr7", vs4, vs5, "");
        nbhd2.SetRangeOfInterest();

        VcfMerger merger = new VcfMerger(reader);
        List<CalledAllele> allelesPastNbh = new List<CalledAllele>();

        nbhd1.CalledVariants = new Dictionary<int, List<CalledAllele>>
        {
            { originalVcfVariant1.Coordinate, new List<CalledAllele> { originalVcfVariant1, originalVcfVariant2 } }
        };
        nbhd2.CalledVariants = new Dictionary<int, List<CalledAllele>>
        {
            { originalVcfVariant4.Coordinate, new List<CalledAllele> { originalVcfVariant4 } }
        };

        // Each call hands back the alleles to carry into the next call.
        allelesPastNbh = merger.WriteVariantsUptoChr(writer, allelesPastNbh, nbhd1.ReferenceName);
        allelesPastNbh = merger.WriteVariantsUptoIncludingNbhd(nbhd1, writer, allelesPastNbh);

        allelesPastNbh = merger.WriteVariantsUptoChr(writer, allelesPastNbh, nbhd2.ReferenceName);
        allelesPastNbh = merger.WriteVariantsUptoIncludingNbhd(nbhd2, writer, allelesPastNbh);

        merger.WriteRemainingVariants(writer, allelesPastNbh);
    }

    // The writer is disposed above, before the output file is read back.
    var expectedLines = File.ReadLines(expectedFilePath).ToList();
    var outputLines = File.ReadLines(outputFilePath).ToList();

    Assert.Equal(expectedLines.Count, outputLines.Count);

    for (int i = 0; i < expectedLines.Count; i++)
    {
        Assert.Equal(expectedLines[i], outputLines[i]);
    }
}
// Reads two "crushed" vcf files, checks the multi-allelic (1/2) records as stored, then runs
// Extensions.UnpackVariants and checks that each such record has become two single-alt records
// at the same position, with the AD and VF values asserted below.
public void UnpackAlleles()
{
    // Positions of the 1/2 sites in the two example files.
    const int site1Position = 10;
    const int site2Position = 223906731;

    //two example vcf files that have been "crushed".
    var crushedPath1 = Path.Combine(UnitTestPaths.TestDataDirectory, "VcfFileWriterTests_Crushed_Padded_expected.vcf");
    var crushedPath2 = Path.Combine(UnitTestPaths.TestDataDirectory, "crushed.genome.vcf");

    var crushedVariants1 = VcfReader.GetAllVariantsInFile(crushedPath1);
    var crushedVariants2 = VcfReader.GetAllVariantsInFile(crushedPath2);

    Assert.Equal(7, crushedVariants1.Count);
    Assert.Equal(90, crushedVariants2.Count);

    // ---- before unpacking: the 1/2 variants and the records that follow them ----
    var crushedSite1 = crushedVariants1[5];
    var crushedSite2 = crushedVariants2[3];
    var crushedAfterSite1 = crushedVariants1[6];
    var crushedAfterSite2 = crushedVariants2[4];

    Assert.Equal(1, crushedSite1.Genotypes.Count);
    Assert.Equal(1, crushedSite2.Genotypes.Count);
    Assert.Equal(2, crushedSite1.VariantAlleles.Count());
    Assert.Equal(2, crushedSite2.VariantAlleles.Count());

    Assert.Equal("2387,2000", crushedSite1.Genotypes[0]["AD"]);
    Assert.Equal("0.8133", crushedSite1.Genotypes[0]["VF"]);
    Assert.Equal("254,254", crushedSite2.Genotypes[0]["AD"]);

    Assert.Equal("AA", crushedSite1.ReferenceAllele);
    Assert.Equal("GA", crushedSite1.VariantAlleles[0]);
    Assert.Equal("G", crushedSite1.VariantAlleles[1]);

    Assert.Equal(".", crushedAfterSite1.VariantAlleles[0]);
    Assert.Equal("0", crushedAfterSite1.Genotypes[0]["AD"]);
    Assert.Equal("532", crushedAfterSite2.Genotypes[0]["AD"]);

    Assert.Equal(site1Position, crushedSite1.ReferencePosition);
    Assert.Equal(site2Position, crushedSite2.ReferencePosition);
    Assert.Equal(site1Position + 1, crushedAfterSite1.ReferencePosition);
    Assert.Equal(site2Position + 1, crushedAfterSite2.ReferencePosition);

    // ---- after unpacking: each file gains exactly one record ----
    var unpacked1 = Extensions.UnpackVariants(crushedVariants1);
    var unpacked2 = Extensions.UnpackVariants(crushedVariants2);

    Assert.Equal(8, unpacked1.Count);
    Assert.Equal(91, unpacked2.Count);

    var firstAlt1 = unpacked1[5];
    var firstAlt2 = unpacked2[3];
    var secondAlt1 = unpacked1[6];
    var secondAlt2 = unpacked2[4];

    // example one:
    //   total depth = 5394, total variant count = 2387 + 2000 = 4387
    //   so, ref counts ~1007.
    // example two:
    //   total depth = 532, total variant count = 254 + 254 = 508
    //   so, ref counts ~24.
    Assert.Equal(1, firstAlt1.Genotypes.Count);
    Assert.Equal(1, firstAlt2.Genotypes.Count);

    Assert.Equal("1007,2387", firstAlt1.Genotypes[0]["AD"]);
    Assert.Equal("24,254", firstAlt2.Genotypes[0]["AD"]);
    Assert.Equal("0.4425", firstAlt1.Genotypes[0]["VF"]);

    Assert.Equal(1, firstAlt1.VariantAlleles.Count());
    Assert.Equal(1, firstAlt2.VariantAlleles.Count());
    Assert.Equal(1, secondAlt1.VariantAlleles.Count());
    Assert.Equal(1, secondAlt2.VariantAlleles.Count());

    Assert.Equal("1007,2000", secondAlt1.Genotypes[0]["AD"]);
    Assert.Equal("24,254", secondAlt2.Genotypes[0]["AD"]);

    Assert.Equal("AA", firstAlt1.ReferenceAllele);
    Assert.Equal("GA", firstAlt1.VariantAlleles[0]);
    Assert.Equal("G", secondAlt1.VariantAlleles[0]);
    Assert.Equal("0.3708", secondAlt1.Genotypes[0]["VF"]);

    // Both unpacked records of a site sit at the original site position.
    Assert.Equal(site1Position, firstAlt1.ReferencePosition);
    Assert.Equal(site2Position, firstAlt2.ReferencePosition);
    Assert.Equal(site1Position, secondAlt1.ReferencePosition);
    Assert.Equal(site2Position, secondAlt2.ReferencePosition);
}
// Runs the processor on SBWriter_Sample_S1.bam with OutputBiasFiles switched on and checks that
// a strand-bias file is written, that it lists every non-reference call of the gvcf exactly once
// and no reference call, and that it matches the expected file. Then repeats the run with
// OutputBiasFiles switched off and checks that no bias file is produced.
//   threadByChr - forwarded to CreateAndExecuteProcessor; when true the bias file name carries a "_chr19" suffix.
private void Write_InFlow(bool threadByChr)
{
    var bamFilePath = Path.Combine(TestPaths.LocalTestDataDirectory, "SBWriter_Sample_S1.bam");
    var vcfFilePath = Path.Combine(TestPaths.LocalTestDataDirectory, "SBWriter_Sample_S1.genome.vcf");
    var biasFilePath = Path.Combine(TestPaths.LocalTestDataDirectory, "SBWriter_Sample_S1.genome.ReadStrandBias.txt");
    if (threadByChr)
    {
        // Currently when threading by chrom we are outputting one bias file per chromosome.
        // This is not a customer-facing deliverable and is a low-priority feature.
        biasFilePath = biasFilePath + "_chr19";
    }

    var expectedBiasResultsPath = Path.Combine(TestPaths.LocalTestDataDirectory, "Expected_Sample_S1.ReadStrandBias.txt");
    var genomeDirectory = Path.Combine(TestPaths.SharedGenomesDirectory, "chr19");

    var applicationOptions = new PiscesApplicationOptions
    {
        BAMPaths = new[] { bamFilePath },
        IntervalPaths = null,
        GenomePaths = new[] { genomeDirectory },
        OutputBiasFiles = true,
        DebugMode = true,
        VcfWritingParameters = new Domain.Options.VcfWritingParameters()
        {
            OutputGvcfFile = true
        }
    };

    // A bias file left over from an earlier (failed) run must not satisfy the existence check below.
    File.Delete(biasFilePath);

    // Using GenomeProcessor
    // If OutputBiasFiles is true, should output one bias file per vcf
    var factory = new MockFactoryWithDefaults(applicationOptions);
    var genome = factory.GetReferenceGenome(genomeDirectory);
    CreateAndExecuteProcessor(threadByChr, factory, genome);

    Assert.True(File.Exists(biasFilePath));

    var biasFileContents = File.ReadAllLines(biasFilePath);

    // Split every line once up front instead of re-splitting it for each column of each allele.
    var biasRows = biasFileContents.Select(line => line.Split('\t')).ToList();

    var alleles = VcfReader.GetAllVariantsInFile(vcfFilePath);

    // All variants that are present in VCF where ref!=alt should be included, exactly once.
    foreach (var variantCall in alleles.Where(a => a.VariantAlleles[0] != ".").ToList())
    {
        var position = variantCall.ReferencePosition.ToString();
        var alt = variantCall.VariantAlleles.First();

        Assert.Equal(1, biasRows.Count(fields =>
            fields[0] == variantCall.ReferenceName &&
            fields[1] == position &&
            fields[2] == variantCall.ReferenceAllele &&
            fields[3] == alt));
    }

    // Reference calls must not appear at all. The original asserted only "count != 1",
    // which would also have passed if a reference call were listed twice or more.
    foreach (var refCall in alleles.Where(a => a.VariantAlleles[0] == ".").ToList())
    {
        var position = refCall.ReferencePosition.ToString();
        var alt = refCall.VariantAlleles.First();

        Assert.False(biasRows.Any(fields =>
            fields[0] == refCall.ReferenceName &&
            fields[1] == position &&
            fields[2] == refCall.ReferenceAllele &&
            fields[3] == alt));
    }

    // Bias files should have expected contents
    var expectedBiasFileContents = File.ReadAllLines(expectedBiasResultsPath);
    Assert.Equal(expectedBiasFileContents, biasFileContents);

    // If OutputBiasFiles is false, should not output any bias files
    File.Delete(biasFilePath);

    applicationOptions.OutputBiasFiles = false;
    factory = new MockFactoryWithDefaults(applicationOptions);
    genome = factory.GetReferenceGenome(genomeDirectory);
    CreateAndExecuteProcessor(threadByChr, factory, genome);

    Assert.False(File.Exists(biasFilePath));
}