/// <summary>
/// Builds the <c>VcfFormatter</c> stored in <c>_formatter</c> and the three variants
/// stored in <c>_v1</c>, <c>_v2</c> and <c>_v3</c>.
/// </summary>
public void Initialize()
{
    var writerConfig = new VcfWriterConfig
    {
        DepthFilterThreshold = 500,
        VariantQualityFilterThreshold = 20,
        StrandBiasFilterThreshold = 0.5f,
        FrequencyFilterThreshold = 0.007f,
        MinFrequencyThreshold = 0.007f,
        ShouldOutputNoCallFraction = true,
        ShouldOutputStrandBiasAndNoiseLevel = true,
        ShouldFilterOnlyOneStrandCoverage = true,
        EstimatedBaseCallQuality = _estimatedBaseCallQuality
        // AllowMultipleVcfLinesPerLoci is not set here (the assignment was commented out).
    };

    _formatter = new VcfFormatter(writerConfig);

    // Each variant comes from TestHelper.CreatePassingVariant(false).
    _v1 = TestHelper.CreatePassingVariant(false);
    _v2 = TestHelper.CreatePassingVariant(false);
    _v3 = TestHelper.CreatePassingVariant(false);
}
/// <summary>
/// Writes a header plus <c>_defaultCandidates</c> to a VCF file and then validates the
/// ##FILTER header lines with <c>VcfHeaderFormatTester</c>.
/// </summary>
public void FilterHeader()
{
    var outputFilePath = Path.Combine(UnitTestPaths.TestDataDirectory, "VcfFileWriterTests_SDS-18.vcf");
    File.Delete(outputFilePath);

    var context = new VcfWriterInputContext
    {
        CommandLine = "myCommandLine",
        SampleName = "mySample",
        ReferenceName = "myReference",
        ContigsByChr = new List<Tuple<string, long>>
        {
            new Tuple<string, long>("chr1", 10001),
            new Tuple<string, long>("chrX", 500)
        }
    };

    // Both a strand-bias threshold and the one-strand-coverage filter are configured:
    // "Variant strand bias too high or coverage on only one strand".
    var config = new VcfWriterConfig
    {
        DepthFilterThreshold = 500,
        QscoreFilterThreshold = 20,
        StrandBiasFilterThreshold = 0.5f,
        FrequencyFilterThreshold = 0.007f,
        ShouldOutputNoCallFraction = true,
        ShouldOutputStrandBiasAndNoiseLevel = true,
        ShouldFilterOnlyOneStrandCoverage = true,
        EstimatedBaseCallQuality = 23
    };

    // The using block disposes the writer even if WriteHeader/Write throws, so the
    // output file handle is not leaked; the file is only read back after disposal.
    using (var writer = new VcfFileWriter(outputFilePath, config, context))
    {
        writer.WriteHeader();
        writer.Write(_defaultCandidates);
    }

    VcfHeaderFormatTester(config, outputFilePath);
}
/// <summary>
/// Merges two called neighborhoods (chr2 and chr7) into the variants read from
/// MergerInput.vcf and checks that the written file equals MergerOutput.vcf line by line.
/// </summary>
public void WriteANbhd()
{
    var outputFilePath = Path.Combine(TestPaths.LocalTestDataDirectory, "PhasedVcfFileNbhdWriterTest.vcf");
    var inputFilePath = Path.Combine(TestPaths.LocalTestDataDirectory, "MergerInput.vcf");
    var expectedFilePath = Path.Combine(TestPaths.LocalTestDataDirectory, "MergerOutput.vcf");
    File.Delete(outputFilePath);

    var config = new VcfWriterConfig
    {
        DepthFilterThreshold = 500,
        VariantQualityFilterThreshold = 30,
        FrequencyFilterThreshold = 0.007f,
        ShouldOutputNoCallFraction = true,
        ShouldOutputStrandBiasAndNoiseLevel = true,
        EstimatedBaseCallQuality = 23,
        PloidyModel = PloidyModel.Somatic,
        AllowMultipleVcfLinesPerLoci = true
    };

    //set up the original variants
    var originalVcfVariant1 = TestHelper.CreateDummyAllele("chr2", 116380048, "A", "New", 1000, 156);
    var originalVcfVariant2 = TestHelper.CreateDummyAllele("chr2", 116380048, "AAA", "New", 1000, 156);
    var originalVcfVariant4 = TestHelper.CreateDummyAllele("chr7", 116380051, "A", "New", 1000, 156);
    var originalVcfVariant5 = TestHelper.CreateDummyAllele("chr7", 116380052, "AC", "New", 1000, 156);

    var vs1 = new VariantSite(originalVcfVariant1);
    var vs2 = new VariantSite(originalVcfVariant2);
    var vs4 = new VariantSite(originalVcfVariant4);
    var vs5 = new VariantSite(originalVcfVariant5);

    //have to replace variants at position 116380048 (we call two new MNVs here)
    var nbhd1 = new VcfNeighborhood(new VariantCallingParameters(), 0, "chr2", vs1, vs2, "");
    nbhd1.SetRangeOfInterest();

    //have to replace variants at position 116380051 and 52 (we call one new MNV at 51)
    var nbhd2 = new VcfNeighborhood(new VariantCallingParameters(), 0, "chr7", vs4, vs5, "");
    nbhd2.SetRangeOfInterest();

    nbhd1.CalledVariants = new Dictionary<int, List<CalledAllele>>
    {
        { originalVcfVariant1.ReferencePosition, new List<CalledAllele> { originalVcfVariant1, originalVcfVariant2 } }
    };
    nbhd2.CalledVariants = new Dictionary<int, List<CalledAllele>>
    {
        { originalVcfVariant4.ReferencePosition, new List<CalledAllele> { originalVcfVariant4 } }
    };

    // Both the writer and the reader hold file handles; the using blocks release them even
    // if the merger throws. The writer is given an empty VcfWriterInputContext and an empty
    // original header, and must be disposed before the output file is read back below.
    using (var writer = new PhasedVcfWriter(outputFilePath, config, new VcfWriterInputContext(), new List<string>(), null))
    using (var reader = new VcfReader(inputFilePath, true))
    {
        var merger = new VcfMerger(reader);
        var allelesPastNbh = new List<CalledAllele>();

        allelesPastNbh = merger.WriteVariantsUptoChr(writer, allelesPastNbh, nbhd1.ReferenceName);
        allelesPastNbh = merger.WriteVariantsUptoIncludingNbhd(nbhd1, writer, allelesPastNbh);

        allelesPastNbh = merger.WriteVariantsUptoChr(writer, allelesPastNbh, nbhd2.ReferenceName);
        allelesPastNbh = merger.WriteVariantsUptoIncludingNbhd(nbhd2, writer, allelesPastNbh);

        merger.WriteRemainingVariants(writer, allelesPastNbh);
    }

    var expectedLines = File.ReadLines(expectedFilePath).ToList();
    var outputLines = File.ReadLines(outputFilePath).ToList();

    Assert.Equal(expectedLines.Count, outputLines.Count);
    for (int i = 0; i < expectedLines.Count; i++)
    {
        Assert.Equal(expectedLines[i], outputLines[i]);
    }
}
/// <summary>
/// Reads <paramref name="outputFile"/> and asserts that every line containing "##FILTER="
/// is a well-formed LowDP, SB or q{QscoreFilterThreshold} filter line, and that the filter
/// lines implied by <paramref name="config"/> are present.
/// </summary>
/// <param name="config">Writer configuration the file was produced with.</param>
/// <param name="outputFile">Path of the VCF file to inspect.</param>
private void VcfHeaderFormatTester(VcfWriterConfig config, string outputFile)
{
    // Time to read the header
    var testFile = File.ReadAllLines(outputFile);
    bool formatLowDP = false, formatQ = false, formatSB = false;

    bool hasStrandBiasThreshold = config.StrandBiasFilterThreshold.HasValue;
    bool filtersOneStrandCoverage = config.ShouldFilterOnlyOneStrandCoverage;

    foreach (var line in testFile.Where(l => Regex.IsMatch(l, "##FILTER=")))
    {
        switch (line.Split(',')[0])
        {
            case "##FILTER=<ID=LowDP":
                Assert.True(Regex.IsMatch(line, "^##FILTER=<ID=LowDP,Description=\"Low coverage \\(DP tag\\), therefore no genotype called\">$"));
                formatLowDP = true;
                break;

            case "##FILTER=<ID=SB":
                // The expected description depends on which strand-bias filters are configured.
                if (hasStrandBiasThreshold && filtersOneStrandCoverage)
                {
                    Assert.True(Regex.IsMatch(line, "^##FILTER=<ID=SB,Description=\"(Variant strand bias too high or coverage on only one strand)\">$"));
                }
                else if (hasStrandBiasThreshold)
                {
                    Assert.True(Regex.IsMatch(line, "^##FILTER=<ID=SB,Description=\"(Variant strand bias too high)\">$"));
                }
                else if (filtersOneStrandCoverage)
                {
                    Assert.True(Regex.IsMatch(line, "^##FILTER=<ID=SB,Description=\"(Variant support on only one strand)\">$"));
                }
                else
                {
                    Assert.True(false, "StrandBias filter header does not match any expected filter.");
                }
                formatSB = true;
                break;

            default:
                if (Regex.IsMatch(line, string.Format("##FILTER=<ID=q{0}", config.QscoreFilterThreshold)))
                {
                    Assert.True(Regex.IsMatch(line, string.Format("^##FILTER=<ID=q{0},Description=\"Quality below {0}\">$", config.QscoreFilterThreshold)));
                    formatQ = true;
                }
                else
                {
                    Assert.True(false, "A filter is listed which does not match any of the specified filters.");
                }
                break;
        }
    }

    if (config.QscoreFilterThreshold > 0)
    {
        Assert.True(formatQ);
    }

    if (config.DepthFilterThreshold > 0)
    {
        Assert.True(formatLowDP);
    }

    // The SB *filter* line is expected exactly when one of the strand-bias filters is
    // configured (the same conditions the switch above accepts). It was previously gated on
    // ShouldOutputStrandBiasAndNoiseLevel, which contradicts the switch: with that flag set
    // but no strand-bias filter configured, an SB line would fail above and a missing one
    // would fail here.
    if (hasStrandBiasThreshold || filtersOneStrandCoverage)
    {
        Assert.True(formatSB);
    }
}
/// <summary>
/// Writes a VCF and checks that the ##FORMAT header fields agree with the configuration
/// and that the FORMAT column of matching data lines matches the colon-joined list of
/// declared format IDs.
/// </summary>
public void DataFormatCheck()
{
    var outputFilePath = Path.Combine(UnitTestPaths.TestDataDirectory, "VcfFileWriterTests_SDS-23.vcf");
    File.Delete(outputFilePath);

    var context = new VcfWriterInputContext
    {
        CommandLine = "myCommandLine",
        SampleName = "mySample",
        ReferenceName = "myReference",
        ContigsByChr = new List<Tuple<string, long>>
        {
            new Tuple<string, long>("chr1", 10001),
            new Tuple<string, long>("chrX", 500)
        }
    };

    var config = new VcfWriterConfig
    {
        DepthFilterThreshold = 500,
        QscoreFilterThreshold = 20,
        StrandBiasFilterThreshold = 0.5f,
        FrequencyFilterThreshold = 0.007f,
        ShouldOutputNoCallFraction = true,
        ShouldOutputStrandBiasAndNoiseLevel = true,
        ShouldFilterOnlyOneStrandCoverage = true,
        EstimatedBaseCallQuality = 23
    };

    // Dispose the writer even if writing throws; the file is read back only afterwards.
    using (var writer = new VcfFileWriter(outputFilePath, config, context))
    {
        writer.WriteHeader();
        writer.Write(_defaultCandidates);
    }

    var testFile = File.ReadAllLines(outputFilePath);
    var formatList = string.Empty;
    bool caseNL = false, caseSB = false, caseNC = false;

    foreach (var line in testFile)
    {
        if (Regex.IsMatch(line, "^##FORMAT"))
        {
            // "##FORMAT=<ID=" is 13 characters long; what follows up to the first comma is the ID.
            var formatField = line.Split(',')[0].Substring(13);

            // Record presence unconditionally. Previously the flags were only set when the
            // matching option was enabled, so a field written while its option was disabled
            // could never be detected by the checks below.
            switch (formatField)
            {
                case "NL":
                    caseNL = true;
                    break;
                case "SB":
                    caseSB = true;
                    break;
                case "NC":
                    caseNC = true;
                    break;
            }

            formatList = formatList == string.Empty
                ? formatField
                : formatList + ":" + formatField;
        }

        // NOTE(review): this pattern only matches contigs named "chr" + digits, so lines for
        // e.g. chrX are not checked - confirm that is intended.
        if (Regex.IsMatch(line, "^chr\\d+\t"))
        {
            var columns = line.Split('\t');
            Assert.True(Regex.IsMatch(columns[8], formatList));
        }
    }

    // Each format field must be present exactly when its option is enabled.
    if (config.ShouldOutputStrandBiasAndNoiseLevel != caseNL)
    {
        Assert.True(false, "Incorrect setting for ShouldOutputStrandBiasAndNoiseLevel and NL format");
    }

    if (config.ShouldOutputStrandBiasAndNoiseLevel != caseSB)
    {
        Assert.True(false, "Incorrect setting for ShouldOutputStrandBiasAndNoiseLevel and SB format");
    }

    if (config.ShouldOutputNoCallFraction != caseNC)
    {
        Assert.True(false, "Incorrect setting for NoCall and NC format");
    }
}
/// <summary>
/// Writes a VCF and asserts that every ##INFO and ##FORMAT header line is one of the
/// expected definitions, and that the optional NL, SB and NC format lines are present
/// when the corresponding options are enabled.
/// </summary>
public void InfoFormatHeader()
{
    var outputFilePath = Path.Combine(UnitTestPaths.TestDataDirectory, "VcfFileWriterTests_SDS-17.vcf");
    File.Delete(outputFilePath);

    var context = new VcfWriterInputContext
    {
        CommandLine = "myCommandLine",
        SampleName = "mySample",
        ReferenceName = "myReference",
        ContigsByChr = new List<Tuple<string, long>>
        {
            new Tuple<string, long>("chr1", 10001),
            new Tuple<string, long>("chrX", 500)
        }
    };

    var config = new VcfWriterConfig
    {
        DepthFilterThreshold = 500,
        QscoreFilterThreshold = 20,
        StrandBiasFilterThreshold = 0.5f,
        FrequencyFilterThreshold = 0.007f,
        ShouldOutputNoCallFraction = true,
        ShouldOutputStrandBiasAndNoiseLevel = true,
        ShouldFilterOnlyOneStrandCoverage = true,
        EstimatedBaseCallQuality = 23
    };

    // Dispose the writer even if writing throws; the file is read back only afterwards.
    using (var writer = new VcfFileWriter(outputFilePath, config, context))
    {
        writer.WriteHeader();
        writer.Write(_defaultCandidates);
    }

    // Time to read the header
    var testFile = File.ReadAllLines(outputFilePath);
    bool formatNL = false, formatSB = false, formatNC = false;

    foreach (var line in testFile)
    {
        if (Regex.IsMatch(line, "##INFO="))
        {
            switch (line.Split(',')[0])
            {
                case "##INFO=<ID=DP":
                    Assert.True(Regex.IsMatch(line, "^##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">$"));
                    break;
                case "##INFO=<ID=TI":
                    Assert.True(Regex.IsMatch(line, "^##INFO=<ID=TI,Number=\\.,Type=String,Description=\"Transcript ID\">$"));
                    break;
                case "##INFO=<ID=GI":
                    Assert.True(Regex.IsMatch(line, "^##INFO=<ID=GI,Number=\\.,Type=String,Description=\"Gene ID\">$"));
                    break;
                case "##INFO=<ID=EXON":
                    Assert.True(Regex.IsMatch(line, "^##INFO=<ID=EXON,Number=0,Type=Flag,Description=\"Exon Region\">$"));
                    break;
                case "##INFO=<ID=FC":
                    Assert.True(Regex.IsMatch(line, "^##INFO=<ID=FC,Number=\\.,Type=String,Description=\"Functional Consequence\">$"));
                    break;
                default:
                    Assert.True(false, "An info is listed which does not match any from the req.`");
                    break;
            }
        }
        else if (Regex.IsMatch(line, "##FORMAT="))
        {
            switch (line.Split(',')[0])
            {
                case "##FORMAT=<ID=GT":
                    Assert.True(Regex.IsMatch(line, "^##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">$"));
                    break;
                case "##FORMAT=<ID=GQ":
                    Assert.True(Regex.IsMatch(line, "^##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">$"));
                    break;
                case "##FORMAT=<ID=AD":
                    Assert.True(Regex.IsMatch(line, "^##FORMAT=<ID=AD,Number=\\.,Type=Integer,Description=\"Allele Depth\">$"));
                    break;
                case "##FORMAT=<ID=VF":
                    Assert.True(Regex.IsMatch(line, "^##FORMAT=<ID=VF,Number=1,Type=Float,Description=\"Variant Frequency\">$"));
                    break;
                case "##FORMAT=<ID=NL":
                    Assert.True(Regex.IsMatch(line, "^##FORMAT=<ID=NL,Number=1,Type=Integer,Description=\"Applied BaseCall Noise Level\">$"));
                    formatNL = true;
                    break;
                case "##FORMAT=<ID=SB":
                    Assert.True(Regex.IsMatch(line, "^##FORMAT=<ID=SB,Number=1,Type=Float,Description=\"StrandBias Score\">$"));
                    formatSB = true;
                    break;
                case "##FORMAT=<ID=NC":
                    Assert.True(Regex.IsMatch(line, "^##FORMAT=<ID=NC,Number=1,Type=Float,Description=\"Fraction of bases which were uncalled or with basecall quality below the minimum threshold\">$"));
                    formatNC = true;
                    break;
                default:
                    Assert.True(false, "A format is listed which does not match any of those listed for the req.");
                    break;
            }
        }
    }

    // NL and SB are both controlled by the same option.
    if (config.ShouldOutputStrandBiasAndNoiseLevel)
    {
        Assert.True(formatNL);
        Assert.True(formatSB);
    }

    if (config.ShouldOutputNoCallFraction)
    {
        Assert.True(formatNC);
    }
}
/// <summary>
/// Checks the header written by <c>PhasedVcfWriter</c> for two configurations: a diploid
/// one whose input header already contains ##FILTER lines, and a somatic one whose input
/// header contains none. The version on the ##VariantPhaser line is allowed to differ.
/// </summary>
public void FilterHeader()
{
    const string piscesCmdLine = "##Pisces_cmdline=\"-B KRAS_42_S1.bam -g -MinimumFrequency 0.01 -MinBaseCallQuality 21 -MaxVariantQScore 100 -MinCoverage 300 -MaxAcceptableStrandBiasFilter 0.5 -MinVariantQScore 20 -VariantQualityFilter 20 -gVCF true -CallMNVs True -out \\myout";
    const string columnHeader = "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HD700n560_miseq1_S7.bam";

    var outputFilePath = Path.Combine(TestPaths.LocalTestDataDirectory, "PhasedVcfFileWriterTests.vcf");
    File.Delete(outputFilePath);

    //note, scylla has no SB or RMxN or R8 filters.
    var variants = new List<CalledAllele>
    {
        TestHelper.CreateDummyAllele("chrX", 123, "A", "C", 1000, 156),
        TestHelper.CreateDummyAllele("chr10", 124, "A", "C", 1000, 156),
    };

    // ---- Case 1: diploid config, input header already has filter lines ----
    var diploidConfig = new VcfWriterConfig
    {
        DepthFilterThreshold = 500,
        VariantQualityFilterThreshold = 30,
        FrequencyFilterThreshold = 0.007f,
        ShouldOutputNoCallFraction = true,
        ShouldOutputStrandBiasAndNoiseLevel = true,
        EstimatedBaseCallQuality = 23,
        PloidyModel = PloidyModel.Diploid,
    };

    var originalHeader1 = new List<string>
    {
        "##fileformat=VCFv4.1",
        "##fileDate=20160620",
        "##source=Pisces 1.0.0.0",
        piscesCmdLine,
        "##reference=WholeGenomeFASTA",
        "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">",
        "##FILTER=<ID=q20,Description=\"Quality score less than 20\">",
        "##FILTER=<ID=SB,Description=\"Variant strand bias too high\">",
        "##FILTER=<ID=R5x9,Description=\"Repeats of part or all of the variant allele (max repeat length 5) in the reference greater than or equal to 9\">",
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
        "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">",
        columnHeader
    };

    var expectedHeader1 = new List<string>
    {
        "##fileformat=VCFv4.1",
        "##fileDate=20160620",
        "##source=Pisces 1.0.0.0",
        piscesCmdLine,
        "##VariantPhaser=Scylla 1.0.0.0",
        "##reference=WholeGenomeFASTA",
        "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">",
        "##FILTER=<ID=q20,Description=\"Quality score less than 20\">",
        "##FILTER=<ID=SB,Description=\"Variant strand bias too high\">",
        "##FILTER=<ID=R5x9,Description=\"Repeats of part or all of the variant allele (max repeat length 5) in the reference greater than or equal to 9\">",
        "##FILTER=<ID=q30,Description=\"Quality score less than 30, by Scylla\">",
        "##FILTER=<ID=LowDP,Description=\"Low coverage (DP tag), therefore no genotype called, by Scylla\">",
        "##FILTER=<ID=LowVariantFreq,Description=\"Variant frequency less than 0.0070, by Scylla\">",
        "##FILTER=<ID=MultiAllelicSite,Description=\"Variant does not conform to diploid model, by Scylla\">",
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
        "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">",
        columnHeader
    };

    var writtenHeader1 = WritePhasedVcfAndReadHeader(outputFilePath, diploidConfig, originalHeader1, variants);
    AssertHeadersMatchIgnoringPhaserVersion(expectedHeader1, writtenHeader1);

    // ---- Case 2: somatic config, input header has no filter lines ----
    var somaticConfig = new VcfWriterConfig
    {
        DepthFilterThreshold = 500,
        VariantQualityFilterThreshold = 22,
        FrequencyFilterThreshold = 0.007f,
        EstimatedBaseCallQuality = 23,
        PloidyModel = PloidyModel.Somatic,
    };

    var originalHeader2 = new List<string>
    {
        "##fileformat=VCFv4.1",
        "##fileDate=20160620",
        "##source=Pisces 1.0.0.0",
        piscesCmdLine,
        "##reference=WholeGenomeFASTA",
        "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">",
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
        "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">",
        columnHeader
    };

    var expectedHeader2 = new List<string>
    {
        "##fileformat=VCFv4.1",
        "##fileDate=20160620",
        "##source=Pisces 1.0.0.0",
        piscesCmdLine,
        "##VariantPhaser=Scylla 1.0.0.0",
        "##reference=WholeGenomeFASTA",
        "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">",
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
        "##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">",
        "##FILTER=<ID=q22,Description=\"Quality score less than 22, by Scylla\">",
        "##FILTER=<ID=LowDP,Description=\"Low coverage (DP tag), therefore no genotype called, by Scylla\">",
        "##FILTER=<ID=LowVariantFreq,Description=\"Variant frequency less than 0.0070, by Scylla\">",
        columnHeader
    };

    var writtenHeader2 = WritePhasedVcfAndReadHeader(outputFilePath, somaticConfig, originalHeader2, variants);

    // The second comparison previously tested expectedHeader1[i] for the ##VariantPhaser
    // line while iterating expectedHeader2; the helper always checks the list it compares.
    AssertHeadersMatchIgnoringPhaserVersion(expectedHeader2, writtenHeader2);
}

// Writes the header and variants with a PhasedVcfWriter (empty input context), then returns the header lines read back from the file.
private static List<string> WritePhasedVcfAndReadHeader(string outputFilePath, VcfWriterConfig config, List<string> originalHeader, List<CalledAllele> variants)
{
    // using blocks release the file handles even if writing or reading throws.
    using (var writer = new PhasedVcfWriter(outputFilePath, config, new VcfWriterInputContext(), originalHeader, null))
    {
        writer.WriteHeader();
        writer.Write(variants);
    }

    using (var reader = new VcfReader(outputFilePath))
    {
        return reader.HeaderLines;
    }
}

// Asserts that two header line lists are equal line by line, except that any expected "##VariantPhaser=Scylla" line only has to match by prefix.
private static void AssertHeadersMatchIgnoringPhaserVersion(List<string> expectedHeader, List<string> writtenHeader)
{
    Assert.Equal(expectedHeader.Count, writtenHeader.Count);

    for (int i = 0; i < expectedHeader.Count; i++)
    {
        //let version numbers differ
        if (expectedHeader[i].StartsWith("##VariantPhaser=Scylla"))
        {
            Assert.True(writtenHeader[i].StartsWith("##VariantPhaser=Scylla"));
            continue;
        }

        Assert.Equal(expectedHeader[i], writtenHeader[i]);
    }
}
/// <summary>
/// Reads the input VCF named in <paramref name="settings"/>, groups co-located alleles,
/// passes each group through <c>FilterAndStreamOut</c> with a <c>GeometricFilter</c>, and
/// writes the result to a "*.filtered.vcf" (or "*.filtered.genome.vcf") file in the
/// configured output directory.
/// </summary>
/// <param name="settings">Options supplying the input VCF, output directory, filter
/// parameters and the quoted command line recorded in the header.</param>
public static void DoFiltering(PsaraOptions settings)
{
    var geometricFilter = new GeometricFilter(settings.GeometricFilterParameters);
    //maybe expand to add other filters..

    var vcfIn = settings.InputVcf;
    var filteredName = Path.GetFileName(vcfIn).Replace(".vcf", ".filtered.vcf");

    // Keep ".genome" next to the extension: x.genome.filtered.vcf -> x.filtered.genome.vcf.
    // NOTE(review): if the input name contains no ".vcf" the output name equals the input
    // name; should the output directory also be the input's directory, the delete below
    // would remove the input file - confirm callers rule this out.
    var outputFile = Path.Combine(settings.OutputDirectory, filteredName)
        .Replace(".genome.filtered.vcf", ".filtered.genome.vcf");

    Logger.WriteToLog("filtering " + vcfIn + "...");

    if (File.Exists(outputFile))
    {
        File.Delete(outputFile);
    }

    List<string> header = VcfReader.GetAllHeaderLines(vcfIn);
    string cmdLine = "##Psara_cmdline=" + settings.QuotedCommandLineArgumentsString;
    VcfWriterConfig config = GetWriterConfigToMatchInputVcf(vcfIn);

    using (PsaraVcfWriter writer = new PsaraVcfWriter(outputFile, config, new VcfWriterInputContext(), header, cmdLine))
    {
        writer.WriteHeader();

        using (VcfReader reader = new VcfReader(vcfIn, false))
        {
            // pendingVariant always holds the most recently read, not yet converted record.
            var pendingVariant = new VcfVariant();
            var coLocatedAlleles = new List<CalledAllele>();
            bool hasUnreadVariant = reader.GetNextVariant(pendingVariant);
            var incomingBatch = new List<CalledAllele>();

            while (hasUnreadVariant)
            {
                if (incomingBatch.Count == 0)
                {
                    // Convert the pending record first, then advance the reader.
                    incomingBatch = VcfVariantUtilities.Convert(
                        new List<VcfVariant> { pendingVariant },
                        config.ShouldOutputRcCounts,
                        config.ShouldOutputTsCounts,
                        false).ToList();
                    hasUnreadVariant = reader.GetNextVariant(pendingVariant);
                }

                bool batchStartsNewLocus = coLocatedAlleles.Count != 0
                    && !AreColocated(coLocatedAlleles, incomingBatch);

                if (batchStartsNewLocus)
                {
                    // Flush the finished group; incomingBatch alleles are left behind
                    // and get picked up on the next pass.
                    FilterAndStreamOut(coLocatedAlleles, writer, geometricFilter);
                    coLocatedAlleles.Clear();
                    continue;
                }

                // Same locus (or first batch): colocated alleles are left behind.
                coLocatedAlleles.AddRange(incomingBatch);
                incomingBatch.Clear();
            }

            // No more unread vcf variants, but there may still be colocated alleles or an
            // incoming batch left over. Write them to file before exiting.
            FilterAndStreamOut(coLocatedAlleles, writer, geometricFilter);
            FilterAndStreamOut(incomingBatch, writer, geometricFilter);
        }
    }
}
/// <summary>
/// Merges one called neighborhood at chr1 position 1 into the variants read from
/// TinyDiploid.vcf and checks that the written file equals TinyDiploidOutput.vcf line by line.
/// </summary>
public void WriteADiploidNbhd()
{
    var outputDir = Path.Combine(TestPaths.LocalScratchDirectory, "MergerWriteADiploidNbhd");
    var outputFilePath = Path.Combine(outputDir, "TinyDiploid.Phased.vcf");
    var inputFilePath = Path.Combine(TestPaths.LocalTestDataDirectory, "TinyDiploid.vcf");
    var expectedFilePath = Path.Combine(TestPaths.LocalTestDataDirectory, "TinyDiploidOutput.vcf");

    TestHelper.RecreateDirectory(outputDir);

    var config = new VcfWriterConfig
    {
        DepthFilterThreshold = 500,
        VariantQualityFilterThreshold = 30,
        FrequencyFilterThreshold = 0.007f,
        ShouldOutputNoCallFraction = true,
        ShouldOutputStrandBiasAndNoiseLevel = true,
        EstimatedBaseCallQuality = 23,
        PloidyModel = PloidyModel.DiploidByThresholding,
        AllowMultipleVcfLinesPerLoci = false
    };

    //set up the original variants
    var originalVcfVariant1 = TestHelper.CreateDummyAllele("chr1", 1, "A", "G", 1000, 156);
    var originalVcfVariant2 = TestHelper.CreateDummyAllele("chr1", 1, "A", "T", 1000, 156);
    var originalVcfVariant4 = TestHelper.CreateDummyAllele("chr22", 1230237, "GTC", "G", 1000, 156);
    var originalVcfVariant5 = TestHelper.CreateDummyAllele("chr22", 1230237, "GTC", "GTCT", 1000, 156);

    var vs1 = new VariantSite(originalVcfVariant1);
    var vs2 = new VariantSite(originalVcfVariant2);

    //the only neighborhood covers the two variants at chr1 position 1
    var nbhd1 = new VcfNeighborhood(0, "chr1", vs1, vs2);
    var calledNbh1 = new CallableNeighborhood(nbhd1, new VariantCallingParameters());

    //we will just say, we called the variants that were in the original vcf. Ie, we agree with it.
    calledNbh1.CalledVariants = new Dictionary<int, List<CalledAllele>>
    {
        { originalVcfVariant1.ReferencePosition, new List<CalledAllele> { originalVcfVariant1, originalVcfVariant2 } }
    };

    // The using block disposes the writer even when one of the assertions inside fails, so
    // the output file handle is not leaked. The writer gets an empty VcfWriterInputContext
    // and an empty original header.
    using (var writer = new PhasedVcfWriter(outputFilePath, config, new VcfWriterInputContext(), new List<string>(), null))
    {
        // NOTE(review): the AlleleReader is never disposed in this test - confirm whether it
        // holds a file handle that should be released.
        var reader = new AlleleReader(inputFilePath, true);
        var merger = new VcfMerger(reader);
        var alleleTuplesPastNbhd = new List<Tuple<CalledAllele, string>>();

        //Realizes the first nbhd starts at chr1 . We have to do something with the first lines of the vcf (chr1 1 . A G,T)
        //so, alleleTuplesPastNbhd = chr1 1 . A G,T
        alleleTuplesPastNbhd = merger.WriteVariantsUptoChr(writer, alleleTuplesPastNbhd, nbhd1.ReferenceName);

        Assert.True(alleleTuplesPastNbhd[0].Item1.IsSameAllele(originalVcfVariant1));
        Assert.True(alleleTuplesPastNbhd[1].Item1.IsSameAllele(originalVcfVariant2));

        //This method writes everything up to the end of nbhd 1,
        //so "(chr1 1 . A G,T)" from the vcf and the variants scylla detected "(chr1 1 . A G,T)" need to be dealt with.
        //Since these 4 variants are actually the same two, we need to remove the vcf ones and only write the scylla ones.
        //Then we peek into the vcf and see the next line is "chr22 1230237 . GTC G,GTCT", clearly outside nbh1.
        //so we write out everything we need for nbhd1, and save the peeked line
        alleleTuplesPastNbhd = merger.WriteVariantsUptoIncludingNbhd(writer, alleleTuplesPastNbhd, calledNbh1);

        Assert.True(alleleTuplesPastNbhd[0].Item1.IsSameAllele(originalVcfVariant4));
        Assert.True(alleleTuplesPastNbhd[1].Item1.IsSameAllele(originalVcfVariant5));

        //now write out
        //chr22 1230237.GTC G,GTCT 50 DP = 1370 GT: GQ: AD: DP: VF: NL: SB: NC: US 1 / 2:100:185,68:364:0.258:20:-100.0000:0.0000:0,0,0,0,0,0,1,1,0,0,0,2
        //chrX 79.CG GTG,AA 50 DP = 1370 GT: GQ: AD: DP: VF: NL: SB: NC: US 1 / 2:100:185,68:364:0.258:20:-100.0000:0.0000:0,0,0,0,0,0,1,1,0,0,0,2
        merger.WriteRemainingVariants(writer, alleleTuplesPastNbhd);
    }

    var expectedLines = File.ReadLines(expectedFilePath).ToList();
    var outputLines = File.ReadLines(outputFilePath).ToList();

    Assert.Equal(expectedLines.Count, outputLines.Count);
    for (int i = 0; i < expectedLines.Count; i++)
    {
        Assert.Equal(expectedLines[i], outputLines[i]);
    }
}
/// <summary>
/// Creates a formatter from the given writer configuration.
/// </summary>
/// <param name="Config">Writer configuration; stored in <c>_config</c>.</param>
/// <param name="debugMode">Value assigned to <c>DebugMode</c>.</param>
public VennVcfFormatter(VcfWriterConfig Config, bool debugMode)
{
    _config = Config;

    // Runs after _config has been assigned and before DebugMode is set.
    UpdateFrequencyFormat();

    DebugMode = debugMode;
}
/// <summary>
/// Rewrites <paramref name="inputFile"/> allele by allele into a sibling file whose
/// ".vcf" is replaced by ".crushed.vcf" or ".uncrushed.vcf".
/// </summary>
/// <param name="inputFile">Path of the VCF to read; must contain ".vcf".</param>
/// <param name="crush">
/// When true the writer is configured with <c>AllowMultipleVcfLinesPerLoci = false</c> and
/// the output is "*.crushed.vcf"; otherwise multiple lines per locus are allowed and the
/// output is "*.uncrushed.vcf".
/// </param>
/// <exception cref="ArgumentException">
/// <paramref name="inputFile"/> contains no ".vcf", so the derived output path would be
/// the input file itself.
/// </exception>
public static void DoReformating(string inputFile, bool crush)
{
    var outputFile = inputFile.Replace(".vcf", crush ? ".crushed.vcf" : ".uncrushed.vcf");

    // Replace returns the input unchanged when ".vcf" is absent; continuing would delete
    // the input file below and then try to read it while writing to the same path.
    if (outputFile == inputFile)
    {
        throw new ArgumentException("Input file path must contain \".vcf\": " + inputFile, "inputFile");
    }

    if (crush)
    {
        Console.WriteLine("crushing " + inputFile + "...");
    }
    else
    {
        Console.WriteLine("uncrushing " + inputFile + "...");
    }

    if (File.Exists(outputFile))
    {
        File.Delete(outputFile);
    }

    var config = new VcfWriterConfig() { AllowMultipleVcfLinesPerLoci = !crush };

    using (VcfFileWriter writer = new VcfFileWriter(outputFile, config, new VcfWriterInputContext()))
    {
        writer.WriteHeader();

        using (VcfReader reader = new VcfReader(inputFile, false))
        {
            var backLogVcfVariant = new VcfVariant();
            var backLogExists = reader.GetNextVariant(backLogVcfVariant);

            while (backLogExists)
            {
                // Materialize the alleles before the reader overwrites backLogVcfVariant.
                var backLogAlleles = VcfVariantUtilities.Convert(new List<VcfVariant> { backLogVcfVariant }).ToList();

                foreach (var allele in backLogAlleles)
                {
                    try
                    {
                        writer.Write(new List<CalledAllele>() { allele });
                    }
                    catch (Exception ex)
                    {
                        // Report the failing allele and stop; the using blocks still
                        // dispose the writer and reader.
                        Console.WriteLine("Problem writing " + allele.ToString());
                        Console.WriteLine("Exception: " + ex);
                        return;
                    }
                }

                backLogExists = reader.GetNextVariant(backLogVcfVariant);

                // Guard against an empty conversion result before indexing.
                if (backLogAlleles.Count > 0 && backLogAlleles[0].Chromosome != backLogVcfVariant.ReferenceName)
                {
                    //we have switched to the next chr. flush the buffer.
                    writer.FlushBuffer();
                }
            }

            writer.FlushBuffer();
        }
    }
}