Exemplo n.º 1
0
        /// <summary>
        /// Reads every file listed in <c>_options.ForcedAllelesFileNames</c> and records each
        /// accepted variant in <c>_forcedAllelesByChrom</c> as a
        /// (chromosome, position, reference allele, alternate allele) tuple, grouped by chromosome.
        /// </summary>
        /// <remarks>
        /// Returns immediately when no forced-allele files are configured. Reference and alternate
        /// alleles are upper-cased before they are validated and stored. Variants rejected by
        /// <c>IsValidAlt</c> are written to the log and skipped.
        /// </remarks>
        private void GetForcedAlleles()
        {
            if (_options.ForcedAllelesFileNames == null || _options.ForcedAllelesFileNames.Count == 0)
            {
                return;
            }

            foreach (var fileName in _options.ForcedAllelesFileNames)
            {
                using (var reader = new AlleleReader(fileName, false, false))
                {
                    foreach (var variant in reader.GetVariants())
                    {
                        var chr = variant.Chromosome;
                        var pos = variant.ReferencePosition;

                        // Allele strings are machine-readable data, so normalise their case with the
                        // invariant culture; ToUpper() would depend on the current thread culture.
                        var refAllele = variant.ReferenceAllele.ToUpperInvariant();
                        var altAllele = variant.AlternateAllele.ToUpperInvariant();

                        // The per-chromosome set is created before validation, so a chromosome
                        // whose variants are all rejected still gets an (empty) entry.
                        if (!_forcedAllelesByChrom.ContainsKey(chr))
                        {
                            _forcedAllelesByChrom[chr] = new HashSet<Tuple<string, int, string, string>>();
                        }

                        if (!IsValidAlt(altAllele, refAllele))
                        {
                            Logger.WriteToLog($"Invalid forced genotyping variant: {variant}");
                            continue;
                        }

                        _forcedAllelesByChrom[chr].Add(new Tuple<string, int, string, string>(chr, pos, refAllele, altAllele));
                    }
                }
            }
        }
Exemplo n.º 2
0
        // Reads all variants from VcfTestFile_1 and checks the total number of records and
        // the reference positions of the first and last ones.
        public void GetVariantsTests()
        {
            // The reader is disposable; scope it so it is released even when an assertion fails.
            using (var vr = new AlleleReader(VcfTestFile_1))
            {
                var allVar = vr.GetVariants().ToList();

                Assert.Equal(24, allVar.Count);
                Assert.Equal(10, allVar.First().ReferencePosition);
                Assert.Equal(4000, allVar.Last().ReferencePosition);
            }
        }
Exemplo n.º 3
0
        // Tests two bams in different folders.
        // Expectations:
        // - if outputFolder is not specified, logs are in the directory of the first bam
        // - if outputFolder is specified, logs are in the output folder
        // - vcf files have a header and both chromosomes; output is where normally expected
        //
        // numberOfThreads is passed straight to GenomeProcessor.Execute.
        // outputFolder (optional) becomes options.OutputDirectory and, when non-empty, is also
        // appended to the command-line arguments as "-OutFolder".
        private void ExecuteTest(int numberOfThreads, string outputFolder = null)
        {
            var sourcePath         = Path.Combine(TestPaths.LocalTestDataDirectory, "Chr17Chr19.bam");
            var otherTestDirectory = Path.Combine(TestPaths.LocalScratchDirectory, "MultiProcessIn");

            // Stage is defined elsewhere in the test class; presumably it places a copy of the
            // source bam in each target directory - TODO confirm.
            var bamFilePath1 = Stage(sourcePath, "In1", otherTestDirectory + "1");
            var bamFilePath2 = Stage(sourcePath, "In2", otherTestDirectory + "2");

            var genomePath = Path.Combine(TestPaths.SharedGenomesDirectory, "chr17chr19");

            // Only mention -OutFolder on the command line when a folder was actually supplied.
            var outFolderArgument = string.IsNullOrEmpty(outputFolder) ? string.Empty : " -OutFolder " + outputFolder;

            var options = new PiscesApplicationOptions
            {
                BAMPaths             = new[] { bamFilePath1, bamFilePath2 },
                GenomePaths          = new[] { genomePath },
                OutputDirectory      = outputFolder,
                CommandLineArguments = string.Format("-B {0},{1} -g {2}{3} -gVCF false", bamFilePath1, bamFilePath2, genomePath, outFolderArgument).Split(' '),
                VcfWritingParameters = new VcfWritingParameters()
                {
                    OutputGvcfFile = true
                }
            };

            options.SetIODirectories("Pisces");
            var factory = new Factory(options);

            // Remove output left over from a previous run so the assertions below see fresh files.
            foreach (var workRequest in factory.WorkRequests)
            {
                if (File.Exists(workRequest.OutputFilePath))
                {
                    File.Delete(workRequest.OutputFilePath);
                }
            }

            Logger.OpenLog(options.LogFolder, options.LogFileName, true);
            try
            {
                var processor = new GenomeProcessor(factory, factory.GetReferenceGenome(options.GenomePaths[0]), false, true);

                processor.Execute(numberOfThreads);
            }
            finally
            {
                // Close the log even when processing throws, so a failure here does not leave
                // the log open for whatever runs next.
                Logger.CloseLog();
            }

            foreach (var workRequest in factory.WorkRequests)
            {
                using (var reader = new AlleleReader(workRequest.OutputFilePath))
                {
                    Assert.True(reader.HeaderLines.Any());
                    var variants = reader.GetVariants().ToList();

                    Assert.Equal(251, variants.Count);
                    Assert.Equal("chr17", variants.First().Chromosome);
                    Assert.Equal("chr19", variants.Last().Chromosome);
                }
            }

            Assert.True(Directory.GetFiles(options.LogFolder, options.LogFileNameBase).Any());
        }
        // Exercises empty interval files in two passes over the same options object:
        //   1. two bams, the first with a populated interval file and the second with an empty one;
        //   2. the same two bams with only the empty interval file configured.
        // throttle is forwarded as the third GenomeProcessor constructor argument.
        private void ExecuteEmptyIntervalsTest(bool throttle)
        {
            var dataDirectory = TestPaths.LocalTestDataDirectory;

            var primaryBam         = Path.Combine(dataDirectory, "Chr17Chr19.bam");
            var secondaryBam       = Path.Combine(dataDirectory, "Chr17Chr19_removedSQlines.bam");
            var genomeFolder       = Path.Combine(TestPaths.SharedGenomesDirectory, "chr17chr19");
            var populatedIntervals = Path.Combine(dataDirectory, "chr17only.picard");
            var blankIntervals     = Path.Combine(dataDirectory, "empty.picard");

            var options = new PiscesApplicationOptions
            {
                BAMPaths             = new[] { primaryBam, secondaryBam },
                IntervalPaths        = new [] { populatedIntervals, blankIntervals },
                GenomePaths          = new[] { genomeFolder },
                OutputDirectory      = Path.Combine(dataDirectory, "EmptyIntervalsTest_Mixed"),
                VcfWritingParameters = new Domain.Options.VcfWritingParameters()
                {
                    OutputGvcfFile = true
                }
            };

            // ---- pass 1: one populated interval file, one empty ----
            var mixedFactory = new Factory(options);
            new GenomeProcessor(mixedFactory, mixedFactory.GetReferenceGenome(genomeFolder), throttle).Execute(2);

            // The first vcf is expected to contain the regular calls.
            var firstVcfPath = mixedFactory.WorkRequests.First().OutputFilePath;
            using (var firstVcf = new AlleleReader(firstVcfPath))
            {
                Assert.Equal(11, firstVcf.GetVariants().Count());
            }

            // The second vcf is expected to contain no variants.
            var lastVcfPath = mixedFactory.WorkRequests.Last().OutputFilePath;
            using (var lastVcf = new AlleleReader(lastVcfPath))
            {
                Assert.Equal(0, lastVcf.GetVariants().Count());
            }

            // ---- pass 2: only the empty interval file ----
            options.IntervalPaths   = new[] { blankIntervals };
            options.OutputDirectory = Path.Combine(TestPaths.LocalTestDataDirectory, "EmptyIntervalsTest_All");

            var emptyFactory = new Factory(options);
            new GenomeProcessor(emptyFactory, emptyFactory.GetReferenceGenome(genomeFolder), throttle).Execute(2);

            // Every vcf produced in this pass is expected to contain no variants.
            foreach (var request in emptyFactory.WorkRequests)
            {
                using (var vcf = new AlleleReader(request.OutputFilePath))
                {
                    Assert.Equal(0, vcf.GetVariants().Count());
                }
            }
        }
        // Tests that we get the same results, in the same order, when using multiple samples and intervals.
        // First runs the two samples together, then runs the two samples individually, then runs the
        // two samples together again through a processor built with an explicit third argument
        // (described below as the "thread by chr" way).
        // Nothing strange should happen.
        //
        // Each round writes the same two output vcf files in outDir, reads them back, compares them
        // via CheckForOrdering (defined elsewhere) and deletes output before the next round, so the
        // statement order in this method matters.
        public void IntervalTestingWithMultipleSamples()                                                             //based on a real bug where a gvcf was found to be out of order; it only happened for multiple-bam runs with different interval files.
        {
            var bamFile1Path     = Path.Combine(TestPaths.LocalTestDataDirectory, "Chr17Chr19.bam");                 //has data from chr17,7572952 and chr19,3118883
            var bamFile2Path     = Path.Combine(TestPaths.LocalTestDataDirectory, "Chr17again.bam");
            var interval1Path    = Path.Combine(TestPaths.LocalTestDataDirectory, "chr17int.picard");                //chr 17 only
            var interval2Path    = Path.Combine(TestPaths.LocalTestDataDirectory, "poorlyOrdered.picard");           //disordered, chr 19 first.
            var outDir           = Path.Combine(TestPaths.LocalTestDataDirectory, "IntervalTests");
            var vcfFile1Path     = Path.Combine(outDir, "Chr17Chr19.genome.vcf");                                    //only results from chr17
            var vcfFile2Path     = Path.Combine(outDir, "Chr17again.genome.vcf");                                    //show results from chr17 and 19
            var vcfExpectedFile1 = Path.Combine(TestPaths.LocalTestDataDirectory, "Chr17Chr19.expected.genome.vcf"); //only results from chr17
            var vcfExpectedFile2 = Path.Combine(TestPaths.LocalTestDataDirectory, "Chr17again.expected.genome.vcf"); //show results from chr17 and 19


            // One factory for the combined two-sample run and one per sample for the individual
            // runs; all three write into the same outDir.
            var genomeDirectory  = Path.Combine(TestPaths.SharedGenomesDirectory, "fourChrs");
            var twoSampleFactory = MakeFactory(new List <string> {
                bamFile1Path, bamFile2Path
            },
                                               new List <string> {
                interval1Path, interval2Path
            }, outDir);

            var firstSampleFactory = MakeFactory(new List <string> {
                bamFile1Path
            },
                                                 new List <string> {
                interval1Path
            }, outDir);

            var secondSampleFactory = MakeFactory(new List <string> {
                bamFile2Path
            },
                                                  new List <string> {
                interval2Path
            }, outDir);


            //regular two-sample run mode.

            var genome  = twoSampleFactory.GetReferenceGenome(genomeDirectory);
            var genome1 = firstSampleFactory.GetReferenceGenome(genomeDirectory);
            var genome2 = secondSampleFactory.GetReferenceGenome(genomeDirectory);

            var processor = new GenomeProcessor(twoSampleFactory, genome);

            var chrs = genome.ChromosomesToProcess;

            // The chromosome list must have the same order before and after execution.
            Assert.Equal("chr7", chrs[0]);
            Assert.Equal("chr8", chrs[1]);
            Assert.Equal("chr17", chrs[2]);
            Assert.Equal("chr19", chrs[3]);

            processor.InternalExecute(10);
            chrs = genome.ChromosomesToProcess;
            Assert.Equal("chr7", chrs[0]);
            Assert.Equal("chr8", chrs[1]);
            Assert.Equal("chr17", chrs[2]);
            Assert.Equal("chr19", chrs[3]);

            //just be aware, when we process the samples individually, we use different genome lists.
            Assert.Equal(4, genome.ChromosomesToProcess.Count);
            Assert.Equal(1, genome1.ChromosomesToProcess.Count);
            Assert.Equal(4, genome2.ChromosomesToProcess.Count);
            Assert.Equal("chr17", genome1.ChromosomesToProcess[0]);
            Assert.Equal("chr7", genome2.ChromosomesToProcess[0]);
            Assert.Equal("chr19", genome2.ChromosomesToProcess[3]);

            var reader1 = new AlleleReader(vcfFile1Path);
            var reader2 = new AlleleReader(vcfFile2Path);

            var contigs1Results = GetContigs(reader1);
            var contigs2Results = GetContigs(reader2);
            var vcf1Results     = reader1.GetVariants().ToList();
            var vcf2Results     = reader2.GetVariants().ToList();


            //the expected results:
            // NOTE(review): readerExp1 and readerExp2 are never disposed in this method, unlike
            // reader1/reader2 - confirm whether that is intentional.
            var readerExp1 = new AlleleReader(vcfExpectedFile1);
            var readerExp2 = new AlleleReader(vcfExpectedFile2);

            var contigs1Expected = GetContigs(readerExp1);
            var contigs2Expected = GetContigs(readerExp2);
            var vcf1Expected     = readerExp1.GetVariants().ToList();
            var vcf2Expected     = readerExp2.GetVariants().ToList();

            Assert.Equal(4, contigs1Results.Count);
            Assert.Equal(4, contigs2Results.Count);
            Assert.Equal(11, vcf1Results.Count);
            Assert.Equal(71, vcf2Results.Count);

            //check variants and contigs all come out the same
            CheckForOrdering(contigs1Results, contigs2Results, contigs1Expected, contigs2Expected, vcf1Expected, vcf2Expected);

            // Dispose the readers before deleting the files they were opened on.
            reader1.Dispose();
            reader2.Dispose();
            File.Delete(vcfFile1Path);
            File.Delete(vcfFile2Path);

            //now check again, processing them separately
            processor = new GenomeProcessor(firstSampleFactory, genome1);
            processor.InternalExecute(10);
            processor = new GenomeProcessor(secondSampleFactory, genome2);
            processor.InternalExecute(10);

            reader1 = new AlleleReader(vcfFile1Path);
            reader2 = new AlleleReader(vcfFile2Path);

            contigs1Results = GetContigs(reader1);
            contigs2Results = GetContigs(reader2);
            // NOTE(review): vcf1Results/vcf2Results are reassigned here but not read again in this
            // round; CheckForOrdering below receives vcf1Expected/vcf2Expected - confirm whether the
            // observed variants were meant to be compared.
            vcf1Results     = reader1.GetVariants().ToList();
            vcf2Results     = reader2.GetVariants().ToList();

            //check variants all come out the same (the contigs will be different as shown)
            CheckForOrdering(contigs1Results, contigs2Results,
                             new List <string>()
            {
                "chr17"
            }, contigs2Expected, vcf1Expected, vcf2Expected);

            reader1.Dispose();
            reader2.Dispose();
            // NOTE(review): only vcfFile1Path is deleted here; vcfFile2Path is left in place before
            // the next run - confirm this is intentional.
            File.Delete(vcfFile1Path);

            //now check again, processing them "thread by chr" way
            // NOTE(review): the third constructor argument is passed as false here - confirm that
            // this is what selects the "thread by chr" mode.
            processor = new GenomeProcessor(twoSampleFactory, genome, false);
            processor.InternalExecute(10);

            reader1 = new AlleleReader(vcfFile1Path);
            reader2 = new AlleleReader(vcfFile2Path);

            contigs1Results = GetContigs(reader1);
            contigs2Results = GetContigs(reader2);
            // NOTE(review): as in the previous round, these two lists are not read again below.
            vcf1Results     = reader1.GetVariants().ToList();
            vcf2Results     = reader2.GetVariants().ToList();

            //check variants all come out the same (the contigs will be back to normal)
            CheckForOrdering(contigs1Results, contigs2Results,
                             contigs2Expected, contigs2Expected, vcf1Expected, vcf2Expected);

            // Final cleanup: release the readers, then remove both generated vcf files.
            reader1.Dispose();
            reader2.Dispose();
            File.Delete(vcfFile1Path);
            File.Delete(vcfFile2Path);
        }
        // Runs a single bam with a chr17-only interval file through a factory built by
        // MakeVcfFactory, then compares the header filters, header contigs and variant records
        // of the produced vcf with those of a stored expected vcf.
        public void IntervalTestingWithVcf()
        {
            var bamFile1Path         = Path.Combine(TestPaths.LocalTestDataDirectory, "Chr17Chr19.bam");          //has data from chr17,7572952 and chr19,3118883
            var interval1Path        = Path.Combine(TestPaths.LocalTestDataDirectory, "chr17int.picard");         //chr 17 only
            var outDir               = Path.Combine(TestPaths.LocalTestDataDirectory, "IntervalTests");
            var vcfObservedFile1Path = Path.Combine(outDir, "Chr17Chr19.vcf");                                    //only results from chr17
            var vcfExpectedFile1     = Path.Combine(TestPaths.LocalTestDataDirectory, "Chr17Chr19.expected.vcf"); //only results from chr17

            var genomeDirectory = Path.Combine(TestPaths.SharedGenomesDirectory, "fourChrs");

            var factory = MakeVcfFactory(new List<string> { bamFile1Path },
                                         new List<string> { interval1Path },
                                         outDir);

            var genome1 = factory.GetReferenceGenome(genomeDirectory);

            var processor = new GenomeProcessor(factory, genome1);
            var chrs      = genome1.ChromosomesToProcess;

            Assert.Equal("chr17", chrs[0]);

            processor.InternalExecute(10);

            // The chromosome list is checked again after execution.
            Assert.Equal(1, genome1.ChromosomesToProcess.Count);
            Assert.Equal("chr17", genome1.ChromosomesToProcess[0]);

            // Both readers are scoped so they are disposed even when an assertion fails;
            // previously the expected-file reader was never disposed at all.
            using (var reader1 = new AlleleReader(vcfObservedFile1Path))
            using (var readerExp1 = new AlleleReader(vcfExpectedFile1))
            {
                var observedFilters1Results = GetFilters(reader1);
                var observedContigs1Results = GetContigs(reader1);
                var observedVcf1Results     = reader1.GetVariants().ToList();

                //the expected results:
                var filters1Expected = GetFilters(readerExp1);
                var contigs1Expected = GetContigs(readerExp1);
                var vcf1Expected     = readerExp1.GetVariants().ToList();

                /*
                 ##FILTER=<ID=q30,Description="Quality score less than 30">
                 ##FILTER=<ID=SB,Description="Variant strand bias too high">
                 ##FILTER=<ID=R5x9,Description="Repeats of part or all of the variant allele (max repeat length 5) in the reference greater than or equal to 9">
                 ##FILTER=<ID=NC,Description="No-call rate is above 0.6">
                 * */

                Assert.Equal(4, observedFilters1Results.Count);

                //##contig=<ID=chr17,length=7573100>
                Assert.Equal(1, observedContigs1Results.Count);
                Assert.Equal(1, observedVcf1Results.Count);

                //check variants and contigs all come out the same
                for (int i = 0; i < contigs1Expected.Count; i++)
                {
                    Assert.Equal(contigs1Expected[i], observedContigs1Results[i]);
                }

                // Filters and variants are compared through their string representations.
                for (int i = 0; i < filters1Expected.Count; i++)
                {
                    Assert.Equal(filters1Expected[i].ToString(), observedFilters1Results[i].ToString());
                }

                for (int i = 0; i < vcf1Expected.Count; i++)
                {
                    Assert.Equal(vcf1Expected[i].ToString(), observedVcf1Results[i].ToString());
                }
            }

            // Delete the generated vcf only after the reader on it has been disposed.
            File.Delete(vcfObservedFile1Path);
        }