Ejemplo n.º 1
0
        // Need to filter VCF by FILTER = PASS; there are several reasons they don't accept calls that I trust
        // There's an attribute "ZYG" for zygosity, either "het" or "homo" for heterozygous or homozygous
        /// <summary>
        /// Builds the shell commands that call indels with scalpel-discovery (single-sample mode)
        /// and then hand the resulting VCF to <c>VcfToolsWrapper.RemoveAllSnvs</c>.
        /// </summary>
        /// <remarks>
        /// Sets <c>IndelVcfPath</c> to "variants.indel.vcf" inside <paramref name="outdir"/> and
        /// <c>FilteredIndelVcfPath</c> to the wrapper's <c>VcfWithoutSnvsPath</c>. The scalpel
        /// invocation is wrapped in a shell test so it only runs when that VCF is missing or empty.
        /// </remarks>
        /// <param name="spritzDirectory">Directory passed to the installation check, the change-directory command and the SNV-removal command.</param>
        /// <param name="threads">Value given to scalpel's <c>--numprocs</c> option.</param>
        /// <param name="genomeFastaP">Reference FASTA given to scalpel's <c>--ref</c> option.</param>
        /// <param name="bedPath">BED file given to scalpel's <c>--bed</c> option.</param>
        /// <param name="bamPath">BAM file given to scalpel's <c>--bam</c> option.</param>
        /// <param name="outdir">Directory given to scalpel's <c>--dir</c> option; the indel VCF is expected there.</param>
        /// <returns>The shell commands, in execution order.</returns>
        public List <string> CallIndels(string spritzDirectory, int threads, string genomeFastaP, string bedPath, string bamPath, string outdir)
        {
            CheckInstallation(spritzDirectory);
            var snvRemover = new VcfToolsWrapper();
            IndelVcfPath = Path.Combine(outdir, "variants.indel.vcf");

            var script = new List<string>();
            script.Add(WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory));

            // Only run scalpel when the indel VCF does not exist yet or is empty.
            string indelVcf = WrapperUtility.ConvertWindowsPath(IndelVcfPath);
            script.Add($"if [[ ! -f {indelVcf} || ! -s {indelVcf} ]]; then ");
            script.Add(
                $"  scalpel-{ScalpelVersion}/scalpel-discovery --single" +
                $" --bam {WrapperUtility.ConvertWindowsPath(bamPath)}" +
                $" --ref {WrapperUtility.ConvertWindowsPath(genomeFastaP)}" +
                $" --bed {WrapperUtility.ConvertWindowsPath(bedPath)}" +
                $" --numprocs {threads}" +
                $" --dir {WrapperUtility.ConvertWindowsPath(outdir)}");

            // NOTE: scalpel uses 0-indexing, where SnpEff uses 1-indexing. An awk step that
            // incremented the POS column of every non-header line into a separate 1-indexed VCF
            // is currently disabled, so this method does not adjust positions itself.
            script.Add("fi");

            // vcf-concat doesn't keep all INFO header lines, so just dump the INFO from each variant.
            script.Add(snvRemover.RemoveAllSnvs(spritzDirectory, IndelVcfPath, false, true));

            FilteredIndelVcfPath = snvRemover.VcfWithoutSnvsPath;
            return script;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds the shell commands that run GATK HaplotypeCaller on a single BAM file, genotype the
        /// resulting GVCF into a traditional VCF, and write a copy of that VCF with indels excluded.
        /// </summary>
        /// <remarks>
        /// Sets <c>HaplotypeCallerGvcfPath</c> (".g.vcf.gz"), <c>HaplotypeCallerVcfPath</c> (".g.gt.vcf")
        /// and <c>FilteredHaplotypeCallerVcfPath</c> (".g.gt.NoIndels.vcf"); all three are placed in the
        /// directory of <paramref name="splitTrimBam"/> and share its base file name. Every GATK step is
        /// wrapped in a shell test so that it is skipped when its output already exists and is non-empty,
        /// and every index step is skipped when the index file already exists.
        /// </remarks>
        /// <param name="spritzDirectory">Directory passed to the change-to-tools-directory command.</param>
        /// <param name="experimentType">When <c>ExperimentType.RNASequencing</c>, HaplotypeCaller is told not to use soft-clipped bases.</param>
        /// <param name="threads">Value given to HaplotypeCaller's <c>--native-pair-hmm-threads</c> option.</param>
        /// <param name="genomeFasta">Reference FASTA (<c>-R</c>); FASTA-index and dictionary commands for it are emitted first.</param>
        /// <param name="splitTrimBam">Input BAM (<c>-I</c>); also determines the location and names of the output files.</param>
        /// <param name="dbsnpReferenceVcfPath">VCF given to <c>--dbsnp</c>; indexed with IndexFeatureFile when its ".idx" file is missing.</param>
        /// <returns>The shell commands, in execution order.</returns>
        public List <string> VariantCalling(string spritzDirectory, ExperimentType experimentType, int threads, string genomeFasta, string splitTrimBam, string dbsnpReferenceVcfPath)
        {
            // All outputs are written beside the input BAM and reuse its base name.
            string bamDirectory = Path.GetDirectoryName(splitTrimBam);
            string bamBaseName = Path.GetFileNameWithoutExtension(splitTrimBam);
            HaplotypeCallerGvcfPath = Path.Combine(bamDirectory, bamBaseName + ".g.vcf.gz");
            HaplotypeCallerVcfPath = Path.Combine(bamDirectory, bamBaseName + ".g.gt.vcf");
            FilteredHaplotypeCallerVcfPath = Path.Combine(bamDirectory, bamBaseName + ".g.gt.NoIndels.vcf");

            // Shell-side spellings of every path used in the commands below.
            string reference = WrapperUtility.ConvertWindowsPath(genomeFasta);
            string bam = WrapperUtility.ConvertWindowsPath(splitTrimBam);
            string dbsnp = WrapperUtility.ConvertWindowsPath(dbsnpReferenceVcfPath);
            string gvcf = WrapperUtility.ConvertWindowsPath(HaplotypeCallerGvcfPath);
            string gvcfIndex = WrapperUtility.ConvertWindowsPath(HaplotypeCallerGvcfPath + ".tbi");
            string vcf = WrapperUtility.ConvertWindowsPath(HaplotypeCallerVcfPath);
            string vcfIndex = WrapperUtility.ConvertWindowsPath(HaplotypeCallerVcfPath + ".idx");
            string filteredVcf = WrapperUtility.ConvertWindowsPath(FilteredHaplotypeCallerVcfPath);
            string filteredVcfIndex = WrapperUtility.ConvertWindowsPath(FilteredHaplotypeCallerVcfPath + ".idx");

            // Soft-clipped bases are only excluded for RNA-Seq experiments.
            string softClipOption = experimentType == ExperimentType.RNASequencing
                ? " --dont-use-soft-clipped-bases true"
                : "";

            return new List<string>
            {
                WrapperUtility.ChangeToToolsDirectoryCommand(spritzDirectory),
                SamtoolsWrapper.GenomeFastaIndexCommand(genomeFasta),
                GenomeDictionaryIndexCommand(genomeFasta),

                // Make sure the reference VCF is indexed.
                $"if [ ! -f {dbsnp}.idx ]; then " + Gatk(Workers) + $" IndexFeatureFile -F {dbsnp}; fi",

                // Call variants into a compressed GVCF.
                $"if [ ! -f {gvcf} ] || [  ! -s {gvcf} ]; then " +
                    Gatk(Workers, 2) +
                    " HaplotypeCaller" +
                    $" --native-pair-hmm-threads {threads}" +
                    $" -R {reference}" +
                    $" -I {bam}" +
                    " --min-base-quality-score 20" +
                    softClipOption +
                    $" --dbsnp {dbsnp}" +
                    $" -O {gvcf}" +
                    " -ERC GVCF" +            // this prompts phasing!
                    " --max-mnp-distance 3" + // note: this can't be used for joint genotyping here, but this setting is available in mutect2 for doing tumor vs normal calls
                    "; fi",

                // Index the compressed GVCF.
                $"if [ ! -f {gvcfIndex} ]; then " + Gatk(Workers) + $" IndexFeatureFile -F {gvcf}; fi",

                // Genotype the GVCF into a traditional VCF, then index it.
                $"if [ ! -f {vcf} ] || [  ! -s {vcf} ]; then " +
                    Gatk(Workers, 2) +
                    " GenotypeGVCFs" +
                    $" -R {reference}" +
                    $" -V {gvcf}" +
                    $" -O {vcf}" +
                    "; fi",
                $"if [ ! -f {vcfIndex} ]; then " + Gatk(Workers) + $" IndexFeatureFile -F {vcf}; fi",

                // Write a copy of the VCF with indels excluded, then index it.
                $"if [ ! -f {filteredVcf} ] || [  ! -s {filteredVcf} ]; then " +
                    Gatk(Workers, 2) +
                    " SelectVariants" +
                    " --select-type-to-exclude INDEL" +
                    $" -R {reference}" +
                    $" -V {vcf}" +
                    $" -O {filteredVcf}" +
                    "; fi",
                $"if [ ! -f {filteredVcfIndex} ]; then " + Gatk(Workers) + $" IndexFeatureFile -F {filteredVcf}; fi",

                // TODO: variant filtration is not applied here. The previously sketched step used
                // RNA-Seq specific settings (VariantFiltration with -window 35 -cluster 3 to drop
                // clusters of 3 SNPs within 35 bases, plus filters "FS > 30.0" and "QD < 2.0");
                // the recommendations need to be checked before using it for DNA-Seq.
            };
        }