예제 #1
0
        /// <summary>
        /// Integrity check, to ensure that our reference FASTA file is in sync with our inputs.
        /// Every segment must lie on a chromosome whose name matches one of the genome's contigs.
        /// </summary>
        /// <param name="genome">Genome metadata whose contig names define the known chromosomes.</param>
        /// <param name="segments">Segments whose chromosome names are validated.</param>
        /// <exception cref="Exception">
        /// Thrown for the first segment whose chromosome name is not among the genome's contig names.
        /// </exception>
        private static void SanityCheckChromosomeNames(GenomeMetadata genome, IEnumerable<CanvasSegment> segments)
        {
            // Names are matched case-insensitively (ordinal) via the set's comparer, so no
            // lower-cased copy of each name has to be allocated for the comparison.
            var knownChromosomeNames = new HashSet<string>(
                genome.Contigs().Select(chromosome => chromosome.Name),
                StringComparer.OrdinalIgnoreCase);

            foreach (CanvasSegment segment in segments)
            {
                if (!knownChromosomeNames.Contains(segment.Chr))
                {
                    throw new Exception($"Integrity check error: Segment found at unknown chromosome '{segment.Chr}'");
                }
            }
        }
예제 #2
0
        /// <summary>
        /// Computes the included intervals for every autosome and allosome contig of the genome.
        /// </summary>
        /// <param name="filterIntervals">Filter intervals keyed by contig name; contigs without an entry are treated as having no filter intervals.</param>
        /// <param name="genomeMetadata">Genome metadata supplying the contigs and their lengths.</param>
        /// <returns>A dictionary from contig name to the included intervals computed for that contig.</returns>
        private static Dictionary<string, IEnumerable<Interval>> GetIncludedIntervals(Dictionary<string, List<Interval>> filterIntervals, GenomeMetadata genomeMetadata)
        {
            var contigNames = genomeMetadata
                              .Contigs()
                              .Where(contig => contig.Type == GenomeMetadata.SequenceType.Autosome || contig.Type == GenomeMetadata.SequenceType.Allosome)
                              .Select(contig => contig.Name);

            return contigNames
                   .Select(contig =>
                   {
                       // Single dictionary lookup; a contig missing from the filter map gets an empty list.
                       var filteredIntervals = filterIntervals.TryGetValue(contig, out var contigFilterIntervals)
                           ? contigFilterIntervals
                           : new List<Interval>();
                       return (contig, GetIncludedIntervals(filteredIntervals, genomeMetadata.GetSequence(contig).Length));
                   })
                   .ToDictionary();
        }
예제 #3
0
        /// <summary>
        /// Builds a <see cref="CallabilityMetricsComputer"/> from the regions of the filter BED file.
        /// For female samples the Y contig ("chrY" or "Y"), when the genome has one, is removed
        /// from the intervals handed to the calculator.
        /// </summary>
        public static CallabilityMetricsComputer Create(ILogger logger, GenomeMetadata genomeMetadata, IFileLocation filterBed, bool isFemale)
        {
            var includedIntervals = GetIncludedIntervals(LoadBedRegions(filterBed, genomeMetadata), genomeMetadata);

            // Look up the name this genome uses for the Y chromosome, if it has one.
            string yContigName = genomeMetadata
                .Contigs()
                .Select(contig => contig.Name)
                .SingleOrDefault(name => name == "chrY" || name == "Y");

            bool dropYContig = isFemale && yContigName != null;
            if (dropYContig)
            {
                includedIntervals.Remove(yContigName);
            }

            return new CallabilityMetricsComputer(logger, new CallabilityCalculator(includedIntervals));
        }
예제 #4
0
        /// <summary>
        /// Assembles the command-line arguments for a single sample: BAM, sample name, k-mer reference,
        /// genome folder (the directory of the first contig's FASTA), filter BED and output location.
        /// </summary>
        public StringBuilder GetSingleSampleCommandLine(string sampleId, Bam bam, GenomeMetadata genomeMetadata, IDirectoryLocation sampleSandbox)
        {
            var arguments = new StringBuilder()
                .Append($" --bam \"{bam.BamFile}\"")
                .Append($" --sample-name \"{sampleId}\"");

            IFileLocation kmerFasta = _annotationFileProvider.GetKmerFasta(genomeMetadata);
            arguments.Append($" --reference \"{kmerFasta}\"");

            // The genome folder is taken to be wherever the first contig's FASTA file lives.
            var firstContigFastaPath = genomeMetadata.Contigs().First().FastaPath;
            IDirectoryLocation genomeFolder = new FileLocation(firstContigFastaPath).Directory;
            arguments.Append($" --genome-folder \"{genomeFolder}\"");

            IFileLocation filterBed = _annotationFileProvider.GetFilterBed(genomeMetadata);
            return arguments
                .Append($" --filter-bed \"{filterBed}\"")
                .Append($" --output \"{sampleSandbox}\"");
        }
예제 #5
0
        /// <summary>
        /// Outputs the copy number calls to a text file.
        /// </summary>
        /// <remarks>
        /// For each contig of the genome, every sample map whose first segment lies on that contig
        /// (case-insensitive name match) is written as one record. The reference copy number of a
        /// segment falls back to 2 when the corresponding ploidy entry is null.
        /// </remarks>
        private static void WriteVariants(IEnumerable<ISampleMap<CanvasSegment>> segmentsOfAllSamples, List<PloidyInfo> ploidies, GenomeMetadata genome,
                                          BgzipOrStreamWriter writer, int? denovoQualityThreshold = null)
        {
            // Materialized because the collection is re-scanned once per contig below.
            var sampleMaps = segmentsOfAllSamples.ToArray(); // TODO: not necessary when chrom match logic has been updated
            int sampleCount = sampleMaps.First().Values.Count();

            foreach (GenomeMetadata.SequenceMetadata contig in genome.Contigs()) //TODO: this is extremely inefficient. Segments should be sorted by chromosome
            {
                foreach (var sampleMap in sampleMaps)
                {
                    var segments = sampleMap.Values.ToArray();
                    var leadSegment = segments.First();

                    //TODO: this is extremely inefficient. Segments should be sorted by chromosome
                    bool isOnCurrentContig = leadSegment.Chr.Equals(contig.Name, StringComparison.OrdinalIgnoreCase);
                    if (!isOnCurrentContig)
                    {
                        continue;
                    }

                    var sampleFilters = sampleMap
                        .Select(x => x.Value.Filter)
                        .ToReadOnlyList();
                    var recordLevelFilter = CanvasFilter
                        .GetRecordLevelFilterFromSampleFiltersOnly(sampleFilters)
                        .ToVcfString();

                    var referenceCopyNumbers = segments
                        .Zip(ploidies, (segment, ploidy) => ploidy?.GetReferenceCopyNumber(segment) ?? 2)
                        .ToList();

                    // Collect the CNV type and allele copy numbers of each sample's segment.
                    var cnvTypes = new CnvType[sampleCount];
                    var alleleCopyNumbersPerSample = new int[sampleCount][];
                    for (int i = 0; i < sampleCount; i++)
                    {
                        var (cnvType, alleleCopyNumbers) = segments[i].GetCnvTypeAndAlleleCopyNumbers(referenceCopyNumbers[i]);
                        cnvTypes[i] = cnvType;
                        alleleCopyNumbersPerSample[i] = alleleCopyNumbers;
                    }

                    var sampleSetCnvType = AssignCnvType(cnvTypes);
                    var (alternateAllele, genotypes) = GetAltAllelesAndGenotypes(alleleCopyNumbersPerSample);

                    WriteColumnsUntilInfoField(writer, leadSegment, sampleSetCnvType, alternateAllele,
                                               recordLevelFilter, sampleCount > 1);
                    WriteFormatAndSampleFields(writer, segments, genotypes,
                                               denovoQualityThreshold.HasValue);
                }
            }
        }
예제 #6
0
        /// <summary>
        /// Assembles the command-line arguments for a set of pedigree samples: one --bam entry per sample
        /// (ordered by sample type), followed by the k-mer reference, genome folder, filter BED and output location.
        /// </summary>
        public StringBuilder GetMultiSampleCommandLine(SampleSet<CanvasPedigreeSample> samples, GenomeMetadata genomeMetadata, Vcf vcf, IDirectoryLocation sampleSandbox)
        {
            var arguments = new StringBuilder();

            // move proband to the front of collection (enum Proband gets the lowest int value )
            foreach (var entry in samples.OrderBy(sample => sample.Value.SampleType))
            {
                var id = entry.Key.Id;
                var pedigreeSample = entry.Value;
                arguments.Append($" --bam \"{pedigreeSample.Bam.BamFile}\" {pedigreeSample.SampleType} {id}");
            }

            IFileLocation kmerFasta = _annotationFileProvider.GetKmerFasta(genomeMetadata);
            arguments.Append($" --reference \"{kmerFasta}\"");

            // The genome folder is taken to be wherever the first contig's FASTA file lives.
            var firstContigFastaPath = genomeMetadata.Contigs().First().FastaPath;
            IDirectoryLocation genomeFolder = new FileLocation(firstContigFastaPath).Directory;
            arguments.Append($" --genome-folder \"{genomeFolder}\"");

            IFileLocation filterBed = _annotationFileProvider.GetFilterBed(genomeMetadata);
            return arguments
                .Append($" --filter-bed \"{filterBed}\"")
                .Append($" --output \"{sampleSandbox}\"");
        }
예제 #7
0
        /// <summary>
        /// Writes the VCF header (meta-information lines and the column title line) to <paramref name="writer"/>.
        /// The genome metadata is loaded from "GenomeSize.xml" in <paramref name="wholeGenomeFastaDirectory"/>,
        /// used for the ##contig lines, checked against the chromosome names of <paramref name="segments"/>,
        /// and returned to the caller.
        /// </summary>
        private static GenomeMetadata WriteVcfHeader(List<CanvasSegment> segments, double? diploidCoverage,
                                                     string wholeGenomeFastaDirectory, List<string> sampleNames, List<string> extraHeaders, BgzipOrStreamWriter writer, int qualityThreshold,
                                                     int? denovoQualityThreshold, int? sizeThreshold)
        {
            // File-level header lines.
            writer.WriteLine("##fileformat=VCFv4.1");
            writer.WriteLine($"##source={CanvasVersionInfo.NameString} {CanvasVersionInfo.VersionString}");
            writer.WriteLine($"##reference={Path.Combine(wholeGenomeFastaDirectory, "genome.fa")}");

            // ##OverallPloidy and ##DiploidCoverage only make sense for a single-sample file.
            bool isSingleSample = sampleNames.Count == 1;
            if (isSingleSample)
            {
                AddPloidyAndCoverageHeaders(writer, segments, diploidCoverage);
            }

            if (extraHeaders != null)
            {
                foreach (string extraHeader in extraHeaders)
                {
                    writer.WriteLine(extraHeader);
                }
            }

            // One ##contig line per sequence listed in GenomeSize.xml.
            var genome = new GenomeMetadata();
            var genomeSizeXml = new FileLocation(Path.Combine(wholeGenomeFastaDirectory, "GenomeSize.xml"));
            genome.Deserialize(genomeSizeXml);
            foreach (GenomeMetadata.SequenceMetadata contig in genome.Contigs())
            {
                writer.WriteLine($"##contig=<ID={contig.Name},length={contig.Length}>");
            }

            // ALT and FILTER definitions.
            writer.WriteLine("##ALT=<ID=DUP,Description=\"Region of elevated copy number relative to the reference\">");
            WriteHeaderAllAltCnTags(writer);
            writer.WriteLine($"##FILTER=<ID=q{qualityThreshold},Description=\"Quality below {qualityThreshold}\">");
            if (sizeThreshold.HasValue)
            {
                string sizeFilterName = CanvasFilter.GetCnvSizeFilter(sizeThreshold.Value, out var sizeFilterThreshold);
                writer.WriteLine($"##FILTER=<ID={sizeFilterName},Description=\"Length shorter than {sizeFilterThreshold.Number} {sizeFilterThreshold.Units}\">");
            }

            // Header lines that do not depend on any argument, in output order.
            string[] fixedHeaderLines =
            {
                "##FILTER=<ID=FailedFT,Description=\"Sample-level filter failed in all the samples\">",
                "##INFO=<ID=CIEND,Number=2,Type=Integer,Description=\"Confidence interval around END for imprecise variants\">",
                "##INFO=<ID=CIPOS,Number=2,Type=Integer,Description=\"Confidence interval around POS for imprecise variants\">",
                "##INFO=<ID=CNVLEN,Number=1,Type=Integer,Description=\"Number of reference positions spanned by this CNV\">",
                "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">",
                "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">",
                "##INFO=<ID=SUBCLONAL,Number=0,Type=Flag,Description=\"Subclonal variant\">",
                "##INFO=<ID=COMMONCNV,Number=0,Type=Flag,Description=\"Common CNV variant identified from pre-specified bed intervals\">",
                "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
                "##FORMAT=<ID=RC,Number=1,Type=Float,Description=\"Mean counts per bin in the region\">",
                "##FORMAT=<ID=BC,Number=1,Type=Float,Description=\"Number of bins in the region\">",
                "##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number genotype for imprecise events\">",
                "##FORMAT=<ID=MCC,Number=1,Type=Integer,Description=\"Major chromosome count (equal to copy number for LOH regions)\">",
                "##FORMAT=<ID=MCCQ,Number=1,Type=Float,Description=\"Major chromosome count quality score\">",
                "##FORMAT=<ID=QS,Number=1,Type=Float,Description=\"Phred-scaled quality score. If CN is reference then this is -10log10(prob(variant)) otherwise this is -10log10(prob(no variant).\">"
            };
            foreach (string fixedHeaderLine in fixedHeaderLines)
            {
                writer.WriteLine(fixedHeaderLine);
            }

            // The DQ field is declared only when a de novo quality threshold was supplied.
            if (denovoQualityThreshold.HasValue)
            {
                writer.WriteLine($"##FORMAT=<ID=DQ,Number=1,Type=Float,Description=\"De novo quality. Threshold for passing de novo call: {denovoQualityThreshold}\">");
            }
            writer.WriteLine("##FORMAT=<ID=FT,Number=1,Type=String,Description=\"Sample filter, 'PASS' indicates that all filters have passed for this sample\">");

            // Column title line: the fixed VCF columns followed by one column per sample.
            var fixedColumns = new[] { "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT" };
            writer.WriteLine(string.Join("\t", fixedColumns.Concat(sampleNames)));

            SanityCheckChromosomeNames(genome, segments);
            return genome;
        }
예제 #8
0
 /// <summary>
 /// Obtains the reference genome for the directory that contains the first contig's FASTA file.
 /// </summary>
 private IReferenceGenome GetReferenceGenomeFromGenomeMetadata(GenomeMetadata genomeMetadata)
 {
     var firstContigFastaPath = genomeMetadata.Contigs().First().FastaPath;
     var genomeDirectory = new DirectoryLocation(Path.GetDirectoryName(firstContigFastaPath));

     return _referenceGenomeFactory.GetReferenceGenome(genomeDirectory);
 }
예제 #9
0
        /// <summary>
        /// Reports whether the genome is supported: the species of its first contig must be
        /// "Homo_sapiens" (compared case-insensitively). A genome without contigs is not supported.
        /// </summary>
        public bool IsSupported(GenomeMetadata genome)
        {
            var firstContig = genome.Contigs().FirstOrDefault();

            return string.Equals(firstContig?.Species, "Homo_sapiens", StringComparison.OrdinalIgnoreCase);
        }