/// <summary>
/// Integrity check, to ensure that our reference FASTA file is in sync with our inputs.
/// </summary>
/// <param name="genome">Genome metadata whose contig names form the set of accepted chromosome names.</param>
/// <param name="segments">Segments whose <c>Chr</c> must each match one of the contig names, ignoring case.</param>
/// <exception cref="Exception">
/// Thrown for the first segment whose chromosome is not a contig of <paramref name="genome"/>.
/// </exception>
private static void SanityCheckChromosomeNames(GenomeMetadata genome, IEnumerable<CanvasSegment> segments)
{
    // An ordinal, case-insensitive set replaces lower-casing every name: no per-name string
    // allocation, and a null name is reported as "unknown" instead of failing with a
    // NullReferenceException.
    var chromosomeNames = new HashSet<string>(
        genome.Contigs().Select(chromosome => chromosome.Name),
        StringComparer.OrdinalIgnoreCase);

    foreach (CanvasSegment segment in segments)
    {
        if (!chromosomeNames.Contains(segment.Chr))
        {
            throw new Exception($"Integrity check error: Segment found at unknown chromosome '{segment.Chr}'");
        }
    }
}
/// <summary>
/// Builds a per-contig dictionary for every autosome and allosome contig of the genome.
/// Each value is the result of the per-contig <c>GetIncludedIntervals</c> overload, called with
/// that contig's filter intervals and the contig's sequence length.
/// </summary>
/// <param name="filterIntervals">Filter intervals keyed by contig name; contigs without an entry are treated as having no filter intervals.</param>
/// <param name="genomeMetadata">Genome metadata supplying the contigs, their types and their lengths.</param>
/// <returns>A dictionary keyed by contig name.</returns>
private static Dictionary<string, IEnumerable<Interval>> GetIncludedIntervals(Dictionary<string, List<Interval>> filterIntervals, GenomeMetadata genomeMetadata)
{
    var contigNames = genomeMetadata
        .Contigs()
        .Where(contig => contig.Type == GenomeMetadata.SequenceType.Autosome || contig.Type == GenomeMetadata.SequenceType.Allosome)
        .Select(contig => contig.Name);

    return contigNames
        .Select(contig =>
        {
            // Single dictionary lookup; a contig with no filter entry gets an empty list.
            var filteredIntervals = filterIntervals.TryGetValue(contig, out var contigFilterIntervals)
                ? contigFilterIntervals
                : new List<Interval>();
            return (contig, GetIncludedIntervals(filteredIntervals, genomeMetadata.GetSequence(contig).Length));
        })
        .ToDictionary();
}
/// <summary>
/// Factory for <see cref="CallabilityMetricsComputer"/>: loads the filter BED regions, derives the
/// per-contig included intervals, drops the Y contig for female samples, and wraps the result in a
/// <see cref="CallabilityCalculator"/>.
/// </summary>
/// <param name="logger">Logger handed to the created computer.</param>
/// <param name="genomeMetadata">Genome metadata describing the reference contigs.</param>
/// <param name="filterBed">BED file with the filter regions.</param>
/// <param name="isFemale">When true, a contig named "chrY" or "Y" (if present) is removed from the included intervals.</param>
/// <returns>A new <see cref="CallabilityMetricsComputer"/>.</returns>
public static CallabilityMetricsComputer Create(ILogger logger, GenomeMetadata genomeMetadata, IFileLocation filterBed, bool isFemale)
{
    var excludedRegions = LoadBedRegions(filterBed, genomeMetadata);
    var includedRegions = GetIncludedIntervals(excludedRegions, genomeMetadata);

    // SingleOrDefault yields null when the genome has no Y contig and throws when
    // both spellings are present. The lookup runs regardless of isFemale.
    string chrYName = genomeMetadata
        .Contigs()
        .Select(contig => contig.Name)
        .Where(name => name == "chrY" || name == "Y")
        .SingleOrDefault();

    bool dropChrY = isFemale && chrYName != null;
    if (dropChrY)
    {
        includedRegions.Remove(chrYName);
    }

    return new CallabilityMetricsComputer(logger, new CallabilityCalculator(includedRegions));
}
/// <summary>
/// Assembles the command-line arguments for a single-sample run, in this order:
/// --bam, --sample-name, --reference, --genome-folder, --filter-bed, --output.
/// </summary>
/// <param name="sampleId">Value for --sample-name.</param>
/// <param name="bam">Supplies the BAM file for --bam.</param>
/// <param name="genomeMetadata">Used to look up the kmer FASTA and filter BED, and to locate the genome folder (directory of the first contig's FASTA path).</param>
/// <param name="sampleSandbox">Value for --output.</param>
/// <returns>A <see cref="StringBuilder"/> holding the arguments; each argument is prefixed with a space.</returns>
public StringBuilder GetSingleSampleCommandLine(string sampleId, Bam bam, GenomeMetadata genomeMetadata, IDirectoryLocation sampleSandbox)
{
    var arguments = new StringBuilder();

    arguments
        .Append($" --bam \"{bam.BamFile}\"")
        .Append($" --sample-name \"{sampleId}\"");

    IFileLocation kmerFasta = _annotationFileProvider.GetKmerFasta(genomeMetadata);
    arguments.Append($" --reference \"{kmerFasta}\"");

    // The genome folder is the directory that holds the first contig's FASTA file.
    string firstContigFastaPath = genomeMetadata.Contigs().First().FastaPath;
    IDirectoryLocation wholeGenomeFasta = new FileLocation(firstContigFastaPath).Directory;
    arguments.Append($" --genome-folder \"{wholeGenomeFasta}\"");

    IFileLocation filterBed = _annotationFileProvider.GetFilterBed(genomeMetadata);
    arguments
        .Append($" --filter-bed \"{filterBed}\"")
        .Append($" --output \"{sampleSandbox}\"");

    return arguments;
}
/// <summary>
/// Outputs the copy number calls to a text file.
/// Records are emitted contig by contig in the order of <c>genome.Contigs()</c>; within a contig
/// they keep the order of <paramref name="segmentsOfAllSamples"/>.
/// </summary>
/// <param name="segmentsOfAllSamples">One sample map per output record, holding that record's segment for each sample.</param>
/// <param name="ploidies">Per-sample ploidy information, paired positionally with the record's segments; a null entry falls back to a reference copy number of 2.</param>
/// <param name="genome">Genome metadata providing the contig order.</param>
/// <param name="writer">Destination writer.</param>
/// <param name="denovoQualityThreshold">Only its presence is used here: it is forwarded as a flag to the FORMAT/sample field writer.</param>
private static void WriteVariants(IEnumerable<ISampleMap<CanvasSegment>> segmentsOfAllSamples, List<PloidyInfo> ploidies, GenomeMetadata genome, BgzipOrStreamWriter writer, int? denovoQualityThreshold = null)
{
    var records = segmentsOfAllSamples.ToArray(); // TODO: not necessary when chrom match logic has been updated
    int sampleCount = records.First().Values.Count();
    bool isMultiSample = sampleCount > 1;
    bool reportDenovoQuality = denovoQualityThreshold.HasValue;

    //TODO: this is extremely inefficient. Segments should be sorted by chromosome
    foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Contigs())
    {
        foreach (var record in records)
        {
            var sampleSegments = record.Values.ToArray();
            var leadSegment = sampleSegments.First();

            // The first sample's segment decides which contig the whole record belongs to.
            bool onCurrentChromosome = leadSegment.Chr.Equals(chromosome.Name, StringComparison.OrdinalIgnoreCase);
            if (!onCurrentChromosome)
            {
                continue;
            }

            var sampleFilters = record
                .Select(x => x.Value.Filter)
                .ToReadOnlyList();
            var recordLevelFilter = CanvasFilter
                .GetRecordLevelFilterFromSampleFiltersOnly(sampleFilters)
                .ToVcfString();

            var referenceCopyNumbers = sampleSegments
                .Zip(ploidies, (segment, ploidy) => ploidy?.GetReferenceCopyNumber(segment) ?? 2)
                .ToList();

            var cnvTypes = new CnvType[sampleCount];
            var alleleCopyNumbersBySample = new int[sampleCount][];
            for (int i = 0; i < sampleCount; i++)
            {
                (cnvTypes[i], alleleCopyNumbersBySample[i]) =
                    sampleSegments[i].GetCnvTypeAndAlleleCopyNumbers(referenceCopyNumbers[i]);
            }

            var sampleSetCnvType = AssignCnvType(cnvTypes);
            var (alternateAllele, genotypes) = GetAltAllelesAndGenotypes(alleleCopyNumbersBySample);

            WriteColumnsUntilInfoField(writer, leadSegment, sampleSetCnvType, alternateAllele, recordLevelFilter, isMultiSample);
            WriteFormatAndSampleFields(writer, sampleSegments, genotypes, reportDenovoQuality);
        }
    }
}
/// <summary>
/// Assembles the command-line arguments for a multi-sample run: one --bam argument per sample
/// (BAM file, sample type, sample id), followed by --reference, --genome-folder, --filter-bed
/// and --output.
/// </summary>
/// <param name="samples">Samples to include; they are emitted ordered by sample type.</param>
/// <param name="genomeMetadata">Used to look up the kmer FASTA and filter BED, and to locate the genome folder (directory of the first contig's FASTA path).</param>
/// <param name="vcf">Not used by this method.</param>
/// <param name="sampleSandbox">Value for --output.</param>
/// <returns>A <see cref="StringBuilder"/> holding the arguments; each argument is prefixed with a space.</returns>
public StringBuilder GetMultiSampleCommandLine(SampleSet<CanvasPedigreeSample> samples, GenomeMetadata genomeMetadata, Vcf vcf, IDirectoryLocation sampleSandbox)
{
    var arguments = new StringBuilder();

    // move proband to the front of collection (enum Proband gets the lowest int value )
    foreach (var entry in samples.OrderBy(sample => sample.Value.SampleType))
    {
        var pedigreeSample = entry.Value;
        var id = entry.Key.Id;
        arguments.Append($" --bam \"{pedigreeSample.Bam.BamFile}\" {pedigreeSample.SampleType} {id}");
    }

    IFileLocation kmerFasta = _annotationFileProvider.GetKmerFasta(genomeMetadata);
    arguments.Append($" --reference \"{kmerFasta}\"");

    // The genome folder is the directory that holds the first contig's FASTA file.
    string firstContigFastaPath = genomeMetadata.Contigs().First().FastaPath;
    IDirectoryLocation wholeGenomeFasta = new FileLocation(firstContigFastaPath).Directory;
    arguments.Append($" --genome-folder \"{wholeGenomeFasta}\"");

    IFileLocation filterBed = _annotationFileProvider.GetFilterBed(genomeMetadata);
    arguments
        .Append($" --filter-bed \"{filterBed}\"")
        .Append($" --output \"{sampleSandbox}\"");

    return arguments;
}
/// <summary>
/// Writes the VCF header (meta-information lines followed by the column title line), then verifies
/// that every segment lies on a contig listed in the reference's GenomeSize.xml.
/// </summary>
/// <param name="segments">Segments used for the single-sample ploidy/coverage headers and for the chromosome-name check.</param>
/// <param name="diploidCoverage">Forwarded to the ploidy/coverage header writer (single-sample files only).</param>
/// <param name="wholeGenomeFastaDirectory">Reference folder; expected to contain GenomeSize.xml and used to form the ##reference path.</param>
/// <param name="sampleNames">Sample column names; exactly one name triggers the ploidy/coverage headers.</param>
/// <param name="extraHeaders">Additional header lines written verbatim; may be null.</param>
/// <param name="writer">Destination writer.</param>
/// <param name="qualityThreshold">Threshold used to name and describe the quality FILTER line.</param>
/// <param name="denovoQualityThreshold">When set, the DQ FORMAT line is written.</param>
/// <param name="sizeThreshold">When set, the CNV size FILTER line is written.</param>
/// <returns>The genome metadata deserialized from GenomeSize.xml.</returns>
private static GenomeMetadata WriteVcfHeader(List<CanvasSegment> segments, double? diploidCoverage, string wholeGenomeFastaDirectory, List<string> sampleNames, List<string> extraHeaders, BgzipOrStreamWriter writer, int qualityThreshold, int? denovoQualityThreshold, int? sizeThreshold)
{
    // Write the VCF header:
    writer.WriteLine("##fileformat=VCFv4.1");
    writer.WriteLine($"##source={CanvasVersionInfo.NameString} {CanvasVersionInfo.VersionString}");
    string referenceFasta = Path.Combine(wholeGenomeFastaDirectory, "genome.fa");
    writer.WriteLine($"##reference={referenceFasta}");

    // Write ##OverallPloidy and ##DiploidCoverage for a single-sample file (where it makes sense to do so):
    bool isSingleSample = sampleNames.Count == 1;
    if (isSingleSample)
    {
        AddPloidyAndCoverageHeaders(writer, segments, diploidCoverage);
    }

    if (extraHeaders != null)
    {
        foreach (string extraHeader in extraHeaders)
        {
            writer.WriteLine(extraHeader);
        }
    }

    // One ##contig line per sequence listed in the reference's GenomeSize.xml.
    var genome = new GenomeMetadata();
    string genomeSizeXmlPath = Path.Combine(wholeGenomeFastaDirectory, "GenomeSize.xml");
    genome.Deserialize(new FileLocation(genomeSizeXmlPath));
    foreach (GenomeMetadata.SequenceMetadata contig in genome.Contigs())
    {
        writer.WriteLine($"##contig=<ID={contig.Name},length={contig.Length}>");
    }

    writer.WriteLine("##ALT=<ID=DUP,Description=\"Region of elevated copy number relative to the reference\">");
    WriteHeaderAllAltCnTags(writer);

    string qualityFilter = $"q{qualityThreshold}";
    writer.WriteLine($"##FILTER=<ID={qualityFilter},Description=\"Quality below {qualityThreshold}\">");
    if (sizeThreshold.HasValue)
    {
        string sizeFilterName = CanvasFilter.GetCnvSizeFilter(sizeThreshold.Value, out var sizeFilterThreshold);
        writer.WriteLine($"##FILTER=<ID={sizeFilterName},Description=\"Length shorter than {sizeFilterThreshold.Number} {sizeFilterThreshold.Units}\">");
    }
    writer.WriteLine("##FILTER=<ID=FailedFT,Description=\"Sample-level filter failed in all the samples\">");

    string[] infoLines =
    {
        "##INFO=<ID=CIEND,Number=2,Type=Integer,Description=\"Confidence interval around END for imprecise variants\">",
        "##INFO=<ID=CIPOS,Number=2,Type=Integer,Description=\"Confidence interval around POS for imprecise variants\">",
        "##INFO=<ID=CNVLEN,Number=1,Type=Integer,Description=\"Number of reference positions spanned by this CNV\">",
        "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">",
        "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">",
        "##INFO=<ID=SUBCLONAL,Number=0,Type=Flag,Description=\"Subclonal variant\">",
        "##INFO=<ID=COMMONCNV,Number=0,Type=Flag,Description=\"Common CNV variant identified from pre-specified bed intervals\">",
    };
    foreach (string infoLine in infoLines)
    {
        writer.WriteLine(infoLine);
    }

    string[] formatLines =
    {
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
        "##FORMAT=<ID=RC,Number=1,Type=Float,Description=\"Mean counts per bin in the region\">",
        "##FORMAT=<ID=BC,Number=1,Type=Float,Description=\"Number of bins in the region\">",
        "##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number genotype for imprecise events\">",
        "##FORMAT=<ID=MCC,Number=1,Type=Integer,Description=\"Major chromosome count (equal to copy number for LOH regions)\">",
        "##FORMAT=<ID=MCCQ,Number=1,Type=Float,Description=\"Major chromosome count quality score\">",
        "##FORMAT=<ID=QS,Number=1,Type=Float,Description=\"Phred-scaled quality score. If CN is reference then this is -10log10(prob(variant)) otherwise this is -10log10(prob(no variant).\">",
    };
    foreach (string formatLine in formatLines)
    {
        writer.WriteLine(formatLine);
    }

    // DQ is only declared when a de novo threshold is configured; it sits between QS and FT.
    if (denovoQualityThreshold.HasValue)
    {
        writer.WriteLine($"##FORMAT=<ID=DQ,Number=1,Type=Float,Description=\"De novo quality. Threshold for passing de novo call: {denovoQualityThreshold}\">");
    }
    writer.WriteLine("##FORMAT=<ID=FT,Number=1,Type=String,Description=\"Sample filter, 'PASS' indicates that all filters have passed for this sample\">");

    // Column title line: the nine fixed VCF columns followed by one column per sample.
    var fixedColumns = new[] { "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT" };
    writer.WriteLine(string.Join("\t", fixedColumns.Concat(sampleNames)));

    SanityCheckChromosomeNames(genome, segments);
    return genome;
}
/// <summary>
/// Resolves the reference genome for the directory that contains the first contig's FASTA file.
/// </summary>
/// <param name="genomeMetadata">Genome metadata; must contain at least one contig.</param>
/// <returns>The reference genome produced by the reference genome factory for that directory.</returns>
private IReferenceGenome GetReferenceGenomeFromGenomeMetadata(GenomeMetadata genomeMetadata)
{
    string firstContigFastaPath = genomeMetadata.Contigs().First().FastaPath;
    string fastaDirectory = Path.GetDirectoryName(firstContigFastaPath);
    var genomeFolder = new DirectoryLocation(fastaDirectory);
    return _referenceGenomeFactory.GetReferenceGenome(genomeFolder);
}
/// <summary>
/// Reports whether the genome is supported: the species of its first contig must be
/// "Homo_sapiens" (compared case-insensitively).
/// </summary>
/// <param name="genome">Genome metadata to inspect.</param>
/// <returns>True for a Homo_sapiens genome; false otherwise, including when the genome has no contigs.</returns>
public bool IsSupported(GenomeMetadata genome)
{
    var firstContig = genome.Contigs().FirstOrDefault();
    string species = firstContig?.Species;
    return string.Equals("Homo_sapiens", species, StringComparison.OrdinalIgnoreCase);
}