/// <summary>
/// Writes the FORMAT column followed by one sample column per segment. The output
/// begins with a tab, the columns are tab-separated, and the line is terminated.
/// </summary>
/// <param name="writer">Destination writer.</param>
/// <param name="segments">Segments to report; each one yields a sample column.</param>
/// <param name="genotypes">Genotype strings, indexed in parallel with <paramref name="segments"/>.</param>
/// <param name="reportDQ">When true, a DQ field is appended to the FORMAT column and to every sample column.</param>
private static void WriteFormatAndSampleFields(BgzipOrStreamWriter writer, CanvasSegment[] segments, string[] genotypes, bool reportDQ)
{
    // Placeholder emitted for optional values that are not set.
    const string missing = ".";

    string format = reportDQ ? "GT:RC:BC:CN:MCC:MCCQ:QS:FT:DQ" : "GT:RC:BC:CN:MCC:MCCQ:QS:FT";
    var columns = new List<string> { format };

    int sampleIndex = 0;
    foreach (CanvasSegment current in segments)
    {
        string majorCount = missing;
        if (current.MajorChromosomeCount.HasValue)
        {
            majorCount = current.MajorChromosomeCount.ToString();
        }

        string majorCountScore = missing;
        if (current.MajorChromosomeCountScore.HasValue)
        {
            majorCountScore = $"{current.MajorChromosomeCountScore.Value:F2}";
        }

        // Field order must match the FORMAT column above.
        string sample = $"{genotypes[sampleIndex]}:{current.MedianCount:F2}:{current.BinCount}:{current.CopyNumber}:{majorCount}:{majorCountScore}:{current.QScore:F2}:{current.Filter.ToVcfString()}";

        if (reportDQ)
        {
            string deNovoScore = missing;
            if (current.DqScore.HasValue)
            {
                deNovoScore = $"{current.DqScore.Value:F2}";
            }
            sample = sample + ":" + deNovoScore;
        }

        columns.Add(sample);
        sampleIndex++;
    }

    writer.WriteLine("\t" + string.Join("\t", columns));
}
/// <summary>
/// Writes the ##OverallPloidy header, computed as the length-weighted mean copy number
/// over the segments whose filter passes, and, when <paramref name="diploidCoverage"/>
/// is supplied, the ##DiploidCoverage header. Nothing is written when the passing
/// segments have no positive total length.
/// </summary>
/// <param name="writer">Destination writer for the header lines.</param>
/// <param name="segments">Segments to summarize; only those with a passing filter contribute.</param>
/// <param name="diploidCoverage">Optional diploid coverage value; omitted from the output when null.</param>
public static void AddPloidyAndCoverageHeaders(BgzipOrStreamWriter writer, List<CanvasSegment> segments, double? diploidCoverage)
{
    double totalPloidy = 0;
    double totalWeight = 0;
    foreach (CanvasSegment segment in segments.Where(s => s.Filter.IsPass))
    {
        totalWeight += segment.Length;
        // Widen before multiplying. CopyNumber and Length look like integral values
        // (TODO confirm against CanvasSegment); multiplying them in integer arithmetic
        // could overflow for a long, high-copy-number segment before the result is
        // converted to double.
        totalPloidy += (double)segment.CopyNumber * segment.Length;
    }

    if (totalWeight > 0)
    {
        writer.WriteLine($"##OverallPloidy={totalPloidy / totalWeight:F2}");
        if (diploidCoverage != null)
        {
            writer.WriteLine($"##DiploidCoverage={diploidCoverage:F2}");
        }
    }
}
/// <summary>
/// Writes one ##ALT header line for each copy number from 0 through
/// <paramref name="maxCopyNum"/> inclusive, skipping copy number 1.
/// </summary>
/// <param name="writer">Destination writer for the header lines.</param>
/// <param name="maxCopyNum">Highest copy number to declare an ALT allele for.</param>
public static void WriteHeaderAllAltCnTags(BgzipOrStreamWriter writer, int maxCopyNum = 5)
{
    // Copy number 1 is deliberately left out of the declared ALT alleles.
    IEnumerable<int> declaredCopyNumbers = Enumerable.Range(0, maxCopyNum + 1).Where(n => n != 1);

    foreach (int copies in declaredCopyNumbers)
    {
        writer.WriteLine($"##ALT=<ID=CN{copies},Description=\"Copy number allele: {copies} copies\">");
    }
}
/// <summary>
/// Writes the given BedGraph entries to the file at <paramref name="location"/>,
/// preceded by <paramref name="header"/> on its own line when one is supplied.
/// </summary>
/// <param name="bedGraphEntries">Entries to write.</param>
/// <param name="location">Output file location; its full name is used as the output path.</param>
/// <param name="header">Optional header line; skipped when null.</param>
public void Write(IEnumerable<BedGraphEntry> bedGraphEntries, IFileLocation location, string header = null)
{
    using (var output = new BgzipOrStreamWriter(location.FullName))
    {
        if (header != null)
        {
            output.WriteLine(header);
        }

        new BedGraphWriter(output).WriteLines(bedGraphEntries);
    }
}
/// <summary>
/// Writes the FORMAT column (RC:BC:CN:MCC:QS:DQ) followed by one tab-prefixed sample
/// column per segment, then terminates the line.
/// </summary>
/// <param name="writer">Destination writer.</param>
/// <param name="segments">Segments to report; each one yields a sample column.</param>
private static void WriteFormatField(BgzipOrStreamWriter writer, List<CanvasSegment> segments)
{
    // Placeholder emitted for optional values that are not set.
    const string missing = ".";

    writer.Write("\tRC:BC:CN:MCC:QS:DQ");

    foreach (CanvasSegment current in segments)
    {
        string majorCount = missing;
        if (current.MajorChromosomeCount.HasValue)
        {
            majorCount = current.MajorChromosomeCount.ToString();
        }

        string deNovoQuality = missing;
        if (current.DQScore.HasValue)
        {
            deNovoQuality = $"{current.DQScore.Value:F2}";
        }

        // RC is reported as the mean count rounded to the nearest whole number, halves away from zero.
        var roundedMeanCount = Math.Round(current.MeanCount, 0, MidpointRounding.AwayFromZero);

        string sampleColumn = $"\t{roundedMeanCount}:{current.BinCount}:{current.CopyNumber}:{majorCount}:{current.QScore}:{deNovoQuality}";
        writer.Write(sampleColumn);
    }

    writer.WriteLine();
}
/// <summary>
/// Writes the FORMAT column and the sample column for one segment, then terminates the line.
/// The MCC key and its value are included only when the segment has a major chromosome count.
/// </summary>
/// <param name="writer">Destination writer.</param>
/// <param name="segment">Segment whose read count, bin count and copy number are reported.</param>
private static void WriteSingleSampleInfo(BgzipOrStreamWriter writer, CanvasSegment segment)
{
    // FORMAT keys. This is a plain string, so no format arguments are passed.
    writer.Write("\tRC:BC:CN");
    if (segment.MajorChromosomeCount.HasValue)
    {
        writer.Write(":MCC");
    }

    // Sample values, in the same order as the keys above. The placeholders are numbered
    // from zero and every argument is consumed; previously segment.End was passed as an
    // unused argument 0.
    writer.Write("\t{0}:{1}:{2}",
        Math.Round(segment.MeanCount, 0, MidpointRounding.AwayFromZero),
        segment.BinCount,
        segment.CopyNumber);
    if (segment.MajorChromosomeCount.HasValue)
    {
        writer.Write(":{0}", segment.MajorChromosomeCount);
    }

    writer.WriteLine();
}
/// <summary>
/// Writes the complete VCF header (meta-information lines plus the column title line)
/// and returns the genome metadata loaded from GenomeSize.xml in
/// <paramref name="wholeGenomeFastaDirectory"/>.
/// </summary>
/// <param name="segments">Segments used for the ploidy headers and for the chromosome-name sanity check.</param>
/// <param name="diploidCoverage">Optional diploid coverage forwarded to the ploidy/coverage headers.</param>
/// <param name="wholeGenomeFastaDirectory">Directory combined with "genome.fa" and "GenomeSize.xml".</param>
/// <param name="sampleNames">Sample names appended to the column title line.</param>
/// <param name="extraHeaders">Additional header lines written verbatim; may be null.</param>
/// <param name="writer">Destination writer.</param>
/// <param name="qualityThreshold">Threshold used to name and describe the quality FILTER.</param>
/// <param name="denovoQualityThreshold">When set, the DQ FORMAT field is declared with this threshold in its description.</param>
/// <param name="sizeThreshold">When set, a size FILTER line is declared.</param>
/// <returns>The deserialized genome metadata.</returns>
private static GenomeMetadata WriteVcfHeader(List<CanvasSegment> segments, double? diploidCoverage,
    string wholeGenomeFastaDirectory, List<string> sampleNames, List<string> extraHeaders, BgzipOrStreamWriter writer,
    int qualityThreshold, int? denovoQualityThreshold, int? sizeThreshold)
{
    // Fixed preamble.
    writer.WriteLine("##fileformat=VCFv4.1");
    writer.WriteLine($"##source={CanvasVersionInfo.NameString} {CanvasVersionInfo.VersionString}");
    writer.WriteLine($"##reference={Path.Combine(wholeGenomeFastaDirectory, "genome.fa")}");

    // ##OverallPloidy and ##DiploidCoverage are only written for a single-sample file.
    if (sampleNames.Count == 1)
    {
        AddPloidyAndCoverageHeaders(writer, segments, diploidCoverage);
    }

    if (extraHeaders != null)
    {
        foreach (string extraHeader in extraHeaders)
        {
            writer.WriteLine(extraHeader);
        }
    }

    // One ##contig line per contig listed in the genome metadata.
    var genome = new GenomeMetadata();
    genome.Deserialize(new FileLocation(Path.Combine(wholeGenomeFastaDirectory, "GenomeSize.xml")));
    foreach (GenomeMetadata.SequenceMetadata contig in genome.Contigs())
    {
        writer.WriteLine($"##contig=<ID={contig.Name},length={contig.Length}>");
    }

    // ALT alleles.
    writer.WriteLine("##ALT=<ID=DUP,Description=\"Region of elevated copy number relative to the reference\">");
    WriteHeaderAllAltCnTags(writer);

    // FILTER declarations.
    writer.WriteLine($"##FILTER=<ID=q{qualityThreshold},Description=\"Quality below {qualityThreshold}\">");
    if (sizeThreshold.HasValue)
    {
        string sizeFilterName = CanvasFilter.GetCnvSizeFilter(sizeThreshold.Value, out var sizeFilterThreshold);
        writer.WriteLine($"##FILTER=<ID={sizeFilterName},Description=\"Length shorter than {sizeFilterThreshold.Number} {sizeFilterThreshold.Units}\">");
    }
    writer.WriteLine("##FILTER=<ID=FailedFT,Description=\"Sample-level filter failed in all the samples\">");

    // INFO declarations, in output order.
    string[] infoLines =
    {
        "##INFO=<ID=CIEND,Number=2,Type=Integer,Description=\"Confidence interval around END for imprecise variants\">",
        "##INFO=<ID=CIPOS,Number=2,Type=Integer,Description=\"Confidence interval around POS for imprecise variants\">",
        "##INFO=<ID=CNVLEN,Number=1,Type=Integer,Description=\"Number of reference positions spanned by this CNV\">",
        "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">",
        "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">",
        "##INFO=<ID=SUBCLONAL,Number=0,Type=Flag,Description=\"Subclonal variant\">",
        "##INFO=<ID=COMMONCNV,Number=0,Type=Flag,Description=\"Common CNV variant identified from pre-specified bed intervals\">"
    };
    foreach (string infoLine in infoLines)
    {
        writer.WriteLine(infoLine);
    }

    // FORMAT declarations, in output order. DQ is optional and FT always comes last.
    string[] formatLines =
    {
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
        "##FORMAT=<ID=RC,Number=1,Type=Float,Description=\"Mean counts per bin in the region\">",
        "##FORMAT=<ID=BC,Number=1,Type=Float,Description=\"Number of bins in the region\">",
        "##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number genotype for imprecise events\">",
        "##FORMAT=<ID=MCC,Number=1,Type=Integer,Description=\"Major chromosome count (equal to copy number for LOH regions)\">",
        "##FORMAT=<ID=MCCQ,Number=1,Type=Float,Description=\"Major chromosome count quality score\">",
        "##FORMAT=<ID=QS,Number=1,Type=Float,Description=\"Phred-scaled quality score. If CN is reference then this is -10log10(prob(variant)) otherwise this is -10log10(prob(no variant).\">"
    };
    foreach (string formatLine in formatLines)
    {
        writer.WriteLine(formatLine);
    }
    if (denovoQualityThreshold.HasValue)
    {
        writer.WriteLine($"##FORMAT=<ID=DQ,Number=1,Type=Float,Description=\"De novo quality. Threshold for passing de novo call: {denovoQualityThreshold}\">");
    }
    writer.WriteLine("##FORMAT=<ID=FT,Number=1,Type=String,Description=\"Sample filter, 'PASS' indicates that all filters have passed for this sample\">");

    // Column title line: the fixed VCF columns followed by one column per sample.
    var columnTitles = new List<string>();
    columnTitles.Add("#CHROM");
    columnTitles.Add("POS");
    columnTitles.Add("ID");
    columnTitles.Add("REF");
    columnTitles.Add("ALT");
    columnTitles.Add("QUAL");
    columnTitles.Add("FILTER");
    columnTitles.Add("INFO");
    columnTitles.Add("FORMAT");
    columnTitles.AddRange(sampleNames);
    writer.WriteLine(string.Join("\t", columnTitles));

    SanityCheckChromosomeNames(genome, segments);
    return genome;
}
/// <summary>
/// Writes this record as one line: the fixed columns (chromosome, position, id, ref, alt,
/// qual, filter, info, format) followed by <paramref name="nSamples"/> sample columns.
/// Every column except the final sample column is written with the tab-emitting helper.
/// </summary>
/// <param name="w">Destination writer.</param>
/// <param name="nSamples">Number of sample columns to write; passed to the sanity check first.</param>
public void Write(BgzipOrStreamWriter w, int nSamples)
{
    SanityCheck(nSamples);

    // Fixed columns, in VCF order.
    WriteStringFieldWithTab(w, ChromName);
    w.WriteTab(Pos.ToString());
    WriteStringFieldWithTab(w, Id);
    WriteStringFieldWithTab(w, Ref);
    WriteStringFieldWithTab(w, GetAltString());
    WriteStringFieldWithTab(w, _qual);
    WriteStringFieldWithTab(w, GetFilterString());
    WriteStringFieldWithTab(w, GetInfoString());
    WriteStringFieldWithTab(w, GetFormatString());

    if (nSamples > 0)
    {
        int finalSample = nSamples - 1;

        // All samples before the last one use the tab-emitting helper...
        for (int i = 0; i < finalSample; i++)
        {
            WriteStringFieldWithTab(w, GetSampleString(i));
        }

        // ...and the last one uses the no-tab helper so the line does not end in a separator.
        WriteStringFieldNoTab(w, GetSampleString(finalSample));
    }

    w.WriteLine();
}
/// <summary>
/// Writes the complete VCF header (meta-information lines plus the column title line)
/// and returns the genome metadata loaded from GenomeSize.xml in
/// <paramref name="wholeGenomeFastaDirectory"/>.
/// </summary>
/// <param name="segments">Segments used for the ploidy headers and for the chromosome-name sanity check.</param>
/// <param name="diploidCoverage">Optional diploid coverage forwarded to the ploidy/coverage headers.</param>
/// <param name="wholeGenomeFastaDirectory">Directory combined with "genome.fa" and "GenomeSize.xml".</param>
/// <param name="sampleNames">Sample names appended to the column title line.</param>
/// <param name="extraHeaders">Additional header lines written verbatim; may be null.</param>
/// <param name="qualityThreshold">Threshold used to name and describe the quality FILTER.</param>
/// <param name="writer">Destination writer.</param>
/// <param name="denovoQualityThreshold">When set, the dq INFO line and the DQ and QS FORMAT lines are written.</param>
/// <returns>The deserialized genome metadata.</returns>
private static GenomeMetadata WriteVcfHeader(List<CanvasSegment> segments, double? diploidCoverage,
    string wholeGenomeFastaDirectory, List<string> sampleNames, List<string> extraHeaders, int qualityThreshold,
    BgzipOrStreamWriter writer, int? denovoQualityThreshold = null)
{
    // Fixed preamble.
    writer.WriteLine("##fileformat=VCFv4.1");
    writer.WriteLine($"##source={CanvasVersionInfo.NameString} {CanvasVersionInfo.VersionString}");
    writer.WriteLine($"##reference={Path.Combine(wholeGenomeFastaDirectory, "genome.fa")}");
    AddPloidyAndCoverageHeaders(writer, segments, diploidCoverage);

    if (extraHeaders != null)
    {
        foreach (string extraHeader in extraHeaders)
        {
            writer.WriteLine(extraHeader);
        }
    }

    // One ##contig line per sequence listed in the genome metadata.
    var genome = new GenomeMetadata();
    genome.Deserialize(Path.Combine(wholeGenomeFastaDirectory, "GenomeSize.xml"));
    foreach (GenomeMetadata.SequenceMetadata sequence in genome.Sequences)
    {
        writer.WriteLine($"##contig=<ID={sequence.Name},length={sequence.Length}>");
    }

    // ALT and FILTER declarations.
    writer.WriteLine("##ALT=<ID=CNV,Description=\"Copy number variable region\">");
    writer.WriteLine($"##FILTER=<ID=q{qualityThreshold},Description=\"Quality below {qualityThreshold}\">");
    writer.WriteLine("##FILTER=<ID=L10kb,Description=\"Length shorter than 10kb\">");

    // INFO declarations, in output order.
    string[] infoLines =
    {
        "##INFO=<ID=CIEND,Number=2,Type=Integer,Description=\"Confidence interval around END for imprecise variants\">",
        "##INFO=<ID=CIPOS,Number=2,Type=Integer,Description=\"Confidence interval around POS for imprecise variants\">",
        "##INFO=<ID=CNVLEN,Number=1,Type=Integer,Description=\"Number of reference positions spanned by this CNV\">",
        "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">",
        "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">",
        "##INFO=<ID=SUBCLONAL,Number=0,Type=Flag,Description=\"Subclonal variant\">"
    };
    foreach (string infoLine in infoLines)
    {
        writer.WriteLine(infoLine);
    }

    bool reportDenovoQuality = denovoQualityThreshold.HasValue;
    if (reportDenovoQuality)
    {
        string denovoQualityFilter = $"dq{denovoQualityThreshold}";
        writer.WriteLine($"##INFO=<ID={denovoQualityFilter},Description=\"De novo quality score above {denovoQualityThreshold.Value}\">");
    }

    // FORMAT declarations, in output order.
    string[] formatLines =
    {
        "##FORMAT=<ID=RC,Number=1,Type=Float,Description=\"Mean counts per bin in the region\">",
        "##FORMAT=<ID=BC,Number=1,Type=Float,Description=\"Number of bins in the region\">",
        "##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number genotype for imprecise events\">",
        "##FORMAT=<ID=MCC,Number=1,Type=Integer,Description=\"Major chromosome count (equal to copy number for LOH regions)\">"
    };
    foreach (string formatLine in formatLines)
    {
        writer.WriteLine(formatLine);
    }

    // DQ and QS are only declared when a de novo quality threshold is supplied.
    if (reportDenovoQuality)
    {
        writer.WriteLine("##FORMAT=<ID=DQ,Number=1,Type=Float,Description=\"De novo variants Phred-scaled quality score\">");
        writer.WriteLine("##FORMAT=<ID=QS,Number=1,Type=Float,Description=\"Phred-scaled quality score\">");
    }

    // Column title line: the fixed VCF columns followed by the tab-joined sample names.
    string joinedSampleNames = string.Join("\t", sampleNames.ToArray());
    writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + joinedSampleNames);

    SanityCheckChromosomeNames(genome, segments);
    return genome;
}
/// <summary>
/// Outputs the copy number calls to a VCF file: a VCFv4.1 header followed by one record
/// per segment. Records are emitted in the order of the sequences listed in
/// GenomeSize.xml; segments are matched to a sequence by case-insensitive chromosome name.
/// </summary>
/// <param name="outVcfPath">Path of the VCF file to write.</param>
/// <param name="segments">Segments to report.</param>
/// <param name="wholeGenomeFastaDirectory">Directory combined with "genome.fa" and "GenomeSize.xml".</param>
/// <param name="sampleName">Name of the single sample column.</param>
/// <param name="extraHeaders">Additional header lines written verbatim; may be null.</param>
/// <param name="ploidy">Optional ploidy information; when null, a reference copy number of 2 is used.</param>
/// <param name="qualityThreshold">Threshold used to name and describe the quality FILTER.</param>
public static void WriteSegments(string outVcfPath, List<CanvasSegment> segments, string wholeGenomeFastaDirectory, string sampleName,
    List<string> extraHeaders, PloidyInfo ploidy, int qualityThreshold = 10)
{
    using (BgzipOrStreamWriter writer = new BgzipOrStreamWriter(outVcfPath))
    {
        // Write the VCF header:
        writer.WriteLine("##fileformat=VCFv4.1");
        writer.WriteLine($"##source={CanvasVersionInfo.NameString} {CanvasVersionInfo.VersionString}");
        writer.WriteLine($"##reference={Path.Combine(wholeGenomeFastaDirectory, "genome.fa")}");

        foreach (string header in extraHeaders ?? new List<string>())
        {
            writer.WriteLine(header);
        }

        GenomeMetadata genome = new GenomeMetadata();
        genome.Deserialize(Path.Combine(wholeGenomeFastaDirectory, "GenomeSize.xml"));
        foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
        {
            writer.WriteLine($"##contig=<ID={chromosome.Name},length={chromosome.Length}>");
        }

        string qualityFilter = $"q{qualityThreshold}";
        writer.WriteLine("##ALT=<ID=CNV,Description=\"Copy number variable region\">");
        writer.WriteLine($"##FILTER=<ID={qualityFilter},Description=\"Quality below {qualityThreshold}\">");
        writer.WriteLine("##FILTER=<ID=L10kb,Description=\"Length shorter than 10kb\">");
        writer.WriteLine("##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">");
        writer.WriteLine("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
        writer.WriteLine("##INFO=<ID=CNVLEN,Number=1,Type=Integer,Description=\"Number of reference positions spanned by this CNV\">");
        writer.WriteLine("##FORMAT=<ID=RC,Number=1,Type=Float,Description=\"Mean counts per bin in the region\">");
        writer.WriteLine("##FORMAT=<ID=BC,Number=1,Type=Float,Description=\"Number of bins in the region\">");
        writer.WriteLine("##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number genotype for imprecise events\">");
        writer.WriteLine("##FORMAT=<ID=MCC,Number=1,Type=Integer,Description=\"Major chromosome count (equal to copy number for LOH regions)\">");
        writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sampleName);

        SanityCheckChromosomeNames(genome, segments);

        // Iterate sequences in the outer loop so records come out grouped in genome order.
        foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
        {
            foreach (CanvasSegment segment in segments)
            {
                if (!segment.Chr.Equals(chromosome.Name, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                int referenceCopyNumber = ploidy?.GetReferenceCopyNumber(segment) ?? 2;
                CnvType cnvType = segment.GetCnvType(referenceCopyNumber);

                // From vcf 4.1 spec:
                //     If any of the ALT alleles is a symbolic allele (an angle-bracketed ID String "<ID>") then the padding base is required and POS denotes the
                //     coordinate of the base preceding the polymorphism.
                string alternateAllele = cnvType.ToAltId();
                // Ordinal comparison: '<' and '>' are VCF syntax, not linguistic text.
                bool isSymbolicAllele = alternateAllele.StartsWith("<", StringComparison.Ordinal)
                                        && alternateAllele.EndsWith(">", StringComparison.Ordinal);
                int position = isSymbolicAllele ? segment.Begin : segment.Begin + 1;

                writer.Write($"{segment.Chr}\t{position}\tCanvas:{cnvType.ToVcfId()}:{segment.Chr}:{segment.Begin + 1}-{segment.End}\t");
                // The string is already interpolated, so it is written as-is. Passing it again
                // as a composite format string would re-parse any '{' or '}' in the values.
                writer.Write($"N\t{alternateAllele}\t{segment.QScore}\t{segment.Filter}\t");

                // INFO field: SVTYPE and CNVLEN are only reported for non-reference calls.
                if (cnvType != CnvType.Reference)
                {
                    writer.Write($"SVTYPE={cnvType.ToSvType()};");
                }
                writer.Write($"END={segment.End}");
                if (cnvType != CnvType.Reference)
                {
                    writer.Write($";CNVLEN={segment.End - segment.Begin}");
                }

                // FORMAT field
                writer.Write("\tRC:BC:CN");
                if (segment.MajorChromosomeCount.HasValue)
                {
                    writer.Write(":MCC");
                }

                // Sample values, in the same order as the FORMAT keys.
                writer.Write("\t{0}:{1}:{2}",
                    Math.Round(segment.MeanCount, 0, MidpointRounding.AwayFromZero),
                    segment.BinCount,
                    segment.CopyNumber);
                if (segment.MajorChromosomeCount.HasValue)
                {
                    writer.Write(":{0}", segment.MajorChromosomeCount);
                }
                writer.WriteLine();
            }
        }
    }
}