/// <summary>
/// Outputs the copy number calls to a text file.
/// </summary>
/// <remarks>
/// Records are emitted chromosome by chromosome, in the order of <c>genome.Sequences</c>.
/// Every per-sample list in <paramref name="segments"/> is indexed with the same segment index,
/// so the lists are expected to be aligned with the first sample's list.
/// </remarks>
/// <param name="segments">One list of segments per sample.</param>
/// <param name="ploidy">Source of the reference copy number of each segment; 2 is used when this is null.</param>
/// <param name="genome">Supplies the chromosomes to iterate over.</param>
/// <param name="writer">Destination writer.</param>
/// <param name="isPedigreeInfoSupplied">
/// When false and more than one sample is present, the first sample's segment filter is set to
/// "PASS" if at least one sample has a "PASS" filter at the same segment index.
/// </param>
/// <param name="denovoQualityThreshold">Optional de novo quality threshold, forwarded to WriteInfoField.</param>
private static void WriteVariants(IReadOnlyCollection<List<CanvasSegment>> segments, PloidyInfo ploidy, GenomeMetadata genome,
    BgzipOrStreamWriter writer, bool isPedigreeInfoSupplied = true, int? denovoQualityThreshold = null)
{
    var nSamples = segments.Count;
    bool isMultisample = nSamples > 1;
    foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
    {
        var firstSampleSegments = segments.First();
        for (int segmentIndex = 0; segmentIndex < firstSampleSegments.Count; segmentIndex++)
        {
            var firstSampleSegment = firstSampleSegments[segmentIndex];

            // Any(predicate) is required here: Select(...).Any() only tests whether the collection
            // is non-empty and therefore ignored the filter values entirely.
            if (!isPedigreeInfoSupplied && isMultisample &&
                segments.Any(sample => sample[segmentIndex].Filter == "PASS"))
            {
                firstSampleSegment.Filter = "PASS";
            }

            if (!firstSampleSegment.Chr.Equals(chromosome.Name, StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            var referenceCopyNumbers = segments
                .Select(sample => ploidy?.GetReferenceCopyNumber(sample[segmentIndex]) ?? 2)
                .ToList();
            var currentSegments = segments.Select(sample => sample[segmentIndex]).ToList();

            var cnvTypes = new List<CnvType>(nSamples);
            for (int sampleIndex = 0; sampleIndex < nSamples; sampleIndex++)
            {
                cnvTypes.Add(currentSegments[sampleIndex].GetCnvType(referenceCopyNumbers[sampleIndex]));
            }

            // Collapse the per-sample types into one record-level type: a single non-reference
            // type shared by all non-reference samples is kept, any mixture is reported as complex.
            CnvType cnvType;
            if (cnvTypes.TrueForAll(x => x == CnvType.Reference))
            {
                cnvType = CnvType.Reference;
            }
            else if (cnvTypes.TrueForAll(x => x == CnvType.Reference || x == CnvType.Loss))
            {
                cnvType = CnvType.Loss;
            }
            else if (cnvTypes.TrueForAll(x => x == CnvType.Reference || x == CnvType.Gain))
            {
                cnvType = CnvType.Gain;
            }
            else if (cnvTypes.TrueForAll(x => x == CnvType.Reference || x == CnvType.LossOfHeterozygosity))
            {
                cnvType = CnvType.LossOfHeterozygosity;
            }
            else
            {
                cnvType = CnvType.ComplexCnv;
            }

            WriteInfoField(writer, firstSampleSegment, cnvType, denovoQualityThreshold, isMultisample: isMultisample);

            // FORMAT field
            if (nSamples == 1)
            {
                WriteSingleSampleInfo(writer, firstSampleSegment);
            }
            else
            {
                WriteFormatField(writer, currentSegments);
            }
        }
    }
}
/// <summary>
/// Writes the FORMAT column followed by one sample column per segment, then terminates the record line.
/// </summary>
/// <param name="writer">Destination writer.</param>
/// <param name="segments">Segments of the current record, one per sample.</param>
/// <param name="genotypes">Genotype strings, indexed in parallel with <paramref name="segments"/>.</param>
/// <param name="reportDQ">When true, a DQ entry is appended to the FORMAT column and to every sample column.</param>
private static void WriteFormatAndSampleFields(BgzipOrStreamWriter writer, CanvasSegment[] segments, string[] genotypes, bool reportDQ)
{
    const string missing = ".";

    string format = "GT:RC:BC:CN:MCC:MCCQ:QS:FT";
    if (reportDQ)
    {
        format = format + ":DQ";
    }

    var columns = new List<string>();
    columns.Add(format);

    int sampleIndex = 0;
    foreach (CanvasSegment current in segments)
    {
        // Optional values are reported as "." when absent.
        string majorCount = missing;
        if (current.MajorChromosomeCount.HasValue)
        {
            majorCount = current.MajorChromosomeCount.ToString();
        }

        string majorCountQuality = missing;
        if (current.MajorChromosomeCountScore.HasValue)
        {
            majorCountQuality = $"{current.MajorChromosomeCountScore.Value:F2}";
        }

        string sample = $"{genotypes[sampleIndex]}:{current.MedianCount:F2}:{current.BinCount}:{current.CopyNumber}:{majorCount}:{majorCountQuality}:{current.QScore:F2}:{current.Filter.ToVcfString()}";
        if (reportDQ)
        {
            string denovoQuality = missing;
            if (current.DqScore.HasValue)
            {
                denovoQuality = $"{current.DqScore.Value:F2}";
            }
            sample = sample + $":{denovoQuality}";
        }

        columns.Add(sample);
        sampleIndex++;
    }

    // The leading tab separates these columns from the INFO column written by the caller.
    writer.WriteLine("\t" + string.Join("\t", columns));
}
/// <summary>
/// Writes a multi-sample VCF to <paramref name="outVcfPath"/>: the header first, then the variant records.
/// </summary>
/// <remarks>
/// The header is produced from the first sample's segments and the average of
/// <paramref name="diploidCoverage"/>. <paramref name="isPedigreeInfoSupplied"/> is accepted but
/// not read by this overload.
/// </remarks>
public static void WriteMultiSampleSegments(string outVcfPath, ISampleMap<List<CanvasSegment>> segments, List<double> diploidCoverage,
    string wholeGenomeFastaDirectory, List<string> sampleNames, List<string> extraHeaders, List<PloidyInfo> ploidies, int qualityThreshold,
    int? denovoQualityThreshold, int? sizeThreshold, bool isPedigreeInfoSupplied = true)
{
    using (var vcfWriter = new BgzipOrStreamWriter(outVcfPath))
    {
        var headerSegments = segments.Values.First();
        double averageDiploidCoverage = diploidCoverage.Average();
        var genome = WriteVcfHeader(
            headerSegments,
            averageDiploidCoverage,
            wholeGenomeFastaDirectory,
            sampleNames,
            extraHeaders,
            vcfWriter,
            qualityThreshold,
            denovoQualityThreshold,
            sizeThreshold);

        var segmentsPerRecord = segments.Zip();
        WriteVariants(segmentsPerRecord, ploidies, genome, vcfWriter, denovoQualityThreshold);
    }
}
/// <summary>
/// Writes a multi-sample VCF to <paramref name="outVcfPath"/> from one segment list per sample:
/// the header first, then the variant records.
/// </summary>
/// <remarks>
/// The header is produced from the first sample's segments and the value returned by
/// <c>GetMean</c> for <paramref name="diploidCoverage"/>.
/// </remarks>
public static void WriteMultiSampleSegments(string outVcfPath, List<List<CanvasSegment>> segments, List<double?> diploidCoverage,
    string wholeGenomeFastaDirectory, List<string> sampleNames, List<string> extraHeaders, PloidyInfo ploidy, int qualityThreshold,
    bool isPedigreeInfoSupplied = true, int? denovoQualityThreshold = null)
{
    using (var vcfWriter = new BgzipOrStreamWriter(outVcfPath))
    {
        var headerSegments = segments.First();
        var meanDiploidCoverage = GetMean(diploidCoverage);
        var genome = WriteVcfHeader(
            headerSegments,
            meanDiploidCoverage,
            wholeGenomeFastaDirectory,
            sampleNames,
            extraHeaders,
            qualityThreshold,
            vcfWriter,
            denovoQualityThreshold);

        WriteVariants(segments, ploidy, genome, vcfWriter, isPedigreeInfoSupplied, denovoQualityThreshold);
    }
}
/// <summary>
/// Writes one <c>##ALT=&lt;ID=CNn,...&gt;</c> header line for every copy number from 0 to
/// <paramref name="maxCopyNum"/> inclusive, skipping copy number 1.
/// </summary>
/// <param name="writer">Destination writer.</param>
/// <param name="maxCopyNum">Highest copy number that gets an ALT header line.</param>
public static void WriteHeaderAllAltCnTags(BgzipOrStreamWriter writer, int maxCopyNum = 5)
{
    var alternateCopyNumbers = Enumerable.Range(0, maxCopyNum + 1).Where(copies => copies != 1);
    foreach (int copies in alternateCopyNumbers)
    {
        writer.WriteLine($"##ALT=<ID=CN{copies},Description=\"Copy number allele: {copies} copies\">");
    }
}
/// <summary>
/// Writes the given bedgraph entries to the file at <paramref name="location"/>, preceded by
/// <paramref name="header"/> when one is supplied.
/// </summary>
/// <param name="bedGraphEntries">Entries handed to <c>BedGraphWriter.WriteLines</c>.</param>
/// <param name="location">Output file; its <c>FullName</c> is used as the path.</param>
/// <param name="header">Optional first line; nothing is written for it when null.</param>
public void Write(IEnumerable<BedGraphEntry> bedGraphEntries, IFileLocation location, string header = null)
{
    using (var output = new BgzipOrStreamWriter(location.FullName))
    {
        bool hasHeader = header != null;
        if (hasHeader)
        {
            output.WriteLine(header);
        }

        var entryWriter = new BedGraphWriter(output);
        entryWriter.WriteLines(bedGraphEntries);
    }
}
/// <summary>
/// Writes the VCF header (meta-information lines and the column title line) and returns the
/// genome metadata loaded from "GenomeSize.xml" in <paramref name="wholeGenomeFastaDirectory"/>.
/// </summary>
/// <param name="segments">Segments used for the ploidy/coverage headers and the chromosome name sanity check.</param>
/// <param name="diploidCoverage">Optional diploid coverage, forwarded to AddPloidyAndCoverageHeaders.</param>
/// <param name="wholeGenomeFastaDirectory">Directory that is combined with "genome.fa" and "GenomeSize.xml".</param>
/// <param name="sampleNames">Sample names appended to the column title line.</param>
/// <param name="extraHeaders">Additional header lines written verbatim; may be null.</param>
/// <param name="qualityThreshold">Threshold used to name and describe the quality filter.</param>
/// <param name="writer">Destination writer.</param>
/// <param name="denovoQualityThreshold">When set, the de novo INFO flag and the DQ FORMAT field are declared.</param>
/// <returns>The deserialized genome metadata.</returns>
private static GenomeMetadata WriteVcfHeader(List<CanvasSegment> segments, double? diploidCoverage, string wholeGenomeFastaDirectory,
    List<string> sampleNames, List<string> extraHeaders, int qualityThreshold, BgzipOrStreamWriter writer, int? denovoQualityThreshold = null)
{
    // Write the VCF header:
    writer.WriteLine("##fileformat=VCFv4.1");
    writer.WriteLine($"##source={CanvasVersionInfo.NameString} {CanvasVersionInfo.VersionString}");
    writer.WriteLine($"##reference={Path.Combine(wholeGenomeFastaDirectory, "genome.fa")}");
    AddPloidyAndCoverageHeaders(writer, segments, diploidCoverage);
    foreach (string header in extraHeaders ?? new List<string>())
    {
        writer.WriteLine(header);
    }

    GenomeMetadata genome = new GenomeMetadata();
    genome.Deserialize(Path.Combine(wholeGenomeFastaDirectory, "GenomeSize.xml"));
    foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
    {
        writer.WriteLine($"##contig=<ID={chromosome.Name},length={chromosome.Length}>");
    }

    string qualityFilter = $"q{qualityThreshold}";
    writer.WriteLine("##ALT=<ID=CNV,Description=\"Copy number variable region\">");
    writer.WriteLine($"##FILTER=<ID={qualityFilter},Description=\"Quality below {qualityThreshold}\">");
    writer.WriteLine("##FILTER=<ID=L10kb,Description=\"Length shorter than 10kb\">");
    writer.WriteLine("##INFO=<ID=CIEND,Number=2,Type=Integer,Description=\"Confidence interval around END for imprecise variants\">");
    writer.WriteLine("##INFO=<ID=CIPOS,Number=2,Type=Integer,Description=\"Confidence interval around POS for imprecise variants\">");
    writer.WriteLine("##INFO=<ID=CNVLEN,Number=1,Type=Integer,Description=\"Number of reference positions spanned by this CNV\">");
    writer.WriteLine("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
    writer.WriteLine("##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">");
    writer.WriteLine("##INFO=<ID=SUBCLONAL,Number=0,Type=Flag,Description=\"Subclonal variant\">");
    if (denovoQualityThreshold.HasValue)
    {
        // The record writer emits this key as a value-less flag, and INFO declarations need
        // Number and Type, so it is declared as Number=0,Type=Flag.
        string denovoQualityFilter = $"dq{denovoQualityThreshold}";
        writer.WriteLine($"##INFO=<ID={denovoQualityFilter},Number=0,Type=Flag,Description=\"De novo quality score above {denovoQualityThreshold.Value}\">");
    }

    writer.WriteLine("##FORMAT=<ID=RC,Number=1,Type=Float,Description=\"Mean counts per bin in the region\">");
    writer.WriteLine("##FORMAT=<ID=BC,Number=1,Type=Float,Description=\"Number of bins in the region\">");
    writer.WriteLine("##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number genotype for imprecise events\">");
    writer.WriteLine("##FORMAT=<ID=MCC,Number=1,Type=Integer,Description=\"Major chromosome count (equal to copy number for LOH regions)\">");
    if (denovoQualityThreshold.HasValue)
    {
        writer.WriteLine("##FORMAT=<ID=DQ,Number=1,Type=Float,Description=\"De novo variants Phred-scaled quality score\">");
    }
    // QS is declared unconditionally: multi-sample records carry a QS entry whether or not a
    // de novo threshold was supplied.
    writer.WriteLine("##FORMAT=<ID=QS,Number=1,Type=Float,Description=\"Phred-scaled quality score\">");

    string names = string.Join("\t", sampleNames);
    writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + names);
    SanityCheckChromosomeNames(genome, segments);
    return genome;
}
/// <summary>
/// Writes the CHROM through INFO columns of a VCF record for a single segment.
/// No line terminator is written.
/// </summary>
/// <param name="writer">Destination writer.</param>
/// <param name="segment">Segment supplying the coordinates, quality, filter and INFO values.</param>
/// <param name="cnvType">CNV type that determines the record ID, the ALT allele and the SVTYPE entry.</param>
/// <param name="denovoQualityThreshold">Optional threshold; a "dq" entry is added when the segment's DQ score reaches it.</param>
/// <param name="isMultisample">When true, QUAL is written as "." and the DQ INFO entry is omitted.</param>
private static void WriteInfoField(BgzipOrStreamWriter writer, CanvasSegment segment, CnvType cnvType, int? denovoQualityThreshold, bool isMultisample)
{
    // From vcf 4.1 spec:
    //     If any of the ALT alleles is a symbolic allele (an angle-bracketed ID String "<ID>") then the padding base is required and POS denotes the
    //     coordinate of the base preceding the polymorphism.
    string altAllele = cnvType.ToAltId();
    bool isSymbolicAllele = altAllele.StartsWith("<") && altAllele.EndsWith(">");
    int pos = isSymbolicAllele ? segment.Begin : segment.Begin + 1;
    writer.Write($"{segment.Chr}\t{pos}\tCanvas:{cnvType.ToVcfId()}:{segment.Chr}:{segment.Begin + 1}-{segment.End}\t");

    string qual;
    if (isMultisample)
    {
        qual = ".";
    }
    else
    {
        qual = $"{segment.QScore:F2}";
    }
    writer.Write($"N\t{altAllele}\t{qual}\t{segment.Filter}\t");

    // Collect the INFO entries in output order and join them with ';'.
    bool isVariant = cnvType != CnvType.Reference;
    var infoEntries = new List<string>();
    if (isVariant)
    {
        infoEntries.Add($"SVTYPE={cnvType.ToSvType()}");
    }
    if (segment.IsHeterogeneous)
    {
        infoEntries.Add("SUBCLONAL");
    }
    if (!isMultisample && segment.DQScore.HasValue)
    {
        infoEntries.Add($"DQ={segment.DQScore.Value}");
    }
    if (denovoQualityThreshold.HasValue && segment.DQScore.HasValue && segment.DQScore >= denovoQualityThreshold)
    {
        infoEntries.Add($"dq{denovoQualityThreshold}");
    }
    infoEntries.Add($"END={segment.End}");
    if (isVariant)
    {
        infoEntries.Add($"CNVLEN={segment.End - segment.Begin}");
    }
    if (segment.StartConfidenceInterval != null)
    {
        infoEntries.Add($"CIPOS={segment.StartConfidenceInterval.Item1},{segment.StartConfidenceInterval.Item2}");
    }
    if (segment.EndConfidenceInterval != null)
    {
        infoEntries.Add($"CIEND={segment.EndConfidenceInterval.Item1},{segment.EndConfidenceInterval.Item2}");
    }

    writer.Write(string.Join(";", infoEntries));
}
/// <summary>
/// Writes the multi-sample FORMAT column ("RC:BC:CN:MCC:QS:DQ") followed by one sample column
/// per segment, then terminates the record line.
/// </summary>
/// <param name="writer">Destination writer.</param>
/// <param name="segments">Segments of the current record, one per sample.</param>
private static void WriteFormatField(BgzipOrStreamWriter writer, List<CanvasSegment> segments)
{
    const string missingValue = ".";

    writer.Write("\tRC:BC:CN:MCC:QS:DQ");
    foreach (CanvasSegment sampleSegment in segments)
    {
        // The mean count is reported rounded to a whole number, midpoints away from zero.
        var roundedMeanCount = Math.Round(sampleSegment.MeanCount, 0, MidpointRounding.AwayFromZero);

        // Optional values are reported as "." when absent.
        string majorChromosomeCount = missingValue;
        if (sampleSegment.MajorChromosomeCount.HasValue)
        {
            majorChromosomeCount = sampleSegment.MajorChromosomeCount.ToString();
        }

        string denovoQuality = missingValue;
        if (sampleSegment.DQScore.HasValue)
        {
            denovoQuality = $"{sampleSegment.DQScore.Value:F2}";
        }

        string sampleColumn = $"{roundedMeanCount}:{sampleSegment.BinCount}:{sampleSegment.CopyNumber}:{majorChromosomeCount}:{sampleSegment.QScore}:{denovoQuality}";
        writer.Write("\t" + sampleColumn);
    }

    writer.WriteLine();
}
/// <summary>
/// Writes a single-sample VCF to <paramref name="outVcfPath"/>: the header first, then the
/// variant records for a copy of <paramref name="segments"/> wrapped as a one-sample collection.
/// </summary>
public static void WriteSegments(string outVcfPath, List<CanvasSegment> segments, double? diploidCoverage, string wholeGenomeFastaDirectory,
    string sampleName, List<string> extraHeaders, PloidyInfo ploidy, int qualityThreshold, bool isPedigreeInfoSupplied,
    int? denovoQualityThreshold = null)
{
    using (var vcfWriter = new BgzipOrStreamWriter(outVcfPath))
    {
        var sampleNames = new List<string>();
        sampleNames.Add(sampleName);
        var genome = WriteVcfHeader(
            segments,
            diploidCoverage,
            wholeGenomeFastaDirectory,
            sampleNames,
            extraHeaders,
            qualityThreshold,
            vcfWriter,
            denovoQualityThreshold);

        var segmentsPerSample = new List<List<CanvasSegment>>();
        segmentsPerSample.Add(segments.ToList());
        WriteVariants(segmentsPerSample, ploidy, genome, vcfWriter, isPedigreeInfoSupplied, denovoQualityThreshold);
    }
}
/// <summary>
/// Writes the single-sample FORMAT column and the matching sample column, then terminates the
/// record line. The MCC entry is present in both columns only when the segment has a major
/// chromosome count.
/// </summary>
/// <param name="writer">Destination writer.</param>
/// <param name="segment">Segment whose counts and copy number are reported.</param>
private static void WriteSingleSampleInfo(BgzipOrStreamWriter writer, CanvasSegment segment)
{
    // FORMAT column
    string formatColumn = "\tRC:BC:CN";
    writer.Write(formatColumn);
    if (segment.MajorChromosomeCount.HasValue)
    {
        writer.Write(":MCC");
    }

    // Sample column; the mean count is rounded to a whole number, midpoints away from zero.
    var roundedMeanCount = Math.Round(segment.MeanCount, 0, MidpointRounding.AwayFromZero);
    writer.Write($"\t{roundedMeanCount}:{segment.BinCount}:{segment.CopyNumber}");
    if (segment.MajorChromosomeCount.HasValue)
    {
        writer.Write($":{segment.MajorChromosomeCount}");
    }

    writer.WriteLine();
}
/// <summary>
/// Writes the CHROM through INFO columns of a VCF record, taking coordinates and INFO values
/// from the first sample's segment. No line terminator is written.
/// </summary>
/// <param name="writer">Destination writer.</param>
/// <param name="firstSampleSegment">Segment supplying the coordinates, quality and INFO values.</param>
/// <param name="sampleSetCnvType">CNV type that determines the record ID and the SVTYPE entry.</param>
/// <param name="alternateAllele">Text of the ALT column.</param>
/// <param name="recordLevelFilter">Text of the FILTER column.</param>
/// <param name="isMultisample">When true, QUAL is written as "." instead of the segment's quality score.</param>
private static void WriteColumnsUntilInfoField(BgzipOrStreamWriter writer, CanvasSegment firstSampleSegment, CnvType sampleSetCnvType,
    string alternateAllele, string recordLevelFilter, bool isMultisample)
{
    // From vcf 4.1 spec:
    //     If any of the ALT alleles is a symbolic allele (an angle-bracketed ID String <ID>) then the padding base is required and POS denotes the
    //     coordinate of the base preceding the polymorphism.
    bool isSymbolicAllele = alternateAllele.StartsWith("<") && alternateAllele.EndsWith(">");
    int pos = isSymbolicAllele ? firstSampleSegment.Begin : firstSampleSegment.Begin + 1;
    writer.Write($"{firstSampleSegment.Chr}\t{pos}\tCanvas:{sampleSetCnvType.ToVcfId()}:{firstSampleSegment.Chr}:{firstSampleSegment.Begin + 1}-{firstSampleSegment.End}\t");

    string qual;
    if (isMultisample)
    {
        qual = ".";
    }
    else
    {
        qual = $"{firstSampleSegment.QScore:F2}";
    }
    writer.Write($"N\t{alternateAllele}\t{qual}\t{recordLevelFilter}\t");

    // Collect the INFO entries in output order and join them with ';'.
    bool isVariant = sampleSetCnvType != CnvType.Reference;
    var infoEntries = new List<string>();
    if (isVariant)
    {
        infoEntries.Add($"SVTYPE={sampleSetCnvType.ToSvType()}");
    }
    if (firstSampleSegment.IsHeterogeneous)
    {
        infoEntries.Add("SUBCLONAL");
    }
    if (firstSampleSegment.IsCommonCnv)
    {
        infoEntries.Add("COMMONCNV");
    }
    infoEntries.Add($"END={firstSampleSegment.End}");
    if (isVariant)
    {
        infoEntries.Add($"CNVLEN={firstSampleSegment.Length}");
    }
    if (firstSampleSegment.StartConfidenceInterval != null)
    {
        infoEntries.Add($"CIPOS={firstSampleSegment.StartConfidenceInterval.Item1},{firstSampleSegment.StartConfidenceInterval.Item2}");
    }
    if (firstSampleSegment.EndConfidenceInterval != null)
    {
        infoEntries.Add($"CIEND={firstSampleSegment.EndConfidenceInterval.Item1},{firstSampleSegment.EndConfidenceInterval.Item2}");
    }

    writer.Write(string.Join(";", infoEntries));
}
/// <summary>
/// Writes a single-sample VCF to <paramref name="outVcfPath"/>: the header first, then the
/// variant records, with every segment wrapped in a one-entry sample map keyed by the sample id.
/// </summary>
/// <remarks>
/// <paramref name="isPedigreeInfoSupplied"/> is accepted but not read by this overload.
/// </remarks>
public static void WriteSegments(string outVcfPath, List<CanvasSegment> segments, double? diploidCoverage, string wholeGenomeFastaDirectory,
    string sampleName, List<string> extraHeaders, PloidyInfo ploidy, int qualityThreshold, bool isPedigreeInfoSupplied,
    int? denovoQualityThreshold, int? sizeThreshold)
{
    using (var vcfWriter = new BgzipOrStreamWriter(outVcfPath))
    {
        var sampleNames = new List<string>();
        sampleNames.Add(sampleName);
        var genome = WriteVcfHeader(
            segments,
            diploidCoverage,
            wholeGenomeFastaDirectory,
            sampleNames,
            extraHeaders,
            vcfWriter,
            qualityThreshold,
            denovoQualityThreshold,
            sizeThreshold);

        var sampleId = new SampleId(sampleName);
        var segmentsOfAllSamples = segments.Select(segment =>
        {
            var singleSampleMap = new SampleMap<CanvasSegment>();
            singleSampleMap.Add(sampleId, segment);
            return singleSampleMap;
        });

        var ploidies = new List<PloidyInfo>();
        ploidies.Add(ploidy);
        WriteVariants(segmentsOfAllSamples, ploidies, genome, vcfWriter, denovoQualityThreshold);
    }
}
/// <summary>
/// Writes the <c>##OverallPloidy</c> header, computed as the length-weighted mean copy number
/// of the segments whose filter passes, followed by <c>##DiploidCoverage</c> when a coverage
/// value is supplied. Nothing is written when the passing segments have no total length.
/// </summary>
/// <param name="writer">Destination writer.</param>
/// <param name="segments">Segments to average over; only those with a passing filter contribute.</param>
/// <param name="diploidCoverage">Optional diploid coverage to report.</param>
public static void AddPloidyAndCoverageHeaders(BgzipOrStreamWriter writer, List<CanvasSegment> segments, double? diploidCoverage)
{
    double weightedCopyNumberSum = 0;
    double passingLength = 0;
    foreach (CanvasSegment current in segments)
    {
        if (!current.Filter.IsPass)
        {
            continue;
        }

        passingLength += current.Length;
        weightedCopyNumberSum += current.CopyNumber * (current.Length);
    }

    if (passingLength > 0)
    {
        double overallPloidy = weightedCopyNumberSum / passingLength;
        writer.WriteLine($"##OverallPloidy={overallPloidy:F2}");
        if (diploidCoverage.HasValue)
        {
            writer.WriteLine($"##DiploidCoverage={diploidCoverage.Value:F2}");
        }
    }
}
/// <summary>
/// Outputs the copy number calls to a text file.
/// </summary>
/// <remarks>
/// Each element of <paramref name="segmentsOfAllSamples"/> becomes one VCF record holding the
/// segments of all samples. Records are emitted chromosome by chromosome in the order of
/// <c>genome.Contigs()</c>. When there are no segments at all, nothing is written.
/// </remarks>
/// <param name="segmentsOfAllSamples">One sample map per record.</param>
/// <param name="ploidies">Reference ploidy per sample, paired with the segments by position; 2 is used for a null entry.</param>
/// <param name="genome">Supplies the contigs to iterate over.</param>
/// <param name="writer">Destination writer.</param>
/// <param name="denovoQualityThreshold">When set, the DQ field is reported in the sample columns.</param>
private static void WriteVariants(IEnumerable<ISampleMap<CanvasSegment>> segmentsOfAllSamples, List<PloidyInfo> ploidies, GenomeMetadata genome,
    BgzipOrStreamWriter writer, int? denovoQualityThreshold = null)
{
    var segmentsOfAllSamplesArray = segmentsOfAllSamples.ToArray(); // TODO: not necessary when chrom match logic has been updated
    if (segmentsOfAllSamplesArray.Length == 0)
    {
        // No segments means no records; return instead of letting First() throw.
        return;
    }

    int nSamples = segmentsOfAllSamplesArray[0].Values.Count();
    foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Contigs()) //TODO: this is extremely inefficient. Segments should be sorted by chromosome
    {
        foreach (var sampleMap in segmentsOfAllSamplesArray)
        {
            var currentSegments = sampleMap.Values.ToArray();
            var firstSampleSegment = currentSegments.First();
            if (!firstSampleSegment.Chr.Equals(chromosome.Name, StringComparison.OrdinalIgnoreCase)) //TODO: this is extremely inefficient. Segments should be sorted by chromosome
            {
                continue;
            }

            var recordLevelFilter = CanvasFilter.GetRecordLevelFilterFromSampleFiltersOnly(
                    sampleMap
                        .Select(x => x.Value.Filter)
                        .ToReadOnlyList())
                .ToVcfString();

            var referenceCopyNumbers = currentSegments
                .Zip(ploidies, (segment, ploidy) => ploidy?.GetReferenceCopyNumber(segment) ?? 2)
                .ToList();

            var cnvTypes = new CnvType[nSamples];
            var sampleSetAlleleCopyNumbers = new int[nSamples][];
            for (int sampleIndex = 0; sampleIndex < nSamples; sampleIndex++)
            {
                (cnvTypes[sampleIndex], sampleSetAlleleCopyNumbers[sampleIndex]) = currentSegments[sampleIndex]
                    .GetCnvTypeAndAlleleCopyNumbers(referenceCopyNumbers[sampleIndex]);
            }

            var sampleSetCnvType = AssignCnvType(cnvTypes);
            var (alternateAllele, genotypes) = GetAltAllelesAndGenotypes(sampleSetAlleleCopyNumbers);
            WriteColumnsUntilInfoField(writer, firstSampleSegment, sampleSetCnvType, alternateAllele, recordLevelFilter, nSamples > 1);
            WriteFormatAndSampleFields(writer, currentSegments, genotypes, denovoQualityThreshold.HasValue);
        }
    }
}
/// <summary>
/// Writes this record as one line: the fixed columns, each followed by a tab, then the sample
/// columns separated by tabs, with no tab after the last sample.
/// </summary>
/// <param name="w">Destination writer.</param>
/// <param name="nSamples">Number of sample columns to write; passed to SanityCheck before any output.</param>
public void Write(BgzipOrStreamWriter w, int nSamples)
{
    SanityCheck(nSamples);

    // Fixed columns.
    WriteStringFieldWithTab(w, ChromName);
    w.WriteTab(Pos.ToString());
    WriteStringFieldWithTab(w, Id);
    WriteStringFieldWithTab(w, Ref);
    WriteStringFieldWithTab(w, GetAltString());
    WriteStringFieldWithTab(w, _qual);
    WriteStringFieldWithTab(w, GetFilterString());
    WriteStringFieldWithTab(w, GetInfoString());
    WriteStringFieldWithTab(w, GetFormatString());

    // All samples but the last are tab-terminated.
    int index = 0;
    while (index + 1 < nSamples)
    {
        WriteStringFieldWithTab(w, GetSampleString(index));
        index++;
    }

    if (nSamples > 0)
    {
        WriteStringFieldNoTab(w, GetSampleString(nSamples - 1));
    }

    w.WriteLine();
}
/// <summary>
/// Creates a streamer for a single sample whose name and id are both "SAMPLE".
/// </summary>
/// <param name="writer">Writer handed to the sample-name/sample-id constructor.</param>
public VCFOutStreamer(BgzipOrStreamWriter writer)
    : this(writer, sampleName: "SAMPLE", sampleId: "SAMPLE")
{
}
/// <summary>
/// Writes <paramref name="s"/>, after passing it through <c>StringCheck</c>, without a trailing tab.
/// </summary>
/// <param name="w">Destination writer.</param>
/// <param name="s">Field text.</param>
private void WriteStringFieldNoTab(BgzipOrStreamWriter w, string s)
{
    var checkedField = StringCheck(s);
    w.Write(checkedField);
}
/// <summary>
/// Creates a streamer that stores the given writer, sample names and sample ids, and starts
/// with an empty header list.
/// </summary>
/// <param name="writer">Writer stored for later output.</param>
/// <param name="sampleNames">Sample names, stored as given.</param>
/// <param name="sampleIds">Sample ids, stored as given.</param>
public VCFOutStreamer(BgzipOrStreamWriter writer, string[] sampleNames, string[] sampleIds)
{
    _writer = writer;
    _headerList = new List<KeyValuePair<string, string>>();

    _sampleNames = sampleNames;
    _sampleIDs = sampleIds;
}
/// <summary>
/// Creates a streamer for a single sample by wrapping the name and the id in one-element arrays.
/// </summary>
/// <param name="writer">Writer handed to the array-based constructor.</param>
/// <param name="sampleName">Name of the only sample.</param>
/// <param name="sampleId">Id of the only sample.</param>
public VCFOutStreamer(BgzipOrStreamWriter writer, string sampleName, string sampleId)
    : this(writer, new string[] { sampleName }, new string[] { sampleId })
{
}
/// <summary>
/// Writes the VCF header (meta-information lines and the column title line) and returns the
/// genome metadata loaded from "GenomeSize.xml" in <paramref name="wholeGenomeFastaDirectory"/>.
/// </summary>
/// <param name="segments">Segments used for the ploidy/coverage headers and the chromosome name sanity check.</param>
/// <param name="diploidCoverage">Optional diploid coverage, forwarded to AddPloidyAndCoverageHeaders.</param>
/// <param name="wholeGenomeFastaDirectory">Directory that is combined with "genome.fa" and "GenomeSize.xml".</param>
/// <param name="sampleNames">Sample names appended to the column title line.</param>
/// <param name="extraHeaders">Additional header lines written verbatim; may be null.</param>
/// <param name="writer">Destination writer.</param>
/// <param name="qualityThreshold">Threshold used to name and describe the quality filter.</param>
/// <param name="denovoQualityThreshold">When set, the DQ FORMAT field is declared.</param>
/// <param name="sizeThreshold">When set, the CNV size filter is declared.</param>
/// <returns>The deserialized genome metadata.</returns>
private static GenomeMetadata WriteVcfHeader(List<CanvasSegment> segments, double? diploidCoverage, string wholeGenomeFastaDirectory,
    List<string> sampleNames, List<string> extraHeaders, BgzipOrStreamWriter writer, int qualityThreshold, int? denovoQualityThreshold,
    int? sizeThreshold)
{
    // Write the VCF header:
    writer.WriteLine("##fileformat=VCFv4.1");
    writer.WriteLine($"##source={CanvasVersionInfo.NameString} {CanvasVersionInfo.VersionString}");
    string referencePath = Path.Combine(wholeGenomeFastaDirectory, "genome.fa");
    writer.WriteLine($"##reference={referencePath}");

    // Write ##OverallPloidy and ##DiploidCoverage for a single-sample file (where it makes sense to do so):
    bool isSingleSample = sampleNames.Count == 1;
    if (isSingleSample)
    {
        AddPloidyAndCoverageHeaders(writer, segments, diploidCoverage);
    }

    if (extraHeaders != null)
    {
        foreach (string extraHeader in extraHeaders)
        {
            writer.WriteLine(extraHeader);
        }
    }

    var genome = new GenomeMetadata();
    string genomeSizePath = Path.Combine(wholeGenomeFastaDirectory, "GenomeSize.xml");
    genome.Deserialize(new FileLocation(genomeSizePath));
    foreach (GenomeMetadata.SequenceMetadata contig in genome.Contigs())
    {
        writer.WriteLine($"##contig=<ID={contig.Name},length={contig.Length}>");
    }

    // ALT declarations
    writer.WriteLine("##ALT=<ID=DUP,Description=\"Region of elevated copy number relative to the reference\">");
    WriteHeaderAllAltCnTags(writer);

    // FILTER declarations
    writer.WriteLine($"##FILTER=<ID=q{qualityThreshold},Description=\"Quality below {qualityThreshold}\">");
    if (sizeThreshold.HasValue)
    {
        string sizeFilterName = CanvasFilter.GetCnvSizeFilter(sizeThreshold.Value, out var sizeFilterThreshold);
        writer.WriteLine($"##FILTER=<ID={sizeFilterName},Description=\"Length shorter than {sizeFilterThreshold.Number} {sizeFilterThreshold.Units}\">");
    }
    writer.WriteLine("##FILTER=<ID=FailedFT,Description=\"Sample-level filter failed in all the samples\">");

    // INFO declarations, then the FORMAT declarations that precede the optional DQ field
    string[] fixedDeclarations =
    {
        "##INFO=<ID=CIEND,Number=2,Type=Integer,Description=\"Confidence interval around END for imprecise variants\">",
        "##INFO=<ID=CIPOS,Number=2,Type=Integer,Description=\"Confidence interval around POS for imprecise variants\">",
        "##INFO=<ID=CNVLEN,Number=1,Type=Integer,Description=\"Number of reference positions spanned by this CNV\">",
        "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">",
        "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">",
        "##INFO=<ID=SUBCLONAL,Number=0,Type=Flag,Description=\"Subclonal variant\">",
        "##INFO=<ID=COMMONCNV,Number=0,Type=Flag,Description=\"Common CNV variant identified from pre-specified bed intervals\">",
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
        "##FORMAT=<ID=RC,Number=1,Type=Float,Description=\"Mean counts per bin in the region\">",
        "##FORMAT=<ID=BC,Number=1,Type=Float,Description=\"Number of bins in the region\">",
        "##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number genotype for imprecise events\">",
        "##FORMAT=<ID=MCC,Number=1,Type=Integer,Description=\"Major chromosome count (equal to copy number for LOH regions)\">",
        "##FORMAT=<ID=MCCQ,Number=1,Type=Float,Description=\"Major chromosome count quality score\">",
        "##FORMAT=<ID=QS,Number=1,Type=Float,Description=\"Phred-scaled quality score. If CN is reference then this is -10log10(prob(variant)) otherwise this is -10log10(prob(no variant).\">"
    };
    foreach (string declaration in fixedDeclarations)
    {
        writer.WriteLine(declaration);
    }

    if (denovoQualityThreshold.HasValue)
    {
        writer.WriteLine($"##FORMAT=<ID=DQ,Number=1,Type=Float,Description=\"De novo quality. Threshold for passing de novo call: {denovoQualityThreshold}\">");
    }
    writer.WriteLine("##FORMAT=<ID=FT,Number=1,Type=String,Description=\"Sample filter, 'PASS' indicates that all filters have passed for this sample\">");

    // Column title line: the nine fixed VCF columns followed by the sample names.
    string[] fixedColumns = { "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT" };
    writer.WriteLine(string.Join("\t", fixedColumns.Concat(sampleNames)));

    SanityCheckChromosomeNames(genome, segments);
    return genome;
}
/// <summary>
/// Outputs the copy number calls to a text file.
/// </summary>
/// <remarks>
/// Writes the VCF header followed by one record per segment, chromosome by chromosome in the
/// order of <c>genome.Sequences</c>, where the genome is loaded from "GenomeSize.xml" in
/// <paramref name="wholeGenomeFastaDirectory"/>.
/// </remarks>
/// <param name="outVcfPath">Path of the VCF file to create.</param>
/// <param name="segments">Segments to report.</param>
/// <param name="wholeGenomeFastaDirectory">Directory that is combined with "genome.fa" and "GenomeSize.xml".</param>
/// <param name="sampleName">Name of the single sample column.</param>
/// <param name="extraHeaders">Additional header lines written verbatim; may be null.</param>
/// <param name="ploidy">Source of the reference copy number of each segment; 2 is used when this is null.</param>
/// <param name="qualityThreshold">Threshold used to name and describe the quality filter.</param>
public static void WriteSegments(string outVcfPath, List<CanvasSegment> segments, string wholeGenomeFastaDirectory, string sampleName,
    List<string> extraHeaders, PloidyInfo ploidy, int qualityThreshold = 10)
{
    using (BgzipOrStreamWriter writer = new BgzipOrStreamWriter(outVcfPath))
    {
        // Write the VCF header:
        writer.WriteLine("##fileformat=VCFv4.1");
        writer.WriteLine($"##source={CanvasVersionInfo.NameString} {CanvasVersionInfo.VersionString}");
        writer.WriteLine($"##reference={Path.Combine(wholeGenomeFastaDirectory, "genome.fa")}");
        foreach (string header in extraHeaders ?? new List<string>())
        {
            writer.WriteLine(header);
        }

        GenomeMetadata genome = new GenomeMetadata();
        genome.Deserialize(Path.Combine(wholeGenomeFastaDirectory, "GenomeSize.xml"));
        foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
        {
            writer.WriteLine($"##contig=<ID={chromosome.Name},length={chromosome.Length}>");
        }

        string qualityFilter = $"q{qualityThreshold}";
        writer.WriteLine("##ALT=<ID=CNV,Description=\"Copy number variable region\">");
        writer.WriteLine($"##FILTER=<ID={qualityFilter},Description=\"Quality below {qualityThreshold}\">");
        writer.WriteLine("##FILTER=<ID=L10kb,Description=\"Length shorter than 10kb\">");
        writer.WriteLine("##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">");
        writer.WriteLine("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
        writer.WriteLine("##INFO=<ID=CNVLEN,Number=1,Type=Integer,Description=\"Number of reference positions spanned by this CNV\">");
        writer.WriteLine("##FORMAT=<ID=RC,Number=1,Type=Float,Description=\"Mean counts per bin in the region\">");
        writer.WriteLine("##FORMAT=<ID=BC,Number=1,Type=Float,Description=\"Number of bins in the region\">");
        writer.WriteLine("##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number genotype for imprecise events\">");
        writer.WriteLine("##FORMAT=<ID=MCC,Number=1,Type=Integer,Description=\"Major chromosome count (equal to copy number for LOH regions)\">");
        writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sampleName);

        SanityCheckChromosomeNames(genome, segments);

        foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
        {
            foreach (CanvasSegment segment in segments)
            {
                if (!segment.Chr.Equals(chromosome.Name, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                int referenceCopyNumber = ploidy?.GetReferenceCopyNumber(segment) ?? 2;
                CnvType cnvType = segment.GetCnvType(referenceCopyNumber);

                // From vcf 4.1 spec:
                //     If any of the ALT alleles is a symbolic allele (an angle-bracketed ID String "<ID>") then the padding base is required and POS denotes the
                //     coordinate of the base preceding the polymorphism.
                string alternateAllele = cnvType.ToAltId();
                int position = (alternateAllele.StartsWith("<") && alternateAllele.EndsWith(">")) ? segment.Begin : segment.Begin + 1;
                writer.Write($"{segment.Chr}\t{position}\tCanvas:{cnvType.ToVcfId()}:{segment.Chr}:{segment.Begin + 1}-{segment.End}\t");

                // The interpolated text is written as-is. Passing it on as a composite format
                // string would parse it a second time and fail on any '{' or '}' in the data.
                writer.Write($"N\t{alternateAllele}\t{segment.QScore}\t{segment.Filter}\t");
                if (cnvType != CnvType.Reference)
                {
                    writer.Write($"SVTYPE={cnvType.ToSvType()};");
                }
                writer.Write($"END={segment.End}");
                if (cnvType != CnvType.Reference)
                {
                    writer.Write($";CNVLEN={segment.End - segment.Begin}");
                }

                // FORMAT field
                writer.Write("\tRC:BC:CN");
                if (segment.MajorChromosomeCount.HasValue)
                {
                    writer.Write(":MCC");
                }

                // Sample field; the mean count is rounded to a whole number, midpoints away from zero.
                var roundedMeanCount = Math.Round(segment.MeanCount, 0, MidpointRounding.AwayFromZero);
                writer.Write($"\t{roundedMeanCount}:{segment.BinCount}:{segment.CopyNumber}");
                if (segment.MajorChromosomeCount.HasValue)
                {
                    writer.Write($":{segment.MajorChromosomeCount}");
                }
                writer.WriteLine();
            }
        }
    }
}