/// <summary>
/// Writes the fixed VCF columns (CHROM, POS, ID, REF, ALT, QUAL, FILTER) followed by the INFO
/// column for a single segment. No trailing tab or newline is written, so the caller is
/// responsible for any FORMAT/sample columns and for terminating the record.
/// </summary>
/// <param name="writer">Destination the record text is written to.</param>
/// <param name="segment">Segment being reported.</param>
/// <param name="cnvType">CNV type used for the ALT allele, the record ID and the SVTYPE key.</param>
/// <param name="denovoQualityThreshold">
/// Optional threshold; when set and the segment's DQ score is at least this value,
/// a <c>dq{threshold}</c> entry is added to INFO.
/// </param>
/// <param name="isMultisample">When true, QUAL is written as "." and the DQ key is omitted from INFO.</param>
private static void WriteInfoField(BgzipOrStreamWriter writer, CanvasSegment segment, CnvType cnvType, int? denovoQualityThreshold, bool isMultisample)
{
    // VCF is a machine-readable format: numbers must use '.' as the decimal separator
    // regardless of the culture of the thread writing the file.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    // From vcf 4.1 spec:
    //     If any of the ALT alleles is a symbolic allele (an angle-bracketed ID String "<ID>") then the padding base is required and POS denotes the
    //     coordinate of the base preceding the polymorphism.
    string alternateAllele = cnvType.ToAltId();
    bool isSymbolicAllele = alternateAllele.StartsWith("<", StringComparison.Ordinal)
                            && alternateAllele.EndsWith(">", StringComparison.Ordinal);
    int position = isSymbolicAllele ? segment.Begin : segment.Begin + 1;
    writer.Write($"{segment.Chr}\t{position}\tCanvas:{cnvType.ToVcfId()}:{segment.Chr}:{segment.Begin + 1}-{segment.End}\t");

    // QUAL is not reported per record for multi-sample output.
    string qScore = isMultisample ? "." : string.Format(invariant, "{0:F2}", segment.QScore);
    writer.Write($"N\t{alternateAllele}\t{qScore}\t{segment.Filter}\t");

    // INFO column: ';'-terminated optional entries first, then END, then ';'-prefixed entries.
    if (cnvType != CnvType.Reference)
    {
        writer.Write($"SVTYPE={cnvType.ToSvType()};");
    }

    if (segment.IsHeterogeneous)
    {
        writer.Write("SUBCLONAL;");
    }

    if (segment.DQScore.HasValue && !isMultisample)
    {
        string dqScore = string.Format(invariant, "{0}", segment.DQScore.Value);
        writer.Write($"DQ={dqScore};");
    }

    if (denovoQualityThreshold.HasValue && segment.DQScore.HasValue && segment.DQScore >= denovoQualityThreshold)
    {
        writer.Write($"dq{denovoQualityThreshold};");
    }

    writer.Write($"END={segment.End}");

    if (cnvType != CnvType.Reference)
    {
        writer.Write($";CNVLEN={segment.End - segment.Begin}");
    }

    if (segment.StartConfidenceInterval != null)
    {
        writer.Write($";CIPOS={segment.StartConfidenceInterval.Item1},{segment.StartConfidenceInterval.Item2}");
    }

    if (segment.EndConfidenceInterval != null)
    {
        writer.Write($";CIEND={segment.EndConfidenceInterval.Item1},{segment.EndConfidenceInterval.Item2}");
    }
}
/// <summary>
/// Writes the fixed VCF columns (CHROM, POS, ID, REF, ALT, QUAL, FILTER) followed by the INFO
/// column for a record, taking coordinates and flags from the first sample's segment.
/// No trailing tab or newline is written, so the caller is responsible for the
/// FORMAT/sample columns and for terminating the record.
/// </summary>
/// <param name="writer">Destination the record text is written to.</param>
/// <param name="firstSampleSegment">Segment supplying chromosome, coordinates, QUAL and INFO flags.</param>
/// <param name="sampleSetCnvType">CNV type used for the record ID and the SVTYPE key.</param>
/// <param name="alternateAllele">Text written to the ALT column; also decides whether POS is the padding base.</param>
/// <param name="recordLevelFilter">Text written to the FILTER column.</param>
/// <param name="isMultisample">When true, QUAL is written as ".".</param>
private static void WriteColumnsUntilInfoField(BgzipOrStreamWriter writer, CanvasSegment firstSampleSegment, CnvType sampleSetCnvType, string alternateAllele, string recordLevelFilter, bool isMultisample)
{
    // From vcf 4.1 spec:
    //     If any of the ALT alleles is a symbolic allele (an angle-bracketed ID String <ID>) then the padding base is required and POS denotes the
    //     coordinate of the base preceding the polymorphism.
    bool isSymbolicAllele = alternateAllele.StartsWith("<", StringComparison.Ordinal)
                            && alternateAllele.EndsWith(">", StringComparison.Ordinal);
    int position = isSymbolicAllele ? firstSampleSegment.Begin : firstSampleSegment.Begin + 1;
    writer.Write($"{firstSampleSegment.Chr}\t{position}\tCanvas:{sampleSetCnvType.ToVcfId()}:{firstSampleSegment.Chr}:{firstSampleSegment.Begin + 1}-{firstSampleSegment.End}\t");

    // VCF numbers must use '.' as the decimal separator whatever the current culture is,
    // so QUAL is formatted with the invariant culture.
    string qScore = isMultisample
        ? "."
        : string.Format(System.Globalization.CultureInfo.InvariantCulture, "{0:F2}", firstSampleSegment.QScore);
    writer.Write($"N\t{alternateAllele}\t{qScore}\t{recordLevelFilter}\t");

    // INFO column: ';'-terminated optional entries first, then END, then ';'-prefixed entries.
    if (sampleSetCnvType != CnvType.Reference)
    {
        writer.Write($"SVTYPE={sampleSetCnvType.ToSvType()};");
    }

    if (firstSampleSegment.IsHeterogeneous)
    {
        writer.Write("SUBCLONAL;");
    }

    if (firstSampleSegment.IsCommonCnv)
    {
        writer.Write("COMMONCNV;");
    }

    writer.Write($"END={firstSampleSegment.End}");

    if (sampleSetCnvType != CnvType.Reference)
    {
        writer.Write($";CNVLEN={firstSampleSegment.Length}");
    }

    if (firstSampleSegment.StartConfidenceInterval != null)
    {
        writer.Write($";CIPOS={firstSampleSegment.StartConfidenceInterval.Item1},{firstSampleSegment.StartConfidenceInterval.Item2}");
    }

    if (firstSampleSegment.EndConfidenceInterval != null)
    {
        writer.Write($";CIEND={firstSampleSegment.EndConfidenceInterval.Item1},{firstSampleSegment.EndConfidenceInterval.Item2}");
    }
}
/// <summary>
/// Writes the copy number calls for a single sample to a VCF file: header lines first,
/// then one record per segment, emitted in the order of the sequences listed in the
/// genome's GenomeSize.xml.
/// </summary>
/// <param name="outVcfPath">Path of the VCF file to write.</param>
/// <param name="segments">Segments to report.</param>
/// <param name="wholeGenomeFastaDirectory">Directory containing GenomeSize.xml; also used to build the ##reference header.</param>
/// <param name="sampleName">Name written as the sample column header.</param>
/// <param name="extraHeaders">Additional header lines written verbatim after ##reference; may be null.</param>
/// <param name="ploidy">Optional ploidy information; when null a reference copy number of 2 is used.</param>
/// <param name="qualityThreshold">Threshold used to name and describe the quality FILTER header.</param>
public static void WriteSegments(string outVcfPath, List<CanvasSegment> segments, string wholeGenomeFastaDirectory, string sampleName, List<string> extraHeaders, PloidyInfo ploidy, int qualityThreshold = 10)
{
    // VCF numbers must use '.' as the decimal separator whatever the current culture is.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    using (BgzipOrStreamWriter writer = new BgzipOrStreamWriter(outVcfPath))
    {
        // Write the VCF header:
        writer.WriteLine("##fileformat=VCFv4.1");
        writer.WriteLine($"##source={CanvasVersionInfo.NameString} {CanvasVersionInfo.VersionString}");
        writer.WriteLine($"##reference={Path.Combine(wholeGenomeFastaDirectory, "genome.fa")}");
        if (extraHeaders != null)
        {
            foreach (string header in extraHeaders)
            {
                writer.WriteLine(header);
            }
        }

        GenomeMetadata genome = new GenomeMetadata();
        genome.Deserialize(Path.Combine(wholeGenomeFastaDirectory, "GenomeSize.xml"));
        foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
        {
            writer.WriteLine($"##contig=<ID={chromosome.Name},length={chromosome.Length}>");
        }

        string qualityFilter = $"q{qualityThreshold}";
        writer.WriteLine("##ALT=<ID=CNV,Description=\"Copy number variable region\">");
        writer.WriteLine($"##FILTER=<ID={qualityFilter},Description=\"Quality below {qualityThreshold}\">");
        writer.WriteLine("##FILTER=<ID=L10kb,Description=\"Length shorter than 10kb\">");
        writer.WriteLine("##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">");
        writer.WriteLine("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
        writer.WriteLine("##INFO=<ID=CNVLEN,Number=1,Type=Integer,Description=\"Number of reference positions spanned by this CNV\">");
        writer.WriteLine("##FORMAT=<ID=RC,Number=1,Type=Float,Description=\"Mean counts per bin in the region\">");
        writer.WriteLine("##FORMAT=<ID=BC,Number=1,Type=Float,Description=\"Number of bins in the region\">");
        writer.WriteLine("##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number genotype for imprecise events\">");
        writer.WriteLine("##FORMAT=<ID=MCC,Number=1,Type=Integer,Description=\"Major chromosome count (equal to copy number for LOH regions)\">");
        writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sampleName);

        SanityCheckChromosomeNames(genome, segments);

        // Iterate chromosomes in genome order so records are grouped per sequence;
        // within a chromosome, segments keep the order of the input list.
        foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
        {
            foreach (CanvasSegment segment in segments)
            {
                if (!segment.Chr.Equals(chromosome.Name, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                int referenceCopyNumber = ploidy?.GetReferenceCopyNumber(segment) ?? 2;
                CnvType cnvType = segment.GetCnvType(referenceCopyNumber);

                // From vcf 4.1 spec:
                //     If any of the ALT alleles is a symbolic allele (an angle-bracketed ID String "<ID>") then the padding base is required and POS denotes the
                //     coordinate of the base preceding the polymorphism.
                string alternateAllele = cnvType.ToAltId();
                bool isSymbolicAllele = alternateAllele.StartsWith("<", StringComparison.Ordinal)
                                        && alternateAllele.EndsWith(">", StringComparison.Ordinal);
                int position = isSymbolicAllele ? segment.Begin : segment.Begin + 1;
                writer.Write($"{segment.Chr}\t{position}\tCanvas:{cnvType.ToVcfId()}:{segment.Chr}:{segment.Begin + 1}-{segment.End}\t");

                // The text is fully built here and written through the single-argument Write,
                // so braces in the filter or ALT text are never treated as format placeholders.
                string qScore = string.Format(invariant, "{0}", segment.QScore);
                writer.Write($"N\t{alternateAllele}\t{qScore}\t{segment.Filter}\t");

                // INFO column
                if (cnvType != CnvType.Reference)
                {
                    writer.Write($"SVTYPE={cnvType.ToSvType()};");
                }

                writer.Write($"END={segment.End}");
                if (cnvType != CnvType.Reference)
                {
                    writer.Write($";CNVLEN={segment.End - segment.Begin}");
                }

                // FORMAT column; MCC is only declared when the segment carries a value for it.
                bool hasMajorChromosomeCount = segment.MajorChromosomeCount.HasValue;
                writer.Write("\tRC:BC:CN");
                if (hasMajorChromosomeCount)
                {
                    writer.Write(":MCC");
                }

                // Sample column, matching the FORMAT keys above.
                var roundedMeanCount = Math.Round(segment.MeanCount, 0, MidpointRounding.AwayFromZero);
                writer.Write(string.Format(invariant, "\t{0}:{1}:{2}", roundedMeanCount, segment.BinCount, segment.CopyNumber));
                if (hasMajorChromosomeCount)
                {
                    writer.Write($":{segment.MajorChromosomeCount.Value}");
                }

                writer.WriteLine();
            }
        }
    }
}