/// <summary>
/// Loads per-chromosome reference ploidy intervals from a tab-delimited .bed file opened through
/// <c>GzipReader</c>.
/// </summary>
/// <remarks>
/// A line starting with "##ExpectedSexChromosomeKaryotype" is stored, trimmed, in <c>HeaderLine</c>.
/// Empty lines and any other line starting with '#' are skipped. Every remaining line is split on tabs
/// and must provide the chromosome (column 0), start (column 1), end (column 2) and ploidy (column 4);
/// column 3 is not read. A one-line summary is written to the console after loading.
/// </remarks>
/// <param name="filePath">Path of the ploidy .bed file. A null or empty path yields an empty result.</param>
/// <returns>The parsed intervals, grouped by chromosome name.</returns>
/// <exception cref="IndexOutOfRangeException">A data line has fewer than five tab-separated columns.</exception>
/// <exception cref="FormatException">A start, end or ploidy column is not an integer.</exception>
public static PloidyInfo LoadPloidyFromBedFile(string filePath)
{
    PloidyInfo ploidy = new PloidyInfo();
    if (string.IsNullOrEmpty(filePath))
    {
        // No ploidy file supplied: return an empty PloidyInfo instead of failing inside the reader.
        return ploidy;
    }

    // The .bed file is machine-readable, so numbers are parsed independently of the current culture.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    int count = 0;
    using (GzipReader reader = new GzipReader(filePath))
    {
        string fileLine;
        while ((fileLine = reader.ReadLine()) != null)
        {
            if (fileLine.StartsWith("##ExpectedSexChromosomeKaryotype", StringComparison.Ordinal))
            {
                ploidy.HeaderLine = fileLine.Trim();
                continue;
            }
            if (fileLine.Length == 0 || fileLine[0] == '#')
            {
                continue;
            }

            string[] bits = fileLine.Split('\t');
            string chromosome = bits[0];
            if (!ploidy.PloidyByChromosome.TryGetValue(chromosome, out var intervals))
            {
                intervals = new List<PloidyInterval>();
                ploidy.PloidyByChromosome[chromosome] = intervals;
            }

            PloidyInterval interval = new PloidyInterval();
            interval.Start = int.Parse(bits[1], invariant);
            interval.End = int.Parse(bits[2], invariant);
            interval.Ploidy = int.Parse(bits[4], invariant);
            intervals.Add(interval);
            count++;
        }
    }
    Console.WriteLine("Reference ploidy: Loaded {0} intervals across {1} chromosomes", count, ploidy.PloidyByChromosome.Keys.Count);
    return ploidy;
}
/// <summary>
/// Loads reference ploidy intervals for one sample from a ploidy VCF file.
/// </summary>
/// <remarks>
/// Each record contributes one interval: the start is the record's reference position, the end is
/// its "END" info field, and the ploidy is the "CN" entry of the selected genotype column. A "CN"
/// value of "." is read as ploidy 2. <c>HeaderLine</c> is set to all VCF header lines joined by a
/// single space.
/// </remarks>
/// <param name="vcfPath">Path of the VCF file to read.</param>
/// <param name="sampleName">
/// Name of the sample whose genotype column is read. When null or empty, the first genotype column
/// (index 0) is used and no sample validation is performed.
/// </param>
/// <returns>The parsed intervals, grouped by chromosome name.</returns>
/// <exception cref="ArgumentException">
/// <paramref name="sampleName"/> is given but the file has fewer than two samples or does not contain
/// exactly one sample with that name; or a record's genotype column has no "CN" entry.
/// </exception>
public static PloidyInfo LoadPloidyFromVcfFile(string vcfPath, string sampleName)
{
    // VCF content is machine-readable, so numbers are parsed independently of the current culture.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    int sampleIndex = 0;
    PloidyInfo ploidy = new PloidyInfo();
    using (VcfReader reader = new VcfReader(vcfPath))
    {
        if (!sampleName.IsNullOrEmpty())
        {
            if (reader.Samples.Count < 2)
            {
                throw new ArgumentException(
                    $"File '{vcfPath}' must be a multi-sample sample VCF containing > 1 samples");
            }
            if (reader.Samples.Count(x => x == sampleName) != 1)
            {
                throw new ArgumentException(
                    $"File '{vcfPath}' should contain one genotypes column corresponding to sample {sampleName}");
            }
            sampleIndex = reader.Samples.IndexOf(sampleName);
        }

        ploidy.HeaderLine = string.Join(" ", reader.HeaderLines);

        while (reader.GetNextVariant(out VcfVariant record))
        {
            string chromosome = record.ReferenceName;
            if (!ploidy.PloidyByChromosome.TryGetValue(chromosome, out var intervals))
            {
                intervals = new List<PloidyInterval>();
                ploidy.PloidyByChromosome[chromosome] = intervals;
            }

            PloidyInterval interval = new PloidyInterval(chromosome);
            interval.Start = record.ReferencePosition;
            interval.End = int.Parse(record.InfoFields["END"], invariant);

            var genotypeColumn = record.GenotypeColumns[sampleIndex];
            if (!genotypeColumn.TryGetValue("CN", out var value))
            {
                throw new ArgumentException($"File '{vcfPath}' must contain one genotype CN column!");
            }
            // A missing copy number (".") falls back to ploidy 2.
            interval.Ploidy = value == "." ? 2 : int.Parse(value, invariant);
            intervals.Add(interval);
        }
    }
    return ploidy;
}
/// <summary>
/// Writes the segments of a single sample to a VCF file: first the header (via <c>WriteVcfHeader</c>),
/// then the variant records (via <c>WriteVariants</c>).
/// </summary>
/// <param name="outVcfPath">Path handed to the <c>BgzipOrStreamWriter</c> that receives the output.</param>
/// <param name="segments">Segments of the sample; a copy of this list is passed to <c>WriteVariants</c>.</param>
/// <param name="diploidCoverage">Forwarded to <c>WriteVcfHeader</c>.</param>
/// <param name="wholeGenomeFastaDirectory">Forwarded to <c>WriteVcfHeader</c>.</param>
/// <param name="sampleName">Name of the single sample.</param>
/// <param name="extraHeaders">Forwarded to <c>WriteVcfHeader</c>.</param>
/// <param name="ploidy">Reference ploidy forwarded to <c>WriteVariants</c>.</param>
/// <param name="qualityThreshold">Forwarded to <c>WriteVcfHeader</c>.</param>
/// <param name="isPedigreeInfoSupplied">Forwarded to <c>WriteVariants</c>.</param>
/// <param name="denovoQualityThreshold">Forwarded to both the header and the variant writer.</param>
public static void WriteSegments(string outVcfPath, List<CanvasSegment> segments, double? diploidCoverage,
    string wholeGenomeFastaDirectory, string sampleName, List<string> extraHeaders, PloidyInfo ploidy,
    int qualityThreshold, bool isPedigreeInfoSupplied, int? denovoQualityThreshold = null)
{
    using (var vcfWriter = new BgzipOrStreamWriter(outVcfPath))
    {
        var singleSampleName = new List<string> { sampleName };
        var genomeMetadata = WriteVcfHeader(segments, diploidCoverage, wholeGenomeFastaDirectory,
            singleSampleName, extraHeaders, qualityThreshold, vcfWriter, denovoQualityThreshold);

        // Wrap the one sample in the per-sample list shape expected by WriteVariants.
        var segmentsBySample = new List<List<CanvasSegment>> { segments.ToList() };
        WriteVariants(segmentsBySample, ploidy, genomeMetadata, vcfWriter, isPedigreeInfoSupplied,
            denovoQualityThreshold);
    }
}
/// <summary>
/// Loads per-chromosome reference ploidy intervals from a tab-delimited .bed file opened through
/// <c>GzipReader</c>.
/// </summary>
/// <remarks>
/// Any line starting with "##" is stored, trimmed, in <c>HeaderLine</c>; when several such lines exist
/// only the last one is kept. Empty lines and other lines starting with '#' are skipped. Every
/// remaining line is split on tabs and must provide the chromosome (column 0), start (column 1),
/// end (column 2) and ploidy (column 4); column 3 is not read. A one-line summary is written to the
/// console after loading.
/// </remarks>
/// <param name="filePath">Path of the ploidy .bed file. A null or empty path yields an empty result.</param>
/// <returns>The parsed intervals, grouped by chromosome name.</returns>
/// <exception cref="IndexOutOfRangeException">A data line has fewer than five tab-separated columns.</exception>
/// <exception cref="FormatException">A start, end or ploidy column is not an integer.</exception>
public static PloidyInfo LoadPloidyFromBedFile(string filePath)
{
    PloidyInfo ploidy = new PloidyInfo();
    if (string.IsNullOrEmpty(filePath))
    {
        return ploidy;
    }

    // The .bed file is machine-readable, so numbers are parsed independently of the current culture.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    int count = 0;
    using (GzipReader reader = new GzipReader(filePath))
    {
        string fileLine;
        while ((fileLine = reader.ReadLine()) != null)
        {
            // Save anything that looks like a vcf header line (we will add it to the output vcf).
            // TODO: support adding multiple header lines to the output vcf
            if (fileLine.StartsWith("##", StringComparison.Ordinal))
            {
                ploidy.HeaderLine = fileLine.Trim();
                continue;
            }
            if (fileLine.Length == 0 || fileLine[0] == '#')
            {
                continue;
            }

            string[] bits = fileLine.Split('\t');
            string chromosome = bits[0];
            if (!ploidy.PloidyByChromosome.TryGetValue(chromosome, out var intervals))
            {
                intervals = new List<PloidyInterval>();
                ploidy.PloidyByChromosome[chromosome] = intervals;
            }

            PloidyInterval interval = new PloidyInterval(chromosome);
            interval.Start = int.Parse(bits[1], invariant);
            interval.End = int.Parse(bits[2], invariant);
            interval.Ploidy = int.Parse(bits[4], invariant);
            intervals.Add(interval);
            count++;
        }
    }
    Console.WriteLine("Reference ploidy: Loaded {0} intervals across {1} chromosomes", count, ploidy.PloidyByChromosome.Keys.Count);
    return ploidy;
}
/// <summary>
/// Writes the segments of a single sample to a VCF file: first the header (via <c>WriteVcfHeader</c>),
/// then the variant records (via <c>WriteVariants</c>).
/// </summary>
/// <remarks>
/// Each segment is wrapped in a <c>SampleMap</c> keyed by a <c>SampleId</c> built from
/// <paramref name="sampleName"/>. <paramref name="isPedigreeInfoSupplied"/> is accepted but not used here.
/// </remarks>
/// <param name="outVcfPath">Path handed to the <c>BgzipOrStreamWriter</c> that receives the output.</param>
/// <param name="segments">Segments of the sample.</param>
/// <param name="diploidCoverage">Forwarded to <c>WriteVcfHeader</c>.</param>
/// <param name="wholeGenomeFastaDirectory">Forwarded to <c>WriteVcfHeader</c>.</param>
/// <param name="sampleName">Name of the single sample.</param>
/// <param name="extraHeaders">Forwarded to <c>WriteVcfHeader</c>.</param>
/// <param name="ploidy">Reference ploidy of the sample, forwarded to <c>WriteVariants</c>.</param>
/// <param name="qualityThreshold">Forwarded to <c>WriteVcfHeader</c>.</param>
/// <param name="isPedigreeInfoSupplied">Unused by this overload.</param>
/// <param name="denovoQualityThreshold">Forwarded to both the header and the variant writer.</param>
/// <param name="sizeThreshold">Forwarded to <c>WriteVcfHeader</c>.</param>
public static void WriteSegments(string outVcfPath, List<CanvasSegment> segments, double? diploidCoverage,
    string wholeGenomeFastaDirectory, string sampleName, List<string> extraHeaders, PloidyInfo ploidy,
    int qualityThreshold, bool isPedigreeInfoSupplied, int? denovoQualityThreshold, int? sizeThreshold)
{
    using (var vcfWriter = new BgzipOrStreamWriter(outVcfPath))
    {
        var genomeMetadata = WriteVcfHeader(segments, diploidCoverage, wholeGenomeFastaDirectory,
            new List<string> { sampleName }, extraHeaders, vcfWriter, qualityThreshold,
            denovoQualityThreshold, sizeThreshold);

        var id = new SampleId(sampleName);
        // One single-entry map per segment; the projection stays lazy, as WriteVariants enumerates it.
        var perSegmentMaps = segments.Select(segment =>
        {
            var map = new SampleMap<CanvasSegment>();
            map.Add(id, segment);
            return map;
        });

        WriteVariants(perSegmentMaps, new List<PloidyInfo> { ploidy }, genomeMetadata, vcfWriter,
            denovoQualityThreshold);
    }
}
/// <summary>
/// Loads reference ploidy intervals for one sample from a ploidy VCF file.
/// </summary>
/// <remarks>
/// Each record contributes one interval: the start is the record's reference position, the end is
/// its "END" info field, and the ploidy is the "CN" entry of the selected genotype column. A "CN"
/// value of "." is read as ploidy 2. <c>HeaderLine</c> is deliberately left unset (see the note in
/// the body).
/// </remarks>
/// <param name="vcfPath">Path of the VCF file to read.</param>
/// <param name="sampleIndex">Index of the genotype column to read from each record.</param>
/// <returns>The parsed intervals, grouped by chromosome name.</returns>
/// <exception cref="ArgumentException">A record's selected genotype column has no "CN" entry.</exception>
private static PloidyInfo LoadPloidyFromVcfFile(string vcfPath, int sampleIndex)
{
    // VCF content is machine-readable, so numbers are parsed independently of the current culture.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    PloidyInfo ploidy = new PloidyInfo();
    using (VcfReader reader = new VcfReader(vcfPath))
    {
        // NOTE: the ploidy.vcf header lines need to be updated to include reference sex chromosome
        // info for one or multiple samples, so they are not copied into ploidy.HeaderLine here.
        while (reader.GetNextVariant(out var record))
        {
            string chromosome = record.ReferenceName;
            if (!ploidy.PloidyByChromosome.TryGetValue(chromosome, out var intervals))
            {
                intervals = new List<PloidyInterval>();
                ploidy.PloidyByChromosome[chromosome] = intervals;
            }

            PloidyInterval interval = new PloidyInterval(chromosome)
            {
                Start = record.ReferencePosition,
                End = int.Parse(record.InfoFields["END"], invariant)
            };

            var genotypeColumn = record.GenotypeColumns[sampleIndex];
            if (!genotypeColumn.TryGetValue("CN", out var value))
            {
                throw new ArgumentException($"File '{vcfPath}' must contain one genotype CN column!");
            }
            // A missing copy number (".") falls back to ploidy 2.
            interval.Ploidy = value == "." ? 2 : int.Parse(value, invariant);
            intervals.Add(interval);
        }
    }
    return ploidy;
}
/// <summary>
/// Loads per-chromosome reference ploidy intervals from a tab-delimited .bed file opened through
/// <c>GzipReader</c>.
/// </summary>
/// <remarks>
/// A line starting with "##ExpectedSexChromosomeKaryotype" is stored, trimmed, in <c>HeaderLine</c>.
/// Empty lines and any other line starting with '#' are skipped. Every remaining line is split on tabs
/// and must provide the chromosome (column 0), start (column 1), end (column 2) and ploidy (column 4);
/// column 3 is not read. A one-line summary is written to the console after loading.
/// </remarks>
/// <param name="filePath">Path of the ploidy .bed file. A null or empty path yields an empty result.</param>
/// <returns>The parsed intervals, grouped by chromosome name.</returns>
/// <exception cref="IndexOutOfRangeException">A data line has fewer than five tab-separated columns.</exception>
/// <exception cref="FormatException">A start, end or ploidy column is not an integer.</exception>
public static PloidyInfo LoadPloidyFromBedFile(string filePath)
{
    PloidyInfo ploidy = new PloidyInfo();
    if (string.IsNullOrEmpty(filePath))
    {
        // No ploidy file supplied: return an empty PloidyInfo instead of failing inside the reader.
        return ploidy;
    }

    // The .bed file is machine-readable, so numbers are parsed independently of the current culture.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    int count = 0;
    using (GzipReader reader = new GzipReader(filePath))
    {
        string fileLine;
        while ((fileLine = reader.ReadLine()) != null)
        {
            if (fileLine.StartsWith("##ExpectedSexChromosomeKaryotype", StringComparison.Ordinal))
            {
                ploidy.HeaderLine = fileLine.Trim();
                continue;
            }
            if (fileLine.Length == 0 || fileLine[0] == '#')
            {
                continue;
            }

            string[] bits = fileLine.Split('\t');
            string chromosome = bits[0];
            if (!ploidy.PloidyByChromosome.TryGetValue(chromosome, out var intervals))
            {
                intervals = new List<PloidyInterval>();
                ploidy.PloidyByChromosome[chromosome] = intervals;
            }

            PloidyInterval interval = new PloidyInterval();
            interval.Start = int.Parse(bits[1], invariant);
            interval.End = int.Parse(bits[2], invariant);
            interval.Ploidy = int.Parse(bits[4], invariant);
            intervals.Add(interval);
            count++;
        }
    }
    Console.WriteLine("Reference ploidy: Loaded {0} intervals across {1} chromosomes", count, ploidy.PloidyByChromosome.Keys.Count);
    return ploidy;
}
/// <summary>
/// Looks up the reference copy number for a genomic interval.
/// </summary>
/// <param name="referencePloidy">Reference ploidy to query; may be null.</param>
/// <param name="chrom">Chromosome of the interval.</param>
/// <param name="start">Start of the interval.</param>
/// <param name="end">End of the interval.</param>
/// <param name="defaultPloidy">Value returned when <paramref name="referencePloidy"/> is null.</param>
/// <returns>
/// <paramref name="defaultPloidy"/> when no reference ploidy is available; otherwise the result of
/// <c>GetReferenceCopyNumber</c> for a temporary segment spanning the interval.
/// </returns>
private static int GetPloidy(PloidyInfo referencePloidy, string chrom, int start, int end, int defaultPloidy = 2)
{
    if (referencePloidy != null)
    {
        // GetReferenceCopyNumber takes a segment, so build a throwaway one with no counts.
        var probe = new CanvasSegment(chrom, start, end, new List<float>());
        return referencePloidy.GetReferenceCopyNumber(probe);
    }

    return defaultPloidy;
}
/// <summary>
/// Generate a tabular file with information about coverage and allele frequency for each chunk of the genome.
/// This file can be used to generate a pretty plot of coverage versus MAF.
/// </summary>
/// <remarks>
/// The genome is walked in windows ("points") of 100000 bases for every chromosome listed in
/// GenomeSize.xml that has at least one segment. Each window produces one tab-separated row. The
/// chromosome, start and end columns are always written; the remaining columns are written only when
/// the window collected at least as many counts as the threshold returned by
/// <c>GetMinimumBinsForCoveragePlotPoint</c>.
/// </remarks>
/// <param name="segments">Segments supplying copy number, bin counts and allele frequencies.</param>
/// <param name="normalDiploidCoverage">
/// Coverage used to normalize the median count (2 * median / coverage). Must have a value when
/// <paramref name="segments"/> is non-empty.
/// </param>
/// <param name="referencePloidy">Optional reference ploidy; when null the reference ploidy column is 2.</param>
/// <param name="filePath">Output file; created or overwritten.</param>
/// <param name="referenceFolder">Folder containing GenomeSize.xml.</param>
/// <exception cref="Illumina.Common.IlluminaException">
/// Segments were supplied but <paramref name="normalDiploidCoverage"/> has no value.
/// </exception>
public static void WriteCoveragePlotData(List <CanvasSegment> segments, double?normalDiploidCoverage, PloidyInfo referencePloidy, string filePath, string referenceFolder)
{
    if (segments.Any() && !normalDiploidCoverage.HasValue)
    {
        throw new Illumina.Common.IlluminaException("normal diploid coverage must be specified");
    }
    // Window size, in bases, of one output row.
    int pointLength = 100000;
    int minimumBinsToPlot = GetMinimumBinsForCoveragePlotPoint(segments, pointLength);
    Dictionary <string, List <CanvasSegment> > segmentsByChromosome = GetSegmentsByChromosome(segments);
    GenomeMetadata genome = new GenomeMetadata();
    genome.Deserialize(Path.Combine(referenceFolder, "GenomeSize.xml"));
    // Scratch lists, cleared and refilled for every window.
    List <float> counts = new List <float>();
    List <float> MAF = new List <float>();
    List <float> VF = new List <float>();
    using (FileStream stream = new FileStream(filePath, FileMode.Create, FileAccess.Write))
    using (StreamWriter writer = new StreamWriter(stream))
    {
        writer.NewLine = "\n";
        // Header row: fixed columns followed by one column per variant frequency bin.
        writer.Write("#Chromosome\tStart\tEnd\tCopyNumber\tMajorChromosomeCount\tMedianHits\tNormalizedCoverage\tMedianMinorAlleleFrequency\tReferencePloidy\t");
        for (int i = 0; i < NumberVariantFrequencyBins; i++)
        {
            writer.Write("VariantFrequencyBin{0}\t", i);
        }
        writer.WriteLine();
        foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
        {
            // Chromosomes without any segment produce no rows at all.
            if (!segmentsByChromosome.ContainsKey(chromosome.Name))
            {
                continue;
            }
            int pointStartPos = 0; // 0-based start
            while (pointStartPos < chromosome.Length)
            {
                int pointEndPos = (int)Math.Min(chromosome.Length, pointStartPos + pointLength); // 1-based end
                counts.Clear();
                MAF.Clear();
                VF.Clear();
                // Overlapping bases per "copyNumber majorChromosomeCount" key, and per copy number alone.
                Dictionary <string, long> CopyNumberAndChromCount = new Dictionary <string, long>();
                Dictionary <int, long> basesByCopyNumber = new Dictionary <int, long>();
                // Accumulate counts and MAF from the segments:
                List <CanvasSegment> chrSegments = new List <CanvasSegment>();
                if (segmentsByChromosome.ContainsKey(chromosome.Name))
                {
                    chrSegments = segmentsByChromosome[chromosome.Name];
                }
                List <CanvasSegment> overlapSegments = new List <CanvasSegment>();
                foreach (CanvasSegment segment in chrSegments)
                {
                    if (segment.Begin > pointEndPos)
                    {
                        continue;
                    }
                    if (segment.End < pointStartPos)
                    {
                        continue;
                    }
                    // Number of bases the segment shares with this window (0 when they only touch).
                    int weight = Math.Min(segment.End, pointEndPos) - Math.Max(segment.Begin, pointStartPos);
                    // The second part of the key is empty when MajorChromosomeCount has no value.
                    string key = string.Format("{0} {1}", segment.CopyNumber, segment.MajorChromosomeCount);
                    if (!CopyNumberAndChromCount.ContainsKey(key))
                    {
                        CopyNumberAndChromCount[key] = 0;
                    }
                    CopyNumberAndChromCount[key] += weight;
                    if (!basesByCopyNumber.ContainsKey(segment.CopyNumber))
                    {
                        basesByCopyNumber[segment.CopyNumber] = 0;
                    }
                    basesByCopyNumber[segment.CopyNumber] += weight;
                    overlapSegments.Add(segment);
                }
                // Note the most common copy number:
                long bestCount = 0;
                int majorCopyNumber = 0;
                foreach (int key in basesByCopyNumber.Keys)
                {
                    if (basesByCopyNumber[key] > bestCount)
                    {
                        bestCount = basesByCopyNumber[key];
                        majorCopyNumber = key;
                    }
                }
                // Find the most common major chromosome count, for the most common copy number:
                int?majorChromosomeCount = null;
                bestCount = 0;
                foreach (string key in CopyNumberAndChromCount.Keys)
                {
                    string[] bits = key.Split();
                    // Skip keys whose major chromosome count was missing.
                    if (bits[1].Length == 0)
                    {
                        continue;
                    }
                    if (int.Parse(bits[0]) != majorCopyNumber)
                    {
                        continue;
                    }
                    long count = CopyNumberAndChromCount[key];
                    // Only strictly smaller totals are skipped, so on a tie the later key wins.
                    if (count < bestCount)
                    {
                        continue;
                    }
                    bestCount = count;
                    majorChromosomeCount = int.Parse(bits[1]);
                }
                // Note allele frequency and coverage info, for all overlap segments that match (more or less)
                // the most common copy number:
                foreach (CanvasSegment segment in overlapSegments)
                {
                    // "More or less" means: on the same side of copy number 2 (below, equal, or above).
                    if ((majorCopyNumber == 2 && segment.CopyNumber != 2) || (majorCopyNumber < 2 && segment.CopyNumber >= 2) || (majorCopyNumber > 2 && segment.CopyNumber <= 2))
                    {
                        continue;
                    }
                    float segLength = segment.End - segment.Begin;
                    // Add counts to the overall list:
                    // The index range is the window's position within the segment, scaled to the number of counts.
                    int firstIndex = 0;
                    if (pointStartPos > segment.Begin)
                    {
                        firstIndex = (int)((float)segment.Counts.Count * (pointStartPos - segment.Begin) / segLength);
                    }
                    int lastIndex = segment.Counts.Count;
                    if (pointEndPos < segment.End)
                    {
                        lastIndex = (int)((float)segment.Counts.Count * (pointEndPos - segment.Begin) / segLength);
                    }
                    for (int index = firstIndex; index < lastIndex; index++)
                    {
                        counts.Add(segment.Counts[index]);
                    }
                    // Add MAF to the overall list:
                    // Same proportional index mapping, applied to the allele frequency list.
                    firstIndex = 0;
                    if (pointStartPos > segment.Begin)
                    {
                        firstIndex = (int)((float)segment.Alleles.Frequencies.Count * (pointStartPos - segment.Begin) / segLength);
                    }
                    lastIndex = segment.Alleles.Frequencies.Count;
                    if (pointEndPos < segment.End)
                    {
                        lastIndex = (int)((float)segment.Alleles.Frequencies.Count * (pointEndPos - segment.Begin) / segLength);
                    }
                    for (int index = firstIndex; index < lastIndex; index++)
                    {
                        float tempMAF = segment.Alleles.Frequencies[index];
                        // VF keeps the raw frequency; MAF keeps it folded into [0, 0.5].
                        VF.Add(tempMAF);
                        if (tempMAF > 0.5)
                        {
                            tempMAF = 1 - tempMAF;
                        }
                        MAF.Add(tempMAF);
                    }
                }
                // Write output for this point:
                writer.Write("{0}\t{1}\t{2}\t", chromosome.Name, pointStartPos, pointEndPos);
                // Write counts if we have reasonable amounts of data; write MAF if we have reasonable amounts of data.
                // (Note: Observed that for germline data on chrY we often had well under 100 counts given the new, smaller bin size)
                if (counts.Count >= minimumBinsToPlot)
                {
                    writer.Write("{0}\t", majorCopyNumber);
                    writer.Write("{0}\t", majorChromosomeCount);
                    counts.Sort();
                    // Upper median: the element at index Count / 2 of the sorted list.
                    double medianHits = counts[counts.Count / 2];
                    writer.Write("{0:F2}\t", medianHits);
                    double normalizedCount = 2 * medianHits / normalDiploidCoverage.Value;
                    writer.Write("{0:F2}\t", normalizedCount);
                    // The median MAF column is left empty when fewer than 10 frequencies were collected.
                    if (MAF.Count >= 10)
                    {
                        MAF.Sort();
                        writer.Write("{0}\t", MAF[MAF.Count / 2]);
                    }
                    else
                    {
                        writer.Write("\t");
                    }
                    int refPloidy = 2;
                    if (referencePloidy != null && referencePloidy.PloidyByChromosome.ContainsKey(chromosome.Name))
                    {
                        // When several intervals overlap the window, the last one in the list wins.
                        foreach (var interval in referencePloidy.PloidyByChromosome[chromosome.Name])
                        {
                            if (interval.Start <= pointEndPos && interval.End >= pointStartPos)
                            {
                                refPloidy = interval.Ploidy;
                            }
                        }
                    }
                    writer.Write("{0}\t", refPloidy);
                    if (VF.Count >= 10)
                    {
                        // bin VF
                        // Bins are 0.01 wide; values past the last bin are clamped into it.
                        float[] vfDistribution = new float[NumberVariantFrequencyBins];
                        foreach (float vf in VF)
                        {
                            int binNumber = Math.Min(vfDistribution.Length - 1, (int)Math.Floor(vf / 0.01));
                            vfDistribution[binNumber]++;
                        }
                        // Each bin is reported as a percentage of all collected frequencies.
                        for (int i = 0; i < vfDistribution.Length; i++)
                        {
                            vfDistribution[i] = vfDistribution[i] / (float)VF.Count * 100.0f;
                            writer.Write("{0:F2}\t", vfDistribution[i]);
                        }
                    }
                    else
                    {
                        for (int i = 0; i < NumberVariantFrequencyBins; i++)
                        {
                            writer.Write("\t");
                        }
                    }
                }
                writer.WriteLine();
                pointStartPos += pointLength;
            }
        }
    }
}
private const double EMLikelihoodThres = 1; // Controls when to update means
#endregion

/// <summary>
/// Reads the expected ploidy for sex chromosomes from a .bed file and stores it in
/// <c>ReferencePloidy</c>. With this information, copy number 2 on chrX can, for instance, be
/// recognized as a GAIN (not REF) call for a male (XY) sample.
/// </summary>
/// <param name="filePath">Path of the ploidy .bed file, passed to <c>PloidyInfo.LoadPloidyFromBedFile</c>.</param>
public void LoadReferencePloidy(string filePath)
{
    // Trace the call on the console before loading.
    string trace = string.Format(">>>LoadReferencePloidy({0})", filePath);
    Console.WriteLine(trace);

    PloidyInfo loadedPloidy = PloidyInfo.LoadPloidyFromBedFile(filePath);
    ReferencePloidy = loadedPloidy;
}
/// <summary>
/// Generate a tabular file with information about coverage and allele frequency for each chunk of the genome.
/// This file can be used to generate a pretty plot of coverage versus MAF.
/// </summary>
/// <remarks>
/// The genome is walked in windows ("points") of 100000 bases for every chromosome listed in
/// GenomeSize.xml for which <c>IsMito()</c> is false. Each window produces one tab-separated row.
/// The chromosome, start and end columns are always written; the remaining columns are written only
/// when the window collected at least 30 counts.
/// </remarks>
/// <param name="segments">Segments supplying copy number, bin counts and variant frequencies.</param>
/// <param name="normalDiploidCoverage">Coverage used to normalize the median count (2 * median / coverage).</param>
/// <param name="referencePloidy">Optional reference ploidy; when null the reference ploidy column is 2.</param>
/// <param name="filePath">Output file, opened with <c>StreamWriter</c>.</param>
/// <param name="referenceFolder">Folder containing GenomeSize.xml.</param>
static public void WriteCoveragePlotData(List<CanvasSegment> segments, double normalDiploidCoverage, PloidyInfo referencePloidy, string filePath, string referenceFolder)
{
    Dictionary<string, List<CanvasSegment>> segmentsByChromosome = GetSegmentsByChromosome(segments);
    GenomeMetadata genome = new GenomeMetadata();
    genome.Deserialize(Path.Combine(referenceFolder, "GenomeSize.xml"));
    // Window size, in bases, of one output row.
    int pointLength = 100000;
    // Scratch lists, cleared and refilled for every window.
    List<float> counts = new List<float>();
    List<float> MAF = new List<float>();
    List<float> VF = new List<float>();
    using (StreamWriter writer = new StreamWriter(filePath))
    {
        writer.NewLine = "\n";
        // Header row: fixed columns followed by one column per variant frequency bin.
        writer.Write("#Chromosome\tStart\tEnd\tCopyNumber\tMajorChromosomeCount\tMedianHits\tNormalizedCoverage\tMedianMinorAlleleFrequency\tReferencePloidy\t");
        for (int i = 0; i < NumberVariantFrequencyBins; i++)
        {
            writer.Write("VariantFrequencyBin{0}\t", i);
        }
        writer.WriteLine();
        foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
        {
            if (chromosome.IsMito()) continue;
            int pointStartPos = 0; // 0-based start
            while (pointStartPos < chromosome.Length)
            {
                int pointEndPos = (int)Math.Min(chromosome.Length, pointStartPos + pointLength); // 1-based end
                counts.Clear();
                MAF.Clear();
                VF.Clear();
                // Overlapping bases per "copyNumber majorChromosomeCount" key, and per copy number alone.
                Dictionary<string, long> CopyNumberAndChromCount = new Dictionary<string, long>();
                Dictionary<int, long> basesByCopyNumber = new Dictionary<int, long>();
                // Accumulate counts and MAF from the segments:
                // Chromosomes without segments keep the empty list and still get (mostly empty) rows.
                List<CanvasSegment> chrSegments = new List<CanvasSegment>();
                if (segmentsByChromosome.ContainsKey(chromosome.Name)) chrSegments = segmentsByChromosome[chromosome.Name];
                List<CanvasSegment> overlapSegments = new List<CanvasSegment>();
                foreach (CanvasSegment segment in chrSegments)
                {
                    if (segment.Begin > pointEndPos) continue;
                    if (segment.End < pointStartPos) continue;
                    // Number of bases the segment shares with this window (0 when they only touch).
                    int weight = Math.Min(segment.End, pointEndPos) - Math.Max(segment.Begin, pointStartPos);
                    // The second part of the key is empty when MajorChromosomeCount has no value.
                    string key = string.Format("{0} {1}", segment.copyNumber, segment.MajorChromosomeCount);
                    if (!CopyNumberAndChromCount.ContainsKey(key)) CopyNumberAndChromCount[key] = 0;
                    CopyNumberAndChromCount[key] += weight;
                    if (!basesByCopyNumber.ContainsKey(segment.copyNumber)) basesByCopyNumber[segment.copyNumber] = 0;
                    basesByCopyNumber[segment.copyNumber] += weight;
                    overlapSegments.Add(segment);
                }
                // Note the most common copy number:
                long bestCount = 0;
                int majorCopyNumber = 0;
                foreach (int key in basesByCopyNumber.Keys)
                {
                    if (basesByCopyNumber[key] > bestCount)
                    {
                        bestCount = basesByCopyNumber[key];
                        majorCopyNumber = key;
                    }
                }
                // Find the most common major chromosome count, for the most common copy number:
                int? majorChromosomeCount = null;
                bestCount = 0;
                foreach (string key in CopyNumberAndChromCount.Keys)
                {
                    string[] bits = key.Split();
                    // Skip keys whose major chromosome count was missing.
                    if (bits[1].Length == 0) continue;
                    if (int.Parse(bits[0]) != majorCopyNumber) continue;
                    long count = CopyNumberAndChromCount[key];
                    // Only strictly smaller totals are skipped, so on a tie the later key wins.
                    if (count < bestCount) continue;
                    bestCount = count;
                    majorChromosomeCount = int.Parse(bits[1]);
                }
                // Note allele frequency and coverage info, for all overlap segments that match (more or less)
                // the most common copy number:
                foreach (CanvasSegment segment in overlapSegments)
                {
                    // "More or less" means: on the same side of copy number 2 (below, equal, or above).
                    if ((majorCopyNumber == 2 && segment.copyNumber != 2) || (majorCopyNumber < 2 && segment.copyNumber >= 2) || (majorCopyNumber > 2 && segment.copyNumber <= 2)) continue;
                    float segLength = segment.End - segment.Begin;
                    // Add counts to the overall list:
                    // The index range is the window's position within the segment, scaled to the number of counts.
                    int firstIndex = 0;
                    if (pointStartPos > segment.Begin)
                    {
                        firstIndex = (int)((float)segment.Counts.Count * (pointStartPos - segment.Begin) / segLength);
                    }
                    int lastIndex = segment.Counts.Count;
                    if (pointEndPos < segment.End)
                    {
                        lastIndex = (int)((float)segment.Counts.Count * (pointEndPos - segment.Begin) / segLength);
                    }
                    for (int index = firstIndex; index < lastIndex; index++) counts.Add(segment.Counts[index]);
                    // Add MAF to the overall list:
                    // Same proportional index mapping, applied to the variant frequency list.
                    firstIndex = 0;
                    if (pointStartPos > segment.Begin)
                    {
                        firstIndex = (int)((float)segment.VariantFrequencies.Count * (pointStartPos - segment.Begin) / segLength);
                    }
                    lastIndex = segment.VariantFrequencies.Count;
                    if (pointEndPos < segment.End)
                    {
                        lastIndex = (int)((float)segment.VariantFrequencies.Count * (pointEndPos - segment.Begin) / segLength);
                    }
                    for (int index = firstIndex; index < lastIndex; index++)
                    {
                        float tempMAF = segment.VariantFrequencies[index];
                        // VF keeps the raw frequency; MAF keeps it folded into [0, 0.5].
                        VF.Add(tempMAF);
                        if (tempMAF > 0.5) tempMAF = 1 - tempMAF;
                        MAF.Add(tempMAF);
                    }
                }
                // Write output for this point:
                writer.Write("{0}\t{1}\t{2}\t", chromosome.Name, pointStartPos, pointEndPos);
                // Write counts if we have reasonable amounts of data; write MAF if we have reasonable amounts of data.
                // (Note: Observed that for germline data on chrY we often had well under 100 counts given the new, smaller bin size)
                if (counts.Count >= 30)
                {
                    writer.Write("{0}\t", majorCopyNumber);
                    writer.Write("{0}\t", majorChromosomeCount);
                    counts.Sort();
                    // Upper median: the element at index Count / 2 of the sorted list.
                    double medianHits = counts[counts.Count / 2];
                    writer.Write("{0:F2}\t", medianHits);
                    double normalizedCount = 2 * medianHits / normalDiploidCoverage;
                    writer.Write("{0:F2}\t", normalizedCount);
                    // The median MAF column is left empty when fewer than 10 frequencies were collected.
                    if (MAF.Count >= 10)
                    {
                        MAF.Sort();
                        writer.Write("{0}\t", MAF[MAF.Count / 2]);
                    }
                    else
                    {
                        writer.Write("\t");
                    }
                    int refPloidy = 2;
                    if (referencePloidy != null && referencePloidy.PloidyByChromosome.ContainsKey(chromosome.Name))
                    {
                        // When several intervals overlap the window, the last one in the list wins.
                        foreach (var interval in referencePloidy.PloidyByChromosome[chromosome.Name])
                        {
                            if (interval.Start <= pointEndPos && interval.End >= pointStartPos)
                            {
                                refPloidy = interval.Ploidy;
                            }
                        }
                    }
                    writer.Write("{0}\t", refPloidy);
                    if (VF.Count >= 10)
                    {
                        // bin VF
                        // Bins are 0.01 wide; values past the last bin are clamped into it.
                        float[] vfDistribution = new float[NumberVariantFrequencyBins];
                        foreach (float vf in VF)
                        {
                            int binNumber = Math.Min(vfDistribution.Length - 1, (int)Math.Floor(vf / 0.01));
                            vfDistribution[binNumber]++;
                        }
                        // Each bin is reported as a percentage of all collected frequencies.
                        for (int i = 0; i < vfDistribution.Length; i++)
                        {
                            vfDistribution[i] = vfDistribution[i] / (float)VF.Count * 100.0f;
                            writer.Write("{0:F2}\t", vfDistribution[i]);
                        }
                    }
                    else
                    {
                        for (int i = 0; i < NumberVariantFrequencyBins; i++) writer.Write("\t");
                    }
                }
                writer.WriteLine();
                pointStartPos += pointLength;
            }
        }
    }
}
/// <summary>
/// Outputs the copy number calls to a VCF (v4.1) text file.
/// </summary>
/// <remarks>
/// Records are written in the chromosome order of GenomeSize.xml. A segment is a reference call when
/// its copy number equals the reference copy number, unless ploidy is reported and the segment has
/// copy number 2 with a major chromosome count other than 1 (LOH). Reference calls are skipped unless
/// <paramref name="reportAllSites"/> is set. The FILTER column is "q10" for QScore below 10, "L10kb"
/// for segments shorter than 10000 bases, both joined by ';', or "PASS".
/// </remarks>
/// <param name="outVcfPath">File to write to.</param>
/// <param name="segments">List of segments to write out.</param>
/// <param name="reference">Folder containing genome.fa and GenomeSize.xml.</param>
/// <param name="sampleName">Sample name used as the genotype column header.</param>
/// <param name="extraHeaders">Additional header lines written verbatim; may be null.</param>
/// <param name="reportPloidy">Write the MCC format field (tumour-normal workflow).</param>
/// <param name="ploidy">Reference ploidy; when null the reference copy number is 2 everywhere.</param>
/// <param name="reportAllSites">Also write reference calls.</param>
/// <param name="reportGermlineGenotype">Write the GT format field (resequencing workflow).</param>
/// <exception cref="ArgumentException">
/// Both <paramref name="reportGermlineGenotype"/> and <paramref name="reportPloidy"/> are true.
/// </exception>
public static void WriteSegments(string outVcfPath, List<CanvasSegment> segments, string reference, string sampleName,
    List<string> extraHeaders, bool reportPloidy, PloidyInfo ploidy, bool reportAllSites = false, bool reportGermlineGenotype = false)
{
    // report GT for resequencing workflow and MCC for tumour-normal workflow
    if (reportGermlineGenotype && reportPloidy)
    {
        throw new ArgumentException("WriteSegments VCF file output error: reportGermlineGenotype and reportPloidy can not be both true");
    }

    using (BgzipOrStreamWriter writer = new BgzipOrStreamWriter(outVcfPath))
    {
        // Write the VCF header:
        writer.WriteLine("##fileformat=VCFv4.1");
        writer.WriteLine("##source=Isas," + CanvasCommon.CanvasVersionInfo.NameString + " " + CanvasCommon.CanvasVersionInfo.VersionString);
        writer.WriteLine("##reference={0}", Path.Combine(reference, "genome.fa"));
        if (extraHeaders != null)
        {
            foreach (string header in extraHeaders)
            {
                writer.WriteLine(header);
            }
        }

        GenomeMetadata genome = new GenomeMetadata();
        genome.Deserialize(Path.Combine(reference, "GenomeSize.xml"));
        foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
        {
            writer.WriteLine("##contig=<ID={0},length={1}>", chromosome.Name, chromosome.Length);
        }
        writer.WriteLine("##ALT=<ID=CNV,Description=\"Copy number variable region\">");
        writer.WriteLine("##FILTER=<ID=q10,Description=\"Quality below 10\">");
        writer.WriteLine("##FILTER=<ID=L10kb,Description=\"Length shorter than 10kb\">");
        writer.WriteLine("##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">");
        writer.WriteLine("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
        writer.WriteLine("##INFO=<ID=CNVLEN,Number=1,Type=Integer,Description=\"Number of reference positions spanned by this CNV\">");
        if (reportGermlineGenotype)
        {
            writer.WriteLine("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
        }
        writer.WriteLine("##FORMAT=<ID=RC,Number=1,Type=Float,Description=\"Mean counts per bin in the region\">");
        writer.WriteLine("##FORMAT=<ID=BC,Number=1,Type=Float,Description=\"Number of bins in the region\">");
        writer.WriteLine("##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number genotype for imprecise events\">");
        if (reportPloidy)
        {
            writer.WriteLine("##FORMAT=<ID=MCC,Number=1,Type=Integer,Description=\"Major chromosome count (equal to copy number for LOH regions)\">");
        }
        writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sampleName);

        SanityCheckChromosomeNames(genome, segments);

        foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
        {
            foreach (CanvasSegment segment in segments)
            {
                if (!segment.Chr.Equals(chromosome.Name, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                int referenceCN = ploidy != null ? ploidy.GetReferenceCopyNumber(segment) : 2;

                bool isReferenceCall = segment.CopyNumber == referenceCN;
                if (reportPloidy && segment.CopyNumber == 2 && segment.MajorChromosomeCount.HasValue && segment.MajorChromosomeCount != 1)
                {
                    // If we're reporting ploidy and there's LOH, this isn't a reference call.
                    isReferenceCall = false;
                }

                // We can skip reporting of reference sites:
                if (!reportAllSites && isReferenceCall)
                {
                    continue;
                }

                string filter = null;
                if (segment.QScore < 10)
                {
                    filter = "q10";
                }
                if (segment.End - segment.Begin < 10000)
                {
                    filter = filter != null ? filter + ";L10kb" : "L10kb";
                }
                if (filter == null)
                {
                    filter = "PASS";
                }

                string cnvtype;
                if (segment.CopyNumber < referenceCN)
                {
                    cnvtype = "LOSS";
                }
                else if (segment.CopyNumber > referenceCN)
                {
                    cnvtype = "GAIN";
                }
                else
                {
                    cnvtype = "REF";
                }

                // From vcf 4.1 spec:
                //     If any of the ALT alleles is a symbolic allele (an angle-bracketed ID String "<ID>") then the padding base is required and POS denotes the
                //     coordinate of the base preceding the polymorphism.
                writer.Write("{0}\t{1}\tCanvas:{2}:{0}:{3}-{4}\t", segment.Chr, isReferenceCall ? segment.Begin + 1 : segment.Begin, cnvtype, segment.Begin + 1, segment.End);
                writer.Write("N\t{0}\t{1}\t{2}\t", isReferenceCall ? "." : "<CNV>", segment.QScore, filter);

                // INFO field
                // NOTE(review): this part reads segment.copyNumber while the rest of the method reads
                // segment.CopyNumber; both are kept as they were - confirm they always agree.
                if (segment.copyNumber != referenceCN)
                {
                    writer.Write("SVTYPE=CNV;");
                }
                else if (!isReferenceCall)
                {
                    writer.Write("SVTYPE=LOH;");
                }
                if (segment.copyNumber != referenceCN || !isReferenceCall)
                {
                    writer.Write("END={0};CNVLEN={1}", segment.End, segment.End - segment.Begin);
                }
                else
                {
                    writer.Write("END={0}", segment.End);
                }

                // FORMAT field
                writer.Write(reportGermlineGenotype ? "\tGT:RC:BC:CN" : "\tRC:BC:CN");
                bool writeMcc = reportPloidy && segment.MajorChromosomeCount.HasValue;
                if (writeMcc)
                {
                    writer.Write(":MCC");
                }

                // writing GT for resequencing workflow
                if (reportGermlineGenotype)
                {
                    writer.Write("\t{0}/{1}:", segment.MajorChromosomeCount, segment.CopyNumber);
                }
                else
                {
                    writer.Write("\t");
                }
                writer.Write("{0}:{1}:{2}", Math.Round(segment.MeanCount, 0, MidpointRounding.AwayFromZero), segment.BinCount, segment.CopyNumber);

                // writing MCC for tumour-normal workflow
                if (writeMcc)
                {
                    writer.Write(":{0}", segment.MajorChromosomeCount);
                }
                writer.WriteLine();
            }
        }
    }
}
/// <summary>
/// Writes the segments of several samples to one VCF file: first the header (via
/// <c>WriteVcfHeader</c>), then the variant records (via <c>WriteVariants</c>).
/// </summary>
/// <remarks>
/// The header is built from the first sample's segments and from <c>GetMean</c> of the per-sample
/// diploid coverages.
/// </remarks>
/// <param name="outVcfPath">Path handed to the <c>BgzipOrStreamWriter</c> that receives the output.</param>
/// <param name="segments">One segment list per sample; must contain at least one sample.</param>
/// <param name="diploidCoverage">Per-sample diploid coverage values, reduced with <c>GetMean</c>.</param>
/// <param name="wholeGenomeFastaDirectory">Forwarded to <c>WriteVcfHeader</c>.</param>
/// <param name="sampleNames">Sample names forwarded to <c>WriteVcfHeader</c>.</param>
/// <param name="extraHeaders">Forwarded to <c>WriteVcfHeader</c>.</param>
/// <param name="ploidy">Reference ploidy forwarded to <c>WriteVariants</c>.</param>
/// <param name="qualityThreshold">Forwarded to <c>WriteVcfHeader</c>.</param>
/// <param name="isPedigreeInfoSupplied">Forwarded to <c>WriteVariants</c>.</param>
/// <param name="denovoQualityThreshold">Forwarded to both the header and the variant writer.</param>
public static void WriteMultiSampleSegments(string outVcfPath, List<List<CanvasSegment>> segments,
    List<double?> diploidCoverage, string wholeGenomeFastaDirectory, List<string> sampleNames,
    List<string> extraHeaders, PloidyInfo ploidy, int qualityThreshold, bool isPedigreeInfoSupplied = true,
    int? denovoQualityThreshold = null)
{
    using (var vcfWriter = new BgzipOrStreamWriter(outVcfPath))
    {
        var firstSampleSegments = segments.First();
        var meanDiploidCoverage = GetMean(diploidCoverage);
        var genomeMetadata = WriteVcfHeader(firstSampleSegments, meanDiploidCoverage, wholeGenomeFastaDirectory,
            sampleNames, extraHeaders, qualityThreshold, vcfWriter, denovoQualityThreshold);

        WriteVariants(segments, ploidy, genomeMetadata, vcfWriter, isPedigreeInfoSupplied, denovoQualityThreshold);
    }
}
/// <summary>
/// Outputs the copy number calls of all samples through <paramref name="writer"/>, one record per
/// segment index, in the chromosome order of <paramref name="genome"/>.
/// </summary>
/// <remarks>
/// All samples are expected to hold the same number of segments, aligned by index; the first
/// sample's segment supplies the record's chromosome and is the one passed to <c>WriteInfoField</c>.
/// The record's CNV type is Reference when every sample is Reference; Loss, Gain or
/// LossOfHeterozygosity when every sample is either Reference or that one type; ComplexCnv otherwise.
/// When no pedigree information is supplied and there is more than one sample, the first sample's
/// segment filter is set to "PASS" if at least one sample's segment at that index passes.
/// </remarks>
/// <param name="segments">One segment list per sample; must contain at least one sample.</param>
/// <param name="ploidy">Reference ploidy; when null the reference copy number is 2 everywhere.</param>
/// <param name="genome">Genome metadata whose sequences define the output order.</param>
/// <param name="writer">Destination of the records.</param>
/// <param name="isPedigreeInfoSupplied">Whether pedigree information was supplied (see remarks).</param>
/// <param name="denovoQualityThreshold">Forwarded to <c>WriteInfoField</c>.</param>
private static void WriteVariants(IReadOnlyCollection<List<CanvasSegment>> segments, PloidyInfo ploidy,
    GenomeMetadata genome, BgzipOrStreamWriter writer, bool isPedigreeInfoSupplied = true,
    int? denovoQualityThreshold = null)
{
    int nSamples = segments.Count;
    bool isMultisample = nSamples > 1;

    foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
    {
        List<CanvasSegment> firstSample = segments.First();
        for (int segmentIndex = 0; segmentIndex < firstSample.Count; segmentIndex++)
        {
            CanvasSegment firstSampleSegment = firstSample[segmentIndex];

            // Any(predicate) is required here: Select(predicate).Any() only tests whether there are
            // samples at all, which made this override fire regardless of the actual filters.
            if (!isPedigreeInfoSupplied && isMultisample &&
                segments.Any(sample => sample[segmentIndex].Filter == "PASS"))
            {
                firstSampleSegment.Filter = "PASS";
            }

            if (!firstSampleSegment.Chr.Equals(chromosome.Name, StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            List<CanvasSegment> currentSegments = segments.Select(sample => sample[segmentIndex]).ToList();
            var cnvTypes = currentSegments
                .Select(segment => segment.GetCnvType(ploidy?.GetReferenceCopyNumber(segment) ?? 2))
                .ToList();

            CnvType cnvType;
            if (cnvTypes.TrueForAll(x => x == CnvType.Reference))
            {
                cnvType = CnvType.Reference;
            }
            else if (cnvTypes.TrueForAll(x => x == CnvType.Reference || x == CnvType.Loss))
            {
                cnvType = CnvType.Loss;
            }
            else if (cnvTypes.TrueForAll(x => x == CnvType.Reference || x == CnvType.Gain))
            {
                cnvType = CnvType.Gain;
            }
            else if (cnvTypes.TrueForAll(x => x == CnvType.Reference || x == CnvType.LossOfHeterozygosity))
            {
                cnvType = CnvType.LossOfHeterozygosity;
            }
            else
            {
                cnvType = CnvType.ComplexCnv;
            }

            WriteInfoField(writer, firstSampleSegment, cnvType, denovoQualityThreshold, isMultisample: isMultisample);

            // FORMAT field
            if (nSamples == 1)
            {
                WriteSingleSampleInfo(writer, firstSampleSegment);
            }
            else
            {
                WriteFormatField(writer, currentSegments);
            }
        }
    }
}
/// <summary>
/// Outputs the copy number calls to a VCF (v4.1) text file.
/// </summary>
/// <remarks>
/// Records are written in the chromosome order of GenomeSize.xml; chromosome names are matched
/// case-insensitively. The FILTER column is taken from each segment's <c>Filter</c> value. The MCC
/// format field is written only for segments that have a major chromosome count.
/// </remarks>
/// <param name="outVcfPath">File to write to.</param>
/// <param name="segments">List of segments to write out.</param>
/// <param name="wholeGenomeFastaDirectory">Folder containing genome.fa and GenomeSize.xml.</param>
/// <param name="sampleName">Sample name used as the genotype column header.</param>
/// <param name="extraHeaders">Additional header lines written verbatim; may be null.</param>
/// <param name="ploidy">Reference ploidy; when null the reference copy number is 2 everywhere.</param>
/// <param name="qualityThreshold">Threshold named in the quality FILTER header line.</param>
public static void WriteSegments(string outVcfPath, List<CanvasSegment> segments, string wholeGenomeFastaDirectory,
    string sampleName, List<string> extraHeaders, PloidyInfo ploidy, int qualityThreshold = 10)
{
    using (BgzipOrStreamWriter writer = new BgzipOrStreamWriter(outVcfPath))
    {
        // Write the VCF header:
        writer.WriteLine("##fileformat=VCFv4.1");
        writer.WriteLine($"##source={CanvasVersionInfo.NameString} {CanvasVersionInfo.VersionString}");
        writer.WriteLine($"##reference={Path.Combine(wholeGenomeFastaDirectory, "genome.fa")}");
        if (extraHeaders != null)
        {
            foreach (string header in extraHeaders)
            {
                writer.WriteLine(header);
            }
        }

        GenomeMetadata genome = new GenomeMetadata();
        genome.Deserialize(Path.Combine(wholeGenomeFastaDirectory, "GenomeSize.xml"));
        foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
        {
            writer.WriteLine($"##contig=<ID={chromosome.Name},length={chromosome.Length}>");
        }

        string qualityFilter = $"q{qualityThreshold}";
        writer.WriteLine("##ALT=<ID=CNV,Description=\"Copy number variable region\">");
        writer.WriteLine($"##FILTER=<ID={qualityFilter},Description=\"Quality below {qualityThreshold}\">");
        writer.WriteLine("##FILTER=<ID=L10kb,Description=\"Length shorter than 10kb\">");
        writer.WriteLine("##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">");
        writer.WriteLine("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
        writer.WriteLine("##INFO=<ID=CNVLEN,Number=1,Type=Integer,Description=\"Number of reference positions spanned by this CNV\">");
        writer.WriteLine("##FORMAT=<ID=RC,Number=1,Type=Float,Description=\"Mean counts per bin in the region\">");
        writer.WriteLine("##FORMAT=<ID=BC,Number=1,Type=Float,Description=\"Number of bins in the region\">");
        writer.WriteLine("##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number genotype for imprecise events\">");
        writer.WriteLine("##FORMAT=<ID=MCC,Number=1,Type=Integer,Description=\"Major chromosome count (equal to copy number for LOH regions)\">");
        writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sampleName);

        SanityCheckChromosomeNames(genome, segments);

        foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
        {
            foreach (CanvasSegment segment in segments)
            {
                if (!segment.Chr.Equals(chromosome.Name, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                int referenceCopyNumber = ploidy?.GetReferenceCopyNumber(segment) ?? 2;
                CnvType cnvType = segment.GetCnvType(referenceCopyNumber);
                bool isVariant = cnvType != CnvType.Reference;

                // From vcf 4.1 spec:
                //     If any of the ALT alleles is a symbolic allele (an angle-bracketed ID String "<ID>") then the padding base is required and POS denotes the
                //     coordinate of the base preceding the polymorphism.
                string alternateAllele = cnvType.ToAltId();
                bool isSymbolicAllele = alternateAllele.StartsWith("<") && alternateAllele.EndsWith(">");
                int position = isSymbolicAllele ? segment.Begin : segment.Begin + 1;
                writer.Write($"{segment.Chr}\t{position}\tCanvas:{cnvType.ToVcfId()}:{segment.Chr}:{segment.Begin + 1}-{segment.End}\t");
                // The text is already interpolated; it must not be passed on as a composite format
                // string with arguments, or any brace in a value would be re-interpreted.
                writer.Write($"N\t{alternateAllele}\t{segment.QScore}\t{segment.Filter}\t");

                // INFO field
                if (isVariant)
                {
                    writer.Write($"SVTYPE={cnvType.ToSvType()};");
                }
                writer.Write($"END={segment.End}");
                if (isVariant)
                {
                    writer.Write($";CNVLEN={segment.End - segment.Begin}");
                }

                // FORMAT field
                writer.Write("\tRC:BC:CN");
                if (segment.MajorChromosomeCount.HasValue)
                {
                    writer.Write(":MCC");
                }
                writer.Write("\t{0}:{1}:{2}", Math.Round(segment.MeanCount, 0, MidpointRounding.AwayFromZero), segment.BinCount, segment.CopyNumber);
                if (segment.MajorChromosomeCount.HasValue)
                {
                    writer.Write(":{0}", segment.MajorChromosomeCount);
                }
                writer.WriteLine();
            }
        }
    }
}
/// <summary>
/// Writes the copy number calls for one sample to a VCF (v4.1) file. The header section is written
/// first, then one record per segment, grouped by chromosome in genome-metadata order.
/// </summary>
/// <param name="outVcfPath">Path of the VCF file to create.</param>
/// <param name="segments">Segments to report; each is matched to a chromosome by case-insensitive name.</param>
/// <param name="wholeGenomeFastaDirectory">Directory used to build the "genome.fa" reference path and to load "GenomeSize.xml".</param>
/// <param name="sampleName">Name written as the sample column header.</param>
/// <param name="extraHeaders">Additional header lines written verbatim after the ##reference line; may be null.</param>
/// <param name="ploidy">Source of the reference copy number per segment; when null a reference copy number of 2 is used.</param>
/// <param name="qualityThreshold">Only used to name and describe the quality FILTER header line; each record's FILTER value is taken from the segment as-is.</param>
public static void WriteSegments(string outVcfPath, List<CanvasSegment> segments, string wholeGenomeFastaDirectory, string sampleName,
    List<string> extraHeaders, PloidyInfo ploidy, int qualityThreshold = 10)
{
    using (BgzipOrStreamWriter writer = new BgzipOrStreamWriter(outVcfPath))
    {
        // Write the VCF header:
        writer.WriteLine("##fileformat=VCFv4.1");
        writer.WriteLine($"##source={CanvasVersionInfo.NameString} {CanvasVersionInfo.VersionString}");
        writer.WriteLine($"##reference={Path.Combine(wholeGenomeFastaDirectory, "genome.fa")}");
        foreach (string header in extraHeaders ?? new List<string>())
        {
            writer.WriteLine(header);
        }

        GenomeMetadata genome = new GenomeMetadata();
        genome.Deserialize(Path.Combine(wholeGenomeFastaDirectory, "GenomeSize.xml"));
        foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
        {
            writer.WriteLine($"##contig=<ID={chromosome.Name},length={chromosome.Length}>");
        }

        string qualityFilter = $"q{qualityThreshold}";
        writer.WriteLine("##ALT=<ID=CNV,Description=\"Copy number variable region\">");
        writer.WriteLine($"##FILTER=<ID={qualityFilter},Description=\"Quality below {qualityThreshold}\">");
        writer.WriteLine("##FILTER=<ID=L10kb,Description=\"Length shorter than 10kb\">");
        writer.WriteLine("##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">");
        writer.WriteLine("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
        writer.WriteLine("##INFO=<ID=CNVLEN,Number=1,Type=Integer,Description=\"Number of reference positions spanned by this CNV\">");
        writer.WriteLine("##FORMAT=<ID=RC,Number=1,Type=Float,Description=\"Mean counts per bin in the region\">");
        writer.WriteLine("##FORMAT=<ID=BC,Number=1,Type=Float,Description=\"Number of bins in the region\">");
        writer.WriteLine("##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number genotype for imprecise events\">");
        writer.WriteLine("##FORMAT=<ID=MCC,Number=1,Type=Integer,Description=\"Major chromosome count (equal to copy number for LOH regions)\">");
        writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sampleName);

        SanityCheckChromosomeNames(genome, segments);

        // The chromosome loop is outermost so that records are grouped in the genome's sequence order.
        foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
        {
            foreach (CanvasSegment segment in segments)
            {
                bool onThisChromosome = segment.Chr.Equals(chromosome.Name, StringComparison.OrdinalIgnoreCase);
                if (!onThisChromosome)
                {
                    continue;
                }

                int referenceCopyNumber = ploidy?.GetReferenceCopyNumber(segment) ?? 2;
                CnvType cnvType = segment.GetCnvType(referenceCopyNumber);
                bool isReferenceCall = cnvType == CnvType.Reference;

                // From the VCF 4.1 spec: if any of the ALT alleles is a symbolic allele (an angle-bracketed
                // ID string "<ID>") then the padding base is required and POS denotes the coordinate of the
                // base preceding the polymorphism.
                string alternateAllele = cnvType.ToAltId();
                int position = segment.Begin + 1;
                if (alternateAllele.StartsWith("<") && alternateAllele.EndsWith(">"))
                {
                    position = segment.Begin;
                }

                // CHROM, POS, ID
                writer.Write($"{segment.Chr}\t{position}\tCanvas:{cnvType.ToVcfId()}:{segment.Chr}:{segment.Begin + 1}-{segment.End}\t");

                // REF, ALT, QUAL, FILTER. Written as literal text: routing an interpolated string through a
                // composite-format overload would re-parse any braces in the values and could throw.
                writer.Write($"N\t{alternateAllele}\t{segment.QScore}\t{segment.Filter}\t");

                // INFO: SVTYPE and CNVLEN accompany END only for non-reference calls.
                if (!isReferenceCall)
                {
                    writer.Write($"SVTYPE={cnvType.ToSvType()};");
                }
                writer.Write($"END={segment.End}");
                if (!isReferenceCall)
                {
                    writer.Write($";CNVLEN={segment.End - segment.Begin}");
                }

                // FORMAT keys; MCC is present only when the segment has a major chromosome count.
                bool hasMajorChromosomeCount = segment.MajorChromosomeCount.HasValue;
                writer.Write("\tRC:BC:CN");
                if (hasMajorChromosomeCount)
                {
                    writer.Write(":MCC");
                }

                // Sample values in FORMAT-key order. RC is the mean count rounded to a whole number.
                var roundedMeanCount = Math.Round(segment.MeanCount, 0, MidpointRounding.AwayFromZero);
                writer.Write($"\t{roundedMeanCount}:{segment.BinCount}:{segment.CopyNumber}");
                if (hasMajorChromosomeCount)
                {
                    writer.Write($":{segment.MajorChromosomeCount}");
                }
                writer.WriteLine();
            }
        }
    }
}