public void TestUsableSegments()
{
    // Builds 100 synthetic segments and checks how many of them
    // GetUsableSegmentsForModeling returns.
    List<CanvasSegment> segments = new List<CanvasSegment>();
    // NOTE(review): currentPosition is never advanced, so every segment starts at the same
    // coordinate - confirm whether that is intentional.
    int currentPosition = 1000;
    // Generate some segments.  Alternate between:
    // - Usable
    // - Too short
    // - Too few variants
    // - Too short + too few variants
    // A fixed seed keeps the generated counts and variant frequencies identical on every run,
    // so a failure can be reproduced.
    Random RNG = new Random(12345);
    for (int index = 0; index < 100; index++)
    {
        // Odd indices get the short length.
        int length = 100000;
        if (index % 2 == 1)
        {
            length = 2000;
        }
        // Indices 2 and 3 (mod 4) get the small variant count.
        int variantCount = 999;
        if (index % 4 > 1)
        {
            variantCount = 25;
        }
        // One count per 100 bases of segment length.
        List<float> counts = new List<float>();
        for (int countIndex = 0; countIndex < length / 100; countIndex++)
        {
            counts.Add(RNG.Next(1000));
        }
        CanvasSegment segment = new CanvasSegment("chr1", currentPosition, currentPosition + length, counts);
        for (int varIndex = 0; varIndex < variantCount; varIndex++)
        {
            segment.VariantFrequencies.Add(RNG.Next());
        }
        segments.Add(segment);
    }
    var usable = CanvasSomaticCaller.SomaticCaller.GetUsableSegmentsForModeling(segments, false, 50);
    Assert.AreEqual(50, usable.Count);
}
/// <summary>
/// Returns the reference (expected) copy number of a segment. The default is 2; intervals
/// loaded from the reference ploidy bed file can override it, in which case the ploidy that
/// covers the most bases of the segment wins.
/// For XX samples, reference ploidy is 0 on chrY; for XY samples, reference ploidy is 1 on chrX+chrY
/// </summary>
/// <param name="segment">Segment whose expected copy number is wanted.</param>
/// <returns>The ploidy covering the most bases of the segment, or 2 when nothing overrides it.</returns>
public int GetReferenceCopyNumber(CanvasSegment segment)
{
    const int defaultPloidy = 2;
    if (!PloidyByChromosome.ContainsKey(segment.Chr))
    {
        return defaultPloidy;
    }

    // basesPerPloidy[p] = number of segment bases attributed to ploidy p.
    // ASSUMPTION: Bed file ploidy shouldn't be >4 (i.e. we wouldn't handle an XXXXXY genome):
    int[] basesPerPloidy = new int[5];
    basesPerPloidy[defaultPloidy] = segment.End - segment.Begin;
    foreach (PloidyInterval ploidyInterval in this.PloidyByChromosome[segment.Chr])
    {
        if (ploidyInterval.Ploidy != defaultPloidy)
        {
            int left = Math.Max(segment.Begin, ploidyInterval.Start);
            int right = Math.Min(segment.End, ploidyInterval.End);
            int sharedBases = right - left;
            if (left <= segment.End && sharedBases >= 0)
            {
                // Move the overlapping bases from the default bucket to the interval's bucket.
                basesPerPloidy[defaultPloidy] -= sharedBases;
                basesPerPloidy[ploidyInterval.Ploidy] += sharedBases;
            }
        }
    }

    // Pick the first ploidy with the strictly largest positive base count; fall back to 2.
    int winner = defaultPloidy;
    int winnerBases = 0;
    for (int ploidy = 0; ploidy < basesPerPloidy.Length; ploidy++)
    {
        if (basesPerPloidy[ploidy] <= winnerBases)
        {
            continue;
        }
        winnerBases = basesPerPloidy[ploidy];
        winner = ploidy;
    }
    return winner;
}
/// <summary>
/// Looks up the known copy number for a segment: the CN of the first known interval whose
/// overlap with the segment is at least half of the segment's length.
/// </summary>
/// <param name="segment">Segment to look up.</param>
/// <returns>The known copy number, or -1 when the chromosome or a matching interval is not found.</returns>
public int GetKnownCNForSegment(CanvasSegment segment)
{
    // Handle switched chromosome naming convention transparently: try the name as given,
    // then without "chr", then with a "chr" prefix.
    string chr = segment.Chr;
    if (!this.KnownCN.ContainsKey(chr))
    {
        chr = segment.Chr.Replace("chr", "");
    }
    if (!this.KnownCN.ContainsKey(chr))
    {
        chr = "chr" + segment.Chr;
    }
    if (!this.KnownCN.ContainsKey(chr))
    {
        return -1;
    }

    foreach (CNInterval known in this.KnownCN[chr])
    {
        bool disjoint = known.End < segment.Begin || known.Start > segment.End;
        if (disjoint)
        {
            continue;
        }
        int overlap = Math.Min(segment.End, known.End) - Math.Max(segment.Begin, known.Start);
        if (overlap * 2 >= (segment.End - segment.Begin))
        {
            return known.CN;
        }
    }
    return -1;
}
/// <summary>
/// Absorbs a neighboring segment: the coordinates grow to span both segments, and the other
/// segment's counts and variant data are appended to this segment's lists.
/// </summary>
/// <param name="s">Segment to merge in.</param>
public void MergeIn(CanvasSegment s)
{
    // Coordinates become the union of the two ranges.
    this.End = s.End > this.End ? s.End : this.End;
    this.Begin = s.Begin < this.Begin ? s.Begin : this.Begin;

    // Per-bin and per-variant data are appended in the order given.
    this.Counts.AddRange(s.Counts);
    this.VariantFrequencies.AddRange(s.VariantFrequencies);
    this.VariantTotalCoverage.AddRange(s.VariantTotalCoverage);
}
/// <summary>
/// Builds a segment from a run of bins: chromosome and start come from the first bin, end
/// comes from the last bin, and the supplied confidence intervals are attached.
/// </summary>
/// <param name="bins">Bins making up the segment; read with First() and Last().</param>
/// <param name="startConfidenceInterval">Confidence interval stored for the segment start.</param>
/// <param name="endConfidenceInterval">Confidence interval stored for the segment end.</param>
/// <returns>The newly constructed segment.</returns>
private static CanvasSegment CreateSegment(List<SampleGenomicBin> bins, Tuple<int, int> startConfidenceInterval, Tuple<int, int> endConfidenceInterval)
{
    var firstBin = bins.First().GenomicBin;
    var lastBin = bins.Last().GenomicBin;

    var segment = new CanvasSegment(firstBin.Chromosome, firstBin.Interval.Start, lastBin.Interval.End, bins);
    segment.StartConfidenceInterval = startConfidenceInterval;
    segment.EndConfidenceInterval = endConfidenceInterval;
    return segment;
}
public void TestSegmentStats()
{
    // Ten segments share the same five counts; CanvasSegment.ExpectedCount over all of them
    // must come out at 78 (the middle value of the counts - presumably a median-style
    // statistic; confirm against ExpectedCount).
    List<float> counts = new List<float>() { 80, 79, 78, 77, 2 };
    List<CanvasSegment> segments = new List<CanvasSegment>();
    for (int index = 0; index < 10; index++)
    {
        CanvasSegment seg = new CanvasSegment("chr10", 1000000 * index, 1000000 * (index + 1), counts);
        segments.Add(seg);
    }
    double actualCount = CanvasSegment.ExpectedCount(segments);
    // Expected value first, actual second, so a failure message reports them the right way round.
    Assert.AreEqual(78, actualCount, 0.01);
}
/// <summary>
/// Writes the non-sample VCF columns of a single CanvasSegment record: CHROM, POS, ID,
/// REF ("N"), ALT, QUAL, FILTER and the INFO entries. No line terminator is written.
/// </summary>
/// <param name="writer">Destination for the record text.</param>
/// <param name="segment">Segment being reported.</param>
/// <param name="cnvType">CNV type; supplies the ALT allele, the ID component and SVTYPE.</param>
/// <param name="denovoQualityThreshold">Optional threshold; when the segment's DQScore reaches it, a "dq" flag carrying the threshold is added.</param>
/// <param name="isMultisample">When true, QUAL is written as "." and the DQ entry is omitted.</param>
private static void WriteInfoField(BgzipOrStreamWriter writer, CanvasSegment segment, CnvType cnvType, int? denovoQualityThreshold, bool isMultisample)
{
    // From vcf 4.1 spec:
    //     If any of the ALT alleles is a symbolic allele (an angle-bracketed ID String "<ID>") then the padding base is required and POS denotes the
    //     coordinate of the base preceding the polymorphism.
    string alternateAllele = cnvType.ToAltId();
    bool isSymbolicAllele = alternateAllele.StartsWith("<") && alternateAllele.EndsWith(">");
    int position = isSymbolicAllele ? segment.Begin : segment.Begin + 1;
    writer.Write($"{segment.Chr}\t{position}\tCanvas:{cnvType.ToVcfId()}:{segment.Chr}:{segment.Begin + 1}-{segment.End}\t");

    // VCF is machine-readable: format numbers with the invariant culture so the decimal
    // separator is always '.', whatever the locale of the machine running the caller.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    string qScore = isMultisample ? "." : string.Format(invariant, "{0:F2}", segment.QScore);
    writer.Write($"N\t{alternateAllele}\t{qScore}\t{segment.Filter}\t");

    // INFO entries. Everything before END carries its own trailing ';',
    // everything after END carries a leading ';'.
    if (cnvType != CnvType.Reference)
    {
        writer.Write($"SVTYPE={cnvType.ToSvType()};");
    }
    if (segment.IsHeterogeneous)
    {
        writer.Write("SUBCLONAL;");
    }
    if (segment.DQScore.HasValue && !isMultisample)
    {
        writer.Write(string.Format(invariant, "DQ={0};", segment.DQScore.Value));
    }
    if (denovoQualityThreshold.HasValue && segment.DQScore.HasValue && segment.DQScore >= denovoQualityThreshold)
    {
        writer.Write($"dq{denovoQualityThreshold};");
    }
    writer.Write($"END={segment.End}");
    if (cnvType != CnvType.Reference)
    {
        writer.Write($";CNVLEN={segment.End - segment.Begin}");
    }
    if (segment.StartConfidenceInterval != null)
    {
        writer.Write($";CIPOS={segment.StartConfidenceInterval.Item1},{segment.StartConfidenceInterval.Item2}");
    }
    if (segment.EndConfidenceInterval != null)
    {
        writer.Write($";CIEND={segment.EndConfidenceInterval.Item1},{segment.EndConfidenceInterval.Item2}");
    }
}
public void TestMergeSegments()
{
    // Construct several segments, and invoke CanvasSegment.MergeSegments, and ensure that the expected
    // merges (and no others) occurred.
    List<CanvasSegment> allSegments = new List<CanvasSegment>();
    List<float> counts = new List<float>();

    // Chr1 gets five segments and we should merge to three:
    allSegments.Add(new CanvasSegment("chr1", 1000000, 2000000, counts) { CopyNumber = 2 });
    allSegments.Add(new CanvasSegment("chr1", 2000000, 2000100, counts) { CopyNumber = 3 });
    allSegments.Add(new CanvasSegment("chr1", 2000100, 3000000, counts) { CopyNumber = 2 });
    allSegments.Add(new CanvasSegment("chr1", 3000000, 3100000, counts) { CopyNumber = 3 });
    allSegments.Add(new CanvasSegment("chr1", 3100000, 4000000, counts) { CopyNumber = 2 });

    // Chr2 gets segments with a large gap between, so can't merge:
    allSegments.Add(new CanvasSegment("chr2", 1000000, 2000000, counts) { CopyNumber = 2 });
    allSegments.Add(new CanvasSegment("chr2", 3000000, 3000100, counts) { CopyNumber = 3 });
    allSegments.Add(new CanvasSegment("chr2", 4000000, 5000000, counts) { CopyNumber = 2 });

    // Chr3 has three segments that all merge to 1 big one:
    allSegments.Add(new CanvasSegment("chr3", 1000000, 2000000, counts) { CopyNumber = 2 });
    allSegments.Add(new CanvasSegment("chr3", 2000000, 3000000, counts) { CopyNumber = 2 });
    allSegments.Add(new CanvasSegment("chr3", 3000000, 4000000, counts) { CopyNumber = 2 });

    CanvasSegment.MergeSegments(ref allSegments, 50000, 10000);
    Dictionary<string, List<CanvasSegment>> segmentsByChromosome = CanvasSegment.GetSegmentsByChromosome(allSegments);

    // Expected value first, actual second, so a failure message reports them the right way round.
    Assert.AreEqual(3, segmentsByChromosome["chr1"].Count);
    Assert.AreEqual(3, segmentsByChromosome["chr2"].Count);
    Assert.AreEqual(1, segmentsByChromosome["chr3"].Count);
}
/// <summary>
/// Loads in data produced by CanvasPartition.exe. Each row is tab-separated and is read as
/// chromosome, start, end, count and a segment identifier (columns 1-5); consecutive rows
/// sharing the identifier are collected into one segment.
/// </summary>
/// <param name="infile">Input file.</param>
/// <returns>A list of segments.</returns>
public static List<CanvasSegment> ReadSegments(string infile)
{
    Console.WriteLine("{0} Read segments from {1}", DateTime.Now, infile);

    var loadedSegments = new List<CanvasSegment>();
    var pendingCounts = new List<float>();
    string segmentChromosome = null;
    int segmentBegin = -1;
    int segmentEnd = -1;
    int segmentId = -1; // -1 means no segment has been started yet

    using (GzipReader reader = new GzipReader(infile))
    {
        for (string row = reader.ReadLine(); row != null; row = reader.ReadLine())
        {
            string[] columns = row.Split('\t');
            int rowSegmentId = Convert.ToInt32(columns[4]);
            bool startsNewSegment = rowSegmentId != segmentId;
            if (startsNewSegment)
            {
                // Close the segment collected so far (if any) before starting the next one.
                if (segmentId != -1)
                {
                    loadedSegments.Add(new CanvasSegment(segmentChromosome, segmentBegin, segmentEnd, pendingCounts));
                    pendingCounts.Clear();
                }
                segmentChromosome = columns[0];
                segmentBegin = Convert.ToInt32(columns[1]);
                segmentId = rowSegmentId;
            }
            // Every row pushes the end of the current segment forward and adds one count.
            segmentEnd = Convert.ToInt32(columns[2]);
            pendingCounts.Add(float.Parse(columns[3]));
        }

        // Close the segment that was still open when the file ended.
        if (segmentId != -1)
        {
            loadedSegments.Add(new CanvasSegment(segmentChromosome, segmentBegin, segmentEnd, pendingCounts));
        }
    }

    Console.WriteLine("{0} Loaded {1} segments", DateTime.Now, loadedSegments.Count);
    return loadedSegments;
}
/// <summary>
/// Writes the FORMAT column ("RC:BC:CN", plus ":MCC" when the segment has a major chromosome
/// count) followed by the matching sample column, then terminates the line.
/// </summary>
/// <param name="writer">Destination for the record text.</param>
/// <param name="segment">Segment whose mean count, bin count, copy number and (optional) major chromosome count are written.</param>
private static void WriteSingleSampleInfo(BgzipOrStreamWriter writer, CanvasSegment segment)
{
    bool hasMajorChromosomeCount = segment.MajorChromosomeCount.HasValue;

    // FORMAT column.
    writer.Write("\tRC:BC:CN");
    if (hasMajorChromosomeCount)
    {
        writer.Write(":MCC");
    }

    // Sample column; the mean count is rounded to a whole number, midpoints away from zero.
    writer.Write("\t{0}:{1}:{2}", Math.Round(segment.MeanCount, 0, MidpointRounding.AwayFromZero), segment.BinCount, segment.CopyNumber);
    if (hasMajorChromosomeCount)
    {
        writer.Write(":{0}", segment.MajorChromosomeCount);
    }
    writer.WriteLine();
}
/// <summary>
/// Absorbs a neighboring segment. When the other segment reaches further left or right, this
/// segment takes over that boundary together with its confidence interval; counts and allele
/// data of the other segment are appended.
/// </summary>
/// <param name="s">Segment to merge in.</param>
public void MergeIn(CanvasSegment s)
{
    bool startsEarlier = s.Begin < this.Begin;
    if (startsEarlier)
    {
        this.Begin = s.Begin;
        this.StartConfidenceInterval = s.StartConfidenceInterval;
    }

    bool endsLater = s.End > this.End;
    if (endsLater)
    {
        this.End = s.End;
        this.EndConfidenceInterval = s.EndConfidenceInterval;
    }

    this.Counts.AddRange(s.Counts);
    this.Alleles.Frequencies.AddRange(s.Alleles.Frequencies);
    this.Alleles.TotalCoverage.AddRange(s.Alleles.TotalCoverage);
}
/// <summary>
/// Writes the non-sample VCF columns of a single record: CHROM, POS, ID, REF ("N"), ALT,
/// QUAL, FILTER and the INFO entries. No line terminator is written.
/// </summary>
/// <param name="writer">Destination for the record text.</param>
/// <param name="firstSampleSegment">Segment supplying coordinates, QScore, flags and confidence intervals.</param>
/// <param name="sampleSetCnvType">CNV type used for the ID component, SVTYPE and CNVLEN.</param>
/// <param name="alternateAllele">Text written in the ALT column.</param>
/// <param name="recordLevelFilter">Text written in the FILTER column.</param>
/// <param name="isMultisample">When true, QUAL is written as ".".</param>
private static void WriteColumnsUntilInfoField(BgzipOrStreamWriter writer, CanvasSegment firstSampleSegment, CnvType sampleSetCnvType, string alternateAllele, string recordLevelFilter, bool isMultisample)
{
    // From vcf 4.1 spec:
    //     If any of the ALT alleles is a symbolic allele (an angle-bracketed ID String <ID>) then the padding base is required and POS denotes the
    //     coordinate of the base preceding the polymorphism.
    bool isSymbolicAllele = alternateAllele.StartsWith("<") && alternateAllele.EndsWith(">");
    int position = isSymbolicAllele ? firstSampleSegment.Begin : firstSampleSegment.Begin + 1;
    writer.Write($"{firstSampleSegment.Chr}\t{position}\tCanvas:{sampleSetCnvType.ToVcfId()}:{firstSampleSegment.Chr}:{firstSampleSegment.Begin + 1}-{firstSampleSegment.End}\t");

    // VCF is machine-readable: format QUAL with the invariant culture so the decimal
    // separator is always '.', whatever the locale of the machine running the caller.
    string qScore = isMultisample
        ? "."
        : string.Format(System.Globalization.CultureInfo.InvariantCulture, "{0:F2}", firstSampleSegment.QScore);
    writer.Write($"N\t{alternateAllele}\t{qScore}\t{recordLevelFilter}\t");

    // INFO entries. Everything before END carries its own trailing ';',
    // everything after END carries a leading ';'.
    if (sampleSetCnvType != CnvType.Reference)
    {
        writer.Write($"SVTYPE={sampleSetCnvType.ToSvType()};");
    }
    if (firstSampleSegment.IsHeterogeneous)
    {
        writer.Write("SUBCLONAL;");
    }
    if (firstSampleSegment.IsCommonCnv)
    {
        writer.Write("COMMONCNV;");
    }
    writer.Write($"END={firstSampleSegment.End}");
    if (sampleSetCnvType != CnvType.Reference)
    {
        writer.Write($";CNVLEN={firstSampleSegment.Length}");
    }
    if (firstSampleSegment.StartConfidenceInterval != null)
    {
        writer.Write($";CIPOS={firstSampleSegment.StartConfidenceInterval.Item1},{firstSampleSegment.StartConfidenceInterval.Item2}");
    }
    if (firstSampleSegment.EndConfidenceInterval != null)
    {
        writer.Write($";CIEND={firstSampleSegment.EndConfidenceInterval.Item1},{firstSampleSegment.EndConfidenceInterval.Item2}");
    }
}
public void TestSegment()
{
    // Checks CanvasSegment construction, MeanCount, and the effect of MergeIn on Counts and End.
    // All assertions pass the expected value first and the actual value second, so failure
    // messages report them the right way round.
    List<float> counts = new List<float>() { 100, 90, 110, 100, 95, 105 };
    CanvasSegment seg1 = new CanvasSegment("chr17", 100000000, 110000000, counts);

    // Silly constructor tests:
    Assert.AreEqual(100000000, seg1.Begin);
    Assert.AreEqual(110000000, seg1.End);
    Assert.AreEqual(counts.Count, seg1.BinCount);
    Assert.AreEqual("chr17", seg1.Chr);

    // Property test:
    Assert.AreEqual(100, seg1.MeanCount, 0.01);

    // Build a second segment, and merge them, and test results:
    CanvasSegment seg2 = new CanvasSegment("chr17", 110000000, 120000000, counts);
    seg1.MergeIn(seg2);
    Assert.AreEqual(12, seg1.Counts.Count);
    Assert.AreEqual(seg2.End, seg1.End);
}
/// <summary>
/// Determines the expected copy number of a segment. Without ploidy intervals for the
/// segment's chromosome the answer is 2; otherwise the segment's bases are tallied per ploidy
/// and the ploidy with the most bases is returned.
/// For XX samples, reference ploidy is 0 on chrY; for XY samples, reference ploidy is 1 on chrX+chrY
/// </summary>
/// <param name="segment">Segment whose expected copy number is wanted.</param>
/// <returns>The ploidy with the largest base tally, or 2 when no tally is positive.</returns>
public int GetReferenceCopyNumber(CanvasSegment segment)
{
    if (!PloidyByChromosome.ContainsKey(segment.Chr))
    {
        return 2;
    }

    int segmentStart = segment.Begin;
    int segmentEnd = segment.End;

    // Start with every base counted as ploidy 2, then reassign the overlapped bases.
    // ASSUMPTION: Bed file ploidy shouldn't be >4 (i.e. we wouldn't handle an XXXXXY genome):
    int[] tally = new int[5];
    tally[2] = segmentEnd - segmentStart;
    foreach (PloidyInterval interval in this.PloidyByChromosome[segment.Chr])
    {
        bool isDiploid = interval.Ploidy == 2;
        if (isDiploid)
        {
            continue;
        }
        int from = Math.Max(segmentStart, interval.Start);
        if (from > segmentEnd)
        {
            continue;
        }
        int span = Math.Min(segmentEnd, interval.End) - from;
        if (span < 0)
        {
            continue;
        }
        tally[interval.Ploidy] += span;
        tally[2] -= span;
    }

    // Scan upwards; only a strictly larger tally replaces the current best.
    int chosenCopyNumber = 2;
    int largestTally = 0;
    for (int candidate = 0; candidate < tally.Length; candidate++)
    {
        int candidateTally = tally[candidate];
        if (candidateTally > largestTally)
        {
            largestTally = candidateTally;
            chosenCopyNumber = candidate;
        }
    }
    return chosenCopyNumber;
}
/// <summary>
/// Determines the expected copy number of a segment. Without ploidy data for the segment's
/// chromosome the answer is 2; otherwise the counts returned by getPloidyCounts are scanned
/// and the index holding the largest count is returned.
/// For XX samples, reference ploidy is 0 on chrY; for XY samples, reference ploidy is 1 on chrX+chrY
/// </summary>
/// <param name="segment">Segment whose expected copy number is wanted.</param>
/// <returns>The index of the largest positive count, or 2 when no count is positive.</returns>
public int GetReferenceCopyNumber(CanvasSegment segment)
{
    if (!PloidyByChromosome.ContainsKey(segment.Chr))
    {
        return 2;
    }

    // The query interval starts at Begin + 1 (presumably converting a 0-based begin to a
    // 1-based inclusive start - confirm against Interval's convention).
    var query = new ReferenceInterval(segment.Chr, new Interval(segment.Begin + 1, segment.End));
    int[] countsByCopyNumber = getPloidyCounts(query);

    // Scan upwards; only a strictly larger count replaces the current best.
    int chosenCopyNumber = 2;
    int largestCount = 0;
    for (int candidate = 0; candidate < countsByCopyNumber.Length; candidate++)
    {
        if (countsByCopyNumber[candidate] <= largestCount)
        {
            continue;
        }
        largestCount = countsByCopyNumber[candidate];
        chosenCopyNumber = candidate;
    }
    return chosenCopyNumber;
}
/// <summary>
/// Returns the copy number of the first known interval that covers at least half of the
/// segment, or -1 when the chromosome is unknown or no interval qualifies.
/// </summary>
/// <param name="segment">Segment to look up.</param>
/// <returns>The known copy number, or -1.</returns>
public int GetKnownCNForSegment(CanvasSegment segment)
{
    // Handle switched chromosome naming convention transparently:
    // exact name first, then the name without "chr", then the name prefixed with "chr".
    string lookupName = segment.Chr;
    if (!this.KnownCN.ContainsKey(lookupName))
    {
        lookupName = segment.Chr.Replace("chr", "");
        if (!this.KnownCN.ContainsKey(lookupName))
        {
            lookupName = "chr" + segment.Chr;
            if (!this.KnownCN.ContainsKey(lookupName))
            {
                return -1;
            }
        }
    }

    int segmentLength = segment.End - segment.Begin;
    int knownCopyNumber = -1;
    foreach (CNInterval interval in this.KnownCN[lookupName])
    {
        bool touchesSegment = interval.End >= segment.Begin && interval.Start <= segment.End;
        if (touchesSegment)
        {
            int overlapStart = Math.Max(segment.Begin, interval.Start);
            int overlapEnd = Math.Min(segment.End, interval.End);
            // Accept the interval when the overlap is at least half of the segment.
            if ((overlapEnd - overlapStart) * 2 >= segmentLength)
            {
                knownCopyNumber = interval.CN;
                break;
            }
        }
    }
    return knownCopyNumber;
}
/// <summary>
/// Returns the reference copy number for a region by wrapping it in a temporary segment
/// (with no counts) and asking the ploidy information; falls back to the default when no
/// ploidy information is supplied.
/// </summary>
/// <param name="referencePloidy">Ploidy information, or null.</param>
/// <param name="chrom">Chromosome of the region.</param>
/// <param name="start">Region start, passed through as the segment begin.</param>
/// <param name="end">Region end, passed through as the segment end.</param>
/// <param name="defaultPloidy">Value returned when <paramref name="referencePloidy"/> is null.</param>
/// <returns>The reference copy number for the region.</returns>
private static int GetPloidy(PloidyInfo referencePloidy, string chrom, int start, int end, int defaultPloidy = 2)
{
    if (referencePloidy != null)
    {
        var probe = new CanvasSegment(chrom, start, end, new List<float>());
        return referencePloidy.GetReferenceCopyNumber(probe);
    }
    return defaultPloidy;
}
/// <summary>
/// Parse the outputs of CanvasSNV, and note these variant frequencies in the appropriate segment.
/// Sites with fewer than 10 total reads (ref + alt) and sites that hit no segment are ignored.
/// </summary>
/// <param name="variantFrequencyFile">Tab-separated file; column 1 is the chromosome, column 2 the 1-based position, columns 5 and 6 the ref and alt read counts.</param>
/// <param name="segments">Segments that receive the variant frequencies and total coverages.</param>
/// <returns>Mean total coverage over the variants that were assigned to a segment (0 when none were).</returns>
public static float LoadVariantFrequencies(string variantFrequencyFile, List<CanvasSegment> segments)
{
    Console.WriteLine("{0} Load variant frequencies from {1}", DateTime.Now, variantFrequencyFile);

    Dictionary<string, List<CanvasSegment>> segmentsByChromosome = CanvasSegment.GetSegmentsByChromosome(segments);
    Dictionary<string, string> alternativeNames = GetChromosomeAlternativeNames(segmentsByChromosome.Keys);
    int count = 0;
    int totalRecords = 0;
    long totalCoverage = 0;

    using (GzipReader reader = new GzipReader(variantFrequencyFile))
    {
        for (string fileLine = reader.ReadLine(); fileLine != null; fileLine = reader.ReadLine())
        {
            bool isBlankOrHeader = fileLine.Length == 0 || fileLine[0] == '#';
            if (isBlankOrHeader)
            {
                continue; // Skip headers
            }

            string[] bits = fileLine.Split('\t');
            if (bits.Length < 6)
            {
                Console.Error.WriteLine("* Bad line in {0}: '{1}'", variantFrequencyFile, fileLine);
                continue;
            }

            // Resolve the chromosome name, falling back to its alternative spelling.
            string chromosome = bits[0];
            if (!segmentsByChromosome.ContainsKey(chromosome))
            {
                if (!alternativeNames.ContainsKey(chromosome))
                {
                    continue;
                }
                chromosome = alternativeNames[chromosome];
            }

            int position = int.Parse(bits[1]); // 1-based (from the input VCF to Canvas SNV)
            int countRef = int.Parse(bits[4]);
            int countAlt = int.Parse(bits[5]);
            int siteCoverage = countRef + countAlt;
            if (siteCoverage < 10)
            {
                continue;
            }
            float VF = countAlt / (float)siteCoverage;

            // Binary search for the segment this variant hits:
            List<CanvasSegment> chrSegments = segmentsByChromosome[chromosome];
            int low = 0;
            int high = chrSegments.Count - 1;
            while (low <= high)
            {
                int probe = (low + high) / 2;
                CanvasSegment candidate = chrSegments[probe];
                if (candidate.End < position) // CanvasSegment.End is already 1-based
                {
                    low = probe + 1;
                }
                else if (candidate.Begin + 1 > position) // Convert CanvasSegment.Begin to 1-based by adding 1
                {
                    high = probe - 1;
                }
                else
                {
                    candidate.VariantFrequencies.Add(VF);
                    candidate.VariantTotalCoverage.Add(siteCoverage);
                    count++;
                    totalCoverage += siteCoverage; // use only coverage information in segments
                    totalRecords++;
                    break;
                }
            }
        }
    }

    float meanCoverage = totalRecords > 0 ? totalCoverage / Math.Max(1f, totalRecords) : 0;
    Console.WriteLine("{0} Loaded a total of {1} usable variant frequencies", DateTime.Now, count);
    return meanCoverage;
}
/// <summary>
/// Iterates through a list of segments and merges those which have the same copy number call.
/// Also, for segments smaller than MinimumCallSize, assimilate them into the neighbor with the best
/// quality score.  Two consecutive segments are considered neighbors if they're on the same chromosome
/// and the space between them doesn't overlap with any excluded intervals.
/// </summary>
/// <param name="segments">Segments to merge; replaced by the merged list on return. An empty list is left untouched.</param>
/// <param name="MinimumCallSize">Segments shorter than this (End - Begin) are assimilated into a neighbor when one is available.</param>
/// <param name="excludedIntervals">Excluded intervals, passed to IsForbiddenInterval to decide whether two segments may be joined.</param>
public static void MergeSegmentsUsingExcludedIntervals(ref List<CanvasSegment> segments, int MinimumCallSize, Dictionary<string, List<SampleGenomicBin>> excludedIntervals)
{
    if (!segments.Any())
    {
        return;
    }

    // Assimilate short segments into the *best* available neighbor:
    List<CanvasSegment> mergedSegments = new List<CanvasSegment>();
    int segmentIndex = 0;
    while (segmentIndex < segments.Count)
    {
        if (segments[segmentIndex].End - segments[segmentIndex].Begin >= MinimumCallSize)
        {
            mergedSegments.Add(segments[segmentIndex]);
            segmentIndex++;
            continue;
        }

        // -1 means "no neighbor found"; a neighbor whose QScore is 0 is still a valid target.
        int prevIndex = -1;
        double prevQ = -1;
        // Look back for a segment (down to and including the very first one):
        for (int checkIndex = segmentIndex - 1; checkIndex >= 0; checkIndex--)
        {
            // Stop, if you jump to another chromosome, or cross a forbidden interval:
            if (segments[checkIndex].Chr != segments[segmentIndex].Chr)
            {
                break;
            }
            // Short segments are never merge targets; keep looking further back.
            if (segments[checkIndex].End - segments[checkIndex].Begin < MinimumCallSize)
            {
                continue;
            }
            if (IsForbiddenInterval(segments[checkIndex].Chr, segments[checkIndex].End, segments[segmentIndex].Begin, excludedIntervals))
            {
                break;
            }
            prevIndex = checkIndex;
            prevQ = segments[checkIndex].QScore;
            break;
        }

        // Look forward for a segment:
        int nextIndex = -1;
        double nextQ = -1;
        for (int checkIndex = segmentIndex + 1; checkIndex < segments.Count; checkIndex++)
        {
            if (segments[checkIndex].Chr != segments[segmentIndex].Chr)
            {
                break;
            }
            if (segments[checkIndex].End - segments[checkIndex].Begin < MinimumCallSize)
            {
                continue;
            }
            if (IsForbiddenInterval(segments[checkIndex].Chr, segments[segmentIndex].End, segments[checkIndex].Begin, excludedIntervals))
            {
                break;
            }
            nextIndex = checkIndex;
            nextQ = segments[checkIndex].QScore;
            break;
        }

        if (prevQ >= 0 && prevQ >= nextQ)
        {
            // segments[prevIndex] assimilates segments[prevIndex+1...segmentIndex].
            // Assimilation of previous segments was already done, so we just need to assimilate this one:
            segments[prevIndex].MergeIn(segments[segmentIndex]);
            segmentIndex++;
            continue;
        }

        if (nextQ >= 0)
        {
            // segments[nextIndex] assimilates segments[segmentIndex...nextIndex - 1]
            for (int tempIndex = segmentIndex; tempIndex < nextIndex; tempIndex++)
            {
                segments[nextIndex].MergeIn(segments[tempIndex]);
            }
            segmentIndex = nextIndex;
            continue;
        }

        // No usable neighbor on either side: keep the short segment as it is.
        mergedSegments.Add(segments[segmentIndex]);
        segmentIndex++;
    }
    segments = mergedSegments;

    // Now, merge together adjacent segments with same calls!
    mergedSegments = new List<CanvasSegment>();
    CanvasSegment lastSegment = segments[0];
    mergedSegments.Add(lastSegment);
    segmentIndex = 1;
    while (segmentIndex < segments.Count)
    {
        // Assimilate an adjacent segment with the same copy number call and heterogeneity flag:
        if (lastSegment.CopyNumber == segments[segmentIndex].CopyNumber &&
            lastSegment.Chr == segments[segmentIndex].Chr &&
            !IsForbiddenInterval(lastSegment.Chr, lastSegment.End, segments[segmentIndex].Begin, excludedIntervals) &&
            lastSegment.IsHeterogeneous == segments[segmentIndex].IsHeterogeneous)
        {
            lastSegment.MergeIn(segments[segmentIndex]);
            segmentIndex++;
            continue;
        }
        lastSegment = segments[segmentIndex];
        mergedSegments.Add(segments[segmentIndex]);
        segmentIndex++;
    }
    segments = mergedSegments;
}
/// <summary>
/// Loads in data produced by CanvasPartition.exe. Each row is tab-separated and is read as
/// chromosome, bin start, bin end, count and a segment index (columns 1-5); consecutive rows
/// sharing the segment index form one segment. Every segment gets start and end confidence
/// intervals of plus/minus half the width of its boundary bin; where two segments abut on the
/// same chromosome, the interval extends into the neighbor by half of the neighbor's boundary bin.
/// </summary>
/// <param name="infile">Input file.</param>
/// <returns>A list of segments.</returns>
public static List<CanvasSegment> ReadSegments(string infile)
{
    Console.WriteLine("{0} Read segments from {1}", DateTime.Now, infile);
    List<CanvasSegment> segments = new List<CanvasSegment>();
    string chr = null;
    int begin = -1;
    int previousSegmentIndex = -1; // -1 means no segment has been started yet
    int previousBinStart = 0;
    int previousBinEnd = 0;
    List<float> counts = new List<float>();
    Tuple<int, int> segmentStartCI = null;

    using (GzipReader reader = new GzipReader(infile))
    {
        string row = null;
        while ((row = reader.ReadLine()) != null)
        {
            string[] fields = row.Split('\t');
            int currentSegmentIndex = Convert.ToInt32(fields[4]);
            int newBinStart = Convert.ToInt32(fields[1]);
            int newBinEnd = Convert.ToInt32(fields[2]);

            // We've moved to a new segment
            if (currentSegmentIndex != previousSegmentIndex)
            {
                int newBinHalfWidth = (newBinEnd - newBinStart) / 2;
                if (previousSegmentIndex != -1)
                {
                    int previousBinHalfWidth = (previousBinEnd - previousBinStart) / 2;
                    // Segments only abut when they are on the same chromosome; matching
                    // coordinates across a chromosome change are a coincidence.
                    bool segmentsAbut = chr == fields[0] && previousBinEnd == newBinStart;

                    // Make a segment
                    CanvasSegment segment = new CanvasSegment(chr, begin, previousBinEnd, counts);
                    // Prepare the confidence interval for the end of the segment that just ended, based on the size of its last bin
                    // (and, if the segments abut, based on the size of the next segment's first bin):
                    segment.EndConfidenceInterval = new Tuple<int, int>(
                        -previousBinHalfWidth,
                        segmentsAbut ? newBinHalfWidth : previousBinHalfWidth);
                    segment.StartConfidenceInterval = segmentStartCI;
                    segments.Add(segment);
                    counts.Clear();

                    // Prepare the confidence interval for the start of the segment that just started, based on the size of its first
                    // bin (and, if the segments abut, based on the size of the previous segment's last bin):
                    segmentStartCI = new Tuple<int, int>(
                        segmentsAbut ? -previousBinHalfWidth : -newBinHalfWidth,
                        newBinHalfWidth);
                }
                else
                {
                    // Very first segment: symmetric interval around its first bin.
                    segmentStartCI = new Tuple<int, int>(-newBinHalfWidth, newBinHalfWidth);
                }
                chr = fields[0];
                begin = newBinStart;
                previousSegmentIndex = currentSegmentIndex;
            }
            previousBinStart = newBinStart;
            previousBinEnd = newBinEnd;
            counts.Add(float.Parse(fields[3]));
        }

        if (previousSegmentIndex != -1)
        {
            // Add the last segment. Nothing follows it, so its end confidence interval is
            // symmetric around its last bin, like the end of any non-abutting segment.
            int lastBinHalfWidth = (previousBinEnd - previousBinStart) / 2;
            CanvasSegment segment = new CanvasSegment(chr, begin, previousBinEnd, counts);
            segment.StartConfidenceInterval = segmentStartCI;
            segment.EndConfidenceInterval = new Tuple<int, int>(-lastBinHalfWidth, lastBinHalfWidth);
            segments.Add(segment);
        }
    }
    Console.WriteLine("{0} Loaded {1} segments", DateTime.Now, segments.Count);
    return segments;
}
/// <summary>
/// Asks the copy-number oracle for the known CN of this segment. The oracle looks for a
/// known-CN interval that covers (at least half of) the segment.
/// </summary>
/// <param name="segment">Segment to look up.</param>
/// <returns>The oracle's answer, or -1 when there is no oracle (or the CN is not known).</returns>
protected int GetKnownCNForSegment(CanvasSegment segment)
{
    return CNOracle != null ? CNOracle.GetKnownCNForSegment(segment) : -1;
}
/// <summary>
/// Iterates through a list of segments and merges those which have the same copy number call.
/// Also, for segments smaller than MinimumCallSize, assimilate them into the neighbor with the best
/// quality score.  Two consecutive segments are considered neighbors if they're on the same chromosome
/// and the space between them is not too large.
/// </summary>
/// <param name="segments">Segments to merge; replaced by the merged list on return. An empty list is left untouched.</param>
/// <param name="MinimumCallSize">Segments shorter than this (End - Begin) are assimilated into a neighbor when one is available.</param>
/// <param name="maximumMergeSpan">Gap limit between two segments that may be joined.</param>
static public void MergeSegments(ref List <CanvasSegment> segments, int MinimumCallSize = 0, int maximumMergeSpan = 10000)
{
    if (!segments.Any())
    {
        return;
    }

    // Pass 1 - assimilate short segments into the *best* available neighbor:
    List <CanvasSegment> mergedSegments = new List <CanvasSegment>();
    int segmentIndex = 0;
    while (segmentIndex < segments.Count)
    {
        // Segments that are long enough are kept as they are.
        if (segments[segmentIndex].End - segments[segmentIndex].Begin >= MinimumCallSize)
        {
            mergedSegments.Add(segments[segmentIndex]);
            segmentIndex++;
            continue;
        }
        // -1 means "no neighbor found"; a neighbor whose QScore is 0 is still a valid target.
        int prevIndex = -1;
        double prevQ = -1;
        // Look back for a segment:
        for (int checkIndex = segmentIndex - 1; checkIndex >= 0; checkIndex--)
        {
            // Stop, if you jump to another chromosome, or the gap exceeds maximumMergeSpan:
            if (segments[checkIndex].Chr != segments[segmentIndex].Chr)
            {
                break;
            }
            // Short segments are never merge targets; keep looking further back.
            if (segments[checkIndex].End - segments[checkIndex].Begin < MinimumCallSize)
            {
                continue;
            }
            if (segments[segmentIndex].Begin - segments[checkIndex].End > maximumMergeSpan)
            {
                break;
            }
            prevIndex = checkIndex;
            prevQ = segments[checkIndex].QScore;
            break;
        }
        // Look forward for a segment:
        int nextIndex = -1;
        double nextQ = -1;
        for (int checkIndex = segmentIndex + 1; checkIndex < segments.Count; checkIndex++)
        {
            if (segments[checkIndex].Chr != segments[segmentIndex].Chr)
            {
                break;
            }
            if (segments[checkIndex].End - segments[checkIndex].Begin < MinimumCallSize)
            {
                continue;
            }
            // NOTE(review): the look-back loop breaks on a too-large gap, this one continues
            // and keeps scanning the rest of the chromosome - confirm whether that is intended.
            if (segments[checkIndex].Begin - segments[segmentIndex].End > maximumMergeSpan)
            {
                continue;
            }
            nextIndex = checkIndex;
            nextQ = segments[checkIndex].QScore;
            break;
        }
        // Prefer the previous neighbor when its QScore is at least as good as the next one's.
        if (prevQ >= 0 && prevQ >= nextQ)
        {
            // segments[prevIndex] assimilates segments[prevIndex+1...segmentIndex].
            // Assimilation of previous segments was already done, so we just need to assimilate this one:
            segments[prevIndex].MergeIn(segments[segmentIndex]);
            segmentIndex++;
            continue;
        }
        if (nextQ >= 0)
        {
            // segments[nextIndex] assimilates segments[segmentIndex...nextIndex - 1]
            for (int tempIndex = segmentIndex; tempIndex < nextIndex; tempIndex++)
            {
                segments[nextIndex].MergeIn(segments[tempIndex]);
            }
            // Resume at the target, which is then added to mergedSegments by the size check above.
            segmentIndex = nextIndex;
            continue;
        }
        // No usable neighbor on either side: keep the short segment as it is.
        mergedSegments.Add(segments[segmentIndex]);
        segmentIndex++;
    }
    segments = mergedSegments;

    // Pass 2 - now, merge together adjacent segments with same calls!
    mergedSegments = new List <CanvasSegment>();
    CanvasSegment lastSegment = segments[0];
    mergedSegments.Add(lastSegment);
    segmentIndex = 1;
    while (segmentIndex < segments.Count)
    {
        // Assimilate an adjacent segment with the same copy number call:
        // NOTE(review): this pass requires gap < maximumMergeSpan, while pass 1 only rejects
        // gap > maximumMergeSpan - confirm which boundary is intended.
        if (lastSegment.copyNumber == segments[segmentIndex].copyNumber && lastSegment.Chr == segments[segmentIndex].Chr && segments[segmentIndex].Begin - lastSegment.End < maximumMergeSpan)
        {
            lastSegment.MergeIn(segments[segmentIndex]);
            segmentIndex++;
            continue;
        }
        lastSegment = segments[segmentIndex];
        mergedSegments.Add(segments[segmentIndex]);
        segmentIndex++;
    }
    segments = mergedSegments;
}
/// <summary>
/// Developer debug method: ROC curve data generation
/// - Report all intervals, associated QScores and QScore predictor values to an extended report output file
/// - Report called (i.e. TP+FP) intervals grouped by QScore
/// - Generate 2 ROC outputs
///   - ROC_intervals: FP vs TP rate, unit=1 interval
///   - ROC_bases: FP vs TP rate, unit=1 base
///   (Note: In both cases, we ignore intervals shorter than 1kb as most of them are due to imprecise ends of segments, which we don't want to give any weight to)
/// </summary>
/// <param name="qscoreMethod">QScore method used to score each matched segment; also part of every output file name.</param>
/// <param name="resegmentedKnownCN">Known-CN intervals per chromosome, matched to segments by a shared start or end coordinate.</param>
private void GenerateReportAndRocDataForQscoreMethod(CanvasSegment.QScoreMethod qscoreMethod, Dictionary<string, List<CNInterval>> resegmentedKnownCN)
{
    const int minimumIntervalLength = 1000;
    string filePrefix = "qscore_" + qscoreMethod;

    // Create map interval->{segment+qscore}, ignoring intervals shorter than 1kb
    Dictionary<CNInterval, Tuple<CanvasSegment, int>> Interval2Segment = new Dictionary<CNInterval, Tuple<CanvasSegment, int>>();
    foreach (string chr in resegmentedKnownCN.Keys)
    {
        foreach (CNInterval interval in resegmentedKnownCN[chr])
        {
            // The length filter does not depend on the segment, so apply it once per interval.
            if (interval.End - interval.Start < minimumIntervalLength)
            {
                continue;
            }
            foreach (CanvasSegment segment in this.Segments)
            {
                if (segment.Chr == chr && (segment.Begin == interval.Start || segment.End == interval.End))
                {
                    // When several segments match the interval, the last one wins.
                    Interval2Segment[interval] = new Tuple<CanvasSegment, int>(segment, segment.ComputeQScore(qscoreMethod));
                }
            }
        }
    }

    // Classify intervals by QScore
    // NOTE(review): assumes ComputeQScore never returns a negative value - a negative QScore
    // would be used as a list index below. Confirm.
    List<List<CNInterval>> intervalsByQScore = new List<List<CNInterval>>();
    foreach (KeyValuePair<CNInterval, Tuple<CanvasSegment, int>> entry in Interval2Segment)
    {
        int qscore = entry.Value.Item2;
        // Resize list to hold this qscore's entries
        while (qscore >= intervalsByQScore.Count)
        {
            intervalsByQScore.Add(new List<CNInterval>());
        }
        intervalsByQScore[qscore].Add(entry.Key);
    }

    // Output data as ExtendedCallsVersusKnownCN.txt
    string debugPath = Path.Combine(this.OutputFolder, filePrefix + "_ExtendedCallsVersusKnownCN.txt");
    using (StreamWriter writer = new StreamWriter(debugPath))
    {
        Array predictorIds = Enum.GetValues(typeof(CanvasSegment.QScorePredictor));

        writer.Write("#Chr\tBegin\tEnd\tTruthSetCN\tCalledCN\tMajorChromCount\tQScore\tInfo");
        foreach (CanvasSegment.QScorePredictor predictorId in predictorIds)
        {
            writer.Write("\tPredictor_{0}", predictorId.ToString());
        }
        writer.WriteLine("");

        foreach (string chr in resegmentedKnownCN.Keys)
        {
            foreach (CNInterval interval in resegmentedKnownCN[chr])
            {
                Tuple<CanvasSegment, int> match;
                if (Interval2Segment.TryGetValue(interval, out match))
                {
                    CanvasSegment segment = match.Item1;
                    int qscore = match.Item2;
                    string info = (interval.CN == segment.CopyNumber) ? "Correct" : "Incorrect";
                    writer.Write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}", chr, interval.Start, interval.End, interval.CN, segment.CopyNumber, segment.MajorChromosomeCount, qscore, info);
                    foreach (CanvasSegment.QScorePredictor predictorId in predictorIds)
                    {
                        writer.Write("\t{0}", segment.GetQScorePredictor(predictorId));
                    }
                    writer.WriteLine("");
                }
                else
                {
                    // NOTE(review): "Missing" rows have no QScore column, so the info text lands
                    // under the QScore header. Left unchanged - confirm what readers of this file expect.
                    string info = "Missing";
                    int CN = -1;
                    int majorChromosomeCount = -1;
                    writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}", chr, interval.Start, interval.End, interval.CN, CN, majorChromosomeCount, info);
                }
            }
        }
    }

    // Output data by QScore
    debugPath = Path.Combine(this.OutputFolder, filePrefix + "_cnaPerQscore.txt");
    using (StreamWriter writer = new StreamWriter(debugPath))
    {
        // The header lists exactly the eight columns written per row below.
        writer.WriteLine("#Chr\tBegin\tEnd\tTruthSetCN\tCalledCN\tMajorChromCount\tQScore\tInfo");
        for (int qscore = 0; qscore < intervalsByQScore.Count; qscore++)
        {
            foreach (CNInterval interval in intervalsByQScore[qscore])
            {
                CanvasSegment segment = Interval2Segment[interval].Item1;
                string info = (interval.CN == segment.CopyNumber) ? "Correct" : "Incorrect";
                writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}", segment.Chr, interval.Start, interval.End, interval.CN, segment.CopyNumber, segment.MajorChromosomeCount, qscore, info);
            }
        }
    }

    // ROC output per interval
    debugPath = Path.Combine(this.OutputFolder, filePrefix + "_ROC_intervals.txt");
    GenerateRocOutput(debugPath, intervalsByQScore, Interval2Segment, false, false);

    // ROC output per base
    debugPath = Path.Combine(this.OutputFolder, filePrefix + "_ROC_bases.txt");
    GenerateRocOutput(debugPath, intervalsByQScore, Interval2Segment, true, false);
}
/// <summary>
/// Loads in data produced by CanvasPartition.exe. Rows are tab-separated; columns 1-5 are read
/// as chromosome, start, end, count and a segment identifier. A change of identifier closes
/// the current segment and opens a new one.
/// </summary>
/// <param name="infile">Input file.</param>
/// <returns>A list of segments.</returns>
public static List<CanvasSegment> ReadSegments(string infile)
{
    Console.WriteLine("{0} Read segments from {1}", DateTime.Now, infile);

    List<CanvasSegment> result = new List<CanvasSegment>();
    List<float> binCounts = new List<float>();
    string chromosome = null;
    int firstBinStart = -1;
    int lastBinEnd = -1;
    int activeSegmentId = -1; // -1 until the first row has been read

    using (GzipReader reader = new GzipReader(infile))
    {
        while (true)
        {
            string line = reader.ReadLine();
            if (line == null)
            {
                break;
            }

            string[] parts = line.Split('\t');
            int lineSegmentId = Convert.ToInt32(parts[4]);
            if (lineSegmentId != activeSegmentId)
            {
                // We've moved to a new segment: emit the one collected so far, if any.
                bool haveOpenSegment = activeSegmentId != -1;
                if (haveOpenSegment)
                {
                    CanvasSegment finished = new CanvasSegment(chromosome, firstBinStart, lastBinEnd, binCounts);
                    result.Add(finished);
                    binCounts.Clear();
                }
                chromosome = parts[0];
                firstBinStart = Convert.ToInt32(parts[1]);
                activeSegmentId = lineSegmentId;
            }
            lastBinEnd = Convert.ToInt32(parts[2]);
            binCounts.Add(float.Parse(parts[3]));
        }

        if (activeSegmentId != -1)
        {
            // Add the last segment
            CanvasSegment finished = new CanvasSegment(chromosome, firstBinStart, lastBinEnd, binCounts);
            result.Add(finished);
        }
    }

    Console.WriteLine("{0} Loaded {1} segments", DateTime.Now, result.Count);
    return result;
}