示例#1
0
 /// <summary>
 /// Loads reference ploidy intervals from a tab-delimited BED-style file read through <c>GzipReader</c>.
 /// </summary>
 /// <remarks>
 /// A line starting with "##ExpectedSexChromosomeKaryotype" is stored (trimmed) in <c>HeaderLine</c>.
 /// Empty lines and any other line starting with '#' are skipped. Every remaining line is split on
 /// tabs and read as: column 0 = chromosome, column 1 = start, column 2 = end, column 4 = ploidy
 /// (column 3 is not used). The prefix test is ordinal and the numeric columns are parsed with the
 /// invariant culture, so the result does not depend on the machine's regional settings.
 /// </remarks>
 /// <param name="filePath">Path of the ploidy BED file.</param>
 /// <returns>A <c>PloidyInfo</c> holding the intervals grouped by chromosome name.</returns>
 public static PloidyInfo LoadPloidyFromBedFile(string filePath)
 {
     // BED files are machine-readable: never let the current UI culture influence parsing.
     var invariant = System.Globalization.CultureInfo.InvariantCulture;
     PloidyInfo ploidy = new PloidyInfo();
     int count = 0;
     using (GzipReader reader = new GzipReader(filePath))
     {
         string fileLine;
         while ((fileLine = reader.ReadLine()) != null)
         {
             if (fileLine.StartsWith("##ExpectedSexChromosomeKaryotype", StringComparison.Ordinal))
             {
                 ploidy.HeaderLine = fileLine.Trim();
                 continue;
             }
             // Skip blank lines and every other comment/header line.
             if (fileLine.Length == 0 || fileLine[0] == '#')
             {
                 continue;
             }

             string[] bits = fileLine.Split('\t');
             string chromosome = bits[0];
             if (!ploidy.PloidyByChromosome.ContainsKey(chromosome))
             {
                 ploidy.PloidyByChromosome[chromosome] = new List<PloidyInterval>();
             }

             PloidyInterval interval = new PloidyInterval();
             interval.Start = int.Parse(bits[1], invariant);
             interval.End = int.Parse(bits[2], invariant);
             interval.Ploidy = int.Parse(bits[4], invariant); // column 3 is intentionally ignored
             ploidy.PloidyByChromosome[chromosome].Add(interval);
             count++;
         }
     }
     Console.WriteLine("Reference ploidy: Loaded {0} intervals across {1} chromosomes", count, ploidy.PloidyByChromosome.Keys.Count);
     return ploidy;
 }
示例#2
0
        /// <summary>
        /// Loads per-interval reference ploidy for one sample from a VCF file.
        /// </summary>
        /// <remarks>
        /// Each VCF record yields one interval: start is the record's reference position, end is the
        /// INFO "END" value and ploidy is the "CN" field of the selected genotype column ("." is read
        /// as 2). All VCF header lines are joined with a single space into <c>HeaderLine</c>.
        /// Numeric fields are parsed with the invariant culture.
        /// </remarks>
        /// <param name="vcfPath">Path of the ploidy VCF.</param>
        /// <param name="sampleName">
        /// Name of the sample whose genotype column is read. When null or empty, the first genotype
        /// column (index 0) is used and no sample validation is performed.
        /// </param>
        /// <returns>A <c>PloidyInfo</c> holding the intervals grouped by chromosome name.</returns>
        /// <exception cref="ArgumentException">
        /// A sample name was given but the VCF has fewer than two samples, the name does not match
        /// exactly one sample, or a record's genotype column has no "CN" field.
        /// </exception>
        public static PloidyInfo LoadPloidyFromVcfFile(string vcfPath, string sampleName)
        {
            // VCF is machine-readable: never let the current UI culture influence parsing.
            var        invariant   = System.Globalization.CultureInfo.InvariantCulture;
            int        sampleIndex = 0;
            PloidyInfo ploidy      = new PloidyInfo();

            using (VcfReader reader = new VcfReader(vcfPath))
            {
                if (!sampleName.IsNullOrEmpty())
                {
                    if (reader.Samples.Count < 2)
                    {
                        throw new ArgumentException(
                                  $"File '{vcfPath}' must be a multi-sample sample VCF containing > 1 samples");
                    }
                    // The requested sample has to appear exactly once among the genotype columns.
                    if (reader.Samples.Count(x => x == sampleName) != 1)
                    {
                        throw new ArgumentException(
                                  $"File '{vcfPath}' should contain one genotypes column corresponding to sample {sampleName}");
                    }
                    sampleIndex = reader.Samples.IndexOf(sampleName);
                }

                ploidy.HeaderLine = string.Join(" ", reader.HeaderLines);

                VcfVariant record;
                while (reader.GetNextVariant(out record))
                {
                    string chromosome = record.ReferenceName;
                    if (!ploidy.PloidyByChromosome.ContainsKey(chromosome))
                    {
                        ploidy.PloidyByChromosome[chromosome] = new List <PloidyInterval>();
                    }
                    PloidyInterval interval = new PloidyInterval(chromosome);
                    interval.Start = record.ReferencePosition;
                    interval.End   = int.Parse(record.InfoFields["END"], invariant);

                    var genotypeColumn = record.GenotypeColumns[sampleIndex];
                    if (!genotypeColumn.ContainsKey("CN"))
                    {
                        throw new ArgumentException($"File '{vcfPath}' must contain one genotype CN column!");
                    }
                    // A missing value (".") falls back to ploidy 2.
                    var value = genotypeColumn["CN"];
                    interval.Ploidy = value == "." ? 2 : int.Parse(value, invariant);

                    ploidy.PloidyByChromosome[chromosome].Add(interval);
                }
            }
            return(ploidy);
        }
示例#3
0
 /// <summary>
 /// Writes the segments of a single sample to <paramref name="outVcfPath"/>: the VCF header first,
 /// then the variant records.
 /// </summary>
 /// <remarks>
 /// The sample name and the segment list are each wrapped in a one-element list before being handed
 /// to <c>WriteVcfHeader</c> and <c>WriteVariants</c>. The segment list passed to
 /// <c>WriteVariants</c> is a copy made with <c>ToList()</c>.
 /// </remarks>
 public static void WriteSegments(string outVcfPath, List <CanvasSegment> segments, double?diploidCoverage,
                                  string wholeGenomeFastaDirectory, string sampleName,
                                  List <string> extraHeaders, PloidyInfo ploidy, int qualityThreshold, bool isPedigreeInfoSupplied, int?denovoQualityThreshold = null)
 {
     using (BgzipOrStreamWriter vcfWriter = new BgzipOrStreamWriter(outVcfPath))
     {
         List <string> sampleNames = new List <string> { sampleName };
         var genome = WriteVcfHeader(segments, diploidCoverage, wholeGenomeFastaDirectory, sampleNames,
                                     extraHeaders, qualityThreshold, vcfWriter, denovoQualityThreshold);

         // Copy taken only after the header has been written, as before.
         List <List <CanvasSegment> > segmentsPerSample = new List <List <CanvasSegment> > { segments.ToList() };
         WriteVariants(segmentsPerSample, ploidy, genome, vcfWriter, isPedigreeInfoSupplied, denovoQualityThreshold);
     }
 }
示例#4
0
        /// <summary>
        /// Loads reference ploidy intervals from a tab-delimited BED-style file read through <c>GzipReader</c>.
        /// </summary>
        /// <remarks>
        /// A null or empty <paramref name="filePath"/> yields an empty <c>PloidyInfo</c> without
        /// touching the file system. Any line starting with "##" is stored (trimmed) in
        /// <c>HeaderLine</c>; if several such lines exist only the last one is kept. Empty lines and
        /// other '#' lines are skipped. Remaining lines are split on tabs and read as: column 0 =
        /// chromosome, column 1 = start, column 2 = end, column 4 = ploidy (column 3 is not used).
        /// The prefix test is ordinal and numbers are parsed with the invariant culture.
        /// </remarks>
        /// <param name="filePath">Path of the ploidy BED file; may be null or empty.</param>
        /// <returns>A <c>PloidyInfo</c> holding the intervals grouped by chromosome name.</returns>
        public static PloidyInfo LoadPloidyFromBedFile(string filePath)
        {
            PloidyInfo ploidy = new PloidyInfo();

            if (string.IsNullOrEmpty(filePath))
            {
                return(ploidy);
            }

            // BED files are machine-readable: never let the current UI culture influence parsing.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;
            int count     = 0;

            using (GzipReader reader = new GzipReader(filePath))
            {
                string fileLine;
                while ((fileLine = reader.ReadLine()) != null)
                {
                    // save anything that looks like a vcf header line (we will add it to the output vcf)
                    // TODO: support adding multiple header lines to the output vcf
                    if (fileLine.StartsWith("##", StringComparison.Ordinal))
                    {
                        ploidy.HeaderLine = fileLine.Trim();
                        continue;
                    }
                    // Skip blank lines and single-'#' comment lines.
                    if (fileLine.Length == 0 || fileLine[0] == '#')
                    {
                        continue;
                    }

                    string[] bits       = fileLine.Split('\t');
                    string   chromosome = bits[0];
                    if (!ploidy.PloidyByChromosome.ContainsKey(chromosome))
                    {
                        ploidy.PloidyByChromosome[chromosome] = new List <PloidyInterval>();
                    }

                    PloidyInterval interval = new PloidyInterval(chromosome);
                    interval.Start  = int.Parse(bits[1], invariant);
                    interval.End    = int.Parse(bits[2], invariant);
                    interval.Ploidy = int.Parse(bits[4], invariant); // column 3 is intentionally ignored
                    ploidy.PloidyByChromosome[chromosome].Add(interval);
                    count++;
                }
            }
            Console.WriteLine("Reference ploidy: Loaded {0} intervals across {1} chromosomes", count, ploidy.PloidyByChromosome.Keys.Count);
            return(ploidy);
        }
示例#5
0
 /// <summary>
 /// Writes the segments of a single sample to <paramref name="outVcfPath"/>: the VCF header first,
 /// then the variant records.
 /// </summary>
 /// <remarks>
 /// Each segment is wrapped in a <c>SampleMap</c> keyed by a <c>SampleId</c> built from
 /// <paramref name="sampleName"/>, and the single ploidy is wrapped in a one-element list, before
 /// both are handed to <c>WriteVariants</c>. The per-segment projection is a lazy LINQ query.
 /// </remarks>
 public static void WriteSegments(string outVcfPath, List <CanvasSegment> segments, double?diploidCoverage,
                                  string wholeGenomeFastaDirectory, string sampleName,
                                  List <string> extraHeaders, PloidyInfo ploidy, int qualityThreshold, bool isPedigreeInfoSupplied, int?denovoQualityThreshold, int?sizeThreshold)
 {
     using (BgzipOrStreamWriter vcfWriter = new BgzipOrStreamWriter(outVcfPath))
     {
         var sampleNames = new List <string> { sampleName };
         var genome = WriteVcfHeader(segments, diploidCoverage, wholeGenomeFastaDirectory, sampleNames,
                                     extraHeaders, vcfWriter, qualityThreshold, denovoQualityThreshold, sizeThreshold);

         var singleSampleId = new SampleId(sampleName);
         var perSegmentMaps = segments.Select(segment => new SampleMap <CanvasSegment> { { singleSampleId, segment } });
         var ploidies       = new List <PloidyInfo> { ploidy };

         WriteVariants(perSegmentMaps, ploidies, genome, vcfWriter, denovoQualityThreshold);
     }
 }
示例#6
0
        /// <summary>
        /// Loads per-interval reference ploidy from a VCF file, reading the genotype column at
        /// <paramref name="sampleIndex"/>.
        /// </summary>
        /// <remarks>
        /// Each VCF record yields one interval: start is the record's reference position, end is the
        /// INFO "END" value and ploidy is the "CN" field of the selected genotype column ("." is read
        /// as 2). Numeric fields are parsed with the invariant culture. <c>HeaderLine</c> is not
        /// populated by this method.
        /// </remarks>
        /// <param name="vcfPath">Path of the ploidy VCF.</param>
        /// <param name="sampleIndex">Zero-based index into each record's genotype columns.</param>
        /// <returns>A <c>PloidyInfo</c> holding the intervals grouped by chromosome name.</returns>
        /// <exception cref="ArgumentException">A record's genotype column has no "CN" field.</exception>
        private static PloidyInfo LoadPloidyFromVcfFile(string vcfPath, int sampleIndex)
        {
            // VCF is machine-readable: never let the current UI culture influence parsing.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;
            PloidyInfo ploidy = new PloidyInfo();

            using (VcfReader reader = new VcfReader(vcfPath))
            {
                // The ploidy.vcf header lines need to be updated to include reference sex chromosome
                // info for one or multiple samples; until then the header is deliberately not copied:
                //ploidy.HeaderLine = string.Join(" ", reader.HeaderLines);

                while (reader.GetNextVariant(out var record))
                {
                    string chromosome = record.ReferenceName;
                    if (!ploidy.PloidyByChromosome.ContainsKey(chromosome))
                    {
                        ploidy.PloidyByChromosome[chromosome] = new List <PloidyInterval>();
                    }
                    PloidyInterval interval = new PloidyInterval(chromosome)
                    {
                        Start = record.ReferencePosition,
                        End   = int.Parse(record.InfoFields["END"], invariant)
                    };

                    var genotypeColumn = record.GenotypeColumns[sampleIndex];
                    if (!genotypeColumn.ContainsKey("CN"))
                    {
                        throw new ArgumentException($"File '{vcfPath}' must contain one genotype CN column!");
                    }
                    // A missing value (".") falls back to ploidy 2.
                    var value = genotypeColumn["CN"];
                    interval.Ploidy = value == "." ? 2 : int.Parse(value, invariant);

                    ploidy.PloidyByChromosome[chromosome].Add(interval);
                }
            }
            return(ploidy);
        }
示例#7
0
        /// <summary>
        /// Loads reference ploidy intervals from a tab-delimited BED-style file read through <c>GzipReader</c>.
        /// </summary>
        /// <remarks>
        /// A line starting with "##ExpectedSexChromosomeKaryotype" is stored (trimmed) in
        /// <c>HeaderLine</c>. Empty lines and any other line starting with '#' are skipped. Every
        /// remaining line is split on tabs and read as: column 0 = chromosome, column 1 = start,
        /// column 2 = end, column 4 = ploidy (column 3 is not used). The prefix test is ordinal and
        /// the numeric columns are parsed with the invariant culture, so the result does not depend
        /// on the machine's regional settings.
        /// </remarks>
        /// <param name="filePath">Path of the ploidy BED file.</param>
        /// <returns>A <c>PloidyInfo</c> holding the intervals grouped by chromosome name.</returns>
        public static PloidyInfo LoadPloidyFromBedFile(string filePath)
        {
            // BED files are machine-readable: never let the current UI culture influence parsing.
            var        invariant = System.Globalization.CultureInfo.InvariantCulture;
            PloidyInfo ploidy    = new PloidyInfo();
            int        count     = 0;

            using (GzipReader reader = new GzipReader(filePath))
            {
                string fileLine;
                while ((fileLine = reader.ReadLine()) != null)
                {
                    if (fileLine.StartsWith("##ExpectedSexChromosomeKaryotype", StringComparison.Ordinal))
                    {
                        ploidy.HeaderLine = fileLine.Trim();
                        continue;
                    }
                    // Skip blank lines and every other comment/header line.
                    if (fileLine.Length == 0 || fileLine[0] == '#')
                    {
                        continue;
                    }

                    string[] bits       = fileLine.Split('\t');
                    string   chromosome = bits[0];
                    if (!ploidy.PloidyByChromosome.ContainsKey(chromosome))
                    {
                        ploidy.PloidyByChromosome[chromosome] = new List <PloidyInterval>();
                    }

                    PloidyInterval interval = new PloidyInterval();
                    interval.Start  = int.Parse(bits[1], invariant);
                    interval.End    = int.Parse(bits[2], invariant);
                    interval.Ploidy = int.Parse(bits[4], invariant); // column 3 is intentionally ignored
                    ploidy.PloidyByChromosome[chromosome].Add(interval);
                    count++;
                }
            }
            Console.WriteLine("Reference ploidy: Loaded {0} intervals across {1} chromosomes", count, ploidy.PloidyByChromosome.Keys.Count);
            return(ploidy);
        }
示例#8
0
        /// <summary>
        /// Returns the reference copy number for the region <paramref name="chrom"/>:<paramref name="start"/>-<paramref name="end"/>.
        /// </summary>
        /// <remarks>
        /// When no reference ploidy is available, <paramref name="defaultPloidy"/> is returned.
        /// Otherwise a temporary <c>CanvasSegment</c> with an empty count list is built for the region
        /// and passed to <c>GetReferenceCopyNumber</c>.
        /// </remarks>
        private static int GetPloidy(PloidyInfo referencePloidy, string chrom, int start, int end, int defaultPloidy = 2)
        {
            if (referencePloidy == null)
            {
                return defaultPloidy;
            }

            List<float> noCounts = new List<float>();
            CanvasSegment region = new CanvasSegment(chrom, start, end, noCounts);
            int referenceCopyNumber = referencePloidy.GetReferenceCopyNumber(region);
            return referenceCopyNumber;
        }
示例#9
0
        /// <summary>
        /// Generate a tabular file with information about coverage and allele frequency for each chunk of the genome.
        /// This file can be used to generate a pretty plot of coverage versus MAF.
        /// </summary>
        /// <remarks>
        /// The genome is walked chromosome by chromosome (order taken from GenomeSize.xml) in windows of
        /// 100,000 bases. One tab-separated row is written per window. A window with fewer than the
        /// minimum number of count bins gets only the chromosome/start/end columns.
        /// Chromosomes that have no segments are skipped entirely.
        /// </remarks>
        /// <param name="segments">Segments supplying copy number, bin counts and allele frequencies.</param>
        /// <param name="normalDiploidCoverage">Coverage used to normalize the median bin count; required when <paramref name="segments"/> is non-empty.</param>
        /// <param name="referencePloidy">Optional reference ploidy; when null (or when no interval overlaps) ploidy 2 is reported.</param>
        /// <param name="filePath">Output file; created or overwritten.</param>
        /// <param name="referenceFolder">Folder containing GenomeSize.xml.</param>
        public static void WriteCoveragePlotData(List <CanvasSegment> segments, double?normalDiploidCoverage, PloidyInfo referencePloidy,
                                                 string filePath, string referenceFolder)
        {
            // normalDiploidCoverage.Value is dereferenced below whenever a window has enough data,
            // so fail early if there are segments but no coverage.
            if (segments.Any() && !normalDiploidCoverage.HasValue)
            {
                throw new Illumina.Common.IlluminaException("normal diploid coverage must be specified");
            }
            int pointLength       = 100000;
            int minimumBinsToPlot = GetMinimumBinsForCoveragePlotPoint(segments, pointLength);

            Dictionary <string, List <CanvasSegment> > segmentsByChromosome = GetSegmentsByChromosome(segments);
            GenomeMetadata genome = new GenomeMetadata();

            genome.Deserialize(Path.Combine(referenceFolder, "GenomeSize.xml"));

            // Scratch lists reused (cleared) for every window.
            List <float> counts = new List <float>();
            List <float> MAF    = new List <float>();
            List <float> VF     = new List <float>();

            using (FileStream stream = new FileStream(filePath, FileMode.Create, FileAccess.Write))
                using (StreamWriter writer = new StreamWriter(stream))
                {
                    writer.NewLine = "\n";
                    // Header row: fixed columns followed by one column per variant-frequency bin.
                    writer.Write("#Chromosome\tStart\tEnd\tCopyNumber\tMajorChromosomeCount\tMedianHits\tNormalizedCoverage\tMedianMinorAlleleFrequency\tReferencePloidy\t");
                    for (int i = 0; i < NumberVariantFrequencyBins; i++)
                    {
                        writer.Write("VariantFrequencyBin{0}\t", i);
                    }
                    writer.WriteLine();
                    foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
                    {
                        // Chromosomes without any segment produce no rows at all.
                        if (!segmentsByChromosome.ContainsKey(chromosome.Name))
                        {
                            continue;
                        }
                        int pointStartPos = 0; // 0-based start
                        while (pointStartPos < chromosome.Length)
                        {
                            int pointEndPos = (int)Math.Min(chromosome.Length, pointStartPos + pointLength); // 1-based end
                            counts.Clear();
                            MAF.Clear();
                            VF.Clear();
                            // Overlapping bases per "copyNumber majorChromosomeCount" pair and per copy number.
                            Dictionary <string, long> CopyNumberAndChromCount = new Dictionary <string, long>();
                            Dictionary <int, long>    basesByCopyNumber       = new Dictionary <int, long>();
                            // Accumulate counts and MAF from the segments:
                            List <CanvasSegment> chrSegments = new List <CanvasSegment>();
                            // NOTE(review): this check is always true here because chromosomes without
                            // segments were skipped by the 'continue' above.
                            if (segmentsByChromosome.ContainsKey(chromosome.Name))
                            {
                                chrSegments = segmentsByChromosome[chromosome.Name];
                            }
                            List <CanvasSegment> overlapSegments = new List <CanvasSegment>();
                            foreach (CanvasSegment segment in chrSegments)
                            {
                                // Ignore segments lying entirely outside the current window.
                                if (segment.Begin > pointEndPos)
                                {
                                    continue;
                                }
                                if (segment.End < pointStartPos)
                                {
                                    continue;
                                }

                                // weight = number of bases the segment shares with the window.
                                int    weight = Math.Min(segment.End, pointEndPos) - Math.Max(segment.Begin, pointStartPos);
                                string key    = string.Format("{0} {1}", segment.CopyNumber, segment.MajorChromosomeCount);
                                if (!CopyNumberAndChromCount.ContainsKey(key))
                                {
                                    CopyNumberAndChromCount[key] = 0;
                                }
                                CopyNumberAndChromCount[key] += weight;
                                if (!basesByCopyNumber.ContainsKey(segment.CopyNumber))
                                {
                                    basesByCopyNumber[segment.CopyNumber] = 0;
                                }
                                basesByCopyNumber[segment.CopyNumber] += weight;
                                overlapSegments.Add(segment);
                            }

                            // Note the most common copy number:
                            // (stays 0 when nothing overlaps or every overlap has zero weight)
                            long bestCount       = 0;
                            int  majorCopyNumber = 0;
                            foreach (int key in basesByCopyNumber.Keys)
                            {
                                if (basesByCopyNumber[key] > bestCount)
                                {
                                    bestCount       = basesByCopyNumber[key];
                                    majorCopyNumber = key;
                                }
                            }

                            // Find the most common major chromosome count, for the most common copy number:
                            int?majorChromosomeCount = null;
                            bestCount = 0;
                            foreach (string key in CopyNumberAndChromCount.Keys)
                            {
                                string[] bits = key.Split();
                                // An empty second token means the key was built without a major chromosome count.
                                if (bits[1].Length == 0)
                                {
                                    continue;
                                }
                                if (int.Parse(bits[0]) != majorCopyNumber)
                                {
                                    continue;
                                }
                                long count = CopyNumberAndChromCount[key];
                                // Strict '<': on a tie the key enumerated later replaces the earlier one.
                                if (count < bestCount)
                                {
                                    continue;
                                }
                                bestCount            = count;
                                majorChromosomeCount = int.Parse(bits[1]);
                            }

                            // Note allele frequency and coverage info, for all overlap segments that match (more or less)
                            // the most common copy number:
                            foreach (CanvasSegment segment in overlapSegments)
                            {
                                // "More or less": the segment must be on the same side of copy number 2
                                // (below, exactly 2, or above) as the window's majority copy number.
                                if ((majorCopyNumber == 2 && segment.CopyNumber != 2) ||
                                    (majorCopyNumber < 2 && segment.CopyNumber >= 2) ||
                                    (majorCopyNumber > 2 && segment.CopyNumber <= 2))
                                {
                                    continue;
                                }
                                float segLength = segment.End - segment.Begin;

                                // Add counts to the overall list:
                                // The window boundaries are mapped proportionally onto the index range of
                                // segment.Counts (presumably assumes bins are spread evenly over the segment — TODO confirm).
                                int firstIndex = 0;
                                if (pointStartPos > segment.Begin)
                                {
                                    firstIndex = (int)((float)segment.Counts.Count * (pointStartPos - segment.Begin) / segLength);
                                }
                                int lastIndex = segment.Counts.Count;
                                if (pointEndPos < segment.End)
                                {
                                    lastIndex = (int)((float)segment.Counts.Count * (pointEndPos - segment.Begin) / segLength);
                                }
                                for (int index = firstIndex; index < lastIndex; index++)
                                {
                                    counts.Add(segment.Counts[index]);
                                }

                                // Add MAF to the overall list:
                                // Same proportional mapping, applied to the allele-frequency list.
                                firstIndex = 0;
                                if (pointStartPos > segment.Begin)
                                {
                                    firstIndex = (int)((float)segment.Alleles.Frequencies.Count * (pointStartPos - segment.Begin) / segLength);
                                }
                                lastIndex = segment.Alleles.Frequencies.Count;
                                if (pointEndPos < segment.End)
                                {
                                    lastIndex = (int)((float)segment.Alleles.Frequencies.Count * (pointEndPos - segment.Begin) / segLength);
                                }
                                for (int index = firstIndex; index < lastIndex; index++)
                                {
                                    float tempMAF = segment.Alleles.Frequencies[index];
                                    // VF keeps the raw frequency; MAF keeps it folded into [0, 0.5].
                                    VF.Add(tempMAF);
                                    if (tempMAF > 0.5)
                                    {
                                        tempMAF = 1 - tempMAF;
                                    }
                                    MAF.Add(tempMAF);
                                }
                            }

                            // Write output for this point:
                            writer.Write("{0}\t{1}\t{2}\t", chromosome.Name, pointStartPos, pointEndPos);

                            // Write counts if we have reasonable amounts of data; write MAF if we have reasonable amounts of data.
                            // (Note: Observed that for germline data on chrY we often had well under 100 counts given the new, smaller bin size)
                            if (counts.Count >= minimumBinsToPlot)
                            {
                                writer.Write("{0}\t", majorCopyNumber);
                                writer.Write("{0}\t", majorChromosomeCount);
                                // Median taken as the element at Count / 2 of the sorted list (upper median for even sizes).
                                counts.Sort();
                                double medianHits = counts[counts.Count / 2];
                                writer.Write("{0:F2}\t", medianHits);
                                // Scaled so that a window at normal diploid coverage reports 2.
                                double normalizedCount = 2 * medianHits / normalDiploidCoverage.Value;
                                writer.Write("{0:F2}\t", normalizedCount);
                                if (MAF.Count >= 10)
                                {
                                    MAF.Sort();
                                    writer.Write("{0}\t", MAF[MAF.Count / 2]);
                                }
                                else
                                {
                                    // Too few allele frequencies: leave the MAF column empty.
                                    writer.Write("\t");
                                }
                                int refPloidy = 2;
                                if (referencePloidy != null && referencePloidy.PloidyByChromosome.ContainsKey(chromosome.Name))
                                {
                                    // No break: when several intervals overlap the window, the last one in the list wins.
                                    foreach (var interval in referencePloidy.PloidyByChromosome[chromosome.Name])
                                    {
                                        if (interval.Start <= pointEndPos && interval.End >= pointStartPos)
                                        {
                                            refPloidy = interval.Ploidy;
                                        }
                                    }
                                }
                                writer.Write("{0}\t", refPloidy);
                                if (VF.Count >= 10)
                                {
                                    // bin VF
                                    float[] vfDistribution = new float[NumberVariantFrequencyBins];
                                    foreach (float vf in VF)
                                    {
                                        // Bins are 0.01 wide; anything past the last bin is clamped into it.
                                        int binNumber = Math.Min(vfDistribution.Length - 1, (int)Math.Floor(vf / 0.01));
                                        vfDistribution[binNumber]++;
                                    }
                                    for (int i = 0; i < vfDistribution.Length; i++)
                                    {
                                        // Convert the bin count to a percentage of all frequencies in the window.
                                        vfDistribution[i] = vfDistribution[i] / (float)VF.Count * 100.0f;
                                        writer.Write("{0:F2}\t", vfDistribution[i]);
                                    }
                                }
                                else
                                {
                                    // Too few frequencies: emit empty cells so the column count stays fixed.
                                    for (int i = 0; i < NumberVariantFrequencyBins; i++)
                                    {
                                        writer.Write("\t");
                                    }
                                }
                            }
                            writer.WriteLine();
                            pointStartPos += pointLength;
                        }
                    }
                }
        }
示例#10
0
        private const double EMLikelihoodThres = 1; // Controls when to update means
        #endregion

        /// <summary>
        /// Reads the expected sex-chromosome ploidy from a .bed file and stores it in <c>ReferencePloidy</c>.
        /// With this information a call such as copy number 2 on chrX can be reported as a GAIN
        /// (rather than REF) for a male (XY) sample.
        /// </summary>
        /// <param name="filePath">Path of the ploidy .bed file, forwarded to <c>PloidyInfo.LoadPloidyFromBedFile</c>.</param>
        public void LoadReferencePloidy(string filePath)
        {
            // Trace the call on the console before loading.
            Console.WriteLine(">>>LoadReferencePloidy({0})", filePath);

            var loadedPloidy = PloidyInfo.LoadPloidyFromBedFile(filePath);
            ReferencePloidy = loadedPloidy;
        }
示例#11
0
        /// <summary>
        /// Generate a tabular file with information about coverage and allele frequency for each chunk of the genome.
        /// This file can be used to generate a pretty plot of coverage versus MAF.  
        /// </summary>
        static public void WriteCoveragePlotData(List<CanvasSegment> segments, double normalDiploidCoverage, PloidyInfo referencePloidy,
            string filePath, string referenceFolder)
        {
            Dictionary<string, List<CanvasSegment>> segmentsByChromosome = GetSegmentsByChromosome(segments);
            GenomeMetadata genome = new GenomeMetadata();
            genome.Deserialize(Path.Combine(referenceFolder, "GenomeSize.xml"));
            int pointLength = 100000;
            List<float> counts = new List<float>();
            List<float> MAF = new List<float>();
            List<float> VF = new List<float>();
            using (StreamWriter writer = new StreamWriter(filePath))
            {
                writer.NewLine = "\n";
                writer.Write("#Chromosome\tStart\tEnd\tCopyNumber\tMajorChromosomeCount\tMedianHits\tNormalizedCoverage\tMedianMinorAlleleFrequency\tReferencePloidy\t");
                for (int i = 0; i < NumberVariantFrequencyBins; i++) { writer.Write("VariantFrequencyBin{0}\t", i); }
                writer.WriteLine();
                foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
                {
                    if (chromosome.IsMito()) continue;
                    int pointStartPos = 0; // 0-based start
                    while (pointStartPos < chromosome.Length)
                    {
                        int pointEndPos = (int)Math.Min(chromosome.Length, pointStartPos + pointLength); // 1-based end
                        counts.Clear();
                        MAF.Clear();
                        VF.Clear();
                        Dictionary<string, long> CopyNumberAndChromCount = new Dictionary<string, long>();
                        Dictionary<int, long> basesByCopyNumber = new Dictionary<int, long>();
                        // Accumulate counts and MAF from the segments:
                        List<CanvasSegment> chrSegments = new List<CanvasSegment>();
                        if (segmentsByChromosome.ContainsKey(chromosome.Name)) chrSegments = segmentsByChromosome[chromosome.Name];
                        List<CanvasSegment> overlapSegments = new List<CanvasSegment>();
                        foreach (CanvasSegment segment in chrSegments)
                        {
                            if (segment.Begin > pointEndPos) continue;
                            if (segment.End < pointStartPos) continue;

                            int weight = Math.Min(segment.End, pointEndPos) - Math.Max(segment.Begin, pointStartPos);
                            string key = string.Format("{0} {1}", segment.copyNumber, segment.MajorChromosomeCount);
                            if (!CopyNumberAndChromCount.ContainsKey(key)) CopyNumberAndChromCount[key] = 0;
                            CopyNumberAndChromCount[key] += weight;
                            if (!basesByCopyNumber.ContainsKey(segment.copyNumber)) basesByCopyNumber[segment.copyNumber] = 0;
                            basesByCopyNumber[segment.copyNumber] += weight;
                            overlapSegments.Add(segment);
                        }

                        // Note the most common copy number:
                        long bestCount = 0;
                        int majorCopyNumber = 0;
                        foreach (int key in basesByCopyNumber.Keys)
                        {
                            if (basesByCopyNumber[key] > bestCount)
                            {
                                bestCount = basesByCopyNumber[key];
                                majorCopyNumber = key;
                            }
                        }

                        // Find the most common major chromosome count, for the most common copy number:
                        int? majorChromosomeCount = null;
                        bestCount = 0;
                        foreach (string key in CopyNumberAndChromCount.Keys)
                        {
                            string[] bits = key.Split();
                            if (bits[1].Length == 0) continue;
                            if (int.Parse(bits[0]) != majorCopyNumber) continue;
                            long count = CopyNumberAndChromCount[key];
                            if (count < bestCount) continue;
                            bestCount = count;
                            majorChromosomeCount = int.Parse(bits[1]);
                        }

                        // Note allele frequency and coverage info, for all overlap segments that match (more or less)
                        // the most common copy number:
                        foreach (CanvasSegment segment in overlapSegments)
                        {
                            if ((majorCopyNumber == 2 && segment.copyNumber != 2) ||
                                (majorCopyNumber < 2 && segment.copyNumber >= 2) ||
                                (majorCopyNumber > 2 && segment.copyNumber <= 2))
                                continue;
                            float segLength = segment.End - segment.Begin;

                            // Add counts to the overall list:
                            int firstIndex = 0;
                            if (pointStartPos > segment.Begin)
                            {
                                firstIndex = (int)((float)segment.Counts.Count * (pointStartPos - segment.Begin) / segLength);
                            }
                            int lastIndex = segment.Counts.Count;
                            if (pointEndPos < segment.End)
                            {
                                lastIndex = (int)((float)segment.Counts.Count * (pointEndPos - segment.Begin) / segLength);
                            }
                            for (int index = firstIndex; index < lastIndex; index++) counts.Add(segment.Counts[index]);

                            // Add MAF to the overall list:
                            firstIndex = 0;
                            if (pointStartPos > segment.Begin)
                            {
                                firstIndex = (int)((float)segment.VariantFrequencies.Count * (pointStartPos - segment.Begin) / segLength);
                            }
                            lastIndex = segment.VariantFrequencies.Count;
                            if (pointEndPos < segment.End)
                            {
                                lastIndex = (int)((float)segment.VariantFrequencies.Count * (pointEndPos - segment.Begin) / segLength);
                            }
                            for (int index = firstIndex; index < lastIndex; index++)
                            {
                                float tempMAF = segment.VariantFrequencies[index];
                                VF.Add(tempMAF);
                                if (tempMAF > 0.5) tempMAF = 1 - tempMAF;
                                MAF.Add(tempMAF);
                            }
                        }

                        // Write output for this point:
                        writer.Write("{0}\t{1}\t{2}\t", chromosome.Name, pointStartPos, pointEndPos);

                        // Write counts if we have reasonable amounts of data; write MAF if we have reasonable amounts of data.
                        // (Note: Observed that for germline data on chrY we often had well under 100 counts given the new, smaller bin size)
                        if (counts.Count >= 30)
                        {
                            writer.Write("{0}\t", majorCopyNumber);
                            writer.Write("{0}\t", majorChromosomeCount);
                            counts.Sort();
                            double medianHits = counts[counts.Count / 2];
                            writer.Write("{0:F2}\t", medianHits);
                            double normalizedCount = 2 * medianHits / normalDiploidCoverage;
                            writer.Write("{0:F2}\t", normalizedCount);
                            if (MAF.Count >= 10)
                            {
                                MAF.Sort();
                                writer.Write("{0}\t", MAF[MAF.Count / 2]);
                            }
                            else
                            {
                                writer.Write("\t");
                            }
                            int refPloidy = 2;
                            if (referencePloidy != null && referencePloidy.PloidyByChromosome.ContainsKey(chromosome.Name))
                            {
                                foreach (var interval in referencePloidy.PloidyByChromosome[chromosome.Name])
                                {
                                    if (interval.Start <= pointEndPos && interval.End >= pointStartPos)
                                    {
                                        refPloidy = interval.Ploidy;
                                    }
                                }
                            }
                            writer.Write("{0}\t", refPloidy);
                            if (VF.Count >= 10)
                            {
                                // bin VF
                                float[] vfDistribution = new float[NumberVariantFrequencyBins];
                                foreach (float vf in VF)
                                {
                                    int binNumber = Math.Min(vfDistribution.Length - 1, (int)Math.Floor(vf / 0.01));
                                    vfDistribution[binNumber]++;
                                }
                                for (int i = 0; i < vfDistribution.Length; i++)
                                {
                                    vfDistribution[i] = vfDistribution[i] / (float)VF.Count * 100.0f;
                                    writer.Write("{0:F2}\t", vfDistribution[i]);
                                }
                            }
                            else
                            {
                                for (int i = 0; i < NumberVariantFrequencyBins; i++) writer.Write("\t");
                            }
                        }
                        writer.WriteLine();
                        pointStartPos += pointLength;
                    }
                }
            }
        }
示例#12
0
        /// <summary>
        /// Outputs the copy number calls to a VCF file: a fixed header followed by one record per reported
        /// segment, grouped by chromosome in the order of the sequences deserialized from GenomeSize.xml.
        /// </summary>
        /// <param name="outVcfPath">File to write to.</param>
        /// <param name="segments">List of segments to write out.</param>
        /// <param name="reference">Directory containing GenomeSize.xml (read for the contig list); the path of genome.fa under it is written to the ##reference header.</param>
        /// <param name="sampleName">Name written as the sample column of the #CHROM header line.</param>
        /// <param name="extraHeaders">Additional header lines written verbatim after ##reference; may be null.</param>
        /// <param name="reportPloidy">Tumour-normal workflow: declares and writes MCC, and treats copy-number-2 segments whose major chromosome count is not 1 (LOH) as non-reference.</param>
        /// <param name="ploidy">Optional reference ploidy; when null every segment is compared against a reference copy number of 2.</param>
        /// <param name="reportAllSites">When false, reference calls are skipped.</param>
        /// <param name="reportGermlineGenotype">Resequencing workflow: declares and writes GT. Cannot be combined with <paramref name="reportPloidy"/>.</param>
        /// <exception cref="ArgumentException">Both <paramref name="reportGermlineGenotype"/> and <paramref name="reportPloidy"/> are true.</exception>
        public static void WriteSegments(string outVcfPath, List<CanvasSegment> segments, string reference, string sampleName,
            List<string> extraHeaders, bool reportPloidy, PloidyInfo ploidy, bool reportAllSites = false, bool reportGermlineGenotype = false)
        {
            // report GT for resequencing workflow and MCC for tumour-normal workflow
            if (reportGermlineGenotype && reportPloidy)
            {
                throw new ArgumentException("WriteSegments VCF file output error: reportGermlineGenotype and reportPloidy can not be both true");
            }

            using (BgzipOrStreamWriter writer = new BgzipOrStreamWriter(outVcfPath))
            {
                // Write the VCF header:
                writer.WriteLine("##fileformat=VCFv4.1");
                writer.WriteLine("##source=Isas," + CanvasCommon.CanvasVersionInfo.NameString + " " + CanvasCommon.CanvasVersionInfo.VersionString);
                writer.WriteLine("##reference={0}", Path.Combine(reference, "genome.fa"));
                if (extraHeaders != null)
                {
                    foreach (string header in extraHeaders)
                    {
                        writer.WriteLine(header);
                    }
                }
                GenomeMetadata genome = new GenomeMetadata();
                genome.Deserialize(Path.Combine(reference, "GenomeSize.xml"));
                foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
                {
                    writer.WriteLine("##contig=<ID={0},length={1}>", chromosome.Name, chromosome.Length);
                }

                writer.WriteLine("##ALT=<ID=CNV,Description=\"Copy number variable region\">");
                writer.WriteLine("##FILTER=<ID=q10,Description=\"Quality below 10\">");
                writer.WriteLine("##FILTER=<ID=L10kb,Description=\"Length shorter than 10kb\">");
                writer.WriteLine("##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">");
                writer.WriteLine("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
                writer.WriteLine("##INFO=<ID=CNVLEN,Number=1,Type=Integer,Description=\"Number of reference positions spanned by this CNV\">");
                if (reportGermlineGenotype)
                {
                    writer.WriteLine("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
                }
                writer.WriteLine("##FORMAT=<ID=RC,Number=1,Type=Float,Description=\"Mean counts per bin in the region\">");
                writer.WriteLine("##FORMAT=<ID=BC,Number=1,Type=Float,Description=\"Number of bins in the region\">");
                writer.WriteLine("##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number genotype for imprecise events\">");
                if (reportPloidy)
                {
                    writer.WriteLine("##FORMAT=<ID=MCC,Number=1,Type=Integer,Description=\"Major chromosome count (equal to copy number for LOH regions)\">");
                }
                writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sampleName);

                SanityCheckChromosomeNames(genome, segments);

                // Iterate chromosomes in the outer loop so records come out in genome order,
                // whatever order the segment list is in.
                foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
                {
                    foreach (CanvasSegment segment in segments)
                    {
                        // Case-insensitive match without allocating lower-cased copies of both names.
                        if (!segment.Chr.Equals(chromosome.Name, StringComparison.OrdinalIgnoreCase))
                        {
                            continue;
                        }

                        int referenceCN = 2;
                        if (ploidy != null)
                        {
                            referenceCN = ploidy.GetReferenceCopyNumber(segment);
                        }

                        bool isReferenceCall = segment.CopyNumber == referenceCN;
                        // If we're reporting ploidy and there's LOH, this isn't a reference call.
                        if (reportPloidy && segment.CopyNumber == 2 && segment.MajorChromosomeCount.HasValue && segment.MajorChromosomeCount != 1)
                        {
                            isReferenceCall = false;
                        }

                        // We can skip reporting of reference sites:
                        if (!reportAllSites && isReferenceCall)
                        {
                            continue;
                        }

                        // FILTER column: q10 and/or L10kb joined by ';', PASS when neither applies.
                        string filter = null;
                        if (segment.QScore < 10)
                        {
                            filter = "q10";
                        }
                        if (segment.End - segment.Begin < 10000)
                        {
                            filter = (filter == null) ? "L10kb" : filter + ";L10kb";
                        }
                        if (filter == null)
                        {
                            filter = "PASS";
                        }

                        string cnvtype;
                        if (segment.CopyNumber < referenceCN)
                        {
                            cnvtype = "LOSS";
                        }
                        else if (segment.CopyNumber > referenceCN)
                        {
                            cnvtype = "GAIN";
                        }
                        else
                        {
                            cnvtype = "REF";
                        }

                        // The Dude abides... from vcf 4.1 spec:
                        //     If any of the ALT alleles is a symbolic allele (an angle-bracketed ID String "<ID>") then the padding base is required and POS denotes the
                        //     coordinate of the base preceding the polymorphism.
                        writer.Write("{0}\t{1}\tCanvas:{2}:{0}:{3}-{4}\t", segment.Chr, isReferenceCall ? segment.Begin + 1 : segment.Begin, cnvtype, segment.Begin + 1, segment.End);
                        writer.Write("N\t{0}\t{1}\t{2}\t", isReferenceCall ? "." : "<CNV>", segment.QScore, filter);

                        //  INFO field
                        bool isCopyNumberChange = segment.copyNumber != referenceCN;
                        if (isCopyNumberChange)
                        {
                            writer.Write("SVTYPE=CNV;");
                        }
                        else if (!isReferenceCall)
                        {
                            writer.Write("SVTYPE=LOH;");
                        }
                        if (isCopyNumberChange || !isReferenceCall)
                        {
                            writer.Write("END={0};CNVLEN={1}", segment.End, segment.End - segment.Begin);
                        }
                        else
                        {
                            writer.Write("END={0}", segment.End);
                        }

                        //  FORMAT field
                        bool writeMajorChromosomeCount = reportPloidy && segment.MajorChromosomeCount.HasValue;
                        writer.Write(reportGermlineGenotype ? "\tGT:RC:BC:CN" : "\tRC:BC:CN");
                        if (writeMajorChromosomeCount)
                        {
                            writer.Write(":MCC");
                        }

                        // writing GT for resequencing workflow
                        if (reportGermlineGenotype)
                        {
                            writer.Write("\t{0}/{1}:", segment.MajorChromosomeCount, segment.CopyNumber);
                        }
                        else
                        {
                            writer.Write("\t");
                        }
                        writer.Write("{0}:{1}:{2}", Math.Round(segment.MeanCount, 0, MidpointRounding.AwayFromZero), segment.BinCount, segment.CopyNumber);

                        // writing MCC for tumour-normal workflow
                        if (writeMajorChromosomeCount)
                        {
                            writer.Write(":{0}", segment.MajorChromosomeCount);
                        }
                        writer.WriteLine();
                    }
                }
            }
        }
示例#13
0
 /// <summary>
 /// Opens <paramref name="outVcfPath"/> for writing, emits the VCF header via WriteVcfHeader and then
 /// the variant records for all samples via WriteVariants.
 /// </summary>
 /// <param name="outVcfPath">File to write to.</param>
 /// <param name="segments">One segment list per sample; the first sample's list is handed to the header writer.</param>
 /// <param name="diploidCoverage">Per-sample diploid coverage values; their GetMean result is passed to the header writer.</param>
 /// <param name="wholeGenomeFastaDirectory">Forwarded to WriteVcfHeader.</param>
 /// <param name="sampleNames">Forwarded to WriteVcfHeader.</param>
 /// <param name="extraHeaders">Forwarded to WriteVcfHeader.</param>
 /// <param name="ploidy">Forwarded to WriteVariants.</param>
 /// <param name="qualityThreshold">Forwarded to WriteVcfHeader.</param>
 /// <param name="isPedigreeInfoSupplied">Forwarded to WriteVariants.</param>
 /// <param name="denovoQualityThreshold">Forwarded to both WriteVcfHeader and WriteVariants.</param>
 public static void WriteMultiSampleSegments(string outVcfPath, List <List <CanvasSegment> > segments, List <double?> diploidCoverage,
                                             string wholeGenomeFastaDirectory, List <string> sampleNames, List <string> extraHeaders, PloidyInfo ploidy, int qualityThreshold,
                                             bool isPedigreeInfoSupplied = true, int?denovoQualityThreshold = null)
 {
     using (var vcfWriter = new BgzipOrStreamWriter(outVcfPath))
     {
         // Header first: it yields the genome metadata that drives record ordering below.
         var firstSampleSegments = segments.First();
         var meanDiploidCoverage = GetMean(diploidCoverage);
         var genomeMetadata      = WriteVcfHeader(firstSampleSegments, meanDiploidCoverage, wholeGenomeFastaDirectory, sampleNames,
                                                  extraHeaders, qualityThreshold, vcfWriter, denovoQualityThreshold);

         WriteVariants(segments, ploidy, genomeMetadata, vcfWriter, isPedigreeInfoSupplied, denovoQualityThreshold);
     }
 }
示例#14
0
        /// <summary>
        /// Writes the variant records for all samples to <paramref name="writer"/>. Segment lists are walked
        /// in parallel by index (the first sample's list defines the index range), grouped by chromosome in
        /// the order of <paramref name="genome"/>'s sequences.
        /// </summary>
        /// <param name="segments">One segment list per sample; every list is indexed with the same segment index.</param>
        /// <param name="ploidy">Optional reference ploidy; when null a reference copy number of 2 is used for every sample.</param>
        /// <param name="genome">Genome metadata whose sequences determine which chromosomes are written and in what order.</param>
        /// <param name="writer">Destination for the records.</param>
        /// <param name="isPedigreeInfoSupplied">When false and more than one sample is present, the first sample's segment filter is set to "PASS" if any sample's segment at that index has filter "PASS".</param>
        /// <param name="denovoQualityThreshold">Forwarded to WriteInfoField.</param>
        private static void WriteVariants(IReadOnlyCollection <List <CanvasSegment> > segments, PloidyInfo ploidy, GenomeMetadata genome,
                                          BgzipOrStreamWriter writer, bool isPedigreeInfoSupplied = true, int?denovoQualityThreshold = null)
        {
            bool isMultisample = segments.Count > 1;

            foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
            {
                // Fetch once per chromosome instead of re-enumerating the collection on every loop test.
                var firstSampleSegments = segments.First();

                for (int segmentIndex = 0; segmentIndex < firstSampleSegments.Count; segmentIndex++)
                {
                    var firstSampleSegment = firstSampleSegments[segmentIndex];

                    // Any(predicate) is required here: Select(predicate).Any() only tests for a non-empty
                    // collection and would mark every record PASS regardless of the per-sample filters.
                    if (!isPedigreeInfoSupplied && isMultisample && segments.Any(sample => sample[segmentIndex].Filter == "PASS"))
                    {
                        firstSampleSegment.Filter = "PASS";
                    }
                    if (!firstSampleSegment.Chr.Equals(chromosome.Name, StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    var currentSegments      = segments.Select(sample => sample[segmentIndex]).ToList();
                    var referenceCopyNumbers = currentSegments.Select(segment => ploidy?.GetReferenceCopyNumber(segment) ?? 2).ToList();
                    var cnvTypes             = currentSegments.Select((segment, sampleIndex) => segment.GetCnvType(referenceCopyNumbers[sampleIndex])).ToList();

                    // Collapse the per-sample types into one record-level type: a single non-reference
                    // type shared by all non-reference samples wins, otherwise the record is complex.
                    CnvType cnvType;
                    if (cnvTypes.TrueForAll(x => x == CnvType.Reference))
                    {
                        cnvType = CnvType.Reference;
                    }
                    else if (cnvTypes.TrueForAll(x => x == CnvType.Reference || x == CnvType.Loss))
                    {
                        cnvType = CnvType.Loss;
                    }
                    else if (cnvTypes.TrueForAll(x => x == CnvType.Reference || x == CnvType.Gain))
                    {
                        cnvType = CnvType.Gain;
                    }
                    else if (cnvTypes.TrueForAll(x => x == CnvType.Reference || x == CnvType.LossOfHeterozygosity))
                    {
                        cnvType = CnvType.LossOfHeterozygosity;
                    }
                    else
                    {
                        cnvType = CnvType.ComplexCnv;
                    }

                    WriteInfoField(writer, firstSampleSegment, cnvType, denovoQualityThreshold, isMultisample: isMultisample);
                    //  FORMAT field
                    if (isMultisample)
                    {
                        WriteFormatField(writer, currentSegments);
                    }
                    else
                    {
                        WriteSingleSampleInfo(writer, firstSampleSegment);
                    }
                }
            }
        }
示例#15
0
        /// <summary>
        /// Outputs the copy number calls to a VCF file: a fixed header followed by one record per segment,
        /// grouped by chromosome in the order of the sequences deserialized from GenomeSize.xml.
        /// </summary>
        /// <param name="outVcfPath">File to write to.</param>
        /// <param name="segments">List of segments to write out; each segment's own Filter value is written to the FILTER column.</param>
        /// <param name="wholeGenomeFastaDirectory">Directory containing GenomeSize.xml (read for the contig list); the path of genome.fa under it is written to the ##reference header.</param>
        /// <param name="sampleName">Name written as the sample column of the #CHROM header line.</param>
        /// <param name="extraHeaders">Additional header lines written verbatim after ##reference; may be null.</param>
        /// <param name="ploidy">Optional reference ploidy; when null a reference copy number of 2 is used.</param>
        /// <param name="qualityThreshold">Used only to name and describe the quality FILTER header line (q&lt;threshold&gt;).</param>
        public static void WriteSegments(string outVcfPath, List<CanvasSegment> segments, string wholeGenomeFastaDirectory, string sampleName,
            List<string> extraHeaders, PloidyInfo ploidy, int qualityThreshold = 10)
        {
            using (BgzipOrStreamWriter writer = new BgzipOrStreamWriter(outVcfPath))
            {
                // Write the VCF header:
                writer.WriteLine("##fileformat=VCFv4.1");
                writer.WriteLine($"##source={CanvasVersionInfo.NameString} {CanvasVersionInfo.VersionString}");
                writer.WriteLine($"##reference={Path.Combine(wholeGenomeFastaDirectory, "genome.fa")}");

                if (extraHeaders != null)
                {
                    foreach (string header in extraHeaders)
                    {
                        writer.WriteLine(header);
                    }
                }
                GenomeMetadata genome = new GenomeMetadata();
                genome.Deserialize(Path.Combine(wholeGenomeFastaDirectory, "GenomeSize.xml"));
                foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
                {
                    writer.WriteLine($"##contig=<ID={chromosome.Name},length={chromosome.Length}>");
                }
                string qualityFilter = $"q{qualityThreshold}";
                writer.WriteLine("##ALT=<ID=CNV,Description=\"Copy number variable region\">");
                writer.WriteLine($"##FILTER=<ID={qualityFilter},Description=\"Quality below {qualityThreshold}\">");
                writer.WriteLine("##FILTER=<ID=L10kb,Description=\"Length shorter than 10kb\">");
                writer.WriteLine("##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">");
                writer.WriteLine("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
                writer.WriteLine("##INFO=<ID=CNVLEN,Number=1,Type=Integer,Description=\"Number of reference positions spanned by this CNV\">");
                writer.WriteLine("##FORMAT=<ID=RC,Number=1,Type=Float,Description=\"Mean counts per bin in the region\">");
                writer.WriteLine("##FORMAT=<ID=BC,Number=1,Type=Float,Description=\"Number of bins in the region\">");
                writer.WriteLine("##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number genotype for imprecise events\">");
                writer.WriteLine("##FORMAT=<ID=MCC,Number=1,Type=Integer,Description=\"Major chromosome count (equal to copy number for LOH regions)\">");
                writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sampleName);

                SanityCheckChromosomeNames(genome, segments);

                // Iterate chromosomes in the outer loop so records come out in genome order,
                // whatever order the segment list is in.
                foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
                {
                    foreach (CanvasSegment segment in segments)
                    {
                        if (!segment.Chr.Equals(chromosome.Name, StringComparison.OrdinalIgnoreCase)) continue;

                        int referenceCopyNumber = ploidy?.GetReferenceCopyNumber(segment) ?? 2;
                        CnvType cnvType = segment.GetCnvType(referenceCopyNumber);
                        bool isVariant = cnvType != CnvType.Reference;

                        // From vcf 4.1 spec:
                        //     If any of the ALT alleles is a symbolic allele (an angle-bracketed ID String "<ID>") then the padding base is required and POS denotes the
                        //     coordinate of the base preceding the polymorphism.
                        string alternateAllele = cnvType.ToAltId();
                        bool isSymbolicAllele = alternateAllele.StartsWith("<", StringComparison.Ordinal) && alternateAllele.EndsWith(">", StringComparison.Ordinal);
                        int position = isSymbolicAllele ? segment.Begin : segment.Begin + 1;
                        writer.Write($"{segment.Chr}\t{position}\tCanvas:{cnvType.ToVcfId()}:{segment.Chr}:{segment.Begin + 1}-{segment.End}\t");

                        // The string is already fully interpolated; it must not be handed to the
                        // composite-format overload, or a '{' / '}' in any value would be re-parsed.
                        writer.Write($"N\t{alternateAllele}\t{segment.QScore}\t{segment.Filter}\t");

                        //  INFO field
                        if (isVariant)
                            writer.Write($"SVTYPE={cnvType.ToSvType()};");
                        writer.Write($"END={segment.End}");
                        if (isVariant)
                            writer.Write($";CNVLEN={segment.End - segment.Begin}");

                        //  FORMAT field; MCC is appended to both keys and values only when the segment has one.
                        bool hasMajorChromosomeCount = segment.MajorChromosomeCount.HasValue;
                        writer.Write("\tRC:BC:CN");
                        if (hasMajorChromosomeCount)
                        {
                            writer.Write(":MCC");
                        }
                        writer.Write("\t{0}:{1}:{2}", Math.Round(segment.MeanCount, 0, MidpointRounding.AwayFromZero), segment.BinCount, segment.CopyNumber);
                        if (hasMajorChromosomeCount)
                        {
                            writer.Write(":{0}", segment.MajorChromosomeCount);
                        }
                        writer.WriteLine();
                    }
                }
            }
        }
示例#16
0
        /// <summary>
        /// Outputs the copy number calls to a VCF file: a fixed header followed by one record per segment,
        /// grouped by chromosome in the order of the sequences deserialized from GenomeSize.xml.
        /// </summary>
        /// <param name="outVcfPath">File to write to.</param>
        /// <param name="segments">List of segments to write out; each segment's own Filter value is written to the FILTER column.</param>
        /// <param name="wholeGenomeFastaDirectory">Directory containing GenomeSize.xml (read for the contig list); the path of genome.fa under it is written to the ##reference header.</param>
        /// <param name="sampleName">Name written as the sample column of the #CHROM header line.</param>
        /// <param name="extraHeaders">Additional header lines written verbatim after ##reference; may be null.</param>
        /// <param name="ploidy">Optional reference ploidy; when null a reference copy number of 2 is used.</param>
        /// <param name="qualityThreshold">Used only to name and describe the quality FILTER header line (q&lt;threshold&gt;).</param>
        public static void WriteSegments(string outVcfPath, List <CanvasSegment> segments, string wholeGenomeFastaDirectory, string sampleName,
                                         List <string> extraHeaders, PloidyInfo ploidy, int qualityThreshold = 10)
        {
            using (BgzipOrStreamWriter writer = new BgzipOrStreamWriter(outVcfPath))
            {
                // Write the VCF header:
                writer.WriteLine("##fileformat=VCFv4.1");
                writer.WriteLine($"##source={CanvasVersionInfo.NameString} {CanvasVersionInfo.VersionString}");
                writer.WriteLine($"##reference={Path.Combine(wholeGenomeFastaDirectory, "genome.fa")}");

                if (extraHeaders != null)
                {
                    foreach (string header in extraHeaders)
                    {
                        writer.WriteLine(header);
                    }
                }
                GenomeMetadata genome = new GenomeMetadata();
                genome.Deserialize(Path.Combine(wholeGenomeFastaDirectory, "GenomeSize.xml"));
                foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
                {
                    writer.WriteLine($"##contig=<ID={chromosome.Name},length={chromosome.Length}>");
                }
                string qualityFilter = $"q{qualityThreshold}";
                writer.WriteLine("##ALT=<ID=CNV,Description=\"Copy number variable region\">");
                writer.WriteLine($"##FILTER=<ID={qualityFilter},Description=\"Quality below {qualityThreshold}\">");
                writer.WriteLine("##FILTER=<ID=L10kb,Description=\"Length shorter than 10kb\">");
                writer.WriteLine("##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">");
                writer.WriteLine("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
                writer.WriteLine("##INFO=<ID=CNVLEN,Number=1,Type=Integer,Description=\"Number of reference positions spanned by this CNV\">");
                writer.WriteLine("##FORMAT=<ID=RC,Number=1,Type=Float,Description=\"Mean counts per bin in the region\">");
                writer.WriteLine("##FORMAT=<ID=BC,Number=1,Type=Float,Description=\"Number of bins in the region\">");
                writer.WriteLine("##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number genotype for imprecise events\">");
                writer.WriteLine("##FORMAT=<ID=MCC,Number=1,Type=Integer,Description=\"Major chromosome count (equal to copy number for LOH regions)\">");
                writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sampleName);

                SanityCheckChromosomeNames(genome, segments);

                // Iterate chromosomes in the outer loop so records come out in genome order,
                // whatever order the segment list is in.
                foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
                {
                    foreach (CanvasSegment segment in segments)
                    {
                        if (!segment.Chr.Equals(chromosome.Name, StringComparison.OrdinalIgnoreCase))
                        {
                            continue;
                        }

                        int     referenceCopyNumber = ploidy?.GetReferenceCopyNumber(segment) ?? 2;
                        CnvType cnvType             = segment.GetCnvType(referenceCopyNumber);
                        bool    isVariant           = cnvType != CnvType.Reference;

                        // From vcf 4.1 spec:
                        //     If any of the ALT alleles is a symbolic allele (an angle-bracketed ID String "<ID>") then the padding base is required and POS denotes the
                        //     coordinate of the base preceding the polymorphism.
                        string alternateAllele  = cnvType.ToAltId();
                        bool   isSymbolicAllele = alternateAllele.StartsWith("<", StringComparison.Ordinal) && alternateAllele.EndsWith(">", StringComparison.Ordinal);
                        int    position         = isSymbolicAllele ? segment.Begin : segment.Begin + 1;
                        writer.Write($"{segment.Chr}\t{position}\tCanvas:{cnvType.ToVcfId()}:{segment.Chr}:{segment.Begin + 1}-{segment.End}\t");

                        // The string is already fully interpolated; it must not be handed to the
                        // composite-format overload, or a '{' / '}' in any value would be re-parsed.
                        writer.Write($"N\t{alternateAllele}\t{segment.QScore}\t{segment.Filter}\t");

                        //  INFO field
                        if (isVariant)
                        {
                            writer.Write($"SVTYPE={cnvType.ToSvType()};");
                        }
                        writer.Write($"END={segment.End}");
                        if (isVariant)
                        {
                            writer.Write($";CNVLEN={segment.End - segment.Begin}");
                        }

                        //  FORMAT field; MCC is appended to both keys and values only when the segment has one.
                        bool hasMajorChromosomeCount = segment.MajorChromosomeCount.HasValue;
                        writer.Write("\tRC:BC:CN");
                        if (hasMajorChromosomeCount)
                        {
                            writer.Write(":MCC");
                        }
                        writer.Write("\t{0}:{1}:{2}", Math.Round(segment.MeanCount, 0, MidpointRounding.AwayFromZero), segment.BinCount, segment.CopyNumber);
                        if (hasMajorChromosomeCount)
                        {
                            writer.Write(":{0}", segment.MajorChromosomeCount);
                        }
                        writer.WriteLine();
                    }
                }
            }
        }