Beispiel #1
0
 /// <summary>
 /// Opens a <see cref="BgzipOrStreamWriter"/> on <paramref name="outputPath"/> and hands it to the
 /// writer-based <c>WriteRegionBed</c> overload. Each region spans both probes and the target interval.
 /// The BED format uses a 0-based inclusive start and a 1-based inclusive end,
 /// which is the same as a 0-based start (inclusive) with a 0-based end (exclusive).
 /// </summary>
 /// <param name="manifest">Manifest whose regions are written.</param>
 /// <param name="outputPath">Path handed to the <see cref="BgzipOrStreamWriter"/> constructor.</param>
 /// <param name="genome">Genome metadata forwarded unchanged to the writer-based overload.</param>
 public static void WriteRegionBed(NexteraManifest manifest, string outputPath, GenomeMetadata genome)
 {
     // The using statement disposes the writer even when the delegate call throws.
     using (var bedWriter = new BgzipOrStreamWriter(outputPath))
     {
         WriteRegionBed(manifest, bedWriter, genome);
     }
 }
Beispiel #2
0
        /// <summary>
        /// Writes one tab-separated line per manifest region to <paramref name="writer"/>: reference name,
        /// target-interval begin minus one, target-interval end and region name.
        /// When <paramref name="genome"/> is not null, a copy of the region list is sorted with
        /// <c>ManifestRegion.CompareTo</c> using each sequence's position in <c>genome.Sequences</c>;
        /// otherwise the regions are written in the order stored in the manifest.
        /// </summary>
        public static void WriteTargetBed(NexteraManifest manifest, BgzipOrStreamWriter writer, GenomeMetadata genome)
        {
            List<NexteraManifest.ManifestRegion> orderedRegions = manifest.Regions;
            if (genome != null)
            {
                // Map every sequence name to its position in the genome; a repeated name keeps the last position.
                var indexByChromosome = new Dictionary<string, int>();
                int position = 0;
                foreach (GenomeMetadata.SequenceMetadata sequence in genome.Sequences)
                {
                    indexByChromosome[sequence.Name] = position;
                    position++;
                }

                // Sort a copy so the manifest's own list is left untouched.
                orderedRegions = new List<NexteraManifest.ManifestRegion>(manifest.Regions);
                orderedRegions.Sort((left, right) => left.CompareTo(right, indexByChromosome));
            }

            foreach (NexteraManifest.ManifestRegion region in orderedRegions)
            {
                TargetInterval target = region.GetTargetInterval();
                var referenceName = target.ReferenceName;
                string start = (target.Begin - 1).ToString(CultureInfo.InvariantCulture);
                string end = target.End.ToString(CultureInfo.InvariantCulture);

                // The region name is needed for PUMA metrics outputs to generate the .coverage.csv file.
                writer.WriteLine(string.Join("\t", referenceName, start, end, region.Name));
            }
        }
Beispiel #3
0
        /// <summary>
        /// Creates a <see cref="Vcf"/> for the file named <c>PloidyVcfName</c> inside
        /// <paramref name="sampleSandbox"/> and passes it, together with the ploidy information and
        /// genome metadata, to <c>_ploidyFixer.WritePloidyVcfFile</c>.
        /// </summary>
        /// <returns>The <see cref="Vcf"/> that was handed to the ploidy fixer.</returns>
        public Vcf CreatePloidyVcf(SampleSet<SexPloidyInfo> ploidyInfos, GenomeMetadata genomeMetadata, IDirectoryLocation sampleSandbox)
        {
            var vcfLocation = sampleSandbox.GetFileLocation(PloidyVcfName);
            var result = new Vcf(vcfLocation);

            _ploidyFixer.WritePloidyVcfFile(result, ploidyInfos, genomeMetadata);

            return result;
        }
 /// <summary>
 /// Stores the logger, work doer, command manager and genome metadata in the corresponding private fields.
 /// </summary>
 public CoverageVisualizationWriterFactory(ILogger logger, IWorkDoer workDoer, ICommandManager commandManager, GenomeMetadata genome)
 {
     (_logger, _workDoer, _commandManager, _genome) = (logger, workDoer, commandManager, genome);
 }
 /// <summary>
 /// Stores the logger, bedGraph writer, bedGraph-to-bigWig converter and genome metadata in the
 /// corresponding private fields.
 /// </summary>
 public CoverageBigWigWriter(ILogger logger, CoverageBedGraphWriter writer, IBedGraphToBigWigConverter converter, GenomeMetadata genome)
 {
     (_logger, _writer, _converter, _genome) = (logger, writer, converter, genome);
 }
        /// <summary>
        /// Round-trips genome metadata through XML: deserializes a scratch copy of the genome XML,
        /// serializes it to a second file, deserializes that file again, and checks both instances
        /// with <c>TestGenomeMetadata</c>.
        /// </summary>
        public void GenomeIOTest()
        {
            var loaded = new GenomeMetadata();
            string roundTripPath = Path.Combine(TestPaths.LocalScratchDirectory, "TestSavingGenome.xml");
            string sourceCopyPath = Path.Combine(TestPaths.LocalScratchDirectory, "GenomeXml2.xml");

            // Remove leftovers from earlier runs; File.Copy without overwrite fails on an existing target.
            foreach (string stalePath in new[] { roundTripPath, sourceCopyPath })
            {
                if (File.Exists(stalePath))
                {
                    File.Delete(stalePath);
                }
            }

            File.Copy(_genomeXML, sourceCopyPath);

            // First pass: read the copied XML.
            loaded.Deserialize(sourceCopyPath);
            TestGenomeMetadata(loaded);

            // Second pass: write it back out and read the freshly written file.
            loaded.Serialize(roundTripPath);
            var reloaded = new GenomeMetadata();
            reloaded.Deserialize(roundTripPath);
            TestGenomeMetadata(reloaded);
        }
Beispiel #7
0
        /// <summary>
        /// Captures the inputs of a Canvas call set and loads the genome metadata from the
        /// <c>GenomeSize.xml</c> file located in <paramref name="wholeGenomeFastaFolder"/>.
        /// </summary>
        public CanvasCallset(
            IFileLocation bam,
            string sampleName,
            IDirectoryLocation wholeGenomeFastaFolder,
            IDirectoryLocation outputFolder,
            IFileLocation kmerFasta,
            IFileLocation filterBed,
            IFileLocation ploidyBed,
            IFileLocation normalVcfPath,
            bool isDbSnpVcf,
            IEnumerable<IFileLocation> normalBamPaths,
            NexteraManifest manifest,
            IFileLocation somaticVcfPath,
            IFileLocation outputVcfPath)
        {
            // Sample under analysis.
            Bam = new Bam(bam);
            SampleName = sampleName;
            Manifest = manifest;

            // Reference and annotation inputs.
            WholeGenomeFastaFolder = wholeGenomeFastaFolder;
            KmerFasta = kmerFasta;
            FilterBed = filterBed;
            PloidyBed = ploidyBed;

            // Variant call inputs.
            NormalVcfPath = normalVcfPath;
            IsDbSnpVcf = isDbSnpVcf;
            SomaticVcfPath = somaticVcfPath;

            // Output locations.
            OutputFolder = outputFolder;
            OutputVcfPath = outputVcfPath;

            // Select is lazy: a Bam wrapper is created for each path every time the sequence is enumerated.
            NormalBamPaths = normalBamPaths.Select(normalBam => new Bam(normalBam));

            var genomeSizeFile = WholeGenomeFastaFolder.GetFileLocation("GenomeSize.xml");
            GenomeMetadata = new GenomeMetadata();
            GenomeMetadata.Deserialize(genomeSizeFile.FullName);
        }
        /// <summary>
        /// Writes a ploidy bed file based on the sex chromosome karyotype found in the vcf header.
        /// Only the normal XX or XY ploidy is ever written so that Canvas can properly classify any abnormalities as variant:
        /// a karyotype containing a 'y' (in either case) selects the normal male ploidy, anything else the normal female ploidy.
        /// </summary>
        /// <returns>
        /// The <c>ploidy.bed.gz</c> location inside <paramref name="sampleSandbox"/>, or <c>null</c> when no
        /// sex chromosome karyotype was found in the header (a warning is logged in that case).
        /// </returns>
        public IFileLocation CreateGermlinePloidyBed(Vcf vcf, GenomeMetadata genomeMetadata, IDirectoryLocation sampleSandbox)
        {
            string sexKaryotype = PloidyCorrector.GetSexChromosomeKaryotypeFromVcfHeader(vcf.VcfFile.FullName);
            if (sexKaryotype == null)
            {
                _logger.Warn($"Sex chromosome ploidy not found in {vcf.VcfFile} header. No ploidy will be provided to Canvas.");
                return null;
            }

            _logger.Info($"Found sex chromosome ploidy {PloidyCorrector.PrintPloidy(sexKaryotype)} in {vcf.VcfFile}");

            var ploidyInfo = new SamplePloidyInfo();
            IFileLocation ploidyBed = sampleSandbox.GetFileLocation("ploidy.bed.gz");

            // Any Y in the karyotype means the male reference ploidy is written.
            bool hasYChromosome = sexKaryotype.ToLower().Contains("y");
            ploidyInfo.ProvidedPloidy = hasYChromosome ? SexPloidyInfo.NormalMale : SexPloidyInfo.NormalFemale;
            _logger.Info(hasYChromosome
                ? $"Creating male ploidy bed file at {ploidyBed}."
                : $"Creating female ploidy bed file at {ploidyBed}.");

            string headerLine = $"{PloidyCorrector.ReferenceSexChromosomeKaryotype}={PloidyCorrector.PrettyPrintPloidy(ploidyInfo.ProvidedPloidyX.Value, ploidyInfo.ProvidedPloidyY.Value)}";
            var parRegions = _ploidyFixer.GetParRegions(genomeMetadata);

            // The last argument is a predicate that always returns true.
            _ploidyFixer.WritePloidyBedFile(ploidyInfo, genomeMetadata, parRegions, ploidyBed.FullName, headerLine, ploidy => true);

            return ploidyBed;
        }
Beispiel #9
0
        /// <summary>
        /// Records every call-set input on the matching property (wrapping the BAM locations in
        /// <c>Bam</c> objects) and then deserializes <c>GenomeSize.xml</c> from the whole-genome
        /// FASTA folder into <c>GenomeMetadata</c>.
        /// </summary>
        public CanvasCallset(
            IFileLocation bam,
            string sampleName,
            IDirectoryLocation wholeGenomeFastaFolder,
            IDirectoryLocation outputFolder,
            IFileLocation kmerFasta,
            IFileLocation filterBed,
            IFileLocation ploidyBed,
            IFileLocation normalVcfPath,
            bool isDbSnpVcf,
            IEnumerable<IFileLocation> normalBamPaths,
            NexteraManifest manifest,
            IFileLocation somaticVcfPath,
            IFileLocation outputVcfPath)
        {
            // Assignments follow the parameter order.
            Bam = new Bam(bam);
            SampleName = sampleName;
            WholeGenomeFastaFolder = wholeGenomeFastaFolder;
            OutputFolder = outputFolder;
            KmerFasta = kmerFasta;
            FilterBed = filterBed;
            PloidyBed = ploidyBed;
            NormalVcfPath = normalVcfPath;
            IsDbSnpVcf = isDbSnpVcf;
            // Deferred projection: the Bam objects are created when the sequence is enumerated.
            NormalBamPaths = normalBamPaths.Select(path => new Bam(path));
            Manifest = manifest;
            SomaticVcfPath = somaticVcfPath;
            OutputVcfPath = outputVcfPath;

            // Genome metadata lives next to the FASTA files.
            var genomeSizeLocation = WholeGenomeFastaFolder.GetFileLocation("GenomeSize.xml");
            GenomeMetadata = new GenomeMetadata();
            GenomeMetadata.Deserialize(genomeSizeLocation.FullName);
        }
Beispiel #10
0
 /// <summary>
 /// Stores the tumor and normal BAMs, the normal and somatic VCFs and the genome metadata on the
 /// properties of the same names.
 /// </summary>
 public CanvasTumorNormalWgsInput(Bam tumorBam, Bam normalBam, Vcf normalVcf, Vcf somaticVcf, GenomeMetadata genomeMetadata)
 {
     (TumorBam, NormalBam) = (tumorBam, normalBam);
     (NormalVcf, SomaticVcf) = (normalVcf, somaticVcf);
     GenomeMetadata = genomeMetadata;
 }
Beispiel #11
0
        /// <summary>
        /// Single-sample convenience overload: wraps <paramref name="sexPloidyInfo"/> in a one-entry
        /// <c>SampleSet</c> keyed by a <c>SampleInfo</c> built from <paramref name="sampleId"/>, then
        /// delegates to the sample-set overload.
        /// </summary>
        public Vcf CreatePloidyVcf(string sampleId, SexPloidyInfo sexPloidyInfo, GenomeMetadata genomeMetadata, IDirectoryLocation sampleSandbox)
        {
            // The second SampleInfo argument is the fixed literal "SampleName".
            var ploidyBySample = new Dictionary<SampleInfo, SexPloidyInfo>();
            ploidyBySample.Add(new SampleInfo(sampleId, "SampleName"), sexPloidyInfo);

            var singleSampleSet = new SampleSet<SexPloidyInfo>(ploidyBySample);
            return CreatePloidyVcf(singleSampleSet, genomeMetadata, sampleSandbox);
        }
 /// <summary>
 /// Stores the tumor and normal BAMs, the normal and somatic VCFs, the genome metadata and the
 /// sex ploidy on the properties of the same names.
 /// </summary>
 public CanvasTumorNormalWgsInput(Bam tumorBam, Bam normalBam, Vcf normalVcf, Vcf somaticVcf, GenomeMetadata genomeMetadata, SexPloidyInfo sexPloidy)
 {
     (TumorBam, NormalBam) = (tumorBam, normalBam);
     (NormalVcf, SomaticVcf) = (normalVcf, somaticVcf);
     (GenomeMetadata, SexPloidy) = (genomeMetadata, sexPloidy);
 }
Beispiel #13
0
        /// <summary>
        /// Returns the dbSNP VCF held in <c>_dbSnpVcf</c> when that field is set; otherwise looks up
        /// <c>dbsnp.vcf</c> through <c>GetCanvasAnnotationFile</c> for <paramref name="genome"/>.
        /// </summary>
        public IFileLocation GetDbSnpVcf(GenomeMetadata genome)
        {
            return _dbSnpVcf ?? GetCanvasAnnotationFile(genome, "dbsnp.vcf");
        }
        /// <summary>
        /// Checks the folder state reported for the test genome folder: <c>Ready</c> whenever the third
        /// argument is false and <c>RequireWritableFolder</c> whenever it is true, for both values of the
        /// second argument. Then deserializes the genome XML and validates it.
        /// </summary>
        public void HappyPath()
        {
            bool[] secondArgumentValues = { false, true };

            foreach (bool secondArgument in secondArgumentValues)
            {
                Assert.Equal(
                    GenomeMetadata.GenomeFolderState.Ready,
                    GenomeMetadata.CheckReferenceGenomeFolderState(_genomeFolder, secondArgument, false));
            }

            foreach (bool secondArgument in secondArgumentValues)
            {
                Assert.Equal(
                    GenomeMetadata.GenomeFolderState.RequireWritableFolder,
                    GenomeMetadata.CheckReferenceGenomeFolderState(_genomeFolder, secondArgument, true));
            }

            var genome = new GenomeMetadata();
            genome.Deserialize(_genomeXML);
            TestGenomeMetadata(genome);
        }
Beispiel #15
0
        /// <summary>
        /// Writes the VCF meta-information lines and the column header line to <paramref name="writer"/>,
        /// then checks that every segment lies on a chromosome listed in the genome metadata.
        /// </summary>
        /// <param name="segments">Segments passed to the ploidy/coverage headers and to the chromosome-name check.</param>
        /// <param name="diploidCoverage">Forwarded to <c>AddPloidyAndCoverageHeaders</c>.</param>
        /// <param name="wholeGenomeFastaDirectory">Folder combined with <c>genome.fa</c> for the reference line and with <c>GenomeSize.xml</c> for the metadata.</param>
        /// <param name="sampleNames">Sample column names, appended tab-separated to the column header line.</param>
        /// <param name="extraHeaders">Additional header lines written verbatim; may be null.</param>
        /// <param name="qualityThreshold">Threshold used to name and describe the quality FILTER line.</param>
        /// <param name="writer">Destination of the header.</param>
        /// <param name="denovoQualityThreshold">When set, the de novo quality header lines are written as well.</param>
        /// <returns>The genome metadata deserialized from <c>GenomeSize.xml</c>.</returns>
        private static GenomeMetadata WriteVcfHeader(List<CanvasSegment> segments, double? diploidCoverage,
                                                     string wholeGenomeFastaDirectory, List<string> sampleNames, List<string> extraHeaders, int qualityThreshold,
                                                     BgzipOrStreamWriter writer, int? denovoQualityThreshold = null)
        {
            // File-level lines: format version, producing tool and reference FASTA.
            writer.WriteLine("##fileformat=VCFv4.1");
            writer.WriteLine($"##source={CanvasVersionInfo.NameString} {CanvasVersionInfo.VersionString}");
            writer.WriteLine($"##reference={Path.Combine(wholeGenomeFastaDirectory, "genome.fa")}");
            AddPloidyAndCoverageHeaders(writer, segments, diploidCoverage);

            if (extraHeaders != null)
            {
                foreach (string header in extraHeaders)
                {
                    writer.WriteLine(header);
                }
            }

            // One contig line per sequence in GenomeSize.xml.
            var genome = new GenomeMetadata();
            genome.Deserialize(Path.Combine(wholeGenomeFastaDirectory, "GenomeSize.xml"));
            foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
            {
                writer.WriteLine($"##contig=<ID={chromosome.Name},length={chromosome.Length}>");
            }

            string qualityFilter = $"q{qualityThreshold}";
            string[] altFilterAndInfoLines =
            {
                "##ALT=<ID=CNV,Description=\"Copy number variable region\">",
                $"##FILTER=<ID={qualityFilter},Description=\"Quality below {qualityThreshold}\">",
                "##FILTER=<ID=L10kb,Description=\"Length shorter than 10kb\">",
                "##INFO=<ID=CIEND,Number=2,Type=Integer,Description=\"Confidence interval around END for imprecise variants\">",
                "##INFO=<ID=CIPOS,Number=2,Type=Integer,Description=\"Confidence interval around POS for imprecise variants\">",
                "##INFO=<ID=CNVLEN,Number=1,Type=Integer,Description=\"Number of reference positions spanned by this CNV\">",
                "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">",
                "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">",
                "##INFO=<ID=SUBCLONAL,Number=0,Type=Flag,Description=\"Subclonal variant\">",
            };
            foreach (string line in altFilterAndInfoLines)
            {
                writer.WriteLine(line);
            }

            bool reportDenovoQuality = denovoQualityThreshold.HasValue;
            if (reportDenovoQuality)
            {
                // NOTE(review): unlike the INFO lines above, this one carries no Number/Type attributes — confirm this is intended.
                string denovoQualityFilter = $"dq{denovoQualityThreshold}";
                writer.WriteLine($"##INFO=<ID={denovoQualityFilter},Description=\"De novo quality score above {denovoQualityThreshold.Value}\">");
            }

            string[] formatLines =
            {
                "##FORMAT=<ID=RC,Number=1,Type=Float,Description=\"Mean counts per bin in the region\">",
                "##FORMAT=<ID=BC,Number=1,Type=Float,Description=\"Number of bins in the region\">",
                "##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number genotype for imprecise events\">",
                "##FORMAT=<ID=MCC,Number=1,Type=Integer,Description=\"Major chromosome count (equal to copy number for LOH regions)\">",
            };
            foreach (string line in formatLines)
            {
                writer.WriteLine(line);
            }

            if (reportDenovoQuality)
            {
                writer.WriteLine("##FORMAT=<ID=DQ,Number=1,Type=Float,Description=\"De novo variants Phred-scaled quality score\">");
                writer.WriteLine("##FORMAT=<ID=QS,Number=1,Type=Float,Description=\"Phred-scaled quality score\">");
            }

            // Column header: the fixed VCF columns followed by one column per sample.
            string names = string.Join("\t", sampleNames.ToArray());
            writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + names);

            SanityCheckChromosomeNames(genome, segments);
            return genome;
        }
Beispiel #16
0
        /// <summary>
        /// Records the analysis folders and annotation files, then deserializes <c>GenomeSize.xml</c>
        /// from <paramref name="wholeGenomeFastaFolder"/> into <c>GenomeMetadata</c>.
        /// </summary>
        public AnalysisDetails(IDirectoryLocation outputFolder, IDirectoryLocation wholeGenomeFastaFolder,
                               IFileLocation kmerFasta, IFileLocation filterBed, IFileLocation ploidyVcf, IFileLocation commonCnvsBed)
        {
            // Folders.
            OutputFolder = outputFolder;
            WholeGenomeFastaFolder = wholeGenomeFastaFolder;

            // Annotation and input files.
            KmerFasta = kmerFasta;
            FilterBed = filterBed;
            PloidyVcf = ploidyVcf;
            CommonCnvsBed = commonCnvsBed;

            // Genome metadata is read from the FASTA folder.
            var genomeSizeLocation = WholeGenomeFastaFolder.GetFileLocation("GenomeSize.xml");
            GenomeMetadata = new GenomeMetadata();
            GenomeMetadata.Deserialize(genomeSizeLocation);
        }
Beispiel #17
0
        /// <summary>
        /// Integrity check, to ensure that our reference FASTA file is in sync with our inputs:
        /// throws on the first segment whose chromosome name (compared case-insensitively) is not
        /// among the contigs of <paramref name="genome"/>.
        /// </summary>
        private static void SanityCheckChromosomeNames(GenomeMetadata genome, IEnumerable<CanvasSegment> segments)
        {
            // Lower-cased contig names make the membership test case-insensitive.
            var knownChromosomes = new HashSet<string>(
                genome.Contigs().Select(contig => contig.Name.ToLowerInvariant()));

            foreach (CanvasSegment segment in segments)
            {
                if (knownChromosomes.Contains(segment.Chr.ToLowerInvariant()))
                {
                    continue;
                }

                throw new Exception($"Integrity check error: Segment found at unknown chromosome '{segment.Chr}'");
            }
        }
Beispiel #18
0
 /// <summary>
 /// Stores the tumor and normal BAMs, the normal and somatic VCFs, the genome metadata and the
 /// Nextera manifest on the properties of the same names.
 /// </summary>
 public CanvasTumorNormalEnrichmentInput(
     Bam tumorBam,
     Bam normalBam,
     Vcf normalVcf,
     Vcf somaticVcf,
     GenomeMetadata genomeMetadata,
     NexteraManifest nexteraManifest)
 {
     (TumorBam, NormalBam) = (tumorBam, normalBam);
     (NormalVcf, SomaticVcf) = (normalVcf, somaticVcf);
     (GenomeMetadata, NexteraManifest) = (genomeMetadata, nexteraManifest);
 }
        /// <summary>
        /// Asks the ploidy fixer to generate <c>ploidy.bed.gz</c> in <paramref name="sampleSandbox"/> from the vcf,
        /// using the FASTA path of the first sequence in <paramref name="genomeMetadata"/>.
        /// </summary>
        /// <returns>
        /// The ploidy bed location when the fixer reports success; otherwise <c>null</c>, after logging a
        /// warning that no sex chromosome ploidy was found in the vcf header.
        /// </returns>
        public IFileLocation CreatePloidyBed(Vcf vcf, GenomeMetadata genomeMetadata, IDirectoryLocation sampleSandbox)
        {
            IFileLocation ploidyBed = sampleSandbox.GetFileLocation("ploidy.bed.gz");
            string fastaPath = genomeMetadata.Sequences.First().FastaPath;

            bool generated = _ploidyFixer.GeneratePloidyBedFileFromVcf(
                genomeMetadata,
                fastaPath,
                vcf.VcfFile.FullName,
                ploidyBed.FullName,
                sampleSandbox.FullName,
                _logger,
                _workManager);

            if (!generated)
            {
                _logger.Warn($"Sex chromosome ploidy not found in {vcf.VcfFile} header. No ploidy will be provided to Canvas.");
                return null;
            }

            return ploidyBed;
        }
 /// <summary>
 /// Loads the intervals of a tab-separated BED file, grouped by chromosome.
 /// The BED start column has 1 added before it is passed to <c>Interval</c>; the end column is used unchanged.
 /// Runs of adjacent lines sharing a chromosome form one group; groups whose chromosome is unknown to
 /// <paramref name="genomeMetadata"/> are dropped, and the remaining groups are passed through
 /// <c>GetValidatedIntervals</c> together with the chromosome length.
 /// Blank lines are ignored and coordinates are parsed with the invariant culture, so the result
 /// does not depend on the machine's regional settings.
 /// </summary>
 /// <param name="bedFile">BED file to read; columns 1-3 are chromosome, start and end.</param>
 /// <param name="genomeMetadata">Genome used to filter chromosomes and to look up their lengths.</param>
 /// <returns>Validated intervals keyed by chromosome name.</returns>
 private static Dictionary<string, List<Interval>> LoadBedRegions(IFileLocation bedFile,
                                                                  GenomeMetadata genomeMetadata)
 {
     // Fully qualified so the block does not rely on a using directive for System.Globalization.
     var invariant = System.Globalization.CultureInfo.InvariantCulture;

     return File.ReadAllLines(bedFile.FullName)
            // A blank line has no columns to parse; skip it instead of failing on bits[1].
            .Where(line => !string.IsNullOrWhiteSpace(line))
            .Select(line =>
            {
                var bits = line.Split('\t');
                return (Chromosome: bits[0],
                        Interval: new Interval(int.Parse(bits[1], invariant) + 1, int.Parse(bits[2], invariant)));
            })
            .GroupByAdjacent(bedEntry => bedEntry.Chromosome)
            .Where(kvp => genomeMetadata.GetSequence(kvp.Key) != null)
            .ToDictionary(
                chromosomeElements => chromosomeElements.Key,
                chromosomeElements => GetValidatedIntervals(chromosomeElements.Value,
                                                            genomeMetadata.GetSequence(chromosomeElements.Key).Length).ToList());
 }
        /// <summary>
        /// Asserts that the metadata holds the values expected from the test genome xml file:
        /// no species or build, 3119000 known bases and total length, the name "chr19FASTA",
        /// two chromosome entries, and a "chr19" sequence of length 3119000.
        /// </summary>
        /// <param name="gmdt">Metadata instance to validate.</param>
        private void TestGenomeMetadata(GenomeMetadata gmdt)
        {
            Assert.Null(gmdt.Species);
            Assert.Null(gmdt.Build);

            // Assert.Equal takes (expected, actual); this order keeps failure messages accurate.
            Assert.Equal(3119000, gmdt.KnownBases);
            Assert.Equal(3119000, gmdt.Length);
            Assert.Equal("chr19FASTA", gmdt.Name);

            var result = gmdt.GetChromosomesIncludingNull();

            Assert.Equal(2, result.Count);

            // The out argument is assigned by the call, so no placeholder instance is needed.
            GenomeMetadata.SequenceMetadata foundSequence;
            gmdt.TryGetSequence("chr19", out foundSequence);
            Assert.Equal("chr19", foundSequence.Name);
            Assert.Equal(3119000, foundSequence.Length);
        }
Beispiel #22
0
        /// <summary>
        /// Integrity check, to ensure that our reference FASTA file is in sync with our inputs:
        /// throws on the first segment whose chromosome name (compared case-insensitively) does not
        /// match any sequence of <paramref name="genome"/>.
        /// </summary>
        private static void SanityCheckChromosomeNames(GenomeMetadata genome, List<CanvasSegment> segments)
        {
            // Collect the lower-cased sequence names once so each segment needs a single lookup.
            var knownNames = new HashSet<string>();
            foreach (GenomeMetadata.SequenceMetadata sequence in genome.Sequences)
            {
                string normalizedName = sequence.Name.ToLowerInvariant();
                knownNames.Add(normalizedName);
            }

            foreach (CanvasSegment segment in segments)
            {
                bool isKnown = knownNames.Contains(segment.Chr.ToLowerInvariant());
                if (isKnown)
                {
                    continue;
                }

                throw new Exception(string.Format("Integrity check error: Segment found at unknown chromosome '{0}'", segment.Chr));
            }
        }
        /// <summary>
        /// Builds a <see cref="CallabilityMetricsComputer"/> from the intervals returned by
        /// <c>GetIncludedIntervals</c> for the regions loaded from <paramref name="filterBed"/>.
        /// For female samples the entry of the contig named "chrY" or "Y", if the genome has one, is removed first.
        /// </summary>
        public static CallabilityMetricsComputer Create(ILogger logger, GenomeMetadata genomeMetadata, IFileLocation filterBed, bool isFemale)
        {
            var filteredRegions = LoadBedRegions(filterBed, genomeMetadata);
            var includedRegions = GetIncludedIntervals(filteredRegions, genomeMetadata);

            // SingleOrDefault yields null when the genome has no Y contig and throws when both spellings are present.
            string yContigName = genomeMetadata
                .Contigs()
                .Select(contig => contig.Name)
                .SingleOrDefault(name => name == "chrY" || name == "Y");

            bool dropY = yContigName != null && isFemale;
            if (dropY)
            {
                includedRegions.Remove(yContigName);
            }

            var calculator = new CallabilityCalculator(includedRegions);
            return new CallabilityMetricsComputer(logger, calculator);
        }
 /// <summary>
 /// Stores the enrichment inputs on the matching properties. The control BAMs are copied into a
 /// list and exposed through a read-only wrapper as <c>NormalBamPaths</c>.
 /// </summary>
 public CanvasEnrichmentInput(Bam bam, GenomeMetadata genomeMetadata,
                              IEnumerable<Bam> controlBamPaths,
                              NexteraManifest nexteraManifest,
                              CanvasEnrichmentPrecomputedControl precomputedControl,
                              SamplePloidyInfo ploidyInfo,
                              IFileLocation predefinedBinsFile,
                              CanvasPcaModels pcaModels)
 {
     Bam = bam;
     GenomeMetadata = genomeMetadata;
     NexteraManifest = nexteraManifest;
     PrecomputedControl = precomputedControl;

     // Snapshot the sequence so later changes to the caller's collection are not observed.
     List<Bam> controlBams = controlBamPaths.ToList();
     NormalBamPaths = controlBams.AsReadOnly();

     PloidyInfo = ploidyInfo;
     PredefinedBinsFile = predefinedBinsFile;
     PcaModels = pcaModels;
 }
        /// <summary>
        /// Loads the test genome and checks the first sequence: it compares equal to itself, is not
        /// mitochondrial, is not a decoy/other sequence, and is an autosome.
        /// </summary>
        public void SequenceMetaDataTest()
        {
            var genome = new GenomeMetadata();
            genome.Deserialize(_genomeXML);
            TestGenomeMetadata(genome);

            var firstSequence = genome.Sequences[0];

            // Comparing a sequence with itself must yield zero.
            var selfComparison = firstSequence.CompareTo(firstSequence);
            Assert.True(selfComparison == 0);

            // Classification of the sequence.
            Assert.False(firstSequence.IsMito());
            Assert.False(firstSequence.IsDecoyOrOther());
            Assert.True(firstSequence.IsAutosome());
        }
        /// <summary>
        /// Loads the test genome, writes its first sequence to a FASTA file in the scratch directory
        /// and asserts that the file exists afterwards.
        /// </summary>
        public void WriteFastaFileTest()
        {
            string fastaPath = Path.Combine(TestPaths.LocalScratchDirectory, "WriteFastaFileTest.fa");

            // Start from a clean slate so the final existence check reflects this run only.
            bool leftoverPresent = File.Exists(fastaPath);
            if (leftoverPresent)
            {
                File.Delete(fastaPath);
            }

            var genome = new GenomeMetadata();
            genome.Deserialize(_genomeXML);
            TestGenomeMetadata(genome);

            genome.Sequences[0].WriteFastaFile(fastaPath);

            Assert.True(File.Exists(fastaPath));
        }
Beispiel #27
0
        /// <summary>
        /// Imports genome metadata from the FASTA files in the input folder, names it after the
        /// configured species and serializes it as <c>GenomeSize.xml</c> in the output directory.
        /// </summary>
        /// <exception cref="ArgumentException">
        /// Thrown when <c>GenomeSize.xml</c> already exists in the output directory.
        /// </exception>
        protected override void ProgramExecution()
        {
            // NOTE(review): the returned folder state is discarded here — confirm the call is only meant to throw on invalid folders.
            GenomeMetadata.CheckReferenceGenomeFolderState(_options.InputFastaFolder, false, false);

            Console.WriteLine("Preparing GenomeSize.xml for folder {0}...", _options.OutputDirectory);
            GenomeMetadata metadata = new GenomeMetadata();

            metadata.ImportFromFastaFiles(_options.InputFastaFolder, _options.OutputDirectory);
            // The name is assigned after the import, so it replaces whatever name the import left behind.
            metadata.Name = _options.SpeciesName;
            string genomeSizePath = Path.Combine(_options.OutputDirectory, "GenomeSize.xml");

            // Refuse to overwrite an existing GenomeSize.xml; note this check runs only after the import above.
            if (File.Exists(genomeSizePath))
            {
                throw new ArgumentException("GenomeSize.xml already exists on " + _options.OutputDirectory);
            }

            metadata.Serialize(genomeSizePath);
            Console.WriteLine("GenomeSize.xml prepared at {0}", genomeSizePath);
        }
        /// <summary>
        /// Builds the single-sample command-line arguments: BAM, sample name, kmer FASTA reference,
        /// genome folder (the directory of the first sequence's FASTA file), filter BED and output
        /// folder. Every value is wrapped in double quotes and every option is preceded by a space.
        /// </summary>
        /// <returns>A new <see cref="StringBuilder"/> holding the arguments.</returns>
        public StringBuilder GetSingleSampleCommandLine(string sampleId, Bam bam, GenomeMetadata genomeMetadata, IDirectoryLocation sampleSandbox)
        {
            StringBuilder commandLine = new StringBuilder()
                .Append($" --bam \"{bam.BamFile}\"")
                .Append($" --sample-name \"{sampleId}\"");

            // Annotation files come from the provider; the genome folder is derived from the first FASTA path.
            IFileLocation kmerFasta = _annotationFileProvider.GetKmerFasta(genomeMetadata);
            commandLine.Append($" --reference \"{kmerFasta}\"");

            IDirectoryLocation wholeGenomeFasta = new FileLocation(genomeMetadata.Sequences.First().FastaPath).Directory;
            commandLine.Append($" --genome-folder \"{wholeGenomeFasta}\"");

            IFileLocation filterBed = _annotationFileProvider.GetFilterBed(genomeMetadata);
            return commandLine
                .Append($" --filter-bed \"{filterBed}\"")
                .Append($" --output \"{sampleSandbox}\"");
        }
Beispiel #29
0
        /// <summary>
        /// Writes one tab-separated line per manifest region to <paramref name="writer"/>: chromosome,
        /// region start minus one, and region end.
        /// When <paramref name="genome"/> is not null, a copy of the region list is sorted with
        /// <c>ManifestRegion.CompareTo</c> using each sequence's position in <c>genome.Sequences</c>;
        /// otherwise the regions are written in the order stored in the manifest.
        /// Coordinates are formatted with the invariant culture so the output does not depend on the
        /// machine's regional settings.
        /// </summary>
        /// <param name="manifest">Manifest whose regions are written.</param>
        /// <param name="writer">Destination for the BED lines; it is not disposed here.</param>
        /// <param name="genome">Optional genome metadata defining the chromosome order; may be null.</param>
        public static void WriteRegionBed(NexteraManifest manifest, BgzipOrStreamWriter writer, GenomeMetadata genome)
        {
            List<NexteraManifest.ManifestRegion> regions = manifest.Regions;
            if (genome != null)
            {
                // Sort a copy so the manifest's own list keeps its order.
                regions = new List<NexteraManifest.ManifestRegion>(manifest.Regions);

                // Generate the chromosome index lookup used by the comparison.
                var chromosomeIndexLookup = new Dictionary<string, int>();
                for (int chromosomeIndex = 0; chromosomeIndex < genome.Sequences.Count; chromosomeIndex++)
                {
                    GenomeMetadata.SequenceMetadata sequence = genome.Sequences[chromosomeIndex];
                    chromosomeIndexLookup[sequence.Name] = chromosomeIndex;
                }

                regions.Sort((a, b) => a.CompareTo(b, chromosomeIndexLookup));
            }

            foreach (NexteraManifest.ManifestRegion region in regions)
            {
                // Fully qualified so the block does not rely on a using directive for System.Globalization.
                writer.WriteLine(string.Format(
                    System.Globalization.CultureInfo.InvariantCulture,
                    "{0}\t{1}\t{2}",
                    region.Chromosome,
                    region.Start - 1,
                    region.End));
            }
        }
Beispiel #30
0
        /// <summary>
        /// Loads the test genome, serializes it to a temporary file in the application base directory
        /// and checks the first sequence: it compares equal to itself, is not mitochondrial, is not a
        /// decoy/other sequence, and is an autosome. The temporary file is removed even when an
        /// assertion fails.
        /// </summary>
        public void SequenceMetaDataTest()
        {
            var testFile = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "SequenceMetaDataTest.fa");

            var gmdt = new GenomeMetadata();

            gmdt.Deserialize(_genomeXML);
            TestGenomeMetadata(gmdt);

            try
            {
                gmdt.Serialize(testFile);

                var seq1 = gmdt.Sequences[0];

                Assert.True(seq1.CompareTo(seq1) == 0);

                Assert.False(seq1.IsMito());

                Assert.False(seq1.IsDecoyOrOther());

                Assert.True(seq1.IsAutosome());
            }
            finally
            {
                // File.Delete is a no-op when the file was never created, so cleanup is safe on every path.
                File.Delete(testFile);
            }
        }
Beispiel #31
0
        /// <summary>
        /// Loads the genome described by <c>GenomeSize.xml</c> in <paramref name="directory"/> and verifies
        /// that every sequence listed there has its FASTA file and a matching <c>.fai</c> index on disk.
        /// </summary>
        /// <param name="directory">Genome folder that must contain <c>GenomeSize.xml</c>.</param>
        /// <param name="chrsToProcess">Stored as <c>ChromosomesToProcess</c>.</param>
        /// <exception cref="ArgumentException">
        /// Thrown when <c>GenomeSize.xml</c> is missing or cannot be read (the underlying error is
        /// available as <c>InnerException</c>), or when a sequence's FASTA file or index file does not exist.
        /// </exception>
        public Genome(string directory, List<string> chrsToProcess)
        {
            Directory = directory;

            // import the genome metadata from the genome folder
            var genomeSizePath = Path.Combine(Directory, "GenomeSize.xml");

            if (!File.Exists(genomeSizePath))
            {
                throw new ArgumentException(string.Format("Cannot load genome '{0}': GenomeSize.xml is missing", Directory));
            }

            try
            {
                _genomeSource = new GenomeMetadata();
                _genomeSource.Deserialize(genomeSizePath);
            }
            catch (Exception ex)
            {
                // Keep the original exception as InnerException so its type and stack trace are not lost.
                throw new ArgumentException(string.Format("Cannot load genome '{0}': Unable to read GenomeSize.xml: {1}", Directory, ex.Message), ex);
            }

            // Every sequence needs both its FASTA file and the accompanying .fai index.
            foreach (var sequenceMetadata in _genomeSource.Sequences)
            {
                if (!File.Exists(sequenceMetadata.FastaPath))
                {
                    throw new ArgumentException(string.Format("Cannot load genome '{0}': Sequence file '{1}' specified in GenomeSize.xml does not exist.", Directory, sequenceMetadata.FastaPath));
                }
                if (!File.Exists(sequenceMetadata.FastaPath + ".fai"))
                {
                    throw new ArgumentException(string.Format("Cannot load genome '{0}': Sequence file '{1}' specified in GenomeSize.xml does not have an index file.", Directory, sequenceMetadata.FastaPath));
                }
            }

            ChromosomesToProcess = chrsToProcess;
        }
Beispiel #32
0
        /// <summary>
        /// Outputs the copy number calls to a text file.
        /// </summary>
        /// <param name="outVcfPath">File to write to.</param>
        /// <param name="segments">List of segments to write out.</param>
        public static void WriteSegments(string outVcfPath, List<CanvasSegment> segments, string reference, string sampleName,
            List<string> extraHeaders, bool reportPloidy, PloidyInfo ploidy, bool reportAllSites = false, bool reportGermlineGenotype = false)
        {
            string cnvtype = null;
            string filter = null;
            // report GT for resequencing workflow and MCC for tumour-normal workflow
            if (reportGermlineGenotype && reportPloidy)
            {
                throw new Exception("WriteSegments VCF file output error: reportGermlineGenotype and reportPloidy can not be both true");
            }

            using (BgzipOrStreamWriter writer = new BgzipOrStreamWriter(outVcfPath))
            {
                // Write the VCF header:
                writer.WriteLine("##fileformat=VCFv4.1");
                writer.WriteLine("##source=Isas," + CanvasCommon.CanvasVersionInfo.NameString + " " + CanvasCommon.CanvasVersionInfo.VersionString);
                writer.WriteLine("##reference={0}", Path.Combine(reference, "genome.fa"));
                if (extraHeaders != null)
                {
                    foreach (string header in extraHeaders)
                    {
                        writer.WriteLine(header);
                    }
                }
                GenomeMetadata genome = new GenomeMetadata();
                genome.Deserialize(Path.Combine(reference, "GenomeSize.xml"));
                foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
                {
                    writer.WriteLine("##contig=<ID={0},length={1}>", chromosome.Name, chromosome.Length);
                }

                writer.WriteLine("##ALT=<ID=CNV,Description=\"Copy number variable region\">");
                writer.WriteLine("##FILTER=<ID=q10,Description=\"Quality below 10\">");
                writer.WriteLine("##FILTER=<ID=L10kb,Description=\"Length shorter than 10kb\">");
                writer.WriteLine("##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">");
                writer.WriteLine("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
                writer.WriteLine("##INFO=<ID=CNVLEN,Number=1,Type=Integer,Description=\"Number of reference positions spanned by this CNV\">");
                if (reportGermlineGenotype)
                    writer.WriteLine("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
                writer.WriteLine("##FORMAT=<ID=RC,Number=1,Type=Float,Description=\"Mean counts per bin in the region\">");
                writer.WriteLine("##FORMAT=<ID=BC,Number=1,Type=Float,Description=\"Number of bins in the region\">");
                writer.WriteLine("##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number genotype for imprecise events\">");
                if (reportPloidy)
                    writer.WriteLine("##FORMAT=<ID=MCC,Number=1,Type=Integer,Description=\"Major chromosome count (equal to copy number for LOH regions)\">");
                writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sampleName);

                SanityCheckChromosomeNames(genome, segments);

                foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
                {
                    foreach (CanvasSegment segment in segments)
                    {
                        if (segment.Chr.ToLowerInvariant() != chromosome.Name.ToLowerInvariant()) continue;
                        int referenceCN = 2;
                        if (ploidy != null) referenceCN = ploidy.GetReferenceCopyNumber(segment);
                        filter = null;
                        bool isReferenceCall = false;
                        if (segment.CopyNumber == referenceCN) isReferenceCall = true;
                        if (reportPloidy && segment.CopyNumber == 2 && segment.MajorChromosomeCount.HasValue && segment.MajorChromosomeCount != 1) isReferenceCall = false; // If we're reporting ploidy and there's LOH, this isn't a reference call.

                        // We can skip reporting of reference sites:
                        if (!reportAllSites && isReferenceCall)
                            continue;

                        if (segment.QScore < 10)
                            filter = "q10";

                        if (segment.End - segment.Begin < 10000)
                        {
                            if (filter != null)
                                filter = filter + ";L10kb";
                            else
                                filter = "L10kb";
                        }

                        if (filter == null)
                            filter = "PASS";

                        if (segment.CopyNumber < referenceCN)
                            cnvtype = "LOSS";
                        else if (segment.CopyNumber > referenceCN)
                            cnvtype = "GAIN";
                        else
                            cnvtype = "REF";

                        // The Dude abides... from vcf 4.1 spec:
                        //     If any of the ALT alleles is a symbolic allele (an angle-bracketed ID String “<ID>”) then the padding base is required and POS denotes the 
                        //     coordinate of the base preceding the polymorphism.
                        writer.Write("{0}\t{1}\tCanvas:{2}:{0}:{3}-{4}\t", segment.Chr, isReferenceCall ? segment.Begin + 1 : segment.Begin, cnvtype, segment.Begin + 1, segment.End);
                        writer.Write("N\t{0}\t{1}\t{2}\t", isReferenceCall ? "." : "<CNV>", segment.QScore, filter);
                        if (segment.copyNumber != referenceCN)
                            writer.Write("SVTYPE=CNV;");
                        else if (!isReferenceCall)
                            writer.Write("SVTYPE=LOH;");
                        if (segment.copyNumber != referenceCN || !isReferenceCall)
                            writer.Write("END={0};CNVLEN={1}", segment.End, segment.End - segment.Begin);
                        else
                            writer.Write("END={0}", segment.End);
                        //  FORMAT field
                        if (reportGermlineGenotype)
                            writer.Write("\tGT:RC:BC:CN", segment.End);
                        else
                            writer.Write("\tRC:BC:CN", segment.End);
                        if (reportPloidy && segment.MajorChromosomeCount.HasValue) writer.Write(":MCC");
                        // writing GT for resequencing workflow 
                        if (reportGermlineGenotype)
                        {
                            writer.Write("\t{0}/{1}:", segment.MajorChromosomeCount, segment.CopyNumber);
                        }
                        else
                            writer.Write("\t");
                        writer.Write("{1}:{2}:{3}", segment.End, Math.Round(segment.MeanCount, 0, MidpointRounding.AwayFromZero), segment.BinCount, segment.CopyNumber);
                        // writing MCC for tumour-normal workflow 
                        if (reportPloidy && segment.MajorChromosomeCount.HasValue)
                        {
                            writer.Write(":{0}", segment.MajorChromosomeCount);
                        }
                        writer.WriteLine();
                    }
                }
            }
        }
Beispiel #33
0
        /// <summary>
        /// Outputs the copy number calls to a VCF (v4.1) file.
        /// Records are emitted in the chromosome order of GenomeSize.xml; within a chromosome the
        /// order of <paramref name="segments"/> is preserved.
        /// </summary>
        /// <param name="outVcfPath">Path handed to <c>BgzipOrStreamWriter</c> for the output VCF.</param>
        /// <param name="segments">Segments to report. Every segment must sit on a chromosome listed in GenomeSize.xml.</param>
        /// <param name="wholeGenomeFastaDirectory">Folder containing GenomeSize.xml; also used to build the ##reference header line.</param>
        /// <param name="sampleName">Name written as the sample column header.</param>
        /// <param name="extraHeaders">Optional header lines written verbatim after ##reference; may be null.</param>
        /// <param name="ploidy">Optional reference ploidy; when null the reference copy number is 2.</param>
        /// <param name="qualityThreshold">Only names and describes the quality ##FILTER header line; the per-record FILTER value is taken from each segment.</param>
        public static void WriteSegments(string outVcfPath, List<CanvasSegment> segments, string wholeGenomeFastaDirectory, string sampleName,
            List<string> extraHeaders, PloidyInfo ploidy, int qualityThreshold = 10)
        {
            using (BgzipOrStreamWriter writer = new BgzipOrStreamWriter(outVcfPath))
            {
                // Write the VCF header:
                writer.WriteLine("##fileformat=VCFv4.1");
                writer.WriteLine($"##source={CanvasVersionInfo.NameString} {CanvasVersionInfo.VersionString}");
                writer.WriteLine($"##reference={Path.Combine(wholeGenomeFastaDirectory, "genome.fa")}");

                if (extraHeaders != null)
                {
                    foreach (string header in extraHeaders)
                    {
                        writer.WriteLine(header);
                    }
                }

                GenomeMetadata genome = new GenomeMetadata();
                genome.Deserialize(Path.Combine(wholeGenomeFastaDirectory, "GenomeSize.xml"));
                foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
                {
                    writer.WriteLine($"##contig=<ID={chromosome.Name},length={chromosome.Length}>");
                }

                writer.WriteLine("##ALT=<ID=CNV,Description=\"Copy number variable region\">");
                writer.WriteLine($"##FILTER=<ID=q{qualityThreshold},Description=\"Quality below {qualityThreshold}\">");
                writer.WriteLine("##FILTER=<ID=L10kb,Description=\"Length shorter than 10kb\">");
                writer.WriteLine("##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">");
                writer.WriteLine("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
                writer.WriteLine("##INFO=<ID=CNVLEN,Number=1,Type=Integer,Description=\"Number of reference positions spanned by this CNV\">");
                writer.WriteLine("##FORMAT=<ID=RC,Number=1,Type=Float,Description=\"Mean counts per bin in the region\">");
                writer.WriteLine("##FORMAT=<ID=BC,Number=1,Type=Float,Description=\"Number of bins in the region\">");
                writer.WriteLine("##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number genotype for imprecise events\">");
                writer.WriteLine("##FORMAT=<ID=MCC,Number=1,Type=Integer,Description=\"Major chromosome count (equal to copy number for LOH regions)\">");
                writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sampleName);

                // Fail before writing any record if a segment refers to a chromosome the reference does not know.
                SanityCheckChromosomeNames(genome, segments);

                foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
                {
                    foreach (CanvasSegment segment in segments)
                    {
                        if (!segment.Chr.Equals(chromosome.Name, StringComparison.OrdinalIgnoreCase)) continue;

                        int referenceCopyNumber = ploidy?.GetReferenceCopyNumber(segment) ?? 2;
                        CnvType cnvType = segment.GetCnvType(referenceCopyNumber);
                        bool isVariant = cnvType != CnvType.Reference;
                        bool hasMajorChromosomeCount = segment.MajorChromosomeCount.HasValue;

                        // From vcf 4.1 spec:
                        //     If any of the ALT alleles is a symbolic allele (an angle-bracketed ID String "<ID>") then the padding base is required and POS denotes the
                        //     coordinate of the base preceding the polymorphism.
                        string alternateAllele = cnvType.ToAltId();
                        bool isSymbolicAllele = alternateAllele.StartsWith("<", StringComparison.Ordinal)
                                                && alternateAllele.EndsWith(">", StringComparison.Ordinal);
                        int position = isSymbolicAllele ? segment.Begin : segment.Begin + 1;

                        // CHROM, POS, ID
                        writer.Write($"{segment.Chr}\t{position}\tCanvas:{cnvType.ToVcfId()}:{segment.Chr}:{segment.Begin + 1}-{segment.End}\t");

                        // REF, ALT, QUAL, FILTER.
                        // The text is already fully interpolated, so it is written as plain text and must not be
                        // re-interpreted as a composite format string (a '{' or '}' in a value would throw).
                        writer.Write($"N\t{alternateAllele}\t{segment.QScore}\t{segment.Filter}\t");

                        // INFO
                        if (isVariant)
                            writer.Write($"SVTYPE={cnvType.ToSvType()};");
                        writer.Write($"END={segment.End}");
                        if (isVariant)
                            writer.Write($";CNVLEN={segment.End - segment.Begin}");

                        //  FORMAT field
                        writer.Write("\tRC:BC:CN");
                        if (hasMajorChromosomeCount)
                        {
                            writer.Write(":MCC");
                        }

                        // Sample column, in the same order as the FORMAT keys above.
                        writer.Write("\t{0}:{1}:{2}", Math.Round(segment.MeanCount, 0, MidpointRounding.AwayFromZero), segment.BinCount, segment.CopyNumber);
                        if (hasMajorChromosomeCount)
                        {
                            writer.Write(":{0}", segment.MajorChromosomeCount);
                        }
                        writer.WriteLine();
                    }
                }
            }
        }
        /// <summary>
        /// Builds the argument string for a multi-sample run: one <c>--bam</c> option per sample
        /// (followed by a sample-type option for every sample whose type is not <c>Other</c>),
        /// then the <c>--reference</c>, <c>--genome-folder</c>, <c>--filter-bed</c> and <c>--output</c> options.
        /// </summary>
        /// <remarks>The <paramref name="vcf"/> argument is not read by this method.</remarks>
        /// <returns>The builder holding the assembled options; each option starts with a leading space.</returns>
        public StringBuilder GetMultiSampleCommandLine(SampleSet<CanvasPedigreeSample> samples, GenomeMetadata genomeMetadata, Vcf vcf, IDirectoryLocation sampleSandbox)
        {
            var arguments = new StringBuilder();

            // Per-sample options.
            foreach (var entry in samples)
            {
                var id = entry.Key.Id;
                var pedigreeSample = entry.Value;

                arguments.Append($" --bam \"{pedigreeSample.Bam.BamFile}\"");
                if (pedigreeSample.SampleType == SampleType.Other)
                {
                    continue;
                }
                arguments.Append($" --{pedigreeSample.SampleType.GetOptionName()} {id}");
            }

            // Reference resources shared by all samples.
            IFileLocation kmerFasta = _annotationFileProvider.GetKmerFasta(genomeMetadata);
            // The genome folder is the directory of the first sequence's FASTA file.
            IDirectoryLocation genomeFolder = new FileLocation(genomeMetadata.Sequences.First().FastaPath).Directory;
            IFileLocation filterBed = _annotationFileProvider.GetFilterBed(genomeMetadata);

            return arguments
                .Append($" --reference \"{kmerFasta}\"")
                .Append($" --genome-folder \"{genomeFolder}\"")
                .Append($" --filter-bed \"{filterBed}\"")
                .Append($" --output \"{sampleSandbox}\"");
        }
Beispiel #35
0
 /// <summary>
 /// Stand-in converter used when BigWig conversion is unavailable: logs a warning that names the
 /// source file and the reason, and performs no conversion.
 /// </summary>
 /// <param name="sourceFile">File that would have been converted; only used in the log message.</param>
 /// <param name="genomeMetadata">Not used.</param>
 /// <param name="outputDirectory">Not used.</param>
 /// <returns>Always <c>null</c>.</returns>
 public IFileLocation Convert(IFileLocation sourceFile, GenomeMetadata genomeMetadata, IDirectoryLocation outputDirectory)
 {
     _logger.Warn($"Not converting {sourceFile} to BigWig. {_reasonUnavailable}");
     return null;
 }
Beispiel #36
0
        /// <summary>
        /// Generate a tabular file with information about coverage and allele frequency for each chunk of the genome.
        /// This file can be used to generate a pretty plot of coverage versus MAF.
        /// </summary>
        /// <remarks>
        /// One row is written per 100000-base window of every chromosome that has at least one segment;
        /// chromosomes without segments produce no rows. The statistic columns of a row are left out when
        /// the window holds fewer coverage bins than the minimum returned by GetMinimumBinsForCoveragePlotPoint.
        /// </remarks>
        /// <param name="segments">Segments to summarize.</param>
        /// <param name="normalDiploidCoverage">Divisor for the normalized coverage column (2 * median hits / this value); required when <paramref name="segments"/> is non-empty.</param>
        /// <param name="referencePloidy">Optional reference ploidy intervals; ploidy 2 is reported when this is null or no interval overlaps the window.</param>
        /// <param name="filePath">Output file; created, or overwritten if it already exists.</param>
        /// <param name="referenceFolder">Folder containing GenomeSize.xml.</param>
        /// <exception cref="Illumina.Common.IlluminaException">Segments were supplied without a normal diploid coverage.</exception>
        public static void WriteCoveragePlotData(List<CanvasSegment> segments, double? normalDiploidCoverage, PloidyInfo referencePloidy,
                                                 string filePath, string referenceFolder)
        {
            if (segments.Any() && !normalDiploidCoverage.HasValue)
            {
                throw new Illumina.Common.IlluminaException("normal diploid coverage must be specified");
            }
            int pointLength = 100000;
            int minimumBinsToPlot = GetMinimumBinsForCoveragePlotPoint(segments, pointLength);

            Dictionary<string, List<CanvasSegment>> segmentsByChromosome = GetSegmentsByChromosome(segments);
            GenomeMetadata genome = new GenomeMetadata();
            genome.Deserialize(Path.Combine(referenceFolder, "GenomeSize.xml"));

            // Scratch lists, reused (cleared) for every window.
            List<float> counts = new List<float>();
            List<float> MAF = new List<float>();
            List<float> VF = new List<float>();

            using (FileStream stream = new FileStream(filePath, FileMode.Create, FileAccess.Write))
            using (StreamWriter writer = new StreamWriter(stream))
            {
                writer.NewLine = "\n";
                writer.Write("#Chromosome\tStart\tEnd\tCopyNumber\tMajorChromosomeCount\tMedianHits\tNormalizedCoverage\tMedianMinorAlleleFrequency\tReferencePloidy\t");
                for (int i = 0; i < NumberVariantFrequencyBins; i++)
                {
                    writer.Write("VariantFrequencyBin{0}\t", i);
                }
                writer.WriteLine();

                foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
                {
                    // One lookup per chromosome: it both skips chromosomes without segments and
                    // yields the segment list used by every window below.
                    List<CanvasSegment> chrSegments;
                    if (!segmentsByChromosome.TryGetValue(chromosome.Name, out chrSegments))
                    {
                        continue;
                    }

                    int pointStartPos = 0; // 0-based start
                    while (pointStartPos < chromosome.Length)
                    {
                        int pointEndPos = (int)Math.Min(chromosome.Length, pointStartPos + pointLength); // 1-based end
                        counts.Clear();
                        MAF.Clear();
                        VF.Clear();

                        // Collect the segments touching this window and tally overlapping bases per copy number:
                        Dictionary<int, long> basesByCopyNumber = new Dictionary<int, long>();
                        List<CanvasSegment> overlapSegments = new List<CanvasSegment>();
                        foreach (CanvasSegment segment in chrSegments)
                        {
                            if (segment.Begin > pointEndPos || segment.End < pointStartPos)
                            {
                                continue;
                            }
                            AddToCoveragePlotTally(basesByCopyNumber, segment.CopyNumber,
                                GetCoveragePlotOverlapLength(segment, pointStartPos, pointEndPos));
                            overlapSegments.Add(segment);
                        }

                        int majorCopyNumber = FindCoveragePlotMajorCopyNumber(basesByCopyNumber);
                        int? majorChromosomeCount = FindCoveragePlotMajorChromosomeCount(overlapSegments, majorCopyNumber, pointStartPos, pointEndPos);

                        // Note allele frequency and coverage info, for all overlap segments that match (more or less)
                        // the most common copy number (same side of diploid: loss, exactly 2, or gain):
                        foreach (CanvasSegment segment in overlapSegments)
                        {
                            if ((majorCopyNumber == 2 && segment.CopyNumber != 2) ||
                                (majorCopyNumber < 2 && segment.CopyNumber >= 2) ||
                                (majorCopyNumber > 2 && segment.CopyNumber <= 2))
                            {
                                continue;
                            }

                            int firstIndex;
                            int lastIndex;

                            // Add counts to the overall list:
                            GetCoveragePlotIndexRange(segment.Counts.Count, segment, pointStartPos, pointEndPos, out firstIndex, out lastIndex);
                            for (int index = firstIndex; index < lastIndex; index++)
                            {
                                counts.Add(segment.Counts[index]);
                            }

                            // Add MAF to the overall list:
                            GetCoveragePlotIndexRange(segment.Alleles.Frequencies.Count, segment, pointStartPos, pointEndPos, out firstIndex, out lastIndex);
                            for (int index = firstIndex; index < lastIndex; index++)
                            {
                                float tempMAF = segment.Alleles.Frequencies[index];
                                VF.Add(tempMAF);
                                if (tempMAF > 0.5)
                                {
                                    tempMAF = 1 - tempMAF; // fold onto the minor allele
                                }
                                MAF.Add(tempMAF);
                            }
                        }

                        // Write output for this point:
                        writer.Write("{0}\t{1}\t{2}\t", chromosome.Name, pointStartPos, pointEndPos);

                        // Write counts if we have reasonable amounts of data; write MAF if we have reasonable amounts of data.
                        // (Note: Observed that for germline data on chrY we often had well under 100 counts given the new, smaller bin size)
                        if (counts.Count >= minimumBinsToPlot)
                        {
                            writer.Write("{0}\t", majorCopyNumber);
                            writer.Write("{0}\t", majorChromosomeCount);
                            counts.Sort();
                            double medianHits = counts[counts.Count / 2];
                            writer.Write("{0:F2}\t", medianHits);
                            double normalizedCount = 2 * medianHits / normalDiploidCoverage.Value;
                            writer.Write("{0:F2}\t", normalizedCount);
                            if (MAF.Count >= 10)
                            {
                                MAF.Sort();
                                writer.Write("{0}\t", MAF[MAF.Count / 2]);
                            }
                            else
                            {
                                writer.Write("\t");
                            }
                            writer.Write("{0}\t", GetCoveragePlotReferencePloidy(referencePloidy, chromosome.Name, pointStartPos, pointEndPos));
                            WriteCoveragePlotVariantFrequencyBins(writer, VF);
                        }
                        writer.WriteLine();
                        pointStartPos += pointLength;
                    }
                }
            }
        }

        /// <summary>Adds <paramref name="amount"/> to the running total stored under <paramref name="key"/>, starting from 0 for a new key.</summary>
        private static void AddToCoveragePlotTally<TKey>(Dictionary<TKey, long> tally, TKey key, long amount)
        {
            long current;
            tally.TryGetValue(key, out current); // leaves current at 0 when the key is new
            tally[key] = current + amount;
        }

        /// <summary>Number of positions shared by the segment and the window [pointStartPos, pointEndPos].</summary>
        private static int GetCoveragePlotOverlapLength(CanvasSegment segment, int pointStartPos, int pointEndPos)
        {
            return Math.Min(segment.End, pointEndPos) - Math.Max(segment.Begin, pointStartPos);
        }

        /// <summary>Returns the copy number covering the most bases; 0 when nothing has a positive total. The first maximum wins.</summary>
        private static int FindCoveragePlotMajorCopyNumber(Dictionary<int, long> basesByCopyNumber)
        {
            long bestCount = 0;
            int majorCopyNumber = 0;
            foreach (KeyValuePair<int, long> entry in basesByCopyNumber)
            {
                if (entry.Value > bestCount)
                {
                    bestCount = entry.Value;
                    majorCopyNumber = entry.Key;
                }
            }
            return majorCopyNumber;
        }

        /// <summary>
        /// Returns the major chromosome count covering the most bases among the overlap segments whose copy number
        /// equals <paramref name="majorCopyNumber"/>; null when none of them carries a major chromosome count.
        /// On equal totals the value seen later wins.
        /// </summary>
        private static int? FindCoveragePlotMajorChromosomeCount(List<CanvasSegment> overlapSegments, int majorCopyNumber, int pointStartPos, int pointEndPos)
        {
            Dictionary<int, long> basesByMajorChromosomeCount = new Dictionary<int, long>();
            foreach (CanvasSegment segment in overlapSegments)
            {
                if (segment.CopyNumber != majorCopyNumber || !segment.MajorChromosomeCount.HasValue)
                {
                    continue;
                }
                AddToCoveragePlotTally(basesByMajorChromosomeCount, segment.MajorChromosomeCount.Value,
                    GetCoveragePlotOverlapLength(segment, pointStartPos, pointEndPos));
            }

            int? majorChromosomeCount = null;
            long bestCount = 0;
            foreach (KeyValuePair<int, long> entry in basesByMajorChromosomeCount)
            {
                if (entry.Value < bestCount)
                {
                    continue;
                }
                bestCount = entry.Value;
                majorChromosomeCount = entry.Key;
            }
            return majorChromosomeCount;
        }

        /// <summary>
        /// Maps the part of the segment inside the window onto an index range [firstIndex, lastIndex) of a
        /// per-segment list holding <paramref name="itemCount"/> items, scaling positions linearly over the segment length.
        /// </summary>
        private static void GetCoveragePlotIndexRange(int itemCount, CanvasSegment segment, int pointStartPos, int pointEndPos,
                                                      out int firstIndex, out int lastIndex)
        {
            float segLength = segment.End - segment.Begin;

            firstIndex = 0;
            if (pointStartPos > segment.Begin)
            {
                firstIndex = (int)((float)itemCount * (pointStartPos - segment.Begin) / segLength);
            }

            lastIndex = itemCount;
            if (pointEndPos < segment.End)
            {
                lastIndex = (int)((float)itemCount * (pointEndPos - segment.Begin) / segLength);
            }
        }

        /// <summary>Reference ploidy for the window: 2 unless a ploidy interval on the chromosome overlaps it (the last overlapping interval wins).</summary>
        private static int GetCoveragePlotReferencePloidy(PloidyInfo referencePloidy, string chromosomeName, int pointStartPos, int pointEndPos)
        {
            int refPloidy = 2;
            if (referencePloidy != null && referencePloidy.PloidyByChromosome.ContainsKey(chromosomeName))
            {
                foreach (var interval in referencePloidy.PloidyByChromosome[chromosomeName])
                {
                    if (interval.Start <= pointEndPos && interval.End >= pointStartPos)
                    {
                        refPloidy = interval.Ploidy;
                    }
                }
            }
            return refPloidy;
        }

        /// <summary>
        /// Writes the variant-frequency histogram columns: the percentage of values per 0.01-wide bin,
        /// or one empty column per bin when fewer than 10 values are available.
        /// </summary>
        private static void WriteCoveragePlotVariantFrequencyBins(StreamWriter writer, List<float> variantFrequencies)
        {
            if (variantFrequencies.Count < 10)
            {
                for (int i = 0; i < NumberVariantFrequencyBins; i++)
                {
                    writer.Write("\t");
                }
                return;
            }

            float[] vfDistribution = new float[NumberVariantFrequencyBins];
            foreach (float vf in variantFrequencies)
            {
                // Values at or beyond the top edge fall into the last bin.
                int binNumber = Math.Min(vfDistribution.Length - 1, (int)Math.Floor(vf / 0.01));
                vfDistribution[binNumber]++;
            }
            for (int i = 0; i < vfDistribution.Length; i++)
            {
                vfDistribution[i] = vfDistribution[i] / (float)variantFrequencies.Count * 100.0f;
                writer.Write("{0:F2}\t", vfDistribution[i]);
            }
        }
Beispiel #37
0
 /// <summary>
 /// Integrity check, to ensure that our reference FASTA file is in sync with our inputs.
 /// Every segment must sit on a chromosome listed in the genome metadata; names are compared
 /// case-insensitively.
 /// </summary>
 /// <param name="genome">Genome metadata whose sequence names define the known chromosomes.</param>
 /// <param name="segments">Segments whose chromosome names are checked.</param>
 /// <exception cref="Exception">A segment refers to a chromosome that is not among the genome's sequences.</exception>
 private static void SanityCheckChromosomeNames(GenomeMetadata genome, List<CanvasSegment> segments)
 {
     // A comparer-based set gives case-insensitive membership without allocating a lower-cased
     // copy of every name.
     HashSet<string> chromosomeNames = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
     foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
     {
         chromosomeNames.Add(chromosome.Name);
     }

     foreach (CanvasSegment segment in segments)
     {
         if (!chromosomeNames.Contains(segment.Chr))
         {
             throw new Exception(string.Format("Integrity check error: Segment found at unknown chromosome '{0}'", segment.Chr));
         }
     }
 }
Beispiel #38
0
        /// <summary>
        /// Generate a tabular file with information about coverage and allele frequency for each chunk of the genome.
        /// This file can be used to generate a pretty plot of coverage versus MAF.  
        /// </summary>
        static public void WriteCoveragePlotData(List<CanvasSegment> segments, double normalDiploidCoverage, PloidyInfo referencePloidy,
            string filePath, string referenceFolder)
        {
            Dictionary<string, List<CanvasSegment>> segmentsByChromosome = GetSegmentsByChromosome(segments);
            GenomeMetadata genome = new GenomeMetadata();
            genome.Deserialize(Path.Combine(referenceFolder, "GenomeSize.xml"));
            int pointLength = 100000;
            List<float> counts = new List<float>();
            List<float> MAF = new List<float>();
            List<float> VF = new List<float>();
            using (StreamWriter writer = new StreamWriter(filePath))
            {
                writer.NewLine = "\n";
                writer.Write("#Chromosome\tStart\tEnd\tCopyNumber\tMajorChromosomeCount\tMedianHits\tNormalizedCoverage\tMedianMinorAlleleFrequency\tReferencePloidy\t");
                for (int i = 0; i < NumberVariantFrequencyBins; i++) { writer.Write("VariantFrequencyBin{0}\t", i); }
                writer.WriteLine();
                foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
                {
                    if (chromosome.IsMito()) continue;
                    int pointStartPos = 0; // 0-based start
                    while (pointStartPos < chromosome.Length)
                    {
                        int pointEndPos = (int)Math.Min(chromosome.Length, pointStartPos + pointLength); // 1-based end
                        counts.Clear();
                        MAF.Clear();
                        VF.Clear();
                        Dictionary<string, long> CopyNumberAndChromCount = new Dictionary<string, long>();
                        Dictionary<int, long> basesByCopyNumber = new Dictionary<int, long>();
                        // Accumulate counts and MAF from the segments:
                        List<CanvasSegment> chrSegments = new List<CanvasSegment>();
                        if (segmentsByChromosome.ContainsKey(chromosome.Name)) chrSegments = segmentsByChromosome[chromosome.Name];
                        List<CanvasSegment> overlapSegments = new List<CanvasSegment>();
                        foreach (CanvasSegment segment in chrSegments)
                        {
                            if (segment.Begin > pointEndPos) continue;
                            if (segment.End < pointStartPos) continue;

                            int weight = Math.Min(segment.End, pointEndPos) - Math.Max(segment.Begin, pointStartPos);
                            string key = string.Format("{0} {1}", segment.copyNumber, segment.MajorChromosomeCount);
                            if (!CopyNumberAndChromCount.ContainsKey(key)) CopyNumberAndChromCount[key] = 0;
                            CopyNumberAndChromCount[key] += weight;
                            if (!basesByCopyNumber.ContainsKey(segment.copyNumber)) basesByCopyNumber[segment.copyNumber] = 0;
                            basesByCopyNumber[segment.copyNumber] += weight;
                            overlapSegments.Add(segment);
                        }

                        // Note the most common copy number:
                        long bestCount = 0;
                        int majorCopyNumber = 0;
                        foreach (int key in basesByCopyNumber.Keys)
                        {
                            if (basesByCopyNumber[key] > bestCount)
                            {
                                bestCount = basesByCopyNumber[key];
                                majorCopyNumber = key;
                            }
                        }

                        // Find the most common major chromosome count, for the most common copy number:
                        int? majorChromosomeCount = null;
                        bestCount = 0;
                        foreach (string key in CopyNumberAndChromCount.Keys)
                        {
                            string[] bits = key.Split();
                            if (bits[1].Length == 0) continue;
                            if (int.Parse(bits[0]) != majorCopyNumber) continue;
                            long count = CopyNumberAndChromCount[key];
                            if (count < bestCount) continue;
                            bestCount = count;
                            majorChromosomeCount = int.Parse(bits[1]);
                        }

                        // Note allele frequency and coverage info, for all overlap segments that match (more or less)
                        // the most common copy number:
                        foreach (CanvasSegment segment in overlapSegments)
                        {
                            if ((majorCopyNumber == 2 && segment.copyNumber != 2) ||
                                (majorCopyNumber < 2 && segment.copyNumber >= 2) ||
                                (majorCopyNumber > 2 && segment.copyNumber <= 2))
                                continue;
                            float segLength = segment.End - segment.Begin;

                            // Add counts to the overall list:
                            int firstIndex = 0;
                            if (pointStartPos > segment.Begin)
                            {
                                firstIndex = (int)((float)segment.Counts.Count * (pointStartPos - segment.Begin) / segLength);
                            }
                            int lastIndex = segment.Counts.Count;
                            if (pointEndPos < segment.End)
                            {
                                lastIndex = (int)((float)segment.Counts.Count * (pointEndPos - segment.Begin) / segLength);
                            }
                            for (int index = firstIndex; index < lastIndex; index++) counts.Add(segment.Counts[index]);

                            // Add MAF to the overall list:
                            firstIndex = 0;
                            if (pointStartPos > segment.Begin)
                            {
                                firstIndex = (int)((float)segment.VariantFrequencies.Count * (pointStartPos - segment.Begin) / segLength);
                            }
                            lastIndex = segment.VariantFrequencies.Count;
                            if (pointEndPos < segment.End)
                            {
                                lastIndex = (int)((float)segment.VariantFrequencies.Count * (pointEndPos - segment.Begin) / segLength);
                            }
                            for (int index = firstIndex; index < lastIndex; index++)
                            {
                                float tempMAF = segment.VariantFrequencies[index];
                                VF.Add(tempMAF);
                                if (tempMAF > 0.5) tempMAF = 1 - tempMAF;
                                MAF.Add(tempMAF);
                            }
                        }

                        // Write output for this point:
                        writer.Write("{0}\t{1}\t{2}\t", chromosome.Name, pointStartPos, pointEndPos);

                        // Write counts if we have reasonable amounts of data; write MAF if we have reasonable amounts of data.
                        // (Note: Observed that for germline data on chrY we often had well under 100 counts given the new, smaller bin size)
                        if (counts.Count >= 30)
                        {
                            writer.Write("{0}\t", majorCopyNumber);
                            writer.Write("{0}\t", majorChromosomeCount);
                            counts.Sort();
                            double medianHits = counts[counts.Count / 2];
                            writer.Write("{0:F2}\t", medianHits);
                            double normalizedCount = 2 * medianHits / normalDiploidCoverage;
                            writer.Write("{0:F2}\t", normalizedCount);
                            if (MAF.Count >= 10)
                            {
                                MAF.Sort();
                                writer.Write("{0}\t", MAF[MAF.Count / 2]);
                            }
                            else
                            {
                                writer.Write("\t");
                            }
                            int refPloidy = 2;
                            if (referencePloidy != null && referencePloidy.PloidyByChromosome.ContainsKey(chromosome.Name))
                            {
                                foreach (var interval in referencePloidy.PloidyByChromosome[chromosome.Name])
                                {
                                    if (interval.Start <= pointEndPos && interval.End >= pointStartPos)
                                    {
                                        refPloidy = interval.Ploidy;
                                    }
                                }
                            }
                            writer.Write("{0}\t", refPloidy);
                            if (VF.Count >= 10)
                            {
                                // bin VF
                                float[] vfDistribution = new float[NumberVariantFrequencyBins];
                                foreach (float vf in VF)
                                {
                                    int binNumber = Math.Min(vfDistribution.Length - 1, (int)Math.Floor(vf / 0.01));
                                    vfDistribution[binNumber]++;
                                }
                                for (int i = 0; i < vfDistribution.Length; i++)
                                {
                                    vfDistribution[i] = vfDistribution[i] / (float)VF.Count * 100.0f;
                                    writer.Write("{0:F2}\t", vfDistribution[i]);
                                }
                            }
                            else
                            {
                                for (int i = 0; i < NumberVariantFrequencyBins; i++) writer.Write("\t");
                            }
                        }
                        writer.WriteLine();
                        pointStartPos += pointLength;
                    }
                }
            }
        }
Beispiel #39
0
        /// <summary>
        /// Assign copy number calls to segments, and produce extra header lines for the CNV vcf file giving the
        /// overall estimated purity and ploidy (plus model-fit diagnostics).
        /// </summary>
        /// <param name="localSDmertic">
        /// Value emitted verbatim (two decimals) in the <c>##LocalSDmetric</c> header; a null value is written as an empty field.
        /// </param>
        /// <param name="referenceFolder">Folder that contains the <c>GenomeSize.xml</c> file used to obtain the genome length.</param>
        /// <returns>The list of <c>##Key=Value</c> header lines to add to the vcf file.</returns>
        protected List<string> CallCNVUsingSNVFrequency(double? localSDmertic, string referenceFolder)
        {
            List<string> headers = new List<string>();
            if (this.CNOracle != null)
            {
                this.DerivePurityEstimateFromVF();
            }

            // Get genome length.
            GenomeMetadata genomeMetaData = new GenomeMetadata();
            genomeMetaData.Deserialize(Path.Combine(referenceFolder, "GenomeSize.xml"));

            // Derive a model of diploid coverage, and overall tumor purity:
            this.Model = ModelOverallCoverageAndPurity(genomeMetaData.Length);

            // Make preliminary ploidy calls for all segments.  For those segments which fit their ploidy reasonably well,
            // accumulate information about the MAF by site and coverage by bin.
            this.HeterogeneousSegmentsSignature.Sort();

            // The presence of a Sigma on the ploidy models selects the Gaussian-mixture caller.
            if (AllPloidies.First().Sigma == null)
            {
                AssignPloidyCalls();
            }
            else
            {
                AssignPloidyCallsGaussianMixture();
            }

            // If the somatic SNV/indel file was provided, then we use it to derive another estimate of purity.
            // And, if we didn't make many CNV calls, then we report this estimate, instead of the estimate derived from
            // our overall model.
            if (!string.IsNullOrEmpty(SomaticVCFPath))
            {
                try
                {
                    double SNVPurityEstimate = EstimatePurityFromSomaticSNVs();
                    this.SelectPurityEstimate(SNVPurityEstimate, genomeMetaData.Length);
                }
                catch (Exception e)
                {
                    // Best effort: a failure here must not abort calling; we keep the model-derived purity.
                    Console.Error.WriteLine("* Error deriving purity estimate from somatic SNVs.  Details:\n{0}", e.ToString());
                }
            }

            // Length-weighted mean copy number across all segments.
            double totalPloidy = 0;
            double totalWeight = 0;
            foreach (CanvasSegment segment in this.Segments)
            {
                // Multiply in double precision so that a very long segment with a high copy number
                // cannot overflow integer arithmetic before being accumulated.
                double segmentLength = segment.End - segment.Begin;
                totalWeight += segmentLength;
                totalPloidy += segment.CopyNumber * segmentLength;
            }

            // Add some extra information to the vcf file header.  The vcf file is machine-readable, so numbers are
            // always formatted with the invariant culture ('.' as the decimal separator) regardless of the host locale.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;
            headers.Add(string.Format(invariant, "##EstimatedTumorPurity={0:F2}", this.Model.Purity));
            // Math.Max guards against division by zero when there are no (or only empty) segments.
            headers.Add(string.Format(invariant, "##OverallPloidy={0:F2}", totalPloidy / Math.Max(1, totalWeight)));
            headers.Add(string.Format(invariant, "##PurityModelFit={0:F4}", this.Model.Deviation));
            headers.Add(string.Format(invariant, "##InterModelDistance={0:F4}", this.Model.InterModelDistance));
            headers.Add(string.Format(invariant, "##EstimatedChromosomeCount={0:F2}", this.EstimateChromosomeCount()));
            headers.Add(string.Format(invariant, "##LocalSDmetric={0:F2}", localSDmertic));
            headers.Add(string.Format(invariant, "##Heterogeneity={0:F2}", this.Model.HeterogeneityIndex));
            return headers;
        }