/// <summary>
/// Step 1: Load the normal het SNVs of interest.
/// Reads the VCF at <paramref name="vcfPath"/> and replaces <c>this.Variants</c> with the records on
/// <c>this.Chromosome</c> that are single-allele SNVs. When a record carries genotype columns it must
/// additionally be PASS-filtered, have a GT of 0/1 or 1/0 in the first genotype column, and - if a GQX
/// value is present - have GQX of at least 30.
/// </summary>
/// <param name="vcfPath">Path of the VCF file to read.</param>
protected void LoadVariants(string vcfPath)
{
    // Lowest GQX accepted for a genotyped het call.
    const float MinimumGQX = 30;

    Console.WriteLine("{0} Loading variants of interest from {1}", DateTime.Now, vcfPath);
    this.Variants = new List<VcfVariant>();
    int countThisChromosome = 0;

    using (VcfReader reader = new VcfReader(vcfPath, requireGenotypes: false))
    {
        // GetNextVariant assigns the out argument itself, so no instance is pre-allocated here.
        VcfVariant variant;
        while (reader.GetNextVariant(out variant))
        {
            if (variant.ReferenceName != this.Chromosome)
            {
                // Shortcut: If we've seen records for the desired chromosome, then as soon as we hit
                // another chromosome, we can abort:
                if (countThisChromosome > 0)
                {
                    break;
                }
                continue;
            }
            countThisChromosome++;

            // Single-allele SNVs only:
            if (variant.VariantAlleles.Length != 1 ||
                variant.VariantAlleles[0].Length != 1 ||
                variant.ReferenceAllele.Length != 1)
            {
                continue;
            }

            // Genotype columns are not available if we use a dbSNP VCF file; in that case FILTER may
            // not say PASS either, so the filter / genotype / quality tests only apply when they exist.
            bool hasGenotypes = variant.GenotypeColumns != null && variant.GenotypeColumns.Any();
            if (hasGenotypes)
            {
                // PF variants only:
                if (variant.Filters != "PASS")
                {
                    continue;
                }

                var sample = variant.GenotypeColumns[0];

                // No genotype - we don't know if it's a het SNV.
                if (!sample.ContainsKey("GT"))
                {
                    continue;
                }
                string genotype = sample["GT"];
                if (genotype != "0/1" && genotype != "1/0")
                {
                    continue;
                }

                // Also require a high enough quality score.
                // Note: Allow no GQX field, in case we want to use another caller (e.g. Pisces) and not crash.
                if (sample.ContainsKey("GQX"))
                {
                    // VCF numbers always use '.' as the decimal separator, so parse with the invariant
                    // culture; the current culture could otherwise read "12.5" as 125.
                    float gqx = float.Parse(sample["GQX"], System.Globalization.CultureInfo.InvariantCulture);
                    if (gqx < MinimumGQX)
                    {
                        continue;
                    }
                }
            }

            // Note: Let's NOT require the variant be in dbSNP. Maybe we didn't do annotation, either because
            // we chose not to or because we're on a reference without annotation available.

            // Remember all the variants that pass all our tests:
            this.Variants.Add(variant);
        }
    }

    Console.WriteLine("Retained {0} variants, out of {1} records for {2}", this.Variants.Count, countThisChromosome, this.Chromosome);
}
/// <summary>
/// Lazily yields one <c>CNVCall</c> per record of the VCF at <paramref name="vcfPath"/>.
/// The copy number and end position of each call come from <c>GetCopyNumber</c>.
/// </summary>
/// <param name="vcfPath">Path of the VCF file to read.</param>
/// <returns>A deferred sequence of calls; the file is read as the sequence is enumerated.</returns>
protected IEnumerable<CNVCall> GetCnvCallsFromVcf(string vcfPath)
{
    using (VcfReader vcfReader = new VcfReader(vcfPath, false))
    {
        foreach (VcfVariant record in vcfReader.GetVariants())
        {
            int segmentEnd;
            int copyNumber = GetCopyNumber(record, out segmentEnd);

            CNVCall call = new CNVCall(
                record.ReferenceName,
                record.ReferencePosition,
                segmentEnd,
                copyNumber);
            yield return call;
        }
    }
}
/// <summary>
/// Lazily yields one <c>CNVCall</c> per record of the VCF at <paramref name="vcfPath"/>,
/// optionally restricted to records whose FILTER value is exactly "PASS".
/// </summary>
/// <param name="vcfPath">Path of the VCF file to read.</param>
/// <param name="includePassingOnly">When true, records whose filter is not "PASS" are skipped.</param>
/// <returns>A deferred sequence of calls; the file is read as the sequence is enumerated.</returns>
protected IEnumerable<CNVCall> GetCnvCallsFromVcf(string vcfPath, bool includePassingOnly)
{
    using (VcfReader vcfReader = new VcfReader(vcfPath, false))
    {
        foreach (VcfVariant record in vcfReader.GetVariants())
        {
            // The copy number is computed for every record, before the filter is consulted.
            int segmentEnd;
            int copyNumber = GetCopyNumber(record, out segmentEnd);

            if (!includePassingOnly || record.Filters == "PASS")
            {
                yield return new CNVCall(record.ReferenceName, record.ReferencePosition, segmentEnd, copyNumber);
            }
        }
    }
}
/// <summary>
/// Load the somatic SNV output (from Strelka), and use it to derive an estimate of overall purity.
/// Reference: https://ukch-confluence.illumina.com/display/collab/Estimate+purity+of+tumour+samples+using+somatic+SNVs
/// </summary>
/// <returns>
/// Twice the mean variant frequency of the retained SNVs, capped at 1, or <see cref="double.NaN"/>
/// when fewer than 100 frequencies were collected.
/// </returns>
protected double EstimatePurityFromSomaticSNVs()
{
    int recordCount = 0;
    List<float> variantFrequencies = new List<float>();

    using (VcfReader reader = new VcfReader(this.SomaticVCFPath))
    {
        foreach (VcfVariant variant in reader.GetVariants())
        {
            recordCount++;

            // Skip non-PF variants:
            if (variant.Filters != "PASS")
            {
                continue;
            }

            // Skip everything but SNVs. The reference allele must be exactly one base, since its
            // first character is used below to build the count tag name.
            if (variant.ReferenceAllele.Length != 1 ||
                variant.VariantAlleles.Length != 1 ||
                variant.VariantAlleles[0].Length != 1 ||
                variant.VariantAlleles[0] == ".")
            {
                continue;
            }

            // Per-base counts are stored under "<base>U" tags in the last genotype column.
            string refTagName = string.Format("{0}U", variant.ReferenceAllele[0]);
            string altTagName = string.Format("{0}U", variant.VariantAlleles[0][0]);
            var sample = variant.Genotypes.Last();
            int refCount = SumCommaSeparatedCounts(sample[refTagName]);
            int altCount = SumCommaSeparatedCounts(sample[altTagName]);

            // With no supporting reads the frequency would be 0/0 = NaN, which passes the
            // ">= 0.5" test below and would turn the final mean into NaN. Skip such records.
            int totalCount = altCount + refCount;
            if (totalCount == 0)
            {
                continue;
            }

            float VF = altCount / (float)totalCount;
            if (VF >= 0.5)
            {
                continue;
            }
            variantFrequencies.Add(VF);
        }
    }

    Console.WriteLine(">>>Loaded {0} somatic variants; saved {1} somatic SNV frequencies", recordCount, variantFrequencies.Count);

    // Don't bother providing an estimate if we have very few events:
    if (variantFrequencies.Count < 100)
    {
        return double.NaN;
    }

    double mean = variantFrequencies.Average();
    double estimatedPurity = Math.Min(1, mean * 2);
    Console.WriteLine(">>>Estimated tumor purity of {0} from somatic SNV calls", estimatedPurity);
    return estimatedPurity;
}

/// <summary>
/// Adds up the integers of a comma-separated list such as "12,3".
/// </summary>
private static int SumCommaSeparatedCounts(string commaSeparatedCounts)
{
    int total = 0;
    foreach (string count in commaSeparatedCounts.Split(','))
    {
        // VCF fields are machine-readable, so parse independently of the current culture.
        total += int.Parse(count, System.Globalization.CultureInfo.InvariantCulture);
    }
    return total;
}
/// <summary>
/// Creates a writer for <paramref name="filePath"/> whose header lines and sample list are
/// taken from an existing reader.
/// </summary>
/// <param name="filePath">Path handed to <c>Open</c>.</param>
/// <param name="reader">Reader whose <c>HeaderLines</c> and <c>Samples</c> are copied.</param>
/// <exception cref="System.ArgumentNullException"><paramref name="reader"/> is null.</exception>
public VcfWriter(string filePath, VcfReader reader)
{
    // Fail with a descriptive exception instead of a NullReferenceException on the first dereference.
    if (reader == null)
    {
        throw new System.ArgumentNullException("reader");
    }

    // ToList() gives the writer its own lists rather than sharing the reader's collections.
    HeaderLines = reader.HeaderLines.ToList();
    Samples = reader.Samples.ToList();
    Open(filePath);
}
/// <summary>
/// Reads every variant of a VCF file into memory and returns them as a list.
/// This is memory-intensive; don't do this for whole-genome vcf files!
/// </summary>
/// <param name="vcfPath">Path of the VCF file to read.</param>
/// <returns>All variants of the file, in the order the reader produced them.</returns>
public static List<VcfVariant> GetAllVariantsInFile(string vcfPath)
{
    List<VcfVariant> loaded = new List<VcfVariant>();
    using (VcfReader vcfReader = new VcfReader(vcfPath))
    {
        foreach (VcfVariant record in vcfReader.GetVariants())
        {
            loaded.Add(record);
        }

        // Returning from inside the using block still disposes the reader before the caller resumes.
        return loaded;
    }
}