Ejemplo n.º 1
0
        /// <summary>
        /// Step 1: Load the normal het SNVs of interest.
        /// Reads the VCF at <paramref name="vcfPath"/> and replaces <c>this.Variants</c> with every record on
        /// <c>this.Chromosome</c> that is a single-allele SNV.  When the record carries genotype columns it must
        /// also be PASS-filtered, have a heterozygous genotype (GT of 0/1 or 1/0) in the first genotype column,
        /// and, if a GQX value is reported there, have GQX of at least 30.  Records without genotype columns
        /// (e.g. from a dbSNP VCF file) are kept without those checks.
        /// </summary>
        /// <param name="vcfPath">Path of the VCF file to read.</param>
        protected void LoadVariants(string vcfPath)
        {
            Console.WriteLine("{0} Loading variants of interest from {1}", DateTime.Now, vcfPath);
            this.Variants = new List<VcfVariant>();
            int countThisChromosome = 0;
            using (VcfReader reader = new VcfReader(vcfPath, requireGenotypes: false))
            {
                VcfVariant variant;
                while (reader.GetNextVariant(out variant))
                {
                    if (variant.ReferenceName != this.Chromosome)
                    {
                        // Shortcut: If we've seen records for the desired chromosome, then as soon as we hit another chromosome,
                        // we can abort:
                        if (countThisChromosome > 0) break;
                        continue;
                    }
                    countThisChromosome++;

                    // Single-allele SNVs only:
                    if (variant.VariantAlleles.Length != 1 || variant.VariantAlleles[0].Length != 1 || variant.ReferenceAllele.Length != 1) continue;

                    // Genotype columns are not available if we use a dbSNP VCF file; in that case the
                    // filter, genotype and quality checks below are all skipped.
                    bool hasGenotypes = variant.GenotypeColumns != null && variant.GenotypeColumns.Any();
                    if (hasGenotypes)
                    {
                        // PF variants only (FILTER may not say PASS for a dbSNP VCF file, hence the guard above):
                        if (variant.Filters != "PASS") continue;

                        var sampleColumn = variant.GenotypeColumns[0];
                        if (!sampleColumn.ContainsKey("GT")) continue; // no genotype - we don't know if it's a het SNV.
                        string genotype = sampleColumn["GT"];
                        if (genotype != "0/1" && genotype != "1/0") continue;

                        // Also require they have a high enough quality score:
                        if (sampleColumn.ContainsKey("GQX")) // Note: Allow no GQX field, in case we want to use another caller (e.g. Pisces) and not crash
                        {
                            // VCF numbers always use '.' as the decimal separator, so parse with the invariant
                            // culture rather than whatever locale the process happens to run under.
                            float GQX = float.Parse(sampleColumn["GQX"], System.Globalization.CultureInfo.InvariantCulture);
                            if (GQX < 30) continue;
                        }
                    }

                    // Note: Let's NOT require the variant be in dbSNP.  Maybe we didn't do annotation, either because
                    // we chose not to or because we're on a reference without annotation available.
                    // Remember all the variants that pass all our tests:
                    this.Variants.Add(variant);
                }
            }
            Console.WriteLine("Retained {0} variants, out of {1} records for {2}", this.Variants.Count, countThisChromosome, this.Chromosome);
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Lazily yields one CNVCall for each record read from the VCF at <paramref name="vcfPath"/>.
 /// Because this is an iterator, the file is only opened once enumeration starts, and the reader is
 /// disposed when enumeration finishes or the enumerator is disposed.
 /// </summary>
 /// <param name="vcfPath">Path of the VCF file to read.</param>
 /// <returns>A CNVCall per VCF record, in file order.</returns>
 protected IEnumerable<CNVCall> GetCnvCallsFromVcf(string vcfPath)
 {
     using (VcfReader vcfReader = new VcfReader(vcfPath, false))
     {
         foreach (VcfVariant record in vcfReader.GetVariants())
         {
             // GetCopyNumber returns the copy number and reports the end position through its out argument.
             int segmentEnd;
             int copyNumber = GetCopyNumber(record, out segmentEnd);

             CNVCall call = new CNVCall(record.ReferenceName, record.ReferencePosition, segmentEnd, copyNumber);
             yield return call;
         }
     }
 }
Ejemplo n.º 3
0
        /// <summary>
        /// Lazily yields a CNVCall for each record read from the VCF at <paramref name="vcfPath"/>,
        /// optionally restricted to records whose FILTER value is exactly "PASS".
        /// </summary>
        /// <param name="vcfPath">Path of the VCF file to read.</param>
        /// <param name="includePassingOnly">When true, records whose filter is not "PASS" are skipped.</param>
        /// <returns>A CNVCall per retained VCF record, in file order.</returns>
        protected IEnumerable<CNVCall> GetCnvCallsFromVcf(string vcfPath, bool includePassingOnly)
        {
            using (VcfReader vcfReader = new VcfReader(vcfPath, false))
            {
                foreach (VcfVariant record in vcfReader.GetVariants())
                {
                    // The copy number is extracted for every record, including ones that are
                    // subsequently dropped by the filter check.
                    int segmentEnd;
                    int copyNumber = GetCopyNumber(record, out segmentEnd);

                    bool keepRecord = !includePassingOnly || record.Filters == "PASS";
                    if (keepRecord)
                    {
                        yield return new CNVCall(record.ReferenceName, record.ReferencePosition, segmentEnd, copyNumber);
                    }
                }
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Load the somatic SNV output (from Strelka), and use it to derive an estimate of overall purity.
        /// Reference: https://ukch-confluence.illumina.com/display/collab/Estimate+purity+of+tumour+samples+using+somatic+SNVs
        /// Reads <c>this.SomaticVCFPath</c>, collects the variant allele frequency of every PASS single-allele SNV
        /// whose frequency is below 0.5 (using the last genotype column), and reports twice the mean frequency,
        /// capped at 1.
        /// </summary>
        /// <returns>
        /// The estimated purity in the range [0, 1], or <see cref="double.NaN"/> when fewer than 100
        /// usable SNV frequencies were found.
        /// </returns>
        protected double EstimatePurityFromSomaticSNVs()
        {
            // NOTE(review): this result is never used below; the call is kept in case it is relied on
            // for validation or side effects - confirm and remove if not.
            Dictionary<string, List<CanvasSegment>> segmentsByChromosome = CanvasSegment.GetSegmentsByChromosome(Segments);
            int recordCount = 0;
            List<float> variantFrequencies = new List<float>();
            using (VcfReader reader = new VcfReader(this.SomaticVCFPath))
            {
                foreach (VcfVariant variant in reader.GetVariants())
                {
                    recordCount++;
                    // Skip non-PF variants:
                    if (variant.Filters != "PASS") continue;
                    // Skip everything but SNVs (an empty REF is skipped too, since it is indexed below):
                    if (variant.ReferenceAllele.Length != 1 || variant.VariantAlleles.Length != 1 || variant.VariantAlleles[0].Length != 1
                        || variant.VariantAlleles[0] == ".")
                    {
                        continue;
                    }

                    // Per-base count fields are named after the base, e.g. "AU" for base A.
                    string refTagName = string.Format("{0}U", variant.ReferenceAllele[0]);
                    string altTagName = string.Format("{0}U", variant.VariantAlleles[0][0]);

                    // The last genotype column is used for the counts
                    // (presumably the tumor sample in Strelka output - TODO confirm).
                    var sampleColumn = variant.Genotypes.Last();
                    int refCount = SumCommaSeparatedCounts(sampleColumn[refTagName]);
                    int altCount = SumCommaSeparatedCounts(sampleColumn[altTagName]);

                    // With no supporting reads at all the frequency would be 0/0 = NaN, which slips past the
                    // ">= 0.5" test and would turn the final mean (and the purity estimate) into NaN.
                    int totalCount = altCount + refCount;
                    if (totalCount == 0) continue;

                    float VF = altCount / (float)totalCount;
                    if (VF >= 0.5) continue;
                    variantFrequencies.Add(VF);
                }
            }
            Console.WriteLine(">>>Loaded {0} somatic variants; saved {1} somatic SNV frequencies", recordCount, variantFrequencies.Count);
            // Don't bother providing an estimate if we have very few events:
            if (variantFrequencies.Count < 100)
            {
                return double.NaN;
            }
            double mean = variantFrequencies.Average();
            double estimatedPurity = Math.Min(1, mean * 2);
            Console.WriteLine(">>>Estimated tumor purity of {0} from somatic SNV calls", estimatedPurity);

            return estimatedPurity;
        }

        /// <summary>
        /// Sum a comma-separated list of integer counts, e.g. "12,15" gives 27.
        /// </summary>
        private static int SumCommaSeparatedCounts(string commaSeparatedCounts)
        {
            int total = 0;
            foreach (string bit in commaSeparatedCounts.Split(','))
            {
                // VCF content is machine-readable, so parse independently of the current locale.
                total += int.Parse(bit, System.Globalization.CultureInfo.InvariantCulture);
            }
            return total;
        }
Ejemplo n.º 5
0
 /// <summary>
 /// Create a writer whose header lines and sample list are copied from an existing reader,
 /// then open <paramref name="filePath"/> via <c>Open</c>.
 /// </summary>
 /// <param name="filePath">Path passed to <c>Open</c>.</param>
 /// <param name="reader">Reader whose <c>HeaderLines</c> and <c>Samples</c> are copied.</param>
 public VcfWriter(string filePath, VcfReader reader)
 {
     // ToList() gives the writer its own lists rather than sharing the reader's collections.
     this.HeaderLines = reader.HeaderLines.ToList();
     this.Samples = reader.Samples.ToList();

     // The header state is set up before the output is opened.
     this.Open(filePath);
 }
Ejemplo n.º 6
0
		/// <summary>
		/// Read every variant from a VCF file into a single in-memory list.
		/// This is memory-intensive; don't do this for whole-genome vcf files!
		/// </summary>
		/// <param name="vcfPath">Path of the VCF file to read.</param>
		/// <returns>A new list holding all variants yielded by the reader, in the order they were read.</returns>
		public static List<VcfVariant> GetAllVariantsInFile(string vcfPath)
		{
			List<VcfVariant> loaded = new List<VcfVariant>();
			using (VcfReader vcfReader = new VcfReader(vcfPath))
			{
				// Drain the reader completely while it is still open.
				loaded.AddRange(vcfReader.GetVariants());
			}
			return loaded;
		}