Ejemplo n.º 1
0
        /// <summary>
        ///     Reads all variants from the reader, converts each one back to a candidate allele,
        ///     and groups the candidates that pass the filters by chromosome.
        /// </summary>
        /// <param name="reader">The vcf reader to pull variants from.</param>
        /// <param name="variantsOnly">When true, candidates of type Reference are dropped.</param>
        /// <param name="flagIsKnown">When true, IsKnown is set on every candidate that is kept.</param>
        /// <param name="typeFilter">Optional list of allele categories to keep; null keeps every supported category.</param>
        /// <param name="doSkipCandidate">Optional predicate; a candidate is dropped when it returns true.</param>
        /// <returns>A dictionary keyed by chromosome, holding the kept candidates in the order they were read.</returns>
        public static Dictionary<string, List<CandidateAllele>> GetVariantsByChromosome(
            this VcfReader reader,
            bool variantsOnly = false,
            bool flagIsKnown = false,
            List<AlleleCategory> typeFilter = null,
            Func<CandidateAllele, bool> doSkipCandidate = null)
        {
            var candidatesByChromosome = new Dictionary<string, List<CandidateAllele>>();

            foreach (var calledAllele in VcfVariantUtilities.Convert(reader.GetVariants()))
            {
                var candidateAllele = BackToCandiate(calledAllele);

                // Unsupported allele types are never reported.
                if (candidateAllele.Type == AlleleCategory.Unsupported)
                {
                    continue;
                }

                // Filters are evaluated in order and short-circuit, so the skip predicate
                // is only invoked for candidates that survived the two earlier checks.
                if ((variantsOnly && candidateAllele.Type == AlleleCategory.Reference) ||
                    (typeFilter != null && !typeFilter.Contains(candidateAllele.Type)) ||
                    (doSkipCandidate != null && doSkipCandidate(candidateAllele)))
                {
                    continue;
                }

                if (flagIsKnown)
                {
                    candidateAllele.IsKnown = true;
                }

                List<CandidateAllele> chromosomeCandidates;
                if (!candidatesByChromosome.TryGetValue(candidateAllele.Chromosome, out chromosomeCandidates))
                {
                    chromosomeCandidates = new List<CandidateAllele>();
                    candidatesByChromosome[candidateAllele.Chromosome] = chromosomeCandidates;
                }

                chromosomeCandidates.Add(candidateAllele);
            }

            return candidatesByChromosome;
        }
Ejemplo n.º 2
0
        /// <summary>
        ///     Populates a called allele object given an array of vcf columns.
        ///     All numeric fields are parsed with the invariant culture, because vcf is a
        ///     machine-readable format that always uses "." as the decimal separator.
        /// </summary>
        /// <param name="shouldTrimComplexAlleles">
        ///     When true, an allele whose type comes out as Unsupported is passed to
        ///     VcfVariantUtilities.TrimUnsupportedAlleleType as a rescue attempt.
        /// </param>
        /// <param name="cols">The columns of one vcf line. If null or empty the method returns without touching the allele.</param>
        /// <param name="allele">The allele object to fill in. It is mutated in place.</param>
        /// <param name="alleleIndex">Zero-based index into the comma-separated ALT column selecting which alternate allele to read.</param>
        protected static void ConvertColumnsToVariant(bool shouldTrimComplexAlleles, string[] cols, CalledAllele allele, int alleleIndex)
        {
            bool shouldOutputRcCounts = true;
            bool shouldOutputTsCounts = true;

            if ((cols == null) || (cols.Length == 0))
            {
                // Nothing to parse. The allele is passed by value, so assigning null to it here
                // would never be visible to the caller; we simply leave it untouched.
                return;
            }

            // vcf numbers are culture independent; never parse them with the thread's current culture.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;

            //set defaults.
            var genotypeQscore   = 0;
            var referenceSupport = 0;
            var altSupport       = 0;
            var genotypeString   = "";
            var totalCoverage    = 0;

            var variantQuality              = 0.0;
            var numAlts                     = 1;
            var noiseLevel                  = 0;
            var fractionNocalls             = 0f;
            var strandBiasInGATKScaleCoords = -100f;
            var tsCounts                    = new List <string>();

            //read in simple data
            allele.Chromosome        = cols[VcfCommon.ChromIndex];
            allele.ReferencePosition = int.Parse(cols[VcfCommon.PosIndex], invariant);
            allele.ReferenceAllele   = cols[VcfCommon.RefIndex];
            allele.Filters           = VcfVariantUtilities.MapFilterString(cols[VcfCommon.FilterIndex]);

            // vcf 4.1 allows the missing value "." for QUAL (Strelka and CFTR output use it),
            // so a failed parse is not an error: the Q score is simply left as it was.
            bool gotQual = double.TryParse(
                cols[VcfCommon.QualIndex],
                System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                invariant,
                out variantQuality);

            if (gotQual)
            {
                allele.VariantQscore = (int)variantQuality;
            }

            // parse the variant alleles
            var variantAlleles = cols[VcfCommon.AltIndex].Split(',');

            allele.AlternateAllele = variantAlleles[alleleIndex];
            var isRef = (allele.AlternateAllele == ".");

            // a reference call has no alternate alleles at all
            numAlts = isRef ? 0 : variantAlleles.Length;

            // parse the info field data (presume, single  sample)
            Dictionary <string, string> infoFields = ParseInfoFields(cols);

            // parse the genotype data (presume, single  sample)
            List <Dictionary <string, string> > genotypes = ParseGenotypeData(cols);

            //get more complex allele data...
            string fieldValue;

            if (infoFields.TryGetValue("DP", out fieldValue))
            {
                totalCoverage = int.Parse(fieldValue, invariant);
            }

            if ((genotypes.Count > 0) && (genotypes[0] != null))
            {
                var sample = genotypes[0];

                // GQ takes precedence; GQX is only consulted when GQ is absent.
                if (sample.TryGetValue("GQ", out fieldValue))
                {
                    genotypeQscore = int.Parse(fieldValue, invariant);
                }
                else if (sample.TryGetValue("GQX", out fieldValue))
                {
                    genotypeQscore = int.Parse(fieldValue, invariant);
                }

                if (sample.TryGetValue("GT", out fieldValue))
                {
                    genotypeString = fieldValue;
                }

                if (sample.TryGetValue("NL", out fieldValue))
                {
                    noiseLevel = int.Parse(fieldValue, invariant);
                }

                if (sample.TryGetValue("NC", out fieldValue))
                {
                    fractionNocalls = float.Parse(fieldValue, invariant);
                }

                if (sample.TryGetValue("SB", out fieldValue))
                {
                    strandBiasInGATKScaleCoords = float.Parse(fieldValue, invariant);
                }

                var adStrings = new string[] { "0", "0" };
                var hasAlleleDepths = sample.TryGetValue("AD", out fieldValue);

                if (hasAlleleDepths)
                {
                    adStrings = fieldValue.Split(',');
                }

                referenceSupport = int.Parse(adStrings[0], invariant);

                //by default alt support is 0.
                // NOTE(review): when AD lists the ref plus every alt, this always reads AD[1]
                // regardless of alleleIndex - confirm that is intended for multi-allelic lines.
                if ((!isRef) && (adStrings.Length > 1))
                {
                    altSupport = int.Parse(adStrings[1], invariant);
                }

                if (shouldOutputRcCounts)
                {
                    if (sample.TryGetValue("US", out fieldValue))
                    {
                        tsCounts = fieldValue.Split(',').ToList();
                    }
                }

                allele.Genotype = VcfVariantUtilities.MapGTString(genotypeString, numAlts);

                //note this awkward vcf line (pisces)
                //"chr4\t10\t.\tAA\tGA,G\t0\tPASS\tDP=5394\tGT:GQ:AD:DP:VF:NL:SB:NC\t1/2:0:2387,2000:5394:0.8133:23:0.0000:0.0000";
                //and this one
                //chr2    19946216.ATGTGTG ATG,ATGTG,A 0   PASS metal = platinum; cgi =.; bwa_freebayes = HD:0,LOOHD: 0; bwa_platypus =.; bwa_gatk3 = HD:2,LOOHD: 2; cortex =.; isaac2 = HD:1,LOOHD: 1; dist2closest = 192 GT  1 | 2

                if ((numAlts >= 2) && hasAlleleDepths)
                {
                    if (adStrings.Length <= numAlts) //in this case we never expressedly gave the ref support, so we have to derive it.
                    {
                        int totalAltCount = 0;

                        // AD may hold fewer entries than there are alts; only read what is actually there
                        // instead of running off the end of the array.
                        int countedAlts = Math.Min(numAlts, adStrings.Length);

                        for (int altIndex = 0; altIndex < countedAlts; altIndex++)
                        {
                            var altSupportAtIndex = int.Parse(adStrings[altIndex], invariant);
                            totalAltCount += altSupportAtIndex;

                            if (altIndex == alleleIndex)
                            {
                                altSupport = altSupportAtIndex;
                            }
                        }

                        // whatever coverage is not explained by the alts is attributed to the reference
                        referenceSupport = Math.Max(0, totalCoverage - totalAltCount);
                    }
                }
            }

            var strandBiasResults = new BiasResults();

            strandBiasResults.GATKBiasScore = strandBiasInGATKScaleCoords;

            //set the remaining data
            allele.TotalCoverage     = totalCoverage;
            allele.AlleleSupport     = isRef ? referenceSupport : altSupport;
            allele.ReferenceSupport  = referenceSupport;
            allele.GenotypeQscore    = genotypeQscore;
            allele.NoiseLevelApplied = noiseLevel;
            allele.StrandBiasResults = strandBiasResults;
            allele.IsForcedToReport  = allele.Filters.Contains(FilterType.ForcedReport);

            //set the derived values
            allele.SetType();
            allele.ForceFractionNoCalls(fractionNocalls);

            //rescue attempt for complex types, ie ACGT -> ACGTGG.
            //Get the simplest form of the allele
            if ((allele.Type == AlleleCategory.Unsupported) && shouldTrimComplexAlleles)
            {
                VcfVariantUtilities.TrimUnsupportedAlleleType(allele);
            }

            if (tsCounts.Count != 0)
            {
                VcfVariantUtilities.FillInCollapsedReadsCount(shouldOutputRcCounts, shouldOutputTsCounts, allele, tsCounts);
            }
        }