Esempio n. 1
0
        /// <summary>
        /// Builds the variant forms of this protein from its sequence variations.
        /// </summary>
        /// <returns>
        /// The list produced by <c>ApplyVariants</c> for a fresh variant protein built from
        /// this protein's base sequence and the de-duplicated, filtered variations.
        /// </returns>
        public List<ProteinWithAppliedVariants> GetVariantProteins()
        {
            // Collapse variations that share original sequence, begin position and variant
            // sequence, keeping the first occurrence of each.
            var firstOfEachEffect = SequenceVariations
                .GroupBy(variation => string.Concat(
                    variation.OriginalSequence,
                    variation.OneBasedBeginPosition.ToString(),
                    variation.VariantSequence))
                .Select(sameEffect => sameEffect.First());

            // Keep only descriptions with at least 10 tab-separated fields, i.e. what is
            // likely a VCF line (more rigorous testing should probably be done eventually).
            var likelyVcfBacked = firstOfEachEffect
                .Where(variation => variation.Description.Split('\t').Length >= 10);

            // Highest begin position first, so variants near the end of the protein
            // sequence are applied first.
            List<SequenceVariation> uniqueEffects = likelyVcfBacked
                .OrderByDescending(variation => variation.OneBasedBeginPosition)
                .ToList();

            var seedProtein = new ProteinWithAppliedVariants(BaseSequence, this, null, null);
            return seedProtein.ApplyVariants(seedProtein, uniqueEffects);
        }
Esempio n. 2
0
        /// <summary>
        /// Gets proteins with applied variants from this protein
        /// </summary>
        /// <returns>
        /// The list produced by <c>ApplyVariants</c> for a fresh variant protein that starts
        /// from this protein's base sequence, proteolysis products and possible localized
        /// modifications.
        /// </returns>
        public List<ProteinWithAppliedVariants> GetVariantProteins()
        {
            // NOTE: @"\t" is a verbatim string, so the delimiter is the two characters
            // backslash + 't', not a tab character. ApplyVariants splits on the same
            // delimiter; presumably the stored descriptions carry escaped tabs (confirm
            // against whatever writes SequenceVariation.Description).
            List<SequenceVariation> uniqueEffects = SequenceVariations
                .GroupBy(v => v.SimpleString())                  // collapse variations with the same simple string
                .Select(x => x.First())                          // keep the first of each group
                .Where(v => v.Description.Split(new[] { @"\t" }, StringSplitOptions.None).Length >= 10) // likely a VCF line (should probably do more rigorous testing, eventually)
                .OrderByDescending(v => v.OneBasedBeginPosition) // apply variants at the end of the protein sequence first
                .ToList();

            ProteinWithAppliedVariants variantProtein = new ProteinWithAppliedVariants(
                BaseSequence, this, null, ProteolysisProducts, OneBasedPossibleLocalizedModifications, null);

            return variantProtein.ApplyVariants(variantProtein, uniqueEffects);
        }
        /// <summary>
        /// Applies variant changes to protein sequence
        /// </summary>
        /// <param name="protein">
        /// Protein returned as the sole result when <paramref name="uniqueEffectsToApply"/> is empty.
        /// Otherwise the sequence, applied variations, proteolysis products and modifications are
        /// read from the instance this method is called on, not from this argument.
        /// </param>
        /// <param name="uniqueEffectsToApply">
        /// Variations to apply. Entries whose description has fewer than 10 fields when split on
        /// the literal two-character sequence <c>\t</c> are skipped.
        /// </param>
        /// <returns>
        /// A list holding at most one reference protein (added the first time a considered genotype
        /// contains allele "0") plus one variant protein for every considered genotype that has a
        /// non-"0" allele. The list may be empty when no variation yields a protein.
        /// </returns>
        internal List<ProteinWithAppliedVariants> ApplyVariants(ProteinWithAppliedVariants protein, List<SequenceVariation> uniqueEffectsToApply)
        {
            // If there aren't any variants to apply, just return the base protein
            if (uniqueEffectsToApply.Count == 0)
            {
                return new List<ProteinWithAppliedVariants> { protein };
            }

            bool referenceAdded = false;
            List<ProteinWithAppliedVariants> proteins = new List<ProteinWithAppliedVariants>();

            foreach (SequenceVariation variant in uniqueEffectsToApply)
            {
                // Parse the description into VCF fields. The delimiter is a verbatim string:
                // backslash followed by 't', not a tab character.
                string[] vcfFields = variant.Description.Split(new[] { @"\t" }, StringSplitOptions.None);
                if (vcfFields.Length < 10)
                {
                    continue;
                }

                // Field 8 is the FORMAT column; fields 9 and onward are the per-sample genotype columns.
                string format = vcfFields[8];
                string[] genotypes = vcfFields.Skip(9).ToArray();

                // loop through genotypes for this variant (e.g. tumor and normal)
                for (int i = 0; i < genotypes.Length; i++)
                {
                    // When this protein is tied to an individual, only that sample column is considered.
                    if (Individual != null && Individual != i.ToString())
                    {
                        continue;
                    }
                    var genotypeFields = GenotypeDictionary(format.Trim(), genotypes[i].Trim());

                    // parse genotype; without a GT entry there is nothing to apply for this sample
                    if (!genotypeFields.TryGetValue("GT", out string gtString))
                    {
                        continue;
                    }

                    // VCF separates alleles with '/' (unphased) or '|' (phased); accept both so a
                    // phased call such as "0|1" is still recognized as containing the reference allele.
                    string[] gt = gtString.Split('/', '|');

                    // reference allele
                    if (gt.Contains("0") && !referenceAdded)
                    {
                        proteins.Add(new ProteinWithAppliedVariants(BaseSequence, Protein, AppliedSequenceVariations, ProteolysisProducts, OneBasedPossibleLocalizedModifications, i.ToString()));
                        referenceAdded = true; // only add the reference allele once
                    }

                    // alternate allele
                    // TODO: recursively apply variants to create haplotypes and be wary of combinatorial explosion
                    if (gt.All(x => x == "0"))
                    {
                        continue;
                    }

                    // check to see if there is incomplete indel overlap, which would lead to weird variant sequences
                    // complete overlap is okay, since it will be overwritten; this can happen if there are two alternate alleles,
                    //    e.g. reference sequence is wrong at that point
                    bool intersectsAppliedRegionIncompletely = AppliedSequenceVariations.Any(x => variant.Intersects(x) && !variant.Includes(x));
                    string seqBefore = BaseSequence.Substring(0, variant.OneBasedBeginPosition - 1);
                    string seqVariant = variant.VariantSequence;
                    List<ProteolysisProduct> adjustedProteolysisProducts = AdjustProteolysisProductIndices(variant, ProteolysisProducts);
                    Dictionary<int, List<Modification>> adjustedModifications = AdjustModificationIndices(variant, OneBasedPossibleLocalizedModifications);

                    // zero-based index of the first residue after the replaced original sequence
                    int afterIdx = variant.OneBasedBeginPosition + variant.OriginalSequence.Length - 1;

                    if (intersectsAppliedRegionIncompletely)
                    {
                        // use original protein sequence for the remaining sequence
                        string seqAfter = Protein.BaseSequence.Length - afterIdx <= 0 ? "" : Protein.BaseSequence.Substring(afterIdx);
                        proteins.Add(new ProteinWithAppliedVariants(seqBefore + seqVariant + seqAfter, Protein, new[] { variant }, adjustedProteolysisProducts, adjustedModifications, i.ToString()));
                    }
                    else
                    {
                        // drop already-applied variations that this variant fully covers, then record this one
                        List<SequenceVariation> variations = AppliedSequenceVariations
                            .Where(x => !variant.Includes(x))
                            .Concat(new[] { variant })
                            .ToList();

                        // use this variant protein sequence for the remaining sequence
                        string seqAfter = BaseSequence.Length - afterIdx <= 0 ? "" : BaseSequence.Substring(afterIdx);
                        proteins.Add(new ProteinWithAppliedVariants(seqBefore + seqVariant + seqAfter, Protein, variations, adjustedProteolysisProducts, adjustedModifications, i.ToString()));
                    }
                }
            }
            return proteins;
        }