/// <summary>
        /// Creates a variant protein that uses <paramref name="variantBaseSequence"/> as its sequence and
        /// copies the accession, names, flags and annotation collections of <paramref name="protein"/>.
        /// </summary>
        /// <param name="variantBaseSequence">Sequence handed to the base constructor.</param>
        /// <param name="protein">Source of the copied annotations; also stored in <c>Protein</c>.</param>
        /// <param name="appliedSequenceVariations">Variations recorded as applied; null is stored as an empty list.</param>
        /// <param name="individual">Value stored in <c>Individual</c>.</param>
        public ProteinWithAppliedVariants(
            string variantBaseSequence,
            Protein protein,
            IEnumerable<SequenceVariation> appliedSequenceVariations,
            string individual)
            : base(
                variantBaseSequence,
                protein.Accession,
                organism: protein.Organism,
                geneNames: new List<Tuple<string, string>>(protein.GeneNames),
                oneBasedModifications: protein.OneBasedPossibleLocalizedModifications.ToDictionary(entry => entry.Key, entry => entry.Value),
                proteolysisProducts: new List<ProteolysisProduct>(protein.ProteolysisProducts),
                name: protein.Name,
                fullName: protein.FullName,
                isDecoy: protein.IsDecoy,
                isContaminant: protein.IsContaminant,
                databaseReferences: new List<DatabaseReference>(protein.DatabaseReferences),
                sequenceVariations: new List<SequenceVariation>(protein.SequenceVariations),
                disulfideBonds: new List<DisulfideBond>(protein.DisulfideBonds),
                databaseFilePath: protein.DatabaseFilePath)
        {
            Protein = protein;

            // A missing collection is normalised to an empty list so the property is never null.
            AppliedSequenceVariations = appliedSequenceVariations?.ToList() ?? new List<SequenceVariation>();

            Individual = individual;
        }
        /// <summary>
        /// Creates a variant protein with its own proteolysis products and modifications, deriving the
        /// accession, name and full name from <paramref name="protein"/> plus the applied variations.
        /// </summary>
        /// <param name="variantBaseSequence">Sequence handed to the base constructor.</param>
        /// <param name="protein">Source of the copied annotations; also stored in <c>Protein</c>.</param>
        /// <param name="appliedSequenceVariations">
        /// Variations recorded as applied. When non-null, their combined simple strings are appended to the
        /// accession after "_" and their combined descriptions are appended to the names after " variant:".
        /// Null is stored as an empty list and leaves accession and names unchanged.
        /// Note that the sequence is enumerated more than once.
        /// </param>
        /// <param name="applicableProteolysisProducts">Proteolysis products for the variant protein; null is treated as empty.</param>
        /// <param name="oneBasedModifications">Modifications keyed by one-based position; null is treated as empty.</param>
        /// <param name="individual">Value stored in <c>Individual</c>.</param>
        public ProteinWithAppliedVariants(string variantBaseSequence, Protein protein, IEnumerable<SequenceVariation> appliedSequenceVariations,
                                          IEnumerable<ProteolysisProduct> applicableProteolysisProducts, IDictionary<int, List<Modification>> oneBasedModifications, string individual)
            : base(variantBaseSequence,
                   protein.Accession + (appliedSequenceVariations == null ? "" : "_" + CombineSimpleStrings(appliedSequenceVariations)),
                   organism: protein.Organism,
                   geneNames: new List<Tuple<string, string>>(protein.GeneNames),
                   // Null-tolerant, matching the null handling already applied to appliedSequenceVariations.
                   oneBasedModifications: oneBasedModifications?.ToDictionary(x => x.Key, x => x.Value) ?? new Dictionary<int, List<Modification>>(),
                   proteolysisProducts: new List<ProteolysisProduct>(applicableProteolysisProducts ?? Enumerable.Empty<ProteolysisProduct>()),
                   name: protein.Name + (appliedSequenceVariations == null ? "" : " variant:" + CombineDescriptions(appliedSequenceVariations)),
                   fullName: protein.FullName + (appliedSequenceVariations == null ? "" : " variant:" + CombineDescriptions(appliedSequenceVariations)),
                   isDecoy: protein.IsDecoy,
                   isContaminant: protein.IsContaminant,
                   databaseReferences: new List<DatabaseReference>(protein.DatabaseReferences),
                   sequenceVariations: new List<SequenceVariation>(protein.SequenceVariations),
                   disulfideBonds: new List<DisulfideBond>(protein.DisulfideBonds),
                   databaseFilePath: protein.DatabaseFilePath)
        {
            Protein = protein;
            AppliedSequenceVariations = appliedSequenceVariations?.ToList() ?? new List<SequenceVariation>();

            Individual = individual;
        }
Esempio n. 3
0
 /// <summary>
 /// Protein construction that clones a protein but assigns a different base sequence.
 /// For use in SILAC experiments.
 /// Every property other than the sequence is copied by reference from
 /// <paramref name="originalProtein"/>, so collection properties are shared, not duplicated.
 /// </summary>
 /// <param name="originalProtein">Protein whose properties are copied.</param>
 /// <param name="silacSequence">Sequence assigned to <c>BaseSequence</c> of the new protein.</param>
 public Protein(Protein originalProtein, string silacSequence)
 {
     BaseSequence      = silacSequence;
     Accession         = originalProtein.Accession;
     NonVariantProtein = originalProtein.NonVariantProtein;
     Name                                   = originalProtein.Name;
     Organism                               = originalProtein.Organism;
     FullName                               = originalProtein.FullName;
     IsDecoy                                = originalProtein.IsDecoy;
     IsContaminant                          = originalProtein.IsContaminant;
     DatabaseFilePath                       = originalProtein.DatabaseFilePath;
     SampleNameForVariants                  = originalProtein.SampleNameForVariants;
     GeneNames                              = originalProtein.GeneNames;
     ProteolysisProducts                    = originalProtein.ProteolysisProducts;
     SequenceVariations                     = originalProtein.SequenceVariations;
     AppliedSequenceVariations              = originalProtein.AppliedSequenceVariations;
     OriginalNonVariantModifications        = originalProtein.OriginalNonVariantModifications;
     OneBasedPossibleLocalizedModifications = originalProtein.OneBasedPossibleLocalizedModifications;
     DatabaseReferences                     = originalProtein.DatabaseReferences;
     DisulfideBonds                         = originalProtein.DisulfideBonds;
     SpliceSites                            = originalProtein.SpliceSites;
 }
Esempio n. 4
0
 /// <summary>
 /// Builds a protein that carries applied sequence variations, copying the remaining annotations
 /// from an existing protein.
 /// </summary>
 /// <param name="variantBaseSequence">Sequence of the new protein.</param>
 /// <param name="protein">Protein supplying organism, gene names, flags, references, variations, bonds, splice sites and database path.</param>
 /// <param name="appliedSequenceVariations">Variations recorded as applied; null is stored as an empty list.</param>
 /// <param name="applicableProteolysisProducts">Proteolysis products for the new protein; null yields an empty list.</param>
 /// <param name="oneBasedModifications">Modifications keyed by one-based position; null yields an empty dictionary.</param>
 /// <param name="sampleNameForVariants">Value stored in <c>SampleNameForVariants</c>.</param>
 public Protein(
     string variantBaseSequence,
     Protein protein,
     IEnumerable<SequenceVariation> appliedSequenceVariations,
     IEnumerable<ProteolysisProduct> applicableProteolysisProducts,
     IDictionary<int, List<Modification>> oneBasedModifications,
     string sampleNameForVariants)
     : this(
         variantBaseSequence,
         VariantApplication.GetAccession(protein, appliedSequenceVariations),
         organism: protein.Organism,
         geneNames: new List<Tuple<string, string>>(protein.GeneNames),
         oneBasedModifications: oneBasedModifications?.ToDictionary(entry => entry.Key, entry => entry.Value)
             ?? new Dictionary<int, List<Modification>>(),
         proteolysisProducts: applicableProteolysisProducts == null
             ? new List<ProteolysisProduct>()
             : new List<ProteolysisProduct>(applicableProteolysisProducts),
         name: GetName(appliedSequenceVariations, protein.Name),
         fullName: GetName(appliedSequenceVariations, protein.FullName),
         isDecoy: protein.IsDecoy,
         isContaminant: protein.IsContaminant,
         databaseReferences: new List<DatabaseReference>(protein.DatabaseReferences),
         sequenceVariations: new List<SequenceVariation>(protein.SequenceVariations),
         disulfideBonds: new List<DisulfideBond>(protein.DisulfideBonds),
         spliceSites: new List<SpliceSite>(protein.SpliceSites),
         databaseFilePath: protein.DatabaseFilePath)
 {
     // The non-variant protein is taken from the source protein, and its original
     // modifications are read back from the property that was just assigned.
     NonVariantProtein = protein.NonVariantProtein;
     OriginalNonVariantModifications = NonVariantProtein.OriginalNonVariantModifications;

     AppliedSequenceVariations = appliedSequenceVariations?.ToList() ?? new List<SequenceVariation>();
     SampleNameForVariants = sampleNameForVariants;
 }
        /// <summary>
        /// Applies multiple variant changes to a protein sequence, producing one or more proteins
        /// per individual found in the variants' genotype tables.
        /// </summary>
        /// <param name="protein">Protein whose sequence and annotations are the starting point.</param>
        /// <param name="sequenceVariations">
        /// Candidate variations. They are de-duplicated by <c>SimpleString()</c>, restricted to those whose
        /// description has at least one genotype, and applied from the highest begin position downward.
        /// </param>
        /// <param name="maxAllowedVariantsForCombinitorics">
        /// When an individual has more heterozygous variants than this, the "heterozygous basic" path is used
        /// instead of enumerating combinations. A value of 0 or less suppresses applying heterozygous variants.
        /// </param>
        /// <param name="minAlleleDepth">Minimum allele depth for a reference or alternate allele to count as "deep".</param>
        /// <returns>
        /// The resulting proteins across all individuals, keeping only the first protein for each distinct base sequence.
        /// If no variation qualifies, a single copy of <paramref name="protein"/> is returned.
        /// </returns>
        internal static List <Protein> ApplyVariants(Protein protein, IEnumerable <SequenceVariation> sequenceVariations, int maxAllowedVariantsForCombinitorics, int minAlleleDepth)
        {
            List <SequenceVariation> uniqueEffectsToApply = sequenceVariations
                                                            .GroupBy(v => v.SimpleString())
                                                            .Select(x => x.First())
                                                            .Where(v => v.Description.Genotypes.Count > 0)   // this is a VCF line
                                                            .OrderByDescending(v => v.OneBasedBeginPosition) // apply variants at the end of the protein sequence first
                                                            .ToList();

            // Starting point shared by every individual: same sequence, no applied variations, no sample name.
            Protein proteinCopy = new Protein(protein.BaseSequence, protein, null, protein.ProteolysisProducts, protein.OneBasedPossibleLocalizedModifications, null);

            // If there aren't any variants to apply, just return the base protein
            if (uniqueEffectsToApply.Count == 0)
            {
                return(new List <Protein> {
                    proteinCopy
                });
            }

            // Union of all sample names that appear as genotype keys on any variant.
            HashSet <string> individuals     = new HashSet <string>(uniqueEffectsToApply.SelectMany(v => v.Description.Genotypes.Keys));
            List <Protein>   variantProteins = new List <Protein>();

            // loop through genotypes for each sample/individual (e.g. tumor and normal)
            foreach (string individual in individuals)
            {
                // NOTE(review): the dictionary indexers below assume every variant has an entry for this individual — confirm against the VCF parser.
                bool           tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.Description.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics;
                List <Protein> newVariantProteins          = new List <Protein> {
                    proteinCopy
                };
                foreach (var variant in uniqueEffectsToApply)
                {
                    bool variantAlleleIsInTheGenotype = variant.Description.Genotypes[individual].Contains(variant.Description.AlleleIndex.ToString()); // should catch the case where it's -1 if the INFO isn't from SnpEff
                    if (!variantAlleleIsInTheGenotype)
                    {
                        continue;
                    }
                    bool isHomozygousAlternate = variant.Description.Homozygous[individual] && variant.Description.Genotypes[individual].All(d => d == variant.Description.AlleleIndex.ToString()); // note this isn't a great test for homozygosity, since the genotype could be 1/2 and this would still return true. But currently, alleles 1 and 2 will be included as separate variants, so this is fine for now.
                    // Depths are parsed from strings; an unparsable depth counts as "not deep".
                    bool isDeepReferenceAllele = int.TryParse(variant.Description.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth;
                    bool isDeepAlternateAllele = int.TryParse(variant.Description.AlleleDepths[individual][variant.Description.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth;

                    // homozygous alternate: apply the variant to every protein built so far
                    if (isHomozygousAlternate && isDeepAlternateAllele)
                    {
                        newVariantProteins = newVariantProteins.Select(p => ApplySingleVariant(variant, p, individual)).ToList();
                    }

                    // heterozygous basic
                    // first protein with variants contains all homozygous variation, second contains all variations
                    else if (variant.Description.Heterozygous[individual] && tooManyHeterozygousVariants)
                    {
                        if (isDeepAlternateAllele && isDeepReferenceAllele)
                        {
                            if (newVariantProteins.Count == 1 && maxAllowedVariantsForCombinitorics > 0)
                            {
                                // First heterozygous variant seen: create the second protein that accumulates all variations.
                                Protein variantProtein = ApplySingleVariant(variant, newVariantProteins[0], individual);
                                newVariantProteins.Add(variantProtein);
                            }
                            else if (maxAllowedVariantsForCombinitorics > 0)
                            {
                                // Subsequent heterozygous variants are stacked onto the second protein only.
                                newVariantProteins[1] = ApplySingleVariant(variant, newVariantProteins[1], individual);
                            }
                            else
                            {
                                // no heterozygous variants
                            }
                        }
                        else if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0)
                        {
                            // Only the alternate allele is deep: apply it to every protein built so far.
                            newVariantProteins = newVariantProteins.Select(p => ApplySingleVariant(variant, p, individual)).ToList();
                        }
                        else
                        {
                            // keep reference only
                        }
                    }

                    // heterozygous combinatorics: branch each existing protein into reference and/or alternate forms
                    else if (variant.Description.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants)
                    {
                        List <Protein> combinitoricProteins = new List <Protein>();

                        foreach (Protein ppp in newVariantProteins)
                        {
                            if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0 && isDeepReferenceAllele)
                            {
                                // keep reference allele
                                if (variant.Description.Genotypes[individual].Contains("0"))
                                {
                                    combinitoricProteins.Add(ppp);
                                }

                                // alternate allele (replace all, since in heterozygous with two alternates, both alternates are included)
                                combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual));
                            }
                            else if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0)
                            {
                                combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual));
                            }
                            else if (variant.Description.Genotypes[individual].Contains("0"))
                            {
                                combinitoricProteins.Add(ppp);
                            }
                            else
                            {
                                // must be two alternate alleles with not enough depth
                            }
                        }
                        newVariantProteins = combinitoricProteins;
                    }
                }
                variantProteins.AddRange(newVariantProteins);
            }

            // Different individuals can yield the same sequence; keep the first protein for each distinct sequence.
            return(variantProteins.GroupBy(x => x.BaseSequence).Select(x => x.First()).ToList());
        }
 /// <summary>
 /// Maps a position on a variant protein back by subtracting the length changes of applied
 /// variations that end before that position.
 /// </summary>
 /// <param name="protein">Protein whose <c>AppliedSequenceVariations</c> are consulted.</param>
 /// <param name="variantProteinIndex">Position on the variant protein.</param>
 /// <returns>
 /// <paramref name="variantProteinIndex"/> minus the summed (variant length - original length) of every
 /// applied variation whose end position is strictly less than the index.
 /// </returns>
 public static int RestoreModificationIndex(Protein protein, int variantProteinIndex)
 {
     int accumulatedShift = 0;

     foreach (var applied in protein.AppliedSequenceVariations)
     {
         // Variations ending at or after the index do not displace it.
         if (applied.OneBasedEndPosition >= variantProteinIndex)
         {
             continue;
         }

         int lengthDelta = applied.VariantSequence.Length - applied.OriginalSequence.Length;

         // checked: keeps the overflow behaviour of Enumerable.Sum.
         accumulatedShift = checked(accumulatedShift + lengthDelta);
     }

     return variantProteinIndex - accumulatedShift;
 }
        /// <summary>
        /// Builds the modification dictionary for a protein after <paramref name="variant"/> has been applied.
        /// Modifications before the variant keep their position, modifications after it are shifted by the
        /// variant's length change, and modifications inside the variant region or beyond the new sequence
        /// end are dropped. The variant's own modifications are then merged in at their stated positions.
        /// </summary>
        /// <param name="variant">Variation being applied; supplies begin/end positions, sequences and its own modifications.</param>
        /// <param name="variantAppliedProteinSequence">Sequence after applying the variant; its length bounds valid positions.</param>
        /// <param name="protein">Protein whose <c>OneBasedPossibleLocalizedModifications</c> are adjusted.</param>
        /// <returns>
        /// A new dictionary keyed by one-based position. Every list in it is a fresh copy, so later
        /// additions never alter the lists owned by <paramref name="protein"/> or <paramref name="variant"/>.
        /// </returns>
        internal static Dictionary<int, List<Modification>> AdjustModificationIndices(SequenceVariation variant, string variantAppliedProteinSequence, Protein protein)
        {
            IDictionary<int, List<Modification>> modificationDictionary = protein.OneBasedPossibleLocalizedModifications;
            IDictionary<int, List<Modification>> variantModificationDictionary = variant.OneBasedModifications;
            Dictionary<int, List<Modification>> mods = new Dictionary<int, List<Modification>>();
            int sequenceLengthChange = variant.VariantSequence.Length - variant.OriginalSequence.Length;

            // change modification indices for variant sequence
            if (modificationDictionary != null)
            {
                foreach (KeyValuePair<int, List<Modification>> kv in modificationDictionary)
                {
                    if (kv.Key > variantAppliedProteinSequence.Length)
                    {
                        continue; // it was cut out by a stop gain
                    }

                    if (kv.Key < variant.OneBasedBeginPosition)
                    {
                        // mod is before the variant; copy the list so the merge below cannot mutate the protein's list
                        mods.Add(kv.Key, kv.Value?.ToList() ?? new List<Modification>());
                    }
                    else if (variant.OneBasedEndPosition < kv.Key && kv.Key + sequenceLengthChange <= variantAppliedProteinSequence.Length)
                    {
                        // mod is after the variant and not affected by a stop gain
                        mods.Add(kv.Key + sequenceLengthChange, kv.Value?.ToList() ?? new List<Modification>());
                    }

                    // otherwise the sequence variant conflicts with the modification site and the mod is dropped
                }
            }

            // sequence variant modifications are indexed to the variant sequence
            //    NOTE: this code assumes variants are added from end to beginning of protein, so that previously added variant mods are adjusted above
            if (variantModificationDictionary != null)
            {
                foreach (var kv in variantModificationDictionary)
                {
                    if (mods.TryGetValue(kv.Key, out var modsAtPos))
                    {
                        // modsAtPos is a private copy, so appending here is safe.
                        if (kv.Value != null)
                        {
                            modsAtPos.AddRange(kv.Value);
                        }
                    }
                    else
                    {
                        // Copy so that a later merge cannot mutate the variant's own list.
                        mods.Add(kv.Key, kv.Value?.ToList() ?? new List<Modification>());
                    }
                }
            }

            return mods;
        }
        /// <summary>
        /// Produces the proteolysis products that remain valid after a sequence variation is applied.
        /// Products before the variant are kept as they are, products spanning it are resized, products after
        /// it are shifted, and products whose boundary falls inside the variant are removed.
        /// Products lacking a begin or end position are ignored.
        /// </summary>
        /// <param name="variant">Variation being applied.</param>
        /// <param name="variantAppliedProteinSequence">Sequence after applying the variant; its length bounds valid positions.</param>
        /// <param name="protein">Protein whose non-variant sequence length marks the original C-terminus.</param>
        /// <param name="proteolysisProducts">Products to adjust; null yields an empty list.</param>
        /// <returns>A new list with the retained and re-indexed products.</returns>
        internal static List<ProteolysisProduct> AdjustProteolysisProductIndices(SequenceVariation variant, string variantAppliedProteinSequence, Protein protein, IEnumerable<ProteolysisProduct> proteolysisProducts)
        {
            var retained = new List<ProteolysisProduct>();
            if (proteolysisProducts == null)
            {
                return retained;
            }

            int lengthDelta = variant.VariantSequence.Length - variant.OriginalSequence.Length;
            bool stopGained = variant.VariantSequence.EndsWith("*");

            foreach (ProteolysisProduct product in proteolysisProducts)
            {
                var end = product.OneBasedEndPosition;
                var begin = product.OneBasedBeginPosition;
                if (!end.HasValue || !begin.HasValue)
                {
                    continue;
                }

                // Entirely before the variant: untouched.
                if (variant.OneBasedBeginPosition > end)
                {
                    retained.Add(product);
                    continue;
                }

                // Spans the variant with both cleavage sites intact; the protein termini
                // (begin of 1 or 2, end at the non-variant length) are not considered cleavage sites.
                bool spansVariant =
                    (begin < variant.OneBasedBeginPosition || begin == 1 || begin == 2) &&
                    (end > variant.OneBasedEndPosition || end == protein.NonVariantProtein.BaseSequence.Length);
                if (spansVariant)
                {
                    if (stopGained)
                    {
                        // A stop gain truncates the product at the new sequence end.
                        retained.Add(new ProteolysisProduct(begin, variantAppliedProteinSequence.Length, product.Type));
                    }
                    else if (end + lengthDelta <= variantAppliedProteinSequence.Length)
                    {
                        retained.Add(new ProteolysisProduct(begin, end + lengthDelta, product.Type));
                    }

                    // Otherwise the cleavage site is not intact and the product is dropped.
                    continue;
                }

                // Entirely after the variant, still inside the new sequence, and no stop gain: shift both ends.
                if (begin > variant.OneBasedEndPosition &&
                    begin + lengthDelta <= variantAppliedProteinSequence.Length &&
                    end + lengthDelta <= variantAppliedProteinSequence.Length &&
                    !stopGained)
                {
                    retained.Add(new ProteolysisProduct(begin + lengthDelta, end + lengthDelta, product.Type));
                }

                // Anything else conflicts with a cleavage site and is dropped.
            }

            return retained;
        }
        /// <summary>
        /// Applies one sequence variation to a protein and returns the resulting protein with
        /// re-indexed proteolysis products, modifications and applied variations.
        /// </summary>
        /// <param name="variantGettingApplied">Variation to apply.</param>
        /// <param name="protein">Protein the variation is applied to; it may already carry applied variations.</param>
        /// <param name="individual">Sample name passed through to the new protein.</param>
        /// <returns>A new protein whose sequence ends at the first '*' if the variation introduces one.</returns>
        internal static Protein ApplySingleVariant(SequenceVariation variantGettingApplied, Protein protein, string individual)
        {
            var beginPosition = variantGettingApplied.OneBasedBeginPosition;
            string prefix = protein.BaseSequence.Substring(0, beginPosition - 1);
            int resumeIndex = beginPosition + variantGettingApplied.OriginalSequence.Length - 1;
            bool hasTrailingSequence = protein.BaseSequence.Length - resumeIndex > 0;

            // The applied form of the variant ends where the inserted variant sequence ends.
            var variantAfterApplication = new SequenceVariation(
                beginPosition,
                beginPosition + variantGettingApplied.VariantSequence.Length - 1,
                variantGettingApplied.OriginalSequence,
                variantGettingApplied.VariantSequence,
                variantGettingApplied.Description.Description,
                variantGettingApplied.OneBasedModifications.ToDictionary(entry => entry.Key, entry => entry.Value));

            // Partial overlap with an already-applied variation would give odd sequences. Full containment
            // is fine because the earlier variation is simply overwritten (e.g. two alternate alleles).
            bool overlapsPartially = false;
            foreach (var earlier in protein.AppliedSequenceVariations)
            {
                if (variantGettingApplied.Intersects(earlier) && !variantGettingApplied.Includes(earlier))
                {
                    overlapsPartially = true;
                    break;
                }
            }

            string suffix;
            IEnumerable<SequenceVariation> appliedVariations;
            if (overlapsPartially)
            {
                // Fall back to the original protein sequence for the remainder; only this variant is recorded.
                suffix = hasTrailingSequence ? protein.NonVariantProtein.BaseSequence.Substring(resumeIndex) : "";
                appliedVariations = new[] { variantAfterApplication };
            }
            else
            {
                // Continue with the current variant protein sequence and carry over the earlier
                // variations that this one does not fully contain.
                suffix = hasTrailingSequence ? protein.BaseSequence.Substring(resumeIndex) : "";
                appliedVariations = new[] { variantAfterApplication }
                    .Concat(protein.AppliedSequenceVariations.Where(earlier => !variantGettingApplied.Includes(earlier)))
                    .ToList();
            }

            // Truncate at the first stop ('*'), if one was gained.
            string combined = prefix + variantGettingApplied.VariantSequence + suffix;
            int stopIndex = combined.IndexOf('*');
            string variantSequence = stopIndex < 0 ? combined : combined.Substring(0, stopIndex);

            // Re-index the annotations against the new sequence.
            List<ProteolysisProduct> shiftedProducts = AdjustProteolysisProductIndices(variantGettingApplied, variantSequence, protein, protein.ProteolysisProducts);
            Dictionary<int, List<Modification>> shiftedModifications = AdjustModificationIndices(variantGettingApplied, variantSequence, protein);
            List<SequenceVariation> shiftedVariations = AdjustSequenceVariationIndices(variantGettingApplied, variantSequence, appliedVariations);

            return new Protein(variantSequence, protein, shiftedVariations, shiftedProducts, shiftedModifications, individual);
        }
Esempio n. 10
0
 /// <summary>
 /// Gets the accession for a protein with applied variations.
 /// </summary>
 /// <param name="protein">Protein whose <c>NonVariantProtein.Accession</c> forms the base of the result.</param>
 /// <param name="appliedSequenceVariations">
 /// Applied variations. When null or empty the base accession is returned unchanged; otherwise
 /// "_" plus the combined simple strings of the variations is appended.
 /// </param>
 /// <returns>The accession string for the variant protein.</returns>
 public static string GetAccession(Protein protein, IEnumerable<SequenceVariation> appliedSequenceVariations)
 {
     string baseAccession = protein.NonVariantProtein.Accession;

     // Any() stops at the first element, whereas Count() walks the whole sequence.
     bool hasVariations = appliedSequenceVariations != null && appliedSequenceVariations.Any();
     string suffix = hasVariations ? $"_{CombineSimpleStrings(appliedSequenceVariations)}" : "";

     return baseAccession + suffix;
 }