/// <summary>
/// Builds the variant forms of this protein from its sequence variations.
/// The variations are de-duplicated, restricted to those whose description has at
/// least ten tab-separated fields, and applied from the highest begin position to the lowest.
/// </summary>
/// <returns>The list produced by <c>ApplyVariants</c> for the selected variations.</returns>
public List<ProteinWithAppliedVariants> GetVariantProteins()
{
    // Keep one representative per distinct (original sequence, begin position, variant sequence) key.
    var representatives = SequenceVariations
        .GroupBy(variation => string.Concat(
            variation.OriginalSequence,
            variation.OneBasedBeginPosition.ToString(),
            variation.VariantSequence))
        .Select(duplicates => duplicates.First());

    // Ten or more tab-separated fields: likely a VCF line
    // (should probably do more rigorous testing, eventually).
    var vcfLike = representatives
        .Where(variation => variation.Description.Split('\t').Length >= 10);

    // Apply variants at the end of the protein sequence first.
    List<SequenceVariation> uniqueEffects = vcfLike
        .OrderByDescending(variation => variation.OneBasedBeginPosition)
        .ToList();

    // Starting point: this protein's base sequence. The two trailing null arguments are
    // passed through as-is; their meaning is defined by the constructor, not visible here.
    var startingProtein = new ProteinWithAppliedVariants(BaseSequence, this, null, null);
    return startingProtein.ApplyVariants(startingProtein, uniqueEffects);
}
/// <summary>
/// Gets proteins with applied variants from this protein.
/// Sequence variations are de-duplicated by their <c>SimpleString()</c>, restricted to those
/// whose description splits into at least ten fields, and applied from the highest begin
/// position to the lowest.
/// </summary>
/// <returns>The list produced by <c>ApplyVariants</c> for the selected variations.</returns>
public List<ProteinWithAppliedVariants> GetVariantProteins()
{
    // NOTE(review): @"\t" is a verbatim literal, i.e. a backslash followed by 't', not a tab
    // character. Presumably descriptions are stored with escaped tabs; confirm against the
    // code that writes Description.
    List<SequenceVariation> uniqueEffects = SequenceVariations
        .GroupBy(v => v.SimpleString())
        .Select(x => x.First())
        // likely a VCF line (should probably do more rigorous testing, eventually)
        .Where(v => v.Description.Split(new[] { @"\t" }, StringSplitOptions.None).Length >= 10)
        // apply variants at the end of the protein sequence first
        .OrderByDescending(v => v.OneBasedBeginPosition)
        .ToList();

    // Start from this protein's base sequence, proteolysis products and modifications.
    ProteinWithAppliedVariants variantProtein = new ProteinWithAppliedVariants(
        BaseSequence,
        this,
        null,
        ProteolysisProducts,
        OneBasedPossibleLocalizedModifications,
        null);

    return variantProtein.ApplyVariants(variantProtein, uniqueEffects);
}
/// <summary>
/// Applies variant changes to protein sequence.
/// Each variation's description is split into VCF fields; for every genotype column that is
/// relevant to this instance's <c>Individual</c>, a reference-allele protein is added (at most
/// once per call) and/or a protein carrying the alternate sequence is added.
/// </summary>
/// <param name="protein">
/// Returned as the only element of the result when <paramref name="uniqueEffectsToApply"/> is empty.
/// All sequence data used to build variant proteins is read from this instance, not from this argument.
/// </param>
/// <param name="uniqueEffectsToApply">
/// Variations to apply. Entries whose description has fewer than ten fields are skipped.
/// </param>
/// <returns>
/// The proteins created for the reference and alternate alleles, or a single-element list
/// containing <paramref name="protein"/> when there is nothing to apply.
/// </returns>
internal List<ProteinWithAppliedVariants> ApplyVariants(ProteinWithAppliedVariants protein, List<SequenceVariation> uniqueEffectsToApply)
{
    // If there aren't any variants to apply, just return the base protein
    if (uniqueEffectsToApply.Count == 0)
    {
        return new List<ProteinWithAppliedVariants> { protein };
    }

    bool referenceAdded = false;
    List<ProteinWithAppliedVariants> proteins = new List<ProteinWithAppliedVariants>();
    foreach (SequenceVariation variant in uniqueEffectsToApply)
    {
        // Parse description into VCF fields. The separator is the verbatim two-character
        // sequence backslash + 't', exactly as written in the literal below.
        string[] vcfFields = variant.Description.Split(new[] { @"\t" }, StringSplitOptions.None);
        if (vcfFields.Length < 10)
        {
            continue;
        }

        // Field 8 is the FORMAT column; fields 9 and onward are one genotype column per sample.
        string format = vcfFields[8];
        string[] genotypes = vcfFields.Skip(9).ToArray();

        // loop through genotypes for this variant (e.g. tumor and normal)
        for (int i = 0; i < genotypes.Length; i++)
        {
            // When this instance is tied to an individual, only that sample's column applies.
            if (Individual != null && Individual != i.ToString())
            {
                continue;
            }

            var genotypeFields = GenotypeDictionary(format.Trim(), genotypes[i].Trim());

            // parse genotype; without a GT entry there is nothing to apply for this sample
            if (!genotypeFields.TryGetValue("GT", out string gtString))
            {
                continue;
            }

            // Alleles are separated by '/' (unphased) or '|' (phased). Splitting on both keeps
            // a phased homozygous-reference call such as "0|0" from being read as a single
            // non-reference token.
            string[] gt = gtString.Split('/', '|');

            // reference allele
            if (gt.Contains("0") && !referenceAdded)
            {
                proteins.Add(new ProteinWithAppliedVariants(BaseSequence, Protein, AppliedSequenceVariations, ProteolysisProducts, OneBasedPossibleLocalizedModifications, i.ToString()));
                referenceAdded = true; // only add the reference allele once
            }

            // alternate allele
            // TODO: recursively apply variants to create haplotypes and be wary of combinitorial explosion
            if (!gt.All(x => x == "0"))
            {
                // check to see if there is incomplete indel overlap, which would lead to weird variant sequences
                // complete overlap is okay, since it will be overwritten; this can happen if there are two alternate alleles,
                // e.g. reference sequence is wrong at that point
                bool intersectsAppliedRegionIncompletely = AppliedSequenceVariations.Any(x => variant.Intersects(x) && !variant.Includes(x));

                string seqBefore = BaseSequence.Substring(0, variant.OneBasedBeginPosition - 1);
                string seqVariant = variant.VariantSequence;
                List<ProteolysisProduct> adjustedProteolysisProducts = AdjustProteolysisProductIndices(variant, ProteolysisProducts);
                Dictionary<int, List<Modification>> adjustedModifications = AdjustModificationIndices(variant, OneBasedPossibleLocalizedModifications);

                // Zero-based index of the first residue after the replaced original sequence.
                int afterIdx = variant.OneBasedBeginPosition + variant.OriginalSequence.Length - 1;

                if (intersectsAppliedRegionIncompletely)
                {
                    // use original protein sequence for the remaining sequence
                    string seqAfter = Protein.BaseSequence.Length - afterIdx <= 0 ? "" : Protein.BaseSequence.Substring(afterIdx);
                    proteins.Add(new ProteinWithAppliedVariants(seqBefore + seqVariant + seqAfter, Protein, new[] { variant }, adjustedProteolysisProducts, adjustedModifications, i.ToString()));
                }
                else
                {
                    // Previously applied variations fully covered by this one are dropped; this one is appended.
                    List<SequenceVariation> variations = AppliedSequenceVariations
                        .Where(x => !variant.Includes(x))
                        .Concat(new[] { variant })
                        .ToList();

                    // use this variant protein sequence for the remaining sequence
                    string seqAfter = BaseSequence.Length - afterIdx <= 0 ? "" : BaseSequence.Substring(afterIdx);
                    proteins.Add(new ProteinWithAppliedVariants(seqBefore + seqVariant + seqAfter, Protein, variations, adjustedProteolysisProducts, adjustedModifications, i.ToString()));
                }
            }
        }
    }

    return proteins;
}