コード例 #1
0
        /// <summary>
        /// Tries to give the experimental proteoform <paramref name="e"/> an identification derived from
        /// <paramref name="theoretical_base"/>, the PTM set <paramref name="set"/> and the residue range
        /// <paramref name="begin"/>..<paramref name="end"/>. Depending on the current state of
        /// <paramref name="e"/> the candidate becomes its primary identification, replaces the existing one
        /// (when the ambiguity-reduction settings favour the candidate), is recorded as an additional
        /// ambiguous identification, or is ignored.
        /// </summary>
        /// <param name="e">Experimental proteoform to identify; its identification fields are mutated.</param>
        /// <param name="set">Candidate PTM set; its PTM list is copied before being adjusted.</param>
        /// <param name="begin">Candidate first-residue position.</param>
        /// <param name="end">Candidate last-residue position.</param>
        /// <param name="r">Relation stored as <c>e.relation_to_id</c> or on the new ambiguous identification.</param>
        /// <param name="theoretical_base">Theoretical proteoform the candidate identification is based on.</param>
        /// <param name="linked_proteoform_references">
        /// Chain of proteoforms leading to this candidate, or null. When null and <paramref name="e"/> is
        /// unidentified, the chain is started with <paramref name="theoretical_base"/>; when null and
        /// <paramref name="e"/> is already identified, nothing is changed.
        /// </param>
        /// <param name="check_ambiguous_IDs">
        /// When true, the ambiguous identifications of <paramref name="e"/> are pruned after a successful
        /// assignment and the ambiguous identifications of this proteoform are also offered to <paramref name="e"/>.
        /// </param>
        /// <returns>
        /// True when <paramref name="e"/> received a primary identification or a new ambiguous identification
        /// during this call (including the recursive calls it makes); otherwise false.
        /// </returns>
        private bool assign_pf_identity(ExperimentalProteoform e, PtmSet set, int begin, int end, ProteoformRelation r, TheoreticalProteoform theoretical_base, List <Proteoform> linked_proteoform_references, bool check_ambiguous_IDs)
        {
            bool identification_assigned = false;

            // Proceed only if the ppm filter is disabled, or the mass error expressed in ppm of
            // e.modified_mass is below the configured tolerance.
            if (!Sweet.lollipop.id_use_ppm_tolerance || Math.Abs(e.calculate_mass_error(theoretical_base, set, begin, end) * 1e6 / e.modified_mass) < Sweet.lollipop.id_ppm_tolerance)
            {
                int new_begin = begin;
                int new_end   = end;

                // Work on a copy of the PTM list so the caller's set is not modified.
                PtmSet     new_set = new PtmSet(new List <Ptm>(set.ptm_combination));
                List <Ptm> remove  = new List <Ptm>();
                // Each "AminoAcid"-type modification moves the start one residue earlier and is dropped
                // from the set (original note: "do retention of M first").
                foreach (var mod in new_set.ptm_combination.Where(m => m.modification.ModificationType == "AminoAcid"))
                {
                    new_begin--;
                    remove.Add(mod);
                }

                // Each "Missing"-type modification is turned into a change of begin or end when its target
                // matches the residue at that end of the range.
                foreach (var mod in new_set.ptm_combination.Where(m => m.modification.ModificationType == "Missing"))
                {
                    // The start is only trimmed when the set has no "AminoAcid" modification.
                    if (!new_set.ptm_combination.Any(m => m.modification.ModificationType == "AminoAcid") && begin >= theoretical_base.begin)
                    {
                        if (theoretical_base.sequence[begin - theoretical_base.begin].ToString() ==
                            mod.modification.Target.ToString())
                        {
                            new_begin++;
                            remove.Add(mod); //dont have in ptmset --> change the begin & end
                        }
                    }
                    // Otherwise try the end of the range.
                    // NOTE(review): unlike the begin check above, this index is not range-guarded -- confirm
                    // that end - theoretical_base.begin is always inside theoretical_base.sequence.
                    if (!remove.Contains(mod) && theoretical_base.sequence[end - theoretical_base.begin].ToString() ==
                        mod.modification.Target.ToString())
                    {
                        new_end--;
                        remove.Add(mod);
                    }
                }

                foreach (var ptm in remove)
                {
                    new_set.ptm_combination.Remove(ptm);
                }

                // Rebuild the PtmSet from the pruned list (presumably so values derived in the constructor
                // reflect the removals -- TODO confirm).
                new_set = new PtmSet(new_set.ptm_combination);

                // Case 1: e has no identification yet, so the candidate becomes its primary identification.
                if (e.linked_proteoform_references == null)
                {
                    identification_assigned = true;

                    if (linked_proteoform_references != null)
                    {
                        // Extend a copy of the incoming chain with this proteoform.
                        e.linked_proteoform_references = new List <Proteoform>(linked_proteoform_references);
                        e.linked_proteoform_references.Add(this);
                    }
                    else
                    {
                        // No chain supplied: the chain starts at the theoretical itself.
                        e.linked_proteoform_references = new List <Proteoform>()
                        {
                            theoretical_base
                        };
                    }

                    e.relation_to_id = r;
                    e.ptm_set        = new_set;
                    e.begin          = new_begin;
                    e.end            = new_end;


                    if (e.gene_name == null)
                    {
                        e.gene_name = theoretical_base.gene_name;
                    }
                    else
                    {
                        // NOTE(review): the value returned by Concat is discarded; if this is LINQ's Concat,
                        // e.gene_name is left unchanged. It also reads this.gene_name rather than
                        // theoretical_base.gene_name -- confirm the intended merge.
                        e.gene_name.gene_names.Concat(this.gene_name.gene_names);
                    }
                }
                else
                {
                    // Case 2: e already has an identification. The candidate is only considered when a chain
                    // was supplied and e is not itself part of that chain.
                    if (linked_proteoform_references != null && !linked_proteoform_references.Contains(e))
                    {
                        // True when the candidate differs from the current ID in preferred gene name,
                        // sequence, or PTM set.
                        bool different_id = e.gene_name.get_prefered_name(Lollipop.preferred_gene_label) !=
                                            theoretical_base.gene_name.get_prefered_name(Lollipop.preferred_gene_label) ||
                                            ExperimentalProteoform.get_sequence(e.linked_proteoform_references.First() as TheoreticalProteoform, e.begin, e.end)
                                            != ExperimentalProteoform.get_sequence(theoretical_base, new_begin, new_end) || !e.ptm_set.same_ptmset(new_set, true);


                        // Modifications with non-zero monoisotopic mass listed at any position of the
                        // candidate's / current ID's expanded proteins.
                        List <Modification> this_known_mods        = theoretical_base.ExpandedProteinList.SelectMany(p => p.OneBasedPossibleLocalizedModifications).SelectMany(kv => kv.Value).Where(v => v.MonoisotopicMass != 0).ToList();
                        List <Modification> previous_id_known_mods = (e.linked_proteoform_references.First() as TheoreticalProteoform).ExpandedProteinList.SelectMany(p => p.OneBasedPossibleLocalizedModifications).SelectMany(kv => kv.Value).Where(v => v.MonoisotopicMass != 0).ToList();
                        // Replace the current ID when at least one ambiguity-reduction setting is enabled and
                        // every enabled setting favours the candidate:
                        //  - top-down setting: candidate is top-down based and the current ID is not;
                        //  - annotated-PTM setting: all candidate PTMs are adducts or known for its
                        //    theoretical, while that is not true for the current ID's PTMs.
                        if (!Sweet.lollipop.topdown_theoretical_reduce_ambiguity || (theoretical_base.topdown_theoretical && !(e.linked_proteoform_references.First() as TheoreticalProteoform).topdown_theoretical))
                        {
                            if (!Sweet.lollipop.annotated_PTMs_reduce_ambiguity ||
                                (new_set.ptm_combination.All(mod1 => modification_is_adduct(mod1.modification) || this_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification))) &&
                                 !e.ptm_set.ptm_combination.All(mod1 => modification_is_adduct(mod1.modification) || previous_id_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification)))))
                            {
                                if (Sweet.lollipop.topdown_theoretical_reduce_ambiguity || Sweet.lollipop.annotated_PTMs_reduce_ambiguity)
                                {
                                    // Un-flag the relation that produced the ID being discarded.
                                    if (Sweet.lollipop.remove_bad_connections && different_id) //&& e.relation_to_id != r)
                                    {
                                        e.relation_to_id.Identification     = false;
                                        e.relation_to_id.represented_ptmset = null;
                                    }
                                    // Reset e to the unidentified state, including its ambiguous IDs.
                                    e.linked_proteoform_references = null;
                                    e.ptm_set   = new PtmSet(new List <Ptm>());
                                    e.begin     = 0;
                                    e.end       = 0;
                                    e.gene_name = null;
                                    e.ambiguous_identifications.Clear();
                                    ProteoformRelation relation = null;
                                    e.relation_to_id = relation;

                                    // Re-run with the original arguments; e is now unidentified, so the
                                    // candidate is assigned as the primary ID (original note: "reassign the
                                    // topdown - based ID").
                                    return(this.assign_pf_identity(e, set, begin, end, r, theoretical_base, linked_proteoform_references, true));
                                }
                            }
                        }

                        // The two empty branches below are the cases where the current ID is preferred over
                        // the candidate; the candidate is then ignored.
                        if (Sweet.lollipop.topdown_theoretical_reduce_ambiguity && (e.linked_proteoform_references.First() as TheoreticalProteoform).topdown_theoretical && !theoretical_base.topdown_theoretical)
                        {
                        }
                        else if (Sweet.lollipop.annotated_PTMs_reduce_ambiguity &&
                                 !new_set.ptm_combination.All(mod1 => modification_is_adduct(mod1.modification) || this_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification))) &&
                                 e.ptm_set.ptm_combination.All(mod1 => modification_is_adduct(mod1.modification) || previous_id_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification))))
                        {
                        }
                        else
                        {
                            // Neither ID wins: keep the candidate as an ambiguous identification if it
                            // differs from the current ID.
                            if (different_id)
                            {
                                var new_linked_proteoform_references = new List <Proteoform>(linked_proteoform_references);
                                new_linked_proteoform_references.Add(this);

                                AmbiguousIdentification new_id =
                                    new AmbiguousIdentification(new_begin, new_end, new_set, r, theoretical_base, new_linked_proteoform_references);
                                lock (e.ambiguous_identifications)
                                {
                                    // Skip candidates already recorded with the same primary gene name,
                                    // sequence and PTM set.
                                    if (!e.ambiguous_identifications.Any(p =>
                                                                         p.theoretical_base.gene_name.primary ==
                                                                         new_id.theoretical_base.gene_name.primary &&
                                                                         ExperimentalProteoform.get_sequence(p.theoretical_base, p.begin, p.end) == ExperimentalProteoform.get_sequence(new_id.theoretical_base, new_id.begin, new_id.end) &&
                                                                         p.ptm_set.same_ptmset(new_id.ptm_set, true)))
                                    {
                                        e.ambiguous_identifications.Add(new_id);
                                        identification_assigned = true;
                                    }
                                }
                            }
                        }
                    }
                }
            }


            if (check_ambiguous_IDs)
            {
                //remove bad relations if using td to reduce ambiguity
                if (identification_assigned)
                {
                    List <AmbiguousIdentification> to_remove = new List <AmbiguousIdentification>();
                    List <Modification>            previous_id_known_mods = (e.linked_proteoform_references.First() as TheoreticalProteoform).ExpandedProteinList.SelectMany(p => p.OneBasedPossibleLocalizedModifications).SelectMany(kv => kv.Value).Where(m => m.MonoisotopicMass != 0).ToList();
                    // A top-down based candidate displaces ambiguous IDs that are not top-down based.
                    if (theoretical_base.topdown_theoretical && Sweet.lollipop.topdown_theoretical_reduce_ambiguity)
                    {
                        to_remove.AddRange(e.ambiguous_identifications.Where(id => !id.theoretical_base.topdown_theoretical));
                    }
                    // If every PTM of e's primary ID is an adduct or known for its theoretical, drop the
                    // ambiguous IDs that contain a non-adduct PTM unknown for their own theoretical.
                    if (Sweet.lollipop.annotated_PTMs_reduce_ambiguity &&
                        e.ptm_set.ptm_combination.All(mod1 => modification_is_adduct(mod1.modification) || previous_id_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification))))
                    {
                        foreach (var ambiguous_id in e.ambiguous_identifications)
                        {
                            List <Modification> ambiguous_id_known_mods = ambiguous_id.theoretical_base.ExpandedProteinList.SelectMany(p => p.OneBasedPossibleLocalizedModifications).SelectMany(kv => kv.Value).Where(m => m.MonoisotopicMass != 0).ToList();
                            if (ambiguous_id.ptm_set.ptm_combination.Any(mod1 => !modification_is_adduct(mod1.modification) && !ambiguous_id_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification))))
                            {
                                to_remove.Add(ambiguous_id);
                            }
                        }
                    }
                    foreach (var x in to_remove)
                    {
                        // An ID can be listed twice in to_remove (both rules above); remove it only once.
                        if (e.ambiguous_identifications.Contains(x))
                        {
                            e.ambiguous_identifications.Remove(x);
                            if (Sweet.lollipop.remove_bad_connections)
                            {
                                // Leave the relation untouched when it is also the primary ID's relation.
                                if (e.relation_to_id != x.relation)
                                {
                                    x.relation.Identification     = false;
                                    x.relation.represented_ptmset = null;
                                }
                            }
                        }
                    }
                    // Relations of the surviving ambiguous IDs are flagged as identifications.
                    foreach (var x in e.ambiguous_identifications)
                    {
                        x.relation.Identification = true;
                    }
                }


                // If this proteoform is itself an experimental with ambiguous IDs, offer each of them to e too.
                if (this as ExperimentalProteoform != null && (this as ExperimentalProteoform).ambiguous_identifications.Count > 0)
                {
                    lock ((this as ExperimentalProteoform).ambiguous_identifications)
                    {
                        int      count       = (this as ExperimentalProteoform).ambiguous_identifications.Count;
                        PtmSet[] new_ptm_set = new PtmSet[count];
                        // Compute the PTM set for each ambiguous ID in parallel; each iteration writes only
                        // its own array slot.
                        Parallel.For(0, count, i =>
                        {
                            var id         = (this as ExperimentalProteoform).ambiguous_identifications[i];
                            new_ptm_set[i] = determine_mod_change(e, this, id.theoretical_base, r, id.ptm_set, id.begin, id.end);
                        });
                        // Assign sequentially; entries for which determine_mod_change returned null are skipped.
                        // check_ambiguous_IDs is false here, so these calls do not repeat this step.
                        for (int i = 0; i < count; i++)
                        {
                            if (new_ptm_set[i] != null)
                            {
                                var id = (this as ExperimentalProteoform).ambiguous_identifications[i];
                                if (assign_pf_identity(e, new_ptm_set[i], id.begin, id.end, r, id.theoretical_base, id.linked_proteoform_references, false))
                                {
                                    identification_assigned = true;
                                }
                            }
                        }
                    }
                }
            }
            return(identification_assigned);
        }
コード例 #2
0
        /// <summary>
        /// Identifies the experimental proteoforms of this community and derives their summary fields.
        /// Steps, in order:
        /// 1. (optional) seed identifications from top-down proteoforms, via a theoretical built for each;
        /// 2. identify experimentals connected to each theoretical proteoform;
        /// 3. repeatedly propagate identifications from newly identified experimentals until no
        ///    further experimental is returned;
        /// 4. for every experimental, in parallel, set <c>adduct</c>, <c>uniprot_mods</c>,
        ///    <c>novel_mods</c>, <c>proteoform_level</c> (with description) and <c>new_intact_mass_id</c>;
        /// 5. when bad connections are to be removed, set each relation's <c>Accepted</c> to its
        ///    <c>Identification</c> flag.
        /// </summary>
        public void identify_experimentals()
        {
            HashSet <ExperimentalProteoform> identified_experimentals = new HashSet <ExperimentalProteoform>();

            if (Sweet.lollipop.identify_from_td_nodes)
            {
                foreach (TopDownProteoform topdown in experimental_proteoforms.Where(e => e.topdown_id))
                {
                    // Look up database theoreticals by the accession with its '_' and '-' suffixes stripped.
                    Sweet.lollipop.theoretical_database
                    .theoreticals_by_accession[Sweet.lollipop.target_proteoform_community.community_number]
                    .TryGetValue(topdown.accession.Split('_')[0].Split('-')[0], out var t);
                    if (t != null && t.Count > 0)
                    {
                        // Build a theoretical that mirrors the top-down proteoform and identify from it.
                        TheoreticalProteoform theoretical =
                            new TheoreticalProteoform(topdown.accession, topdown.name, topdown.sequence,
                                                      t.First().ExpandedProteinList, topdown.modified_mass, topdown.lysine_count,
                                                      topdown.topdown_ptm_set, true, false, null);
                        theoretical.topdown_theoretical    = true;
                        theoretical.new_topdown_proteoform = true;
                        theoretical.begin = topdown.topdown_begin;
                        theoretical.end   = topdown.topdown_end;
                        foreach (ExperimentalProteoform e in topdown.identify_connected_experimentals(theoretical, topdown.topdown_begin, topdown.topdown_end,
                                                                                                      new PtmSet(topdown.topdown_ptm_set.ptm_combination), null))
                        {
                            identified_experimentals.Add(e);
                        }
                    }
                }
            }

            // Ascending order on the bool key: theoreticals with topdown_theoretical == false come first.
            foreach (TheoreticalProteoform t in theoretical_proteoforms.OrderBy(t => t.topdown_theoretical))
            {
                foreach (ExperimentalProteoform e in t.identify_connected_experimentals(t, t.begin, t.end, t.ptm_set, t.linked_proteoform_references))
                {
                    identified_experimentals.Add(e);
                }
            }

            //Continue looking for new experimental identifications until no more remain to be identified
            // First round order: smallest |DeltaMass - candidate PTM set mass| over relations that have a
            // candidate PTM set (1e6 when there is none), then by modified mass.
            List <ExperimentalProteoform> newly_identified_experimentals = identified_experimentals
                                                                           .OrderBy(p => p.relationships.Any(r => r.candidate_ptmset != null)
                                                                                         ? p.relationships.Where(r => r.candidate_ptmset != null).Min(r => Math.Abs(r.DeltaMass - r.candidate_ptmset.mass))
                                                                                         : 1e6)
                                                                           .ThenBy(p => p.modified_mass)
                                                                           .ToList();

            while (newly_identified_experimentals.Count > 0)
            {
                HashSet <ExperimentalProteoform> tmp_new_experimentals = new HashSet <ExperimentalProteoform>();
                foreach (ExperimentalProteoform id_experimental in newly_identified_experimentals)
                {
                    foreach (ExperimentalProteoform new_e in id_experimental.identify_connected_experimentals(id_experimental.linked_proteoform_references.First() as TheoreticalProteoform, id_experimental.begin,
                                                                                                              id_experimental.end, id_experimental.ptm_set, id_experimental.linked_proteoform_references))
                    {
                        identified_experimentals.Add(new_e);
                        tmp_new_experimentals.Add(new_e);
                    }
                }
                newly_identified_experimentals = new List <ExperimentalProteoform>(tmp_new_experimentals);
            }

            // Keys of the form "<accession>_<sequence>_<sorted mod ids>" for every top-down proteoform;
            // used below to decide whether an intact-mass identification is new.
            List <string> topdown_ids = Sweet.lollipop.topdown_proteoforms
                                        .Select(p => p.accession.Split('_')[0].Split('-')[0] + "_" + p.sequence + "_" + string.Join(", ", p.topdown_ptm_set.ptm_combination.Where(m => m.modification.ModificationType != "Deconvolution Error").Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).OrderBy(m => m))).ToList();


            //determine identified experimentals that are adducts
            //checks if any experimentals have same mods as e's ptmset, except e has additional adduct only mods.
            Parallel.ForEach(experimental_proteoforms, e =>
            {
                e.adduct =
                    e.linked_proteoform_references != null &&
                    e.ptm_set.ptm_combination.Any(m => Proteoform.modification_is_adduct(m.modification)) &&
                    experimental_proteoforms.Any(l =>
                                                 l.linked_proteoform_references != null &&
                                                 l.gene_name.get_prefered_name(Lollipop.preferred_gene_label) == e.gene_name.get_prefered_name(Lollipop.preferred_gene_label) &&
                                                 l.ptm_set.ptm_combination.Count < e.ptm_set.ptm_combination.Count &&
                                                 e.ptm_set.ptm_combination.Where(m => l.ptm_set.ptm_combination.Count(p => UnlocalizedModification.LookUpId(p.modification) == UnlocalizedModification.LookUpId(m.modification)) != e.ptm_set.ptm_combination.Count(p => UnlocalizedModification.LookUpId(p.modification) == UnlocalizedModification.LookUpId(m.modification)))
                                                 .Count(p => !Proteoform.modification_is_adduct(p.modification))
                                                 == 0
                                                 );

                if (e is TopDownProteoform topdown_e)
                {
                    topdown_e.set_correct_id();
                }

                if (e.linked_proteoform_references != null)
                {
                    // Primary identification first, then one " | "-separated entry per ambiguous identification.
                    e.uniprot_mods = describe_annotated_mod_positions(e.linked_proteoform_references.First() as TheoreticalProteoform, e.ptm_set, e.begin, e.end, out bool primary_has_novel_mods);
                    if (primary_has_novel_mods)
                    {
                        e.novel_mods = true;
                    }

                    foreach (var ambig_id in e.ambiguous_identifications)
                    {
                        e.uniprot_mods += " | " + describe_annotated_mod_positions(ambig_id.theoretical_base, ambig_id.ptm_set, ambig_id.begin, ambig_id.end, out bool ambiguous_has_novel_mods);
                        if (ambiguous_has_novel_mods)
                        {
                            e.novel_mods = true;
                        }
                    }
                }

                //determine level #
                e.proteoform_level_description = "";
                if (e.linked_proteoform_references == null)
                {
                    e.proteoform_level             = 5;
                    e.proteoform_level_description = "Unidentified";
                }
                else if (e.ambiguous_identifications.Count == 0)
                {
                    if (e.ptm_set.ptm_combination.Count == 0)
                    {
                        e.proteoform_level = 1;
                    }
                    else
                    {
                        e.proteoform_level              = 2;
                        e.proteoform_level_description += "PTM localization ambiguity; ";
                    }

                    //check if accessions had been grouped in constructing the theoretical database
                    if ((e.linked_proteoform_references.First() as TheoreticalProteoform).ExpandedProteinList.SelectMany(p => p.AccessionList).Select(a => a.Split('_')[0]).Distinct().Count() > 1)
                    {
                        e.proteoform_level             += 1;
                        e.proteoform_level_description += "Gene ambiguity; ";
                    }
                }
                else
                {
                    var unique_accessions = new List <string>()
                    {
                        e.linked_proteoform_references.First().accession.Split('_')[0].Split('-')[0]
                    }.Concat(e.ambiguous_identifications.Select(a => a.theoretical_base.accession.Split('_')[0].Split('-')[0])).Distinct();
                    var unique_sequences = new List <string>()
                    {
                        ExperimentalProteoform.get_sequence(e.linked_proteoform_references.First() as TheoreticalProteoform, e.begin, e.end)
                    }.
                    Concat(e.ambiguous_identifications.Select(a => ExperimentalProteoform.get_sequence(a.theoretical_base, a.begin, a.end))).Distinct();
                    // The ids of every identification are sorted before joining, so the same PTM set listed
                    // in a different order yields the same string and is not counted as a distinct PTM set.
                    var unique_PTMs = new List <string>()
                    {
                        string.Join(", ", e.ptm_set.ptm_combination.Where(m => m.modification.ModificationType != "Deconvolution Error").Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).OrderBy(m => m))
                    }.Concat(e.ambiguous_identifications.Select(a => string.Join(", ", a.ptm_set.ptm_combination.Where(m => m.modification.ModificationType != "Deconvolution Error").Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).OrderBy(m => m)))).Distinct();

                    int gene_ambiguity = unique_accessions.Count() > 1 ? 1 : 0;

                    //check if accessions had been grouped in constructing the theoretical database
                    if ((e.linked_proteoform_references.First() as TheoreticalProteoform).ExpandedProteinList.SelectMany(p => p.AccessionList).Select(a => a.Split('_')[0]).Distinct().Count() > 1)
                    {
                        gene_ambiguity = 1;
                    }

                    int sequence_ambiguity = unique_sequences.Count() > 1 ? 1 : 0;
                    int PTM_ambiguity      = unique_PTMs.Count() > 1 ? 1 : 0;
                    int PTM_location       = e.ptm_set.ptm_combination.Any(m => m.modification.ModificationType != "Deconvolution Error") || e.ambiguous_identifications.Any(a => a.ptm_set.ptm_combination.Any(m => m.modification.ModificationType != "Deconvolution Error")) ? 1 : 0;

                    // Level = 1 plus one for each kind of ambiguity present.
                    e.proteoform_level = 1 + gene_ambiguity + sequence_ambiguity + PTM_ambiguity + PTM_location;
                    if (gene_ambiguity > 0)
                    {
                        e.proteoform_level_description += "Gene ambiguity; ";
                    }
                    if (sequence_ambiguity > 0)
                    {
                        e.proteoform_level_description += "Sequence ambiguity; ";
                    }
                    if (PTM_ambiguity > 0)
                    {
                        e.proteoform_level_description += "PTM identity ambiguity; ";
                    }
                    if (PTM_location > 0)
                    {
                        e.proteoform_level_description += "PTM localization ambiguity; ";
                    }
                }
                if (e.proteoform_level == 1)
                {
                    e.proteoform_level_description = "Unambiguous";
                }

                //determine if new intact-mass ID
                e.new_intact_mass_id = false;
                if (!e.topdown_id && e.linked_proteoform_references != null && e.ambiguous_identifications.Count == 0)
                {
                    // Same "<accessions>_<sequence>_<sorted mod ids>" layout as topdown_ids, except that the
                    // first field may hold several comma-separated accessions.
                    string this_id = string.Join(",", (e.linked_proteoform_references.First() as TheoreticalProteoform).ExpandedProteinList.SelectMany(p => p.AccessionList.Select(a => a.Split('_')[0])).Distinct()) + "_" + ExperimentalProteoform.get_sequence(e.linked_proteoform_references.First() as TheoreticalProteoform, e.begin, e.end) + "_" + string.Join(", ", e.ptm_set.ptm_combination.Where(m => m.modification.ModificationType != "Deconvolution Error").Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).OrderBy(m => m));
                    if (!topdown_ids.Any(t => this_id.Split('_')[0].Split(',').Contains(t.Split('_')[0]) &&
                                         this_id.Split('_')[1] == t.Split('_')[1] && this_id.Split('_')[2] == t.Split('_')[2]))
                    {
                        e.new_intact_mass_id = true;
                    }
                }
            });

            if (Sweet.lollipop.remove_bad_connections)
            {
                if (theoretical_proteoforms.Count > 0 || (Sweet.lollipop.identify_from_td_nodes && experimental_proteoforms.Any(e => e.topdown_id)))
                {
                    Parallel.ForEach(relations, r =>
                    {
                        r.Accepted = r.Identification;
                    });
                }
            }
        }

        /// <summary>
        /// Builds the "mod @ positions; " text for one identification and reports whether it carries
        /// more copies of some modification than the theoretical lists positions for.
        /// </summary>
        /// <param name="theoretical">Theoretical proteoform whose expanded proteins supply the listed modification positions.</param>
        /// <param name="ptm_set">PTM set of the identification; adduct modifications are ignored.</param>
        /// <param name="begin">First position (inclusive) considered.</param>
        /// <param name="end">Last position (inclusive) considered.</param>
        /// <param name="has_novel_mods">
        /// Set to true when, for any modification id, the identification holds more occurrences
        /// (excluding "Deconvolution Error" and adduct modifications) than positions found in range.
        /// </param>
        /// <returns>One "id @ p1, p2; " entry per modification id with at least one listed position, or "N/A" when there is none.</returns>
        private static string describe_annotated_mod_positions(TheoreticalProteoform theoretical, PtmSet ptm_set, int begin, int end, out bool has_novel_mods)
        {
            has_novel_mods = false;

            // Distinct, sorted ids of the non-adduct modifications in the set.
            var mod_ids = ptm_set.ptm_combination.Where(p => !Proteoform.modification_is_adduct(p.modification))
                          .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).ToList().Distinct().OrderBy(m => m).ToList();

            // Ids that count toward the observed tally; computed once because it does not depend on the id being examined.
            var counted_ids = ptm_set.ptm_combination.Where(ptm => ptm.modification.ModificationType != "Deconvolution Error" && !Proteoform.modification_is_adduct(ptm.modification))
                              .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).ToList();

            string description = "";
            foreach (var mod in mod_ids)
            {
                // positions with mod
                List <int> theo_ptms = theoretical.ExpandedProteinList.SelectMany(p => p.OneBasedPossibleLocalizedModifications)
                                       .Where(p => p.Key >= begin && p.Key <= end &&
                                              p.Value.Select(m => UnlocalizedModification.LookUpId(m)).Contains(mod))
                                       .Select(m => m.Key).ToList();
                if (theo_ptms.Count > 0)
                {
                    description += mod + " @ " + string.Join(", ", theo_ptms) + "; ";
                }
                if (counted_ids.Count(m => m == mod) > theo_ptms.Count)
                {
                    has_novel_mods = true;
                }
            }

            return description.Length == 0 ? "N/A" : description;
        }