Exemplo n.º 1
0
        /// <summary>
        /// Rebuilds <c>topdown_uniprot_mods</c> for this proteoform and raises <c>topdown_novel_mods</c> when needed.
        /// For the top-down PTM set, and then for each entry of <c>ambiguous_topdown_hits</c>, every distinct
        /// non-adduct modification id is listed as "id @ pos, pos; " using the positions at which the matching
        /// theoretical's <c>ExpandedProteinList</c> offers that modification inside the begin/end range.
        /// A segment without any such position is written as "N/A"; segments for ambiguous hits are prefixed with " | ".
        /// <c>topdown_novel_mods</c> is set to true (never reset here) when a modification id occurs more often
        /// in a PTM list than there are database positions for it.
        /// If no theoretical is found for this accession in the target community, <c>topdown_uniprot_mods</c> is left empty.
        /// </summary>
        private void get_uniprot_mods()
        {
            // Ids of every non-adduct modification in the top-down PTM set; duplicates are kept so occurrences can be counted.
            List<string> observed_ids = topdown_ptm_set.ptm_combination
                .Where(p => !Proteoform.modification_is_adduct(p.modification))
                .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
                .ToList();

            topdown_uniprot_mods = "";

            // Single lookup instead of ContainsKey + indexer.
            if (!Sweet.lollipop.theoretical_database.theoreticals_by_accession.TryGetValue(
                    Sweet.lollipop.target_proteoform_community.community_number, out var theoreticals_for_community)
                || theoreticals_for_community == null)
            {
                return;
            }

            // The key is the accession with everything after the first '_' and the first '-' stripped.
            theoreticals_for_community.TryGetValue(accession.Split('_')[0].Split('-')[0], out var matching_theoreticals);

            // An empty list is treated like a missing entry instead of letting First() throw.
            var reference = matching_theoreticals?.FirstOrDefault();
            if (reference == null)
            {
                return;
            }

            topdown_uniprot_mods += describe_database_sites(observed_ids, topdown_begin, topdown_end);

            // NOTE(review): the original repeated the dictionary lookup for every ambiguous hit, but always with
            // this proteoform's own accession, so every hit resolved to the same theoretical as above. If ambiguous
            // hits can belong to a different accession, the lookup should use the hit's accession - confirm.
            foreach (var ambig_id in ambiguous_topdown_hits)
            {
                List<string> ambig_ids = ambig_id.ptm_list
                    .Where(p => !Proteoform.modification_is_adduct(p.modification))
                    .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
                    .ToList();

                topdown_uniprot_mods += " | " + describe_database_sites(ambig_ids, ambig_id.begin, ambig_id.end);
            }

            // Formats one segment for the given modification ids within [first_residue, last_residue] and
            // flags topdown_novel_mods when an id is observed more times than the database has positions for it.
            string describe_database_sites(List<string> mod_ids, int first_residue, int last_residue)
            {
                string segment = "";
                foreach (string mod in mod_ids.Distinct().OrderBy(m => m).ToList())
                {
                    // positions with mod
                    List<int> database_sites = reference.ExpandedProteinList
                        .SelectMany(p => p.OneBasedPossibleLocalizedModifications)
                        .Where(p => p.Key >= first_residue && p.Key <= last_residue &&
                                    p.Value.Select(m => UnlocalizedModification.LookUpId(m)).Contains(mod))
                        .Select(p => p.Key)
                        .ToList();

                    if (database_sites.Count > 0)
                    {
                        segment += mod + " @ " + string.Join(", ", database_sites) + "; ";
                    }
                    if (mod_ids.Count(m => m == mod) > database_sites.Count)
                    {
                        topdown_novel_mods = true;
                    }
                }
                return segment.Length == 0 ? "N/A" : segment;
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Propagates identifications to experimental proteoforms and then annotates every experimental proteoform.
        /// <para>
        /// Step 1: when <c>Sweet.lollipop.identify_from_td_nodes</c> is set, a temporary theoretical is built for each
        /// top-down experimental whose accession is present in the target community's theoretical database, and
        /// <c>identify_connected_experimentals</c> is called from it. The same call is then made from every theoretical
        /// proteoform (ordered by <c>topdown_theoretical</c>) and, repeatedly, from each newly identified experimental
        /// until a round yields nothing new.
        /// </para>
        /// <para>
        /// Step 2 (parallel over <c>experimental_proteoforms</c>): sets <c>adduct</c>, <c>uniprot_mods</c>,
        /// <c>novel_mods</c> (only ever raised to true here), <c>proteoform_level</c>,
        /// <c>proteoform_level_description</c> and <c>new_intact_mass_id</c>.
        /// </para>
        /// <para>
        /// Step 3: when <c>Sweet.lollipop.remove_bad_connections</c> is set and there is at least one theoretical
        /// (or a top-down experimental with top-down identification enabled), each relation's <c>Accepted</c>
        /// is set to its <c>Identification</c>.
        /// </para>
        /// </summary>
        public void identify_experimentals()
        {
            HashSet<ExperimentalProteoform> identified_experimentals = new HashSet<ExperimentalProteoform>();

            // TryGetValue guards against a community that has no theoretical database entry
            // (the bare indexer threw KeyNotFoundException in that case).
            if (Sweet.lollipop.identify_from_td_nodes
                && Sweet.lollipop.theoretical_database.theoreticals_by_accession.TryGetValue(
                       Sweet.lollipop.target_proteoform_community.community_number, out var theoreticals_for_community)
                && theoreticals_for_community != null)
            {
                foreach (TopDownProteoform topdown in experimental_proteoforms.Where(e => e.topdown_id))
                {
                    theoreticals_for_community.TryGetValue(topdown.accession.Split('_')[0].Split('-')[0], out var matching_theoreticals);
                    if (matching_theoreticals != null && matching_theoreticals.Count > 0)
                    {
                        TheoreticalProteoform theoretical =
                            new TheoreticalProteoform(topdown.accession, topdown.name, topdown.sequence,
                                                      matching_theoreticals.First().ExpandedProteinList, topdown.modified_mass, topdown.lysine_count,
                                                      topdown.topdown_ptm_set, true, false, null);
                        theoretical.topdown_theoretical    = true;
                        theoretical.new_topdown_proteoform = true;
                        theoretical.begin = topdown.topdown_begin;
                        theoretical.end   = topdown.topdown_end;
                        foreach (ExperimentalProteoform e in topdown.identify_connected_experimentals(theoretical, topdown.topdown_begin, topdown.topdown_end,
                                                                                                      new PtmSet(topdown.topdown_ptm_set.ptm_combination), null))
                        {
                            identified_experimentals.Add(e);
                        }
                    }
                }
            }

            // The loops below run sequentially on method-local sets, so the locks the original took were no-ops.
            foreach (TheoreticalProteoform theo in theoretical_proteoforms.OrderBy(t => t.topdown_theoretical))
            {
                foreach (ExperimentalProteoform e in theo.identify_connected_experimentals(theo, theo.begin, theo.end, theo.ptm_set, theo.linked_proteoform_references))
                {
                    identified_experimentals.Add(e);
                }
            }

            // Continue looking for new experimental identifications until no more remain to be identified.
            // First round order: smallest |DeltaMass - candidate PTM-set mass| over relations that have a
            // candidate PTM set (1e6 when there is none), then by modified mass.
            List<ExperimentalProteoform> newly_identified_experimentals = new List<ExperimentalProteoform>(identified_experimentals)
                .OrderBy(p => p.relationships.Any(r => r.candidate_ptmset != null)
                              ? p.relationships.Where(r => r.candidate_ptmset != null).Min(r => Math.Abs(r.DeltaMass - r.candidate_ptmset.mass))
                              : 1e6)
                .ThenBy(p => p.modified_mass)
                .ToList();

            // NOTE(review): termination relies on identify_connected_experimentals eventually returning nothing - confirm.
            while (newly_identified_experimentals.Count > 0)
            {
                HashSet<ExperimentalProteoform> tmp_new_experimentals = new HashSet<ExperimentalProteoform>();
                foreach (ExperimentalProteoform id_experimental in newly_identified_experimentals)
                {
                    foreach (ExperimentalProteoform new_e in id_experimental.identify_connected_experimentals(
                                 id_experimental.linked_proteoform_references.First() as TheoreticalProteoform,
                                 id_experimental.begin, id_experimental.end, id_experimental.ptm_set, id_experimental.linked_proteoform_references))
                    {
                        identified_experimentals.Add(new_e);
                        tmp_new_experimentals.Add(new_e);
                    }
                }
                newly_identified_experimentals = new List<ExperimentalProteoform>(tmp_new_experimentals);
            }

            // "accession_sequence_ptms" keys of the top-down proteoforms, split on '_' once here
            // instead of once per comparison inside the parallel loop.
            List<string[]> topdown_id_parts = Sweet.lollipop.topdown_proteoforms
                .Select(p => (p.accession.Split('_')[0].Split('-')[0] + "_" + p.sequence + "_" +
                              string.Join(", ", p.topdown_ptm_set.ptm_combination
                                  .Where(m => m.modification.ModificationType != "Deconvolution Error")
                                  .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
                                  .OrderBy(m => m))).Split('_'))
                .ToList();

            // NOTE(review): set_correct_id runs inside this parallel loop while other iterations read
            // linked_proteoform_references / ptm_set of every experimental - confirm it does not mutate state they read.
            Parallel.ForEach(experimental_proteoforms, e =>
            {
                //determine identified experimentals that are adducts
                //checks if any experimentals have same mods as e's ptmset, except e has additional adduct only mods.
                e.adduct =
                    e.linked_proteoform_references != null &&
                    e.ptm_set.ptm_combination.Any(m => Proteoform.modification_is_adduct(m.modification)) &&
                    experimental_proteoforms.Any(l =>
                                                 l.linked_proteoform_references != null &&
                                                 l.gene_name.get_prefered_name(Lollipop.preferred_gene_label) == e.gene_name.get_prefered_name(Lollipop.preferred_gene_label) &&
                                                 l.ptm_set.ptm_combination.Count < e.ptm_set.ptm_combination.Count &&
                                                 e.ptm_set.ptm_combination.Where(m => l.ptm_set.ptm_combination.Count(p => UnlocalizedModification.LookUpId(p.modification) == UnlocalizedModification.LookUpId(m.modification)) != e.ptm_set.ptm_combination.Count(p => UnlocalizedModification.LookUpId(p.modification) == UnlocalizedModification.LookUpId(m.modification)))
                                                 .Count(p => !Proteoform.modification_is_adduct(p.modification))
                                                 == 0
                                                 );

                if (e is TopDownProteoform topdown_e)
                {
                    topdown_e.set_correct_id();
                }

                // Read after set_correct_id, matching the order of the original checks.
                bool identified = e.linked_proteoform_references != null;
                TheoreticalProteoform linked_theoretical = null;
                string own_ptm_key = null;

                if (identified)
                {
                    linked_theoretical = e.linked_proteoform_references.First() as TheoreticalProteoform;

                    // Sorted ids of all PTMs except "Deconvolution Error" ones; used for PTM comparison below.
                    own_ptm_key = string.Join(", ", e.ptm_set.ptm_combination
                        .Where(m => m.modification.ModificationType != "Deconvolution Error")
                        .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
                        .OrderBy(m => m));

                    string annotation = annotate_database_sites(
                        e.ptm_set.ptm_combination
                            .Where(p => !Proteoform.modification_is_adduct(p.modification))
                            .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).ToList(),
                        e.ptm_set.ptm_combination
                            .Where(ptm => ptm.modification.ModificationType != "Deconvolution Error" && !Proteoform.modification_is_adduct(ptm.modification))
                            .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).ToList(),
                        linked_theoretical, e.begin, e.end, out bool any_unlisted);

                    foreach (var ambig_id in e.ambiguous_identifications)
                    {
                        annotation += " | " + annotate_database_sites(
                            ambig_id.ptm_set.ptm_combination
                                .Where(p => !Proteoform.modification_is_adduct(p.modification))
                                .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).ToList(),
                            ambig_id.ptm_set.ptm_combination
                                .Where(ptm => ptm.modification.ModificationType != "Deconvolution Error" && !Proteoform.modification_is_adduct(ptm.modification))
                                .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).ToList(),
                            ambig_id.theoretical_base, ambig_id.begin, ambig_id.end, out bool ambig_unlisted);
                        any_unlisted |= ambig_unlisted;
                    }

                    e.uniprot_mods = annotation;
                    if (any_unlisted)
                    {
                        e.novel_mods = true;
                    }
                }

                //determine level #
                e.proteoform_level_description = "";
                if (!identified)
                {
                    e.proteoform_level             = 5;
                    e.proteoform_level_description = "Unidentified";
                }
                else
                {
                    //check if accessions had been grouped in constructing the theoretical database
                    bool accessions_were_grouped = linked_theoretical.ExpandedProteinList
                        .SelectMany(p => p.AccessionList)
                        .Select(a => a.Split('_')[0])
                        .Distinct().Count() > 1;

                    if (e.ambiguous_identifications.Count == 0)
                    {
                        if (e.ptm_set.ptm_combination.Count == 0)
                        {
                            e.proteoform_level = 1;
                        }
                        else
                        {
                            e.proteoform_level              = 2;
                            e.proteoform_level_description += "PTM localization ambiguity; ";
                        }

                        if (accessions_were_grouped)
                        {
                            e.proteoform_level             += 1;
                            e.proteoform_level_description += "Gene ambiguity; ";
                        }
                    }
                    else
                    {
                        int distinct_accessions = new List<string>
                            {
                                e.linked_proteoform_references.First().accession.Split('_')[0].Split('-')[0]
                            }
                            .Concat(e.ambiguous_identifications.Select(a => a.theoretical_base.accession.Split('_')[0].Split('-')[0]))
                            .Distinct().Count();

                        int distinct_sequences = new List<string>
                            {
                                ExperimentalProteoform.get_sequence(linked_theoretical, e.begin, e.end)
                            }
                            .Concat(e.ambiguous_identifications.Select(a => ExperimentalProteoform.get_sequence(a.theoretical_base, a.begin, a.end)))
                            .Distinct().Count();

                        // The ambiguous ids are sorted exactly like own_ptm_key; without the sort the same PTM set
                        // listed in a different order was counted as a different PTM assignment.
                        int distinct_ptm_keys = new List<string> { own_ptm_key }
                            .Concat(e.ambiguous_identifications.Select(a => string.Join(", ", a.ptm_set.ptm_combination
                                .Where(m => m.modification.ModificationType != "Deconvolution Error")
                                .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
                                .OrderBy(m => m))))
                            .Distinct().Count();

                        int gene_ambiguity     = (accessions_were_grouped || distinct_accessions > 1) ? 1 : 0;
                        int sequence_ambiguity = distinct_sequences > 1 ? 1 : 0;
                        int PTM_ambiguity      = distinct_ptm_keys > 1 ? 1 : 0;
                        int PTM_location       =
                            (e.ptm_set.ptm_combination.Any(m => m.modification.ModificationType != "Deconvolution Error")
                             || e.ambiguous_identifications.Any(a => a.ptm_set.ptm_combination.Any(m => m.modification.ModificationType != "Deconvolution Error")))
                            ? 1 : 0;

                        e.proteoform_level = 1 + gene_ambiguity + sequence_ambiguity + PTM_ambiguity + PTM_location;
                        if (gene_ambiguity > 0)
                        {
                            e.proteoform_level_description += "Gene ambiguity; ";
                        }
                        if (sequence_ambiguity > 0)
                        {
                            e.proteoform_level_description += "Sequence ambiguity; ";
                        }
                        if (PTM_ambiguity > 0)
                        {
                            e.proteoform_level_description += "PTM identity ambiguity; ";
                        }
                        if (PTM_location > 0)
                        {
                            e.proteoform_level_description += "PTM localization ambiguity; ";
                        }
                    }
                }
                if (e.proteoform_level == 1)
                {
                    e.proteoform_level_description = "Unambiguous";
                }

                //determine if new intact-mass ID
                e.new_intact_mass_id = false;
                if (!e.topdown_id && identified && e.ambiguous_identifications.Count == 0)
                {
                    string this_id = string.Join(",", linked_theoretical.ExpandedProteinList.SelectMany(p => p.AccessionList.Select(a => a.Split('_')[0])).Distinct())
                                     + "_" + ExperimentalProteoform.get_sequence(linked_theoretical, e.begin, e.end)
                                     + "_" + own_ptm_key;

                    // Split once; the comparison itself is unchanged: any grouped accession, same sequence, same PTM part.
                    string[] this_parts      = this_id.Split('_');
                    string[] this_accessions = this_parts[0].Split(',');
                    if (!topdown_id_parts.Any(t => this_accessions.Contains(t[0]) &&
                                              this_parts[1] == t[1] && this_parts[2] == t[2]))
                    {
                        e.new_intact_mass_id = true;
                    }
                }
            });

            if (Sweet.lollipop.remove_bad_connections
                && (theoretical_proteoforms.Count > 0 || (Sweet.lollipop.identify_from_td_nodes && experimental_proteoforms.Any(e => e.topdown_id))))
            {
                Parallel.ForEach(relations, r =>
                {
                    r.Accepted = r.Identification;
                });
            }

            // Builds one uniprot_mods segment: for each distinct id in annotated_ids (sorted), "id @ pos, pos; " with the
            // positions inside [first_residue, last_residue] where reference.ExpandedProteinList offers that modification;
            // "N/A" when no id has such a position. exceeds_database reports whether some id occurs in counted_ids more
            // often than it has positions. Captures nothing, so it can be called from the parallel loop above.
            string annotate_database_sites(List<string> annotated_ids, List<string> counted_ids, TheoreticalProteoform reference,
                                           int first_residue, int last_residue, out bool exceeds_database)
            {
                exceeds_database = false;
                string segment = "";
                foreach (string mod in annotated_ids.Distinct().OrderBy(m => m).ToList())
                {
                    // positions with mod
                    List<int> database_sites = reference.ExpandedProteinList
                        .SelectMany(p => p.OneBasedPossibleLocalizedModifications)
                        .Where(p => p.Key >= first_residue && p.Key <= last_residue &&
                                    p.Value.Select(m => UnlocalizedModification.LookUpId(m)).Contains(mod))
                        .Select(p => p.Key)
                        .ToList();

                    if (database_sites.Count > 0)
                    {
                        segment += mod + " @ " + string.Join(", ", database_sites) + "; ";
                    }
                    if (counted_ids.Count(m => m == mod) > database_sites.Count)
                    {
                        exceeds_database = true;
                    }
                }
                return segment.Length == 0 ? "N/A" : segment;
            }
        }