Example #1
0
 /// <summary>
 /// Copies the aggregation-related members of another experimental proteoform into this instance.
 /// Scalar and reference members are assigned directly; each list member is replaced by a new list
 /// holding the same elements as the source list. The quant member is intentionally left untouched.
 /// </summary>
 /// <param name="e">Experimental proteoform whose values are copied.</param>
 private void copy_aggregate(ExperimentalProteoform e)
 {
     root = e.root;

     // Quant is not copied, on purpose.

     // Aggregate values
     agg_intensity = e.agg_intensity;
     agg_mass = e.agg_mass;
     modified_mass = e.modified_mass;
     agg_rt = e.agg_rt;
     lysine_count = e.lysine_count;
     is_target = e.is_target;
     family = e.family;

     // New list instances, so adding to or removing from one list does not affect the other
     aggregated = new List<IAggregatable>(e.aggregated);
     lt_quant_components = new List<Component>(e.lt_quant_components);
     lt_verification_components = new List<Component>(e.lt_verification_components);
     hv_quant_components = new List<Component>(e.hv_quant_components);
     hv_verification_components = new List<Component>(e.hv_verification_components);
     biorepIntensityList = new List<BiorepIntensity>(e.biorepIntensityList);

     // Manual validation members
     manual_validation_id = e.manual_validation_id;
     manual_validation_verification = e.manual_validation_verification;
     manual_validation_quant = e.manual_validation_quant;
 }
Example #2
0
        /// <summary>
        /// Walks the accepted relations of this proteoform and, for each one that connects to another
        /// experimental proteoform, tries to assign an identity to that experimental proteoform.
        /// </summary>
        /// <returns>
        /// The experimental proteoforms for which assign_pf_identity was called. A proteoform is added
        /// once per relation that led to a call, so it may appear more than once.
        /// </returns>
        public List <ExperimentalProteoform> identify_connected_experimentals()
        {
            List<ExperimentalProteoform> identified = new List<ExperimentalProteoform>();

            // Do relations first closest to candidate ptmset delta mass, then in order of relation delta mass
            // (need to do in same order every round).
            // Duplicates are removed BEFORE sorting: Enumerable.Distinct is documented as returning an unordered
            // sequence, while OrderBy is a stable sort, so sorting last is what guarantees a reproducible order.
            List<ProteoformRelation> ordered_relations = relationships
                .Where(rel => rel.Accepted)
                .Distinct()
                .OrderBy(rel => rel.candidate_ptmset != null ? Math.Abs(rel.candidate_ptmset.mass - rel.DeltaMass) : rel.DeltaMass * 1e6)
                .ToList();

            foreach (ProteoformRelation r in ordered_relations)
            {
                ExperimentalProteoform e = r.connected_proteoforms.OfType<ExperimentalProteoform>().FirstOrDefault(p => p != this);
                if (e == null)
                {
                    continue; // Looking at an ET pair, expecting an EE pair
                }

                // Theoretical starting point, or an experimental with a theoretical reference;
                // null for an experimental whose first reference is not theoretical.
                TheoreticalProteoform theoretical_base =
                    (this as TheoreticalProteoform) ?? (linked_proteoform_references.First() as TheoreticalProteoform);

                double mass_tolerance = modified_mass / 1000000 * Sweet.lollipop.mass_tolerance;
                PtmSet with_mod_change = determine_mod_change(e, this, theoretical_base, r, this.ptm_set);

                // Pick the ptm set to assign: the changed set when one was found, otherwise this proteoform's
                // own set when the peak delta mass is within tolerance, otherwise nothing.
                PtmSet set_to_assign;
                if (with_mod_change != null)
                {
                    set_to_assign = with_mod_change;
                }
                else if (Math.Abs(r.peak.DeltaMass) <= mass_tolerance)
                {
                    set_to_assign = ptm_set;
                }
                else
                {
                    continue;
                }

                lock (r)
                {
                    lock (e)
                    {
                        assign_pf_identity(e, set_to_assign, r, theoretical_base);
                    }
                }
                identified.Add(e);
            }

            return identified;
        }
 /// <summary>
 /// Creates the quantitative values for an experimental proteoform and links the two objects to
 /// each other. Selecting numerator and denominator is not implemented.
 /// </summary>
 /// <param name="eP">Experimental proteoform these quantitative values belong to.</param>
 public QuantitativeProteoformValues(ExperimentalProteoform eP)
 {
     // The proteoform points at these values ...
     eP.quant = this;

     // ... and these values point back at the proteoform.
     this.proteoform = eP;
 }
Example #4
0
 /// <summary>
 /// Copying constructor. Passes the accession, modified mass, lysine count and target flag of
 /// <paramref name="eP"/> to the base constructor, copies the aggregate values through
 /// copy_aggregate, and then creates a new QuantitativeProteoformValues for this instance.
 /// </summary>
 /// <param name="eP">Experimental proteoform to copy.</param>
 public ExperimentalProteoform(ExperimentalProteoform eP)
     : base(eP.accession, eP.modified_mass, eP.lysine_count, eP.is_target)
 {
     this.copy_aggregate(eP);

     // A fresh quant object is built for this instance.
     this.quant = new QuantitativeProteoformValues(this);
 }
Example #5
0
        /// <summary>
        /// Walks the accepted relations of this proteoform and, for each one that connects to another
        /// experimental proteoform, works out the best PTM set addition or loss explaining the delta mass
        /// and assigns the resulting identity to that experimental proteoform.
        /// </summary>
        /// <param name="all_possible_ptmsets">PTM sets considered as candidate losses from this proteoform's PTM set.</param>
        /// <param name="all_mods_with_mass">Modifications forwarded to generate_possible_added_ptmsets.</param>
        /// <returns>
        /// The experimental proteoforms for which assign_pf_identity was called. A proteoform is added
        /// once per relation that led to a call, so it may appear more than once.
        /// </returns>
        public List <ExperimentalProteoform> identify_connected_experimentals(List <PtmSet> all_possible_ptmsets, List <ModificationWithMass> all_mods_with_mass)
        {
            List<ExperimentalProteoform> identified = new List<ExperimentalProteoform>();

            foreach (ProteoformRelation r in relationships.Where(r => r.Accepted).Distinct().ToList())
            {
                ExperimentalProteoform e = r.connected_proteoforms.OfType<ExperimentalProteoform>().FirstOrDefault(p => p != this);
                if (e == null)
                {
                    continue; // Looking at an ET pair, expecting an EE pair
                }

                double mass_tolerance = modified_mass / 1000000 * (double)SaveState.lollipop.mass_tolerance;
                int sign = Math.Sign(e.modified_mass - modified_mass);

                // Give EE relations the correct sign, but don't switch negative ET relation deltaM's
                double deltaM = Math.Sign(r.peak.DeltaMass) < 0 ? r.peak.DeltaMass : sign * r.peak.DeltaMass;

                // Theoretical starting point, or an experimental with a theoretical reference;
                // null for an experimental whose first reference is not theoretical.
                TheoreticalProteoform theoretical_base =
                    (this as TheoreticalProteoform) ?? (linked_proteoform_references.First() as TheoreticalProteoform);
                string theoretical_base_sequence = theoretical_base != null ? theoretical_base.sequence : "";

                PtmSet best_addition = generate_possible_added_ptmsets(r.peak.possiblePeakAssignments, deltaM, mass_tolerance, all_mods_with_mass, theoretical_base, theoretical_base_sequence, 1)
                                       .OrderBy(x => (double)x.ptm_rank_sum + Math.Abs(x.mass - deltaM) * 10E-6) // major score: delta rank; tie breaker: deltaM, where it's always less than 1
                                       .FirstOrDefault();

                PtmSet best_loss = null;
                foreach (PtmSet set in all_possible_ptmsets)
                {
                    // Cheap numeric checks first, so the modification lists are only built for real candidates.
                    bool within_loss_tolerance = deltaM >= -set.mass - mass_tolerance && deltaM <= -set.mass + mass_tolerance;
                    if (!within_loss_tolerance)
                    {
                        continue;
                    }

                    bool better_than_current_best_loss = best_loss == null || Math.Abs(deltaM - (-set.mass)) < Math.Abs(deltaM - (-best_loss.mass));
                    if (!better_than_current_best_loss)
                    {
                        continue;
                    }

                    var these_mods = this.ptm_set.ptm_combination.Select(ptm => ptm.modification).ToList();
                    var those_mods = set.ptm_combination.Select(ptm => ptm.modification).ToList();

                    // All must be in the current set to remove them. Multiplicity counts: a loss of two copies
                    // of a modification is only possible when the current set holds at least two copies.
                    bool can_be_removed = those_mods.All(m =>
                        these_mods.Count(x => object.Equals(x, m)) >= those_mods.Count(x => object.Equals(x, m)));
                    if (can_be_removed)
                    {
                        best_loss = set;
                    }
                }

                if (best_addition == null && best_loss == null)
                {
                    // If they're the same and someone hasn't labeled 0 difference with a "ModificationWithMass", then label it null
                    if (Math.Abs(r.peak.DeltaMass) <= mass_tolerance)
                    {
                        lock (r)
                        {
                            lock (e)
                            {
                                assign_pf_identity(e, this, ptm_set, r, sign, null);
                            }
                        }
                        identified.Add(e);
                    }
                    continue;
                }

                // Make the new ptmset with ptms removed or added
                PtmSet with_mod_change;
                if (best_loss == null)
                {
                    with_mod_change = new PtmSet(new List<Ptm>(this.ptm_set.ptm_combination.Concat(best_addition.ptm_combination).Where(ptm => ptm.modification.monoisotopicMass != 0).ToList()));
                }
                else
                {
                    List<Ptm> new_combo = new List<Ptm>(this.ptm_set.ptm_combination);
                    foreach (Ptm ptm in best_loss.ptm_combination)
                    {
                        // Remove one matching entry per lost ptm
                        new_combo.Remove(new_combo.FirstOrDefault(existing => existing.modification == ptm.modification));
                    }
                    with_mod_change = new PtmSet(new_combo);
                }

                lock (r)
                {
                    lock (e)
                    {
                        assign_pf_identity(e, this, with_mod_change, r, sign, best_loss ?? best_addition);
                    }
                }
                identified.Add(e);
            }

            return identified;
        }
Example #6
0
        /// <summary>
        /// Builds the Cytoscape node table for the given families and theoretical proteoforms and
        /// returns it as a string via get_table_string.
        /// </summary>
        /// <param name="families">Families whose experimental proteoforms become nodes.</param>
        /// <param name="quantitative">When true, four extra columns (numerator condition, denominator condition, significance, pie chart) are added and filled for quantified experimental proteoforms.</param>
        /// <param name="color_scheme">Passed to get_piechart_string for quantified nodes.</param>
        /// <param name="node_label">Passed to get_proteoform_shared_name.</param>
        /// <param name="node_label_position">Not used by this method.</param>
        /// <param name="node_position">Looked up in Lollipop.node_positioning; index 1 selects the spiral ordering, anything else orders all nodes by ascending mass.</param>
        /// <param name="double_rounding">Passed to get_proteoform_shared_name.</param>
        /// <param name="theoreticals">Theoretical proteoforms that become nodes.</param>
        /// <param name="gene_centric_families">When true, one extra node is added per distinct non-null preferred gene name.</param>
        /// <param name="preferred_gene_label">Label used to pick the preferred gene name.</param>
        /// <returns>The node table rendered by get_table_string.</returns>
        public static string get_cytoscape_nodes_tsv(List <ProteoformFamily> families,
                                                     bool quantitative,
                                                     string color_scheme, string node_label, string node_label_position, string node_position, int double_rounding,
                                                     IEnumerable <TheoreticalProteoform> theoreticals, bool gene_centric_families, string preferred_gene_label)
        {
            DataTable node_table = new DataTable();

            node_table.Columns.Add("accession", typeof(string));
            node_table.Columns.Add(proteoform_type_header, typeof(string));
            node_table.Columns.Add(size_header, typeof(double));
            node_table.Columns.Add(tooltip_header, typeof(string));
            node_table.Columns.Add(layout_header, typeof(int));

            if (quantitative)
            {
                node_table.Columns.Add(SaveState.lollipop.numerator_condition, typeof(string));
                node_table.Columns.Add(SaveState.lollipop.denominator_condition, typeof(string));
                node_table.Columns.Add(significant_header, typeof(string));
                node_table.Columns.Add(piechart_header, typeof(string));
            }

            //Choose the layout order
            IEnumerable<Proteoform> layout_order;

            switch (Lollipop.node_positioning.ToList().IndexOf(node_position))
            {
            case 0:     //arbitrary circle
            case 2:     //mass circle
            default:
                layout_order = families.SelectMany(f => f.experimental_proteoforms).OfType<Proteoform>().Concat(theoreticals).OrderBy(p => p.modified_mass);
                break;

            case 1:     //mass-based spiral: theoreticals by descending mass, then experimentals by ascending mass
                layout_order = theoreticals.OrderByDescending(p => p.modified_mass).OfType<Proteoform>().Concat(families.SelectMany(f => f.experimental_proteoforms).OrderBy(p => p.modified_mass));
                break;
            }

            int layout_rank = 1;

            foreach (Proteoform p in layout_order.ToList())
            {
                if (p as TheoreticalProteoform != null)
                {
                    string node_type = String.Equals(p.ptm_description, "unmodified", StringComparison.CurrentCultureIgnoreCase) ? unmodified_theoretical_label : modified_theoretical_label;
                    node_table.Rows.Add(get_proteoform_shared_name(p, node_label, double_rounding), node_type, mock_intensity, "", layout_rank);
                }

                ExperimentalProteoform ep = p as ExperimentalProteoform;
                if (ep != null)
                {
                    string node_type = quantitative && ep.quant.intensitySum == 0 ?
                                       experimental_notQuantified_label :
                                       experimental_label;

                    string total_intensity = quantitative ?
                                             ep.quant.intensitySum == 0 ? mock_intensity : ((double)ep.quant.intensitySum).ToString() :
                                             ep.agg_intensity.ToString();

                    //Set tooltip information. The lysine count is added as its own entry only when it applies,
                    //so the "; " separator is never doubled.
                    List<string> tooltip_parts = new List<string>
                    {
                        "Accession = " + p.accession.ToString(),
                        "Aggregated Mass = " + ep.agg_mass.ToString(),
                        "Aggregated Retention Time = " + ep.agg_rt.ToString(),
                        "Total Intensity = " + total_intensity,
                        "Aggregated Component Count = " + ep.aggregated_components.Count.ToString()
                    };
                    if (SaveState.lollipop.neucode_labeled)
                    {
                        tooltip_parts.Add("Lysine Count = " + p.lysine_count);
                    }
                    tooltip_parts.Add("Abundant Component for Manual Validation of Identification: " + ep.manual_validation_id);
                    tooltip_parts.Add("Abundant Component for Manual Validation of Identification Validation: " + ep.manual_validation_verification);

                    string tooltip = String.Join("; ", tooltip_parts);

                    if (quantitative && ep.quant.intensitySum > 0)
                    {
                        tooltip += "\\n\\nQuantitation Results:" +
                                   String.Join("; ", new string[] {
                            "Q-Value = " + ep.quant.FDR.ToString(),
                            "Log2FC = " + ep.quant.logFoldChange.ToString(),
                            "Variance = " + ep.quant.variance.ToString(),
                            "Significant = " + ep.quant.significant.ToString(),
                            SaveState.lollipop.numerator_condition + " Quantitative Component Count = " + ep.lt_quant_components.Count.ToString(),
                            SaveState.lollipop.denominator_condition + " Quantitative Component Count = " + ep.hv_quant_components.Count.ToString(),
                            "Abundant Component for Manual Validation of Quantification: " + ep.manual_validation_quant
                        });
                    }

                    string shared_name = get_proteoform_shared_name(p, node_label, double_rounding);

                    if (quantitative && ep.quant.intensitySum != 0)
                    {
                        node_table.Rows.Add(shared_name, node_type, total_intensity, tooltip, layout_rank, ((double)ep.quant.lightIntensitySum).ToString(), ((double)ep.quant.heavyIntensitySum).ToString(), ep.quant.significant.ToString(), get_piechart_string(color_scheme));
                    }
                    else if (quantitative)
                    {
                        // Not quantified: the quantitative columns are left as empty strings
                        node_table.Rows.Add(shared_name, node_type, total_intensity, tooltip, layout_rank, "", "", "", "");
                    }
                    else
                    {
                        node_table.Rows.Add(shared_name, node_type, total_intensity, tooltip, layout_rank);
                    }
                }

                // The rank advances for every proteoform in the layout order, whichever branch handled it
                layout_rank++;
            }

            if (gene_centric_families)
            {
                foreach (string gene_name in theoreticals.Select(t => t.gene_name.get_prefered_name(preferred_gene_label)).Distinct())
                {
                    if (gene_name == null)
                    {
                        continue;
                    }

                    if (quantitative)
                    {
                        node_table.Rows.Add(gene_name, gene_name_label, mock_intensity, "Other Gene Names: ", 0, "", "", "", "");
                    }
                    else
                    {
                        node_table.Rows.Add(gene_name, gene_name_label, mock_intensity, "Other Gene Names: ", 0);
                    }
                }
            }

            return get_table_string(node_table);
        }
Example #7
0
        /// <summary>
        /// Tries to give the experimental proteoform <paramref name="e"/> an identification (linked references,
        /// relation, PTM set, begin and end) derived from this proteoform through relation <paramref name="r"/>.
        /// If <paramref name="e"/> is not yet identified, the identification is stored directly. If it already is,
        /// the existing identification is either replaced, kept, or supplemented with an ambiguous identification,
        /// depending on the Sweet.lollipop ambiguity settings.
        /// </summary>
        /// <param name="e">Experimental proteoform receiving the identification.</param>
        /// <param name="set">Candidate PTM set; a trimmed copy of it is what gets stored.</param>
        /// <param name="begin">Candidate begin position before "AminoAcid"/"Missing" adjustments.</param>
        /// <param name="end">Candidate end position before "Missing" adjustments.</param>
        /// <param name="r">Relation through which the identification is made.</param>
        /// <param name="theoretical_base">Theoretical proteoform the identification is based on. It is dereferenced without a null check.</param>
        /// <param name="linked_proteoform_references">Reference chain to copy onto <paramref name="e"/> (with this proteoform appended); when null, the chain becomes just <paramref name="theoretical_base"/>.</param>
        /// <param name="check_ambiguous_IDs">When true, the ambiguous identifications of <paramref name="e"/> are pruned after an assignment, and the ambiguous identifications of this proteoform are carried over to <paramref name="e"/>.</param>
        /// <returns>True when a primary or ambiguous identification was assigned to <paramref name="e"/> during this call (including nested calls).</returns>
        private bool assign_pf_identity(ExperimentalProteoform e, PtmSet set, int begin, int end, ProteoformRelation r, TheoreticalProteoform theoretical_base, List <Proteoform> linked_proteoform_references, bool check_ambiguous_IDs)
        {
            bool identification_assigned = false;

            // Everything in this block is skipped when the ppm tolerance is enabled and the mass error (in ppm of e's mass) is not below it.
            if (!Sweet.lollipop.id_use_ppm_tolerance || Math.Abs(e.calculate_mass_error(theoretical_base, set, begin, end) * 1e6 / e.modified_mass) < Sweet.lollipop.id_ppm_tolerance)
            {
                int new_begin = begin;
                int new_end   = end;

                // Work on a copy of the ptm list so the caller's set is not modified.
                PtmSet     new_set = new PtmSet(new List <Ptm>(set.ptm_combination));
                List <Ptm> remove  = new List <Ptm>();
                //do retention of M first
                // Each "AminoAcid"-type ptm moves the begin one position earlier and is dropped from the set.
                foreach (var mod in new_set.ptm_combination.Where(m => m.modification.ModificationType == "AminoAcid"))
                {
                    new_begin--;
                    remove.Add(mod);
                }

                // "Missing"-type ptms are expressed as a changed begin/end instead of a ptm, when the residue at that terminus matches the ptm's target.
                foreach (var mod in new_set.ptm_combination.Where(m => m.modification.ModificationType == "Missing"))
                {
                    // The begin side is only considered when there is no "AminoAcid" ptm and begin is not before the theoretical's begin.
                    if (!new_set.ptm_combination.Any(m => m.modification.ModificationType == "AminoAcid") && begin >= theoretical_base.begin)
                    {
                        if (theoretical_base.sequence[begin - theoretical_base.begin].ToString() ==
                            mod.modification.Target.ToString())
                        {
                            new_begin++;
                            remove.Add(mod); //dont have in ptmset --> change the begin & end
                        }
                    }
                    // Otherwise try the end side. NOTE(review): unlike the begin side, this index is not range-checked here — confirm callers guarantee it is valid.
                    if (!remove.Contains(mod) && theoretical_base.sequence[end - theoretical_base.begin].ToString() ==
                        mod.modification.Target.ToString())
                    {
                        new_end--;
                        remove.Add(mod);
                    }
                }

                foreach (var ptm in remove)
                {
                    new_set.ptm_combination.Remove(ptm);
                }

                // Rebuild the set from the trimmed ptm list.
                new_set = new PtmSet(new_set.ptm_combination);

                // Case 1: e has no identification yet, so store this one directly.
                if (e.linked_proteoform_references == null)
                {
                    identification_assigned = true;

                    if (linked_proteoform_references != null)
                    {
                        // Copy the incoming chain and append this proteoform as the last link.
                        e.linked_proteoform_references = new List <Proteoform>(linked_proteoform_references);
                        e.linked_proteoform_references.Add(this);
                    }
                    else
                    {
                        e.linked_proteoform_references = new List <Proteoform>()
                        {
                            theoretical_base
                        };
                    }

                    e.relation_to_id = r;
                    e.ptm_set        = new_set;
                    e.begin          = new_begin;
                    e.end            = new_end;


                    if (e.gene_name == null)
                    {
                        e.gene_name = theoretical_base.gene_name;
                    }
                    else
                    {
                        // NOTE(review): the return value is discarded; if this is LINQ's Enumerable.Concat the statement has no effect on e.gene_name — confirm whether a merge was intended.
                        e.gene_name.gene_names.Concat(this.gene_name.gene_names);
                    }
                }
                // Case 2: e already has an identification. Only acted on when an incoming chain exists and does not already contain e.
                else
                {
                    if (linked_proteoform_references != null && !linked_proteoform_references.Contains(e))
                    {
                        // The new ID differs from the current one when the preferred gene name, the sequence over begin..end, or the ptm set differs.
                        bool different_id = e.gene_name.get_prefered_name(Lollipop.preferred_gene_label) !=
                                            theoretical_base.gene_name.get_prefered_name(Lollipop.preferred_gene_label) ||
                                            ExperimentalProteoform.get_sequence(e.linked_proteoform_references.First() as TheoreticalProteoform, e.begin, e.end)
                                            != ExperimentalProteoform.get_sequence(theoretical_base, new_begin, new_end) || !e.ptm_set.same_ptmset(new_set, true);


                        // Non-zero-mass modifications listed for the new base and for the base of the current ID.
                        // NOTE(review): the first linked reference is cast with `as` and dereferenced without a null check.
                        List <Modification> this_known_mods        = theoretical_base.ExpandedProteinList.SelectMany(p => p.OneBasedPossibleLocalizedModifications).SelectMany(kv => kv.Value).Where(v => v.MonoisotopicMass != 0).ToList();
                        List <Modification> previous_id_known_mods = (e.linked_proteoform_references.First() as TheoreticalProteoform).ExpandedProteinList.SelectMany(p => p.OneBasedPossibleLocalizedModifications).SelectMany(kv => kv.Value).Where(v => v.MonoisotopicMass != 0).ToList();
                        // Replacement path. Top-down criterion passes when that setting is off, or the new base is top-down and the current one is not.
                        if (!Sweet.lollipop.topdown_theoretical_reduce_ambiguity || (theoretical_base.topdown_theoretical && !(e.linked_proteoform_references.First() as TheoreticalProteoform).topdown_theoretical))
                        {
                            // Annotated-PTM criterion passes when that setting is off, or every new ptm is an adduct or known for the new base while the same does not hold for the current ID.
                            if (!Sweet.lollipop.annotated_PTMs_reduce_ambiguity ||
                                (new_set.ptm_combination.All(mod1 => modification_is_adduct(mod1.modification) || this_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification))) &&
                                 !e.ptm_set.ptm_combination.All(mod1 => modification_is_adduct(mod1.modification) || previous_id_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification)))))
                            {
                                // Only replace when at least one of the two ambiguity-reduction settings is actually on.
                                if (Sweet.lollipop.topdown_theoretical_reduce_ambiguity || Sweet.lollipop.annotated_PTMs_reduce_ambiguity)
                                {
                                    // Un-mark the relation of the ID being replaced.
                                    if (Sweet.lollipop.remove_bad_connections && different_id) //&& e.relation_to_id != r)
                                    {
                                        e.relation_to_id.Identification     = false;
                                        e.relation_to_id.represented_ptmset = null;
                                    }
                                    // Wipe the current identification of e, including its ambiguous identifications.
                                    e.linked_proteoform_references = null;
                                    e.ptm_set   = new PtmSet(new List <Ptm>());
                                    e.begin     = 0;
                                    e.end       = 0;
                                    e.gene_name = null;
                                    e.ambiguous_identifications.Clear();
                                    ProteoformRelation relation = null;
                                    e.relation_to_id = relation;

                                    //reassign the topdown - based ID
                                    // Same arguments, but e.linked_proteoform_references is now null, so the call goes through the unidentified case above.
                                    return(this.assign_pf_identity(e, set, begin, end, r, theoretical_base, linked_proteoform_references, true));
                                }
                            }
                        }

                        // Keep path: the current ID is top-down and the new one is not, so nothing is done.
                        if (Sweet.lollipop.topdown_theoretical_reduce_ambiguity && (e.linked_proteoform_references.First() as TheoreticalProteoform).topdown_theoretical && !theoretical_base.topdown_theoretical)
                        {
                        }
                        // Keep path: the current ID's ptms are all adducts or known while the new ID's are not, so nothing is done.
                        else if (Sweet.lollipop.annotated_PTMs_reduce_ambiguity &&
                                 !new_set.ptm_combination.All(mod1 => modification_is_adduct(mod1.modification) || this_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification))) &&
                                 e.ptm_set.ptm_combination.All(mod1 => modification_is_adduct(mod1.modification) || previous_id_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification))))
                        {
                        }
                        // Ambiguous path: record the new ID next to the current one when it differs from it.
                        else
                        {
                            if (different_id)
                            {
                                var new_linked_proteoform_references = new List <Proteoform>(linked_proteoform_references);
                                new_linked_proteoform_references.Add(this);

                                AmbiguousIdentification new_id =
                                    new AmbiguousIdentification(new_begin, new_end, new_set, r, theoretical_base, new_linked_proteoform_references);
                                lock (e.ambiguous_identifications)
                                {
                                    // Skip when an ambiguous ID with the same primary gene name, sequence and ptm set is already stored.
                                    if (!e.ambiguous_identifications.Any(p =>
                                                                         p.theoretical_base.gene_name.primary ==
                                                                         new_id.theoretical_base.gene_name.primary &&
                                                                         ExperimentalProteoform.get_sequence(p.theoretical_base, p.begin, p.end) == ExperimentalProteoform.get_sequence(new_id.theoretical_base, new_id.begin, new_id.end) &&
                                                                         p.ptm_set.same_ptmset(new_id.ptm_set, true)))
                                    {
                                        e.ambiguous_identifications.Add(new_id);
                                        identification_assigned = true;
                                    }
                                }
                            }
                        }
                    }
                }
            }


            if (check_ambiguous_IDs)
            {
                //remove bad relations if using td to reduce ambiguity
                if (identification_assigned)
                {
                    List <AmbiguousIdentification> to_remove = new List <AmbiguousIdentification>();
                    // Non-zero-mass modifications listed for the base of e's current primary ID.
                    List <Modification>            previous_id_known_mods = (e.linked_proteoform_references.First() as TheoreticalProteoform).ExpandedProteinList.SelectMany(p => p.OneBasedPossibleLocalizedModifications).SelectMany(kv => kv.Value).Where(m => m.MonoisotopicMass != 0).ToList();
                    // With a top-down base and the top-down setting on, ambiguous IDs that are not top-down are dropped.
                    if (theoretical_base.topdown_theoretical && Sweet.lollipop.topdown_theoretical_reduce_ambiguity)
                    {
                        to_remove.AddRange(e.ambiguous_identifications.Where(id => !id.theoretical_base.topdown_theoretical));
                    }
                    // With the annotated-PTM setting on and a primary ID made only of adducts or known ptms, ambiguous IDs holding any other ptm are dropped.
                    if (Sweet.lollipop.annotated_PTMs_reduce_ambiguity &&
                        e.ptm_set.ptm_combination.All(mod1 => modification_is_adduct(mod1.modification) || previous_id_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification))))
                    {
                        foreach (var ambiguous_id in e.ambiguous_identifications)
                        {
                            List <Modification> ambiguous_id_known_mods = ambiguous_id.theoretical_base.ExpandedProteinList.SelectMany(p => p.OneBasedPossibleLocalizedModifications).SelectMany(kv => kv.Value).Where(m => m.MonoisotopicMass != 0).ToList();
                            if (ambiguous_id.ptm_set.ptm_combination.Any(mod1 => !modification_is_adduct(mod1.modification) && !ambiguous_id_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification))))
                            {
                                to_remove.Add(ambiguous_id);
                            }
                        }
                    }
                    // Apply the removals; the Contains check covers entries that were collected twice.
                    foreach (var x in to_remove)
                    {
                        if (e.ambiguous_identifications.Contains(x))
                        {
                            e.ambiguous_identifications.Remove(x);
                            if (Sweet.lollipop.remove_bad_connections)
                            {
                                // The relation backing the primary ID is left marked.
                                if (e.relation_to_id != x.relation)
                                {
                                    x.relation.Identification     = false;
                                    x.relation.represented_ptmset = null;
                                }
                            }
                        }
                    }
                    // Relations of the surviving ambiguous IDs are marked as identifications.
                    foreach (var x in e.ambiguous_identifications)
                    {
                        x.relation.Identification = true;
                    }
                }


                // When this proteoform is itself an experimental with ambiguous IDs, try to carry each of them over to e.
                if (this as ExperimentalProteoform != null && (this as ExperimentalProteoform).ambiguous_identifications.Count > 0)
                {
                    lock ((this as ExperimentalProteoform).ambiguous_identifications)
                    {
                        int      count       = (this as ExperimentalProteoform).ambiguous_identifications.Count;
                        PtmSet[] new_ptm_set = new PtmSet[count];
                        // The mod change for each ambiguous ID is computed in parallel; each iteration writes only its own array slot.
                        Parallel.For(0, count, i =>
                        {
                            var id         = (this as ExperimentalProteoform).ambiguous_identifications[i];
                            new_ptm_set[i] = determine_mod_change(e, this, id.theoretical_base, r, id.ptm_set, id.begin, id.end);
                        });
                        // The assignments themselves run sequentially, with check_ambiguous_IDs false so these nested calls skip this whole section.
                        for (int i = 0; i < count; i++)
                        {
                            if (new_ptm_set[i] != null)
                            {
                                var id = (this as ExperimentalProteoform).ambiguous_identifications[i];
                                if (assign_pf_identity(e, new_ptm_set[i], id.begin, id.end, r, id.theoretical_base, id.linked_proteoform_references, false))
                                {
                                    identification_assigned = true;
                                }
                            }
                        }
                    }
                }
            }
            return(identification_assigned);
        }
Example #8
0
        private static PtmSet determine_mod_change(ExperimentalProteoform e, Proteoform p,
                                                   TheoreticalProteoform theoretical_base, ProteoformRelation r, PtmSet this_ptmset, int begin, int end)
        {
            double mass_tolerance = p.modified_mass / 1000000 * Sweet.lollipop.mass_tolerance;
            int    sign           = Math.Sign(e.modified_mass - p.modified_mass);
            double deltaM         =
                Math.Sign(r.peak.DeltaMass) < 0
                    ? r.peak.DeltaMass
                    : sign * r.peak
                .DeltaMass;           // give EE relations the correct sign, but don't switch negative ET relation deltaM's


            List <PtmSet> possible_additions = r.peak.possiblePeakAssignments
                                               .Where(peak => Math.Abs(peak.mass - deltaM) <= 1)
                                               .ToList(); // EE relations have PtmSets around both positive and negative deltaM, so remove the ones around the opposite of the deltaM of interest

            PtmSet best_addition = generate_possible_added_ptmsets(possible_additions,
                                                                   Sweet.lollipop.theoretical_database.all_mods_with_mass, theoretical_base, begin, end,
                                                                   this_ptmset, 1, true)
                                   .OrderBy(x =>
                                            (double)x.ptm_rank_sum +
                                            Math.Abs(x.mass - deltaM) *
                                            10E-6) // major score: delta rank; tie breaker: deltaM, where it's always less than 1
                                   .FirstOrDefault();


            List <PtmSet> best_losses = new List <PtmSet>();

            foreach (PtmSet set in r.peak.possiblePeakAssignments)
            //Parallel.ForEach(Sweet.lollipop.theoretical_database.all_possible_ptmsets,  set =>
            {
                bool within_loss_tolerance = deltaM >= -set.mass - mass_tolerance && deltaM <= -set.mass + mass_tolerance;
                if (within_loss_tolerance)
                {
                    List <Modification> these_mods = this_ptmset.ptm_combination.Select(ptm => ptm.modification).ToList();
                    List <Modification> those_mods = set.ptm_combination.Select(ptm => ptm.modification).ToList(); // all must be in the current set to remove them
                    bool can_be_removed            = those_mods.All(m1 => these_mods.Count(m2 =>
                                                                                           UnlocalizedModification.LookUpId(m2) ==
                                                                                           UnlocalizedModification.LookUpId(m1)) >=
                                                                    those_mods.Count(m2 =>
                                                                                     UnlocalizedModification.LookUpId(m2) ==
                                                                                     UnlocalizedModification.LookUpId(m1)));
                    lock (best_losses)
                    {
                        if (can_be_removed && within_loss_tolerance)
                        {
                            best_losses.Add(set);
                        }
                    }
                }
            } //);

            PtmSet best_loss = best_losses.OrderBy(s => Math.Abs(deltaM - (-s.mass))).FirstOrDefault();

            if (best_addition == null && best_loss == null)
            {
                return(null);
            }


            // Make the new ptmset with ptms removed or added
            PtmSet with_mod_change = null;

            if (best_loss == null)
            {
                with_mod_change = new PtmSet(new List <Ptm>(this_ptmset.ptm_combination
                                                            .Concat(best_addition.ptm_combination).Where(ptm => ptm.modification.MonoisotopicMass != 0)
                                                            .ToList()));
            }
            else
            {
                List <Ptm> new_combo = new List <Ptm>(this_ptmset.ptm_combination);
                foreach (Ptm ptm in best_loss.ptm_combination)
                {
                    new_combo.Remove(new_combo.FirstOrDefault(asdf => UnlocalizedModification.LookUpId(asdf.modification) == UnlocalizedModification.LookUpId(ptm.modification)));
                }
                with_mod_change = new PtmSet(new_combo);
            }


            if (r.represented_ptmset == null)
            {
                r.represented_ptmset = best_loss == null ? best_addition : best_loss;
                //if (r.RelationType == ProteoformComparison.ExperimentalExperimental)
                //{
                //    r.DeltaMass *= sign;
                //}
            }

            return(with_mod_change);
        }
Example #9
0
        /// <summary>
        /// Transfers an identification from this proteoform to the connected experimental proteoform
        /// <paramref name="e"/>.
        /// <para>
        /// If <paramref name="e"/> has no linked references yet, it receives this proteoform's reference chain,
        /// begin/end and <paramref name="set"/> (after terminal-residue mods are folded into begin/end).
        /// Otherwise the candidate is compared with e's current identification and, when it differs, recorded in
        /// e.ambiguous_identifications. If this proteoform is itself an ambiguous experimental proteoform, each of
        /// its ambiguous identifications is carried over to e through determine_mod_change. Finally e.uniprot_mods
        /// is rebuilt and e.novel_mods may be set.
        /// </para>
        /// </summary>
        /// <param name="e">Experimental proteoform that receives the identification.</param>
        /// <param name="set">Candidate PTM set for e. Its ptm_combination list is modified in place when terminal-residue mods are stripped.</param>
        /// <param name="r">Relation connecting this proteoform and e; forwarded to determine_mod_change.</param>
        /// <param name="theoretical_base">Theoretical proteoform whose sequence and known modification sites are consulted.</param>
        private void assign_pf_identity(ExperimentalProteoform e, PtmSet set, ProteoformRelation r, TheoreticalProteoform theoretical_base)
        {
            if (e.linked_proteoform_references == null)
            {
                // First identification for e: extend this proteoform's reference chain by this proteoform itself.
                e.linked_proteoform_references = new List<Proteoform>(this.linked_proteoform_references);
                e.linked_proteoform_references.Add(this);

                int new_begin = this.begin;
                int new_end   = this.end;
                e.ptm_set = strip_terminal_residue_mods(set, theoretical_base, theoretical_base, this.begin, this.end, ref new_begin, ref new_end);
                e.begin   = new_begin;
                e.end     = new_end;

                if (e.gene_name == null)
                {
                    e.gene_name = this.gene_name;
                }
                else if (!e.topdown_id)
                {
                    // NOTE(review): the result of Concat is discarded, so (if this is LINQ's Enumerable.Concat)
                    // e.gene_name.gene_names is left unchanged. If merging the names was intended, the result
                    // has to be stored — confirm against the GeneName type before changing.
                    e.gene_name.gene_names.Concat(this.gene_name.gene_names);
                }
            }
            else
            {
                // e is already identified: work out what this path would assign and compare.
                int    begin   = this.begin;
                int    end     = this.end;
                PtmSet ptm_set = strip_terminal_residue_mods(set, theoretical_base, theoretical_base, this.begin, this.end, ref begin, ref end);

                if (e.gene_name.get_prefered_name(Lollipop.preferred_gene_label) !=
                    this.gene_name.get_prefered_name(Lollipop.preferred_gene_label) ||
                    e.begin != begin || e.end != end || !e.ptm_set.same_ptmset(ptm_set, true))
                {
                    e.ambiguous = true;
                    Proteoform linked_proteoform_reference =
                        this.linked_proteoform_references == null || this.linked_proteoform_references.Count == 0
                            ? this
                            : this.linked_proteoform_references.First();
                    Tuple<Proteoform, int, int, PtmSet> new_id =
                        new Tuple<Proteoform, int, int, PtmSet>(linked_proteoform_reference, begin, end, ptm_set);
                    lock (e.ambiguous_identifications)
                    {
                        // Only record identifications that are not already listed.
                        if (!e.ambiguous_identifications.Any(p =>
                                                             p.Item1.gene_name.get_prefered_name(Lollipop.preferred_gene_label) ==
                                                             new_id.Item1.gene_name.get_prefered_name(Lollipop.preferred_gene_label) &&
                                                             p.Item2 == new_id.Item2 && p.Item3 == new_id.Item3 &&
                                                             p.Item4.same_ptmset(new_id.Item4, true)))
                        {
                            e.ambiguous_identifications.Add(new_id);
                        }
                    }
                }
            }

            // Carry this proteoform's own ambiguous identifications over to e.
            if (this is ExperimentalProteoform this_experimental && this_experimental.ambiguous)
            {
                foreach (var id in this.ambiguous_identifications)
                {
                    TheoreticalProteoform id_theoretical_base = id.Item1 as TheoreticalProteoform;
                    int begin = id.Item2;
                    int end   = id.Item3;

                    var ptm_set = determine_mod_change(e, this, id_theoretical_base, r, id.Item4);
                    if (ptm_set == null)
                    {
                        continue;
                    }

                    ptm_set = strip_terminal_residue_mods(ptm_set, id_theoretical_base, id.Item1, id.Item2, id.Item3, ref begin, ref end);

                    lock (e.ambiguous_identifications)
                    {
                        var new_id = new Tuple<Proteoform, int, int, PtmSet>(id.Item1, begin, end, ptm_set);

                        // Skip candidates equal to e's primary identification or to one already recorded.
                        if ((e.gene_name.get_prefered_name(Lollipop.preferred_gene_label) !=
                             new_id.Item1.gene_name.get_prefered_name(Lollipop.preferred_gene_label) ||
                             e.begin != new_id.Item2 || e.end != new_id.Item3 || !e.ptm_set.same_ptmset(new_id.Item4, true)) &&
                            !e.ambiguous_identifications.Any(p =>
                                                             p.Item1.gene_name.get_prefered_name(Lollipop.preferred_gene_label) ==
                                                             new_id.Item1.gene_name.get_prefered_name(Lollipop.preferred_gene_label) &&
                                                             p.Item2 == new_id.Item2 && p.Item3 == new_id.Item3 &&
                                                             p.Item4.same_ptmset(new_id.Item4, true)))
                        {
                            e.ambiguous_identifications.Add(new_id);
                            e.ambiguous = true;
                        }
                    }
                }
            }

            // Rebuild the annotation of modifications that coincide with sites known on the theoretical base.
            e.uniprot_mods = "";
            foreach (string mod in e.ptm_set.ptm_combination.Concat(e.ambiguous_identifications.SelectMany(i => i.Item4.ptm_combination)).Where(ptm => ptm.modification.ModificationType != "Deconvolution Error").Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).ToList().Distinct().OrderBy(m => m))
            {
                // positions within [e.begin, e.end] where the theoretical base lists this mod
                List<int> theo_ptms = theoretical_base.ExpandedProteinList.First()
                                      .OneBasedPossibleLocalizedModifications
                                      .Where(p => p.Key >= e.begin && p.Key <= e.end &&
                                             p.Value.Select(m => UnlocalizedModification.LookUpId(m)).Contains(mod))
                                      .Select(m => m.Key).ToList();
                if (theo_ptms.Count > 0)
                {
                    e.uniprot_mods += mod + " @ " + string.Join(", ", theo_ptms) + "; ";
                }

                // More occurrences of the mod than listed sites: flag as novel.
                if (e.ptm_set.ptm_combination.Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
                    .Count(m => m == mod) > theo_ptms.Count ||
                    e.ambiguous_identifications.Any(i => i.Item4.ptm_combination.Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
                                                    .Count(m => m == mod) > theo_ptms.Count))
                {
                    e.novel_mods = true;
                }
            }
        }

        /// <summary>
        /// Folds terminal-residue modifications of <paramref name="set"/> into begin/end positions.
        /// Every "AminoAcid" mod decrements <paramref name="begin"/>. A "Missing" mod increments
        /// <paramref name="begin"/> when its target equals the residue at index
        /// <c>start - numbering_origin.begin</c>, otherwise decrements <paramref name="end"/> when its target equals
        /// the residue at index <c>stop - start</c>. All mods handled this way are removed from
        /// <c>set.ptm_combination</c> (in place) and a new PtmSet built from the remaining list is returned.
        /// </summary>
        /// <param name="set">PTM set to process; its ptm_combination list is mutated.</param>
        /// <param name="sequence_source">Theoretical proteoform whose sequence is indexed; only dereferenced when a "Missing" mod is present.</param>
        /// <param name="numbering_origin">Proteoform whose begin is subtracted from <paramref name="start"/> for the N-terminal index.</param>
        /// <param name="start">Begin position used for the index computations; fixed for the whole call.</param>
        /// <param name="stop">End position used for the index computations; fixed for the whole call.</param>
        /// <param name="begin">Begin position to adjust.</param>
        /// <param name="end">End position to adjust.</param>
        /// <returns>A new PtmSet constructed from the remaining ptm_combination.</returns>
        private static PtmSet strip_terminal_residue_mods(PtmSet set, TheoreticalProteoform sequence_source, Proteoform numbering_origin, int start, int stop, ref int begin, ref int end)
        {
            List<Ptm> remove = new List<Ptm>();

            //do retention of M first
            foreach (var mod in set.ptm_combination.Where(m => m.modification.ModificationType == "AminoAcid"))
            {
                begin--;
                remove.Add(mod);
            }

            foreach (var mod in set.ptm_combination.Where(m => m.modification.ModificationType == "Missing"))
            {
                string target = mod.modification.Target.ToString();
                if (sequence_source.sequence[start - numbering_origin.begin].ToString() == target)
                {
                    begin++;
                    remove.Add(mod); //dont have in ptmset --> change the begin & end
                }
                // NOTE(review): the C-terminal index is stop - start, not stop - numbering_origin.begin as the
                // N-terminal index would suggest; kept as found — confirm which offset is intended.
                else if (sequence_source.sequence[stop - start].ToString() == target)
                {
                    end--;
                    remove.Add(mod);
                }
            }

            foreach (var ptm in remove)
            {
                set.ptm_combination.Remove(ptm);
            }

            return new PtmSet(set.ptm_combination);
        }
Example #10
0
        /// <summary>
        /// Identifies the experimental proteoforms of this family and annotates every experimental proteoform.
        /// <para>
        /// Identification starts from top-down proteoforms (when Sweet.lollipop.identify_from_td_nodes is set) and
        /// from the theoretical proteoforms, then repeats from each newly identified experimental proteoform until
        /// a round yields none. Afterwards each experimental proteoform gets its adduct flag, UniProt modification
        /// annotation, novel_mods flag, proteoform level with description, and new_intact_mass_id flag.
        /// When Sweet.lollipop.remove_bad_connections is set and there was something to identify from,
        /// each relation's Accepted is set to its Identification.
        /// </para>
        /// </summary>
        public void identify_experimentals()
        {
            HashSet<ExperimentalProteoform> identified_experimentals = new HashSet<ExperimentalProteoform>();

            if (Sweet.lollipop.identify_from_td_nodes)
            {
                foreach (TopDownProteoform topdown in experimental_proteoforms.Where(e => e.topdown_id))
                {
                    // Look up theoreticals by the accession with anything after '_' or '-' stripped.
                    Sweet.lollipop.theoretical_database
                    .theoreticals_by_accession[Sweet.lollipop.target_proteoform_community.community_number]
                    .TryGetValue(topdown.accession.Split('_')[0].Split('-')[0], out var t);
                    if (t != null && t.Count > 0)
                    {
                        // Build a theoretical stand-in for the top-down proteoform to identify from.
                        TheoreticalProteoform theoretical =
                            new TheoreticalProteoform(topdown.accession, topdown.name, topdown.sequence,
                                                      t.First().ExpandedProteinList, topdown.modified_mass, topdown.lysine_count,
                                                      topdown.topdown_ptm_set, true, false, null);
                        theoretical.topdown_theoretical    = true;
                        theoretical.new_topdown_proteoform = true;
                        theoretical.begin = topdown.topdown_begin;
                        theoretical.end   = topdown.topdown_end;
                        foreach (ExperimentalProteoform e in topdown.identify_connected_experimentals(theoretical, topdown.topdown_begin, topdown.topdown_end,
                                                                                                      new PtmSet(topdown.topdown_ptm_set.ptm_combination), null))
                        {
                            identified_experimentals.Add(e);
                        }
                    }
                }
            }

            // This loop is sequential and the set is local to the method, so no locking is needed.
            foreach (TheoreticalProteoform t in theoretical_proteoforms.OrderBy(t => t.topdown_theoretical))
            {
                foreach (ExperimentalProteoform e in t.identify_connected_experimentals(t, t.begin, t.end, t.ptm_set, t.linked_proteoform_references))
                {
                    identified_experimentals.Add(e);
                }
            }

            //Continue looking for new experimental identifications until no more remain to be identified
            // Start with the proteoforms whose relation delta mass is closest to its candidate ptmset mass, then by mass.
            List<ExperimentalProteoform> newly_identified_experimentals = identified_experimentals
                .OrderBy(p => p.relationships.Any(r => r.candidate_ptmset != null)
                             ? p.relationships.Where(r => r.candidate_ptmset != null).Min(r => Math.Abs(r.DeltaMass - r.candidate_ptmset.mass))
                             : 1e6)
                .ThenBy(p => p.modified_mass)
                .ToList();

            while (newly_identified_experimentals.Count > 0)
            {
                HashSet<ExperimentalProteoform> tmp_new_experimentals = new HashSet<ExperimentalProteoform>();
                foreach (ExperimentalProteoform id_experimental in newly_identified_experimentals)
                {
                    foreach (ExperimentalProteoform new_e in id_experimental.identify_connected_experimentals(id_experimental.linked_proteoform_references.First() as TheoreticalProteoform, id_experimental.begin,
                                                                                                              id_experimental.end, id_experimental.ptm_set, id_experimental.linked_proteoform_references))
                    {
                        identified_experimentals.Add(new_e);
                        tmp_new_experimentals.Add(new_e);
                    }
                }
                newly_identified_experimentals = new List<ExperimentalProteoform>(tmp_new_experimentals);
            }

            // "accession_sequence_sortedPtmIds" keys of all top-down proteoforms, used for new_intact_mass_id below.
            List<string> topdown_ids = Sweet.lollipop.topdown_proteoforms
                                       .Select(p => p.accession.Split('_')[0].Split('-')[0] + "_" + p.sequence + "_" + string.Join(", ", p.topdown_ptm_set.ptm_combination.Where(m => m.modification.ModificationType != "Deconvolution Error").Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).OrderBy(m => m))).ToList();

            // NOTE(review): the body below reads other proteoforms' members while set_correct_id() runs on
            // top-down proteoforms in parallel — confirm set_correct_id() does not change what is read here.
            Parallel.ForEach(experimental_proteoforms, e =>
            {
                //determine identified experimentals that are adducts
                //checks if any experimentals have same mods as e's ptmset, except e has additional adduct only mods.
                e.adduct =
                    e.linked_proteoform_references != null &&
                    e.ptm_set.ptm_combination.Any(m => Proteoform.modification_is_adduct(m.modification)) &&
                    experimental_proteoforms.Any(l =>
                                                 l.linked_proteoform_references != null &&
                                                 l.gene_name.get_prefered_name(Lollipop.preferred_gene_label) == e.gene_name.get_prefered_name(Lollipop.preferred_gene_label) &&
                                                 l.ptm_set.ptm_combination.Count < e.ptm_set.ptm_combination.Count &&
                                                 e.ptm_set.ptm_combination.Where(m => l.ptm_set.ptm_combination.Count(p => UnlocalizedModification.LookUpId(p.modification) == UnlocalizedModification.LookUpId(m.modification)) != e.ptm_set.ptm_combination.Count(p => UnlocalizedModification.LookUpId(p.modification) == UnlocalizedModification.LookUpId(m.modification)))
                                                 .Count(p => !Proteoform.modification_is_adduct(p.modification))
                                                 == 0
                                                 );

                if (e is TopDownProteoform topdown_e)
                {
                    topdown_e.set_correct_id();
                }

                // UniProt annotation: primary identification first, then each ambiguous one separated by " | ".
                if (e.linked_proteoform_references != null)
                {
                    e.uniprot_mods = build_uniprot_mods_annotation(e.ptm_set, e.linked_proteoform_references.First() as TheoreticalProteoform,
                                                                   e.begin, e.end, out bool primary_has_novel_mods);
                    if (primary_has_novel_mods)
                    {
                        e.novel_mods = true;
                    }

                    foreach (var ambig_id in e.ambiguous_identifications)
                    {
                        e.uniprot_mods += " | " + build_uniprot_mods_annotation(ambig_id.ptm_set, ambig_id.theoretical_base,
                                                                                ambig_id.begin, ambig_id.end, out bool ambiguous_has_novel_mods);
                        if (ambiguous_has_novel_mods)
                        {
                            e.novel_mods = true;
                        }
                    }
                }

                //determine level #
                e.proteoform_level_description = "";
                if (e.linked_proteoform_references == null)
                {
                    e.proteoform_level             = 5;
                    e.proteoform_level_description = "Unidentified";
                }
                else
                {
                    //check if accessions had been grouped in constructing the theoretical database
                    bool accessions_grouped = (e.linked_proteoform_references.First() as TheoreticalProteoform).ExpandedProteinList
                                              .SelectMany(p => p.AccessionList).Select(a => a.Split('_')[0]).Distinct().Count() > 1;

                    if (e.ambiguous_identifications.Count == 0)
                    {
                        if (e.ptm_set.ptm_combination.Count == 0)
                        {
                            e.proteoform_level = 1;
                        }
                        else
                        {
                            e.proteoform_level              = 2;
                            e.proteoform_level_description += "PTM localization ambiguity; ";
                        }

                        if (accessions_grouped)
                        {
                            e.proteoform_level             += 1;
                            e.proteoform_level_description += "Gene ambiguity; ";
                        }
                    }
                    else
                    {
                        var unique_accessions = new List<string>()
                        {
                            e.linked_proteoform_references.First().accession.Split('_')[0].Split('-')[0]
                        }.Concat(e.ambiguous_identifications.Select(a => a.theoretical_base.accession.Split('_')[0].Split('-')[0])).Distinct();
                        var unique_sequences = new List<string>()
                        {
                            ExperimentalProteoform.get_sequence(e.linked_proteoform_references.First() as TheoreticalProteoform, e.begin, e.end)
                        }.Concat(e.ambiguous_identifications.Select(a => ExperimentalProteoform.get_sequence(a.theoretical_base, a.begin, a.end))).Distinct();

                        // Both sides are sorted before joining so that the same PTM ids in a different order
                        // compare equal and do not count as a PTM identity ambiguity.
                        var unique_PTMs = new List<string>()
                        {
                            string.Join(", ", e.ptm_set.ptm_combination.Where(m => m.modification.ModificationType != "Deconvolution Error").Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).OrderBy(m => m))
                        }.Concat(e.ambiguous_identifications.Select(a => string.Join(", ", a.ptm_set.ptm_combination.Where(m => m.modification.ModificationType != "Deconvolution Error").Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).OrderBy(m => m)))).Distinct();

                        int gene_ambiguity     = unique_accessions.Count() > 1 || accessions_grouped ? 1 : 0;
                        int sequence_ambiguity = unique_sequences.Count() > 1 ? 1 : 0;
                        int PTM_ambiguity      = unique_PTMs.Count() > 1 ? 1 : 0;
                        int PTM_location       = e.ptm_set.ptm_combination.Any(m => m.modification.ModificationType != "Deconvolution Error") ||
                                                 e.ambiguous_identifications.Any(a => a.ptm_set.ptm_combination.Any(m => m.modification.ModificationType != "Deconvolution Error"))
                                                     ? 1
                                                     : 0;

                        e.proteoform_level = 1 + gene_ambiguity + sequence_ambiguity + PTM_ambiguity + PTM_location;
                        if (gene_ambiguity > 0)
                        {
                            e.proteoform_level_description += "Gene ambiguity; ";
                        }
                        if (sequence_ambiguity > 0)
                        {
                            e.proteoform_level_description += "Sequence ambiguity; ";
                        }
                        if (PTM_ambiguity > 0)
                        {
                            e.proteoform_level_description += "PTM identity ambiguity; ";
                        }
                        if (PTM_location > 0)
                        {
                            e.proteoform_level_description += "PTM localization ambiguity; ";
                        }
                    }
                }
                if (e.proteoform_level == 1)
                {
                    e.proteoform_level_description = "Unambiguous";
                }

                //determine if new intact-mass ID
                e.new_intact_mass_id = false;
                if (!e.topdown_id && e.linked_proteoform_references != null && e.ambiguous_identifications.Count == 0)
                {
                    string this_id = string.Join(",", (e.linked_proteoform_references.First() as TheoreticalProteoform).ExpandedProteinList.SelectMany(p => p.AccessionList.Select(a => a.Split('_')[0])).Distinct()) + "_" + ExperimentalProteoform.get_sequence(e.linked_proteoform_references.First() as TheoreticalProteoform, e.begin, e.end) + "_" + string.Join(", ", e.ptm_set.ptm_combination.Where(m => m.modification.ModificationType != "Deconvolution Error").Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).OrderBy(m => m));

                    // Split once instead of once per top-down id; parts are accessions, sequence, PTM ids.
                    string[] this_parts      = this_id.Split('_');
                    string[] this_accessions = this_parts[0].Split(',');
                    if (!topdown_ids.Any(t => this_accessions.Contains(t.Split('_')[0]) &&
                                         this_parts[1] == t.Split('_')[1] && this_parts[2] == t.Split('_')[2]))
                    {
                        e.new_intact_mass_id = true;
                    }
                }
            });

            if (Sweet.lollipop.remove_bad_connections &&
                (theoretical_proteoforms.Count > 0 || (Sweet.lollipop.identify_from_td_nodes && experimental_proteoforms.Any(e => e.topdown_id))))
            {
                Parallel.ForEach(relations, r =>
                {
                    r.Accepted = r.Identification;
                });
            }
        }

        /// <summary>
        /// Builds the UniProt annotation string for one identification: for each distinct non-adduct modification
        /// id (sorted), the positions within [<paramref name="begin"/>, <paramref name="end"/>] at which
        /// <paramref name="theoretical"/>'s expanded proteins list that modification, formatted as
        /// "mod @ p1, p2; ". Returns "N/A" when no modification has a listed position.
        /// </summary>
        /// <param name="ptm_set">PTM set of the identification.</param>
        /// <param name="theoretical">Theoretical proteoform providing the known modification sites; only dereferenced when there is at least one non-adduct mod.</param>
        /// <param name="begin">First position considered (inclusive).</param>
        /// <param name="end">Last position considered (inclusive).</param>
        /// <param name="novel_mods">Set to true when some modification (ignoring "Deconvolution Error" and adduct mods) occurs more often in <paramref name="ptm_set"/> than it has listed positions.</param>
        /// <returns>The annotation text, or "N/A".</returns>
        private static string build_uniprot_mods_annotation(PtmSet ptm_set, TheoreticalProteoform theoretical, int begin, int end, out bool novel_mods)
        {
            novel_mods = false;

            var mods = ptm_set.ptm_combination.Where(p => !Proteoform.modification_is_adduct(p.modification))
                       .Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).ToList().Distinct().OrderBy(m => m).ToList();

            string add = "";
            foreach (string mod in mods)
            {
                // positions with mod
                List<int> theo_ptms = theoretical.ExpandedProteinList.SelectMany(p => p.OneBasedPossibleLocalizedModifications)
                                      .Where(p => p.Key >= begin && p.Key <= end &&
                                             p.Value.Select(m => UnlocalizedModification.LookUpId(m)).Contains(mod))
                                      .Select(m => m.Key).ToList();
                if (theo_ptms.Count > 0)
                {
                    add += mod + " @ " + string.Join(", ", theo_ptms) + "; ";
                }
                if (ptm_set.ptm_combination.Where(ptm => ptm.modification.ModificationType != "Deconvolution Error" && !Proteoform.modification_is_adduct(ptm.modification)).Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
                    .Count(m => m == mod) > theo_ptms.Count)
                {
                    novel_mods = true;
                }
            }

            return add.Length == 0 ? "N/A" : add;
        }