Ejemplo n.º 1
0
 /// <summary>
 /// Decides whether this PTM set and <paramref name="that"/> hold the same modifications, judged by the
 /// identifiers that UnlocalizedModification.LookUpId returns for each modification; order is ignored.
 /// </summary>
 /// <param name="that">The PTM set to compare with this one.</param>
 /// <param name="unlocalized">
 /// True: compare the sorted identifiers joined with ", " (methyl,methyl,methyl = methyl; methyl; methyl, etc).
 /// False: compare how many times each identifier occurs in the two sets.
 /// </param>
 /// <returns>True when the two sets are considered the same.</returns>
 public bool same_ptmset(PtmSet that, bool unlocalized)
 {
     if (!unlocalized)
     {
         List<string> own_ids   = this.ptm_combination.Select(entry => UnlocalizedModification.LookUpId(entry.modification)).ToList();
         List<string> other_ids = that.ptm_combination.Select(entry => UnlocalizedModification.LookUpId(entry.modification)).ToList();

         // With equal totals, matching the occurrence count of every identifier found in this set
         // already guarantees that the other set contains no additional identifiers.
         return own_ids.Count == other_ids.Count
                && own_ids.GroupBy(id => id).All(group => other_ids.Count(id => id == group.Key) == group.Count());
     }

     IEnumerable<string> own_sorted   = ptm_combination.Select(entry => UnlocalizedModification.LookUpId(entry.modification)).OrderBy(id => id);
     IEnumerable<string> other_sorted = that.ptm_combination.Select(entry => UnlocalizedModification.LookUpId(entry.modification)).OrderBy(id => id);
     string own_joined   = string.Join(", ", own_sorted);
     string other_joined = string.Join(", ", other_sorted);
     return own_joined == other_joined;
 }
Ejemplo n.º 2
0
 /// <summary>
 /// Appends a theoretical proteoform to <paramref name="new_theoreticals"/> for every top-down proteoform that
 /// belongs to <paramref name="prot"/>, has exactly the sequence <paramref name="seq"/>, and whose PTM set is not
 /// yet represented (unlocalized comparison) by an entry of <paramref name="new_theoreticals"/>.
 /// </summary>
 /// <param name="prot">Protein whose accession list is matched against the top-down accessions.</param>
 /// <param name="seq">Sequence the top-down proteoform must have.</param>
 /// <param name="accession">Accession prefix for the new theoretical ("_P" plus the counter is appended).</param>
 /// <param name="unmodified_mass">Passed through to the TheoreticalProteoform constructor.</param>
 /// <param name="decoy_number">Negative for targets; otherwise appended to the description as "_DECOY_n".</param>
 /// <param name="lysine_count">Passed through to the TheoreticalProteoform constructor.</param>
 /// <param name="new_theoreticals">List that is checked for existing PTM sets and receives the new entries.</param>
 /// <param name="ptm_set_counter">Starting index for the "_P" suffix. It is incremented locally only; being passed by value, the caller's variable is not changed.</param>
 /// <param name="mod_ranks">Passed to the PtmSet constructor.</param>
 /// <param name="added_ptm_penalty">Passed to the PtmSet constructor.</param>
 public void add_topdown_theoreticals(ProteinWithGoTerms prot, string seq, string accession, double unmodified_mass, int decoy_number, int lysine_count, List <TheoreticalProteoform> new_theoreticals, int ptm_set_counter, Dictionary <double, int> mod_ranks, int added_ptm_penalty)
 {
     // Accessions are compared on their base part: the text before the first '_' (and, for top-down, before the first '-').
     var matching_topdowns = Sweet.lollipop.topdown_proteoforms.Where(candidate =>
                                                                      prot.AccessionList.Select(a => a.Split('_')[0]).Contains(candidate.accession.Split('_')[0].Split('-')[0]) &&
                                                                      candidate.sequence == seq);

     // Ordered by accession, then by descending sequence length --> order matters for creating theoreticals.
     var ordered_topdowns = matching_topdowns.OrderBy(candidate => candidate.accession).ThenByDescending(candidate => candidate.sequence.Length);

     foreach (TopDownProteoform topdown in ordered_topdowns)
     {
         bool already_represented = new_theoreticals.Any(existing => existing.ptm_set.same_ptmset(topdown.topdown_ptm_set, true));
         if (already_represented)
         {
             continue;
         }

         // No existing theoretical explains this top-down PTM set, so make a new theoretical proteoform for it.
         PtmSet ptm_set = new PtmSet(topdown.topdown_ptm_set.ptm_combination, mod_ranks, added_ptm_penalty);

         string counter_suffix = "_P" + ptm_set_counter.ToString();
         string decoy_suffix   = decoy_number < 0 ? "" : "_DECOY_" + decoy_number.ToString();

         // A sequence group contributes all of its member proteins; a plain protein contributes itself.
         ProteinSequenceGroup sequence_group = prot as ProteinSequenceGroup;
         var expanded_proteins = sequence_group != null ? sequence_group.proteinWithGoTermList.ToArray() : new ProteinWithGoTerms[] { prot };

         TheoreticalProteoform topdown_theoretical = new TheoreticalProteoform(
             accession + counter_suffix,
             prot.FullDescription + counter_suffix + decoy_suffix,
             seq,
             expanded_proteins,
             unmodified_mass,
             lysine_count,
             ptm_set,
             decoy_number < 0,
             false,
             theoretical_proteins);
         topdown_theoretical.topdown_theoretical = true;

         new_theoreticals.Add(topdown_theoretical);
         ptm_set_counter++;
     }
 }
Ejemplo n.º 3
0
        /// <summary>
        /// Works out the PTM set that results from applying the mass shift of relation <paramref name="r"/> to
        /// <paramref name="this_ptmset"/>: either the set with the best-ranked addition appended, or, when a set from
        /// the database matches the negative shift and can be taken out, the set with those PTMs removed.
        /// A loss takes precedence over an addition when both are found.
        /// </summary>
        /// <param name="e">Experimental proteoform on the other end of the relation.</param>
        /// <param name="p">Proteoform the change is measured from; supplies the mass, begin, end and PTM set used for candidate generation.</param>
        /// <param name="theoretical_base">Forwarded to generate_possible_added_ptmsets.</param>
        /// <param name="r">Relation being explained. If it has no represented_ptmset yet, that is set here, and the DeltaMass of an experimental-experimental relation is multiplied by the sign of the mass difference.</param>
        /// <param name="this_ptmset">PTM set to add to or remove from.</param>
        /// <returns>The changed PTM set, or null when neither an addition nor a loss was found.</returns>
        private static PtmSet determine_mod_change(ExperimentalProteoform e, Proteoform p, TheoreticalProteoform theoretical_base, ProteoformRelation r, PtmSet this_ptmset)
        {
            double ppm_window = p.modified_mass / 1000000 * Sweet.lollipop.mass_tolerance;
            int    direction  = Math.Sign(e.modified_mass - p.modified_mass);

            // give EE relations the correct sign, but don't switch negative ET relation deltaM's
            double deltaM;
            if (Math.Sign(r.peak.DeltaMass) < 0)
            {
                deltaM = r.peak.DeltaMass;
            }
            else
            {
                deltaM = direction * r.peak.DeltaMass;
            }

            // EE relations have PtmSets around both positive and negative deltaM, so keep only those within 1 of the deltaM of interest
            List <PtmSet> near_deltaM = r.peak.possiblePeakAssignments.Where(assignment => Math.Abs(assignment.mass - deltaM) <= 1).ToList();

            // major score: delta rank; tie breaker: deltaM, where it's always less than 1
            var addition_candidates = generate_possible_added_ptmsets(near_deltaM, Sweet.lollipop.theoretical_database.all_mods_with_mass, theoretical_base, p.begin, p.end, p.ptm_set, 1, true);
            PtmSet best_addition = addition_candidates
                                   .OrderBy(candidate => (double)candidate.ptm_rank_sum + Math.Abs(candidate.mass - deltaM) * 10E-6)
                                   .FirstOrDefault();

            // Search for the removable set whose negated mass lies closest to deltaM, within the ppm window.
            PtmSet best_loss = null;
            foreach (PtmSet removable in Sweet.lollipop.theoretical_database.all_possible_ptmsets)
            {
                double loss_mass = -removable.mass;
                bool   in_window = deltaM >= loss_mass - ppm_window && deltaM <= loss_mass + ppm_window;

                List <Modification> current_mods = this_ptmset.ptm_combination.Select(entry => entry.modification).ToList();
                List <Modification> mods_to_drop = removable.ptm_combination.Select(entry => entry.modification).ToList();

                // # of each mod (by OriginalId) in the current set must be greater than or equal to # in the set to remove.
                bool all_present = mods_to_drop.All(wanted =>
                                                    current_mods.Count(have => have.OriginalId == wanted.OriginalId) >=
                                                    mods_to_drop.Count(other => other.OriginalId == wanted.OriginalId));

                bool closer_than_best = best_loss == null || Math.Abs(deltaM - loss_mass) < Math.Abs(deltaM - (-best_loss.mass));

                if (all_present && in_window && closer_than_best)
                {
                    best_loss = removable;
                }
            }

            // Make the new ptmset with ptms removed or added
            PtmSet with_mod_change;
            if (best_loss != null)
            {
                List <Ptm> remaining = new List <Ptm>(this_ptmset.ptm_combination);
                foreach (Ptm lost in best_loss.ptm_combination)
                {
                    remaining.Remove(remaining.FirstOrDefault(kept => kept.modification.Equals(lost.modification)));
                }
                with_mod_change = new PtmSet(remaining);
            }
            else if (best_addition != null)
            {
                // Zero-mass modifications are not carried into the combined set.
                with_mod_change = new PtmSet(new List <Ptm>(this_ptmset.ptm_combination.Concat(best_addition.ptm_combination).Where(entry => entry.modification.MonoisotopicMass != 0)));
            }
            else
            {
                return(null);
            }

            // Record what this relation represents the first time it gets explained.
            if (r.represented_ptmset == null)
            {
                r.represented_ptmset = best_loss ?? best_addition;
                if (r.RelationType == ProteoformComparison.ExperimentalExperimental)
                {
                    r.DeltaMass *= direction;
                }
            }

            return(with_mod_change);
        }
 /// <summary>
 /// Creates an ambiguous identification by storing each supplied value in the member of the same name.
 /// </summary>
 /// <param name="begin">Stored in <c>begin</c>.</param>
 /// <param name="end">Stored in <c>end</c>.</param>
 /// <param name="ptm_set">Stored in <c>ptm_set</c>.</param>
 /// <param name="relation">Stored in <c>relation</c>.</param>
 /// <param name="theoretical_base">Stored in <c>theoretical_base</c>.</param>
 /// <param name="linked_proteoform_references">Stored in <c>linked_proteoform_references</c>; the list is kept by reference, not copied.</param>
 public AmbiguousIdentification(int begin, int end, PtmSet ptm_set, ProteoformRelation relation, TheoreticalProteoform theoretical_base, List <Proteoform> linked_proteoform_references)
 {
     // Assignments follow the parameter order.
     this.begin = begin;
     this.end = end;
     this.ptm_set = ptm_set;
     this.relation = relation;
     this.theoretical_base = theoretical_base;
     this.linked_proteoform_references = linked_proteoform_references;
 }
Ejemplo n.º 5
0
        /// <summary>
        /// Returns the difference between this proteoform's aggregated mass (<c>agg_mass</c>) and the mass calculated for
        /// the stretch <paramref name="begin"/>..<paramref name="end"/> of theoretical <paramref name="t"/> carrying
        /// <paramref name="ptm_set"/>, rounded to 4 decimals.
        /// </summary>
        /// <param name="t">Theoretical proteoform supplying the sequence and its own begin position.</param>
        /// <param name="ptm_set">PTMs included in the theoretical mass.</param>
        /// <param name="begin">First position of the stretch, in the same numbering as <c>t.begin</c>.</param>
        /// <param name="end">Last position of the stretch.</param>
        /// <returns><c>agg_mass</c> minus the theoretical mass, rounded to 4 decimal places.</returns>
        public double calculate_mass_error(TheoreticalProteoform t, PtmSet ptm_set, int begin, int end)
        {
            bool starts_before_theoretical = begin < t.begin;

            int start_index;
            int first_position;
            if (starts_before_theoretical)
            {
                start_index    = 0;
                first_position = t.begin;
            }
            else
            {
                start_index    = begin - t.begin;
                first_position = begin;
            }

            string sequence = t.sequence.Substring(start_index, 1 + end - first_position);

            // A stretch starting before the theoretical gets an "M" prepended
            // (presumably the initiator methionine absent from t.sequence -- confirm).
            if (starts_before_theoretical)
            {
                sequence = "M" + sequence;
            }

            double theoretical_mass = TheoreticalProteoform.CalculateProteoformMass(sequence, ptm_set.ptm_combination);
            double mass_error       = agg_mass - theoretical_mass;

            return(Math.Round(mass_error, 4));
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Walks the accepted relations of this proteoform and tries to assign an identity to the experimental
        /// proteoform on the other end of each one, using either the PTM set changed by the relation's mass shift or,
        /// when no change is found and the peak delta mass is within tolerance, <paramref name="ptm_set"/> unchanged.
        /// </summary>
        /// <param name="theoretical_base">Forwarded to determine_mod_change and assign_pf_identity.</param>
        /// <param name="begin">Forwarded to determine_mod_change and assign_pf_identity.</param>
        /// <param name="end">Forwarded to determine_mod_change and assign_pf_identity.</param>
        /// <param name="ptm_set">PTM set the mass change is applied to.</param>
        /// <param name="linked_proteoform_references">Forwarded to assign_pf_identity.</param>
        /// <returns>The experimental proteoforms for which assign_pf_identity returned true.</returns>
        public List <ExperimentalProteoform> identify_connected_experimentals(TheoreticalProteoform theoretical_base, int begin, int end, PtmSet ptm_set, List <Proteoform> linked_proteoform_references)
        {
            List <ExperimentalProteoform> identified = new List <ExperimentalProteoform>();

            //do relations first closest to candidate ptmset delta mass, then in order of relation delta mass (need to do in same order every round)
            var relations_in_order = relationships
                                     .Where(rel => rel.Accepted)
                                     .OrderBy(rel => rel.candidate_ptmset != null ? Math.Abs(rel.candidate_ptmset.mass - rel.DeltaMass) : rel.DeltaMass * 1e6)
                                     .Distinct()
                                     .ToList();

            foreach (ProteoformRelation r in relations_in_order)
            {
                // The partner must be an experimental proteoform other than this one; otherwise skip the relation
                // (looking at an ET pair, expecting an EE pair).
                ExperimentalProteoform e = r.connected_proteoforms.OfType <ExperimentalProteoform>().FirstOrDefault(partner => partner != this);
                if (e == null)
                {
                    continue;
                }

                double mass_tolerance  = modified_mass / 1000000 * Sweet.lollipop.mass_tolerance;
                PtmSet with_mod_change = determine_mod_change(e, this, theoretical_base, r, ptm_set, begin, end);

                PtmSet set_to_assign = with_mod_change;
                if (set_to_assign == null)
                {
                    // No modification change explains the relation: only a delta mass within tolerance
                    // lets the partner inherit the unchanged PTM set.
                    if (!(Math.Abs(r.peak.DeltaMass) <= mass_tolerance))
                    {
                        continue;
                    }
                    set_to_assign = ptm_set;
                }

                lock (r)
                {
                    lock (e)
                    {
                        lock (this)
                        {
                            bool assigned = assign_pf_identity(e, set_to_assign, begin, end, r, theoretical_base, linked_proteoform_references, true);
                            if (assigned)
                            {
                                r.Identification = true;
                                identified.Add(e);
                            }
                        }
                    }
                }
            }

            return(identified);
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Creates a relation between two proteoforms, gives it an instance id, picks the best candidate PTM set for
        /// experimental-theoretical/decoy relations, and determines whether the delta mass falls outside "no man's land".
        /// </summary>
        /// <param name="pf1">First connected proteoform; also the source of <c>lysine_count</c> when the data are NeuCode labeled.</param>
        /// <param name="pf2">Second connected proteoform; candidate PTM sets are only generated when it is a TheoreticalProteoform.</param>
        /// <param name="relation_type">Kind of comparison this relation represents.</param>
        /// <param name="delta_mass">Mass difference stored in <c>DeltaMass</c>.</param>
        /// <param name="current_directory">Directory containing "elements.dat", loaded the first time the CH2/HPO3 formulas are needed.</param>
        public ProteoformRelation(Proteoform pf1, Proteoform pf2, ProteoformComparison relation_type, double delta_mass, string current_directory)
        {
            connected_proteoforms[0] = pf1;
            connected_proteoforms[1] = pf2;
            RelationType             = relation_type;
            DeltaMass = delta_mass;

            // Read and advance the shared counter under the same lock. Reading it before taking the lock
            // allowed two relations constructed concurrently to receive the same InstanceId.
            lock (SaveState.lollipop)
            {
                InstanceId       = instanceCounter;
                instanceCounter += 1;
            }

            // Lazily prepare the two formulas used for the no-man's-land bounds below.
            if (CH2 == null || HPO3 == null)
            {
                Loaders.LoadElements(Path.Combine(current_directory, "elements.dat"));
                CH2  = ChemicalFormula.ParseFormula("C1 H2");
                HPO3 = ChemicalFormula.ParseFormula("H1 O3 P1");
            }

            if (SaveState.lollipop.neucode_labeled)
            {
                lysine_count = pf1.lysine_count;
            }

            // Candidate PTM sets are looked up by the delta mass rounded to one decimal, then narrowed to within 0.05.
            if ((relation_type == ProteoformComparison.ExperimentalTheoretical || relation_type == ProteoformComparison.ExperimentalDecoy) &&
                SaveState.lollipop.theoretical_database.possible_ptmset_dictionary.TryGetValue(Math.Round(delta_mass, 1), out List <PtmSet> candidate_sets) &&
                pf2 as TheoreticalProteoform != null)
            {
                TheoreticalProteoform t      = pf2 as TheoreticalProteoform;
                double        mass_tolerance = t.modified_mass / 1000000 * (double)SaveState.lollipop.mass_tolerance;
                List <PtmSet> narrower_range_of_candidates = candidate_sets.Where(s => Math.Abs(s.mass - delta_mass) < 0.05).ToList();
                candidate_ptmset = t.generate_possible_added_ptmsets(narrower_range_of_candidates, delta_mass, mass_tolerance, SaveState.lollipop.theoretical_database.all_mods_with_mass, t, t.sequence, SaveState.lollipop.mod_rank_first_quartile)
                                   .OrderBy(x => x.ptm_rank_sum + Math.Abs(Math.Abs(x.mass) - Math.Abs(delta_mass)) * 10E-6) // major score: delta rank; tie breaker: deltaM, where it's always less than 1
                                   .FirstOrDefault();
            }

            // Start the model (0 Da) at the mass defect of CH2 or HPO3 itself, allowing the peak width tolerance on either side.
            // Both bounds scale with |delta_mass|, but never use less than the mass of the formula itself.
            double half_peak_width = RelationType == ProteoformComparison.ExperimentalTheoretical || RelationType == ProteoformComparison.ExperimentalDecoy ?
                                     SaveState.lollipop.peak_width_base_et / 2 :
                                     SaveState.lollipop.peak_width_base_ee / 2;
            double low_decimal_bound  = half_peak_width + ((CH2.MonoisotopicMass - Math.Truncate(CH2.MonoisotopicMass)) / CH2.MonoisotopicMass) * (Math.Abs(delta_mass) <= CH2.MonoisotopicMass ? CH2.MonoisotopicMass : Math.Abs(delta_mass));
            double high_decimal_bound = 1 - half_peak_width + ((HPO3.MonoisotopicMass - Math.Ceiling(HPO3.MonoisotopicMass)) / HPO3.MonoisotopicMass) * (Math.Abs(delta_mass) <= HPO3.MonoisotopicMass ? HPO3.MonoisotopicMass : Math.Abs(delta_mass));
            double delta_mass_decimal = Math.Abs(delta_mass - Math.Truncate(delta_mass));

            // Outside no man's land when the decimal part is at or below the low bound, at or above the high bound,
            // or when the bounds have met or crossed so that no forbidden interval remains.
            outside_no_mans_land = delta_mass_decimal <= low_decimal_bound || delta_mass_decimal >= high_decimal_bound ||
                                   high_decimal_bound <= low_decimal_bound;
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Collects the bottom-up spectrum matches stored for an accession that lie within
        /// <paramref name="begin"/>..<paramref name="end"/> and whose modifications are all available, with sufficient
        /// multiplicity, in <paramref name="ptm_set"/> (modifications are compared via UnlocalizedModification.LookUpId).
        /// </summary>
        /// <param name="accession">Accession; only the part before the first '_' and then before the first '-' is used as the lookup key.</param>
        /// <param name="ptm_set">PTM set the matches must be compatible with.</param>
        /// <param name="begin">Lowest allowed begin position of a match.</param>
        /// <param name="end">Highest allowed end position of a match.</param>
        /// <returns>The compatible matches, ordered by descending number of modifications; empty when none are stored.</returns>
        public static List <SpectrumMatch> get_possible_PSMs(string accession, PtmSet ptm_set, int begin, int end)
        {
            List <SpectrumMatch> compatible = new List <SpectrumMatch>();

            //add BU PSMs
            bool found = Sweet.lollipop.theoretical_database.bottom_up_psm_by_accession.TryGetValue(accession.Split('_')[0].Split('-')[0], out var stored_psms);
            if (found && stored_psms != null)
            {
                compatible.AddRange(stored_psms.Where(psm =>
                {
                    if (!(psm.begin >= begin && psm.end <= end))
                    {
                        return false;
                    }

                    // Each modification of the match must occur in ptm_set at least as often as in the match itself.
                    return psm.ptm_list.All(psm_mod =>
                    {
                        int available = ptm_set.ptm_combination.Count(set_mod =>
                                                                      UnlocalizedModification.LookUpId(psm_mod.modification) ==
                                                                      UnlocalizedModification.LookUpId(set_mod.modification));
                        int required = psm.ptm_list.Count(other_mod =>
                                                          UnlocalizedModification.LookUpId(psm_mod.modification) ==
                                                          UnlocalizedModification.LookUpId(other_mod.modification));
                        return available >= required;
                    });
                }));
            }

            return(compatible.OrderByDescending(psm => psm.ptm_list.Count).ToList());
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Counts how many consecutive residues at one end of <paramref name="seq"/> are covered by the motifs of the
        /// "Missing"-type modifications in <paramref name="set"/>.
        /// </summary>
        /// <param name="seq">Sequence to scan.</param>
        /// <param name="set">PTM set whose modifications of type "Missing" supply the residue motifs.</param>
        /// <param name="from_beginning">True to scan from the first residue forwards; false to scan from the last residue backwards.</param>
        /// <returns>The length of the leading (or trailing) run of residues matching a missing motif; 0 when the set has no "Missing" modifications.</returns>
        private int degraded_aas_count(string seq, PtmSet set, bool from_beginning)
        {
            List <string> missing_aas = set.ptm_combination.Select(ptm => ptm.modification).Where(m => m.modificationType == "Missing").Select(m => m.motif.Motif).ToList();

            if (missing_aas.Count == 0)
            {
                return(0);
            }

            IEnumerable <char> residues = from_beginning ? seq.ToCharArray() : seq.ToCharArray().Reverse();

            // Upper-case with the invariant culture: residue letters are identifiers, and culture-sensitive casing
            // (e.g. 'i' under a Turkish culture) would fail to match the motif "I".
            return(residues.TakeWhile(c => missing_aas.Contains(c.ToString().ToUpperInvariant())).Count());
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Walks the accepted relations of this proteoform and assigns an identity to the experimental proteoform on the
        /// other end of each one, using either this proteoform's PTM set changed by the relation's mass shift or, when
        /// no change is found and the peak delta mass is within tolerance, the PTM set unchanged.
        /// </summary>
        /// <returns>The experimental proteoforms that were assigned an identity, in processing order.</returns>
        public List <ExperimentalProteoform> identify_connected_experimentals()
        {
            List <ExperimentalProteoform> identified = new List <ExperimentalProteoform>();

            //do relations first closest to candidate ptmset delta mass, then in order of relation delta mass (need to do in same order every round)
            var relations_in_order = relationships
                                     .Where(rel => rel.Accepted)
                                     .OrderBy(rel => rel.candidate_ptmset != null ? Math.Abs(rel.candidate_ptmset.mass - rel.DeltaMass) : rel.DeltaMass * 1e6)
                                     .Distinct()
                                     .ToList();

            foreach (ProteoformRelation r in relations_in_order)
            {
                // The partner must be an experimental proteoform other than this one; otherwise skip the relation
                // (looking at an ET pair, expecting an EE pair).
                ExperimentalProteoform e = r.connected_proteoforms.OfType <ExperimentalProteoform>().FirstOrDefault(partner => partner != this);
                if (e == null)
                {
                    continue;
                }

                // Theoretical starting point if this is a theoretical; otherwise the first linked reference when that
                // is a theoretical; otherwise null (experimental without theoretical reference).
                TheoreticalProteoform theoretical_base = (this as TheoreticalProteoform) ?? (linked_proteoform_references.First() as TheoreticalProteoform);

                double mass_tolerance  = modified_mass / 1000000 * Sweet.lollipop.mass_tolerance;
                PtmSet with_mod_change = determine_mod_change(e, this, theoretical_base, r, this.ptm_set);

                // Without a modification change, only a delta mass within tolerance lets the partner
                // inherit this proteoform's PTM set as is.
                if (with_mod_change == null && !(Math.Abs(r.peak.DeltaMass) <= mass_tolerance))
                {
                    continue;
                }

                lock (r)
                {
                    lock (e)
                    {
                        assign_pf_identity(e, with_mod_change ?? ptm_set, r, theoretical_base);
                    }
                }
                identified.Add(e);
            }

            return(identified);
        }
Ejemplo n.º 11
0
        }                                                //not in DB without topdown result loaded in...

        #endregion Public Properties

        #region Public Constructor

        /// <summary>
        /// Builds a theoretical proteoform from a sequence, the proteins that share it, and a PTM set.
        /// </summary>
        /// <param name="accession">Accession of the proteoform; the part before the first '_' is used for the contaminant check.</param>
        /// <param name="description">Description; when it has at least three '|'-separated fields, only the third is kept.</param>
        /// <param name="sequence">Sequence used for the modified-mass calculation.</param>
        /// <param name="expanded_protein_list">Proteins represented by this proteoform. Enumerated exactly once. Must contain at least one protein, and every protein must have at least one proteolysis product (otherwise a NullReferenceException results).</param>
        /// <param name="unmodified_mass">Mass without PTMs.</param>
        /// <param name="lysine_count">Passed to the base constructor.</param>
        /// <param name="ptm_set">PTMs carried by this proteoform.</param>
        /// <param name="is_target">Passed to the base constructor.</param>
        /// <param name="check_contaminants">When true, <c>contaminant</c> is set by looking the accession up in the contaminant databases.</param>
        /// <param name="theoretical_proteins">Proteins per input file; only read when <paramref name="check_contaminants"/> is true.</param>
        public TheoreticalProteoform(string accession, string description, string sequence, IEnumerable <ProteinWithGoTerms> expanded_protein_list, double unmodified_mass, int lysine_count, PtmSet ptm_set, bool is_target, bool check_contaminants, Dictionary <InputFile, Protein[]> theoretical_proteins)
            : base(accession, unmodified_mass + ptm_set.mass, lysine_count, is_target)
        {
            // Materialize the parameter once and use the list from here on: a deferred query passed by the caller
            // would otherwise be re-evaluated for every property below and could yield inconsistent values.
            List <ProteinWithGoTerms> proteins = expanded_protein_list.ToList();
            string[] description_fields = description.Split('|');

            this.linked_proteoform_references = new List <Proteoform>();
            this.ExpandedProteinList          = proteins;
            this.accession   = accession;
            this.description = description_fields.Length >= 3 ? description_fields[2] : description;
            this.name        = string.Join(";", proteins.Select(p => p.Name).Distinct());
            this.fragment    = string.Join(";", proteins.Select(p => p.ProteolysisProducts.FirstOrDefault().Type).Distinct());

            // Begin and end come from the first proteolysis product of the first protein.
            var first_product = proteins.FirstOrDefault().ProteolysisProducts.FirstOrDefault();
            this.begin = (int)first_product.OneBasedBeginPosition;
            this.end   = (int)first_product.OneBasedEndPosition;

            this.sequence        = sequence;
            this.goTerms         = proteins.SelectMany(p => p.GoTerms).Distinct().ToList();
            goTerm_IDs           = string.Join("; ", goTerms.Select(g => g.Id));
            this.gene_name       = new GeneName(proteins.SelectMany(t => t.GeneNames).ToList());
            this.ptm_set         = ptm_set;
            this.unmodified_mass = unmodified_mass;

            if (check_contaminants)
            {
                // Compare on the accession without any "_..." suffix.
                string base_accession = this.accession.Split(new char[] { '_' })[0];
                this.contaminant = theoretical_proteins.Where(item => item.Key.ContaminantDB).SelectMany(kv => kv.Value).Any(p => p.Accession == base_accession);
            }

            // Replaces the mass handed to the base constructor with the one calculated from sequence and PTMs.
            this.modified_mass = CalculateProteoformMass(sequence, ptm_set.ptm_combination);
            bottom_up_PSMs     = get_possible_PSMs(accession, ptm_set, begin, end);
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Walks the accepted relations of this proteoform and assigns an identity to the experimental proteoform on the
        /// other end of each one. For every relation the best-ranked PTM addition and the closest removable PTM set
        /// (a loss) are determined; a loss takes precedence over an addition when both are found.
        /// </summary>
        /// <param name="all_possible_ptmsets">PTM sets considered as possible losses.</param>
        /// <param name="all_mods_with_mass">Forwarded to generate_possible_added_ptmsets.</param>
        /// <returns>The experimental proteoforms that were assigned an identity, in processing order.</returns>
        public List <ExperimentalProteoform> identify_connected_experimentals(List <PtmSet> all_possible_ptmsets, List <ModificationWithMass> all_mods_with_mass)
        {
            List <ExperimentalProteoform> identified = new List <ExperimentalProteoform>();

            foreach (ProteoformRelation r in relationships.Where(r => r.Accepted).Distinct().ToList())
            {
                ExperimentalProteoform e = r.connected_proteoforms.OfType <ExperimentalProteoform>().FirstOrDefault(p => p != this);
                if (e == null)
                {
                    continue;            // Looking at an ET pair, expecting an EE pair
                }
                double mass_tolerance = modified_mass / 1000000 * (double)SaveState.lollipop.mass_tolerance;
                int    sign           = Math.Sign(e.modified_mass - modified_mass);
                double deltaM         = Math.Sign(r.peak.DeltaMass) < 0 ? r.peak.DeltaMass : sign * r.peak.DeltaMass;     // give EE relations the correct sign, but don't switch negative ET relation deltaM's
                TheoreticalProteoform theoretical_base = this as TheoreticalProteoform != null ?
                                                         this as TheoreticalProteoform :                                  //Theoretical starting point
                                                         (linked_proteoform_references.First() as TheoreticalProteoform != null ?
                                                          linked_proteoform_references.First() as TheoreticalProteoform : //Experimental with theoretical reference
                                                          null);                                                          //Experimental without theoretical reference
                string theoretical_base_sequence = theoretical_base != null ? theoretical_base.sequence : "";

                PtmSet best_addition = generate_possible_added_ptmsets(r.peak.possiblePeakAssignments, deltaM, mass_tolerance, all_mods_with_mass, theoretical_base, theoretical_base_sequence, 1)
                                       .OrderBy(x => (double)x.ptm_rank_sum + Math.Abs(x.mass - deltaM) * 10E-6) // major score: delta rank; tie breaker: deltaM, where it's always less than 1
                                       .FirstOrDefault();

                PtmSet best_loss = null;
                foreach (PtmSet set in all_possible_ptmsets)
                {
                    bool within_loss_tolerance = deltaM >= -set.mass - mass_tolerance && deltaM <= -set.mass + mass_tolerance;
                    var  these_mods            = this.ptm_set.ptm_combination.Select(ptm => ptm.modification).ToList();
                    var  those_mods            = set.ptm_combination.Select(ptm => ptm.modification).ToList();

                    // Every modification must be present at least as many times as the loss requires. A plain
                    // Contains check would accept removing two copies of a modification that occurs only once.
                    bool can_be_removed = those_mods.All(m1 =>
                                                         these_mods.Count(m2 => object.Equals(m2, m1)) >=
                                                         those_mods.Count(m2 => object.Equals(m2, m1)));
                    bool better_than_current_best_loss = best_loss == null || Math.Abs(deltaM - (-set.mass)) < Math.Abs(deltaM - (-best_loss.mass));
                    if (can_be_removed && within_loss_tolerance && better_than_current_best_loss)
                    {
                        best_loss = set;
                    }
                }

                if (best_addition == null && best_loss == null)
                {
                    // If they're the same and someone hasn't labeled 0 difference with a "ModificationWithMass", then label it null
                    if (Math.Abs(r.peak.DeltaMass) <= mass_tolerance)
                    {
                        lock (r) lock (e) assign_pf_identity(e, this, ptm_set, r, sign, null);
                        identified.Add(e);
                    }
                    continue;
                }

                // Make the new ptmset with ptms removed or added
                PtmSet with_mod_change = null;
                if (best_loss == null)
                {
                    // Zero-mass modifications are not carried into the combined set.
                    with_mod_change = new PtmSet(new List <Ptm>(this.ptm_set.ptm_combination.Concat(best_addition.ptm_combination).Where(ptm => ptm.modification.monoisotopicMass != 0).ToList()));
                }
                else
                {
                    List <Ptm> new_combo = new List <Ptm>(this.ptm_set.ptm_combination);
                    foreach (Ptm ptm in best_loss.ptm_combination)
                    {
                        // Use the same equality as the can_be_removed check so that each lost PTM is really taken out.
                        int index = new_combo.FindIndex(existing => object.Equals(existing.modification, ptm.modification));
                        if (index >= 0)
                        {
                            new_combo.RemoveAt(index);
                        }
                    }
                    with_mod_change = new PtmSet(new_combo);
                }

                lock (r) lock (e)
                        assign_pf_identity(e, this, with_mod_change, r, sign, best_loss != null ? best_loss : best_addition);
                identified.Add(e);
            }
            return(identified);
        }
Ejemplo n.º 13
0
        /// <summary>
        /// Records on relation <paramref name="r"/> which PTM change it represents (first assignment only) and gives
        /// experimental proteoform <paramref name="e"/> its references, PTM set and gene name derived from this proteoform.
        /// </summary>
        /// <param name="e">Experimental proteoform being identified.</param>
        /// <param name="theoretical_reference">Not used by this implementation.</param>
        /// <param name="set">PTM set given to <paramref name="e"/> if it has no linked references yet.</param>
        /// <param name="r">Relation that connects this proteoform to <paramref name="e"/>.</param>
        /// <param name="sign">Factor applied to the DeltaMass of an experimental-experimental relation the first time it is represented.</param>
        /// <param name="change">PTM set stored as the relation's represented change; may be null.</param>
        private void assign_pf_identity(ExperimentalProteoform e, Proteoform theoretical_reference, PtmSet set, ProteoformRelation r, int sign, PtmSet change)
        {
            if (r.represented_ptmset == null)
            {
                r.represented_ptmset = change;
                if (r.RelationType == ProteoformComparison.ExperimentalExperimental)
                {
                    r.DeltaMass *= sign;
                }
            }

            // Only the first identification sets the references and the PTM set.
            if (e.linked_proteoform_references == null)
            {
                e.linked_proteoform_references = new List <Proteoform>(this.linked_proteoform_references);
                e.linked_proteoform_references.Add(this);
                e.ptm_set = set;
            }

            if (e.gene_name == null)
            {
                e.gene_name = this.gene_name;
            }
            else if (this.gene_name != null && !ReferenceEquals(e.gene_name, this.gene_name))
            {
                // Enumerable.Concat returns a new sequence and leaves both inputs untouched, so the merged names
                // have to be stored explicitly. A new GeneName is created instead of changing e.gene_name in place,
                // because that object may be shared with the proteoform it was first taken from.
                e.gene_name = new GeneName(e.gene_name.gene_names.Concat(this.gene_name.gene_names).Distinct().ToList());
            }
        }
Ejemplo n.º 14
0
        /// <summary>
        /// Scores each PTM set in <paramref name="possible_peak_assignments"/> as a possible addition to this proteoform
        /// and returns copies of those whose rank sum does not exceed the configured threshold.
        /// </summary>
        /// <param name="possible_peak_assignments">Candidate PTM sets to score.</param>
        /// <param name="deltaM">Not used by this implementation.</param>
        /// <param name="mass_tolerance">Not used by this implementation.</param>
        /// <param name="all_mods_with_mass">Not used by this implementation.</param>
        /// <param name="theoretical_base">Theoretical the identification is based on; supplies the known modifications and the begin position. May be null, in which case no modification counts as known and methionine retention is not considered.</param>
        /// <param name="theoretical_base_sequence">Sequence of the theoretical base; used to decide whether a "Missing" modification can be a terminal degradation.</param>
        /// <param name="additional_ptm_penalty">Penalty applied per PTM beyond the first in a candidate set.</param>
        /// <returns>New PtmSet instances carrying the adjusted <c>ptm_rank_sum</c>, in input order.</returns>
        public List <PtmSet> generate_possible_added_ptmsets(List <PtmSet> possible_peak_assignments, double deltaM, double mass_tolerance, List <ModificationWithMass> all_mods_with_mass,
                                                             TheoreticalProteoform theoretical_base, string theoretical_base_sequence, int additional_ptm_penalty)
        {
            // An experimental proteoform without a theoretical reference has no known modifications.
            List <ModificationWithMass> known_mods = theoretical_base == null ?
                                                     new List <ModificationWithMass>() :
                                                     theoretical_base.ExpandedProteinList.SelectMany(p => p.OneBasedPossibleLocalizedModifications.ToList()).SelectMany(kv => kv.Value).OfType <ModificationWithMass>().ToList();
            List <PtmSet> possible_ptmsets = new List <PtmSet>();

            int n_terminal_degraded_aas = degraded_aas_count(theoretical_base_sequence, ptm_set, true);
            int c_terminal_degraded_aas = degraded_aas_count(theoretical_base_sequence, ptm_set, false);

            foreach (PtmSet set in possible_peak_assignments)
            {
                List <ModificationWithMass> mods_in_set = set.ptm_combination.Select(ptm => ptm.modification).ToList();

                int rank_sum = additional_ptm_penalty * (set.ptm_combination.Sum(m => SaveState.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(m.modification, out UnlocalizedModification x) ? x.ptm_count : 1) - 1); // penalize additional PTMs

                foreach (ModificationWithMass m in mods_in_set)
                {
                    int mod_rank = SaveState.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(m, out UnlocalizedModification u) ? u.ptm_rank : SaveState.lollipop.modification_ranks[m.monoisotopicMass];

                    if (m.monoisotopicMass == 0)
                    {
                        rank_sum += mod_rank;
                        continue;
                    }

                    bool could_be_m_retention     = m.modificationType == "AminoAcid" && m.motif.Motif == "M" && theoretical_base != null && theoretical_base.begin == 2 && !ptm_set.ptm_combination.Select(p => p.modification).Contains(m);
                    bool motif_matches_n_terminus = n_terminal_degraded_aas < theoretical_base_sequence.Length && m.motif.Motif == theoretical_base_sequence[n_terminal_degraded_aas].ToString();
                    bool motif_matches_c_terminus = c_terminal_degraded_aas < theoretical_base_sequence.Length && m.motif.Motif == theoretical_base_sequence[theoretical_base_sequence.Length - c_terminal_degraded_aas - 1].ToString();
                    bool cannot_be_degradation    = !motif_matches_n_terminus && !motif_matches_c_terminus;

                    // u is null when the modification has no unlocalized entry. The null check is kept in its own
                    // variable because the conditional operator binds more loosely than || and &&: written inline
                    // without parentheses, it swallowed the two exclusions before it and dereferenced a null u.
                    bool requires_proteoform_without_mod = u != null && u.require_proteoform_without_mod;

                    if (m.modificationType == "Missing" && cannot_be_degradation ||
                        m.modificationType == "AminoAcid" && !could_be_m_retention ||
                        requires_proteoform_without_mod && set.ptm_combination.Count > 1)
                    {
                        // Excluded: push the sum above any threshold and stop scoring this set.
                        rank_sum = Int32.MaxValue;
                        break;
                    }

                    bool could_be_n_term_degradation = m.modificationType == "Missing" && motif_matches_n_terminus;
                    bool could_be_c_term_degradation = m.modificationType == "Missing" && motif_matches_c_terminus;
                    bool likely_cleavage_site        = could_be_n_term_degradation && SaveState.lollipop.likely_cleavages.Contains(theoretical_base_sequence[n_terminal_degraded_aas].ToString()) ||
                                                       could_be_c_term_degradation && SaveState.lollipop.likely_cleavages.Contains(theoretical_base_sequence[theoretical_base_sequence.Length - c_terminal_degraded_aas - 1].ToString());

                    rank_sum -= Convert.ToInt32(SaveState.lollipop.theoretical_database.variableModifications.Contains(m)); // favor variable modifications over regular modifications of the same mass

                    // In order of likelihood:
                    // 1. First, we observe I/L/A cleavage to be the most common,
                    // 1. "Fatty Acid" is a list of modifications prevalent in yeast or bacterial analysis,
                    // 1. and unlocalized modifications are a subset of modifications in the intact_mods.txt list that should be included in intact analysis (handled in unlocalized modification)
                    // 2. Second, other degradations and methionine cleavage are weighted mid-level
                    // 3. Missed monoisotopic errors are considered, but weighted towards the bottom. This should allow missed monoisotopics with common modifications like oxidation, but not rare ones.  (handled in unlocalized modification)
                    if (likely_cleavage_site)
                    {
                        rank_sum += SaveState.lollipop.mod_rank_first_quartile / 2;
                    }
                    else if (could_be_m_retention || could_be_n_term_degradation || could_be_c_term_degradation)
                    {
                        rank_sum += SaveState.lollipop.mod_rank_second_quartile;
                    }
                    else
                    {
                        rank_sum += known_mods.Concat(SaveState.lollipop.theoretical_database.variableModifications).Contains(m) ?
                                    mod_rank :
                                    mod_rank + SaveState.lollipop.mod_rank_first_quartile / 2; // Penalize modifications that aren't known for this protein and push really rare ones out of the running if they're not in the protein entry
                    }
                }

                if (rank_sum <= SaveState.lollipop.mod_rank_sum_threshold)
                {
                    // Return a copy so the rank sum of the shared candidate set is left untouched.
                    PtmSet adjusted_ranksum = new PtmSet(set.ptm_combination);
                    adjusted_ranksum.ptm_rank_sum = rank_sum;
                    possible_ptmsets.Add(adjusted_ranksum);
                }
            }
            return(possible_ptmsets);
        }
Ejemplo n.º 15
0
        // Builds the relation between pf1 and pf2 and, depending on the relation type and the notch settings,
        // chooses the PtmSet (candidate_ptmset) that best explains delta_mass. Finally decides whether the
        // decimal part of delta_mass lies outside the "no man's land" between the CH2 and HPO3 mass defects.
        //   pf1, pf2           stored in connected_proteoforms[0] and [1]; pf1 supplies lysine_count (NeuCode data)
        //                      and the modified_mass that ppm notch tolerances are scaled by
        //   relation_type      selects the ET or EE settings; ET/ED relations are the only ones that consult
        //                      the non-notch PtmSet dictionary
        //   delta_mass         mass difference to explain
        //   current_directory  not used by this constructor
        public ProteoformRelation(Proteoform pf1, Proteoform pf2, ProteoformComparison relation_type, double delta_mass, string current_directory)
        {
            connected_proteoforms[0] = pf1;
            connected_proteoforms[1] = pf2;
            RelationType             = relation_type;
            DeltaMass                = delta_mass;

            // Read and bump the shared counter under one lock so relations built in parallel never share an InstanceId.
            lock (Sweet.lollipop)
            {
                InstanceId       = instanceCounter;
                instanceCounter += 1;
            }

            if (CH2 == null || HPO3 == null)
            {
                CH2  = ChemicalFormula.ParseFormula("C1 H2");
                HPO3 = ChemicalFormula.ParseFormula("H1 O3 P1");
            }

            if (Sweet.lollipop.neucode_labeled)
            {
                lysine_count = pf1.lysine_count;
            }

            bool et_or_ed = relation_type == ProteoformComparison.ExperimentalTheoretical || relation_type == ProteoformComparison.ExperimentalDecoy;
            bool ee_or_ef = relation_type == ProteoformComparison.ExperimentalExperimental || relation_type == ProteoformComparison.ExperimentalFalse;

            // Keys (rounded to 0.1 Da, ascending) of every dictionary bin from the one holding
            // delta_mass - tolerance up to the one holding delta_mass + tolerance.
            // Bins are counted in whole tenths instead of repeatedly adding 0.1: the accumulated
            // floating-point drift of that approach can step just past the upper bound and skip the last bin.
            List <double> mass_bin_keys(double tolerance)
            {
                List <double> keys = new List <double>();
                double        low  = delta_mass - tolerance;
                double        high = delta_mass + tolerance;
                if (double.IsNaN(low) || double.IsNaN(high) || double.IsInfinity(low) || double.IsInfinity(high) || low > high)
                {
                    return(keys);
                }

                long first_bin = (long)Math.Round(low * 10);
                long last_bin  = (long)Math.Round(high * 10);
                for (long bin = first_bin; bin <= last_bin; bin++)
                {
                    keys.Add(Math.Round(bin / 10.0, 1));
                }
                return(keys);
            }

            // Notch mode: take the lowest-ranked notch PtmSet whose mass is within tolerance of delta_mass.
            // An absolute tolerance searches every bin in range; a ppm tolerance only looks in the bin of delta_mass.
            // candidate_ptmset is left untouched when that single-bin lookup finds no entry.
            void assign_notch_candidate(bool tolerance_is_ppm, double tolerance)
            {
                List <PtmSet> notch_sets;
                if (!tolerance_is_ppm)
                {
                    notch_sets = new List <PtmSet>();
                    foreach (double key in mass_bin_keys(tolerance))
                    {
                        Sweet.lollipop.theoretical_database.possible_ptmset_dictionary_notches.TryGetValue(key, out List <PtmSet> binned);
                        if (binned != null)
                        {
                            notch_sets.AddRange(binned);
                        }
                    }
                    notch_sets = notch_sets.Distinct().ToList();
                }
                else
                {
                    Sweet.lollipop.theoretical_database.possible_ptmset_dictionary_notches.TryGetValue(Math.Round(delta_mass, 1), out notch_sets);
                }

                if (notch_sets == null)
                {
                    return;
                }

                candidate_ptmset = notch_sets
                                   .Where(s => tolerance_is_ppm
                                       ? Math.Abs(s.mass - delta_mass) * 1e6 / pf1.modified_mass < tolerance
                                       : Math.Abs(s.mass - delta_mass) < tolerance)
                                   .OrderBy(s => s.ptm_rank_sum)
                                   .FirstOrDefault();
            }

            if (Sweet.lollipop.et_use_notch && et_or_ed)
            {
                assign_notch_candidate(Sweet.lollipop.et_notch_ppm, Sweet.lollipop.notch_tolerance_et);
            }
            else if (Sweet.lollipop.ee_use_notch && ee_or_ef)
            {
                assign_notch_candidate(Sweet.lollipop.ee_notch_ppm, Sweet.lollipop.notch_tolerance_ee);
            }
            else if (et_or_ed)
            {
                // Peak-width mode for ET/ED relations (et_use_notch is necessarily false on this path).
                List <PtmSet> candidate_sets = new List <PtmSet>();
                if (Sweet.lollipop.peak_width_base_et > 0.09)
                {
                    // The window is wider than one 0.1 Da bin, so gather every bin it touches.
                    foreach (double key in mass_bin_keys(Sweet.lollipop.peak_width_base_et))
                    {
                        Sweet.lollipop.theoretical_database.possible_ptmset_dictionary.TryGetValue(key, out List <PtmSet> binned);
                        if (binned != null)
                        {
                            candidate_sets.AddRange(binned);
                        }
                    }
                    candidate_sets = candidate_sets.Distinct().ToList();
                }
                else
                {
                    Sweet.lollipop.theoretical_database.possible_ptmset_dictionary.TryGetValue(
                        Math.Round(delta_mass, 1), out candidate_sets);
                }

                TheoreticalProteoform t = pf2 as TheoreticalProteoform;
                if (t != null && candidate_sets != null && candidate_sets.Count > 0)
                {
                    List <PtmSet> narrower_range_of_candidates = candidate_sets
                                                                 .Where(s => Math.Abs(s.mass - delta_mass) < Sweet.lollipop.peak_width_base_et).ToList();

                    // major score: rank sum; tie breaker: mass difference scaled so that it stays well below 1
                    candidate_ptmset = Proteoform.generate_possible_added_ptmsets(narrower_range_of_candidates,
                                                                                  Sweet.lollipop.theoretical_database.all_mods_with_mass, t, pf2.begin, pf2.end,
                                                                                  pf2.ptm_set,
                                                                                  Sweet.lollipop.mod_rank_first_quartile, false)
                                       .OrderBy(x => x.ptm_rank_sum + Math.Abs(Math.Abs(x.mass) - Math.Abs(delta_mass)) * 10E-6)
                                       .FirstOrDefault();
                }
            }

            // Start the model (0 Da) at the mass defect of CH2 or HPO3 itself, allowing the peak width tolerance on either side
            double half_peak_width = RelationType == ProteoformComparison.ExperimentalTheoretical || RelationType == ProteoformComparison.ExperimentalDecoy ?
                                     Sweet.lollipop.peak_width_base_et / 2 :
                                     Sweet.lollipop.peak_width_base_ee / 2;
            double low_decimal_bound  = half_peak_width + ((CH2.MonoisotopicMass - Math.Truncate(CH2.MonoisotopicMass)) / CH2.MonoisotopicMass) * (Math.Abs(delta_mass) <= CH2.MonoisotopicMass ? CH2.MonoisotopicMass : Math.Abs(delta_mass));
            double high_decimal_bound = 1 - half_peak_width + ((HPO3.MonoisotopicMass - Math.Ceiling(HPO3.MonoisotopicMass)) / HPO3.MonoisotopicMass) * (Math.Abs(delta_mass) <= HPO3.MonoisotopicMass ? HPO3.MonoisotopicMass : Math.Abs(delta_mass));
            double delta_mass_decimal = Math.Abs(delta_mass - Math.Truncate(delta_mass));

            outside_no_mans_land = delta_mass_decimal <= low_decimal_bound || delta_mass_decimal >= high_decimal_bound ||
                                   high_decimal_bound <= low_decimal_bound;

            // Notch searches bypass the no-man's-land filter entirely.
            if ((Sweet.lollipop.et_use_notch && et_or_ed) || (Sweet.lollipop.ee_use_notch && ee_or_ef))
            {
                outside_no_mans_land = true;
            }
        }
Ejemplo n.º 16
0
        // Tries to record, on experimental proteoform e, the identification "theoretical_base carrying set
        // over residues begin..end", reached through relation r from this proteoform.
        //   - If e has no identification yet (linked_proteoform_references == null) it becomes e's identification.
        //   - Otherwise the new identification may replace the existing one (when the enabled
        //     topdown_theoretical_reduce_ambiguity / annotated_PTMs_reduce_ambiguity settings favor it),
        //     be ignored (when those settings favor the existing one), or be stored in
        //     e.ambiguous_identifications if it differs and is not already listed.
        //   - When check_ambiguous_IDs is true, e's ambiguous identifications are pruned afterwards and the
        //     ambiguous identifications of this proteoform (if it is experimental) are propagated to e.
        // linked_proteoform_references is the chain of proteoforms leading to this one; it may be null.
        // Returns true when a primary or an ambiguous identification was assigned to e.
        private bool assign_pf_identity(ExperimentalProteoform e, PtmSet set, int begin, int end, ProteoformRelation r, TheoreticalProteoform theoretical_base, List <Proteoform> linked_proteoform_references, bool check_ambiguous_IDs)
        {
            bool identification_assigned = false;

            // Optional gate: when a ppm tolerance is in use, the mass error of this assignment must be within it.
            if (!Sweet.lollipop.id_use_ppm_tolerance || Math.Abs(e.calculate_mass_error(theoretical_base, set, begin, end) * 1e6 / e.modified_mass) < Sweet.lollipop.id_ppm_tolerance)
            {
                int new_begin = begin;
                int new_end   = end;

                // Work on a copy of the combination; "AminoAcid" and "Missing" entries are turned into
                // begin/end adjustments below and dropped from the set.
                PtmSet     new_set = new PtmSet(new List <Ptm>(set.ptm_combination));
                List <Ptm> remove  = new List <Ptm>();
                //do retention of M first
                foreach (var mod in new_set.ptm_combination.Where(m => m.modification.ModificationType == "AminoAcid"))
                {
                    new_begin--;
                    remove.Add(mod);
                }

                foreach (var mod in new_set.ptm_combination.Where(m => m.modification.ModificationType == "Missing"))
                {
                    // An N-terminal loss is only considered when the set has no "AminoAcid" entry.
                    if (!new_set.ptm_combination.Any(m => m.modification.ModificationType == "AminoAcid") && begin >= theoretical_base.begin)
                    {
                        // NOTE(review): only the lower bound of this index is checked above — confirm it cannot exceed the sequence length.
                        if (theoretical_base.sequence[begin - theoretical_base.begin].ToString() ==
                            mod.modification.Target.ToString())
                        {
                            new_begin++;
                            remove.Add(mod); //dont have in ptmset --> change the begin & end
                        }
                    }
                    // Not consumed at the N-terminus: try the residue at the C-terminal position instead.
                    // NOTE(review): no bounds check on this index is visible here — confirm end always lies inside theoretical_base.sequence.
                    if (!remove.Contains(mod) && theoretical_base.sequence[end - theoretical_base.begin].ToString() ==
                        mod.modification.Target.ToString())
                    {
                        new_end--;
                        remove.Add(mod);
                    }
                }

                foreach (var ptm in remove)
                {
                    new_set.ptm_combination.Remove(ptm);
                }

                // Rebuild the set from the reduced combination.
                new_set = new PtmSet(new_set.ptm_combination);

                if (e.linked_proteoform_references == null)
                {
                    // e is not identified yet: this becomes its identification.
                    identification_assigned = true;

                    if (linked_proteoform_references != null)
                    {
                        e.linked_proteoform_references = new List <Proteoform>(linked_proteoform_references);
                        e.linked_proteoform_references.Add(this);
                    }
                    else
                    {
                        e.linked_proteoform_references = new List <Proteoform>()
                        {
                            theoretical_base
                        };
                    }

                    e.relation_to_id = r;
                    e.ptm_set        = new_set;
                    e.begin          = new_begin;
                    e.end            = new_end;


                    if (e.gene_name == null)
                    {
                        e.gene_name = theoretical_base.gene_name;
                    }
                    else
                    {
                        // NOTE(review): if this is LINQ's Concat, it returns a new sequence and the result is discarded,
                        // so this statement leaves e.gene_name unchanged — confirm whether the names were meant to be merged.
                        e.gene_name.gene_names.Concat(this.gene_name.gene_names);
                    }
                }
                else
                {
                    // e already has an identification; skip chains that already pass through e.
                    if (linked_proteoform_references != null && !linked_proteoform_references.Contains(e))
                    {
                        // The new identification differs when the preferred gene name, the covered sequence,
                        // or the (unlocalized) PTM set differs from the current one.
                        bool different_id = e.gene_name.get_prefered_name(Lollipop.preferred_gene_label) !=
                                            theoretical_base.gene_name.get_prefered_name(Lollipop.preferred_gene_label) ||
                                            ExperimentalProteoform.get_sequence(e.linked_proteoform_references.First() as TheoreticalProteoform, e.begin, e.end)
                                            != ExperimentalProteoform.get_sequence(theoretical_base, new_begin, new_end) || !e.ptm_set.same_ptmset(new_set, true);


                        // Modifications with nonzero mass annotated on the new theoretical and on the currently assigned one.
                        List <Modification> this_known_mods        = theoretical_base.ExpandedProteinList.SelectMany(p => p.OneBasedPossibleLocalizedModifications).SelectMany(kv => kv.Value).Where(v => v.MonoisotopicMass != 0).ToList();
                        List <Modification> previous_id_known_mods = (e.linked_proteoform_references.First() as TheoreticalProteoform).ExpandedProteinList.SelectMany(p => p.OneBasedPossibleLocalizedModifications).SelectMany(kv => kv.Value).Where(v => v.MonoisotopicMass != 0).ToList();
                        // Replace the existing identification when every enabled criterion favors the new one:
                        //   top-down criterion:  new theoretical is top-down derived and the current one is not;
                        //   annotation criterion: all new PTMs are adducts or annotated, while the current set has one that is neither.
                        // The innermost test requires at least one of the two criteria to be enabled.
                        if (!Sweet.lollipop.topdown_theoretical_reduce_ambiguity || (theoretical_base.topdown_theoretical && !(e.linked_proteoform_references.First() as TheoreticalProteoform).topdown_theoretical))
                        {
                            if (!Sweet.lollipop.annotated_PTMs_reduce_ambiguity ||
                                (new_set.ptm_combination.All(mod1 => modification_is_adduct(mod1.modification) || this_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification))) &&
                                 !e.ptm_set.ptm_combination.All(mod1 => modification_is_adduct(mod1.modification) || previous_id_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification)))))
                            {
                                if (Sweet.lollipop.topdown_theoretical_reduce_ambiguity || Sweet.lollipop.annotated_PTMs_reduce_ambiguity)
                                {
                                    if (Sweet.lollipop.remove_bad_connections && different_id) //&& e.relation_to_id != r)
                                    {
                                        e.relation_to_id.Identification     = false;
                                        e.relation_to_id.represented_ptmset = null;
                                    }
                                    // Wipe the current identification so the recursive call takes the "not identified yet" path.
                                    e.linked_proteoform_references = null;
                                    e.ptm_set   = new PtmSet(new List <Ptm>());
                                    e.begin     = 0;
                                    e.end       = 0;
                                    e.gene_name = null;
                                    e.ambiguous_identifications.Clear();
                                    ProteoformRelation relation = null;
                                    e.relation_to_id = relation;

                                    //reassign the topdown - based ID
                                    return(this.assign_pf_identity(e, set, begin, end, r, theoretical_base, linked_proteoform_references, true));
                                }
                            }
                        }

                        // Not replaced. The two empty branches are the cases where the existing identification
                        // is favored by a criterion, so the new one is dropped without being recorded.
                        if (Sweet.lollipop.topdown_theoretical_reduce_ambiguity && (e.linked_proteoform_references.First() as TheoreticalProteoform).topdown_theoretical && !theoretical_base.topdown_theoretical)
                        {
                        }
                        else if (Sweet.lollipop.annotated_PTMs_reduce_ambiguity &&
                                 !new_set.ptm_combination.All(mod1 => modification_is_adduct(mod1.modification) || this_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification))) &&
                                 e.ptm_set.ptm_combination.All(mod1 => modification_is_adduct(mod1.modification) || previous_id_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification))))
                        {
                        }
                        else
                        {
                            // Neither identification is favored: keep the new one as an ambiguous alternative if it really differs.
                            if (different_id)
                            {
                                var new_linked_proteoform_references = new List <Proteoform>(linked_proteoform_references);
                                new_linked_proteoform_references.Add(this);

                                AmbiguousIdentification new_id =
                                    new AmbiguousIdentification(new_begin, new_end, new_set, r, theoretical_base, new_linked_proteoform_references);
                                lock (e.ambiguous_identifications)
                                {
                                    // Skip duplicates: same primary gene name, same covered sequence and same unlocalized PTM set.
                                    if (!e.ambiguous_identifications.Any(p =>
                                                                         p.theoretical_base.gene_name.primary ==
                                                                         new_id.theoretical_base.gene_name.primary &&
                                                                         ExperimentalProteoform.get_sequence(p.theoretical_base, p.begin, p.end) == ExperimentalProteoform.get_sequence(new_id.theoretical_base, new_id.begin, new_id.end) &&
                                                                         p.ptm_set.same_ptmset(new_id.ptm_set, true)))
                                    {
                                        e.ambiguous_identifications.Add(new_id);
                                        identification_assigned = true;
                                    }
                                }
                            }
                        }
                    }
                }
            }


            if (check_ambiguous_IDs)
            {
                //remove bad relations if using td to reduce ambiguity
                if (identification_assigned)
                {
                    List <AmbiguousIdentification> to_remove = new List <AmbiguousIdentification>();
                    List <Modification>            previous_id_known_mods = (e.linked_proteoform_references.First() as TheoreticalProteoform).ExpandedProteinList.SelectMany(p => p.OneBasedPossibleLocalizedModifications).SelectMany(kv => kv.Value).Where(m => m.MonoisotopicMass != 0).ToList();
                    // A top-down derived identification displaces ambiguous ones that are not top-down derived.
                    if (theoretical_base.topdown_theoretical && Sweet.lollipop.topdown_theoretical_reduce_ambiguity)
                    {
                        to_remove.AddRange(e.ambiguous_identifications.Where(id => !id.theoretical_base.topdown_theoretical));
                    }
                    // If e's own PTMs are all adducts or annotated, drop ambiguous identifications that need
                    // a non-adduct PTM that is not annotated on their own theoretical.
                    if (Sweet.lollipop.annotated_PTMs_reduce_ambiguity &&
                        e.ptm_set.ptm_combination.All(mod1 => modification_is_adduct(mod1.modification) || previous_id_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification))))
                    {
                        foreach (var ambiguous_id in e.ambiguous_identifications)
                        {
                            List <Modification> ambiguous_id_known_mods = ambiguous_id.theoretical_base.ExpandedProteinList.SelectMany(p => p.OneBasedPossibleLocalizedModifications).SelectMany(kv => kv.Value).Where(m => m.MonoisotopicMass != 0).ToList();
                            if (ambiguous_id.ptm_set.ptm_combination.Any(mod1 => !modification_is_adduct(mod1.modification) && !ambiguous_id_known_mods.Select(mod2 => UnlocalizedModification.LookUpId(mod2)).Contains(UnlocalizedModification.LookUpId(mod1.modification))))
                            {
                                to_remove.Add(ambiguous_id);
                            }
                        }
                    }
                    foreach (var x in to_remove)
                    {
                        // An entry can be listed twice in to_remove; Contains makes the second pass a no-op.
                        if (e.ambiguous_identifications.Contains(x))
                        {
                            e.ambiguous_identifications.Remove(x);
                            if (Sweet.lollipop.remove_bad_connections)
                            {
                                // Never un-identify the relation that supports e's primary identification.
                                if (e.relation_to_id != x.relation)
                                {
                                    x.relation.Identification     = false;
                                    x.relation.represented_ptmset = null;
                                }
                            }
                        }
                    }
                    // Relations behind the surviving ambiguous identifications are marked as identifications.
                    foreach (var x in e.ambiguous_identifications)
                    {
                        x.relation.Identification = true;
                    }
                }


                // If this proteoform is itself experimental and ambiguous, offer each of its alternative
                // identifications to e as well (with check_ambiguous_IDs false, so this step is not repeated in the nested call).
                if (this as ExperimentalProteoform != null && (this as ExperimentalProteoform).ambiguous_identifications.Count > 0)
                {
                    lock ((this as ExperimentalProteoform).ambiguous_identifications)
                    {
                        int      count       = (this as ExperimentalProteoform).ambiguous_identifications.Count;
                        PtmSet[] new_ptm_set = new PtmSet[count];
                        // The adjusted PTM sets are computed in parallel, one array slot per identification ...
                        Parallel.For(0, count, i =>
                        {
                            var id         = (this as ExperimentalProteoform).ambiguous_identifications[i];
                            new_ptm_set[i] = determine_mod_change(e, this, id.theoretical_base, r, id.ptm_set, id.begin, id.end);
                        });
                        // ... and then assigned sequentially; a null slot is skipped.
                        for (int i = 0; i < count; i++)
                        {
                            if (new_ptm_set[i] != null)
                            {
                                var id = (this as ExperimentalProteoform).ambiguous_identifications[i];
                                if (assign_pf_identity(e, new_ptm_set[i], id.begin, id.end, r, id.theoretical_base, id.linked_proteoform_references, false))
                                {
                                    identification_assigned = true;
                                }
                            }
                        }
                    }
                }
            }
            return(identification_assigned);
        }
Ejemplo n.º 17
0
        // Scores each PtmSet in possible_peak_assignments as an addition to theoretical_base (spanning begin..end
        // and already carrying ptm_set) and returns copies of the sets whose rank sum does not exceed
        // Sweet.lollipop.mod_rank_sum_threshold, each copy storing its score in ptm_rank_sum.
        //   additional_ptm_penalty  cost charged for every PTM beyond the first in a set
        //   final_assignment        when true and only_assign_common_or_known_mods is set, modifications that are
        //                           neither common, terminal events, deconvolution errors nor annotated are rejected
        //   all_mods_with_mass      not used by this method
        // A set is rejected by giving it the score Int32.MaxValue.
        public static List <PtmSet> generate_possible_added_ptmsets(List <PtmSet> possible_peak_assignments, List <Modification> all_mods_with_mass,
                                                                    TheoreticalProteoform theoretical_base, int begin, int end, PtmSet ptm_set, int additional_ptm_penalty, bool final_assignment)
        {
            List <Modification> annotated_mods = theoretical_base.ExpandedProteinList.SelectMany(p => p.OneBasedPossibleLocalizedModifications).SelectMany(kv => kv.Value).ToList();
            List <PtmSet>       accepted_sets  = new List <PtmSet>();

            // Positions of the proteoform's first and last residue inside theoretical_base.sequence.
            int first_offset = begin - theoretical_base.begin;
            int last_offset  = end - theoretical_base.begin;

            // True when the offset lies inside the sequence and the residue there equals the modification's target.
            bool target_is_residue_at(Modification mod, int offset) =>
                offset >= 0 && offset < theoretical_base.sequence.Length && mod.Target.ToString() == theoretical_base.sequence[offset].ToString();

            foreach (PtmSet candidate in possible_peak_assignments)
            {
                List <Modification> candidate_mods = candidate.ptm_combination.Select(ptm => ptm.modification).ToList();

                // An unlocalized modification counts as ptm_count PTMs, anything else as one.
                int ptm_total = candidate.ptm_combination.Sum(ptm => Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(ptm.modification, out UnlocalizedModification counted) ? counted.ptm_count : 1);
                int score     = additional_ptm_penalty * (ptm_total - 1);

                foreach (Modification m in candidate_mods)
                {
                    // Rank: unlocalized entry first, then the rank table keyed by rounded mass, else the threshold itself.
                    UnlocalizedModification u;
                    int mod_rank;
                    if (Sweet.lollipop.theoretical_database.unlocalized_lookup.TryGetValue(m, out u))
                    {
                        mod_rank = u.ptm_rank;
                    }
                    else if (!Sweet.lollipop.modification_ranks.TryGetValue(Math.Round((double)m.MonoisotopicMass, 5), out mod_rank))
                    {
                        mod_rank = Sweet.lollipop.mod_rank_sum_threshold;
                    }

                    var  mod_type               = m.ModificationType;
                    bool is_amino_acid          = mod_type == "AminoAcid";
                    bool is_missing             = mod_type == "Missing";
                    bool is_deconvolution_error = mod_type == "Deconvolution Error";

                    bool could_be_m_retention = is_amino_acid && m.Target.ToString() == "M" && theoretical_base.begin == 2 && begin == 2 && !ptm_set.ptm_combination.Any(p => p.modification.Equals(m));

                    // The N-terminal match is void when the same set also contains a methionine "AminoAcid" entry.
                    bool motif_matches_n_terminus = target_is_residue_at(m, first_offset) && !candidate_mods.Any(mod => mod.ModificationType == "AminoAcid" && mod.Target.ToString() == "M");
                    bool motif_matches_c_terminus = target_is_residue_at(m, last_offset);

                    bool could_be_n_term_degradation = is_missing && motif_matches_n_terminus;
                    bool could_be_c_term_degradation = is_missing && motif_matches_c_terminus;
                    bool is_terminal_event           = could_be_m_retention || could_be_n_term_degradation || could_be_c_term_degradation;

                    // Impossible assignments: a loss matching neither terminus, an amino-acid gain that is not
                    // methionine retention, or a modification that must stand alone sharing the set with others.
                    if ((is_missing && !could_be_n_term_degradation && !could_be_c_term_degradation) ||
                        (is_amino_acid && !could_be_m_retention) ||
                        (u != null && u.require_proteoform_without_mod && candidate.ptm_combination.Count > 1))
                    {
                        score = Int32.MaxValue;
                        break;
                    }

                    //if selected, going to only allow mods in Mods folder (type "Common"), Missing, Missed Monoisotopic, known mods for that protein, or Unmodified
                    if (Sweet.lollipop.only_assign_common_or_known_mods && final_assignment)
                    {
                        bool assignable = m.MonoisotopicMass == 0 ||
                                          mod_type == "Common" ||
                                          is_terminal_event ||
                                          is_deconvolution_error ||
                                          annotated_mods.Concat(Sweet.lollipop.theoretical_database.variableModifications).Contains(m) ||
                                          annotated_mods.Select(mod => UnlocalizedModification.LookUpId(mod)).Contains(UnlocalizedModification.LookUpId(m));
                        if (!assignable)
                        {
                            score = Int32.MaxValue;
                            break;
                        }
                    }

                    // Zero-mass entries only contribute their rank.
                    if (m.MonoisotopicMass == 0)
                    {
                        score += mod_rank;
                        continue;
                    }

                    // favor variable modifications over regular modifications of the same mass
                    if (Sweet.lollipop.theoretical_database.variableModifications.Contains(m))
                    {
                        score -= 1;
                    }

                    if (is_terminal_event)
                    {
                        // Degradations and methionine retention share one fixed cost.
                        score += Sweet.lollipop.mod_rank_first_quartile / 2;
                    }
                    else if (is_deconvolution_error)
                    {
                        if (Sweet.lollipop.neucode_labeled)
                        {
                            score += Sweet.lollipop.mod_rank_third_quartile; //in neucode-labeled data, fewer missed monoisotopics - don't prioritize
                        }
                        else
                        {
                            score += 1; //in label-free, more missed monoisotopics, should prioritize (same priority as a variable modification)
                        }
                        score -= additional_ptm_penalty;
                    }
                    else
                    {
                        bool annotated_or_variable = annotated_mods.Concat(Sweet.lollipop.theoretical_database.variableModifications).Select(mod => UnlocalizedModification.LookUpId(mod)).Contains(UnlocalizedModification.LookUpId(m));

                        // Modifications unknown for this protein pay their rank plus a penalty, which pushes rare ones out of the running.
                        score += annotated_or_variable ?
                                 1 :
                                 mod_rank + Sweet.lollipop.mod_rank_first_quartile / 2;
                    }
                }

                if (score <= Sweet.lollipop.mod_rank_sum_threshold)
                {
                    accepted_sets.Add(new PtmSet(candidate.ptm_combination)
                    {
                        ptm_rank_sum = score
                    });
                }
            }

            return(accepted_sets);
        }
Ejemplo n.º 18
0
        /// <summary>
        /// Propagates the identification carried by this proteoform onto the experimental proteoform
        /// <paramref name="e"/>. When <paramref name="e"/> has no linked references yet, it receives this
        /// proteoform's reference chain, begin/end and PTM set directly. Otherwise the candidate
        /// identification is compared with the one <paramref name="e"/> already has and, if it differs,
        /// is recorded in <c>e.ambiguous_identifications</c> and <c>e.ambiguous</c> is set.
        /// </summary>
        /// <remarks>
        /// In every path, PTMs whose <c>ModificationType</c> is "AminoAcid" or "Missing" are translated into
        /// a shift of begin/end and removed from the PTM combination before a fresh <c>PtmSet</c> is built.
        /// The removal is performed on the list exposed by the PtmSet being processed, so in the first two
        /// paths <c>set.ptm_combination</c> itself is modified.
        /// NOTE(review): confirm callers do not rely on <paramref name="set"/> being left untouched.
        /// After identification, <c>e.uniprot_mods</c> is rebuilt and <c>e.novel_mods</c> may be set to true
        /// (it is never reset to false here).
        /// </remarks>
        /// <param name="e">Experimental proteoform receiving the identification; mutated in place.</param>
        /// <param name="set">PTM set proposed for <paramref name="e"/>.</param>
        /// <param name="r">Relation between the proteoforms; only forwarded to <c>determine_mod_change</c>.</param>
        /// <param name="theoretical_base">Theoretical proteoform whose sequence and expanded protein list are
        /// used for terminal-residue checks and for looking up possible localized modifications.</param>
        private void assign_pf_identity(ExperimentalProteoform e, PtmSet set, ProteoformRelation r, TheoreticalProteoform theoretical_base)
        {
            if (e.linked_proteoform_references == null)
            {
                // First identification for e: inherit this proteoform's reference chain and append this proteoform.
                e.linked_proteoform_references = new List <Proteoform>(this.linked_proteoform_references);
                e.linked_proteoform_references.Add(this);
                e.ptm_set = set;
                e.begin   = this.begin;
                e.end     = this.end;
                List <Ptm> remove = new List <Ptm>();

                //do retention of M first
                foreach (var mod in set.ptm_combination.Where(m => m.modification.ModificationType == "AminoAcid"))
                {
                    e.begin--;
                    remove.Add(mod);
                }
                foreach (var mod in set.ptm_combination.Where(m => m.modification.ModificationType == "Missing"))
                {
                    // Residue of the theoretical sequence at this proteoform's begin position.
                    if (theoretical_base.sequence[this.begin - theoretical_base.begin].ToString() == mod.modification.Target.ToString())
                    {
                        e.begin++;
                        remove.Add(mod); //dont have in ptmset --> change the begin & end
                    }
                    // NOTE(review): this index is this.end - this.begin, whereas the check above is relative to
                    // theoretical_base.begin; confirm this is the intended C-terminal residue.
                    else if (theoretical_base.sequence[this.end - this.begin].ToString() == mod.modification.Target.ToString())
                    {
                        e.end--;
                        remove.Add(mod);
                    }
                }
                // e.ptm_set is the same object as set here, so this removes from set.ptm_combination.
                foreach (var ptm in remove)
                {
                    e.ptm_set.ptm_combination.Remove(ptm);
                }
                e.ptm_set = new PtmSet(e.ptm_set.ptm_combination);

                if (e.gene_name == null)
                {
                    e.gene_name = this.gene_name;
                }
                else if (!e.topdown_id)
                {
                    // Enumerable.Concat returns a new sequence and leaves its source untouched, so the merged
                    // names have to be stored back; the previous statement discarded the result (a no-op).
                    // NOTE(review): assumes gene_names is settable and assignable from a List - confirm against GeneName.
                    e.gene_name.gene_names = e.gene_name.gene_names.Concat(this.gene_name.gene_names).ToList();
                }
            }
            else
            {
                //check if assign
                // e is already identified: compute the candidate begin/end/PTM set without touching e,
                // then compare it with e's current identification.
                int        begin   = this.begin;
                int        end     = this.end;
                PtmSet     ptm_set = set;
                List <Ptm> remove  = new List <Ptm>();
                //do retention of M first
                foreach (var mod in set.ptm_combination.Where(m => m.modification.ModificationType == "AminoAcid"))
                {
                    begin--;
                    remove.Add(mod);
                }

                foreach (var mod in set.ptm_combination.Where(m => m.modification.ModificationType == "Missing"))
                {
                    if (theoretical_base.sequence[this.begin - theoretical_base.begin].ToString() ==
                        mod.modification.Target.ToString())
                    {
                        begin++;
                        remove.Add(mod); //dont have in ptmset --> change the begin & end
                    }
                    // NOTE(review): same index question as in the first branch (this.end - this.begin).
                    else if (theoretical_base.sequence[this.end - this.begin].ToString() ==
                             mod.modification.Target.ToString())
                    {
                        end--;
                        remove.Add(mod);
                    }
                }

                // ptm_set aliases set, so this also removes from set.ptm_combination.
                foreach (var ptm in remove)
                {
                    ptm_set.ptm_combination.Remove(ptm);
                }

                ptm_set = new PtmSet(ptm_set.ptm_combination);

                // The candidate differs from e's identification if the preferred gene name, begin, end
                // or the PTM set (compared with the 'unlocalized' flag set) differs.
                if (e.gene_name.get_prefered_name(Lollipop.preferred_gene_label) !=
                    this.gene_name.get_prefered_name(Lollipop.preferred_gene_label) ||
                    e.begin != begin || e.end != end || !e.ptm_set.same_ptmset(ptm_set, true))
                {
                    e.ambiguous = true;
                    // Use the first entry of this proteoform's reference chain, or this proteoform itself
                    // when the chain is null or empty.
                    Proteoform linked_proteoform_reference =
                        this.linked_proteoform_references == null || this.linked_proteoform_references.Count == 0
                            ? this
                            : this.linked_proteoform_references.First();
                    Tuple <Proteoform, int, int, PtmSet> new_id =
                        new Tuple <Proteoform, int, int, PtmSet>(linked_proteoform_reference, begin, end, ptm_set);
                    lock (e.ambiguous_identifications)
                    {
                        // Only add the candidate if an equivalent identification is not already recorded.
                        if (!e.ambiguous_identifications.Any(p =>
                                                             p.Item1.gene_name.get_prefered_name(Lollipop.preferred_gene_label) ==
                                                             new_id.Item1.gene_name.get_prefered_name(Lollipop.preferred_gene_label) &&
                                                             p.Item2 == new_id.Item2 && p.Item3 == new_id.Item3 &&
                                                             p.Item4.same_ptmset(new_id.Item4, true)))
                        {
                            e.ambiguous_identifications.Add(new_id);
                        }
                    }
                }
            }

            // If this proteoform is itself an ambiguous experimental proteoform, carry each of its
            // alternative identifications over to e as well.
            if (this as ExperimentalProteoform != null && (this as ExperimentalProteoform).ambiguous)
            {
                foreach (var id in this.ambiguous_identifications)
                {
                    // Null when id.Item1 is not a TheoreticalProteoform.
                    TheoreticalProteoform id_theoretical_base = id.Item1 as TheoreticalProteoform;
                    int begin  = id.Item2;
                    int end    = id.Item3;
                    var remove = new List <Ptm>();

                    var ptm_set = determine_mod_change(e, this, id_theoretical_base, r, id.Item4);
                    if (ptm_set == null)
                    {
                        // No PTM set was returned for this alternative; skip it.
                        continue;
                    }
                    //do retention of M first
                    foreach (var mod in ptm_set.ptm_combination.Where(m => m.modification.ModificationType == "AminoAcid"))
                    {
                        begin--;
                        remove.Add(mod);
                    }
                    foreach (var mod in ptm_set.ptm_combination.Where(m => m.modification.ModificationType == "Missing"))
                    {
                        if (id_theoretical_base.sequence[id.Item2 - id.Item1.begin].ToString() == mod.modification.Target.ToString())
                        {
                            begin++;
                            remove.Add(mod); //dont have in ptmset --> change the begin & end
                        }
                        // NOTE(review): index is id.Item3 - id.Item2 rather than relative to id.Item1.begin; confirm.
                        else if (id_theoretical_base.sequence[id.Item3 - id.Item2].ToString() == mod.modification.Target.ToString())
                        {
                            end--;
                            remove.Add(mod);
                        }
                    }
                    foreach (var ptm in remove)
                    {
                        ptm_set.ptm_combination.Remove(ptm);
                    }
                    ptm_set = new PtmSet(ptm_set.ptm_combination);
                    lock (e.ambiguous_identifications)
                    {
                        var new_id = new Tuple <Proteoform, int, int, PtmSet>(id.Item1, begin, end, ptm_set);
                        // Add only when the alternative differs from e's own identification
                        // and is not already among e's ambiguous identifications.
                        if ((e.gene_name.get_prefered_name(Lollipop.preferred_gene_label) !=
                             new_id.Item1.gene_name.get_prefered_name(Lollipop.preferred_gene_label) ||
                             e.begin != new_id.Item2 || e.end != new_id.Item3 || !e.ptm_set.same_ptmset(new_id.Item4, true)) &&
                            !e.ambiguous_identifications.Any(p =>
                                                             p.Item1.gene_name.get_prefered_name(Lollipop.preferred_gene_label) ==
                                                             new_id.Item1.gene_name.get_prefered_name(Lollipop.preferred_gene_label) &&
                                                             p.Item2 == new_id.Item2 && p.Item3 == new_id.Item3 &&
                                                             p.Item4.same_ptmset(new_id.Item4, true)))
                        {
                            e.ambiguous_identifications.Add(new_id);
                            e.ambiguous = true;
                        }
                    }
                }
            }

            // Rebuild the summary of modifications that are annotated on the theoretical protein.
            // Considers every distinct unlocalized modification id from e's PTM set and from all of its
            // ambiguous identifications, skipping PTMs of type "Deconvolution Error", in sorted order.
            e.uniprot_mods = "";
            foreach (string mod in e.ptm_set.ptm_combination.Concat(e.ambiguous_identifications.SelectMany(i => i.Item4.ptm_combination)).Where(ptm => ptm.modification.ModificationType != "Deconvolution Error").Select(ptm => UnlocalizedModification.LookUpId(ptm.modification)).ToList().Distinct().OrderBy(m => m))
            {
                // positions with mod
                // (keys of the first expanded protein's possible localized modifications that lie
                // within [e.begin, e.end] and list this modification id)
                List <int> theo_ptms = theoretical_base.ExpandedProteinList.First()
                                       .OneBasedPossibleLocalizedModifications
                                       .Where(p => p.Key >= e.begin && p.Key <= e.end &&
                                              p.Value.Select(m => UnlocalizedModification.LookUpId(m)).Contains(mod))
                                       .Select(m => m.Key).ToList();
                if (theo_ptms.Count > 0)
                {
                    e.uniprot_mods += mod + " @ " + string.Join(", ", theo_ptms) + "; ";
                }
                // More occurrences of this modification than annotated positions, in e's PTM set or in
                // any ambiguous identification, marks e as having novel modifications.
                if (e.ptm_set.ptm_combination.Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
                    .Count(m => m == mod) > theo_ptms.Count ||
                    e.ambiguous_identifications.Any(i => i.Item4.ptm_combination.Select(ptm => UnlocalizedModification.LookUpId(ptm.modification))
                                                    .Count(m => m == mod) > theo_ptms.Count))
                {
                    e.novel_mods = true;
                }
            }

            //else if (!e.topdown_id && e.gene_name.get_prefered_name(Lollipop.preferred_gene_label) != this.gene_name.get_prefered_name(Lollipop.preferred_gene_label)
            // && e.linked_proteoform_references.Count == this.linked_proteoform_references.Count + 1)
            //{
            //    e.ambiguous = true;
            //}
        }