/// <summary>
/// Computes sequence coverage for every protein in <c>ListOfProteinsOrderedByAccession</c> and appends
/// the results to <c>SequenceCoverageFraction</c>, <c>SequenceCoverageDisplayList</c>,
/// <c>SequenceCoverageDisplayListWithMods</c> and, when localized mods were observed, <c>ModsInfo</c>.
/// </summary>
/// <remarks>
/// Only PSMs with a non-null <c>BaseSequence</c> contribute to coverage, and only PSMs that also have a
/// non-null <c>FullSequence</c> contribute to the modification annotations.
/// Casing of the coverage display uses the invariant culture so that the result does not depend on the
/// current thread culture (e.g. 'I' must lower-case to 'i', never to the Turkish dotless 'ı').
/// </remarks>
public void CalculateSequenceCoverage()
{
    var proteinsWithUnambigSeqPsms = new Dictionary<Protein, List<PeptideWithSetModifications>>();
    var proteinsWithPsmsWithLocalizedMods = new Dictionary<Protein, List<PeptideWithSetModifications>>();

    foreach (var protein in Proteins)
    {
        proteinsWithUnambigSeqPsms.Add(protein, new List<PeptideWithSetModifications>());
        proteinsWithPsmsWithLocalizedMods.Add(protein, new List<PeptideWithSetModifications>());
    }

    foreach (var psm in AllPsmsBelowOnePercentFDR)
    {
        // null BaseSequence means that the amino acid sequence is ambiguous; do not use these to calculate sequence coverage
        if (psm.BaseSequence == null)
        {
            continue;
        }

        foreach (var peptide in psm.BestMatchingPeptides.Select(p => p.Peptide))
        {
            // might be unambiguous but also shared; make sure this protein group contains this peptide+protein combo
            if (!Proteins.Contains(peptide.Protein))
            {
                continue;
            }

            proteinsWithUnambigSeqPsms[peptide.Protein].Add(peptide);

            // null FullSequence means that mods were not successfully localized; do not display them on the sequence coverage mods info
            if (psm.FullSequence != null)
            {
                proteinsWithPsmsWithLocalizedMods[peptide.Protein].Add(peptide);
            }
        }
    }

    foreach (var protein in ListOfProteinsOrderedByAccession)
    {
        // get residue numbers of each peptide in the protein and identify them as observed if the sequence is unambiguous
        var coveredOneBasedResidues = new HashSet<int>();
        foreach (var peptide in proteinsWithUnambigSeqPsms[protein])
        {
            for (int i = peptide.OneBasedStartResidueInProtein; i <= peptide.OneBasedEndResidueInProtein; i++)
            {
                coveredOneBasedResidues.Add(i);
            }
        }

        // fraction of the protein's residues covered by at least one unambiguous peptide
        double seqCoverageFract = (double)coveredOneBasedResidues.Count / protein.Length;
        SequenceCoverageFraction.Add(seqCoverageFract);

        // lower-case everything, then upper-case the observed residues;
        // invariant casing keeps the display independent of the current culture
        var coverageArray = protein.BaseSequence.ToLowerInvariant().ToCharArray();
        foreach (var obsResidueLocation in coveredOneBasedResidues)
        {
            coverageArray[obsResidueLocation - 1] = char.ToUpperInvariant(coverageArray[obsResidueLocation - 1]);
        }

        string sequenceCoverageDisplay = new string(coverageArray);
        SequenceCoverageDisplayList.Add(sequenceCoverageDisplay);

        // put mods in the sequence coverage display
        // get mods to display in sequence (only unambiguously identified mods)
        var modsOnThisProtein = new HashSet<KeyValuePair<int, Modification>>();
        foreach (var pep in proteinsWithPsmsWithLocalizedMods[protein])
        {
            foreach (var mod in pep.AllModsOneIsNterminus)
            {
                if (!mod.Value.ModificationType.Contains("PeptideTermMod")
                    && !mod.Value.ModificationType.Contains("Common Variable")
                    && !mod.Value.ModificationType.Contains("Common Fixed"))
                {
                    // presumably key 1 is the peptide N-terminus and key k+1 is peptide residue k,
                    // which is why 2 is subtracted to get a one-based protein position - confirm
                    modsOnThisProtein.Add(new KeyValuePair<int, Modification>(pep.OneBasedStartResidueInProtein + mod.Key - 2, mod.Value));
                }
            }
        }

        var tempMods = modsOnThisProtein.OrderBy(p => p.Key).ToList();
        foreach (var mod in tempMods)
        {
            if (mod.Value.LocationRestriction.Equals("N-terminal."))
            {
                sequenceCoverageDisplay = sequenceCoverageDisplay.Insert(0, $"[{mod.Value.IdWithMotif}]-");
            }
            else if (mod.Value.LocationRestriction.Equals("Anywhere."))
            {
                // the insertion point is measured back from the end of the (already annotated) string
                int modStringIndex = sequenceCoverageDisplay.Length - (protein.Length - mod.Key);
                sequenceCoverageDisplay = sequenceCoverageDisplay.Insert(modStringIndex, $"[{mod.Value.IdWithMotif}]");
            }
            else if (mod.Value.LocationRestriction.Equals("C-terminal."))
            {
                sequenceCoverageDisplay = sequenceCoverageDisplay.Insert(sequenceCoverageDisplay.Length, $"-[{mod.Value.IdWithMotif}]");
            }
        }

        SequenceCoverageDisplayListWithMods.Add(sequenceCoverageDisplay);

        if (!modsOnThisProtein.Any())
        {
            continue;
        }

        // calculate spectral count % of modified observations
        var pepModTotals = new List<int>(); // count of modified peptides for each mod/index
        var pepTotals = new List<int>(); // count of all peptides for each mod/index
        var modIndex = new List<(int index, string modName)>(); // index and name of the modified position

        foreach (var pep in proteinsWithPsmsWithLocalizedMods[protein])
        {
            foreach (var mod in pep.AllModsOneIsNterminus)
            {
                int pepNumTotal = 0; // for one mod, the number of peptides overlapping its position

                // NOTE(review): the branches below compare LocationRestriction to string literals, whereas
                // these two comparisons use ModLocationOnPeptideOrProtein members - confirm they can match
                if (mod.Value.ModificationType.Contains("Common Variable")
                    || mod.Value.ModificationType.Contains("Common Fixed")
                    || mod.Value.LocationRestriction.Equals(ModLocationOnPeptideOrProtein.PepC)
                    || mod.Value.LocationRestriction.Equals(ModLocationOnPeptideOrProtein.NPep))
                {
                    continue;
                }

                int indexInProtein;
                if (mod.Value.LocationRestriction.Equals("N-terminal."))
                {
                    indexInProtein = 1;
                }
                else if (mod.Value.LocationRestriction.Equals("Anywhere."))
                {
                    indexInProtein = pep.OneBasedStartResidueInProtein + mod.Key - 2;
                }
                else if (mod.Value.LocationRestriction.Equals("C-terminal."))
                {
                    indexInProtein = protein.Length;
                }
                else
                {
                    // In case it's a peptide terminal mod, skip!
                    // we don't want this annotated in the protein's modifications
                    continue;
                }

                var modKey = (indexInProtein, mod.Value.IdWithMotif);
                if (modIndex.Contains(modKey))
                {
                    pepModTotals[modIndex.IndexOf(modKey)] += 1;
                }
                else
                {
                    modIndex.Add(modKey);

                    // count every localized peptide that spans this position; for position 1 the
                    // lower bound is relaxed by one residue
                    foreach (var pept in proteinsWithPsmsWithLocalizedMods[protein])
                    {
                        if (indexInProtein >= pept.OneBasedStartResidueInProtein - (indexInProtein == 1 ? 1 : 0)
                            && indexInProtein <= pept.OneBasedEndResidueInProtein)
                        {
                            pepNumTotal += 1;
                        }
                    }

                    pepTotals.Add(pepNumTotal);
                    pepModTotals.Add(1);
                }
            }
        }

        var modStrings = new List<(int aaNum, string part)>();
        for (int i = 0; i < pepModTotals.Count; i++)
        {
            string aa = modIndex[i].index.ToString();
            string modName = modIndex[i].modName.ToString();
            string occupancy = ((double)pepModTotals[i] / (double)pepTotals[i]).ToString("F2");
            string fractOccupancy = $"{pepModTotals[i].ToString()}/{pepTotals[i].ToString()}";
            string tempString = $"#aa{aa}[{modName},info:occupancy={occupancy}({fractOccupancy})]";
            modStrings.Add((modIndex[i].index, tempString));
        }

        // entries are reported in order of residue position, separated by ';'
        var modInfoString = string.Join(";", modStrings.OrderBy(x => x.aaNum).Select(x => x.part));
        if (!string.IsNullOrEmpty(modInfoString))
        {
            ModsInfo.Add(modInfoString);
        }
    }
}
/// <summary>
/// Computes sequence coverage for every protein in <c>ListOfProteinsOrderedByAccession</c> and appends
/// the results to <c>SequenceCoverageFraction</c>, <c>SequenceCoverageDisplayList</c>,
/// <c>SequenceCoverageDisplayListWithMods</c> and, when localized mods were observed, <c>ModsInfo</c>.
/// </summary>
/// <remarks>
/// Each peptide is validated against the protein sequence at its reported coordinates. If a peptide's
/// coordinates fall outside the protein or the residues do not match, the protein is flagged as an error:
/// <c>double.NaN</c> is recorded as its coverage fraction, "Error calculating sequence coverage" as its
/// display string, and no modification annotations are produced for it.
/// All casing is done with the invariant culture so the validation cannot fail because of the current
/// thread culture (e.g. 'i' upper-casing to 'İ' under Turkish rules).
/// </remarks>
public void CalculateSequenceCoverage()
{
    var proteinsWithUnambigSeqPsms = new Dictionary<Protein, List<PeptideWithSetModifications>>();
    var proteinsWithPsmsWithLocalizedMods = new Dictionary<Protein, List<PeptideWithSetModifications>>();

    foreach (var protein in Proteins)
    {
        proteinsWithUnambigSeqPsms.Add(protein, new List<PeptideWithSetModifications>());
        proteinsWithPsmsWithLocalizedMods.Add(protein, new List<PeptideWithSetModifications>());
    }

    foreach (var psm in AllPsmsBelowOnePercentFDR)
    {
        // null BaseSequence means that the amino acid sequence is ambiguous; do not use these to calculate sequence coverage
        if (psm.BaseSequence == null)
        {
            continue;
        }

        foreach (var peptide in psm.BestMatchingPeptides.Select(p => p.Peptide))
        {
            // might be unambiguous but also shared; make sure this protein group contains this peptide+protein combo
            if (!Proteins.Contains(peptide.Protein))
            {
                continue;
            }

            proteinsWithUnambigSeqPsms[peptide.Protein].Add(peptide);

            // null FullSequence means that mods were not successfully localized; do not display them on the sequence coverage mods info
            if (psm.FullSequence != null)
            {
                proteinsWithPsmsWithLocalizedMods[peptide.Protein].Add(peptide);
            }
        }
    }

    foreach (var protein in ListOfProteinsOrderedByAccession)
    {
        bool errorResult = false;
        var sequenceCoverageDisplay = protein.BaseSequence.ToLower(CultureInfo.InvariantCulture);
        var coveredOneBasedResidues = new HashSet<int>();

        // get residue numbers of each peptide in the protein and identify them as observed if the sequence is unambiguous
        foreach (var peptide in proteinsWithUnambigSeqPsms[protein])
        {
            int start = peptide.OneBasedStartResidueInProtein;
            int end = peptide.OneBasedEndResidueInProtein;
            string sequenceExtractedFromProtein = string.Empty;

            if (end >= start)
            {
                // coordinates outside the protein are reported through errorResult instead of
                // throwing from the indexer; none of this peptide's residues are counted
                if (start < 1 || end > sequenceCoverageDisplay.Length)
                {
                    errorResult = true;
                    continue;
                }

                sequenceExtractedFromProtein = sequenceCoverageDisplay.Substring(start - 1, end - start + 1);
                for (int i = start; i <= end; i++)
                {
                    coveredOneBasedResidues.Add(i);
                }
            }

            // check for bugs in sequence coverage; make sure we have the right amino acids!
            // (invariant upper-casing mirrors the invariant lower-casing applied above)
            if (!sequenceExtractedFromProtein.ToUpperInvariant().Equals(peptide.BaseSequence))
            {
                errorResult = true;
            }
        }

        // calculate sequence coverage fraction
        double seqCoverageFract = (double)coveredOneBasedResidues.Count / protein.Length;
        if (seqCoverageFract > 1)
        {
            errorResult = true;
        }

        // add the coverage fraction or NaN if there was an error
        SequenceCoverageFraction.Add(errorResult ? double.NaN : seqCoverageFract);

        // convert the observed amino acids to upper case if they are unambiguously observed
        var coverageArray = sequenceCoverageDisplay.ToCharArray();
        foreach (var obsResidueLocation in coveredOneBasedResidues)
        {
            coverageArray[obsResidueLocation - 1] = char.ToUpperInvariant(coverageArray[obsResidueLocation - 1]);
        }

        sequenceCoverageDisplay = new string(coverageArray);

        // check to see if there was an errored result; if not, add the coverage display
        if (errorResult)
        {
            SequenceCoverageDisplayList.Add("Error calculating sequence coverage");

            // no mod annotations are produced for a protein whose coverage could not be validated
            continue;
        }

        SequenceCoverageDisplayList.Add(sequenceCoverageDisplay);

        // put mods in the sequence coverage display
        // get mods to display in sequence (only unambiguously identified mods)
        var modsOnThisProtein = new HashSet<KeyValuePair<int, Modification>>();
        foreach (var pep in proteinsWithPsmsWithLocalizedMods[protein])
        {
            foreach (var mod in pep.AllModsOneIsNterminus)
            {
                if (!mod.Value.ModificationType.Contains("PeptideTermMod")
                    && !mod.Value.ModificationType.Contains("Common Variable")
                    && !mod.Value.ModificationType.Contains("Common Fixed"))
                {
                    // presumably key 1 is the peptide N-terminus and key k+1 is peptide residue k,
                    // which is why 2 is subtracted to get a one-based protein position - confirm
                    modsOnThisProtein.Add(new KeyValuePair<int, Modification>(pep.OneBasedStartResidueInProtein + mod.Key - 2, mod.Value));
                }
            }
        }

        var orderedMods = modsOnThisProtein.OrderBy(p => p.Key).ToList();
        foreach (var mod in orderedMods)
        {
            if (mod.Value.LocationRestriction.Equals("N-terminal."))
            {
                sequenceCoverageDisplay = sequenceCoverageDisplay.Insert(0, "[" + mod.Value.IdWithMotif + "]-");
            }
            else if (mod.Value.LocationRestriction.Equals("Anywhere."))
            {
                // the insertion point is measured back from the end of the (already annotated) string
                int modStringIndex = sequenceCoverageDisplay.Length - (protein.Length - mod.Key);
                sequenceCoverageDisplay = sequenceCoverageDisplay.Insert(modStringIndex, "[" + mod.Value.IdWithMotif + "]");
            }
            else if (mod.Value.LocationRestriction.Equals("C-terminal."))
            {
                sequenceCoverageDisplay = sequenceCoverageDisplay.Insert(sequenceCoverageDisplay.Length, "-[" + mod.Value.IdWithMotif + "]");
            }
        }

        SequenceCoverageDisplayListWithMods.Add(sequenceCoverageDisplay);

        if (!modsOnThisProtein.Any())
        {
            continue;
        }

        // calculate spectral count percentage of modified observation
        // the four lists below are parallel: entry i describes one (position, mod name) pair
        string tempModStrings = ""; // the whole string
        List<int> tempPepModTotals = new List<int>(); // for one mod, the modified peptide count
        List<int> tempPepTotals = new List<int>(); // for one mod, the total peptide count
        List<string> tempPepModValues = new List<string>(); // for one mod, the mod name
        List<int> tempModIndex = new List<int>(); // the index of the modified position

        foreach (var pep in proteinsWithPsmsWithLocalizedMods[protein])
        {
            foreach (var mod in pep.AllModsOneIsNterminus)
            {
                // NOTE(review): the branches below compare LocationRestriction to string literals, whereas
                // these two comparisons use ModLocationOnPeptideOrProtein members - confirm they can match
                if (mod.Value.ModificationType.Contains("Common Variable")
                    || mod.Value.ModificationType.Contains("Common Fixed")
                    || mod.Value.LocationRestriction.Equals(ModLocationOnPeptideOrProtein.PepC)
                    || mod.Value.LocationRestriction.Equals(ModLocationOnPeptideOrProtein.NPep))
                {
                    continue;
                }

                int tempIndexInProtein;
                if (mod.Value.LocationRestriction.Equals("N-terminal."))
                {
                    tempIndexInProtein = 1;
                }
                else if (mod.Value.LocationRestriction.Equals("Anywhere."))
                {
                    tempIndexInProtein = pep.OneBasedStartResidueInProtein + mod.Key - 2;
                }
                else if (mod.Value.LocationRestriction.Equals("C-terminal."))
                {
                    tempIndexInProtein = protein.Length;
                }
                else
                {
                    // In case it's a peptide terminal mod, skip!
                    // we don't want this annotated in the protein's modifications
                    continue;
                }

                // look for an existing entry with the same position AND the same mod name; searching by
                // position alone would only ever find the first mod recorded at that residue
                int existingEntry = -1;
                for (int j = 0; j < tempModIndex.Count; j++)
                {
                    if (tempModIndex[j] == tempIndexInProtein && tempPepModValues[j] == mod.Value.IdWithMotif)
                    {
                        existingEntry = j;
                        break;
                    }
                }

                if (existingEntry >= 0)
                {
                    tempPepModTotals[existingEntry] += 1;
                    continue;
                }

                // first observation of this (position, mod): count every localized peptide that spans
                // the position; for position 1 the lower bound is relaxed by one residue
                int tempPepNumTotal = 0;
                foreach (var pept in proteinsWithPsmsWithLocalizedMods[protein])
                {
                    if (tempIndexInProtein >= pept.OneBasedStartResidueInProtein - (tempIndexInProtein == 1 ? 1 : 0)
                        && tempIndexInProtein <= pept.OneBasedEndResidueInProtein)
                    {
                        tempPepNumTotal += 1;
                    }
                }

                tempModIndex.Add(tempIndexInProtein);
                tempPepTotals.Add(tempPepNumTotal);
                tempPepModValues.Add(mod.Value.IdWithMotif);
                tempPepModTotals.Add(1);
            }
        }

        for (int i = 0; i < tempPepModTotals.Count; i++)
        {
            string tempString = "#aa" + tempModIndex[i].ToString()
                + "[" + tempPepModValues[i].ToString()
                + ",info:occupancy=" + ((double)tempPepModTotals[i] / (double)tempPepTotals[i]).ToString("F2")
                + "(" + tempPepModTotals[i].ToString() + "/" + tempPepTotals[i].ToString() + ")"
                + "];";
            tempModStrings += tempString;
        }

        if (!string.IsNullOrEmpty(tempModStrings))
        {
            ModsInfo.Add(tempModStrings);
        }
    }
}
/// <summary>
/// Renders this protein group as a single line of tab-separated fields. Every field, including the
/// last one, is followed by a tab; multi-valued fields are joined with '|'.
/// </summary>
/// <returns>The tab-separated text for this protein group.</returns>
public override string ToString()
{
    const string Tab = "\t";
    const string Separator = "|";
    var line = new StringBuilder();

    // protein accession numbers
    line.Append(ProteinGroupName).Append(Tab);

    // first gene name of each protein
    var geneNames = ListOfProteinsOrderedByAccession.Select(p => p.GeneNames.Select(x => x.Item2).FirstOrDefault());
    line.Append(GlobalVariables.CheckLengthOfOutput(string.Join(Separator, geneNames))).Append(Tab);

    // distinct organisms
    var organisms = ListOfProteinsOrderedByAccession.Select(p => p.Organism).Distinct();
    line.Append(GlobalVariables.CheckLengthOfOutput(string.Join(Separator, organisms))).Append(Tab);

    // distinct protein names
    var proteinNames = ListOfProteinsOrderedByAccession.Select(p => p.FullName).Distinct();
    line.Append(GlobalVariables.CheckLengthOfOutput(string.Join(Separator, proteinNames))).Append(Tab);

    // monoisotopic mass of each distinct sequence; NaN when the mass cannot be computed
    List<double> monoisotopicMasses = new List<double>();
    foreach (var distinctSequence in ListOfProteinsOrderedByAccession.Select(p => p.BaseSequence).Distinct())
    {
        try
        {
            monoisotopicMasses.Add(new Proteomics.AminoAcidPolymer.Peptide(distinctSequence).MonoisotopicMass);
        }
        catch (System.Exception)
        {
            monoisotopicMasses.Add(double.NaN);
        }
    }

    line.Append(GlobalVariables.CheckLengthOfOutput(string.Join(Separator, monoisotopicMasses))).Append(Tab);

    // number of proteins in the group
    line.Append(Proteins.Count.ToString()).Append(Tab);

    // unique peptides (column left empty when no output string is available)
    if (UniquePeptidesOutput != null)
    {
        line.Append(GlobalVariables.CheckLengthOfOutput(UniquePeptidesOutput));
    }

    line.Append(Tab);

    // shared peptides (column left empty when no output string is available)
    if (SharedPeptidesOutput != null)
    {
        line.Append(GlobalVariables.CheckLengthOfOutput(SharedPeptidesOutput));
    }

    line.Append(Tab);

    // number of peptides, distinguished by full or base sequence depending on DisplayModsOnPeptides
    var allPeptideKeys = DisplayModsOnPeptides
        ? AllPeptides.Select(p => p.FullSequence)
        : AllPeptides.Select(p => p.BaseSequence);
    line.Append(allPeptideKeys.Distinct().Count().ToString()).Append(Tab);

    // number of unique peptides, same distinction
    var uniquePeptideKeys = DisplayModsOnPeptides
        ? UniquePeptides.Select(p => p.FullSequence)
        : UniquePeptides.Select(p => p.BaseSequence);
    line.Append(uniquePeptideKeys.Distinct().Count().ToString()).Append(Tab);

    // sequence coverage fraction
    var coverageFractions = SequenceCoverageFraction.Select(p => string.Format("{0:0.#####}", p));
    line.Append(GlobalVariables.CheckLengthOfOutput(string.Join(Separator, coverageFractions))).Append(Tab);

    // sequence coverage
    line.Append(GlobalVariables.CheckLengthOfOutput(string.Join(Separator, SequenceCoverageDisplayList))).Append(Tab);

    // sequence coverage with mods
    line.Append(GlobalVariables.CheckLengthOfOutput(string.Join(Separator, SequenceCoverageDisplayListWithMods))).Append(Tab);

    // detailed mods information
    line.Append(GlobalVariables.CheckLengthOfOutput(string.Join(Separator, ModsInfo))).Append(Tab);

    // MS1 intensity (retrieved from FlashLFQ in the SearchTask)
    if (IntensitiesByFile != null && FilesForQuantification != null)
    {
        foreach (var condition in FilesForQuantification.GroupBy(p => p.Condition))
        {
            var bioreps = condition.GroupBy(p => p.BiologicalReplicate).OrderBy(p => p.Key);
            foreach (var biorep in bioreps)
            {
                // if the samples are fractionated, the protein will only have 1 intensity in the first fraction
                // and the other fractions will be zero. summing the fractions is simpler than locating
                // the single non-zero value
                double intensityTotal = biorep.Sum(file => IntensitiesByFile[file]);
                if (intensityTotal > 0)
                {
                    line.Append(intensityTotal);
                }

                line.Append(Tab);
            }
        }
    }

    // number of PSMs for listed peptides
    line.Append(AllPsmsBelowOnePercentFDR.Count.ToString()).Append(Tab);

    // decoy / contaminant / target flag
    line.Append(IsDecoy ? "D" : (IsContaminant ? "C" : "T")).Append(Tab);

    // cumulative target, cumulative decoy, q value
    line.Append(CumulativeTarget).Append(Tab);
    line.Append(CumulativeDecoy).Append(Tab);
    line.Append(QValue).Append(Tab);

    // best peptide score and q value
    line.Append(BestPeptideScore).Append(Tab);
    line.Append(BestPeptideQValue).Append(Tab);

    return line.ToString();
}
/// <summary>
/// Renders this protein group as a single line of tab-separated fields. Every field, including the
/// last one, is followed by a tab; multi-valued fields are joined with '|'.
/// </summary>
/// <returns>The tab-separated text for this protein group.</returns>
public override string ToString()
{
    const string Tab = "\t";
    const string Separator = "|";
    var line = new StringBuilder();

    // protein accession numbers
    line.Append(ProteinGroupName).Append(Tab);

    // first gene name of each protein
    var geneNames = ListOfProteinsOrderedByAccession.Select(p => p.GeneNames.Select(x => x.Item2).FirstOrDefault());
    line.Append(GlobalVariables.CheckLengthOfOutput(string.Join(Separator, geneNames))).Append(Tab);

    // distinct organisms
    var organisms = ListOfProteinsOrderedByAccession.Select(p => p.Organism).Distinct();
    line.Append(GlobalVariables.CheckLengthOfOutput(string.Join(Separator, organisms))).Append(Tab);

    // distinct protein names
    var proteinNames = ListOfProteinsOrderedByAccession.Select(p => p.FullName).Distinct();
    line.Append(GlobalVariables.CheckLengthOfOutput(string.Join(Separator, proteinNames))).Append(Tab);

    // monoisotopic mass of each distinct sequence; NaN when the mass cannot be computed
    List<double> monoisotopicMasses = new List<double>();
    foreach (var distinctSequence in ListOfProteinsOrderedByAccession.Select(p => p.BaseSequence).Distinct())
    {
        try
        {
            monoisotopicMasses.Add(new Proteomics.AminoAcidPolymer.Peptide(distinctSequence).MonoisotopicMass);
        }
        catch (System.Exception)
        {
            monoisotopicMasses.Add(double.NaN);
        }
    }

    line.Append(GlobalVariables.CheckLengthOfOutput(string.Join(Separator, monoisotopicMasses))).Append(Tab);

    // number of proteins in the group
    line.Append(Proteins.Count.ToString()).Append(Tab);

    // unique peptides, listed by full or base sequence depending on DisplayModsOnPeptides
    var uniqueSequences = DisplayModsOnPeptides
        ? UniquePeptides.Select(p => p.FullSequence)
        : UniquePeptides.Select(p => p.BaseSequence);
    line.Append(GlobalVariables.CheckLengthOfOutput(string.Join(Separator, uniqueSequences.Distinct()))).Append(Tab);

    // shared peptides: everything in AllPeptides that is not in UniquePeptides
    var sharedPeptides = AllPeptides.Except(UniquePeptides);
    var sharedSequences = DisplayModsOnPeptides
        ? sharedPeptides.Select(p => p.FullSequence)
        : sharedPeptides.Select(p => p.BaseSequence);
    line.Append(GlobalVariables.CheckLengthOfOutput(string.Join(Separator, sharedSequences.Distinct()))).Append(Tab);

    // number of peptides
    var allSequences = DisplayModsOnPeptides
        ? AllPeptides.Select(p => p.FullSequence)
        : AllPeptides.Select(p => p.BaseSequence);
    line.Append(allSequences.Distinct().Count().ToString()).Append(Tab);

    // number of unique peptides
    line.Append(uniqueSequences.Distinct().Count().ToString()).Append(Tab);

    // sequence coverage fraction
    var coverageFractions = SequenceCoverageFraction.Select(p => string.Format("{0:0.#####}", p));
    line.Append(GlobalVariables.CheckLengthOfOutput(string.Join(Separator, coverageFractions))).Append(Tab);

    // sequence coverage
    line.Append(GlobalVariables.CheckLengthOfOutput(string.Join(Separator, SequenceCoverageDisplayList))).Append(Tab);

    // sequence coverage with mods
    line.Append(GlobalVariables.CheckLengthOfOutput(string.Join(Separator, SequenceCoverageDisplayListWithMods))).Append(Tab);

    // detailed mods information
    line.Append(GlobalVariables.CheckLengthOfOutput(string.Join(Separator, ModsInfo))).Append(Tab);

    // MS1 intensity (retrieved from FlashLFQ in the SearchTask): one column per file,
    // left empty when the intensity is not positive
    if (IntensitiesByFile != null && FilesForQuantification != null)
    {
        foreach (var file in FilesForQuantification)
        {
            var intensity = IntensitiesByFile[file];
            if (intensity > 0)
            {
                line.Append(intensity);
            }

            line.Append(Tab);
        }
    }

    // number of PSMs for listed peptides
    line.Append(AllPsmsBelowOnePercentFDR.Count.ToString()).Append(Tab);

    // decoy / contaminant / target flag
    line.Append(IsDecoy ? "D" : (IsContaminant ? "C" : "T")).Append(Tab);

    // cumulative target, cumulative decoy, q value
    line.Append(CumulativeTarget).Append(Tab);
    line.Append(CumulativeDecoy).Append(Tab);
    line.Append(QValue).Append(Tab);

    // best peptide score and q value
    line.Append(BestPeptideScore).Append(Tab);
    line.Append(BestPeptideQValue).Append(Tab);

    return line.ToString();
}