// The calculations for unlabeled and neucode components are the same, currently
/// <summary>
/// Builds a list holding two NeuCode pairs, each assembled from a freshly created
/// light and heavy <see cref="Component"/> that share one NeuCode identification input file.
/// </summary>
/// <remarks>
/// For every pair the light component gets id "1", mass <paramref name="starter_mass"/> and
/// intensity <c>starter_intensity</c>; the heavy component gets id "2", a mass shifted by
/// <c>starter_lysine_count * Lollipop.NEUCODE_LYSINE_MASS_SHIFT</c> and half the intensity.
/// Both use <c>starter_rt</c> as retention-time apex, are marked accepted, and carry a single
/// charge state built from their own intensity and mass.
/// </remarks>
/// <param name="starter_mass">Weighted monoisotopic mass assigned to the light component.</param>
/// <returns>The two generated pairs, in creation order.</returns>
public static List<Component> generate_neucode_components(double starter_mass)
{
    List<Component> components = new List<Component>();
    InputFile inFile = new ProteoformSuiteInternal.InputFile("somepath", Labeling.NeuCode, Purpose.Identification);

    int pairs_built = 0;
    while (pairs_built < 2)
    {
        Component light = new Component
        {
            input_file = inFile,
            id = 1.ToString(),
            weighted_monoisotopic_mass = starter_mass,
            //using the special intensity sum for overlapping charge states in a neucode pair
            intensity_sum_olcs = starter_intensity,
            rt_apex = starter_rt,
            accepted = true
        };
        light.charge_states = new List<ChargeState>
        {
            new ChargeState(1, light.intensity_sum_olcs, light.weighted_monoisotopic_mass, 1.00727645D)
        };

        Component heavy = new Component
        {
            input_file = inFile,
            id = 2.ToString(),
            weighted_monoisotopic_mass = starter_mass + starter_lysine_count * Lollipop.NEUCODE_LYSINE_MASS_SHIFT,
            //using the special intensity sum for overlapping charge states in a neucode pair
            intensity_sum_olcs = starter_intensity / 2,
            rt_apex = starter_rt,
            accepted = true
        };
        heavy.charge_states = new List<ChargeState>
        {
            new ChargeState(1, heavy.intensity_sum_olcs, heavy.weighted_monoisotopic_mass, 1.00727645D)
        };

        NeuCodePair pair = new NeuCodePair(light, heavy);
        pair.lysine_count = starter_lysine_count;
        pair.calculate_properties();
        components.Add(pair);

        pairs_built++;
    }

    return components;
}
/// <summary>
/// Runs mass and/or retention-time calibration for one raw file using the supplied top-down hits.
/// </summary>
/// <remarks>
/// Steps, in order:
/// 1. Keeps hits with score &gt; 0, resets each hit's m/z from its reported mass and charge,
///    and selects the subset with score &gt;= 40 as training candidates.
/// 2. Loads the spectra file (Thermo loader for a ".raw" extension, mzML loader otherwise or
///    when the Thermo loader yields null) and collects data points via <c>GetDataPoints</c>.
/// 3. If <c>Sweet.lollipop.mass_calibration</c> is set, trains a model on
///    (m/z, retention time, log total ion current, log injection time) -> mass error and applies
///    it through <c>CalibrateHitsAndComponents</c>; optionally writes a calibrated mzML file.
/// 4. If <c>Sweet.lollipop.retention_time_calibration</c> is set, trains a model on
///    retention time -> RT error using only the data points identified by the earliest-eluting
///    hit of each <c>pfr_accession</c>, then subtracts the predicted error from the
///    <c>rt_apex</c> of every calibration component belonging to the same input file.
/// </remarks>
/// <param name="raw_file">Raw file to calibrate against; also stored in <c>this.raw_file</c>.</param>
/// <param name="topdown_hits">Candidate top-down hits.</param>
/// <returns>
/// <c>false</c> when there are fewer than 5 hits with score &gt;= 40, the data file could not be
/// loaded, fewer than 10 MS1 data points were collected, or retention-time calibration has fewer
/// than 10 training points; <c>true</c> otherwise.
/// </returns>
public bool Run_TdMzCal(InputFile raw_file, List<TopDownHit> topdown_hits)
{
    all_topdown_hits = topdown_hits.Where(h => h.score > 0).ToList();

    //need to reset m/z in case same td hits used for multiple calibration raw files...
    Parallel.ForEach(all_topdown_hits, h => h.mz = h.reported_mass.ToMz(h.charge));

    high_scoring_topdown_hits = all_topdown_hits.Where(h => h.score >= 40).ToList();
    this.raw_file = raw_file;
    if (high_scoring_topdown_hits.Count < 5)
    {
        return false;
    }

    // File extensions are not case-sensitive on the platforms that produce .raw files, so
    // compare without regard to case; otherwise "FILE.RAW" would be sent to the mzML loader.
    bool isThermoRaw = string.Equals(Path.GetExtension(raw_file.complete_path), ".raw", System.StringComparison.OrdinalIgnoreCase);
    myMsDataFile = isThermoRaw ? ThermoStaticData.LoadAllStaticData(raw_file.complete_path) : null;
    if (myMsDataFile == null)
    {
        myMsDataFile = Mzml.LoadAllStaticData(raw_file.complete_path);
    }
    if (myMsDataFile == null)
    {
        return false;
    }

    DataPointAquisitionResults dataPointAcquisitionResult = GetDataPoints();
    if (dataPointAcquisitionResult.Ms1List.Count < 10)
    {
        return false;
    }

    if (Sweet.lollipop.mass_calibration)
    {
        var myMs1DataPoints = new List<(double[] xValues, double yValue)>();
        for (int i = 0; i < dataPointAcquisitionResult.Ms1List.Count; i++)
        {
            var point = dataPointAcquisitionResult.Ms1List[i];

            //x values
            var explanatoryVariables = new double[4];
            explanatoryVariables[0] = point.mz;
            explanatoryVariables[1] = point.retentionTime;
            explanatoryVariables[2] = point.logTotalIonCurrent;
            explanatoryVariables[3] = point.logInjectionTime;

            //y value
            double mzError = point.massError;

            myMs1DataPoints.Add((explanatoryVariables, mzError));
        }

        var ms1Model = GetRandomForestModel(myMs1DataPoints);
        CalibrateHitsAndComponents(ms1Model);

        if (Sweet.lollipop.calibrate_raw_files)
        {
            MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(myMsDataFile, raw_file.directory + "\\" + raw_file.filename + "_calibrated.mzML", false);
        }
    }

    if (Sweet.lollipop.retention_time_calibration)
    {
        var myMs1DataPoints = new List<(double[] xValues, double yValue)>();

        // One pass over the hits: for every pfr_accession keep the hit with the smallest
        // ms2_retention_time. GroupBy keeps first-seen key order and OrderBy is stable, so the
        // selection matches a per-accession Where/OrderBy/First scan.
        List<TopDownHit> firstElutingTopDownHit = high_scoring_topdown_hits
            .GroupBy(h => h.pfr_accession)
            .Select(g => g.OrderBy(h => h.ms2_retention_time).First())
            .ToList();

        for (int i = 0; i < dataPointAcquisitionResult.Ms1List.Count; i++)
        {
            var point = dataPointAcquisitionResult.Ms1List[i];
            if (firstElutingTopDownHit.Contains(point.identification))
            {
                //x values
                var explanatoryVariables = new double[1];
                explanatoryVariables[0] = point.retentionTime;

                //y value
                double RTError = point.RTError;

                myMs1DataPoints.Add((explanatoryVariables, RTError));
            }
        }

        if (myMs1DataPoints.Count < 10)
        {
            return false;
        }

        var ms1Model = GetRandomForestModel(myMs1DataPoints);

        // Only components that come from the same condition / replicate / fraction as this raw file are shifted.
        var matchingComponents = Sweet.lollipop.calibration_components.Where(comp =>
            comp.input_file.lt_condition == raw_file.lt_condition
            && comp.input_file.biological_replicate == raw_file.biological_replicate
            && comp.input_file.fraction == raw_file.fraction
            && comp.input_file.technical_replicate == raw_file.technical_replicate);

        foreach (Component c in matchingComponents)
        {
            c.rt_apex = c.rt_apex - ms1Model.Predict(new double[] { c.rt_apex });
        }
    }

    return true;
}
public List <string> bad_topdown_ptms = new List <string>(); //PTMs not in theoretical database added to warning file. //Reading in Top-down excel public List <TopDownHit> ReadTDFile(InputFile file) { //if neucode labeled, calculate neucode light theoretical AND observed mass! --> better for matching up //if carbamidomethylated, add 57 to theoretical mass (already in observed mass...) aaIsotopeMassList = new AminoAcidMasses(Sweet.lollipop.carbamidomethylation, Sweet.lollipop.neucode_labeled).AA_Masses; List <TopDownHit> td_hits = new List <TopDownHit>(); List <List <string> > cells = ExcelReader.get_cell_strings(file, true);//This returns the entire sheet except for the header. Each row of cells is one List<string> //get ptms on proteoform -- check for mods. IF not in database, make new topdown mod, show Warning message. Parallel.ForEach(cells, cellStrings => { bool add_topdown_hit = true; //if PTM or accession not found, will not add (show warning) if (cellStrings.Count == 24) { TopDownResultType tdResultType = (cellStrings[15] == "BioMarker") ? TopDownResultType.Biomarker : ((cellStrings[15] == "Tight Absolute Mass") ? TopDownResultType.TightAbsoluteMass : TopDownResultType.Unknown); if (tdResultType != TopDownResultType.Unknown) //uknown result type! { List <Ptm> ptm_list = new List <Ptm>(); // if nothing gets added, an empty ptmlist is passed to the topdownhit constructor. //N-term modifications if (cellStrings[10].Length > 0) //N Terminal Modification Code { string[] ptms = cellStrings[10].Split('|'); foreach (string ptm in ptms) { int position = Int32.TryParse(cellStrings[5], out int i) ? 
i : 0; if (position == 0) { add_topdown_hit = false; continue; } if (cellStrings[10].Split(':')[1] == "1458")//PSI-MOD 1458 is supposed to be N-terminal acetylation { ptm_list.Add(new Ptm(position, Sweet.lollipop.theoretical_database.uniprotModifications.Values.SelectMany(m => m).Where(m => m.OriginalId == "N-terminal Acetyl").FirstOrDefault())); } else { string psimod = ptm.Split(':')[1].Split('@')[0];//The number after the @ is the position in the protein while (psimod.Length < 5) { psimod = "0" + psimod; //short part should be the accession number, which is an integer } Modification mod = Sweet.lollipop.theoretical_database.uniprotModifications.Values.SelectMany(m => m).Where(m => m.DatabaseReference != null && m.DatabaseReference.ContainsKey("PSI-MOD") && m.DatabaseReference["PSI-MOD"].Contains(psimod)).FirstOrDefault(); if (mod == null) { psimod = "MOD:" + psimod; mod = Sweet.lollipop.theoretical_database.uniprotModifications.Values.SelectMany(m => m).Where(m => m.DatabaseReference != null && m.DatabaseReference.ContainsKey("PSI-MOD") && m.DatabaseReference["PSI-MOD"].Contains(psimod)).FirstOrDefault(); } if (mod != null) { ptm_list.Add(new Ptm(position, mod)); } else { lock (bad_topdown_ptms) { bad_topdown_ptms.Add("PSI-MOD:" + psimod + " at " + position); } add_topdown_hit = false; } } } } //don't have example of c-term modification to write code //other mods if (cellStrings[9].Length > 0)//Modification Codes { string[] ptms = cellStrings[9].Split('|'); foreach (string ptm in ptms) { Modification mod = null; string id = ""; if (ptm.Split(':').Length < 2) { add_topdown_hit = false; continue; } if (ptm.Split(':')[1].Split('@').Length < 2) { add_topdown_hit = false; continue; } int position_after_begin = (Int32.TryParse(ptm.Split(':')[1].Split('@')[1], out int j) ? 
j : -1) + 1; //one based sequence //they give position # as from begin site -> want to report in terms of overall sequence #'s //begin + position from begin - 1 => position in overall sequence if (position_after_begin == 0) { add_topdown_hit = false; continue; } int begin = Int32.TryParse(cellStrings[5], out int k) ? k : 0; if (begin == 0) { add_topdown_hit = false; continue; } int position = begin + position_after_begin - 1; if (ptm.Split(':')[0] == "RESID") { string resid = ptm.Split(':')[1].Split('@')[0];//The number after the @ is the position in the protein while (resid.Length < 4) { resid = "0" + resid; //short part should be the accession number, which is an integer } resid = "AA" + resid; id = "RESID:" + resid; mod = Sweet.lollipop.theoretical_database.uniprotModifications.Values.SelectMany(m => m).Where(m => m.DatabaseReference != null && m.DatabaseReference.ContainsKey("RESID") && m.DatabaseReference["RESID"].Contains(resid)).FirstOrDefault(); } else if (ptm.Split(':')[0] == "PSI-MOD") { string psimod = ptm.Split(':')[1].Split('@')[0];//The number after the @ is the position in the protein while (psimod.Length < 5) { psimod = "0" + psimod; //short part should be the accession number, which is an integer } mod = Sweet.lollipop.theoretical_database.uniprotModifications.Values.SelectMany(m => m).Where(m => m.DatabaseReference != null && m.DatabaseReference.ContainsKey("PSI-MOD") && m.DatabaseReference["PSI-MOD"].Contains(psimod)).FirstOrDefault(); if (mod == null) { psimod = "MOD:" + psimod; mod = Sweet.lollipop.theoretical_database.uniprotModifications.Values.SelectMany(m => m).Where(m => m.DatabaseReference != null && m.DatabaseReference.ContainsKey("PSI-MOD") && m.DatabaseReference["PSI-MOD"].Contains(psimod)).FirstOrDefault(); } id = "PSI-MOD:" + psimod; } if (mod != null) { ptm_list.Add(new Ptm(position, mod)); } else { lock (bad_topdown_ptms) { bad_topdown_ptms.Add(id + " at " + cellStrings[4][position_after_begin - 1]); } add_topdown_hit = false; } }
//Reading in MetaMorpheus excel
/// <summary>
/// Reads a MetaMorpheus results sheet and converts every usable row into a <see cref="TopDownHit"/>.
/// </summary>
/// <remarks>
/// Rows are processed in parallel. A row is skipped when:
/// - it does not contain exactly 55 cells;
/// - its full sequence cannot be parsed (an <c>MzLibUtil.MzLibException</c> is raised) or it carries
///   a modification whose <c>IdWithMotif</c> is not present in
///   <c>Sweet.lollipop.theoretical_database.uniprotModifications</c>; both cases add an entry to
///   <c>bad_topdown_ptms</c>;
/// - its "Start and End Residues In Protein" cell is empty or not in the expected
///   "[start to end]" layout;
/// - any of begin, end, theoretical mass, pscore, reported mass, score, MS2 scan number or
///   MS2 retention time of the resulting hit is not positive.
/// Also refreshes <c>aaIsotopeMassList</c> from the current carbamidomethylation / NeuCode settings.
/// </remarks>
/// <param name="file">Input file handed to <c>ExcelReader.get_cell_strings</c> and to each created hit.</param>
/// <returns>The hits created from the sheet; order is not deterministic because rows are read in parallel.</returns>
public List<TopDownHit> ReadMetamopheusFile(InputFile file)
{
    //if neucode labeled, calculate neucode light theoretical AND observed mass! --> better for matching up
    //if carbamidomethylated, add 57 to theoretical mass (already in observed mass...)
    aaIsotopeMassList = new AminoAcidMasses(Sweet.lollipop.carbamidomethylation, Sweet.lollipop.neucode_labeled).AA_Masses;
    List<TopDownHit> td_hits = new List<TopDownHit>();

    //creates dictionary to find mods
    Dictionary<string, Modification> mods = Sweet.lollipop.theoretical_database.all_mods_with_mass.ToDictionary(kv => kv.IdWithMotif, kv => kv);

    // Ids of every modification in the theoretical database, collected once so that each row only
    // needs a set lookup instead of re-scanning the whole database for every modification.
    HashSet<string> known_mod_ids = new HashSet<string>(
        Sweet.lollipop.theoretical_database.uniprotModifications.Values.SelectMany(m => m).Select(m => m.IdWithMotif));

    //This returns the entire sheet except for the header. Each row of cells is one List<string>
    List<List<string>> cells = ExcelReader.get_cell_strings(file, true);

    //This is the excel file header:
    //cellStrings[0]=File Name
    //cellStrings[1]=Scan Number
    //cellStrings[2]=Scan Retention Time
    //cellStrings[3]=Num Experimental Peaks
    //cellStrings[4]=Total Ion Current
    //cellStrings[5]=Precursor Scan Number
    //cellStrings[6]=Precursor Charge
    //cellStrings[7]=Precursor MZ
    //cellStrings[8]=Precursor Mass
    //cellStrings[9]=Score
    //cellStrings[10]=Delta Score
    //cellStrings[11]=Notch
    //cellStrings[12]=Different Peak Matches
    //cellStrings[13]=Base Sequence
    //cellStrings[14]=Full Sequence
    //cellStrings[15]=Essential Sequence
    //cellStrings[16]=PSM Count
    //cellStrings[17]=Mods
    //cellStrings[18]=Mods Chemical Formulas
    //cellStrings[19]=Mods Combined Chemical Formula
    //cellStrings[20]=Num Variable Mods
    //cellStrings[21]=Missed Cleavages
    //cellStrings[22]=Peptide Monoisotopic Mass
    //cellStrings[23]=Mass Diff (Da)
    //cellStrings[24]=Mass Diff (ppm)
    //cellStrings[25]=Protein Accession
    //cellStrings[26]=Protein Name
    //cellStrings[27]=Gene Name
    //cellStrings[28]=Organism Name
    //cellStrings[29]=Intersecting Sequence Variations
    //cellStrings[30]=Identified Sequence Variations
    //cellStrings[31]=Splice Sites
    //cellStrings[32]=Contaminant
    //cellStrings[33]=Decoy
    //cellStrings[34]=Peptide Description
    //cellStrings[35]=Start and End Residues In Protein
    //cellStrings[36]=Previous Amino Acid
    //cellStrings[37]=Next Amino Acid
    //cellStrings[38]=All Scores
    //cellStrings[39]=Theoreticals Searched
    //cellStrings[40]=Decoy/Contaminant/Target
    //cellStrings[41]=Matched Ion Series
    //cellStrings[42]=Matched Ion Mass-To-Charge Ratios
    //cellStrings[43]=Matched Ion Mass Diff (Da)
    //cellStrings[44]=Matched Ion Mass Diff (Ppm)
    //cellStrings[45]=Matched Ion Intensities
    //cellStrings[46]=Matched Ion Counts
    //cellStrings[47]=Localized Scores
    //cellStrings[48]=Improvement Possible
    //cellStrings[49]=Cumulative Target
    //cellStrings[50]=Cumulative Decoy
    //cellStrings[51]=QValue
    //cellStrings[52]=Cumulative Target Notch
    //cellStrings[53]=Cumulative Decoy Notch
    //cellStrings[54]=QValue Notch
    //cellStrings[55]=eValue
    //cellStrings[56]=eScore
    // NOTE(review): the listing above names 57 columns while rows are only accepted with exactly 55 cells — confirm against the export format.

    //get ptms on proteoform -- check for mods. If not in database, record it in bad_topdown_ptms (shown as a warning).
    Parallel.ForEach(cells, cellStrings =>
    {
        if (cellStrings.Count != 55)
        {
            return;
        }

        bool add_topdown_hit = true; //if PTM or accession not found, will not add (show warning)
        List<Ptm> new_ptm_list = new List<Ptm>();

        //a bad mod raises MzLibException here; it is caught and recorded in bad_topdown_ptms
        try
        {
            PeptideWithSetModifications modsIdentifier = new PeptideWithSetModifications(cellStrings[14].Split('|')[0], mods);
            var ptm_list = modsIdentifier.AllModsOneIsNterminus;

            //for each entry in ptm_list make a new Ptm and add it to the new_ptm_list
            foreach (KeyValuePair<int, Proteomics.Modification> entry in ptm_list)
            {
                if (known_mod_ids.Contains(entry.Value.IdWithMotif))
                {
                    new_ptm_list.Add(new Ptm(entry.Key, entry.Value));
                }
                else
                {
                    lock (bad_topdown_ptms)
                    {
                        //error is somewhere in the sequence
                        bad_topdown_ptms.Add("Mod Name:" + entry.Value.IdWithMotif + " at " + entry.Key);
                        add_topdown_hit = false;
                    }
                }
            }
        }
        catch (MzLibUtil.MzLibException)
        {
            lock (bad_topdown_ptms)
            {
                //error is somewhere in the sequence
                bad_topdown_ptms.Add("Bad mod at " + cellStrings[0] + " scan " + cellStrings[1]);
                add_topdown_hit = false;
            }
        }

        //if there was a bad mod, or no residue range is given, no hit is created for this row
        if (!add_topdown_hit || cellStrings[35].Length == 0)
        {
            return;
        }

        //The residue cell looks like "[start to end]" (optionally several, separated by '|'); only the first is used.
        string[] ids = cellStrings[35].Split('|');
        string[] index = ids[0].Split(' ');
        if (index.Length < 3)
        {
            return; //malformed range: skip the row instead of throwing out of the parallel loop
        }

        //splits the string to get the value of starting index
        string[] startIndexValue = index[0].Split('[');
        if (startIndexValue.Length < 2)
        {
            return; //no '[' before the start residue: skip the row
        }
        string startResidues = startIndexValue[1];

        //splits string to get value of ending index
        string endResidues = index[2].Split(']')[0];

        //unparsable numbers become 0 and are rejected by the positivity check below
        int begin = Int32.TryParse(startResidues, out int parsedBegin) ? parsedBegin : 0;
        int end = Int32.TryParse(endResidues, out int parsedEnd) ? parsedEnd : 0;
        double precursorMass = Double.TryParse(cellStrings[8], out double parsedPrecursorMass) ? parsedPrecursorMass : 0;
        double peptideMonoisotopicMass = Double.TryParse(cellStrings[22], out double parsedPeptideMass) ? parsedPeptideMass : 0;
        int scanNumber = Int32.TryParse(cellStrings[1], out int parsedScan) ? parsedScan : 0;
        double scanRetentionTime = Double.TryParse(cellStrings[2], out double parsedRt) ? parsedRt : 0;

        // NOTE(review): the Precursor Mass column is supplied for two separate constructor arguments — confirm this is intended.
        TopDownHit td_hit = new TopDownHit(
            aaIsotopeMassList,
            file,
            TopDownResultType.TightAbsoluteMass,
            cellStrings[25],
            cellStrings[14],
            cellStrings[25],
            cellStrings[26],
            cellStrings[13],
            begin,
            end,
            new_ptm_list,
            precursorMass,
            peptideMonoisotopicMass,
            scanNumber,
            scanRetentionTime,
            cellStrings[0].Split('.')[0],
            precursorMass,
            Sweet.lollipop.min_score_td + 1);

        bool hit_is_complete = td_hit.begin > 0 && td_hit.end > 0
            && td_hit.theoretical_mass > 0 && td_hit.pscore > 0
            && td_hit.reported_mass > 0 && td_hit.score > 0
            && td_hit.ms2ScanNumber > 0 && td_hit.ms2_retention_time > 0;

        if (hit_is_complete)
        {
            lock (td_hits)
            {
                td_hits.Add(td_hit);
            }
        }
    });

    return td_hits;
}