// The calculations for unlabeled and neucode components are the same, currently
        /// <summary>
        /// Builds a list containing two NeuCode pairs. Both pairs are created from the same settings:
        /// a light component with id "1" and a heavy component with id "2", each carrying a single
        /// charge state, sharing one NeuCode identification input file.
        /// </summary>
        /// <param name="starter_mass">
        /// Weighted monoisotopic mass given to the light component. The heavy component's mass is this
        /// value plus starter_lysine_count * Lollipop.NEUCODE_LYSINE_MASS_SHIFT.
        /// </param>
        /// <returns>The two NeuCodePair objects, stored as Components.</returns>
        public static List<Component> generate_neucode_components(double starter_mass)
        {
            InputFile neucodeFile = new ProteoformSuiteInternal.InputFile("somepath", Labeling.NeuCode, Purpose.Identification);
            List<Component> pairs = new List<Component>();

            int pairsLeft = 2;
            while (pairsLeft > 0)
            {
                pairsLeft--;

                // Light member of the pair: full starter intensity at the starter mass.
                Component lightComponent = new Component
                {
                    input_file = neucodeFile,
                    id = 1.ToString(),
                    weighted_monoisotopic_mass = starter_mass,
                    intensity_sum_olcs = starter_intensity, // intensity sum for overlapping charge states in a neucode pair
                    rt_apex = starter_rt,
                    accepted = true
                };
                lightComponent.charge_states = new List<ChargeState>
                {
                    new ChargeState(1, lightComponent.intensity_sum_olcs, lightComponent.weighted_monoisotopic_mass, 1.00727645D)
                };

                // Heavy member of the pair: half the starter intensity, shifted by the lysine mass offset.
                Component heavyComponent = new Component
                {
                    input_file = neucodeFile,
                    id = 2.ToString(),
                    weighted_monoisotopic_mass = starter_mass + starter_lysine_count * Lollipop.NEUCODE_LYSINE_MASS_SHIFT,
                    intensity_sum_olcs = starter_intensity / 2, // intensity sum for overlapping charge states in a neucode pair
                    rt_apex = starter_rt,
                    accepted = true
                };
                heavyComponent.charge_states = new List<ChargeState>
                {
                    new ChargeState(1, heavyComponent.intensity_sum_olcs, heavyComponent.weighted_monoisotopic_mass, 1.00727645D)
                };

                NeuCodePair pair = new NeuCodePair(lightComponent, heavyComponent);
                pair.lysine_count = starter_lysine_count;
                pair.calculate_properties();
                pairs.Add(pair);
            }

            return pairs;
        }
Beispiel #2
0
        /// <summary>
        /// Runs calibration for one raw data file using the supplied top-down hits.
        /// Depending on the Sweet.lollipop settings this performs mass calibration (optionally writing a
        /// calibrated mzML next to the raw file) and/or retention-time calibration of the calibration
        /// components that belong to the same condition / biological replicate / fraction / technical
        /// replicate as <paramref name="raw_file"/>.
        /// </summary>
        /// <param name="raw_file">The spectra file to load; ".raw" files (any letter case) are read with
        /// ThermoStaticData, everything else with Mzml.</param>
        /// <param name="topdown_hits">Candidate hits; only those with score &gt; 0 are kept, and those with
        /// score &gt;= 40 are treated as high scoring.</param>
        /// <returns>
        /// false when there are fewer than 5 high-scoring hits, the data file could not be loaded, fewer than
        /// 10 MS1 data points were acquired, or retention-time calibration is enabled but has fewer than 10
        /// training points (mass calibration, if enabled, has already been applied at that point);
        /// otherwise true.
        /// </returns>
        public bool Run_TdMzCal(InputFile raw_file, List<TopDownHit> topdown_hits)
        {
            all_topdown_hits = topdown_hits.Where(h => h.score > 0).ToList();

            // The same hits may be used for several calibration raw files, so reset m/z from the reported mass first.
            Parallel.ForEach(all_topdown_hits, h => h.mz = h.reported_mass.ToMz(h.charge));

            high_scoring_topdown_hits = all_topdown_hits.Where(h => h.score >= 40).ToList();
            this.raw_file = raw_file;

            if (high_scoring_topdown_hits.Count < 5)
            {
                return false;
            }

            // Compare the extension case-insensitively so that ".RAW" files are not handed to the mzML loader.
            bool isThermoRaw = string.Equals(Path.GetExtension(raw_file.complete_path), ".raw", System.StringComparison.OrdinalIgnoreCase);
            myMsDataFile = isThermoRaw ?
                           ThermoStaticData.LoadAllStaticData(raw_file.complete_path) :
                           null;
            if (myMsDataFile == null)
            {
                myMsDataFile = Mzml.LoadAllStaticData(raw_file.complete_path);
            }
            if (myMsDataFile == null)
            {
                return false;
            }

            DataPointAquisitionResults dataPointAcquisitionResult = GetDataPoints();

            if (dataPointAcquisitionResult.Ms1List.Count < 10)
            {
                return false;
            }

            if (Sweet.lollipop.mass_calibration)
            {
                // Training set: (m/z, retention time, log TIC, log injection time) -> mass error.
                var massTrainingPoints = new List<(double[] xValues, double yValue)>();
                foreach (var point in dataPointAcquisitionResult.Ms1List)
                {
                    var explanatoryVariables = new double[]
                    {
                        point.mz,
                        point.retentionTime,
                        point.logTotalIonCurrent,
                        point.logInjectionTime
                    };
                    massTrainingPoints.Add((explanatoryVariables, point.massError));
                }

                var massErrorModel = GetRandomForestModel(massTrainingPoints);

                CalibrateHitsAndComponents(massErrorModel);
                if (Sweet.lollipop.calibrate_raw_files)
                {
                    MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(myMsDataFile,
                                                                          raw_file.directory + "\\" + raw_file.filename + "_calibrated.mzML", false);
                }
            }

            if (Sweet.lollipop.retention_time_calibration)
            {
                // For every distinct PFR accession keep the high-scoring hit with the smallest MS2 retention time.
                // GroupBy yields groups in order of first appearance and keeps element order within a group,
                // so this selects the same hits, in the same order, as a per-accession Where/OrderBy/First scan.
                List<TopDownHit> firstElutingTopDownHit = high_scoring_topdown_hits
                    .GroupBy(h => h.pfr_accession)
                    .Select(g => g.OrderBy(h => h.ms2_retention_time).First())
                    .ToList();

                // Training set: retention time -> retention time error, restricted to the first-eluting hits.
                var rtTrainingPoints = new List<(double[] xValues, double yValue)>();
                foreach (var point in dataPointAcquisitionResult.Ms1List)
                {
                    if (firstElutingTopDownHit.Contains(point.identification))
                    {
                        rtTrainingPoints.Add((new double[] { point.retentionTime }, point.RTError));
                    }
                }

                if (rtTrainingPoints.Count < 10)
                {
                    return false;
                }

                var rtErrorModel = GetRandomForestModel(rtTrainingPoints);

                // Only components that come from the same sample slot as this raw file are shifted.
                foreach (Component c in Sweet.lollipop.calibration_components.Where(h =>
                             h.input_file.lt_condition == raw_file.lt_condition &&
                             h.input_file.biological_replicate == raw_file.biological_replicate &&
                             h.input_file.fraction == raw_file.fraction &&
                             h.input_file.technical_replicate == raw_file.technical_replicate))
                {
                    c.rt_apex = c.rt_apex - rtErrorModel.Predict(new double[] { c.rt_apex });
                }
            }

            return true;
        }
        // Descriptions of PTMs found in top-down result files that could not be matched to a modification
        // in the theoretical database. Hits carrying such a PTM are not added by the readers.
        // ReadTDFile and ReadMetamopheusFile append to this list from inside Parallel.ForEach,
        // taking a lock on the list itself around each Add.
        public List <string> bad_topdown_ptms = new List <string>(); //PTMs not in theoretical database added to warning file.

        //Reading in Top-down excel
        public List <TopDownHit> ReadTDFile(InputFile file)
        {
            //if neucode labeled, calculate neucode light theoretical AND observed mass! --> better for matching up
            //if carbamidomethylated, add 57 to theoretical mass (already in observed mass...)
            aaIsotopeMassList = new AminoAcidMasses(Sweet.lollipop.carbamidomethylation, Sweet.lollipop.neucode_labeled).AA_Masses;
            List <TopDownHit> td_hits = new List <TopDownHit>();

            List <List <string> > cells = ExcelReader.get_cell_strings(file, true);//This returns the entire sheet except for the header. Each row of cells is one List<string>

            //get ptms on proteoform -- check for mods. IF not in database, make new topdown mod, show Warning message.
            Parallel.ForEach(cells, cellStrings =>
            {
                bool add_topdown_hit = true; //if PTM or accession not found, will not add (show warning)
                if (cellStrings.Count == 24)
                {
                    TopDownResultType tdResultType = (cellStrings[15] == "BioMarker") ? TopDownResultType.Biomarker : ((cellStrings[15] == "Tight Absolute Mass") ? TopDownResultType.TightAbsoluteMass : TopDownResultType.Unknown);
                    if (tdResultType != TopDownResultType.Unknown) //uknown result type!
                    {
                        List <Ptm> ptm_list = new List <Ptm>();    // if nothing gets added, an empty ptmlist is passed to the topdownhit constructor.
                                                                   //N-term modifications
                        if (cellStrings[10].Length > 0)            //N Terminal Modification Code
                        {
                            string[] ptms = cellStrings[10].Split('|');
                            foreach (string ptm in ptms)
                            {
                                int position = Int32.TryParse(cellStrings[5], out int i) ? i : 0;
                                if (position == 0)
                                {
                                    add_topdown_hit = false;
                                    continue;
                                }
                                if (cellStrings[10].Split(':')[1] == "1458")//PSI-MOD 1458 is supposed to be N-terminal acetylation
                                {
                                    ptm_list.Add(new Ptm(position, Sweet.lollipop.theoretical_database.uniprotModifications.Values.SelectMany(m => m).Where(m => m.OriginalId == "N-terminal Acetyl").FirstOrDefault()));
                                }
                                else
                                {
                                    string psimod = ptm.Split(':')[1].Split('@')[0];//The number after the @ is the position in the protein
                                    while (psimod.Length < 5)
                                    {
                                        psimod = "0" + psimod;                      //short part should be the accession number, which is an integer
                                    }
                                    Modification mod = Sweet.lollipop.theoretical_database.uniprotModifications.Values.SelectMany(m => m).Where(m => m.DatabaseReference != null && m.DatabaseReference.ContainsKey("PSI-MOD") && m.DatabaseReference["PSI-MOD"].Contains(psimod)).FirstOrDefault();
                                    if (mod == null)
                                    {
                                        psimod = "MOD:" + psimod;
                                        mod    = Sweet.lollipop.theoretical_database.uniprotModifications.Values.SelectMany(m => m).Where(m => m.DatabaseReference != null && m.DatabaseReference.ContainsKey("PSI-MOD") && m.DatabaseReference["PSI-MOD"].Contains(psimod)).FirstOrDefault();
                                    }
                                    if (mod != null)
                                    {
                                        ptm_list.Add(new Ptm(position, mod));
                                    }
                                    else
                                    {
                                        lock (bad_topdown_ptms)
                                        {
                                            bad_topdown_ptms.Add("PSI-MOD:" + psimod + " at " + position);
                                        }
                                        add_topdown_hit = false;
                                    }
                                }
                            }
                        }
                        //don't have example of c-term modification to write code
                        //other mods
                        if (cellStrings[9].Length > 0)//Modification Codes
                        {
                            string[] ptms = cellStrings[9].Split('|');
                            foreach (string ptm in ptms)
                            {
                                Modification mod = null;
                                string id        = "";
                                if (ptm.Split(':').Length < 2)
                                {
                                    add_topdown_hit = false;
                                    continue;
                                }
                                if (ptm.Split(':')[1].Split('@').Length < 2)
                                {
                                    add_topdown_hit = false;
                                    continue;
                                }
                                int position_after_begin = (Int32.TryParse(ptm.Split(':')[1].Split('@')[1], out int j) ? j : -1) + 1; //one based sequence
                                                                                                                                      //they give position # as from begin site -> want to report in terms of overall sequence #'s
                                                                                                                                      //begin + position from begin - 1 => position in overall sequence
                                if (position_after_begin == 0)
                                {
                                    add_topdown_hit = false;
                                    continue;
                                }
                                int begin = Int32.TryParse(cellStrings[5], out int k) ? k : 0;
                                if (begin == 0)
                                {
                                    add_topdown_hit = false;
                                    continue;
                                }
                                int position = begin + position_after_begin - 1;
                                if (ptm.Split(':')[0] == "RESID")
                                {
                                    string resid = ptm.Split(':')[1].Split('@')[0];//The number after the @ is the position in the protein
                                    while (resid.Length < 4)
                                    {
                                        resid = "0" + resid;                     //short part should be the accession number, which is an integer
                                    }
                                    resid = "AA" + resid;
                                    id    = "RESID:" + resid;
                                    mod   = Sweet.lollipop.theoretical_database.uniprotModifications.Values.SelectMany(m => m).Where(m => m.DatabaseReference != null && m.DatabaseReference.ContainsKey("RESID") && m.DatabaseReference["RESID"].Contains(resid)).FirstOrDefault();
                                }
                                else if (ptm.Split(':')[0] == "PSI-MOD")
                                {
                                    string psimod = ptm.Split(':')[1].Split('@')[0];//The number after the @ is the position in the protein
                                    while (psimod.Length < 5)
                                    {
                                        psimod = "0" + psimod;                      //short part should be the accession number, which is an integer
                                    }
                                    mod = Sweet.lollipop.theoretical_database.uniprotModifications.Values.SelectMany(m => m).Where(m => m.DatabaseReference != null && m.DatabaseReference.ContainsKey("PSI-MOD") && m.DatabaseReference["PSI-MOD"].Contains(psimod)).FirstOrDefault();
                                    if (mod == null)
                                    {
                                        psimod = "MOD:" + psimod;
                                        mod    = Sweet.lollipop.theoretical_database.uniprotModifications.Values.SelectMany(m => m).Where(m => m.DatabaseReference != null && m.DatabaseReference.ContainsKey("PSI-MOD") && m.DatabaseReference["PSI-MOD"].Contains(psimod)).FirstOrDefault();
                                    }
                                    id = "PSI-MOD:" + psimod;
                                }
                                if (mod != null)
                                {
                                    ptm_list.Add(new Ptm(position, mod));
                                }
                                else
                                {
                                    lock (bad_topdown_ptms)
                                    {
                                        bad_topdown_ptms.Add(id + " at " + cellStrings[4][position_after_begin - 1]);
                                    }
                                    add_topdown_hit = false;
                                }
                            }
        /// <summary>
        /// Reads top-down hits from a MetaMorpheus results spreadsheet.
        /// Only rows with exactly 55 cells are considered. A row is turned into a TopDownHit when every
        /// modification in its full sequence is known to the theoretical database, its
        /// "Start and End Residues In Protein" cell can be split into a start and an end token, and the
        /// resulting hit has positive begin, end, masses, scores, scan number and retention time.
        /// Unknown or unparsable modifications are recorded in bad_topdown_ptms and the row is dropped.
        /// </summary>
        /// <param name="file">The input file handed to ExcelReader.get_cell_strings and stored on each hit.</param>
        /// <returns>The hits that passed all checks; order is not deterministic because rows are processed in parallel.</returns>
        public List<TopDownHit> ReadMetamopheusFile(InputFile file)
        {
            //if neucode labeled, calculate neucode light theoretical AND observed mass! --> better for matching up
            //if carbamidomethylated, add 57 to theoretical mass (already in observed mass...)
            aaIsotopeMassList = new AminoAcidMasses(Sweet.lollipop.carbamidomethylation, Sweet.lollipop.neucode_labeled).AA_Masses;
            List<TopDownHit> td_hits = new List<TopDownHit>();

            // Lookup used by PeptideWithSetModifications to resolve the mods written in the full sequence.
            Dictionary<string, Modification> mods = Sweet.lollipop.theoretical_database.all_mods_with_mass.ToDictionary(kv => kv.IdWithMotif, kv => kv);

            // Ids of all UniProt modifications, collected once so each PTM check is a set lookup
            // instead of a scan over every modification for every PTM of every row.
            HashSet<string> uniprotModIds = new HashSet<string>(
                Sweet.lollipop.theoretical_database.uniprotModifications.Values.SelectMany(m => m).Select(m => m.IdWithMotif));

            //This returns the entire sheet except for the header. Each row of cells is one List<string>
            List<List<string>> cells = ExcelReader.get_cell_strings(file, true);

            Parallel.ForEach(cells, cellStrings =>
            {
                // Column layout of the sheet. Only rows with exactly 55 cells (indices 0-54) are processed;
                // the original header note also listed [55]=eValue and [56]=eScore, which such rows do not contain.
                //cellStrings[0]=File Name
                //cellStrings[1]=Scan Number
                //cellStrings[2]=Scan Retention Time
                //cellStrings[3]=Num Experimental Peaks
                //cellStrings[4]=Total Ion Current
                //cellStrings[5]=Precursor Scan Number
                //cellStrings[6]=Precursor Charge
                //cellStrings[7]=Precursor MZ
                //cellStrings[8]=Precursor Mass
                //cellStrings[9]=Score
                //cellStrings[10]=Delta Score
                //cellStrings[11]=Notch
                //cellStrings[12]=Different Peak Matches
                //cellStrings[13]=Base Sequence
                //cellStrings[14]=Full Sequence
                //cellStrings[15]=Essential Sequence
                //cellStrings[16]=PSM Count
                //cellStrings[17]=Mods
                //cellStrings[18]=Mods Chemical Formulas
                //cellStrings[19]=Mods Combined Chemical Formula
                //cellStrings[20]=Num Variable Mods
                //cellStrings[21]=Missed Cleavages
                //cellStrings[22]=Peptide Monoisotopic Mass
                //cellStrings[23]=Mass Diff (Da)
                //cellStrings[24]=Mass Diff (ppm)
                //cellStrings[25]=Protein Accession
                //cellStrings[26]=Protein Name
                //cellStrings[27]=Gene Name
                //cellStrings[28]=Organism Name
                //cellStrings[29]=Intersecting Sequence Variations
                //cellStrings[30]=Identified Sequence Variations
                //cellStrings[31]=Splice Sites
                //cellStrings[32]=Contaminant
                //cellStrings[33]=Decoy
                //cellStrings[34]=Peptide Description
                //cellStrings[35]=Start and End Residues In Protein
                //cellStrings[36]=Previous Amino Acid
                //cellStrings[37]=Next Amino Acid
                //cellStrings[38]=All Scores
                //cellStrings[39]=Theoreticals Searched
                //cellStrings[40]=Decoy/Contaminant/Target
                //cellStrings[41]=Matched Ion Series
                //cellStrings[42]=Matched Ion Mass-To-Charge Ratios
                //cellStrings[43]=Matched Ion Mass Diff (Da)
                //cellStrings[44]=Matched Ion Mass Diff (Ppm)
                //cellStrings[45]=Matched Ion Intensities
                //cellStrings[46]=Matched Ion Counts
                //cellStrings[47]=Localized Scores
                //cellStrings[48]=Improvement Possible
                //cellStrings[49]=Cumulative Target
                //cellStrings[50]=Cumulative Decoy
                //cellStrings[51]=QValue
                //cellStrings[52]=Cumulative Target Notch
                //cellStrings[53]=Cumulative Decoy Notch
                //cellStrings[54]=QValue Notch
                if (cellStrings.Count != 55)
                {
                    return;
                }

                bool add_topdown_hit = true; //set to false when a PTM cannot be resolved; the row is then not added
                List<Ptm> new_ptm_list = new List<Ptm>();

                // PeptideWithSetModifications throws MzLibException for a mod it cannot resolve;
                // that is caught below and reported through bad_topdown_ptms.
                try
                {
                    PeptideWithSetModifications modsIdentifier = new PeptideWithSetModifications(cellStrings[14].Split('|')[0], mods);

                    foreach (KeyValuePair<int, Proteomics.Modification> entry in modsIdentifier.AllModsOneIsNterminus)
                    {
                        if (uniprotModIds.Contains(entry.Value.IdWithMotif))
                        {
                            new_ptm_list.Add(new Ptm(entry.Key, entry.Value));
                        }
                        else
                        {
                            lock (bad_topdown_ptms)
                            {
                                bad_topdown_ptms.Add("Mod Name:" + entry.Value.IdWithMotif + " at " + entry.Key);
                            }
                            add_topdown_hit = false;
                        }
                    }
                }
                catch (MzLibUtil.MzLibException)
                {
                    // The error is somewhere in the sequence; report the file name and scan so it can be found.
                    lock (bad_topdown_ptms)
                    {
                        bad_topdown_ptms.Add("Bad mod at " + cellStrings[0] + " scan " + cellStrings[1]);
                    }
                    add_topdown_hit = false;
                }

                if (!add_topdown_hit || cellStrings[35].Length == 0)
                {
                    return;
                }

                // The first '|'-separated entry is split on spaces; token 0 holds the start after '[' and
                // token 2 holds the end before ']'. A cell that does not have that shape is skipped instead
                // of throwing out of the parallel loop and aborting the whole read.
                string[] rangeTokens = cellStrings[35].Split('|')[0].Split(' ');
                if (rangeTokens.Length < 3)
                {
                    return;
                }
                string[] startParts = rangeTokens[0].Split('[');
                if (startParts.Length < 2)
                {
                    return;
                }
                string startResidues = startParts[1];
                string endResidues = rangeTokens[2].Split(']')[0];

                // Unparsable numbers become 0 and are rejected by the positivity check below.
                int begin = Int32.TryParse(startResidues, out int parsedBegin) ? parsedBegin : 0;
                int end = Int32.TryParse(endResidues, out int parsedEnd) ? parsedEnd : 0;
                double precursorMass = Double.TryParse(cellStrings[8], out double parsedPrecursorMass) ? parsedPrecursorMass : 0;
                double peptideMonoisotopicMass = Double.TryParse(cellStrings[22], out double parsedPeptideMass) ? parsedPeptideMass : 0;
                int scanNumber = Int32.TryParse(cellStrings[1], out int parsedScan) ? parsedScan : 0;
                double scanRetentionTime = Double.TryParse(cellStrings[2], out double parsedRt) ? parsedRt : 0;

                // The precursor mass is passed twice and the last argument is min_score_td + 1, as in the original call.
                TopDownHit td_hit = new TopDownHit(aaIsotopeMassList, file, TopDownResultType.TightAbsoluteMass, cellStrings[25], cellStrings[14], cellStrings[25], cellStrings[26], cellStrings[13],
                                                   begin, end, new_ptm_list, precursorMass, peptideMonoisotopicMass,
                                                   scanNumber, scanRetentionTime, cellStrings[0].Split('.')[0], precursorMass, Sweet.lollipop.min_score_td + 1);

                if (td_hit.begin > 0 && td_hit.end > 0 && td_hit.theoretical_mass > 0 && td_hit.pscore > 0 && td_hit.reported_mass > 0 && td_hit.score > 0 &&
                    td_hit.ms2ScanNumber > 0 && td_hit.ms2_retention_time > 0)
                {
                    lock (td_hits)
                    {
                        td_hits.Add(td_hit);
                    }
                }
            });

            return td_hits;
        }