/// <summary>
/// Prunes <paramref name="candidates"/> in place. Each PSM is first reset to fusion type TS and is then
/// removed when <c>IsTooMessy</c> flags it, or when <c>GeneratePossibleSequences</c> succeeds but
/// <c>PossibleCandidate</c> rejects it. Progress is reported through the worker after every PSM.
/// </summary>
/// <param name="candidates">PSMs to filter; entries are removed from this list.</param>
/// <param name="theoreticalProteins">Protein database handed to <c>ReadProteins</c> before filtering starts.</param>
/// <param name="error_message">Receives the concatenated messages produced by the individual checks.</param>
public void FindAmbiguity(List<PSM> candidates, List<TheoreticalProtein> theoreticalProteins, out string error_message)
{
    error_message = "";
    ReadProteins(theoreticalProteins);
    ReadMassDictionary();

    // Index-based loop because entries are removed while the list is being walked.
    for (int i = 0; i < candidates.Count; i++)
    {
        PSM psm = candidates[i];
        psm.fusionType = FusionCandidate.FusionType.TS; //for some maddening reason, this is not arriving here as trans, but instead translated

        bool discard;
        if (IsTooMessy(psm, out string messyError)) //an explosion of combinations (more than 3 consecutive missing peaks) produces tens of thousands of sequences and causes hanging
        {
            discard = true;
            error_message += messyError;
        }
        else
        {
            // NOTE(review): when GeneratePossibleSequences returns false (too many sequences) the PSM is
            // kept and PossibleCandidate is never run - confirm that this is intended.
            discard = GeneratePossibleSequences(psm, out string sequenceError) && !PossibleCandidate(psm);
            error_message += sequenceError;
        }

        if (discard)
        {
            // Remove by position: no linear search, and exactly this entry is the one dropped.
            candidates.RemoveAt(i);
            i--;
        }

        // After a removal i may be -1 and the list may be empty. The unguarded ratio would then be
        // negative or -Infinity/NaN, and Convert.ToInt16 throws OverflowException for the latter.
        int remaining = candidates.Count;
        int percent = remaining == 0
            ? 100
            : Convert.ToInt16((Convert.ToDouble(Math.Max(i, 0)) / Convert.ToDouble(remaining)) * 100);
        this.worker.ReportProgress(percent);
    }
}
/// <summary>
/// Reads the two tab-separated search result files, builds one <c>InitialID</c> per data row and pairs
/// the rows of both files that share a scan number into <c>PSM</c> objects.
/// </summary>
/// <param name="nFileName">Path of the result file whose rows become the first (N) half of each PSM.</param>
/// <param name="cFileName">Path of the result file whose rows become the second (C) half of each PSM.</param>
/// <param name="error_message">
/// Receives messages from the <c>InitialID</c> constructor, plus a description when the first file is
/// empty or a required column is missing from its header.
/// </param>
/// <returns>One PSM per scan number found in both files; empty when the header cannot be used.</returns>
public List<PSM> ImportPSMs(string nFileName, string cFileName, out string error_message)
{
    error_message = "";
    List<PSM> MMOutput = new List<PSM>();
    string[] nInput = File.ReadAllLines(nFileName);
    string[] cInput = File.ReadAllLines(cFileName);

    if (nInput.Length == 0)
    {
        error_message += "The file " + nFileName + " is empty; no PSMs were imported. ";
        return MMOutput;
    }

    string[] header = nInput[0].Split('\t'); //assume both files have identical headers
    List<string> missingColumns = new List<string>();

    // Returns the position of a header cell (last occurrence wins, as in the original scan) and
    // remembers the name when the column is absent.
    int FindColumn(string columnName)
    {
        int position = Array.LastIndexOf(header, columnName);
        if (position < 0)
        {
            missingColumns.Add(columnName);
        }
        return position;
    }

    int fileNameIndex = FindColumn("File Name");
    int scanNumberIndex = FindColumn("Scan Number");
    int scanPrecursorMassIndex = FindColumn("Precursor Mass");
    int proteinAccessionIndex = FindColumn("Protein Accession");
    int fullSequenceIndex = FindColumn("Base Sequence"); //"FullSequence" should be used for the detection of FPs containing PTMs and for missed cleave/nonspecific peptides containing PTMs
    int matchedIonsIndex = FindColumn("Matched Ion Masses");
    int scoreIndex = FindColumn("Score");

    if (missingColumns.Count > 0)
    {
        // Without this check the first data row would fail with a bare IndexOutOfRangeException.
        error_message += "Missing column(s) in " + nFileName + ": " + string.Join(", ", missingColumns) + ". ";
        return MMOutput;
    }

    // Converts every data row (everything after the header) into an InitialID.
    List<InitialID> ReadAssignments(string[] rows, out string rowErrors)
    {
        rowErrors = "";
        List<InitialID> ids = new List<InitialID>();
        for (int r = 1; r < rows.Length; r++)
        {
            if (string.IsNullOrWhiteSpace(rows[r]))
            {
                continue; // tolerate blank lines instead of failing on them
            }
            string[] fields = rows[r].Split('\t');
            InitialID id = new InitialID(fields[fileNameIndex], Convert.ToInt32(fields[scanNumberIndex]), Convert.ToDouble(fields[scanPrecursorMassIndex]), fields[proteinAccessionIndex], fields[fullSequenceIndex], fields[matchedIonsIndex], fields[scoreIndex], out string rowError);
            rowErrors += rowError;
            ids.Add(id);
        }
        return ids;
    }

    List<InitialID> nAssignment = ReadAssignments(nInput, out string nErrors);
    error_message += nErrors;
    List<InitialID> cAssignment = ReadAssignments(cInput, out string cErrors);
    error_message += cErrors;

    //sort by scan number
    List<InitialID> nAssignmentSorted = nAssignment.OrderBy(o => o.getScan()).ToList();
    List<InitialID> cAssignmentSorted = cAssignment.OrderBy(o => o.getScan()).ToList();

    // Walk both sorted lists in lock-step and keep only scans present in both files.
    // NOTE(review): rows are paired on scan number alone; if one result file can hold several raw
    // files with overlapping scan numbers, the file name should take part in the pairing - confirm.
    int nPos = 0;
    int cPos = 0;
    while (nPos < nAssignmentSorted.Count && cPos < cAssignmentSorted.Count)
    {
        this.worker.ReportProgress(Convert.ToInt16((Convert.ToDouble(nPos) / Convert.ToDouble(nAssignmentSorted.Count)) * 100));

        InitialID nId = nAssignmentSorted[nPos];
        InitialID cId = cAssignmentSorted[cPos];
        if (nId.getScan().Equals(cId.getScan()))
        {
            // The file name must come from the matched (sorted) entry; indexing the unsorted list
            // could return the file of an unrelated row.
            MMOutput.Add(new PSM(nId.getFile(), nId.getScan(), nId.getExpMass(), nId, cId));
            nPos++;
            cPos++;
        }
        else if (nId.getScan() < cId.getScan()) //no information was found for the b scan using y ions, so skip it
        {
            nPos++;
        }
        else //no information was found for the y scan using b ions, so skip it
        {
            cPos++;
        }
    }
    return MMOutput;
}
//This method was originally written recursively, but large peptides resulted in stack overflow
//exceptions; the recursive calls were removed and each call now evaluates exactly one crop.
/// <summary>
/// Workhorse of SpliceFragments. Crops <paramref name="B"/> and <paramref name="Y"/> with <c>IonCrop</c>,
/// adds up the theoretical mass of the two fragments (minus one water, plus <c>fixedModMass</c> and the
/// qualifying PTM masses) and, when that mass falls inside the accepted window around the experimental
/// mass, records the concatenated sequence as a fusion candidate of <paramref name="psm"/> unless the
/// same sequence is already present.
/// </summary>
/// <param name="B">Sequence used for the b-ion (N-terminal) fragment.</param>
/// <param name="Y">Sequence used for the y-ion (C-terminal) fragment.</param>
/// <param name="psm">Supplies the experimental mass and PTMs, and receives new fusion candidates.</param>
/// <param name="BIndex">Passed through to <c>IonCrop</c> for the b fragment.</param>
/// <param name="YIndex">Passed through to <c>IonCrop</c> for the y fragment.</param>
/// <param name="error_message">Receives the messages returned by the crop and mass calculations.</param>
public void MassMatch(string B, string Y, PSM psm, int BIndex, int YIndex, out string error_message)
{
    error_message = "";
    test = psm.getScan().ToString();
    double ExperimentalMass = psm.getExpMass();

    // B fragment: sequence whose mass is smaller than the experimental mass, obtained by cleaving C-terminal residues.
    string BFrag = IonCrop(B, ExperimentalMass, BIndex, IonType.b, false, out string bCropError);
    // Y fragment: the same, obtained by cleaving N-terminal residues.
    string YFrag = IonCrop(Y, ExperimentalMass, YIndex, IonType.y, false, out string yCropError);

    //water added once in b and once in y
    double TheoreticalMass = MassCalculator.MonoIsoptopicMass(BFrag, out string bMassError) + MassCalculator.MonoIsoptopicMass(YFrag, out string yMassError) - Constants.WATER_MONOISOTOPIC_MASS + fixedModMass;
    error_message += yCropError + bCropError + bMassError + yMassError;

    //add PTM masses
    foreach (PTM ptm in psm.getNInfo().getPTMs())
    {
        if (ptm.index < BFrag.Length)
        {
            TheoreticalMass += ptm.mass;
        }
    }
    foreach (PTM ptm in psm.getCInfo().getPTMs())
    {
        // NOTE(review): with a 0-based index this excludes a PTM on the first residue of YFrag - confirm the index convention.
        if (Y.Length - ptm.index < YFrag.Length)
        {
            TheoreticalMass += ptm.mass;
        }
    }

    // Nothing is recorded when either fragment has fewer residues than ionsUsedMassVer.
    if (YFrag.Length < ionsUsedMassVer || BFrag.Length < ionsUsedMassVer)
    {
        return;
    }

    bool massMatches;
    if (FalsePositives.generateDecoys)
    {
        // Decoy mode accepts two offset windows instead of the true precursor window.
        bool inLowerWindow = TheoreticalMass > ExperimentalMass - 9.5 && TheoreticalMass < ExperimentalMass - 4.5;
        bool inUpperWindow = TheoreticalMass > ExperimentalMass + 5.5 && TheoreticalMass < ExperimentalMass + 7.5;
        massMatches = inLowerWindow || inUpperWindow;
    }
    else
    {
        // NOTE(review): if precursorMassTolerancePpm is an integral type this division truncates to 0
        // and the window is empty - confirm it is a floating-point value.
        massMatches = TheoreticalMass > ExperimentalMass * (1 - FalsePositives.precursorMassTolerancePpm / 1000000)
            && TheoreticalMass < ExperimentalMass * (1 + FalsePositives.precursorMassTolerancePpm / 1000000);
    }
    if (!massMatches)
    {
        return;
    }

    string fusedSequence = BFrag + YFrag;
    foreach (FusionCandidate existing in psm.getFusionCandidates())
    {
        if (fusedSequence.Equals(existing.seq)) //that sequence was already recorded for this psm
        {
            return;
        }
    }
    psm.addFusionCandidate(new FusionCandidate(fusedSequence));
}
/// <summary>
/// Scores every fusion candidate of <paramref name="psm"/> by its number of found ions and prunes the
/// candidate list in place: only candidates that tie for the best score survive. Candidates with more
/// than <c>maxMissingConsecutivePeaks</c> consecutive missing ions, and repeats of an already kept
/// sequence, are removed.
/// </summary>
/// <param name="psm">PSM whose fusion candidate list is scored and pruned.</param>
/// <param name="error_message">Receives the messages produced by <c>findIons</c>.</param>
/// <returns>true when no candidate survives (too messy for confident identification); otherwise false.</returns>
public bool IsTooMessy(PSM psm, out string error_message)
{
    error_message = "";
    var candidates = psm.getFusionCandidates();
    // Sequences of all candidates currently kept. Tied candidates are recorded too, so a later
    // duplicate of any kept sequence is recognised as a repeat (previously only the first best was tracked).
    HashSet<string> keptSequences = new HashSet<string>();
    int currentBestScore = 0;

    for (int index = 0; index < candidates.Count; index++)
    {
        FusionCandidate fc = candidates[index];
        findIons(fc, psm, out string ionError);
        error_message += ionError;

        bool badID = false;
        int consecutiveMissedCounter = 0;
        int totalHitCounter = 0;
        foreach (bool ionWasFound in fc.getFoundIons())
        {
            if (consecutiveMissedCounter > maxMissingConsecutivePeaks) //too many permutations possible because of an unmapped region
            {
                badID = true;
                break; // nothing after this point was counted by the original loop either
            }
            if (ionWasFound)
            {
                totalHitCounter++;
                consecutiveMissedCounter = 0; //only care about consecutive misses
            }
            else
            {
                consecutiveMissedCounter++;
            }
        }

        if (!badID && totalHitCounter > currentBestScore)
        {
            // New best: everything kept so far was worse, so drop it and restart the kept set.
            while (index > 0)
            {
                candidates.RemoveAt(0);
                index--;
            }
            currentBestScore = totalHitCounter;
            keptSequences.Clear();
            keptSequences.Add(fc.seq);
        }
        else if (badID || totalHitCounter < currentBestScore || !keptSequences.Add(fc.seq))
        {
            // Bad, worse than the best, or a repeat (Add returns false for a known sequence).
            candidates.RemoveAt(index);
            index--;
        }
    }

    // If nothing is left, this might be a fusion peptide, but the spectrum gives no valuable information.
    return candidates.Count == 0;
}
/// <summary>
/// Decides whether <paramref name="psm"/> still has a usable fusion candidate and trims its candidate
/// list in place. A candidate whose full sequence is found in the database is marked as translated (TL)
/// and becomes the only candidate. Otherwise candidates that are not viable are removed and the
/// remaining ones are typed; the PSM takes the lowest ("golf scoring") type seen.
/// </summary>
/// <param name="psm">PSM whose fusion candidates are examined and modified.</param>
/// <returns>
/// true when at least one candidate remains (translated or viable); false when no candidate is left,
/// i.e. the sequence could not be made with the database.
/// </returns>
public bool PossibleCandidate(PSM psm)
{
    foundSequences = new Dictionary<string, List<TheoreticalProtein>>(); //used for finding longer fragments than those previously identified. Also populates ParentInfo
    notFoundSequences = new HashSet<string>(); //don't bother looking for these fragments, since we know they don't exist. Good for multiple homologous putative fusion peptide sequences
    var candidates = psm.getFusionCandidates();

    // Reduces the candidate list to exactly one entry.
    void KeepOnly(FusionCandidate survivor)
    {
        candidates.Clear();
        candidates.Add(survivor);
    }

    //conduct an initial search of each candidate's full sequence to identify any that are translated
    for (int i = 0; i < candidates.Count; i++)
    {
        FusionCandidate candidate = candidates[i];
        string novelSeq = candidate.seq;
        //check really quick to see if the whole thing exists as is. Terminal C was arbitrarily chosen
        if (!foundParent(novelSeq, ParentInfo.terminal.C, candidate, false))
        {
            continue;
        }

        foreach (ParentInfo info in candidate.parentInfo)
        {
            foreach (TheoreticalProtein protein in info.theoreticalProteins)
            {
                // One ordinal search serves as both the containment test and the position lookup.
                int start = protein.seq.IndexOf(novelSeq, StringComparison.Ordinal);
                if (start >= 0) //if translated
                {
                    candidate.translatedParents.Add(new TranslatedParent(protein.id, protein.seq, start, novelSeq.Length));
                }
            }
        }
        candidate.fusionType = FusionCandidate.FusionType.TL;
        psm.fusionType = candidate.fusionType;
        KeepOnly(candidate);
        return true;
    }

    for (int i = 0; i < candidates.Count; i++)
    {
        FusionCandidate candidate = candidates[i];
        if (!isViable(candidate)) //the parent fragments cannot be found with the given database
        {
            candidates.RemoveAt(i);
            i--;
            continue;
        }

        DetermineFusionCandidateType(candidate); //cis, trans?
        if (psm.fusionType > candidate.fusionType) //if more likely than previous types, change the psm type (golf scoring)
        {
            psm.fusionType = candidate.fusionType;
        }
        if (psm.fusionType.Equals(FusionCandidate.FusionType.TL))
        {
            //a sequence present in the database is likely correct, so the parents of the other sequences are not worth identifying
            KeepOnly(candidate);
            return true;
        }
    }

    //if no candidates are left, we couldn't make the sequence with the database and the caller discards it
    return candidates.Count != 0;
}
/// <summary>
/// Expands the fusion candidates of <paramref name="psm"/> with alternative sequences. For every found
/// ion of every candidate, the stretch of residues since the previous found ion is replaced by each
/// fragment of the mass dictionary whose mass lies within <c>productMassToleranceDa</c> of that
/// stretch. Sequences not seen before are appended to the candidate list (and are themselves expanded).
/// </summary>
/// <param name="psm">PSM whose fusion candidate list is read and extended.</param>
/// <param name="error_message">Receives the messages produced by <c>findIons</c> and the mass calculation.</param>
/// <returns>
/// false as soon as the candidate count is found to exceed <c>maxNumPossibleSequences</c> (checked
/// before the first pass and after every pass); true once every ion position has been processed.
/// </returns>
public bool GeneratePossibleSequences(PSM psm, out string error_message)
{
    error_message = "";
    var candidates = psm.getFusionCandidates();

    // Every sequence already present; a set makes the duplicate check O(1).
    HashSet<string> knownSequences = new HashSet<string>();
    foreach (FusionCandidate fusionCandidate in candidates)
    {
        findIons(fusionCandidate, psm, out string ionError); //populate the foundIons array
        error_message += ionError;
        knownSequences.Add(fusionCandidate.seq);
    }

    // Appends the dictionary entries stored under keys[keyIndex] when that key lies strictly within
    // the tolerance of targetMass; returns false when it does not, which ends the scan in that direction.
    bool CollectIfClose(int keyIndex, double targetMass, List<string> fragments)
    {
        double dictionaryMass = keys[keyIndex];
        bool close = dictionaryMass > targetMass - productMassToleranceDa && dictionaryMass < targetMass + productMassToleranceDa;
        if (!close)
        {
            return false;
        }
        if (massDict.TryGetValue(dictionaryMass, out string[] matches))
        {
            fragments.AddRange(matches);
        }
        return true;
    }

    // All dictionary fragments whose mass is within tolerance of targetMass: first the keys below
    // the binary-search position (descending), then the keys from that position upwards.
    List<string> FragmentsNearMass(double targetMass)
    {
        List<string> fragments = new List<string>();
        int position = Array.BinarySearch(keys, targetMass);
        if (position < 0)
        {
            position = ~position;
        }
        for (int down = position - 1; down >= 0; down--)
        {
            if (!CollectIfClose(down, targetMass, fragments))
            {
                break;
            }
        }
        for (int up = position; up < keys.Length; up++)
        {
            if (!CollectIfClose(up, targetMass, fragments))
            {
                break;
            }
        }
        return fragments;
    }

    //if there are more than a set number of possible sequences, this is junk and we are not searching them all
    if (candidates.Count > maxNumPossibleSequences)
    {
        return false;
    }

    // NOTE(review): the limit is only checked between passes, so a single pass can still add far more
    // than maxNumPossibleSequences candidates before the method gives up.
    for (int globalIndex = 0; ; globalIndex++)
    {
        bool anyCandidateReachesIndex = false;

        // Count is re-read every iteration on purpose: candidates added during this pass are processed in it.
        for (int fc = 0; fc < candidates.Count; fc++)
        {
            FusionCandidate fusionCandidate = candidates[fc];
            bool[] ionFound = fusionCandidate.getFoundIons();
            if (ionFound.Length <= globalIndex)
            {
                continue; // this candidate is shorter than the current position
            }
            anyCandidateReachesIndex = true;
            if (!ionFound[globalIndex]) //only look for ambiguity if a peak was found to provide the stop point
            {
                continue;
            }

            // Most recent found ion before this one (start point, exclusive); -1 when there is none.
            int mostRecent = globalIndex - 1;
            while (mostRecent >= 0 && !ionFound[mostRecent])
            {
                mostRecent--;
            }

            string fusionSeq = fusionCandidate.seq;
            string ambiguousFrag = fusionSeq.Substring(mostRecent + 1, globalIndex - mostRecent);
            double fragmentMass = MassCalculator.MonoIsoptopicMass(ambiguousFrag, out string massError);
            error_message += massError;

            List<string> replacements = FragmentsNearMass(fragmentMass);
            if (replacements.Count == 0)
            {
                continue;
            }

            // The flanking parts do not depend on the replacement, so build them once.
            string nTermSeq = fusionSeq.Substring(0, mostRecent + 1);
            string cTermSeq = fusionSeq.Substring(globalIndex + 1);
            foreach (string replacement in replacements)
            {
                FusionCandidate newfc = new FusionCandidate(nTermSeq + replacement + cTermSeq);
                if (knownSequences.Add(newfc.seq)) //if new FP sequence, add it
                {
                    findIons(newfc, psm, out string newIonError);
                    error_message += newIonError;
                    candidates.Add(newfc);
                }
            }
        }

        if (candidates.Count > maxNumPossibleSequences)
        {
            return false;
        }
        if (!anyCandidateReachesIndex) //every candidate is shorter than the current position: done
        {
            return true;
        }
    }
}
/// <summary>
/// Uses the ion hits to record where peaks were found and where there is ambiguity. Rebuilds the
/// found-ion array of <paramref name="fusionCandidate"/> via <c>makeFoundIons</c> and, for every ion type
/// listed in <c>ionsUsed</c>, sets an entry to true when an experimental peak lies strictly within
/// <c>productMassToleranceDa</c> of the theoretical fragment mass. Prefix ions (b, c) are compared with
/// the peaks from <c>psm.getNInfo()</c>, suffix ions (y, zdot) with the peaks from <c>psm.getCInfo()</c>.
/// </summary>
/// <param name="fusionCandidate">Candidate whose found-ion array is filled in.</param>
/// <param name="psm">Source of the experimental peaks and PTMs.</param>
/// <param name="error_message">Receives the messages returned by the mass calculations.</param>
public static void findIons(FusionCandidate fusionCandidate, PSM psm, out string error_message)
{
    error_message = "";
    double[] nPeaks = psm.getNInfo().getPeakHits(); //get peaks
    double[] cPeaks = psm.getCInfo().getPeakHits();
    fusionCandidate.makeFoundIons();
    string candSeq = fusionCandidate.seq;
    bool[] foundIons = fusionCandidate.getFoundIons();
    int lastSlot = foundIons.Length - 1;

    // True when any experimental peak lies strictly within the product tolerance of the given mass.
    bool HasPeakNear(double[] peaks, double theoreticalMass)
    {
        foreach (double expPeak in peaks)
        {
            if (expPeak > theoreticalMass - productMassToleranceDa && expPeak < theoreticalMass + productMassToleranceDa)
            {
                return true;
            }
        }
        return false;
    }

    // Adds, in list order, the mass of every N-side PTM whose index is at or below lastIndex.
    double AddNSidePtms(double mass, int lastIndex)
    {
        foreach (PTM ptm in psm.getNInfo().getPTMs())
        {
            if (ptm.index <= lastIndex)
            {
                mass += ptm.mass;
            }
        }
        return mass;
    }

    // Adds, in list order, the mass of every C-side PTM whose index is at or above firstIndex.
    double AddCSidePtms(double mass, int firstIndex)
    {
        foreach (PTM ptm in psm.getCInfo().getPTMs())
        {
            if (ptm.index >= firstIndex)
            {
                mass += ptm.mass;
            }
        }
        return mass;
    }

    //find which aa have peaks
    for (int i = 0; i < lastSlot; i++)
    {
        int prefixLength = i + 1;
        int suffixStart = candSeq.Length - 1 - i;
        // NOTE(review): the C-side PTM threshold sits one residue before the suffix start - confirm the index convention.
        int cSidePtmThreshold = candSeq.Length - 2 - i;
        int suffixSlot = lastSlot - 1 - i; // slot used for suffix ions (array length - 2 - i)

        //B IONS//
        if (ionsUsed.Contains(IonType.b))
        {
            double bTheoMass = MassCalculator.MonoIsoptopicMass(candSeq.Substring(0, prefixLength), out string bError) - Constants.WATER_MONOISOTOPIC_MASS;
            error_message += bError;
            if (HasPeakNear(nPeaks, AddNSidePtms(bTheoMass, i)))
            {
                foundIons[i] = true;
            }
        }

        //Y IONS//
        if (ionsUsed.Contains(IonType.y))
        {
            double yTheoMass = MassCalculator.MonoIsoptopicMass(candSeq.Substring(suffixStart), out string yError);
            error_message += yError;
            if (HasPeakNear(cPeaks, AddCSidePtms(yTheoMass, cSidePtmThreshold)))
            {
                foundIons[suffixSlot] = true;
            }
        }

        //C IONS//
        if (ionsUsed.Contains(IonType.c))
        {
            double cTheoMass = MassCalculator.MonoIsoptopicMass(candSeq.Substring(0, prefixLength), out string cError) - Constants.WATER_MONOISOTOPIC_MASS + Constants.nitrogenMonoisotopicMass + 3 * Constants.hydrogenMonoisotopicMass;
            error_message += cError;
            if (HasPeakNear(nPeaks, AddNSidePtms(cTheoMass, i)))
            {
                foundIons[i] = true;
            }
        }

        //ZDOT IONS//
        if (ionsUsed.Contains(IonType.zdot))
        {
            double zdotTheoMass = MassCalculator.MonoIsoptopicMass(candSeq.Substring(suffixStart), out string zdotError) - Constants.nitrogenMonoisotopicMass - 2 * Constants.hydrogenMonoisotopicMass;
            error_message += zdotError;
            if (HasPeakNear(cPeaks, AddCSidePtms(zdotTheoMass, cSidePtmThreshold)))
            {
                foundIons[suffixSlot] = true;
            }
        }
    }

    //A|B|C|D|E|F|K| where the whole peptide peak is always placed arbitrarily at the c term
    foundIons[lastSlot] = true;
}