public void TestSplitPrefixAndSuffix(string sequence, string expectedPrimarySeq, string expectedPrefix, string expectedSuffix)
{
    // Split the sequence, then compare each of the three outputs against the expected value
    PeptideCleavageStateCalculator.SplitPrefixAndSuffixFromSequence(
        sequence,
        out var actualPrimarySeq,
        out var actualPrefix,
        out var actualSuffix);

    Assert.AreEqual(expectedPrimarySeq, actualPrimarySeq);
    Assert.AreEqual(expectedPrefix, actualPrefix);
    Assert.AreEqual(expectedSuffix, actualSuffix);
}
/// <summary>
/// Constructor: splits the peptide into primary sequence, prefix, and suffix, then stores the clean sequence
/// </summary>
/// <param name="peptideSeqWithModsAndContext">Peptide sequence handed to SplitPrefixAndSuffixFromSequence</param>
/// <param name="peptideCleanSeq">Value stored in CleanSequence</param>
/// <exception cref="Exception">Thrown when SplitPrefixAndSuffixFromSequence reports failure</exception>
public FirstHitInfo(string peptideSeqWithModsAndContext, string peptideCleanSeq)
{
    var splitSucceeded = PeptideCleavageStateCalculator.SplitPrefixAndSuffixFromSequence(
        peptideSeqWithModsAndContext,
        out mPrimarySequence,
        out mPrefix,
        out mSuffix);

    if (!splitSucceeded)
    {
        throw new Exception("Unable to split the prefix and suffix from peptide " + peptideSeqWithModsAndContext);
    }

    CleanSequence = peptideCleanSeq;
}
/// <summary>
/// Pull sequences out of AScore results, clean them, and output them to a peptide sequence list file
/// </summary>
/// <remarks>
/// The first non-blank line is treated as the header line.
/// Every data line contributes one line to the output file (duplicates included), while
/// mDistinctPeptides is set to the number of distinct clean sequences.
/// If VerifyRequiredColumns rejects the header, the method returns without updating mDistinctPeptides.
/// </remarks>
private void CreatePeptideList()
{
    const string bestSequenceColumn = "BestSequence";

    var columnMap = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);

    // Only membership matters when counting distinct peptides, so use a set instead of a dictionary with unused values
    var distinctPeptides = new HashSet<string>();

    // Resolved once, after the header line has been validated
    var bestSequenceIndex = -1;

    // Write out a list of peptides for clsPeptideToProteinMapEngine
    using var aScoreReader = new StreamReader(new FileStream(mAScoreResultsFilePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite));
    using var peptideWriter = new StreamWriter(new FileStream(mPeptideListFilePath, FileMode.Create, FileAccess.Write, FileShare.Read));

    while (!aScoreReader.EndOfStream)
    {
        var dataLine = aScoreReader.ReadLine();

        if (string.IsNullOrWhiteSpace(dataLine))
        {
            continue;
        }

        var columns = dataLine.Split('\t');

        if (columnMap.Count == 0)
        {
            // Assume the first line is column names
            for (var i = 0; i < columns.Length; ++i)
            {
                columnMap.Add(columns[i], i);
            }

            var requiredColumns = new List<string> { bestSequenceColumn };

            if (!VerifyRequiredColumns(requiredColumns, columnMap, nameof(CreatePeptideList), mAScoreResultsFilePath))
            {
                return;
            }

            bestSequenceIndex = columnMap[bestSequenceColumn];
            continue;
        }

        var sequence = columns[bestSequenceIndex];
        var cleanSequence = PeptideCleavageStateCalculator.ExtractCleanSequenceFromSequenceWithMods(sequence, true);

        // Add is a no-op when the sequence is already present
        distinctPeptides.Add(cleanSequence);

        peptideWriter.WriteLine(cleanSequence);
    }

    mDistinctPeptides = distinctPeptides.Count;
}
/// <summary>
/// Reorder the leading groups of a peptide whose primary sequence matches RegexFindItraq (an N-terminal iTraq mod)
/// </summary>
/// <param name="strPeptide">Peptide sequence to examine</param>
/// <returns>
/// The rebuilt peptide (prefix, ".", regex groups 2, 1, and 3, ".", suffix) when both the split and the regex match succeed;
/// otherwise the unchanged input
/// </returns>
private static string CleanupPeptide(string strPeptide)
{
    var splitSucceeded = PeptideCleavageStateCalculator.SplitPrefixAndSuffixFromSequence(
        strPeptide, out var primarySequence, out var prefix, out var suffix);

    if (!splitSucceeded)
    {
        return strPeptide;
    }

    // Look for an N-terminal iTraq mod
    var itraqMatch = RegexFindItraq.Match(primarySequence);

    if (!itraqMatch.Success)
    {
        return strPeptide;
    }

    // Groups 1 and 2 trade places; group 3 stays at the end
    var reorderedSequence = itraqMatch.Groups[2].Value + itraqMatch.Groups[1].Value + itraqMatch.Groups[3].Value;

    return prefix + "." + reorderedSequence + "." + suffix;
}
// Store a new PeptideCleavageStateCalculator instance in mCleavageStateCalculator
public void Init() => mCleavageStateCalculator = new PeptideCleavageStateCalculator();
/// <summary>
/// Merge the AScore results with the peptide-to-protein mapping and write a single combined results file
/// </summary>
/// <remarks>
/// The first non-blank line of the AScore file is treated as the header; the output header is that line plus
/// ProteinName, (optionally) Description, ProteinCount, Residue, and Position.
/// A peptide missing from mPeptideToProteinMap is written once via the short WriteCombinedLine overload and tallied in mPeptidesNotFound.
/// A mapped peptide is written once per protein per '*' in its primary sequence, or once per protein when it has no '*'.
/// </remarks>
private void CombineAScoreAndProteinData()
{
    // Second pass over the AScore results
    using var aScoreReader = new StreamReader(new FileStream(mAScoreResultsFilePath, FileMode.Open, FileAccess.Read, FileShare.Read));
    using var mappedWriter = new StreamWriter(new FileStream(mMappingResultsFilePath, FileMode.Create, FileAccess.Write, FileShare.Read));

    var columnMapAScore = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);

    // Keep going for as long as there is something left to read
    while (!aScoreReader.EndOfStream)
    {
        var dataLine = aScoreReader.ReadLine();

        if (string.IsNullOrWhiteSpace(dataLine))
        {
            continue;
        }

        var columns = dataLine.Split('\t');

        if (columnMapAScore.Count == 0)
        {
            // Header line: remember where each column lives
            for (var columnIndex = 0; columnIndex < columns.Length; columnIndex++)
            {
                columnMapAScore.Add(columns[columnIndex], columnIndex);
            }

            var requiredColumns = new List<string> { "BestSequence" };

            if (!VerifyRequiredColumns(requiredColumns, columnMapAScore, "CombineAScoreAndProteinData", mAScoreResultsFilePath))
            {
                return;
            }

            // Output header = input header followed by the protein columns
            var outputFileHeaders = new List<string>(columns) { "ProteinName" };

            // The description column is only present when descriptions are being output
            if (mOutputProteinDescriptions)
            {
                outputFileHeaders.Add("Description");
            }

            outputFileHeaders.Add("ProteinCount");
            outputFileHeaders.Add("Residue");
            outputFileHeaders.Add("Position");

            mappedWriter.WriteLine(string.Join("\t", outputFileHeaders));
            continue;
        }

        var sequence = columns[columnMapAScore["BestSequence"]];
        var cleanSequence = PeptideCleavageStateCalculator.ExtractCleanSequenceFromSequenceWithMods(sequence, true);

        mTotalPeptides++;

        if (!mPeptideToProteinMap.ContainsKey(cleanSequence))
        {
            // No protein mapping for this peptide
            WriteCombinedLine(mappedWriter, dataLine);

            if (!mPeptidesNotFound.ContainsKey(cleanSequence))
            {
                mPeptidesNotFound.Add(cleanSequence, 0);
            }

            mPeptidesNotFound[cleanSequence]++;
            mTotalPeptidesNotFound++;
            continue;
        }

        PeptideCleavageStateCalculator.SplitPrefixAndSuffixFromSequence(sequence, out var noPrefixSequence, out _, out _);

        // Indices (within the primary sequence) of every '*' symbol
        var modSymbolIndices = new List<int>();

        for (var charIndex = 0; charIndex < noPrefixSequence.Length; charIndex++)
        {
            if (noPrefixSequence[charIndex] == '*')
            {
                modSymbolIndices.Add(charIndex);
            }
        }

        var proteinMatches = mPeptideToProteinMap[cleanSequence];

        // Number of proteins this peptide occurs in
        var proteinCount = proteinMatches.Count;

        foreach (var match in proteinMatches)
        {
            var proteinName = match.proteinName;

            string proteinDescription;

            if (mOutputProteinDescriptions)
            {
                proteinDescription = mProteinDescriptions[match.proteinName];
            }
            else
            {
                proteinDescription = string.Empty;
            }

            if (modSymbolIndices.Count == 0)
            {
                // No '*' in the sequence: one line with a blank residue and position 0
                WriteCombinedLine(mappedWriter, dataLine, proteinName, proteinDescription, proteinCount, ' ', 0);
                continue;
            }

            for (var modNumber = 0; modNumber < modSymbolIndices.Count; modNumber++)
            {
                var symbolIndex = modSymbolIndices[modNumber];

                // The modified residue is the character just before the '*' (blank when '*' is the first character)
                var modifiedResidue = symbolIndex > 0 ? noPrefixSequence[symbolIndex - 1] : ' ';

                // Each earlier '*' occupies one character, so subtract the number of symbols already seen
                var residuePosition = match.residueStart + symbolIndex - modNumber - 1;

                WriteCombinedLine(mappedWriter, dataLine, proteinName, proteinDescription, proteinCount, modifiedResidue, residuePosition);
            }
        }
    }
}
/// <summary>
/// Load the Peptide to Protein mapping using the specified PHRP result file
/// </summary>
/// <remarks>
/// The PepToProtMap file contains Residue_Start and Residue_End columns.
/// Lines with fewer than four tab-separated columns, or whose third or fourth column is not an integer
/// (such as the header line), are skipped.
/// </remarks>
/// <param name="filePath">Path to the tab-delimited PepToProtMap file</param>
/// <param name="pepToProteinMap">Peptide to protein mapping; entries are added or extended in place</param>
/// <returns>True if successful (errors are reported by throwing, not by returning false)</returns>
/// <exception cref="Exception">
/// Thrown when anything fails while reading the file; the original exception is available as InnerException
/// </exception>
private bool LoadPepToProtMapData(string filePath, IDictionary<string, PepToProteinMapInfo> pepToProteinMap)
{
    var linesRead = 0;
    var lastProgress = DateTime.UtcNow;
    var notifyComplete = false;

    try
    {
        // Read the data from the PepToProtMap file
        using var reader = new StreamReader(new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite));

        while (!reader.EndOfStream)
        {
            var lineIn = reader.ReadLine();
            linesRead++;

            if (string.IsNullOrEmpty(lineIn))
            {
                continue;
            }

            var splitLine = lineIn.Split('\t');

            // Parse out the numbers from the last two columns
            // (the first line of the file is the header line, and it will get skipped)
            if (splitLine.Length >= 4 &&
                int.TryParse(splitLine[2], out var residueStart) &&
                int.TryParse(splitLine[3], out var residueEnd))
            {
                var peptide = PeptideCleavageStateCalculator.ExtractCleanSequenceFromSequenceWithMods(splitLine[0], true);

                if (pepToProteinMap.TryGetValue(peptide, out var pepToProtMapInfo))
                {
                    // A MaxProteinsPerSeqID of 0 means no limit
                    if (MaxProteinsPerSeqID == 0 || pepToProtMapInfo.ProteinCount < MaxProteinsPerSeqID)
                    {
                        pepToProtMapInfo.AddProtein(splitLine[1], residueStart, residueEnd);
                    }
                }
                else
                {
                    pepToProtMapInfo = new PepToProteinMapInfo(splitLine[1], residueStart, residueEnd);
                    pepToProteinMap.Add(peptide, pepToProtMapInfo);
                }
            }

            // Only consider a progress message every 100 lines, and at most once every 5 seconds
            if (linesRead % 100 != 0)
            {
                continue;
            }

            if (DateTime.UtcNow.Subtract(lastProgress).TotalSeconds < 5)
            {
                continue;
            }

            var percentComplete = reader.BaseStream.Position / (float)reader.BaseStream.Length * 100;
            Console.WriteLine(" ... caching PepToProtMapData: {0:0.0}% complete", percentComplete);

            lastProgress = DateTime.UtcNow;
            notifyComplete = true;
        }

        if (notifyComplete)
        {
            Console.WriteLine(" ... caching PepToProtMapData: 100% complete");
        }
    }
    catch (Exception ex)
    {
        // Pass the original exception along so its type and stack trace are not lost
        throw new Exception("Exception loading Peptide to Protein Map data from " + Path.GetFileName(filePath) + ": " + ex.Message, ex);
    }

    return true;
}
/// <summary>
/// Read a PSM file and write a companion "_ForDartID.txt" file in which consecutive rows that share
/// scan number, charge, and primary sequence are stored as one result
/// </summary>
/// <param name="psmFilePath">Path to the input PSM file; the dataset name is derived from its file name</param>
/// <param name="multiJobFile">Not yet supported: when true, a NotImplementedException is raised (and reported via OnErrorEvent)</param>
/// <returns>True if the file was processed; false if a required column is missing or an exception occurred</returns>
public bool ConsolidatePSMs(string psmFilePath, bool multiJobFile)
{
    try
    {
        var inputFile = new FileInfo(psmFilePath);
        var outputFileName = Path.GetFileNameWithoutExtension(inputFile.Name) + "_ForDartID.txt";

        var outputFilePath = inputFile.DirectoryName == null
            ? outputFileName
            : Path.Combine(inputFile.DirectoryName, outputFileName);

        // Report a column that is absent from the header line
        void ReportMissingColumn(object columnName)
        {
            OnErrorEvent(string.Format("File {0} is missing column {1} on the header line", inputFile.Name, columnName));
        }

        var msgfPlusColumns = new SortedDictionary<Enum, int>();

        // Negative indices flag that the header line has not been parsed yet
        mScanTimeColIndex = -1;
        mPeakWidthMinutesColIndex = -1;

        var requiredColumns = new List<MSGFPlusSynFileColumns>
        {
            MSGFPlusSynFileColumns.Peptide,
            MSGFPlusSynFileColumns.SpecEValue,
            MSGFPlusSynFileColumns.Charge,
            MSGFPlusSynFileColumns.Protein
        };

        if (multiJobFile)
        {
            throw new NotImplementedException(
                "ConsolidatePSMs needs to be updated to support an input file where Job or Dataset is the first column");
        }

        // Obtain the dataset name from the filename
        var hasResultsSuffix = psmFilePath.EndsWith(MASICResultsMerger.RESULTS_SUFFIX, StringComparison.OrdinalIgnoreCase);

        var datasetName = hasResultsSuffix
            ? Path.GetFileName(psmFilePath.Substring(0, psmFilePath.Length - MASICResultsMerger.RESULTS_SUFFIX.Length))
            : Path.GetFileNameWithoutExtension(psmFilePath);

        if (datasetName.EndsWith("_syn", StringComparison.OrdinalIgnoreCase) ||
            datasetName.EndsWith("_fht", StringComparison.OrdinalIgnoreCase))
        {
            datasetName = datasetName.Substring(0, datasetName.Length - 4);
        }

        // Strip at most one tool suffix, checking _msgfplus before _msgfdb
        // ReSharper disable StringLiteralTypo
        foreach (var toolSuffix in new[] { "_msgfplus", "_msgfdb" })
        {
            if (!datasetName.EndsWith(toolSuffix, StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            datasetName = datasetName.Substring(0, datasetName.Length - toolSuffix.Length);
            break;
        }
        // ReSharper restore StringLiteralTypo

        var psmGroup = new DartIdData();

        using var reader = new StreamReader(new FileStream(inputFile.FullName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite));
        using var writer = new StreamWriter(new FileStream(outputFilePath, FileMode.Create, FileAccess.Write, FileShare.ReadWrite));

        writer.WriteLine(string.Join("\t",
            "Dataset", "Peptide", "MSGFDB_SpecEValue", "Charge", "LeadingProtein", "Proteins", "ElutionTime", "PeakWidthMinutes"));

        while (!reader.EndOfStream)
        {
            var dataLine = reader.ReadLine();

            if (string.IsNullOrWhiteSpace(dataLine))
            {
                continue;
            }

            if (mScanTimeColIndex < 0)
            {
                // First non-blank line: parse it as the header
                if (!ParseMergedFileHeaderLine(dataLine, msgfPlusColumns))
                {
                    return false;
                }

                if (mScanTimeColIndex < 0)
                {
                    ReportMissingColumn(MASICResultsMerger.SCAN_STATS_ELUTION_TIME_COLUMN);
                    return false;
                }

                if (mPeakWidthMinutesColIndex < 0)
                {
                    ReportMissingColumn(MASICResultsMerger.PEAK_WIDTH_MINUTES_COLUMN);
                    return false;
                }

                // Validate that the required columns exist
                foreach (var requiredColumn in requiredColumns)
                {
                    if (!ColumnExists(msgfPlusColumns, requiredColumn))
                    {
                        ReportMissingColumn(requiredColumn.ToString());
                        return false;
                    }
                }

                continue;
            }

            var dataColumns = dataLine.Split('\t');

            var scanNumber = GetValueInt(dataColumns, msgfPlusColumns, MSGFPlusSynFileColumns.Scan);
            var charge = GetValueInt(dataColumns, msgfPlusColumns, MSGFPlusSynFileColumns.Charge);
            var peptide = GetValue(dataColumns, msgfPlusColumns, MSGFPlusSynFileColumns.Peptide);
            var protein = GetValue(dataColumns, msgfPlusColumns, MSGFPlusSynFileColumns.Protein);

            // Fall back to the full peptide text when it cannot be split
            if (!PeptideCleavageStateCalculator.SplitPrefixAndSuffixFromSequence(peptide, out var primarySequence, out _, out _))
            {
                primarySequence = peptide;
            }

            var belongsToCurrentGroup =
                scanNumber == psmGroup.ScanNumber &&
                charge == psmGroup.Charge &&
                string.Equals(primarySequence, psmGroup.PrimarySequence);

            if (belongsToCurrentGroup)
            {
                psmGroup.Proteins.Add(protein);
                continue;
            }

            // A new group begins: store the previous one, then start tracking this row
            StoreResult(writer, psmGroup, datasetName);

            psmGroup = new DartIdData(dataLine, scanNumber, peptide, primarySequence, protein)
            {
                SpecEValue = GetValue(dataColumns, msgfPlusColumns, MSGFPlusSynFileColumns.SpecEValue),
                Charge = charge,
                ElutionTime = dataColumns[mScanTimeColIndex],
                PeakWidthMinutes = dataColumns[mPeakWidthMinutesColIndex]
            };
        }

        // Store the final group
        StoreResult(writer, psmGroup, datasetName);

        return true;
    }
    catch (Exception ex)
    {
        OnErrorEvent("Error in ConsolidatePSMs", ex);
        return false;
    }
}
/// <summary>
/// Read every PSM from the given file using ReaderFactory, assemble a list of values for each PSM,
/// and write diagnostic messages to the console
/// </summary>
/// <param name="synOrFhtFile">Path of the file to open with the reader</param>
/// <param name="blnSkipDuplicates">Value assigned to the reader's SkipDuplicatePSMs property</param>
private static void TestPHRPReader(string synOrFhtFile, bool blnSkipDuplicates)
{
    var inputFile = new FileInfo(synOrFhtFile);

    Console.WriteLine("Instantiating reader");

    var startupOptions = new StartupOptions
    {
        LoadModsAndSeqInfo = true,
        LoadMSGFResults = true,
        LoadScanStatsData = false,
        MaxProteinsPerPSM = 100
    };

    var phrpReader = new ReaderFactory(inputFile.FullName, PeptideHitResultTypes.Unknown, startupOptions)
    {
        EchoMessagesToConsole = false,
        SkipDuplicatePSMs = blnSkipDuplicates
    };

    // Report any errors raised while the reader was being constructed
    if (phrpReader.ErrorMessages.Count > 0)
    {
        Console.WriteLine("Error(s) instantiating the reader:");

        foreach (var errorMessage in phrpReader.ErrorMessages)
        {
            Console.WriteLine(" " + errorMessage);
        }
    }

    phrpReader.ErrorEvent += ErrorEventHandler;
    phrpReader.StatusEvent += MessageEventHandler;
    phrpReader.WarningEvent += WarningEventHandler;

    phrpReader.FastReadMode = true;

    var massCalculator = new PeptideMassCalculator();

    if (!phrpReader.CanRead)
    {
        Console.WriteLine("Aborting since PHRPReader is not ready: " + phrpReader.ErrorMessage);
        return;
    }

    var rowValues = new List<string>();
    var psmsRead = 0;
    var modifiedPSMsRead = 0;

    // ReSharper disable once CollectionNeverQueried.Local
    var cachedPSMs = new Dictionary<int, PSM>();

    Console.WriteLine("Reading data");

    while (phrpReader.MoveNext())
    {
        var psm = phrpReader.CurrentPSM;
        psmsRead++;

        rowValues.Clear();

        phrpReader.FinalizeCurrentPSM();

        // The outputs are discarded; the call is made only to exercise the method
        PeptideCleavageStateCalculator.SplitPrefixAndSuffixFromSequence(psm.Peptide, out _, out _, out _);

        var massErrorPPM = GetCorrectedMassErrorPPM(psm, out _);

        rowValues.Add(phrpReader.DatasetName + "_dta.txt");     // #SpecFile
        rowValues.Add("index=" + psmsRead);                     // SpecID
        rowValues.Add(psm.ScanNumber.ToString());               // ScanNum
        rowValues.Add(psm.CollisionMode);                       // FragMethod

        // Precursor m/z
        var precursorMz = massCalculator.ConvoluteMass(psm.PrecursorNeutralMass, 0, psm.Charge);
        rowValues.Add(precursorMz.ToString(CultureInfo.InvariantCulture));

        rowValues.Add(massErrorPPM);                            // PrecursorError(ppm)
        rowValues.Add(psm.Charge.ToString());                   // Charge
        rowValues.Add(psm.NumTrypticTermini.ToString());        // Tryptic state (0, 1, or 2)
        rowValues.Add(CleanupPeptide(psm.PeptideWithNumericMods));  // Peptide

        // An undefined SeqID (zero or negative) is flagged with asterisks
        rowValues.Add(psm.SeqID <= 0 ? "**" + psm.SeqID + "**" : psm.SeqID.ToString());

        rowValues.Add(psm.ProteinFirst);

        if (psm.ProteinDetails.Count > 0)
        {
            var firstProteinDetail = psm.ProteinDetails.First();

            // Show the protein name only when it differs from ProteinFirst
            rowValues.Add(string.Equals(psm.ProteinFirst, firstProteinDetail.Key) ? "<Match>" : firstProteinDetail.Key);

            rowValues.Add(firstProteinDetail.Value.ResidueStart.ToString());
            rowValues.Add(firstProteinDetail.Value.ResidueEnd.ToString());
        }

        var xcorr = GetScore(psm, SequestSynFileReader.GetColumnNameByID(SequestSynopsisFileColumns.XCorr), "0");
        rowValues.Add(xcorr);

        rowValues.Add(GetScore(psm, SequestSynFileReader.GetColumnNameByID(SequestSynopsisFileColumns.Sp), "0"));
        rowValues.Add(psm.MSGFSpecEValue);
        rowValues.Add(GetScore(psm, SequestSynFileReader.GetColumnNameByID(SequestSynopsisFileColumns.DeltaCn2), "0"));

        rowValues.Add(GetScore(psm, MSGFPlusSynFileReader.GetMSGFDBColumnNameByID(MSGFDBSynFileColumns.PValue), "0"));
        rowValues.Add(GetScore(psm, MSGFPlusSynFileReader.GetColumnNameByID(MSGFPlusSynFileColumns.EValue), "0"));
        rowValues.Add(GetScore(psm, MSGFPlusSynFileReader.GetColumnNameByID(MSGFPlusSynFileColumns.RankSpecEValue), "0"));
        rowValues.Add(GetScore(psm, MSGFPlusSynFileReader.GetMSGFDBColumnNameByID(MSGFDBSynFileColumns.FDR), "1"));
        rowValues.Add(GetScore(psm, MSGFPlusSynFileReader.GetColumnNameByID(MSGFPlusSynFileColumns.QValue), "0"));
        rowValues.Add(GetScore(psm, MSGFPlusSynFileReader.GetColumnNameByID(MSGFPlusSynFileColumns.PepQValue), "0"));

        // Extra console detail for one specific peptide
        if (psm.PeptideCleanSequence == "QQIEESTSDYDKEK")
        {
            Console.WriteLine(psm.Peptide + " in scan " + psm.ScanNumber);

            var parentIonMZ = massCalculator.ConvoluteMass(psm.PrecursorNeutralMass, 0, psm.Charge);

            Console.WriteLine("ParentIonMZ = " + parentIonMZ);
            Console.WriteLine("PeptideWithNumericMods = " + psm.PeptideWithNumericMods);
        }

        if (psm.ModifiedResidues.Count > 0)
        {
            modifiedPSMsRead++;

            // Show the mod details for every 500th modified PSM
            if (modifiedPSMsRead % 500 == 0)
            {
                Console.WriteLine("PeptideWithNumericMods = " + psm.PeptideWithNumericMods);

                foreach (var modifiedResidue in psm.ModifiedResidues)
                {
                    Console.WriteLine(" " + modifiedResidue.Residue + modifiedResidue.EndResidueLocInPeptide + ": " + modifiedResidue.ModDefinition.ModificationMassAsText);
                }
            }

            // Recompute the mass from the numeric-mod sequence and report a difference larger than 0.1
            var recomputedMass = massCalculator.ComputeSequenceMassNumericMods(psm.PeptideWithNumericMods);
            var massDifference = psm.PeptideMonoisotopicMass - recomputedMass;

            if (Math.Abs(massDifference) > 0.1)
            {
                Console.WriteLine(" Peptide mass disagreement: " + massDifference.ToString("0.0000000"));
            }
        }

        var flattenedRow = FlattenList(rowValues);

        // Echo every 10000th row
        if (psmsRead % 10000 == 0)
        {
            Console.WriteLine(flattenedRow);
        }

        cachedPSMs.Add(psmsRead, psm);
    }
}