/// <summary>
/// Pull sequences out of AScore results, clean them, and output them to a peptide sequence list file
/// </summary>
/// <remarks>
/// <para>
/// Reads the tab-delimited file at mAScoreResultsFilePath; the first non-blank line is treated as the header line.
/// For every subsequent non-blank line, the clean form of the BestSequence value is written to
/// mPeptideListFilePath (one line per data row, so duplicate sequences are written more than once).
/// </para>
/// <para>
/// On completion, mDistinctPeptides holds the number of unique clean sequences.
/// If VerifyRequiredColumns reports that the header is missing BestSequence, the method returns early
/// and mDistinctPeptides is left unchanged.
/// </para>
/// </remarks>
private void CreatePeptideList()
{
    const string sequenceColumnName = "BestSequence";

    // Column name -> column index, populated from the header line (case-insensitive lookup)
    var columnMap = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);

    // Only membership matters when counting distinct peptides, so a set is used
    var distinctPeptides = new HashSet<string>();

    // Write out a list of peptides for clsPeptideToProteinMapEngine
    using var aScoreReader = new StreamReader(new FileStream(mAScoreResultsFilePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite));
    using var peptideWriter = new StreamWriter(new FileStream(mPeptideListFilePath, FileMode.Create, FileAccess.Write, FileShare.Read));

    while (!aScoreReader.EndOfStream)
    {
        var dataLine = aScoreReader.ReadLine();
        if (string.IsNullOrWhiteSpace(dataLine))
        {
            continue;
        }

        var columns = dataLine.Split('\t');

        if (columnMap.Count == 0)
        {
            // Assume the first line is column names
            for (var i = 0; i < columns.Length; ++i)
            {
                columnMap.Add(columns[i], i);
            }

            var requiredColumns = new List<string> { sequenceColumnName };
            if (!VerifyRequiredColumns(requiredColumns, columnMap, "CreatePeptideList", mAScoreResultsFilePath))
            {
                return;
            }

            continue;
        }

        var sequence = columns[columnMap[sequenceColumnName]];
        var cleanSequence = PeptideCleavageStateCalculator.ExtractCleanSequenceFromSequenceWithMods(sequence, true);

        // HashSet.Add is a no-op for sequences that have already been seen
        distinctPeptides.Add(cleanSequence);

        // Every data row is written, including repeated sequences
        peptideWriter.WriteLine(cleanSequence);
    }

    mDistinctPeptides = distinctPeptides.Count;
}
/// <summary>
/// Merge the AScore results with the peptide-to-protein mapping and write the combined rows to the mapping results file
/// </summary>
/// <remarks>
/// <para>
/// The first non-blank line of the AScore results file is treated as the header; the output header is the
/// input header plus ProteinName, (optionally) Description, ProteinCount, Residue, and Position.
/// </para>
/// <para>
/// For each data row, the clean form of BestSequence is looked up in mPeptideToProteinMap:
/// when absent, the row is written without protein columns and the not-found counters are updated;
/// when present, one row is written per protein per '*' symbol in the sequence
/// (or a single row per protein, with a blank residue and position 0, if the sequence has no '*').
/// </para>
/// </remarks>
private void CombineAScoreAndProteinData()
{
    // Second pass through the AScore results file
    using var resultsReader = new StreamReader(new FileStream(mAScoreResultsFilePath, FileMode.Open, FileAccess.Read, FileShare.Read));
    using var combinedWriter = new StreamWriter(new FileStream(mMappingResultsFilePath, FileMode.Create, FileAccess.Write, FileShare.Read));

    // Header name -> column index (case-insensitive); empty until the header line has been parsed
    var headerColumnIndices = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);

    // Nothing is written when the input file is empty
    while (!resultsReader.EndOfStream)
    {
        var currentLine = resultsReader.ReadLine();
        if (string.IsNullOrWhiteSpace(currentLine))
        {
            continue;
        }

        var fields = currentLine.Split('\t');

        if (headerColumnIndices.Count == 0)
        {
            for (var columnIndex = 0; columnIndex < fields.Length; ++columnIndex)
            {
                headerColumnIndices.Add(fields[columnIndex], columnIndex);
            }

            var requiredColumns = new List<string> { "BestSequence" };
            if (!VerifyRequiredColumns(requiredColumns, headerColumnIndices, "CombineAScoreAndProteinData", mAScoreResultsFilePath))
            {
                return;
            }

            // Start with the input headers, then append the protein-related columns
            var headerNames = new List<string>(fields) { "ProteinName" };

            // The Description column is only present when protein descriptions are being output
            if (mOutputProteinDescriptions)
            {
                headerNames.Add("Description");
            }

            headerNames.Add("ProteinCount");
            headerNames.Add("Residue");
            headerNames.Add("Position");

            combinedWriter.WriteLine(string.Join("\t", headerNames));
            continue;
        }

        var bestSequence = fields[headerColumnIndices["BestSequence"]];
        var cleanSequence = PeptideCleavageStateCalculator.ExtractCleanSequenceFromSequenceWithMods(bestSequence, true);
        ++mTotalPeptides;

        if (!mPeptideToProteinMap.ContainsKey(cleanSequence))
        {
            // No protein match: emit the row as-is and track the miss
            WriteCombinedLine(combinedWriter, currentLine);

            if (!mPeptidesNotFound.ContainsKey(cleanSequence))
            {
                mPeptidesNotFound.Add(cleanSequence, 0);
            }

            mPeptidesNotFound[cleanSequence]++;
            ++mTotalPeptidesNotFound;
            continue;
        }

        PeptideCleavageStateCalculator.SplitPrefixAndSuffixFromSequence(bestSequence, out var primarySequence, out _, out _);

        // Indices (within primarySequence) of each '*' symbol
        var modSymbolIndices = new List<int>();
        for (var charIndex = 0; charIndex < primarySequence.Length; ++charIndex)
        {
            if (primarySequence[charIndex] == '*')
            {
                modSymbolIndices.Add(charIndex);
            }
        }

        var proteinMatches = mPeptideToProteinMap[cleanSequence];

        // Number of proteins this peptide occurs in
        var proteinCount = proteinMatches.Count;

        foreach (var proteinMatch in proteinMatches)
        {
            var proteinName = proteinMatch.proteinName;

            var proteinDescription = string.Empty;
            if (mOutputProteinDescriptions)
            {
                proteinDescription = mProteinDescriptions[proteinMatch.proteinName];
            }

            if (modSymbolIndices.Count == 0)
            {
                // No '*' symbols: write one row with a blank residue and position 0
                const char noResidue = ' ';
                const int noPosition = 0;
                WriteCombinedLine(combinedWriter, currentLine, proteinName, proteinDescription, proteinCount, noResidue, noPosition);
                continue;
            }

            for (var modNumber = 0; modNumber < modSymbolIndices.Count; ++modNumber)
            {
                var symbolIndex = modSymbolIndices[modNumber];

                // The modified residue is the character immediately before the '*' (blank if '*' is first)
                var modifiedResidue = ' ';
                if (symbolIndex > 0)
                {
                    modifiedResidue = primarySequence[symbolIndex - 1];
                }

                // Each earlier '*' occupies one character of primarySequence, so subtract the number of
                // symbols already seen (modNumber) when converting the symbol index to a residue position
                var residuePosition = proteinMatch.residueStart + symbolIndex - modNumber - 1;

                WriteCombinedLine(combinedWriter, currentLine, proteinName, proteinDescription, proteinCount, modifiedResidue, residuePosition);
            }
        }
    }
}
/// <summary>
/// Load the Peptide to Protein mapping using the specified PHRP result file
/// </summary>
/// <remarks>
/// <para>
/// The file is read as tab-delimited text. A line is used only if it has at least four columns and
/// columns 3 and 4 (Residue_Start and Residue_End) parse as integers; the header line fails that
/// parse and is therefore skipped. Column 1 is the peptide (converted to its clean sequence) and column 2 is the protein.
/// </para>
/// <para>
/// When a peptide is already in the map, the protein is appended only if MaxProteinsPerSeqID is 0
/// or the peptide currently has fewer than MaxProteinsPerSeqID proteins.
/// </para>
/// <para>
/// Progress is written to the console at most once every 5 seconds, checked every 100 lines read.
/// </para>
/// </remarks>
/// <param name="filePath">Path to the PepToProtMap file</param>
/// <param name="pepToProteinMap">Peptide to protein mapping; entries are added or updated in place</param>
/// <returns>Always true; failures are reported by throwing an exception</returns>
/// <exception cref="Exception">
/// Thrown if any error occurs while reading or parsing the file; the original exception is available via InnerException
/// </exception>
private bool LoadPepToProtMapData(string filePath, IDictionary<string, PepToProteinMapInfo> pepToProteinMap)
{
    var linesRead = 0;
    var lastProgress = DateTime.UtcNow;
    var notifyComplete = false;

    try
    {
        // Read the data from the PepToProtMap file
        using var reader = new StreamReader(new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite));

        while (!reader.EndOfStream)
        {
            var lineIn = reader.ReadLine();
            linesRead++;

            if (string.IsNullOrEmpty(lineIn))
            {
                continue;
            }

            var splitLine = lineIn.Split('\t');

            // Parse out the numbers from the last two columns
            // (the first line of the file is the header line, and it will get skipped)
            if (splitLine.Length >= 4 &&
                int.TryParse(splitLine[2], out var residueStart) &&
                int.TryParse(splitLine[3], out var residueEnd))
            {
                var peptide = PeptideCleavageStateCalculator.ExtractCleanSequenceFromSequenceWithMods(splitLine[0], true);

                if (pepToProteinMap.TryGetValue(peptide, out var pepToProtMapInfo))
                {
                    // A limit of 0 means "no limit" on the number of proteins stored per peptide
                    if (MaxProteinsPerSeqID == 0 || pepToProtMapInfo.ProteinCount < MaxProteinsPerSeqID)
                    {
                        pepToProtMapInfo.AddProtein(splitLine[1], residueStart, residueEnd);
                    }
                }
                else
                {
                    pepToProtMapInfo = new PepToProteinMapInfo(splitLine[1], residueStart, residueEnd);
                    pepToProteinMap.Add(peptide, pepToProtMapInfo);
                }
            }

            // Only consider reporting progress every 100th line, and no more often than every 5 seconds
            if (linesRead % 100 != 0)
            {
                continue;
            }

            if (DateTime.UtcNow.Subtract(lastProgress).TotalSeconds < 5)
            {
                continue;
            }

            var percentComplete = reader.BaseStream.Position / (float)reader.BaseStream.Length * 100;
            Console.WriteLine(" ... caching PepToProtMapData: {0:0.0}% complete", percentComplete);

            lastProgress = DateTime.UtcNow;
            notifyComplete = true;
        }

        // Only announce completion if at least one progress message was shown
        if (notifyComplete)
        {
            Console.WriteLine(" ... caching PepToProtMapData: 100% complete");
        }
    }
    catch (Exception ex)
    {
        // Pass the original exception as InnerException so its type and stack trace are not lost
        throw new Exception("Exception loading Peptide to Protein Map data from " + Path.GetFileName(filePath) + ": " + ex.Message, ex);
    }

    return true;
}