public void TestSplitPrefixAndSuffix(string sequence, string expectedPrimarySeq, string expectedPrefix, string expectedSuffix)
        {
            // Split the sequence into primary sequence, prefix, and suffix.
            // The boolean returned by SplitPrefixAndSuffixFromSequence is not inspected by this test.
            PeptideCleavageStateCalculator.SplitPrefixAndSuffixFromSequence(
                sequence,
                out var actualPrimarySeq,
                out var actualPrefix,
                out var actualSuffix);

            // Each extracted part must match the expected value supplied to the test
            Assert.AreEqual(expectedPrimarySeq, actualPrimarySeq);
            Assert.AreEqual(expectedPrefix, actualPrefix);
            Assert.AreEqual(expectedSuffix, actualSuffix);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Constructor: splits the peptide into primary sequence, prefix, and suffix, then stores the clean sequence
        /// </summary>
        /// <param name="peptideSeqWithModsAndContext">
        /// Peptide text passed to SplitPrefixAndSuffixFromSequence; the results are stored in mPrimarySequence, mPrefix, and mSuffix
        /// </param>
        /// <param name="peptideCleanSeq">Value stored in CleanSequence</param>
        /// <exception cref="Exception">Thrown when SplitPrefixAndSuffixFromSequence returns false</exception>
        public FirstHitInfo(string peptideSeqWithModsAndContext, string peptideCleanSeq)
        {
            var splitSucceeded = PeptideCleavageStateCalculator.SplitPrefixAndSuffixFromSequence(
                peptideSeqWithModsAndContext,
                out mPrimarySequence,
                out mPrefix,
                out mSuffix);

            if (!splitSucceeded)
            {
                throw new Exception($"Unable to split the prefix and suffix from peptide {peptideSeqWithModsAndContext}");
            }

            CleanSequence = peptideCleanSeq;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Pull sequences out of AScore results, clean them, and output them to a peptide sequence list file
        /// </summary>
        /// <remarks>
        /// <para>
        /// Reads the tab-delimited file at mAScoreResultsFilePath. The first non-blank line is treated as the header line;
        /// it must pass VerifyRequiredColumns for column "BestSequence", otherwise the method returns
        /// without updating mDistinctPeptides.
        /// </para>
        /// <para>
        /// For every data row, the clean sequence is written to mPeptideListFilePath (duplicates are written too).
        /// The number of distinct clean sequences is stored in mDistinctPeptides.
        /// </para>
        /// </remarks>
        private void CreatePeptideList()
        {
            const string sequenceColumnName = "BestSequence";

            var columnMap = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);

            // Only membership matters here, so track distinct peptides with a set
            var distinctPeptides = new HashSet<string>();

            // Write out a list of peptides for clsPeptideToProteinMapEngine
            using var aScoreReader = new StreamReader(new FileStream(mAScoreResultsFilePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite));
            using var peptideWriter = new StreamWriter(new FileStream(mPeptideListFilePath, FileMode.Create, FileAccess.Write, FileShare.Read));

            while (!aScoreReader.EndOfStream)
            {
                var dataLine = aScoreReader.ReadLine();
                if (string.IsNullOrWhiteSpace(dataLine))
                {
                    continue;
                }

                var columns = dataLine.Split('\t');

                if (columnMap.Count == 0)
                {
                    // Assume the first line is column names
                    for (var i = 0; i < columns.Length; ++i)
                    {
                        // Keep the first occurrence of a repeated column name (e.g. several empty trailing columns,
                        // or names that differ only by case) instead of letting Dictionary.Add throw
                        if (!columnMap.ContainsKey(columns[i]))
                        {
                            columnMap.Add(columns[i], i);
                        }
                    }

                    var requiredColumns = new List<string>
                    {
                        sequenceColumnName
                    };

                    if (!VerifyRequiredColumns(requiredColumns, columnMap, "CreatePeptideList", mAScoreResultsFilePath))
                    {
                        return;
                    }

                    continue;
                }

                var sequence = columns[columnMap[sequenceColumnName]];
                var cleanSequence = PeptideCleavageStateCalculator.ExtractCleanSequenceFromSequenceWithMods(sequence, true);

                // HashSet.Add is a no-op when the peptide has already been seen
                distinctPeptides.Add(cleanSequence);

                peptideWriter.WriteLine(cleanSequence);
            }

            mDistinctPeptides = distinctPeptides.Count;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Rewrite a peptide whose primary sequence matches RegexFindItraq; otherwise return it unchanged
        /// </summary>
        /// <param name="strPeptide">Peptide text, passed to SplitPrefixAndSuffixFromSequence</param>
        /// <returns>
        /// prefix + "." + (regex groups 2, 1, 3 in that order) + "." + suffix when the split succeeds and the regex matches;
        /// otherwise the original strPeptide
        /// </returns>
        private static string CleanupPeptide(string strPeptide)
        {
            var splitSucceeded = PeptideCleavageStateCalculator.SplitPrefixAndSuffixFromSequence(
                strPeptide, out var primarySequence, out var prefix, out var suffix);

            if (!splitSucceeded)
            {
                return strPeptide;
            }

            // Look for an N-terminal iTraq mod
            var itraqMatch = RegexFindItraq.Match(primarySequence);

            if (!itraqMatch.Success)
            {
                return strPeptide;
            }

            // Reassemble the primary sequence with the first two capture groups swapped
            var groups = itraqMatch.Groups;
            var reorderedSequence = groups[2].Value + groups[1].Value + groups[3].Value;

            return prefix + "." + reorderedSequence + "." + suffix;
        }
 // Assign a new PeptideCleavageStateCalculator instance to mCleavageStateCalculator
 public void Init() => mCleavageStateCalculator = new PeptideCleavageStateCalculator();
Ejemplo n.º 6
0
        /// <summary>
        /// Combine the data from AScore and the PeptideToProteinMapper into one results file
        /// </summary>
        /// <remarks>
        /// <para>
        /// Reads the tab-delimited file at mAScoreResultsFilePath and writes mMappingResultsFilePath.
        /// The first non-blank line is treated as the header line; it must pass VerifyRequiredColumns for column
        /// "BestSequence", otherwise the method returns. The output header is the input header plus
        /// ProteinName, Description (only when mOutputProteinDescriptions is true), ProteinCount, Residue, and Position.
        /// </para>
        /// <para>
        /// Each data row increments mTotalPeptides. When the clean sequence is not a key of mPeptideToProteinMap,
        /// the row is written via the two-argument WriteCombinedLine and mPeptidesNotFound / mTotalPeptidesNotFound are updated.
        /// Otherwise one line is written per mapped protein per '*' character in the primary sequence;
        /// if there is no '*', one line per mapped protein is written with a blank residue and position 0.
        /// </para>
        /// </remarks>
        private void CombineAScoreAndProteinData()
        {
            const string sequenceColumnName = "BestSequence";

            // Read the AScore again...
            using var aScoreReader = new StreamReader(new FileStream(mAScoreResultsFilePath, FileMode.Open, FileAccess.Read, FileShare.Read));
            using var mappedWriter = new StreamWriter(new FileStream(mMappingResultsFilePath, FileMode.Create, FileAccess.Write, FileShare.Read));

            var columnMapAScore = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);

            // Run as long as we can successfully read (an empty file simply skips the loop)
            while (!aScoreReader.EndOfStream)
            {
                var dataLine = aScoreReader.ReadLine();
                if (string.IsNullOrWhiteSpace(dataLine))
                {
                    continue;
                }

                var columns = dataLine.Split('\t');

                if (columnMapAScore.Count == 0)
                {
                    for (var i = 0; i < columns.Length; ++i)
                    {
                        // Keep the first occurrence of a repeated column name (e.g. several empty trailing columns,
                        // or names that differ only by case) instead of letting Dictionary.Add throw
                        if (!columnMapAScore.ContainsKey(columns[i]))
                        {
                            columnMapAScore.Add(columns[i], i);
                        }
                    }

                    var requiredColumns = new List<string>
                    {
                        sequenceColumnName
                    };

                    if (!VerifyRequiredColumns(requiredColumns, columnMapAScore, "CombineAScoreAndProteinData", mAScoreResultsFilePath))
                    {
                        return;
                    }

                    // Start with every input column (including any repeated names), then append the new columns
                    var outputFileHeaders = new List<string>();
                    outputFileHeaders.AddRange(columns);

                    outputFileHeaders.Add("ProteinName");

                    // Protein Description - if it contains key-value pairs, use it.
                    if (mOutputProteinDescriptions)
                    {
                        outputFileHeaders.Add("Description");
                    }

                    outputFileHeaders.Add("ProteinCount");
                    outputFileHeaders.Add("Residue");
                    outputFileHeaders.Add("Position");

                    mappedWriter.WriteLine(string.Join("\t", outputFileHeaders));

                    continue;
                }

                var sequence = columns[columnMapAScore[sequenceColumnName]];

                var cleanSequence = PeptideCleavageStateCalculator.ExtractCleanSequenceFromSequenceWithMods(sequence, true);

                ++mTotalPeptides;

                if (!mPeptideToProteinMap.ContainsKey(cleanSequence))
                {
                    // Match not found
                    WriteCombinedLine(mappedWriter, dataLine);

                    if (!mPeptidesNotFound.ContainsKey(cleanSequence))
                    {
                        mPeptidesNotFound.Add(cleanSequence, 0);
                    }
                    mPeptidesNotFound[cleanSequence]++;
                    ++mTotalPeptidesNotFound;

                    continue;
                }

                PeptideCleavageStateCalculator.SplitPrefixAndSuffixFromSequence(sequence, out var noPrefixSequence, out _, out _);

                // Indices (within the primary sequence) of each '*' mod symbol
                var mods = new List<int>();

                for (var i = 0; i < noPrefixSequence.Length; ++i)
                {
                    if (noPrefixSequence[i] == '*')
                    {
                        mods.Add(i);
                    }
                }

                // Look up the protein matches once per row instead of once per protein
                var proteinMatches = mPeptideToProteinMap[cleanSequence];

                // # of proteins occurred in
                var proteinCount = proteinMatches.Count;

                foreach (var match in proteinMatches)
                {
                    // Protein Name
                    var proteinName = match.proteinName;

                    var proteinDescription = string.Empty;

                    // Protein Description - if it contains key-value pairs, use it.
                    if (mOutputProteinDescriptions)
                    {
                        proteinDescription = mProteinDescriptions[match.proteinName];
                    }

                    if (mods.Count == 0)
                    {
                        // No '*' in the sequence: write a single line with a blank residue and position 0
                        const char modifiedResidue = ' ';
                        const int residuePosition = 0;

                        WriteCombinedLine(mappedWriter, dataLine, proteinName, proteinDescription, proteinCount, modifiedResidue, residuePosition);
                        continue;
                    }

                    for (var i = 0; i < mods.Count; ++i)
                    {
                        var modifiedResidue = ' ';

                        // Residue of mod: the character immediately before the '*' (blank if '*' is the first character)
                        if (mods[i] > 0)
                        {
                            modifiedResidue = noPrefixSequence[mods[i] - 1];
                        }

                        // Position of residue
                        // With multiple residues, we need to adjust the position of each subsequent residue by the number of residues we have read
                        // (i earlier '*' symbols precede this one, so subtract i; subtract 1 more to point at the residue before the '*')
                        var residuePosition = match.residueStart + mods[i] - i - 1;

                        WriteCombinedLine(mappedWriter, dataLine, proteinName, proteinDescription, proteinCount, modifiedResidue, residuePosition);
                    }
                }
            }
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Load the Peptide to Protein mapping using the specified PHRP result file
        /// </summary>
        /// <remarks>
        /// <para>The PepToProtMap file contains Residue_Start and Residue_End columns</para>
        /// <para>
        /// Lines are tab-delimited. A line is used only when it has at least four columns and columns 3 and 4 parse as integers;
        /// any other line (including the header line) is skipped. Column 1 is converted to a clean sequence and used as the key;
        /// column 2 is passed as the first argument to PepToProteinMapInfo / AddProtein.
        /// When MaxProteinsPerSeqID is non-zero, no further proteins are added to a peptide once its ProteinCount reaches that value.
        /// </para>
        /// <para>Progress is written to the console at most once every 5 seconds, checked every 100 lines.</para>
        /// </remarks>
        /// <param name="filePath">Path to the PepToProtMap file</param>
        /// <param name="pepToProteinMap">Peptide to protein mapping; entries are added or updated in place</param>
        /// <returns>Always true; failures are reported by throwing rather than by returning false</returns>
        /// <exception cref="Exception">
        /// Thrown when any exception occurs while reading; the original exception is available via InnerException
        /// </exception>
        private bool LoadPepToProtMapData(string filePath, IDictionary<string, PepToProteinMapInfo> pepToProteinMap)
        {
            var linesRead = 0;
            var lastProgress = DateTime.UtcNow;
            var notifyComplete = false;

            try
            {
                // Read the data from the PepToProtMap file
                using var reader = new StreamReader(new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite));

                while (!reader.EndOfStream)
                {
                    var lineIn = reader.ReadLine();
                    linesRead++;

                    if (string.IsNullOrEmpty(lineIn))
                    {
                        continue;
                    }

                    var splitLine = lineIn.Split('\t');

                    // Parse out the numbers from the last two columns
                    // (the first line of the file is the header line, and it will get skipped)
                    if (splitLine.Length >= 4 &&
                        int.TryParse(splitLine[2], out var residueStart) &&
                        int.TryParse(splitLine[3], out var residueEnd))
                    {
                        var peptide = PeptideCleavageStateCalculator.ExtractCleanSequenceFromSequenceWithMods(splitLine[0], true);

                        if (pepToProteinMap.TryGetValue(peptide, out var pepToProtMapInfo))
                        {
                            // A limit of 0 means "no limit"
                            if (MaxProteinsPerSeqID == 0 || pepToProtMapInfo.ProteinCount < MaxProteinsPerSeqID)
                            {
                                pepToProtMapInfo.AddProtein(splitLine[1], residueStart, residueEnd);
                            }
                        }
                        else
                        {
                            pepToProteinMap.Add(peptide, new PepToProteinMapInfo(splitLine[1], residueStart, residueEnd));
                        }
                    }

                    // Only consider reporting progress every 100th line, and then at most once every 5 seconds
                    if (linesRead % 100 != 0)
                    {
                        continue;
                    }

                    if (DateTime.UtcNow.Subtract(lastProgress).TotalSeconds < 5)
                    {
                        continue;
                    }

                    var percentComplete = reader.BaseStream.Position / (float)reader.BaseStream.Length * 100;
                    Console.WriteLine(" ... caching PepToProtMapData: {0:0.0}% complete", percentComplete);
                    lastProgress = DateTime.UtcNow;
                    notifyComplete = true;
                }

                // Only announce completion if at least one intermediate progress message was shown
                if (notifyComplete)
                {
                    Console.WriteLine(" ... caching PepToProtMapData: 100% complete");
                }
            }
            catch (Exception ex)
            {
                // Keep the original exception as InnerException so its type and stack trace are not lost
                throw new Exception("Exception loading Peptide to Protein Map data from " + Path.GetFileName(filePath) + ": " + ex.Message, ex);
            }

            return true;
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Read a PSM file and write a tab-delimited file named (input name without extension) + "_ForDartID.txt"
        /// in the same directory, grouping consecutive rows that share scan number, charge, and primary sequence
        /// </summary>
        /// <remarks>
        /// Any exception (including the NotImplementedException raised when multiJobFile is true)
        /// is reported via OnErrorEvent and results in a return value of false.
        /// </remarks>
        /// <param name="psmFilePath">Path to the input file; also used to derive the dataset name</param>
        /// <param name="multiJobFile">When true, the method is not supported and fails</param>
        /// <returns>True if the file was processed; false if a required column is missing or an error occurred</returns>
        public bool ConsolidatePSMs(string psmFilePath, bool multiJobFile)
        {
            try
            {
                var sourceFile = new FileInfo(psmFilePath);
                var targetFileName = Path.GetFileNameWithoutExtension(sourceFile.Name) + "_ForDartID.txt";

                // Use a relative path when the input file has no directory
                var targetFilePath = sourceFile.DirectoryName != null
                    ? Path.Combine(sourceFile.DirectoryName, targetFileName)
                    : targetFileName;

                var columnMapping = new SortedDictionary<Enum, int>();
                mScanTimeColIndex = -1;
                mPeakWidthMinutesColIndex = -1;

                var requiredColumns = new List<MSGFPlusSynFileColumns>
                {
                    MSGFPlusSynFileColumns.Peptide,
                    MSGFPlusSynFileColumns.SpecEValue,
                    MSGFPlusSynFileColumns.Charge,
                    MSGFPlusSynFileColumns.Protein
                };

                if (multiJobFile)
                {
                    throw new NotImplementedException(
                              "ConsolidatePSMs needs to be updated to support an input file where Job or Dataset is the first column");
                }

                // Obtain the dataset name from the filename
                var datasetName = psmFilePath.EndsWith(MASICResultsMerger.RESULTS_SUFFIX, StringComparison.OrdinalIgnoreCase)
                    ? Path.GetFileName(psmFilePath.Substring(0, psmFilePath.Length - MASICResultsMerger.RESULTS_SUFFIX.Length))
                    : Path.GetFileNameWithoutExtension(psmFilePath);

                // Strip a trailing _syn or _fht (both are four characters long)
                var hasSynOrFhtSuffix = datasetName.EndsWith("_syn", StringComparison.OrdinalIgnoreCase) ||
                                        datasetName.EndsWith("_fht", StringComparison.OrdinalIgnoreCase);

                if (hasSynOrFhtSuffix)
                {
                    datasetName = datasetName.Substring(0, datasetName.Length - 4);
                }

                // ReSharper disable StringLiteralTypo

                // Strip at most one tool suffix, checking _msgfplus before _msgfdb
                foreach (var toolSuffix in new[] { "_msgfplus", "_msgfdb" })
                {
                    if (!datasetName.EndsWith(toolSuffix, StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    datasetName = datasetName.Substring(0, datasetName.Length - toolSuffix.Length);
                    break;
                }

                // ReSharper restore StringLiteralTypo

                // Builds the error message used for every missing-column check below
                string MissingColumnMessage(object columnName) =>
                    string.Format("File {0} is missing column {1} on the header line", sourceFile.Name, columnName);

                var currentGroup = new DartIdData();

                using var reader = new StreamReader(new FileStream(sourceFile.FullName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite));
                using var writer = new StreamWriter(new FileStream(targetFilePath, FileMode.Create, FileAccess.Write, FileShare.ReadWrite));

                var outputHeaders = new List<string>
                {
                    "Dataset",
                    "Peptide",
                    "MSGFDB_SpecEValue",
                    "Charge",
                    "LeadingProtein",
                    "Proteins",
                    "ElutionTime",
                    "PeakWidthMinutes"
                };

                writer.WriteLine(string.Join("\t", outputHeaders));

                while (!reader.EndOfStream)
                {
                    var dataLine = reader.ReadLine();
                    if (string.IsNullOrWhiteSpace(dataLine))
                    {
                        continue;
                    }

                    // mScanTimeColIndex stays negative until the header line has been parsed
                    if (mScanTimeColIndex < 0)
                    {
                        if (!ParseMergedFileHeaderLine(dataLine, columnMapping))
                        {
                            return false;
                        }

                        if (mScanTimeColIndex < 0)
                        {
                            OnErrorEvent(MissingColumnMessage(MASICResultsMerger.SCAN_STATS_ELUTION_TIME_COLUMN));
                            return false;
                        }

                        if (mPeakWidthMinutesColIndex < 0)
                        {
                            OnErrorEvent(MissingColumnMessage(MASICResultsMerger.PEAK_WIDTH_MINUTES_COLUMN));
                            return false;
                        }

                        // Validate that the required columns exist
                        foreach (var requiredColumn in requiredColumns)
                        {
                            if (!ColumnExists(columnMapping, requiredColumn))
                            {
                                OnErrorEvent(MissingColumnMessage(requiredColumn.ToString()));
                                return false;
                            }
                        }

                        continue;
                    }

                    var fields = dataLine.Split('\t');

                    var scanNumber = GetValueInt(fields, columnMapping, MSGFPlusSynFileColumns.Scan);
                    var charge = GetValueInt(fields, columnMapping, MSGFPlusSynFileColumns.Charge);
                    var peptide = GetValue(fields, columnMapping, MSGFPlusSynFileColumns.Peptide);
                    var protein = GetValue(fields, columnMapping, MSGFPlusSynFileColumns.Protein);

                    // Fall back to the full peptide text when the prefix and suffix cannot be split off
                    var splitSucceeded = PeptideCleavageStateCalculator.SplitPrefixAndSuffixFromSequence(peptide, out var primarySequence, out _, out _);
                    if (!splitSucceeded)
                    {
                        primarySequence = peptide;
                    }

                    var belongsToCurrentGroup = scanNumber == currentGroup.ScanNumber &&
                                                charge == currentGroup.Charge &&
                                                string.Equals(primarySequence, currentGroup.PrimarySequence);

                    if (belongsToCurrentGroup)
                    {
                        // Same scan, charge, and primary sequence: just record the additional protein
                        currentGroup.Proteins.Add(protein);
                        continue;
                    }

                    // A new group starts here: flush the previous one first
                    StoreResult(writer, currentGroup, datasetName);

                    currentGroup = new DartIdData(dataLine, scanNumber, peptide, primarySequence, protein)
                    {
                        SpecEValue = GetValue(fields, columnMapping, MSGFPlusSynFileColumns.SpecEValue),
                        Charge = charge,
                        ElutionTime = fields[mScanTimeColIndex],
                        PeakWidthMinutes = fields[mPeakWidthMinutesColIndex]
                    };
                }

                // Flush the final group
                StoreResult(writer, currentGroup, datasetName);

                return true;
            }
            catch (Exception ex)
            {
                OnErrorEvent("Error in ConsolidatePSMs", ex);
                return false;
            }
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Read every PSM from a file with ReaderFactory, building a list of values per PSM and
        /// writing diagnostic messages to the console
        /// </summary>
        /// <param name="synOrFhtFile">Path to the input file given to ReaderFactory</param>
        /// <param name="blnSkipDuplicates">Value assigned to the reader's SkipDuplicatePSMs property</param>
        private static void TestPHRPReader(string synOrFhtFile, bool blnSkipDuplicates)
        {
            var sourceFile = new FileInfo(synOrFhtFile);

            Console.WriteLine("Instantiating reader");

            var readerOptions = new StartupOptions
            {
                LoadModsAndSeqInfo = true,
                LoadMSGFResults = true,
                LoadScanStatsData = false,
                MaxProteinsPerPSM = 100
            };

            var phrpReader = new ReaderFactory(sourceFile.FullName, PeptideHitResultTypes.Unknown, readerOptions)
            {
                EchoMessagesToConsole = false,
                SkipDuplicatePSMs = blnSkipDuplicates
            };

            // Check for any load errors
            if (phrpReader.ErrorMessages.Count > 0)
            {
                Console.WriteLine("Error(s) instantiating the reader:");
                foreach (var loadError in phrpReader.ErrorMessages)
                {
                    Console.WriteLine("  " + loadError);
                }
            }

            phrpReader.ErrorEvent += ErrorEventHandler;
            phrpReader.StatusEvent += MessageEventHandler;
            phrpReader.WarningEvent += WarningEventHandler;

            phrpReader.FastReadMode = true;

            var massCalculator = new PeptideMassCalculator();

            if (!phrpReader.CanRead)
            {
                Console.WriteLine("Aborting since PHRPReader is not ready: " + phrpReader.ErrorMessage);
                return;
            }

            // Values gathered for the current PSM; cleared at the start of each iteration
            var rowValues = new List<string>();

            var psmsRead = 0;
            var modifiedPsmsRead = 0;

            // ReSharper disable once CollectionNeverQueried.Local
            var cachedPsms = new Dictionary<int, PSM>();

            Console.WriteLine("Reading data");

            while (phrpReader.MoveNext())
            {
                var psm = phrpReader.CurrentPSM;

                psmsRead++;
                rowValues.Clear();

                phrpReader.FinalizeCurrentPSM();

                // The results of this split are discarded
                PeptideCleavageStateCalculator.SplitPrefixAndSuffixFromSequence(psm.Peptide, out _, out _, out _);

                var massErrorPpm = GetCorrectedMassErrorPPM(psm, out _);

                rowValues.Add(phrpReader.DatasetName + "_dta.txt");     // #SpecFile
                rowValues.Add("index=" + psmsRead);                     // SpecID
                rowValues.Add(psm.ScanNumber.ToString());               // ScanNum
                rowValues.Add(psm.CollisionMode);                       // FragMethod

                // Precursor m/z
                var precursorMz = massCalculator.ConvoluteMass(psm.PrecursorNeutralMass, 0, psm.Charge);
                rowValues.Add(precursorMz.ToString(CultureInfo.InvariantCulture));

                rowValues.Add(massErrorPpm);                                // PrecursorError(ppm)
                rowValues.Add(psm.Charge.ToString());                       // Charge
                rowValues.Add(psm.NumTrypticTermini.ToString());            // Tryptic state (0, 1, or 2)
                rowValues.Add(CleanupPeptide(psm.PeptideWithNumericMods));  // Peptide

                // SeqID; flagged with asterisks when it is undefined (zero or negative)
                rowValues.Add(psm.SeqID <= 0
                                  ? "**" + psm.SeqID + "**"
                                  : psm.SeqID.ToString());

                rowValues.Add(psm.ProteinFirst);

                if (psm.ProteinDetails.Count > 0)
                {
                    var firstProteinDetail = psm.ProteinDetails.First();

                    // Show the key only when it differs from ProteinFirst
                    var proteinsAgree = string.Equals(psm.ProteinFirst, firstProteinDetail.Key);
                    rowValues.Add(proteinsAgree ? "<Match>" : firstProteinDetail.Key);

                    rowValues.Add(firstProteinDetail.Value.ResidueStart.ToString());
                    rowValues.Add(firstProteinDetail.Value.ResidueEnd.ToString());
                }

                var xcorrText = GetScore(psm, SequestSynFileReader.GetColumnNameByID(SequestSynopsisFileColumns.XCorr), "0");
                rowValues.Add(xcorrText);

                rowValues.Add(GetScore(psm, SequestSynFileReader.GetColumnNameByID(SequestSynopsisFileColumns.Sp), "0"));
                rowValues.Add(psm.MSGFSpecEValue);
                rowValues.Add(GetScore(psm, SequestSynFileReader.GetColumnNameByID(SequestSynopsisFileColumns.DeltaCn2), "0"));

                rowValues.Add(GetScore(psm, MSGFPlusSynFileReader.GetMSGFDBColumnNameByID(MSGFDBSynFileColumns.PValue), "0"));
                rowValues.Add(GetScore(psm, MSGFPlusSynFileReader.GetColumnNameByID(MSGFPlusSynFileColumns.EValue), "0"));
                rowValues.Add(GetScore(psm, MSGFPlusSynFileReader.GetColumnNameByID(MSGFPlusSynFileColumns.RankSpecEValue), "0"));
                rowValues.Add(GetScore(psm, MSGFPlusSynFileReader.GetMSGFDBColumnNameByID(MSGFDBSynFileColumns.FDR), "1"));
                rowValues.Add(GetScore(psm, MSGFPlusSynFileReader.GetColumnNameByID(MSGFPlusSynFileColumns.QValue), "0"));
                rowValues.Add(GetScore(psm, MSGFPlusSynFileReader.GetColumnNameByID(MSGFPlusSynFileColumns.PepQValue), "0"));

                // Extra diagnostics for one specific peptide
                if (psm.PeptideCleanSequence == "QQIEESTSDYDKEK")
                {
                    Console.WriteLine(psm.Peptide + " in scan " + psm.ScanNumber);

                    var parentIonMz = massCalculator.ConvoluteMass(psm.PrecursorNeutralMass, 0, psm.Charge);

                    Console.WriteLine("ParentIonMZ   = " + parentIonMz);
                    Console.WriteLine("PeptideWithNumericMods   = " + psm.PeptideWithNumericMods);
                }

                if (psm.ModifiedResidues.Count > 0)
                {
                    modifiedPsmsRead++;

                    // Show the modification details for every 500th modified PSM
                    if (modifiedPsmsRead % 500 == 0)
                    {
                        Console.WriteLine("PeptideWithNumericMods   = " + psm.PeptideWithNumericMods);
                        foreach (var modifiedResidue in psm.ModifiedResidues)
                        {
                            Console.WriteLine("  " + modifiedResidue.Residue + modifiedResidue.EndResidueLocInPeptide + ": " + modifiedResidue.ModDefinition.ModificationMassAsText);
                        }
                    }

                    // Report when the recomputed mass differs from the stored monoisotopic mass by more than 0.1
                    var recomputedMass = massCalculator.ComputeSequenceMassNumericMods(psm.PeptideWithNumericMods);
                    if (Math.Abs(psm.PeptideMonoisotopicMass - recomputedMass) > 0.1)
                    {
                        Console.WriteLine("  Peptide mass disagreement: " + (psm.PeptideMonoisotopicMass - recomputedMass).ToString("0.0000000"));
                    }
                }

                var flattenedRow = FlattenList(rowValues);

                // Echo every 10000th PSM
                if (psmsRead % 10000 == 0)
                {
                    Console.WriteLine(flattenedRow);
                }

                cachedPsms.Add(psmsRead, psm);
            }
        }