Exemplo n.º 1
0
        private void StoreResult(TextWriter writer, DartIdData groupData, string datasetName)
        {
            if (string.IsNullOrWhiteSpace(groupData.DataLine))
            {
                return;
            }

            var outputValues = new List <string>
            {
                datasetName,
                groupData.Peptide,
                groupData.SpecEValue,
                groupData.Charge.ToString(),
                groupData.Proteins.First(),
                string.Join(";", groupData.Proteins.Distinct()),
                groupData.ElutionTime,
                groupData.PeakWidthMinutes
            };

            writer.WriteLine(string.Join("\t", outputValues));
        }
Exemplo n.º 2
0
        public bool ConsolidatePSMs(string psmFilePath, bool multiJobFile)
        {
            try
            {
                var    inputFile      = new FileInfo(psmFilePath);
                var    outputFileName = Path.GetFileNameWithoutExtension(inputFile.Name) + "_ForDartID.txt";
                string outputFilePath;

                if (inputFile.DirectoryName != null)
                {
                    outputFilePath = Path.Combine(inputFile.DirectoryName, outputFileName);
                }
                else
                {
                    outputFilePath = outputFileName;
                }

                var msgfPlusColumns = new SortedDictionary <Enum, int>();
                mScanTimeColIndex         = -1;
                mPeakWidthMinutesColIndex = -1;

                var requiredColumns = new List <MSGFPlusSynFileColumns>
                {
                    MSGFPlusSynFileColumns.Peptide,
                    MSGFPlusSynFileColumns.SpecEValue,
                    MSGFPlusSynFileColumns.Charge,
                    MSGFPlusSynFileColumns.Protein
                };

                string datasetName;
                if (multiJobFile)
                {
                    datasetName = "TBD";
                    throw new NotImplementedException(
                              "ConsolidatePSMs needs to be updated to support an input file where Job or Dataset is the first column");
                }

                // Obtain the dataset name from the filename
                if (psmFilePath.EndsWith(MASICResultsMerger.RESULTS_SUFFIX, StringComparison.OrdinalIgnoreCase))
                {
                    datasetName = Path.GetFileName(psmFilePath.Substring(0, psmFilePath.Length - MASICResultsMerger.RESULTS_SUFFIX.Length));
                }
                else
                {
                    datasetName = Path.GetFileNameWithoutExtension(psmFilePath);
                }

                if (datasetName.EndsWith("_syn", StringComparison.OrdinalIgnoreCase) ||
                    datasetName.EndsWith("_fht", StringComparison.OrdinalIgnoreCase))
                {
                    datasetName = datasetName.Substring(0, datasetName.Length - 4);
                }

                // ReSharper disable StringLiteralTypo
                if (datasetName.EndsWith("_msgfplus", StringComparison.OrdinalIgnoreCase))
                {
                    datasetName = datasetName.Substring(0, datasetName.Length - "_msgfplus".Length);
                }
                else if (datasetName.EndsWith("_msgfdb", StringComparison.OrdinalIgnoreCase))
                {
                    datasetName = datasetName.Substring(0, datasetName.Length - "_msgfdb".Length);
                }

                // ReSharper restore StringLiteralTypo

                var psmGroup = new DartIdData();

                using var reader = new StreamReader(new FileStream(inputFile.FullName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite));
                using var writer = new StreamWriter(new FileStream(outputFilePath, FileMode.Create, FileAccess.Write, FileShare.ReadWrite));

                var headerLine = new List <string>
                {
                    "Dataset",
                    "Peptide",
                    "MSGFDB_SpecEValue",
                    "Charge",
                    "LeadingProtein",
                    "Proteins",
                    "ElutionTime",
                    "PeakWidthMinutes"
                };

                writer.WriteLine(string.Join("\t", headerLine));

                while (!reader.EndOfStream)
                {
                    var dataLine = reader.ReadLine();
                    if (string.IsNullOrWhiteSpace(dataLine))
                    {
                        continue;
                    }

                    if (mScanTimeColIndex < 0)
                    {
                        var success = ParseMergedFileHeaderLine(dataLine, msgfPlusColumns);

                        if (!success)
                        {
                            return(false);
                        }

                        if (mScanTimeColIndex < 0)
                        {
                            OnErrorEvent(string.Format("File {0} is missing column {1} on the header line", inputFile.Name,
                                                       MASICResultsMerger.SCAN_STATS_ELUTION_TIME_COLUMN));
                            return(false);
                        }

                        if (mPeakWidthMinutesColIndex < 0)
                        {
                            OnErrorEvent(string.Format("File {0} is missing column {1} on the header line", inputFile.Name,
                                                       MASICResultsMerger.PEAK_WIDTH_MINUTES_COLUMN));
                            return(false);
                        }

                        // Validate that the required columns exist
                        foreach (var requiredColumn in requiredColumns)
                        {
                            if (ColumnExists(msgfPlusColumns, requiredColumn))
                            {
                                continue;
                            }

                            OnErrorEvent(string.Format("File {0} is missing column {1} on the header line", inputFile.Name, requiredColumn.ToString()));
                            return(false);
                        }

                        continue;
                    }

                    var dataColumns = dataLine.Split('\t');

                    var scanNumber = GetValueInt(dataColumns, msgfPlusColumns, MSGFPlusSynFileColumns.Scan);
                    var charge     = GetValueInt(dataColumns, msgfPlusColumns, MSGFPlusSynFileColumns.Charge);
                    var peptide    = GetValue(dataColumns, msgfPlusColumns, MSGFPlusSynFileColumns.Peptide);
                    var protein    = GetValue(dataColumns, msgfPlusColumns, MSGFPlusSynFileColumns.Protein);

                    if (!PeptideCleavageStateCalculator.SplitPrefixAndSuffixFromSequence(peptide, out var primarySequence, out _, out _))
                    {
                        primarySequence = peptide;
                    }

                    if (scanNumber != psmGroup.ScanNumber ||
                        charge != psmGroup.Charge ||
                        !string.Equals(primarySequence, psmGroup.PrimarySequence))
                    {
                        StoreResult(writer, psmGroup, datasetName);

                        psmGroup = new DartIdData(dataLine, scanNumber, peptide, primarySequence, protein)
                        {
                            SpecEValue       = GetValue(dataColumns, msgfPlusColumns, MSGFPlusSynFileColumns.SpecEValue),
                            Charge           = charge,
                            ElutionTime      = dataColumns[mScanTimeColIndex],
                            PeakWidthMinutes = dataColumns[mPeakWidthMinutesColIndex]
                        };
                    }
                    else
                    {
                        psmGroup.Proteins.Add(protein);
                    }
                }

                StoreResult(writer, psmGroup, datasetName);

                return(true);
            }
            catch (Exception ex)
            {
                OnErrorEvent("Error in ConsolidatePSMs", ex);
                return(false);
            }
        }