public void ReadFileLowMem(string inputFileRelativePath, int expectedNativeIDs, int expectedResults, int expectedPeptides, int expectedProteinSeqs)
        {
            // Reads an mzid file with the low-memory reader, tallies the results, and compares
            // the number of results, distinct native IDs, distinct peptides, and distinct protein
            // accessions against the expected values.

            // When the input file cannot be located, log a message and exit without asserting
            if (!TestPath.FindInputFile(inputFileRelativePath, out var sourceFile))
            {
                Console.WriteLine("File not found: " + inputFileRelativePath);
                return;
            }

            var mzidReader = new SimpleMZIdentMLReader();

            var uniqueNativeIds  = new SortedSet<string>();
            var uniquePeptides   = new SortedSet<string>();
            var uniqueAccessions = new SortedSet<string>();
            var resultsProcessed = 0;

            using (var lowMemData = mzidReader.ReadLowMem(sourceFile.FullName))
            {
                // Materialize the identifications so the total is known for the progress messages
                var identifications = lowMemData.Identifications.ToList();
                var totalResults    = identifications.Count;

                foreach (var identification in identifications)
                {
                    resultsProcessed++;

                    // Add() leaves the set unchanged when the value is already present
                    uniqueNativeIds.Add(identification.NativeId);

                    foreach (var evidence in identification.PepEvidence)
                    {
                        uniquePeptides.Add(evidence.SequenceWithNumericMods);
                        uniqueAccessions.Add(evidence.DbSeq.Accession);
                    }

                    if (resultsProcessed % 1000 != 0)
                    {
                        continue;
                    }

                    // Progress message every 1000 results
                    Console.WriteLine("{0,6:N0} / {1,6:N0}", resultsProcessed, totalResults);
                }
            }

            // Summarize the counts
            Console.WriteLine();
            Console.WriteLine("Spectrum Identification Results: {0,6:N0}", resultsProcessed);
            Console.WriteLine("Native IDs: {0,6:N0}", uniqueNativeIds.Count);
            Console.WriteLine("Unique Peptides: {0,6:N0}", uniquePeptides.Count);
            Console.WriteLine("Unique Protein Sequences: {0,6:N0}", uniqueAccessions.Count);

            Assert.AreEqual(expectedResults, resultsProcessed, "Spectrum Identification Results");
            Assert.AreEqual(expectedNativeIDs, uniqueNativeIds.Count, "Native IDs");
            Assert.AreEqual(expectedPeptides, uniquePeptides.Count, "Unique Peptides");
            Assert.AreEqual(expectedProteinSeqs, uniqueAccessions.Count, "Unique Protein Sequences");
        }
Exemplo n.º 2
0
        public void ReadFile(string path, int expectedNativeIDs, int expectedResults, int expectedPeptides, int expectedProteinSeqs)
        {
            // Reads an mzid file fully into memory, tallies the results, and compares the number
            // of results, distinct native IDs, distinct peptides, and distinct protein accessions
            // against the expected values.

            // The path is relative to the external test data directory
            var inputFile = new FileInfo(Path.Combine(TestPath.ExtTestDataDirectory, path));

            // When the input file is missing, log a message and exit without asserting
            if (!inputFile.Exists)
            {
                Console.WriteLine("File not found: " + inputFile.FullName);
                return;
            }

            var mzidReader = new SimpleMZIdentMLReader();

            var uniqueNativeIds  = new SortedSet<string>();
            var uniquePeptides   = new SortedSet<string>();
            var uniqueAccessions = new SortedSet<string>();

            var mzidData         = mzidReader.Read(inputFile.FullName);
            var resultsProcessed = 0;
            var totalResults     = mzidData.Identifications.Count;

            foreach (var identification in mzidData.Identifications)
            {
                resultsProcessed++;

                // Add() leaves the set unchanged when the value is already present
                uniqueNativeIds.Add(identification.NativeId);

                foreach (var evidence in identification.PepEvidence)
                {
                    uniquePeptides.Add(evidence.SequenceWithNumericMods);
                    uniqueAccessions.Add(evidence.DbSeq.Accession);
                }

                if (resultsProcessed % 1000 != 0)
                {
                    continue;
                }

                // Progress message every 1000 results
                Console.WriteLine("{0,6:N0} / {1,6:N0}", resultsProcessed, totalResults);
            }

            // Summarize the counts
            Console.WriteLine();
            Console.WriteLine("Spectrum Identification Results: {0,6:N0}", resultsProcessed);
            Console.WriteLine("Native IDs: {0,6:N0}", uniqueNativeIds.Count);
            Console.WriteLine("Unique Peptides: {0,6:N0}", uniquePeptides.Count);
            Console.WriteLine("Unique Protein Sequences: {0,6:N0}", uniqueAccessions.Count);

            Assert.AreEqual(expectedResults, resultsProcessed, "Spectrum Identification Results");
            Assert.AreEqual(expectedNativeIDs, uniqueNativeIds.Count, "Native IDs");
            Assert.AreEqual(expectedPeptides, uniquePeptides.Count, "Unique Peptides");
            Assert.AreEqual(expectedProteinSeqs, uniqueAccessions.Count, "Unique Protein Sequences");
        }
Exemplo n.º 3
0
        public void TestFileReading()
        {
            // NOTE(review): this path is specific to one developer's machine; the test
            // can only run where this file exists
            const string mzidFilePath = @"C:\Users\wilk011\Documents\mspf_test\er\2016-12-27_Ecoli_Ribosome_1.mzid";

            var identData = new SimpleMZIdentMLReader().Read(mzidFilePath);

            foreach (var identification in identData.Identifications)
            {
                // The sequence is not inspected; there are no assertions, so the test only
                // verifies that GetIpSequence() completes for each identification
                var ipSequence = identification.Peptide.GetIpSequence();
            }
        }
Exemplo n.º 4
0
        /// <summary>
        /// Read an mzIdentML file and collect the PSMs accepted by
        /// ProcessSpectrumIdentificationResult, loosening the filter thresholds
        /// (via AdjustThreshold) whenever fewer than 500 PSMs are collected
        /// </summary>
        /// <remarks>
        /// Sets HaveScanTimes to false and sets IdentProg based on the analysis software
        /// name reported by the file
        /// </remarks>
        /// <param name="path">Path to *.mzid/mzIdentML file</param>
        /// <returns>List of IdentData, one entry per PSM collected on the final filtering pass</returns>
        public List <IdentData> Read(string path)
        {
            // Load the entire file
            var mzidContents = new SimpleMZIdentMLReader().Read(path);

            HaveScanTimes = false;

            // Map the analysis software name to the identification program type
            switch (mzidContents.AnalysisSoftware)
            {
                case "MyriMatch":
                    IdentProg = IdentProgramType.MyriMatch;
                    break;

                case "MS-GF+":
                    IdentProg = IdentProgramType.MSGFPlus;
                    break;

                default:
                    IdentProg = IdentProgramType.Unset;
                    break;
            }

            var filteredPsms = new List <IdentData>();
            var keepTrying   = true;

            while (keepTrying)
            {
                // Start each pass with an empty list
                filteredPsms.Clear();

                // Filter and process the data
                foreach (var identification in mzidContents.Identifications)
                {
                    ProcessSpectrumIdentificationResult(identification, filteredPsms);
                }

                if (filteredPsms.Count >= 500)
                {
                    OnStatusEvent("  {0:N0} PSMs passed the filters", filteredPsms.Count);
                    keepTrying = false;
                }
                else
                {
                    OnStatusEvent("  Fewer than 500 PSMs passed the filters ({0})", filteredPsms.Count);

                    // Loosen the filters and try again; AdjustThreshold returns false
                    // when the thresholds cannot be loosened any further
                    if (AdjustThreshold())
                    {
                        OnStatusEvent("  Loosening thresholds and trying again");
                    }
                    else
                    {
                        if (filteredPsms.Count == 0)
                        {
                            OnWarningEvent("  No PSMs passed the filters");
                        }
                        else
                        {
                            OnStatusEvent("  Plotting errors using these PSMs");
                        }

                        keepTrying = false;
                    }
                }
            }

            return filteredPsms;
        }
Exemplo n.º 5
0
        // A few other possibilities: €£¥§
        public const string DEFAULT_MODIFICATION_SYMBOLS = "*#@$&!%~†‡¤º^`×÷+=ø¢";

        // Loads the identifications from the given .mzid file
        // (a 'Job' number is not available when reading directly from a file)
        public MsgfMzid(string mzidFileName) : base(mzidFileName, false)
        {
            var mzidContents = new SimpleMZIdentMLReader().Read(mzidFileName);

            data = mzidContents.Identifications;

            AssignSymbolsToMods(mzidContents.SearchModifications);

            // maxSteps is normally set using DataTable information in the base constructor;
            // here it is the number of identifications loaded from the file
            maxSteps = data.Count;
        }
Exemplo n.º 6
0
        /// <summary>
        /// Read the identification results in the file at path <paramref name="filePath"/>,
        /// choosing the reader based on the file name suffix.
        /// </summary>
        /// <param name="filePath">
        /// Path to a results file whose name ends with .mzid, .mzid.gz, or _IcTda.tsv
        /// (the suffix comparison ignores case)
        /// </param>
        /// <returns>The parsed results, or null if the file name does not end with a supported suffix</returns>
        public static SimpleMZIdentMLReader.SimpleMZIdentMLData ReadResultFile(string filePath)
        {
            // Use ordinal, case-insensitive suffix matching. Lower-casing the path with the current
            // culture and then comparing can fail to match under some cultures
            // (e.g. "I" lower-cases to a dotless i under Turkish culture rules)
            const System.StringComparison suffixComparison = System.StringComparison.OrdinalIgnoreCase;

            if (filePath.EndsWith(".mzid", suffixComparison) || filePath.EndsWith(".mzid.gz", suffixComparison))
            {
                var mzidReader = new SimpleMZIdentMLReader();
                return mzidReader.Read(filePath);
            }

            if (filePath.EndsWith("_ictda.tsv", suffixComparison))
            {
                return DatabaseSearchResultData.ReadResultsFromFileToMzIdData(filePath);
            }

            // Unrecognized file type
            return null;
        }
Exemplo n.º 7
0
        private Dictionary <int, SimpleMZIdentMLReader.SpectrumIdItem> ParseMzid(string mzid)
        {
            // Reads the given .mzid file and returns, for each scan number, the identification
            // with the lowest SpecEv among those that pass the filters below
            var identifications = new SimpleMZIdentMLReader().Read(mzid).Identifications;

            return identifications
                   // Keep identifications with QValue <= 1e-5 and SpecEv <= 1e-10 that have
                   // at least one non-decoy peptide evidence entry and no modifications
                   .Where(id => id.QValue <= 1e-5 &&
                                id.SpecEv <= 1e-10 &&
                                id.PepEvidence.Any(pepEv => !pepEv.IsDecoy) &&
                                id.Peptide.Mods.Count == 0)
                   .GroupBy(id => id.ScanNum)
                   // Within each scan, keep the identification with the lowest SpecEv
                   .ToDictionary(
                       scanGroup => scanGroup.Key,
                       scanGroup => scanGroup.OrderBy(sp => sp.SpecEv).First());
        }
Exemplo n.º 8
0
 /// <summary>
 /// Initializes a new instance of the <see cref="MzIdentMlReader"/> class:
 /// stores the file path, creates the underlying <see cref="SimpleMZIdentMLReader"/>,
 /// and starts with an empty modification list.
 /// </summary>
 /// <param name="filePath">The path for the MZID file.</param>
 public MzIdentMlReader(string filePath)
 {
     Modifications   = new List <Modification>();
     mzIdentMlReader = new SimpleMZIdentMLReader();

     // The file is not opened here; only the path is stored
     this.filePath = filePath;
 }
        /// <summary>
        /// Convert the given .mzid file to a .tsv file
        /// </summary>
        /// <remarks>
        /// Errors are reported to the console and result in a return value of false; exceptions are not propagated.
        /// A file with no results, or with no results passing the filters, is still considered a success.
        /// </remarks>
        /// <param name="mzidPath">.mzid file to read (supports .mzid.gz)</param>
        /// <param name="tsvPath">.tsv file to create (cannot be an empty string)</param>
        /// <param name="options">Processing options</param>
        /// <returns>True if successful, false if an error</returns>
        public bool ConvertToTsv(
            string mzidPath,
            string tsvPath,
            ConverterOptions options)
        {
            var filterOnSpecEValue = ConverterOptions.FilterEnabled(options.MaxSpecEValue);
            var filterOnEValue     = options.MaxEValue > 0;
            var filterOnQValue     = ConverterOptions.FilterEnabled(options.MaxQValue);

            if (string.IsNullOrWhiteSpace(tsvPath))
            {
                ConsoleMsgUtils.ShowWarning("The target .tsv file path must be defined when calling ConvertToTsv with file paths");
                Thread.Sleep(1500);
                return false;
            }

            var tsvFile = new FileInfo(tsvPath);

            if (tsvFile.Exists)
            {
                ConsoleMsgUtils.ShowWarning("Overwriting existing file: " + PathUtils.CompactPathString(tsvFile.FullName, 90));
                Console.WriteLine();
            }
            else
            {
                ConsoleMsgUtils.ShowWarning("Creating: " + PathUtils.CompactPathString(tsvFile.FullName, 115));
            }

            // Number of identifications for which at least one row was written
            var writtenCount = 0;

            // DelimitedProteinNames takes precedence over UnrollResults
            // However, behavior below needs to be the same for UnrollResults and DelimitedProteinNames
            var maxMatchedProteins = 1;

            if (options.UnrollResults || options.DelimitedProteinNames)
            {
                maxMatchedProteins = int.MaxValue;
            }

            var reader = new SimpleMZIdentMLReader(options.SkipDuplicateIds, s => Console.WriteLine("MZID PARSE ERROR: {0}", s));

            try
            {
                var configuration = new CsvConfiguration(CultureInfo.CurrentCulture)
                {
                    AllowComments = false,
                    Delimiter     = "\t"
                };

                using var data   = reader.ReadLowMem(mzidPath);
                using var writer = new StreamWriter(new FileStream(tsvFile.FullName, FileMode.Create, FileAccess.Write, FileShare.ReadWrite));
                using var csv    = new CsvWriter(writer, configuration);

                csv.Context.RegisterClassMap(new PeptideMatchMap(options.NoExtendedFields, options.AddGeneId));

                // SPECIAL CASE:
                // Certain versions of MS-GF+ output incorrect mzid files - the peptides referenced in the peptide_ref attribute in
                // SpectrumIdentificationItems was correct, but if there was a modification in the first 3 residues there was at
                // least a 50% chance of the PeptideEvidenceRefs within the SpectrumIdentificationItem being incorrect. So, for
                // those bad versions, use the peptide_ref rather than the PeptideEvidenceRefs to get the sequence.
                var isBadMsGfMzid = false;

                if (data.AnalysisSoftwareCvAccession.IndexOf("MS:1002048", StringComparison.OrdinalIgnoreCase) >= 0 &&
                    !string.IsNullOrWhiteSpace(data.AnalysisSoftwareVersion))
                {
                    // bad versions: v10280 (introduced), v10282, v2016.01.20, v2016.01.21, v2016.01.29, v2016.02.12, v2016.05.25, v2016.0.13, v2016.06.13, v2016.06.14, v2016.06.15, v2016.06.29, v2016.07.26, v2016.08.31, v2016.09.07, v2016.09.22, v2016.09.23 (fixed with version v2016.10.10)
                    var badVersions = new[]
                    {
                        "v10280", "v10282",
                        "v2016.01.20", "v2016.01.21", "v2016.01.29", "v2016.02.12", "v2016.05.25", "v2016.0.13",
                        "v2016.06.13", "v2016.06.14", "v2016.06.15", "v2016.06.29", "v2016.07.26",
                        "v2016.08.31", "v2016.09.07", "v2016.09.22", "v2016.09.23"
                    };

                    foreach (var version in badVersions)
                    {
                        if (data.AnalysisSoftwareVersion.Contains(version))
                        {
                            isBadMsGfMzid = true;

                            // One match is sufficient; no need to check the remaining versions
                            break;
                        }
                    }
                }

                if (isBadMsGfMzid)
                {
                    ConsoleMsgUtils.ShowWarning(
                        "Warning: file \"{0}\" was created with a version of MS-GF+ that had some erroneous output in the mzid file." +
                        " Using sequences from the peptide_ref attribute instead of the PeptideEvidenceRef element to try to bypass the issue.",
                        mzidPath);
                }

                csv.WriteHeader<PeptideMatch>();
                csv.NextRecord();

                var lastScanNum = 0;

                // Number of items in data.Identifications that were examined
                // (excludes items skipped by the SingleResultPerSpectrum check)
                // Incremented during the for each loop
                var unfilteredCount = 0;

                // Number of identifications that did not pass the score filters
                var filteredOutCount = 0;

                // List of matches in a single result. List is cleared before use.
                // Only contains multiple when outputting all protein matches, and a result has multiple protein matches.
                var matches = new List<PeptideMatch>(30);

                foreach (var id in data.Identifications)
                {
                    // When only one result per spectrum is requested, skip any identification
                    // that has the same scan number as the previously examined identification
                    if (options.SingleResultPerSpectrum && id.ScanNum == lastScanNum)
                    {
                        continue;
                    }

                    unfilteredCount++;

                    lastScanNum = id.ScanNum;

                    if (filterOnSpecEValue && id.SpecEv > options.MaxSpecEValue)
                    {
                        filteredOutCount++;
                        continue;
                    }

                    if (filterOnEValue && id.EValue > options.MaxEValue)
                    {
                        filteredOutCount++;
                        continue;
                    }

                    if (filterOnQValue && id.QValue > options.MaxQValue)
                    {
                        filteredOutCount++;
                        continue;
                    }

                    // Clear out the list of matches.
                    matches.Clear();
                    var uniquePepProteinList = new HashSet<string>();

                    // id.PepEvidence has one entry for each protein associated with this PSM
                    IEnumerable<SimpleMZIdentMLReader.PeptideEvidence> pepEvEnum = id.PepEvidence;
                    if (!options.ShowDecoy)
                    {
                        pepEvEnum = pepEvEnum.Where(x => !x.IsDecoy);
                    }

                    // maxMatchedProteins is '1' or 'int.MaxValue'
                    foreach (var pepEv in pepEvEnum.Take(maxMatchedProteins))
                    {
                        var peptide = pepEv.SequenceWithNumericMods;

                        // Produce correct output with bad MS-GF+ mzid
                        if (isBadMsGfMzid)
                        {
                            // Add the prefix and suffix residues for this protein
                            // Do not use pepEv.SequenceWithNumericMods; it isn't necessarily correct for this spectrum
                            peptide = pepEv.Pre + "." + id.Peptide.SequenceWithNumericMods + "." + pepEv.Post;
                        }

                        var protein = pepEv.DbSeq.Accession;

                        if (!uniquePepProteinList.Add(peptide + protein))
                        {
                            // Don't process the check for the gene ID if it's not a unique match
                            continue;
                        }

                        var geneId = string.Empty;
                        if (options.AddGeneId && !pepEv.IsDecoy)
                        {
                            // Note that .ProteinDescription includes both the Protein Name and the Description
                            var success = TryGetGeneId(options.GeneIdRegex, pepEv.DbSeq.ProteinDescription, out geneId);
                            if (!success)
                            {
                                geneId = string.Empty;
                            }
                        }

                        matches.Add(new PeptideMatch
                        {
                            SpecFile       = data.SpectrumFile,
                            Identification = id,
                            Peptide        = peptide,
                            Protein        = protein,
                            GeneId         = geneId,
                        });
                    }

                    if (matches.Count == 0)
                    {
                        continue;
                    }

                    if (options.DelimitedProteinNames && matches.Count > 1)
                    {
                        CombineProteinNames(options, matches);

                        // The first item in matches already lists all of the protein names; remove all remaining matches.
                        matches.RemoveRange(1, matches.Count - 1);
                    }

                    foreach (var item in matches)
                    {
                        csv.WriteRecord(item);
                        csv.NextRecord();
                    }

                    writtenCount++;
                }

                if (unfilteredCount == 0)
                {
                    ConsoleMsgUtils.ShowWarning("Warning: .mzid file does not have any results");
                    Thread.Sleep(1500);
                }
                else if (writtenCount == 0)
                {
                    ConsoleMsgUtils.ShowWarning("Warning: none of the results passed the specified filter(s)");
                    Thread.Sleep(1500);
                }
                else
                {
                    Console.WriteLine("Wrote {0:N0} results to {1}", writtenCount, PathUtils.CompactPathString(tsvFile.FullName, 70));
                    if (filteredOutCount > 0)
                    {
                        Console.WriteLine("Filtered out {0:N0} results", filteredOutCount);
                    }
                }

                return true;
            }
            catch (SimpleMZIdentMLReader.DuplicateKeyException ex)
            {
                ConsoleMsgUtils.ShowError("MZID PARSE ERROR", ex);
                ConsoleMsgUtils.ShowWarning("This type of error is usually caused by an error in the MZID output.");
                return false;
            }
            catch (Exception ex)
            {
                // The message previously lacked the closing parenthesis
                ConsoleMsgUtils.ShowError(
                    string.Format("Error converting the file (so far, {0:N0} results have been written)", writtenCount), ex);
                return false;
            }
        }
Exemplo n.º 10
0
        public void ConvertToTsv(string mzidPath, string tsvPath, bool showDecoy = true, bool unrollResults = true, bool singleResult = false)
        {
            // Reads the .mzid file at mzidPath and writes a tab-delimited file to tsvPath.
            //   showDecoy:     when false, peptide evidence flagged as decoy is skipped
            //   unrollResults: when false, only the first row is written for each identification
            //   singleResult:  when true, an identification is skipped if it has the same scan number as the previous one
            var reader = new SimpleMZIdentMLReader();
            var data   = reader.Read(mzidPath);

            var headers = new List<string> {
                "#SpecFile", "SpecID", "ScanNum", "FragMethod",
                "Precursor", "IsotopeError", "PrecursorError(ppm)", "Charge",
                "Peptide", "Protein", "DeNovoScore", "MSGFScore",
                "SpecEValue", "EValue", "QValue", "PepQValue"
            };

            // SPECIAL CASE:
            // Certain versions of MS-GF+ output incorrect mzid files - the peptides referenced in the peptide_ref attribute in
            // SpectrumIdentificationItems was correct, but if there was a modification in the first 3 residues there was at
            // least a 50% chance of the PeptideEvidenceRefs within the SpectrumIdentificationItem being incorrect. So, for
            // those bad versions, use the peptide_ref rather than the PeptideEvidenceRefs to get the sequence.
            var isBadMsGfMzid = false;

            // Ordinal, case-insensitive search; avoids the culture-sensitive ToUpper() conversion
            if (data.AnalysisSoftwareCvAccession.IndexOf("MS:1002048", System.StringComparison.OrdinalIgnoreCase) >= 0 &&
                !string.IsNullOrWhiteSpace(data.AnalysisSoftwareVersion))
            {
                // bad versions: v10280 (introduced), v10282, v2016.01.20, v2016.01.21, v2016.01.29, v2016.02.12, v2016.05.25, v2016.0.13, v2016.06.13, v2016.06.14, v2016.06.15, v2016.06.29, v2016.07.26, v2016.08.31, v2016.09.07, v2016.09.22, v2016.09.23 (fixed with version v2016.10.10)
                var badVersions = new string[]
                {
                    "v10280", "v10282", "v2016.01.20", "v2016.01.21", "v2016.01.29", "v2016.02.12", "v2016.05.25", "v2016.0.13", "v2016.06.13", "v2016.06.14",
                    "v2016.06.15", "v2016.06.29", "v2016.07.26", "v2016.08.31", "v2016.09.07", "v2016.09.22", "v2016.09.23"
                };

                foreach (var version in badVersions)
                {
                    if (data.AnalysisSoftwareVersion.Contains(version))
                    {
                        isBadMsGfMzid = true;

                        // One match is sufficient; no need to check the remaining versions
                        break;
                    }
                }
            }

            if (isBadMsGfMzid)
            {
                ShowWarning(string.Format("Warning: file \"{0}\" was created with a version of MS-GF+ that had some erroneous output in the mzid file." +
                                          " Using sequences from the peptide_ref attribute instead of the PeptideEvidenceRef element to try to bypass the issue.", mzidPath));
            }

            using (var stream = new StreamWriter(new FileStream(tsvPath, FileMode.Create, FileAccess.Write, FileShare.ReadWrite)))
            {
                // The header line is written even when the file has no results
                stream.WriteLine(string.Join("\t", headers));

                if (data.Identifications.Count == 0)
                {
                    ShowWarning("Warning: .mzID file does not have any results");
                    System.Threading.Thread.Sleep(1500);
                    return;
                }

                var lastScanNum = 0;
                foreach (var id in data.Identifications)
                {
                    if (singleResult && id.ScanNum == lastScanNum)
                    {
                        continue;
                    }

                    lastScanNum = id.ScanNum;

                    var specFile = data.SpectrumFile;
                    var specId   = id.NativeId;
                    var scanNum  = id.ScanNum;

                    // Use "CID" when the file does not list the dissociation method
                    string fragMethod;
                    if (!id.AllParamsDict.TryGetValue("AssumedDissociationMethod", out fragMethod))
                    {
                        fragMethod = "CID";
                    }

                    var precursor = id.ExperimentalMz;

                    // Use "0" when the file does not list the isotope error
                    string isotopeError;
                    if (!id.AllParamsDict.TryGetValue("IsotopeError", out isotopeError))
                    {
                        isotopeError = "0";
                    }

                    // The isotope error comes from the data file, so parse it with the invariant culture
                    // Adjust the experimental m/z for the isotope error before computing the ppm error
                    var adjExpMz       = id.ExperimentalMz - IsotopeMass * int.Parse(isotopeError, CultureInfo.InvariantCulture) / id.Charge;
                    var precursorError = (adjExpMz - id.CalMz) / id.CalMz * 1e6;

                    var charge      = id.Charge;
                    var deNovoScore = id.DeNovoScore;
                    var msgfScore   = id.RawScore;
                    var specEValue  = id.SpecEv;
                    var eValue      = id.EValue;
                    var qValue      = id.QValue;
                    var pepQValue   = id.PepQValue;

                    // Tracks peptide + protein combinations already written for this identification
                    var dedup = new HashSet<string>();
                    foreach (var pepEv in id.PepEvidence)
                    {
                        if (!showDecoy && pepEv.IsDecoy)
                        {
                            continue;
                        }

                        var peptideWithModsAndContext = pepEv.SequenceWithNumericMods;

                        // Produce correct output with bad MS-GF+ mzid
                        if (isBadMsGfMzid)
                        {
                            // Add the prefix and suffix residues for this protein
                            // Do not use pepEv.SequenceWithNumericMods; it isn't necessarily correct for this spectrum
                            peptideWithModsAndContext = pepEv.Pre + "." + id.Peptide.SequenceWithNumericMods + "." + pepEv.Post;
                        }

                        var protein = pepEv.DbSeq.Accession;
                        if (!dedup.Add(peptideWithModsAndContext + protein))
                        {
                            continue;
                        }

                        var specEValueString = StringUtilities.ValueToString(specEValue, 5, 0.001);
                        var eValueString     = StringUtilities.ValueToString(eValue, 5, 0.001);

                        var line = string.Format(CultureInfo.InvariantCulture,
                                                 "{0}\t{1}\t{2}\t{3}\t{4:0.0####}\t{5}\t{6:0.0###}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14:0.0####}\t{15:0.0####}",
                                                 specFile, specId, scanNum, fragMethod, precursor, isotopeError, precursorError, charge, peptideWithModsAndContext, protein,
                                                 deNovoScore, msgfScore, specEValueString, eValueString, qValue, pepQValue);
                        stream.WriteLine(line);

                        if (!unrollResults)
                        {
                            // Only the first written row is kept for each identification
                            break;
                        }
                    }
                }
            }
        }
Exemplo n.º 11
0
        /// <summary>
        /// Process spectra and identifications.
        /// </summary>
        /// <remarks>
        /// When cancellation is requested, the remaining spectra are skipped and the results
        /// collected so far are returned.
        /// </remarks>
        /// <param name="rawFilePath">Full file path to raw file.</param>
        /// <param name="idFilePath">Full file path to identification file.</param>
        /// <param name="cancellationToken">For notification of cancellation.</param>
        /// <param name="progress">Progress reporter.</param>
        /// <returns>List of processed IDs, sorted by SpecEValue.</returns>
        /// <exception cref="ArgumentException">
        /// The spectrum file name listed in the identification file does not match the raw file name.
        /// </exception>
        public List <ProcessedResult> Process(string rawFilePath, string idFilePath, CancellationToken cancellationToken, IProgress <ProgressData> progress = null)
        {
            // Set up progress reporter
            progress = progress ?? new Progress <ProgressData>();

            var progressData = new ProgressData(progress);

            // Show initial loading message
            progressData.Report(0.1, "Loading...");

            // Read mzid file
            var mzidReader      = new SimpleMZIdentMLReader();
            var identifications = mzidReader.Read(idFilePath, cancellationToken);

            // Check to make sure raw and MZID file match.
            var rawFileName = Path.GetFileNameWithoutExtension(rawFilePath);

            var spectrumFileFromId = Path.GetFileNameWithoutExtension(identifications.SpectrumFile);

            // Remove "_dta" and anything after it from the spectrum file name
            // Use an ordinal search, and tolerate a missing spectrum file name
            // so that the mismatch is reported below instead of a NullReferenceException
            var dtaIndex = spectrumFileFromId == null
                               ? -1
                               : spectrumFileFromId.LastIndexOf("_dta", StringComparison.Ordinal);

            if (dtaIndex >= 0)
            {
                spectrumFileFromId = spectrumFileFromId.Substring(0, dtaIndex);
            }

            if (rawFileName != spectrumFileFromId)
            {
                throw new ArgumentException($"Mismatch between spectrum file ({rawFileName}) and id file ({spectrumFileFromId}).");
            }

            // Group IDs into a hash by scan number
            var idMap = identifications.Identifications.GroupBy(id => id.ScanNum).ToDictionary(scan => scan.Key, ids => ids);

            var processedResults = new ConcurrentBag <ProcessedResult>();

            // Load raw file
            using (var lcms = MassSpecDataReaderFactory.GetMassSpecDataReader(rawFilePath))
            {
                int count = 0;
                Parallel.ForEach(
                    lcms.ReadAllSpectra(),
                    spectrum =>
                    {
                        if (cancellationToken.IsCancellationRequested)
                        {
                            // Cancel if necessary
                            return;
                        }

                        // Atomically claim a unique, zero-based index for this spectrum. Reading the shared
                        // counter separately from the increment lets several threads see the same value,
                        // so a progress step could be reported more than once or skipped entirely
                        var spectrumIndex = Interlocked.Increment(ref count) - 1;

                        // Report completion percentage (roughly once for every 1% of the spectra)
                        if (spectrumIndex % (int)Math.Max(0.01 * lcms.NumSpectra, 1) == 0)
                        {
                            progressData.Report(spectrumIndex, lcms.NumSpectra, $"{Math.Round(100.0*spectrumIndex / lcms.NumSpectra)}%");
                        }

                        // Skip spectrum if it isn't MS2, or if it has no identifications
                        var productSpectrum = spectrum as ProductSpectrum;
                        if (productSpectrum == null || !idMap.ContainsKey(spectrum.ScanNum))
                        {
                            return;
                        }

                        var specResults = idMap[spectrum.ScanNum];

                        // Compute the sequence coverage for each identification of this scan
                        var results = from specResult in specResults
                                      let sequence = specResult.Peptide.GetIpSequence()
                                      let coverage = this.CalculateSequenceCoverage(productSpectrum, sequence, specResult.Charge)
                                      select new ProcessedResult
                                      {
                                          ScanNum          = spectrum.ScanNum,
                                          Sequence         = sequence,
                                          Charge           = specResult.Charge,
                                          PrecursorMz      = specResult.CalMz,
                                          DeNovoScore      = specResult.DeNovoScore,
                                          SpecEValue       = specResult.SpecEv,
                                          EValue           = specResult.EValue,
                                          QValue           = specResult.QValue,
                                          PepQValue        = specResult.PepQValue,
                                          FragMethod       = productSpectrum.ActivationMethod,
                                          IsotopeError     = specResult.IsoError,
                                          SequenceCoverage = Math.Round(coverage),
                                      };

                        foreach (var result in results)
                        {
                            processedResults.Add(result);
                        }
                    });
            }

            // Sort spectra by SpecEValue
            return processedResults.OrderBy(pr => pr.SpecEValue).ToList();
        }