/// <summary>
/// Combine the distinct protein names of all matches into a single delimited list,
/// and store that list in the Protein property of the first match
/// </summary>
/// <remarks>
/// Names are sorted (SortedSet default ordering) and joined with options.ProteinNameDelimiter.
/// Once the accumulated length exceeds MAX_LIST_LENGTH no more names are added;
/// if at least one distinct name had to be dropped, the delimiter plus "..." is appended.
/// </remarks>
/// <param name="options">Processing options (supplies ProteinNameDelimiter)</param>
/// <param name="matches">Matches for a single result; the first item is updated in place</param>
private static void CombineProteinNames(ConverterOptions options, IReadOnlyList<PeptideMatch> matches)
{
    const int MAX_LIST_LENGTH = 100000;

    // Nothing to combine (and no first item to store the result in)
    if (matches.Count == 0)
    {
        return;
    }

    var proteinNames = new SortedSet<string>();
    var proteinNameLength = 0;
    var proteinDelimiterLength = options.ProteinNameDelimiter.Length;

    // Only true when a distinct protein name was actually left out of the list
    var namesOmitted = false;

    foreach (var item in matches)
    {
        if (proteinNameLength > MAX_LIST_LENGTH)
        {
            // The limit has been reached: stop at the first name that is not already in the list.
            // Duplicates of names already stored do not count as omitted names.
            if (!proteinNames.Contains(item.Protein))
            {
                namesOmitted = true;
                break;
            }

            continue;
        }

        // Add() returns false for a duplicate, so a separate Contains() lookup is not needed
        if (proteinNames.Add(item.Protein))
        {
            proteinNameLength += item.Protein.Length + proteinDelimiterLength;
        }
    }

    // Store the list of protein names in the first item in matches
    var combinedNames = string.Join(options.ProteinNameDelimiter, proteinNames);

    matches[0].Protein = namesOmitted
        ? combinedNames + options.ProteinNameDelimiter + "..."
        : combinedNames;
}
/// <summary>
/// Program entry point: parse and validate the command line arguments, then run the conversion
/// </summary>
/// <param name="args">Command line arguments</param>
/// <returns>
/// 0 after the conversion call completes; -1 if the arguments could not be parsed or validated.
/// In non-DEBUG builds an exception during conversion is caught and the hash code of its message
/// is returned (or -1 if that hash code is 0).
/// </returns>
static int Main(string[] args)
{
    var options = new ConverterOptions();

    // The assembly name supplies both the program name and the version shown by the parser
    var assemblyInfo = typeof(Program).GetTypeInfo().Assembly.GetName();
    var assemblyVersion = assemblyInfo.Version;
    var version = string.Format("version {0}.{1}.{2}", assemblyVersion.Major, assemblyVersion.Minor, assemblyVersion.Build);

    var argsParsed = CommandLineParser<ConverterOptions>.ParseArgs(args, options, assemblyInfo.Name, version);

    // ValidateArgs is only called when parsing succeeded
    if (!(argsParsed && options.ValidateArgs()))
    {
        // Pause briefly before exiting
        System.Threading.Thread.Sleep(1500);
        return -1;
    }

    options.OutputSetOptions();

    // In DEBUG builds exceptions are left unhandled
#if !DEBUG
    try
    {
#endif
        new MzidToTsvConverter().ConvertToTsv(options);

        Console.WriteLine("Conversion finished!");
        System.Threading.Thread.Sleep(700);

        return 0;
#if !DEBUG
    }
    catch (Exception ex)
    {
        Console.WriteLine("Conversion failed: " + ex.Message);
        Console.WriteLine(ex.StackTrace);

        System.Threading.Thread.Sleep(1500);

        // Use the message hash as the exit code, but never report 0 (success) for a failure
        var exitCode = ex.Message.GetHashCode();
        return exitCode == 0 ? -1 : exitCode;
    }
#endif
}
// Ignore Spelling: mzid, tsv

/// <summary>
/// Convert the .mzid file(s) specified by options.MzidPath
/// </summary>
/// <remarks>
/// Three input modes are checked in this order:
/// a path spec with a wildcard, a directory (using the file list in options.MzidPaths),
/// and finally a single .mzid file written to options.TsvPath
/// </remarks>
/// <param name="options">Processing options</param>
/// <returns>True if successful for at least one .mzid file, false if an error</returns>
public bool ConvertToTsv(ConverterOptions options)
{
    if (ConverterOptions.HasWildcard(options.MzidPath))
    {
        // Find matching files
        var matchingFiles = PathUtils.FindFilesWildcard(options.MzidPath);

        if (matchingFiles.Count == 0)
        {
            ConsoleMsgUtils.ShowWarning("No mzid files were found with path spec " + options.MzidPath);
            return false;
        }

        var wildcardSuccessCount = 0;

        foreach (var inputFile in matchingFiles)
        {
            Console.WriteLine();
            Console.WriteLine("Converting " + inputFile.FullName);

            // Output path: auto-named if TsvPath has a wildcard, placed inside TsvPath if it is
            // an existing directory, otherwise TsvPath is used as-is
            var outputPath = ConverterOptions.HasWildcard(options.TsvPath)
                ? options.AutoNameTsvFromMzid(inputFile.FullName)
                : Directory.Exists(options.TsvPath)
                    ? Path.Combine(options.TsvPath, options.AutoNameTsvFromMzid(inputFile.Name))
                    : options.TsvPath;

            if (ConvertToTsv(inputFile.FullName, outputPath, options))
            {
                wildcardSuccessCount++;
            }
        }

        return wildcardSuccessCount > 0;
    }

    if (!options.IsDirectory)
    {
        // Single input file
        return ConvertToTsv(options.MzidPath, options.TsvPath, options);
    }

    if (options.MzidPaths.Count == 0)
    {
        string subDirsMessage;
        if (options.RecurseDirectories)
        {
            subDirsMessage = " or subdirectories";
        }
        else
        {
            subDirsMessage = string.Empty;
        }

        ConsoleMsgUtils.ShowWarning($"No mzid[.gz] files found in directory \"{options.MzidPath}\"{subDirsMessage}.");
        return false;
    }

    var directorySuccessCount = 0;

    foreach (var inputPath in options.MzidPaths)
    {
        // Each file in the directory gets an auto-named .tsv file
        var outputPath = options.AutoNameTsvFromMzid(inputPath);

        if (ConvertToTsv(inputPath, outputPath, options))
        {
            directorySuccessCount++;
        }
    }

    return directorySuccessCount > 0;
}
/// <summary>
/// Convert the given .mzid file to a .tsv file
/// </summary>
/// <remarks>
/// Identifications are optionally limited to one per scan and filtered on SpecEValue, EValue and QValue.
/// Each remaining identification is written as one row per matched protein, or as a single row
/// when only one protein is kept or when protein names are combined into a delimited list.
/// A file with no results, or where no results pass the filters, produces a warning but still returns true.
/// </remarks>
/// <param name="mzidPath">.mzid file to read (supports .mzid.gz)</param>
/// <param name="tsvPath">.tsv file to create (cannot be an empty string)</param>
/// <param name="options">Processing options</param>
/// <returns>True if successful, false if an error</returns>
public bool ConvertToTsv(
    string mzidPath,
    string tsvPath,
    ConverterOptions options)
{
    var filterOnSpecEValue = ConverterOptions.FilterEnabled(options.MaxSpecEValue);
    var filterOnEValue = options.MaxEValue > 0;
    var filterOnQValue = ConverterOptions.FilterEnabled(options.MaxQValue);

    if (string.IsNullOrWhiteSpace(tsvPath))
    {
        ConsoleMsgUtils.ShowWarning("The target .tsv file path must be defined when calling ConvertToTsv with file paths");
        Thread.Sleep(1500);
        return false;
    }

    var tsvFile = new FileInfo(tsvPath);

    if (tsvFile.Exists)
    {
        ConsoleMsgUtils.ShowWarning("Overwriting existing file: " + PathUtils.CompactPathString(tsvFile.FullName, 90));
        Console.WriteLine();
    }
    else
    {
        ConsoleMsgUtils.ShowWarning("Creating: " + PathUtils.CompactPathString(tsvFile.FullName, 115));
    }

    // Number of identifications for which at least one row was written
    var writtenCount = 0;

    // DelimitedProteinNames takes precedence over UnrollResults
    // However, behavior below needs to be the same for UnrollResults and DelimitedProteinNames
    var maxMatchedProteins = options.UnrollResults || options.DelimitedProteinNames ? int.MaxValue : 1;

    var reader = new SimpleMZIdentMLReader(options.SkipDuplicateIds, s => Console.WriteLine("MZID PARSE ERROR: {0}", s));

    try
    {
        var configuration = new CsvConfiguration(CultureInfo.CurrentCulture)
        {
            AllowComments = false,
            Delimiter = "\t"
        };

        // Open the input before creating the output file
        using var data = reader.ReadLowMem(mzidPath);
        using var writer = new StreamWriter(new FileStream(tsvFile.FullName, FileMode.Create, FileAccess.Write, FileShare.ReadWrite));
        using var csv = new CsvWriter(writer, configuration);

        csv.Context.RegisterClassMap(new PeptideMatchMap(options.NoExtendedFields, options.AddGeneId));

        // SPECIAL CASE:
        // Certain versions of MS-GF+ output incorrect mzid files - the peptides referenced in the peptide_ref attribute in
        // SpectrumIdentificationItems was correct, but if there was a modification in the first 3 residues there was at
        // least a 50% chance of the PeptideEvidenceRefs within the SpectrumIdentificationItem being incorrect. So, for
        // those bad versions, use the peptide_ref rather than the PeptideEvidenceRefs to get the sequence.
        var isBadMsGfMzid = false;

        if (data.AnalysisSoftwareCvAccession.IndexOf("MS:1002048", StringComparison.OrdinalIgnoreCase) >= 0 &&
            !string.IsNullOrWhiteSpace(data.AnalysisSoftwareVersion))
        {
            // bad versions: v10280 (introduced), v10282, v2016.01.20, v2016.01.21, v2016.01.29, v2016.02.12, v2016.05.25, v2016.0.13, v2016.06.13, v2016.06.14, v2016.06.15, v2016.06.29, v2016.07.26, v2016.08.31, v2016.09.07, v2016.09.22, v2016.09.23 (fixed with version v2016.10.10)
            // NOTE(review): "v2016.0.13" does not follow the yyyy.MM.dd pattern of the other entries; confirm whether it is a typo
            var badVersions = new[]
            {
                "v10280", "v10282", "v2016.01.20", "v2016.01.21", "v2016.01.29", "v2016.02.12", "v2016.05.25",
                "v2016.0.13", "v2016.06.13", "v2016.06.14", "v2016.06.15", "v2016.06.29", "v2016.07.26",
                "v2016.08.31", "v2016.09.07", "v2016.09.22", "v2016.09.23"
            };

            // Any() stops at the first matching version
            var softwareVersion = data.AnalysisSoftwareVersion;
            isBadMsGfMzid = badVersions.Any(badVersion => softwareVersion.Contains(badVersion));
        }

        if (isBadMsGfMzid)
        {
            ConsoleMsgUtils.ShowWarning(
                "Warning: file \"{0}\" was created with a version of MS-GF+ that had some erroneous output in the mzid file." +
                " Using sequences from the peptide_ref attribute instead of the PeptideEvidenceRef element to try to bypass the issue.",
                mzidPath);
        }

        csv.WriteHeader<PeptideMatch>();
        csv.NextRecord();

        var lastScanNum = 0;

        // Number of items in data.Identifications
        // Incremented during the for each loop
        var unfilteredCount = 0;

        // Number of identifications that did not pass the score filters
        var filteredOutCount = 0;

        // List of matches in a single result. List is cleared before use.
        // Only contains multiple when outputting all protein matches, and a result has multiple protein matches.
        var matches = new List<PeptideMatch>(30);

        foreach (var id in data.Identifications)
        {
            // Skip additional results for the scan that was just processed
            if (options.SingleResultPerSpectrum && id.ScanNum == lastScanNum)
            {
                continue;
            }

            unfilteredCount++;
            lastScanNum = id.ScanNum;

            var failsScoreFilter =
                (filterOnSpecEValue && id.SpecEv > options.MaxSpecEValue) ||
                (filterOnEValue && id.EValue > options.MaxEValue) ||
                (filterOnQValue && id.QValue > options.MaxQValue);

            if (failsScoreFilter)
            {
                filteredOutCount++;
                continue;
            }

            // Clear out the list of matches.
            matches.Clear();

            // Tracks the peptide + protein combinations already stored for this identification
            var uniquePepProteinList = new HashSet<string>();

            // id.PepEvidence has one entry for each protein associated with this PSM
            IEnumerable<SimpleMZIdentMLReader.PeptideEvidence> pepEvEnum = id.PepEvidence;

            if (!options.ShowDecoy)
            {
                pepEvEnum = pepEvEnum.Where(x => !x.IsDecoy);
            }

            // maxMatchedProteins is '1' or 'int.MaxValue'
            foreach (var pepEv in pepEvEnum.Take(maxMatchedProteins))
            {
                var peptide = pepEv.SequenceWithNumericMods;

                // Produce correct output with bad MS-GF+ mzid
                if (isBadMsGfMzid)
                {
                    // Add the prefix and suffix residues for this protein
                    // Do not use pepEv.SequenceWithNumericMods; it isn't necessarily correct for this spectrum
                    peptide = pepEv.Pre + "." + id.Peptide.SequenceWithNumericMods + "." + pepEv.Post;
                }

                var protein = pepEv.DbSeq.Accession;

                if (!uniquePepProteinList.Add(peptide + protein))
                {
                    // Don't process the check for the gene ID if it's not a unique match
                    continue;
                }

                var geneId = string.Empty;

                if (options.AddGeneId && !pepEv.IsDecoy)
                {
                    // Note that .ProteinDescription includes both the Protein Name and the Description
                    var success = TryGetGeneId(options.GeneIdRegex, pepEv.DbSeq.ProteinDescription, out geneId);

                    if (!success)
                    {
                        geneId = string.Empty;
                    }
                }

                matches.Add(new PeptideMatch
                {
                    SpecFile = data.SpectrumFile,
                    Identification = id,
                    Peptide = peptide,
                    Protein = protein,
                    GeneId = geneId,
                });
            }

            if (matches.Count == 0)
            {
                continue;
            }

            if (options.DelimitedProteinNames && matches.Count > 1)
            {
                CombineProteinNames(options, matches);

                // The first item in matches already lists all of the protein names; remove all remaining matches.
                matches.RemoveRange(1, matches.Count - 1);
            }

            foreach (var item in matches)
            {
                csv.WriteRecord(item);
                csv.NextRecord();
            }

            writtenCount++;
        }

        if (unfilteredCount == 0)
        {
            ConsoleMsgUtils.ShowWarning("Warning: .mzid file does not have any results");
            Thread.Sleep(1500);
        }
        else if (writtenCount == 0)
        {
            ConsoleMsgUtils.ShowWarning("Warning: none of the results passed the specified filter(s)");
            Thread.Sleep(1500);
        }
        else
        {
            Console.WriteLine("Wrote {0:N0} results to {1}", writtenCount, PathUtils.CompactPathString(tsvFile.FullName, 70));

            if (filteredOutCount > 0)
            {
                Console.WriteLine("Filtered out {0:N0} results", filteredOutCount);
            }
        }

        return true;
    }
    catch (SimpleMZIdentMLReader.DuplicateKeyException ex)
    {
        ConsoleMsgUtils.ShowError("MZID PARSE ERROR", ex);
        ConsoleMsgUtils.ShowWarning("This type of error is usually caused by an error in the MZID output.");
        return false;
    }
    catch (Exception ex)
    {
        // The message previously lacked its closing parenthesis
        ConsoleMsgUtils.ShowError(
            string.Format("Error converting the file (so far, {0:N0} results have been written)", writtenCount), ex);
        return false;
    }
}
/// <summary>
/// Convert the .mzid file at options.MzidPath to the .tsv file at options.TsvPath
/// </summary>
/// <remarks>
/// Forwards to the ConvertToTsv overload that takes the two paths followed by the
/// ShowDecoy, UnrollResults and SingleResultPerSpectrum settings
/// </remarks>
/// <param name="options">Processing options that supply the paths and the three settings</param>
public void ConvertToTsv(ConverterOptions options) =>
    ConvertToTsv(
        options.MzidPath,
        options.TsvPath,
        options.ShowDecoy,
        options.UnrollResults,
        options.SingleResultPerSpectrum);