/// <summary>
/// Reads sequence tags from a tab-separated tag file, looks every tag of at least six residues up in a
/// FASTA database, and prints the total protein count, the number of proteins hit by at least one tag,
/// and the number of (tag, protein) hits divided by the total protein count.
/// The test is ignored when either input file is missing.
/// </summary>
/// <remarks>
/// The value printed as AvgMatchedScansPerProtein is based on the raw hit counter; the per-protein scan
/// sets are only used to count the distinct proteins that were hit.
/// </remarks>
public void CountMatchedScansPerProtein()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    const int minTagLength = 6;

    const string fastaFilePath = @"D:\Research\Data\CommonContaminants\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta";
    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);
    Console.WriteLine(@"Sequence length: {0}", fastaDb.GetSequence().Length);

    // Alternative tag files:
    //const string tagFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3_seqtag.tsv";
    //const string tagFilePath = @"\\protoapps\UserData\Jungkap\Co_culture\23B_pellet_TD_3Feb14_Bane_PL011402.seqtag";
    const string tagFilePath = @"D:\MassSpecFiles\co_culture\23A_pellet_TD_3Feb14_Bane_PL011402.seqtag";
    if (!File.Exists(tagFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
    }

    // Protein name -> scan numbers whose tags were found in that protein
    var scansByProtein = new Dictionary<string, HashSet<int>>();
    var hitCount = 0;

    // The first line of the tag file is a header row, so it is skipped unconditionally
    foreach (var row in File.ReadAllLines(tagFilePath).Skip(1))
    {
        var fields = row.Split('\t');
        if (fields.Length != 3)
        {
            continue;
        }

        var scanNumber = Convert.ToInt32(fields[0]);
        var tagSequence = fields[1];
        if (tagSequence.Length < minTagLength)
        {
            continue;
        }

        foreach (var dbIndex in searchableDb.FindAllMatchedSequenceIndices(tagSequence))
        {
            var proteinName = fastaDb.GetProteinName(dbIndex);
            ++hitCount;

            HashSet<int> scans;
            if (!scansByProtein.TryGetValue(proteinName, out scans))
            {
                scans = new HashSet<int>();
                scansByProtein.Add(proteinName, scans);
            }

            scans.Add(scanNumber);
        }
    }

    var matchedProteinCount = scansByProtein.Count;
    var totalProteinCount = fastaDb.GetNumEntries();

    Console.WriteLine("NumAllProteins: {0}", totalProteinCount);
    Console.WriteLine("NumMatchedProteins: {0}", matchedProteinCount);
    Console.WriteLine("AvgMatchedScansPerProtein: {0}", hitCount / (float)totalProteinCount);
}
/// <summary>
/// Opens a fixed FASTA file from a network share and prints the number of entries it reports.
/// </summary>
public void TestFasta()
{
    const string fastaFilePath = @"\\protoapps\UserData\Jungkap\Lewy\db\ID_005140_7A170668.fasta";

    var fastaDb = new FastaDatabase(fastaFilePath);
    var entryCount = fastaDb.GetNumEntries();

    Console.WriteLine(entryCount);
}
/// <summary>
/// Writes the supplied search results to an mzIdentML file.
/// </summary>
/// <param name="matches">Search results to export; one spectrum identification is added per element.</param>
/// <param name="outputFilePath">
/// Path of the mzIdentML file to write. Its file name without extension is also used as the dataset name.
/// </param>
/// <exception cref="System.Exception">
/// Two search modifications share a name but have different compositions.
/// </exception>
/// <exception cref="KeyNotFoundException">
/// A result references a modification name that is not among the search modifications.
/// </exception>
/// <exception cref="System.FormatException">
/// A modification entry of a result has no position, or the position is not an integer.
/// </exception>
public void WriteResultsToMzid(IEnumerable<DatabaseSearchResultData> matches, string outputFilePath)
{
    var datasetName = Path.GetFileNameWithoutExtension(outputFilePath);
    var creator = new IdentDataCreator("MSPathFinder_" + datasetName, "MSPathFinder_" + datasetName);

    // NOTE(review): GetCallingAssembly() reports the version of whichever assembly invoked this method;
    // confirm that is intended rather than the version of the assembly containing this code.
    var soft = creator.AddAnalysisSoftware(
        "Software_1",
        "MSPathFinder",
        System.Reflection.Assembly.GetCallingAssembly().GetName().Version.ToString(),
        CV.CVID.MS_MSPathFinder,
        "MSPathFinder");
    var settings = creator.AddAnalysisSettings(soft, "Settings_1", CV.CVID.MS_ms_ms_search);
    var searchDb = creator.AddSearchDatabase(
        database.GetFastaFilePath(),
        database.GetNumEntries(),
        Path.GetFileNameWithoutExtension(database.GetFastaFilePath()),
        CV.CVID.CVID_Unknown,
        CV.CVID.MS_FASTA_format);

    if (options.TargetDecoySearchMode.HasFlag(DatabaseSearchMode.Decoy))
    {
        // Describe the decoy part of the database; the "^XXX" pattern matches the decoy check on protein names below
        searchDb.CVParams.AddRange(new CVParamObj[]
        {
            new CVParamObj() { Cvid = CV.CVID.MS_DB_composition_target_decoy, },
            new CVParamObj() { Cvid = CV.CVID.MS_decoy_DB_accession_regexp, Value = "^XXX", },
            //new CVParamObj() { Cvid = CV.CVID.MS_decoy_DB_type_reverse, },
            new CVParamObj() { Cvid = CV.CVID.MS_decoy_DB_type_randomized, },
        });
    }

    // store the settings...
    CreateMzidSettings(settings);

    // Prefer the raw file path recorded in a PBF run over the spectrum file path from the options
    var path = options.SpecFilePath;
    var run = lcmsRun as PbfLcMsRun;
    if (run != null)
    {
        var rawPath = run.RawFilePath;
        if (!string.IsNullOrWhiteSpace(rawPath))
        {
            path = rawPath;
        }
    }

    // TODO: fix this to match correctly to the original file - May need to modify the PBF format to add an input format specifier
    // TODO: Should probably? request a CV Term for the PBF format?
    var nativeIdFormat = lcmsRun.NativeIdFormat;
    if (nativeIdFormat == CV.CVID.CVID_Unknown)
    {
        nativeIdFormat = CV.CVID.MS_scan_number_only_nativeID_format;
    }

    var specData = creator.AddSpectraData(path, datasetName, nativeIdFormat, lcmsRun.NativeFormat);

    // Get the search modifications as they were passed into the AminoAcidSet constructor, so we can retrieve masses from them
    var modDict = BuildSearchModificationLookup();

    foreach (var match in matches)
    {
        var scanNum = match.ScanNum;
        var spec = lcmsRun.GetSpectrum(scanNum, false);
        var matchIon = new Ion(Composition.Parse(match.Composition), match.Charge);

        // Fall back to a "scan=N" identifier when the spectrum carries no native ID
        var nativeId = spec.NativeId;
        if (string.IsNullOrWhiteSpace(spec.NativeId))
        {
            nativeId = "scan=" + spec.ScanNum;
        }

        // NOTE(review): the trailing "1, double.NaN" arguments look like a fixed rank and an unset score - confirm against IdentDataCreator
        var specIdent = creator.AddSpectrumIdentification(
            specData, nativeId, spec.ElutionTime, match.MostAbundantIsotopeMz, match.Charge, 1, double.NaN);
        specIdent.CalculatedMassToCharge = matchIon.GetMonoIsotopicMz();

        var pep = new PeptideObj(match.Sequence);
        AddModificationsToPeptide(pep, match.Modifications, modDict);
        specIdent.Peptide = pep;

        var proteinName = match.ProteinName;
        var protLength = match.ProteinLength;
        var proteinDescription = match.ProteinDescription;
        var dbSeq = new DbSequenceObj(searchDb, protLength, proteinName, proteinDescription);

        var start = match.Start;
        var end = match.End;

        // Decoy proteins are recognized by their "XXX" name prefix; ordinal comparison keeps the check culture-independent
        var isDecoy = proteinName.StartsWith("XXX", System.StringComparison.Ordinal);
        var pepEv = new PeptideEvidenceObj(dbSeq, pep, start, end, match.Pre, match.Post, isDecoy);
        specIdent.AddPeptideEvidence(pepEv);

        var probability = match.Probability;

        specIdent.CVParams.Add(new CVParamObj() { Cvid = CV.CVID.MS_chemical_compound_formula, Value = match.Composition, });
        //specIdent.CVParams.Add(new CVParamObj() { Cvid = CV.CVID.MS_number_of_matched_peaks, Value = match.NumMatchedFragments.ToString(), });
        specIdent.CVParams.Add(new CVParamObj() { Cvid = CV.CVID.MS_MSPathFinder_RawScore, Value = probability.ToString(CultureInfo.InvariantCulture), });
        specIdent.CVParams.Add(new CVParamObj() { Cvid = CV.CVID.MS_MSPathFinder_SpecEValue, Value = match.SpecEValue.ToString(CultureInfo.InvariantCulture), });
        specIdent.CVParams.Add(new CVParamObj() { Cvid = CV.CVID.MS_MSPathFinder_EValue, Value = match.EValue.ToString(CultureInfo.InvariantCulture), });

        // Q-values are only written when the result carries target-decoy scores
        if (match.HasTdaScores)
        {
            specIdent.CVParams.Add(new CVParamObj() { Cvid = CV.CVID.MS_MSPathFinder_QValue, Value = match.QValue.ToString(CultureInfo.InvariantCulture), });
            specIdent.CVParams.Add(new CVParamObj() { Cvid = CV.CVID.MS_MSPathFinder_PepQValue, Value = match.PepQValue.ToString(CultureInfo.InvariantCulture), });
        }

        // MS-GF+ similarity: find/add isotope error?
        // MS-GF+ similarity: find/add assumed dissociation method?
        //specIdent.UserParams.Add(new UserParamObj() {Name = "Assumed Dissociation Method", Value = match.});
    }

    var identData = creator.GetIdentData();
    MzIdentMlReaderWriter.Write(new MzIdentMLType(identData), outputFilePath);
}

/// <summary>
/// Builds a name-to-modification lookup from the search modifications of the current options.
/// </summary>
/// <returns>Dictionary keyed by modification name.</returns>
/// <exception cref="System.Exception">
/// Two search modifications share a name but have different compositions.
/// </exception>
private Dictionary<string, Modification> BuildSearchModificationLookup()
{
    var lookup = new Dictionary<string, Modification>();

    foreach (var searchMod in options.AminoAcidSet.SearchModifications)
    {
        var modification = searchMod.Modification;

        Modification existing;
        if (!lookup.TryGetValue(modification.Name, out existing))
        {
            lookup.Add(modification.Name, modification);
        }
        else if (!existing.Composition.Equals(modification.Composition))
        {
            throw new System.Exception(
                "ERROR: Cannot have modifications with the same name and different composition/mass! Fix input modifications! Duplicated modification name: " +
                modification.Name);
        }
    }

    return lookup;
}

/// <summary>
/// Parses a comma-separated list of "name position" entries and adds one modification per entry to the peptide.
/// Does nothing when the text is null, empty or whitespace.
/// </summary>
/// <param name="peptide">Peptide that receives the parsed modifications.</param>
/// <param name="modText">Modification text of a search result.</param>
/// <param name="modLookup">Search modifications keyed by name; supplies the name and mass written for each entry.</param>
/// <exception cref="KeyNotFoundException">An entry names a modification that is not in <paramref name="modLookup"/>.</exception>
/// <exception cref="System.FormatException">An entry has no position, or the position is not an integer.</exception>
private static void AddModificationsToPeptide(PeptideObj peptide, string modText, Dictionary<string, Modification> modLookup)
{
    if (string.IsNullOrWhiteSpace(modText))
    {
        return;
    }

    foreach (var modEntry in modText.Split(','))
    {
        var tokens = modEntry.Split(' ');

        Modification modInfo;
        if (!modLookup.TryGetValue(tokens[0], out modInfo))
        {
            throw new KeyNotFoundException(
                "Modification '" + tokens[0] + "' in result modification text '" + modText +
                "' is not one of the search modifications");
        }

        if (tokens.Length < 2)
        {
            throw new System.FormatException(
                "Modification entry '" + modEntry + "' in result modification text '" + modText +
                "' does not have a position");
        }

        // The position is machine-written data, so parse it independently of the current culture
        var position = int.Parse(tokens[1], CultureInfo.InvariantCulture);

        var modObj = new ModificationObj(CV.CVID.MS_unknown_modification, modInfo.Name, position, modInfo.Mass);
        peptide.Modifications.Add(modObj);
    }
}
/// <summary>
/// Writes a tab-separated results file: a header row followed by one row for every scan number in
/// <c>_ms2ScanNums</c> whose entry in <paramref name="matches"/> is not null.
/// </summary>
/// <param name="matches">Matches indexed by scan number; null entries are skipped.</param>
/// <param name="outputFilePath">Path of the file to create.</param>
/// <param name="database">Database used to look up protein name, description, length and position for each match.</param>
private void WriteResultsToFile(DatabaseSequenceSpectrumMatch[] matches, string outputFilePath, FastaDatabase database)
{
    // One placeholder per header column, in header order
    const string rowFormat =
        "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}\t{16}\t{17}";

    using (var writer = new StreamWriter(outputFilePath))
    {
        writer.WriteLine("Scan\tPre\tSequence\tPost\tModifications\tComposition\tProteinName\tProteinDesc" +
                         "\tProteinLength\tStart\tEnd\tCharge\tMostAbundantIsotopeMz\tMass\t#MatchedFragments\tProbability\tSpecEValue\tEValue");

        foreach (var scanNum in _ms2ScanNums)
        {
            var match = matches[scanNum];
            if (match == null)
            {
                continue;
            }

            var peptideSequence = match.Sequence;
            var dbOffset = match.Offset;

            var startPosition = database.GetOneBasedPositionInProtein(dbOffset) + 1 + match.NumNTermCleavages;
            var endPosition = startPosition + peptideSequence.Length - 1;

            var proteinName = database.GetProteinName(match.Offset);
            var proteinLength = database.GetProteinLength(proteinName);
            var precursorIon = match.Ion;
            var proteinDescription = database.GetProteinDescription(match.Offset);
            var probability = CompositeScorer.GetProbability(match.Score);

            // Note for DblToString(value, 9, true), by having "9" and "true",
            // values between 100 and 999 Da will have 7 digits after the decimal place, and
            // values between 1000 and 9999 will have 6 digits after the decimal place
            var columns = new object[]
            {
                scanNum,                                                                            // Scan
                match.Pre,                                                                          // Pre
                peptideSequence,                                                                    // Sequence
                match.Post,                                                                         // Post
                match.ModificationText,                                                             // Modifications
                precursorIon.Composition,                                                           // Composition
                proteinName,                                                                        // ProteinName
                proteinDescription,                                                                 // ProteinDesc
                proteinLength,                                                                      // ProteinLength
                startPosition,                                                                      // Start
                endPosition,                                                                        // End
                precursorIon.Charge,                                                                // Charge
                StringUtilities.DblToString(precursorIon.GetMostAbundantIsotopeMz(), 9, true),      // MostAbundantIsotopeMz
                StringUtilities.DblToString(precursorIon.Composition.Mass, 9, true),                // Mass
                match.NumMatchedFragments,                                                          // #MatchedFragments
                StringUtilities.DblToString(probability, 4),                                        // Probability
                // SpecEValue; will be displayed using scientific notation if the value is less than 0.001
                StringUtilities.DblToString(ExcelMinValue(match.SpecEvalue), 6, true, 0.001),
                // EValue (spectral E-value times the number of database entries); will be displayed
                // using scientific notation if the value is less than 0.001
                StringUtilities.DblToString(ExcelMinValue(match.SpecEvalue * database.GetNumEntries()), 6, true, 0.001)
            };

            writer.WriteLine(rowFormat, columns);
        }
    }
}