/// <summary>
/// Records a match in the set selected by its modification count and its scan index.
/// </summary>
/// <param name="newMatch">
/// Match to record. It is ignored when its score is below <c>_scoreCutoff</c> or when its
/// modification count has no slot in <c>_matchedSet</c>.
/// </param>
public void AddMatch(DatabaseSequenceSpectrumMatch newMatch)
{
    if (newMatch.Score < _scoreCutoff)
    {
        return;
    }

    var scanIndex = _ms2ScanToIndexMap[newMatch.ScanNum];
    var modIndex = (newMatch.Modifications != null)
        ? newMatch.Modifications.GetNumModifications()
        : 0;
    if (modIndex >= _matchedSet.Length)
    {
        return;
    }

    // One lock per modification count keeps concurrent writers out of the same slot array.
    var scanSlots = _matchedSet[modIndex];
    lock (scanSlots)
    {
        var existingMatches = scanSlots[scanIndex];
        if (existingMatches == null)
        {
            scanSlots[scanIndex] = new SortedSet<DatabaseSequenceSpectrumMatch> { newMatch };
            return;
        }

        // The pruning threshold is derived from the best score seen BEFORE the new match is added.
        var pruneThreshold = existingMatches.Max.Score * ScoreRatioCutoff;
        var hasRoom = existingMatches.Count < NumMatchesPerSpectrum;

        // Accept when there is room and the score clears the threshold,
        // or otherwise when the score beats the current minimum.
        var accept = (hasRoom && pruneThreshold < newMatch.Score)
                     || newMatch.Score > existingMatches.Min.Score;
        if (!accept)
        {
            return;
        }

        existingMatches.Add(newMatch);
        existingMatches.RemoveWhere(mt => mt.Score < pruneThreshold);
    }
}
/// <summary>
/// Scores a match by forwarding its Pre, Sequence, Post, NTerm and CTerm values,
/// together with the remaining arguments, to the overload that takes them individually.
/// </summary>
/// <param name="match">Match whose Pre/Sequence/Post/NTerm/CTerm values are forwarded.</param>
/// <param name="composition">Forwarded unchanged.</param>
/// <param name="charge">Forwarded unchanged.</param>
/// <param name="ms2ScanNum">Forwarded unchanged.</param>
/// <returns>Whatever the forwarded overload returns.</returns>
public IcBottomUpScores GetScores(DatabaseSequenceSpectrumMatch match, Composition composition, int charge, int ms2ScanNum)
{
    var scores = GetScores(
        match.Pre,
        match.Sequence,
        match.Post,
        match.NTerm,
        match.CTerm,
        composition,
        charge,
        ms2ScanNum);

    return scores;
}
/// <summary>
/// Scores every annotation against the MS2 scans selected by the MS1 filter and keeps,
/// per scan, up to <c>NumMatchesPerSpectrum</c> of the best-scoring matches.
/// </summary>
/// <param name="annotationsAndOffsets">Annotations (with their offsets) to search.</param>
/// <param name="ms1Filter">Supplies the MS2 scan numbers to try for a given sequence mass.</param>
/// <param name="isDecoy">Passed through to every match that is created.</param>
/// <returns>
/// Array with <c>_run.MaxLcScan + 1</c> slots indexed by MS2 scan number; a slot stays null
/// when no match was recorded for that scan.
/// </returns>
private SortedSet<DatabaseSequenceSpectrumMatch>[] RunSearch(IEnumerable<AnnotationAndOffset> annotationsAndOffsets, ISequenceFilter ms1Filter, bool isDecoy)
{
    var sw = new Stopwatch();
    var numPeptides = 0;
    sw.Start();

    var matches = new SortedSet<DatabaseSequenceSpectrumMatch>[_run.MaxLcScan + 1];

    // TODO: N-term Met cleavage
    foreach (var annotationAndOffset in annotationsAndOffsets)
    {
        ++numPeptides;

        var annotation = annotationAndOffset.Annotation;
        var offset = annotationAndOffset.Offset;

        if (numPeptides % 100000 == 0)
        {
            // numPeptides is a positive multiple of 100000 here, so the ordinal suffix is always "th".
            Console.Write(@"Processing {0}{1} peptides...", numPeptides, "th");

            sw.Stop();
            var sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
            Console.WriteLine(@"Elapsed Time: {0:f4} sec", sec);
            sw.Reset();
            sw.Start();
        }

        var seqGraph = SequenceGraph.CreateGraph(AminoAcidSet, annotation);
        if (seqGraph == null)
        {
            // No graph could be built for this annotation; skip it.
            continue;
        }

        var numProteoforms = seqGraph.GetNumProteoformCompositions();
        var modCombs = seqGraph.GetModificationCombinations();
        for (var modIndex = 0; modIndex < numProteoforms; modIndex++)
        {
            seqGraph.SetSink(modIndex);
            var protCompositionWithH2O = seqGraph.GetSinkSequenceCompositionWithH2O();
            var sequenceMass = protCompositionWithH2O.Mass;
            var modCombinations = modCombs[modIndex];

            foreach (var ms2ScanNum in ms1Filter.GetMatchingMs2ScanNums(sequenceMass))
            {
                var spec = _run.GetSpectrum(ms2ScanNum) as ProductSpectrum;
                if (spec == null)
                {
                    continue;
                }

                var charge = (int)Math.Round(sequenceMass / (spec.IsolationWindow.IsolationWindowTargetMz - Constants.Proton));
                var scorer = _ms2ScorerFactory.GetMs2Scorer(ms2ScanNum);
                var score = seqGraph.GetFragmentScore(scorer);
                if (score <= 2)
                {
                    // Scores of 2 or less are not recorded.
                    continue;
                }

                var precursorIon = new Ion(protCompositionWithH2O, charge);
                // The annotation carries two leading and two trailing characters around the sequence.
                var sequence = annotation.Substring(2, annotation.Length - 4);
                var pre = annotation[0];
                var post = annotation[annotation.Length - 1];
                var prsm = new DatabaseSequenceSpectrumMatch(sequence, pre, post, ms2ScanNum, offset, 0, modCombinations, precursorIon, score, isDecoy);

                var existingMatches = matches[ms2ScanNum];
                if (existingMatches == null)
                {
                    matches[ms2ScanNum] = new SortedSet<DatabaseSequenceSpectrumMatch> { prsm };
                    continue;
                }

                if (existingMatches.Count < NumMatchesPerSpectrum)
                {
                    existingMatches.Add(prsm);
                    continue;
                }

                // The set is full: the new match must beat the current minimum to get in.
                // SortedSet.Add returns false when the set's comparer treats prsm as equal to
                // an element already present; evict the minimum only when prsm was really
                // inserted, otherwise the set would shrink without gaining the new match.
                if (score > existingMatches.Min.Score && existingMatches.Add(prsm))
                {
                    existingMatches.Remove(existingMatches.Min);
                }
            }
        }
    }

    return matches;
}
/// <summary>
/// Re-scores the candidate matches of every MS2 scan, computes a spectral E-value for each
/// match that passes the score cutoff, and returns the match with the lowest spectral
/// E-value per scan.
/// </summary>
/// <param name="sortedMatches">Candidate matches indexed by MS2 scan number; null entries are skipped.</param>
/// <param name="cancellationToken">Optional token handed to the parallel loop.</param>
/// <param name="progress">Optional progress sink wrapped in a <c>ProgressData</c>.</param>
/// <returns>
/// Array of the same length as <paramref name="sortedMatches"/> indexed by scan number, or
/// null when a scan with candidates does not yield a <c>ProductSpectrum</c>.
/// </returns>
private DatabaseSequenceSpectrumMatch[] RunGeneratingFunction(SortedSet<DatabaseSequenceSpectrumMatch>[] sortedMatches, CancellationToken? cancellationToken = null, IProgress<ProgressData> progress = null)
{
    var progData = new ProgressData(progress)
    {
        Status = "Calculating spectral E-values for matches"
    };

    // Create the per-scan score-distribution cache on first use.
    if (_cachedScoreDistributions == null)
    {
        _cachedScoreDistributions = new LinkedList<Tuple<double, ScoreDistribution>>[_run.MaxLcScan + 1];
        foreach (var ms2Scan in _ms2ScanNums)
        {
            _cachedScoreDistributions[ms2Scan] = new LinkedList<Tuple<double, ScoreDistribution>>();
        }
    }

    var sw = new Stopwatch();
    var topDownScorer = new InformedTopDownScorer(_run, AminoAcidSet, MinProductIonCharge, MaxProductIonCharge, ProductIonTolerance);

    // Phase 1: re-score every candidate and count those that pass the cutoff.
    var matches = new LinkedList<DatabaseSequenceSpectrumMatch>[sortedMatches.Length];
    long estimatedProteins = 0;
    foreach (var ms2Scan in _ms2ScanNums)
    {
        var candidates = sortedMatches[ms2Scan];
        if (candidates == null)
        {
            continue;
        }

        var spectrum = _run.GetSpectrum(ms2Scan) as ProductSpectrum;
        if (spectrum == null)
        {
            return null;
        }

        foreach (var candidate in candidates)
        {
            var sequence = candidate.Sequence;
            var ion = candidate.Ion;

            var scores = topDownScorer.GetScores(spectrum, sequence, ion.Composition, ion.Charge, ms2Scan);
            if (scores == null)
            {
                continue;
            }

            // Overwrite the candidate's values with the re-scored ones.
            candidate.Score = scores.Score;
            candidate.ModificationText = scores.Modifications;
            candidate.NumMatchedFragments = scores.NumMatchedFrags;

            if (!(candidate.Score > CompositeScorer.ScoreParam.Cutoff))
            {
                continue;
            }

            if (matches[ms2Scan] == null)
            {
                matches[ms2Scan] = new LinkedList<DatabaseSequenceSpectrumMatch>();
            }

            matches[ms2Scan].AddLast(candidate);
        }

        var kept = matches[ms2Scan];
        if (kept != null)
        {
            estimatedProteins += kept.Count;
        }
    }

    Console.WriteLine(@"Estimated matched proteins: " + estimatedProteins);

    var numProteins = 0;
    var lastUpdate = DateTime.MinValue; // Forces the initial 0% update
    sw.Reset();
    sw.Start();

    // Phase 2: compute spectral E-values, one parallel work item per scan that kept matches.
    var scanNums = _ms2ScanNums.Where(candidateScan => matches[candidateScan] != null).ToArray();
    var pfeOptions = new ParallelOptions
    {
        MaxDegreeOfParallelism = MaxNumThreads,
        CancellationToken = cancellationToken ?? CancellationToken.None
    };

    Parallel.ForEach(scanNums, pfeOptions, scan =>
    {
        // Tracks the step in progress so a failure can be reported with context.
        var currentTask = "?";
        try
        {
            var scoreDistributions = _cachedScoreDistributions[scan];
            foreach (var match in matches[scan])
            {
                var precursorMass = match.Ion.Composition.Mass;
                var currentIteration = "for scan " + scan + " and mass " + precursorMass;

                currentTask = "Calling GetMs2ScoringGraph " + currentIteration;
                var graph = _ms2ScorerFactory2.GetMs2ScoringGraph(scan, precursorMass);
                if (graph == null)
                {
                    continue;
                }

                currentTask = "Calling ComputeGeneratingFunction " + currentIteration;

                // Reuse a cached distribution whose mass lies within the precursor tolerance.
                var scoreDist = scoreDistributions
                    .Where(distribution => Math.Abs(distribution.Item1 - precursorMass) < PrecursorIonTolerance.GetToleranceAsTh(precursorMass))
                    .Select(distribution => distribution.Item2)
                    .FirstOrDefault();

                if (scoreDist == null)
                {
                    var gf = new GeneratingFunction(graph);
                    gf.ComputeGeneratingFunction();
                    scoreDist = gf.GetScoreDistribution();
                    scoreDistributions.AddLast(new Tuple<double, ScoreDistribution>(precursorMass, scoreDist));
                }

                currentTask = "Calling GetSpectralEValue " + currentIteration + " and score " + (int)match.Score;
                match.SpecEvalue = scoreDist.GetSpectralEValue(match.Score);

                currentTask = "Reporting progress " + currentIteration;
                SearchProgressReport(ref numProteins, ref lastUpdate, estimatedProteins, sw, progData);
            }
        }
        catch (Exception ex)
        {
            var errMsg = string.Format("Exception while {0}: {1}", currentTask, ex.Message);
            Console.WriteLine(errMsg);
            throw new Exception(errMsg, ex);
        }
    });

    // Phase 3: keep the match with the smallest spectral E-value for each scan.
    var finalMatches = new DatabaseSequenceSpectrumMatch[matches.Length];
    foreach (var scan in scanNums)
    {
        finalMatches[scan] = matches[scan].OrderBy(m => m.SpecEvalue).First();
    }

    progData.StatusInternal = string.Empty;
    progData.Report(100.0);
    return finalMatches;
}
/// <summary>
/// Writes a header row followed by one tab-separated row for every MS2 scan that has a match.
/// </summary>
/// <param name="matches">Matches indexed by MS2 scan number; null entries are skipped.</param>
/// <param name="outputFilePath">Path of the file to write.</param>
/// <param name="database">
/// Database queried for protein name, description, length, position in protein and entry count.
/// </param>
private void WriteResultsToFile(DatabaseSequenceSpectrumMatch[] matches, string outputFilePath, FastaDatabase database)
{
    using (var writer = new StreamWriter(outputFilePath))
    {
        writer.WriteLine("Scan\tPre\tSequence\tPost\tModifications\tComposition\tProteinName\tProteinDesc" +
                         "\tProteinLength\tStart\tEnd\tCharge\tMostAbundantIsotopeMz\tMass\t#MatchedFragments\tProbability\tSpecEValue\tEValue");

        foreach (var scanNum in _ms2ScanNums)
        {
            var match = matches[scanNum];
            if (match == null)
            {
                continue;
            }

            var sequence = match.Sequence;
            var start = database.GetOneBasedPositionInProtein(match.Offset) + 1 + match.NumNTermCleavages;
            var end = start + sequence.Length - 1;
            var proteinName = database.GetProteinName(match.Offset);
            var protLength = database.GetProteinLength(proteinName);
            var ion = match.Ion;
            var proteinDescription = database.GetProteinDescription(match.Offset);
            var probability = CompositeScorer.GetProbability(match.Score);

            // Note for DblToString(value, 9, true): by having "9" and "true",
            // values between 100 and 999 Da will have 7 digits after the decimal place, and
            // values between 1000 and 9999 will have 6 digits after the decimal place.
            var mostAbundantIsotopeMzText = StringUtilities.DblToString(ion.GetMostAbundantIsotopeMz(), 9, true);
            var massText = StringUtilities.DblToString(ion.Composition.Mass, 9, true);
            var probabilityText = StringUtilities.DblToString(probability, 4);

            // Both E-value columns are formatted with a 0.001 threshold argument
            // (per the original note: scientific notation below 0.001).
            var specEValueText = StringUtilities.DblToString(ExcelMinValue(match.SpecEvalue), 6, true, 0.001);
            var eValueText = StringUtilities.DblToString(ExcelMinValue(match.SpecEvalue * database.GetNumEntries()), 6, true, 0.001);

            writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}\t{16}\t{17}",
                scanNum,                     // Scan
                match.Pre,                   // Pre
                sequence,                    // Sequence
                match.Post,                  // Post
                match.ModificationText,      // Modifications
                ion.Composition,             // Composition
                proteinName,                 // ProteinName
                proteinDescription,          // ProteinDesc
                protLength,                  // ProteinLength
                start,                       // Start
                end,                         // End
                ion.Charge,                  // Charge
                mostAbundantIsotopeMzText,   // MostAbundantIsotopeMz
                massText,                    // Mass
                match.NumMatchedFragments,   // #MatchedFragments
                probabilityText,             // Probability
                specEValueText,              // SpecEValue
                eValueText);                 // EValue (SpecEValue multiplied by the number of database entries)
        }
    }
}
/// <summary>
/// Adds a match to the set kept for its MS2 scan, holding at most
/// <c>NumMatchesPerSpectrum</c> matches per scan. The whole update runs under a lock
/// on <paramref name="matches"/>.
/// </summary>
/// <param name="matches">Match sets indexed by MS2 scan number; a null slot is created on first use.</param>
/// <param name="ms2ScanNum">Index of the slot to update.</param>
/// <param name="prsm">Match to add.</param>
private void AddMatch(SortedSet<DatabaseSequenceSpectrumMatch>[] matches, int ms2ScanNum, DatabaseSequenceSpectrumMatch prsm)
{
    lock (matches)
    {
        var existingMatches = matches[ms2ScanNum];
        if (existingMatches == null)
        {
            matches[ms2ScanNum] = new SortedSet<DatabaseSequenceSpectrumMatch> { prsm };
            return;
        }

        if (existingMatches.Count < NumMatchesPerSpectrum)
        {
            existingMatches.Add(prsm);
            return;
        }

        // The set is full: the new match must beat the current minimum to get in.
        // Written as !(a > b) so that a NaN score is rejected as well.
        if (!(prsm.Score > existingMatches.Min.Score))
        {
            return;
        }

        // SortedSet.Add returns false when the set's comparer treats prsm as equal to an
        // element already present. Evict the minimum only when prsm was really inserted;
        // otherwise the set would lose an element without gaining the new one.
        if (existingMatches.Add(prsm))
        {
            existingMatches.Remove(existingMatches.Min);
        }
    }
}
/// <summary>
/// Scores one annotated protein sequence, for every N-terminal cleavage count from 0 to
/// <paramref name="maxNumNTermCleavages"/> and every proteoform composition, against the
/// candidate MS2 scans, and records each resulting match through <c>AddMatch</c>.
/// </summary>
/// <param name="annotationAndOffset">Annotation and offset of the protein to search.</param>
/// <param name="sequenceFilter">Supplies candidate MS2 scan numbers when <c>ScanNumbers</c> is null.</param>
/// <param name="matches">Match sets that receive the results.</param>
/// <param name="maxNumNTermCleavages">Largest number of N-terminal cleavages to try.</param>
/// <param name="isDecoy">Passed through to every match that is created.</param>
/// <param name="cancellationToken">Optional token handed to the parallel loop.</param>
private void SearchForMatches(AnnotationAndOffset annotationAndOffset, ISequenceFilter sequenceFilter, SortedSet<DatabaseSequenceSpectrumMatch>[] matches, int maxNumNTermCleavages, bool isDecoy, CancellationToken? cancellationToken = null)
{
    var pfeOptions = new ParallelOptions
    {
        MaxDegreeOfParallelism = MaxNumThreads,
        CancellationToken = cancellationToken ?? CancellationToken.None
    };

    var annotation = annotationAndOffset.Annotation;
    var offset = annotationAndOffset.Offset;

    // The annotation carries two leading and two trailing characters around the sequence.
    var protSequence = annotation.Substring(2, annotation.Length - 4);
    var post = annotation[annotation.Length - 1];

    var seqGraph = SequenceGraph.CreateGraph(AminoAcidSet, AminoAcid.ProteinNTerm, protSequence, AminoAcid.ProteinCTerm);
    if (seqGraph == null)
    {
        // No matches will be found without a sequence graph.
        return;
    }

    for (var cleaved = 0; cleaved <= maxNumNTermCleavages; cleaved++)
    {
        if (cleaved > 0)
        {
            seqGraph.CleaveNTerm();
        }

        // Stable copy of the loop counter for use inside the parallel lambda.
        var numNTermCleavages = cleaved;

        var numProteoforms = seqGraph.GetNumProteoformCompositions();
        var modCombs = seqGraph.GetModificationCombinations();

        for (var modIndex = 0; modIndex < numProteoforms; modIndex++)
        {
            seqGraph.SetSink(modIndex);
            var protCompositionWithH2O = seqGraph.GetSinkSequenceCompositionWithH2O();
            var sequenceMass = protCompositionWithH2O.Mass;

            if (sequenceMass < MinSequenceMass || sequenceMass > MaxSequenceMass)
            {
                continue;
            }

            var modCombinations = modCombs[modIndex];

            // An explicit scan list, when set, takes precedence over the sequence filter.
            var candidateScans = this.ScanNumbers ?? sequenceFilter.GetMatchingMs2ScanNums(sequenceMass);

            Parallel.ForEach(candidateScans, pfeOptions, ms2ScanNum =>
            {
                // Ignore scan numbers outside the span of the known MS2 scans.
                if (ms2ScanNum > _ms2ScanNums.Last() || ms2ScanNum < _ms2ScanNums.First())
                {
                    return;
                }

                var scorer = _ms2ScorerFactory2.GetMs2Scorer(ms2ScanNum);
                var score = seqGraph.GetFragmentScore(scorer);

                var isoTargetMz = _isolationWindowTargetMz[ms2ScanNum];
                if (!(isoTargetMz > 0))
                {
                    return;
                }

                var charge = (int)Math.Round(sequenceMass / (isoTargetMz - Constants.Proton));
                var precursorIon = new Ion(protCompositionWithH2O, charge);
                var sequence = protSequence.Substring(numNTermCleavages);

                // After cleaving, the preceding residue is the last residue that was removed.
                char pre;
                if (numNTermCleavages == 0)
                {
                    pre = annotation[0];
                }
                else
                {
                    pre = annotation[numNTermCleavages + 1];
                }

                var prsm = new DatabaseSequenceSpectrumMatch(sequence, pre, post, ms2ScanNum, offset, numNTermCleavages, modCombinations, precursorIon, score, isDecoy);
                AddMatch(matches, ms2ScanNum, prsm);
            });
        }
    }
}
/// <summary>
/// Runs the tag search engine on every scan in <c>_tagMs2ScanNum</c> in parallel and
/// records each tag-sequence match with a known protein offset through <c>AddMatch</c>.
/// </summary>
/// <param name="matches">Match sets that receive the results.</param>
/// <param name="db">Database handed to the tag search engine; its <c>IsDecoy</c> flag is copied to each match.</param>
/// <param name="cancellationToken">Optional token handed to the parallel loop.</param>
/// <param name="progress">Optional progress sink wrapped in a <c>ProgressData</c>.</param>
private void RunTagBasedSearch(SortedSet<DatabaseSequenceSpectrumMatch>[] matches, FastaDatabase db, CancellationToken? cancellationToken = null, IProgress<ProgressData> progress = null)
{
    _tagSearchEngine.SetDatabase(db);

    var progData = new ProgressData(progress)
    {
        Status = "Tag-based Searching for matches"
    };

    var sw = new Stopwatch();

    long estimatedProteins = _tagMs2ScanNum.Length;
    Console.WriteLine(@"Number of spectra containing sequence tags: " + estimatedProteins);

    var numProteins = 0;
    var lastUpdate = DateTime.MinValue; // Forces the initial 0% update

    sw.Reset();
    sw.Start();

    var pfeOptions = new ParallelOptions
    {
        MaxDegreeOfParallelism = MaxNumThreads,
        CancellationToken = cancellationToken ?? CancellationToken.None
    };

    Parallel.ForEach(_tagMs2ScanNum, pfeOptions, ms2ScanNum =>
    {
        foreach (var tagSequenceMatch in _tagSearchEngine.RunSearch(ms2ScanNum))
        {
            var offset = _tagSearchEngine.FastaDatabase.GetOffset(tagSequenceMatch.ProteinName);
            if (offset == null)
            {
                // No offset for this protein name; skip the match.
                continue;
            }

            var sequence = tagSequenceMatch.Sequence;
            var tagMatch = tagSequenceMatch.TagMatch;
            var numNTermCleavages = tagMatch.StartIndex;

            var seqObj = Sequence.CreateSequence(sequence, tagMatch.ModificationText, AminoAcidSet);
            var precursorIon = new Ion(seqObj.Composition + Composition.H2O, tagMatch.Charge);

            var prsm = new DatabaseSequenceSpectrumMatch(
                sequence,
                tagSequenceMatch.Pre,
                tagSequenceMatch.Post,
                ms2ScanNum,
                (long)offset,
                numNTermCleavages,
                tagMatch.Modifications,
                precursorIon,
                tagMatch.Score,
                db.IsDecoy)
            {
                ModificationText = tagMatch.ModificationText,
            };

            AddMatch(matches, ms2ScanNum, prsm);
        }

        // One progress tick per processed spectrum.
        SearchProgressReport(ref numProteins, ref lastUpdate, estimatedProteins, sw, progData, "spectra");
    });

    Console.WriteLine(@"Collected candidate matches: {0}", GetNumberOfMatches(matches));

    progData.StatusInternal = string.Empty;
    progData.Report(100.0);
}
/// <summary>
/// Convenience overload: unpacks Pre, Sequence, Post, NTerm and CTerm from the match and
/// delegates to the overload that accepts those values individually.
/// </summary>
/// <param name="match">Source of the Pre/Sequence/Post/NTerm/CTerm values.</param>
/// <param name="composition">Passed through unchanged.</param>
/// <param name="charge">Passed through unchanged.</param>
/// <param name="ms2ScanNum">Passed through unchanged.</param>
/// <returns>The result of the delegated call.</returns>
public IcBottomUpScores GetScores(DatabaseSequenceSpectrumMatch match, Composition composition, int charge, int ms2ScanNum)
{
    return GetScores(
        match.Pre, match.Sequence, match.Post,
        match.NTerm, match.CTerm,
        composition, charge, ms2ScanNum);
}
/// <summary>
/// Scores every annotation against the MS2 scans selected by the MS1 filter and keeps,
/// per scan, up to <c>NumMatchesPerSpectrum</c> of the best-scoring matches.
/// </summary>
/// <param name="annotationsAndOffsets">Annotations (with their offsets) to search.</param>
/// <param name="ms1Filter">Supplies the MS2 scan numbers to try for a given sequence mass.</param>
/// <param name="isDecoy">Passed through to every match that is created.</param>
/// <returns>
/// Array with <c>_run.MaxLcScan + 1</c> slots indexed by MS2 scan number; a slot stays null
/// when no match was recorded for that scan.
/// </returns>
private SortedSet<DatabaseSequenceSpectrumMatch>[] RunSearch(IEnumerable<AnnotationAndOffset> annotationsAndOffsets, ISequenceFilter ms1Filter, bool isDecoy)
{
    var sw = new Stopwatch();
    var numPeptides = 0;
    sw.Start();

    var matches = new SortedSet<DatabaseSequenceSpectrumMatch>[_run.MaxLcScan + 1];

    // TODO: N-term Met cleavage
    foreach (var annotationAndOffset in annotationsAndOffsets)
    {
        ++numPeptides;

        var annotation = annotationAndOffset.Annotation;
        var offset = annotationAndOffset.Offset;

        if (numPeptides % 100000 == 0)
        {
            // numPeptides is a positive multiple of 100000 here, so the ordinal suffix is always "th".
            Console.Write(@"Processing {0}{1} peptides...", numPeptides, "th");

            sw.Stop();
            var sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
            Console.WriteLine(@"Elapsed Time: {0:f4} sec", sec);
            sw.Reset();
            sw.Start();
        }

        var seqGraph = SequenceGraph.CreateGraph(AminoAcidSet, annotation);
        if (seqGraph == null)
        {
            // No graph could be built for this annotation; skip it.
            continue;
        }

        var numProteoforms = seqGraph.GetNumProteoformCompositions();
        var modCombs = seqGraph.GetModificationCombinations();
        for (var modIndex = 0; modIndex < numProteoforms; modIndex++)
        {
            seqGraph.SetSink(modIndex);
            var protCompositionWithH2O = seqGraph.GetSinkSequenceCompositionWithH2O();
            var sequenceMass = protCompositionWithH2O.Mass;
            var modCombinations = modCombs[modIndex];

            foreach (var ms2ScanNum in ms1Filter.GetMatchingMs2ScanNums(sequenceMass))
            {
                var spec = _run.GetSpectrum(ms2ScanNum) as ProductSpectrum;
                if (spec == null)
                {
                    continue;
                }

                var charge = (int)Math.Round(sequenceMass / (spec.IsolationWindow.IsolationWindowTargetMz - Constants.Proton));
                var scorer = _ms2ScorerFactory.GetMs2Scorer(ms2ScanNum);
                var score = seqGraph.GetFragmentScore(scorer);
                if (score <= 2)
                {
                    // Scores of 2 or less are not recorded.
                    continue;
                }

                var precursorIon = new Ion(protCompositionWithH2O, charge);
                // The annotation carries two leading and two trailing characters around the sequence.
                var sequence = annotation.Substring(2, annotation.Length - 4);
                var pre = annotation[0];
                var post = annotation[annotation.Length - 1];
                var prsm = new DatabaseSequenceSpectrumMatch(sequence, pre, post, ms2ScanNum, offset, 0, modCombinations, precursorIon, score, isDecoy);

                var existingMatches = matches[ms2ScanNum];
                if (existingMatches == null)
                {
                    matches[ms2ScanNum] = new SortedSet<DatabaseSequenceSpectrumMatch> { prsm };
                    continue;
                }

                if (existingMatches.Count < NumMatchesPerSpectrum)
                {
                    existingMatches.Add(prsm);
                    continue;
                }

                // The set is full: the new match must beat the current minimum to get in.
                // SortedSet.Add returns false when the set's comparer treats prsm as equal to
                // an element already present; evict the minimum only when prsm was really
                // inserted, otherwise the set would shrink without gaining the new match.
                if (score > existingMatches.Min.Score && existingMatches.Add(prsm))
                {
                    existingMatches.Remove(existingMatches.Min);
                }
            }
        }
    }

    return matches;
}