public void AddMatch(DatabaseSequenceSpectrumMatch newMatch)
        {
            if (newMatch.Score < _scoreCutoff) return;
            var scanIndex = _ms2ScanToIndexMap[newMatch.ScanNum];
            var modIndex = (newMatch.Modifications == null) ? 0 : newMatch.Modifications.GetNumModifications();
            if (modIndex >= _matchedSet.Length) return;

            // thread safe
            lock (_matchedSet[modIndex])
            {
                if (_matchedSet[modIndex][scanIndex] == null)
                {
                    _matchedSet[modIndex][scanIndex] = new SortedSet<DatabaseSequenceSpectrumMatch> { newMatch };
                }
                else // already exists
                {
                    var existingMatches = _matchedSet[modIndex][scanIndex];
                    var maxScore = existingMatches.Max.Score;
                    if (existingMatches.Count < NumMatchesPerSpectrum && maxScore * ScoreRatioCutoff < newMatch.Score)
                    {
                        existingMatches.Add(newMatch);
                        existingMatches.RemoveWhere(mt => mt.Score < maxScore * ScoreRatioCutoff);
                    }
                    else
                    {
                        var minScore = existingMatches.Min.Score;
                        if (newMatch.Score > minScore)
                        {
                            existingMatches.Add(newMatch);
                            existingMatches.RemoveWhere(mt => mt.Score < maxScore * ScoreRatioCutoff);
                        }
                    }
                }                
            }
        }
Пример #2
0
 public IcBottomUpScores GetScores(DatabaseSequenceSpectrumMatch match, Composition composition, int charge,
     int ms2ScanNum)
 {
     return GetScores(match.Pre, match.Sequence, match.Post, match.NTerm, match.CTerm, composition, charge, ms2ScanNum);
 }
Пример #3
0
        private SortedSet<DatabaseSequenceSpectrumMatch>[] RunSearch(IEnumerable<AnnotationAndOffset> annotationsAndOffsets, ISequenceFilter ms1Filter, bool isDecoy)
        {
            var sw = new Stopwatch();
            var numPeptides = 0;
            sw.Reset();
            sw.Start();

            var matches = new SortedSet<DatabaseSequenceSpectrumMatch>[_run.MaxLcScan + 1];

            // TODO: N-term Met cleavage
            foreach (var annotationAndOffset in annotationsAndOffsets)
            {
                ++numPeptides;

                var annotation = annotationAndOffset.Annotation;
                var offset = annotationAndOffset.Offset;

                if (numPeptides % 100000 == 0)
                {
                    Console.Write(@"Processing {0}{1} peptides...", numPeptides,
                        numPeptides == 1 ? "st" : numPeptides == 2 ? "nd" : numPeptides == 3 ? "rd" : "th");
                    if (numPeptides != 0)
                    {
                        sw.Stop();
                        var sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
                        Console.WriteLine(@"Elapsed Time: {0:f4} sec", sec);
                        sw.Reset();
                        sw.Start();
                    }
                }

                var seqGraph = SequenceGraph.CreateGraph(AminoAcidSet, annotation);
                if (seqGraph == null)
                {
                    //                    Console.WriteLine("Ignoring illegal protein: {0}", annotation);
                    continue;
                }
                

                //var protCompositions = seqGraph.GetSequenceCompositions();
                var numProteoforms = seqGraph.GetNumProteoformCompositions();
                var modCombs = seqGraph.GetModificationCombinations();
                for (var modIndex = 0; modIndex < numProteoforms; modIndex++)
                {
                    seqGraph.SetSink(modIndex);
                    var protCompositionWithH2O = seqGraph.GetSinkSequenceCompositionWithH2O();
                    var sequenceMass = protCompositionWithH2O.Mass;
                    var modCombinations = modCombs[modIndex];

                    foreach (var ms2ScanNum in ms1Filter.GetMatchingMs2ScanNums(sequenceMass))
                    {
                        var spec = _run.GetSpectrum(ms2ScanNum) as ProductSpectrum;
                        if (spec == null) continue;
                        var charge =
                            (int)Math.Round(sequenceMass / (spec.IsolationWindow.IsolationWindowTargetMz - Constants.Proton));
                        var scorer = _ms2ScorerFactory.GetMs2Scorer(ms2ScanNum);
                        var score = seqGraph.GetFragmentScore(scorer);
                        if (score <= 2) continue;

                        var precursorIon = new Ion(protCompositionWithH2O, charge);
                        var sequence = annotation.Substring(2, annotation.Length - 4);
                        var pre = annotation[0];
                        var post = annotation[annotation.Length - 1];
                        var prsm = new DatabaseSequenceSpectrumMatch(sequence, pre, post, ms2ScanNum, offset, 0, modCombinations,
                            precursorIon, score, isDecoy);

                        if (matches[ms2ScanNum] == null)
                        {
                            matches[ms2ScanNum] = new SortedSet<DatabaseSequenceSpectrumMatch> { prsm };
                        }
                        else // already exists
                        {
                            var existingMatches = matches[ms2ScanNum];
                            if (existingMatches.Count < NumMatchesPerSpectrum) existingMatches.Add(prsm);
                            else
                            {
                                var minScore = existingMatches.Min.Score;
                                if (score > minScore)
                                {
                                    existingMatches.Add(prsm);
                                    existingMatches.Remove(existingMatches.Min);
                                }
                            }
                        }
                    }
                }
            }

            return matches;
        }
Пример #4
0
        private DatabaseSequenceSpectrumMatch[] RunGeneratingFunction(SortedSet<DatabaseSequenceSpectrumMatch>[] sortedMatches, CancellationToken? cancellationToken = null, IProgress<ProgressData> progress = null)
        {
            var progData = new ProgressData(progress)
            {
                Status = "Calculating spectral E-values for matches"
            };

            if (_cachedScoreDistributions == null)
            {
                _cachedScoreDistributions = new LinkedList<Tuple<double, ScoreDistribution>>[_run.MaxLcScan + 1];
                foreach (var scanNum in _ms2ScanNums) _cachedScoreDistributions[scanNum] = new LinkedList<Tuple<double, ScoreDistribution>>();
            }
            
            var sw = new Stopwatch();

            var topDownScorer = new InformedTopDownScorer(_run, AminoAcidSet, MinProductIonCharge, MaxProductIonCharge, ProductIonTolerance);

            // Rescore and Estimate #proteins for GF calculation
            var matches = new LinkedList<DatabaseSequenceSpectrumMatch>[sortedMatches.Length];
            long estimatedProteins = 0;
            foreach(var scanNum in _ms2ScanNums)
            {
                var prsms = sortedMatches[scanNum];
                if (prsms == null) continue;
                var spec = _run.GetSpectrum(scanNum) as ProductSpectrum;
                if (spec == null) return null;

                foreach (var match in prsms)
                {
                    var sequence = match.Sequence;
                    var ion = match.Ion;

                    // Re-scoring
                    var scores = topDownScorer.GetScores(spec, sequence, ion.Composition, ion.Charge, scanNum);
                    if (scores == null) continue;
                    
                    match.Score = scores.Score;
                    match.ModificationText = scores.Modifications;
                    match.NumMatchedFragments = scores.NumMatchedFrags;
                    if (match.Score > CompositeScorer.ScoreParam.Cutoff)
                    {
                        if (matches[scanNum] == null) matches[scanNum] = new LinkedList<DatabaseSequenceSpectrumMatch>();
                        matches[scanNum].AddLast(match);
                    }
                }

                if (matches[scanNum] != null) estimatedProteins += matches[scanNum].Count;
            }

            Console.WriteLine(@"Estimated matched proteins: " + estimatedProteins);

            var numProteins = 0;
            var lastUpdate = DateTime.MinValue; // Force original update of 0%
            sw.Reset();
            sw.Start();

            var scanNums = _ms2ScanNums.Where(scanNum => matches[scanNum] != null).ToArray();

            var pfeOptions = new ParallelOptions
            {
                MaxDegreeOfParallelism = MaxNumThreads,
                CancellationToken = cancellationToken ?? CancellationToken.None
            };
            Parallel.ForEach(scanNums, pfeOptions, scanNum =>
            {
                var currentTask = "?";
                try
                {
                    var scoreDistributions = _cachedScoreDistributions[scanNum];
                    foreach (var match in matches[scanNum])
                    {
                        var currentIteration = "for scan " + scanNum + " and mass " + match.Ion.Composition.Mass;
                        currentTask = "Calling GetMs2ScoringGraph " + currentIteration;

                        var graph = _ms2ScorerFactory2.GetMs2ScoringGraph(scanNum, match.Ion.Composition.Mass);
                        if (graph == null) continue;

                        currentTask = "Calling ComputeGeneratingFunction " + currentIteration;

                        var scoreDist = (from distribution in scoreDistributions
                                         where Math.Abs(distribution.Item1 - match.Ion.Composition.Mass) < PrecursorIonTolerance.GetToleranceAsTh(match.Ion.Composition.Mass)
                                         select distribution.Item2).FirstOrDefault();
                        if (scoreDist == null)
                        {
                            var gf = new GeneratingFunction(graph);
                            gf.ComputeGeneratingFunction();
                            scoreDist = gf.GetScoreDistribution();
                            scoreDistributions.AddLast(new Tuple<double, ScoreDistribution>(match.Ion.Composition.Mass, scoreDist));
                        }

                        currentTask = "Calling GetSpectralEValue " + currentIteration + " and score " + (int)match.Score;
                        match.SpecEvalue = scoreDist.GetSpectralEValue(match.Score);

                        currentTask = "Reporting progress " + currentIteration;
                        SearchProgressReport(ref numProteins, ref lastUpdate, estimatedProteins, sw, progData);
                    }
                }
                catch (Exception ex)
                {
                    var errMsg = string.Format("Exception while {0}: {1}", currentTask, ex.Message);
                    Console.WriteLine(errMsg);
                    throw new Exception(errMsg, ex);
                }
            });
            
            var finalMatches = new DatabaseSequenceSpectrumMatch[matches.Length];
            foreach (var scanNum in scanNums)
            {
                finalMatches[scanNum] = matches[scanNum].OrderBy(m => m.SpecEvalue).First();
            }
            
            progData.StatusInternal = string.Empty;
            progData.Report(100.0);
            return finalMatches;
        }
Пример #5
0
        private void WriteResultsToFile(DatabaseSequenceSpectrumMatch[] matches, string outputFilePath, FastaDatabase database)
        {
            using (var writer = new StreamWriter(outputFilePath))
            {
                writer.WriteLine("Scan\tPre\tSequence\tPost\tModifications\tComposition\tProteinName\tProteinDesc" +
                                 "\tProteinLength\tStart\tEnd\tCharge\tMostAbundantIsotopeMz\tMass\t#MatchedFragments\tProbability\tSpecEValue\tEValue");
                
                foreach(var scanNum in _ms2ScanNums)
                {
                    var match = matches[scanNum];
                    if (match == null) continue;

                    var sequence = match.Sequence;
                    var offset = match.Offset;
                    var start = database.GetOneBasedPositionInProtein(offset) + 1 + match.NumNTermCleavages;
                    var end = start + sequence.Length - 1;
                    var proteinName = database.GetProteinName(match.Offset);
                    var protLength = database.GetProteinLength(proteinName);
                    var ion = match.Ion;
                    var proteinDescription = database.GetProteinDescription(match.Offset);
                    var probability = CompositeScorer.GetProbability(match.Score);

                    // Note for DblToString(value, 9, true), by having "9" and "true",
                    // values between 100 and 999 Da will have 7 digits after the decimal place, and
                    // values between 1000 and 9999 will have 6 digits after the decimal place
                    writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}\t{16}\t{17}",
                        scanNum,
                        match.Pre,                 // Pre
                        sequence,                  // Sequence
                        match.Post,                // Post
                        match.ModificationText,    // Modifications
                        ion.Composition,           // Composition
                        proteinName,               // ProteinName
                        proteinDescription,        // ProteinDescription
                        protLength,                // ProteinLength
                        start,                     // Start position in protein
                        end,                       // End position in protein
                        ion.Charge,                // precursorCharge
                        StringUtilities.DblToString(ion.GetMostAbundantIsotopeMz(), 9, true), // MostAbundantIsotopeMz
                        StringUtilities.DblToString(ion.Composition.Mass, 9, true),           // Mass
                        match.NumMatchedFragments,                                          // (Number of matched fragments)
                        StringUtilities.DblToString(probability, 4),                        // Probability
                        StringUtilities.DblToString(ExcelMinValue(match.SpecEvalue), 6, true, 0.001),                             // EValue; will be displayed using scientific notation if the value is less than 0.001
                        StringUtilities.DblToString(ExcelMinValue(match.SpecEvalue * database.GetNumEntries()), 6, true, 0.001)   // SpecEValue; will be displayed using scientific notation if the value is less than 0.001
                        );

                }
            }
        }
Пример #6
0
 private void AddMatch(SortedSet<DatabaseSequenceSpectrumMatch>[] matches, int ms2ScanNum, DatabaseSequenceSpectrumMatch prsm)
 {
     lock (matches)
     {
         if (matches[ms2ScanNum] == null)
         {
             matches[ms2ScanNum] = new SortedSet<DatabaseSequenceSpectrumMatch> {prsm};
         }
         else // already exists
         {
             var existingMatches = matches[ms2ScanNum];
             //var maxScore = existingMatches.Max.Score;
             if (existingMatches.Count < NumMatchesPerSpectrum)
             {
                 //if (!(maxScore*0.7 < prsm.Score)) return;
                 existingMatches.Add(prsm);
             }
             else
             {
                 var minScore = existingMatches.Min.Score;
                 if (!(prsm.Score > minScore)) return;
                 existingMatches.Add(prsm);
                 existingMatches.Remove(existingMatches.Min);
             }
             //if (NumMatchesPerSpectrum > 1) existingMatches.RemoveWhere(mt => mt.Score < maxScore * 0.7);
         }
     }
 }
Пример #7
0
        private void SearchForMatches(AnnotationAndOffset annotationAndOffset,
            ISequenceFilter sequenceFilter, SortedSet<DatabaseSequenceSpectrumMatch>[] matches, int maxNumNTermCleavages, bool isDecoy, CancellationToken? cancellationToken = null)
        {
            var pfeOptions = new ParallelOptions
            {
                MaxDegreeOfParallelism = MaxNumThreads,
                CancellationToken = cancellationToken ?? CancellationToken.None
            };
            
            var annotation = annotationAndOffset.Annotation;
            var offset = annotationAndOffset.Offset;
            //var protein = db.GetProteinName(offset);
            var protSequence = annotation.Substring(2, annotation.Length - 4);
            var seqGraph = SequenceGraph.CreateGraph(AminoAcidSet, AminoAcid.ProteinNTerm, protSequence,
                AminoAcid.ProteinCTerm);

            if (seqGraph == null) return; // No matches will be found without a sequence graph.

            for (var numNTermCleavages = 0; numNTermCleavages <= maxNumNTermCleavages; numNTermCleavages++)
            {
                if (numNTermCleavages > 0) seqGraph.CleaveNTerm();
                var numProteoforms = seqGraph.GetNumProteoformCompositions();
                var modCombs = seqGraph.GetModificationCombinations();
                for (var modIndex = 0; modIndex < numProteoforms; modIndex++)
                {
                    seqGraph.SetSink(modIndex);
                    var protCompositionWithH2O = seqGraph.GetSinkSequenceCompositionWithH2O();
                    var sequenceMass = protCompositionWithH2O.Mass;

                    if (sequenceMass < MinSequenceMass || sequenceMass > MaxSequenceMass) continue;

                    var modCombinations = modCombs[modIndex];
                    var ms2ScanNums = this.ScanNumbers ?? sequenceFilter.GetMatchingMs2ScanNums(sequenceMass);

                    Parallel.ForEach(ms2ScanNums, pfeOptions, ms2ScanNum =>
                    {
                        if (ms2ScanNum > _ms2ScanNums.Last() || ms2ScanNum < _ms2ScanNums.First()) return;
                        
                        var scorer = _ms2ScorerFactory2.GetMs2Scorer(ms2ScanNum);
                        var score = seqGraph.GetFragmentScore(scorer);
                        var isoTargetMz = _isolationWindowTargetMz[ms2ScanNum];
                        if (!(isoTargetMz > 0)) return;
                        var charge = (int)Math.Round(sequenceMass / (isoTargetMz - Constants.Proton));

                        var precursorIon = new Ion(protCompositionWithH2O, charge);
                        var sequence = protSequence.Substring(numNTermCleavages);
                        var pre = numNTermCleavages == 0 ? annotation[0] : annotation[numNTermCleavages + 1];
                        var post = annotation[annotation.Length - 1];
                        var prsm = new DatabaseSequenceSpectrumMatch(sequence, pre, post, ms2ScanNum, offset, numNTermCleavages,
                            modCombinations, precursorIon, score, isDecoy);
                        
                        AddMatch(matches, ms2ScanNum, prsm);
                    });
                }
            }
        }
Пример #8
0
        private void RunTagBasedSearch(SortedSet<DatabaseSequenceSpectrumMatch>[] matches, FastaDatabase db,
                                        CancellationToken? cancellationToken = null, IProgress<ProgressData> progress = null)
        {
            _tagSearchEngine.SetDatabase(db);

            //var ms2ScanNums = _run.GetScanNumbers(2);
            var progData = new ProgressData(progress)
            {
                Status = "Tag-based Searching for matches"
            };

            var sw = new Stopwatch();

            long estimatedProteins = _tagMs2ScanNum.Length;
            Console.WriteLine(@"Number of spectra containing sequence tags: " + estimatedProteins);
            var numProteins = 0;
            var lastUpdate = DateTime.MinValue; // Force original update of 0%

            sw.Reset();
            sw.Start();

            var pfeOptions = new ParallelOptions
            {
                MaxDegreeOfParallelism = MaxNumThreads,
                CancellationToken = cancellationToken ?? CancellationToken.None
            };

            Parallel.ForEach(_tagMs2ScanNum, pfeOptions, ms2ScanNum =>
            {
                var tagSeqMatches = _tagSearchEngine.RunSearch(ms2ScanNum);

                foreach (var tagSequenceMatch in tagSeqMatches)
                {
                    var offset = _tagSearchEngine.FastaDatabase.GetOffset(tagSequenceMatch.ProteinName);
                    if (offset == null) continue;

                    var sequence = tagSequenceMatch.Sequence;
                    var numNTermCleavages = tagSequenceMatch.TagMatch.StartIndex;

                    var seqObj = Sequence.CreateSequence(sequence, tagSequenceMatch.TagMatch.ModificationText, AminoAcidSet);
                    var precursorIon = new Ion(seqObj.Composition + Composition.H2O, tagSequenceMatch.TagMatch.Charge);

                    var prsm = new DatabaseSequenceSpectrumMatch(sequence, tagSequenceMatch.Pre, tagSequenceMatch.Post,
                                                                 ms2ScanNum, (long)offset, numNTermCleavages,
                                                                 tagSequenceMatch.TagMatch.Modifications,
                                                                 precursorIon, tagSequenceMatch.TagMatch.Score, db.IsDecoy)
                    {
                        ModificationText = tagSequenceMatch.TagMatch.ModificationText,
                    };
                    
                    AddMatch(matches, ms2ScanNum, prsm);    
                }
                
                SearchProgressReport(ref numProteins, ref lastUpdate, estimatedProteins, sw, progData, "spectra");
            });

            Console.WriteLine(@"Collected candidate matches: {0}", GetNumberOfMatches(matches));

            progData.StatusInternal = string.Empty;
            progData.Report(100.0);
        }
Пример #9
0
 public IcBottomUpScores GetScores(DatabaseSequenceSpectrumMatch match, Composition composition, int charge,
                                   int ms2ScanNum)
 {
     return(GetScores(match.Pre, match.Sequence, match.Post, match.NTerm, match.CTerm, composition, charge, ms2ScanNum));
 }
Пример #10
0
        private SortedSet <DatabaseSequenceSpectrumMatch>[] RunSearch(IEnumerable <AnnotationAndOffset> annotationsAndOffsets, ISequenceFilter ms1Filter, bool isDecoy)
        {
            var sw          = new Stopwatch();
            var numPeptides = 0;

            sw.Reset();
            sw.Start();

            var matches = new SortedSet <DatabaseSequenceSpectrumMatch> [_run.MaxLcScan + 1];

            // TODO: N-term Met cleavage
            foreach (var annotationAndOffset in annotationsAndOffsets)
            {
                ++numPeptides;

                var annotation = annotationAndOffset.Annotation;
                var offset     = annotationAndOffset.Offset;

                if (numPeptides % 100000 == 0)
                {
                    Console.Write(@"Processing {0}{1} peptides...", numPeptides,
                                  numPeptides == 1 ? "st" : numPeptides == 2 ? "nd" : numPeptides == 3 ? "rd" : "th");
                    if (numPeptides != 0)
                    {
                        sw.Stop();
                        var sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
                        Console.WriteLine(@"Elapsed Time: {0:f4} sec", sec);
                        sw.Reset();
                        sw.Start();
                    }
                }

                var seqGraph = SequenceGraph.CreateGraph(AminoAcidSet, annotation);
                if (seqGraph == null)
                {
                    //                    Console.WriteLine("Ignoring illegal protein: {0}", annotation);
                    continue;
                }

                //var protCompositions = seqGraph.GetSequenceCompositions();
                var numProteoforms = seqGraph.GetNumProteoformCompositions();
                var modCombs       = seqGraph.GetModificationCombinations();
                for (var modIndex = 0; modIndex < numProteoforms; modIndex++)
                {
                    seqGraph.SetSink(modIndex);
                    var protCompositionWithH2O = seqGraph.GetSinkSequenceCompositionWithH2O();
                    var sequenceMass           = protCompositionWithH2O.Mass;
                    var modCombinations        = modCombs[modIndex];

                    foreach (var ms2ScanNum in ms1Filter.GetMatchingMs2ScanNums(sequenceMass))
                    {
                        var spec = _run.GetSpectrum(ms2ScanNum) as ProductSpectrum;
                        if (spec == null)
                        {
                            continue;
                        }
                        var charge =
                            (int)Math.Round(sequenceMass / (spec.IsolationWindow.IsolationWindowTargetMz - Constants.Proton));
                        var scorer = _ms2ScorerFactory.GetMs2Scorer(ms2ScanNum);
                        var score  = seqGraph.GetFragmentScore(scorer);
                        if (score <= 2)
                        {
                            continue;
                        }

                        var precursorIon = new Ion(protCompositionWithH2O, charge);
                        var sequence     = annotation.Substring(2, annotation.Length - 4);
                        var pre          = annotation[0];
                        var post         = annotation[annotation.Length - 1];
                        var prsm         = new DatabaseSequenceSpectrumMatch(sequence, pre, post, ms2ScanNum, offset, 0, modCombinations,
                                                                             precursorIon, score, isDecoy);

                        if (matches[ms2ScanNum] == null)
                        {
                            matches[ms2ScanNum] = new SortedSet <DatabaseSequenceSpectrumMatch> {
                                prsm
                            };
                        }
                        else // already exists
                        {
                            var existingMatches = matches[ms2ScanNum];
                            if (existingMatches.Count < NumMatchesPerSpectrum)
                            {
                                existingMatches.Add(prsm);
                            }
                            else
                            {
                                var minScore = existingMatches.Min.Score;
                                if (score > minScore)
                                {
                                    existingMatches.Add(prsm);
                                    existingMatches.Remove(existingMatches.Min);
                                }
                            }
                        }
                    }
                }
            }

            return(matches);
        }