/// <summary>
/// Creates a rescorer for one spectrum file and immediately rescores a result file:
/// the run is loaded through <c>InMemoryLcMsRun.GetLcMsRun</c>, an
/// <c>InformedTopDownScorer</c> is built on top of it, and <c>Rescore</c> is invoked
/// before the constructor returns.
/// </summary>
/// <param name="specFilePath">Path of the spectrum file passed to <c>InMemoryLcMsRun.GetLcMsRun</c>.</param>
/// <param name="icResultFilePath">Path of the result file, forwarded as the first argument of <c>Rescore</c>.</param>
/// <param name="outputFilePath">Path of the output file, forwarded as the second argument of <c>Rescore</c>.</param>
/// <param name="aaSet">Amino acid set handed to the scorer.</param>
/// <param name="tolerance">Tolerance handed to the scorer.</param>
/// <param name="ms2CorrThreshold">Correlation threshold handed to the scorer (default 0.7).</param>
/// <param name="minProductIonCharge">Minimum product ion charge handed to the scorer (default 1).</param>
/// <param name="maxProductIonCharge">Maximum product ion charge handed to the scorer (default 10).</param>
public IcRescorer(
    string specFilePath,
    string icResultFilePath,
    string outputFilePath,
    AminoAcidSet aaSet,
    Tolerance tolerance,
    double ms2CorrThreshold = 0.7,
    int minProductIonCharge = 1,
    int maxProductIonCharge = 10)
{
    // The run is loaded first (it is the first constructor argument to be evaluated),
    // then the scorer is created over it.
    _topDownScorer = new InformedTopDownScorer(
        InMemoryLcMsRun.GetLcMsRun(specFilePath, 1.4826, 1.4826),
        aaSet,
        minProductIonCharge,
        maxProductIonCharge,
        tolerance,
        ms2CorrThreshold);

    Rescore(icResultFilePath, outputFilePath);
}
/// <summary>
/// Scores one hard-coded protein sequence against scan 5448 (charge 11) of a local PBF
/// file with an <c>InformedTopDownScorer</c> and prints the total score and the number
/// of matched fragments.
/// </summary>
/// <remarks>
/// The spectrum file sits at a machine-specific path, so the test is reported as ignored
/// (instead of failing while opening the run) when that file is not present.
/// </remarks>
public void TestRescoring()
{
    //const string specFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw";
    const string specFilePath = @"D:\MassSpecFiles\training\raw\QC_Shew_Intact_26Sep14_Bane_C2Column3.pbf";

    // Skip when the data file is unavailable. File is fully qualified so that this
    // check does not depend on a "using System.IO" directive.
    if (!System.IO.File.Exists(specFilePath))
    {
        Assert.Ignore(@"Skipping test TestRescoring since file not found: " + specFilePath);
    }

    //const string sequence = "SGWYELSKSSNDQFKFVLKAGNGEVILTSELYTGKSGAMNGIESVQTNSPIEARYAKEVAKNDKPYFNLKAANHQIIGTSQMYSSTA";
    //const int scanNum = 4084;
    const string sequence = "SKTKHPLPEQWQKNQEAAKATQVAFDLDEKFQYSIRKAALDAGVSPSDQIRTILGLSVSRRPTRPRLTVSLNADDYVQLAEKYDLNADAQLEIKRRVLEDLVRFVAED";
    const int scanNum = 5448;
    const int charge = 11;

    // Configure amino acid set
    var acetylN = new SearchModification(Modification.Acetylation, '*', SequenceLocation.ProteinNTerm, false);
    var oxM = new SearchModification(Modification.Oxidation, 'M', SequenceLocation.Everywhere, false);
    var dehydroC = new SearchModification(Modification.Dehydro, 'C', SequenceLocation.Everywhere, false);
    var glutathioneC = new SearchModification(Modification.Glutathione, 'C', SequenceLocation.Everywhere, false);

    const int numMaxModsPerProtein = 4;
    var searchModifications = new List<SearchModification>
    {
        dehydroC,
        glutathioneC,
        oxM,
        acetylN,
    };
    var aaSet = new AminoAcidSet(searchModifications, numMaxModsPerProtein);

    // Composition of the sequence plus one water.
    var composition = aaSet.GetComposition(sequence) + Composition.H2O;

    var run = PbfLcMsRun.GetLcMsRun(specFilePath, 0, 0);
    var informedScorer = new InformedTopDownScorer(run, aaSet, 1, 15, new Tolerance(10));
    var scores = informedScorer.GetScores(AminoAcid.ProteinNTerm, sequence, AminoAcid.ProteinCTerm, composition, charge, scanNum);

    Console.WriteLine("Total Score = " + scores.Score);
    Console.WriteLine("#Fragments = " + scores.NumMatchedFrags);
}
/// <summary>
/// Rescores the candidate matches of every MS2 scan, keeps those whose new score exceeds
/// <c>CompositeScorer.ScoreParam.Cutoff</c>, computes a spectral E-value for each kept
/// match from a generating function, and picks the match with the smallest E-value per scan.
/// </summary>
/// <param name="sortedMatches">Candidate matches indexed by scan number; null entries are skipped.</param>
/// <param name="cancellationToken">Optional token given to the parallel loop; <c>CancellationToken.None</c> is used when null.</param>
/// <param name="progress">Optional progress sink, wrapped in a <c>ProgressData</c>.</param>
/// <returns>
/// An array as long as <paramref name="sortedMatches"/> that holds, at the index of each scan
/// with at least one kept match, the kept match with the lowest spectral E-value; or
/// <c>null</c> when the spectrum of a scan that has candidates is not a <c>ProductSpectrum</c>.
/// </returns>
private DatabaseSequenceSpectrumMatch[] RunGeneratingFunction(SortedSet<DatabaseSequenceSpectrumMatch>[] sortedMatches, CancellationToken? cancellationToken = null, IProgress<ProgressData> progress = null)
{
    var progData = new ProgressData(progress);
    progData.Status = "Calculating spectral E-values for matches";

    // Create the per-scan score distribution cache on first use (one list per MS2 scan).
    if (_cachedScoreDistributions == null)
    {
        _cachedScoreDistributions = new LinkedList<Tuple<double, ScoreDistribution>>[_run.MaxLcScan + 1];
        foreach (var ms2ScanNum in _ms2ScanNums)
        {
            _cachedScoreDistributions[ms2ScanNum] = new LinkedList<Tuple<double, ScoreDistribution>>();
        }
    }

    var stopwatch = new Stopwatch();
    var rescorer = new InformedTopDownScorer(_run, AminoAcidSet, MinProductIonCharge, MaxProductIonCharge, ProductIonTolerance);

    // Rescore and estimate #proteins for GF calculation
    var retained = new LinkedList<DatabaseSequenceSpectrumMatch>[sortedMatches.Length];
    long estimatedProteins = 0;
    foreach (var scanNum in _ms2ScanNums)
    {
        var candidates = sortedMatches[scanNum];
        if (candidates == null)
        {
            continue;
        }

        var spectrum = _run.GetSpectrum(scanNum) as ProductSpectrum;
        if (spectrum == null)
        {
            // A scan with candidates but no product spectrum aborts the whole computation.
            return null;
        }

        foreach (var candidate in candidates)
        {
            var sequence = candidate.Sequence;
            var precursorIon = candidate.Ion;

            // Re-scoring
            var rescored = rescorer.GetScores(spectrum, sequence, precursorIon.Composition, precursorIon.Charge, scanNum);
            if (rescored == null)
            {
                continue;
            }

            candidate.Score = rescored.Score;
            candidate.ModificationText = rescored.Modifications;
            candidate.NumMatchedFragments = rescored.NumMatchedFrags;

            // Only matches strictly above the cutoff go on to the E-value stage.
            if (!(candidate.Score > CompositeScorer.ScoreParam.Cutoff))
            {
                continue;
            }

            if (retained[scanNum] == null)
            {
                retained[scanNum] = new LinkedList<DatabaseSequenceSpectrumMatch>();
            }

            retained[scanNum].AddLast(candidate);
        }

        var keptForScan = retained[scanNum];
        if (keptForScan != null)
        {
            estimatedProteins += keptForScan.Count;
        }
    }

    Console.WriteLine(@"Estimated matched proteins: " + estimatedProteins);

    var numProteins = 0;
    var lastUpdate = DateTime.MinValue; // Force original update of 0%
    stopwatch.Restart();

    var scansWithMatches = _ms2ScanNums.Where(s => retained[s] != null).ToArray();

    var parallelOptions = new ParallelOptions();
    parallelOptions.MaxDegreeOfParallelism = MaxNumThreads;
    parallelOptions.CancellationToken = cancellationToken ?? CancellationToken.None;

    Parallel.ForEach(scansWithMatches, parallelOptions, scanNum =>
    {
        // Describes the step in progress so a failure can be reported with context.
        var currentTask = "?";
        try
        {
            var cachedDistributions = _cachedScoreDistributions[scanNum];
            foreach (var match in retained[scanNum])
            {
                var precursorMass = match.Ion.Composition.Mass;
                var currentIteration = "for scan " + scanNum + " and mass " + precursorMass;

                currentTask = "Calling GetMs2ScoringGraph " + currentIteration;
                var graph = _ms2ScorerFactory2.GetMs2ScoringGraph(scanNum, precursorMass);
                if (graph == null)
                {
                    continue;
                }

                currentTask = "Calling ComputeGeneratingFunction " + currentIteration;

                // Reuse the first cached distribution whose mass lies within the precursor tolerance.
                var scoreDist = cachedDistributions
                    .Where(cached => Math.Abs(cached.Item1 - precursorMass) < PrecursorIonTolerance.GetToleranceAsTh(precursorMass))
                    .Select(cached => cached.Item2)
                    .FirstOrDefault();

                if (scoreDist == null)
                {
                    var gf = new GeneratingFunction(graph);
                    gf.ComputeGeneratingFunction();
                    scoreDist = gf.GetScoreDistribution();
                    cachedDistributions.AddLast(new Tuple<double, ScoreDistribution>(precursorMass, scoreDist));
                }

                currentTask = "Calling GetSpectralEValue " + currentIteration + " and score " + (int)match.Score;
                match.SpecEvalue = scoreDist.GetSpectralEValue(match.Score);

                currentTask = "Reporting progress " + currentIteration;
                SearchProgressReport(ref numProteins, ref lastUpdate, estimatedProteins, stopwatch, progData);
            }
        }
        catch (Exception ex)
        {
            var errMsg = string.Format("Exception while {0}: {1}", currentTask, ex.Message);
            Console.WriteLine(errMsg);
            throw new Exception(errMsg, ex);
        }
    });

    // For each scan keep the match with the smallest spectral E-value.
    var bestPerScan = new DatabaseSequenceSpectrumMatch[retained.Length];
    foreach (var scanNum in scansWithMatches)
    {
        bestPerScan[scanNum] = retained[scanNum].OrderBy(kept => kept.SpecEvalue).First();
    }

    progData.StatusInternal = string.Empty;
    progData.Report(100.0);
    return bestPerScan;
}
/// <summary>
/// Rescores the target and decoy result files of one data set: for every row the sequence is
/// rescored with an <c>InformedTopDownScorer</c>, a spectral E-value is computed from a
/// generating function over the deconvoluted spectrum, and the first 15 columns of the row are
/// written with the new score and E-value appended to a "_Rescored.tsv" file.
/// </summary>
/// <remarks>
/// The test is ignored when the spectrum file or either result file is missing. Numbers are
/// parsed and written with the invariant culture so the TSV content does not depend on the
/// regional settings of the machine.
/// </remarks>
public void TestCompositeScoring()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    //const string rawFilePath = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\SpecFiles\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw";
    const string rawFilePath = @"D:\MassSpecFiles\training\raw\QC_Shew_Intact_26Sep14_Bane_C2Column3.pbf";

    if (!File.Exists(rawFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
    }

    // Check every result file up front so the test is skipped before any output is written.
    const string resultFileFormat = @"D:\MassSpecFiles\training\Rescoring\QC_Shew_Intact_26Sep14_Bane_C2Column3_{0}.tsv";
    var fileExt = new string[] {"IcTarget", "IcDecoy"};
    foreach (var ext in fileExt)
    {
        var requiredFile = string.Format(resultFileFormat, ext);
        if (!File.Exists(requiredFile))
        {
            Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, requiredFile);
        }
    }

    // Configure amino acid set
    var oxM = new SearchModification(Modification.Oxidation, 'M', SequenceLocation.Everywhere, false);
    var dehydroC = new SearchModification(Modification.Dehydro, 'C', SequenceLocation.Everywhere, false);
    var acetylN = new SearchModification(Modification.Acetylation, '*', SequenceLocation.ProteinNTerm, false);

    const int numMaxModsPerProtein = 4;
    var searchModifications = new List<SearchModification>
    {
        dehydroC,
        oxM,
        acetylN
    };
    var aaSet = new AminoAcidSet(searchModifications, numMaxModsPerProtein);
    var comparer = new FilteredProteinMassBinning(aaSet, 50000, 28);

    var run = PbfLcMsRun.GetLcMsRun(rawFilePath);
    const double filteringWindowSize = 1.1;
    const int isotopeOffsetTolerance = 2;
    var tolerance = new Tolerance(10);
    const int minCharge = 1;
    const int maxCharge = 20;
    var graphFactory = new ProteinScoringGraphFactory(comparer, aaSet);
    var aminoAcidSet = new AminoAcidSet();

    //var scorer = new MatchedPeakPostScorer(tolerance, minCharge, maxCharge);
    var scorer = new InformedTopDownScorer(run, aminoAcidSet, minCharge, maxCharge, tolerance);

    // Number of leading columns copied from the input header and rows.
    const int numColumnsToKeep = 15;

    // Fully qualified so no additional using directive is required.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    foreach (var ext in fileExt)
    {
        var resultFileName = string.Format(resultFileFormat, ext);
        var parser = new TsvFileParser(resultFileName);
        var scans = parser.GetData("Scan").Select(s => Convert.ToInt32(s, invariant)).ToArray();
        var charges = parser.GetData("Charge").Select(s => Convert.ToInt32(s, invariant)).ToArray();
        var protSequences = parser.GetData("Sequence").ToArray();
        var modStrs = parser.GetData("Modifications").ToArray();
        var compositions = parser.GetData("Composition").Select(Composition.Parse).ToArray();
        var protMass = parser.GetData("Mass").Select(s => Convert.ToDouble(s, invariant)).ToArray();

        // Fetched once instead of once per parallel iteration.
        var rows = parser.GetRows();

        var outputFileName = string.Format(@"D:\MassSpecFiles\training\Rescoring\QC_Shew_Intact_26Sep14_Bane_C2Column3_{0}_Rescored.tsv", ext);

        using (var writer = new StreamWriter(outputFileName))
        {
            writer.WriteLine(string.Join("\t", parser.GetHeaders().ToArray(), 0, numColumnsToKeep) + "\tScore\tEValue");

            // Rows are computed in parallel into their own slot and written afterwards,
            // which keeps the output in input order.
            var lines = new string[parser.NumData];

            //for (var i = 0; i < parser.NumData; i++)
            Parallel.For(0, parser.NumData, i =>
            {
                var scan = scans[i];
                var charge = charges[i];
                var protSequence = protSequences[i];
                var modStr = modStrs[i];
                var sequence = Sequence.CreateSequence(protSequence, modStr, aminoAcidSet);

                // The parsed sequence must agree with the composition column minus one water.
                Assert.True(sequence.Composition.Equals(compositions[i] - Composition.H2O));
                var ms2Spec = run.GetSpectrum(scan) as ProductSpectrum;
                Assert.True(ms2Spec != null);
                var scores = scorer.GetScores(sequence, charge, scan);

                var deconvSpec = Deconvoluter.GetDeconvolutedSpectrum(ms2Spec, minCharge, maxCharge, isotopeOffsetTolerance, filteringWindowSize, tolerance, 0.7);
                var deconvScorer = new CompositeScorerBasedOnDeconvolutedSpectrum(deconvSpec, ms2Spec, tolerance, comparer);
                var graph = graphFactory.CreateScoringGraph(deconvScorer, protMass[i]);

                var gf = new GeneratingFunction(graph);
                gf.ComputeGeneratingFunction();

                var specEvalue = gf.GetSpectralEValue(scores.Score);

                var items = rows[i].Split('\t');
                var newRowStr = string.Join("\t", items, 0, numColumnsToKeep);

                //writer.WriteLine("{0}\t{1}\t{2}", newRowStr, scores.Score, specEvalue);

                lock (lines)
                {
                    lines[i] = string.Format(invariant, "{0}\t{1}\t{2}", newRowStr, scores.Score, specEvalue);
                }
                //Console.WriteLine("{0}\t{1}\t{2}", items[0], scores.Score, specEvalue);
            });

            foreach (var line in lines)
            {
                writer.WriteLine(line);
            }
        }

        Console.WriteLine("Done");
    }
}
/// <summary>
/// Smoke test: loads a PBF run and asks an <c>InformedTopDownScorer</c> to score one fixed
/// sequence at charge 6 against scan 1289. The returned scores are not inspected.
/// The test is ignored when the spectrum file cannot be found.
/// </summary>
public void TestSumMs2Spectra()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string specFilePath = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\TestYufengData\NewQC_LongSep_29Sep14_141001104925.raw";
    if (!File.Exists(specFilePath))
    {
        Assert.Ignore(@"Skipping test " + methodName + @" since file not found: " + specFilePath);
    }

    const int minScanNum = 1289;
    //const int maxScanNum = 1389;
    const int minCharge = 6;
    //const int maxCharge = 6;
    const string sequence = "EIRGYRPPEPYKGKGVRYDDEEVRRKEAKKK";

    // Product ion charges range from 1 up to one below the precursor charge.
    const int minProductCharge = 1;
    const int maxProductCharge = minCharge - 1;

    var aminoAcids = new AminoAcidSet();
    var lcmsRun = PbfLcMsRun.GetLcMsRun(specFilePath);
    var productTolerance = new Tolerance(10);
    var topDownScorer = new InformedTopDownScorer(lcmsRun, aminoAcids, minProductCharge, maxProductCharge, productTolerance);

    topDownScorer.GetScores(
        AminoAcid.ProteinNTerm,
        sequence,
        AminoAcid.ProteinCTerm,
        Composition.Parse("C(166) H(270) N(52) O(49) S(0)"),
        minCharge,
        minScanNum);
}
/// <summary>
/// Builds a sequence graph for one annotated protein with an N-terminal acetylation and an
/// "AddVal" modification allowed on every standard residue (at most one modification per
/// protein), then, for every proteoform composition of the graph, prints the fast fragment
/// score and the scores returned by an <c>InformedTopDownScorer</c> for MS2 scan 19011 at charge 7.
/// The test is ignored when the spectrum file cannot be found.
/// </summary>
public void TestPrSm()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    //const string specFilePath = @"C:\cygwin\home\kims336\Data\TopDownYufeng\raw\yufeng_column_test2.raw";
    //const string annotation =
    //    "_.MKTKLSVLSAAMLAATLTMMPAVSQAAIPQSVEGQSIPSLAPMLERTTPAVVSVAVSGTHVSKQRVPDVFRYFFGPNAPQEQVQERPFRGLGSGVIIDADKGYIVTNNHVIDGADDIQVG" +
    //    "LHDGREVKAKLIGTDSESDIALLQIEAKNLVAIKTSDSDELRVGDFAVAIGNPFGLGQTV" +
    //    "TSGIVSALGRSGLGIEMLENFIQTDAAINSGNSGGALVNLKGELIGINTAIVAPNGGNVG" +
    //    "IGFAIPANMVKNLIAQIAEHGEVRRGVLGIAGRDLDSQLAQGFGLDTQHGGFVNEVSAGS" +
    //    "AAEKAGIKAGDIIVSVDGRAIKSFQELRAKVATMGAGAKVELGLIRDGDKKTVNVTLGEA" +
    //    "NQTTEKAAGAVHPMLQGASLENASKGVEITDVAQGSPAAMSGLQKGDLIVGINRTAVKDL" +
    //    "KSLKELLKDQEGAVALKIVRGKSMLYLVLR._";
    //var aaSet = new AminoAcidSet();
    //const int charge = 60;
    //const int ms2ScanNum = 46661;

    const string specFilePath = @"D:\Research\Data\Jon\AH_SF_mouseliver_3-1_Intact_2_6Feb14_Bane_PL011402.raw";

    if (!File.Exists(specFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, specFilePath);
    }

    const int ms2ScanNum = 19011;
    const int charge = 7;
    const string annotation = "_.SKVSFKITLTSDPRLPYKVLSVPESTPFTAVLKFAAEEFKVPAATSAIITNDGIGINPAQTAGNVFLKHGSELRIIPRDRVGSC._";

    var acetylN = new SearchModification(Modification.Acetylation, '*', SequenceLocation.ProteinNTerm, true);
    var modVal = Modification.RegisterAndGetModification("AddVal", new Composition(5, 9, 1, 1, 0));

    // One "AddVal" search modification per standard residue, plus the N-terminal acetylation.
    var searchMods = AminoAcid.StandardAminoAcidCharacters.Select(residue => new SearchModification(modVal, residue, SequenceLocation.Everywhere, false)).ToList();
    searchMods.Add(acetylN);
    const int numMaxModsPerProtein = 1;
    var aaSet = new AminoAcidSet(searchMods, numMaxModsPerProtein);

    var graph = SequenceGraph.CreateGraph(aaSet, annotation);
    Console.WriteLine("NumProteoforms: " + graph.GetNumProteoformCompositions());

    var run = InMemoryLcMsRun.GetLcMsRun(specFilePath, 1.4826, 1.4826);
    var ms2Scorer = new ProductScorerBasedOnDeconvolutedSpectra(run, 1, 15);
    ms2Scorer.GetScorer(ms2ScanNum);
    var scorer = ms2Scorer.GetMs2Scorer(ms2ScanNum);
    Assert.NotNull(scorer, "Scorer is null!");

    // Neither the informed scorer nor the bare sequence depends on the loop variable,
    // so both are created once here instead of once per proteoform.
    var informedScorer = new InformedTopDownScorer(run, aaSet, 1, 30, new Tolerance(10));
    var sequence = SimpleStringProcessing.GetStringBetweenDots(annotation);

    for (var i = 0; i < graph.GetNumProteoformCompositions(); i++)
    {
        // Select the i-th proteoform composition before querying the graph.
        graph.SetSink(i);
        Console.WriteLine("ModComb: " + graph.GetModificationCombinations()[i]);
        var score = graph.GetFragmentScore(scorer);
        Console.WriteLine("Fast search score: " + score);
        var composition = graph.GetSinkSequenceCompositionWithH2O();

        var refinedScore = informedScorer.GetScores(AminoAcid.ProteinNTerm, sequence, AminoAcid.ProteinCTerm, composition, charge, ms2ScanNum);
        Console.WriteLine("Modifications: {0}", refinedScore.Modifications);
        Console.WriteLine("Composition: {0}", composition);
        Console.WriteLine("RefinedScores: {0}", refinedScore);
    }
}