/// <summary>
/// Initializes static members of the <see cref="MsgfPlusSequenceReader"/> class.
/// The static AminoAcidSet member is set to a default-constructed amino acid set.
/// </summary>
static MsgfPlusSequenceReader() => AminoAcidSet = new AminoAcidSet();
// Scores one annotated protein against a single MS2 scan of a local raw file.
// For every proteoform composition of the sequence graph it prints the fast
// fragment score and the scores returned by InformedTopDownScorer.
// The test is ignored when the raw file is not present on this machine.
public void TestPrSm()
{
    var testName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(testName);

    const string specFilePath = @"D:\Research\Data\Jon\AH_SF_mouseliver_3-1_Intact_2_6Feb14_Bane_PL011402.raw";
    if (!File.Exists(specFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, specFilePath);
    }

    const int ms2ScanNum = 19011;
    const int charge = 7;
    const string annotation = "_.SKVSFKITLTSDPRLPYKVLSVPESTPFTAVLKFAAEEFKVPAATSAIITNDGIGINPAQTAGNVFLKHGSELRIIPRDRVGSC._";

    // Search modifications: the custom "AddVal" modification on every standard
    // residue, followed by protein N-terminal acetylation.
    var nTermAcetylation = new SearchModification(Modification.Acetylation, '*', SequenceLocation.ProteinNTerm, true);
    var addVal = Modification.RegisterAndGetModification("AddVal", new Composition(5, 9, 1, 1, 0));

    var searchMods = new List<SearchModification>();
    foreach (var residue in AminoAcid.StandardAminoAcidCharacters)
    {
        searchMods.Add(new SearchModification(addVal, residue, SequenceLocation.Everywhere, false));
    }

    searchMods.Add(nTermAcetylation);

    const int numMaxModsPerProtein = 1;
    var aaSet = new AminoAcidSet(searchMods, numMaxModsPerProtein);

    var graph = SequenceGraph.CreateGraph(aaSet, annotation);
    Console.WriteLine("NumProteoforms: " + graph.GetNumProteoformCompositions());

    var run = InMemoryLcMsRun.GetLcMsRun(specFilePath, 1.4826, 1.4826);
    var scorerFactory = new ProductScorerBasedOnDeconvolutedSpectra(run, 1, 15);
    scorerFactory.GetScorer(ms2ScanNum);
    var scorer = scorerFactory.GetMs2Scorer(ms2ScanNum);
    Assert.NotNull(scorer, "Scorer is null!");

    var proteoformIndex = 0;
    while (proteoformIndex < graph.GetNumProteoformCompositions())
    {
        // Select the proteoform, then report the fast graph-based score.
        graph.SetSink(proteoformIndex);
        Console.WriteLine("ModComb: " + graph.GetModificationCombinations()[proteoformIndex]);

        var fastScore = graph.GetFragmentScore(scorer);
        Console.WriteLine("Fast search score: " + fastScore);

        // Re-score the same proteoform with the informed scorer.
        var composition = graph.GetSinkSequenceCompositionWithH2O();
        var informedScorer = new InformedTopDownScorer(run, aaSet, 1, 30, new Tolerance(10));
        var cleanSequence = SimpleStringProcessing.GetStringBetweenDots(annotation);
        var refinedScore = informedScorer.GetScores(
            AminoAcid.ProteinNTerm,
            cleanSequence,
            AminoAcid.ProteinCTerm,
            composition,
            charge,
            ms2ScanNum);

        Console.WriteLine("Modifications: {0}", refinedScore.Modifications);
        Console.WriteLine("Composition: {0}", composition);
        Console.WriteLine("RefinedScores: {0}", refinedScore);

        proteoformIndex++;
    }
}
/// <summary>
/// Parse a protein/peptide sequence in the LCMSSpectator style.
/// </summary>
/// <remarks>
/// The text is tokenized into single standard amino acid characters and bracketed
/// modification names such as <c>[Name]</c>. Every modification is attached to the most
/// recently read amino acid; modifications that appear before the first amino acid are
/// attached to that first amino acid. Characters matching neither token pattern are skipped.
/// When annotation trimming is enabled the input is expected to look like <c>X.SEQUENCE.X</c>:
/// the text up to and including the first '.' and the final character are removed, then
/// everything from the last remaining '.' onwards is removed.
/// </remarks>
/// <param name="sequence">The sequence as a string.</param>
/// <returns>
/// The parsed sequence. An empty sequence is returned when the input is null or empty
/// (before or after trimming); null is returned when the text contains no amino acid or
/// modification token at all.
/// </returns>
/// <exception cref="FormatException">
/// An amino acid character or a modification name could not be resolved.
/// </exception>
public Sequence Read(string sequence)
{
    // Guard first: previously a null input threw NullReferenceException when
    // trimming was enabled, but produced an empty sequence when it was disabled.
    if (string.IsNullOrEmpty(sequence))
    {
        return new Sequence(new List<AminoAcid>());
    }

    if (this.trimAnnotations)
    {
        var firstIndex = sequence.IndexOf('.');
        if (firstIndex >= 0)
        {
            // Drops the prefix through the first '.' and also the final character.
            var index = Math.Min(firstIndex + 1, sequence.Length - 1);
            sequence = sequence.Substring(index, sequence.Length - index - 1);
        }

        var lastIndex = sequence.LastIndexOf('.');
        if (lastIndex >= 0)
        {
            var index = Math.Min(lastIndex, sequence.Length - 1);
            sequence = sequence.Substring(0, index);
        }
    }

    const string AminoAcidRegex = @"[" + AminoAcid.StandardAminoAcidCharacters + "]";
    const string ModRegex = @"\[([A-Z]|[a-z]|[0-9]|-|>)+\]";
    const string ElementRegex = "(" + AminoAcidRegex + "|" + ModRegex + ")";

    // Trimming may have consumed the whole string.
    if (string.IsNullOrEmpty(sequence))
    {
        return new Sequence(new List<AminoAcid>());
    }

    // Unanchored on purpose (unchanged behavior): a single valid token is enough.
    if (!Regex.IsMatch(sequence, ElementRegex + "+"))
    {
        return null;
    }

    var stdAaSet = new AminoAcidSet();
    var aminoAcidList = new List<AminoAcid>();

    AminoAcid aa = null;                 // residue currently collecting modifications
    var mods = new List<Modification>(); // modifications pending for that residue

    foreach (Match match in Regex.Matches(sequence, ElementRegex))
    {
        var element = match.Value;
        if (element.Length == 0)
        {
            continue;
        }

        if (element.Length == 1 && char.IsLetter(element[0]))
        {
            // A new residue starts: flush the previous one with its modifications.
            if (aa != null)
            {
                aa = mods.Aggregate(aa, (current, mod) => new ModifiedAminoAcid(current, mod));
                aminoAcidList.Add(aa);
                mods = new List<Modification>();
            }

            aa = stdAaSet.GetAminoAcid(element[0]);
            if (aa == null)
            {
                throw new FormatException("Unrecognized amino acid character: " + element[0]);
            }
        }
        else
        {
            // Bracketed modification: strip '[' and ']' and resolve it by name.
            var modName = element.Substring(1, element.Length - 2);
            var mod = Modification.Get(modName);
            if (mod == null)
            {
                throw new FormatException("Unrecognized modification: " + modName);
            }

            mods.Add(mod);
        }
    }

    // Flush the last residue.
    if (aa != null)
    {
        aa = mods.Aggregate(aa, (current, mod) => new ModifiedAminoAcid(current, mod));
        aminoAcidList.Add(aa);
    }

    return new Sequence(aminoAcidList);
}
// Builds an MsPfParameters object from the supplied search settings plus a few
// fixed values (cleavage limits and 10 ppm tolerances) and runs the top-down
// search through IcTopDownLauncher.RunSearch(0.7).
private void TestTopDownSearch(string specFilePath, string dbFilePath, string outputDir, AminoAcidSet aaSet,
    int minSequenceLength, int maxSequenceLength,
    int minPrecursorIonCharge, int maxPrecursorIonCharge,
    int minProductIonCharge, int maxProductIonCharge,
    double minSequenceMass, double maxSequenceMass,
    DatabaseSearchMode tda, InternalCleavageType searchMode)
{
    Utils.ShowStarting(MethodBase.GetCurrentMethod().Name);

    // Fixed search parameters
    const int nTermCleavageLimit = 1;
    const int cTermCleavageLimit = 0;
    const int precursorTolerancePpm = 10;
    const int productTolerancePpm = 10;

    var options = new MsPfParameters(specFilePath, dbFilePath, outputDir, aaSet, "");

    options.MinSequenceLength = minSequenceLength;
    options.MaxSequenceLength = maxSequenceLength;
    options.MaxNumNTermCleavages = nTermCleavageLimit;
    options.MaxNumCTermCleavages = cTermCleavageLimit;
    options.MinPrecursorIonCharge = minPrecursorIonCharge;
    options.MaxPrecursorIonCharge = maxPrecursorIonCharge;
    options.MinProductIonCharge = minProductIonCharge;
    options.MaxProductIonCharge = maxProductIonCharge;
    options.MinSequenceMass = minSequenceMass;
    options.MaxSequenceMass = maxSequenceMass;
    options.PrecursorIonTolerancePpm = precursorTolerancePpm;
    options.ProductIonTolerancePpm = productTolerancePpm;
    options.TargetDecoySearchMode = tda;
    options.InternalCleavageMode = searchMode;

    var launcher = new IcTopDownLauncher(options);
    launcher.RunSearch(0.7);
}
// Convenience overload: runs the detailed TestTopDownSearch overload with a
// fixed set of sequence length, charge and mass limits.
public void TestTopDownSearch(string specFilePath, string dbFilePath, string outputDir, AminoAcidSet aaSet, bool? tda, int searchMode)
{
    TestUtils.ShowStarting(MethodBase.GetCurrentMethod().Name);

    TestTopDownSearch(
        specFilePath,
        dbFilePath,
        outputDir,
        aaSet,
        21,         // minSequenceLength
        500,        // maxSequenceLength
        2,          // minPrecursorIonCharge
        60,         // maxPrecursorIonCharge
        1,          // minProductIonCharge
        20,         // maxProductIonCharge
        3000.0,     // minSequenceMass
        50000.0,    // maxSequenceMass
        tda,
        searchMode);
}
// Collects the sequence tags of the MS2 scans that belong to one MS1 feature,
// matches them against a FASTA database, and for the protein with the most
// matched tags prints the tags before and after merging.
// The test is ignored when any of the input files (.raw, .ms1ft, .seqtag,
// FASTA) is missing. The original check tested the extension-less dataset base
// path, which is not one of the files that are actually opened.
public void TestFeatureId()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    // Base path of the dataset; the concrete input files are derived from it.
    const string dataSet = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3";
    const string tagFileName = dataSet + ".seqtag";
    const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";

    var rawFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".raw");
    var featureFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".ms1ft");

    foreach (var requiredFile in new[] { rawFileName, featureFileName, tagFileName, fastaFilePath })
    {
        if (!File.Exists(requiredFile))
        {
            Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, requiredFile);
        }
    }

    // Feature: 5236-5286 6-12 8480.3681 5
    const int minScanNum = 5236;
    const int maxScanNum = 5286;
    const double featureMass = 8480.3681;

    var tolerance = new Tolerance(10);
    var relaxedTolerance = new Tolerance(20);

    const int minTagLength = 5;
    const int minMergedTagLength = 7;
    const int minNumTagMatches = 1;

    var run = PbfLcMsRun.GetLcMsRun(rawFileName);
    var aminoAcidSet = AminoAcidSet.GetStandardAminoAcidSet();

    // MS2 scans matching the feature mass, strictly inside the scan range.
    var filter = new Ms1FtFilter(run, tolerance, featureFileName);
    var ms2ScanNums =
        filter.GetMatchingMs2ScanNums(featureMass)
            .Where(scanNum => scanNum > minScanNum && scanNum < maxScanNum)
            .ToArray();

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);

    // Group every database hit of every tag by protein name.
    var tagParser = new SequenceTagParser(tagFileName, minTagLength);
    var proteinsToTags = new Dictionary<string, IList<MatchedTag>>();

    foreach (var ms2ScanNum in ms2ScanNums)
    {
        var tags = tagParser.GetSequenceTags(ms2ScanNum);
        foreach (var tag in tags)
        {
            var matchedIndices = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).ToArray();
            foreach (var index in matchedIndices)
            {
                var protein = fastaDb.GetProteinName(index);
                var startIndex = fastaDb.GetZeroBasedPositionInProtein(index);
                var matchedTag = new MatchedTag(tag, startIndex, featureMass);

                IList<MatchedTag> existingTags;
                if (proteinsToTags.TryGetValue(protein, out existingTags))
                {
                    existingTags.Add(matchedTag);
                }
                else
                {
                    proteinsToTags.Add(protein, new List<MatchedTag> { matchedTag });
                }
            }
        }
    }

    // Only the best protein (most matched tags) is reported: the loop body ends with a break.
    foreach (var entry in proteinsToTags.OrderByDescending(e => e.Value.Count))
    {
        if (entry.Value.Count < minNumTagMatches)
        {
            break;
        }

        var proteinName = entry.Key;
        var proteinSequence = fastaDb.GetProteinSequence(proteinName);
        var protein = new Sequence(proteinSequence, aminoAcidSet);
        Console.WriteLine(proteinName + "\t" + entry.Value.Count);

        var matchedTagSet = new MatchedTagSet(proteinSequence, aminoAcidSet, tolerance, relaxedTolerance);

        Console.WriteLine("********** Before merging");
        foreach (var matchedTag in entry.Value)
        {
            var seq = proteinSequence.Substring(matchedTag.StartIndex, matchedTag.EndIndex - matchedTag.StartIndex);
            var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
            var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);

            // Printed: N-term mass difference, tag, C-term mass difference, start index, reliability flags.
            Console.WriteLine("\t{0}\t{1}\t{2}\t{3}\t{4}\t{5}",
                (matchedTag.NTermFlankingMass - nTermMass), seq, (matchedTag.CTermFlankingMass - cTermMass),
                matchedTag.StartIndex, matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable);

            matchedTagSet.Add(matchedTag);
        }

        Console.WriteLine("********** After merging");
        foreach (var matchedTag in matchedTagSet.Tags)
        {
            if (matchedTag.Length < minMergedTagLength)
            {
                continue;
            }

            var seq = proteinSequence.Substring(matchedTag.StartIndex, matchedTag.EndIndex - matchedTag.StartIndex);
            var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
            var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);

            Console.WriteLine("\t{0}\t{1}\t{2}\t{3}\t{4}\t{5}",
                (matchedTag.NTermFlankingMass - nTermMass), seq, (matchedTag.CTermFlankingMass - cTermMass),
                matchedTag.StartIndex, matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable);
        }

        break;
    }
}
// Reports how many identified spectra of a result file pass the MS1-based
// sequence filter. Steps: load the run, build the filter, count matching MS2
// scans over all mass bins between 3000 and 30000, then for each accepted row
// of the result TSV print precursor correlation scores and whether the filter
// kept the scan. Ignored when the raw file or the result file is missing.
public void FilteringEfficiencyQcShew()
{
    var testName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(testName);

    var timer = System.Diagnostics.Stopwatch.StartNew();

    const string rawFilePath = @"C:\cygwin\home\kims336\Data\TopDownQCShew\raw\QC_ShewIntact_2ug_3k_CID_4Apr14_Bane_PL011402.raw";
    if (!File.Exists(rawFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, rawFilePath);
    }

    var run = InMemoryLcMsRun.GetLcMsRun(rawFilePath, 1.4826, 1.4826);
    timer.Stop();
    Console.WriteLine(@"Reading run: {0:f4} sec", timer.Elapsed.TotalSeconds);

    const int minPrecursorCharge = 3;
    const int maxPrecursorCharge = 30;
    const int tolerancePpm = 10;
    var tolerance = new Tolerance(tolerancePpm);

    // Time the construction of the MS1 filter.
    timer.Restart();
    ISequenceFilter ms1Filter = new Ms1IsotopeAndChargeCorrFilter(
        run, new Tolerance(10.0), minPrecursorCharge, maxPrecursorCharge, 3000, 50000, 0.7, 0.7, 0.7, 40);
    timer.Stop();
    Console.WriteLine(@"Ms1 filter: {0:f4} sec", timer.Elapsed.TotalSeconds);

    // Time the number of matching MS2 scans summed over every mass bin.
    timer.Restart();

    const double minProteinMass = 3000.0;
    const double maxProteinMass = 30000.0;
    var firstBin = ProductScorerBasedOnDeconvolutedSpectra.GetBinNumber(minProteinMass);
    var lastBin = ProductScorerBasedOnDeconvolutedSpectra.GetBinNumber(maxProteinMass);

    var numComparisons = 0L;
    for (var bin = firstBin; bin <= lastBin; bin++)
    {
        var binMass = ProductScorerBasedOnDeconvolutedSpectra.GetMz(bin);
        numComparisons += ms1Filter.GetMatchingMs2ScanNums(binMass).Count();
    }

    timer.Stop();
    Console.WriteLine(@"Calculating #matches per bin: {0:f4} sec", timer.Elapsed.TotalSeconds);

    const string resultFilePath = @"C:\cygwin\home\kims336\Data\TopDownQCShew\MSAlign\NoMod.tsv";
    if (!File.Exists(resultFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, resultFilePath);
    }

    var tsvReader = new TsvFileParser(resultFilePath);
    var scanNums = tsvReader.GetData("Scan(s)");
    var charges = tsvReader.GetData("Charge");
    var scores = tsvReader.GetData("E-value");
    var sequences = tsvReader.GetData("Peptide");

    var aaSet = new AminoAcidSet();
    var seqSet = new HashSet<string>();      // sequences whose scan passed the filter
    var allSeqSet = new HashSet<string>();   // all sequences that were evaluated
    var numUnfilteredSpecs = 0;
    var totalSpecs = 0;

    for (var row = 0; row < scores.Count; row++)
    {
        // Rows with an E-value above 1E-4 are not evaluated.
        if (Convert.ToDouble(scores[row]) > 1E-4)
        {
            continue;
        }

        var scanNum = Convert.ToInt32(scanNums[row]);
        var charge = Convert.ToInt32(charges[row]);

        // Sequences containing '(' are skipped.
        var sequence = SimpleStringProcessing.GetStringBetweenDots(sequences[row]);
        if (sequence == null || sequence.Contains("("))
        {
            continue;
        }

        var composition = aaSet.GetComposition(sequence) + Composition.H2O;
        var precursorIon = new Ion(composition, charge);

        // The scan must be a product spectrum whose isolation window contains the precursor.
        var spec = run.GetSpectrum(scanNum) as ProductSpectrum;
        if (spec == null || !spec.IsolationWindow.Contains(precursorIon.GetMostAbundantIsotopeMz()))
        {
            continue;
        }

        ++totalSpecs;

        var precursorScanNum = run.GetPrecursorScanNum(scanNum);
        var precursorSpec = run.GetSpectrum(precursorScanNum);
        var corr1 = precursorSpec.GetCorrScore(precursorIon, tolerance, 0.1);

        var nextScanNum = run.GetNextScanNum(scanNum, 1);
        var nextSpec = run.GetSpectrum(nextScanNum);
        var corr2 = nextSpec.GetCorrScore(precursorIon, tolerance, 0.1);

        var passedFilter = ms1Filter.GetMatchingMs2ScanNums(composition.Mass).Contains(scanNum);
        var corr3 = passedFilter ? 1 : 0;
        if (passedFilter)
        {
            numUnfilteredSpecs++;
            seqSet.Add(sequences[row]);
        }

        allSeqSet.Add(sequences[row]);

        var corrMax = new[] { corr1, corr2, corr3 }.Max();

        Console.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}", scanNum, precursorScanNum, corr1, nextScanNum, corr2, corr3, corrMax);
    }

    Console.WriteLine("TotalNumComparisons: {0}", numComparisons);
    Console.WriteLine("AverageNumComparisons: {0:f2}", numComparisons / (double)(lastBin - firstBin + 1));
    Console.WriteLine("SuccessRate: {0:f2} {1} / {2}", numUnfilteredSpecs / (double)totalSpecs, numUnfilteredSpecs, totalSpecs);
    Console.WriteLine("NumUniqueSequences: {0:f2}, {1} / {2}", seqSet.Count / (double)allSeqSet.Count, seqSet.Count, allSeqSet.Count);

    // The stopwatch was last stopped after the per-bin counting step.
    Console.WriteLine(@"Elapsed Time: {0:f4} sec", timer.Elapsed.TotalSeconds);
}
/// <summary>
/// Initializes a new instance of the <see cref="MsPfParameters"/> class, with the parameters specifying required options for a search.
/// The parameterless constructor runs first; the supplied values are then stored in the matching properties.
/// </summary>
/// <param name="specFilePath">Value stored in <c>SpecFilePath</c>.</param>
/// <param name="dbFilePath">Value stored in <c>DatabaseFilePath</c>.</param>
/// <param name="outputDir">Value stored in <c>OutputDir</c>.</param>
/// <param name="aaSet">Value stored in <c>AminoAcidSet</c>.</param>
/// <param name="featureFilePath">Value stored in <c>FeatureFilePath</c>; defaults to null.</param>
public MsPfParameters(
    string specFilePath,
    string dbFilePath,
    string outputDir,
    AminoAcidSet aaSet,
    string featureFilePath = null)
    : this()
{
    this.SpecFilePath = specFilePath;
    this.DatabaseFilePath = dbFilePath;
    this.AminoAcidSet = aaSet;
    this.OutputDir = outputDir;
    this.FeatureFilePath = featureFilePath;
}
// Scores one fixed protein sequence against the scans listed in Ms2ScanNums.
// For each proteoform composition of the sequence graph and each MS2 scan
// returned by SimpleMs1Filter for its mass, the fragment score is computed and
// matches with a score above 3 are printed. Ignored when TestRawFilePath is missing.
public void Test43KProtein()
{
    var testName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(testName);

    // Configure amino acid set
    var acetylN = new SearchModification(Modification.Acetylation, '*', SequenceLocation.ProteinNTerm, false);
    var oxM = new SearchModification(Modification.Oxidation, 'M', SequenceLocation.Everywhere, false);
    var dehydroC = new SearchModification(Modification.Dehydro, 'C', SequenceLocation.Everywhere, false);
    var glutathioneC = new SearchModification(Modification.Glutathione, 'C', SequenceLocation.Everywhere, false);
    var dethiomethylM = new SearchModification(Modification.Dethiomethyl, 'M', SequenceLocation.Everywhere, false);
    var deamidatedN = new SearchModification(Modification.Deamidation, 'N', SequenceLocation.Everywhere, false);
    var deamidatedQ = new SearchModification(Modification.Deamidation, 'Q', SequenceLocation.Everywhere, false);
    var pyroCarbamidomethylC = new SearchModification(Modification.PyroCarbamidomethyl, 'C', SequenceLocation.ProteinNTerm, false);
    var phosphoS = new SearchModification(Modification.Phosphorylation, 'S', SequenceLocation.Everywhere, false);
    var phosphoT = new SearchModification(Modification.Phosphorylation, 'T', SequenceLocation.Everywhere, false);
    var phosphoY = new SearchModification(Modification.Phosphorylation, 'Y', SequenceLocation.Everywhere, false);
    var nitrosylC = new SearchModification(Modification.Nitrosyl, 'C', SequenceLocation.Everywhere, false);
    var nethylmaleimideC = new SearchModification(Modification.Nethylmaleimide, 'C', SequenceLocation.Everywhere, false);

    const int numMaxModsPerProtein = 4;

    // The phospho and deamidatedQ modifications are created but not searched.
    // NOTE(review): glutathioneC is listed twice - confirm whether that is intended.
    var searchModifications = new List<SearchModification>
    {
        dehydroC,
        glutathioneC,
        oxM,
        dethiomethylM,
        acetylN,
        deamidatedN,
        glutathioneC,
        pyroCarbamidomethylC,
        nitrosylC,
        nethylmaleimideC
    };
    var aaSet = new AminoAcidSet(searchModifications, numMaxModsPerProtein);

    if (!File.Exists(TestRawFilePath))
    {
        Assert.Ignore(@"Skipping test " + testName + @" since file not found: " + TestRawFilePath);
    }

    var run = PbfLcMsRun.GetLcMsRun(TestRawFilePath);

    const string protSequence =
        "AIPQSVEGQSIPSLAPMLERTTPAVVSVAVSGTHVSKQRVPDVFRYFFGPNAPQEQVQERPFRGLGSGVIIDADKGYIVTNNHVIDGADDIQVGLHDGREVKAKLIGTDSESDIALLQIEAKNLVAIKTSDSDELRVGDFAVAIGNPFGLGQTVTSGIVSALGRSGLGIEMLENFIQTDAAINSGNSGGALVNLKGELIGINTAIVAPNGGNVGIGFAIPANMVKNLIAQIAEHGEVRRGVLGIAGRDLDSQLAQGFGLDTQHGGFVNEVSAGSAAEKAGIKAGDIIVSVDGRAIKSFQELRAKVATMGAGAKVELGLIRDGDKKTVNVTLGEANQTTEKAAGAVHPMLQGASLENASKGVEITDVAQGSPAAMSGLQKGDLIVGINRTAVKDLKSLKELLKDQEGAVALKIVRGKSMLYLVLR";
    const string annotation = "_." + protSequence + "._";

    var seqGraph = SequenceGraph.CreateGraph(aaSet, AminoAcid.ProteinNTerm, protSequence, AminoAcid.ProteinCTerm);
    if (seqGraph == null)
    {
        return;
    }

    var ms1Filter = new SimpleMs1Filter();
    var scorerFactory = new ProductScorerBasedOnDeconvolutedSpectra(run);

    // Request a scorer for every scan of interest before scoring starts.
    foreach (var scanNum in Ms2ScanNums)
    {
        scorerFactory.GetScorer(scanNum);
    }

    // Only the uncleaved protein (zero N-terminal cleavages) is evaluated.
    for (var numNTermCleavages = 0; numNTermCleavages <= 0; numNTermCleavages++)
    {
        if (numNTermCleavages > 0)
        {
            seqGraph.CleaveNTerm();
        }

        // Output pieces that only depend on the number of cleavages.
        var sequence = protSequence.Substring(numNTermCleavages);
        var pre = numNTermCleavages == 0 ? annotation[0] : annotation[numNTermCleavages + 1];
        var post = annotation[annotation.Length - 1];

        var numProteoforms = seqGraph.GetNumProteoformCompositions();
        var modCombs = seqGraph.GetModificationCombinations();

        for (var modIndex = 0; modIndex < numProteoforms; modIndex++)
        {
            seqGraph.SetSink(modIndex);
            var protCompositionWithH2O = seqGraph.GetSinkSequenceCompositionWithH2O();
            var sequenceMass = protCompositionWithH2O.Mass;
            var modCombinations = modCombs[modIndex];

            foreach (var ms2ScanNum in ms1Filter.GetMatchingMs2ScanNums(sequenceMass))
            {
                var spec = run.GetSpectrum(ms2ScanNum) as ProductSpectrum;
                if (spec == null)
                {
                    continue;
                }

                // Charge derived from the sequence mass and the isolation window target m/z.
                var targetMz = spec.IsolationWindow.IsolationWindowTargetMz;
                var charge = (int)Math.Round(sequenceMass / (targetMz - Constants.Proton));

                var scorer = scorerFactory.GetMs2Scorer(ms2ScanNum);
                var score = seqGraph.GetFragmentScore(scorer);
                if (score <= 3)
                {
                    continue;
                }

                var precursorIon = new Ion(protCompositionWithH2O, charge);

                Console.WriteLine("{0}.{1}.{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}",
                    pre, sequence, post, ms2ScanNum, modCombinations,
                    precursorIon.GetMostAbundantIsotopeMz(), precursorIon.Charge, precursorIon.Composition.Mass, score);
            }
        }
    }
}
/// <summary>
/// Initializes a new instance of the <see cref="MsPfParameters"/> class, with the parameters specifying required options for a search.
/// The parameterless constructor runs first; the supplied values are then stored in the matching properties.
/// </summary>
/// <param name="specFilePath">Value stored in <c>SpecFilePath</c>.</param>
/// <param name="dbFilePath">Value stored in <c>DatabaseFilePath</c>.</param>
/// <param name="outputDir">Value stored in <c>OutputDir</c>.</param>
/// <param name="aaSet">Value stored in <c>AminoAcidSet</c>.</param>
/// <param name="featureFilePath">Value stored in <c>FeatureFilePath</c>; defaults to null.</param>
public MsPfParameters(
    string specFilePath,
    string dbFilePath,
    string outputDir,
    AminoAcidSet aaSet,
    string featureFilePath = null)
    : this()
{
    // The ReSharper directives silence its virtual-member-call-in-constructor
    // inspection for the property assignments below.
    // ReSharper disable VirtualMemberCallInConstructor
    this.SpecFilePath = specFilePath;
    this.DatabaseFilePath = dbFilePath;
    this.AminoAcidSet = aaSet;
    this.OutputDir = outputDir;
    this.FeatureFilePath = featureFilePath;
    // ReSharper restore VirtualMemberCallInConstructor
}
/// <summary>
/// Initializes a new instance of the <see cref="IcRescorer"/> class and immediately rescores a result file.
/// The spectrum file is loaded as an in-memory run, an <c>InformedTopDownScorer</c> is created on it,
/// and <c>Rescore</c> is called with the result and output paths.
/// </summary>
/// <param name="specFilePath">Path of the spectrum file to load.</param>
/// <param name="icResultFilePath">Path of the result file passed to <c>Rescore</c>.</param>
/// <param name="outputFilePath">Path of the output file passed to <c>Rescore</c>.</param>
/// <param name="aaSet">Amino acid set handed to the scorer.</param>
/// <param name="tolerance">Tolerance handed to the scorer.</param>
/// <param name="ms2CorrThreshold">Threshold handed to the scorer (default 0.7).</param>
/// <param name="minProductIonCharge">Minimum product ion charge handed to the scorer (default 1).</param>
/// <param name="maxProductIonCharge">Maximum product ion charge handed to the scorer (default 10).</param>
public IcRescorer(
    string specFilePath,
    string icResultFilePath,
    string outputFilePath,
    AminoAcidSet aaSet,
    Tolerance tolerance,
    double ms2CorrThreshold = 0.7,
    int minProductIonCharge = 1,
    int maxProductIonCharge = 10)
{
    _topDownScorer = new InformedTopDownScorer(
        InMemoryLcMsRun.GetLcMsRun(specFilePath, 1.4826, 1.4826),
        aaSet,
        minProductIonCharge,
        maxProductIonCharge,
        tolerance,
        ms2CorrThreshold);

    Rescore(icResultFilePath, outputFilePath);
}
// For every dataset, reads its ".seqtag" file and writes a ".dmass" file that
// lists, for each sequence tag (length >= 6) found in a database protein, the
// difference between the detected flanking mass and the flanking mass computed
// from the protein sequence. Only differences in (-500, 2000) are written.
// The test is ignored when the data folder or the FASTA file is missing.
public void FindProteinDeltaMass()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string folderPath = @"D:\MassSpecFiles\Glyco\";
    if (!Directory.Exists(folderPath))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, folderPath);
    }

    var fileSet = new[]
    {
        "User_sample_test_02252015",
        "User_sample_test_MWCO_02262016",
        "User_sample_test_SEC_F3_03022105",
        "User_sample_test_SEC_F1_02272015",
        "User_sample_test_SEC_F2_02282015"
    };

    const string fastaFilePath = folderPath + "ID_003836_DA9CC1E4.fasta";
    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    // The database does not depend on the dataset, so it is built once
    // instead of once per dataset.
    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);
    var standardAaSet = AminoAcidSet.GetStandardAminoAcidSet();

    foreach (var datasetName in fileSet)
    {
        var tagFilePath = folderPath + datasetName + ".seqtag";
        var outputFilePath = folderPath + datasetName + ".dmass";

        using (var writer = new StreamWriter(outputFilePath))
        {
            var isHeader = true;
            var nReadSeqTag = 0;
            var nColumn = 0;

            Console.WriteLine(@"Reading {0} file", tagFilePath);

            // Stream the file line by line instead of loading it completely.
            foreach (var line in File.ReadLines(tagFilePath))
            {
                if (isHeader)
                {
                    // Copy the header and append the four result columns.
                    isHeader = false;
                    nColumn = line.Split('\t').Length;
                    writer.WriteLine(line + "\t" + "Protein" + "\t" + "DetectedFlankingMass" + "\t" + "ExpectedFlankingMass" + "\t" + "DeltaMass");
                    continue;
                }

                // Rows with an unexpected number of columns are skipped.
                var token = line.Split('\t');
                if (token.Length != nColumn)
                {
                    continue;
                }

                var tag = token[1];
                if (tag.Length < 6)
                {
                    continue;
                }

                var nTerminal = token[2].Equals("1");
                var detectedFlankingMass = Double.Parse(token[3]);

                // For rows not flagged N-terminal the mass of H2O is subtracted.
                if (!nTerminal)
                {
                    detectedFlankingMass -= Composition.H2O.Mass;
                }

                nReadSeqTag++;

                var matchedProteins =
                    searchableDb.FindAllMatchedSequenceIndices(tag)
                        .Select(index => fastaDb.GetProteinName(index))
                        .Distinct().ToArray();

                if (matchedProteins.Length < 1)
                {
                    continue;
                }

                foreach (var protName in matchedProteins)
                {
                    var seqStr = fastaDb.GetProteinSequence(protName);
                    var oriSeq = new Sequence(seqStr, standardAaSet);

                    var startIdx = 0;
                    while (true)
                    {
                        // Ordinal search: residue letters need no culture-aware comparison.
                        var idx = seqStr.IndexOf(tag, startIdx, StringComparison.Ordinal);
                        if (idx < 0)
                        {
                            break; // no further occurrence
                        }

                        // Two candidate flanks are tried for N-terminal tags, one otherwise.
                        var nClv = nTerminal ? 2 : 1;
                        for (var j = 0; j < nClv; j++)
                        {
                            var flankComposition = nTerminal
                                ? oriSeq.GetComposition(j, idx)
                                : oriSeq.GetComposition(idx + tag.Length, oriSeq.Count - j);

                            var massDiff = detectedFlankingMass - flankComposition.Mass;
                            if (massDiff > -500 && massDiff < 2000)
                            {
                                writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", line, protName, detectedFlankingMass, flankComposition.Mass, massDiff);
                            }

                            if (massDiff > 2000)
                            {
                                break;
                            }
                        }

                        // Continue after this occurrence (overlapping occurrences are not reported).
                        startIdx = idx + tag.Length;
                    }
                }
            }

            Console.WriteLine(@"{0} seq tags are processed", nReadSeqTag);
        }

        Console.WriteLine(@"Done");
    }
}
/// <summary>
/// Reads the search settings from a dictionary of command line parameters and stores them
/// in this object. The output directory is created when it does not exist.
/// </summary>
/// <param name="parameters">Parameter values keyed by switch (for example "-s", "-d", "-o").</param>
/// <returns>
/// null when all parameters were accepted; otherwise a message describing the first problem found
/// (including any message returned by <c>CheckIsValid</c>).
/// </returns>
public string Parse(Dictionary<string, string> parameters)
{
    var message = CheckIsValid(parameters);
    if (message != null)
    {
        return message;
    }

    // -s: either a directory (all *.raw files in it) or a single file
    var specFilePath = parameters["-s"];
    if (Directory.Exists(specFilePath))
    {
        SpecFilePaths = Directory.GetFiles(specFilePath, "*.raw");
    }
    else
    {
        SpecFilePaths = new[] { specFilePath };
    }

    DatabaseFilePath = parameters["-d"];

    // -o: output directory; a null or empty value means the current directory
    // (an empty value previously caused an index-out-of-range error below).
    var outputDir = parameters["-o"];
    if (string.IsNullOrEmpty(outputDir))
    {
        outputDir = Environment.CurrentDirectory;
    }

    if (outputDir[outputDir.Length - 1] == Path.DirectorySeparatorChar)
    {
        outputDir = outputDir.Remove(outputDir.Length - 1);
    }

    if (!Directory.Exists(outputDir))
    {
        if (File.Exists(outputDir) && !File.GetAttributes(outputDir).HasFlag(FileAttributes.Directory))
        {
            return "OutputDir " + outputDir + " is not a directory!";
        }

        Directory.CreateDirectory(outputDir);
    }

    OutputDir = outputDir;

    // -mod: optional modification file; without it the default amino acid set is used
    var modFilePath = parameters["-mod"];
    if (modFilePath != null)
    {
        var parser = new ModFileParser(modFilePath);
        _searchModifications = parser.SearchModifications;
        _maxNumDynModsPerSequence = parser.MaxNumDynModsPerSequence;

        if (_searchModifications == null)
        {
            return "Error while parsing " + modFilePath + "!";
        }

        AminoAcidSet = new AminoAcidSet(_searchModifications, _maxNumDynModsPerSequence);
    }
    else
    {
        AminoAcidSet = new AminoAcidSet();
        _searchModifications = new SearchModification[0];
    }

    // -e: enzyme ID
    var enzymeId = Convert.ToInt32(parameters["-e"]);
    Enzyme enzyme;
    switch (enzymeId)
    {
        case 0:
            enzyme = Enzyme.UnspecificCleavage;
            break;
        case 1:
            enzyme = Enzyme.Trypsin;
            break;
        case 2:
            enzyme = Enzyme.Chymotrypsin;
            break;
        case 3:
            enzyme = Enzyme.LysC;
            break;
        case 4:
            enzyme = Enzyme.LysN;
            break;
        case 5:
            enzyme = Enzyme.GluC;
            break;
        case 6:
            enzyme = Enzyme.ArgC;
            break;
        case 7:
            enzyme = Enzyme.AspN;
            break;
        case 8:
            enzyme = Enzyme.Alp;
            break;
        case 9:
            enzyme = Enzyme.NoCleavage;
            break;
        default:
            return "Invalid enzyme ID (" + enzymeId + ") for parameter -e";
    }

    Enzyme = enzyme;

    // -ntt: number of tolerable termini, 0..2 (the message previously named "-m")
    NumTolerableTermini = Convert.ToInt32(parameters["-ntt"]);
    if (NumTolerableTermini < 0 || NumTolerableTermini > 2)
    {
        return "Invalid value (" + NumTolerableTermini + ") for parameter -ntt";
    }

    PrecursorIonTolerancePpm = Convert.ToDouble(parameters["-t"]);
    ProductIonTolerancePpm = Convert.ToDouble(parameters["-f"]);

    // -tda: 0 or 1
    var tdaVal = Convert.ToInt32(parameters["-tda"]);
    if (tdaVal != 0 && tdaVal != 1)
    {
        return "Invalid value (" + tdaVal + ") for parameter -tda";
    }

    Tda = (tdaVal == 1);

    MinSequenceLength = Convert.ToInt32(parameters["-minLength"]);
    MaxSequenceLength = Convert.ToInt32(parameters["-maxLength"]);
    if (MinSequenceLength > MaxSequenceLength)
    {
        return "MinSequenceLength (" + MinSequenceLength + ") is larger than MaxSequenceLength (" + MaxSequenceLength + ")!";
    }

    // The two charge checks below previously re-tested the sequence lengths,
    // so an inverted charge range was never reported.
    MinPrecursorIonCharge = Convert.ToInt32(parameters["-minCharge"]);
    MaxPrecursorIonCharge = Convert.ToInt32(parameters["-maxCharge"]);
    if (MinPrecursorIonCharge > MaxPrecursorIonCharge)
    {
        return "MinPrecursorCharge (" + MinPrecursorIonCharge + ") is larger than MaxPrecursorCharge (" + MaxPrecursorIonCharge + ")!";
    }

    MinProductIonCharge = Convert.ToInt32(parameters["-minFragCharge"]);
    MaxProductIonCharge = Convert.ToInt32(parameters["-maxFragCharge"]);
    if (MinProductIonCharge > MaxProductIonCharge)
    {
        return "MinFragmentCharge (" + MinProductIonCharge + ") is larger than MaxFragmentCharge (" + MaxProductIonCharge + ")!";
    }

    return null;
}
// Builds a scoring graph for one protein sequence and one scan of the PBF test
// file, computes the generating function, and prints the score range and the
// spectral E-value for every score from 15 up to the maximum score. Timings
// of the individual steps are printed as well. Ignored when the test file is missing.
public void TestGetScoreDistribution(int scanNum, string protSequence)
{
    var testName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(testName);

    var pbfFile = Utils.GetTestFile(testName, Utils.GetPbfTestFilePath(false));
    if (!pbfFile.Exists)
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, pbfFile);
    }

    // Deconvolution settings
    const string modStr = "";
    const int maxCharge = 20;
    const int minCharge = 1;
    const double filteringWindowSize = 1.1;
    const int isotopeOffsetTolerance = 2;
    var tolerance = new Tolerance(10);

    var run = PbfLcMsRun.GetLcMsRun(pbfFile.FullName);

    // Configure amino acid set
    var oxM = new SearchModification(Modification.Oxidation, 'M', SequenceLocation.Everywhere, false);
    var dehydroC = new SearchModification(Modification.Dehydro, 'C', SequenceLocation.Everywhere, false);
    var acetylN = new SearchModification(Modification.Acetylation, '*', SequenceLocation.ProteinNTerm, false);

    const int numMaxModsPerProtein = 4;
    var searchModifications = new List<SearchModification>();
    searchModifications.Add(dehydroC);
    searchModifications.Add(oxM);
    searchModifications.Add(acetylN);

    var aaSet = new AminoAcidSet(searchModifications, numMaxModsPerProtein);
    var massBinning = new FilteredProteinMassBinning(aaSet, 50000, 28);

    // Step timer: construction of the graph factory.
    var stepTimer = Stopwatch.StartNew();
    var graphFactory = new ProteinScoringGraphFactory(massBinning, aaSet);
    stepTimer.Stop();
    Console.WriteLine(@"edge generation elapsed time = {0:0.000} sec", (stepTimer.ElapsedMilliseconds) / 1000.0d);

    // Total timer: everything from sequence creation to the E-value listing.
    var totalTimer = Stopwatch.StartNew();

    var sequence = Sequence.CreateSequence(protSequence, modStr, aaSet);
    var proteinMass = sequence.Mass + Composition.H2O.Mass;
    Console.WriteLine("Mass = {0}", proteinMass);

    var spectrum = run.GetSpectrum(scanNum) as ProductSpectrum;
    var deconvSpec = Deconvoluter.GetDeconvolutedSpectrum(
        spectrum, minCharge, maxCharge, isotopeOffsetTolerance, filteringWindowSize, tolerance, 0.7);

    // Step timer: scorer and scoring graph creation.
    stepTimer.Restart();
    var scorer = new CompositeScorerBasedOnDeconvolutedSpectrum(deconvSpec, spectrum, tolerance, massBinning);
    var graph = graphFactory.CreateScoringGraph(scorer, proteinMass);
    stepTimer.Stop();
    Console.WriteLine(@"node generation elapsed time = {0:0.000} sec", (stepTimer.ElapsedMilliseconds) / 1000.0d);

    // Step timer: generating function.
    stepTimer.Restart();
    var gf = new GeneratingFunction(graph);
    gf.ComputeGeneratingFunction();
    stepTimer.Stop();
    Console.WriteLine(@"computing generation function = {0:0.000} sec", (stepTimer.ElapsedMilliseconds) / 1000.0d);

    var scoreDist = gf.GetScoreDistribution();
    Console.WriteLine("{0}-{1}", scoreDist.MinScore, scoreDist.MaxScore);

    Console.WriteLine("{0} : {1}", "score", "specEValue");

    var score = 15;
    while (score <= gf.MaximumScore)
    {
        Console.WriteLine("{0} : {1}", score, gf.GetSpectralEValue(score));
        score++;
    }

    totalTimer.Stop();
    Console.WriteLine(@"TOTAL computing generation function = {0:0.000} sec", totalTimer.ElapsedMilliseconds / 1000.0d);
}
/// <summary>
/// Builds the two-way mapping between the bins of an <c>MzComparerWithBinning</c> and a compact
/// "filtered" bin numbering. Only bins containing a mass that can be reached from zero by
/// repeatedly adding residue masses (plus terminally modified residues for the first step)
/// receive a filtered bin number.
/// </summary>
/// <param name="aaSet">Amino acid set from which the residue array and terminal modifications are derived.</param>
/// <param name="maxProteinMass">Stored as <c>MaxMass</c>; also determines the size of the high-precision grid.</param>
/// <param name="numBits">Forwarded to the <c>MzComparerWithBinning</c> constructor.</param>
public FilteredProteinMassBinning(AminoAcidSet aaSet, double maxProteinMass = 50000, int numBits = 27)
{
    _aminoAcidSet = aaSet;

    var terminalMods = GetTerminalModifications(aaSet);
    var residues = GetExtendedAminoAcidArray(aaSet);

    // MinMass ends up as the lightest residue, with or without a terminal modification attached.
    MaxMass = maxProteinMass;
    MinMass = MaxMass;
    foreach (var residue in residues)
    {
        if (residue.Mass < MinMass)
        {
            MinMass = residue.Mass;
        }

        foreach (var terminalMod in terminalMods)
        {
            var modifiedResidue = new ModifiedAminoAcid(residue, terminalMod);
            if (modifiedResidue.Mass < MinMass)
            {
                MinMass = modifiedResidue.Mass;
            }
        }
    }

    _mzComparer = new MzComparerWithBinning(numBits);
    _minMzBinIndex = _mzComparer.GetBinNumber(MinMass);
    _maxMzBinIndex = _mzComparer.GetBinNumber(MaxMass);

    // Slot 0 is reserved for the zero-mass bin, hence "+ 2" rather than "+ 1".
    var mzBinCount = _maxMzBinIndex - _minMzBinIndex + 2;
    _mzBinToFilteredBinMap = new int[mzBinCount];
    for (var slot = 0; slot < mzBinCount; slot++)
    {
        _mzBinToFilteredBinMap[slot] = -1; // -1 marks "no filtered bin assigned"
    }

    var filteredToMzBin = new int[mzBinCount];
    var reachable = new BitArray(Constants.GetBinNumHighPrecision(MaxMass));
    reachable[0] = true;
    var filteredBinCount = 0;

    for (var fineIdx = 0; fineIdx < reachable.Length; fineIdx++)
    {
        if (!reachable[fineIdx])
        {
            continue;
        }

        var isZeroMassNode = fineIdx == 0;
        var nodeMass = fineIdx / Constants.RescalingConstantHighPrecision;

        // Mark every high-precision node that is one residue heavier than the current node.
        foreach (var residue in residues)
        {
            var nextIdx = Constants.GetBinNumHighPrecision(nodeMass + residue.Mass);
            if (nextIdx >= reachable.Length)
            {
                break;
            }

            reachable[nextIdx] = true;

            // Terminal modifications are only applied to unmodified residues leaving the zero-mass node.
            if (!isZeroMassNode || residue is ModifiedAminoAcid)
            {
                continue;
            }

            foreach (var terminalMod in terminalMods)
            {
                var modifiedResidue = new ModifiedAminoAcid(residue, terminalMod);
                var modifiedIdx = Constants.GetBinNumHighPrecision(nodeMass + modifiedResidue.Mass);
                if (modifiedIdx >= reachable.Length)
                {
                    break;
                }

                reachable[modifiedIdx] = true;
            }
        }

        // Assign a filtered bin number the first time a comparer bin is seen.
        // The zero-mass node is always registered; other nodes must fall inside [min, max].
        var mzBin = _mzComparer.GetBinNumber(nodeMass);
        bool startsNewFilteredBin;
        if (isZeroMassNode)
        {
            startsNewFilteredBin = true;
        }
        else if (mzBin < _minMzBinIndex || mzBin > _maxMzBinIndex)
        {
            startsNewFilteredBin = false;
        }
        else
        {
            startsNewFilteredBin = _mzBinToFilteredBinMap[mzBin - _minMzBinIndex + 1] < 0;
        }

        if (!startsNewFilteredBin)
        {
            continue;
        }

        var mapSlot = mzBin == 0 ? 0 : mzBin - _minMzBinIndex + 1;
        _mzBinToFilteredBinMap[mapSlot] = filteredBinCount;
        filteredToMzBin[filteredBinCount] = mzBin;
        filteredBinCount++;
    }

    // Keep only the part of the reverse map that was actually filled.
    _filteredBinToMzBinMap = new int[filteredBinCount];
    Array.Copy(filteredToMzBin, _filteredBinToMzBinMap, filteredBinCount);
}
/// <summary>
/// Runs a bottom-up search with trypsin as the enzyme and a fixed set of length, charge and
/// tolerance settings; the remaining settings come from the caller.
/// </summary>
/// <param name="specFilePath">Spectrum file path forwarded to <c>IcBottomUpLauncher</c>.</param>
/// <param name="dbFilePath">Database file path forwarded to <c>IcBottomUpLauncher</c>.</param>
/// <param name="outputDir">Output directory forwarded to <c>IcBottomUpLauncher</c>.</param>
/// <param name="aaSet">Amino acid set forwarded to <c>IcBottomUpLauncher</c>.</param>
/// <param name="tda">Assigned to <c>RunTargetDecoyAnalysisBool</c>.</param>
/// <param name="ntt">Assigned to <c>NumTolerableTermini</c>.</param>
/// <param name="corrThreshold">Passed to <c>RunSearch</c>.</param>
public void TestBottomUpSearch(string specFilePath, string dbFilePath, string outputDir, AminoAcidSet aaSet, bool? tda, int ntt, double corrThreshold = 0.3)
{
    TestUtils.ShowStarting(MethodBase.GetCurrentMethod().Name);

    // Fixed search parameters; the launcher is configured entirely through its initializer.
    var launcher = new IcBottomUpLauncher(specFilePath, dbFilePath, outputDir, aaSet, Enzyme.Trypsin)
    {
        MinSequenceLength = 6,
        MaxSequenceLength = 40,
        MinPrecursorIonCharge = 1,
        MaxPrecursorIonCharge = 4,
        MinProductIonCharge = 1,
        MaxProductIonCharge = 2,
        PrecursorIonTolerancePpm = 10,
        ProductIonTolerancePpm = 10,
        RunTargetDecoyAnalysisBool = tda,
        NumTolerableTermini = ntt
    };

    launcher.RunSearch(corrThreshold);
}
/// <summary>
/// Finds sequence tags in a single spectrum, looks each tag up in a FASTA database and prints,
/// per matching protein, the matched tags with their flanking masses.
/// </summary>
/// <remarks>
/// NOTE(review): <c>rawFilePath</c> is an empty string, so the file-existence check below always
/// skips this test. Point it at a real spectrum file to exercise the body.
/// </remarks>
public void TestGetProteinsWithTagMatchingSingleSpec()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const int minNumTagMatches = 1;
    var aminoAcidSet = AminoAcidSet.GetStandardAminoAcidSet();

    const int scanNum = 2;

    const string rawFilePath = "";
    const string fastaFilePath = @"H:\Research\Lewy\ID_004858_0EE8CF61.fasta";

    if (!File.Exists(rawFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
    }

    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);

    var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

    // The 'as' cast yields null when the scan is missing or is not a product spectrum;
    // report that explicitly instead of handing null to the tag finder.
    var spec = run.GetSpectrum(scanNum) as ProductSpectrum;
    if (spec == null)
    {
        Assert.Fail("Scan {0} is not available as a product spectrum in {1}", scanNum, rawFilePath);
        return;
    }

    var tagFinder = new SequenceTagFinder(spec, new Tolerance(5));
    var tags = tagFinder.GetAllSequenceTagString();

    // Group every database hit of every tag by protein name.
    var proteinsToTags = new Dictionary<string, IList<MatchedTag>>();
    foreach (var tag in tags)
    {
        var matchedIndices = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).ToArray();
        foreach (var index in matchedIndices)
        {
            var protein = fastaDb.GetProteinName(index);
            var startIndex = fastaDb.GetOneBasedPositionInProtein(index);
            var matchedTag = new MatchedTag(tag, startIndex, 0.0);

            IList<MatchedTag> existingTags;
            if (proteinsToTags.TryGetValue(protein, out existingTags))
            {
                existingTags.Add(matchedTag);
            }
            else
            {
                proteinsToTags.Add(protein, new List<MatchedTag> { matchedTag });
            }
        }
    }

    // Proteins are visited from most to fewest matched tags, so the first protein below the
    // threshold ends the report.
    foreach (var entry in proteinsToTags.OrderByDescending(e => e.Value.Count))
    {
        if (entry.Value.Count < minNumTagMatches)
        {
            break;
        }

        var proteinName = entry.Key;
        var proteinSequence = fastaDb.GetProteinSequence(proteinName);
        var protein = new Sequence(proteinSequence, aminoAcidSet);
        Console.WriteLine(proteinName + "\t" + entry.Value.Count);

        foreach (var matchedTag in entry.Value)
        {
            var seq = proteinSequence.Substring(matchedTag.StartIndex, matchedTag.EndIndex - matchedTag.StartIndex);
            var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
            var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);

            // Each flanking mass is printed next to its difference from the sequence-derived mass.
            Console.WriteLine("\t{0} ({1})\t{2}\t{3} ({4})\t{5}\t{6}\t{7}",
                matchedTag.NTermFlankingMass, (matchedTag.NTermFlankingMass - nTermMass),
                seq,
                matchedTag.CTermFlankingMass, (matchedTag.CTermFlankingMass - cTermMass),
                matchedTag.StartIndex,
                matchedTag.IsNTermFlankingMassReliable,
                matchedTag.IsCTermFlankingMassReliable);
        }
    }
}
/// <summary>
/// Runs a bottom-up search of the given spectrum file against a fixed human database, with no
/// search modifications and no enzyme argument (null is passed to the launcher).
/// </summary>
/// <param name="specFilePath">Spectrum file to search; the test is skipped when it does not exist.</param>
public void TestChaoChao(string specFilePath)
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    if (!File.Exists(specFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, specFilePath);
    }

    const string dbFilePath = @"D:\Research\Data\ChaoChao\database\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta";
    if (!File.Exists(dbFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFilePath);
    }

    const string outputDir = @"D:\Research\Data\ChaoChao\Ic\";

    // Amino acid set with an empty modification list and zero modifications allowed per protein.
    var noSearchModifications = new List<SearchModification>();
    var aaSet = new AminoAcidSet(noSearchModifications, 0);

    bool? runTargetDecoy = true; // true: target & decoy, false: target, null: decoy

    var launcher = new IcBottomUpLauncher(specFilePath, dbFilePath, outputDir, aaSet, null)
    {
        MinSequenceLength = 7,
        MaxSequenceLength = 150,
        MinPrecursorIonCharge = 1,
        MaxPrecursorIonCharge = 30,
        MinProductIonCharge = 1,
        MaxProductIonCharge = 15,
        PrecursorIonTolerancePpm = 10.0,
        ProductIonTolerancePpm = 10.0,
        RunTargetDecoyAnalysisBool = runTargetDecoy,
        NumTolerableTermini = 0
    };

    launcher.RunSearch(0.7);
}
/// <summary>
/// Measures how long it takes to request precursor-ion extracted ion chromatograms for up to
/// 100,000 peptide annotations enumerated from an indexed FASTA database.
/// </summary>
public void TestRunningTimeChromGen()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string rafFilePath = @"C:\cygwin\home\kims336\Data\QCShewQE\QC_Shew_13_04_A_17Feb14_Samwise_13-07-28.raf";
    if (!File.Exists(rafFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rafFilePath);
    }

    var rafRun = new PbfLcMsRun(rafFilePath);
    var tolerance = new Tolerance(10);

    const string dbFile = @"D:\Research\Data\CommonContaminants\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta";
    if (!File.Exists(dbFile))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFile);
    }

    var indexedDb = new IndexedDatabase(new FastaDatabase(dbFile));
    var aaSet = new AminoAcidSet(Modification.Carbamidomethylation);

    var timer = System.Diagnostics.Stopwatch.StartNew();

    // Count down so the loop stops right after the 100,000th peptide has been processed.
    var remaining = 100000;
    foreach (var peptide in indexedDb.AnnotationsAndOffsets(6, 30, 2, 2, Enzyme.Trypsin))
    {
        // Drop the two leading and two trailing annotation characters to get the bare sequence.
        var annotation = peptide.Annotation;
        var bareSequence = annotation.Substring(2, annotation.Length - 4);

        var comp = new Sequence(bareSequence, aaSet).Composition + Composition.H2O;
        var mz = new Ion(comp, 2).GetMonoIsotopicMz();

        // The chromatogram itself is discarded; only the elapsed time matters here.
        rafRun.GetFullPrecursorIonExtractedIonChromatogram(mz, tolerance);

        if (--remaining == 0)
        {
            break;
        }
    }

    timer.Stop();
    Console.WriteLine(@"{0:f4} sec", timer.Elapsed.TotalSeconds);
}
public void TestCompositeScoring() { var methodName = MethodBase.GetCurrentMethod().Name; Utils.ShowStarting(methodName); var pbfFilePath = Utils.GetPbfTestFilePath(false); var pbfFile = Utils.GetTestFile(methodName, pbfFilePath); // Configure amino acid set var oxM = new SearchModification(Modification.Oxidation, 'M', SequenceLocation.Everywhere, false); var dehydroC = new SearchModification(Modification.Dehydro, 'C', SequenceLocation.Everywhere, false); var acetylN = new SearchModification(Modification.Acetylation, '*', SequenceLocation.ProteinNTerm, false); const int numMaxModsPerProtein = 4; var searchModifications = new List <SearchModification> { dehydroC, oxM, acetylN }; var aaSet = new AminoAcidSet(searchModifications, numMaxModsPerProtein); var comparer = new FilteredProteinMassBinning(aaSet, 50000, 28); var run = PbfLcMsRun.GetLcMsRun(pbfFile.FullName); const double filteringWindowSize = 1.1; const int isotopeOffsetTolerance = 2; var tolerance = new Tolerance(10); const int minCharge = 1; const int maxCharge = 20; var graphFactory = new ProteinScoringGraphFactory(comparer, aaSet); var aminoAcidSet = new AminoAcidSet(); //var scorer = new MatchedPeakPostScorer(tolerance, minCharge, maxCharge); var scorer = new InformedTopDownScorer(run, aminoAcidSet, minCharge, maxCharge, tolerance); if (pbfFile.DirectoryName == null) { Assert.Ignore("Ignoring test since cannot determine the parent directory of " + pbfFile.FullName); } var fileExt = new[] { "IcTarget", "IcDecoy" }; foreach (var ext in fileExt) { var resultFileName = Path.Combine(pbfFile.DirectoryName, Path.GetFileNameWithoutExtension(pbfFile.Name)) + string.Format("_{0}.tsv", ext); var parser = new TsvFileParser(resultFileName); var scans = parser.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray(); var charges = parser.GetData("Charge").Select(s => Convert.ToInt32(s)).ToArray(); var protSequences = parser.GetData("Sequence").ToArray(); var modStrs = parser.GetData("Modifications").ToArray(); var 
compositions = parser.GetData("Composition").Select(Composition.Parse).ToArray(); var protMass = parser.GetData("Mass").Select(s => Convert.ToDouble(s)).ToArray(); var outputFileName = Path.Combine(pbfFile.DirectoryName, Path.GetFileNameWithoutExtension(pbfFile.Name)) + string.Format("_{0}_Rescored.tsv", ext); using (var writer = new StreamWriter(outputFileName)) { writer.WriteLine(string.Join("\t", parser.GetHeaders().ToArray(), 0, 15) + "\tScore\tEValue"); var lines = new string[parser.NumData]; //for (var i = 0; i < parser.NumData; i++) Parallel.For(0, 30, i => { var scan = scans[i]; var charge = charges[i]; var protSequence = protSequences[i]; var modStr = modStrs[i]; var sequence = Sequence.CreateSequence(protSequence, modStr, aminoAcidSet); // Assert.True(sequence.Composition.Equals(compositions[i] - Composition.H2O)); if (!(run.GetSpectrum(scan) is ProductSpectrum ms2Spec)) { Console.WriteLine("Could not get the spectrum datafor scan {0}", scan); }
/// <summary>
/// Enumerates sequences of length 300-400 from a FASTA database without enzyme restriction and
/// prints a histogram of the difference between the rescaled, rounded monoisotopic mass and
/// the nominal mass. Enumeration stops once the time budget is exceeded.
/// </summary>
public void TestNominalMassErrors()
{
    const int maxRuntimeSeconds = 60;
    const int minLength = 300;
    const int maxLength = 400;

    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    var fastaFile = Utils.GetTestFile(methodName, Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"MSPathFinderT\ID_003962_71E1A1D4.fasta"));

    var db = new FastaDatabase(fastaFile.FullName);
    db.Read();
    var indexedDb = new IndexedDatabase(db);

    var sequenceCount = 0L;
    var timer = System.Diagnostics.Stopwatch.StartNew();

    // 11 bins: errors -5 .. +5, with the outermost bins collecting everything beyond that.
    var histogram = new long[11];
    var centerBin = histogram.Length / 2;
    var lastBin = histogram.Length - 1;
    var aaSet = new AminoAcidSet();

    foreach (var entry in indexedDb.AnnotationsAndOffsetsNoEnzyme(minLength, maxLength))
    {
        ++sequenceCount;

        var annotation = entry.Annotation;
        var composition = aaSet.GetComposition(annotation.Substring(2, annotation.Length - 4));

        var error = (int)Math.Round(composition.Mass * Constants.RescalingConstant) - composition.NominalMass;

        // Clamp the bin index into the histogram range.
        var bin = Math.Max(0, Math.Min(error + centerBin, lastBin));
        histogram[bin]++;

        // The clock is consulted only every 100th sequence.
        if (sequenceCount % 100 == 0 && timer.Elapsed.TotalSeconds > maxRuntimeSeconds)
        {
            break;
        }
    }

    Console.WriteLine("Sequence count: {0:N0}", sequenceCount);
    Console.WriteLine("{0,10} {1,10} {2,10}", "Bin ", "Count", "Fraction");

    var bin2 = 0;
    foreach (var count in histogram)
    {
        Console.WriteLine("{0,10:F1} {1,10:N0} {2,10:F1}%", bin2 - centerBin, count, count / (double)sequenceCount * 100);
        bin2++;
    }

    timer.Stop();
    Console.WriteLine(@"Elapsed Time: {0:F1} sec", timer.Elapsed.TotalSeconds);
}
/// <summary>
/// Scores one fixed protein annotation against scan 4448 of a test PBF file using
/// <c>MatchedPeakCounter</c>, after asserting that the precursor ion passes the MS1 filter.
/// </summary>
public void TestMatchedPeakCounter()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    // Fixed inputs for this test.
    const string protAnnotation = "_.MFQQEVTITAPNGLHTRPAAQFVKEAKGFTSEITVTSNGKSASAKSLFKLQTLGLTQGTVVTISAEGEDEQKAVEHLVKLMAELE._";
    const int charge = 6;
    const int modIndex = 0;
    const int ms2ScanNum = 4448;

    var precursorTol = new Tolerance(15);
    var productTol = new Tolerance(15);

    // Create a sequence graph
    var graph = SequenceGraph.CreateGraph(new AminoAcidSet(), protAnnotation);
    Assert.NotNull(graph, "Invalid sequence: {0}", protAnnotation);

    var pbfFile = Base.Utils.GetTestFile(methodName, Path.Combine(Utils.DEFAULT_SPEC_FILES_FOLDER, "SBEP_STM_001_02272012_Aragon.pbf"));
    var run = InMemoryLcMsRun.GetLcMsRun(pbfFile.FullName, 1.4826, 1.4826);

    // Timing starts once the run has been obtained.
    var timer = System.Diagnostics.Stopwatch.StartNew();

    var ms1Filter = new Ms1ContainsIonFilter(run, precursorTol);

    var compositions = graph.GetSequenceCompositions();
    Console.WriteLine("Length: {0}\tNumCompositions: {1}", protAnnotation.Length - 4, compositions.Length);

    var seqComposition = compositions[modIndex];
    var withWater = seqComposition + Composition.H2O;
    withWater.GetIsotopomerEnvelopeRelativeIntensities(); // return value intentionally unused

    Console.WriteLine("Composition: {0}, AveragineMass: {1}", seqComposition, seqComposition.Mass);
    graph.SetSink(modIndex);

    var precursor = new Ion(withWater, charge);
    Assert.True(ms1Filter.IsValid(precursor, ms2ScanNum));

    var productSpectrum = run.GetSpectrum(ms2ScanNum) as ProductSpectrum;
    Assert.True(productSpectrum != null);

    var peakCounter = new MatchedPeakCounter(productSpectrum, productTol, 1, 10);
    var fragmentScore = graph.GetFragmentScore(peakCounter);

    Console.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", protAnnotation, charge, precursor.GetMostAbundantIsotopeMz(), ms2ScanNum, fragmentScore);

    timer.Stop();
    Console.WriteLine(@"Elapsed Time: {0:f4} sec", timer.Elapsed.TotalSeconds);
}
/// <summary>
/// Runs <c>TestTopDownSearch</c> on a fixed raw file and truncated FASTA database with the
/// default amino acid set. The test is skipped when either input file is missing.
/// </summary>
public void TestForYufeng()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    // Input and output locations
    const string specFilePath = @"H:\Research\Yufeng\TopDownYufeng\raw\yufeng_column_test2.raw";
    const string dbFilePath = @"H:\Research\Yufeng\TopDownYufeng\database\SO_3942_Truncated.fasta";
    const string outputDir = @"H:\Research\Yufeng\TopDownYufeng\Debug";

    foreach (var requiredFile in new[] { specFilePath, dbFilePath })
    {
        if (!File.Exists(requiredFile))
        {
            Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, requiredFile);
        }
    }

    // No search modifications are configured for this run.
    var aaSet = new AminoAcidSet();

    const int searchMode = 2;   // 0: all subsequences, 1: close to N- or C-term, 2: close to N- and C-term
    bool? tda = false;          // true: target & decoy, false: target, null: decoy

    // Sequence length and mass window
    const int minSequenceLength = 21;
    const int maxSequenceLength = 500;
    const double minSequenceMass = 3000.0;
    const double maxSequenceMass = 50000.0;

    // Charge ranges
    const int minPrecursorIonCharge = 2;
    const int maxPrecursorIonCharge = 50;
    const int minProductIonCharge = 1;
    const int maxProductIonCharge = 20;

    TestTopDownSearch(
        specFilePath, dbFilePath, outputDir, aaSet,
        minSequenceLength, maxSequenceLength,
        minPrecursorIonCharge, maxPrecursorIonCharge,
        minProductIonCharge, maxProductIonCharge,
        minSequenceMass, maxSequenceMass,
        tda, searchMode);
}
} // true: target and decoy, false: target only, null: decoy only

/// <summary>
/// Quick identification: for every MS2 scan, counts how many fragment compositions of each
/// intact protein (with and without one N-terminal cleavage) fall into a mass bin where that
/// scan has a deconvoluted peak, and prints the best-scoring protein per scan.
/// </summary>
/// <remarks>
/// Only fragment masses offset by <c>BaseIonType.Y.OffsetComposition</c> are scored.
/// A protein name followed by an apostrophe marks the N-terminally cleaved form.
/// </remarks>
public void QuickId()
{
    const string rawFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw";
    const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
    const string modFilePath = @"H:\Research\QCShew_TopDown\Production\Mods.txt";
    const int numBits = 29; // max error: 4ppm
    const int minCharge = 1;
    const int maxCharge = 20;
    var tolerance = new Tolerance(10);
    const double corrThreshold = 0.7;

    var comparer = new MzComparerWithBinning(numBits);

    const double minFragmentMass = 200.0;
    const double maxFragmentMass = 50000.0;
    var minFragMassBin = comparer.GetBinNumber(minFragmentMass);
    var maxFragMassBin = comparer.GetBinNumber(maxFragmentMass);

    var aminoAcidSet = new AminoAcidSet(modFilePath);

    var run = PbfLcMsRun.GetLcMsRun(rawFilePath);
    var ms2ScanNumArr = run.GetScanNumbers(2).ToArray();

    var sw = new Stopwatch();

    sw.Start();
    Console.Write("Building Spectrum Arrays...");

    // massVectors[bin][scan] is true when the scan has a deconvoluted peak whose tolerance
    // window overlaps the mass bin.
    var massVectors = new BitArray[maxFragMassBin - minFragMassBin + 1];
    for (var i = minFragMassBin; i <= maxFragMassBin; i++)
    {
        massVectors[i - minFragMassBin] = new BitArray(run.MaxLcScan + 1);
    }

    foreach (var ms2ScanNum in ms2ScanNumArr)
    {
        var productSpec = run.GetSpectrum(ms2ScanNum) as ProductSpectrum;
        if (productSpec == null)
        {
            continue;
        }

        var deconvolutedPeaks = Deconvoluter.GetDeconvolutedPeaks(productSpec.Peaks, minCharge, maxCharge, 2, 1.1, tolerance, corrThreshold);
        if (deconvolutedPeaks == null)
        {
            continue;
        }

        foreach (var p in deconvolutedPeaks)
        {
            var mass = p.Mass;
            var deltaMass = tolerance.GetToleranceAsDa(mass, 1);
            var minBinNum = comparer.GetBinNumber(mass - deltaMass);
            var maxBinNum = comparer.GetBinNumber(mass + deltaMass);
            for (var binNum = minBinNum; binNum <= maxBinNum; binNum++)
            {
                if (binNum >= minFragMassBin && binNum <= maxFragMassBin)
                {
                    massVectors[binNum - minFragMassBin][ms2ScanNum] = true;
                }
            }
        }
    }

    sw.Stop();
    Console.WriteLine(@"{0:f1} sec.", sw.Elapsed.TotalSeconds);

    sw.Restart();

    var fastaDb = new FastaDatabase(fastaFilePath);
    fastaDb.Read();
    var indexedDb = new IndexedDatabase(fastaDb);
    var numProteins = 0;
    var intactProteinAnnotationAndOffsets = indexedDb.IntactSequenceAnnotationsAndOffsets(0, int.MaxValue);

    // Best hit per scan, indexed directly by scan number.
    var bestProtein = new string[run.MaxLcScan + 1];
    var bestScore = new int[run.MaxLcScan + 1];

    foreach (var annotationAndOffset in intactProteinAnnotationAndOffsets)
    {
        if (++numProteins % 10 == 0)
        {
            Console.WriteLine(@"Processing, {0} proteins done, {1:f1} sec elapsed", numProteins, sw.Elapsed.TotalSeconds);
        }

        var annotation = annotationAndOffset.Annotation;
        var offset = annotationAndOffset.Offset;

        // Drop the two leading and two trailing annotation characters to get the bare sequence.
        var protSequence = annotation.Substring(2, annotation.Length - 4);

        var seqGraph = SequenceGraph.CreateGraph(aminoAcidSet, AminoAcid.ProteinNTerm, protSequence, AminoAcid.ProteinCTerm);
        if (seqGraph == null)
        {
            continue;
        }

        // Pass 0 scores the graph as created; pass 1 scores it after one N-terminal cleavage.
        for (var numNTermCleavage = 0; numNTermCleavage <= 1; numNTermCleavage++)
        {
            if (numNTermCleavage > 0)
            {
                seqGraph.CleaveNTerm();
            }

            var allCompositions = seqGraph.GetAllFragmentNodeCompositions();

            var scoreArr = new int[run.MaxLcScan + 1];
            foreach (var fragComp in allCompositions)
            {
                var suffixMass = fragComp.Mass + BaseIonType.Y.OffsetComposition.Mass;
                var binNum = comparer.GetBinNumber(suffixMass);
                if (binNum < minFragMassBin || binNum > maxFragMassBin)
                {
                    continue;
                }

                var vector = massVectors[binNum - minFragMassBin];
                foreach (var ms2ScanNum in ms2ScanNumArr)
                {
                    if (vector[ms2ScanNum])
                    {
                        ++scoreArr[ms2ScanNum];
                    }
                }
            }

            foreach (var ms2ScanNum in ms2ScanNumArr)
            {
                if (scoreArr[ms2ScanNum] > bestScore[ms2ScanNum])
                {
                    bestScore[ms2ScanNum] = scoreArr[ms2ScanNum];
                    var proteinName = fastaDb.GetProteinName(offset);
                    bestProtein[ms2ScanNum] = proteinName + (numNTermCleavage == 1 ? "'" : "");
                }
            }
        }
    }

    // Fix: the values are written in the same order as the header columns. Previously the
    // score was printed under "BestProtein" and the protein name under "Score".
    Console.WriteLine("ScanNum\tBestProtein\tScore");
    foreach (var ms2ScanNum in ms2ScanNumArr)
    {
        Console.WriteLine("{0}\t{1}\t{2}", ms2ScanNum, bestProtein[ms2ScanNum] ?? "", bestScore[ms2ScanNum]);
    }
}
/// <summary>
/// Configures an <c>IcTopDownLauncher</c> from the supplied parameters plus a few fixed
/// settings (one N-terminal cleavage, no C-terminal cleavage, 10 ppm tolerances) and runs
/// the search with a threshold of 0.7.
/// </summary>
/// <param name="specFilePath">Spectrum file path forwarded to the launcher.</param>
/// <param name="dbFilePath">Database file path forwarded to the launcher.</param>
/// <param name="outputDir">Output directory forwarded to the launcher.</param>
/// <param name="aaSet">Amino acid set forwarded to the launcher.</param>
/// <param name="minSequenceLength">Assigned to <c>MinSequenceLength</c>.</param>
/// <param name="maxSequenceLength">Assigned to <c>MaxSequenceLength</c>.</param>
/// <param name="minPrecursorIonCharge">Assigned to <c>MinPrecursorIonCharge</c>.</param>
/// <param name="maxPrecursorIonCharge">Assigned to <c>MaxPrecursorIonCharge</c>.</param>
/// <param name="minProductIonCharge">Assigned to <c>MinProductIonCharge</c>.</param>
/// <param name="maxProductIonCharge">Assigned to <c>MaxProductIonCharge</c>.</param>
/// <param name="minSequenceMass">Assigned to <c>MinSequenceMass</c>.</param>
/// <param name="maxSequenceMass">Assigned to <c>MaxSequenceMass</c>.</param>
/// <param name="tda">Assigned to <c>RunTargetDecoyAnalysisBool</c>.</param>
/// <param name="searchMode">Assigned to <c>SearchModeInt</c>.</param>
public void TestTopDownSearch(string specFilePath, string dbFilePath, string outputDir, AminoAcidSet aaSet,
    int minSequenceLength, int maxSequenceLength,
    int minPrecursorIonCharge, int maxPrecursorIonCharge,
    int minProductIonCharge, int maxProductIonCharge,
    double minSequenceMass, double maxSequenceMass,
    bool? tda, int searchMode)
{
    TestUtils.ShowStarting(MethodBase.GetCurrentMethod().Name);

    // The fifth constructor argument is an empty string, exactly as the launcher is called here.
    var launcher = new IcTopDownLauncher(specFilePath, dbFilePath, outputDir, aaSet, "")
    {
        MinSequenceLength = minSequenceLength,
        MaxSequenceLength = maxSequenceLength,
        MaxNumNTermCleavages = 1,
        MaxNumCTermCleavages = 0,
        MinPrecursorIonCharge = minPrecursorIonCharge,
        MaxPrecursorIonCharge = maxPrecursorIonCharge,
        MinProductIonCharge = minProductIonCharge,
        MaxProductIonCharge = maxProductIonCharge,
        MinSequenceMass = minSequenceMass,
        MaxSequenceMass = maxSequenceMass,
        PrecursorIonTolerancePpm = 10,
        ProductIonTolerancePpm = 10,
        RunTargetDecoyAnalysisBool = tda,
        SearchModeInt = searchMode
    };

    launcher.RunSearch(0.7);
}
/// <summary>
/// Initializes static members of the MgfSequenceReader class: the standard amino acid set
/// (constructed with carbamidomethylation) and the lookup from a mass string to the residue and
/// modification list it stands for.
/// </summary>
static MgfSequenceReader()
{
    StandardAminoAcidSet = new AminoAcidSet(Modification.Carbamidomethylation);
    Modifications = new Dictionary<string, Tuple<AminoAcid, List<Modification>>>();

    // One row per dictionary entry: mass string key, residue character, modifications.
    // Rows are added in the order listed.
    var table = new[]
    {
        new { MassKey = "99.032", Residue = 'G', Mods = new Modification[] { Modification.Acetylation } },
        new { MassKey = "113.048", Residue = 'A', Mods = new Modification[] { Modification.Acetylation } },
        new { MassKey = "129.043", Residue = 'S', Mods = new Modification[] { Modification.Acetylation } },
        new { MassKey = "141.079", Residue = 'V', Mods = new Modification[] { Modification.Acetylation } },
        new { MassKey = "143.059", Residue = 'T', Mods = new Modification[] { Modification.Acetylation } },
        new { MassKey = "147.035", Residue = 'M', Mods = new Modification[] { Modification.Oxidation } },
        new { MassKey = "157.038", Residue = 'D', Mods = new Modification[] { Modification.Acetylation } },
        new { MassKey = "160.03", Residue = 'C', Mods = new Modification[] { Modification.Carbamidomethylation } },
        new { MassKey = "171.054", Residue = 'E', Mods = new Modification[] { Modification.Acetylation } },
        new { MassKey = "173.051", Residue = 'M', Mods = new Modification[] { Modification.Acetylation } },
        new { MassKey = "189.046", Residue = 'F', Mods = new Modification[] { Modification.Acetylation } },
        new { MassKey = "202.041", Residue = 'C', Mods = new Modification[] { Modification.Carbamidomethylation, Modification.Acetylation } }
    };

    foreach (var row in table)
    {
        var residue = StandardAminoAcidSet.GetAminoAcid(row.Residue);
        var entry = new Tuple<AminoAcid, List<Modification>>(residue, new List<Modification>(row.Mods));
        Modifications.Add(row.MassKey, entry);
    }
}
/// <summary>
/// Runs <c>TestTopDownSearch</c> on a fixed raw file and FASTA database with dehydro-C,
/// oxidized M, protein N-terminal acetylation and S/T/Y phosphorylation as search
/// modifications (at most four per protein). Skipped when an input file is missing.
/// </summary>
public void TestForVlad()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string specFilePath = @"D:\Research\Data\Vlad\raw\Alz_RA_C1_HCD_11012013_SW_03Nov2013.raw";
    const string dbFilePath = @"D:\Research\Data\Vlad\database\ID_004221_1C042A1F.fasta";
    const string outputDir = @"D:\Research\Data\Vlad\Ic\POPSICLETest_M1";

    foreach (var requiredFile in new[] { specFilePath, dbFilePath })
    {
        if (!File.Exists(requiredFile))
        {
            Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, requiredFile);
        }
    }

    // Candidate modifications. Only some are placed in the search list below; the others are
    // kept so they can be switched on by adding them to that list.
    var acetylN = new SearchModification(Modification.Acetylation, '*', SequenceLocation.ProteinNTerm, false);
    var oxM = new SearchModification(Modification.Oxidation, 'M', SequenceLocation.Everywhere, false);
    var dehydroC = new SearchModification(Modification.Dehydro, 'C', SequenceLocation.Everywhere, false);
    var glutathioneC = new SearchModification(Modification.Glutathione, 'C', SequenceLocation.Everywhere, false);
    var thrToAla = new SearchModification(Modification.ThrToAla, 'T', SequenceLocation.Everywhere, false);
    var dethiomethylM = new SearchModification(Modification.Dethiomethyl, 'M', SequenceLocation.Everywhere, false);
    var deamidatedN = new SearchModification(Modification.Deamidation, 'N', SequenceLocation.Everywhere, false);
    var deamidatedQ = new SearchModification(Modification.Deamidation, 'Q', SequenceLocation.Everywhere, false);
    var serToAsn = new SearchModification(Modification.SerToAsn, 'S', SequenceLocation.Everywhere, false);
    var pyroCarbamidomethylC = new SearchModification(Modification.PyroCarbamidomethyl, 'C', SequenceLocation.ProteinNTerm, false);
    var phosphoS = new SearchModification(Modification.Phosphorylation, 'S', SequenceLocation.Everywhere, false);
    var phosphoT = new SearchModification(Modification.Phosphorylation, 'T', SequenceLocation.Everywhere, false);
    var phosphoY = new SearchModification(Modification.Phosphorylation, 'Y', SequenceLocation.Everywhere, false);

    // Active modifications, in the order they are handed to the amino acid set.
    // Inactive: glutathioneC, dethiomethylM, thrToAla, serToAsn, deamidatedN, deamidatedQ, pyroCarbamidomethylC
    var activeModifications = new List<SearchModification>();
    activeModifications.Add(dehydroC);
    activeModifications.Add(oxM);
    activeModifications.Add(acetylN);
    activeModifications.Add(phosphoS);
    activeModifications.Add(phosphoT);
    activeModifications.Add(phosphoY);

    const int numMaxModsPerProtein = 4;
    var aaSet = new AminoAcidSet(activeModifications, numMaxModsPerProtein);

    const int searchMode = 1;   // 0: all subsequences, 1: close to N- or C-term, 2: close to N- and C-term
    bool? tda = false;          // true: target & decoy, false: target, null: decoy

    TestTopDownSearch(specFilePath, dbFilePath, outputDir, aaSet, tda, searchMode);
}
/// <summary>
/// Merges identification scores from a folder of TSV result files into a Skyline transition
/// table. Rows whose peptide/nominal-mass key never occurs in the TSV results are dropped; the
/// rest are written tab-separated with Probability and QValue columns appended ("NA" when no
/// score exists for that sample, peptide, mass and charge).
/// </summary>
public void CreatePeptideAbundanceTableWithSkyline()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    // ---- Part 1: collect the best (probability, q-value) pair per sample/peptide/mass/charge ----
    var knownPeptideKeys = new HashSet<string>();
    var scoresByKey = new Dictionary<string, Tuple<double, double>>();

    const string henryResultPath = @"H:\Research\IPRG2015\Henry_results\tsv";
    if (!Directory.Exists(henryResultPath))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, henryResultPath);
    }

    var aaSet = new AminoAcidSet();
    foreach (var resultFile in Directory.GetFiles(henryResultPath, "*.tsv"))
    {
        var fileName = Path.GetFileName(resultFile);
        if (fileName == null)
        {
            continue;
        }

        // The sample label is the first two characters of the file name.
        var sample = fileName.Substring(0, 2);
        Console.WriteLine("Processing {0}", sample);

        var tsvReader = new TsvFileParser(resultFile);
        var peptideColumn = tsvReader.GetData("Peptide").ToArray();
        var chargeColumn = tsvReader.GetData("Charge").Select(c => Convert.ToInt32(c)).ToArray();
        var probColumn = tsvReader.GetData("Prob").Select(Convert.ToDouble).ToArray();
        var qValueColumn = tsvReader.GetData("QValue").Select(Convert.ToDouble).ToArray();

        for (var row = 0; row < tsvReader.NumData; row++)
        {
            var nominalMass = GetNominalMass(aaSet, peptideColumn[row]);
            var key = sample + ":" + GetPeptide(peptideColumn[row]) + ":" + nominalMass + ":" + chargeColumn[row];
            var pepKey = GetPeptide(peptideColumn[row]) + ":" + nominalMass;
            knownPeptideKeys.Add(pepKey);

            // Store the pair when the key is new, or when this row has a higher probability.
            Tuple<double, double> current;
            if (!scoresByKey.TryGetValue(key, out current) || probColumn[row] > current.Item1)
            {
                scoresByKey[key] = new Tuple<double, double>(probColumn[row], qValueColumn[row]);
            }
        }
    }

    // ---- Part 2: annotate the Skyline table ----
    const string skylineFilePath = @"H:\Research\IPRG2015\MySkyline\TransitionResults.csv";
    if (!File.Exists(skylineFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, skylineFilePath);
    }

    var skylineTable = new TsvFileParser(skylineFilePath, ',');

    const string outputFilePath = @"H:\Research\IPRG2015\MySkyline\SkylineTransitionResultsWithScores3.tsv";
    using (var writer = new StreamWriter(outputFilePath))
    {
        var peptides = skylineTable.GetData("Peptide Sequence").ToArray();
        // Replicate names are reduced to their first and third characters to form the sample label.
        var samples = skylineTable.GetData("Replicate Name").Select(s => "" + s[0] + s[2]).ToArray();
        var charges = skylineTable.GetData("Precursor Charge").Select(c => Convert.ToInt32(c)).ToArray();
        var precursorMzs = skylineTable.GetData("Precursor Mz").Select(Convert.ToDouble).ToArray();

        // Header: all original columns except the last two, followed by the two score columns.
        var keptHeaders = skylineTable.GetHeaders().Take(skylineTable.GetHeaders().Count - 2);
        writer.WriteLine("{0}\tProbability\tQValue", string.Join("\t", keptHeaders));

        for (var row = 0; row < skylineTable.NumData; row++)
        {
            var charge = charges[row];
            var neutralMass = (precursorMzs[row] - Constants.Proton) * charge - Composition.H2O.Mass;
            var nominalMass = (int)Math.Round(neutralMass * Constants.RescalingConstant);

            var pepKey = peptides[row] + ":" + nominalMass;
            if (!knownPeptideKeys.Contains(pepKey))
            {
                continue;
            }

            var key = samples[row] + ":" + peptides[row] + ":" + nominalMass + ":" + charge;

            var probText = "NA";
            var qValueText = "NA";
            Tuple<double, double> scores;
            if (scoresByKey.TryGetValue(key, out scores))
            {
                probText = scores.Item1.ToString();
                qValueText = scores.Item2.ToString();
            }

            // Copy all but the last two fields; the field at index 2 is abbreviated the same way as
            // the sample label.
            var fields = skylineTable.GetRows()[row].Split(',');
            for (var col = 0; col < fields.Length - 2; col++)
            {
                var cell = col == 2 ? "" + fields[col][0] + fields[col][2] : fields[col];
                writer.Write(cell + "\t");
            }

            writer.WriteLine("{0}\t{1}", probText, qValueText);
        }
    }

    Console.WriteLine("Done");
}
/// <summary>
/// Initializes a new instance of the <see cref="IcBottomUpRescorer"/> class: loads the spectrum file
/// as an in-memory LC-MS run, stores it in <c>_run</c>, and then invokes <c>Rescore</c> with the
/// result and output paths.
/// </summary>
/// <param name="specFilePath">Path of the spectrum file passed to <c>InMemoryLcMsRun.GetLcMsRun</c>.</param>
/// <param name="icResultFilePath">First argument forwarded to <c>Rescore</c>.</param>
/// <param name="outputFilePath">Second argument forwarded to <c>Rescore</c>.</param>
/// <param name="aaSet">Not referenced by this constructor body.</param>
/// <param name="tolerance">Not referenced by this constructor body.</param>
public IcBottomUpRescorer(
    string specFilePath,
    string icResultFilePath,
    string outputFilePath,
    AminoAcidSet aaSet,
    Tolerance tolerance)
{
    // NOTE(review): these look like precursor / product signal-to-noise thresholds;
    // confirm against the InMemoryLcMsRun.GetLcMsRun signature.
    const double firstThreshold = 1.4826;
    const double secondThreshold = 0.0;

    // The run is assigned before Rescore is called (presumably Rescore reads _run; verify).
    _run = InMemoryLcMsRun.GetLcMsRun(specFilePath, firstThreshold, secondThreshold);

    Rescore(icResultFilePath, outputFilePath);
}
/// <summary>
/// Rescores identifications listed in the "_IcTarget.tsv" and "_IcDecoy.tsv" result files located next to
/// the test PBF file. For each processed row a score is obtained from an <c>InformedTopDownScorer</c> and a
/// spectral E-value from a generating function built on the deconvoluted MS/MS spectrum.
/// </summary>
/// <remarks>
/// At most 30 rows per result file are processed (fewer when the file has fewer rows), and at most 20 of the
/// rescored rows are echoed to the console. A "_Rescored.tsv" file is created for each result file, but only
/// its header row is written; the rescored rows are not persisted.
/// NOTE(review): confirm whether the rescored rows were meant to be written to that file as well.
/// </remarks>
public void TestCompositeScoring()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    var pbfFilePath = Utils.GetPbfTestFilePath(false);
    var pbfFile = Utils.GetTestFile(methodName, pbfFilePath);

    // Configure amino acid set
    var oxM = new SearchModification(Modification.Oxidation, 'M', SequenceLocation.Everywhere, false);
    var dehydroC = new SearchModification(Modification.Dehydro, 'C', SequenceLocation.Everywhere, false);
    var acetylN = new SearchModification(Modification.Acetylation, '*', SequenceLocation.ProteinNTerm, false);

    const int numMaxModsPerProtein = 4;
    var searchModifications = new List<SearchModification>
    {
        dehydroC,
        oxM,
        acetylN
    };
    var aaSet = new AminoAcidSet(searchModifications, numMaxModsPerProtein);
    var comparer = new FilteredProteinMassBinning(aaSet, 50000, 28);

    var run = PbfLcMsRun.GetLcMsRun(pbfFile.FullName);

    const double filteringWindowSize = 1.1;
    const int isotopeOffsetTolerance = 2;
    const double deconvolutionThreshold = 0.7; // last argument of Deconvoluter.GetDeconvolutedSpectrum
    var tolerance = new Tolerance(10);
    const int minCharge = 1;
    const int maxCharge = 20;

    // Number of leading columns copied from the input header and rows.
    const int numColumnsToKeep = 15;
    // Upper limits on how many rows are rescored and how many rescored rows are printed.
    const int maxRowsToRescore = 30;
    const int maxRowsToPrint = 20;

    var graphFactory = new ProteinScoringGraphFactory(comparer, aaSet);

    // Sequences are built and scored with a default amino acid set, not the modified one above.
    var aminoAcidSet = new AminoAcidSet();
    var scorer = new InformedTopDownScorer(run, aminoAcidSet, minCharge, maxCharge, tolerance);

    if (pbfFile.DirectoryName == null)
    {
        Assert.Ignore("Ignoring test since cannot determine the parent directory of " + pbfFile.FullName);
    }

    var fileExt = new[] { "IcTarget", "IcDecoy" };
    foreach (var ext in fileExt)
    {
        var resultFileName = Path.Combine(pbfFile.DirectoryName, Path.GetFileNameWithoutExtension(pbfFile.Name)) + string.Format("_{0}.tsv", ext);
        var parser = new TsvFileParser(resultFileName);
        var scans = parser.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
        var charges = parser.GetData("Charge").Select(s => Convert.ToInt32(s)).ToArray();
        var protSequences = parser.GetData("Sequence").ToArray();
        var modStrs = parser.GetData("Modifications").ToArray();
        // Not consumed below; retained so the "Composition" column is still parsed for every row.
        var compositions = parser.GetData("Composition").Select(Composition.Parse).ToArray();
        var protMass = parser.GetData("Mass").Select(s => Convert.ToDouble(s)).ToArray();

        var outputFileName = Path.Combine(pbfFile.DirectoryName, Path.GetFileNameWithoutExtension(pbfFile.Name)) + string.Format("_{0}_Rescored.tsv", ext);

        using (var writer = new StreamWriter(outputFileName))
        {
            writer.WriteLine(string.Join("\t", parser.GetHeaders().ToArray(), 0, numColumnsToKeep) + "\tScore\tEValue");

            // One slot per input row; each parallel iteration writes only its own slot.
            var lines = new string[parser.NumData];

            // Never iterate past the rows actually present: a result file with fewer than
            // maxRowsToRescore rows would otherwise index outside the column arrays.
            var numRowsToRescore = Math.Min(maxRowsToRescore, parser.NumData);

            Parallel.For(0, numRowsToRescore, i =>
            {
                var scan = scans[i];
                var charge = charges[i];
                var sequence = Sequence.CreateSequence(protSequences[i], modStrs[i], aminoAcidSet);

                var ms2Spec = run.GetSpectrum(scan) as ProductSpectrum;
                if (ms2Spec == null)
                {
                    Console.WriteLine("Could not get the spectrum datafor scan {0}", scan);
                    return;
                }

                var scores = scorer.GetScores(sequence, charge, scan);

                var deconvSpec = Deconvoluter.GetDeconvolutedSpectrum(ms2Spec, minCharge, maxCharge, isotopeOffsetTolerance, filteringWindowSize, tolerance, deconvolutionThreshold);
                var deconvScorer = new CompositeScorerBasedOnDeconvolutedSpectrum(deconvSpec, ms2Spec, tolerance, comparer);
                var graph = graphFactory.CreateScoringGraph(deconvScorer, protMass[i]);

                var gf = new GeneratingFunction(graph);
                gf.ComputeGeneratingFunction();
                var specEvalue = gf.GetSpectralEValue(scores.Score);

                // Keep the leading columns of the input row and append the new score columns.
                var items = parser.GetRows()[i].Split('\t');
                var newRowStr = string.Join("\t", items, 0, numColumnsToKeep);

                lines[i] = string.Format("{0}\t{1}\t{2}", newRowStr, scores.Score, specEvalue);
            });

            // Slots for rows that were skipped or not processed stay null and are filtered out.
            foreach (var line in lines.Where(item => !string.IsNullOrWhiteSpace(item)).Take(maxRowsToPrint))
            {
                Console.WriteLine(line);
            }
        }

        Console.WriteLine("Done");
    }
}