/// <summary>
/// Turns this instance into a decoy: calls <c>Sequence.Reverse()</c>, joins the residues of the
/// sequence into a string, mutates that string, and stores the result in both
/// <c>Peptide</c> and <c>Sequence</c>.
/// </summary>
private void CreateDecoy()
{
    // NOTE(review): presumably an in-place reversal (List-style Reverse) - confirm the type of Sequence.
    Sequence.Reverse();

    // One character per amino acid, in the current order of the sequence.
    var residueString = string.Concat(Sequence.Select(aminoAcid => aminoAcid.Residue));

    // Half of the residue count is passed as the second argument of Mutate.
    var mutationCount = residueString.Length / 2;
    var decoyPeptide = SimpleStringProcessing.Mutate(residueString, mutationCount);

    Peptide = decoyPeptide;
    Sequence = Sequence.GetSequenceFromMsGfPlusPeptideStr(decoyPeptide);
}
/// <summary>
/// Scores every identification listed in a TSV result file together with a decoy derived from it,
/// and writes the target/decoy score pairs to a tab-separated text file.
/// </summary>
/// <remarks>
/// The test is ignored (not failed) when any of its three input files is missing: the raw data file,
/// the TSV result file, or the file handed to the <c>DiaRankScore</c> constructor.
/// </remarks>
public void DiaRankScore()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dataFile = @"\\protoapps\UserData\Wilkins\BottomUp\HCD_QCShew\raw\QC_Shew_13_04_A_17Feb14_Samwise_13-07-28.raw";
    const string tsvFile = @"\\protoapps\UserData\Wilkins\BottomUp\HCD_QCShew\tsv\QC_Shew_13_04_A_17Feb14_Samwise_13-07-28.tsv";
    const string rankScorerFile = @"C:\Users\wilk011\Documents\DataFiles\TestFolder\HCD_QExactive_Tryp.txt";
    const string outputFile = @"C:\Users\wilk011\Documents\DataFiles\TestFolder\HCD_QCShew_Score_2.txt";

    // All inputs get the same treatment: a missing file skips the test instead of crashing it.
    // The output file lives in the same folder as rankScorerFile, so that check also covers its directory.
    foreach (var inputFile in new[] { dataFile, tsvFile, rankScorerFile })
    {
        if (!File.Exists(inputFile))
        {
            Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, inputFile);
        }
    }

    var parser = new TsvFileParser(tsvFile);
    var sequences = parser.GetData("Peptide");
    var charges = parser.GetData("Charge");
    var scans = parser.GetData("ScanNum");

    var lcms = InMemoryLcMsRun.GetLcMsRun(dataFile, 0, 0);
    var rankScorer = new DiaRankScore(rankScorerFile);

    using (var outFile = new StreamWriter(outputFile))
    {
        outFile.WriteLine("Target\tDecoy");

        for (int i = 0; i < sequences.Count; i++)
        {
            string sequenceStr = sequences[i];
            int charge = Convert.ToInt32(charges[i]);
            int scan = Convert.ToInt32(scans[i]);

            var sequence = Sequence.GetSequenceFromMsGfPlusPeptideStr(sequenceStr);

            // Decoy: parse the same peptide again, reverse it, turn it back into a string,
            // mutate that string and parse the mutated string.
            var decoySeq = Sequence.GetSequenceFromMsGfPlusPeptideStr(sequenceStr);
            decoySeq.Reverse();

            // NOTE(review): this concatenates each amino acid via its ToString(), not its Residue
            // property - confirm that ToString() yields text GetSequenceFromMsGfPlusPeptideStr can parse.
            var decoyStr = decoySeq.Aggregate("", (current, aa) => current + aa);
            decoyStr = SimpleStringProcessing.Mutate(decoyStr, sequence.Count / 2);
            decoySeq = Sequence.GetSequenceFromMsGfPlusPeptideStr(decoyStr);

            var sequenceScore = rankScorer.GetScore(sequence, charge, scan, lcms);
            var decoyScore = rankScorer.GetScore(decoySeq, charge, scan, lcms);
            outFile.WriteLine("{0}\t{1}", sequenceScore, decoyScore);
        }
    }
}
/// <summary>
/// Create the decoy version of this database.
/// </summary>
/// <remarks>
/// Every protein entry of the source FASTA file is written to the decoy file with
/// <c>FastaDatabaseConstants.DecoyProteinPrefix</c> prepended to its name. Nothing is written when
/// the source file cannot be opened. The FASTA reader is closed even if writing fails part-way.
/// </remarks>
/// <param name="enzyme">
/// Optional enzyme. When it is non-null and has at least one residue, a cleavable residue met while
/// reversing is placed before the previously emitted residue instead of after it.
/// Only used when <paramref name="shuffle"/> is false.
/// </param>
/// <param name="shuffle">
/// When true, each sequence is shuffled and then mutated (<c>NumMutations</c> is passed to Mutate);
/// when false, each sequence is reversed.
/// </param>
public void CreateDecoyDatabase(Enzyme enzyme, bool shuffle)
{
    var reader = new FastaFileReader();
    if (!reader.OpenFile(_databaseFilePath))
    {
        return;
    }

    try
    {
        var decoyDatabaseFileName = GetDecoyDatabasePath(enzyme, shuffle);
        Console.WriteLine("Creating " + decoyDatabaseFileName);

        using (var decoyWriter = new StreamWriter(decoyDatabaseFileName))
        {
            while (reader.ReadNextProteinEntry())
            {
                var name = reader.ProteinName;
                var description = reader.ProteinDescription;
                var sequence = reader.ProteinSequence;

                decoyWriter.WriteLine(">{0}_{1} {2}", FastaDatabaseConstants.DecoyProteinPrefix, name, description);

                if (shuffle)
                {
                    // Shuffled protein sequences
                    decoyWriter.WriteLine(SimpleStringProcessing.Mutate(SimpleStringProcessing.Shuffle(sequence), NumMutations));
                    continue;
                }

                // Reversed protein sequence
                var decoySequence = new StringBuilder();
                for (var i = sequence.Length - 1; i >= 0; i--)
                {
                    var residue = sequence[i];
                    var swapWithPrevious = enzyme != null
                                           && enzyme.Residues.Length > 0
                                           && enzyme.IsCleavable(residue)
                                           && decoySequence.Length > 0;

                    if (swapWithPrevious)
                    {
                        // Equivalent to removing the last character, appending the residue and
                        // re-appending the removed character.
                        decoySequence.Insert(decoySequence.Length - 1, (char)residue);
                    }
                    else
                    {
                        decoySequence.Append((char)residue);
                    }
                }

                decoyWriter.WriteLine(decoySequence);
            }
        }
    }
    finally
    {
        // Previously skipped whenever an exception escaped the loop above.
        reader.CloseFile();
    }
}
/// <summary>
/// Checks that <c>SimpleStringProcessing.Shuffle</c> returns a permutation of its input: after sorting
/// their characters, the original and the shuffled string must be equal.
/// </summary>
public void TestStringShuffling()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    // Histone H4
    const string str = "MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKRISGLIYEETRGVLKVFLENVIRDAVTYTEHAKRKTVTAMDVVYALKRQGRTLYGFGG";

    var shuffled = SimpleStringProcessing.Shuffle(str);

    // Sorting removes the ordering, leaving only the character counts to compare.
    Func<string, string> sortCharacters = text => new string(text.OrderBy(ch => ch).ToArray());
    var originalSorted = sortCharacters(str);
    var shuffledSorted = sortCharacters(shuffled);

    Assert.IsTrue(originalSorted == shuffledSorted);
}
/// <summary>
/// Re-scores the identifications of a tab-separated result file and writes each scored row, followed
/// by its scores, to a new tab-separated file.
/// </summary>
/// <remarks>
/// The output starts with the input headers followed by <c>IcScores.GetScoreNames()</c>. A row is left
/// out when its sequence cannot be extracted, when the sequence contains '(' (modified
/// identifications are not handled yet), or when the scorer returns null.
/// </remarks>
/// <param name="msAlignFilePath">Input file; its "Peptide", "Scan(s)" and "Charge" columns are read.</param>
/// <param name="outputFilePath">File the re-scored rows are written to.</param>
private void Rescore(string msAlignFilePath, string outputFilePath)
{
    var parser = new TsvFileParser(msAlignFilePath);

    var peptideColumn = parser.GetData("Peptide");
    var scanNumbers = parser.GetData("Scan(s)").Select(text => Convert.ToInt32(text)).ToArray();
    var chargeStates = parser.GetData("Charge").Select(text => Convert.ToInt32(text)).ToArray();

    var inputRows = parser.GetRows();
    var inputHeaders = parser.GetHeaders();

    using (var writer = new StreamWriter(outputFilePath))
    {
        writer.WriteLine("{0}\t{1}", string.Join("\t", inputHeaders), IcScores.GetScoreNames());

        for (var index = 0; index < parser.NumData; index++)
        {
            var inputRow = inputRows[index];
            var seqStr = SimpleStringProcessing.GetStringBetweenDots(peptideColumn[index]);

            //TODO: currently ignore ids with modifications
            var isUnmodifiedSequence = seqStr != null && !seqStr.Contains("(");
            if (!isUnmodifiedSequence)
            {
                continue;
            }

            var composition = AASet.GetComposition(seqStr);
            var charge = chargeStates[index];
            var scanNum = scanNumbers[index];

            var scores = _topDownScorer.GetScores(
                AminoAcid.ProteinNTerm,
                seqStr,
                AminoAcid.ProteinCTerm,
                composition,
                charge,
                scanNum);

            if (scores != null)
            {
                writer.WriteLine("{0}\t{1}", inputRow, scores);
            }
        }
    }
}
/// <summary>
/// Mutates a fixed protein sequence with <c>SimpleStringProcessing.Mutate</c>, asserts that the length
/// is unchanged, and prints how many positions differ from the original.
/// </summary>
/// <remarks>
/// The number of differing positions is only reported, not asserted.
/// </remarks>
public void TestStringMutation()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    // Histone H4
    const string original = "MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKRISGLIYEETRGVLKVFLENVIRDAVTYTEHAKRKTVTAMDVVYALKRQGRTLYGFGG";
    const int numMutations = 3;

    var mutated = SimpleStringProcessing.Mutate(original, numMutations);
    Console.WriteLine(mutated);

    Assert.IsTrue(original.Length == mutated.Length);

    // Position-by-position comparison; both strings have the same length at this point.
    var differingPositions = 0;
    for (var position = 0; position < original.Length; position++)
    {
        if (original[position] != mutated[position])
        {
            differingPositions++;
        }
    }

    Console.WriteLine("Mutations: {0}", differingPositions);
}
/// <summary>
/// Builds a sequence graph for one annotated protein with at most one modification per protein, then,
/// for every proteoform composition of that graph, prints the fragment score of one MS2 scan and the
/// scores returned by an <c>InformedTopDownScorer</c> for the same scan.
/// </summary>
/// <remarks>
/// The test is ignored when the raw file is not present. Apart from the scorer null check it makes no
/// assertions; results are written to the console.
/// </remarks>
public void TestPrSm()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    //const string specFilePath = @"C:\cygwin\home\kims336\Data\TopDownYufeng\raw\yufeng_column_test2.raw";
    //const string annotation =
    //    "_.MKTKLSVLSAAMLAATLTMMPAVSQAAIPQSVEGQSIPSLAPMLERTTPAVVSVAVSGTHVSKQRVPDVFRYFFGPNAPQEQVQERPFRGLGSGVIIDADKGYIVTNNHVIDGADDIQVG" +
    //    "LHDGREVKAKLIGTDSESDIALLQIEAKNLVAIKTSDSDELRVGDFAVAIGNPFGLGQTV" +
    //    "TSGIVSALGRSGLGIEMLENFIQTDAAINSGNSGGALVNLKGELIGINTAIVAPNGGNVG" +
    //    "IGFAIPANMVKNLIAQIAEHGEVRRGVLGIAGRDLDSQLAQGFGLDTQHGGFVNEVSAGS" +
    //    "AAEKAGIKAGDIIVSVDGRAIKSFQELRAKVATMGAGAKVELGLIRDGDKKTVNVTLGEA" +
    //    "NQTTEKAAGAVHPMLQGASLENASKGVEITDVAQGSPAAMSGLQKGDLIVGINRTAVKDL" +
    //    "KSLKELLKDQEGAVALKIVRGKSMLYLVLR._";
    //var aaSet = new AminoAcidSet();
    //const int charge = 60;
    //const int ms2ScanNum = 46661;

    const string specFilePath = @"D:\Research\Data\Jon\AH_SF_mouseliver_3-1_Intact_2_6Feb14_Bane_PL011402.raw";
    if (!File.Exists(specFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, specFilePath);
    }

    const int ms2ScanNum = 19011;
    const int charge = 7;
    const string annotation = "_.SKVSFKITLTSDPRLPYKVLSVPESTPFTAVLKFAAEEFKVPAATSAIITNDGIGINPAQTAGNVFLKHGSELRIIPRDRVGSC._";

    // Search modifications: "AddVal" on every standard residue, plus acetylation at the protein N-terminus.
    var acetylN = new SearchModification(Modification.Acetylation, '*', SequenceLocation.ProteinNTerm, true);
    var modVal = Modification.RegisterAndGetModification("AddVal", new Composition(5, 9, 1, 1, 0));
    var searchMods = AminoAcid.StandardAminoAcidCharacters
        .Select(residue => new SearchModification(modVal, residue, SequenceLocation.Everywhere, false))
        .ToList();
    searchMods.Add(acetylN);

    const int numMaxModsPerProtein = 1;
    var aaSet = new AminoAcidSet(searchMods, numMaxModsPerProtein);

    var graph = SequenceGraph.CreateGraph(aaSet, annotation);
    Console.WriteLine("NumProteoforms: " + graph.GetNumProteoformCompositions());

    var run = InMemoryLcMsRun.GetLcMsRun(specFilePath, 1.4826, 1.4826);
    var ms2Scorer = new ProductScorerBasedOnDeconvolutedSpectra(run, 1, 15);

    // NOTE(review): the return value is discarded; kept in case the call has a side effect
    // that GetMs2Scorer relies on - confirm.
    ms2Scorer.GetScorer(ms2ScanNum);
    var scorer = ms2Scorer.GetMs2Scorer(ms2ScanNum);
    Assert.NotNull(scorer, "Scorer is null!");

    // Neither of these depends on the loop variable, so they are created once.
    var informedScorer = new InformedTopDownScorer(run, aaSet, 1, 30, new Tolerance(10));
    var sequence = SimpleStringProcessing.GetStringBetweenDots(annotation);

    for (var i = 0; i < graph.GetNumProteoformCompositions(); i++)
    {
        graph.SetSink(i);
        Console.WriteLine("ModComb: " + graph.GetModificationCombinations()[i]);

        var score = graph.GetFragmentScore(scorer);
        Console.WriteLine("Fast search score: " + score);

        var composition = graph.GetSinkSequenceCompositionWithH2O();
        var refinedScore = informedScorer.GetScores(
            AminoAcid.ProteinNTerm,
            sequence,
            AminoAcid.ProteinCTerm,
            composition,
            charge,
            ms2ScanNum);

        // Other callers of GetScores in this code treat a null result as "no score";
        // do the same here instead of failing with a NullReferenceException.
        if (refinedScore == null)
        {
            Console.WriteLine("Composition: {0}", composition);
            Console.WriteLine("RefinedScores: none returned");
            continue;
        }

        Console.WriteLine("Modifications: {0}", refinedScore.Modifications);
        Console.WriteLine("Composition: {0}", composition);
        Console.WriteLine("RefinedScores: {0}", refinedScore);
    }
}
/// <summary>
/// Measures how well an MS1-based sequence filter retains the identifications of a result file.
/// </summary>
/// <remarks>
/// Steps, in order:
/// 1. load the raw file (timed);
/// 2. build an <c>Ms1IsotopeAndChargeCorrFilter</c> for the run (timed);
/// 3. for every bin number between the bins of the minimum and maximum protein mass, count the MS2
///    scans the filter returns for that bin (timed);
/// 4. for every row of the result file with an E-value of at most 1E-4 and a sequence without '(',
///    print correlation scores and whether the filter returns the row's scan for the sequence mass;
/// 5. print summary counts.
/// The test is ignored when the raw file or the result file is missing. It makes no assertions.
/// </remarks>
public void FilteringEfficiencyQcShew()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    // --- Section 1: load the run ---
    var sw = new System.Diagnostics.Stopwatch();
    sw.Start();
    const string rawFilePath = @"C:\cygwin\home\kims336\Data\TopDownQCShew\raw\QC_ShewIntact_2ug_3k_CID_4Apr14_Bane_PL011402.raw";
    if (!File.Exists(rawFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
    }
    var run = InMemoryLcMsRun.GetLcMsRun(rawFilePath, 1.4826, 1.4826);
    sw.Stop();
    Console.WriteLine(@"Reading run: {0:f4} sec", sw.Elapsed.TotalSeconds);

    const int minPrecursorCharge = 3;
    const int maxPrecursorCharge = 30;
    const int tolerancePpm = 10;
    // Used below for the correlation scores; the filter gets its own Tolerance instance.
    var tolerance = new Tolerance(tolerancePpm);

    // --- Section 2: build the MS1 filter ---
    sw.Reset();
    sw.Start();
    var ms1BasedFilter = new Ms1IsotopeAndChargeCorrFilter(run, new Tolerance(10.0), minPrecursorCharge, maxPrecursorCharge, 3000, 50000, 0.7, 0.7, 0.7, 40);
    //var ms1BasedFilter = new Ms1IsotopeCorrFilter(run, minPrecursorCharge, maxPrecursorCharge, 15, 0.5, 40);
    sw.Stop();
    Console.WriteLine(@"Ms1 filter: {0:f4} sec", sw.Elapsed.TotalSeconds);

    ISequenceFilter ms1Filter = ms1BasedFilter;

    // --- Section 3: count the matching MS2 scans for every bin in the mass range ---
    sw.Reset();
    sw.Start();
    const double minProteinMass = 3000.0;
    const double maxProteinMass = 30000.0;
    var minBinNum = ProductScorerBasedOnDeconvolutedSpectra.GetBinNumber(minProteinMass);
    var maxBinNum = ProductScorerBasedOnDeconvolutedSpectra.GetBinNumber(maxProteinMass);
    var numComparisons = 0L;
    for (var binNum = minBinNum; binNum <= maxBinNum; binNum++)
    {
        // The value returned by GetMz for the bin is passed to the filter as the sequence mass.
        var mass = ProductScorerBasedOnDeconvolutedSpectra.GetMz(binNum);
        numComparisons += ms1Filter.GetMatchingMs2ScanNums(mass).Count();
    }
    sw.Stop();
    Console.WriteLine(@"Calculating #matches per bin: {0:f4} sec", sw.Elapsed.TotalSeconds);

    //const string prot =
    //    "ADVFHLGLTKAMLDGATLAIVPGDPERVKRIAELMDNATFLASHREYTSYLAYADGKPVVICSTGIGGPSTSIAVEELAQLGVNTFLRVGTTGAIQPHVNVGDVIVTQASVRLDGASLHFAPMEFPAVANFECTTAMVAACRDAGVEPHIGVTASSDTFYPGQERYDTVTGRVTRRFAGSMKEWQDMGVLNYEMESATLFTMCATQGWRAACVAGVIVNRTQQEIPDEATMKKTEVSAVSIVVAAAKKLLA";
    //var protMass = (new AminoAcidSet().GetComposition(prot) + Composition.H2O).Mass;
    //Console.WriteLine("************ScanNums: " + string.Join("\t", ms1Filter.GetMatchingMs2ScanNums(protMass)));

    // --- Section 4: evaluate the filter against the identifications of the result file ---
    const string resultFilePath = @"C:\cygwin\home\kims336\Data\TopDownQCShew\MSAlign\NoMod.tsv";
    if (!File.Exists(resultFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
    }

    var tsvReader = new TsvFileParser(resultFilePath);
    var scanNums = tsvReader.GetData("Scan(s)");
    var charges = tsvReader.GetData("Charge");
    var scores = tsvReader.GetData("E-value");
    var sequences = tsvReader.GetData("Peptide");

    //const string resultFilePath = @"C:\cygwin\home\kims336\Data\TopDownQCShew\raw\QC_ShewIntact_2ug_3k_CID_4Apr14_Bane_PL011402_N30_C30.tsv";
    //var tsvReader = new TsvFileParser(resultFilePath);
    //var scanNums = tsvReader.GetData("ScanNum");
    //var charges = tsvReader.GetData("Charge");
    //var scores = tsvReader.GetData("Score");
    //var sequences = tsvReader.GetData("Sequence");

    var aaSet = new AminoAcidSet();

    // seqSet: sequences whose scan passed the filter; allSeqSet: every sequence that was evaluated.
    var seqSet = new HashSet <string>();
    var allSeqSet = new HashSet <string>();
    var numUnfilteredSpecs = 0;
    var totalSpecs = 0;
    for (var i = 0; i < scores.Count; i++)
    {
        // Only rows with an E-value of at most 1E-4 are evaluated.
        var score = Convert.ToDouble(scores[i]);
        if (score > 1E-4)
        {
            continue;
        }
        //if (score < 10) continue;

        var scanNum = Convert.ToInt32(scanNums[i]);
        var charge = Convert.ToInt32(charges[i]);

        // Rows whose sequence cannot be extracted or contains '(' are skipped.
        var sequence = SimpleStringProcessing.GetStringBetweenDots(sequences[i]);
        if (sequence == null || sequence.Contains("("))
        {
            continue;
        }
        //var sequence = sequences[i];

        var composition = aaSet.GetComposition(sequence) + Composition.H2O;
        var precursorIon = new Ion(composition, charge);

        // The row counts only if its scan is a ProductSpectrum whose isolation window contains
        // the most abundant isotope m/z of the precursor ion.
        var isValid = run.GetSpectrum(scanNum) is ProductSpectrum spec && spec.IsolationWindow.Contains(precursorIon.GetMostAbundantIsotopeMz());
        if (!isValid)
        {
            continue;
        }
        ++totalSpecs;

        // corr1: correlation score of the precursor ion in the precursor scan.
        var precursorScanNum = run.GetPrecursorScanNum(scanNum);
        var precursorSpec = run.GetSpectrum(precursorScanNum);
        var corr1 = precursorSpec.GetCorrScore(precursorIon, tolerance, 0.1);

        // corr2: the same score in the scan returned by GetNextScanNum(scanNum, 1).
        // NOTE(review): the second argument is presumably the MS level - confirm.
        var nextScanNum = run.GetNextScanNum(scanNum, 1);
        var nextSpec = run.GetSpectrum(nextScanNum);
        var corr2 = nextSpec.GetCorrScore(precursorIon, tolerance, 0.1);

        // corr3: 1 when the filter returns this scan for the sequence mass, otherwise 0.
        var corr3 = ms1Filter.GetMatchingMs2ScanNums(composition.Mass).Contains(scanNum) ? 1 : 0;
        if (corr3 == 1)
        {
            numUnfilteredSpecs++;
            seqSet.Add(sequences[i]);
        }
        allSeqSet.Add(sequences[i]);

        var corrMax = new[] { corr1, corr2, corr3 }.Max();

        Console.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}", scanNum, precursorScanNum, corr1, nextScanNum, corr2, corr3, corrMax);
    }

    // --- Section 5: summary ---
    Console.WriteLine("TotalNumComparisons: {0}", numComparisons);
    Console.WriteLine("AverageNumComparisons: {0:f2}", numComparisons / (double)(maxBinNum - minBinNum + 1));
    Console.WriteLine("SuccessRate: {0:f2} {1} / {2}", numUnfilteredSpecs / (double)totalSpecs, numUnfilteredSpecs, totalSpecs);
    Console.WriteLine("NumUniqueSequences: {0:f2}, {1} / {2}", seqSet.Count / (double)allSeqSet.Count, seqSet.Count, allSeqSet.Count);

    // NOTE(review): the stopwatch was stopped after the per-bin loop and never restarted, so this
    // prints the same duration as "Calculating #matches per bin", not the time of the loop above.
    Console.WriteLine(@"Elapsed Time: {0:f4} sec", sw.Elapsed.TotalSeconds);
}