/// <summary>
/// Writes a tab-separated table describing the spiked-in peptides found in a search result file.
/// </summary>
/// <remarks>
/// A result row is used only when it is the first row of a run of rows sharing one scan number,
/// is valid, is not a decoy hit, has a peptide-level Q-value of at most 0.01, and its peptide
/// (after stripping the C+57.021, K+8.014 and R+10.008 annotations) has not been seen before.
/// For such a peptide that is listed in the spiked-in peptide file, the precursor XIC of the most
/// abundant isotope is extracted (10 ppm) and one output row is written.
/// The number of rows written is printed to the console.
/// </remarks>
/// <param name="resultFilePath">
/// Path of the result file to read. Its file name without extension is also used to locate the
/// matching .raw file under the hard-coded raw data folder.
/// </param>
/// <param name="outputFilePath">Path of the tab-separated file to create.</param>
public void GeneratePrmInfo(string resultFilePath, string outputFilePath)
{
    Console.Write("Processing {0}", Path.GetFileName(resultFilePath));
    Console.Out.Flush();

    var rawFilePath = @"D:\Research\Data\EDRN\DDA\raw\" + Path.GetFileNameWithoutExtension(resultFilePath) + ".raw";

    // NOTE(review): the reader is never closed or disposed in this method - confirm whether
    // XCaliburReader requires explicit cleanup.
    var reader = new XCaliburReader(rawFilePath);
    var run = InMemoryLcMsRun.GetLcMsRun(rawFilePath);
    var tolerance = new Tolerance(10, ToleranceUnit.Ppm);

    const string spikedInPeptideFile = @"D:\Research\Data\EDRN\SpikedPeptides.txt";
    var spikedInPepSet = new HashSet<string>(File.ReadAllLines(spikedInPeptideFile));

    const double qValueThreshold = 0.01;

    var pepSet = new HashSet<string>();
    MsGfPlusHeaderInformation headerInfo = null;

    var numPeptides = 0;
    var prevScanNum = -1;

    using (var writer = new StreamWriter(outputFilePath))
    {
        writer.WriteLine("Peptide\tCharge\tMonoMz\tMostAbundantMz\tMs2ScanNum\tRtMs2\tRtApex\tRtStart\tRtEnd\tSpecEValue\tPepQValue");

        foreach (var line in File.ReadLines(resultFilePath))
        {
            // Header rows start with '#'; ordinal comparison avoids culture-dependent matching.
            if (line.StartsWith("#", StringComparison.Ordinal))
            {
                headerInfo = new MsGfPlusHeaderInformation(line);
                continue;
            }

            var match = new MsGfMatch(line, headerInfo);

            // Only the first row of consecutive rows with the same scan number is considered.
            if (match.ScanNum == prevScanNum)
            {
                continue;
            }
            prevScanNum = match.ScanNum;

            if (!match.IsValid || match.Protein.StartsWith(FastaDatabase.DecoyProteinPrefix, StringComparison.Ordinal))
            {
                continue;
            }

            if (match.PepQValue > qValueThreshold)
            {
                continue;
            }

            var peptide = match.Peptide.Replace("C+57.021", "C").Replace("K+8.014", "K").Replace("R+10.008", "R");

            // Add returns false when the peptide was already recorded (single lookup instead of Contains + Add).
            // NOTE(review): the peptide is recorded before the XIC check below, so a peptide whose first
            // accepted match yields an empty XIC is never written - confirm this is intended.
            if (!pepSet.Add(peptide))
            {
                continue;
            }

            if (!spikedInPepSet.Contains(peptide))
            {
                continue;
            }

            var ion = new Ion(match.Formula, match.Charge);
            var mostAbundantIonMz = ion.GetMostAbundantIsotopeMz();
            var xic = run.GetPrecursorExtractedIonChromatogram(mostAbundantIonMz, tolerance, match.ScanNum);
            if (xic.Count == 0)
            {
                continue;
            }

            var minScan = xic.Min().ScanNum;
            var maxScan = xic.Max().ScanNum;

            writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}",
                peptide,
                match.Charge,
                ion.GetMonoIsotopicMz(),
                mostAbundantIonMz,
                match.ScanNum,
                reader.RtFromScanNum(match.ScanNum),
                reader.RtFromScanNum(xic.GetApexScanNum()), // Rt apex
                reader.RtFromScanNum(minScan),              // Rt start
                reader.RtFromScanNum(maxScan),              // Rt end
                match.SpecEValue,
                match.PepQValue);
            ++numPeptides;
        }
    }

    Console.WriteLine("\t{0}", numPeptides);
}
/// <summary>
/// Prints, for every *.tsv result file in the IPRG2014 result folder, the number of accepted
/// matches and how many of them start with the "sp|" accession of each of four reference proteins.
/// </summary>
/// <remarks>
/// A row is accepted when it is the first row of a run of rows sharing one scan number, is valid,
/// is not a decoy hit, and has a Q-value of at most 0.01. When the data folders are missing,
/// Assert.Ignore is called instead of processing any file.
/// </remarks>
public void TestAbrfSpecCount()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dir = @"D:\Research\Data\IPRG2014";
    if (!Directory.Exists(dir))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, dir);
    }

    const double qValueThreshold = 0.01;
    var names = new[] { "ENO1_YEAST", "ADH1_YEAST", "CYC_BOVIN", "ALBU_BOVIN" };
    var accessions = new[] { "P00924", "P00330", "P62894", "P02769" };

    const string resultDir = dir + @"\10ppm_TI0_NTT1";
    if (!Directory.Exists(resultDir))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, resultDir);
    }

    // Build the protein prefixes once instead of concatenating them for every match.
    var accessionPrefixes = new string[accessions.Length];
    for (var i = 0; i < accessions.Length; i++)
    {
        accessionPrefixes[i] = "sp|" + accessions[i];
    }

    Console.WriteLine("Run\tTotal PSM\t" + string.Join("\t", names));

    foreach (var msgfResultFile in Directory.GetFiles(resultDir, "*.tsv"))
    {
        // Take the id from the file name only: the result directory name itself contains '_',
        // so searching the full path could pick up part of the directory name.
        var fileName = Path.GetFileName(msgfResultFile);
        var fileId = fileName.Substring(fileName.LastIndexOf('_') + 1, 3);
        Console.Write(fileId);

        var totalPsm = 0;
        var specCount = new int[accessions.Length];

        MsGfPlusHeaderInformation headerInfo = null;
        var prevScanNum = -1;

        foreach (var line in File.ReadLines(msgfResultFile))
        {
            // Header rows start with '#'; ordinal comparison avoids culture-dependent matching.
            if (line.StartsWith("#", StringComparison.Ordinal))
            {
                headerInfo = new MsGfPlusHeaderInformation(line);
                continue;
            }

            var match = new MsGfMatch(line, headerInfo);

            // Only the first row of consecutive rows with the same scan number is considered.
            if (match.ScanNum == prevScanNum)
            {
                continue;
            }
            prevScanNum = match.ScanNum;

            if (!match.IsValid || match.Protein.StartsWith(FastaDatabase.DecoyProteinPrefix, StringComparison.Ordinal))
            {
                continue;
            }

            if (match.QValue > qValueThreshold)
            {
                continue;
            }

            totalPsm++;

            for (var i = 0; i < accessionPrefixes.Length; i++)
            {
                if (match.Protein.StartsWith(accessionPrefixes[i], StringComparison.Ordinal))
                {
                    specCount[i]++;
                }
            }
        }

        Console.Write("\t" + totalPsm);
        for (var i = 0; i < accessions.Length; i++)
        {
            Console.Write("\t{0}", specCount[i]);
        }
        Console.WriteLine();
    }
}
/// <summary>
/// Counts accepted matches per protein and per result file for the IPRG2015 data and writes the
/// counts to SpecCountAllProteins.tsv.
/// </summary>
/// <remarks>
/// A row is accepted when it is the first row of a run of rows sharing one scan number, is valid,
/// is not a decoy hit, and has a Q-value of at most 0.01. The Protein field is split on ';' and
/// each entry is counted under its name with the trailing "(pre=..." annotation removed.
/// Every output row holds the protein id, its length from the FASTA database, one count per
/// result file, and a flag that is 1 when the id starts with "sp|". When the data folders are
/// missing, Assert.Ignore is called instead of processing any file.
/// </remarks>
public void GenerateAbrfSpecCountAllProteins()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dir = @"H:\Research\IPRG2015";
    if (!Directory.Exists(dir))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, dir);
    }

    const double qValueThreshold = 0.01;

    const string resultDir = dir + @"\NTT1";
    if (!Directory.Exists(resultDir))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, resultDir);
    }

    // Directory.GetFiles already returns an array, so no extra copy is needed.
    var msgfResultFiles = Directory.GetFiles(resultDir, "*.tsv");

    var specCount = new Dictionary<string, int[]>(); // protein name => array of counts (one slot per result file)

    for (var i = 0; i < msgfResultFiles.Length; i++)
    {
        var msgfResultFile = msgfResultFiles[i];

        MsGfPlusHeaderInformation headerInfo = null;
        var prevScanNum = -1;

        foreach (var line in File.ReadLines(msgfResultFile))
        {
            // Header rows start with '#'; ordinal comparison avoids culture-dependent matching.
            if (line.StartsWith("#", StringComparison.Ordinal))
            {
                headerInfo = new MsGfPlusHeaderInformation(line);
                continue;
            }

            var match = new MsGfMatch(line, headerInfo);

            // Only the first row of consecutive rows with the same scan number is considered.
            if (match.ScanNum == prevScanNum)
            {
                continue;
            }
            prevScanNum = match.ScanNum;

            if (!match.IsValid || match.Protein.StartsWith(FastaDatabase.DecoyProteinPrefix, StringComparison.Ordinal))
            {
                continue;
            }

            if (match.QValue > qValueThreshold)
            {
                continue;
            }

            foreach (var protein in match.Protein.Split(';'))
            {
                // Strip the trailing "(pre=..." annotation. If it is missing, keep the whole entry
                // rather than failing with an ArgumentOutOfRangeException from Substring.
                var annotationStart = protein.LastIndexOf("(pre=", StringComparison.Ordinal);
                var proteinName = annotationStart >= 0 ? protein.Substring(0, annotationStart) : protein;

                // Reuse the array returned by TryGetValue instead of looking the key up again.
                int[] countArr;
                if (!specCount.TryGetValue(proteinName, out countArr))
                {
                    countArr = new int[msgfResultFiles.Length];
                    specCount[proteinName] = countArr;
                }
                countArr[i]++;
            }
        }
    }

    // Writing
    const string databaseFilePath = dir + @"\database\iPRG2015.fasta";
    var database = new FastaDatabase(databaseFilePath);
    database.Read();

    const string outputFilePath = dir + @"\SpecCountAllProteins.tsv";
    using (var writer = new StreamWriter(outputFilePath))
    {
        // The column id is the text after the '_' of "_sample" up to (not including) the last '.'.
        var fileIds = msgfResultFiles.Select(f =>
        {
            var sampleIndex = f.IndexOf("_sample", StringComparison.Ordinal);
            return f.Substring(sampleIndex + 1, f.LastIndexOf('.') - sampleIndex - 1);
        });
        writer.WriteLine("Protein\tLength\t" + string.Join("\t", fileIds) + "\tSpikeIn");

        foreach (var entry in specCount)
        {
            var proteinId = entry.Key;
            var length = database.GetProteinLength(proteinId);
            Assert.True(length > 0);

            var counts = entry.Value;
            Assert.True(counts.Length == msgfResultFiles.Length);

            // Every "sp|" entry is flagged as spike-in.
            var spikeIn = proteinId.StartsWith("sp|", StringComparison.Ordinal) ? 1 : 0;

            writer.WriteLine("{0}\t{1}\t{2}\t{3}", proteinId, length, string.Join("\t", counts), spikeIn);
        }
    }
}
/// <summary>
/// Prints, for every *.tsv result file in the IPRG2014 result folder, the number of accepted
/// matches and how many of them start with the "sp|" accession of each of four reference proteins.
/// </summary>
/// <remarks>
/// A row is accepted when it is the first row of a run of rows sharing one scan number, is valid,
/// is not a decoy hit, and has a Q-value of at most 0.01. When the data folders are missing,
/// Assert.Ignore is called instead of processing any file.
/// </remarks>
public void TestAbrfSpecCount()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dir = @"D:\Research\Data\IPRG2014";
    if (!Directory.Exists(dir))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, dir);
    }

    const double qValueThreshold = 0.01;
    var names = new[] { "ENO1_YEAST", "ADH1_YEAST", "CYC_BOVIN", "ALBU_BOVIN" };
    var accessions = new[] { "P00924", "P00330", "P62894", "P02769" };

    const string resultDir = dir + @"\10ppm_TI0_NTT1";
    if (!Directory.Exists(resultDir))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, resultDir);
    }

    // Build the protein prefixes once instead of concatenating them for every match.
    var accessionPrefixes = new string[accessions.Length];
    for (var i = 0; i < accessions.Length; i++)
    {
        accessionPrefixes[i] = "sp|" + accessions[i];
    }

    Console.WriteLine("Run\tTotal PSM\t" + string.Join("\t", names));

    foreach (var msgfResultFile in Directory.GetFiles(resultDir, "*.tsv"))
    {
        // Take the id from the file name only: the result directory name itself contains '_',
        // so searching the full path could pick up part of the directory name.
        var fileName = Path.GetFileName(msgfResultFile);
        var fileId = fileName.Substring(fileName.LastIndexOf('_') + 1, 3);
        Console.Write(fileId);

        var totalPsm = 0;
        var specCount = new int[accessions.Length];

        MsGfPlusHeaderInformation headerInfo = null;
        var prevScanNum = -1;

        foreach (var line in File.ReadLines(msgfResultFile))
        {
            // Header rows start with '#'; ordinal comparison avoids culture-dependent matching.
            if (line.StartsWith("#", StringComparison.Ordinal))
            {
                headerInfo = new MsGfPlusHeaderInformation(line);
                continue;
            }

            var match = new MsGfMatch(line, headerInfo);

            // Only the first row of consecutive rows with the same scan number is considered.
            if (match.ScanNum == prevScanNum)
            {
                continue;
            }
            prevScanNum = match.ScanNum;

            if (!match.IsValid || match.Protein.StartsWith(FastaDatabase.DecoyProteinPrefix, StringComparison.Ordinal))
            {
                continue;
            }

            if (match.QValue > qValueThreshold)
            {
                continue;
            }

            totalPsm++;

            for (var i = 0; i < accessionPrefixes.Length; i++)
            {
                if (match.Protein.StartsWith(accessionPrefixes[i], StringComparison.Ordinal))
                {
                    specCount[i]++;
                }
            }
        }

        Console.Write("\t" + totalPsm);
        for (var i = 0; i < accessions.Length; i++)
        {
            Console.Write("\t{0}", specCount[i]);
        }
        Console.WriteLine();
    }
}
/// <summary>
/// Counts accepted matches per protein and per result file for the IPRG2015 data and writes the
/// counts to SpecCountAllProteins.tsv.
/// </summary>
/// <remarks>
/// A row is accepted when it is the first row of a run of rows sharing one scan number, is valid,
/// is not a decoy hit, and has a Q-value of at most 0.01. The Protein field is split on ';' and
/// each entry is counted under its name with the trailing "(pre=..." annotation removed.
/// Every output row holds the protein id, its length from the FASTA database, one count per
/// result file, and a flag that is 1 when the id starts with "sp|". When the data folders are
/// missing, Assert.Ignore is called instead of processing any file.
/// </remarks>
public void GenerateAbrfSpecCountAllProteins()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dir = @"H:\Research\IPRG2015";
    if (!Directory.Exists(dir))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, dir);
    }

    const double qValueThreshold = 0.01;

    const string resultDir = dir + @"\NTT1";
    if (!Directory.Exists(resultDir))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, resultDir);
    }

    // Directory.GetFiles already returns an array, so no extra copy is needed.
    var msgfResultFiles = Directory.GetFiles(resultDir, "*.tsv");

    var specCount = new Dictionary<string, int[]>(); // protein name => array of counts (one slot per result file)

    for (var i = 0; i < msgfResultFiles.Length; i++)
    {
        var msgfResultFile = msgfResultFiles[i];

        MsGfPlusHeaderInformation headerInfo = null;
        var prevScanNum = -1;

        foreach (var line in File.ReadLines(msgfResultFile))
        {
            // Header rows start with '#'; ordinal comparison avoids culture-dependent matching.
            if (line.StartsWith("#", StringComparison.Ordinal))
            {
                headerInfo = new MsGfPlusHeaderInformation(line);
                continue;
            }

            var match = new MsGfMatch(line, headerInfo);

            // Only the first row of consecutive rows with the same scan number is considered.
            if (match.ScanNum == prevScanNum)
            {
                continue;
            }
            prevScanNum = match.ScanNum;

            if (!match.IsValid || match.Protein.StartsWith(FastaDatabase.DecoyProteinPrefix, StringComparison.Ordinal))
            {
                continue;
            }

            if (match.QValue > qValueThreshold)
            {
                continue;
            }

            foreach (var protein in match.Protein.Split(';'))
            {
                // Strip the trailing "(pre=..." annotation. If it is missing, keep the whole entry
                // rather than failing with an ArgumentOutOfRangeException from Substring.
                var annotationStart = protein.LastIndexOf("(pre=", StringComparison.Ordinal);
                var proteinName = annotationStart >= 0 ? protein.Substring(0, annotationStart) : protein;

                // Reuse the array returned by TryGetValue instead of looking the key up again.
                int[] countArr;
                if (!specCount.TryGetValue(proteinName, out countArr))
                {
                    countArr = new int[msgfResultFiles.Length];
                    specCount[proteinName] = countArr;
                }
                countArr[i]++;
            }
        }
    }

    // Writing
    const string databaseFilePath = dir + @"\database\iPRG2015.fasta";
    var database = new FastaDatabase(databaseFilePath);
    database.Read();

    const string outputFilePath = dir + @"\SpecCountAllProteins.tsv";
    using (var writer = new StreamWriter(outputFilePath))
    {
        // The column id is the text after the '_' of "_sample" up to (not including) the last '.'.
        var fileIds = msgfResultFiles.Select(f =>
        {
            var sampleIndex = f.IndexOf("_sample", StringComparison.Ordinal);
            return f.Substring(sampleIndex + 1, f.LastIndexOf('.') - sampleIndex - 1);
        });
        writer.WriteLine("Protein\tLength\t" + string.Join("\t", fileIds) + "\tSpikeIn");

        foreach (var entry in specCount)
        {
            var proteinId = entry.Key;
            var length = database.GetProteinLength(proteinId);
            Assert.True(length > 0);

            var counts = entry.Value;
            Assert.True(counts.Length == msgfResultFiles.Length);

            // Every "sp|" entry is flagged as spike-in.
            var spikeIn = proteinId.StartsWith("sp|", StringComparison.Ordinal) ? 1 : 0;

            writer.WriteLine("{0}\t{1}\t{2}\t{3}", proteinId, length, string.Join("\t", counts), spikeIn);
        }
    }
}
/// <summary>
/// Writes a tab-separated table describing the spiked-in peptides found in a search result file.
/// </summary>
/// <remarks>
/// A result row is used only when it is the first row of a run of rows sharing one scan number,
/// is valid, is not a decoy hit, has a peptide-level Q-value of at most 0.01, and its peptide
/// (after stripping the C+57.021, K+8.014 and R+10.008 annotations) has not been seen before.
/// For such a peptide that is listed in the spiked-in peptide file, the precursor XIC of the most
/// abundant isotope is extracted (10 ppm) and one output row is written.
/// The number of rows written is printed to the console.
/// </remarks>
/// <param name="resultFilePath">
/// Path of the result file to read. Its file name without extension is also used to locate the
/// matching .raw file under the hard-coded raw data folder.
/// </param>
/// <param name="outputFilePath">Path of the tab-separated file to create.</param>
public void GeneratePrmInfo(string resultFilePath, string outputFilePath)
{
    Console.Write("Processing {0}", Path.GetFileName(resultFilePath));
    Console.Out.Flush();

    var rawFilePath = @"D:\Research\Data\EDRN\DDA\raw\" + Path.GetFileNameWithoutExtension(resultFilePath) + ".raw";

    // NOTE(review): the reader is never closed or disposed in this method - confirm whether
    // XCaliburReader requires explicit cleanup.
    var reader = new XCaliburReader(rawFilePath);
    var run = InMemoryLcMsRun.GetLcMsRun(rawFilePath);
    var tolerance = new Tolerance(10, ToleranceUnit.Ppm);

    const string spikedInPeptideFile = @"D:\Research\Data\EDRN\SpikedPeptides.txt";
    var spikedInPepSet = new HashSet<string>(File.ReadAllLines(spikedInPeptideFile));

    const double qValueThreshold = 0.01;

    var pepSet = new HashSet<string>();
    MsGfPlusHeaderInformation headerInfo = null;

    var numPeptides = 0;
    var prevScanNum = -1;

    using (var writer = new StreamWriter(outputFilePath))
    {
        writer.WriteLine("Peptide\tCharge\tMonoMz\tMostAbundantMz\tMs2ScanNum\tRtMs2\tRtApex\tRtStart\tRtEnd\tSpecEValue\tPepQValue");

        foreach (var line in File.ReadLines(resultFilePath))
        {
            // Header rows start with '#'; ordinal comparison avoids culture-dependent matching.
            if (line.StartsWith("#", StringComparison.Ordinal))
            {
                headerInfo = new MsGfPlusHeaderInformation(line);
                continue;
            }

            var match = new MsGfMatch(line, headerInfo);

            // Only the first row of consecutive rows with the same scan number is considered.
            if (match.ScanNum == prevScanNum)
            {
                continue;
            }
            prevScanNum = match.ScanNum;

            if (!match.IsValid || match.Protein.StartsWith(FastaDatabase.DecoyProteinPrefix, StringComparison.Ordinal))
            {
                continue;
            }

            if (match.PepQValue > qValueThreshold)
            {
                continue;
            }

            var peptide = match.Peptide.Replace("C+57.021", "C").Replace("K+8.014", "K").Replace("R+10.008", "R");

            // Add returns false when the peptide was already recorded (single lookup instead of Contains + Add).
            // NOTE(review): the peptide is recorded before the XIC check below, so a peptide whose first
            // accepted match yields an empty XIC is never written - confirm this is intended.
            if (!pepSet.Add(peptide))
            {
                continue;
            }

            if (!spikedInPepSet.Contains(peptide))
            {
                continue;
            }

            var ion = new Ion(match.Formula, match.Charge);
            var mostAbundantIonMz = ion.GetMostAbundantIsotopeMz();
            var xic = run.GetPrecursorExtractedIonChromatogram(mostAbundantIonMz, tolerance, match.ScanNum);
            if (xic.Count == 0)
            {
                continue;
            }

            var minScan = xic.Min().ScanNum;
            var maxScan = xic.Max().ScanNum;

            writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}",
                peptide,
                match.Charge,
                ion.GetMonoIsotopicMz(),
                mostAbundantIonMz,
                match.ScanNum,
                reader.RtFromScanNum(match.ScanNum),
                reader.RtFromScanNum(xic.GetApexScanNum()), // Rt apex
                reader.RtFromScanNum(minScan),              // Rt start
                reader.RtFromScanNum(maxScan),              // Rt end
                match.SpecEValue,
                match.PepQValue);
            ++numPeptides;
        }
    }

    Console.WriteLine("\t{0}", numPeptides);
}