/// <summary>
/// Writes the matches of every LC scan in the run to a tab-separated file, one row per match.
/// </summary>
/// <remarks>
/// Scans are visited from <c>_run.MinLcScan</c> to <c>_run.MaxLcScan</c>; the matches of a scan
/// are written in the reverse of the set's sort order. A match without an ion, or for which
/// the scorer returns no scores, is reported on the console and skipped.
/// </remarks>
/// <param name="matches">Matches indexed by scan number; a null entry means the scan has no matches.</param>
/// <param name="outputFilePath">Path of the result file to create.</param>
/// <param name="database">Database used to look up protein name, description, length and position.</param>
private void WriteResultsToFile(SortedSet<DatabaseSequenceSpectrumMatch>[] matches, string outputFilePath, FastaDatabase database)
{
    using (var writer = new StreamWriter(outputFilePath))
    {
        writer.WriteLine("Scan\tPre\tSequence\tPost\tModifications\tComposition\tProteinName\tProteinDesc" +
                         "\tProteinLength\tStart\tEnd\tCharge\tMostAbundantIsotopeMz\tMass\t#MatchedFragments\tIcScore");

        for (var scanNum = _run.MinLcScan; scanNum <= _run.MaxLcScan; scanNum++)
        {
            if (matches[scanNum] == null)
            {
                continue;
            }

            foreach (var match in matches[scanNum].Reverse())
            {
                // The ion must be validated before it is handed to the scorer; without it
                // neither the scores nor the ion columns of the row can be produced.
                var ion = match.Ion;
                if (ion == null)
                {
                    Console.WriteLine(@"Null ion!");
                    continue;
                }

                var scores = _bottomUpScorer.GetScores(match, ion.Composition, ion.Charge, scanNum);
                if (scores == null)
                {
                    Console.WriteLine(@"Null scores");
                    continue;
                }

                var sequence = match.Sequence;
                var offset = match.Offset;

                // Start = position reported by the database + 1, shifted by the number of N-terminal cleavages
                var start = database.GetOneBasedPositionInProtein(offset) + 1 + match.NumNTermCleavages;
                var end = start + sequence.Length - 1;
                var proteinName = database.GetProteinName(offset);
                var protLength = database.GetProteinLength(proteinName);

                // Note for DblToString(value, 9, true), by having "9" and "true",
                // values between 100 and 999 Da will have 7 digits after the decimal place, and
                // values between 1000 and 9999 will have 6 digits after the decimal place
                writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}",
                    scanNum,                                                                // Scan
                    match.Pre,                                                              // Pre
                    sequence,                                                               // Sequence
                    match.Post,                                                             // Post
                    scores.Modifications,                                                   // Modifications
                    ion.Composition,                                                        // Composition
                    proteinName,                                                            // ProteinName
                    database.GetProteinDescription(offset),                                 // ProteinDesc
                    protLength,                                                             // ProteinLength
                    start,                                                                  // Start
                    end,                                                                    // End
                    ion.Charge,                                                             // Charge
                    StringUtilities.DblToString(ion.GetMostAbundantIsotopeMz(), 9, true),   // MostAbundantIsotopeMz
                    StringUtilities.DblToString(ion.Composition.Mass, 9, true),             // Mass
                    match.Score,                                                            // value written to the #MatchedFragments column
                    scores.Score                                                            // IcScore (re-scored)
                );
            }
        }
    }
}
// Prints one tab-separated line per peptide annotation enumerated from the indexed
// Short.fasta database: annotation, offset, protein name, protein length and the
// position reported by the database plus one.
// The test is ignored when the FASTA file is not reachable.
public void TestGettingProteinLengthAndPosition()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string fastaPath = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\Short.fasta";
    var fastaAvailable = File.Exists(fastaPath);
    if (!fastaAvailable)
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaPath);
    }

    var fastaDb = new FastaDatabase(fastaPath);
    fastaDb.Read();

    var searchableDb = new IndexedDatabase(fastaDb);
    var peptides = searchableDb.AnnotationsAndOffsets(6, 20, 2, 0, Enzyme.Trypsin);

    foreach (var peptide in peptides)
    {
        var label = peptide.Annotation;
        var position = peptide.Offset;

        // Columns are evaluated left to right, exactly as they are printed
        var columns = new object[]
        {
            label,
            position,
            fastaDb.GetProteinName(position),
            fastaDb.GetProteinLength(fastaDb.GetProteinName(position)),
            fastaDb.GetOneBasedPositionInProtein(position) + 1
        };

        Console.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", columns);
    }
}
// Resolves Short.fasta through Utils.GetTestFile, indexes it, and prints one tab-separated
// line per enumerated peptide annotation: annotation, offset, protein name, protein length
// and the position reported by the database plus one.
public void TestGettingProteinLengthAndPosition()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    var relativeFastaPath = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"MSPathFinderT\Short.fasta");
    var fastaFile = Utils.GetTestFile(methodName, relativeFastaPath);

    var fastaDb = new FastaDatabase(fastaFile.FullName);
    fastaDb.Read();

    var searchableDb = new IndexedDatabase(fastaDb);
    var peptides = searchableDb.AnnotationsAndOffsets(6, 20, 2, 0, Enzyme.Trypsin);

    foreach (var peptide in peptides)
    {
        var label = peptide.Annotation;
        var position = peptide.Offset;

        // Columns are evaluated left to right, exactly as they are printed
        var columns = new object[]
        {
            label,
            position,
            fastaDb.GetProteinName(position),
            fastaDb.GetProteinLength(fastaDb.GetProteinName(position)),
            fastaDb.GetOneBasedPositionInProtein(position) + 1
        };

        Console.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", columns);
    }
}
/// <summary>
/// Writes the match of each MS2 scan to a tab-separated result file, one row per scan.
/// </summary>
/// <param name="matches">Matches indexed by scan number; scans whose entry is null are skipped.</param>
/// <param name="outputFilePath">Path of the result file to create.</param>
/// <param name="database">Database used to look up protein name, description, length, position and entry count.</param>
private void WriteResultsToFile(DatabaseSequenceSpectrumMatch[] matches, string outputFilePath, FastaDatabase database)
{
    const string rowFormat = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}\t{16}\t{17}";

    using (var writer = new StreamWriter(outputFilePath))
    {
        writer.WriteLine("Scan\tPre\tSequence\tPost\tModifications\tComposition\tProteinName\tProteinDesc" +
                         "\tProteinLength\tStart\tEnd\tCharge\tMostAbundantIsotopeMz\tMass\t#MatchedFragments\tProbability\tSpecEValue\tEValue");

        foreach (var scanNum in _ms2ScanNums)
        {
            var hit = matches[scanNum];
            if (hit == null)
            {
                continue;
            }

            var peptide = hit.Sequence;
            var dbOffset = hit.Offset;

            // Start = position reported by the database + 1, shifted by the number of N-terminal cleavages
            var firstResidue = database.GetOneBasedPositionInProtein(dbOffset) + 1 + hit.NumNTermCleavages;
            var lastResidue = firstResidue + peptide.Length - 1;

            var proteinName = database.GetProteinName(dbOffset);
            var proteinLength = database.GetProteinLength(proteinName);
            var precursor = hit.Ion;
            var proteinDescription = database.GetProteinDescription(dbOffset);
            var probability = CompositeScorer.GetProbability(hit.Score);

            // Note for DblToString(value, 9, true), by having "9" and "true",
            // values between 100 and 999 Da will have 7 digits after the decimal place, and
            // values between 1000 and 9999 will have 6 digits after the decimal place
            var columns = new object[]
            {
                scanNum,                                                                      // Scan
                hit.Pre,                                                                      // Pre
                peptide,                                                                      // Sequence
                hit.Post,                                                                     // Post
                hit.ModificationText,                                                         // Modifications
                precursor.Composition,                                                        // Composition
                proteinName,                                                                  // ProteinName
                proteinDescription,                                                           // ProteinDesc
                proteinLength,                                                                // ProteinLength
                firstResidue,                                                                 // Start position in protein
                lastResidue,                                                                  // End position in protein
                precursor.Charge,                                                             // Charge (precursor)
                StringUtilities.DblToString(precursor.GetMostAbundantIsotopeMz(), 9, true),   // MostAbundantIsotopeMz
                StringUtilities.DblToString(precursor.Composition.Mass, 9, true),             // Mass
                hit.NumMatchedFragments,                                                      // #MatchedFragments
                StringUtilities.DblToString(probability, 4),                                  // Probability

                // SpecEValue; will be displayed using scientific notation if the value is less than 0.001
                StringUtilities.DblToString(ExcelMinValue(hit.SpecEvalue), 6, true, 0.001),

                // EValue (SpecEValue multiplied by the number of database entries);
                // will be displayed using scientific notation if the value is less than 0.001
                StringUtilities.DblToString(ExcelMinValue(hit.SpecEvalue * database.GetNumEntries()), 6, true, 0.001)
            };

            writer.WriteLine(rowFormat, columns);
        }
    }
}
// Copies AMT_Peptides_NA.tsv to AMT_Peptides.tsv, inserting a "Length" column (looked up
// in the FASTA database by protein name) after the first two columns.
// Lines that do not split into exactly 14 whitespace-separated fields are dropped.
// The test is ignored when either input file is missing.
public void AddProteinLengths()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string databaseFilePath = @"H:\Research\IPRG2015\database\yeast6proteaprotein.fasta";
    var databaseAvailable = File.Exists(databaseFilePath);
    if (!databaseAvailable)
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, databaseFilePath);
    }

    var database = new FastaDatabase(databaseFilePath);
    database.Read();

    const string resultPath = @"H:\Research\IPRG2015\AMT_Peptides_NA.tsv";
    var resultAvailable = File.Exists(resultPath);
    if (!resultAvailable)
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultPath);
    }

    const string outputFilePath = @"H:\Research\IPRG2015\AMT_Peptides.tsv";
    using (var writer = new StreamWriter(outputFilePath))
    {
        // Split(null) splits on whitespace; only rows with exactly 14 fields are kept
        var rows = File.ReadLines(resultPath)
            .Select(text => text.Split(null))
            .Where(fields => fields.Length == 14);

        foreach (var fields in rows)
        {
            // Everything after the peptide and protein columns is passed through unchanged
            var trailingColumns = string.Join("\t", fields.Skip(2));
            var peptide = fields[0];

            if (peptide.Equals("Peptide"))
            {
                // Header row: emit the new header instead of looking up a protein
                writer.WriteLine("Peptide\tProtein\tLength\t{0}", trailingColumns);
            }
            else
            {
                var protein = fields[1];
                writer.WriteLine("{0}\t{1}\t{2}\t{3}", peptide, protein, database.GetProteinLength(protein), trailingColumns);
            }
        }
    }
}
// Counts, per protein and per MS-GF+ result file in the NTT1 folder, the spectra identified
// at or below the q-value threshold, and writes the table to SpecCountAllProteins.tsv
// together with each protein's length and a spike-in flag.
// The test is ignored when the working folder, the result folder or the FASTA database is missing.
public void GenerateAbrfSpecCountAllProteins()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dir = @"H:\Research\IPRG2015";
    if (!Directory.Exists(dir))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, dir);
    }

    const double qValueThreshold = 0.01;

    const string resultDir = dir + @"\NTT1";
    if (!Directory.Exists(resultDir))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, resultDir);
    }

    // Checked up front, like the other inputs, so a missing database skips the test
    // before any result file is parsed
    const string databaseFilePath = dir + @"\database\iPRG2015.fasta";
    if (!File.Exists(databaseFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, databaseFilePath);
    }

    var msgfResultFiles = Directory.GetFiles(resultDir, "*.tsv");
    var specCount = new Dictionary<string, int[]>();    // protein name => one count per result file

    for (var i = 0; i < msgfResultFiles.Length; i++)
    {
        var msgfResultFile = msgfResultFiles[i];

        MsGfPlusHeaderInformation headerInfo = null;
        var prevScanNum = -1;
        foreach (var line in File.ReadLines(msgfResultFile))
        {
            if (line.StartsWith("#"))
            {
                headerInfo = new MsGfPlusHeaderInformation(line);
                continue;
            }

            var match = new MsGfMatch(line, headerInfo);

            // Only the first row of a run of consecutive rows with the same scan number is considered
            if (match.ScanNum == prevScanNum)
            {
                continue;
            }
            prevScanNum = match.ScanNum;

            if (!match.IsValid || match.Protein.StartsWith(FastaDatabase.DecoyProteinPrefix))
            {
                continue;
            }
            if (match.QValue > qValueThreshold)
            {
                continue;
            }

            foreach (var protein in match.Protein.Split(';'))
            {
                // Strip the trailing "(pre=..." annotation; an entry without it is used as is
                // instead of failing on a negative Substring length
                var annotationStart = protein.LastIndexOf("(pre=", StringComparison.Ordinal);
                var proteinName = annotationStart >= 0 ? protein.Substring(0, annotationStart) : protein;

                int[] countArr;
                if (!specCount.TryGetValue(proteinName, out countArr))
                {
                    countArr = new int[msgfResultFiles.Length];
                    specCount[proteinName] = countArr;
                }
                countArr[i]++;
            }
        }
    }

    // Writing
    var database = new FastaDatabase(databaseFilePath);
    database.Read();

    const string outputFilePath = dir + @"\SpecCountAllProteins.tsv";
    using (var writer = new StreamWriter(outputFilePath))
    {
        // Column title per result file: the path text between the '_' of "_sample" and the last '.'
        var fileIds = msgfResultFiles.Select(f =>
        {
            var idStart = f.IndexOf("_sample", StringComparison.Ordinal) + 1;
            return f.Substring(idStart, f.LastIndexOf('.') - idStart);
        });
        writer.WriteLine("Protein\tLength\t" + string.Join("\t", fileIds) + "\tSpikeIn");

        foreach (var entry in specCount)
        {
            var proteinId = entry.Key;
            var length = database.GetProteinLength(proteinId);
            Assert.True(length > 0);

            var counts = entry.Value;
            Assert.True(counts.Length == msgfResultFiles.Length);

            // Every protein whose ID starts with "sp|" is flagged as a spike-in
            var spikeIn = proteinId.StartsWith("sp|") ? 1 : 0;

            writer.WriteLine("{0}\t{1}\t{2}\t{3}", proteinId, length, string.Join("\t", counts), spikeIn);
        }
    }
}
// Builds AMT_Peptides.tsv from AMT_Peptides_NA.tsv: for every row that is not a decoy ("XXX")
// or contaminant, writes the peptide (when the input has a Peptide column), the protein,
// its length from the FASTA database, the abundance of each job listed in Jobs.tsv, and a
// spike-in flag (1 when the protein name starts with "STANDARD").
// The test is ignored when an input file is missing and fails when a job has no matching column.
public void ProcessIprg2015PreStudy()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dir = @"H:\Research\IPRG2015";

    const string databaseFilePath = dir + @"\database\yeast6proteaprotein.fasta";
    if (!File.Exists(databaseFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, databaseFilePath);
    }

    var database = new FastaDatabase(databaseFilePath);
    database.Read();

    const string jobFilePath = dir + @"\Jobs.tsv";
    if (!File.Exists(jobFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, jobFilePath);
    }

    var jobParser = new TsvFileParser(jobFilePath);
    var jobs = jobParser.GetData("Jobs").Select(j => Convert.ToInt32(j)).ToArray();

    // The experiment label is the third '_'-separated token of the experiment name.
    // NOTE(review): assumes Jobs.tsv has one experiment per job, since both arrays
    // are indexed with the same counter below - confirm.
    var experiments = jobParser.GetData("Experiments").Select(e => e.Split('_')[2]).ToArray();

    //const string resultFilePath = dir + @"\AMT_Proteins_NA.tsv";
    //const string outputFilePath = dir + @"\AMT_Proteins.tsv";
    const string resultFilePath = dir + @"\AMT_Peptides_NA.tsv";
    const string outputFilePath = dir + @"\AMT_Peptides.tsv";

    // Guarded like the other inputs so a missing result file skips the test
    if (!File.Exists(resultFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
    }

    var parser = new TsvFileParser(resultFilePath);
    var headers = parser.GetHeaders();

    // Locate, for each job, the first column whose header contains the job number
    // (substring match, so a job number embedded in a longer number also matches).
    var jobColNum = new int[jobs.Length];
    for (var i = 0; i < jobs.Length; i++)
    {
        var jobText = jobs[i].ToString();

        jobColNum[i] = -1;
        for (var j = 0; j < headers.Count; j++)
        {
            if (headers[j].Contains(jobText))
            {
                jobColNum[i] = j;
                break;
            }
        }

        // Without this check an unmatched job would silently export column 0 as its abundances
        if (jobColNum[i] < 0)
        {
            Assert.Fail(string.Format("No column for job {0} found in {1}", jobs[i], resultFilePath));
        }
    }

    for (var i = 0; i < jobs.Length; i++)
    {
        Console.WriteLine("{0}\t{1}\t{2}", jobs[i], jobColNum[i], experiments[i]);
    }

    using (var writer = new StreamWriter(outputFilePath))
    {
        var peptides = parser.GetData("Peptide");     // Peptides; the null checks below cover an input without this column
        var proteins = parser.GetData("Reference");   // Proteins

        var abundances = new string[jobs.Length][];
        for (var i = 0; i < jobs.Length; i++)
        {
            abundances[i] = parser.GetData(headers[jobColNum[i]]).ToArray();
        }

        // Header row
        if (peptides != null)
        {
            writer.Write("Peptide\t");
        }
        writer.Write("Protein\tLength");
        for (var i = 0; i < jobs.Length; i++)
        {
            writer.Write("\t" + experiments[i]);
        }
        writer.WriteLine("\tSpikeIn");

        // Data rows
        for (var i = 0; i < proteins.Count; i++)
        {
            var protein = proteins[i];
            if (protein.StartsWith("XXX") || protein.StartsWith("Contaminant"))
            {
                continue;
            }

            var length = database.GetProteinLength(protein);

            if (peptides != null)
            {
                writer.Write(peptides[i] + "\t");
            }
            writer.Write(protein + "\t" + length);
            for (var j = 0; j < jobs.Length; j++)
            {
                writer.Write("\t" + abundances[j][i]);
            }
            writer.WriteLine("\t" + (protein.StartsWith("STANDARD") ? 1 : 0));
        }
    }
}