// Digests Short.fasta with Enzyme.Trypsin and prints one tab-separated line per peptide:
// annotation, offset, protein name, protein length and GetOneBasedPositionInProtein(offset) + 1.
// The test is ignored (not failed) when the FASTA file cannot be found.
public void TestGettingProteinLengthAndPosition()
{
    var testName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(testName);

    const string fastaPath = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\Short.fasta";
    if (!File.Exists(fastaPath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, fastaPath);
    }

    var fastaDb = new FastaDatabase(fastaPath);
    fastaDb.Read();

    // NOTE(review): the numeric arguments are presumably length bounds and cleavage settings -- confirm against IndexedDatabase.
    var peptides = new IndexedDatabase(fastaDb).AnnotationsAndOffsets(6, 20, 2, 0, Enzyme.Trypsin);
    foreach (var peptide in peptides)
    {
        var annotation = peptide.Annotation;
        var offset = peptide.Offset;

        // Everything about the parent protein is looked up from the peptide's offset.
        var proteinName = fastaDb.GetProteinName(offset);
        var proteinLength = fastaDb.GetProteinLength(proteinName);
        var position = fastaDb.GetOneBasedPositionInProtein(offset) + 1;

        Console.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", annotation, offset, proteinName, proteinLength, position);
    }
}
// Prints a target list (peptide, plain-string formula of the peptide composition plus H2O,
// protein name) for every tryptic peptide reported by the indexed SpikedInPeptides database,
// followed by the number of peptides listed. Ignored when the FASTA file cannot be found.
public void CreateTargetList()
{
    var testName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(testName);

    const string fastaPath = @"D:\Research\Data\IPRG2014\database\SpikedInPeptides.fasta";
    if (!File.Exists(fastaPath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, fastaPath);
    }

    var fastaDb = new FastaDatabase(fastaPath);
    fastaDb.Read();
    var peptideIndex = new IndexedDatabase(fastaDb);

    var targetCount = 0;
    var aminoAcids = new AminoAcidSet(Modification.Carbamidomethylation);

    Console.WriteLine("Peptide\tFormula\tProtein");
    foreach (var hit in peptideIndex.AnnotationsAndOffsets(6, 30, 1, 1, Enzyme.Trypsin))
    {
        // Strip the first two and last two characters of the annotation to get the bare peptide.
        var flanked = hit.Annotation;
        var peptide = flanked.Substring(2, flanked.Length - 4);
        var offset = hit.Offset;

        var formula = (aminoAcids.GetComposition(peptide) + Composition.H2O).ToPlainString();
        Console.WriteLine("{0}\t{1}\t{2}", peptide, formula, fastaDb.GetProteinName(offset));
        targetCount++;
    }

    Console.WriteLine("NumTargets: {0}", targetCount);
}
// Digests the Short.fasta test file with Enzyme.Trypsin and prints one tab-separated line per
// peptide: annotation, offset, protein name, protein length and
// GetOneBasedPositionInProtein(offset) + 1. The file is resolved through Utils.GetTestFile.
public void TestGettingProteinLengthAndPosition()
{
    var testName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(testName);

    var relativePath = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"MSPathFinderT\Short.fasta");
    var fastaFile = Utils.GetTestFile(testName, relativePath);

    var fastaDb = new FastaDatabase(fastaFile.FullName);
    fastaDb.Read();

    // NOTE(review): the numeric arguments are presumably length bounds and cleavage settings -- confirm against IndexedDatabase.
    var peptides = new IndexedDatabase(fastaDb).AnnotationsAndOffsets(6, 20, 2, 0, Enzyme.Trypsin);
    foreach (var peptide in peptides)
    {
        var annotation = peptide.Annotation;
        var offset = peptide.Offset;

        // Everything about the parent protein is looked up from the peptide's offset.
        var proteinName = fastaDb.GetProteinName(offset);
        var proteinLength = fastaDb.GetProteinLength(proteinName);
        var position = fastaDb.GetOneBasedPositionInProtein(offset) + 1;

        Console.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", annotation, offset, proteinName, proteinLength, position);
    }
}
// Builds an 11-bin histogram of (rounded rescaled mass - nominal mass) over every sequence
// returned by AnnotationsAndOffsetsNoEnzyme(300, 400), then prints the sequence count, the
// histogram (error, count, fraction) and the elapsed time.
public void TestNominalMassErrors()
{
    var testName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(testName);

    const int minLength = 300;
    const int maxLength = 400;

    var timer = new System.Diagnostics.Stopwatch();

    var relativePath = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"MSPathFinderT\ID_003962_71E1A1D4.fasta");
    var fastaFile = Utils.GetTestFile(testName, relativePath);
    var fastaDb = new FastaDatabase(fastaFile.FullName);
    fastaDb.Read();
    var sequenceIndex = new IndexedDatabase(fastaDb);

    var sequenceCount = 0L;
    timer.Start();

    var histogram = new long[11];
    var centerBin = histogram.Length / 2;   // bin that holds an error of exactly zero
    var lastBin = histogram.Length - 1;
    var aminoAcids = new AminoAcidSet();

    foreach (var entry in sequenceIndex.AnnotationsAndOffsetsNoEnzyme(minLength, maxLength))
    {
        ++sequenceCount;

        // Strip the first two and last two characters of the annotation to get the bare sequence.
        var flanked = entry.Annotation;
        var composition = aminoAcids.GetComposition(flanked.Substring(2, flanked.Length - 4));

        var exactMass = composition.Mass;
        var nominalMass = composition.NominalMass;
        var error = (int)Math.Round(exactMass * Constants.RescalingConstant) - nominalMass;

        // Errors beyond either end of the histogram are folded into the outermost bins.
        var bin = Math.Min(Math.Max(error + centerBin, 0), lastBin);
        histogram[bin]++;
    }

    Console.WriteLine("NumSequences: {0}", sequenceCount);
    for (var bin = 0; bin < histogram.Length; bin++)
    {
        var fraction = histogram[bin] / (double)sequenceCount;
        Console.WriteLine("{0}\t{1}\t{2}", bin - centerBin, histogram[bin], fraction);
    }

    timer.Stop();
    Console.WriteLine(@"Elapsed Time: {0:f4} sec", timer.Elapsed.TotalSeconds);
}
// Builds an 11-bin histogram of (rounded rescaled mass - nominal mass) over every sequence
// returned by AnnotationsAndOffsetsNoEnzyme(300, 400) for a hard-coded local FASTA file, then
// prints the sequence count, the histogram (error, count, fraction) and the elapsed time.
// NOTE(review): unlike sibling tests there is no missing-file guard here -- confirm whether one is wanted.
public void TestNominalMassErrors()
{
    const int minLength = 300;
    const int maxLength = 400;

    var timer = new System.Diagnostics.Stopwatch();

    const string fastaPath = @"C:\cygwin\home\kims336\Data\TopDownJia\database\ID_003962_71E1A1D4.fasta";

    var fastaDb = new FastaDatabase(fastaPath);
    fastaDb.Read();
    var sequenceIndex = new IndexedDatabase(fastaDb);

    var sequenceCount = 0L;
    timer.Start();

    var histogram = new long[11];
    var centerBin = histogram.Length / 2;   // bin that holds an error of exactly zero
    var lastBin = histogram.Length - 1;
    var aminoAcids = new AminoAcidSet();

    foreach (var entry in sequenceIndex.AnnotationsAndOffsetsNoEnzyme(minLength, maxLength))
    {
        ++sequenceCount;

        // Strip the first two and last two characters of the annotation to get the bare sequence.
        var flanked = entry.Annotation;
        var composition = aminoAcids.GetComposition(flanked.Substring(2, flanked.Length - 4));

        var exactMass = composition.Mass;
        var nominalMass = composition.NominalMass;
        var error = (int)Math.Round(exactMass * Constants.RescalingConstant) - nominalMass;

        // Errors beyond either end of the histogram are folded into the outermost bins.
        var bin = Math.Min(Math.Max(error + centerBin, 0), lastBin);
        histogram[bin]++;
    }

    Console.WriteLine("NumSequences: {0}", sequenceCount);
    for (var bin = 0; bin < histogram.Length; bin++)
    {
        var fraction = histogram[bin] / (double)sequenceCount;
        Console.WriteLine("{0}\t{1}\t{2}", bin - centerBin, histogram[bin], fraction);
    }

    timer.Stop();
    Console.WriteLine(@"Elapsed Time: {0:f4} sec", timer.Elapsed.TotalSeconds);
}
// For every file in the CompRef data folder whose name ends with "pbf", writes a merged
// "<dataset>_IcTda.tsv" into the MsPathFinderMerged folder: a fixed header row followed by
// whatever OutputMergedResult emits for the MsPathFinder result and then for the tag-match
// result of that dataset.
//
// The writer is held in a using block so the output file is flushed and closed even when
// parsing or merging throws (previously it was only closed on the success path).
public void TestTempCompRefLcMsFeatureAlign()
{
    const string dataFolder = @"D:\MassSpecFiles\CompRef";
    const string fastaFilePath = @"D:\MassSpecFiles\CompRef\db\ID_003278_4B4B3CB1.fasta";

    var fastaDb = new FastaDatabase(fastaFilePath);
    fastaDb.Read();

    var fileEntries = Directory.GetFiles(dataFolder);
    var dataset = (from fileName in fileEntries where fileName.EndsWith("pbf") select Path.GetFileNameWithoutExtension(fileName)).ToList();
    dataset.Sort();

    foreach (var datasetName in dataset)
    {
        var mergedPath = string.Format(@"D:\MassSpecFiles\CompRef\MsPathFinderMerged\{0}_IcTda.tsv", datasetName);
        using (var writer = new StreamWriter(mergedPath))
        {
            // Header row; terminated with "\n" exactly as before (not the platform newline).
            writer.Write("Scan\tSequence\tModifications\tMass\tProteinName\tProteinDesc\tStart\tEnd\t#MatchedFragments\tQValue\n");

            var path1 = string.Format(@"D:\MassSpecFiles\CompRef\MsPathFinder\{0}_IcTda.tsv", datasetName);
            var parser1 = new TsvFileParser(path1);
            OutputMergedResult(writer, parser1, fastaDb);

            var path2 = string.Format(@"D:\MassSpecFiles\CompRef\seqtag\{0}_tagmatch.tsv", datasetName);
            var parser2 = new TsvFileParser(path2);
            OutputMergedResult(writer, parser2, fastaDb);
        }
    }
}
// Copies AMT_Peptides_NA.tsv to AMT_Peptides.tsv, inserting a "Length" column after the first
// two columns. Only lines that split into exactly 14 whitespace-separated fields are kept; the
// line whose first field is "Peptide" is treated as the header. The test is ignored when either
// input file cannot be found.
public void AddProteinLengths()
{
    var testName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(testName);

    const string fastaPath = @"H:\Research\IPRG2015\database\yeast6proteaprotein.fasta";
    if (!File.Exists(fastaPath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, fastaPath);
    }

    var proteinDb = new FastaDatabase(fastaPath);
    proteinDb.Read();

    const string inputPath = @"H:\Research\IPRG2015\AMT_Peptides_NA.tsv";
    if (!File.Exists(inputPath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", testName, inputPath);
    }

    const string outputPath = @"H:\Research\IPRG2015\AMT_Peptides.tsv";
    using (var output = new StreamWriter(outputPath))
    {
        var rows = File.ReadLines(inputPath)
            .Select(row => row.Split(null))
            .Where(cells => cells.Length == 14);

        foreach (var cells in rows)
        {
            // Columns from the third onward are copied through unchanged.
            var remainder = string.Join("\t", cells.Skip(2));

            if (cells[0].Equals("Peptide"))
            {
                output.WriteLine("Peptide\tProtein\tLength\t{0}", remainder);
            }
            else
            {
                var protein = cells[1];
                var length = proteinDb.GetProteinLength(protein);
                output.WriteLine("{0}\t{1}\t{2}\t{3}", cells[0], protein, length, remainder);
            }
        }
    }
}
// Checks that SumAsParallel and SumDefault produce the same total over the characters of a
// FASTA database, then prints the average time per call of each over 100 repetitions.
//
// Fix: the reference value used to be computed with SumAsParallel as well, so the assertion
// compared the parallel sum with itself and could never fail. It is now computed with SumDefault.
public void TestSumParallel()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    var fastaFile = Utils.GetTestFile(methodName, Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"MSPathFinderT\ID_002216_235ACCEA.fasta"));

    var db = new FastaDatabase(fastaFile.FullName);
    db.Read();

    // Materialized once so that the timing loops below measure the summation only.
    var charArray = db.Characters().Select(c => (int)c).ToList();

    // Correctness: the sequential sum is the expected value, the parallel sum the actual one.
    var defaultSum = SumDefault(charArray);
    var parallelSum = SumAsParallel(charArray);

    Console.WriteLine("Default sum {0}", defaultSum);
    Console.WriteLine("Parallel sum {0}", parallelSum);
    Assert.AreEqual(defaultSum, parallelSum);

    // Timing.
    const int m = 100;

    var s1 = Stopwatch.StartNew();
    for (var i = 0; i < m; i++)
    {
        SumDefault(charArray);
    }
    s1.Stop();

    var s2 = Stopwatch.StartNew();
    for (var i = 0; i < m; i++)
    {
        SumAsParallel(charArray);
    }
    s2.Stop();

    Console.WriteLine("{0:F2} msec/sum, on average for default", s1.Elapsed.TotalMilliseconds / m);
    Console.WriteLine("{0:F2} msec/sum, on average for parallel", s2.Elapsed.TotalMilliseconds / m);
}
// Timing experiment: for each of (at most) the first 1000 database characters, adds up the
// integers from 0 to (character value * 10000) - 1 into one running total, then prints that
// total (under the label "NumPeptides") and the elapsed time.
public void TestSequenceEnumerationParallel2()
{
    var testName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(testName);

    var timer = new System.Diagnostics.Stopwatch();

    var relativePath = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"MSPathFinderT\ID_002216_235ACCEA.fasta");
    var fastaFile = Utils.GetTestFile(testName, relativePath);
    var fastaDb = new FastaDatabase(fastaFile.FullName);
    fastaDb.Read();

    // Constructed as in the original even though the instance is not used afterwards.
    var indexedDb = new IndexedDatabase(fastaDb);

    var residues = fastaDb.Characters().ToArray();

    timer.Start();

    var total = 0L;
    var residuesToProcess = Math.Min(residues.Length, 1000);
    for (var idx = 0; idx < residuesToProcess; idx++)
    {
        var upperBound = residues[idx] * 10000;
        for (var k = 0; k < upperBound; k++)
        {
            total += k;
        }
    }

    Console.WriteLine("NumPeptides: {0}", total);
    timer.Stop();

    Console.WriteLine(@"{0:f4} sec", timer.Elapsed.TotalSeconds);
}
// Prints the result of SumAsParallel and SumDefault over the characters of a FASTA database,
// then the average time per call of each (in nanoseconds) over 100 repetitions.
//
// Fixes:
//  * The character query is materialized with ToList(). Previously the deferred Select was
//    re-enumerated on every call, so the timings included re-walking the database characters.
//  * The trailing Console.Read() was removed; it can block an unattended test run waiting for input.
public void TestSumParallel()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_002216_235ACCEA.fasta";

    var db = new FastaDatabase(dbFile);
    db.Read();

    // Materialized once so that every call below sums the same in-memory list.
    var charArray = db.Characters().Select(c => (int)c).ToList();

    // Test methods.
    Console.WriteLine(SumAsParallel(charArray));
    Console.WriteLine(SumDefault(charArray));

    const int m = 100;

    var s1 = Stopwatch.StartNew();
    for (var i = 0; i < m; i++)
    {
        SumDefault(charArray);
    }
    s1.Stop();

    var s2 = Stopwatch.StartNew();
    for (var i = 0; i < m; i++)
    {
        SumAsParallel(charArray);
    }
    s2.Stop();

    // milliseconds * 1,000,000 = nanoseconds; divided by the repetition count.
    Console.WriteLine((s1.Elapsed.TotalMilliseconds * 1000000 / m).ToString("0.00 ns"));
    Console.WriteLine((s2.Elapsed.TotalMilliseconds * 1000000 / m).ToString("0.00 ns"));
}
} // true: target and decoy, false: target only, null: decoy only

// Quick protein identification experiment.
//
// 1. For every mass bin between minFragmentMass and maxFragmentMass a BitArray indexed by scan
//    number is built; a bit is set when a deconvoluted peak of that MS2 scan falls (within the
//    tolerance) into the bin.
// 2. For every intact protein sequence, with and without one N-terminal cleavage, the fragment
//    node compositions (plus the Y ion offset) are binned and each MS2 scan is scored by the
//    number of compositions whose bin is set for that scan.
// 3. The best score and protein per MS2 scan are printed; a trailing apostrophe on the protein
//    name marks the N-terminally cleaved form.
//
// Fix: the result rows were written as scan, score, protein although the header announces
// "ScanNum, BestProtein, Score"; the rows now follow the header order.
public void QuickId()
{
    const string rawFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw";
    const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
    const string modFilePath = @"H:\Research\QCShew_TopDown\Production\Mods.txt";
    const int numBits = 29; // max error: 4ppm
    const int minCharge = 1;
    const int maxCharge = 20;
    var tolerance = new Tolerance(10);
    const double corrThreshold = 0.7;

    var comparer = new MzComparerWithBinning(numBits);

    const double minFragmentMass = 200.0;
    const double maxFragmentMass = 50000.0;
    var minFragMassBin = comparer.GetBinNumber(minFragmentMass);
    var maxFragMassBin = comparer.GetBinNumber(maxFragmentMass);

    var aminoAcidSet = new AminoAcidSet(modFilePath);

    var run = PbfLcMsRun.GetLcMsRun(rawFilePath);
    var ms2ScanNumArr = run.GetScanNumbers(2).ToArray();

    var sw = new Stopwatch();

    sw.Start();
    Console.Write("Building Spectrum Arrays...");

    // One bit vector per mass bin; each vector is indexed by scan number.
    var massVectors = new BitArray[maxFragMassBin - minFragMassBin + 1];
    for (var i = minFragMassBin; i <= maxFragMassBin; i++)
    {
        massVectors[i - minFragMassBin] = new BitArray(run.MaxLcScan + 1);
    }

    foreach (var ms2ScanNum in ms2ScanNumArr)
    {
        var productSpec = run.GetSpectrum(ms2ScanNum) as ProductSpectrum;
        if (productSpec == null)
        {
            continue;
        }

        var deconvolutedPeaks = Deconvoluter.GetDeconvolutedPeaks(productSpec.Peaks, minCharge, maxCharge, 2, 1.1, tolerance, corrThreshold);
        if (deconvolutedPeaks == null)
        {
            continue;
        }

        foreach (var p in deconvolutedPeaks)
        {
            var mass = p.Mass;
            var deltaMass = tolerance.GetToleranceAsDa(mass, 1);
            var minMass = mass - deltaMass;
            var maxMass = mass + deltaMass;

            var minBinNum = comparer.GetBinNumber(minMass);
            var maxBinNum = comparer.GetBinNumber(maxMass);

            // Mark every bin touched by the tolerance window, ignoring bins outside the indexed range.
            for (var binNum = minBinNum; binNum <= maxBinNum; binNum++)
            {
                if (binNum >= minFragMassBin && binNum <= maxFragMassBin)
                {
                    massVectors[binNum - minFragMassBin][ms2ScanNum] = true;
                }
            }
        }
    }
    sw.Stop();
    Console.WriteLine(@"{0:f1} sec.", sw.Elapsed.TotalSeconds);

    sw.Reset();
    sw.Start();
    var fastaDb = new FastaDatabase(fastaFilePath);
    fastaDb.Read();
    var indexedDb = new IndexedDatabase(fastaDb);
    var numProteins = 0;
    var intactProteinAnnotationAndOffsets = indexedDb.IntactSequenceAnnotationsAndOffsets(0, int.MaxValue);

    var bestProtein = new string[run.MaxLcScan + 1];
    var bestScore = new int[run.MaxLcScan + 1];
    foreach (var annotationAndOffset in intactProteinAnnotationAndOffsets)
    {
        if (++numProteins % 10 == 0)
        {
            Console.WriteLine(@"Processing, {0} proteins done, {1:f1} sec elapsed", numProteins, sw.Elapsed.TotalSeconds);
        }

        var annotation = annotationAndOffset.Annotation;
        var offset = annotationAndOffset.Offset;

        // Strip the first two and last two characters of the annotation to get the bare sequence.
        var protSequence = annotation.Substring(2, annotation.Length - 4);

        // suffix
        var seqGraph = SequenceGraph.CreateGraph(aminoAcidSet, AminoAcid.ProteinNTerm, protSequence, AminoAcid.ProteinCTerm);
        if (seqGraph == null)
        {
            continue;
        }

        for (var numNTermCleavage = 0; numNTermCleavage <= 1; numNTermCleavage++)
        {
            if (numNTermCleavage > 0)
            {
                seqGraph.CleaveNTerm();
            }

            var allCompositions = seqGraph.GetAllFragmentNodeCompositions();

            var scoreArr = new int[run.MaxLcScan + 1];
            foreach (var fragComp in allCompositions)
            {
                var suffixMass = fragComp.Mass + BaseIonType.Y.OffsetComposition.Mass;
                var binNum = comparer.GetBinNumber(suffixMass);
                if (binNum < minFragMassBin || binNum > maxFragMassBin)
                {
                    continue;
                }

                var vector = massVectors[binNum - minFragMassBin];
                foreach (var ms2ScanNum in ms2ScanNumArr)
                {
                    if (vector[ms2ScanNum])
                    {
                        ++scoreArr[ms2ScanNum];
                    }
                }
            }

            foreach (var ms2ScanNum in ms2ScanNumArr)
            {
                if (scoreArr[ms2ScanNum] > bestScore[ms2ScanNum])
                {
                    bestScore[ms2ScanNum] = scoreArr[ms2ScanNum];
                    var proteinName = fastaDb.GetProteinName(offset);
                    bestProtein[ms2ScanNum] = proteinName + (numNTermCleavage == 1 ? "'" : "");
                }
            }
        }
        // prefix
    }

    Console.WriteLine("ScanNum\tBestProtein\tScore");
    foreach (var ms2ScanNum in ms2ScanNumArr)
    {
        // Column order matches the header: scan, protein (empty when no match), score.
        Console.WriteLine("{0}\t{1}\t{2}", ms2ScanNum, bestProtein[ms2ScanNum] ?? "", bestScore[ms2ScanNum]);
    }
}
// Runs the database search for SpecFilePath against DatabaseFilePath and prints the elapsed
// time of each stage to the console.
//
// Stages, in order: load the run and create the bottom-up scorer; build the MS1 filter (which
// receives corrThreshold); deconvolute all product spectra; then, depending on which
// DatabaseSearchMode flags RunTargetDecoyAnalysis has, search and write the target results,
// search and write the decoy results, and compute FDR from the two result files.
//
// Returns false, with ErrorMessage set, when the FDR calculator reports an error; true otherwise.
public bool RunSearch(double corrThreshold)
{
    var sw = new Stopwatch();

    ErrorMessage = string.Empty;

    Console.Write(@"Reading raw file...");
    sw.Start();
    _run = InMemoryLcMsRun.GetLcMsRun(SpecFilePath, 1.4826, 1.4826);
    _bottomUpScorer = new InformedBottomUpScorer(_run, AminoAcidSet, MinProductIonCharge, MaxProductIonCharge, ProductIonTolerance);
    sw.Stop();
    // Elapsed seconds are derived from raw stopwatch ticks throughout this method.
    var sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
    Console.WriteLine(@"Elapsed Time: {0:f4} sec", sec);

    sw.Reset();
    Console.Write(@"Determining precursor masses...");
    sw.Start();
    var ms1Filter = new Ms1IsotopeAndChargeCorrFilter(_run, PrecursorIonTolerance, MinPrecursorIonCharge, MaxPrecursorIonCharge, 400, 5000, corrThreshold, 0, 0);
    // The stopwatch is read while still running here; it is reset immediately afterwards.
    sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
    Console.WriteLine(@"Elapsed Time: {0:f4} sec", sec);

    sw.Reset();
    Console.Write(@"Deconvoluting MS2 spectra...");
    sw.Start();
    _ms2ScorerFactory = new ProductScorerBasedOnDeconvolutedSpectra( _run, MinProductIonCharge, MaxProductIonCharge, new Tolerance(10), 0 );
    _ms2ScorerFactory.DeconvoluteAllProductSpectra();
    sw.Stop();
    sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
    Console.WriteLine(@"Elapsed Time: {0:f4} sec", sec);

    // Target database
    var targetDb = new FastaDatabase(DatabaseFilePath);

    // string dirName = OutputDir ?? Path.GetDirectoryName(SpecFilePath);

    // All three output files are placed in OutputDir and named after the spectrum file.
    var baseName = Path.GetFileNameWithoutExtension(SpecFilePath);
    var targetOutputFilePath = Path.Combine(OutputDir, baseName + TargetFileExtension);
    var decoyOutputFilePath = Path.Combine(OutputDir, baseName + DecoyFileExtension);
    var tdaOutputFilePath = Path.Combine(OutputDir, baseName + TdaFileExtension);

    if (RunTargetDecoyAnalysis.HasFlag(DatabaseSearchMode.Target))
    {
        sw.Reset();
        Console.Write(@"Reading the target database...");
        sw.Start();
        targetDb.Read();
        sw.Stop();
        sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
        Console.WriteLine(@"Elapsed Time: {0:f4} sec", sec);

        sw.Reset();
        Console.WriteLine(@"Searching the target database");
        sw.Start();
        var targetMatches = RunSearch(GetAnnotationsAndOffsets(targetDb), ms1Filter, false);
        sw.Stop();
        sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
        Console.WriteLine(@"Target database search elapsed time: {0:f4} sec", sec);

        sw.Reset();
        Console.Write(@"Rescoring and writing target results...");
        sw.Start();
        WriteResultsToFile(targetMatches, targetOutputFilePath, targetDb);
        sw.Stop();
        sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
        Console.WriteLine(@"Elapsed time: {0:f4} sec", sec);
    }

    if (RunTargetDecoyAnalysis.HasFlag(DatabaseSearchMode.Decoy))
    {
        // Decoy database
        // NOTE(review): targetDb.Read() is only called inside the target branch above; confirm that
        // targetDb.Decoy(...) does not require the target database to have been read first.
        sw.Reset();
        Console.Write(@"Reading the decoy database...");
        sw.Start();
        var decoyDb = targetDb.Decoy(Enzyme);
        decoyDb.Read();
        // The stopwatch is read while still running here; it is reset immediately afterwards.
        sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
        Console.WriteLine(@"Elapsed Time: {0:f4} sec", sec);

        sw.Reset();
        Console.WriteLine(@"Searching the decoy database");
        sw.Start();
        var decoyMatches = RunSearch(GetAnnotationsAndOffsets(decoyDb), ms1Filter, true);
        sw.Stop();
        sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
        Console.WriteLine(@"Decoy database search elapsed Time: {0:f4} sec", sec);

        sw.Reset();
        Console.Write(@"Rescoring and writing decoy results...");
        sw.Start();
        WriteResultsToFile(decoyMatches, decoyOutputFilePath, decoyDb);
        sw.Stop();
        sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
        Console.WriteLine(@"Elapsed time: {0:f4} sec", sec);
    }

    if (RunTargetDecoyAnalysis.HasFlag(DatabaseSearchMode.Both))
    {
        // Combine the target and decoy result files written above into the TDA output file.
        var fdrCalculator = new FdrCalculator(targetOutputFilePath, decoyOutputFilePath);
        if (fdrCalculator.HasError())
        {
            ErrorMessage = fdrCalculator.ErrorMessage;
            Console.WriteLine(@"Error computing FDR: " + fdrCalculator.ErrorMessage);
            return(false);
        }

        fdrCalculator.WriteTo(tdaOutputFilePath);
    }

    Console.WriteLine(@"Done");
    return(true);
}
// Counts, per protein and per MS-GF+ result file in the NTT1 folder, the matches that pass the
// filters below, and writes the table (protein, length, one count column per file, spike-in
// flag) to SpecCountAllProteins.tsv. The test is ignored when either folder is missing.
//
// A result line is counted when its scan number differs from the previous line's, the match is
// valid, its protein does not start with the decoy prefix, and its Q-value is at most 0.01.
public void GenerateAbrfSpecCountAllProteins()
{
    var testName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(testName);

    const string dir = @"H:\Research\IPRG2015";
    if (!Directory.Exists(dir))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", testName, dir);
    }

    const double qValueThreshold = 0.01;

    const string resultDir = dir + @"\NTT1";
    if (!Directory.Exists(resultDir))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", testName, resultDir);
    }

    var resultFiles = Directory.GetFiles(resultDir, "*.tsv");

    // protein name => spectral count per result file (indexed like resultFiles)
    var countsByProtein = new Dictionary<string, int[]>();

    for (var fileIndex = 0; fileIndex < resultFiles.Length; fileIndex++)
    {
        MsGfPlusHeaderInformation header = null;
        var lastScan = -1;

        foreach (var row in File.ReadLines(resultFiles[fileIndex]))
        {
            if (row.StartsWith("#"))
            {
                header = new MsGfPlusHeaderInformation(row);
                continue;
            }

            var psm = new MsGfMatch(row, header);

            // Skip rows that repeat the scan number of the row before them.
            if (psm.ScanNum == lastScan)
            {
                continue;
            }
            lastScan = psm.ScanNum;

            if (!psm.IsValid
                || psm.Protein.StartsWith(FastaDatabase.DecoyProteinPrefix)
                || psm.QValue > qValueThreshold)
            {
                continue;
            }

            foreach (var proteinEntry in psm.Protein.Split(';'))
            {
                // Keep only the part in front of the last "(pre=" marker.
                var proteinName = proteinEntry.Substring(0, proteinEntry.LastIndexOf("(pre=", StringComparison.Ordinal));

                int[] perFileCounts;
                if (!countsByProtein.TryGetValue(proteinName, out perFileCounts))
                {
                    perFileCounts = new int[resultFiles.Length];
                    countsByProtein[proteinName] = perFileCounts;
                }

                perFileCounts[fileIndex]++;
            }
        }
    }

    // Writing
    const string fastaPath = dir + @"\database\iPRG2015.fasta";
    var proteinDb = new FastaDatabase(fastaPath);
    proteinDb.Read();

    const string outputPath = dir + @"\SpecCountAllProteins.tsv";
    using (var output = new StreamWriter(outputPath))
    {
        // Column title per file: the text from just after the '_' of "_sample" up to the last '.'.
        var fileIds = resultFiles.Select(path =>
        {
            var marker = path.IndexOf("_sample", StringComparison.Ordinal);
            return path.Substring(marker + 1, path.LastIndexOf('.') - marker - 1);
        });

        output.WriteLine("Protein\tLength\t" + string.Join("\t", fileIds) + "\tSpikeIn");

        foreach (var pair in countsByProtein)
        {
            var proteinId = pair.Key;
            var length = proteinDb.GetProteinLength(proteinId);
            Assert.True(length > 0);

            var counts = pair.Value;
            Assert.True(counts.Length == resultFiles.Length);

            var spikeIn = proteinId.StartsWith("sp|") ? 1 : 0;

            output.WriteLine("{0}\t{1}\t{2}\t{3}", proteinId, length, string.Join("\t", counts), spikeIn);
        }
    }
}
// Rewrites AMT_Peptides_NA.tsv as AMT_Peptides.tsv: one row per entry of the "Reference"
// column (rows whose protein starts with "XXX" or "Contaminant" are dropped), with the protein
// length, one abundance column per job listed in Jobs.tsv (titled with the job's experiment
// name), and a SpikeIn flag that is 1 for proteins starting with "STANDARD".
//
// The test is ignored when any of its three input files is missing. The guard for the result
// file was added for consistency with the two guards the method already had; previously a
// missing result file reached the TSV parser instead of skipping the test.
public void ProcessIprg2015PreStudy()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dir = @"H:\Research\IPRG2015";

    const string databaseFilePath = dir + @"\database\yeast6proteaprotein.fasta";
    if (!File.Exists(databaseFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, databaseFilePath);
    }

    var database = new FastaDatabase(databaseFilePath);
    database.Read();

    const string jobFilePath = dir + @"\Jobs.tsv";
    if (!File.Exists(jobFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, jobFilePath);
    }

    var jobParser = new TsvFileParser(jobFilePath);
    var jobs = jobParser.GetData("Jobs").Select(j => Convert.ToInt32(j)).ToArray();
    // The experiment label is the third '_'-separated token of the "Experiments" value.
    var experiments = jobParser.GetData("Experiments").Select(e => e.Split('_')[2]).ToArray();

    const string resultFilePath = dir + @"\AMT_Peptides_NA.tsv";
    const string outputFilePath = dir + @"\AMT_Peptides.tsv";
    if (!File.Exists(resultFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
    }

    var parser = new TsvFileParser(resultFilePath);
    var headers = parser.GetHeaders();

    // For each job, the index of the first header that contains the job number.
    // NOTE(review): a job with no matching header keeps the default index 0 and therefore reads
    // the first column -- confirm whether that should be reported instead.
    var jobColNum = new int[jobs.Length];
    for (var i = 0; i < jobs.Length; i++)
    {
        for (var j = 0; j < headers.Count; j++)
        {
            if (headers[j].Contains("" + jobs[i]))
            {
                jobColNum[i] = j;
                break;
            }
        }
    }

    for (var i = 0; i < jobs.Length; i++)
    {
        Console.WriteLine("{0}\t{1}\t{2}", jobs[i], jobColNum[i], experiments[i]);
    }

    using (var writer = new StreamWriter(outputFilePath))
    {
        var peptides = parser.GetData("Peptide");   // Peptides
        var proteins = parser.GetData("Reference"); // Proteins

        var abundances = new string[jobs.Length][];
        for (var i = 0; i < jobs.Length; i++)
        {
            abundances[i] = parser.GetData(headers[jobColNum[i]]).ToArray();
        }

        // Header row; the Peptide column is only present when the input has one.
        if (peptides != null)
        {
            writer.Write("Peptide\t");
        }
        writer.Write("Protein\tLength");
        for (var i = 0; i < jobs.Length; i++)
        {
            writer.Write("\t" + experiments[i]);
        }
        writer.WriteLine("\tSpikeIn");

        for (var i = 0; i < proteins.Count; i++)
        {
            var protein = proteins[i];
            if (protein.StartsWith("XXX") || protein.StartsWith("Contaminant"))
            {
                continue;
            }

            var length = database.GetProteinLength(protein);

            if (peptides != null)
            {
                writer.Write(peptides[i] + "\t");
            }
            writer.Write(protein + "\t" + length);
            for (var j = 0; j < jobs.Length; j++)
            {
                writer.Write("\t" + abundances[j][i]);
            }
            writer.WriteLine("\t" + (protein.StartsWith("STANDARD") ? 1 : 0));
        }
    }
}
// Maps the short names listed in ProteinNames.txt back to the full protein names of the
// iPRG2015 target/decoy FASTA database and prints the full name for each line.
//
// The lookup key of a database entry is the text after its last '|', with "-DECOY" appended
// when the full name starts with "DECOY". Every full name must match the UniProt accession
// pattern. The test is ignored when either input file is missing and fails on the first listed
// name that cannot be resolved.
//
// Changes: removed an unused local, and the two bare assertions now carry a message naming the
// offending entry (Assert.Fail replaces Assert.IsTrue(false)).
public void GetProteinAccessions()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string uniprotAccession = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}";
    var uniProtPattern = new Regex(uniprotAccession);

    const string databaseFilePath = @"H:\Research\IPRG2015\Henry_results\iPRG2015.TargDecoy.fasta";
    if (!File.Exists(databaseFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, databaseFilePath);
    }

    var database = new FastaDatabase(databaseFilePath);
    database.Read();

    var nameToAccession = new Dictionary<string, string>();
    foreach (var proteinName in database.GetProteinNames())
    {
        var end = proteinName.LastIndexOf('|');
        var name = proteinName.Substring(end + 1);
        if (proteinName.StartsWith("DECOY"))
        {
            name = name + "-DECOY";
        }

        Assert.IsTrue(uniProtPattern.IsMatch(proteinName), "Protein name does not contain a UniProt accession: " + proteinName);
        nameToAccession.Add(name, proteinName);
    }

    const string resultPath = @"H:\Research\IPRG2015\Henry_results\ProteinNames.txt";
    if (!File.Exists(resultPath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultPath);
    }

    foreach (var line in File.ReadLines(resultPath))
    {
        if (line.Length == 0)
        {
            continue;
        }

        // Use the first whitespace-separated token, reduced to the text after its last '|'.
        var name = line.Split()[0];
        if (name.Contains('|'))
        {
            name = name.Substring(name.LastIndexOf('|') + 1);
        }

        string proteinName;
        if (nameToAccession.TryGetValue(name, out proteinName))
        {
            Console.WriteLine(proteinName);
        }
        else
        {
            Console.WriteLine(name);
            Assert.Fail("No database entry found for protein name: " + name);
        }
    }
}
// Runs the complete search for SpecFilePath against DatabaseFilePath, printing the elapsed time
// of each stage to the console and reporting status through progress when one is supplied.
//
// Stages, in order: read the run and record the isolation-window target m/z of each MS2 scan;
// choose the MS1 filter; deconvolute the MS2 spectra in parallel; read the target database and,
// when TagBasedSearch is set, generate sequence tags; then, depending on which
// DatabaseSearchMode flags RunTargetDecoyAnalysis has, run the target search, the decoy search,
// and the FDR calculation over the two result files.
//
// corrThreshold: not referenced anywhere in this method body.
// cancellationToken: passed to the ParallelOptions of the deconvolution loop (CancellationToken.None when null).
// progress: optional sink; when null, a Progress instance with no handler is passed to the callees.
//
// Returns false, with ErrorMessage set, when the FDR calculator reports an error; true otherwise.
public bool RunSearch(double corrThreshold = 0.7, CancellationToken?cancellationToken = null, IProgress <ProgressData> progress = null)
{
    // Get the Normalized spec file/folder path
    SpecFilePath = MassSpecDataReaderFactory.NormalizeDatasetPath(SpecFilePath);

    // prog is what gets handed to callees; when a caller-supplied sink exists, reports made to
    // prog are copied into progData, which then reports the percentage.
    var prog = new Progress <ProgressData>();
    var progData = new ProgressData(progress);
    if (progress != null)
    {
        prog = new Progress <ProgressData>(p =>
        {
            progData.Status = p.Status;
            progData.StatusInternal = p.StatusInternal;
            progData.Report(p.Percent);
        });
    }

    var sw = new Stopwatch();
    var swAll = new Stopwatch();
    swAll.Start();
    ErrorMessage = string.Empty;

    Console.Write(@"Reading raw file...");
    progData.Status = "Reading spectra file";
    progData.StepRange(10.0);
    sw.Start();

    _run = PbfLcMsRun.GetLcMsRun(SpecFilePath, 0, 0, prog);

    // Record the isolation-window target m/z of every MS2 scan, indexed by scan number.
    _ms2ScanNums = _run.GetScanNumbers(2).ToArray();
    _isolationWindowTargetMz = new double[_run.MaxLcScan + 1];
    foreach (var ms2Scan in _ms2ScanNums)
    {
        var ms2Spec = _run.GetSpectrum(ms2Scan) as ProductSpectrum;
        if (ms2Spec == null)
        {
            continue;
        }
        _isolationWindowTargetMz[ms2Scan] = ms2Spec.IsolationWindow.IsolationWindowTargetMz;
    }

    sw.Stop();
    Console.WriteLine(@"Elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);

    progData.StepRange(20.0);

    // Choose the MS1 filter: explicit scan numbers win; otherwise a feature file is used, either
    // the .ms1ft file next to the spectrum file (created with ProMex when absent) or FeatureFilePath.
    ISequenceFilter ms1Filter;
    if (this.ScanNumbers != null && this.ScanNumbers.Any())
    {
        // The stopwatch is not restarted in this branch, so the "Elapsed Time" line printed
        // after the if/else chain shows the value measured for reading the raw file.
        ms1Filter = new SelectedMsMsFilter(this.ScanNumbers);
    }
    else if (string.IsNullOrWhiteSpace(FeatureFilePath))
    {
        // Checks whether SpecFileName.ms1ft exists
        var ms1FtFilePath = MassSpecDataReaderFactory.ChangeExtension(SpecFilePath, LcMsFeatureFinderLauncher.FileExtension);
        if (!File.Exists(ms1FtFilePath))
        {
            Console.WriteLine(@"Running ProMex...");
            sw.Start();
            var param = new LcMsFeatureFinderInputParameter
            {
                InputPath = SpecFilePath,
                MinSearchMass = MinSequenceMass,
                MaxSearchMass = MaxSequenceMass,
                MinSearchCharge = MinPrecursorIonCharge,
                MaxSearchCharge = MaxPrecursorIonCharge,
                CsvOutput = false,
                ScoreReport = false,
                LikelihoodScoreThreshold = -10
            };
            var featureFinder = new LcMsFeatureFinderLauncher(param);
            featureFinder.Run();
        }
        sw.Reset();
        sw.Start();
        Console.Write(@"Reading ProMex results...");
        ms1Filter = new Ms1FtFilter(_run, PrecursorIonTolerance, ms1FtFilePath, -10);
    }
    else
    {
        sw.Reset();
        sw.Start();
        // The filter type is selected by the feature file's extension, compared in lower case.
        var extension = Path.GetExtension(FeatureFilePath);
        if (extension.ToLower().Equals(".csv"))
        {
            Console.Write(@"Reading ICR2LS/Decon2LS results...");
            ms1Filter = new IsosFilter(_run, PrecursorIonTolerance, FeatureFilePath);
        }
        else if (extension.ToLower().Equals(".ms1ft"))
        {
            Console.Write(@"Reading ProMex results...");
            ms1Filter = new Ms1FtFilter(_run, PrecursorIonTolerance, FeatureFilePath, -10);
        }
        else if (extension.ToLower().Equals(".msalign"))
        {
            Console.Write(@"Reading MS-Align+ results...");
            ms1Filter = new MsDeconvFilter(_run, PrecursorIonTolerance, FeatureFilePath);
        }
        else
        {
            // Unrecognized extension: the search continues with a null filter.
            ms1Filter = null; //new Ms1FeatureMatrix(_run);
        }
    }

    sw.Stop();
    Console.WriteLine(@"Elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);

    // pre-generate deconvoluted spectra for scoring
    _massBinComparer = new FilteredProteinMassBinning(AminoAcidSet, MaxSequenceMass + 1000);

    _ms2ScorerFactory2 = new CompositeScorerFactory(_run, _massBinComparer, AminoAcidSet, MinProductIonCharge, MaxProductIonCharge, ProductIonTolerance);
    sw.Reset();
    Console.WriteLine(@"Generating deconvoluted spectra for MS/MS spectra...");
    sw.Start();
    // Deconvolution runs in parallel, bounded by MaxNumThreads and observing the caller's token.
    var pfeOptions = new ParallelOptions
    {
        MaxDegreeOfParallelism = MaxNumThreads,
        CancellationToken = cancellationToken ?? CancellationToken.None
    };
    Parallel.ForEach(_ms2ScanNums, pfeOptions, ms2ScanNum =>
    {
        _ms2ScorerFactory2.DeconvonluteProductSpectrum(ms2ScanNum);
    });
    sw.Stop();
    Console.WriteLine(@"Elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);

    // NOTE(review): StepRange is called with 10.0 here after 20.0 above -- confirm the intended value.
    progData.StepRange(10.0);
    progData.Status = "Reading Fasta File";

    // Target database
    // NOTE(review): targetDb.Read() is called here and again inside the target-search branch
    // below -- confirm whether the second call is needed.
    var targetDb = new FastaDatabase(DatabaseFilePath);
    targetDb.Read();

    // Generate sequence tags for all MS/MS spectra
    if (TagBasedSearch)
    {
        progData.StepRange(25.0);
        progData.Status = "Generating Sequence Tags";
        sw.Reset();
        Console.WriteLine(@"Generating sequence tags for MS/MS spectra...");
        sw.Start();
        var seqTagGen = GetSequenceTagGenerator();
        _tagMs2ScanNum = seqTagGen.GetMs2ScanNumsContainingTags().ToArray();
        sw.Stop();
        Console.WriteLine(@"Elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);
        _tagSearchEngine = new ScanBasedTagSearchEngine(_run, seqTagGen, new LcMsPeakMatrix(_run, ms1Filter), targetDb, ProductIonTolerance, AminoAcidSet, _ms2ScorerFactory2, ScanBasedTagSearchEngine.DefaultMinMatchedTagLength, MaxSequenceMass, MinProductIonCharge, MaxProductIonCharge);
    }

    // All three output files are placed in OutputDir and named after the spectrum file.
    var specFileName = MassSpecDataReaderFactory.RemoveExtension(Path.GetFileName(SpecFilePath));
    var targetOutputFilePath = Path.Combine(OutputDir, specFileName + TargetFileNameEnding);
    var decoyOutputFilePath = Path.Combine(OutputDir, specFileName + DecoyFileNameEnding);
    var tdaOutputFilePath = Path.Combine(OutputDir, specFileName + TdaFileNameEnding);

    progData.StepRange(60.0);
    progData.Status = "Running Target search";

    if (RunTargetDecoyAnalysis.HasFlag(DatabaseSearchMode.Target))
    {
        sw.Reset();
        Console.Write(@"Reading the target database...");
        sw.Start();
        targetDb.Read();
        sw.Stop();
        Console.WriteLine(@"Elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);

        // One slot of matches per scan number.
        var targetMatches = new SortedSet <DatabaseSequenceSpectrumMatch> [_run.MaxLcScan + 1];

        progData.MaxPercentage = 42.5;
        if (TagBasedSearch)
        {
            sw.Reset();
            Console.WriteLine(@"Tag-based searching the target database");
            sw.Start();
            RunTagBasedSearch(targetMatches, targetDb, null, prog);
            Console.WriteLine(@"Target database tag-based search elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);
        }

        progData.MaxPercentage = 60.0;
        sw.Reset();
        Console.WriteLine(@"Searching the target database");
        sw.Start();
        RunSearch(targetMatches, targetDb, ms1Filter, null, prog);
        Console.WriteLine(@"Target database search elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);

        // calculate spectral e-value using generating function
        sw.Reset();
        Console.WriteLine(@"Calculating spectral E-values for target-spectrum matches");
        sw.Start();
        var bestTargetMatches = RunGeneratingFunction(targetMatches);
        WriteResultsToFile(bestTargetMatches, targetOutputFilePath, targetDb);
        sw.Stop();
        Console.WriteLine(@"Target-spectrum match E-value calculation elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);
    }

    progData.StepRange(95.0); // total to 95%
    progData.Status = "Running Decoy search";

    if (RunTargetDecoyAnalysis.HasFlag(DatabaseSearchMode.Decoy))
    {
        // Decoy database
        sw.Reset();
        sw.Start();
        var decoyDb = targetDb.Decoy(null, true);

        Console.Write(@"Reading the decoy database...");
        decoyDb.Read();
        Console.WriteLine(@"Elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);

        progData.MaxPercentage = 77.5;
        // One slot of matches per scan number.
        var decoyMatches = new SortedSet <DatabaseSequenceSpectrumMatch> [_run.MaxLcScan + 1];
        if (TagBasedSearch)
        {
            sw.Reset();
            Console.WriteLine(@"Tag-based searching the decoy database");
            sw.Start();
            RunTagBasedSearch(decoyMatches, decoyDb, null, prog);
            Console.WriteLine(@"Decoy database tag-based search elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);
        }

        progData.MaxPercentage = 95.0;
        sw.Reset();
        Console.WriteLine(@"Searching the decoy database");
        sw.Start();
        RunSearch(decoyMatches, decoyDb, ms1Filter, null, prog);
        Console.WriteLine(@"Decoy database search elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);

        // calculate spectral e-value using generating function
        sw.Reset();
        Console.WriteLine(@"Calculating spectral E-values for decoy-spectrum matches");
        sw.Start();
        var bestDecoyMatches = RunGeneratingFunction(decoyMatches);
        WriteResultsToFile(bestDecoyMatches, decoyOutputFilePath, decoyDb);
        sw.Stop();
        Console.WriteLine(@"Decoy-spectrum match E-value calculation elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);
    }

    progData.StepRange(100.0);
    progData.Status = "Writing combined results file";
    if (RunTargetDecoyAnalysis.HasFlag(DatabaseSearchMode.Both))
    {
        // Add "Qvalue" and "PepQValue"
        var fdrCalculator = new FdrCalculator(targetOutputFilePath, decoyOutputFilePath);
        if (fdrCalculator.HasError())
        {
            ErrorMessage = fdrCalculator.ErrorMessage;
            Console.WriteLine(@"Error computing FDR: " + fdrCalculator.ErrorMessage);
            return(false);
        }

        fdrCalculator.WriteTo(tdaOutputFilePath);
    }
    progData.Report(100.0);

    Console.WriteLine(@"Done.");
    swAll.Stop();
    Console.WriteLine(@"Total elapsed time for search: {0:f1} sec ({1:f2} min)", swAll.Elapsed.TotalSeconds, swAll.Elapsed.TotalMinutes);

    return(true);
}