/// <summary>
/// Creates a feature-based tag search engine and stores every collaborator it is given.
/// Two helpers are built here: an <c>Ms1FtFilter</c> (from the run, the tolerance and the
/// feature parser's Ms1Ft file name) and a <c>SearchableDatabase</c> (from the FASTA database).
/// </summary>
/// <param name="run">LC-MS run; stored and also handed to the MS1 feature filter</param>
/// <param name="featureParser">Feature parser; stored, and its <c>Ms1FtFileName</c> is passed to the MS1 feature filter</param>
/// <param name="ms2Scorer">MS2 scorer; stored as given</param>
/// <param name="tagParser">Sequence tag parser; stored as given</param>
/// <param name="fastaDb">FASTA database; stored and wrapped in a searchable database</param>
/// <param name="tolerance">Tolerance; stored and also handed to the MS1 feature filter</param>
/// <param name="aaSet">Amino acid set; stored as given</param>
/// <param name="maxSequenceMass">Maximum sequence mass (defaults to 50000.0)</param>
/// <param name="minProductIonCharge">Minimum product ion charge (defaults to 1)</param>
/// <param name="maxProductIonCharge">Maximum product ion charge (defaults to 20)</param>
public FeatureBasedTagSearchEngine(
    LcMsRun run,
    Ms1FtParser featureParser,
    ProductScorerBasedOnDeconvolutedSpectra ms2Scorer,
    SequenceTagParser tagParser,
    FastaDatabase fastaDb,
    Tolerance tolerance,
    AminoAcidSet aaSet,
    double maxSequenceMass = 50000.0,
    int minProductIonCharge = 1,
    int maxProductIonCharge = 20)
{
    // References supplied by the caller
    _run = run;
    _featureParser = featureParser;
    _ms2Scorer = ms2Scorer;
    _tagParser = tagParser;
    _fastaDb = fastaDb;
    _tolerance = tolerance;
    _aaSet = aaSet;

    // Numeric search limits
    _maxSequenceMass = maxSequenceMass;
    _minProductIonCharge = minProductIonCharge;
    _maxProductIonCharge = maxProductIonCharge;

    // Helpers built from the arguments: the feature filter first, then the searchable database
    _ms1FtFilter = new Ms1FtFilter(run, tolerance, featureParser.Ms1FtFileName);
    _searchableDb = new SearchableDatabase(fastaDb);
}
/// <summary>
/// Timing experiment: reads a FASTA database, then runs a CPU-bound summation driven by at most
/// the first 1000 database characters, and prints the accumulated sum and the elapsed time.
/// </summary>
public void TestSequenceEnumerationParallel2()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_002216_235ACCEA.fasta";
    var db = new FastaDatabase(dbFile);
    db.Read();

    // Constructed but not otherwise used by this method
    var indexedDb = new IndexedDatabase(db);

    var characters = db.Characters().ToArray();

    // Timing covers the summation and the first console write only
    var timer = System.Diagnostics.Stopwatch.StartNew();

    // Process 1000 characters, or all of them when the database holds fewer
    var limit = Math.Min(characters.Length, 1000);
    var sum = 0L;
    for (var index = 0; index < limit; index++)
    {
        // Artificial workload: the character value scales the inner iteration count
        var upperBound = characters[index] * 10000;
        for (var i = 0; i < upperBound; i++)
        {
            sum += i;
        }
    }

    Console.WriteLine("NumPeptides: {0}", sum);
    timer.Stop();

    Console.WriteLine(@"{0:f4} sec", timer.Elapsed.TotalSeconds);
}
/// <summary>
/// Enumerates annotations from an indexed FASTA database via
/// <c>AnnotationsAndOffsets(7, 40, 2, 2, Enzyme.Trypsin)</c>, builds a sequence graph for each one
/// in parallel, and prints the total elapsed time.
/// </summary>
private static void TestCountingPeptides()
{
    // Alternative databases that have been used with this test:
    //   ID_002166_F86E3B2F.fasta, ID_004208_295531A4.fasta
    const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_003456_9B916A8B.fasta";

    var aaSet = new AminoAcidSet();
    var timer = Stopwatch.StartNew();

    var db = new FastaDatabase(dbFile);
    var indexedDb = new IndexedDatabase(db);
    indexedDb.Read();

    var peptides = indexedDb.AnnotationsAndOffsets(7, 40, 2, 2, Enzyme.Trypsin);

    // The graph and offset are computed and then discarded; only the elapsed time is reported
    Parallel.ForEach(peptides, peptide =>
    {
        var annotation = peptide.Annotation;
        var offset = peptide.Offset;
        var graph = SequenceGraph.CreateGraph(aaSet, annotation);
    });

    timer.Stop();

    var sec = timer.ElapsedTicks / (double)Stopwatch.Frequency;
    Console.WriteLine(@"{0:f4} sec", sec);
}
/// <summary>
/// Searches a small FASTA database for a fixed pattern, asserts that it is found, and prints the
/// match position, all matched sequence indices, and the corresponding one-based protein positions.
/// The test is ignored when the database file is not available.
/// </summary>
public void TestSearching()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    var sw = new System.Diagnostics.Stopwatch();
    sw.Start();

    const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\Short.fasta";
    if (!File.Exists(dbFile))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFile);
    }

    var db = new FastaDatabase(dbFile);
    var searchableDb = new SearchableDatabase(db);

    // Alternative pattern used previously: "NSGSHFCGGSLINSQWVVSAAH"
    const string pattern = "FPTDDDDK";

    var position = searchableDb.Search(pattern);
    Assert.True(position >= 0);
    Console.WriteLine("Position: {0}", position);

    // Run the all-matches search once and reuse the result for both report lines
    var matchedIndices = searchableDb.FindAllMatchedSequenceIndices(pattern).ToList();
    Console.WriteLine("Matched indices: {0}", string.Join(",", matchedIndices));
    Console.WriteLine("Protein indices: {0}", string.Join(",", matchedIndices.Select(i => db.GetOneBasedPositionInProtein(i))));

    sw.Stop();

    Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
}
/// <summary>
/// Creates a scan-based tag search engine. All arguments are stored as given; the FASTA database is
/// wrapped in a <c>SearchableDatabase</c>, and the scan range is initialised to
/// <c>int.MinValue</c> .. <c>int.MaxValue</c>.
/// </summary>
/// <param name="run">LC-MS run</param>
/// <param name="seqTagFinder">Sequence tag finder</param>
/// <param name="featureFinder">Peak matrix used as the feature finder</param>
/// <param name="fastaDb">FASTA database; wrapped in a searchable database</param>
/// <param name="tolerance">Tolerance</param>
/// <param name="aaSet">Amino acid set</param>
/// <param name="ms2ScorerFactory">Optional MS2 scorer factory (defaults to null)</param>
/// <param name="minMatchedTagLength">Minimum matched tag length (defaults to <c>DefaultMinMatchedTagLength</c>)</param>
/// <param name="maxSequenceMass">Maximum sequence mass (defaults to 50000.0)</param>
/// <param name="minProductIonCharge">Minimum product ion charge (defaults to 1)</param>
/// <param name="maxProductIonCharge">Maximum product ion charge (defaults to 20)</param>
public ScanBasedTagSearchEngine(
    LcMsRun run,
    ISequenceTagFinder seqTagFinder,
    LcMsPeakMatrix featureFinder,
    FastaDatabase fastaDb,
    Tolerance tolerance,
    AminoAcidSet aaSet,
    CompositeScorerFactory ms2ScorerFactory = null,
    int minMatchedTagLength = DefaultMinMatchedTagLength,
    double maxSequenceMass = 50000.0,
    int minProductIonCharge = 1,
    int maxProductIonCharge = 20)
{
    // Data sources
    _run = run;
    _featureFinder = featureFinder;
    _searchableDb = new SearchableDatabase(fastaDb);

    // Search settings
    _aaSet = aaSet;
    _tolerance = tolerance;
    _maxSequenceMass = maxSequenceMass;
    _minMatchedTagLength = minMatchedTagLength;
    _maxProductIonCharge = maxProductIonCharge;
    _minProductIonCharge = minProductIonCharge;

    // Unrestricted scan range until the caller narrows it
    MinScan = int.MinValue;
    MaxScan = int.MaxValue;

    // Scoring and tag generation
    _seqTagFinder = seqTagFinder;
    _ms2ScorerFactory = ms2ScorerFactory;
}
/// <summary>
/// Wraps a FASTA database and makes sure its permuted-longest-common-prefix (PLCP) file exists and
/// passes the hash check against the FASTA file's last-write-time hash. The file is (re)generated,
/// with console progress output, when it is missing or fails that check.
/// </summary>
/// <param name="fastaDatabase">FASTA database to index</param>
public IndexedDatabase(FastaDatabase fastaDatabase)
{
    FastaDatabase = fastaDatabase;

    // The PLCP file sits next to the FASTA file, differing only in extension
    var databaseFilePath = FastaDatabase.GetFastaFilePath();
    _pLcpFilePath = Path.ChangeExtension(databaseFilePath, PermutedLongestCommonPrefixFileExtension);

    var lastWriteTimeHash = FastaDatabase.GetLastWriteTimeHash();

    // Nothing to do when the file exists and its stored hash checks out
    if (File.Exists(_pLcpFilePath) && FastaDatabase.CheckHashCodeBinaryFile(_pLcpFilePath, lastWriteTimeHash))
    {
        return;
    }

    Console.Write("Generating " + _pLcpFilePath + " ... ");
    CreatePermutedLongestCommonPrefixFile();
    Console.WriteLine("Done");
}
/// <summary>
/// Creates a container for proteoform-spectrum matches, with one slot per MS2 scan for every
/// modification count from 0 to <paramref name="maxModifications"/>.
/// </summary>
/// <param name="database">FASTA database the matches refer to</param>
/// <param name="ms2ScanVector">MS2 scan numbers; each scan number is mapped to its index in this array</param>
/// <param name="maxModifications">Largest modification count to allocate slots for</param>
/// <param name="maxNumMatchesPerSpectrum">Stored as <c>NumMatchesPerSpectrum</c></param>
/// <param name="minScore">Score cutoff (defaults to 4)</param>
/// <exception cref="ArgumentNullException"><paramref name="ms2ScanVector"/> is null</exception>
public ProteoformSpectrumMatchContainer(FastaDatabase database, int[] ms2ScanVector, int maxModifications, int maxNumMatchesPerSpectrum, int minScore = 4)
{
    if (ms2ScanVector == null)
    {
        throw new ArgumentNullException("ms2ScanVector");
    }

    Database = database;
    NumMatchesPerSpectrum = maxNumMatchesPerSpectrum;
    _scoreCutoff = minScore;
    Ms2ScanVector = ms2ScanVector;

    // Size the lookup table from the largest scan number rather than the last element, so an
    // unsorted scan vector cannot cause an out-of-range write below. An empty vector gets an empty table.
    var mapSize = ms2ScanVector.Length > 0 ? ms2ScanVector.Max() + 1 : 0;
    _ms2ScanToIndexMap = new int[mapSize];
    for (var i = 0; i < ms2ScanVector.Length; i++)
    {
        var scanNum = ms2ScanVector[i];
        _ms2ScanToIndexMap[scanNum] = i;
    }

    // One row per modification count, one column per MS2 scan; individual sets start out null
    _matchedSet = new SortedSet<DatabaseSequenceSpectrumMatch>[maxModifications + 1][];
    for (var i = 0; i <= maxModifications; i++)
    {
        _matchedSet[i] = new SortedSet<DatabaseSequenceSpectrumMatch>[ms2ScanVector.Length];
    }

    _checkedOutScanNumbers = new List<int>();
}
/// <summary>
/// Constructor - builds the index. The permuted-longest-common-prefix (PLCP) file that accompanies
/// the FASTA file is generated when it does not exist or when its hash check against the FASTA
/// file's last-write-time hash fails.
/// </summary>
/// <param name="fastaDatabase">FASTA database to index</param>
public IndexedDatabase(FastaDatabase fastaDatabase)
{
    FastaDatabase = fastaDatabase;

    // Same path as the FASTA file, with the PLCP extension swapped in
    _pLcpFilePath = Path.ChangeExtension(
        FastaDatabase.GetFastaFilePath(),
        PermutedLongestCommonPrefixFileExtension);

    var lastWriteTimeHash = FastaDatabase.GetLastWriteTimeHash();

    // The hash check is only attempted when the file is actually there
    var needsRebuild = !File.Exists(_pLcpFilePath)
                       || !FastaDatabase.CheckHashCodeBinaryFile(_pLcpFilePath, lastWriteTimeHash);

    if (needsRebuild)
    {
        Console.Write("Generating " + _pLcpFilePath + " ... ");
        CreatePermutedLongestCommonPrefixFile();
        Console.WriteLine("Done");
    }
}
/// <summary>
/// Runs a scan-based tag search on scan 4672 of a PBF file, generating sequence tags on the fly,
/// and prints every matched sequence with its tag start index, end index and mass.
/// The test is ignored as soon as one of the required input files turns out to be missing.
/// </summary>
public void TestSearchWithTagGeneration()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string rawFilePath = @"D:\MassSpecFiles\training\raw\QC_Shew_Intact_26Sep14_Bane_C2Column3.pbf";
    // Alternative FASTA used previously: D:\MassSpecFiles\60k\ID_004973_9BA6912F.fasta
    const string fastaFilePath = @"D:\MSPathFinder\Fasta\ID_002216_235ACCEA.fasta";
    const string modsFilePath = @"D:\MSPathFinder\Fasta\Mods.txt";

    // Marks the test as ignored when an input file is absent
    Action<string> skipIfMissing = path =>
    {
        if (!File.Exists(path))
        {
            Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, path);
        }
    };

    // Each input is checked immediately before it is first used
    skipIfMissing(rawFilePath);
    var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

    skipIfMissing(fastaFilePath);
    var fastaDb = new FastaDatabase(fastaFilePath);
    var tolerance = new Tolerance(10);

    skipIfMissing(modsFilePath);
    var aaSet = new AminoAcidSet(modsFilePath);

    var tagGenerator = new SequenceTagGenerator(run, new Tolerance(8));
    var peakMatrix = new LcMsPeakMatrix(run);
    var tagSearchEngine = new ScanBasedTagSearchEngine(run, tagGenerator, peakMatrix, fastaDb, tolerance, aaSet);

    foreach (var match in tagSearchEngine.RunSearch(4672))
    {
        var tagMatch = match.TagMatch;
        Console.Write(match.Sequence);
        Console.WriteLine("\t{0}\t{1}\t{2}", tagMatch.StartIndex, tagMatch.EndIndex, tagMatch.Mass);
    }
}
/// <summary>
/// Builds the search structures for a FASTA database: a suffix array over the database sequence
/// (via <c>SAIS.sufsort</c>), the longest-common-prefix (LCP) value of every pair of neighbouring
/// suffixes, and the left/right LCP arrays filled in by <c>InitializeLcps</c>.
/// </summary>
/// <param name="fastaDatabase">FASTA database to make searchable</param>
public SearchableDatabase(FastaDatabase fastaDatabase)
{
    FastaDatabase = fastaDatabase;
    _sequence = fastaDatabase.GetSequence();

    var length = _sequence.Length;

    _suffixArray = new int[length];
    SAIS.sufsort(_sequence, _suffixArray, length);

    // adjacentLcps[k] = LCP of the suffixes at suffix-array positions k-1 and k; entry 0 has no predecessor
    var adjacentLcps = new byte[length];
    adjacentLcps[0] = 0;

    var rank = 1;
    while (rank < length)
    {
        adjacentLcps[rank] = IndexedDatabase.GetLcp(_sequence, _suffixArray[rank - 1], _suffixArray[rank]);
        rank++;
    }

    _leftLcps = new byte[length];
    _rightLcps = new byte[length];
    InitializeLcps(adjacentLcps, _leftLcps, _rightLcps, 0, length - 1);
}
/// <summary>
/// Constructor. Sorts the suffixes of the database sequence, computes the longest-common-prefix
/// (LCP) value between each suffix and its predecessor in sorted order, and passes those values
/// to <c>InitializeLcps</c> to fill the left and right LCP arrays.
/// </summary>
/// <param name="fastaDatabase">FASTA database to make searchable</param>
public SearchableDatabase(FastaDatabase fastaDatabase)
{
    FastaDatabase = fastaDatabase;
    _sequence = fastaDatabase.GetSequence();

    // Suffix array over the whole sequence
    _suffixArray = new int[_sequence.Length];
    SAIS.sufsort(_sequence, _suffixArray, _sequence.Length);

    var suffixCount = _suffixArray.Length;

    // LCP between each suffix and the one just before it in sorted order
    var neighboringLcps = new byte[suffixCount];
    neighboringLcps[0] = 0;
    for (var current = 1; current < suffixCount; current++)
    {
        var previous = current - 1;
        neighboringLcps[current] = IndexedDatabase.GetLcp(_sequence, _suffixArray[previous], _suffixArray[current]);
    }

    // Left/right LCP arrays over the full index range [0, suffixCount - 1]
    _leftLcps = new byte[suffixCount];
    _rightLcps = new byte[suffixCount];
    InitializeLcps(neighboringLcps, _leftLcps, _rightLcps, 0, suffixCount - 1);
}
/// <summary>
/// Convenience overload that forwards every argument to the ten-parameter constructor,
/// passing <c>null</c> as its third argument (between the feature parser and the tag parser).
/// </summary>
/// <param name="run">LC-MS run</param>
/// <param name="featureParser">MS1 feature parser</param>
/// <param name="tagParser">Sequence tag parser</param>
/// <param name="fastaDb">FASTA database</param>
/// <param name="tolerance">Tolerance</param>
/// <param name="aaSet">Amino acid set</param>
/// <param name="maxSequenceMass">Maximum sequence mass (defaults to 50000.0)</param>
/// <param name="minProductIonCharge">Minimum product ion charge (defaults to 1)</param>
/// <param name="maxProductIonCharge">Maximum product ion charge (defaults to 20)</param>
public FeatureBasedTagSearchEngine(
    LcMsRun run,
    Ms1FtParser featureParser,
    SequenceTagParser tagParser,
    FastaDatabase fastaDb,
    Tolerance tolerance,
    AminoAcidSet aaSet,
    double maxSequenceMass = 50000.0,
    int minProductIonCharge = 1,
    int maxProductIonCharge = 20)
    : this(
        run,
        featureParser,
        null, // third argument of the forwarded-to constructor is left unset
        tagParser,
        fastaDb,
        tolerance,
        aaSet,
        maxSequenceMass,
        minProductIonCharge,
        maxProductIonCharge)
{
}
/// <summary>
/// Micro-benchmark comparing <c>SumDefault</c> and <c>SumAsParallel</c> over the characters of a
/// FASTA database (converted to <c>int</c>). Prints both sums once, then the average time per call
/// of each method over 100 calls, in nanoseconds.
/// </summary>
public void TestSumParallel()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_002216_235ACCEA.fasta";
    var db = new FastaDatabase(dbFile);
    db.Read();

    // Materialize once. A deferred query would re-enumerate the database on every call below,
    // so the timings would measure enumeration rather than the summation being compared.
    var charArray = db.Characters().Select(c => (int)c).ToArray();

    // Test methods.
    Console.WriteLine(SumAsParallel(charArray));
    Console.WriteLine(SumDefault(charArray));

    const int m = 100;

    var s1 = Stopwatch.StartNew();
    for (var i = 0; i < m; i++)
    {
        SumDefault(charArray);
    }
    s1.Stop();

    var s2 = Stopwatch.StartNew();
    for (var i = 0; i < m; i++)
    {
        SumAsParallel(charArray);
    }
    s2.Stop();

    // milliseconds * 1,000,000 = nanoseconds; divided by m gives the per-call average
    Console.WriteLine((s1.Elapsed.TotalMilliseconds * 1000000 / m).ToString("0.00 ns"));
    Console.WriteLine((s2.Elapsed.TotalMilliseconds * 1000000 / m).ToString("0.00 ns"));

    // No trailing Console.Read(): waiting for console input would stall an unattended test run.
}
/// <summary>
/// Prints every annotation returned by
/// <c>IntactSequenceAnnotationsAndOffsetsWithCTermCleavagesLargerThan(100, 300, 3)</c> for a small
/// FASTA database, followed by the elapsed time. Ignored when the database file is missing.
/// </summary>
public void TestEnumeratingProteins()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    // Timing starts before the file check and covers database loading and enumeration
    var timer = System.Diagnostics.Stopwatch.StartNew();

    const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\Short.fasta";
    var dbFileFound = File.Exists(dbFile);
    if (!dbFileFound)
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFile);
    }

    var indexedDb = new IndexedDatabase(new FastaDatabase(dbFile));

    var proteins = indexedDb.IntactSequenceAnnotationsAndOffsetsWithCTermCleavagesLargerThan(100, 300, 3);
    foreach (var protein in proteins)
    {
        Console.WriteLine(protein.Annotation);
    }

    timer.Stop();

    Console.WriteLine(@"{0:f4} sec", timer.Elapsed.TotalSeconds);
}
[TestCase(3, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_005133_8491EFA2.fasta", 323719193)] // 3MB
//[TestCase(6, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004530_B63BD900.fasta", 595227563)] // 6MB
//[TestCase(15, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004208_295531A4.fasta", 1882434687)] // 15MB
/// <summary>
/// Counts the annotations produced by <c>AnnotationsAndOffsetsNoEnzymeParallel(30, 250)</c> for the
/// given FASTA database, printing the time taken by each stage, and checks the count against
/// <paramref name="expected"/>.
/// </summary>
/// <param name="size">Not used by the method body; the test-case comments give it as the database size in MB</param>
/// <param name="dbFile">Path to the FASTA database</param>
/// <param name="expected">Expected number of enumerated annotations</param>
public void TestSequenceEnumeration(double size, string dbFile, int expected)
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName, dbFile);

    var sw = new System.Diagnostics.Stopwatch();
    sw.Start();

    var db = new FastaDatabase(dbFile);
    var indexedDb = new IndexedDatabase(db);

    var timeDB = sw.Elapsed;
    Console.WriteLine("Read DB in " + timeDB.TotalSeconds + " Seconds");

    var estimatedAnnOff = indexedDb.EstimateTotalPeptides(0, 30, 250);
    var timeEstimate = sw.Elapsed;
    Console.WriteLine("Read Estimate in " + (timeEstimate - timeDB).TotalSeconds + " Seconds");
    Console.WriteLine("Estimated results: " + estimatedAnnOff);

    var annotationsAndOffsets = indexedDb.AnnotationsAndOffsetsNoEnzymeParallel(30, 250);
    var timeGetAnn = sw.Elapsed;
    Console.WriteLine("Read Annotations in " + (timeGetAnn - timeEstimate).TotalSeconds + " Seconds");

    // Timing notes recorded with earlier runs of this test:
    //   original: 110, 109 (total) seconds
    //   parallelizing AnnotationsAndOffsetsNoEnzyme: 86 seconds
    //   parallelizing AnnotationsAndOffsetsNoEnzyme, yield returns: 79.6, 94, 60, 60 seconds
    //   3MB, Parallel2: 107

    // LongCount keeps the tally in a long; Count() returns int and throws OverflowException
    // once the number of annotations exceeds int.MaxValue.
    long numSequences = annotationsAndOffsets.LongCount();

    var timeParForEach = sw.Elapsed;
    Console.WriteLine("Parallel ForEach in " + (timeParForEach - timeGetAnn).TotalSeconds + " Seconds");

    Console.WriteLine("NumPeptides: {0}", numSequences);
    sw.Stop();

    Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
    Assert.AreEqual(expected, numSequences);
}
/// <summary>
/// Writes the matches of every LC scan (from <c>_run.MinLcScan</c> to <c>_run.MaxLcScan</c>) to a
/// tab-separated file, re-scoring each match with the bottom-up scorer. Matches of a scan are
/// written in reverse set order. A match whose ion or re-computed scores are null is reported on
/// the console and skipped.
/// </summary>
/// <param name="matches">Matches indexed by scan number; null entries are skipped</param>
/// <param name="outputFilePath">Path of the result file to create</param>
/// <param name="database">Database used to resolve protein names, descriptions, lengths and positions</param>
private void WriteResultsToFile(SortedSet<DatabaseSequenceSpectrumMatch>[] matches, string outputFilePath, FastaDatabase database)
{
    using (var writer = new StreamWriter(outputFilePath))
    {
        writer.WriteLine("Scan\tPre\tSequence\tPost\tModifications\tComposition\tProteinName\tProteinDesc" +
                         "\tProteinLength\tStart\tEnd\tCharge\tMostAbundantIsotopeMz\tMass\t#MatchedFragments\tIcScore");

        for (var scanNum = _run.MinLcScan; scanNum <= _run.MaxLcScan; scanNum++)
        {
            if (matches[scanNum] == null)
            {
                continue;
            }

            foreach (var match in matches[scanNum].Reverse())
            {
                // Validate the ion before it is used for re-scoring; it cannot be written without one
                var ion = match.Ion;
                if (ion == null)
                {
                    Console.WriteLine(@"Null ion!");
                    continue;
                }

                var scores = _bottomUpScorer.GetScores(match, ion.Composition, ion.Charge, scanNum);
                if (scores == null)
                {
                    Console.WriteLine(@"Null scores");
                    continue;
                }

                var sequence = match.Sequence;
                var offset = match.Offset;
                var start = database.GetOneBasedPositionInProtein(offset) + 1 + match.NumNTermCleavages;
                var end = start + sequence.Length - 1;
                var proteinName = database.GetProteinName(match.Offset);
                var protLength = database.GetProteinLength(proteinName);

                // Note for DblToString(value, 9, true), by having "9" and "true",
                // values between 100 and 999 Da will have 7 digits after the decimal place, and
                // values between 1000 and 9999 will have 6 digits after the decimal place
                writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}",
                    scanNum,
                    match.Pre,
                    sequence, // Sequence
                    match.Post,
                    scores.Modifications, // Modifications
                    ion.Composition, // Composition
                    proteinName, // ProteinName
                    database.GetProteinDescription(match.Offset), // ProteinDescription
                    protLength, // ProteinLength
                    start, // Start
                    end, // End
                    ion.Charge, // precursorCharge
                    StringUtilities.DblToString(ion.GetMostAbundantIsotopeMz(), 9, true), // MostAbundantIsotopeMz
                    StringUtilities.DblToString(ion.Composition.Mass, 9, true), // Mass
                    match.Score,
                    scores.Score // Score (re-scored)
                );
            }
        }
    }
}
/// <summary>
/// Indexes the given database and returns its annotations and offsets for sequences between
/// <c>MinSequenceLength</c> and <c>MaxSequenceLength</c>. When <c>NumTolerableTermini</c> is 0 the
/// no-enzyme enumeration is used; otherwise the enumeration that takes <c>Enzyme</c> is used.
/// </summary>
/// <param name="database">FASTA database to enumerate</param>
/// <returns>The annotations and offsets produced by the selected enumeration</returns>
private IEnumerable<AnnotationAndOffset> GetAnnotationsAndOffsets(FastaDatabase database)
{
    var indexedDbTarget = new IndexedDatabase(database);

    if (NumTolerableTermini == 0)
    {
        return indexedDbTarget.AnnotationsAndOffsetsNoEnzyme(MinSequenceLength, MaxSequenceLength);
    }

    return indexedDbTarget.AnnotationsAndOffsets(
        MinSequenceLength,
        MaxSequenceLength,
        NumTolerableTermini,
        2,
        Enzyme);
}
/// <summary>
/// Runs the complete search: loads the spectrum file, builds the MS1 filter, deconvolutes the MS2
/// spectra, then searches the target and/or decoy database (as selected by
/// <c>RunTargetDecoyAnalysis</c>) and writes the results. When the mode includes
/// <c>DatabaseSearchMode.Both</c>, an FDR-annotated result file is written as well.
/// Progress and per-stage timings are printed to the console.
/// </summary>
/// <param name="corrThreshold">Correlation threshold handed to the MS1 isotope/charge correlation filter</param>
/// <returns>False when the FDR calculation reports an error (see <c>ErrorMessage</c>); true otherwise</returns>
public bool RunSearch(double corrThreshold)
{
    var sw = new Stopwatch();

    // Seconds on the shared stopwatch at the moment of the call
    Func<double> elapsedSeconds = () => sw.ElapsedTicks / (double)Stopwatch.Frequency;

    ErrorMessage = string.Empty;

    Console.Write(@"Reading raw file...");
    sw.Start();
    _run = InMemoryLcMsRun.GetLcMsRun(SpecFilePath, 1.4826, 1.4826);
    _bottomUpScorer = new InformedBottomUpScorer(_run, AminoAcidSet, MinProductIonCharge, MaxProductIonCharge, ProductIonTolerance);
    sw.Stop();
    Console.WriteLine(@"Elapsed Time: {0:f4} sec", elapsedSeconds());

    Console.Write(@"Determining precursor masses...");
    sw.Restart();
    var ms1Filter = new Ms1IsotopeAndChargeCorrFilter(
        _run, PrecursorIonTolerance, MinPrecursorIonCharge, MaxPrecursorIonCharge,
        400, 5000, corrThreshold, 0, 0);
    Console.WriteLine(@"Elapsed Time: {0:f4} sec", elapsedSeconds());

    Console.Write(@"Deconvoluting MS2 spectra...");
    sw.Restart();
    _ms2ScorerFactory = new ProductScorerBasedOnDeconvolutedSpectra(
        _run,
        MinProductIonCharge,
        MaxProductIonCharge,
        new Tolerance(10),
        0);
    _ms2ScorerFactory.DeconvoluteAllProductSpectra();
    sw.Stop();
    Console.WriteLine(@"Elapsed Time: {0:f4} sec", elapsedSeconds());

    // Target database
    var targetDb = new FastaDatabase(DatabaseFilePath);

    // All result files go to OutputDir and are named after the spectrum file
    var baseName = Path.GetFileNameWithoutExtension(SpecFilePath);
    var targetOutputFilePath = Path.Combine(OutputDir, baseName + TargetFileExtension);
    var decoyOutputFilePath = Path.Combine(OutputDir, baseName + DecoyFileExtension);
    var tdaOutputFilePath = Path.Combine(OutputDir, baseName + TdaFileExtension);

    if (RunTargetDecoyAnalysis.HasFlag(DatabaseSearchMode.Target))
    {
        Console.Write(@"Reading the target database...");
        sw.Restart();
        targetDb.Read();
        sw.Stop();
        Console.WriteLine(@"Elapsed Time: {0:f4} sec", elapsedSeconds());

        Console.WriteLine(@"Searching the target database");
        sw.Restart();
        var targetMatches = RunSearch(GetAnnotationsAndOffsets(targetDb), ms1Filter, false);
        sw.Stop();
        Console.WriteLine(@"Target database search elapsed time: {0:f4} sec", elapsedSeconds());

        Console.Write(@"Rescoring and writing target results...");
        sw.Restart();
        WriteResultsToFile(targetMatches, targetOutputFilePath, targetDb);
        sw.Stop();
        Console.WriteLine(@"Elapsed time: {0:f4} sec", elapsedSeconds());
    }

    if (RunTargetDecoyAnalysis.HasFlag(DatabaseSearchMode.Decoy))
    {
        // Decoy database, derived from the target database
        Console.Write(@"Reading the decoy database...");
        sw.Restart();
        var decoyDb = targetDb.Decoy(Enzyme);
        decoyDb.Read();
        Console.WriteLine(@"Elapsed Time: {0:f4} sec", elapsedSeconds());

        Console.WriteLine(@"Searching the decoy database");
        sw.Restart();
        var decoyMatches = RunSearch(GetAnnotationsAndOffsets(decoyDb), ms1Filter, true);
        sw.Stop();
        Console.WriteLine(@"Decoy database search elapsed Time: {0:f4} sec", elapsedSeconds());

        Console.Write(@"Rescoring and writing decoy results...");
        sw.Restart();
        WriteResultsToFile(decoyMatches, decoyOutputFilePath, decoyDb);
        sw.Stop();
        Console.WriteLine(@"Elapsed time: {0:f4} sec", elapsedSeconds());
    }

    if (RunTargetDecoyAnalysis.HasFlag(DatabaseSearchMode.Both))
    {
        // Combine the target and decoy result files into the FDR-annotated file
        var fdrCalculator = new FdrCalculator(targetOutputFilePath, decoyOutputFilePath);
        if (fdrCalculator.HasError())
        {
            ErrorMessage = fdrCalculator.ErrorMessage;
            Console.WriteLine(@"Error computing FDR: " + fdrCalculator.ErrorMessage);
            return false;
        }

        fdrCalculator.WriteTo(tdaOutputFilePath);
    }

    Console.WriteLine(@"Done");
    return true;
}
/// <summary>
/// Searches every annotation of the database for matches, in parallel, adding the results to
/// <paramref name="matches"/>. Progress is reported through <paramref name="progress"/>, and the
/// number of collected candidate matches is printed when the loop finishes.
/// </summary>
/// <param name="matches">Per-scan match sets, passed to <c>SearchForMatches</c> for each annotation</param>
/// <param name="db">Database to search; its <c>IsDecoy</c> flag is passed along with each annotation</param>
/// <param name="sequenceFilter">Sequence filter passed to <c>SearchForMatches</c></param>
/// <param name="cancellationToken">Optional token; when cancellation is requested, remaining annotations are skipped</param>
/// <param name="progress">Optional progress sink</param>
private void RunSearch(SortedSet<DatabaseSequenceSpectrumMatch>[] matches, FastaDatabase db, ISequenceFilter sequenceFilter, CancellationToken? cancellationToken = null, IProgress<ProgressData> progress = null)
{
    var progData = new ProgressData(progress);
    progData.Status = "Searching for matches";

    long estimatedProteins;
    var annotationsAndOffsets = GetAnnotationsAndOffsets(db, out estimatedProteins, cancellationToken);
    Console.WriteLine(@"Estimated proteins: " + estimatedProteins);

    // Shared progress state, handed by reference to SearchProgressReport from every worker
    var numProteins = 0;
    var lastUpdate = DateTime.MinValue; // Forces the initial 0% update

    var sw = Stopwatch.StartNew();

    var pfeOptions = new ParallelOptions();
    pfeOptions.MaxDegreeOfParallelism = MaxNumThreads;
    pfeOptions.CancellationToken = cancellationToken ?? CancellationToken.None;

    // N-terminal cleavages are only considered when no internal cleavage is allowed
    var maxNumNTermCleavages = SearchMode == InternalCleavageType.NoInternalCleavage ? MaxNumNTermCleavages : 0;

    Parallel.ForEach(annotationsAndOffsets, pfeOptions, annotationAndOffset =>
    {
        var cancelled = cancellationToken.HasValue && cancellationToken.Value.IsCancellationRequested;
        if (cancelled)
        {
            return;
        }

        SearchProgressReport(ref numProteins, ref lastUpdate, estimatedProteins, sw, progData);
        SearchForMatches(annotationAndOffset, sequenceFilter, matches, maxNumNTermCleavages, db.IsDecoy, cancellationToken);
    });

    Console.WriteLine(@"Collected candidate matches: {0}", GetNumberOfMatches(matches));

    progData.StatusInternal = string.Empty;
    progData.Report(100.0);
}
/// <summary>
/// Reads a tab-separated sequence-tag file, looks up every tag (second column) in a searchable
/// FASTA database, and writes each three-column input line followed by the number of distinct
/// matching proteins and their comma-separated names. Lines that do not have exactly three columns
/// are dropped. The test is ignored when the FASTA file or the tag file is missing.
/// </summary>
public void FindProteins()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);

    const string tagFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3_seqtag.tsv";
    if (!File.Exists(tagFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
    }

    const string outputFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3_matchedtag.tsv";

    using (var writer = new StreamWriter(outputFilePath))
    {
        var isHeader = true;

        // Stream the input rather than loading the whole file; input and output are different files
        foreach (var line in File.ReadLines(tagFilePath))
        {
            if (isHeader)
            {
                isHeader = false;

                // Every data row appends two values (match count, protein names),
                // so the header needs two matching column names.
                writer.WriteLine(line + "\t" + "NumProteins" + "\t" + "Proteins");
                continue;
            }

            var token = line.Split('\t');
            if (token.Length != 3)
            {
                continue;
            }

            var tag = token[1];
            var matchedProteins =
                searchableDb.FindAllMatchedSequenceIndices(tag)
                    .Select(index => fastaDb.GetProteinName(index))
                    .Distinct().ToArray();
            var matchedProteinStr = string.Join(",", matchedProteins);
            writer.WriteLine("{0}\t{1}\t{2}", line, matchedProteins.Length, matchedProteinStr);
        }
    }

    Console.WriteLine(@"Done");
}
[TestCase(3, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_005133_8491EFA2.fasta", 323719193)] // 3MB
//[TestCase(6, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004530_B63BD900.fasta", 595227563)] // 6MB
//[TestCase(15, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004208_295531A4.fasta", 1882434687)] // 15MB
/// <summary>
/// Serial counterpart of the sequence enumeration test: counts the annotations produced by
/// <c>AnnotationsAndOffsetsNoEnzyme(30, 250)</c> for the given FASTA database, printing the time
/// taken by each stage, and checks the count against <paramref name="expected"/>.
/// </summary>
/// <param name="size">Not used by the method body; the test-case comments give it as the database size in MB</param>
/// <param name="dbFile">Path to the FASTA database</param>
/// <param name="expected">Expected number of enumerated annotations</param>
public void TestSequenceEnumerationSerial(double size, string dbFile, int expected)
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName, dbFile);

    var sw = new System.Diagnostics.Stopwatch();
    sw.Start();

    // Databases that have been used with this test:
    //   ID_002216_235ACCEA.fasta (1.5MB), ID_005133_8491EFA2.fasta (3MB),
    //   ID_004530_B63BD900.fasta (6MB), ID_004208_295531A4.fasta (15MB)
    var db = new FastaDatabase(dbFile);
    var indexedDb = new IndexedDatabase(db);
    indexedDb.Read();

    var timeDB = sw.Elapsed;
    Console.WriteLine("Read DB in " + timeDB.TotalSeconds + " Seconds");

    var estimatedAnnOff = indexedDb.EstimateTotalPeptides(0, 30, 250);
    var timeEstimate = sw.Elapsed;
    Console.WriteLine("Read Estimate in " + (timeEstimate - timeDB).TotalSeconds + " Seconds");
    Console.WriteLine("Estimated results: " + estimatedAnnOff);

    var annotationsAndOffsets = indexedDb.AnnotationsAndOffsetsNoEnzyme(30, 250);
    var timeGetAnn = sw.Elapsed;
    Console.WriteLine("Read Annotations in " + (timeGetAnn - timeEstimate).TotalSeconds + " Seconds");

    // LongCount keeps the tally in a long; Count() returns int and throws OverflowException
    // once the number of annotations exceeds int.MaxValue.
    long numSequences = annotationsAndOffsets.LongCount();

    var timeParForEach = sw.Elapsed;
    Console.WriteLine("Parallel ForEach in " + (timeParForEach - timeGetAnn).TotalSeconds + " Seconds");

    Console.WriteLine("NumPeptides: {0}", numSequences);
    sw.Stop();

    Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
    Assert.AreEqual(expected, numSequences);
}
/// <summary>
/// Scores every intact protein of a FASTA database against a raw file: for each protein (length 7
/// to 1000) and for 0 and 1 N-terminal cleavages, a <c>TopDownScorer</c> is created per sequence
/// composition and its score is printed. Protein and composition totals and the elapsed time are
/// printed at the end.
/// </summary>
public void TestTopDownScoringForAllXics()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    // Search parameters (values tried previously are noted alongside)
    const int numNTermCleavages = 1;    // 30
    const int minLength = 7;
    const int maxLength = 1000;
    const int numMaxModsPerProtein = 0; // 6
    var precursorTolerance = new Tolerance(10);

    // Other databases used previously: sprot.Ecoli.2012_07.icdecoy.KR.fasta,
    // H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta, ID_003558_56D73071.fasta
    const string dbFilePath = @"..\..\..\TestFiles\sprot.Ecoli.2012_07.fasta";
    const string specFilePath = @"C:\workspace\TopDown\E_coli_iscU_60_mock.raw";

    var sw = System.Diagnostics.Stopwatch.StartNew();
    Console.Write("Reading raw file...");
    var run = InMemoryLcMsRun.GetLcMsRun(specFilePath);
    sw.Stop();

    Console.WriteLine(@"Elapsed Time: {0:f4} sec", sw.Elapsed.TotalSeconds);

    // Configure amino acid set
    // NOTE(review): the variable is called "dehydro" but is built from Modification.PyroGluQ - confirm
    // which modification is intended.
    var dehydro = new SearchModification(Modification.PyroGluQ, 'C', SequenceLocation.Everywhere, false);
    var cysteinylC = new SearchModification(Modification.Cysteinyl, 'C', SequenceLocation.Everywhere, false);
    var glutathioneC = new SearchModification(Modification.Glutathione, 'C', SequenceLocation.Everywhere, false);

    var searchModifications = new List<SearchModification>();
    searchModifications.Add(dehydro);
    searchModifications.Add(cysteinylC);
    searchModifications.Add(glutathioneC);
    var aaSet = new AminoAcidSet(searchModifications, numMaxModsPerProtein);

    var targetDb = new FastaDatabase(dbFilePath);
    var indexedDb = new IndexedDatabase(targetDb);

    var numProteins = 0;
    long totalProtCompositions = 0;

    TopDownScorer.MaxCharge = 25;
    TopDownScorer.MinCharge = 8;

    sw.Restart();
    Console.WriteLine("Generating XICs...");

    foreach (var protein in indexedDb.IntactSequenceAnnotationsAndOffsets(minLength, maxLength))
    {
        numProteins++;

        if (numProteins % 1000 == 0)
        {
            Console.WriteLine("Processed {0} proteins", numProteins);
        }

        var seqGraph = SequenceGraph.CreateGraph(aaSet, protein.Annotation);
        if (seqGraph == null)
        {
            continue;
        }

        for (var cleavageCount = 0; cleavageCount <= numNTermCleavages; cleavageCount++)
        {
            // The first pass scores the intact sequence; each later pass cleaves once more from the N-terminus
            if (cleavageCount > 0)
            {
                seqGraph.CleaveNTerm();
            }

            foreach (var protComposition in seqGraph.GetSequenceCompositions())
            {
                totalProtCompositions++;

                var scorer = new TopDownScorer(protComposition, run, precursorTolerance, null);
                Console.WriteLine(scorer.GetScore());
            }
        }
    }

    sw.Stop();
    Console.WriteLine("NumProteins: {0}", numProteins);
    Console.WriteLine("NumProteinCompositions: {0}", totalProtCompositions);
    Console.WriteLine(@"Elapsed Time: {0:f4} sec", sw.Elapsed.TotalSeconds);
}
/// <summary>
/// Runs <c>TestTagBasedSearch</c> on the Lewy dataset (PBF run, a4_human FASTA database and a
/// modifications file). The test is ignored as soon as one of the required input files turns out
/// to be missing.
/// </summary>
public void TestTagBasedSearchForLewy()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string rawFilePath = @"D:\MassSpecFiles\Lewy\Lewy_AT_AD1_21May15_Bane_14-09-01RZ.pbf";
    const string fastaFilePath = @"D:\MassSpecFiles\Lewy\a4_human.fasta";
    const string modsFilePath = @"D:\MassSpecFiles\Lewy\Mods.txt";

    // Marks the test as ignored when an input file is absent
    Action<string> skipIfMissing = path =>
    {
        if (!File.Exists(path))
        {
            Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, path);
        }
    };

    skipIfMissing(rawFilePath);
    var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

    // Tag length and tag file path are computed but not consumed by the search call below
    const int minTagLength = 4;
    var tagFilePath = MassSpecDataReaderFactory.ChangeExtension(rawFilePath, ".seqtag");

    skipIfMissing(fastaFilePath);
    var fastaDb = new FastaDatabase(fastaFilePath);
    var tolerance = new Tolerance(10);

    skipIfMissing(modsFilePath);
    var aaSet = new AminoAcidSet(modsFilePath);

    TestTagBasedSearch(run, fastaDb, tolerance, aaSet);
}
/// <summary>
/// For each dataset in a fixed list, reads its .seqtag file and writes a .dmass file next to it.
/// Every tag of length 6 or more is located in the matching protein sequences, and the difference
/// between the detected flanking mass and the mass of the flanking residues is reported when it
/// lies in (-500, 2000).
/// The test is ignored when the data folder or the FASTA file is missing.
/// </summary>
public void FindProteinDeltaMass()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string folderPath = @"D:\MassSpecFiles\Glyco\";
    if (!Directory.Exists(folderPath))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, folderPath);
    }

    const string fastaFilePath = folderPath + "ID_003836_DA9CC1E4.fasta";
    // Same skip-on-missing-input convention as the other tests in this file.
    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    var fileSet = new string[]
    {
        "User_sample_test_02252015",
        "User_sample_test_MWCO_02262016",
        "User_sample_test_SEC_F3_03022105",
        "User_sample_test_SEC_F1_02272015",
        "User_sample_test_SEC_F2_02282015"
    };

    // The database does not depend on the dataset, so build it (and its search index) once.
    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);
    var standardAaSet = AminoAcidSet.GetStandardAminoAcidSet();

    foreach (var datasetName in fileSet)
    {
        var tagFilePath = folderPath + datasetName + ".seqtag";
        //var outputFilePath = folderPath + datasetName + ".matchedtag";
        var outputFilePath = folderPath + datasetName + ".dmass";

        using (var writer = new StreamWriter(outputFilePath))
        {
            var isHeader = true;
            var nReadSeqTag = 0;
            var nColumn = 0;

            Console.WriteLine(@"Reading {0} file", tagFilePath);

            // ReadLines streams the file instead of loading every line up front.
            foreach (var line in File.ReadLines(tagFilePath))
            {
                if (isHeader)
                {
                    // The header fixes the expected column count and is echoed with the extra columns appended.
                    isHeader = false;
                    nColumn = line.Split('\t').Length;
                    writer.WriteLine(line + "\t" + "Protein" + "\t" + "DetectedFlankingMass" + "\t" + "ExpectedFlankingMass" + "\t" + "DeltaMass");
                    continue;
                }

                var token = line.Split('\t');
                if (token.Length != nColumn) continue;

                var tag = token[1];
                if (tag.Length < 6) continue;

                var nTerminal = token[2].Equals("1");
                // Machine-written numeric column: parse independently of the current UI culture.
                var detectedFlankingMass = Double.Parse(token[3], System.Globalization.CultureInfo.InvariantCulture);
                if (!nTerminal) detectedFlankingMass -= Composition.H2O.Mass;

                nReadSeqTag++;

                var matchedProteins = searchableDb.FindAllMatchedSequenceIndices(tag)
                    .Select(index => fastaDb.GetProteinName(index))
                    .Distinct()
                    .ToArray();

                foreach (var protName in matchedProteins)
                {
                    var seqStr = fastaDb.GetProteinSequence(protName);
                    var oriSeq = new Sequence(seqStr, standardAaSet);

                    var startIdx = 0;
                    while (true)
                    {
                        // Residue letters are plain characters: compare ordinally, not linguistically.
                        var idx = seqStr.IndexOf(tag, startIdx, StringComparison.Ordinal);
                        if (idx < 0) break; // no (further) occurrence

                        // N-terminal tags are tried with 0 and 1 leading residues removed; C-terminal tags only as-is.
                        var nClv = nTerminal ? 2 : 1;
                        for (var j = 0; j < nClv; j++)
                        {
                            var flankComposition = nTerminal
                                ? oriSeq.GetComposition(j, idx)
                                : oriSeq.GetComposition(idx + tag.Length, oriSeq.Count - j);

                            var massDiff = detectedFlankingMass - flankComposition.Mass;
                            if (massDiff > -500 && massDiff < 2000)
                            {
                                writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", line, protName, detectedFlankingMass, flankComposition.Mass, massDiff);
                            }

                            if (massDiff > 2000) break;
                        }

                        // Continue after this occurrence (overlapping occurrences are not visited).
                        startIdx = idx + tag.Length;
                    }
                }
            }

            Console.WriteLine(@"{0} seq tags are processed", nReadSeqTag);
        }

        Console.WriteLine(@"Done");
    }
}
/// <summary>
/// Matches every sequence tag (length 6 or more) of a .seqtag file against a FASTA database,
/// collects the set of scans matched per protein, and prints summary counts.
/// The test is ignored when the FASTA file or the tag file is missing.
/// </summary>
public void CountMatchedScansPerProtein()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const int minTagLength = 6;
    var proteinToScan = new Dictionary<string, HashSet<int>>();

    const string fastaFilePath = @"D:\Research\Data\CommonContaminants\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta";
    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);
    Console.WriteLine(@"Sequence length: {0}", fastaDb.GetSequence().Length);

    //const string tagFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3_seqtag.tsv";
    //const string tagFilePath = @"\\protoapps\UserData\Jungkap\Co_culture\23B_pellet_TD_3Feb14_Bane_PL011402.seqtag";
    const string tagFilePath = @"D:\MassSpecFiles\co_culture\23A_pellet_TD_3Feb14_Bane_PL011402.seqtag";
    if (!File.Exists(tagFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
    }

    var numMatchedPairs = 0;

    // The first line is the header; the remaining lines are streamed rather than loaded at once.
    foreach (var line in File.ReadLines(tagFilePath).Skip(1))
    {
        var token = line.Split('\t');

        // Only the scan (column 0) and tag (column 1) are read here. Rows with extra trailing
        // columns are accepted, matching the "< 3" check used by CountMatchedProteins.
        if (token.Length < 3) continue;

        var scan = Convert.ToInt32(token[0]);
        var tag = token[1];
        if (tag.Length < minTagLength) continue;

        foreach (var matchedProtein in searchableDb.FindAllMatchedSequenceIndices(tag)
            .Select(index => fastaDb.GetProteinName(index)))
        {
            // Counts every tag hit, including repeated hits of the same scan on the same protein.
            ++numMatchedPairs;

            HashSet<int> matchedScans;
            if (proteinToScan.TryGetValue(matchedProtein, out matchedScans))
            {
                matchedScans.Add(scan);
            }
            else
            {
                matchedScans = new HashSet<int> { scan };
                proteinToScan.Add(matchedProtein, matchedScans);
            }
        }
    }

    var numMatchedProteins = proteinToScan.Keys.Count;
    var numAllProteins = fastaDb.GetNumEntries();
    Console.WriteLine("NumAllProteins: {0}", numAllProteins);
    Console.WriteLine("NumMatchedProteins: {0}", numMatchedProteins);
    Console.WriteLine("AvgMatchedScansPerProtein: {0}", numMatchedPairs / (float)numAllProteins);
}
/// <summary>
/// Runs the tag-based search helper on every .pbf file found in the CompRef data folder,
/// in order of file name. The test is ignored when the folder, the modification file or
/// the FASTA file is missing.
/// </summary>
public void TestTagBasedSearchCompRef()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dataSetPath = @"D:\MassSpecFiles\CompRef";
    const string fastaFilePath = @"D:\MassSpecFiles\CompRef\ID_003278_4B4B3CB1.fasta";
    const string modsFilePath = @"D:\MassSpecFiles\CompRef\Mods.txt";

    if (!Directory.Exists(dataSetPath))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, dataSetPath);
    }

    if (!File.Exists(modsFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, modsFilePath);
    }

    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    // Select on the real extension with an ordinal comparison. A bare EndsWith("pbf") is
    // culture-sensitive and would also accept names that merely end in "pbf".
    // Keeping the full path avoids rebuilding it from the stripped name.
    var rawFiles = Directory.GetFiles(dataSetPath)
        .Where(path => string.Equals(Path.GetExtension(path), ".pbf", StringComparison.OrdinalIgnoreCase))
        .OrderBy(path => Path.GetFileNameWithoutExtension(path))
        .ToList();

    var fastaDb = new FastaDatabase(fastaFilePath);
    var tolerance = new Tolerance(10);
    var aaSet = new AminoAcidSet(modsFilePath);

    foreach (var rawFile in rawFiles)
    {
        var run = PbfLcMsRun.GetLcMsRun(rawFile);

        Console.WriteLine("-----------------{0}--------------------", rawFile);
        TestTagBasedSearch(run, fastaDb, tolerance, aaSet);
        Console.WriteLine("-----------------------------------------------------------------------");
    }
}
/// <summary>
/// Runs the tag-based search helper on a single hard-coded dataset (raw spectra, FASTA database
/// and modification file). The test is ignored as soon as one of these files is found to be missing.
/// </summary>
public void TestTagBasedSearch()
{
    // Alternative raw files used previously:
    //   @"H:\Research\Lewy\raw\Lewy_intact_01.raw"
    //   @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw"
    //   @"H:\Research\Yufeng\TopDownYufeng\raw\yufeng_column_test2.raw"
    //   @"H:\Research\Weijun_TopDown\raw\UC4_Intact_plasmaTest_90_6May15_Bane_14-09-01RZ.raw"
    //   @"H:\Research\Charles\TopDown\raw\SBEP_STM_001_02272012_Aragon.raw"
    //   @"D:\MassSpecFiles\60k\NCR_50K_Test_24Jun15_Bane_15-02-02RZ.pbf"
    const string rawFilePath = @"D:\MassSpecFiles\60k\Yufeng_SampleTest1_150614113438.pbf";

    // Alternative database: @"D:\MassSpecFiles\60k\ID_004973_9BA6912F.fasta"
    const string fastaFilePath = @"D:\MassSpecFiles\60k\ID_003836_DA9CC1E4.fasta";

    // Alternatives: @"H:\Research\QCShew_TopDown\Production\Mods_Methyl.txt", or ""
    const string modsFilePath = @"D:\MassSpecFiles\60k\Mods.txt";

    const int minTagLength = 5;

    // Must be resolved here (not inside the delegate below) so that it names this test method.
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    // Skips the test when the given input file does not exist.
    Action<string> ignoreIfMissing = path =>
    {
        if (!File.Exists(path))
        {
            Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, path);
        }
    };

    ignoreIfMissing(rawFilePath);
    LcMsRun lcmsRun = PbfLcMsRun.GetLcMsRun(rawFilePath);

    // seqTagPath and minTagLength are only consumed by the disabled parser line below.
    var seqTagPath = MassSpecDataReaderFactory.ChangeExtension(rawFilePath, ".seqtag");
    //var tagParser = new SequenceTagParser(seqTagPath, minTagLength, 100);

    ignoreIfMissing(fastaFilePath);
    var database = new FastaDatabase(fastaFilePath);
    var searchTolerance = new Tolerance(10);

    ignoreIfMissing(modsFilePath);
    var aminoAcids = new AminoAcidSet(modsFilePath);

    TestTagBasedSearch(lcmsRun, database, searchTolerance, aminoAcids);
}
/// <summary>
/// Enumerates all non-enzymatic sequences of length 300-400 from a FASTA database and prints a
/// histogram of the difference between the rounded rescaled mass and the nominal mass of each
/// sequence. Differences outside the histogram range are counted in the outermost bins.
/// </summary>
public void TestNominalMassErrors()
{
    const int minLength = 300;
    const int maxLength = 400;

    // Alternatives:
    //   @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta"
    //   @"C:\cygwin\home\kims336\Data\TopDownJia\database\TargetProteins.fasta"
    const string dbFile = @"C:\cygwin\home\kims336\Data\TopDownJia\database\ID_003962_71E1A1D4.fasta";

    var stopwatch = new System.Diagnostics.Stopwatch();

    var database = new FastaDatabase(dbFile);
    database.Read();
    var indexed = new IndexedDatabase(database);

    long sequenceCount = 0;
    stopwatch.Start();

    var histogram = new long[11];
    var aminoAcids = new AminoAcidSet();

    // Bin index of a zero error, and the last valid bin.
    var centerBin = histogram.Length / 2;
    var lastBin = histogram.Length - 1;

    foreach (var entry in indexed.AnnotationsAndOffsetsNoEnzyme(minLength, maxLength))
    {
        sequenceCount++;

        // Drop the first two and last two characters of the annotation to get the bare sequence.
        var annotation = entry.Annotation;
        var bareSequence = annotation.Substring(2, annotation.Length - 4);
        var composition = aminoAcids.GetComposition(bareSequence);

        var exactMass = composition.Mass;
        var nominal = composition.NominalMass;
        var massError = (int)Math.Round(exactMass * Constants.RescalingConstant) - nominal;

        // Clamp into the histogram range.
        var bin = Math.Min(Math.Max(massError + centerBin, 0), lastBin);
        histogram[bin]++;
    }

    Console.WriteLine("NumSequences: {0}", sequenceCount);
    for (var bin = 0; bin < histogram.Length; bin++)
    {
        var count = histogram[bin];
        Console.WriteLine("{0}\t{1}\t{2}", bin - histogram.Length / 2, count, count / (double)sequenceCount);
    }

    stopwatch.Stop();
    Console.WriteLine(@"Elapsed Time: {0:f4} sec", stopwatch.Elapsed.TotalSeconds);
}
[TestCase(6, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004530_B63BD900.fasta", 8898)]  // 6MB
//[TestCase(15, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004208_295531A4.fasta", 6334)]  // 15MB
/// <summary>
/// Counts the intact-sequence annotations (length 21-300, no C-terminal cleavages) enumerated from
/// a FASTA database, printing the elapsed time of each stage, and checks the count.
/// </summary>
/// <param name="size">Not read by the method body (presumably the database size in MB - confirm).</param>
/// <param name="dbFile">Path of the FASTA database to enumerate.</param>
/// <param name="expected">Expected number of enumerated annotations.</param>
public void TestSequenceEnumerationIntact(double size, string dbFile, int expected)
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName, dbFile);

    var timer = new System.Diagnostics.Stopwatch();
    timer.Start();

    const int numCTermCleavages = 0;

    var database = new FastaDatabase(dbFile);
    var indexed = new IndexedDatabase(database);

    var afterDb = timer.Elapsed;
    Console.WriteLine("Read DB in " + afterDb.TotalSeconds + " Seconds");

    var estimate = indexed.EstimateTotalPeptides(2, 21, 300, 1, numCTermCleavages);
    var afterEstimate = timer.Elapsed;
    var estimateSeconds = (afterEstimate - afterDb).TotalSeconds;
    Console.WriteLine("Read Estimate in " + estimateSeconds + " Seconds");
    Console.WriteLine("Estimated results: " + estimate);

    var annotations = indexed.IntactSequenceAnnotationsAndOffsets(21, 300, numCTermCleavages);
    var afterAnnotations = timer.Elapsed;
    var annotationSeconds = (afterAnnotations - afterEstimate).TotalSeconds;
    Console.WriteLine("Read Annotations in " + annotationSeconds + " Seconds");

    // Earlier variants counted inside a Parallel.ForEach (Interlocked.Increment) or while
    // dumping each annotation to a text file; the plain LINQ count is used now.
    long sequenceCount = annotations.Count();

    var afterCount = timer.Elapsed;
    var countSeconds = (afterCount - afterAnnotations).TotalSeconds;
    Console.WriteLine("Parallel ForEach in " + countSeconds + " Seconds");

    Console.WriteLine("NumPeptides: {0}", sequenceCount);
    timer.Stop();
    Console.WriteLine(@"{0:f4} sec", timer.Elapsed.TotalSeconds);

    //Assert.AreEqual(188961836, sequenceCount);
    Assert.AreEqual(expected, sequenceCount);
}
/// <summary>
/// Writes one tab-separated row per MS2 scan that has a match, preceded by a header row.
/// </summary>
/// <param name="matches">Matches indexed by scan number; null entries are skipped.</param>
/// <param name="outputFilePath">Path of the text file to create or overwrite.</param>
/// <param name="database">Database used to resolve protein name, description, length and position.</param>
private void WriteResultsToFile(DatabaseSequenceSpectrumMatch[] matches, string outputFilePath, FastaDatabase database)
{
    using (var writer = new StreamWriter(outputFilePath))
    {
        writer.WriteLine("Scan\tPre\tSequence\tPost\tModifications\tComposition\tProteinName\tProteinDesc" +
                         "\tProteinLength\tStart\tEnd\tCharge\tMostAbundantIsotopeMz\tMass\t#MatchedFragments\tProbability\tSpecEValue\tEValue");

        // The entry count is the same for every row, so look it up once.
        var numEntries = database.GetNumEntries();

        foreach (var scanNum in _ms2ScanNums)
        {
            var match = matches[scanNum];
            if (match == null) continue;

            var sequence = match.Sequence;
            var offset = match.Offset;
            var start = database.GetOneBasedPositionInProtein(offset) + 1 + match.NumNTermCleavages;
            var end = start + sequence.Length - 1;
            var proteinName = database.GetProteinName(match.Offset);
            var protLength = database.GetProteinLength(proteinName);
            var ion = match.Ion;
            var proteinDescription = database.GetProteinDescription(match.Offset);
            var probability = CompositeScorer.GetProbability(match.Score);

            // Note for DblToString(value, 9, true), by having "9" and "true",
            // values between 100 and 999 Da will have 7 digits after the decimal place, and
            // values between 1000 and 9999 will have 6 digits after the decimal place
            writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}\t{16}\t{17}",
                scanNum,
                match.Pre,                 // Pre
                sequence,                  // Sequence
                match.Post,                // Post
                match.ModificationText,    // Modifications
                ion.Composition,           // Composition
                proteinName,               // ProteinName
                proteinDescription,        // ProteinDesc
                protLength,                // ProteinLength
                start,                     // Start position in protein
                end,                       // End position in protein
                ion.Charge,                // Charge (precursor)
                StringUtilities.DblToString(ion.GetMostAbundantIsotopeMz(), 9, true),   // MostAbundantIsotopeMz
                StringUtilities.DblToString(ion.Composition.Mass, 9, true),             // Mass
                match.NumMatchedFragments, // #MatchedFragments
                StringUtilities.DblToString(probability, 4),                            // Probability
                // SpecEValue; will be displayed using scientific notation if the value is less than 0.001
                StringUtilities.DblToString(ExcelMinValue(match.SpecEvalue), 6, true, 0.001),
                // EValue (SpecEValue times the number of database entries); will be displayed using
                // scientific notation if the value is less than 0.001
                StringUtilities.DblToString(ExcelMinValue(match.SpecEvalue * numEntries), 6, true, 0.001)
            );
        }
    }
}
/// <summary>
/// Compares sequence-tag database matches against identifications from a result TSV file:
/// prints the average number of tag matches per scan, a histogram of the number of distinct
/// matched proteins per scan, and how many identified scans (QValue not above 0.01) have a tag
/// that matches their identified protein.
/// The test is ignored when any of the hard-coded input files is missing.
/// </summary>
public void CountMatchedProteins()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const int minTagLength = 3;

    var scanToProtein = new Dictionary<int, string>();
    var idTag = new Dictionary<int, bool>();

    const string resultFilePath = @"H:\Research\ProMex\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";
    if (!File.Exists(resultFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
    }

    var parser = new TsvFileParser(resultFilePath);
    var scans = parser.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
    var proteinNames = parser.GetData("ProteinName").ToArray();
    var qValues = parser.GetData("QValue").Select(Convert.ToDouble).ToArray();

    for (var i = 0; i < qValues.Length; i++)
    {
        // Stops at the first row above the threshold (presumably rows are sorted by QValue - confirm).
        if (qValues[i] > 0.01) break;
        scanToProtein.Add(scans[i], proteinNames[i]);
        idTag.Add(scans[i], false);
    }

    const string rawFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw";
    if (!File.Exists(rawFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
    }

    // NOTE(review): run is not referenced again in this method; the load is kept so the test
    // still behaves the same when the raw file is missing or unreadable.
    var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

    const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
    if (!File.Exists(fastaFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
    }

    // Alternative databases:
    //   @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.icsfldecoy.fasta"
    //   @"D:\Research\Data\CommonContaminants\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta"
    var fastaDb = new FastaDatabase(fastaFilePath);
    var searchableDb = new SearchableDatabase(fastaDb);
    Console.WriteLine("Sequence length: {0}", fastaDb.GetSequence().Length);

    const string tagFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.seqtag";
    if (!File.Exists(tagFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
    }

    var hist = new Dictionary<int, int>();
    var scanSet = new HashSet<int>();
    HashSet<string> proteinSetForThisScan = null;
    var prevScan = -1;
    var totalNumMatches = 0L;

    // Adds a finished scan's distinct-protein count to the histogram; does nothing before the first scan.
    Action<HashSet<string>> addToHistogram = proteins =>
    {
        if (proteins == null) return;

        int numOcc;
        hist.TryGetValue(proteins.Count, out numOcc); // numOcc is 0 when the bin does not exist yet
        hist[proteins.Count] = numOcc + 1;
    };

    // The first line is the header; the remaining lines are streamed rather than loaded at once.
    foreach (var line in File.ReadLines(tagFilePath).Skip(1))
    {
        var token = line.Split('\t');
        if (token.Length < 3) continue;

        var scan = Convert.ToInt32(token[0]);

        // Single lookup; proteinId stays null when the scan has no accepted identification.
        string proteinId;
        scanToProtein.TryGetValue(scan, out proteinId);

        if (scan != prevScan)
        {
            // Lines of one scan are expected to be contiguous: close the previous scan, open a new one.
            addToHistogram(proteinSetForThisScan);
            prevScan = scan;
            proteinSetForThisScan = new HashSet<string>();
        }

        scanSet.Add(scan);

        var tag = token[1];
        if (tag.Length < minTagLength) continue;
        if (proteinSetForThisScan == null) continue;

        var numMatchesForThisTag = 0;
        foreach (var matchedProtein in searchableDb.FindAllMatchedSequenceIndices(tag)
            .Select(index => fastaDb.GetProteinName(index)))
        {
            proteinSetForThisScan.Add(matchedProtein);
            ++numMatchesForThisTag;

            if (proteinId != null && matchedProtein.Equals(proteinId))
            {
                idTag[scan] = true;
            }
        }

        totalNumMatches += numMatchesForThisTag;
    }

    // Close the last scan.
    addToHistogram(proteinSetForThisScan);

    Console.WriteLine("AvgNumMatches: {0}", totalNumMatches / (float)scanSet.Count);
    Console.WriteLine("Histogram:");
    foreach (var entry in hist.OrderBy(e => e.Key))
    {
        Console.WriteLine("{0}\t{1}", entry.Key, entry.Value);
    }

    Console.WriteLine("NumId: {0}", idTag.Count);
    Console.WriteLine("NumIdByTag: {0}", idTag.Values.Count(v => v));
}
/// <summary>
/// Builds a <c>ScanBasedTagSearchEngine</c> for the given run and database and runs the search.
/// </summary>
/// <param name="run">Run passed to the tag generator, the peak matrix and the engine.</param>
/// <param name="fastaDb">Database passed to the engine.</param>
/// <param name="tolerance">Tolerance passed to the engine.</param>
/// <param name="aaSet">Amino acid set passed to the engine.</param>
private void TestTagBasedSearch(LcMsRun run, FastaDatabase fastaDb, Tolerance tolerance, AminoAcidSet aaSet)
{
    // The tag generator uses its own fixed tolerance of 8, independent of the tolerance argument.
    var tagGenerator = new SequenceTagGenerator(run, new Tolerance(8));
    var peakMatrix = new LcMsPeakMatrix(run);

    var searchEngine = new ScanBasedTagSearchEngine(run, tagGenerator, peakMatrix, fastaDb, tolerance, aaSet);

    // Optional scan window, currently disabled:
    // searchEngine.MinScan = 3400;
    // searchEngine.MaxScan = 3900;

    searchEngine.RunSearch();
}