/// <summary>
/// Indexes <paramref name="database"/> and returns the annotation/offset enumeration
/// that corresponds to the current <c>SearchMode</c>.
/// </summary>
/// <param name="database">FASTA database that is wrapped in an <c>IndexedDatabase</c> and read.</param>
/// <param name="estimatedProteins">
/// Receives the value of <c>IndexedDatabase.EstimateTotalPeptides</c> for the current
/// search mode, sequence length limits and cleavage limits.
/// </param>
/// <param name="cancellationToken">
/// Optional token; it is only forwarded to the parallel no-enzyme enumeration.
/// </param>
/// <returns>The annotation/offset sequence selected for the active search mode.</returns>
private IEnumerable<AnnotationAndOffset> GetAnnotationsAndOffsets(FastaDatabase database, out long estimatedProteins, CancellationToken? cancellationToken = null)
{
    var indexed = new IndexedDatabase(database);
    indexed.Read();

    estimatedProteins = indexed.EstimateTotalPeptides(
        SearchMode,
        MinSequenceLength,
        MaxSequenceLength,
        MaxNumNTermCleavages,
        MaxNumCTermCleavages);

    // Multiple internal cleavages: parallel no-enzyme enumeration
    // (the single-threaded AnnotationsAndOffsetsNoEnzyme variant is not used here).
    if (SearchMode == InternalCleavageType.MultipleInternalCleavages)
    {
        return indexed.AnnotationsAndOffsetsNoEnzymeParallel(
            MinSequenceLength,
            MaxSequenceLength,
            MaxNumThreads,
            cancellationToken);
    }

    // No internal cleavage: intact sequences only.
    if (SearchMode == InternalCleavageType.NoInternalCleavage)
    {
        return indexed.IntactSequenceAnnotationsAndOffsets(
            MinSequenceLength,
            MaxSequenceLength,
            MaxNumCTermCleavages);
    }

    // Any other mode: bounded N-terminal or C-terminal cleavages.
    return indexed.SequenceAnnotationsAndOffsetsWithNtermOrCtermCleavageNoLargerThan(
        MinSequenceLength,
        MaxSequenceLength,
        MaxNumNTermCleavages,
        MaxNumCTermCleavages);
}
/// <summary>
/// Enumerates the intact sequence annotations of a FASTA file, reports how long each
/// stage took, and checks the number of enumerated sequences against an expected count.
/// </summary>
/// <param name="size">Label taken from the test case (the case comments give it in MB); not used by the test body.</param>
/// <param name="dbFile">FASTA path in which "TEST_FOLDER" is replaced by the default test file folder.</param>
/// <param name="expected">Expected number of enumerated sequences.</param>
[TestCase(6, @"TEST_FOLDER\MSPathFinderT\ID_004530_B63BD900.fasta", 8898)]  // 6MB
//[TestCase(15, @"TEST_FOLDER\MSPathFinderT\ID_004208_295531A4.fasta", 6334)]  // 15MB
public void TestSequenceEnumerationIntact(double size, string dbFile, int expected)
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName, dbFile);

    var resolvedPath = dbFile.Replace("TEST_FOLDER", Utils.DEFAULT_TEST_FILE_FOLDER);
    var fastaFile = Utils.GetTestFile(methodName, resolvedPath);

    var timer = System.Diagnostics.Stopwatch.StartNew();

    const int numCTermCleavages = 0;

    // Stage 1: open and index the database.
    var indexedDb = new IndexedDatabase(new FastaDatabase(fastaFile.FullName));
    var afterDbLoad = timer.Elapsed;
    Console.WriteLine("Read DB in " + afterDbLoad.TotalSeconds + " Seconds");

    // Stage 2: estimate how many results the enumeration will produce.
    var estimatedAnnOff = indexedDb.EstimateTotalPeptides(2, 21, 300, 1, numCTermCleavages);
    var afterEstimate = timer.Elapsed;
    Console.WriteLine("Read Estimate in " + (afterEstimate - afterDbLoad).TotalSeconds + " Seconds");
    Console.WriteLine("Estimated results: " + estimatedAnnOff);

    // Stage 3: obtain the enumeration of intact sequences.
    var annotationsAndOffsets = indexedDb.IntactSequenceAnnotationsAndOffsets(21, 300, numCTermCleavages);
    var afterAnnotations = timer.Elapsed;
    Console.WriteLine("Read Annotations in " + (afterAnnotations - afterEstimate).TotalSeconds + " Seconds");

    // Stage 4: count the sequences. Earlier variants of this test counted through
    // Parallel.ForEach or wrote every annotation to a text file; the timing label
    // below still carries the "Parallel ForEach" wording.
    long numSequences = annotationsAndOffsets.Count();
    var afterCount = timer.Elapsed;
    Console.WriteLine("Parallel ForEach in " + (afterCount - afterAnnotations).TotalSeconds + " Seconds");

    Console.WriteLine("NumPeptides: {0}", numSequences);
    timer.Stop();

    Console.WriteLine(@"{0:f4} sec", timer.Elapsed.TotalSeconds);
    Assert.AreEqual(expected, numSequences);
}
/// <summary>
/// Counts proteoforms derived from the intact sequences of a short FASTA file and
/// prints every counted annotation. Tallies are kept in three buckets ("both",
/// "N-term only", "C-term only") depending on which enumeration produced the sequence
/// and on how many N-terminal residues were removed.
/// The test is ignored when the FASTA file cannot be found.
/// </summary>
public void TestCountingProteoformsCloseToNTermOrCTerm()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const int minSequenceLength = 21;
    const int maxSequenceLength = 300;
    const int maxNumNTermCleavages = 1;
    const int maxNumCTermCleavages = 0;

    var timer = System.Diagnostics.Stopwatch.StartNew();

    const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\Short.fasta";
    if (!File.Exists(dbFile))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFile);
    }

    var indexedDb = new IndexedDatabase(new FastaDatabase(dbFile));

    long both = 0;
    long nTermOnly = 0;
    long cTermOnly = 0;

    // Pass 1: sequences with numCTermCleavages <= maxNumCTermCleavages.
    foreach (var entry in indexedDb.IntactSequenceAnnotationsAndOffsets(minSequenceLength, int.MaxValue, maxNumCTermCleavages))
    {
        var annotation = entry.Annotation;

        // Annotation length minus 4: presumably the two flanking residues and the two
        // separator dots -- TODO confirm against the annotation format.
        var residueCount = annotation.Length - 4;

        // Remove N-terminal residues one at a time for as long as the remainder is long enough.
        for (var cleaved = 0; residueCount - cleaved >= minSequenceLength; cleaved++)
        {
            if (residueCount - cleaved > maxSequenceLength)
            {
                continue;
            }

            if (cleaved > maxNumNTermCleavages)
            {
                ++cTermOnly;
            }
            else
            {
                ++both;
            }

            string anno;
            if (cleaved == 0)
            {
                anno = annotation;
            }
            else
            {
                anno = string.Format("{0}.{1}", annotation[1 + cleaved], annotation.Substring(2 + cleaved));
            }

            Console.WriteLine(anno);
        }
    }

    // Pass 2: sequences with numCTermCleavages > maxNumCTermCleavages.
    foreach (var entry in indexedDb.IntactSequenceAnnotationsAndOffsetsWithCTermCleavagesLargerThan(minSequenceLength, int.MaxValue, maxNumCTermCleavages))
    {
        var annotation = entry.Annotation;
        var residueCount = annotation.Length - 4;

        for (var cleaved = 0; cleaved <= maxNumNTermCleavages; cleaved++)
        {
            var remaining = residueCount - cleaved;
            if (remaining < minSequenceLength || remaining > maxSequenceLength)
            {
                continue;
            }

            ++nTermOnly;

            string anno;
            if (cleaved == 0)
            {
                anno = annotation;
            }
            else
            {
                anno = string.Format("{0}.{1}", annotation[1 + cleaved], annotation.Substring(2 + cleaved));
            }

            Console.WriteLine(anno);
        }
    }

    Console.WriteLine("Both: {0}", both);
    Console.WriteLine("N-term only: {0}", nTermOnly);
    Console.WriteLine("C-term only: {0}", cTermOnly);
    Console.WriteLine("All: {0}", both + nTermOnly + cTermOnly);

    timer.Stop();
    Console.WriteLine(@"{0:f4} sec", timer.Elapsed.TotalSeconds);
}
} // true: target and decoy, false: target only, null: decoy only

/// <summary>
/// Quick identification experiment: for every MS2 scan of a raw file, finds the intact
/// protein (optionally with one N-terminal cleavage) whose fragment compositions hit the
/// largest number of mass bins occupied by deconvoluted peaks of that scan.
/// </summary>
/// <remarks>
/// Steps, as implemented below:
/// 1. Build one <see cref="BitArray"/> per fragment-mass bin; bit <c>scan</c> is set when
///    a deconvoluted peak of MS2 scan <c>scan</c> falls into that bin (within tolerance).
/// 2. For every intact protein, and for 0 and 1 N-terminal cleavages, count per scan how
///    many fragment node compositions (plus the Y ion offset) land in a set bin.
/// 3. Keep, per scan, the best count and the protein name (suffixed with "'" when one
///    N-terminal residue was cleaved) and print a tab-separated table.
/// All input paths are hard-coded, so the method only runs where those files exist.
/// </remarks>
public void QuickId()
{
    const string rawFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw";
    const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
    const string modFilePath = @"H:\Research\QCShew_TopDown\Production\Mods.txt";
    const int numBits = 29; // max error: 4ppm
    const int minCharge = 1;
    const int maxCharge = 20;
    var tolerance = new Tolerance(10);
    const double corrThreshold = 0.7;

    var comparer = new MzComparerWithBinning(numBits);
    const double minFragmentMass = 200.0;
    const double maxFragmentMass = 50000.0;
    var minFragMassBin = comparer.GetBinNumber(minFragmentMass);
    var maxFragMassBin = comparer.GetBinNumber(maxFragmentMass);

    var aminoAcidSet = new AminoAcidSet(modFilePath);

    var run = PbfLcMsRun.GetLcMsRun(rawFilePath);
    var ms2ScanNumArr = run.GetScanNumbers(2).ToArray();

    var sw = new Stopwatch();
    sw.Start();
    Console.Write("Building Spectrum Arrays...");

    // One bit vector per mass bin, indexed by scan number.
    var massVectors = new BitArray[maxFragMassBin - minFragMassBin + 1];
    for (var i = minFragMassBin; i <= maxFragMassBin; i++)
    {
        massVectors[i - minFragMassBin] = new BitArray(run.MaxLcScan + 1);
    }

    foreach (var ms2ScanNum in ms2ScanNumArr)
    {
        var productSpec = run.GetSpectrum(ms2ScanNum) as ProductSpectrum;
        if (productSpec == null)
        {
            continue;
        }

        var deconvolutedPeaks = Deconvoluter.GetDeconvolutedPeaks(productSpec.Peaks, minCharge, maxCharge, 2, 1.1, tolerance, corrThreshold);
        if (deconvolutedPeaks == null)
        {
            continue;
        }

        foreach (var p in deconvolutedPeaks)
        {
            // Mark every in-range bin covered by [mass - delta, mass + delta] for this scan.
            var mass = p.Mass;
            var deltaMass = tolerance.GetToleranceAsDa(mass, 1);
            var minMass = mass - deltaMass;
            var maxMass = mass + deltaMass;

            var minBinNum = comparer.GetBinNumber(minMass);
            var maxBinNum = comparer.GetBinNumber(maxMass);
            for (var binNum = minBinNum; binNum <= maxBinNum; binNum++)
            {
                if (binNum >= minFragMassBin && binNum <= maxFragMassBin)
                {
                    massVectors[binNum - minFragMassBin][ms2ScanNum] = true;
                }
            }
        }
    }
    sw.Stop();
    Console.WriteLine(@"{0:f1} sec.", sw.Elapsed.TotalSeconds);

    sw.Reset();
    sw.Start();
    var fastaDb = new FastaDatabase(fastaFilePath);
    fastaDb.Read();
    var indexedDb = new IndexedDatabase(fastaDb);
    var numProteins = 0;
    var intactProteinAnnotationAndOffsets = indexedDb.IntactSequenceAnnotationsAndOffsets(0, int.MaxValue);

    // Best protein label and best hit count per scan number.
    var bestProtein = new string[run.MaxLcScan + 1];
    var bestScore = new int[run.MaxLcScan + 1];

    foreach (var annotationAndOffset in intactProteinAnnotationAndOffsets)
    {
        if (++numProteins % 10 == 0)
        {
            Console.WriteLine(@"Processing, {0} proteins done, {1:f1} sec elapsed", numProteins, sw.Elapsed.TotalSeconds);
        }

        var annotation = annotationAndOffset.Annotation;
        var offset = annotationAndOffset.Offset;

        // Drop the first two and last two characters of the annotation
        // (presumably the flanking residues and separator dots -- TODO confirm).
        var protSequence = annotation.Substring(2, annotation.Length - 4);

        // suffix
        var seqGraph = SequenceGraph.CreateGraph(aminoAcidSet, AminoAcid.ProteinNTerm, protSequence, AminoAcid.ProteinCTerm);
        if (seqGraph == null)
        {
            continue;
        }

        for (var numNTermCleavage = 0; numNTermCleavage <= 1; numNTermCleavage++)
        {
            if (numNTermCleavage > 0)
            {
                seqGraph.CleaveNTerm();
            }

            var allCompositions = seqGraph.GetAllFragmentNodeCompositions();

            // Hit count per scan for this protein / cleavage state.
            var scoreArr = new int[run.MaxLcScan + 1];
            foreach (var fragComp in allCompositions)
            {
                var suffixMass = fragComp.Mass + BaseIonType.Y.OffsetComposition.Mass;
                var binNum = comparer.GetBinNumber(suffixMass);
                if (binNum < minFragMassBin || binNum > maxFragMassBin)
                {
                    continue;
                }

                var vector = massVectors[binNum - minFragMassBin];
                foreach (var ms2ScanNum in ms2ScanNumArr)
                {
                    if (vector[ms2ScanNum])
                    {
                        ++scoreArr[ms2ScanNum];
                    }
                }
            }

            foreach (var ms2ScanNum in ms2ScanNumArr)
            {
                if (scoreArr[ms2ScanNum] > bestScore[ms2ScanNum])
                {
                    bestScore[ms2ScanNum] = scoreArr[ms2ScanNum];
                    var proteinName = fastaDb.GetProteinName(offset);
                    bestProtein[ms2ScanNum] = proteinName + (numNTermCleavage == 1 ? "'" : "");
                }
            }
        }

        // prefix (no prefix-based scoring is performed in this method)
    }

    Console.WriteLine("ScanNum\tBestProtein\tScore");
    foreach (var ms2ScanNum in ms2ScanNumArr)
    {
        // Argument order follows the header: scan, protein, score.
        // (Previously the score and protein columns were emitted in swapped order.)
        Console.WriteLine("{0}\t{1}\t{2}", ms2ScanNum, bestProtein[ms2ScanNum] ?? "", bestScore[ms2ScanNum]);
    }
}
/// <summary>
/// Counts proteoforms derived from the intact sequences of the Short.fasta test file.
/// Tallies are kept in three buckets ("both", "N-term only", "C-term only") depending on
/// which enumeration produced the sequence and on how many N-terminal residues were
/// removed. At most the first 20 annotations of each pass are printed.
/// </summary>
public void TestCountingProteoformsCloseToNTermOrCTerm()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    const int minSequenceLength = 21;
    const int maxSequenceLength = 300;
    const int maxNumNTermCleavages = 1;
    const int maxNumCTermCleavages = 0;

    var timer = System.Diagnostics.Stopwatch.StartNew();

    var relativeFastaPath = Path.Combine(Utils.DEFAULT_TEST_FILE_FOLDER, @"MSPathFinderT\Short.fasta");
    var fastaFile = Utils.GetTestFile(methodName, relativeFastaPath);

    var indexedDb = new IndexedDatabase(new FastaDatabase(fastaFile.FullName));

    long both = 0;
    long nTermOnly = 0;
    long cTermOnly = 0;

    // Number of candidate lines seen in the current pass; only the first 20 are printed.
    var displayedResults = 0;

    // Pass 1: sequences with numCTermCleavages <= maxNumCTermCleavages.
    foreach (var entry in indexedDb.IntactSequenceAnnotationsAndOffsets(minSequenceLength, int.MaxValue, maxNumCTermCleavages))
    {
        var annotation = entry.Annotation;

        // Annotation length minus 4: presumably the two flanking residues and the two
        // separator dots -- TODO confirm against the annotation format.
        var residueCount = annotation.Length - 4;

        // Remove N-terminal residues one at a time for as long as the remainder is long enough.
        for (var cleaved = 0; residueCount - cleaved >= minSequenceLength; cleaved++)
        {
            if (residueCount - cleaved > maxSequenceLength)
            {
                continue;
            }

            if (cleaved > maxNumNTermCleavages)
            {
                ++cTermOnly;
            }
            else
            {
                ++both;
            }

            string anno;
            if (cleaved == 0)
            {
                anno = annotation;
            }
            else
            {
                anno = string.Format("{0}.{1}", annotation[1 + cleaved], annotation.Substring(2 + cleaved));
            }

            // The counter advances whether or not the line is printed.
            if (displayedResults++ < 20)
            {
                Console.WriteLine(anno);
            }
        }
    }

    // Restart the display cap for the second pass.
    displayedResults = 0;

    // Pass 2: sequences with numCTermCleavages > maxNumCTermCleavages.
    foreach (var entry in indexedDb.IntactSequenceAnnotationsAndOffsetsWithCTermCleavagesLargerThan(minSequenceLength, int.MaxValue, maxNumCTermCleavages))
    {
        var annotation = entry.Annotation;
        var residueCount = annotation.Length - 4;

        for (var cleaved = 0; cleaved <= maxNumNTermCleavages; cleaved++)
        {
            var remaining = residueCount - cleaved;
            if (remaining < minSequenceLength || remaining > maxSequenceLength)
            {
                continue;
            }

            ++nTermOnly;

            string anno;
            if (cleaved == 0)
            {
                anno = annotation;
            }
            else
            {
                anno = string.Format("{0}.{1}", annotation[1 + cleaved], annotation.Substring(2 + cleaved));
            }

            if (displayedResults++ < 20)
            {
                Console.WriteLine(anno);
            }
        }
    }

    Console.WriteLine("Both: {0}", both);
    Console.WriteLine("N-term only: {0}", nTermOnly);
    Console.WriteLine("C-term only: {0}", cTermOnly);
    Console.WriteLine("All: {0}", both + nTermOnly + cTermOnly);

    timer.Stop();
    Console.WriteLine(@"{0:f4} sec", timer.Elapsed.TotalSeconds);
}
/// <summary>
/// Scores every sequence composition of every intact protein in a FASTA database
/// (with 0..numNTermCleavages N-terminal cleavages) against a raw file using
/// <c>TopDownScorer</c>, printing each score and overall counts and timing.
/// The database and raw file paths are hard-coded.
/// </summary>
public void TestTopDownScoringForAllXics()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    // Search parameters
    const int numNTermCleavages = 1;  // 30
    const int minLength = 7;
    const int maxLength = 1000;
    const int numMaxModsPerProtein = 0; // 6

    var precursorTolerance = new Tolerance(10);

    // Other FASTA files (decoy, human, local copies) were swapped in here in the past.
    const string dbFilePath = @"..\..\..\TestFiles\sprot.Ecoli.2012_07.fasta";

    var timer = System.Diagnostics.Stopwatch.StartNew();
    Console.Write("Reading raw file...");
    const string specFilePath = @"C:\workspace\TopDown\E_coli_iscU_60_mock.raw";
    var run = InMemoryLcMsRun.GetLcMsRun(specFilePath);

    timer.Stop();
    Console.WriteLine(@"Elapsed Time: {0:f4} sec", timer.Elapsed.TotalSeconds);

    // Configure amino acid set (all three modifications target 'C' everywhere).
    // NOTE(review): the local is named "dehydro" but is built from Modification.PyroGluQ --
    // confirm which modification was intended.
    var dehydro = new SearchModification(Modification.PyroGluQ, 'C', SequenceLocation.Everywhere, false);
    var cysteinylC = new SearchModification(Modification.Cysteinyl, 'C', SequenceLocation.Everywhere, false);
    var glutathioneC = new SearchModification(Modification.Glutathione, 'C', SequenceLocation.Everywhere, false);

    var searchModifications = new List<SearchModification>();
    searchModifications.Add(dehydro);
    searchModifications.Add(cysteinylC);
    searchModifications.Add(glutathioneC);

    var aaSet = new AminoAcidSet(searchModifications, numMaxModsPerProtein);

    var indexedDb = new IndexedDatabase(new FastaDatabase(dbFilePath));

    var numProteins = 0;
    long totalProtCompositions = 0;

    TopDownScorer.MaxCharge = 25;
    TopDownScorer.MinCharge = 8;

    timer.Reset();
    timer.Start();
    Console.WriteLine("Generating XICs...");

    foreach (var entry in indexedDb.IntactSequenceAnnotationsAndOffsets(minLength, maxLength))
    {
        numProteins++;

        // Progress report every 1000 proteins.
        if (numProteins % 1000 == 0)
        {
            Console.WriteLine("Processed {0} proteins", numProteins);
        }

        var seqGraph = SequenceGraph.CreateGraph(aaSet, entry.Annotation);
        if (seqGraph == null)
        {
            continue;
        }

        var cleavagesDone = 0;
        while (cleavagesDone <= numNTermCleavages)
        {
            // The uncleaved form is scored first; each later round cleaves one more N-terminal residue.
            if (cleavagesDone > 0)
            {
                seqGraph.CleaveNTerm();
            }

            foreach (var protComposition in seqGraph.GetSequenceCompositions())
            {
                ++totalProtCompositions;

                var scorer = new TopDownScorer(protComposition, run, precursorTolerance);
                Console.WriteLine(scorer.GetScore());
            }

            cleavagesDone++;
        }
    }

    timer.Stop();
    Console.WriteLine("NumProteins: {0}", numProteins);
    Console.WriteLine("NumProteinCompositions: {0}", totalProtCompositions);
    Console.WriteLine(@"Elapsed Time: {0:f4} sec", timer.Elapsed.TotalSeconds);
}