// Micro-benchmark: times a CPU-bound summation driven by (at most) the first 1000
// characters of the FASTA database. Nothing is asserted; the accumulated sum and the
// elapsed time are written to the console.
public void TestSequenceEnumerationParallel2()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_002216_235ACCEA.fasta";
    var db = new FastaDatabase(dbFile);
    db.Read();

    // Constructed but not used afterwards by this test.
    var indexedDb = new IndexedDatabase(db);
    var characters = db.Characters().ToArray();

    // Only the summation below is timed; loading the database is excluded.
    var sw = System.Diagnostics.Stopwatch.StartNew();

    // At most 1000 characters are processed.
    var limit = characters.Length < 1000 ? characters.Length : 1000;
    var sum = 0L;
    for (var index = 0; index < limit; index++)
    {
        var iterations = characters[index] * 10000;
        for (var i = 0; i < iterations; i++)
        {
            sum += i;
        }
    }

    // The label says "NumPeptides" but the value printed is the accumulated sum.
    Console.WriteLine("NumPeptides: {0}", sum);
    sw.Stop();

    Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
}
// Timing harness: enumerates annotations from the indexed database via
// AnnotationsAndOffsets(7, 40, 2, 2, Enzyme.Trypsin) and builds a SequenceGraph for each
// one in parallel, then prints the elapsed wall-clock time. Nothing is asserted.
private static void TestCountingPeptides()
{
    var aaSet = new AminoAcidSet();
    var sw = Stopwatch.StartNew();

    // Other databases used previously with this harness:
    //   ID_002166_F86E3B2F.fasta, ID_004208_295531A4.fasta (same directory)
    const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_003456_9B916A8B.fasta";
    var indexedDb = new IndexedDatabase(new FastaDatabase(dbFile));
    indexedDb.Read();

    var peptides = indexedDb.AnnotationsAndOffsets(7, 40, 2, 2, Enzyme.Trypsin);
    Parallel.ForEach(peptides, entry =>
    {
        var annotation = entry.Annotation;
        // Read but never used; kept so the per-item work is unchanged.
        var offset = entry.Offset;
        // The graph is discarded; only the cost of building it matters here.
        var graph = SequenceGraph.CreateGraph(aaSet, annotation);
    });

    sw.Stop();
    var sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
    Console.WriteLine(@"{0:f4} sec", sec);
}
// Prints the EstimateTotalPeptides result for three values of the third argument
// (300, 400, 500) with all other arguments fixed. Nothing is asserted; the method ends by
// reporting itself as not implemented.
public void TestForManyMods()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dbFilePath = @"\\protoapps\UserData\Jungkap\Lewy\db\ID_005140_7A170668.fasta";
    var fastaDb = new FastaDatabase(dbFilePath);
    var indexedDb = new IndexedDatabase(fastaDb);
    indexedDb.Read();

    foreach (var thirdArgument in new[] { 300, 400, 500 })
    {
        var nProt = indexedDb.EstimateTotalPeptides(1, 21, thirdArgument, 1, 0);
        Console.WriteLine(nProt);
    }

    Console.WriteLine(@"Test not implemented: " + methodName);
}
/// <summary>
/// Constructor. Takes the sequence from <paramref name="fastaDatabase"/>, runs
/// <c>SAIS.sufsort</c> over it to populate the suffix array, computes the LCP value
/// (via <c>IndexedDatabase.GetLcp</c>) between each pair of neighboring suffix-array
/// entries, and passes those to <c>InitializeLcps</c> to fill the left/right LCP tables.
/// </summary>
/// <param name="fastaDatabase">Database whose sequence is indexed.</param>
public SearchableDatabase(FastaDatabase fastaDatabase)
{
    FastaDatabase = fastaDatabase;
    _sequence = fastaDatabase.GetSequence();

    var sequenceLength = _sequence.Length;
    _suffixArray = new int[sequenceLength];
    SAIS.sufsort(_sequence, _suffixArray, sequenceLength);

    var entryCount = _suffixArray.Length;

    // Entry i holds the LCP of suffix-array entries i-1 and i; entry 0 has no predecessor.
    var neighboringLcps = new byte[entryCount];
    neighboringLcps[0] = 0;
    for (var rank = 1; rank < entryCount; rank++)
    {
        neighboringLcps[rank] = IndexedDatabase.GetLcp(_sequence, _suffixArray[rank - 1], _suffixArray[rank]);
    }

    _leftLcps = new byte[entryCount];
    _rightLcps = new byte[entryCount];
    InitializeLcps(neighboringLcps, _leftLcps, _rightLcps, 0, entryCount - 1);
}
// Prints every annotation returned by
// IntactSequenceAnnotationsAndOffsetsWithCTermCleavagesLargerThan(100, 300, 3) for the
// Short.fasta database, followed by the elapsed time. Ignored when the file is missing.
public void TestEnumeratingProteins()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    var sw = System.Diagnostics.Stopwatch.StartNew();

    const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\Short.fasta";
    if (!File.Exists(dbFile))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFile);
    }

    var indexedDb = new IndexedDatabase(new FastaDatabase(dbFile));
    var entries = indexedDb.IntactSequenceAnnotationsAndOffsetsWithCTermCleavagesLargerThan(100, 300, 3);
    foreach (var entry in entries)
    {
        Console.WriteLine(entry.Annotation);
    }

    sw.Stop();
    Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
}
// Counts the entries produced by AnnotationsAndOffsetsNoEnzymeParallel(30, 250) for the
// given FASTA file and checks the count against the expected value. Per-phase timings
// (database construction, estimate, obtaining the enumerable, counting) are printed.
//
// Parameters:
//   size     - nominal database size in MB; informational only, not used by the test.
//   dbFile   - path to the FASTA file.
//   expected - expected number of enumerated entries.
//
// Timings noted for earlier revisions:
//   original: 110, 109 (total) seconds
//   parallelized AnnotationsAndOffsetsNoEnzyme: 86 seconds
//   parallelized with yield returns: 79.6, 94, 60, 60 seconds
[TestCase(3, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_005133_8491EFA2.fasta", 323719193)]  // 3MB
//[TestCase(6, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004530_B63BD900.fasta", 595227563)]  // 6MB
//[TestCase(15, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004208_295531A4.fasta", 1882434687)]  // 15MB
public void TestSequenceEnumeration(double size, string dbFile, int expected)
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName, dbFile);

    var sw = new System.Diagnostics.Stopwatch();
    sw.Start();

    var db = new FastaDatabase(dbFile);
    var indexedDb = new IndexedDatabase(db);

    var timeDB = sw.Elapsed;
    Console.WriteLine("Read DB in " + timeDB.TotalSeconds + " Seconds");

    var estimatedAnnOff = indexedDb.EstimateTotalPeptides(0, 30, 250);
    var timeEstimate = sw.Elapsed;
    Console.WriteLine("Read Estimate in " + (timeEstimate - timeDB).TotalSeconds + " Seconds");
    Console.WriteLine("Estimated results: " + estimatedAnnOff);

    var annotationsAndOffsets = indexedDb.AnnotationsAndOffsetsNoEnzymeParallel(30, 250);
    var timeGetAnn = sw.Elapsed;
    Console.WriteLine("Read Annotations in " + (timeGetAnn - timeEstimate).TotalSeconds + " Seconds");

    // LongCount() rather than Count(): the counter is a long, and Count() throws
    // OverflowException past int.MaxValue instead of reporting the real number.
    // The larger databases above already come close to that limit.
    long numSequences = annotationsAndOffsets.LongCount();

    // The console label still says "Parallel ForEach"; the phase timed is the count above.
    var timeParForEach = sw.Elapsed;
    Console.WriteLine("Parallel ForEach in " + (timeParForEach - timeGetAnn).TotalSeconds + " Seconds");

    Console.WriteLine("NumPeptides: {0}", numSequences);
    sw.Stop();

    Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
    Assert.AreEqual(expected, numSequences);
}
/// <summary>
/// Builds an <c>IndexedDatabase</c> over <paramref name="database"/> and returns the
/// annotation/offset enumeration matching the current search settings.
/// </summary>
/// <param name="database">FASTA database to index.</param>
/// <returns>
/// The no-enzyme enumeration when <c>NumTolerableTermini</c> is 0; otherwise the
/// enzyme-specific enumeration for <c>Enzyme</c>. Both are bounded by
/// <c>MinSequenceLength</c> and <c>MaxSequenceLength</c>.
/// </returns>
private IEnumerable<AnnotationAndOffset> GetAnnotationsAndOffsets(FastaDatabase database)
{
    var indexedDbTarget = new IndexedDatabase(database);

    if (NumTolerableTermini == 0)
    {
        return indexedDbTarget.AnnotationsAndOffsetsNoEnzyme(MinSequenceLength, MaxSequenceLength);
    }

    // NOTE(review): the literal 2 is the fourth argument of AnnotationsAndOffsets;
    // presumably a missed-cleavage limit - confirm against IndexedDatabase.
    return indexedDbTarget.AnnotationsAndOffsets(
        MinSequenceLength,
        MaxSequenceLength,
        NumTolerableTermini,
        2,
        Enzyme);
}
// Loads a raw file, enumerates intact sequences from the FASTA database, and for each
// sequence composition (with 0..numNTermCleavages N-terminal cleavages) creates a
// TopDownScorer and prints its score. Reports the number of proteins, the number of
// compositions scored and the elapsed time. Nothing is asserted.
public void TestTopDownScoringForAllXics()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    // Search parameters
    const int numNTermCleavages = 1;
    const int minLength = 7;
    const int maxLength = 1000;
    const int numMaxModsPerProtein = 0;
    var precursorTolerance = new Tolerance(10);

    // Alternative databases used previously:
    //   ..\..\..\TestFiles\sprot.Ecoli.2012_07.icdecoy.KR.fasta
    //   ..\..\..\TestFiles\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta
    //   C:\cygwin\home\kims336\Data\TopDown\ID_003558_56D73071.fasta
    const string dbFilePath = @"..\..\..\TestFiles\sprot.Ecoli.2012_07.fasta";
    const string specFilePath = @"C:\workspace\TopDown\E_coli_iscU_60_mock.raw";

    // Phase 1: time loading of the raw file.
    var sw = System.Diagnostics.Stopwatch.StartNew();
    Console.Write("Reading raw file...");
    var run = InMemoryLcMsRun.GetLcMsRun(specFilePath);
    sw.Stop();
    Console.WriteLine(@"Elapsed Time: {0:f4} sec", sw.Elapsed.TotalSeconds);

    // Configure amino acid set.
    // NOTE(review): "dehydro" is built from Modification.PyroGluQ, which does not match
    // its name - confirm whether a different modification was intended.
    var dehydro = new SearchModification(Modification.PyroGluQ, 'C', SequenceLocation.Everywhere, false);
    var cysteinylC = new SearchModification(Modification.Cysteinyl, 'C', SequenceLocation.Everywhere, false);
    var glutathioneC = new SearchModification(Modification.Glutathione, 'C', SequenceLocation.Everywhere, false);
    var searchModifications = new List<SearchModification>
    {
        dehydro,
        cysteinylC,
        glutathioneC
    };
    var aaSet = new AminoAcidSet(searchModifications, numMaxModsPerProtein);

    var targetDb = new FastaDatabase(dbFilePath);
    var indexedDb = new IndexedDatabase(targetDb);

    TopDownScorer.MaxCharge = 25;
    TopDownScorer.MinCharge = 8;

    // Phase 2: time the scoring.
    var numProteins = 0;
    long totalProtCompositions = 0;
    sw.Restart();
    Console.WriteLine("Generating XICs...");

    foreach (var protein in indexedDb.IntactSequenceAnnotationsAndOffsets(minLength, maxLength))
    {
        ++numProteins;
        if (numProteins % 1000 == 0)
        {
            Console.WriteLine("Processed {0} proteins", numProteins);
        }

        var seqGraph = SequenceGraph.CreateGraph(aaSet, protein.Annotation);
        if (seqGraph == null)
        {
            continue;
        }

        for (var cleavages = 0; cleavages <= numNTermCleavages; cleavages++)
        {
            // The graph is cleaved once more before each pass after the first.
            if (cleavages > 0)
            {
                seqGraph.CleaveNTerm();
            }

            foreach (var protComposition in seqGraph.GetSequenceCompositions())
            {
                totalProtCompositions++;
                var scorer = new TopDownScorer(protComposition, run, precursorTolerance, null);
                Console.WriteLine(scorer.GetScore());
            }
        }
    }

    sw.Stop();
    Console.WriteLine("NumProteins: {0}", numProteins);
    Console.WriteLine("NumProteinCompositions: {0}", totalProtCompositions);
    Console.WriteLine(@"Elapsed Time: {0:f4} sec", sw.Elapsed.TotalSeconds);
}
// Builds an 11-bin histogram of (round(mass * Constants.RescalingConstant) - nominalMass)
// over all no-enzyme sequences of length 300..400, then prints each bin with its count
// and fraction, followed by the elapsed time. Errors outside the histogram range are
// clamped into the first/last bin. Nothing is asserted.
public void TestNominalMassErrors()
{
    const int minLength = 300;
    const int maxLength = 400;

    var sw = new System.Diagnostics.Stopwatch();

    // Alternative databases used previously:
    //   \\proto-2\UnitTest_Files\InformedProteomics_TestFiles\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta
    //   C:\cygwin\home\kims336\Data\TopDownJia\database\TargetProteins.fasta
    const string dbFile = @"C:\cygwin\home\kims336\Data\TopDownJia\database\ID_003962_71E1A1D4.fasta";
    var db = new FastaDatabase(dbFile);
    db.Read();
    var indexedDb = new IndexedDatabase(db);

    // Database loading is excluded from the timing.
    sw.Start();

    var numSequences = 0L;
    var hist = new long[11];
    var aaSet = new AminoAcidSet();

    foreach (var entry in indexedDb.AnnotationsAndOffsetsNoEnzyme(minLength, maxLength))
    {
        ++numSequences;

        // Drop the first two and last two characters of the annotation.
        var annotation = entry.Annotation;
        var sequenceStr = annotation.Substring(2, annotation.Length - 4);

        var sequenceComp = aaSet.GetComposition(sequenceStr);
        var mass = sequenceComp.Mass;
        var nominalMass = sequenceComp.NominalMass;
        var error = (int) Math.Round(mass*Constants.RescalingConstant) - nominalMass;

        // Shift so that a zero error lands in the middle bin, then clamp to the array.
        var errorBin = error + hist.Length/2;
        if (errorBin < 0)
        {
            errorBin = 0;
        }
        else if (errorBin >= hist.Length)
        {
            errorBin = hist.Length - 1;
        }

        hist[errorBin]++;
    }

    Console.WriteLine("NumSequences: {0}", numSequences);
    for (var bin = 0; bin < hist.Length; bin++)
    {
        var count = hist[bin];
        Console.WriteLine("{0}\t{1}\t{2}", bin - hist.Length/2, count, count/(double)numSequences);
    }

    sw.Stop();
    Console.WriteLine(@"Elapsed Time: {0:f4} sec", sw.Elapsed.TotalSeconds);
}
// Times extraction of full precursor-ion chromatograms from a PbfLcMsRun for up to
// 100000 entries returned by AnnotationsAndOffsets(6, 30, 2, 2, Enzyme.Trypsin), using
// the m/z of each peptide's charge-2 ion. Ignored when either input file is missing.
// Nothing is asserted.
public void TestRunningTimeChromGen()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string rafFilePath = @"C:\cygwin\home\kims336\Data\QCShewQE\QC_Shew_13_04_A_17Feb14_Samwise_13-07-28.raf";
    if (!File.Exists(rafFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rafFilePath);
    }

    var rafRun = new PbfLcMsRun(rafFilePath);
    var tolerance = new Tolerance(10);

    const string dbFile = @"D:\Research\Data\CommonContaminants\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta";
    if (!File.Exists(dbFile))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFile);
    }

    var indexedDb = new IndexedDatabase(new FastaDatabase(dbFile));
    var aaSet = new AminoAcidSet(Modification.Carbamidomethylation);

    var sw = System.Diagnostics.Stopwatch.StartNew();

    var numPeptides = 0;
    foreach (var peptide in indexedDb.AnnotationsAndOffsets(6, 30, 2, 2, Enzyme.Trypsin))
    {
        ++numPeptides;

        // Drop the first two and last two characters of the annotation.
        var sequenceText = peptide.Annotation.Substring(2, peptide.Annotation.Length-4);
        var comp = new Sequence(sequenceText, aaSet).Composition + Composition.H2O;
        var mz = new Ion(comp, 2).GetMonoIsotopicMz();

        // The chromatogram itself is discarded; only the extraction cost is measured.
        rafRun.GetFullPrecursorIonExtractedIonChromatogram(mz, tolerance);

        if (numPeptides == 100000)
        {
            break;
        }
    }

    sw.Stop();
    Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
}
// Exploratory identification test. Nothing is asserted.
//
// Phase 1 builds one BitArray per fragment-mass bin (bins between minFragmentMass and
// maxFragmentMass), indexed by scan number: bit [bin][scan] is set when a deconvoluted
// peak of that MS2 scan falls, within tolerance, into that bin.
//
// Phase 2 walks every intact protein, computes all fragment node compositions of its
// sequence graph, adds the BaseIonType.Y offset mass, and counts per scan how many of
// those masses hit a set bit. The best-scoring protein per scan is printed at the end.
public void TestId()
{
    const string rawFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw";
    // Alternative databases used previously (same directory):
    //   ID_002216_235ACCEA.icsfldecoy.fasta, Decoy_SO4280.fasta
    const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\SO2312.fasta";
    const string modFilePath = @"H:\Research\QCShew_TopDown\Production\Mods.txt";

    const int numBits = 29; // max error: 4ppm
    const int minCharge = 2;
    const int maxCharge = 20;
    var tolerance = new Tolerance(10);
    const double corrThreshold = 0.7;

    var comparer = new MzComparerWithBinning(numBits);

    const double minFragmentMass = 200.0;
    const double maxFragmentMass = 50000.0;
    var minFragMassBin = comparer.GetBinNumber(minFragmentMass);
    var maxFragMassBin = comparer.GetBinNumber(maxFragmentMass);

    var aminoAcidSet = new AminoAcidSet(modFilePath);

    var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

    // Only a single hard-coded MS2 scan is examined (4130 was used previously).
    var ms2ScanNumArr = new[] { 5189 };

    var sw = new Stopwatch();
    sw.Start();

    // ---- Phase 1: bin-by-scan occupancy vectors ----
    Console.Write("Building Spectrum Arrays...");
    var massVectors = new BitArray[maxFragMassBin - minFragMassBin + 1];
    for (var i = minFragMassBin; i <= maxFragMassBin; i++)
    {
        massVectors[i - minFragMassBin] = new BitArray(run.MaxLcScan + 1);
    }

    foreach (var ms2ScanNum in ms2ScanNumArr)
    {
        var productSpec = run.GetSpectrum(ms2ScanNum) as ProductSpectrum;
        if (productSpec == null)
        {
            continue;
        }

        productSpec.FilterNoise();
        var deconvolutedPeaks = Deconvoluter.GetDeconvolutedPeaks(productSpec, minCharge, maxCharge, 2, 1.1, tolerance, corrThreshold);
        if (deconvolutedPeaks == null)
        {
            continue;
        }

        foreach (var p in deconvolutedPeaks)
        {
            var mass = p.Mass;
            var deltaMass = tolerance.GetToleranceAsDa(mass, 1);
            var minBinNum = comparer.GetBinNumber(mass - deltaMass);
            var maxBinNum = comparer.GetBinNumber(mass + deltaMass);

            // Mark every bin in the tolerance window that falls inside the tracked range.
            for (var binNum = minBinNum; binNum <= maxBinNum; binNum++)
            {
                if (binNum >= minFragMassBin && binNum <= maxFragMassBin)
                {
                    massVectors[binNum - minFragMassBin][ms2ScanNum] = true;
                }
            }
        }
    }

    sw.Stop();
    Console.WriteLine(@"{0:f4} sec.", sw.Elapsed.TotalSeconds);

    sw.Reset();
    sw.Start();

    // ---- Phase 2: score every intact protein against the vectors ----
    var fastaDb = new FastaDatabase(fastaFilePath);
    fastaDb.Read();
    var indexedDb = new IndexedDatabase(fastaDb);
    var numProteins = 0;
    var intactProteinAnnotationAndOffsets =
        indexedDb.IntactSequenceAnnotationsAndOffsets(0, int.MaxValue);

    var bestProtein = new string[run.MaxLcScan + 1];
    var bestScore = new int[run.MaxLcScan + 1];

    foreach (var annotationAndOffset in intactProteinAnnotationAndOffsets)
    {
        if (++numProteins % 10 == 0)
        {
            // Progress is reported only at positive multiples of 10, so the ordinal suffix
            // is always "th". The former st/nd/rd branches and the "numProteins != 0"
            // guard could never take another path and have been removed.
            Console.WriteLine("Processing {0}{1} proteins...", numProteins, "th");

            // Report and restart the timer for each batch of 10 proteins.
            sw.Stop();
            Console.WriteLine("Elapsed Time: {0:f4} sec", sw.Elapsed.TotalSeconds);
            sw.Reset();
            sw.Start();
        }

        var annotation = annotationAndOffset.Annotation;
        var offset = annotationAndOffset.Offset;

        // Drop the first two and last two characters of the annotation.
        var protSequence = annotation.Substring(2, annotation.Length - 4);

        // suffix
        var seqGraph = SequenceGraph.CreateGraph(aminoAcidSet, AminoAcid.ProteinNTerm, protSequence,
            AminoAcid.ProteinCTerm);
        if (seqGraph == null)
        {
            continue;
        }

        // The upper bound is 0, so this currently runs exactly once (no N-terminal cleavage).
        for (var numNTermCleavage = 0; numNTermCleavage <= 0; numNTermCleavage++)
        {
            if (numNTermCleavage > 0)
            {
                seqGraph.CleaveNTerm();
            }

            var allCompositions = seqGraph.GetAllFragmentNodeCompositions().ToArray();

            var scoreArr = new int[run.MaxLcScan + 1];
            foreach (var fragComp in allCompositions)
            {
                var suffixMass = fragComp.Mass + BaseIonType.Y.OffsetComposition.Mass;
                var binNum = comparer.GetBinNumber(suffixMass);
                if (binNum < minFragMassBin || binNum > maxFragMassBin)
                {
                    continue;
                }

                var vector = massVectors[binNum - minFragMassBin];
                foreach (var ms2ScanNum in ms2ScanNumArr)
                {
                    if (vector[ms2ScanNum])
                    {
                        ++scoreArr[ms2ScanNum];
                        Console.WriteLine(suffixMass);
                    }
                }
            }

            // Keep the first protein that achieves the highest score for each scan.
            foreach (var ms2ScanNum in ms2ScanNumArr)
            {
                if (scoreArr[ms2ScanNum] > bestScore[ms2ScanNum])
                {
                    bestScore[ms2ScanNum] = scoreArr[ms2ScanNum];
                    var proteinName = fastaDb.GetProteinName(offset);
                    bestProtein[ms2ScanNum] = proteinName + (numNTermCleavage == 1 ? "'" : "");
                }
            }
        }
    }

    Console.WriteLine("ScanNum\tBestProtein\tScore");
    foreach (var ms2ScanNum in ms2ScanNumArr)
    {
        Console.WriteLine("{0}\t{1}\t{2}", ms2ScanNum, bestProtein[ms2ScanNum] ?? "", bestScore[ms2ScanNum]);
    }
}
// Prints every annotation/offset pair returned by
// SequenceAnnotationsAndOffsetsWithNtermOrCtermCleavageNoLargerThan(100, 300, 1, 0) for
// Short.fasta, then the total count and the elapsed time. Ignored when the file is
// missing. Nothing is asserted.
public void TestCountingPeptides()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    var sw = System.Diagnostics.Stopwatch.StartNew();

    // Alternative databases used previously:
    //   C:\cygwin\home\kims336\Data\QCShew\ID_003456_9B916A8B.fasta
    //   H:\Research\DDAPlus\database\Yeast_SGD_withContam.fasta
    //   H:\Research\CPTAC_Phospho\database\ID_004208_295531A4.fasta
    const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\Short.fasta";
    if (!File.Exists(dbFile))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFile);
    }

    var indexedDb = new IndexedDatabase(new FastaDatabase(dbFile));
    var peptides = indexedDb.SequenceAnnotationsAndOffsetsWithNtermOrCtermCleavageNoLargerThan(100, 300, 1, 0);

    var numPeptides = 0;
    foreach (var entry in peptides)
    {
        Console.WriteLine("{0}\t{1}",entry.Annotation, entry.Offset);
        numPeptides++;
    }

    Console.WriteLine("NumPeptides: {0}", numPeptides);
    sw.Stop();

    Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
}
// For each entry returned by AnnotationsAndOffsets(6, 20, 2, 0, Enzyme.Trypsin) on
// Short.fasta, prints a tab-separated row: annotation, offset, protein name, protein
// length, and GetOneBasedPositionInProtein(offset) + 1. Ignored when the file is missing.
// Nothing is asserted.
public void TestGettingProteinLengthAndPosition()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\Short.fasta";
    if (!File.Exists(dbFile))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFile);
    }

    var db = new FastaDatabase(dbFile);
    db.Read();
    var indexedDb = new IndexedDatabase(db);

    foreach (var entry in indexedDb.AnnotationsAndOffsets(6, 20, 2, 0, Enzyme.Trypsin))
    {
        var annotation = entry.Annotation;
        var offset = entry.Offset;

        var proteinName = db.GetProteinName(offset);
        var proteinLength = db.GetProteinLength(db.GetProteinName(offset));
        // NOTE(review): 1 is added to a value that is already named "one-based";
        // presumably this skips the flanking residue at the offset - confirm.
        var position = db.GetOneBasedPositionInProtein(offset)+1;

        Console.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}",
            annotation,
            offset,
            proteinName,
            proteinLength,
            position);
    }
}
// Counts and prints the annotations obtainable from Short.fasta by N-terminal cleavage
// of two enumerations, split into three tallies:
//   both       - from IntactSequenceAnnotationsAndOffsets, N-term cleavages <= maximum
//   cTermOnly  - from IntactSequenceAnnotationsAndOffsets, N-term cleavages >  maximum
//   nTermOnly  - from IntactSequenceAnnotationsAndOffsetsWithCTermCleavagesLargerThan,
//                N-term cleavages <= maximum
// Only cleaved lengths within [minSequenceLength, maxSequenceLength] are counted.
// Ignored when the file is missing. Nothing is asserted.
public void TestCountingProteoformsCloseToNTermOrCTerm()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const int minSequenceLength = 21;
    const int maxSequenceLength = 300;
    const int maxNumNTermCleavages = 1;
    const int maxNumCTermCleavages = 0;

    var sw = System.Diagnostics.Stopwatch.StartNew();

    // Alternative database used previously:
    //   C:\cygwin\home\kims336\Data\TopDownQCShew\database\ID_002216_235ACCEA.fasta
    const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\Short.fasta";
    if (!File.Exists(dbFile))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFile);
    }

    var indexedDb = new IndexedDatabase(new FastaDatabase(dbFile));

    // Annotation text after removing `cleaved` residues from the N-terminus: the
    // character at index 1 + cleaved, a dot, then the substring from index 2 + cleaved.
    // With nothing cleaved the annotation is used as-is.
    Func<string, int, string> cleavedAnnotation = (fullAnnotation, cleaved) =>
        cleaved == 0
            ? fullAnnotation
            : string.Format("{0}.{1}", fullAnnotation[1 + cleaved], fullAnnotation.Substring(2 + cleaved));

    var both = 0L;
    var nTermOnly = 0L;
    var cTermOnly = 0L;

    // numCTermCleavages <= maxNumCTermCleavages
    foreach (var entry in indexedDb.IntactSequenceAnnotationsAndOffsets(minSequenceLength, int.MaxValue, maxNumCTermCleavages))
    {
        var annotation = entry.Annotation;
        var length = annotation.Length - 4;

        // Keep cleaving while the remainder is still at least the minimum length.
        for (var numNTermCleavage = 0; length - numNTermCleavage >= minSequenceLength; ++numNTermCleavage)
        {
            var cleavedLength = length - numNTermCleavage;
            if (cleavedLength > maxSequenceLength)
            {
                continue;
            }

            if (numNTermCleavage <= maxNumNTermCleavages)
            {
                ++both;
            }
            else
            {
                ++cTermOnly;
            }

            Console.WriteLine(cleavedAnnotation(annotation, numNTermCleavage));
        }
    }

    // numCTermCleavages > maxNumCTermCleavages
    foreach (var entry in indexedDb.IntactSequenceAnnotationsAndOffsetsWithCTermCleavagesLargerThan(minSequenceLength, int.MaxValue, maxNumCTermCleavages))
    {
        var annotation = entry.Annotation;
        var length = annotation.Length - 4;

        for (var numNTermCleavage = 0; numNTermCleavage <= maxNumNTermCleavages; numNTermCleavage++)
        {
            var cleavedLength = length - numNTermCleavage;
            if (cleavedLength < minSequenceLength || cleavedLength > maxSequenceLength)
            {
                continue;
            }

            ++nTermOnly;
            Console.WriteLine(cleavedAnnotation(annotation, numNTermCleavage));
        }
    }

    Console.WriteLine("Both: {0}", both);
    Console.WriteLine("N-term only: {0}", nTermOnly);
    Console.WriteLine("C-term only: {0}", cTermOnly);
    Console.WriteLine("All: {0}", both + nTermOnly + cTermOnly);

    sw.Stop();
    Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
}
// Counts the entries produced by IntactSequenceAnnotationsAndOffsets(21, 300, 0) for the
// given FASTA file and checks the count against the expected value. Per-phase timings
// are printed along the way.
//
// Parameters:
//   size     - nominal database size in MB; informational only, not used by the test.
//   dbFile   - path to the FASTA file.
//   expected - expected number of enumerated entries.
[TestCase(6, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004530_B63BD900.fasta", 8898)]  // 6MB
//[TestCase(15, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004208_295531A4.fasta", 6334)]  // 15MB
public void TestSequenceEnumerationIntact(double size, string dbFile, int expected)
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName, dbFile);

    var sw = System.Diagnostics.Stopwatch.StartNew();

    const int numCTermCleavages = 0;
    var indexedDb = new IndexedDatabase(new FastaDatabase(dbFile));

    var timeDB = sw.Elapsed;
    Console.WriteLine("Read DB in " + timeDB.TotalSeconds + " Seconds");

    var estimatedAnnOff = indexedDb.EstimateTotalPeptides(2, 21, 300, 1, numCTermCleavages);
    var timeEstimate = sw.Elapsed;
    Console.WriteLine("Read Estimate in " + (timeEstimate - timeDB).TotalSeconds + " Seconds");
    Console.WriteLine("Estimated results: " + estimatedAnnOff);

    var annotationsAndOffsets = indexedDb.IntactSequenceAnnotationsAndOffsets(21, 300, numCTermCleavages);
    var timeGetAnn = sw.Elapsed;
    Console.WriteLine("Read Annotations in " + (timeGetAnn - timeEstimate).TotalSeconds + " Seconds");

    long numSequences = annotationsAndOffsets.Count();

    // The console label still says "Parallel ForEach"; the phase timed is the count above.
    var timeParForEach = sw.Elapsed;
    Console.WriteLine("Parallel ForEach in " + (timeParForEach - timeGetAnn).TotalSeconds + " Seconds");

    Console.WriteLine("NumPeptides: {0}", numSequences);
    sw.Stop();

    Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
    Assert.AreEqual(expected, numSequences);
}
/// <summary>
/// Indexes <paramref name="database"/>, reports the estimated number of candidates, and
/// returns the annotation/offset enumeration that corresponds to <c>SearchMode</c>.
/// </summary>
/// <param name="database">FASTA database to index and read.</param>
/// <param name="estimatedProteins">
/// Receives the result of <c>EstimateTotalPeptides</c> for the current search settings.
/// </param>
/// <param name="cancellationToken">
/// Optional token; it is only forwarded in the <c>MultipleInternalCleavages</c> mode.
/// </param>
/// <returns>
/// The parallel no-enzyme enumeration for <c>MultipleInternalCleavages</c>, the intact
/// sequence enumeration for <c>NoInternalCleavage</c>, and otherwise the enumeration
/// limited by the N-terminal and C-terminal cleavage maxima.
/// </returns>
private IEnumerable<AnnotationAndOffset> GetAnnotationsAndOffsets(FastaDatabase database, out long estimatedProteins, CancellationToken? cancellationToken = null)
{
    var indexedDb = new IndexedDatabase(database);
    indexedDb.Read();

    estimatedProteins = indexedDb.EstimateTotalPeptides(
        SearchMode,
        MinSequenceLength,
        MaxSequenceLength,
        MaxNumNTermCleavages,
        MaxNumCTermCleavages);

    if (SearchMode == InternalCleavageType.MultipleInternalCleavages)
    {
        return indexedDb.AnnotationsAndOffsetsNoEnzymeParallel(
            MinSequenceLength,
            MaxSequenceLength,
            MaxNumThreads,
            cancellationToken);
    }

    if (SearchMode == InternalCleavageType.NoInternalCleavage)
    {
        return indexedDb.IntactSequenceAnnotationsAndOffsets(
            MinSequenceLength,
            MaxSequenceLength,
            MaxNumCTermCleavages);
    }

    return indexedDb.SequenceAnnotationsAndOffsetsWithNtermOrCtermCleavageNoLargerThan(
        MinSequenceLength,
        MaxSequenceLength,
        MaxNumNTermCleavages,
        MaxNumCTermCleavages);
}
// Counts the entries produced by the serial AnnotationsAndOffsetsNoEnzyme(30, 250) for
// the given FASTA file and checks the count against the expected value. Per-phase
// timings (index read, estimate, obtaining the enumerable, counting) are printed.
//
// Parameters:
//   size     - nominal database size in MB; informational only, not used by the test.
//   dbFile   - path to the FASTA file.
//   expected - expected number of enumerated entries.
[TestCase(3, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_005133_8491EFA2.fasta", 323719193)]  // 3MB
//[TestCase(6, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004530_B63BD900.fasta", 595227563)]  // 6MB
//[TestCase(15, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004208_295531A4.fasta", 1882434687)]  // 15MB
public void TestSequenceEnumerationSerial(double size, string dbFile, int expected)
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName, dbFile);

    var sw = new System.Diagnostics.Stopwatch();
    sw.Start();

    var db = new FastaDatabase(dbFile);
    var indexedDb = new IndexedDatabase(db);
    indexedDb.Read();

    var timeDB = sw.Elapsed;
    Console.WriteLine("Read DB in " + timeDB.TotalSeconds + " Seconds");

    var estimatedAnnOff = indexedDb.EstimateTotalPeptides(0, 30, 250);
    var timeEstimate = sw.Elapsed;
    Console.WriteLine("Read Estimate in " + (timeEstimate - timeDB).TotalSeconds + " Seconds");
    Console.WriteLine("Estimated results: " + estimatedAnnOff);

    var annotationsAndOffsets = indexedDb.AnnotationsAndOffsetsNoEnzyme(30, 250);
    var timeGetAnn = sw.Elapsed;
    Console.WriteLine("Read Annotations in " + (timeGetAnn - timeEstimate).TotalSeconds + " Seconds");

    // LongCount() rather than Count(): the counter is a long, and Count() throws
    // OverflowException past int.MaxValue instead of reporting the real number.
    // The larger databases above already come close to that limit.
    long numSequences = annotationsAndOffsets.LongCount();

    // The console label still says "Parallel ForEach"; the phase timed is the count above.
    var timeParForEach = sw.Elapsed;
    Console.WriteLine("Parallel ForEach in " + (timeParForEach - timeGetAnn).TotalSeconds + " Seconds");

    Console.WriteLine("NumPeptides: {0}", numSequences);
    sw.Stop();

    Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
    Assert.AreEqual(expected, numSequences);
}
// Writes a tab-separated target list (peptide, formula, protein name) for every entry
// returned by AnnotationsAndOffsets(6, 30, 1, 1, Enzyme.Trypsin) on the spiked-in peptide
// database, followed by the number of targets. Ignored when the file is missing.
// Nothing is asserted.
public void CreateTargetList()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    TestUtils.ShowStarting(methodName);

    const string databaseFilePath = @"D:\Research\Data\IPRG2014\database\SpikedInPeptides.fasta";
    if (!File.Exists(databaseFilePath))
    {
        Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, databaseFilePath);
    }

    var database = new FastaDatabase(databaseFilePath);
    database.Read();
    var indexedDatabase = new IndexedDatabase(database);

    var aaSet = new AminoAcidSet(Modification.Carbamidomethylation);
    var numTargets = 0;

    Console.WriteLine("Peptide\tFormula\tProtein");
    foreach (var entry in indexedDatabase.AnnotationsAndOffsets(6, 30, 1, 1, Enzyme.Trypsin))
    {
        // Drop the first two and last two characters of the annotation.
        var annotation = entry.Annotation;
        var peptide = annotation.Substring(2, annotation.Length - 4);
        var offset = entry.Offset;

        // Formula of the peptide composition plus one water.
        var formula = (aaSet.GetComposition(peptide) + Composition.H2O).ToPlainString();
        var proteinName = database.GetProteinName(offset);

        Console.WriteLine("{0}\t{1}\t{2}", peptide, formula, proteinName);
        numTargets++;
    }

    Console.WriteLine("NumTargets: {0}", numTargets);
}