示例#1
0
        [TestCase(6, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004530_B63BD900.fasta", 8898)]  // 6MB
        //[TestCase(15, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004208_295531A4.fasta", 6334)]  // 15MB
        public void TestSequenceEnumerationIntact(double size, string dbFile, int expected)
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName, dbFile);

            var sw = new System.Diagnostics.Stopwatch();
            sw.Start();

            const int numCTermCleavages = 0;
            var db = new FastaDatabase(dbFile);
            var indexedDb = new IndexedDatabase(db);
            var numSequences = 0L;
            var timeDB = sw.Elapsed;
            Console.WriteLine("Read DB in " + timeDB.TotalSeconds + " Seconds");
            var estimatedAnnOff = indexedDb.EstimateTotalPeptides(2, 21, 300, 1, numCTermCleavages);
            var timeEstimate = sw.Elapsed;
            Console.WriteLine("Read Estimate in " + (timeEstimate - timeDB).TotalSeconds + " Seconds");
            Console.WriteLine("Estimated results: " + estimatedAnnOff);
            var annotationsAndOffsets = indexedDb.IntactSequenceAnnotationsAndOffsets(21, 300, numCTermCleavages);
            var timeGetAnn = sw.Elapsed;
            Console.WriteLine("Read Annotations in " + (timeGetAnn - timeEstimate).TotalSeconds + " Seconds");
            /*/Parallel.ForEach(
                annotationsAndOffsets,
                //                new ParallelOptions { MaxDegreeOfParallelism = 2},
                annotationAndOffset =>
                {
                    Interlocked.Increment(ref numSequences);
                    //++numSequences;
                }
                );/**/
            //using (var ofstream = new FileStream(Path.Combine(@"F:\InformedProteomicsTestFiles", Path.GetFileNameWithoutExtension(dbFile) + "_par.txt"), FileMode.Create))
            //using (var fout = new StreamWriter(ofstream))
            //{
            //    foreach (var annOff in annotationsAndOffsets)
            //    {
            //        numSequences++;
            //        fout.WriteLine(annOff.Annotation);
            //    }
            //}
            numSequences = annotationsAndOffsets.Count();
            var timeParForEach = sw.Elapsed;
            Console.WriteLine("Parallel ForEach in " + (timeParForEach - timeGetAnn).TotalSeconds + " Seconds");

            Console.WriteLine("NumPeptides: {0}", numSequences);
            sw.Stop();

            Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
            //Assert.AreEqual(188961836, numSequences);
            Assert.AreEqual(expected, numSequences);
        }
示例#2
0
        public void TestTopDownScoringForAllXics()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            // Search parameters
            const int numNTermCleavages = 1;  // 30
            const int minLength = 7;
            const int maxLength = 1000;
            //const int minCharge = 5; // 3
            //const int maxCharge = 15; // 67
            const int numMaxModsPerProtein = 0; // 6
            var precursorTolerance = new Tolerance(10);
            const string dbFilePath = @"..\..\..\TestFiles\sprot.Ecoli.2012_07.fasta";
            //const string dbFilePath = @"..\..\..\TestFiles\sprot.Ecoli.2012_07.icdecoy.KR.fasta";
            
            //const string dbFilePath = @"..\..\..\TestFiles\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta";
            // const string dbFilePath =
            //    @"C:\cygwin\home\kims336\Data\TopDown\ID_003558_56D73071.fasta";

            var sw = new System.Diagnostics.Stopwatch();

            sw.Start();
            Console.Write("Reading raw file...");
            const string specFilePath = @"C:\workspace\TopDown\E_coli_iscU_60_mock.raw";
            var run = InMemoryLcMsRun.GetLcMsRun(specFilePath);

            sw.Stop();

            Console.WriteLine(@"Elapsed Time: {0:f4} sec", sw.Elapsed.TotalSeconds);

            // Configure amino acid set
            //            var pyroGluQ = new SearchModification(Modification.PyroGluQ, 'Q', SequenceLocation.ProteinNTerm, false);
            var dehydro = new SearchModification(Modification.PyroGluQ, 'C', SequenceLocation.Everywhere, false);
            var cysteinylC = new SearchModification(Modification.Cysteinyl, 'C', SequenceLocation.Everywhere, false);
            var glutathioneC = new SearchModification(Modification.Glutathione, 'C', SequenceLocation.Everywhere, false);
            //            var oxM = new SearchModification(Modification.Oxidation, 'M', SequenceLocation.Everywhere, false);

            var searchModifications = new List<SearchModification>
                {
                    //pyroGluQ,
                    dehydro,
                    cysteinylC,
                    glutathioneC,
                    //oxM
                };
            var aaSet = new AminoAcidSet(searchModifications, numMaxModsPerProtein);

            var targetDb = new FastaDatabase(dbFilePath);
         //   targetDb.CreateDecoyDatabase(Enzyme.Trypsin);
         //   System.Environment.Exit(1);
            var indexedDb = new IndexedDatabase(targetDb);

            var numProteins = 0;
            long totalProtCompositions = 0;
            //long numXics = 0;
            TopDownScorer.MaxCharge = 25;
            TopDownScorer.MinCharge = 8;

            sw.Reset();
            sw.Start();
            Console.WriteLine("Generating XICs...");

            foreach (var protAnnotationAndOffset in indexedDb.IntactSequenceAnnotationsAndOffsets(minLength, maxLength))
            {

                ++numProteins;
                //if (numProteins > 2000) break;

                if (numProteins % 1000 == 0)
                {
                    Console.WriteLine("Processed {0} proteins", numProteins);
                }

                //Console.WriteLine(protAnnotation);


                var seqGraph = SequenceGraph.CreateGraph(aaSet, protAnnotationAndOffset.Annotation);
                
                //Console.WriteLine(seqGraph.GetSequenceCompositions()[0]);
                
                if (seqGraph == null) continue;

                for (var nTermCleavages = 0; nTermCleavages <= numNTermCleavages; nTermCleavages++)
                {
                    if(nTermCleavages > 0) seqGraph.CleaveNTerm();
                    var protCompositions = seqGraph.GetSequenceCompositions();
                    foreach (var protComposition in protCompositions)
                    {
                        totalProtCompositions++;
                       // Console.WriteLine(protComposition);
                        var scorer = new TopDownScorer(protComposition, run, precursorTolerance, null);
                        var score = scorer.GetScore();

                        Console.WriteLine(score);


                    }
                }
            }

            sw.Stop();
            Console.WriteLine("NumProteins: {0}", numProteins);
            Console.WriteLine("NumProteinCompositions: {0}", totalProtCompositions);

            Console.WriteLine(@"Elapsed Time: {0:f4} sec", sw.Elapsed.TotalSeconds);
        }
示例#3
0
        public void TestId()
        {
            const string rawFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw";
//            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.icsfldecoy.fasta";
            //const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\Decoy_SO4280.fasta";
            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\SO2312.fasta";
            const string modFilePath = @"H:\Research\QCShew_TopDown\Production\Mods.txt";
            const int numBits = 29; // max error: 4ppm
            const int minCharge = 2;
            const int maxCharge = 20;
            var tolerance = new Tolerance(10);
            const double corrThreshold = 0.7;

            var comparer = new MzComparerWithBinning(numBits);
            const double minFragmentMass = 200.0;
            const double maxFragmentMass = 50000.0;
            var minFragMassBin = comparer.GetBinNumber(minFragmentMass);
            var maxFragMassBin = comparer.GetBinNumber(maxFragmentMass);

            var aminoAcidSet = new AminoAcidSet(modFilePath);

            var run = PbfLcMsRun.GetLcMsRun(rawFilePath);
//            var ms2ScanNumArr = run.GetScanNumbers(2).ToArray();
            //var ms2ScanNumArr = new[] {4130};
            var ms2ScanNumArr = new[] { 5189 };

            var sw = new Stopwatch();

            sw.Start();
            Console.Write("Building Spectrum Arrays...");
            var massVectors = new BitArray[maxFragMassBin - minFragMassBin + 1];
            for (var i = minFragMassBin; i <= maxFragMassBin; i++)
            {
                massVectors[i - minFragMassBin] = new BitArray(run.MaxLcScan + 1);
            }

            foreach (var ms2ScanNum in ms2ScanNumArr)
            {
                var productSpec = run.GetSpectrum(ms2ScanNum) as ProductSpectrum;
                if (productSpec == null) continue;

                productSpec.FilterNoise();
                var deconvolutedPeaks = Deconvoluter.GetDeconvolutedPeaks(productSpec, minCharge, maxCharge, 2, 1.1,
                    tolerance, corrThreshold);

                if (deconvolutedPeaks == null) continue;

                foreach (var p in deconvolutedPeaks)
                {
                    var mass = p.Mass;
                    var deltaMass = tolerance.GetToleranceAsDa(mass, 1);
                    var minMass = mass - deltaMass;
                    var maxMass = mass + deltaMass;

                    var minBinNum = comparer.GetBinNumber(minMass);
                    var maxBinNum = comparer.GetBinNumber(maxMass);
                    for (var binNum = minBinNum; binNum <= maxBinNum; binNum++)
                    {
                        if (binNum >= minFragMassBin && binNum <= maxFragMassBin) massVectors[binNum - minFragMassBin][ms2ScanNum] = true;
                    }
                }
            }
            sw.Stop();
            Console.WriteLine(@"{0:f4} sec.", sw.Elapsed.TotalSeconds);

            sw.Reset();
            sw.Start();
            var fastaDb = new FastaDatabase(fastaFilePath);
            fastaDb.Read();
            var indexedDb = new IndexedDatabase(fastaDb);
            var numProteins = 0;
            var intactProteinAnnotationAndOffsets =
                indexedDb.IntactSequenceAnnotationsAndOffsets(0, int.MaxValue);

            var bestProtein = new string[run.MaxLcScan + 1];
            var bestScore = new int[run.MaxLcScan + 1];
            foreach (var annotationAndOffset in intactProteinAnnotationAndOffsets)
            {
                if (++numProteins % 10 == 0)
                {
                    Console.WriteLine("Processing {0}{1} proteins...", numProteins,
                        numProteins == 1 ? "st" : numProteins == 2 ? "nd" : numProteins == 3 ? "rd" : "th");
                    if (numProteins != 0)
                    {
                        sw.Stop();

                        Console.WriteLine("Elapsed Time: {0:f4} sec", sw.Elapsed.TotalSeconds);
                        sw.Reset();
                        sw.Start();
                    }
                }
                var annotation = annotationAndOffset.Annotation;
                var offset = annotationAndOffset.Offset;

                var protSequence = annotation.Substring(2, annotation.Length - 4);

                // suffix
                var seqGraph = SequenceGraph.CreateGraph(aminoAcidSet, AminoAcid.ProteinNTerm, protSequence,
                    AminoAcid.ProteinCTerm);
                if (seqGraph == null) continue;

                for (var numNTermCleavage = 0; numNTermCleavage <= 0; numNTermCleavage++)
                {
                    if (numNTermCleavage > 0) seqGraph.CleaveNTerm();
                    var allCompositions = seqGraph.GetAllFragmentNodeCompositions().ToArray();

                    var scoreArr = new int[run.MaxLcScan + 1];
                    foreach (var fragComp in allCompositions)
                    {
                        var suffixMass = fragComp.Mass + BaseIonType.Y.OffsetComposition.Mass;
                        var binNum = comparer.GetBinNumber(suffixMass);
                        if (binNum < minFragMassBin || binNum > maxFragMassBin) continue;

                        var vector = massVectors[binNum - minFragMassBin];
                        foreach (var ms2ScanNum in ms2ScanNumArr)
                        {
                            if (vector[ms2ScanNum])
                            {
                                ++scoreArr[ms2ScanNum];
                                Console.WriteLine(suffixMass);
                            }
                        }
                    }
                    foreach (var ms2ScanNum in ms2ScanNumArr)
                    {
                        if (scoreArr[ms2ScanNum] > bestScore[ms2ScanNum])
                        {
                            bestScore[ms2ScanNum] = scoreArr[ms2ScanNum];
                            var proteinName = fastaDb.GetProteinName(offset);
                            bestProtein[ms2ScanNum] = proteinName + (numNTermCleavage == 1 ? "'" : "");
                        }
                    }
                }
                //// prefix
                //var seqGraphPrefix = SequenceGraph.CreateGraph(aminoAcidSet, AminoAcid.ProteinNTerm, protSequence,
                //    AminoAcid.ProteinCTerm);
                //if (seqGraphPrefix == null) continue;

                //{
                //    if (numNTermCleavage > 0) seqGraph.CleaveNTerm();
                //    var allCompositions = seqGraph.GetAllFragmentNodeCompositions();

                //    var scoreArr = new int[run.MaxLcScan + 1];
                //    foreach (var fragComp in allCompositions)
                //    {
                //        var suffixMass = fragComp.Mass + BaseIonType.Y.OffsetComposition.Mass;
                //        var binNum = comparer.GetBinNumber(suffixMass);
                //        if (binNum < minFragMassBin || binNum > maxFragMassBin) continue;

                //        var vector = massVectors[binNum - minFragMassBin];
                //        foreach (var ms2ScanNum in ms2ScanNumArr)
                //        {
                //            if (vector[ms2ScanNum]) ++scoreArr[ms2ScanNum];
                //        }
                //    }
                //    foreach (var ms2ScanNum in ms2ScanNumArr)
                //    {
                //        if (scoreArr[ms2ScanNum] > bestScore[ms2ScanNum])
                //        {
                //            bestScore[ms2ScanNum] = scoreArr[ms2ScanNum];
                //            var proteinName = fastaDb.GetProteinName(offset);
                //            bestProtein[ms2ScanNum] = proteinName + (numNTermCleavage == 1 ? "'" : "");
                //        }
                //    }
                //}
            }

            Console.WriteLine("ScanNum\tBestProtein\tScore");
            foreach (var ms2ScanNum in ms2ScanNumArr)
            {
                Console.WriteLine("{0}\t{1}\t{2}", ms2ScanNum, bestProtein[ms2ScanNum] ?? "", bestScore[ms2ScanNum]);
            }
            //sw.Stop();
            //Console.WriteLine(@"Scoring: {0:f4} sec.", sw.Elapsed.TotalSeconds);
        }
示例#4
0
        private IEnumerable<AnnotationAndOffset> GetAnnotationsAndOffsets(FastaDatabase database, out long estimatedProteins, CancellationToken? cancellationToken = null)
        {
            var indexedDb = new IndexedDatabase(database);
            indexedDb.Read();
            estimatedProteins = indexedDb.EstimateTotalPeptides(SearchMode, MinSequenceLength, MaxSequenceLength, MaxNumNTermCleavages, MaxNumCTermCleavages);
            IEnumerable<AnnotationAndOffset> annotationsAndOffsets;
            if (SearchMode == InternalCleavageType.MultipleInternalCleavages)
            {
                //annotationsAndOffsets = indexedDb.AnnotationsAndOffsetsNoEnzyme(MinSequenceLength, MaxSequenceLength);
                annotationsAndOffsets = indexedDb.AnnotationsAndOffsetsNoEnzymeParallel(MinSequenceLength, MaxSequenceLength, MaxNumThreads, cancellationToken);
            }
            else if (SearchMode == InternalCleavageType.NoInternalCleavage)
            {
                annotationsAndOffsets = indexedDb.IntactSequenceAnnotationsAndOffsets(MinSequenceLength, MaxSequenceLength, MaxNumCTermCleavages);
            }
            else
            {
                annotationsAndOffsets = indexedDb
                    .SequenceAnnotationsAndOffsetsWithNtermOrCtermCleavageNoLargerThan(
                        MinSequenceLength, MaxSequenceLength, MaxNumNTermCleavages, MaxNumCTermCleavages);
            }

            return annotationsAndOffsets;
        }
示例#5
0
        public void TestCountingProteoformsCloseToNTermOrCTerm()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const int minSequenceLength = 21;   // 21
            const int maxSequenceLength = 300;  // 300
            const int maxNumNTermCleavages = 1;
            const int maxNumCTermCleavages = 0;

            var sw = new System.Diagnostics.Stopwatch();
            sw.Start();

            //const string dbFile = @"C:\cygwin\home\kims336\Data\TopDownQCShew\database\ID_002216_235ACCEA.fasta";
            const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\Short.fasta";
            if (!File.Exists(dbFile))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFile);
            }

            var db = new FastaDatabase(dbFile);
            var indexedDb = new IndexedDatabase(db);

            var both = 0L;
            var nTermOnly = 0L;
            var cTermOnly = 0L;
            
            foreach (
                var annotationAndOffset in
                    indexedDb.IntactSequenceAnnotationsAndOffsets(minSequenceLength, int.MaxValue,
                        maxNumCTermCleavages))
            {
                // numCTermCleavages <= maxNumCTermCleavages
                var annotation = annotationAndOffset.Annotation;
                var length = (annotation.Length - 4);
                var numNTermCleavage = 0;
                int cleavedLength;
                while ((cleavedLength = length - numNTermCleavage) >= minSequenceLength)
                {
                    if (cleavedLength <= maxSequenceLength)
                    {
                        if (numNTermCleavage <= maxNumNTermCleavages)
                        {
                            ++both;
                        }
                        else
                        {
                            ++cTermOnly;
                        }
                        var anno = numNTermCleavage == 0 
                            ? annotation 
                            : string.Format("{0}.{1}", annotation[1 + numNTermCleavage], annotation.Substring(2 + numNTermCleavage));
                        Console.WriteLine(anno);
                    }
                    ++numNTermCleavage;
                }
            }

            foreach (
                var annotationAndOffset in
                    indexedDb.IntactSequenceAnnotationsAndOffsetsWithCTermCleavagesLargerThan(minSequenceLength, int.MaxValue,
                        maxNumCTermCleavages))
            {
                // numCTermCleavages > maxNumCTermCleavages
                var annotation = annotationAndOffset.Annotation;
                var length = (annotation.Length - 4);
                for (var numNTermCleavage = 0; numNTermCleavage <= maxNumNTermCleavages; numNTermCleavage++)
                {
                    var cleavedLength = length - numNTermCleavage;
                    if (cleavedLength >= minSequenceLength && cleavedLength <= maxSequenceLength)
                    {
                        ++nTermOnly;
                        var anno = numNTermCleavage == 0
                            ? annotation
                            : string.Format("{0}.{1}", annotation[1 + numNTermCleavage], annotation.Substring(2 + numNTermCleavage));
                        Console.WriteLine(anno);
                    }
                }
            }

            Console.WriteLine("Both: {0}", both);
            Console.WriteLine("N-term only: {0}", nTermOnly);
            Console.WriteLine("C-term only: {0}", cTermOnly);
            Console.WriteLine("All: {0}", both + nTermOnly + cTermOnly);
            sw.Stop();

            Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
        }