public FeatureBasedTagSearchEngine(
     LcMsRun run,
     Ms1FtParser featureParser,
     ProductScorerBasedOnDeconvolutedSpectra ms2Scorer,
     SequenceTagParser tagParser,
     FastaDatabase fastaDb,
     Tolerance tolerance,
     AminoAcidSet aaSet,
     double maxSequenceMass = 50000.0,
     int minProductIonCharge = 1,
     int maxProductIonCharge = 20)
 {
     _run = run;
     _ms2Scorer = ms2Scorer;
     _featureParser = featureParser;
     _ms1FtFilter = new Ms1FtFilter(run, tolerance, featureParser.Ms1FtFileName);
     _tagParser = tagParser;
     _fastaDb = fastaDb;
     _searchableDb = new SearchableDatabase(fastaDb);
     _tolerance = tolerance;
     _aaSet = aaSet;
     _maxSequenceMass = maxSequenceMass;
     _minProductIonCharge = minProductIonCharge;
     _maxProductIonCharge = maxProductIonCharge;
 }
Beispiel #2
0
        public void TestSequenceEnumerationParallel2()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            var sw = new System.Diagnostics.Stopwatch();

            const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_002216_235ACCEA.fasta";
            var db = new FastaDatabase(dbFile);
            db.Read();
            var indexedDb = new IndexedDatabase(db);
            var arr = db.Characters().ToArray();

            sw.Start();
            //var annotationsAndOffsets = indexedDb.AnnotationsAndOffsetsNoEnzyme(7, 30);
            //            var num = annotationsAndOffsets.AsParallel().LongCount(annotationsAndOffset => annotationsAndOffset.Annotation.IndexOf('W') >= 0);
            //var num = annotationsAndOffsets.LongCount(annotationsAndOffset => annotationsAndOffset.Annotation.IndexOf('W') >= 0);
            //var num = arr.AsParallel().Where(c => c == 'W').LongCount();
            var num = 0;
            var sum = 0L;
            //foreach (var c in arr)
            for (var a = 0; a < arr.Length; a++)
            {
                var c = arr[a];
                for (var i = 0; i < c * 10000; i++) sum += i;
                //                Interlocked.Increment(ref num);
                if (++num == 1000) break;
            }

            Console.WriteLine("NumPeptides: {0}", sum);
            sw.Stop();

            Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
        }
Beispiel #3
0
        private static void TestCountingPeptides()
        {
            var aaSet = new AminoAcidSet();

            var sw = new Stopwatch();
            sw.Start();

            //const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_002166_F86E3B2F.fasta";
            const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_003456_9B916A8B.fasta";
            //            const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004208_295531A4.fasta";
            var db = new FastaDatabase(dbFile);
            var indexedDb = new IndexedDatabase(db);
            indexedDb.Read();
            //var numPeptides = indexedDb.AnnotationsAndOffsetsNoEnzyme(7, 150).LongCount();
            var peptides =
                indexedDb.AnnotationsAndOffsets(7, 40, 2, 2, Enzyme.Trypsin);

            Parallel.ForEach(peptides, annotationAndOffset =>
            //foreach(var annotationAndOffset in peptides)
            {
                var annotation = annotationAndOffset.Annotation;
                var offset = annotationAndOffset.Offset;

                var graph = SequenceGraph.CreateGraph(aaSet, annotation);
            }
                )
            ;

//            Console.WriteLine("NumPeptides: {0}", numPeptides);
            sw.Stop();
            var sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
            Console.WriteLine(@"{0:f4} sec", sec);
        }
Beispiel #4
0
        public void TestSearching()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            var sw = new System.Diagnostics.Stopwatch();
            sw.Start();

            const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\Short.fasta";
            if (!File.Exists(dbFile))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFile);
            }

            var db = new FastaDatabase(dbFile);
            var searchableDb = new SearchableDatabase(db);
            //const string pattern = "NSGSHFCGGSLINSQWVVSAAH";
            const string pattern = "FPTDDDDK";
            var position = searchableDb.Search(pattern);
            Assert.True(position >= 0);
            Console.WriteLine("Position: {0}", position);
            Console.WriteLine("Matched indices: {0}", string.Join(",", searchableDb.FindAllMatchedSequenceIndices(pattern)));
            Console.WriteLine("Protein indices: {0}", string.Join(",", searchableDb.FindAllMatchedSequenceIndices(pattern).Select(i => db.GetOneBasedPositionInProtein(i))));
            sw.Stop();
            
            Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
        }
        public ScanBasedTagSearchEngine(
            LcMsRun run,
            ISequenceTagFinder seqTagFinder,
            LcMsPeakMatrix featureFinder,
            FastaDatabase fastaDb,
            Tolerance tolerance,
            AminoAcidSet aaSet,
            CompositeScorerFactory ms2ScorerFactory = null,
            int minMatchedTagLength = DefaultMinMatchedTagLength,
            double maxSequenceMass = 50000.0,
            int minProductIonCharge = 1,
            int maxProductIonCharge = 20)
        {
            _run = run;
            _featureFinder = featureFinder;
            
            _searchableDb = new SearchableDatabase(fastaDb);

            _tolerance = tolerance;
            _aaSet = aaSet;
            _minMatchedTagLength = minMatchedTagLength;
            _maxSequenceMass = maxSequenceMass;
            _minProductIonCharge = minProductIonCharge;
            _maxProductIonCharge = maxProductIonCharge;
            MinScan = int.MinValue;
            MaxScan = int.MaxValue;
            _ms2ScorerFactory = ms2ScorerFactory;
            _seqTagFinder = seqTagFinder;
        }
Beispiel #6
0
        public IndexedDatabase(FastaDatabase fastaDatabase)
        {
            FastaDatabase = fastaDatabase;
            var databaseFilePath = FastaDatabase.GetFastaFilePath();
            //var databaseFilePathNoExt = Path.Combine(Path.GetDirectoryName(databaseFilePath), Path.GetFileNameWithoutExtension(databaseFilePath));
            //_pLcpFilePath = databaseFilePathNoExt + PermutedLongestCommonPrefixFileExtension;
            _pLcpFilePath = Path.ChangeExtension(databaseFilePath, PermutedLongestCommonPrefixFileExtension);
            var lastWriteTimeHash = FastaDatabase.GetLastWriteTimeHash();

            if (!File.Exists(_pLcpFilePath) || !FastaDatabase.CheckHashCodeBinaryFile(_pLcpFilePath, lastWriteTimeHash))
            {
                Console.Write("Generating " + _pLcpFilePath + " ... ");
                CreatePermutedLongestCommonPrefixFile();
                Console.WriteLine("Done");
            }
        }
        public ProteoformSpectrumMatchContainer(FastaDatabase database, int[] ms2ScanVector, int maxModifications, int maxNumMatchesPerSpectrum, int minScore = 4)
        {
            Database = database;
            NumMatchesPerSpectrum = maxNumMatchesPerSpectrum;
            _scoreCutoff = minScore;
            Ms2ScanVector = ms2ScanVector;
            _ms2ScanToIndexMap = new int[ms2ScanVector.Last() + 1];
            for(var i = 0; i < ms2ScanVector.Length; i++)
            {
                var scanNum = ms2ScanVector[i];
                _ms2ScanToIndexMap[scanNum] = i;
            }
            _matchedSet = new SortedSet<DatabaseSequenceSpectrumMatch>[maxModifications + 1][];
            for(var i = 0; i <= maxModifications; i++) _matchedSet[i] = new SortedSet<DatabaseSequenceSpectrumMatch>[ms2ScanVector.Length];

            _checkedOutScanNumbers = new List<int>();
        }
Beispiel #8
0
        /// <summary>
        /// Constructor - build the index
        /// </summary>
        /// <param name="fastaDatabase"></param>
        public IndexedDatabase(FastaDatabase fastaDatabase)
        {
            FastaDatabase = fastaDatabase;
            var databaseFilePath = FastaDatabase.GetFastaFilePath();

            //var databaseFilePathNoExt = Path.Combine(Path.GetDirectoryName(databaseFilePath), Path.GetFileNameWithoutExtension(databaseFilePath));
            //_pLcpFilePath = databaseFilePathNoExt + PermutedLongestCommonPrefixFileExtension;
            _pLcpFilePath = Path.ChangeExtension(databaseFilePath, PermutedLongestCommonPrefixFileExtension);
            var lastWriteTimeHash = FastaDatabase.GetLastWriteTimeHash();

            if (!File.Exists(_pLcpFilePath) || !FastaDatabase.CheckHashCodeBinaryFile(_pLcpFilePath, lastWriteTimeHash))
            {
                Console.Write("Generating " + _pLcpFilePath + " ... ");
                CreatePermutedLongestCommonPrefixFile();
                Console.WriteLine("Done");
            }
        }
        public void TestSearchWithTagGeneration()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const string rawFilePath = @"D:\MassSpecFiles\training\raw\QC_Shew_Intact_26Sep14_Bane_C2Column3.pbf";

            if (!File.Exists(rawFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
            }

            var run = PbfLcMsRun.GetLcMsRun(rawFilePath);
            const string fastaFilePath = @"D:\MSPathFinder\Fasta\ID_002216_235ACCEA.fasta";
            //const string fastaFilePath = @"D:\MassSpecFiles\60k\ID_004973_9BA6912F.fasta";
            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            var fastaDb = new FastaDatabase(fastaFilePath);
            var tolerance = new Tolerance(10);
            var modsFilePath = @"D:\MSPathFinder\Fasta\Mods.txt";

            if (!File.Exists(modsFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, modsFilePath);
            }

            var aaSet = new AminoAcidSet(modsFilePath);

            //TestTagBasedSearch(run, fastaDb, tolerance, aaSet);
            var tagSearchEngine = new ScanBasedTagSearchEngine(run, new SequenceTagGenerator(run, new Tolerance(8)),  new LcMsPeakMatrix(run),  fastaDb, tolerance,aaSet);

            var matchedTags = tagSearchEngine.RunSearch(4672);
            foreach (var match in matchedTags)
            {
                Console.Write(match.Sequence);
                Console.WriteLine("\t{0}\t{1}\t{2}", match.TagMatch.StartIndex, match.TagMatch.EndIndex, match.TagMatch.Mass);
            }


        }
Beispiel #10
0
        public SearchableDatabase(FastaDatabase fastaDatabase)
        {
            FastaDatabase = fastaDatabase;
            _sequence = fastaDatabase.GetSequence();
            _suffixArray = new int[_sequence.Length];
            SAIS.sufsort(_sequence, _suffixArray, _sequence.Length);

            var neighboringLcps = new byte[_suffixArray.Length];
            neighboringLcps[0] = 0;

            for (var i = 1; i < _suffixArray.Length; i++)
            {
                var lcp = IndexedDatabase.GetLcp(_sequence, _suffixArray[i - 1], _suffixArray[i]);
                neighboringLcps[i] = lcp;
            }

            _leftLcps = new byte[_suffixArray.Length];
            _rightLcps = new byte[_suffixArray.Length];

            InitializeLcps(neighboringLcps, _leftLcps, _rightLcps, 0, _suffixArray.Length-1);
        }
Beispiel #11
0
        /// <summary>
        /// Constructor
        /// </summary>
        /// <param name="fastaDatabase"></param>
        public SearchableDatabase(FastaDatabase fastaDatabase)
        {
            FastaDatabase = fastaDatabase;
            _sequence     = fastaDatabase.GetSequence();
            _suffixArray  = new int[_sequence.Length];
            SAIS.sufsort(_sequence, _suffixArray, _sequence.Length);

            var neighboringLcps = new byte[_suffixArray.Length];

            neighboringLcps[0] = 0;

            for (var i = 1; i < _suffixArray.Length; i++)
            {
                var lcp = IndexedDatabase.GetLcp(_sequence, _suffixArray[i - 1], _suffixArray[i]);
                neighboringLcps[i] = lcp;
            }

            _leftLcps  = new byte[_suffixArray.Length];
            _rightLcps = new byte[_suffixArray.Length];

            InitializeLcps(neighboringLcps, _leftLcps, _rightLcps, 0, _suffixArray.Length - 1);
        }
 public FeatureBasedTagSearchEngine(
     LcMsRun run,
     Ms1FtParser featureParser,
     SequenceTagParser tagParser,
     FastaDatabase fastaDb,
     Tolerance tolerance,
     AminoAcidSet aaSet,
     double maxSequenceMass = 50000.0,
     int minProductIonCharge = 1,
     int maxProductIonCharge = 20)
     : this(
         run,
         featureParser,
         null,
         tagParser,
         fastaDb,
         tolerance,
         aaSet,
         maxSequenceMass,
         minProductIonCharge,
         maxProductIonCharge)
 {
 }
Beispiel #13
0
        public void TestSumParallel()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            //var array = Enumerable.Range(0, short.MaxValue).ToArray();
            const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_002216_235ACCEA.fasta";
            var db = new FastaDatabase(dbFile);
            db.Read();
            //var indexedDb = new IndexedDatabase(db);
            //indexedDb.Read();
            //var peptides = indexedDb.AnnotationsAndOffsetsNoEnzyme(7, 30);
            var charArray = db.Characters().Select(c => (int)c);

            // Test methods.
            Console.WriteLine(SumAsParallel(charArray));
            Console.WriteLine(SumDefault(charArray));

            const int m = 100;
            var s1 = Stopwatch.StartNew();
            for (var i = 0; i < m; i++)
            {
                SumDefault(charArray);
            }
            s1.Stop();
            var s2 = Stopwatch.StartNew();
            for (var i = 0; i < m; i++)
            {
                SumAsParallel(charArray);
            }
            s2.Stop();
            Console.WriteLine((s1.Elapsed.TotalMilliseconds * 1000000 /
                m).ToString("0.00 ns"));
            Console.WriteLine((s2.Elapsed.TotalMilliseconds * 1000000 /
                m).ToString("0.00 ns"));
            Console.Read();
        }
Beispiel #14
0
        public void TestEnumeratingProteins()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            var sw = new System.Diagnostics.Stopwatch();
            sw.Start();

            const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\Short.fasta";
            if (!File.Exists(dbFile))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFile);
            }

            var db = new FastaDatabase(dbFile);
            var indexedDb = new IndexedDatabase(db);
            foreach (var annotationAndOffset in indexedDb.IntactSequenceAnnotationsAndOffsetsWithCTermCleavagesLargerThan(100, 300, 3))
            {
                Console.WriteLine(annotationAndOffset.Annotation);
            }
            sw.Stop();
            
            Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
        }
Beispiel #15
0
        [TestCase(3, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_005133_8491EFA2.fasta", 323719193)]  // 3MB
        //[TestCase(6, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004530_B63BD900.fasta", 595227563)]  // 6MB
        //[TestCase(15, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004208_295531A4.fasta", 1882434687)]  // 15MB
        public void TestSequenceEnumeration(double size, string dbFile, int expected)
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName, dbFile);

            var sw = new System.Diagnostics.Stopwatch();
            sw.Start();

            var db = new FastaDatabase(dbFile);
            var indexedDb = new IndexedDatabase(db);
            var numSequences = 0L;
            var timeDB = sw.Elapsed;
            Console.WriteLine("Read DB in " + timeDB.TotalSeconds + " Seconds");
            var estimatedAnnOff = indexedDb.EstimateTotalPeptides(0, 30, 250);
            var timeEstimate = sw.Elapsed;
            Console.WriteLine("Read Estimate in " + (timeEstimate - timeDB).TotalSeconds + " Seconds");
            //int coreCount = 0;
            //foreach (var item in new System.Management.ManagementObjectSearcher("Select NumberOfCores from Win32_Processor").Get())
            //{
            //    coreCount += int.Parse(item["NumberOfCores"].ToString());
            //}
            //Console.WriteLine("Number Of Cores: {0}", coreCount);
            //Console.WriteLine("Processors: " + System.Environment.ProcessorCount);
            Console.WriteLine("Estimated results: " + estimatedAnnOff);
            var annotationsAndOffsets = indexedDb.AnnotationsAndOffsetsNoEnzymeParallel(30, 250);
            var timeGetAnn = sw.Elapsed;
            Console.WriteLine("Read Annotations in " + (timeGetAnn - timeEstimate).TotalSeconds + " Seconds");
            /*/Parallel.ForEach(
                annotationsAndOffsets,
                //                new ParallelOptions { MaxDegreeOfParallelism = 2},
                annotationAndOffset =>
                {
                    Interlocked.Increment(ref numSequences);
                    //++numSequences;
                }
                );/**/
            //annotationsAndOffsets.Select(annotationsAndOffset => annotationsAndOffset.)
            // Below, original: 110, 109(total) seconds
            // Parallelizing AnnotationsAndOffsetsNoEnzyme: 86 seconds
            // Parallelizing AnnotationsAndOffsetsNoEnzyme, yield returns: 79.6, 94, 60, 60 seconds
            //
            // 3MB
            // serial: 
            // Parallel2: 107, 
            //
            // 6MB
            // serial: 
            // Parallel2: 
            //
            // 15MB
            // serial: 
            // Parallel2: 
            //using (var ofstream = new FileStream(Path.Combine(@"F:\InformedProteomicsTestFiles", Path.GetFileNameWithoutExtension(dbFile) + "_par.txt"), FileMode.Create))
            //using (var fout = new StreamWriter(ofstream))
            //{
            //    foreach (var annOff in annotationsAndOffsets)
            //    {
            //        numSequences++;
            //        fout.WriteLine(annOff.Annotation);
            //    }
            //}
            numSequences = annotationsAndOffsets.Count();
            var timeParForEach = sw.Elapsed;
            Console.WriteLine("Parallel ForEach in " + (timeParForEach - timeGetAnn).TotalSeconds + " Seconds");

            Console.WriteLine("NumPeptides: {0}", numSequences);
            sw.Stop();

            Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
            //Assert.AreEqual(188961836, numSequences);
            Assert.AreEqual(expected, numSequences);
        }
Beispiel #16
0
        private void WriteResultsToFile(SortedSet<DatabaseSequenceSpectrumMatch>[] matches, string outputFilePath, FastaDatabase database)
        {
            using (var writer = new StreamWriter(outputFilePath))
            {
                writer.WriteLine("Scan\tPre\tSequence\tPost\tModifications\tComposition\tProteinName\tProteinDesc" +
                             "\tProteinLength\tStart\tEnd\tCharge\tMostAbundantIsotopeMz\tMass\t#MatchedFragments\tIcScore"
                             );
                for (var scanNum = _run.MinLcScan; scanNum <= _run.MaxLcScan; scanNum++)
                {
                    if (matches[scanNum] == null) continue;
                    foreach (var match in matches[scanNum].Reverse())
                    {
                        var sequence = match.Sequence;
                        var offset = match.Offset;
                        var start = database.GetOneBasedPositionInProtein(offset) + 1 + match.NumNTermCleavages;
                        var end = start + sequence.Length - 1;
                        var proteinName = database.GetProteinName(match.Offset);
                        var protLength = database.GetProteinLength(proteinName);
                        var ion = match.Ion;

                        var scores = _bottomUpScorer.GetScores(match, ion.Composition, ion.Charge, scanNum);

                        if (ion == null)
                        {
                            Console.WriteLine(@"Null ion!");
                        }
                        if (scores == null)
                        {
                            Console.WriteLine(@"Null scores");
                        }

                        // Note for DblToString(value, 9, true), by having "9" and "true",
                        // values between 100 and 999 Da will have 7 digits after the decimal place, and
                        // values between 1000 and 9999 will have 6 digits after the decimal place

                        writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}",
                            scanNum,
                            match.Pre,
                            sequence, // Sequence
                            match.Post,
                            scores.Modifications, // Modifications
                            ion.Composition, // Composition
                            proteinName, // ProteinName
                            database.GetProteinDescription(match.Offset), // ProteinDescription
                            protLength, // ProteinLength
                            start, // Start
                            end, // End
                            ion.Charge, // precursorCharge
                            StringUtilities.DblToString(ion.GetMostAbundantIsotopeMz(), 9, true), // MostAbundantIsotopeMz
                            StringUtilities.DblToString(ion.Composition.Mass, 9, true),           // Mass
                            match.Score,
                            scores.Score    // Score (re-scored)
                            );

                    }
                }
            }
        }
Beispiel #17
0
        private IEnumerable<AnnotationAndOffset> GetAnnotationsAndOffsets(FastaDatabase database)
        {
            var indexedDbTarget = new IndexedDatabase(database);
            IEnumerable<AnnotationAndOffset> annotationsAndOffsets;
            if (NumTolerableTermini == 0)
            {
                annotationsAndOffsets = indexedDbTarget.AnnotationsAndOffsetsNoEnzyme(MinSequenceLength, MaxSequenceLength);
            }
            else 
            {
                annotationsAndOffsets = indexedDbTarget.AnnotationsAndOffsets(MinSequenceLength, MaxSequenceLength,
                    NumTolerableTermini, 2, Enzyme);
            }

            return annotationsAndOffsets;
        }
Beispiel #18
0
        public bool RunSearch(double corrThreshold)
        {
            var sw = new Stopwatch();
            ErrorMessage = string.Empty;

            Console.Write(@"Reading raw file...");
            sw.Start();
            _run = InMemoryLcMsRun.GetLcMsRun(SpecFilePath, 1.4826, 1.4826);
            _bottomUpScorer = new InformedBottomUpScorer(_run, AminoAcidSet, MinProductIonCharge, MaxProductIonCharge, ProductIonTolerance);
            sw.Stop();
            var sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
            Console.WriteLine(@"Elapsed Time: {0:f4} sec", sec);

            sw.Reset();
            Console.Write(@"Determining precursor masses...");
            sw.Start();
            var ms1Filter = new Ms1IsotopeAndChargeCorrFilter(_run, PrecursorIonTolerance, MinPrecursorIonCharge, MaxPrecursorIonCharge,
                400, 5000, corrThreshold, 0, 0);
            sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
            Console.WriteLine(@"Elapsed Time: {0:f4} sec", sec);

            sw.Reset();
            Console.Write(@"Deconvoluting MS2 spectra...");
            sw.Start();
            _ms2ScorerFactory = new ProductScorerBasedOnDeconvolutedSpectra(
                _run,
                MinProductIonCharge, MaxProductIonCharge,
                new Tolerance(10),
                0
                );
            _ms2ScorerFactory.DeconvoluteAllProductSpectra();
            sw.Stop();
            sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
            Console.WriteLine(@"Elapsed Time: {0:f4} sec", sec);

            // Target database
            var targetDb = new FastaDatabase(DatabaseFilePath);

            //            string dirName = OutputDir ?? Path.GetDirectoryName(SpecFilePath);

            var baseName = Path.GetFileNameWithoutExtension(SpecFilePath);
            var targetOutputFilePath = Path.Combine(OutputDir, baseName + TargetFileExtension);
            var decoyOutputFilePath = Path.Combine(OutputDir, baseName + DecoyFileExtension);
            var tdaOutputFilePath = Path.Combine(OutputDir, baseName + TdaFileExtension);

            if (RunTargetDecoyAnalysis.HasFlag(DatabaseSearchMode.Target))
            {
                sw.Reset();
                Console.Write(@"Reading the target database...");
                sw.Start();
                targetDb.Read();
                sw.Stop();
                sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
                Console.WriteLine(@"Elapsed Time: {0:f4} sec", sec);

                sw.Reset();
                Console.WriteLine(@"Searching the target database");
                sw.Start();
                var targetMatches = RunSearch(GetAnnotationsAndOffsets(targetDb), ms1Filter, false);
                sw.Stop();
                sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
                Console.WriteLine(@"Target database search elapsed time: {0:f4} sec", sec);

                sw.Reset();
                Console.Write(@"Rescoring and writing target results...");
                sw.Start();
                WriteResultsToFile(targetMatches, targetOutputFilePath, targetDb);
                sw.Stop();
                sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
                Console.WriteLine(@"Elapsed time: {0:f4} sec", sec);
            }

            if (RunTargetDecoyAnalysis.HasFlag(DatabaseSearchMode.Decoy))
            {
                // Decoy database
                sw.Reset();
                Console.Write(@"Reading the decoy database...");
                sw.Start();
                var decoyDb = targetDb.Decoy(Enzyme);
                decoyDb.Read();
                sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
                Console.WriteLine(@"Elapsed Time: {0:f4} sec", sec);

                sw.Reset();
                Console.WriteLine(@"Searching the decoy database");
                sw.Start();
                var decoyMatches = RunSearch(GetAnnotationsAndOffsets(decoyDb), ms1Filter, true);
                sw.Stop();
                sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
                Console.WriteLine(@"Decoy database search elapsed Time: {0:f4} sec", sec);

                sw.Reset();
                Console.Write(@"Rescoring and writing decoy results...");
                sw.Start();
                WriteResultsToFile(decoyMatches, decoyOutputFilePath, decoyDb);
                sw.Stop();
                sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
                Console.WriteLine(@"Elapsed time: {0:f4} sec", sec);
            }

            if (RunTargetDecoyAnalysis.HasFlag(DatabaseSearchMode.Both))
            {
                var fdrCalculator = new FdrCalculator(targetOutputFilePath, decoyOutputFilePath);
                if (fdrCalculator.HasError())
                {
                    ErrorMessage = fdrCalculator.ErrorMessage;
                    Console.WriteLine(@"Error computing FDR: " + fdrCalculator.ErrorMessage);
                    return false;
                }

                fdrCalculator.WriteTo(tdaOutputFilePath);
            }

            Console.WriteLine(@"Done");
            return true;
        }
Beispiel #19
0
        private void RunSearch(SortedSet<DatabaseSequenceSpectrumMatch>[] matches, FastaDatabase db, ISequenceFilter sequenceFilter, CancellationToken? cancellationToken = null, IProgress<ProgressData> progress = null)
        {
            var progData = new ProgressData(progress)
            {
                Status = "Searching for matches"
            };

            var sw = new Stopwatch();
            long estimatedProteins;
            var annotationsAndOffsets = GetAnnotationsAndOffsets(db, out estimatedProteins, cancellationToken);
            Console.WriteLine(@"Estimated proteins: " + estimatedProteins);

            var numProteins = 0;
            var lastUpdate = DateTime.MinValue; // Force original update of 0%

            sw.Reset();
            sw.Start();
            var pfeOptions = new ParallelOptions
            {
                MaxDegreeOfParallelism = MaxNumThreads,
                CancellationToken = cancellationToken ?? CancellationToken.None
            };

            var maxNumNTermCleavages = SearchMode == InternalCleavageType.NoInternalCleavage ? MaxNumNTermCleavages : 0;
            //foreach (var annotationAndOffset in annotationsAndOffsets)
            Parallel.ForEach(annotationsAndOffsets, pfeOptions, annotationAndOffset =>
            {
                if (cancellationToken != null && cancellationToken.Value.IsCancellationRequested)
                {
                    //return matches;
                    return;
                }

                SearchProgressReport(ref numProteins, ref lastUpdate, estimatedProteins, sw, progData);
                SearchForMatches(annotationAndOffset, sequenceFilter, matches, maxNumNTermCleavages, db.IsDecoy, cancellationToken);
            });

            Console.WriteLine(@"Collected candidate matches: {0}", GetNumberOfMatches(matches));

            progData.StatusInternal = string.Empty;
            progData.Report(100.0);
        }
        public void FindProteins()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            var fastaDb = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);

            const string tagFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3_seqtag.tsv";
            if (!File.Exists(tagFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
            }

            const string outputFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3_matchedtag.tsv";
            using (var writer = new StreamWriter(outputFilePath))
            {
                var isHeader = true;
                foreach (var line in File.ReadAllLines(tagFilePath))
                {
                    if (isHeader)
                    {
                        isHeader = false;
                        writer.WriteLine(line+"\t"+"Proteins");
                        continue;
                    }

                    var token = line.Split('\t');
                    if (token.Length != 3) continue;
                    var tag = token[1];

                    var matchedProteins =
                        searchableDb.FindAllMatchedSequenceIndices(tag)
                            .Select(index => fastaDb.GetProteinName(index))
                            .Distinct().ToArray();
                    var matchedProteinStr = string.Join(",", matchedProteins);
                    writer.WriteLine("{0}\t{1}\t{2}", line, matchedProteins.Length, matchedProteinStr);
                }
            }
            Console.WriteLine(@"Done");
        }
Beispiel #21
0
        [TestCase(3, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_005133_8491EFA2.fasta", 323719193)]  // 3MB
        //[TestCase(6, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004530_B63BD900.fasta", 595227563)]  // 6MB
        //[TestCase(15, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004208_295531A4.fasta", 1882434687)]  // 15MB
        public void TestSequenceEnumerationSerial(double size, string dbFile, int expected)
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName, dbFile);

            var sw = new System.Diagnostics.Stopwatch();
            sw.Start();

            //const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_002216_235ACCEA.fasta";  // 1.5MB
            //const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_005133_8491EFA2.fasta";  // 3MB
            //const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004530_B63BD900.fasta";  // 6MB
            //const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004208_295531A4.fasta";  // 15MB
            var db = new FastaDatabase(dbFile);
            var indexedDb = new IndexedDatabase(db);
            indexedDb.Read();
            var numSequences = 0L;
            var timeDB = sw.Elapsed;
            Console.WriteLine("Read DB in " + timeDB.TotalSeconds + " Seconds");
            var estimatedAnnOff = indexedDb.EstimateTotalPeptides(0, 30, 250);
            var timeEstimate = sw.Elapsed;
            Console.WriteLine("Read Estimate in " + (timeEstimate - timeDB).TotalSeconds + " Seconds");
            Console.WriteLine("Estimated results: " + estimatedAnnOff);
            var annotationsAndOffsets = indexedDb.AnnotationsAndOffsetsNoEnzyme(30, 250);
            var timeGetAnn = sw.Elapsed;
            Console.WriteLine("Read Annotations in " + (timeGetAnn - timeEstimate).TotalSeconds + " Seconds");
            //foreach (var annotationsAndOffset in annotationsAndOffsets)
            //{
            //    //Interlocked.Increment(ref numSequences);
            //    ++numSequences;
            //}
            //using (
            //    var ofstream =
            //        new FileStream(
            //            Path.Combine(@"F:\InformedProteomicsTestFiles",
            //                Path.GetFileNameWithoutExtension(dbFile) + "_old.txt"), FileMode.Create))
            //using (var fout = new StreamWriter(ofstream))
            //{
            //    foreach (var annOff in annotationsAndOffsets)
            //    {
            //        numSequences++;
            //        fout.WriteLine(annOff.Annotation);
            //    }
            //}
            numSequences = annotationsAndOffsets.Count();
            var timeParForEach = sw.Elapsed;
            Console.WriteLine("Parallel ForEach in " + (timeParForEach - timeGetAnn).TotalSeconds + " Seconds");

            Console.WriteLine("NumPeptides: {0}", numSequences);
            sw.Stop();

            Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
            //Assert.AreEqual(188961836, numSequences);
            Assert.AreEqual(expected, numSequences);
        }
Beispiel #22
0
        public void TestTopDownScoringForAllXics()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            // Search parameters
            const int numNTermCleavages = 1;  // 30
            const int minLength = 7;
            const int maxLength = 1000;
            //const int minCharge = 5; // 3
            //const int maxCharge = 15; // 67
            const int numMaxModsPerProtein = 0; // 6
            var precursorTolerance = new Tolerance(10);
            const string dbFilePath = @"..\..\..\TestFiles\sprot.Ecoli.2012_07.fasta";
            //const string dbFilePath = @"..\..\..\TestFiles\sprot.Ecoli.2012_07.icdecoy.KR.fasta";
            
            //const string dbFilePath = @"..\..\..\TestFiles\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta";
            // const string dbFilePath =
            //    @"C:\cygwin\home\kims336\Data\TopDown\ID_003558_56D73071.fasta";

            var sw = new System.Diagnostics.Stopwatch();

            sw.Start();
            Console.Write("Reading raw file...");
            const string specFilePath = @"C:\workspace\TopDown\E_coli_iscU_60_mock.raw";
            var run = InMemoryLcMsRun.GetLcMsRun(specFilePath);

            sw.Stop();

            Console.WriteLine(@"Elapsed Time: {0:f4} sec", sw.Elapsed.TotalSeconds);

            // Configure amino acid set
            //            var pyroGluQ = new SearchModification(Modification.PyroGluQ, 'Q', SequenceLocation.ProteinNTerm, false);
            var dehydro = new SearchModification(Modification.PyroGluQ, 'C', SequenceLocation.Everywhere, false);
            var cysteinylC = new SearchModification(Modification.Cysteinyl, 'C', SequenceLocation.Everywhere, false);
            var glutathioneC = new SearchModification(Modification.Glutathione, 'C', SequenceLocation.Everywhere, false);
            //            var oxM = new SearchModification(Modification.Oxidation, 'M', SequenceLocation.Everywhere, false);

            var searchModifications = new List<SearchModification>
                {
                    //pyroGluQ,
                    dehydro,
                    cysteinylC,
                    glutathioneC,
                    //oxM
                };
            var aaSet = new AminoAcidSet(searchModifications, numMaxModsPerProtein);

            var targetDb = new FastaDatabase(dbFilePath);
         //   targetDb.CreateDecoyDatabase(Enzyme.Trypsin);
         //   System.Environment.Exit(1);
            var indexedDb = new IndexedDatabase(targetDb);

            var numProteins = 0;
            long totalProtCompositions = 0;
            //long numXics = 0;
            TopDownScorer.MaxCharge = 25;
            TopDownScorer.MinCharge = 8;

            sw.Reset();
            sw.Start();
            Console.WriteLine("Generating XICs...");

            foreach (var protAnnotationAndOffset in indexedDb.IntactSequenceAnnotationsAndOffsets(minLength, maxLength))
            {

                ++numProteins;
                //if (numProteins > 2000) break;

                if (numProteins % 1000 == 0)
                {
                    Console.WriteLine("Processed {0} proteins", numProteins);
                }

                //Console.WriteLine(protAnnotation);


                var seqGraph = SequenceGraph.CreateGraph(aaSet, protAnnotationAndOffset.Annotation);
                
                //Console.WriteLine(seqGraph.GetSequenceCompositions()[0]);
                
                if (seqGraph == null) continue;

                for (var nTermCleavages = 0; nTermCleavages <= numNTermCleavages; nTermCleavages++)
                {
                    if(nTermCleavages > 0) seqGraph.CleaveNTerm();
                    var protCompositions = seqGraph.GetSequenceCompositions();
                    foreach (var protComposition in protCompositions)
                    {
                        totalProtCompositions++;
                       // Console.WriteLine(protComposition);
                        var scorer = new TopDownScorer(protComposition, run, precursorTolerance, null);
                        var score = scorer.GetScore();

                        Console.WriteLine(score);


                    }
                }
            }

            sw.Stop();
            Console.WriteLine("NumProteins: {0}", numProteins);
            Console.WriteLine("NumProteinCompositions: {0}", totalProtCompositions);

            Console.WriteLine(@"Elapsed Time: {0:f4} sec", sw.Elapsed.TotalSeconds);
        }
        public void TestTagBasedSearchForLewy()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const string rawFilePath = @"D:\MassSpecFiles\Lewy\Lewy_AT_AD1_21May15_Bane_14-09-01RZ.pbf";
            if (!File.Exists(rawFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
            }

            var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

            const int minTagLength = 4;
            var tagFilePath = MassSpecDataReaderFactory.ChangeExtension(rawFilePath, ".seqtag");
            //var tagParser = new SequenceTagParser(tagFilePath, minTagLength, 10000);

            const string fastaFilePath = @"D:\MassSpecFiles\Lewy\a4_human.fasta";
            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            var fastaDb = new FastaDatabase(fastaFilePath);

            var tolerance = new Tolerance(10);
            var modsFilePath = @"D:\MassSpecFiles\Lewy\Mods.txt";
            if (!File.Exists(modsFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, modsFilePath);
            }

            var aaSet = new AminoAcidSet(modsFilePath);            

            TestTagBasedSearch(run, fastaDb, tolerance, aaSet);
        }
        public void FindProteinDeltaMass()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const string folderPath = @"D:\MassSpecFiles\Glyco\";
            if (!Directory.Exists(folderPath))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, folderPath);
            }

            var fileSet = new string[]
            {
                "User_sample_test_02252015", "User_sample_test_MWCO_02262016", "User_sample_test_SEC_F3_03022105",
                "User_sample_test_SEC_F1_02272015", "User_sample_test_SEC_F2_02282015"
            };
            const string fastaFilePath = folderPath + "ID_003836_DA9CC1E4.fasta";

            for (var i = 0; i < fileSet.Length; i++)
            {
                var datasetName = fileSet[i];
                var tagFilePath = folderPath + datasetName + ".seqtag";
                //var outputFilePath = folderPath + datasetName + ".matchedtag";
                var outputFilePath = folderPath + datasetName + ".dmass";
                var fastaDb = new FastaDatabase(fastaFilePath);
                var searchableDb = new SearchableDatabase(fastaDb);

                using (var writer = new StreamWriter(outputFilePath))
                {
                    var isHeader = true;
                    var nReadSeqTag = 0;

                    Console.WriteLine(@"Reading {0} file", tagFilePath);

                    var nColumn = 0;
                    foreach (var line in File.ReadAllLines(tagFilePath))
                    {
                        if (isHeader)
                        {
                            isHeader = false;
                            nColumn = line.Split('\t').Length;
                            writer.WriteLine(line + "\t" + "Protein" + "\t" + "DetectedFlankingMass" + "\t" + "ExpectedFlankingMass" + "\t" + "DeltaMass");
                            continue;
                        }

                        var token = line.Split('\t');
                        if (token.Length != nColumn) continue;
                        var tag = token[1];
                        //var scan = Convert.ToInt32(token[0]);

                        if (tag.Length < 6) continue;

                        var nTerminal = token[2].Equals("1");
                        var detectedFlankingMass = Double.Parse(token[3]);

                        if (!nTerminal) detectedFlankingMass -= Composition.H2O.Mass;

                        nReadSeqTag++;

                        var matchedProteins =
                            searchableDb.FindAllMatchedSequenceIndices(tag)
                                .Select(index => fastaDb.GetProteinName(index))
                                .Distinct().ToArray();

                        if (matchedProteins.Length < 1) continue;

                        foreach (var protName in matchedProteins)
                        {
                            var seqStr = fastaDb.GetProteinSequence(protName);
                            var oriSeq = new Sequence(seqStr, AminoAcidSet.GetStandardAminoAcidSet());
                            
                            var startIdx = 0;
                            while (true)
                            {
                                var idx = seqStr.IndexOf(tag, startIdx);

                                if (idx < 0) break; //no matching

                                //var nClv = (nTerminal) ? idx : seqStr.Length - idx - tag.Length;
                                var nClv = (nTerminal) ? 2 : 1; 

                                for (var j = 0; j < nClv; j++)
                                {
                                    var flankComposition = (nTerminal)
                                        ? oriSeq.GetComposition(j, idx)
                                        : oriSeq.GetComposition(idx + tag.Length, oriSeq.Count - j);

                                    var massDiff = (detectedFlankingMass - flankComposition.Mass);
                                    if (massDiff > -500 && massDiff < 2000)
                                    {
                                        //writer.WriteLine(massDiff);
                                        writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", line, protName, detectedFlankingMass, flankComposition.Mass, massDiff);
                                    }

                                    if (massDiff > 2000) break;
                                }
                                
                                startIdx = idx + tag.Length;
                            }
                        }
                        //var matchedProteinStr = string.Join(",", matchedProteins);
                        //var massDiffStr = string.Join(",", massDiffList);
                        //writer.WriteLine("{0}\t{1}\t{2}\t{3}", line, matchedProteins.Length, matchedProteinStr, massDiffStr);
                    }

                    Console.WriteLine(@"{0} seq tags are processed", nReadSeqTag);
                }
                Console.WriteLine(@"Done");
            }
        }
        public void CountMatchedScansPerProtein()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const int minTagLength = 6;

            var proteinToScan = new Dictionary<string, HashSet<int>>();
            const string fastaFilePath = @"D:\Research\Data\CommonContaminants\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta";
            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            var fastaDb = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);
            Console.WriteLine(@"Sequence length: {0}", fastaDb.GetSequence().Length);

            //const string tagFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3_seqtag.tsv";
            //const string tagFilePath = @"\\protoapps\UserData\Jungkap\Co_culture\23B_pellet_TD_3Feb14_Bane_PL011402.seqtag";
            const string tagFilePath = @"D:\MassSpecFiles\co_culture\23A_pellet_TD_3Feb14_Bane_PL011402.seqtag";
            if (!File.Exists(tagFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
            }

            var isHeader = true;
            var numMatchedPairs = 0;
            foreach (var line in File.ReadAllLines(tagFilePath))
            {
                if (isHeader)
                {
                    isHeader = false;
                    continue;
                }

                var token = line.Split('\t');
                if (token.Length != 3) continue;
                var scan = Convert.ToInt32(token[0]);

                var tag = token[1];
                if (tag.Length < minTagLength) continue;

                foreach (var matchedProtein in searchableDb.FindAllMatchedSequenceIndices(tag)
                    .Select(index => fastaDb.GetProteinName(index)))
                {
                    ++numMatchedPairs;
                    HashSet<int> matchedScans;
                    if (proteinToScan.TryGetValue(matchedProtein, out matchedScans))
                    {
                        matchedScans.Add(scan);
                    }
                    else
                    {
                        matchedScans = new HashSet<int> {scan};
                        proteinToScan.Add(matchedProtein, matchedScans);
                    }
                }
            }

            var numMatchedProteins = proteinToScan.Keys.Count;
            var numAllProteins = fastaDb.GetNumEntries();
            Console.WriteLine("NumAllProteins: {0}", numAllProteins);
            Console.WriteLine("NumMatchedProteins: {0}", numMatchedProteins);
            Console.WriteLine("AvgMatchedScansPerProtein: {0}", numMatchedPairs / (float)numAllProteins);
        }
        public void TestTagBasedSearchCompRef()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const string dataSetPath = @"D:\MassSpecFiles\CompRef";
            const string fastaFilePath = @"D:\MassSpecFiles\CompRef\ID_003278_4B4B3CB1.fasta";
            const string modsFilePath = @"D:\MassSpecFiles\CompRef\Mods.txt";

            if (!Directory.Exists(dataSetPath))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, dataSetPath);
            }
            if (!File.Exists(modsFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, modsFilePath);
            }
            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            var fileEntries = Directory.GetFiles(dataSetPath);

            var dataset = (from fileName in fileEntries where fileName.EndsWith("pbf") select Path.GetFileNameWithoutExtension(fileName)).ToList();
            dataset.Sort();
          
            
            var fastaDb = new FastaDatabase(fastaFilePath);
            var tolerance = new Tolerance(10);
            var aaSet = new AminoAcidSet(modsFilePath);

            for (var i = 0; i < dataset.Count; i++)
            {
                var rawFile = string.Format(@"{0}\{1}.pbf", dataSetPath, dataset[i]);
                var ms1File = string.Format(@"{0}\{1}.ms1ft", dataSetPath, dataset[i]);
                var tagFilePath = MassSpecDataReaderFactory.ChangeExtension(rawFile, ".seqtag");

                var run = PbfLcMsRun.GetLcMsRun(rawFile);
                const int minTagLength = 5;
                //var tagParser = new SequenceTagParser(tagFilePath, minTagLength, 100);

                Console.WriteLine("-----------------{0}--------------------", rawFile);

                TestTagBasedSearch(run, fastaDb, tolerance, aaSet);

                Console.WriteLine("-----------------------------------------------------------------------");
            }

        }
        public void TestTagBasedSearch()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

//            const string rawFilePath = @"H:\Research\Lewy\raw\Lewy_intact_01.raw";
//            const string rawFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw";
//            const string rawFilePath = @"H:\Research\Yufeng\TopDownYufeng\raw\yufeng_column_test2.raw";
//            const string rawFilePath = @"H:\Research\Weijun_TopDown\raw\UC4_Intact_plasmaTest_90_6May15_Bane_14-09-01RZ.raw";
//            const string rawFilePath = @"H:\Research\Charles\TopDown\raw\SBEP_STM_001_02272012_Aragon.raw";
            const string rawFilePath = @"D:\MassSpecFiles\60k\Yufeng_SampleTest1_150614113438.pbf";
            //const string rawFilePath = @"D:\MassSpecFiles\60k\NCR_50K_Test_24Jun15_Bane_15-02-02RZ.pbf";

            if (!File.Exists(rawFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
            }

            var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

            const int minTagLength = 5;
            var tagFilePath = MassSpecDataReaderFactory.ChangeExtension(rawFilePath, ".seqtag");
            //var tagParser = new SequenceTagParser(tagFilePath, minTagLength, 100);

            const string fastaFilePath = @"D:\MassSpecFiles\60k\ID_003836_DA9CC1E4.fasta";
            //const string fastaFilePath = @"D:\MassSpecFiles\60k\ID_004973_9BA6912F.fasta";
            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            var fastaDb = new FastaDatabase(fastaFilePath);

            var tolerance = new Tolerance(10);
//            var modsFilePath = @"H:\Research\QCShew_TopDown\Production\Mods_Methyl.txt";
            var modsFilePath = @"D:\MassSpecFiles\60k\Mods.txt";
//            var modsFilePath = "";

            if (!File.Exists(modsFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, modsFilePath);
            }

            var aaSet = new AminoAcidSet(modsFilePath);

            TestTagBasedSearch(run, fastaDb, tolerance, aaSet);
        }
Beispiel #28
0
        public void TestNominalMassErrors()
        {
            const int minLength = 300;
            const int maxLength = 400;

            var sw = new System.Diagnostics.Stopwatch();

//            const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta";
            const string dbFile = @"C:\cygwin\home\kims336\Data\TopDownJia\database\ID_003962_71E1A1D4.fasta";

            //const string dbFile = @"C:\cygwin\home\kims336\Data\TopDownJia\database\TargetProteins.fasta";
            var db = new FastaDatabase(dbFile);
            db.Read();
            var indexedDb = new IndexedDatabase(db);
            var numSequences = 0L;
            sw.Start();

            var hist = new long[11];
            var aaSet = new AminoAcidSet();
            foreach (var peptideAnnotationAndOffset in indexedDb.AnnotationsAndOffsetsNoEnzyme(minLength, maxLength))
            {
                ++numSequences;
                var annotation = peptideAnnotationAndOffset.Annotation;
                var sequenceStr = annotation.Substring(2, annotation.Length - 4);
                var sequenceComp = aaSet.GetComposition(sequenceStr);
                var mass = sequenceComp.Mass;
                var nominalMass = sequenceComp.NominalMass;
                var error = (int) Math.Round(mass*Constants.RescalingConstant) - nominalMass;
                var errorBin = error + hist.Length/2;
                if (errorBin < 0) errorBin = 0;
                if (errorBin >= hist.Length) errorBin = hist.Length - 1;
                hist[errorBin]++;
            }

            Console.WriteLine("NumSequences: {0}", numSequences);
            for (var i = 0; i < hist.Length; i++)
            {
                Console.WriteLine("{0}\t{1}\t{2}", i - hist.Length/2, hist[i], hist[i]/(double)numSequences);
            }

            sw.Stop();

            Console.WriteLine(@"Elapsed Time: {0:f4} sec", sw.Elapsed.TotalSeconds);
        }
Beispiel #29
0
        [TestCase(6, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004530_B63BD900.fasta", 8898)]  // 6MB
        //[TestCase(15, @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_004208_295531A4.fasta", 6334)]  // 15MB
        public void TestSequenceEnumerationIntact(double size, string dbFile, int expected)
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName, dbFile);

            var sw = new System.Diagnostics.Stopwatch();
            sw.Start();

            const int numCTermCleavages = 0;
            var db = new FastaDatabase(dbFile);
            var indexedDb = new IndexedDatabase(db);
            var numSequences = 0L;
            var timeDB = sw.Elapsed;
            Console.WriteLine("Read DB in " + timeDB.TotalSeconds + " Seconds");
            var estimatedAnnOff = indexedDb.EstimateTotalPeptides(2, 21, 300, 1, numCTermCleavages);
            var timeEstimate = sw.Elapsed;
            Console.WriteLine("Read Estimate in " + (timeEstimate - timeDB).TotalSeconds + " Seconds");
            Console.WriteLine("Estimated results: " + estimatedAnnOff);
            var annotationsAndOffsets = indexedDb.IntactSequenceAnnotationsAndOffsets(21, 300, numCTermCleavages);
            var timeGetAnn = sw.Elapsed;
            Console.WriteLine("Read Annotations in " + (timeGetAnn - timeEstimate).TotalSeconds + " Seconds");
            /*/Parallel.ForEach(
                annotationsAndOffsets,
                //                new ParallelOptions { MaxDegreeOfParallelism = 2},
                annotationAndOffset =>
                {
                    Interlocked.Increment(ref numSequences);
                    //++numSequences;
                }
                );/**/
            //using (var ofstream = new FileStream(Path.Combine(@"F:\InformedProteomicsTestFiles", Path.GetFileNameWithoutExtension(dbFile) + "_par.txt"), FileMode.Create))
            //using (var fout = new StreamWriter(ofstream))
            //{
            //    foreach (var annOff in annotationsAndOffsets)
            //    {
            //        numSequences++;
            //        fout.WriteLine(annOff.Annotation);
            //    }
            //}
            numSequences = annotationsAndOffsets.Count();
            var timeParForEach = sw.Elapsed;
            Console.WriteLine("Parallel ForEach in " + (timeParForEach - timeGetAnn).TotalSeconds + " Seconds");

            Console.WriteLine("NumPeptides: {0}", numSequences);
            sw.Stop();

            Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
            //Assert.AreEqual(188961836, numSequences);
            Assert.AreEqual(expected, numSequences);
        }
Beispiel #30
0
        private void WriteResultsToFile(DatabaseSequenceSpectrumMatch[] matches, string outputFilePath, FastaDatabase database)
        {
            using (var writer = new StreamWriter(outputFilePath))
            {
                writer.WriteLine("Scan\tPre\tSequence\tPost\tModifications\tComposition\tProteinName\tProteinDesc" +
                                 "\tProteinLength\tStart\tEnd\tCharge\tMostAbundantIsotopeMz\tMass\t#MatchedFragments\tProbability\tSpecEValue\tEValue");
                
                foreach(var scanNum in _ms2ScanNums)
                {
                    var match = matches[scanNum];
                    if (match == null) continue;

                    var sequence = match.Sequence;
                    var offset = match.Offset;
                    var start = database.GetOneBasedPositionInProtein(offset) + 1 + match.NumNTermCleavages;
                    var end = start + sequence.Length - 1;
                    var proteinName = database.GetProteinName(match.Offset);
                    var protLength = database.GetProteinLength(proteinName);
                    var ion = match.Ion;
                    var proteinDescription = database.GetProteinDescription(match.Offset);
                    var probability = CompositeScorer.GetProbability(match.Score);

                    // Note for DblToString(value, 9, true), by having "9" and "true",
                    // values between 100 and 999 Da will have 7 digits after the decimal place, and
                    // values between 1000 and 9999 will have 6 digits after the decimal place
                    writer.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}\t{16}\t{17}",
                        scanNum,
                        match.Pre,                 // Pre
                        sequence,                  // Sequence
                        match.Post,                // Post
                        match.ModificationText,    // Modifications
                        ion.Composition,           // Composition
                        proteinName,               // ProteinName
                        proteinDescription,        // ProteinDescription
                        protLength,                // ProteinLength
                        start,                     // Start position in protein
                        end,                       // End position in protein
                        ion.Charge,                // precursorCharge
                        StringUtilities.DblToString(ion.GetMostAbundantIsotopeMz(), 9, true), // MostAbundantIsotopeMz
                        StringUtilities.DblToString(ion.Composition.Mass, 9, true),           // Mass
                        match.NumMatchedFragments,                                          // (Number of matched fragments)
                        StringUtilities.DblToString(probability, 4),                        // Probability
                        StringUtilities.DblToString(ExcelMinValue(match.SpecEvalue), 6, true, 0.001),                             // EValue; will be displayed using scientific notation if the value is less than 0.001
                        StringUtilities.DblToString(ExcelMinValue(match.SpecEvalue * database.GetNumEntries()), 6, true, 0.001)   // SpecEValue; will be displayed using scientific notation if the value is less than 0.001
                        );

                }
            }
        }
        public void CountMatchedProteins()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const int minTagLength = 3;

            var scanToProtein = new Dictionary<int, string>();
            var idTag = new Dictionary<int, bool>();
            const string resultFilePath = @"H:\Research\ProMex\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";
            if (!File.Exists(resultFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultFilePath);
            }

            var parser = new TsvFileParser(resultFilePath);
            var scans = parser.GetData("Scan").Select(s => Convert.ToInt32(s)).ToArray();
            var proteinNames = parser.GetData("ProteinName").ToArray();
            var qValues = parser.GetData("QValue").Select(Convert.ToDouble).ToArray();
            for (var i = 0; i < qValues.Length; i++)
            {
                if (qValues[i] > 0.01) break;
                scanToProtein.Add(scans[i], proteinNames[i]);
                idTag.Add(scans[i], false);
            }

            const string rawFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw";
            if (!File.Exists(rawFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
            }

            var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";
            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

//            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.icsfldecoy.fasta";
//            const string fastaFilePath =
//                @"D:\Research\Data\CommonContaminants\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta";
            var fastaDb = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);
            Console.WriteLine("Sequence length: {0}", fastaDb.GetSequence().Length);

            const string tagFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.seqtag";
            if (!File.Exists(tagFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, tagFilePath);
            }

            var hist = new Dictionary<int, int>();
            
            var scanSet = new HashSet<int>();
            HashSet<string> proteinSetForThisScan = null;
            var prevScan = -1;
            var totalNumMatches = 0L;
            var isHeader = true;
            foreach (var line in File.ReadAllLines(tagFilePath))
            {
                if (isHeader)
                {
                    isHeader = false;
                    continue;
                }

                var token = line.Split('\t');
                if (token.Length < 3) continue;
                var scan = Convert.ToInt32(token[0]);
                var proteinId = scanToProtein.ContainsKey(scan) ? scanToProtein[scan] : null;

                if (scan != prevScan)
                {
                    if (proteinSetForThisScan != null)
                    {
                        var numMatches = proteinSetForThisScan.Count;
                        int numOcc;
                        if (hist.TryGetValue(numMatches, out numOcc)) hist[numMatches] = numOcc + 1;
                        else hist.Add(numMatches, 1);
                    }

                    prevScan = scan;
                    proteinSetForThisScan = new HashSet<string>();
                }

                scanSet.Add(scan);
                var tag = token[1];
                if (tag.Length < minTagLength) continue;

                if (proteinSetForThisScan == null) continue;

                var numMatchesForThisTag = 0;
                foreach (var matchedProtein in searchableDb.FindAllMatchedSequenceIndices(tag)
                    .Select(index => fastaDb.GetProteinName(index)))
                {
                    proteinSetForThisScan.Add(matchedProtein);
                    ++numMatchesForThisTag;

                    if (proteinId != null && matchedProtein.Equals(proteinId))
                    {
                        idTag[scan] = true;
                    }
                }
                totalNumMatches += numMatchesForThisTag;
//                if (numMatchesForThisTag > 10)
//                {
//                    Console.WriteLine("{0}\t{1}", tag, numMatchesForThisTag);
//                }
            }

            if (proteinSetForThisScan != null)
            {
                var numMatches = proteinSetForThisScan.Count;
                int numOcc;
                if (hist.TryGetValue(numMatches, out numOcc)) hist[numMatches] = numOcc + 1;
                else hist.Add(numMatches, 1);
            }
            
            Console.WriteLine("AvgNumMatches: {0}", totalNumMatches/(float)scanSet.Count);
            Console.WriteLine("Histogram:");
            foreach (var entry in hist.OrderBy(e => e.Key))
            {
                Console.WriteLine("{0}\t{1}", entry.Key, entry.Value);
            }

            Console.WriteLine("NumId: {0}", idTag.Count);
            Console.WriteLine("NumIdByTag: {0}", idTag.Select(e => e.Value).Count(v => v));
        }
        private void TestTagBasedSearch(LcMsRun run, 
            FastaDatabase fastaDb, Tolerance tolerance, AminoAcidSet aaSet)
        {
            var engine = new ScanBasedTagSearchEngine(run, new SequenceTagGenerator(run, new Tolerance(8)), new LcMsPeakMatrix(run), fastaDb, tolerance, aaSet);
//            engine.MinScan = 3400;
//            engine.MaxScan = 3900;
            engine.RunSearch();
        }