public FeatureBasedTagSearchEngine(
     LcMsRun run,
     Ms1FtParser featureParser,
     ProductScorerBasedOnDeconvolutedSpectra ms2Scorer,
     SequenceTagParser tagParser,
     FastaDatabase fastaDb,
     Tolerance tolerance,
     AminoAcidSet aaSet,
     double maxSequenceMass = 50000.0,
     int minProductIonCharge = 1,
     int maxProductIonCharge = 20)
 {
     _run = run;
     _ms2Scorer = ms2Scorer;
     _featureParser = featureParser;
     _ms1FtFilter = new Ms1FtFilter(run, tolerance, featureParser.Ms1FtFileName);
     _tagParser = tagParser;
     _fastaDb = fastaDb;
     _searchableDb = new SearchableDatabase(fastaDb);
     _tolerance = tolerance;
     _aaSet = aaSet;
     _maxSequenceMass = maxSequenceMass;
     _minProductIonCharge = minProductIonCharge;
     _maxProductIonCharge = maxProductIonCharge;
 }
示例#2
0
        public void TestReadingProMexFile()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);
            
            const string rawFilePath = @"\\proto-11\MSXML_Cache\PBF_Gen_1_214\2014_3\QC_Shew_Intact_26Sep14_Bane_C2Column3.pbf";
 
            if (!File.Exists(rawFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, rawFilePath);
            }

            var run = PbfLcMsRun.GetLcMsRun(rawFilePath);

            const string promexFileName = @"\\Proto-5\VOrbiETD02\2014_3\QC_Shew_Intact_26Sep14_Bane_C2Column3\MSP201508271107_Auto1226713\QC_Shew_Intact_26Sep14_Bane_C2Column3.ms1ft";
            Console.Write("Reading ProMex results...");
            var ms1Filter = new Ms1FtFilter(run, new Tolerance(10), promexFileName);
            Console.WriteLine(string.Join(",", ms1Filter.GetMatchingMs2ScanNums(3016.6583)));
        }
示例#3
0
        public bool RunSearch(double corrThreshold = 0.7, CancellationToken? cancellationToken = null, IProgress<ProgressData> progress = null)
        {
            // Get the Normalized spec file/folder path
            SpecFilePath = MassSpecDataReaderFactory.NormalizeDatasetPath(SpecFilePath);

            var prog = new Progress<ProgressData>();
            var progData = new ProgressData(progress);
            if (progress != null)
            {
                prog = new Progress<ProgressData>(p =>
                {
                    progData.Status = p.Status;
                    progData.StatusInternal = p.StatusInternal;
                    progData.Report(p.Percent);
                });
            }

            var sw = new Stopwatch();
            var swAll = new Stopwatch();
            swAll.Start();
            ErrorMessage = string.Empty;

            Console.Write(@"Reading raw file...");
            progData.Status = "Reading spectra file";
            progData.StepRange(10.0);
            sw.Start();

            _run = PbfLcMsRun.GetLcMsRun(SpecFilePath, 0, 0, prog);

            _ms2ScanNums = _run.GetScanNumbers(2).ToArray();
            _isolationWindowTargetMz = new double[_run.MaxLcScan + 1];
            foreach (var ms2Scan in _ms2ScanNums)
            {
                var ms2Spec = _run.GetSpectrum(ms2Scan) as ProductSpectrum;
                if (ms2Spec == null) continue;
                _isolationWindowTargetMz[ms2Scan] = ms2Spec.IsolationWindow.IsolationWindowTargetMz;
            }

            
            sw.Stop();
            Console.WriteLine(@"Elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);

            progData.StepRange(20.0);
            ISequenceFilter ms1Filter;
            if (this.ScanNumbers != null && this.ScanNumbers.Any())
            {
                ms1Filter = new SelectedMsMsFilter(this.ScanNumbers);
            }
            else if (string.IsNullOrWhiteSpace(FeatureFilePath))
            {
                // Checks whether SpecFileName.ms1ft exists
                var ms1FtFilePath = MassSpecDataReaderFactory.ChangeExtension(SpecFilePath, LcMsFeatureFinderLauncher.FileExtension);
                if (!File.Exists(ms1FtFilePath))
                {
                    Console.WriteLine(@"Running ProMex...");
                    sw.Start();
                    var param = new LcMsFeatureFinderInputParameter
                    {
                        InputPath = SpecFilePath,
                        MinSearchMass = MinSequenceMass,
                        MaxSearchMass = MaxSequenceMass,
                        MinSearchCharge = MinPrecursorIonCharge,
                        MaxSearchCharge = MaxPrecursorIonCharge,
                        CsvOutput = false,
                        ScoreReport = false,
                        LikelihoodScoreThreshold = -10
                    };
                    var featureFinder = new LcMsFeatureFinderLauncher(param);
                    featureFinder.Run();
                }
                sw.Reset();
                sw.Start();
                Console.Write(@"Reading ProMex results...");
                ms1Filter = new Ms1FtFilter(_run, PrecursorIonTolerance, ms1FtFilePath, -10);
            }
            else
            {
                sw.Reset();
                sw.Start();
                var extension = Path.GetExtension(FeatureFilePath);
                if (extension.ToLower().Equals(".csv"))
                {
                    Console.Write(@"Reading ICR2LS/Decon2LS results...");
                    ms1Filter = new IsosFilter(_run, PrecursorIonTolerance, FeatureFilePath);
                }
                else if (extension.ToLower().Equals(".ms1ft"))
                {
                    Console.Write(@"Reading ProMex results...");
                    ms1Filter = new Ms1FtFilter(_run, PrecursorIonTolerance, FeatureFilePath, -10);
                }
                else if (extension.ToLower().Equals(".msalign"))
                {
                    Console.Write(@"Reading MS-Align+ results...");
                    ms1Filter = new MsDeconvFilter(_run, PrecursorIonTolerance, FeatureFilePath);
                }
                else ms1Filter = null; //new Ms1FeatureMatrix(_run);
            }

            sw.Stop();
            Console.WriteLine(@"Elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);

            
            // pre-generate deconvoluted spectra for scoring
            _massBinComparer = new FilteredProteinMassBinning(AminoAcidSet, MaxSequenceMass+1000);

            _ms2ScorerFactory2 = new CompositeScorerFactory(_run, _massBinComparer, AminoAcidSet,
                                                               MinProductIonCharge, MaxProductIonCharge, ProductIonTolerance);
            sw.Reset();
            Console.WriteLine(@"Generating deconvoluted spectra for MS/MS spectra...");
            sw.Start();
            var pfeOptions = new ParallelOptions
            {
                MaxDegreeOfParallelism = MaxNumThreads,
                CancellationToken = cancellationToken ?? CancellationToken.None
            };
            Parallel.ForEach(_ms2ScanNums, pfeOptions, ms2ScanNum =>
            {
                _ms2ScorerFactory2.DeconvonluteProductSpectrum(ms2ScanNum);
            });
            sw.Stop();
            Console.WriteLine(@"Elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);

            progData.StepRange(10.0);
            progData.Status = "Reading Fasta File";

            // Target database
            var targetDb = new FastaDatabase(DatabaseFilePath);
            targetDb.Read();
            
            // Generate sequence tags for all MS/MS spectra
            if (TagBasedSearch)
            {
                progData.StepRange(25.0);
                progData.Status = "Generating Sequence Tags";
                sw.Reset();
                Console.WriteLine(@"Generating sequence tags for MS/MS spectra...");
                sw.Start();
                var seqTagGen = GetSequenceTagGenerator();
                _tagMs2ScanNum = seqTagGen.GetMs2ScanNumsContainingTags().ToArray();
                sw.Stop();
                Console.WriteLine(@"Elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);
                _tagSearchEngine = new ScanBasedTagSearchEngine(_run, seqTagGen, new LcMsPeakMatrix(_run, ms1Filter), targetDb, ProductIonTolerance, AminoAcidSet,
                                _ms2ScorerFactory2,
                                ScanBasedTagSearchEngine.DefaultMinMatchedTagLength,
                                MaxSequenceMass, MinProductIonCharge, MaxProductIonCharge);                
            }
            
            var specFileName = MassSpecDataReaderFactory.RemoveExtension(Path.GetFileName(SpecFilePath));
            var targetOutputFilePath = Path.Combine(OutputDir, specFileName + TargetFileNameEnding);
            var decoyOutputFilePath = Path.Combine(OutputDir, specFileName + DecoyFileNameEnding);
            var tdaOutputFilePath = Path.Combine(OutputDir, specFileName + TdaFileNameEnding);

            progData.StepRange(60.0);
            progData.Status = "Running Target search";

            if (RunTargetDecoyAnalysis.HasFlag(DatabaseSearchMode.Target))
            {
                sw.Reset();
                Console.Write(@"Reading the target database...");
                sw.Start();
                targetDb.Read();
                sw.Stop();
                Console.WriteLine(@"Elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);

                var targetMatches = new SortedSet<DatabaseSequenceSpectrumMatch>[_run.MaxLcScan + 1];
                
                progData.MaxPercentage = 42.5;
                if (TagBasedSearch)
                {
                    sw.Reset();
                    Console.WriteLine(@"Tag-based searching the target database");
                    sw.Start();
                    RunTagBasedSearch(targetMatches, targetDb, null, prog);
                    Console.WriteLine(@"Target database tag-based search elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);
                }
                progData.MaxPercentage = 60.0;

                sw.Reset();
                Console.WriteLine(@"Searching the target database");
                sw.Start();
                RunSearch(targetMatches, targetDb, ms1Filter, null, prog);
                Console.WriteLine(@"Target database search elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);

                // calculate spectral e-value usign generating function
                sw.Reset();
                Console.WriteLine(@"Calculating spectral E-values for target-spectrum matches");
                sw.Start();
                var bestTargetMatches = RunGeneratingFunction(targetMatches);
                WriteResultsToFile(bestTargetMatches, targetOutputFilePath, targetDb);
                sw.Stop();
                Console.WriteLine(@"Target-spectrum match E-value calculation elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);
            }

            progData.StepRange(95.0); // total to 95%
            progData.Status = "Running Decoy search";

            if (RunTargetDecoyAnalysis.HasFlag(DatabaseSearchMode.Decoy))
            {
                // Decoy database
                sw.Reset();
                sw.Start();
                var decoyDb = targetDb.Decoy(null, true);

                Console.Write(@"Reading the decoy database...");
                decoyDb.Read();
                Console.WriteLine(@"Elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);

                progData.MaxPercentage = 77.5;
                var decoyMatches = new SortedSet<DatabaseSequenceSpectrumMatch>[_run.MaxLcScan + 1];
                if (TagBasedSearch)
                {
                    sw.Reset();
                    Console.WriteLine(@"Tag-based searching the decoy database");
                    sw.Start();
                    RunTagBasedSearch(decoyMatches, decoyDb, null, prog);
                    Console.WriteLine(@"Decoy database tag-based search elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);                    
                }
                progData.MaxPercentage = 95.0;

                sw.Reset();
                Console.WriteLine(@"Searching the decoy database");
                sw.Start();
                RunSearch(decoyMatches, decoyDb, ms1Filter, null, prog);
                Console.WriteLine(@"Decoy database search elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);

                // calculate spectral e-value usign generating function
                sw.Reset();
                Console.WriteLine(@"Calculating spectral E-values for decoy-spectrum matches");
                sw.Start();
                var bestDecoyMatches = RunGeneratingFunction(decoyMatches);
                WriteResultsToFile(bestDecoyMatches, decoyOutputFilePath, decoyDb);
                sw.Stop();
                Console.WriteLine(@"Decoy-spectrum match E-value calculation elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);
            }

            progData.StepRange(100.0);
            progData.Status = "Writing combined results file";
            if (RunTargetDecoyAnalysis.HasFlag(DatabaseSearchMode.Both))
            {
                // Add "Qvalue" and "PepQValue"
                var fdrCalculator = new FdrCalculator(targetOutputFilePath, decoyOutputFilePath);
                if (fdrCalculator.HasError())
                {
                    ErrorMessage = fdrCalculator.ErrorMessage;
                    Console.WriteLine(@"Error computing FDR: " + fdrCalculator.ErrorMessage);
                    return false;
                }

                fdrCalculator.WriteTo(tdaOutputFilePath);
            }
            progData.Report(100.0);

            Console.WriteLine(@"Done.");
            swAll.Stop();
            Console.WriteLine(@"Total elapsed time for search: {0:f1} sec ({1:f2} min)", swAll.Elapsed.TotalSeconds, swAll.Elapsed.TotalMinutes);

            return true;
        }
示例#4
0
        public void TestMs1Filter()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            // QC_Shew
            const string specFilePath = @"D:\MassSpecFiles\training\raw\QC_Shew_Intact_26Sep14_Bane_C2Column3.pbf";
            const string ms1FtFileName = @"D:\MassSpecFiles\training\raw\QC_Shew_Intact_26Sep14_Bane_C2Column3.ms1ft";
            const string idFilePath =  @"D:\MassSpecFiles\training\IcTda\QC_Shew_Intact_26Sep14_Bane_C2Column3_IcTda.tsv";


            if (!File.Exists(specFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, specFilePath);
            }

            if (!File.Exists(idFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, idFilePath);
            }

            var run = PbfLcMsRun.GetLcMsRun(specFilePath);
            var massTolerance = new Tolerance(10);

            var ms1ftFilter = new Ms1FtFilter(run, massTolerance, ms1FtFileName);
            var n = 0;
            var ms2ScanNums = run.GetScanNumbers(2);
            foreach (var ms2ScanNum in ms2ScanNums)
            {
                var matchingMass = ms1ftFilter.GetMatchingMass(ms2ScanNum);
                n += matchingMass.Count();
            }
            
            Console.WriteLine("{0} / {1}", n, ms2ScanNums.Count);
            /*
            var tsvReader = new TsvFileParser(idFilePath);

            for (var i = 0; i < tsvReader.NumData; i++)
            {
                var qv = double.Parse(tsvReader.GetData("QValue")[i]);
                if (qv > 0.01) break;

                var scan = int.Parse(tsvReader.GetData("Scan")[i]);
                var charge = int.Parse(tsvReader.GetData("Charge")[i]);
                var mass = double.Parse(tsvReader.GetData("Mass")[i]);

                if (mass > 15000) continue;

                var seq = tsvReader.GetData("Sequence")[i];
                var mod = tsvReader.GetData("Modifications")[i];
                var nMatched = int.Parse(tsvReader.GetData("#MatchedFragments")[i]);

                

                var hit = false;
                foreach (var ms2Scan in ms1ftFilter.GetMatchingMs2ScanNums(mass))
                {
                    if (ms2Scan == scan)
                    {
                        hit = true;
                        break;
                    }
                }

                if (!hit)
                {
                    Console.WriteLine("{0}\t{1}\t{2}", scan, mass, nMatched);
                }
            }*/
        }
示例#5
0
        public void TestFeatureId()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const string dataSet = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3";

            if (!File.Exists(dataSet))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dataSet);
            }

            // Feature: 5236-5286	6-12	8480.3681	5
            const int minScanNum = 5236;
            const int maxScanNum = 5286;
            const double featureMass = 8480.3681;

            //const int minScanNum = 7251;
            //const int maxScanNum = 7326;
            //const double featureMass = 32347.18;

//            const int minScanNum = 4451;
//            const int maxScanNum = 4541;
//            const double featureMass = 31267.95;

            var tolerance = new Tolerance(10);
            var relaxedTolerance = new Tolerance(20);

            const int minTagLength = 5;
            const int minMergedTagLength = 7;
            const int minNumTagMatches = 1;

            var rawFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".raw");
            var run = PbfLcMsRun.GetLcMsRun(rawFileName);

            var aminoAcidSet = AminoAcidSet.GetStandardAminoAcidSet();
            var featureFileName = MassSpecDataReaderFactory.ChangeExtension(dataSet, ".ms1ft");
            var filter = new Ms1FtFilter(run, tolerance, featureFileName);
            var ms2ScanNums =
                filter.GetMatchingMs2ScanNums(featureMass)
                    .Where(scanNum => scanNum > minScanNum && scanNum < maxScanNum)
                    .ToArray();

            const string tagFileName = dataSet + ".seqtag"; //"_MinLength3.seqtag"; //Path.ChangeExtension(dataSet, ".seqtag");
            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.fasta";

            if (!File.Exists(fastaFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, fastaFilePath);
            }

            var fastaDb = new FastaDatabase(fastaFilePath);
            var searchableDb = new SearchableDatabase(fastaDb);
            var tagParser = new SequenceTagParser(tagFileName, minTagLength);

            var proteinsToTags = new Dictionary<string, IList<MatchedTag>>();
            foreach (var ms2ScanNum in ms2ScanNums)
            {
                var tags = tagParser.GetSequenceTags(ms2ScanNum);
                foreach (var tag in tags)
                {
                    var matchedIndices = searchableDb.FindAllMatchedSequenceIndices(tag.Sequence).ToArray();
                    foreach (var index in matchedIndices)
                    {
                        var protein = fastaDb.GetProteinName(index);
                        var startIndex = fastaDb.GetZeroBasedPositionInProtein(index);
                        var matchedTag = new MatchedTag(tag, startIndex, featureMass);
                        IList<MatchedTag> existingTags;
                        if (proteinsToTags.TryGetValue(protein, out existingTags))
                        {
                            existingTags.Add(matchedTag);
                        }
                        else
                        {
                            proteinsToTags.Add(protein, new List<MatchedTag> { matchedTag });
                        }
                    }
                }
            }

            foreach (var entry in proteinsToTags.OrderByDescending(e => e.Value.Count))
            {
                if (entry.Value.Count < minNumTagMatches) break;
                var proteinName = entry.Key;
                var proteinSequence = fastaDb.GetProteinSequence(proteinName);
                var protein = new Sequence(proteinSequence, aminoAcidSet);
                Console.WriteLine(proteinName + "\t" + entry.Value.Count);

                var matchedTagSet = new MatchedTagSet(proteinSequence, aminoAcidSet,
                    tolerance, relaxedTolerance);

                Console.WriteLine("********** Before merging");
                foreach (var matchedTag in entry.Value)
                {
                    var seq = proteinSequence.Substring(matchedTag.StartIndex,
                        matchedTag.EndIndex - matchedTag.StartIndex);
                    var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
                    var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);
                    Console.WriteLine("\t{0}\t{1}\t{2}\t{3}\t{4}\t{5}",
                        (matchedTag.NTermFlankingMass - nTermMass), seq, (matchedTag.CTermFlankingMass - cTermMass), matchedTag.StartIndex,
                        matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable);

                    matchedTagSet.Add(matchedTag);
                }

                Console.WriteLine("********** After merging");
                foreach (var matchedTag in matchedTagSet.Tags)
                {
                    if (matchedTag.Length < minMergedTagLength) continue;
                    var seq = proteinSequence.Substring(matchedTag.StartIndex,
                        matchedTag.EndIndex - matchedTag.StartIndex);
                    var nTermMass = protein.GetMass(0, matchedTag.StartIndex);
                    var cTermMass = protein.GetMass(matchedTag.EndIndex, protein.Count);
                    Console.WriteLine("\t{0}\t{1}\t{2}\t{3}\t{4}\t{5}",
                        (matchedTag.NTermFlankingMass-nTermMass), seq, (matchedTag.CTermFlankingMass - cTermMass), matchedTag.StartIndex,
                        matchedTag.IsNTermFlankingMassReliable, matchedTag.IsCTermFlankingMassReliable);
                }

                break;
            }
        }
示例#6
0
        public void TestProMexFilter()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const string specFilePath = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\TopDown\ProductionQCShew\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw";
            if (!File.Exists(specFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, specFilePath);
            }

            var run = PbfLcMsRun.GetLcMsRun(specFilePath, 0, 0);
            
            const string ms1FtPath = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\TopDown\ProductionQCShew\QC_Shew_Intact_26Sep14_Bane_C2Column3.ms1ft";
            if (!File.Exists(ms1FtPath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, ms1FtPath);
            }

            var filter = new Ms1FtFilter(run, new Tolerance(10), ms1FtPath);

//            Console.WriteLine("ScanNums: {0}", string.Join("\t",filter.GetMatchingMs2ScanNums(8480.327609)));
            Assert.IsTrue(filter.GetMatchingMs2ScanNums(8480.327609).Contains(5255));
        }