Example #1
0
        public void TestSequenceEnumerationParallel2()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            var sw = new System.Diagnostics.Stopwatch();

            const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_002216_235ACCEA.fasta";
            var db = new FastaDatabase(dbFile);
            db.Read();
            var indexedDb = new IndexedDatabase(db);
            var arr = db.Characters().ToArray();

            sw.Start();
            //var annotationsAndOffsets = indexedDb.AnnotationsAndOffsetsNoEnzyme(7, 30);
            //            var num = annotationsAndOffsets.AsParallel().LongCount(annotationsAndOffset => annotationsAndOffset.Annotation.IndexOf('W') >= 0);
            //var num = annotationsAndOffsets.LongCount(annotationsAndOffset => annotationsAndOffset.Annotation.IndexOf('W') >= 0);
            //var num = arr.AsParallel().Where(c => c == 'W').LongCount();
            var num = 0;
            var sum = 0L;
            //foreach (var c in arr)
            for (var a = 0; a < arr.Length; a++)
            {
                var c = arr[a];
                for (var i = 0; i < c * 10000; i++) sum += i;
                //                Interlocked.Increment(ref num);
                if (++num == 1000) break;
            }

            Console.WriteLine("NumPeptides: {0}", sum);
            sw.Stop();

            Console.WriteLine(@"{0:f4} sec", sw.Elapsed.TotalSeconds);
        }
Example #2
0
        public void TestSumParallel()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            //var array = Enumerable.Range(0, short.MaxValue).ToArray();
            const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\ID_002216_235ACCEA.fasta";
            var db = new FastaDatabase(dbFile);
            db.Read();
            //var indexedDb = new IndexedDatabase(db);
            //indexedDb.Read();
            //var peptides = indexedDb.AnnotationsAndOffsetsNoEnzyme(7, 30);
            var charArray = db.Characters().Select(c => (int)c);

            // Test methods.
            Console.WriteLine(SumAsParallel(charArray));
            Console.WriteLine(SumDefault(charArray));

            const int m = 100;
            var s1 = Stopwatch.StartNew();
            for (var i = 0; i < m; i++)
            {
                SumDefault(charArray);
            }
            s1.Stop();
            var s2 = Stopwatch.StartNew();
            for (var i = 0; i < m; i++)
            {
                SumAsParallel(charArray);
            }
            s2.Stop();
            Console.WriteLine((s1.Elapsed.TotalMilliseconds * 1000000 /
                m).ToString("0.00 ns"));
            Console.WriteLine((s2.Elapsed.TotalMilliseconds * 1000000 /
                m).ToString("0.00 ns"));
            Console.Read();
        }
Example #3
0
        public void TestNominalMassErrors()
        {
            const int minLength = 300;
            const int maxLength = 400;

            var sw = new System.Diagnostics.Stopwatch();

//            const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\H_sapiens_Uniprot_SPROT_2013-05-01_withContam.fasta";
            const string dbFile = @"C:\cygwin\home\kims336\Data\TopDownJia\database\ID_003962_71E1A1D4.fasta";

            //const string dbFile = @"C:\cygwin\home\kims336\Data\TopDownJia\database\TargetProteins.fasta";
            var db = new FastaDatabase(dbFile);
            db.Read();
            var indexedDb = new IndexedDatabase(db);
            var numSequences = 0L;
            sw.Start();

            var hist = new long[11];
            var aaSet = new AminoAcidSet();
            foreach (var peptideAnnotationAndOffset in indexedDb.AnnotationsAndOffsetsNoEnzyme(minLength, maxLength))
            {
                ++numSequences;
                var annotation = peptideAnnotationAndOffset.Annotation;
                var sequenceStr = annotation.Substring(2, annotation.Length - 4);
                var sequenceComp = aaSet.GetComposition(sequenceStr);
                var mass = sequenceComp.Mass;
                var nominalMass = sequenceComp.NominalMass;
                var error = (int) Math.Round(mass*Constants.RescalingConstant) - nominalMass;
                var errorBin = error + hist.Length/2;
                if (errorBin < 0) errorBin = 0;
                if (errorBin >= hist.Length) errorBin = hist.Length - 1;
                hist[errorBin]++;
            }

            Console.WriteLine("NumSequences: {0}", numSequences);
            for (var i = 0; i < hist.Length; i++)
            {
                Console.WriteLine("{0}\t{1}\t{2}", i - hist.Length/2, hist[i], hist[i]/(double)numSequences);
            }

            sw.Stop();

            Console.WriteLine(@"Elapsed Time: {0:f4} sec", sw.Elapsed.TotalSeconds);
        }
Example #4
0
        public bool RunSearch(double corrThreshold)
        {
            var sw = new Stopwatch();
            ErrorMessage = string.Empty;

            Console.Write(@"Reading raw file...");
            sw.Start();
            _run = InMemoryLcMsRun.GetLcMsRun(SpecFilePath, 1.4826, 1.4826);
            _bottomUpScorer = new InformedBottomUpScorer(_run, AminoAcidSet, MinProductIonCharge, MaxProductIonCharge, ProductIonTolerance);
            sw.Stop();
            var sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
            Console.WriteLine(@"Elapsed Time: {0:f4} sec", sec);

            sw.Reset();
            Console.Write(@"Determining precursor masses...");
            sw.Start();
            var ms1Filter = new Ms1IsotopeAndChargeCorrFilter(_run, PrecursorIonTolerance, MinPrecursorIonCharge, MaxPrecursorIonCharge,
                400, 5000, corrThreshold, 0, 0);
            sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
            Console.WriteLine(@"Elapsed Time: {0:f4} sec", sec);

            sw.Reset();
            Console.Write(@"Deconvoluting MS2 spectra...");
            sw.Start();
            _ms2ScorerFactory = new ProductScorerBasedOnDeconvolutedSpectra(
                _run,
                MinProductIonCharge, MaxProductIonCharge,
                new Tolerance(10),
                0
                );
            _ms2ScorerFactory.DeconvoluteAllProductSpectra();
            sw.Stop();
            sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
            Console.WriteLine(@"Elapsed Time: {0:f4} sec", sec);

            // Target database
            var targetDb = new FastaDatabase(DatabaseFilePath);

            //            string dirName = OutputDir ?? Path.GetDirectoryName(SpecFilePath);

            var baseName = Path.GetFileNameWithoutExtension(SpecFilePath);
            var targetOutputFilePath = Path.Combine(OutputDir, baseName + TargetFileExtension);
            var decoyOutputFilePath = Path.Combine(OutputDir, baseName + DecoyFileExtension);
            var tdaOutputFilePath = Path.Combine(OutputDir, baseName + TdaFileExtension);

            if (RunTargetDecoyAnalysis.HasFlag(DatabaseSearchMode.Target))
            {
                sw.Reset();
                Console.Write(@"Reading the target database...");
                sw.Start();
                targetDb.Read();
                sw.Stop();
                sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
                Console.WriteLine(@"Elapsed Time: {0:f4} sec", sec);

                sw.Reset();
                Console.WriteLine(@"Searching the target database");
                sw.Start();
                var targetMatches = RunSearch(GetAnnotationsAndOffsets(targetDb), ms1Filter, false);
                sw.Stop();
                sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
                Console.WriteLine(@"Target database search elapsed time: {0:f4} sec", sec);

                sw.Reset();
                Console.Write(@"Rescoring and writing target results...");
                sw.Start();
                WriteResultsToFile(targetMatches, targetOutputFilePath, targetDb);
                sw.Stop();
                sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
                Console.WriteLine(@"Elapsed time: {0:f4} sec", sec);
            }

            if (RunTargetDecoyAnalysis.HasFlag(DatabaseSearchMode.Decoy))
            {
                // Decoy database
                sw.Reset();
                Console.Write(@"Reading the decoy database...");
                sw.Start();
                var decoyDb = targetDb.Decoy(Enzyme);
                decoyDb.Read();
                sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
                Console.WriteLine(@"Elapsed Time: {0:f4} sec", sec);

                sw.Reset();
                Console.WriteLine(@"Searching the decoy database");
                sw.Start();
                var decoyMatches = RunSearch(GetAnnotationsAndOffsets(decoyDb), ms1Filter, true);
                sw.Stop();
                sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
                Console.WriteLine(@"Decoy database search elapsed Time: {0:f4} sec", sec);

                sw.Reset();
                Console.Write(@"Rescoring and writing decoy results...");
                sw.Start();
                WriteResultsToFile(decoyMatches, decoyOutputFilePath, decoyDb);
                sw.Stop();
                sec = sw.ElapsedTicks / (double)Stopwatch.Frequency;
                Console.WriteLine(@"Elapsed time: {0:f4} sec", sec);
            }

            if (RunTargetDecoyAnalysis.HasFlag(DatabaseSearchMode.Both))
            {
                var fdrCalculator = new FdrCalculator(targetOutputFilePath, decoyOutputFilePath);
                if (fdrCalculator.HasError())
                {
                    ErrorMessage = fdrCalculator.ErrorMessage;
                    Console.WriteLine(@"Error computing FDR: " + fdrCalculator.ErrorMessage);
                    return false;
                }

                fdrCalculator.WriteTo(tdaOutputFilePath);
            }

            Console.WriteLine(@"Done");
            return true;
        }
Example #5
0
        public void TestId()
        {
            const string rawFilePath = @"H:\Research\QCShew_TopDown\Production\QC_Shew_Intact_26Sep14_Bane_C2Column3.raw";
//            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\ID_002216_235ACCEA.icsfldecoy.fasta";
            //const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\Decoy_SO4280.fasta";
            const string fastaFilePath = @"H:\Research\QCShew_TopDown\Production\SO2312.fasta";
            const string modFilePath = @"H:\Research\QCShew_TopDown\Production\Mods.txt";
            const int numBits = 29; // max error: 4ppm
            const int minCharge = 2;
            const int maxCharge = 20;
            var tolerance = new Tolerance(10);
            const double corrThreshold = 0.7;

            var comparer = new MzComparerWithBinning(numBits);
            const double minFragmentMass = 200.0;
            const double maxFragmentMass = 50000.0;
            var minFragMassBin = comparer.GetBinNumber(minFragmentMass);
            var maxFragMassBin = comparer.GetBinNumber(maxFragmentMass);

            var aminoAcidSet = new AminoAcidSet(modFilePath);

            var run = PbfLcMsRun.GetLcMsRun(rawFilePath);
//            var ms2ScanNumArr = run.GetScanNumbers(2).ToArray();
            //var ms2ScanNumArr = new[] {4130};
            var ms2ScanNumArr = new[] { 5189 };

            var sw = new Stopwatch();

            sw.Start();
            Console.Write("Building Spectrum Arrays...");
            var massVectors = new BitArray[maxFragMassBin - minFragMassBin + 1];
            for (var i = minFragMassBin; i <= maxFragMassBin; i++)
            {
                massVectors[i - minFragMassBin] = new BitArray(run.MaxLcScan + 1);
            }

            foreach (var ms2ScanNum in ms2ScanNumArr)
            {
                var productSpec = run.GetSpectrum(ms2ScanNum) as ProductSpectrum;
                if (productSpec == null) continue;

                productSpec.FilterNoise();
                var deconvolutedPeaks = Deconvoluter.GetDeconvolutedPeaks(productSpec, minCharge, maxCharge, 2, 1.1,
                    tolerance, corrThreshold);

                if (deconvolutedPeaks == null) continue;

                foreach (var p in deconvolutedPeaks)
                {
                    var mass = p.Mass;
                    var deltaMass = tolerance.GetToleranceAsDa(mass, 1);
                    var minMass = mass - deltaMass;
                    var maxMass = mass + deltaMass;

                    var minBinNum = comparer.GetBinNumber(minMass);
                    var maxBinNum = comparer.GetBinNumber(maxMass);
                    for (var binNum = minBinNum; binNum <= maxBinNum; binNum++)
                    {
                        if (binNum >= minFragMassBin && binNum <= maxFragMassBin) massVectors[binNum - minFragMassBin][ms2ScanNum] = true;
                    }
                }
            }
            sw.Stop();
            Console.WriteLine(@"{0:f4} sec.", sw.Elapsed.TotalSeconds);

            sw.Reset();
            sw.Start();
            var fastaDb = new FastaDatabase(fastaFilePath);
            fastaDb.Read();
            var indexedDb = new IndexedDatabase(fastaDb);
            var numProteins = 0;
            var intactProteinAnnotationAndOffsets =
                indexedDb.IntactSequenceAnnotationsAndOffsets(0, int.MaxValue);

            var bestProtein = new string[run.MaxLcScan + 1];
            var bestScore = new int[run.MaxLcScan + 1];
            foreach (var annotationAndOffset in intactProteinAnnotationAndOffsets)
            {
                if (++numProteins % 10 == 0)
                {
                    Console.WriteLine("Processing {0}{1} proteins...", numProteins,
                        numProteins == 1 ? "st" : numProteins == 2 ? "nd" : numProteins == 3 ? "rd" : "th");
                    if (numProteins != 0)
                    {
                        sw.Stop();

                        Console.WriteLine("Elapsed Time: {0:f4} sec", sw.Elapsed.TotalSeconds);
                        sw.Reset();
                        sw.Start();
                    }
                }
                var annotation = annotationAndOffset.Annotation;
                var offset = annotationAndOffset.Offset;

                var protSequence = annotation.Substring(2, annotation.Length - 4);

                // suffix
                var seqGraph = SequenceGraph.CreateGraph(aminoAcidSet, AminoAcid.ProteinNTerm, protSequence,
                    AminoAcid.ProteinCTerm);
                if (seqGraph == null) continue;

                for (var numNTermCleavage = 0; numNTermCleavage <= 0; numNTermCleavage++)
                {
                    if (numNTermCleavage > 0) seqGraph.CleaveNTerm();
                    var allCompositions = seqGraph.GetAllFragmentNodeCompositions().ToArray();

                    var scoreArr = new int[run.MaxLcScan + 1];
                    foreach (var fragComp in allCompositions)
                    {
                        var suffixMass = fragComp.Mass + BaseIonType.Y.OffsetComposition.Mass;
                        var binNum = comparer.GetBinNumber(suffixMass);
                        if (binNum < minFragMassBin || binNum > maxFragMassBin) continue;

                        var vector = massVectors[binNum - minFragMassBin];
                        foreach (var ms2ScanNum in ms2ScanNumArr)
                        {
                            if (vector[ms2ScanNum])
                            {
                                ++scoreArr[ms2ScanNum];
                                Console.WriteLine(suffixMass);
                            }
                        }
                    }
                    foreach (var ms2ScanNum in ms2ScanNumArr)
                    {
                        if (scoreArr[ms2ScanNum] > bestScore[ms2ScanNum])
                        {
                            bestScore[ms2ScanNum] = scoreArr[ms2ScanNum];
                            var proteinName = fastaDb.GetProteinName(offset);
                            bestProtein[ms2ScanNum] = proteinName + (numNTermCleavage == 1 ? "'" : "");
                        }
                    }
                }
                //// prefix
                //var seqGraphPrefix = SequenceGraph.CreateGraph(aminoAcidSet, AminoAcid.ProteinNTerm, protSequence,
                //    AminoAcid.ProteinCTerm);
                //if (seqGraphPrefix == null) continue;

                //{
                //    if (numNTermCleavage > 0) seqGraph.CleaveNTerm();
                //    var allCompositions = seqGraph.GetAllFragmentNodeCompositions();

                //    var scoreArr = new int[run.MaxLcScan + 1];
                //    foreach (var fragComp in allCompositions)
                //    {
                //        var suffixMass = fragComp.Mass + BaseIonType.Y.OffsetComposition.Mass;
                //        var binNum = comparer.GetBinNumber(suffixMass);
                //        if (binNum < minFragMassBin || binNum > maxFragMassBin) continue;

                //        var vector = massVectors[binNum - minFragMassBin];
                //        foreach (var ms2ScanNum in ms2ScanNumArr)
                //        {
                //            if (vector[ms2ScanNum]) ++scoreArr[ms2ScanNum];
                //        }
                //    }
                //    foreach (var ms2ScanNum in ms2ScanNumArr)
                //    {
                //        if (scoreArr[ms2ScanNum] > bestScore[ms2ScanNum])
                //        {
                //            bestScore[ms2ScanNum] = scoreArr[ms2ScanNum];
                //            var proteinName = fastaDb.GetProteinName(offset);
                //            bestProtein[ms2ScanNum] = proteinName + (numNTermCleavage == 1 ? "'" : "");
                //        }
                //    }
                //}
            }

            Console.WriteLine("ScanNum\tBestProtein\tScore");
            foreach (var ms2ScanNum in ms2ScanNumArr)
            {
                Console.WriteLine("{0}\t{1}\t{2}", ms2ScanNum, bestProtein[ms2ScanNum] ?? "", bestScore[ms2ScanNum]);
            }
            //sw.Stop();
            //Console.WriteLine(@"Scoring: {0:f4} sec.", sw.Elapsed.TotalSeconds);
        }
Example #6
0
        public bool RunSearch(double corrThreshold = 0.7, CancellationToken? cancellationToken = null, IProgress<ProgressData> progress = null)
        {
            // Get the Normalized spec file/folder path
            SpecFilePath = MassSpecDataReaderFactory.NormalizeDatasetPath(SpecFilePath);

            var prog = new Progress<ProgressData>();
            var progData = new ProgressData(progress);
            if (progress != null)
            {
                prog = new Progress<ProgressData>(p =>
                {
                    progData.Status = p.Status;
                    progData.StatusInternal = p.StatusInternal;
                    progData.Report(p.Percent);
                });
            }

            var sw = new Stopwatch();
            var swAll = new Stopwatch();
            swAll.Start();
            ErrorMessage = string.Empty;

            Console.Write(@"Reading raw file...");
            progData.Status = "Reading spectra file";
            progData.StepRange(10.0);
            sw.Start();

            _run = PbfLcMsRun.GetLcMsRun(SpecFilePath, 0, 0, prog);

            _ms2ScanNums = _run.GetScanNumbers(2).ToArray();
            _isolationWindowTargetMz = new double[_run.MaxLcScan + 1];
            foreach (var ms2Scan in _ms2ScanNums)
            {
                var ms2Spec = _run.GetSpectrum(ms2Scan) as ProductSpectrum;
                if (ms2Spec == null) continue;
                _isolationWindowTargetMz[ms2Scan] = ms2Spec.IsolationWindow.IsolationWindowTargetMz;
            }

            
            sw.Stop();
            Console.WriteLine(@"Elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);

            progData.StepRange(20.0);
            ISequenceFilter ms1Filter;
            if (this.ScanNumbers != null && this.ScanNumbers.Any())
            {
                ms1Filter = new SelectedMsMsFilter(this.ScanNumbers);
            }
            else if (string.IsNullOrWhiteSpace(FeatureFilePath))
            {
                // Checks whether SpecFileName.ms1ft exists
                var ms1FtFilePath = MassSpecDataReaderFactory.ChangeExtension(SpecFilePath, LcMsFeatureFinderLauncher.FileExtension);
                if (!File.Exists(ms1FtFilePath))
                {
                    Console.WriteLine(@"Running ProMex...");
                    sw.Start();
                    var param = new LcMsFeatureFinderInputParameter
                    {
                        InputPath = SpecFilePath,
                        MinSearchMass = MinSequenceMass,
                        MaxSearchMass = MaxSequenceMass,
                        MinSearchCharge = MinPrecursorIonCharge,
                        MaxSearchCharge = MaxPrecursorIonCharge,
                        CsvOutput = false,
                        ScoreReport = false,
                        LikelihoodScoreThreshold = -10
                    };
                    var featureFinder = new LcMsFeatureFinderLauncher(param);
                    featureFinder.Run();
                }
                sw.Reset();
                sw.Start();
                Console.Write(@"Reading ProMex results...");
                ms1Filter = new Ms1FtFilter(_run, PrecursorIonTolerance, ms1FtFilePath, -10);
            }
            else
            {
                sw.Reset();
                sw.Start();
                var extension = Path.GetExtension(FeatureFilePath);
                if (extension.ToLower().Equals(".csv"))
                {
                    Console.Write(@"Reading ICR2LS/Decon2LS results...");
                    ms1Filter = new IsosFilter(_run, PrecursorIonTolerance, FeatureFilePath);
                }
                else if (extension.ToLower().Equals(".ms1ft"))
                {
                    Console.Write(@"Reading ProMex results...");
                    ms1Filter = new Ms1FtFilter(_run, PrecursorIonTolerance, FeatureFilePath, -10);
                }
                else if (extension.ToLower().Equals(".msalign"))
                {
                    Console.Write(@"Reading MS-Align+ results...");
                    ms1Filter = new MsDeconvFilter(_run, PrecursorIonTolerance, FeatureFilePath);
                }
                else ms1Filter = null; //new Ms1FeatureMatrix(_run);
            }

            sw.Stop();
            Console.WriteLine(@"Elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);

            
            // pre-generate deconvoluted spectra for scoring
            _massBinComparer = new FilteredProteinMassBinning(AminoAcidSet, MaxSequenceMass+1000);

            _ms2ScorerFactory2 = new CompositeScorerFactory(_run, _massBinComparer, AminoAcidSet,
                                                               MinProductIonCharge, MaxProductIonCharge, ProductIonTolerance);
            sw.Reset();
            Console.WriteLine(@"Generating deconvoluted spectra for MS/MS spectra...");
            sw.Start();
            var pfeOptions = new ParallelOptions
            {
                MaxDegreeOfParallelism = MaxNumThreads,
                CancellationToken = cancellationToken ?? CancellationToken.None
            };
            Parallel.ForEach(_ms2ScanNums, pfeOptions, ms2ScanNum =>
            {
                _ms2ScorerFactory2.DeconvonluteProductSpectrum(ms2ScanNum);
            });
            sw.Stop();
            Console.WriteLine(@"Elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);

            progData.StepRange(10.0);
            progData.Status = "Reading Fasta File";

            // Target database
            var targetDb = new FastaDatabase(DatabaseFilePath);
            targetDb.Read();
            
            // Generate sequence tags for all MS/MS spectra
            if (TagBasedSearch)
            {
                progData.StepRange(25.0);
                progData.Status = "Generating Sequence Tags";
                sw.Reset();
                Console.WriteLine(@"Generating sequence tags for MS/MS spectra...");
                sw.Start();
                var seqTagGen = GetSequenceTagGenerator();
                _tagMs2ScanNum = seqTagGen.GetMs2ScanNumsContainingTags().ToArray();
                sw.Stop();
                Console.WriteLine(@"Elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);
                _tagSearchEngine = new ScanBasedTagSearchEngine(_run, seqTagGen, new LcMsPeakMatrix(_run, ms1Filter), targetDb, ProductIonTolerance, AminoAcidSet,
                                _ms2ScorerFactory2,
                                ScanBasedTagSearchEngine.DefaultMinMatchedTagLength,
                                MaxSequenceMass, MinProductIonCharge, MaxProductIonCharge);                
            }
            
            var specFileName = MassSpecDataReaderFactory.RemoveExtension(Path.GetFileName(SpecFilePath));
            var targetOutputFilePath = Path.Combine(OutputDir, specFileName + TargetFileNameEnding);
            var decoyOutputFilePath = Path.Combine(OutputDir, specFileName + DecoyFileNameEnding);
            var tdaOutputFilePath = Path.Combine(OutputDir, specFileName + TdaFileNameEnding);

            progData.StepRange(60.0);
            progData.Status = "Running Target search";

            if (RunTargetDecoyAnalysis.HasFlag(DatabaseSearchMode.Target))
            {
                sw.Reset();
                Console.Write(@"Reading the target database...");
                sw.Start();
                targetDb.Read();
                sw.Stop();
                Console.WriteLine(@"Elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);

                var targetMatches = new SortedSet<DatabaseSequenceSpectrumMatch>[_run.MaxLcScan + 1];
                
                progData.MaxPercentage = 42.5;
                if (TagBasedSearch)
                {
                    sw.Reset();
                    Console.WriteLine(@"Tag-based searching the target database");
                    sw.Start();
                    RunTagBasedSearch(targetMatches, targetDb, null, prog);
                    Console.WriteLine(@"Target database tag-based search elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);
                }
                progData.MaxPercentage = 60.0;

                sw.Reset();
                Console.WriteLine(@"Searching the target database");
                sw.Start();
                RunSearch(targetMatches, targetDb, ms1Filter, null, prog);
                Console.WriteLine(@"Target database search elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);

                // calculate spectral e-value usign generating function
                sw.Reset();
                Console.WriteLine(@"Calculating spectral E-values for target-spectrum matches");
                sw.Start();
                var bestTargetMatches = RunGeneratingFunction(targetMatches);
                WriteResultsToFile(bestTargetMatches, targetOutputFilePath, targetDb);
                sw.Stop();
                Console.WriteLine(@"Target-spectrum match E-value calculation elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);
            }

            progData.StepRange(95.0); // total to 95%
            progData.Status = "Running Decoy search";

            if (RunTargetDecoyAnalysis.HasFlag(DatabaseSearchMode.Decoy))
            {
                // Decoy database
                sw.Reset();
                sw.Start();
                var decoyDb = targetDb.Decoy(null, true);

                Console.Write(@"Reading the decoy database...");
                decoyDb.Read();
                Console.WriteLine(@"Elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);

                progData.MaxPercentage = 77.5;
                var decoyMatches = new SortedSet<DatabaseSequenceSpectrumMatch>[_run.MaxLcScan + 1];
                if (TagBasedSearch)
                {
                    sw.Reset();
                    Console.WriteLine(@"Tag-based searching the decoy database");
                    sw.Start();
                    RunTagBasedSearch(decoyMatches, decoyDb, null, prog);
                    Console.WriteLine(@"Decoy database tag-based search elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);                    
                }
                progData.MaxPercentage = 95.0;

                sw.Reset();
                Console.WriteLine(@"Searching the decoy database");
                sw.Start();
                RunSearch(decoyMatches, decoyDb, ms1Filter, null, prog);
                Console.WriteLine(@"Decoy database search elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);

                // calculate spectral e-value usign generating function
                sw.Reset();
                Console.WriteLine(@"Calculating spectral E-values for decoy-spectrum matches");
                sw.Start();
                var bestDecoyMatches = RunGeneratingFunction(decoyMatches);
                WriteResultsToFile(bestDecoyMatches, decoyOutputFilePath, decoyDb);
                sw.Stop();
                Console.WriteLine(@"Decoy-spectrum match E-value calculation elapsed Time: {0:f1} sec", sw.Elapsed.TotalSeconds);
            }

            progData.StepRange(100.0);
            progData.Status = "Writing combined results file";
            if (RunTargetDecoyAnalysis.HasFlag(DatabaseSearchMode.Both))
            {
                // Add "Qvalue" and "PepQValue"
                var fdrCalculator = new FdrCalculator(targetOutputFilePath, decoyOutputFilePath);
                if (fdrCalculator.HasError())
                {
                    ErrorMessage = fdrCalculator.ErrorMessage;
                    Console.WriteLine(@"Error computing FDR: " + fdrCalculator.ErrorMessage);
                    return false;
                }

                fdrCalculator.WriteTo(tdaOutputFilePath);
            }
            progData.Report(100.0);

            Console.WriteLine(@"Done.");
            swAll.Stop();
            Console.WriteLine(@"Total elapsed time for search: {0:f1} sec ({1:f2} min)", swAll.Elapsed.TotalSeconds, swAll.Elapsed.TotalMinutes);

            return true;
        }
Example #7
0
        public void TestGettingProteinLengthAndPosition()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const string dbFile = @"\\proto-2\UnitTest_Files\InformedProteomics_TestFiles\MSPathFinderT\Short.fasta";
            if (!File.Exists(dbFile))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, dbFile);
            }

            var db = new FastaDatabase(dbFile);
            db.Read();
            var indexedDb = new IndexedDatabase(db);
            foreach (var peptideAnnotationAndOffset in indexedDb.AnnotationsAndOffsets(6, 20, 2, 0, Enzyme.Trypsin))
            {
                var annotation = peptideAnnotationAndOffset.Annotation;
                var offset = peptideAnnotationAndOffset.Offset;
                Console.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}",
                    annotation, 
                    offset, 
                    db.GetProteinName(offset), 
                    db.GetProteinLength(db.GetProteinName(offset)), 
                    db.GetOneBasedPositionInProtein(offset)+1);
            }
        }
        public void TestTempCompRefLcMsFeatureAlign()
        {
            const string dataFolder = @"D:\MassSpecFiles\CompRef";
            const string fastaFilePath = @"D:\MassSpecFiles\CompRef\db\ID_003278_4B4B3CB1.fasta";
            var fastaDb = new FastaDatabase(fastaFilePath);
            fastaDb.Read();

            var fileEntries = Directory.GetFiles(dataFolder);

            var dataset = (from fileName in fileEntries where fileName.EndsWith("pbf") select Path.GetFileNameWithoutExtension(fileName)).ToList();
            dataset.Sort();

            for (var i = 0; i < dataset.Count; i++)
            {
                var writer =
                    new StreamWriter(string.Format(@"D:\MassSpecFiles\CompRef\MsPathFinderMerged\{0}_IcTda.tsv",
                        dataset[i]));
                
                writer.Write("Scan");
                writer.Write("\t");
                writer.Write("Sequence");
                writer.Write("\t");
                writer.Write("Modifications");
                writer.Write("\t");
                writer.Write("Mass");
                writer.Write("\t");
                writer.Write("ProteinName");
                writer.Write("\t");
                writer.Write("ProteinDesc");
                writer.Write("\t");
                writer.Write("Start");
                writer.Write("\t");
                writer.Write("End");
                writer.Write("\t");
                writer.Write("#MatchedFragments");
                writer.Write("\t");
                writer.Write("QValue");
                writer.Write("\n");


                var path1 = string.Format(@"D:\MassSpecFiles\CompRef\MsPathFinder\{0}_IcTda.tsv", dataset[i]);
                var parser1 = new TsvFileParser(path1);
                OutputMergedResult(writer, parser1, fastaDb);

                var path2 = string.Format(@"D:\MassSpecFiles\CompRef\seqtag\{0}_tagmatch.tsv", dataset[i]);
                var parser2 = new TsvFileParser(path2);
                OutputMergedResult(writer, parser2, fastaDb);
                writer.Close();
            }
        }
Example #9
0
        public void CreateTargetList()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const string databaseFilePath = @"D:\Research\Data\IPRG2014\database\SpikedInPeptides.fasta";
            if (!File.Exists(databaseFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, databaseFilePath);
            }

            var database = new FastaDatabase(databaseFilePath);
            database.Read();
            var indexedDatabase = new IndexedDatabase(database);
            var numTargets = 0;

            var aaSet = new AminoAcidSet(Modification.Carbamidomethylation);

            Console.WriteLine("Peptide\tFormula\tProtein");
            foreach (var annotationAndOffset in indexedDatabase.AnnotationsAndOffsets(6, 30, 1, 1, Enzyme.Trypsin))
            {
                var annotation = annotationAndOffset.Annotation;
                var peptide = annotation.Substring(2, annotation.Length - 4);
                var offset = annotationAndOffset.Offset;

                Console.WriteLine("{0}\t{1}\t{2}", peptide, (aaSet.GetComposition(peptide) + Composition.H2O).ToPlainString(), database.GetProteinName(offset));
                numTargets++;
            }
            Console.WriteLine("NumTargets: {0}", numTargets);
        }
Example #10
0
        public void GenerateAbrfSpecCountAllProteins()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const string dir = @"H:\Research\IPRG2015";
            if (!Directory.Exists(dir))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, dir);
            }

            const double qValueThreshold = 0.01;
            //var names = new[] { "ENO1_YEAST", "ADH1_YEAST", "CYC_BOVIN", "ALBU_BOVIN" };
            //var accessions = new[] { "P00924", "P00330", "P62894", "P02769" };

            const string resultDir = dir + @"\NTT1";
            if (!Directory.Exists(resultDir))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, resultDir);
            }

            var msgfResultFiles = Directory.GetFiles(resultDir, "*.tsv").ToArray();

            var specCount = new Dictionary<string, int[]>();  // protein name => array of counts

            for (var i = 0; i < msgfResultFiles.Length; i++)
            {
                var msgfResultFile = msgfResultFiles[i];

                MsGfPlusHeaderInformation headerInfo = null;

                var prevScanNum = -1;
                foreach (var line in File.ReadLines(msgfResultFile))
                {
                    if (line.StartsWith("#"))
                    {
                        headerInfo = new MsGfPlusHeaderInformation(line);
                        continue;
                    }

                    var match = new MsGfMatch(line, headerInfo);

                    if (match.ScanNum == prevScanNum) continue;
                    prevScanNum = match.ScanNum;

                    if (!match.IsValid || match.Protein.StartsWith(FastaDatabase.DecoyProteinPrefix)) continue;
                    if (match.QValue > qValueThreshold) continue;

                    var proteins = match.Protein.Split(';');
                    foreach (var protein in proteins)
                    {
                        var proteinName = protein.Substring(0, protein.LastIndexOf("(pre=", StringComparison.Ordinal));
                        int[] countArr;
                        if (!specCount.TryGetValue(proteinName, out countArr)) specCount[proteinName] = new int[msgfResultFiles.Length];
                        specCount[proteinName][i]++;
                    }
                }
            }

            // Writing
            const string databaseFilePath = dir + @"\database\iPRG2015.fasta";
            var database = new FastaDatabase(databaseFilePath);
            database.Read();

            //            var spikeInAccessions = new[] { "STANDARD_Alpha-Casein", "STANDARD_Beta-Lactoglobulin", "STANDARD_Carbonic-Anhydrase", "P02769"};

            const string outputFilePath = dir + @"\SpecCountAllProteins.tsv";
            using (var writer = new StreamWriter(outputFilePath))
            {
                var fileIds = msgfResultFiles.Select(f => f.Substring(f.IndexOf("_sample", StringComparison.Ordinal) + 1,
                    f.LastIndexOf('.') - f.IndexOf("_sample", StringComparison.Ordinal) - 1));
                writer.WriteLine("Protein\tLength\t" + string.Join("\t", fileIds) + "\tSpikeIn");
                foreach (var entry in specCount)
                {
                    var proteinId = entry.Key;
                    var length = database.GetProteinLength(proteinId);
                    Assert.True(length > 0);
                    var counts = entry.Value;
                    Assert.True(counts.Length == msgfResultFiles.Length);
                    var spikeIn = 0;
                    //if (spikeInAccessions.Any(spikeInAccession => proteinId.StartsWith("sp|" + spikeInAccession)))
                    if (proteinId.StartsWith("sp|"))
                    {
                        spikeIn = 1;
                    }
                    writer.WriteLine("{0}\t{1}\t{2}\t{3}", proteinId, length, string.Join("\t", counts), spikeIn);
                }
            }
        }
Example #11
0
        public void ProcessIprg2015PreStudy()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const string dir = @"H:\Research\IPRG2015";

            const string databaseFilePath = dir + @"\database\yeast6proteaprotein.fasta";
            if (!File.Exists(databaseFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, databaseFilePath);
            }

            var database = new FastaDatabase(databaseFilePath);
            database.Read();

            const string jobFilePath = dir + @"\Jobs.tsv";
            if (!File.Exists(jobFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, jobFilePath);
            }

            var jobParser = new TsvFileParser(jobFilePath);
            var jobs = jobParser.GetData("Jobs").Select(j => Convert.ToInt32(j)).ToArray();
            var experiments = jobParser.GetData("Experiments").Select(e => e.Split('_')[2]).ToArray();

            //const string resultFilePath = dir + @"\AMT_Proteins_NA.tsv";
            //const string outputFilePath = dir + @"\AMT_Proteins.tsv";

            const string resultFilePath = dir + @"\AMT_Peptides_NA.tsv";
            const string outputFilePath = dir + @"\AMT_Peptides.tsv";

            var parser = new TsvFileParser(resultFilePath);
            var headers = parser.GetHeaders();
            var jobColNum = new int[jobs.Length];
            for (var i = 0; i < jobs.Length; i++)
            {
                for (var j = 0; j < headers.Count; j++)
                {
                    if (headers[j].Contains("" + jobs[i]))
                    {
                        jobColNum[i] = j;
                        break;
                    }
                }
            }

            for (var i = 0; i < jobs.Length; i++)
            {
                Console.WriteLine("{0}\t{1}\t{2}", jobs[i], jobColNum[i], experiments[i]);
            }

            using (var writer = new StreamWriter(outputFilePath))
            {
                var peptides = parser.GetData("Peptide");   // Peptides
                var proteins = parser.GetData("Reference");     // Proteins
                var abundances = new string[jobs.Length][];
                for (var i = 0; i < jobs.Length; i++)
                {
                    abundances[i] = parser.GetData(headers[jobColNum[i]]).ToArray();
                }

                if (peptides != null) writer.Write("Peptide\t");
                writer.Write("Protein\tLength");
                for (var i = 0; i < jobs.Length; i++)
                {
                    writer.Write("\t" + experiments[i]);
                }
                writer.WriteLine("\tSpikeIn");
                for (var i = 0; i < proteins.Count; i++)
                {
                    var protein = proteins[i];
                    if (protein.StartsWith("XXX") || protein.StartsWith("Contaminant")) continue;
                    var length = database.GetProteinLength(protein);
                    //if (length <= 0)
                    //{
                    //    Console.WriteLine("Shit!");
                    //    return;
                    //}
                    if (peptides != null) writer.Write(peptides[i] + "\t");
                    writer.Write(protein + "\t" + length);
                    for (var j = 0; j < jobs.Length; j++)
                    {
                        writer.Write("\t" + abundances[j][i]);
                    }
                    writer.WriteLine("\t" + (protein.StartsWith("STANDARD") ? 1 : 0));
                }
            }
        }
Example #12
0
        public void GetProteinAccessions()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const string uniprotAccession = "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}";
            var uniProtPattern = new Regex(uniprotAccession);
            const string databaseFilePath = @"H:\Research\IPRG2015\Henry_results\iPRG2015.TargDecoy.fasta";
            if (!File.Exists(databaseFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, databaseFilePath);
            }

            var database = new FastaDatabase(databaseFilePath);
            database.Read();
            var nameToAccession = new Dictionary<string, string>();
            foreach (var proteinName in database.GetProteinNames())
            {
                var start = proteinName.IndexOf('|');
                var end = proteinName.LastIndexOf('|');
                //var accession = proteinName.Substring(start + 1, end - start - 1);
                var name = proteinName.Substring(end + 1);
                if (proteinName.StartsWith("DECOY")) name = name + "-DECOY";
//                Console.WriteLine(name + " -> " +accession);
                Assert.IsTrue(uniProtPattern.IsMatch(proteinName));
                nameToAccession.Add(name, proteinName);
//                Console.WriteLine(name);
            }

            const string resultPath = @"H:\Research\IPRG2015\Henry_results\ProteinNames.txt";
            if (!File.Exists(resultPath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultPath);
            }

            foreach (var line in File.ReadLines(resultPath))
            {
                if (line.Length == 0) continue;
                var name = line;
//                if (name.Contains(";"))
//                {
//                }
                name = name.Split()[0];
                if (name.Contains('|')) name = name.Substring(name.LastIndexOf('|') + 1);
                string proteinName;
                if (nameToAccession.TryGetValue(name, out proteinName))
                {
                    Console.WriteLine(proteinName);
                }
                else
                {
                    Console.WriteLine(name);
                    Assert.IsTrue(false);
                }
            }
        }
Example #13
0
        public void AddProteinLengths()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;
            TestUtils.ShowStarting(methodName);

            const string databaseFilePath = @"H:\Research\IPRG2015\database\yeast6proteaprotein.fasta";
            if (!File.Exists(databaseFilePath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, databaseFilePath);
            }

            var database = new FastaDatabase(databaseFilePath);
            database.Read();

            const string resultPath = @"H:\Research\IPRG2015\AMT_Peptides_NA.tsv";
            if (!File.Exists(resultPath))
            {
                Assert.Ignore(@"Skipping test {0} since file not found: {1}", methodName, resultPath);
            }

            const string outputFilePath = @"H:\Research\IPRG2015\AMT_Peptides.tsv";
            using (var writer = new StreamWriter(outputFilePath))
            {
                foreach (var line in File.ReadLines(resultPath))
                {
                    var data = line.Split(null);
                    if (data.Length != 14) continue;
                    var peptide = data[0];
                    if (peptide.Equals("Peptide"))
                    {
                        writer.WriteLine("Peptide\tProtein\tLength\t{0}", string.Join("\t", data.Skip(2)));
                        continue;
                    }
                    var protein = data[1];
                    var length = database.GetProteinLength(protein);
                    writer.WriteLine("{0}\t{1}\t{2}\t{3}", peptide, protein, length, string.Join("\t", data.Skip(2)));
                }
            }
        }